diff --git a/eval-queue/AlekseyKorshuk/chatml-pyg-v1_eval_request_False_False_False.json b/eval-queue/AlekseyKorshuk/chatml-pyg-v1_eval_request_False_False_False.json
new file mode 100644
index 0000000000000000000000000000000000000000..bd9563085d54e03455d5e55eb2d15e3695c65ccb
--- /dev/null
+++ b/eval-queue/AlekseyKorshuk/chatml-pyg-v1_eval_request_False_False_False.json
@@ -0,0 +1 @@
+{"model": "AlekseyKorshuk/chatml-pyg-v1", "base_model": "", "revision": "main", "private": false, "status": "FINISHED", "submitted_time": "2023-10-16T12:48:18Z", "job_id": "460959", "weight_type": "Original", "precision": "float16", "model_type": "fine-tuned", "license": "creativeml-openrail-m", "likes": 1, "params": 5.844}
\ No newline at end of file
diff --git a/eval-queue/AlekseyKorshuk/pygmalion-6b-vicuna-chatml_eval_request_False_False_False.json b/eval-queue/AlekseyKorshuk/pygmalion-6b-vicuna-chatml_eval_request_False_False_False.json
new file mode 100644
index 0000000000000000000000000000000000000000..26a9d0ba47daea39e41633a443622803a80af496
--- /dev/null
+++ b/eval-queue/AlekseyKorshuk/pygmalion-6b-vicuna-chatml_eval_request_False_False_False.json
@@ -0,0 +1 @@
+{"model": "AlekseyKorshuk/pygmalion-6b-vicuna-chatml", "base_model": "", "revision": "main", "private": false, "status": "FINISHED", "submitted_time": "2023-10-16T12:48:18Z", "weight_type": "Original", "precision": "float16", "job_id": "460478", "model_type": "fine-tuned", "license": "creativeml-openrail-m", "likes": 2, "params": 5.844}
\ No newline at end of file
diff --git a/eval-queue/AlekseyKorshuk/pygmalion-6b-vicuna-chatml_eval_request_False_float16_Original.json b/eval-queue/AlekseyKorshuk/pygmalion-6b-vicuna-chatml_eval_request_False_float16_Original.json
new file mode 100644
index 0000000000000000000000000000000000000000..2c0ef102afa3a2fbde12f17e23e403f549f96103
--- /dev/null
+++ b/eval-queue/AlekseyKorshuk/pygmalion-6b-vicuna-chatml_eval_request_False_float16_Original.json
@@ -0,0 +1 @@
+{"model": "AlekseyKorshuk/pygmalion-6b-vicuna-chatml", "base_model": "GPT-J", "revision": "main", "private": false, "precision": "float16", "weight_type": "Original", "status": "FINISHED", "submitted_time": "2023-09-09T10:50:14Z", "model_type": "fine-tuned", "job_id": "430846", "license": "creativeml-openrail-m", "likes": 2, "params": 5.844}
\ No newline at end of file
diff --git a/eval-queue/AlekseyKorshuk/vic15-exp-syn-fight-cp3838_eval_request_False_float16_Original.json b/eval-queue/AlekseyKorshuk/vic15-exp-syn-fight-cp3838_eval_request_False_float16_Original.json
new file mode 100644
index 0000000000000000000000000000000000000000..e4690e25b63503f87cd5715f5ca6e2443db34602
--- /dev/null
+++ b/eval-queue/AlekseyKorshuk/vic15-exp-syn-fight-cp3838_eval_request_False_float16_Original.json
@@ -0,0 +1 @@
+{"model": "AlekseyKorshuk/vic15-exp-syn-fight-cp3838", "base_model": "", "revision": "main", "private": false, "precision": "float16", "weight_type": "Original", "status": "FINISHED", "submitted_time": "2023-10-16T12:58:30Z", "model_type": "\ud83d\udd36 : fine-tuned", "job_id": "522156", "license": "?", "likes": 0, "params": 6.607}
\ No newline at end of file
diff --git a/eval-queue/AlekseyKorshuk/vicuna-7b_eval_request_False_False_False.json b/eval-queue/AlekseyKorshuk/vicuna-7b_eval_request_False_False_False.json
new file mode 100644
index 0000000000000000000000000000000000000000..caf53e3a0452029f941348a5fab9e5a4f79af6d5
--- /dev/null
+++ b/eval-queue/AlekseyKorshuk/vicuna-7b_eval_request_False_False_False.json
@@ -0,0 +1 @@
+{"model": "AlekseyKorshuk/vicuna-7b", "base_model": "", "revision": "main", "private": false, "status": "FAILED_2", "job_id": "176582", "weight_type": "Original", "precision": "float16", "license": "other", "likes": 107, "params": 6.607}
\ No newline at end of file
diff --git a/eval-queue/Andron00e/Llama-Translation-Answering_eval_request_False_False_True.json b/eval-queue/Andron00e/Llama-Translation-Answering_eval_request_False_False_True.json
new file mode 100644
index 0000000000000000000000000000000000000000..82aa893a6092fe35f0aa12bd59fdab097ba987a4
--- /dev/null
+++ b/eval-queue/Andron00e/Llama-Translation-Answering_eval_request_False_False_True.json
@@ -0,0 +1 @@
+{"model": "Andron00e/Llama-Translation-Answering", "base_model": "openlm-research/open_llama_3b", "revision": "main", "private": false, "status": "FAILED", "submitted_time": "2023-09-09T10:38:12Z", "weight_type": "Delta", "precision": "float16", "job_id": "439922", "params": 0, "license": "?", "likes": 0}
\ No newline at end of file
diff --git a/eval-queue/Andron00e/Open-Llama-3B-LoRA-Corpus_eval_request_False_False_True.json b/eval-queue/Andron00e/Open-Llama-3B-LoRA-Corpus_eval_request_False_False_True.json
new file mode 100644
index 0000000000000000000000000000000000000000..243c420591f908c1a3d2e6d1d07209c82c538ccb
--- /dev/null
+++ b/eval-queue/Andron00e/Open-Llama-3B-LoRA-Corpus_eval_request_False_False_True.json
@@ -0,0 +1 @@
+{"model": "Andron00e/Open-Llama-3B-LoRA-Corpus", "base_model": "openlm-research/open_llama_3b", "revision": "main", "private": false, "status": "FAILED", "submitted_time": "2023-08-25T13:05:50Z", "weight_type": "Delta", "precision": "float16", "job_id": "391939", "params": 3.0, "license": "?", "likes": 0}
\ No newline at end of file
diff --git a/eval-queue/Andron00e/YetAnother_Open-Llama-3B-LoRA-OpenOrca_eval_request_False_False_False.json b/eval-queue/Andron00e/YetAnother_Open-Llama-3B-LoRA-OpenOrca_eval_request_False_False_False.json
new file mode 100644
index 0000000000000000000000000000000000000000..a0f59f4a751cbb0314642b10c160e4c07f27fc4d
--- /dev/null
+++ b/eval-queue/Andron00e/YetAnother_Open-Llama-3B-LoRA-OpenOrca_eval_request_False_False_False.json
@@ -0,0 +1 @@
+{"model": "Andron00e/YetAnother_Open-Llama-3B-LoRA-OpenOrca", "base_model": "openlm-research/open_llama_3b", "revision": "main", "private": false, "status": "FINISHED", "submitted_time": "2023-09-09T10:52:17Z", "weight_type": "Original", "precision": "float16", "job_id": "472121", "model_type": "fine-tuned", "license": "apache-2.0", "likes": 0, "params": 3.426}
\ No newline at end of file
diff --git a/eval-queue/Andron00e/YetAnother_Open-Llama-3B-LoRA-OpenOrca_eval_request_False_False_True.json b/eval-queue/Andron00e/YetAnother_Open-Llama-3B-LoRA-OpenOrca_eval_request_False_False_True.json
new file mode 100644
index 0000000000000000000000000000000000000000..2697c52c9f384fe3b7252e2d173d356b40a1ea30
--- /dev/null
+++ b/eval-queue/Andron00e/YetAnother_Open-Llama-3B-LoRA-OpenOrca_eval_request_False_False_True.json
@@ -0,0 +1 @@
+{"model": "Andron00e/YetAnother_Open-Llama-3B-LoRA-OpenOrca", "base_model": "openlm-research/open_llama_3b", "revision": "main", "private": false, "status": "FINISHED", "submitted_time": "2023-09-09T10:52:17Z", "weight_type": "Delta", "precision": "float16", "job_id": "471265", "model_type": "fine-tuned", "license": "apache-2.0", "likes": 0, "params": 3.426}
\ No newline at end of file
diff --git a/eval-queue/Andron00e/YetAnother_Open-Llama-3B-LoRA_eval_request_False_False_True.json b/eval-queue/Andron00e/YetAnother_Open-Llama-3B-LoRA_eval_request_False_False_True.json
new file mode 100644
index 0000000000000000000000000000000000000000..d511ad4504b4a98343eedf9509d2ccc3104be316
--- /dev/null
+++ b/eval-queue/Andron00e/YetAnother_Open-Llama-3B-LoRA_eval_request_False_False_True.json
@@ -0,0 +1 @@
+{"model": "Andron00e/YetAnother_Open-Llama-3B-LoRA", "base_model": "openlm-research/open_llama_3b", "revision": "main", "private": false, "status": "FINISHED", "submitted_time": "2023-09-09T10:52:17Z", "weight_type": "Delta", "precision": "float16", "job_id": "461928", "license": "?", "likes": 0, "params": 3.426}
\ No newline at end of file
diff --git a/eval-queue/ApotheosisRPG/gmaiv1_eval_request_False_bfloat16_Original.json b/eval-queue/ApotheosisRPG/gmaiv1_eval_request_False_bfloat16_Original.json
new file mode 100644
index 0000000000000000000000000000000000000000..2da7790faf76f622175a1cac5d1e85cafd8c4acb
--- /dev/null
+++ b/eval-queue/ApotheosisRPG/gmaiv1_eval_request_False_bfloat16_Original.json
@@ -0,0 +1,16 @@
+{
+    "model": "ApotheosisRPG/gmaiv1",
+    "base_model": "",
+    "revision": "main",
+    "private": false,
+    "precision": "bfloat16",
+    "weight_type": "Original",
+    "status": "FAILED",
+    "submitted_time": "2023-12-11T10:51:27Z",
+    "model_type": "\u2b55 : instruction-tuned",
+    "likes": 0,
+    "params": 19.994,
+    "license": "cc-by-nc-4.0",
+    "job_id": "889985",
+    "job_start_time": "2023-12-11T12:04:10.339268"
+}
\ No newline at end of file
diff --git a/eval-queue/ApotheosisRPG/imb_eval_request_False_4bit_Original.json b/eval-queue/ApotheosisRPG/imb_eval_request_False_4bit_Original.json
new file mode 100644
index 0000000000000000000000000000000000000000..ccd845aef29068555dc22065d9d97d1d8f94a2c8
--- /dev/null
+++ b/eval-queue/ApotheosisRPG/imb_eval_request_False_4bit_Original.json
@@ -0,0 +1,16 @@
+{
+    "model": "ApotheosisRPG/imb",
+    "base_model": "",
+    "revision": "main",
+    "private": false,
+    "precision": "4bit",
+    "weight_type": "Original",
+    "status": "FAILED",
+    "submitted_time": "2023-12-11T18:29:46Z",
+    "model_type": "\ud83d\udd36 : fine-tuned",
+    "likes": 0,
+    "params": 19.994,
+    "license": "cc-by-nc-4.0",
+    "job_id": "896916",
+    "job_start_time": "2023-12-12T01:08:29.784270"
+}
\ No newline at end of file
diff --git a/eval-queue/ApotheosisRPG/imb_eval_request_False_8bit_Original.json b/eval-queue/ApotheosisRPG/imb_eval_request_False_8bit_Original.json
new file mode 100644
index 0000000000000000000000000000000000000000..7453ce3a93cb542ae20d888d2b285d874654ecde
--- /dev/null
+++ b/eval-queue/ApotheosisRPG/imb_eval_request_False_8bit_Original.json
@@ -0,0 +1,16 @@
+{
+    "model": "ApotheosisRPG/imb",
+    "base_model": "",
+    "revision": "main",
+    "private": false,
+    "precision": "8bit",
+    "weight_type": "Original",
+    "status": "FAILED",
+    "submitted_time": "2023-12-11T18:29:50Z",
+    "model_type": "\ud83d\udd36 : fine-tuned",
+    "likes": 0,
+    "params": 19.994,
+    "license": "cc-by-nc-4.0",
+    "job_id": "896918",
+    "job_start_time": "2023-12-12T01:10:22.542643"
+}
\ No newline at end of file
diff --git a/eval-queue/ApotheosisRPG/imb_eval_request_False_bfloat16_Original.json b/eval-queue/ApotheosisRPG/imb_eval_request_False_bfloat16_Original.json
new file mode 100644
index 0000000000000000000000000000000000000000..99e0b51e0a4229c1b3a6f65a4e27c656e3abfb7a
--- /dev/null
+++ b/eval-queue/ApotheosisRPG/imb_eval_request_False_bfloat16_Original.json
@@ -0,0 +1,16 @@
+{
+    "model": "ApotheosisRPG/imb",
+    "base_model": "",
+    "revision": "main",
+    "private": false,
+    "precision": "bfloat16",
+    "weight_type": "Original",
+    "status": "FAILED",
+    "submitted_time": "2023-12-11T18:20:21Z",
+    "model_type": "\ud83d\udd36 : fine-tuned",
+    "likes": 0,
+    "params": 19.994,
+    "license": "cc-by-nc-4.0",
+    "job_id": "896910",
+    "job_start_time": "2023-12-12T01:01:37.413995"
+}
\ No newline at end of file
diff --git a/eval-queue/ByteWave/Cheus-11B_eval_request_False_float16_Original.json b/eval-queue/ByteWave/Cheus-11B_eval_request_False_float16_Original.json
new file mode 100644
index 0000000000000000000000000000000000000000..43a5f8cf75d071e8392d0f17e22de0b8359975dd
--- /dev/null
+++ b/eval-queue/ByteWave/Cheus-11B_eval_request_False_float16_Original.json
@@ -0,0 +1 @@
+{"model": "ByteWave/Cheus-11B", "base_model": "", "revision": "main", "private": false, "precision": "float16", "weight_type": "Original", "status": "FAILED", "submitted_time": "2023-11-19T15:03:59Z", "model_type": "? : ", "likes": 0, "params": 10.732, "license": "apache-2.0", "job_id": "717795"}
\ No newline at end of file
diff --git a/eval-queue/ByteWave/Yi-15B-Llama_eval_request_False_float16_Original.json b/eval-queue/ByteWave/Yi-15B-Llama_eval_request_False_float16_Original.json
new file mode 100644
index 0000000000000000000000000000000000000000..ed155c5f43d4d06d4ff3dc6246e8ebd38d1529fd
--- /dev/null
+++ b/eval-queue/ByteWave/Yi-15B-Llama_eval_request_False_float16_Original.json
@@ -0,0 +1 @@
+{"model": "ByteWave/Yi-15B-Llama", "base_model": "", "revision": "main", "private": false, "precision": "float16", "weight_type": "Original", "status": "FAILED", "submitted_time": "2023-11-19T18:50:53Z", "model_type": "? : ", "likes": 1, "params": 14.864, "license": "other", "job_id": "717881"}
\ No newline at end of file
diff --git a/eval-queue/ByteWave/Yi-23B-Llama_eval_request_False_float16_Original.json b/eval-queue/ByteWave/Yi-23B-Llama_eval_request_False_float16_Original.json
new file mode 100644
index 0000000000000000000000000000000000000000..8b9c3b9bdb89e42b3549e1db68642f939561d2d0
--- /dev/null
+++ b/eval-queue/ByteWave/Yi-23B-Llama_eval_request_False_float16_Original.json
@@ -0,0 +1 @@
+{"model": "ByteWave/Yi-23B-Llama", "base_model": "", "revision": "main", "private": false, "precision": "float16", "weight_type": "Original", "status": "FAILED", "submitted_time": "2023-11-19T18:34:54Z", "model_type": "? : ", "likes": 1, "params": 23.232, "license": "other", "job_id": "717879"}
\ No newline at end of file
diff --git a/eval-queue/ByteWave/Yi-8B-Llama_eval_request_False_float16_Original.json b/eval-queue/ByteWave/Yi-8B-Llama_eval_request_False_float16_Original.json
new file mode 100644
index 0000000000000000000000000000000000000000..6697c68ac1b9c6913d877a5e66f93d3379a7becb
--- /dev/null
+++ b/eval-queue/ByteWave/Yi-8B-Llama_eval_request_False_float16_Original.json
@@ -0,0 +1 @@
+{"model": "ByteWave/Yi-8B-Llama", "base_model": "", "revision": "main", "private": false, "precision": "float16", "weight_type": "Original", "status": "FINISHED", "submitted_time": "2023-11-19T19:10:03Z", "model_type": "? : ", "likes": 1, "params": 8.728, "license": "other", "job_id": "717884"}
\ No newline at end of file
diff --git a/eval-queue/Delcos/Mistral-Pygmalion-7b_eval_request_False_float16_Original.json b/eval-queue/Delcos/Mistral-Pygmalion-7b_eval_request_False_float16_Original.json
new file mode 100644
index 0000000000000000000000000000000000000000..2995881cb0417ca969ca627867202e535b13ab2d
--- /dev/null
+++ b/eval-queue/Delcos/Mistral-Pygmalion-7b_eval_request_False_float16_Original.json
@@ -0,0 +1 @@
+{"model": "Delcos/Mistral-Pygmalion-7b", "base_model": "llama", "revision": "main", "private": false, "precision": "float16", "weight_type": "Original", "status": "FINISHED", "submitted_time": "2023-10-16T13:00:29Z", "model_type": "\ud83d\udd36 : fine-tuned", "job_id": "522235", "license": "cc-by-nc-nd-4.0", "likes": 1, "params": 6.607}
\ No newline at end of file
diff --git a/eval-queue/Delcos/NATE-7b_eval_request_False_float16_Original.json b/eval-queue/Delcos/NATE-7b_eval_request_False_float16_Original.json
new file mode 100644
index 0000000000000000000000000000000000000000..5f8235da42ce002d1ebf854dcea610c0e8c10a5e
--- /dev/null
+++ b/eval-queue/Delcos/NATE-7b_eval_request_False_float16_Original.json
@@ -0,0 +1 @@
+{"model": "Delcos/NATE-7b", "base_model": "llama", "revision": "main", "private": false, "precision": "float16", "weight_type": "Original", "status": "FINISHED", "submitted_time": "2023-10-16T12:48:18Z", "model_type": "\ud83d\udd36 : fine-tuned", "job_id": "515717", "license": "llama2", "likes": 0, "params": 13.016}
\ No newline at end of file
diff --git a/eval-queue/Delcos/Starling-LM-11B-alpha_eval_request_False_float16_Original.json b/eval-queue/Delcos/Starling-LM-11B-alpha_eval_request_False_float16_Original.json
new file mode 100644
index 0000000000000000000000000000000000000000..bd5a11585fd6e199b726bacd62c6cfc1929c4114
--- /dev/null
+++ b/eval-queue/Delcos/Starling-LM-11B-alpha_eval_request_False_float16_Original.json
@@ -0,0 +1,16 @@
+{
+    "model": "Delcos/Starling-LM-11B-alpha",
+    "base_model": "mistral",
+    "revision": "main",
+    "private": false,
+    "precision": "float16",
+    "weight_type": "Original",
+    "status": "FINISHED",
+    "submitted_time": "2023-12-07T08:51:17Z",
+    "model_type": "\ud83d\udfe2 : pretrained",
+    "likes": 1,
+    "params": 11.386,
+    "license": "cc-by-nc-nd-4.0",
+    "job_id": "874853",
+    "job_start_time": "2023-12-09T13:43:38.959536"
+}
\ No newline at end of file
diff --git a/eval-queue/Delcos/Velara_eval_request_False_float16_Original.json b/eval-queue/Delcos/Velara_eval_request_False_float16_Original.json
new file mode 100644
index 0000000000000000000000000000000000000000..7bc3f20a3aff5ef0656733f5a72b97ef7f31fb07
--- /dev/null
+++ b/eval-queue/Delcos/Velara_eval_request_False_float16_Original.json
@@ -0,0 +1,16 @@
+{
+    "model": "Delcos/Velara",
+    "base_model": "mistral",
+    "revision": "main",
+    "private": false,
+    "precision": "float16",
+    "weight_type": "Original",
+    "status": "FINISHED",
+    "submitted_time": "2023-12-05T19:46:15Z",
+    "model_type": "\ud83d\udfe2 : pretrained",
+    "likes": 0,
+    "params": 11.386,
+    "license": "cc-by-nc-nd-4.0",
+    "job_id": "858083",
+    "job_start_time": "2023-12-06T16:46:40.817451"
+}
\ No newline at end of file
diff --git a/eval-queue/Devio/test-1400_eval_request_False_bfloat16_Original.json b/eval-queue/Devio/test-1400_eval_request_False_bfloat16_Original.json
new file mode 100644
index 0000000000000000000000000000000000000000..0d5f7d0974f689f5d6b31467503ce96ef3759bc2
--- /dev/null
+++ b/eval-queue/Devio/test-1400_eval_request_False_bfloat16_Original.json
@@ -0,0 +1 @@
+{"model": "Devio/test-1400", "base_model": "", "revision": "main", "private": false, "precision": "bfloat16", "weight_type": "Original", "status": "FINISHED", "submitted_time": "2023-09-09T10:52:17Z", "model_type": "\ud83d\udd36 : fine-tuned", "job_id": "503161", "params": 0, "license": "?", "likes": 0}
\ No newline at end of file
diff --git a/eval-queue/Devio/test-2048-1500_eval_request_False_bfloat16_Original.json b/eval-queue/Devio/test-2048-1500_eval_request_False_bfloat16_Original.json
new file mode 100644
index 0000000000000000000000000000000000000000..e19a046f98fb8e48253a8017bd15a06d762cc850
--- /dev/null
+++ b/eval-queue/Devio/test-2048-1500_eval_request_False_bfloat16_Original.json
@@ -0,0 +1 @@
+{"model": "Devio/test-2048-1500", "base_model": "", "revision": "main", "private": false, "precision": "bfloat16", "weight_type": "Original", "status": "FAILED", "submitted_time": "2023-09-11T17:44:20Z", "model_type": "\ud83d\udd36 : fine-tuned", "job_id": "440297", "params": 0, "license": "?", "likes": 0}
\ No newline at end of file
diff --git a/eval-queue/Devio/test-22B_eval_request_False_bfloat16_Original.json b/eval-queue/Devio/test-22B_eval_request_False_bfloat16_Original.json
new file mode 100644
index 0000000000000000000000000000000000000000..fe5e12e93b9d1bb275a2b381d6d89e95305a2103
--- /dev/null
+++ b/eval-queue/Devio/test-22B_eval_request_False_bfloat16_Original.json
@@ -0,0 +1 @@
+{"model": "Devio/test-22B", "base_model": "", "revision": "main", "private": false, "precision": "bfloat16", "weight_type": "Original", "status": "FINISHED", "submitted_time": "2023-09-09T10:52:17Z", "model_type": "\ud83d\udfe2 : pretrained", "job_id": "498613", "license": "?", "likes": 0, "params": 21.828}
\ No newline at end of file
diff --git a/eval-queue/Devio/test-3b_eval_request_False_bfloat16_Original.json b/eval-queue/Devio/test-3b_eval_request_False_bfloat16_Original.json
new file mode 100644
index 0000000000000000000000000000000000000000..d1bdd2191c2f50bae5793f5886ee7e43b0f2dfbf
--- /dev/null
+++ b/eval-queue/Devio/test-3b_eval_request_False_bfloat16_Original.json
@@ -0,0 +1 @@
+{"model": "Devio/test-3b", "base_model": "", "revision": "main", "private": false, "precision": "bfloat16", "weight_type": "Original", "status": "FINISHED", "submitted_time": "2023-09-09T10:52:17Z", "model_type": "\ud83d\udd36 : fine-tuned", "job_id": "497385", "license": "?", "likes": 0, "params": 3.5}
\ No newline at end of file
diff --git a/eval-queue/Devio/test-9k-fn_eval_request_False_bfloat16_Original.json b/eval-queue/Devio/test-9k-fn_eval_request_False_bfloat16_Original.json
new file mode 100644
index 0000000000000000000000000000000000000000..8c66ee23bd89ac12e97841eeb48e7f0467905819
--- /dev/null
+++ b/eval-queue/Devio/test-9k-fn_eval_request_False_bfloat16_Original.json
@@ -0,0 +1 @@
+{"model": "Devio/test-9k-fn", "base_model": "", "revision": "main", "private": false, "precision": "bfloat16", "weight_type": "Original", "status": "FINISHED", "submitted_time": "2023-10-16T12:54:17Z", "model_type": "\ud83d\udd36 : fine-tuned", "job_id": "522013", "params": 0, "license": "?", "likes": 0}
\ No newline at end of file
diff --git a/eval-queue/Devio/test100_eval_request_False_bfloat16_Original.json b/eval-queue/Devio/test100_eval_request_False_bfloat16_Original.json
new file mode 100644
index 0000000000000000000000000000000000000000..f55d9a5e97cb426cc4c9b6dcbb9fd94dc588485c
--- /dev/null
+++ b/eval-queue/Devio/test100_eval_request_False_bfloat16_Original.json
@@ -0,0 +1 @@
+{"model": "Devio/test100", "base_model": "", "revision": "main", "private": false, "precision": "bfloat16", "weight_type": "Original", "status": "FINISHED", "submitted_time": "2023-10-16T12:54:17Z", "model_type": "\ud83d\udd36 : fine-tuned", "job_id": "521988", "params": 0, "license": "?", "likes": 0}
\ No newline at end of file
diff --git a/eval-queue/Devio/test2_eval_request_False_bfloat16_Original.json b/eval-queue/Devio/test2_eval_request_False_bfloat16_Original.json
new file mode 100644
index 0000000000000000000000000000000000000000..37fc6ece57bb27e8c2417886ba2a8f59a9fff9bd
--- /dev/null
+++ b/eval-queue/Devio/test2_eval_request_False_bfloat16_Original.json
@@ -0,0 +1 @@
+{"model": "Devio/test2", "base_model": "", "revision": "main", "private": false, "precision": "bfloat16", "weight_type": "Original", "status": "FAILED", "submitted_time": "2023-09-01T23:59:55Z", "model_type": "\ud83d\udfe2 : pretrained", "job_id": "410776", "license": "?", "likes": 0, "params": 21.828}
\ No newline at end of file
diff --git a/eval-queue/Devio/testB_eval_request_False_bfloat16_Original.json b/eval-queue/Devio/testB_eval_request_False_bfloat16_Original.json
new file mode 100644
index 0000000000000000000000000000000000000000..be9e3b06a6e517c2ae71829992719f825b1c420e
--- /dev/null
+++ b/eval-queue/Devio/testB_eval_request_False_bfloat16_Original.json
@@ -0,0 +1 @@
+{"model": "Devio/testB", "base_model": "", "revision": "main", "private": false, "precision": "bfloat16", "weight_type": "Original", "status": "FAILED", "submitted_time": "2023-09-02T04:08:29Z", "model_type": "\ud83d\udfe2 : pretrained", "job_id": "410914", "params": 0, "license": "?", "likes": 0}
\ No newline at end of file
diff --git a/eval-queue/Devio/testB_eval_request_False_float16_Original.json b/eval-queue/Devio/testB_eval_request_False_float16_Original.json
new file mode 100644
index 0000000000000000000000000000000000000000..d7774ca294dbbe02ef125d1e63bc3aab31f2d27b
--- /dev/null
+++ b/eval-queue/Devio/testB_eval_request_False_float16_Original.json
@@ -0,0 +1 @@
+{"model": "Devio/testB", "base_model": "", "revision": "main", "private": false, "precision": "float16", "weight_type": "Original", "status": "FAILED", "submitted_time": "2023-09-02T04:03:30Z", "model_type": "\ud83d\udfe2 : pretrained", "job_id": "410909", "params": 0, "license": "?", "likes": 0}
\ No newline at end of file
diff --git a/eval-queue/Devio/testC_eval_request_False_bfloat16_Original.json b/eval-queue/Devio/testC_eval_request_False_bfloat16_Original.json
new file mode 100644
index 0000000000000000000000000000000000000000..37df7160df052644f5bd64226324ca29aed8846a
--- /dev/null
+++ b/eval-queue/Devio/testC_eval_request_False_bfloat16_Original.json
@@ -0,0 +1 @@
+{"model": "Devio/testC", "base_model": "", "revision": "main", "private": false, "precision": "bfloat16", "weight_type": "Original", "status": "FINISHED", "submitted_time": "2023-09-09T10:52:17Z", "model_type": "\ud83d\udd36 : fine-tuned", "job_id": "502588", "params": 0, "license": "?", "likes": 0}
\ No newline at end of file
diff --git a/eval-queue/Ejafa/vicuna_7B_vanilla_1.1_eval_request_False_False_False.json b/eval-queue/Ejafa/vicuna_7B_vanilla_1.1_eval_request_False_False_False.json
new file mode 100644
index 0000000000000000000000000000000000000000..b42e91625ad8783e755f86228c4419e2e02086cc
--- /dev/null
+++ b/eval-queue/Ejafa/vicuna_7B_vanilla_1.1_eval_request_False_False_False.json
@@ -0,0 +1 @@
+{"model": "Ejafa/vicuna_7B_vanilla_1.1", "base_model": "", "revision": "main", "private": false, "status": "FINISHED", "job_id": "503209", "weight_type": "Original", "precision": "float16", "model_type": "fine-tuned", "submitted_time": "2023-09-09T10:52:17Z", "license": "?", "likes": 2, "params": 6.607}
\ No newline at end of file
diff --git a/eval-queue/Enno-Ai/ennodata-13b-8bit-raw-15epoch_eval_request_False_float16_Original.json b/eval-queue/Enno-Ai/ennodata-13b-8bit-raw-15epoch_eval_request_False_float16_Original.json
new file mode 100644
index 0000000000000000000000000000000000000000..0875f57c3eb3f5430031c7ebe54f859625036f73
--- /dev/null
+++ b/eval-queue/Enno-Ai/ennodata-13b-8bit-raw-15epoch_eval_request_False_float16_Original.json
@@ -0,0 +1 @@
+{"model": "Enno-Ai/ennodata-13b-8bit-raw-15epoch", "base_model": "", "revision": "main", "private": false, "precision": "float16", "weight_type": "Original", "status": "FINISHED", "submitted_time": "2023-10-16T12:54:17Z", "model_type": "\ud83d\udd36 : fine-tuned", "job_id": "519919", "license": "?", "likes": 0, "params": 13.016}
\ No newline at end of file
diff --git a/eval-queue/Enno-Ai/ennodata-7b_eval_request_False_8bit_Original.json b/eval-queue/Enno-Ai/ennodata-7b_eval_request_False_8bit_Original.json
new file mode 100644
index 0000000000000000000000000000000000000000..020a6a1a3956dcb629f0221c12fcd61f075d2330
--- /dev/null
+++ b/eval-queue/Enno-Ai/ennodata-7b_eval_request_False_8bit_Original.json
@@ -0,0 +1 @@
+{"model": "Enno-Ai/ennodata-7b", "base_model": "", "revision": "main", "private": false, "precision": "8bit", "weight_type": "Original", "status": "FINISHED", "submitted_time": "2023-10-16T12:54:17Z", "model_type": "\ud83d\udd36 : fine-tuned", "job_id": "518546", "license": "?", "likes": 0, "params": 6.607}
\ No newline at end of file
diff --git a/eval-queue/Enno-Ai/ennodata-raw-pankajmathur-13b-peft_eval_request_False_8bit_Original.json b/eval-queue/Enno-Ai/ennodata-raw-pankajmathur-13b-peft_eval_request_False_8bit_Original.json
new file mode 100644
index 0000000000000000000000000000000000000000..ddc76ab83ca42bec1ad203240ffcd8764b45e959
--- /dev/null
+++ b/eval-queue/Enno-Ai/ennodata-raw-pankajmathur-13b-peft_eval_request_False_8bit_Original.json
@@ -0,0 +1 @@
+{"model": "Enno-Ai/ennodata-raw-pankajmathur-13b-peft", "base_model": "", "revision": "main", "private": false, "precision": "8bit", "weight_type": "Original", "status": "FINISHED", "submitted_time": "2023-10-16T12:54:17Z", "model_type": "\ud83d\udd36 : fine-tuned", "job_id": "518794", "license": "?", "likes": 0, "params": 13.016}
\ No newline at end of file
diff --git a/eval-queue/Enno-Ai/vigogne2-enno-13b-sft-lora-4bit_eval_request_False_4bit_Original.json b/eval-queue/Enno-Ai/vigogne2-enno-13b-sft-lora-4bit_eval_request_False_4bit_Original.json
new file mode 100644
index 0000000000000000000000000000000000000000..aedf68576aef7228dcc6bfd3307353dbe60bdd52
--- /dev/null
+++ b/eval-queue/Enno-Ai/vigogne2-enno-13b-sft-lora-4bit_eval_request_False_4bit_Original.json
@@ -0,0 +1 @@
+{"model": "Enno-Ai/vigogne2-enno-13b-sft-lora-4bit", "base_model": "", "revision": "main", "private": false, "precision": "4bit", "weight_type": "Original", "status": "FINISHED", "submitted_time": "2023-10-16T12:46:18Z", "model_type": "\u2b55 : instruction-tuned", "job_id": "514428", "license": "?", "likes": 0, "params": 12.852}
\ No newline at end of file
diff --git a/eval-queue/Faradaylab/ARIA-70B-V2_eval_request_False_float16_Original.json b/eval-queue/Faradaylab/ARIA-70B-V2_eval_request_False_float16_Original.json
new file mode 100644
index 0000000000000000000000000000000000000000..d936d333cb956b02833a8d9d6231b94f2548592c
--- /dev/null
+++ b/eval-queue/Faradaylab/ARIA-70B-V2_eval_request_False_float16_Original.json
@@ -0,0 +1 @@
+{"model": "Faradaylab/ARIA-70B-V2", "base_model": "", "revision": "main", "private": false, "precision": "float16", "weight_type": "Original", "status": "FINISHED", "submitted_time": "2023-10-16T12:48:18Z", "model_type": "\ud83d\udd36 : fine-tuned", "job_id": "517573", "license": "llama2", "likes": 7, "params": 68.715}
\ No newline at end of file
diff --git a/eval-queue/Faradaylab/ARIA-70B-V3_eval_request_False_float16_Original.json b/eval-queue/Faradaylab/ARIA-70B-V3_eval_request_False_float16_Original.json
new file mode 100644
index 0000000000000000000000000000000000000000..d42c0b84d1b4e134a20293e3f937b2151a6c576f
--- /dev/null
+++ b/eval-queue/Faradaylab/ARIA-70B-V3_eval_request_False_float16_Original.json
@@ -0,0 +1 @@
+{"model": "Faradaylab/ARIA-70B-V3", "base_model": "", "revision": "main", "private": false, "precision": "float16", "weight_type": "Original", "status": "FINISHED", "submitted_time": "2023-10-16T12:48:18Z", "model_type": "\ud83d\udd36 : fine-tuned", "job_id": "517985", "license": "other", "likes": 0, "params": 68.977}
\ No newline at end of file
diff --git a/eval-queue/Faradaylab/Aria-70B_eval_request_False_float16_Original.json b/eval-queue/Faradaylab/Aria-70B_eval_request_False_float16_Original.json
new file mode 100644
index 0000000000000000000000000000000000000000..9888d33c76239e29678d8c9ce9ba1630b0b9fb1e
--- /dev/null
+++ b/eval-queue/Faradaylab/Aria-70B_eval_request_False_float16_Original.json
@@ -0,0 +1 @@
+{"model": "Faradaylab/Aria-70B", "base_model": "petra", "revision": "main", "private": false, "precision": "float16", "weight_type": "Original", "status": "FINISHED", "submitted_time": "2023-10-16T12:48:18Z", "model_type": "\ud83d\udd36 : fine-tuned", "job_id": "517981", "params": 70.0, "license": "?", "likes": 0}
\ No newline at end of file
diff --git a/eval-queue/Felladrin/TinyMistral-248M-SFT-v3_eval_request_False_float16_Original.json b/eval-queue/Felladrin/TinyMistral-248M-SFT-v3_eval_request_False_float16_Original.json
new file mode 100644
index 0000000000000000000000000000000000000000..1efc92c94c68439c935f5a75454ae5bf64178626
--- /dev/null
+++ b/eval-queue/Felladrin/TinyMistral-248M-SFT-v3_eval_request_False_float16_Original.json
@@ -0,0 +1,16 @@
+{
+    "model": "Felladrin/TinyMistral-248M-SFT-v3",
+    "base_model": "",
+    "revision": "main",
+    "private": false,
+    "precision": "float16",
+    "weight_type": "Original",
+    "status": "FINISHED",
+    "submitted_time": "2023-12-03T18:41:37Z",
+    "model_type": "\u2b55 : instruction-tuned",
+    "likes": 12,
+    "params": 0.248,
+    "license": "apache-2.0",
+    "job_id": "847298",
+    "job_start_time": "2023-12-04T17:11:32.208540"
+}
\ No newline at end of file
diff --git a/eval-queue/Felladrin/TinyMistral-248M-SFT-v4_eval_request_False_float16_Original.json b/eval-queue/Felladrin/TinyMistral-248M-SFT-v4_eval_request_False_float16_Original.json
new file mode 100644
index 0000000000000000000000000000000000000000..3fcb4e2a6e561313e8b9f103ffdb446ab5161587
--- /dev/null
+++ b/eval-queue/Felladrin/TinyMistral-248M-SFT-v4_eval_request_False_float16_Original.json
@@ -0,0 +1,16 @@
+{
+    "model": "Felladrin/TinyMistral-248M-SFT-v4",
+    "base_model": "",
+    "revision": "main",
+    "private": false,
+    "precision": "float16",
+    "weight_type": "Original",
+    "status": "FINISHED",
+    "submitted_time": "2023-12-11T20:12:01Z",
+    "model_type": "\u2b55 : instruction-tuned",
+    "likes": 12,
+    "params": 0.248,
+    "license": "apache-2.0",
+    "job_id": "896990",
+    "job_start_time": "2023-12-12T01:33:31.602781"
+}
\ No newline at end of file
diff --git a/eval-queue/GreenNode/GreenNodeLM-7B-v1olet_eval_request_False_float16_Original.json b/eval-queue/GreenNode/GreenNodeLM-7B-v1olet_eval_request_False_float16_Original.json
new file mode 100644
index 0000000000000000000000000000000000000000..765515f5cbb26f6b2cde04e930f3b3bbbe9c9c2d
--- /dev/null
+++ b/eval-queue/GreenNode/GreenNodeLM-7B-v1olet_eval_request_False_float16_Original.json
@@ -0,0 +1,16 @@
+{
+    "model": "GreenNode/GreenNodeLM-7B-v1olet",
+    "base_model": "",
+    "revision": "main",
+    "private": false,
+    "precision": "float16",
+    "weight_type": "Original",
+    "status": "FINISHED",
+    "submitted_time": "2023-12-13T15:43:43Z",
+    "model_type": "\ud83d\udfe6 : RL-tuned",
+    "likes": 3,
+    "params": 7.0,
+    "license": "apache-2.0",
+    "job_id": "916590",
+    "job_start_time": "2023-12-13T15:57:05.150396"
+}
\ No newline at end of file
diff --git a/eval-queue/GreenNode/GreenNodeLM-7B-v2leo_eval_request_False_float16_Original.json b/eval-queue/GreenNode/GreenNodeLM-7B-v2leo_eval_request_False_float16_Original.json
new file mode 100644
index 0000000000000000000000000000000000000000..32eeae24cf3ba71e7ca256cdc981529712da822c
--- /dev/null
+++ b/eval-queue/GreenNode/GreenNodeLM-7B-v2leo_eval_request_False_float16_Original.json
@@ -0,0 +1,16 @@
+{
+    "model": "GreenNode/GreenNodeLM-7B-v2leo",
+    "base_model": "",
+    "revision": "main",
+    "private": false,
+    "precision": "float16",
+    "weight_type": "Original",
+    "status": "FINISHED",
+    "submitted_time": "2023-12-15T05:44:30Z",
+    "model_type": "\ud83d\udfe6 : RL-tuned",
+    "likes": 0,
+    "params": 7.242,
+    "license": "apache-2.0",
+    "job_id": "924678",
+    "job_start_time": "2023-12-16T13:14:16.916759"
+}
\ No newline at end of file
diff --git a/eval-queue/GreenNode/GreenNodeLM-7B-v4leo_eval_request_False_float16_Original.json b/eval-queue/GreenNode/GreenNodeLM-7B-v4leo_eval_request_False_float16_Original.json
new file mode 100644
index 0000000000000000000000000000000000000000..c8c6073096e9b1998a764cbaaf52c200f07754b7
--- /dev/null
+++ b/eval-queue/GreenNode/GreenNodeLM-7B-v4leo_eval_request_False_float16_Original.json
@@ -0,0 +1,16 @@
+{
+    "model": "GreenNode/GreenNodeLM-7B-v4leo",
+    "base_model": "",
+    "revision": "main",
+    "private": false,
+    "precision": "float16",
+    "weight_type": "Original",
+    "status": "FINISHED",
+    "submitted_time": "2023-12-16T18:24:40Z",
+    "model_type": "\ud83d\udfe6 : RL-tuned",
+    "likes": 0,
+    "params": 7.242,
+    "license": "apache-2.0",
+    "job_id": "925914",
+    "job_start_time": "2023-12-16T18:25:34.421459"
+}
\ No newline at end of file
diff --git a/eval-queue/GreenNode/GreenNodeLM-v3olet-7B_eval_request_False_float16_Original.json b/eval-queue/GreenNode/GreenNodeLM-v3olet-7B_eval_request_False_float16_Original.json
new file mode 100644
index 0000000000000000000000000000000000000000..3ab268c8f7bf937a6a85b5d8fefc3b8217dc1a43
--- /dev/null
+++ b/eval-queue/GreenNode/GreenNodeLM-v3olet-7B_eval_request_False_float16_Original.json
@@ -0,0 +1,16 @@
+{
+    "model": "GreenNode/GreenNodeLM-v3olet-7B",
+    "base_model": "",
+    "revision": "main",
+    "private": false,
+    "precision": "float16",
+    "weight_type": "Original",
+    "status": "FINISHED",
+    "submitted_time": "2023-12-16T18:48:31Z",
+    "model_type": "\ud83d\udd36 : fine-tuned",
+    "likes": 0,
+    "params": 7.0,
+    "license": "apache-2.0",
+    "job_id": "925932",
+    "job_start_time": "2023-12-16T18:50:33.086256"
+}
\ No newline at end of file
diff --git a/eval-queue/GreenNode/GreenNodeLM-yi-34B-sft_eval_request_False_float16_Original.json b/eval-queue/GreenNode/GreenNodeLM-yi-34B-sft_eval_request_False_float16_Original.json
new file mode 100644
index 0000000000000000000000000000000000000000..344430371b414f43e0e05fec2d6dcf92f18bd832
--- /dev/null
+++ b/eval-queue/GreenNode/GreenNodeLM-yi-34B-sft_eval_request_False_float16_Original.json
@@ -0,0 +1,16 @@
+{
+    "model": "GreenNode/GreenNodeLM-yi-34B-sft",
+    "base_model": "",
+    "revision": "main",
+    "private": false,
+    "precision": "float16",
+    "weight_type": "Original",
+    "status": "FAILED",
+    "submitted_time": "2023-12-15T17:50:10Z",
+    "model_type": "\u2b55 : instruction-tuned",
+    "likes": 0,
+    "params": 34.0,
+    "license": "apache-2.0",
+    "job_id": "924718",
+    "job_start_time": "2023-12-16T13:59:40.261000"
+}
\ No newline at end of file
diff --git a/eval-queue/GreenNode/Merged-DPO-7B_eval_request_False_float16_Original.json b/eval-queue/GreenNode/Merged-DPO-7B_eval_request_False_float16_Original.json
new file mode 100644
index 0000000000000000000000000000000000000000..db887f36cf63b5fcf99c8b52005f4ac3ad28636d
--- /dev/null
+++ b/eval-queue/GreenNode/Merged-DPO-7B_eval_request_False_float16_Original.json
@@ -0,0 +1,16 @@
+{
+    "model": "GreenNode/Merged-DPO-7B",
+    "base_model": "",
+    "revision": "main",
+    "private": false,
+    "precision": "float16",
+    "weight_type": "Original",
+    "status": "FINISHED",
+    "submitted_time": "2023-12-12T11:59:24Z",
+    "model_type": "\ud83d\udfe6 : RL-tuned",
+    "likes": 0,
+    "params": 7.0,
+    "license": "apache-2.0",
+    "job_id": "901221",
+    "job_start_time": "2023-12-12T12:00:13.835978"
+}
\ No newline at end of file
diff --git a/eval-queue/HyperbeeAI/Tulpar-7b-v0_eval_request_False_float16_Original.json b/eval-queue/HyperbeeAI/Tulpar-7b-v0_eval_request_False_float16_Original.json
new file mode 100644
index 0000000000000000000000000000000000000000..0a021c1a88a11547ed4c4c6c9aee0a61a8465d15
--- /dev/null
+++ b/eval-queue/HyperbeeAI/Tulpar-7b-v0_eval_request_False_float16_Original.json
@@ -0,0 +1 @@
+{"model": "HyperbeeAI/Tulpar-7b-v0", "base_model": "", "revision": "main", "private": false, "precision": "float16", "weight_type": "Original", "status": "FINISHED", "submitted_time": "2023-09-09T10:52:17Z", "model_type": "\ud83d\udd36 : fine-tuned", "job_id": "460914", "license": "llama2", "likes": 22, "params": 6.607}
\ No newline at end of file
diff --git a/eval-queue/HyperbeeAI/Tulpar-7b-v1_eval_request_False_float16_Original.json b/eval-queue/HyperbeeAI/Tulpar-7b-v1_eval_request_False_float16_Original.json
new file mode 100644
index 0000000000000000000000000000000000000000..781d09cc11eea1b7519d37e0aa9580255c19a9cc
--- /dev/null
+++ b/eval-queue/HyperbeeAI/Tulpar-7b-v1_eval_request_False_float16_Original.json
@@ -0,0 +1 @@
+{"model": "HyperbeeAI/Tulpar-7b-v1", "base_model": "", "revision": "main", "private": false, "precision": "float16", "weight_type": "Original", "status": "FINISHED", "submitted_time": "2023-10-16T12:46:18Z", "model_type": "\ud83d\udd36 : fine-tuned", "job_id": "514399", "license": "llama2", "likes": 1, "params": 6.607}
\ No newline at end of file
diff --git a/eval-queue/HyperbeeAI/Tulpar-7b-v2_eval_request_False_bfloat16_Original.json b/eval-queue/HyperbeeAI/Tulpar-7b-v2_eval_request_False_bfloat16_Original.json
new file mode 100644
index 0000000000000000000000000000000000000000..1d16be0d7e04ee49821153473c36e1d478a53c2c
--- /dev/null
+++ b/eval-queue/HyperbeeAI/Tulpar-7b-v2_eval_request_False_bfloat16_Original.json
@@ -0,0 +1,16 @@
+{
+    "model": "HyperbeeAI/Tulpar-7b-v2",
+    "base_model": "",
+    "revision": "main",
+    "private": false,
+    "precision": "bfloat16",
+    "weight_type": "Original",
+    "status": "FINISHED",
+    "submitted_time": "2023-12-05T07:57:11Z",
+    "model_type": "\ud83d\udd36 : fine-tuned",
+    "likes": 0,
+    "params": 7.242,
+    "license": "apache-2.0",
+    "job_id": "858033",
+    "job_start_time": "2023-12-06T16:17:35.997560"
+}
\ No newline at end of file
diff --git a/eval-queue/IDEA-CCNL/Ziya-LLaMA-13B-Pretrain-v1_eval_request_False_False_False.json b/eval-queue/IDEA-CCNL/Ziya-LLaMA-13B-Pretrain-v1_eval_request_False_False_False.json
new file mode 100644
index 0000000000000000000000000000000000000000..7781a0d2b100ea1f9aa834a7cf4b16f293695e47
--- /dev/null
+++ b/eval-queue/IDEA-CCNL/Ziya-LLaMA-13B-Pretrain-v1_eval_request_False_False_False.json
@@ -0,0 +1 @@
+{"model": "IDEA-CCNL/Ziya-LLaMA-13B-Pretrain-v1", "base_model": "", "revision": "main", "private": false, "status": "FINISHED", "job_id": "495041", "weight_type": "Original", "precision": "float16", "model_type": "fine-tuned", "submitted_time": "2023-09-09T10:52:17Z", "license": "gpl-3.0", "likes": 20, "params": 12.89}
\ No newline at end of file
diff --git a/eval-queue/IDEA-CCNL/Ziya-LLaMA-13B-Pretrain-v1_eval_request_False_False_True.json b/eval-queue/IDEA-CCNL/Ziya-LLaMA-13B-Pretrain-v1_eval_request_False_False_True.json
new file mode 100644
index 0000000000000000000000000000000000000000..3cb399abcbc20ee224ecfd076cd6b4703d48b642
--- /dev/null
+++ b/eval-queue/IDEA-CCNL/Ziya-LLaMA-13B-Pretrain-v1_eval_request_False_False_True.json
@@ -0,0 +1 @@
+{"model": "IDEA-CCNL/Ziya-LLaMA-13B-Pretrain-v1", "base_model": "decapoda-research/llama-13b-hf", "revision": "main", "private": false, "status": "FINISHED", "job_id": "497958", "weight_type": "Delta", "precision": "float16", "model_type": "fine-tuned", "submitted_time": "2023-10-16T12:46:18Z", "license": "gpl-3.0", "likes": 20, "params": 12.89}
\ No newline at end of file
diff --git a/eval-queue/IDEA-CCNL/Ziya-LLaMA-13B-v1_eval_request_False_False_False.json b/eval-queue/IDEA-CCNL/Ziya-LLaMA-13B-v1_eval_request_False_False_False.json
new file mode 100644
index 0000000000000000000000000000000000000000..1a11bf6a7be03dd656c017c14c89e64e95a20428
--- /dev/null
+++ b/eval-queue/IDEA-CCNL/Ziya-LLaMA-13B-v1_eval_request_False_False_False.json
@@ -0,0 +1 @@
+{"model": "IDEA-CCNL/Ziya-LLaMA-13B-v1", "base_model": "", "revision": "main", "private": false, "status": "FINISHED", "job_id": "461898", "weight_type": "Original", "precision": "float16", "model_type": "fine-tuned", "submitted_time": "2023-10-16T12:46:18Z", "license": "gpl-3.0", "likes": 251, "params": 12.89}
\ No newline at end of file
diff --git a/eval-queue/Intel/gpt-j-6B-int8-dynamic_eval_request_False_False_False.json b/eval-queue/Intel/gpt-j-6B-int8-dynamic_eval_request_False_False_False.json
new file mode 100644
index 0000000000000000000000000000000000000000..c05bf0d409453e91df02508e0711b5593f59eea6
--- /dev/null
+++ b/eval-queue/Intel/gpt-j-6B-int8-dynamic_eval_request_False_False_False.json
@@ -0,0 +1 @@
+{"model": "Intel/gpt-j-6B-int8-dynamic", "base_model": "", "revision": "main", "private": false, "status": "FAILED_2", "job_id": "176564", "weight_type": "Original", "precision": "float16", "license": "apache-2.0", "likes": 16, "params": 5.844}
\ No newline at end of file
diff --git a/eval-queue/Intel/neural-chat-7b-v3-1_eval_request_False_4bit_Original.json b/eval-queue/Intel/neural-chat-7b-v3-1_eval_request_False_4bit_Original.json
new file mode 100644
index 0000000000000000000000000000000000000000..2f6481e5996e83b0dcb165b18e06337231ec820a
--- /dev/null
+++ b/eval-queue/Intel/neural-chat-7b-v3-1_eval_request_False_4bit_Original.json
@@ -0,0 +1 @@
+{"model": "Intel/neural-chat-7b-v3-1", "base_model": "", "revision": "main", "private": false, "precision": "4bit", "weight_type": "Original", "status": "FINISHED", "submitted_time": "2023-11-15T12:56:15Z", "model_type": "\ud83d\udd36 : fine-tuned", "likes": 12, "params": 7.0, "license": "apache-2.0", "job_id": "697794"}
\ No newline at end of file
diff --git a/eval-queue/Intel/neural-chat-7b-v3-1_eval_request_False_8bit_Original.json b/eval-queue/Intel/neural-chat-7b-v3-1_eval_request_False_8bit_Original.json
new file mode 100644
index 0000000000000000000000000000000000000000..f89518b95120f99e4047902083debf3187fbbf79
--- /dev/null
+++ b/eval-queue/Intel/neural-chat-7b-v3-1_eval_request_False_8bit_Original.json
@@ -0,0 +1 @@
+{"model": "Intel/neural-chat-7b-v3-1", "base_model": "", "revision": "main", "private": false, "precision": "8bit", "weight_type": "Original", "status": "FINISHED", "submitted_time": "2023-11-15T12:55:37Z", "model_type": "\ud83d\udd36 : fine-tuned", "likes": 12, "params": 7.0, "license": "apache-2.0", "job_id": "697788"}
\ No newline at end of file
diff --git a/eval-queue/Intel/neural-chat-7b-v3-1_eval_request_False_GPTQ_Original.json b/eval-queue/Intel/neural-chat-7b-v3-1_eval_request_False_GPTQ_Original.json
new file mode 100644
index 0000000000000000000000000000000000000000..ec8f07eee1da98f5ce928e7f128820dc69f080b8
--- /dev/null
+++ b/eval-queue/Intel/neural-chat-7b-v3-1_eval_request_False_GPTQ_Original.json
@@ -0,0 +1 @@
+{"model": "Intel/neural-chat-7b-v3-1", "base_model": "", "revision": "main", "private": false, "precision": "GPTQ", "weight_type": "Original", "status": "FAILED", "submitted_time": "2023-11-15T12:55:52Z", "model_type": "\ud83d\udd36 : fine-tuned", "likes": 12, "params": 56.0, "license": "apache-2.0", "job_id": null}
\ No newline at end of file
diff --git a/eval-queue/Intel/neural-chat-7b-v3-1_eval_request_False_bfloat16_Original.json b/eval-queue/Intel/neural-chat-7b-v3-1_eval_request_False_bfloat16_Original.json
new file mode 100644
index 0000000000000000000000000000000000000000..fd57a17f2900525141ebdec33eaaa0921d96bb99
--- /dev/null
+++ b/eval-queue/Intel/neural-chat-7b-v3-1_eval_request_False_bfloat16_Original.json
@@ -0,0 +1 @@
+{"model": "Intel/neural-chat-7b-v3-1", "base_model": "", "revision": "main", "private": false, "precision": "bfloat16", "weight_type": "Original", "status": "FINISHED", "submitted_time": "2023-11-15T11:55:02Z", "model_type": "\ud83d\udd36 : fine-tuned", "likes": 12, "params": 7.0, "license": "apache-2.0", "job_id": "697784"}
\ No newline at end of file
diff --git a/eval-queue/Intel/neural-chat-7b-v3-1_eval_request_False_float16_Original.json b/eval-queue/Intel/neural-chat-7b-v3-1_eval_request_False_float16_Original.json
new file mode 100644
index 0000000000000000000000000000000000000000..35f1174d22ea1a9e321ee1f20f852d77a8bbb673
--- /dev/null
+++ b/eval-queue/Intel/neural-chat-7b-v3-1_eval_request_False_float16_Original.json
@@ -0,0 +1 @@
+{"model": "Intel/neural-chat-7b-v3-1", "base_model": "", "revision": "main", "private": false, "precision": "float16", "weight_type": "Original", "status": "FINISHED", "submitted_time": "2023-11-15T11:51:07Z", "model_type": "\ud83d\udd36 : fine-tuned", "likes": 12, "params": 7.0, "license": "apache-2.0", "job_id": "697781"}
\ No newline at end of file
diff --git a/eval-queue/Intel/neural-chat-7b-v3-2_eval_request_False_float16_Original.json b/eval-queue/Intel/neural-chat-7b-v3-2_eval_request_False_float16_Original.json
new file mode 100644
index 0000000000000000000000000000000000000000..75a74636c169ffbc6ccf56f8ddae06160e3f846e
--- /dev/null
+++ b/eval-queue/Intel/neural-chat-7b-v3-2_eval_request_False_float16_Original.json
@@ -0,0 +1,16 @@
+{
+    "model": "Intel/neural-chat-7b-v3-2",
+    "base_model": "",
+    "revision": "main",
+    "private": false,
+    "precision": "float16",
+    "weight_type": "Original",
+    "status": "FINISHED",
+    "submitted_time": "2023-11-30T15:31:24Z",
+    "model_type": "\ud83d\udd36 : fine-tuned",
+    "likes": 5,
+    "params": 7.0,
+    "license": "apache-2.0",
+    "job_id": "845986",
+    "job_start_time": "2023-12-04T13:56:29.846345"
+}
\ No newline at end of file
diff --git a/eval-queue/Intel/neural-chat-7b-v3-3-Slerp_eval_request_False_float16_Original.json b/eval-queue/Intel/neural-chat-7b-v3-3-Slerp_eval_request_False_float16_Original.json
new file mode 100644
index 0000000000000000000000000000000000000000..62d339d3fb730d3b5378da064550110bf88dc163
--- /dev/null
+++ b/eval-queue/Intel/neural-chat-7b-v3-3-Slerp_eval_request_False_float16_Original.json
@@ -0,0 +1,16 @@
+{
+    "model": "Intel/neural-chat-7b-v3-3-Slerp",
+    "base_model": "",
+    "revision": "main",
+    "private": false,
+    "precision": "float16",
+    "weight_type": "Original",
+    "status": "FINISHED",
+    "submitted_time": "2023-12-10T15:53:32Z",
+    "model_type": "\ud83d\udd36 : fine-tuned",
+    "likes": 1,
+    "params": 7.0,
+    "license": "apache-2.0",
+    "job_id": "886312",
+    "job_start_time": "2023-12-10T15:53:53.029812"
+}
\ No newline at end of file
diff --git a/eval-queue/Intel/neural-chat-7b-v3-3_eval_request_False_float16_Original.json b/eval-queue/Intel/neural-chat-7b-v3-3_eval_request_False_float16_Original.json
new file mode 100644
index 0000000000000000000000000000000000000000..9a3728d624bbf97218abd1f6ebd6573591d99368
--- /dev/null
+++ b/eval-queue/Intel/neural-chat-7b-v3-3_eval_request_False_float16_Original.json
@@ -0,0 +1,16 @@
+{
+    "model": "Intel/neural-chat-7b-v3-3",
+    "base_model": "",
+    "revision": "main",
+    "private": false,
+    "precision": "float16",
+    "weight_type": "Original",
+    "status": "FINISHED",
+    "submitted_time": "2023-12-09T17:15:48Z",
+    "model_type": "\ud83d\udd36 : fine-tuned",
+    "likes": 0,
+    "params": 7.0,
+    "license": "apache-2.0",
+    "job_id": "875168",
+    "job_start_time": "2023-12-09T18:30:39.369064"
+}
\ No newline at end of file
diff --git a/eval-queue/Intel/neural-chat-7b-v3_eval_request_False_float16_Original.json b/eval-queue/Intel/neural-chat-7b-v3_eval_request_False_float16_Original.json
new file mode 100644
index 0000000000000000000000000000000000000000..4afbacc33a0ff24cff2b42284e0934dbd228f340
--- /dev/null
+++ b/eval-queue/Intel/neural-chat-7b-v3_eval_request_False_float16_Original.json
@@ -0,0 +1 @@
+{"model": "Intel/neural-chat-7b-v3", "base_model": "", "revision": "main", "private": false, "precision": "float16", "weight_type": "Original", "status": "FINISHED", "submitted_time": "2023-11-14T04:41:10Z", "model_type": "\ud83d\udd36 : fine-tuned", "likes": 2, "params": 7.0, "license": "apache-2.0", "job_id": "650750"}
\ No newline at end of file
diff --git a/eval-queue/Jiayi-Pan/Tiny-Vicuna-1B_eval_request_False_float16_Original.json b/eval-queue/Jiayi-Pan/Tiny-Vicuna-1B_eval_request_False_float16_Original.json
new file mode 100644
index 0000000000000000000000000000000000000000..89a2ed7b71091e3388744e1d69ca9d4d8511645f
--- /dev/null
+++ b/eval-queue/Jiayi-Pan/Tiny-Vicuna-1B_eval_request_False_float16_Original.json
@@ -0,0 +1,16 @@
+{
+    "model": "Jiayi-Pan/Tiny-Vicuna-1B",
+    "base_model": "",
+    "revision": "main",
+    "private": false,
+    "precision": "float16",
+    "weight_type": "Original",
+    "status": "FINISHED",
+    "submitted_time": "2023-11-25T01:41:41Z",
+    "model_type": "\u2b55 : instruction-tuned",
+    "likes": 0,
+    "params": 1.1,
+    "license": "apache-2.0",
+    "job_id": "800141",
+    "job_start_time": "2023-11-27T13:48:57.324959"
+}
\ No newline at end of file
diff --git a/eval-queue/JosephusCheung/Guanaco_eval_request_False_False_False.json b/eval-queue/JosephusCheung/Guanaco_eval_request_False_False_False.json
new file mode 100644
index 0000000000000000000000000000000000000000..7762d076202dd0cf6e9d8cfadd89672376fe6a77
--- /dev/null
+++ b/eval-queue/JosephusCheung/Guanaco_eval_request_False_False_False.json
@@ -0,0 +1 @@
+{"model": "JosephusCheung/Guanaco", "base_model": "", "revision": "main", "private": false, "status": "FINISHED", "weight_type": "Original", "precision": "float16", "submitted_time": "2023-10-16T12:48:18Z", "job_id": "472551", "license": "gpl-3.0", "likes": 213, "params": 6.607}
\ No newline at end of file
diff --git a/eval-queue/JosephusCheung/LL7M_eval_request_False_bfloat16_Original.json b/eval-queue/JosephusCheung/LL7M_eval_request_False_bfloat16_Original.json
new file mode 100644
index 0000000000000000000000000000000000000000..960dc7a5ff68053ae2ba9b440ceabc5c94c3ff6c
--- /dev/null
+++ b/eval-queue/JosephusCheung/LL7M_eval_request_False_bfloat16_Original.json
@@ -0,0 +1 @@
+{"model": "JosephusCheung/LL7M", "base_model": "", "revision": "main", "private": false, "precision": "bfloat16", "weight_type": "Original", "status": "FINISHED", "submitted_time": "2023-10-16T12:46:18Z", "model_type": "\ud83d\udd36 : fine-tuned", "job_id": "515410", "license": "cc-by-nc-nd-4.0", "likes": 34, "params": 6.638}
\ No newline at end of file
diff --git a/eval-queue/JosephusCheung/Pwen-14B-Chat-20_30_eval_request_False_bfloat16_Original.json b/eval-queue/JosephusCheung/Pwen-14B-Chat-20_30_eval_request_False_bfloat16_Original.json
new file mode 100644
index 0000000000000000000000000000000000000000..1e61fbffb0197e92c51cab79034bdb29cd805bd4
--- /dev/null
+++ b/eval-queue/JosephusCheung/Pwen-14B-Chat-20_30_eval_request_False_bfloat16_Original.json
@@ -0,0 +1 @@
+{"model": "JosephusCheung/Pwen-14B-Chat-20_30", "base_model": "", "revision": "main", "private": false, "precision": "bfloat16", "weight_type": "Original", "status": "FINISHED", "submitted_time": "2023-10-16T12:54:17Z", "model_type": "\ud83d\udd36 : fine-tuned", "job_id": "519999", "params": 14.0, "license": "?", "likes": 0}
\ No newline at end of file
diff --git a/eval-queue/JosephusCheung/Pwen-7B-Chat-20_30_eval_request_False_bfloat16_Original.json b/eval-queue/JosephusCheung/Pwen-7B-Chat-20_30_eval_request_False_bfloat16_Original.json
new file mode 100644
index 0000000000000000000000000000000000000000..aefdaa049f3afa279c395080c0e8b75534ae61fa
--- /dev/null
+++ b/eval-queue/JosephusCheung/Pwen-7B-Chat-20_30_eval_request_False_bfloat16_Original.json
@@ -0,0 +1 @@
+{"model": "JosephusCheung/Pwen-7B-Chat-20_30", "base_model": "", "revision": "main", "private": false, "precision": "bfloat16", "weight_type": "Original", "status": "FINISHED", "submitted_time": "2023-10-16T12:54:17Z", "model_type": "\ud83d\udd36 : fine-tuned", "job_id": "518372", "params": 7.0, "license": "?", "likes": 0}
\ No newline at end of file
diff --git a/eval-queue/JosephusCheung/Pwen-VL-Chat-20_30_eval_request_False_bfloat16_Original.json b/eval-queue/JosephusCheung/Pwen-VL-Chat-20_30_eval_request_False_bfloat16_Original.json
new file mode 100644
index 0000000000000000000000000000000000000000..92c61b215a68d832217c2221e76e6c2a1c71a14c
--- /dev/null
+++ b/eval-queue/JosephusCheung/Pwen-VL-Chat-20_30_eval_request_False_bfloat16_Original.json
@@ -0,0 +1 @@
+{"model": "JosephusCheung/Pwen-VL-Chat-20_30", "base_model": "", "revision": "main", "private": false, "precision": "bfloat16", "weight_type": "Original", "status": "FINISHED", "submitted_time": "2023-11-06T10:31:15Z", "model_type": "\ud83d\udd36 : fine-tuned", "job_id": "631812", "params": 0, "license": "?", "likes": 0}
\ No newline at end of file
diff --git a/eval-queue/JosephusCheung/Qwen-LLaMAfied-7B-Chat_eval_request_False_float16_Original.json b/eval-queue/JosephusCheung/Qwen-LLaMAfied-7B-Chat_eval_request_False_float16_Original.json
new file mode 100644
index 0000000000000000000000000000000000000000..5d93311f9359c835190a8f76d43206c70491e67f
--- /dev/null
+++ b/eval-queue/JosephusCheung/Qwen-LLaMAfied-7B-Chat_eval_request_False_float16_Original.json
@@ -0,0 +1 @@
+{"model": "JosephusCheung/Qwen-LLaMAfied-7B-Chat", "base_model": "", "revision": "main", "private": false, "precision": "float16", "weight_type": "Original", "status": "FINISHED", "submitted_time": "2023-10-16T13:27:38Z", "model_type": "\ud83d\udd36 : fine-tuned", "job_id": "523013", "params": 7.0, "license": "?", "likes": 0}
\ No newline at end of file
diff --git a/eval-queue/JosephusCheung/Qwen-VL-LLaMAfied-7B-Chat_eval_request_False_bfloat16_Original.json b/eval-queue/JosephusCheung/Qwen-VL-LLaMAfied-7B-Chat_eval_request_False_bfloat16_Original.json
new file mode 100644
index 0000000000000000000000000000000000000000..9d02bc6c73dc42a3368b2b2cde4e10ab9cc1cb16
--- /dev/null
+++ b/eval-queue/JosephusCheung/Qwen-VL-LLaMAfied-7B-Chat_eval_request_False_bfloat16_Original.json
@@ -0,0 +1 @@
+{"model": "JosephusCheung/Qwen-VL-LLaMAfied-7B-Chat", "base_model": "", "revision": "main", "private": false, "precision": "bfloat16", "weight_type": "Original", "status": "FINISHED", "submitted_time": "2023-11-06T10:31:15Z", "model_type": "\ud83d\udd36 : fine-tuned", "job_id": "633074", "params": 7.0, "license": "?", "likes": 0}
\ No newline at end of file
diff --git a/eval-queue/JosephusCheung/Yee-34B-200K-Chat_eval_request_False_bfloat16_Original.json b/eval-queue/JosephusCheung/Yee-34B-200K-Chat_eval_request_False_bfloat16_Original.json
new file mode 100644
index 0000000000000000000000000000000000000000..844baa990c61dd10451d9f2b77b9e04a24643ea5
--- /dev/null
+++ b/eval-queue/JosephusCheung/Yee-34B-200K-Chat_eval_request_False_bfloat16_Original.json
@@ -0,0 +1,16 @@
+{
+    "model": "JosephusCheung/Yee-34B-200K-Chat",
+    "base_model": "",
+    "revision": "main",
+    "private": false,
+    "precision": "bfloat16",
+    "weight_type": "Original",
+    "status": "FINISHED",
+    "submitted_time": "2023-12-04T08:54:00Z",
+    "model_type": "\ud83d\udd36 : fine-tuned",
+    "likes": 0,
+    "params": 34.0,
+    "license": "gpl-3.0",
+    "job_id": "847421",
+    "job_start_time": "2023-12-04T18:32:46.003914"
+}
\ No newline at end of file
diff --git a/eval-queue/Korabbit/Llama-2-7b-chat-hf-afr-100step-flan-v2_eval_request_False_float16_Original.json b/eval-queue/Korabbit/Llama-2-7b-chat-hf-afr-100step-flan-v2_eval_request_False_float16_Original.json
new file mode 100644
index 0000000000000000000000000000000000000000..2ff4a1760a7ac2e981d487ebc704bf73a0f698e1
--- /dev/null
+++ b/eval-queue/Korabbit/Llama-2-7b-chat-hf-afr-100step-flan-v2_eval_request_False_float16_Original.json
@@ -0,0 +1,16 @@
+{
+    "model": "Korabbit/Llama-2-7b-chat-hf-afr-100step-flan-v2",
+    "base_model": "",
+    "revision": "main",
+    "private": false,
+    "precision": "float16",
+    "weight_type": "Original",
+    "status": "FINISHED",
+    "submitted_time": "2023-12-04T12:44:42Z",
+    "model_type": "\u2b55 : instruction-tuned",
+    "likes": 0,
+    "params": 7.0,
+    "license": "llama2",
+    "job_id": "856895",
+    "job_start_time": "2023-12-06T14:44:02.242839"
+}
\ No newline at end of file
diff --git a/eval-queue/Korabbit/Llama-2-7b-chat-hf-afr-100step-flan_eval_request_False_float16_Original.json b/eval-queue/Korabbit/Llama-2-7b-chat-hf-afr-100step-flan_eval_request_False_float16_Original.json
new file mode 100644
index 0000000000000000000000000000000000000000..146dbc49d59ee5407a3481af9045f657651f964a
--- /dev/null
+++ b/eval-queue/Korabbit/Llama-2-7b-chat-hf-afr-100step-flan_eval_request_False_float16_Original.json
@@ -0,0 +1,16 @@
+{
+    "model": "Korabbit/Llama-2-7b-chat-hf-afr-100step-flan",
+    "base_model": "",
+    "revision": "main",
+    "private": false,
+    "precision": "float16",
+    "weight_type": "Original",
+    "status": "FINISHED",
+    "submitted_time": "2023-11-30T11:40:16Z",
+    "model_type": "\u2b55 : instruction-tuned",
+    "likes": 0,
+    "params": 7.0,
+    "license": "llama2",
+    "job_id": "845770",
+    "job_start_time": "2023-12-04T09:27:54.123187"
+}
\ No newline at end of file
diff --git a/eval-queue/Korabbit/Llama-2-7b-chat-hf-afr-100step-v2_eval_request_False_float16_Original.json b/eval-queue/Korabbit/Llama-2-7b-chat-hf-afr-100step-v2_eval_request_False_float16_Original.json
new file mode 100644
index 0000000000000000000000000000000000000000..364dbdef769a8daf264090bf699db7343e0c0b7b
--- /dev/null
+++ b/eval-queue/Korabbit/Llama-2-7b-chat-hf-afr-100step-v2_eval_request_False_float16_Original.json
@@ -0,0 +1,16 @@
+{
+    "model": "Korabbit/Llama-2-7b-chat-hf-afr-100step-v2",
+    "base_model": "",
+    "revision": "main",
+    "private": false,
+    "precision": "float16",
+    "weight_type": "Original",
+    "status": "FINISHED",
+    "submitted_time": "2023-11-22T11:22:46Z",
+    "model_type": "\u2b55 : instruction-tuned",
+    "likes": 0,
+    "params": 7.0,
+    "license": "llama2",
+    "job_id": "758125",
+    "job_start_time": "2023-11-23T13:32:02.219932"
+}
\ No newline at end of file
diff --git a/eval-queue/Korabbit/Llama-2-7b-chat-hf-afr-200step-flan-v2_eval_request_False_float16_Original.json b/eval-queue/Korabbit/Llama-2-7b-chat-hf-afr-200step-flan-v2_eval_request_False_float16_Original.json
new file mode 100644
index 0000000000000000000000000000000000000000..4012d1a353f995df16412a220c10a32a47e304db
--- /dev/null
+++ b/eval-queue/Korabbit/Llama-2-7b-chat-hf-afr-200step-flan-v2_eval_request_False_float16_Original.json
@@ -0,0 +1,16 @@
+{
+    "model": "Korabbit/Llama-2-7b-chat-hf-afr-200step-flan-v2",
+    "base_model": "",
+    "revision": "main",
+    "private": false,
+    "precision": "float16",
+    "weight_type": "Original",
+    "status": "FINISHED",
+    "submitted_time": "2023-12-04T12:44:49Z",
+    "model_type": "\u2b55 : instruction-tuned",
+    "likes": 0,
+    "params": 7.0,
+    "license": "llama2",
+    "job_id": "856897",
+    "job_start_time": "2023-12-06T14:45:21.829893"
+}
\ No newline at end of file
diff --git a/eval-queue/Korabbit/Llama-2-7b-chat-hf-afr-200step-flan_eval_request_False_float16_Original.json b/eval-queue/Korabbit/Llama-2-7b-chat-hf-afr-200step-flan_eval_request_False_float16_Original.json
new file mode 100644
index 0000000000000000000000000000000000000000..8020cd2b365a8a1b84f4593ea523471403d2a97f
--- /dev/null
+++ b/eval-queue/Korabbit/Llama-2-7b-chat-hf-afr-200step-flan_eval_request_False_float16_Original.json
@@ -0,0 +1,16 @@
+{
+    "model": "Korabbit/Llama-2-7b-chat-hf-afr-200step-flan",
+    "base_model": "",
+    "revision": "main",
+    "private": false,
+    "precision": "float16",
+    "weight_type": "Original",
+    "status": "FINISHED",
+    "submitted_time": "2023-11-30T12:38:28Z",
+    "model_type": "\u2b55 : instruction-tuned",
+    "likes": 0,
+    "params": 7.0,
+    "license": "llama2",
+    "job_id": "845977",
+    "job_start_time": "2023-12-04T13:50:35.922388"
+}
\ No newline at end of file
diff --git a/eval-queue/Korabbit/Llama-2-7b-chat-hf-afr-200step-merged_eval_request_False_float16_Original.json b/eval-queue/Korabbit/Llama-2-7b-chat-hf-afr-200step-merged_eval_request_False_float16_Original.json
new file mode 100644
index 0000000000000000000000000000000000000000..6d2618787db11a10c3f2613d531395af37fdbfac
--- /dev/null
+++ b/eval-queue/Korabbit/Llama-2-7b-chat-hf-afr-200step-merged_eval_request_False_float16_Original.json
@@ -0,0 +1 @@
+{"model": "Korabbit/Llama-2-7b-chat-hf-afr-200step-merged", "base_model": "", "revision": "main", "private": false, "precision": "float16", "weight_type": "Original", "status": "FINISHED", "submitted_time": "2023-11-21T11:22:14Z", "model_type": "\u2b55 : instruction-tuned", "likes": 0, "params": 7.0, "license": "llama2", "job_id": "734725"}
\ No newline at end of file
diff --git a/eval-queue/Korabbit/Llama-2-7b-chat-hf-afr-200step-v2_eval_request_False_float16_Original.json b/eval-queue/Korabbit/Llama-2-7b-chat-hf-afr-200step-v2_eval_request_False_float16_Original.json
new file mode 100644
index 0000000000000000000000000000000000000000..a63433be020c1f8fc2a59f2602accfb6b5d42ed0
--- /dev/null
+++ b/eval-queue/Korabbit/Llama-2-7b-chat-hf-afr-200step-v2_eval_request_False_float16_Original.json
@@ -0,0 +1,16 @@
+{
+    "model": "Korabbit/Llama-2-7b-chat-hf-afr-200step-v2",
+    "base_model": "",
+    "revision": "main",
+    "private": false,
+    "precision": "float16",
+    "weight_type": "Original",
+    "status": "FINISHED",
+    "submitted_time": "2023-11-22T11:09:13Z",
+    "model_type": "\u2b55 : instruction-tuned",
+    "likes": 0,
+    "params": 7.0,
+    "license": "llama2",
+    "job_id": "755107",
+    "job_start_time": "2023-11-23T13:15:22.616395"
+}
\ No newline at end of file
diff --git a/eval-queue/Korabbit/Llama-2-7b-chat-hf-afr-200step_eval_request_False_4bit_Adapter.json b/eval-queue/Korabbit/Llama-2-7b-chat-hf-afr-200step_eval_request_False_4bit_Adapter.json
new file mode 100644
index 0000000000000000000000000000000000000000..3b61cdc5d68ecfe25b5a14492b59e6ed34211163
--- /dev/null
+++ b/eval-queue/Korabbit/Llama-2-7b-chat-hf-afr-200step_eval_request_False_4bit_Adapter.json
@@ -0,0 +1 @@
+{"model": "Korabbit/Llama-2-7b-chat-hf-afr-200step", "base_model": "meta-llama/Llama-2-7b-chat-hf", "revision": "main", "private": false, "precision": "4bit", "weight_type": "Adapter", "status": "FAILED", "submitted_time": "2023-11-19T09:34:55Z", "model_type": "\u2b55 : instruction-tuned", "likes": 0, "params": 7.0, "license": "llama2", "job_id": "704960"}
\ No newline at end of file
diff --git a/eval-queue/Korabbit/Llama-2-7b-chat-hf-afr-300step-flan-v2_eval_request_False_float16_Original.json b/eval-queue/Korabbit/Llama-2-7b-chat-hf-afr-300step-flan-v2_eval_request_False_float16_Original.json
new file mode 100644
index 0000000000000000000000000000000000000000..df689f395e4d0b17a1f0174ddb34dc2cc320963e
--- /dev/null
+++ b/eval-queue/Korabbit/Llama-2-7b-chat-hf-afr-300step-flan-v2_eval_request_False_float16_Original.json
@@ -0,0 +1,16 @@
+{
+    "model": "Korabbit/Llama-2-7b-chat-hf-afr-300step-flan-v2",
+    "base_model": "",
+    "revision": "main",
+    "private": false,
+    "precision": "float16",
+    "weight_type": "Original",
+    "status": "FINISHED",
+    "submitted_time": "2023-12-04T12:44:53Z",
+    "model_type": "\u2b55 : instruction-tuned",
+    "likes": 0,
+    "params": 7.0,
+    "license": "llama2",
+    "job_id": "856898",
+    "job_start_time": "2023-12-06T14:46:18.832968"
+}
\ No newline at end of file
diff --git a/eval-queue/Korabbit/Llama-2-7b-chat-hf-afr-441step-flan-v2_eval_request_False_float16_Original.json b/eval-queue/Korabbit/Llama-2-7b-chat-hf-afr-441step-flan-v2_eval_request_False_float16_Original.json
new file mode 100644
index 0000000000000000000000000000000000000000..9eb354148d83867d9b5a725b05be72411ffffffa
--- /dev/null
+++ b/eval-queue/Korabbit/Llama-2-7b-chat-hf-afr-441step-flan-v2_eval_request_False_float16_Original.json
@@ -0,0 +1,16 @@
+{
+    "model": "Korabbit/Llama-2-7b-chat-hf-afr-441step-flan-v2",
+    "base_model": "",
+    "revision": "main",
+    "private": false,
+    "precision": "float16",
+    "weight_type": "Original",
+    "status": "FINISHED",
+    "submitted_time": "2023-12-04T12:45:04Z",
+    "model_type": "\u2b55 : instruction-tuned",
+    "likes": 0,
+    "params": 7.0,
+    "license": "llama2",
+    "job_id": "857962",
+    "job_start_time": "2023-12-06T15:41:55.202448"
+}
\ No newline at end of file
diff --git a/eval-queue/Korabbit/llama-2-7b-chat-hf-afr-200steps_eval_request_False_float16_Adapter.json b/eval-queue/Korabbit/llama-2-7b-chat-hf-afr-200steps_eval_request_False_float16_Adapter.json
new file mode 100644
index 0000000000000000000000000000000000000000..bd53cb96577c15c9fda4fad3c5fc1ea09d83d686
--- /dev/null
+++ b/eval-queue/Korabbit/llama-2-7b-chat-hf-afr-200steps_eval_request_False_float16_Adapter.json
@@ -0,0 +1 @@
+{"model": "Korabbit/llama-2-7b-chat-hf-afr-200steps", "base_model": "meta-llama/Llama-2-7b-chat-hf", "revision": "main", "private": false, "precision": "float16", "weight_type": "Adapter", "status": "FAILED", "submitted_time": "2023-11-20T08:27:24Z", "model_type": "\u2b55 : instruction-tuned", "likes": 0, "params": 7.0, "license": "llama2", "job_id": "719426"}
\ No newline at end of file
diff --git a/eval-queue/Lazycuber/Janemalion-6B_eval_request_False_False_False.json b/eval-queue/Lazycuber/Janemalion-6B_eval_request_False_False_False.json
new file mode 100644
index 0000000000000000000000000000000000000000..d08e4c6c6fadbe729481121a6b5e11fe0fe2e6bb
--- /dev/null
+++ b/eval-queue/Lazycuber/Janemalion-6B_eval_request_False_False_False.json
@@ -0,0 +1 @@
+{"model": "Lazycuber/Janemalion-6B", "base_model": "GPT J", "revision": "main", "private": false, "status": "FINISHED", "submitted_time": "2023-11-06T10:31:15Z", "weight_type": "Original", "precision": "float16", "job_id": "632897", "model_type": "fine-tuned", "license": "mit", "likes": 1, "params": 5.844}
\ No newline at end of file
diff --git a/eval-queue/Lazycuber/L2-7b-Base-Guanaco-Uncensored_eval_request_False_float16_Original.json b/eval-queue/Lazycuber/L2-7b-Base-Guanaco-Uncensored_eval_request_False_float16_Original.json
new file mode 100644
index
0000000000000000000000000000000000000000..982a743feec3738c6e438b03f9076aeed410ba45 --- /dev/null +++ b/eval-queue/Lazycuber/L2-7b-Base-Guanaco-Uncensored_eval_request_False_float16_Original.json @@ -0,0 +1 @@ +{"model": "Lazycuber/L2-7b-Base-Guanaco-Uncensored", "base_model": "NousResearch/Llama-2-7b-hf", "revision": "main", "private": false, "precision": "float16", "weight_type": "Original", "status": "FINISHED", "submitted_time": "2023-10-16T12:48:18Z", "model_type": "\ud83d\udd36 : fine-tuned", "job_id": "517907", "license": "?", "likes": 0, "params": 6.607} \ No newline at end of file diff --git a/eval-queue/Lazycuber/L2-7b-Base-Guanaco-Vicuna_eval_request_False_float16_Original.json b/eval-queue/Lazycuber/L2-7b-Base-Guanaco-Vicuna_eval_request_False_float16_Original.json new file mode 100644 index 0000000000000000000000000000000000000000..212fd5f76faddb7e702b9bf556e2e718e5a707ce --- /dev/null +++ b/eval-queue/Lazycuber/L2-7b-Base-Guanaco-Vicuna_eval_request_False_float16_Original.json @@ -0,0 +1 @@ +{"model": "Lazycuber/L2-7b-Base-Guanaco-Vicuna", "base_model": "LLAMA V2", "revision": "main", "private": false, "precision": "float16", "weight_type": "Original", "status": "FAILED", "submitted_time": "2023-09-21T12:18:17Z", "model_type": "\ud83d\udd36 : fine-tuned", "job_id": "477386", "license": "?", "likes": 0, "params": 6.607} \ No newline at end of file diff --git a/eval-queue/Lazycuber/L2-7b-Guanaco-Random-Test_eval_request_False_float16_Original.json b/eval-queue/Lazycuber/L2-7b-Guanaco-Random-Test_eval_request_False_float16_Original.json new file mode 100644 index 0000000000000000000000000000000000000000..df84105ea8b9e3bd1f1dd65ea4e0c4dbfadfc075 --- /dev/null +++ b/eval-queue/Lazycuber/L2-7b-Guanaco-Random-Test_eval_request_False_float16_Original.json @@ -0,0 +1 @@ +{"model": "Lazycuber/L2-7b-Guanaco-Random-Test", "base_model": "", "revision": "main", "private": false, "precision": "float16", "weight_type": "Original", "status": "FINISHED", "submitted_time": "2023-10-16T12:54:17Z", "model_type": "\ud83d\udd36 : fine-tuned", "job_id": "518520", "params": 7.0, "license": "?", "likes": 0} \ No newline at end of file diff --git a/eval-queue/Lazycuber/L2-7b-Guanaco-Uncensored_eval_request_False_float16_Original.json b/eval-queue/Lazycuber/L2-7b-Guanaco-Uncensored_eval_request_False_float16_Original.json new file mode 100644 index 0000000000000000000000000000000000000000..7caaf007d7699bff95c5030f8e3fc850a28f0091 --- /dev/null +++ b/eval-queue/Lazycuber/L2-7b-Guanaco-Uncensored_eval_request_False_float16_Original.json @@ -0,0 +1 @@ +{"model": "Lazycuber/L2-7b-Guanaco-Uncensored", "base_model": "NousResearch/Llama-2-7b-chat-hf", "revision": "main", "private": false, "precision": "float16", "weight_type": "Original", "status": "FINISHED", "submitted_time": "2023-10-16T12:48:18Z", "model_type": "\ud83d\udd36 : fine-tuned", "job_id": "517508", "license": "?", "likes": 0, "params": 6.607} \ No newline at end of file diff --git a/eval-queue/Lazycuber/L2-7b-Orca-WVG-Test_eval_request_False_float16_Original.json b/eval-queue/Lazycuber/L2-7b-Orca-WVG-Test_eval_request_False_float16_Original.json new file mode 100644 index 0000000000000000000000000000000000000000..2a10635d3d58de907b211e8cf2e3d722503c2740 --- /dev/null +++ b/eval-queue/Lazycuber/L2-7b-Orca-WVG-Test_eval_request_False_float16_Original.json @@ -0,0 +1 @@ +{"model": "Lazycuber/L2-7b-Orca-WVG-Test", "base_model": "", "revision": "main", "private": false, "precision": "float16", "weight_type": "Original", "status": "FINISHED", 
"submitted_time": "2023-10-16T12:54:17Z", "model_type": "\ud83d\udd36 : fine-tuned", "job_id": "519102", "license": "?", "likes": 0, "params": 6.607} \ No newline at end of file diff --git a/eval-queue/Lazycuber/pyg-instruct-wizardlm_eval_request_False_float16_Original.json b/eval-queue/Lazycuber/pyg-instruct-wizardlm_eval_request_False_float16_Original.json new file mode 100644 index 0000000000000000000000000000000000000000..b25286c65e10dcc49660abce863daf9fbf63cefd --- /dev/null +++ b/eval-queue/Lazycuber/pyg-instruct-wizardlm_eval_request_False_float16_Original.json @@ -0,0 +1 @@ +{"model": "Lazycuber/pyg-instruct-wizardlm", "base_model": "GPT-J", "revision": "main", "private": false, "precision": "float16", "weight_type": "Original", "status": "FINISHED", "submitted_time": "2023-10-16T12:58:30Z", "job_id": "522161", "model_type": "fine-tuned", "license": "?", "likes": 1, "params": 5.844} \ No newline at end of file diff --git a/eval-queue/LumiOpen/Poro-34B_eval_request_False_bfloat16_Original.json b/eval-queue/LumiOpen/Poro-34B_eval_request_False_bfloat16_Original.json new file mode 100644 index 0000000000000000000000000000000000000000..29e9094d8276db0612d34ee4cf359a3d06090e29 --- /dev/null +++ b/eval-queue/LumiOpen/Poro-34B_eval_request_False_bfloat16_Original.json @@ -0,0 +1 @@ +{"model": "LumiOpen/Poro-34B", "base_model": "", "revision": "main", "private": false, "precision": "bfloat16", "weight_type": "Original", "status": "FAILED", "submitted_time": "2023-11-15T10:30:36Z", "model_type": "\ud83d\udfe2 : pretrained", "likes": 24, "params": 34.217, "license": "apache-2.0", "job_id": "652311"} \ No newline at end of file diff --git a/eval-queue/Minirecord/Mini_DPO_test02_eval_request_False_float16_Original.json b/eval-queue/Minirecord/Mini_DPO_test02_eval_request_False_float16_Original.json new file mode 100644 index 0000000000000000000000000000000000000000..da50c80c45da014513dbb34dc36d2f3ed6b222e9 --- /dev/null +++ b/eval-queue/Minirecord/Mini_DPO_test02_eval_request_False_float16_Original.json @@ -0,0 +1,16 @@ +{ + "model": "Minirecord/Mini_DPO_test02", + "base_model": "Minirecord/Mini_synatra_7b_02", + "revision": "main", + "private": false, + "precision": "float16", + "weight_type": "Original", + "status": "FINISHED", + "submitted_time": "2023-11-30T06:16:37Z", + "model_type": "\ud83d\udfe6 : RL-tuned", + "likes": 0, + "params": 7.242, + "license": "cc-by-sa-4.0", + "job_id": "845741", + "job_start_time": "2023-12-04T09:22:52.907521" +} \ No newline at end of file diff --git a/eval-queue/Minirecord/Mini_synatra_7b_02_eval_request_False_float16_Original.json b/eval-queue/Minirecord/Mini_synatra_7b_02_eval_request_False_float16_Original.json new file mode 100644 index 0000000000000000000000000000000000000000..29fdb39ea48138d412161923fb0fb8d0aabb6fad --- /dev/null +++ b/eval-queue/Minirecord/Mini_synatra_7b_02_eval_request_False_float16_Original.json @@ -0,0 +1,16 @@ +{ + "model": "Minirecord/Mini_synatra_7b_02", + "base_model": "maywell/Synatra-7B-v0.3-dpo", + "revision": "main", + "private": false, + "precision": "float16", + "weight_type": "Original", + "status": "FAILED", + "submitted_time": "2023-11-24T07:13:49Z", + "model_type": "\ud83d\udd36 : fine-tuned", + "likes": 0, + "params": 7.0, + "license": "cc-by-sa-4.0", + "job_id": "800065", + "job_start_time": "2023-11-27T13:19:48.440856" +} \ No newline at end of file diff --git a/eval-queue/MrNJK/gpt2-xl-sft_eval_request_False_float16_Original.json b/eval-queue/MrNJK/gpt2-xl-sft_eval_request_False_float16_Original.json new file 
mode 100644 index 0000000000000000000000000000000000000000..35cb08fde82224e4c6f584433301ecd0d8156f41 --- /dev/null +++ b/eval-queue/MrNJK/gpt2-xl-sft_eval_request_False_float16_Original.json @@ -0,0 +1 @@ +{"model": "MrNJK/gpt2-xl-sft", "base_model": "gpt2-xl", "revision": "main", "private": false, "precision": "float16", "weight_type": "Original", "status": "FINISHED", "submitted_time": "2023-10-16T12:46:18Z", "model_type": "fine-tuned", "job_id": "461621", "license": "apache-2.0", "likes": 0, "params": 1.558} \ No newline at end of file diff --git a/eval-queue/NExtNewChattingAI/shark_tank_ai_7_b_eval_request_False_float16_Original.json b/eval-queue/NExtNewChattingAI/shark_tank_ai_7_b_eval_request_False_float16_Original.json new file mode 100644 index 0000000000000000000000000000000000000000..51f93e7462902b32eb565c8d1bfb55f5497d6105 --- /dev/null +++ b/eval-queue/NExtNewChattingAI/shark_tank_ai_7_b_eval_request_False_float16_Original.json @@ -0,0 +1,16 @@ +{ + "model": "NExtNewChattingAI/shark_tank_ai_7_b", + "base_model": "", + "revision": "main", + "private": false, + "precision": "float16", + "weight_type": "Original", + "status": "RUNNING", + "submitted_time": "2023-12-17T16:56:59Z", + "model_type": "\u2b55 : instruction-tuned", + "likes": 0, + "params": 7.242, + "license": "apache-2.0", + "job_id": "933101", + "job_start_time": "2023-12-17T19:26:50.161153" +} \ No newline at end of file diff --git a/eval-queue/PulsarAI/2x-LoRA-Assemble-13B_eval_request_False_float16_Original.json b/eval-queue/PulsarAI/2x-LoRA-Assemble-13B_eval_request_False_float16_Original.json new file mode 100644 index 0000000000000000000000000000000000000000..a1129ba22e9f0c9ab1b258c48a3b49abc632d3a4 --- /dev/null +++ b/eval-queue/PulsarAI/2x-LoRA-Assemble-13B_eval_request_False_float16_Original.json @@ -0,0 +1 @@ +{"model": "PulsarAI/2x-LoRA-Assemble-13B", "base_model": "", "revision": "main", "private": false, "precision": "float16", "weight_type": "Original", "status": "FINISHED", "submitted_time": "2023-10-16T13:00:29Z", "model_type": "\ud83d\udd36 : fine-tuned", "job_id": "522225", "license": "cc-by-nc-4.0", "likes": 0, "params": 13.016} \ No newline at end of file diff --git a/eval-queue/PulsarAI/2x-LoRA-Assemble-Nova-13B_eval_request_False_float16_Original.json b/eval-queue/PulsarAI/2x-LoRA-Assemble-Nova-13B_eval_request_False_float16_Original.json new file mode 100644 index 0000000000000000000000000000000000000000..db529c614cb0aa2e69e106fd67cac405b9f3446c --- /dev/null +++ b/eval-queue/PulsarAI/2x-LoRA-Assemble-Nova-13B_eval_request_False_float16_Original.json @@ -0,0 +1 @@ +{"model": "PulsarAI/2x-LoRA-Assemble-Nova-13B", "base_model": "", "revision": "main", "private": false, "precision": "float16", "weight_type": "Original", "status": "FINISHED", "submitted_time": "2023-10-16T12:54:17Z", "model_type": "\u2b55 : instruction-tuned", "job_id": "518507", "license": "?", "likes": 0, "params": 13.016} \ No newline at end of file diff --git a/eval-queue/PulsarAI/2x-LoRA-Assemble-Platypus2-13B_eval_request_False_float16_Original.json b/eval-queue/PulsarAI/2x-LoRA-Assemble-Platypus2-13B_eval_request_False_float16_Original.json new file mode 100644 index 0000000000000000000000000000000000000000..e6f9e4c7ae6d6362a563fd0d16deb34b16fc3bd4 --- /dev/null +++ b/eval-queue/PulsarAI/2x-LoRA-Assemble-Platypus2-13B_eval_request_False_float16_Original.json @@ -0,0 +1 @@ +{"model": "PulsarAI/2x-LoRA-Assemble-Platypus2-13B", "base_model": "", "revision": "main", "private": false, "precision": "float16", "weight_type": 
"Original", "status": "FINISHED", "submitted_time": "2023-10-16T12:54:17Z", "model_type": "\u2b55 : instruction-tuned", "job_id": "518425", "license": "?", "likes": 0, "params": 13.016} \ No newline at end of file diff --git a/eval-queue/PulsarAI/Chat-AYB-Nova-13B_eval_request_False_float16_Original.json b/eval-queue/PulsarAI/Chat-AYB-Nova-13B_eval_request_False_float16_Original.json new file mode 100644 index 0000000000000000000000000000000000000000..7fdd1cfa706c523cccdb3f64d813b818017484c0 --- /dev/null +++ b/eval-queue/PulsarAI/Chat-AYB-Nova-13B_eval_request_False_float16_Original.json @@ -0,0 +1 @@ +{"model": "PulsarAI/Chat-AYB-Nova-13B", "base_model": "", "revision": "main", "private": false, "precision": "float16", "weight_type": "Original", "status": "FINISHED", "submitted_time": "2023-10-16T12:54:17Z", "model_type": "\u2b55 : instruction-tuned", "job_id": "520985", "license": "?", "likes": 0, "params": 13.016} \ No newline at end of file diff --git a/eval-queue/PulsarAI/Chat-AYB-Platypus2-13B_eval_request_False_float16_Original.json b/eval-queue/PulsarAI/Chat-AYB-Platypus2-13B_eval_request_False_float16_Original.json new file mode 100644 index 0000000000000000000000000000000000000000..60e3c9f44b040e511ee70c605c4d5693417b230e --- /dev/null +++ b/eval-queue/PulsarAI/Chat-AYB-Platypus2-13B_eval_request_False_float16_Original.json @@ -0,0 +1 @@ +{"model": "PulsarAI/Chat-AYB-Platypus2-13B", "base_model": "", "revision": "main", "private": false, "precision": "float16", "weight_type": "Original", "status": "FINISHED", "submitted_time": "2023-10-16T13:19:55Z", "model_type": "\u2b55 : instruction-tuned", "job_id": "522420", "license": "?", "likes": 0, "params": 13.016} \ No newline at end of file diff --git a/eval-queue/PulsarAI/CollectiveCognition-v1.1-Nebula-7B_eval_request_False_float16_Original.json b/eval-queue/PulsarAI/CollectiveCognition-v1.1-Nebula-7B_eval_request_False_float16_Original.json new file mode 100644 index 0000000000000000000000000000000000000000..fd315096da3c89ddec64441492b5f0732431e8c2 --- /dev/null +++ b/eval-queue/PulsarAI/CollectiveCognition-v1.1-Nebula-7B_eval_request_False_float16_Original.json @@ -0,0 +1 @@ +{"model": "PulsarAI/CollectiveCognition-v1.1-Nebula-7B", "base_model": "", "revision": "main", "private": false, "precision": "float16", "weight_type": "Original", "status": "FINISHED", "submitted_time": "2023-10-15T14:35:11Z", "model_type": "\ud83d\udd36 : fine-tuned", "likes": 0, "params": 7.242, "license": "cc-by-nc-4.0", "job_id": "648661"} \ No newline at end of file diff --git a/eval-queue/PulsarAI/EnsembleV5-Nova-13B_eval_request_False_float16_Original.json b/eval-queue/PulsarAI/EnsembleV5-Nova-13B_eval_request_False_float16_Original.json new file mode 100644 index 0000000000000000000000000000000000000000..647ace354b27eb43206f1084355bd03fd877fef7 --- /dev/null +++ b/eval-queue/PulsarAI/EnsembleV5-Nova-13B_eval_request_False_float16_Original.json @@ -0,0 +1 @@ +{"model": "PulsarAI/EnsembleV5-Nova-13B", "base_model": "", "revision": "main", "private": false, "precision": "float16", "weight_type": "Original", "status": "FINISHED", "submitted_time": "2023-10-16T12:46:18Z", "model_type": "\u2b55 : instruction-tuned", "job_id": "513932", "license": "?", "likes": 0, "params": 13.016} \ No newline at end of file diff --git a/eval-queue/PulsarAI/GenAI-Nova-13B_eval_request_False_float16_Original.json b/eval-queue/PulsarAI/GenAI-Nova-13B_eval_request_False_float16_Original.json new file mode 100644 index 
0000000000000000000000000000000000000000..44468677e7187162aa0e647a35ab60ce37f3670b --- /dev/null +++ b/eval-queue/PulsarAI/GenAI-Nova-13B_eval_request_False_float16_Original.json @@ -0,0 +1 @@ +{"model": "PulsarAI/GenAI-Nova-13B", "base_model": "", "revision": "main", "private": false, "precision": "float16", "weight_type": "Original", "status": "FINISHED", "submitted_time": "2023-10-16T13:19:55Z", "model_type": "\u2b55 : instruction-tuned", "job_id": "522802", "license": "?", "likes": 0, "params": 13.016} \ No newline at end of file diff --git a/eval-queue/PulsarAI/MetaMath-Chupacabra-7B-v2.01-Slerp_eval_request_False_float16_Original.json b/eval-queue/PulsarAI/MetaMath-Chupacabra-7B-v2.01-Slerp_eval_request_False_float16_Original.json new file mode 100644 index 0000000000000000000000000000000000000000..1e329f44e4b2addb12a320ded2780d4ae28ba1e8 --- /dev/null +++ b/eval-queue/PulsarAI/MetaMath-Chupacabra-7B-v2.01-Slerp_eval_request_False_float16_Original.json @@ -0,0 +1,16 @@ +{ + "model": "PulsarAI/MetaMath-Chupacabra-7B-v2.01-Slerp", + "base_model": "", + "revision": "main", + "private": false, + "precision": "float16", + "weight_type": "Original", + "status": "FINISHED", + "submitted_time": "2023-12-08T11:41:10Z", + "model_type": "\ud83d\udd36 : fine-tuned", + "likes": 0, + "params": 7.242, + "license": "apache-2.0", + "job_id": "875009", + "job_start_time": "2023-12-09T15:59:23.962948" +} \ No newline at end of file diff --git a/eval-queue/PulsarAI/MetaMath-OpenHermes-2.5-neural-chat-v3-3-Slerp_eval_request_False_bfloat16_Original.json b/eval-queue/PulsarAI/MetaMath-OpenHermes-2.5-neural-chat-v3-3-Slerp_eval_request_False_bfloat16_Original.json new file mode 100644 index 0000000000000000000000000000000000000000..57529e5a4eb94a257876a36e4bb029f49ad0b7c1 --- /dev/null +++ b/eval-queue/PulsarAI/MetaMath-OpenHermes-2.5-neural-chat-v3-3-Slerp_eval_request_False_bfloat16_Original.json @@ -0,0 +1,16 @@ +{ + "model": "PulsarAI/MetaMath-OpenHermes-2.5-neural-chat-v3-3-Slerp", + "base_model": "", + "revision": "main", + "private": false, + "precision": "bfloat16", + "weight_type": "Original", + "status": "FINISHED", + "submitted_time": "2023-12-10T00:37:26Z", + "model_type": "\ud83d\udd36 : fine-tuned", + "likes": 0, + "params": 7.242, + "license": "apache-2.0", + "job_id": "875496", + "job_start_time": "2023-12-10T00:38:21.959568" +} \ No newline at end of file diff --git a/eval-queue/PulsarAI/MetaMath-Tulpar-7b-v2-Slerp_eval_request_False_float16_Original.json b/eval-queue/PulsarAI/MetaMath-Tulpar-7b-v2-Slerp_eval_request_False_float16_Original.json new file mode 100644 index 0000000000000000000000000000000000000000..1327dbdcb28cfc741cf1c6cb8ad18bd3aa900e27 --- /dev/null +++ b/eval-queue/PulsarAI/MetaMath-Tulpar-7b-v2-Slerp_eval_request_False_float16_Original.json @@ -0,0 +1,16 @@ +{ + "model": "PulsarAI/MetaMath-Tulpar-7b-v2-Slerp", + "base_model": "", + "revision": "main", + "private": false, + "precision": "float16", + "weight_type": "Original", + "status": "FINISHED", + "submitted_time": "2023-12-08T11:03:12Z", + "model_type": "\ud83d\udd36 : fine-tuned", + "likes": 0, + "params": 7.242, + "license": "apache-2.0", + "job_id": "875004", + "job_start_time": "2023-12-09T15:56:05.675242" +} \ No newline at end of file diff --git a/eval-queue/PulsarAI/Nebula-7B_eval_request_False_float16_Original.json b/eval-queue/PulsarAI/Nebula-7B_eval_request_False_float16_Original.json new file mode 100644 index 0000000000000000000000000000000000000000..7b679437212eebdfd21fe8faa1d2f8197a3669a8 --- /dev/null 
+++ b/eval-queue/PulsarAI/Nebula-7B_eval_request_False_float16_Original.json @@ -0,0 +1 @@ +{"model": "PulsarAI/Nebula-7B", "base_model": "", "revision": "main", "private": false, "precision": "float16", "weight_type": "Original", "status": "FINISHED", "submitted_time": "2023-10-16T12:46:18Z", "model_type": "\u2b55 : instruction-tuned", "job_id": "513938", "license": "cc-by-nc-4.0", "likes": 0, "params": 7.242} \ No newline at end of file diff --git a/eval-queue/PulsarAI/Nebula-v2-7B_eval_request_False_float16_Original.json b/eval-queue/PulsarAI/Nebula-v2-7B_eval_request_False_float16_Original.json new file mode 100644 index 0000000000000000000000000000000000000000..8f8cc2817c2d11162b96298d717366cdf9ddeebc --- /dev/null +++ b/eval-queue/PulsarAI/Nebula-v2-7B_eval_request_False_float16_Original.json @@ -0,0 +1 @@ +{"model": "PulsarAI/Nebula-v2-7B", "base_model": "", "revision": "main", "private": false, "precision": "float16", "weight_type": "Original", "status": "FINISHED", "submitted_time": "2023-11-21T10:20:21Z", "model_type": "\u2b55 : instruction-tuned", "likes": 0, "params": 7.242, "license": "cc-by-nc-4.0", "job_id": "734661"} \ No newline at end of file diff --git a/eval-queue/PulsarAI/Neural-una-cybertron-7b_eval_request_False_bfloat16_Original.json b/eval-queue/PulsarAI/Neural-una-cybertron-7b_eval_request_False_bfloat16_Original.json new file mode 100644 index 0000000000000000000000000000000000000000..eb48ed1b28d121a561e4fe14692ea83a3f2dd769 --- /dev/null +++ b/eval-queue/PulsarAI/Neural-una-cybertron-7b_eval_request_False_bfloat16_Original.json @@ -0,0 +1,16 @@ +{ + "model": "PulsarAI/Neural-una-cybertron-7b", + "base_model": "", + "revision": "main", + "private": false, + "precision": "bfloat16", + "weight_type": "Original", + "status": "FINISHED", + "submitted_time": "2023-12-09T09:45:09Z", + "model_type": "\ud83d\udd36 : fine-tuned", + "likes": 0, + "params": 7.242, + "license": "apache-2.0", + "job_id": "875125", + "job_start_time": "2023-12-09T17:46:38.714258" +} \ No newline at end of file diff --git a/eval-queue/PulsarAI/OpenHermes-2.5-neural-chat-v3-2-Slerp_eval_request_False_float16_Original.json b/eval-queue/PulsarAI/OpenHermes-2.5-neural-chat-v3-2-Slerp_eval_request_False_float16_Original.json new file mode 100644 index 0000000000000000000000000000000000000000..2543b1c3468851f55a030711fe2c9fa1ef41f973 --- /dev/null +++ b/eval-queue/PulsarAI/OpenHermes-2.5-neural-chat-v3-2-Slerp_eval_request_False_float16_Original.json @@ -0,0 +1,16 @@ +{ + "model": "PulsarAI/OpenHermes-2.5-neural-chat-v3-2-Slerp", + "base_model": "", + "revision": "main", + "private": false, + "precision": "float16", + "weight_type": "Original", + "status": "FINISHED", + "submitted_time": "2023-12-08T13:36:07Z", + "model_type": "\ud83d\udd36 : fine-tuned", + "likes": 0, + "params": 7.242, + "license": "apache-2.0", + "job_id": "875015", + "job_start_time": "2023-12-09T16:05:18.723462" +} \ No newline at end of file diff --git a/eval-queue/PulsarAI/OpenHermes-2.5-neural-chat-v3-3-Slerp_eval_request_False_bfloat16_Original.json b/eval-queue/PulsarAI/OpenHermes-2.5-neural-chat-v3-3-Slerp_eval_request_False_bfloat16_Original.json new file mode 100644 index 0000000000000000000000000000000000000000..52c04b67a51bc3658ee53d120b39a7d25bf71de0 --- /dev/null +++ b/eval-queue/PulsarAI/OpenHermes-2.5-neural-chat-v3-3-Slerp_eval_request_False_bfloat16_Original.json @@ -0,0 +1,16 @@ +{ + "model": "PulsarAI/OpenHermes-2.5-neural-chat-v3-3-Slerp", + "base_model": "", + "revision": "main", + "private": false, + 
"precision": "bfloat16", + "weight_type": "Original", + "status": "FINISHED", + "submitted_time": "2023-12-09T23:43:39Z", + "model_type": "\ud83d\udd36 : fine-tuned", + "likes": 0, + "params": 7.242, + "license": "apache-2.0", + "job_id": "875442", + "job_start_time": "2023-12-09T23:45:18.483177" +} \ No newline at end of file diff --git a/eval-queue/PulsarAI/SlimOpenOrca-Mistral-7B-v2_eval_request_False_bfloat16_Original.json b/eval-queue/PulsarAI/SlimOpenOrca-Mistral-7B-v2_eval_request_False_bfloat16_Original.json new file mode 100644 index 0000000000000000000000000000000000000000..7528cdcac412e4947923e603396e65fad19d1899 --- /dev/null +++ b/eval-queue/PulsarAI/SlimOpenOrca-Mistral-7B-v2_eval_request_False_bfloat16_Original.json @@ -0,0 +1 @@ +{"model": "PulsarAI/SlimOpenOrca-Mistral-7B-v2", "base_model": "", "revision": "main", "private": false, "precision": "bfloat16", "weight_type": "Original", "status": "FINISHED", "submitted_time": "2023-11-09T22:45:28Z", "model_type": "\ud83d\udd36 : fine-tuned", "likes": 1, "params": 7.242, "license": "cc-by-nc-4.0", "job_id": "650313"} \ No newline at end of file diff --git a/eval-queue/PulsarAI/SlimOpenOrca-Mistral-7B-v2_eval_request_False_float16_Original.json b/eval-queue/PulsarAI/SlimOpenOrca-Mistral-7B-v2_eval_request_False_float16_Original.json new file mode 100644 index 0000000000000000000000000000000000000000..1ebbd21fd955522777e0ec208de1819d66e7d600 --- /dev/null +++ b/eval-queue/PulsarAI/SlimOpenOrca-Mistral-7B-v2_eval_request_False_float16_Original.json @@ -0,0 +1 @@ +{"model": "PulsarAI/SlimOpenOrca-Mistral-7B-v2", "base_model": "", "revision": "main", "private": false, "precision": "float16", "weight_type": "Original", "status": "FINISHED", "submitted_time": "2023-10-15T12:31:06Z", "model_type": "\ud83d\udd36 : fine-tuned", "likes": 0, "params": 7.242, "license": "cc-by-nc-4.0", "job_id": "648659"} \ No newline at end of file diff --git a/eval-queue/PygmalionAI/metharme-1.3b_eval_request_False_False_False.json b/eval-queue/PygmalionAI/metharme-1.3b_eval_request_False_False_False.json new file mode 100644 index 0000000000000000000000000000000000000000..e87db133d5b3699e93c6a268b9a2f76bc9cb49eb --- /dev/null +++ b/eval-queue/PygmalionAI/metharme-1.3b_eval_request_False_False_False.json @@ -0,0 +1 @@ +{"model": "PygmalionAI/metharme-1.3b", "base_model": "", "revision": "main", "private": false, "status": "FINISHED", "submitted_time": "2023-09-09T10:52:17Z", "job_id": "471396", "weight_type": "Original", "precision": "float16", "model_type": "fine-tuned", "license": "apache-2.0", "likes": 18, "params": 1.515} \ No newline at end of file diff --git a/eval-queue/PygmalionAI/mythalion-13b_eval_request_False_float16_Original.json b/eval-queue/PygmalionAI/mythalion-13b_eval_request_False_float16_Original.json new file mode 100644 index 0000000000000000000000000000000000000000..d53bf6e768d9065c4d91bc6d79d19d694617a10a --- /dev/null +++ b/eval-queue/PygmalionAI/mythalion-13b_eval_request_False_float16_Original.json @@ -0,0 +1 @@ +{"model": "PygmalionAI/mythalion-13b", "base_model": "", "revision": "main", "private": false, "precision": "float16", "weight_type": "Original", "status": "FINISHED", "submitted_time": "2023-10-16T12:54:17Z", "model_type": "\ud83d\udd36 : fine-tuned", "job_id": "518504", "license": "llama2", "likes": 58, "params": 13.016} \ No newline at end of file diff --git a/eval-queue/PygmalionAI/pygmalion-1.3b_eval_request_False_False_False.json b/eval-queue/PygmalionAI/pygmalion-1.3b_eval_request_False_False_False.json new file 
mode 100644 index 0000000000000000000000000000000000000000..8c004ead172fb97287e9ca51374cfc0d24b4d3dc --- /dev/null +++ b/eval-queue/PygmalionAI/pygmalion-1.3b_eval_request_False_False_False.json @@ -0,0 +1 @@ +{"model": "PygmalionAI/pygmalion-1.3b", "base_model": "", "revision": "main", "private": false, "status": "FINISHED", "job_id": "497956", "weight_type": "Original", "precision": "float16", "model_type": "fine-tuned", "submitted_time": "2023-10-16T12:48:18Z", "license": "agpl-3.0", "likes": 51, "params": 1.515} \ No newline at end of file diff --git a/eval-queue/PygmalionAI/pygmalion-2-13b_eval_request_False_float16_Original.json b/eval-queue/PygmalionAI/pygmalion-2-13b_eval_request_False_float16_Original.json new file mode 100644 index 0000000000000000000000000000000000000000..f1a8411fb2c62a95df7c74a85d6fd00dd5fe2cf0 --- /dev/null +++ b/eval-queue/PygmalionAI/pygmalion-2-13b_eval_request_False_float16_Original.json @@ -0,0 +1 @@ +{"model": "PygmalionAI/pygmalion-2-13b", "base_model": "", "revision": "main", "private": false, "precision": "float16", "weight_type": "Original", "status": "FINISHED", "submitted_time": "2023-10-16T12:46:18Z", "model_type": "\ud83d\udd36 : fine-tuned", "job_id": "514418", "license": "llama2", "likes": 29, "params": 13.016} \ No newline at end of file diff --git a/eval-queue/PygmalionAI/pygmalion-2-7b_eval_request_False_float16_Original.json b/eval-queue/PygmalionAI/pygmalion-2-7b_eval_request_False_float16_Original.json new file mode 100644 index 0000000000000000000000000000000000000000..76b00bd3ae19cd4036bfdab0693d36648bdbed53 --- /dev/null +++ b/eval-queue/PygmalionAI/pygmalion-2-7b_eval_request_False_float16_Original.json @@ -0,0 +1 @@ +{"model": "PygmalionAI/pygmalion-2-7b", "base_model": "", "revision": "main", "private": false, "precision": "float16", "weight_type": "Original", "status": "FINISHED", "submitted_time": "2023-10-16T12:58:30Z", "model_type": "\ud83d\udd36 : fine-tuned", "job_id": "522180", "license": "llama2", "likes": 25, "params": 6.738} \ No newline at end of file diff --git a/eval-queue/PygmalionAI/pygmalion-2.7b_eval_request_False_False_False.json b/eval-queue/PygmalionAI/pygmalion-2.7b_eval_request_False_False_False.json new file mode 100644 index 0000000000000000000000000000000000000000..2f572e6ccc9c171021a930f9ae8339c477530fe4 --- /dev/null +++ b/eval-queue/PygmalionAI/pygmalion-2.7b_eval_request_False_False_False.json @@ -0,0 +1 @@ +{"model": "PygmalionAI/pygmalion-2.7b", "base_model": "", "revision": "main", "private": false, "status": "FINISHED", "job_id": "471262", "weight_type": "Original", "precision": "float16", "model_type": "fine-tuned", "submitted_time": "2023-09-09T10:52:17Z", "license": "creativeml-openrail-m", "likes": 48, "params": 2.651} \ No newline at end of file diff --git a/eval-queue/PygmalionAI/pygmalion-350m_eval_request_False_False_False.json b/eval-queue/PygmalionAI/pygmalion-350m_eval_request_False_False_False.json new file mode 100644 index 0000000000000000000000000000000000000000..5ecd4d46771a751e889f7e806fd67cf273391321 --- /dev/null +++ b/eval-queue/PygmalionAI/pygmalion-350m_eval_request_False_False_False.json @@ -0,0 +1 @@ +{"model": "PygmalionAI/pygmalion-350m", "base_model": "", "revision": "main", "private": false, "status": "FINISHED", "job_id": "497406", "weight_type": "Original", "precision": "float16", "model_type": "fine-tuned", "submitted_time": "2023-10-16T12:46:18Z", "license": "?", "likes": 48, "params": 0.331} \ No newline at end of file diff --git 
a/eval-queue/PygmalionAI/pygmalion-6b_eval_request_False_False_False.json b/eval-queue/PygmalionAI/pygmalion-6b_eval_request_False_False_False.json new file mode 100644 index 0000000000000000000000000000000000000000..da53de077ce1c34e9197f3b0874609e3b729de16 --- /dev/null +++ b/eval-queue/PygmalionAI/pygmalion-6b_eval_request_False_False_False.json @@ -0,0 +1 @@ +{"model": "PygmalionAI/pygmalion-6b", "base_model": "", "revision": "dev", "private": false, "status": "FINISHED", "job_id": "461606", "weight_type": "Original", "precision": "float16", "model_type": "fine-tuned", "submitted_time": "2023-10-16T12:48:18Z", "license": "creativeml-openrail-m", "likes": 690, "params": 5.844} \ No newline at end of file diff --git a/eval-queue/PygmalionAI/pygmalion-6b_eval_request_False_float16_Original.json b/eval-queue/PygmalionAI/pygmalion-6b_eval_request_False_float16_Original.json new file mode 100644 index 0000000000000000000000000000000000000000..5c38ab42433fac2b842348b03afd0d6343d28331 --- /dev/null +++ b/eval-queue/PygmalionAI/pygmalion-6b_eval_request_False_float16_Original.json @@ -0,0 +1 @@ +{"model": "PygmalionAI/pygmalion-6b", "base_model": "", "revision": "main", "private": false, "precision": "float16", "weight_type": "Original", "status": "FINISHED", "submitted_time": "2023-10-07T09:44:49Z", "model_type": "\ud83d\udd36 : fine-tuned", "job_id": "486934", "license": "creativeml-openrail-m", "likes": 690, "params": 5.844} \ No newline at end of file diff --git a/eval-queue/RoversX/llama-2-7b-hf-small-shards-Samantha-V1-SFT_eval_request_False_4bit_Original.json b/eval-queue/RoversX/llama-2-7b-hf-small-shards-Samantha-V1-SFT_eval_request_False_4bit_Original.json new file mode 100644 index 0000000000000000000000000000000000000000..295f225a4ccdf2a133a65c652b451b0378c6ee3c --- /dev/null +++ b/eval-queue/RoversX/llama-2-7b-hf-small-shards-Samantha-V1-SFT_eval_request_False_4bit_Original.json @@ -0,0 +1 @@ +{"model": "RoversX/llama-2-7b-hf-small-shards-Samantha-V1-SFT", "base_model": "", "revision": "main", "private": false, "precision": "4bit", "weight_type": "Original", "status": "FINISHED", "submitted_time": "2023-09-09T10:52:17Z", "model_type": "\ud83d\udd36 : fine-tuned", "job_id": "495569", "license": "?", "likes": 0, "params": 6.607} \ No newline at end of file diff --git a/eval-queue/S4sch/Open-Hermes-2.5-neural-chat-3.1-frankenmerge-11b_eval_request_False_bfloat16_Original.json b/eval-queue/S4sch/Open-Hermes-2.5-neural-chat-3.1-frankenmerge-11b_eval_request_False_bfloat16_Original.json new file mode 100644 index 0000000000000000000000000000000000000000..aee1bd9f82dfbb565302631aa43083ce3f212498 --- /dev/null +++ b/eval-queue/S4sch/Open-Hermes-2.5-neural-chat-3.1-frankenmerge-11b_eval_request_False_bfloat16_Original.json @@ -0,0 +1,16 @@ +{ + "model": "S4sch/Open-Hermes-2.5-neural-chat-3.1-frankenmerge-11b", + "base_model": "", + "revision": "main", + "private": false, + "precision": "bfloat16", + "weight_type": "Original", + "status": "FAILED", + "submitted_time": "2023-11-28T00:19:43Z", + "model_type": "\ud83d\udd36 : fine-tuned", + "likes": 0, + "params": 11.386, + "license": "apache-2.0", + "job_id": "802273", + "job_start_time": "2023-11-28T05:31:37.446488" +} \ No newline at end of file diff --git a/eval-queue/S4sch/Open-Hermes-2.5-neural-chat-3.1-frankenmerge-11b_eval_request_False_float16_Original.json b/eval-queue/S4sch/Open-Hermes-2.5-neural-chat-3.1-frankenmerge-11b_eval_request_False_float16_Original.json new file mode 100644 index 
0000000000000000000000000000000000000000..b27be7377b55f156e11964c4096d1c2ab0c05988 --- /dev/null +++ b/eval-queue/S4sch/Open-Hermes-2.5-neural-chat-3.1-frankenmerge-11b_eval_request_False_float16_Original.json @@ -0,0 +1,16 @@ +{ + "model": "S4sch/Open-Hermes-2.5-neural-chat-3.1-frankenmerge-11b", + "base_model": "mistral", + "revision": "main", + "private": false, + "precision": "float16", + "weight_type": "Original", + "status": "FAILED", + "submitted_time": "2023-11-28T04:06:17Z", + "model_type": "\ud83d\udd36 : fine-tuned", + "likes": 1, + "params": 11.386, + "license": "apache-2.0", + "job_id": "802288", + "job_start_time": "2023-11-28T05:43:29.912569" +} \ No newline at end of file diff --git a/eval-queue/S4sch/zephyr-neural-chat-frankenmerge11b_eval_request_False_float16_Original.json b/eval-queue/S4sch/zephyr-neural-chat-frankenmerge11b_eval_request_False_float16_Original.json new file mode 100644 index 0000000000000000000000000000000000000000..258a187d758765831f5c5701d6f454ca6e1906e4 --- /dev/null +++ b/eval-queue/S4sch/zephyr-neural-chat-frankenmerge11b_eval_request_False_float16_Original.json @@ -0,0 +1,16 @@ +{ + "model": "S4sch/zephyr-neural-chat-frankenmerge11b", + "base_model": "", + "revision": "main", + "private": false, + "precision": "float16", + "weight_type": "Original", + "status": "FINISHED", + "submitted_time": "2023-12-02T19:16:31Z", + "model_type": "\ud83d\udd36 : fine-tuned", + "likes": 2, + "params": 11.386, + "license": "apache-2.0", + "job_id": "846095", + "job_start_time": "2023-12-04T14:35:02.930549" +} \ No newline at end of file diff --git a/eval-queue/SLAM-group/NewHope_eval_request_False_bfloat16_Original.json b/eval-queue/SLAM-group/NewHope_eval_request_False_bfloat16_Original.json new file mode 100644 index 0000000000000000000000000000000000000000..916be021cd080ec464686cca4f3be2cb9931949e --- /dev/null +++ b/eval-queue/SLAM-group/NewHope_eval_request_False_bfloat16_Original.json @@ -0,0 +1 @@ +{"model": "SLAM-group/NewHope", "base_model": "", "revision": "main", "private": false, "precision": "bfloat16", "weight_type": "Original", "status": "FINISHED", "submitted_time": "2023-10-16T12:54:17Z", "model_type": "fine-tuned", "job_id": "520488", "params": 0, "license": "?", "likes": 0} \ No newline at end of file diff --git a/eval-queue/SLAM-group/NewHope_eval_request_False_float16_Original.json b/eval-queue/SLAM-group/NewHope_eval_request_False_float16_Original.json new file mode 100644 index 0000000000000000000000000000000000000000..af847b24287f033276f943ee246a7be5123f23fc --- /dev/null +++ b/eval-queue/SLAM-group/NewHope_eval_request_False_float16_Original.json @@ -0,0 +1 @@ +{"model": "SLAM-group/NewHope", "base_model": "", "revision": "main", "private": false, "precision": "float16", "weight_type": "Original", "status": "FINISHED", "submitted_time": "2023-10-16T13:19:55Z", "model_type": "fine-tuned", "job_id": "522756", "params": 0, "license": "?", "likes": 0} \ No newline at end of file diff --git a/eval-queue/SaylorTwift/gpt2_test_eval_request_False_False_False.json b/eval-queue/SaylorTwift/gpt2_test_eval_request_False_False_False.json new file mode 100644 index 0000000000000000000000000000000000000000..c851fb084e3253ce5d844e6f180756534e6e2c2d --- /dev/null +++ b/eval-queue/SaylorTwift/gpt2_test_eval_request_False_False_False.json @@ -0,0 +1 @@ +{"model": "SaylorTwift/gpt2_test", "base_model": "", "revision": "main", "private": false, "status": "FINISHED", "submitted_time": "2023-10-16T12:46:18Z", "job_id": "470972", "weight_type": "Original", 
"precision": "float16", "model_type": "pretrained", "license": "mit", "likes": 0, "params": 0.137} \ No newline at end of file diff --git a/eval-queue/Taekyoon/llama2-ko-7b-test_eval_request_False_float16_Original.json b/eval-queue/Taekyoon/llama2-ko-7b-test_eval_request_False_float16_Original.json new file mode 100644 index 0000000000000000000000000000000000000000..43bcd76905f91257cfc4a5df90818d14a3086ffb --- /dev/null +++ b/eval-queue/Taekyoon/llama2-ko-7b-test_eval_request_False_float16_Original.json @@ -0,0 +1 @@ +{"model": "Taekyoon/llama2-ko-7b-test", "base_model": "Llama-2-7b", "revision": "blog_20b_300k", "private": false, "precision": "float16", "weight_type": "Original", "status": "FINISHED", "submitted_time": "2023-10-16T12:46:18Z", "model_type": "\ud83d\udd36 : fine-tuned", "job_id": "514567", "params": 7.0, "license": "?", "likes": 0} \ No newline at end of file diff --git a/eval-queue/Taekyoon/llama2-koen-7b-test_eval_request_False_float16_Original.json b/eval-queue/Taekyoon/llama2-koen-7b-test_eval_request_False_float16_Original.json new file mode 100644 index 0000000000000000000000000000000000000000..e1f916962f0f09e18ebe0e0a74f7ecd9491d736c --- /dev/null +++ b/eval-queue/Taekyoon/llama2-koen-7b-test_eval_request_False_float16_Original.json @@ -0,0 +1 @@ +{"model": "Taekyoon/llama2-koen-7b-test", "base_model": "meta-llama/Llama-2-7b-hf", "revision": "stage1_20b_300k", "private": false, "precision": "float16", "weight_type": "Original", "status": "FAILED", "submitted_time": "2023-09-20T21:10:28Z", "model_type": "\ud83d\udd36 : fine-tuned", "job_id": "470530", "params": 7.0, "license": "?", "likes": 0} \ No newline at end of file diff --git a/eval-queue/Taekyoon/llama2-org-koen-7b_eval_request_False_float16_Original.json b/eval-queue/Taekyoon/llama2-org-koen-7b_eval_request_False_float16_Original.json new file mode 100644 index 0000000000000000000000000000000000000000..23610c8481f976d44500d8d281a56a546734a296 --- /dev/null +++ b/eval-queue/Taekyoon/llama2-org-koen-7b_eval_request_False_float16_Original.json @@ -0,0 +1 @@ +{"model": "Taekyoon/llama2-org-koen-7b", "base_model": "", "revision": "stage2_20b_450k", "private": false, "precision": "float16", "weight_type": "Original", "status": "FAILED", "submitted_time": "2023-11-17T08:55:02Z", "model_type": "\ud83d\udfe2 : pretrained", "likes": 0, "params": 7.0, "license": "cc-by-nc-sa-4.0", "job_id": "700861"} \ No newline at end of file diff --git a/eval-queue/TheBloke/Airoboros-L2-13B-2.1-GPTQ_eval_request_False_GPTQ_Original.json b/eval-queue/TheBloke/Airoboros-L2-13B-2.1-GPTQ_eval_request_False_GPTQ_Original.json new file mode 100644 index 0000000000000000000000000000000000000000..3953caaefe199cf7975c05cdb55f8003c570d8ce --- /dev/null +++ b/eval-queue/TheBloke/Airoboros-L2-13B-2.1-GPTQ_eval_request_False_GPTQ_Original.json @@ -0,0 +1 @@ +{"model": "TheBloke/Airoboros-L2-13B-2.1-GPTQ", "base_model": "", "revision": "main", "private": false, "precision": "GPTQ", "weight_type": "Original", "status": "FINISHED", "submitted_time": "2023-10-16T12:48:18Z", "model_type": "\ud83d\udd36 : fine-tuned", "job_id": "470957", "license": "llama2", "likes": 10, "params": 16.232} \ No newline at end of file diff --git a/eval-queue/TheBloke/Airoboros-L2-70B-2.1-GPTQ_eval_request_False_GPTQ_Original.json b/eval-queue/TheBloke/Airoboros-L2-70B-2.1-GPTQ_eval_request_False_GPTQ_Original.json new file mode 100644 index 0000000000000000000000000000000000000000..72507ffa49f70e7df192b2689755d6a3bf11b3be --- /dev/null +++ 
b/eval-queue/TheBloke/Airoboros-L2-70B-2.1-GPTQ_eval_request_False_GPTQ_Original.json @@ -0,0 +1 @@ +{"model": "TheBloke/Airoboros-L2-70B-2.1-GPTQ", "base_model": "", "revision": "gptq-4bit-32g-actorder_True", "private": false, "precision": "GPTQ", "weight_type": "Original", "status": "FINISHED", "submitted_time": "2023-11-06T10:31:15Z", "model_type": "\ud83d\udd36 : fine-tuned", "job_id": "634934", "license": "llama2", "likes": 14, "params": 72.816} \ No newline at end of file diff --git a/eval-queue/TheBloke/BigTranslate-13B-GPTQ_eval_request_False_float16_Original.json b/eval-queue/TheBloke/BigTranslate-13B-GPTQ_eval_request_False_float16_Original.json new file mode 100644 index 0000000000000000000000000000000000000000..724c4442ef3e5e63b87929112cd60ec331d29c8b --- /dev/null +++ b/eval-queue/TheBloke/BigTranslate-13B-GPTQ_eval_request_False_float16_Original.json @@ -0,0 +1 @@ +{"model": "TheBloke/BigTranslate-13B-GPTQ", "base_model": "", "revision": "main", "private": false, "precision": "float16", "weight_type": "Original", "status": "FINISHED", "submitted_time": "2023-11-06T10:31:15Z", "model_type": "\ud83d\udd36 : fine-tuned", "job_id": "634826", "license": "other", "likes": 15, "params": 17.992} \ No newline at end of file diff --git a/eval-queue/TheBloke/CausalLM-14B-GPTQ_eval_request_False_GPTQ_Original.json b/eval-queue/TheBloke/CausalLM-14B-GPTQ_eval_request_False_GPTQ_Original.json new file mode 100644 index 0000000000000000000000000000000000000000..d330e4e3bee980047a09af9d77dcc3b0e8f02f4b --- /dev/null +++ b/eval-queue/TheBloke/CausalLM-14B-GPTQ_eval_request_False_GPTQ_Original.json @@ -0,0 +1 @@ +{"model": "TheBloke/CausalLM-14B-GPTQ", "base_model": "", "revision": "main", "private": false, "precision": "GPTQ", "weight_type": "Original", "status": "FAILED", "submitted_time": "2023-10-27T12:39:33Z", "model_type": "\ud83d\udd36 : fine-tuned", "likes": 6, "params": 25.984, "license": "wtfpl", "job_id": "649744"} \ No newline at end of file diff --git a/eval-queue/TheBloke/Chinese-Alpaca-33B-SuperHOT-8K-fp16_eval_request_False_float16_Original.json b/eval-queue/TheBloke/Chinese-Alpaca-33B-SuperHOT-8K-fp16_eval_request_False_float16_Original.json new file mode 100644 index 0000000000000000000000000000000000000000..4cf14d78dbb10bf059e3ce349c2b774af33055fd --- /dev/null +++ b/eval-queue/TheBloke/Chinese-Alpaca-33B-SuperHOT-8K-fp16_eval_request_False_float16_Original.json @@ -0,0 +1 @@ +{"model": "TheBloke/Chinese-Alpaca-33B-SuperHOT-8K-fp16", "base_model": "", "revision": "main", "private": false, "precision": "float16", "weight_type": "Original", "status": "FINISHED", "submitted_time": "2023-11-06T10:31:15Z", "model_type": "fine-tuned", "job_id": "634700", "license": "other", "likes": 4, "params": 32.435} \ No newline at end of file diff --git a/eval-queue/TheBloke/CodeLlama-13B-Instruct-fp16_eval_request_False_float16_Original.json b/eval-queue/TheBloke/CodeLlama-13B-Instruct-fp16_eval_request_False_float16_Original.json new file mode 100644 index 0000000000000000000000000000000000000000..f60841f9cd887c404f386e5b0bc7091eedf7a408 --- /dev/null +++ b/eval-queue/TheBloke/CodeLlama-13B-Instruct-fp16_eval_request_False_float16_Original.json @@ -0,0 +1 @@ +{"model": "TheBloke/CodeLlama-13B-Instruct-fp16", "base_model": "petra", "revision": "main", "private": false, "precision": "float16", "weight_type": "Original", "status": "FINISHED", "submitted_time": "2023-09-09T10:52:17Z", "model_type": "\u2b55 : instruction-tuned", "job_id": "511680", "license": "llama2", "likes": 28, "params": 
13.016} \ No newline at end of file diff --git a/eval-queue/TheBloke/CodeLlama-13B-Python-fp16_eval_request_False_float16_Original.json b/eval-queue/TheBloke/CodeLlama-13B-Python-fp16_eval_request_False_float16_Original.json new file mode 100644 index 0000000000000000000000000000000000000000..e81446768298c25a31d4a45e80c4d01925ab3888 --- /dev/null +++ b/eval-queue/TheBloke/CodeLlama-13B-Python-fp16_eval_request_False_float16_Original.json @@ -0,0 +1 @@ +{"model": "TheBloke/CodeLlama-13B-Python-fp16", "base_model": "petra", "revision": "main", "private": false, "precision": "float16", "weight_type": "Original", "status": "FINISHED", "submitted_time": "2023-09-09T10:52:17Z", "model_type": "\ud83d\udd36 : fine-tuned", "job_id": "511655", "license": "llama2", "likes": 26, "params": 13.016} \ No newline at end of file diff --git a/eval-queue/TheBloke/CodeLlama-34B-Instruct-fp16_eval_request_False_float16_Original.json b/eval-queue/TheBloke/CodeLlama-34B-Instruct-fp16_eval_request_False_float16_Original.json new file mode 100644 index 0000000000000000000000000000000000000000..ef5f0548e84995f7fe20a211cbd2785434626eae --- /dev/null +++ b/eval-queue/TheBloke/CodeLlama-34B-Instruct-fp16_eval_request_False_float16_Original.json @@ -0,0 +1 @@ +{"model": "TheBloke/CodeLlama-34B-Instruct-fp16", "base_model": "", "revision": "main", "private": false, "precision": "float16", "weight_type": "Original", "status": "FINISHED", "submitted_time": "2023-09-09T10:52:17Z", "model_type": "\u2b55 : instruction-tuned", "job_id": "511557", "license": "llama2", "likes": 9, "params": 33.744} \ No newline at end of file diff --git a/eval-queue/TheBloke/CodeLlama-7B-Instruct-GGML_eval_request_False_8bit_Original.json b/eval-queue/TheBloke/CodeLlama-7B-Instruct-GGML_eval_request_False_8bit_Original.json new file mode 100644 index 0000000000000000000000000000000000000000..59ee80956f085404cd5e297bab5d61b2b328acca --- /dev/null +++ b/eval-queue/TheBloke/CodeLlama-7B-Instruct-GGML_eval_request_False_8bit_Original.json @@ -0,0 +1 @@ +{"model": "TheBloke/CodeLlama-7B-Instruct-GGML", "base_model": "", "revision": "main", "private": false, "precision": "8bit", "weight_type": "Original", "status": "FAILED", "submitted_time": "2023-09-27T18:36:44Z", "model_type": "\u2b55 : instruction-tuned", "job_id": "479754", "license": "llama2", "likes": 16, "params": 6.607} \ No newline at end of file diff --git a/eval-queue/TheBloke/CodeLlama-7B-Python-GGML_eval_request_False_8bit_Original.json b/eval-queue/TheBloke/CodeLlama-7B-Python-GGML_eval_request_False_8bit_Original.json new file mode 100644 index 0000000000000000000000000000000000000000..19bff034cb5066cc1b6364a48e4597edba489b3d --- /dev/null +++ b/eval-queue/TheBloke/CodeLlama-7B-Python-GGML_eval_request_False_8bit_Original.json @@ -0,0 +1 @@ +{"model": "TheBloke/CodeLlama-7B-Python-GGML", "base_model": "", "revision": "main", "private": false, "precision": "8bit", "weight_type": "Original", "status": "FAILED", "submitted_time": "2023-09-27T18:35:10Z", "model_type": "\ud83d\udd36 : fine-tuned", "job_id": "479747", "license": "llama2", "likes": 24, "params": 6.607} \ No newline at end of file diff --git a/eval-queue/TheBloke/DiscoLM-mixtral-8x7b-v2-GPTQ_eval_request_False_GPTQ_Original.json b/eval-queue/TheBloke/DiscoLM-mixtral-8x7b-v2-GPTQ_eval_request_False_GPTQ_Original.json new file mode 100644 index 0000000000000000000000000000000000000000..10bc6349b786f1a5696b7a03a8c7a566ca59115d --- /dev/null +++ 
b/eval-queue/TheBloke/DiscoLM-mixtral-8x7b-v2-GPTQ_eval_request_False_GPTQ_Original.json
@@ -0,0 +1,16 @@
+{
+    "model": "TheBloke/DiscoLM-mixtral-8x7b-v2-GPTQ",
+    "base_model": "",
+    "revision": "main",
+    "private": false,
+    "precision": "GPTQ",
+    "weight_type": "Original",
+    "status": "FAILED",
+    "submitted_time": "2023-12-12T05:14:31Z",
+    "model_type": "\ud83d\udd36 : fine-tuned",
+    "likes": 14,
+    "params": 48.736,
+    "license": "apache-2.0",
+    "job_id": "898685",
+    "job_start_time": "2023-12-12T05:15:42.110552"
+}
\ No newline at end of file
diff --git a/eval-queue/TheBloke/FashionGPT-70B-v1.2-GPTQ_eval_request_False_GPTQ_Original.json b/eval-queue/TheBloke/FashionGPT-70B-v1.2-GPTQ_eval_request_False_GPTQ_Original.json
new file mode 100644
index 0000000000000000000000000000000000000000..b18fa23908a5c0e2cb4db9dd8f1b459d9783dc55
--- /dev/null
+++ b/eval-queue/TheBloke/FashionGPT-70B-v1.2-GPTQ_eval_request_False_GPTQ_Original.json
@@ -0,0 +1 @@
+{"model": "TheBloke/FashionGPT-70B-v1.2-GPTQ", "base_model": "", "revision": "main", "private": false, "precision": "GPTQ", "weight_type": "Original", "status": "FAILED", "submitted_time": "2023-10-13T00:47:39Z", "model_type": "\ud83d\udd36 : fine-tuned", "license": "llama2", "likes": 1, "params": 72.816, "job_id": "643124"}
\ No newline at end of file
diff --git a/eval-queue/TheBloke/GPlatty-30B-SuperHOT-8K-fp16_eval_request_False_float16_Original.json b/eval-queue/TheBloke/GPlatty-30B-SuperHOT-8K-fp16_eval_request_False_float16_Original.json
new file mode 100644
index 0000000000000000000000000000000000000000..c1c9427d2e9250dce310567062d35c1271437628
--- /dev/null
+++ b/eval-queue/TheBloke/GPlatty-30B-SuperHOT-8K-fp16_eval_request_False_float16_Original.json
@@ -0,0 +1 @@
+{"model": "TheBloke/GPlatty-30B-SuperHOT-8K-fp16", "base_model": "", "revision": "main", "private": false, "precision": "float16", "weight_type": "Original", "status": "FINISHED", "submitted_time": "2023-11-06T10:31:15Z", "model_type": "fine-tuned", "job_id": "639018", "license": "other", "likes": 1, "params": 32.316}
\ No newline at end of file
diff --git a/eval-queue/TheBloke/Genz-70b-GPTQ_eval_request_False_float16_Original.json b/eval-queue/TheBloke/Genz-70b-GPTQ_eval_request_False_float16_Original.json
new file mode 100644
index 0000000000000000000000000000000000000000..d8174041620638ed904bda5a520f7d7d3709f1f1
--- /dev/null
+++ b/eval-queue/TheBloke/Genz-70b-GPTQ_eval_request_False_float16_Original.json
@@ -0,0 +1 @@
+{"model": "TheBloke/Genz-70b-GPTQ", "base_model": "", "revision": "main", "private": false, "precision": "GPTQ", "weight_type": "Original", "status": "FINISHED", "submitted_time": "2023-11-06T10:31:15Z", "model_type": "\u2b55 : instruction-tuned", "job_id": "641707", "license": "llama2", "likes": 34, "params": 77.576}
\ No newline at end of file
diff --git a/eval-queue/TheBloke/Guanaco-3B-Uncensored-v2-GPTQ_eval_request_False_GPTQ_Original.json b/eval-queue/TheBloke/Guanaco-3B-Uncensored-v2-GPTQ_eval_request_False_GPTQ_Original.json
new file mode 100644
index 0000000000000000000000000000000000000000..11d4692ac20751a8b242b2cffd4c4aac81b1cd0a
--- /dev/null
+++ b/eval-queue/TheBloke/Guanaco-3B-Uncensored-v2-GPTQ_eval_request_False_GPTQ_Original.json
@@ -0,0 +1 @@
+{"model": "TheBloke/Guanaco-3B-Uncensored-v2-GPTQ", "base_model": "", "revision": "main", "private": false, "precision": "GPTQ", "weight_type": "Original", "status": "FINISHED", "submitted_time": "2023-10-16T13:19:55Z", "model_type": "\ud83d\udd36 : fine-tuned", "job_id": "522880", "license": "apache-2.0", "likes": 8, "params": 4.776}
\ No newline at end of file
diff --git a/eval-queue/TheBloke/Kimiko-13B-fp16_eval_request_False_float16_Original.json b/eval-queue/TheBloke/Kimiko-13B-fp16_eval_request_False_float16_Original.json
new file mode 100644
index 0000000000000000000000000000000000000000..440bd8b77d8d646772d867a7589c5c10911311a9
--- /dev/null
+++ b/eval-queue/TheBloke/Kimiko-13B-fp16_eval_request_False_float16_Original.json
@@ -0,0 +1 @@
+{"model": "TheBloke/Kimiko-13B-fp16", "base_model": "", "revision": "main", "private": false, "precision": "float16", "weight_type": "Original", "status": "FINISHED", "submitted_time": "2023-09-09T10:52:17Z", "model_type": "fine-tuned", "job_id": "512761", "license": "other", "likes": 2, "params": 12.852}
\ No newline at end of file
diff --git a/eval-queue/TheBloke/Kimiko-v2-13B-fp16_eval_request_False_float16_Original.json b/eval-queue/TheBloke/Kimiko-v2-13B-fp16_eval_request_False_float16_Original.json
new file mode 100644
index 0000000000000000000000000000000000000000..1c41ae873db69db701e55ca30a1af147e6d02813
--- /dev/null
+++ b/eval-queue/TheBloke/Kimiko-v2-13B-fp16_eval_request_False_float16_Original.json
@@ -0,0 +1 @@
+{"model": "TheBloke/Kimiko-v2-13B-fp16", "base_model": "", "revision": "main", "private": false, "precision": "float16", "weight_type": "Original", "status": "FINISHED", "submitted_time": "2023-09-09T10:52:17Z", "model_type": "\ud83d\udd36 : fine-tuned", "job_id": "512301", "license": "llama2", "likes": 2, "params": 12.852}
\ No newline at end of file
diff --git a/eval-queue/TheBloke/LLaMa-65B-GPTQ-3bit_eval_request_False_False_False.json b/eval-queue/TheBloke/LLaMa-65B-GPTQ-3bit_eval_request_False_False_False.json
new file mode 100644
index 0000000000000000000000000000000000000000..da9635c1409ae7e2e51a3788fb6e2ef8439b087c
--- /dev/null
+++ b/eval-queue/TheBloke/LLaMa-65B-GPTQ-3bit_eval_request_False_False_False.json
@@ -0,0 +1 @@
+{"model": "TheBloke/LLaMa-65B-GPTQ-3bit", "base_model": "", "revision": "main", "private": false, "status": "FAILED", "submitted_time": "2023-08-25T13:06:20Z", "weight_type": "Original", "precision": "float16", "job_id": "392413", "params": 520.0, "license": "?", "likes": 0}
\ No newline at end of file
diff --git a/eval-queue/TheBloke/Lemur-70B-Chat-v1-GPTQ_eval_request_False_GPTQ_Original.json b/eval-queue/TheBloke/Lemur-70B-Chat-v1-GPTQ_eval_request_False_GPTQ_Original.json
new file mode 100644
index 0000000000000000000000000000000000000000..a4a94d0c85c5952b4395522303d23c51d5f25d3d
--- /dev/null
+++ b/eval-queue/TheBloke/Lemur-70B-Chat-v1-GPTQ_eval_request_False_GPTQ_Original.json
@@ -0,0 +1 @@
+{"model": "TheBloke/Lemur-70B-Chat-v1-GPTQ", "base_model": "", "revision": "main", "private": false, "precision": "GPTQ", "weight_type": "Original", "status": "FINISHED", "submitted_time": "2023-11-06T10:31:15Z", "model_type": "fine-tuned", "job_id": "634895", "license": "cc-by-nc-4.0", "likes": 2, "params": 72.824}
\ No newline at end of file
diff --git a/eval-queue/TheBloke/Llama-2-13B-GPTQ_eval_request_False_float16_Original.json b/eval-queue/TheBloke/Llama-2-13B-GPTQ_eval_request_False_float16_Original.json
new file mode 100644
index 0000000000000000000000000000000000000000..d61eb9f407226bee811ffbc2722fda34378cf99c
--- /dev/null
+++ b/eval-queue/TheBloke/Llama-2-13B-GPTQ_eval_request_False_float16_Original.json
@@ -0,0 +1 @@
+{"model": "TheBloke/Llama-2-13B-GPTQ", "base_model": "", "revision": "gptq-4bit-128g-actorder_True", "private": false, "precision": "float16", "weight_type": "Original", "status": "FINISHED", "submitted_time": "2023-10-16T12:54:17Z", "model_type": "\ud83d\udd36 : fine-tuned", "job_id": "518541", "license": "llama2", "likes": 99, "params": 16.232}
\ No newline at end of file
diff --git a/eval-queue/TheBloke/Llama-2-13B-fp16_eval_request_False_False_False.json b/eval-queue/TheBloke/Llama-2-13B-fp16_eval_request_False_False_False.json
new file mode 100644
index 0000000000000000000000000000000000000000..304c1abdb5fd133ba8497e58c4ea7e7758e3dc05
--- /dev/null
+++ b/eval-queue/TheBloke/Llama-2-13B-fp16_eval_request_False_False_False.json
@@ -0,0 +1 @@
+{"model": "TheBloke/Llama-2-13B-fp16", "base_model": "TheBloke/Llama-2-13B-fp16", "revision": "main", "private": false, "status": "FINISHED", "submitted_time": "2023-09-09T10:52:17Z", "weight_type": "Original", "precision": "float16", "job_id": "513035", "model_type": "pretrained", "license": "?", "likes": 51, "params": 12.852}
\ No newline at end of file
diff --git a/eval-queue/TheBloke/Llama-2-70B-chat-GPTQ_eval_request_False_GPTQ_Original.json b/eval-queue/TheBloke/Llama-2-70B-chat-GPTQ_eval_request_False_GPTQ_Original.json
new file mode 100644
index 0000000000000000000000000000000000000000..91d17ac7b375be9411081b72de4be060bb3aec57
--- /dev/null
+++ b/eval-queue/TheBloke/Llama-2-70B-chat-GPTQ_eval_request_False_GPTQ_Original.json
@@ -0,0 +1 @@
+{"model": "TheBloke/Llama-2-70B-chat-GPTQ", "base_model": "", "revision": "main", "private": false, "precision": "GPTQ", "weight_type": "Original", "status": "FINISHED", "submitted_time": "2023-11-06T10:31:15Z", "model_type": "\ud83d\udd36 : fine-tuned", "job_id": "641582", "license": "llama2", "likes": 204, "params": 72.816}
\ No newline at end of file
diff --git a/eval-queue/TheBloke/Llama-2-70B-fp16_eval_request_False_float16_Original.json b/eval-queue/TheBloke/Llama-2-70B-fp16_eval_request_False_float16_Original.json
new file mode 100644
index 0000000000000000000000000000000000000000..91f871a56724371e7df18861b4ee0b02a9cbd80c
--- /dev/null
+++ b/eval-queue/TheBloke/Llama-2-70B-fp16_eval_request_False_float16_Original.json
@@ -0,0 +1 @@
+{"model": "TheBloke/Llama-2-70B-fp16", "base_model": "", "revision": "main", "private": false, "precision": "float16", "weight_type": "Original", "status": "FINISHED", "submitted_time": "2023-09-09T10:52:17Z", "model_type": "fine-tuned", "job_id": "512319", "license": "other", "likes": 40, "params": 68.977}
\ No newline at end of file
diff --git a/eval-queue/TheBloke/Llama-2-7B-GGUF_eval_request_False_float16_Original.json b/eval-queue/TheBloke/Llama-2-7B-GGUF_eval_request_False_float16_Original.json
new file mode 100644
index 0000000000000000000000000000000000000000..af976fe9d3be5c6ae851c992f4e211804ee7a761
--- /dev/null
+++ b/eval-queue/TheBloke/Llama-2-7B-GGUF_eval_request_False_float16_Original.json
@@ -0,0 +1 @@
+{"model": "TheBloke/Llama-2-7B-GGUF", "base_model": "", "revision": "main", "private": false, "precision": "float16", "weight_type": "Original", "status": "FAILED", "submitted_time": "2023-09-27T18:25:27Z", "model_type": "\ud83d\udd36 : fine-tuned", "job_id": "479688", "license": "llama2", "likes": 46, "params": 6.607}
\ No newline at end of file
diff --git a/eval-queue/TheBloke/Llama-2-7b-Chat-AWQ_eval_request_False_float16_Original.json b/eval-queue/TheBloke/Llama-2-7b-Chat-AWQ_eval_request_False_float16_Original.json
new file mode 100644
index 0000000000000000000000000000000000000000..51da8181590a5a39f1df9f6657e5e7feff6404fa
--- /dev/null
+++ b/eval-queue/TheBloke/Llama-2-7b-Chat-AWQ_eval_request_False_float16_Original.json
@@ -0,0 +1 @@
+{"model": "TheBloke/Llama-2-7b-Chat-AWQ", "base_model": "", "revision": "main", "private": false, "precision": "float16", "weight_type": "Original", "status": "FINISHED", "submitted_time": "2023-10-16T12:46:18Z", "model_type": "\ud83d\udd36 : fine-tuned", "job_id": "515383", "license": "llama2", "likes": 1, "params": 1.129}
\ No newline at end of file
diff --git a/eval-queue/TheBloke/LongChat-13B-GPTQ_eval_request_False_4bit_Original.json b/eval-queue/TheBloke/LongChat-13B-GPTQ_eval_request_False_4bit_Original.json
new file mode 100644
index 0000000000000000000000000000000000000000..2226521c62fd953444a8a03471f0166d885b9f9c
--- /dev/null
+++ b/eval-queue/TheBloke/LongChat-13B-GPTQ_eval_request_False_4bit_Original.json
@@ -0,0 +1 @@
+{"model": "TheBloke/LongChat-13B-GPTQ", "base_model": "", "revision": "main", "private": false, "precision": "4bit", "weight_type": "Original", "status": "FINISHED", "submitted_time": "2023-11-06T10:31:15Z", "job_id": "641705", "license": "other", "likes": 25, "params": 16.224}
\ No newline at end of file
diff --git a/eval-queue/TheBloke/MPT-7B-Instruct-GGML_eval_request_False_float16_Original.json b/eval-queue/TheBloke/MPT-7B-Instruct-GGML_eval_request_False_float16_Original.json
new file mode 100644
index 0000000000000000000000000000000000000000..ac40258a3ee1b94e7f012c2075356c9299d31cab
--- /dev/null
+++ b/eval-queue/TheBloke/MPT-7B-Instruct-GGML_eval_request_False_float16_Original.json
@@ -0,0 +1 @@
+{"model": "TheBloke/MPT-7B-Instruct-GGML", "base_model": "", "revision": "main", "private": false, "precision": "float16", "weight_type": "Original", "status": "FAILED", "submitted_time": "2023-08-25T13:05:50Z", "model_type": "\ud83d\udd36 : fine-tuned", "job_id": "391951", "license": "cc-by-sa-3.0", "likes": 28, "params": 1.311}
\ No newline at end of file
diff --git a/eval-queue/TheBloke/Nous-Capybara-34B-GPTQ_eval_request_False_GPTQ_Original.json b/eval-queue/TheBloke/Nous-Capybara-34B-GPTQ_eval_request_False_GPTQ_Original.json
new file mode 100644
index 0000000000000000000000000000000000000000..234cbd96e490ea43257d6a9ac4705bf85571c629
--- /dev/null
+++ b/eval-queue/TheBloke/Nous-Capybara-34B-GPTQ_eval_request_False_GPTQ_Original.json
@@ -0,0 +1 @@
+{"model": "TheBloke/Nous-Capybara-34B-GPTQ", "base_model": "", "revision": "main", "private": false, "precision": "GPTQ", "weight_type": "Original", "status": "FAILED", "submitted_time": "2023-11-15T01:49:30Z", "model_type": "\ud83d\udd36 : fine-tuned", "likes": 1, "params": 40.912, "license": ["mit"], "job_id": "651929"}
\ No newline at end of file
diff --git a/eval-queue/TheBloke/Nous-Hermes-13B-SuperHOT-8K-fp16_eval_request_False_float16_Original.json b/eval-queue/TheBloke/Nous-Hermes-13B-SuperHOT-8K-fp16_eval_request_False_float16_Original.json
new file mode 100644
index 0000000000000000000000000000000000000000..936d1f1a8014b6de4c4a443466e907a37b9a903d
--- /dev/null
+++ b/eval-queue/TheBloke/Nous-Hermes-13B-SuperHOT-8K-fp16_eval_request_False_float16_Original.json
@@ -0,0 +1 @@
+{"model": "TheBloke/Nous-Hermes-13B-SuperHOT-8K-fp16", "base_model": "", "revision": "main", "private": false, "precision": "float16", "weight_type": "Original", "status": "FINISHED", "submitted_time": "2023-09-09T10:52:17Z", "model_type": "fine-tuned", "job_id": "512792", "license": "other", "likes": 2, "params": 12.852}
\ No newline at end of file
diff --git a/eval-queue/TheBloke/OpenOrca-Platypus2-13B-GPTQ_eval_request_False_GPTQ_Original.json b/eval-queue/TheBloke/OpenOrca-Platypus2-13B-GPTQ_eval_request_False_GPTQ_Original.json
new file mode 100644
index 0000000000000000000000000000000000000000..637ccbc7eac451b45ef254f02b2857fccf45b415
--- /dev/null
+++ b/eval-queue/TheBloke/OpenOrca-Platypus2-13B-GPTQ_eval_request_False_GPTQ_Original.json
@@ -0,0 +1 @@
+{"model": "TheBloke/OpenOrca-Platypus2-13B-GPTQ", "base_model": "", "revision": "gptq-4bit-32g-actorder_True", "private": false, "precision": "GPTQ", "weight_type": "Original", "status": "FINISHED", "submitted_time": "2023-09-20T03:46:26Z", "model_type": "fine-tuned", "job_id": "470316", "license": "cc-by-nc-4.0", "likes": 49, "params": 16.24}
\ No newline at end of file
diff --git a/eval-queue/TheBloke/Phind-CodeLlama-34B-v2-GPTQ_eval_request_False_float16_Original.json b/eval-queue/TheBloke/Phind-CodeLlama-34B-v2-GPTQ_eval_request_False_float16_Original.json
new file mode 100644
index 0000000000000000000000000000000000000000..6db8718eadaaa4a1b928ed57f59caa4a540fcfa1
--- /dev/null
+++ b/eval-queue/TheBloke/Phind-CodeLlama-34B-v2-GPTQ_eval_request_False_float16_Original.json
@@ -0,0 +1 @@
+{"model": "TheBloke/Phind-CodeLlama-34B-v2-GPTQ", "base_model": "", "revision": "main", "private": false, "precision": "float16", "weight_type": "Original", "status": "FAILED", "submitted_time": "2023-09-19T15:17:10Z", "model_type": "\u2b55 : instruction-tuned", "job_id": "470156", "license": "llama2", "likes": 67, "params": 37.504}
\ No newline at end of file
diff --git a/eval-queue/TheBloke/Platypus2-70B-Instruct-GPTQ_eval_request_False_GPTQ_Original.json b/eval-queue/TheBloke/Platypus2-70B-Instruct-GPTQ_eval_request_False_GPTQ_Original.json
new file mode 100644
index 0000000000000000000000000000000000000000..799f2e2eef9e1256a4faf6603e612f680d74de45
--- /dev/null
+++ b/eval-queue/TheBloke/Platypus2-70B-Instruct-GPTQ_eval_request_False_GPTQ_Original.json
@@ -0,0 +1 @@
+{"model": "TheBloke/Platypus2-70B-Instruct-GPTQ", "base_model": "", "revision": "gptq-4bit-64g-actorder_True", "private": false, "precision": "GPTQ", "weight_type": "Original", "status": "FINISHED", "submitted_time": "2023-11-06T10:31:15Z", "model_type": "\ud83d\udd36 : fine-tuned", "job_id": "641656", "license": "cc-by-nc-4.0", "likes": 20, "params": 72.816}
\ No newline at end of file
diff --git a/eval-queue/TheBloke/Project-Baize-v2-7B-GPTQ_eval_request_False_False_False.json b/eval-queue/TheBloke/Project-Baize-v2-7B-GPTQ_eval_request_False_False_False.json
new file mode 100644
index 0000000000000000000000000000000000000000..dc749df39df5926e158a31155f1006f4c3703baa
--- /dev/null
+++ b/eval-queue/TheBloke/Project-Baize-v2-7B-GPTQ_eval_request_False_False_False.json
@@ -0,0 +1 @@
+{"model": "TheBloke/Project-Baize-v2-7B-GPTQ", "base_model": "", "revision": "main", "private": false, "status": "FINISHED", "job_id": "513043", "weight_type": "Original", "precision": "float16", "submitted_time": "2023-09-09T10:52:17Z", "license": "other", "likes": 4, "params": 9.04}
\ No newline at end of file
diff --git a/eval-queue/TheBloke/Speechless-Llama2-Hermes-Orca-Platypus-WizardLM-13B-GPTQ_eval_request_False_float16_Original.json b/eval-queue/TheBloke/Speechless-Llama2-Hermes-Orca-Platypus-WizardLM-13B-GPTQ_eval_request_False_float16_Original.json
new file mode 100644
index 0000000000000000000000000000000000000000..5a45293079912d7ff4d58204eaf836c3f8537898
--- /dev/null
+++ b/eval-queue/TheBloke/Speechless-Llama2-Hermes-Orca-Platypus-WizardLM-13B-GPTQ_eval_request_False_float16_Original.json
@@ -0,0 +1 @@
+{"model": "TheBloke/Speechless-Llama2-Hermes-Orca-Platypus-WizardLM-13B-GPTQ", "base_model": "", "revision": "main", "private": false, "precision": "float16", "weight_type": "Original", "status": "FAILED", "submitted_time": "2023-10-10T16:14:38Z", "model_type": "\ud83d\udd36 : fine-tuned", "job_id": "490514", "license": "llama2", "likes": 19, "params": 16.232}
\ No newline at end of file
diff --git a/eval-queue/TheBloke/StableBeluga-13B-GGML_eval_request_False_4bit_Original.json b/eval-queue/TheBloke/StableBeluga-13B-GGML_eval_request_False_4bit_Original.json
new file mode 100644
index 0000000000000000000000000000000000000000..b90b7ffce1604eff279899f0f0c9b62522c9b23c
--- /dev/null
+++ b/eval-queue/TheBloke/StableBeluga-13B-GGML_eval_request_False_4bit_Original.json
@@ -0,0 +1 @@
+{"model": "TheBloke/StableBeluga-13B-GGML", "base_model": "", "revision": "main", "private": false, "precision": "4bit", "weight_type": "Original", "status": "FAILED", "submitted_time": "2023-08-25T13:06:27Z", "model_type": "pretrained", "job_id": "392474", "license": "llama2", "likes": 34, "params": 6.607}
\ No newline at end of file
diff --git a/eval-queue/TheBloke/StableBeluga2-GPTQ_eval_request_False_4bit_Original.json b/eval-queue/TheBloke/StableBeluga2-GPTQ_eval_request_False_4bit_Original.json
new file mode 100644
index 0000000000000000000000000000000000000000..e4af4bf3f4c65ec0843d57f29993612051397b01
--- /dev/null
+++ b/eval-queue/TheBloke/StableBeluga2-GPTQ_eval_request_False_4bit_Original.json
@@ -0,0 +1 @@
+{"model": "TheBloke/StableBeluga2-GPTQ", "base_model": "gptq_model-4bit--1g", "revision": "main", "private": false, "precision": "4bit", "weight_type": "Original", "status": "FAILED", "submitted_time": "2023-08-25T13:06:34Z", "model_type": "pretrained", "job_id": "392540", "license": "llama2", "likes": 89, "params": 72.816}
\ No newline at end of file
diff --git a/eval-queue/TheBloke/Synthia-7B-v1.3-AWQ_eval_request_False_float16_Original.json b/eval-queue/TheBloke/Synthia-7B-v1.3-AWQ_eval_request_False_float16_Original.json
new file mode 100644
index 0000000000000000000000000000000000000000..108f60defb4453ff7153a295d15d26a5d9274ece
--- /dev/null
+++ b/eval-queue/TheBloke/Synthia-7B-v1.3-AWQ_eval_request_False_float16_Original.json
@@ -0,0 +1 @@
+{"model": "TheBloke/Synthia-7B-v1.3-AWQ", "base_model": "", "revision": "main", "private": false, "precision": "float16", "weight_type": "Original", "status": "FAILED", "submitted_time": "2023-10-06T09:58:22Z", "model_type": "\ud83d\udd36 : fine-tuned", "job_id": "486876", "license": "apache-2.0", "likes": 5, "params": 1.196}
\ No newline at end of file
diff --git a/eval-queue/TheBloke/Synthia-7B-v1.3-GPTQ_eval_request_False_GPTQ_Original.json b/eval-queue/TheBloke/Synthia-7B-v1.3-GPTQ_eval_request_False_GPTQ_Original.json
new file mode 100644
index 0000000000000000000000000000000000000000..57577d8b8f7c11db580dc3d42670e5cff07b0e6f
--- /dev/null
+++ b/eval-queue/TheBloke/Synthia-7B-v1.3-GPTQ_eval_request_False_GPTQ_Original.json
@@ -0,0 +1 @@
+{"model": "TheBloke/Synthia-7B-v1.3-GPTQ", "base_model": "", "revision": "main", "private": false, "precision": "GPTQ", "weight_type": "Original", "status": "FAILED", "submitted_time": "2023-10-09T07:54:01Z", "model_type": "\ud83d\udd36 : fine-tuned", "job_id": null, "license": "apache-2.0", "likes": 6, "params": 9.592}
\ No newline at end of file
diff --git a/eval-queue/TheBloke/Tess-XL-v1.0-GPTQ_eval_request_False_GPTQ_Original.json b/eval-queue/TheBloke/Tess-XL-v1.0-GPTQ_eval_request_False_GPTQ_Original.json
new file mode 100644
index 0000000000000000000000000000000000000000..60f555f7cc1da05122e2948efa7305cf5a97631b
--- /dev/null
+++ b/eval-queue/TheBloke/Tess-XL-v1.0-GPTQ_eval_request_False_GPTQ_Original.json
@@ -0,0 +1,16 @@
+{
+    "model": "TheBloke/Tess-XL-v1.0-GPTQ",
+    "base_model": "",
+    "revision": "main",
+    "private": false,
+    "precision": "GPTQ",
+    "weight_type": "Original",
+    "status": "FAILED",
+    "submitted_time": "2023-11-26T09:39:54Z",
+    "model_type": "\u2b55 : instruction-tuned",
+    "likes": 2,
+    "params": 121.712,
+    "license": "llama2",
+    "job_id": "801968",
+    "job_start_time": "2023-11-28T01:05:33.619060"
+}
\ No newline at end of file
diff --git a/eval-queue/TheBloke/UltraLM-13B-fp16_eval_request_False_False_False.json b/eval-queue/TheBloke/UltraLM-13B-fp16_eval_request_False_False_False.json
new file mode 100644
index 0000000000000000000000000000000000000000..b27cdb67e8d58df2b18bc45cdd6a7414deb443a5
--- /dev/null
+++ b/eval-queue/TheBloke/UltraLM-13B-fp16_eval_request_False_False_False.json
@@ -0,0 +1 @@
+{"model": "TheBloke/UltraLM-13B-fp16", "base_model": "", "revision": "main", "private": false, "status": "FINISHED", "submitted_time": "2023-09-09T10:52:17Z", "job_id": "512509", "weight_type": "Original", "precision": "float16", "model_type": "fine-tuned", "license": "other", "likes": 4, "params": 12.852}
\ No newline at end of file
diff --git a/eval-queue/TheBloke/VicUnlocked-30B-LoRA-HF_eval_request_False_False_False.json b/eval-queue/TheBloke/VicUnlocked-30B-LoRA-HF_eval_request_False_False_False.json
new file mode 100644
index 0000000000000000000000000000000000000000..6440c7f318ccc94a2848704b32a8f542816e91d1
--- /dev/null
+++ b/eval-queue/TheBloke/VicUnlocked-30B-LoRA-HF_eval_request_False_False_False.json
@@ -0,0 +1 @@
+{"model": "TheBloke/VicUnlocked-30B-LoRA-HF", "base_model": "", "revision": "main", "private": false, "status": "FINISHED", "weight_type": "Original", "precision": "float16", "job_id": "513349", "submitted_time": "2023-09-09T10:52:17Z", "license": "other", "likes": 1, "params": 32.316}
\ No newline at end of file
diff --git a/eval-queue/TheBloke/VicUnlocked-30B-LoRA-HF_eval_request_False_float16_Original.json b/eval-queue/TheBloke/VicUnlocked-30B-LoRA-HF_eval_request_False_float16_Original.json
new file mode 100644
index 0000000000000000000000000000000000000000..51b69d81dd704a7a7f99d81447728edc5869d918
--- /dev/null
+++ b/eval-queue/TheBloke/VicUnlocked-30B-LoRA-HF_eval_request_False_float16_Original.json
@@ -0,0 +1 @@
+{"model": "TheBloke/VicUnlocked-30B-LoRA-HF", "base_model": "TheBloke/Llama-2-70B-fp16", "revision": "main", "private": false, "precision": "float16", "weight_type": "Original", "status": "FINISHED", "submitted_time": "2023-09-09T10:52:17Z", "model_type": "\ud83d\udd36 : fine-tuned", "job_id": "512491", "license": "other", "likes": 1, "params": 32.316}
\ No newline at end of file
diff --git a/eval-queue/TheBloke/VicUnlocked-alpaca-65B-QLoRA-fp16_eval_request_False_False_False.json b/eval-queue/TheBloke/VicUnlocked-alpaca-65B-QLoRA-fp16_eval_request_False_False_False.json
new file mode 100644
index 0000000000000000000000000000000000000000..894e533655a8e4d445af25ff641467b63cd83fc0
--- /dev/null
+++ b/eval-queue/TheBloke/VicUnlocked-alpaca-65B-QLoRA-fp16_eval_request_False_False_False.json
@@ -0,0 +1 @@
+{"model": "TheBloke/VicUnlocked-alpaca-65B-QLoRA-fp16", "base_model": "", "revision": "main", "private": false, "status": "FINISHED", "weight_type": "Original", "precision": "float16", "job_id": "512503", "model_type": "fine-tuned", "submitted_time": "2023-09-09T10:52:17Z", "license": "other", "likes": 9, "params": 65.024}
\ No newline at end of file
diff --git a/eval-queue/TheBloke/Vicuna-13B-CoT-fp16_eval_request_False_float16_Original.json b/eval-queue/TheBloke/Vicuna-13B-CoT-fp16_eval_request_False_float16_Original.json
new file mode 100644
index 0000000000000000000000000000000000000000..91bd72f1bfaf388058ea814250c1217ee4a6261d
--- /dev/null
+++ b/eval-queue/TheBloke/Vicuna-13B-CoT-fp16_eval_request_False_float16_Original.json
@@ -0,0 +1 @@
+{"model": "TheBloke/Vicuna-13B-CoT-fp16", "base_model": "", "revision": "main", "private": false, "precision": "float16", "weight_type": "Original", "status": "FINISHED", "submitted_time": "2023-09-09T10:52:17Z", "model_type": "fine-tuned", "job_id": "511847", "license": "other", "likes": 3, "params": 12.852}
\ No newline at end of file
diff --git a/eval-queue/TheBloke/Vicuna-33B-1-3-SuperHOT-8K-fp16_eval_request_False_float16_Original.json b/eval-queue/TheBloke/Vicuna-33B-1-3-SuperHOT-8K-fp16_eval_request_False_float16_Original.json
new file mode 100644
index 0000000000000000000000000000000000000000..94d6d2f5da9a7c9cca9e9b49ac9c986d9e261148
--- /dev/null
+++ b/eval-queue/TheBloke/Vicuna-33B-1-3-SuperHOT-8K-fp16_eval_request_False_float16_Original.json
@@ -0,0 +1 @@
+{"model": "TheBloke/Vicuna-33B-1-3-SuperHOT-8K-fp16", "base_model": "", "revision": "main", "private": false, "precision": "float16", "weight_type": "Original", "status": "FINISHED", "submitted_time": "2023-11-06T10:31:15Z", "model_type": "fine-tuned", "job_id": "636984", "license": "other", "likes": 6, "params": 32.316}
\ No newline at end of file
diff --git a/eval-queue/TheBloke/Wizard-Vicuna-13B-Uncensored-HF_eval_request_False_False_False.json b/eval-queue/TheBloke/Wizard-Vicuna-13B-Uncensored-HF_eval_request_False_False_False.json
new file mode 100644
index 0000000000000000000000000000000000000000..2170d7fa1e1723ab4142c37298ba7086d7fdd682
--- /dev/null
+++ b/eval-queue/TheBloke/Wizard-Vicuna-13B-Uncensored-HF_eval_request_False_False_False.json
@@ -0,0 +1 @@
+{"model": "TheBloke/Wizard-Vicuna-13B-Uncensored-HF", "base_model": "", "revision": "main", "private": false, "status": "FINISHED", "job_id": "513151", "weight_type": "Original", "precision": "float16", "model_type": "fine-tuned", "submitted_time": "2023-09-09T10:52:17Z", "license": "other", "likes": 197, "params": 12.852}
\ No newline at end of file
diff --git a/eval-queue/TheBloke/Wizard-Vicuna-30B-Uncensored-GPTQ_eval_request_False_False_False.json b/eval-queue/TheBloke/Wizard-Vicuna-30B-Uncensored-GPTQ_eval_request_False_False_False.json
new file mode 100644
index 0000000000000000000000000000000000000000..1085d1b1c8b517c463d26d95e64ef6c4dcbd8268
--- /dev/null
+++ b/eval-queue/TheBloke/Wizard-Vicuna-30B-Uncensored-GPTQ_eval_request_False_False_False.json
@@ -0,0 +1 @@
+{"model": "TheBloke/Wizard-Vicuna-30B-Uncensored-GPTQ", "base_model": "", "revision": "main", "private": false, "status": "FINISHED", "weight_type": "Original", "precision": "float16", "job_id": "640078", "submitted_time": "2023-11-06T10:31:15Z", "license": "other", "likes": 380, "params": 35.584}
\ No newline at end of file
diff --git a/eval-queue/TheBloke/Wizard-Vicuna-30B-Uncensored-fp16_eval_request_False_False_False.json b/eval-queue/TheBloke/Wizard-Vicuna-30B-Uncensored-fp16_eval_request_False_False_False.json
new file mode 100644
index 0000000000000000000000000000000000000000..83940936000ea5fbf3dee854eb5fb87380e35b75
--- /dev/null
+++ b/eval-queue/TheBloke/Wizard-Vicuna-30B-Uncensored-fp16_eval_request_False_False_False.json
@@ -0,0 +1 @@
+{"model": "TheBloke/Wizard-Vicuna-30B-Uncensored-fp16", "base_model": "", "revision": "main", "private": false, "status": "FINISHED", "job_id": "506114", "weight_type": "Original", "precision": "float16", "model_type": "fine-tuned", "submitted_time": "2023-09-09T10:52:17Z", "license": "other", "likes": 16, "params": 32.316}
\ No newline at end of file
diff --git a/eval-queue/TheBloke/Wizard-Vicuna-7B-Uncensored-HF_eval_request_False_False_False.json b/eval-queue/TheBloke/Wizard-Vicuna-7B-Uncensored-HF_eval_request_False_False_False.json
new file mode 100644
index 0000000000000000000000000000000000000000..a9c2b62582da4f1025682162dc347d6e9216ecc9
--- /dev/null
+++ b/eval-queue/TheBloke/Wizard-Vicuna-7B-Uncensored-HF_eval_request_False_False_False.json
@@ -0,0 +1 @@
+{"model": "TheBloke/Wizard-Vicuna-7B-Uncensored-HF", "base_model": "", "revision": "main", "private": false, "status": "FINISHED", "job_id": "513109", "weight_type": "Original", "precision": "float16", "model_type": "fine-tuned", "submitted_time": "2023-09-09T10:52:17Z", "license": "other", "likes": 19, "params": 6.607}
\ No newline at end of file
diff --git a/eval-queue/TheBloke/WizardLM-13B-V1-1-SuperHOT-8K-fp16_eval_request_False_float16_Original.json b/eval-queue/TheBloke/WizardLM-13B-V1-1-SuperHOT-8K-fp16_eval_request_False_float16_Original.json
new file mode 100644
index 0000000000000000000000000000000000000000..72ce50ddc6ad2f72f00f1c7d04c1a1894371dcd3
--- /dev/null
+++ b/eval-queue/TheBloke/WizardLM-13B-V1-1-SuperHOT-8K-fp16_eval_request_False_float16_Original.json
@@ -0,0 +1 @@
+{"model": "TheBloke/WizardLM-13B-V1-1-SuperHOT-8K-fp16", "base_model": "", "revision": "main", "private": false, "precision": "float16", "weight_type": "Original", "status": "FINISHED", "submitted_time": "2023-09-09T10:52:17Z", "model_type": "fine-tuned", "job_id": "511591", "license": "other", "likes": 4, "params": 12.852}
\ No newline at end of file
diff --git a/eval-queue/TheBloke/WizardLM-30B-Uncensored-GPTQ_eval_request_False_False_False.json b/eval-queue/TheBloke/WizardLM-30B-Uncensored-GPTQ_eval_request_False_False_False.json
new file mode 100644
index 0000000000000000000000000000000000000000..ea4fa99bc35e357620eb4e6ca8ce65464161ca09
--- /dev/null
+++ b/eval-queue/TheBloke/WizardLM-30B-Uncensored-GPTQ_eval_request_False_False_False.json
@@ -0,0 +1 @@
+{"model": "TheBloke/WizardLM-30B-Uncensored-GPTQ", "base_model": "", "revision": "main", "private": false, "status": "FINISHED", "weight_type": "Original", "precision": "float16", "job_id": "634828", "submitted_time": "2023-11-06T10:31:15Z", "license": "other", "likes": 107, "params": 35.584}
\ No newline at end of file
diff --git a/eval-queue/TheBloke/WizardLM-30B-fp16_eval_request_False_False_False.json b/eval-queue/TheBloke/WizardLM-30B-fp16_eval_request_False_False_False.json
new file mode 100644
index 0000000000000000000000000000000000000000..ef78810f88a4f3a3eaeb132dc8062e0fa7212427
--- /dev/null
+++ b/eval-queue/TheBloke/WizardLM-30B-fp16_eval_request_False_False_False.json
@@ -0,0 +1 @@
+{"model": "TheBloke/WizardLM-30B-fp16", "base_model": "", "revision": "main", "private": false, "status": "FINISHED", "submitted_time": "2023-09-09T10:52:17Z", "weight_type": "Original", "precision": "float16", "job_id": "511830", "model_type": "fine-tuned", "license": "other", "likes": 10, "params": 32.316}
\ No newline at end of file
diff --git a/eval-queue/TheBloke/WizardLM-30B-fp16_eval_request_False_float16_Original.json b/eval-queue/TheBloke/WizardLM-30B-fp16_eval_request_False_float16_Original.json
new file mode 100644
index 0000000000000000000000000000000000000000..33539d47ad5243d4038f710a8f3be359780874a2
--- /dev/null
+++ b/eval-queue/TheBloke/WizardLM-30B-fp16_eval_request_False_float16_Original.json
@@ -0,0 +1 @@
+{"model": "TheBloke/WizardLM-30B-fp16", "base_model": "", "revision": "main", "private": false, "precision": "float16", "weight_type": "Original", "status": "FINISHED", "submitted_time": "2023-09-09T10:52:17Z", "model_type": "fine-tuned", "job_id": "512886", "license": "other", "likes": 10, "params": 32.316}
\ No newline at end of file
diff --git a/eval-queue/TheBloke/WizardLM-33B-V1.0-Uncensored-GPTQ_eval_request_False_float16_Original.json b/eval-queue/TheBloke/WizardLM-33B-V1.0-Uncensored-GPTQ_eval_request_False_float16_Original.json
new file mode 100644
index 0000000000000000000000000000000000000000..2b681bec71aaaa992fe569eb43f505210d2a826f
--- /dev/null
+++ b/eval-queue/TheBloke/WizardLM-33B-V1.0-Uncensored-GPTQ_eval_request_False_float16_Original.json
@@ -0,0 +1 @@
+{"model": "TheBloke/WizardLM-33B-V1.0-Uncensored-GPTQ", "base_model": "", "revision": "gptq-4bit-64g-actorder_True", "private": false, "precision": "float16", "weight_type": "Original", "status": "FINISHED", "submitted_time": "2023-09-09T10:52:17Z", "model_type": "fine-tuned", "job_id": "512485", "license": "other", "likes": 37, "params": 35.584}
\ No newline at end of file
diff --git a/eval-queue/TheBloke/WizardLM-70B-V1.0-GPTQ_eval_request_False_GPTQ_Original.json b/eval-queue/TheBloke/WizardLM-70B-V1.0-GPTQ_eval_request_False_GPTQ_Original.json
new file mode 100644
index 0000000000000000000000000000000000000000..733c41388fd5e83aa573c2223f5b1a4cb757c7ab
--- /dev/null
+++ b/eval-queue/TheBloke/WizardLM-70B-V1.0-GPTQ_eval_request_False_GPTQ_Original.json
@@ -0,0 +1 @@
+{"model": "TheBloke/WizardLM-70B-V1.0-GPTQ", "base_model": "", "revision": "main", "private": false, "precision": "GPTQ", "weight_type": "Original", "status": "FINISHED", "submitted_time": "2023-11-06T10:31:15Z", "model_type": "\ud83d\udd36 : fine-tuned", "job_id": "641215", "license": "llama2", "likes": 26, "params": 72.824}
\ No newline at end of file
diff --git a/eval-queue/TheBloke/WizardLM-7B-uncensored-GPTQ_eval_request_False_False_False.json b/eval-queue/TheBloke/WizardLM-7B-uncensored-GPTQ_eval_request_False_False_False.json
new file mode 100644
index 0000000000000000000000000000000000000000..e2b14f79122aa8bb3c9cd45a7ed5ad5f59f7be59
--- /dev/null
+++ b/eval-queue/TheBloke/WizardLM-7B-uncensored-GPTQ_eval_request_False_False_False.json
@@ -0,0 +1 @@
+{"model": "TheBloke/WizardLM-7B-uncensored-GPTQ", "base_model": "", "revision": "main", "private": false, "status": "FINISHED", "job_id": "510621", "weight_type": "Original", "precision": "float16", "submitted_time": "2023-09-09T10:52:17Z", "license": "other", "likes": 150, "params": 9.04}
\ No newline at end of file
diff --git a/eval-queue/TheBloke/WizardLM-Uncensored-SuperCOT-StoryTelling-30B-GPTQ_eval_request_False_False_False.json b/eval-queue/TheBloke/WizardLM-Uncensored-SuperCOT-StoryTelling-30B-GPTQ_eval_request_False_False_False.json
new file mode 100644
index 0000000000000000000000000000000000000000..4049a1438b44b51087063fc979b8e8d7dbd71b30
--- /dev/null
+++ b/eval-queue/TheBloke/WizardLM-Uncensored-SuperCOT-StoryTelling-30B-GPTQ_eval_request_False_False_False.json
@@ -0,0 +1 @@
+{"model": "TheBloke/WizardLM-Uncensored-SuperCOT-StoryTelling-30B-GPTQ", "base_model": "", "revision": "main", "private": false, "status": "FINISHED", "submitted_time": "2023-11-06T10:31:15Z", "job_id": "641731", "weight_type": "Original", "precision": "float16", "license": "other", "likes": 69, "params": 35.584}
\ No newline at end of file
diff --git a/eval-queue/TheBloke/WizardMath-13B-V1.0-GGML_eval_request_False_float16_Original.json b/eval-queue/TheBloke/WizardMath-13B-V1.0-GGML_eval_request_False_float16_Original.json
new file mode 100644
index 0000000000000000000000000000000000000000..4c6d87eb0432d00b09404dfb21533e3800443cce
--- /dev/null
+++ b/eval-queue/TheBloke/WizardMath-13B-V1.0-GGML_eval_request_False_float16_Original.json
@@ -0,0 +1 @@
+{"model": "TheBloke/WizardMath-13B-V1.0-GGML", "base_model": "", "revision": "main", "private": false, "precision": "float16", "weight_type": "Original", "status": "FAILED", "submitted_time": "2023-08-29T14:05:17Z", "model_type": "\ud83d\udd36 : fine-tuned", "job_id": "396908", "license": "llama2", "likes": 13, "params": 6.607}
\ No newline at end of file
diff --git a/eval-queue/TheBloke/WizardMath-70B-V1.0-GGML_eval_request_False_float16_Original.json b/eval-queue/TheBloke/WizardMath-70B-V1.0-GGML_eval_request_False_float16_Original.json
new file mode 100644
index 0000000000000000000000000000000000000000..d76353836491bc2143481a20823f3116a7d1966b
--- /dev/null
+++ b/eval-queue/TheBloke/WizardMath-70B-V1.0-GGML_eval_request_False_float16_Original.json
@@ -0,0 +1 @@
+{"model": "TheBloke/WizardMath-70B-V1.0-GGML", "base_model": "", "revision": "main", "private": false, "precision": "float16", "weight_type": "Original", "status": "FAILED", "submitted_time": "2023-08-25T13:06:27Z", "model_type": "\ud83d\udd36 : fine-tuned", "job_id": "392476", "license": "llama2", "likes": 10, "params": 6.607}
\ No newline at end of file
diff --git a/eval-queue/TheBloke/Yarn-Llama-2-7B-128K-GGML_eval_request_False_float16_Original.json b/eval-queue/TheBloke/Yarn-Llama-2-7B-128K-GGML_eval_request_False_float16_Original.json
new file mode 100644
index 0000000000000000000000000000000000000000..b453df41a4481747332fc9c21629a1471c0515c1
--- /dev/null
+++ b/eval-queue/TheBloke/Yarn-Llama-2-7B-128K-GGML_eval_request_False_float16_Original.json
@@ -0,0 +1 @@
+{"model": "TheBloke/Yarn-Llama-2-7B-128K-GGML", "base_model": "", "revision": "main", "private": false, "precision": "float16", "weight_type": "Original", "status": "FAILED", "submitted_time": "2023-09-27T18:34:11Z", "model_type": "\ud83d\udd36 : fine-tuned", "job_id": "479735", "license": "llama2", "likes": 5, "params": 6.607}
\ No newline at end of file
diff --git a/eval-queue/TheBloke/airoboros-33B-gpt4-1-4-SuperHOT-8K-fp16_eval_request_False_float16_Original.json b/eval-queue/TheBloke/airoboros-33B-gpt4-1-4-SuperHOT-8K-fp16_eval_request_False_float16_Original.json
new file mode 100644
index 0000000000000000000000000000000000000000..668a3c9e544aa9e7600f8dbc1b12095dd1a711c0
--- /dev/null
+++ b/eval-queue/TheBloke/airoboros-33B-gpt4-1-4-SuperHOT-8K-fp16_eval_request_False_float16_Original.json
@@ -0,0 +1 @@
+{"model": "TheBloke/airoboros-33B-gpt4-1-4-SuperHOT-8K-fp16", "base_model": "", "revision": "main", "private": false, "precision": "float16", "weight_type": "Original", "status": "FINISHED", "submitted_time": "2023-11-06T10:31:15Z", "model_type": "fine-tuned", "job_id": "634723", "license": "other", "likes": 5, "params": 32.316}
\ No newline at end of file
diff --git a/eval-queue/TheBloke/airoboros-7b-gpt4-fp16_eval_request_False_False_False.json b/eval-queue/TheBloke/airoboros-7b-gpt4-fp16_eval_request_False_False_False.json
new file mode 100644
index 0000000000000000000000000000000000000000..77eeba6b4044aa49934c32e48580510ee1b725f3
--- /dev/null
+++ b/eval-queue/TheBloke/airoboros-7b-gpt4-fp16_eval_request_False_False_False.json
@@ -0,0 +1 @@
+{"model": "TheBloke/airoboros-7b-gpt4-fp16", "base_model": "", "revision": "main", "private": false, "status": "FINISHED", "submitted_time": "2023-09-09T10:52:17Z", "job_id": "511639", "weight_type": "Original", "precision": "float16", "model_type": "fine-tuned", "license": "other", "likes": 3, "params": 6.607}
\ No newline at end of file
diff --git a/eval-queue/TheBloke/alpaca-lora-65B-HF_eval_request_False_False_False.json b/eval-queue/TheBloke/alpaca-lora-65B-HF_eval_request_False_False_False.json
new file mode 100644
index 0000000000000000000000000000000000000000..ae11a55a06695de8c8bac3a4ef4f97bc5a95c49f
--- /dev/null
+++ b/eval-queue/TheBloke/alpaca-lora-65B-HF_eval_request_False_False_False.json
@@ -0,0 +1 @@
+{"model": "TheBloke/alpaca-lora-65B-HF", "base_model": "", "revision": "main", "private": false, "status": "FINISHED", "weight_type": "Original", "precision": "float16", "job_id": "512309", "model_type": "fine-tuned", "submitted_time": "2023-09-09T10:52:17Z", "license": "other", "likes": 3, "params": 65.024}
\ No newline at end of file
diff --git a/eval-queue/TheBloke/chronos-wizardlm-uc-scot-st-13B-GPTQ_eval_request_False_False_False.json b/eval-queue/TheBloke/chronos-wizardlm-uc-scot-st-13B-GPTQ_eval_request_False_False_False.json
new file mode 100644
index 0000000000000000000000000000000000000000..6f8594d9dff41483e3550712f696c9815dbe1781
--- /dev/null
+++ b/eval-queue/TheBloke/chronos-wizardlm-uc-scot-st-13B-GPTQ_eval_request_False_False_False.json
@@ -0,0 +1 @@
+{"model": "TheBloke/chronos-wizardlm-uc-scot-st-13B-GPTQ", "base_model": "", "revision": "main", "private": false, "status": "FINISHED", "submitted_time": "2023-11-06T10:31:15Z", "job_id": "641292", "weight_type": "Original", "precision": "float16", "license": "other", "likes": 6, "params": 16.224}
\ No newline at end of file
diff --git a/eval-queue/TheBloke/dolphin-2.1-mistral-7B-GPTQ_eval_request_False_float16_Original.json b/eval-queue/TheBloke/dolphin-2.1-mistral-7B-GPTQ_eval_request_False_float16_Original.json
new file mode 100644
index 0000000000000000000000000000000000000000..9469b76579472d2167fe5103fd72105fe9c819be
--- /dev/null
+++ b/eval-queue/TheBloke/dolphin-2.1-mistral-7B-GPTQ_eval_request_False_float16_Original.json
@@ -0,0 +1 @@
+{"model": "TheBloke/dolphin-2.1-mistral-7B-GPTQ", "base_model": "", "revision": "main", "private": false, "precision": "float16", "weight_type": "Original", "status": "FAILED", "submitted_time": "2023-10-27T15:15:56Z", "model_type": "\ud83d\udd36 : fine-tuned", "likes": 25, "params": 9.592, "license": "apache-2.0", "job_id": "649779"}
\ No newline at end of file
diff --git a/eval-queue/TheBloke/dolphin-2.5-mixtral-8x7b-GPTQ_eval_request_False_GPTQ_Original.json b/eval-queue/TheBloke/dolphin-2.5-mixtral-8x7b-GPTQ_eval_request_False_GPTQ_Original.json
new file mode 100644
index 0000000000000000000000000000000000000000..703d8ae99c8a1001410b5987a94074318c3afd44
--- /dev/null
+++ b/eval-queue/TheBloke/dolphin-2.5-mixtral-8x7b-GPTQ_eval_request_False_GPTQ_Original.json
@@ -0,0 +1,16 @@
+{
+    "model": "TheBloke/dolphin-2.5-mixtral-8x7b-GPTQ",
+    "base_model": "",
+    "revision": "main",
+    "private": false,
+    "precision": "GPTQ",
+    "weight_type": "Original",
+    "status": "FAILED",
+    "submitted_time": "2023-12-18T00:41:04Z",
+    "model_type": "\ud83d\udd36 : fine-tuned",
+    "likes": 8,
+    "params": 48.744,
+    "license": "apache-2.0",
+    "job_id": "934307",
+    "job_start_time": "2023-12-18T00:42:02.974233"
+}
\ No newline at end of file
diff --git a/eval-queue/TheBloke/fiction.live-Kimiko-V2-70B-fp16_eval_request_False_float16_Original.json b/eval-queue/TheBloke/fiction.live-Kimiko-V2-70B-fp16_eval_request_False_float16_Original.json
new file mode 100644
index 0000000000000000000000000000000000000000..066c89acb4af73ddc6a1d6f8885f065ab437b0b1
--- /dev/null
+++ b/eval-queue/TheBloke/fiction.live-Kimiko-V2-70B-fp16_eval_request_False_float16_Original.json
@@ -0,0 +1 @@
+{"model": "TheBloke/fiction.live-Kimiko-V2-70B-fp16", "base_model": "", "revision": "main", "private": false, "precision": "float16", "weight_type": "Original", "status": "FINISHED", "submitted_time": "2023-09-09T10:52:17Z", "model_type": "\ud83d\udd36 : fine-tuned", "job_id": "513149", "license": "llama2", "likes": 3, "params": 68.715}
\ No newline at end of file
diff --git a/eval-queue/TheBloke/goliath-120b-GPTQ_eval_request_False_GPTQ_Original.json b/eval-queue/TheBloke/goliath-120b-GPTQ_eval_request_False_GPTQ_Original.json
new file mode 100644
index 0000000000000000000000000000000000000000..19d60a5236450c6b0854a33a0be13f87f97e3077
--- /dev/null
+++ b/eval-queue/TheBloke/goliath-120b-GPTQ_eval_request_False_GPTQ_Original.json
@@ -0,0 +1,16 @@
+{
+    "model": "TheBloke/goliath-120b-GPTQ",
+    "base_model": "",
+    "revision": "main",
+    "private": false,
+    "precision": "GPTQ",
+    "weight_type": "Original",
+    "status": "FAILED",
+    "submitted_time": "2023-11-26T09:13:58Z",
+    "model_type": "\ud83d\udfe2 : pretrained",
+    "likes": 4,
+    "params": 121.712,
+    "license": "llama2",
+    "job_id": "801952",
+    "job_start_time": "2023-11-28T00:46:53.042281"
+}
\ No newline at end of file
diff --git a/eval-queue/TheBloke/gpt4-alpaca-lora-13B-HF_eval_request_False_False_False.json b/eval-queue/TheBloke/gpt4-alpaca-lora-13B-HF_eval_request_False_False_False.json
new file mode 100644
index 0000000000000000000000000000000000000000..0450cb4faf6c0184c7b5bfa780069a911f7193c9
--- /dev/null
+++ b/eval-queue/TheBloke/gpt4-alpaca-lora-13B-HF_eval_request_False_False_False.json
@@ -0,0 +1 @@
+{"model": "TheBloke/gpt4-alpaca-lora-13B-HF", "base_model": "", "revision": "main", "private": false, "status": "FINISHED", "job_id": "513113", "weight_type": "Original", "precision": "float16", "model_type": "fine-tuned", "submitted_time": "2023-09-09T10:52:17Z", "license": "other", "likes": 4, "params": 12.852}
\ No newline at end of file
diff --git a/eval-queue/TheBloke/gpt4-alpaca-lora-30b-HF_eval_request_False_False_False.json b/eval-queue/TheBloke/gpt4-alpaca-lora-30b-HF_eval_request_False_False_False.json
new file mode 100644
index 0000000000000000000000000000000000000000..0b8be29ce8c4ffe008337245a049a16ce2787f43
--- /dev/null
+++ b/eval-queue/TheBloke/gpt4-alpaca-lora-30b-HF_eval_request_False_False_False.json
@@ -0,0 +1 @@
+{"model": "TheBloke/gpt4-alpaca-lora-30b-HF", "base_model": "", "revision": "main", "private": false, "status": "FINISHED", "weight_type": "Original", "precision": "float16", "job_id": "461832", "submitted_time": "2023-09-09T10:52:17Z", "license": "other", "likes": 9, "params": 32.316}
\ No newline at end of file
diff --git a/eval-queue/TheBloke/gpt4-alpaca-lora-30b-HF_eval_request_False_float16_Original.json b/eval-queue/TheBloke/gpt4-alpaca-lora-30b-HF_eval_request_False_float16_Original.json
new file mode 100644
index 0000000000000000000000000000000000000000..d8a8d67c869a8ddafb0280d312c3596b3fa2563c
--- /dev/null
+++ b/eval-queue/TheBloke/gpt4-alpaca-lora-30b-HF_eval_request_False_float16_Original.json
@@ -0,0 +1 @@
+{"model": "TheBloke/gpt4-alpaca-lora-30b-HF", "base_model": "", "revision": "main", "private": false, "precision": "float16", "weight_type": "Original", "status": "FINISHED", "submitted_time": "2023-09-09T10:52:17Z", "model_type": "pretrained", "job_id": "361031", "license": "other", "likes": 9, "params": 32.316}
\ No newline at end of file
diff --git a/eval-queue/TheBloke/gpt4-alpaca-lora_mlp-65B-HF_eval_request_False_False_False.json b/eval-queue/TheBloke/gpt4-alpaca-lora_mlp-65B-HF_eval_request_False_False_False.json
new file mode 100644
index 0000000000000000000000000000000000000000..0be6404fae5254eac706c3a84ea9ba4332f997b3
--- /dev/null
+++ b/eval-queue/TheBloke/gpt4-alpaca-lora_mlp-65B-HF_eval_request_False_False_False.json
@@ -0,0 +1 @@
+{"model": "TheBloke/gpt4-alpaca-lora_mlp-65B-HF", "base_model": "", "revision": "main", "private": false, "status": "FINISHED", "weight_type": "Original", "precision": "float16", "job_id": "512804", "model_type": "fine-tuned", "submitted_time": "2023-09-09T10:52:17Z", "license": "other", "likes": 7, "params": 65.024}
\ No newline at end of file
diff --git a/eval-queue/TheBloke/gpt4-x-vicuna-13B-HF_eval_request_False_False_False.json b/eval-queue/TheBloke/gpt4-x-vicuna-13B-HF_eval_request_False_False_False.json
new file mode 100644
index 0000000000000000000000000000000000000000..00455eba660598cd3c757f541b77b0d962ba08fc
--- /dev/null
+++ b/eval-queue/TheBloke/gpt4-x-vicuna-13B-HF_eval_request_False_False_False.json
@@ -0,0 +1 @@
+{"model": "TheBloke/gpt4-x-vicuna-13B-HF", "base_model": "", "revision": "main", "private": false, "status": "FINISHED", "job_id": "512282", "weight_type": "Original", "precision": "float16", "model_type": "fine-tuned", "submitted_time": "2023-09-09T10:52:17Z", "params": 13.0, "license": "?", "likes": 0}
\ No newline at end of file
diff --git a/eval-queue/TheBloke/guanaco-33B-GPTQ_eval_request_False_False_False.json b/eval-queue/TheBloke/guanaco-33B-GPTQ_eval_request_False_False_False.json
new file mode 100644
index 0000000000000000000000000000000000000000..09d9d45c960d6f7262fff2c771eb9d5c57886fc4
--- /dev/null
+++ b/eval-queue/TheBloke/guanaco-33B-GPTQ_eval_request_False_False_False.json
@@ -0,0 +1 @@
+{"model": "TheBloke/guanaco-33B-GPTQ", "base_model": "", "revision": "main", "private": false, "status": "FINISHED", "weight_type": "Original", "precision": "float16", "job_id": "641277", "submitted_time": "2023-11-06T10:31:15Z", "license": "other", "likes": 71, "params": 35.584}
\ No newline at end of file
diff --git a/eval-queue/TheBloke/landmark-attention-llama7b-fp16_eval_request_False_float16_Original.json b/eval-queue/TheBloke/landmark-attention-llama7b-fp16_eval_request_False_float16_Original.json
new file mode 100644
index 0000000000000000000000000000000000000000..125672aa967c0e33e547e4c225fda10274375456
--- /dev/null
+++ b/eval-queue/TheBloke/landmark-attention-llama7b-fp16_eval_request_False_float16_Original.json
@@ -0,0 +1 @@
+{"model": "TheBloke/landmark-attention-llama7b-fp16", "base_model": "", "revision": "main", "private": false, "precision": "float16", "weight_type": "Original", "status": "FINISHED", "submitted_time": "2023-09-09T10:52:17Z", "model_type": "fine-tuned", "job_id": "512324", "license": "other", "likes": 8, "params": 6.607}
\ No newline at end of file
diff --git a/eval-queue/TheBloke/llama-2-70b-Guanaco-QLoRA-fp16_eval_request_False_False_False.json b/eval-queue/TheBloke/llama-2-70b-Guanaco-QLoRA-fp16_eval_request_False_False_False.json
new file mode 100644
index 0000000000000000000000000000000000000000..4d3b972c47af5b2e6db7509c65d61791f78d89de
--- /dev/null
+++ b/eval-queue/TheBloke/llama-2-70b-Guanaco-QLoRA-fp16_eval_request_False_False_False.json
@@ -0,0 +1 @@
+{"model": "TheBloke/llama-2-70b-Guanaco-QLoRA-fp16", "base_model": "", "revision": "main", "private": false, "status": "FINISHED", "submitted_time": "2023-09-09T10:52:17Z", "weight_type": "Original", "precision": "float16", "job_id": "510379", "model_type": "fine-tuned", "license": "other", "likes": 53, "params": 68.715}
\ No newline at end of file
diff --git a/eval-queue/TheBloke/manticore-13b-chat-pyg-GPTQ_eval_request_False_False_False.json b/eval-queue/TheBloke/manticore-13b-chat-pyg-GPTQ_eval_request_False_False_False.json
new file mode 100644
index 0000000000000000000000000000000000000000..37750be938a7484ab2b129aaaaaf8cc124042935
--- /dev/null
+++ b/eval-queue/TheBloke/manticore-13b-chat-pyg-GPTQ_eval_request_False_False_False.json
@@ -0,0 +1 @@
+{"model": "TheBloke/manticore-13b-chat-pyg-GPTQ", "base_model": "", "revision": "main", "private": false, "status": "FINISHED", "job_id": "641309", "weight_type": "Original", "precision": "float16", "submitted_time": "2023-11-06T10:31:15Z", "license": "other", "likes": 33, "params": 16.224}
\ No newline at end of file
diff --git a/eval-queue/TheBloke/medalpaca-13B-GPTQ-4bit_eval_request_False_False_False.json b/eval-queue/TheBloke/medalpaca-13B-GPTQ-4bit_eval_request_False_False_False.json
new file mode 100644
index 0000000000000000000000000000000000000000..f31b385f8c7e639cc3cf18652de93256dc22b956
--- /dev/null
+++ b/eval-queue/TheBloke/medalpaca-13B-GPTQ-4bit_eval_request_False_False_False.json
@@ -0,0 +1 @@
+{"model": "TheBloke/medalpaca-13B-GPTQ-4bit", "base_model": "", "revision": "main", "private": false, "status": "FINISHED", "job_id": "634888", "weight_type": "Original", "precision": "float16", "submitted_time": "2023-11-06T10:31:15Z", "license": "other", "likes": 28, "params": 16.216}
\ No newline at end of file
diff --git a/eval-queue/TheBloke/model_007-70B-GPTQ_eval_request_False_GPTQ_Original.json b/eval-queue/TheBloke/model_007-70B-GPTQ_eval_request_False_GPTQ_Original.json
new file mode 100644
index 0000000000000000000000000000000000000000..cea6bb0d4a1e07270103b6a7252e7f6ca232274d
--- /dev/null
+++ b/eval-queue/TheBloke/model_007-70B-GPTQ_eval_request_False_GPTQ_Original.json
@@ -0,0 +1 @@
+{"model": "TheBloke/model_007-70B-GPTQ", "base_model": "", "revision": "main", "private": false, "precision": "GPTQ", "weight_type": "Original", "status": "FAILED", "submitted_time": "2023-09-05T23:16:49Z", "model_type": "\ud83d\udd36 : fine-tuned", "job_id": "437261", "license": "llama2", "likes": 1, "params": 72.816}
\ No newline at end of file
diff --git a/eval-queue/TheBloke/neural-chat-7B-v3-2-GPTQ_eval_request_False_GPTQ_Original.json b/eval-queue/TheBloke/neural-chat-7B-v3-2-GPTQ_eval_request_False_GPTQ_Original.json
new file mode 100644
index 0000000000000000000000000000000000000000..f25eff81fa6ad826592a4947a52253622dde3e78
--- /dev/null
+++ b/eval-queue/TheBloke/neural-chat-7B-v3-2-GPTQ_eval_request_False_GPTQ_Original.json
@@ -0,0 +1,16 @@
+{
+    "model": "TheBloke/neural-chat-7B-v3-2-GPTQ",
+    "base_model": "Intel/neural-chat-7b-v3-2",
+    "revision": "main",
+    "private": false,
+    "precision": "GPTQ",
+    "weight_type": "Original",
+    "status": "FINISHED",
+    "submitted_time": "2023-12-10T21:27:34Z",
+    "model_type": "\ud83d\udd36 : fine-tuned",
+    "likes": 1,
+    "params": 9.592,
+    "license": "apache-2.0",
+    "job_id": "888717",
+    "job_start_time": "2023-12-10T21:28:41.813345"
+}
\ No newline at end of file
diff --git a/eval-queue/TheBloke/open-instruct-human-mix-65B-fp16_eval_request_False_float16_Original.json b/eval-queue/TheBloke/open-instruct-human-mix-65B-fp16_eval_request_False_float16_Original.json
new file mode 100644
index 0000000000000000000000000000000000000000..d9f1d762f00be1a042825c7cee178e85e6cbaeae
--- /dev/null
+++ b/eval-queue/TheBloke/open-instruct-human-mix-65B-fp16_eval_request_False_float16_Original.json
@@ -0,0 +1,16 @@
+{
+    "model": "TheBloke/open-instruct-human-mix-65B-fp16",
+    "base_model": "",
+    "revision": "main",
+    "private": false,
+    "precision": "float16",
+    "weight_type": "Original",
+    "status": "FAILED",
+    "submitted_time": "2023-12-11T20:16:14Z",
+    "model_type": "\u2b55 : instruction-tuned",
+    "likes": 0,
+    "params": 65.286,
+    "license": "other",
+    "job_id": "897001",
+    "job_start_time": "2023-12-12T01:35:05.144047"
+}
\ No newline at end of file
diff --git a/eval-queue/TheBloke/openbuddy-openllama-7B-v12-bf16-GGUF_eval_request_False_float16_Original.json b/eval-queue/TheBloke/openbuddy-openllama-7B-v12-bf16-GGUF_eval_request_False_float16_Original.json
new file mode 100644
index 0000000000000000000000000000000000000000..d03d44860475c511c90c9cd59b29700b5805a478
--- /dev/null
+++ b/eval-queue/TheBloke/openbuddy-openllama-7B-v12-bf16-GGUF_eval_request_False_float16_Original.json
@@ -0,0 +1 @@
+{"model": "TheBloke/openbuddy-openllama-7B-v12-bf16-GGUF", "base_model": "", "revision": "main", "private": false, "precision": "float16", "weight_type": "Original", "status": "FAILED", "submitted_time": "2023-09-27T18:33:19Z", "model_type": "\ud83d\udd36 : fine-tuned", "job_id": "479733", "license": "apache-2.0", "likes": 2, "params": 6.607}
\ No newline at end of file
diff --git a/eval-queue/TheBloke/openbuddy-openllama-7B-v12-bf16-GPTQ_eval_request_False_GPTQ_Original.json b/eval-queue/TheBloke/openbuddy-openllama-7B-v12-bf16-GPTQ_eval_request_False_GPTQ_Original.json
new file mode 100644
index 0000000000000000000000000000000000000000..a75afb21bc67bcc7231ffd5e3a4f1d9d977168af
--- /dev/null
+++ b/eval-queue/TheBloke/openbuddy-openllama-7B-v12-bf16-GPTQ_eval_request_False_GPTQ_Original.json
@@ -0,0 +1 @@
+{"model": "TheBloke/openbuddy-openllama-7B-v12-bf16-GPTQ", "base_model": "", "revision": "main", "private": false, "precision": "GPTQ", "weight_type": "Original", "status": "FAILED", "submitted_time": "2023-09-27T18:32:38Z", "model_type": "\ud83d\udd36 : fine-tuned", "job_id": "479730", "license": "apache-2.0", "likes": 0, "params": 9.384}
\ No newline at end of file
diff --git a/eval-queue/TheBloke/openchat_v2_openorca_preview-GPTQ_eval_request_False_False_False.json b/eval-queue/TheBloke/openchat_v2_openorca_preview-GPTQ_eval_request_False_False_False.json
new file mode 100644
index 0000000000000000000000000000000000000000..7bcf6a9332cbd2d0c55bde877ef8c4ec064095bb
--- /dev/null
+++ b/eval-queue/TheBloke/openchat_v2_openorca_preview-GPTQ_eval_request_False_False_False.json
@@ -0,0 +1 @@
+{"model": "TheBloke/openchat_v2_openorca_preview-GPTQ", "base_model": "", "revision": "main", "private": false, "status": "FINISHED", "submitted_time": "2023-11-06T10:31:15Z", "weight_type": "Original", "precision": "float16", "job_id": "641701", "license": "other", "likes": 14, "params": 16.224}
\ No newline at end of file
diff --git a/eval-queue/TheBloke/orca_mini_13B-GPTQ_eval_request_False_False_False.json b/eval-queue/TheBloke/orca_mini_13B-GPTQ_eval_request_False_False_False.json
new file mode 100644
index 0000000000000000000000000000000000000000..7c82dfa8af01ef8c2951e5c1604675f231ca5aea
--- /dev/null
+++ b/eval-queue/TheBloke/orca_mini_13B-GPTQ_eval_request_False_False_False.json
@@ -0,0 +1 @@
+{"model": "TheBloke/orca_mini_13B-GPTQ", "base_model": "", "revision": "main", "private": false, "status": "FINISHED", "submitted_time": "2023-11-06T10:31:15Z", "job_id": "634858", "weight_type": "Original", "precision": "float16", "license": "mit", "likes": 43, "params": 16.224}
\ No newline at end of file
diff --git a/eval-queue/TheBloke/orca_mini_v3_70B-GPTQ_eval_request_False_GPTQ_Original.json b/eval-queue/TheBloke/orca_mini_v3_70B-GPTQ_eval_request_False_GPTQ_Original.json
new file mode 100644
index 0000000000000000000000000000000000000000..80a33662bfd0b72a8b40c55223eb17a936a401f0
--- /dev/null
+++ b/eval-queue/TheBloke/orca_mini_v3_70B-GPTQ_eval_request_False_GPTQ_Original.json
@@ -0,0 +1 @@
+{"model": "TheBloke/orca_mini_v3_70B-GPTQ", "base_model": "", "revision": "main", "private": false, "precision": "GPTQ", "weight_type": "Original", "status": "FAILED", "submitted_time": "2023-09-05T23:18:15Z", "model_type": "\ud83d\udd36 : fine-tuned", "job_id": "437265", "license": "other", "likes": 10, "params": 72.816}
\ No newline at end of file
diff --git a/eval-queue/TheBloke/orca_mini_v3_7B-GPTQ_eval_request_False_GPTQ_Original.json b/eval-queue/TheBloke/orca_mini_v3_7B-GPTQ_eval_request_False_GPTQ_Original.json
new file mode 100644
index 0000000000000000000000000000000000000000..733a75b49f29e9c2938a3daf2b8e3ac6067eaae9
--- /dev/null
+++ b/eval-queue/TheBloke/orca_mini_v3_7B-GPTQ_eval_request_False_GPTQ_Original.json
@@ -0,0 +1,16 @@
+{
+    "model": "TheBloke/orca_mini_v3_7B-GPTQ",
+    "base_model": "",
+    "revision": "main",
+    "private": false,
+    "precision": "GPTQ",
+    "weight_type": "Original",
+    "status": "FINISHED",
+    "submitted_time": "2023-11-29T21:04:22Z",
+    "model_type": [],
+    "likes": 9,
+    "params": 9.048,
+    "license": "other",
+    "job_id": "845729",
+    "job_start_time": "2023-12-04T09:13:55.121703"
+}
\ No newline at end of file
diff --git a/eval-queue/TheBloke/robin-33B-v2-GPTQ_eval_request_False_False_False.json b/eval-queue/TheBloke/robin-33B-v2-GPTQ_eval_request_False_False_False.json
new file mode 100644
index 0000000000000000000000000000000000000000..cc7557d8b7830b678aaf62dfdaa9429b8963beed
--- /dev/null
+++ b/eval-queue/TheBloke/robin-33B-v2-GPTQ_eval_request_False_False_False.json
@@ -0,0 +1 @@
+{"model": "TheBloke/robin-33B-v2-GPTQ", "base_model": "", "revision": "main", "private": false, "status": "FINISHED", "submitted_time": "2023-11-06T10:31:15Z", "weight_type": "Original", "precision": "float16", "job_id": "634715", "license": "other", "likes": 13, "params": 35.584}
\ No newline at end of file
diff --git a/eval-queue/TheBloke/robin-33B-v2-fp16_eval_request_False_False_False.json b/eval-queue/TheBloke/robin-33B-v2-fp16_eval_request_False_False_False.json
new file mode 100644
index 0000000000000000000000000000000000000000..c486b5a5fc4d70c7cec197d578c61c3f1a92e298
--- /dev/null
+++ b/eval-queue/TheBloke/robin-33B-v2-fp16_eval_request_False_False_False.json
@@ -0,0 +1 @@
+{"model": "TheBloke/robin-33B-v2-fp16", "base_model": "", "revision": "main", "private": false, "status": "FINISHED", "submitted_time": "2023-11-06T10:31:15Z", "weight_type": "Original", "precision": "float16", "job_id": "641639", "model_type": "fine-tuned", "license": "other", "likes": 3, "params": 32.316}
\ No newline at end of file
diff --git a/eval-queue/TheBloke/robin-33B-v2-fp16_eval_request_False_float16_Original.json b/eval-queue/TheBloke/robin-33B-v2-fp16_eval_request_False_float16_Original.json
new file mode 100644
index 0000000000000000000000000000000000000000..3de314a627e3a89101a90fd39e01a153811cf3ae
--- /dev/null
+++ b/eval-queue/TheBloke/robin-33B-v2-fp16_eval_request_False_float16_Original.json
@@ -0,0 +1 @@
+{"model": "TheBloke/robin-33B-v2-fp16", "base_model": "", "revision": "main", "private": false, "precision": "float16", "weight_type": "Original", "status": "FINISHED", "submitted_time": "2023-11-06T10:31:15Z", "model_type": "fine-tuned", "job_id": "641637", "license": "other", "likes": 3, "params": 32.316}
\ No newline at end of file
diff --git a/eval-queue/TheBloke/robin-65b-v2-fp16_eval_request_False_float16_Original.json b/eval-queue/TheBloke/robin-65b-v2-fp16_eval_request_False_float16_Original.json
new file mode 100644
index 0000000000000000000000000000000000000000..224ac1f303ef8ae99113477cfa509efe68425db1
--- /dev/null
+++ b/eval-queue/TheBloke/robin-65b-v2-fp16_eval_request_False_float16_Original.json
@@ -0,0 +1 @@
+{"model": "TheBloke/robin-65b-v2-fp16", "base_model": "", "revision": "main", "private": false, "precision": "float16", "weight_type": "Original", "status": "FINISHED", "submitted_time": "2023-09-09T10:52:17Z", "model_type": "fine-tuned", "job_id": "513237", "license": "other", "likes": 3, "params": 65.024}
\ No newline at end of file
diff --git a/eval-queue/TheBloke/stable-vicuna-13B-GGUF_eval_request_False_8bit_Original.json b/eval-queue/TheBloke/stable-vicuna-13B-GGUF_eval_request_False_8bit_Original.json
new file mode 100644
index 0000000000000000000000000000000000000000..02c96f2435aea7e4903301bcb0d230ca36af9f2f
--- /dev/null
+++ b/eval-queue/TheBloke/stable-vicuna-13B-GGUF_eval_request_False_8bit_Original.json
@@ -0,0 +1 @@
+{"model": "TheBloke/stable-vicuna-13B-GGUF", "base_model": "", "revision": "main", "private": false, "precision": "8bit", "weight_type": "Original", "status": "FAILED", "submitted_time": "2023-09-27T18:42:30Z", "model_type": "\ud83d\udd36 : fine-tuned", "job_id": "479795", "license": "cc-by-nc-sa-4.0", "likes": 1, "params": 6.607}
\ No newline at end of file
diff --git a/eval-queue/TheBloke/tulu-2-dpo-70B-AWQ_eval_request_False_GPTQ_Original.json b/eval-queue/TheBloke/tulu-2-dpo-70B-AWQ_eval_request_False_GPTQ_Original.json
new file mode 100644
index 0000000000000000000000000000000000000000..ab83f4ccdd211372aae8194487dc9810866f2850
--- /dev/null
+++ b/eval-queue/TheBloke/tulu-2-dpo-70B-AWQ_eval_request_False_GPTQ_Original.json
@@ -0,0 +1,16 @@
+{
+    "model": "TheBloke/tulu-2-dpo-70B-AWQ",
+    "base_model": "",
+    "revision": "main",
+    "private": false,
+    "precision": "GPTQ",
+    "weight_type": "Original",
+    "status": "FAILED",
+    "submitted_time": "2023-12-01T05:50:27Z",
+    "model_type": "\ud83d\udd36 : fine-tuned",
+    "likes": 2,
+    "params": 77.472,
+    "license": "other",
+    "job_id": "846013",
+    "job_start_time": "2023-12-04T14:09:50.997377"
+}
\ No newline at end of file
diff --git a/eval-queue/TheBloke/tulu-30B-fp16_eval_request_False_False_False.json b/eval-queue/TheBloke/tulu-30B-fp16_eval_request_False_False_False.json
new file mode 100644
index 0000000000000000000000000000000000000000..1fb60e678db8906a57e06142616758e802ab8ed0
--- /dev/null
+++ b/eval-queue/TheBloke/tulu-30B-fp16_eval_request_False_False_False.json
@@ -0,0 +1 @@
+{"model": "TheBloke/tulu-30B-fp16", "base_model": "", "revision": "main", "private": false, "status": "FINISHED", "submitted_time": "2023-09-09T10:52:17Z", "weight_type": "Original", "precision": "float16", "job_id": "511840", "license": "other", "likes": 5, "params": 32.316}
\ No newline at end of file
diff --git a/eval-queue/TheBloke/vicuna-13B-1.1-HF_eval_request_False_False_False.json b/eval-queue/TheBloke/vicuna-13B-1.1-HF_eval_request_False_False_False.json
new file mode 100644
index 0000000000000000000000000000000000000000..b23974498521eefcd78c5c02f446411c9684881d
--- /dev/null
+++ b/eval-queue/TheBloke/vicuna-13B-1.1-HF_eval_request_False_False_False.json
@@ -0,0 +1 @@
+{"model": "TheBloke/vicuna-13B-1.1-HF", "base_model": "", "revision": "main", "private": false, "status": "FINISHED", "job_id": "513296", "weight_type": "Original", "precision": "float16", "model_type": "fine-tuned", "submitted_time": "2023-09-09T10:52:17Z", "license": "?", "likes": 96, "params": 12.852}
\ No newline at end of file
diff --git a/eval-queue/TheBloke/vicuna-33B-GGUF_eval_request_False_8bit_Original.json b/eval-queue/TheBloke/vicuna-33B-GGUF_eval_request_False_8bit_Original.json
new file mode 100644
index 0000000000000000000000000000000000000000..9ed5133578b01d4e399369858ff40c794ea7db81
--- /dev/null
+++ b/eval-queue/TheBloke/vicuna-33B-GGUF_eval_request_False_8bit_Original.json
@@ -0,0 +1 @@
+{"model": "TheBloke/vicuna-33B-GGUF", "base_model": "", "revision": "main", "private": false, "precision": "8bit", "weight_type": "Original", "status": "FAILED", "submitted_time": "2023-09-27T18:40:36Z", "model_type": "\ud83d\udd36 : fine-tuned", "job_id": "479793", "license": "other", "likes": 4, "params": 6.607}
\ No newline at end of file
diff --git a/eval-queue/TheBloke/wizard-vicuna-13B-GGUF_eval_request_False_8bit_Original.json b/eval-queue/TheBloke/wizard-vicuna-13B-GGUF_eval_request_False_8bit_Original.json
new file mode 100644
index 0000000000000000000000000000000000000000..932b393e8c5f18b176096d5e026329fd788f147c
--- /dev/null
+++ b/eval-queue/TheBloke/wizard-vicuna-13B-GGUF_eval_request_False_8bit_Original.json
@@ -0,0 +1 @@
+{"model": "TheBloke/wizard-vicuna-13B-GGUF", "base_model": "", "revision": "main", "private": false, "precision": "8bit", "weight_type": "Original", "status": "FAILED", "submitted_time": "2023-09-27T18:44:49Z", "model_type": "\ud83d\udd36 : fine-tuned", "job_id": "479799", "license": "other", "likes": 1, "params": 6.607}
\ No newline at end of file
diff --git a/eval-queue/TheBloke/wizard-vicuna-13B-GPTQ_eval_request_False_False_False.json b/eval-queue/TheBloke/wizard-vicuna-13B-GPTQ_eval_request_False_False_False.json
new file mode 100644
index 0000000000000000000000000000000000000000..c6f2f3aa346ca84e48ec6615da81226498ee88d7
--- /dev/null
+++ b/eval-queue/TheBloke/wizard-vicuna-13B-GPTQ_eval_request_False_False_False.json
@@ -0,0 +1 @@
+{"model": "TheBloke/wizard-vicuna-13B-GPTQ", "base_model": "", "revision": "main", "private": false, "status": "FINISHED", "job_id": "641752", "weight_type": "Original", "precision": "float16", "submitted_time": "2023-11-06T10:31:15Z", "license": "other", "likes": 99, "params": 16.216}
\ No newline at end of file
diff --git a/eval-queue/TheBloke/wizard-vicuna-13B-HF_eval_request_False_False_False.json b/eval-queue/TheBloke/wizard-vicuna-13B-HF_eval_request_False_False_False.json
new file mode 100644
index 0000000000000000000000000000000000000000..bab146a37c13491a1731dec26da81f679f7d0d03
--- /dev/null
+++ b/eval-queue/TheBloke/wizard-vicuna-13B-HF_eval_request_False_False_False.json
@@ -0,0 +1 @@
+{"model": "TheBloke/wizard-vicuna-13B-HF", "base_model": "", "revision": "main", "private": false, "status": "FINISHED", "job_id": "511577", "weight_type": "Original", "precision": "float16", "model_type": "fine-tuned", "submitted_time": "2023-09-09T10:52:17Z", "license": "?", "likes": 48, "params": 12.852}
\ No newline at end of file
diff --git a/eval-queue/TheBloke/wizardLM-7B-HF_eval_request_False_False_False.json b/eval-queue/TheBloke/wizardLM-7B-HF_eval_request_False_False_False.json
new file mode 100644
index 0000000000000000000000000000000000000000..8e5fc9d97bbb9481719f7215456fd70b6fba242c
--- /dev/null
+++ b/eval-queue/TheBloke/wizardLM-7B-HF_eval_request_False_False_False.json
@@ -0,0 +1 @@
+{"model": "TheBloke/wizardLM-7B-HF", "base_model": "llama 7B", "revision": "main", "private": false, "status": "FINISHED", "job_id": "641728", "weight_type": "Original", "precision": "float16", "model_type": "fine-tuned", "submitted_time": "2023-11-06T10:31:15Z", "license": "other", "likes": 89, "params": 6.607}
\ No newline at end of file
diff --git a/eval-queue/TinyPixel/elm-test_eval_request_False_bfloat16_Original.json b/eval-queue/TinyPixel/elm-test_eval_request_False_bfloat16_Original.json
new file mode 100644
index 0000000000000000000000000000000000000000..be9637ed10ea86ab730d0147c4a146de417f6ac1
--- /dev/null
+++ b/eval-queue/TinyPixel/elm-test_eval_request_False_bfloat16_Original.json
@@ -0,0 +1 @@
+{"model": "TinyPixel/elm-test", "base_model": "", "revision": "main", "private": false, "precision": "bfloat16", "weight_type": "Original", "status": "FINISHED", "submitted_time": "2023-10-16T13:00:29Z", "model_type": "\ud83d\udd36 : fine-tuned", "job_id": "522279", "license": "?", "likes": 0, "params": 6.607}
\ No newline at end of file
diff --git a/eval-queue/TinyPixel/lima-test_eval_request_False_bfloat16_Original.json b/eval-queue/TinyPixel/lima-test_eval_request_False_bfloat16_Original.json
new file mode 100644
index 0000000000000000000000000000000000000000..7a31c18fec176ef9f2d92e9242e0111477bc2cc2
--- /dev/null
+++ b/eval-queue/TinyPixel/lima-test_eval_request_False_bfloat16_Original.json
@@ -0,0 +1 @@
+{"model": "TinyPixel/lima-test", "base_model": "", "revision": "main", "private": false, "precision": "bfloat16", "weight_type": "Original", "status": "FINISHED", "submitted_time": "2023-09-09T10:52:17Z", "model_type": "\ud83d\udd36 : fine-tuned", "job_id": "504666", "license": "?", "likes": 0, "params": 6.607}
\ No newline at end of file
diff --git a/eval-queue/TinyPixel/llama2-7b-instruct_eval_request_False_bfloat16_Original.json b/eval-queue/TinyPixel/llama2-7b-instruct_eval_request_False_bfloat16_Original.json
new file mode 100644
index 0000000000000000000000000000000000000000..cafaf7c4e5d01beeab92fb39d56b956a5e853f28
--- /dev/null
+++ b/eval-queue/TinyPixel/llama2-7b-instruct_eval_request_False_bfloat16_Original.json
@@ -0,0 +1 @@
+{"model": "TinyPixel/llama2-7b-instruct", "base_model": "", "revision": "main", "private": false, "precision": "bfloat16", "weight_type": "Original", "status": "FINISHED", "submitted_time": "2023-10-16T12:54:17Z", "model_type": "\u2b55 : instruction-tuned", "job_id": "518417", "params": 7.0, "license": "?", "likes": 0}
\ No newline at end of file
diff --git 
a/eval-queue/TinyPixel/llama2-7b-oa_eval_request_False_bfloat16_Original.json b/eval-queue/TinyPixel/llama2-7b-oa_eval_request_False_bfloat16_Original.json new file mode 100644 index 0000000000000000000000000000000000000000..3ca840fee8cfbe552ab44dd8b77c63092735df5e --- /dev/null +++ b/eval-queue/TinyPixel/llama2-7b-oa_eval_request_False_bfloat16_Original.json @@ -0,0 +1 @@ +{"model": "TinyPixel/llama2-7b-oa", "base_model": "", "revision": "main", "private": false, "precision": "bfloat16", "weight_type": "Original", "status": "FINISHED", "submitted_time": "2023-10-16T13:27:38Z", "model_type": "\u2b55 : instruction-tuned", "job_id": "522969", "params": 7.0, "license": "?", "likes": 0} \ No newline at end of file diff --git a/eval-queue/TinyPixel/testmodel-3_eval_request_False_bfloat16_Original.json b/eval-queue/TinyPixel/testmodel-3_eval_request_False_bfloat16_Original.json new file mode 100644 index 0000000000000000000000000000000000000000..c63dfcee5f1b4f39b98a0acc08f4ffc6424f3c6a --- /dev/null +++ b/eval-queue/TinyPixel/testmodel-3_eval_request_False_bfloat16_Original.json @@ -0,0 +1 @@ +{"model": "TinyPixel/testmodel-3", "base_model": "", "revision": "main", "private": false, "precision": "bfloat16", "weight_type": "Original", "status": "FINISHED", "submitted_time": "2023-10-16T13:00:29Z", "model_type": "\ud83d\udd36 : fine-tuned", "job_id": "522186", "license": "?", "likes": 0, "params": 6.607} \ No newline at end of file diff --git a/eval-queue/TinyPixel/testmodel2_eval_request_False_bfloat16_Original.json b/eval-queue/TinyPixel/testmodel2_eval_request_False_bfloat16_Original.json new file mode 100644 index 0000000000000000000000000000000000000000..a5940778c021d5fb607b517c106665bd9d616127 --- /dev/null +++ b/eval-queue/TinyPixel/testmodel2_eval_request_False_bfloat16_Original.json @@ -0,0 +1 @@ +{"model": "TinyPixel/testmodel2", "base_model": "", "revision": "main", "private": false, "precision": "bfloat16", "weight_type": "Original", "status": "FINISHED", "submitted_time": "2023-10-16T12:48:18Z", "model_type": "\ud83d\udd36 : fine-tuned", "job_id": "517111", "license": "?", "likes": 0, "params": 6.607} \ No newline at end of file diff --git a/eval-queue/Undi95/Amethyst-13B-Mistral_eval_request_False_float16_Original.json b/eval-queue/Undi95/Amethyst-13B-Mistral_eval_request_False_float16_Original.json new file mode 100644 index 0000000000000000000000000000000000000000..1d09df53188599508110c5f2ba215f1311aae971 --- /dev/null +++ b/eval-queue/Undi95/Amethyst-13B-Mistral_eval_request_False_float16_Original.json @@ -0,0 +1 @@ +{"model": "Undi95/Amethyst-13B-Mistral", "base_model": "", "revision": "main", "private": false, "precision": "float16", "weight_type": "Original", "status": "FINISHED", "submitted_time": "2023-10-16T12:54:17Z", "model_type": "\ud83d\udd36 : fine-tuned", "job_id": "520704", "license": "cc-by-nc-4.0", "likes": 3, "params": 12.852} \ No newline at end of file diff --git a/eval-queue/Undi95/Amethyst-13B_eval_request_False_float16_Original.json b/eval-queue/Undi95/Amethyst-13B_eval_request_False_float16_Original.json new file mode 100644 index 0000000000000000000000000000000000000000..9798771d47bf4f550b54064c96d0f6d2e429f83a --- /dev/null +++ b/eval-queue/Undi95/Amethyst-13B_eval_request_False_float16_Original.json @@ -0,0 +1 @@ +{"model": "Undi95/Amethyst-13B", "base_model": "", "revision": "main", "private": false, "precision": "float16", "weight_type": "Original", "status": "FINISHED", "submitted_time": "2023-10-16T12:48:18Z", "model_type": "\ud83d\udd36 : fine-tuned", 
"job_id": "517590", "license": "cc-by-nc-4.0", "likes": 1, "params": 12.852} \ No newline at end of file diff --git a/eval-queue/Undi95/Clover3-17B_eval_request_False_float16_Original.json b/eval-queue/Undi95/Clover3-17B_eval_request_False_float16_Original.json new file mode 100644 index 0000000000000000000000000000000000000000..282e262bad8905ab27972bcbd06809f30322186a --- /dev/null +++ b/eval-queue/Undi95/Clover3-17B_eval_request_False_float16_Original.json @@ -0,0 +1,16 @@ +{ + "model": "Undi95/Clover3-17B", + "base_model": "", + "revision": "main", + "private": false, + "precision": "float16", + "weight_type": "Original", + "status": "FINISHED", + "submitted_time": "2023-12-12T00:14:12Z", + "model_type": "\ud83d\udd36 : fine-tuned", + "likes": 0, + "params": 16.839, + "license": "cc-by-nc-4.0", + "job_id": "897077", + "job_start_time": "2023-12-12T01:51:53.622594" +} \ No newline at end of file diff --git a/eval-queue/Undi95/CodeEngine_eval_request_False_float16_Original.json b/eval-queue/Undi95/CodeEngine_eval_request_False_float16_Original.json new file mode 100644 index 0000000000000000000000000000000000000000..4b031a773c6523ffce222ae77d6db3de2d554a40 --- /dev/null +++ b/eval-queue/Undi95/CodeEngine_eval_request_False_float16_Original.json @@ -0,0 +1 @@ +{"model": "Undi95/CodeEngine", "base_model": "", "revision": "main", "private": false, "precision": "float16", "weight_type": "Original", "status": "FINISHED", "submitted_time": "2023-10-16T12:48:18Z", "model_type": "\ud83d\udd36 : fine-tuned", "job_id": "517527", "license": "cc-by-nc-4.0", "likes": 0, "params": 12.852} \ No newline at end of file diff --git a/eval-queue/Undi95/CreativityEngine_eval_request_False_float16_Original.json b/eval-queue/Undi95/CreativityEngine_eval_request_False_float16_Original.json new file mode 100644 index 0000000000000000000000000000000000000000..e6baf3f7bea706205595d1c3a24ff6f01b59ec96 --- /dev/null +++ b/eval-queue/Undi95/CreativityEngine_eval_request_False_float16_Original.json @@ -0,0 +1 @@ +{"model": "Undi95/CreativityEngine", "base_model": "", "revision": "main", "private": false, "precision": "float16", "weight_type": "Original", "status": "FINISHED", "submitted_time": "2023-10-16T12:54:17Z", "model_type": "\ud83d\udd36 : fine-tuned", "job_id": "522071", "license": "cc-by-nc-4.0", "likes": 0, "params": 12.852} \ No newline at end of file diff --git a/eval-queue/Undi95/Dawn-v2-70B_eval_request_False_float16_Original.json b/eval-queue/Undi95/Dawn-v2-70B_eval_request_False_float16_Original.json new file mode 100644 index 0000000000000000000000000000000000000000..aafb4131094e822e2bac14ff51eabb50487b1cb1 --- /dev/null +++ b/eval-queue/Undi95/Dawn-v2-70B_eval_request_False_float16_Original.json @@ -0,0 +1 @@ +{"model": "Undi95/Dawn-v2-70B", "base_model": "", "revision": "main", "private": false, "precision": "float16", "weight_type": "Original", "status": "FAILED", "submitted_time": "2023-11-14T14:54:26Z", "model_type": "\ud83d\udd36 : fine-tuned", "likes": 7, "params": 68.977, "license": "cc-by-nc-4.0", "job_id": "651435"} \ No newline at end of file diff --git a/eval-queue/Undi95/Emerald-13B_eval_request_False_float16_Original.json b/eval-queue/Undi95/Emerald-13B_eval_request_False_float16_Original.json new file mode 100644 index 0000000000000000000000000000000000000000..09a65a9ee8b25a1bdd503f9eb6a02ae1de8eae82 --- /dev/null +++ b/eval-queue/Undi95/Emerald-13B_eval_request_False_float16_Original.json @@ -0,0 +1 @@ +{"model": "Undi95/Emerald-13B", "base_model": "", "revision": "main", "private": 
false, "precision": "float16", "weight_type": "Original", "status": "FINISHED", "submitted_time": "2023-10-16T12:46:18Z", "model_type": "\ud83d\udd36 : fine-tuned", "job_id": "514935", "license": "cc-by-nc-4.0", "likes": 0, "params": 12.852} \ No newline at end of file diff --git a/eval-queue/Undi95/Emerhyst-20B_eval_request_False_float16_Original.json b/eval-queue/Undi95/Emerhyst-20B_eval_request_False_float16_Original.json new file mode 100644 index 0000000000000000000000000000000000000000..2a62a958e986f9e731f155b067a2d33a1e51be20 --- /dev/null +++ b/eval-queue/Undi95/Emerhyst-20B_eval_request_False_float16_Original.json @@ -0,0 +1 @@ +{"model": "Undi95/Emerhyst-20B", "base_model": "", "revision": "main", "private": false, "precision": "float16", "weight_type": "Original", "status": "FINISHED", "submitted_time": "2023-10-16T12:54:17Z", "model_type": "\ud83d\udd36 : fine-tuned", "job_id": "518799", "license": "cc-by-nc-4.0", "likes": 16, "params": 19.994} \ No newline at end of file diff --git a/eval-queue/Undi95/LewdEngine_eval_request_False_float16_Original.json b/eval-queue/Undi95/LewdEngine_eval_request_False_float16_Original.json new file mode 100644 index 0000000000000000000000000000000000000000..e8fc9b32f33aed12fcecef1ddcfe2f7ad57b5d6a --- /dev/null +++ b/eval-queue/Undi95/LewdEngine_eval_request_False_float16_Original.json @@ -0,0 +1 @@ +{"model": "Undi95/LewdEngine", "base_model": "", "revision": "main", "private": false, "precision": "float16", "weight_type": "Original", "status": "FINISHED", "submitted_time": "2023-09-09T10:52:17Z", "model_type": "\ud83d\udd36 : fine-tuned", "job_id": "504780", "license": "cc-by-nc-4.0", "likes": 1, "params": 12.852} \ No newline at end of file diff --git a/eval-queue/Undi95/Llama2-13B-no_robots-alpaca-lora_eval_request_False_float16_Adapter.json b/eval-queue/Undi95/Llama2-13B-no_robots-alpaca-lora_eval_request_False_float16_Adapter.json new file mode 100644 index 0000000000000000000000000000000000000000..999c059e2995bad56020380f3b47871ff0462f35 --- /dev/null +++ b/eval-queue/Undi95/Llama2-13B-no_robots-alpaca-lora_eval_request_False_float16_Adapter.json @@ -0,0 +1 @@ +{"model": "Undi95/Llama2-13B-no_robots-alpaca-lora", "base_model": "TheBloke/Llama-2-13B-fp16", "revision": "main", "private": false, "precision": "float16", "weight_type": "Adapter", "status": "FINISHED", "submitted_time": "2023-11-15T02:17:52Z", "model_type": "\ud83d\udd36 : fine-tuned", "likes": 1, "params": 13.0, "license": "cc-by-nc-4.0", "job_id": "651945"} \ No newline at end of file diff --git a/eval-queue/Undi95/Llamix2-MLewd-4x13B_eval_request_False_float16_Original.json b/eval-queue/Undi95/Llamix2-MLewd-4x13B_eval_request_False_float16_Original.json new file mode 100644 index 0000000000000000000000000000000000000000..3295390f4104674313e409ffc70e808ba1212871 --- /dev/null +++ b/eval-queue/Undi95/Llamix2-MLewd-4x13B_eval_request_False_float16_Original.json @@ -0,0 +1,16 @@ +{ + "model": "Undi95/Llamix2-MLewd-4x13B", + "base_model": "", + "revision": "main", + "private": false, + "precision": "float16", + "weight_type": "Original", + "status": "RUNNING", + "submitted_time": "2023-12-15T17:43:30Z", + "model_type": "\ud83d\udd36 : fine-tuned", + "likes": 8, + "params": 38.497, + "license": "cc-by-nc-4.0", + "job_id": "924697", + "job_start_time": "2023-12-16T13:32:33.846982" +} \ No newline at end of file diff --git a/eval-queue/Undi95/Llamix2-Xwin-MoE-4x13B_eval_request_False_float16_Original.json 
b/eval-queue/Undi95/Llamix2-Xwin-MoE-4x13B_eval_request_False_float16_Original.json new file mode 100644 index 0000000000000000000000000000000000000000..48652519a1bb02442baa8ba272862a59d13e562d --- /dev/null +++ b/eval-queue/Undi95/Llamix2-Xwin-MoE-4x13B_eval_request_False_float16_Original.json @@ -0,0 +1,16 @@ +{ + "model": "Undi95/Llamix2-Xwin-MoE-4x13B", + "base_model": "", + "revision": "main", + "private": false, + "precision": "float16", + "weight_type": "Original", + "status": "FINISHED", + "submitted_time": "2023-12-16T01:58:11Z", + "model_type": "\ud83d\udd36 : fine-tuned", + "likes": 0, + "params": 38.497, + "license": "llama2", + "job_id": "924750", + "job_start_time": "2023-12-16T14:41:31.297694" +} \ No newline at end of file diff --git a/eval-queue/Undi95/MLewd-Chat-v2-13B_eval_request_False_float16_Original.json b/eval-queue/Undi95/MLewd-Chat-v2-13B_eval_request_False_float16_Original.json new file mode 100644 index 0000000000000000000000000000000000000000..9b139ede6c23068155c474fb7387ebe37dc200e5 --- /dev/null +++ b/eval-queue/Undi95/MLewd-Chat-v2-13B_eval_request_False_float16_Original.json @@ -0,0 +1 @@ +{"model": "Undi95/MLewd-Chat-v2-13B", "base_model": "", "revision": "main", "private": false, "precision": "float16", "weight_type": "Original", "status": "FINISHED", "submitted_time": "2023-10-16T12:48:18Z", "model_type": "\ud83d\udd36 : fine-tuned", "job_id": "517454", "license": "cc-by-nc-4.0", "likes": 7, "params": 12.852} \ No newline at end of file diff --git a/eval-queue/Undi95/MLewd-L2-13B_eval_request_False_float16_Original.json b/eval-queue/Undi95/MLewd-L2-13B_eval_request_False_float16_Original.json new file mode 100644 index 0000000000000000000000000000000000000000..22be1a45f4d220b0c02fb6a1adb78a4a33cb872f --- /dev/null +++ b/eval-queue/Undi95/MLewd-L2-13B_eval_request_False_float16_Original.json @@ -0,0 +1 @@ +{"model": "Undi95/MLewd-L2-13B", "base_model": "", "revision": "main", "private": false, "precision": "float16", "weight_type": "Original", "status": "FINISHED", "submitted_time": "2023-09-09T10:52:17Z", "model_type": "\ud83d\udd36 : fine-tuned", "job_id": "504885", "license": "cc-by-nc-4.0", "likes": 1, "params": 12.852} \ No newline at end of file diff --git a/eval-queue/Undi95/MLewd-L2-Chat-13B_eval_request_False_float16_Original.json b/eval-queue/Undi95/MLewd-L2-Chat-13B_eval_request_False_float16_Original.json new file mode 100644 index 0000000000000000000000000000000000000000..165a22cce9c9535bddec04c6fc5d60436f80e0b7 --- /dev/null +++ b/eval-queue/Undi95/MLewd-L2-Chat-13B_eval_request_False_float16_Original.json @@ -0,0 +1 @@ +{"model": "Undi95/MLewd-L2-Chat-13B", "base_model": "", "revision": "main", "private": false, "precision": "float16", "weight_type": "Original", "status": "FINISHED", "submitted_time": "2023-11-06T10:31:15Z", "model_type": "\ud83d\udd36 : fine-tuned", "job_id": "633527", "license": "cc-by-nc-4.0", "likes": 13, "params": 12.852} \ No newline at end of file diff --git a/eval-queue/Undi95/MLewd-ReMM-L2-Chat-20B-Inverted_eval_request_False_float16_Original.json b/eval-queue/Undi95/MLewd-ReMM-L2-Chat-20B-Inverted_eval_request_False_float16_Original.json new file mode 100644 index 0000000000000000000000000000000000000000..123fedff0b1537ba3ee1ba274574731220f6bc0d --- /dev/null +++ b/eval-queue/Undi95/MLewd-ReMM-L2-Chat-20B-Inverted_eval_request_False_float16_Original.json @@ -0,0 +1 @@ +{"model": "Undi95/MLewd-ReMM-L2-Chat-20B-Inverted", "base_model": "", "revision": "main", "private": false, "precision": "float16", "weight_type": 
"Original", "status": "FINISHED", "submitted_time": "2023-10-16T13:27:38Z", "model_type": "\ud83d\udd36 : fine-tuned", "job_id": "523049", "license": "cc-by-nc-4.0", "likes": 1, "params": 19.994} \ No newline at end of file diff --git a/eval-queue/Undi95/MLewd-ReMM-L2-Chat-20B_eval_request_False_float16_Original.json b/eval-queue/Undi95/MLewd-ReMM-L2-Chat-20B_eval_request_False_float16_Original.json new file mode 100644 index 0000000000000000000000000000000000000000..d468da0ac8d4be115034552b4835ea9753860dff --- /dev/null +++ b/eval-queue/Undi95/MLewd-ReMM-L2-Chat-20B_eval_request_False_float16_Original.json @@ -0,0 +1 @@ +{"model": "Undi95/MLewd-ReMM-L2-Chat-20B", "base_model": "", "revision": "main", "private": false, "precision": "float16", "weight_type": "Original", "status": "FINISHED", "submitted_time": "2023-10-16T12:46:18Z", "model_type": "\ud83d\udd36 : fine-tuned", "job_id": "515142", "license": "cc-by-nc-4.0", "likes": 6, "params": 19.994} \ No newline at end of file diff --git a/eval-queue/Undi95/MLewd-v2.4-13B_eval_request_False_float16_Original.json b/eval-queue/Undi95/MLewd-v2.4-13B_eval_request_False_float16_Original.json new file mode 100644 index 0000000000000000000000000000000000000000..e8ffc4ee1849b7b041f4ce27fdf7cddc346b1cb1 --- /dev/null +++ b/eval-queue/Undi95/MLewd-v2.4-13B_eval_request_False_float16_Original.json @@ -0,0 +1 @@ +{"model": "Undi95/MLewd-v2.4-13B", "base_model": "", "revision": "main", "private": false, "precision": "float16", "weight_type": "Original", "status": "FINISHED", "submitted_time": "2023-11-06T10:31:15Z", "model_type": "\ud83d\udd36 : fine-tuned", "job_id": "632853", "license": "cc-by-nc-4.0", "likes": 8, "params": 12.852} \ No newline at end of file diff --git a/eval-queue/Undi95/MLewdBoros-L2-13B_eval_request_False_float16_Original.json b/eval-queue/Undi95/MLewdBoros-L2-13B_eval_request_False_float16_Original.json new file mode 100644 index 0000000000000000000000000000000000000000..5bafd75171b0612498f5c736ab2607ef5faf7eab --- /dev/null +++ b/eval-queue/Undi95/MLewdBoros-L2-13B_eval_request_False_float16_Original.json @@ -0,0 +1 @@ +{"model": "Undi95/MLewdBoros-L2-13B", "base_model": "", "revision": "main", "private": false, "precision": "float16", "weight_type": "Original", "status": "FINISHED", "submitted_time": "2023-10-16T13:19:55Z", "model_type": "\ud83d\udd36 : fine-tuned", "job_id": "522556", "license": "cc-by-nc-4.0", "likes": 11, "params": 13.016} \ No newline at end of file diff --git a/eval-queue/Undi95/MM-ReMM-L2-20B_eval_request_False_float16_Original.json b/eval-queue/Undi95/MM-ReMM-L2-20B_eval_request_False_float16_Original.json new file mode 100644 index 0000000000000000000000000000000000000000..d5c68df8ada01231c0b6ce81856fd2ff0d3753fa --- /dev/null +++ b/eval-queue/Undi95/MM-ReMM-L2-20B_eval_request_False_float16_Original.json @@ -0,0 +1 @@ +{"model": "Undi95/MM-ReMM-L2-20B", "base_model": "", "revision": "main", "private": false, "precision": "float16", "weight_type": "Original", "status": "FINISHED", "submitted_time": "2023-11-06T10:31:15Z", "model_type": "\ud83d\udd36 : fine-tuned", "job_id": "633458", "license": "cc-by-nc-4.0", "likes": 1, "params": 19.994} \ No newline at end of file diff --git a/eval-queue/Undi95/MXLewd-L2-20B_eval_request_False_float16_Original.json b/eval-queue/Undi95/MXLewd-L2-20B_eval_request_False_float16_Original.json new file mode 100644 index 0000000000000000000000000000000000000000..55510f6ab22bb791c38a7473d1f5d64c4ae67a19 --- /dev/null +++ 
b/eval-queue/Undi95/MXLewd-L2-20B_eval_request_False_float16_Original.json @@ -0,0 +1 @@ +{"model": "Undi95/MXLewd-L2-20B", "base_model": "", "revision": "main", "private": false, "precision": "float16", "weight_type": "Original", "status": "FINISHED", "submitted_time": "2023-10-16T12:54:17Z", "model_type": "\ud83d\udd36 : fine-tuned", "job_id": "518557", "license": "cc-by-nc-4.0", "likes": 7, "params": 19.994} \ No newline at end of file diff --git a/eval-queue/Undi95/Mistral-11B-TestBench10_eval_request_False_bfloat16_Original.json b/eval-queue/Undi95/Mistral-11B-TestBench10_eval_request_False_bfloat16_Original.json new file mode 100644 index 0000000000000000000000000000000000000000..eb3d1b44f08551e0dacd2bd23a8a704e58837748 --- /dev/null +++ b/eval-queue/Undi95/Mistral-11B-TestBench10_eval_request_False_bfloat16_Original.json @@ -0,0 +1 @@ +{"model": "Undi95/Mistral-11B-TestBench10", "base_model": "", "revision": "main", "private": false, "precision": "bfloat16", "weight_type": "Original", "status": "FINISHED", "submitted_time": "2023-10-16T12:58:30Z", "model_type": "\ud83d\udd36 : fine-tuned", "job_id": "522166", "params": 11.0, "license": "?", "likes": 0} \ No newline at end of file diff --git a/eval-queue/Undi95/Mistral-11B-TestBench11_eval_request_False_bfloat16_Original.json b/eval-queue/Undi95/Mistral-11B-TestBench11_eval_request_False_bfloat16_Original.json new file mode 100644 index 0000000000000000000000000000000000000000..25af133e616231dfdf0325bac6d67dc2f7dc7b22 --- /dev/null +++ b/eval-queue/Undi95/Mistral-11B-TestBench11_eval_request_False_bfloat16_Original.json @@ -0,0 +1 @@ +{"model": "Undi95/Mistral-11B-TestBench11", "base_model": "", "revision": "main", "private": false, "precision": "bfloat16", "weight_type": "Original", "status": "FINISHED", "submitted_time": "2023-10-16T12:54:17Z", "model_type": "\ud83d\udd36 : fine-tuned", "job_id": "521980", "license": "cc-by-nc-4.0", "likes": 7, "params": 10.732} \ No newline at end of file diff --git a/eval-queue/Undi95/Mistral-11B-TestBench3_eval_request_False_float16_Original.json b/eval-queue/Undi95/Mistral-11B-TestBench3_eval_request_False_float16_Original.json new file mode 100644 index 0000000000000000000000000000000000000000..159ee33911b4cea2b779b440e449ffebb67b6271 --- /dev/null +++ b/eval-queue/Undi95/Mistral-11B-TestBench3_eval_request_False_float16_Original.json @@ -0,0 +1 @@ +{"model": "Undi95/Mistral-11B-TestBench3", "base_model": "", "revision": "main", "private": false, "precision": "float16", "weight_type": "Original", "status": "FINISHED", "submitted_time": "2023-10-16T12:46:18Z", "model_type": "\ud83d\udd36 : fine-tuned", "job_id": "514986", "params": 11.0, "license": "?", "likes": 0} \ No newline at end of file diff --git a/eval-queue/Undi95/Mistral-11B-TestBench7_eval_request_False_bfloat16_Original.json b/eval-queue/Undi95/Mistral-11B-TestBench7_eval_request_False_bfloat16_Original.json new file mode 100644 index 0000000000000000000000000000000000000000..458e2fa7542fae967d2c2601934da4e1db10e112 --- /dev/null +++ b/eval-queue/Undi95/Mistral-11B-TestBench7_eval_request_False_bfloat16_Original.json @@ -0,0 +1 @@ +{"model": "Undi95/Mistral-11B-TestBench7", "base_model": "", "revision": "main", "private": false, "precision": "bfloat16", "weight_type": "Original", "status": "FINISHED", "submitted_time": "2023-10-16T12:46:18Z", "model_type": "\ud83d\udd36 : fine-tuned", "job_id": "515401", "params": 11.0, "license": "?", "likes": 0} \ No newline at end of file diff --git 
a/eval-queue/Undi95/Mistral-11B-TestBench9_eval_request_False_bfloat16_Original.json b/eval-queue/Undi95/Mistral-11B-TestBench9_eval_request_False_bfloat16_Original.json new file mode 100644 index 0000000000000000000000000000000000000000..bc021f4d575b1ca3b98a584fccfa453be172b358 --- /dev/null +++ b/eval-queue/Undi95/Mistral-11B-TestBench9_eval_request_False_bfloat16_Original.json @@ -0,0 +1 @@ +{"model": "Undi95/Mistral-11B-TestBench9", "base_model": "", "revision": "main", "private": false, "precision": "bfloat16", "weight_type": "Original", "status": "FINISHED", "submitted_time": "2023-11-06T10:31:15Z", "model_type": "\ud83d\udd36 : fine-tuned", "job_id": "633377", "license": "cc-by-nc-4.0", "likes": 0, "params": 10.732} \ No newline at end of file diff --git a/eval-queue/Undi95/Mistral-11B-v0.1_eval_request_False_float16_Original.json b/eval-queue/Undi95/Mistral-11B-v0.1_eval_request_False_float16_Original.json new file mode 100644 index 0000000000000000000000000000000000000000..41af535ede7e3c8ede92574ab59f33515c557c3b --- /dev/null +++ b/eval-queue/Undi95/Mistral-11B-v0.1_eval_request_False_float16_Original.json @@ -0,0 +1 @@ +{"model": "Undi95/Mistral-11B-v0.1", "base_model": "", "revision": "main", "private": false, "precision": "float16", "weight_type": "Original", "status": "FAILED", "submitted_time": "2023-10-14T14:11:12Z", "model_type": "\ud83d\udd36 : fine-tuned", "likes": 0, "params": 10.732, "license": "apache-2.0", "job_id": "643162"} \ No newline at end of file diff --git a/eval-queue/Undi95/Mixtral-4x7B-DPO-RPChat_eval_request_False_bfloat16_Original.json b/eval-queue/Undi95/Mixtral-4x7B-DPO-RPChat_eval_request_False_bfloat16_Original.json new file mode 100644 index 0000000000000000000000000000000000000000..9b8475f1fecee4fa0eb7c415183dd2c49ffc8c9e --- /dev/null +++ b/eval-queue/Undi95/Mixtral-4x7B-DPO-RPChat_eval_request_False_bfloat16_Original.json @@ -0,0 +1,16 @@ +{ + "model": "Undi95/Mixtral-4x7B-DPO-RPChat", + "base_model": "", + "revision": "main", + "private": false, + "precision": "bfloat16", + "weight_type": "Original", + "status": "FAILED", + "submitted_time": "2023-12-15T00:20:15Z", + "model_type": "\ud83d\udd36 : fine-tuned", + "likes": 1, + "params": 24.154, + "license": "cc-by-nc-4.0", + "job_id": "924672", + "job_start_time": "2023-12-16T13:08:45.579669" +} \ No newline at end of file diff --git a/eval-queue/Undi95/Mixtral-8x7B-MoE-RP-Story_eval_request_False_bfloat16_Original.json b/eval-queue/Undi95/Mixtral-8x7B-MoE-RP-Story_eval_request_False_bfloat16_Original.json new file mode 100644 index 0000000000000000000000000000000000000000..6e8de3e1742a35ec571f898b589e4830348a4db4 --- /dev/null +++ b/eval-queue/Undi95/Mixtral-8x7B-MoE-RP-Story_eval_request_False_bfloat16_Original.json @@ -0,0 +1,16 @@ +{ + "model": "Undi95/Mixtral-8x7B-MoE-RP-Story", + "base_model": "", + "revision": "main", + "private": false, + "precision": "bfloat16", + "weight_type": "Original", + "status": "FINISHED", + "submitted_time": "2023-12-15T20:42:12Z", + "model_type": "\ud83d\udd36 : fine-tuned", + "likes": 11, + "params": 46.703, + "license": "cc-by-nc-4.0", + "job_id": "924734", + "job_start_time": "2023-12-16T14:22:42.484573" +} \ No newline at end of file diff --git a/eval-queue/Undi95/Nete-13B_eval_request_False_float16_Original.json b/eval-queue/Undi95/Nete-13B_eval_request_False_float16_Original.json new file mode 100644 index 0000000000000000000000000000000000000000..879f7de29be2fc827605f807312714f7cf02dcf1 --- /dev/null +++ 
b/eval-queue/Undi95/Nete-13B_eval_request_False_float16_Original.json @@ -0,0 +1 @@ +{"model": "Undi95/Nete-13B", "base_model": "", "revision": "main", "private": false, "precision": "float16", "weight_type": "Original", "status": "FAILED", "submitted_time": "2023-11-14T14:57:37Z", "model_type": "\ud83d\udd36 : fine-tuned", "likes": 3, "params": 13.0, "license": "cc-by-nc-4.0", "job_id": "651441"} \ No newline at end of file diff --git a/eval-queue/Undi95/Nethena-MLewd-Xwin-23B_eval_request_False_float16_Original.json b/eval-queue/Undi95/Nethena-MLewd-Xwin-23B_eval_request_False_float16_Original.json new file mode 100644 index 0000000000000000000000000000000000000000..6a0e83aa2a34b039ca57be2762a01fc1372cd39b --- /dev/null +++ b/eval-queue/Undi95/Nethena-MLewd-Xwin-23B_eval_request_False_float16_Original.json @@ -0,0 +1,16 @@ +{ + "model": "Undi95/Nethena-MLewd-Xwin-23B", + "base_model": "", + "revision": "main", + "private": false, + "precision": "float16", + "weight_type": "Original", + "status": "FAILED", + "submitted_time": "2023-12-07T19:14:53Z", + "model_type": "\ud83d\udd36 : fine-tuned", + "likes": 9, + "params": 22.849, + "license": "cc-by-nc-4.0", + "job_id": "874872", + "job_start_time": "2023-12-09T13:57:48.474602" +} \ No newline at end of file diff --git a/eval-queue/Undi95/Nous-Hermes-13B-Code_eval_request_False_float16_Original.json b/eval-queue/Undi95/Nous-Hermes-13B-Code_eval_request_False_float16_Original.json new file mode 100644 index 0000000000000000000000000000000000000000..0ea55b1518e8b521506261ceec082e0b2fd136c8 --- /dev/null +++ b/eval-queue/Undi95/Nous-Hermes-13B-Code_eval_request_False_float16_Original.json @@ -0,0 +1 @@ +{"model": "Undi95/Nous-Hermes-13B-Code", "base_model": "", "revision": "main", "private": false, "precision": "float16", "weight_type": "Original", "status": "FINISHED", "submitted_time": "2023-09-09T10:52:17Z", "model_type": "\ud83d\udd36 : fine-tuned", "job_id": "503063", "license": "cc-by-nc-4.0", "likes": 3, "params": 12.852} \ No newline at end of file diff --git a/eval-queue/Undi95/OpenRP-13B_eval_request_False_float16_Original.json b/eval-queue/Undi95/OpenRP-13B_eval_request_False_float16_Original.json new file mode 100644 index 0000000000000000000000000000000000000000..c91caab9bc3a9a7d7e8a761e55ebb8f35c340d47 --- /dev/null +++ b/eval-queue/Undi95/OpenRP-13B_eval_request_False_float16_Original.json @@ -0,0 +1 @@ +{"model": "Undi95/OpenRP-13B", "base_model": "", "revision": "main", "private": false, "precision": "float16", "weight_type": "Original", "status": "FINISHED", "submitted_time": "2023-10-16T13:19:55Z", "model_type": "\ud83d\udd36 : fine-tuned", "job_id": "522603", "license": "cc-by-nc-4.0", "likes": 2, "params": 12.852} \ No newline at end of file diff --git a/eval-queue/Undi95/ReMM-L2-13B-PIPPA_eval_request_False_float16_Original.json b/eval-queue/Undi95/ReMM-L2-13B-PIPPA_eval_request_False_float16_Original.json new file mode 100644 index 0000000000000000000000000000000000000000..8eba6e70a5553497b64e162498e95aeb6a45727e --- /dev/null +++ b/eval-queue/Undi95/ReMM-L2-13B-PIPPA_eval_request_False_float16_Original.json @@ -0,0 +1 @@ +{"model": "Undi95/ReMM-L2-13B-PIPPA", "base_model": "", "revision": "main", "private": false, "precision": "float16", "weight_type": "Original", "status": "FINISHED", "submitted_time": "2023-10-16T12:46:18Z", "model_type": "\ud83d\udd36 : fine-tuned", "job_id": "498617", "license": "cc-by-nc-4.0", "likes": 1, "params": 12.852} \ No newline at end of file diff --git 
a/eval-queue/Undi95/ReMM-L2-13B_eval_request_False_float16_Original.json b/eval-queue/Undi95/ReMM-L2-13B_eval_request_False_float16_Original.json new file mode 100644 index 0000000000000000000000000000000000000000..45c956ab5d54c6003c96067371f03b844e6f7a4f --- /dev/null +++ b/eval-queue/Undi95/ReMM-L2-13B_eval_request_False_float16_Original.json @@ -0,0 +1 @@ +{"model": "Undi95/ReMM-L2-13B", "base_model": "", "revision": "main", "private": false, "precision": "float16", "weight_type": "Original", "status": "FINISHED", "submitted_time": "2023-09-09T10:52:17Z", "model_type": "\ud83d\udd36 : fine-tuned", "job_id": "502259", "license": "cc-by-nc-4.0", "likes": 2, "params": 13.016} \ No newline at end of file diff --git a/eval-queue/Undi95/ReMM-Mistral-13B_eval_request_False_float16_Original.json b/eval-queue/Undi95/ReMM-Mistral-13B_eval_request_False_float16_Original.json new file mode 100644 index 0000000000000000000000000000000000000000..890423ce79543f7e0f3e01964d73c4a0e6dedda3 --- /dev/null +++ b/eval-queue/Undi95/ReMM-Mistral-13B_eval_request_False_float16_Original.json @@ -0,0 +1 @@ +{"model": "Undi95/ReMM-Mistral-13B", "base_model": "", "revision": "main", "private": false, "precision": "float16", "weight_type": "Original", "status": "FINISHED", "submitted_time": "2023-10-16T12:54:17Z", "model_type": "\ud83d\udd36 : fine-tuned", "job_id": "520695", "license": "cc-by-nc-4.0", "likes": 4, "params": 12.852} \ No newline at end of file diff --git a/eval-queue/Undi95/ReMM-SLERP-L2-13B_eval_request_False_float16_Original.json b/eval-queue/Undi95/ReMM-SLERP-L2-13B_eval_request_False_float16_Original.json new file mode 100644 index 0000000000000000000000000000000000000000..6cd419c8cd1c60a701ffc3e6c2da19a358c01eb8 --- /dev/null +++ b/eval-queue/Undi95/ReMM-SLERP-L2-13B_eval_request_False_float16_Original.json @@ -0,0 +1 @@ +{"model": "Undi95/ReMM-SLERP-L2-13B", "base_model": "", "revision": "main", "private": false, "precision": "float16", "weight_type": "Original", "status": "FINISHED", "submitted_time": "2023-10-16T13:19:55Z", "model_type": "\ud83d\udd36 : fine-tuned", "job_id": "522763", "license": "cc-by-nc-4.0", "likes": 8, "params": 12.852} \ No newline at end of file diff --git a/eval-queue/Undi95/ReMM-v2-L2-13B_eval_request_False_float16_Original.json b/eval-queue/Undi95/ReMM-v2-L2-13B_eval_request_False_float16_Original.json new file mode 100644 index 0000000000000000000000000000000000000000..9aab58bd661dac1c7be9dc128acd70e86e46fc55 --- /dev/null +++ b/eval-queue/Undi95/ReMM-v2-L2-13B_eval_request_False_float16_Original.json @@ -0,0 +1 @@ +{"model": "Undi95/ReMM-v2-L2-13B", "base_model": "", "revision": "main", "private": false, "precision": "float16", "weight_type": "Original", "status": "FINISHED", "submitted_time": "2023-10-16T12:46:18Z", "model_type": "\ud83d\udd36 : fine-tuned", "job_id": "515468", "license": "cc-by-nc-4.0", "likes": 2, "params": 13.016} \ No newline at end of file diff --git a/eval-queue/Undi95/ReMM-v2.1-L2-13B_eval_request_False_float16_Original.json b/eval-queue/Undi95/ReMM-v2.1-L2-13B_eval_request_False_float16_Original.json new file mode 100644 index 0000000000000000000000000000000000000000..9381c35bf72f4a62896e90c4b9f9147f20a998aa --- /dev/null +++ b/eval-queue/Undi95/ReMM-v2.1-L2-13B_eval_request_False_float16_Original.json @@ -0,0 +1 @@ +{"model": "Undi95/ReMM-v2.1-L2-13B", "base_model": "", "revision": "main", "private": false, "precision": "float16", "weight_type": "Original", "status": "FINISHED", "submitted_time": "2023-10-16T12:54:17Z", "model_type": 
"\ud83d\udd36 : fine-tuned", "job_id": "520981", "license": "cc-by-nc-4.0", "likes": 1, "params": 13.016} \ No newline at end of file diff --git a/eval-queue/Undi95/ReMM-v2.2-L2-13B_eval_request_False_float16_Original.json b/eval-queue/Undi95/ReMM-v2.2-L2-13B_eval_request_False_float16_Original.json new file mode 100644 index 0000000000000000000000000000000000000000..591f3d9b304faf7586b1bf32a5425b20fc81ded7 --- /dev/null +++ b/eval-queue/Undi95/ReMM-v2.2-L2-13B_eval_request_False_float16_Original.json @@ -0,0 +1 @@ +{"model": "Undi95/ReMM-v2.2-L2-13B", "base_model": "", "revision": "main", "private": false, "precision": "float16", "weight_type": "Original", "status": "FINISHED", "submitted_time": "2023-10-16T12:46:18Z", "model_type": "\ud83d\udd36 : fine-tuned", "job_id": "514667", "license": "cc-by-nc-4.0", "likes": 1, "params": 13.016} \ No newline at end of file diff --git a/eval-queue/Undi95/ReasoningEngine_eval_request_False_float16_Original.json b/eval-queue/Undi95/ReasoningEngine_eval_request_False_float16_Original.json new file mode 100644 index 0000000000000000000000000000000000000000..9fb7366efe443704682053a2ae1e9d0f1085ea58 --- /dev/null +++ b/eval-queue/Undi95/ReasoningEngine_eval_request_False_float16_Original.json @@ -0,0 +1 @@ +{"model": "Undi95/ReasoningEngine", "base_model": "", "revision": "main", "private": false, "precision": "float16", "weight_type": "Original", "status": "FAILED", "submitted_time": "2023-09-05T15:14:14Z", "model_type": "\ud83d\udd36 : fine-tuned", "job_id": "434055", "license": "cc-by-nc-4.0", "likes": 0, "params": 12.852} \ No newline at end of file diff --git a/eval-queue/Undi95/Toppy-M-7B_eval_request_False_bfloat16_Original.json b/eval-queue/Undi95/Toppy-M-7B_eval_request_False_bfloat16_Original.json new file mode 100644 index 0000000000000000000000000000000000000000..0e837add6a5b509b367c0e4f40919561f6314f3a --- /dev/null +++ b/eval-queue/Undi95/Toppy-M-7B_eval_request_False_bfloat16_Original.json @@ -0,0 +1 @@ +{"model": "Undi95/Toppy-M-7B", "base_model": "", "revision": "main", "private": false, "precision": "bfloat16", "weight_type": "Original", "status": "FAILED", "submitted_time": "2023-11-14T14:53:25Z", "model_type": "\ud83d\udd36 : fine-tuned", "likes": 8, "params": 7.242, "license": "cc-by-nc-4.0", "job_id": "651431"} \ No newline at end of file diff --git a/eval-queue/Undi95/Toppy-M-7B_eval_request_False_float16_Original.json b/eval-queue/Undi95/Toppy-M-7B_eval_request_False_float16_Original.json new file mode 100644 index 0000000000000000000000000000000000000000..b19f371b2f9319a676870c89e5ff0971154c714a --- /dev/null +++ b/eval-queue/Undi95/Toppy-M-7B_eval_request_False_float16_Original.json @@ -0,0 +1,16 @@ +{ + "model": "Undi95/Toppy-M-7B", + "base_model": "", + "revision": "main", + "private": false, + "precision": "float16", + "weight_type": "Original", + "status": "FAILED", + "submitted_time": "2023-11-25T13:54:44Z", + "model_type": "\ud83d\udd36 : fine-tuned", + "likes": 13, + "params": 7.242, + "license": "cc-by-nc-4.0", + "job_id": "800155", + "job_start_time": "2023-11-27T13:59:51.089527" +} \ No newline at end of file diff --git a/eval-queue/Undi95/U-Amethyst-20B_eval_request_False_float16_Original.json b/eval-queue/Undi95/U-Amethyst-20B_eval_request_False_float16_Original.json new file mode 100644 index 0000000000000000000000000000000000000000..414a38173577605d66985c1272bf8453fd91d3f2 --- /dev/null +++ b/eval-queue/Undi95/U-Amethyst-20B_eval_request_False_float16_Original.json @@ -0,0 +1 @@ +{"model": "Undi95/U-Amethyst-20B", 
"base_model": "", "revision": "main", "private": false, "precision": "float16", "weight_type": "Original", "status": "FINISHED", "submitted_time": "2023-11-06T10:31:15Z", "model_type": "\ud83d\udd36 : fine-tuned", "job_id": "633536", "license": "cc-by-nc-4.0", "likes": 7, "params": 19.994} \ No newline at end of file diff --git a/eval-queue/Undi95/UndiMix-v1-13b_eval_request_False_float16_Original.json b/eval-queue/Undi95/UndiMix-v1-13b_eval_request_False_float16_Original.json new file mode 100644 index 0000000000000000000000000000000000000000..28839a7dd6c785ebe680a043b8160205e4c008b4 --- /dev/null +++ b/eval-queue/Undi95/UndiMix-v1-13b_eval_request_False_float16_Original.json @@ -0,0 +1 @@ +{"model": "Undi95/UndiMix-v1-13b", "base_model": "", "revision": "main", "private": false, "precision": "float16", "weight_type": "Original", "status": "FINISHED", "submitted_time": "2023-09-09T10:52:17Z", "model_type": "\ud83d\udd36 : fine-tuned", "job_id": "499530", "license": "cc-by-nc-4.0", "likes": 0, "params": 13.016} \ No newline at end of file diff --git a/eval-queue/Undi95/UndiMix-v4-13B_eval_request_False_float16_Original.json b/eval-queue/Undi95/UndiMix-v4-13B_eval_request_False_float16_Original.json new file mode 100644 index 0000000000000000000000000000000000000000..2a0e5ae82cda321dea963c2f65850a4320e4e224 --- /dev/null +++ b/eval-queue/Undi95/UndiMix-v4-13B_eval_request_False_float16_Original.json @@ -0,0 +1 @@ +{"model": "Undi95/UndiMix-v4-13B", "base_model": "", "revision": "main", "private": false, "precision": "float16", "weight_type": "Original", "status": "FINISHED", "submitted_time": "2023-10-16T12:54:17Z", "model_type": "\ud83d\udd36 : fine-tuned", "job_id": "518811", "license": "cc-by-nc-4.0", "likes": 3, "params": 13.016} \ No newline at end of file diff --git a/eval-queue/Undi95/Unholy-v1-12L-13B_eval_request_False_float16_Original.json b/eval-queue/Undi95/Unholy-v1-12L-13B_eval_request_False_float16_Original.json new file mode 100644 index 0000000000000000000000000000000000000000..219ea5117d673cc5b934a6c8030525e6f6117633 --- /dev/null +++ b/eval-queue/Undi95/Unholy-v1-12L-13B_eval_request_False_float16_Original.json @@ -0,0 +1 @@ +{"model": "Undi95/Unholy-v1-12L-13B", "base_model": "", "revision": "main", "private": false, "precision": "float16", "weight_type": "Original", "status": "FINISHED", "submitted_time": "2023-10-16T13:27:38Z", "model_type": "\ud83d\udd36 : fine-tuned", "job_id": "523039", "license": "cc-by-nc-4.0", "likes": 25, "params": 13.016} \ No newline at end of file diff --git a/eval-queue/Undi95/Utopia-13B_eval_request_False_float16_Original.json b/eval-queue/Undi95/Utopia-13B_eval_request_False_float16_Original.json new file mode 100644 index 0000000000000000000000000000000000000000..d01fbcd95e5e9dfee62b33acc034e323dd4b2bd7 --- /dev/null +++ b/eval-queue/Undi95/Utopia-13B_eval_request_False_float16_Original.json @@ -0,0 +1 @@ +{"model": "Undi95/Utopia-13B", "base_model": "", "revision": "main", "private": false, "precision": "float16", "weight_type": "Original", "status": "FAILED", "submitted_time": "2023-11-14T14:55:55Z", "model_type": "\ud83d\udd36 : fine-tuned", "likes": 6, "params": 13.016, "license": "cc-by-nc-4.0", "job_id": "651437"} \ No newline at end of file diff --git a/eval-queue/Undi95/UtopiaXL-13B_eval_request_False_float16_Original.json b/eval-queue/Undi95/UtopiaXL-13B_eval_request_False_float16_Original.json new file mode 100644 index 0000000000000000000000000000000000000000..451a187a83db4ad084fc3c612f16a63b0dcebe0f --- /dev/null +++ 
b/eval-queue/Undi95/UtopiaXL-13B_eval_request_False_float16_Original.json @@ -0,0 +1 @@ +{"model": "Undi95/UtopiaXL-13B", "base_model": "", "revision": "main", "private": false, "precision": "float16", "weight_type": "Original", "status": "FAILED", "submitted_time": "2023-11-06T10:58:03Z", "model_type": "\ud83d\udd36 : fine-tuned", "likes": 7, "params": 13.016, "license": "cc-by-nc-4.0", "job_id": "650235"} \ No newline at end of file diff --git a/eval-queue/Undi95/X-MythoChronos-13B_eval_request_False_float16_Original.json b/eval-queue/Undi95/X-MythoChronos-13B_eval_request_False_float16_Original.json new file mode 100644 index 0000000000000000000000000000000000000000..99e1e72a19393f607c9b489a5cd9090ba2cdc600 --- /dev/null +++ b/eval-queue/Undi95/X-MythoChronos-13B_eval_request_False_float16_Original.json @@ -0,0 +1,16 @@ +{ + "model": "Undi95/X-MythoChronos-13B", + "base_model": "llama", + "revision": "main", + "private": false, + "precision": "float16", + "weight_type": "Original", + "status": "FINISHED", + "submitted_time": "2023-12-06T17:52:18Z", + "model_type": "\ud83d\udfe2 : pretrained", + "likes": 10, + "params": 13.016, + "license": "cc-by-nc-4.0", + "job_id": "874803", + "job_start_time": "2023-12-09T13:10:38.375016" +} \ No newline at end of file diff --git a/eval-queue/Undi95/Xwin-MLewd-7B-V0.2_eval_request_False_float16_Original.json b/eval-queue/Undi95/Xwin-MLewd-7B-V0.2_eval_request_False_float16_Original.json new file mode 100644 index 0000000000000000000000000000000000000000..58ead7f06074e0fd2bde67d0d89c07dcfb82196f --- /dev/null +++ b/eval-queue/Undi95/Xwin-MLewd-7B-V0.2_eval_request_False_float16_Original.json @@ -0,0 +1 @@ +{"model": "Undi95/Xwin-MLewd-7B-V0.2", "base_model": "", "revision": "main", "private": false, "precision": "float16", "weight_type": "Original", "status": "FAILED", "submitted_time": "2023-11-12T05:06:13Z", "model_type": "\ud83d\udd36 : fine-tuned", "likes": 7, "params": 7.0, "license": "cc-by-nc-4.0", "job_id": "650407"} \ No newline at end of file diff --git a/eval-queue/Undi95/llama2-to-mistral-diff_eval_request_False_bfloat16_Adapter.json b/eval-queue/Undi95/llama2-to-mistral-diff_eval_request_False_bfloat16_Adapter.json new file mode 100644 index 0000000000000000000000000000000000000000..94df90ab133b20e0c0473f69e2a3702c24977cf8 --- /dev/null +++ b/eval-queue/Undi95/llama2-to-mistral-diff_eval_request_False_bfloat16_Adapter.json @@ -0,0 +1 @@ +{"model": "undi95/llama2-to-mistral-diff", "base_model": "NousResearch/Llama-2-7b-hf", "revision": "main", "private": false, "precision": "bfloat16", "weight_type": "Adapter", "status": "FINISHED", "submitted_time": "2023-10-16T12:48:18Z", "model_type": "\ud83d\udd36 : fine-tuned", "job_id": "516571", "license": "apache-2.0", "likes": 10, "params": 0} \ No newline at end of file diff --git a/eval-queue/Vezora/Mistral-14b-Merge-Base_eval_request_False_float16_Original.json b/eval-queue/Vezora/Mistral-14b-Merge-Base_eval_request_False_float16_Original.json new file mode 100644 index 0000000000000000000000000000000000000000..205b68cf89404ea15c907e1c476d7cfec0480c18 --- /dev/null +++ b/eval-queue/Vezora/Mistral-14b-Merge-Base_eval_request_False_float16_Original.json @@ -0,0 +1,16 @@ +{ + "model": "Vezora/Mistral-14b-Merge-Base", + "base_model": "", + "revision": "main", + "private": false, + "precision": "float16", + "weight_type": "Original", + "status": "FAILED", + "submitted_time": "2023-12-12T06:42:22Z", + "model_type": "\u2b55 : instruction-tuned", + "likes": 2, + "params": 14.221, + "license": 
"apache-2.0", + "job_id": "899789", + "job_start_time": "2023-12-12T06:44:27.840213" +} \ No newline at end of file diff --git a/eval-queue/Vezora/Mistral-Narwhal-7b_eval_request_False_bfloat16_Original.json b/eval-queue/Vezora/Mistral-Narwhal-7b_eval_request_False_bfloat16_Original.json new file mode 100644 index 0000000000000000000000000000000000000000..09c3965d12aabe9c2307cbec91332afa0620c6ac --- /dev/null +++ b/eval-queue/Vezora/Mistral-Narwhal-7b_eval_request_False_bfloat16_Original.json @@ -0,0 +1 @@ +{"model": "Vezora/Mistral-Narwhal-7b", "base_model": "model-00001-of-00002.safetensors\nmodel-00002-of-00002.safetensors\n(Safe-Tensor format is in the folder called Safe-Tensor-Version)", "revision": "main", "private": false, "precision": "bfloat16", "weight_type": "Original", "status": "FAILED", "submitted_time": "2023-10-12T01:22:27Z", "model_type": "\ud83d\udd36 : fine-tuned", "job_id": "493322", "license": "apache-2.0", "likes": 0, "params": 7.111} \ No newline at end of file diff --git a/eval-queue/Vezora/Narwhal-7b-v3_eval_request_False_bfloat16_Original.json b/eval-queue/Vezora/Narwhal-7b-v3_eval_request_False_bfloat16_Original.json new file mode 100644 index 0000000000000000000000000000000000000000..68fa0c390313a92f0fa98403e933ad8d861d8821 --- /dev/null +++ b/eval-queue/Vezora/Narwhal-7b-v3_eval_request_False_bfloat16_Original.json @@ -0,0 +1,16 @@ +{ + "model": "Vezora/Narwhal-7b-v3", + "base_model": "Mistral 7b", + "revision": "main", + "private": false, + "precision": "bfloat16", + "weight_type": "Original", + "status": "FAILED", + "submitted_time": "2023-12-05T03:22:33Z", + "model_type": "\ud83d\udd36 : fine-tuned", + "likes": 0, + "params": 7.242, + "license": "apache-2.0", + "job_id": "858001", + "job_start_time": "2023-12-06T16:00:38.667785" +} \ No newline at end of file diff --git a/eval-queue/WhoTookMyAmogusNickname/NewHope_HF_not_official_eval_request_False_float16_Original.json b/eval-queue/WhoTookMyAmogusNickname/NewHope_HF_not_official_eval_request_False_float16_Original.json new file mode 100644 index 0000000000000000000000000000000000000000..51ad85a82c3bdcdc67a571035d9acf84d568ca55 --- /dev/null +++ b/eval-queue/WhoTookMyAmogusNickname/NewHope_HF_not_official_eval_request_False_float16_Original.json @@ -0,0 +1 @@ +{"model": "WhoTookMyAmogusNickname/NewHope_HF_not_official", "base_model": "petra-99B", "revision": "main", "private": false, "precision": "float16", "weight_type": "Original", "status": "FINISHED", "submitted_time": "2023-09-09T10:52:17Z", "model_type": "\ud83d\udd36 : fine-tuned", "job_id": "460925", "license": "llama2", "likes": 0, "params": 12.852} \ No newline at end of file diff --git a/eval-queue/Yehoon/yehoon_llama2_eval_request_False_float16_Adapter.json b/eval-queue/Yehoon/yehoon_llama2_eval_request_False_float16_Adapter.json new file mode 100644 index 0000000000000000000000000000000000000000..b39938edb2bdcacf2f9ac1aac0e67deecacb0bd4 --- /dev/null +++ b/eval-queue/Yehoon/yehoon_llama2_eval_request_False_float16_Adapter.json @@ -0,0 +1 @@ +{"model": "Yehoon/yehoon_llama2", "base_model": "circulus/Llama-2-7b-orca-v1", "revision": "main", "private": false, "precision": "float16", "weight_type": "Adapter", "status": "FINISHED", "submitted_time": "2023-10-16T12:48:18Z", "model_type": "\u2b55 : instruction-tuned", "job_id": "517275", "license": "?", "likes": 0, "params": 0} \ No newline at end of file diff --git a/eval-queue/aisquared/chopt-1_3b_eval_request_False_False_False.json 
b/eval-queue/aisquared/chopt-1_3b_eval_request_False_False_False.json new file mode 100644 index 0000000000000000000000000000000000000000..043bf22b62b8d80dcd42bfc4befc302793599ee5 --- /dev/null +++ b/eval-queue/aisquared/chopt-1_3b_eval_request_False_False_False.json @@ -0,0 +1 @@ +{"model": "aisquared/chopt-1_3b", "base_model": "", "revision": "main", "private": false, "status": "FINISHED", "job_id": "517421", "weight_type": "Original", "precision": "float16", "model_type": "fine-tuned", "submitted_time": "2023-10-16T12:48:18Z", "license": "other", "likes": 0, "params": 1.316} \ No newline at end of file diff --git a/eval-queue/aisquared/chopt-2_7b_eval_request_False_False_False.json b/eval-queue/aisquared/chopt-2_7b_eval_request_False_False_False.json new file mode 100644 index 0000000000000000000000000000000000000000..761b8131c3c448a285cead40ddcc1e5f49bc9286 --- /dev/null +++ b/eval-queue/aisquared/chopt-2_7b_eval_request_False_False_False.json @@ -0,0 +1 @@ +{"model": "aisquared/chopt-2_7b", "base_model": "", "revision": "main", "private": false, "status": "FINISHED", "job_id": "494935", "weight_type": "Original", "precision": "float16", "model_type": "fine-tuned", "submitted_time": "2023-09-09T10:52:17Z", "license": "other", "likes": 0, "params": 2.652} \ No newline at end of file diff --git a/eval-queue/aisquared/dlite-v1-124m_eval_request_False_False_False.json b/eval-queue/aisquared/dlite-v1-124m_eval_request_False_False_False.json new file mode 100644 index 0000000000000000000000000000000000000000..97ad990f5393d18d0c073a8da01f5d062866055a --- /dev/null +++ b/eval-queue/aisquared/dlite-v1-124m_eval_request_False_False_False.json @@ -0,0 +1 @@ +{"model": "aisquared/dlite-v1-124m", "base_model": "", "revision": "main", "private": false, "status": "FINISHED", "job_id": "503256", "weight_type": "Original", "precision": "float16", "model_type": "fine-tuned", "submitted_time": "2023-09-09T10:52:17Z", "license": "apache-2.0", "likes": 0, "params": 0.124} \ No newline at end of file diff --git a/eval-queue/aisquared/dlite-v1-1_5b_eval_request_False_False_False.json b/eval-queue/aisquared/dlite-v1-1_5b_eval_request_False_False_False.json new file mode 100644 index 0000000000000000000000000000000000000000..a36eada726ceb43a7dd376a602e95950252fd95e --- /dev/null +++ b/eval-queue/aisquared/dlite-v1-1_5b_eval_request_False_False_False.json @@ -0,0 +1 @@ +{"model": "aisquared/dlite-v1-1_5b", "base_model": "", "revision": "main", "private": false, "status": "FINISHED", "job_id": "472867", "weight_type": "Original", "precision": "float16", "model_type": "fine-tuned", "submitted_time": "2023-09-09T10:52:17Z", "license": "apache-2.0", "likes": 1, "params": 1.558} \ No newline at end of file diff --git a/eval-queue/aisquared/dlite-v1-355m_eval_request_False_False_False.json b/eval-queue/aisquared/dlite-v1-355m_eval_request_False_False_False.json new file mode 100644 index 0000000000000000000000000000000000000000..51fd0fcf7f081b91860c4e033e576d9ef62fda62 --- /dev/null +++ b/eval-queue/aisquared/dlite-v1-355m_eval_request_False_False_False.json @@ -0,0 +1 @@ +{"model": "aisquared/dlite-v1-355m", "base_model": "", "revision": "main", "private": false, "status": "FINISHED", "job_id": "520963", "weight_type": "Original", "precision": "float16", "model_type": "fine-tuned", "submitted_time": "2023-10-16T12:54:17Z", "license": "apache-2.0", "likes": 1, "params": 0.355} \ No newline at end of file diff --git a/eval-queue/aisquared/dlite-v1-774m_eval_request_False_False_False.json 
b/eval-queue/aisquared/dlite-v1-774m_eval_request_False_False_False.json new file mode 100644 index 0000000000000000000000000000000000000000..9930e65865f15938b94095e05e8ce1c12b25b37e --- /dev/null +++ b/eval-queue/aisquared/dlite-v1-774m_eval_request_False_False_False.json @@ -0,0 +1 @@ +{"model": "aisquared/dlite-v1-774m", "base_model": "", "revision": "main", "private": false, "status": "FINISHED", "job_id": "504899", "weight_type": "Original", "precision": "float16", "model_type": "fine-tuned", "submitted_time": "2023-09-09T10:52:17Z", "license": "apache-2.0", "likes": 0, "params": 0.774} \ No newline at end of file diff --git a/eval-queue/aisquared/dlite-v2-124m_eval_request_False_False_False.json b/eval-queue/aisquared/dlite-v2-124m_eval_request_False_False_False.json new file mode 100644 index 0000000000000000000000000000000000000000..0d92e8bace30016f0ce3989a3a3ebc78ec444777 --- /dev/null +++ b/eval-queue/aisquared/dlite-v2-124m_eval_request_False_False_False.json @@ -0,0 +1 @@ +{"model": "aisquared/dlite-v2-124m", "base_model": "", "revision": "main", "private": false, "status": "FINISHED", "job_id": "520078", "weight_type": "Original", "precision": "float16", "model_type": "fine-tuned", "submitted_time": "2023-10-16T12:54:17Z", "license": "apache-2.0", "likes": 4, "params": 0.124} \ No newline at end of file diff --git a/eval-queue/aisquared/dlite-v2-1_5b_eval_request_False_False_False.json b/eval-queue/aisquared/dlite-v2-1_5b_eval_request_False_False_False.json new file mode 100644 index 0000000000000000000000000000000000000000..38c73112ec4f213eb35527f204dcbff42e0161ff --- /dev/null +++ b/eval-queue/aisquared/dlite-v2-1_5b_eval_request_False_False_False.json @@ -0,0 +1 @@ +{"model": "aisquared/dlite-v2-1_5b", "base_model": "", "revision": "main", "private": false, "status": "FINISHED", "job_id": "503201", "weight_type": "Original", "precision": "float16", "model_type": "fine-tuned", "submitted_time": "2023-09-09T10:52:17Z", "license": "apache-2.0", "likes": 10, "params": 1.558} \ No newline at end of file diff --git a/eval-queue/aisquared/dlite-v2-355m_eval_request_False_False_False.json b/eval-queue/aisquared/dlite-v2-355m_eval_request_False_False_False.json new file mode 100644 index 0000000000000000000000000000000000000000..64987ad6644ba723785f7346b9a758a06213b084 --- /dev/null +++ b/eval-queue/aisquared/dlite-v2-355m_eval_request_False_False_False.json @@ -0,0 +1 @@ +{"model": "aisquared/dlite-v2-355m", "base_model": "", "revision": "main", "private": false, "status": "FINISHED", "job_id": "498602", "weight_type": "Original", "precision": "float16", "model_type": "fine-tuned", "submitted_time": "2023-09-09T10:52:17Z", "license": "apache-2.0", "likes": 7, "params": 0.355} \ No newline at end of file diff --git a/eval-queue/aisquared/dlite-v2-774m_eval_request_False_False_False.json b/eval-queue/aisquared/dlite-v2-774m_eval_request_False_False_False.json new file mode 100644 index 0000000000000000000000000000000000000000..94bd5eb9b4bdc98e9407a58f5d409ab4d83a86f3 --- /dev/null +++ b/eval-queue/aisquared/dlite-v2-774m_eval_request_False_False_False.json @@ -0,0 +1 @@ +{"model": "aisquared/dlite-v2-774m", "base_model": "", "revision": "main", "private": false, "status": "FINISHED", "job_id": "495585", "weight_type": "Original", "precision": "float16", "model_type": "fine-tuned", "submitted_time": "2023-09-09T10:52:17Z", "license": "apache-2.0", "likes": 8, "params": 0.774} \ No newline at end of file diff --git 
a/eval-queue/akjindal53244/Arithmo-Mistral-7B_eval_request_False_bfloat16_Original.json b/eval-queue/akjindal53244/Arithmo-Mistral-7B_eval_request_False_bfloat16_Original.json new file mode 100644 index 0000000000000000000000000000000000000000..fdc47e9e2100915900ba84c8035259f2296b367f --- /dev/null +++ b/eval-queue/akjindal53244/Arithmo-Mistral-7B_eval_request_False_bfloat16_Original.json @@ -0,0 +1 @@ +{"model": "akjindal53244/Arithmo-Mistral-7B", "base_model": "", "revision": "main", "private": false, "precision": "bfloat16", "weight_type": "Original", "status": "FAILED", "submitted_time": "2023-10-15T17:13:30Z", "model_type": "\u2b55 : instruction-tuned", "likes": 0, "params": 7.0, "license": "apache-2.0", "job_id": "649070"} \ No newline at end of file diff --git a/eval-queue/akjindal53244/Mistral-7B-v0.1-Open-Platypus_eval_request_False_bfloat16_Original.json b/eval-queue/akjindal53244/Mistral-7B-v0.1-Open-Platypus_eval_request_False_bfloat16_Original.json new file mode 100644 index 0000000000000000000000000000000000000000..67f8cf6ea30d0486f932676a29a5027b7eda38ea --- /dev/null +++ b/eval-queue/akjindal53244/Mistral-7B-v0.1-Open-Platypus_eval_request_False_bfloat16_Original.json @@ -0,0 +1 @@ +{"model": "akjindal53244/Mistral-7B-v0.1-Open-Platypus", "base_model": "", "revision": "main", "private": false, "precision": "bfloat16", "weight_type": "Original", "status": "FINISHED", "submitted_time": "2023-10-16T12:48:18Z", "model_type": "\u2b55 : instruction-tuned", "job_id": "517434", "license": "apache-2.0", "likes": 2, "params": 7.111} \ No newline at end of file diff --git a/eval-queue/allstax/CodeExplainer-7b-v0.1_eval_request_False_8bit_Original.json b/eval-queue/allstax/CodeExplainer-7b-v0.1_eval_request_False_8bit_Original.json new file mode 100644 index 0000000000000000000000000000000000000000..5076daf2ce76cfa629dc54ea60d7973f20be64f5 --- /dev/null +++ b/eval-queue/allstax/CodeExplainer-7b-v0.1_eval_request_False_8bit_Original.json @@ -0,0 +1,16 @@
+{
+ "model": "allstax/CodeExplainer-7b-v0.1",
+ "base_model": "mistralai/Mistral-7B-v0.1",
+ "revision": "main",
+ "private": false,
+ "precision": "8bit",
+ "weight_type": "Original",
+ "status": "FAILED",
+ "submitted_time": "2023-12-16T14:50:14Z",
+ "model_type": "\ud83d\udd36 : fine-tuned",
+ "likes": 0,
+ "params": 7.243,
+ "license": "apache-2.0",
+ "job_id": "925792",
+ "job_start_time": "2023-12-16T15:21:33.943336"
+}
\ No newline at end of file
diff --git a/eval-queue/aloobun/open-llama-3b-v2-elmv3_eval_request_False_8bit_Original.json b/eval-queue/aloobun/open-llama-3b-v2-elmv3_eval_request_False_8bit_Original.json new file mode 100644 index 0000000000000000000000000000000000000000..419fcab62583de5a5c604bbe9a541a8d8454f331 --- /dev/null +++ b/eval-queue/aloobun/open-llama-3b-v2-elmv3_eval_request_False_8bit_Original.json @@ -0,0 +1,16 @@
+{
+ "model": "aloobun/open-llama-3b-v2-elmv3",
+ "base_model": "",
+ "revision": "main",
+ "private": false,
+ "precision": "8bit",
+ "weight_type": "Original",
+ "status": "FINISHED",
+ "submitted_time": "2023-12-08T11:08:54Z",
+ "model_type": "\ud83d\udd36 : fine-tuned",
+ "likes": 0,
+ "params": 3.426,
+ "license": "apache-2.0",
+ "job_id": "875007",
+ "job_start_time": "2023-12-09T15:57:40.067587"
+}
\ No newline at end of file
diff --git a/eval-queue/aloobun/open-llama-3b-v2-elmv3_eval_request_False_float16_Original.json b/eval-queue/aloobun/open-llama-3b-v2-elmv3_eval_request_False_float16_Original.json new file mode 100644 index 0000000000000000000000000000000000000000..be37a8d3b2b4376430c6fbb766b9d005f7eb0ed5 --- /dev/null +++ b/eval-queue/aloobun/open-llama-3b-v2-elmv3_eval_request_False_float16_Original.json @@ -0,0 +1,16 @@
+{
+ "model": "aloobun/open-llama-3b-v2-elmv3",
+ "base_model": "",
+ "revision": "main",
+ "private": false,
+ "precision": "float16",
+ "weight_type": "Original",
+ "status": "FINISHED",
+ "submitted_time": "2023-12-08T12:05:20Z",
+ "model_type": "\ud83d\udd36 : fine-tuned",
+ "likes": 0,
+ "params": 3.426,
+ "license": "apache-2.0",
+ "job_id": "875012",
+ "job_start_time": "2023-12-09T16:01:23.712638"
+}
\ No newline at end of file
diff --git a/eval-queue/andreaskoepf/llama2-13b-megacode2_min100_eval_request_False_float16_Original.json b/eval-queue/andreaskoepf/llama2-13b-megacode2_min100_eval_request_False_float16_Original.json new file mode 100644 index 0000000000000000000000000000000000000000..aa08dbd3b7befef08ce14ea7dfd6e289dd775eb3 --- /dev/null +++ b/eval-queue/andreaskoepf/llama2-13b-megacode2_min100_eval_request_False_float16_Original.json @@ -0,0 +1 @@ +{"model": "andreaskoepf/llama2-13b-megacode2_min100", "base_model": "", "revision": "main", "private": false, "precision": "float16", "weight_type": "Original", "status": "FINISHED", "submitted_time": "2023-10-16T12:48:18Z", "model_type": "\ud83d\udd36 : fine-tuned", "job_id": "470761", "license": "other", "likes": 0, "params": 12.852} \ No newline at end of file diff --git a/eval-queue/anton-l/gpt-j-tiny-random_eval_request_False_False_False.json b/eval-queue/anton-l/gpt-j-tiny-random_eval_request_False_False_False.json new file mode 100644 index 0000000000000000000000000000000000000000..e6d707dcb1fdaa2e9f3b6ee2b218b926d23c93dc --- /dev/null +++ b/eval-queue/anton-l/gpt-j-tiny-random_eval_request_False_False_False.json @@ -0,0 +1 @@ +{"model": "anton-l/gpt-j-tiny-random", "base_model": "", "revision": "main", "private": false, "status": "FINISHED", "job_id": "522144", "weight_type": "Original", "precision": "float16", "model_type": "fine-tuned", "submitted_time": "2023-10-16T12:58:30Z", "license": "?", "likes": 1, "params": 0.051} \ No newline at end of file diff --git a/eval-queue/ashercn97/giraffe-7b_eval_request_False_float16_Original.json b/eval-queue/ashercn97/giraffe-7b_eval_request_False_float16_Original.json new file mode 100644 index 0000000000000000000000000000000000000000..cd1bd3a054638bf073001d15ab3a7bbb65177867 --- /dev/null +++ b/eval-queue/ashercn97/giraffe-7b_eval_request_False_float16_Original.json @@ -0,0 +1 @@ +{"model": "ashercn97/giraffe-7b", "base_model": "", "revision": "main", "private": false, "precision": "float16", "weight_type": "Original", "status": "FINISHED", "submitted_time": "2023-10-16T12:48:18Z", "model_type": "fine-tuned", "job_id": "471801", "license": "?", "likes": 0, "params": 6.607} \ No newline at end of file diff --git a/eval-queue/ashercn97/manatee-7b_eval_request_False_float16_Original.json b/eval-queue/ashercn97/manatee-7b_eval_request_False_float16_Original.json new file mode 100644 index 0000000000000000000000000000000000000000..5c25ff3f0827643d0c6a24743840cc4308ec1630 --- /dev/null +++ b/eval-queue/ashercn97/manatee-7b_eval_request_False_float16_Original.json @@ -0,0 +1 @@ +{"model": "ashercn97/manatee-7b", "base_model": "", "revision": "main", "private": false, "precision": "float16", "weight_type": "Original", "status": "FINISHED", "submitted_time": "2023-10-16T12:46:18Z", "model_type": "fine-tuned", "job_id": "461663", "license": "?", "likes": 2, "params": 6.607} \ No newline
at end of file diff --git a/eval-queue/bert-base-uncased_eval_request_False_False_False.json b/eval-queue/bert-base-uncased_eval_request_False_False_False.json new file mode 100644 index 0000000000000000000000000000000000000000..f1702a0892702197c5932dab047516be58afe878 --- /dev/null +++ b/eval-queue/bert-base-uncased_eval_request_False_False_False.json @@ -0,0 +1 @@ +{"model": "bert-base-uncased", "base_model": "", "revision": "main", "private": false, "status": "FINISHED", "job_id": "169851", "weight_type": "Original", "precision": "float16", "license": "apache-2.0", "likes": 1139, "params": 0.11} \ No newline at end of file diff --git a/eval-queue/bertin-project/bertin-gpt-j-6B-alpaca_eval_request_False_float16_Original.json b/eval-queue/bertin-project/bertin-gpt-j-6B-alpaca_eval_request_False_float16_Original.json new file mode 100644 index 0000000000000000000000000000000000000000..0825b214ba24c485b7c543b65a473988c8ff41a3 --- /dev/null +++ b/eval-queue/bertin-project/bertin-gpt-j-6B-alpaca_eval_request_False_float16_Original.json @@ -0,0 +1 @@ +{"model": "bertin-project/bertin-gpt-j-6B-alpaca", "base_model": "", "revision": "main", "private": false, "precision": "float16", "weight_type": "Original", "status": "FINISHED", "submitted_time": "2023-10-16T12:48:18Z", "model_type": "fine-tuned", "job_id": "470959", "license": "openrail", "likes": 8, "params": 5.844} \ No newline at end of file diff --git a/eval-queue/boomerchan/magpie-13b_eval_request_False_float16_Original.json b/eval-queue/boomerchan/magpie-13b_eval_request_False_float16_Original.json new file mode 100644 index 0000000000000000000000000000000000000000..78388b752004f519c637ae33a7975ed28c71e78c --- /dev/null +++ b/eval-queue/boomerchan/magpie-13b_eval_request_False_float16_Original.json @@ -0,0 +1 @@ +{"model": "boomerchan/magpie-13b", "base_model": "", "revision": "main", "private": false, "precision": "float16", "weight_type": "Original", "status": "FINISHED", "submitted_time": "2023-10-16T12:54:17Z", "model_type": "\ud83d\udd36 : fine-tuned", "job_id": "519810", "license": "llama2", "likes": 6, "params": 12.852} \ No newline at end of file diff --git a/eval-queue/camel-ai/CAMEL-13B-Combined-Data_eval_request_False_False_False.json b/eval-queue/camel-ai/CAMEL-13B-Combined-Data_eval_request_False_False_False.json new file mode 100644 index 0000000000000000000000000000000000000000..e61797e0cf4c747784dbf9913ba4f429d0d6735f --- /dev/null +++ b/eval-queue/camel-ai/CAMEL-13B-Combined-Data_eval_request_False_False_False.json @@ -0,0 +1 @@ +{"model": "camel-ai/CAMEL-13B-Combined-Data", "base_model": "", "revision": "main", "private": false, "status": "FINISHED", "submitted_time": "2023-09-09T10:52:17Z", "job_id": "472517", "weight_type": "Original", "precision": "float16", "model_type": "fine-tuned", "license": "?", "likes": 11, "params": 12.852} \ No newline at end of file diff --git a/eval-queue/camel-ai/CAMEL-13B-Role-Playing-Data_eval_request_False_False_False.json b/eval-queue/camel-ai/CAMEL-13B-Role-Playing-Data_eval_request_False_False_False.json new file mode 100644 index 0000000000000000000000000000000000000000..bb057b74c152eaf7b12f94edbeb817b902866a77 --- /dev/null +++ b/eval-queue/camel-ai/CAMEL-13B-Role-Playing-Data_eval_request_False_False_False.json @@ -0,0 +1 @@ +{"model": "camel-ai/CAMEL-13B-Role-Playing-Data", "base_model": "", "revision": "main", "private": false, "status": "FINISHED", "submitted_time": "2023-10-16T12:48:18Z", "job_id": "517234", "weight_type": "Original", "precision": "float16", "model_type": 
"fine-tuned", "license": "?", "likes": 12, "params": 12.852} \ No newline at end of file diff --git a/eval-queue/camel-ai/CAMEL-33B-Combined-Data_eval_request_False_float16_Original.json b/eval-queue/camel-ai/CAMEL-33B-Combined-Data_eval_request_False_float16_Original.json new file mode 100644 index 0000000000000000000000000000000000000000..ed5bd8876191604a85db1d3624261865ab5b8f4a --- /dev/null +++ b/eval-queue/camel-ai/CAMEL-33B-Combined-Data_eval_request_False_float16_Original.json @@ -0,0 +1 @@ +{"model": "camel-ai/CAMEL-33B-Combined-Data", "base_model": "", "revision": "main", "private": false, "precision": "float16", "weight_type": "Original", "status": "FINISHED", "submitted_time": "2023-09-09T10:52:17Z", "model_type": "fine-tuned", "job_id": "461450", "license": "?", "likes": 3, "params": 32.316} \ No newline at end of file diff --git a/eval-queue/chachamatcha/NoDrama-CodeLLama-QLoRa-Evol-Adapter_eval_request_False_8bit_Adapter.json b/eval-queue/chachamatcha/NoDrama-CodeLLama-QLoRa-Evol-Adapter_eval_request_False_8bit_Adapter.json new file mode 100644 index 0000000000000000000000000000000000000000..5feb6764276a7c6544615ef2fc4422826885818c --- /dev/null +++ b/eval-queue/chachamatcha/NoDrama-CodeLLama-QLoRa-Evol-Adapter_eval_request_False_8bit_Adapter.json @@ -0,0 +1 @@ +{"model": "chachamatcha/NoDrama-CodeLLama-QLoRa-Evol-Adapter", "base_model": "Phind/Phind-CodeLlama-34B-v1", "revision": "main", "private": false, "precision": "8bit", "weight_type": "Adapter", "status": "FAILED", "submitted_time": "2023-09-12T22:21:27Z", "model_type": "\ud83d\udd36 : fine-tuned", "job_id": "440429", "license": "?", "likes": 0, "params": 0} \ No newline at end of file diff --git a/eval-queue/chachamatcha/NoDrama-CodeLLama-QLoRa-Evol_eval_request_False_8bit_Adapter.json b/eval-queue/chachamatcha/NoDrama-CodeLLama-QLoRa-Evol_eval_request_False_8bit_Adapter.json new file mode 100644 index 0000000000000000000000000000000000000000..1eca0efd28337c0e29a7eab6a87ccd53c4713b78 --- /dev/null +++ b/eval-queue/chachamatcha/NoDrama-CodeLLama-QLoRa-Evol_eval_request_False_8bit_Adapter.json @@ -0,0 +1 @@ +{"model": "chachamatcha/NoDrama-CodeLLama-QLoRa-Evol", "base_model": "Phind/Phind-CodeLlama-34B-v1", "revision": "main", "private": false, "precision": "8bit", "weight_type": "Adapter", "status": "FAILED", "submitted_time": "2023-09-12T22:03:29Z", "model_type": "\ud83d\udd36 : fine-tuned", "job_id": "440424", "license": "llama2", "likes": 0, "params": 33.826} \ No newline at end of file diff --git a/eval-queue/chaoyi-wu/MedLLaMA_13B_eval_request_False_False_False.json b/eval-queue/chaoyi-wu/MedLLaMA_13B_eval_request_False_False_False.json new file mode 100644 index 0000000000000000000000000000000000000000..7b3431a7c3b39e0ae1b0d5565a5ce6051b9f2a0b --- /dev/null +++ b/eval-queue/chaoyi-wu/MedLLaMA_13B_eval_request_False_False_False.json @@ -0,0 +1 @@ +{"model": "chaoyi-wu/MedLLaMA_13B", "base_model": "", "revision": "main", "private": false, "status": "FINISHED", "submitted_time": "2023-11-06T10:31:15Z", "weight_type": "Original", "precision": "float16", "job_id": "631807", "model_type": "fine-tuned", "license": "apache-2.0", "likes": 28, "params": 12.852} \ No newline at end of file diff --git a/eval-queue/clibrain/Llama-2-13b-ft-instruct-es_eval_request_False_float16_Original.json b/eval-queue/clibrain/Llama-2-13b-ft-instruct-es_eval_request_False_float16_Original.json new file mode 100644 index 0000000000000000000000000000000000000000..aad8e102761b677a6cdb88260ca27c1a9b9e5fb6 --- /dev/null +++ 
b/eval-queue/clibrain/Llama-2-13b-ft-instruct-es_eval_request_False_float16_Original.json @@ -0,0 +1 @@ +{"model": "clibrain/Llama-2-13b-ft-instruct-es", "base_model": "", "revision": "main", "private": false, "precision": "float16", "weight_type": "Original", "status": "FINISHED", "submitted_time": "2023-09-09T10:52:17Z", "model_type": "\u2b55 : instruction-tuned", "job_id": "462043", "license": "apache-2.0", "likes": 10, "params": 12.852} \ No newline at end of file diff --git a/eval-queue/clibrain/Llama-2-7b-ft-instruct-es_eval_request_False_float16_Original.json b/eval-queue/clibrain/Llama-2-7b-ft-instruct-es_eval_request_False_float16_Original.json new file mode 100644 index 0000000000000000000000000000000000000000..4990f986eb2ea056b9e39d797942d28b0f1b0974 --- /dev/null +++ b/eval-queue/clibrain/Llama-2-7b-ft-instruct-es_eval_request_False_float16_Original.json @@ -0,0 +1 @@ +{"model": "clibrain/Llama-2-7b-ft-instruct-es", "base_model": "", "revision": "main", "private": false, "precision": "float16", "weight_type": "Original", "status": "FINISHED", "submitted_time": "2023-10-16T12:48:18Z", "model_type": "\ud83d\udd36 : fine-tuned", "job_id": "461539", "license": "apache-2.0", "likes": 13, "params": 6.607} \ No newline at end of file diff --git a/eval-queue/clibrain/Llama-2-ft-instruct-es_eval_request_False_float16_Original.json b/eval-queue/clibrain/Llama-2-ft-instruct-es_eval_request_False_float16_Original.json new file mode 100644 index 0000000000000000000000000000000000000000..729a97ef3b4c7edd9669b42b1a99fb9a4bbc3953 --- /dev/null +++ b/eval-queue/clibrain/Llama-2-ft-instruct-es_eval_request_False_float16_Original.json @@ -0,0 +1 @@ +{"model": "clibrain/Llama-2-ft-instruct-es", "base_model": "", "revision": "main", "private": false, "precision": "float16", "weight_type": "Original", "status": "FINISHED", "submitted_time": "2023-10-16T12:46:18Z", "model_type": "fine-tuned", "job_id": "461235", "license": "apache-2.0", "likes": 16, "params": 6.607} \ No newline at end of file diff --git a/eval-queue/concedo/OPT-19M-ChatSalad_eval_request_False_False_False.json b/eval-queue/concedo/OPT-19M-ChatSalad_eval_request_False_False_False.json new file mode 100644 index 0000000000000000000000000000000000000000..ac70a94df4adcb852e0431fc5c5e4a18f2a5b7c7 --- /dev/null +++ b/eval-queue/concedo/OPT-19M-ChatSalad_eval_request_False_False_False.json @@ -0,0 +1 @@ +{"model": "concedo/OPT-19M-ChatSalad", "base_model": "", "revision": "main", "private": false, "status": "FINISHED", "job_id": "470770", "weight_type": "Original", "precision": "float16", "model_type": "fine-tuned", "submitted_time": "2023-09-09T10:52:17Z", "license": "other", "likes": 15, "params": 0.019} \ No newline at end of file diff --git a/eval-queue/concedo/Pythia-70M-ChatSalad_eval_request_False_False_False.json b/eval-queue/concedo/Pythia-70M-ChatSalad_eval_request_False_False_False.json new file mode 100644 index 0000000000000000000000000000000000000000..4f6e36bdbe4c377e3c253e36e1f42dc0e02739b2 --- /dev/null +++ b/eval-queue/concedo/Pythia-70M-ChatSalad_eval_request_False_False_False.json @@ -0,0 +1 @@ +{"model": "concedo/Pythia-70M-ChatSalad", "base_model": "", "revision": "main", "private": false, "status": "FINISHED", "job_id": "472276", "weight_type": "Original", "precision": "float16", "model_type": "fine-tuned", "submitted_time": "2023-10-16T12:48:18Z", "license": "other", "likes": 5, "params": 0.096} \ No newline at end of file diff --git a/eval-queue/concedo/Vicuzard-30B-Uncensored_eval_request_False_False_False.json 
b/eval-queue/concedo/Vicuzard-30B-Uncensored_eval_request_False_False_False.json new file mode 100644 index 0000000000000000000000000000000000000000..d0ada939a4f127e0825438c9b2ef909080269327 --- /dev/null +++ b/eval-queue/concedo/Vicuzard-30B-Uncensored_eval_request_False_False_False.json @@ -0,0 +1 @@ +{"model": "concedo/Vicuzard-30B-Uncensored", "base_model": "", "revision": "main", "private": false, "status": "FINISHED", "submitted_time": "2023-09-09T10:52:17Z", "job_id": "472368", "weight_type": "Original", "precision": "float16", "model_type": "fine-tuned", "license": "other", "likes": 11, "params": 32.316} \ No newline at end of file diff --git a/eval-queue/dahara1/ELYZA-japanese-Llama-2-7b-fast-instruct-GPTQ_eval_request_False_GPTQ_Original.json b/eval-queue/dahara1/ELYZA-japanese-Llama-2-7b-fast-instruct-GPTQ_eval_request_False_GPTQ_Original.json new file mode 100644 index 0000000000000000000000000000000000000000..d8701fa408f1423ec0543464e87696389aa2587f --- /dev/null +++ b/eval-queue/dahara1/ELYZA-japanese-Llama-2-7b-fast-instruct-GPTQ_eval_request_False_GPTQ_Original.json @@ -0,0 +1,16 @@
+{
+ "model": "dahara1/ELYZA-japanese-Llama-2-7b-fast-instruct-GPTQ",
+ "base_model": null,
+ "revision": "main",
+ "private": false,
+ "precision": "GPTQ",
+ "weight_type": "Original",
+ "status": "FAILED",
+ "submitted_time": "2023-11-10T07:19:55Z",
+ "model_type": "\u2b55 : instruction-tuned",
+ "job_id": null,
+ "license": "llama2",
+ "likes": 2,
+ "params": 9.904,
+ "job_start_time": null
+}
\ No newline at end of file
diff --git a/eval-queue/dahara1/weblab-10b-instruction-sft-GPTQ_eval_request_False_GPTQ_Delta.json b/eval-queue/dahara1/weblab-10b-instruction-sft-GPTQ_eval_request_False_GPTQ_Delta.json new file mode 100644 index 0000000000000000000000000000000000000000..c0290dda492c02f34b0a068a4f6e024d58a6bbf3 --- /dev/null +++ b/eval-queue/dahara1/weblab-10b-instruction-sft-GPTQ_eval_request_False_GPTQ_Delta.json @@ -0,0 +1,16 @@
+{
+ "model": "dahara1/weblab-10b-instruction-sft-GPTQ",
+ "base_model": null,
+ "revision": "main",
+ "private": false,
+ "precision": "GPTQ",
+ "weight_type": "Original",
+ "status": "FAILED",
+ "submitted_time": "2023-11-10T02:41:29Z",
+ "model_type": "\u2b55 : instruction-tuned",
+ "job_id": null,
+ "license": "cc-by-nc-4.0",
+ "likes": 12,
+ "params": 14.88,
+ "job_start_time": null
+}
\ No newline at end of file
diff --git a/eval-queue/deepnight-research/Saily_220B_eval_request_False_float16_Original.json b/eval-queue/deepnight-research/Saily_220B_eval_request_False_float16_Original.json new file mode 100644 index 0000000000000000000000000000000000000000..a625d89ae9031cec7ece68008f2b35855af827c9 --- /dev/null +++ b/eval-queue/deepnight-research/Saily_220B_eval_request_False_float16_Original.json @@ -0,0 +1,16 @@
+{
+ "model": "deepnight-research/Saily_220B",
+ "base_model": "",
+ "revision": "main",
+ "private": false,
+ "precision": "float16",
+ "weight_type": "Original",
+ "status": "FAILED",
+ "submitted_time": "2023-12-17T16:40:41Z",
+ "model_type": "\ud83d\udd36 : fine-tuned",
+ "likes": 6,
+ "params": 208.448,
+ "license": "llama2",
+ "job_id": null,
+ "job_start_time": null
+}
\ No newline at end of file
diff --git a/eval-queue/deepnight-research/lil-c3po_eval_request_False_float16_Original.json b/eval-queue/deepnight-research/lil-c3po_eval_request_False_float16_Original.json new file mode 100644 index 0000000000000000000000000000000000000000..263e9d7c0e5bf0b4f8cce28570995f113b952b2a --- /dev/null +++ b/eval-queue/deepnight-research/lil-c3po_eval_request_False_float16_Original.json @@ -0,0 +1,16 @@
+{
+ "model": "deepnight-research/lil-c3po",
+ "base_model": "",
+ "revision": "main",
+ "private": false,
+ "precision": "float16",
+ "weight_type": "Original",
+ "status": "FINISHED",
+ "submitted_time": "2023-12-16T08:43:04Z",
+ "model_type": "\ud83d\udfe6 : RL-tuned",
+ "likes": 0,
+ "params": 7.242,
+ "license": "mit",
+ "job_id": "924776",
+ "job_start_time": "2023-12-16T15:11:32.732090"
+}
\ No newline at end of file
diff --git a/eval-queue/deepnight-research/robin-70B_eval_request_False_float16_Original.json b/eval-queue/deepnight-research/robin-70B_eval_request_False_float16_Original.json new file mode 100644 index 0000000000000000000000000000000000000000..fb4a1a5ca02ab211bff9a2c07ee7de30e5ab4daf --- /dev/null +++ b/eval-queue/deepnight-research/robin-70B_eval_request_False_float16_Original.json @@ -0,0 +1 @@ +{"model": "deepnight-research/robin-70B", "base_model": "", "revision": "main", "private": false, "precision": "float16", "weight_type": "Original", "status": "FAILED", "submitted_time": "2023-08-29T14:05:24Z", "model_type": "\u2b55 : instruction-tuned", "job_id": "397114", "params": 70.0, "license": "?", "likes": 0} \ No newline at end of file diff --git a/eval-queue/deepnight-research/robin-70b-v1_eval_request_False_bfloat16_Original.json b/eval-queue/deepnight-research/robin-70b-v1_eval_request_False_bfloat16_Original.json new file mode 100644 index 0000000000000000000000000000000000000000..cf418bd3ac8d89fb33e8216b7f3acd6d9f103ffd --- /dev/null +++ b/eval-queue/deepnight-research/robin-70b-v1_eval_request_False_bfloat16_Original.json @@ -0,0 +1 @@ +{"model": "deepnight-research/robin-70b-v1", "base_model": "petra", "revision": "main", "private": false, "precision": "bfloat16", "weight_type": "Original", "status": "FAILED", "submitted_time": "2023-08-29T14:05:39Z", "model_type": "\ud83d\udd36 : fine-tuned", "job_id": "397169", "params": 70.0, "license": "?", "likes": 0} \ No newline at end of file diff --git a/eval-queue/deepnight-research/robin-70b-v1_eval_request_False_float16_Original.json b/eval-queue/deepnight-research/robin-70b-v1_eval_request_False_float16_Original.json new file mode 100644 index 0000000000000000000000000000000000000000..65b45da8aa409746140d23d9a1fecba098c2ebc8 --- /dev/null +++ b/eval-queue/deepnight-research/robin-70b-v1_eval_request_False_float16_Original.json @@ -0,0 +1 @@ +{"model": "deepnight-research/robin-70b-v1", "base_model": "", "revision": "main", "private": false, "precision": "float16", "weight_type": "Original", "status": "FAILED", "submitted_time": "2023-08-29T14:05:24Z", "model_type": "\u2b55 : instruction-tuned", "job_id": "397121", "params": 70.0, "license": "?", "likes": 0} \ No newline at end of file diff --git a/eval-queue/deepnight-research/robin_eval_request_False_float16_Original.json b/eval-queue/deepnight-research/robin_eval_request_False_float16_Original.json new file mode 100644 index 0000000000000000000000000000000000000000..39f52884011efb9795cdf95b3a93f87e43cfe8db --- /dev/null +++ b/eval-queue/deepnight-research/robin_eval_request_False_float16_Original.json @@ -0,0 +1 @@ +{"model": "deepnight-research/robin", "base_model": "", "revision": "main", "private": false, "precision": "float16", "weight_type": "Original", "status": "FAILED", "submitted_time": "2023-08-22T08:38:22Z", "model_type": "\u2b55 : instruction-tuned", "job_id": "385620", "params": 0, "license": "?", "likes": 0} \ No newline at end of file diff --git a/eval-queue/deepnight-research/saily_100b_eval_request_False_float16_Original.json b/eval-queue/deepnight-research/saily_100b_eval_request_False_float16_Original.json new file mode 100644 index 0000000000000000000000000000000000000000..43bf9d83b1c1a115a80ffb6ea5a55c7ea25c05a9 --- /dev/null +++ b/eval-queue/deepnight-research/saily_100b_eval_request_False_float16_Original.json @@ -0,0 +1,16 @@
+{
+ "model": "deepnight-research/saily_100b",
+ "base_model": "llama",
+ "revision": "main",
+ "private": false,
+ "precision": "float16",
+ "weight_type": "Original",
+ "status": "FAILED",
+ "submitted_time": "2023-12-01T02:07:49Z",
+ "model_type": "\ud83d\udd36 : fine-tuned",
+ "likes": 0,
+ "params": 117.749,
+ "license": "mit",
+ "job_id": "846010",
+ "job_start_time": "2023-12-04T14:08:30.270295"
+}
\ No newline at end of file
diff --git a/eval-queue/deepnight-research/zsc-text_eval_request_False_float16_Original.json b/eval-queue/deepnight-research/zsc-text_eval_request_False_float16_Original.json new file mode 100644 index 0000000000000000000000000000000000000000..db2e4d7ba7d7b28bd80a16a97043d24db9925560 --- /dev/null +++ b/eval-queue/deepnight-research/zsc-text_eval_request_False_float16_Original.json @@ -0,0 +1 @@ +{"model": "deepnight-research/zsc-text", "base_model": "", "revision": "main", "private": false, "precision": "float16", "weight_type": "Original", "status": "FINISHED", "submitted_time": "2023-10-16T12:48:18Z", "model_type": "fine-tuned", "job_id": "516599", "params": 0, "license": "?", "likes": 0} \ No newline at end of file diff --git a/eval-queue/digitous/13B-Chimera_eval_request_False_False_False.json b/eval-queue/digitous/13B-Chimera_eval_request_False_False_False.json new file mode 100644 index 0000000000000000000000000000000000000000..3bd3e7853b8d5261b93ddefe5c889e362f128d4f --- /dev/null +++ b/eval-queue/digitous/13B-Chimera_eval_request_False_False_False.json @@ -0,0 +1 @@ +{"model": "digitous/13B-Chimera", "base_model": "", "revision": "main", "private": false, "status": "FINISHED", "job_id": "510719", "weight_type": "Original", "precision": "float16", "submitted_time": "2023-09-09T10:52:17Z", "license": "?", "likes": 6, "params": 12.852} \ No newline at end of file diff --git a/eval-queue/digitous/13B-HyperMantis_eval_request_False_False_False.json b/eval-queue/digitous/13B-HyperMantis_eval_request_False_False_False.json new file mode 100644 index 0000000000000000000000000000000000000000..8e1dfd730876d2f21b545ca6a3fe2e23f9589382 --- /dev/null +++ b/eval-queue/digitous/13B-HyperMantis_eval_request_False_False_False.json @@ -0,0 +1 @@ +{"model": "digitous/13B-HyperMantis", "base_model": "", "revision": "main", "private": false, "status": "FINISHED", "job_id": "498730", "weight_type": "Original", "precision": "float16", "model_type": "fine-tuned", "submitted_time": "2023-10-16T12:48:18Z", "license": "other", "likes": 26, "params": 12.852} \ No newline at end of file diff --git a/eval-queue/digitous/Adventien-GPTJ_eval_request_False_False_False.json b/eval-queue/digitous/Adventien-GPTJ_eval_request_False_False_False.json new file mode 100644 index 0000000000000000000000000000000000000000..eaf5ca43eab8a1e1c4c621321d5f78cd80225ee0 --- /dev/null +++ b/eval-queue/digitous/Adventien-GPTJ_eval_request_False_False_False.json @@ -0,0 +1 @@ +{"model": "digitous/Adventien-GPTJ", "base_model": "", "revision": "main", "private": false, "status": "FINISHED", "job_id": "470829", "weight_type": "Original", "precision": "float16", "model_type": "fine-tuned", "submitted_time": "2023-09-09T10:52:17Z", "license": "?", "likes": 0, "params": 5.844} \ No newline at end of file
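Every request file in this queue follows the naming convention <model>_eval_request_<private>_<precision>_<weight_type>.json (older submissions used the placeholder False_False_False for the last two fields), and the record inside repeats those fields. The two encodings can drift apart: the dahara1/weblab-10b-instruction-sft-GPTQ file above is named _GPTQ_Delta.json while its record stores "weight_type": "Original". A minimal consistency-check sketch; the regex and helper name are assumptions for illustration, not part of the leaderboard's own tooling:

```python
import json
import re
from pathlib import Path

# Hypothetical pattern, inferred from the entries in this queue; the
# leaderboard's own code may parse these filenames differently.
FILENAME_RE = re.compile(
    r"^(?P<model>.+)_eval_request_"
    r"(?P<private>True|False)_(?P<precision>[^_]+)_(?P<weight_type>[^_]+)\.json$"
)

def check_request_file(path: Path) -> list[str]:
    """Flag mismatches between filename fields and the JSON record inside."""
    m = FILENAME_RE.match(path.name)
    if m is None:
        return [f"{path.name}: unrecognized filename pattern"]
    record = json.loads(path.read_text())
    problems = []
    # Legacy names end in _False_False, i.e. they encode no precision or
    # weight_type, so only compare fields the filename actually carries.
    for field in ("precision", "weight_type"):
        if m[field] != "False" and record.get(field) != m[field]:
            problems.append(
                f"{path.name}: filename says {field}={m[field]!r}, "
                f"record says {record.get(field)!r}"
            )
    return problems
```

Run over eval-queue/dahara1/, a check like this would flag the _GPTQ_Delta.json file above, whose record stores "weight_type": "Original".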
"2023-09-09T10:52:17Z", "license": "?", "likes": 0, "params": 5.844} \ No newline at end of file diff --git a/eval-queue/digitous/Alpacino13b_eval_request_False_False_False.json b/eval-queue/digitous/Alpacino13b_eval_request_False_False_False.json new file mode 100644 index 0000000000000000000000000000000000000000..bfbd742eae6ee9a6994f3b6e31e3bac99b64f633 --- /dev/null +++ b/eval-queue/digitous/Alpacino13b_eval_request_False_False_False.json @@ -0,0 +1 @@ +{"model": "digitous/Alpacino13b", "base_model": "", "revision": "main", "private": false, "status": "FINISHED", "job_id": "497801", "weight_type": "Original", "precision": "float16", "model_type": "fine-tuned", "submitted_time": "2023-10-16T12:48:18Z", "license": "other", "likes": 29, "params": 12.852} \ No newline at end of file diff --git a/eval-queue/digitous/Alpacino30b_eval_request_False_False_False.json b/eval-queue/digitous/Alpacino30b_eval_request_False_False_False.json new file mode 100644 index 0000000000000000000000000000000000000000..50827a4631b29f66cd050279b66e24afeaa50d61 --- /dev/null +++ b/eval-queue/digitous/Alpacino30b_eval_request_False_False_False.json @@ -0,0 +1 @@ +{"model": "digitous/Alpacino30b", "base_model": "", "revision": "main", "private": false, "status": "FINISHED", "job_id": "461411", "weight_type": "Original", "precision": "float16", "model_type": "fine-tuned", "submitted_time": "2023-10-16T12:48:18Z", "license": "other", "likes": 67, "params": 32.316} \ No newline at end of file diff --git a/eval-queue/digitous/GPT-R_eval_request_False_False_False.json b/eval-queue/digitous/GPT-R_eval_request_False_False_False.json new file mode 100644 index 0000000000000000000000000000000000000000..251fe2faf5fc37f1829554b5186088e74561ed32 --- /dev/null +++ b/eval-queue/digitous/GPT-R_eval_request_False_False_False.json @@ -0,0 +1 @@ +{"model": "digitous/GPT-R", "base_model": "", "revision": "main", "private": false, "status": "FINISHED", "job_id": "510352", "weight_type": "Original", "precision": "float16", "model_type": "fine-tuned", "submitted_time": "2023-09-09T10:52:17Z", "license": "bigscience-openrail-m", "likes": 9, "params": 5.844} \ No newline at end of file diff --git a/eval-queue/digitous/Janin-GPTJ_eval_request_False_False_False.json b/eval-queue/digitous/Janin-GPTJ_eval_request_False_False_False.json new file mode 100644 index 0000000000000000000000000000000000000000..a6e4740a0aa36f7177e1302166243437f773c3f9 --- /dev/null +++ b/eval-queue/digitous/Janin-GPTJ_eval_request_False_False_False.json @@ -0,0 +1 @@ +{"model": "digitous/Janin-GPTJ", "base_model": "", "revision": "main", "private": false, "status": "FINISHED", "job_id": "495019", "weight_type": "Original", "precision": "float16", "model_type": "fine-tuned", "submitted_time": "2023-09-09T10:52:17Z", "license": "creativeml-openrail-m", "likes": 0, "params": 5.844} \ No newline at end of file diff --git a/eval-queue/digitous/Janin-R_eval_request_False_False_False.json b/eval-queue/digitous/Janin-R_eval_request_False_False_False.json new file mode 100644 index 0000000000000000000000000000000000000000..13d1bd44646d2caf02372539198e4317a60796bf --- /dev/null +++ b/eval-queue/digitous/Janin-R_eval_request_False_False_False.json @@ -0,0 +1 @@ +{"model": "digitous/Janin-R", "base_model": "", "revision": "main", "private": false, "status": "FINISHED", "job_id": "461159", "weight_type": "Original", "precision": "float16", "model_type": "fine-tuned", "submitted_time": "2023-10-16T12:48:18Z", "license": "creativeml-openrail-m", "likes": 1, "params": 5.844} \ No 
newline at end of file diff --git a/eval-queue/digitous/Javalion-GPTJ_eval_request_False_False_False.json b/eval-queue/digitous/Javalion-GPTJ_eval_request_False_False_False.json new file mode 100644 index 0000000000000000000000000000000000000000..d076c16f1e3f42bdd9ed39d08aefaccd77e3537c --- /dev/null +++ b/eval-queue/digitous/Javalion-GPTJ_eval_request_False_False_False.json @@ -0,0 +1 @@ +{"model": "digitous/Javalion-GPTJ", "base_model": "", "revision": "main", "private": false, "status": "FINISHED", "job_id": "503966", "weight_type": "Original", "precision": "float16", "model_type": "fine-tuned", "submitted_time": "2023-09-09T10:52:17Z", "license": "creativeml-openrail-m", "likes": 1, "params": 5.844} \ No newline at end of file diff --git a/eval-queue/digitous/Javalion-R_eval_request_False_False_False.json b/eval-queue/digitous/Javalion-R_eval_request_False_False_False.json new file mode 100644 index 0000000000000000000000000000000000000000..fa1dd98463caeedb39d33aebbdb13b7ff5fe54f8 --- /dev/null +++ b/eval-queue/digitous/Javalion-R_eval_request_False_False_False.json @@ -0,0 +1 @@ +{"model": "digitous/Javalion-R", "base_model": "", "revision": "main", "private": false, "status": "FINISHED", "job_id": "494553", "weight_type": "Original", "precision": "float16", "model_type": "fine-tuned", "submitted_time": "2023-10-16T12:48:18Z", "license": "creativeml-openrail-m", "likes": 5, "params": 5.844} \ No newline at end of file diff --git a/eval-queue/digitous/Javelin-GPTJ_eval_request_False_False_False.json b/eval-queue/digitous/Javelin-GPTJ_eval_request_False_False_False.json new file mode 100644 index 0000000000000000000000000000000000000000..e520ba1c5cdb7962f2ca977195d4cc1e177df75b --- /dev/null +++ b/eval-queue/digitous/Javelin-GPTJ_eval_request_False_False_False.json @@ -0,0 +1 @@ +{"model": "digitous/Javelin-GPTJ", "base_model": "", "revision": "main", "private": false, "status": "FINISHED", "job_id": "498712", "weight_type": "Original", "precision": "float16", "model_type": "fine-tuned", "submitted_time": "2023-09-09T10:52:17Z", "license": "creativeml-openrail-m", "likes": 4, "params": 5.844} \ No newline at end of file diff --git a/eval-queue/digitous/Javelin-R_eval_request_False_False_False.json b/eval-queue/digitous/Javelin-R_eval_request_False_False_False.json new file mode 100644 index 0000000000000000000000000000000000000000..4a82b56b6745a58b49ece10d550333f55faa6af7 --- /dev/null +++ b/eval-queue/digitous/Javelin-R_eval_request_False_False_False.json @@ -0,0 +1 @@ +{"model": "digitous/Javelin-R", "base_model": "", "revision": "main", "private": false, "status": "FINISHED", "job_id": "503973", "weight_type": "Original", "precision": "float16", "model_type": "fine-tuned", "submitted_time": "2023-09-09T10:52:17Z", "license": "creativeml-openrail-m", "likes": 2, "params": 5.844} \ No newline at end of file diff --git a/eval-queue/digitous/Skegma-GPTJ_eval_request_False_False_False.json b/eval-queue/digitous/Skegma-GPTJ_eval_request_False_False_False.json new file mode 100644 index 0000000000000000000000000000000000000000..2bd2192c9937f8cf1dec5c6df745809cdac91599 --- /dev/null +++ b/eval-queue/digitous/Skegma-GPTJ_eval_request_False_False_False.json @@ -0,0 +1 @@ +{"model": "digitous/Skegma-GPTJ", "base_model": "", "revision": "main", "private": false, "status": "FINISHED", "job_id": "511525", "weight_type": "Original", "precision": "float16", "model_type": "fine-tuned", "submitted_time": "2023-09-09T10:52:17Z", "license": "creativeml-openrail-m", "likes": 0, "params": 5.844} \ No 
newline at end of file diff --git a/eval-queue/distilgpt2_eval_request_False_False_False.json b/eval-queue/distilgpt2_eval_request_False_False_False.json new file mode 100644 index 0000000000000000000000000000000000000000..bd9e4cfd35371f11a1a9b6b585686b88b78f1e23 --- /dev/null +++ b/eval-queue/distilgpt2_eval_request_False_False_False.json @@ -0,0 +1 @@ +{"model": "distilgpt2", "base_model": "", "revision": "main", "private": false, "status": "FINISHED", "job_id": "169903", "weight_type": "Original", "precision": "float16", "license": "apache-2.0", "likes": 262, "params": 0.088} \ No newline at end of file diff --git a/eval-queue/distilgpt2_eval_request_False_float16_Original.json b/eval-queue/distilgpt2_eval_request_False_float16_Original.json new file mode 100644 index 0000000000000000000000000000000000000000..291bf6bb9c1618ead25e8ad4fe65753fc3b411d4 --- /dev/null +++ b/eval-queue/distilgpt2_eval_request_False_float16_Original.json @@ -0,0 +1,16 @@
+{
+ "model": "distilgpt2",
+ "base_model": "",
+ "revision": "main",
+ "private": false,
+ "precision": "float16",
+ "weight_type": "Original",
+ "status": "FINISHED",
+ "submitted_time": "2023-12-14T19:59:01Z",
+ "model_type": "\ud83d\udd36 : fine-tuned",
+ "likes": 287,
+ "params": 0.088,
+ "license": "apache-2.0",
+ "job_id": "924663",
+ "job_start_time": "2023-12-16T13:01:31.721815"
+}
\ No newline at end of file
diff --git a/eval-queue/dltjdgh0928/lsh_finetune_v0.11_eval_request_False_4bit_Original.json b/eval-queue/dltjdgh0928/lsh_finetune_v0.11_eval_request_False_4bit_Original.json new file mode 100644 index 0000000000000000000000000000000000000000..1b36b63dad2cbc7c29875a73feedd18dbd6cc111 --- /dev/null +++ b/eval-queue/dltjdgh0928/lsh_finetune_v0.11_eval_request_False_4bit_Original.json @@ -0,0 +1 @@ +{"model": "dltjdgh0928/lsh_finetune_v0.11", "base_model": "mistralai/Mistral-7B-Instruct-v0.1", "revision": "main", "private": false, "precision": "4bit", "weight_type": "Original", "status": "FAILED", "submitted_time": "2023-10-31T09:37:31Z", "model_type": "\u2b55 : instruction-tuned", "likes": 0, "params": 0, "license": "apache-2.0", "job_id": "650117"} \ No newline at end of file diff --git a/eval-queue/dotvignesh/perry-7b_eval_request_False_float16_Original.json b/eval-queue/dotvignesh/perry-7b_eval_request_False_float16_Original.json new file mode 100644 index 0000000000000000000000000000000000000000..213677e6f1f86b981618390539b8c9c55fab8165 --- /dev/null +++ b/eval-queue/dotvignesh/perry-7b_eval_request_False_float16_Original.json @@ -0,0 +1 @@ +{"model": "dotvignesh/perry-7b", "base_model": "", "revision": "main", "private": false, "precision": "float16", "weight_type": "Original", "status": "FINISHED", "submitted_time": "2023-10-16T12:46:18Z", "model_type": "\ud83d\udd36 : fine-tuned", "job_id": "514509", "license": "?", "likes": 0, "params": 6.607} \ No newline at end of file diff --git a/eval-queue/dsvv-cair/alpaca-cleaned-llama-30b-bf16_eval_request_False_False_False.json b/eval-queue/dsvv-cair/alpaca-cleaned-llama-30b-bf16_eval_request_False_False_False.json new file mode 100644 index 0000000000000000000000000000000000000000..58f27b763744c70296c05a5eb4ae56569bd85ad0 --- /dev/null +++ b/eval-queue/dsvv-cair/alpaca-cleaned-llama-30b-bf16_eval_request_False_False_False.json @@ -0,0 +1 @@ +{"model": "dsvv-cair/alpaca-cleaned-llama-30b-bf16", "base_model": "", "revision": "main", "private": false, "status": "FINISHED", "submitted_time": "2023-09-09T10:52:17Z", "job_id": "470790", "weight_type": "Original", "precision": "float16", "model_type": "fine-tuned", "license": "?", "likes": 3, "params": 32.316} \ No newline at end of file
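The queue mixes two serializations of the same record: older requests are stored as a single compact JSON line (hunks of the form @@ -0,0 +1 @@), while newer ones are pretty-printed over sixteen lines and add a job_start_time field (@@ -0,0 +1,16 @@); the two distilgpt2 files above show one of each, and some early compact records (bert-base-uncased, the first distilgpt2 entry) also omit model_type and submitted_time. A sketch that loads either form into one normalized record; the EvalRequest dataclass and its defaults are assumptions inferred from the entries here, not the leaderboard's own schema:

```python
import json
from dataclasses import dataclass, fields
from pathlib import Path
from typing import Optional

@dataclass
class EvalRequest:
    # Field names mirror the records in this diff; which fields are optional,
    # and their defaults, are assumptions based on the entries visible here.
    model: str
    base_model: Optional[str] = ""
    revision: str = "main"
    private: bool = False
    precision: str = ""
    weight_type: str = "Original"
    status: str = ""
    model_type: Optional[str] = None      # missing from some early records
    submitted_time: Optional[str] = None  # missing from some early records
    job_id: Optional[str] = None
    job_start_time: Optional[str] = None  # only in the newer 16-line records
    license: str = "?"
    likes: int = 0
    params: float = 0.0

def load_request(path: Path) -> EvalRequest:
    """Load either the compact or the pretty-printed serialization."""
    raw = json.loads(path.read_text())
    known = {f.name for f in fields(EvalRequest)}
    return EvalRequest(**{k: v for k, v in raw.items() if k in known})
```

Dropping unknown keys keeps the loader tolerant of fields that appear in only some generations of the queue format.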
"model_type": "fine-tuned", "license": "?", "likes": 3, "params": 32.316} \ No newline at end of file diff --git a/eval-queue/edor/Hermes-Platypus2-mini-7B_eval_request_False_float16_Original.json b/eval-queue/edor/Hermes-Platypus2-mini-7B_eval_request_False_float16_Original.json new file mode 100644 index 0000000000000000000000000000000000000000..5f6c757ff32dfc52788850a3177687be3e246f99 --- /dev/null +++ b/eval-queue/edor/Hermes-Platypus2-mini-7B_eval_request_False_float16_Original.json @@ -0,0 +1 @@ +{"model": "edor/Hermes-Platypus2-mini-7B", "base_model": "meta-llama/Llama-2-7b-hf", "revision": "main", "private": false, "precision": "float16", "weight_type": "Original", "status": "FINISHED", "submitted_time": "2023-10-16T12:48:18Z", "model_type": "\ud83d\udd36 : fine-tuned", "job_id": "517385", "params": 7.0, "license": "?", "likes": 0} \ No newline at end of file diff --git a/eval-queue/edor/Platypus2-mini-7B_eval_request_False_float16_Original.json b/eval-queue/edor/Platypus2-mini-7B_eval_request_False_float16_Original.json new file mode 100644 index 0000000000000000000000000000000000000000..1fe84f93485bb5d3e6913d216d0452d6cff9e18b --- /dev/null +++ b/eval-queue/edor/Platypus2-mini-7B_eval_request_False_float16_Original.json @@ -0,0 +1 @@ +{"model": "edor/Platypus2-mini-7B", "base_model": "meta-llama/Llama-2-7b-hf", "revision": "main", "private": false, "precision": "float16", "weight_type": "Original", "status": "FINISHED", "submitted_time": "2023-09-09T10:52:17Z", "model_type": "\ud83d\udd36 : fine-tuned", "job_id": "497691", "license": "other", "likes": 0, "params": 6.607} \ No newline at end of file diff --git a/eval-queue/edor/Stable-Platypus2-mini-7B_eval_request_False_float16_Original.json b/eval-queue/edor/Stable-Platypus2-mini-7B_eval_request_False_float16_Original.json new file mode 100644 index 0000000000000000000000000000000000000000..c3e018a26c7fcd078c41e8579928944c73d23ce1 --- /dev/null +++ b/eval-queue/edor/Stable-Platypus2-mini-7B_eval_request_False_float16_Original.json @@ -0,0 +1 @@ +{"model": "edor/Stable-Platypus2-mini-7B", "base_model": "meta-llama/Llama-2-7b-hf", "revision": "main", "private": false, "precision": "float16", "weight_type": "Original", "status": "FINISHED", "submitted_time": "2023-10-16T12:54:17Z", "model_type": "\ud83d\udd36 : fine-tuned", "job_id": "518152", "params": 7.0, "license": "?", "likes": 0} \ No newline at end of file diff --git a/eval-queue/grantprice/Cerebras-GPT-590M-finetuned-DND_eval_request_False_False_False.json b/eval-queue/grantprice/Cerebras-GPT-590M-finetuned-DND_eval_request_False_False_False.json new file mode 100644 index 0000000000000000000000000000000000000000..c95bae9cadd9a45131d831f0e8c89b0264ac79ec --- /dev/null +++ b/eval-queue/grantprice/Cerebras-GPT-590M-finetuned-DND_eval_request_False_False_False.json @@ -0,0 +1 @@ +{"model": "grantprice/Cerebras-GPT-590M-finetuned-DND", "base_model": "", "revision": "main", "private": false, "status": "FINISHED", "submitted_time": "2023-11-06T10:31:15Z", "job_id": "641797", "weight_type": "Original", "precision": "float16", "license": "apache-2.0", "likes": 0, "params": 0.59} \ No newline at end of file diff --git a/eval-queue/ibranze/araproje-llama2-7b-hf_eval_request_False_float16_Original.json b/eval-queue/ibranze/araproje-llama2-7b-hf_eval_request_False_float16_Original.json new file mode 100644 index 0000000000000000000000000000000000000000..af775e21c8a70fc7fcea7d6c0d59fdf4aed7da24 --- /dev/null +++ 
b/eval-queue/ibranze/araproje-llama2-7b-hf_eval_request_False_float16_Original.json @@ -0,0 +1 @@ +{"model": "ibranze/araproje-llama2-7b-hf", "base_model": "", "revision": "main", "private": false, "precision": "float16", "weight_type": "Original", "status": "FINISHED", "submitted_time": "2023-10-16T12:54:17Z", "model_type": "fine-tuned", "job_id": "518449", "license": "?", "likes": 0, "params": 6.607} \ No newline at end of file diff --git a/eval-queue/internlm/internlm-20b-chat_eval_request_False_False_False.json b/eval-queue/internlm/internlm-20b-chat_eval_request_False_False_False.json new file mode 100644 index 0000000000000000000000000000000000000000..965a47232c2be70a90a82a48dd652e512450296b --- /dev/null +++ b/eval-queue/internlm/internlm-20b-chat_eval_request_False_False_False.json @@ -0,0 +1 @@ +{"model": "internlm/internlm-20b-chat", "base_model": "", "revision": "main", "private": false, "status": "FINISHED", "submitted_time": "2023-11-08T10:34:08Z", "job_id": "642702", "weight_type": "Original", "precision": "float16", "model_type": "fine-tuned", "license": "apache-2.0", "likes": 114, "params": 20.0} \ No newline at end of file diff --git a/eval-queue/internlm/internlm-20b_eval_request_False_False_False.json b/eval-queue/internlm/internlm-20b_eval_request_False_False_False.json new file mode 100644 index 0000000000000000000000000000000000000000..6fbf80e7840a3903da561449596b1e595c2f88a4 --- /dev/null +++ b/eval-queue/internlm/internlm-20b_eval_request_False_False_False.json @@ -0,0 +1 @@ +{"model": "internlm/internlm-20b", "base_model": "", "revision": "main", "private": false, "status": "FINISHED", "submitted_time": "2023-11-08T10:34:08Z", "job_id": "642705", "weight_type": "Original", "precision": "float16", "model_type": "pretrained", "license": "apache-2.0", "likes": 47, "params": 20.0} \ No newline at end of file diff --git a/eval-queue/jarradh/llama2_70b_chat_uncensored_eval_request_False_float16_Original.json b/eval-queue/jarradh/llama2_70b_chat_uncensored_eval_request_False_float16_Original.json new file mode 100644 index 0000000000000000000000000000000000000000..3c4bf1c698b828e97fea62f862e197ce4b8a776f --- /dev/null +++ b/eval-queue/jarradh/llama2_70b_chat_uncensored_eval_request_False_float16_Original.json @@ -0,0 +1 @@ +{"model": "jarradh/llama2_70b_chat_uncensored", "base_model": "", "revision": "main", "private": false, "precision": "float16", "weight_type": "Original", "status": "FINISHED", "submitted_time": "2023-10-16T12:46:18Z", "model_type": "\ud83d\udd36 : fine-tuned", "job_id": "494885", "license": "llama2", "likes": 37, "params": 68.715} \ No newline at end of file diff --git a/eval-queue/jphme/Llama-2-13b-chat-german_eval_request_False_float16_Original.json b/eval-queue/jphme/Llama-2-13b-chat-german_eval_request_False_float16_Original.json new file mode 100644 index 0000000000000000000000000000000000000000..b6af076b43a9fac8dedcbc970cceadcc88a421b7 --- /dev/null +++ b/eval-queue/jphme/Llama-2-13b-chat-german_eval_request_False_float16_Original.json @@ -0,0 +1 @@ +{"model": "jphme/Llama-2-13b-chat-german", "base_model": "", "revision": "main", "private": false, "precision": "float16", "weight_type": "Original", "status": "FINISHED", "submitted_time": "2023-09-09T10:52:17Z", "model_type": "fine-tuned", "job_id": "461551", "license": "?", "likes": 46, "params": 12.852} \ No newline at end of file diff --git a/eval-queue/jphme/em_german_leo_mistral_eval_request_False_float16_Original.json 
b/eval-queue/jphme/em_german_leo_mistral_eval_request_False_float16_Original.json new file mode 100644 index 0000000000000000000000000000000000000000..edb3b1ed2d9cc2de7a3c1defde0f23f18b18682c --- /dev/null +++ b/eval-queue/jphme/em_german_leo_mistral_eval_request_False_float16_Original.json @@ -0,0 +1 @@ +{"model": "jphme/em_german_leo_mistral", "base_model": "", "revision": "main", "private": false, "precision": "float16", "weight_type": "Original", "status": "FINISHED", "submitted_time": "2023-10-16T12:54:17Z", "model_type": "\ud83d\udd36 : fine-tuned", "job_id": "518457", "license": "apache-2.0", "likes": 4, "params": 7.242} \ No newline at end of file diff --git a/eval-queue/jphme/orca_mini_v2_ger_7b_eval_request_False_False_False.json b/eval-queue/jphme/orca_mini_v2_ger_7b_eval_request_False_False_False.json new file mode 100644 index 0000000000000000000000000000000000000000..f40ef6f9f28505b690ded7c6be90dad19f5d751b --- /dev/null +++ b/eval-queue/jphme/orca_mini_v2_ger_7b_eval_request_False_False_False.json @@ -0,0 +1 @@ +{"model": "jphme/orca_mini_v2_ger_7b", "base_model": "", "revision": "main", "private": false, "status": "FINISHED", "submitted_time": "2023-09-09T10:52:17Z", "job_id": "461602", "weight_type": "Original", "precision": "float16", "model_type": "fine-tuned", "license": "cc-by-nc-sa-4.0", "likes": 8, "params": 6.607} \ No newline at end of file diff --git a/eval-queue/jslin09/bloom-560m-finetuned-fraud_eval_request_False_False_False.json b/eval-queue/jslin09/bloom-560m-finetuned-fraud_eval_request_False_False_False.json new file mode 100644 index 0000000000000000000000000000000000000000..ce8743fc0c107a670fb803c1ed8e16467187bee0 --- /dev/null +++ b/eval-queue/jslin09/bloom-560m-finetuned-fraud_eval_request_False_False_False.json @@ -0,0 +1 @@ +{"model": "jslin09/bloom-560m-finetuned-fraud", "base_model": "", "revision": "main", "private": false, "status": "FINISHED", "job_id": "461361", "weight_type": "Original", "precision": "float16", "submitted_time": "2023-09-09T10:52:17Z", "license": "bigscience-bloom-rail-1.0", "likes": 1, "params": 0.559} \ No newline at end of file diff --git a/eval-queue/kevinpro/Vicuna-13B-CoT_eval_request_False_False_False.json b/eval-queue/kevinpro/Vicuna-13B-CoT_eval_request_False_False_False.json new file mode 100644 index 0000000000000000000000000000000000000000..18add1b93610db1b1d26d4e587cbdbbb200c0a62 --- /dev/null +++ b/eval-queue/kevinpro/Vicuna-13B-CoT_eval_request_False_False_False.json @@ -0,0 +1 @@ +{"model": "kevinpro/Vicuna-13B-CoT", "base_model": "", "revision": "main", "private": false, "status": "FINISHED", "submitted_time": "2023-10-16T12:48:18Z", "job_id": "461503", "weight_type": "Original", "precision": "float16", "model_type": "fine-tuned", "license": "?", "likes": 4, "params": 12.852} \ No newline at end of file diff --git a/eval-queue/kfkas/Llama-2-ko-7b-Chat_eval_request_False_bfloat16_Original.json b/eval-queue/kfkas/Llama-2-ko-7b-Chat_eval_request_False_bfloat16_Original.json new file mode 100644 index 0000000000000000000000000000000000000000..10903387c43307c871d09c96dee2ba640b9b2261 --- /dev/null +++ b/eval-queue/kfkas/Llama-2-ko-7b-Chat_eval_request_False_bfloat16_Original.json @@ -0,0 +1 @@ +{"model": "kfkas/Llama-2-ko-7b-Chat", "base_model": "", "revision": "main", "private": false, "precision": "bfloat16", "weight_type": "Original", "status": "FINISHED", "submitted_time": "2023-09-09T10:52:17Z", "model_type": "fine-tuned", "job_id": "461138", "license": "?", "likes": 51, "params": 6.666} \ No newline at end of 
file diff --git a/eval-queue/kfkas/Llama-2-ko-7b-Chat_eval_request_False_float16_Original.json b/eval-queue/kfkas/Llama-2-ko-7b-Chat_eval_request_False_float16_Original.json new file mode 100644 index 0000000000000000000000000000000000000000..bb4d119ba2d48138dd71aea03fc4a8ab177f02a3 --- /dev/null +++ b/eval-queue/kfkas/Llama-2-ko-7b-Chat_eval_request_False_float16_Original.json @@ -0,0 +1 @@ +{"model": "kfkas/Llama-2-ko-7b-Chat", "base_model": "", "revision": "main", "private": false, "precision": "float16", "weight_type": "Original", "status": "FINISHED", "submitted_time": "2023-10-16T12:48:18Z", "model_type": "fine-tuned", "job_id": "462041", "license": "?", "likes": 51, "params": 6.666} \ No newline at end of file diff --git a/eval-queue/lgaalves/gpt-2-xl_camel-ai-physics_eval_request_False_float16_Original.json b/eval-queue/lgaalves/gpt-2-xl_camel-ai-physics_eval_request_False_float16_Original.json new file mode 100644 index 0000000000000000000000000000000000000000..46e1f3bd02a957abecac3bf6a0b54c921b6ad371 --- /dev/null +++ b/eval-queue/lgaalves/gpt-2-xl_camel-ai-physics_eval_request_False_float16_Original.json @@ -0,0 +1 @@ +{"model": "lgaalves/gpt-2-xl_camel-ai-physics", "base_model": "", "revision": "main", "private": false, "precision": "float16", "weight_type": "Original", "status": "FINISHED", "submitted_time": "2023-10-16T12:48:18Z", "model_type": "\u2b55 : instruction-tuned", "job_id": "517927", "license": "mit", "likes": 0, "params": 1.558} \ No newline at end of file diff --git a/eval-queue/lgaalves/gpt1_eval_request_False_float16_Original.json b/eval-queue/lgaalves/gpt1_eval_request_False_float16_Original.json new file mode 100644 index 0000000000000000000000000000000000000000..5700da055e168e8c7129eed79f591152acb193e4 --- /dev/null +++ b/eval-queue/lgaalves/gpt1_eval_request_False_float16_Original.json @@ -0,0 +1 @@ +{"model": "lgaalves/gpt1", "base_model": "", "revision": "main", "private": false, "precision": "float16", "weight_type": "Original", "status": "FAILED", "submitted_time": "2023-09-25T14:54:09Z", "model_type": "\ud83d\udfe2 : pretrained", "job_id": "479511", "license": "mit", "likes": 0, "params": 0.117} \ No newline at end of file diff --git a/eval-queue/lgaalves/gpt2-dolly_eval_request_False_4bit_Original.json b/eval-queue/lgaalves/gpt2-dolly_eval_request_False_4bit_Original.json new file mode 100644 index 0000000000000000000000000000000000000000..3f175a7f210b98011bedd913d05936b6f3f94674 --- /dev/null +++ b/eval-queue/lgaalves/gpt2-dolly_eval_request_False_4bit_Original.json @@ -0,0 +1 @@ +{"model": "lgaalves/gpt2-dolly", "base_model": "", "revision": "main", "private": false, "precision": "4bit", "weight_type": "Original", "status": "FINISHED", "submitted_time": "2023-10-16T12:54:17Z", "model_type": "\u2b55 : instruction-tuned", "job_id": "519038", "license": "mit", "likes": 1, "params": 0.124} \ No newline at end of file diff --git a/eval-queue/lgaalves/gpt2-dolly_eval_request_False_float16_Adapter.json b/eval-queue/lgaalves/gpt2-dolly_eval_request_False_float16_Adapter.json new file mode 100644 index 0000000000000000000000000000000000000000..d70f31a88f6e7e90633270f2557e0d678b1b84fd --- /dev/null +++ b/eval-queue/lgaalves/gpt2-dolly_eval_request_False_float16_Adapter.json @@ -0,0 +1 @@ +{"model": "lgaalves/gpt2-dolly", "base_model": "gpt2", "revision": "main", "private": false, "precision": "float16", "weight_type": "Adapter", "status": "FINISHED", "submitted_time": "2023-10-16T12:54:17Z", "model_type": "\u2b55 : instruction-tuned", "job_id": "520701", 
"license": "mit", "likes": 1, "params": 0.124} \ No newline at end of file diff --git a/eval-queue/lgaalves/gpt2-dolly_eval_request_False_float16_Original.json b/eval-queue/lgaalves/gpt2-dolly_eval_request_False_float16_Original.json new file mode 100644 index 0000000000000000000000000000000000000000..b20d603cb9afab89f762a34e50436728dfa3c98a --- /dev/null +++ b/eval-queue/lgaalves/gpt2-dolly_eval_request_False_float16_Original.json @@ -0,0 +1 @@ +{"model": "lgaalves/gpt2-dolly", "base_model": "gpt2", "revision": "main", "private": false, "precision": "float16", "weight_type": "Original", "status": "FINISHED", "submitted_time": "2023-09-09T10:52:17Z", "model_type": "\u2b55 : instruction-tuned", "job_id": "499581", "license": "mit", "likes": 1, "params": 0.124} \ No newline at end of file diff --git a/eval-queue/lgaalves/gpt2-xl-camel-ai-physics_eval_request_False_float16_Original.json b/eval-queue/lgaalves/gpt2-xl-camel-ai-physics_eval_request_False_float16_Original.json new file mode 100644 index 0000000000000000000000000000000000000000..e0d5680d5a9f1df9ff4b982a7f39dd851f57c19a --- /dev/null +++ b/eval-queue/lgaalves/gpt2-xl-camel-ai-physics_eval_request_False_float16_Original.json @@ -0,0 +1 @@ +{"model": "lgaalves/gpt2-xl-camel-ai-physics", "base_model": "", "revision": "main", "private": false, "precision": "float16", "weight_type": "Original", "status": "FAILED", "submitted_time": "2023-09-05T21:09:27Z", "model_type": "\u2b55 : instruction-tuned", "job_id": "437244", "license": "mit", "likes": 0, "params": 1.558} \ No newline at end of file diff --git a/eval-queue/lgaalves/gpt2-xl_camel-ai-physics_eval_request_False_float16_Original.json b/eval-queue/lgaalves/gpt2-xl_camel-ai-physics_eval_request_False_float16_Original.json new file mode 100644 index 0000000000000000000000000000000000000000..7ed3322fcdd5dcc604040e3feec8dfce5da7f491 --- /dev/null +++ b/eval-queue/lgaalves/gpt2-xl_camel-ai-physics_eval_request_False_float16_Original.json @@ -0,0 +1 @@ +{"model": "lgaalves/gpt2-xl_camel-ai-physics", "base_model": "", "revision": "main", "private": false, "precision": "float16", "weight_type": "Original", "status": "FAILED", "submitted_time": "2023-09-16T19:46:32Z", "model_type": "\u2b55 : instruction-tuned", "job_id": "462509", "params": 0, "license": "?", "likes": 0} \ No newline at end of file diff --git a/eval-queue/lgaalves/gpt2-xl_lima_eval_request_False_float16_Original.json b/eval-queue/lgaalves/gpt2-xl_lima_eval_request_False_float16_Original.json new file mode 100644 index 0000000000000000000000000000000000000000..942900d34efb84c50e6a940458ba0b12a3cfa73f --- /dev/null +++ b/eval-queue/lgaalves/gpt2-xl_lima_eval_request_False_float16_Original.json @@ -0,0 +1 @@ +{"model": "lgaalves/gpt2-xl_lima", "base_model": "", "revision": "main", "private": false, "precision": "float16", "weight_type": "Original", "status": "FINISHED", "submitted_time": "2023-11-14T21:22:23Z", "model_type": "\u2b55 : instruction-tuned", "likes": 0, "params": 0, "license": "mit", "job_id": "651740"} \ No newline at end of file diff --git a/eval-queue/lgaalves/gpt2_camel_physics-platypus_eval_request_False_float16_Original.json b/eval-queue/lgaalves/gpt2_camel_physics-platypus_eval_request_False_float16_Original.json new file mode 100644 index 0000000000000000000000000000000000000000..df570998018121456487e6e639eb652bcb13eb66 --- /dev/null +++ b/eval-queue/lgaalves/gpt2_camel_physics-platypus_eval_request_False_float16_Original.json @@ -0,0 +1 @@ +{"model": "lgaalves/gpt2_camel_physics-platypus", "base_model": 
"", "revision": "main", "private": false, "precision": "float16", "weight_type": "Original", "status": "FINISHED", "submitted_time": "2023-10-16T12:48:18Z", "model_type": "\u2b55 : instruction-tuned", "job_id": "517942", "license": "mit", "likes": 0, "params": 0.124} \ No newline at end of file diff --git a/eval-queue/lgaalves/gpt2_guanaco-dolly-platypus_eval_request_False_float16_Original.json b/eval-queue/lgaalves/gpt2_guanaco-dolly-platypus_eval_request_False_float16_Original.json new file mode 100644 index 0000000000000000000000000000000000000000..0c022eb2d805ea31c59dd0e0e92c6944a8c49ccb --- /dev/null +++ b/eval-queue/lgaalves/gpt2_guanaco-dolly-platypus_eval_request_False_float16_Original.json @@ -0,0 +1 @@ +{"model": "lgaalves/gpt2_guanaco-dolly-platypus", "base_model": "", "revision": "main", "private": false, "precision": "float16", "weight_type": "Original", "status": "FINISHED", "submitted_time": "2023-09-09T10:52:17Z", "model_type": "\u2b55 : instruction-tuned", "job_id": "498360", "license": "mit", "likes": 1, "params": 0.124} \ No newline at end of file diff --git a/eval-queue/lgaalves/gpt2_open-platypus_eval_request_False_float16_Original.json b/eval-queue/lgaalves/gpt2_open-platypus_eval_request_False_float16_Original.json new file mode 100644 index 0000000000000000000000000000000000000000..2f0b77139a06fe64d43ae36ad94457f6ef33cfec --- /dev/null +++ b/eval-queue/lgaalves/gpt2_open-platypus_eval_request_False_float16_Original.json @@ -0,0 +1 @@ +{"model": "lgaalves/gpt2_open-platypus", "base_model": "gpt2", "revision": "main", "private": false, "precision": "float16", "weight_type": "Original", "status": "FINISHED", "submitted_time": "2023-10-16T12:48:18Z", "model_type": "\u2b55 : instruction-tuned", "job_id": "498181", "license": "mit", "likes": 0, "params": 0.124} \ No newline at end of file diff --git a/eval-queue/lgaalves/gpt2_platypus-camel_physics_eval_request_False_float16_Original.json b/eval-queue/lgaalves/gpt2_platypus-camel_physics_eval_request_False_float16_Original.json new file mode 100644 index 0000000000000000000000000000000000000000..5ff6c8d617765cc0c36d809d470e59978c8c7a8c --- /dev/null +++ b/eval-queue/lgaalves/gpt2_platypus-camel_physics_eval_request_False_float16_Original.json @@ -0,0 +1 @@ +{"model": "lgaalves/gpt2_platypus-camel_physics", "base_model": "", "revision": "main", "private": false, "precision": "float16", "weight_type": "Original", "status": "FINISHED", "submitted_time": "2023-10-16T12:48:18Z", "model_type": "\u2b55 : instruction-tuned", "job_id": "517779", "license": "mit", "likes": 0, "params": 0.124} \ No newline at end of file diff --git a/eval-queue/lgaalves/gpt2_platypus-dolly-guanaco_eval_request_False_float16_Original.json b/eval-queue/lgaalves/gpt2_platypus-dolly-guanaco_eval_request_False_float16_Original.json new file mode 100644 index 0000000000000000000000000000000000000000..2abf988f1cc7ef47f5d008d365e37f8dfb981bd9 --- /dev/null +++ b/eval-queue/lgaalves/gpt2_platypus-dolly-guanaco_eval_request_False_float16_Original.json @@ -0,0 +1 @@ +{"model": "lgaalves/gpt2_platypus-dolly-guanaco", "base_model": "", "revision": "main", "private": false, "precision": "float16", "weight_type": "Original", "status": "FINISHED", "submitted_time": "2023-10-16T12:48:18Z", "model_type": "\u2b55 : instruction-tuned", "job_id": "476440", "license": "mit", "likes": 0, "params": 0.124} \ No newline at end of file diff --git a/eval-queue/lgaalves/llama-2-13b-chat-platypus_eval_request_False_float16_Original.json 
b/eval-queue/lgaalves/llama-2-13b-chat-platypus_eval_request_False_float16_Original.json new file mode 100644 index 0000000000000000000000000000000000000000..31bf0f19dc54ca764b6475fedbe903056b785a09 --- /dev/null +++ b/eval-queue/lgaalves/llama-2-13b-chat-platypus_eval_request_False_float16_Original.json @@ -0,0 +1 @@ +{"model": "lgaalves/llama-2-13b-chat-platypus", "base_model": "", "revision": "main", "private": false, "precision": "float16", "weight_type": "Original", "status": "FINISHED", "submitted_time": "2023-10-16T12:54:17Z", "model_type": "\u2b55 : instruction-tuned", "job_id": "520950", "license": "llama2", "likes": 0, "params": 13.016} \ No newline at end of file diff --git a/eval-queue/lgaalves/llama-2-13b-hf-platypus_eval_request_False_float16_Original.json b/eval-queue/lgaalves/llama-2-13b-hf-platypus_eval_request_False_float16_Original.json new file mode 100644 index 0000000000000000000000000000000000000000..c09d5a8f9853815568364bda5be2ea70b580aa4c --- /dev/null +++ b/eval-queue/lgaalves/llama-2-13b-hf-platypus_eval_request_False_float16_Original.json @@ -0,0 +1 @@ +{"model": "lgaalves/llama-2-13b-hf-platypus", "base_model": "", "revision": "main", "private": false, "precision": "float16", "weight_type": "Original", "status": "FINISHED", "submitted_time": "2023-10-16T12:54:17Z", "model_type": "\u2b55 : instruction-tuned", "job_id": "522009", "license": "llama2", "likes": 0, "params": 13.016} \ No newline at end of file diff --git a/eval-queue/lgaalves/llama-2-7b-hf_open-platypus_eval_request_False_float16_Original.json b/eval-queue/lgaalves/llama-2-7b-hf_open-platypus_eval_request_False_float16_Original.json new file mode 100644 index 0000000000000000000000000000000000000000..87114cf745915ff9ad4ae365ac669b74457c29e0 --- /dev/null +++ b/eval-queue/lgaalves/llama-2-7b-hf_open-platypus_eval_request_False_float16_Original.json @@ -0,0 +1 @@ +{"model": "lgaalves/llama-2-7b-hf_open-platypus", "base_model": "", "revision": "main", "private": false, "precision": "float16", "weight_type": "Original", "status": "FINISHED", "submitted_time": "2023-09-09T10:52:17Z", "model_type": "\u2b55 : instruction-tuned", "job_id": "499911", "license": "llama2", "likes": 0, "params": 6.738} \ No newline at end of file diff --git a/eval-queue/lgaalves/mistral-7b-platypus1k_eval_request_False_float16_Original.json b/eval-queue/lgaalves/mistral-7b-platypus1k_eval_request_False_float16_Original.json new file mode 100644 index 0000000000000000000000000000000000000000..58e59bb1eec261779819528237e9acd09f97dfc7 --- /dev/null +++ b/eval-queue/lgaalves/mistral-7b-platypus1k_eval_request_False_float16_Original.json @@ -0,0 +1 @@ +{"model": "lgaalves/mistral-7b-platypus1k", "base_model": "", "revision": "main", "private": false, "precision": "float16", "weight_type": "Original", "status": "FINISHED", "submitted_time": "2023-11-06T10:31:15Z", "model_type": "\u2b55 : instruction-tuned", "job_id": "632928", "license": "apache-2.0", "likes": 0, "params": 7.242} \ No newline at end of file diff --git a/eval-queue/lgaalves/mistral-7b-platypus_1k_eval_request_False_float16_Original.json b/eval-queue/lgaalves/mistral-7b-platypus_1k_eval_request_False_float16_Original.json new file mode 100644 index 0000000000000000000000000000000000000000..d6e4de5588a8c2960810b1457df56b9bcd81e609 --- /dev/null +++ b/eval-queue/lgaalves/mistral-7b-platypus_1k_eval_request_False_float16_Original.json @@ -0,0 +1 @@ +{"model": "lgaalves/mistral-7b-platypus_1k", "base_model": "", "revision": "main", "private": false, "precision": "float16", 
"weight_type": "Original", "status": "FAILED", "submitted_time": "2023-10-10T22:09:14Z", "model_type": "\u2b55 : instruction-tuned", "job_id": "491018", "license": "apache-2.0", "likes": 0, "params": 7.242} \ No newline at end of file diff --git a/eval-queue/lgaalves/mistral-7b-platypus_eval_request_False_float16_Original.json b/eval-queue/lgaalves/mistral-7b-platypus_eval_request_False_float16_Original.json new file mode 100644 index 0000000000000000000000000000000000000000..bd357e546a9b540f9fb569c82beef899356e7326 --- /dev/null +++ b/eval-queue/lgaalves/mistral-7b-platypus_eval_request_False_float16_Original.json @@ -0,0 +1 @@ +{"model": "lgaalves/mistral-7b-platypus", "base_model": "", "revision": "main", "private": false, "precision": "float16", "weight_type": "Original", "status": "FAILED", "submitted_time": "2023-10-11T13:50:20Z", "model_type": "\u2b55 : instruction-tuned", "job_id": "492399", "license": "apache-2.0", "likes": 0, "params": 7.111} \ No newline at end of file diff --git a/eval-queue/lgaalves/mistral-7b-v0.1-platypus_eval_request_False_float16_Original.json b/eval-queue/lgaalves/mistral-7b-v0.1-platypus_eval_request_False_float16_Original.json new file mode 100644 index 0000000000000000000000000000000000000000..8a4efc4c8a503ed7a7124f226fed19f37b9edc6c --- /dev/null +++ b/eval-queue/lgaalves/mistral-7b-v0.1-platypus_eval_request_False_float16_Original.json @@ -0,0 +1 @@ +{"model": "lgaalves/mistral-7b-v0.1-platypus", "base_model": "", "revision": "main", "private": false, "precision": "float16", "weight_type": "Original", "status": "FAILED", "submitted_time": "2023-10-11T11:38:50Z", "model_type": "\u2b55 : instruction-tuned", "job_id": "492076", "params": 7.0, "license": "?", "likes": 0} \ No newline at end of file diff --git a/eval-queue/lgaalves/mistral-7b_open_platypus_eval_request_False_float16_Original.json b/eval-queue/lgaalves/mistral-7b_open_platypus_eval_request_False_float16_Original.json new file mode 100644 index 0000000000000000000000000000000000000000..e5b30ef07999427a2378d49ad8c7183bba9c57a8 --- /dev/null +++ b/eval-queue/lgaalves/mistral-7b_open_platypus_eval_request_False_float16_Original.json @@ -0,0 +1 @@ +{"model": "lgaalves/mistral-7b_open_platypus", "base_model": "", "revision": "main", "private": false, "precision": "float16", "weight_type": "Original", "status": "FINISHED", "submitted_time": "2023-11-15T22:06:54Z", "model_type": "\u2b55 : instruction-tuned", "likes": 0, "params": 7.0, "license": "apache-2.0", "job_id": "697968"} \ No newline at end of file diff --git a/eval-queue/lgaalves/tinyllama-1.1b-chat-v0.3-platypus_eval_request_False_float16_Original.json b/eval-queue/lgaalves/tinyllama-1.1b-chat-v0.3-platypus_eval_request_False_float16_Original.json new file mode 100644 index 0000000000000000000000000000000000000000..03a54aa55fd893dc94a621d53b10a69fba44bfa4 --- /dev/null +++ b/eval-queue/lgaalves/tinyllama-1.1b-chat-v0.3-platypus_eval_request_False_float16_Original.json @@ -0,0 +1 @@ +{"model": "lgaalves/tinyllama-1.1b-chat-v0.3-platypus", "base_model": "", "revision": "main", "private": false, "precision": "float16", "weight_type": "Original", "status": "FAILED", "submitted_time": "2023-10-09T22:07:44Z", "model_type": "\u2b55 : instruction-tuned", "job_id": "489620", "license": "mit", "likes": 3, "params": 1.035} \ No newline at end of file diff --git a/eval-queue/lgaalves/tinyllama-1.1b-chat-v0.3_platypus_eval_request_False_float16_Original.json 
b/eval-queue/lgaalves/tinyllama-1.1b-chat-v0.3_platypus_eval_request_False_float16_Original.json new file mode 100644 index 0000000000000000000000000000000000000000..0fb65d23418f7db16fbfb2c5c36c650cefa5f112 --- /dev/null +++ b/eval-queue/lgaalves/tinyllama-1.1b-chat-v0.3_platypus_eval_request_False_float16_Original.json @@ -0,0 +1 @@ +{"model": "lgaalves/tinyllama-1.1b-chat-v0.3_platypus", "base_model": "", "revision": "main", "private": false, "precision": "float16", "weight_type": "Original", "status": "FINISHED", "submitted_time": "2023-10-16T12:46:18Z", "model_type": "\u2b55 : instruction-tuned", "job_id": "515594", "license": "mit", "likes": 3, "params": 1.035} \ No newline at end of file diff --git a/eval-queue/lgaalves/xgen-7b-8k-dolly_eval_request_False_float16_Original.json b/eval-queue/lgaalves/xgen-7b-8k-dolly_eval_request_False_float16_Original.json new file mode 100644 index 0000000000000000000000000000000000000000..180ae9d7dd21a12c53689092cf9cfbb6ebf19e85 --- /dev/null +++ b/eval-queue/lgaalves/xgen-7b-8k-dolly_eval_request_False_float16_Original.json @@ -0,0 +1 @@ +{"model": "lgaalves/xgen-7b-8k-dolly", "base_model": "", "revision": "main", "private": false, "precision": "float16", "weight_type": "Original", "status": "FAILED", "submitted_time": "2023-09-16T19:47:22Z", "model_type": "\u2b55 : instruction-tuned", "job_id": "462525", "license": "mit", "likes": 0, "params": 6.896} \ No newline at end of file diff --git a/eval-queue/lgaalves/xgen-7b-8k_dolly_eval_request_False_float16_Original.json b/eval-queue/lgaalves/xgen-7b-8k_dolly_eval_request_False_float16_Original.json new file mode 100644 index 0000000000000000000000000000000000000000..753e98754cad83747e6f9330676ae748ee61ea6c --- /dev/null +++ b/eval-queue/lgaalves/xgen-7b-8k_dolly_eval_request_False_float16_Original.json @@ -0,0 +1 @@ +{"model": "lgaalves/xgen-7b-8k_dolly", "base_model": "", "revision": "main", "private": false, "precision": "float16", "weight_type": "Original", "status": "FAILED", "submitted_time": "2023-09-22T15:05:37Z", "model_type": "\u2b55 : instruction-tuned", "job_id": "477426", "license": "mit", "likes": 0, "params": 6.896} \ No newline at end of file diff --git a/eval-queue/lilloukas/GPlatty-30B_eval_request_False_False_False.json b/eval-queue/lilloukas/GPlatty-30B_eval_request_False_False_False.json new file mode 100644 index 0000000000000000000000000000000000000000..0e4c80a4f7bb273a72bc70c6672dbb7828aace18 --- /dev/null +++ b/eval-queue/lilloukas/GPlatty-30B_eval_request_False_False_False.json @@ -0,0 +1 @@ +{"model": "lilloukas/GPlatty-30B", "base_model": "", "revision": "main", "private": false, "status": "FINISHED", "submitted_time": "2023-09-09T10:52:17Z", "job_id": "472302", "weight_type": "Original", "precision": "float16", "model_type": "fine-tuned", "license": "other", "likes": 18, "params": 32.316} \ No newline at end of file diff --git a/eval-queue/lilloukas/Platypus-30B_eval_request_False_False_False.json b/eval-queue/lilloukas/Platypus-30B_eval_request_False_False_False.json new file mode 100644 index 0000000000000000000000000000000000000000..bfc446efe6ddf6c123742cac2c1a5326cb353fd3 --- /dev/null +++ b/eval-queue/lilloukas/Platypus-30B_eval_request_False_False_False.json @@ -0,0 +1 @@ +{"model": "lilloukas/Platypus-30B", "base_model": "", "revision": "main", "private": false, "status": "FINISHED", "submitted_time": "2023-09-09T10:52:17Z", "job_id": "461219", "weight_type": "Original", "precision": "float16", "model_type": "fine-tuned", "license": "other", "likes": 16, "params": 
32.316} \ No newline at end of file diff --git a/eval-queue/llm-agents/tora-13b-v1.0_eval_request_False_float16_Original.json b/eval-queue/llm-agents/tora-13b-v1.0_eval_request_False_float16_Original.json new file mode 100644 index 0000000000000000000000000000000000000000..92e5b5bb4143dcf1bad15f8da49d69e3494e07ee --- /dev/null +++ b/eval-queue/llm-agents/tora-13b-v1.0_eval_request_False_float16_Original.json @@ -0,0 +1 @@ +{"model": "llm-agents/tora-13b-v1.0", "base_model": "", "revision": "main", "private": false, "precision": "float16", "weight_type": "Original", "status": "FINISHED", "submitted_time": "2023-10-16T13:19:55Z", "model_type": "instruction-tuned", "job_id": "522938", "license": "llama2", "likes": 2, "params": 12.852} \ No newline at end of file diff --git a/eval-queue/llm-agents/tora-70b-v1.0_eval_request_False_float16_Original.json b/eval-queue/llm-agents/tora-70b-v1.0_eval_request_False_float16_Original.json new file mode 100644 index 0000000000000000000000000000000000000000..5afafdc45d198967b57520f9197ebe5269fbd1dd --- /dev/null +++ b/eval-queue/llm-agents/tora-70b-v1.0_eval_request_False_float16_Original.json @@ -0,0 +1 @@ +{"model": "llm-agents/tora-70b-v1.0", "base_model": "", "revision": "main", "private": false, "precision": "float16", "weight_type": "Original", "status": "FINISHED", "submitted_time": "2023-10-16T13:00:29Z", "model_type": "fine-tuned", "job_id": "522190", "license": "llama2", "likes": 8, "params": 68.715} \ No newline at end of file diff --git a/eval-queue/llm-agents/tora-7b-v1.0_eval_request_False_float16_Original.json b/eval-queue/llm-agents/tora-7b-v1.0_eval_request_False_float16_Original.json new file mode 100644 index 0000000000000000000000000000000000000000..e4af88f30b1fd903c7c25d1506f7d8ffb2bcbf5c --- /dev/null +++ b/eval-queue/llm-agents/tora-7b-v1.0_eval_request_False_float16_Original.json @@ -0,0 +1 @@ +{"model": "llm-agents/tora-7b-v1.0", "base_model": "", "revision": "main", "private": false, "precision": "float16", "weight_type": "Original", "status": "FINISHED", "submitted_time": "2023-10-16T12:54:17Z", "model_type": "instruction-tuned", "job_id": "520480", "license": "llama2", "likes": 2, "params": 6.607} \ No newline at end of file diff --git a/eval-queue/llm-agents/tora-code-13b-v1.0_eval_request_False_float16_Original.json b/eval-queue/llm-agents/tora-code-13b-v1.0_eval_request_False_float16_Original.json new file mode 100644 index 0000000000000000000000000000000000000000..d51e1d2804724204fe00e8c34a8cdd27aa165f52 --- /dev/null +++ b/eval-queue/llm-agents/tora-code-13b-v1.0_eval_request_False_float16_Original.json @@ -0,0 +1 @@ +{"model": "llm-agents/tora-code-13b-v1.0", "base_model": "", "revision": "main", "private": false, "precision": "float16", "weight_type": "Original", "status": "FINISHED", "submitted_time": "2023-10-16T12:46:18Z", "model_type": "instruction-tuned", "job_id": "514652", "license": "llama2", "likes": 1, "params": 12.852} \ No newline at end of file diff --git a/eval-queue/llm-agents/tora-code-34b-v1.0_eval_request_False_float16_Original.json b/eval-queue/llm-agents/tora-code-34b-v1.0_eval_request_False_float16_Original.json new file mode 100644 index 0000000000000000000000000000000000000000..de1845203a301ba891892e762caeee18c86dbee2 --- /dev/null +++ b/eval-queue/llm-agents/tora-code-34b-v1.0_eval_request_False_float16_Original.json @@ -0,0 +1 @@ +{"model": "llm-agents/tora-code-34b-v1.0", "base_model": "", "revision": "main", "private": false, "precision": "float16", "weight_type": "Original", "status": 
"FINISHED", "submitted_time": "2023-10-16T13:27:38Z", "model_type": "instruction-tuned", "job_id": "523047", "license": "llama2", "likes": 4, "params": 33.482} \ No newline at end of file diff --git a/eval-queue/llm-agents/tora-code-7b-v1.0_eval_request_False_float16_Original.json b/eval-queue/llm-agents/tora-code-7b-v1.0_eval_request_False_float16_Original.json new file mode 100644 index 0000000000000000000000000000000000000000..fd0697b33e46faed87be616f5c524811556d8d05 --- /dev/null +++ b/eval-queue/llm-agents/tora-code-7b-v1.0_eval_request_False_float16_Original.json @@ -0,0 +1 @@ +{"model": "llm-agents/tora-code-7b-v1.0", "base_model": "", "revision": "main", "private": false, "precision": "float16", "weight_type": "Original", "status": "FINISHED", "submitted_time": "2023-10-16T13:00:29Z", "model_type": "instruction-tuned", "job_id": "522216", "license": "llama2", "likes": 7, "params": 6.607} \ No newline at end of file diff --git a/eval-queue/medalpaca/medalpaca-13b_eval_request_False_False_False.json b/eval-queue/medalpaca/medalpaca-13b_eval_request_False_False_False.json new file mode 100644 index 0000000000000000000000000000000000000000..a84e8b14c2281ddd168b128fa97b2c0af4a50687 --- /dev/null +++ b/eval-queue/medalpaca/medalpaca-13b_eval_request_False_False_False.json @@ -0,0 +1 @@ +{"model": "medalpaca/medalpaca-13b", "base_model": "medalpaca/medalpaca-7b", "revision": "main", "private": false, "status": "FAILED_2", "job_id": "176926", "weight_type": "Original", "precision": "float16", "license": "cc", "likes": 55, "params": 12.852} \ No newline at end of file diff --git a/eval-queue/medalpaca/medalpaca-7b_eval_request_False_False_False.json b/eval-queue/medalpaca/medalpaca-7b_eval_request_False_False_False.json new file mode 100644 index 0000000000000000000000000000000000000000..f9f5bd4154c923229f1d19ab540539c929d3c8d0 --- /dev/null +++ b/eval-queue/medalpaca/medalpaca-7b_eval_request_False_False_False.json @@ -0,0 +1 @@ +{"model": "medalpaca/medalpaca-7b", "base_model": "", "revision": "main", "private": false, "status": "FINISHED", "job_id": "495146", "weight_type": "Original", "precision": "float16", "model_type": "fine-tuned", "submitted_time": "2023-10-16T12:48:18Z", "license": "cc", "likes": 30, "params": 6.607} \ No newline at end of file diff --git a/eval-queue/mediocredev/open-llama-3b-v2-instruct_eval_request_False_float16_Original.json b/eval-queue/mediocredev/open-llama-3b-v2-instruct_eval_request_False_float16_Original.json new file mode 100644 index 0000000000000000000000000000000000000000..008bf6c6a061995a3603026312bd9b0f2d1326c6 --- /dev/null +++ b/eval-queue/mediocredev/open-llama-3b-v2-instruct_eval_request_False_float16_Original.json @@ -0,0 +1,16 @@ +{ + "model": "mediocredev/open-llama-3b-v2-instruct", + "base_model": "", + "revision": "main", + "private": false, + "precision": "float16", + "weight_type": "Original", + "status": "FINISHED", + "submitted_time": "2023-12-16T00:46:43Z", + "model_type": "\u2b55 : instruction-tuned", + "likes": 0, + "params": 3.426, + "license": "apache-2.0", + "job_id": "924742", + "job_start_time": "2023-12-16T14:33:45.391808" +} \ No newline at end of file diff --git a/eval-queue/meta-math/MetaMath-13B-V1.0_eval_request_False_float16_Original.json b/eval-queue/meta-math/MetaMath-13B-V1.0_eval_request_False_float16_Original.json new file mode 100644 index 0000000000000000000000000000000000000000..66c15a5a48193704f364bedc390caf3be13d1177 --- /dev/null +++ 
b/eval-queue/meta-math/MetaMath-13B-V1.0_eval_request_False_float16_Original.json @@ -0,0 +1 @@ +{"model": "meta-math/MetaMath-13B-V1.0", "base_model": "", "revision": "main", "private": false, "precision": "float16", "weight_type": "Original", "status": "FINISHED", "submitted_time": "2023-10-16T12:48:18Z", "model_type": "\ud83d\udd36 : fine-tuned", "job_id": "516527", "license": "llama2", "likes": 6, "params": 12.852} \ No newline at end of file diff --git a/eval-queue/meta-math/MetaMath-70B-V1.0_eval_request_False_float16_Original.json b/eval-queue/meta-math/MetaMath-70B-V1.0_eval_request_False_float16_Original.json new file mode 100644 index 0000000000000000000000000000000000000000..d9b91cd6a5255b2ee7736e92b012b97f796876b8 --- /dev/null +++ b/eval-queue/meta-math/MetaMath-70B-V1.0_eval_request_False_float16_Original.json @@ -0,0 +1 @@ +{"model": "meta-math/MetaMath-70B-V1.0", "base_model": "", "revision": "main", "private": false, "precision": "float16", "weight_type": "Original", "status": "FINISHED", "submitted_time": "2023-10-16T12:54:17Z", "model_type": "\ud83d\udd36 : fine-tuned", "job_id": "518782", "license": "llama2", "likes": 8, "params": 68.715} \ No newline at end of file diff --git a/eval-queue/meta-math/MetaMath-Llemma-7B_eval_request_False_float16_Original.json b/eval-queue/meta-math/MetaMath-Llemma-7B_eval_request_False_float16_Original.json new file mode 100644 index 0000000000000000000000000000000000000000..b7bd09b3c0a007dadacb7c51509adb12396803f7 --- /dev/null +++ b/eval-queue/meta-math/MetaMath-Llemma-7B_eval_request_False_float16_Original.json @@ -0,0 +1,16 @@ +{ + "model": "meta-math/MetaMath-Llemma-7B", + "base_model": "", + "revision": "main", + "private": false, + "precision": "float16", + "weight_type": "Original", + "status": "FINISHED", + "submitted_time": "2023-12-10T08:40:57Z", + "model_type": "\ud83d\udd36 : fine-tuned", + "likes": 6, + "params": 7.0, + "license": "apache-2.0", + "job_id": "881954", + "job_start_time": "2023-12-10T08:42:33.894820" +} \ No newline at end of file diff --git a/eval-queue/meta-math/MetaMath-Mistral-7B_eval_request_False_float16_Original.json b/eval-queue/meta-math/MetaMath-Mistral-7B_eval_request_False_float16_Original.json new file mode 100644 index 0000000000000000000000000000000000000000..1e475c331a7322118fb6dd16155ae35fc7efa468 --- /dev/null +++ b/eval-queue/meta-math/MetaMath-Mistral-7B_eval_request_False_float16_Original.json @@ -0,0 +1,16 @@ +{ + "model": "meta-math/MetaMath-Mistral-7B", + "base_model": "", + "revision": "main", + "private": false, + "precision": "float16", + "weight_type": "Original", + "status": "FINISHED", + "submitted_time": "2023-12-03T15:31:15Z", + "model_type": "\ud83d\udd36 : fine-tuned", + "likes": 32, + "params": 7.0, + "license": "apache-2.0", + "job_id": "847272", + "job_start_time": "2023-12-04T16:48:32.203036" +} \ No newline at end of file diff --git a/eval-queue/mhenrichsen/hestenettetLM_eval_request_False_float16_Original.json b/eval-queue/mhenrichsen/hestenettetLM_eval_request_False_float16_Original.json new file mode 100644 index 0000000000000000000000000000000000000000..0731c6052c8126f5fe891788857a5467e033f654 --- /dev/null +++ b/eval-queue/mhenrichsen/hestenettetLM_eval_request_False_float16_Original.json @@ -0,0 +1 @@ +{"model": "mhenrichsen/hestenettetLM", "base_model": "", "revision": "main", "private": false, "precision": "float16", "weight_type": "Original", "status": "FAILED", "submitted_time": "2023-11-09T11:38:24Z", "model_type": "\ud83d\udd36 : fine-tuned", "likes": 0, 
"params": 0, "license": "mit", "job_id": "650289"} \ No newline at end of file diff --git a/eval-queue/migtissera/SynthIA-70B-v1.5_eval_request_False_float16_Original.json b/eval-queue/migtissera/SynthIA-70B-v1.5_eval_request_False_float16_Original.json new file mode 100644 index 0000000000000000000000000000000000000000..d95116a39849499ff6f141340ddefbee12de4915 --- /dev/null +++ b/eval-queue/migtissera/SynthIA-70B-v1.5_eval_request_False_float16_Original.json @@ -0,0 +1 @@ +{"model": "migtissera/SynthIA-70B-v1.5", "base_model": "", "revision": "main", "private": false, "precision": "float16", "weight_type": "Original", "status": "FAILED", "submitted_time": "2023-11-12T05:10:06Z", "model_type": "\ud83d\udd36 : fine-tuned", "likes": 34, "params": 70.0, "license": "llama2", "job_id": "650417"} \ No newline at end of file diff --git a/eval-queue/migtissera/SynthIA-7B-v1.3_eval_request_False_float16_Original.json b/eval-queue/migtissera/SynthIA-7B-v1.3_eval_request_False_float16_Original.json new file mode 100644 index 0000000000000000000000000000000000000000..572a264d4bf68c89a2cc1232d1b52f6dfcb30e43 --- /dev/null +++ b/eval-queue/migtissera/SynthIA-7B-v1.3_eval_request_False_float16_Original.json @@ -0,0 +1 @@ +{"model": "migtissera/SynthIA-7B-v1.3", "base_model": "", "revision": "main", "private": false, "precision": "float16", "weight_type": "Original", "status": "FINISHED", "submitted_time": "2023-10-16T13:00:29Z", "model_type": "\ud83d\udd36 : fine-tuned", "likes": 115, "job_id": "522373", "license": "apache-2.0", "params": 7.111} \ No newline at end of file diff --git a/eval-queue/migtissera/SynthIA-7B-v1.5_eval_request_False_float16_Original.json b/eval-queue/migtissera/SynthIA-7B-v1.5_eval_request_False_float16_Original.json new file mode 100644 index 0000000000000000000000000000000000000000..138a5680648712a88040c8b01cd7407cabf3a27b --- /dev/null +++ b/eval-queue/migtissera/SynthIA-7B-v1.5_eval_request_False_float16_Original.json @@ -0,0 +1 @@ +{"model": "migtissera/SynthIA-7B-v1.5", "base_model": "", "revision": "main", "private": false, "precision": "float16", "weight_type": "Original", "status": "FINISHED", "submitted_time": "2023-10-14T05:37:38Z", "model_type": "\ud83d\udd36 : fine-tuned", "license": "apache-2.0", "likes": 0, "params": 7.111, "job_id": "643148"} \ No newline at end of file diff --git a/eval-queue/migtissera/SynthIA-7B-v2.0_eval_request_False_float16_Original.json b/eval-queue/migtissera/SynthIA-7B-v2.0_eval_request_False_float16_Original.json new file mode 100644 index 0000000000000000000000000000000000000000..f1014d75d74e62609fb5a35db3d9ad793326cf09 --- /dev/null +++ b/eval-queue/migtissera/SynthIA-7B-v2.0_eval_request_False_float16_Original.json @@ -0,0 +1 @@ +{"model": "migtissera/SynthIA-7B-v2.0", "base_model": "", "revision": "main", "private": false, "precision": "float16", "weight_type": "Original", "status": "FAILED", "submitted_time": "2023-11-14T11:24:50Z", "model_type": "\ud83d\udd36 : fine-tuned", "likes": 15, "params": 7.0, "license": "apache-2.0", "job_id": "651128"} \ No newline at end of file diff --git a/eval-queue/migtissera/Synthia-13B-v1.2_eval_request_False_float16_Original.json b/eval-queue/migtissera/Synthia-13B-v1.2_eval_request_False_float16_Original.json new file mode 100644 index 0000000000000000000000000000000000000000..3758510d95c222336218447f1e00e476a4193de7 --- /dev/null +++ b/eval-queue/migtissera/Synthia-13B-v1.2_eval_request_False_float16_Original.json @@ -0,0 +1 @@ +{"model": "migtissera/Synthia-13B-v1.2", "base_model": "", 
"revision": "main", "private": false, "precision": "float16", "weight_type": "Original", "status": "FINISHED", "submitted_time": "2023-11-06T10:31:15Z", "model_type": "\ud83d\udd36 : fine-tuned", "job_id": "633193", "license": "llama2", "likes": 6, "params": 12.852} \ No newline at end of file diff --git a/eval-queue/migtissera/Synthia-13B_eval_request_False_float16_Original.json b/eval-queue/migtissera/Synthia-13B_eval_request_False_float16_Original.json new file mode 100644 index 0000000000000000000000000000000000000000..6de43e46cd8b019224b2598c80c738347f472d1c --- /dev/null +++ b/eval-queue/migtissera/Synthia-13B_eval_request_False_float16_Original.json @@ -0,0 +1 @@ +{"model": "migtissera/Synthia-13B", "base_model": "", "revision": "main", "private": false, "precision": "float16", "weight_type": "Original", "status": "FINISHED", "submitted_time": "2023-09-09T10:52:17Z", "model_type": "\ud83d\udd36 : fine-tuned", "job_id": "497930", "license": "llama2", "likes": 10, "params": 12.852} \ No newline at end of file diff --git a/eval-queue/migtissera/Synthia-34B-v1.2_eval_request_False_float16_Original.json b/eval-queue/migtissera/Synthia-34B-v1.2_eval_request_False_float16_Original.json new file mode 100644 index 0000000000000000000000000000000000000000..f65a079b1749d8b75d6156e2b5f0719374bfb1f1 --- /dev/null +++ b/eval-queue/migtissera/Synthia-34B-v1.2_eval_request_False_float16_Original.json @@ -0,0 +1 @@ +{"model": "migtissera/Synthia-34B-v1.2", "base_model": "", "revision": "main", "private": false, "precision": "float16", "weight_type": "Original", "status": "FINISHED", "submitted_time": "2023-10-16T12:58:30Z", "model_type": "\ud83d\udd36 : fine-tuned", "job_id": "522178", "params": 34.0, "license": "?", "likes": 0} \ No newline at end of file diff --git a/eval-queue/migtissera/Synthia-70B-v1.1_eval_request_False_float16_Original.json b/eval-queue/migtissera/Synthia-70B-v1.1_eval_request_False_float16_Original.json new file mode 100644 index 0000000000000000000000000000000000000000..69aa78db18c4a3f699f790669dedd9e05e886dfa --- /dev/null +++ b/eval-queue/migtissera/Synthia-70B-v1.1_eval_request_False_float16_Original.json @@ -0,0 +1 @@ +{"model": "migtissera/Synthia-70B-v1.1", "base_model": "", "revision": "main", "private": false, "precision": "float16", "weight_type": "Original", "status": "FINISHED", "submitted_time": "2023-10-16T12:48:18Z", "model_type": "\ud83d\udd36 : fine-tuned", "job_id": "472718", "license": "llama2", "likes": 7, "params": 68.715} \ No newline at end of file diff --git a/eval-queue/migtissera/Synthia-70B-v1.2_eval_request_False_float16_Original.json b/eval-queue/migtissera/Synthia-70B-v1.2_eval_request_False_float16_Original.json new file mode 100644 index 0000000000000000000000000000000000000000..42344a2e3563d8d70e585a76cc46365f48c52708 --- /dev/null +++ b/eval-queue/migtissera/Synthia-70B-v1.2_eval_request_False_float16_Original.json @@ -0,0 +1 @@ +{"model": "migtissera/Synthia-70B-v1.2", "base_model": "", "revision": "main", "private": false, "precision": "float16", "weight_type": "Original", "status": "FINISHED", "submitted_time": "2023-09-09T10:52:17Z", "model_type": "\ud83d\udd36 : fine-tuned", "job_id": "503247", "license": "llama2", "likes": 16, "params": 68.715} \ No newline at end of file diff --git a/eval-queue/migtissera/Synthia-70B-v1.2b_eval_request_False_float16_Original.json b/eval-queue/migtissera/Synthia-70B-v1.2b_eval_request_False_float16_Original.json new file mode 100644 index 
0000000000000000000000000000000000000000..d573a3ea8556a163c0f21b8dee984525222b213f --- /dev/null +++ b/eval-queue/migtissera/Synthia-70B-v1.2b_eval_request_False_float16_Original.json @@ -0,0 +1 @@ +{"model": "migtissera/Synthia-70B-v1.2b", "base_model": "", "revision": "main", "private": false, "precision": "float16", "weight_type": "Original", "status": "FINISHED", "submitted_time": "2023-10-16T12:48:18Z", "model_type": "\ud83d\udd36 : fine-tuned", "job_id": "516909", "license": "llama2", "likes": 17, "params": 68.715} \ No newline at end of file diff --git a/eval-queue/migtissera/Synthia-70B_eval_request_False_float16_Original.json b/eval-queue/migtissera/Synthia-70B_eval_request_False_float16_Original.json new file mode 100644 index 0000000000000000000000000000000000000000..25fd14854f77ab6a9f84fd48d2c7abf9a645b12e --- /dev/null +++ b/eval-queue/migtissera/Synthia-70B_eval_request_False_float16_Original.json @@ -0,0 +1 @@ +{"model": "migtissera/Synthia-70B", "base_model": "", "revision": "main", "private": false, "precision": "float16", "weight_type": "Original", "status": "FINISHED", "submitted_time": "2023-10-16T12:46:18Z", "model_type": "\ud83d\udd36 : fine-tuned", "job_id": "498114", "license": "llama2", "likes": 8, "params": 68.715} \ No newline at end of file diff --git a/eval-queue/migtissera/Synthia-7B-v1.2_eval_request_False_float16_Original.json b/eval-queue/migtissera/Synthia-7B-v1.2_eval_request_False_float16_Original.json new file mode 100644 index 0000000000000000000000000000000000000000..d17b449248ed96b5a9cea79d1de204fbc913a045 --- /dev/null +++ b/eval-queue/migtissera/Synthia-7B-v1.2_eval_request_False_float16_Original.json @@ -0,0 +1 @@ +{"model": "migtissera/Synthia-7B-v1.2", "base_model": "", "revision": "main", "private": false, "precision": "float16", "weight_type": "Original", "status": "FINISHED", "submitted_time": "2023-10-16T12:48:18Z", "model_type": "\ud83d\udd36 : fine-tuned", "job_id": "517544", "license": "llama2", "likes": 10, "params": 6.607} \ No newline at end of file diff --git a/eval-queue/migtissera/Synthia-7B_eval_request_False_float16_Original.json b/eval-queue/migtissera/Synthia-7B_eval_request_False_float16_Original.json new file mode 100644 index 0000000000000000000000000000000000000000..835ff3c608a6d67386ce13c9371ce7992b2a586b --- /dev/null +++ b/eval-queue/migtissera/Synthia-7B_eval_request_False_float16_Original.json @@ -0,0 +1 @@ +{"model": "migtissera/Synthia-7B", "base_model": "", "revision": "main", "private": false, "precision": "float16", "weight_type": "Original", "status": "FINISHED", "submitted_time": "2023-09-09T10:52:17Z", "model_type": "\ud83d\udd36 : fine-tuned", "job_id": "497907", "license": "llama2", "likes": 2, "params": 6.607} \ No newline at end of file diff --git a/eval-queue/migtissera/Synthia-MoE-v3-Mixtral-8x7B_eval_request_False_bfloat16_Original.json b/eval-queue/migtissera/Synthia-MoE-v3-Mixtral-8x7B_eval_request_False_bfloat16_Original.json new file mode 100644 index 0000000000000000000000000000000000000000..05cd3dc161974dc007822e94adc5a6a2973e217e --- /dev/null +++ b/eval-queue/migtissera/Synthia-MoE-v3-Mixtral-8x7B_eval_request_False_bfloat16_Original.json @@ -0,0 +1,16 @@ +{ + "model": "migtissera/Synthia-MoE-v3-Mixtral-8x7B", + "base_model": "", + "revision": "main", + "private": false, + "precision": "bfloat16", + "weight_type": "Original", + "status": "FAILED", + "submitted_time": "2023-12-12T17:11:35Z", + "model_type": "\ud83d\udd36 : fine-tuned", + "likes": 0, + "params": 7.0, + "license": "apache-2.0", + 
"job_id": "906042", + "job_start_time": "2023-12-12T17:16:39.617597" +} \ No newline at end of file diff --git a/eval-queue/migtissera/Synthia-MoE-v3-Mixtral-8x7B_eval_request_False_float16_Original.json b/eval-queue/migtissera/Synthia-MoE-v3-Mixtral-8x7B_eval_request_False_float16_Original.json new file mode 100644 index 0000000000000000000000000000000000000000..75828624f2350b2642f1a7231a406f3c69dd13b7 --- /dev/null +++ b/eval-queue/migtissera/Synthia-MoE-v3-Mixtral-8x7B_eval_request_False_float16_Original.json @@ -0,0 +1,16 @@ +{ + "model": "migtissera/Synthia-MoE-v3-Mixtral-8x7B", + "base_model": "", + "revision": "main", + "private": false, + "precision": "float16", + "weight_type": "Original", + "status": "FAILED", + "submitted_time": "2023-12-12T18:52:22Z", + "model_type": "\ud83d\udd36 : fine-tuned", + "likes": 3, + "params": 7.0, + "license": "apache-2.0", + "job_id": "911211", + "job_start_time": "2023-12-12T18:54:09.759894" +} \ No newline at end of file diff --git a/eval-queue/migtissera/Tess-34B-v1.4_eval_request_False_float16_Original.json b/eval-queue/migtissera/Tess-34B-v1.4_eval_request_False_float16_Original.json new file mode 100644 index 0000000000000000000000000000000000000000..9f0cfceac9fbca84cd27e343ba9f7c28ba84198d --- /dev/null +++ b/eval-queue/migtissera/Tess-34B-v1.4_eval_request_False_float16_Original.json @@ -0,0 +1,16 @@ +{ + "model": "migtissera/Tess-34B-v1.4", + "base_model": "", + "revision": "main", + "private": false, + "precision": "float16", + "weight_type": "Original", + "status": "FINISHED", + "submitted_time": "2023-12-05T19:04:55Z", + "model_type": "\ud83d\udd36 : fine-tuned", + "likes": 0, + "params": 34.389, + "license": "other", + "job_id": "858075", + "job_start_time": "2023-12-06T16:40:26.372197" +} \ No newline at end of file diff --git a/eval-queue/migtissera/Tess-7B-v1.4_eval_request_False_float16_Original.json b/eval-queue/migtissera/Tess-7B-v1.4_eval_request_False_float16_Original.json new file mode 100644 index 0000000000000000000000000000000000000000..73da53491ef9ef2881d805692acc665698aec992 --- /dev/null +++ b/eval-queue/migtissera/Tess-7B-v1.4_eval_request_False_float16_Original.json @@ -0,0 +1,16 @@ +{ + "model": "migtissera/Tess-7B-v1.4", + "base_model": "", + "revision": "main", + "private": false, + "precision": "float16", + "weight_type": "Original", + "status": "FINISHED", + "submitted_time": "2023-12-05T19:05:14Z", + "model_type": "\ud83d\udd36 : fine-tuned", + "likes": 0, + "params": 7.242, + "license": "apache-2.0", + "job_id": "858079", + "job_start_time": "2023-12-06T16:43:33.853947" +} \ No newline at end of file diff --git a/eval-queue/migtissera/Tess-M-Creative-v1.0_eval_request_False_float16_Original.json b/eval-queue/migtissera/Tess-M-Creative-v1.0_eval_request_False_float16_Original.json new file mode 100644 index 0000000000000000000000000000000000000000..8f6e80a6aa2a641a3514734b9d9ef6a645cbceb6 --- /dev/null +++ b/eval-queue/migtissera/Tess-M-Creative-v1.0_eval_request_False_float16_Original.json @@ -0,0 +1,16 @@ +{ + "model": "migtissera/Tess-M-Creative-v1.0", + "base_model": "", + "revision": "main", + "private": false, + "precision": "float16", + "weight_type": "Original", + "status": "FINISHED", + "submitted_time": "2023-12-04T09:26:03Z", + "model_type": "\ud83d\udd36 : fine-tuned", + "likes": 30, + "params": 34.389, + "license": "other", + "job_id": "847463", + "job_start_time": "2023-12-04T19:06:31.538080" +} \ No newline at end of file diff --git 
a/eval-queue/migtissera/Tess-M-v1.1_eval_request_False_float16_Original.json b/eval-queue/migtissera/Tess-M-v1.1_eval_request_False_float16_Original.json new file mode 100644 index 0000000000000000000000000000000000000000..d59f113e02a973ac3883956d116e51a63e8d4493 --- /dev/null +++ b/eval-queue/migtissera/Tess-M-v1.1_eval_request_False_float16_Original.json @@ -0,0 +1,16 @@ +{ + "model": "migtissera/Tess-M-v1.1", + "base_model": "", + "revision": "main", + "private": false, + "precision": "float16", + "weight_type": "Original", + "status": "FINISHED", + "submitted_time": "2023-11-22T04:24:54Z", + "model_type": "\ud83d\udd36 : fine-tuned", + "likes": 0, + "params": 34.389, + "license": "other", + "job_id": "749906", + "job_start_time": null +} \ No newline at end of file diff --git a/eval-queue/migtissera/Tess-M-v1.2_eval_request_False_float16_Original.json b/eval-queue/migtissera/Tess-M-v1.2_eval_request_False_float16_Original.json new file mode 100644 index 0000000000000000000000000000000000000000..e59f6ee80e15dfe9dbd70bf0cbf5736ff293174b --- /dev/null +++ b/eval-queue/migtissera/Tess-M-v1.2_eval_request_False_float16_Original.json @@ -0,0 +1,16 @@ +{ + "model": "migtissera/Tess-M-v1.2", + "base_model": "", + "revision": "main", + "private": false, + "precision": "float16", + "weight_type": "Original", + "status": "FAILED", + "submitted_time": "2023-11-23T20:52:21Z", + "model_type": "\ud83d\udd36 : fine-tuned", + "likes": 0, + "params": 0, + "license": "other", + "job_id": "796214", + "job_start_time": "2023-11-26T14:08:39.427332" +} \ No newline at end of file diff --git a/eval-queue/migtissera/Tess-M-v1.3_eval_request_False_float16_Original.json b/eval-queue/migtissera/Tess-M-v1.3_eval_request_False_float16_Original.json new file mode 100644 index 0000000000000000000000000000000000000000..a7d3a61557974a0aa29f510174009e3b0da7bec1 --- /dev/null +++ b/eval-queue/migtissera/Tess-M-v1.3_eval_request_False_float16_Original.json @@ -0,0 +1,16 @@ +{ + "model": "migtissera/Tess-M-v1.3", + "base_model": "", + "revision": "main", + "private": false, + "precision": "float16", + "weight_type": "Original", + "status": "FINISHED", + "submitted_time": "2023-11-27T10:24:52Z", + "model_type": "\ud83d\udd36 : fine-tuned", + "likes": 6, + "params": 34, + "license": "other", + "job_id": "845713", + "job_start_time": "2023-12-04T09:00:24.647142" +} \ No newline at end of file diff --git a/eval-queue/migtissera/Tess-Medium-200K-v1.0_eval_request_False_float16_Original.json b/eval-queue/migtissera/Tess-Medium-200K-v1.0_eval_request_False_float16_Original.json new file mode 100644 index 0000000000000000000000000000000000000000..c5db8fbc21494183328c9404fc1ecb1fbf74c163 --- /dev/null +++ b/eval-queue/migtissera/Tess-Medium-200K-v1.0_eval_request_False_float16_Original.json @@ -0,0 +1,16 @@ +{ + "model": "migtissera/Tess-Medium-200K-v1.0", + "base_model": "", + "revision": "main", + "private": false, + "precision": "float16", + "weight_type": "Original", + "status": "FAILED", + "submitted_time": "2023-11-16T23:26:18Z", + "model_type": "\ud83d\udd36 : fine-tuned", + "likes": 3, + "params": 34.389, + "license": "other", + "job_id": "749903", + "job_start_time": null +} \ No newline at end of file diff --git a/eval-queue/migtissera/Tess-Medium-v1.0_eval_request_False_float16_Original.json b/eval-queue/migtissera/Tess-Medium-v1.0_eval_request_False_float16_Original.json new file mode 100644 index 0000000000000000000000000000000000000000..9a8f7d8f0fcac1aad7adf4c4c318a26f92463fc6 --- /dev/null +++ 
b/eval-queue/migtissera/Tess-Medium-v1.0_eval_request_False_float16_Original.json @@ -0,0 +1 @@ +{"model": "migtissera/Tess-Medium-v1.0", "base_model": "", "revision": "main", "private": false, "precision": "float16", "weight_type": "Original", "status": "FAILED", "submitted_time": "2023-11-16T02:51:27Z", "model_type": "\ud83d\udd36 : fine-tuned", "likes": 2, "params": 34.389, "license": "other", "job_id": "698100"} \ No newline at end of file diff --git a/eval-queue/migtissera/Tess-XL-v1.0_eval_request_False_float16_Original.json b/eval-queue/migtissera/Tess-XL-v1.0_eval_request_False_float16_Original.json new file mode 100644 index 0000000000000000000000000000000000000000..ab9b0473655bcb3477b61c1d525b265163dcca68 --- /dev/null +++ b/eval-queue/migtissera/Tess-XL-v1.0_eval_request_False_float16_Original.json @@ -0,0 +1 @@ +{"model": "migtissera/Tess-XL-v1.0", "base_model": "", "revision": "main", "private": false, "precision": "float16", "weight_type": "Original", "status": "FAILED", "submitted_time": "2023-11-15T15:33:58Z", "model_type": "\u2b55 : instruction-tuned", "likes": 3, "params": 117.749, "license": "llama2", "job_id": "697816"} \ No newline at end of file diff --git a/eval-queue/migtissera/Tess-XS-v1-3-yarn-128K_eval_request_False_bfloat16_Original.json b/eval-queue/migtissera/Tess-XS-v1-3-yarn-128K_eval_request_False_bfloat16_Original.json new file mode 100644 index 0000000000000000000000000000000000000000..e695f334eb99658b8bf9651a619798ad50dae978 --- /dev/null +++ b/eval-queue/migtissera/Tess-XS-v1-3-yarn-128K_eval_request_False_bfloat16_Original.json @@ -0,0 +1,16 @@ +{ + "model": "migtissera/Tess-XS-v1-3-yarn-128K", + "base_model": "", + "revision": "main", + "private": false, + "precision": "bfloat16", + "weight_type": "Original", + "status": "FINISHED", + "submitted_time": "2023-12-04T05:50:11Z", + "model_type": "\ud83d\udd36 : fine-tuned", + "likes": 7, + "params": 0, + "license": "apache-2.0", + "job_id": "847352", + "job_start_time": "2023-12-04T17:44:37.847303" +} \ No newline at end of file diff --git a/eval-queue/migtissera/Tess-XS-v1-3-yarn-128K_eval_request_False_float16_Original.json b/eval-queue/migtissera/Tess-XS-v1-3-yarn-128K_eval_request_False_float16_Original.json new file mode 100644 index 0000000000000000000000000000000000000000..ceb40fae7cda61c2728d321d86d41a90ac986687 --- /dev/null +++ b/eval-queue/migtissera/Tess-XS-v1-3-yarn-128K_eval_request_False_float16_Original.json @@ -0,0 +1,16 @@ +{ + "model": "migtissera/Tess-XS-v1-3-yarn-128K", + "base_model": "", + "revision": "main", + "private": false, + "precision": "float16", + "weight_type": "Original", + "status": "FINISHED", + "submitted_time": "2023-11-29T22:52:23Z", + "model_type": "\ud83d\udd36 : fine-tuned", + "likes": 6, + "params": 0, + "license": "apache-2.0", + "job_id": "845733", + "job_start_time": "2023-12-04T09:17:53.067824" +} \ No newline at end of file diff --git a/eval-queue/migtissera/Tess-XS-v1.0_eval_request_False_float16_Original.json b/eval-queue/migtissera/Tess-XS-v1.0_eval_request_False_float16_Original.json new file mode 100644 index 0000000000000000000000000000000000000000..a5a514faed0da6470048018c55380d2b0ec36b16 --- /dev/null +++ b/eval-queue/migtissera/Tess-XS-v1.0_eval_request_False_float16_Original.json @@ -0,0 +1 @@ +{"model": "migtissera/Tess-XS-v1.0", "base_model": "", "revision": "main", "private": false, "precision": "float16", "weight_type": "Original", "status": "FINISHED", "submitted_time": "2023-11-16T02:51:37Z", "model_type": "\ud83d\udd36 : fine-tuned", 
"likes": 0, "params": 7.242, "license": "apache-2.0", "job_id": "698102"} \ No newline at end of file diff --git a/eval-queue/migtissera/Tess-XS-v1.1_eval_request_False_float16_Original.json b/eval-queue/migtissera/Tess-XS-v1.1_eval_request_False_float16_Original.json new file mode 100644 index 0000000000000000000000000000000000000000..53d9f63324e6a2ceef847a1d5e15989afa71253d --- /dev/null +++ b/eval-queue/migtissera/Tess-XS-v1.1_eval_request_False_float16_Original.json @@ -0,0 +1 @@ +{"model": "migtissera/Tess-XS-v1.1", "base_model": "", "revision": "main", "private": false, "precision": "float16", "weight_type": "Original", "status": "FINISHED", "submitted_time": "2023-11-22T04:24:40Z", "model_type": "\ud83d\udd36 : fine-tuned", "likes": 0, "params": 7.242, "license": "apache-2.0", "job_id": "742606"} \ No newline at end of file diff --git a/eval-queue/migtissera/Tess-XS-v1.2_eval_request_False_float16_Original.json b/eval-queue/migtissera/Tess-XS-v1.2_eval_request_False_float16_Original.json new file mode 100644 index 0000000000000000000000000000000000000000..72203df2f2ea1dea34202f2e8598ee2283abd82c --- /dev/null +++ b/eval-queue/migtissera/Tess-XS-v1.2_eval_request_False_float16_Original.json @@ -0,0 +1,16 @@ +{ + "model": "migtissera/Tess-XS-v1.2", + "base_model": "", + "revision": "main", + "private": false, + "precision": "float16", + "weight_type": "Original", + "status": "FAILED", + "submitted_time": "2023-11-23T22:01:09Z", + "model_type": "\ud83d\udd36 : fine-tuned", + "likes": 0, + "params": 0, + "license": "apache-2.0", + "job_id": "796247", + "job_start_time": "2023-11-26T15:06:27.511028" +} \ No newline at end of file diff --git a/eval-queue/mncai/Llama2-7B-guanaco-1k_eval_request_False_float16_Original.json b/eval-queue/mncai/Llama2-7B-guanaco-1k_eval_request_False_float16_Original.json new file mode 100644 index 0000000000000000000000000000000000000000..e836907302381b49717bd8869f311b1513d0ed6c --- /dev/null +++ b/eval-queue/mncai/Llama2-7B-guanaco-1k_eval_request_False_float16_Original.json @@ -0,0 +1 @@ +{"model": "mncai/Llama2-7B-guanaco-1k", "base_model": "", "revision": "main", "private": false, "precision": "float16", "weight_type": "Original", "status": "FINISHED", "submitted_time": "2023-10-16T12:48:18Z", "model_type": "\ud83d\udd36 : fine-tuned", "job_id": "517333", "license": "?", "likes": 0, "params": 6.607} \ No newline at end of file diff --git a/eval-queue/mncai/Llama2-7B-guanaco-dolphin-500_eval_request_False_float16_Original.json b/eval-queue/mncai/Llama2-7B-guanaco-dolphin-500_eval_request_False_float16_Original.json new file mode 100644 index 0000000000000000000000000000000000000000..7e418af17fb682f2cd9ac9a0437686eb91f517fd --- /dev/null +++ b/eval-queue/mncai/Llama2-7B-guanaco-dolphin-500_eval_request_False_float16_Original.json @@ -0,0 +1 @@ +{"model": "mncai/Llama2-7B-guanaco-dolphin-500", "base_model": "", "revision": "main", "private": false, "precision": "float16", "weight_type": "Original", "status": "FINISHED", "submitted_time": "2023-10-16T12:54:17Z", "model_type": "\ud83d\udd36 : fine-tuned", "job_id": "518185", "license": "?", "likes": 0, "params": 6.607} \ No newline at end of file diff --git a/eval-queue/mncai/Mistral-7B-CollectiveCognition-OpenOrca-1k_eval_request_False_bfloat16_Original.json b/eval-queue/mncai/Mistral-7B-CollectiveCognition-OpenOrca-1k_eval_request_False_bfloat16_Original.json new file mode 100644 index 0000000000000000000000000000000000000000..43ce19f8c884dd575dc81c89af82fe9c7f70be6b --- /dev/null +++ 
b/eval-queue/mncai/Mistral-7B-CollectiveCognition-OpenOrca-1k_eval_request_False_bfloat16_Original.json @@ -0,0 +1 @@ +{"model": "mncai/Mistral-7B-CollectiveCognition-OpenOrca-1k", "base_model": "mistral-7B", "revision": "main", "private": false, "precision": "bfloat16", "weight_type": "Original", "status": "FAILED", "submitted_time": "2023-10-22T04:31:15Z", "model_type": "\u2b55 : instruction-tuned", "likes": 0, "params": 7.0, "license": "mit", "job_id": "649246"} \ No newline at end of file diff --git a/eval-queue/mncai/Mistral-7B-CollectiveCognition_eval_request_False_bfloat16_Original.json b/eval-queue/mncai/Mistral-7B-CollectiveCognition_eval_request_False_bfloat16_Original.json new file mode 100644 index 0000000000000000000000000000000000000000..512e0e6e4caf19d6e93e2887a2978dadd8c7c711 --- /dev/null +++ b/eval-queue/mncai/Mistral-7B-CollectiveCognition_eval_request_False_bfloat16_Original.json @@ -0,0 +1 @@ +{"model": "mncai/Mistral-7B-CollectiveCognition", "base_model": "mistral-7B", "revision": "main", "private": false, "precision": "bfloat16", "weight_type": "Original", "status": "FAILED", "submitted_time": "2023-10-22T04:36:05Z", "model_type": "\u2b55 : instruction-tuned", "likes": 0, "params": 7.0, "license": "mit", "job_id": "649261"} \ No newline at end of file diff --git a/eval-queue/mncai/Mistral-7B-OpenOrca-1k_eval_request_False_bfloat16_Original.json b/eval-queue/mncai/Mistral-7B-OpenOrca-1k_eval_request_False_bfloat16_Original.json new file mode 100644 index 0000000000000000000000000000000000000000..898444942a0a117ea48627c52e97cb2f3b9216f9 --- /dev/null +++ b/eval-queue/mncai/Mistral-7B-OpenOrca-1k_eval_request_False_bfloat16_Original.json @@ -0,0 +1 @@ +{"model": "mncai/Mistral-7B-OpenOrca-1k", "base_model": "Mistral-7B", "revision": "main", "private": false, "precision": "bfloat16", "weight_type": "Original", "status": "FINISHED", "submitted_time": "2023-10-16T12:48:18Z", "model_type": "\u2b55 : instruction-tuned", "job_id": "517513", "license": "?", "likes": 1, "params": 7.111} \ No newline at end of file diff --git a/eval-queue/mncai/Mistral-7B-Orca-Guanaco1k-merged_eval_request_False_bfloat16_Original.json b/eval-queue/mncai/Mistral-7B-Orca-Guanaco1k-merged_eval_request_False_bfloat16_Original.json new file mode 100644 index 0000000000000000000000000000000000000000..317c5482876844f0dc4bf6dfb23e6596860f8694 --- /dev/null +++ b/eval-queue/mncai/Mistral-7B-Orca-Guanaco1k-merged_eval_request_False_bfloat16_Original.json @@ -0,0 +1 @@ +{"model": "mncai/Mistral-7B-Orca-Guanaco1k-merged", "base_model": "Mistral-7B", "revision": "main", "private": false, "precision": "bfloat16", "weight_type": "Original", "status": "FAILED", "submitted_time": "2023-10-10T02:46:47Z", "model_type": "\ud83d\udd36 : fine-tuned", "job_id": "489746", "license": "?", "likes": 0, "params": 7.111} \ No newline at end of file diff --git a/eval-queue/mncai/Mistral-7B-dolphin-Guanaco1k-merged_eval_request_False_bfloat16_Original.json b/eval-queue/mncai/Mistral-7B-dolphin-Guanaco1k-merged_eval_request_False_bfloat16_Original.json new file mode 100644 index 0000000000000000000000000000000000000000..1ab4290904a574049becd96bee2f41ffa38cb0cc --- /dev/null +++ b/eval-queue/mncai/Mistral-7B-dolphin-Guanaco1k-merged_eval_request_False_bfloat16_Original.json @@ -0,0 +1 @@ +{"model": "mncai/Mistral-7B-dolphin-Guanaco1k-merged", "base_model": "Mistral-7B", "revision": "main", "private": false, "precision": "bfloat16", "weight_type": "Original", "status": "FAILED", "submitted_time": "2023-10-10T04:48:39Z", 
"model_type": "\ud83d\udd36 : fine-tuned", "job_id": "489769", "license": "?", "likes": 0, "params": 7.111} \ No newline at end of file diff --git a/eval-queue/mncai/Mistral-7B-openplatypus-1k_eval_request_False_bfloat16_Original.json b/eval-queue/mncai/Mistral-7B-openplatypus-1k_eval_request_False_bfloat16_Original.json new file mode 100644 index 0000000000000000000000000000000000000000..8f7f8ca7be5fc3a046a17b917f5c48113618c431 --- /dev/null +++ b/eval-queue/mncai/Mistral-7B-openplatypus-1k_eval_request_False_bfloat16_Original.json @@ -0,0 +1 @@ +{"model": "mncai/Mistral-7B-openplatypus-1k", "base_model": "Mistral-7B", "revision": "main", "private": false, "precision": "bfloat16", "weight_type": "Original", "status": "FINISHED", "submitted_time": "2023-10-16T12:54:17Z", "model_type": "\u2b55 : instruction-tuned", "job_id": "519842", "license": "llama2", "likes": 0, "params": 7.111} \ No newline at end of file diff --git a/eval-queue/mncai/SGPT-1.3B-insurance-epoch10_eval_request_False_False_False.json b/eval-queue/mncai/SGPT-1.3B-insurance-epoch10_eval_request_False_False_False.json new file mode 100644 index 0000000000000000000000000000000000000000..b2273e75d44eaece0978f9118fe26b56c230b3d7 --- /dev/null +++ b/eval-queue/mncai/SGPT-1.3B-insurance-epoch10_eval_request_False_False_False.json @@ -0,0 +1 @@ +{"model": "mncai/SGPT-1.3B-insurance-epoch10", "base_model": "", "revision": "main", "private": false, "status": "FINISHED", "submitted_time": "2023-09-09T10:52:17Z", "weight_type": "Original", "precision": "float16", "job_id": "461841", "license": "?", "likes": 0, "params": 1.27} \ No newline at end of file diff --git a/eval-queue/mncai/agiin-11.1B-v0.0_eval_request_False_float16_Original.json b/eval-queue/mncai/agiin-11.1B-v0.0_eval_request_False_float16_Original.json new file mode 100644 index 0000000000000000000000000000000000000000..279f994f7b8ef847a2346d9e931811ac91513d1f --- /dev/null +++ b/eval-queue/mncai/agiin-11.1B-v0.0_eval_request_False_float16_Original.json @@ -0,0 +1,16 @@ +{ + "model": "mncai/agiin-11.1B-v0.0", + "base_model": "", + "revision": "main", + "private": false, + "precision": "float16", + "weight_type": "Original", + "status": "FINISHED", + "submitted_time": "2023-12-14T07:19:45Z", + "model_type": "\ud83d\udd36 : fine-tuned", + "likes": 0, + "params": 11.168, + "license": "apache-2.0", + "job_id": "924625", + "job_start_time": "2023-12-16T12:29:32.091663" +} \ No newline at end of file diff --git a/eval-queue/mncai/agiin-13.6B-v0.0_eval_request_False_float16_Original.json b/eval-queue/mncai/agiin-13.6B-v0.0_eval_request_False_float16_Original.json new file mode 100644 index 0000000000000000000000000000000000000000..a8776b1c7952221020da909221b62243889430c5 --- /dev/null +++ b/eval-queue/mncai/agiin-13.6B-v0.0_eval_request_False_float16_Original.json @@ -0,0 +1,16 @@ +{ + "model": "mncai/agiin-13.6B-v0.0", + "base_model": "", + "revision": "main", + "private": false, + "precision": "float16", + "weight_type": "Original", + "status": "FINISHED", + "submitted_time": "2023-12-14T07:57:58Z", + "model_type": "\ud83d\udd36 : fine-tuned", + "likes": 0, + "params": 13.785, + "license": "apache-2.0", + "job_id": "924627", + "job_start_time": "2023-12-16T12:31:32.543399" +} \ No newline at end of file diff --git a/eval-queue/mncai/agiin-13.6B-v0.1_eval_request_False_float16_Original.json b/eval-queue/mncai/agiin-13.6B-v0.1_eval_request_False_float16_Original.json new file mode 100644 index 0000000000000000000000000000000000000000..fcc21ef28fc57ac31a4b288d68b3cec3c66b8340 
--- /dev/null +++ b/eval-queue/mncai/agiin-13.6B-v0.1_eval_request_False_float16_Original.json @@ -0,0 +1,16 @@ +{ + "model": "mncai/agiin-13.6B-v0.1", + "base_model": "", + "revision": "main", + "private": false, + "precision": "float16", + "weight_type": "Original", + "status": "FINISHED", + "submitted_time": "2023-12-15T01:07:16Z", + "model_type": "\ud83d\udfe6 : RL-tuned", + "likes": 0, + "params": 13.785, + "license": "apache-2.0", + "job_id": "924674", + "job_start_time": "2023-12-16T13:10:46.114839" +} \ No newline at end of file diff --git a/eval-queue/mncai/chatdoctor_eval_request_False_False_False.json b/eval-queue/mncai/chatdoctor_eval_request_False_False_False.json new file mode 100644 index 0000000000000000000000000000000000000000..b0eab9c5d90c111d529115d33f0a6d041e490fcd --- /dev/null +++ b/eval-queue/mncai/chatdoctor_eval_request_False_False_False.json @@ -0,0 +1 @@ +{"model": "mncai/chatdoctor", "base_model": "llama-7b", "revision": "main", "private": false, "status": "FINISHED", "submitted_time": "2023-09-09T10:52:17Z", "weight_type": "Original", "precision": "float16", "job_id": "461194", "model_type": "fine-tuned", "license": "apache-2.0", "likes": 8, "params": 6.607} \ No newline at end of file diff --git a/eval-queue/mncai/llama2-7b-gua-dol-500_eval_request_False_float16_Original.json b/eval-queue/mncai/llama2-7b-gua-dol-500_eval_request_False_float16_Original.json new file mode 100644 index 0000000000000000000000000000000000000000..4d2ee73099b3551165200792c6837dc48381ac45 --- /dev/null +++ b/eval-queue/mncai/llama2-7b-gua-dol-500_eval_request_False_float16_Original.json @@ -0,0 +1 @@ +{"model": "mncai/llama2-7b-gua-dol-500", "base_model": "Llama2-7B", "revision": "main", "private": false, "precision": "float16", "weight_type": "Original", "status": "FAILED", "submitted_time": "2023-09-27T09:18:02Z", "model_type": "\ud83d\udd36 : fine-tuned", "job_id": "479660", "params": 7.0, "license": "?", "likes": 0} \ No newline at end of file diff --git a/eval-queue/mncai/mistral-7b-dpo-merge-v1.1_eval_request_False_float16_Original.json b/eval-queue/mncai/mistral-7b-dpo-merge-v1.1_eval_request_False_float16_Original.json new file mode 100644 index 0000000000000000000000000000000000000000..0d93f747bc15d032a42d347fdc04696a33fce95d --- /dev/null +++ b/eval-queue/mncai/mistral-7b-dpo-merge-v1.1_eval_request_False_float16_Original.json @@ -0,0 +1,16 @@ +{ + "model": "mncai/mistral-7b-dpo-merge-v1.1", + "base_model": "", + "revision": "main", + "private": false, + "precision": "float16", + "weight_type": "Original", + "status": "RUNNING", + "submitted_time": "2023-12-17T13:45:42Z", + "model_type": "\ud83d\udfe6 : RL-tuned", + "likes": 0, + "params": 7.242, + "license": "apache-2.0", + "job_id": "930311", + "job_start_time": "2023-12-17T13:46:46.812439" +} \ No newline at end of file diff --git a/eval-queue/mncai/mistral-7b-dpo-v5_eval_request_False_float16_Original.json b/eval-queue/mncai/mistral-7b-dpo-v5_eval_request_False_float16_Original.json new file mode 100644 index 0000000000000000000000000000000000000000..471819f10125eb1fdd69e253927e98bafd7b3847 --- /dev/null +++ b/eval-queue/mncai/mistral-7b-dpo-v5_eval_request_False_float16_Original.json @@ -0,0 +1,16 @@ +{ + "model": "mncai/mistral-7b-dpo-v5", + "base_model": "", + "revision": "main", + "private": false, + "precision": "float16", + "weight_type": "Original", + "status": "FINISHED", + "submitted_time": "2023-12-14T04:28:39Z", + "model_type": "\ud83d\udfe6 : RL-tuned", + "likes": 0, + "params": 7.242, + "license": 
"apache-2.0", + "job_id": "924617", + "job_start_time": "2023-12-16T12:21:31.419065" +} \ No newline at end of file diff --git a/eval-queue/mncai/mistral-7b-dpo-v6_eval_request_False_float16_Original.json b/eval-queue/mncai/mistral-7b-dpo-v6_eval_request_False_float16_Original.json new file mode 100644 index 0000000000000000000000000000000000000000..de312852c11b958b00ee2445179b3b2df364885e --- /dev/null +++ b/eval-queue/mncai/mistral-7b-dpo-v6_eval_request_False_float16_Original.json @@ -0,0 +1,16 @@ +{ + "model": "mncai/mistral-7b-dpo-v6", + "base_model": "", + "revision": "main", + "private": false, + "precision": "float16", + "weight_type": "Original", + "status": "FINISHED", + "submitted_time": "2023-12-16T18:15:06Z", + "model_type": "\ud83d\udfe6 : RL-tuned", + "likes": 0, + "params": 7.242, + "license": "apache-2.0", + "job_id": "925906", + "job_start_time": "2023-12-16T18:15:34.176257" +} \ No newline at end of file diff --git a/eval-queue/mncai/yi-34B-v2_eval_request_False_float16_Original.json b/eval-queue/mncai/yi-34B-v2_eval_request_False_float16_Original.json new file mode 100644 index 0000000000000000000000000000000000000000..28c002bf9c51b7164a1d448d53f4efe3fbf7c726 --- /dev/null +++ b/eval-queue/mncai/yi-34B-v2_eval_request_False_float16_Original.json @@ -0,0 +1,16 @@ +{ + "model": "mncai/yi-34B-v2", + "base_model": "", + "revision": "main", + "private": false, + "precision": "float16", + "weight_type": "Original", + "status": "FINISHED", + "submitted_time": "2023-12-07T15:56:01Z", + "model_type": "\u2b55 : instruction-tuned", + "likes": 0, + "params": 34.389, + "license": "other", + "job_id": "874864", + "job_start_time": "2023-12-09T13:51:47.954712" +} \ No newline at end of file diff --git a/eval-queue/mncai/yi-34B-v3_eval_request_False_float16_Original.json b/eval-queue/mncai/yi-34B-v3_eval_request_False_float16_Original.json new file mode 100644 index 0000000000000000000000000000000000000000..646bbbe8d46f22839caef9b25716f2c15db0888b --- /dev/null +++ b/eval-queue/mncai/yi-34B-v3_eval_request_False_float16_Original.json @@ -0,0 +1,16 @@ +{ + "model": "mncai/yi-34B-v3", + "base_model": "", + "revision": "main", + "private": false, + "precision": "float16", + "weight_type": "Original", + "status": "FINISHED", + "submitted_time": "2023-12-10T09:49:38Z", + "model_type": "\u2b55 : instruction-tuned", + "likes": 0, + "params": 34.389, + "license": "other", + "job_id": "883011", + "job_start_time": "2023-12-10T09:49:55.473741" +} \ No newline at end of file diff --git a/eval-queue/ngoan/NgoanYi_eval_request_False_float16_Original.json b/eval-queue/ngoan/NgoanYi_eval_request_False_float16_Original.json new file mode 100644 index 0000000000000000000000000000000000000000..5fe99c88388247c3cba6fef3a5173ca0fe12cdf1 --- /dev/null +++ b/eval-queue/ngoan/NgoanYi_eval_request_False_float16_Original.json @@ -0,0 +1,16 @@ +{ + "model": "ngoan/NgoanYi", + "base_model": "", + "revision": "main", + "private": false, + "precision": "float16", + "weight_type": "Original", + "status": "FAILED", + "submitted_time": "2023-12-10T19:31:03Z", + "model_type": "\ud83d\udd36 : fine-tuned", + "likes": 0, + "params": 33.93, + "license": "other", + "job_id": "887615", + "job_start_time": "2023-12-10T19:32:41.559397" +} \ No newline at end of file diff --git a/eval-queue/notstoic/OPT-13B-Nerybus-Mix-4bit-128g_eval_request_False_False_False.json b/eval-queue/notstoic/OPT-13B-Nerybus-Mix-4bit-128g_eval_request_False_False_False.json new file mode 100644 index 
0000000000000000000000000000000000000000..c759844e4c15a767e301ebe630f6719b4e16bce4 --- /dev/null +++ b/eval-queue/notstoic/OPT-13B-Nerybus-Mix-4bit-128g_eval_request_False_False_False.json @@ -0,0 +1 @@ +{"model": "notstoic/OPT-13B-Nerybus-Mix-4bit-128g", "base_model": "KoboldAI/OPT-13B-Nerybus-Mix", "revision": "main", "private": false, "status": "FAILED_2", "job_id": "177160", "weight_type": "Original", "precision": "float16", "license": "other", "likes": 4, "params": 12.853} \ No newline at end of file diff --git a/eval-queue/notstoic/PygmalionCoT-7b_eval_request_False_False_False.json b/eval-queue/notstoic/PygmalionCoT-7b_eval_request_False_False_False.json new file mode 100644 index 0000000000000000000000000000000000000000..c70300e79923b6e6687f0a34287fdc40ac63b99f --- /dev/null +++ b/eval-queue/notstoic/PygmalionCoT-7b_eval_request_False_False_False.json @@ -0,0 +1 @@ +{"model": "notstoic/PygmalionCoT-7b", "base_model": "", "revision": "main", "private": false, "status": "FINISHED", "job_id": "470743", "weight_type": "Original", "precision": "float16", "model_type": "fine-tuned", "submitted_time": "2023-10-16T12:46:18Z", "license": "other", "likes": 15, "params": 6.738} \ No newline at end of file diff --git a/eval-queue/notstoic/pygmalion-13b-4bit-128g_eval_request_False_False_False.json b/eval-queue/notstoic/pygmalion-13b-4bit-128g_eval_request_False_False_False.json new file mode 100644 index 0000000000000000000000000000000000000000..e83a6662dd64e94ac8a661dd5ab6f9b1dd7aa7cf --- /dev/null +++ b/eval-queue/notstoic/pygmalion-13b-4bit-128g_eval_request_False_False_False.json @@ -0,0 +1 @@ +{"model": "notstoic/pygmalion-13b-4bit-128g", "base_model": "", "revision": "main", "private": false, "status": "FAILED_2", "job_id": "176751", "weight_type": "Original", "precision": "float16", "license": "other", "likes": 134, "params": 12.852} \ No newline at end of file diff --git a/eval-queue/oh-yeontaek/llama-2-13B-LoRA-assemble_eval_request_False_float16_Original.json b/eval-queue/oh-yeontaek/llama-2-13B-LoRA-assemble_eval_request_False_float16_Original.json new file mode 100644 index 0000000000000000000000000000000000000000..025fa22a7fc8cc6f8b8b6203d7a4d2b214c5237d --- /dev/null +++ b/eval-queue/oh-yeontaek/llama-2-13B-LoRA-assemble_eval_request_False_float16_Original.json @@ -0,0 +1 @@ +{"model": "oh-yeontaek/llama-2-13B-LoRA-assemble", "base_model": "", "revision": "main", "private": false, "precision": "float16", "weight_type": "Original", "status": "FINISHED", "submitted_time": "2023-10-16T13:00:29Z", "model_type": "\ud83d\udd36 : fine-tuned", "job_id": "522218", "license": "?", "likes": 7, "params": 12.852} \ No newline at end of file diff --git a/eval-queue/oh-yeontaek/llama-2-70B-LoRA-assemble-v2_eval_request_False_float16_Original.json b/eval-queue/oh-yeontaek/llama-2-70B-LoRA-assemble-v2_eval_request_False_float16_Original.json new file mode 100644 index 0000000000000000000000000000000000000000..b0e8130bd5c7b70b75f1badff6da97155a692fac --- /dev/null +++ b/eval-queue/oh-yeontaek/llama-2-70B-LoRA-assemble-v2_eval_request_False_float16_Original.json @@ -0,0 +1 @@ +{"model": "oh-yeontaek/llama-2-70B-LoRA-assemble-v2", "base_model": "", "revision": "main", "private": false, "precision": "float16", "weight_type": "Original", "status": "FINISHED", "submitted_time": "2023-11-06T10:31:15Z", "model_type": "\ud83d\udd36 : fine-tuned", "job_id": "632843", "license": "?", "likes": 2, "params": 68.715} \ No newline at end of file diff --git 
a/eval-queue/oh-yeontaek/llama-2-70B-LoRA-assemble_eval_request_False_float16_Original.json b/eval-queue/oh-yeontaek/llama-2-70B-LoRA-assemble_eval_request_False_float16_Original.json new file mode 100644 index 0000000000000000000000000000000000000000..b3e8ad80eac80f15cdf22ba452ca61a820dde121 --- /dev/null +++ b/eval-queue/oh-yeontaek/llama-2-70B-LoRA-assemble_eval_request_False_float16_Original.json @@ -0,0 +1 @@ +{"model": "oh-yeontaek/llama-2-70B-LoRA-assemble", "base_model": "", "revision": "main", "private": false, "precision": "float16", "weight_type": "Original", "status": "FINISHED", "submitted_time": "2023-10-16T13:00:29Z", "model_type": "\ud83d\udd36 : fine-tuned", "job_id": "522386", "params": 70.0, "license": "?", "likes": 0} \ No newline at end of file diff --git a/eval-queue/oh-yeontaek/llama-2-7B-LoRA-assemble_eval_request_False_float16_Original.json b/eval-queue/oh-yeontaek/llama-2-7B-LoRA-assemble_eval_request_False_float16_Original.json new file mode 100644 index 0000000000000000000000000000000000000000..8840d7dde930191e2702293750f2680c2a844851 --- /dev/null +++ b/eval-queue/oh-yeontaek/llama-2-7B-LoRA-assemble_eval_request_False_float16_Original.json @@ -0,0 +1 @@ +{"model": "oh-yeontaek/llama-2-7B-LoRA-assemble", "base_model": "", "revision": "main", "private": false, "precision": "float16", "weight_type": "Original", "status": "FINISHED", "submitted_time": "2023-10-16T12:48:18Z", "model_type": "\ud83d\udd36 : fine-tuned", "job_id": "517373", "license": "?", "likes": 4, "params": 6.607} \ No newline at end of file diff --git a/eval-queue/one-man-army/una-neural-chat-v3-3-P1-OMA_eval_request_False_bfloat16_Original.json b/eval-queue/one-man-army/una-neural-chat-v3-3-P1-OMA_eval_request_False_bfloat16_Original.json new file mode 100644 index 0000000000000000000000000000000000000000..02516c368c30fbf28228b18bfcadd1152d32b353 --- /dev/null +++ b/eval-queue/one-man-army/una-neural-chat-v3-3-P1-OMA_eval_request_False_bfloat16_Original.json @@ -0,0 +1,16 @@ +{ + "model": "one-man-army/una-neural-chat-v3-3-P1-OMA", + "base_model": "", + "revision": "main", + "private": false, + "precision": "bfloat16", + "weight_type": "Original", + "status": "FINISHED", + "submitted_time": "2023-12-12T09:29:34Z", + "model_type": "\ud83d\udd36 : fine-tuned", + "likes": 0, + "params": 7.242, + "license": "apache-2.0", + "job_id": "901049", + "job_start_time": "2023-12-12T09:30:11.639283" +} \ No newline at end of file diff --git a/eval-queue/one-man-army/una-neural-chat-v3-3-P1-OMA_eval_request_False_float16_Original.json b/eval-queue/one-man-army/una-neural-chat-v3-3-P1-OMA_eval_request_False_float16_Original.json new file mode 100644 index 0000000000000000000000000000000000000000..3c635e93bdf6c3a5dd72e83b5fa0ec5069d62d2e --- /dev/null +++ b/eval-queue/one-man-army/una-neural-chat-v3-3-P1-OMA_eval_request_False_float16_Original.json @@ -0,0 +1,16 @@ +{ + "model": "one-man-army/una-neural-chat-v3-3-P1-OMA", + "base_model": "", + "revision": "main", + "private": false, + "precision": "float16", + "weight_type": "Original", + "status": "FAILED", + "submitted_time": "2023-12-12T08:28:35Z", + "model_type": "\ud83d\udfe6 : RL-tuned", + "likes": 0, + "params": 7.242, + "license": "apache-2.0", + "job_id": "901019", + "job_start_time": "2023-12-12T09:10:40.881617" +} \ No newline at end of file diff --git a/eval-queue/one-man-army/una-neural-chat-v3-3-P2-OMA_eval_request_False_bfloat16_Original.json b/eval-queue/one-man-army/una-neural-chat-v3-3-P2-OMA_eval_request_False_bfloat16_Original.json new file 
mode 100644 index 0000000000000000000000000000000000000000..0fb383b7eb8bf1ec09be5b7a8b62258f95824838 --- /dev/null +++ b/eval-queue/one-man-army/una-neural-chat-v3-3-P2-OMA_eval_request_False_bfloat16_Original.json @@ -0,0 +1,16 @@ +{ + "model": "one-man-army/una-neural-chat-v3-3-P2-OMA", + "base_model": "", + "revision": "main", + "private": false, + "precision": "bfloat16", + "weight_type": "Original", + "status": "FINISHED", + "submitted_time": "2023-12-12T15:46:01Z", + "model_type": "\ud83d\udd36 : fine-tuned", + "likes": 0, + "params": 7.242, + "license": "apache-2.0", + "job_id": "902888", + "job_start_time": "2023-12-12T15:48:30.377006" +} \ No newline at end of file diff --git a/eval-queue/one-man-army/una-neural-chat-v3-3-P2-OMA_eval_request_False_float16_Original.json b/eval-queue/one-man-army/una-neural-chat-v3-3-P2-OMA_eval_request_False_float16_Original.json new file mode 100644 index 0000000000000000000000000000000000000000..dfb7d0189f20e374dedf8012f11e0729adba4010 --- /dev/null +++ b/eval-queue/one-man-army/una-neural-chat-v3-3-P2-OMA_eval_request_False_float16_Original.json @@ -0,0 +1,16 @@ +{ + "model": "one-man-army/una-neural-chat-v3-3-P2-OMA", + "base_model": "", + "revision": "main", + "private": false, + "precision": "float16", + "weight_type": "Original", + "status": "FINISHED", + "submitted_time": "2023-12-12T15:46:07Z", + "model_type": "\ud83d\udd36 : fine-tuned", + "likes": 0, + "params": 7.242, + "license": "apache-2.0", + "job_id": "902891", + "job_start_time": "2023-12-12T15:50:21.840084" +} \ No newline at end of file diff --git a/eval-queue/openbmb/UltraLM-13b-v2.0_eval_request_False_float16_Original.json b/eval-queue/openbmb/UltraLM-13b-v2.0_eval_request_False_float16_Original.json new file mode 100644 index 0000000000000000000000000000000000000000..a04fd9906e2831b3ce027768ba714670ee71c132 --- /dev/null +++ b/eval-queue/openbmb/UltraLM-13b-v2.0_eval_request_False_float16_Original.json @@ -0,0 +1 @@ +{"model": "openbmb/UltraLM-13b-v2.0", "base_model": "", "revision": "main", "private": false, "precision": "float16", "weight_type": "Original", "status": "FINISHED", "submitted_time": "2023-10-16T12:48:18Z", "model_type": "\ud83d\udd36 : fine-tuned", "job_id": "517457", "license": "mit", "likes": 2, "params": 12.852} \ No newline at end of file diff --git a/eval-queue/openbmb/UltraLM-13b_eval_request_False_float16_Original.json b/eval-queue/openbmb/UltraLM-13b_eval_request_False_float16_Original.json new file mode 100644 index 0000000000000000000000000000000000000000..791ba36cf1804277d38756c48cb468a036fb9c0c --- /dev/null +++ b/eval-queue/openbmb/UltraLM-13b_eval_request_False_float16_Original.json @@ -0,0 +1 @@ +{"model": "openbmb/UltraLM-13b", "base_model": "", "revision": "main", "private": false, "precision": "float16", "weight_type": "Original", "status": "FINISHED", "submitted_time": "2023-10-16T12:58:30Z", "model_type": "fine-tuned", "job_id": "522154", "license": "?", "likes": 67, "params": 12.852} \ No newline at end of file diff --git a/eval-queue/openbmb/UltraLM-65b_eval_request_False_False_True.json b/eval-queue/openbmb/UltraLM-65b_eval_request_False_False_True.json new file mode 100644 index 0000000000000000000000000000000000000000..67ed60a078accaa8e3c3e8106984468152701a45 --- /dev/null +++ b/eval-queue/openbmb/UltraLM-65b_eval_request_False_False_True.json @@ -0,0 +1 @@ +{"model": "openbmb/UltraLM-65b", "base_model": "huggyllama/llama-65b", "revision": "main", "private": false, "status": "FINISHED", "submitted_time": "2023-09-09T10:52:17Z", 
"weight_type": "Delta", "precision": "float16", "job_id": "472645", "license": "?", "likes": 6, "params": 65.024} \ No newline at end of file diff --git a/eval-queue/openbmb/UltraLM-65b_eval_request_False_float16_Delta.json b/eval-queue/openbmb/UltraLM-65b_eval_request_False_float16_Delta.json new file mode 100644 index 0000000000000000000000000000000000000000..544601cb5478356dd5d0a9a525399478d1374031 --- /dev/null +++ b/eval-queue/openbmb/UltraLM-65b_eval_request_False_float16_Delta.json @@ -0,0 +1 @@ +{"model": "openbmb/UltraLM-65b", "base_model": "huggyllama/llama-65b", "revision": "main", "private": false, "precision": "float16", "weight_type": "Delta", "status": "FINISHED", "submitted_time": "2023-09-09T10:52:17Z", "model_type": "fine-tuned", "job_id": "471808", "license": "?", "likes": 6, "params": 65.024} \ No newline at end of file diff --git a/eval-queue/openbmb/UltraLM-65b_eval_request_False_float16_Original.json b/eval-queue/openbmb/UltraLM-65b_eval_request_False_float16_Original.json new file mode 100644 index 0000000000000000000000000000000000000000..98c14c6defe14ad77c8b9d319c6f23e29acfafc7 --- /dev/null +++ b/eval-queue/openbmb/UltraLM-65b_eval_request_False_float16_Original.json @@ -0,0 +1 @@ +{"model": "openbmb/UltraLM-65b", "base_model": "", "revision": "main", "private": false, "precision": "float16", "weight_type": "Original", "status": "FINISHED", "submitted_time": "2023-09-09T10:52:17Z", "model_type": "fine-tuned", "job_id": "461466", "license": "?", "likes": 6, "params": 65.024} \ No newline at end of file diff --git a/eval-queue/openbmb/UltraRM-13b_eval_request_False_float16_Original.json b/eval-queue/openbmb/UltraRM-13b_eval_request_False_float16_Original.json new file mode 100644 index 0000000000000000000000000000000000000000..99ac44b69ae5d4f7507a91284d17dc0eec346200 --- /dev/null +++ b/eval-queue/openbmb/UltraRM-13b_eval_request_False_float16_Original.json @@ -0,0 +1 @@ +{"model": "openbmb/UltraRM-13b", "base_model": "", "revision": "main", "private": false, "precision": "float16", "weight_type": "Original", "status": "FINISHED", "submitted_time": "2023-10-16T12:48:18Z", "model_type": "\ud83d\udd36 : fine-tuned", "job_id": "515767", "license": "mit", "likes": 8, "params": 12.852} \ No newline at end of file diff --git a/eval-queue/openthaigpt/openthaigpt-1.0.0-alpha-7b-chat-ckpt-hf_eval_request_False_float16_Original.json b/eval-queue/openthaigpt/openthaigpt-1.0.0-alpha-7b-chat-ckpt-hf_eval_request_False_float16_Original.json new file mode 100644 index 0000000000000000000000000000000000000000..b222586d5c7df543fb8c1030b25e90bb5faf1b37 --- /dev/null +++ b/eval-queue/openthaigpt/openthaigpt-1.0.0-alpha-7b-chat-ckpt-hf_eval_request_False_float16_Original.json @@ -0,0 +1 @@ +{"model": "openthaigpt/openthaigpt-1.0.0-alpha-7b-chat-ckpt-hf", "base_model": "", "revision": "main", "private": false, "precision": "float16", "weight_type": "Original", "status": "FINISHED", "submitted_time": "2023-09-09T10:52:17Z", "model_type": "\u2b55 : instruction-tuned", "job_id": "472342", "license": "apache-2.0", "likes": 1, "params": 6.607} \ No newline at end of file diff --git a/eval-queue/pansophic/hari_eval_request_False_bfloat16_Original.json b/eval-queue/pansophic/hari_eval_request_False_bfloat16_Original.json new file mode 100644 index 0000000000000000000000000000000000000000..53f4702afd44b5cde0d51fb39546aa8a5c2457df --- /dev/null +++ b/eval-queue/pansophic/hari_eval_request_False_bfloat16_Original.json @@ -0,0 +1,16 @@ +{ + "model": "pansophic/hari", + "base_model": "", + 
"revision": "main", + "private": false, + "precision": "bfloat16", + "weight_type": "Original", + "status": "FAILED", + "submitted_time": "2023-11-29T12:13:28Z", + "model_type": "\ud83d\udd36 : fine-tuned", + "likes": 0, + "params": 0, + "license": "apache-2.0", + "job_id": "810285", + "job_start_time": "2023-11-29T14:32:29.015811" +} \ No newline at end of file diff --git a/eval-queue/pansophic/hari_eval_request_False_float16_Original.json b/eval-queue/pansophic/hari_eval_request_False_float16_Original.json new file mode 100644 index 0000000000000000000000000000000000000000..84b71703a39e908b41ea31b652996700ab1eb221 --- /dev/null +++ b/eval-queue/pansophic/hari_eval_request_False_float16_Original.json @@ -0,0 +1,16 @@ +{ + "model": "pansophic/hari", + "base_model": "", + "revision": "main", + "private": false, + "precision": "float16", + "weight_type": "Original", + "status": "FAILED", + "submitted_time": "2023-11-29T12:14:18Z", + "model_type": "\ud83d\udd36 : fine-tuned", + "likes": 0, + "params": 0, + "license": "apache-2.0", + "job_id": "845715", + "job_start_time": "2023-12-04T09:02:06.066790" +} \ No newline at end of file diff --git a/eval-queue/prithivida/Asimov-7B-v1_eval_request_False_4bit_Adapter.json b/eval-queue/prithivida/Asimov-7B-v1_eval_request_False_4bit_Adapter.json new file mode 100644 index 0000000000000000000000000000000000000000..e677470d44adb210bc76b3fedcf505a4fc818d97 --- /dev/null +++ b/eval-queue/prithivida/Asimov-7B-v1_eval_request_False_4bit_Adapter.json @@ -0,0 +1 @@ +{"model": "prithivida/Asimov-7B-v1", "base_model": "mistralai/Mistral-7B-v0.1", "revision": "main", "private": false, "precision": "4bit", "weight_type": "Adapter", "status": "FINISHED", "submitted_time": "2023-11-17T13:32:19Z", "model_type": "\ud83d\udd36 : fine-tuned", "likes": 0, "params": 7.0, "license": "mit", "job_id": "700929"} \ No newline at end of file diff --git a/eval-queue/prithivida/Asimov-7B-v2_eval_request_False_4bit_Adapter.json b/eval-queue/prithivida/Asimov-7B-v2_eval_request_False_4bit_Adapter.json new file mode 100644 index 0000000000000000000000000000000000000000..8f5078d40abdac08801e19c6fa08016d3da66897 --- /dev/null +++ b/eval-queue/prithivida/Asimov-7B-v2_eval_request_False_4bit_Adapter.json @@ -0,0 +1,16 @@ +{ + "model": "prithivida/Asimov-7B-v2", + "base_model": "mistralai/Mistral-7B-v0.1", + "revision": "main", + "private": false, + "precision": "4bit", + "weight_type": "Adapter", + "status": "FINISHED", + "submitted_time": "2023-11-28T15:16:03Z", + "model_type": "\ud83d\udd36 : fine-tuned", + "likes": 0, + "params": 7.0, + "license": "mit", + "job_id": "805035", + "job_start_time": "2023-11-28T15:17:27.539449" +} \ No newline at end of file diff --git a/eval-queue/s1ghhh/medllama-2-70b-qlora-1.1_eval_request_False_4bit_Adapter.json b/eval-queue/s1ghhh/medllama-2-70b-qlora-1.1_eval_request_False_4bit_Adapter.json new file mode 100644 index 0000000000000000000000000000000000000000..3d26e7e1e2571a73ad093ccd908f8e554a38a1e5 --- /dev/null +++ b/eval-queue/s1ghhh/medllama-2-70b-qlora-1.1_eval_request_False_4bit_Adapter.json @@ -0,0 +1 @@ +{"model": "s1ghhh/medllama-2-70b-qlora-1.1", "base_model": "NousResearch/Llama-2-70b-hf", "revision": "main", "private": false, "precision": "4bit", "weight_type": "Adapter", "status": "FAILED", "submitted_time": "2023-09-25T07:24:27Z", "model_type": "\ud83d\udd36 : fine-tuned", "job_id": "479490", "license": "llama2", "likes": 1, "params": 70.0} \ No newline at end of file diff --git 
a/eval-queue/s1ghhh/medllama-2-70b-qlora-1.1_eval_request_False_bfloat16_Adapter.json b/eval-queue/s1ghhh/medllama-2-70b-qlora-1.1_eval_request_False_bfloat16_Adapter.json new file mode 100644 index 0000000000000000000000000000000000000000..d9c362ea960224cdc693bea748609a3b4ec89bcc --- /dev/null +++ b/eval-queue/s1ghhh/medllama-2-70b-qlora-1.1_eval_request_False_bfloat16_Adapter.json @@ -0,0 +1 @@ +{"model": "s1ghhh/medllama-2-70b-qlora-1.1", "base_model": "NousResearch/Llama-2-70b-hf", "revision": "main", "private": false, "precision": "bfloat16", "weight_type": "Adapter", "status": "FAILED", "submitted_time": "2023-10-04T12:54:05Z", "model_type": "\ud83d\udd36 : fine-tuned", "job_id": "486678", "license": "llama2", "likes": 1, "params": 70.0} \ No newline at end of file diff --git a/eval-queue/s1ghhh/medllama-2-70b-qlora-1.1_eval_request_False_float16_Adapter.json b/eval-queue/s1ghhh/medllama-2-70b-qlora-1.1_eval_request_False_float16_Adapter.json new file mode 100644 index 0000000000000000000000000000000000000000..128c5ad802ebb7a1363b1f2c41372c5b82f3296e --- /dev/null +++ b/eval-queue/s1ghhh/medllama-2-70b-qlora-1.1_eval_request_False_float16_Adapter.json @@ -0,0 +1 @@ +{"model": "s1ghhh/medllama-2-70b-qlora-1.1", "base_model": "NousResearch/Llama-2-70b-hf", "revision": "main", "private": false, "precision": "float16", "weight_type": "Adapter", "status": "FINISHED", "submitted_time": "2023-10-16T13:00:29Z", "model_type": "\ud83d\udd36 : fine-tuned", "job_id": "522204", "license": "llama2", "likes": 1, "params": 70.0} \ No newline at end of file diff --git a/eval-queue/s1ghhh/medllama-2-70b-qlora-4bit_eval_request_False_4bit_Adapter.json b/eval-queue/s1ghhh/medllama-2-70b-qlora-4bit_eval_request_False_4bit_Adapter.json new file mode 100644 index 0000000000000000000000000000000000000000..80e0ad27cec88fc8648b1d7b7cd7d963fcb3a25a --- /dev/null +++ b/eval-queue/s1ghhh/medllama-2-70b-qlora-4bit_eval_request_False_4bit_Adapter.json @@ -0,0 +1 @@ +{"model": "s1ghhh/medllama-2-70b-qlora-4bit", "base_model": "NousResearch/Llama-2-70b-hf", "revision": "main", "private": false, "precision": "4bit", "weight_type": "Adapter", "status": "FAILED", "submitted_time": "2023-10-04T16:17:03Z", "model_type": "\ud83d\udd36 : fine-tuned", "job_id": "486798", "license": "apache-2.0", "likes": 0, "params": 70.0} \ No newline at end of file diff --git a/eval-queue/s1ghhh/medllama-2-70b-qlora-4bit_eval_request_False_float16_Adapter.json b/eval-queue/s1ghhh/medllama-2-70b-qlora-4bit_eval_request_False_float16_Adapter.json new file mode 100644 index 0000000000000000000000000000000000000000..98fb20c71f1535f7c0db9da54ea811f1310ec1d1 --- /dev/null +++ b/eval-queue/s1ghhh/medllama-2-70b-qlora-4bit_eval_request_False_float16_Adapter.json @@ -0,0 +1 @@ +{"model": "s1ghhh/medllama-2-70b-qlora-4bit", "base_model": "NousResearch/Llama-2-70b-hf", "revision": "main", "private": false, "precision": "float16", "weight_type": "Adapter", "status": "FAILED", "submitted_time": "2023-10-04T16:16:55Z", "model_type": "\ud83d\udd36 : fine-tuned", "job_id": "486792", "license": "apache-2.0", "likes": 0, "params": 70.0} \ No newline at end of file diff --git a/eval-queue/sachith-surge/open-llama-v2-lamini-orca-evol-qlora-checkpoin_eval_request_False_4bit_Adapter.json b/eval-queue/sachith-surge/open-llama-v2-lamini-orca-evol-qlora-checkpoin_eval_request_False_4bit_Adapter.json new file mode 100644 index 0000000000000000000000000000000000000000..cbff70e6a1966d6a3ac3d081094055571d465d28 --- /dev/null +++ 
b/eval-queue/sachith-surge/open-llama-v2-lamini-orca-evol-qlora-checkpoin_eval_request_False_4bit_Adapter.json @@ -0,0 +1 @@ +{"model": "sachith-surge/open-llama-v2-lamini-orca-evol-qlora-checkpoin", "base_model": "openlm-research/open_llama_3b_v2", "revision": "main", "private": false, "precision": "4bit", "weight_type": "Adapter", "status": "FAILED", "submitted_time": "2023-09-22T04:20:04Z", "model_type": "\ud83d\udd36 : fine-tuned", "job_id": "477412", "params": 0, "license": "?", "likes": 0} \ No newline at end of file diff --git a/eval-queue/sachith-surge/open-llama-v2-lamini-orca-evol-qlora-checkpoint-safetensors_eval_request_False_4bit_Original.json b/eval-queue/sachith-surge/open-llama-v2-lamini-orca-evol-qlora-checkpoint-safetensors_eval_request_False_4bit_Original.json new file mode 100644 index 0000000000000000000000000000000000000000..8e01f427ae4b6acd8def1594f220bce81e526cd1 --- /dev/null +++ b/eval-queue/sachith-surge/open-llama-v2-lamini-orca-evol-qlora-checkpoint-safetensors_eval_request_False_4bit_Original.json @@ -0,0 +1 @@ +{"model": "sachith-surge/open-llama-v2-lamini-orca-evol-qlora-checkpoint-safetensors", "base_model": "", "revision": "main", "private": false, "precision": "4bit", "weight_type": "Original", "status": "FAILED", "submitted_time": "2023-10-23T12:17:43Z", "model_type": "\ud83d\udd36 : fine-tuned", "likes": 0, "params": 3.426, "license": "apache-2.0", "job_id": "649431"} \ No newline at end of file diff --git a/eval-queue/sachith-surge/open-llama-v2-lamini-orca-evol-qlora-checkpoint-safetensors_eval_request_False_float16_Original.json b/eval-queue/sachith-surge/open-llama-v2-lamini-orca-evol-qlora-checkpoint-safetensors_eval_request_False_float16_Original.json new file mode 100644 index 0000000000000000000000000000000000000000..44a4d4a8775458bc4d4b03c692ea3fff0bd803df --- /dev/null +++ b/eval-queue/sachith-surge/open-llama-v2-lamini-orca-evol-qlora-checkpoint-safetensors_eval_request_False_float16_Original.json @@ -0,0 +1 @@ +{"model": "sachith-surge/open-llama-v2-lamini-orca-evol-qlora-checkpoint-safetensors", "base_model": "", "revision": "main", "private": false, "precision": "float16", "weight_type": "Original", "status": "FAILED", "submitted_time": "2023-10-23T12:17:39Z", "model_type": "\ud83d\udd36 : fine-tuned", "likes": 0, "params": 3.426, "license": "apache-2.0", "job_id": "649426"} \ No newline at end of file diff --git a/eval-queue/sachith-surge/open-llama-v2-lamini-orca-evol-qlora-checkpoint_eval_request_False_4bit_Adapter.json b/eval-queue/sachith-surge/open-llama-v2-lamini-orca-evol-qlora-checkpoint_eval_request_False_4bit_Adapter.json new file mode 100644 index 0000000000000000000000000000000000000000..2aa932c4b5673b4ee975b79f7bb38d8e903dfb39 --- /dev/null +++ b/eval-queue/sachith-surge/open-llama-v2-lamini-orca-evol-qlora-checkpoint_eval_request_False_4bit_Adapter.json @@ -0,0 +1 @@ +{"model": "sachith-surge/open-llama-v2-lamini-orca-evol-qlora-checkpoint", "base_model": "openlm-research/open_llama_3b_v2", "revision": "main", "private": false, "precision": "4bit", "weight_type": "Adapter", "status": "FAILED", "submitted_time": "2023-09-22T04:19:45Z", "model_type": "\ud83d\udd36 : fine-tuned", "job_id": "477410", "license": "?", "likes": 0, "params": 0} \ No newline at end of file diff --git a/eval-queue/scales-okn/docket-language-model_eval_request_False_float16_Original.json b/eval-queue/scales-okn/docket-language-model_eval_request_False_float16_Original.json new file mode 100644 index 
0000000000000000000000000000000000000000..0be2936a0b67551817a32c39c1ae8bb1689a165e --- /dev/null +++ b/eval-queue/scales-okn/docket-language-model_eval_request_False_float16_Original.json @@ -0,0 +1 @@ +{"model": "scales-okn/docket-language-model", "base_model": "", "revision": "main", "private": false, "precision": "float16", "weight_type": "Original", "status": "FAILED", "submitted_time": "2023-09-18T19:08:18Z", "model_type": "\ud83d\udfe2 : pretrained", "job_id": "470001", "license": "?", "likes": 0, "params": 0.434} \ No newline at end of file diff --git a/eval-queue/teknium/CollectiveCognition-v1-Mistral-7B_eval_request_False_bfloat16_Original.json b/eval-queue/teknium/CollectiveCognition-v1-Mistral-7B_eval_request_False_bfloat16_Original.json new file mode 100644 index 0000000000000000000000000000000000000000..c5a958fd6f3ecb73017306c4e56088724143a76b --- /dev/null +++ b/eval-queue/teknium/CollectiveCognition-v1-Mistral-7B_eval_request_False_bfloat16_Original.json @@ -0,0 +1 @@ +{"model": "teknium/CollectiveCognition-v1-Mistral-7B", "base_model": "", "revision": "main", "private": false, "precision": "bfloat16", "weight_type": "Original", "status": "FINISHED", "submitted_time": "2023-10-16T13:19:55Z", "model_type": "\u2b55 : instruction-tuned", "job_id": "522782", "license": "apache-2.0", "likes": 4, "params": 7.111} \ No newline at end of file diff --git a/eval-queue/teknium/CollectiveCognition-v1.1-Mistral-7B_eval_request_False_bfloat16_Original.json b/eval-queue/teknium/CollectiveCognition-v1.1-Mistral-7B_eval_request_False_bfloat16_Original.json new file mode 100644 index 0000000000000000000000000000000000000000..763c2cc686b166acb7d12b89220dcf389b14600f --- /dev/null +++ b/eval-queue/teknium/CollectiveCognition-v1.1-Mistral-7B_eval_request_False_bfloat16_Original.json @@ -0,0 +1 @@ +{"model": "teknium/CollectiveCognition-v1.1-Mistral-7B", "base_model": "", "revision": "main", "private": false, "precision": "bfloat16", "weight_type": "Original", "status": "FINISHED", "submitted_time": "2023-10-16T12:48:18Z", "model_type": "\u2b55 : instruction-tuned", "job_id": "517130", "license": "apache-2.0", "likes": 45, "params": 7.111} \ No newline at end of file diff --git a/eval-queue/teknium/CollectiveCognition-v1.1-Mistral-7B_eval_request_False_float16_Original.json b/eval-queue/teknium/CollectiveCognition-v1.1-Mistral-7B_eval_request_False_float16_Original.json new file mode 100644 index 0000000000000000000000000000000000000000..deb2a5a01cd9215d047a1561f4c06975d22db5a7 --- /dev/null +++ b/eval-queue/teknium/CollectiveCognition-v1.1-Mistral-7B_eval_request_False_float16_Original.json @@ -0,0 +1 @@ +{"model": "teknium/CollectiveCognition-v1.1-Mistral-7B", "base_model": "", "revision": "main", "private": false, "precision": "float16", "weight_type": "Original", "status": "FINISHED", "submitted_time": "2023-10-05T10:21:50Z", "model_type": "\ud83d\udd36 : fine-tuned", "job_id": "642668", "license": "apache-2.0", "likes": 45, "params": 7.111} \ No newline at end of file diff --git a/eval-queue/teknium/Mistral-Trismegistus-7B_eval_request_False_bfloat16_Original.json b/eval-queue/teknium/Mistral-Trismegistus-7B_eval_request_False_bfloat16_Original.json new file mode 100644 index 0000000000000000000000000000000000000000..e31450049a8fed2c2e5ad51f923d1bbfcff1de6b --- /dev/null +++ b/eval-queue/teknium/Mistral-Trismegistus-7B_eval_request_False_bfloat16_Original.json @@ -0,0 +1 @@ +{"model": "teknium/Mistral-Trismegistus-7B", "base_model": "", "revision": "main", "private": false, "precision": 
"bfloat16", "weight_type": "Original", "status": "FINISHED", "submitted_time": "2023-10-16T12:48:18Z", "model_type": "\u2b55 : instruction-tuned", "job_id": "517495", "license": "apache-2.0", "likes": 59, "params": 7.111} \ No newline at end of file diff --git a/eval-queue/teknium/OpenHermes-13B_eval_request_False_bfloat16_Original.json b/eval-queue/teknium/OpenHermes-13B_eval_request_False_bfloat16_Original.json new file mode 100644 index 0000000000000000000000000000000000000000..176b880557bec380065395317ce963e22a0b2d51 --- /dev/null +++ b/eval-queue/teknium/OpenHermes-13B_eval_request_False_bfloat16_Original.json @@ -0,0 +1 @@ +{"model": "teknium/OpenHermes-13B", "base_model": "", "revision": "main", "private": false, "precision": "bfloat16", "weight_type": "Original", "status": "FINISHED", "submitted_time": "2023-10-16T12:54:17Z", "model_type": "\ud83d\udd36 : fine-tuned", "job_id": "518324", "license": "mit", "likes": 28, "params": 12.852} \ No newline at end of file diff --git a/eval-queue/teknium/OpenHermes-13B_eval_request_False_float16_Original.json b/eval-queue/teknium/OpenHermes-13B_eval_request_False_float16_Original.json new file mode 100644 index 0000000000000000000000000000000000000000..49f22458930c3b2b4e363df36e9d19fade304ee2 --- /dev/null +++ b/eval-queue/teknium/OpenHermes-13B_eval_request_False_float16_Original.json @@ -0,0 +1 @@ +{"model": "teknium/OpenHermes-13B", "base_model": "", "revision": "main", "private": false, "precision": "float16", "weight_type": "Original", "status": "FINISHED", "submitted_time": "2023-10-16T12:48:18Z", "model_type": "\ud83d\udd36 : fine-tuned", "job_id": "516956", "license": "mit", "likes": 28, "params": 12.852} \ No newline at end of file diff --git a/eval-queue/teknium/OpenHermes-2-Mistral-7B_eval_request_False_float16_Original.json b/eval-queue/teknium/OpenHermes-2-Mistral-7B_eval_request_False_float16_Original.json new file mode 100644 index 0000000000000000000000000000000000000000..a5d053a99609e2f33b0320f573ac6ceed55dfbeb --- /dev/null +++ b/eval-queue/teknium/OpenHermes-2-Mistral-7B_eval_request_False_float16_Original.json @@ -0,0 +1 @@ +{"model": "teknium/OpenHermes-2-Mistral-7B", "base_model": "", "revision": "main", "private": false, "precision": "float16", "weight_type": "Original", "status": "FINISHED", "submitted_time": "2023-11-14T14:47:29Z", "model_type": "\u2b55 : instruction-tuned", "likes": 197, "params": 7.0, "license": "apache-2.0", "job_id": "651425"} \ No newline at end of file diff --git a/eval-queue/teknium/OpenHermes-2.5-Mistral-7B_eval_request_False_bfloat16_Original.json b/eval-queue/teknium/OpenHermes-2.5-Mistral-7B_eval_request_False_bfloat16_Original.json new file mode 100644 index 0000000000000000000000000000000000000000..1c70f4c5bcfb29486f575d46fcbbeaa7c659018f --- /dev/null +++ b/eval-queue/teknium/OpenHermes-2.5-Mistral-7B_eval_request_False_bfloat16_Original.json @@ -0,0 +1 @@ +{"model": "teknium/OpenHermes-2.5-Mistral-7B", "base_model": "", "revision": "main", "private": false, "precision": "bfloat16", "weight_type": "Original", "status": "FINISHED", "submitted_time": "2023-11-17T19:50:31Z", "model_type": "\ud83d\udd36 : fine-tuned", "likes": 125, "params": 7.0, "license": "apache-2.0", "job_id": "702657"} \ No newline at end of file diff --git a/eval-queue/teknium/OpenHermes-2.5-Mistral-7B_eval_request_False_float16_Original.json b/eval-queue/teknium/OpenHermes-2.5-Mistral-7B_eval_request_False_float16_Original.json new file mode 100644 index 
0000000000000000000000000000000000000000..f58976dcce36111ca8fa12c165adaaa61638812d --- /dev/null +++ b/eval-queue/teknium/OpenHermes-2.5-Mistral-7B_eval_request_False_float16_Original.json @@ -0,0 +1 @@ +{"model": "teknium/OpenHermes-2.5-Mistral-7B", "base_model": "", "revision": "main", "private": false, "precision": "float16", "weight_type": "Original", "status": "FINISHED", "submitted_time": "2023-11-14T15:14:43Z", "model_type": "\ud83d\udd36 : fine-tuned", "likes": 115, "params": 7.0, "license": "apache-2.0", "job_id": "651458"} \ No newline at end of file diff --git a/eval-queue/teknium/OpenHermes-7B_eval_request_False_bfloat16_Original.json b/eval-queue/teknium/OpenHermes-7B_eval_request_False_bfloat16_Original.json new file mode 100644 index 0000000000000000000000000000000000000000..7cb03167619776a8e620ddb2d404b6745d3e107b --- /dev/null +++ b/eval-queue/teknium/OpenHermes-7B_eval_request_False_bfloat16_Original.json @@ -0,0 +1 @@ +{"model": "teknium/OpenHermes-7B", "base_model": "", "revision": "main", "private": false, "precision": "bfloat16", "weight_type": "Original", "status": "FINISHED", "submitted_time": "2023-10-16T12:54:17Z", "model_type": "\u2b55 : instruction-tuned", "job_id": "518348", "license": "mit", "likes": 7, "params": 6.607} \ No newline at end of file diff --git a/eval-queue/teknium/airoboros-mistral2.2-7b_eval_request_False_float16_Original.json b/eval-queue/teknium/airoboros-mistral2.2-7b_eval_request_False_float16_Original.json new file mode 100644 index 0000000000000000000000000000000000000000..70481ee1bc454d46b9f19a3a1cdfe3d46b8da120 --- /dev/null +++ b/eval-queue/teknium/airoboros-mistral2.2-7b_eval_request_False_float16_Original.json @@ -0,0 +1 @@ +{"model": "teknium/airoboros-mistral2.2-7b", "base_model": "", "revision": "main", "private": false, "precision": "float16", "weight_type": "Original", "status": "FAILED", "submitted_time": "2023-10-05T15:12:19Z", "model_type": "\u2b55 : instruction-tuned", "job_id": "486840", "license": "mit", "likes": 21, "params": 7.111} \ No newline at end of file diff --git a/eval-queue/thiakx/flan-t5-qlora-financial-phrasebank_eval_request_False_4bit_Adapter.json b/eval-queue/thiakx/flan-t5-qlora-financial-phrasebank_eval_request_False_4bit_Adapter.json new file mode 100644 index 0000000000000000000000000000000000000000..3c77419394d091ac233f96a4ee686644adf28d53 --- /dev/null +++ b/eval-queue/thiakx/flan-t5-qlora-financial-phrasebank_eval_request_False_4bit_Adapter.json @@ -0,0 +1 @@ +{"model": "thiakx/flan-t5-qlora-financial-phrasebank", "base_model": "google/flan-t5-base", "revision": "main", "private": false, "precision": "4bit", "weight_type": "Adapter", "status": "FAILED", "submitted_time": "2023-08-29T14:05:24Z", "model_type": "\ud83d\udd36 : fine-tuned", "job_id": "397073", "license": "?", "likes": 0, "params": 0} \ No newline at end of file diff --git a/eval-queue/tontanwannaphong1998/cat_mm_b11_eval_request_False_float16_Original.json b/eval-queue/tontanwannaphong1998/cat_mm_b11_eval_request_False_float16_Original.json new file mode 100644 index 0000000000000000000000000000000000000000..3ce77ff494e08fcd23f10ee2665e83b92e18ef1f --- /dev/null +++ b/eval-queue/tontanwannaphong1998/cat_mm_b11_eval_request_False_float16_Original.json @@ -0,0 +1 @@ +{"model": "tontanwannaphong1998/cat_mm_b11", "base_model": "", "revision": "main", "private": false, "precision": "float16", "weight_type": "Original", "status": "FAILED", "submitted_time": "2023-09-24T17:15:26Z", "model_type": "\u2b55 : instruction-tuned", "job_id": "479420", 
"params": 0, "license": "?", "likes": 0} \ No newline at end of file diff --git a/eval-queue/tontanwannaphong1998/cat_mm_b12_4_eval_request_False_float16_Original.json b/eval-queue/tontanwannaphong1998/cat_mm_b12_4_eval_request_False_float16_Original.json new file mode 100644 index 0000000000000000000000000000000000000000..832fc604b14764bf3d477e4e1e5edc79e82cc0b5 --- /dev/null +++ b/eval-queue/tontanwannaphong1998/cat_mm_b12_4_eval_request_False_float16_Original.json @@ -0,0 +1 @@ +{"model": "tontanwannaphong1998/cat_mm_b12_4", "base_model": "", "revision": "main", "private": false, "precision": "float16", "weight_type": "Original", "status": "FAILED", "submitted_time": "2023-09-25T10:28:40Z", "model_type": "\u2b55 : instruction-tuned", "job_id": "479495", "params": 0, "license": "?", "likes": 0} \ No newline at end of file diff --git a/eval-queue/tontanwannaphong1998/cat_mm_b12_eval_request_False_float16_Original.json b/eval-queue/tontanwannaphong1998/cat_mm_b12_eval_request_False_float16_Original.json new file mode 100644 index 0000000000000000000000000000000000000000..066e04524af27012348c951ff5c11d8383e322f3 --- /dev/null +++ b/eval-queue/tontanwannaphong1998/cat_mm_b12_eval_request_False_float16_Original.json @@ -0,0 +1 @@ +{"model": "tontanwannaphong1998/cat_mm_b12", "base_model": "", "revision": "main", "private": false, "precision": "float16", "weight_type": "Original", "status": "FAILED", "submitted_time": "2023-09-24T14:16:08Z", "model_type": "\u2b55 : instruction-tuned", "job_id": "479398", "params": 0, "license": "?", "likes": 0} \ No newline at end of file diff --git a/eval-queue/tontanwannaphong1998/cat_mm_b1_eval_request_False_float16_Original.json b/eval-queue/tontanwannaphong1998/cat_mm_b1_eval_request_False_float16_Original.json new file mode 100644 index 0000000000000000000000000000000000000000..d1fbfa26a8480da6146c542dca462d927afb253c --- /dev/null +++ b/eval-queue/tontanwannaphong1998/cat_mm_b1_eval_request_False_float16_Original.json @@ -0,0 +1 @@ +{"model": "tontanwannaphong1998/cat_mm_b1", "base_model": "", "revision": "main", "private": false, "precision": "float16", "weight_type": "Original", "status": "FAILED", "submitted_time": "2023-09-24T14:15:31Z", "model_type": "\ud83d\udfe2 : pretrained", "job_id": "479395", "params": 0, "license": "?", "likes": 0} \ No newline at end of file diff --git a/eval-queue/tontanwannaphong1998/cat_mm_b_eval_request_False_float16_Original.json b/eval-queue/tontanwannaphong1998/cat_mm_b_eval_request_False_float16_Original.json new file mode 100644 index 0000000000000000000000000000000000000000..faf68599c8efc71f9d5891c6f6ff7a675709cc28 --- /dev/null +++ b/eval-queue/tontanwannaphong1998/cat_mm_b_eval_request_False_float16_Original.json @@ -0,0 +1 @@ +{"model": "tontanwannaphong1998/cat_mm_b", "base_model": "", "revision": "main", "private": false, "precision": "float16", "weight_type": "Original", "status": "FAILED", "submitted_time": "2023-09-24T15:44:24Z", "model_type": "\ud83d\udfe2 : pretrained", "job_id": "479404", "params": 0, "license": "?", "likes": 0} \ No newline at end of file diff --git a/eval-queue/uberkie/metharme-1.3b-finetuned_eval_request_False_float16_Original.json b/eval-queue/uberkie/metharme-1.3b-finetuned_eval_request_False_float16_Original.json new file mode 100644 index 0000000000000000000000000000000000000000..dab01ea305ea972c324d06fce38df6bde36cce00 --- /dev/null +++ b/eval-queue/uberkie/metharme-1.3b-finetuned_eval_request_False_float16_Original.json @@ -0,0 +1 @@ +{"model": 
"uberkie/metharme-1.3b-finetuned", "base_model": "metharme-1.3b", "revision": "main", "private": false, "precision": "float16", "weight_type": "Original", "status": "FINISHED", "submitted_time": "2023-10-16T12:54:17Z", "model_type": "\ud83d\udd36 : fine-tuned", "job_id": "518528", "params": 1.3, "license": "?", "likes": 0} \ No newline at end of file diff --git a/eval-queue/uukuguy/CollectiveCognition-v1.1-Mistral-7B-dare-0.85_eval_request_False_float16_Original.json b/eval-queue/uukuguy/CollectiveCognition-v1.1-Mistral-7B-dare-0.85_eval_request_False_float16_Original.json new file mode 100644 index 0000000000000000000000000000000000000000..f935fb971a17d9fe70dd9b3ab8a69c6170827cd8 --- /dev/null +++ b/eval-queue/uukuguy/CollectiveCognition-v1.1-Mistral-7B-dare-0.85_eval_request_False_float16_Original.json @@ -0,0 +1,16 @@ +{ + "model": "uukuguy/CollectiveCognition-v1.1-Mistral-7B-dare-0.85", + "base_model": "", + "revision": "main", + "private": false, + "precision": "float16", + "weight_type": "Original", + "status": "FINISHED", + "submitted_time": "2023-11-22T11:35:45Z", + "model_type": "\ud83d\udd36 : fine-tuned", + "likes": 0, + "params": 7.0, + "license": "llama2", + "job_id": "759147", + "job_start_time": "2023-11-23T13:40:11.114715" +} \ No newline at end of file diff --git a/eval-queue/uukuguy/Mistral-7B-OpenOrca-lora-merged_eval_request_False_bfloat16_Original.json b/eval-queue/uukuguy/Mistral-7B-OpenOrca-lora-merged_eval_request_False_bfloat16_Original.json new file mode 100644 index 0000000000000000000000000000000000000000..311c5c6e6ea16167d6d22c462700a3b81dfb89cc --- /dev/null +++ b/eval-queue/uukuguy/Mistral-7B-OpenOrca-lora-merged_eval_request_False_bfloat16_Original.json @@ -0,0 +1 @@ +{"model": "uukuguy/Mistral-7B-OpenOrca-lora-merged", "base_model": "", "revision": "main", "private": false, "precision": "bfloat16", "weight_type": "Original", "status": "FAILED", "submitted_time": "2023-11-09T17:52:37Z", "model_type": "\ud83d\udd36 : fine-tuned", "likes": 0, "params": 7.0, "license": "llama2", "job_id": "650301"} \ No newline at end of file diff --git a/eval-queue/uukuguy/Mistral-7B-OpenOrca-lora_eval_request_False_bfloat16_Adapter.json b/eval-queue/uukuguy/Mistral-7B-OpenOrca-lora_eval_request_False_bfloat16_Adapter.json new file mode 100644 index 0000000000000000000000000000000000000000..ead1bd9292c2a2e17651e28f736e411835043e49 --- /dev/null +++ b/eval-queue/uukuguy/Mistral-7B-OpenOrca-lora_eval_request_False_bfloat16_Adapter.json @@ -0,0 +1 @@ +{"model": "uukuguy/Mistral-7B-OpenOrca-lora", "base_model": "mistralai/Mistral-7B-v0.1", "revision": "main", "private": false, "precision": "bfloat16", "weight_type": "Adapter", "status": "FINISHED", "submitted_time": "2023-10-16T11:59:27Z", "model_type": "\ud83d\udd36 : fine-tuned", "likes": 0, "params": 7.0, "license": "llama2", "job_id": "649104"} \ No newline at end of file diff --git a/eval-queue/uukuguy/Orca-2-13b-f16_eval_request_False_float16_Original.json b/eval-queue/uukuguy/Orca-2-13b-f16_eval_request_False_float16_Original.json new file mode 100644 index 0000000000000000000000000000000000000000..b36769690aff98d9874cb2aa8cddc9920cf27c83 --- /dev/null +++ b/eval-queue/uukuguy/Orca-2-13b-f16_eval_request_False_float16_Original.json @@ -0,0 +1,16 @@ +{ + "model": "uukuguy/Orca-2-13b-f16", + "base_model": "", + "revision": "main", + "private": false, + "precision": "float16", + "weight_type": "Original", + "status": "FINISHED", + "submitted_time": "2023-11-30T18:45:32Z", + "model_type": "\ud83d\udd36 : fine-tuned", + "likes": 
0, + "params": 13.0, + "license": "llama2", + "job_id": "845991", + "job_start_time": "2023-12-04T13:59:29.886196" +} \ No newline at end of file diff --git a/eval-queue/uukuguy/Orca-2-7b-f16_eval_request_False_float16_Original.json b/eval-queue/uukuguy/Orca-2-7b-f16_eval_request_False_float16_Original.json new file mode 100644 index 0000000000000000000000000000000000000000..b351b8c334ad6d62186ae170923ada50f8530354 --- /dev/null +++ b/eval-queue/uukuguy/Orca-2-7b-f16_eval_request_False_float16_Original.json @@ -0,0 +1,16 @@ +{ + "model": "uukuguy/Orca-2-7b-f16", + "base_model": "", + "revision": "main", + "private": false, + "precision": "float16", + "weight_type": "Original", + "status": "FINISHED", + "submitted_time": "2023-11-22T01:08:46Z", + "model_type": "\ud83d\udfe2 : pretrained", + "likes": 0, + "params": 7.0, + "license": "llama2", + "job_id": "742284", + "job_start_time": null +} \ No newline at end of file diff --git a/eval-queue/uukuguy/SynthIA-7B-v1.3-dare-0.85_eval_request_False_float16_Original.json b/eval-queue/uukuguy/SynthIA-7B-v1.3-dare-0.85_eval_request_False_float16_Original.json new file mode 100644 index 0000000000000000000000000000000000000000..132dce39db59e58310a266c17d7b195344b04213 --- /dev/null +++ b/eval-queue/uukuguy/SynthIA-7B-v1.3-dare-0.85_eval_request_False_float16_Original.json @@ -0,0 +1,16 @@ +{ + "model": "uukuguy/SynthIA-7B-v1.3-dare-0.85", + "base_model": "", + "revision": "main", + "private": false, + "precision": "float16", + "weight_type": "Original", + "status": "FINISHED", + "submitted_time": "2023-11-22T07:08:37Z", + "model_type": "\ud83d\udd36 : fine-tuned", + "likes": 0, + "params": 7.0, + "license": "llama2", + "job_id": "749940", + "job_start_time": null +} \ No newline at end of file diff --git a/eval-queue/uukuguy/airoboros-m-7b-3.1.2-dare-0.85_eval_request_False_float16_Original.json b/eval-queue/uukuguy/airoboros-m-7b-3.1.2-dare-0.85_eval_request_False_float16_Original.json new file mode 100644 index 0000000000000000000000000000000000000000..b17b4db110b1cfccf27c1319ee5f5452f7af5b3e --- /dev/null +++ b/eval-queue/uukuguy/airoboros-m-7b-3.1.2-dare-0.85_eval_request_False_float16_Original.json @@ -0,0 +1,16 @@ +{ + "model": "uukuguy/airoboros-m-7b-3.1.2-dare-0.85", + "base_model": "", + "revision": "main", + "private": false, + "precision": "float16", + "weight_type": "Original", + "status": "FINISHED", + "submitted_time": "2023-11-22T05:36:09Z", + "model_type": "\ud83d\udd36 : fine-tuned", + "likes": 0, + "params": 7.0, + "license": "apache-2.0", + "job_id": "749931", + "job_start_time": null +} \ No newline at end of file diff --git a/eval-queue/uukuguy/mistral-7b-platypus-fp16-dare-0.9_eval_request_False_float16_Original.json b/eval-queue/uukuguy/mistral-7b-platypus-fp16-dare-0.9_eval_request_False_float16_Original.json new file mode 100644 index 0000000000000000000000000000000000000000..a504a151ca90ee55e452fc889015bf1bbf73ebaf --- /dev/null +++ b/eval-queue/uukuguy/mistral-7b-platypus-fp16-dare-0.9_eval_request_False_float16_Original.json @@ -0,0 +1 @@ +{"model": "uukuguy/mistral-7b-platypus-fp16-dare-0.9", "base_model": "", "revision": "main", "private": false, "precision": "float16", "weight_type": "Original", "status": "FAILED", "submitted_time": "2023-11-20T05:48:19Z", "model_type": "\ud83d\udd36 : fine-tuned", "likes": 0, "params": 7.0, "license": "llama2", "job_id": "718396"} \ No newline at end of file diff --git a/eval-queue/uukuguy/neural-chat-7b-v3-1-dare-0.85_eval_request_False_bfloat16_Original.json 
b/eval-queue/uukuguy/neural-chat-7b-v3-1-dare-0.85_eval_request_False_bfloat16_Original.json new file mode 100644 index 0000000000000000000000000000000000000000..cbe13bb365e27d4af1cd36cad26af5365e20c289 --- /dev/null +++ b/eval-queue/uukuguy/neural-chat-7b-v3-1-dare-0.85_eval_request_False_bfloat16_Original.json @@ -0,0 +1,16 @@ +{ + "model": "uukuguy/neural-chat-7b-v3-1-dare-0.85", + "base_model": "", + "revision": "main", + "private": false, + "precision": "bfloat16", + "weight_type": "Original", + "status": "FINISHED", + "submitted_time": "2023-12-05T12:10:17Z", + "model_type": "\ud83d\udd36 : fine-tuned", + "likes": 1, + "params": 7.0, + "license": "llama2", + "job_id": "858062", + "job_start_time": "2023-12-06T16:31:04.327165" +} \ No newline at end of file diff --git a/eval-queue/uukuguy/neural-chat-7b-v3-1-dare-0.85_eval_request_False_float16_Original.json b/eval-queue/uukuguy/neural-chat-7b-v3-1-dare-0.85_eval_request_False_float16_Original.json new file mode 100644 index 0000000000000000000000000000000000000000..4693149db77ed5965b4b1af318822c7a96eb123f --- /dev/null +++ b/eval-queue/uukuguy/neural-chat-7b-v3-1-dare-0.85_eval_request_False_float16_Original.json @@ -0,0 +1 @@ +{"model": "uukuguy/neural-chat-7b-v3-1-dare-0.85", "base_model": "", "revision": "main", "private": false, "precision": "float16", "weight_type": "Original", "status": "FAILED", "submitted_time": "2023-11-20T12:04:14Z", "model_type": "\ud83d\udd36 : fine-tuned", "likes": 0, "params": 7.0, "license": "llama2", "job_id": "719439"} \ No newline at end of file diff --git a/eval-queue/uukuguy/speechless-code-mistral-7b-v1.0_eval_request_False_bfloat16_Original.json b/eval-queue/uukuguy/speechless-code-mistral-7b-v1.0_eval_request_False_bfloat16_Original.json new file mode 100644 index 0000000000000000000000000000000000000000..166cd6e8a7bd4bf2b9f5df728952016b38bc40e7 --- /dev/null +++ b/eval-queue/uukuguy/speechless-code-mistral-7b-v1.0_eval_request_False_bfloat16_Original.json @@ -0,0 +1 @@ +{"model": "uukuguy/speechless-code-mistral-7b-v1.0", "base_model": "", "revision": "main", "private": false, "precision": "bfloat16", "weight_type": "Original", "status": "FINISHED", "submitted_time": "2023-10-16T13:19:55Z", "model_type": "\ud83d\udd36 : fine-tuned", "job_id": "522409", "license": "llama2", "likes": 0, "params": 7.111} \ No newline at end of file diff --git a/eval-queue/uukuguy/speechless-code-mistral-7b-v1.0_eval_request_False_float16_Original.json b/eval-queue/uukuguy/speechless-code-mistral-7b-v1.0_eval_request_False_float16_Original.json new file mode 100644 index 0000000000000000000000000000000000000000..4713e17c3a03483a4bc5fb88a4a6f8b88c69577b --- /dev/null +++ b/eval-queue/uukuguy/speechless-code-mistral-7b-v1.0_eval_request_False_float16_Original.json @@ -0,0 +1,16 @@ +{ + "model": "uukuguy/speechless-code-mistral-7b-v1.0", + "base_model": "", + "revision": "main", + "private": false, + "precision": "float16", + "weight_type": "Original", + "status": "FINISHED", + "submitted_time": "2023-12-05T07:35:14Z", + "model_type": "\ud83d\udd36 : fine-tuned", + "likes": 7, + "params": 7.0, + "license": "apache-2.0", + "job_id": "858025", + "job_start_time": "2023-12-06T16:15:36.804152" +} \ No newline at end of file diff --git a/eval-queue/uukuguy/speechless-code-mistral-7b-v2.0_eval_request_False_bfloat16_Original.json b/eval-queue/uukuguy/speechless-code-mistral-7b-v2.0_eval_request_False_bfloat16_Original.json new file mode 100644 index 
0000000000000000000000000000000000000000..dd3717a813d4cbbe8cce729fced6a0b8aa0a2cdd --- /dev/null +++ b/eval-queue/uukuguy/speechless-code-mistral-7b-v2.0_eval_request_False_bfloat16_Original.json @@ -0,0 +1,16 @@ +{ + "model": "uukuguy/speechless-code-mistral-7b-v2.0", + "base_model": "", + "revision": "main", + "private": false, + "precision": "bfloat16", + "weight_type": "Original", + "status": "FINISHED", + "submitted_time": "2023-12-09T13:32:47Z", + "model_type": "\ud83d\udd36 : fine-tuned", + "likes": 0, + "params": 7.0, + "license": "apache-2.0", + "job_id": "875138", + "job_start_time": "2023-12-09T18:00:40.420611" +} \ No newline at end of file diff --git a/eval-queue/uukuguy/speechless-code-mistral-orca-7b-v1.0_eval_request_False_bfloat16_Original.json b/eval-queue/uukuguy/speechless-code-mistral-orca-7b-v1.0_eval_request_False_bfloat16_Original.json new file mode 100644 index 0000000000000000000000000000000000000000..e05bfd5a3ba45359b81e6519b0705e76c36603a2 --- /dev/null +++ b/eval-queue/uukuguy/speechless-code-mistral-orca-7b-v1.0_eval_request_False_bfloat16_Original.json @@ -0,0 +1 @@ +{"model": "uukuguy/speechless-code-mistral-orca-7b-v1.0", "base_model": "", "revision": "main", "private": false, "precision": "bfloat16", "weight_type": "Original", "status": "FAILED", "submitted_time": "2023-10-09T01:52:37Z", "model_type": "\ud83d\udd36 : fine-tuned", "job_id": "488095", "license": "llama2", "likes": 0, "params": 7.111} \ No newline at end of file diff --git a/eval-queue/uukuguy/speechless-code-mistral-orca-7b-v1.0_eval_request_False_float16_Original.json b/eval-queue/uukuguy/speechless-code-mistral-orca-7b-v1.0_eval_request_False_float16_Original.json new file mode 100644 index 0000000000000000000000000000000000000000..df5476f938e99f7b60aedf53865ea0d0a7940379 --- /dev/null +++ b/eval-queue/uukuguy/speechless-code-mistral-orca-7b-v1.0_eval_request_False_float16_Original.json @@ -0,0 +1 @@ +{"model": "uukuguy/speechless-code-mistral-orca-7b-v1.0", "base_model": "", "revision": "main", "private": false, "precision": "float16", "weight_type": "Original", "status": "FINISHED", "submitted_time": "2023-11-14T11:29:49Z", "model_type": "\ud83d\udd36 : fine-tuned", "likes": 4, "params": 7.0, "license": "llama2", "job_id": "651134"} \ No newline at end of file diff --git a/eval-queue/uukuguy/speechless-codellama-34b-v1.9_eval_request_False_bfloat16_Original.json b/eval-queue/uukuguy/speechless-codellama-34b-v1.9_eval_request_False_bfloat16_Original.json new file mode 100644 index 0000000000000000000000000000000000000000..f0979156abcddc2607dbdf84a19889649c94d71a --- /dev/null +++ b/eval-queue/uukuguy/speechless-codellama-34b-v1.9_eval_request_False_bfloat16_Original.json @@ -0,0 +1 @@ +{"model": "uukuguy/speechless-codellama-34b-v1.9", "base_model": "", "revision": "main", "private": false, "precision": "bfloat16", "weight_type": "Original", "status": "FINISHED", "submitted_time": "2023-10-16T13:00:29Z", "model_type": "\ud83d\udd36 : fine-tuned", "job_id": "522188", "license": "llama2", "likes": 0, "params": 33.482} \ No newline at end of file diff --git a/eval-queue/uukuguy/speechless-codellama-34b-v2.0_eval_request_False_bfloat16_Original.json b/eval-queue/uukuguy/speechless-codellama-34b-v2.0_eval_request_False_bfloat16_Original.json new file mode 100644 index 0000000000000000000000000000000000000000..42d54f045dd11f2053bf89ad5ad836f8c6b85fde --- /dev/null +++ b/eval-queue/uukuguy/speechless-codellama-34b-v2.0_eval_request_False_bfloat16_Original.json @@ -0,0 +1 @@ +{"model": 
"uukuguy/speechless-codellama-34b-v2.0", "base_model": "", "revision": "main", "private": false, "precision": "bfloat16", "weight_type": "Original", "status": "FINISHED", "submitted_time": "2023-10-16T12:46:18Z", "model_type": "\ud83d\udd36 : fine-tuned", "job_id": "514674", "license": "llama2", "likes": 3, "params": 33.482} \ No newline at end of file diff --git a/eval-queue/uukuguy/speechless-codellama-34b-v2.0_eval_request_False_float16_Original.json b/eval-queue/uukuguy/speechless-codellama-34b-v2.0_eval_request_False_float16_Original.json new file mode 100644 index 0000000000000000000000000000000000000000..9da92f0932ef2cc013c86e5169d2f548734839dc --- /dev/null +++ b/eval-queue/uukuguy/speechless-codellama-34b-v2.0_eval_request_False_float16_Original.json @@ -0,0 +1,16 @@ +{ + "model": "uukuguy/speechless-codellama-34b-v2.0", + "base_model": "", + "revision": "main", + "private": false, + "precision": "float16", + "weight_type": "Original", + "status": "FAILED", + "submitted_time": "2023-12-07T18:43:29Z", + "model_type": "\ud83d\udd36 : fine-tuned", + "likes": 8, + "params": 34.0, + "license": "llama2", + "job_id": "874870", + "job_start_time": "2023-12-09T13:56:18.742329" +} \ No newline at end of file diff --git a/eval-queue/uukuguy/speechless-codellama-dolphin-orca-platypus-13b_eval_request_False_bfloat16_Original.json b/eval-queue/uukuguy/speechless-codellama-dolphin-orca-platypus-13b_eval_request_False_bfloat16_Original.json new file mode 100644 index 0000000000000000000000000000000000000000..49d718193a880d8ca862c5edb5842f455c07d219 --- /dev/null +++ b/eval-queue/uukuguy/speechless-codellama-dolphin-orca-platypus-13b_eval_request_False_bfloat16_Original.json @@ -0,0 +1 @@ +{"model": "uukuguy/speechless-codellama-dolphin-orca-platypus-13b", "base_model": "", "revision": "main", "private": false, "precision": "bfloat16", "weight_type": "Original", "status": "FINISHED", "submitted_time": "2023-10-16T12:46:18Z", "model_type": "\ud83d\udd36 : fine-tuned", "job_id": "514526", "license": "llama2", "likes": 2, "params": 12.852} \ No newline at end of file diff --git a/eval-queue/uukuguy/speechless-codellama-dolphin-orca-platypus-34b_eval_request_False_float16_Original.json b/eval-queue/uukuguy/speechless-codellama-dolphin-orca-platypus-34b_eval_request_False_float16_Original.json new file mode 100644 index 0000000000000000000000000000000000000000..6cb7bab2762aafeee1d7fb90c14ed0a7d152652b --- /dev/null +++ b/eval-queue/uukuguy/speechless-codellama-dolphin-orca-platypus-34b_eval_request_False_float16_Original.json @@ -0,0 +1 @@ +{"model": "uukuguy/speechless-codellama-dolphin-orca-platypus-34b", "base_model": "", "revision": "main", "private": false, "precision": "float16", "weight_type": "Original", "status": "FINISHED", "submitted_time": "2023-10-16T13:19:55Z", "model_type": "\ud83d\udd36 : fine-tuned", "job_id": "522818", "license": "llama2", "likes": 6, "params": 33.482} \ No newline at end of file diff --git a/eval-queue/uukuguy/speechless-codellama-orca-13b_eval_request_False_bfloat16_Original.json b/eval-queue/uukuguy/speechless-codellama-orca-13b_eval_request_False_bfloat16_Original.json new file mode 100644 index 0000000000000000000000000000000000000000..d7e4b102117fa064c802d100cabf7e8d1cbd732f --- /dev/null +++ b/eval-queue/uukuguy/speechless-codellama-orca-13b_eval_request_False_bfloat16_Original.json @@ -0,0 +1 @@ +{"model": "uukuguy/speechless-codellama-orca-13b", "base_model": "", "revision": "main", "private": false, "precision": "bfloat16", "weight_type": "Original", 
"status": "FINISHED", "submitted_time": "2023-09-09T10:52:17Z", "model_type": "\ud83d\udd36 : fine-tuned", "job_id": "504646", "license": "llama2", "likes": 2, "params": 12.852} \ No newline at end of file diff --git a/eval-queue/uukuguy/speechless-codellama-orca-13b_eval_request_False_float16_Original.json b/eval-queue/uukuguy/speechless-codellama-orca-13b_eval_request_False_float16_Original.json new file mode 100644 index 0000000000000000000000000000000000000000..30dac5ac687d7ab658bd09617fcec4663df397df --- /dev/null +++ b/eval-queue/uukuguy/speechless-codellama-orca-13b_eval_request_False_float16_Original.json @@ -0,0 +1 @@ +{"model": "uukuguy/speechless-codellama-orca-13b", "base_model": "", "revision": "main", "private": false, "precision": "float16", "weight_type": "Original", "status": "FINISHED", "submitted_time": "2023-10-16T12:46:18Z", "model_type": "\u2b55 : instruction-tuned", "job_id": "515173", "license": "llama2", "likes": 2, "params": 12.852} \ No newline at end of file diff --git a/eval-queue/uukuguy/speechless-codellama-orca-airoboros-13b-0.10e_eval_request_False_bfloat16_Original.json b/eval-queue/uukuguy/speechless-codellama-orca-airoboros-13b-0.10e_eval_request_False_bfloat16_Original.json new file mode 100644 index 0000000000000000000000000000000000000000..123072c2d38cbcf14ec9a11ba42ae189373935a6 --- /dev/null +++ b/eval-queue/uukuguy/speechless-codellama-orca-airoboros-13b-0.10e_eval_request_False_bfloat16_Original.json @@ -0,0 +1 @@ +{"model": "uukuguy/speechless-codellama-orca-airoboros-13b-0.10e", "base_model": "", "revision": "main", "private": false, "precision": "bfloat16", "weight_type": "Original", "status": "FINISHED", "submitted_time": "2023-09-09T10:52:17Z", "model_type": "\ud83d\udd36 : fine-tuned", "job_id": "504690", "license": "llama2", "likes": 0, "params": 13.016} \ No newline at end of file diff --git a/eval-queue/uukuguy/speechless-codellama-orca-airoboros-13b-0.10e_eval_request_False_float16_Original.json b/eval-queue/uukuguy/speechless-codellama-orca-airoboros-13b-0.10e_eval_request_False_float16_Original.json new file mode 100644 index 0000000000000000000000000000000000000000..cbdff2dc727242131fa7547d886f7fba4bbe9fbb --- /dev/null +++ b/eval-queue/uukuguy/speechless-codellama-orca-airoboros-13b-0.10e_eval_request_False_float16_Original.json @@ -0,0 +1 @@ +{"model": "uukuguy/speechless-codellama-orca-airoboros-13b-0.10e", "base_model": "", "revision": "main", "private": false, "precision": "float16", "weight_type": "Original", "status": "FINISHED", "submitted_time": "2023-10-16T12:54:17Z", "model_type": "\u2b55 : instruction-tuned", "job_id": "522052", "license": "llama2", "likes": 0, "params": 13.016} \ No newline at end of file diff --git a/eval-queue/uukuguy/speechless-codellama-orca-platypus-13b-0.10e_eval_request_False_bfloat16_Original.json b/eval-queue/uukuguy/speechless-codellama-orca-platypus-13b-0.10e_eval_request_False_bfloat16_Original.json new file mode 100644 index 0000000000000000000000000000000000000000..3cff035e7bfa9c51ed99c58780f5a690ad97128f --- /dev/null +++ b/eval-queue/uukuguy/speechless-codellama-orca-platypus-13b-0.10e_eval_request_False_bfloat16_Original.json @@ -0,0 +1 @@ +{"model": "uukuguy/speechless-codellama-orca-platypus-13b-0.10e", "base_model": "", "revision": "main", "private": false, "precision": "bfloat16", "weight_type": "Original", "status": "FINISHED", "submitted_time": "2023-09-09T10:52:17Z", "model_type": "\ud83d\udd36 : fine-tuned", "job_id": "503333", "license": "llama2", "likes": 0, "params": 13.016} \ 
No newline at end of file diff --git a/eval-queue/uukuguy/speechless-codellama-orca-platypus-13b-0.10e_eval_request_False_float16_Original.json b/eval-queue/uukuguy/speechless-codellama-orca-platypus-13b-0.10e_eval_request_False_float16_Original.json new file mode 100644 index 0000000000000000000000000000000000000000..b7e85058a840511b71091b09720aa34df401bcff --- /dev/null +++ b/eval-queue/uukuguy/speechless-codellama-orca-platypus-13b-0.10e_eval_request_False_float16_Original.json @@ -0,0 +1 @@ +{"model": "uukuguy/speechless-codellama-orca-platypus-13b-0.10e", "base_model": "", "revision": "main", "private": false, "precision": "float16", "weight_type": "Original", "status": "FINISHED", "submitted_time": "2023-10-16T12:48:18Z", "model_type": "\u2b55 : instruction-tuned", "job_id": "516696", "license": "llama2", "likes": 0, "params": 13.016} \ No newline at end of file diff --git a/eval-queue/uukuguy/speechless-codellama-platypus-13b_eval_request_False_bfloat16_Original.json b/eval-queue/uukuguy/speechless-codellama-platypus-13b_eval_request_False_bfloat16_Original.json new file mode 100644 index 0000000000000000000000000000000000000000..bf8edecb6b67fcd0f1b8186f69be7af35c2fae25 --- /dev/null +++ b/eval-queue/uukuguy/speechless-codellama-platypus-13b_eval_request_False_bfloat16_Original.json @@ -0,0 +1 @@ +{"model": "uukuguy/speechless-codellama-platypus-13b", "base_model": "", "revision": "main", "private": false, "precision": "bfloat16", "weight_type": "Original", "status": "FINISHED", "submitted_time": "2023-09-09T10:52:17Z", "model_type": "\ud83d\udd36 : fine-tuned", "job_id": "505075", "license": "llama2", "likes": 0, "params": 12.852} \ No newline at end of file diff --git a/eval-queue/uukuguy/speechless-codellama-platypus-13b_eval_request_False_float16_Original.json b/eval-queue/uukuguy/speechless-codellama-platypus-13b_eval_request_False_float16_Original.json new file mode 100644 index 0000000000000000000000000000000000000000..a3e58a42e596c2ac6bc23ec41afae28d67f2f16e --- /dev/null +++ b/eval-queue/uukuguy/speechless-codellama-platypus-13b_eval_request_False_float16_Original.json @@ -0,0 +1 @@ +{"model": "uukuguy/speechless-codellama-platypus-13b", "base_model": "", "revision": "main", "private": false, "precision": "float16", "weight_type": "Original", "status": "FINISHED", "submitted_time": "2023-10-16T12:48:18Z", "model_type": "\u2b55 : instruction-tuned", "job_id": "517437", "license": "llama2", "likes": 0, "params": 12.852} \ No newline at end of file diff --git a/eval-queue/uukuguy/speechless-coding-7b-16k-tora_eval_request_False_bfloat16_Original.json b/eval-queue/uukuguy/speechless-coding-7b-16k-tora_eval_request_False_bfloat16_Original.json new file mode 100644 index 0000000000000000000000000000000000000000..9a5c2ddf0df3402b5d99861369e306616ba11311 --- /dev/null +++ b/eval-queue/uukuguy/speechless-coding-7b-16k-tora_eval_request_False_bfloat16_Original.json @@ -0,0 +1 @@ +{"model": "uukuguy/speechless-coding-7b-16k-tora", "base_model": "", "revision": "main", "private": false, "precision": "bfloat16", "weight_type": "Original", "status": "FAILED", "submitted_time": "2023-11-15T10:58:49Z", "model_type": "\ud83d\udd36 : fine-tuned", "likes": 1, "params": 7.0, "license": "llama2", "job_id": "660377"} \ No newline at end of file diff --git a/eval-queue/uukuguy/speechless-coding-7b-16k-tora_eval_request_False_float16_Original.json b/eval-queue/uukuguy/speechless-coding-7b-16k-tora_eval_request_False_float16_Original.json new file mode 100644 index 
0000000000000000000000000000000000000000..c64313e097e49a368946ce9e7cff197986f0a957 --- /dev/null +++ b/eval-queue/uukuguy/speechless-coding-7b-16k-tora_eval_request_False_float16_Original.json @@ -0,0 +1,16 @@ +{ + "model": "uukuguy/speechless-coding-7b-16k-tora", + "base_model": "", + "revision": "main", + "private": false, + "precision": "float16", + "weight_type": "Original", + "status": "FINISHED", + "submitted_time": "2023-12-07T18:39:46Z", + "model_type": "\ud83d\udd36 : fine-tuned", + "likes": 1, + "params": 7.0, + "license": "llama2", + "job_id": "874866", + "job_start_time": "2023-12-09T13:53:18.751124" +} \ No newline at end of file diff --git a/eval-queue/uukuguy/speechless-hermes-coig-lite-13b_eval_request_False_bfloat16_Original.json b/eval-queue/uukuguy/speechless-hermes-coig-lite-13b_eval_request_False_bfloat16_Original.json new file mode 100644 index 0000000000000000000000000000000000000000..98afa5b3a4c06bb0f520c10e2af54fecf573d8ea --- /dev/null +++ b/eval-queue/uukuguy/speechless-hermes-coig-lite-13b_eval_request_False_bfloat16_Original.json @@ -0,0 +1 @@ +{"model": "uukuguy/speechless-hermes-coig-lite-13b", "base_model": "", "revision": "main", "private": false, "precision": "bfloat16", "weight_type": "Original", "status": "FINISHED", "submitted_time": "2023-09-09T10:52:17Z", "model_type": "\ud83d\udd36 : fine-tuned", "job_id": "504988", "license": ["mit"], "likes": 0, "params": 13.016} \ No newline at end of file diff --git a/eval-queue/uukuguy/speechless-hermes-coig-lite-13b_eval_request_False_float16_Original.json b/eval-queue/uukuguy/speechless-hermes-coig-lite-13b_eval_request_False_float16_Original.json new file mode 100644 index 0000000000000000000000000000000000000000..5392a980da0c7aca5ba6fac3d34a121101813c11 --- /dev/null +++ b/eval-queue/uukuguy/speechless-hermes-coig-lite-13b_eval_request_False_float16_Original.json @@ -0,0 +1 @@ +{"model": "uukuguy/speechless-hermes-coig-lite-13b", "base_model": "", "revision": "main", "private": false, "precision": "float16", "weight_type": "Original", "status": "FINISHED", "submitted_time": "2023-09-09T10:52:17Z", "model_type": "\ud83d\udd36 : fine-tuned", "job_id": "503286", "license": ["mit"], "likes": 0, "params": 13.016} \ No newline at end of file diff --git a/eval-queue/uukuguy/speechless-llama2-13b_eval_request_False_bfloat16_Original.json b/eval-queue/uukuguy/speechless-llama2-13b_eval_request_False_bfloat16_Original.json new file mode 100644 index 0000000000000000000000000000000000000000..32b8c807a10abc91b046dd17ec69658d3e2ffd4a --- /dev/null +++ b/eval-queue/uukuguy/speechless-llama2-13b_eval_request_False_bfloat16_Original.json @@ -0,0 +1,16 @@ +{ + "model": "uukuguy/speechless-llama2-13b", + "base_model": "", + "revision": "main", + "private": false, + "precision": "bfloat16", + "weight_type": "Original", + "status": "FINISHED", + "submitted_time": "2023-12-07T18:42:36Z", + "model_type": "\ud83d\udd36 : fine-tuned", + "likes": 4, + "params": 13.016, + "license": "llama2", + "job_id": "874868", + "job_start_time": "2023-12-09T13:54:48.094165" +} \ No newline at end of file diff --git a/eval-queue/uukuguy/speechless-llama2-13b_eval_request_False_float16_Original.json b/eval-queue/uukuguy/speechless-llama2-13b_eval_request_False_float16_Original.json new file mode 100644 index 0000000000000000000000000000000000000000..20db933cce4f43bb492aabeba8aa87417c9bc54f --- /dev/null +++ b/eval-queue/uukuguy/speechless-llama2-13b_eval_request_False_float16_Original.json @@ -0,0 +1 @@ +{"model": "uukuguy/speechless-llama2-13b", 
"base_model": "", "revision": "main", "private": false, "precision": "float16", "weight_type": "Original", "status": "FINISHED", "submitted_time": "2023-09-07T12:26:56Z", "model_type": "\u2b55 : instruction-tuned", "job_id": "439391", "license": "?", "likes": 2, "params": 13.016} \ No newline at end of file diff --git a/eval-queue/uukuguy/speechless-llama2-hermes-orca-platypus-13b_eval_request_False_bfloat16_Original.json b/eval-queue/uukuguy/speechless-llama2-hermes-orca-platypus-13b_eval_request_False_bfloat16_Original.json new file mode 100644 index 0000000000000000000000000000000000000000..0b59ea68ac91fe002ad1ca60fddfed0e97493a29 --- /dev/null +++ b/eval-queue/uukuguy/speechless-llama2-hermes-orca-platypus-13b_eval_request_False_bfloat16_Original.json @@ -0,0 +1 @@ +{"model": "uukuguy/speechless-llama2-hermes-orca-platypus-13b", "base_model": "", "revision": "main", "private": false, "precision": "bfloat16", "weight_type": "Original", "status": "FINISHED", "submitted_time": "2023-09-09T10:52:17Z", "model_type": "\ud83d\udd36 : fine-tuned", "job_id": "504868", "license": "?", "likes": 0, "params": 13.016} \ No newline at end of file diff --git a/eval-queue/uukuguy/speechless-llama2-hermes-orca-platypus-wizardlm-13b_eval_request_False_bfloat16_Original.json b/eval-queue/uukuguy/speechless-llama2-hermes-orca-platypus-wizardlm-13b_eval_request_False_bfloat16_Original.json new file mode 100644 index 0000000000000000000000000000000000000000..1100a86f690924ef1ec5e7b417b148422484c4bd --- /dev/null +++ b/eval-queue/uukuguy/speechless-llama2-hermes-orca-platypus-wizardlm-13b_eval_request_False_bfloat16_Original.json @@ -0,0 +1 @@ +{"model": "uukuguy/speechless-llama2-hermes-orca-platypus-wizardlm-13b", "base_model": "", "revision": "main", "private": false, "precision": "bfloat16", "weight_type": "Original", "status": "FINISHED", "submitted_time": "2023-09-09T10:52:17Z", "model_type": "\ud83d\udd36 : fine-tuned", "job_id": "498081", "license": "?", "likes": 24, "params": 13.016} \ No newline at end of file diff --git a/eval-queue/uukuguy/speechless-llama2-hermes-orca-platypus-wizardlm-13b_eval_request_False_float16_Original.json b/eval-queue/uukuguy/speechless-llama2-hermes-orca-platypus-wizardlm-13b_eval_request_False_float16_Original.json new file mode 100644 index 0000000000000000000000000000000000000000..1582cc3288f588a20d9e1dc3bce01e7f700ccac3 --- /dev/null +++ b/eval-queue/uukuguy/speechless-llama2-hermes-orca-platypus-wizardlm-13b_eval_request_False_float16_Original.json @@ -0,0 +1 @@ +{"model": "uukuguy/speechless-llama2-hermes-orca-platypus-wizardlm-13b", "base_model": "", "revision": "main", "private": false, "precision": "float16", "weight_type": "Original", "status": "FINISHED", "submitted_time": "2023-10-16T12:46:18Z", "model_type": "\u2b55 : instruction-tuned", "job_id": "439509", "license": "?", "likes": 24, "params": 13.016} \ No newline at end of file diff --git a/eval-queue/uukuguy/speechless-llama2-luban-orca-platypus-13b_eval_request_False_bfloat16_Original.json b/eval-queue/uukuguy/speechless-llama2-luban-orca-platypus-13b_eval_request_False_bfloat16_Original.json new file mode 100644 index 0000000000000000000000000000000000000000..548badf8048bad57d9395308477f370ff431f571 --- /dev/null +++ b/eval-queue/uukuguy/speechless-llama2-luban-orca-platypus-13b_eval_request_False_bfloat16_Original.json @@ -0,0 +1 @@ +{"model": "uukuguy/speechless-llama2-luban-orca-platypus-13b", "base_model": "", "revision": "main", "private": false, "precision": "bfloat16", "weight_type": "Original", 
"status": "FINISHED", "submitted_time": "2023-09-09T10:52:17Z", "model_type": "\ud83d\udd36 : fine-tuned", "job_id": "499599", "license": "?", "likes": 3, "params": 13.016} \ No newline at end of file diff --git a/eval-queue/uukuguy/speechless-mistral-7b-dare-0.85_eval_request_False_bfloat16_Original.json b/eval-queue/uukuguy/speechless-mistral-7b-dare-0.85_eval_request_False_bfloat16_Original.json new file mode 100644 index 0000000000000000000000000000000000000000..e8043d34a513408b1306e9165ae4a679aa61c8ea --- /dev/null +++ b/eval-queue/uukuguy/speechless-mistral-7b-dare-0.85_eval_request_False_bfloat16_Original.json @@ -0,0 +1,16 @@ +{ + "model": "uukuguy/speechless-mistral-7b-dare-0.85", + "base_model": "", + "revision": "main", + "private": false, + "precision": "bfloat16", + "weight_type": "Original", + "status": "FINISHED", + "submitted_time": "2023-11-27T06:20:00Z", + "model_type": "\ud83d\udd36 : fine-tuned", + "likes": 0, + "params": 7.242, + "license": "llama2", + "job_id": "802083", + "job_start_time": "2023-11-28T02:21:24.278451" +} \ No newline at end of file diff --git a/eval-queue/uukuguy/speechless-mistral-dolphin-orca-platypus-samantha-7b-dare-0.85_eval_request_False_float16_Original.json b/eval-queue/uukuguy/speechless-mistral-dolphin-orca-platypus-samantha-7b-dare-0.85_eval_request_False_float16_Original.json new file mode 100644 index 0000000000000000000000000000000000000000..d3b1d0a828d3dbfb121932dca76bd8a65e058786 --- /dev/null +++ b/eval-queue/uukuguy/speechless-mistral-dolphin-orca-platypus-samantha-7b-dare-0.85_eval_request_False_float16_Original.json @@ -0,0 +1,16 @@ +{ + "model": "uukuguy/speechless-mistral-dolphin-orca-platypus-samantha-7b-dare-0.85", + "base_model": "", + "revision": "main", + "private": false, + "precision": "float16", + "weight_type": "Original", + "status": "FINISHED", + "submitted_time": "2023-11-27T06:16:22Z", + "model_type": "\ud83d\udd36 : fine-tuned", + "likes": 1, + "params": 7.0, + "license": "llama2", + "job_id": "802078", + "job_start_time": "2023-11-28T02:19:47.979083" +} \ No newline at end of file diff --git a/eval-queue/uukuguy/speechless-mistral-dolphin-orca-platypus-samantha-7b_eval_request_False_bfloat16_Original.json b/eval-queue/uukuguy/speechless-mistral-dolphin-orca-platypus-samantha-7b_eval_request_False_bfloat16_Original.json new file mode 100644 index 0000000000000000000000000000000000000000..abf711f7b9d316cb351a367bf83996c91c3fdedf --- /dev/null +++ b/eval-queue/uukuguy/speechless-mistral-dolphin-orca-platypus-samantha-7b_eval_request_False_bfloat16_Original.json @@ -0,0 +1 @@ +{"model": "uukuguy/speechless-mistral-dolphin-orca-platypus-samantha-7b", "base_model": "", "revision": "main", "private": false, "precision": "bfloat16", "weight_type": "Original", "status": "FINISHED", "submitted_time": "2023-11-14T11:30:08Z", "model_type": "\ud83d\udd36 : fine-tuned", "likes": 7, "params": 7.242, "license": "llama2", "job_id": "651137"} \ No newline at end of file diff --git a/eval-queue/uukuguy/speechless-mistral-six-in-one-7b-orth-1.0_eval_request_False_bfloat16_Original.json b/eval-queue/uukuguy/speechless-mistral-six-in-one-7b-orth-1.0_eval_request_False_bfloat16_Original.json new file mode 100644 index 0000000000000000000000000000000000000000..54dd12bf15c7b5afec76039ab53f540fdd070ab4 --- /dev/null +++ b/eval-queue/uukuguy/speechless-mistral-six-in-one-7b-orth-1.0_eval_request_False_bfloat16_Original.json @@ -0,0 +1,16 @@ +{ + "model": "uukuguy/speechless-mistral-six-in-one-7b-orth-1.0", + "base_model": "", + "revision": 
"main", + "private": false, + "precision": "bfloat16", + "weight_type": "Original", + "status": "FINISHED", + "submitted_time": "2023-12-13T01:57:31Z", + "model_type": "\ud83d\udd36 : fine-tuned", + "likes": 0, + "params": 7.0, + "license": "apache-2.0", + "job_id": "914593", + "job_start_time": "2023-12-13T02:00:02.727309" +} \ No newline at end of file diff --git a/eval-queue/uukuguy/speechless-mistral-six-in-one-7b_eval_request_False_float16_Original.json b/eval-queue/uukuguy/speechless-mistral-six-in-one-7b_eval_request_False_float16_Original.json new file mode 100644 index 0000000000000000000000000000000000000000..5f3fdbe313d43ad20f9f456ae383b7beb42f7a59 --- /dev/null +++ b/eval-queue/uukuguy/speechless-mistral-six-in-one-7b_eval_request_False_float16_Original.json @@ -0,0 +1 @@ +{"model": "uukuguy/speechless-mistral-six-in-one-7b", "base_model": "", "revision": "main", "private": false, "precision": "float16", "weight_type": "Original", "status": "FINISHED", "submitted_time": "2023-10-15T02:01:55Z", "model_type": "\ud83d\udd36 : fine-tuned", "likes": 0, "params": 7.242, "license": "llama2", "job_id": "648653"} \ No newline at end of file diff --git a/eval-queue/uukuguy/speechless-orca-platypus-coig-lite-2k-0.6e-13b_eval_request_False_bfloat16_Original.json b/eval-queue/uukuguy/speechless-orca-platypus-coig-lite-2k-0.6e-13b_eval_request_False_bfloat16_Original.json new file mode 100644 index 0000000000000000000000000000000000000000..5bdb175b93a576ead3027294be83913b8f42ce83 --- /dev/null +++ b/eval-queue/uukuguy/speechless-orca-platypus-coig-lite-2k-0.6e-13b_eval_request_False_bfloat16_Original.json @@ -0,0 +1 @@ +{"model": "uukuguy/speechless-orca-platypus-coig-lite-2k-0.6e-13b", "base_model": "", "revision": "main", "private": false, "precision": "bfloat16", "weight_type": "Original", "status": "FINISHED", "submitted_time": "2023-09-09T10:52:17Z", "model_type": "\ud83d\udd36 : fine-tuned", "job_id": "505602", "license": "cc-by-nc-4.0", "likes": 0, "params": 12.852} \ No newline at end of file diff --git a/eval-queue/uukuguy/speechless-orca-platypus-coig-lite-4k-0.5e-13b_eval_request_False_bfloat16_Original.json b/eval-queue/uukuguy/speechless-orca-platypus-coig-lite-4k-0.5e-13b_eval_request_False_bfloat16_Original.json new file mode 100644 index 0000000000000000000000000000000000000000..df5d2c9e1ae51ebc1aef060ad9a0dd96e38980a7 --- /dev/null +++ b/eval-queue/uukuguy/speechless-orca-platypus-coig-lite-4k-0.5e-13b_eval_request_False_bfloat16_Original.json @@ -0,0 +1 @@ +{"model": "uukuguy/speechless-orca-platypus-coig-lite-4k-0.5e-13b", "base_model": "", "revision": "main", "private": false, "precision": "bfloat16", "weight_type": "Original", "status": "FINISHED", "submitted_time": "2023-09-09T10:52:17Z", "model_type": "\ud83d\udd36 : fine-tuned", "job_id": "504760", "license": "cc-by-nc-4.0", "likes": 0, "params": 12.852} \ No newline at end of file diff --git a/eval-queue/uukuguy/speechless-orca-platypus-coig-lite-4k-0.6e-13b_eval_request_False_bfloat16_Original.json b/eval-queue/uukuguy/speechless-orca-platypus-coig-lite-4k-0.6e-13b_eval_request_False_bfloat16_Original.json new file mode 100644 index 0000000000000000000000000000000000000000..108c3dd7c5d09175840bc0c4e835bde9fa68bc21 --- /dev/null +++ b/eval-queue/uukuguy/speechless-orca-platypus-coig-lite-4k-0.6e-13b_eval_request_False_bfloat16_Original.json @@ -0,0 +1 @@ +{"model": "uukuguy/speechless-orca-platypus-coig-lite-4k-0.6e-13b", "base_model": "", "revision": "main", "private": false, "precision": "bfloat16", "weight_type": 
"Original", "status": "FINISHED", "submitted_time": "2023-10-16T13:19:55Z", "model_type": "\ud83d\udd36 : fine-tuned", "job_id": "522844", "license": "cc-by-nc-4.0", "likes": 0, "params": 12.852} \ No newline at end of file diff --git a/eval-queue/uukuguy/speechless-tools-7b_eval_request_False_bfloat16_Original.json b/eval-queue/uukuguy/speechless-tools-7b_eval_request_False_bfloat16_Original.json new file mode 100644 index 0000000000000000000000000000000000000000..42181557aad0a82936497163c204cc7763af72fb --- /dev/null +++ b/eval-queue/uukuguy/speechless-tools-7b_eval_request_False_bfloat16_Original.json @@ -0,0 +1,16 @@ +{ + "model": "uukuguy/speechless-tools-7b", + "base_model": "", + "revision": "main", + "private": false, + "precision": "bfloat16", + "weight_type": "Original", + "status": "FINISHED", + "submitted_time": "2023-11-30T18:48:35Z", + "model_type": "\ud83d\udd36 : fine-tuned", + "likes": 0, + "params": 7.0, + "license": "llama2", + "job_id": "845994", + "job_start_time": "2023-12-04T14:00:44.738567" +} \ No newline at end of file diff --git a/eval-queue/uukuguy/speechless-tora-code-7b-v1.0_eval_request_False_bfloat16_Original.json b/eval-queue/uukuguy/speechless-tora-code-7b-v1.0_eval_request_False_bfloat16_Original.json new file mode 100644 index 0000000000000000000000000000000000000000..d11c65fa7ae96896ca5817adf34e5fb98f4ec98d --- /dev/null +++ b/eval-queue/uukuguy/speechless-tora-code-7b-v1.0_eval_request_False_bfloat16_Original.json @@ -0,0 +1 @@ +{"model": "uukuguy/speechless-tora-code-7b-v1.0", "base_model": "", "revision": "main", "private": false, "precision": "bfloat16", "weight_type": "Original", "status": "FINISHED", "submitted_time": "2023-10-16T13:19:55Z", "model_type": "\ud83d\udd36 : fine-tuned", "job_id": "522878", "license": "llama2", "likes": 0, "params": 6.607} \ No newline at end of file diff --git a/eval-queue/uukuguy/zephyr-7b-alpha-dare-0.85_eval_request_False_float16_Original.json b/eval-queue/uukuguy/zephyr-7b-alpha-dare-0.85_eval_request_False_float16_Original.json new file mode 100644 index 0000000000000000000000000000000000000000..4365a379429122c44cdea4ae762c99668779863d --- /dev/null +++ b/eval-queue/uukuguy/zephyr-7b-alpha-dare-0.85_eval_request_False_float16_Original.json @@ -0,0 +1,16 @@ +{ + "model": "uukuguy/zephyr-7b-alpha-dare-0.85", + "base_model": "", + "revision": "main", + "private": false, + "precision": "float16", + "weight_type": "Original", + "status": "FINISHED", + "submitted_time": "2023-11-30T19:54:42Z", + "model_type": "\ud83d\udd36 : fine-tuned", + "likes": 0, + "params": 7.0, + "license": "llama2", + "job_id": "845996", + "job_start_time": "2023-12-04T14:02:00.227642" +} \ No newline at end of file diff --git a/eval-queue/vihangd/dopeyplats-1.1b-2T-v1_eval_request_False_float16_Original.json b/eval-queue/vihangd/dopeyplats-1.1b-2T-v1_eval_request_False_float16_Original.json new file mode 100644 index 0000000000000000000000000000000000000000..3ecc2ca0347ecec535f1561ff5b701b97dc9ab04 --- /dev/null +++ b/eval-queue/vihangd/dopeyplats-1.1b-2T-v1_eval_request_False_float16_Original.json @@ -0,0 +1,16 @@ +{ + "model": "vihangd/dopeyplats-1.1b-2T-v1", + "base_model": "", + "revision": "main", + "private": false, + "precision": "float16", + "weight_type": "Original", + "status": "FINISHED", + "submitted_time": "2023-11-26T13:13:19Z", + "model_type": "\ud83d\udfe6 : RL-tuned", + "likes": 0, + "params": 1.1, + "license": "apache-2.0", + "job_id": "801973", + "job_start_time": "2023-11-28T01:09:16.062993" +} \ No newline at end of file 
diff --git a/eval-queue/vihangd/dopeyshearedplats-1.3b-v1_eval_request_False_float16_Original.json b/eval-queue/vihangd/dopeyshearedplats-1.3b-v1_eval_request_False_float16_Original.json new file mode 100644 index 0000000000000000000000000000000000000000..36f9c9401f0809a43aca4979d29448406a37d10a --- /dev/null +++ b/eval-queue/vihangd/dopeyshearedplats-1.3b-v1_eval_request_False_float16_Original.json @@ -0,0 +1,16 @@ +{ + "model": "vihangd/dopeyshearedplats-1.3b-v1", + "base_model": "", + "revision": "main", + "private": false, + "precision": "float16", + "weight_type": "Original", + "status": "FINISHED", + "submitted_time": "2023-12-12T15:30:50Z", + "model_type": "\u2b55 : instruction-tuned", + "likes": 0, + "params": 1.3, + "license": "llama2", + "job_id": "902846", + "job_start_time": "2023-12-12T15:32:09.346902" +} \ No newline at end of file diff --git a/eval-queue/vihangd/dopeyshearedplats-2.7b-v1_eval_request_False_float16_Original.json b/eval-queue/vihangd/dopeyshearedplats-2.7b-v1_eval_request_False_float16_Original.json new file mode 100644 index 0000000000000000000000000000000000000000..276dedff32565c950decd2105b128299357aff07 --- /dev/null +++ b/eval-queue/vihangd/dopeyshearedplats-2.7b-v1_eval_request_False_float16_Original.json @@ -0,0 +1,16 @@ +{ + "model": "vihangd/dopeyshearedplats-2.7b-v1", + "base_model": "", + "revision": "main", + "private": false, + "precision": "float16", + "weight_type": "Original", + "status": "FINISHED", + "submitted_time": "2023-12-16T13:12:39Z", + "model_type": "\ud83d\udd36 : fine-tuned", + "likes": 0, + "params": 2.7, + "license": "llama2", + "job_id": "925787", + "job_start_time": "2023-12-16T15:15:40.946156" +} \ No newline at end of file diff --git a/eval-queue/vihangd/neuralfalcon-1b-v1_eval_request_False_float16_Original.json b/eval-queue/vihangd/neuralfalcon-1b-v1_eval_request_False_float16_Original.json new file mode 100644 index 0000000000000000000000000000000000000000..04a5e54e28b15167d8a9661dc5b01305a6d3038c --- /dev/null +++ b/eval-queue/vihangd/neuralfalcon-1b-v1_eval_request_False_float16_Original.json @@ -0,0 +1,16 @@ +{ + "model": "vihangd/neuralfalcon-1b-v1", + "base_model": "", + "revision": "main", + "private": false, + "precision": "float16", + "weight_type": "Original", + "status": "FINISHED", + "submitted_time": "2023-12-17T02:09:13Z", + "model_type": "\ud83d\udd36 : fine-tuned", + "likes": 0, + "params": 1.0, + "license": "apache-2.0", + "job_id": "926734", + "job_start_time": "2023-12-17T02:10:34.646054" +} \ No newline at end of file diff --git a/eval-queue/vihangd/shearedplats-1.3b-v1_eval_request_False_float16_Original.json b/eval-queue/vihangd/shearedplats-1.3b-v1_eval_request_False_float16_Original.json new file mode 100644 index 0000000000000000000000000000000000000000..630334d26391fcd6e9aa5c7e0138c9ffa4ce8a72 --- /dev/null +++ b/eval-queue/vihangd/shearedplats-1.3b-v1_eval_request_False_float16_Original.json @@ -0,0 +1 @@ +{"model": "vihangd/shearedplats-1.3b-v1", "base_model": "", "revision": "main", "private": false, "precision": "float16", "weight_type": "Original", "status": "FINISHED", "submitted_time": "2023-11-16T04:53:15Z", "model_type": "\u2b55 : instruction-tuned", "likes": 0, "params": 1.3, "license": "llama2", "job_id": "698112"} \ No newline at end of file diff --git a/eval-queue/vihangd/shearedplats-2.7b-v1_eval_request_False_float16_Original.json b/eval-queue/vihangd/shearedplats-2.7b-v1_eval_request_False_float16_Original.json new file mode 100644 index 
0000000000000000000000000000000000000000..b892fc3850ebad89f9774f3a3201f1ceadccdc57 --- /dev/null +++ b/eval-queue/vihangd/shearedplats-2.7b-v1_eval_request_False_float16_Original.json @@ -0,0 +1 @@ +{"model": "vihangd/shearedplats-2.7b-v1", "base_model": "", "revision": "main", "private": false, "precision": "float16", "weight_type": "Original", "status": "FAILED", "submitted_time": "2023-11-11T06:27:44Z", "model_type": "\u2b55 : instruction-tuned", "likes": 0, "params": 2.7, "license": "llama2", "job_id": "650380"} \ No newline at end of file diff --git a/eval-queue/vihangd/shearedplats-2.7b-v2_eval_request_False_float16_Original.json b/eval-queue/vihangd/shearedplats-2.7b-v2_eval_request_False_float16_Original.json new file mode 100644 index 0000000000000000000000000000000000000000..9284eb170ae7fe66c6cc89afadd913f32ad9177a --- /dev/null +++ b/eval-queue/vihangd/shearedplats-2.7b-v2_eval_request_False_float16_Original.json @@ -0,0 +1 @@ +{"model": "vihangd/shearedplats-2.7b-v2", "base_model": "", "revision": "main", "private": false, "precision": "float16", "weight_type": "Original", "status": "FINISHED", "submitted_time": "2023-11-18T04:17:42Z", "model_type": "\u2b55 : instruction-tuned", "likes": 0, "params": 2.7, "license": "llama2", "job_id": "702691"} \ No newline at end of file diff --git a/eval-queue/vihangd/smartyplats-1.1b-v1_eval_request_False_float16_Original.json b/eval-queue/vihangd/smartyplats-1.1b-v1_eval_request_False_float16_Original.json new file mode 100644 index 0000000000000000000000000000000000000000..54090142002a14334c6b9af05e7c716e102bd7cc --- /dev/null +++ b/eval-queue/vihangd/smartyplats-1.1b-v1_eval_request_False_float16_Original.json @@ -0,0 +1 @@ +{"model": "vihangd/smartyplats-1.1b-v1", "base_model": "", "revision": "main", "private": false, "precision": "float16", "weight_type": "Original", "status": "FAILED", "submitted_time": "2023-11-10T04:30:44Z", "model_type": "\u2b55 : instruction-tuned", "likes": 0, "params": 1.1, "license": "apache-2.0", "job_id": "650337"} \ No newline at end of file diff --git a/eval-queue/vihangd/smartyplats-1.1b-v2_eval_request_False_float16_Original.json b/eval-queue/vihangd/smartyplats-1.1b-v2_eval_request_False_float16_Original.json new file mode 100644 index 0000000000000000000000000000000000000000..af6702360ddccda527b9916d5d3da5ffbfa420d7 --- /dev/null +++ b/eval-queue/vihangd/smartyplats-1.1b-v2_eval_request_False_float16_Original.json @@ -0,0 +1,16 @@ +{ + "model": "vihangd/smartyplats-1.1b-v2", + "base_model": "", + "revision": "main", + "private": false, + "precision": "float16", + "weight_type": "Original", + "status": "FAILED", + "submitted_time": "2023-11-24T02:55:28Z", + "model_type": "\u2b55 : instruction-tuned", + "likes": 0, + "params": 1.1, + "license": "apache-2.0", + "job_id": "800063", + "job_start_time": "2023-11-27T13:18:04.116568" +} \ No newline at end of file diff --git a/eval-queue/vihangd/smartyplats-3b-v1_eval_request_False_float16_Original.json b/eval-queue/vihangd/smartyplats-3b-v1_eval_request_False_float16_Original.json new file mode 100644 index 0000000000000000000000000000000000000000..43fdf84e7a91968e3ea846e960aac31381bfa192 --- /dev/null +++ b/eval-queue/vihangd/smartyplats-3b-v1_eval_request_False_float16_Original.json @@ -0,0 +1 @@ +{"model": "vihangd/smartyplats-3b-v1", "base_model": "", "revision": "main", "private": false, "precision": "float16", "weight_type": "Original", "status": "FINISHED", "submitted_time": "2023-10-16T12:46:18Z", "model_type": "\u2b55 : instruction-tuned", "job_id": 
"513849", "license": "apache-2.0", "likes": 0, "params": 3.324} \ No newline at end of file diff --git a/eval-queue/vihangd/smartyplats-3b-v2_eval_request_False_float16_Original.json b/eval-queue/vihangd/smartyplats-3b-v2_eval_request_False_float16_Original.json new file mode 100644 index 0000000000000000000000000000000000000000..4a35f9c028cc2c7e101d7e1db6d9295b74da8684 --- /dev/null +++ b/eval-queue/vihangd/smartyplats-3b-v2_eval_request_False_float16_Original.json @@ -0,0 +1 @@ +{"model": "vihangd/smartyplats-3b-v2", "base_model": "", "revision": "main", "private": false, "precision": "float16", "weight_type": "Original", "status": "FINISHED", "submitted_time": "2023-10-16T13:19:55Z", "model_type": "\u2b55 : instruction-tuned", "job_id": "522682", "license": "apache-2.0", "likes": 0, "params": 3.324} \ No newline at end of file diff --git a/eval-queue/vihangd/smartyplats-7b-v1_eval_request_False_float16_Original.json b/eval-queue/vihangd/smartyplats-7b-v1_eval_request_False_float16_Original.json new file mode 100644 index 0000000000000000000000000000000000000000..fc2c032dabb883ed0c07ba321da92209394223f1 --- /dev/null +++ b/eval-queue/vihangd/smartyplats-7b-v1_eval_request_False_float16_Original.json @@ -0,0 +1 @@ +{"model": "vihangd/smartyplats-7b-v1", "base_model": "", "revision": "main", "private": false, "precision": "float16", "weight_type": "Original", "status": "FAILED", "submitted_time": "2023-10-27T10:45:07Z", "model_type": "\u2b55 : instruction-tuned", "likes": 0, "params": 7.0, "license": "apache-2.0", "job_id": "649736"} \ No newline at end of file diff --git a/eval-queue/vihangd/smartyplats-7b-v2_eval_request_False_float16_Original.json b/eval-queue/vihangd/smartyplats-7b-v2_eval_request_False_float16_Original.json new file mode 100644 index 0000000000000000000000000000000000000000..bdadcd4901619bc9b93772aceee4602b53b115fe --- /dev/null +++ b/eval-queue/vihangd/smartyplats-7b-v2_eval_request_False_float16_Original.json @@ -0,0 +1,16 @@ +{ + "model": "vihangd/smartyplats-7b-v2", + "base_model": "", + "revision": "main", + "private": false, + "precision": "float16", + "weight_type": "Original", + "status": "FINISHED", + "submitted_time": "2023-11-23T05:31:51Z", + "model_type": "\u2b55 : instruction-tuned", + "likes": 0, + "params": 7.0, + "license": "apache-2.0", + "job_id": "796150", + "job_start_time": "2023-11-26T12:52:46.818157" +} \ No newline at end of file diff --git a/eval-queue/w95/megachat_eval_request_False_float16_Adapter.json b/eval-queue/w95/megachat_eval_request_False_float16_Adapter.json new file mode 100644 index 0000000000000000000000000000000000000000..39fdf4513a296f3e7a58dee017ed88d94bb5f8f5 --- /dev/null +++ b/eval-queue/w95/megachat_eval_request_False_float16_Adapter.json @@ -0,0 +1 @@ +{"model": "w95/megachat", "base_model": "PY007/TinyLlama-1.1B-Chat-v0.3", "revision": "main", "private": false, "precision": "float16", "weight_type": "Adapter", "status": "FINISHED", "submitted_time": "2023-10-22T20:40:40Z", "model_type": "\u2b55 : instruction-tuned", "likes": 0, "params": 0, "license": "apache-2.0", "job_id": "649276"} \ No newline at end of file diff --git a/eval-queue/wei123602/FINETUNE3_TEST4_eval_request_False_float16_Original.json b/eval-queue/wei123602/FINETUNE3_TEST4_eval_request_False_float16_Original.json new file mode 100644 index 0000000000000000000000000000000000000000..0f1293d752fa095a9a7f1c142cb93f0de1e549e6 --- /dev/null +++ b/eval-queue/wei123602/FINETUNE3_TEST4_eval_request_False_float16_Original.json @@ -0,0 +1 @@ +{"model": 
"wei123602/FINETUNE3_TEST4", "base_model": "", "revision": "main", "private": false, "precision": "float16", "weight_type": "Original", "status": "FINISHED", "submitted_time": "2023-10-16T12:54:17Z", "model_type": "\ud83d\udd36 : fine-tuned", "job_id": "518559", "license": "?", "likes": 0, "params": 12.852} \ No newline at end of file diff --git a/eval-queue/wei123602/Llama-2-13b-FINETUNE4_TEST2_eval_request_False_float16_Original.json b/eval-queue/wei123602/Llama-2-13b-FINETUNE4_TEST2_eval_request_False_float16_Original.json new file mode 100644 index 0000000000000000000000000000000000000000..28f36f0829951c1a5aa3e3606cb4213c28ee8c71 --- /dev/null +++ b/eval-queue/wei123602/Llama-2-13b-FINETUNE4_TEST2_eval_request_False_float16_Original.json @@ -0,0 +1 @@ +{"model": "wei123602/Llama-2-13b-FINETUNE4_TEST2", "base_model": "", "revision": "main", "private": false, "precision": "float16", "weight_type": "Original", "status": "FINISHED", "submitted_time": "2023-10-16T12:48:18Z", "model_type": "\ud83d\udd36 : fine-tuned", "job_id": "517983", "license": "?", "likes": 0, "params": 12.852} \ No newline at end of file diff --git a/eval-queue/wei123602/Llama-2-13b-FINETUNE4_TEST3_eval_request_False_float16_Original.json b/eval-queue/wei123602/Llama-2-13b-FINETUNE4_TEST3_eval_request_False_float16_Original.json new file mode 100644 index 0000000000000000000000000000000000000000..3af872e0352c07ff9538c38503510cd06f42ca57 --- /dev/null +++ b/eval-queue/wei123602/Llama-2-13b-FINETUNE4_TEST3_eval_request_False_float16_Original.json @@ -0,0 +1 @@ +{"model": "wei123602/Llama-2-13b-FINETUNE4_TEST3", "base_model": "", "revision": "main", "private": false, "precision": "float16", "weight_type": "Original", "status": "FINISHED", "submitted_time": "2023-10-16T12:54:17Z", "model_type": "\ud83d\udd36 : fine-tuned", "job_id": "518780", "license": "?", "likes": 0, "params": 12.852} \ No newline at end of file diff --git a/eval-queue/wei123602/Llama-2-13b-FINETUNE4_TEST_eval_request_False_float16_Original.json b/eval-queue/wei123602/Llama-2-13b-FINETUNE4_TEST_eval_request_False_float16_Original.json new file mode 100644 index 0000000000000000000000000000000000000000..25910ea47859d111e50f47d54a9660c030ca4d06 --- /dev/null +++ b/eval-queue/wei123602/Llama-2-13b-FINETUNE4_TEST_eval_request_False_float16_Original.json @@ -0,0 +1 @@ +{"model": "wei123602/Llama-2-13b-FINETUNE4_TEST", "base_model": "", "revision": "main", "private": false, "precision": "float16", "weight_type": "Original", "status": "FINISHED", "submitted_time": "2023-10-16T12:48:18Z", "model_type": "\ud83d\udd36 : fine-tuned", "job_id": "517440", "license": "?", "likes": 0, "params": 12.852} \ No newline at end of file diff --git a/eval-queue/wei123602/Llama-2-13b-FINETUNE4_compare8k2_eval_request_False_float16_Original.json b/eval-queue/wei123602/Llama-2-13b-FINETUNE4_compare8k2_eval_request_False_float16_Original.json new file mode 100644 index 0000000000000000000000000000000000000000..b16ec4f9d0c0b02112a18801493720e419de51b8 --- /dev/null +++ b/eval-queue/wei123602/Llama-2-13b-FINETUNE4_compare8k2_eval_request_False_float16_Original.json @@ -0,0 +1 @@ +{"model": "wei123602/Llama-2-13b-FINETUNE4_compare8k2", "base_model": "", "revision": "main", "private": false, "precision": "float16", "weight_type": "Original", "status": "FINISHED", "submitted_time": "2023-10-16T12:48:18Z", "model_type": "\ud83d\udd36 : fine-tuned", "job_id": "517975", "license": "?", "likes": 0, "params": 12.852} \ No newline at end of file diff --git 
a/eval-queue/wei123602/Llama-2-13b-FINETUNE4_eval_request_False_float16_Original.json b/eval-queue/wei123602/Llama-2-13b-FINETUNE4_eval_request_False_float16_Original.json new file mode 100644 index 0000000000000000000000000000000000000000..42a5868e9e05badeaef8c8c619ef123f10256769 --- /dev/null +++ b/eval-queue/wei123602/Llama-2-13b-FINETUNE4_eval_request_False_float16_Original.json @@ -0,0 +1 @@ +{"model": "wei123602/Llama-2-13b-FINETUNE4", "base_model": "", "revision": "main", "private": false, "precision": "float16", "weight_type": "Original", "status": "FINISHED", "submitted_time": "2023-10-16T12:46:18Z", "model_type": "\ud83d\udd36 : fine-tuned", "job_id": "513851", "license": "?", "likes": 0, "params": 12.852} \ No newline at end of file diff --git a/eval-queue/wei123602/llama-13b-FINETUNE3_eval_request_False_float16_Original.json b/eval-queue/wei123602/llama-13b-FINETUNE3_eval_request_False_float16_Original.json new file mode 100644 index 0000000000000000000000000000000000000000..dbe7933ad9d0bcc8f97dde2df30c7f600dc56e9d --- /dev/null +++ b/eval-queue/wei123602/llama-13b-FINETUNE3_eval_request_False_float16_Original.json @@ -0,0 +1 @@ +{"model": "wei123602/llama-13b-FINETUNE3", "base_model": "", "revision": "main", "private": false, "precision": "float16", "weight_type": "Original", "status": "FINISHED", "submitted_time": "2023-10-16T12:48:18Z", "model_type": "\ud83d\udd36 : fine-tuned", "job_id": "517979", "license": "?", "likes": 0, "params": 12.852} \ No newline at end of file diff --git a/eval-queue/wei123602/llama2-13b-FINETUNE3_TEST2_eval_request_False_float16_Original.json b/eval-queue/wei123602/llama2-13b-FINETUNE3_TEST2_eval_request_False_float16_Original.json new file mode 100644 index 0000000000000000000000000000000000000000..f21215e9c925564d4564ba3be95b828c9bc67adb --- /dev/null +++ b/eval-queue/wei123602/llama2-13b-FINETUNE3_TEST2_eval_request_False_float16_Original.json @@ -0,0 +1 @@ +{"model": "wei123602/llama2-13b-FINETUNE3_TEST2", "base_model": "", "revision": "main", "private": false, "precision": "float16", "weight_type": "Original", "status": "FINISHED", "submitted_time": "2023-10-16T12:58:30Z", "model_type": "\ud83d\udd36 : fine-tuned", "job_id": "522175", "license": "?", "likes": 0, "params": 12.852} \ No newline at end of file diff --git a/eval-queue/wei123602/llama2-13b-FINETUNE3_TEST_eval_request_False_float16_Original.json b/eval-queue/wei123602/llama2-13b-FINETUNE3_TEST_eval_request_False_float16_Original.json new file mode 100644 index 0000000000000000000000000000000000000000..02d22b5df8777595485fe15cc36348e42994047b --- /dev/null +++ b/eval-queue/wei123602/llama2-13b-FINETUNE3_TEST_eval_request_False_float16_Original.json @@ -0,0 +1 @@ +{"model": "wei123602/llama2-13b-FINETUNE3_TEST", "base_model": "", "revision": "main", "private": false, "precision": "float16", "weight_type": "Original", "status": "FINISHED", "submitted_time": "2023-10-16T12:48:18Z", "model_type": "\ud83d\udd36 : fine-tuned", "job_id": "517041", "license": "?", "likes": 0, "params": 12.852} \ No newline at end of file diff --git a/eval-queue/wei123602/llama2-13b-fintune2-4E_eval_request_False_float16_Original.json b/eval-queue/wei123602/llama2-13b-fintune2-4E_eval_request_False_float16_Original.json new file mode 100644 index 0000000000000000000000000000000000000000..3a6e89e0bac92f7833bcf4d354ac968a58ce2fb5 --- /dev/null +++ b/eval-queue/wei123602/llama2-13b-fintune2-4E_eval_request_False_float16_Original.json @@ -0,0 +1 @@ +{"model": "wei123602/llama2-13b-fintune2-4E", "base_model": 
"", "revision": "main", "private": false, "precision": "float16", "weight_type": "Original", "status": "FINISHED", "submitted_time": "2023-10-16T12:46:18Z", "model_type": "\ud83d\udd36 : fine-tuned", "job_id": "514312", "license": "?", "likes": 0, "params": 12.852} \ No newline at end of file diff --git a/eval-queue/wei123602/llama2-13b-fintune2_eval_request_False_8bit_Original.json b/eval-queue/wei123602/llama2-13b-fintune2_eval_request_False_8bit_Original.json new file mode 100644 index 0000000000000000000000000000000000000000..98a66b0f404b57a07bf29ae5d0c1c3d1c991cff3 --- /dev/null +++ b/eval-queue/wei123602/llama2-13b-fintune2_eval_request_False_8bit_Original.json @@ -0,0 +1 @@ +{"model": "wei123602/llama2-13b-fintune2", "base_model": "wei123602/llama2-13b-fintune2", "revision": "main", "private": false, "precision": "8bit", "weight_type": "Original", "status": "FAILED", "submitted_time": "2023-09-05T02:35:00Z", "model_type": "\ud83d\udd36 : fine-tuned", "job_id": "430849", "license": "llama2", "likes": 0, "params": 12.852} \ No newline at end of file diff --git a/eval-queue/wei123602/llama2-13b-fintune2_eval_request_False_float16_Adapter.json b/eval-queue/wei123602/llama2-13b-fintune2_eval_request_False_float16_Adapter.json new file mode 100644 index 0000000000000000000000000000000000000000..a4f3cbc780208bccc072c6f6b9ccce86f7212aeb --- /dev/null +++ b/eval-queue/wei123602/llama2-13b-fintune2_eval_request_False_float16_Adapter.json @@ -0,0 +1 @@ +{"model": "wei123602/llama2-13b-fintune2", "base_model": "wei123602/llama2-13b-fintune2", "revision": "main", "private": false, "precision": "float16", "weight_type": "Adapter", "status": "FAILED", "submitted_time": "2023-09-09T10:38:12Z", "model_type": "\ud83d\udd36 : fine-tuned", "job_id": "439910", "license": "llama2", "likes": 0, "params": 12.852} \ No newline at end of file diff --git a/eval-queue/wordcab/llama-natural-instructions-13b_eval_request_False_False_False.json b/eval-queue/wordcab/llama-natural-instructions-13b_eval_request_False_False_False.json new file mode 100644 index 0000000000000000000000000000000000000000..672f042508acb6238e03b1b00968cbeaa65b10ac --- /dev/null +++ b/eval-queue/wordcab/llama-natural-instructions-13b_eval_request_False_False_False.json @@ -0,0 +1 @@ +{"model": "wordcab/llama-natural-instructions-13b", "base_model": "", "revision": "main", "private": false, "status": "FAILED_2", "job_id": "177111", "weight_type": "Original", "precision": "float16", "license": "?", "likes": 7, "params": 12.852} \ No newline at end of file diff --git a/eval-queue/wtang06/mpt-125m-c4_eval_request_False_float16_Original.json b/eval-queue/wtang06/mpt-125m-c4_eval_request_False_float16_Original.json new file mode 100644 index 0000000000000000000000000000000000000000..c1ef5028da60aad22e79ca01495b505bd7852e3b --- /dev/null +++ b/eval-queue/wtang06/mpt-125m-c4_eval_request_False_float16_Original.json @@ -0,0 +1 @@ +{"model": "wtang06/mpt-125m-c4", "base_model": "", "revision": "main", "private": false, "precision": "float16", "weight_type": "Original", "status": "FINISHED", "submitted_time": "2023-10-16T13:19:55Z", "model_type": "\ud83d\udfe2 : pretrained", "job_id": "522841", "license": "apache-2.0", "likes": 1, "params": 0.124} \ No newline at end of file diff --git a/eval-queue/wxl_eval_request_False_float16_Adapter.json b/eval-queue/wxl_eval_request_False_float16_Adapter.json new file mode 100644 index 0000000000000000000000000000000000000000..6db46e6373c2291e0a41ef4cdf10f6f5334e45b6 --- /dev/null +++ 
b/eval-queue/wxl_eval_request_False_float16_Adapter.json @@ -0,0 +1 @@ +{"model": "wxl", "base_model": "stabilityai/stablelm-base-alpha-3b", "revision": "main", "private": false, "precision": "float16", "weight_type": "Adapter", "status": "FAILED", "submitted_time": "2023-07-31T23:30:23Z", "model_type": "fine-tuned", "job_id": "338563", "params": 0, "license": "?", "likes": 0} \ No newline at end of file diff --git a/eval-queue/yulan-team/YuLan-Chat-2-13b-fp16_eval_request_False_float16_Original.json b/eval-queue/yulan-team/YuLan-Chat-2-13b-fp16_eval_request_False_float16_Original.json new file mode 100644 index 0000000000000000000000000000000000000000..3fea741c24b0a3990ccd77b833286f85b19d6056 --- /dev/null +++ b/eval-queue/yulan-team/YuLan-Chat-2-13b-fp16_eval_request_False_float16_Original.json @@ -0,0 +1 @@ +{"model": "yulan-team/YuLan-Chat-2-13b-fp16", "base_model": "", "revision": "main", "private": false, "precision": "float16", "weight_type": "Original", "status": "FINISHED", "submitted_time": "2023-09-09T10:52:17Z", "model_type": "\u2b55 : instruction-tuned", "job_id": "461922", "license": "mit", "likes": 7, "params": 12.95} \ No newline at end of file diff --git a/eval-queue/zyh3826/20231206094523-pretrain-Llama-2-13b-hf-76000_eval_request_False_bfloat16_Original.json b/eval-queue/zyh3826/20231206094523-pretrain-Llama-2-13b-hf-76000_eval_request_False_bfloat16_Original.json new file mode 100644 index 0000000000000000000000000000000000000000..e8776e1cfb8c280b2d8890d38566576a9f562681 --- /dev/null +++ b/eval-queue/zyh3826/20231206094523-pretrain-Llama-2-13b-hf-76000_eval_request_False_bfloat16_Original.json @@ -0,0 +1,16 @@ +{ + "model": "zyh3826/20231206094523-pretrain-Llama-2-13b-hf-76000", + "base_model": "llama2", + "revision": "main", + "private": false, + "precision": "bfloat16", + "weight_type": "Original", + "status": "FINISHED", + "submitted_time": "2023-12-14T02:54:25Z", + "model_type": "\ud83d\udd36 : fine-tuned", + "likes": 0, + "params": 13.254, + "license": "llama2", + "job_id": "924605", + "job_start_time": "2023-12-16T12:09:45.559549" +} \ No newline at end of file diff --git a/eval-queue/zyh3826/llama2-13b-ft-openllm-leaderboard-v1_eval_request_False_bfloat16_Original.json b/eval-queue/zyh3826/llama2-13b-ft-openllm-leaderboard-v1_eval_request_False_bfloat16_Original.json new file mode 100644 index 0000000000000000000000000000000000000000..11167985d24e8dc2146f9ac200a9138864f5a842 --- /dev/null +++ b/eval-queue/zyh3826/llama2-13b-ft-openllm-leaderboard-v1_eval_request_False_bfloat16_Original.json @@ -0,0 +1 @@ +{"model": "zyh3826/llama2-13b-ft-openllm-leaderboard-v1", "base_model": "llama2-13b", "revision": "main", "private": false, "precision": "bfloat16", "weight_type": "Original", "status": "FAILED", "submitted_time": "2023-11-01T09:44:10Z", "model_type": "\ud83d\udd36 : fine-tuned", "likes": 0, "params": 13.0, "license": "llama2", "job_id": "650136"} \ No newline at end of file diff --git a/eval-queue/zyh3826/llama2-13b-ft-openllm-leaderboard-v1_eval_request_False_float16_Original.json b/eval-queue/zyh3826/llama2-13b-ft-openllm-leaderboard-v1_eval_request_False_float16_Original.json new file mode 100644 index 0000000000000000000000000000000000000000..6b85ceed6b318412d077abfc4541b50e9ee1e270 --- /dev/null +++ b/eval-queue/zyh3826/llama2-13b-ft-openllm-leaderboard-v1_eval_request_False_float16_Original.json @@ -0,0 +1,16 @@ +{ + "model": "zyh3826/llama2-13b-ft-openllm-leaderboard-v1", + "base_model": "", + "revision": "main", + "private": false, + "precision": 
"float16", + "weight_type": "Original", + "status": "FINISHED", + "submitted_time": "2023-12-07T03:44:45Z", + "model_type": [], + "likes": 0, + "params": 13.016, + "license": "llama2", + "job_id": "874825", + "job_start_time": "2023-12-09T13:23:37.150018" +} \ No newline at end of file diff --git a/eval-results/AlekseyKorshuk/chatml-pyg-v1/results_2023-07-18T19-38-34.758007.json b/eval-results/AlekseyKorshuk/chatml-pyg-v1/results_2023-07-18T19-38-34.758007.json new file mode 100644 index 0000000000000000000000000000000000000000..7facac484b0c386b8a853d22771e8a7c8a15cfe0 --- /dev/null +++ b/eval-results/AlekseyKorshuk/chatml-pyg-v1/results_2023-07-18T19-38-34.758007.json @@ -0,0 +1,871 @@ +{ + "results": { + "harness|arc:challenge|25": { + "acc": 0.3395904436860068, + "acc_stderr": 0.01383903976282016, + "acc_norm": 0.378839590443686, + "acc_norm_stderr": 0.014175915490000322 + }, + "harness|hellaswag|10": { + "acc": 0.4722166899024099, + "acc_stderr": 0.004982072108448082, + "acc_norm": 0.6329416450906195, + "acc_norm_stderr": 0.004810175357870944 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.25, + "acc_stderr": 0.04351941398892446, + "acc_norm": 0.25, + "acc_norm_stderr": 0.04351941398892446 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.35555555555555557, + "acc_stderr": 0.04135176749720386, + "acc_norm": 0.35555555555555557, + "acc_norm_stderr": 0.04135176749720386 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.2631578947368421, + "acc_stderr": 0.03583496176361063, + "acc_norm": 0.2631578947368421, + "acc_norm_stderr": 0.03583496176361063 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.37, + "acc_stderr": 0.048523658709391, + "acc_norm": 0.37, + "acc_norm_stderr": 0.048523658709391 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.35471698113207545, + "acc_stderr": 0.02944517532819958, + "acc_norm": 0.35471698113207545, + "acc_norm_stderr": 0.02944517532819958 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.3611111111111111, + "acc_stderr": 0.040166600304512336, + "acc_norm": 0.3611111111111111, + "acc_norm_stderr": 0.040166600304512336 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.22, + "acc_stderr": 0.04163331998932269, + "acc_norm": 0.22, + "acc_norm_stderr": 0.04163331998932269 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.3, + "acc_stderr": 0.046056618647183814, + "acc_norm": 0.3, + "acc_norm_stderr": 0.046056618647183814 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.26, + "acc_stderr": 0.04408440022768078, + "acc_norm": 0.26, + "acc_norm_stderr": 0.04408440022768078 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.28901734104046245, + "acc_stderr": 0.034564257450869995, + "acc_norm": 0.28901734104046245, + "acc_norm_stderr": 0.034564257450869995 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.20588235294117646, + "acc_stderr": 0.040233822736177455, + "acc_norm": 0.20588235294117646, + "acc_norm_stderr": 0.040233822736177455 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.46, + "acc_stderr": 0.05009082659620333, + "acc_norm": 0.46, + "acc_norm_stderr": 0.05009082659620333 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.3148936170212766, + "acc_stderr": 0.030363582197238167, + "acc_norm": 0.3148936170212766, + "acc_norm_stderr": 0.030363582197238167 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.21052631578947367, + "acc_stderr": 0.0383515395439942, + 
"acc_norm": 0.21052631578947367, + "acc_norm_stderr": 0.0383515395439942 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.3103448275862069, + "acc_stderr": 0.03855289616378949, + "acc_norm": 0.3103448275862069, + "acc_norm_stderr": 0.03855289616378949 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.24603174603174602, + "acc_stderr": 0.02218203720294836, + "acc_norm": 0.24603174603174602, + "acc_norm_stderr": 0.02218203720294836 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.21428571428571427, + "acc_stderr": 0.03670066451047182, + "acc_norm": 0.21428571428571427, + "acc_norm_stderr": 0.03670066451047182 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.21, + "acc_stderr": 0.040936018074033256, + "acc_norm": 0.21, + "acc_norm_stderr": 0.040936018074033256 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.38064516129032255, + "acc_stderr": 0.027621717832907036, + "acc_norm": 0.38064516129032255, + "acc_norm_stderr": 0.027621717832907036 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.2660098522167488, + "acc_stderr": 0.03108982600293752, + "acc_norm": 0.2660098522167488, + "acc_norm_stderr": 0.03108982600293752 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.35, + "acc_stderr": 0.0479372485441102, + "acc_norm": 0.35, + "acc_norm_stderr": 0.0479372485441102 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.38181818181818183, + "acc_stderr": 0.037937131711656344, + "acc_norm": 0.38181818181818183, + "acc_norm_stderr": 0.037937131711656344 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.37373737373737376, + "acc_stderr": 0.03446897738659333, + "acc_norm": 0.37373737373737376, + "acc_norm_stderr": 0.03446897738659333 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.3471502590673575, + "acc_stderr": 0.03435696168361356, + "acc_norm": 0.3471502590673575, + "acc_norm_stderr": 0.03435696168361356 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.3282051282051282, + "acc_stderr": 0.023807633198657273, + "acc_norm": 0.3282051282051282, + "acc_norm_stderr": 0.023807633198657273 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.26666666666666666, + "acc_stderr": 0.02696242432507382, + "acc_norm": 0.26666666666666666, + "acc_norm_stderr": 0.02696242432507382 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.3235294117647059, + "acc_stderr": 0.030388353551886845, + "acc_norm": 0.3235294117647059, + "acc_norm_stderr": 0.030388353551886845 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.2913907284768212, + "acc_stderr": 0.03710185726119995, + "acc_norm": 0.2913907284768212, + "acc_norm_stderr": 0.03710185726119995 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.3284403669724771, + "acc_stderr": 0.02013590279729839, + "acc_norm": 0.3284403669724771, + "acc_norm_stderr": 0.02013590279729839 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.2037037037037037, + "acc_stderr": 0.027467401804058, + "acc_norm": 0.2037037037037037, + "acc_norm_stderr": 0.027467401804058 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.4215686274509804, + "acc_stderr": 0.03465868196380758, + "acc_norm": 0.4215686274509804, + "acc_norm_stderr": 0.03465868196380758 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.4008438818565401, + "acc_stderr": 
0.031900803894732356, + "acc_norm": 0.4008438818565401, + "acc_norm_stderr": 0.031900803894732356 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.4080717488789238, + "acc_stderr": 0.03298574607842821, + "acc_norm": 0.4080717488789238, + "acc_norm_stderr": 0.03298574607842821 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.366412213740458, + "acc_stderr": 0.042258754519696386, + "acc_norm": 0.366412213740458, + "acc_norm_stderr": 0.042258754519696386 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.4049586776859504, + "acc_stderr": 0.044811377559424694, + "acc_norm": 0.4049586776859504, + "acc_norm_stderr": 0.044811377559424694 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.4074074074074074, + "acc_stderr": 0.04750077341199986, + "acc_norm": 0.4074074074074074, + "acc_norm_stderr": 0.04750077341199986 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.4233128834355828, + "acc_stderr": 0.03881891213334382, + "acc_norm": 0.4233128834355828, + "acc_norm_stderr": 0.03881891213334382 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.33035714285714285, + "acc_stderr": 0.04464285714285713, + "acc_norm": 0.33035714285714285, + "acc_norm_stderr": 0.04464285714285713 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.3592233009708738, + "acc_stderr": 0.047504583990416925, + "acc_norm": 0.3592233009708738, + "acc_norm_stderr": 0.047504583990416925 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.4188034188034188, + "acc_stderr": 0.03232128912157792, + "acc_norm": 0.4188034188034188, + "acc_norm_stderr": 0.03232128912157792 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.39, + "acc_stderr": 0.04902071300001975, + "acc_norm": 0.39, + "acc_norm_stderr": 0.04902071300001975 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.37420178799489145, + "acc_stderr": 0.017304805072252037, + "acc_norm": 0.37420178799489145, + "acc_norm_stderr": 0.017304805072252037 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.34971098265895956, + "acc_stderr": 0.025674281456531032, + "acc_norm": 0.34971098265895956, + "acc_norm_stderr": 0.025674281456531032 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.25139664804469275, + "acc_stderr": 0.01450897945355399, + "acc_norm": 0.25139664804469275, + "acc_norm_stderr": 0.01450897945355399 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.3464052287581699, + "acc_stderr": 0.02724561304721535, + "acc_norm": 0.3464052287581699, + "acc_norm_stderr": 0.02724561304721535 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.3022508038585209, + "acc_stderr": 0.02608270069539965, + "acc_norm": 0.3022508038585209, + "acc_norm_stderr": 0.02608270069539965 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.30246913580246915, + "acc_stderr": 0.025557653981868062, + "acc_norm": 0.30246913580246915, + "acc_norm_stderr": 0.025557653981868062 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.2730496453900709, + "acc_stderr": 0.02657786094330785, + "acc_norm": 0.2730496453900709, + "acc_norm_stderr": 0.02657786094330785 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.3220338983050847, + "acc_stderr": 0.011933936071891098, + "acc_norm": 0.3220338983050847, + "acc_norm_stderr": 0.011933936071891098 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.2610294117647059, + "acc_stderr": 0.026679252270103124, + "acc_norm": 0.2610294117647059, + "acc_norm_stderr": 0.026679252270103124 + 
}, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.29901960784313725, + "acc_stderr": 0.018521756215423024, + "acc_norm": 0.29901960784313725, + "acc_norm_stderr": 0.018521756215423024 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.42727272727272725, + "acc_stderr": 0.04738198703545483, + "acc_norm": 0.42727272727272725, + "acc_norm_stderr": 0.04738198703545483 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.3224489795918367, + "acc_stderr": 0.029923100563683906, + "acc_norm": 0.3224489795918367, + "acc_norm_stderr": 0.029923100563683906 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.36318407960199006, + "acc_stderr": 0.034005985055990146, + "acc_norm": 0.36318407960199006, + "acc_norm_stderr": 0.034005985055990146 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.39, + "acc_stderr": 0.04902071300001974, + "acc_norm": 0.39, + "acc_norm_stderr": 0.04902071300001974 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.3433734939759036, + "acc_stderr": 0.03696584317010601, + "acc_norm": 0.3433734939759036, + "acc_norm_stderr": 0.03696584317010601 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.38596491228070173, + "acc_stderr": 0.03733756969066164, + "acc_norm": 0.38596491228070173, + "acc_norm_stderr": 0.03733756969066164 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.2668298653610771, + "mc1_stderr": 0.015483691939237265, + "mc2": 0.4261047240960072, + "mc2_stderr": 0.014497158431106898 + }, + "all": { + "acc": 0.33039657747407947, + "acc_stderr": 0.03392940066852172, + "acc_norm": 0.33378596903248436, + "acc_norm_stderr": 0.033932196922362455, + "mc1": 0.2668298653610771, + "mc1_stderr": 0.015483691939237265, + "mc2": 0.4261047240960072, + "mc2_stderr": 0.014497158431106898 + } + }, + "versions": { + "harness|arc:challenge|25": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + 
"harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "all": 0 + }, + "config": { + "model_name": "AlekseyKorshuk/chatml-pyg-v1", + "model_sha": "79d5a4d53953ca1c26bc2155f168b7e2108f377f", + "model_dtype": "torch.float16", + "lighteval_sha": "43cff840721bd0214adb4e29236a5e2ca1813937", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null + }, + "task_config": { + "harness|arc:challenge": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": 
"LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task" + }, + "hashes": { + "harness|arc:challenge|25": { + "hash_examples": "fb8c51b1872daeda", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "1b78325b154497a6", + "hash_cont_tokens": "ed17e576dbafa5da" + }, + "harness|hellaswag|10": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "97de5fb5652ec7fa", + "hash_cont_tokens": "0875c25c8fc0a94d" + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "38f6980885e34dfd", + "hash_cont_tokens": "844bd0bf669e8136" + }, + "harness|hendrycksTest-anatomy|5": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "3ed9431cd09b2a53", + "hash_cont_tokens": "aa3ffb1a6e4356f5" + }, + "harness|hendrycksTest-astronomy|5": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "a79fd75ecff4dacc", + "hash_cont_tokens": "18cfffb76bc8f0d1" + }, + "harness|hendrycksTest-business_ethics|5": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "178d5666661bf5e1", + "hash_cont_tokens": "844bd0bf669e8136" + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "c926698f7ff06973", + "hash_cont_tokens": "cd61f7de0830a75a" + }, + "harness|hendrycksTest-college_biology|5": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "242f772c5e78312a", + 
"hash_cont_tokens": "16b3626c8a5e3797" + }, + "harness|hendrycksTest-college_chemistry|5": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "8502d8627d2d7aad", + "hash_cont_tokens": "844bd0bf669e8136" + }, + "harness|hendrycksTest-college_computer_science|5": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "8bf46ce3a98e6e3f", + "hash_cont_tokens": "844bd0bf669e8136" + }, + "harness|hendrycksTest-college_mathematics|5": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "ff09ef7f164943cd", + "hash_cont_tokens": "844bd0bf669e8136" + }, + "harness|hendrycksTest-college_medicine|5": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "af38d1bbc0517ac5", + "hash_cont_tokens": "62bb469d2a319d91" + }, + "harness|hendrycksTest-college_physics|5": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "c4240f372187f487", + "hash_cont_tokens": "bf103c9a1f61ec12" + }, + "harness|hendrycksTest-computer_security|5": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "70a866a1c6ae11ae", + "hash_cont_tokens": "844bd0bf669e8136" + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "29b68a5b3f3afa5f", + "hash_cont_tokens": "ff5ca3d84bb47a0b" + }, + "harness|hendrycksTest-econometrics|5": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "a4a0fc579875cdf9", + "hash_cont_tokens": "21f0989f5760198a" + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "e1c0ec634eb17ebd", + "hash_cont_tokens": "35bf6c0c1a7ee403" + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "542453ad0f99dacf", + "hash_cont_tokens": "f7d801bfd913884d" + }, + "harness|hendrycksTest-formal_logic|5": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "dacff0458f665ef2", + "hash_cont_tokens": "23f9089575432d5a" + }, + "harness|hendrycksTest-global_facts|5": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "61dec75d557c2e93", + "hash_cont_tokens": "844bd0bf669e8136" + }, + "harness|hendrycksTest-high_school_biology|5": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "d0afdf91820cacc8", + "hash_cont_tokens": "04b8293f2ab7fbbf" + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "75cd47b5490da17b", + "hash_cont_tokens": "c3deabee1deab3a3" + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "6c6256000dbf914a", + "hash_cont_tokens": "844bd0bf669e8136" + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "3e24478a8854bd77", + 
"hash_cont_tokens": "c4f2565ca36881d5" + }, + "harness|hendrycksTest-high_school_geography|5": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "a4866b51f8a7a60e", + "hash_cont_tokens": "780e569058de22be" + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "90f755f89d9fdf5e", + "hash_cont_tokens": "7994d94bfa36d003" + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "fb590ff6d9d11883", + "hash_cont_tokens": "8f5c8baf02161f10" + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "551dbc75535ad2b8", + "hash_cont_tokens": "a2c91752be5b1798" + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "d86fdf5706ec717c", + "hash_cont_tokens": "985403b262df21a4" + }, + "harness|hendrycksTest-high_school_physics|5": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "a81bca26abd92c41", + "hash_cont_tokens": "db71da66ed82b921" + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "9c10077b5cda495b", + "hash_cont_tokens": "e81cf9738ad7e157" + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "092923836e135996", + "hash_cont_tokens": "4a2d5f00cb00d9b7" + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "4ab213491f557f31", + "hash_cont_tokens": "eab825cf8fbdd085" + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "2a04fb615e6717ea", + "hash_cont_tokens": "e9bcfaa6beefb456" + }, + "harness|hendrycksTest-human_aging|5": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "39da19ee58ce07e6", + "hash_cont_tokens": "38eafdb22e9fca11" + }, + "harness|hendrycksTest-human_sexuality|5": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "f7e0441ab1c223e0", + "hash_cont_tokens": "11de075f88fc7cd2" + }, + "harness|hendrycksTest-international_law|5": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "119859c5b8103d0b", + "hash_cont_tokens": "6f8215a3de7eebd1" + }, + "harness|hendrycksTest-jurisprudence|5": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "6ec4910e741606cb", + "hash_cont_tokens": "5c77c6f472688075" + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "96d8b2554f777e3a", + "hash_cont_tokens": "25a46284b3589e0d" + }, + "harness|hendrycksTest-machine_learning|5": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + 
"hash_input_tokens": "249811a7d891a411", + "hash_cont_tokens": "aacac708cd4c5a61" + }, + "harness|hendrycksTest-management|5": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "e54df495ffeb4f92", + "hash_cont_tokens": "d37808f586a9e9b5" + }, + "harness|hendrycksTest-marketing|5": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "e9110fe64f420eb5", + "hash_cont_tokens": "95faf210efa02f90" + }, + "harness|hendrycksTest-medical_genetics|5": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "743df5701590c1c5", + "hash_cont_tokens": "844bd0bf669e8136" + }, + "harness|hendrycksTest-miscellaneous|5": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "4a20a40ea36bad2d", + "hash_cont_tokens": "ef1ae838a09a7521" + }, + "harness|hendrycksTest-moral_disputes|5": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "10886977e5516586", + "hash_cont_tokens": "16b6c6e390eb7cea" + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "66f56ab7c3b9d662", + "hash_cont_tokens": "4130880a19c4edb0" + }, + "harness|hendrycksTest-nutrition|5": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "c05c54560499ea35", + "hash_cont_tokens": "96b81f570a84328b" + }, + "harness|hendrycksTest-philosophy|5": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "9639c3d92ff98a28", + "hash_cont_tokens": "dddff9925c9b675a" + }, + "harness|hendrycksTest-prehistory|5": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "91e98834c3a8d8d9", + "hash_cont_tokens": "e3a7592f84b44888" + }, + "harness|hendrycksTest-professional_accounting|5": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "569fa47691c73088", + "hash_cont_tokens": "f9edf462e8201551" + }, + "harness|hendrycksTest-professional_law|5": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "999e8c7cf55b590c", + "hash_cont_tokens": "a2de48df0afbaff7" + }, + "harness|hendrycksTest-professional_medicine|5": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "cb68733b835e69f0", + "hash_cont_tokens": "ecf7754754c2bb76" + }, + "harness|hendrycksTest-professional_psychology|5": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "3aa766c029099569", + "hash_cont_tokens": "30b07e31cf9b5c6f" + }, + "harness|hendrycksTest-public_relations|5": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "87b924f88832986f", + "hash_cont_tokens": "cf3600a50782c6c5" + }, + "harness|hendrycksTest-security_studies|5": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "c2b75c24a925a416", + "hash_cont_tokens": "4d1dc7c4ad251829" + }, + "harness|hendrycksTest-sociology|5": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "fb555df6139eb2c8", + "hash_cont_tokens": 
"d36b9d9f0f4424fe" + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "56cf1eebb25eccb1", + "hash_cont_tokens": "844bd0bf669e8136" + }, + "harness|hendrycksTest-virology|5": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "c6affac16ec860be", + "hash_cont_tokens": "30d4fa4828c5468f" + }, + "harness|hendrycksTest-world_religions|5": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "d2c5da5a69a6312e", + "hash_cont_tokens": "a0a7af55ac7ae037" + }, + "harness|truthfulqa:mc|0": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "21ee2f46c9c3649e", + "hash_cont_tokens": "84fd36aa004c8578" + } + } +} \ No newline at end of file diff --git a/eval-results/AlekseyKorshuk/chatml-pyg-v1/results_2023-09-16T21-00-54.207494.json b/eval-results/AlekseyKorshuk/chatml-pyg-v1/results_2023-09-16T21-00-54.207494.json new file mode 100644 index 0000000000000000000000000000000000000000..195a7f5743f414687f3ec35bc5f23c6b4036d554 --- /dev/null +++ b/eval-results/AlekseyKorshuk/chatml-pyg-v1/results_2023-09-16T21-00-54.207494.json @@ -0,0 +1,107 @@ +{ + "config_general": { + "model_name": "AlekseyKorshuk/chatml-pyg-v1", + "model_sha": "79d5a4d53953ca1c26bc2155f168b7e2108f377f", + "model_size": "11.28 GB", + "model_dtype": "torch.float16", + "lighteval_sha": "0f318ecf002208468154899217b3ba7c6ae09374", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|drop|3": { + "em": 0.06354865771812081, + "em_stderr": 0.0024982474364717406, + "f1": 0.11724203020134202, + "f1_stderr": 0.0027033976138729605 + }, + "harness|gsm8k|5": { + "acc": 0.05155420773313116, + "acc_stderr": 0.006090887955262826 + }, + "harness|winogrande|5": { + "acc": 0.6250986582478295, + "acc_stderr": 0.013605544523788001 + }, + "all": { + "em": 0.06354865771812081, + "em_stderr": 0.0024982474364717406, + "f1": 0.11724203020134202, + "f1_stderr": 0.0027033976138729605, + "acc": 0.3383264329904803, + "acc_stderr": 0.009848216239525413 + } + }, + "versions": { + "harness|drop|3": 1, + "harness|gsm8k|5": 0, + "harness|winogrande|5": 0, + "all": 0 + }, + "config_tasks": { + "harness|drop": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|drop|3": { + "hashes": { + "hash_examples": "1d27416e8324e9a3", + "hash_full_prompts": "a5513ff9a741b385", + "hash_input_tokens": "f21277d2c2d2e06c", + "hash_cont_tokens": "f023fdba4b82f2e9" + }, + "truncated": 382, + "non-truncated": 9154, + "padded": 0, + "non-padded": 9536, + "effective_few_shots": 3.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "3ab9b4c5105492a3", + "hash_cont_tokens": "4bce745050ba8e39" + }, + "truncated": 0, + "non-truncated": 1319, + "padded": 0, + "non-padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "84cacac1590bb0a5", + "hash_cont_tokens": "64ca3ed9b5dacc6e" + }, + "truncated": 0, + "non-truncated": 2534, + "padded": 2426, + 
"non-padded": 108, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "9b4d8993161e637d", + "hash_full_prompts": "08215e527b7e60a5", + "hash_input_tokens": "26cd3631535039d0", + "hash_cont_tokens": "4755f6f6f07d191b" + }, + "total_evaluation_time_secondes": "12960.945364236832", + "truncated": 382, + "non-truncated": 13007, + "padded": 2426, + "non-padded": 10963, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/AlekseyKorshuk/pygmalion-6b-vicuna-chatml/results_2023-07-24T10-58-39.640665.json b/eval-results/AlekseyKorshuk/pygmalion-6b-vicuna-chatml/results_2023-07-24T10-58-39.640665.json new file mode 100644 index 0000000000000000000000000000000000000000..9be3e9c29f3e6740d38965f958a76ec9235b728a --- /dev/null +++ b/eval-results/AlekseyKorshuk/pygmalion-6b-vicuna-chatml/results_2023-07-24T10-58-39.640665.json @@ -0,0 +1,1365 @@ +{ + "results": { + "harness|arc:challenge|25": { + "acc": 0.3720136518771331, + "acc_stderr": 0.014124597881844461, + "acc_norm": 0.4061433447098976, + "acc_norm_stderr": 0.01435165669009786 + }, + "harness|hellaswag|10": { + "acc": 0.4963154750049791, + "acc_stderr": 0.004989645929811438, + "acc_norm": 0.6772555267874926, + "acc_norm_stderr": 0.004665704208339031 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.31, + "acc_stderr": 0.04648231987117316, + "acc_norm": 0.31, + "acc_norm_stderr": 0.04648231987117316 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.4222222222222222, + "acc_stderr": 0.04266763404099582, + "acc_norm": 0.4222222222222222, + "acc_norm_stderr": 0.04266763404099582 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.40131578947368424, + "acc_stderr": 0.03988903703336283, + "acc_norm": 0.40131578947368424, + "acc_norm_stderr": 0.03988903703336283 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.31, + "acc_stderr": 0.04648231987117316, + "acc_norm": 0.31, + "acc_norm_stderr": 0.04648231987117316 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.32452830188679244, + "acc_stderr": 0.028815615713432115, + "acc_norm": 0.32452830188679244, + "acc_norm_stderr": 0.028815615713432115 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.2916666666666667, + "acc_stderr": 0.03800968060554858, + "acc_norm": 0.2916666666666667, + "acc_norm_stderr": 0.03800968060554858 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.19, + "acc_stderr": 0.039427724440366234, + "acc_norm": 0.19, + "acc_norm_stderr": 0.039427724440366234 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.3, + "acc_stderr": 0.046056618647183814, + "acc_norm": 0.3, + "acc_norm_stderr": 0.046056618647183814 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.33, + "acc_stderr": 0.04725815626252604, + "acc_norm": 0.33, + "acc_norm_stderr": 0.04725815626252604 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.28901734104046245, + "acc_stderr": 0.034564257450869995, + "acc_norm": 0.28901734104046245, + "acc_norm_stderr": 0.034564257450869995 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.22549019607843138, + "acc_stderr": 0.041583075330832865, + "acc_norm": 0.22549019607843138, + "acc_norm_stderr": 0.041583075330832865 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.52, + "acc_stderr": 0.050211673156867795, + "acc_norm": 0.52, + "acc_norm_stderr": 0.050211673156867795 + }, + "harness|hendrycksTest-conceptual_physics|5": { 
+ "acc": 0.3574468085106383, + "acc_stderr": 0.03132941789476425, + "acc_norm": 0.3574468085106383, + "acc_norm_stderr": 0.03132941789476425 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.21929824561403508, + "acc_stderr": 0.03892431106518752, + "acc_norm": 0.21929824561403508, + "acc_norm_stderr": 0.03892431106518752 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.3724137931034483, + "acc_stderr": 0.0402873153294756, + "acc_norm": 0.3724137931034483, + "acc_norm_stderr": 0.0402873153294756 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.2275132275132275, + "acc_stderr": 0.021591269407823778, + "acc_norm": 0.2275132275132275, + "acc_norm_stderr": 0.021591269407823778 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.18253968253968253, + "acc_stderr": 0.0345507101910215, + "acc_norm": 0.18253968253968253, + "acc_norm_stderr": 0.0345507101910215 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.29, + "acc_stderr": 0.04560480215720683, + "acc_norm": 0.29, + "acc_norm_stderr": 0.04560480215720683 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.34838709677419355, + "acc_stderr": 0.027104826328100944, + "acc_norm": 0.34838709677419355, + "acc_norm_stderr": 0.027104826328100944 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.2512315270935961, + "acc_stderr": 0.030516530732694433, + "acc_norm": 0.2512315270935961, + "acc_norm_stderr": 0.030516530732694433 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.33, + "acc_stderr": 0.047258156262526045, + "acc_norm": 0.33, + "acc_norm_stderr": 0.047258156262526045 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.40606060606060607, + "acc_stderr": 0.03834816355401181, + "acc_norm": 0.40606060606060607, + "acc_norm_stderr": 0.03834816355401181 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.36363636363636365, + "acc_stderr": 0.03427308652999934, + "acc_norm": 0.36363636363636365, + "acc_norm_stderr": 0.03427308652999934 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.36787564766839376, + "acc_stderr": 0.03480175668466037, + "acc_norm": 0.36787564766839376, + "acc_norm_stderr": 0.03480175668466037 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.3333333333333333, + "acc_stderr": 0.023901157979402527, + "acc_norm": 0.3333333333333333, + "acc_norm_stderr": 0.023901157979402527 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.2777777777777778, + "acc_stderr": 0.027309140588230182, + "acc_norm": 0.2777777777777778, + "acc_norm_stderr": 0.027309140588230182 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.3067226890756303, + "acc_stderr": 0.02995382389188705, + "acc_norm": 0.3067226890756303, + "acc_norm_stderr": 0.02995382389188705 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.25165562913907286, + "acc_stderr": 0.035433042343899844, + "acc_norm": 0.25165562913907286, + "acc_norm_stderr": 0.035433042343899844 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.3302752293577982, + "acc_stderr": 0.02016446633634298, + "acc_norm": 0.3302752293577982, + "acc_norm_stderr": 0.02016446633634298 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.17592592592592593, + "acc_stderr": 0.02596742095825853, + "acc_norm": 0.17592592592592593, + "acc_norm_stderr": 0.02596742095825853 + }, + 
"harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.38235294117647056, + "acc_stderr": 0.03410785338904719, + "acc_norm": 0.38235294117647056, + "acc_norm_stderr": 0.03410785338904719 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.3924050632911392, + "acc_stderr": 0.03178471874564729, + "acc_norm": 0.3924050632911392, + "acc_norm_stderr": 0.03178471874564729 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.42152466367713004, + "acc_stderr": 0.03314190222110657, + "acc_norm": 0.42152466367713004, + "acc_norm_stderr": 0.03314190222110657 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.3435114503816794, + "acc_stderr": 0.041649760719448786, + "acc_norm": 0.3435114503816794, + "acc_norm_stderr": 0.041649760719448786 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.48760330578512395, + "acc_stderr": 0.045629515481807666, + "acc_norm": 0.48760330578512395, + "acc_norm_stderr": 0.045629515481807666 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.4074074074074074, + "acc_stderr": 0.04750077341199986, + "acc_norm": 0.4074074074074074, + "acc_norm_stderr": 0.04750077341199986 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.3496932515337423, + "acc_stderr": 0.03746668325470021, + "acc_norm": 0.3496932515337423, + "acc_norm_stderr": 0.03746668325470021 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.3482142857142857, + "acc_stderr": 0.04521829902833586, + "acc_norm": 0.3482142857142857, + "acc_norm_stderr": 0.04521829902833586 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.3786407766990291, + "acc_stderr": 0.04802694698258973, + "acc_norm": 0.3786407766990291, + "acc_norm_stderr": 0.04802694698258973 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.43162393162393164, + "acc_stderr": 0.0324483553531149, + "acc_norm": 0.43162393162393164, + "acc_norm_stderr": 0.0324483553531149 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.37, + "acc_stderr": 0.048523658709391, + "acc_norm": 0.37, + "acc_norm_stderr": 0.048523658709391 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.39080459770114945, + "acc_stderr": 0.017448366067062526, + "acc_norm": 0.39080459770114945, + "acc_norm_stderr": 0.017448366067062526 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.3583815028901734, + "acc_stderr": 0.0258167567915842, + "acc_norm": 0.3583815028901734, + "acc_norm_stderr": 0.0258167567915842 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.23575418994413408, + "acc_stderr": 0.014196375686290804, + "acc_norm": 0.23575418994413408, + "acc_norm_stderr": 0.014196375686290804 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.42483660130718953, + "acc_stderr": 0.028304576673141107, + "acc_norm": 0.42483660130718953, + "acc_norm_stderr": 0.028304576673141107 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.34726688102893893, + "acc_stderr": 0.027040745502307336, + "acc_norm": 0.34726688102893893, + "acc_norm_stderr": 0.027040745502307336 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.3487654320987654, + "acc_stderr": 0.02651759772446501, + "acc_norm": 0.3487654320987654, + "acc_norm_stderr": 0.02651759772446501 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.33687943262411346, + "acc_stderr": 0.02819553487396673, + "acc_norm": 0.33687943262411346, + "acc_norm_stderr": 0.02819553487396673 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.3324641460234681, + "acc_stderr": 
0.01203202233226052, + "acc_norm": 0.3324641460234681, + "acc_norm_stderr": 0.01203202233226052 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.2426470588235294, + "acc_stderr": 0.026040662474201257, + "acc_norm": 0.2426470588235294, + "acc_norm_stderr": 0.026040662474201257 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.31862745098039214, + "acc_stderr": 0.018850084696468712, + "acc_norm": 0.31862745098039214, + "acc_norm_stderr": 0.018850084696468712 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.33636363636363636, + "acc_stderr": 0.04525393596302505, + "acc_norm": 0.33636363636363636, + "acc_norm_stderr": 0.04525393596302505 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.39591836734693875, + "acc_stderr": 0.03130802899065686, + "acc_norm": 0.39591836734693875, + "acc_norm_stderr": 0.03130802899065686 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.42786069651741293, + "acc_stderr": 0.03498541988407795, + "acc_norm": 0.42786069651741293, + "acc_norm_stderr": 0.03498541988407795 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.47, + "acc_stderr": 0.05016135580465919, + "acc_norm": 0.47, + "acc_norm_stderr": 0.05016135580465919 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.3614457831325301, + "acc_stderr": 0.037400593820293204, + "acc_norm": 0.3614457831325301, + "acc_norm_stderr": 0.037400593820293204 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.38596491228070173, + "acc_stderr": 0.03733756969066164, + "acc_norm": 0.38596491228070173, + "acc_norm_stderr": 0.03733756969066164 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.26560587515299877, + "mc1_stderr": 0.015461027627253592, + "mc2": 0.42764768101969397, + "mc2_stderr": 0.015172444186480637 + }, + "all": { + "acc": 0.3424003558258833, + "acc_stderr": 0.0341401669278609, + "acc_norm": 0.34604560573461685, + "acc_norm_stderr": 0.034138524844586, + "mc1": 0.26560587515299877, + "mc1_stderr": 0.015461027627253592, + "mc2": 0.42764768101969397, + "mc2_stderr": 0.015172444186480637 + } + }, + "versions": { + "harness|arc:challenge|25": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + 
"harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "all": 0 + }, + "config_general": { + "model_name": "AlekseyKorshuk/pygmalion-6b-vicuna-chatml", + "model_sha": "ee3ada91a69a194cedfabbfeab98f1499b75cb44", + "model_dtype": "torch.float16", + "lighteval_sha": "03c2fad20ff7f5334c33cfee459024b8d7e4a109", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null + }, + "config_tasks": { + "harness|arc:challenge": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + 
"harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task" + }, + "summary_tasks": { + "harness|arc:challenge|25": { + "hashes": { + "hash_examples": "17b0cae357c0259e", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "1b78325b154497a6", + "hash_cont_tokens": "c6e2e25e2b25a621" + }, + "truncated": 0, + "non-truncated": 4687, + "padded": 4685, + "non-padded": 2, + "effective_few_shots": 25.0, + "num_truncated_few_shots": 0 + }, + "harness|hellaswag|10": { + "hashes": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "97de5fb5652ec7fa", + "hash_cont_tokens": "8ad5f1a3e4068f36" + }, + "truncated": 0, + "non-truncated": 40168, + "padded": 40045, + "non-padded": 123, + "effective_few_shots": 10.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hashes": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "38f6980885e34dfd", + "hash_cont_tokens": "844bd0bf669e8136" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-anatomy|5": { + "hashes": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "3ed9431cd09b2a53", + "hash_cont_tokens": 
"aa3ffb1a6e4356f5" + }, + "truncated": 0, + "non-truncated": 540, + "padded": 540, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-astronomy|5": { + "hashes": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "a79fd75ecff4dacc", + "hash_cont_tokens": "ca7527d5bdfd389a" + }, + "truncated": 0, + "non-truncated": 608, + "padded": 608, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-business_ethics|5": { + "hashes": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "178d5666661bf5e1", + "hash_cont_tokens": "08a1fa6c8dde9a82" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hashes": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "c926698f7ff06973", + "hash_cont_tokens": "cd61f7de0830a75a" + }, + "truncated": 0, + "non-truncated": 1060, + "padded": 1060, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_biology|5": { + "hashes": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "242f772c5e78312a", + "hash_cont_tokens": "b0c14ed86adbcb56" + }, + "truncated": 0, + "non-truncated": 576, + "padded": 568, + "non-padded": 8, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_chemistry|5": { + "hashes": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "8502d8627d2d7aad", + "hash_cont_tokens": "844bd0bf669e8136" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_computer_science|5": { + "hashes": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "8bf46ce3a98e6e3f", + "hash_cont_tokens": "3cf1924b14cbf906" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_mathematics|5": { + "hashes": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "ff09ef7f164943cd", + "hash_cont_tokens": "d09bf08193410dfa" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_medicine|5": { + "hashes": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "af38d1bbc0517ac5", + "hash_cont_tokens": "62bb469d2a319d91" + }, + "truncated": 0, + "non-truncated": 692, + "padded": 680, + "non-padded": 12, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_physics|5": { + "hashes": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "c4240f372187f487", + "hash_cont_tokens": "bf103c9a1f61ec12" + }, + "truncated": 0, + "non-truncated": 408, + "padded": 404, + "non-padded": 4, + "effective_few_shots": 5.0, + 
"num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-computer_security|5": { + "hashes": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "70a866a1c6ae11ae", + "hash_cont_tokens": "844bd0bf669e8136" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hashes": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "29b68a5b3f3afa5f", + "hash_cont_tokens": "ff5ca3d84bb47a0b" + }, + "truncated": 0, + "non-truncated": 940, + "padded": 940, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-econometrics|5": { + "hashes": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "a4a0fc579875cdf9", + "hash_cont_tokens": "f3ed369e135c0e74" + }, + "truncated": 0, + "non-truncated": 456, + "padded": 456, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hashes": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "e1c0ec634eb17ebd", + "hash_cont_tokens": "35bf6c0c1a7ee403" + }, + "truncated": 0, + "non-truncated": 580, + "padded": 580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hashes": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "542453ad0f99dacf", + "hash_cont_tokens": "e69647d0f0359a4e" + }, + "truncated": 0, + "non-truncated": 1512, + "padded": 1488, + "non-padded": 24, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-formal_logic|5": { + "hashes": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "dacff0458f665ef2", + "hash_cont_tokens": "2ef491ecaa0b411b" + }, + "truncated": 0, + "non-truncated": 504, + "padded": 504, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-global_facts|5": { + "hashes": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "61dec75d557c2e93", + "hash_cont_tokens": "844bd0bf669e8136" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_biology|5": { + "hashes": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "d0afdf91820cacc8", + "hash_cont_tokens": "2f65e8345a68d860" + }, + "truncated": 0, + "non-truncated": 1240, + "padded": 1240, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hashes": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "75cd47b5490da17b", + "hash_cont_tokens": "c3deabee1deab3a3" + }, + "truncated": 0, + "non-truncated": 812, + "padded": 796, + "non-padded": 16, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hashes": { + "hash_examples": 
"8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "6c6256000dbf914a", + "hash_cont_tokens": "ec161287ac6222f4" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hashes": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "3e24478a8854bd77", + "hash_cont_tokens": "c4f2565ca36881d5" + }, + "truncated": 660, + "non-truncated": 0, + "padded": 0, + "non-padded": 660, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_geography|5": { + "hashes": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "a4866b51f8a7a60e", + "hash_cont_tokens": "780e569058de22be" + }, + "truncated": 0, + "non-truncated": 792, + "padded": 792, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hashes": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "90f755f89d9fdf5e", + "hash_cont_tokens": "9da45062757ae791" + }, + "truncated": 0, + "non-truncated": 772, + "padded": 772, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hashes": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "fb590ff6d9d11883", + "hash_cont_tokens": "8f5c8baf02161f10" + }, + "truncated": 0, + "non-truncated": 1560, + "padded": 1560, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hashes": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "551dbc75535ad2b8", + "hash_cont_tokens": "fdea101837ab4409" + }, + "truncated": 0, + "non-truncated": 1080, + "padded": 1080, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hashes": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "d86fdf5706ec717c", + "hash_cont_tokens": "985403b262df21a4" + }, + "truncated": 0, + "non-truncated": 952, + "padded": 940, + "non-padded": 12, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_physics|5": { + "hashes": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "a81bca26abd92c41", + "hash_cont_tokens": "56be0c12b78c81a3" + }, + "truncated": 0, + "non-truncated": 604, + "padded": 604, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hashes": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "9c10077b5cda495b", + "hash_cont_tokens": "f524cf6fe64b2a7f" + }, + "truncated": 0, + "non-truncated": 2180, + "padded": 2180, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hashes": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + 
"hash_input_tokens": "092923836e135996", + "hash_cont_tokens": "421b3dc903711e3d" + }, + "truncated": 0, + "non-truncated": 864, + "padded": 864, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hashes": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "4ab213491f557f31", + "hash_cont_tokens": "eab825cf8fbdd085" + }, + "truncated": 816, + "non-truncated": 0, + "padded": 0, + "non-padded": 816, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hashes": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "2a04fb615e6717ea", + "hash_cont_tokens": "e1610a0b694e7b3a" + }, + "truncated": 0, + "non-truncated": 948, + "padded": 948, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_aging|5": { + "hashes": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "39da19ee58ce07e6", + "hash_cont_tokens": "38eafdb22e9fca11" + }, + "truncated": 0, + "non-truncated": 892, + "padded": 892, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_sexuality|5": { + "hashes": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "f7e0441ab1c223e0", + "hash_cont_tokens": "11de075f88fc7cd2" + }, + "truncated": 0, + "non-truncated": 524, + "padded": 524, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-international_law|5": { + "hashes": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "119859c5b8103d0b", + "hash_cont_tokens": "0229c63f045574c2" + }, + "truncated": 0, + "non-truncated": 484, + "padded": 484, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-jurisprudence|5": { + "hashes": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "6ec4910e741606cb", + "hash_cont_tokens": "5c77c6f472688075" + }, + "truncated": 0, + "non-truncated": 432, + "padded": 432, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hashes": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "96d8b2554f777e3a", + "hash_cont_tokens": "25a46284b3589e0d" + }, + "truncated": 0, + "non-truncated": 652, + "padded": 636, + "non-padded": 16, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-machine_learning|5": { + "hashes": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "249811a7d891a411", + "hash_cont_tokens": "d11f2c877fe691dc" + }, + "truncated": 0, + "non-truncated": 448, + "padded": 448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-management|5": { + "hashes": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "e54df495ffeb4f92", + "hash_cont_tokens": "d37808f586a9e9b5" + }, + "truncated": 0, + "non-truncated": 412, + "padded": 412, + 
"non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-marketing|5": { + "hashes": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "e9110fe64f420eb5", + "hash_cont_tokens": "95faf210efa02f90" + }, + "truncated": 0, + "non-truncated": 936, + "padded": 936, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-medical_genetics|5": { + "hashes": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "743df5701590c1c5", + "hash_cont_tokens": "844bd0bf669e8136" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-miscellaneous|5": { + "hashes": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "4a20a40ea36bad2d", + "hash_cont_tokens": "ef1ae838a09a7521" + }, + "truncated": 0, + "non-truncated": 3132, + "padded": 3132, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_disputes|5": { + "hashes": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "10886977e5516586", + "hash_cont_tokens": "05c35d0e7dd2c7d4" + }, + "truncated": 0, + "non-truncated": 1384, + "padded": 1372, + "non-padded": 12, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hashes": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "66f56ab7c3b9d662", + "hash_cont_tokens": "f1e9e326e9540108" + }, + "truncated": 0, + "non-truncated": 3580, + "padded": 3580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-nutrition|5": { + "hashes": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "c05c54560499ea35", + "hash_cont_tokens": "027ac34198453c9e" + }, + "truncated": 0, + "non-truncated": 1224, + "padded": 1224, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-philosophy|5": { + "hashes": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "9639c3d92ff98a28", + "hash_cont_tokens": "dddff9925c9b675a" + }, + "truncated": 0, + "non-truncated": 1244, + "padded": 1244, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-prehistory|5": { + "hashes": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "91e98834c3a8d8d9", + "hash_cont_tokens": "030e5bb46551865c" + }, + "truncated": 0, + "non-truncated": 1296, + "padded": 1296, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_accounting|5": { + "hashes": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "569fa47691c73088", + "hash_cont_tokens": "42b23299e8bae480" + }, + "truncated": 0, + "non-truncated": 1128, + "padded": 1124, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_law|5": { + "hashes": { + "hash_examples": 
"cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "999e8c7cf55b590c", + "hash_cont_tokens": "a2de48df0afbaff7" + }, + "truncated": 16, + "non-truncated": 6120, + "padded": 6120, + "non-padded": 16, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_medicine|5": { + "hashes": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "cb68733b835e69f0", + "hash_cont_tokens": "33dc7eccd5de31ae" + }, + "truncated": 0, + "non-truncated": 1088, + "padded": 1088, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_psychology|5": { + "hashes": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "3aa766c029099569", + "hash_cont_tokens": "2a666dc39f1f52ac" + }, + "truncated": 0, + "non-truncated": 2448, + "padded": 2448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-public_relations|5": { + "hashes": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "87b924f88832986f", + "hash_cont_tokens": "cf3600a50782c6c5" + }, + "truncated": 0, + "non-truncated": 440, + "padded": 440, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-security_studies|5": { + "hashes": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "c2b75c24a925a416", + "hash_cont_tokens": "2e9916279a4cae95" + }, + "truncated": 0, + "non-truncated": 980, + "padded": 980, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-sociology|5": { + "hashes": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "fb555df6139eb2c8", + "hash_cont_tokens": "555f7a55738bbf37" + }, + "truncated": 0, + "non-truncated": 804, + "padded": 800, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hashes": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "56cf1eebb25eccb1", + "hash_cont_tokens": "844bd0bf669e8136" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-virology|5": { + "hashes": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "c6affac16ec860be", + "hash_cont_tokens": "30d4fa4828c5468f" + }, + "truncated": 0, + "non-truncated": 664, + "padded": 664, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-world_religions|5": { + "hashes": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "d2c5da5a69a6312e", + "hash_cont_tokens": "984061eb58124367" + }, + "truncated": 0, + "non-truncated": 684, + "padded": 684, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|truthfulqa:mc|0": { + "hashes": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "21ee2f46c9c3649e", + "hash_cont_tokens": "f41d0880e9a23f4e" + }, + "truncated": 0, + 
"non-truncated": 9996, + "padded": 9996, + "non-padded": 0, + "effective_few_shots": 0.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "d84d18e9a963753d", + "hash_full_prompts": "12b540783521a8e6", + "hash_input_tokens": "0893dfcb83435e7d", + "hash_cont_tokens": "6159bf1904a8c8fb" + }, + "total_evaluation_time_secondes": "2397.4870216846466", + "truncated": 1492, + "non-truncated": 109527, + "padded": 109290, + "non-padded": 1729, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/AlekseyKorshuk/pygmalion-6b-vicuna-chatml/results_2023-07-24T10:58:39.640665.json b/eval-results/AlekseyKorshuk/pygmalion-6b-vicuna-chatml/results_2023-07-24T10:58:39.640665.json new file mode 100644 index 0000000000000000000000000000000000000000..9be3e9c29f3e6740d38965f958a76ec9235b728a --- /dev/null +++ b/eval-results/AlekseyKorshuk/pygmalion-6b-vicuna-chatml/results_2023-07-24T10:58:39.640665.json @@ -0,0 +1,1365 @@ +{ + "results": { + "harness|arc:challenge|25": { + "acc": 0.3720136518771331, + "acc_stderr": 0.014124597881844461, + "acc_norm": 0.4061433447098976, + "acc_norm_stderr": 0.01435165669009786 + }, + "harness|hellaswag|10": { + "acc": 0.4963154750049791, + "acc_stderr": 0.004989645929811438, + "acc_norm": 0.6772555267874926, + "acc_norm_stderr": 0.004665704208339031 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.31, + "acc_stderr": 0.04648231987117316, + "acc_norm": 0.31, + "acc_norm_stderr": 0.04648231987117316 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.4222222222222222, + "acc_stderr": 0.04266763404099582, + "acc_norm": 0.4222222222222222, + "acc_norm_stderr": 0.04266763404099582 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.40131578947368424, + "acc_stderr": 0.03988903703336283, + "acc_norm": 0.40131578947368424, + "acc_norm_stderr": 0.03988903703336283 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.31, + "acc_stderr": 0.04648231987117316, + "acc_norm": 0.31, + "acc_norm_stderr": 0.04648231987117316 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.32452830188679244, + "acc_stderr": 0.028815615713432115, + "acc_norm": 0.32452830188679244, + "acc_norm_stderr": 0.028815615713432115 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.2916666666666667, + "acc_stderr": 0.03800968060554858, + "acc_norm": 0.2916666666666667, + "acc_norm_stderr": 0.03800968060554858 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.19, + "acc_stderr": 0.039427724440366234, + "acc_norm": 0.19, + "acc_norm_stderr": 0.039427724440366234 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.3, + "acc_stderr": 0.046056618647183814, + "acc_norm": 0.3, + "acc_norm_stderr": 0.046056618647183814 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.33, + "acc_stderr": 0.04725815626252604, + "acc_norm": 0.33, + "acc_norm_stderr": 0.04725815626252604 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.28901734104046245, + "acc_stderr": 0.034564257450869995, + "acc_norm": 0.28901734104046245, + "acc_norm_stderr": 0.034564257450869995 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.22549019607843138, + "acc_stderr": 0.041583075330832865, + "acc_norm": 0.22549019607843138, + "acc_norm_stderr": 0.041583075330832865 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.52, + "acc_stderr": 0.050211673156867795, + "acc_norm": 0.52, + "acc_norm_stderr": 0.050211673156867795 + }, + 
"harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.3574468085106383, + "acc_stderr": 0.03132941789476425, + "acc_norm": 0.3574468085106383, + "acc_norm_stderr": 0.03132941789476425 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.21929824561403508, + "acc_stderr": 0.03892431106518752, + "acc_norm": 0.21929824561403508, + "acc_norm_stderr": 0.03892431106518752 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.3724137931034483, + "acc_stderr": 0.0402873153294756, + "acc_norm": 0.3724137931034483, + "acc_norm_stderr": 0.0402873153294756 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.2275132275132275, + "acc_stderr": 0.021591269407823778, + "acc_norm": 0.2275132275132275, + "acc_norm_stderr": 0.021591269407823778 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.18253968253968253, + "acc_stderr": 0.0345507101910215, + "acc_norm": 0.18253968253968253, + "acc_norm_stderr": 0.0345507101910215 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.29, + "acc_stderr": 0.04560480215720683, + "acc_norm": 0.29, + "acc_norm_stderr": 0.04560480215720683 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.34838709677419355, + "acc_stderr": 0.027104826328100944, + "acc_norm": 0.34838709677419355, + "acc_norm_stderr": 0.027104826328100944 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.2512315270935961, + "acc_stderr": 0.030516530732694433, + "acc_norm": 0.2512315270935961, + "acc_norm_stderr": 0.030516530732694433 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.33, + "acc_stderr": 0.047258156262526045, + "acc_norm": 0.33, + "acc_norm_stderr": 0.047258156262526045 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.40606060606060607, + "acc_stderr": 0.03834816355401181, + "acc_norm": 0.40606060606060607, + "acc_norm_stderr": 0.03834816355401181 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.36363636363636365, + "acc_stderr": 0.03427308652999934, + "acc_norm": 0.36363636363636365, + "acc_norm_stderr": 0.03427308652999934 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.36787564766839376, + "acc_stderr": 0.03480175668466037, + "acc_norm": 0.36787564766839376, + "acc_norm_stderr": 0.03480175668466037 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.3333333333333333, + "acc_stderr": 0.023901157979402527, + "acc_norm": 0.3333333333333333, + "acc_norm_stderr": 0.023901157979402527 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.2777777777777778, + "acc_stderr": 0.027309140588230182, + "acc_norm": 0.2777777777777778, + "acc_norm_stderr": 0.027309140588230182 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.3067226890756303, + "acc_stderr": 0.02995382389188705, + "acc_norm": 0.3067226890756303, + "acc_norm_stderr": 0.02995382389188705 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.25165562913907286, + "acc_stderr": 0.035433042343899844, + "acc_norm": 0.25165562913907286, + "acc_norm_stderr": 0.035433042343899844 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.3302752293577982, + "acc_stderr": 0.02016446633634298, + "acc_norm": 0.3302752293577982, + "acc_norm_stderr": 0.02016446633634298 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.17592592592592593, + "acc_stderr": 0.02596742095825853, + "acc_norm": 0.17592592592592593, + "acc_norm_stderr": 0.02596742095825853 
+ }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.38235294117647056, + "acc_stderr": 0.03410785338904719, + "acc_norm": 0.38235294117647056, + "acc_norm_stderr": 0.03410785338904719 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.3924050632911392, + "acc_stderr": 0.03178471874564729, + "acc_norm": 0.3924050632911392, + "acc_norm_stderr": 0.03178471874564729 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.42152466367713004, + "acc_stderr": 0.03314190222110657, + "acc_norm": 0.42152466367713004, + "acc_norm_stderr": 0.03314190222110657 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.3435114503816794, + "acc_stderr": 0.041649760719448786, + "acc_norm": 0.3435114503816794, + "acc_norm_stderr": 0.041649760719448786 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.48760330578512395, + "acc_stderr": 0.045629515481807666, + "acc_norm": 0.48760330578512395, + "acc_norm_stderr": 0.045629515481807666 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.4074074074074074, + "acc_stderr": 0.04750077341199986, + "acc_norm": 0.4074074074074074, + "acc_norm_stderr": 0.04750077341199986 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.3496932515337423, + "acc_stderr": 0.03746668325470021, + "acc_norm": 0.3496932515337423, + "acc_norm_stderr": 0.03746668325470021 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.3482142857142857, + "acc_stderr": 0.04521829902833586, + "acc_norm": 0.3482142857142857, + "acc_norm_stderr": 0.04521829902833586 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.3786407766990291, + "acc_stderr": 0.04802694698258973, + "acc_norm": 0.3786407766990291, + "acc_norm_stderr": 0.04802694698258973 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.43162393162393164, + "acc_stderr": 0.0324483553531149, + "acc_norm": 0.43162393162393164, + "acc_norm_stderr": 0.0324483553531149 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.37, + "acc_stderr": 0.048523658709391, + "acc_norm": 0.37, + "acc_norm_stderr": 0.048523658709391 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.39080459770114945, + "acc_stderr": 0.017448366067062526, + "acc_norm": 0.39080459770114945, + "acc_norm_stderr": 0.017448366067062526 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.3583815028901734, + "acc_stderr": 0.0258167567915842, + "acc_norm": 0.3583815028901734, + "acc_norm_stderr": 0.0258167567915842 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.23575418994413408, + "acc_stderr": 0.014196375686290804, + "acc_norm": 0.23575418994413408, + "acc_norm_stderr": 0.014196375686290804 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.42483660130718953, + "acc_stderr": 0.028304576673141107, + "acc_norm": 0.42483660130718953, + "acc_norm_stderr": 0.028304576673141107 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.34726688102893893, + "acc_stderr": 0.027040745502307336, + "acc_norm": 0.34726688102893893, + "acc_norm_stderr": 0.027040745502307336 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.3487654320987654, + "acc_stderr": 0.02651759772446501, + "acc_norm": 0.3487654320987654, + "acc_norm_stderr": 0.02651759772446501 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.33687943262411346, + "acc_stderr": 0.02819553487396673, + "acc_norm": 0.33687943262411346, + "acc_norm_stderr": 0.02819553487396673 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.3324641460234681, + 
"acc_stderr": 0.01203202233226052, + "acc_norm": 0.3324641460234681, + "acc_norm_stderr": 0.01203202233226052 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.2426470588235294, + "acc_stderr": 0.026040662474201257, + "acc_norm": 0.2426470588235294, + "acc_norm_stderr": 0.026040662474201257 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.31862745098039214, + "acc_stderr": 0.018850084696468712, + "acc_norm": 0.31862745098039214, + "acc_norm_stderr": 0.018850084696468712 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.33636363636363636, + "acc_stderr": 0.04525393596302505, + "acc_norm": 0.33636363636363636, + "acc_norm_stderr": 0.04525393596302505 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.39591836734693875, + "acc_stderr": 0.03130802899065686, + "acc_norm": 0.39591836734693875, + "acc_norm_stderr": 0.03130802899065686 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.42786069651741293, + "acc_stderr": 0.03498541988407795, + "acc_norm": 0.42786069651741293, + "acc_norm_stderr": 0.03498541988407795 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.47, + "acc_stderr": 0.05016135580465919, + "acc_norm": 0.47, + "acc_norm_stderr": 0.05016135580465919 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.3614457831325301, + "acc_stderr": 0.037400593820293204, + "acc_norm": 0.3614457831325301, + "acc_norm_stderr": 0.037400593820293204 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.38596491228070173, + "acc_stderr": 0.03733756969066164, + "acc_norm": 0.38596491228070173, + "acc_norm_stderr": 0.03733756969066164 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.26560587515299877, + "mc1_stderr": 0.015461027627253592, + "mc2": 0.42764768101969397, + "mc2_stderr": 0.015172444186480637 + }, + "all": { + "acc": 0.3424003558258833, + "acc_stderr": 0.0341401669278609, + "acc_norm": 0.34604560573461685, + "acc_norm_stderr": 0.034138524844586, + "mc1": 0.26560587515299877, + "mc1_stderr": 0.015461027627253592, + "mc2": 0.42764768101969397, + "mc2_stderr": 0.015172444186480637 + } + }, + "versions": { + "harness|arc:challenge|25": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + 
"harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "all": 0 + }, + "config_general": { + "model_name": "AlekseyKorshuk/pygmalion-6b-vicuna-chatml", + "model_sha": "ee3ada91a69a194cedfabbfeab98f1499b75cb44", + "model_dtype": "torch.float16", + "lighteval_sha": "03c2fad20ff7f5334c33cfee459024b8d7e4a109", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null + }, + "config_tasks": { + "harness|arc:challenge": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + 
"harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task" + }, + "summary_tasks": { + "harness|arc:challenge|25": { + "hashes": { + "hash_examples": "17b0cae357c0259e", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "1b78325b154497a6", + "hash_cont_tokens": "c6e2e25e2b25a621" + }, + "truncated": 0, + "non-truncated": 4687, + "padded": 4685, + "non-padded": 2, + "effective_few_shots": 25.0, + "num_truncated_few_shots": 0 + }, + "harness|hellaswag|10": { + "hashes": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "97de5fb5652ec7fa", + "hash_cont_tokens": "8ad5f1a3e4068f36" + }, + "truncated": 0, + "non-truncated": 40168, + "padded": 40045, + "non-padded": 123, + "effective_few_shots": 10.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hashes": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "38f6980885e34dfd", + "hash_cont_tokens": "844bd0bf669e8136" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-anatomy|5": { + "hashes": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "3ed9431cd09b2a53", + "hash_cont_tokens": 
"aa3ffb1a6e4356f5" + }, + "truncated": 0, + "non-truncated": 540, + "padded": 540, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-astronomy|5": { + "hashes": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "a79fd75ecff4dacc", + "hash_cont_tokens": "ca7527d5bdfd389a" + }, + "truncated": 0, + "non-truncated": 608, + "padded": 608, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-business_ethics|5": { + "hashes": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "178d5666661bf5e1", + "hash_cont_tokens": "08a1fa6c8dde9a82" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hashes": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "c926698f7ff06973", + "hash_cont_tokens": "cd61f7de0830a75a" + }, + "truncated": 0, + "non-truncated": 1060, + "padded": 1060, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_biology|5": { + "hashes": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "242f772c5e78312a", + "hash_cont_tokens": "b0c14ed86adbcb56" + }, + "truncated": 0, + "non-truncated": 576, + "padded": 568, + "non-padded": 8, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_chemistry|5": { + "hashes": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "8502d8627d2d7aad", + "hash_cont_tokens": "844bd0bf669e8136" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_computer_science|5": { + "hashes": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "8bf46ce3a98e6e3f", + "hash_cont_tokens": "3cf1924b14cbf906" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_mathematics|5": { + "hashes": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "ff09ef7f164943cd", + "hash_cont_tokens": "d09bf08193410dfa" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_medicine|5": { + "hashes": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "af38d1bbc0517ac5", + "hash_cont_tokens": "62bb469d2a319d91" + }, + "truncated": 0, + "non-truncated": 692, + "padded": 680, + "non-padded": 12, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_physics|5": { + "hashes": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "c4240f372187f487", + "hash_cont_tokens": "bf103c9a1f61ec12" + }, + "truncated": 0, + "non-truncated": 408, + "padded": 404, + "non-padded": 4, + "effective_few_shots": 5.0, + 
"num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-computer_security|5": { + "hashes": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "70a866a1c6ae11ae", + "hash_cont_tokens": "844bd0bf669e8136" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hashes": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "29b68a5b3f3afa5f", + "hash_cont_tokens": "ff5ca3d84bb47a0b" + }, + "truncated": 0, + "non-truncated": 940, + "padded": 940, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-econometrics|5": { + "hashes": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "a4a0fc579875cdf9", + "hash_cont_tokens": "f3ed369e135c0e74" + }, + "truncated": 0, + "non-truncated": 456, + "padded": 456, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hashes": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "e1c0ec634eb17ebd", + "hash_cont_tokens": "35bf6c0c1a7ee403" + }, + "truncated": 0, + "non-truncated": 580, + "padded": 580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hashes": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "542453ad0f99dacf", + "hash_cont_tokens": "e69647d0f0359a4e" + }, + "truncated": 0, + "non-truncated": 1512, + "padded": 1488, + "non-padded": 24, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-formal_logic|5": { + "hashes": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "dacff0458f665ef2", + "hash_cont_tokens": "2ef491ecaa0b411b" + }, + "truncated": 0, + "non-truncated": 504, + "padded": 504, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-global_facts|5": { + "hashes": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "61dec75d557c2e93", + "hash_cont_tokens": "844bd0bf669e8136" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_biology|5": { + "hashes": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "d0afdf91820cacc8", + "hash_cont_tokens": "2f65e8345a68d860" + }, + "truncated": 0, + "non-truncated": 1240, + "padded": 1240, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hashes": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "75cd47b5490da17b", + "hash_cont_tokens": "c3deabee1deab3a3" + }, + "truncated": 0, + "non-truncated": 812, + "padded": 796, + "non-padded": 16, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hashes": { + "hash_examples": 
"8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "6c6256000dbf914a", + "hash_cont_tokens": "ec161287ac6222f4" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hashes": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "3e24478a8854bd77", + "hash_cont_tokens": "c4f2565ca36881d5" + }, + "truncated": 660, + "non-truncated": 0, + "padded": 0, + "non-padded": 660, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_geography|5": { + "hashes": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "a4866b51f8a7a60e", + "hash_cont_tokens": "780e569058de22be" + }, + "truncated": 0, + "non-truncated": 792, + "padded": 792, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hashes": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "90f755f89d9fdf5e", + "hash_cont_tokens": "9da45062757ae791" + }, + "truncated": 0, + "non-truncated": 772, + "padded": 772, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hashes": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "fb590ff6d9d11883", + "hash_cont_tokens": "8f5c8baf02161f10" + }, + "truncated": 0, + "non-truncated": 1560, + "padded": 1560, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hashes": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "551dbc75535ad2b8", + "hash_cont_tokens": "fdea101837ab4409" + }, + "truncated": 0, + "non-truncated": 1080, + "padded": 1080, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hashes": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "d86fdf5706ec717c", + "hash_cont_tokens": "985403b262df21a4" + }, + "truncated": 0, + "non-truncated": 952, + "padded": 940, + "non-padded": 12, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_physics|5": { + "hashes": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "a81bca26abd92c41", + "hash_cont_tokens": "56be0c12b78c81a3" + }, + "truncated": 0, + "non-truncated": 604, + "padded": 604, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hashes": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "9c10077b5cda495b", + "hash_cont_tokens": "f524cf6fe64b2a7f" + }, + "truncated": 0, + "non-truncated": 2180, + "padded": 2180, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hashes": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + 
"hash_input_tokens": "092923836e135996", + "hash_cont_tokens": "421b3dc903711e3d" + }, + "truncated": 0, + "non-truncated": 864, + "padded": 864, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hashes": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "4ab213491f557f31", + "hash_cont_tokens": "eab825cf8fbdd085" + }, + "truncated": 816, + "non-truncated": 0, + "padded": 0, + "non-padded": 816, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hashes": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "2a04fb615e6717ea", + "hash_cont_tokens": "e1610a0b694e7b3a" + }, + "truncated": 0, + "non-truncated": 948, + "padded": 948, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_aging|5": { + "hashes": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "39da19ee58ce07e6", + "hash_cont_tokens": "38eafdb22e9fca11" + }, + "truncated": 0, + "non-truncated": 892, + "padded": 892, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_sexuality|5": { + "hashes": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "f7e0441ab1c223e0", + "hash_cont_tokens": "11de075f88fc7cd2" + }, + "truncated": 0, + "non-truncated": 524, + "padded": 524, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-international_law|5": { + "hashes": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "119859c5b8103d0b", + "hash_cont_tokens": "0229c63f045574c2" + }, + "truncated": 0, + "non-truncated": 484, + "padded": 484, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-jurisprudence|5": { + "hashes": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "6ec4910e741606cb", + "hash_cont_tokens": "5c77c6f472688075" + }, + "truncated": 0, + "non-truncated": 432, + "padded": 432, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hashes": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "96d8b2554f777e3a", + "hash_cont_tokens": "25a46284b3589e0d" + }, + "truncated": 0, + "non-truncated": 652, + "padded": 636, + "non-padded": 16, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-machine_learning|5": { + "hashes": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "249811a7d891a411", + "hash_cont_tokens": "d11f2c877fe691dc" + }, + "truncated": 0, + "non-truncated": 448, + "padded": 448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-management|5": { + "hashes": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "e54df495ffeb4f92", + "hash_cont_tokens": "d37808f586a9e9b5" + }, + "truncated": 0, + "non-truncated": 412, + "padded": 412, + 
"non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-marketing|5": { + "hashes": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "e9110fe64f420eb5", + "hash_cont_tokens": "95faf210efa02f90" + }, + "truncated": 0, + "non-truncated": 936, + "padded": 936, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-medical_genetics|5": { + "hashes": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "743df5701590c1c5", + "hash_cont_tokens": "844bd0bf669e8136" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-miscellaneous|5": { + "hashes": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "4a20a40ea36bad2d", + "hash_cont_tokens": "ef1ae838a09a7521" + }, + "truncated": 0, + "non-truncated": 3132, + "padded": 3132, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_disputes|5": { + "hashes": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "10886977e5516586", + "hash_cont_tokens": "05c35d0e7dd2c7d4" + }, + "truncated": 0, + "non-truncated": 1384, + "padded": 1372, + "non-padded": 12, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hashes": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "66f56ab7c3b9d662", + "hash_cont_tokens": "f1e9e326e9540108" + }, + "truncated": 0, + "non-truncated": 3580, + "padded": 3580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-nutrition|5": { + "hashes": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "c05c54560499ea35", + "hash_cont_tokens": "027ac34198453c9e" + }, + "truncated": 0, + "non-truncated": 1224, + "padded": 1224, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-philosophy|5": { + "hashes": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "9639c3d92ff98a28", + "hash_cont_tokens": "dddff9925c9b675a" + }, + "truncated": 0, + "non-truncated": 1244, + "padded": 1244, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-prehistory|5": { + "hashes": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "91e98834c3a8d8d9", + "hash_cont_tokens": "030e5bb46551865c" + }, + "truncated": 0, + "non-truncated": 1296, + "padded": 1296, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_accounting|5": { + "hashes": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "569fa47691c73088", + "hash_cont_tokens": "42b23299e8bae480" + }, + "truncated": 0, + "non-truncated": 1128, + "padded": 1124, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_law|5": { + "hashes": { + "hash_examples": 
"cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "999e8c7cf55b590c", + "hash_cont_tokens": "a2de48df0afbaff7" + }, + "truncated": 16, + "non-truncated": 6120, + "padded": 6120, + "non-padded": 16, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_medicine|5": { + "hashes": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "cb68733b835e69f0", + "hash_cont_tokens": "33dc7eccd5de31ae" + }, + "truncated": 0, + "non-truncated": 1088, + "padded": 1088, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_psychology|5": { + "hashes": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "3aa766c029099569", + "hash_cont_tokens": "2a666dc39f1f52ac" + }, + "truncated": 0, + "non-truncated": 2448, + "padded": 2448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-public_relations|5": { + "hashes": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "87b924f88832986f", + "hash_cont_tokens": "cf3600a50782c6c5" + }, + "truncated": 0, + "non-truncated": 440, + "padded": 440, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-security_studies|5": { + "hashes": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "c2b75c24a925a416", + "hash_cont_tokens": "2e9916279a4cae95" + }, + "truncated": 0, + "non-truncated": 980, + "padded": 980, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-sociology|5": { + "hashes": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "fb555df6139eb2c8", + "hash_cont_tokens": "555f7a55738bbf37" + }, + "truncated": 0, + "non-truncated": 804, + "padded": 800, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hashes": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "56cf1eebb25eccb1", + "hash_cont_tokens": "844bd0bf669e8136" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-virology|5": { + "hashes": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "c6affac16ec860be", + "hash_cont_tokens": "30d4fa4828c5468f" + }, + "truncated": 0, + "non-truncated": 664, + "padded": 664, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-world_religions|5": { + "hashes": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "d2c5da5a69a6312e", + "hash_cont_tokens": "984061eb58124367" + }, + "truncated": 0, + "non-truncated": 684, + "padded": 684, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|truthfulqa:mc|0": { + "hashes": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "21ee2f46c9c3649e", + "hash_cont_tokens": "f41d0880e9a23f4e" + }, + "truncated": 0, + 
"non-truncated": 9996, + "padded": 9996, + "non-padded": 0, + "effective_few_shots": 0.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "d84d18e9a963753d", + "hash_full_prompts": "12b540783521a8e6", + "hash_input_tokens": "0893dfcb83435e7d", + "hash_cont_tokens": "6159bf1904a8c8fb" + }, + "total_evaluation_time_secondes": "2397.4870216846466", + "truncated": 1492, + "non-truncated": 109527, + "padded": 109290, + "non-padded": 1729, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/AlekseyKorshuk/pygmalion-6b-vicuna-chatml/results_2023-08-01T14-16-25.052724.json b/eval-results/AlekseyKorshuk/pygmalion-6b-vicuna-chatml/results_2023-08-01T14-16-25.052724.json new file mode 100644 index 0000000000000000000000000000000000000000..141d855c0400b980a79cb784a3dcfa30563626fa --- /dev/null +++ b/eval-results/AlekseyKorshuk/pygmalion-6b-vicuna-chatml/results_2023-08-01T14-16-25.052724.json @@ -0,0 +1,1365 @@ +{ + "results": { + "harness|arc:challenge|25": { + "acc": 0.3720136518771331, + "acc_stderr": 0.014124597881844461, + "acc_norm": 0.4061433447098976, + "acc_norm_stderr": 0.01435165669009786 + }, + "harness|hellaswag|10": { + "acc": 0.4963154750049791, + "acc_stderr": 0.004989645929811438, + "acc_norm": 0.6772555267874926, + "acc_norm_stderr": 0.004665704208339031 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.31, + "acc_stderr": 0.04648231987117316, + "acc_norm": 0.31, + "acc_norm_stderr": 0.04648231987117316 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.4222222222222222, + "acc_stderr": 0.04266763404099582, + "acc_norm": 0.4222222222222222, + "acc_norm_stderr": 0.04266763404099582 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.40131578947368424, + "acc_stderr": 0.03988903703336283, + "acc_norm": 0.40131578947368424, + "acc_norm_stderr": 0.03988903703336283 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.31, + "acc_stderr": 0.04648231987117316, + "acc_norm": 0.31, + "acc_norm_stderr": 0.04648231987117316 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.32452830188679244, + "acc_stderr": 0.028815615713432115, + "acc_norm": 0.32452830188679244, + "acc_norm_stderr": 0.028815615713432115 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.2916666666666667, + "acc_stderr": 0.03800968060554858, + "acc_norm": 0.2916666666666667, + "acc_norm_stderr": 0.03800968060554858 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.19, + "acc_stderr": 0.039427724440366234, + "acc_norm": 0.19, + "acc_norm_stderr": 0.039427724440366234 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.3, + "acc_stderr": 0.046056618647183814, + "acc_norm": 0.3, + "acc_norm_stderr": 0.046056618647183814 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.33, + "acc_stderr": 0.04725815626252604, + "acc_norm": 0.33, + "acc_norm_stderr": 0.04725815626252604 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.28901734104046245, + "acc_stderr": 0.034564257450869995, + "acc_norm": 0.28901734104046245, + "acc_norm_stderr": 0.034564257450869995 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.22549019607843138, + "acc_stderr": 0.041583075330832865, + "acc_norm": 0.22549019607843138, + "acc_norm_stderr": 0.041583075330832865 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.52, + "acc_stderr": 0.050211673156867795, + "acc_norm": 0.52, + "acc_norm_stderr": 0.050211673156867795 + }, + 
"harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.3574468085106383, + "acc_stderr": 0.03132941789476425, + "acc_norm": 0.3574468085106383, + "acc_norm_stderr": 0.03132941789476425 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.21929824561403508, + "acc_stderr": 0.03892431106518752, + "acc_norm": 0.21929824561403508, + "acc_norm_stderr": 0.03892431106518752 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.3724137931034483, + "acc_stderr": 0.0402873153294756, + "acc_norm": 0.3724137931034483, + "acc_norm_stderr": 0.0402873153294756 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.2275132275132275, + "acc_stderr": 0.021591269407823778, + "acc_norm": 0.2275132275132275, + "acc_norm_stderr": 0.021591269407823778 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.18253968253968253, + "acc_stderr": 0.0345507101910215, + "acc_norm": 0.18253968253968253, + "acc_norm_stderr": 0.0345507101910215 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.29, + "acc_stderr": 0.04560480215720683, + "acc_norm": 0.29, + "acc_norm_stderr": 0.04560480215720683 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.34838709677419355, + "acc_stderr": 0.027104826328100944, + "acc_norm": 0.34838709677419355, + "acc_norm_stderr": 0.027104826328100944 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.2512315270935961, + "acc_stderr": 0.030516530732694433, + "acc_norm": 0.2512315270935961, + "acc_norm_stderr": 0.030516530732694433 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.33, + "acc_stderr": 0.047258156262526045, + "acc_norm": 0.33, + "acc_norm_stderr": 0.047258156262526045 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.40606060606060607, + "acc_stderr": 0.03834816355401181, + "acc_norm": 0.40606060606060607, + "acc_norm_stderr": 0.03834816355401181 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.36363636363636365, + "acc_stderr": 0.03427308652999934, + "acc_norm": 0.36363636363636365, + "acc_norm_stderr": 0.03427308652999934 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.36787564766839376, + "acc_stderr": 0.03480175668466037, + "acc_norm": 0.36787564766839376, + "acc_norm_stderr": 0.03480175668466037 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.3333333333333333, + "acc_stderr": 0.023901157979402527, + "acc_norm": 0.3333333333333333, + "acc_norm_stderr": 0.023901157979402527 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.2777777777777778, + "acc_stderr": 0.027309140588230182, + "acc_norm": 0.2777777777777778, + "acc_norm_stderr": 0.027309140588230182 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.3067226890756303, + "acc_stderr": 0.02995382389188705, + "acc_norm": 0.3067226890756303, + "acc_norm_stderr": 0.02995382389188705 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.25165562913907286, + "acc_stderr": 0.035433042343899844, + "acc_norm": 0.25165562913907286, + "acc_norm_stderr": 0.035433042343899844 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.3302752293577982, + "acc_stderr": 0.02016446633634298, + "acc_norm": 0.3302752293577982, + "acc_norm_stderr": 0.02016446633634298 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.17592592592592593, + "acc_stderr": 0.02596742095825853, + "acc_norm": 0.17592592592592593, + "acc_norm_stderr": 0.02596742095825853 
+ }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.38235294117647056, + "acc_stderr": 0.03410785338904719, + "acc_norm": 0.38235294117647056, + "acc_norm_stderr": 0.03410785338904719 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.3924050632911392, + "acc_stderr": 0.03178471874564729, + "acc_norm": 0.3924050632911392, + "acc_norm_stderr": 0.03178471874564729 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.42152466367713004, + "acc_stderr": 0.03314190222110657, + "acc_norm": 0.42152466367713004, + "acc_norm_stderr": 0.03314190222110657 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.3435114503816794, + "acc_stderr": 0.041649760719448786, + "acc_norm": 0.3435114503816794, + "acc_norm_stderr": 0.041649760719448786 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.48760330578512395, + "acc_stderr": 0.045629515481807666, + "acc_norm": 0.48760330578512395, + "acc_norm_stderr": 0.045629515481807666 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.4074074074074074, + "acc_stderr": 0.04750077341199986, + "acc_norm": 0.4074074074074074, + "acc_norm_stderr": 0.04750077341199986 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.3496932515337423, + "acc_stderr": 0.03746668325470021, + "acc_norm": 0.3496932515337423, + "acc_norm_stderr": 0.03746668325470021 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.3482142857142857, + "acc_stderr": 0.04521829902833586, + "acc_norm": 0.3482142857142857, + "acc_norm_stderr": 0.04521829902833586 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.3786407766990291, + "acc_stderr": 0.04802694698258973, + "acc_norm": 0.3786407766990291, + "acc_norm_stderr": 0.04802694698258973 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.43162393162393164, + "acc_stderr": 0.0324483553531149, + "acc_norm": 0.43162393162393164, + "acc_norm_stderr": 0.0324483553531149 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.37, + "acc_stderr": 0.048523658709391, + "acc_norm": 0.37, + "acc_norm_stderr": 0.048523658709391 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.39080459770114945, + "acc_stderr": 0.017448366067062526, + "acc_norm": 0.39080459770114945, + "acc_norm_stderr": 0.017448366067062526 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.3583815028901734, + "acc_stderr": 0.0258167567915842, + "acc_norm": 0.3583815028901734, + "acc_norm_stderr": 0.0258167567915842 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.23575418994413408, + "acc_stderr": 0.014196375686290804, + "acc_norm": 0.23575418994413408, + "acc_norm_stderr": 0.014196375686290804 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.42483660130718953, + "acc_stderr": 0.028304576673141107, + "acc_norm": 0.42483660130718953, + "acc_norm_stderr": 0.028304576673141107 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.34726688102893893, + "acc_stderr": 0.027040745502307336, + "acc_norm": 0.34726688102893893, + "acc_norm_stderr": 0.027040745502307336 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.3487654320987654, + "acc_stderr": 0.02651759772446501, + "acc_norm": 0.3487654320987654, + "acc_norm_stderr": 0.02651759772446501 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.33687943262411346, + "acc_stderr": 0.02819553487396673, + "acc_norm": 0.33687943262411346, + "acc_norm_stderr": 0.02819553487396673 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.3324641460234681, + 
"acc_stderr": 0.01203202233226052, + "acc_norm": 0.3324641460234681, + "acc_norm_stderr": 0.01203202233226052 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.2426470588235294, + "acc_stderr": 0.026040662474201257, + "acc_norm": 0.2426470588235294, + "acc_norm_stderr": 0.026040662474201257 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.31862745098039214, + "acc_stderr": 0.018850084696468712, + "acc_norm": 0.31862745098039214, + "acc_norm_stderr": 0.018850084696468712 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.33636363636363636, + "acc_stderr": 0.04525393596302505, + "acc_norm": 0.33636363636363636, + "acc_norm_stderr": 0.04525393596302505 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.39591836734693875, + "acc_stderr": 0.03130802899065686, + "acc_norm": 0.39591836734693875, + "acc_norm_stderr": 0.03130802899065686 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.42786069651741293, + "acc_stderr": 0.03498541988407795, + "acc_norm": 0.42786069651741293, + "acc_norm_stderr": 0.03498541988407795 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.47, + "acc_stderr": 0.05016135580465919, + "acc_norm": 0.47, + "acc_norm_stderr": 0.05016135580465919 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.3614457831325301, + "acc_stderr": 0.037400593820293204, + "acc_norm": 0.3614457831325301, + "acc_norm_stderr": 0.037400593820293204 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.38596491228070173, + "acc_stderr": 0.03733756969066164, + "acc_norm": 0.38596491228070173, + "acc_norm_stderr": 0.03733756969066164 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.26560587515299877, + "mc1_stderr": 0.015461027627253592, + "mc2": 0.42764768101969397, + "mc2_stderr": 0.015172444186480637 + }, + "all": { + "acc": 0.3424003558258833, + "acc_stderr": 0.0341401669278609, + "acc_norm": 0.34604560573461685, + "acc_norm_stderr": 0.034138524844586, + "mc1": 0.26560587515299877, + "mc1_stderr": 0.015461027627253592, + "mc2": 0.42764768101969397, + "mc2_stderr": 0.015172444186480637 + } + }, + "versions": { + "harness|arc:challenge|25": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + 
"harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "all": 0 + }, + "config_general": { + "model_name": "AlekseyKorshuk/pygmalion-6b-vicuna-chatml", + "model_sha": "ee3ada91a69a194cedfabbfeab98f1499b75cb44", + "model_dtype": "torch.float16", + "lighteval_sha": "03c2fad20ff7f5334c33cfee459024b8d7e4a109", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null + }, + "config_tasks": { + "harness|arc:challenge": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + 
"harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task" + }, + "summary_tasks": { + "harness|arc:challenge|25": { + "hashes": { + "hash_examples": "17b0cae357c0259e", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "1b78325b154497a6", + "hash_cont_tokens": "c6e2e25e2b25a621" + }, + "truncated": 0, + "non-truncated": 4687, + "padded": 4685, + "non-padded": 2, + "effective_few_shots": 25.0, + "num_truncated_few_shots": 0 + }, + "harness|hellaswag|10": { + "hashes": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "97de5fb5652ec7fa", + "hash_cont_tokens": "8ad5f1a3e4068f36" + }, + "truncated": 0, + "non-truncated": 40168, + "padded": 40045, + "non-padded": 123, + "effective_few_shots": 10.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hashes": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "38f6980885e34dfd", + "hash_cont_tokens": "844bd0bf669e8136" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-anatomy|5": { + "hashes": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "3ed9431cd09b2a53", + "hash_cont_tokens": 
"aa3ffb1a6e4356f5" + }, + "truncated": 0, + "non-truncated": 540, + "padded": 540, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-astronomy|5": { + "hashes": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "a79fd75ecff4dacc", + "hash_cont_tokens": "ca7527d5bdfd389a" + }, + "truncated": 0, + "non-truncated": 608, + "padded": 608, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-business_ethics|5": { + "hashes": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "178d5666661bf5e1", + "hash_cont_tokens": "08a1fa6c8dde9a82" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hashes": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "c926698f7ff06973", + "hash_cont_tokens": "cd61f7de0830a75a" + }, + "truncated": 0, + "non-truncated": 1060, + "padded": 1060, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_biology|5": { + "hashes": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "242f772c5e78312a", + "hash_cont_tokens": "b0c14ed86adbcb56" + }, + "truncated": 0, + "non-truncated": 576, + "padded": 568, + "non-padded": 8, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_chemistry|5": { + "hashes": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "8502d8627d2d7aad", + "hash_cont_tokens": "844bd0bf669e8136" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_computer_science|5": { + "hashes": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "8bf46ce3a98e6e3f", + "hash_cont_tokens": "3cf1924b14cbf906" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_mathematics|5": { + "hashes": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "ff09ef7f164943cd", + "hash_cont_tokens": "d09bf08193410dfa" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_medicine|5": { + "hashes": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "af38d1bbc0517ac5", + "hash_cont_tokens": "62bb469d2a319d91" + }, + "truncated": 0, + "non-truncated": 692, + "padded": 680, + "non-padded": 12, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_physics|5": { + "hashes": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "c4240f372187f487", + "hash_cont_tokens": "bf103c9a1f61ec12" + }, + "truncated": 0, + "non-truncated": 408, + "padded": 404, + "non-padded": 4, + "effective_few_shots": 5.0, + 
"num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-computer_security|5": { + "hashes": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "70a866a1c6ae11ae", + "hash_cont_tokens": "844bd0bf669e8136" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hashes": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "29b68a5b3f3afa5f", + "hash_cont_tokens": "ff5ca3d84bb47a0b" + }, + "truncated": 0, + "non-truncated": 940, + "padded": 940, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-econometrics|5": { + "hashes": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "a4a0fc579875cdf9", + "hash_cont_tokens": "f3ed369e135c0e74" + }, + "truncated": 0, + "non-truncated": 456, + "padded": 456, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hashes": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "e1c0ec634eb17ebd", + "hash_cont_tokens": "35bf6c0c1a7ee403" + }, + "truncated": 0, + "non-truncated": 580, + "padded": 580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hashes": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "542453ad0f99dacf", + "hash_cont_tokens": "e69647d0f0359a4e" + }, + "truncated": 0, + "non-truncated": 1512, + "padded": 1488, + "non-padded": 24, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-formal_logic|5": { + "hashes": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "dacff0458f665ef2", + "hash_cont_tokens": "2ef491ecaa0b411b" + }, + "truncated": 0, + "non-truncated": 504, + "padded": 504, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-global_facts|5": { + "hashes": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "61dec75d557c2e93", + "hash_cont_tokens": "844bd0bf669e8136" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_biology|5": { + "hashes": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "d0afdf91820cacc8", + "hash_cont_tokens": "2f65e8345a68d860" + }, + "truncated": 0, + "non-truncated": 1240, + "padded": 1240, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hashes": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "75cd47b5490da17b", + "hash_cont_tokens": "c3deabee1deab3a3" + }, + "truncated": 0, + "non-truncated": 812, + "padded": 796, + "non-padded": 16, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hashes": { + "hash_examples": 
"8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "6c6256000dbf914a", + "hash_cont_tokens": "ec161287ac6222f4" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hashes": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "3e24478a8854bd77", + "hash_cont_tokens": "c4f2565ca36881d5" + }, + "truncated": 660, + "non-truncated": 0, + "padded": 0, + "non-padded": 660, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_geography|5": { + "hashes": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "a4866b51f8a7a60e", + "hash_cont_tokens": "780e569058de22be" + }, + "truncated": 0, + "non-truncated": 792, + "padded": 792, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hashes": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "90f755f89d9fdf5e", + "hash_cont_tokens": "9da45062757ae791" + }, + "truncated": 0, + "non-truncated": 772, + "padded": 772, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hashes": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "fb590ff6d9d11883", + "hash_cont_tokens": "8f5c8baf02161f10" + }, + "truncated": 0, + "non-truncated": 1560, + "padded": 1560, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hashes": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "551dbc75535ad2b8", + "hash_cont_tokens": "fdea101837ab4409" + }, + "truncated": 0, + "non-truncated": 1080, + "padded": 1080, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hashes": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "d86fdf5706ec717c", + "hash_cont_tokens": "985403b262df21a4" + }, + "truncated": 0, + "non-truncated": 952, + "padded": 940, + "non-padded": 12, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_physics|5": { + "hashes": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "a81bca26abd92c41", + "hash_cont_tokens": "56be0c12b78c81a3" + }, + "truncated": 0, + "non-truncated": 604, + "padded": 604, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hashes": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "9c10077b5cda495b", + "hash_cont_tokens": "f524cf6fe64b2a7f" + }, + "truncated": 0, + "non-truncated": 2180, + "padded": 2180, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hashes": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + 
"hash_input_tokens": "092923836e135996", + "hash_cont_tokens": "421b3dc903711e3d" + }, + "truncated": 0, + "non-truncated": 864, + "padded": 864, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hashes": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "4ab213491f557f31", + "hash_cont_tokens": "eab825cf8fbdd085" + }, + "truncated": 816, + "non-truncated": 0, + "padded": 0, + "non-padded": 816, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hashes": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "2a04fb615e6717ea", + "hash_cont_tokens": "e1610a0b694e7b3a" + }, + "truncated": 0, + "non-truncated": 948, + "padded": 948, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_aging|5": { + "hashes": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "39da19ee58ce07e6", + "hash_cont_tokens": "38eafdb22e9fca11" + }, + "truncated": 0, + "non-truncated": 892, + "padded": 892, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_sexuality|5": { + "hashes": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "f7e0441ab1c223e0", + "hash_cont_tokens": "11de075f88fc7cd2" + }, + "truncated": 0, + "non-truncated": 524, + "padded": 524, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-international_law|5": { + "hashes": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "119859c5b8103d0b", + "hash_cont_tokens": "0229c63f045574c2" + }, + "truncated": 0, + "non-truncated": 484, + "padded": 484, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-jurisprudence|5": { + "hashes": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "6ec4910e741606cb", + "hash_cont_tokens": "5c77c6f472688075" + }, + "truncated": 0, + "non-truncated": 432, + "padded": 432, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hashes": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "96d8b2554f777e3a", + "hash_cont_tokens": "25a46284b3589e0d" + }, + "truncated": 0, + "non-truncated": 652, + "padded": 636, + "non-padded": 16, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-machine_learning|5": { + "hashes": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "249811a7d891a411", + "hash_cont_tokens": "d11f2c877fe691dc" + }, + "truncated": 0, + "non-truncated": 448, + "padded": 448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-management|5": { + "hashes": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "e54df495ffeb4f92", + "hash_cont_tokens": "d37808f586a9e9b5" + }, + "truncated": 0, + "non-truncated": 412, + "padded": 412, + 
"non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-marketing|5": { + "hashes": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "e9110fe64f420eb5", + "hash_cont_tokens": "95faf210efa02f90" + }, + "truncated": 0, + "non-truncated": 936, + "padded": 936, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-medical_genetics|5": { + "hashes": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "743df5701590c1c5", + "hash_cont_tokens": "844bd0bf669e8136" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-miscellaneous|5": { + "hashes": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "4a20a40ea36bad2d", + "hash_cont_tokens": "ef1ae838a09a7521" + }, + "truncated": 0, + "non-truncated": 3132, + "padded": 3132, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_disputes|5": { + "hashes": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "10886977e5516586", + "hash_cont_tokens": "05c35d0e7dd2c7d4" + }, + "truncated": 0, + "non-truncated": 1384, + "padded": 1372, + "non-padded": 12, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hashes": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "66f56ab7c3b9d662", + "hash_cont_tokens": "f1e9e326e9540108" + }, + "truncated": 0, + "non-truncated": 3580, + "padded": 3580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-nutrition|5": { + "hashes": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "c05c54560499ea35", + "hash_cont_tokens": "027ac34198453c9e" + }, + "truncated": 0, + "non-truncated": 1224, + "padded": 1224, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-philosophy|5": { + "hashes": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "9639c3d92ff98a28", + "hash_cont_tokens": "dddff9925c9b675a" + }, + "truncated": 0, + "non-truncated": 1244, + "padded": 1244, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-prehistory|5": { + "hashes": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "91e98834c3a8d8d9", + "hash_cont_tokens": "030e5bb46551865c" + }, + "truncated": 0, + "non-truncated": 1296, + "padded": 1296, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_accounting|5": { + "hashes": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "569fa47691c73088", + "hash_cont_tokens": "42b23299e8bae480" + }, + "truncated": 0, + "non-truncated": 1128, + "padded": 1124, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_law|5": { + "hashes": { + "hash_examples": 
"cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "999e8c7cf55b590c", + "hash_cont_tokens": "a2de48df0afbaff7" + }, + "truncated": 16, + "non-truncated": 6120, + "padded": 6120, + "non-padded": 16, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_medicine|5": { + "hashes": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "cb68733b835e69f0", + "hash_cont_tokens": "33dc7eccd5de31ae" + }, + "truncated": 0, + "non-truncated": 1088, + "padded": 1088, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_psychology|5": { + "hashes": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "3aa766c029099569", + "hash_cont_tokens": "2a666dc39f1f52ac" + }, + "truncated": 0, + "non-truncated": 2448, + "padded": 2448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-public_relations|5": { + "hashes": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "87b924f88832986f", + "hash_cont_tokens": "cf3600a50782c6c5" + }, + "truncated": 0, + "non-truncated": 440, + "padded": 440, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-security_studies|5": { + "hashes": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "c2b75c24a925a416", + "hash_cont_tokens": "2e9916279a4cae95" + }, + "truncated": 0, + "non-truncated": 980, + "padded": 980, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-sociology|5": { + "hashes": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "fb555df6139eb2c8", + "hash_cont_tokens": "555f7a55738bbf37" + }, + "truncated": 0, + "non-truncated": 804, + "padded": 800, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hashes": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "56cf1eebb25eccb1", + "hash_cont_tokens": "844bd0bf669e8136" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-virology|5": { + "hashes": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "c6affac16ec860be", + "hash_cont_tokens": "30d4fa4828c5468f" + }, + "truncated": 0, + "non-truncated": 664, + "padded": 664, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-world_religions|5": { + "hashes": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "d2c5da5a69a6312e", + "hash_cont_tokens": "984061eb58124367" + }, + "truncated": 0, + "non-truncated": 684, + "padded": 684, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|truthfulqa:mc|0": { + "hashes": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "21ee2f46c9c3649e", + "hash_cont_tokens": "f41d0880e9a23f4e" + }, + "truncated": 0, + 
"non-truncated": 9996, + "padded": 9996, + "non-padded": 0, + "effective_few_shots": 0.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "d84d18e9a963753d", + "hash_full_prompts": "12b540783521a8e6", + "hash_input_tokens": "0893dfcb83435e7d", + "hash_cont_tokens": "6159bf1904a8c8fb" + }, + "total_evaluation_time_secondes": "2375.0004572868347", + "truncated": 1492, + "non-truncated": 109527, + "padded": 109290, + "non-padded": 1729, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/AlekseyKorshuk/pygmalion-6b-vicuna-chatml/results_2023-08-01T14:16:25.052724.json b/eval-results/AlekseyKorshuk/pygmalion-6b-vicuna-chatml/results_2023-08-01T14:16:25.052724.json new file mode 100644 index 0000000000000000000000000000000000000000..141d855c0400b980a79cb784a3dcfa30563626fa --- /dev/null +++ b/eval-results/AlekseyKorshuk/pygmalion-6b-vicuna-chatml/results_2023-08-01T14:16:25.052724.json @@ -0,0 +1,1365 @@ +{ + "results": { + "harness|arc:challenge|25": { + "acc": 0.3720136518771331, + "acc_stderr": 0.014124597881844461, + "acc_norm": 0.4061433447098976, + "acc_norm_stderr": 0.01435165669009786 + }, + "harness|hellaswag|10": { + "acc": 0.4963154750049791, + "acc_stderr": 0.004989645929811438, + "acc_norm": 0.6772555267874926, + "acc_norm_stderr": 0.004665704208339031 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.31, + "acc_stderr": 0.04648231987117316, + "acc_norm": 0.31, + "acc_norm_stderr": 0.04648231987117316 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.4222222222222222, + "acc_stderr": 0.04266763404099582, + "acc_norm": 0.4222222222222222, + "acc_norm_stderr": 0.04266763404099582 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.40131578947368424, + "acc_stderr": 0.03988903703336283, + "acc_norm": 0.40131578947368424, + "acc_norm_stderr": 0.03988903703336283 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.31, + "acc_stderr": 0.04648231987117316, + "acc_norm": 0.31, + "acc_norm_stderr": 0.04648231987117316 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.32452830188679244, + "acc_stderr": 0.028815615713432115, + "acc_norm": 0.32452830188679244, + "acc_norm_stderr": 0.028815615713432115 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.2916666666666667, + "acc_stderr": 0.03800968060554858, + "acc_norm": 0.2916666666666667, + "acc_norm_stderr": 0.03800968060554858 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.19, + "acc_stderr": 0.039427724440366234, + "acc_norm": 0.19, + "acc_norm_stderr": 0.039427724440366234 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.3, + "acc_stderr": 0.046056618647183814, + "acc_norm": 0.3, + "acc_norm_stderr": 0.046056618647183814 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.33, + "acc_stderr": 0.04725815626252604, + "acc_norm": 0.33, + "acc_norm_stderr": 0.04725815626252604 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.28901734104046245, + "acc_stderr": 0.034564257450869995, + "acc_norm": 0.28901734104046245, + "acc_norm_stderr": 0.034564257450869995 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.22549019607843138, + "acc_stderr": 0.041583075330832865, + "acc_norm": 0.22549019607843138, + "acc_norm_stderr": 0.041583075330832865 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.52, + "acc_stderr": 0.050211673156867795, + "acc_norm": 0.52, + "acc_norm_stderr": 0.050211673156867795 + }, + 
"harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.3574468085106383, + "acc_stderr": 0.03132941789476425, + "acc_norm": 0.3574468085106383, + "acc_norm_stderr": 0.03132941789476425 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.21929824561403508, + "acc_stderr": 0.03892431106518752, + "acc_norm": 0.21929824561403508, + "acc_norm_stderr": 0.03892431106518752 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.3724137931034483, + "acc_stderr": 0.0402873153294756, + "acc_norm": 0.3724137931034483, + "acc_norm_stderr": 0.0402873153294756 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.2275132275132275, + "acc_stderr": 0.021591269407823778, + "acc_norm": 0.2275132275132275, + "acc_norm_stderr": 0.021591269407823778 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.18253968253968253, + "acc_stderr": 0.0345507101910215, + "acc_norm": 0.18253968253968253, + "acc_norm_stderr": 0.0345507101910215 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.29, + "acc_stderr": 0.04560480215720683, + "acc_norm": 0.29, + "acc_norm_stderr": 0.04560480215720683 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.34838709677419355, + "acc_stderr": 0.027104826328100944, + "acc_norm": 0.34838709677419355, + "acc_norm_stderr": 0.027104826328100944 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.2512315270935961, + "acc_stderr": 0.030516530732694433, + "acc_norm": 0.2512315270935961, + "acc_norm_stderr": 0.030516530732694433 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.33, + "acc_stderr": 0.047258156262526045, + "acc_norm": 0.33, + "acc_norm_stderr": 0.047258156262526045 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.40606060606060607, + "acc_stderr": 0.03834816355401181, + "acc_norm": 0.40606060606060607, + "acc_norm_stderr": 0.03834816355401181 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.36363636363636365, + "acc_stderr": 0.03427308652999934, + "acc_norm": 0.36363636363636365, + "acc_norm_stderr": 0.03427308652999934 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.36787564766839376, + "acc_stderr": 0.03480175668466037, + "acc_norm": 0.36787564766839376, + "acc_norm_stderr": 0.03480175668466037 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.3333333333333333, + "acc_stderr": 0.023901157979402527, + "acc_norm": 0.3333333333333333, + "acc_norm_stderr": 0.023901157979402527 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.2777777777777778, + "acc_stderr": 0.027309140588230182, + "acc_norm": 0.2777777777777778, + "acc_norm_stderr": 0.027309140588230182 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.3067226890756303, + "acc_stderr": 0.02995382389188705, + "acc_norm": 0.3067226890756303, + "acc_norm_stderr": 0.02995382389188705 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.25165562913907286, + "acc_stderr": 0.035433042343899844, + "acc_norm": 0.25165562913907286, + "acc_norm_stderr": 0.035433042343899844 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.3302752293577982, + "acc_stderr": 0.02016446633634298, + "acc_norm": 0.3302752293577982, + "acc_norm_stderr": 0.02016446633634298 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.17592592592592593, + "acc_stderr": 0.02596742095825853, + "acc_norm": 0.17592592592592593, + "acc_norm_stderr": 0.02596742095825853 
+ }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.38235294117647056, + "acc_stderr": 0.03410785338904719, + "acc_norm": 0.38235294117647056, + "acc_norm_stderr": 0.03410785338904719 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.3924050632911392, + "acc_stderr": 0.03178471874564729, + "acc_norm": 0.3924050632911392, + "acc_norm_stderr": 0.03178471874564729 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.42152466367713004, + "acc_stderr": 0.03314190222110657, + "acc_norm": 0.42152466367713004, + "acc_norm_stderr": 0.03314190222110657 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.3435114503816794, + "acc_stderr": 0.041649760719448786, + "acc_norm": 0.3435114503816794, + "acc_norm_stderr": 0.041649760719448786 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.48760330578512395, + "acc_stderr": 0.045629515481807666, + "acc_norm": 0.48760330578512395, + "acc_norm_stderr": 0.045629515481807666 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.4074074074074074, + "acc_stderr": 0.04750077341199986, + "acc_norm": 0.4074074074074074, + "acc_norm_stderr": 0.04750077341199986 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.3496932515337423, + "acc_stderr": 0.03746668325470021, + "acc_norm": 0.3496932515337423, + "acc_norm_stderr": 0.03746668325470021 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.3482142857142857, + "acc_stderr": 0.04521829902833586, + "acc_norm": 0.3482142857142857, + "acc_norm_stderr": 0.04521829902833586 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.3786407766990291, + "acc_stderr": 0.04802694698258973, + "acc_norm": 0.3786407766990291, + "acc_norm_stderr": 0.04802694698258973 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.43162393162393164, + "acc_stderr": 0.0324483553531149, + "acc_norm": 0.43162393162393164, + "acc_norm_stderr": 0.0324483553531149 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.37, + "acc_stderr": 0.048523658709391, + "acc_norm": 0.37, + "acc_norm_stderr": 0.048523658709391 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.39080459770114945, + "acc_stderr": 0.017448366067062526, + "acc_norm": 0.39080459770114945, + "acc_norm_stderr": 0.017448366067062526 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.3583815028901734, + "acc_stderr": 0.0258167567915842, + "acc_norm": 0.3583815028901734, + "acc_norm_stderr": 0.0258167567915842 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.23575418994413408, + "acc_stderr": 0.014196375686290804, + "acc_norm": 0.23575418994413408, + "acc_norm_stderr": 0.014196375686290804 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.42483660130718953, + "acc_stderr": 0.028304576673141107, + "acc_norm": 0.42483660130718953, + "acc_norm_stderr": 0.028304576673141107 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.34726688102893893, + "acc_stderr": 0.027040745502307336, + "acc_norm": 0.34726688102893893, + "acc_norm_stderr": 0.027040745502307336 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.3487654320987654, + "acc_stderr": 0.02651759772446501, + "acc_norm": 0.3487654320987654, + "acc_norm_stderr": 0.02651759772446501 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.33687943262411346, + "acc_stderr": 0.02819553487396673, + "acc_norm": 0.33687943262411346, + "acc_norm_stderr": 0.02819553487396673 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.3324641460234681, + 
"acc_stderr": 0.01203202233226052, + "acc_norm": 0.3324641460234681, + "acc_norm_stderr": 0.01203202233226052 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.2426470588235294, + "acc_stderr": 0.026040662474201257, + "acc_norm": 0.2426470588235294, + "acc_norm_stderr": 0.026040662474201257 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.31862745098039214, + "acc_stderr": 0.018850084696468712, + "acc_norm": 0.31862745098039214, + "acc_norm_stderr": 0.018850084696468712 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.33636363636363636, + "acc_stderr": 0.04525393596302505, + "acc_norm": 0.33636363636363636, + "acc_norm_stderr": 0.04525393596302505 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.39591836734693875, + "acc_stderr": 0.03130802899065686, + "acc_norm": 0.39591836734693875, + "acc_norm_stderr": 0.03130802899065686 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.42786069651741293, + "acc_stderr": 0.03498541988407795, + "acc_norm": 0.42786069651741293, + "acc_norm_stderr": 0.03498541988407795 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.47, + "acc_stderr": 0.05016135580465919, + "acc_norm": 0.47, + "acc_norm_stderr": 0.05016135580465919 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.3614457831325301, + "acc_stderr": 0.037400593820293204, + "acc_norm": 0.3614457831325301, + "acc_norm_stderr": 0.037400593820293204 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.38596491228070173, + "acc_stderr": 0.03733756969066164, + "acc_norm": 0.38596491228070173, + "acc_norm_stderr": 0.03733756969066164 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.26560587515299877, + "mc1_stderr": 0.015461027627253592, + "mc2": 0.42764768101969397, + "mc2_stderr": 0.015172444186480637 + }, + "all": { + "acc": 0.3424003558258833, + "acc_stderr": 0.0341401669278609, + "acc_norm": 0.34604560573461685, + "acc_norm_stderr": 0.034138524844586, + "mc1": 0.26560587515299877, + "mc1_stderr": 0.015461027627253592, + "mc2": 0.42764768101969397, + "mc2_stderr": 0.015172444186480637 + } + }, + "versions": { + "harness|arc:challenge|25": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + 
"harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "all": 0 + }, + "config_general": { + "model_name": "AlekseyKorshuk/pygmalion-6b-vicuna-chatml", + "model_sha": "ee3ada91a69a194cedfabbfeab98f1499b75cb44", + "model_dtype": "torch.float16", + "lighteval_sha": "03c2fad20ff7f5334c33cfee459024b8d7e4a109", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null + }, + "config_tasks": { + "harness|arc:challenge": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + 
"harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task" + }, + "summary_tasks": { + "harness|arc:challenge|25": { + "hashes": { + "hash_examples": "17b0cae357c0259e", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "1b78325b154497a6", + "hash_cont_tokens": "c6e2e25e2b25a621" + }, + "truncated": 0, + "non-truncated": 4687, + "padded": 4685, + "non-padded": 2, + "effective_few_shots": 25.0, + "num_truncated_few_shots": 0 + }, + "harness|hellaswag|10": { + "hashes": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "97de5fb5652ec7fa", + "hash_cont_tokens": "8ad5f1a3e4068f36" + }, + "truncated": 0, + "non-truncated": 40168, + "padded": 40045, + "non-padded": 123, + "effective_few_shots": 10.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hashes": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "38f6980885e34dfd", + "hash_cont_tokens": "844bd0bf669e8136" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-anatomy|5": { + "hashes": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "3ed9431cd09b2a53", + "hash_cont_tokens": 
"aa3ffb1a6e4356f5" + }, + "truncated": 0, + "non-truncated": 540, + "padded": 540, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-astronomy|5": { + "hashes": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "a79fd75ecff4dacc", + "hash_cont_tokens": "ca7527d5bdfd389a" + }, + "truncated": 0, + "non-truncated": 608, + "padded": 608, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-business_ethics|5": { + "hashes": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "178d5666661bf5e1", + "hash_cont_tokens": "08a1fa6c8dde9a82" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hashes": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "c926698f7ff06973", + "hash_cont_tokens": "cd61f7de0830a75a" + }, + "truncated": 0, + "non-truncated": 1060, + "padded": 1060, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_biology|5": { + "hashes": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "242f772c5e78312a", + "hash_cont_tokens": "b0c14ed86adbcb56" + }, + "truncated": 0, + "non-truncated": 576, + "padded": 568, + "non-padded": 8, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_chemistry|5": { + "hashes": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "8502d8627d2d7aad", + "hash_cont_tokens": "844bd0bf669e8136" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_computer_science|5": { + "hashes": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "8bf46ce3a98e6e3f", + "hash_cont_tokens": "3cf1924b14cbf906" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_mathematics|5": { + "hashes": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "ff09ef7f164943cd", + "hash_cont_tokens": "d09bf08193410dfa" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_medicine|5": { + "hashes": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "af38d1bbc0517ac5", + "hash_cont_tokens": "62bb469d2a319d91" + }, + "truncated": 0, + "non-truncated": 692, + "padded": 680, + "non-padded": 12, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_physics|5": { + "hashes": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "c4240f372187f487", + "hash_cont_tokens": "bf103c9a1f61ec12" + }, + "truncated": 0, + "non-truncated": 408, + "padded": 404, + "non-padded": 4, + "effective_few_shots": 5.0, + 
"num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-computer_security|5": { + "hashes": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "70a866a1c6ae11ae", + "hash_cont_tokens": "844bd0bf669e8136" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hashes": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "29b68a5b3f3afa5f", + "hash_cont_tokens": "ff5ca3d84bb47a0b" + }, + "truncated": 0, + "non-truncated": 940, + "padded": 940, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-econometrics|5": { + "hashes": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "a4a0fc579875cdf9", + "hash_cont_tokens": "f3ed369e135c0e74" + }, + "truncated": 0, + "non-truncated": 456, + "padded": 456, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hashes": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "e1c0ec634eb17ebd", + "hash_cont_tokens": "35bf6c0c1a7ee403" + }, + "truncated": 0, + "non-truncated": 580, + "padded": 580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hashes": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "542453ad0f99dacf", + "hash_cont_tokens": "e69647d0f0359a4e" + }, + "truncated": 0, + "non-truncated": 1512, + "padded": 1488, + "non-padded": 24, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-formal_logic|5": { + "hashes": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "dacff0458f665ef2", + "hash_cont_tokens": "2ef491ecaa0b411b" + }, + "truncated": 0, + "non-truncated": 504, + "padded": 504, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-global_facts|5": { + "hashes": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "61dec75d557c2e93", + "hash_cont_tokens": "844bd0bf669e8136" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_biology|5": { + "hashes": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "d0afdf91820cacc8", + "hash_cont_tokens": "2f65e8345a68d860" + }, + "truncated": 0, + "non-truncated": 1240, + "padded": 1240, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hashes": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "75cd47b5490da17b", + "hash_cont_tokens": "c3deabee1deab3a3" + }, + "truncated": 0, + "non-truncated": 812, + "padded": 796, + "non-padded": 16, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hashes": { + "hash_examples": 
"8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "6c6256000dbf914a", + "hash_cont_tokens": "ec161287ac6222f4" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hashes": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "3e24478a8854bd77", + "hash_cont_tokens": "c4f2565ca36881d5" + }, + "truncated": 660, + "non-truncated": 0, + "padded": 0, + "non-padded": 660, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_geography|5": { + "hashes": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "a4866b51f8a7a60e", + "hash_cont_tokens": "780e569058de22be" + }, + "truncated": 0, + "non-truncated": 792, + "padded": 792, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hashes": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "90f755f89d9fdf5e", + "hash_cont_tokens": "9da45062757ae791" + }, + "truncated": 0, + "non-truncated": 772, + "padded": 772, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hashes": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "fb590ff6d9d11883", + "hash_cont_tokens": "8f5c8baf02161f10" + }, + "truncated": 0, + "non-truncated": 1560, + "padded": 1560, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hashes": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "551dbc75535ad2b8", + "hash_cont_tokens": "fdea101837ab4409" + }, + "truncated": 0, + "non-truncated": 1080, + "padded": 1080, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hashes": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "d86fdf5706ec717c", + "hash_cont_tokens": "985403b262df21a4" + }, + "truncated": 0, + "non-truncated": 952, + "padded": 940, + "non-padded": 12, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_physics|5": { + "hashes": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "a81bca26abd92c41", + "hash_cont_tokens": "56be0c12b78c81a3" + }, + "truncated": 0, + "non-truncated": 604, + "padded": 604, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hashes": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "9c10077b5cda495b", + "hash_cont_tokens": "f524cf6fe64b2a7f" + }, + "truncated": 0, + "non-truncated": 2180, + "padded": 2180, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hashes": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + 
"hash_input_tokens": "092923836e135996", + "hash_cont_tokens": "421b3dc903711e3d" + }, + "truncated": 0, + "non-truncated": 864, + "padded": 864, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hashes": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "4ab213491f557f31", + "hash_cont_tokens": "eab825cf8fbdd085" + }, + "truncated": 816, + "non-truncated": 0, + "padded": 0, + "non-padded": 816, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hashes": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "2a04fb615e6717ea", + "hash_cont_tokens": "e1610a0b694e7b3a" + }, + "truncated": 0, + "non-truncated": 948, + "padded": 948, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_aging|5": { + "hashes": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "39da19ee58ce07e6", + "hash_cont_tokens": "38eafdb22e9fca11" + }, + "truncated": 0, + "non-truncated": 892, + "padded": 892, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_sexuality|5": { + "hashes": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "f7e0441ab1c223e0", + "hash_cont_tokens": "11de075f88fc7cd2" + }, + "truncated": 0, + "non-truncated": 524, + "padded": 524, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-international_law|5": { + "hashes": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "119859c5b8103d0b", + "hash_cont_tokens": "0229c63f045574c2" + }, + "truncated": 0, + "non-truncated": 484, + "padded": 484, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-jurisprudence|5": { + "hashes": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "6ec4910e741606cb", + "hash_cont_tokens": "5c77c6f472688075" + }, + "truncated": 0, + "non-truncated": 432, + "padded": 432, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hashes": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "96d8b2554f777e3a", + "hash_cont_tokens": "25a46284b3589e0d" + }, + "truncated": 0, + "non-truncated": 652, + "padded": 636, + "non-padded": 16, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-machine_learning|5": { + "hashes": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "249811a7d891a411", + "hash_cont_tokens": "d11f2c877fe691dc" + }, + "truncated": 0, + "non-truncated": 448, + "padded": 448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-management|5": { + "hashes": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "e54df495ffeb4f92", + "hash_cont_tokens": "d37808f586a9e9b5" + }, + "truncated": 0, + "non-truncated": 412, + "padded": 412, + 
"non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-marketing|5": { + "hashes": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "e9110fe64f420eb5", + "hash_cont_tokens": "95faf210efa02f90" + }, + "truncated": 0, + "non-truncated": 936, + "padded": 936, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-medical_genetics|5": { + "hashes": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "743df5701590c1c5", + "hash_cont_tokens": "844bd0bf669e8136" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-miscellaneous|5": { + "hashes": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "4a20a40ea36bad2d", + "hash_cont_tokens": "ef1ae838a09a7521" + }, + "truncated": 0, + "non-truncated": 3132, + "padded": 3132, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_disputes|5": { + "hashes": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "10886977e5516586", + "hash_cont_tokens": "05c35d0e7dd2c7d4" + }, + "truncated": 0, + "non-truncated": 1384, + "padded": 1372, + "non-padded": 12, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hashes": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "66f56ab7c3b9d662", + "hash_cont_tokens": "f1e9e326e9540108" + }, + "truncated": 0, + "non-truncated": 3580, + "padded": 3580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-nutrition|5": { + "hashes": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "c05c54560499ea35", + "hash_cont_tokens": "027ac34198453c9e" + }, + "truncated": 0, + "non-truncated": 1224, + "padded": 1224, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-philosophy|5": { + "hashes": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "9639c3d92ff98a28", + "hash_cont_tokens": "dddff9925c9b675a" + }, + "truncated": 0, + "non-truncated": 1244, + "padded": 1244, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-prehistory|5": { + "hashes": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "91e98834c3a8d8d9", + "hash_cont_tokens": "030e5bb46551865c" + }, + "truncated": 0, + "non-truncated": 1296, + "padded": 1296, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_accounting|5": { + "hashes": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "569fa47691c73088", + "hash_cont_tokens": "42b23299e8bae480" + }, + "truncated": 0, + "non-truncated": 1128, + "padded": 1124, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_law|5": { + "hashes": { + "hash_examples": 
"cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "999e8c7cf55b590c", + "hash_cont_tokens": "a2de48df0afbaff7" + }, + "truncated": 16, + "non-truncated": 6120, + "padded": 6120, + "non-padded": 16, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_medicine|5": { + "hashes": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "cb68733b835e69f0", + "hash_cont_tokens": "33dc7eccd5de31ae" + }, + "truncated": 0, + "non-truncated": 1088, + "padded": 1088, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_psychology|5": { + "hashes": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "3aa766c029099569", + "hash_cont_tokens": "2a666dc39f1f52ac" + }, + "truncated": 0, + "non-truncated": 2448, + "padded": 2448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-public_relations|5": { + "hashes": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "87b924f88832986f", + "hash_cont_tokens": "cf3600a50782c6c5" + }, + "truncated": 0, + "non-truncated": 440, + "padded": 440, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-security_studies|5": { + "hashes": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "c2b75c24a925a416", + "hash_cont_tokens": "2e9916279a4cae95" + }, + "truncated": 0, + "non-truncated": 980, + "padded": 980, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-sociology|5": { + "hashes": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "fb555df6139eb2c8", + "hash_cont_tokens": "555f7a55738bbf37" + }, + "truncated": 0, + "non-truncated": 804, + "padded": 800, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hashes": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "56cf1eebb25eccb1", + "hash_cont_tokens": "844bd0bf669e8136" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-virology|5": { + "hashes": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "c6affac16ec860be", + "hash_cont_tokens": "30d4fa4828c5468f" + }, + "truncated": 0, + "non-truncated": 664, + "padded": 664, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-world_religions|5": { + "hashes": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "d2c5da5a69a6312e", + "hash_cont_tokens": "984061eb58124367" + }, + "truncated": 0, + "non-truncated": 684, + "padded": 684, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|truthfulqa:mc|0": { + "hashes": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "21ee2f46c9c3649e", + "hash_cont_tokens": "f41d0880e9a23f4e" + }, + "truncated": 0, + 
"non-truncated": 9996, + "padded": 9996, + "non-padded": 0, + "effective_few_shots": 0.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "d84d18e9a963753d", + "hash_full_prompts": "12b540783521a8e6", + "hash_input_tokens": "0893dfcb83435e7d", + "hash_cont_tokens": "6159bf1904a8c8fb" + }, + "total_evaluation_time_secondes": "2375.0004572868347", + "truncated": 1492, + "non-truncated": 109527, + "padded": 109290, + "non-padded": 1729, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/AlekseyKorshuk/pygmalion-6b-vicuna-chatml/results_2023-09-09T13-36-28.958118.json b/eval-results/AlekseyKorshuk/pygmalion-6b-vicuna-chatml/results_2023-09-09T13-36-28.958118.json new file mode 100644 index 0000000000000000000000000000000000000000..5fa944c806b853e1e373a7cce5bfa628108adfc0 --- /dev/null +++ b/eval-results/AlekseyKorshuk/pygmalion-6b-vicuna-chatml/results_2023-09-09T13-36-28.958118.json @@ -0,0 +1,107 @@ +{ + "config_general": { + "model_name": "AlekseyKorshuk/pygmalion-6b-vicuna-chatml", + "model_sha": "ee3ada91a69a194cedfabbfeab98f1499b75cb44", + "model_size": "11.28 GB", + "model_dtype": "torch.float16", + "lighteval_sha": "ff467795ccc45b291b69333c263d5f16abd1fcd9", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|drop|3": { + "em": 0.022336409395973155, + "em_stderr": 0.001513355825080532, + "f1": 0.07709626677852371, + "f1_stderr": 0.0019730522657271923 + }, + "harness|gsm8k|5": { + "acc": 0.025018953752843062, + "acc_stderr": 0.004302045046564293 + }, + "harness|winogrande|5": { + "acc": 0.6306235201262825, + "acc_stderr": 0.013564470596053525 + }, + "all": { + "em": 0.022336409395973155, + "em_stderr": 0.001513355825080532, + "f1": 0.07709626677852371, + "f1_stderr": 0.0019730522657271923, + "acc": 0.3278212369395628, + "acc_stderr": 0.00893325782130891 + } + }, + "versions": { + "harness|drop|3": 1, + "harness|gsm8k|5": 0, + "harness|winogrande|5": 0, + "all": 0 + }, + "config_tasks": { + "harness|drop": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|drop|3": { + "hashes": { + "hash_examples": "1d27416e8324e9a3", + "hash_full_prompts": "a5513ff9a741b385", + "hash_input_tokens": "f21277d2c2d2e06c", + "hash_cont_tokens": "d7b7fcd0702ec261" + }, + "truncated": 382, + "non-truncated": 9154, + "padded": 0, + "non-padded": 9536, + "effective_few_shots": 3.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "3ab9b4c5105492a3", + "hash_cont_tokens": "f191a21e6082b576" + }, + "truncated": 0, + "non-truncated": 1319, + "padded": 0, + "non-padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "84cacac1590bb0a5", + "hash_cont_tokens": "64ca3ed9b5dacc6e" + }, + "truncated": 0, + "non-truncated": 2534, + "padded": 2426, + "non-padded": 108, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "9b4d8993161e637d", + "hash_full_prompts": "08215e527b7e60a5", + "hash_input_tokens": "26cd3631535039d0", + "hash_cont_tokens": "de68f7850ee6db3d" + }, + 
"total_evaluation_time_secondes": "5156.145171165466", + "truncated": 382, + "non-truncated": 13007, + "padded": 2426, + "non-padded": 10963, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/AlekseyKorshuk/pygmalion-6b-vicuna-chatml/results_2023-12-02T15-52-04.252951.json b/eval-results/AlekseyKorshuk/pygmalion-6b-vicuna-chatml/results_2023-12-02T15-52-04.252951.json new file mode 100644 index 0000000000000000000000000000000000000000..365c58c84dd880e194c9860f05a72a729ec4f346 --- /dev/null +++ b/eval-results/AlekseyKorshuk/pygmalion-6b-vicuna-chatml/results_2023-12-02T15-52-04.252951.json @@ -0,0 +1,63 @@ +{ + "config_general": { + "lighteval_sha": "b35d4d84573be82d91c07ea46260f262f72cf69d", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "", + "start_time": 1379465.414490334, + "end_time": 1381751.784140608, + "total_evaluation_time_secondes": "2286.3696502740495", + "model_name": "AlekseyKorshuk/pygmalion-6b-vicuna-chatml", + "model_sha": "ee3ada91a69a194cedfabbfeab98f1499b75cb44", + "model_dtype": "torch.float16", + "model_size": "11.28 GB" + }, + "results": { + "harness|gsm8k|5": { + "acc": 0.04397270659590599, + "acc_stderr": 0.005647666449126458 + }, + "all": { + "acc": 0.04397270659590599, + "acc_stderr": 0.005647666449126458 + } + }, + "versions": { + "all": 0, + "harness|gsm8k|5": 0 + }, + "config_tasks": { + "harness|gsm8k": "LM Harness task" + }, + "summary_tasks": { + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "3ab9b4c5105492a3", + "hash_cont_tokens": "f191a21e6082b576" + }, + "truncated": 0, + "non_truncated": 1319, + "padded": 0, + "non_padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "18b756b7813d1bdf", + "hash_full_prompts": "deb3b1dff10b95aa", + "hash_input_tokens": "ce30a215cb9fb6d1", + "hash_cont_tokens": "1685e98f0fcdfe91" + }, + "truncated": 0, + "non_truncated": 1319, + "padded": 0, + "non_padded": 1319, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/AlekseyKorshuk/pygmalion-6b-vicuna-chatml/results_2023-12-02T15-52-11.848314.json b/eval-results/AlekseyKorshuk/pygmalion-6b-vicuna-chatml/results_2023-12-02T15-52-11.848314.json new file mode 100644 index 0000000000000000000000000000000000000000..82930b06ddc48734156b99599f8ec1ddfad2d075 --- /dev/null +++ b/eval-results/AlekseyKorshuk/pygmalion-6b-vicuna-chatml/results_2023-12-02T15-52-11.848314.json @@ -0,0 +1,63 @@ +{ + "config_general": { + "lighteval_sha": "b35d4d84573be82d91c07ea46260f262f72cf69d", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "", + "start_time": 1390201.169224904, + "end_time": 1392498.943953603, + "total_evaluation_time_secondes": "2297.7747286991216", + "model_name": "AlekseyKorshuk/pygmalion-6b-vicuna-chatml", + "model_sha": "ee3ada91a69a194cedfabbfeab98f1499b75cb44", + "model_dtype": "torch.float16", + "model_size": "11.28 GB" + }, + "results": { + "harness|gsm8k|5": { + "acc": 0.04397270659590599, + "acc_stderr": 0.005647666449126458 + }, + "all": { + "acc": 0.04397270659590599, + "acc_stderr": 0.005647666449126458 + } + }, + "versions": { + "all": 0, + "harness|gsm8k|5": 0 + }, + "config_tasks": { + "harness|gsm8k": "LM Harness task" + }, + "summary_tasks": { + "harness|gsm8k|5": { + 
"hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "3ab9b4c5105492a3", + "hash_cont_tokens": "f191a21e6082b576" + }, + "truncated": 0, + "non_truncated": 1319, + "padded": 0, + "non_padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "18b756b7813d1bdf", + "hash_full_prompts": "deb3b1dff10b95aa", + "hash_input_tokens": "ce30a215cb9fb6d1", + "hash_cont_tokens": "1685e98f0fcdfe91" + }, + "truncated": 0, + "non_truncated": 1319, + "padded": 0, + "non_padded": 1319, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/AlekseyKorshuk/vic15-exp-syn-fight-cp3838/results_2023-10-01T15-44-18.303081.json b/eval-results/AlekseyKorshuk/vic15-exp-syn-fight-cp3838/results_2023-10-01T15-44-18.303081.json new file mode 100644 index 0000000000000000000000000000000000000000..8f61849027323e42f325335f3e95016b162ae357 --- /dev/null +++ b/eval-results/AlekseyKorshuk/vic15-exp-syn-fight-cp3838/results_2023-10-01T15-44-18.303081.json @@ -0,0 +1,1367 @@ +{ + "config_general": { + "model_name": "AlekseyKorshuk/vic15-exp-syn-fight-cp3838", + "model_sha": "91ce25dbdb67793ad1fcfdfd59f7603c2be65aea", + "model_size": "12.61 GB", + "model_dtype": "torch.float16", + "lighteval_sha": "0f318ecf002208468154899217b3ba7c6ae09374", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|arc:challenge|25": { + "acc": 0.49658703071672355, + "acc_stderr": 0.014611050403244081, + "acc_norm": 0.5179180887372014, + "acc_norm_stderr": 0.014602005585490978 + }, + "harness|hellaswag|10": { + "acc": 0.5786695877315275, + "acc_stderr": 0.004927631806477561, + "acc_norm": 0.7579167496514638, + "acc_norm_stderr": 0.0042746901436291375 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.29, + "acc_stderr": 0.04560480215720683, + "acc_norm": 0.29, + "acc_norm_stderr": 0.04560480215720683 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.48148148148148145, + "acc_stderr": 0.043163785995113245, + "acc_norm": 0.48148148148148145, + "acc_norm_stderr": 0.043163785995113245 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.48026315789473684, + "acc_stderr": 0.04065771002562605, + "acc_norm": 0.48026315789473684, + "acc_norm_stderr": 0.04065771002562605 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.55, + "acc_stderr": 0.049999999999999996, + "acc_norm": 0.55, + "acc_norm_stderr": 0.049999999999999996 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.5207547169811321, + "acc_stderr": 0.030746349975723456, + "acc_norm": 0.5207547169811321, + "acc_norm_stderr": 0.030746349975723456 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.4930555555555556, + "acc_stderr": 0.04180806750294938, + "acc_norm": 0.4930555555555556, + "acc_norm_stderr": 0.04180806750294938 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.33, + "acc_stderr": 0.04725815626252604, + "acc_norm": 0.33, + "acc_norm_stderr": 0.04725815626252604 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.41, + "acc_stderr": 0.04943110704237102, + "acc_norm": 0.41, + "acc_norm_stderr": 0.04943110704237102 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.41, + "acc_stderr": 0.04943110704237102, + "acc_norm": 0.41, + "acc_norm_stderr": 0.04943110704237102 + }, + 
"harness|hendrycksTest-college_medicine|5": { + "acc": 0.47398843930635837, + "acc_stderr": 0.038073017265045105, + "acc_norm": 0.47398843930635837, + "acc_norm_stderr": 0.038073017265045105 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.17647058823529413, + "acc_stderr": 0.0379328118530781, + "acc_norm": 0.17647058823529413, + "acc_norm_stderr": 0.0379328118530781 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.63, + "acc_stderr": 0.04852365870939099, + "acc_norm": 0.63, + "acc_norm_stderr": 0.04852365870939099 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.46382978723404256, + "acc_stderr": 0.032600385118357715, + "acc_norm": 0.46382978723404256, + "acc_norm_stderr": 0.032600385118357715 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.2894736842105263, + "acc_stderr": 0.04266339443159393, + "acc_norm": 0.2894736842105263, + "acc_norm_stderr": 0.04266339443159393 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.45517241379310347, + "acc_stderr": 0.04149886942192117, + "acc_norm": 0.45517241379310347, + "acc_norm_stderr": 0.04149886942192117 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.30423280423280424, + "acc_stderr": 0.023695415009463087, + "acc_norm": 0.30423280423280424, + "acc_norm_stderr": 0.023695415009463087 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.35714285714285715, + "acc_stderr": 0.04285714285714281, + "acc_norm": 0.35714285714285715, + "acc_norm_stderr": 0.04285714285714281 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.33, + "acc_stderr": 0.04725815626252605, + "acc_norm": 0.33, + "acc_norm_stderr": 0.04725815626252605 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.5451612903225806, + "acc_stderr": 0.028327743091561074, + "acc_norm": 0.5451612903225806, + "acc_norm_stderr": 0.028327743091561074 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.35960591133004927, + "acc_stderr": 0.033764582465095665, + "acc_norm": 0.35960591133004927, + "acc_norm_stderr": 0.033764582465095665 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.47, + "acc_stderr": 0.050161355804659205, + "acc_norm": 0.47, + "acc_norm_stderr": 0.050161355804659205 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.6303030303030303, + "acc_stderr": 0.037694303145125674, + "acc_norm": 0.6303030303030303, + "acc_norm_stderr": 0.037694303145125674 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.6161616161616161, + "acc_stderr": 0.03464881675016339, + "acc_norm": 0.6161616161616161, + "acc_norm_stderr": 0.03464881675016339 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.694300518134715, + "acc_stderr": 0.033248379397581594, + "acc_norm": 0.694300518134715, + "acc_norm_stderr": 0.033248379397581594 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.4641025641025641, + "acc_stderr": 0.025285585990017838, + "acc_norm": 0.4641025641025641, + "acc_norm_stderr": 0.025285585990017838 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.23333333333333334, + "acc_stderr": 0.025787874220959316, + "acc_norm": 0.23333333333333334, + "acc_norm_stderr": 0.025787874220959316 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.4495798319327731, + "acc_stderr": 0.03231293497137707, + "acc_norm": 0.4495798319327731, + "acc_norm_stderr": 0.03231293497137707 + }, + 
"harness|hendrycksTest-high_school_physics|5": { + "acc": 0.24503311258278146, + "acc_stderr": 0.03511807571804723, + "acc_norm": 0.24503311258278146, + "acc_norm_stderr": 0.03511807571804723 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.6990825688073394, + "acc_stderr": 0.019664751366802114, + "acc_norm": 0.6990825688073394, + "acc_norm_stderr": 0.019664751366802114 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.38425925925925924, + "acc_stderr": 0.03317354514310742, + "acc_norm": 0.38425925925925924, + "acc_norm_stderr": 0.03317354514310742 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.696078431372549, + "acc_stderr": 0.03228210387037892, + "acc_norm": 0.696078431372549, + "acc_norm_stderr": 0.03228210387037892 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.729957805907173, + "acc_stderr": 0.028900721906293426, + "acc_norm": 0.729957805907173, + "acc_norm_stderr": 0.028900721906293426 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.5964125560538116, + "acc_stderr": 0.03292802819330314, + "acc_norm": 0.5964125560538116, + "acc_norm_stderr": 0.03292802819330314 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.5877862595419847, + "acc_stderr": 0.04317171194870254, + "acc_norm": 0.5877862595419847, + "acc_norm_stderr": 0.04317171194870254 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.5950413223140496, + "acc_stderr": 0.04481137755942469, + "acc_norm": 0.5950413223140496, + "acc_norm_stderr": 0.04481137755942469 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.5648148148148148, + "acc_stderr": 0.04792898170907061, + "acc_norm": 0.5648148148148148, + "acc_norm_stderr": 0.04792898170907061 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.5398773006134969, + "acc_stderr": 0.03915857291436971, + "acc_norm": 0.5398773006134969, + "acc_norm_stderr": 0.03915857291436971 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.44642857142857145, + "acc_stderr": 0.04718471485219588, + "acc_norm": 0.44642857142857145, + "acc_norm_stderr": 0.04718471485219588 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.6504854368932039, + "acc_stderr": 0.04721188506097174, + "acc_norm": 0.6504854368932039, + "acc_norm_stderr": 0.04721188506097174 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.7777777777777778, + "acc_stderr": 0.027236013946196694, + "acc_norm": 0.7777777777777778, + "acc_norm_stderr": 0.027236013946196694 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.56, + "acc_stderr": 0.0498887651569859, + "acc_norm": 0.56, + "acc_norm_stderr": 0.0498887651569859 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.6845466155810983, + "acc_stderr": 0.016617501738763394, + "acc_norm": 0.6845466155810983, + "acc_norm_stderr": 0.016617501738763394 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.5578034682080925, + "acc_stderr": 0.026738603643807403, + "acc_norm": 0.5578034682080925, + "acc_norm_stderr": 0.026738603643807403 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.23575418994413408, + "acc_stderr": 0.014196375686290804, + "acc_norm": 0.23575418994413408, + "acc_norm_stderr": 0.014196375686290804 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.5522875816993464, + "acc_stderr": 0.028472938478033526, + "acc_norm": 0.5522875816993464, + "acc_norm_stderr": 0.028472938478033526 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.5755627009646302, + 
"acc_stderr": 0.028071928247946205, + "acc_norm": 0.5755627009646302, + "acc_norm_stderr": 0.028071928247946205 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.5432098765432098, + "acc_stderr": 0.027716661650194038, + "acc_norm": 0.5432098765432098, + "acc_norm_stderr": 0.027716661650194038 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.3723404255319149, + "acc_stderr": 0.028838921471251458, + "acc_norm": 0.3723404255319149, + "acc_norm_stderr": 0.028838921471251458 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.3670143415906128, + "acc_stderr": 0.012310264244842125, + "acc_norm": 0.3670143415906128, + "acc_norm_stderr": 0.012310264244842125 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.5294117647058824, + "acc_stderr": 0.03032024326500413, + "acc_norm": 0.5294117647058824, + "acc_norm_stderr": 0.03032024326500413 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.47549019607843135, + "acc_stderr": 0.020203517280261436, + "acc_norm": 0.47549019607843135, + "acc_norm_stderr": 0.020203517280261436 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.5909090909090909, + "acc_stderr": 0.04709306978661895, + "acc_norm": 0.5909090909090909, + "acc_norm_stderr": 0.04709306978661895 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.6081632653061224, + "acc_stderr": 0.031251275910891656, + "acc_norm": 0.6081632653061224, + "acc_norm_stderr": 0.031251275910891656 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.681592039800995, + "acc_stderr": 0.032941184790540944, + "acc_norm": 0.681592039800995, + "acc_norm_stderr": 0.032941184790540944 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.73, + "acc_stderr": 0.044619604333847394, + "acc_norm": 0.73, + "acc_norm_stderr": 0.044619604333847394 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.43373493975903615, + "acc_stderr": 0.03858158940685517, + "acc_norm": 0.43373493975903615, + "acc_norm_stderr": 0.03858158940685517 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.7134502923976608, + "acc_stderr": 0.03467826685703826, + "acc_norm": 0.7134502923976608, + "acc_norm_stderr": 0.03467826685703826 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.3329253365973072, + "mc1_stderr": 0.016497402382012052, + "mc2": 0.49613197888405214, + "mc2_stderr": 0.015701759057597957 + }, + "all": { + "acc": 0.5035250204495572, + "acc_stderr": 0.03500483713757127, + "acc_norm": 0.5069246512960048, + "acc_norm_stderr": 0.034993617027730566, + "mc1": 0.3329253365973072, + "mc1_stderr": 0.016497402382012052, + "mc2": 0.49613197888405214, + "mc2_stderr": 0.015701759057597957 + } + }, + "versions": { + "harness|arc:challenge|25": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + 
"harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "all": 0 + }, + "config_tasks": { + "harness|arc:challenge": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + 
"harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task" + }, + "summary_tasks": { + "harness|arc:challenge|25": { + "hashes": { + "hash_examples": "17b0cae357c0259e", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "3722289b79076c44", + "hash_cont_tokens": "e8abf848493b50f7" + }, + "truncated": 0, + "non-truncated": 4687, + "padded": 4687, + "non-padded": 0, + "effective_few_shots": 25.0, + "num_truncated_few_shots": 0 + }, + "harness|hellaswag|10": { + "hashes": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "ececd684171f1ef2", + "hash_cont_tokens": "9fe0a5c42e1532db" + }, + "truncated": 0, + "non-truncated": 40168, + "padded": 40113, + "non-padded": 55, + "effective_few_shots": 10.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hashes": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "c54ff61ad0273dd7", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + 
"num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-anatomy|5": { + "hashes": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "be31a1e22aef5f90", + "hash_cont_tokens": "f11971a765cb609f" + }, + "truncated": 0, + "non-truncated": 540, + "padded": 540, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-astronomy|5": { + "hashes": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "277a7b1fad566940", + "hash_cont_tokens": "440a970fadecdc7b" + }, + "truncated": 0, + "non-truncated": 608, + "padded": 608, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-business_ethics|5": { + "hashes": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "ba552605bc116de5", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hashes": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "428c7563d0b98ab9", + "hash_cont_tokens": "7ecd60c25b9bfe5b" + }, + "truncated": 0, + "non-truncated": 1060, + "padded": 1060, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_biology|5": { + "hashes": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "da036601573942e2", + "hash_cont_tokens": "875cde3af7a0ee14" + }, + "truncated": 0, + "non-truncated": 576, + "padded": 576, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_chemistry|5": { + "hashes": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "94e0196d6aded13d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_computer_science|5": { + "hashes": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "6e4d0f4a8d36690b", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_mathematics|5": { + "hashes": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "614054d17109a25d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_medicine|5": { + "hashes": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "081bb2b524defd1c", + "hash_cont_tokens": "702fb6d82ff0d6ac" + }, + "truncated": 0, + "non-truncated": 692, + "padded": 692, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_physics|5": { + "hashes": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": 
"833a0d7b55aed500", + "hash_input_tokens": "5421d9a1af86cbd4", + "hash_cont_tokens": "f7b8097afc16a47c" + }, + "truncated": 0, + "non-truncated": 408, + "padded": 408, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-computer_security|5": { + "hashes": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "5e6b70ecb333cf18", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hashes": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "c2ef11a87264ceed", + "hash_cont_tokens": "aa0e8bc655f2f641" + }, + "truncated": 0, + "non-truncated": 940, + "padded": 940, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-econometrics|5": { + "hashes": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "ecaccd912a4c3978", + "hash_cont_tokens": "b1cc6e7e9fcd3827" + }, + "truncated": 0, + "non-truncated": 456, + "padded": 456, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hashes": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "1590c84291399be8", + "hash_cont_tokens": "2425a3f084a591ef" + }, + "truncated": 0, + "non-truncated": 580, + "padded": 580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hashes": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "3269597f715b0da1", + "hash_cont_tokens": "bd87bf0c060fd925" + }, + "truncated": 0, + "non-truncated": 1512, + "padded": 1512, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-formal_logic|5": { + "hashes": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "a2800d20f3ab8d7c", + "hash_cont_tokens": "eb8932890e0605db" + }, + "truncated": 0, + "non-truncated": 504, + "padded": 504, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-global_facts|5": { + "hashes": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "94ed44b3772505ad", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_biology|5": { + "hashes": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "24423acb928db768", + "hash_cont_tokens": "1ddcb86d28cde266" + }, + "truncated": 0, + "non-truncated": 1240, + "padded": 1240, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hashes": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "831ff35c474e5cef", + "hash_cont_tokens": "176c8dcff38c5f8f" + }, + "truncated": 0, + "non-truncated": 
812, + "padded": 812, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hashes": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "a20a96b44dcc5b30", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hashes": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "5002f4ac8b1562ca", + "hash_cont_tokens": "674fc454bdc5ac93" + }, + "truncated": 0, + "non-truncated": 660, + "padded": 656, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_geography|5": { + "hashes": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "7c5547c7da5bc793", + "hash_cont_tokens": "03a5012b916274ea" + }, + "truncated": 0, + "non-truncated": 792, + "padded": 792, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hashes": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "f62991cb6a496b05", + "hash_cont_tokens": "873d2aab226ba1d8" + }, + "truncated": 0, + "non-truncated": 772, + "padded": 772, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hashes": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "4cef2aff6e3d59ed", + "hash_cont_tokens": "c583432ad27fcfe0" + }, + "truncated": 0, + "non-truncated": 1560, + "padded": 1560, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hashes": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "6e2577ea4082ed2b", + "hash_cont_tokens": "d7907b61bcb8c123" + }, + "truncated": 0, + "non-truncated": 1080, + "padded": 1080, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hashes": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "c5fc9aeb1079c8e4", + "hash_cont_tokens": "f47f041de50333b9" + }, + "truncated": 0, + "non-truncated": 952, + "padded": 952, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_physics|5": { + "hashes": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "555fc385cffa84ca", + "hash_cont_tokens": "0d56317b3e5eedb5" + }, + "truncated": 0, + "non-truncated": 604, + "padded": 604, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hashes": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "febd23cbf9973b7f", + "hash_cont_tokens": "09ba1243e7390c0f" + }, + "truncated": 0, + "non-truncated": 2180, + "padded": 2180, + "non-padded": 0, + "effective_few_shots": 5.0, 
+ "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hashes": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "400e55b56ee6fbd7", + "hash_cont_tokens": "9cc29889c3d3f77d" + }, + "truncated": 0, + "non-truncated": 864, + "padded": 864, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hashes": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "c639cce12a46ebad", + "hash_cont_tokens": "cdd0b3dc06d933e5" + }, + "truncated": 0, + "non-truncated": 816, + "padded": 816, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hashes": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "b9762065cce6f3a6", + "hash_cont_tokens": "e02816433ff28daf" + }, + "truncated": 0, + "non-truncated": 948, + "padded": 948, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_aging|5": { + "hashes": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "541a75f071dcf579", + "hash_cont_tokens": "142a4a8a1138a214" + }, + "truncated": 0, + "non-truncated": 892, + "padded": 892, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_sexuality|5": { + "hashes": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "04269e5c5a257dd9", + "hash_cont_tokens": "bc54813e809b796d" + }, + "truncated": 0, + "non-truncated": 524, + "padded": 524, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-international_law|5": { + "hashes": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "d93ba9d9d38e4397", + "hash_cont_tokens": "8ea8c5ff76a15bca" + }, + "truncated": 0, + "non-truncated": 484, + "padded": 484, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-jurisprudence|5": { + "hashes": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "9eeaccd2698b4f5a", + "hash_cont_tokens": "e3a8cd951b6e3469" + }, + "truncated": 0, + "non-truncated": 432, + "padded": 432, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hashes": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "b4f08f544f2b7576", + "hash_cont_tokens": "3e9e0bdc248fd88a" + }, + "truncated": 0, + "non-truncated": 652, + "padded": 648, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-machine_learning|5": { + "hashes": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "900c2a51f1174b9f", + "hash_cont_tokens": "55b12fb138c6a064" + }, + "truncated": 0, + "non-truncated": 448, + "padded": 448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-management|5": { + "hashes": { + "hash_examples": "8c8a1e07a2151dca", + 
"hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "6b36efb4689c6eca", + "hash_cont_tokens": "a01d6d39a83c4597" + }, + "truncated": 0, + "non-truncated": 412, + "padded": 412, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-marketing|5": { + "hashes": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "2aaac78a0cfed47a", + "hash_cont_tokens": "6aeaed4d823c98aa" + }, + "truncated": 0, + "non-truncated": 936, + "padded": 936, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-medical_genetics|5": { + "hashes": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "886ca823b41c094a", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-miscellaneous|5": { + "hashes": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "72fd71de7675e7d0", + "hash_cont_tokens": "9b0ab02a64603081" + }, + "truncated": 0, + "non-truncated": 3132, + "padded": 3132, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_disputes|5": { + "hashes": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "f3ca0dd8e7a1eb09", + "hash_cont_tokens": "3b8bbe9108e55ce9" + }, + "truncated": 0, + "non-truncated": 1384, + "padded": 1354, + "non-padded": 30, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hashes": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "3e793631e951f23c", + "hash_cont_tokens": "3e9bfc0362e97330" + }, + "truncated": 0, + "non-truncated": 3580, + "padded": 3580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-nutrition|5": { + "hashes": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "59753c2144ea93af", + "hash_cont_tokens": "23b2dc6ee2da4cfc" + }, + "truncated": 0, + "non-truncated": 1224, + "padded": 1224, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-philosophy|5": { + "hashes": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "bd8d3dbed15a8c34", + "hash_cont_tokens": "9f6ff69d23a48783" + }, + "truncated": 0, + "non-truncated": 1244, + "padded": 1244, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-prehistory|5": { + "hashes": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "3573cd87facbb7c5", + "hash_cont_tokens": "d6458d743d875837" + }, + "truncated": 0, + "non-truncated": 1296, + "padded": 1296, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_accounting|5": { + "hashes": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "17e721bc1a7cbb47", + "hash_cont_tokens": "922a195f53a35662" + }, + "truncated": 0, + "non-truncated": 1128, 
+ "padded": 1128, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_law|5": { + "hashes": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "c9f7583fff66d361", + "hash_cont_tokens": "2e590029ef41fbcd" + }, + "truncated": 0, + "non-truncated": 6136, + "padded": 6136, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_medicine|5": { + "hashes": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "40a933f829116f8d", + "hash_cont_tokens": "7cfee54dbddd5a98" + }, + "truncated": 0, + "non-truncated": 1088, + "padded": 1088, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_psychology|5": { + "hashes": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "0dfb73a8eb3f692c", + "hash_cont_tokens": "a86677b2a45c20e1" + }, + "truncated": 0, + "non-truncated": 2448, + "padded": 2448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-public_relations|5": { + "hashes": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "1710c6ba4c9f3cbd", + "hash_cont_tokens": "0d756ccaae031757" + }, + "truncated": 0, + "non-truncated": 440, + "padded": 440, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-security_studies|5": { + "hashes": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "32a03f1f22a6e103", + "hash_cont_tokens": "b2229bc2cfbf594b" + }, + "truncated": 0, + "non-truncated": 980, + "padded": 980, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-sociology|5": { + "hashes": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "828999f7624cbe7e", + "hash_cont_tokens": "c3a3bdfd177eed5b" + }, + "truncated": 0, + "non-truncated": 804, + "padded": 804, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hashes": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "42054621e718dbee", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-virology|5": { + "hashes": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "6c4f0aa4dc859c04", + "hash_cont_tokens": "af8b3658088cb37f" + }, + "truncated": 0, + "non-truncated": 664, + "padded": 664, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-world_religions|5": { + "hashes": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "6c75d44e092ff24f", + "hash_cont_tokens": "060118bef6de4e0a" + }, + "truncated": 0, + "non-truncated": 684, + "padded": 684, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|truthfulqa:mc|0": { + "hashes": { + 
"hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "2738d7ed7075faa7", + "hash_cont_tokens": "f5da56a132aab151" + }, + "truncated": 0, + "non-truncated": 9996, + "padded": 9996, + "non-padded": 0, + "effective_few_shots": 0.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "d84d18e9a963753d", + "hash_full_prompts": "12b540783521a8e6", + "hash_input_tokens": "5c73a7dce6ccf737", + "hash_cont_tokens": "71d56183130fecbd" + }, + "total_evaluation_time_secondes": "7526.873604774475", + "truncated": 0, + "non-truncated": 111019, + "padded": 110926, + "non-padded": 93, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/AlekseyKorshuk/vic15-exp-syn-fight-cp3838/results_2023-10-28T08-01-43.627403.json b/eval-results/AlekseyKorshuk/vic15-exp-syn-fight-cp3838/results_2023-10-28T08-01-43.627403.json new file mode 100644 index 0000000000000000000000000000000000000000..5e31c5384ee3d849b08acc5546ff1151cedce05c --- /dev/null +++ b/eval-results/AlekseyKorshuk/vic15-exp-syn-fight-cp3838/results_2023-10-28T08-01-43.627403.json @@ -0,0 +1,107 @@ +{ + "config_general": { + "model_name": "AlekseyKorshuk/vic15-exp-syn-fight-cp3838", + "model_sha": "91ce25dbdb67793ad1fcfdfd59f7603c2be65aea", + "model_size": "12.61 GB", + "model_dtype": "torch.float16", + "lighteval_sha": "0f318ecf002208468154899217b3ba7c6ae09374", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|drop|3": { + "em": 0.005977348993288591, + "em_stderr": 0.0007893908687131981, + "f1": 0.07782088926174525, + "f1_stderr": 0.0017265401569443591 + }, + "harness|gsm8k|5": { + "acc": 0.06595905989385899, + "acc_stderr": 0.006836951192034235 + }, + "harness|winogrande|5": { + "acc": 0.7182320441988951, + "acc_stderr": 0.012643326011852944 + }, + "all": { + "em": 0.005977348993288591, + "em_stderr": 0.0007893908687131981, + "f1": 0.07782088926174525, + "f1_stderr": 0.0017265401569443591, + "acc": 0.392095552046377, + "acc_stderr": 0.00974013860194359 + } + }, + "versions": { + "harness|drop|3": 1, + "harness|gsm8k|5": 0, + "harness|winogrande|5": 0, + "all": 0 + }, + "config_tasks": { + "harness|drop": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|drop|3": { + "hashes": { + "hash_examples": "1d27416e8324e9a3", + "hash_full_prompts": "a5513ff9a741b385", + "hash_input_tokens": "42076f0efbb50aa6", + "hash_cont_tokens": "84e9a06f8c987e5e" + }, + "truncated": 3, + "non-truncated": 9533, + "padded": 0, + "non-padded": 9536, + "effective_few_shots": 3.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "bda342e47b5099b2", + "hash_cont_tokens": "dd5b6b04546c8149" + }, + "truncated": 0, + "non-truncated": 1319, + "padded": 0, + "non-padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "c0bedf98cb040854", + "hash_cont_tokens": "f08975ad6f2d5864" + }, + "truncated": 0, + "non-truncated": 2534, + "padded": 2432, + "non-padded": 102, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + 
"hash_examples": "9b4d8993161e637d", + "hash_full_prompts": "08215e527b7e60a5", + "hash_input_tokens": "a12f3e3c934bd78b", + "hash_cont_tokens": "f2d1062f403e6c88" + }, + "total_evaluation_time_secondes": "9554.29204249382", + "truncated": 3, + "non-truncated": 13386, + "padded": 2432, + "non-padded": 10957, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/Andron00e/YetAnother_Open-Llama-3B-LoRA-OpenOrca/results_2023-07-24T10-33-21.195135.json b/eval-results/Andron00e/YetAnother_Open-Llama-3B-LoRA-OpenOrca/results_2023-07-24T10-33-21.195135.json new file mode 100644 index 0000000000000000000000000000000000000000..a1e320559eed3c2d6530d71bd8cc8a2aad1a33d2 --- /dev/null +++ b/eval-results/Andron00e/YetAnother_Open-Llama-3B-LoRA-OpenOrca/results_2023-07-24T10-33-21.195135.json @@ -0,0 +1,1365 @@ +{ + "results": { + "harness|arc:challenge|25": { + "acc": 0.363481228668942, + "acc_stderr": 0.014056207319068285, + "acc_norm": 0.3890784982935154, + "acc_norm_stderr": 0.014247309976045609 + }, + "harness|hellaswag|10": { + "acc": 0.49283011352320255, + "acc_stderr": 0.004989268362968724, + "acc_norm": 0.6761601274646485, + "acc_norm_stderr": 0.004669834130977062 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.25, + "acc_stderr": 0.04351941398892446, + "acc_norm": 0.25, + "acc_norm_stderr": 0.04351941398892446 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.2074074074074074, + "acc_stderr": 0.03502553170678317, + "acc_norm": 0.2074074074074074, + "acc_norm_stderr": 0.03502553170678317 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.24342105263157895, + "acc_stderr": 0.034923496688842384, + "acc_norm": 0.24342105263157895, + "acc_norm_stderr": 0.034923496688842384 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.26, + "acc_stderr": 0.0440844002276808, + "acc_norm": 0.26, + "acc_norm_stderr": 0.0440844002276808 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.26037735849056604, + "acc_stderr": 0.027008766090708094, + "acc_norm": 0.26037735849056604, + "acc_norm_stderr": 0.027008766090708094 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.2361111111111111, + "acc_stderr": 0.03551446610810826, + "acc_norm": 0.2361111111111111, + "acc_norm_stderr": 0.03551446610810826 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.28, + "acc_stderr": 0.045126085985421276, + "acc_norm": 0.28, + "acc_norm_stderr": 0.045126085985421276 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.21, + "acc_stderr": 0.04093601807403326, + "acc_norm": 0.21, + "acc_norm_stderr": 0.04093601807403326 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.26, + "acc_stderr": 0.0440844002276808, + "acc_norm": 0.26, + "acc_norm_stderr": 0.0440844002276808 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.2138728323699422, + "acc_stderr": 0.03126511206173043, + "acc_norm": 0.2138728323699422, + "acc_norm_stderr": 0.03126511206173043 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.21568627450980393, + "acc_stderr": 0.04092563958237657, + "acc_norm": 0.21568627450980393, + "acc_norm_stderr": 0.04092563958237657 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.3, + "acc_stderr": 0.046056618647183814, + "acc_norm": 0.3, + "acc_norm_stderr": 0.046056618647183814 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.3276595744680851, + "acc_stderr": 0.030683020843231008, + "acc_norm": 0.3276595744680851, + 
"acc_norm_stderr": 0.030683020843231008 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.30701754385964913, + "acc_stderr": 0.0433913832257986, + "acc_norm": 0.30701754385964913, + "acc_norm_stderr": 0.0433913832257986 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.20689655172413793, + "acc_stderr": 0.03375672449560554, + "acc_norm": 0.20689655172413793, + "acc_norm_stderr": 0.03375672449560554 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.2619047619047619, + "acc_stderr": 0.022644212615525218, + "acc_norm": 0.2619047619047619, + "acc_norm_stderr": 0.022644212615525218 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.31746031746031744, + "acc_stderr": 0.04163453031302859, + "acc_norm": 0.31746031746031744, + "acc_norm_stderr": 0.04163453031302859 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.31, + "acc_stderr": 0.04648231987117316, + "acc_norm": 0.31, + "acc_norm_stderr": 0.04648231987117316 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.27419354838709675, + "acc_stderr": 0.025378139970885193, + "acc_norm": 0.27419354838709675, + "acc_norm_stderr": 0.025378139970885193 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.2955665024630542, + "acc_stderr": 0.03210494433751458, + "acc_norm": 0.2955665024630542, + "acc_norm_stderr": 0.03210494433751458 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.19, + "acc_stderr": 0.03942772444036624, + "acc_norm": 0.19, + "acc_norm_stderr": 0.03942772444036624 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.23636363636363636, + "acc_stderr": 0.033175059300091805, + "acc_norm": 0.23636363636363636, + "acc_norm_stderr": 0.033175059300091805 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.35353535353535354, + "acc_stderr": 0.03406086723547153, + "acc_norm": 0.35353535353535354, + "acc_norm_stderr": 0.03406086723547153 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.22797927461139897, + "acc_stderr": 0.030276909945178263, + "acc_norm": 0.22797927461139897, + "acc_norm_stderr": 0.030276909945178263 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.28974358974358977, + "acc_stderr": 0.023000628243687978, + "acc_norm": 0.28974358974358977, + "acc_norm_stderr": 0.023000628243687978 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.24814814814814815, + "acc_stderr": 0.0263357394040558, + "acc_norm": 0.24814814814814815, + "acc_norm_stderr": 0.0263357394040558 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.29411764705882354, + "acc_stderr": 0.029597329730978086, + "acc_norm": 0.29411764705882354, + "acc_norm_stderr": 0.029597329730978086 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.3509933774834437, + "acc_stderr": 0.03896981964257375, + "acc_norm": 0.3509933774834437, + "acc_norm_stderr": 0.03896981964257375 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.3394495412844037, + "acc_stderr": 0.02030210934266235, + "acc_norm": 0.3394495412844037, + "acc_norm_stderr": 0.02030210934266235 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.47685185185185186, + "acc_stderr": 0.03406315360711507, + "acc_norm": 0.47685185185185186, + "acc_norm_stderr": 0.03406315360711507 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.23529411764705882, + "acc_stderr": 0.029771775228145638, + "acc_norm": 
0.23529411764705882, + "acc_norm_stderr": 0.029771775228145638 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.22784810126582278, + "acc_stderr": 0.02730348459906942, + "acc_norm": 0.22784810126582278, + "acc_norm_stderr": 0.02730348459906942 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.31390134529147984, + "acc_stderr": 0.031146796482972465, + "acc_norm": 0.31390134529147984, + "acc_norm_stderr": 0.031146796482972465 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.24427480916030533, + "acc_stderr": 0.037683359597287455, + "acc_norm": 0.24427480916030533, + "acc_norm_stderr": 0.037683359597287455 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.24793388429752067, + "acc_stderr": 0.039418975265163005, + "acc_norm": 0.24793388429752067, + "acc_norm_stderr": 0.039418975265163005 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.28703703703703703, + "acc_stderr": 0.043733130409147614, + "acc_norm": 0.28703703703703703, + "acc_norm_stderr": 0.043733130409147614 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.18404907975460122, + "acc_stderr": 0.03044677768797174, + "acc_norm": 0.18404907975460122, + "acc_norm_stderr": 0.03044677768797174 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.23214285714285715, + "acc_stderr": 0.04007341809755805, + "acc_norm": 0.23214285714285715, + "acc_norm_stderr": 0.04007341809755805 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.27184466019417475, + "acc_stderr": 0.044052680241409216, + "acc_norm": 0.27184466019417475, + "acc_norm_stderr": 0.044052680241409216 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.29914529914529914, + "acc_stderr": 0.029996951858349483, + "acc_norm": 0.29914529914529914, + "acc_norm_stderr": 0.029996951858349483 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.26, + "acc_stderr": 0.044084400227680794, + "acc_norm": 0.26, + "acc_norm_stderr": 0.044084400227680794 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.2771392081736909, + "acc_stderr": 0.016005636294122428, + "acc_norm": 0.2771392081736909, + "acc_norm_stderr": 0.016005636294122428 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.2976878612716763, + "acc_stderr": 0.024617055388677003, + "acc_norm": 0.2976878612716763, + "acc_norm_stderr": 0.024617055388677003 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.24022346368715083, + "acc_stderr": 0.01428834380392528, + "acc_norm": 0.24022346368715083, + "acc_norm_stderr": 0.01428834380392528 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.2973856209150327, + "acc_stderr": 0.02617390850671858, + "acc_norm": 0.2973856209150327, + "acc_norm_stderr": 0.02617390850671858 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.2733118971061093, + "acc_stderr": 0.02531176597542612, + "acc_norm": 0.2733118971061093, + "acc_norm_stderr": 0.02531176597542612 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.25925925925925924, + "acc_stderr": 0.02438366553103545, + "acc_norm": 0.25925925925925924, + "acc_norm_stderr": 0.02438366553103545 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.2730496453900709, + "acc_stderr": 0.02657786094330786, + "acc_norm": 0.2730496453900709, + "acc_norm_stderr": 0.02657786094330786 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.23859191655801826, + "acc_stderr": 0.010885929742002202, + "acc_norm": 0.23859191655801826, + "acc_norm_stderr": 0.010885929742002202 + }, + 
"harness|hendrycksTest-professional_medicine|5": { + "acc": 0.44485294117647056, + "acc_stderr": 0.030187532060329376, + "acc_norm": 0.44485294117647056, + "acc_norm_stderr": 0.030187532060329376 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.25, + "acc_stderr": 0.01751781884501444, + "acc_norm": 0.25, + "acc_norm_stderr": 0.01751781884501444 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.3181818181818182, + "acc_stderr": 0.04461272175910507, + "acc_norm": 0.3181818181818182, + "acc_norm_stderr": 0.04461272175910507 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.2693877551020408, + "acc_stderr": 0.02840125202902294, + "acc_norm": 0.2693877551020408, + "acc_norm_stderr": 0.02840125202902294 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.23880597014925373, + "acc_stderr": 0.03014777593540922, + "acc_norm": 0.23880597014925373, + "acc_norm_stderr": 0.03014777593540922 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.25, + "acc_stderr": 0.04351941398892446, + "acc_norm": 0.25, + "acc_norm_stderr": 0.04351941398892446 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.2710843373493976, + "acc_stderr": 0.034605799075530255, + "acc_norm": 0.2710843373493976, + "acc_norm_stderr": 0.034605799075530255 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.2631578947368421, + "acc_stderr": 0.033773102522091945, + "acc_norm": 0.2631578947368421, + "acc_norm_stderr": 0.033773102522091945 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.2386780905752754, + "mc1_stderr": 0.014922629695456421, + "mc2": 0.35295403100483275, + "mc2_stderr": 0.013476504364264511 + }, + "all": { + "acc": 0.27740100642463217, + "acc_stderr": 0.03222917582596356, + "acc_norm": 0.28094214682405627, + "acc_norm_stderr": 0.03222700071452264, + "mc1": 0.2386780905752754, + "mc1_stderr": 0.014922629695456421, + "mc2": 0.35295403100483275, + "mc2_stderr": 0.013476504364264511 + } + }, + "versions": { + "harness|arc:challenge|25": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 
1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "all": 0 + }, + "config_general": { + "model_name": "Andron00e/YetAnother_Open-Llama-3B-LoRA-OpenOrca", + "model_sha": "07d9d32cd091148295d4e13802ba63486599aff4", + "model_dtype": "torch.float16", + "lighteval_sha": "03c2fad20ff7f5334c33cfee459024b8d7e4a109", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null + }, + "config_tasks": { + "harness|arc:challenge": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + 
"harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task" + }, + "summary_tasks": { + "harness|arc:challenge|25": { + "hashes": { + "hash_examples": "17b0cae357c0259e", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "99ff49c78917d666", + "hash_cont_tokens": "568988b9c3bfc83c" + }, + "truncated": 0, + "non-truncated": 4687, + "padded": 4687, + "non-padded": 0, + "effective_few_shots": 25.0, + "num_truncated_few_shots": 0 + }, + "harness|hellaswag|10": { + "hashes": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "27b384658a4b826e", + "hash_cont_tokens": "5966c7ceee7144f8" + }, + "truncated": 0, + "non-truncated": 40168, + "padded": 40153, + "non-padded": 15, + "effective_few_shots": 10.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hashes": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "dac91b437d631599", + "hash_cont_tokens": "adad8c87d9018d3a" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-anatomy|5": { + "hashes": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "06cd9a69af842291", + "hash_cont_tokens": "b408913f391dc598" + }, + "truncated": 0, + "non-truncated": 540, + "padded": 540, + "non-padded": 0, + "effective_few_shots": 5.0, + 
"num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-astronomy|5": { + "hashes": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "7e0363633bd4c661", + "hash_cont_tokens": "4ab285fa2a75c029" + }, + "truncated": 0, + "non-truncated": 608, + "padded": 608, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-business_ethics|5": { + "hashes": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "a1b916a7277078b4", + "hash_cont_tokens": "15baabbd71328cbe" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hashes": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "af46942ff5deb21d", + "hash_cont_tokens": "96c880c9478a4037" + }, + "truncated": 0, + "non-truncated": 1060, + "padded": 1060, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_biology|5": { + "hashes": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "5882d6931ded2237", + "hash_cont_tokens": "6268ee610a672867" + }, + "truncated": 0, + "non-truncated": 576, + "padded": 576, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_chemistry|5": { + "hashes": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "b24180b880da9cdc", + "hash_cont_tokens": "7b194ff8e7e390ce" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_computer_science|5": { + "hashes": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "9bc1d680b14c82ee", + "hash_cont_tokens": "2fe5eee1df1b81bb" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_mathematics|5": { + "hashes": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "79aced2bcafe02e4", + "hash_cont_tokens": "499ffd87e7a60146" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_medicine|5": { + "hashes": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "3e657aa09cc216ff", + "hash_cont_tokens": "e5df51bb12073b7b" + }, + "truncated": 0, + "non-truncated": 692, + "padded": 692, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_physics|5": { + "hashes": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "5f521206bd8121ad", + "hash_cont_tokens": "4abfe03c09581bce" + }, + "truncated": 0, + "non-truncated": 408, + "padded": 408, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-computer_security|5": { + "hashes": { + "hash_examples": "006451eedc0ededb", + 
"hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "b12ce1e36c118558", + "hash_cont_tokens": "adad8c87d9018d3a" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hashes": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "221bbd7b0d39e269", + "hash_cont_tokens": "4dc3a1c45702aea2" + }, + "truncated": 0, + "non-truncated": 940, + "padded": 940, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-econometrics|5": { + "hashes": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "d475018fde7b68bf", + "hash_cont_tokens": "abfc7c631218ed32" + }, + "truncated": 0, + "non-truncated": 456, + "padded": 456, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hashes": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "964e79b20780ee59", + "hash_cont_tokens": "195db06c037d7c81" + }, + "truncated": 0, + "non-truncated": 580, + "padded": 569, + "non-padded": 11, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hashes": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "829b84905d5794d7", + "hash_cont_tokens": "4274dfcea97c4e27" + }, + "truncated": 0, + "non-truncated": 1512, + "padded": 1512, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-formal_logic|5": { + "hashes": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "83233577e0f66071", + "hash_cont_tokens": "aadc96b61f4bea54" + }, + "truncated": 0, + "non-truncated": 504, + "padded": 504, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-global_facts|5": { + "hashes": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "b45c36cf0fc38f67", + "hash_cont_tokens": "adad8c87d9018d3a" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_biology|5": { + "hashes": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "47f5c034c56e090f", + "hash_cont_tokens": "6ea5c6b690913b0f" + }, + "truncated": 0, + "non-truncated": 1240, + "padded": 1240, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hashes": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "13286ca334f1e8e7", + "hash_cont_tokens": "befe57dcb5a5a7d3" + }, + "truncated": 0, + "non-truncated": 812, + "padded": 812, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hashes": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "e3a3351b698e7311", + "hash_cont_tokens": "8da78e4005b8faf9" + }, + 
"truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hashes": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "6639a9e4f4eb57c5", + "hash_cont_tokens": "ff5ae57ff23b53d1" + }, + "truncated": 660, + "non-truncated": 0, + "padded": 0, + "non-padded": 660, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_geography|5": { + "hashes": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "cfe8f73d53615fc7", + "hash_cont_tokens": "db85309de1591035" + }, + "truncated": 0, + "non-truncated": 792, + "padded": 792, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hashes": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "1f8541aadce8b236", + "hash_cont_tokens": "6890e2bc35a602ef" + }, + "truncated": 0, + "non-truncated": 772, + "padded": 772, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hashes": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "8da2d7f4edfdafd5", + "hash_cont_tokens": "6132e48ff0edea66" + }, + "truncated": 0, + "non-truncated": 1560, + "padded": 1560, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hashes": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "52328f9dec1844ed", + "hash_cont_tokens": "d201a0126c9a530c" + }, + "truncated": 0, + "non-truncated": 1080, + "padded": 1080, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hashes": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "04d97c91eee4e141", + "hash_cont_tokens": "596c4f1066a38e91" + }, + "truncated": 0, + "non-truncated": 952, + "padded": 952, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_physics|5": { + "hashes": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "d8d05cf169bd7639", + "hash_cont_tokens": "fcefc753d295e446" + }, + "truncated": 0, + "non-truncated": 604, + "padded": 604, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hashes": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "03f858b330d55fed", + "hash_cont_tokens": "a4a552f563078902" + }, + "truncated": 0, + "non-truncated": 2180, + "padded": 2180, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hashes": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "ce2ca0558b9a5f27", + "hash_cont_tokens": "85dbbdba6017eaec" + }, + "truncated": 0, + "non-truncated": 864, + "padded": 864, + "non-padded": 0, + 
"effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hashes": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "a3884e14c3c038b5", + "hash_cont_tokens": "7d705edd113a3d4d" + }, + "truncated": 816, + "non-truncated": 0, + "padded": 0, + "non-padded": 816, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hashes": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "b3f5f4615f906023", + "hash_cont_tokens": "211397dca1d04c0a" + }, + "truncated": 0, + "non-truncated": 948, + "padded": 948, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_aging|5": { + "hashes": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "0d806b9b33c54432", + "hash_cont_tokens": "b196c68db4825727" + }, + "truncated": 0, + "non-truncated": 892, + "padded": 892, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_sexuality|5": { + "hashes": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "4c9f4c1de8d94adf", + "hash_cont_tokens": "ffc3b70128684ad0" + }, + "truncated": 0, + "non-truncated": 524, + "padded": 524, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-international_law|5": { + "hashes": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "4e565cd482620bbe", + "hash_cont_tokens": "bcaed810d47c62aa" + }, + "truncated": 0, + "non-truncated": 484, + "padded": 484, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-jurisprudence|5": { + "hashes": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "13cbfca1b5b84f78", + "hash_cont_tokens": "ea7ff206c4da6f57" + }, + "truncated": 0, + "non-truncated": 432, + "padded": 432, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hashes": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "bf707bcaadcd1b7f", + "hash_cont_tokens": "4a853cb5874d2adc" + }, + "truncated": 0, + "non-truncated": 652, + "padded": 652, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-machine_learning|5": { + "hashes": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "78808255dea01f83", + "hash_cont_tokens": "9e40b162dc928ce5" + }, + "truncated": 0, + "non-truncated": 448, + "padded": 448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-management|5": { + "hashes": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "6bab60a3ce133e17", + "hash_cont_tokens": "c93d7596aa2246ea" + }, + "truncated": 0, + "non-truncated": 412, + "padded": 412, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-marketing|5": { + "hashes": { + "hash_examples": "2668953431f91e96", + 
"hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "d0fcde4d547d9832", + "hash_cont_tokens": "af4b0ee8ee2bb07f" + }, + "truncated": 0, + "non-truncated": 936, + "padded": 936, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-medical_genetics|5": { + "hashes": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "78c8a1b611a22020", + "hash_cont_tokens": "adad8c87d9018d3a" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-miscellaneous|5": { + "hashes": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "690c7a1333c1030b", + "hash_cont_tokens": "5b068e21debc566e" + }, + "truncated": 0, + "non-truncated": 3132, + "padded": 3132, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_disputes|5": { + "hashes": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "de74e3025a1cd4e3", + "hash_cont_tokens": "8d79c8c8d3b1fa75" + }, + "truncated": 0, + "non-truncated": 1384, + "padded": 1384, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hashes": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "77cf2aceb27a9b48", + "hash_cont_tokens": "30d3a442342e5f19" + }, + "truncated": 0, + "non-truncated": 3580, + "padded": 3580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-nutrition|5": { + "hashes": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "c149e4bfa0bd49e2", + "hash_cont_tokens": "231f307b052cc303" + }, + "truncated": 0, + "non-truncated": 1224, + "padded": 1224, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-philosophy|5": { + "hashes": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "8e8dd2f09979a669", + "hash_cont_tokens": "faaa18e05a96eb91" + }, + "truncated": 0, + "non-truncated": 1244, + "padded": 1244, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-prehistory|5": { + "hashes": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "beb7b4488967bf13", + "hash_cont_tokens": "3fa5ef4207c2fae2" + }, + "truncated": 0, + "non-truncated": 1296, + "padded": 1296, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_accounting|5": { + "hashes": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "6dead6c7a78a877e", + "hash_cont_tokens": "711398f4a1641e99" + }, + "truncated": 0, + "non-truncated": 1128, + "padded": 1128, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_law|5": { + "hashes": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "a3cf3a06ebd3a4c2", + "hash_cont_tokens": "5c9515fd601cb0d7" + }, + "truncated": 92, + 
"non-truncated": 6044, + "padded": 6032, + "non-padded": 104, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_medicine|5": { + "hashes": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "8ef46fa5025f8036", + "hash_cont_tokens": "bb99427ea7c63f48" + }, + "truncated": 0, + "non-truncated": 1088, + "padded": 1088, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_psychology|5": { + "hashes": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "592938a865df4169", + "hash_cont_tokens": "cdbe1515e8c6e3ce" + }, + "truncated": 0, + "non-truncated": 2448, + "padded": 2448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-public_relations|5": { + "hashes": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "6708e93b0c611917", + "hash_cont_tokens": "c54f38d507746b57" + }, + "truncated": 0, + "non-truncated": 440, + "padded": 440, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-security_studies|5": { + "hashes": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "d9c3e621c2145453", + "hash_cont_tokens": "16d346d36b44190b" + }, + "truncated": 0, + "non-truncated": 980, + "padded": 980, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-sociology|5": { + "hashes": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "862a1d43b0709cc8", + "hash_cont_tokens": "e329121c50bb2b96" + }, + "truncated": 0, + "non-truncated": 804, + "padded": 804, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hashes": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "0f8b3d09b9f523d6", + "hash_cont_tokens": "446207f22323db3e" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-virology|5": { + "hashes": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "543430e3d6af520f", + "hash_cont_tokens": "30dcb20b1aeaf10b" + }, + "truncated": 0, + "non-truncated": 664, + "padded": 664, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-world_religions|5": { + "hashes": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "a9f37ee284fec309", + "hash_cont_tokens": "f8476c0c6f07dff2" + }, + "truncated": 0, + "non-truncated": 684, + "padded": 684, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|truthfulqa:mc|0": { + "hashes": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "bc9ef61861cd1b47", + "hash_cont_tokens": "d07001d4d0214aa3" + }, + "truncated": 0, + "non-truncated": 9996, + "padded": 9996, + "non-padded": 0, + "effective_few_shots": 0.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + 
"hash_examples": "d84d18e9a963753d", + "hash_full_prompts": "12b540783521a8e6", + "hash_input_tokens": "5718915646c336d4", + "hash_cont_tokens": "be8494d5ebf3309a" + }, + "total_evaluation_time_secondes": "1972.7119643688202", + "truncated": 1568, + "non-truncated": 109451, + "padded": 109413, + "non-padded": 1606, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/Andron00e/YetAnother_Open-Llama-3B-LoRA-OpenOrca/results_2023-08-04T13-31-56.530209.json b/eval-results/Andron00e/YetAnother_Open-Llama-3B-LoRA-OpenOrca/results_2023-08-04T13-31-56.530209.json new file mode 100644 index 0000000000000000000000000000000000000000..538d8686d4d985fa261437a942755ae5d3cc26fc --- /dev/null +++ b/eval-results/Andron00e/YetAnother_Open-Llama-3B-LoRA-OpenOrca/results_2023-08-04T13-31-56.530209.json @@ -0,0 +1,1365 @@ +{ + "results": { + "harness|arc:challenge|25": { + "acc": 0.22610921501706485, + "acc_stderr": 0.012224202097063281, + "acc_norm": 0.24829351535836178, + "acc_norm_stderr": 0.012624912868089767 + }, + "harness|hellaswag|10": { + "acc": 0.2597092212706632, + "acc_stderr": 0.004375788991216851, + "acc_norm": 0.26309500099581756, + "acc_norm_stderr": 0.00439413672417301 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.18, + "acc_stderr": 0.038612291966536955, + "acc_norm": 0.18, + "acc_norm_stderr": 0.038612291966536955 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.24444444444444444, + "acc_stderr": 0.03712537833614866, + "acc_norm": 0.24444444444444444, + "acc_norm_stderr": 0.03712537833614866 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.27631578947368424, + "acc_stderr": 0.03639057569952925, + "acc_norm": 0.27631578947368424, + "acc_norm_stderr": 0.03639057569952925 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.26, + "acc_stderr": 0.04408440022768077, + "acc_norm": 0.26, + "acc_norm_stderr": 0.04408440022768077 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.30566037735849055, + "acc_stderr": 0.028353298073322663, + "acc_norm": 0.30566037735849055, + "acc_norm_stderr": 0.028353298073322663 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.2361111111111111, + "acc_stderr": 0.03551446610810826, + "acc_norm": 0.2361111111111111, + "acc_norm_stderr": 0.03551446610810826 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.21, + "acc_stderr": 0.040936018074033256, + "acc_norm": 0.21, + "acc_norm_stderr": 0.040936018074033256 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.21, + "acc_stderr": 0.040936018074033256, + "acc_norm": 0.21, + "acc_norm_stderr": 0.040936018074033256 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.19, + "acc_stderr": 0.03942772444036623, + "acc_norm": 0.19, + "acc_norm_stderr": 0.03942772444036623 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.2832369942196532, + "acc_stderr": 0.034355680560478746, + "acc_norm": 0.2832369942196532, + "acc_norm_stderr": 0.034355680560478746 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.2549019607843137, + "acc_stderr": 0.04336432707993178, + "acc_norm": 0.2549019607843137, + "acc_norm_stderr": 0.04336432707993178 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.26, + "acc_stderr": 0.04408440022768078, + "acc_norm": 0.26, + "acc_norm_stderr": 0.04408440022768078 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.2723404255319149, + "acc_stderr": 0.029101290698386715, + "acc_norm": 0.2723404255319149, + 
"acc_norm_stderr": 0.029101290698386715 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.3157894736842105, + "acc_stderr": 0.04372748290278008, + "acc_norm": 0.3157894736842105, + "acc_norm_stderr": 0.04372748290278008 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.296551724137931, + "acc_stderr": 0.03806142687309994, + "acc_norm": 0.296551724137931, + "acc_norm_stderr": 0.03806142687309994 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.2275132275132275, + "acc_stderr": 0.02159126940782379, + "acc_norm": 0.2275132275132275, + "acc_norm_stderr": 0.02159126940782379 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.19047619047619047, + "acc_stderr": 0.035122074123020534, + "acc_norm": 0.19047619047619047, + "acc_norm_stderr": 0.035122074123020534 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.33, + "acc_stderr": 0.047258156262526045, + "acc_norm": 0.33, + "acc_norm_stderr": 0.047258156262526045 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.29354838709677417, + "acc_stderr": 0.02590608702131929, + "acc_norm": 0.29354838709677417, + "acc_norm_stderr": 0.02590608702131929 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.24630541871921183, + "acc_stderr": 0.030315099285617732, + "acc_norm": 0.24630541871921183, + "acc_norm_stderr": 0.030315099285617732 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.24, + "acc_stderr": 0.042923469599092816, + "acc_norm": 0.24, + "acc_norm_stderr": 0.042923469599092816 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.2606060606060606, + "acc_stderr": 0.03427743175816524, + "acc_norm": 0.2606060606060606, + "acc_norm_stderr": 0.03427743175816524 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.23737373737373738, + "acc_stderr": 0.030313710538198906, + "acc_norm": 0.23737373737373738, + "acc_norm_stderr": 0.030313710538198906 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.23834196891191708, + "acc_stderr": 0.030748905363909895, + "acc_norm": 0.23834196891191708, + "acc_norm_stderr": 0.030748905363909895 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.2717948717948718, + "acc_stderr": 0.022556551010132354, + "acc_norm": 0.2717948717948718, + "acc_norm_stderr": 0.022556551010132354 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.24074074074074073, + "acc_stderr": 0.026067159222275794, + "acc_norm": 0.24074074074074073, + "acc_norm_stderr": 0.026067159222275794 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.23949579831932774, + "acc_stderr": 0.027722065493361273, + "acc_norm": 0.23949579831932774, + "acc_norm_stderr": 0.027722065493361273 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.2582781456953642, + "acc_stderr": 0.035737053147634576, + "acc_norm": 0.2582781456953642, + "acc_norm_stderr": 0.035737053147634576 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.23119266055045873, + "acc_stderr": 0.018075750241633156, + "acc_norm": 0.23119266055045873, + "acc_norm_stderr": 0.018075750241633156 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.2222222222222222, + "acc_stderr": 0.02835321286686346, + "acc_norm": 0.2222222222222222, + "acc_norm_stderr": 0.02835321286686346 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.27941176470588236, + "acc_stderr": 0.031493281045079556, + 
"acc_norm": 0.27941176470588236, + "acc_norm_stderr": 0.031493281045079556 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.23628691983122363, + "acc_stderr": 0.027652153144159263, + "acc_norm": 0.23628691983122363, + "acc_norm_stderr": 0.027652153144159263 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.25112107623318386, + "acc_stderr": 0.029105220833224626, + "acc_norm": 0.25112107623318386, + "acc_norm_stderr": 0.029105220833224626 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.2748091603053435, + "acc_stderr": 0.039153454088478354, + "acc_norm": 0.2748091603053435, + "acc_norm_stderr": 0.039153454088478354 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.2644628099173554, + "acc_stderr": 0.04026187527591207, + "acc_norm": 0.2644628099173554, + "acc_norm_stderr": 0.04026187527591207 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.2222222222222222, + "acc_stderr": 0.0401910747255735, + "acc_norm": 0.2222222222222222, + "acc_norm_stderr": 0.0401910747255735 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.294478527607362, + "acc_stderr": 0.03581165790474082, + "acc_norm": 0.294478527607362, + "acc_norm_stderr": 0.03581165790474082 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.25, + "acc_stderr": 0.04109974682633932, + "acc_norm": 0.25, + "acc_norm_stderr": 0.04109974682633932 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.21359223300970873, + "acc_stderr": 0.04058042015646034, + "acc_norm": 0.21359223300970873, + "acc_norm_stderr": 0.04058042015646034 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.2222222222222222, + "acc_stderr": 0.02723601394619669, + "acc_norm": 0.2222222222222222, + "acc_norm_stderr": 0.02723601394619669 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.18, + "acc_stderr": 0.038612291966536955, + "acc_norm": 0.18, + "acc_norm_stderr": 0.038612291966536955 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.2541507024265645, + "acc_stderr": 0.015569254692045769, + "acc_norm": 0.2541507024265645, + "acc_norm_stderr": 0.015569254692045769 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.2861271676300578, + "acc_stderr": 0.024332146779134135, + "acc_norm": 0.2861271676300578, + "acc_norm_stderr": 0.024332146779134135 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.22569832402234638, + "acc_stderr": 0.013981395058455054, + "acc_norm": 0.22569832402234638, + "acc_norm_stderr": 0.013981395058455054 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.23529411764705882, + "acc_stderr": 0.024288619466046112, + "acc_norm": 0.23529411764705882, + "acc_norm_stderr": 0.024288619466046112 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.2090032154340836, + "acc_stderr": 0.02309314039837422, + "acc_norm": 0.2090032154340836, + "acc_norm_stderr": 0.02309314039837422 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.23765432098765432, + "acc_stderr": 0.023683591837008553, + "acc_norm": 0.23765432098765432, + "acc_norm_stderr": 0.023683591837008553 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.2730496453900709, + "acc_stderr": 0.026577860943307854, + "acc_norm": 0.2730496453900709, + "acc_norm_stderr": 0.026577860943307854 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.24837027379400262, + "acc_stderr": 0.011035212598034505, + "acc_norm": 0.24837027379400262, + "acc_norm_stderr": 0.011035212598034505 + }, + 
"harness|hendrycksTest-professional_medicine|5": { + "acc": 0.24632352941176472, + "acc_stderr": 0.02617343857052, + "acc_norm": 0.24632352941176472, + "acc_norm_stderr": 0.02617343857052 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.2696078431372549, + "acc_stderr": 0.017952449196987866, + "acc_norm": 0.2696078431372549, + "acc_norm_stderr": 0.017952449196987866 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.23636363636363636, + "acc_stderr": 0.04069306319721376, + "acc_norm": 0.23636363636363636, + "acc_norm_stderr": 0.04069306319721376 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.24897959183673468, + "acc_stderr": 0.027682979522960227, + "acc_norm": 0.24897959183673468, + "acc_norm_stderr": 0.027682979522960227 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.29850746268656714, + "acc_stderr": 0.03235743789355044, + "acc_norm": 0.29850746268656714, + "acc_norm_stderr": 0.03235743789355044 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.23, + "acc_stderr": 0.042295258468165044, + "acc_norm": 0.23, + "acc_norm_stderr": 0.042295258468165044 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.3192771084337349, + "acc_stderr": 0.03629335329947859, + "acc_norm": 0.3192771084337349, + "acc_norm_stderr": 0.03629335329947859 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.30994152046783624, + "acc_stderr": 0.035469769593931624, + "acc_norm": 0.30994152046783624, + "acc_norm_stderr": 0.035469769593931624 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.23133414932680538, + "mc1_stderr": 0.014761945174862661, + "mc2": NaN, + "mc2_stderr": NaN + }, + "all": { + "acc": 0.25128839936910946, + "acc_stderr": 0.031530829274574367, + "acc_norm": 0.25172179055667643, + "acc_norm_stderr": 0.03153793196108255, + "mc1": 0.23133414932680538, + "mc1_stderr": 0.014761945174862661, + "mc2": NaN, + "mc2_stderr": NaN + } + }, + "versions": { + "harness|arc:challenge|25": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + 
"harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "all": 0 + }, + "config_general": { + "model_name": "Andron00e/YetAnother_Open-Llama-3B-LoRA-OpenOrca", + "model_sha": "", + "model_dtype": "torch.float16", + "lighteval_sha": "5f779c2b88600e81a25d5dd5a059c8902022e8fd", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null + }, + "config_tasks": { + "harness|arc:challenge": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + 
"harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task" + }, + "summary_tasks": { + "harness|arc:challenge|25": { + "hashes": { + "hash_examples": "17b0cae357c0259e", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "99ff49c78917d666", + "hash_cont_tokens": "568988b9c3bfc83c" + }, + "truncated": 0, + "non-truncated": 4687, + "padded": 4687, + "non-padded": 0, + "effective_few_shots": 25.0, + "num_truncated_few_shots": 0 + }, + "harness|hellaswag|10": { + "hashes": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "27b384658a4b826e", + "hash_cont_tokens": "5966c7ceee7144f8" + }, + "truncated": 0, + "non-truncated": 40168, + "padded": 40153, + "non-padded": 15, + "effective_few_shots": 10.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hashes": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "dac91b437d631599", + "hash_cont_tokens": "adad8c87d9018d3a" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-anatomy|5": { + "hashes": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "06cd9a69af842291", + "hash_cont_tokens": "b408913f391dc598" + }, + "truncated": 0, + "non-truncated": 540, + "padded": 540, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-astronomy|5": { + "hashes": { + 
"hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "7e0363633bd4c661", + "hash_cont_tokens": "4ab285fa2a75c029" + }, + "truncated": 0, + "non-truncated": 608, + "padded": 608, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-business_ethics|5": { + "hashes": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "a1b916a7277078b4", + "hash_cont_tokens": "15baabbd71328cbe" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hashes": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "af46942ff5deb21d", + "hash_cont_tokens": "96c880c9478a4037" + }, + "truncated": 0, + "non-truncated": 1060, + "padded": 1060, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_biology|5": { + "hashes": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "5882d6931ded2237", + "hash_cont_tokens": "6268ee610a672867" + }, + "truncated": 0, + "non-truncated": 576, + "padded": 576, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_chemistry|5": { + "hashes": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "b24180b880da9cdc", + "hash_cont_tokens": "7b194ff8e7e390ce" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_computer_science|5": { + "hashes": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "9bc1d680b14c82ee", + "hash_cont_tokens": "2fe5eee1df1b81bb" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_mathematics|5": { + "hashes": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "79aced2bcafe02e4", + "hash_cont_tokens": "499ffd87e7a60146" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_medicine|5": { + "hashes": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "3e657aa09cc216ff", + "hash_cont_tokens": "e5df51bb12073b7b" + }, + "truncated": 0, + "non-truncated": 692, + "padded": 692, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_physics|5": { + "hashes": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "5f521206bd8121ad", + "hash_cont_tokens": "4abfe03c09581bce" + }, + "truncated": 0, + "non-truncated": 408, + "padded": 408, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-computer_security|5": { + "hashes": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "b12ce1e36c118558", + "hash_cont_tokens": 
"adad8c87d9018d3a" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hashes": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "221bbd7b0d39e269", + "hash_cont_tokens": "4dc3a1c45702aea2" + }, + "truncated": 0, + "non-truncated": 940, + "padded": 940, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-econometrics|5": { + "hashes": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "d475018fde7b68bf", + "hash_cont_tokens": "abfc7c631218ed32" + }, + "truncated": 0, + "non-truncated": 456, + "padded": 456, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hashes": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "964e79b20780ee59", + "hash_cont_tokens": "195db06c037d7c81" + }, + "truncated": 0, + "non-truncated": 580, + "padded": 569, + "non-padded": 11, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hashes": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "829b84905d5794d7", + "hash_cont_tokens": "4274dfcea97c4e27" + }, + "truncated": 0, + "non-truncated": 1512, + "padded": 1512, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-formal_logic|5": { + "hashes": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "83233577e0f66071", + "hash_cont_tokens": "aadc96b61f4bea54" + }, + "truncated": 0, + "non-truncated": 504, + "padded": 504, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-global_facts|5": { + "hashes": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "b45c36cf0fc38f67", + "hash_cont_tokens": "adad8c87d9018d3a" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_biology|5": { + "hashes": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "47f5c034c56e090f", + "hash_cont_tokens": "6ea5c6b690913b0f" + }, + "truncated": 0, + "non-truncated": 1240, + "padded": 1240, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hashes": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "13286ca334f1e8e7", + "hash_cont_tokens": "befe57dcb5a5a7d3" + }, + "truncated": 0, + "non-truncated": 812, + "padded": 812, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hashes": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "e3a3351b698e7311", + "hash_cont_tokens": "8da78e4005b8faf9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + 
"num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hashes": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "6639a9e4f4eb57c5", + "hash_cont_tokens": "ff5ae57ff23b53d1" + }, + "truncated": 660, + "non-truncated": 0, + "padded": 0, + "non-padded": 660, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_geography|5": { + "hashes": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "cfe8f73d53615fc7", + "hash_cont_tokens": "db85309de1591035" + }, + "truncated": 0, + "non-truncated": 792, + "padded": 792, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hashes": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "1f8541aadce8b236", + "hash_cont_tokens": "6890e2bc35a602ef" + }, + "truncated": 0, + "non-truncated": 772, + "padded": 772, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hashes": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "8da2d7f4edfdafd5", + "hash_cont_tokens": "6132e48ff0edea66" + }, + "truncated": 0, + "non-truncated": 1560, + "padded": 1560, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hashes": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "52328f9dec1844ed", + "hash_cont_tokens": "d201a0126c9a530c" + }, + "truncated": 0, + "non-truncated": 1080, + "padded": 1080, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hashes": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "04d97c91eee4e141", + "hash_cont_tokens": "596c4f1066a38e91" + }, + "truncated": 0, + "non-truncated": 952, + "padded": 952, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_physics|5": { + "hashes": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "d8d05cf169bd7639", + "hash_cont_tokens": "fcefc753d295e446" + }, + "truncated": 0, + "non-truncated": 604, + "padded": 604, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hashes": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "03f858b330d55fed", + "hash_cont_tokens": "a4a552f563078902" + }, + "truncated": 0, + "non-truncated": 2180, + "padded": 2180, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hashes": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "ce2ca0558b9a5f27", + "hash_cont_tokens": "85dbbdba6017eaec" + }, + "truncated": 0, + "non-truncated": 864, + "padded": 864, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + 
"harness|hendrycksTest-high_school_us_history|5": { + "hashes": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "a3884e14c3c038b5", + "hash_cont_tokens": "7d705edd113a3d4d" + }, + "truncated": 816, + "non-truncated": 0, + "padded": 0, + "non-padded": 816, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hashes": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "b3f5f4615f906023", + "hash_cont_tokens": "211397dca1d04c0a" + }, + "truncated": 0, + "non-truncated": 948, + "padded": 948, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_aging|5": { + "hashes": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "0d806b9b33c54432", + "hash_cont_tokens": "b196c68db4825727" + }, + "truncated": 0, + "non-truncated": 892, + "padded": 892, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_sexuality|5": { + "hashes": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "4c9f4c1de8d94adf", + "hash_cont_tokens": "ffc3b70128684ad0" + }, + "truncated": 0, + "non-truncated": 524, + "padded": 524, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-international_law|5": { + "hashes": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "4e565cd482620bbe", + "hash_cont_tokens": "bcaed810d47c62aa" + }, + "truncated": 0, + "non-truncated": 484, + "padded": 484, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-jurisprudence|5": { + "hashes": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "13cbfca1b5b84f78", + "hash_cont_tokens": "ea7ff206c4da6f57" + }, + "truncated": 0, + "non-truncated": 432, + "padded": 432, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hashes": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "bf707bcaadcd1b7f", + "hash_cont_tokens": "4a853cb5874d2adc" + }, + "truncated": 0, + "non-truncated": 652, + "padded": 652, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-machine_learning|5": { + "hashes": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "78808255dea01f83", + "hash_cont_tokens": "9e40b162dc928ce5" + }, + "truncated": 0, + "non-truncated": 448, + "padded": 448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-management|5": { + "hashes": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "6bab60a3ce133e17", + "hash_cont_tokens": "c93d7596aa2246ea" + }, + "truncated": 0, + "non-truncated": 412, + "padded": 412, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-marketing|5": { + "hashes": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": 
"d0fcde4d547d9832", + "hash_cont_tokens": "af4b0ee8ee2bb07f" + }, + "truncated": 0, + "non-truncated": 936, + "padded": 936, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-medical_genetics|5": { + "hashes": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "78c8a1b611a22020", + "hash_cont_tokens": "adad8c87d9018d3a" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-miscellaneous|5": { + "hashes": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "690c7a1333c1030b", + "hash_cont_tokens": "5b068e21debc566e" + }, + "truncated": 0, + "non-truncated": 3132, + "padded": 3132, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_disputes|5": { + "hashes": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "de74e3025a1cd4e3", + "hash_cont_tokens": "8d79c8c8d3b1fa75" + }, + "truncated": 0, + "non-truncated": 1384, + "padded": 1384, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hashes": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "77cf2aceb27a9b48", + "hash_cont_tokens": "30d3a442342e5f19" + }, + "truncated": 0, + "non-truncated": 3580, + "padded": 3580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-nutrition|5": { + "hashes": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "c149e4bfa0bd49e2", + "hash_cont_tokens": "231f307b052cc303" + }, + "truncated": 0, + "non-truncated": 1224, + "padded": 1224, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-philosophy|5": { + "hashes": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "8e8dd2f09979a669", + "hash_cont_tokens": "faaa18e05a96eb91" + }, + "truncated": 0, + "non-truncated": 1244, + "padded": 1244, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-prehistory|5": { + "hashes": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "beb7b4488967bf13", + "hash_cont_tokens": "3fa5ef4207c2fae2" + }, + "truncated": 0, + "non-truncated": 1296, + "padded": 1296, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_accounting|5": { + "hashes": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "6dead6c7a78a877e", + "hash_cont_tokens": "711398f4a1641e99" + }, + "truncated": 0, + "non-truncated": 1128, + "padded": 1128, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_law|5": { + "hashes": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "a3cf3a06ebd3a4c2", + "hash_cont_tokens": "5c9515fd601cb0d7" + }, + "truncated": 92, + "non-truncated": 6044, + "padded": 6032, + "non-padded": 104, + 
"effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_medicine|5": { + "hashes": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "8ef46fa5025f8036", + "hash_cont_tokens": "bb99427ea7c63f48" + }, + "truncated": 0, + "non-truncated": 1088, + "padded": 1088, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_psychology|5": { + "hashes": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "592938a865df4169", + "hash_cont_tokens": "cdbe1515e8c6e3ce" + }, + "truncated": 0, + "non-truncated": 2448, + "padded": 2448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-public_relations|5": { + "hashes": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "6708e93b0c611917", + "hash_cont_tokens": "c54f38d507746b57" + }, + "truncated": 0, + "non-truncated": 440, + "padded": 440, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-security_studies|5": { + "hashes": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "d9c3e621c2145453", + "hash_cont_tokens": "16d346d36b44190b" + }, + "truncated": 0, + "non-truncated": 980, + "padded": 980, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-sociology|5": { + "hashes": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "862a1d43b0709cc8", + "hash_cont_tokens": "e329121c50bb2b96" + }, + "truncated": 0, + "non-truncated": 804, + "padded": 804, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hashes": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "0f8b3d09b9f523d6", + "hash_cont_tokens": "446207f22323db3e" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-virology|5": { + "hashes": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "543430e3d6af520f", + "hash_cont_tokens": "30dcb20b1aeaf10b" + }, + "truncated": 0, + "non-truncated": 664, + "padded": 664, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-world_religions|5": { + "hashes": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "a9f37ee284fec309", + "hash_cont_tokens": "f8476c0c6f07dff2" + }, + "truncated": 0, + "non-truncated": 684, + "padded": 684, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|truthfulqa:mc|0": { + "hashes": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "bc9ef61861cd1b47", + "hash_cont_tokens": "d07001d4d0214aa3" + }, + "truncated": 0, + "non-truncated": 9996, + "padded": 9996, + "non-padded": 0, + "effective_few_shots": 0.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "d84d18e9a963753d", + "hash_full_prompts": 
"12b540783521a8e6", + "hash_input_tokens": "5718915646c336d4", + "hash_cont_tokens": "be8494d5ebf3309a" + }, + "total_evaluation_time_secondes": "2174.3092885017395", + "truncated": 1568, + "non-truncated": 109451, + "padded": 109413, + "non-padded": 1606, + "num_truncated_few_shots": 0 + } +} diff --git a/eval-results/Andron00e/YetAnother_Open-Llama-3B-LoRA-OpenOrca/results_2023-08-04T13-53-57.188756.json b/eval-results/Andron00e/YetAnother_Open-Llama-3B-LoRA-OpenOrca/results_2023-08-04T13-53-57.188756.json new file mode 100644 index 0000000000000000000000000000000000000000..04befb84009b4220893b441404436ba39785704a --- /dev/null +++ b/eval-results/Andron00e/YetAnother_Open-Llama-3B-LoRA-OpenOrca/results_2023-08-04T13-53-57.188756.json @@ -0,0 +1,1365 @@ +{ + "results": { + "harness|arc:challenge|25": { + "acc": 0.21928327645051193, + "acc_stderr": 0.012091245787615727, + "acc_norm": 0.2593856655290102, + "acc_norm_stderr": 0.012808273573927102 + }, + "harness|hellaswag|10": { + "acc": 0.25960963951404104, + "acc_stderr": 0.004375244237045127, + "acc_norm": 0.25761800438159727, + "acc_norm_stderr": 0.004364287353415444 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.18, + "acc_stderr": 0.03861229196653694, + "acc_norm": 0.18, + "acc_norm_stderr": 0.03861229196653694 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.21481481481481482, + "acc_stderr": 0.03547854198560823, + "acc_norm": 0.21481481481481482, + "acc_norm_stderr": 0.03547854198560823 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.20394736842105263, + "acc_stderr": 0.0327900040631005, + "acc_norm": 0.20394736842105263, + "acc_norm_stderr": 0.0327900040631005 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.36, + "acc_stderr": 0.048241815132442176, + "acc_norm": 0.36, + "acc_norm_stderr": 0.048241815132442176 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.2, + "acc_stderr": 0.02461829819586651, + "acc_norm": 0.2, + "acc_norm_stderr": 0.02461829819586651 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.18055555555555555, + "acc_stderr": 0.032166008088022675, + "acc_norm": 0.18055555555555555, + "acc_norm_stderr": 0.032166008088022675 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.24, + "acc_stderr": 0.04292346959909283, + "acc_norm": 0.24, + "acc_norm_stderr": 0.04292346959909283 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.22, + "acc_stderr": 0.041633319989322695, + "acc_norm": 0.22, + "acc_norm_stderr": 0.041633319989322695 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.21, + "acc_stderr": 0.040936018074033256, + "acc_norm": 0.21, + "acc_norm_stderr": 0.040936018074033256 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.2774566473988439, + "acc_stderr": 0.03414014007044036, + "acc_norm": 0.2774566473988439, + "acc_norm_stderr": 0.03414014007044036 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.27450980392156865, + "acc_stderr": 0.044405219061793275, + "acc_norm": 0.27450980392156865, + "acc_norm_stderr": 0.044405219061793275 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.32, + "acc_stderr": 0.046882617226215034, + "acc_norm": 0.32, + "acc_norm_stderr": 0.046882617226215034 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.2765957446808511, + "acc_stderr": 0.029241883869628824, + "acc_norm": 0.2765957446808511, + "acc_norm_stderr": 0.029241883869628824 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 
0.23684210526315788, + "acc_stderr": 0.039994238792813344, + "acc_norm": 0.23684210526315788, + "acc_norm_stderr": 0.039994238792813344 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.23448275862068965, + "acc_stderr": 0.035306258743465914, + "acc_norm": 0.23448275862068965, + "acc_norm_stderr": 0.035306258743465914 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.23544973544973544, + "acc_stderr": 0.021851509822031715, + "acc_norm": 0.23544973544973544, + "acc_norm_stderr": 0.021851509822031715 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.24603174603174602, + "acc_stderr": 0.03852273364924316, + "acc_norm": 0.24603174603174602, + "acc_norm_stderr": 0.03852273364924316 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.3, + "acc_stderr": 0.04605661864718381, + "acc_norm": 0.3, + "acc_norm_stderr": 0.04605661864718381 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.24516129032258063, + "acc_stderr": 0.024472243840895525, + "acc_norm": 0.24516129032258063, + "acc_norm_stderr": 0.024472243840895525 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.22660098522167488, + "acc_stderr": 0.02945486383529297, + "acc_norm": 0.22660098522167488, + "acc_norm_stderr": 0.02945486383529297 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.22, + "acc_stderr": 0.041633319989322695, + "acc_norm": 0.22, + "acc_norm_stderr": 0.041633319989322695 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.2727272727272727, + "acc_stderr": 0.03477691162163659, + "acc_norm": 0.2727272727272727, + "acc_norm_stderr": 0.03477691162163659 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.20202020202020202, + "acc_stderr": 0.02860620428922987, + "acc_norm": 0.20202020202020202, + "acc_norm_stderr": 0.02860620428922987 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.2538860103626943, + "acc_stderr": 0.03141024780565319, + "acc_norm": 0.2538860103626943, + "acc_norm_stderr": 0.03141024780565319 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.2230769230769231, + "acc_stderr": 0.02110773012724401, + "acc_norm": 0.2230769230769231, + "acc_norm_stderr": 0.02110773012724401 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.24074074074074073, + "acc_stderr": 0.026067159222275805, + "acc_norm": 0.24074074074074073, + "acc_norm_stderr": 0.026067159222275805 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.23949579831932774, + "acc_stderr": 0.02772206549336126, + "acc_norm": 0.23949579831932774, + "acc_norm_stderr": 0.02772206549336126 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.2185430463576159, + "acc_stderr": 0.03374235550425694, + "acc_norm": 0.2185430463576159, + "acc_norm_stderr": 0.03374235550425694 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.25137614678899084, + "acc_stderr": 0.018599206360287415, + "acc_norm": 0.25137614678899084, + "acc_norm_stderr": 0.018599206360287415 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.21296296296296297, + "acc_stderr": 0.027920963147993645, + "acc_norm": 0.21296296296296297, + "acc_norm_stderr": 0.027920963147993645 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.2549019607843137, + "acc_stderr": 0.030587591351604257, + "acc_norm": 0.2549019607843137, + "acc_norm_stderr": 0.030587591351604257 + }, + 
"harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.25738396624472576, + "acc_stderr": 0.028458820991460305, + "acc_norm": 0.25738396624472576, + "acc_norm_stderr": 0.028458820991460305 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.3094170403587444, + "acc_stderr": 0.031024411740572213, + "acc_norm": 0.3094170403587444, + "acc_norm_stderr": 0.031024411740572213 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.20610687022900764, + "acc_stderr": 0.035477710041594626, + "acc_norm": 0.20610687022900764, + "acc_norm_stderr": 0.035477710041594626 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.19834710743801653, + "acc_stderr": 0.036401182719909456, + "acc_norm": 0.19834710743801653, + "acc_norm_stderr": 0.036401182719909456 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.3333333333333333, + "acc_stderr": 0.04557239513497751, + "acc_norm": 0.3333333333333333, + "acc_norm_stderr": 0.04557239513497751 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.24539877300613497, + "acc_stderr": 0.03380939813943354, + "acc_norm": 0.24539877300613497, + "acc_norm_stderr": 0.03380939813943354 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.25892857142857145, + "acc_stderr": 0.04157751539865629, + "acc_norm": 0.25892857142857145, + "acc_norm_stderr": 0.04157751539865629 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.23300970873786409, + "acc_stderr": 0.04185832598928315, + "acc_norm": 0.23300970873786409, + "acc_norm_stderr": 0.04185832598928315 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.2094017094017094, + "acc_stderr": 0.026655699653922737, + "acc_norm": 0.2094017094017094, + "acc_norm_stderr": 0.026655699653922737 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.27, + "acc_stderr": 0.044619604333847394, + "acc_norm": 0.27, + "acc_norm_stderr": 0.044619604333847394 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.26309067688378035, + "acc_stderr": 0.01574549716904904, + "acc_norm": 0.26309067688378035, + "acc_norm_stderr": 0.01574549716904904 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.27167630057803466, + "acc_stderr": 0.023948512905468344, + "acc_norm": 0.27167630057803466, + "acc_norm_stderr": 0.023948512905468344 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.264804469273743, + "acc_stderr": 0.014756906483260659, + "acc_norm": 0.264804469273743, + "acc_norm_stderr": 0.014756906483260659 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.23529411764705882, + "acc_stderr": 0.024288619466046123, + "acc_norm": 0.23529411764705882, + "acc_norm_stderr": 0.024288619466046123 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.21543408360128619, + "acc_stderr": 0.023350225475471418, + "acc_norm": 0.21543408360128619, + "acc_norm_stderr": 0.023350225475471418 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.24691358024691357, + "acc_stderr": 0.023993501709042117, + "acc_norm": 0.24691358024691357, + "acc_norm_stderr": 0.023993501709042117 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.2375886524822695, + "acc_stderr": 0.0253895125527299, + "acc_norm": 0.2375886524822695, + "acc_norm_stderr": 0.0253895125527299 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.25554106910039115, + "acc_stderr": 0.011139857833598518, + "acc_norm": 0.25554106910039115, + "acc_norm_stderr": 0.011139857833598518 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.25, + 
"acc_stderr": 0.026303648393696036, + "acc_norm": 0.25, + "acc_norm_stderr": 0.026303648393696036 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.272875816993464, + "acc_stderr": 0.01802047414839358, + "acc_norm": 0.272875816993464, + "acc_norm_stderr": 0.01802047414839358 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.3090909090909091, + "acc_stderr": 0.044262946482000985, + "acc_norm": 0.3090909090909091, + "acc_norm_stderr": 0.044262946482000985 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.22857142857142856, + "acc_stderr": 0.026882144922307748, + "acc_norm": 0.22857142857142856, + "acc_norm_stderr": 0.026882144922307748 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.263681592039801, + "acc_stderr": 0.031157150869355568, + "acc_norm": 0.263681592039801, + "acc_norm_stderr": 0.031157150869355568 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.27, + "acc_stderr": 0.044619604333847394, + "acc_norm": 0.27, + "acc_norm_stderr": 0.044619604333847394 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.22289156626506024, + "acc_stderr": 0.03240004825594687, + "acc_norm": 0.22289156626506024, + "acc_norm_stderr": 0.03240004825594687 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.25146198830409355, + "acc_stderr": 0.033275044238468436, + "acc_norm": 0.25146198830409355, + "acc_norm_stderr": 0.033275044238468436 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.211750305997552, + "mc1_stderr": 0.014302068353925616, + "mc2": NaN, + "mc2_stderr": NaN + }, + "all": { + "acc": 0.24629348927227462, + "acc_stderr": 0.03137893388089655, + "acc_norm": 0.24693943425441114, + "acc_norm_stderr": 0.03139090118433183, + "mc1": 0.211750305997552, + "mc1_stderr": 0.014302068353925616, + "mc2": NaN, + "mc2_stderr": NaN + } + }, + "versions": { + "harness|arc:challenge|25": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + 
"harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "all": 0 + }, + "config_general": { + "model_name": "Andron00e/YetAnother_Open-Llama-3B-LoRA-OpenOrca", + "model_sha": "", + "model_dtype": "torch.float16", + "lighteval_sha": "5f779c2b88600e81a25d5dd5a059c8902022e8fd", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null + }, + "config_tasks": { + "harness|arc:challenge": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness 
task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task" + }, + "summary_tasks": { + "harness|arc:challenge|25": { + "hashes": { + "hash_examples": "17b0cae357c0259e", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "99ff49c78917d666", + "hash_cont_tokens": "568988b9c3bfc83c" + }, + "truncated": 0, + "non-truncated": 4687, + "padded": 4687, + "non-padded": 0, + "effective_few_shots": 25.0, + "num_truncated_few_shots": 0 + }, + "harness|hellaswag|10": { + "hashes": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "27b384658a4b826e", + "hash_cont_tokens": "5966c7ceee7144f8" + }, + "truncated": 0, + "non-truncated": 40168, + "padded": 40153, + "non-padded": 15, + "effective_few_shots": 10.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hashes": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "dac91b437d631599", + "hash_cont_tokens": "adad8c87d9018d3a" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-anatomy|5": { + "hashes": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "06cd9a69af842291", + "hash_cont_tokens": "b408913f391dc598" + }, + "truncated": 0, + "non-truncated": 540, + "padded": 540, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-astronomy|5": { + "hashes": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "7e0363633bd4c661", + 
"hash_cont_tokens": "4ab285fa2a75c029" + }, + "truncated": 0, + "non-truncated": 608, + "padded": 608, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-business_ethics|5": { + "hashes": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "a1b916a7277078b4", + "hash_cont_tokens": "15baabbd71328cbe" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hashes": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "af46942ff5deb21d", + "hash_cont_tokens": "96c880c9478a4037" + }, + "truncated": 0, + "non-truncated": 1060, + "padded": 1060, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_biology|5": { + "hashes": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "5882d6931ded2237", + "hash_cont_tokens": "6268ee610a672867" + }, + "truncated": 0, + "non-truncated": 576, + "padded": 576, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_chemistry|5": { + "hashes": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "b24180b880da9cdc", + "hash_cont_tokens": "7b194ff8e7e390ce" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_computer_science|5": { + "hashes": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "9bc1d680b14c82ee", + "hash_cont_tokens": "2fe5eee1df1b81bb" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_mathematics|5": { + "hashes": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "79aced2bcafe02e4", + "hash_cont_tokens": "499ffd87e7a60146" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_medicine|5": { + "hashes": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "3e657aa09cc216ff", + "hash_cont_tokens": "e5df51bb12073b7b" + }, + "truncated": 0, + "non-truncated": 692, + "padded": 692, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_physics|5": { + "hashes": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "5f521206bd8121ad", + "hash_cont_tokens": "4abfe03c09581bce" + }, + "truncated": 0, + "non-truncated": 408, + "padded": 408, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-computer_security|5": { + "hashes": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "b12ce1e36c118558", + "hash_cont_tokens": "adad8c87d9018d3a" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 
5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hashes": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "221bbd7b0d39e269", + "hash_cont_tokens": "4dc3a1c45702aea2" + }, + "truncated": 0, + "non-truncated": 940, + "padded": 940, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-econometrics|5": { + "hashes": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "d475018fde7b68bf", + "hash_cont_tokens": "abfc7c631218ed32" + }, + "truncated": 0, + "non-truncated": 456, + "padded": 456, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hashes": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "964e79b20780ee59", + "hash_cont_tokens": "195db06c037d7c81" + }, + "truncated": 0, + "non-truncated": 580, + "padded": 569, + "non-padded": 11, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hashes": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "829b84905d5794d7", + "hash_cont_tokens": "4274dfcea97c4e27" + }, + "truncated": 0, + "non-truncated": 1512, + "padded": 1512, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-formal_logic|5": { + "hashes": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "83233577e0f66071", + "hash_cont_tokens": "aadc96b61f4bea54" + }, + "truncated": 0, + "non-truncated": 504, + "padded": 504, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-global_facts|5": { + "hashes": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "b45c36cf0fc38f67", + "hash_cont_tokens": "adad8c87d9018d3a" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_biology|5": { + "hashes": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "47f5c034c56e090f", + "hash_cont_tokens": "6ea5c6b690913b0f" + }, + "truncated": 0, + "non-truncated": 1240, + "padded": 1240, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hashes": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "13286ca334f1e8e7", + "hash_cont_tokens": "befe57dcb5a5a7d3" + }, + "truncated": 0, + "non-truncated": 812, + "padded": 812, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hashes": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "e3a3351b698e7311", + "hash_cont_tokens": "8da78e4005b8faf9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hashes": { + "hash_examples": 
"11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "6639a9e4f4eb57c5", + "hash_cont_tokens": "ff5ae57ff23b53d1" + }, + "truncated": 660, + "non-truncated": 0, + "padded": 0, + "non-padded": 660, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_geography|5": { + "hashes": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "cfe8f73d53615fc7", + "hash_cont_tokens": "db85309de1591035" + }, + "truncated": 0, + "non-truncated": 792, + "padded": 792, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hashes": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "1f8541aadce8b236", + "hash_cont_tokens": "6890e2bc35a602ef" + }, + "truncated": 0, + "non-truncated": 772, + "padded": 772, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hashes": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "8da2d7f4edfdafd5", + "hash_cont_tokens": "6132e48ff0edea66" + }, + "truncated": 0, + "non-truncated": 1560, + "padded": 1560, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hashes": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "52328f9dec1844ed", + "hash_cont_tokens": "d201a0126c9a530c" + }, + "truncated": 0, + "non-truncated": 1080, + "padded": 1080, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hashes": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "04d97c91eee4e141", + "hash_cont_tokens": "596c4f1066a38e91" + }, + "truncated": 0, + "non-truncated": 952, + "padded": 952, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_physics|5": { + "hashes": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "d8d05cf169bd7639", + "hash_cont_tokens": "fcefc753d295e446" + }, + "truncated": 0, + "non-truncated": 604, + "padded": 604, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hashes": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "03f858b330d55fed", + "hash_cont_tokens": "a4a552f563078902" + }, + "truncated": 0, + "non-truncated": 2180, + "padded": 2180, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hashes": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "ce2ca0558b9a5f27", + "hash_cont_tokens": "85dbbdba6017eaec" + }, + "truncated": 0, + "non-truncated": 864, + "padded": 864, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hashes": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + 
"hash_input_tokens": "a3884e14c3c038b5", + "hash_cont_tokens": "7d705edd113a3d4d" + }, + "truncated": 816, + "non-truncated": 0, + "padded": 0, + "non-padded": 816, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hashes": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "b3f5f4615f906023", + "hash_cont_tokens": "211397dca1d04c0a" + }, + "truncated": 0, + "non-truncated": 948, + "padded": 948, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_aging|5": { + "hashes": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "0d806b9b33c54432", + "hash_cont_tokens": "b196c68db4825727" + }, + "truncated": 0, + "non-truncated": 892, + "padded": 892, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_sexuality|5": { + "hashes": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "4c9f4c1de8d94adf", + "hash_cont_tokens": "ffc3b70128684ad0" + }, + "truncated": 0, + "non-truncated": 524, + "padded": 524, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-international_law|5": { + "hashes": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "4e565cd482620bbe", + "hash_cont_tokens": "bcaed810d47c62aa" + }, + "truncated": 0, + "non-truncated": 484, + "padded": 484, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-jurisprudence|5": { + "hashes": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "13cbfca1b5b84f78", + "hash_cont_tokens": "ea7ff206c4da6f57" + }, + "truncated": 0, + "non-truncated": 432, + "padded": 432, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hashes": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "bf707bcaadcd1b7f", + "hash_cont_tokens": "4a853cb5874d2adc" + }, + "truncated": 0, + "non-truncated": 652, + "padded": 652, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-machine_learning|5": { + "hashes": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "78808255dea01f83", + "hash_cont_tokens": "9e40b162dc928ce5" + }, + "truncated": 0, + "non-truncated": 448, + "padded": 448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-management|5": { + "hashes": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "6bab60a3ce133e17", + "hash_cont_tokens": "c93d7596aa2246ea" + }, + "truncated": 0, + "non-truncated": 412, + "padded": 412, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-marketing|5": { + "hashes": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "d0fcde4d547d9832", + "hash_cont_tokens": "af4b0ee8ee2bb07f" + }, + "truncated": 0, + "non-truncated": 936, + "padded": 936, + "non-padded": 0, + 
"effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-medical_genetics|5": { + "hashes": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "78c8a1b611a22020", + "hash_cont_tokens": "adad8c87d9018d3a" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-miscellaneous|5": { + "hashes": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "690c7a1333c1030b", + "hash_cont_tokens": "5b068e21debc566e" + }, + "truncated": 0, + "non-truncated": 3132, + "padded": 3132, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_disputes|5": { + "hashes": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "de74e3025a1cd4e3", + "hash_cont_tokens": "8d79c8c8d3b1fa75" + }, + "truncated": 0, + "non-truncated": 1384, + "padded": 1384, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hashes": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "77cf2aceb27a9b48", + "hash_cont_tokens": "30d3a442342e5f19" + }, + "truncated": 0, + "non-truncated": 3580, + "padded": 3580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-nutrition|5": { + "hashes": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "c149e4bfa0bd49e2", + "hash_cont_tokens": "231f307b052cc303" + }, + "truncated": 0, + "non-truncated": 1224, + "padded": 1224, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-philosophy|5": { + "hashes": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "8e8dd2f09979a669", + "hash_cont_tokens": "faaa18e05a96eb91" + }, + "truncated": 0, + "non-truncated": 1244, + "padded": 1244, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-prehistory|5": { + "hashes": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "beb7b4488967bf13", + "hash_cont_tokens": "3fa5ef4207c2fae2" + }, + "truncated": 0, + "non-truncated": 1296, + "padded": 1296, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_accounting|5": { + "hashes": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "6dead6c7a78a877e", + "hash_cont_tokens": "711398f4a1641e99" + }, + "truncated": 0, + "non-truncated": 1128, + "padded": 1128, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_law|5": { + "hashes": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "a3cf3a06ebd3a4c2", + "hash_cont_tokens": "5c9515fd601cb0d7" + }, + "truncated": 92, + "non-truncated": 6044, + "padded": 6032, + "non-padded": 104, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_medicine|5": { + "hashes": { + "hash_examples": 
"b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "8ef46fa5025f8036", + "hash_cont_tokens": "bb99427ea7c63f48" + }, + "truncated": 0, + "non-truncated": 1088, + "padded": 1088, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_psychology|5": { + "hashes": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "592938a865df4169", + "hash_cont_tokens": "cdbe1515e8c6e3ce" + }, + "truncated": 0, + "non-truncated": 2448, + "padded": 2448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-public_relations|5": { + "hashes": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "6708e93b0c611917", + "hash_cont_tokens": "c54f38d507746b57" + }, + "truncated": 0, + "non-truncated": 440, + "padded": 440, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-security_studies|5": { + "hashes": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "d9c3e621c2145453", + "hash_cont_tokens": "16d346d36b44190b" + }, + "truncated": 0, + "non-truncated": 980, + "padded": 980, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-sociology|5": { + "hashes": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "862a1d43b0709cc8", + "hash_cont_tokens": "e329121c50bb2b96" + }, + "truncated": 0, + "non-truncated": 804, + "padded": 804, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hashes": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "0f8b3d09b9f523d6", + "hash_cont_tokens": "446207f22323db3e" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-virology|5": { + "hashes": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "543430e3d6af520f", + "hash_cont_tokens": "30dcb20b1aeaf10b" + }, + "truncated": 0, + "non-truncated": 664, + "padded": 664, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-world_religions|5": { + "hashes": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "a9f37ee284fec309", + "hash_cont_tokens": "f8476c0c6f07dff2" + }, + "truncated": 0, + "non-truncated": 684, + "padded": 684, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|truthfulqa:mc|0": { + "hashes": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "bc9ef61861cd1b47", + "hash_cont_tokens": "d07001d4d0214aa3" + }, + "truncated": 0, + "non-truncated": 9996, + "padded": 9996, + "non-padded": 0, + "effective_few_shots": 0.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "d84d18e9a963753d", + "hash_full_prompts": "12b540783521a8e6", + "hash_input_tokens": "5718915646c336d4", + "hash_cont_tokens": "be8494d5ebf3309a" + }, + "total_evaluation_time_secondes": 
"2098.5994806289673", + "truncated": 1568, + "non-truncated": 109451, + "padded": 109413, + "non-padded": 1606, + "num_truncated_few_shots": 0 + } +} diff --git a/eval-results/Andron00e/YetAnother_Open-Llama-3B-LoRA-OpenOrca/results_2023-09-22T21-20-12.395485.json b/eval-results/Andron00e/YetAnother_Open-Llama-3B-LoRA-OpenOrca/results_2023-09-22T21-20-12.395485.json new file mode 100644 index 0000000000000000000000000000000000000000..456ea9c02af15cf6f36a1379a40c6d48f8c73efe --- /dev/null +++ b/eval-results/Andron00e/YetAnother_Open-Llama-3B-LoRA-OpenOrca/results_2023-09-22T21-20-12.395485.json @@ -0,0 +1,107 @@ +{ + "config_general": { + "model_name": "Andron00e/YetAnother_Open-Llama-3B-LoRA-OpenOrca", + "model_sha": "07d9d32cd091148295d4e13802ba63486599aff4", + "model_size": "6.4 GB", + "model_dtype": "torch.float16", + "lighteval_sha": "0f318ecf002208468154899217b3ba7c6ae09374", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|drop|3": { + "em": 0.0005243288590604027, + "em_stderr": 0.00023443780464835917, + "f1": 0.04621329697986584, + "f1_stderr": 0.001229059804190605 + }, + "harness|gsm8k|5": { + "acc": 0.009097801364670205, + "acc_stderr": 0.002615326510775673 + }, + "harness|winogrande|5": { + "acc": 0.6448303078137332, + "acc_stderr": 0.013450047479569257 + }, + "all": { + "em": 0.0005243288590604027, + "em_stderr": 0.00023443780464835917, + "f1": 0.04621329697986584, + "f1_stderr": 0.001229059804190605, + "acc": 0.3269640545892017, + "acc_stderr": 0.008032686995172466 + } + }, + "versions": { + "harness|drop|3": 1, + "harness|gsm8k|5": 0, + "harness|winogrande|5": 0, + "all": 0 + }, + "config_tasks": { + "harness|drop": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|drop|3": { + "hashes": { + "hash_examples": "1d27416e8324e9a3", + "hash_full_prompts": "a5513ff9a741b385", + "hash_input_tokens": "94a4cd17eab28d43", + "hash_cont_tokens": "394126ad11b7738e" + }, + "truncated": 864, + "non-truncated": 8672, + "padded": 0, + "non-padded": 9536, + "effective_few_shots": 3.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "237c6b7eceaca35e", + "hash_cont_tokens": "858f29a5e460cd6e" + }, + "truncated": 0, + "non-truncated": 1319, + "padded": 0, + "non-padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "de8488b97864debc", + "hash_cont_tokens": "33dc409d2c2e3198" + }, + "truncated": 0, + "non-truncated": 2534, + "padded": 2418, + "non-padded": 116, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "9b4d8993161e637d", + "hash_full_prompts": "08215e527b7e60a5", + "hash_input_tokens": "35a6a4f679049d66", + "hash_cont_tokens": "5ad9cb773bba5975" + }, + "total_evaluation_time_secondes": "8881.57160949707", + "truncated": 864, + "non-truncated": 12525, + "padded": 2418, + "non-padded": 10971, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/Andron00e/YetAnother_Open-Llama-3B-LoRA-OpenOrca/results_2023-09-22T21-36-39.212716.json 
b/eval-results/Andron00e/YetAnother_Open-Llama-3B-LoRA-OpenOrca/results_2023-09-22T21-36-39.212716.json new file mode 100644 index 0000000000000000000000000000000000000000..1a63dbfb78348bbbdbf36097b05032acf92d27d7 --- /dev/null +++ b/eval-results/Andron00e/YetAnother_Open-Llama-3B-LoRA-OpenOrca/results_2023-09-22T21-36-39.212716.json @@ -0,0 +1,107 @@ +{ + "config_general": { + "model_name": "Andron00e/YetAnother_Open-Llama-3B-LoRA-OpenOrca", + "model_sha": "07d9d32cd091148295d4e13802ba63486599aff4", + "model_size": "6.4 GB", + "model_dtype": "torch.float16", + "lighteval_sha": "0f318ecf002208468154899217b3ba7c6ae09374", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|drop|3": { + "em": 0.0, + "em_stderr": 0.0, + "f1": 0.0004404362416107381, + "f1_stderr": 6.976502994544788e-05 + }, + "harness|gsm8k|5": { + "acc": 0.0, + "acc_stderr": 0.0 + }, + "harness|winogrande|5": { + "acc": 0.5082872928176796, + "acc_stderr": 0.014050555322824192 + }, + "all": { + "em": 0.0, + "em_stderr": 0.0, + "f1": 0.0004404362416107381, + "f1_stderr": 6.976502994544788e-05, + "acc": 0.2541436464088398, + "acc_stderr": 0.007025277661412096 + } + }, + "versions": { + "harness|drop|3": 1, + "harness|gsm8k|5": 0, + "harness|winogrande|5": 0, + "all": 0 + }, + "config_tasks": { + "harness|drop": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|drop|3": { + "hashes": { + "hash_examples": "1d27416e8324e9a3", + "hash_full_prompts": "a5513ff9a741b385", + "hash_input_tokens": "94a4cd17eab28d43", + "hash_cont_tokens": "2d08e4f9b6d44aff" + }, + "truncated": 864, + "non-truncated": 8672, + "padded": 0, + "non-padded": 9536, + "effective_few_shots": 3.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "237c6b7eceaca35e", + "hash_cont_tokens": "f2acaad72897b52c" + }, + "truncated": 0, + "non-truncated": 1319, + "padded": 0, + "non-padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "de8488b97864debc", + "hash_cont_tokens": "33dc409d2c2e3198" + }, + "truncated": 0, + "non-truncated": 2534, + "padded": 2418, + "non-padded": 116, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "9b4d8993161e637d", + "hash_full_prompts": "08215e527b7e60a5", + "hash_input_tokens": "35a6a4f679049d66", + "hash_cont_tokens": "a2aeeb45449ff9f2" + }, + "total_evaluation_time_secondes": "18631.2642891407", + "truncated": 864, + "non-truncated": 12525, + "padded": 2418, + "non-padded": 10971, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/Andron00e/YetAnother_Open-Llama-3B-LoRA/results_2023-08-21T16-02-39.470233.json b/eval-results/Andron00e/YetAnother_Open-Llama-3B-LoRA/results_2023-08-21T16-02-39.470233.json new file mode 100644 index 0000000000000000000000000000000000000000..e7e1ddc9fe6ac470ba75fd4e6b424fdc1e8daa5f --- /dev/null +++ b/eval-results/Andron00e/YetAnother_Open-Llama-3B-LoRA/results_2023-08-21T16-02-39.470233.json @@ -0,0 +1,1365 @@ +{ + "results": { + "harness|arc:challenge|25": { + "acc": 0.21928327645051193, + "acc_stderr": 
0.012091245787615727, + "acc_norm": 0.2593856655290102, + "acc_norm_stderr": 0.012808273573927102 + }, + "harness|hellaswag|10": { + "acc": 0.25960963951404104, + "acc_stderr": 0.004375244237045127, + "acc_norm": 0.25761800438159727, + "acc_norm_stderr": 0.004364287353415444 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.18, + "acc_stderr": 0.03861229196653694, + "acc_norm": 0.18, + "acc_norm_stderr": 0.03861229196653694 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.21481481481481482, + "acc_stderr": 0.03547854198560823, + "acc_norm": 0.21481481481481482, + "acc_norm_stderr": 0.03547854198560823 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.20394736842105263, + "acc_stderr": 0.0327900040631005, + "acc_norm": 0.20394736842105263, + "acc_norm_stderr": 0.0327900040631005 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.36, + "acc_stderr": 0.048241815132442176, + "acc_norm": 0.36, + "acc_norm_stderr": 0.048241815132442176 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.2, + "acc_stderr": 0.02461829819586651, + "acc_norm": 0.2, + "acc_norm_stderr": 0.02461829819586651 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.18055555555555555, + "acc_stderr": 0.032166008088022675, + "acc_norm": 0.18055555555555555, + "acc_norm_stderr": 0.032166008088022675 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.24, + "acc_stderr": 0.04292346959909283, + "acc_norm": 0.24, + "acc_norm_stderr": 0.04292346959909283 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.22, + "acc_stderr": 0.041633319989322695, + "acc_norm": 0.22, + "acc_norm_stderr": 0.041633319989322695 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.21, + "acc_stderr": 0.040936018074033256, + "acc_norm": 0.21, + "acc_norm_stderr": 0.040936018074033256 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.2774566473988439, + "acc_stderr": 0.03414014007044036, + "acc_norm": 0.2774566473988439, + "acc_norm_stderr": 0.03414014007044036 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.27450980392156865, + "acc_stderr": 0.044405219061793275, + "acc_norm": 0.27450980392156865, + "acc_norm_stderr": 0.044405219061793275 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.32, + "acc_stderr": 0.046882617226215034, + "acc_norm": 0.32, + "acc_norm_stderr": 0.046882617226215034 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.2765957446808511, + "acc_stderr": 0.029241883869628824, + "acc_norm": 0.2765957446808511, + "acc_norm_stderr": 0.029241883869628824 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.23684210526315788, + "acc_stderr": 0.039994238792813344, + "acc_norm": 0.23684210526315788, + "acc_norm_stderr": 0.039994238792813344 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.23448275862068965, + "acc_stderr": 0.035306258743465914, + "acc_norm": 0.23448275862068965, + "acc_norm_stderr": 0.035306258743465914 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.23544973544973544, + "acc_stderr": 0.021851509822031715, + "acc_norm": 0.23544973544973544, + "acc_norm_stderr": 0.021851509822031715 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.24603174603174602, + "acc_stderr": 0.03852273364924316, + "acc_norm": 0.24603174603174602, + "acc_norm_stderr": 0.03852273364924316 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.3, + "acc_stderr": 0.04605661864718381, + "acc_norm": 0.3, + "acc_norm_stderr": 
0.04605661864718381 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.24516129032258063, + "acc_stderr": 0.024472243840895525, + "acc_norm": 0.24516129032258063, + "acc_norm_stderr": 0.024472243840895525 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.22660098522167488, + "acc_stderr": 0.02945486383529297, + "acc_norm": 0.22660098522167488, + "acc_norm_stderr": 0.02945486383529297 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.22, + "acc_stderr": 0.041633319989322695, + "acc_norm": 0.22, + "acc_norm_stderr": 0.041633319989322695 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.2727272727272727, + "acc_stderr": 0.03477691162163659, + "acc_norm": 0.2727272727272727, + "acc_norm_stderr": 0.03477691162163659 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.20202020202020202, + "acc_stderr": 0.02860620428922987, + "acc_norm": 0.20202020202020202, + "acc_norm_stderr": 0.02860620428922987 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.2538860103626943, + "acc_stderr": 0.03141024780565319, + "acc_norm": 0.2538860103626943, + "acc_norm_stderr": 0.03141024780565319 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.2230769230769231, + "acc_stderr": 0.02110773012724401, + "acc_norm": 0.2230769230769231, + "acc_norm_stderr": 0.02110773012724401 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.24074074074074073, + "acc_stderr": 0.026067159222275805, + "acc_norm": 0.24074074074074073, + "acc_norm_stderr": 0.026067159222275805 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.23949579831932774, + "acc_stderr": 0.02772206549336126, + "acc_norm": 0.23949579831932774, + "acc_norm_stderr": 0.02772206549336126 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.2185430463576159, + "acc_stderr": 0.03374235550425694, + "acc_norm": 0.2185430463576159, + "acc_norm_stderr": 0.03374235550425694 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.25137614678899084, + "acc_stderr": 0.018599206360287415, + "acc_norm": 0.25137614678899084, + "acc_norm_stderr": 0.018599206360287415 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.21296296296296297, + "acc_stderr": 0.027920963147993645, + "acc_norm": 0.21296296296296297, + "acc_norm_stderr": 0.027920963147993645 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.2549019607843137, + "acc_stderr": 0.030587591351604257, + "acc_norm": 0.2549019607843137, + "acc_norm_stderr": 0.030587591351604257 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.25738396624472576, + "acc_stderr": 0.028458820991460305, + "acc_norm": 0.25738396624472576, + "acc_norm_stderr": 0.028458820991460305 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.3094170403587444, + "acc_stderr": 0.031024411740572213, + "acc_norm": 0.3094170403587444, + "acc_norm_stderr": 0.031024411740572213 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.20610687022900764, + "acc_stderr": 0.035477710041594626, + "acc_norm": 0.20610687022900764, + "acc_norm_stderr": 0.035477710041594626 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.19834710743801653, + "acc_stderr": 0.036401182719909456, + "acc_norm": 0.19834710743801653, + "acc_norm_stderr": 0.036401182719909456 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.3333333333333333, + "acc_stderr": 0.04557239513497751, 
+ "acc_norm": 0.3333333333333333, + "acc_norm_stderr": 0.04557239513497751 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.24539877300613497, + "acc_stderr": 0.03380939813943354, + "acc_norm": 0.24539877300613497, + "acc_norm_stderr": 0.03380939813943354 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.25892857142857145, + "acc_stderr": 0.04157751539865629, + "acc_norm": 0.25892857142857145, + "acc_norm_stderr": 0.04157751539865629 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.23300970873786409, + "acc_stderr": 0.04185832598928315, + "acc_norm": 0.23300970873786409, + "acc_norm_stderr": 0.04185832598928315 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.2094017094017094, + "acc_stderr": 0.026655699653922737, + "acc_norm": 0.2094017094017094, + "acc_norm_stderr": 0.026655699653922737 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.27, + "acc_stderr": 0.044619604333847394, + "acc_norm": 0.27, + "acc_norm_stderr": 0.044619604333847394 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.26309067688378035, + "acc_stderr": 0.01574549716904904, + "acc_norm": 0.26309067688378035, + "acc_norm_stderr": 0.01574549716904904 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.27167630057803466, + "acc_stderr": 0.023948512905468344, + "acc_norm": 0.27167630057803466, + "acc_norm_stderr": 0.023948512905468344 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.264804469273743, + "acc_stderr": 0.014756906483260659, + "acc_norm": 0.264804469273743, + "acc_norm_stderr": 0.014756906483260659 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.23529411764705882, + "acc_stderr": 0.024288619466046123, + "acc_norm": 0.23529411764705882, + "acc_norm_stderr": 0.024288619466046123 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.21543408360128619, + "acc_stderr": 0.023350225475471418, + "acc_norm": 0.21543408360128619, + "acc_norm_stderr": 0.023350225475471418 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.24691358024691357, + "acc_stderr": 0.023993501709042117, + "acc_norm": 0.24691358024691357, + "acc_norm_stderr": 0.023993501709042117 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.2375886524822695, + "acc_stderr": 0.0253895125527299, + "acc_norm": 0.2375886524822695, + "acc_norm_stderr": 0.0253895125527299 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.25554106910039115, + "acc_stderr": 0.011139857833598518, + "acc_norm": 0.25554106910039115, + "acc_norm_stderr": 0.011139857833598518 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.25, + "acc_stderr": 0.026303648393696036, + "acc_norm": 0.25, + "acc_norm_stderr": 0.026303648393696036 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.272875816993464, + "acc_stderr": 0.01802047414839358, + "acc_norm": 0.272875816993464, + "acc_norm_stderr": 0.01802047414839358 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.3090909090909091, + "acc_stderr": 0.044262946482000985, + "acc_norm": 0.3090909090909091, + "acc_norm_stderr": 0.044262946482000985 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.22857142857142856, + "acc_stderr": 0.026882144922307748, + "acc_norm": 0.22857142857142856, + "acc_norm_stderr": 0.026882144922307748 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.263681592039801, + "acc_stderr": 0.031157150869355568, + "acc_norm": 0.263681592039801, + "acc_norm_stderr": 0.031157150869355568 + }, + 
"harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.27, + "acc_stderr": 0.044619604333847394, + "acc_norm": 0.27, + "acc_norm_stderr": 0.044619604333847394 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.22289156626506024, + "acc_stderr": 0.03240004825594687, + "acc_norm": 0.22289156626506024, + "acc_norm_stderr": 0.03240004825594687 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.25146198830409355, + "acc_stderr": 0.033275044238468436, + "acc_norm": 0.25146198830409355, + "acc_norm_stderr": 0.033275044238468436 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.211750305997552, + "mc1_stderr": 0.014302068353925616, + "mc2": NaN, + "mc2_stderr": NaN + }, + "all": { + "acc": 0.24629348927227462, + "acc_stderr": 0.03137893388089655, + "acc_norm": 0.24693943425441114, + "acc_norm_stderr": 0.03139090118433183, + "mc1": 0.211750305997552, + "mc1_stderr": 0.014302068353925616, + "mc2": NaN, + "mc2_stderr": NaN + } + }, + "versions": { + "harness|arc:challenge|25": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 
1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "all": 0 + }, + "config_general": { + "model_name": "Andron00e/YetAnother_Open-Llama-3B-LoRA", + "model_sha": "52c5cb0178831908ed0571f1750fcb0f0fb125f9", + "model_dtype": "torch.float16", + "lighteval_sha": "9a6ba3212080b87510982c30fdec55b87dcab0c7", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null + }, + "config_tasks": { + "harness|arc:challenge": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + 
"harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task" + }, + "summary_tasks": { + "harness|arc:challenge|25": { + "hashes": { + "hash_examples": "17b0cae357c0259e", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "99ff49c78917d666", + "hash_cont_tokens": "568988b9c3bfc83c" + }, + "truncated": 0, + "non-truncated": 4687, + "padded": 4687, + "non-padded": 0, + "effective_few_shots": 25.0, + "num_truncated_few_shots": 0 + }, + "harness|hellaswag|10": { + "hashes": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "27b384658a4b826e", + "hash_cont_tokens": "5966c7ceee7144f8" + }, + "truncated": 0, + "non-truncated": 40168, + "padded": 40153, + "non-padded": 15, + "effective_few_shots": 10.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hashes": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "dac91b437d631599", + "hash_cont_tokens": "adad8c87d9018d3a" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-anatomy|5": { + "hashes": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "06cd9a69af842291", + "hash_cont_tokens": "b408913f391dc598" + }, + "truncated": 0, + "non-truncated": 540, + "padded": 540, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-astronomy|5": { + "hashes": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "7e0363633bd4c661", + "hash_cont_tokens": "4ab285fa2a75c029" + }, + "truncated": 0, + "non-truncated": 608, + "padded": 608, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-business_ethics|5": { + "hashes": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "a1b916a7277078b4", + "hash_cont_tokens": "15baabbd71328cbe" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hashes": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "af46942ff5deb21d", + "hash_cont_tokens": "96c880c9478a4037" + }, + "truncated": 0, + "non-truncated": 1060, + 
"padded": 1060, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_biology|5": { + "hashes": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "5882d6931ded2237", + "hash_cont_tokens": "6268ee610a672867" + }, + "truncated": 0, + "non-truncated": 576, + "padded": 576, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_chemistry|5": { + "hashes": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "b24180b880da9cdc", + "hash_cont_tokens": "7b194ff8e7e390ce" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_computer_science|5": { + "hashes": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "9bc1d680b14c82ee", + "hash_cont_tokens": "2fe5eee1df1b81bb" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_mathematics|5": { + "hashes": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "79aced2bcafe02e4", + "hash_cont_tokens": "499ffd87e7a60146" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_medicine|5": { + "hashes": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "3e657aa09cc216ff", + "hash_cont_tokens": "e5df51bb12073b7b" + }, + "truncated": 0, + "non-truncated": 692, + "padded": 692, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_physics|5": { + "hashes": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "5f521206bd8121ad", + "hash_cont_tokens": "4abfe03c09581bce" + }, + "truncated": 0, + "non-truncated": 408, + "padded": 408, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-computer_security|5": { + "hashes": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "b12ce1e36c118558", + "hash_cont_tokens": "adad8c87d9018d3a" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hashes": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "221bbd7b0d39e269", + "hash_cont_tokens": "4dc3a1c45702aea2" + }, + "truncated": 0, + "non-truncated": 940, + "padded": 940, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-econometrics|5": { + "hashes": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "d475018fde7b68bf", + "hash_cont_tokens": "abfc7c631218ed32" + }, + "truncated": 0, + "non-truncated": 456, + "padded": 456, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-electrical_engineering|5": 
{ + "hashes": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "964e79b20780ee59", + "hash_cont_tokens": "195db06c037d7c81" + }, + "truncated": 0, + "non-truncated": 580, + "padded": 569, + "non-padded": 11, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hashes": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "829b84905d5794d7", + "hash_cont_tokens": "4274dfcea97c4e27" + }, + "truncated": 0, + "non-truncated": 1512, + "padded": 1512, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-formal_logic|5": { + "hashes": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "83233577e0f66071", + "hash_cont_tokens": "aadc96b61f4bea54" + }, + "truncated": 0, + "non-truncated": 504, + "padded": 504, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-global_facts|5": { + "hashes": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "b45c36cf0fc38f67", + "hash_cont_tokens": "adad8c87d9018d3a" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_biology|5": { + "hashes": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "47f5c034c56e090f", + "hash_cont_tokens": "6ea5c6b690913b0f" + }, + "truncated": 0, + "non-truncated": 1240, + "padded": 1240, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hashes": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "13286ca334f1e8e7", + "hash_cont_tokens": "befe57dcb5a5a7d3" + }, + "truncated": 0, + "non-truncated": 812, + "padded": 812, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hashes": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "e3a3351b698e7311", + "hash_cont_tokens": "8da78e4005b8faf9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hashes": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "6639a9e4f4eb57c5", + "hash_cont_tokens": "ff5ae57ff23b53d1" + }, + "truncated": 660, + "non-truncated": 0, + "padded": 0, + "non-padded": 660, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_geography|5": { + "hashes": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "cfe8f73d53615fc7", + "hash_cont_tokens": "db85309de1591035" + }, + "truncated": 0, + "non-truncated": 792, + "padded": 792, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hashes": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + 
"hash_input_tokens": "1f8541aadce8b236", + "hash_cont_tokens": "6890e2bc35a602ef" + }, + "truncated": 0, + "non-truncated": 772, + "padded": 772, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hashes": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "8da2d7f4edfdafd5", + "hash_cont_tokens": "6132e48ff0edea66" + }, + "truncated": 0, + "non-truncated": 1560, + "padded": 1560, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hashes": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "52328f9dec1844ed", + "hash_cont_tokens": "d201a0126c9a530c" + }, + "truncated": 0, + "non-truncated": 1080, + "padded": 1080, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hashes": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "04d97c91eee4e141", + "hash_cont_tokens": "596c4f1066a38e91" + }, + "truncated": 0, + "non-truncated": 952, + "padded": 952, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_physics|5": { + "hashes": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "d8d05cf169bd7639", + "hash_cont_tokens": "fcefc753d295e446" + }, + "truncated": 0, + "non-truncated": 604, + "padded": 604, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hashes": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "03f858b330d55fed", + "hash_cont_tokens": "a4a552f563078902" + }, + "truncated": 0, + "non-truncated": 2180, + "padded": 2180, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hashes": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "ce2ca0558b9a5f27", + "hash_cont_tokens": "85dbbdba6017eaec" + }, + "truncated": 0, + "non-truncated": 864, + "padded": 864, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hashes": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "a3884e14c3c038b5", + "hash_cont_tokens": "7d705edd113a3d4d" + }, + "truncated": 816, + "non-truncated": 0, + "padded": 0, + "non-padded": 816, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hashes": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "b3f5f4615f906023", + "hash_cont_tokens": "211397dca1d04c0a" + }, + "truncated": 0, + "non-truncated": 948, + "padded": 948, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_aging|5": { + "hashes": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "0d806b9b33c54432", + "hash_cont_tokens": "b196c68db4825727" + }, + "truncated": 
0, + "non-truncated": 892, + "padded": 892, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_sexuality|5": { + "hashes": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "4c9f4c1de8d94adf", + "hash_cont_tokens": "ffc3b70128684ad0" + }, + "truncated": 0, + "non-truncated": 524, + "padded": 524, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-international_law|5": { + "hashes": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "4e565cd482620bbe", + "hash_cont_tokens": "bcaed810d47c62aa" + }, + "truncated": 0, + "non-truncated": 484, + "padded": 484, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-jurisprudence|5": { + "hashes": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "13cbfca1b5b84f78", + "hash_cont_tokens": "ea7ff206c4da6f57" + }, + "truncated": 0, + "non-truncated": 432, + "padded": 432, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hashes": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "bf707bcaadcd1b7f", + "hash_cont_tokens": "4a853cb5874d2adc" + }, + "truncated": 0, + "non-truncated": 652, + "padded": 652, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-machine_learning|5": { + "hashes": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "78808255dea01f83", + "hash_cont_tokens": "9e40b162dc928ce5" + }, + "truncated": 0, + "non-truncated": 448, + "padded": 448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-management|5": { + "hashes": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "6bab60a3ce133e17", + "hash_cont_tokens": "c93d7596aa2246ea" + }, + "truncated": 0, + "non-truncated": 412, + "padded": 412, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-marketing|5": { + "hashes": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "d0fcde4d547d9832", + "hash_cont_tokens": "af4b0ee8ee2bb07f" + }, + "truncated": 0, + "non-truncated": 936, + "padded": 936, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-medical_genetics|5": { + "hashes": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "78c8a1b611a22020", + "hash_cont_tokens": "adad8c87d9018d3a" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-miscellaneous|5": { + "hashes": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "690c7a1333c1030b", + "hash_cont_tokens": "5b068e21debc566e" + }, + "truncated": 0, + "non-truncated": 3132, + "padded": 3132, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_disputes|5": { + 
"hashes": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "de74e3025a1cd4e3", + "hash_cont_tokens": "8d79c8c8d3b1fa75" + }, + "truncated": 0, + "non-truncated": 1384, + "padded": 1384, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hashes": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "77cf2aceb27a9b48", + "hash_cont_tokens": "30d3a442342e5f19" + }, + "truncated": 0, + "non-truncated": 3580, + "padded": 3580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-nutrition|5": { + "hashes": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "c149e4bfa0bd49e2", + "hash_cont_tokens": "231f307b052cc303" + }, + "truncated": 0, + "non-truncated": 1224, + "padded": 1224, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-philosophy|5": { + "hashes": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "8e8dd2f09979a669", + "hash_cont_tokens": "faaa18e05a96eb91" + }, + "truncated": 0, + "non-truncated": 1244, + "padded": 1244, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-prehistory|5": { + "hashes": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "beb7b4488967bf13", + "hash_cont_tokens": "3fa5ef4207c2fae2" + }, + "truncated": 0, + "non-truncated": 1296, + "padded": 1296, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_accounting|5": { + "hashes": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "6dead6c7a78a877e", + "hash_cont_tokens": "711398f4a1641e99" + }, + "truncated": 0, + "non-truncated": 1128, + "padded": 1128, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_law|5": { + "hashes": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "a3cf3a06ebd3a4c2", + "hash_cont_tokens": "5c9515fd601cb0d7" + }, + "truncated": 92, + "non-truncated": 6044, + "padded": 6032, + "non-padded": 104, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_medicine|5": { + "hashes": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "8ef46fa5025f8036", + "hash_cont_tokens": "bb99427ea7c63f48" + }, + "truncated": 0, + "non-truncated": 1088, + "padded": 1088, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_psychology|5": { + "hashes": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "592938a865df4169", + "hash_cont_tokens": "cdbe1515e8c6e3ce" + }, + "truncated": 0, + "non-truncated": 2448, + "padded": 2448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-public_relations|5": { + "hashes": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "6708e93b0c611917", + 
"hash_cont_tokens": "c54f38d507746b57" + }, + "truncated": 0, + "non-truncated": 440, + "padded": 440, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-security_studies|5": { + "hashes": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "d9c3e621c2145453", + "hash_cont_tokens": "16d346d36b44190b" + }, + "truncated": 0, + "non-truncated": 980, + "padded": 980, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-sociology|5": { + "hashes": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "862a1d43b0709cc8", + "hash_cont_tokens": "e329121c50bb2b96" + }, + "truncated": 0, + "non-truncated": 804, + "padded": 804, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hashes": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "0f8b3d09b9f523d6", + "hash_cont_tokens": "446207f22323db3e" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-virology|5": { + "hashes": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "543430e3d6af520f", + "hash_cont_tokens": "30dcb20b1aeaf10b" + }, + "truncated": 0, + "non-truncated": 664, + "padded": 664, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-world_religions|5": { + "hashes": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "a9f37ee284fec309", + "hash_cont_tokens": "f8476c0c6f07dff2" + }, + "truncated": 0, + "non-truncated": 684, + "padded": 684, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|truthfulqa:mc|0": { + "hashes": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "bc9ef61861cd1b47", + "hash_cont_tokens": "d07001d4d0214aa3" + }, + "truncated": 0, + "non-truncated": 9996, + "padded": 9996, + "non-padded": 0, + "effective_few_shots": 0.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "d84d18e9a963753d", + "hash_full_prompts": "12b540783521a8e6", + "hash_input_tokens": "5718915646c336d4", + "hash_cont_tokens": "be8494d5ebf3309a" + }, + "total_evaluation_time_secondes": "2005.922060251236", + "truncated": 1568, + "non-truncated": 109451, + "padded": 109413, + "non-padded": 1606, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/Andron00e/YetAnother_Open-Llama-3B-LoRA/results_2023-09-18T01-32-17.416050.json b/eval-results/Andron00e/YetAnother_Open-Llama-3B-LoRA/results_2023-09-18T01-32-17.416050.json new file mode 100644 index 0000000000000000000000000000000000000000..764a57ac3f3ae26f79ed10e003d61ca76430cfd0 --- /dev/null +++ b/eval-results/Andron00e/YetAnother_Open-Llama-3B-LoRA/results_2023-09-18T01-32-17.416050.json @@ -0,0 +1,107 @@ +{ + "config_general": { + "model_name": "Andron00e/YetAnother_Open-Llama-3B-LoRA", + "model_sha": "52c5cb0178831908ed0571f1750fcb0f0fb125f9", + "model_size": "6.4 GB", + "model_dtype": "torch.float16", + "lighteval_sha": "0f318ecf002208468154899217b3ba7c6ae09374", + 
"num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|drop|3": { + "em": 0.0, + "em_stderr": 0.0, + "f1": 0.0004886744966442953, + "f1_stderr": 8.997703088731367e-05 + }, + "harness|gsm8k|5": { + "acc": 0.0, + "acc_stderr": 0.0 + }, + "harness|winogrande|5": { + "acc": 0.5138121546961326, + "acc_stderr": 0.014047122916440422 + }, + "all": { + "em": 0.0, + "em_stderr": 0.0, + "f1": 0.0004886744966442953, + "f1_stderr": 8.997703088731367e-05, + "acc": 0.2569060773480663, + "acc_stderr": 0.007023561458220211 + } + }, + "versions": { + "harness|drop|3": 1, + "harness|gsm8k|5": 0, + "harness|winogrande|5": 0, + "all": 0 + }, + "config_tasks": { + "harness|drop": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|drop|3": { + "hashes": { + "hash_examples": "1d27416e8324e9a3", + "hash_full_prompts": "a5513ff9a741b385", + "hash_input_tokens": "94a4cd17eab28d43", + "hash_cont_tokens": "b4827cb40001d20d" + }, + "truncated": 864, + "non-truncated": 8672, + "padded": 0, + "non-padded": 9536, + "effective_few_shots": 3.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "237c6b7eceaca35e", + "hash_cont_tokens": "5c2bcc0be8c64b86" + }, + "truncated": 0, + "non-truncated": 1319, + "padded": 0, + "non-padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "de8488b97864debc", + "hash_cont_tokens": "33dc409d2c2e3198" + }, + "truncated": 0, + "non-truncated": 2534, + "padded": 2418, + "non-padded": 116, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "9b4d8993161e637d", + "hash_full_prompts": "08215e527b7e60a5", + "hash_input_tokens": "35a6a4f679049d66", + "hash_cont_tokens": "0f7b1e76dad0cc9a" + }, + "total_evaluation_time_secondes": "18318.040421962738", + "truncated": 864, + "non-truncated": 12525, + "padded": 2418, + "non-padded": 10971, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/ByteWave/Yi-8B-Llama/results_2023-11-20T21-15-43.734258.json b/eval-results/ByteWave/Yi-8B-Llama/results_2023-11-20T21-15-43.734258.json new file mode 100644 index 0000000000000000000000000000000000000000..9eb9fdcf3d134615f3e840dcfdd5b59e2fa6619f --- /dev/null +++ b/eval-results/ByteWave/Yi-8B-Llama/results_2023-11-20T21-15-43.734258.json @@ -0,0 +1,1435 @@ +{ + "config_general": { + "lighteval_sha": "9ffc410f6c40b8cfefe7167cb47aefe69ced61e1", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "", + "start_time": 276424.934702042, + "end_time": 292843.925101558, + "total_evaluation_time_secondes": "16418.990399515955", + "model_name": "ByteWave/Yi-8B-Llama", + "model_sha": "4f3f4d73ff3962487d1c51702b02d795bf1f33a4", + "model_dtype": "torch.float16", + "model_size": "16.28 GB" + }, + "results": { + "harness|arc:challenge|25": { + "acc": 0.2295221843003413, + "acc_stderr": 0.012288926760890773, + "acc_norm": 0.2568259385665529, + "acc_norm_stderr": 0.0127669237941168 + }, + "harness|hellaswag|10": { + "acc": 0.25712009559848636, + "acc_stderr": 0.004361529679492745, 
+ "acc_norm": 0.2678749253136825, + "acc_norm_stderr": 0.00441946998393918 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.17, + "acc_stderr": 0.03775251680686371, + "acc_norm": 0.17, + "acc_norm_stderr": 0.03775251680686371 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.26666666666666666, + "acc_stderr": 0.03820169914517905, + "acc_norm": 0.26666666666666666, + "acc_norm_stderr": 0.03820169914517905 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.20394736842105263, + "acc_stderr": 0.032790004063100515, + "acc_norm": 0.20394736842105263, + "acc_norm_stderr": 0.032790004063100515 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.24, + "acc_stderr": 0.04292346959909283, + "acc_norm": 0.24, + "acc_norm_stderr": 0.04292346959909283 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.2528301886792453, + "acc_stderr": 0.026749899771241235, + "acc_norm": 0.2528301886792453, + "acc_norm_stderr": 0.026749899771241235 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.22916666666666666, + "acc_stderr": 0.03514697467862388, + "acc_norm": 0.22916666666666666, + "acc_norm_stderr": 0.03514697467862388 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.22, + "acc_stderr": 0.04163331998932269, + "acc_norm": 0.22, + "acc_norm_stderr": 0.04163331998932269 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.16, + "acc_stderr": 0.0368452949177471, + "acc_norm": 0.16, + "acc_norm_stderr": 0.0368452949177471 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.25, + "acc_stderr": 0.04351941398892446, + "acc_norm": 0.25, + "acc_norm_stderr": 0.04351941398892446 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.1907514450867052, + "acc_stderr": 0.029957851329869337, + "acc_norm": 0.1907514450867052, + "acc_norm_stderr": 0.029957851329869337 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.19607843137254902, + "acc_stderr": 0.03950581861179961, + "acc_norm": 0.19607843137254902, + "acc_norm_stderr": 0.03950581861179961 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.22, + "acc_stderr": 0.04163331998932269, + "acc_norm": 0.22, + "acc_norm_stderr": 0.04163331998932269 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.30638297872340425, + "acc_stderr": 0.030135906478517563, + "acc_norm": 0.30638297872340425, + "acc_norm_stderr": 0.030135906478517563 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.2719298245614035, + "acc_stderr": 0.04185774424022057, + "acc_norm": 0.2719298245614035, + "acc_norm_stderr": 0.04185774424022057 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.27586206896551724, + "acc_stderr": 0.037245636197746325, + "acc_norm": 0.27586206896551724, + "acc_norm_stderr": 0.037245636197746325 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.21428571428571427, + "acc_stderr": 0.021132859182754447, + "acc_norm": 0.21428571428571427, + "acc_norm_stderr": 0.021132859182754447 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.2777777777777778, + "acc_stderr": 0.04006168083848876, + "acc_norm": 0.2777777777777778, + "acc_norm_stderr": 0.04006168083848876 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.2, + "acc_stderr": 0.04020151261036846, + "acc_norm": 0.2, + "acc_norm_stderr": 0.04020151261036846 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.27741935483870966, + "acc_stderr": 0.025470196835900055, + "acc_norm": 0.27741935483870966, + 
"acc_norm_stderr": 0.025470196835900055 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.26108374384236455, + "acc_stderr": 0.030903796952114492, + "acc_norm": 0.26108374384236455, + "acc_norm_stderr": 0.030903796952114492 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.25, + "acc_stderr": 0.04351941398892446, + "acc_norm": 0.25, + "acc_norm_stderr": 0.04351941398892446 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.24848484848484848, + "acc_stderr": 0.033744026441394036, + "acc_norm": 0.24848484848484848, + "acc_norm_stderr": 0.033744026441394036 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.22727272727272727, + "acc_stderr": 0.029857515673386407, + "acc_norm": 0.22727272727272727, + "acc_norm_stderr": 0.029857515673386407 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.21243523316062177, + "acc_stderr": 0.029519282616817244, + "acc_norm": 0.21243523316062177, + "acc_norm_stderr": 0.029519282616817244 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.19230769230769232, + "acc_stderr": 0.019982347208637296, + "acc_norm": 0.19230769230769232, + "acc_norm_stderr": 0.019982347208637296 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.2037037037037037, + "acc_stderr": 0.02455617221914128, + "acc_norm": 0.2037037037037037, + "acc_norm_stderr": 0.02455617221914128 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.19747899159663865, + "acc_stderr": 0.02585916412205145, + "acc_norm": 0.19747899159663865, + "acc_norm_stderr": 0.02585916412205145 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.19205298013245034, + "acc_stderr": 0.032162984205936135, + "acc_norm": 0.19205298013245034, + "acc_norm_stderr": 0.032162984205936135 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.23669724770642203, + "acc_stderr": 0.01822407811729908, + "acc_norm": 0.23669724770642203, + "acc_norm_stderr": 0.01822407811729908 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.12962962962962962, + "acc_stderr": 0.022907883151288604, + "acc_norm": 0.12962962962962962, + "acc_norm_stderr": 0.022907883151288604 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.23039215686274508, + "acc_stderr": 0.029554292605695046, + "acc_norm": 0.23039215686274508, + "acc_norm_stderr": 0.029554292605695046 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.25738396624472576, + "acc_stderr": 0.028458820991460302, + "acc_norm": 0.25738396624472576, + "acc_norm_stderr": 0.028458820991460302 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.336322869955157, + "acc_stderr": 0.031708824268455, + "acc_norm": 0.336322869955157, + "acc_norm_stderr": 0.031708824268455 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.2366412213740458, + "acc_stderr": 0.037276735755969195, + "acc_norm": 0.2366412213740458, + "acc_norm_stderr": 0.037276735755969195 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.256198347107438, + "acc_stderr": 0.03984979653302872, + "acc_norm": 0.256198347107438, + "acc_norm_stderr": 0.03984979653302872 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.24074074074074073, + "acc_stderr": 0.04133119440243839, + "acc_norm": 0.24074074074074073, + "acc_norm_stderr": 0.04133119440243839 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.2392638036809816, + "acc_stderr": 
0.033519538795212696, + "acc_norm": 0.2392638036809816, + "acc_norm_stderr": 0.033519538795212696 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.29464285714285715, + "acc_stderr": 0.04327040932578728, + "acc_norm": 0.29464285714285715, + "acc_norm_stderr": 0.04327040932578728 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.22330097087378642, + "acc_stderr": 0.04123553189891431, + "acc_norm": 0.22330097087378642, + "acc_norm_stderr": 0.04123553189891431 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.23076923076923078, + "acc_stderr": 0.027601921381417593, + "acc_norm": 0.23076923076923078, + "acc_norm_stderr": 0.027601921381417593 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.3, + "acc_stderr": 0.046056618647183814, + "acc_norm": 0.3, + "acc_norm_stderr": 0.046056618647183814 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.2656449553001277, + "acc_stderr": 0.015794302487888715, + "acc_norm": 0.2656449553001277, + "acc_norm_stderr": 0.015794302487888715 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.24855491329479767, + "acc_stderr": 0.023267528432100174, + "acc_norm": 0.24855491329479767, + "acc_norm_stderr": 0.023267528432100174 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.2424581005586592, + "acc_stderr": 0.014333522059217889, + "acc_norm": 0.2424581005586592, + "acc_norm_stderr": 0.014333522059217889 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.22549019607843138, + "acc_stderr": 0.023929155517351284, + "acc_norm": 0.22549019607843138, + "acc_norm_stderr": 0.023929155517351284 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.27009646302250806, + "acc_stderr": 0.025218040373410622, + "acc_norm": 0.27009646302250806, + "acc_norm_stderr": 0.025218040373410622 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.25617283950617287, + "acc_stderr": 0.0242885336377261, + "acc_norm": 0.25617283950617287, + "acc_norm_stderr": 0.0242885336377261 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.2375886524822695, + "acc_stderr": 0.025389512552729903, + "acc_norm": 0.2375886524822695, + "acc_norm_stderr": 0.025389512552729903 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.23989569752281617, + "acc_stderr": 0.010906282617981633, + "acc_norm": 0.23989569752281617, + "acc_norm_stderr": 0.010906282617981633 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.43014705882352944, + "acc_stderr": 0.030074971917302875, + "acc_norm": 0.43014705882352944, + "acc_norm_stderr": 0.030074971917302875 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.2549019607843137, + "acc_stderr": 0.017630827375148383, + "acc_norm": 0.2549019607843137, + "acc_norm_stderr": 0.017630827375148383 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.2818181818181818, + "acc_stderr": 0.04309118709946458, + "acc_norm": 0.2818181818181818, + "acc_norm_stderr": 0.04309118709946458 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.17142857142857143, + "acc_stderr": 0.02412746346265015, + "acc_norm": 0.17142857142857143, + "acc_norm_stderr": 0.02412746346265015 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.24378109452736318, + "acc_stderr": 0.030360490154014645, + "acc_norm": 0.24378109452736318, + "acc_norm_stderr": 0.030360490154014645 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.23, + "acc_stderr": 0.04229525846816505, + "acc_norm": 0.23, + "acc_norm_stderr": 0.04229525846816505 + 
}, + "harness|hendrycksTest-virology|5": { + "acc": 0.3192771084337349, + "acc_stderr": 0.0362933532994786, + "acc_norm": 0.3192771084337349, + "acc_norm_stderr": 0.0362933532994786 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.2222222222222222, + "acc_stderr": 0.031885780176863984, + "acc_norm": 0.2222222222222222, + "acc_norm_stderr": 0.031885780176863984 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.24357405140758873, + "mc1_stderr": 0.01502635482491078, + "mc2": 0.4779459732030941, + "mc2_stderr": 0.01680308247777984 + }, + "harness|winogrande|5": { + "acc": 0.48303078137332284, + "acc_stderr": 0.014044390401612967 + }, + "harness|drop|3": { + "em": 0.0, + "em_stderr": 0.0, + "f1": 0.0, + "f1_stderr": 0.0 + }, + "harness|gsm8k|5": { + "acc": 0.0, + "acc_stderr": 0.0 + }, + "all": { + "acc": 0.24145987747065684, + "acc_stderr": 0.03022043483690372, + "acc_norm": 0.2421030564121721, + "acc_norm_stderr": 0.03101589952520655, + "mc1": 0.24357405140758873, + "mc1_stderr": 0.01502635482491078, + "mc2": 0.4779459732030941, + "mc2_stderr": 0.01680308247777984, + "em": 0.0, + "em_stderr": 0.0, + "f1": 0.0, + "f1_stderr": 0.0 + } + }, + "versions": { + "all": 0, + "harness|arc:challenge|25": 0, + "harness|drop|3": 1, + "harness|gsm8k|5": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + 
"harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "harness|winogrande|5": 0 + }, + "config_tasks": { + "harness|arc:challenge": "LM Harness task", + "harness|drop": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + 
"harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|arc:challenge|25": { + "hashes": { + "hash_examples": "17b0cae357c0259e", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "c84bbabff7655573", + "hash_cont_tokens": "e23c779c4c2dd1ec" + }, + "truncated": 0, + "non_truncated": 1172, + "padded": 4682, + "non_padded": 5, + "effective_few_shots": 25.0, + "num_truncated_few_shots": 0 + }, + "harness|hellaswag|10": { + "hashes": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "52e70aa3670e3695", + "hash_cont_tokens": "55da5ba61989a8fe" + }, + "truncated": 0, + "non_truncated": 10042, + "padded": 40097, + "non_padded": 71, + "effective_few_shots": 10.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hashes": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "085f405a873c9f87", + "hash_cont_tokens": "bcc22fd85dcc85e9" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-anatomy|5": { + "hashes": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "3b492ddc5de3f57a", + "hash_cont_tokens": "5cc800feae9fa1ad" + }, + "truncated": 0, + "non_truncated": 135, + "padded": 540, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-astronomy|5": { + "hashes": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "aa55e6645b3f3526", + "hash_cont_tokens": "655dbb90034f484a" + }, + "truncated": 0, + "non_truncated": 152, + "padded": 608, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-business_ethics|5": { + "hashes": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "5f80d5327a047022", + "hash_cont_tokens": "bcc22fd85dcc85e9" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hashes": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "c0a3ae71b5506278", + "hash_cont_tokens": "f77b74d946d7fc02" + }, + 
"truncated": 0, + "non_truncated": 265, + "padded": 1060, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_biology|5": { + "hashes": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "6fcc5fb2ad3a62b5", + "hash_cont_tokens": "1ba4b1a158d8bf3f" + }, + "truncated": 0, + "non_truncated": 144, + "padded": 576, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_chemistry|5": { + "hashes": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "b3c5950ef0ab5b9f", + "hash_cont_tokens": "bcc22fd85dcc85e9" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_computer_science|5": { + "hashes": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "d4b18e1debc64387", + "hash_cont_tokens": "bcc22fd85dcc85e9" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_mathematics|5": { + "hashes": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "78289261a74f39aa", + "hash_cont_tokens": "bcc22fd85dcc85e9" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_medicine|5": { + "hashes": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "5449a8e432780f7f", + "hash_cont_tokens": "78a0ebf66d91c5cf" + }, + "truncated": 0, + "non_truncated": 173, + "padded": 692, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_physics|5": { + "hashes": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "b55be981de130fed", + "hash_cont_tokens": "5a030c95824fdbe5" + }, + "truncated": 0, + "non_truncated": 102, + "padded": 408, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-computer_security|5": { + "hashes": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "b39d36783fd07415", + "hash_cont_tokens": "bcc22fd85dcc85e9" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hashes": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "90db261ac05081a8", + "hash_cont_tokens": "2326dc60d0bc41b6" + }, + "truncated": 0, + "non_truncated": 235, + "padded": 940, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-econometrics|5": { + "hashes": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "3b6ab5e66082a68d", + "hash_cont_tokens": "be908364b6f14dd6" + }, + "truncated": 0, + "non_truncated": 114, + "padded": 456, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + 
"harness|hendrycksTest-electrical_engineering|5": { + "hashes": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "a8e0453f990ff5aa", + "hash_cont_tokens": "179280ef597fe1bf" + }, + "truncated": 0, + "non_truncated": 145, + "padded": 564, + "non_padded": 16, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hashes": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "9e30d3a741143c4a", + "hash_cont_tokens": "95cdcdaf1abd0bd2" + }, + "truncated": 0, + "non_truncated": 378, + "padded": 1512, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-formal_logic|5": { + "hashes": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "06838690ab0d64b9", + "hash_cont_tokens": "6a4818f3c307c346" + }, + "truncated": 0, + "non_truncated": 126, + "padded": 504, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-global_facts|5": { + "hashes": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "50dc8670e216ba78", + "hash_cont_tokens": "bcc22fd85dcc85e9" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_biology|5": { + "hashes": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "0097a3c431b4fc51", + "hash_cont_tokens": "36d0d84455f0bdba" + }, + "truncated": 0, + "non_truncated": 310, + "padded": 1240, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hashes": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "75f3de0dad7830bc", + "hash_cont_tokens": "c678f794a9b8ee74" + }, + "truncated": 0, + "non_truncated": 203, + "padded": 812, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hashes": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "bc373cd584fa942b", + "hash_cont_tokens": "bcc22fd85dcc85e9" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hashes": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "507c0abd3d17fd8f", + "hash_cont_tokens": "e9c94304326d875c" + }, + "truncated": 0, + "non_truncated": 165, + "padded": 656, + "non_padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_geography|5": { + "hashes": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "a8ab4dfafa4f65b4", + "hash_cont_tokens": "f937a1349eb483eb" + }, + "truncated": 0, + "non_truncated": 198, + "padded": 792, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hashes": { + "hash_examples": "d221ec983d143dc3", 
+ "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "e33171fd6e0b4a9c", + "hash_cont_tokens": "8b27dd3907d25b4e" + }, + "truncated": 0, + "non_truncated": 193, + "padded": 772, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hashes": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "f3319223cf191987", + "hash_cont_tokens": "3763cae29e2f938c" + }, + "truncated": 0, + "non_truncated": 390, + "padded": 1560, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hashes": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "2f08fbb89a3a31b0", + "hash_cont_tokens": "fd7b555352d765a4" + }, + "truncated": 0, + "non_truncated": 270, + "padded": 1080, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hashes": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "d2ff2b6e81f3e039", + "hash_cont_tokens": "61f46d4a209b9aa2" + }, + "truncated": 0, + "non_truncated": 238, + "padded": 952, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_physics|5": { + "hashes": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "dd50a9b81a6e14a2", + "hash_cont_tokens": "4e7053e7c19d680d" + }, + "truncated": 0, + "non_truncated": 151, + "padded": 604, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hashes": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "d5f514e075b8a310", + "hash_cont_tokens": "84d19ae8790476bb" + }, + "truncated": 0, + "non_truncated": 545, + "padded": 2180, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hashes": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "3faf848f9d19cb14", + "hash_cont_tokens": "b119c7b668213a4e" + }, + "truncated": 0, + "non_truncated": 216, + "padded": 864, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hashes": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "dafa7c29ee53148d", + "hash_cont_tokens": "a3b126bc622d571f" + }, + "truncated": 0, + "non_truncated": 204, + "padded": 816, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hashes": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "f3f7c0cb054a9101", + "hash_cont_tokens": "9abf19ceb76331ff" + }, + "truncated": 0, + "non_truncated": 237, + "padded": 948, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_aging|5": { + "hashes": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "ee334f2be12733c8", + 
"hash_cont_tokens": "0e2e725ae9a898da" + }, + "truncated": 0, + "non_truncated": 223, + "padded": 892, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_sexuality|5": { + "hashes": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "a9997011eacb1c14", + "hash_cont_tokens": "a94c1dea6d775249" + }, + "truncated": 0, + "non_truncated": 131, + "padded": 524, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-international_law|5": { + "hashes": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "5e065bb834e5eb5f", + "hash_cont_tokens": "3832f860859bb86b" + }, + "truncated": 0, + "non_truncated": 121, + "padded": 484, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-jurisprudence|5": { + "hashes": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "6694a4e4327a0eee", + "hash_cont_tokens": "9fac5a0c364fca8a" + }, + "truncated": 0, + "non_truncated": 108, + "padded": 432, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hashes": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "630193f0a85c4db4", + "hash_cont_tokens": "dc53ed31134ddf3a" + }, + "truncated": 0, + "non_truncated": 163, + "padded": 644, + "non_padded": 8, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-machine_learning|5": { + "hashes": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "481eec60fca7d379", + "hash_cont_tokens": "e272b5456d5552d6" + }, + "truncated": 0, + "non_truncated": 112, + "padded": 448, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-management|5": { + "hashes": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "5e29b566e42d5c49", + "hash_cont_tokens": "7119d4642957b1f0" + }, + "truncated": 0, + "non_truncated": 103, + "padded": 412, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-marketing|5": { + "hashes": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "abc950328f30685d", + "hash_cont_tokens": "099d58c66ece3f11" + }, + "truncated": 0, + "non_truncated": 234, + "padded": 936, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-medical_genetics|5": { + "hashes": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "7b7f0526063c20bd", + "hash_cont_tokens": "bcc22fd85dcc85e9" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-miscellaneous|5": { + "hashes": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "2f35d509e71e13d9", + "hash_cont_tokens": "bae342d4e82ba8f7" + }, + "truncated": 0, + "non_truncated": 783, + "padded": 3132, + "non_padded": 0, + "effective_few_shots": 5.0, + 
"num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_disputes|5": { + "hashes": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "a1fe66c367aec9a4", + "hash_cont_tokens": "578c64cbdbb1e0d4" + }, + "truncated": 0, + "non_truncated": 346, + "padded": 1384, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hashes": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "477794fff20bb51b", + "hash_cont_tokens": "79b25f42b3fce0f9" + }, + "truncated": 0, + "non_truncated": 895, + "padded": 3580, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-nutrition|5": { + "hashes": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "f0035147162e2914", + "hash_cont_tokens": "9d1f3b976417156c" + }, + "truncated": 0, + "non_truncated": 306, + "padded": 1224, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-philosophy|5": { + "hashes": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "afde0a4bb78262a8", + "hash_cont_tokens": "88dab560e1e06d97" + }, + "truncated": 0, + "non_truncated": 311, + "padded": 1244, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-prehistory|5": { + "hashes": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "80cbaf9c72217b9b", + "hash_cont_tokens": "04ea847139fe9393" + }, + "truncated": 0, + "non_truncated": 324, + "padded": 1296, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_accounting|5": { + "hashes": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "34fa03402fe143e2", + "hash_cont_tokens": "0435ff692ad17e68" + }, + "truncated": 0, + "non_truncated": 282, + "padded": 1124, + "non_padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_law|5": { + "hashes": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "970559d2709d7dfb", + "hash_cont_tokens": "b852c74e9f8801bd" + }, + "truncated": 0, + "non_truncated": 1534, + "padded": 6136, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_medicine|5": { + "hashes": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "e6bad9d3d227482c", + "hash_cont_tokens": "5db0f6460652d063" + }, + "truncated": 0, + "non_truncated": 272, + "padded": 1088, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_psychology|5": { + "hashes": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "5915ac075f743cd6", + "hash_cont_tokens": "c960676ef7f3dbe5" + }, + "truncated": 0, + "non_truncated": 612, + "padded": 2448, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-public_relations|5": { + "hashes": { + "hash_examples": "0d25072e1761652a", + 
"hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "abdaa0333725e504", + "hash_cont_tokens": "3320565f412c4b01" + }, + "truncated": 0, + "non_truncated": 110, + "padded": 440, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-security_studies|5": { + "hashes": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "5e5e21ce02813577", + "hash_cont_tokens": "218ed775ef60aab9" + }, + "truncated": 0, + "non_truncated": 245, + "padded": 980, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-sociology|5": { + "hashes": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "74f6e50f8da04eb6", + "hash_cont_tokens": "20babf5cc4cc7f3d" + }, + "truncated": 0, + "non_truncated": 201, + "padded": 804, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hashes": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "4234573f54827f4f", + "hash_cont_tokens": "bcc22fd85dcc85e9" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-virology|5": { + "hashes": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "d8f9c3d810f8d6f2", + "hash_cont_tokens": "dc6d57296bea0882" + }, + "truncated": 0, + "non_truncated": 166, + "padded": 664, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-world_religions|5": { + "hashes": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "a96ae58b7a2f1010", + "hash_cont_tokens": "37f53444db289ed3" + }, + "truncated": 0, + "non_truncated": 171, + "padded": 684, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|truthfulqa:mc|0": { + "hashes": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "4214b9bf45e97067", + "hash_cont_tokens": "71a67034827cd30e" + }, + "truncated": 0, + "non_truncated": 817, + "padded": 9996, + "non_padded": 0, + "effective_few_shots": 0.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "a7eeaad96f70499b", + "hash_cont_tokens": "c93e9c22fa3077a0" + }, + "truncated": 0, + "non_truncated": 1267, + "padded": 2534, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|drop|3": { + "hashes": { + "hash_examples": "1d27416e8324e9a3", + "hash_full_prompts": "a5513ff9a741b385", + "hash_input_tokens": "0e6ecbc56f7e5009", + "hash_cont_tokens": "d88a220b56cfad8e" + }, + "truncated": 1, + "non_truncated": 9535, + "padded": 0, + "non_padded": 9536, + "effective_few_shots": 3.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "d488b9ef001d40f5", + "hash_cont_tokens": "122d79a8bdb49297" + }, + "truncated": 0, + "non_truncated": 1319, + "padded": 0, + "non_padded": 1319, + "effective_few_shots": 5.0, + 
"num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "4eb459f19fc0f29d", + "hash_full_prompts": "21653ed56f202b4e", + "hash_input_tokens": "30bfead6e298fa54", + "hash_cont_tokens": "b553d25fc522e095" + }, + "truncated": 1, + "non_truncated": 38194, + "padded": 113445, + "non_padded": 10963, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/ContextualAI/archangel_sft-kto_llama13b/results_2023-12-09T20-01-05.918025.json b/eval-results/ContextualAI/archangel_sft-kto_llama13b/results_2023-12-09T20-01-05.918025.json new file mode 100644 index 0000000000000000000000000000000000000000..ab36c50609de7898a34361c0f8a5184e863b4a5f --- /dev/null +++ b/eval-results/ContextualAI/archangel_sft-kto_llama13b/results_2023-12-09T20-01-05.918025.json @@ -0,0 +1,1409 @@ +{ + "config_general": { + "lighteval_sha": "0e4607eff593f6f842aeaa0e5fa6760f58b9d1e9", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "", + "start_time": 599314.429749275, + "end_time": 606933.884851814, + "total_evaluation_time_secondes": "7619.455102539039", + "model_name": "ContextualAI/archangel_sft-kto_llama13b", + "model_sha": "d596fb0060168006360610d673c2c35edcbbf110", + "model_dtype": "torch.float16", + "model_size": "24.28 GB" + }, + "results": { + "harness|arc:challenge|25": { + "acc": 0.5264505119453925, + "acc_stderr": 0.01459093135812017, + "acc_norm": 0.5614334470989761, + "acc_norm_stderr": 0.014500682618212864 + }, + "harness|hellaswag|10": { + "acc": 0.6093407687711612, + "acc_stderr": 0.004869010152280754, + "acc_norm": 0.8080063732324239, + "acc_norm_stderr": 0.003930631369978262 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.27, + "acc_stderr": 0.044619604333847415, + "acc_norm": 0.27, + "acc_norm_stderr": 0.044619604333847415 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.4666666666666667, + "acc_stderr": 0.043097329010363554, + "acc_norm": 0.4666666666666667, + "acc_norm_stderr": 0.043097329010363554 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.46710526315789475, + "acc_stderr": 0.04060127035236395, + "acc_norm": 0.46710526315789475, + "acc_norm_stderr": 0.04060127035236395 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.47, + "acc_stderr": 0.05016135580465919, + "acc_norm": 0.47, + "acc_norm_stderr": 0.05016135580465919 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.4641509433962264, + "acc_stderr": 0.030693675018458003, + "acc_norm": 0.4641509433962264, + "acc_norm_stderr": 0.030693675018458003 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.4861111111111111, + "acc_stderr": 0.04179596617581, + "acc_norm": 0.4861111111111111, + "acc_norm_stderr": 0.04179596617581 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.32, + "acc_stderr": 0.046882617226215034, + "acc_norm": 0.32, + "acc_norm_stderr": 0.046882617226215034 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.41, + "acc_stderr": 0.049431107042371025, + "acc_norm": 0.41, + "acc_norm_stderr": 0.049431107042371025 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.37, + "acc_stderr": 0.048523658709391, + "acc_norm": 0.37, + "acc_norm_stderr": 0.048523658709391 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.41040462427745666, + "acc_stderr": 0.037507570448955356, + "acc_norm": 0.41040462427745666, + "acc_norm_stderr": 0.037507570448955356 + }, + 
"harness|hendrycksTest-college_physics|5": { + "acc": 0.19607843137254902, + "acc_stderr": 0.03950581861179963, + "acc_norm": 0.19607843137254902, + "acc_norm_stderr": 0.03950581861179963 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.61, + "acc_stderr": 0.04902071300001975, + "acc_norm": 0.61, + "acc_norm_stderr": 0.04902071300001975 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.39574468085106385, + "acc_stderr": 0.03196758697835361, + "acc_norm": 0.39574468085106385, + "acc_norm_stderr": 0.03196758697835361 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.2719298245614035, + "acc_stderr": 0.04185774424022056, + "acc_norm": 0.2719298245614035, + "acc_norm_stderr": 0.04185774424022056 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.43448275862068964, + "acc_stderr": 0.041307408795554966, + "acc_norm": 0.43448275862068964, + "acc_norm_stderr": 0.041307408795554966 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.2619047619047619, + "acc_stderr": 0.02264421261552521, + "acc_norm": 0.2619047619047619, + "acc_norm_stderr": 0.02264421261552521 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.3333333333333333, + "acc_stderr": 0.042163702135578345, + "acc_norm": 0.3333333333333333, + "acc_norm_stderr": 0.042163702135578345 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.37, + "acc_stderr": 0.04852365870939099, + "acc_norm": 0.37, + "acc_norm_stderr": 0.04852365870939099 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.5225806451612903, + "acc_stderr": 0.028414985019707868, + "acc_norm": 0.5225806451612903, + "acc_norm_stderr": 0.028414985019707868 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.28078817733990147, + "acc_stderr": 0.0316185633535861, + "acc_norm": 0.28078817733990147, + "acc_norm_stderr": 0.0316185633535861 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.51, + "acc_stderr": 0.05024183937956912, + "acc_norm": 0.51, + "acc_norm_stderr": 0.05024183937956912 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.6121212121212121, + "acc_stderr": 0.038049136539710114, + "acc_norm": 0.6121212121212121, + "acc_norm_stderr": 0.038049136539710114 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.5454545454545454, + "acc_stderr": 0.03547601494006937, + "acc_norm": 0.5454545454545454, + "acc_norm_stderr": 0.03547601494006937 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.6632124352331606, + "acc_stderr": 0.03410780251836183, + "acc_norm": 0.6632124352331606, + "acc_norm_stderr": 0.03410780251836183 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.4666666666666667, + "acc_stderr": 0.025294608023986472, + "acc_norm": 0.4666666666666667, + "acc_norm_stderr": 0.025294608023986472 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.25925925925925924, + "acc_stderr": 0.026719240783712173, + "acc_norm": 0.25925925925925924, + "acc_norm_stderr": 0.026719240783712173 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.4579831932773109, + "acc_stderr": 0.03236361111951941, + "acc_norm": 0.4579831932773109, + "acc_norm_stderr": 0.03236361111951941 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.304635761589404, + "acc_stderr": 0.03757949922943342, + "acc_norm": 0.304635761589404, + "acc_norm_stderr": 0.03757949922943342 + }, + 
"harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.618348623853211, + "acc_stderr": 0.020828148517022582, + "acc_norm": 0.618348623853211, + "acc_norm_stderr": 0.020828148517022582 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.2916666666666667, + "acc_stderr": 0.03099866630456052, + "acc_norm": 0.2916666666666667, + "acc_norm_stderr": 0.03099866630456052 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.5833333333333334, + "acc_stderr": 0.03460228327239171, + "acc_norm": 0.5833333333333334, + "acc_norm_stderr": 0.03460228327239171 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.6919831223628692, + "acc_stderr": 0.0300523893356057, + "acc_norm": 0.6919831223628692, + "acc_norm_stderr": 0.0300523893356057 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.5291479820627802, + "acc_stderr": 0.03350073248773403, + "acc_norm": 0.5291479820627802, + "acc_norm_stderr": 0.03350073248773403 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.5801526717557252, + "acc_stderr": 0.04328577215262971, + "acc_norm": 0.5801526717557252, + "acc_norm_stderr": 0.04328577215262971 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.6363636363636364, + "acc_stderr": 0.043913262867240704, + "acc_norm": 0.6363636363636364, + "acc_norm_stderr": 0.043913262867240704 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.5185185185185185, + "acc_stderr": 0.04830366024635331, + "acc_norm": 0.5185185185185185, + "acc_norm_stderr": 0.04830366024635331 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.5214723926380368, + "acc_stderr": 0.03924746876751129, + "acc_norm": 0.5214723926380368, + "acc_norm_stderr": 0.03924746876751129 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.30357142857142855, + "acc_stderr": 0.04364226155841044, + "acc_norm": 0.30357142857142855, + "acc_norm_stderr": 0.04364226155841044 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.6699029126213593, + "acc_stderr": 0.0465614711001235, + "acc_norm": 0.6699029126213593, + "acc_norm_stderr": 0.0465614711001235 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.7307692307692307, + "acc_stderr": 0.029058588303748842, + "acc_norm": 0.7307692307692307, + "acc_norm_stderr": 0.029058588303748842 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.55, + "acc_stderr": 0.04999999999999999, + "acc_norm": 0.55, + "acc_norm_stderr": 0.04999999999999999 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.6615581098339719, + "acc_stderr": 0.016920869586210675, + "acc_norm": 0.6615581098339719, + "acc_norm_stderr": 0.016920869586210675 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.5144508670520231, + "acc_stderr": 0.02690784985628254, + "acc_norm": 0.5144508670520231, + "acc_norm_stderr": 0.02690784985628254 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.2916201117318436, + "acc_stderr": 0.015201032512520436, + "acc_norm": 0.2916201117318436, + "acc_norm_stderr": 0.015201032512520436 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.5130718954248366, + "acc_stderr": 0.028620130800700246, + "acc_norm": 0.5130718954248366, + "acc_norm_stderr": 0.028620130800700246 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.5498392282958199, + "acc_stderr": 0.028256660723360173, + "acc_norm": 0.5498392282958199, + "acc_norm_stderr": 0.028256660723360173 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.5154320987654321, + "acc_stderr": 
0.02780749004427619, + "acc_norm": 0.5154320987654321, + "acc_norm_stderr": 0.02780749004427619 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.34397163120567376, + "acc_stderr": 0.028338017428611324, + "acc_norm": 0.34397163120567376, + "acc_norm_stderr": 0.028338017428611324 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.37614080834419816, + "acc_stderr": 0.012372214430599814, + "acc_norm": 0.37614080834419816, + "acc_norm_stderr": 0.012372214430599814 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.5147058823529411, + "acc_stderr": 0.03035969707904611, + "acc_norm": 0.5147058823529411, + "acc_norm_stderr": 0.03035969707904611 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.4820261437908497, + "acc_stderr": 0.020214761037872404, + "acc_norm": 0.4820261437908497, + "acc_norm_stderr": 0.020214761037872404 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.6, + "acc_stderr": 0.0469237132203465, + "acc_norm": 0.6, + "acc_norm_stderr": 0.0469237132203465 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.5387755102040817, + "acc_stderr": 0.031912820526692774, + "acc_norm": 0.5387755102040817, + "acc_norm_stderr": 0.031912820526692774 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.6069651741293532, + "acc_stderr": 0.0345368246603156, + "acc_norm": 0.6069651741293532, + "acc_norm_stderr": 0.0345368246603156 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.76, + "acc_stderr": 0.042923469599092816, + "acc_norm": 0.76, + "acc_norm_stderr": 0.042923469599092816 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.4457831325301205, + "acc_stderr": 0.03869543323472101, + "acc_norm": 0.4457831325301205, + "acc_norm_stderr": 0.03869543323472101 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.695906432748538, + "acc_stderr": 0.0352821125824523, + "acc_norm": 0.695906432748538, + "acc_norm_stderr": 0.0352821125824523 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.26193390452876375, + "mc1_stderr": 0.015392118805015023, + "mc2": 0.39418229629364515, + "mc2_stderr": 0.013748123967336172 + }, + "harness|winogrande|5": { + "acc": 0.7616416732438832, + "acc_stderr": 0.011974948667702311 + }, + "harness|gsm8k|5": { + "acc": 0.1683093252463988, + "acc_stderr": 0.010305695358125522 + }, + "all": { + "acc": 0.4808497396801513, + "acc_stderr": 0.0342816178342491, + "acc_norm": 0.48534799426464065, + "acc_norm_stderr": 0.03504863417527385, + "mc1": 0.26193390452876375, + "mc1_stderr": 0.015392118805015023, + "mc2": 0.39418229629364515, + "mc2_stderr": 0.013748123967336172 + } + }, + "versions": { + "all": 0, + "harness|arc:challenge|25": 0, + "harness|gsm8k|5": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 
1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "harness|winogrande|5": 0 + }, + "config_tasks": { + "harness|arc:challenge": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + 
"harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|arc:challenge|25": { + "hashes": { + "hash_examples": "17b0cae357c0259e", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "c2d55d68c4441c39", + "hash_cont_tokens": "e8abf848493b50f7" + }, + "truncated": 0, + "non_truncated": 1172, + "padded": 4687, + "non_padded": 0, + "effective_few_shots": 25.0, + "num_truncated_few_shots": 0 + }, + "harness|hellaswag|10": { + "hashes": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "38dc8458e001ab84", + "hash_cont_tokens": "9fe0a5c42e1532db" + }, + "truncated": 0, + "non_truncated": 10042, + "padded": 40019, + "non_padded": 149, + "effective_few_shots": 10.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hashes": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "8ff523ec326d5d55", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + 
"effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-anatomy|5": { + "hashes": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "742bd6a389a8ef40", + "hash_cont_tokens": "f11971a765cb609f" + }, + "truncated": 0, + "non_truncated": 135, + "padded": 540, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-astronomy|5": { + "hashes": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "aa9743839c83bd9f", + "hash_cont_tokens": "440a970fadecdc7b" + }, + "truncated": 0, + "non_truncated": 152, + "padded": 608, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-business_ethics|5": { + "hashes": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "60f6ed52e2a2987a", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hashes": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "6080d9f3c5930be0", + "hash_cont_tokens": "7ecd60c25b9bfe5b" + }, + "truncated": 0, + "non_truncated": 265, + "padded": 1060, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_biology|5": { + "hashes": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "873319724ad65589", + "hash_cont_tokens": "875cde3af7a0ee14" + }, + "truncated": 0, + "non_truncated": 144, + "padded": 564, + "non_padded": 12, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_chemistry|5": { + "hashes": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "8366d04d12b154a7", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_computer_science|5": { + "hashes": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "1724a282fb269fd7", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_mathematics|5": { + "hashes": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "b7aa815781eae172", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_medicine|5": { + "hashes": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "5e69bf9422c979cd", + "hash_cont_tokens": "702fb6d82ff0d6ac" + }, + "truncated": 0, + "non_truncated": 173, + "padded": 692, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_physics|5": { + "hashes": { + "hash_examples": "875dd26d22655b0d", 
+ "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "32b28762dd077c78", + "hash_cont_tokens": "f7b8097afc16a47c" + }, + "truncated": 0, + "non_truncated": 102, + "padded": 404, + "non_padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-computer_security|5": { + "hashes": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "19dd0e1895125d49", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hashes": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "761c7ce187b3338a", + "hash_cont_tokens": "aa0e8bc655f2f641" + }, + "truncated": 0, + "non_truncated": 235, + "padded": 940, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-econometrics|5": { + "hashes": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "dae74024ebc12b2b", + "hash_cont_tokens": "b1cc6e7e9fcd3827" + }, + "truncated": 0, + "non_truncated": 114, + "padded": 456, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hashes": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "5fa8050688a246ed", + "hash_cont_tokens": "2425a3f084a591ef" + }, + "truncated": 0, + "non_truncated": 145, + "padded": 580, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hashes": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "2da3f8d7d1515cc6", + "hash_cont_tokens": "bd87bf0c060fd925" + }, + "truncated": 0, + "non_truncated": 378, + "padded": 1512, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-formal_logic|5": { + "hashes": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "907de61bbe46dada", + "hash_cont_tokens": "eb8932890e0605db" + }, + "truncated": 0, + "non_truncated": 126, + "padded": 504, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-global_facts|5": { + "hashes": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "d7549fe9ac133643", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_biology|5": { + "hashes": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "b449ae8cd622fb96", + "hash_cont_tokens": "1ddcb86d28cde266" + }, + "truncated": 0, + "non_truncated": 310, + "padded": 1240, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hashes": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "a447bd1574b5e26c", + "hash_cont_tokens": "176c8dcff38c5f8f" + }, + "truncated": 
0, + "non_truncated": 203, + "padded": 812, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hashes": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "55065fe953492209", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hashes": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "f1f73dd687da18d7", + "hash_cont_tokens": "674fc454bdc5ac93" + }, + "truncated": 660, + "non_truncated": -495, + "padded": 0, + "non_padded": 660, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_geography|5": { + "hashes": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "b4f9efd054b0149d", + "hash_cont_tokens": "03a5012b916274ea" + }, + "truncated": 0, + "non_truncated": 198, + "padded": 792, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hashes": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "6e010d01707b5a01", + "hash_cont_tokens": "873d2aab226ba1d8" + }, + "truncated": 0, + "non_truncated": 193, + "padded": 772, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hashes": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "fc1f6e824ba386d7", + "hash_cont_tokens": "c583432ad27fcfe0" + }, + "truncated": 0, + "non_truncated": 390, + "padded": 1560, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hashes": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "3a485a40c8432ece", + "hash_cont_tokens": "d7907b61bcb8c123" + }, + "truncated": 0, + "non_truncated": 270, + "padded": 1080, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hashes": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "a7dd9ca4bbda3752", + "hash_cont_tokens": "f47f041de50333b9" + }, + "truncated": 0, + "non_truncated": 238, + "padded": 952, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_physics|5": { + "hashes": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "d7ea631399a73865", + "hash_cont_tokens": "0d56317b3e5eedb5" + }, + "truncated": 0, + "non_truncated": 151, + "padded": 604, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hashes": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "d12816cf88146011", + "hash_cont_tokens": "09ba1243e7390c0f" + }, + "truncated": 0, + "non_truncated": 545, + "padded": 2180, + "non_padded": 0, + 
"effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hashes": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "0903f3aba4ea094f", + "hash_cont_tokens": "9cc29889c3d3f77d" + }, + "truncated": 0, + "non_truncated": 216, + "padded": 864, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hashes": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "50c9ff438c85a69e", + "hash_cont_tokens": "cdd0b3dc06d933e5" + }, + "truncated": 816, + "non_truncated": -612, + "padded": 0, + "non_padded": 816, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hashes": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "054824cc474caef5", + "hash_cont_tokens": "e02816433ff28daf" + }, + "truncated": 8, + "non_truncated": 229, + "padded": 940, + "non_padded": 8, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_aging|5": { + "hashes": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "84157fee0b6d0f3c", + "hash_cont_tokens": "142a4a8a1138a214" + }, + "truncated": 0, + "non_truncated": 223, + "padded": 892, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_sexuality|5": { + "hashes": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "ade303e1ae3c016f", + "hash_cont_tokens": "bc54813e809b796d" + }, + "truncated": 0, + "non_truncated": 131, + "padded": 524, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-international_law|5": { + "hashes": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "e5482e1c23c23d35", + "hash_cont_tokens": "8ea8c5ff76a15bca" + }, + "truncated": 0, + "non_truncated": 121, + "padded": 484, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-jurisprudence|5": { + "hashes": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "4415eeb9bad0507b", + "hash_cont_tokens": "e3a8cd951b6e3469" + }, + "truncated": 0, + "non_truncated": 108, + "padded": 432, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hashes": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "e6b5271422ecbaa8", + "hash_cont_tokens": "3e9e0bdc248fd88a" + }, + "truncated": 0, + "non_truncated": 163, + "padded": 644, + "non_padded": 8, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-machine_learning|5": { + "hashes": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "e719cb83196977d8", + "hash_cont_tokens": "55b12fb138c6a064" + }, + "truncated": 0, + "non_truncated": 112, + "padded": 448, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-management|5": { + "hashes": { + "hash_examples": 
"8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "155da0e62b39e804", + "hash_cont_tokens": "a01d6d39a83c4597" + }, + "truncated": 0, + "non_truncated": 103, + "padded": 412, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-marketing|5": { + "hashes": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "38466c242259e6d3", + "hash_cont_tokens": "6aeaed4d823c98aa" + }, + "truncated": 0, + "non_truncated": 234, + "padded": 932, + "non_padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-medical_genetics|5": { + "hashes": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "0dd129e92538a7f6", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-miscellaneous|5": { + "hashes": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "d108a883fc3e022f", + "hash_cont_tokens": "9b0ab02a64603081" + }, + "truncated": 0, + "non_truncated": 783, + "padded": 3132, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_disputes|5": { + "hashes": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "0e7b7df82884a2d5", + "hash_cont_tokens": "3b8bbe9108e55ce9" + }, + "truncated": 0, + "non_truncated": 346, + "padded": 1364, + "non_padded": 20, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hashes": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "7c220f5613cd8426", + "hash_cont_tokens": "3e9bfc0362e97330" + }, + "truncated": 0, + "non_truncated": 895, + "padded": 3580, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-nutrition|5": { + "hashes": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "35de1609a9a763a9", + "hash_cont_tokens": "23b2dc6ee2da4cfc" + }, + "truncated": 0, + "non_truncated": 306, + "padded": 1224, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-philosophy|5": { + "hashes": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "a1dcfa9c80490d06", + "hash_cont_tokens": "9f6ff69d23a48783" + }, + "truncated": 0, + "non_truncated": 311, + "padded": 1244, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-prehistory|5": { + "hashes": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "a091cf645d2415e0", + "hash_cont_tokens": "d6458d743d875837" + }, + "truncated": 0, + "non_truncated": 324, + "padded": 1296, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_accounting|5": { + "hashes": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "e9df32a33f85290c", + "hash_cont_tokens": "922a195f53a35662" + }, + "truncated": 0, + 
"non_truncated": 282, + "padded": 1128, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_law|5": { + "hashes": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "9178e10bd0763ec4", + "hash_cont_tokens": "2e590029ef41fbcd" + }, + "truncated": 604, + "non_truncated": 930, + "padded": 5524, + "non_padded": 612, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_medicine|5": { + "hashes": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "f5a22012a54f70ea", + "hash_cont_tokens": "7cfee54dbddd5a98" + }, + "truncated": 0, + "non_truncated": 272, + "padded": 1088, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_psychology|5": { + "hashes": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "0f6a92c3a2062b48", + "hash_cont_tokens": "a86677b2a45c20e1" + }, + "truncated": 0, + "non_truncated": 612, + "padded": 2448, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-public_relations|5": { + "hashes": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "29a08e9bfbe9b2f0", + "hash_cont_tokens": "0d756ccaae031757" + }, + "truncated": 0, + "non_truncated": 110, + "padded": 440, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-security_studies|5": { + "hashes": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "d49711415961ced7", + "hash_cont_tokens": "b2229bc2cfbf594b" + }, + "truncated": 0, + "non_truncated": 245, + "padded": 980, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-sociology|5": { + "hashes": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "1de5c52d2b2831d7", + "hash_cont_tokens": "c3a3bdfd177eed5b" + }, + "truncated": 0, + "non_truncated": 201, + "padded": 800, + "non_padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hashes": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "add924961f7f4146", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-virology|5": { + "hashes": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "e0653601c466b1bc", + "hash_cont_tokens": "af8b3658088cb37f" + }, + "truncated": 0, + "non_truncated": 166, + "padded": 664, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-world_religions|5": { + "hashes": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "ac600d612445156d", + "hash_cont_tokens": "060118bef6de4e0a" + }, + "truncated": 0, + "non_truncated": 171, + "padded": 684, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|truthfulqa:mc|0": { + 
"hashes": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "a03ce28b7fd06aa7", + "hash_cont_tokens": "f5da56a132aab151" + }, + "truncated": 0, + "non_truncated": 817, + "padded": 9996, + "non_padded": 0, + "effective_few_shots": 0.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "72067255e368e24e", + "hash_cont_tokens": "f08975ad6f2d5864" + }, + "truncated": 0, + "non_truncated": 1267, + "padded": 2534, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "bda342e47b5099b2", + "hash_cont_tokens": "efcabb2197e1e282" + }, + "truncated": 0, + "non_truncated": 1319, + "padded": 0, + "non_padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "3b7fa57a057f9415", + "hash_full_prompts": "63615fc50fc9417c", + "hash_input_tokens": "08c39bfaff1d11e0", + "hash_cont_tokens": "eeb74a16fb1a6320" + }, + "truncated": 2088, + "non_truncated": 26571, + "padded": 111256, + "non_padded": 3616, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/Delcos/Mistral-Pygmalion-7b/results_2023-10-10T20-14-17.715432.json b/eval-results/Delcos/Mistral-Pygmalion-7b/results_2023-10-10T20-14-17.715432.json new file mode 100644 index 0000000000000000000000000000000000000000..891ab0f65225b4e92f6368d6e049d70a1cad2117 --- /dev/null +++ b/eval-results/Delcos/Mistral-Pygmalion-7b/results_2023-10-10T20-14-17.715432.json @@ -0,0 +1,1367 @@ +{ + "config_general": { + "model_name": "Delcos/Mistral-Pygmalion-7b", + "model_sha": "4e5fa9ae7f572b4841b02c3f96d8a3c7a7e59521", + "model_size": "12.61 GB", + "model_dtype": "torch.float16", + "lighteval_sha": "0f318ecf002208468154899217b3ba7c6ae09374", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|arc:challenge|25": { + "acc": 0.5017064846416383, + "acc_stderr": 0.014611305705056987, + "acc_norm": 0.5443686006825939, + "acc_norm_stderr": 0.014553749939306863 + }, + "harness|hellaswag|10": { + "acc": 0.5864369647480582, + "acc_stderr": 0.004914655063329499, + "acc_norm": 0.7848038239394542, + "acc_norm_stderr": 0.00410118487096418 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.32, + "acc_stderr": 0.046882617226215034, + "acc_norm": 0.32, + "acc_norm_stderr": 0.046882617226215034 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.5111111111111111, + "acc_stderr": 0.04318275491977976, + "acc_norm": 0.5111111111111111, + "acc_norm_stderr": 0.04318275491977976 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.42105263157894735, + "acc_stderr": 0.04017901275981749, + "acc_norm": 0.42105263157894735, + "acc_norm_stderr": 0.04017901275981749 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.53, + "acc_stderr": 0.050161355804659205, + "acc_norm": 0.53, + "acc_norm_stderr": 0.050161355804659205 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.5169811320754717, + "acc_stderr": 0.030755120364119905, + "acc_norm": 0.5169811320754717, + "acc_norm_stderr": 0.030755120364119905 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.5, + "acc_stderr": 
0.04181210050035455, + "acc_norm": 0.5, + "acc_norm_stderr": 0.04181210050035455 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.32, + "acc_stderr": 0.046882617226215034, + "acc_norm": 0.32, + "acc_norm_stderr": 0.046882617226215034 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.37, + "acc_stderr": 0.048523658709391, + "acc_norm": 0.37, + "acc_norm_stderr": 0.048523658709391 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.33, + "acc_stderr": 0.047258156262526045, + "acc_norm": 0.33, + "acc_norm_stderr": 0.047258156262526045 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.4277456647398844, + "acc_stderr": 0.037724468575180255, + "acc_norm": 0.4277456647398844, + "acc_norm_stderr": 0.037724468575180255 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.1568627450980392, + "acc_stderr": 0.036186648199362466, + "acc_norm": 0.1568627450980392, + "acc_norm_stderr": 0.036186648199362466 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.6, + "acc_stderr": 0.04923659639173309, + "acc_norm": 0.6, + "acc_norm_stderr": 0.04923659639173309 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.4297872340425532, + "acc_stderr": 0.03236214467715563, + "acc_norm": 0.4297872340425532, + "acc_norm_stderr": 0.03236214467715563 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.2807017543859649, + "acc_stderr": 0.042270544512322004, + "acc_norm": 0.2807017543859649, + "acc_norm_stderr": 0.042270544512322004 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.5172413793103449, + "acc_stderr": 0.04164188720169375, + "acc_norm": 0.5172413793103449, + "acc_norm_stderr": 0.04164188720169375 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.30952380952380953, + "acc_stderr": 0.023809523809523853, + "acc_norm": 0.30952380952380953, + "acc_norm_stderr": 0.023809523809523853 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.29365079365079366, + "acc_stderr": 0.040735243221471255, + "acc_norm": 0.29365079365079366, + "acc_norm_stderr": 0.040735243221471255 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.37, + "acc_stderr": 0.048523658709391, + "acc_norm": 0.37, + "acc_norm_stderr": 0.048523658709391 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.5290322580645161, + "acc_stderr": 0.028396016402761, + "acc_norm": 0.5290322580645161, + "acc_norm_stderr": 0.028396016402761 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.3694581280788177, + "acc_stderr": 0.033959703819985726, + "acc_norm": 0.3694581280788177, + "acc_norm_stderr": 0.033959703819985726 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.46, + "acc_stderr": 0.05009082659620332, + "acc_norm": 0.46, + "acc_norm_stderr": 0.05009082659620332 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.6181818181818182, + "acc_stderr": 0.037937131711656344, + "acc_norm": 0.6181818181818182, + "acc_norm_stderr": 0.037937131711656344 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.5959595959595959, + "acc_stderr": 0.03496130972056128, + "acc_norm": 0.5959595959595959, + "acc_norm_stderr": 0.03496130972056128 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.7305699481865285, + "acc_stderr": 0.03201867122877794, + "acc_norm": 0.7305699481865285, + "acc_norm_stderr": 0.03201867122877794 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 
0.4666666666666667, + "acc_stderr": 0.02529460802398647, + "acc_norm": 0.4666666666666667, + "acc_norm_stderr": 0.02529460802398647 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.28888888888888886, + "acc_stderr": 0.027634907264178544, + "acc_norm": 0.28888888888888886, + "acc_norm_stderr": 0.027634907264178544 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.4411764705882353, + "acc_stderr": 0.0322529423239964, + "acc_norm": 0.4411764705882353, + "acc_norm_stderr": 0.0322529423239964 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.304635761589404, + "acc_stderr": 0.03757949922943342, + "acc_norm": 0.304635761589404, + "acc_norm_stderr": 0.03757949922943342 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.6605504587155964, + "acc_stderr": 0.02030210934266235, + "acc_norm": 0.6605504587155964, + "acc_norm_stderr": 0.02030210934266235 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.3611111111111111, + "acc_stderr": 0.03275773486100999, + "acc_norm": 0.3611111111111111, + "acc_norm_stderr": 0.03275773486100999 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.6715686274509803, + "acc_stderr": 0.032962451101722294, + "acc_norm": 0.6715686274509803, + "acc_norm_stderr": 0.032962451101722294 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.7088607594936709, + "acc_stderr": 0.029571601065753374, + "acc_norm": 0.7088607594936709, + "acc_norm_stderr": 0.029571601065753374 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.5739910313901345, + "acc_stderr": 0.0331883328621728, + "acc_norm": 0.5739910313901345, + "acc_norm_stderr": 0.0331883328621728 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.5877862595419847, + "acc_stderr": 0.04317171194870255, + "acc_norm": 0.5877862595419847, + "acc_norm_stderr": 0.04317171194870255 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.6198347107438017, + "acc_stderr": 0.04431324501968431, + "acc_norm": 0.6198347107438017, + "acc_norm_stderr": 0.04431324501968431 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.5555555555555556, + "acc_stderr": 0.04803752235190193, + "acc_norm": 0.5555555555555556, + "acc_norm_stderr": 0.04803752235190193 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.558282208588957, + "acc_stderr": 0.03901591825836184, + "acc_norm": 0.558282208588957, + "acc_norm_stderr": 0.03901591825836184 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.38392857142857145, + "acc_stderr": 0.04616143075028547, + "acc_norm": 0.38392857142857145, + "acc_norm_stderr": 0.04616143075028547 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.6019417475728155, + "acc_stderr": 0.048467482539772386, + "acc_norm": 0.6019417475728155, + "acc_norm_stderr": 0.048467482539772386 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.7136752136752137, + "acc_stderr": 0.02961432369045665, + "acc_norm": 0.7136752136752137, + "acc_norm_stderr": 0.02961432369045665 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.55, + "acc_stderr": 0.04999999999999999, + "acc_norm": 0.55, + "acc_norm_stderr": 0.04999999999999999 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.6819923371647509, + "acc_stderr": 0.01665348627561539, + "acc_norm": 0.6819923371647509, + "acc_norm_stderr": 0.01665348627561539 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.5346820809248555, + "acc_stderr": 0.02685425792825888, + 
"acc_norm": 0.5346820809248555, + "acc_norm_stderr": 0.02685425792825888 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.2435754189944134, + "acc_stderr": 0.014355911964767867, + "acc_norm": 0.2435754189944134, + "acc_norm_stderr": 0.014355911964767867 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.5294117647058824, + "acc_stderr": 0.028580341065138296, + "acc_norm": 0.5294117647058824, + "acc_norm_stderr": 0.028580341065138296 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.6237942122186495, + "acc_stderr": 0.027513925683549434, + "acc_norm": 0.6237942122186495, + "acc_norm_stderr": 0.027513925683549434 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.5246913580246914, + "acc_stderr": 0.02778680093142745, + "acc_norm": 0.5246913580246914, + "acc_norm_stderr": 0.02778680093142745 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.3829787234042553, + "acc_stderr": 0.02899908090480618, + "acc_norm": 0.3829787234042553, + "acc_norm_stderr": 0.02899908090480618 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.36962190352020863, + "acc_stderr": 0.012328445778575253, + "acc_norm": 0.36962190352020863, + "acc_norm_stderr": 0.012328445778575253 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.5441176470588235, + "acc_stderr": 0.030254372573976715, + "acc_norm": 0.5441176470588235, + "acc_norm_stderr": 0.030254372573976715 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.4820261437908497, + "acc_stderr": 0.020214761037872408, + "acc_norm": 0.4820261437908497, + "acc_norm_stderr": 0.020214761037872408 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.5818181818181818, + "acc_stderr": 0.04724577405731572, + "acc_norm": 0.5818181818181818, + "acc_norm_stderr": 0.04724577405731572 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.5510204081632653, + "acc_stderr": 0.03184213866687579, + "acc_norm": 0.5510204081632653, + "acc_norm_stderr": 0.03184213866687579 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.681592039800995, + "acc_stderr": 0.03294118479054095, + "acc_norm": 0.681592039800995, + "acc_norm_stderr": 0.03294118479054095 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.67, + "acc_stderr": 0.04725815626252609, + "acc_norm": 0.67, + "acc_norm_stderr": 0.04725815626252609 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.39759036144578314, + "acc_stderr": 0.038099730845402184, + "acc_norm": 0.39759036144578314, + "acc_norm_stderr": 0.038099730845402184 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.7076023391812866, + "acc_stderr": 0.03488647713457922, + "acc_norm": 0.7076023391812866, + "acc_norm_stderr": 0.03488647713457922 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.2778457772337821, + "mc1_stderr": 0.01568092936402465, + "mc2": 0.41821115723032093, + "mc2_stderr": 0.013974820403469736 + }, + "all": { + "acc": 0.4940842782473963, + "acc_stderr": 0.03510437959075512, + "acc_norm": 0.4981695151157412, + "acc_norm_stderr": 0.03508961643892265, + "mc1": 0.2778457772337821, + "mc1_stderr": 0.01568092936402465, + "mc2": 0.41821115723032093, + "mc2_stderr": 0.013974820403469736 + } + }, + "versions": { + "harness|arc:challenge|25": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + 
"harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "all": 0 + }, + "config_tasks": { + "harness|arc:challenge": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + 
"harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task" + }, + "summary_tasks": { + "harness|arc:challenge|25": { + "hashes": { + "hash_examples": "17b0cae357c0259e", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "3722289b79076c44", + "hash_cont_tokens": "e8abf848493b50f7" + }, + "truncated": 0, + "non-truncated": 4687, + "padded": 4687, + "non-padded": 0, + "effective_few_shots": 25.0, + "num_truncated_few_shots": 0 + }, + "harness|hellaswag|10": { + "hashes": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "ececd684171f1ef2", + "hash_cont_tokens": "9fe0a5c42e1532db" + }, 
+ "truncated": 0, + "non-truncated": 40168, + "padded": 40113, + "non-padded": 55, + "effective_few_shots": 10.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hashes": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "c54ff61ad0273dd7", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-anatomy|5": { + "hashes": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "be31a1e22aef5f90", + "hash_cont_tokens": "f11971a765cb609f" + }, + "truncated": 0, + "non-truncated": 540, + "padded": 540, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-astronomy|5": { + "hashes": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "277a7b1fad566940", + "hash_cont_tokens": "440a970fadecdc7b" + }, + "truncated": 0, + "non-truncated": 608, + "padded": 608, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-business_ethics|5": { + "hashes": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "ba552605bc116de5", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hashes": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "428c7563d0b98ab9", + "hash_cont_tokens": "7ecd60c25b9bfe5b" + }, + "truncated": 0, + "non-truncated": 1060, + "padded": 1060, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_biology|5": { + "hashes": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "da036601573942e2", + "hash_cont_tokens": "875cde3af7a0ee14" + }, + "truncated": 0, + "non-truncated": 576, + "padded": 576, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_chemistry|5": { + "hashes": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "94e0196d6aded13d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_computer_science|5": { + "hashes": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "6e4d0f4a8d36690b", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_mathematics|5": { + "hashes": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "614054d17109a25d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + 
"harness|hendrycksTest-college_medicine|5": { + "hashes": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "081bb2b524defd1c", + "hash_cont_tokens": "702fb6d82ff0d6ac" + }, + "truncated": 0, + "non-truncated": 692, + "padded": 692, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_physics|5": { + "hashes": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "5421d9a1af86cbd4", + "hash_cont_tokens": "f7b8097afc16a47c" + }, + "truncated": 0, + "non-truncated": 408, + "padded": 408, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-computer_security|5": { + "hashes": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "5e6b70ecb333cf18", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hashes": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "c2ef11a87264ceed", + "hash_cont_tokens": "aa0e8bc655f2f641" + }, + "truncated": 0, + "non-truncated": 940, + "padded": 940, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-econometrics|5": { + "hashes": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "ecaccd912a4c3978", + "hash_cont_tokens": "b1cc6e7e9fcd3827" + }, + "truncated": 0, + "non-truncated": 456, + "padded": 456, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hashes": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "1590c84291399be8", + "hash_cont_tokens": "2425a3f084a591ef" + }, + "truncated": 0, + "non-truncated": 580, + "padded": 580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hashes": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "3269597f715b0da1", + "hash_cont_tokens": "bd87bf0c060fd925" + }, + "truncated": 0, + "non-truncated": 1512, + "padded": 1512, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-formal_logic|5": { + "hashes": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "a2800d20f3ab8d7c", + "hash_cont_tokens": "eb8932890e0605db" + }, + "truncated": 0, + "non-truncated": 504, + "padded": 504, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-global_facts|5": { + "hashes": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "94ed44b3772505ad", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_biology|5": { + "hashes": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + 
"hash_input_tokens": "24423acb928db768", + "hash_cont_tokens": "1ddcb86d28cde266" + }, + "truncated": 0, + "non-truncated": 1240, + "padded": 1240, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hashes": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "831ff35c474e5cef", + "hash_cont_tokens": "176c8dcff38c5f8f" + }, + "truncated": 0, + "non-truncated": 812, + "padded": 812, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hashes": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "a20a96b44dcc5b30", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hashes": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "5002f4ac8b1562ca", + "hash_cont_tokens": "674fc454bdc5ac93" + }, + "truncated": 0, + "non-truncated": 660, + "padded": 656, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_geography|5": { + "hashes": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "7c5547c7da5bc793", + "hash_cont_tokens": "03a5012b916274ea" + }, + "truncated": 0, + "non-truncated": 792, + "padded": 792, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hashes": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "f62991cb6a496b05", + "hash_cont_tokens": "873d2aab226ba1d8" + }, + "truncated": 0, + "non-truncated": 772, + "padded": 772, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hashes": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "4cef2aff6e3d59ed", + "hash_cont_tokens": "c583432ad27fcfe0" + }, + "truncated": 0, + "non-truncated": 1560, + "padded": 1560, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hashes": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "6e2577ea4082ed2b", + "hash_cont_tokens": "d7907b61bcb8c123" + }, + "truncated": 0, + "non-truncated": 1080, + "padded": 1080, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hashes": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "c5fc9aeb1079c8e4", + "hash_cont_tokens": "f47f041de50333b9" + }, + "truncated": 0, + "non-truncated": 952, + "padded": 952, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_physics|5": { + "hashes": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "555fc385cffa84ca", + "hash_cont_tokens": 
"0d56317b3e5eedb5" + }, + "truncated": 0, + "non-truncated": 604, + "padded": 604, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hashes": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "febd23cbf9973b7f", + "hash_cont_tokens": "09ba1243e7390c0f" + }, + "truncated": 0, + "non-truncated": 2180, + "padded": 2180, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hashes": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "400e55b56ee6fbd7", + "hash_cont_tokens": "9cc29889c3d3f77d" + }, + "truncated": 0, + "non-truncated": 864, + "padded": 864, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hashes": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "c639cce12a46ebad", + "hash_cont_tokens": "cdd0b3dc06d933e5" + }, + "truncated": 0, + "non-truncated": 816, + "padded": 816, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hashes": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "b9762065cce6f3a6", + "hash_cont_tokens": "e02816433ff28daf" + }, + "truncated": 0, + "non-truncated": 948, + "padded": 948, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_aging|5": { + "hashes": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "541a75f071dcf579", + "hash_cont_tokens": "142a4a8a1138a214" + }, + "truncated": 0, + "non-truncated": 892, + "padded": 892, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_sexuality|5": { + "hashes": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "04269e5c5a257dd9", + "hash_cont_tokens": "bc54813e809b796d" + }, + "truncated": 0, + "non-truncated": 524, + "padded": 524, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-international_law|5": { + "hashes": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "d93ba9d9d38e4397", + "hash_cont_tokens": "8ea8c5ff76a15bca" + }, + "truncated": 0, + "non-truncated": 484, + "padded": 484, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-jurisprudence|5": { + "hashes": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "9eeaccd2698b4f5a", + "hash_cont_tokens": "e3a8cd951b6e3469" + }, + "truncated": 0, + "non-truncated": 432, + "padded": 432, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hashes": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "b4f08f544f2b7576", + "hash_cont_tokens": "3e9e0bdc248fd88a" + }, + "truncated": 0, + "non-truncated": 652, + "padded": 648, + "non-padded": 4, + "effective_few_shots": 5.0, + 
"num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-machine_learning|5": { + "hashes": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "900c2a51f1174b9f", + "hash_cont_tokens": "55b12fb138c6a064" + }, + "truncated": 0, + "non-truncated": 448, + "padded": 448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-management|5": { + "hashes": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "6b36efb4689c6eca", + "hash_cont_tokens": "a01d6d39a83c4597" + }, + "truncated": 0, + "non-truncated": 412, + "padded": 412, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-marketing|5": { + "hashes": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "2aaac78a0cfed47a", + "hash_cont_tokens": "6aeaed4d823c98aa" + }, + "truncated": 0, + "non-truncated": 936, + "padded": 936, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-medical_genetics|5": { + "hashes": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "886ca823b41c094a", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-miscellaneous|5": { + "hashes": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "72fd71de7675e7d0", + "hash_cont_tokens": "9b0ab02a64603081" + }, + "truncated": 0, + "non-truncated": 3132, + "padded": 3132, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_disputes|5": { + "hashes": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "f3ca0dd8e7a1eb09", + "hash_cont_tokens": "3b8bbe9108e55ce9" + }, + "truncated": 0, + "non-truncated": 1384, + "padded": 1354, + "non-padded": 30, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hashes": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "3e793631e951f23c", + "hash_cont_tokens": "3e9bfc0362e97330" + }, + "truncated": 0, + "non-truncated": 3580, + "padded": 3580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-nutrition|5": { + "hashes": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "59753c2144ea93af", + "hash_cont_tokens": "23b2dc6ee2da4cfc" + }, + "truncated": 0, + "non-truncated": 1224, + "padded": 1224, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-philosophy|5": { + "hashes": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "bd8d3dbed15a8c34", + "hash_cont_tokens": "9f6ff69d23a48783" + }, + "truncated": 0, + "non-truncated": 1244, + "padded": 1244, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-prehistory|5": { + "hashes": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + 
"hash_input_tokens": "3573cd87facbb7c5", + "hash_cont_tokens": "d6458d743d875837" + }, + "truncated": 0, + "non-truncated": 1296, + "padded": 1296, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_accounting|5": { + "hashes": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "17e721bc1a7cbb47", + "hash_cont_tokens": "922a195f53a35662" + }, + "truncated": 0, + "non-truncated": 1128, + "padded": 1128, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_law|5": { + "hashes": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "c9f7583fff66d361", + "hash_cont_tokens": "2e590029ef41fbcd" + }, + "truncated": 0, + "non-truncated": 6136, + "padded": 6136, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_medicine|5": { + "hashes": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "40a933f829116f8d", + "hash_cont_tokens": "7cfee54dbddd5a98" + }, + "truncated": 0, + "non-truncated": 1088, + "padded": 1088, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_psychology|5": { + "hashes": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "0dfb73a8eb3f692c", + "hash_cont_tokens": "a86677b2a45c20e1" + }, + "truncated": 0, + "non-truncated": 2448, + "padded": 2448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-public_relations|5": { + "hashes": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "1710c6ba4c9f3cbd", + "hash_cont_tokens": "0d756ccaae031757" + }, + "truncated": 0, + "non-truncated": 440, + "padded": 440, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-security_studies|5": { + "hashes": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "32a03f1f22a6e103", + "hash_cont_tokens": "b2229bc2cfbf594b" + }, + "truncated": 0, + "non-truncated": 980, + "padded": 980, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-sociology|5": { + "hashes": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "828999f7624cbe7e", + "hash_cont_tokens": "c3a3bdfd177eed5b" + }, + "truncated": 0, + "non-truncated": 804, + "padded": 804, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hashes": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "42054621e718dbee", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-virology|5": { + "hashes": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "6c4f0aa4dc859c04", + "hash_cont_tokens": "af8b3658088cb37f" + }, + "truncated": 0, + "non-truncated": 664, + "padded": 664, 
+ "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-world_religions|5": { + "hashes": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "6c75d44e092ff24f", + "hash_cont_tokens": "060118bef6de4e0a" + }, + "truncated": 0, + "non-truncated": 684, + "padded": 684, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|truthfulqa:mc|0": { + "hashes": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "2738d7ed7075faa7", + "hash_cont_tokens": "f5da56a132aab151" + }, + "truncated": 0, + "non-truncated": 9996, + "padded": 9996, + "non-padded": 0, + "effective_few_shots": 0.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "d84d18e9a963753d", + "hash_full_prompts": "12b540783521a8e6", + "hash_input_tokens": "5c73a7dce6ccf737", + "hash_cont_tokens": "71d56183130fecbd" + }, + "total_evaluation_time_secondes": "4245.52036690712", + "truncated": 0, + "non-truncated": 111019, + "padded": 110926, + "non-padded": 93, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/Delcos/Mistral-Pygmalion-7b/results_2023-10-28T13-05-25.339926.json b/eval-results/Delcos/Mistral-Pygmalion-7b/results_2023-10-28T13-05-25.339926.json new file mode 100644 index 0000000000000000000000000000000000000000..49e04281d9f2b04eb0e0c3b7a37e46f076334943 --- /dev/null +++ b/eval-results/Delcos/Mistral-Pygmalion-7b/results_2023-10-28T13-05-25.339926.json @@ -0,0 +1,107 @@ +{ + "config_general": { + "model_name": "Delcos/Mistral-Pygmalion-7b", + "model_sha": "1abd9c77daf9db4744823dc0f8fa31e94c71a101", + "model_size": "12.61 GB", + "model_dtype": "torch.float16", + "lighteval_sha": "0f318ecf002208468154899217b3ba7c6ae09374", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|drop|3": { + "em": 0.001363255033557047, + "em_stderr": 0.0003778609196460774, + "f1": 0.05936241610738259, + "f1_stderr": 0.0013656193493625718 + }, + "harness|gsm8k|5": { + "acc": 0.06823351023502654, + "acc_stderr": 0.006945358944067431 + }, + "harness|winogrande|5": { + "acc": 0.7529597474348856, + "acc_stderr": 0.012121402942855575 + }, + "all": { + "em": 0.001363255033557047, + "em_stderr": 0.0003778609196460774, + "f1": 0.05936241610738259, + "f1_stderr": 0.0013656193493625718, + "acc": 0.41059662883495607, + "acc_stderr": 0.009533380943461503 + } + }, + "versions": { + "harness|drop|3": 1, + "harness|gsm8k|5": 0, + "harness|winogrande|5": 0, + "all": 0 + }, + "config_tasks": { + "harness|drop": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|drop|3": { + "hashes": { + "hash_examples": "1d27416e8324e9a3", + "hash_full_prompts": "a5513ff9a741b385", + "hash_input_tokens": "42076f0efbb50aa6", + "hash_cont_tokens": "1492761f44bd3a5d" + }, + "truncated": 3, + "non-truncated": 9533, + "padded": 0, + "non-padded": 9536, + "effective_few_shots": 3.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "bda342e47b5099b2", + "hash_cont_tokens": "e1f25b89acd14fbc" + }, + "truncated": 0, + "non-truncated": 1319, + "padded": 0, + "non-padded": 1319, + 
"effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "c0bedf98cb040854", + "hash_cont_tokens": "f08975ad6f2d5864" + }, + "truncated": 0, + "non-truncated": 2534, + "padded": 2432, + "non-padded": 102, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "9b4d8993161e637d", + "hash_full_prompts": "08215e527b7e60a5", + "hash_input_tokens": "a12f3e3c934bd78b", + "hash_cont_tokens": "4d4ebc45083d397b" + }, + "total_evaluation_time_secondes": "9800.566268920898", + "truncated": 3, + "non-truncated": 13386, + "padded": 2432, + "non-padded": 10957, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/Delcos/NATE-7b/results_2023-10-12T03-21-56.889828.json b/eval-results/Delcos/NATE-7b/results_2023-10-12T03-21-56.889828.json new file mode 100644 index 0000000000000000000000000000000000000000..634fba0298b20f1feac976a24a6a410dd5111b18 --- /dev/null +++ b/eval-results/Delcos/NATE-7b/results_2023-10-12T03-21-56.889828.json @@ -0,0 +1,1367 @@ +{ + "config_general": { + "model_name": "Delcos/NATE-7b", + "model_sha": "dd844a22b3b1ec4ad1757ce1ce184b8c765ae4c9", + "model_size": "24.32 GB", + "model_dtype": "torch.float16", + "lighteval_sha": "0f318ecf002208468154899217b3ba7c6ae09374", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|arc:challenge|25": { + "acc": 0.5784982935153583, + "acc_stderr": 0.014430197069326025, + "acc_norm": 0.6092150170648464, + "acc_norm_stderr": 0.014258563880513778 + }, + "harness|hellaswag|10": { + "acc": 0.620991834295957, + "acc_stderr": 0.004841486716855774, + "acc_norm": 0.8209520015933081, + "acc_norm_stderr": 0.0038260895866500536 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.33, + "acc_stderr": 0.04725815626252605, + "acc_norm": 0.33, + "acc_norm_stderr": 0.04725815626252605 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.5111111111111111, + "acc_stderr": 0.04318275491977976, + "acc_norm": 0.5111111111111111, + "acc_norm_stderr": 0.04318275491977976 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.5789473684210527, + "acc_stderr": 0.04017901275981749, + "acc_norm": 0.5789473684210527, + "acc_norm_stderr": 0.04017901275981749 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.56, + "acc_stderr": 0.04988876515698589, + "acc_norm": 0.56, + "acc_norm_stderr": 0.04988876515698589 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.6415094339622641, + "acc_stderr": 0.02951470358398177, + "acc_norm": 0.6415094339622641, + "acc_norm_stderr": 0.02951470358398177 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.6180555555555556, + "acc_stderr": 0.040629907841466674, + "acc_norm": 0.6180555555555556, + "acc_norm_stderr": 0.040629907841466674 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.42, + "acc_stderr": 0.049604496374885836, + "acc_norm": 0.42, + "acc_norm_stderr": 0.049604496374885836 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.51, + "acc_stderr": 0.05024183937956912, + "acc_norm": 0.51, + "acc_norm_stderr": 0.05024183937956912 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.36, + "acc_stderr": 0.04824181513244218, + "acc_norm": 0.36, + "acc_norm_stderr": 0.04824181513244218 
+ }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.5606936416184971, + "acc_stderr": 0.03784271932887467, + "acc_norm": 0.5606936416184971, + "acc_norm_stderr": 0.03784271932887467 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.3431372549019608, + "acc_stderr": 0.047240073523838876, + "acc_norm": 0.3431372549019608, + "acc_norm_stderr": 0.047240073523838876 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.68, + "acc_stderr": 0.046882617226215055, + "acc_norm": 0.68, + "acc_norm_stderr": 0.046882617226215055 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.5063829787234042, + "acc_stderr": 0.032683358999363366, + "acc_norm": 0.5063829787234042, + "acc_norm_stderr": 0.032683358999363366 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.32456140350877194, + "acc_stderr": 0.04404556157374767, + "acc_norm": 0.32456140350877194, + "acc_norm_stderr": 0.04404556157374767 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.5310344827586206, + "acc_stderr": 0.04158632762097828, + "acc_norm": 0.5310344827586206, + "acc_norm_stderr": 0.04158632762097828 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.36243386243386244, + "acc_stderr": 0.02475747390275206, + "acc_norm": 0.36243386243386244, + "acc_norm_stderr": 0.02475747390275206 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.40476190476190477, + "acc_stderr": 0.043902592653775614, + "acc_norm": 0.40476190476190477, + "acc_norm_stderr": 0.043902592653775614 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.35, + "acc_stderr": 0.0479372485441102, + "acc_norm": 0.35, + "acc_norm_stderr": 0.0479372485441102 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.7032258064516129, + "acc_stderr": 0.025988500792411898, + "acc_norm": 0.7032258064516129, + "acc_norm_stderr": 0.025988500792411898 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.46798029556650245, + "acc_stderr": 0.035107665979592154, + "acc_norm": 0.46798029556650245, + "acc_norm_stderr": 0.035107665979592154 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.57, + "acc_stderr": 0.04975698519562428, + "acc_norm": 0.57, + "acc_norm_stderr": 0.04975698519562428 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.703030303030303, + "acc_stderr": 0.03567969772268049, + "acc_norm": 0.703030303030303, + "acc_norm_stderr": 0.03567969772268049 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.7676767676767676, + "acc_stderr": 0.030088629490217483, + "acc_norm": 0.7676767676767676, + "acc_norm_stderr": 0.030088629490217483 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.8549222797927462, + "acc_stderr": 0.025416343096306433, + "acc_norm": 0.8549222797927462, + "acc_norm_stderr": 0.025416343096306433 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.6358974358974359, + "acc_stderr": 0.024396672985094767, + "acc_norm": 0.6358974358974359, + "acc_norm_stderr": 0.024396672985094767 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.32222222222222224, + "acc_stderr": 0.028493465091028597, + "acc_norm": 0.32222222222222224, + "acc_norm_stderr": 0.028493465091028597 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.6008403361344538, + "acc_stderr": 0.03181110032413926, + "acc_norm": 0.6008403361344538, + "acc_norm_stderr": 0.03181110032413926 + }, + 
"harness|hendrycksTest-high_school_physics|5": { + "acc": 0.33774834437086093, + "acc_stderr": 0.03861557546255169, + "acc_norm": 0.33774834437086093, + "acc_norm_stderr": 0.03861557546255169 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.7944954128440367, + "acc_stderr": 0.017324352325016022, + "acc_norm": 0.7944954128440367, + "acc_norm_stderr": 0.017324352325016022 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.42592592592592593, + "acc_stderr": 0.03372343271653064, + "acc_norm": 0.42592592592592593, + "acc_norm_stderr": 0.03372343271653064 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.8235294117647058, + "acc_stderr": 0.026756401538078962, + "acc_norm": 0.8235294117647058, + "acc_norm_stderr": 0.026756401538078962 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.7637130801687764, + "acc_stderr": 0.027652153144159263, + "acc_norm": 0.7637130801687764, + "acc_norm_stderr": 0.027652153144159263 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.7040358744394619, + "acc_stderr": 0.030636591348699796, + "acc_norm": 0.7040358744394619, + "acc_norm_stderr": 0.030636591348699796 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.6412213740458015, + "acc_stderr": 0.04206739313864908, + "acc_norm": 0.6412213740458015, + "acc_norm_stderr": 0.04206739313864908 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.7024793388429752, + "acc_stderr": 0.04173349148083499, + "acc_norm": 0.7024793388429752, + "acc_norm_stderr": 0.04173349148083499 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.7870370370370371, + "acc_stderr": 0.0395783547198098, + "acc_norm": 0.7870370370370371, + "acc_norm_stderr": 0.0395783547198098 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.6871165644171779, + "acc_stderr": 0.03642914578292406, + "acc_norm": 0.6871165644171779, + "acc_norm_stderr": 0.03642914578292406 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.4107142857142857, + "acc_stderr": 0.04669510663875191, + "acc_norm": 0.4107142857142857, + "acc_norm_stderr": 0.04669510663875191 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.7184466019417476, + "acc_stderr": 0.044532548363264673, + "acc_norm": 0.7184466019417476, + "acc_norm_stderr": 0.044532548363264673 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.8290598290598291, + "acc_stderr": 0.024662496845209825, + "acc_norm": 0.8290598290598291, + "acc_norm_stderr": 0.024662496845209825 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.6, + "acc_stderr": 0.049236596391733084, + "acc_norm": 0.6, + "acc_norm_stderr": 0.049236596391733084 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.7752234993614304, + "acc_stderr": 0.014927447101937153, + "acc_norm": 0.7752234993614304, + "acc_norm_stderr": 0.014927447101937153 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.6589595375722543, + "acc_stderr": 0.025522474632121615, + "acc_norm": 0.6589595375722543, + "acc_norm_stderr": 0.025522474632121615 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.43798882681564244, + "acc_stderr": 0.016593394227564846, + "acc_norm": 0.43798882681564244, + "acc_norm_stderr": 0.016593394227564846 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.6535947712418301, + "acc_stderr": 0.02724561304721536, + "acc_norm": 0.6535947712418301, + "acc_norm_stderr": 0.02724561304721536 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.6559485530546624, + 
"acc_stderr": 0.026981478043648043, + "acc_norm": 0.6559485530546624, + "acc_norm_stderr": 0.026981478043648043 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.6697530864197531, + "acc_stderr": 0.026168298456732852, + "acc_norm": 0.6697530864197531, + "acc_norm_stderr": 0.026168298456732852 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.4574468085106383, + "acc_stderr": 0.029719281272236837, + "acc_norm": 0.4574468085106383, + "acc_norm_stderr": 0.029719281272236837 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.44589308996088656, + "acc_stderr": 0.012695244711379778, + "acc_norm": 0.44589308996088656, + "acc_norm_stderr": 0.012695244711379778 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.5661764705882353, + "acc_stderr": 0.03010563657001663, + "acc_norm": 0.5661764705882353, + "acc_norm_stderr": 0.03010563657001663 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.5915032679738562, + "acc_stderr": 0.019886221037501862, + "acc_norm": 0.5915032679738562, + "acc_norm_stderr": 0.019886221037501862 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.6727272727272727, + "acc_stderr": 0.04494290866252091, + "acc_norm": 0.6727272727272727, + "acc_norm_stderr": 0.04494290866252091 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.6612244897959184, + "acc_stderr": 0.030299506562154185, + "acc_norm": 0.6612244897959184, + "acc_norm_stderr": 0.030299506562154185 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.7810945273631841, + "acc_stderr": 0.029239174636647, + "acc_norm": 0.7810945273631841, + "acc_norm_stderr": 0.029239174636647 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.83, + "acc_stderr": 0.0377525168068637, + "acc_norm": 0.83, + "acc_norm_stderr": 0.0377525168068637 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.5060240963855421, + "acc_stderr": 0.03892212195333045, + "acc_norm": 0.5060240963855421, + "acc_norm_stderr": 0.03892212195333045 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.7719298245614035, + "acc_stderr": 0.032180937956023566, + "acc_norm": 0.7719298245614035, + "acc_norm_stderr": 0.032180937956023566 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.3953488372093023, + "mc1_stderr": 0.017115815632418197, + "mc2": 0.571756969256499, + "mc2_stderr": 0.01564827771634302 + }, + "all": { + "acc": 0.5894733182245001, + "acc_stderr": 0.03414332313258469, + "acc_norm": 0.5933830960354635, + "acc_norm_stderr": 0.03412320397463523, + "mc1": 0.3953488372093023, + "mc1_stderr": 0.017115815632418197, + "mc2": 0.571756969256499, + "mc2_stderr": 0.01564827771634302 + } + }, + "versions": { + "harness|arc:challenge|25": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 
1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "all": 0 + }, + "config_tasks": { + "harness|arc:challenge": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": 
"LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task" + }, + "summary_tasks": { + "harness|arc:challenge|25": { + "hashes": { + "hash_examples": "17b0cae357c0259e", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "3722289b79076c44", + "hash_cont_tokens": "e8abf848493b50f7" + }, + "truncated": 0, + "non-truncated": 4687, + "padded": 4687, + "non-padded": 0, + "effective_few_shots": 25.0, + "num_truncated_few_shots": 0 + }, + "harness|hellaswag|10": { + "hashes": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "ececd684171f1ef2", + "hash_cont_tokens": "9fe0a5c42e1532db" + }, + "truncated": 0, + "non-truncated": 40168, + "padded": 40113, + "non-padded": 55, + "effective_few_shots": 10.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hashes": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "c54ff61ad0273dd7", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-anatomy|5": { + 
"hashes": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "be31a1e22aef5f90", + "hash_cont_tokens": "f11971a765cb609f" + }, + "truncated": 0, + "non-truncated": 540, + "padded": 540, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-astronomy|5": { + "hashes": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "277a7b1fad566940", + "hash_cont_tokens": "440a970fadecdc7b" + }, + "truncated": 0, + "non-truncated": 608, + "padded": 608, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-business_ethics|5": { + "hashes": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "ba552605bc116de5", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hashes": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "428c7563d0b98ab9", + "hash_cont_tokens": "7ecd60c25b9bfe5b" + }, + "truncated": 0, + "non-truncated": 1060, + "padded": 1060, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_biology|5": { + "hashes": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "da036601573942e2", + "hash_cont_tokens": "875cde3af7a0ee14" + }, + "truncated": 0, + "non-truncated": 576, + "padded": 576, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_chemistry|5": { + "hashes": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "94e0196d6aded13d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_computer_science|5": { + "hashes": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "6e4d0f4a8d36690b", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_mathematics|5": { + "hashes": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "614054d17109a25d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_medicine|5": { + "hashes": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "081bb2b524defd1c", + "hash_cont_tokens": "702fb6d82ff0d6ac" + }, + "truncated": 0, + "non-truncated": 692, + "padded": 692, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_physics|5": { + "hashes": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "5421d9a1af86cbd4", + 
"hash_cont_tokens": "f7b8097afc16a47c" + }, + "truncated": 0, + "non-truncated": 408, + "padded": 408, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-computer_security|5": { + "hashes": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "5e6b70ecb333cf18", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hashes": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "c2ef11a87264ceed", + "hash_cont_tokens": "aa0e8bc655f2f641" + }, + "truncated": 0, + "non-truncated": 940, + "padded": 940, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-econometrics|5": { + "hashes": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "ecaccd912a4c3978", + "hash_cont_tokens": "b1cc6e7e9fcd3827" + }, + "truncated": 0, + "non-truncated": 456, + "padded": 456, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hashes": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "1590c84291399be8", + "hash_cont_tokens": "2425a3f084a591ef" + }, + "truncated": 0, + "non-truncated": 580, + "padded": 580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hashes": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "3269597f715b0da1", + "hash_cont_tokens": "bd87bf0c060fd925" + }, + "truncated": 0, + "non-truncated": 1512, + "padded": 1512, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-formal_logic|5": { + "hashes": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "a2800d20f3ab8d7c", + "hash_cont_tokens": "eb8932890e0605db" + }, + "truncated": 0, + "non-truncated": 504, + "padded": 504, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-global_facts|5": { + "hashes": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "94ed44b3772505ad", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_biology|5": { + "hashes": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "24423acb928db768", + "hash_cont_tokens": "1ddcb86d28cde266" + }, + "truncated": 0, + "non-truncated": 1240, + "padded": 1240, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hashes": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "831ff35c474e5cef", + "hash_cont_tokens": "176c8dcff38c5f8f" + }, + "truncated": 0, + "non-truncated": 812, + "padded": 812, + "non-padded": 0, + "effective_few_shots": 
5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hashes": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "a20a96b44dcc5b30", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hashes": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "5002f4ac8b1562ca", + "hash_cont_tokens": "674fc454bdc5ac93" + }, + "truncated": 0, + "non-truncated": 660, + "padded": 656, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_geography|5": { + "hashes": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "7c5547c7da5bc793", + "hash_cont_tokens": "03a5012b916274ea" + }, + "truncated": 0, + "non-truncated": 792, + "padded": 792, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hashes": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "f62991cb6a496b05", + "hash_cont_tokens": "873d2aab226ba1d8" + }, + "truncated": 0, + "non-truncated": 772, + "padded": 772, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hashes": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "4cef2aff6e3d59ed", + "hash_cont_tokens": "c583432ad27fcfe0" + }, + "truncated": 0, + "non-truncated": 1560, + "padded": 1560, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hashes": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "6e2577ea4082ed2b", + "hash_cont_tokens": "d7907b61bcb8c123" + }, + "truncated": 0, + "non-truncated": 1080, + "padded": 1080, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hashes": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "c5fc9aeb1079c8e4", + "hash_cont_tokens": "f47f041de50333b9" + }, + "truncated": 0, + "non-truncated": 952, + "padded": 952, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_physics|5": { + "hashes": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "555fc385cffa84ca", + "hash_cont_tokens": "0d56317b3e5eedb5" + }, + "truncated": 0, + "non-truncated": 604, + "padded": 604, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hashes": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "febd23cbf9973b7f", + "hash_cont_tokens": "09ba1243e7390c0f" + }, + "truncated": 0, + "non-truncated": 2180, + "padded": 2180, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + 
"harness|hendrycksTest-high_school_statistics|5": { + "hashes": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "400e55b56ee6fbd7", + "hash_cont_tokens": "9cc29889c3d3f77d" + }, + "truncated": 0, + "non-truncated": 864, + "padded": 864, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hashes": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "c639cce12a46ebad", + "hash_cont_tokens": "cdd0b3dc06d933e5" + }, + "truncated": 0, + "non-truncated": 816, + "padded": 816, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hashes": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "b9762065cce6f3a6", + "hash_cont_tokens": "e02816433ff28daf" + }, + "truncated": 0, + "non-truncated": 948, + "padded": 948, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_aging|5": { + "hashes": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "541a75f071dcf579", + "hash_cont_tokens": "142a4a8a1138a214" + }, + "truncated": 0, + "non-truncated": 892, + "padded": 892, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_sexuality|5": { + "hashes": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "04269e5c5a257dd9", + "hash_cont_tokens": "bc54813e809b796d" + }, + "truncated": 0, + "non-truncated": 524, + "padded": 524, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-international_law|5": { + "hashes": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "d93ba9d9d38e4397", + "hash_cont_tokens": "8ea8c5ff76a15bca" + }, + "truncated": 0, + "non-truncated": 484, + "padded": 484, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-jurisprudence|5": { + "hashes": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "9eeaccd2698b4f5a", + "hash_cont_tokens": "e3a8cd951b6e3469" + }, + "truncated": 0, + "non-truncated": 432, + "padded": 432, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hashes": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "b4f08f544f2b7576", + "hash_cont_tokens": "3e9e0bdc248fd88a" + }, + "truncated": 0, + "non-truncated": 652, + "padded": 648, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-machine_learning|5": { + "hashes": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "900c2a51f1174b9f", + "hash_cont_tokens": "55b12fb138c6a064" + }, + "truncated": 0, + "non-truncated": 448, + "padded": 448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-management|5": { + "hashes": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + 
"hash_input_tokens": "6b36efb4689c6eca", + "hash_cont_tokens": "a01d6d39a83c4597" + }, + "truncated": 0, + "non-truncated": 412, + "padded": 412, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-marketing|5": { + "hashes": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "2aaac78a0cfed47a", + "hash_cont_tokens": "6aeaed4d823c98aa" + }, + "truncated": 0, + "non-truncated": 936, + "padded": 936, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-medical_genetics|5": { + "hashes": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "886ca823b41c094a", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-miscellaneous|5": { + "hashes": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "72fd71de7675e7d0", + "hash_cont_tokens": "9b0ab02a64603081" + }, + "truncated": 0, + "non-truncated": 3132, + "padded": 3132, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_disputes|5": { + "hashes": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "f3ca0dd8e7a1eb09", + "hash_cont_tokens": "3b8bbe9108e55ce9" + }, + "truncated": 0, + "non-truncated": 1384, + "padded": 1354, + "non-padded": 30, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hashes": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "3e793631e951f23c", + "hash_cont_tokens": "3e9bfc0362e97330" + }, + "truncated": 0, + "non-truncated": 3580, + "padded": 3580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-nutrition|5": { + "hashes": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "59753c2144ea93af", + "hash_cont_tokens": "23b2dc6ee2da4cfc" + }, + "truncated": 0, + "non-truncated": 1224, + "padded": 1224, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-philosophy|5": { + "hashes": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "bd8d3dbed15a8c34", + "hash_cont_tokens": "9f6ff69d23a48783" + }, + "truncated": 0, + "non-truncated": 1244, + "padded": 1244, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-prehistory|5": { + "hashes": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "3573cd87facbb7c5", + "hash_cont_tokens": "d6458d743d875837" + }, + "truncated": 0, + "non-truncated": 1296, + "padded": 1296, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_accounting|5": { + "hashes": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "17e721bc1a7cbb47", + "hash_cont_tokens": "922a195f53a35662" + }, + "truncated": 0, + "non-truncated": 1128, + "padded": 1128, + "non-padded": 0, + 
"effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_law|5": { + "hashes": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "c9f7583fff66d361", + "hash_cont_tokens": "2e590029ef41fbcd" + }, + "truncated": 0, + "non-truncated": 6136, + "padded": 6136, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_medicine|5": { + "hashes": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "40a933f829116f8d", + "hash_cont_tokens": "7cfee54dbddd5a98" + }, + "truncated": 0, + "non-truncated": 1088, + "padded": 1088, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_psychology|5": { + "hashes": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "0dfb73a8eb3f692c", + "hash_cont_tokens": "a86677b2a45c20e1" + }, + "truncated": 0, + "non-truncated": 2448, + "padded": 2448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-public_relations|5": { + "hashes": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "1710c6ba4c9f3cbd", + "hash_cont_tokens": "0d756ccaae031757" + }, + "truncated": 0, + "non-truncated": 440, + "padded": 440, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-security_studies|5": { + "hashes": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "32a03f1f22a6e103", + "hash_cont_tokens": "b2229bc2cfbf594b" + }, + "truncated": 0, + "non-truncated": 980, + "padded": 980, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-sociology|5": { + "hashes": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "828999f7624cbe7e", + "hash_cont_tokens": "c3a3bdfd177eed5b" + }, + "truncated": 0, + "non-truncated": 804, + "padded": 804, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hashes": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "42054621e718dbee", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-virology|5": { + "hashes": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "6c4f0aa4dc859c04", + "hash_cont_tokens": "af8b3658088cb37f" + }, + "truncated": 0, + "non-truncated": 664, + "padded": 664, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-world_religions|5": { + "hashes": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "6c75d44e092ff24f", + "hash_cont_tokens": "060118bef6de4e0a" + }, + "truncated": 0, + "non-truncated": 684, + "padded": 684, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|truthfulqa:mc|0": { + "hashes": { + "hash_examples": "23176c0531c7b867", + 
"hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "2738d7ed7075faa7", + "hash_cont_tokens": "f5da56a132aab151" + }, + "truncated": 0, + "non-truncated": 9996, + "padded": 9996, + "non-padded": 0, + "effective_few_shots": 0.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "d84d18e9a963753d", + "hash_full_prompts": "12b540783521a8e6", + "hash_input_tokens": "5c73a7dce6ccf737", + "hash_cont_tokens": "71d56183130fecbd" + }, + "total_evaluation_time_secondes": "6390.07342004776", + "truncated": 0, + "non-truncated": 111019, + "padded": 110926, + "non-padded": 93, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/Delcos/Starling-LM-11B-alpha/results_2023-12-09T16-46-23.982029.json b/eval-results/Delcos/Starling-LM-11B-alpha/results_2023-12-09T16-46-23.982029.json new file mode 100644 index 0000000000000000000000000000000000000000..010e4f0ea03b86e65e4dbc4f5f0841415e668956 --- /dev/null +++ b/eval-results/Delcos/Starling-LM-11B-alpha/results_2023-12-09T16-46-23.982029.json @@ -0,0 +1,1409 @@ +{ + "config_general": { + "lighteval_sha": "0e4607eff593f6f842aeaa0e5fa6760f58b9d1e9", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "", + "start_time": 584323.861240555, + "end_time": 595259.022095222, + "total_evaluation_time_secondes": "10935.160854667076", + "model_name": "Delcos/Starling-LM-11B-alpha", + "model_sha": "16086688b70e4f54e1ba4f54a1a847c30b987a74", + "model_dtype": "torch.float16", + "model_size": "21.41 GB" + }, + "results": { + "harness|arc:challenge|25": { + "acc": 0.6006825938566553, + "acc_stderr": 0.014312094557946705, + "acc_norm": 0.6296928327645052, + "acc_norm_stderr": 0.01411129875167495 + }, + "harness|hellaswag|10": { + "acc": 0.668990240987851, + "acc_stderr": 0.004696148339570979, + "acc_norm": 0.8485361481776539, + "acc_norm_stderr": 0.0035776774950640783 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.26, + "acc_stderr": 0.044084400227680794, + "acc_norm": 0.26, + "acc_norm_stderr": 0.044084400227680794 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.6, + "acc_stderr": 0.042320736951515885, + "acc_norm": 0.6, + "acc_norm_stderr": 0.042320736951515885 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.6973684210526315, + "acc_stderr": 0.03738520676119669, + "acc_norm": 0.6973684210526315, + "acc_norm_stderr": 0.03738520676119669 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.59, + "acc_stderr": 0.049431107042371025, + "acc_norm": 0.59, + "acc_norm_stderr": 0.049431107042371025 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.6566037735849056, + "acc_stderr": 0.02922452646912479, + "acc_norm": 0.6566037735849056, + "acc_norm_stderr": 0.02922452646912479 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.7430555555555556, + "acc_stderr": 0.03653946969442099, + "acc_norm": 0.7430555555555556, + "acc_norm_stderr": 0.03653946969442099 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.44, + "acc_stderr": 0.04988876515698589, + "acc_norm": 0.44, + "acc_norm_stderr": 0.04988876515698589 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.53, + "acc_stderr": 0.05016135580465919, + "acc_norm": 0.53, + "acc_norm_stderr": 0.05016135580465919 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.32, + "acc_stderr": 0.04688261722621504, + "acc_norm": 0.32, + "acc_norm_stderr": 0.04688261722621504 
+ }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.6994219653179191, + "acc_stderr": 0.03496101481191179, + "acc_norm": 0.6994219653179191, + "acc_norm_stderr": 0.03496101481191179 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.45098039215686275, + "acc_stderr": 0.049512182523962625, + "acc_norm": 0.45098039215686275, + "acc_norm_stderr": 0.049512182523962625 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.74, + "acc_stderr": 0.04408440022768079, + "acc_norm": 0.74, + "acc_norm_stderr": 0.04408440022768079 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.5702127659574469, + "acc_stderr": 0.03236214467715564, + "acc_norm": 0.5702127659574469, + "acc_norm_stderr": 0.03236214467715564 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.47368421052631576, + "acc_stderr": 0.046970851366478626, + "acc_norm": 0.47368421052631576, + "acc_norm_stderr": 0.046970851366478626 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.5172413793103449, + "acc_stderr": 0.04164188720169375, + "acc_norm": 0.5172413793103449, + "acc_norm_stderr": 0.04164188720169375 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.4074074074074074, + "acc_stderr": 0.025305906241590632, + "acc_norm": 0.4074074074074074, + "acc_norm_stderr": 0.025305906241590632 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.5238095238095238, + "acc_stderr": 0.04467062628403273, + "acc_norm": 0.5238095238095238, + "acc_norm_stderr": 0.04467062628403273 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.42, + "acc_stderr": 0.049604496374885836, + "acc_norm": 0.42, + "acc_norm_stderr": 0.049604496374885836 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.7548387096774194, + "acc_stderr": 0.024472243840895535, + "acc_norm": 0.7548387096774194, + "acc_norm_stderr": 0.024472243840895535 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.458128078817734, + "acc_stderr": 0.03505630140785741, + "acc_norm": 0.458128078817734, + "acc_norm_stderr": 0.03505630140785741 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.71, + "acc_stderr": 0.045604802157206845, + "acc_norm": 0.71, + "acc_norm_stderr": 0.045604802157206845 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.7878787878787878, + "acc_stderr": 0.03192271569548301, + "acc_norm": 0.7878787878787878, + "acc_norm_stderr": 0.03192271569548301 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.8080808080808081, + "acc_stderr": 0.02805779167298902, + "acc_norm": 0.8080808080808081, + "acc_norm_stderr": 0.02805779167298902 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.8911917098445595, + "acc_stderr": 0.02247325333276878, + "acc_norm": 0.8911917098445595, + "acc_norm_stderr": 0.02247325333276878 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.6410256410256411, + "acc_stderr": 0.024321738484602354, + "acc_norm": 0.6410256410256411, + "acc_norm_stderr": 0.024321738484602354 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.3037037037037037, + "acc_stderr": 0.028037929969114986, + "acc_norm": 0.3037037037037037, + "acc_norm_stderr": 0.028037929969114986 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.634453781512605, + "acc_stderr": 0.031282177063684614, + "acc_norm": 0.634453781512605, + "acc_norm_stderr": 0.031282177063684614 + }, + 
"harness|hendrycksTest-high_school_physics|5": { + "acc": 0.36423841059602646, + "acc_stderr": 0.03929111781242742, + "acc_norm": 0.36423841059602646, + "acc_norm_stderr": 0.03929111781242742 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.8495412844036697, + "acc_stderr": 0.015328563932669235, + "acc_norm": 0.8495412844036697, + "acc_norm_stderr": 0.015328563932669235 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.47685185185185186, + "acc_stderr": 0.034063153607115065, + "acc_norm": 0.47685185185185186, + "acc_norm_stderr": 0.034063153607115065 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.7941176470588235, + "acc_stderr": 0.028379449451588667, + "acc_norm": 0.7941176470588235, + "acc_norm_stderr": 0.028379449451588667 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.810126582278481, + "acc_stderr": 0.025530100460233504, + "acc_norm": 0.810126582278481, + "acc_norm_stderr": 0.025530100460233504 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.7309417040358744, + "acc_stderr": 0.029763779406874972, + "acc_norm": 0.7309417040358744, + "acc_norm_stderr": 0.029763779406874972 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.7557251908396947, + "acc_stderr": 0.037683359597287434, + "acc_norm": 0.7557251908396947, + "acc_norm_stderr": 0.037683359597287434 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.7768595041322314, + "acc_stderr": 0.03800754475228733, + "acc_norm": 0.7768595041322314, + "acc_norm_stderr": 0.03800754475228733 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.7777777777777778, + "acc_stderr": 0.040191074725573483, + "acc_norm": 0.7777777777777778, + "acc_norm_stderr": 0.040191074725573483 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.7423312883435583, + "acc_stderr": 0.03436150827846917, + "acc_norm": 0.7423312883435583, + "acc_norm_stderr": 0.03436150827846917 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.4642857142857143, + "acc_stderr": 0.04733667890053756, + "acc_norm": 0.4642857142857143, + "acc_norm_stderr": 0.04733667890053756 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.8155339805825242, + "acc_stderr": 0.03840423627288276, + "acc_norm": 0.8155339805825242, + "acc_norm_stderr": 0.03840423627288276 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.8589743589743589, + "acc_stderr": 0.02280138253459754, + "acc_norm": 0.8589743589743589, + "acc_norm_stderr": 0.02280138253459754 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.71, + "acc_stderr": 0.04560480215720684, + "acc_norm": 0.71, + "acc_norm_stderr": 0.04560480215720684 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.8109833971902938, + "acc_stderr": 0.014000791294407004, + "acc_norm": 0.8109833971902938, + "acc_norm_stderr": 0.014000791294407004 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.6763005780346821, + "acc_stderr": 0.025190181327608422, + "acc_norm": 0.6763005780346821, + "acc_norm_stderr": 0.025190181327608422 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.4223463687150838, + "acc_stderr": 0.01651959427529712, + "acc_norm": 0.4223463687150838, + "acc_norm_stderr": 0.01651959427529712 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.7026143790849673, + "acc_stderr": 0.02617390850671858, + "acc_norm": 0.7026143790849673, + "acc_norm_stderr": 0.02617390850671858 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.7009646302250804, + 
"acc_stderr": 0.02600330111788514, + "acc_norm": 0.7009646302250804, + "acc_norm_stderr": 0.02600330111788514 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.7253086419753086, + "acc_stderr": 0.02483605786829468, + "acc_norm": 0.7253086419753086, + "acc_norm_stderr": 0.02483605786829468 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.5106382978723404, + "acc_stderr": 0.02982074719142244, + "acc_norm": 0.5106382978723404, + "acc_norm_stderr": 0.02982074719142244 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.46284224250325945, + "acc_stderr": 0.012734923579532063, + "acc_norm": 0.46284224250325945, + "acc_norm_stderr": 0.012734923579532063 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.6838235294117647, + "acc_stderr": 0.02824568739146292, + "acc_norm": 0.6838235294117647, + "acc_norm_stderr": 0.02824568739146292 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.6683006535947712, + "acc_stderr": 0.019047485239360375, + "acc_norm": 0.6683006535947712, + "acc_norm_stderr": 0.019047485239360375 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.7, + "acc_stderr": 0.04389311454644287, + "acc_norm": 0.7, + "acc_norm_stderr": 0.04389311454644287 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.7306122448979592, + "acc_stderr": 0.02840125202902294, + "acc_norm": 0.7306122448979592, + "acc_norm_stderr": 0.02840125202902294 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.8407960199004975, + "acc_stderr": 0.02587064676616914, + "acc_norm": 0.8407960199004975, + "acc_norm_stderr": 0.02587064676616914 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.87, + "acc_stderr": 0.03379976689896308, + "acc_norm": 0.87, + "acc_norm_stderr": 0.03379976689896308 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.5301204819277109, + "acc_stderr": 0.03885425420866767, + "acc_norm": 0.5301204819277109, + "acc_norm_stderr": 0.03885425420866767 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.7894736842105263, + "acc_stderr": 0.03126781714663179, + "acc_norm": 0.7894736842105263, + "acc_norm_stderr": 0.03126781714663179 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.38922888616891066, + "mc1_stderr": 0.01706855268069033, + "mc2": 0.5452023492477854, + "mc2_stderr": 0.016056772234309992 + }, + "harness|winogrande|5": { + "acc": 0.7782162588792423, + "acc_stderr": 0.011676109244497813 + }, + "harness|gsm8k|5": { + "acc": 0.379833206974981, + "acc_stderr": 0.013368818096960498 + }, + "all": { + "acc": 0.6362170386987497, + "acc_stderr": 0.03232328033089801, + "acc_norm": 0.6416906108621553, + "acc_norm_stderr": 0.03297213400326341, + "mc1": 0.38922888616891066, + "mc1_stderr": 0.01706855268069033, + "mc2": 0.5452023492477854, + "mc2_stderr": 0.016056772234309992 + } + }, + "versions": { + "all": 0, + "harness|arc:challenge|25": 0, + "harness|gsm8k|5": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + 
"harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "harness|winogrande|5": 0 + }, + "config_tasks": { + "harness|arc:challenge": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness 
task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|arc:challenge|25": { + "hashes": { + "hash_examples": "17b0cae357c0259e", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "9bcd0d1d37471713", + "hash_cont_tokens": "289aa98c400841d8" + }, + "truncated": 0, + "non_truncated": 1172, + "padded": 4670, + "non_padded": 17, + "effective_few_shots": 25.0, + "num_truncated_few_shots": 0 + }, + "harness|hellaswag|10": { + "hashes": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "80b8c6d79740318e", + "hash_cont_tokens": "ac460260c3e6efc9" + }, + "truncated": 0, + "non_truncated": 10042, + "padded": 40101, + "non_padded": 67, + "effective_few_shots": 10.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hashes": { + "hash_examples": "280f9f325b40559a", + 
"hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "b813d36287c6556c", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-anatomy|5": { + "hashes": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "09dc2380497f7a47", + "hash_cont_tokens": "a52a4f60d98cbe5c" + }, + "truncated": 0, + "non_truncated": 135, + "padded": 540, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-astronomy|5": { + "hashes": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "68ca3220b0fdd1f3", + "hash_cont_tokens": "10f7d8eeba97841d" + }, + "truncated": 0, + "non_truncated": 152, + "padded": 608, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-business_ethics|5": { + "hashes": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "bd14ef1320de241e", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hashes": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "d96186ab98017c43", + "hash_cont_tokens": "edef9975ba9165b5" + }, + "truncated": 0, + "non_truncated": 265, + "padded": 1060, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_biology|5": { + "hashes": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "424136b34e95b200", + "hash_cont_tokens": "0aa103ec6602280b" + }, + "truncated": 0, + "non_truncated": 144, + "padded": 576, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_chemistry|5": { + "hashes": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "8dd8b80e336bbe54", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_computer_science|5": { + "hashes": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "145d4cef8ca2261d", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_mathematics|5": { + "hashes": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "561995d32d2b25c4", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_medicine|5": { + "hashes": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "6a258a9d4418599c", + "hash_cont_tokens": "1979021dbc698754" + }, + "truncated": 0, + 
"non_truncated": 173, + "padded": 692, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_physics|5": { + "hashes": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "fa5e0d5b5f97b66a", + "hash_cont_tokens": "7cf7fe2bab00acbd" + }, + "truncated": 0, + "non_truncated": 102, + "padded": 408, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-computer_security|5": { + "hashes": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "07d27397edfae492", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hashes": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "da5e6c3c8eb17da6", + "hash_cont_tokens": "903f64eed2b0d217" + }, + "truncated": 0, + "non_truncated": 235, + "padded": 940, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-econometrics|5": { + "hashes": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "f6ba8e358bdb523e", + "hash_cont_tokens": "721ae6c5302c4bf2" + }, + "truncated": 0, + "non_truncated": 114, + "padded": 456, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hashes": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "b2459da4c5ca8590", + "hash_cont_tokens": "15a738960ed3e587" + }, + "truncated": 0, + "non_truncated": 145, + "padded": 575, + "non_padded": 5, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hashes": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "0b969d9ad706a13a", + "hash_cont_tokens": "c96470462fc71683" + }, + "truncated": 0, + "non_truncated": 378, + "padded": 1512, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-formal_logic|5": { + "hashes": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "02bc3eb5f90da86e", + "hash_cont_tokens": "0e1ce025c9d6ee7e" + }, + "truncated": 0, + "non_truncated": 126, + "padded": 504, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-global_facts|5": { + "hashes": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "3d5106918bcbeb43", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_biology|5": { + "hashes": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "7b089392db2dabbd", + "hash_cont_tokens": "e34d57f7d3c4ca16" + }, + "truncated": 0, + "non_truncated": 310, + "padded": 1240, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + 
"harness|hendrycksTest-high_school_chemistry|5": { + "hashes": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "ba90b2ffed1c067d", + "hash_cont_tokens": "e8482d44df4b3740" + }, + "truncated": 0, + "non_truncated": 203, + "padded": 812, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hashes": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "60eeec309ef0717f", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hashes": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "5e5e8bf3808e0ead", + "hash_cont_tokens": "d63e679a49418339" + }, + "truncated": 0, + "non_truncated": 165, + "padded": 656, + "non_padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_geography|5": { + "hashes": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "4da9b741d4e7ea78", + "hash_cont_tokens": "d78483e286d06f1a" + }, + "truncated": 0, + "non_truncated": 198, + "padded": 792, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hashes": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "acb4bc872ac86ed7", + "hash_cont_tokens": "691cdff71ff5fe57" + }, + "truncated": 0, + "non_truncated": 193, + "padded": 772, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hashes": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "840fc6403eb69ab0", + "hash_cont_tokens": "d5ad4c5bdca967ad" + }, + "truncated": 0, + "non_truncated": 390, + "padded": 1560, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hashes": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "3629a7f2cd17faeb", + "hash_cont_tokens": "8f631ca5687dd0d4" + }, + "truncated": 0, + "non_truncated": 270, + "padded": 1080, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hashes": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "6846f684260e3997", + "hash_cont_tokens": "7321048a28451473" + }, + "truncated": 0, + "non_truncated": 238, + "padded": 952, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_physics|5": { + "hashes": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "85aee25d6bdad94a", + "hash_cont_tokens": "bb137581f269861c" + }, + "truncated": 0, + "non_truncated": 151, + "padded": 604, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hashes": { + 
"hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "290b66d6d666a35f", + "hash_cont_tokens": "b455cab2675bd863" + }, + "truncated": 0, + "non_truncated": 545, + "padded": 2180, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hashes": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "a77a7668b437bc82", + "hash_cont_tokens": "1b3196fec7e58037" + }, + "truncated": 0, + "non_truncated": 216, + "padded": 864, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hashes": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "63548c7fa9ba7a78", + "hash_cont_tokens": "a331dedc2aa01b3e" + }, + "truncated": 0, + "non_truncated": 204, + "padded": 816, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hashes": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "83c5da18bfa50812", + "hash_cont_tokens": "d0fbe030b8c8c2bf" + }, + "truncated": 0, + "non_truncated": 237, + "padded": 948, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_aging|5": { + "hashes": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "bebbd11f22006685", + "hash_cont_tokens": "1dd29c3755494850" + }, + "truncated": 0, + "non_truncated": 223, + "padded": 892, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_sexuality|5": { + "hashes": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "7b85ee9b8ee54f4f", + "hash_cont_tokens": "c85573f663c10691" + }, + "truncated": 0, + "non_truncated": 131, + "padded": 524, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-international_law|5": { + "hashes": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "7bfc55ab7065943e", + "hash_cont_tokens": "d263804ba918154f" + }, + "truncated": 0, + "non_truncated": 121, + "padded": 484, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-jurisprudence|5": { + "hashes": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "69573f1675e053c6", + "hash_cont_tokens": "581986691a84ece8" + }, + "truncated": 0, + "non_truncated": 108, + "padded": 432, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hashes": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "552324ef20094bdc", + "hash_cont_tokens": "55a858b28bbda458" + }, + "truncated": 0, + "non_truncated": 163, + "padded": 652, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-machine_learning|5": { + "hashes": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "96449357a7318905", + "hash_cont_tokens": 
"e99d3d3efd4ac7a3" + }, + "truncated": 0, + "non_truncated": 112, + "padded": 448, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-management|5": { + "hashes": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "3b849249168e3b88", + "hash_cont_tokens": "13d9dc56bca34726" + }, + "truncated": 0, + "non_truncated": 103, + "padded": 412, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-marketing|5": { + "hashes": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "af0e186f2756b70d", + "hash_cont_tokens": "2700ea26933916a2" + }, + "truncated": 0, + "non_truncated": 234, + "padded": 936, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-medical_genetics|5": { + "hashes": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "9f6a6de16509b6d9", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-miscellaneous|5": { + "hashes": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "9194406d589f7c10", + "hash_cont_tokens": "7bf4341c79587250" + }, + "truncated": 0, + "non_truncated": 783, + "padded": 3132, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_disputes|5": { + "hashes": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "769486efc74d9f8e", + "hash_cont_tokens": "38a48e9de6976f00" + }, + "truncated": 0, + "non_truncated": 346, + "padded": 1384, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hashes": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "a90fd4dd90959dad", + "hash_cont_tokens": "761c4dc187689d89" + }, + "truncated": 0, + "non_truncated": 895, + "padded": 3580, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-nutrition|5": { + "hashes": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "1a3b843e66efd29b", + "hash_cont_tokens": "65005bd7d6f6012a" + }, + "truncated": 0, + "non_truncated": 306, + "padded": 1224, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-philosophy|5": { + "hashes": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "09820001a3d00013", + "hash_cont_tokens": "0b47934fb6314dec" + }, + "truncated": 0, + "non_truncated": 311, + "padded": 1244, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-prehistory|5": { + "hashes": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "7c4ec364ce2768c7", + "hash_cont_tokens": "3f20acd855ee0a29" + }, + "truncated": 0, + "non_truncated": 324, + "padded": 1296, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + 
"harness|hendrycksTest-professional_accounting|5": { + "hashes": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "ced0534574d0ae3f", + "hash_cont_tokens": "8f122ba881355d4b" + }, + "truncated": 0, + "non_truncated": 282, + "padded": 1128, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_law|5": { + "hashes": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "bcbdbbde22ec73e3", + "hash_cont_tokens": "90d5df417c4d3fd3" + }, + "truncated": 0, + "non_truncated": 1534, + "padded": 6136, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_medicine|5": { + "hashes": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "c54d753563114d45", + "hash_cont_tokens": "4a2d2988884f7f70" + }, + "truncated": 0, + "non_truncated": 272, + "padded": 1088, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_psychology|5": { + "hashes": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "b75dc55c0e32fa52", + "hash_cont_tokens": "e0a952cb8a9c81de" + }, + "truncated": 0, + "non_truncated": 612, + "padded": 2448, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-public_relations|5": { + "hashes": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "5ccdc8ec8db99622", + "hash_cont_tokens": "1fa77a8dff3922b8" + }, + "truncated": 0, + "non_truncated": 110, + "padded": 440, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-security_studies|5": { + "hashes": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "ca8497342e5b1d57", + "hash_cont_tokens": "81fc9cb3cbdd52db" + }, + "truncated": 0, + "non_truncated": 245, + "padded": 980, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-sociology|5": { + "hashes": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "069c76424fbd3dab", + "hash_cont_tokens": "2a0493252ed2cf43" + }, + "truncated": 0, + "non_truncated": 201, + "padded": 804, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hashes": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "a7e393a626169576", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-virology|5": { + "hashes": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "bf99dc973e3a650d", + "hash_cont_tokens": "5ab892d003b00c98" + }, + "truncated": 0, + "non_truncated": 166, + "padded": 664, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-world_religions|5": { + "hashes": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + 
"hash_input_tokens": "1761cfaf21797065", + "hash_cont_tokens": "15a5e5dbdfbb8568" + }, + "truncated": 0, + "non_truncated": 171, + "padded": 684, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|truthfulqa:mc|0": { + "hashes": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "298b43914bbdf4ca", + "hash_cont_tokens": "5a8d4bb398b1c3c0" + }, + "truncated": 0, + "non_truncated": 817, + "padded": 9996, + "non_padded": 0, + "effective_few_shots": 0.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "31aa3477d959f771", + "hash_cont_tokens": "618558fb93c0f288" + }, + "truncated": 0, + "non_truncated": 1267, + "padded": 2534, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "6af0ae8cfe684f50", + "hash_cont_tokens": "a7a093a09885b882" + }, + "truncated": 0, + "non_truncated": 1319, + "padded": 0, + "non_padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "3b7fa57a057f9415", + "hash_full_prompts": "63615fc50fc9417c", + "hash_input_tokens": "9c04e828ae29cacc", + "hash_cont_tokens": "0c1ad7daef5930ee" + }, + "truncated": 0, + "non_truncated": 28659, + "padded": 113460, + "non_padded": 1412, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/Delcos/Velara/results_2023-12-08T00-16-45.141900.json b/eval-results/Delcos/Velara/results_2023-12-08T00-16-45.141900.json new file mode 100644 index 0000000000000000000000000000000000000000..bca758b816dcc0a69394954de7e656e576959826 --- /dev/null +++ b/eval-results/Delcos/Velara/results_2023-12-08T00-16-45.141900.json @@ -0,0 +1,1409 @@ +{ + "config_general": { + "lighteval_sha": "0e4607eff593f6f842aeaa0e5fa6760f58b9d1e9", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "", + "start_time": 434176.815404223, + "end_time": 449471.210024807, + "total_evaluation_time_secondes": "15294.394620584033", + "model_name": "Delcos/Velara", + "model_sha": "0fad8e711563d3a5a4631500d6a1d6b87f10d396", + "model_dtype": "torch.float16", + "model_size": "22.0 GB" + }, + "results": { + "harness|arc:challenge|25": { + "acc": 0.5708191126279863, + "acc_stderr": 0.014464085894870653, + "acc_norm": 0.5895904436860068, + "acc_norm_stderr": 0.014374922192642664 + }, + "harness|hellaswag|10": { + "acc": 0.6477793268273252, + "acc_stderr": 0.004766860907171539, + "acc_norm": 0.8283210515833499, + "acc_norm_stderr": 0.00376330474609875 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.31, + "acc_stderr": 0.04648231987117316, + "acc_norm": 0.31, + "acc_norm_stderr": 0.04648231987117316 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.5555555555555556, + "acc_stderr": 0.04292596718256981, + "acc_norm": 0.5555555555555556, + "acc_norm_stderr": 0.04292596718256981 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.6381578947368421, + "acc_stderr": 0.03910525752849724, + "acc_norm": 0.6381578947368421, + "acc_norm_stderr": 0.03910525752849724 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.58, + "acc_stderr": 0.049604496374885836, + "acc_norm": 0.58, + 
"acc_norm_stderr": 0.049604496374885836 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.6075471698113207, + "acc_stderr": 0.030052580579557845, + "acc_norm": 0.6075471698113207, + "acc_norm_stderr": 0.030052580579557845 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.6805555555555556, + "acc_stderr": 0.03899073687357334, + "acc_norm": 0.6805555555555556, + "acc_norm_stderr": 0.03899073687357334 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.4, + "acc_stderr": 0.04923659639173309, + "acc_norm": 0.4, + "acc_norm_stderr": 0.04923659639173309 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.5, + "acc_stderr": 0.050251890762960605, + "acc_norm": 0.5, + "acc_norm_stderr": 0.050251890762960605 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.35, + "acc_stderr": 0.047937248544110196, + "acc_norm": 0.35, + "acc_norm_stderr": 0.047937248544110196 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.5549132947976878, + "acc_stderr": 0.03789401760283647, + "acc_norm": 0.5549132947976878, + "acc_norm_stderr": 0.03789401760283647 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.37254901960784315, + "acc_stderr": 0.04810840148082635, + "acc_norm": 0.37254901960784315, + "acc_norm_stderr": 0.04810840148082635 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.71, + "acc_stderr": 0.04560480215720684, + "acc_norm": 0.71, + "acc_norm_stderr": 0.04560480215720684 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.5234042553191489, + "acc_stderr": 0.032650194750335815, + "acc_norm": 0.5234042553191489, + "acc_norm_stderr": 0.032650194750335815 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.4298245614035088, + "acc_stderr": 0.04657047260594964, + "acc_norm": 0.4298245614035088, + "acc_norm_stderr": 0.04657047260594964 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.4689655172413793, + "acc_stderr": 0.04158632762097828, + "acc_norm": 0.4689655172413793, + "acc_norm_stderr": 0.04158632762097828 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.4365079365079365, + "acc_stderr": 0.025542846817400513, + "acc_norm": 0.4365079365079365, + "acc_norm_stderr": 0.025542846817400513 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.4126984126984127, + "acc_stderr": 0.04403438954768177, + "acc_norm": 0.4126984126984127, + "acc_norm_stderr": 0.04403438954768177 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.36, + "acc_stderr": 0.04824181513244218, + "acc_norm": 0.36, + "acc_norm_stderr": 0.04824181513244218 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.6806451612903226, + "acc_stderr": 0.026522709674667768, + "acc_norm": 0.6806451612903226, + "acc_norm_stderr": 0.026522709674667768 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.4630541871921182, + "acc_stderr": 0.035083705204426656, + "acc_norm": 0.4630541871921182, + "acc_norm_stderr": 0.035083705204426656 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.69, + "acc_stderr": 0.04648231987117316, + "acc_norm": 0.69, + "acc_norm_stderr": 0.04648231987117316 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.7575757575757576, + "acc_stderr": 0.03346409881055953, + "acc_norm": 0.7575757575757576, + "acc_norm_stderr": 0.03346409881055953 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.7424242424242424, + "acc_stderr": 0.031156269519646836, + 
"acc_norm": 0.7424242424242424, + "acc_norm_stderr": 0.031156269519646836 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.8549222797927462, + "acc_stderr": 0.025416343096306433, + "acc_norm": 0.8549222797927462, + "acc_norm_stderr": 0.025416343096306433 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.5769230769230769, + "acc_stderr": 0.02504919787604234, + "acc_norm": 0.5769230769230769, + "acc_norm_stderr": 0.02504919787604234 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.3, + "acc_stderr": 0.027940457136228402, + "acc_norm": 0.3, + "acc_norm_stderr": 0.027940457136228402 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.6176470588235294, + "acc_stderr": 0.031566630992154156, + "acc_norm": 0.6176470588235294, + "acc_norm_stderr": 0.031566630992154156 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.2847682119205298, + "acc_stderr": 0.03684881521389024, + "acc_norm": 0.2847682119205298, + "acc_norm_stderr": 0.03684881521389024 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.763302752293578, + "acc_stderr": 0.0182240781172991, + "acc_norm": 0.763302752293578, + "acc_norm_stderr": 0.0182240781172991 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.4305555555555556, + "acc_stderr": 0.03376922151252336, + "acc_norm": 0.4305555555555556, + "acc_norm_stderr": 0.03376922151252336 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.7745098039215687, + "acc_stderr": 0.029331162294251735, + "acc_norm": 0.7745098039215687, + "acc_norm_stderr": 0.029331162294251735 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.7679324894514767, + "acc_stderr": 0.027479744550808517, + "acc_norm": 0.7679324894514767, + "acc_norm_stderr": 0.027479744550808517 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.6905829596412556, + "acc_stderr": 0.03102441174057221, + "acc_norm": 0.6905829596412556, + "acc_norm_stderr": 0.03102441174057221 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.648854961832061, + "acc_stderr": 0.04186445163013751, + "acc_norm": 0.648854961832061, + "acc_norm_stderr": 0.04186445163013751 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.7933884297520661, + "acc_stderr": 0.03695980128098824, + "acc_norm": 0.7933884297520661, + "acc_norm_stderr": 0.03695980128098824 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.7407407407407407, + "acc_stderr": 0.04236511258094632, + "acc_norm": 0.7407407407407407, + "acc_norm_stderr": 0.04236511258094632 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.754601226993865, + "acc_stderr": 0.03380939813943354, + "acc_norm": 0.754601226993865, + "acc_norm_stderr": 0.03380939813943354 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.4732142857142857, + "acc_stderr": 0.047389751192741546, + "acc_norm": 0.4732142857142857, + "acc_norm_stderr": 0.047389751192741546 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.7475728155339806, + "acc_stderr": 0.04301250399690878, + "acc_norm": 0.7475728155339806, + "acc_norm_stderr": 0.04301250399690878 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.8205128205128205, + "acc_stderr": 0.025140935950335445, + "acc_norm": 0.8205128205128205, + "acc_norm_stderr": 0.025140935950335445 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.62, + "acc_stderr": 0.048783173121456316, + "acc_norm": 0.62, + "acc_norm_stderr": 
0.048783173121456316 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.7803320561941252, + "acc_stderr": 0.014805384478371151, + "acc_norm": 0.7803320561941252, + "acc_norm_stderr": 0.014805384478371151 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.6560693641618497, + "acc_stderr": 0.025574123786546665, + "acc_norm": 0.6560693641618497, + "acc_norm_stderr": 0.025574123786546665 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.23798882681564246, + "acc_stderr": 0.0142426300705749, + "acc_norm": 0.23798882681564246, + "acc_norm_stderr": 0.0142426300705749 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.6601307189542484, + "acc_stderr": 0.027121956071388856, + "acc_norm": 0.6601307189542484, + "acc_norm_stderr": 0.027121956071388856 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.662379421221865, + "acc_stderr": 0.02685882587948853, + "acc_norm": 0.662379421221865, + "acc_norm_stderr": 0.02685882587948853 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.6759259259259259, + "acc_stderr": 0.02604176620271716, + "acc_norm": 0.6759259259259259, + "acc_norm_stderr": 0.02604176620271716 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.475177304964539, + "acc_stderr": 0.029790719243829707, + "acc_norm": 0.475177304964539, + "acc_norm_stderr": 0.029790719243829707 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.4511082138200782, + "acc_stderr": 0.012709037347346233, + "acc_norm": 0.4511082138200782, + "acc_norm_stderr": 0.012709037347346233 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.5919117647058824, + "acc_stderr": 0.029855261393483924, + "acc_norm": 0.5919117647058824, + "acc_norm_stderr": 0.029855261393483924 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.6209150326797386, + "acc_stderr": 0.01962744474841223, + "acc_norm": 0.6209150326797386, + "acc_norm_stderr": 0.01962744474841223 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.6272727272727273, + "acc_stderr": 0.04631381319425465, + "acc_norm": 0.6272727272727273, + "acc_norm_stderr": 0.04631381319425465 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.6857142857142857, + "acc_stderr": 0.029719329422417475, + "acc_norm": 0.6857142857142857, + "acc_norm_stderr": 0.029719329422417475 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.7910447761194029, + "acc_stderr": 0.028748298931728655, + "acc_norm": 0.7910447761194029, + "acc_norm_stderr": 0.028748298931728655 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.83, + "acc_stderr": 0.0377525168068637, + "acc_norm": 0.83, + "acc_norm_stderr": 0.0377525168068637 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.5120481927710844, + "acc_stderr": 0.03891364495835816, + "acc_norm": 0.5120481927710844, + "acc_norm_stderr": 0.03891364495835816 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.7426900584795322, + "acc_stderr": 0.03352799844161865, + "acc_norm": 0.7426900584795322, + "acc_norm_stderr": 0.03352799844161865 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.2778457772337821, + "mc1_stderr": 0.015680929364024637, + "mc2": 0.44699355725588724, + "mc2_stderr": 0.015255919110214552 + }, + "harness|winogrande|5": { + "acc": 0.7379636937647988, + "acc_stderr": 0.012358944431637563 + }, + "harness|gsm8k|5": { + "acc": 0.40333586050037906, + "acc_stderr": 0.013512654781814687 + }, + "all": { + "acc": 0.5941805681088884, + "acc_stderr": 0.03328213036591988, + "acc_norm": 
0.5983564094269671, + "acc_norm_stderr": 0.03395331581770101, + "mc1": 0.2778457772337821, + "mc1_stderr": 0.015680929364024637, + "mc2": 0.44699355725588724, + "mc2_stderr": 0.015255919110214552 + } + }, + "versions": { + "all": 0, + "harness|arc:challenge|25": 0, + "harness|gsm8k|5": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "harness|winogrande|5": 0 + }, + "config_tasks": { + "harness|arc:challenge": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM 
Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task", + 
"harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|arc:challenge|25": { + "hashes": { + "hash_examples": "17b0cae357c0259e", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "9bcd0d1d37471713", + "hash_cont_tokens": "289aa98c400841d8" + }, + "truncated": 0, + "non_truncated": 1172, + "padded": 4670, + "non_padded": 17, + "effective_few_shots": 25.0, + "num_truncated_few_shots": 0 + }, + "harness|hellaswag|10": { + "hashes": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "80b8c6d79740318e", + "hash_cont_tokens": "ac460260c3e6efc9" + }, + "truncated": 0, + "non_truncated": 10042, + "padded": 40101, + "non_padded": 67, + "effective_few_shots": 10.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hashes": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "b813d36287c6556c", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-anatomy|5": { + "hashes": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "09dc2380497f7a47", + "hash_cont_tokens": "a52a4f60d98cbe5c" + }, + "truncated": 0, + "non_truncated": 135, + "padded": 540, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-astronomy|5": { + "hashes": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "68ca3220b0fdd1f3", + "hash_cont_tokens": "10f7d8eeba97841d" + }, + "truncated": 0, + "non_truncated": 152, + "padded": 608, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-business_ethics|5": { + "hashes": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "bd14ef1320de241e", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hashes": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "d96186ab98017c43", + "hash_cont_tokens": "edef9975ba9165b5" + }, + "truncated": 0, + "non_truncated": 265, + "padded": 1060, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_biology|5": { + "hashes": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "424136b34e95b200", + "hash_cont_tokens": "0aa103ec6602280b" + }, + "truncated": 0, + "non_truncated": 144, + "padded": 576, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_chemistry|5": { + "hashes": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "8dd8b80e336bbe54", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_computer_science|5": { + "hashes": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": 
"ed2bdb4e87c4b371", + "hash_input_tokens": "145d4cef8ca2261d", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_mathematics|5": { + "hashes": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "561995d32d2b25c4", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_medicine|5": { + "hashes": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "6a258a9d4418599c", + "hash_cont_tokens": "1979021dbc698754" + }, + "truncated": 0, + "non_truncated": 173, + "padded": 692, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_physics|5": { + "hashes": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "fa5e0d5b5f97b66a", + "hash_cont_tokens": "7cf7fe2bab00acbd" + }, + "truncated": 0, + "non_truncated": 102, + "padded": 408, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-computer_security|5": { + "hashes": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "07d27397edfae492", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hashes": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "da5e6c3c8eb17da6", + "hash_cont_tokens": "903f64eed2b0d217" + }, + "truncated": 0, + "non_truncated": 235, + "padded": 940, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-econometrics|5": { + "hashes": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "f6ba8e358bdb523e", + "hash_cont_tokens": "721ae6c5302c4bf2" + }, + "truncated": 0, + "non_truncated": 114, + "padded": 456, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hashes": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "b2459da4c5ca8590", + "hash_cont_tokens": "15a738960ed3e587" + }, + "truncated": 0, + "non_truncated": 145, + "padded": 575, + "non_padded": 5, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hashes": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "0b969d9ad706a13a", + "hash_cont_tokens": "c96470462fc71683" + }, + "truncated": 0, + "non_truncated": 378, + "padded": 1512, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-formal_logic|5": { + "hashes": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "02bc3eb5f90da86e", + "hash_cont_tokens": "0e1ce025c9d6ee7e" + }, + "truncated": 0, + "non_truncated": 126, 
+ "padded": 504, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-global_facts|5": { + "hashes": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "3d5106918bcbeb43", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_biology|5": { + "hashes": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "7b089392db2dabbd", + "hash_cont_tokens": "e34d57f7d3c4ca16" + }, + "truncated": 0, + "non_truncated": 310, + "padded": 1240, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hashes": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "ba90b2ffed1c067d", + "hash_cont_tokens": "e8482d44df4b3740" + }, + "truncated": 0, + "non_truncated": 203, + "padded": 812, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hashes": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "60eeec309ef0717f", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hashes": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "5e5e8bf3808e0ead", + "hash_cont_tokens": "d63e679a49418339" + }, + "truncated": 0, + "non_truncated": 165, + "padded": 656, + "non_padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_geography|5": { + "hashes": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "4da9b741d4e7ea78", + "hash_cont_tokens": "d78483e286d06f1a" + }, + "truncated": 0, + "non_truncated": 198, + "padded": 792, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hashes": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "acb4bc872ac86ed7", + "hash_cont_tokens": "691cdff71ff5fe57" + }, + "truncated": 0, + "non_truncated": 193, + "padded": 772, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hashes": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "840fc6403eb69ab0", + "hash_cont_tokens": "d5ad4c5bdca967ad" + }, + "truncated": 0, + "non_truncated": 390, + "padded": 1560, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hashes": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "3629a7f2cd17faeb", + "hash_cont_tokens": "8f631ca5687dd0d4" + }, + "truncated": 0, + "non_truncated": 270, + "padded": 1080, + "non_padded": 0, + "effective_few_shots": 5.0, + 
"num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hashes": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "6846f684260e3997", + "hash_cont_tokens": "7321048a28451473" + }, + "truncated": 0, + "non_truncated": 238, + "padded": 952, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_physics|5": { + "hashes": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "85aee25d6bdad94a", + "hash_cont_tokens": "bb137581f269861c" + }, + "truncated": 0, + "non_truncated": 151, + "padded": 604, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hashes": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "290b66d6d666a35f", + "hash_cont_tokens": "b455cab2675bd863" + }, + "truncated": 0, + "non_truncated": 545, + "padded": 2180, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hashes": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "a77a7668b437bc82", + "hash_cont_tokens": "1b3196fec7e58037" + }, + "truncated": 0, + "non_truncated": 216, + "padded": 864, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hashes": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "63548c7fa9ba7a78", + "hash_cont_tokens": "a331dedc2aa01b3e" + }, + "truncated": 0, + "non_truncated": 204, + "padded": 816, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hashes": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "83c5da18bfa50812", + "hash_cont_tokens": "d0fbe030b8c8c2bf" + }, + "truncated": 0, + "non_truncated": 237, + "padded": 948, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_aging|5": { + "hashes": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "bebbd11f22006685", + "hash_cont_tokens": "1dd29c3755494850" + }, + "truncated": 0, + "non_truncated": 223, + "padded": 892, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_sexuality|5": { + "hashes": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "7b85ee9b8ee54f4f", + "hash_cont_tokens": "c85573f663c10691" + }, + "truncated": 0, + "non_truncated": 131, + "padded": 524, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-international_law|5": { + "hashes": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "7bfc55ab7065943e", + "hash_cont_tokens": "d263804ba918154f" + }, + "truncated": 0, + "non_truncated": 121, + "padded": 484, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-jurisprudence|5": { + "hashes": { + "hash_examples": 
"083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "69573f1675e053c6", + "hash_cont_tokens": "581986691a84ece8" + }, + "truncated": 0, + "non_truncated": 108, + "padded": 432, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hashes": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "552324ef20094bdc", + "hash_cont_tokens": "55a858b28bbda458" + }, + "truncated": 0, + "non_truncated": 163, + "padded": 652, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-machine_learning|5": { + "hashes": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "96449357a7318905", + "hash_cont_tokens": "e99d3d3efd4ac7a3" + }, + "truncated": 0, + "non_truncated": 112, + "padded": 448, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-management|5": { + "hashes": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "3b849249168e3b88", + "hash_cont_tokens": "13d9dc56bca34726" + }, + "truncated": 0, + "non_truncated": 103, + "padded": 412, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-marketing|5": { + "hashes": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "af0e186f2756b70d", + "hash_cont_tokens": "2700ea26933916a2" + }, + "truncated": 0, + "non_truncated": 234, + "padded": 936, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-medical_genetics|5": { + "hashes": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "9f6a6de16509b6d9", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-miscellaneous|5": { + "hashes": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "9194406d589f7c10", + "hash_cont_tokens": "7bf4341c79587250" + }, + "truncated": 0, + "non_truncated": 783, + "padded": 3132, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_disputes|5": { + "hashes": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "769486efc74d9f8e", + "hash_cont_tokens": "38a48e9de6976f00" + }, + "truncated": 0, + "non_truncated": 346, + "padded": 1384, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hashes": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "a90fd4dd90959dad", + "hash_cont_tokens": "761c4dc187689d89" + }, + "truncated": 0, + "non_truncated": 895, + "padded": 3580, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-nutrition|5": { + "hashes": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "1a3b843e66efd29b", + "hash_cont_tokens": "65005bd7d6f6012a" + }, + "truncated": 0, + 
"non_truncated": 306, + "padded": 1224, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-philosophy|5": { + "hashes": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "09820001a3d00013", + "hash_cont_tokens": "0b47934fb6314dec" + }, + "truncated": 0, + "non_truncated": 311, + "padded": 1244, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-prehistory|5": { + "hashes": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "7c4ec364ce2768c7", + "hash_cont_tokens": "3f20acd855ee0a29" + }, + "truncated": 0, + "non_truncated": 324, + "padded": 1296, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_accounting|5": { + "hashes": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "ced0534574d0ae3f", + "hash_cont_tokens": "8f122ba881355d4b" + }, + "truncated": 0, + "non_truncated": 282, + "padded": 1128, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_law|5": { + "hashes": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "bcbdbbde22ec73e3", + "hash_cont_tokens": "90d5df417c4d3fd3" + }, + "truncated": 0, + "non_truncated": 1534, + "padded": 6136, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_medicine|5": { + "hashes": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "c54d753563114d45", + "hash_cont_tokens": "4a2d2988884f7f70" + }, + "truncated": 0, + "non_truncated": 272, + "padded": 1088, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_psychology|5": { + "hashes": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "b75dc55c0e32fa52", + "hash_cont_tokens": "e0a952cb8a9c81de" + }, + "truncated": 0, + "non_truncated": 612, + "padded": 2448, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-public_relations|5": { + "hashes": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "5ccdc8ec8db99622", + "hash_cont_tokens": "1fa77a8dff3922b8" + }, + "truncated": 0, + "non_truncated": 110, + "padded": 440, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-security_studies|5": { + "hashes": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "ca8497342e5b1d57", + "hash_cont_tokens": "81fc9cb3cbdd52db" + }, + "truncated": 0, + "non_truncated": 245, + "padded": 980, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-sociology|5": { + "hashes": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "069c76424fbd3dab", + "hash_cont_tokens": "2a0493252ed2cf43" + }, + "truncated": 0, + "non_truncated": 201, + "padded": 804, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + 
"harness|hendrycksTest-us_foreign_policy|5": { + "hashes": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "a7e393a626169576", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-virology|5": { + "hashes": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "bf99dc973e3a650d", + "hash_cont_tokens": "5ab892d003b00c98" + }, + "truncated": 0, + "non_truncated": 166, + "padded": 664, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-world_religions|5": { + "hashes": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "1761cfaf21797065", + "hash_cont_tokens": "15a5e5dbdfbb8568" + }, + "truncated": 0, + "non_truncated": 171, + "padded": 684, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|truthfulqa:mc|0": { + "hashes": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "298b43914bbdf4ca", + "hash_cont_tokens": "5a8d4bb398b1c3c0" + }, + "truncated": 0, + "non_truncated": 817, + "padded": 9996, + "non_padded": 0, + "effective_few_shots": 0.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "31aa3477d959f771", + "hash_cont_tokens": "618558fb93c0f288" + }, + "truncated": 0, + "non_truncated": 1267, + "padded": 2534, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "6af0ae8cfe684f50", + "hash_cont_tokens": "bf2514dcf2b1eb02" + }, + "truncated": 0, + "non_truncated": 1319, + "padded": 0, + "non_padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "3b7fa57a057f9415", + "hash_full_prompts": "63615fc50fc9417c", + "hash_input_tokens": "9c04e828ae29cacc", + "hash_cont_tokens": "89ce94141628858a" + }, + "truncated": 0, + "non_truncated": 28659, + "padded": 113460, + "non_padded": 1412, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/Devio/test-1400/results_2023-09-03T06-25-15.872451.json b/eval-results/Devio/test-1400/results_2023-09-03T06-25-15.872451.json new file mode 100644 index 0000000000000000000000000000000000000000..b57d4fde8adcc4460d6442af098a104eef27336b --- /dev/null +++ b/eval-results/Devio/test-1400/results_2023-09-03T06-25-15.872451.json @@ -0,0 +1,1366 @@ +{ + "config_general": { + "model_name": "Devio/test-1400", + "model_sha": "95194d494effb691edae0d596bc5df9856ee92d7", + "model_dtype": "torch.bfloat16", + "lighteval_sha": "4b9a38c2102259daeb17d1d0294fc2b4ce7f4e63", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|arc:challenge|25": { + "acc": 0.35238907849829354, + "acc_stderr": 0.013960142600598685, + "acc_norm": 0.38139931740614336, + "acc_norm_stderr": 0.014194389086685263 + }, + "harness|hellaswag|10": { + "acc": 0.4785899223262298, + "acc_stderr": 
0.004985204766555062, + "acc_norm": 0.6619199362676758, + "acc_norm_stderr": 0.004720891597174716 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.24, + "acc_stderr": 0.04292346959909283, + "acc_norm": 0.24, + "acc_norm_stderr": 0.04292346959909283 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.22962962962962963, + "acc_stderr": 0.036333844140734636, + "acc_norm": 0.22962962962962963, + "acc_norm_stderr": 0.036333844140734636 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.3684210526315789, + "acc_stderr": 0.03925523381052932, + "acc_norm": 0.3684210526315789, + "acc_norm_stderr": 0.03925523381052932 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.21, + "acc_stderr": 0.040936018074033256, + "acc_norm": 0.21, + "acc_norm_stderr": 0.040936018074033256 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.3169811320754717, + "acc_stderr": 0.028637235639800935, + "acc_norm": 0.3169811320754717, + "acc_norm_stderr": 0.028637235639800935 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.24305555555555555, + "acc_stderr": 0.03586879280080343, + "acc_norm": 0.24305555555555555, + "acc_norm_stderr": 0.03586879280080343 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.32, + "acc_stderr": 0.046882617226215034, + "acc_norm": 0.32, + "acc_norm_stderr": 0.046882617226215034 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.33, + "acc_stderr": 0.047258156262526045, + "acc_norm": 0.33, + "acc_norm_stderr": 0.047258156262526045 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.31, + "acc_stderr": 0.04648231987117316, + "acc_norm": 0.31, + "acc_norm_stderr": 0.04648231987117316 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.3063583815028902, + "acc_stderr": 0.03514942551267439, + "acc_norm": 0.3063583815028902, + "acc_norm_stderr": 0.03514942551267439 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.28431372549019607, + "acc_stderr": 0.04488482852329017, + "acc_norm": 0.28431372549019607, + "acc_norm_stderr": 0.04488482852329017 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.28, + "acc_stderr": 0.04512608598542127, + "acc_norm": 0.28, + "acc_norm_stderr": 0.04512608598542127 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.3446808510638298, + "acc_stderr": 0.03106898596312215, + "acc_norm": 0.3446808510638298, + "acc_norm_stderr": 0.03106898596312215 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.23684210526315788, + "acc_stderr": 0.039994238792813344, + "acc_norm": 0.23684210526315788, + "acc_norm_stderr": 0.039994238792813344 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.2620689655172414, + "acc_stderr": 0.036646663372252565, + "acc_norm": 0.2620689655172414, + "acc_norm_stderr": 0.036646663372252565 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.2857142857142857, + "acc_stderr": 0.023266512213730564, + "acc_norm": 0.2857142857142857, + "acc_norm_stderr": 0.023266512213730564 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.31746031746031744, + "acc_stderr": 0.04163453031302859, + "acc_norm": 0.31746031746031744, + "acc_norm_stderr": 0.04163453031302859 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.34, + "acc_stderr": 0.04760952285695236, + "acc_norm": 0.34, + "acc_norm_stderr": 0.04760952285695236 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.3225806451612903, + "acc_stderr": 0.026593084516572274, + "acc_norm": 
0.3225806451612903, + "acc_norm_stderr": 0.026593084516572274 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.2857142857142857, + "acc_stderr": 0.0317852971064275, + "acc_norm": 0.2857142857142857, + "acc_norm_stderr": 0.0317852971064275 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.19, + "acc_stderr": 0.03942772444036624, + "acc_norm": 0.19, + "acc_norm_stderr": 0.03942772444036624 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.2606060606060606, + "acc_stderr": 0.034277431758165236, + "acc_norm": 0.2606060606060606, + "acc_norm_stderr": 0.034277431758165236 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.3686868686868687, + "acc_stderr": 0.034373055019806184, + "acc_norm": 0.3686868686868687, + "acc_norm_stderr": 0.034373055019806184 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.35233160621761656, + "acc_stderr": 0.03447478286414359, + "acc_norm": 0.35233160621761656, + "acc_norm_stderr": 0.03447478286414359 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.36153846153846153, + "acc_stderr": 0.02435958146539698, + "acc_norm": 0.36153846153846153, + "acc_norm_stderr": 0.02435958146539698 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.24074074074074073, + "acc_stderr": 0.026067159222275805, + "acc_norm": 0.24074074074074073, + "acc_norm_stderr": 0.026067159222275805 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.33613445378151263, + "acc_stderr": 0.03068473711513536, + "acc_norm": 0.33613445378151263, + "acc_norm_stderr": 0.03068473711513536 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.32450331125827814, + "acc_stderr": 0.03822746937658754, + "acc_norm": 0.32450331125827814, + "acc_norm_stderr": 0.03822746937658754 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.3376146788990826, + "acc_stderr": 0.0202752659866389, + "acc_norm": 0.3376146788990826, + "acc_norm_stderr": 0.0202752659866389 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.4074074074074074, + "acc_stderr": 0.03350991604696043, + "acc_norm": 0.4074074074074074, + "acc_norm_stderr": 0.03350991604696043 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.25980392156862747, + "acc_stderr": 0.03077855467869326, + "acc_norm": 0.25980392156862747, + "acc_norm_stderr": 0.03077855467869326 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.19831223628691982, + "acc_stderr": 0.025955020841621115, + "acc_norm": 0.19831223628691982, + "acc_norm_stderr": 0.025955020841621115 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.26905829596412556, + "acc_stderr": 0.029763779406874972, + "acc_norm": 0.26905829596412556, + "acc_norm_stderr": 0.029763779406874972 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.31297709923664124, + "acc_stderr": 0.04066962905677697, + "acc_norm": 0.31297709923664124, + "acc_norm_stderr": 0.04066962905677697 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.1322314049586777, + "acc_stderr": 0.030922788320445784, + "acc_norm": 0.1322314049586777, + "acc_norm_stderr": 0.030922788320445784 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.25, + "acc_stderr": 0.04186091791394607, + "acc_norm": 0.25, + "acc_norm_stderr": 0.04186091791394607 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.2085889570552147, + "acc_stderr": 0.03192193448934722, + 
"acc_norm": 0.2085889570552147, + "acc_norm_stderr": 0.03192193448934722 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.16071428571428573, + "acc_stderr": 0.03485946096475741, + "acc_norm": 0.16071428571428573, + "acc_norm_stderr": 0.03485946096475741 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.4077669902912621, + "acc_stderr": 0.048657775704107696, + "acc_norm": 0.4077669902912621, + "acc_norm_stderr": 0.048657775704107696 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.23504273504273504, + "acc_stderr": 0.02777883590493543, + "acc_norm": 0.23504273504273504, + "acc_norm_stderr": 0.02777883590493543 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.25, + "acc_stderr": 0.04351941398892446, + "acc_norm": 0.25, + "acc_norm_stderr": 0.04351941398892446 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.24521072796934865, + "acc_stderr": 0.015384352284543932, + "acc_norm": 0.24521072796934865, + "acc_norm_stderr": 0.015384352284543932 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.2745664739884393, + "acc_stderr": 0.024027745155265023, + "acc_norm": 0.2745664739884393, + "acc_norm_stderr": 0.024027745155265023 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.2536312849162011, + "acc_stderr": 0.014551553659369922, + "acc_norm": 0.2536312849162011, + "acc_norm_stderr": 0.014551553659369922 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.3104575163398693, + "acc_stderr": 0.0264930332251459, + "acc_norm": 0.3104575163398693, + "acc_norm_stderr": 0.0264930332251459 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.29260450160771706, + "acc_stderr": 0.02583989833487798, + "acc_norm": 0.29260450160771706, + "acc_norm_stderr": 0.02583989833487798 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.25617283950617287, + "acc_stderr": 0.0242885336377261, + "acc_norm": 0.25617283950617287, + "acc_norm_stderr": 0.0242885336377261 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.28368794326241137, + "acc_stderr": 0.026891709428343957, + "acc_norm": 0.28368794326241137, + "acc_norm_stderr": 0.026891709428343957 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.2438070404172099, + "acc_stderr": 0.010966507972178479, + "acc_norm": 0.2438070404172099, + "acc_norm_stderr": 0.010966507972178479 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.4227941176470588, + "acc_stderr": 0.03000856284500347, + "acc_norm": 0.4227941176470588, + "acc_norm_stderr": 0.03000856284500347 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.23529411764705882, + "acc_stderr": 0.01716058723504635, + "acc_norm": 0.23529411764705882, + "acc_norm_stderr": 0.01716058723504635 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.2818181818181818, + "acc_stderr": 0.043091187099464585, + "acc_norm": 0.2818181818181818, + "acc_norm_stderr": 0.043091187099464585 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.4, + "acc_stderr": 0.031362502409358936, + "acc_norm": 0.4, + "acc_norm_stderr": 0.031362502409358936 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.3333333333333333, + "acc_stderr": 0.03333333333333335, + "acc_norm": 0.3333333333333333, + "acc_norm_stderr": 0.03333333333333335 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.3, + "acc_stderr": 0.046056618647183814, + "acc_norm": 0.3, + "acc_norm_stderr": 0.046056618647183814 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.25301204819277107, 
+ "acc_stderr": 0.03384429155233136, + "acc_norm": 0.25301204819277107, + "acc_norm_stderr": 0.03384429155233136 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.23391812865497075, + "acc_stderr": 0.03246721765117826, + "acc_norm": 0.23391812865497075, + "acc_norm_stderr": 0.03246721765117826 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.22766217870257038, + "mc1_stderr": 0.01467925503211107, + "mc2": 0.3686966632375142, + "mc2_stderr": 0.014163025545486835 + }, + "all": { + "acc": 0.29066385939253414, + "acc_stderr": 0.032634153881095015, + "acc_norm": 0.2942628467289629, + "acc_norm_stderr": 0.03263364427629342, + "mc1": 0.22766217870257038, + "mc1_stderr": 0.01467925503211107, + "mc2": 0.3686966632375142, + "mc2_stderr": 0.014163025545486835 + } + }, + "versions": { + "harness|arc:challenge|25": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + 
"harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "all": 0 + }, + "config_tasks": { + "harness|arc:challenge": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM 
Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task" + }, + "summary_tasks": { + "harness|arc:challenge|25": { + "hashes": { + "hash_examples": "17b0cae357c0259e", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "61571bf68d6d89aa", + "hash_cont_tokens": "8210decc6ff6f7df" + }, + "truncated": 0, + "non-truncated": 4687, + "padded": 4687, + "non-padded": 0, + "effective_few_shots": 25.0, + "num_truncated_few_shots": 0 + }, + "harness|hellaswag|10": { + "hashes": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "29906669b1c7054a", + "hash_cont_tokens": "b3b9e9017afa63af" + }, + "truncated": 0, + "non-truncated": 40168, + "padded": 40113, + "non-padded": 55, + "effective_few_shots": 10.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hashes": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "c54ff61ad0273dd7", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-anatomy|5": { + "hashes": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "be31a1e22aef5f90", + "hash_cont_tokens": "f11971a765cb609f" + }, + "truncated": 0, + "non-truncated": 540, + "padded": 540, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-astronomy|5": { + "hashes": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "277a7b1fad566940", + "hash_cont_tokens": "bf30e5d3f48250cb" + }, + "truncated": 0, + "non-truncated": 608, + "padded": 608, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-business_ethics|5": { + "hashes": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "ba552605bc116de5", + "hash_cont_tokens": "bc1dd9b2d995eb61" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hashes": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "428c7563d0b98ab9", + "hash_cont_tokens": "890a119624b3b935" + }, + "truncated": 0, + "non-truncated": 1060, + "padded": 1060, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_biology|5": { + "hashes": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "da036601573942e2", + "hash_cont_tokens": "875cde3af7a0ee14" + }, + "truncated": 0, + "non-truncated": 576, + "padded": 576, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_chemistry|5": { + "hashes": 
{ + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "94e0196d6aded13d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_computer_science|5": { + "hashes": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "6e4d0f4a8d36690b", + "hash_cont_tokens": "ffc0fe414cdc4a83" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_mathematics|5": { + "hashes": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "614054d17109a25d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_medicine|5": { + "hashes": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "1d633b3cc0524ba8", + "hash_cont_tokens": "1f88b00d41957d82" + }, + "truncated": 0, + "non-truncated": 692, + "padded": 692, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_physics|5": { + "hashes": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "5421d9a1af86cbd4", + "hash_cont_tokens": "f7b8097afc16a47c" + }, + "truncated": 0, + "non-truncated": 408, + "padded": 408, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-computer_security|5": { + "hashes": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "5e6b70ecb333cf18", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hashes": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "c2ef11a87264ceed", + "hash_cont_tokens": "aa0e8bc655f2f641" + }, + "truncated": 0, + "non-truncated": 940, + "padded": 940, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-econometrics|5": { + "hashes": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "ecaccd912a4c3978", + "hash_cont_tokens": "bfb7e3c3c88313f1" + }, + "truncated": 0, + "non-truncated": 456, + "padded": 456, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hashes": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "1590c84291399be8", + "hash_cont_tokens": "2425a3f084a591ef" + }, + "truncated": 0, + "non-truncated": 580, + "padded": 580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hashes": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "3269597f715b0da1", + 
"hash_cont_tokens": "f52691aef15a407b" + }, + "truncated": 0, + "non-truncated": 1512, + "padded": 1512, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-formal_logic|5": { + "hashes": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "a2800d20f3ab8d7c", + "hash_cont_tokens": "f515d598d9c21263" + }, + "truncated": 0, + "non-truncated": 504, + "padded": 504, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-global_facts|5": { + "hashes": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "94ed44b3772505ad", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_biology|5": { + "hashes": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "24423acb928db768", + "hash_cont_tokens": "bd85a4156a3613ee" + }, + "truncated": 0, + "non-truncated": 1240, + "padded": 1240, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hashes": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "831ff35c474e5cef", + "hash_cont_tokens": "a95c97af1c14e068" + }, + "truncated": 0, + "non-truncated": 812, + "padded": 812, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hashes": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "8c34e0f2bda77358", + "hash_cont_tokens": "8abfedef914e33c9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hashes": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "f1f73dd687da18d7", + "hash_cont_tokens": "674fc454bdc5ac93" + }, + "truncated": 660, + "non-truncated": 0, + "padded": 0, + "non-padded": 660, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_geography|5": { + "hashes": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "7c5547c7da5bc793", + "hash_cont_tokens": "03a5012b916274ea" + }, + "truncated": 0, + "non-truncated": 792, + "padded": 792, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hashes": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "f62991cb6a496b05", + "hash_cont_tokens": "a83effb8f76b7d7c" + }, + "truncated": 0, + "non-truncated": 772, + "padded": 772, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hashes": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "4cef2aff6e3d59ed", + "hash_cont_tokens": "c583432ad27fcfe0" + }, + "truncated": 0, + "non-truncated": 1560, + "padded": 
1560, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hashes": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "6e2577ea4082ed2b", + "hash_cont_tokens": "24f5dc613660300b" + }, + "truncated": 0, + "non-truncated": 1080, + "padded": 1080, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hashes": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "c5fc9aeb1079c8e4", + "hash_cont_tokens": "f47f041de50333b9" + }, + "truncated": 0, + "non-truncated": 952, + "padded": 952, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_physics|5": { + "hashes": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "555fc385cffa84ca", + "hash_cont_tokens": "ba2efcd283e938cc" + }, + "truncated": 0, + "non-truncated": 604, + "padded": 604, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hashes": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "febd23cbf9973b7f", + "hash_cont_tokens": "942069cd363844d9" + }, + "truncated": 0, + "non-truncated": 2180, + "padded": 2180, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hashes": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "424b02981230ee83", + "hash_cont_tokens": "955ed42b6f7fa019" + }, + "truncated": 0, + "non-truncated": 864, + "padded": 864, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hashes": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "50c9ff438c85a69e", + "hash_cont_tokens": "cdd0b3dc06d933e5" + }, + "truncated": 816, + "non-truncated": 0, + "padded": 0, + "non-padded": 816, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hashes": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "054824cc474caef5", + "hash_cont_tokens": "9a864184946033ac" + }, + "truncated": 8, + "non-truncated": 940, + "padded": 940, + "non-padded": 8, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_aging|5": { + "hashes": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "541a75f071dcf579", + "hash_cont_tokens": "142a4a8a1138a214" + }, + "truncated": 0, + "non-truncated": 892, + "padded": 892, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_sexuality|5": { + "hashes": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "04269e5c5a257dd9", + "hash_cont_tokens": "bc54813e809b796d" + }, + "truncated": 0, + "non-truncated": 524, + "padded": 524, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + 
"harness|hendrycksTest-international_law|5": { + "hashes": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "d93ba9d9d38e4397", + "hash_cont_tokens": "dc45b45fcda18e5d" + }, + "truncated": 0, + "non-truncated": 484, + "padded": 484, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-jurisprudence|5": { + "hashes": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "9eeaccd2698b4f5a", + "hash_cont_tokens": "e3a8cd951b6e3469" + }, + "truncated": 0, + "non-truncated": 432, + "padded": 432, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hashes": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "b4f08f544f2b7576", + "hash_cont_tokens": "1e80dbd30f6453d5" + }, + "truncated": 0, + "non-truncated": 652, + "padded": 648, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-machine_learning|5": { + "hashes": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "900c2a51f1174b9f", + "hash_cont_tokens": "9b37da7777378ca9" + }, + "truncated": 0, + "non-truncated": 448, + "padded": 448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-management|5": { + "hashes": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "6b36efb4689c6eca", + "hash_cont_tokens": "a01d6d39a83c4597" + }, + "truncated": 0, + "non-truncated": 412, + "padded": 412, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-marketing|5": { + "hashes": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "2aaac78a0cfed47a", + "hash_cont_tokens": "6aeaed4d823c98aa" + }, + "truncated": 0, + "non-truncated": 936, + "padded": 936, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-medical_genetics|5": { + "hashes": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "886ca823b41c094a", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-miscellaneous|5": { + "hashes": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "72fd71de7675e7d0", + "hash_cont_tokens": "9b0ab02a64603081" + }, + "truncated": 0, + "non-truncated": 3132, + "padded": 3132, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_disputes|5": { + "hashes": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "f3ca0dd8e7a1eb09", + "hash_cont_tokens": "8badf768f7b0467a" + }, + "truncated": 0, + "non-truncated": 1384, + "padded": 1354, + "non-padded": 30, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hashes": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": 
"3e793631e951f23c", + "hash_cont_tokens": "32ae620376b2bbba" + }, + "truncated": 0, + "non-truncated": 3580, + "padded": 3580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-nutrition|5": { + "hashes": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "59753c2144ea93af", + "hash_cont_tokens": "3071def75bacc404" + }, + "truncated": 0, + "non-truncated": 1224, + "padded": 1224, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-philosophy|5": { + "hashes": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "bd8d3dbed15a8c34", + "hash_cont_tokens": "9f6ff69d23a48783" + }, + "truncated": 0, + "non-truncated": 1244, + "padded": 1244, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-prehistory|5": { + "hashes": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "3573cd87facbb7c5", + "hash_cont_tokens": "de469d2b981e32a3" + }, + "truncated": 0, + "non-truncated": 1296, + "padded": 1296, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_accounting|5": { + "hashes": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "17e721bc1a7cbb47", + "hash_cont_tokens": "c46f74d2dfc7b13b" + }, + "truncated": 0, + "non-truncated": 1128, + "padded": 1128, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_law|5": { + "hashes": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "9178e10bd0763ec4", + "hash_cont_tokens": "2e590029ef41fbcd" + }, + "truncated": 604, + "non-truncated": 5532, + "padded": 5524, + "non-padded": 612, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_medicine|5": { + "hashes": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "f5a22012a54f70ea", + "hash_cont_tokens": "fe35cfa9c6ca802e" + }, + "truncated": 0, + "non-truncated": 1088, + "padded": 1088, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_psychology|5": { + "hashes": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "0dfb73a8eb3f692c", + "hash_cont_tokens": "f020fbddf72c8652" + }, + "truncated": 0, + "non-truncated": 2448, + "padded": 2448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-public_relations|5": { + "hashes": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "1710c6ba4c9f3cbd", + "hash_cont_tokens": "568f585a259965c1" + }, + "truncated": 0, + "non-truncated": 440, + "padded": 440, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-security_studies|5": { + "hashes": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "d49711415961ced7", + "hash_cont_tokens": "cc6fd7cccd64cd5d" + }, + "truncated": 0, + "non-truncated": 980, + "padded": 980, + "non-padded": 
0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-sociology|5": { + "hashes": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "828999f7624cbe7e", + "hash_cont_tokens": "c3a3bdfd177eed5b" + }, + "truncated": 0, + "non-truncated": 804, + "padded": 804, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hashes": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "42054621e718dbee", + "hash_cont_tokens": "2568d0e8e36fa959" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-virology|5": { + "hashes": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "6c4f0aa4dc859c04", + "hash_cont_tokens": "926cf60b0891f374" + }, + "truncated": 0, + "non-truncated": 664, + "padded": 664, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-world_religions|5": { + "hashes": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "6c75d44e092ff24f", + "hash_cont_tokens": "c525a5de974c1ea3" + }, + "truncated": 0, + "non-truncated": 684, + "padded": 684, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|truthfulqa:mc|0": { + "hashes": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "2738d7ed7075faa7", + "hash_cont_tokens": "c014154380b74b9e" + }, + "truncated": 0, + "non-truncated": 9996, + "padded": 9996, + "non-padded": 0, + "effective_few_shots": 0.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "d84d18e9a963753d", + "hash_full_prompts": "12b540783521a8e6", + "hash_input_tokens": "6fecf578c508db6a", + "hash_cont_tokens": "fb1646e2bdd5fc38" + }, + "total_evaluation_time_secondes": "6229.124125242233", + "truncated": 2088, + "non-truncated": 108931, + "padded": 108834, + "non-padded": 2185, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/Devio/test-22B/results_2023-09-02T01-38-52.675251.json b/eval-results/Devio/test-22B/results_2023-09-02T01-38-52.675251.json new file mode 100644 index 0000000000000000000000000000000000000000..9fad4df60402af6bc71a2f7cbafe00d1e74eed9a --- /dev/null +++ b/eval-results/Devio/test-22B/results_2023-09-02T01-38-52.675251.json @@ -0,0 +1,1366 @@ +{ + "config_general": { + "model_name": "Devio/test-22B", + "model_sha": "cd72f5954ab5801dd2c1b499e59265f7504f9ee6", + "model_dtype": "torch.bfloat16", + "lighteval_sha": "4b9a38c2102259daeb17d1d0294fc2b4ce7f4e63", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|arc:challenge|25": { + "acc": 0.3438566552901024, + "acc_stderr": 0.01388064457015621, + "acc_norm": 0.39419795221843, + "acc_norm_stderr": 0.014280522667467325 + }, + "harness|hellaswag|10": { + "acc": 0.4690300736904999, + "acc_stderr": 0.004980200451851677, + "acc_norm": 0.6450906193985262, + "acc_norm_stderr": 0.004775079636567092 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.26, + "acc_stderr": 0.04408440022768081, + "acc_norm": 0.26, + 
"acc_norm_stderr": 0.04408440022768081 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.22962962962962963, + "acc_stderr": 0.03633384414073462, + "acc_norm": 0.22962962962962963, + "acc_norm_stderr": 0.03633384414073462 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.32894736842105265, + "acc_stderr": 0.03823428969926604, + "acc_norm": 0.32894736842105265, + "acc_norm_stderr": 0.03823428969926604 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.17, + "acc_stderr": 0.0377525168068637, + "acc_norm": 0.17, + "acc_norm_stderr": 0.0377525168068637 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.2981132075471698, + "acc_stderr": 0.028152837942493854, + "acc_norm": 0.2981132075471698, + "acc_norm_stderr": 0.028152837942493854 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.24305555555555555, + "acc_stderr": 0.03586879280080341, + "acc_norm": 0.24305555555555555, + "acc_norm_stderr": 0.03586879280080341 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.37, + "acc_stderr": 0.04852365870939099, + "acc_norm": 0.37, + "acc_norm_stderr": 0.04852365870939099 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.29, + "acc_stderr": 0.045604802157206845, + "acc_norm": 0.29, + "acc_norm_stderr": 0.045604802157206845 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.24, + "acc_stderr": 0.042923469599092816, + "acc_norm": 0.24, + "acc_norm_stderr": 0.042923469599092816 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.3063583815028902, + "acc_stderr": 0.03514942551267438, + "acc_norm": 0.3063583815028902, + "acc_norm_stderr": 0.03514942551267438 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.29411764705882354, + "acc_stderr": 0.045338381959297736, + "acc_norm": 0.29411764705882354, + "acc_norm_stderr": 0.045338381959297736 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.21, + "acc_stderr": 0.04093601807403326, + "acc_norm": 0.21, + "acc_norm_stderr": 0.04093601807403326 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.28936170212765955, + "acc_stderr": 0.02964400657700962, + "acc_norm": 0.28936170212765955, + "acc_norm_stderr": 0.02964400657700962 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.23684210526315788, + "acc_stderr": 0.039994238792813344, + "acc_norm": 0.23684210526315788, + "acc_norm_stderr": 0.039994238792813344 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.2689655172413793, + "acc_stderr": 0.036951833116502325, + "acc_norm": 0.2689655172413793, + "acc_norm_stderr": 0.036951833116502325 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.2698412698412698, + "acc_stderr": 0.022860838309232072, + "acc_norm": 0.2698412698412698, + "acc_norm_stderr": 0.022860838309232072 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.23015873015873015, + "acc_stderr": 0.03764950879790606, + "acc_norm": 0.23015873015873015, + "acc_norm_stderr": 0.03764950879790606 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.33, + "acc_stderr": 0.047258156262526045, + "acc_norm": 0.33, + "acc_norm_stderr": 0.047258156262526045 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.33225806451612905, + "acc_stderr": 0.026795560848122797, + "acc_norm": 0.33225806451612905, + "acc_norm_stderr": 0.026795560848122797 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.2955665024630542, + "acc_stderr": 0.032104944337514575, + "acc_norm": 0.2955665024630542, 
+ "acc_norm_stderr": 0.032104944337514575 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.22, + "acc_stderr": 0.041633319989322695, + "acc_norm": 0.22, + "acc_norm_stderr": 0.041633319989322695 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.24848484848484848, + "acc_stderr": 0.03374402644139404, + "acc_norm": 0.24848484848484848, + "acc_norm_stderr": 0.03374402644139404 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.35353535353535354, + "acc_stderr": 0.03406086723547153, + "acc_norm": 0.35353535353535354, + "acc_norm_stderr": 0.03406086723547153 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.35233160621761656, + "acc_stderr": 0.03447478286414359, + "acc_norm": 0.35233160621761656, + "acc_norm_stderr": 0.03447478286414359 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.3564102564102564, + "acc_stderr": 0.024283140529467295, + "acc_norm": 0.3564102564102564, + "acc_norm_stderr": 0.024283140529467295 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.26296296296296295, + "acc_stderr": 0.02684205787383371, + "acc_norm": 0.26296296296296295, + "acc_norm_stderr": 0.02684205787383371 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.35294117647058826, + "acc_stderr": 0.031041941304059288, + "acc_norm": 0.35294117647058826, + "acc_norm_stderr": 0.031041941304059288 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.36423841059602646, + "acc_stderr": 0.03929111781242741, + "acc_norm": 0.36423841059602646, + "acc_norm_stderr": 0.03929111781242741 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.3486238532110092, + "acc_stderr": 0.020431254090714324, + "acc_norm": 0.3486238532110092, + "acc_norm_stderr": 0.020431254090714324 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.3888888888888889, + "acc_stderr": 0.03324708911809117, + "acc_norm": 0.3888888888888889, + "acc_norm_stderr": 0.03324708911809117 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.25980392156862747, + "acc_stderr": 0.03077855467869326, + "acc_norm": 0.25980392156862747, + "acc_norm_stderr": 0.03077855467869326 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.20253164556962025, + "acc_stderr": 0.026160568246601457, + "acc_norm": 0.20253164556962025, + "acc_norm_stderr": 0.026160568246601457 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.1210762331838565, + "acc_stderr": 0.021894174113185737, + "acc_norm": 0.1210762331838565, + "acc_norm_stderr": 0.021894174113185737 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.32061068702290074, + "acc_stderr": 0.04093329229834277, + "acc_norm": 0.32061068702290074, + "acc_norm_stderr": 0.04093329229834277 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.17355371900826447, + "acc_stderr": 0.03457272836917671, + "acc_norm": 0.17355371900826447, + "acc_norm_stderr": 0.03457272836917671 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.2222222222222222, + "acc_stderr": 0.040191074725573483, + "acc_norm": 0.2222222222222222, + "acc_norm_stderr": 0.040191074725573483 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.2331288343558282, + "acc_stderr": 0.033220157957767414, + "acc_norm": 0.2331288343558282, + "acc_norm_stderr": 0.033220157957767414 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.16964285714285715, + "acc_stderr": 
0.03562367850095391, + "acc_norm": 0.16964285714285715, + "acc_norm_stderr": 0.03562367850095391 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.3786407766990291, + "acc_stderr": 0.04802694698258972, + "acc_norm": 0.3786407766990291, + "acc_norm_stderr": 0.04802694698258972 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.19658119658119658, + "acc_stderr": 0.02603538609895129, + "acc_norm": 0.19658119658119658, + "acc_norm_stderr": 0.02603538609895129 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.23, + "acc_stderr": 0.04229525846816506, + "acc_norm": 0.23, + "acc_norm_stderr": 0.04229525846816506 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.21839080459770116, + "acc_stderr": 0.0147743583199345, + "acc_norm": 0.21839080459770116, + "acc_norm_stderr": 0.0147743583199345 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.21676300578034682, + "acc_stderr": 0.02218347766841286, + "acc_norm": 0.21676300578034682, + "acc_norm_stderr": 0.02218347766841286 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.24134078212290502, + "acc_stderr": 0.014310999547961459, + "acc_norm": 0.24134078212290502, + "acc_norm_stderr": 0.014310999547961459 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.29411764705882354, + "acc_stderr": 0.026090162504279053, + "acc_norm": 0.29411764705882354, + "acc_norm_stderr": 0.026090162504279053 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.2604501607717042, + "acc_stderr": 0.02492672322484554, + "acc_norm": 0.2604501607717042, + "acc_norm_stderr": 0.02492672322484554 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.22839506172839505, + "acc_stderr": 0.023358211840626267, + "acc_norm": 0.22839506172839505, + "acc_norm_stderr": 0.023358211840626267 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.2695035460992908, + "acc_stderr": 0.026469036818590624, + "acc_norm": 0.2695035460992908, + "acc_norm_stderr": 0.026469036818590624 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.24511082138200782, + "acc_stderr": 0.010986307870045517, + "acc_norm": 0.24511082138200782, + "acc_norm_stderr": 0.010986307870045517 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.4485294117647059, + "acc_stderr": 0.030211479609121593, + "acc_norm": 0.4485294117647059, + "acc_norm_stderr": 0.030211479609121593 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.2173202614379085, + "acc_stderr": 0.01668482092914859, + "acc_norm": 0.2173202614379085, + "acc_norm_stderr": 0.01668482092914859 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.2545454545454545, + "acc_stderr": 0.04172343038705383, + "acc_norm": 0.2545454545454545, + "acc_norm_stderr": 0.04172343038705383 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.4, + "acc_stderr": 0.031362502409358936, + "acc_norm": 0.4, + "acc_norm_stderr": 0.031362502409358936 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.29850746268656714, + "acc_stderr": 0.032357437893550424, + "acc_norm": 0.29850746268656714, + "acc_norm_stderr": 0.032357437893550424 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.26, + "acc_stderr": 0.04408440022768078, + "acc_norm": 0.26, + "acc_norm_stderr": 0.04408440022768078 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.1927710843373494, + "acc_stderr": 0.030709824050565274, + "acc_norm": 0.1927710843373494, + "acc_norm_stderr": 0.030709824050565274 + }, + "harness|hendrycksTest-world_religions|5": { 
+ "acc": 0.1695906432748538, + "acc_stderr": 0.028782108105401712, + "acc_norm": 0.1695906432748538, + "acc_norm_stderr": 0.028782108105401712 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.22888616891064872, + "mc1_stderr": 0.014706994909055027, + "mc2": 0.3713156999785537, + "mc2_stderr": 0.014209414703476026 + }, + "all": { + "acc": 0.2758923391191029, + "acc_stderr": 0.03201388252201146, + "acc_norm": 0.2797296584858208, + "acc_norm_stderr": 0.032017183492893266, + "mc1": 0.22888616891064872, + "mc1_stderr": 0.014706994909055027, + "mc2": 0.3713156999785537, + "mc2_stderr": 0.014209414703476026 + } + }, + "versions": { + "harness|arc:challenge|25": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 
1, + "harness|truthfulqa:mc|0": 1, + "all": 0 + }, + "config_tasks": { + "harness|arc:challenge": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + 
"harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task" + }, + "summary_tasks": { + "harness|arc:challenge|25": { + "hashes": { + "hash_examples": "17b0cae357c0259e", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "61571bf68d6d89aa", + "hash_cont_tokens": "8210decc6ff6f7df" + }, + "truncated": 0, + "non-truncated": 4687, + "padded": 4687, + "non-padded": 0, + "effective_few_shots": 25.0, + "num_truncated_few_shots": 0 + }, + "harness|hellaswag|10": { + "hashes": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "29906669b1c7054a", + "hash_cont_tokens": "b3b9e9017afa63af" + }, + "truncated": 0, + "non-truncated": 40168, + "padded": 40113, + "non-padded": 55, + "effective_few_shots": 10.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hashes": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "c54ff61ad0273dd7", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-anatomy|5": { + "hashes": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "be31a1e22aef5f90", + "hash_cont_tokens": "f11971a765cb609f" + }, + "truncated": 0, + "non-truncated": 540, + "padded": 540, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-astronomy|5": { + "hashes": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "277a7b1fad566940", + "hash_cont_tokens": "bf30e5d3f48250cb" + }, + "truncated": 0, + "non-truncated": 608, + "padded": 608, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-business_ethics|5": { + "hashes": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "ba552605bc116de5", + "hash_cont_tokens": "bc1dd9b2d995eb61" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hashes": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "428c7563d0b98ab9", + "hash_cont_tokens": "890a119624b3b935" + }, + "truncated": 0, + "non-truncated": 1060, + "padded": 1060, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_biology|5": { + "hashes": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "da036601573942e2", + "hash_cont_tokens": "875cde3af7a0ee14" + }, + "truncated": 0, + "non-truncated": 576, + "padded": 576, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_chemistry|5": { + "hashes": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "94e0196d6aded13d", + "hash_cont_tokens": 
"50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_computer_science|5": { + "hashes": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "6e4d0f4a8d36690b", + "hash_cont_tokens": "ffc0fe414cdc4a83" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_mathematics|5": { + "hashes": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "614054d17109a25d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_medicine|5": { + "hashes": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "1d633b3cc0524ba8", + "hash_cont_tokens": "1f88b00d41957d82" + }, + "truncated": 0, + "non-truncated": 692, + "padded": 692, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_physics|5": { + "hashes": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "5421d9a1af86cbd4", + "hash_cont_tokens": "f7b8097afc16a47c" + }, + "truncated": 0, + "non-truncated": 408, + "padded": 408, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-computer_security|5": { + "hashes": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "5e6b70ecb333cf18", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hashes": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "c2ef11a87264ceed", + "hash_cont_tokens": "aa0e8bc655f2f641" + }, + "truncated": 0, + "non-truncated": 940, + "padded": 940, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-econometrics|5": { + "hashes": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "ecaccd912a4c3978", + "hash_cont_tokens": "bfb7e3c3c88313f1" + }, + "truncated": 0, + "non-truncated": 456, + "padded": 456, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hashes": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "1590c84291399be8", + "hash_cont_tokens": "2425a3f084a591ef" + }, + "truncated": 0, + "non-truncated": 580, + "padded": 580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hashes": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "3269597f715b0da1", + "hash_cont_tokens": "f52691aef15a407b" + }, + "truncated": 0, + "non-truncated": 1512, + "padded": 1512, + "non-padded": 0, + "effective_few_shots": 5.0, + 
"num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-formal_logic|5": { + "hashes": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "a2800d20f3ab8d7c", + "hash_cont_tokens": "f515d598d9c21263" + }, + "truncated": 0, + "non-truncated": 504, + "padded": 504, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-global_facts|5": { + "hashes": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "94ed44b3772505ad", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_biology|5": { + "hashes": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "24423acb928db768", + "hash_cont_tokens": "bd85a4156a3613ee" + }, + "truncated": 0, + "non-truncated": 1240, + "padded": 1240, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hashes": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "831ff35c474e5cef", + "hash_cont_tokens": "a95c97af1c14e068" + }, + "truncated": 0, + "non-truncated": 812, + "padded": 812, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hashes": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "8c34e0f2bda77358", + "hash_cont_tokens": "8abfedef914e33c9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hashes": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "f1f73dd687da18d7", + "hash_cont_tokens": "674fc454bdc5ac93" + }, + "truncated": 660, + "non-truncated": 0, + "padded": 0, + "non-padded": 660, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_geography|5": { + "hashes": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "7c5547c7da5bc793", + "hash_cont_tokens": "03a5012b916274ea" + }, + "truncated": 0, + "non-truncated": 792, + "padded": 792, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hashes": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "f62991cb6a496b05", + "hash_cont_tokens": "a83effb8f76b7d7c" + }, + "truncated": 0, + "non-truncated": 772, + "padded": 772, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hashes": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "4cef2aff6e3d59ed", + "hash_cont_tokens": "c583432ad27fcfe0" + }, + "truncated": 0, + "non-truncated": 1560, + "padded": 1560, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + 
"hashes": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "6e2577ea4082ed2b", + "hash_cont_tokens": "24f5dc613660300b" + }, + "truncated": 0, + "non-truncated": 1080, + "padded": 1080, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hashes": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "c5fc9aeb1079c8e4", + "hash_cont_tokens": "f47f041de50333b9" + }, + "truncated": 0, + "non-truncated": 952, + "padded": 952, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_physics|5": { + "hashes": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "555fc385cffa84ca", + "hash_cont_tokens": "ba2efcd283e938cc" + }, + "truncated": 0, + "non-truncated": 604, + "padded": 604, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hashes": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "febd23cbf9973b7f", + "hash_cont_tokens": "942069cd363844d9" + }, + "truncated": 0, + "non-truncated": 2180, + "padded": 2180, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hashes": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "424b02981230ee83", + "hash_cont_tokens": "955ed42b6f7fa019" + }, + "truncated": 0, + "non-truncated": 864, + "padded": 864, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hashes": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "50c9ff438c85a69e", + "hash_cont_tokens": "cdd0b3dc06d933e5" + }, + "truncated": 816, + "non-truncated": 0, + "padded": 0, + "non-padded": 816, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hashes": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "054824cc474caef5", + "hash_cont_tokens": "9a864184946033ac" + }, + "truncated": 8, + "non-truncated": 940, + "padded": 940, + "non-padded": 8, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_aging|5": { + "hashes": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "541a75f071dcf579", + "hash_cont_tokens": "142a4a8a1138a214" + }, + "truncated": 0, + "non-truncated": 892, + "padded": 892, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_sexuality|5": { + "hashes": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "04269e5c5a257dd9", + "hash_cont_tokens": "bc54813e809b796d" + }, + "truncated": 0, + "non-truncated": 524, + "padded": 524, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-international_law|5": { + "hashes": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": 
"d93ba9d9d38e4397", + "hash_cont_tokens": "dc45b45fcda18e5d" + }, + "truncated": 0, + "non-truncated": 484, + "padded": 484, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-jurisprudence|5": { + "hashes": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "9eeaccd2698b4f5a", + "hash_cont_tokens": "e3a8cd951b6e3469" + }, + "truncated": 0, + "non-truncated": 432, + "padded": 432, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hashes": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "b4f08f544f2b7576", + "hash_cont_tokens": "1e80dbd30f6453d5" + }, + "truncated": 0, + "non-truncated": 652, + "padded": 648, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-machine_learning|5": { + "hashes": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "900c2a51f1174b9f", + "hash_cont_tokens": "9b37da7777378ca9" + }, + "truncated": 0, + "non-truncated": 448, + "padded": 448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-management|5": { + "hashes": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "6b36efb4689c6eca", + "hash_cont_tokens": "a01d6d39a83c4597" + }, + "truncated": 0, + "non-truncated": 412, + "padded": 412, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-marketing|5": { + "hashes": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "2aaac78a0cfed47a", + "hash_cont_tokens": "6aeaed4d823c98aa" + }, + "truncated": 0, + "non-truncated": 936, + "padded": 936, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-medical_genetics|5": { + "hashes": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "886ca823b41c094a", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-miscellaneous|5": { + "hashes": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "72fd71de7675e7d0", + "hash_cont_tokens": "9b0ab02a64603081" + }, + "truncated": 0, + "non-truncated": 3132, + "padded": 3132, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_disputes|5": { + "hashes": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "f3ca0dd8e7a1eb09", + "hash_cont_tokens": "8badf768f7b0467a" + }, + "truncated": 0, + "non-truncated": 1384, + "padded": 1354, + "non-padded": 30, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hashes": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "3e793631e951f23c", + "hash_cont_tokens": "32ae620376b2bbba" + }, + "truncated": 0, + "non-truncated": 3580, + "padded": 3580, + "non-padded": 0, + "effective_few_shots": 5.0, + 
"num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-nutrition|5": { + "hashes": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "59753c2144ea93af", + "hash_cont_tokens": "3071def75bacc404" + }, + "truncated": 0, + "non-truncated": 1224, + "padded": 1224, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-philosophy|5": { + "hashes": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "bd8d3dbed15a8c34", + "hash_cont_tokens": "9f6ff69d23a48783" + }, + "truncated": 0, + "non-truncated": 1244, + "padded": 1244, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-prehistory|5": { + "hashes": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "3573cd87facbb7c5", + "hash_cont_tokens": "de469d2b981e32a3" + }, + "truncated": 0, + "non-truncated": 1296, + "padded": 1296, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_accounting|5": { + "hashes": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "17e721bc1a7cbb47", + "hash_cont_tokens": "c46f74d2dfc7b13b" + }, + "truncated": 0, + "non-truncated": 1128, + "padded": 1128, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_law|5": { + "hashes": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "9178e10bd0763ec4", + "hash_cont_tokens": "2e590029ef41fbcd" + }, + "truncated": 604, + "non-truncated": 5532, + "padded": 5524, + "non-padded": 612, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_medicine|5": { + "hashes": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "f5a22012a54f70ea", + "hash_cont_tokens": "fe35cfa9c6ca802e" + }, + "truncated": 0, + "non-truncated": 1088, + "padded": 1088, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_psychology|5": { + "hashes": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "0dfb73a8eb3f692c", + "hash_cont_tokens": "f020fbddf72c8652" + }, + "truncated": 0, + "non-truncated": 2448, + "padded": 2448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-public_relations|5": { + "hashes": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "1710c6ba4c9f3cbd", + "hash_cont_tokens": "568f585a259965c1" + }, + "truncated": 0, + "non-truncated": 440, + "padded": 440, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-security_studies|5": { + "hashes": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "d49711415961ced7", + "hash_cont_tokens": "cc6fd7cccd64cd5d" + }, + "truncated": 0, + "non-truncated": 980, + "padded": 980, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-sociology|5": { + "hashes": { + "hash_examples": "e7959df87dea8672", + 
"hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "828999f7624cbe7e", + "hash_cont_tokens": "c3a3bdfd177eed5b" + }, + "truncated": 0, + "non-truncated": 804, + "padded": 804, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hashes": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "42054621e718dbee", + "hash_cont_tokens": "2568d0e8e36fa959" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-virology|5": { + "hashes": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "6c4f0aa4dc859c04", + "hash_cont_tokens": "926cf60b0891f374" + }, + "truncated": 0, + "non-truncated": 664, + "padded": 664, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-world_religions|5": { + "hashes": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "6c75d44e092ff24f", + "hash_cont_tokens": "c525a5de974c1ea3" + }, + "truncated": 0, + "non-truncated": 684, + "padded": 684, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|truthfulqa:mc|0": { + "hashes": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "2738d7ed7075faa7", + "hash_cont_tokens": "c014154380b74b9e" + }, + "truncated": 0, + "non-truncated": 9996, + "padded": 9996, + "non-padded": 0, + "effective_few_shots": 0.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "d84d18e9a963753d", + "hash_full_prompts": "12b540783521a8e6", + "hash_input_tokens": "6fecf578c508db6a", + "hash_cont_tokens": "fb1646e2bdd5fc38" + }, + "total_evaluation_time_secondes": "6153.056109905243", + "truncated": 2088, + "non-truncated": 108931, + "padded": 108834, + "non-padded": 2185, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/Devio/test-22B/results_2023-10-16T03-23-54.397499.json b/eval-results/Devio/test-22B/results_2023-10-16T03-23-54.397499.json new file mode 100644 index 0000000000000000000000000000000000000000..28800a0731ea5d0fbec9d56dd6593f814747a681 --- /dev/null +++ b/eval-results/Devio/test-22B/results_2023-10-16T03-23-54.397499.json @@ -0,0 +1,107 @@ +{ + "config_general": { + "model_name": "Devio/test-22B", + "model_sha": "9a9db8625209b6c47b7e80e4a98e095650b7f353", + "model_size": "40.7 GB", + "model_dtype": "torch.bfloat16", + "lighteval_sha": "0f318ecf002208468154899217b3ba7c6ae09374", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|drop|3": { + "em": 0.002936241610738255, + "em_stderr": 0.0005541113054709917, + "f1": 0.03323510906040272, + "f1_stderr": 0.0011026689087019657 + }, + "harness|gsm8k|5": { + "acc": 0.0037907505686125853, + "acc_stderr": 0.0016927007401501832 + }, + "harness|winogrande|5": { + "acc": 0.5769534333070244, + "acc_stderr": 0.013885055359056472 + }, + "all": { + "em": 0.002936241610738255, + "em_stderr": 0.0005541113054709917, + "f1": 0.03323510906040272, + "f1_stderr": 0.0011026689087019657, + "acc": 0.2903720919378185, + "acc_stderr": 0.0077888780496033275 + } + }, + "versions": { + "harness|drop|3": 1, 
+ "harness|gsm8k|5": 0, + "harness|winogrande|5": 0, + "all": 0 + }, + "config_tasks": { + "harness|drop": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|drop|3": { + "hashes": { + "hash_examples": "1d27416e8324e9a3", + "hash_full_prompts": "a5513ff9a741b385", + "hash_input_tokens": "61b608e0b5ceed76", + "hash_cont_tokens": "88e4a0eecc23c960" + }, + "truncated": 1263, + "non-truncated": 8273, + "padded": 0, + "non-padded": 9536, + "effective_few_shots": 3.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "bda342e47b5099b2", + "hash_cont_tokens": "ec596a75878398f0" + }, + "truncated": 0, + "non-truncated": 1319, + "padded": 0, + "non-padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "c0bedf98cb040854", + "hash_cont_tokens": "f08975ad6f2d5864" + }, + "truncated": 0, + "non-truncated": 2534, + "padded": 2432, + "non-padded": 102, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "9b4d8993161e637d", + "hash_full_prompts": "08215e527b7e60a5", + "hash_input_tokens": "80afe720f936f8d2", + "hash_cont_tokens": "d8fd7c444d1bba9a" + }, + "total_evaluation_time_secondes": "23919.04701924324", + "truncated": 1263, + "non-truncated": 12126, + "padded": 2432, + "non-padded": 10957, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/Devio/test-3b/results_2023-09-02T16-42-09.049307.json b/eval-results/Devio/test-3b/results_2023-09-02T16-42-09.049307.json new file mode 100644 index 0000000000000000000000000000000000000000..9b981d5d2a7357040334c7127a6f5afb53096345 --- /dev/null +++ b/eval-results/Devio/test-3b/results_2023-09-02T16-42-09.049307.json @@ -0,0 +1,1366 @@ +{ + "config_general": { + "model_name": "Devio/test-3b", + "model_sha": "b81c038ee2fa2addd285acde08b1a7ca3cb2854d", + "model_dtype": "torch.bfloat16", + "lighteval_sha": "4b9a38c2102259daeb17d1d0294fc2b4ce7f4e63", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|arc:challenge|25": { + "acc": 0.25597269624573377, + "acc_stderr": 0.012753013241244521, + "acc_norm": 0.2764505119453925, + "acc_norm_stderr": 0.013069662474252428 + }, + "harness|hellaswag|10": { + "acc": 0.35988846843258315, + "acc_stderr": 0.004789865379084508, + "acc_norm": 0.4479187412865963, + "acc_norm_stderr": 0.004962638446396 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.24, + "acc_stderr": 0.042923469599092816, + "acc_norm": 0.24, + "acc_norm_stderr": 0.042923469599092816 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.2518518518518518, + "acc_stderr": 0.037498507091740206, + "acc_norm": 0.2518518518518518, + "acc_norm_stderr": 0.037498507091740206 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.21710526315789475, + "acc_stderr": 0.03355045304882923, + "acc_norm": 0.21710526315789475, + "acc_norm_stderr": 0.03355045304882923 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.22, + "acc_stderr": 0.041633319989322695, + "acc_norm": 0.22, + "acc_norm_stderr": 0.041633319989322695 + }, + 
"harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.2339622641509434, + "acc_stderr": 0.026055296901152922, + "acc_norm": 0.2339622641509434, + "acc_norm_stderr": 0.026055296901152922 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.2152777777777778, + "acc_stderr": 0.03437079344106135, + "acc_norm": 0.2152777777777778, + "acc_norm_stderr": 0.03437079344106135 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.18, + "acc_stderr": 0.038612291966536955, + "acc_norm": 0.18, + "acc_norm_stderr": 0.038612291966536955 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.19, + "acc_stderr": 0.039427724440366234, + "acc_norm": 0.19, + "acc_norm_stderr": 0.039427724440366234 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.22, + "acc_stderr": 0.04163331998932269, + "acc_norm": 0.22, + "acc_norm_stderr": 0.04163331998932269 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.21965317919075145, + "acc_stderr": 0.031568093627031744, + "acc_norm": 0.21965317919075145, + "acc_norm_stderr": 0.031568093627031744 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.19607843137254902, + "acc_stderr": 0.03950581861179963, + "acc_norm": 0.19607843137254902, + "acc_norm_stderr": 0.03950581861179963 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.18, + "acc_stderr": 0.03861229196653694, + "acc_norm": 0.18, + "acc_norm_stderr": 0.03861229196653694 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.3276595744680851, + "acc_stderr": 0.030683020843231, + "acc_norm": 0.3276595744680851, + "acc_norm_stderr": 0.030683020843231 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.21929824561403508, + "acc_stderr": 0.03892431106518754, + "acc_norm": 0.21929824561403508, + "acc_norm_stderr": 0.03892431106518754 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.25517241379310346, + "acc_stderr": 0.03632984052707842, + "acc_norm": 0.25517241379310346, + "acc_norm_stderr": 0.03632984052707842 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.2566137566137566, + "acc_stderr": 0.022494510767503154, + "acc_norm": 0.2566137566137566, + "acc_norm_stderr": 0.022494510767503154 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.19047619047619047, + "acc_stderr": 0.03512207412302052, + "acc_norm": 0.19047619047619047, + "acc_norm_stderr": 0.03512207412302052 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.18, + "acc_stderr": 0.038612291966536934, + "acc_norm": 0.18, + "acc_norm_stderr": 0.038612291966536934 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.29354838709677417, + "acc_stderr": 0.02590608702131929, + "acc_norm": 0.29354838709677417, + "acc_norm_stderr": 0.02590608702131929 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.2315270935960591, + "acc_stderr": 0.029678333141444455, + "acc_norm": 0.2315270935960591, + "acc_norm_stderr": 0.029678333141444455 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.16, + "acc_stderr": 0.036845294917747094, + "acc_norm": 0.16, + "acc_norm_stderr": 0.036845294917747094 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.22424242424242424, + "acc_stderr": 0.032568666616811015, + "acc_norm": 0.22424242424242424, + "acc_norm_stderr": 0.032568666616811015 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.2474747474747475, + "acc_stderr": 0.030746300742124498, + "acc_norm": 0.2474747474747475, + 
"acc_norm_stderr": 0.030746300742124498 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.23316062176165803, + "acc_stderr": 0.030516111371476008, + "acc_norm": 0.23316062176165803, + "acc_norm_stderr": 0.030516111371476008 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.24871794871794872, + "acc_stderr": 0.0219169577092138, + "acc_norm": 0.24871794871794872, + "acc_norm_stderr": 0.0219169577092138 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.24074074074074073, + "acc_stderr": 0.026067159222275805, + "acc_norm": 0.24074074074074073, + "acc_norm_stderr": 0.026067159222275805 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.24369747899159663, + "acc_stderr": 0.02788682807838056, + "acc_norm": 0.24369747899159663, + "acc_norm_stderr": 0.02788682807838056 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.2052980132450331, + "acc_stderr": 0.03297986648473836, + "acc_norm": 0.2052980132450331, + "acc_norm_stderr": 0.03297986648473836 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.22568807339449543, + "acc_stderr": 0.01792308766780305, + "acc_norm": 0.22568807339449543, + "acc_norm_stderr": 0.01792308766780305 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.4351851851851852, + "acc_stderr": 0.033812000056435254, + "acc_norm": 0.4351851851851852, + "acc_norm_stderr": 0.033812000056435254 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.21568627450980393, + "acc_stderr": 0.028867431449849303, + "acc_norm": 0.21568627450980393, + "acc_norm_stderr": 0.028867431449849303 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.26582278481012656, + "acc_stderr": 0.028756799629658342, + "acc_norm": 0.26582278481012656, + "acc_norm_stderr": 0.028756799629658342 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.18834080717488788, + "acc_stderr": 0.026241132996407252, + "acc_norm": 0.18834080717488788, + "acc_norm_stderr": 0.026241132996407252 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.22900763358778625, + "acc_stderr": 0.036853466317118506, + "acc_norm": 0.22900763358778625, + "acc_norm_stderr": 0.036853466317118506 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.256198347107438, + "acc_stderr": 0.03984979653302872, + "acc_norm": 0.256198347107438, + "acc_norm_stderr": 0.03984979653302872 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.2222222222222222, + "acc_stderr": 0.040191074725573483, + "acc_norm": 0.2222222222222222, + "acc_norm_stderr": 0.040191074725573483 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.294478527607362, + "acc_stderr": 0.03581165790474082, + "acc_norm": 0.294478527607362, + "acc_norm_stderr": 0.03581165790474082 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.24107142857142858, + "acc_stderr": 0.04059867246952687, + "acc_norm": 0.24107142857142858, + "acc_norm_stderr": 0.04059867246952687 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.20388349514563106, + "acc_stderr": 0.0398913985953177, + "acc_norm": 0.20388349514563106, + "acc_norm_stderr": 0.0398913985953177 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.20085470085470086, + "acc_stderr": 0.026246772946890477, + "acc_norm": 0.20085470085470086, + "acc_norm_stderr": 0.026246772946890477 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.3, + "acc_stderr": 0.046056618647183814, + "acc_norm": 0.3, + 
"acc_norm_stderr": 0.046056618647183814 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.28607918263090676, + "acc_stderr": 0.016160871405127543, + "acc_norm": 0.28607918263090676, + "acc_norm_stderr": 0.016160871405127543 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.1994219653179191, + "acc_stderr": 0.021511900654252524, + "acc_norm": 0.1994219653179191, + "acc_norm_stderr": 0.021511900654252524 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.23798882681564246, + "acc_stderr": 0.014242630070574875, + "acc_norm": 0.23798882681564246, + "acc_norm_stderr": 0.014242630070574875 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.24836601307189543, + "acc_stderr": 0.02473998135511359, + "acc_norm": 0.24836601307189543, + "acc_norm_stderr": 0.02473998135511359 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.19292604501607716, + "acc_stderr": 0.022411516780911366, + "acc_norm": 0.19292604501607716, + "acc_norm_stderr": 0.022411516780911366 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.26851851851851855, + "acc_stderr": 0.024659685185967277, + "acc_norm": 0.26851851851851855, + "acc_norm_stderr": 0.024659685185967277 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.2553191489361702, + "acc_stderr": 0.026011992930902013, + "acc_norm": 0.2553191489361702, + "acc_norm_stderr": 0.026011992930902013 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.24902216427640156, + "acc_stderr": 0.01104489226404077, + "acc_norm": 0.24902216427640156, + "acc_norm_stderr": 0.01104489226404077 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.24632352941176472, + "acc_stderr": 0.02617343857052, + "acc_norm": 0.24632352941176472, + "acc_norm_stderr": 0.02617343857052 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.25326797385620914, + "acc_stderr": 0.017593486895366835, + "acc_norm": 0.25326797385620914, + "acc_norm_stderr": 0.017593486895366835 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.24545454545454545, + "acc_stderr": 0.04122066502878285, + "acc_norm": 0.24545454545454545, + "acc_norm_stderr": 0.04122066502878285 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.24081632653061225, + "acc_stderr": 0.027372942201788163, + "acc_norm": 0.24081632653061225, + "acc_norm_stderr": 0.027372942201788163 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.208955223880597, + "acc_stderr": 0.028748298931728655, + "acc_norm": 0.208955223880597, + "acc_norm_stderr": 0.028748298931728655 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.27, + "acc_stderr": 0.0446196043338474, + "acc_norm": 0.27, + "acc_norm_stderr": 0.0446196043338474 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.19879518072289157, + "acc_stderr": 0.031069390260789424, + "acc_norm": 0.19879518072289157, + "acc_norm_stderr": 0.031069390260789424 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.21052631578947367, + "acc_stderr": 0.0312678171466318, + "acc_norm": 0.21052631578947367, + "acc_norm_stderr": 0.0312678171466318 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.23011015911872704, + "mc1_stderr": 0.014734557959807765, + "mc2": 0.41415759101311883, + "mc2_stderr": 0.014688710447803573 + }, + "all": { + "acc": 0.2377565764307277, + "acc_stderr": 0.030680693861815076, + "acc_norm": 0.23959569657570515, + "acc_norm_stderr": 0.030688989155040824, + "mc1": 0.23011015911872704, + "mc1_stderr": 0.014734557959807765, + "mc2": 
0.41415759101311883, + "mc2_stderr": 0.014688710447803573 + } + }, + "versions": { + "harness|arc:challenge|25": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "all": 0 + }, + "config_tasks": { + "harness|arc:challenge": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + 
"harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task" + }, + "summary_tasks": { + "harness|arc:challenge|25": { + "hashes": { + "hash_examples": "17b0cae357c0259e", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "3722289b79076c44", + 
"hash_cont_tokens": "8210decc6ff6f7df" + }, + "truncated": 0, + "non-truncated": 4687, + "padded": 4687, + "non-padded": 0, + "effective_few_shots": 25.0, + "num_truncated_few_shots": 0 + }, + "harness|hellaswag|10": { + "hashes": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "ececd684171f1ef2", + "hash_cont_tokens": "b3b9e9017afa63af" + }, + "truncated": 0, + "non-truncated": 40168, + "padded": 40113, + "non-padded": 55, + "effective_few_shots": 10.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hashes": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "c54ff61ad0273dd7", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-anatomy|5": { + "hashes": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "be31a1e22aef5f90", + "hash_cont_tokens": "f11971a765cb609f" + }, + "truncated": 0, + "non-truncated": 540, + "padded": 540, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-astronomy|5": { + "hashes": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "277a7b1fad566940", + "hash_cont_tokens": "bf30e5d3f48250cb" + }, + "truncated": 0, + "non-truncated": 608, + "padded": 608, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-business_ethics|5": { + "hashes": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "ba552605bc116de5", + "hash_cont_tokens": "bc1dd9b2d995eb61" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hashes": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "428c7563d0b98ab9", + "hash_cont_tokens": "890a119624b3b935" + }, + "truncated": 0, + "non-truncated": 1060, + "padded": 1060, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_biology|5": { + "hashes": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "da036601573942e2", + "hash_cont_tokens": "875cde3af7a0ee14" + }, + "truncated": 0, + "non-truncated": 576, + "padded": 576, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_chemistry|5": { + "hashes": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "94e0196d6aded13d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_computer_science|5": { + "hashes": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "6e4d0f4a8d36690b", + "hash_cont_tokens": "ffc0fe414cdc4a83" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + 
"num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_mathematics|5": { + "hashes": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "614054d17109a25d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_medicine|5": { + "hashes": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "081bb2b524defd1c", + "hash_cont_tokens": "1f88b00d41957d82" + }, + "truncated": 0, + "non-truncated": 692, + "padded": 692, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_physics|5": { + "hashes": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "5421d9a1af86cbd4", + "hash_cont_tokens": "f7b8097afc16a47c" + }, + "truncated": 0, + "non-truncated": 408, + "padded": 408, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-computer_security|5": { + "hashes": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "5e6b70ecb333cf18", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hashes": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "c2ef11a87264ceed", + "hash_cont_tokens": "aa0e8bc655f2f641" + }, + "truncated": 0, + "non-truncated": 940, + "padded": 940, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-econometrics|5": { + "hashes": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "ecaccd912a4c3978", + "hash_cont_tokens": "bfb7e3c3c88313f1" + }, + "truncated": 0, + "non-truncated": 456, + "padded": 456, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hashes": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "1590c84291399be8", + "hash_cont_tokens": "2425a3f084a591ef" + }, + "truncated": 0, + "non-truncated": 580, + "padded": 580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hashes": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "3269597f715b0da1", + "hash_cont_tokens": "f52691aef15a407b" + }, + "truncated": 0, + "non-truncated": 1512, + "padded": 1512, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-formal_logic|5": { + "hashes": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "a2800d20f3ab8d7c", + "hash_cont_tokens": "f515d598d9c21263" + }, + "truncated": 0, + "non-truncated": 504, + "padded": 504, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-global_facts|5": { + "hashes": { + "hash_examples": "371d70d743b2b89b", + 
"hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "94ed44b3772505ad", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_biology|5": { + "hashes": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "24423acb928db768", + "hash_cont_tokens": "bd85a4156a3613ee" + }, + "truncated": 0, + "non-truncated": 1240, + "padded": 1240, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hashes": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "831ff35c474e5cef", + "hash_cont_tokens": "a95c97af1c14e068" + }, + "truncated": 0, + "non-truncated": 812, + "padded": 812, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hashes": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "a20a96b44dcc5b30", + "hash_cont_tokens": "8abfedef914e33c9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hashes": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "5002f4ac8b1562ca", + "hash_cont_tokens": "674fc454bdc5ac93" + }, + "truncated": 0, + "non-truncated": 660, + "padded": 656, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_geography|5": { + "hashes": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "7c5547c7da5bc793", + "hash_cont_tokens": "03a5012b916274ea" + }, + "truncated": 0, + "non-truncated": 792, + "padded": 792, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hashes": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "f62991cb6a496b05", + "hash_cont_tokens": "a83effb8f76b7d7c" + }, + "truncated": 0, + "non-truncated": 772, + "padded": 772, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hashes": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "4cef2aff6e3d59ed", + "hash_cont_tokens": "c583432ad27fcfe0" + }, + "truncated": 0, + "non-truncated": 1560, + "padded": 1560, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hashes": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "6e2577ea4082ed2b", + "hash_cont_tokens": "24f5dc613660300b" + }, + "truncated": 0, + "non-truncated": 1080, + "padded": 1080, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hashes": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": 
"c5fc9aeb1079c8e4", + "hash_cont_tokens": "f47f041de50333b9" + }, + "truncated": 0, + "non-truncated": 952, + "padded": 952, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_physics|5": { + "hashes": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "555fc385cffa84ca", + "hash_cont_tokens": "ba2efcd283e938cc" + }, + "truncated": 0, + "non-truncated": 604, + "padded": 604, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hashes": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "febd23cbf9973b7f", + "hash_cont_tokens": "942069cd363844d9" + }, + "truncated": 0, + "non-truncated": 2180, + "padded": 2180, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hashes": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "400e55b56ee6fbd7", + "hash_cont_tokens": "955ed42b6f7fa019" + }, + "truncated": 0, + "non-truncated": 864, + "padded": 864, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hashes": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "c639cce12a46ebad", + "hash_cont_tokens": "cdd0b3dc06d933e5" + }, + "truncated": 0, + "non-truncated": 816, + "padded": 816, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hashes": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "b9762065cce6f3a6", + "hash_cont_tokens": "9a864184946033ac" + }, + "truncated": 0, + "non-truncated": 948, + "padded": 948, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_aging|5": { + "hashes": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "541a75f071dcf579", + "hash_cont_tokens": "142a4a8a1138a214" + }, + "truncated": 0, + "non-truncated": 892, + "padded": 892, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_sexuality|5": { + "hashes": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "04269e5c5a257dd9", + "hash_cont_tokens": "bc54813e809b796d" + }, + "truncated": 0, + "non-truncated": 524, + "padded": 524, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-international_law|5": { + "hashes": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "d93ba9d9d38e4397", + "hash_cont_tokens": "dc45b45fcda18e5d" + }, + "truncated": 0, + "non-truncated": 484, + "padded": 484, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-jurisprudence|5": { + "hashes": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "9eeaccd2698b4f5a", + "hash_cont_tokens": "e3a8cd951b6e3469" + }, + "truncated": 0, + "non-truncated": 432, + "padded": 432, + 
"non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hashes": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "b4f08f544f2b7576", + "hash_cont_tokens": "1e80dbd30f6453d5" + }, + "truncated": 0, + "non-truncated": 652, + "padded": 648, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-machine_learning|5": { + "hashes": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "900c2a51f1174b9f", + "hash_cont_tokens": "9b37da7777378ca9" + }, + "truncated": 0, + "non-truncated": 448, + "padded": 448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-management|5": { + "hashes": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "6b36efb4689c6eca", + "hash_cont_tokens": "a01d6d39a83c4597" + }, + "truncated": 0, + "non-truncated": 412, + "padded": 412, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-marketing|5": { + "hashes": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "2aaac78a0cfed47a", + "hash_cont_tokens": "6aeaed4d823c98aa" + }, + "truncated": 0, + "non-truncated": 936, + "padded": 936, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-medical_genetics|5": { + "hashes": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "886ca823b41c094a", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-miscellaneous|5": { + "hashes": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "72fd71de7675e7d0", + "hash_cont_tokens": "9b0ab02a64603081" + }, + "truncated": 0, + "non-truncated": 3132, + "padded": 3132, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_disputes|5": { + "hashes": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "f3ca0dd8e7a1eb09", + "hash_cont_tokens": "8badf768f7b0467a" + }, + "truncated": 0, + "non-truncated": 1384, + "padded": 1354, + "non-padded": 30, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hashes": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "3e793631e951f23c", + "hash_cont_tokens": "32ae620376b2bbba" + }, + "truncated": 0, + "non-truncated": 3580, + "padded": 3580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-nutrition|5": { + "hashes": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "59753c2144ea93af", + "hash_cont_tokens": "3071def75bacc404" + }, + "truncated": 0, + "non-truncated": 1224, + "padded": 1224, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-philosophy|5": { + "hashes": { + "hash_examples": "9b455b7d72811cc8", 
+ "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "bd8d3dbed15a8c34", + "hash_cont_tokens": "9f6ff69d23a48783" + }, + "truncated": 0, + "non-truncated": 1244, + "padded": 1244, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-prehistory|5": { + "hashes": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "3573cd87facbb7c5", + "hash_cont_tokens": "de469d2b981e32a3" + }, + "truncated": 0, + "non-truncated": 1296, + "padded": 1296, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_accounting|5": { + "hashes": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "17e721bc1a7cbb47", + "hash_cont_tokens": "c46f74d2dfc7b13b" + }, + "truncated": 0, + "non-truncated": 1128, + "padded": 1128, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_law|5": { + "hashes": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "c9f7583fff66d361", + "hash_cont_tokens": "2e590029ef41fbcd" + }, + "truncated": 0, + "non-truncated": 6136, + "padded": 6136, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_medicine|5": { + "hashes": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "40a933f829116f8d", + "hash_cont_tokens": "fe35cfa9c6ca802e" + }, + "truncated": 0, + "non-truncated": 1088, + "padded": 1088, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_psychology|5": { + "hashes": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "0dfb73a8eb3f692c", + "hash_cont_tokens": "f020fbddf72c8652" + }, + "truncated": 0, + "non-truncated": 2448, + "padded": 2448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-public_relations|5": { + "hashes": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "1710c6ba4c9f3cbd", + "hash_cont_tokens": "568f585a259965c1" + }, + "truncated": 0, + "non-truncated": 440, + "padded": 440, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-security_studies|5": { + "hashes": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "32a03f1f22a6e103", + "hash_cont_tokens": "cc6fd7cccd64cd5d" + }, + "truncated": 0, + "non-truncated": 980, + "padded": 980, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-sociology|5": { + "hashes": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "828999f7624cbe7e", + "hash_cont_tokens": "c3a3bdfd177eed5b" + }, + "truncated": 0, + "non-truncated": 804, + "padded": 804, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hashes": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "42054621e718dbee", + "hash_cont_tokens": "2568d0e8e36fa959" + }, + 
"truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-virology|5": { + "hashes": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "6c4f0aa4dc859c04", + "hash_cont_tokens": "926cf60b0891f374" + }, + "truncated": 0, + "non-truncated": 664, + "padded": 664, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-world_religions|5": { + "hashes": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "6c75d44e092ff24f", + "hash_cont_tokens": "c525a5de974c1ea3" + }, + "truncated": 0, + "non-truncated": 684, + "padded": 684, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|truthfulqa:mc|0": { + "hashes": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "2738d7ed7075faa7", + "hash_cont_tokens": "c014154380b74b9e" + }, + "truncated": 0, + "non-truncated": 9996, + "padded": 9996, + "non-padded": 0, + "effective_few_shots": 0.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "d84d18e9a963753d", + "hash_full_prompts": "12b540783521a8e6", + "hash_input_tokens": "5c73a7dce6ccf737", + "hash_cont_tokens": "fb1646e2bdd5fc38" + }, + "total_evaluation_time_secondes": "2689.15381360054", + "truncated": 0, + "non-truncated": 111019, + "padded": 110926, + "non-padded": 93, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/Devio/test-3b/results_2023-10-14T16-45-10.101567.json b/eval-results/Devio/test-3b/results_2023-10-14T16-45-10.101567.json new file mode 100644 index 0000000000000000000000000000000000000000..8c0ce3e2165a7e14525beef78056d2ede05567b8 --- /dev/null +++ b/eval-results/Devio/test-3b/results_2023-10-14T16-45-10.101567.json @@ -0,0 +1,107 @@ +{ + "config_general": { + "model_name": "Devio/test-3b", + "model_sha": "b81c038ee2fa2addd285acde08b1a7ca3cb2854d", + "model_size": "6.55 GB", + "model_dtype": "torch.bfloat16", + "lighteval_sha": "0f318ecf002208468154899217b3ba7c6ae09374", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|drop|3": { + "em": 0.006816275167785235, + "em_stderr": 0.000842612709585923, + "f1": 0.0460748741610739, + "f1_stderr": 0.001322491101848216 + }, + "harness|gsm8k|5": { + "acc": 0.003032600454890068, + "acc_stderr": 0.0015145735612245457 + }, + "harness|winogrande|5": { + "acc": 0.5548539857932123, + "acc_stderr": 0.013967662954355491 + }, + "all": { + "em": 0.006816275167785235, + "em_stderr": 0.000842612709585923, + "f1": 0.0460748741610739, + "f1_stderr": 0.001322491101848216, + "acc": 0.2789432931240512, + "acc_stderr": 0.0077411182577900185 + } + }, + "versions": { + "harness|drop|3": 1, + "harness|gsm8k|5": 0, + "harness|winogrande|5": 0, + "all": 0 + }, + "config_tasks": { + "harness|drop": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|drop|3": { + "hashes": { + "hash_examples": "1d27416e8324e9a3", + "hash_full_prompts": "a5513ff9a741b385", + "hash_input_tokens": "42076f0efbb50aa6", + "hash_cont_tokens": "d1b3a0b73d72ab84" + }, + "truncated": 3, + "non-truncated": 9533, + "padded": 0, + "non-padded": 9536, + 
"effective_few_shots": 3.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "bda342e47b5099b2", + "hash_cont_tokens": "f224e40e57fe381a" + }, + "truncated": 0, + "non-truncated": 1319, + "padded": 0, + "non-padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "c0bedf98cb040854", + "hash_cont_tokens": "f08975ad6f2d5864" + }, + "truncated": 0, + "non-truncated": 2534, + "padded": 2432, + "non-padded": 102, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "9b4d8993161e637d", + "hash_full_prompts": "08215e527b7e60a5", + "hash_input_tokens": "a12f3e3c934bd78b", + "hash_cont_tokens": "fdd269dd3e659457" + }, + "total_evaluation_time_secondes": "12406.458235025406", + "truncated": 3, + "non-truncated": 13386, + "padded": 2432, + "non-padded": 10957, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/Devio/test-9k-fn/results_2023-10-04T07-45-21.870360.json b/eval-results/Devio/test-9k-fn/results_2023-10-04T07-45-21.870360.json new file mode 100644 index 0000000000000000000000000000000000000000..06b79a474eb553541e7a60bad6ae5a7137ea02b4 --- /dev/null +++ b/eval-results/Devio/test-9k-fn/results_2023-10-04T07-45-21.870360.json @@ -0,0 +1,1367 @@ +{ + "config_general": { + "model_name": "Devio/test-9k-fn", + "model_sha": "b2fc754748ee94428298de3528e549b296d51c1e", + "model_size": "40.7 GB", + "model_dtype": "torch.bfloat16", + "lighteval_sha": "0f318ecf002208468154899217b3ba7c6ae09374", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|arc:challenge|25": { + "acc": 0.35665529010238906, + "acc_stderr": 0.013998056902620192, + "acc_norm": 0.4087030716723549, + "acc_norm_stderr": 0.014365750345427008 + }, + "harness|hellaswag|10": { + "acc": 0.5057757418840868, + "acc_stderr": 0.004989448490164429, + "acc_norm": 0.6944831706831308, + "acc_norm_stderr": 0.004596845936356623 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.23, + "acc_stderr": 0.04229525846816506, + "acc_norm": 0.23, + "acc_norm_stderr": 0.04229525846816506 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.28888888888888886, + "acc_stderr": 0.0391545063041425, + "acc_norm": 0.28888888888888886, + "acc_norm_stderr": 0.0391545063041425 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.3092105263157895, + "acc_stderr": 0.037610708698674805, + "acc_norm": 0.3092105263157895, + "acc_norm_stderr": 0.037610708698674805 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.34, + "acc_stderr": 0.04760952285695235, + "acc_norm": 0.34, + "acc_norm_stderr": 0.04760952285695235 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.3018867924528302, + "acc_stderr": 0.028254200344438655, + "acc_norm": 0.3018867924528302, + "acc_norm_stderr": 0.028254200344438655 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.2986111111111111, + "acc_stderr": 0.03827052357950756, + "acc_norm": 0.2986111111111111, + "acc_norm_stderr": 0.03827052357950756 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.32, + "acc_stderr": 0.046882617226215034, + "acc_norm": 0.32, + 
"acc_norm_stderr": 0.046882617226215034 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.32, + "acc_stderr": 0.046882617226215034, + "acc_norm": 0.32, + "acc_norm_stderr": 0.046882617226215034 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.27, + "acc_stderr": 0.044619604333847394, + "acc_norm": 0.27, + "acc_norm_stderr": 0.044619604333847394 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.2774566473988439, + "acc_stderr": 0.03414014007044037, + "acc_norm": 0.2774566473988439, + "acc_norm_stderr": 0.03414014007044037 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.19607843137254902, + "acc_stderr": 0.03950581861179963, + "acc_norm": 0.19607843137254902, + "acc_norm_stderr": 0.03950581861179963 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.31, + "acc_stderr": 0.04648231987117316, + "acc_norm": 0.31, + "acc_norm_stderr": 0.04648231987117316 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.32340425531914896, + "acc_stderr": 0.030579442773610337, + "acc_norm": 0.32340425531914896, + "acc_norm_stderr": 0.030579442773610337 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.20175438596491227, + "acc_stderr": 0.037752050135836386, + "acc_norm": 0.20175438596491227, + "acc_norm_stderr": 0.037752050135836386 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.3103448275862069, + "acc_stderr": 0.03855289616378947, + "acc_norm": 0.3103448275862069, + "acc_norm_stderr": 0.03855289616378947 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.22486772486772486, + "acc_stderr": 0.02150209607822914, + "acc_norm": 0.22486772486772486, + "acc_norm_stderr": 0.02150209607822914 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.3253968253968254, + "acc_stderr": 0.041905964388711366, + "acc_norm": 0.3253968253968254, + "acc_norm_stderr": 0.041905964388711366 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.32, + "acc_stderr": 0.04688261722621504, + "acc_norm": 0.32, + "acc_norm_stderr": 0.04688261722621504 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.2709677419354839, + "acc_stderr": 0.025284416114900156, + "acc_norm": 0.2709677419354839, + "acc_norm_stderr": 0.025284416114900156 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.18719211822660098, + "acc_stderr": 0.027444924966882618, + "acc_norm": 0.18719211822660098, + "acc_norm_stderr": 0.027444924966882618 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.33, + "acc_stderr": 0.04725815626252605, + "acc_norm": 0.33, + "acc_norm_stderr": 0.04725815626252605 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.21212121212121213, + "acc_stderr": 0.03192271569548299, + "acc_norm": 0.21212121212121213, + "acc_norm_stderr": 0.03192271569548299 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.3181818181818182, + "acc_stderr": 0.03318477333845331, + "acc_norm": 0.3181818181818182, + "acc_norm_stderr": 0.03318477333845331 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.35751295336787564, + "acc_stderr": 0.03458816042181005, + "acc_norm": 0.35751295336787564, + "acc_norm_stderr": 0.03458816042181005 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.3717948717948718, + "acc_stderr": 0.024503472557110936, + "acc_norm": 0.3717948717948718, + "acc_norm_stderr": 0.024503472557110936 + }, + 
"harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.28888888888888886, + "acc_stderr": 0.027634907264178544, + "acc_norm": 0.28888888888888886, + "acc_norm_stderr": 0.027634907264178544 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.2815126050420168, + "acc_stderr": 0.029213549414372163, + "acc_norm": 0.2815126050420168, + "acc_norm_stderr": 0.029213549414372163 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.2781456953642384, + "acc_stderr": 0.03658603262763743, + "acc_norm": 0.2781456953642384, + "acc_norm_stderr": 0.03658603262763743 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.3211009174311927, + "acc_stderr": 0.020018149772733747, + "acc_norm": 0.3211009174311927, + "acc_norm_stderr": 0.020018149772733747 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.3611111111111111, + "acc_stderr": 0.032757734861009996, + "acc_norm": 0.3611111111111111, + "acc_norm_stderr": 0.032757734861009996 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.2647058823529412, + "acc_stderr": 0.030964517926923403, + "acc_norm": 0.2647058823529412, + "acc_norm_stderr": 0.030964517926923403 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.3333333333333333, + "acc_stderr": 0.030685820596610795, + "acc_norm": 0.3333333333333333, + "acc_norm_stderr": 0.030685820596610795 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.34080717488789236, + "acc_stderr": 0.031811497470553604, + "acc_norm": 0.34080717488789236, + "acc_norm_stderr": 0.031811497470553604 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.3282442748091603, + "acc_stderr": 0.04118438565806298, + "acc_norm": 0.3282442748091603, + "acc_norm_stderr": 0.04118438565806298 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.2644628099173554, + "acc_stderr": 0.04026187527591205, + "acc_norm": 0.2644628099173554, + "acc_norm_stderr": 0.04026187527591205 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.25925925925925924, + "acc_stderr": 0.042365112580946336, + "acc_norm": 0.25925925925925924, + "acc_norm_stderr": 0.042365112580946336 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.25153374233128833, + "acc_stderr": 0.034089978868575295, + "acc_norm": 0.25153374233128833, + "acc_norm_stderr": 0.034089978868575295 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.36607142857142855, + "acc_stderr": 0.04572372358737431, + "acc_norm": 0.36607142857142855, + "acc_norm_stderr": 0.04572372358737431 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.23300970873786409, + "acc_stderr": 0.041858325989283136, + "acc_norm": 0.23300970873786409, + "acc_norm_stderr": 0.041858325989283136 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.3974358974358974, + "acc_stderr": 0.032059534537892925, + "acc_norm": 0.3974358974358974, + "acc_norm_stderr": 0.032059534537892925 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.3, + "acc_stderr": 0.046056618647183814, + "acc_norm": 0.3, + "acc_norm_stderr": 0.046056618647183814 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.2554278416347382, + "acc_stderr": 0.015594955384455772, + "acc_norm": 0.2554278416347382, + "acc_norm_stderr": 0.015594955384455772 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.2774566473988439, + "acc_stderr": 0.024105712607754307, + "acc_norm": 0.2774566473988439, + "acc_norm_stderr": 0.024105712607754307 + }, + 
"harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.23798882681564246, + "acc_stderr": 0.014242630070574877, + "acc_norm": 0.23798882681564246, + "acc_norm_stderr": 0.014242630070574877 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.39215686274509803, + "acc_stderr": 0.02795604616542451, + "acc_norm": 0.39215686274509803, + "acc_norm_stderr": 0.02795604616542451 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.2604501607717042, + "acc_stderr": 0.02492672322484555, + "acc_norm": 0.2604501607717042, + "acc_norm_stderr": 0.02492672322484555 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.25925925925925924, + "acc_stderr": 0.02438366553103545, + "acc_norm": 0.25925925925925924, + "acc_norm_stderr": 0.02438366553103545 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.2801418439716312, + "acc_stderr": 0.026789172351140242, + "acc_norm": 0.2801418439716312, + "acc_norm_stderr": 0.026789172351140242 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.2666232073011734, + "acc_stderr": 0.011293836031612131, + "acc_norm": 0.2666232073011734, + "acc_norm_stderr": 0.011293836031612131 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.31985294117647056, + "acc_stderr": 0.02833295951403124, + "acc_norm": 0.31985294117647056, + "acc_norm_stderr": 0.02833295951403124 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.2549019607843137, + "acc_stderr": 0.017630827375148383, + "acc_norm": 0.2549019607843137, + "acc_norm_stderr": 0.017630827375148383 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.2818181818181818, + "acc_stderr": 0.04309118709946458, + "acc_norm": 0.2818181818181818, + "acc_norm_stderr": 0.04309118709946458 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.3306122448979592, + "acc_stderr": 0.030116426296540592, + "acc_norm": 0.3306122448979592, + "acc_norm_stderr": 0.030116426296540592 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.31840796019900497, + "acc_stderr": 0.03294118479054096, + "acc_norm": 0.31840796019900497, + "acc_norm_stderr": 0.03294118479054096 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.35, + "acc_stderr": 0.0479372485441102, + "acc_norm": 0.35, + "acc_norm_stderr": 0.0479372485441102 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.30120481927710846, + "acc_stderr": 0.0357160923005348, + "acc_norm": 0.30120481927710846, + "acc_norm_stderr": 0.0357160923005348 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.32748538011695905, + "acc_stderr": 0.035993357714560276, + "acc_norm": 0.32748538011695905, + "acc_norm_stderr": 0.035993357714560276 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.23378212974296206, + "mc1_stderr": 0.014816195991931583, + "mc2": 0.3914546223201993, + "mc2_stderr": 0.013969580332280395 + }, + "all": { + "acc": 0.29934767850730726, + "acc_stderr": 0.033158996935405735, + "acc_norm": 0.3034282752932227, + "acc_norm_stderr": 0.03315857474708369, + "mc1": 0.23378212974296206, + "mc1_stderr": 0.014816195991931583, + "mc2": 0.3914546223201993, + "mc2_stderr": 0.013969580332280395 + } + }, + "versions": { + "harness|arc:challenge|25": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + 
"harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "all": 0 + }, + "config_tasks": { + "harness|arc:challenge": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness 
task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task" + }, + "summary_tasks": { + "harness|arc:challenge|25": { + "hashes": { + "hash_examples": "17b0cae357c0259e", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "61571bf68d6d89aa", + "hash_cont_tokens": "e8abf848493b50f7" + }, + "truncated": 0, + "non-truncated": 4687, + "padded": 4687, + "non-padded": 0, + "effective_few_shots": 25.0, + "num_truncated_few_shots": 0 + }, + "harness|hellaswag|10": { + "hashes": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "29906669b1c7054a", + "hash_cont_tokens": "9fe0a5c42e1532db" + }, + "truncated": 0, + "non-truncated": 40168, + "padded": 
40113, + "non-padded": 55, + "effective_few_shots": 10.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hashes": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "c54ff61ad0273dd7", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-anatomy|5": { + "hashes": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "be31a1e22aef5f90", + "hash_cont_tokens": "f11971a765cb609f" + }, + "truncated": 0, + "non-truncated": 540, + "padded": 540, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-astronomy|5": { + "hashes": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "277a7b1fad566940", + "hash_cont_tokens": "440a970fadecdc7b" + }, + "truncated": 0, + "non-truncated": 608, + "padded": 608, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-business_ethics|5": { + "hashes": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "ba552605bc116de5", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hashes": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "428c7563d0b98ab9", + "hash_cont_tokens": "7ecd60c25b9bfe5b" + }, + "truncated": 0, + "non-truncated": 1060, + "padded": 1060, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_biology|5": { + "hashes": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "da036601573942e2", + "hash_cont_tokens": "875cde3af7a0ee14" + }, + "truncated": 0, + "non-truncated": 576, + "padded": 576, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_chemistry|5": { + "hashes": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "94e0196d6aded13d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_computer_science|5": { + "hashes": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "6e4d0f4a8d36690b", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_mathematics|5": { + "hashes": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "614054d17109a25d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_medicine|5": { + "hashes": { + 
"hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "1d633b3cc0524ba8", + "hash_cont_tokens": "702fb6d82ff0d6ac" + }, + "truncated": 0, + "non-truncated": 692, + "padded": 692, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_physics|5": { + "hashes": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "5421d9a1af86cbd4", + "hash_cont_tokens": "f7b8097afc16a47c" + }, + "truncated": 0, + "non-truncated": 408, + "padded": 408, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-computer_security|5": { + "hashes": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "5e6b70ecb333cf18", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hashes": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "c2ef11a87264ceed", + "hash_cont_tokens": "aa0e8bc655f2f641" + }, + "truncated": 0, + "non-truncated": 940, + "padded": 940, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-econometrics|5": { + "hashes": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "ecaccd912a4c3978", + "hash_cont_tokens": "b1cc6e7e9fcd3827" + }, + "truncated": 0, + "non-truncated": 456, + "padded": 456, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hashes": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "1590c84291399be8", + "hash_cont_tokens": "2425a3f084a591ef" + }, + "truncated": 0, + "non-truncated": 580, + "padded": 580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hashes": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "3269597f715b0da1", + "hash_cont_tokens": "bd87bf0c060fd925" + }, + "truncated": 0, + "non-truncated": 1512, + "padded": 1512, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-formal_logic|5": { + "hashes": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "a2800d20f3ab8d7c", + "hash_cont_tokens": "eb8932890e0605db" + }, + "truncated": 0, + "non-truncated": 504, + "padded": 504, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-global_facts|5": { + "hashes": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "94ed44b3772505ad", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_biology|5": { + "hashes": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "24423acb928db768", + "hash_cont_tokens": 
"1ddcb86d28cde266" + }, + "truncated": 0, + "non-truncated": 1240, + "padded": 1240, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hashes": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "831ff35c474e5cef", + "hash_cont_tokens": "176c8dcff38c5f8f" + }, + "truncated": 0, + "non-truncated": 812, + "padded": 812, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hashes": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "8c34e0f2bda77358", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hashes": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "f1f73dd687da18d7", + "hash_cont_tokens": "674fc454bdc5ac93" + }, + "truncated": 660, + "non-truncated": 0, + "padded": 0, + "non-padded": 660, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_geography|5": { + "hashes": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "7c5547c7da5bc793", + "hash_cont_tokens": "03a5012b916274ea" + }, + "truncated": 0, + "non-truncated": 792, + "padded": 792, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hashes": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "f62991cb6a496b05", + "hash_cont_tokens": "873d2aab226ba1d8" + }, + "truncated": 0, + "non-truncated": 772, + "padded": 772, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hashes": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "4cef2aff6e3d59ed", + "hash_cont_tokens": "c583432ad27fcfe0" + }, + "truncated": 0, + "non-truncated": 1560, + "padded": 1560, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hashes": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "6e2577ea4082ed2b", + "hash_cont_tokens": "d7907b61bcb8c123" + }, + "truncated": 0, + "non-truncated": 1080, + "padded": 1080, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hashes": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "c5fc9aeb1079c8e4", + "hash_cont_tokens": "f47f041de50333b9" + }, + "truncated": 0, + "non-truncated": 952, + "padded": 952, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_physics|5": { + "hashes": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "555fc385cffa84ca", + "hash_cont_tokens": "0d56317b3e5eedb5" + }, + "truncated": 0, + "non-truncated": 604, + 
"padded": 604, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hashes": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "febd23cbf9973b7f", + "hash_cont_tokens": "09ba1243e7390c0f" + }, + "truncated": 0, + "non-truncated": 2180, + "padded": 2180, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hashes": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "424b02981230ee83", + "hash_cont_tokens": "9cc29889c3d3f77d" + }, + "truncated": 0, + "non-truncated": 864, + "padded": 864, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hashes": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "50c9ff438c85a69e", + "hash_cont_tokens": "cdd0b3dc06d933e5" + }, + "truncated": 816, + "non-truncated": 0, + "padded": 0, + "non-padded": 816, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hashes": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "054824cc474caef5", + "hash_cont_tokens": "e02816433ff28daf" + }, + "truncated": 8, + "non-truncated": 940, + "padded": 940, + "non-padded": 8, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_aging|5": { + "hashes": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "541a75f071dcf579", + "hash_cont_tokens": "142a4a8a1138a214" + }, + "truncated": 0, + "non-truncated": 892, + "padded": 892, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_sexuality|5": { + "hashes": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "04269e5c5a257dd9", + "hash_cont_tokens": "bc54813e809b796d" + }, + "truncated": 0, + "non-truncated": 524, + "padded": 524, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-international_law|5": { + "hashes": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "d93ba9d9d38e4397", + "hash_cont_tokens": "8ea8c5ff76a15bca" + }, + "truncated": 0, + "non-truncated": 484, + "padded": 484, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-jurisprudence|5": { + "hashes": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "9eeaccd2698b4f5a", + "hash_cont_tokens": "e3a8cd951b6e3469" + }, + "truncated": 0, + "non-truncated": 432, + "padded": 432, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hashes": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "b4f08f544f2b7576", + "hash_cont_tokens": "3e9e0bdc248fd88a" + }, + "truncated": 0, + "non-truncated": 652, + "padded": 648, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + 
"harness|hendrycksTest-machine_learning|5": { + "hashes": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "900c2a51f1174b9f", + "hash_cont_tokens": "55b12fb138c6a064" + }, + "truncated": 0, + "non-truncated": 448, + "padded": 448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-management|5": { + "hashes": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "6b36efb4689c6eca", + "hash_cont_tokens": "a01d6d39a83c4597" + }, + "truncated": 0, + "non-truncated": 412, + "padded": 412, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-marketing|5": { + "hashes": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "2aaac78a0cfed47a", + "hash_cont_tokens": "6aeaed4d823c98aa" + }, + "truncated": 0, + "non-truncated": 936, + "padded": 936, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-medical_genetics|5": { + "hashes": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "886ca823b41c094a", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-miscellaneous|5": { + "hashes": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "72fd71de7675e7d0", + "hash_cont_tokens": "9b0ab02a64603081" + }, + "truncated": 0, + "non-truncated": 3132, + "padded": 3132, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_disputes|5": { + "hashes": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "f3ca0dd8e7a1eb09", + "hash_cont_tokens": "3b8bbe9108e55ce9" + }, + "truncated": 0, + "non-truncated": 1384, + "padded": 1354, + "non-padded": 30, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hashes": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "3e793631e951f23c", + "hash_cont_tokens": "3e9bfc0362e97330" + }, + "truncated": 0, + "non-truncated": 3580, + "padded": 3580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-nutrition|5": { + "hashes": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "59753c2144ea93af", + "hash_cont_tokens": "23b2dc6ee2da4cfc" + }, + "truncated": 0, + "non-truncated": 1224, + "padded": 1224, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-philosophy|5": { + "hashes": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "bd8d3dbed15a8c34", + "hash_cont_tokens": "9f6ff69d23a48783" + }, + "truncated": 0, + "non-truncated": 1244, + "padded": 1244, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-prehistory|5": { + "hashes": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "3573cd87facbb7c5", + 
"hash_cont_tokens": "d6458d743d875837" + }, + "truncated": 0, + "non-truncated": 1296, + "padded": 1296, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_accounting|5": { + "hashes": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "17e721bc1a7cbb47", + "hash_cont_tokens": "922a195f53a35662" + }, + "truncated": 0, + "non-truncated": 1128, + "padded": 1128, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_law|5": { + "hashes": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "9178e10bd0763ec4", + "hash_cont_tokens": "2e590029ef41fbcd" + }, + "truncated": 604, + "non-truncated": 5532, + "padded": 5524, + "non-padded": 612, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_medicine|5": { + "hashes": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "f5a22012a54f70ea", + "hash_cont_tokens": "7cfee54dbddd5a98" + }, + "truncated": 0, + "non-truncated": 1088, + "padded": 1088, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_psychology|5": { + "hashes": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "0dfb73a8eb3f692c", + "hash_cont_tokens": "a86677b2a45c20e1" + }, + "truncated": 0, + "non-truncated": 2448, + "padded": 2448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-public_relations|5": { + "hashes": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "1710c6ba4c9f3cbd", + "hash_cont_tokens": "0d756ccaae031757" + }, + "truncated": 0, + "non-truncated": 440, + "padded": 440, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-security_studies|5": { + "hashes": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "d49711415961ced7", + "hash_cont_tokens": "b2229bc2cfbf594b" + }, + "truncated": 0, + "non-truncated": 980, + "padded": 980, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-sociology|5": { + "hashes": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "828999f7624cbe7e", + "hash_cont_tokens": "c3a3bdfd177eed5b" + }, + "truncated": 0, + "non-truncated": 804, + "padded": 804, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hashes": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "42054621e718dbee", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-virology|5": { + "hashes": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "6c4f0aa4dc859c04", + "hash_cont_tokens": "af8b3658088cb37f" + }, + "truncated": 0, + "non-truncated": 664, + "padded": 664, + "non-padded": 0, + 
"effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-world_religions|5": { + "hashes": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "6c75d44e092ff24f", + "hash_cont_tokens": "060118bef6de4e0a" + }, + "truncated": 0, + "non-truncated": 684, + "padded": 684, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|truthfulqa:mc|0": { + "hashes": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "2738d7ed7075faa7", + "hash_cont_tokens": "f5da56a132aab151" + }, + "truncated": 0, + "non-truncated": 9996, + "padded": 9996, + "non-padded": 0, + "effective_few_shots": 0.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "d84d18e9a963753d", + "hash_full_prompts": "12b540783521a8e6", + "hash_input_tokens": "6fecf578c508db6a", + "hash_cont_tokens": "71d56183130fecbd" + }, + "total_evaluation_time_secondes": "6220.008844137192", + "truncated": 2088, + "non-truncated": 108931, + "padded": 108834, + "non-padded": 2185, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/Devio/test100/results_2023-09-02T17-29-14.649417.json b/eval-results/Devio/test100/results_2023-09-02T17-29-14.649417.json new file mode 100644 index 0000000000000000000000000000000000000000..084b6b4dbfa19f341abb062ff41d0444b8a960cc --- /dev/null +++ b/eval-results/Devio/test100/results_2023-09-02T17-29-14.649417.json @@ -0,0 +1,1366 @@ +{ + "config_general": { + "model_name": "Devio/test100", + "model_sha": "6bd139260f60232328b05b2cd973c3d8f07c0c02", + "model_dtype": "torch.bfloat16", + "lighteval_sha": "4b9a38c2102259daeb17d1d0294fc2b4ce7f4e63", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|arc:challenge|25": { + "acc": 0.3370307167235495, + "acc_stderr": 0.013813476652902272, + "acc_norm": 0.37372013651877134, + "acc_norm_stderr": 0.014137708601759098 + }, + "harness|hellaswag|10": { + "acc": 0.4312885879306911, + "acc_stderr": 0.004942440746328494, + "acc_norm": 0.5854411471818363, + "acc_norm_stderr": 0.0049163889621423205 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.25, + "acc_stderr": 0.04351941398892446, + "acc_norm": 0.25, + "acc_norm_stderr": 0.04351941398892446 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.22962962962962963, + "acc_stderr": 0.03633384414073461, + "acc_norm": 0.22962962962962963, + "acc_norm_stderr": 0.03633384414073461 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.3355263157894737, + "acc_stderr": 0.03842498559395268, + "acc_norm": 0.3355263157894737, + "acc_norm_stderr": 0.03842498559395268 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.21, + "acc_stderr": 0.040936018074033256, + "acc_norm": 0.21, + "acc_norm_stderr": 0.040936018074033256 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.2943396226415094, + "acc_stderr": 0.028049186315695248, + "acc_norm": 0.2943396226415094, + "acc_norm_stderr": 0.028049186315695248 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.24305555555555555, + "acc_stderr": 0.03586879280080341, + "acc_norm": 0.24305555555555555, + "acc_norm_stderr": 0.03586879280080341 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.42, + "acc_stderr": 0.049604496374885836, + "acc_norm": 0.42, + "acc_norm_stderr": 
0.049604496374885836 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.34, + "acc_stderr": 0.04760952285695235, + "acc_norm": 0.34, + "acc_norm_stderr": 0.04760952285695235 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.3, + "acc_stderr": 0.046056618647183814, + "acc_norm": 0.3, + "acc_norm_stderr": 0.046056618647183814 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.32947976878612717, + "acc_stderr": 0.03583901754736411, + "acc_norm": 0.32947976878612717, + "acc_norm_stderr": 0.03583901754736411 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.37254901960784315, + "acc_stderr": 0.04810840148082633, + "acc_norm": 0.37254901960784315, + "acc_norm_stderr": 0.04810840148082633 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.18, + "acc_stderr": 0.038612291966536955, + "acc_norm": 0.18, + "acc_norm_stderr": 0.038612291966536955 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.2170212765957447, + "acc_stderr": 0.026947483121496217, + "acc_norm": 0.2170212765957447, + "acc_norm_stderr": 0.026947483121496217 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.23684210526315788, + "acc_stderr": 0.039994238792813344, + "acc_norm": 0.23684210526315788, + "acc_norm_stderr": 0.039994238792813344 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.2413793103448276, + "acc_stderr": 0.03565998174135302, + "acc_norm": 0.2413793103448276, + "acc_norm_stderr": 0.03565998174135302 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.24603174603174602, + "acc_stderr": 0.022182037202948368, + "acc_norm": 0.24603174603174602, + "acc_norm_stderr": 0.022182037202948368 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.36507936507936506, + "acc_stderr": 0.04306241259127153, + "acc_norm": 0.36507936507936506, + "acc_norm_stderr": 0.04306241259127153 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.25, + "acc_stderr": 0.04351941398892446, + "acc_norm": 0.25, + "acc_norm_stderr": 0.04351941398892446 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.3225806451612903, + "acc_stderr": 0.02659308451657228, + "acc_norm": 0.3225806451612903, + "acc_norm_stderr": 0.02659308451657228 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.28078817733990147, + "acc_stderr": 0.03161856335358609, + "acc_norm": 0.28078817733990147, + "acc_norm_stderr": 0.03161856335358609 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.18, + "acc_stderr": 0.03861229196653694, + "acc_norm": 0.18, + "acc_norm_stderr": 0.03861229196653694 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.2545454545454545, + "acc_stderr": 0.03401506715249039, + "acc_norm": 0.2545454545454545, + "acc_norm_stderr": 0.03401506715249039 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.35353535353535354, + "acc_stderr": 0.03406086723547153, + "acc_norm": 0.35353535353535354, + "acc_norm_stderr": 0.03406086723547153 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.36787564766839376, + "acc_stderr": 0.03480175668466036, + "acc_norm": 0.36787564766839376, + "acc_norm_stderr": 0.03480175668466036 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.3641025641025641, + "acc_stderr": 0.02439667298509477, + "acc_norm": 0.3641025641025641, + "acc_norm_stderr": 0.02439667298509477 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 
0.25925925925925924, + "acc_stderr": 0.02671924078371216, + "acc_norm": 0.25925925925925924, + "acc_norm_stderr": 0.02671924078371216 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.3487394957983193, + "acc_stderr": 0.03095663632856655, + "acc_norm": 0.3487394957983193, + "acc_norm_stderr": 0.03095663632856655 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.33112582781456956, + "acc_stderr": 0.038425817186598696, + "acc_norm": 0.33112582781456956, + "acc_norm_stderr": 0.038425817186598696 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.3522935779816514, + "acc_stderr": 0.020480568843998997, + "acc_norm": 0.3522935779816514, + "acc_norm_stderr": 0.020480568843998997 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.4722222222222222, + "acc_stderr": 0.0340470532865388, + "acc_norm": 0.4722222222222222, + "acc_norm_stderr": 0.0340470532865388 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.2549019607843137, + "acc_stderr": 0.030587591351604246, + "acc_norm": 0.2549019607843137, + "acc_norm_stderr": 0.030587591351604246 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.20253164556962025, + "acc_stderr": 0.026160568246601457, + "acc_norm": 0.20253164556962025, + "acc_norm_stderr": 0.026160568246601457 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.10762331838565023, + "acc_stderr": 0.020799400082879997, + "acc_norm": 0.10762331838565023, + "acc_norm_stderr": 0.020799400082879997 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.2824427480916031, + "acc_stderr": 0.03948406125768361, + "acc_norm": 0.2824427480916031, + "acc_norm_stderr": 0.03948406125768361 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.18181818181818182, + "acc_stderr": 0.035208939510976554, + "acc_norm": 0.18181818181818182, + "acc_norm_stderr": 0.035208939510976554 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.21296296296296297, + "acc_stderr": 0.0395783547198098, + "acc_norm": 0.21296296296296297, + "acc_norm_stderr": 0.0395783547198098 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.2331288343558282, + "acc_stderr": 0.033220157957767414, + "acc_norm": 0.2331288343558282, + "acc_norm_stderr": 0.033220157957767414 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.16071428571428573, + "acc_stderr": 0.03485946096475741, + "acc_norm": 0.16071428571428573, + "acc_norm_stderr": 0.03485946096475741 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.3786407766990291, + "acc_stderr": 0.04802694698258972, + "acc_norm": 0.3786407766990291, + "acc_norm_stderr": 0.04802694698258972 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.19658119658119658, + "acc_stderr": 0.02603538609895129, + "acc_norm": 0.19658119658119658, + "acc_norm_stderr": 0.02603538609895129 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.24, + "acc_stderr": 0.04292346959909281, + "acc_norm": 0.24, + "acc_norm_stderr": 0.04292346959909281 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.20434227330779056, + "acc_stderr": 0.0144191239809319, + "acc_norm": 0.20434227330779056, + "acc_norm_stderr": 0.0144191239809319 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.2138728323699422, + "acc_stderr": 0.022075709251757183, + "acc_norm": 0.2138728323699422, + "acc_norm_stderr": 0.022075709251757183 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.27039106145251396, + "acc_stderr": 
0.014854993938010102, + "acc_norm": 0.27039106145251396, + "acc_norm_stderr": 0.014854993938010102 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.2973856209150327, + "acc_stderr": 0.02617390850671858, + "acc_norm": 0.2973856209150327, + "acc_norm_stderr": 0.02617390850671858 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.2540192926045016, + "acc_stderr": 0.024723861504771696, + "acc_norm": 0.2540192926045016, + "acc_norm_stderr": 0.024723861504771696 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.22530864197530864, + "acc_stderr": 0.023246202647819746, + "acc_norm": 0.22530864197530864, + "acc_norm_stderr": 0.023246202647819746 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.2375886524822695, + "acc_stderr": 0.025389512552729906, + "acc_norm": 0.2375886524822695, + "acc_norm_stderr": 0.025389512552729906 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.24771838331160365, + "acc_stderr": 0.011025499291443738, + "acc_norm": 0.24771838331160365, + "acc_norm_stderr": 0.011025499291443738 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.4485294117647059, + "acc_stderr": 0.030211479609121593, + "acc_norm": 0.4485294117647059, + "acc_norm_stderr": 0.030211479609121593 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.2173202614379085, + "acc_stderr": 0.01668482092914859, + "acc_norm": 0.2173202614379085, + "acc_norm_stderr": 0.01668482092914859 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.22727272727272727, + "acc_stderr": 0.04013964554072774, + "acc_norm": 0.22727272727272727, + "acc_norm_stderr": 0.04013964554072774 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.4, + "acc_stderr": 0.031362502409358936, + "acc_norm": 0.4, + "acc_norm_stderr": 0.031362502409358936 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.2885572139303483, + "acc_stderr": 0.03203841040213321, + "acc_norm": 0.2885572139303483, + "acc_norm_stderr": 0.03203841040213321 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.26, + "acc_stderr": 0.04408440022768078, + "acc_norm": 0.26, + "acc_norm_stderr": 0.04408440022768078 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.1927710843373494, + "acc_stderr": 0.030709824050565274, + "acc_norm": 0.1927710843373494, + "acc_norm_stderr": 0.030709824050565274 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.1695906432748538, + "acc_stderr": 0.028782108105401712, + "acc_norm": 0.1695906432748538, + "acc_norm_stderr": 0.028782108105401712 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.19706242350061198, + "mc1_stderr": 0.013925080734473736, + "mc2": 0.3401260823172781, + "mc2_stderr": 0.014194140794117406 + }, + "all": { + "acc": 0.2766497501153852, + "acc_stderr": 0.031976576858827, + "acc_norm": 0.2798843599297305, + "acc_norm_stderr": 0.031981630759923114, + "mc1": 0.19706242350061198, + "mc1_stderr": 0.013925080734473736, + "mc2": 0.3401260823172781, + "mc2_stderr": 0.014194140794117406 + } + }, + "versions": { + "harness|arc:challenge|25": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, 
+ "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "all": 0 + }, + "config_tasks": { + "harness|arc:challenge": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + 
"harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task" + }, + "summary_tasks": { + "harness|arc:challenge|25": { + "hashes": { + "hash_examples": "17b0cae357c0259e", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "61571bf68d6d89aa", + "hash_cont_tokens": "8210decc6ff6f7df" + }, + "truncated": 0, + "non-truncated": 4687, + "padded": 4687, + "non-padded": 0, + "effective_few_shots": 25.0, + "num_truncated_few_shots": 0 + }, + "harness|hellaswag|10": { + "hashes": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "29906669b1c7054a", + "hash_cont_tokens": "b3b9e9017afa63af" + }, + "truncated": 0, + "non-truncated": 40168, + "padded": 40113, + "non-padded": 55, + "effective_few_shots": 10.0, + "num_truncated_few_shots": 0 + }, + 
"harness|hendrycksTest-abstract_algebra|5": { + "hashes": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "c54ff61ad0273dd7", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-anatomy|5": { + "hashes": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "be31a1e22aef5f90", + "hash_cont_tokens": "f11971a765cb609f" + }, + "truncated": 0, + "non-truncated": 540, + "padded": 540, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-astronomy|5": { + "hashes": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "277a7b1fad566940", + "hash_cont_tokens": "bf30e5d3f48250cb" + }, + "truncated": 0, + "non-truncated": 608, + "padded": 608, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-business_ethics|5": { + "hashes": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "ba552605bc116de5", + "hash_cont_tokens": "bc1dd9b2d995eb61" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hashes": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "428c7563d0b98ab9", + "hash_cont_tokens": "890a119624b3b935" + }, + "truncated": 0, + "non-truncated": 1060, + "padded": 1060, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_biology|5": { + "hashes": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "da036601573942e2", + "hash_cont_tokens": "875cde3af7a0ee14" + }, + "truncated": 0, + "non-truncated": 576, + "padded": 576, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_chemistry|5": { + "hashes": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "94e0196d6aded13d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_computer_science|5": { + "hashes": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "6e4d0f4a8d36690b", + "hash_cont_tokens": "ffc0fe414cdc4a83" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_mathematics|5": { + "hashes": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "614054d17109a25d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_medicine|5": { + "hashes": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + 
"hash_input_tokens": "1d633b3cc0524ba8", + "hash_cont_tokens": "1f88b00d41957d82" + }, + "truncated": 0, + "non-truncated": 692, + "padded": 692, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_physics|5": { + "hashes": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "5421d9a1af86cbd4", + "hash_cont_tokens": "f7b8097afc16a47c" + }, + "truncated": 0, + "non-truncated": 408, + "padded": 408, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-computer_security|5": { + "hashes": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "5e6b70ecb333cf18", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hashes": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "c2ef11a87264ceed", + "hash_cont_tokens": "aa0e8bc655f2f641" + }, + "truncated": 0, + "non-truncated": 940, + "padded": 940, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-econometrics|5": { + "hashes": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "ecaccd912a4c3978", + "hash_cont_tokens": "bfb7e3c3c88313f1" + }, + "truncated": 0, + "non-truncated": 456, + "padded": 456, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hashes": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "1590c84291399be8", + "hash_cont_tokens": "2425a3f084a591ef" + }, + "truncated": 0, + "non-truncated": 580, + "padded": 580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hashes": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "3269597f715b0da1", + "hash_cont_tokens": "f52691aef15a407b" + }, + "truncated": 0, + "non-truncated": 1512, + "padded": 1512, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-formal_logic|5": { + "hashes": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "a2800d20f3ab8d7c", + "hash_cont_tokens": "f515d598d9c21263" + }, + "truncated": 0, + "non-truncated": 504, + "padded": 504, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-global_facts|5": { + "hashes": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "94ed44b3772505ad", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_biology|5": { + "hashes": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "24423acb928db768", + "hash_cont_tokens": "bd85a4156a3613ee" + }, + "truncated": 0, + "non-truncated": 1240, + "padded": 1240, + 
"non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hashes": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "831ff35c474e5cef", + "hash_cont_tokens": "a95c97af1c14e068" + }, + "truncated": 0, + "non-truncated": 812, + "padded": 812, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hashes": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "8c34e0f2bda77358", + "hash_cont_tokens": "8abfedef914e33c9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hashes": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "f1f73dd687da18d7", + "hash_cont_tokens": "674fc454bdc5ac93" + }, + "truncated": 660, + "non-truncated": 0, + "padded": 0, + "non-padded": 660, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_geography|5": { + "hashes": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "7c5547c7da5bc793", + "hash_cont_tokens": "03a5012b916274ea" + }, + "truncated": 0, + "non-truncated": 792, + "padded": 792, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hashes": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "f62991cb6a496b05", + "hash_cont_tokens": "a83effb8f76b7d7c" + }, + "truncated": 0, + "non-truncated": 772, + "padded": 772, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hashes": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "4cef2aff6e3d59ed", + "hash_cont_tokens": "c583432ad27fcfe0" + }, + "truncated": 0, + "non-truncated": 1560, + "padded": 1560, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hashes": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "6e2577ea4082ed2b", + "hash_cont_tokens": "24f5dc613660300b" + }, + "truncated": 0, + "non-truncated": 1080, + "padded": 1080, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hashes": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "c5fc9aeb1079c8e4", + "hash_cont_tokens": "f47f041de50333b9" + }, + "truncated": 0, + "non-truncated": 952, + "padded": 952, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_physics|5": { + "hashes": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "555fc385cffa84ca", + "hash_cont_tokens": "ba2efcd283e938cc" + }, + "truncated": 0, + "non-truncated": 604, + "padded": 604, + "non-padded": 0, + "effective_few_shots": 5.0, + 
"num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hashes": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "febd23cbf9973b7f", + "hash_cont_tokens": "942069cd363844d9" + }, + "truncated": 0, + "non-truncated": 2180, + "padded": 2180, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hashes": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "424b02981230ee83", + "hash_cont_tokens": "955ed42b6f7fa019" + }, + "truncated": 0, + "non-truncated": 864, + "padded": 864, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hashes": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "50c9ff438c85a69e", + "hash_cont_tokens": "cdd0b3dc06d933e5" + }, + "truncated": 816, + "non-truncated": 0, + "padded": 0, + "non-padded": 816, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hashes": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "054824cc474caef5", + "hash_cont_tokens": "9a864184946033ac" + }, + "truncated": 8, + "non-truncated": 940, + "padded": 940, + "non-padded": 8, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_aging|5": { + "hashes": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "541a75f071dcf579", + "hash_cont_tokens": "142a4a8a1138a214" + }, + "truncated": 0, + "non-truncated": 892, + "padded": 892, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_sexuality|5": { + "hashes": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "04269e5c5a257dd9", + "hash_cont_tokens": "bc54813e809b796d" + }, + "truncated": 0, + "non-truncated": 524, + "padded": 524, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-international_law|5": { + "hashes": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "d93ba9d9d38e4397", + "hash_cont_tokens": "dc45b45fcda18e5d" + }, + "truncated": 0, + "non-truncated": 484, + "padded": 484, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-jurisprudence|5": { + "hashes": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "9eeaccd2698b4f5a", + "hash_cont_tokens": "e3a8cd951b6e3469" + }, + "truncated": 0, + "non-truncated": 432, + "padded": 432, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hashes": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "b4f08f544f2b7576", + "hash_cont_tokens": "1e80dbd30f6453d5" + }, + "truncated": 0, + "non-truncated": 652, + "padded": 648, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-machine_learning|5": { + "hashes": { + "hash_examples": "88f22a636029ae47", + 
"hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "900c2a51f1174b9f", + "hash_cont_tokens": "9b37da7777378ca9" + }, + "truncated": 0, + "non-truncated": 448, + "padded": 448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-management|5": { + "hashes": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "6b36efb4689c6eca", + "hash_cont_tokens": "a01d6d39a83c4597" + }, + "truncated": 0, + "non-truncated": 412, + "padded": 412, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-marketing|5": { + "hashes": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "2aaac78a0cfed47a", + "hash_cont_tokens": "6aeaed4d823c98aa" + }, + "truncated": 0, + "non-truncated": 936, + "padded": 936, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-medical_genetics|5": { + "hashes": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "886ca823b41c094a", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-miscellaneous|5": { + "hashes": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "72fd71de7675e7d0", + "hash_cont_tokens": "9b0ab02a64603081" + }, + "truncated": 0, + "non-truncated": 3132, + "padded": 3132, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_disputes|5": { + "hashes": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "f3ca0dd8e7a1eb09", + "hash_cont_tokens": "8badf768f7b0467a" + }, + "truncated": 0, + "non-truncated": 1384, + "padded": 1354, + "non-padded": 30, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hashes": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "3e793631e951f23c", + "hash_cont_tokens": "32ae620376b2bbba" + }, + "truncated": 0, + "non-truncated": 3580, + "padded": 3580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-nutrition|5": { + "hashes": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "59753c2144ea93af", + "hash_cont_tokens": "3071def75bacc404" + }, + "truncated": 0, + "non-truncated": 1224, + "padded": 1224, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-philosophy|5": { + "hashes": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "bd8d3dbed15a8c34", + "hash_cont_tokens": "9f6ff69d23a48783" + }, + "truncated": 0, + "non-truncated": 1244, + "padded": 1244, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-prehistory|5": { + "hashes": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "3573cd87facbb7c5", + "hash_cont_tokens": "de469d2b981e32a3" + }, + "truncated": 0, + "non-truncated": 1296, + "padded": 
1296, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_accounting|5": { + "hashes": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "17e721bc1a7cbb47", + "hash_cont_tokens": "c46f74d2dfc7b13b" + }, + "truncated": 0, + "non-truncated": 1128, + "padded": 1128, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_law|5": { + "hashes": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "9178e10bd0763ec4", + "hash_cont_tokens": "2e590029ef41fbcd" + }, + "truncated": 604, + "non-truncated": 5532, + "padded": 5524, + "non-padded": 612, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_medicine|5": { + "hashes": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "f5a22012a54f70ea", + "hash_cont_tokens": "fe35cfa9c6ca802e" + }, + "truncated": 0, + "non-truncated": 1088, + "padded": 1088, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_psychology|5": { + "hashes": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "0dfb73a8eb3f692c", + "hash_cont_tokens": "f020fbddf72c8652" + }, + "truncated": 0, + "non-truncated": 2448, + "padded": 2448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-public_relations|5": { + "hashes": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "1710c6ba4c9f3cbd", + "hash_cont_tokens": "568f585a259965c1" + }, + "truncated": 0, + "non-truncated": 440, + "padded": 440, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-security_studies|5": { + "hashes": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "d49711415961ced7", + "hash_cont_tokens": "cc6fd7cccd64cd5d" + }, + "truncated": 0, + "non-truncated": 980, + "padded": 980, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-sociology|5": { + "hashes": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "828999f7624cbe7e", + "hash_cont_tokens": "c3a3bdfd177eed5b" + }, + "truncated": 0, + "non-truncated": 804, + "padded": 804, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hashes": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "42054621e718dbee", + "hash_cont_tokens": "2568d0e8e36fa959" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-virology|5": { + "hashes": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "6c4f0aa4dc859c04", + "hash_cont_tokens": "926cf60b0891f374" + }, + "truncated": 0, + "non-truncated": 664, + "padded": 664, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-world_religions|5": { + 
"hashes": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "6c75d44e092ff24f", + "hash_cont_tokens": "c525a5de974c1ea3" + }, + "truncated": 0, + "non-truncated": 684, + "padded": 684, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|truthfulqa:mc|0": { + "hashes": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "2738d7ed7075faa7", + "hash_cont_tokens": "c014154380b74b9e" + }, + "truncated": 0, + "non-truncated": 9996, + "padded": 9996, + "non-padded": 0, + "effective_few_shots": 0.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "d84d18e9a963753d", + "hash_full_prompts": "12b540783521a8e6", + "hash_input_tokens": "6fecf578c508db6a", + "hash_cont_tokens": "fb1646e2bdd5fc38" + }, + "total_evaluation_time_secondes": "6410.898904085159", + "truncated": 2088, + "non-truncated": 108931, + "padded": 108834, + "non-padded": 2185, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/Devio/testC/results_2023-09-02T17-27-16.860385.json b/eval-results/Devio/testC/results_2023-09-02T17-27-16.860385.json new file mode 100644 index 0000000000000000000000000000000000000000..936df5b0fc46edbaec31ac60b2f244dee66018f6 --- /dev/null +++ b/eval-results/Devio/testC/results_2023-09-02T17-27-16.860385.json @@ -0,0 +1,1366 @@ +{ + "config_general": { + "model_name": "Devio/testC", + "model_sha": "318159010931164dcacb5dc2a7a54d48990fb969", + "model_dtype": "torch.bfloat16", + "lighteval_sha": "4b9a38c2102259daeb17d1d0294fc2b4ce7f4e63", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|arc:challenge|25": { + "acc": 0.35494880546075086, + "acc_stderr": 0.013983036904094097, + "acc_norm": 0.39590443686006827, + "acc_norm_stderr": 0.014291228393536583 + }, + "harness|hellaswag|10": { + "acc": 0.4529974108743278, + "acc_stderr": 0.004967685204073108, + "acc_norm": 0.6287592113124876, + "acc_norm_stderr": 0.004821492994082116 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.3, + "acc_stderr": 0.04605661864718381, + "acc_norm": 0.3, + "acc_norm_stderr": 0.04605661864718381 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.22962962962962963, + "acc_stderr": 0.03633384414073461, + "acc_norm": 0.22962962962962963, + "acc_norm_stderr": 0.03633384414073461 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.32894736842105265, + "acc_stderr": 0.03823428969926603, + "acc_norm": 0.32894736842105265, + "acc_norm_stderr": 0.03823428969926603 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.21, + "acc_stderr": 0.04093601807403326, + "acc_norm": 0.21, + "acc_norm_stderr": 0.04093601807403326 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.2981132075471698, + "acc_stderr": 0.028152837942493857, + "acc_norm": 0.2981132075471698, + "acc_norm_stderr": 0.028152837942493857 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.2638888888888889, + "acc_stderr": 0.03685651095897532, + "acc_norm": 0.2638888888888889, + "acc_norm_stderr": 0.03685651095897532 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.37, + "acc_stderr": 0.04852365870939099, + "acc_norm": 0.37, + "acc_norm_stderr": 0.04852365870939099 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.33, + "acc_stderr": 
0.047258156262526045, + "acc_norm": 0.33, + "acc_norm_stderr": 0.047258156262526045 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.28, + "acc_stderr": 0.04512608598542128, + "acc_norm": 0.28, + "acc_norm_stderr": 0.04512608598542128 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.27167630057803466, + "acc_stderr": 0.0339175032232166, + "acc_norm": 0.27167630057803466, + "acc_norm_stderr": 0.0339175032232166 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.30392156862745096, + "acc_stderr": 0.045766654032077636, + "acc_norm": 0.30392156862745096, + "acc_norm_stderr": 0.045766654032077636 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.16, + "acc_stderr": 0.03684529491774708, + "acc_norm": 0.16, + "acc_norm_stderr": 0.03684529491774708 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.28936170212765955, + "acc_stderr": 0.02964400657700962, + "acc_norm": 0.28936170212765955, + "acc_norm_stderr": 0.02964400657700962 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.23684210526315788, + "acc_stderr": 0.039994238792813344, + "acc_norm": 0.23684210526315788, + "acc_norm_stderr": 0.039994238792813344 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.2827586206896552, + "acc_stderr": 0.037528339580033376, + "acc_norm": 0.2827586206896552, + "acc_norm_stderr": 0.037528339580033376 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.23809523809523808, + "acc_stderr": 0.021935878081184756, + "acc_norm": 0.23809523809523808, + "acc_norm_stderr": 0.021935878081184756 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.3888888888888889, + "acc_stderr": 0.04360314860077459, + "acc_norm": 0.3888888888888889, + "acc_norm_stderr": 0.04360314860077459 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.33, + "acc_stderr": 0.047258156262526045, + "acc_norm": 0.33, + "acc_norm_stderr": 0.047258156262526045 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.3258064516129032, + "acc_stderr": 0.0266620105785671, + "acc_norm": 0.3258064516129032, + "acc_norm_stderr": 0.0266620105785671 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.2857142857142857, + "acc_stderr": 0.0317852971064275, + "acc_norm": 0.2857142857142857, + "acc_norm_stderr": 0.0317852971064275 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.18, + "acc_stderr": 0.038612291966536955, + "acc_norm": 0.18, + "acc_norm_stderr": 0.038612291966536955 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.23030303030303031, + "acc_stderr": 0.0328766675860349, + "acc_norm": 0.23030303030303031, + "acc_norm_stderr": 0.0328766675860349 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.35858585858585856, + "acc_stderr": 0.03416903640391521, + "acc_norm": 0.35858585858585856, + "acc_norm_stderr": 0.03416903640391521 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.36787564766839376, + "acc_stderr": 0.03480175668466036, + "acc_norm": 0.36787564766839376, + "acc_norm_stderr": 0.03480175668466036 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.34615384615384615, + "acc_stderr": 0.024121125416941183, + "acc_norm": 0.34615384615384615, + "acc_norm_stderr": 0.024121125416941183 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.25925925925925924, + "acc_stderr": 0.026719240783712177, + "acc_norm": 0.25925925925925924, + "acc_norm_stderr": 
0.026719240783712177 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.33613445378151263, + "acc_stderr": 0.030684737115135356, + "acc_norm": 0.33613445378151263, + "acc_norm_stderr": 0.030684737115135356 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.33774834437086093, + "acc_stderr": 0.03861557546255169, + "acc_norm": 0.33774834437086093, + "acc_norm_stderr": 0.03861557546255169 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.3431192660550459, + "acc_stderr": 0.02035477773608604, + "acc_norm": 0.3431192660550459, + "acc_norm_stderr": 0.02035477773608604 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.4351851851851852, + "acc_stderr": 0.033812000056435254, + "acc_norm": 0.4351851851851852, + "acc_norm_stderr": 0.033812000056435254 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.2549019607843137, + "acc_stderr": 0.030587591351604246, + "acc_norm": 0.2549019607843137, + "acc_norm_stderr": 0.030587591351604246 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.20675105485232068, + "acc_stderr": 0.0263616516683891, + "acc_norm": 0.20675105485232068, + "acc_norm_stderr": 0.0263616516683891 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.15695067264573992, + "acc_stderr": 0.024413587174907412, + "acc_norm": 0.15695067264573992, + "acc_norm_stderr": 0.024413587174907412 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.2824427480916031, + "acc_stderr": 0.03948406125768361, + "acc_norm": 0.2824427480916031, + "acc_norm_stderr": 0.03948406125768361 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.14049586776859505, + "acc_stderr": 0.03172233426002161, + "acc_norm": 0.14049586776859505, + "acc_norm_stderr": 0.03172233426002161 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.23148148148148148, + "acc_stderr": 0.04077494709252628, + "acc_norm": 0.23148148148148148, + "acc_norm_stderr": 0.04077494709252628 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.22085889570552147, + "acc_stderr": 0.032591773927421776, + "acc_norm": 0.22085889570552147, + "acc_norm_stderr": 0.032591773927421776 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.20535714285714285, + "acc_stderr": 0.038342410214190735, + "acc_norm": 0.20535714285714285, + "acc_norm_stderr": 0.038342410214190735 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.4174757281553398, + "acc_stderr": 0.04882840548212237, + "acc_norm": 0.4174757281553398, + "acc_norm_stderr": 0.04882840548212237 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.18803418803418803, + "acc_stderr": 0.025598193686652244, + "acc_norm": 0.18803418803418803, + "acc_norm_stderr": 0.025598193686652244 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.28, + "acc_stderr": 0.04512608598542127, + "acc_norm": 0.28, + "acc_norm_stderr": 0.04512608598542127 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.210727969348659, + "acc_stderr": 0.014583812465862553, + "acc_norm": 0.210727969348659, + "acc_norm_stderr": 0.014583812465862553 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.22832369942196531, + "acc_stderr": 0.02259870380432162, + "acc_norm": 0.22832369942196531, + "acc_norm_stderr": 0.02259870380432162 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.2424581005586592, + "acc_stderr": 0.014333522059217889, + "acc_norm": 0.2424581005586592, + "acc_norm_stderr": 0.014333522059217889 + }, + 
"harness|hendrycksTest-nutrition|5": { + "acc": 0.3006535947712418, + "acc_stderr": 0.02625605383571896, + "acc_norm": 0.3006535947712418, + "acc_norm_stderr": 0.02625605383571896 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.26688102893890675, + "acc_stderr": 0.02512263760881664, + "acc_norm": 0.26688102893890675, + "acc_norm_stderr": 0.02512263760881664 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.25308641975308643, + "acc_stderr": 0.02419180860071301, + "acc_norm": 0.25308641975308643, + "acc_norm_stderr": 0.02419180860071301 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.2624113475177305, + "acc_stderr": 0.026244920349843003, + "acc_norm": 0.2624113475177305, + "acc_norm_stderr": 0.026244920349843003 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.2457627118644068, + "acc_stderr": 0.010996156635142695, + "acc_norm": 0.2457627118644068, + "acc_norm_stderr": 0.010996156635142695 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.4485294117647059, + "acc_stderr": 0.030211479609121593, + "acc_norm": 0.4485294117647059, + "acc_norm_stderr": 0.030211479609121593 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.22058823529411764, + "acc_stderr": 0.01677467236546851, + "acc_norm": 0.22058823529411764, + "acc_norm_stderr": 0.01677467236546851 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.2727272727272727, + "acc_stderr": 0.04265792110940589, + "acc_norm": 0.2727272727272727, + "acc_norm_stderr": 0.04265792110940589 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.39591836734693875, + "acc_stderr": 0.03130802899065686, + "acc_norm": 0.39591836734693875, + "acc_norm_stderr": 0.03130802899065686 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.263681592039801, + "acc_stderr": 0.03115715086935556, + "acc_norm": 0.263681592039801, + "acc_norm_stderr": 0.03115715086935556 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.35, + "acc_stderr": 0.0479372485441102, + "acc_norm": 0.35, + "acc_norm_stderr": 0.0479372485441102 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.21084337349397592, + "acc_stderr": 0.0317555478662992, + "acc_norm": 0.21084337349397592, + "acc_norm_stderr": 0.0317555478662992 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.14619883040935672, + "acc_stderr": 0.027097290118070803, + "acc_norm": 0.14619883040935672, + "acc_norm_stderr": 0.027097290118070803 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.20318237454100369, + "mc1_stderr": 0.014085666526340882, + "mc2": 0.35665813452391837, + "mc2_stderr": 0.014271431688144938 + }, + "all": { + "acc": 0.28185588236286707, + "acc_stderr": 0.03225753349873974, + "acc_norm": 0.2855290591736718, + "acc_norm_stderr": 0.03226027924923892, + "mc1": 0.20318237454100369, + "mc1_stderr": 0.014085666526340882, + "mc2": 0.35665813452391837, + "mc2_stderr": 0.014271431688144938 + } + }, + "versions": { + "harness|arc:challenge|25": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + 
"harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "all": 0 + }, + "config_tasks": { + "harness|arc:challenge": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + 
"harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task" + }, + "summary_tasks": { + "harness|arc:challenge|25": { + "hashes": { + "hash_examples": "17b0cae357c0259e", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "61571bf68d6d89aa", + "hash_cont_tokens": "8210decc6ff6f7df" + }, + "truncated": 0, + "non-truncated": 4687, + "padded": 4687, + "non-padded": 0, + "effective_few_shots": 25.0, + "num_truncated_few_shots": 0 + }, + "harness|hellaswag|10": { + "hashes": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "29906669b1c7054a", + "hash_cont_tokens": "b3b9e9017afa63af" + }, + "truncated": 0, + "non-truncated": 40168, + "padded": 40113, + "non-padded": 55, + "effective_few_shots": 10.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hashes": { + "hash_examples": "280f9f325b40559a", + 
"hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "c54ff61ad0273dd7", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-anatomy|5": { + "hashes": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "be31a1e22aef5f90", + "hash_cont_tokens": "f11971a765cb609f" + }, + "truncated": 0, + "non-truncated": 540, + "padded": 540, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-astronomy|5": { + "hashes": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "277a7b1fad566940", + "hash_cont_tokens": "bf30e5d3f48250cb" + }, + "truncated": 0, + "non-truncated": 608, + "padded": 608, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-business_ethics|5": { + "hashes": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "ba552605bc116de5", + "hash_cont_tokens": "bc1dd9b2d995eb61" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hashes": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "428c7563d0b98ab9", + "hash_cont_tokens": "890a119624b3b935" + }, + "truncated": 0, + "non-truncated": 1060, + "padded": 1060, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_biology|5": { + "hashes": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "da036601573942e2", + "hash_cont_tokens": "875cde3af7a0ee14" + }, + "truncated": 0, + "non-truncated": 576, + "padded": 576, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_chemistry|5": { + "hashes": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "94e0196d6aded13d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_computer_science|5": { + "hashes": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "6e4d0f4a8d36690b", + "hash_cont_tokens": "ffc0fe414cdc4a83" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_mathematics|5": { + "hashes": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "614054d17109a25d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_medicine|5": { + "hashes": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "1d633b3cc0524ba8", + "hash_cont_tokens": "1f88b00d41957d82" + }, + "truncated": 0, + 
"non-truncated": 692, + "padded": 692, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_physics|5": { + "hashes": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "5421d9a1af86cbd4", + "hash_cont_tokens": "f7b8097afc16a47c" + }, + "truncated": 0, + "non-truncated": 408, + "padded": 408, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-computer_security|5": { + "hashes": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "5e6b70ecb333cf18", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hashes": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "c2ef11a87264ceed", + "hash_cont_tokens": "aa0e8bc655f2f641" + }, + "truncated": 0, + "non-truncated": 940, + "padded": 940, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-econometrics|5": { + "hashes": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "ecaccd912a4c3978", + "hash_cont_tokens": "bfb7e3c3c88313f1" + }, + "truncated": 0, + "non-truncated": 456, + "padded": 456, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hashes": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "1590c84291399be8", + "hash_cont_tokens": "2425a3f084a591ef" + }, + "truncated": 0, + "non-truncated": 580, + "padded": 580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hashes": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "3269597f715b0da1", + "hash_cont_tokens": "f52691aef15a407b" + }, + "truncated": 0, + "non-truncated": 1512, + "padded": 1512, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-formal_logic|5": { + "hashes": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "a2800d20f3ab8d7c", + "hash_cont_tokens": "f515d598d9c21263" + }, + "truncated": 0, + "non-truncated": 504, + "padded": 504, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-global_facts|5": { + "hashes": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "94ed44b3772505ad", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_biology|5": { + "hashes": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "24423acb928db768", + "hash_cont_tokens": "bd85a4156a3613ee" + }, + "truncated": 0, + "non-truncated": 1240, + "padded": 1240, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + 
"harness|hendrycksTest-high_school_chemistry|5": { + "hashes": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "831ff35c474e5cef", + "hash_cont_tokens": "a95c97af1c14e068" + }, + "truncated": 0, + "non-truncated": 812, + "padded": 812, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hashes": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "8c34e0f2bda77358", + "hash_cont_tokens": "8abfedef914e33c9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hashes": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "f1f73dd687da18d7", + "hash_cont_tokens": "674fc454bdc5ac93" + }, + "truncated": 660, + "non-truncated": 0, + "padded": 0, + "non-padded": 660, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_geography|5": { + "hashes": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "7c5547c7da5bc793", + "hash_cont_tokens": "03a5012b916274ea" + }, + "truncated": 0, + "non-truncated": 792, + "padded": 792, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hashes": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "f62991cb6a496b05", + "hash_cont_tokens": "a83effb8f76b7d7c" + }, + "truncated": 0, + "non-truncated": 772, + "padded": 772, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hashes": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "4cef2aff6e3d59ed", + "hash_cont_tokens": "c583432ad27fcfe0" + }, + "truncated": 0, + "non-truncated": 1560, + "padded": 1560, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hashes": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "6e2577ea4082ed2b", + "hash_cont_tokens": "24f5dc613660300b" + }, + "truncated": 0, + "non-truncated": 1080, + "padded": 1080, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hashes": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "c5fc9aeb1079c8e4", + "hash_cont_tokens": "f47f041de50333b9" + }, + "truncated": 0, + "non-truncated": 952, + "padded": 952, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_physics|5": { + "hashes": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "555fc385cffa84ca", + "hash_cont_tokens": "ba2efcd283e938cc" + }, + "truncated": 0, + "non-truncated": 604, + "padded": 604, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hashes": { + 
"hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "febd23cbf9973b7f", + "hash_cont_tokens": "942069cd363844d9" + }, + "truncated": 0, + "non-truncated": 2180, + "padded": 2180, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hashes": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "424b02981230ee83", + "hash_cont_tokens": "955ed42b6f7fa019" + }, + "truncated": 0, + "non-truncated": 864, + "padded": 864, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hashes": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "50c9ff438c85a69e", + "hash_cont_tokens": "cdd0b3dc06d933e5" + }, + "truncated": 816, + "non-truncated": 0, + "padded": 0, + "non-padded": 816, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hashes": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "054824cc474caef5", + "hash_cont_tokens": "9a864184946033ac" + }, + "truncated": 8, + "non-truncated": 940, + "padded": 940, + "non-padded": 8, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_aging|5": { + "hashes": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "541a75f071dcf579", + "hash_cont_tokens": "142a4a8a1138a214" + }, + "truncated": 0, + "non-truncated": 892, + "padded": 892, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_sexuality|5": { + "hashes": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "04269e5c5a257dd9", + "hash_cont_tokens": "bc54813e809b796d" + }, + "truncated": 0, + "non-truncated": 524, + "padded": 524, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-international_law|5": { + "hashes": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "d93ba9d9d38e4397", + "hash_cont_tokens": "dc45b45fcda18e5d" + }, + "truncated": 0, + "non-truncated": 484, + "padded": 484, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-jurisprudence|5": { + "hashes": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "9eeaccd2698b4f5a", + "hash_cont_tokens": "e3a8cd951b6e3469" + }, + "truncated": 0, + "non-truncated": 432, + "padded": 432, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hashes": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "b4f08f544f2b7576", + "hash_cont_tokens": "1e80dbd30f6453d5" + }, + "truncated": 0, + "non-truncated": 652, + "padded": 648, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-machine_learning|5": { + "hashes": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "900c2a51f1174b9f", + "hash_cont_tokens": 
"9b37da7777378ca9" + }, + "truncated": 0, + "non-truncated": 448, + "padded": 448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-management|5": { + "hashes": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "6b36efb4689c6eca", + "hash_cont_tokens": "a01d6d39a83c4597" + }, + "truncated": 0, + "non-truncated": 412, + "padded": 412, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-marketing|5": { + "hashes": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "2aaac78a0cfed47a", + "hash_cont_tokens": "6aeaed4d823c98aa" + }, + "truncated": 0, + "non-truncated": 936, + "padded": 936, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-medical_genetics|5": { + "hashes": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "886ca823b41c094a", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-miscellaneous|5": { + "hashes": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "72fd71de7675e7d0", + "hash_cont_tokens": "9b0ab02a64603081" + }, + "truncated": 0, + "non-truncated": 3132, + "padded": 3132, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_disputes|5": { + "hashes": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "f3ca0dd8e7a1eb09", + "hash_cont_tokens": "8badf768f7b0467a" + }, + "truncated": 0, + "non-truncated": 1384, + "padded": 1354, + "non-padded": 30, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hashes": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "3e793631e951f23c", + "hash_cont_tokens": "32ae620376b2bbba" + }, + "truncated": 0, + "non-truncated": 3580, + "padded": 3580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-nutrition|5": { + "hashes": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "59753c2144ea93af", + "hash_cont_tokens": "3071def75bacc404" + }, + "truncated": 0, + "non-truncated": 1224, + "padded": 1224, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-philosophy|5": { + "hashes": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "bd8d3dbed15a8c34", + "hash_cont_tokens": "9f6ff69d23a48783" + }, + "truncated": 0, + "non-truncated": 1244, + "padded": 1244, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-prehistory|5": { + "hashes": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "3573cd87facbb7c5", + "hash_cont_tokens": "de469d2b981e32a3" + }, + "truncated": 0, + "non-truncated": 1296, + "padded": 1296, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + 
"harness|hendrycksTest-professional_accounting|5": { + "hashes": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "17e721bc1a7cbb47", + "hash_cont_tokens": "c46f74d2dfc7b13b" + }, + "truncated": 0, + "non-truncated": 1128, + "padded": 1128, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_law|5": { + "hashes": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "9178e10bd0763ec4", + "hash_cont_tokens": "2e590029ef41fbcd" + }, + "truncated": 604, + "non-truncated": 5532, + "padded": 5524, + "non-padded": 612, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_medicine|5": { + "hashes": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "f5a22012a54f70ea", + "hash_cont_tokens": "fe35cfa9c6ca802e" + }, + "truncated": 0, + "non-truncated": 1088, + "padded": 1088, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_psychology|5": { + "hashes": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "0dfb73a8eb3f692c", + "hash_cont_tokens": "f020fbddf72c8652" + }, + "truncated": 0, + "non-truncated": 2448, + "padded": 2448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-public_relations|5": { + "hashes": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "1710c6ba4c9f3cbd", + "hash_cont_tokens": "568f585a259965c1" + }, + "truncated": 0, + "non-truncated": 440, + "padded": 440, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-security_studies|5": { + "hashes": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "d49711415961ced7", + "hash_cont_tokens": "cc6fd7cccd64cd5d" + }, + "truncated": 0, + "non-truncated": 980, + "padded": 980, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-sociology|5": { + "hashes": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "828999f7624cbe7e", + "hash_cont_tokens": "c3a3bdfd177eed5b" + }, + "truncated": 0, + "non-truncated": 804, + "padded": 804, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hashes": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "42054621e718dbee", + "hash_cont_tokens": "2568d0e8e36fa959" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-virology|5": { + "hashes": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "6c4f0aa4dc859c04", + "hash_cont_tokens": "926cf60b0891f374" + }, + "truncated": 0, + "non-truncated": 664, + "padded": 664, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-world_religions|5": { + "hashes": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + 
"hash_input_tokens": "6c75d44e092ff24f", + "hash_cont_tokens": "c525a5de974c1ea3" + }, + "truncated": 0, + "non-truncated": 684, + "padded": 684, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|truthfulqa:mc|0": { + "hashes": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "2738d7ed7075faa7", + "hash_cont_tokens": "c014154380b74b9e" + }, + "truncated": 0, + "non-truncated": 9996, + "padded": 9996, + "non-padded": 0, + "effective_few_shots": 0.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "d84d18e9a963753d", + "hash_full_prompts": "12b540783521a8e6", + "hash_input_tokens": "6fecf578c508db6a", + "hash_cont_tokens": "fb1646e2bdd5fc38" + }, + "total_evaluation_time_secondes": "6475.037744998932", + "truncated": 2088, + "non-truncated": 108931, + "padded": 108834, + "non-padded": 2185, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/Ejafa/vicuna_7B_vanilla_1.1/results_2023-07-19T16-40-36.774019.json b/eval-results/Ejafa/vicuna_7B_vanilla_1.1/results_2023-07-19T16-40-36.774019.json new file mode 100644 index 0000000000000000000000000000000000000000..c5002e31fa317a9303eeb0267a93b8f4db0f6339 --- /dev/null +++ b/eval-results/Ejafa/vicuna_7B_vanilla_1.1/results_2023-07-19T16-40-36.774019.json @@ -0,0 +1,871 @@ +{ + "results": { + "harness|arc:challenge|25": { + "acc": 0.4991467576791809, + "acc_stderr": 0.014611369529813279, + "acc_norm": 0.5366894197952219, + "acc_norm_stderr": 0.014572000527756989 + }, + "harness|hellaswag|10": { + "acc": 0.5844453296156145, + "acc_stderr": 0.004918102168717934, + "acc_norm": 0.7746464847639912, + "acc_norm_stderr": 0.00416961025480796 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.25, + "acc_stderr": 0.04351941398892446, + "acc_norm": 0.25, + "acc_norm_stderr": 0.04351941398892446 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.4, + "acc_stderr": 0.04232073695151589, + "acc_norm": 0.4, + "acc_norm_stderr": 0.04232073695151589 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.4934210526315789, + "acc_stderr": 0.040685900502249704, + "acc_norm": 0.4934210526315789, + "acc_norm_stderr": 0.040685900502249704 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.46, + "acc_stderr": 0.05009082659620332, + "acc_norm": 0.46, + "acc_norm_stderr": 0.05009082659620332 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.5132075471698113, + "acc_stderr": 0.030762134874500482, + "acc_norm": 0.5132075471698113, + "acc_norm_stderr": 0.030762134874500482 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.4444444444444444, + "acc_stderr": 0.04155319955593146, + "acc_norm": 0.4444444444444444, + "acc_norm_stderr": 0.04155319955593146 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.37, + "acc_stderr": 0.048523658709391, + "acc_norm": 0.37, + "acc_norm_stderr": 0.048523658709391 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.39, + "acc_stderr": 0.04902071300001975, + "acc_norm": 0.39, + "acc_norm_stderr": 0.04902071300001975 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.34, + "acc_stderr": 0.047609522856952365, + "acc_norm": 0.34, + "acc_norm_stderr": 0.047609522856952365 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.35260115606936415, + "acc_stderr": 0.036430371689585475, + "acc_norm": 0.35260115606936415, + "acc_norm_stderr": 
0.036430371689585475 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.2647058823529412, + "acc_stderr": 0.043898699568087785, + "acc_norm": 0.2647058823529412, + "acc_norm_stderr": 0.043898699568087785 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.56, + "acc_stderr": 0.04988876515698589, + "acc_norm": 0.56, + "acc_norm_stderr": 0.04988876515698589 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.3829787234042553, + "acc_stderr": 0.03177821250236922, + "acc_norm": 0.3829787234042553, + "acc_norm_stderr": 0.03177821250236922 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.24561403508771928, + "acc_stderr": 0.04049339297748142, + "acc_norm": 0.24561403508771928, + "acc_norm_stderr": 0.04049339297748142 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.46206896551724136, + "acc_stderr": 0.041546596717075474, + "acc_norm": 0.46206896551724136, + "acc_norm_stderr": 0.041546596717075474 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.28835978835978837, + "acc_stderr": 0.023330654054535886, + "acc_norm": 0.28835978835978837, + "acc_norm_stderr": 0.023330654054535886 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.3492063492063492, + "acc_stderr": 0.04263906892795132, + "acc_norm": 0.3492063492063492, + "acc_norm_stderr": 0.04263906892795132 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.27, + "acc_stderr": 0.044619604333847394, + "acc_norm": 0.27, + "acc_norm_stderr": 0.044619604333847394 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.4612903225806452, + "acc_stderr": 0.028358634859836928, + "acc_norm": 0.4612903225806452, + "acc_norm_stderr": 0.028358634859836928 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.3251231527093596, + "acc_stderr": 0.032957975663112704, + "acc_norm": 0.3251231527093596, + "acc_norm_stderr": 0.032957975663112704 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.39, + "acc_stderr": 0.04902071300001974, + "acc_norm": 0.39, + "acc_norm_stderr": 0.04902071300001974 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.5878787878787879, + "acc_stderr": 0.038435669935887186, + "acc_norm": 0.5878787878787879, + "acc_norm_stderr": 0.038435669935887186 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.5959595959595959, + "acc_stderr": 0.03496130972056128, + "acc_norm": 0.5959595959595959, + "acc_norm_stderr": 0.03496130972056128 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.6683937823834197, + "acc_stderr": 0.03397636541089118, + "acc_norm": 0.6683937823834197, + "acc_norm_stderr": 0.03397636541089118 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.4, + "acc_stderr": 0.024838811988033165, + "acc_norm": 0.4, + "acc_norm_stderr": 0.024838811988033165 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.22962962962962963, + "acc_stderr": 0.025644108639267634, + "acc_norm": 0.22962962962962963, + "acc_norm_stderr": 0.025644108639267634 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.3865546218487395, + "acc_stderr": 0.03163145807552378, + "acc_norm": 0.3865546218487395, + "acc_norm_stderr": 0.03163145807552378 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.271523178807947, + "acc_stderr": 0.03631329803969654, + "acc_norm": 0.271523178807947, + "acc_norm_stderr": 0.03631329803969654 + }, + 
"harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.6, + "acc_stderr": 0.021004201260420075, + "acc_norm": 0.6, + "acc_norm_stderr": 0.021004201260420075 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.3888888888888889, + "acc_stderr": 0.03324708911809117, + "acc_norm": 0.3888888888888889, + "acc_norm_stderr": 0.03324708911809117 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.6029411764705882, + "acc_stderr": 0.0343413116471913, + "acc_norm": 0.6029411764705882, + "acc_norm_stderr": 0.0343413116471913 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.5780590717299579, + "acc_stderr": 0.032148146302403695, + "acc_norm": 0.5780590717299579, + "acc_norm_stderr": 0.032148146302403695 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.5381165919282511, + "acc_stderr": 0.033460150119732274, + "acc_norm": 0.5381165919282511, + "acc_norm_stderr": 0.033460150119732274 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.5648854961832062, + "acc_stderr": 0.04348208051644858, + "acc_norm": 0.5648854961832062, + "acc_norm_stderr": 0.04348208051644858 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.5702479338842975, + "acc_stderr": 0.04519082021319772, + "acc_norm": 0.5702479338842975, + "acc_norm_stderr": 0.04519082021319772 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.5555555555555556, + "acc_stderr": 0.04803752235190192, + "acc_norm": 0.5555555555555556, + "acc_norm_stderr": 0.04803752235190192 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.50920245398773, + "acc_stderr": 0.03927705600787443, + "acc_norm": 0.50920245398773, + "acc_norm_stderr": 0.03927705600787443 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.30357142857142855, + "acc_stderr": 0.04364226155841044, + "acc_norm": 0.30357142857142855, + "acc_norm_stderr": 0.04364226155841044 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.5728155339805825, + "acc_stderr": 0.04897957737781168, + "acc_norm": 0.5728155339805825, + "acc_norm_stderr": 0.04897957737781168 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.6837606837606838, + "acc_stderr": 0.030463656747340265, + "acc_norm": 0.6837606837606838, + "acc_norm_stderr": 0.030463656747340265 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.48, + "acc_stderr": 0.050211673156867795, + "acc_norm": 0.48, + "acc_norm_stderr": 0.050211673156867795 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.6296296296296297, + "acc_stderr": 0.01726860756000578, + "acc_norm": 0.6296296296296297, + "acc_norm_stderr": 0.01726860756000578 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.5057803468208093, + "acc_stderr": 0.026917296179149123, + "acc_norm": 0.5057803468208093, + "acc_norm_stderr": 0.026917296179149123 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.2446927374301676, + "acc_stderr": 0.014378169884098409, + "acc_norm": 0.2446927374301676, + "acc_norm_stderr": 0.014378169884098409 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.4869281045751634, + "acc_stderr": 0.028620130800700246, + "acc_norm": 0.4869281045751634, + "acc_norm_stderr": 0.028620130800700246 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.48231511254019294, + "acc_stderr": 0.02838032284907713, + "acc_norm": 0.48231511254019294, + "acc_norm_stderr": 0.02838032284907713 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.5185185185185185, + "acc_stderr": 0.027801656212323667, + 
"acc_norm": 0.5185185185185185, + "acc_norm_stderr": 0.027801656212323667 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.3404255319148936, + "acc_stderr": 0.028267657482650147, + "acc_norm": 0.3404255319148936, + "acc_norm_stderr": 0.028267657482650147 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.34876140808344197, + "acc_stderr": 0.01217203515712712, + "acc_norm": 0.34876140808344197, + "acc_norm_stderr": 0.01217203515712712 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.5036764705882353, + "acc_stderr": 0.030372015885428195, + "acc_norm": 0.5036764705882353, + "acc_norm_stderr": 0.030372015885428195 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.4395424836601307, + "acc_stderr": 0.020079420408087918, + "acc_norm": 0.4395424836601307, + "acc_norm_stderr": 0.020079420408087918 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.5181818181818182, + "acc_stderr": 0.04785964010794916, + "acc_norm": 0.5181818181818182, + "acc_norm_stderr": 0.04785964010794916 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.5387755102040817, + "acc_stderr": 0.031912820526692774, + "acc_norm": 0.5387755102040817, + "acc_norm_stderr": 0.031912820526692774 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.6318407960199005, + "acc_stderr": 0.03410410565495302, + "acc_norm": 0.6318407960199005, + "acc_norm_stderr": 0.03410410565495302 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.63, + "acc_stderr": 0.048523658709391, + "acc_norm": 0.63, + "acc_norm_stderr": 0.048523658709391 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.42168674698795183, + "acc_stderr": 0.03844453181770917, + "acc_norm": 0.42168674698795183, + "acc_norm_stderr": 0.03844453181770917 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.6666666666666666, + "acc_stderr": 0.036155076303109365, + "acc_norm": 0.6666666666666666, + "acc_norm_stderr": 0.036155076303109365 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.33047735618115054, + "mc1_stderr": 0.0164667696136983, + "mc2": 0.48940747456304606, + "mc2_stderr": 0.015298126884049629 + }, + "all": { + "acc": 0.4591867762728675, + "acc_stderr": 0.035239355625612076, + "acc_norm": 0.46304684097226445, + "acc_norm_stderr": 0.03522600205076519, + "mc1": 0.33047735618115054, + "mc1_stderr": 0.0164667696136983, + "mc2": 0.48940747456304606, + "mc2_stderr": 0.015298126884049629 + } + }, + "versions": { + "harness|arc:challenge|25": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + 
"harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "all": 0 + }, + "config": { + "model_name": "Ejafa/vicuna_7B_vanilla_1.1", + "model_sha": "d971d788db19648ad16bf77ec3f1de35ebf9a8e0", + "model_dtype": "torch.float16", + "lighteval_sha": "43cff840721bd0214adb4e29236a5e2ca1813937", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null + }, + "task_config": { + "harness|arc:challenge": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", 
+ "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task" + }, + "hashes": { + "harness|arc:challenge|25": { + "hash_examples": "fb8c51b1872daeda", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "61571bf68d6d89aa", + "hash_cont_tokens": "8210decc6ff6f7df" + }, + "harness|hellaswag|10": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "29906669b1c7054a", + "hash_cont_tokens": "b3b9e9017afa63af" + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "c54ff61ad0273dd7", + "hash_cont_tokens": "50421e30bef398f9" + }, + "harness|hendrycksTest-anatomy|5": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "be31a1e22aef5f90", + "hash_cont_tokens": "f11971a765cb609f" + }, + "harness|hendrycksTest-astronomy|5": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "277a7b1fad566940", + "hash_cont_tokens": 
"bf30e5d3f48250cb" + }, + "harness|hendrycksTest-business_ethics|5": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "ba552605bc116de5", + "hash_cont_tokens": "bc1dd9b2d995eb61" + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "428c7563d0b98ab9", + "hash_cont_tokens": "890a119624b3b935" + }, + "harness|hendrycksTest-college_biology|5": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "da036601573942e2", + "hash_cont_tokens": "875cde3af7a0ee14" + }, + "harness|hendrycksTest-college_chemistry|5": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "94e0196d6aded13d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "harness|hendrycksTest-college_computer_science|5": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "6e4d0f4a8d36690b", + "hash_cont_tokens": "ffc0fe414cdc4a83" + }, + "harness|hendrycksTest-college_mathematics|5": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "614054d17109a25d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "harness|hendrycksTest-college_medicine|5": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "1d633b3cc0524ba8", + "hash_cont_tokens": "1f88b00d41957d82" + }, + "harness|hendrycksTest-college_physics|5": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "5421d9a1af86cbd4", + "hash_cont_tokens": "f7b8097afc16a47c" + }, + "harness|hendrycksTest-computer_security|5": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "5e6b70ecb333cf18", + "hash_cont_tokens": "50421e30bef398f9" + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "c2ef11a87264ceed", + "hash_cont_tokens": "aa0e8bc655f2f641" + }, + "harness|hendrycksTest-econometrics|5": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "ecaccd912a4c3978", + "hash_cont_tokens": "bfb7e3c3c88313f1" + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "1590c84291399be8", + "hash_cont_tokens": "2425a3f084a591ef" + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "3269597f715b0da1", + "hash_cont_tokens": "f52691aef15a407b" + }, + "harness|hendrycksTest-formal_logic|5": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "a2800d20f3ab8d7c", + "hash_cont_tokens": "f515d598d9c21263" + }, + "harness|hendrycksTest-global_facts|5": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "94ed44b3772505ad", + "hash_cont_tokens": "50421e30bef398f9" + }, + "harness|hendrycksTest-high_school_biology|5": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "24423acb928db768", + "hash_cont_tokens": "bd85a4156a3613ee" + }, + 
"harness|hendrycksTest-high_school_chemistry|5": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "831ff35c474e5cef", + "hash_cont_tokens": "a95c97af1c14e068" + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "8c34e0f2bda77358", + "hash_cont_tokens": "8abfedef914e33c9" + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "f1f73dd687da18d7", + "hash_cont_tokens": "674fc454bdc5ac93" + }, + "harness|hendrycksTest-high_school_geography|5": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "7c5547c7da5bc793", + "hash_cont_tokens": "03a5012b916274ea" + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "f62991cb6a496b05", + "hash_cont_tokens": "a83effb8f76b7d7c" + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "4cef2aff6e3d59ed", + "hash_cont_tokens": "c583432ad27fcfe0" + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "6e2577ea4082ed2b", + "hash_cont_tokens": "24f5dc613660300b" + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "c5fc9aeb1079c8e4", + "hash_cont_tokens": "f47f041de50333b9" + }, + "harness|hendrycksTest-high_school_physics|5": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "555fc385cffa84ca", + "hash_cont_tokens": "ba2efcd283e938cc" + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "febd23cbf9973b7f", + "hash_cont_tokens": "942069cd363844d9" + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "424b02981230ee83", + "hash_cont_tokens": "955ed42b6f7fa019" + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "50c9ff438c85a69e", + "hash_cont_tokens": "cdd0b3dc06d933e5" + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "054824cc474caef5", + "hash_cont_tokens": "9a864184946033ac" + }, + "harness|hendrycksTest-human_aging|5": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "541a75f071dcf579", + "hash_cont_tokens": "142a4a8a1138a214" + }, + "harness|hendrycksTest-human_sexuality|5": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "04269e5c5a257dd9", + "hash_cont_tokens": "bc54813e809b796d" + }, + "harness|hendrycksTest-international_law|5": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": 
"d93ba9d9d38e4397", + "hash_cont_tokens": "dc45b45fcda18e5d" + }, + "harness|hendrycksTest-jurisprudence|5": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "9eeaccd2698b4f5a", + "hash_cont_tokens": "e3a8cd951b6e3469" + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "b4f08f544f2b7576", + "hash_cont_tokens": "1e80dbd30f6453d5" + }, + "harness|hendrycksTest-machine_learning|5": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "900c2a51f1174b9f", + "hash_cont_tokens": "9b37da7777378ca9" + }, + "harness|hendrycksTest-management|5": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "6b36efb4689c6eca", + "hash_cont_tokens": "a01d6d39a83c4597" + }, + "harness|hendrycksTest-marketing|5": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "2aaac78a0cfed47a", + "hash_cont_tokens": "6aeaed4d823c98aa" + }, + "harness|hendrycksTest-medical_genetics|5": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "886ca823b41c094a", + "hash_cont_tokens": "50421e30bef398f9" + }, + "harness|hendrycksTest-miscellaneous|5": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "72fd71de7675e7d0", + "hash_cont_tokens": "9b0ab02a64603081" + }, + "harness|hendrycksTest-moral_disputes|5": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "f3ca0dd8e7a1eb09", + "hash_cont_tokens": "8badf768f7b0467a" + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "3e793631e951f23c", + "hash_cont_tokens": "32ae620376b2bbba" + }, + "harness|hendrycksTest-nutrition|5": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "59753c2144ea93af", + "hash_cont_tokens": "3071def75bacc404" + }, + "harness|hendrycksTest-philosophy|5": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "bd8d3dbed15a8c34", + "hash_cont_tokens": "9f6ff69d23a48783" + }, + "harness|hendrycksTest-prehistory|5": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "3573cd87facbb7c5", + "hash_cont_tokens": "de469d2b981e32a3" + }, + "harness|hendrycksTest-professional_accounting|5": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "17e721bc1a7cbb47", + "hash_cont_tokens": "c46f74d2dfc7b13b" + }, + "harness|hendrycksTest-professional_law|5": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "9178e10bd0763ec4", + "hash_cont_tokens": "2e590029ef41fbcd" + }, + "harness|hendrycksTest-professional_medicine|5": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "f5a22012a54f70ea", + "hash_cont_tokens": "fe35cfa9c6ca802e" + }, + "harness|hendrycksTest-professional_psychology|5": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "0dfb73a8eb3f692c", + "hash_cont_tokens": "f020fbddf72c8652" + }, + 
"harness|hendrycksTest-public_relations|5": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "1710c6ba4c9f3cbd", + "hash_cont_tokens": "568f585a259965c1" + }, + "harness|hendrycksTest-security_studies|5": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "d49711415961ced7", + "hash_cont_tokens": "cc6fd7cccd64cd5d" + }, + "harness|hendrycksTest-sociology|5": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "828999f7624cbe7e", + "hash_cont_tokens": "c3a3bdfd177eed5b" + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "42054621e718dbee", + "hash_cont_tokens": "2568d0e8e36fa959" + }, + "harness|hendrycksTest-virology|5": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "6c4f0aa4dc859c04", + "hash_cont_tokens": "926cf60b0891f374" + }, + "harness|hendrycksTest-world_religions|5": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "6c75d44e092ff24f", + "hash_cont_tokens": "c525a5de974c1ea3" + }, + "harness|truthfulqa:mc|0": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "2738d7ed7075faa7", + "hash_cont_tokens": "c014154380b74b9e" + } + } +} \ No newline at end of file diff --git a/eval-results/Ejafa/vicuna_7B_vanilla_1.1/results_2023-10-17T05-49-12.117200.json b/eval-results/Ejafa/vicuna_7B_vanilla_1.1/results_2023-10-17T05-49-12.117200.json new file mode 100644 index 0000000000000000000000000000000000000000..fa98c355476c8ff92cff4a07c798b4c52f100edb --- /dev/null +++ b/eval-results/Ejafa/vicuna_7B_vanilla_1.1/results_2023-10-17T05-49-12.117200.json @@ -0,0 +1,107 @@ +{ + "config_general": { + "model_name": "Ejafa/vicuna_7B_vanilla_1.1", + "model_sha": "d971d788db19648ad16bf77ec3f1de35ebf9a8e0", + "model_size": "12.58 GB", + "model_dtype": "torch.float16", + "lighteval_sha": "0f318ecf002208468154899217b3ba7c6ae09374", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|drop|3": { + "em": 0.11388422818791946, + "em_stderr": 0.00325324428862373, + "f1": 0.16976719798657605, + "f1_stderr": 0.003380156230610554 + }, + "harness|gsm8k|5": { + "acc": 0.05534495830174375, + "acc_stderr": 0.006298221796179588 + }, + "harness|winogrande|5": { + "acc": 0.7095501183898973, + "acc_stderr": 0.012758813448064607 + }, + "all": { + "em": 0.11388422818791946, + "em_stderr": 0.00325324428862373, + "f1": 0.16976719798657605, + "f1_stderr": 0.003380156230610554, + "acc": 0.38244753834582057, + "acc_stderr": 0.009528517622122097 + } + }, + "versions": { + "harness|drop|3": 1, + "harness|gsm8k|5": 0, + "harness|winogrande|5": 0, + "all": 0 + }, + "config_tasks": { + "harness|drop": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|drop|3": { + "hashes": { + "hash_examples": "1d27416e8324e9a3", + "hash_full_prompts": "a5513ff9a741b385", + "hash_input_tokens": "61b608e0b5ceed76", + "hash_cont_tokens": "189e1ba7e9e7706d" + }, + "truncated": 1263, + "non-truncated": 8273, + "padded": 0, + "non-padded": 9536, + "effective_few_shots": 3.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { 
+ "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "bda342e47b5099b2", + "hash_cont_tokens": "9b203c8b6ca577ae" + }, + "truncated": 0, + "non-truncated": 1319, + "padded": 0, + "non-padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "c0bedf98cb040854", + "hash_cont_tokens": "f08975ad6f2d5864" + }, + "truncated": 0, + "non-truncated": 2534, + "padded": 2432, + "non-padded": 102, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "9b4d8993161e637d", + "hash_full_prompts": "08215e527b7e60a5", + "hash_input_tokens": "80afe720f936f8d2", + "hash_cont_tokens": "bd4d86debbf9bd41" + }, + "total_evaluation_time_secondes": "9764.47821855545", + "truncated": 1263, + "non-truncated": 12126, + "padded": 2432, + "non-padded": 10957, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/Enno-Ai/ennodata-13b-8bit-raw-15epoch/results_2023-10-04T01-56-25.933600.json b/eval-results/Enno-Ai/ennodata-13b-8bit-raw-15epoch/results_2023-10-04T01-56-25.933600.json new file mode 100644 index 0000000000000000000000000000000000000000..b3c2d26f61a9cb052cb22194ae3795186ea5e5fc --- /dev/null +++ b/eval-results/Enno-Ai/ennodata-13b-8bit-raw-15epoch/results_2023-10-04T01-56-25.933600.json @@ -0,0 +1,1367 @@ +{ + "config_general": { + "model_name": "Enno-Ai/ennodata-13b-8bit-raw-15epoch", + "model_sha": "ee2ceaae9cb806bc30df84ba4d598fdf32e53b17", + "model_size": "24.32 GB", + "model_dtype": "torch.float16", + "lighteval_sha": "0f318ecf002208468154899217b3ba7c6ae09374", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|arc:challenge|25": { + "acc": 0.5836177474402731, + "acc_stderr": 0.01440561827943617, + "acc_norm": 0.6160409556313993, + "acc_norm_stderr": 0.01421244498065189 + }, + "harness|hellaswag|10": { + "acc": 0.6241784505078669, + "acc_stderr": 0.00483344455633862, + "acc_norm": 0.8220474009161521, + "acc_norm_stderr": 0.00381691171167917 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.3, + "acc_stderr": 0.046056618647183814, + "acc_norm": 0.3, + "acc_norm_stderr": 0.046056618647183814 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.5185185185185185, + "acc_stderr": 0.043163785995113245, + "acc_norm": 0.5185185185185185, + "acc_norm_stderr": 0.043163785995113245 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.5526315789473685, + "acc_stderr": 0.040463368839782514, + "acc_norm": 0.5526315789473685, + "acc_norm_stderr": 0.040463368839782514 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.58, + "acc_stderr": 0.049604496374885836, + "acc_norm": 0.58, + "acc_norm_stderr": 0.049604496374885836 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.6037735849056604, + "acc_stderr": 0.030102793781791194, + "acc_norm": 0.6037735849056604, + "acc_norm_stderr": 0.030102793781791194 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.6527777777777778, + "acc_stderr": 0.039812405437178615, + "acc_norm": 0.6527777777777778, + "acc_norm_stderr": 0.039812405437178615 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.38, + "acc_stderr": 0.04878317312145633, + "acc_norm": 0.38, + "acc_norm_stderr": 
0.04878317312145633 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.45, + "acc_stderr": 0.049999999999999996, + "acc_norm": 0.45, + "acc_norm_stderr": 0.049999999999999996 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.37, + "acc_stderr": 0.04852365870939098, + "acc_norm": 0.37, + "acc_norm_stderr": 0.04852365870939098 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.5433526011560693, + "acc_stderr": 0.03798106566014498, + "acc_norm": 0.5433526011560693, + "acc_norm_stderr": 0.03798106566014498 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.35294117647058826, + "acc_stderr": 0.04755129616062946, + "acc_norm": 0.35294117647058826, + "acc_norm_stderr": 0.04755129616062946 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.66, + "acc_stderr": 0.04760952285695237, + "acc_norm": 0.66, + "acc_norm_stderr": 0.04760952285695237 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.4978723404255319, + "acc_stderr": 0.032685726586674915, + "acc_norm": 0.4978723404255319, + "acc_norm_stderr": 0.032685726586674915 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.34210526315789475, + "acc_stderr": 0.04462917535336937, + "acc_norm": 0.34210526315789475, + "acc_norm_stderr": 0.04462917535336937 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.45517241379310347, + "acc_stderr": 0.04149886942192117, + "acc_norm": 0.45517241379310347, + "acc_norm_stderr": 0.04149886942192117 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.30158730158730157, + "acc_stderr": 0.023636975996101806, + "acc_norm": 0.30158730158730157, + "acc_norm_stderr": 0.023636975996101806 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.4444444444444444, + "acc_stderr": 0.04444444444444449, + "acc_norm": 0.4444444444444444, + "acc_norm_stderr": 0.04444444444444449 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.41, + "acc_stderr": 0.049431107042371025, + "acc_norm": 0.41, + "acc_norm_stderr": 0.049431107042371025 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.6258064516129033, + "acc_stderr": 0.027528904299845704, + "acc_norm": 0.6258064516129033, + "acc_norm_stderr": 0.027528904299845704 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.43349753694581283, + "acc_stderr": 0.03486731727419873, + "acc_norm": 0.43349753694581283, + "acc_norm_stderr": 0.03486731727419873 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.59, + "acc_stderr": 0.04943110704237102, + "acc_norm": 0.59, + "acc_norm_stderr": 0.04943110704237102 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.7090909090909091, + "acc_stderr": 0.03546563019624335, + "acc_norm": 0.7090909090909091, + "acc_norm_stderr": 0.03546563019624335 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.7474747474747475, + "acc_stderr": 0.030954055470365897, + "acc_norm": 0.7474747474747475, + "acc_norm_stderr": 0.030954055470365897 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.8082901554404145, + "acc_stderr": 0.028408953626245282, + "acc_norm": 0.8082901554404145, + "acc_norm_stderr": 0.028408953626245282 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.5769230769230769, + "acc_stderr": 0.02504919787604234, + "acc_norm": 0.5769230769230769, + "acc_norm_stderr": 0.02504919787604234 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 
0.32222222222222224, + "acc_stderr": 0.028493465091028597, + "acc_norm": 0.32222222222222224, + "acc_norm_stderr": 0.028493465091028597 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.6008403361344538, + "acc_stderr": 0.031811100324139245, + "acc_norm": 0.6008403361344538, + "acc_norm_stderr": 0.031811100324139245 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.3509933774834437, + "acc_stderr": 0.03896981964257375, + "acc_norm": 0.3509933774834437, + "acc_norm_stderr": 0.03896981964257375 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.8018348623853211, + "acc_stderr": 0.017090573804217905, + "acc_norm": 0.8018348623853211, + "acc_norm_stderr": 0.017090573804217905 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.49537037037037035, + "acc_stderr": 0.03409825519163572, + "acc_norm": 0.49537037037037035, + "acc_norm_stderr": 0.03409825519163572 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.7941176470588235, + "acc_stderr": 0.028379449451588674, + "acc_norm": 0.7941176470588235, + "acc_norm_stderr": 0.028379449451588674 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.7763713080168776, + "acc_stderr": 0.027123298205229966, + "acc_norm": 0.7763713080168776, + "acc_norm_stderr": 0.027123298205229966 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.6636771300448431, + "acc_stderr": 0.031708824268455005, + "acc_norm": 0.6636771300448431, + "acc_norm_stderr": 0.031708824268455005 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.6259541984732825, + "acc_stderr": 0.042438692422305246, + "acc_norm": 0.6259541984732825, + "acc_norm_stderr": 0.042438692422305246 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.6776859504132231, + "acc_stderr": 0.042664163633521685, + "acc_norm": 0.6776859504132231, + "acc_norm_stderr": 0.042664163633521685 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.7037037037037037, + "acc_stderr": 0.044143436668549335, + "acc_norm": 0.7037037037037037, + "acc_norm_stderr": 0.044143436668549335 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.7055214723926381, + "acc_stderr": 0.03581165790474082, + "acc_norm": 0.7055214723926381, + "acc_norm_stderr": 0.03581165790474082 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.4017857142857143, + "acc_stderr": 0.04653333146973646, + "acc_norm": 0.4017857142857143, + "acc_norm_stderr": 0.04653333146973646 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.7281553398058253, + "acc_stderr": 0.044052680241409216, + "acc_norm": 0.7281553398058253, + "acc_norm_stderr": 0.044052680241409216 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.8247863247863247, + "acc_stderr": 0.02490443909891823, + "acc_norm": 0.8247863247863247, + "acc_norm_stderr": 0.02490443909891823 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.66, + "acc_stderr": 0.04760952285695237, + "acc_norm": 0.66, + "acc_norm_stderr": 0.04760952285695237 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.7726692209450831, + "acc_stderr": 0.014987270640946022, + "acc_norm": 0.7726692209450831, + "acc_norm_stderr": 0.014987270640946022 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.6358381502890174, + "acc_stderr": 0.025906632631016124, + "acc_norm": 0.6358381502890174, + "acc_norm_stderr": 0.025906632631016124 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.48379888268156424, + "acc_stderr": 
0.016713720729501013, + "acc_norm": 0.48379888268156424, + "acc_norm_stderr": 0.016713720729501013 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.6274509803921569, + "acc_stderr": 0.027684181883302895, + "acc_norm": 0.6274509803921569, + "acc_norm_stderr": 0.027684181883302895 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.662379421221865, + "acc_stderr": 0.026858825879488547, + "acc_norm": 0.662379421221865, + "acc_norm_stderr": 0.026858825879488547 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.6635802469135802, + "acc_stderr": 0.026289734945952926, + "acc_norm": 0.6635802469135802, + "acc_norm_stderr": 0.026289734945952926 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.44680851063829785, + "acc_stderr": 0.029658235097666904, + "acc_norm": 0.44680851063829785, + "acc_norm_stderr": 0.029658235097666904 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.45827900912646674, + "acc_stderr": 0.01272570165695364, + "acc_norm": 0.45827900912646674, + "acc_norm_stderr": 0.01272570165695364 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.5625, + "acc_stderr": 0.030134614954403924, + "acc_norm": 0.5625, + "acc_norm_stderr": 0.030134614954403924 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.5947712418300654, + "acc_stderr": 0.019861155193829156, + "acc_norm": 0.5947712418300654, + "acc_norm_stderr": 0.019861155193829156 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.6363636363636364, + "acc_stderr": 0.04607582090719976, + "acc_norm": 0.6363636363636364, + "acc_norm_stderr": 0.04607582090719976 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.6, + "acc_stderr": 0.03136250240935893, + "acc_norm": 0.6, + "acc_norm_stderr": 0.03136250240935893 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.5920398009950248, + "acc_stderr": 0.03475116365194092, + "acc_norm": 0.5920398009950248, + "acc_norm_stderr": 0.03475116365194092 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.79, + "acc_stderr": 0.040936018074033256, + "acc_norm": 0.79, + "acc_norm_stderr": 0.040936018074033256 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.4397590361445783, + "acc_stderr": 0.03864139923699121, + "acc_norm": 0.4397590361445783, + "acc_norm_stderr": 0.03864139923699121 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.7953216374269005, + "acc_stderr": 0.03094445977853321, + "acc_norm": 0.7953216374269005, + "acc_norm_stderr": 0.03094445977853321 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.37576499388004897, + "mc1_stderr": 0.016954584060214294, + "mc2": 0.5358311103869008, + "mc2_stderr": 0.01569764342795165 + }, + "all": { + "acc": 0.5764175560875521, + "acc_stderr": 0.03451333662706103, + "acc_norm": 0.5803208130129659, + "acc_norm_stderr": 0.03449283313310436, + "mc1": 0.37576499388004897, + "mc1_stderr": 0.016954584060214294, + "mc2": 0.5358311103869008, + "mc2_stderr": 0.01569764342795165 + } + }, + "versions": { + "harness|arc:challenge|25": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + 
"harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "all": 0 + }, + "config_tasks": { + "harness|arc:challenge": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + 
"harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task" + }, + "summary_tasks": { + "harness|arc:challenge|25": { + "hashes": { + "hash_examples": "17b0cae357c0259e", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "3722289b79076c44", + "hash_cont_tokens": "e8abf848493b50f7" + }, + "truncated": 0, + "non-truncated": 4687, + "padded": 4687, + "non-padded": 0, + "effective_few_shots": 25.0, + "num_truncated_few_shots": 0 + }, + "harness|hellaswag|10": { + "hashes": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "ececd684171f1ef2", + "hash_cont_tokens": "9fe0a5c42e1532db" + }, + "truncated": 0, + "non-truncated": 40168, + "padded": 40113, + "non-padded": 55, + "effective_few_shots": 10.0, + "num_truncated_few_shots": 0 + }, + 
"harness|hendrycksTest-abstract_algebra|5": { + "hashes": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "c54ff61ad0273dd7", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-anatomy|5": { + "hashes": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "be31a1e22aef5f90", + "hash_cont_tokens": "f11971a765cb609f" + }, + "truncated": 0, + "non-truncated": 540, + "padded": 540, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-astronomy|5": { + "hashes": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "277a7b1fad566940", + "hash_cont_tokens": "440a970fadecdc7b" + }, + "truncated": 0, + "non-truncated": 608, + "padded": 608, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-business_ethics|5": { + "hashes": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "ba552605bc116de5", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hashes": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "428c7563d0b98ab9", + "hash_cont_tokens": "7ecd60c25b9bfe5b" + }, + "truncated": 0, + "non-truncated": 1060, + "padded": 1060, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_biology|5": { + "hashes": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "da036601573942e2", + "hash_cont_tokens": "875cde3af7a0ee14" + }, + "truncated": 0, + "non-truncated": 576, + "padded": 576, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_chemistry|5": { + "hashes": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "94e0196d6aded13d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_computer_science|5": { + "hashes": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "6e4d0f4a8d36690b", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_mathematics|5": { + "hashes": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "614054d17109a25d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_medicine|5": { + "hashes": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + 
"hash_input_tokens": "081bb2b524defd1c", + "hash_cont_tokens": "702fb6d82ff0d6ac" + }, + "truncated": 0, + "non-truncated": 692, + "padded": 692, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_physics|5": { + "hashes": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "5421d9a1af86cbd4", + "hash_cont_tokens": "f7b8097afc16a47c" + }, + "truncated": 0, + "non-truncated": 408, + "padded": 408, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-computer_security|5": { + "hashes": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "5e6b70ecb333cf18", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hashes": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "c2ef11a87264ceed", + "hash_cont_tokens": "aa0e8bc655f2f641" + }, + "truncated": 0, + "non-truncated": 940, + "padded": 940, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-econometrics|5": { + "hashes": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "ecaccd912a4c3978", + "hash_cont_tokens": "b1cc6e7e9fcd3827" + }, + "truncated": 0, + "non-truncated": 456, + "padded": 456, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hashes": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "1590c84291399be8", + "hash_cont_tokens": "2425a3f084a591ef" + }, + "truncated": 0, + "non-truncated": 580, + "padded": 580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hashes": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "3269597f715b0da1", + "hash_cont_tokens": "bd87bf0c060fd925" + }, + "truncated": 0, + "non-truncated": 1512, + "padded": 1512, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-formal_logic|5": { + "hashes": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "a2800d20f3ab8d7c", + "hash_cont_tokens": "eb8932890e0605db" + }, + "truncated": 0, + "non-truncated": 504, + "padded": 504, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-global_facts|5": { + "hashes": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "94ed44b3772505ad", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_biology|5": { + "hashes": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "24423acb928db768", + "hash_cont_tokens": "1ddcb86d28cde266" + }, + "truncated": 0, + "non-truncated": 1240, + "padded": 1240, + 
"non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hashes": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "831ff35c474e5cef", + "hash_cont_tokens": "176c8dcff38c5f8f" + }, + "truncated": 0, + "non-truncated": 812, + "padded": 812, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hashes": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "a20a96b44dcc5b30", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hashes": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "5002f4ac8b1562ca", + "hash_cont_tokens": "674fc454bdc5ac93" + }, + "truncated": 0, + "non-truncated": 660, + "padded": 656, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_geography|5": { + "hashes": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "7c5547c7da5bc793", + "hash_cont_tokens": "03a5012b916274ea" + }, + "truncated": 0, + "non-truncated": 792, + "padded": 792, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hashes": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "f62991cb6a496b05", + "hash_cont_tokens": "873d2aab226ba1d8" + }, + "truncated": 0, + "non-truncated": 772, + "padded": 772, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hashes": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "4cef2aff6e3d59ed", + "hash_cont_tokens": "c583432ad27fcfe0" + }, + "truncated": 0, + "non-truncated": 1560, + "padded": 1560, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hashes": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "6e2577ea4082ed2b", + "hash_cont_tokens": "d7907b61bcb8c123" + }, + "truncated": 0, + "non-truncated": 1080, + "padded": 1080, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hashes": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "c5fc9aeb1079c8e4", + "hash_cont_tokens": "f47f041de50333b9" + }, + "truncated": 0, + "non-truncated": 952, + "padded": 952, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_physics|5": { + "hashes": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "555fc385cffa84ca", + "hash_cont_tokens": "0d56317b3e5eedb5" + }, + "truncated": 0, + "non-truncated": 604, + "padded": 604, + "non-padded": 0, + "effective_few_shots": 5.0, + 
"num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hashes": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "febd23cbf9973b7f", + "hash_cont_tokens": "09ba1243e7390c0f" + }, + "truncated": 0, + "non-truncated": 2180, + "padded": 2180, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hashes": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "400e55b56ee6fbd7", + "hash_cont_tokens": "9cc29889c3d3f77d" + }, + "truncated": 0, + "non-truncated": 864, + "padded": 864, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hashes": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "c639cce12a46ebad", + "hash_cont_tokens": "cdd0b3dc06d933e5" + }, + "truncated": 0, + "non-truncated": 816, + "padded": 816, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hashes": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "b9762065cce6f3a6", + "hash_cont_tokens": "e02816433ff28daf" + }, + "truncated": 0, + "non-truncated": 948, + "padded": 948, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_aging|5": { + "hashes": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "541a75f071dcf579", + "hash_cont_tokens": "142a4a8a1138a214" + }, + "truncated": 0, + "non-truncated": 892, + "padded": 892, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_sexuality|5": { + "hashes": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "04269e5c5a257dd9", + "hash_cont_tokens": "bc54813e809b796d" + }, + "truncated": 0, + "non-truncated": 524, + "padded": 524, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-international_law|5": { + "hashes": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "d93ba9d9d38e4397", + "hash_cont_tokens": "8ea8c5ff76a15bca" + }, + "truncated": 0, + "non-truncated": 484, + "padded": 484, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-jurisprudence|5": { + "hashes": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "9eeaccd2698b4f5a", + "hash_cont_tokens": "e3a8cd951b6e3469" + }, + "truncated": 0, + "non-truncated": 432, + "padded": 432, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hashes": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "b4f08f544f2b7576", + "hash_cont_tokens": "3e9e0bdc248fd88a" + }, + "truncated": 0, + "non-truncated": 652, + "padded": 648, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-machine_learning|5": { + "hashes": { + "hash_examples": "88f22a636029ae47", + 
"hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "900c2a51f1174b9f", + "hash_cont_tokens": "55b12fb138c6a064" + }, + "truncated": 0, + "non-truncated": 448, + "padded": 448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-management|5": { + "hashes": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "6b36efb4689c6eca", + "hash_cont_tokens": "a01d6d39a83c4597" + }, + "truncated": 0, + "non-truncated": 412, + "padded": 412, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-marketing|5": { + "hashes": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "2aaac78a0cfed47a", + "hash_cont_tokens": "6aeaed4d823c98aa" + }, + "truncated": 0, + "non-truncated": 936, + "padded": 936, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-medical_genetics|5": { + "hashes": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "886ca823b41c094a", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-miscellaneous|5": { + "hashes": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "72fd71de7675e7d0", + "hash_cont_tokens": "9b0ab02a64603081" + }, + "truncated": 0, + "non-truncated": 3132, + "padded": 3132, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_disputes|5": { + "hashes": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "f3ca0dd8e7a1eb09", + "hash_cont_tokens": "3b8bbe9108e55ce9" + }, + "truncated": 0, + "non-truncated": 1384, + "padded": 1354, + "non-padded": 30, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hashes": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "3e793631e951f23c", + "hash_cont_tokens": "3e9bfc0362e97330" + }, + "truncated": 0, + "non-truncated": 3580, + "padded": 3580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-nutrition|5": { + "hashes": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "59753c2144ea93af", + "hash_cont_tokens": "23b2dc6ee2da4cfc" + }, + "truncated": 0, + "non-truncated": 1224, + "padded": 1224, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-philosophy|5": { + "hashes": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "bd8d3dbed15a8c34", + "hash_cont_tokens": "9f6ff69d23a48783" + }, + "truncated": 0, + "non-truncated": 1244, + "padded": 1244, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-prehistory|5": { + "hashes": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "3573cd87facbb7c5", + "hash_cont_tokens": "d6458d743d875837" + }, + "truncated": 0, + "non-truncated": 1296, + "padded": 
1296, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_accounting|5": { + "hashes": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "17e721bc1a7cbb47", + "hash_cont_tokens": "922a195f53a35662" + }, + "truncated": 0, + "non-truncated": 1128, + "padded": 1128, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_law|5": { + "hashes": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "c9f7583fff66d361", + "hash_cont_tokens": "2e590029ef41fbcd" + }, + "truncated": 0, + "non-truncated": 6136, + "padded": 6136, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_medicine|5": { + "hashes": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "40a933f829116f8d", + "hash_cont_tokens": "7cfee54dbddd5a98" + }, + "truncated": 0, + "non-truncated": 1088, + "padded": 1088, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_psychology|5": { + "hashes": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "0dfb73a8eb3f692c", + "hash_cont_tokens": "a86677b2a45c20e1" + }, + "truncated": 0, + "non-truncated": 2448, + "padded": 2448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-public_relations|5": { + "hashes": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "1710c6ba4c9f3cbd", + "hash_cont_tokens": "0d756ccaae031757" + }, + "truncated": 0, + "non-truncated": 440, + "padded": 440, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-security_studies|5": { + "hashes": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "32a03f1f22a6e103", + "hash_cont_tokens": "b2229bc2cfbf594b" + }, + "truncated": 0, + "non-truncated": 980, + "padded": 980, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-sociology|5": { + "hashes": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "828999f7624cbe7e", + "hash_cont_tokens": "c3a3bdfd177eed5b" + }, + "truncated": 0, + "non-truncated": 804, + "padded": 804, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hashes": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "42054621e718dbee", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-virology|5": { + "hashes": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "6c4f0aa4dc859c04", + "hash_cont_tokens": "af8b3658088cb37f" + }, + "truncated": 0, + "non-truncated": 664, + "padded": 664, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-world_religions|5": { + 
"hashes": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "6c75d44e092ff24f", + "hash_cont_tokens": "060118bef6de4e0a" + }, + "truncated": 0, + "non-truncated": 684, + "padded": 684, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|truthfulqa:mc|0": { + "hashes": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "2738d7ed7075faa7", + "hash_cont_tokens": "f5da56a132aab151" + }, + "truncated": 0, + "non-truncated": 9996, + "padded": 9996, + "non-padded": 0, + "effective_few_shots": 0.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "d84d18e9a963753d", + "hash_full_prompts": "12b540783521a8e6", + "hash_input_tokens": "5c73a7dce6ccf737", + "hash_cont_tokens": "71d56183130fecbd" + }, + "total_evaluation_time_secondes": "6373.200977802277", + "truncated": 0, + "non-truncated": 111019, + "padded": 110926, + "non-padded": 93, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/Enno-Ai/ennodata-13b-8bit-raw-15epoch/results_2023-10-27T07-36-25.683103.json b/eval-results/Enno-Ai/ennodata-13b-8bit-raw-15epoch/results_2023-10-27T07-36-25.683103.json new file mode 100644 index 0000000000000000000000000000000000000000..928cd7a8f732ace7be8ce3096ae13f6090c532b6 --- /dev/null +++ b/eval-results/Enno-Ai/ennodata-13b-8bit-raw-15epoch/results_2023-10-27T07-36-25.683103.json @@ -0,0 +1,107 @@ +{ + "config_general": { + "model_name": "Enno-Ai/ennodata-13b-8bit-raw-15epoch", + "model_sha": "ee2ceaae9cb806bc30df84ba4d598fdf32e53b17", + "model_size": "24.32 GB", + "model_dtype": "torch.float16", + "lighteval_sha": "0f318ecf002208468154899217b3ba7c6ae09374", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|drop|3": { + "em": 0.3571728187919463, + "em_stderr": 0.004907111972287459, + "f1": 0.43218645134228284, + "f1_stderr": 0.004707344511103017 + }, + "harness|gsm8k|5": { + "acc": 0.014404852160727824, + "acc_stderr": 0.0032820559171369366 + }, + "harness|winogrande|5": { + "acc": 0.7750591949486977, + "acc_stderr": 0.01173504356412674 + }, + "all": { + "em": 0.3571728187919463, + "em_stderr": 0.004907111972287459, + "f1": 0.43218645134228284, + "f1_stderr": 0.004707344511103017, + "acc": 0.39473202355471276, + "acc_stderr": 0.007508549740631839 + } + }, + "versions": { + "harness|drop|3": 1, + "harness|gsm8k|5": 0, + "harness|winogrande|5": 0, + "all": 0 + }, + "config_tasks": { + "harness|drop": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|drop|3": { + "hashes": { + "hash_examples": "1d27416e8324e9a3", + "hash_full_prompts": "a5513ff9a741b385", + "hash_input_tokens": "42076f0efbb50aa6", + "hash_cont_tokens": "0f3f21a3a3a4158b" + }, + "truncated": 3, + "non-truncated": 9533, + "padded": 0, + "non-padded": 9536, + "effective_few_shots": 3.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "bda342e47b5099b2", + "hash_cont_tokens": "1b57c3c03630f295" + }, + "truncated": 0, + "non-truncated": 1319, + "padded": 0, + "non-padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + 
"hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "c0bedf98cb040854", + "hash_cont_tokens": "f08975ad6f2d5864" + }, + "truncated": 0, + "non-truncated": 2534, + "padded": 2432, + "non-padded": 102, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "9b4d8993161e637d", + "hash_full_prompts": "08215e527b7e60a5", + "hash_input_tokens": "a12f3e3c934bd78b", + "hash_cont_tokens": "060e92483e04e676" + }, + "total_evaluation_time_secondes": "9258.722980499268", + "truncated": 3, + "non-truncated": 13386, + "padded": 2432, + "non-padded": 10957, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/Enno-Ai/ennodata-7b/results_2023-08-17T18-21-05.699051.json b/eval-results/Enno-Ai/ennodata-7b/results_2023-08-17T18-21-05.699051.json new file mode 100644 index 0000000000000000000000000000000000000000..c5ab315e5d8cecb60bd243d10f2732d817494684 --- /dev/null +++ b/eval-results/Enno-Ai/ennodata-7b/results_2023-08-17T18-21-05.699051.json @@ -0,0 +1,1365 @@ +{ + "results": { + "harness|arc:challenge|25": { + "acc": 0.47440273037542663, + "acc_stderr": 0.014592230885298964, + "acc_norm": 0.5102389078498294, + "acc_norm_stderr": 0.014608326906285012 + }, + "harness|hellaswag|10": { + "acc": 0.5731925911173074, + "acc_stderr": 0.004936029827672036, + "acc_norm": 0.7762397928699463, + "acc_norm_stderr": 0.0041591146798738285 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.27, + "acc_stderr": 0.044619604333847415, + "acc_norm": 0.27, + "acc_norm_stderr": 0.044619604333847415 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.3851851851851852, + "acc_stderr": 0.042039210401562783, + "acc_norm": 0.3851851851851852, + "acc_norm_stderr": 0.042039210401562783 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.3092105263157895, + "acc_stderr": 0.03761070869867479, + "acc_norm": 0.3092105263157895, + "acc_norm_stderr": 0.03761070869867479 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.38, + "acc_stderr": 0.04878317312145633, + "acc_norm": 0.38, + "acc_norm_stderr": 0.04878317312145633 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.32452830188679244, + "acc_stderr": 0.028815615713432115, + "acc_norm": 0.32452830188679244, + "acc_norm_stderr": 0.028815615713432115 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.3680555555555556, + "acc_stderr": 0.040329990539607195, + "acc_norm": 0.3680555555555556, + "acc_norm_stderr": 0.040329990539607195 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.28, + "acc_stderr": 0.04512608598542127, + "acc_norm": 0.28, + "acc_norm_stderr": 0.04512608598542127 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.27, + "acc_stderr": 0.044619604333847394, + "acc_norm": 0.27, + "acc_norm_stderr": 0.044619604333847394 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.27, + "acc_stderr": 0.04461960433384741, + "acc_norm": 0.27, + "acc_norm_stderr": 0.04461960433384741 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.3063583815028902, + "acc_stderr": 0.035149425512674366, + "acc_norm": 0.3063583815028902, + "acc_norm_stderr": 0.035149425512674366 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.20588235294117646, + "acc_stderr": 0.040233822736177476, + "acc_norm": 0.20588235294117646, + "acc_norm_stderr": 0.040233822736177476 + }, + "harness|hendrycksTest-computer_security|5": { + 
"acc": 0.42, + "acc_stderr": 0.04960449637488584, + "acc_norm": 0.42, + "acc_norm_stderr": 0.04960449637488584 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.33617021276595743, + "acc_stderr": 0.030881618520676942, + "acc_norm": 0.33617021276595743, + "acc_norm_stderr": 0.030881618520676942 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.23684210526315788, + "acc_stderr": 0.03999423879281336, + "acc_norm": 0.23684210526315788, + "acc_norm_stderr": 0.03999423879281336 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.27586206896551724, + "acc_stderr": 0.037245636197746304, + "acc_norm": 0.27586206896551724, + "acc_norm_stderr": 0.037245636197746304 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.25396825396825395, + "acc_stderr": 0.02241804289111394, + "acc_norm": 0.25396825396825395, + "acc_norm_stderr": 0.02241804289111394 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.23809523809523808, + "acc_stderr": 0.038095238095238106, + "acc_norm": 0.23809523809523808, + "acc_norm_stderr": 0.038095238095238106 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.31, + "acc_stderr": 0.04648231987117316, + "acc_norm": 0.31, + "acc_norm_stderr": 0.04648231987117316 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.3258064516129032, + "acc_stderr": 0.026662010578567104, + "acc_norm": 0.3258064516129032, + "acc_norm_stderr": 0.026662010578567104 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.2561576354679803, + "acc_stderr": 0.0307127300709826, + "acc_norm": 0.2561576354679803, + "acc_norm_stderr": 0.0307127300709826 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.3, + "acc_stderr": 0.046056618647183814, + "acc_norm": 0.3, + "acc_norm_stderr": 0.046056618647183814 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.4121212121212121, + "acc_stderr": 0.03843566993588717, + "acc_norm": 0.4121212121212121, + "acc_norm_stderr": 0.03843566993588717 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.32323232323232326, + "acc_stderr": 0.03332299921070643, + "acc_norm": 0.32323232323232326, + "acc_norm_stderr": 0.03332299921070643 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.39378238341968913, + "acc_stderr": 0.03526077095548237, + "acc_norm": 0.39378238341968913, + "acc_norm_stderr": 0.03526077095548237 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.3333333333333333, + "acc_stderr": 0.023901157979402538, + "acc_norm": 0.3333333333333333, + "acc_norm_stderr": 0.023901157979402538 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.2518518518518518, + "acc_stderr": 0.026466117538959916, + "acc_norm": 0.2518518518518518, + "acc_norm_stderr": 0.026466117538959916 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.3235294117647059, + "acc_stderr": 0.030388353551886845, + "acc_norm": 0.3235294117647059, + "acc_norm_stderr": 0.030388353551886845 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.271523178807947, + "acc_stderr": 0.03631329803969653, + "acc_norm": 0.271523178807947, + "acc_norm_stderr": 0.03631329803969653 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.42935779816513764, + "acc_stderr": 0.021222286397236504, + "acc_norm": 0.42935779816513764, + "acc_norm_stderr": 0.021222286397236504 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 
0.27314814814814814, + "acc_stderr": 0.030388051301678116, + "acc_norm": 0.27314814814814814, + "acc_norm_stderr": 0.030388051301678116 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.3284313725490196, + "acc_stderr": 0.03296245110172228, + "acc_norm": 0.3284313725490196, + "acc_norm_stderr": 0.03296245110172228 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.4050632911392405, + "acc_stderr": 0.031955147413706725, + "acc_norm": 0.4050632911392405, + "acc_norm_stderr": 0.031955147413706725 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.39461883408071746, + "acc_stderr": 0.03280400504755291, + "acc_norm": 0.39461883408071746, + "acc_norm_stderr": 0.03280400504755291 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.33587786259541985, + "acc_stderr": 0.04142313771996664, + "acc_norm": 0.33587786259541985, + "acc_norm_stderr": 0.04142313771996664 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.512396694214876, + "acc_stderr": 0.04562951548180765, + "acc_norm": 0.512396694214876, + "acc_norm_stderr": 0.04562951548180765 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.3888888888888889, + "acc_stderr": 0.04712821257426771, + "acc_norm": 0.3888888888888889, + "acc_norm_stderr": 0.04712821257426771 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.39263803680981596, + "acc_stderr": 0.03836740907831029, + "acc_norm": 0.39263803680981596, + "acc_norm_stderr": 0.03836740907831029 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.24107142857142858, + "acc_stderr": 0.040598672469526864, + "acc_norm": 0.24107142857142858, + "acc_norm_stderr": 0.040598672469526864 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.3106796116504854, + "acc_stderr": 0.045821241601615506, + "acc_norm": 0.3106796116504854, + "acc_norm_stderr": 0.045821241601615506 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.452991452991453, + "acc_stderr": 0.0326109987309862, + "acc_norm": 0.452991452991453, + "acc_norm_stderr": 0.0326109987309862 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.37, + "acc_stderr": 0.048523658709391, + "acc_norm": 0.37, + "acc_norm_stderr": 0.048523658709391 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.40229885057471265, + "acc_stderr": 0.017535294529068955, + "acc_norm": 0.40229885057471265, + "acc_norm_stderr": 0.017535294529068955 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.37283236994219654, + "acc_stderr": 0.026033890613576277, + "acc_norm": 0.37283236994219654, + "acc_norm_stderr": 0.026033890613576277 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.2424581005586592, + "acc_stderr": 0.014333522059217889, + "acc_norm": 0.2424581005586592, + "acc_norm_stderr": 0.014333522059217889 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.3954248366013072, + "acc_stderr": 0.02799672318063145, + "acc_norm": 0.3954248366013072, + "acc_norm_stderr": 0.02799672318063145 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.40836012861736337, + "acc_stderr": 0.02791705074848462, + "acc_norm": 0.40836012861736337, + "acc_norm_stderr": 0.02791705074848462 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.33641975308641975, + "acc_stderr": 0.026289734945952926, + "acc_norm": 0.33641975308641975, + "acc_norm_stderr": 0.026289734945952926 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.2695035460992908, + "acc_stderr": 0.026469036818590624, + "acc_norm": 
0.2695035460992908, + "acc_norm_stderr": 0.026469036818590624 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.29465449804432853, + "acc_stderr": 0.011643576764069541, + "acc_norm": 0.29465449804432853, + "acc_norm_stderr": 0.011643576764069541 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.4375, + "acc_stderr": 0.030134614954403924, + "acc_norm": 0.4375, + "acc_norm_stderr": 0.030134614954403924 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.33986928104575165, + "acc_stderr": 0.01916241858862356, + "acc_norm": 0.33986928104575165, + "acc_norm_stderr": 0.01916241858862356 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.43636363636363634, + "acc_stderr": 0.04750185058907297, + "acc_norm": 0.43636363636363634, + "acc_norm_stderr": 0.04750185058907297 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.27755102040816326, + "acc_stderr": 0.028666857790274645, + "acc_norm": 0.27755102040816326, + "acc_norm_stderr": 0.028666857790274645 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.4228855721393035, + "acc_stderr": 0.034932317774212816, + "acc_norm": 0.4228855721393035, + "acc_norm_stderr": 0.034932317774212816 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.44, + "acc_stderr": 0.04988876515698589, + "acc_norm": 0.44, + "acc_norm_stderr": 0.04988876515698589 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.3614457831325301, + "acc_stderr": 0.037400593820293204, + "acc_norm": 0.3614457831325301, + "acc_norm_stderr": 0.037400593820293204 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.4444444444444444, + "acc_stderr": 0.0381107966983353, + "acc_norm": 0.4444444444444444, + "acc_norm_stderr": 0.0381107966983353 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.21542227662178703, + "mc1_stderr": 0.014391902652427686, + "mc2": 0.3353289270087254, + "mc2_stderr": 0.013074362091466094 + }, + "all": { + "acc": 0.3456994326510667, + "acc_stderr": 0.03418929248026247, + "acc_norm": 0.34974830348542335, + "acc_norm_stderr": 0.03417639724082498, + "mc1": 0.21542227662178703, + "mc1_stderr": 0.014391902652427686, + "mc2": 0.3353289270087254, + "mc2_stderr": 0.013074362091466094 + } + }, + "versions": { + "harness|arc:challenge|25": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + 
"harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "all": 0 + }, + "config_general": { + "model_name": "Enno-Ai/ennodata-7b", + "model_sha": "7872a492ebbb3c6a899f9acbd34dfd5f7e674fdd", + "model_dtype": "8bit", + "lighteval_sha": "8bab069fee0c6e75ffa4c1ef8a9591c28ee0e049", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null + }, + "config_tasks": { + "harness|arc:challenge": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + 
"harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task" + }, + "summary_tasks": { + "harness|arc:challenge|25": { + "hashes": { + "hash_examples": "17b0cae357c0259e", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "61571bf68d6d89aa", + "hash_cont_tokens": "8210decc6ff6f7df" + }, + "truncated": 0, + "non-truncated": 4687, + "padded": 4687, + "non-padded": 0, + "effective_few_shots": 25.0, + "num_truncated_few_shots": 0 + }, + "harness|hellaswag|10": { + "hashes": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "29906669b1c7054a", + "hash_cont_tokens": "b3b9e9017afa63af" + }, + "truncated": 0, + "non-truncated": 40168, + "padded": 40113, + "non-padded": 55, + "effective_few_shots": 10.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hashes": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "c54ff61ad0273dd7", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-anatomy|5": { + "hashes": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + 
"hash_input_tokens": "be31a1e22aef5f90", + "hash_cont_tokens": "f11971a765cb609f" + }, + "truncated": 0, + "non-truncated": 540, + "padded": 540, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-astronomy|5": { + "hashes": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "277a7b1fad566940", + "hash_cont_tokens": "bf30e5d3f48250cb" + }, + "truncated": 0, + "non-truncated": 608, + "padded": 608, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-business_ethics|5": { + "hashes": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "ba552605bc116de5", + "hash_cont_tokens": "bc1dd9b2d995eb61" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hashes": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "428c7563d0b98ab9", + "hash_cont_tokens": "890a119624b3b935" + }, + "truncated": 0, + "non-truncated": 1060, + "padded": 1060, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_biology|5": { + "hashes": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "da036601573942e2", + "hash_cont_tokens": "875cde3af7a0ee14" + }, + "truncated": 0, + "non-truncated": 576, + "padded": 576, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_chemistry|5": { + "hashes": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "94e0196d6aded13d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_computer_science|5": { + "hashes": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "6e4d0f4a8d36690b", + "hash_cont_tokens": "ffc0fe414cdc4a83" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_mathematics|5": { + "hashes": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "614054d17109a25d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_medicine|5": { + "hashes": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "1d633b3cc0524ba8", + "hash_cont_tokens": "1f88b00d41957d82" + }, + "truncated": 0, + "non-truncated": 692, + "padded": 692, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_physics|5": { + "hashes": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "5421d9a1af86cbd4", + "hash_cont_tokens": "f7b8097afc16a47c" + }, + "truncated": 0, + "non-truncated": 408, + "padded": 408, + 
"non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-computer_security|5": { + "hashes": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "5e6b70ecb333cf18", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hashes": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "c2ef11a87264ceed", + "hash_cont_tokens": "aa0e8bc655f2f641" + }, + "truncated": 0, + "non-truncated": 940, + "padded": 940, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-econometrics|5": { + "hashes": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "ecaccd912a4c3978", + "hash_cont_tokens": "bfb7e3c3c88313f1" + }, + "truncated": 0, + "non-truncated": 456, + "padded": 456, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hashes": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "1590c84291399be8", + "hash_cont_tokens": "2425a3f084a591ef" + }, + "truncated": 0, + "non-truncated": 580, + "padded": 580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hashes": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "3269597f715b0da1", + "hash_cont_tokens": "f52691aef15a407b" + }, + "truncated": 0, + "non-truncated": 1512, + "padded": 1512, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-formal_logic|5": { + "hashes": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "a2800d20f3ab8d7c", + "hash_cont_tokens": "f515d598d9c21263" + }, + "truncated": 0, + "non-truncated": 504, + "padded": 504, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-global_facts|5": { + "hashes": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "94ed44b3772505ad", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_biology|5": { + "hashes": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "24423acb928db768", + "hash_cont_tokens": "bd85a4156a3613ee" + }, + "truncated": 0, + "non-truncated": 1240, + "padded": 1240, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hashes": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "831ff35c474e5cef", + "hash_cont_tokens": "a95c97af1c14e068" + }, + "truncated": 0, + "non-truncated": 812, + "padded": 812, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + 
"hashes": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "8c34e0f2bda77358", + "hash_cont_tokens": "8abfedef914e33c9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hashes": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "f1f73dd687da18d7", + "hash_cont_tokens": "674fc454bdc5ac93" + }, + "truncated": 660, + "non-truncated": 0, + "padded": 0, + "non-padded": 660, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_geography|5": { + "hashes": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "7c5547c7da5bc793", + "hash_cont_tokens": "03a5012b916274ea" + }, + "truncated": 0, + "non-truncated": 792, + "padded": 792, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hashes": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "f62991cb6a496b05", + "hash_cont_tokens": "a83effb8f76b7d7c" + }, + "truncated": 0, + "non-truncated": 772, + "padded": 772, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hashes": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "4cef2aff6e3d59ed", + "hash_cont_tokens": "c583432ad27fcfe0" + }, + "truncated": 0, + "non-truncated": 1560, + "padded": 1560, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hashes": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "6e2577ea4082ed2b", + "hash_cont_tokens": "24f5dc613660300b" + }, + "truncated": 0, + "non-truncated": 1080, + "padded": 1080, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hashes": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "c5fc9aeb1079c8e4", + "hash_cont_tokens": "f47f041de50333b9" + }, + "truncated": 0, + "non-truncated": 952, + "padded": 952, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_physics|5": { + "hashes": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "555fc385cffa84ca", + "hash_cont_tokens": "ba2efcd283e938cc" + }, + "truncated": 0, + "non-truncated": 604, + "padded": 604, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hashes": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "febd23cbf9973b7f", + "hash_cont_tokens": "942069cd363844d9" + }, + "truncated": 0, + "non-truncated": 2180, + "padded": 2180, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hashes": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": 
"4c5c8be5aafac432", + "hash_input_tokens": "424b02981230ee83", + "hash_cont_tokens": "955ed42b6f7fa019" + }, + "truncated": 0, + "non-truncated": 864, + "padded": 864, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hashes": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "50c9ff438c85a69e", + "hash_cont_tokens": "cdd0b3dc06d933e5" + }, + "truncated": 816, + "non-truncated": 0, + "padded": 0, + "non-padded": 816, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hashes": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "054824cc474caef5", + "hash_cont_tokens": "9a864184946033ac" + }, + "truncated": 8, + "non-truncated": 940, + "padded": 940, + "non-padded": 8, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_aging|5": { + "hashes": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "541a75f071dcf579", + "hash_cont_tokens": "142a4a8a1138a214" + }, + "truncated": 0, + "non-truncated": 892, + "padded": 892, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_sexuality|5": { + "hashes": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "04269e5c5a257dd9", + "hash_cont_tokens": "bc54813e809b796d" + }, + "truncated": 0, + "non-truncated": 524, + "padded": 524, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-international_law|5": { + "hashes": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "d93ba9d9d38e4397", + "hash_cont_tokens": "dc45b45fcda18e5d" + }, + "truncated": 0, + "non-truncated": 484, + "padded": 484, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-jurisprudence|5": { + "hashes": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "9eeaccd2698b4f5a", + "hash_cont_tokens": "e3a8cd951b6e3469" + }, + "truncated": 0, + "non-truncated": 432, + "padded": 432, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hashes": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "b4f08f544f2b7576", + "hash_cont_tokens": "1e80dbd30f6453d5" + }, + "truncated": 0, + "non-truncated": 652, + "padded": 648, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-machine_learning|5": { + "hashes": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "900c2a51f1174b9f", + "hash_cont_tokens": "9b37da7777378ca9" + }, + "truncated": 0, + "non-truncated": 448, + "padded": 448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-management|5": { + "hashes": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "6b36efb4689c6eca", + "hash_cont_tokens": "a01d6d39a83c4597" + }, + "truncated": 0, + "non-truncated": 412, + 
"padded": 412, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-marketing|5": { + "hashes": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "2aaac78a0cfed47a", + "hash_cont_tokens": "6aeaed4d823c98aa" + }, + "truncated": 0, + "non-truncated": 936, + "padded": 936, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-medical_genetics|5": { + "hashes": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "886ca823b41c094a", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-miscellaneous|5": { + "hashes": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "72fd71de7675e7d0", + "hash_cont_tokens": "9b0ab02a64603081" + }, + "truncated": 0, + "non-truncated": 3132, + "padded": 3132, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_disputes|5": { + "hashes": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "f3ca0dd8e7a1eb09", + "hash_cont_tokens": "8badf768f7b0467a" + }, + "truncated": 0, + "non-truncated": 1384, + "padded": 1354, + "non-padded": 30, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hashes": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "3e793631e951f23c", + "hash_cont_tokens": "32ae620376b2bbba" + }, + "truncated": 0, + "non-truncated": 3580, + "padded": 3580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-nutrition|5": { + "hashes": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "59753c2144ea93af", + "hash_cont_tokens": "3071def75bacc404" + }, + "truncated": 0, + "non-truncated": 1224, + "padded": 1224, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-philosophy|5": { + "hashes": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "bd8d3dbed15a8c34", + "hash_cont_tokens": "9f6ff69d23a48783" + }, + "truncated": 0, + "non-truncated": 1244, + "padded": 1244, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-prehistory|5": { + "hashes": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "3573cd87facbb7c5", + "hash_cont_tokens": "de469d2b981e32a3" + }, + "truncated": 0, + "non-truncated": 1296, + "padded": 1296, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_accounting|5": { + "hashes": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "17e721bc1a7cbb47", + "hash_cont_tokens": "c46f74d2dfc7b13b" + }, + "truncated": 0, + "non-truncated": 1128, + "padded": 1128, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_law|5": { + "hashes": { + 
"hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "9178e10bd0763ec4", + "hash_cont_tokens": "2e590029ef41fbcd" + }, + "truncated": 604, + "non-truncated": 5532, + "padded": 5524, + "non-padded": 612, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_medicine|5": { + "hashes": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "f5a22012a54f70ea", + "hash_cont_tokens": "fe35cfa9c6ca802e" + }, + "truncated": 0, + "non-truncated": 1088, + "padded": 1088, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_psychology|5": { + "hashes": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "0dfb73a8eb3f692c", + "hash_cont_tokens": "f020fbddf72c8652" + }, + "truncated": 0, + "non-truncated": 2448, + "padded": 2448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-public_relations|5": { + "hashes": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "1710c6ba4c9f3cbd", + "hash_cont_tokens": "568f585a259965c1" + }, + "truncated": 0, + "non-truncated": 440, + "padded": 440, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-security_studies|5": { + "hashes": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "d49711415961ced7", + "hash_cont_tokens": "cc6fd7cccd64cd5d" + }, + "truncated": 0, + "non-truncated": 980, + "padded": 980, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-sociology|5": { + "hashes": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "828999f7624cbe7e", + "hash_cont_tokens": "c3a3bdfd177eed5b" + }, + "truncated": 0, + "non-truncated": 804, + "padded": 804, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hashes": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "42054621e718dbee", + "hash_cont_tokens": "2568d0e8e36fa959" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-virology|5": { + "hashes": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "6c4f0aa4dc859c04", + "hash_cont_tokens": "926cf60b0891f374" + }, + "truncated": 0, + "non-truncated": 664, + "padded": 664, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-world_religions|5": { + "hashes": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "6c75d44e092ff24f", + "hash_cont_tokens": "c525a5de974c1ea3" + }, + "truncated": 0, + "non-truncated": 684, + "padded": 684, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|truthfulqa:mc|0": { + "hashes": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "2738d7ed7075faa7", + "hash_cont_tokens": "c014154380b74b9e" + }, + 
"truncated": 0, + "non-truncated": 9996, + "padded": 9996, + "non-padded": 0, + "effective_few_shots": 0.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "d84d18e9a963753d", + "hash_full_prompts": "12b540783521a8e6", + "hash_input_tokens": "6fecf578c508db6a", + "hash_cont_tokens": "fb1646e2bdd5fc38" + }, + "total_evaluation_time_secondes": "4674.899477958679", + "truncated": 2088, + "non-truncated": 108931, + "padded": 108834, + "non-padded": 2185, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/Enno-Ai/ennodata-7b/results_2023-10-27T05-41-02.798297.json b/eval-results/Enno-Ai/ennodata-7b/results_2023-10-27T05-41-02.798297.json new file mode 100644 index 0000000000000000000000000000000000000000..984c3b5e610a7a1d8dbbeca65517826ef038e3a6 --- /dev/null +++ b/eval-results/Enno-Ai/ennodata-7b/results_2023-10-27T05-41-02.798297.json @@ -0,0 +1,107 @@ +{ + "config_general": { + "model_name": "Enno-Ai/ennodata-7b", + "model_sha": "7872a492ebbb3c6a899f9acbd34dfd5f7e674fdd", + "model_size": "6.55 GB", + "model_dtype": "8bit", + "lighteval_sha": "0f318ecf002208468154899217b3ba7c6ae09374", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|drop|3": { + "em": 0.0010486577181208054, + "em_stderr": 0.0003314581465219155, + "f1": 0.055922818791946494, + "f1_stderr": 0.0012829613643597505 + }, + "harness|gsm8k|5": { + "acc": 0.037149355572403335, + "acc_stderr": 0.005209516283073758 + }, + "harness|winogrande|5": { + "acc": 0.7095501183898973, + "acc_stderr": 0.01275881344806461 + }, + "all": { + "em": 0.0010486577181208054, + "em_stderr": 0.0003314581465219155, + "f1": 0.055922818791946494, + "f1_stderr": 0.0012829613643597505, + "acc": 0.3733497369811503, + "acc_stderr": 0.008984164865569185 + } + }, + "versions": { + "harness|drop|3": 1, + "harness|gsm8k|5": 0, + "harness|winogrande|5": 0, + "all": 0 + }, + "config_tasks": { + "harness|drop": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|drop|3": { + "hashes": { + "hash_examples": "1d27416e8324e9a3", + "hash_full_prompts": "a5513ff9a741b385", + "hash_input_tokens": "61b608e0b5ceed76", + "hash_cont_tokens": "1494edee2ffcda41" + }, + "truncated": 1263, + "non-truncated": 8273, + "padded": 0, + "non-padded": 9536, + "effective_few_shots": 3.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "bda342e47b5099b2", + "hash_cont_tokens": "a6d05120f987f6f6" + }, + "truncated": 0, + "non-truncated": 1319, + "padded": 0, + "non-padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "c0bedf98cb040854", + "hash_cont_tokens": "f08975ad6f2d5864" + }, + "truncated": 0, + "non-truncated": 2534, + "padded": 2432, + "non-padded": 102, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "9b4d8993161e637d", + "hash_full_prompts": "08215e527b7e60a5", + "hash_input_tokens": "80afe720f936f8d2", + "hash_cont_tokens": "9bec69b3afece418" + }, + "total_evaluation_time_secondes": "27239.538413524628", + "truncated": 1263, + 
"non-truncated": 12126, + "padded": 2432, + "non-padded": 10957, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/Enno-Ai/ennodata-raw-pankajmathur-13b-peft/results_2023-10-04T05-26-18.448610.json b/eval-results/Enno-Ai/ennodata-raw-pankajmathur-13b-peft/results_2023-10-04T05-26-18.448610.json new file mode 100644 index 0000000000000000000000000000000000000000..934095e5ae9d2d59ca95306506a438e6a7737128 --- /dev/null +++ b/eval-results/Enno-Ai/ennodata-raw-pankajmathur-13b-peft/results_2023-10-04T05-26-18.448610.json @@ -0,0 +1,1367 @@ +{ + "config_general": { + "model_name": "Enno-Ai/ennodata-raw-pankajmathur-13b-peft", + "model_sha": "206553873db96a6730d36477837335dbbcc906fc", + "model_size": "12.51 GB", + "model_dtype": "8bit", + "lighteval_sha": "0f318ecf002208468154899217b3ba7c6ae09374", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|arc:challenge|25": { + "acc": 0.5827645051194539, + "acc_stderr": 0.014409825518403077, + "acc_norm": 0.6194539249146758, + "acc_norm_stderr": 0.014188277712349812 + }, + "harness|hellaswag|10": { + "acc": 0.6224855606452898, + "acc_stderr": 0.004837744647345718, + "acc_norm": 0.8221469826727743, + "acc_norm_stderr": 0.0038160747120605343 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.28, + "acc_stderr": 0.04512608598542129, + "acc_norm": 0.28, + "acc_norm_stderr": 0.04512608598542129 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.5037037037037037, + "acc_stderr": 0.04319223625811331, + "acc_norm": 0.5037037037037037, + "acc_norm_stderr": 0.04319223625811331 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.5526315789473685, + "acc_stderr": 0.04046336883978251, + "acc_norm": 0.5526315789473685, + "acc_norm_stderr": 0.04046336883978251 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.58, + "acc_stderr": 0.049604496374885836, + "acc_norm": 0.58, + "acc_norm_stderr": 0.049604496374885836 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.6037735849056604, + "acc_stderr": 0.030102793781791197, + "acc_norm": 0.6037735849056604, + "acc_norm_stderr": 0.030102793781791197 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.625, + "acc_stderr": 0.04048439222695598, + "acc_norm": 0.625, + "acc_norm_stderr": 0.04048439222695598 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.36, + "acc_stderr": 0.048241815132442176, + "acc_norm": 0.36, + "acc_norm_stderr": 0.048241815132442176 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.46, + "acc_stderr": 0.05009082659620333, + "acc_norm": 0.46, + "acc_norm_stderr": 0.05009082659620333 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.36, + "acc_stderr": 0.048241815132442176, + "acc_norm": 0.36, + "acc_norm_stderr": 0.048241815132442176 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.5317919075144508, + "acc_stderr": 0.03804749744364764, + "acc_norm": 0.5317919075144508, + "acc_norm_stderr": 0.03804749744364764 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.37254901960784315, + "acc_stderr": 0.04810840148082635, + "acc_norm": 0.37254901960784315, + "acc_norm_stderr": 0.04810840148082635 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.66, + "acc_stderr": 0.04760952285695237, + "acc_norm": 0.66, + "acc_norm_stderr": 0.04760952285695237 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 
0.49361702127659574, + "acc_stderr": 0.032683358999363366, + "acc_norm": 0.49361702127659574, + "acc_norm_stderr": 0.032683358999363366 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.32456140350877194, + "acc_stderr": 0.04404556157374768, + "acc_norm": 0.32456140350877194, + "acc_norm_stderr": 0.04404556157374768 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.496551724137931, + "acc_stderr": 0.041665675771015785, + "acc_norm": 0.496551724137931, + "acc_norm_stderr": 0.041665675771015785 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.2962962962962963, + "acc_stderr": 0.023517294335963286, + "acc_norm": 0.2962962962962963, + "acc_norm_stderr": 0.023517294335963286 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.4523809523809524, + "acc_stderr": 0.044518079590553275, + "acc_norm": 0.4523809523809524, + "acc_norm_stderr": 0.044518079590553275 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.37, + "acc_stderr": 0.048523658709391, + "acc_norm": 0.37, + "acc_norm_stderr": 0.048523658709391 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.6225806451612903, + "acc_stderr": 0.027575960723278243, + "acc_norm": 0.6225806451612903, + "acc_norm_stderr": 0.027575960723278243 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.4482758620689655, + "acc_stderr": 0.03499113137676744, + "acc_norm": 0.4482758620689655, + "acc_norm_stderr": 0.03499113137676744 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.61, + "acc_stderr": 0.04902071300001975, + "acc_norm": 0.61, + "acc_norm_stderr": 0.04902071300001975 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.7090909090909091, + "acc_stderr": 0.03546563019624335, + "acc_norm": 0.7090909090909091, + "acc_norm_stderr": 0.03546563019624335 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.7525252525252525, + "acc_stderr": 0.030746300742124488, + "acc_norm": 0.7525252525252525, + "acc_norm_stderr": 0.030746300742124488 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.8290155440414507, + "acc_stderr": 0.02717121368316453, + "acc_norm": 0.8290155440414507, + "acc_norm_stderr": 0.02717121368316453 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.6025641025641025, + "acc_stderr": 0.024811920017903836, + "acc_norm": 0.6025641025641025, + "acc_norm_stderr": 0.024811920017903836 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.2962962962962963, + "acc_stderr": 0.027840811495871927, + "acc_norm": 0.2962962962962963, + "acc_norm_stderr": 0.027840811495871927 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.6050420168067226, + "acc_stderr": 0.03175367846096624, + "acc_norm": 0.6050420168067226, + "acc_norm_stderr": 0.03175367846096624 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.3509933774834437, + "acc_stderr": 0.03896981964257375, + "acc_norm": 0.3509933774834437, + "acc_norm_stderr": 0.03896981964257375 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.7871559633027523, + "acc_stderr": 0.017549376389313694, + "acc_norm": 0.7871559633027523, + "acc_norm_stderr": 0.017549376389313694 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.49537037037037035, + "acc_stderr": 0.03409825519163572, + "acc_norm": 0.49537037037037035, + "acc_norm_stderr": 0.03409825519163572 + }, + "harness|hendrycksTest-high_school_us_history|5": { + 
"acc": 0.7843137254901961, + "acc_stderr": 0.028867431449849323, + "acc_norm": 0.7843137254901961, + "acc_norm_stderr": 0.028867431449849323 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.7890295358649789, + "acc_stderr": 0.02655837250266192, + "acc_norm": 0.7890295358649789, + "acc_norm_stderr": 0.02655837250266192 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.6547085201793722, + "acc_stderr": 0.03191100192835794, + "acc_norm": 0.6547085201793722, + "acc_norm_stderr": 0.03191100192835794 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.6106870229007634, + "acc_stderr": 0.04276486542814591, + "acc_norm": 0.6106870229007634, + "acc_norm_stderr": 0.04276486542814591 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.6942148760330579, + "acc_stderr": 0.04205953933884123, + "acc_norm": 0.6942148760330579, + "acc_norm_stderr": 0.04205953933884123 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.6944444444444444, + "acc_stderr": 0.044531975073749834, + "acc_norm": 0.6944444444444444, + "acc_norm_stderr": 0.044531975073749834 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.7116564417177914, + "acc_stderr": 0.035590395316173425, + "acc_norm": 0.7116564417177914, + "acc_norm_stderr": 0.035590395316173425 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.4107142857142857, + "acc_stderr": 0.04669510663875191, + "acc_norm": 0.4107142857142857, + "acc_norm_stderr": 0.04669510663875191 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.6990291262135923, + "acc_stderr": 0.04541609446503948, + "acc_norm": 0.6990291262135923, + "acc_norm_stderr": 0.04541609446503948 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.8247863247863247, + "acc_stderr": 0.02490443909891824, + "acc_norm": 0.8247863247863247, + "acc_norm_stderr": 0.02490443909891824 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.64, + "acc_stderr": 0.04824181513244218, + "acc_norm": 0.64, + "acc_norm_stderr": 0.04824181513244218 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.7701149425287356, + "acc_stderr": 0.01504630184669181, + "acc_norm": 0.7701149425287356, + "acc_norm_stderr": 0.01504630184669181 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.6416184971098265, + "acc_stderr": 0.0258167567915842, + "acc_norm": 0.6416184971098265, + "acc_norm_stderr": 0.0258167567915842 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.4681564245810056, + "acc_stderr": 0.01668855341561221, + "acc_norm": 0.4681564245810056, + "acc_norm_stderr": 0.01668855341561221 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.6111111111111112, + "acc_stderr": 0.027914055510467998, + "acc_norm": 0.6111111111111112, + "acc_norm_stderr": 0.027914055510467998 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.6591639871382636, + "acc_stderr": 0.026920841260776162, + "acc_norm": 0.6591639871382636, + "acc_norm_stderr": 0.026920841260776162 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.6666666666666666, + "acc_stderr": 0.026229649178821163, + "acc_norm": 0.6666666666666666, + "acc_norm_stderr": 0.026229649178821163 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.4219858156028369, + "acc_stderr": 0.029462189233370597, + "acc_norm": 0.4219858156028369, + "acc_norm_stderr": 0.029462189233370597 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.44784876140808344, + "acc_stderr": 0.012700582404768223, + "acc_norm": 0.44784876140808344, + 
"acc_norm_stderr": 0.012700582404768223 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.5625, + "acc_stderr": 0.030134614954403924, + "acc_norm": 0.5625, + "acc_norm_stderr": 0.030134614954403924 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.5980392156862745, + "acc_stderr": 0.019835176484375387, + "acc_norm": 0.5980392156862745, + "acc_norm_stderr": 0.019835176484375387 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.6909090909090909, + "acc_stderr": 0.044262946482000985, + "acc_norm": 0.6909090909090909, + "acc_norm_stderr": 0.044262946482000985 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.5836734693877551, + "acc_stderr": 0.031557828165561644, + "acc_norm": 0.5836734693877551, + "acc_norm_stderr": 0.031557828165561644 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.6368159203980099, + "acc_stderr": 0.034005985055990146, + "acc_norm": 0.6368159203980099, + "acc_norm_stderr": 0.034005985055990146 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.81, + "acc_stderr": 0.03942772444036625, + "acc_norm": 0.81, + "acc_norm_stderr": 0.03942772444036625 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.4457831325301205, + "acc_stderr": 0.03869543323472101, + "acc_norm": 0.4457831325301205, + "acc_norm_stderr": 0.03869543323472101 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.783625730994152, + "acc_stderr": 0.031581495393387324, + "acc_norm": 0.783625730994152, + "acc_norm_stderr": 0.031581495393387324 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.379436964504284, + "mc1_stderr": 0.01698703926614298, + "mc2": 0.5357247188867356, + "mc2_stderr": 0.015675780170595004 + }, + "all": { + "acc": 0.5754002485958071, + "acc_stderr": 0.03442932936272782, + "acc_norm": 0.5794061950673782, + "acc_norm_stderr": 0.03440825787558648, + "mc1": 0.379436964504284, + "mc1_stderr": 0.01698703926614298, + "mc2": 0.5357247188867356, + "mc2_stderr": 0.015675780170595004 + } + }, + "versions": { + "harness|arc:challenge|25": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + 
"harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "all": 0 + }, + "config_tasks": { + "harness|arc:challenge": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + 
"harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task" + }, + "summary_tasks": { + "harness|arc:challenge|25": { + "hashes": { + "hash_examples": "17b0cae357c0259e", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "3722289b79076c44", + "hash_cont_tokens": "e8abf848493b50f7" + }, + "truncated": 0, + "non-truncated": 4687, + "padded": 4687, + "non-padded": 0, + "effective_few_shots": 25.0, + "num_truncated_few_shots": 0 + }, + "harness|hellaswag|10": { + "hashes": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "ececd684171f1ef2", + "hash_cont_tokens": "9fe0a5c42e1532db" + }, + "truncated": 0, + "non-truncated": 40168, + "padded": 40113, + "non-padded": 55, + "effective_few_shots": 10.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hashes": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "c54ff61ad0273dd7", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-anatomy|5": { + "hashes": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "be31a1e22aef5f90", + "hash_cont_tokens": "f11971a765cb609f" + }, + "truncated": 0, + "non-truncated": 540, + "padded": 540, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-astronomy|5": { + "hashes": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "277a7b1fad566940", + "hash_cont_tokens": "440a970fadecdc7b" + }, + "truncated": 0, + "non-truncated": 608, + "padded": 608, + "non-padded": 0, + "effective_few_shots": 5.0, + 
"num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-business_ethics|5": { + "hashes": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "ba552605bc116de5", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hashes": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "428c7563d0b98ab9", + "hash_cont_tokens": "7ecd60c25b9bfe5b" + }, + "truncated": 0, + "non-truncated": 1060, + "padded": 1060, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_biology|5": { + "hashes": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "da036601573942e2", + "hash_cont_tokens": "875cde3af7a0ee14" + }, + "truncated": 0, + "non-truncated": 576, + "padded": 576, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_chemistry|5": { + "hashes": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "94e0196d6aded13d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_computer_science|5": { + "hashes": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "6e4d0f4a8d36690b", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_mathematics|5": { + "hashes": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "614054d17109a25d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_medicine|5": { + "hashes": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "081bb2b524defd1c", + "hash_cont_tokens": "702fb6d82ff0d6ac" + }, + "truncated": 0, + "non-truncated": 692, + "padded": 692, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_physics|5": { + "hashes": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "5421d9a1af86cbd4", + "hash_cont_tokens": "f7b8097afc16a47c" + }, + "truncated": 0, + "non-truncated": 408, + "padded": 408, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-computer_security|5": { + "hashes": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "5e6b70ecb333cf18", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hashes": { + "hash_examples": "8874ece872d2ca4c", + 
"hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "c2ef11a87264ceed", + "hash_cont_tokens": "aa0e8bc655f2f641" + }, + "truncated": 0, + "non-truncated": 940, + "padded": 940, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-econometrics|5": { + "hashes": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "ecaccd912a4c3978", + "hash_cont_tokens": "b1cc6e7e9fcd3827" + }, + "truncated": 0, + "non-truncated": 456, + "padded": 456, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hashes": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "1590c84291399be8", + "hash_cont_tokens": "2425a3f084a591ef" + }, + "truncated": 0, + "non-truncated": 580, + "padded": 580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hashes": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "3269597f715b0da1", + "hash_cont_tokens": "bd87bf0c060fd925" + }, + "truncated": 0, + "non-truncated": 1512, + "padded": 1512, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-formal_logic|5": { + "hashes": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "a2800d20f3ab8d7c", + "hash_cont_tokens": "eb8932890e0605db" + }, + "truncated": 0, + "non-truncated": 504, + "padded": 504, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-global_facts|5": { + "hashes": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "94ed44b3772505ad", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_biology|5": { + "hashes": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "24423acb928db768", + "hash_cont_tokens": "1ddcb86d28cde266" + }, + "truncated": 0, + "non-truncated": 1240, + "padded": 1240, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hashes": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "831ff35c474e5cef", + "hash_cont_tokens": "176c8dcff38c5f8f" + }, + "truncated": 0, + "non-truncated": 812, + "padded": 812, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hashes": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "a20a96b44dcc5b30", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hashes": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "5002f4ac8b1562ca", + "hash_cont_tokens": "674fc454bdc5ac93" 
+ }, + "truncated": 0, + "non-truncated": 660, + "padded": 656, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_geography|5": { + "hashes": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "7c5547c7da5bc793", + "hash_cont_tokens": "03a5012b916274ea" + }, + "truncated": 0, + "non-truncated": 792, + "padded": 792, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hashes": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "f62991cb6a496b05", + "hash_cont_tokens": "873d2aab226ba1d8" + }, + "truncated": 0, + "non-truncated": 772, + "padded": 772, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hashes": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "4cef2aff6e3d59ed", + "hash_cont_tokens": "c583432ad27fcfe0" + }, + "truncated": 0, + "non-truncated": 1560, + "padded": 1560, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hashes": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "6e2577ea4082ed2b", + "hash_cont_tokens": "d7907b61bcb8c123" + }, + "truncated": 0, + "non-truncated": 1080, + "padded": 1080, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hashes": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "c5fc9aeb1079c8e4", + "hash_cont_tokens": "f47f041de50333b9" + }, + "truncated": 0, + "non-truncated": 952, + "padded": 952, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_physics|5": { + "hashes": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "555fc385cffa84ca", + "hash_cont_tokens": "0d56317b3e5eedb5" + }, + "truncated": 0, + "non-truncated": 604, + "padded": 604, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hashes": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "febd23cbf9973b7f", + "hash_cont_tokens": "09ba1243e7390c0f" + }, + "truncated": 0, + "non-truncated": 2180, + "padded": 2180, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hashes": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "400e55b56ee6fbd7", + "hash_cont_tokens": "9cc29889c3d3f77d" + }, + "truncated": 0, + "non-truncated": 864, + "padded": 864, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hashes": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "c639cce12a46ebad", + "hash_cont_tokens": "cdd0b3dc06d933e5" + }, + "truncated": 0, + "non-truncated": 816, + "padded": 816, + "non-padded": 0, 
+ "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hashes": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "b9762065cce6f3a6", + "hash_cont_tokens": "e02816433ff28daf" + }, + "truncated": 0, + "non-truncated": 948, + "padded": 948, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_aging|5": { + "hashes": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "541a75f071dcf579", + "hash_cont_tokens": "142a4a8a1138a214" + }, + "truncated": 0, + "non-truncated": 892, + "padded": 892, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_sexuality|5": { + "hashes": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "04269e5c5a257dd9", + "hash_cont_tokens": "bc54813e809b796d" + }, + "truncated": 0, + "non-truncated": 524, + "padded": 524, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-international_law|5": { + "hashes": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "d93ba9d9d38e4397", + "hash_cont_tokens": "8ea8c5ff76a15bca" + }, + "truncated": 0, + "non-truncated": 484, + "padded": 484, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-jurisprudence|5": { + "hashes": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "9eeaccd2698b4f5a", + "hash_cont_tokens": "e3a8cd951b6e3469" + }, + "truncated": 0, + "non-truncated": 432, + "padded": 432, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hashes": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "b4f08f544f2b7576", + "hash_cont_tokens": "3e9e0bdc248fd88a" + }, + "truncated": 0, + "non-truncated": 652, + "padded": 648, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-machine_learning|5": { + "hashes": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "900c2a51f1174b9f", + "hash_cont_tokens": "55b12fb138c6a064" + }, + "truncated": 0, + "non-truncated": 448, + "padded": 448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-management|5": { + "hashes": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "6b36efb4689c6eca", + "hash_cont_tokens": "a01d6d39a83c4597" + }, + "truncated": 0, + "non-truncated": 412, + "padded": 412, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-marketing|5": { + "hashes": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "2aaac78a0cfed47a", + "hash_cont_tokens": "6aeaed4d823c98aa" + }, + "truncated": 0, + "non-truncated": 936, + "padded": 936, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-medical_genetics|5": { + "hashes": { + "hash_examples": "9c2dda34a2ea4fd2", + 
"hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "886ca823b41c094a", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-miscellaneous|5": { + "hashes": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "72fd71de7675e7d0", + "hash_cont_tokens": "9b0ab02a64603081" + }, + "truncated": 0, + "non-truncated": 3132, + "padded": 3132, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_disputes|5": { + "hashes": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "f3ca0dd8e7a1eb09", + "hash_cont_tokens": "3b8bbe9108e55ce9" + }, + "truncated": 0, + "non-truncated": 1384, + "padded": 1354, + "non-padded": 30, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hashes": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "3e793631e951f23c", + "hash_cont_tokens": "3e9bfc0362e97330" + }, + "truncated": 0, + "non-truncated": 3580, + "padded": 3580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-nutrition|5": { + "hashes": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "59753c2144ea93af", + "hash_cont_tokens": "23b2dc6ee2da4cfc" + }, + "truncated": 0, + "non-truncated": 1224, + "padded": 1224, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-philosophy|5": { + "hashes": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "bd8d3dbed15a8c34", + "hash_cont_tokens": "9f6ff69d23a48783" + }, + "truncated": 0, + "non-truncated": 1244, + "padded": 1244, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-prehistory|5": { + "hashes": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "3573cd87facbb7c5", + "hash_cont_tokens": "d6458d743d875837" + }, + "truncated": 0, + "non-truncated": 1296, + "padded": 1296, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_accounting|5": { + "hashes": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "17e721bc1a7cbb47", + "hash_cont_tokens": "922a195f53a35662" + }, + "truncated": 0, + "non-truncated": 1128, + "padded": 1128, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_law|5": { + "hashes": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "c9f7583fff66d361", + "hash_cont_tokens": "2e590029ef41fbcd" + }, + "truncated": 0, + "non-truncated": 6136, + "padded": 6136, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_medicine|5": { + "hashes": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "40a933f829116f8d", + "hash_cont_tokens": "7cfee54dbddd5a98" + }, + "truncated": 0, + 
"non-truncated": 1088, + "padded": 1088, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_psychology|5": { + "hashes": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "0dfb73a8eb3f692c", + "hash_cont_tokens": "a86677b2a45c20e1" + }, + "truncated": 0, + "non-truncated": 2448, + "padded": 2448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-public_relations|5": { + "hashes": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "1710c6ba4c9f3cbd", + "hash_cont_tokens": "0d756ccaae031757" + }, + "truncated": 0, + "non-truncated": 440, + "padded": 440, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-security_studies|5": { + "hashes": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "32a03f1f22a6e103", + "hash_cont_tokens": "b2229bc2cfbf594b" + }, + "truncated": 0, + "non-truncated": 980, + "padded": 980, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-sociology|5": { + "hashes": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "828999f7624cbe7e", + "hash_cont_tokens": "c3a3bdfd177eed5b" + }, + "truncated": 0, + "non-truncated": 804, + "padded": 804, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hashes": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "42054621e718dbee", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-virology|5": { + "hashes": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "6c4f0aa4dc859c04", + "hash_cont_tokens": "af8b3658088cb37f" + }, + "truncated": 0, + "non-truncated": 664, + "padded": 664, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-world_religions|5": { + "hashes": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "6c75d44e092ff24f", + "hash_cont_tokens": "060118bef6de4e0a" + }, + "truncated": 0, + "non-truncated": 684, + "padded": 684, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|truthfulqa:mc|0": { + "hashes": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "2738d7ed7075faa7", + "hash_cont_tokens": "f5da56a132aab151" + }, + "truncated": 0, + "non-truncated": 9996, + "padded": 9996, + "non-padded": 0, + "effective_few_shots": 0.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "d84d18e9a963753d", + "hash_full_prompts": "12b540783521a8e6", + "hash_input_tokens": "5c73a7dce6ccf737", + "hash_cont_tokens": "71d56183130fecbd" + }, + "total_evaluation_time_secondes": "9127.952424049377", + "truncated": 0, + "non-truncated": 111019, + "padded": 110926, + "non-padded": 93, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git 
a/eval-results/Enno-Ai/ennodata-raw-pankajmathur-13b-peft/results_2023-10-27T00-11-39.294015.json b/eval-results/Enno-Ai/ennodata-raw-pankajmathur-13b-peft/results_2023-10-27T00-11-39.294015.json new file mode 100644 index 0000000000000000000000000000000000000000..f8afe2a6efaf1227c5412241635b1a34ce404cca --- /dev/null +++ b/eval-results/Enno-Ai/ennodata-raw-pankajmathur-13b-peft/results_2023-10-27T00-11-39.294015.json @@ -0,0 +1,107 @@ +{ + "config_general": { + "model_name": "Enno-Ai/ennodata-raw-pankajmathur-13b-peft", + "model_sha": "2f177a348661ce8d0ba8930a92a21328e51c07b0", + "model_size": "12.51 GB", + "model_dtype": "8bit", + "lighteval_sha": "0f318ecf002208468154899217b3ba7c6ae09374", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|drop|3": { + "em": 0.364618288590604, + "em_stderr": 0.00492919762439365, + "f1": 0.4364775587248332, + "f1_stderr": 0.004726102649724112 + }, + "harness|gsm8k|5": { + "acc": 0.01288855193328279, + "acc_stderr": 0.0031069012664996687 + }, + "harness|winogrande|5": { + "acc": 0.7592738752959748, + "acc_stderr": 0.012015559212224183 + }, + "all": { + "em": 0.364618288590604, + "em_stderr": 0.00492919762439365, + "f1": 0.4364775587248332, + "f1_stderr": 0.004726102649724112, + "acc": 0.38608121361462877, + "acc_stderr": 0.007561230239361926 + } + }, + "versions": { + "harness|drop|3": 1, + "harness|gsm8k|5": 0, + "harness|winogrande|5": 0, + "all": 0 + }, + "config_tasks": { + "harness|drop": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|drop|3": { + "hashes": { + "hash_examples": "1d27416e8324e9a3", + "hash_full_prompts": "a5513ff9a741b385", + "hash_input_tokens": "42076f0efbb50aa6", + "hash_cont_tokens": "12a1e3daf718c448" + }, + "truncated": 3, + "non-truncated": 9533, + "padded": 0, + "non-padded": 9536, + "effective_few_shots": 3.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "bda342e47b5099b2", + "hash_cont_tokens": "d2af3ac3cc505d2b" + }, + "truncated": 0, + "non-truncated": 1319, + "padded": 0, + "non-padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "c0bedf98cb040854", + "hash_cont_tokens": "f08975ad6f2d5864" + }, + "truncated": 0, + "non-truncated": 2534, + "padded": 2432, + "non-padded": 102, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "9b4d8993161e637d", + "hash_full_prompts": "08215e527b7e60a5", + "hash_input_tokens": "a12f3e3c934bd78b", + "hash_cont_tokens": "f05b43299011d5d7" + }, + "total_evaluation_time_secondes": "23020.691961050034", + "truncated": 3, + "non-truncated": 13386, + "padded": 2432, + "non-padded": 10957, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/Enno-Ai/vigogne2-enno-13b-sft-lora-4bit/results_2023-09-12T14-53-48.356901.json b/eval-results/Enno-Ai/vigogne2-enno-13b-sft-lora-4bit/results_2023-09-12T14-53-48.356901.json new file mode 100644 index 0000000000000000000000000000000000000000..4ba5a3510345fa9c5785ae4f2c2a994c4af6df77 --- /dev/null +++ 
b/eval-results/Enno-Ai/vigogne2-enno-13b-sft-lora-4bit/results_2023-09-12T14-53-48.356901.json @@ -0,0 +1,1367 @@ +{ + "config_general": { + "model_name": "Enno-Ai/vigogne2-enno-13b-sft-lora-4bit", + "model_sha": "2a1b03977395eee44742abda63a4787ea5371d06", + "model_size": "6.6 GB", + "model_dtype": "4bit", + "lighteval_sha": "ff467795ccc45b291b69333c263d5f16abd1fcd9", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|arc:challenge|25": { + "acc": 0.5784982935153583, + "acc_stderr": 0.014430197069326021, + "acc_norm": 0.6203071672354948, + "acc_norm_stderr": 0.014182119866974872 + }, + "harness|hellaswag|10": { + "acc": 0.625273849830711, + "acc_stderr": 0.004830628620181032, + "acc_norm": 0.8265285799641505, + "acc_norm_stderr": 0.0037788044746059103 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.35, + "acc_stderr": 0.0479372485441102, + "acc_norm": 0.35, + "acc_norm_stderr": 0.0479372485441102 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.4740740740740741, + "acc_stderr": 0.04313531696750574, + "acc_norm": 0.4740740740740741, + "acc_norm_stderr": 0.04313531696750574 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.5328947368421053, + "acc_stderr": 0.040601270352363966, + "acc_norm": 0.5328947368421053, + "acc_norm_stderr": 0.040601270352363966 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.52, + "acc_stderr": 0.050211673156867795, + "acc_norm": 0.52, + "acc_norm_stderr": 0.050211673156867795 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.5584905660377358, + "acc_stderr": 0.030561590426731837, + "acc_norm": 0.5584905660377358, + "acc_norm_stderr": 0.030561590426731837 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.5833333333333334, + "acc_stderr": 0.041227287076512825, + "acc_norm": 0.5833333333333334, + "acc_norm_stderr": 0.041227287076512825 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.38, + "acc_stderr": 0.04878317312145633, + "acc_norm": 0.38, + "acc_norm_stderr": 0.04878317312145633 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.39, + "acc_stderr": 0.04902071300001975, + "acc_norm": 0.39, + "acc_norm_stderr": 0.04902071300001975 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.33, + "acc_stderr": 0.047258156262526045, + "acc_norm": 0.33, + "acc_norm_stderr": 0.047258156262526045 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.4508670520231214, + "acc_stderr": 0.037940126746970296, + "acc_norm": 0.4508670520231214, + "acc_norm_stderr": 0.037940126746970296 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.2647058823529412, + "acc_stderr": 0.04389869956808778, + "acc_norm": 0.2647058823529412, + "acc_norm_stderr": 0.04389869956808778 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.65, + "acc_stderr": 0.047937248544110196, + "acc_norm": 0.65, + "acc_norm_stderr": 0.047937248544110196 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.42127659574468085, + "acc_stderr": 0.03227834510146268, + "acc_norm": 0.42127659574468085, + "acc_norm_stderr": 0.03227834510146268 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.2631578947368421, + "acc_stderr": 0.04142439719489362, + "acc_norm": 0.2631578947368421, + "acc_norm_stderr": 0.04142439719489362 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.4896551724137931, + "acc_stderr": 0.041657747757287644, + 
"acc_norm": 0.4896551724137931, + "acc_norm_stderr": 0.041657747757287644 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.31746031746031744, + "acc_stderr": 0.02397386199899207, + "acc_norm": 0.31746031746031744, + "acc_norm_stderr": 0.02397386199899207 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.373015873015873, + "acc_stderr": 0.04325506042017086, + "acc_norm": 0.373015873015873, + "acc_norm_stderr": 0.04325506042017086 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.35, + "acc_stderr": 0.047937248544110196, + "acc_norm": 0.35, + "acc_norm_stderr": 0.047937248544110196 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.7, + "acc_stderr": 0.026069362295335137, + "acc_norm": 0.7, + "acc_norm_stderr": 0.026069362295335137 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.41379310344827586, + "acc_stderr": 0.03465304488406795, + "acc_norm": 0.41379310344827586, + "acc_norm_stderr": 0.03465304488406795 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.49, + "acc_stderr": 0.05024183937956912, + "acc_norm": 0.49, + "acc_norm_stderr": 0.05024183937956912 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.6909090909090909, + "acc_stderr": 0.036085410115739666, + "acc_norm": 0.6909090909090909, + "acc_norm_stderr": 0.036085410115739666 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.696969696969697, + "acc_stderr": 0.03274287914026867, + "acc_norm": 0.696969696969697, + "acc_norm_stderr": 0.03274287914026867 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.7616580310880829, + "acc_stderr": 0.030748905363909878, + "acc_norm": 0.7616580310880829, + "acc_norm_stderr": 0.030748905363909878 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.5128205128205128, + "acc_stderr": 0.025342671293807257, + "acc_norm": 0.5128205128205128, + "acc_norm_stderr": 0.025342671293807257 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.32222222222222224, + "acc_stderr": 0.028493465091028597, + "acc_norm": 0.32222222222222224, + "acc_norm_stderr": 0.028493465091028597 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.5630252100840336, + "acc_stderr": 0.032219436365661956, + "acc_norm": 0.5630252100840336, + "acc_norm_stderr": 0.032219436365661956 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.33774834437086093, + "acc_stderr": 0.03861557546255169, + "acc_norm": 0.33774834437086093, + "acc_norm_stderr": 0.03861557546255169 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.7357798165137615, + "acc_stderr": 0.018904164171510186, + "acc_norm": 0.7357798165137615, + "acc_norm_stderr": 0.018904164171510186 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.5509259259259259, + "acc_stderr": 0.033922384053216174, + "acc_norm": 0.5509259259259259, + "acc_norm_stderr": 0.033922384053216174 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.7401960784313726, + "acc_stderr": 0.030778554678693264, + "acc_norm": 0.7401960784313726, + "acc_norm_stderr": 0.030778554678693264 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.679324894514768, + "acc_stderr": 0.030381931949990407, + "acc_norm": 0.679324894514768, + "acc_norm_stderr": 0.030381931949990407 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.6098654708520179, + "acc_stderr": 0.03273766725459157, + "acc_norm": 
0.6098654708520179, + "acc_norm_stderr": 0.03273766725459157 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.6564885496183206, + "acc_stderr": 0.041649760719448786, + "acc_norm": 0.6564885496183206, + "acc_norm_stderr": 0.041649760719448786 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.6942148760330579, + "acc_stderr": 0.04205953933884123, + "acc_norm": 0.6942148760330579, + "acc_norm_stderr": 0.04205953933884123 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.6481481481481481, + "acc_stderr": 0.04616631111801713, + "acc_norm": 0.6481481481481481, + "acc_norm_stderr": 0.04616631111801713 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.6380368098159509, + "acc_stderr": 0.037757007291414416, + "acc_norm": 0.6380368098159509, + "acc_norm_stderr": 0.037757007291414416 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.3482142857142857, + "acc_stderr": 0.045218299028335844, + "acc_norm": 0.3482142857142857, + "acc_norm_stderr": 0.045218299028335844 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.7475728155339806, + "acc_stderr": 0.04301250399690878, + "acc_norm": 0.7475728155339806, + "acc_norm_stderr": 0.04301250399690878 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.7692307692307693, + "acc_stderr": 0.027601921381417593, + "acc_norm": 0.7692307692307693, + "acc_norm_stderr": 0.027601921381417593 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.57, + "acc_stderr": 0.049756985195624284, + "acc_norm": 0.57, + "acc_norm_stderr": 0.049756985195624284 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.7279693486590039, + "acc_stderr": 0.015913367447500514, + "acc_norm": 0.7279693486590039, + "acc_norm_stderr": 0.015913367447500514 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.5867052023121387, + "acc_stderr": 0.02651126136940924, + "acc_norm": 0.5867052023121387, + "acc_norm_stderr": 0.02651126136940924 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.33743016759776534, + "acc_stderr": 0.015813901283913048, + "acc_norm": 0.33743016759776534, + "acc_norm_stderr": 0.015813901283913048 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.5980392156862745, + "acc_stderr": 0.028074158947600646, + "acc_norm": 0.5980392156862745, + "acc_norm_stderr": 0.028074158947600646 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.6205787781350482, + "acc_stderr": 0.02755994980234781, + "acc_norm": 0.6205787781350482, + "acc_norm_stderr": 0.02755994980234781 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.6327160493827161, + "acc_stderr": 0.026822801759507884, + "acc_norm": 0.6327160493827161, + "acc_norm_stderr": 0.026822801759507884 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.41134751773049644, + "acc_stderr": 0.02935491115994098, + "acc_norm": 0.41134751773049644, + "acc_norm_stderr": 0.02935491115994098 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.423728813559322, + "acc_stderr": 0.012620785155885994, + "acc_norm": 0.423728813559322, + "acc_norm_stderr": 0.012620785155885994 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.5294117647058824, + "acc_stderr": 0.03032024326500413, + "acc_norm": 0.5294117647058824, + "acc_norm_stderr": 0.03032024326500413 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.5424836601307189, + "acc_stderr": 0.020154685712590888, + "acc_norm": 0.5424836601307189, + "acc_norm_stderr": 0.020154685712590888 + }, + 
"harness|hendrycksTest-public_relations|5": { + "acc": 0.5454545454545454, + "acc_stderr": 0.04769300568972743, + "acc_norm": 0.5454545454545454, + "acc_norm_stderr": 0.04769300568972743 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.6244897959183674, + "acc_stderr": 0.03100120903989484, + "acc_norm": 0.6244897959183674, + "acc_norm_stderr": 0.03100120903989484 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.6716417910447762, + "acc_stderr": 0.033206858897443244, + "acc_norm": 0.6716417910447762, + "acc_norm_stderr": 0.033206858897443244 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.78, + "acc_stderr": 0.04163331998932264, + "acc_norm": 0.78, + "acc_norm_stderr": 0.04163331998932264 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.45180722891566266, + "acc_stderr": 0.03874371556587953, + "acc_norm": 0.45180722891566266, + "acc_norm_stderr": 0.03874371556587953 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.783625730994152, + "acc_stderr": 0.03158149539338734, + "acc_norm": 0.783625730994152, + "acc_norm_stderr": 0.03158149539338734 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.28518971848225216, + "mc1_stderr": 0.015805827874454892, + "mc2": 0.42976038477184497, + "mc2_stderr": 0.014287624194742454 + }, + "all": { + "acc": 0.5431742017602187, + "acc_stderr": 0.03475364882242359, + "acc_norm": 0.5472939238594319, + "acc_norm_stderr": 0.03473161659618756, + "mc1": 0.28518971848225216, + "mc1_stderr": 0.015805827874454892, + "mc2": 0.42976038477184497, + "mc2_stderr": 0.014287624194742454 + } + }, + "versions": { + "harness|arc:challenge|25": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + 
"harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "all": 0 + }, + "config_tasks": { + "harness|arc:challenge": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + 
"harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task" + }, + "summary_tasks": { + "harness|arc:challenge|25": { + "hashes": { + "hash_examples": "17b0cae357c0259e", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "3722289b79076c44", + "hash_cont_tokens": "e8abf848493b50f7" + }, + "truncated": 0, + "non-truncated": 4687, + "padded": 4687, + "non-padded": 0, + "effective_few_shots": 25.0, + "num_truncated_few_shots": 0 + }, + "harness|hellaswag|10": { + "hashes": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "ececd684171f1ef2", + "hash_cont_tokens": "9fe0a5c42e1532db" + }, + "truncated": 0, + "non-truncated": 40168, + "padded": 40113, + "non-padded": 55, + "effective_few_shots": 10.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hashes": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "c54ff61ad0273dd7", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-anatomy|5": { + "hashes": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "be31a1e22aef5f90", + "hash_cont_tokens": "f11971a765cb609f" + }, + "truncated": 0, + "non-truncated": 540, + "padded": 540, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-astronomy|5": { + "hashes": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "277a7b1fad566940", + "hash_cont_tokens": "440a970fadecdc7b" + }, + "truncated": 0, + "non-truncated": 608, + "padded": 608, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-business_ethics|5": { + "hashes": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "ba552605bc116de5", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + 
"num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hashes": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "428c7563d0b98ab9", + "hash_cont_tokens": "7ecd60c25b9bfe5b" + }, + "truncated": 0, + "non-truncated": 1060, + "padded": 1060, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_biology|5": { + "hashes": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "da036601573942e2", + "hash_cont_tokens": "875cde3af7a0ee14" + }, + "truncated": 0, + "non-truncated": 576, + "padded": 576, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_chemistry|5": { + "hashes": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "94e0196d6aded13d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_computer_science|5": { + "hashes": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "6e4d0f4a8d36690b", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_mathematics|5": { + "hashes": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "614054d17109a25d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_medicine|5": { + "hashes": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "081bb2b524defd1c", + "hash_cont_tokens": "702fb6d82ff0d6ac" + }, + "truncated": 0, + "non-truncated": 692, + "padded": 692, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_physics|5": { + "hashes": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "5421d9a1af86cbd4", + "hash_cont_tokens": "f7b8097afc16a47c" + }, + "truncated": 0, + "non-truncated": 408, + "padded": 408, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-computer_security|5": { + "hashes": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "5e6b70ecb333cf18", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hashes": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "c2ef11a87264ceed", + "hash_cont_tokens": "aa0e8bc655f2f641" + }, + "truncated": 0, + "non-truncated": 940, + "padded": 940, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-econometrics|5": { + "hashes": { + "hash_examples": "64d3623b0bfaa43f", + 
"hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "ecaccd912a4c3978", + "hash_cont_tokens": "b1cc6e7e9fcd3827" + }, + "truncated": 0, + "non-truncated": 456, + "padded": 456, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hashes": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "1590c84291399be8", + "hash_cont_tokens": "2425a3f084a591ef" + }, + "truncated": 0, + "non-truncated": 580, + "padded": 580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hashes": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "3269597f715b0da1", + "hash_cont_tokens": "bd87bf0c060fd925" + }, + "truncated": 0, + "non-truncated": 1512, + "padded": 1512, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-formal_logic|5": { + "hashes": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "a2800d20f3ab8d7c", + "hash_cont_tokens": "eb8932890e0605db" + }, + "truncated": 0, + "non-truncated": 504, + "padded": 504, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-global_facts|5": { + "hashes": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "94ed44b3772505ad", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_biology|5": { + "hashes": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "24423acb928db768", + "hash_cont_tokens": "1ddcb86d28cde266" + }, + "truncated": 0, + "non-truncated": 1240, + "padded": 1240, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hashes": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "831ff35c474e5cef", + "hash_cont_tokens": "176c8dcff38c5f8f" + }, + "truncated": 0, + "non-truncated": 812, + "padded": 812, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hashes": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "a20a96b44dcc5b30", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hashes": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "5002f4ac8b1562ca", + "hash_cont_tokens": "674fc454bdc5ac93" + }, + "truncated": 0, + "non-truncated": 660, + "padded": 656, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_geography|5": { + "hashes": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "7c5547c7da5bc793", + "hash_cont_tokens": 
"03a5012b916274ea" + }, + "truncated": 0, + "non-truncated": 792, + "padded": 792, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hashes": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "f62991cb6a496b05", + "hash_cont_tokens": "873d2aab226ba1d8" + }, + "truncated": 0, + "non-truncated": 772, + "padded": 772, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hashes": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "4cef2aff6e3d59ed", + "hash_cont_tokens": "c583432ad27fcfe0" + }, + "truncated": 0, + "non-truncated": 1560, + "padded": 1560, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hashes": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "6e2577ea4082ed2b", + "hash_cont_tokens": "d7907b61bcb8c123" + }, + "truncated": 0, + "non-truncated": 1080, + "padded": 1080, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hashes": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "c5fc9aeb1079c8e4", + "hash_cont_tokens": "f47f041de50333b9" + }, + "truncated": 0, + "non-truncated": 952, + "padded": 952, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_physics|5": { + "hashes": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "555fc385cffa84ca", + "hash_cont_tokens": "0d56317b3e5eedb5" + }, + "truncated": 0, + "non-truncated": 604, + "padded": 604, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hashes": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "febd23cbf9973b7f", + "hash_cont_tokens": "09ba1243e7390c0f" + }, + "truncated": 0, + "non-truncated": 2180, + "padded": 2180, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hashes": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "400e55b56ee6fbd7", + "hash_cont_tokens": "9cc29889c3d3f77d" + }, + "truncated": 0, + "non-truncated": 864, + "padded": 864, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hashes": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "c639cce12a46ebad", + "hash_cont_tokens": "cdd0b3dc06d933e5" + }, + "truncated": 0, + "non-truncated": 816, + "padded": 816, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hashes": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "b9762065cce6f3a6", + "hash_cont_tokens": "e02816433ff28daf" + }, + "truncated": 0, + "non-truncated": 948, + "padded": 
948, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_aging|5": { + "hashes": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "541a75f071dcf579", + "hash_cont_tokens": "142a4a8a1138a214" + }, + "truncated": 0, + "non-truncated": 892, + "padded": 892, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_sexuality|5": { + "hashes": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "04269e5c5a257dd9", + "hash_cont_tokens": "bc54813e809b796d" + }, + "truncated": 0, + "non-truncated": 524, + "padded": 524, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-international_law|5": { + "hashes": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "d93ba9d9d38e4397", + "hash_cont_tokens": "8ea8c5ff76a15bca" + }, + "truncated": 0, + "non-truncated": 484, + "padded": 484, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-jurisprudence|5": { + "hashes": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "9eeaccd2698b4f5a", + "hash_cont_tokens": "e3a8cd951b6e3469" + }, + "truncated": 0, + "non-truncated": 432, + "padded": 432, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hashes": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "b4f08f544f2b7576", + "hash_cont_tokens": "3e9e0bdc248fd88a" + }, + "truncated": 0, + "non-truncated": 652, + "padded": 648, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-machine_learning|5": { + "hashes": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "900c2a51f1174b9f", + "hash_cont_tokens": "55b12fb138c6a064" + }, + "truncated": 0, + "non-truncated": 448, + "padded": 448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-management|5": { + "hashes": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "6b36efb4689c6eca", + "hash_cont_tokens": "a01d6d39a83c4597" + }, + "truncated": 0, + "non-truncated": 412, + "padded": 412, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-marketing|5": { + "hashes": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "2aaac78a0cfed47a", + "hash_cont_tokens": "6aeaed4d823c98aa" + }, + "truncated": 0, + "non-truncated": 936, + "padded": 936, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-medical_genetics|5": { + "hashes": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "886ca823b41c094a", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-miscellaneous|5": { + "hashes": { + "hash_examples": 
"41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "72fd71de7675e7d0", + "hash_cont_tokens": "9b0ab02a64603081" + }, + "truncated": 0, + "non-truncated": 3132, + "padded": 3132, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_disputes|5": { + "hashes": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "f3ca0dd8e7a1eb09", + "hash_cont_tokens": "3b8bbe9108e55ce9" + }, + "truncated": 0, + "non-truncated": 1384, + "padded": 1354, + "non-padded": 30, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hashes": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "3e793631e951f23c", + "hash_cont_tokens": "3e9bfc0362e97330" + }, + "truncated": 0, + "non-truncated": 3580, + "padded": 3580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-nutrition|5": { + "hashes": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "59753c2144ea93af", + "hash_cont_tokens": "23b2dc6ee2da4cfc" + }, + "truncated": 0, + "non-truncated": 1224, + "padded": 1224, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-philosophy|5": { + "hashes": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "bd8d3dbed15a8c34", + "hash_cont_tokens": "9f6ff69d23a48783" + }, + "truncated": 0, + "non-truncated": 1244, + "padded": 1244, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-prehistory|5": { + "hashes": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "3573cd87facbb7c5", + "hash_cont_tokens": "d6458d743d875837" + }, + "truncated": 0, + "non-truncated": 1296, + "padded": 1296, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_accounting|5": { + "hashes": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "17e721bc1a7cbb47", + "hash_cont_tokens": "922a195f53a35662" + }, + "truncated": 0, + "non-truncated": 1128, + "padded": 1128, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_law|5": { + "hashes": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "c9f7583fff66d361", + "hash_cont_tokens": "2e590029ef41fbcd" + }, + "truncated": 0, + "non-truncated": 6136, + "padded": 6136, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_medicine|5": { + "hashes": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "40a933f829116f8d", + "hash_cont_tokens": "7cfee54dbddd5a98" + }, + "truncated": 0, + "non-truncated": 1088, + "padded": 1088, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_psychology|5": { + "hashes": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "0dfb73a8eb3f692c", + "hash_cont_tokens": 
"a86677b2a45c20e1" + }, + "truncated": 0, + "non-truncated": 2448, + "padded": 2448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-public_relations|5": { + "hashes": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "1710c6ba4c9f3cbd", + "hash_cont_tokens": "0d756ccaae031757" + }, + "truncated": 0, + "non-truncated": 440, + "padded": 440, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-security_studies|5": { + "hashes": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "32a03f1f22a6e103", + "hash_cont_tokens": "b2229bc2cfbf594b" + }, + "truncated": 0, + "non-truncated": 980, + "padded": 980, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-sociology|5": { + "hashes": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "828999f7624cbe7e", + "hash_cont_tokens": "c3a3bdfd177eed5b" + }, + "truncated": 0, + "non-truncated": 804, + "padded": 804, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hashes": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "42054621e718dbee", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-virology|5": { + "hashes": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "6c4f0aa4dc859c04", + "hash_cont_tokens": "af8b3658088cb37f" + }, + "truncated": 0, + "non-truncated": 664, + "padded": 664, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-world_religions|5": { + "hashes": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "6c75d44e092ff24f", + "hash_cont_tokens": "060118bef6de4e0a" + }, + "truncated": 0, + "non-truncated": 684, + "padded": 684, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|truthfulqa:mc|0": { + "hashes": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "2738d7ed7075faa7", + "hash_cont_tokens": "f5da56a132aab151" + }, + "truncated": 0, + "non-truncated": 9996, + "padded": 9996, + "non-padded": 0, + "effective_few_shots": 0.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "d84d18e9a963753d", + "hash_full_prompts": "12b540783521a8e6", + "hash_input_tokens": "5c73a7dce6ccf737", + "hash_cont_tokens": "71d56183130fecbd" + }, + "total_evaluation_time_secondes": "36147.534912109375", + "truncated": 0, + "non-truncated": 111019, + "padded": 110926, + "non-padded": 93, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/Enno-Ai/vigogne2-enno-13b-sft-lora-4bit/results_2023-10-23T10-29-28.223248.json b/eval-results/Enno-Ai/vigogne2-enno-13b-sft-lora-4bit/results_2023-10-23T10-29-28.223248.json new file mode 100644 index 0000000000000000000000000000000000000000..260d7ff97dce422111870fe12e17db645e8baf2b --- /dev/null +++ 
b/eval-results/Enno-Ai/vigogne2-enno-13b-sft-lora-4bit/results_2023-10-23T10-29-28.223248.json @@ -0,0 +1,107 @@ +{ + "config_general": { + "model_name": "Enno-Ai/vigogne2-enno-13b-sft-lora-4bit", + "model_sha": "2a1b03977395eee44742abda63a4787ea5371d06", + "model_size": "6.6 GB", + "model_dtype": "4bit", + "lighteval_sha": "0f318ecf002208468154899217b3ba7c6ae09374", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|drop|3": { + "em": 0.38370385906040266, + "em_stderr": 0.00498003573381493, + "f1": 0.4364649748322163, + "f1_stderr": 0.004838389403253292 + }, + "harness|gsm8k|5": { + "acc": 0.001516300227445034, + "acc_stderr": 0.0010717793485492625 + }, + "harness|winogrande|5": { + "acc": 0.7695343330702447, + "acc_stderr": 0.011835872164836666 + }, + "all": { + "em": 0.38370385906040266, + "em_stderr": 0.00498003573381493, + "f1": 0.4364649748322163, + "f1_stderr": 0.004838389403253292, + "acc": 0.3855253166488449, + "acc_stderr": 0.006453825756692964 + } + }, + "versions": { + "harness|drop|3": 1, + "harness|gsm8k|5": 0, + "harness|winogrande|5": 0, + "all": 0 + }, + "config_tasks": { + "harness|drop": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|drop|3": { + "hashes": { + "hash_examples": "1d27416e8324e9a3", + "hash_full_prompts": "a5513ff9a741b385", + "hash_input_tokens": "42076f0efbb50aa6", + "hash_cont_tokens": "1852edad827b3b58" + }, + "truncated": 3, + "non-truncated": 9533, + "padded": 0, + "non-padded": 9536, + "effective_few_shots": 3.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "bda342e47b5099b2", + "hash_cont_tokens": "c94bf647c63903ca" + }, + "truncated": 0, + "non-truncated": 1319, + "padded": 0, + "non-padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "c0bedf98cb040854", + "hash_cont_tokens": "f08975ad6f2d5864" + }, + "truncated": 0, + "non-truncated": 2534, + "padded": 2432, + "non-padded": 102, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "9b4d8993161e637d", + "hash_full_prompts": "08215e527b7e60a5", + "hash_input_tokens": "a12f3e3c934bd78b", + "hash_cont_tokens": "8431bb8406900229" + }, + "total_evaluation_time_secondes": "9939.970715522766", + "truncated": 3, + "non-truncated": 13386, + "padded": 2432, + "non-padded": 10957, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/Faradaylab/ARIA-70B-V2/results_2023-09-14T05-14-04.383698.json b/eval-results/Faradaylab/ARIA-70B-V2/results_2023-09-14T05-14-04.383698.json new file mode 100644 index 0000000000000000000000000000000000000000..31240db13955f73d87497f521bb417a1aabbdfd0 --- /dev/null +++ b/eval-results/Faradaylab/ARIA-70B-V2/results_2023-09-14T05-14-04.383698.json @@ -0,0 +1,1367 @@ +{ + "config_general": { + "model_name": "Faradaylab/ARIA-70B-V2", + "model_sha": "2bf026af438d522268533484a85a3e54178e7809", + "model_size": "128.64 GB", + "model_dtype": "torch.float16", + "lighteval_sha": "ff467795ccc45b291b69333c263d5f16abd1fcd9", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + 
"override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|arc:challenge|25": { + "acc": 0.5784982935153583, + "acc_stderr": 0.014430197069326025, + "acc_norm": 0.621160409556314, + "acc_norm_stderr": 0.014175915490000322 + }, + "harness|hellaswag|10": { + "acc": 0.6558454491137223, + "acc_stderr": 0.004741208229092876, + "acc_norm": 0.8568014339772954, + "acc_norm_stderr": 0.0034955936625207357 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.36, + "acc_stderr": 0.04824181513244218, + "acc_norm": 0.36, + "acc_norm_stderr": 0.04824181513244218 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.5333333333333333, + "acc_stderr": 0.043097329010363554, + "acc_norm": 0.5333333333333333, + "acc_norm_stderr": 0.043097329010363554 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.7039473684210527, + "acc_stderr": 0.03715062154998904, + "acc_norm": 0.7039473684210527, + "acc_norm_stderr": 0.03715062154998904 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.67, + "acc_stderr": 0.04725815626252609, + "acc_norm": 0.67, + "acc_norm_stderr": 0.04725815626252609 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.6377358490566037, + "acc_stderr": 0.029582245128384303, + "acc_norm": 0.6377358490566037, + "acc_norm_stderr": 0.029582245128384303 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.7361111111111112, + "acc_stderr": 0.03685651095897532, + "acc_norm": 0.7361111111111112, + "acc_norm_stderr": 0.03685651095897532 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.46, + "acc_stderr": 0.05009082659620332, + "acc_norm": 0.46, + "acc_norm_stderr": 0.05009082659620332 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.57, + "acc_stderr": 0.04975698519562428, + "acc_norm": 0.57, + "acc_norm_stderr": 0.04975698519562428 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.31, + "acc_stderr": 0.04648231987117316, + "acc_norm": 0.31, + "acc_norm_stderr": 0.04648231987117316 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.5838150289017341, + "acc_stderr": 0.03758517775404948, + "acc_norm": 0.5838150289017341, + "acc_norm_stderr": 0.03758517775404948 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.3235294117647059, + "acc_stderr": 0.046550104113196177, + "acc_norm": 0.3235294117647059, + "acc_norm_stderr": 0.046550104113196177 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.67, + "acc_stderr": 0.04725815626252607, + "acc_norm": 0.67, + "acc_norm_stderr": 0.04725815626252607 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.6085106382978723, + "acc_stderr": 0.03190701242326812, + "acc_norm": 0.6085106382978723, + "acc_norm_stderr": 0.03190701242326812 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.41228070175438597, + "acc_stderr": 0.04630653203366595, + "acc_norm": 0.41228070175438597, + "acc_norm_stderr": 0.04630653203366595 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.5655172413793104, + "acc_stderr": 0.04130740879555498, + "acc_norm": 0.5655172413793104, + "acc_norm_stderr": 0.04130740879555498 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.3915343915343915, + "acc_stderr": 0.025138091388851088, + "acc_norm": 0.3915343915343915, + "acc_norm_stderr": 0.025138091388851088 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.3888888888888889, + "acc_stderr": 0.04360314860077459, + "acc_norm": 0.3888888888888889, + 
"acc_norm_stderr": 0.04360314860077459 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.46, + "acc_stderr": 0.05009082659620332, + "acc_norm": 0.46, + "acc_norm_stderr": 0.05009082659620332 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.7645161290322581, + "acc_stderr": 0.02413763242933771, + "acc_norm": 0.7645161290322581, + "acc_norm_stderr": 0.02413763242933771 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.5024630541871922, + "acc_stderr": 0.03517945038691063, + "acc_norm": 0.5024630541871922, + "acc_norm_stderr": 0.03517945038691063 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.63, + "acc_stderr": 0.04852365870939099, + "acc_norm": 0.63, + "acc_norm_stderr": 0.04852365870939099 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.7696969696969697, + "acc_stderr": 0.03287666758603489, + "acc_norm": 0.7696969696969697, + "acc_norm_stderr": 0.03287666758603489 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.7878787878787878, + "acc_stderr": 0.0291265228345868, + "acc_norm": 0.7878787878787878, + "acc_norm_stderr": 0.0291265228345868 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.8963730569948186, + "acc_stderr": 0.02199531196364424, + "acc_norm": 0.8963730569948186, + "acc_norm_stderr": 0.02199531196364424 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.6384615384615384, + "acc_stderr": 0.024359581465397, + "acc_norm": 0.6384615384615384, + "acc_norm_stderr": 0.024359581465397 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.35185185185185186, + "acc_stderr": 0.029116617606083018, + "acc_norm": 0.35185185185185186, + "acc_norm_stderr": 0.029116617606083018 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.6596638655462185, + "acc_stderr": 0.030778057422931673, + "acc_norm": 0.6596638655462185, + "acc_norm_stderr": 0.030778057422931673 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.423841059602649, + "acc_stderr": 0.04034846678603397, + "acc_norm": 0.423841059602649, + "acc_norm_stderr": 0.04034846678603397 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.8513761467889909, + "acc_stderr": 0.015251253773660834, + "acc_norm": 0.8513761467889909, + "acc_norm_stderr": 0.015251253773660834 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.49074074074074076, + "acc_stderr": 0.034093869469927006, + "acc_norm": 0.49074074074074076, + "acc_norm_stderr": 0.034093869469927006 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.8627450980392157, + "acc_stderr": 0.024152225962801588, + "acc_norm": 0.8627450980392157, + "acc_norm_stderr": 0.024152225962801588 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.8565400843881856, + "acc_stderr": 0.022818291821017012, + "acc_norm": 0.8565400843881856, + "acc_norm_stderr": 0.022818291821017012 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.7399103139013453, + "acc_stderr": 0.029442495585857476, + "acc_norm": 0.7399103139013453, + "acc_norm_stderr": 0.029442495585857476 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.7251908396946565, + "acc_stderr": 0.03915345408847836, + "acc_norm": 0.7251908396946565, + "acc_norm_stderr": 0.03915345408847836 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.8264462809917356, + "acc_stderr": 0.0345727283691767, + "acc_norm": 0.8264462809917356, + 
"acc_norm_stderr": 0.0345727283691767 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.8055555555555556, + "acc_stderr": 0.03826076324884866, + "acc_norm": 0.8055555555555556, + "acc_norm_stderr": 0.03826076324884866 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.7484662576687117, + "acc_stderr": 0.034089978868575295, + "acc_norm": 0.7484662576687117, + "acc_norm_stderr": 0.034089978868575295 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.5, + "acc_stderr": 0.04745789978762494, + "acc_norm": 0.5, + "acc_norm_stderr": 0.04745789978762494 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.7864077669902912, + "acc_stderr": 0.040580420156460344, + "acc_norm": 0.7864077669902912, + "acc_norm_stderr": 0.040580420156460344 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.8547008547008547, + "acc_stderr": 0.023086635086841407, + "acc_norm": 0.8547008547008547, + "acc_norm_stderr": 0.023086635086841407 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.67, + "acc_stderr": 0.04725815626252607, + "acc_norm": 0.67, + "acc_norm_stderr": 0.04725815626252607 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.8250319284802043, + "acc_stderr": 0.013586619219903354, + "acc_norm": 0.8250319284802043, + "acc_norm_stderr": 0.013586619219903354 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.6965317919075145, + "acc_stderr": 0.024752411960917202, + "acc_norm": 0.6965317919075145, + "acc_norm_stderr": 0.024752411960917202 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.33743016759776534, + "acc_stderr": 0.015813901283913044, + "acc_norm": 0.33743016759776534, + "acc_norm_stderr": 0.015813901283913044 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.6830065359477124, + "acc_stderr": 0.02664327847450875, + "acc_norm": 0.6830065359477124, + "acc_norm_stderr": 0.02664327847450875 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.707395498392283, + "acc_stderr": 0.025839898334877983, + "acc_norm": 0.707395498392283, + "acc_norm_stderr": 0.025839898334877983 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.7129629629629629, + "acc_stderr": 0.02517104191530968, + "acc_norm": 0.7129629629629629, + "acc_norm_stderr": 0.02517104191530968 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.5035460992907801, + "acc_stderr": 0.02982674915328092, + "acc_norm": 0.5035460992907801, + "acc_norm_stderr": 0.02982674915328092 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.485006518904824, + "acc_stderr": 0.012764493202193257, + "acc_norm": 0.485006518904824, + "acc_norm_stderr": 0.012764493202193257 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.5477941176470589, + "acc_stderr": 0.030233758551596445, + "acc_norm": 0.5477941176470589, + "acc_norm_stderr": 0.030233758551596445 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.6781045751633987, + "acc_stderr": 0.01890101532209308, + "acc_norm": 0.6781045751633987, + "acc_norm_stderr": 0.01890101532209308 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.6636363636363637, + "acc_stderr": 0.04525393596302506, + "acc_norm": 0.6636363636363637, + "acc_norm_stderr": 0.04525393596302506 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.7551020408163265, + "acc_stderr": 0.027529637440174913, + "acc_norm": 0.7551020408163265, + "acc_norm_stderr": 0.027529637440174913 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.8706467661691543, + 
"acc_stderr": 0.023729830881018515, + "acc_norm": 0.8706467661691543, + "acc_norm_stderr": 0.023729830881018515 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.87, + "acc_stderr": 0.03379976689896308, + "acc_norm": 0.87, + "acc_norm_stderr": 0.03379976689896308 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.4879518072289157, + "acc_stderr": 0.038913644958358196, + "acc_norm": 0.4879518072289157, + "acc_norm_stderr": 0.038913644958358196 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.8362573099415205, + "acc_stderr": 0.028380919596145866, + "acc_norm": 0.8362573099415205, + "acc_norm_stderr": 0.028380919596145866 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.3268053855569155, + "mc1_stderr": 0.01641987473113503, + "mc2": 0.4979524095981481, + "mc2_stderr": 0.014785337524777346 + }, + "all": { + "acc": 0.634284380404607, + "acc_stderr": 0.03297009687797655, + "acc_norm": 0.638413500758921, + "acc_norm_stderr": 0.03294467490940201, + "mc1": 0.3268053855569155, + "mc1_stderr": 0.01641987473113503, + "mc2": 0.4979524095981481, + "mc2_stderr": 0.014785337524777346 + } + }, + "versions": { + "harness|arc:challenge|25": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + 
"harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "all": 0 + }, + "config_tasks": { + "harness|arc:challenge": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + 
"harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task" + }, + "summary_tasks": { + "harness|arc:challenge|25": { + "hashes": { + "hash_examples": "17b0cae357c0259e", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "3722289b79076c44", + "hash_cont_tokens": "e8abf848493b50f7" + }, + "truncated": 0, + "non-truncated": 4687, + "padded": 4687, + "non-padded": 0, + "effective_few_shots": 25.0, + "num_truncated_few_shots": 0 + }, + "harness|hellaswag|10": { + "hashes": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "ececd684171f1ef2", + "hash_cont_tokens": "9fe0a5c42e1532db" + }, + "truncated": 0, + "non-truncated": 40168, + "padded": 40113, + "non-padded": 55, + "effective_few_shots": 10.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hashes": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "c54ff61ad0273dd7", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-anatomy|5": { + "hashes": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "be31a1e22aef5f90", + "hash_cont_tokens": "f11971a765cb609f" + }, + "truncated": 0, + "non-truncated": 540, + "padded": 540, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-astronomy|5": { + "hashes": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "277a7b1fad566940", + "hash_cont_tokens": "440a970fadecdc7b" + }, + "truncated": 0, + "non-truncated": 608, + "padded": 608, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-business_ethics|5": { + "hashes": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "ba552605bc116de5", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hashes": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "428c7563d0b98ab9", + "hash_cont_tokens": "7ecd60c25b9bfe5b" + }, + "truncated": 0, + "non-truncated": 1060, + "padded": 1060, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_biology|5": { + "hashes": { + "hash_examples": 
"ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "da036601573942e2", + "hash_cont_tokens": "875cde3af7a0ee14" + }, + "truncated": 0, + "non-truncated": 576, + "padded": 576, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_chemistry|5": { + "hashes": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "94e0196d6aded13d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_computer_science|5": { + "hashes": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "6e4d0f4a8d36690b", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_mathematics|5": { + "hashes": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "614054d17109a25d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_medicine|5": { + "hashes": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "081bb2b524defd1c", + "hash_cont_tokens": "702fb6d82ff0d6ac" + }, + "truncated": 0, + "non-truncated": 692, + "padded": 692, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_physics|5": { + "hashes": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "5421d9a1af86cbd4", + "hash_cont_tokens": "f7b8097afc16a47c" + }, + "truncated": 0, + "non-truncated": 408, + "padded": 408, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-computer_security|5": { + "hashes": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "5e6b70ecb333cf18", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hashes": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "c2ef11a87264ceed", + "hash_cont_tokens": "aa0e8bc655f2f641" + }, + "truncated": 0, + "non-truncated": 940, + "padded": 940, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-econometrics|5": { + "hashes": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "ecaccd912a4c3978", + "hash_cont_tokens": "b1cc6e7e9fcd3827" + }, + "truncated": 0, + "non-truncated": 456, + "padded": 456, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hashes": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "1590c84291399be8", + "hash_cont_tokens": 
"2425a3f084a591ef" + }, + "truncated": 0, + "non-truncated": 580, + "padded": 580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hashes": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "3269597f715b0da1", + "hash_cont_tokens": "bd87bf0c060fd925" + }, + "truncated": 0, + "non-truncated": 1512, + "padded": 1512, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-formal_logic|5": { + "hashes": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "a2800d20f3ab8d7c", + "hash_cont_tokens": "eb8932890e0605db" + }, + "truncated": 0, + "non-truncated": 504, + "padded": 504, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-global_facts|5": { + "hashes": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "94ed44b3772505ad", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_biology|5": { + "hashes": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "24423acb928db768", + "hash_cont_tokens": "1ddcb86d28cde266" + }, + "truncated": 0, + "non-truncated": 1240, + "padded": 1240, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hashes": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "831ff35c474e5cef", + "hash_cont_tokens": "176c8dcff38c5f8f" + }, + "truncated": 0, + "non-truncated": 812, + "padded": 812, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hashes": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "a20a96b44dcc5b30", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hashes": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "5002f4ac8b1562ca", + "hash_cont_tokens": "674fc454bdc5ac93" + }, + "truncated": 0, + "non-truncated": 660, + "padded": 656, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_geography|5": { + "hashes": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "7c5547c7da5bc793", + "hash_cont_tokens": "03a5012b916274ea" + }, + "truncated": 0, + "non-truncated": 792, + "padded": 792, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hashes": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "f62991cb6a496b05", + "hash_cont_tokens": "873d2aab226ba1d8" + }, + "truncated": 0, + "non-truncated": 772, + "padded": 772, + "non-padded": 0, + 
"effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hashes": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "4cef2aff6e3d59ed", + "hash_cont_tokens": "c583432ad27fcfe0" + }, + "truncated": 0, + "non-truncated": 1560, + "padded": 1560, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hashes": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "6e2577ea4082ed2b", + "hash_cont_tokens": "d7907b61bcb8c123" + }, + "truncated": 0, + "non-truncated": 1080, + "padded": 1080, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hashes": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "c5fc9aeb1079c8e4", + "hash_cont_tokens": "f47f041de50333b9" + }, + "truncated": 0, + "non-truncated": 952, + "padded": 952, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_physics|5": { + "hashes": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "555fc385cffa84ca", + "hash_cont_tokens": "0d56317b3e5eedb5" + }, + "truncated": 0, + "non-truncated": 604, + "padded": 604, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hashes": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "febd23cbf9973b7f", + "hash_cont_tokens": "09ba1243e7390c0f" + }, + "truncated": 0, + "non-truncated": 2180, + "padded": 2180, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hashes": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "400e55b56ee6fbd7", + "hash_cont_tokens": "9cc29889c3d3f77d" + }, + "truncated": 0, + "non-truncated": 864, + "padded": 864, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hashes": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "c639cce12a46ebad", + "hash_cont_tokens": "cdd0b3dc06d933e5" + }, + "truncated": 0, + "non-truncated": 816, + "padded": 816, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hashes": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "b9762065cce6f3a6", + "hash_cont_tokens": "e02816433ff28daf" + }, + "truncated": 0, + "non-truncated": 948, + "padded": 948, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_aging|5": { + "hashes": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "541a75f071dcf579", + "hash_cont_tokens": "142a4a8a1138a214" + }, + "truncated": 0, + "non-truncated": 892, + "padded": 892, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + 
"harness|hendrycksTest-human_sexuality|5": { + "hashes": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "04269e5c5a257dd9", + "hash_cont_tokens": "bc54813e809b796d" + }, + "truncated": 0, + "non-truncated": 524, + "padded": 524, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-international_law|5": { + "hashes": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "d93ba9d9d38e4397", + "hash_cont_tokens": "8ea8c5ff76a15bca" + }, + "truncated": 0, + "non-truncated": 484, + "padded": 484, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-jurisprudence|5": { + "hashes": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "9eeaccd2698b4f5a", + "hash_cont_tokens": "e3a8cd951b6e3469" + }, + "truncated": 0, + "non-truncated": 432, + "padded": 432, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hashes": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "b4f08f544f2b7576", + "hash_cont_tokens": "3e9e0bdc248fd88a" + }, + "truncated": 0, + "non-truncated": 652, + "padded": 648, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-machine_learning|5": { + "hashes": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "900c2a51f1174b9f", + "hash_cont_tokens": "55b12fb138c6a064" + }, + "truncated": 0, + "non-truncated": 448, + "padded": 448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-management|5": { + "hashes": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "6b36efb4689c6eca", + "hash_cont_tokens": "a01d6d39a83c4597" + }, + "truncated": 0, + "non-truncated": 412, + "padded": 412, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-marketing|5": { + "hashes": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "2aaac78a0cfed47a", + "hash_cont_tokens": "6aeaed4d823c98aa" + }, + "truncated": 0, + "non-truncated": 936, + "padded": 936, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-medical_genetics|5": { + "hashes": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "886ca823b41c094a", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-miscellaneous|5": { + "hashes": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "72fd71de7675e7d0", + "hash_cont_tokens": "9b0ab02a64603081" + }, + "truncated": 0, + "non-truncated": 3132, + "padded": 3132, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_disputes|5": { + "hashes": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": 
"f3ca0dd8e7a1eb09", + "hash_cont_tokens": "3b8bbe9108e55ce9" + }, + "truncated": 0, + "non-truncated": 1384, + "padded": 1354, + "non-padded": 30, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hashes": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "3e793631e951f23c", + "hash_cont_tokens": "3e9bfc0362e97330" + }, + "truncated": 0, + "non-truncated": 3580, + "padded": 3580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-nutrition|5": { + "hashes": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "59753c2144ea93af", + "hash_cont_tokens": "23b2dc6ee2da4cfc" + }, + "truncated": 0, + "non-truncated": 1224, + "padded": 1224, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-philosophy|5": { + "hashes": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "bd8d3dbed15a8c34", + "hash_cont_tokens": "9f6ff69d23a48783" + }, + "truncated": 0, + "non-truncated": 1244, + "padded": 1244, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-prehistory|5": { + "hashes": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "3573cd87facbb7c5", + "hash_cont_tokens": "d6458d743d875837" + }, + "truncated": 0, + "non-truncated": 1296, + "padded": 1296, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_accounting|5": { + "hashes": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "17e721bc1a7cbb47", + "hash_cont_tokens": "922a195f53a35662" + }, + "truncated": 0, + "non-truncated": 1128, + "padded": 1128, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_law|5": { + "hashes": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "c9f7583fff66d361", + "hash_cont_tokens": "2e590029ef41fbcd" + }, + "truncated": 0, + "non-truncated": 6136, + "padded": 6136, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_medicine|5": { + "hashes": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "40a933f829116f8d", + "hash_cont_tokens": "7cfee54dbddd5a98" + }, + "truncated": 0, + "non-truncated": 1088, + "padded": 1088, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_psychology|5": { + "hashes": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "0dfb73a8eb3f692c", + "hash_cont_tokens": "a86677b2a45c20e1" + }, + "truncated": 0, + "non-truncated": 2448, + "padded": 2448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-public_relations|5": { + "hashes": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "1710c6ba4c9f3cbd", + "hash_cont_tokens": "0d756ccaae031757" + }, + "truncated": 0, + "non-truncated": 440, + "padded": 440, + "non-padded": 0, 
+ "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-security_studies|5": { + "hashes": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "32a03f1f22a6e103", + "hash_cont_tokens": "b2229bc2cfbf594b" + }, + "truncated": 0, + "non-truncated": 980, + "padded": 980, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-sociology|5": { + "hashes": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "828999f7624cbe7e", + "hash_cont_tokens": "c3a3bdfd177eed5b" + }, + "truncated": 0, + "non-truncated": 804, + "padded": 804, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hashes": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "42054621e718dbee", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-virology|5": { + "hashes": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "6c4f0aa4dc859c04", + "hash_cont_tokens": "af8b3658088cb37f" + }, + "truncated": 0, + "non-truncated": 664, + "padded": 664, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-world_religions|5": { + "hashes": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "6c75d44e092ff24f", + "hash_cont_tokens": "060118bef6de4e0a" + }, + "truncated": 0, + "non-truncated": 684, + "padded": 684, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|truthfulqa:mc|0": { + "hashes": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "2738d7ed7075faa7", + "hash_cont_tokens": "f5da56a132aab151" + }, + "truncated": 0, + "non-truncated": 9996, + "padded": 9996, + "non-padded": 0, + "effective_few_shots": 0.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "d84d18e9a963753d", + "hash_full_prompts": "12b540783521a8e6", + "hash_input_tokens": "5c73a7dce6ccf737", + "hash_cont_tokens": "71d56183130fecbd" + }, + "total_evaluation_time_secondes": "43673.98809981346", + "truncated": 0, + "non-truncated": 111019, + "padded": 110926, + "non-padded": 93, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/Faradaylab/ARIA-70B-V2/results_2023-10-25T19-48-19.078343.json b/eval-results/Faradaylab/ARIA-70B-V2/results_2023-10-25T19-48-19.078343.json new file mode 100644 index 0000000000000000000000000000000000000000..d686b62afe00cae8b98217c1c109713732620aae --- /dev/null +++ b/eval-results/Faradaylab/ARIA-70B-V2/results_2023-10-25T19-48-19.078343.json @@ -0,0 +1,107 @@ +{ + "config_general": { + "model_name": "Faradaylab/ARIA-70B-V2", + "model_sha": "0acd6b01b129995edfcd5be8dc08962421987337", + "model_size": "128.64 GB", + "model_dtype": "torch.float16", + "lighteval_sha": "0f318ecf002208468154899217b3ba7c6ae09374", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|drop|3": { + "em": 0.12174916107382551, + 
"em_stderr": 0.0033487438315364985, + "f1": 0.18035549496644265, + "f1_stderr": 0.0034191831504093964 + }, + "harness|gsm8k|5": { + "acc": 0.2880970432145565, + "acc_stderr": 0.012474469737197917 + }, + "harness|winogrande|5": { + "acc": 0.8168902920284136, + "acc_stderr": 0.01086977863316837 + }, + "all": { + "em": 0.12174916107382551, + "em_stderr": 0.0033487438315364985, + "f1": 0.18035549496644265, + "f1_stderr": 0.0034191831504093964, + "acc": 0.552493667621485, + "acc_stderr": 0.011672124185183144 + } + }, + "versions": { + "harness|drop|3": 1, + "harness|gsm8k|5": 0, + "harness|winogrande|5": 0, + "all": 0 + }, + "config_tasks": { + "harness|drop": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|drop|3": { + "hashes": { + "hash_examples": "1d27416e8324e9a3", + "hash_full_prompts": "a5513ff9a741b385", + "hash_input_tokens": "42076f0efbb50aa6", + "hash_cont_tokens": "e1b09c08381b0aac" + }, + "truncated": 3, + "non-truncated": 9533, + "padded": 0, + "non-padded": 9536, + "effective_few_shots": 3.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "bda342e47b5099b2", + "hash_cont_tokens": "8fdfab8326f55658" + }, + "truncated": 0, + "non-truncated": 1319, + "padded": 0, + "non-padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "c0bedf98cb040854", + "hash_cont_tokens": "f08975ad6f2d5864" + }, + "truncated": 0, + "non-truncated": 2534, + "padded": 2432, + "non-padded": 102, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "9b4d8993161e637d", + "hash_full_prompts": "08215e527b7e60a5", + "hash_input_tokens": "a12f3e3c934bd78b", + "hash_cont_tokens": "fa77177c40f1d9c1" + }, + "total_evaluation_time_secondes": "43688.751851558685", + "truncated": 3, + "non-truncated": 13386, + "padded": 2432, + "non-padded": 10957, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/Faradaylab/ARIA-70B-V3/results_2023-09-22T10-43-51.211297.json b/eval-results/Faradaylab/ARIA-70B-V3/results_2023-09-22T10-43-51.211297.json new file mode 100644 index 0000000000000000000000000000000000000000..566f57a554082ee45512d70552332accd054d290 --- /dev/null +++ b/eval-results/Faradaylab/ARIA-70B-V3/results_2023-09-22T10-43-51.211297.json @@ -0,0 +1,1367 @@ +{ + "config_general": { + "model_name": "Faradaylab/ARIA-70B-V3", + "model_sha": "6e7fdcd20626786dd744ea86c664a3c088ced39f", + "model_size": "128.64 GB", + "model_dtype": "torch.float16", + "lighteval_sha": "0f318ecf002208468154899217b3ba7c6ae09374", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|arc:challenge|25": { + "acc": 0.5998293515358362, + "acc_stderr": 0.014317197787809174, + "acc_norm": 0.6390784982935154, + "acc_norm_stderr": 0.014034761386175452 + }, + "harness|hellaswag|10": { + "acc": 0.6726747659828719, + "acc_stderr": 0.004682780790508322, + "acc_norm": 0.8620792670782712, + "acc_norm_stderr": 0.0034411206110598396 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.32, + "acc_stderr": 0.046882617226215034, + "acc_norm": 0.32, + 
"acc_norm_stderr": 0.046882617226215034 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.5555555555555556, + "acc_stderr": 0.04292596718256981, + "acc_norm": 0.5555555555555556, + "acc_norm_stderr": 0.04292596718256981 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.7302631578947368, + "acc_stderr": 0.03611780560284898, + "acc_norm": 0.7302631578947368, + "acc_norm_stderr": 0.03611780560284898 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.67, + "acc_stderr": 0.047258156262526094, + "acc_norm": 0.67, + "acc_norm_stderr": 0.047258156262526094 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.660377358490566, + "acc_stderr": 0.02914690474779833, + "acc_norm": 0.660377358490566, + "acc_norm_stderr": 0.02914690474779833 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.7638888888888888, + "acc_stderr": 0.03551446610810826, + "acc_norm": 0.7638888888888888, + "acc_norm_stderr": 0.03551446610810826 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.45, + "acc_stderr": 0.049999999999999996, + "acc_norm": 0.45, + "acc_norm_stderr": 0.049999999999999996 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.59, + "acc_stderr": 0.04943110704237101, + "acc_norm": 0.59, + "acc_norm_stderr": 0.04943110704237101 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.37, + "acc_stderr": 0.04852365870939099, + "acc_norm": 0.37, + "acc_norm_stderr": 0.04852365870939099 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.6127167630057804, + "acc_stderr": 0.037143259063020656, + "acc_norm": 0.6127167630057804, + "acc_norm_stderr": 0.037143259063020656 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.3137254901960784, + "acc_stderr": 0.04617034827006718, + "acc_norm": 0.3137254901960784, + "acc_norm_stderr": 0.04617034827006718 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.75, + "acc_stderr": 0.04351941398892446, + "acc_norm": 0.75, + "acc_norm_stderr": 0.04351941398892446 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.5829787234042553, + "acc_stderr": 0.032232762667117124, + "acc_norm": 0.5829787234042553, + "acc_norm_stderr": 0.032232762667117124 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.41228070175438597, + "acc_stderr": 0.04630653203366595, + "acc_norm": 0.41228070175438597, + "acc_norm_stderr": 0.04630653203366595 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.5586206896551724, + "acc_stderr": 0.04137931034482757, + "acc_norm": 0.5586206896551724, + "acc_norm_stderr": 0.04137931034482757 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.4126984126984127, + "acc_stderr": 0.025355741263055284, + "acc_norm": 0.4126984126984127, + "acc_norm_stderr": 0.025355741263055284 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.4365079365079365, + "acc_stderr": 0.04435932892851466, + "acc_norm": 0.4365079365079365, + "acc_norm_stderr": 0.04435932892851466 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.43, + "acc_stderr": 0.04975698519562428, + "acc_norm": 0.43, + "acc_norm_stderr": 0.04975698519562428 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.7806451612903226, + "acc_stderr": 0.023540799358723285, + "acc_norm": 0.7806451612903226, + "acc_norm_stderr": 0.023540799358723285 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.5024630541871922, + "acc_stderr": 0.03517945038691063, + "acc_norm": 0.5024630541871922, + 
"acc_norm_stderr": 0.03517945038691063 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.68, + "acc_stderr": 0.04688261722621504, + "acc_norm": 0.68, + "acc_norm_stderr": 0.04688261722621504 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.8242424242424242, + "acc_stderr": 0.02972094300622445, + "acc_norm": 0.8242424242424242, + "acc_norm_stderr": 0.02972094300622445 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.8333333333333334, + "acc_stderr": 0.026552207828215282, + "acc_norm": 0.8333333333333334, + "acc_norm_stderr": 0.026552207828215282 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.9067357512953368, + "acc_stderr": 0.02098685459328974, + "acc_norm": 0.9067357512953368, + "acc_norm_stderr": 0.02098685459328974 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.6410256410256411, + "acc_stderr": 0.02432173848460235, + "acc_norm": 0.6410256410256411, + "acc_norm_stderr": 0.02432173848460235 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.28888888888888886, + "acc_stderr": 0.027634907264178544, + "acc_norm": 0.28888888888888886, + "acc_norm_stderr": 0.027634907264178544 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.7142857142857143, + "acc_stderr": 0.029344572500634342, + "acc_norm": 0.7142857142857143, + "acc_norm_stderr": 0.029344572500634342 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.44370860927152317, + "acc_stderr": 0.04056527902281732, + "acc_norm": 0.44370860927152317, + "acc_norm_stderr": 0.04056527902281732 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.8458715596330275, + "acc_stderr": 0.015480826865374303, + "acc_norm": 0.8458715596330275, + "acc_norm_stderr": 0.015480826865374303 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.49537037037037035, + "acc_stderr": 0.03409825519163572, + "acc_norm": 0.49537037037037035, + "acc_norm_stderr": 0.03409825519163572 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.8774509803921569, + "acc_stderr": 0.023015389732458258, + "acc_norm": 0.8774509803921569, + "acc_norm_stderr": 0.023015389732458258 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.8438818565400844, + "acc_stderr": 0.02362715946031868, + "acc_norm": 0.8438818565400844, + "acc_norm_stderr": 0.02362715946031868 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.7399103139013453, + "acc_stderr": 0.02944249558585747, + "acc_norm": 0.7399103139013453, + "acc_norm_stderr": 0.02944249558585747 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.7480916030534351, + "acc_stderr": 0.03807387116306085, + "acc_norm": 0.7480916030534351, + "acc_norm_stderr": 0.03807387116306085 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.8181818181818182, + "acc_stderr": 0.035208939510976534, + "acc_norm": 0.8181818181818182, + "acc_norm_stderr": 0.035208939510976534 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.7962962962962963, + "acc_stderr": 0.03893542518824847, + "acc_norm": 0.7962962962962963, + "acc_norm_stderr": 0.03893542518824847 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.7791411042944786, + "acc_stderr": 0.03259177392742178, + "acc_norm": 0.7791411042944786, + "acc_norm_stderr": 0.03259177392742178 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.49107142857142855, + "acc_stderr": 0.04745033255489123, + 
"acc_norm": 0.49107142857142855, + "acc_norm_stderr": 0.04745033255489123 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.8058252427184466, + "acc_stderr": 0.03916667762822585, + "acc_norm": 0.8058252427184466, + "acc_norm_stderr": 0.03916667762822585 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.8931623931623932, + "acc_stderr": 0.020237149008990905, + "acc_norm": 0.8931623931623932, + "acc_norm_stderr": 0.020237149008990905 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.67, + "acc_stderr": 0.04725815626252607, + "acc_norm": 0.67, + "acc_norm_stderr": 0.04725815626252607 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.8275862068965517, + "acc_stderr": 0.013507943909371802, + "acc_norm": 0.8275862068965517, + "acc_norm_stderr": 0.013507943909371802 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.7369942196531792, + "acc_stderr": 0.023703099525258176, + "acc_norm": 0.7369942196531792, + "acc_norm_stderr": 0.023703099525258176 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.34413407821229053, + "acc_stderr": 0.015889221313307094, + "acc_norm": 0.34413407821229053, + "acc_norm_stderr": 0.015889221313307094 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.696078431372549, + "acc_stderr": 0.02633661346904664, + "acc_norm": 0.696078431372549, + "acc_norm_stderr": 0.02633661346904664 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.729903536977492, + "acc_stderr": 0.025218040373410616, + "acc_norm": 0.729903536977492, + "acc_norm_stderr": 0.025218040373410616 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.7314814814814815, + "acc_stderr": 0.024659685185967284, + "acc_norm": 0.7314814814814815, + "acc_norm_stderr": 0.024659685185967284 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.5177304964539007, + "acc_stderr": 0.02980873964223777, + "acc_norm": 0.5177304964539007, + "acc_norm_stderr": 0.02980873964223777 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.48891786179921776, + "acc_stderr": 0.012767098998525852, + "acc_norm": 0.48891786179921776, + "acc_norm_stderr": 0.012767098998525852 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.6029411764705882, + "acc_stderr": 0.02972215209928007, + "acc_norm": 0.6029411764705882, + "acc_norm_stderr": 0.02972215209928007 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.6977124183006536, + "acc_stderr": 0.018579232711113877, + "acc_norm": 0.6977124183006536, + "acc_norm_stderr": 0.018579232711113877 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.6909090909090909, + "acc_stderr": 0.044262946482000985, + "acc_norm": 0.6909090909090909, + "acc_norm_stderr": 0.044262946482000985 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.7387755102040816, + "acc_stderr": 0.028123429335142787, + "acc_norm": 0.7387755102040816, + "acc_norm_stderr": 0.028123429335142787 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.8656716417910447, + "acc_stderr": 0.024112678240900836, + "acc_norm": 0.8656716417910447, + "acc_norm_stderr": 0.024112678240900836 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.86, + "acc_stderr": 0.034873508801977704, + "acc_norm": 0.86, + "acc_norm_stderr": 0.034873508801977704 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.5060240963855421, + "acc_stderr": 0.03892212195333045, + "acc_norm": 0.5060240963855421, + "acc_norm_stderr": 0.03892212195333045 + }, + "harness|hendrycksTest-world_religions|5": { + 
"acc": 0.8362573099415205, + "acc_stderr": 0.028380919596145866, + "acc_norm": 0.8362573099415205, + "acc_norm_stderr": 0.028380919596145866 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.34149326805385555, + "mc1_stderr": 0.016600688619950826, + "mc2": 0.513240508208704, + "mc2_stderr": 0.015101415537603125 + }, + "all": { + "acc": 0.6471664219890731, + "acc_stderr": 0.03252894231531827, + "acc_norm": 0.651041907545905, + "acc_norm_stderr": 0.0325031101698762, + "mc1": 0.34149326805385555, + "mc1_stderr": 0.016600688619950826, + "mc2": 0.513240508208704, + "mc2_stderr": 0.015101415537603125 + } + }, + "versions": { + "harness|arc:challenge|25": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + 
"harness|truthfulqa:mc|0": 1, + "all": 0 + }, + "config_tasks": { + "harness|arc:challenge": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + 
"harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task" + }, + "summary_tasks": { + "harness|arc:challenge|25": { + "hashes": { + "hash_examples": "17b0cae357c0259e", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "3722289b79076c44", + "hash_cont_tokens": "e8abf848493b50f7" + }, + "truncated": 0, + "non-truncated": 4687, + "padded": 4687, + "non-padded": 0, + "effective_few_shots": 25.0, + "num_truncated_few_shots": 0 + }, + "harness|hellaswag|10": { + "hashes": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "ececd684171f1ef2", + "hash_cont_tokens": "9fe0a5c42e1532db" + }, + "truncated": 0, + "non-truncated": 40168, + "padded": 40113, + "non-padded": 55, + "effective_few_shots": 10.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hashes": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "c54ff61ad0273dd7", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-anatomy|5": { + "hashes": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "be31a1e22aef5f90", + "hash_cont_tokens": "f11971a765cb609f" + }, + "truncated": 0, + "non-truncated": 540, + "padded": 540, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-astronomy|5": { + "hashes": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "277a7b1fad566940", + "hash_cont_tokens": "440a970fadecdc7b" + }, + "truncated": 0, + "non-truncated": 608, + "padded": 608, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-business_ethics|5": { + "hashes": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "ba552605bc116de5", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hashes": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "428c7563d0b98ab9", + "hash_cont_tokens": "7ecd60c25b9bfe5b" + }, + "truncated": 0, + "non-truncated": 1060, + "padded": 1060, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_biology|5": { + "hashes": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "da036601573942e2", + "hash_cont_tokens": "875cde3af7a0ee14" + }, + "truncated": 0, + "non-truncated": 576, + "padded": 576, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_chemistry|5": { + "hashes": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "94e0196d6aded13d", + "hash_cont_tokens": 
"50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_computer_science|5": { + "hashes": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "6e4d0f4a8d36690b", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_mathematics|5": { + "hashes": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "614054d17109a25d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_medicine|5": { + "hashes": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "081bb2b524defd1c", + "hash_cont_tokens": "702fb6d82ff0d6ac" + }, + "truncated": 0, + "non-truncated": 692, + "padded": 692, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_physics|5": { + "hashes": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "5421d9a1af86cbd4", + "hash_cont_tokens": "f7b8097afc16a47c" + }, + "truncated": 0, + "non-truncated": 408, + "padded": 408, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-computer_security|5": { + "hashes": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "5e6b70ecb333cf18", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hashes": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "c2ef11a87264ceed", + "hash_cont_tokens": "aa0e8bc655f2f641" + }, + "truncated": 0, + "non-truncated": 940, + "padded": 940, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-econometrics|5": { + "hashes": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "ecaccd912a4c3978", + "hash_cont_tokens": "b1cc6e7e9fcd3827" + }, + "truncated": 0, + "non-truncated": 456, + "padded": 456, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hashes": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "1590c84291399be8", + "hash_cont_tokens": "2425a3f084a591ef" + }, + "truncated": 0, + "non-truncated": 580, + "padded": 580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hashes": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "3269597f715b0da1", + "hash_cont_tokens": "bd87bf0c060fd925" + }, + "truncated": 0, + "non-truncated": 1512, + "padded": 1512, + "non-padded": 0, + "effective_few_shots": 5.0, + 
"num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-formal_logic|5": { + "hashes": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "a2800d20f3ab8d7c", + "hash_cont_tokens": "eb8932890e0605db" + }, + "truncated": 0, + "non-truncated": 504, + "padded": 504, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-global_facts|5": { + "hashes": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "94ed44b3772505ad", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_biology|5": { + "hashes": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "24423acb928db768", + "hash_cont_tokens": "1ddcb86d28cde266" + }, + "truncated": 0, + "non-truncated": 1240, + "padded": 1240, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hashes": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "831ff35c474e5cef", + "hash_cont_tokens": "176c8dcff38c5f8f" + }, + "truncated": 0, + "non-truncated": 812, + "padded": 812, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hashes": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "a20a96b44dcc5b30", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hashes": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "5002f4ac8b1562ca", + "hash_cont_tokens": "674fc454bdc5ac93" + }, + "truncated": 0, + "non-truncated": 660, + "padded": 656, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_geography|5": { + "hashes": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "7c5547c7da5bc793", + "hash_cont_tokens": "03a5012b916274ea" + }, + "truncated": 0, + "non-truncated": 792, + "padded": 792, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hashes": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "f62991cb6a496b05", + "hash_cont_tokens": "873d2aab226ba1d8" + }, + "truncated": 0, + "non-truncated": 772, + "padded": 772, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hashes": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "4cef2aff6e3d59ed", + "hash_cont_tokens": "c583432ad27fcfe0" + }, + "truncated": 0, + "non-truncated": 1560, + "padded": 1560, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + 
"hashes": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "6e2577ea4082ed2b", + "hash_cont_tokens": "d7907b61bcb8c123" + }, + "truncated": 0, + "non-truncated": 1080, + "padded": 1080, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hashes": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "c5fc9aeb1079c8e4", + "hash_cont_tokens": "f47f041de50333b9" + }, + "truncated": 0, + "non-truncated": 952, + "padded": 952, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_physics|5": { + "hashes": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "555fc385cffa84ca", + "hash_cont_tokens": "0d56317b3e5eedb5" + }, + "truncated": 0, + "non-truncated": 604, + "padded": 604, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hashes": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "febd23cbf9973b7f", + "hash_cont_tokens": "09ba1243e7390c0f" + }, + "truncated": 0, + "non-truncated": 2180, + "padded": 2180, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hashes": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "400e55b56ee6fbd7", + "hash_cont_tokens": "9cc29889c3d3f77d" + }, + "truncated": 0, + "non-truncated": 864, + "padded": 864, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hashes": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "c639cce12a46ebad", + "hash_cont_tokens": "cdd0b3dc06d933e5" + }, + "truncated": 0, + "non-truncated": 816, + "padded": 816, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hashes": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "b9762065cce6f3a6", + "hash_cont_tokens": "e02816433ff28daf" + }, + "truncated": 0, + "non-truncated": 948, + "padded": 948, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_aging|5": { + "hashes": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "541a75f071dcf579", + "hash_cont_tokens": "142a4a8a1138a214" + }, + "truncated": 0, + "non-truncated": 892, + "padded": 892, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_sexuality|5": { + "hashes": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "04269e5c5a257dd9", + "hash_cont_tokens": "bc54813e809b796d" + }, + "truncated": 0, + "non-truncated": 524, + "padded": 524, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-international_law|5": { + "hashes": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": 
"d93ba9d9d38e4397", + "hash_cont_tokens": "8ea8c5ff76a15bca" + }, + "truncated": 0, + "non-truncated": 484, + "padded": 484, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-jurisprudence|5": { + "hashes": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "9eeaccd2698b4f5a", + "hash_cont_tokens": "e3a8cd951b6e3469" + }, + "truncated": 0, + "non-truncated": 432, + "padded": 432, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hashes": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "b4f08f544f2b7576", + "hash_cont_tokens": "3e9e0bdc248fd88a" + }, + "truncated": 0, + "non-truncated": 652, + "padded": 648, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-machine_learning|5": { + "hashes": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "900c2a51f1174b9f", + "hash_cont_tokens": "55b12fb138c6a064" + }, + "truncated": 0, + "non-truncated": 448, + "padded": 448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-management|5": { + "hashes": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "6b36efb4689c6eca", + "hash_cont_tokens": "a01d6d39a83c4597" + }, + "truncated": 0, + "non-truncated": 412, + "padded": 412, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-marketing|5": { + "hashes": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "2aaac78a0cfed47a", + "hash_cont_tokens": "6aeaed4d823c98aa" + }, + "truncated": 0, + "non-truncated": 936, + "padded": 936, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-medical_genetics|5": { + "hashes": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "886ca823b41c094a", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-miscellaneous|5": { + "hashes": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "72fd71de7675e7d0", + "hash_cont_tokens": "9b0ab02a64603081" + }, + "truncated": 0, + "non-truncated": 3132, + "padded": 3132, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_disputes|5": { + "hashes": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "f3ca0dd8e7a1eb09", + "hash_cont_tokens": "3b8bbe9108e55ce9" + }, + "truncated": 0, + "non-truncated": 1384, + "padded": 1354, + "non-padded": 30, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hashes": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "3e793631e951f23c", + "hash_cont_tokens": "3e9bfc0362e97330" + }, + "truncated": 0, + "non-truncated": 3580, + "padded": 3580, + "non-padded": 0, + "effective_few_shots": 5.0, + 
"num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-nutrition|5": { + "hashes": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "59753c2144ea93af", + "hash_cont_tokens": "23b2dc6ee2da4cfc" + }, + "truncated": 0, + "non-truncated": 1224, + "padded": 1224, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-philosophy|5": { + "hashes": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "bd8d3dbed15a8c34", + "hash_cont_tokens": "9f6ff69d23a48783" + }, + "truncated": 0, + "non-truncated": 1244, + "padded": 1244, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-prehistory|5": { + "hashes": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "3573cd87facbb7c5", + "hash_cont_tokens": "d6458d743d875837" + }, + "truncated": 0, + "non-truncated": 1296, + "padded": 1296, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_accounting|5": { + "hashes": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "17e721bc1a7cbb47", + "hash_cont_tokens": "922a195f53a35662" + }, + "truncated": 0, + "non-truncated": 1128, + "padded": 1128, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_law|5": { + "hashes": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "c9f7583fff66d361", + "hash_cont_tokens": "2e590029ef41fbcd" + }, + "truncated": 0, + "non-truncated": 6136, + "padded": 6136, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_medicine|5": { + "hashes": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "40a933f829116f8d", + "hash_cont_tokens": "7cfee54dbddd5a98" + }, + "truncated": 0, + "non-truncated": 1088, + "padded": 1088, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_psychology|5": { + "hashes": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "0dfb73a8eb3f692c", + "hash_cont_tokens": "a86677b2a45c20e1" + }, + "truncated": 0, + "non-truncated": 2448, + "padded": 2448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-public_relations|5": { + "hashes": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "1710c6ba4c9f3cbd", + "hash_cont_tokens": "0d756ccaae031757" + }, + "truncated": 0, + "non-truncated": 440, + "padded": 440, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-security_studies|5": { + "hashes": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "32a03f1f22a6e103", + "hash_cont_tokens": "b2229bc2cfbf594b" + }, + "truncated": 0, + "non-truncated": 980, + "padded": 980, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-sociology|5": { + "hashes": { + "hash_examples": "e7959df87dea8672", + 
"hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "828999f7624cbe7e", + "hash_cont_tokens": "c3a3bdfd177eed5b" + }, + "truncated": 0, + "non-truncated": 804, + "padded": 804, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hashes": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "42054621e718dbee", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-virology|5": { + "hashes": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "6c4f0aa4dc859c04", + "hash_cont_tokens": "af8b3658088cb37f" + }, + "truncated": 0, + "non-truncated": 664, + "padded": 664, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-world_religions|5": { + "hashes": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "6c75d44e092ff24f", + "hash_cont_tokens": "060118bef6de4e0a" + }, + "truncated": 0, + "non-truncated": 684, + "padded": 684, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|truthfulqa:mc|0": { + "hashes": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "2738d7ed7075faa7", + "hash_cont_tokens": "f5da56a132aab151" + }, + "truncated": 0, + "non-truncated": 9996, + "padded": 9996, + "non-padded": 0, + "effective_few_shots": 0.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "d84d18e9a963753d", + "hash_full_prompts": "12b540783521a8e6", + "hash_input_tokens": "5c73a7dce6ccf737", + "hash_cont_tokens": "71d56183130fecbd" + }, + "total_evaluation_time_secondes": "44122.13709402084", + "truncated": 0, + "non-truncated": 111019, + "padded": 110926, + "non-padded": 93, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/Faradaylab/ARIA-70B-V3/results_2023-10-26T01-49-31.523366.json b/eval-results/Faradaylab/ARIA-70B-V3/results_2023-10-26T01-49-31.523366.json new file mode 100644 index 0000000000000000000000000000000000000000..706fa13d2eb218f74494f58c208e4b9fdc3d6406 --- /dev/null +++ b/eval-results/Faradaylab/ARIA-70B-V3/results_2023-10-26T01-49-31.523366.json @@ -0,0 +1,107 @@ +{ + "config_general": { + "model_name": "Faradaylab/ARIA-70B-V3", + "model_sha": "23810c56f3111ada1f846a0c06c6d33293d4a050", + "model_size": "128.64 GB", + "model_dtype": "torch.float16", + "lighteval_sha": "0f318ecf002208468154899217b3ba7c6ae09374", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|drop|3": { + "em": 0.4170511744966443, + "em_stderr": 0.005049513544068899, + "f1": 0.4729456795302025, + "f1_stderr": 0.004847240610421039 + }, + "harness|gsm8k|5": { + "acc": 0.2812736921910538, + "acc_stderr": 0.012384789310940237 + }, + "harness|winogrande|5": { + "acc": 0.8208366219415943, + "acc_stderr": 0.010777949156047986 + }, + "all": { + "em": 0.4170511744966443, + "em_stderr": 0.005049513544068899, + "f1": 0.4729456795302025, + "f1_stderr": 0.004847240610421039, + "acc": 0.551055157066324, + "acc_stderr": 0.01158136923349411 + } + }, + "versions": { + 
"harness|drop|3": 1, + "harness|gsm8k|5": 0, + "harness|winogrande|5": 0, + "all": 0 + }, + "config_tasks": { + "harness|drop": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|drop|3": { + "hashes": { + "hash_examples": "1d27416e8324e9a3", + "hash_full_prompts": "a5513ff9a741b385", + "hash_input_tokens": "42076f0efbb50aa6", + "hash_cont_tokens": "5b09a04b7c790749" + }, + "truncated": 3, + "non-truncated": 9533, + "padded": 0, + "non-padded": 9536, + "effective_few_shots": 3.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "bda342e47b5099b2", + "hash_cont_tokens": "0302c4daa79a3949" + }, + "truncated": 0, + "non-truncated": 1319, + "padded": 0, + "non-padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "c0bedf98cb040854", + "hash_cont_tokens": "f08975ad6f2d5864" + }, + "truncated": 0, + "non-truncated": 2534, + "padded": 2432, + "non-padded": 102, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "9b4d8993161e637d", + "hash_full_prompts": "08215e527b7e60a5", + "hash_input_tokens": "a12f3e3c934bd78b", + "hash_cont_tokens": "474e604e2860bc0a" + }, + "total_evaluation_time_secondes": "28280.38520526886", + "truncated": 3, + "non-truncated": 13386, + "padded": 2432, + "non-padded": 10957, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/Faradaylab/Aria-70B/results_2023-08-26T09-05-40.294272.json b/eval-results/Faradaylab/Aria-70B/results_2023-08-26T09-05-40.294272.json new file mode 100644 index 0000000000000000000000000000000000000000..19027908da3640d7d39aee95bd3f3347227d6ab9 --- /dev/null +++ b/eval-results/Faradaylab/Aria-70B/results_2023-08-26T09-05-40.294272.json @@ -0,0 +1,1366 @@ +{ + "config_general": { + "model_name": "Faradaylab/Aria-70B", + "model_sha": "57cd251f2cf4e832f64550ea0e2b90ecec155b54", + "model_dtype": "torch.float16", + "lighteval_sha": "578835f70c499eaf870208de093513e08f864581", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|arc:challenge|25": { + "acc": 0.6049488054607508, + "acc_stderr": 0.01428589829293817, + "acc_norm": 0.6450511945392492, + "acc_norm_stderr": 0.013983036904094087 + }, + "harness|hellaswag|10": { + "acc": 0.6690898227444733, + "acc_stderr": 0.004695791340502876, + "acc_norm": 0.8586934873531169, + "acc_norm_stderr": 0.003476255509644533 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.35, + "acc_stderr": 0.04793724854411021, + "acc_norm": 0.35, + "acc_norm_stderr": 0.04793724854411021 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.5185185185185185, + "acc_stderr": 0.043163785995113245, + "acc_norm": 0.5185185185185185, + "acc_norm_stderr": 0.043163785995113245 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.7302631578947368, + "acc_stderr": 0.03611780560284898, + "acc_norm": 0.7302631578947368, + "acc_norm_stderr": 0.03611780560284898 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.65, + "acc_stderr": 0.0479372485441102, + "acc_norm": 0.65, + "acc_norm_stderr": 0.0479372485441102 + }, + 
"harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.6377358490566037, + "acc_stderr": 0.029582245128384303, + "acc_norm": 0.6377358490566037, + "acc_norm_stderr": 0.029582245128384303 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.75, + "acc_stderr": 0.03621034121889507, + "acc_norm": 0.75, + "acc_norm_stderr": 0.03621034121889507 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.48, + "acc_stderr": 0.050211673156867795, + "acc_norm": 0.48, + "acc_norm_stderr": 0.050211673156867795 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.59, + "acc_stderr": 0.04943110704237101, + "acc_norm": 0.59, + "acc_norm_stderr": 0.04943110704237101 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.33, + "acc_stderr": 0.047258156262526045, + "acc_norm": 0.33, + "acc_norm_stderr": 0.047258156262526045 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.6069364161849711, + "acc_stderr": 0.0372424959581773, + "acc_norm": 0.6069364161849711, + "acc_norm_stderr": 0.0372424959581773 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.3333333333333333, + "acc_stderr": 0.04690650298201943, + "acc_norm": 0.3333333333333333, + "acc_norm_stderr": 0.04690650298201943 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.71, + "acc_stderr": 0.045604802157206845, + "acc_norm": 0.71, + "acc_norm_stderr": 0.045604802157206845 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.5829787234042553, + "acc_stderr": 0.032232762667117124, + "acc_norm": 0.5829787234042553, + "acc_norm_stderr": 0.032232762667117124 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.41228070175438597, + "acc_stderr": 0.04630653203366595, + "acc_norm": 0.41228070175438597, + "acc_norm_stderr": 0.04630653203366595 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.5793103448275863, + "acc_stderr": 0.0411391498118926, + "acc_norm": 0.5793103448275863, + "acc_norm_stderr": 0.0411391498118926 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.41005291005291006, + "acc_stderr": 0.02533120243894442, + "acc_norm": 0.41005291005291006, + "acc_norm_stderr": 0.02533120243894442 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.4126984126984127, + "acc_stderr": 0.04403438954768176, + "acc_norm": 0.4126984126984127, + "acc_norm_stderr": 0.04403438954768176 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.44, + "acc_stderr": 0.04988876515698589, + "acc_norm": 0.44, + "acc_norm_stderr": 0.04988876515698589 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.7677419354838709, + "acc_stderr": 0.024022256130308235, + "acc_norm": 0.7677419354838709, + "acc_norm_stderr": 0.024022256130308235 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.4630541871921182, + "acc_stderr": 0.035083705204426656, + "acc_norm": 0.4630541871921182, + "acc_norm_stderr": 0.035083705204426656 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.65, + "acc_stderr": 0.047937248544110196, + "acc_norm": 0.65, + "acc_norm_stderr": 0.047937248544110196 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.793939393939394, + "acc_stderr": 0.03158415324047709, + "acc_norm": 0.793939393939394, + "acc_norm_stderr": 0.03158415324047709 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.803030303030303, + "acc_stderr": 0.028335609732463355, + "acc_norm": 0.803030303030303, + "acc_norm_stderr": 0.028335609732463355 + }, 
+ "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.8911917098445595, + "acc_stderr": 0.022473253332768783, + "acc_norm": 0.8911917098445595, + "acc_norm_stderr": 0.022473253332768783 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.6410256410256411, + "acc_stderr": 0.02432173848460235, + "acc_norm": 0.6410256410256411, + "acc_norm_stderr": 0.02432173848460235 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.3, + "acc_stderr": 0.027940457136228416, + "acc_norm": 0.3, + "acc_norm_stderr": 0.027940457136228416 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.6596638655462185, + "acc_stderr": 0.030778057422931673, + "acc_norm": 0.6596638655462185, + "acc_norm_stderr": 0.030778057422931673 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.4304635761589404, + "acc_stderr": 0.04042809961395634, + "acc_norm": 0.4304635761589404, + "acc_norm_stderr": 0.04042809961395634 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.8385321100917431, + "acc_stderr": 0.015776239256163255, + "acc_norm": 0.8385321100917431, + "acc_norm_stderr": 0.015776239256163255 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.48148148148148145, + "acc_stderr": 0.03407632093854052, + "acc_norm": 0.48148148148148145, + "acc_norm_stderr": 0.03407632093854052 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.8529411764705882, + "acc_stderr": 0.024857478080250458, + "acc_norm": 0.8529411764705882, + "acc_norm_stderr": 0.024857478080250458 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.8438818565400844, + "acc_stderr": 0.02362715946031867, + "acc_norm": 0.8438818565400844, + "acc_norm_stderr": 0.02362715946031867 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.726457399103139, + "acc_stderr": 0.02991858670779883, + "acc_norm": 0.726457399103139, + "acc_norm_stderr": 0.02991858670779883 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.7099236641221374, + "acc_stderr": 0.039800662464677665, + "acc_norm": 0.7099236641221374, + "acc_norm_stderr": 0.039800662464677665 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.8016528925619835, + "acc_stderr": 0.03640118271990946, + "acc_norm": 0.8016528925619835, + "acc_norm_stderr": 0.03640118271990946 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.8240740740740741, + "acc_stderr": 0.036809181416738807, + "acc_norm": 0.8240740740740741, + "acc_norm_stderr": 0.036809181416738807 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.7607361963190185, + "acc_stderr": 0.033519538795212696, + "acc_norm": 0.7607361963190185, + "acc_norm_stderr": 0.033519538795212696 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.48214285714285715, + "acc_stderr": 0.047427623612430116, + "acc_norm": 0.48214285714285715, + "acc_norm_stderr": 0.047427623612430116 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.8058252427184466, + "acc_stderr": 0.03916667762822584, + "acc_norm": 0.8058252427184466, + "acc_norm_stderr": 0.03916667762822584 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.8760683760683761, + "acc_stderr": 0.021586494001281372, + "acc_norm": 0.8760683760683761, + "acc_norm_stderr": 0.021586494001281372 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.65, + "acc_stderr": 0.047937248544110196, + "acc_norm": 0.65, + "acc_norm_stderr": 0.047937248544110196 + }, + 
"harness|hendrycksTest-miscellaneous|5": { + "acc": 0.8275862068965517, + "acc_stderr": 0.013507943909371798, + "acc_norm": 0.8275862068965517, + "acc_norm_stderr": 0.013507943909371798 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.7167630057803468, + "acc_stderr": 0.02425790170532338, + "acc_norm": 0.7167630057803468, + "acc_norm_stderr": 0.02425790170532338 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.39329608938547483, + "acc_stderr": 0.01633726869427011, + "acc_norm": 0.39329608938547483, + "acc_norm_stderr": 0.01633726869427011 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.6993464052287581, + "acc_stderr": 0.026256053835718968, + "acc_norm": 0.6993464052287581, + "acc_norm_stderr": 0.026256053835718968 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.7041800643086816, + "acc_stderr": 0.02592237178881877, + "acc_norm": 0.7041800643086816, + "acc_norm_stderr": 0.02592237178881877 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.7098765432098766, + "acc_stderr": 0.025251173936495036, + "acc_norm": 0.7098765432098766, + "acc_norm_stderr": 0.025251173936495036 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.5070921985815603, + "acc_stderr": 0.02982449855912901, + "acc_norm": 0.5070921985815603, + "acc_norm_stderr": 0.02982449855912901 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.4791395045632334, + "acc_stderr": 0.01275911706651802, + "acc_norm": 0.4791395045632334, + "acc_norm_stderr": 0.01275911706651802 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.5772058823529411, + "acc_stderr": 0.030008562845003476, + "acc_norm": 0.5772058823529411, + "acc_norm_stderr": 0.030008562845003476 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.6666666666666666, + "acc_stderr": 0.0190709855896875, + "acc_norm": 0.6666666666666666, + "acc_norm_stderr": 0.0190709855896875 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.6909090909090909, + "acc_stderr": 0.044262946482000985, + "acc_norm": 0.6909090909090909, + "acc_norm_stderr": 0.044262946482000985 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.7877551020408163, + "acc_stderr": 0.026176967197866767, + "acc_norm": 0.7877551020408163, + "acc_norm_stderr": 0.026176967197866767 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.8706467661691543, + "acc_stderr": 0.023729830881018526, + "acc_norm": 0.8706467661691543, + "acc_norm_stderr": 0.023729830881018526 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.87, + "acc_stderr": 0.03379976689896309, + "acc_norm": 0.87, + "acc_norm_stderr": 0.03379976689896309 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.5120481927710844, + "acc_stderr": 0.03891364495835817, + "acc_norm": 0.5120481927710844, + "acc_norm_stderr": 0.03891364495835817 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.8187134502923976, + "acc_stderr": 0.029547741687640038, + "acc_norm": 0.8187134502923976, + "acc_norm_stderr": 0.029547741687640038 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.3561811505507956, + "mc1_stderr": 0.016763790728446335, + "mc2": 0.527991738544026, + "mc2_stderr": 0.015530613367021443 + }, + "all": { + "acc": 0.6386983068475005, + "acc_stderr": 0.032863621226889406, + "acc_norm": 0.6425916297913504, + "acc_norm_stderr": 0.03283781788418258, + "mc1": 0.3561811505507956, + "mc1_stderr": 0.016763790728446335, + "mc2": 0.527991738544026, + "mc2_stderr": 0.015530613367021443 + } + }, + "versions": { + 
"harness|arc:challenge|25": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "all": 0 + }, + "config_tasks": { + "harness|arc:challenge": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + 
"harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task" + }, + "summary_tasks": { + "harness|arc:challenge|25": { + "hashes": { + "hash_examples": "17b0cae357c0259e", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "3722289b79076c44", + "hash_cont_tokens": "ede2b335438f08e9" + }, + "truncated": 0, + 
"non-truncated": 4687, + "padded": 4687, + "non-padded": 0, + "effective_few_shots": 25.0, + "num_truncated_few_shots": 0 + }, + "harness|hellaswag|10": { + "hashes": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "ececd684171f1ef2", + "hash_cont_tokens": "b41cf1ad182d68d5" + }, + "truncated": 0, + "non-truncated": 40168, + "padded": 40113, + "non-padded": 55, + "effective_few_shots": 10.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hashes": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "c54ff61ad0273dd7", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-anatomy|5": { + "hashes": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "be31a1e22aef5f90", + "hash_cont_tokens": "f11971a765cb609f" + }, + "truncated": 0, + "non-truncated": 540, + "padded": 540, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-astronomy|5": { + "hashes": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "277a7b1fad566940", + "hash_cont_tokens": "238bd86950544b29" + }, + "truncated": 0, + "non-truncated": 608, + "padded": 608, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-business_ethics|5": { + "hashes": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "ba552605bc116de5", + "hash_cont_tokens": "f9d6d2a7d7e9a041" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hashes": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "428c7563d0b98ab9", + "hash_cont_tokens": "6af58623d0d5fbcd" + }, + "truncated": 0, + "non-truncated": 1060, + "padded": 1060, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_biology|5": { + "hashes": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "da036601573942e2", + "hash_cont_tokens": "875cde3af7a0ee14" + }, + "truncated": 0, + "non-truncated": 576, + "padded": 576, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_chemistry|5": { + "hashes": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "94e0196d6aded13d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_computer_science|5": { + "hashes": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "6e4d0f4a8d36690b", + "hash_cont_tokens": "1ba0c71186b1505e" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_mathematics|5": { + 
"hashes": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "614054d17109a25d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_medicine|5": { + "hashes": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "081bb2b524defd1c", + "hash_cont_tokens": "702fb6d82ff0d6ac" + }, + "truncated": 0, + "non-truncated": 692, + "padded": 692, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_physics|5": { + "hashes": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "5421d9a1af86cbd4", + "hash_cont_tokens": "f7b8097afc16a47c" + }, + "truncated": 0, + "non-truncated": 408, + "padded": 408, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-computer_security|5": { + "hashes": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "5e6b70ecb333cf18", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hashes": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "c2ef11a87264ceed", + "hash_cont_tokens": "aa0e8bc655f2f641" + }, + "truncated": 0, + "non-truncated": 940, + "padded": 940, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-econometrics|5": { + "hashes": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "ecaccd912a4c3978", + "hash_cont_tokens": "a9b1f761089f6acc" + }, + "truncated": 0, + "non-truncated": 456, + "padded": 456, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hashes": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "1590c84291399be8", + "hash_cont_tokens": "2425a3f084a591ef" + }, + "truncated": 0, + "non-truncated": 580, + "padded": 580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hashes": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "3269597f715b0da1", + "hash_cont_tokens": "eb2d5002052b5bc5" + }, + "truncated": 0, + "non-truncated": 1512, + "padded": 1512, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-formal_logic|5": { + "hashes": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "a2800d20f3ab8d7c", + "hash_cont_tokens": "9b30dc19c9b62f60" + }, + "truncated": 0, + "non-truncated": 504, + "padded": 504, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-global_facts|5": { + "hashes": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "94ed44b3772505ad", + 
"hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_biology|5": { + "hashes": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "24423acb928db768", + "hash_cont_tokens": "74217a4e2868536f" + }, + "truncated": 0, + "non-truncated": 1240, + "padded": 1240, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hashes": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "831ff35c474e5cef", + "hash_cont_tokens": "bf39544be0ebf000" + }, + "truncated": 0, + "non-truncated": 812, + "padded": 812, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hashes": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "a20a96b44dcc5b30", + "hash_cont_tokens": "43570b3948564b64" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hashes": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "5002f4ac8b1562ca", + "hash_cont_tokens": "674fc454bdc5ac93" + }, + "truncated": 0, + "non-truncated": 660, + "padded": 656, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_geography|5": { + "hashes": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "7c5547c7da5bc793", + "hash_cont_tokens": "03a5012b916274ea" + }, + "truncated": 0, + "non-truncated": 792, + "padded": 792, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hashes": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "f62991cb6a496b05", + "hash_cont_tokens": "50ab225c2f535210" + }, + "truncated": 0, + "non-truncated": 772, + "padded": 772, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hashes": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "4cef2aff6e3d59ed", + "hash_cont_tokens": "c583432ad27fcfe0" + }, + "truncated": 0, + "non-truncated": 1560, + "padded": 1560, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hashes": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "6e2577ea4082ed2b", + "hash_cont_tokens": "1194078d4e38c984" + }, + "truncated": 0, + "non-truncated": 1080, + "padded": 1080, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hashes": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "c5fc9aeb1079c8e4", + "hash_cont_tokens": "f47f041de50333b9" + }, + "truncated": 0, + 
"non-truncated": 952, + "padded": 952, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_physics|5": { + "hashes": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "555fc385cffa84ca", + "hash_cont_tokens": "6296151cf7fee15c" + }, + "truncated": 0, + "non-truncated": 604, + "padded": 604, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hashes": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "febd23cbf9973b7f", + "hash_cont_tokens": "a490d3db0ea5935a" + }, + "truncated": 0, + "non-truncated": 2180, + "padded": 2180, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hashes": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "400e55b56ee6fbd7", + "hash_cont_tokens": "6830ef7d0325d7ef" + }, + "truncated": 0, + "non-truncated": 864, + "padded": 864, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hashes": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "c639cce12a46ebad", + "hash_cont_tokens": "cdd0b3dc06d933e5" + }, + "truncated": 0, + "non-truncated": 816, + "padded": 816, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hashes": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "b9762065cce6f3a6", + "hash_cont_tokens": "e0203e3fc1bb0500" + }, + "truncated": 0, + "non-truncated": 948, + "padded": 948, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_aging|5": { + "hashes": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "541a75f071dcf579", + "hash_cont_tokens": "142a4a8a1138a214" + }, + "truncated": 0, + "non-truncated": 892, + "padded": 892, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_sexuality|5": { + "hashes": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "04269e5c5a257dd9", + "hash_cont_tokens": "bc54813e809b796d" + }, + "truncated": 0, + "non-truncated": 524, + "padded": 524, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-international_law|5": { + "hashes": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "d93ba9d9d38e4397", + "hash_cont_tokens": "63435df622d5437b" + }, + "truncated": 0, + "non-truncated": 484, + "padded": 484, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-jurisprudence|5": { + "hashes": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "9eeaccd2698b4f5a", + "hash_cont_tokens": "e3a8cd951b6e3469" + }, + "truncated": 0, + "non-truncated": 432, + "padded": 432, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + 
"harness|hendrycksTest-logical_fallacies|5": { + "hashes": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "b4f08f544f2b7576", + "hash_cont_tokens": "5e6ee2ff0404f23c" + }, + "truncated": 0, + "non-truncated": 652, + "padded": 648, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-machine_learning|5": { + "hashes": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "900c2a51f1174b9f", + "hash_cont_tokens": "c81919424db3b267" + }, + "truncated": 0, + "non-truncated": 448, + "padded": 448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-management|5": { + "hashes": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "6b36efb4689c6eca", + "hash_cont_tokens": "a01d6d39a83c4597" + }, + "truncated": 0, + "non-truncated": 412, + "padded": 412, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-marketing|5": { + "hashes": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "2aaac78a0cfed47a", + "hash_cont_tokens": "6aeaed4d823c98aa" + }, + "truncated": 0, + "non-truncated": 936, + "padded": 936, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-medical_genetics|5": { + "hashes": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "886ca823b41c094a", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-miscellaneous|5": { + "hashes": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "72fd71de7675e7d0", + "hash_cont_tokens": "9b0ab02a64603081" + }, + "truncated": 0, + "non-truncated": 3132, + "padded": 3132, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_disputes|5": { + "hashes": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "f3ca0dd8e7a1eb09", + "hash_cont_tokens": "3b8bbe9108e55ce9" + }, + "truncated": 0, + "non-truncated": 1384, + "padded": 1354, + "non-padded": 30, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hashes": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "3e793631e951f23c", + "hash_cont_tokens": "2eae753a177d5460" + }, + "truncated": 0, + "non-truncated": 3580, + "padded": 3580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-nutrition|5": { + "hashes": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "59753c2144ea93af", + "hash_cont_tokens": "29771089bd3c65c6" + }, + "truncated": 0, + "non-truncated": 1224, + "padded": 1224, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-philosophy|5": { + "hashes": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": 
"bd8d3dbed15a8c34", + "hash_cont_tokens": "9f6ff69d23a48783" + }, + "truncated": 0, + "non-truncated": 1244, + "padded": 1244, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-prehistory|5": { + "hashes": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "3573cd87facbb7c5", + "hash_cont_tokens": "a789a13af22308bf" + }, + "truncated": 0, + "non-truncated": 1296, + "padded": 1296, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_accounting|5": { + "hashes": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "17e721bc1a7cbb47", + "hash_cont_tokens": "5129a9cfb30c5239" + }, + "truncated": 0, + "non-truncated": 1128, + "padded": 1128, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_law|5": { + "hashes": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "c9f7583fff66d361", + "hash_cont_tokens": "2e590029ef41fbcd" + }, + "truncated": 0, + "non-truncated": 6136, + "padded": 6136, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_medicine|5": { + "hashes": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "40a933f829116f8d", + "hash_cont_tokens": "cd82e108370cece8" + }, + "truncated": 0, + "non-truncated": 1088, + "padded": 1088, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_psychology|5": { + "hashes": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "0dfb73a8eb3f692c", + "hash_cont_tokens": "61ef0c8a87f9c92d" + }, + "truncated": 0, + "non-truncated": 2448, + "padded": 2448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-public_relations|5": { + "hashes": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "1710c6ba4c9f3cbd", + "hash_cont_tokens": "568f585a259965c1" + }, + "truncated": 0, + "non-truncated": 440, + "padded": 440, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-security_studies|5": { + "hashes": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "32a03f1f22a6e103", + "hash_cont_tokens": "d70cfe096d4fb7bd" + }, + "truncated": 0, + "non-truncated": 980, + "padded": 980, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-sociology|5": { + "hashes": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "828999f7624cbe7e", + "hash_cont_tokens": "c3a3bdfd177eed5b" + }, + "truncated": 0, + "non-truncated": 804, + "padded": 804, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hashes": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "42054621e718dbee", + "hash_cont_tokens": "2568d0e8e36fa959" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 
0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-virology|5": { + "hashes": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "6c4f0aa4dc859c04", + "hash_cont_tokens": "c178cccd753d9bc5" + }, + "truncated": 0, + "non-truncated": 664, + "padded": 664, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-world_religions|5": { + "hashes": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "6c75d44e092ff24f", + "hash_cont_tokens": "0a3a3ea5ef49d19c" + }, + "truncated": 0, + "non-truncated": 684, + "padded": 684, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|truthfulqa:mc|0": { + "hashes": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "2738d7ed7075faa7", + "hash_cont_tokens": "6d1691881e252df0" + }, + "truncated": 0, + "non-truncated": 9996, + "padded": 9996, + "non-padded": 0, + "effective_few_shots": 0.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "d84d18e9a963753d", + "hash_full_prompts": "12b540783521a8e6", + "hash_input_tokens": "5c73a7dce6ccf737", + "hash_cont_tokens": "f4b7b7f3a2788768" + }, + "total_evaluation_time_secondes": "43684.17815852165", + "truncated": 0, + "non-truncated": 111019, + "padded": 110926, + "non-padded": 93, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/Felladrin/TinyMistral-248M-SFT-v3/results_2023-12-04T18-03-12.401261.json b/eval-results/Felladrin/TinyMistral-248M-SFT-v3/results_2023-12-04T18-03-12.401261.json new file mode 100644 index 0000000000000000000000000000000000000000..fe305c7cf2b5b297bdcf943d21804d94b91179f5 --- /dev/null +++ b/eval-results/Felladrin/TinyMistral-248M-SFT-v3/results_2023-12-04T18-03-12.401261.json @@ -0,0 +1,1409 @@ +{ + "config_general": { + "lighteval_sha": "0e4607eff593f6f842aeaa0e5fa6760f58b9d1e9", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "", + "start_time": 164787.700556242, + "end_time": 167858.465019261, + "total_evaluation_time_secondes": "3070.7644630190043", + "model_name": "Felladrin/TinyMistral-248M-SFT-v3", + "model_sha": "7a4787dfed21a432924d24575e6c65a97e1dd98a", + "model_dtype": "torch.float16", + "model_size": "521.06 MB" + }, + "results": { + "harness|arc:challenge|25": { + "acc": 0.19283276450511946, + "acc_stderr": 0.01152905546566333, + "acc_norm": 0.21928327645051193, + "acc_norm_stderr": 0.012091245787615721 + }, + "harness|hellaswag|10": { + "acc": 0.27106154152559253, + "acc_stderr": 0.004435993492583857, + "acc_norm": 0.2826130252937662, + "acc_norm_stderr": 0.004493495872000123 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.21, + "acc_stderr": 0.040936018074033256, + "acc_norm": 0.21, + "acc_norm_stderr": 0.040936018074033256 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.18518518518518517, + "acc_stderr": 0.03355677216313142, + "acc_norm": 0.18518518518518517, + "acc_norm_stderr": 0.03355677216313142 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.17763157894736842, + "acc_stderr": 0.031103182383123398, + "acc_norm": 0.17763157894736842, + "acc_norm_stderr": 0.031103182383123398 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.31, + "acc_stderr": 0.04648231987117316, + 
"acc_norm": 0.31, + "acc_norm_stderr": 0.04648231987117316 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.2188679245283019, + "acc_stderr": 0.025447863825108618, + "acc_norm": 0.2188679245283019, + "acc_norm_stderr": 0.025447863825108618 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.2638888888888889, + "acc_stderr": 0.03685651095897532, + "acc_norm": 0.2638888888888889, + "acc_norm_stderr": 0.03685651095897532 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.2, + "acc_stderr": 0.04020151261036845, + "acc_norm": 0.2, + "acc_norm_stderr": 0.04020151261036845 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.24, + "acc_stderr": 0.04292346959909283, + "acc_norm": 0.24, + "acc_norm_stderr": 0.04292346959909283 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.21, + "acc_stderr": 0.040936018074033256, + "acc_norm": 0.21, + "acc_norm_stderr": 0.040936018074033256 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.2023121387283237, + "acc_stderr": 0.030631145539198813, + "acc_norm": 0.2023121387283237, + "acc_norm_stderr": 0.030631145539198813 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.21568627450980393, + "acc_stderr": 0.04092563958237654, + "acc_norm": 0.21568627450980393, + "acc_norm_stderr": 0.04092563958237654 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.29, + "acc_stderr": 0.045604802157206845, + "acc_norm": 0.29, + "acc_norm_stderr": 0.045604802157206845 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.26382978723404255, + "acc_stderr": 0.028809989854102973, + "acc_norm": 0.26382978723404255, + "acc_norm_stderr": 0.028809989854102973 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.23684210526315788, + "acc_stderr": 0.039994238792813365, + "acc_norm": 0.23684210526315788, + "acc_norm_stderr": 0.039994238792813365 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.2413793103448276, + "acc_stderr": 0.03565998174135302, + "acc_norm": 0.2413793103448276, + "acc_norm_stderr": 0.03565998174135302 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.18783068783068782, + "acc_stderr": 0.0201157341415211, + "acc_norm": 0.18783068783068782, + "acc_norm_stderr": 0.0201157341415211 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.2222222222222222, + "acc_stderr": 0.03718489006818115, + "acc_norm": 0.2222222222222222, + "acc_norm_stderr": 0.03718489006818115 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.18, + "acc_stderr": 0.038612291966536934, + "acc_norm": 0.18, + "acc_norm_stderr": 0.038612291966536934 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.18387096774193548, + "acc_stderr": 0.022037217340267836, + "acc_norm": 0.18387096774193548, + "acc_norm_stderr": 0.022037217340267836 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.16748768472906403, + "acc_stderr": 0.026273086047535418, + "acc_norm": 0.16748768472906403, + "acc_norm_stderr": 0.026273086047535418 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.27, + "acc_stderr": 0.04461960433384741, + "acc_norm": 0.27, + "acc_norm_stderr": 0.04461960433384741 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.21818181818181817, + "acc_stderr": 0.03225078108306289, + "acc_norm": 0.21818181818181817, + "acc_norm_stderr": 0.03225078108306289 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.16666666666666666, + 
"acc_stderr": 0.02655220782821529, + "acc_norm": 0.16666666666666666, + "acc_norm_stderr": 0.02655220782821529 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.19170984455958548, + "acc_stderr": 0.028408953626245296, + "acc_norm": 0.19170984455958548, + "acc_norm_stderr": 0.028408953626245296 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.2205128205128205, + "acc_stderr": 0.021020672680827912, + "acc_norm": 0.2205128205128205, + "acc_norm_stderr": 0.021020672680827912 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.21481481481481482, + "acc_stderr": 0.025040443877000683, + "acc_norm": 0.21481481481481482, + "acc_norm_stderr": 0.025040443877000683 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.21008403361344538, + "acc_stderr": 0.026461398717471874, + "acc_norm": 0.21008403361344538, + "acc_norm_stderr": 0.026461398717471874 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.2052980132450331, + "acc_stderr": 0.03297986648473836, + "acc_norm": 0.2052980132450331, + "acc_norm_stderr": 0.03297986648473836 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.1926605504587156, + "acc_stderr": 0.016909276884936104, + "acc_norm": 0.1926605504587156, + "acc_norm_stderr": 0.016909276884936104 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.1527777777777778, + "acc_stderr": 0.024536326026134224, + "acc_norm": 0.1527777777777778, + "acc_norm_stderr": 0.024536326026134224 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.2549019607843137, + "acc_stderr": 0.030587591351604246, + "acc_norm": 0.2549019607843137, + "acc_norm_stderr": 0.030587591351604246 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.28270042194092826, + "acc_stderr": 0.029312814153955917, + "acc_norm": 0.28270042194092826, + "acc_norm_stderr": 0.029312814153955917 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.31390134529147984, + "acc_stderr": 0.031146796482972465, + "acc_norm": 0.31390134529147984, + "acc_norm_stderr": 0.031146796482972465 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.2748091603053435, + "acc_stderr": 0.03915345408847836, + "acc_norm": 0.2748091603053435, + "acc_norm_stderr": 0.03915345408847836 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.23140495867768596, + "acc_stderr": 0.03849856098794088, + "acc_norm": 0.23140495867768596, + "acc_norm_stderr": 0.03849856098794088 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.2777777777777778, + "acc_stderr": 0.04330043749650743, + "acc_norm": 0.2777777777777778, + "acc_norm_stderr": 0.04330043749650743 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.22085889570552147, + "acc_stderr": 0.032591773927421776, + "acc_norm": 0.22085889570552147, + "acc_norm_stderr": 0.032591773927421776 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.2857142857142857, + "acc_stderr": 0.042878587513404565, + "acc_norm": 0.2857142857142857, + "acc_norm_stderr": 0.042878587513404565 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.17475728155339806, + "acc_stderr": 0.037601780060266224, + "acc_norm": 0.17475728155339806, + "acc_norm_stderr": 0.037601780060266224 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.28205128205128205, + "acc_stderr": 0.029480360549541194, + "acc_norm": 0.28205128205128205, + "acc_norm_stderr": 0.029480360549541194 + }, + 
"harness|hendrycksTest-medical_genetics|5": { + "acc": 0.3, + "acc_stderr": 0.046056618647183814, + "acc_norm": 0.3, + "acc_norm_stderr": 0.046056618647183814 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.2388250319284802, + "acc_stderr": 0.015246803197398682, + "acc_norm": 0.2388250319284802, + "acc_norm_stderr": 0.015246803197398682 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.2543352601156069, + "acc_stderr": 0.023445826276545546, + "acc_norm": 0.2543352601156069, + "acc_norm_stderr": 0.023445826276545546 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.23798882681564246, + "acc_stderr": 0.014242630070574918, + "acc_norm": 0.23798882681564246, + "acc_norm_stderr": 0.014242630070574918 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.2222222222222222, + "acc_stderr": 0.023805186524888146, + "acc_norm": 0.2222222222222222, + "acc_norm_stderr": 0.023805186524888146 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.1864951768488746, + "acc_stderr": 0.022122439772480778, + "acc_norm": 0.1864951768488746, + "acc_norm_stderr": 0.022122439772480778 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.2222222222222222, + "acc_stderr": 0.023132376234543332, + "acc_norm": 0.2222222222222222, + "acc_norm_stderr": 0.023132376234543332 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.22695035460992907, + "acc_stderr": 0.024987106365642973, + "acc_norm": 0.22695035460992907, + "acc_norm_stderr": 0.024987106365642973 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.242503259452412, + "acc_stderr": 0.010946570966348788, + "acc_norm": 0.242503259452412, + "acc_norm_stderr": 0.010946570966348788 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.17647058823529413, + "acc_stderr": 0.023157468308559324, + "acc_norm": 0.17647058823529413, + "acc_norm_stderr": 0.023157468308559324 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.25, + "acc_stderr": 0.01751781884501444, + "acc_norm": 0.25, + "acc_norm_stderr": 0.01751781884501444 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.21818181818181817, + "acc_stderr": 0.03955932861795833, + "acc_norm": 0.21818181818181817, + "acc_norm_stderr": 0.03955932861795833 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.1836734693877551, + "acc_stderr": 0.024789071332007633, + "acc_norm": 0.1836734693877551, + "acc_norm_stderr": 0.024789071332007633 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.23880597014925373, + "acc_stderr": 0.030147775935409217, + "acc_norm": 0.23880597014925373, + "acc_norm_stderr": 0.030147775935409217 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.25, + "acc_stderr": 0.04351941398892446, + "acc_norm": 0.25, + "acc_norm_stderr": 0.04351941398892446 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.26506024096385544, + "acc_stderr": 0.03436024037944967, + "acc_norm": 0.26506024096385544, + "acc_norm_stderr": 0.03436024037944967 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.2982456140350877, + "acc_stderr": 0.03508771929824563, + "acc_norm": 0.2982456140350877, + "acc_norm_stderr": 0.03508771929824563 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.20563035495716034, + "mc1_stderr": 0.014148482219460962, + "mc2": 0.400307198899101, + "mc2_stderr": 0.014941622020470767 + }, + "harness|winogrande|5": { + "acc": 0.5153906866614049, + "acc_stderr": 0.014045826789783656 + }, + "harness|gsm8k|5": { + "acc": 0.0, + "acc_stderr": 0.0 + }, + 
"all": { + "acc": 0.23016202481388653, + "acc_stderr": 0.029832125302523167, + "acc_norm": 0.22987279360507185, + "acc_norm_stderr": 0.03061582219263556, + "mc1": 0.20563035495716034, + "mc1_stderr": 0.014148482219460962, + "mc2": 0.400307198899101, + "mc2_stderr": 0.014941622020470767 + } + }, + "versions": { + "all": 0, + "harness|arc:challenge|25": 0, + "harness|gsm8k|5": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "harness|winogrande|5": 0 + }, + "config_tasks": { + "harness|arc:challenge": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + 
"harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + 
"harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|arc:challenge|25": { + "hashes": { + "hash_examples": "17b0cae357c0259e", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "9bcd0d1d37471713", + "hash_cont_tokens": "289aa98c400841d8" + }, + "truncated": 0, + "non_truncated": 1172, + "padded": 4670, + "non_padded": 17, + "effective_few_shots": 25.0, + "num_truncated_few_shots": 0 + }, + "harness|hellaswag|10": { + "hashes": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "80b8c6d79740318e", + "hash_cont_tokens": "ac460260c3e6efc9" + }, + "truncated": 0, + "non_truncated": 10042, + "padded": 40101, + "non_padded": 67, + "effective_few_shots": 10.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hashes": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "b813d36287c6556c", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-anatomy|5": { + "hashes": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "09dc2380497f7a47", + "hash_cont_tokens": "a52a4f60d98cbe5c" + }, + "truncated": 0, + "non_truncated": 135, + "padded": 540, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-astronomy|5": { + "hashes": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "68ca3220b0fdd1f3", + "hash_cont_tokens": "10f7d8eeba97841d" + }, + "truncated": 0, + "non_truncated": 152, + "padded": 608, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-business_ethics|5": { + "hashes": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "bd14ef1320de241e", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hashes": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "d96186ab98017c43", + "hash_cont_tokens": "edef9975ba9165b5" + }, + "truncated": 0, + "non_truncated": 265, + "padded": 1060, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_biology|5": { + "hashes": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "424136b34e95b200", + "hash_cont_tokens": "0aa103ec6602280b" + }, + "truncated": 0, + "non_truncated": 144, + "padded": 576, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_chemistry|5": { + "hashes": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "8dd8b80e336bbe54", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + 
"harness|hendrycksTest-college_computer_science|5": { + "hashes": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "145d4cef8ca2261d", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_mathematics|5": { + "hashes": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "561995d32d2b25c4", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_medicine|5": { + "hashes": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "6a258a9d4418599c", + "hash_cont_tokens": "1979021dbc698754" + }, + "truncated": 0, + "non_truncated": 173, + "padded": 692, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_physics|5": { + "hashes": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "fa5e0d5b5f97b66a", + "hash_cont_tokens": "7cf7fe2bab00acbd" + }, + "truncated": 0, + "non_truncated": 102, + "padded": 408, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-computer_security|5": { + "hashes": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "07d27397edfae492", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hashes": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "da5e6c3c8eb17da6", + "hash_cont_tokens": "903f64eed2b0d217" + }, + "truncated": 0, + "non_truncated": 235, + "padded": 940, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-econometrics|5": { + "hashes": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "f6ba8e358bdb523e", + "hash_cont_tokens": "721ae6c5302c4bf2" + }, + "truncated": 0, + "non_truncated": 114, + "padded": 456, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hashes": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "b2459da4c5ca8590", + "hash_cont_tokens": "15a738960ed3e587" + }, + "truncated": 0, + "non_truncated": 145, + "padded": 575, + "non_padded": 5, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hashes": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "0b969d9ad706a13a", + "hash_cont_tokens": "c96470462fc71683" + }, + "truncated": 0, + "non_truncated": 378, + "padded": 1512, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-formal_logic|5": { + "hashes": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + 
"hash_input_tokens": "02bc3eb5f90da86e", + "hash_cont_tokens": "0e1ce025c9d6ee7e" + }, + "truncated": 0, + "non_truncated": 126, + "padded": 504, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-global_facts|5": { + "hashes": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "3d5106918bcbeb43", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_biology|5": { + "hashes": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "7b089392db2dabbd", + "hash_cont_tokens": "e34d57f7d3c4ca16" + }, + "truncated": 0, + "non_truncated": 310, + "padded": 1240, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hashes": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "ba90b2ffed1c067d", + "hash_cont_tokens": "e8482d44df4b3740" + }, + "truncated": 0, + "non_truncated": 203, + "padded": 812, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hashes": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "60eeec309ef0717f", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hashes": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "5e5e8bf3808e0ead", + "hash_cont_tokens": "d63e679a49418339" + }, + "truncated": 0, + "non_truncated": 165, + "padded": 656, + "non_padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_geography|5": { + "hashes": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "4da9b741d4e7ea78", + "hash_cont_tokens": "d78483e286d06f1a" + }, + "truncated": 0, + "non_truncated": 198, + "padded": 792, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hashes": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "acb4bc872ac86ed7", + "hash_cont_tokens": "691cdff71ff5fe57" + }, + "truncated": 0, + "non_truncated": 193, + "padded": 772, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hashes": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "840fc6403eb69ab0", + "hash_cont_tokens": "d5ad4c5bdca967ad" + }, + "truncated": 0, + "non_truncated": 390, + "padded": 1560, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hashes": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "3629a7f2cd17faeb", + "hash_cont_tokens": "8f631ca5687dd0d4" + }, + 
"truncated": 0, + "non_truncated": 270, + "padded": 1080, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hashes": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "6846f684260e3997", + "hash_cont_tokens": "7321048a28451473" + }, + "truncated": 0, + "non_truncated": 238, + "padded": 952, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_physics|5": { + "hashes": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "85aee25d6bdad94a", + "hash_cont_tokens": "bb137581f269861c" + }, + "truncated": 0, + "non_truncated": 151, + "padded": 604, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hashes": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "290b66d6d666a35f", + "hash_cont_tokens": "b455cab2675bd863" + }, + "truncated": 0, + "non_truncated": 545, + "padded": 2180, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hashes": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "a77a7668b437bc82", + "hash_cont_tokens": "1b3196fec7e58037" + }, + "truncated": 0, + "non_truncated": 216, + "padded": 864, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hashes": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "63548c7fa9ba7a78", + "hash_cont_tokens": "a331dedc2aa01b3e" + }, + "truncated": 0, + "non_truncated": 204, + "padded": 816, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hashes": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "83c5da18bfa50812", + "hash_cont_tokens": "d0fbe030b8c8c2bf" + }, + "truncated": 0, + "non_truncated": 237, + "padded": 948, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_aging|5": { + "hashes": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "bebbd11f22006685", + "hash_cont_tokens": "1dd29c3755494850" + }, + "truncated": 0, + "non_truncated": 223, + "padded": 892, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_sexuality|5": { + "hashes": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "7b85ee9b8ee54f4f", + "hash_cont_tokens": "c85573f663c10691" + }, + "truncated": 0, + "non_truncated": 131, + "padded": 524, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-international_law|5": { + "hashes": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "7bfc55ab7065943e", + "hash_cont_tokens": "d263804ba918154f" + }, + "truncated": 0, + "non_truncated": 121, + "padded": 484, + "non_padded": 0, + "effective_few_shots": 5.0, + 
"num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-jurisprudence|5": { + "hashes": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "69573f1675e053c6", + "hash_cont_tokens": "581986691a84ece8" + }, + "truncated": 0, + "non_truncated": 108, + "padded": 432, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hashes": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "552324ef20094bdc", + "hash_cont_tokens": "55a858b28bbda458" + }, + "truncated": 0, + "non_truncated": 163, + "padded": 652, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-machine_learning|5": { + "hashes": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "96449357a7318905", + "hash_cont_tokens": "e99d3d3efd4ac7a3" + }, + "truncated": 0, + "non_truncated": 112, + "padded": 448, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-management|5": { + "hashes": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "3b849249168e3b88", + "hash_cont_tokens": "13d9dc56bca34726" + }, + "truncated": 0, + "non_truncated": 103, + "padded": 412, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-marketing|5": { + "hashes": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "af0e186f2756b70d", + "hash_cont_tokens": "2700ea26933916a2" + }, + "truncated": 0, + "non_truncated": 234, + "padded": 936, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-medical_genetics|5": { + "hashes": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "9f6a6de16509b6d9", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-miscellaneous|5": { + "hashes": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "9194406d589f7c10", + "hash_cont_tokens": "7bf4341c79587250" + }, + "truncated": 0, + "non_truncated": 783, + "padded": 3132, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_disputes|5": { + "hashes": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "769486efc74d9f8e", + "hash_cont_tokens": "38a48e9de6976f00" + }, + "truncated": 0, + "non_truncated": 346, + "padded": 1384, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hashes": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "a90fd4dd90959dad", + "hash_cont_tokens": "761c4dc187689d89" + }, + "truncated": 0, + "non_truncated": 895, + "padded": 3580, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-nutrition|5": { + "hashes": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + 
"hash_input_tokens": "1a3b843e66efd29b", + "hash_cont_tokens": "65005bd7d6f6012a" + }, + "truncated": 0, + "non_truncated": 306, + "padded": 1224, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-philosophy|5": { + "hashes": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "09820001a3d00013", + "hash_cont_tokens": "0b47934fb6314dec" + }, + "truncated": 0, + "non_truncated": 311, + "padded": 1244, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-prehistory|5": { + "hashes": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "7c4ec364ce2768c7", + "hash_cont_tokens": "3f20acd855ee0a29" + }, + "truncated": 0, + "non_truncated": 324, + "padded": 1296, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_accounting|5": { + "hashes": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "ced0534574d0ae3f", + "hash_cont_tokens": "8f122ba881355d4b" + }, + "truncated": 0, + "non_truncated": 282, + "padded": 1128, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_law|5": { + "hashes": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "bcbdbbde22ec73e3", + "hash_cont_tokens": "90d5df417c4d3fd3" + }, + "truncated": 0, + "non_truncated": 1534, + "padded": 6136, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_medicine|5": { + "hashes": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "c54d753563114d45", + "hash_cont_tokens": "4a2d2988884f7f70" + }, + "truncated": 0, + "non_truncated": 272, + "padded": 1088, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_psychology|5": { + "hashes": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "b75dc55c0e32fa52", + "hash_cont_tokens": "e0a952cb8a9c81de" + }, + "truncated": 0, + "non_truncated": 612, + "padded": 2448, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-public_relations|5": { + "hashes": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "5ccdc8ec8db99622", + "hash_cont_tokens": "1fa77a8dff3922b8" + }, + "truncated": 0, + "non_truncated": 110, + "padded": 440, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-security_studies|5": { + "hashes": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "ca8497342e5b1d57", + "hash_cont_tokens": "81fc9cb3cbdd52db" + }, + "truncated": 0, + "non_truncated": 245, + "padded": 980, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-sociology|5": { + "hashes": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "069c76424fbd3dab", + "hash_cont_tokens": "2a0493252ed2cf43" + }, + "truncated": 0, + "non_truncated": 201, + "padded": 804, + 
"non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hashes": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "a7e393a626169576", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-virology|5": { + "hashes": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "bf99dc973e3a650d", + "hash_cont_tokens": "5ab892d003b00c98" + }, + "truncated": 0, + "non_truncated": 166, + "padded": 664, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-world_religions|5": { + "hashes": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "1761cfaf21797065", + "hash_cont_tokens": "15a5e5dbdfbb8568" + }, + "truncated": 0, + "non_truncated": 171, + "padded": 684, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|truthfulqa:mc|0": { + "hashes": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "298b43914bbdf4ca", + "hash_cont_tokens": "5a8d4bb398b1c3c0" + }, + "truncated": 0, + "non_truncated": 817, + "padded": 9996, + "non_padded": 0, + "effective_few_shots": 0.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "31aa3477d959f771", + "hash_cont_tokens": "618558fb93c0f288" + }, + "truncated": 0, + "non_truncated": 1267, + "padded": 2534, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "6af0ae8cfe684f50", + "hash_cont_tokens": "0970ef5d16bb5e0d" + }, + "truncated": 0, + "non_truncated": 1319, + "padded": 0, + "non_padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "3b7fa57a057f9415", + "hash_full_prompts": "63615fc50fc9417c", + "hash_input_tokens": "9c04e828ae29cacc", + "hash_cont_tokens": "12ddcbcce5e0893d" + }, + "truncated": 0, + "non_truncated": 28659, + "padded": 113460, + "non_padded": 1412, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/Felladrin/TinyMistral-248M-SFT-v4/results_2023-12-12T04-15-32.627780.json b/eval-results/Felladrin/TinyMistral-248M-SFT-v4/results_2023-12-12T04-15-32.627780.json new file mode 100644 index 0000000000000000000000000000000000000000..5eaccb9d39f02c95cb1a6bbf5e1f46957839f43d --- /dev/null +++ b/eval-results/Felladrin/TinyMistral-248M-SFT-v4/results_2023-12-12T04-15-32.627780.json @@ -0,0 +1,1409 @@ +{ + "config_general": { + "lighteval_sha": "0e4607eff593f6f842aeaa0e5fa6760f58b9d1e9", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "", + "start_time": 806891.911917729, + "end_time": 809401.754238324, + "total_evaluation_time_secondes": "2509.842320595053", + "model_name": "Felladrin/TinyMistral-248M-SFT-v4", + "model_sha": "ec0ff201527cd9b50eb9b4fc754d6c08f1242ea1", + "model_dtype": "torch.float16", + 
"model_size": "521.06 MB" + }, + "results": { + "harness|arc:challenge|25": { + "acc": 0.2022184300341297, + "acc_stderr": 0.011737454431872104, + "acc_norm": 0.24914675767918087, + "acc_norm_stderr": 0.012639407111926435 + }, + "harness|hellaswag|10": { + "acc": 0.2742481577375025, + "acc_stderr": 0.004452228541043549, + "acc_norm": 0.2815176259709221, + "acc_norm_stderr": 0.004488201756642581 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.25, + "acc_stderr": 0.04351941398892446, + "acc_norm": 0.25, + "acc_norm_stderr": 0.04351941398892446 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.2, + "acc_stderr": 0.034554737023254366, + "acc_norm": 0.2, + "acc_norm_stderr": 0.034554737023254366 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.2894736842105263, + "acc_stderr": 0.036906779861372814, + "acc_norm": 0.2894736842105263, + "acc_norm_stderr": 0.036906779861372814 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.24, + "acc_stderr": 0.04292346959909284, + "acc_norm": 0.24, + "acc_norm_stderr": 0.04292346959909284 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.2528301886792453, + "acc_stderr": 0.02674989977124124, + "acc_norm": 0.2528301886792453, + "acc_norm_stderr": 0.02674989977124124 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.22916666666666666, + "acc_stderr": 0.035146974678623884, + "acc_norm": 0.22916666666666666, + "acc_norm_stderr": 0.035146974678623884 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.29, + "acc_stderr": 0.045604802157206845, + "acc_norm": 0.29, + "acc_norm_stderr": 0.045604802157206845 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.24, + "acc_stderr": 0.04292346959909282, + "acc_norm": 0.24, + "acc_norm_stderr": 0.04292346959909282 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.3, + "acc_stderr": 0.046056618647183814, + "acc_norm": 0.3, + "acc_norm_stderr": 0.046056618647183814 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.27167630057803466, + "acc_stderr": 0.03391750322321659, + "acc_norm": 0.27167630057803466, + "acc_norm_stderr": 0.03391750322321659 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.2647058823529412, + "acc_stderr": 0.04389869956808778, + "acc_norm": 0.2647058823529412, + "acc_norm_stderr": 0.04389869956808778 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.21, + "acc_stderr": 0.04093601807403326, + "acc_norm": 0.21, + "acc_norm_stderr": 0.04093601807403326 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.30638297872340425, + "acc_stderr": 0.03013590647851756, + "acc_norm": 0.30638297872340425, + "acc_norm_stderr": 0.03013590647851756 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.24561403508771928, + "acc_stderr": 0.04049339297748141, + "acc_norm": 0.24561403508771928, + "acc_norm_stderr": 0.04049339297748141 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.22758620689655173, + "acc_stderr": 0.03493950380131184, + "acc_norm": 0.22758620689655173, + "acc_norm_stderr": 0.03493950380131184 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.2724867724867725, + "acc_stderr": 0.02293097307163335, + "acc_norm": 0.2724867724867725, + "acc_norm_stderr": 0.02293097307163335 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.2857142857142857, + "acc_stderr": 0.04040610178208841, + "acc_norm": 0.2857142857142857, + "acc_norm_stderr": 0.04040610178208841 + }, + 
"harness|hendrycksTest-global_facts|5": { + "acc": 0.31, + "acc_stderr": 0.04648231987117316, + "acc_norm": 0.31, + "acc_norm_stderr": 0.04648231987117316 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.2806451612903226, + "acc_stderr": 0.025560604721022902, + "acc_norm": 0.2806451612903226, + "acc_norm_stderr": 0.025560604721022902 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.2955665024630542, + "acc_stderr": 0.03210494433751458, + "acc_norm": 0.2955665024630542, + "acc_norm_stderr": 0.03210494433751458 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.22, + "acc_stderr": 0.04163331998932269, + "acc_norm": 0.22, + "acc_norm_stderr": 0.04163331998932269 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.23030303030303031, + "acc_stderr": 0.0328766675860349, + "acc_norm": 0.23030303030303031, + "acc_norm_stderr": 0.0328766675860349 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.29797979797979796, + "acc_stderr": 0.03258630383836557, + "acc_norm": 0.29797979797979796, + "acc_norm_stderr": 0.03258630383836557 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.39378238341968913, + "acc_stderr": 0.03526077095548236, + "acc_norm": 0.39378238341968913, + "acc_norm_stderr": 0.03526077095548236 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.31794871794871793, + "acc_stderr": 0.02361088430892786, + "acc_norm": 0.31794871794871793, + "acc_norm_stderr": 0.02361088430892786 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.2962962962962963, + "acc_stderr": 0.027840811495871937, + "acc_norm": 0.2962962962962963, + "acc_norm_stderr": 0.027840811495871937 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.33613445378151263, + "acc_stderr": 0.030684737115135363, + "acc_norm": 0.33613445378151263, + "acc_norm_stderr": 0.030684737115135363 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.25165562913907286, + "acc_stderr": 0.035433042343899844, + "acc_norm": 0.25165562913907286, + "acc_norm_stderr": 0.035433042343899844 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.26238532110091745, + "acc_stderr": 0.01886188502153473, + "acc_norm": 0.26238532110091745, + "acc_norm_stderr": 0.01886188502153473 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.39814814814814814, + "acc_stderr": 0.03338473403207401, + "acc_norm": 0.39814814814814814, + "acc_norm_stderr": 0.03338473403207401 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.29901960784313725, + "acc_stderr": 0.03213325717373617, + "acc_norm": 0.29901960784313725, + "acc_norm_stderr": 0.03213325717373617 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.21518987341772153, + "acc_stderr": 0.02675082699467618, + "acc_norm": 0.21518987341772153, + "acc_norm_stderr": 0.02675082699467618 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.17040358744394618, + "acc_stderr": 0.025234593447136165, + "acc_norm": 0.17040358744394618, + "acc_norm_stderr": 0.025234593447136165 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.19083969465648856, + "acc_stderr": 0.03446513350752599, + "acc_norm": 0.19083969465648856, + "acc_norm_stderr": 0.03446513350752599 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.09917355371900827, + "acc_stderr": 0.02728524631275895, + "acc_norm": 0.09917355371900827, + "acc_norm_stderr": 
0.02728524631275895 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.21296296296296297, + "acc_stderr": 0.0395783547198098, + "acc_norm": 0.21296296296296297, + "acc_norm_stderr": 0.0395783547198098 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.26380368098159507, + "acc_stderr": 0.03462419931615623, + "acc_norm": 0.26380368098159507, + "acc_norm_stderr": 0.03462419931615623 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.29464285714285715, + "acc_stderr": 0.043270409325787296, + "acc_norm": 0.29464285714285715, + "acc_norm_stderr": 0.043270409325787296 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.2912621359223301, + "acc_stderr": 0.044986763205729224, + "acc_norm": 0.2912621359223301, + "acc_norm_stderr": 0.044986763205729224 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.23076923076923078, + "acc_stderr": 0.02760192138141759, + "acc_norm": 0.23076923076923078, + "acc_norm_stderr": 0.02760192138141759 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.26, + "acc_stderr": 0.04408440022768078, + "acc_norm": 0.26, + "acc_norm_stderr": 0.04408440022768078 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.27330779054916987, + "acc_stderr": 0.015936681062628556, + "acc_norm": 0.27330779054916987, + "acc_norm_stderr": 0.015936681062628556 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.21965317919075145, + "acc_stderr": 0.022289638852617904, + "acc_norm": 0.21965317919075145, + "acc_norm_stderr": 0.022289638852617904 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.2558659217877095, + "acc_stderr": 0.014593620923210756, + "acc_norm": 0.2558659217877095, + "acc_norm_stderr": 0.014593620923210756 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.26143790849673204, + "acc_stderr": 0.025160998214292456, + "acc_norm": 0.26143790849673204, + "acc_norm_stderr": 0.025160998214292456 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.24115755627009647, + "acc_stderr": 0.02429659403476343, + "acc_norm": 0.24115755627009647, + "acc_norm_stderr": 0.02429659403476343 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.24382716049382716, + "acc_stderr": 0.0238918795419596, + "acc_norm": 0.24382716049382716, + "acc_norm_stderr": 0.0238918795419596 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.23404255319148937, + "acc_stderr": 0.025257861359432403, + "acc_norm": 0.23404255319148937, + "acc_norm_stderr": 0.025257861359432403 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.24837027379400262, + "acc_stderr": 0.011035212598034503, + "acc_norm": 0.24837027379400262, + "acc_norm_stderr": 0.011035212598034503 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.4007352941176471, + "acc_stderr": 0.029768263528933105, + "acc_norm": 0.4007352941176471, + "acc_norm_stderr": 0.029768263528933105 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.25980392156862747, + "acc_stderr": 0.017740899509177798, + "acc_norm": 0.25980392156862747, + "acc_norm_stderr": 0.017740899509177798 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.22727272727272727, + "acc_stderr": 0.04013964554072775, + "acc_norm": 0.22727272727272727, + "acc_norm_stderr": 0.04013964554072775 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.33877551020408164, + "acc_stderr": 0.030299506562154185, + "acc_norm": 0.33877551020408164, + "acc_norm_stderr": 0.030299506562154185 + }, + "harness|hendrycksTest-sociology|5": { 
+ "acc": 0.27860696517412936, + "acc_stderr": 0.03170056183497308, + "acc_norm": 0.27860696517412936, + "acc_norm_stderr": 0.03170056183497308 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.17, + "acc_stderr": 0.03775251680686371, + "acc_norm": 0.17, + "acc_norm_stderr": 0.03775251680686371 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.1686746987951807, + "acc_stderr": 0.029152009627856544, + "acc_norm": 0.1686746987951807, + "acc_norm_stderr": 0.029152009627856544 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.22807017543859648, + "acc_stderr": 0.03218093795602357, + "acc_norm": 0.22807017543859648, + "acc_norm_stderr": 0.03218093795602357 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.20807833537331702, + "mc1_stderr": 0.014210503473576618, + "mc2": 0.3956118679297354, + "mc2_stderr": 0.01494264576082401 + }, + "harness|winogrande|5": { + "acc": 0.505130228887135, + "acc_stderr": 0.014051745961790516 + }, + "harness|gsm8k|5": { + "acc": 0.0, + "acc_stderr": 0.0 + }, + "all": { + "acc": 0.25943849313327083, + "acc_stderr": 0.03081669921999169, + "acc_norm": 0.26059009573086195, + "acc_norm_stderr": 0.03163906495514162, + "mc1": 0.20807833537331702, + "mc1_stderr": 0.014210503473576618, + "mc2": 0.3956118679297354, + "mc2_stderr": 0.01494264576082401 + } + }, + "versions": { + "all": 0, + "harness|arc:challenge|25": 0, + "harness|gsm8k|5": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + 
"harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "harness|winogrande|5": 0 + }, + "config_tasks": { + "harness|arc:challenge": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + 
"harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|arc:challenge|25": { + "hashes": { + "hash_examples": "17b0cae357c0259e", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "9bcd0d1d37471713", + "hash_cont_tokens": "289aa98c400841d8" + }, + "truncated": 0, + "non_truncated": 1172, + "padded": 4670, + "non_padded": 17, + "effective_few_shots": 25.0, + "num_truncated_few_shots": 0 + }, + "harness|hellaswag|10": { + "hashes": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "80b8c6d79740318e", + "hash_cont_tokens": "ac460260c3e6efc9" + }, + "truncated": 0, + "non_truncated": 10042, + "padded": 40101, + "non_padded": 67, + "effective_few_shots": 10.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hashes": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "b813d36287c6556c", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-anatomy|5": { + "hashes": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "09dc2380497f7a47", + "hash_cont_tokens": "a52a4f60d98cbe5c" + }, + "truncated": 0, + "non_truncated": 135, + "padded": 540, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-astronomy|5": { + "hashes": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "68ca3220b0fdd1f3", + "hash_cont_tokens": "10f7d8eeba97841d" + }, + "truncated": 0, + "non_truncated": 152, + "padded": 608, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-business_ethics|5": { + "hashes": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "bd14ef1320de241e", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hashes": { + "hash_examples": "f3366dbe7eefffa4", + 
"hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "d96186ab98017c43", + "hash_cont_tokens": "edef9975ba9165b5" + }, + "truncated": 0, + "non_truncated": 265, + "padded": 1060, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_biology|5": { + "hashes": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "424136b34e95b200", + "hash_cont_tokens": "0aa103ec6602280b" + }, + "truncated": 0, + "non_truncated": 144, + "padded": 576, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_chemistry|5": { + "hashes": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "8dd8b80e336bbe54", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_computer_science|5": { + "hashes": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "145d4cef8ca2261d", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_mathematics|5": { + "hashes": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "561995d32d2b25c4", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_medicine|5": { + "hashes": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "6a258a9d4418599c", + "hash_cont_tokens": "1979021dbc698754" + }, + "truncated": 0, + "non_truncated": 173, + "padded": 692, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_physics|5": { + "hashes": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "fa5e0d5b5f97b66a", + "hash_cont_tokens": "7cf7fe2bab00acbd" + }, + "truncated": 0, + "non_truncated": 102, + "padded": 408, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-computer_security|5": { + "hashes": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "07d27397edfae492", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hashes": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "da5e6c3c8eb17da6", + "hash_cont_tokens": "903f64eed2b0d217" + }, + "truncated": 0, + "non_truncated": 235, + "padded": 940, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-econometrics|5": { + "hashes": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "f6ba8e358bdb523e", + "hash_cont_tokens": "721ae6c5302c4bf2" + }, + "truncated": 0, + 
"non_truncated": 114, + "padded": 456, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hashes": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "b2459da4c5ca8590", + "hash_cont_tokens": "15a738960ed3e587" + }, + "truncated": 0, + "non_truncated": 145, + "padded": 575, + "non_padded": 5, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hashes": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "0b969d9ad706a13a", + "hash_cont_tokens": "c96470462fc71683" + }, + "truncated": 0, + "non_truncated": 378, + "padded": 1512, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-formal_logic|5": { + "hashes": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "02bc3eb5f90da86e", + "hash_cont_tokens": "0e1ce025c9d6ee7e" + }, + "truncated": 0, + "non_truncated": 126, + "padded": 504, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-global_facts|5": { + "hashes": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "3d5106918bcbeb43", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_biology|5": { + "hashes": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "7b089392db2dabbd", + "hash_cont_tokens": "e34d57f7d3c4ca16" + }, + "truncated": 0, + "non_truncated": 310, + "padded": 1240, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hashes": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "ba90b2ffed1c067d", + "hash_cont_tokens": "e8482d44df4b3740" + }, + "truncated": 0, + "non_truncated": 203, + "padded": 812, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hashes": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "60eeec309ef0717f", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hashes": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "5e5e8bf3808e0ead", + "hash_cont_tokens": "d63e679a49418339" + }, + "truncated": 0, + "non_truncated": 165, + "padded": 656, + "non_padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_geography|5": { + "hashes": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "4da9b741d4e7ea78", + "hash_cont_tokens": "d78483e286d06f1a" + }, + "truncated": 0, + "non_truncated": 198, + "padded": 792, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 
+ }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hashes": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "acb4bc872ac86ed7", + "hash_cont_tokens": "691cdff71ff5fe57" + }, + "truncated": 0, + "non_truncated": 193, + "padded": 772, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hashes": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "840fc6403eb69ab0", + "hash_cont_tokens": "d5ad4c5bdca967ad" + }, + "truncated": 0, + "non_truncated": 390, + "padded": 1560, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hashes": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "3629a7f2cd17faeb", + "hash_cont_tokens": "8f631ca5687dd0d4" + }, + "truncated": 0, + "non_truncated": 270, + "padded": 1080, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hashes": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "6846f684260e3997", + "hash_cont_tokens": "7321048a28451473" + }, + "truncated": 0, + "non_truncated": 238, + "padded": 952, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_physics|5": { + "hashes": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "85aee25d6bdad94a", + "hash_cont_tokens": "bb137581f269861c" + }, + "truncated": 0, + "non_truncated": 151, + "padded": 604, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hashes": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "290b66d6d666a35f", + "hash_cont_tokens": "b455cab2675bd863" + }, + "truncated": 0, + "non_truncated": 545, + "padded": 2180, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hashes": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "a77a7668b437bc82", + "hash_cont_tokens": "1b3196fec7e58037" + }, + "truncated": 0, + "non_truncated": 216, + "padded": 864, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hashes": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "63548c7fa9ba7a78", + "hash_cont_tokens": "a331dedc2aa01b3e" + }, + "truncated": 0, + "non_truncated": 204, + "padded": 816, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hashes": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "83c5da18bfa50812", + "hash_cont_tokens": "d0fbe030b8c8c2bf" + }, + "truncated": 0, + "non_truncated": 237, + "padded": 948, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_aging|5": { + "hashes": { + "hash_examples": 
"1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "bebbd11f22006685", + "hash_cont_tokens": "1dd29c3755494850" + }, + "truncated": 0, + "non_truncated": 223, + "padded": 892, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_sexuality|5": { + "hashes": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "7b85ee9b8ee54f4f", + "hash_cont_tokens": "c85573f663c10691" + }, + "truncated": 0, + "non_truncated": 131, + "padded": 524, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-international_law|5": { + "hashes": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "7bfc55ab7065943e", + "hash_cont_tokens": "d263804ba918154f" + }, + "truncated": 0, + "non_truncated": 121, + "padded": 484, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-jurisprudence|5": { + "hashes": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "69573f1675e053c6", + "hash_cont_tokens": "581986691a84ece8" + }, + "truncated": 0, + "non_truncated": 108, + "padded": 432, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hashes": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "552324ef20094bdc", + "hash_cont_tokens": "55a858b28bbda458" + }, + "truncated": 0, + "non_truncated": 163, + "padded": 652, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-machine_learning|5": { + "hashes": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "96449357a7318905", + "hash_cont_tokens": "e99d3d3efd4ac7a3" + }, + "truncated": 0, + "non_truncated": 112, + "padded": 448, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-management|5": { + "hashes": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "3b849249168e3b88", + "hash_cont_tokens": "13d9dc56bca34726" + }, + "truncated": 0, + "non_truncated": 103, + "padded": 412, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-marketing|5": { + "hashes": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "af0e186f2756b70d", + "hash_cont_tokens": "2700ea26933916a2" + }, + "truncated": 0, + "non_truncated": 234, + "padded": 936, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-medical_genetics|5": { + "hashes": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "9f6a6de16509b6d9", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-miscellaneous|5": { + "hashes": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "9194406d589f7c10", + "hash_cont_tokens": "7bf4341c79587250" + }, + "truncated": 0, + 
"non_truncated": 783, + "padded": 3132, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_disputes|5": { + "hashes": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "769486efc74d9f8e", + "hash_cont_tokens": "38a48e9de6976f00" + }, + "truncated": 0, + "non_truncated": 346, + "padded": 1384, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hashes": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "a90fd4dd90959dad", + "hash_cont_tokens": "761c4dc187689d89" + }, + "truncated": 0, + "non_truncated": 895, + "padded": 3580, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-nutrition|5": { + "hashes": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "1a3b843e66efd29b", + "hash_cont_tokens": "65005bd7d6f6012a" + }, + "truncated": 0, + "non_truncated": 306, + "padded": 1224, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-philosophy|5": { + "hashes": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "09820001a3d00013", + "hash_cont_tokens": "0b47934fb6314dec" + }, + "truncated": 0, + "non_truncated": 311, + "padded": 1244, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-prehistory|5": { + "hashes": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "7c4ec364ce2768c7", + "hash_cont_tokens": "3f20acd855ee0a29" + }, + "truncated": 0, + "non_truncated": 324, + "padded": 1296, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_accounting|5": { + "hashes": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "ced0534574d0ae3f", + "hash_cont_tokens": "8f122ba881355d4b" + }, + "truncated": 0, + "non_truncated": 282, + "padded": 1128, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_law|5": { + "hashes": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "bcbdbbde22ec73e3", + "hash_cont_tokens": "90d5df417c4d3fd3" + }, + "truncated": 0, + "non_truncated": 1534, + "padded": 6136, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_medicine|5": { + "hashes": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "c54d753563114d45", + "hash_cont_tokens": "4a2d2988884f7f70" + }, + "truncated": 0, + "non_truncated": 272, + "padded": 1088, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_psychology|5": { + "hashes": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "b75dc55c0e32fa52", + "hash_cont_tokens": "e0a952cb8a9c81de" + }, + "truncated": 0, + "non_truncated": 612, + "padded": 2448, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + 
"harness|hendrycksTest-public_relations|5": { + "hashes": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "5ccdc8ec8db99622", + "hash_cont_tokens": "1fa77a8dff3922b8" + }, + "truncated": 0, + "non_truncated": 110, + "padded": 440, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-security_studies|5": { + "hashes": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "ca8497342e5b1d57", + "hash_cont_tokens": "81fc9cb3cbdd52db" + }, + "truncated": 0, + "non_truncated": 245, + "padded": 980, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-sociology|5": { + "hashes": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "069c76424fbd3dab", + "hash_cont_tokens": "2a0493252ed2cf43" + }, + "truncated": 0, + "non_truncated": 201, + "padded": 804, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hashes": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "a7e393a626169576", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-virology|5": { + "hashes": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "bf99dc973e3a650d", + "hash_cont_tokens": "5ab892d003b00c98" + }, + "truncated": 0, + "non_truncated": 166, + "padded": 664, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-world_religions|5": { + "hashes": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "1761cfaf21797065", + "hash_cont_tokens": "15a5e5dbdfbb8568" + }, + "truncated": 0, + "non_truncated": 171, + "padded": 684, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|truthfulqa:mc|0": { + "hashes": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "298b43914bbdf4ca", + "hash_cont_tokens": "5a8d4bb398b1c3c0" + }, + "truncated": 0, + "non_truncated": 817, + "padded": 9996, + "non_padded": 0, + "effective_few_shots": 0.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "31aa3477d959f771", + "hash_cont_tokens": "618558fb93c0f288" + }, + "truncated": 0, + "non_truncated": 1267, + "padded": 2534, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "6af0ae8cfe684f50", + "hash_cont_tokens": "ebe00725116700ac" + }, + "truncated": 0, + "non_truncated": 1319, + "padded": 0, + "non_padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "3b7fa57a057f9415", + "hash_full_prompts": "63615fc50fc9417c", + "hash_input_tokens": "9c04e828ae29cacc", + "hash_cont_tokens": "8eabdc21dd4e1a60" + }, + "truncated": 0, + 
"non_truncated": 28659, + "padded": 113460, + "non_padded": 1412, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/GreenNode/GreenNodeLM-7B-v1olet/results_2023-12-13T20-30-07.482326.json b/eval-results/GreenNode/GreenNodeLM-7B-v1olet/results_2023-12-13T20-30-07.482326.json new file mode 100644 index 0000000000000000000000000000000000000000..f8dd010bfa80080b3778f78e5309a6ba941be510 --- /dev/null +++ b/eval-results/GreenNode/GreenNodeLM-7B-v1olet/results_2023-12-13T20-30-07.482326.json @@ -0,0 +1,1409 @@ +{ + "config_general": { + "lighteval_sha": "0e4607eff593f6f842aeaa0e5fa6760f58b9d1e9", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "", + "start_time": 123286.062862314, + "end_time": 130362.696950504, + "total_evaluation_time_secondes": "7076.6340881900105", + "model_name": "GreenNode/GreenNodeLM-7B-v1olet", + "model_sha": "4f0d53e65814390b8a260dd23fe5a30ced239176", + "model_dtype": "torch.float16", + "model_size": "13.99 GB" + }, + "results": { + "harness|arc:challenge|25": { + "acc": 0.7030716723549488, + "acc_stderr": 0.013352025976725223, + "acc_norm": 0.7261092150170648, + "acc_norm_stderr": 0.013032004972989503 + }, + "harness|hellaswag|10": { + "acc": 0.7143995220075682, + "acc_stderr": 0.004507768029590101, + "acc_norm": 0.8770165305715992, + "acc_norm_stderr": 0.0032774703870227257 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.35, + "acc_stderr": 0.047937248544110196, + "acc_norm": 0.35, + "acc_norm_stderr": 0.047937248544110196 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.6148148148148148, + "acc_stderr": 0.04203921040156279, + "acc_norm": 0.6148148148148148, + "acc_norm_stderr": 0.04203921040156279 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.7039473684210527, + "acc_stderr": 0.03715062154998904, + "acc_norm": 0.7039473684210527, + "acc_norm_stderr": 0.03715062154998904 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.6, + "acc_stderr": 0.04923659639173309, + "acc_norm": 0.6, + "acc_norm_stderr": 0.04923659639173309 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.7056603773584905, + "acc_stderr": 0.02804918631569525, + "acc_norm": 0.7056603773584905, + "acc_norm_stderr": 0.02804918631569525 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.7291666666666666, + "acc_stderr": 0.03716177437566017, + "acc_norm": 0.7291666666666666, + "acc_norm_stderr": 0.03716177437566017 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.45, + "acc_stderr": 0.05, + "acc_norm": 0.45, + "acc_norm_stderr": 0.05 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.53, + "acc_stderr": 0.050161355804659205, + "acc_norm": 0.53, + "acc_norm_stderr": 0.050161355804659205 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.3, + "acc_stderr": 0.046056618647183814, + "acc_norm": 0.3, + "acc_norm_stderr": 0.046056618647183814 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.6473988439306358, + "acc_stderr": 0.03643037168958548, + "acc_norm": 0.6473988439306358, + "acc_norm_stderr": 0.03643037168958548 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.38235294117647056, + "acc_stderr": 0.04835503696107224, + "acc_norm": 0.38235294117647056, + "acc_norm_stderr": 0.04835503696107224 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.76, + "acc_stderr": 0.042923469599092816, + "acc_norm": 0.76, + "acc_norm_stderr": 
0.042923469599092816 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.5531914893617021, + "acc_stderr": 0.0325005368436584, + "acc_norm": 0.5531914893617021, + "acc_norm_stderr": 0.0325005368436584 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.5, + "acc_stderr": 0.047036043419179864, + "acc_norm": 0.5, + "acc_norm_stderr": 0.047036043419179864 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.5517241379310345, + "acc_stderr": 0.04144311810878152, + "acc_norm": 0.5517241379310345, + "acc_norm_stderr": 0.04144311810878152 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.3941798941798942, + "acc_stderr": 0.02516798233389414, + "acc_norm": 0.3941798941798942, + "acc_norm_stderr": 0.02516798233389414 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.4523809523809524, + "acc_stderr": 0.04451807959055328, + "acc_norm": 0.4523809523809524, + "acc_norm_stderr": 0.04451807959055328 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.34, + "acc_stderr": 0.04760952285695236, + "acc_norm": 0.34, + "acc_norm_stderr": 0.04760952285695236 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.7967741935483871, + "acc_stderr": 0.022891687984554956, + "acc_norm": 0.7967741935483871, + "acc_norm_stderr": 0.022891687984554956 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.47783251231527096, + "acc_stderr": 0.03514528562175008, + "acc_norm": 0.47783251231527096, + "acc_norm_stderr": 0.03514528562175008 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.69, + "acc_stderr": 0.04648231987117316, + "acc_norm": 0.69, + "acc_norm_stderr": 0.04648231987117316 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.7575757575757576, + "acc_stderr": 0.03346409881055953, + "acc_norm": 0.7575757575757576, + "acc_norm_stderr": 0.03346409881055953 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.7878787878787878, + "acc_stderr": 0.029126522834586808, + "acc_norm": 0.7878787878787878, + "acc_norm_stderr": 0.029126522834586808 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.8860103626943006, + "acc_stderr": 0.022935144053919436, + "acc_norm": 0.8860103626943006, + "acc_norm_stderr": 0.022935144053919436 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.6410256410256411, + "acc_stderr": 0.02432173848460235, + "acc_norm": 0.6410256410256411, + "acc_norm_stderr": 0.02432173848460235 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.32222222222222224, + "acc_stderr": 0.028493465091028593, + "acc_norm": 0.32222222222222224, + "acc_norm_stderr": 0.028493465091028593 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.6302521008403361, + "acc_stderr": 0.03135709599613591, + "acc_norm": 0.6302521008403361, + "acc_norm_stderr": 0.03135709599613591 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.2847682119205298, + "acc_stderr": 0.03684881521389023, + "acc_norm": 0.2847682119205298, + "acc_norm_stderr": 0.03684881521389023 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.8348623853211009, + "acc_stderr": 0.015919557829976037, + "acc_norm": 0.8348623853211009, + "acc_norm_stderr": 0.015919557829976037 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.5, + "acc_stderr": 0.034099716973523674, + "acc_norm": 0.5, + "acc_norm_stderr": 0.034099716973523674 + }, + 
"harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.8186274509803921, + "acc_stderr": 0.02704462171947408, + "acc_norm": 0.8186274509803921, + "acc_norm_stderr": 0.02704462171947408 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.7848101265822784, + "acc_stderr": 0.02675082699467618, + "acc_norm": 0.7848101265822784, + "acc_norm_stderr": 0.02675082699467618 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.7040358744394619, + "acc_stderr": 0.0306365913486998, + "acc_norm": 0.7040358744394619, + "acc_norm_stderr": 0.0306365913486998 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.7557251908396947, + "acc_stderr": 0.037683359597287434, + "acc_norm": 0.7557251908396947, + "acc_norm_stderr": 0.037683359597287434 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.7768595041322314, + "acc_stderr": 0.03800754475228732, + "acc_norm": 0.7768595041322314, + "acc_norm_stderr": 0.03800754475228732 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.7777777777777778, + "acc_stderr": 0.0401910747255735, + "acc_norm": 0.7777777777777778, + "acc_norm_stderr": 0.0401910747255735 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.7361963190184049, + "acc_stderr": 0.03462419931615623, + "acc_norm": 0.7361963190184049, + "acc_norm_stderr": 0.03462419931615623 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.4375, + "acc_stderr": 0.04708567521880525, + "acc_norm": 0.4375, + "acc_norm_stderr": 0.04708567521880525 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.7766990291262136, + "acc_stderr": 0.04123553189891431, + "acc_norm": 0.7766990291262136, + "acc_norm_stderr": 0.04123553189891431 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.8803418803418803, + "acc_stderr": 0.021262719400406957, + "acc_norm": 0.8803418803418803, + "acc_norm_stderr": 0.021262719400406957 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.69, + "acc_stderr": 0.04648231987117316, + "acc_norm": 0.69, + "acc_norm_stderr": 0.04648231987117316 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.8263090676883781, + "acc_stderr": 0.01354741565866226, + "acc_norm": 0.8263090676883781, + "acc_norm_stderr": 0.01354741565866226 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.7167630057803468, + "acc_stderr": 0.024257901705323378, + "acc_norm": 0.7167630057803468, + "acc_norm_stderr": 0.024257901705323378 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.4547486033519553, + "acc_stderr": 0.016653875777524006, + "acc_norm": 0.4547486033519553, + "acc_norm_stderr": 0.016653875777524006 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.7026143790849673, + "acc_stderr": 0.026173908506718576, + "acc_norm": 0.7026143790849673, + "acc_norm_stderr": 0.026173908506718576 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.6977491961414791, + "acc_stderr": 0.026082700695399662, + "acc_norm": 0.6977491961414791, + "acc_norm_stderr": 0.026082700695399662 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.7469135802469136, + "acc_stderr": 0.024191808600712995, + "acc_norm": 0.7469135802469136, + "acc_norm_stderr": 0.024191808600712995 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.46808510638297873, + "acc_stderr": 0.029766675075873862, + "acc_norm": 0.46808510638297873, + "acc_norm_stderr": 0.029766675075873862 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.4680573663624511, + "acc_stderr": 0.012744149704869647, + 
"acc_norm": 0.4680573663624511, + "acc_norm_stderr": 0.012744149704869647 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.6654411764705882, + "acc_stderr": 0.0286619962023353, + "acc_norm": 0.6654411764705882, + "acc_norm_stderr": 0.0286619962023353 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.6552287581699346, + "acc_stderr": 0.019228322018696644, + "acc_norm": 0.6552287581699346, + "acc_norm_stderr": 0.019228322018696644 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.6636363636363637, + "acc_stderr": 0.04525393596302505, + "acc_norm": 0.6636363636363637, + "acc_norm_stderr": 0.04525393596302505 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.7306122448979592, + "acc_stderr": 0.02840125202902294, + "acc_norm": 0.7306122448979592, + "acc_norm_stderr": 0.02840125202902294 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.8507462686567164, + "acc_stderr": 0.025196929874827075, + "acc_norm": 0.8507462686567164, + "acc_norm_stderr": 0.025196929874827075 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.87, + "acc_stderr": 0.03379976689896309, + "acc_norm": 0.87, + "acc_norm_stderr": 0.03379976689896309 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.5481927710843374, + "acc_stderr": 0.03874371556587953, + "acc_norm": 0.5481927710843374, + "acc_norm_stderr": 0.03874371556587953 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.8245614035087719, + "acc_stderr": 0.029170885500727665, + "acc_norm": 0.8245614035087719, + "acc_norm_stderr": 0.029170885500727665 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.5740514075887393, + "mc1_stderr": 0.01731047190407654, + "mc2": 0.6907171691355769, + "mc2_stderr": 0.015243695704371275 + }, + "harness|winogrande|5": { + "acc": 0.8232044198895028, + "acc_stderr": 0.010721923287918742 + }, + "harness|gsm8k|5": { + "acc": 0.66868840030326, + "acc_stderr": 0.01296499967968867 + }, + "all": { + "acc": 0.6411740347675706, + "acc_stderr": 0.03228342039008203, + "acc_norm": 0.6407691161331389, + "acc_norm_stderr": 0.03295002376578124, + "mc1": 0.5740514075887393, + "mc1_stderr": 0.01731047190407654, + "mc2": 0.6907171691355769, + "mc2_stderr": 0.015243695704371275 + } + }, + "versions": { + "all": 0, + "harness|arc:challenge|25": 0, + "harness|gsm8k|5": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + 
"harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "harness|winogrande|5": 0 + }, + "config_tasks": { + "harness|arc:challenge": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + 
"harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|arc:challenge|25": { + "hashes": { + "hash_examples": "17b0cae357c0259e", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "9bcd0d1d37471713", + "hash_cont_tokens": "289aa98c400841d8" + }, + "truncated": 0, + "non_truncated": 1172, + "padded": 4670, + "non_padded": 17, + "effective_few_shots": 25.0, + "num_truncated_few_shots": 0 + }, + "harness|hellaswag|10": { + "hashes": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "80b8c6d79740318e", + "hash_cont_tokens": "ac460260c3e6efc9" + }, + "truncated": 0, + "non_truncated": 10042, + "padded": 40101, + "non_padded": 67, + "effective_few_shots": 10.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hashes": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "b813d36287c6556c", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-anatomy|5": { + "hashes": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "09dc2380497f7a47", + "hash_cont_tokens": "a52a4f60d98cbe5c" + }, + "truncated": 0, + "non_truncated": 135, + "padded": 540, + "non_padded": 0, + 
"effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-astronomy|5": { + "hashes": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "68ca3220b0fdd1f3", + "hash_cont_tokens": "10f7d8eeba97841d" + }, + "truncated": 0, + "non_truncated": 152, + "padded": 608, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-business_ethics|5": { + "hashes": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "bd14ef1320de241e", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hashes": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "d96186ab98017c43", + "hash_cont_tokens": "edef9975ba9165b5" + }, + "truncated": 0, + "non_truncated": 265, + "padded": 1060, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_biology|5": { + "hashes": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "424136b34e95b200", + "hash_cont_tokens": "0aa103ec6602280b" + }, + "truncated": 0, + "non_truncated": 144, + "padded": 576, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_chemistry|5": { + "hashes": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "8dd8b80e336bbe54", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_computer_science|5": { + "hashes": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "145d4cef8ca2261d", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_mathematics|5": { + "hashes": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "561995d32d2b25c4", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_medicine|5": { + "hashes": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "6a258a9d4418599c", + "hash_cont_tokens": "1979021dbc698754" + }, + "truncated": 0, + "non_truncated": 173, + "padded": 692, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_physics|5": { + "hashes": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "fa5e0d5b5f97b66a", + "hash_cont_tokens": "7cf7fe2bab00acbd" + }, + "truncated": 0, + "non_truncated": 102, + "padded": 408, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-computer_security|5": { + "hashes": { + "hash_examples": 
"006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "07d27397edfae492", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hashes": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "da5e6c3c8eb17da6", + "hash_cont_tokens": "903f64eed2b0d217" + }, + "truncated": 0, + "non_truncated": 235, + "padded": 940, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-econometrics|5": { + "hashes": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "f6ba8e358bdb523e", + "hash_cont_tokens": "721ae6c5302c4bf2" + }, + "truncated": 0, + "non_truncated": 114, + "padded": 456, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hashes": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "b2459da4c5ca8590", + "hash_cont_tokens": "15a738960ed3e587" + }, + "truncated": 0, + "non_truncated": 145, + "padded": 575, + "non_padded": 5, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hashes": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "0b969d9ad706a13a", + "hash_cont_tokens": "c96470462fc71683" + }, + "truncated": 0, + "non_truncated": 378, + "padded": 1512, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-formal_logic|5": { + "hashes": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "02bc3eb5f90da86e", + "hash_cont_tokens": "0e1ce025c9d6ee7e" + }, + "truncated": 0, + "non_truncated": 126, + "padded": 504, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-global_facts|5": { + "hashes": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "3d5106918bcbeb43", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_biology|5": { + "hashes": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "7b089392db2dabbd", + "hash_cont_tokens": "e34d57f7d3c4ca16" + }, + "truncated": 0, + "non_truncated": 310, + "padded": 1240, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hashes": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "ba90b2ffed1c067d", + "hash_cont_tokens": "e8482d44df4b3740" + }, + "truncated": 0, + "non_truncated": 203, + "padded": 812, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hashes": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "60eeec309ef0717f", + "hash_cont_tokens": 
"17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hashes": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "5e5e8bf3808e0ead", + "hash_cont_tokens": "d63e679a49418339" + }, + "truncated": 0, + "non_truncated": 165, + "padded": 656, + "non_padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_geography|5": { + "hashes": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "4da9b741d4e7ea78", + "hash_cont_tokens": "d78483e286d06f1a" + }, + "truncated": 0, + "non_truncated": 198, + "padded": 792, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hashes": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "acb4bc872ac86ed7", + "hash_cont_tokens": "691cdff71ff5fe57" + }, + "truncated": 0, + "non_truncated": 193, + "padded": 772, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hashes": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "840fc6403eb69ab0", + "hash_cont_tokens": "d5ad4c5bdca967ad" + }, + "truncated": 0, + "non_truncated": 390, + "padded": 1560, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hashes": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "3629a7f2cd17faeb", + "hash_cont_tokens": "8f631ca5687dd0d4" + }, + "truncated": 0, + "non_truncated": 270, + "padded": 1080, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hashes": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "6846f684260e3997", + "hash_cont_tokens": "7321048a28451473" + }, + "truncated": 0, + "non_truncated": 238, + "padded": 952, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_physics|5": { + "hashes": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "85aee25d6bdad94a", + "hash_cont_tokens": "bb137581f269861c" + }, + "truncated": 0, + "non_truncated": 151, + "padded": 604, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hashes": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "290b66d6d666a35f", + "hash_cont_tokens": "b455cab2675bd863" + }, + "truncated": 0, + "non_truncated": 545, + "padded": 2180, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hashes": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "a77a7668b437bc82", + "hash_cont_tokens": "1b3196fec7e58037" + }, + "truncated": 0, + "non_truncated": 216, + "padded": 
864, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hashes": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "63548c7fa9ba7a78", + "hash_cont_tokens": "a331dedc2aa01b3e" + }, + "truncated": 0, + "non_truncated": 204, + "padded": 816, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hashes": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "83c5da18bfa50812", + "hash_cont_tokens": "d0fbe030b8c8c2bf" + }, + "truncated": 0, + "non_truncated": 237, + "padded": 948, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_aging|5": { + "hashes": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "bebbd11f22006685", + "hash_cont_tokens": "1dd29c3755494850" + }, + "truncated": 0, + "non_truncated": 223, + "padded": 892, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_sexuality|5": { + "hashes": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "7b85ee9b8ee54f4f", + "hash_cont_tokens": "c85573f663c10691" + }, + "truncated": 0, + "non_truncated": 131, + "padded": 524, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-international_law|5": { + "hashes": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "7bfc55ab7065943e", + "hash_cont_tokens": "d263804ba918154f" + }, + "truncated": 0, + "non_truncated": 121, + "padded": 484, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-jurisprudence|5": { + "hashes": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "69573f1675e053c6", + "hash_cont_tokens": "581986691a84ece8" + }, + "truncated": 0, + "non_truncated": 108, + "padded": 432, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hashes": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "552324ef20094bdc", + "hash_cont_tokens": "55a858b28bbda458" + }, + "truncated": 0, + "non_truncated": 163, + "padded": 652, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-machine_learning|5": { + "hashes": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "96449357a7318905", + "hash_cont_tokens": "e99d3d3efd4ac7a3" + }, + "truncated": 0, + "non_truncated": 112, + "padded": 448, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-management|5": { + "hashes": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "3b849249168e3b88", + "hash_cont_tokens": "13d9dc56bca34726" + }, + "truncated": 0, + "non_truncated": 103, + "padded": 412, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-marketing|5": { + "hashes": { + 
"hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "af0e186f2756b70d", + "hash_cont_tokens": "2700ea26933916a2" + }, + "truncated": 0, + "non_truncated": 234, + "padded": 936, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-medical_genetics|5": { + "hashes": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "9f6a6de16509b6d9", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-miscellaneous|5": { + "hashes": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "9194406d589f7c10", + "hash_cont_tokens": "7bf4341c79587250" + }, + "truncated": 0, + "non_truncated": 783, + "padded": 3132, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_disputes|5": { + "hashes": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "769486efc74d9f8e", + "hash_cont_tokens": "38a48e9de6976f00" + }, + "truncated": 0, + "non_truncated": 346, + "padded": 1384, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hashes": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "a90fd4dd90959dad", + "hash_cont_tokens": "761c4dc187689d89" + }, + "truncated": 0, + "non_truncated": 895, + "padded": 3580, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-nutrition|5": { + "hashes": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "1a3b843e66efd29b", + "hash_cont_tokens": "65005bd7d6f6012a" + }, + "truncated": 0, + "non_truncated": 306, + "padded": 1224, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-philosophy|5": { + "hashes": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "09820001a3d00013", + "hash_cont_tokens": "0b47934fb6314dec" + }, + "truncated": 0, + "non_truncated": 311, + "padded": 1244, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-prehistory|5": { + "hashes": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "7c4ec364ce2768c7", + "hash_cont_tokens": "3f20acd855ee0a29" + }, + "truncated": 0, + "non_truncated": 324, + "padded": 1296, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_accounting|5": { + "hashes": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "ced0534574d0ae3f", + "hash_cont_tokens": "8f122ba881355d4b" + }, + "truncated": 0, + "non_truncated": 282, + "padded": 1128, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_law|5": { + "hashes": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "bcbdbbde22ec73e3", + "hash_cont_tokens": "90d5df417c4d3fd3" + }, + 
"truncated": 0, + "non_truncated": 1534, + "padded": 6136, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_medicine|5": { + "hashes": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "c54d753563114d45", + "hash_cont_tokens": "4a2d2988884f7f70" + }, + "truncated": 0, + "non_truncated": 272, + "padded": 1088, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_psychology|5": { + "hashes": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "b75dc55c0e32fa52", + "hash_cont_tokens": "e0a952cb8a9c81de" + }, + "truncated": 0, + "non_truncated": 612, + "padded": 2448, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-public_relations|5": { + "hashes": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "5ccdc8ec8db99622", + "hash_cont_tokens": "1fa77a8dff3922b8" + }, + "truncated": 0, + "non_truncated": 110, + "padded": 440, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-security_studies|5": { + "hashes": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "ca8497342e5b1d57", + "hash_cont_tokens": "81fc9cb3cbdd52db" + }, + "truncated": 0, + "non_truncated": 245, + "padded": 980, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-sociology|5": { + "hashes": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "069c76424fbd3dab", + "hash_cont_tokens": "2a0493252ed2cf43" + }, + "truncated": 0, + "non_truncated": 201, + "padded": 804, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hashes": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "a7e393a626169576", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-virology|5": { + "hashes": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "bf99dc973e3a650d", + "hash_cont_tokens": "5ab892d003b00c98" + }, + "truncated": 0, + "non_truncated": 166, + "padded": 664, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-world_religions|5": { + "hashes": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "1761cfaf21797065", + "hash_cont_tokens": "15a5e5dbdfbb8568" + }, + "truncated": 0, + "non_truncated": 171, + "padded": 684, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|truthfulqa:mc|0": { + "hashes": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "298b43914bbdf4ca", + "hash_cont_tokens": "5a8d4bb398b1c3c0" + }, + "truncated": 0, + "non_truncated": 817, + "padded": 9996, + "non_padded": 0, + "effective_few_shots": 0.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + 
"hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "31aa3477d959f771", + "hash_cont_tokens": "618558fb93c0f288" + }, + "truncated": 0, + "non_truncated": 1267, + "padded": 2534, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "6af0ae8cfe684f50", + "hash_cont_tokens": "728fe88693ca2600" + }, + "truncated": 0, + "non_truncated": 1319, + "padded": 0, + "non_padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "3b7fa57a057f9415", + "hash_full_prompts": "63615fc50fc9417c", + "hash_input_tokens": "9c04e828ae29cacc", + "hash_cont_tokens": "a56bd960f63dc2ea" + }, + "truncated": 0, + "non_truncated": 28659, + "padded": 113460, + "non_padded": 1412, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/GreenNode/GreenNodeLM-7B-v2leo/results_2023-12-16T15-11-27.236820.json b/eval-results/GreenNode/GreenNodeLM-7B-v2leo/results_2023-12-16T15-11-27.236820.json new file mode 100644 index 0000000000000000000000000000000000000000..8d52af26cbc4f8e06d8cacabc14a002796ef0114 --- /dev/null +++ b/eval-results/GreenNode/GreenNodeLM-7B-v2leo/results_2023-12-16T15-11-27.236820.json @@ -0,0 +1,1409 @@ +{ + "config_general": { + "lighteval_sha": "0e4607eff593f6f842aeaa0e5fa6760f58b9d1e9", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "", + "start_time": 363435.068993866, + "end_time": 370449.39700326, + "total_evaluation_time_secondes": "7014.3280093939975", + "model_name": "GreenNode/GreenNodeLM-7B-v2leo", + "model_sha": "e5a0955eb36568aa850cd73debbe9815a9d1e60a", + "model_dtype": "torch.float16", + "model_size": "13.99 GB" + }, + "results": { + "harness|arc:challenge|25": { + "acc": 0.6697952218430034, + "acc_stderr": 0.013743085603760427, + "acc_norm": 0.6979522184300341, + "acc_norm_stderr": 0.013417519144716413 + }, + "harness|hellaswag|10": { + "acc": 0.7097191794463255, + "acc_stderr": 0.004529642828546397, + "acc_norm": 0.8802031467835093, + "acc_norm_stderr": 0.0032406018831804884 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.34, + "acc_stderr": 0.047609522856952365, + "acc_norm": 0.34, + "acc_norm_stderr": 0.047609522856952365 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.6666666666666666, + "acc_stderr": 0.04072314811876837, + "acc_norm": 0.6666666666666666, + "acc_norm_stderr": 0.04072314811876837 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.6842105263157895, + "acc_stderr": 0.0378272898086547, + "acc_norm": 0.6842105263157895, + "acc_norm_stderr": 0.0378272898086547 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.62, + "acc_stderr": 0.048783173121456316, + "acc_norm": 0.62, + "acc_norm_stderr": 0.048783173121456316 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.720754716981132, + "acc_stderr": 0.027611163402399715, + "acc_norm": 0.720754716981132, + "acc_norm_stderr": 0.027611163402399715 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.7430555555555556, + "acc_stderr": 0.03653946969442099, + "acc_norm": 0.7430555555555556, + "acc_norm_stderr": 0.03653946969442099 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.47, + "acc_stderr": 0.050161355804659205, + "acc_norm": 
0.47, + "acc_norm_stderr": 0.050161355804659205 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.54, + "acc_stderr": 0.05009082659620333, + "acc_norm": 0.54, + "acc_norm_stderr": 0.05009082659620333 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.29, + "acc_stderr": 0.04560480215720683, + "acc_norm": 0.29, + "acc_norm_stderr": 0.04560480215720683 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.6416184971098265, + "acc_stderr": 0.036563436533531585, + "acc_norm": 0.6416184971098265, + "acc_norm_stderr": 0.036563436533531585 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.4019607843137255, + "acc_stderr": 0.04878608714466996, + "acc_norm": 0.4019607843137255, + "acc_norm_stderr": 0.04878608714466996 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.79, + "acc_stderr": 0.04093601807403326, + "acc_norm": 0.79, + "acc_norm_stderr": 0.04093601807403326 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.5659574468085107, + "acc_stderr": 0.03240038086792747, + "acc_norm": 0.5659574468085107, + "acc_norm_stderr": 0.03240038086792747 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.47368421052631576, + "acc_stderr": 0.046970851366478626, + "acc_norm": 0.47368421052631576, + "acc_norm_stderr": 0.046970851366478626 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.6068965517241379, + "acc_stderr": 0.0407032901370707, + "acc_norm": 0.6068965517241379, + "acc_norm_stderr": 0.0407032901370707 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.4470899470899471, + "acc_stderr": 0.02560672399577702, + "acc_norm": 0.4470899470899471, + "acc_norm_stderr": 0.02560672399577702 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.4523809523809524, + "acc_stderr": 0.044518079590553275, + "acc_norm": 0.4523809523809524, + "acc_norm_stderr": 0.044518079590553275 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.4, + "acc_stderr": 0.04923659639173309, + "acc_norm": 0.4, + "acc_norm_stderr": 0.04923659639173309 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.7774193548387097, + "acc_stderr": 0.023664216671642518, + "acc_norm": 0.7774193548387097, + "acc_norm_stderr": 0.023664216671642518 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.5073891625615764, + "acc_stderr": 0.035176035403610105, + "acc_norm": 0.5073891625615764, + "acc_norm_stderr": 0.035176035403610105 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.7, + "acc_stderr": 0.046056618647183814, + "acc_norm": 0.7, + "acc_norm_stderr": 0.046056618647183814 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.7636363636363637, + "acc_stderr": 0.03317505930009182, + "acc_norm": 0.7636363636363637, + "acc_norm_stderr": 0.03317505930009182 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.7878787878787878, + "acc_stderr": 0.029126522834586818, + "acc_norm": 0.7878787878787878, + "acc_norm_stderr": 0.029126522834586818 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.8808290155440415, + "acc_stderr": 0.023381935348121427, + "acc_norm": 0.8808290155440415, + "acc_norm_stderr": 0.023381935348121427 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.6666666666666666, + "acc_stderr": 0.023901157979402534, + "acc_norm": 0.6666666666666666, + "acc_norm_stderr": 0.023901157979402534 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + 
"acc": 0.37407407407407406, + "acc_stderr": 0.029502861128955293, + "acc_norm": 0.37407407407407406, + "acc_norm_stderr": 0.029502861128955293 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.7016806722689075, + "acc_stderr": 0.029719142876342853, + "acc_norm": 0.7016806722689075, + "acc_norm_stderr": 0.029719142876342853 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.33774834437086093, + "acc_stderr": 0.03861557546255169, + "acc_norm": 0.33774834437086093, + "acc_norm_stderr": 0.03861557546255169 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.8366972477064221, + "acc_stderr": 0.01584825580650155, + "acc_norm": 0.8366972477064221, + "acc_norm_stderr": 0.01584825580650155 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.5231481481481481, + "acc_stderr": 0.03406315360711507, + "acc_norm": 0.5231481481481481, + "acc_norm_stderr": 0.03406315360711507 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.8186274509803921, + "acc_stderr": 0.027044621719474082, + "acc_norm": 0.8186274509803921, + "acc_norm_stderr": 0.027044621719474082 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.8016877637130801, + "acc_stderr": 0.025955020841621112, + "acc_norm": 0.8016877637130801, + "acc_norm_stderr": 0.025955020841621112 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.6816143497757847, + "acc_stderr": 0.03126580522513713, + "acc_norm": 0.6816143497757847, + "acc_norm_stderr": 0.03126580522513713 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.7938931297709924, + "acc_stderr": 0.03547771004159465, + "acc_norm": 0.7938931297709924, + "acc_norm_stderr": 0.03547771004159465 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.8016528925619835, + "acc_stderr": 0.03640118271990946, + "acc_norm": 0.8016528925619835, + "acc_norm_stderr": 0.03640118271990946 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.7962962962962963, + "acc_stderr": 0.03893542518824847, + "acc_norm": 0.7962962962962963, + "acc_norm_stderr": 0.03893542518824847 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.7791411042944786, + "acc_stderr": 0.03259177392742178, + "acc_norm": 0.7791411042944786, + "acc_norm_stderr": 0.03259177392742178 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.49107142857142855, + "acc_stderr": 0.04745033255489123, + "acc_norm": 0.49107142857142855, + "acc_norm_stderr": 0.04745033255489123 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.7864077669902912, + "acc_stderr": 0.040580420156460344, + "acc_norm": 0.7864077669902912, + "acc_norm_stderr": 0.040580420156460344 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.8717948717948718, + "acc_stderr": 0.02190190511507333, + "acc_norm": 0.8717948717948718, + "acc_norm_stderr": 0.02190190511507333 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.72, + "acc_stderr": 0.045126085985421276, + "acc_norm": 0.72, + "acc_norm_stderr": 0.045126085985421276 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.8173690932311622, + "acc_stderr": 0.013816335389973136, + "acc_norm": 0.8173690932311622, + "acc_norm_stderr": 0.013816335389973136 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.7427745664739884, + "acc_stderr": 0.023532925431044287, + "acc_norm": 0.7427745664739884, + "acc_norm_stderr": 0.023532925431044287 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.4782122905027933, + "acc_stderr": 
0.016706617522176136, + "acc_norm": 0.4782122905027933, + "acc_norm_stderr": 0.016706617522176136 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.7124183006535948, + "acc_stderr": 0.025917806117147158, + "acc_norm": 0.7124183006535948, + "acc_norm_stderr": 0.025917806117147158 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.7106109324758842, + "acc_stderr": 0.025755865922632945, + "acc_norm": 0.7106109324758842, + "acc_norm_stderr": 0.025755865922632945 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.7561728395061729, + "acc_stderr": 0.023891879541959614, + "acc_norm": 0.7561728395061729, + "acc_norm_stderr": 0.023891879541959614 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.4716312056737589, + "acc_stderr": 0.02977945095730307, + "acc_norm": 0.4716312056737589, + "acc_norm_stderr": 0.02977945095730307 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.46740547588005216, + "acc_stderr": 0.01274307294265335, + "acc_norm": 0.46740547588005216, + "acc_norm_stderr": 0.01274307294265335 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.6691176470588235, + "acc_stderr": 0.028582709753898445, + "acc_norm": 0.6691176470588235, + "acc_norm_stderr": 0.028582709753898445 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.6666666666666666, + "acc_stderr": 0.0190709855896875, + "acc_norm": 0.6666666666666666, + "acc_norm_stderr": 0.0190709855896875 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.6818181818181818, + "acc_stderr": 0.04461272175910509, + "acc_norm": 0.6818181818181818, + "acc_norm_stderr": 0.04461272175910509 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.7183673469387755, + "acc_stderr": 0.028795185574291296, + "acc_norm": 0.7183673469387755, + "acc_norm_stderr": 0.028795185574291296 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.8407960199004975, + "acc_stderr": 0.02587064676616913, + "acc_norm": 0.8407960199004975, + "acc_norm_stderr": 0.02587064676616913 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.88, + "acc_stderr": 0.03265986323710906, + "acc_norm": 0.88, + "acc_norm_stderr": 0.03265986323710906 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.536144578313253, + "acc_stderr": 0.038823108508905954, + "acc_norm": 0.536144578313253, + "acc_norm_stderr": 0.038823108508905954 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.847953216374269, + "acc_stderr": 0.027539122889061456, + "acc_norm": 0.847953216374269, + "acc_norm_stderr": 0.027539122889061456 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.5397796817625459, + "mc1_stderr": 0.017448017223960874, + "mc2": 0.6783429669391698, + "mc2_stderr": 0.015130110237542948 + }, + "harness|winogrande|5": { + "acc": 0.8200473559589582, + "acc_stderr": 0.01079646868806868 + }, + "harness|gsm8k|5": { + "acc": 0.6709628506444276, + "acc_stderr": 0.01294237560367937 + }, + "all": { + "acc": 0.6544526838897918, + "acc_stderr": 0.032061359095590616, + "acc_norm": 0.6547329571965785, + "acc_norm_stderr": 0.03271846530737067, + "mc1": 0.5397796817625459, + "mc1_stderr": 0.017448017223960874, + "mc2": 0.6783429669391698, + "mc2_stderr": 0.015130110237542948 + } + }, + "versions": { + "all": 0, + "harness|arc:challenge|25": 0, + "harness|gsm8k|5": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + 
"harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "harness|winogrande|5": 0 + }, + "config_tasks": { + "harness|arc:challenge": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + 
"harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|arc:challenge|25": { + "hashes": { + "hash_examples": "17b0cae357c0259e", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "9bcd0d1d37471713", + "hash_cont_tokens": "289aa98c400841d8" + }, + "truncated": 0, + "non_truncated": 1172, + "padded": 4670, + "non_padded": 17, + "effective_few_shots": 25.0, + "num_truncated_few_shots": 0 + }, + "harness|hellaswag|10": { + "hashes": { 
+ "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "80b8c6d79740318e", + "hash_cont_tokens": "ac460260c3e6efc9" + }, + "truncated": 0, + "non_truncated": 10042, + "padded": 40101, + "non_padded": 67, + "effective_few_shots": 10.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hashes": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "b813d36287c6556c", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-anatomy|5": { + "hashes": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "09dc2380497f7a47", + "hash_cont_tokens": "a52a4f60d98cbe5c" + }, + "truncated": 0, + "non_truncated": 135, + "padded": 540, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-astronomy|5": { + "hashes": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "68ca3220b0fdd1f3", + "hash_cont_tokens": "10f7d8eeba97841d" + }, + "truncated": 0, + "non_truncated": 152, + "padded": 608, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-business_ethics|5": { + "hashes": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "bd14ef1320de241e", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hashes": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "d96186ab98017c43", + "hash_cont_tokens": "edef9975ba9165b5" + }, + "truncated": 0, + "non_truncated": 265, + "padded": 1060, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_biology|5": { + "hashes": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "424136b34e95b200", + "hash_cont_tokens": "0aa103ec6602280b" + }, + "truncated": 0, + "non_truncated": 144, + "padded": 576, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_chemistry|5": { + "hashes": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "8dd8b80e336bbe54", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_computer_science|5": { + "hashes": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "145d4cef8ca2261d", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_mathematics|5": { + "hashes": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "561995d32d2b25c4", + "hash_cont_tokens": 
"17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_medicine|5": { + "hashes": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "6a258a9d4418599c", + "hash_cont_tokens": "1979021dbc698754" + }, + "truncated": 0, + "non_truncated": 173, + "padded": 692, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_physics|5": { + "hashes": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "fa5e0d5b5f97b66a", + "hash_cont_tokens": "7cf7fe2bab00acbd" + }, + "truncated": 0, + "non_truncated": 102, + "padded": 408, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-computer_security|5": { + "hashes": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "07d27397edfae492", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hashes": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "da5e6c3c8eb17da6", + "hash_cont_tokens": "903f64eed2b0d217" + }, + "truncated": 0, + "non_truncated": 235, + "padded": 940, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-econometrics|5": { + "hashes": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "f6ba8e358bdb523e", + "hash_cont_tokens": "721ae6c5302c4bf2" + }, + "truncated": 0, + "non_truncated": 114, + "padded": 456, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hashes": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "b2459da4c5ca8590", + "hash_cont_tokens": "15a738960ed3e587" + }, + "truncated": 0, + "non_truncated": 145, + "padded": 575, + "non_padded": 5, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hashes": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "0b969d9ad706a13a", + "hash_cont_tokens": "c96470462fc71683" + }, + "truncated": 0, + "non_truncated": 378, + "padded": 1512, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-formal_logic|5": { + "hashes": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "02bc3eb5f90da86e", + "hash_cont_tokens": "0e1ce025c9d6ee7e" + }, + "truncated": 0, + "non_truncated": 126, + "padded": 504, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-global_facts|5": { + "hashes": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "3d5106918bcbeb43", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + 
"num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_biology|5": { + "hashes": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "7b089392db2dabbd", + "hash_cont_tokens": "e34d57f7d3c4ca16" + }, + "truncated": 0, + "non_truncated": 310, + "padded": 1240, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hashes": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "ba90b2ffed1c067d", + "hash_cont_tokens": "e8482d44df4b3740" + }, + "truncated": 0, + "non_truncated": 203, + "padded": 812, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hashes": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "60eeec309ef0717f", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hashes": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "5e5e8bf3808e0ead", + "hash_cont_tokens": "d63e679a49418339" + }, + "truncated": 0, + "non_truncated": 165, + "padded": 656, + "non_padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_geography|5": { + "hashes": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "4da9b741d4e7ea78", + "hash_cont_tokens": "d78483e286d06f1a" + }, + "truncated": 0, + "non_truncated": 198, + "padded": 792, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hashes": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "acb4bc872ac86ed7", + "hash_cont_tokens": "691cdff71ff5fe57" + }, + "truncated": 0, + "non_truncated": 193, + "padded": 772, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hashes": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "840fc6403eb69ab0", + "hash_cont_tokens": "d5ad4c5bdca967ad" + }, + "truncated": 0, + "non_truncated": 390, + "padded": 1560, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hashes": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "3629a7f2cd17faeb", + "hash_cont_tokens": "8f631ca5687dd0d4" + }, + "truncated": 0, + "non_truncated": 270, + "padded": 1080, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hashes": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "6846f684260e3997", + "hash_cont_tokens": "7321048a28451473" + }, + "truncated": 0, + "non_truncated": 238, + "padded": 952, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + 
"harness|hendrycksTest-high_school_physics|5": { + "hashes": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "85aee25d6bdad94a", + "hash_cont_tokens": "bb137581f269861c" + }, + "truncated": 0, + "non_truncated": 151, + "padded": 604, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hashes": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "290b66d6d666a35f", + "hash_cont_tokens": "b455cab2675bd863" + }, + "truncated": 0, + "non_truncated": 545, + "padded": 2180, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hashes": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "a77a7668b437bc82", + "hash_cont_tokens": "1b3196fec7e58037" + }, + "truncated": 0, + "non_truncated": 216, + "padded": 864, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hashes": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "63548c7fa9ba7a78", + "hash_cont_tokens": "a331dedc2aa01b3e" + }, + "truncated": 0, + "non_truncated": 204, + "padded": 816, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hashes": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "83c5da18bfa50812", + "hash_cont_tokens": "d0fbe030b8c8c2bf" + }, + "truncated": 0, + "non_truncated": 237, + "padded": 948, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_aging|5": { + "hashes": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "bebbd11f22006685", + "hash_cont_tokens": "1dd29c3755494850" + }, + "truncated": 0, + "non_truncated": 223, + "padded": 892, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_sexuality|5": { + "hashes": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "7b85ee9b8ee54f4f", + "hash_cont_tokens": "c85573f663c10691" + }, + "truncated": 0, + "non_truncated": 131, + "padded": 524, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-international_law|5": { + "hashes": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "7bfc55ab7065943e", + "hash_cont_tokens": "d263804ba918154f" + }, + "truncated": 0, + "non_truncated": 121, + "padded": 484, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-jurisprudence|5": { + "hashes": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "69573f1675e053c6", + "hash_cont_tokens": "581986691a84ece8" + }, + "truncated": 0, + "non_truncated": 108, + "padded": 432, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hashes": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": 
"98a04b1f8f841069", + "hash_input_tokens": "552324ef20094bdc", + "hash_cont_tokens": "55a858b28bbda458" + }, + "truncated": 0, + "non_truncated": 163, + "padded": 652, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-machine_learning|5": { + "hashes": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "96449357a7318905", + "hash_cont_tokens": "e99d3d3efd4ac7a3" + }, + "truncated": 0, + "non_truncated": 112, + "padded": 448, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-management|5": { + "hashes": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "3b849249168e3b88", + "hash_cont_tokens": "13d9dc56bca34726" + }, + "truncated": 0, + "non_truncated": 103, + "padded": 412, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-marketing|5": { + "hashes": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "af0e186f2756b70d", + "hash_cont_tokens": "2700ea26933916a2" + }, + "truncated": 0, + "non_truncated": 234, + "padded": 936, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-medical_genetics|5": { + "hashes": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "9f6a6de16509b6d9", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-miscellaneous|5": { + "hashes": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "9194406d589f7c10", + "hash_cont_tokens": "7bf4341c79587250" + }, + "truncated": 0, + "non_truncated": 783, + "padded": 3132, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_disputes|5": { + "hashes": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "769486efc74d9f8e", + "hash_cont_tokens": "38a48e9de6976f00" + }, + "truncated": 0, + "non_truncated": 346, + "padded": 1384, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hashes": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "a90fd4dd90959dad", + "hash_cont_tokens": "761c4dc187689d89" + }, + "truncated": 0, + "non_truncated": 895, + "padded": 3580, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-nutrition|5": { + "hashes": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "1a3b843e66efd29b", + "hash_cont_tokens": "65005bd7d6f6012a" + }, + "truncated": 0, + "non_truncated": 306, + "padded": 1224, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-philosophy|5": { + "hashes": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "09820001a3d00013", + "hash_cont_tokens": "0b47934fb6314dec" + }, + "truncated": 0, + "non_truncated": 311, + "padded": 1244, + "non_padded": 0, + 
"effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-prehistory|5": { + "hashes": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "7c4ec364ce2768c7", + "hash_cont_tokens": "3f20acd855ee0a29" + }, + "truncated": 0, + "non_truncated": 324, + "padded": 1296, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_accounting|5": { + "hashes": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "ced0534574d0ae3f", + "hash_cont_tokens": "8f122ba881355d4b" + }, + "truncated": 0, + "non_truncated": 282, + "padded": 1128, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_law|5": { + "hashes": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "bcbdbbde22ec73e3", + "hash_cont_tokens": "90d5df417c4d3fd3" + }, + "truncated": 0, + "non_truncated": 1534, + "padded": 6136, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_medicine|5": { + "hashes": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "c54d753563114d45", + "hash_cont_tokens": "4a2d2988884f7f70" + }, + "truncated": 0, + "non_truncated": 272, + "padded": 1088, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_psychology|5": { + "hashes": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "b75dc55c0e32fa52", + "hash_cont_tokens": "e0a952cb8a9c81de" + }, + "truncated": 0, + "non_truncated": 612, + "padded": 2448, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-public_relations|5": { + "hashes": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "5ccdc8ec8db99622", + "hash_cont_tokens": "1fa77a8dff3922b8" + }, + "truncated": 0, + "non_truncated": 110, + "padded": 440, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-security_studies|5": { + "hashes": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "ca8497342e5b1d57", + "hash_cont_tokens": "81fc9cb3cbdd52db" + }, + "truncated": 0, + "non_truncated": 245, + "padded": 980, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-sociology|5": { + "hashes": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "069c76424fbd3dab", + "hash_cont_tokens": "2a0493252ed2cf43" + }, + "truncated": 0, + "non_truncated": 201, + "padded": 804, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hashes": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "a7e393a626169576", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-virology|5": { + "hashes": { + "hash_examples": 
"451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "bf99dc973e3a650d", + "hash_cont_tokens": "5ab892d003b00c98" + }, + "truncated": 0, + "non_truncated": 166, + "padded": 664, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-world_religions|5": { + "hashes": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "1761cfaf21797065", + "hash_cont_tokens": "15a5e5dbdfbb8568" + }, + "truncated": 0, + "non_truncated": 171, + "padded": 684, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|truthfulqa:mc|0": { + "hashes": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "298b43914bbdf4ca", + "hash_cont_tokens": "5a8d4bb398b1c3c0" + }, + "truncated": 0, + "non_truncated": 817, + "padded": 9996, + "non_padded": 0, + "effective_few_shots": 0.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "31aa3477d959f771", + "hash_cont_tokens": "618558fb93c0f288" + }, + "truncated": 0, + "non_truncated": 1267, + "padded": 2534, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "6af0ae8cfe684f50", + "hash_cont_tokens": "7b5141049e866397" + }, + "truncated": 0, + "non_truncated": 1319, + "padded": 0, + "non_padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "3b7fa57a057f9415", + "hash_full_prompts": "63615fc50fc9417c", + "hash_input_tokens": "9c04e828ae29cacc", + "hash_cont_tokens": "c3a57d224f7e1514" + }, + "truncated": 0, + "non_truncated": 28659, + "padded": 113460, + "non_padded": 1412, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/GreenNode/GreenNodeLM-7B-v4leo/results_2023-12-16T20-22-05.730511.json b/eval-results/GreenNode/GreenNodeLM-7B-v4leo/results_2023-12-16T20-22-05.730511.json new file mode 100644 index 0000000000000000000000000000000000000000..105914b0e8aac3c67cd3ec283a4735b015ea19d5 --- /dev/null +++ b/eval-results/GreenNode/GreenNodeLM-7B-v4leo/results_2023-12-16T20-22-05.730511.json @@ -0,0 +1,1409 @@ +{ + "config_general": { + "lighteval_sha": "0e4607eff593f6f842aeaa0e5fa6760f58b9d1e9", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "", + "start_time": 382122.268173081, + "end_time": 389083.82192116, + "total_evaluation_time_secondes": "6961.5537480789935", + "model_name": "GreenNode/GreenNodeLM-7B-v4leo", + "model_sha": "9286f6fac1df497203e110070322c93dab33fdd2", + "model_dtype": "torch.float16", + "model_size": "13.99 GB" + }, + "results": { + "harness|arc:challenge|25": { + "acc": 0.6825938566552902, + "acc_stderr": 0.013602239088038167, + "acc_norm": 0.712457337883959, + "acc_norm_stderr": 0.013226719056266127 + }, + "harness|hellaswag|10": { + "acc": 0.7104162517426807, + "acc_stderr": 0.004526422125860673, + "acc_norm": 0.8823939454291974, + "acc_norm_stderr": 0.0032148270694168255 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.35, + "acc_stderr": 0.0479372485441102, + "acc_norm": 0.35, + "acc_norm_stderr": 
0.0479372485441102 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.6666666666666666, + "acc_stderr": 0.04072314811876837, + "acc_norm": 0.6666666666666666, + "acc_norm_stderr": 0.04072314811876837 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.6973684210526315, + "acc_stderr": 0.037385206761196686, + "acc_norm": 0.6973684210526315, + "acc_norm_stderr": 0.037385206761196686 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.61, + "acc_stderr": 0.04902071300001975, + "acc_norm": 0.61, + "acc_norm_stderr": 0.04902071300001975 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.720754716981132, + "acc_stderr": 0.027611163402399715, + "acc_norm": 0.720754716981132, + "acc_norm_stderr": 0.027611163402399715 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.7638888888888888, + "acc_stderr": 0.03551446610810826, + "acc_norm": 0.7638888888888888, + "acc_norm_stderr": 0.03551446610810826 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.46, + "acc_stderr": 0.05009082659620333, + "acc_norm": 0.46, + "acc_norm_stderr": 0.05009082659620333 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.55, + "acc_stderr": 0.05, + "acc_norm": 0.55, + "acc_norm_stderr": 0.05 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.29, + "acc_stderr": 0.04560480215720684, + "acc_norm": 0.29, + "acc_norm_stderr": 0.04560480215720684 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.6705202312138728, + "acc_stderr": 0.03583901754736412, + "acc_norm": 0.6705202312138728, + "acc_norm_stderr": 0.03583901754736412 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.43137254901960786, + "acc_stderr": 0.04928099597287534, + "acc_norm": 0.43137254901960786, + "acc_norm_stderr": 0.04928099597287534 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.78, + "acc_stderr": 0.04163331998932263, + "acc_norm": 0.78, + "acc_norm_stderr": 0.04163331998932263 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.574468085106383, + "acc_stderr": 0.03232146916224468, + "acc_norm": 0.574468085106383, + "acc_norm_stderr": 0.03232146916224468 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.49122807017543857, + "acc_stderr": 0.04702880432049615, + "acc_norm": 0.49122807017543857, + "acc_norm_stderr": 0.04702880432049615 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.5793103448275863, + "acc_stderr": 0.0411391498118926, + "acc_norm": 0.5793103448275863, + "acc_norm_stderr": 0.0411391498118926 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.42328042328042326, + "acc_stderr": 0.025446365634406783, + "acc_norm": 0.42328042328042326, + "acc_norm_stderr": 0.025446365634406783 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.46825396825396826, + "acc_stderr": 0.04463112720677172, + "acc_norm": 0.46825396825396826, + "acc_norm_stderr": 0.04463112720677172 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.32, + "acc_stderr": 0.04688261722621504, + "acc_norm": 0.32, + "acc_norm_stderr": 0.04688261722621504 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.7870967741935484, + "acc_stderr": 0.023287665127268542, + "acc_norm": 0.7870967741935484, + "acc_norm_stderr": 0.023287665127268542 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.5123152709359606, + "acc_stderr": 0.035169204442208966, + "acc_norm": 0.5123152709359606, + "acc_norm_stderr": 0.035169204442208966 + }, + 
"harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.72, + "acc_stderr": 0.04512608598542127, + "acc_norm": 0.72, + "acc_norm_stderr": 0.04512608598542127 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.7878787878787878, + "acc_stderr": 0.031922715695483, + "acc_norm": 0.7878787878787878, + "acc_norm_stderr": 0.031922715695483 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.7828282828282829, + "acc_stderr": 0.02937661648494563, + "acc_norm": 0.7828282828282829, + "acc_norm_stderr": 0.02937661648494563 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.8860103626943006, + "acc_stderr": 0.022935144053919436, + "acc_norm": 0.8860103626943006, + "acc_norm_stderr": 0.022935144053919436 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.6871794871794872, + "acc_stderr": 0.023507579020645358, + "acc_norm": 0.6871794871794872, + "acc_norm_stderr": 0.023507579020645358 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.337037037037037, + "acc_stderr": 0.02882088466625326, + "acc_norm": 0.337037037037037, + "acc_norm_stderr": 0.02882088466625326 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.6890756302521008, + "acc_stderr": 0.03006676158297793, + "acc_norm": 0.6890756302521008, + "acc_norm_stderr": 0.03006676158297793 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.3443708609271523, + "acc_stderr": 0.038796870240733264, + "acc_norm": 0.3443708609271523, + "acc_norm_stderr": 0.038796870240733264 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.8458715596330275, + "acc_stderr": 0.015480826865374303, + "acc_norm": 0.8458715596330275, + "acc_norm_stderr": 0.015480826865374303 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.5277777777777778, + "acc_stderr": 0.0340470532865388, + "acc_norm": 0.5277777777777778, + "acc_norm_stderr": 0.0340470532865388 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.8333333333333334, + "acc_stderr": 0.02615686752393104, + "acc_norm": 0.8333333333333334, + "acc_norm_stderr": 0.02615686752393104 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.8059071729957806, + "acc_stderr": 0.025744902532290913, + "acc_norm": 0.8059071729957806, + "acc_norm_stderr": 0.025744902532290913 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.6771300448430493, + "acc_stderr": 0.031381476375754995, + "acc_norm": 0.6771300448430493, + "acc_norm_stderr": 0.031381476375754995 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.8015267175572519, + "acc_stderr": 0.03498149385462472, + "acc_norm": 0.8015267175572519, + "acc_norm_stderr": 0.03498149385462472 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.7851239669421488, + "acc_stderr": 0.037494924487096966, + "acc_norm": 0.7851239669421488, + "acc_norm_stderr": 0.037494924487096966 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.7870370370370371, + "acc_stderr": 0.0395783547198098, + "acc_norm": 0.7870370370370371, + "acc_norm_stderr": 0.0395783547198098 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.7668711656441718, + "acc_stderr": 0.0332201579577674, + "acc_norm": 0.7668711656441718, + "acc_norm_stderr": 0.0332201579577674 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.4732142857142857, + "acc_stderr": 0.047389751192741546, + "acc_norm": 0.4732142857142857, + "acc_norm_stderr": 
0.047389751192741546 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.7669902912621359, + "acc_stderr": 0.04185832598928315, + "acc_norm": 0.7669902912621359, + "acc_norm_stderr": 0.04185832598928315 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.8717948717948718, + "acc_stderr": 0.02190190511507333, + "acc_norm": 0.8717948717948718, + "acc_norm_stderr": 0.02190190511507333 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.71, + "acc_stderr": 0.045604802157206845, + "acc_norm": 0.71, + "acc_norm_stderr": 0.045604802157206845 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.8339719029374202, + "acc_stderr": 0.0133064782430663, + "acc_norm": 0.8339719029374202, + "acc_norm_stderr": 0.0133064782430663 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.7254335260115607, + "acc_stderr": 0.024027745155265026, + "acc_norm": 0.7254335260115607, + "acc_norm_stderr": 0.024027745155265026 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.4770949720670391, + "acc_stderr": 0.016704945740326185, + "acc_norm": 0.4770949720670391, + "acc_norm_stderr": 0.016704945740326185 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.7058823529411765, + "acc_stderr": 0.026090162504279053, + "acc_norm": 0.7058823529411765, + "acc_norm_stderr": 0.026090162504279053 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.7266881028938906, + "acc_stderr": 0.02531176597542612, + "acc_norm": 0.7266881028938906, + "acc_norm_stderr": 0.02531176597542612 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.7561728395061729, + "acc_stderr": 0.02389187954195961, + "acc_norm": 0.7561728395061729, + "acc_norm_stderr": 0.02389187954195961 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.475177304964539, + "acc_stderr": 0.029790719243829727, + "acc_norm": 0.475177304964539, + "acc_norm_stderr": 0.029790719243829727 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.46740547588005216, + "acc_stderr": 0.01274307294265335, + "acc_norm": 0.46740547588005216, + "acc_norm_stderr": 0.01274307294265335 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.6801470588235294, + "acc_stderr": 0.02833295951403121, + "acc_norm": 0.6801470588235294, + "acc_norm_stderr": 0.02833295951403121 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.6764705882352942, + "acc_stderr": 0.018926082916083383, + "acc_norm": 0.6764705882352942, + "acc_norm_stderr": 0.018926082916083383 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.7, + "acc_stderr": 0.04389311454644287, + "acc_norm": 0.7, + "acc_norm_stderr": 0.04389311454644287 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.7224489795918367, + "acc_stderr": 0.028666857790274648, + "acc_norm": 0.7224489795918367, + "acc_norm_stderr": 0.028666857790274648 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.8407960199004975, + "acc_stderr": 0.02587064676616913, + "acc_norm": 0.8407960199004975, + "acc_norm_stderr": 0.02587064676616913 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.86, + "acc_stderr": 0.03487350880197769, + "acc_norm": 0.86, + "acc_norm_stderr": 0.03487350880197769 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.536144578313253, + "acc_stderr": 0.038823108508905954, + "acc_norm": 0.536144578313253, + "acc_norm_stderr": 0.038823108508905954 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.8362573099415205, + "acc_stderr": 0.028380919596145866, + "acc_norm": 
0.8362573099415205, + "acc_norm_stderr": 0.028380919596145866 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.5483476132190942, + "mc1_stderr": 0.01742148030027764, + "mc2": 0.6965131744948723, + "mc2_stderr": 0.01496885686799417 + }, + "harness|winogrande|5": { + "acc": 0.8232044198895028, + "acc_stderr": 0.010721923287918744 + }, + "harness|gsm8k|5": { + "acc": 0.686125852918878, + "acc_stderr": 0.012782681251053198 + }, + "all": { + "acc": 0.6550149914285618, + "acc_stderr": 0.03201980400748622, + "acc_norm": 0.6550580572754082, + "acc_norm_stderr": 0.03267824279371986, + "mc1": 0.5483476132190942, + "mc1_stderr": 0.01742148030027764, + "mc2": 0.6965131744948723, + "mc2_stderr": 0.01496885686799417 + } + }, + "versions": { + "all": 0, + "harness|arc:challenge|25": 0, + "harness|gsm8k|5": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 
1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "harness|winogrande|5": 0 + }, + "config_tasks": { + "harness|arc:challenge": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + 
"harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|arc:challenge|25": { + "hashes": { + "hash_examples": "17b0cae357c0259e", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "9bcd0d1d37471713", + "hash_cont_tokens": "289aa98c400841d8" + }, + "truncated": 0, + "non_truncated": 1172, + "padded": 4670, + "non_padded": 17, + "effective_few_shots": 25.0, + "num_truncated_few_shots": 0 + }, + "harness|hellaswag|10": { + "hashes": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "80b8c6d79740318e", + "hash_cont_tokens": "ac460260c3e6efc9" + }, + "truncated": 0, + "non_truncated": 10042, + "padded": 40101, + "non_padded": 67, + "effective_few_shots": 10.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hashes": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "b813d36287c6556c", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-anatomy|5": { + "hashes": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "09dc2380497f7a47", + "hash_cont_tokens": "a52a4f60d98cbe5c" + }, + "truncated": 0, + "non_truncated": 135, + "padded": 540, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-astronomy|5": { + "hashes": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "68ca3220b0fdd1f3", + "hash_cont_tokens": "10f7d8eeba97841d" + }, + "truncated": 0, + "non_truncated": 152, + "padded": 608, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-business_ethics|5": { + "hashes": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "bd14ef1320de241e", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hashes": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "d96186ab98017c43", + "hash_cont_tokens": "edef9975ba9165b5" + }, + "truncated": 0, + "non_truncated": 265, + "padded": 1060, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_biology|5": { + "hashes": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "424136b34e95b200", + "hash_cont_tokens": "0aa103ec6602280b" + }, + "truncated": 0, + "non_truncated": 144, + "padded": 576, + "non_padded": 0, + "effective_few_shots": 5.0, + 
"num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_chemistry|5": { + "hashes": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "8dd8b80e336bbe54", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_computer_science|5": { + "hashes": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "145d4cef8ca2261d", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_mathematics|5": { + "hashes": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "561995d32d2b25c4", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_medicine|5": { + "hashes": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "6a258a9d4418599c", + "hash_cont_tokens": "1979021dbc698754" + }, + "truncated": 0, + "non_truncated": 173, + "padded": 692, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_physics|5": { + "hashes": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "fa5e0d5b5f97b66a", + "hash_cont_tokens": "7cf7fe2bab00acbd" + }, + "truncated": 0, + "non_truncated": 102, + "padded": 408, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-computer_security|5": { + "hashes": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "07d27397edfae492", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hashes": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "da5e6c3c8eb17da6", + "hash_cont_tokens": "903f64eed2b0d217" + }, + "truncated": 0, + "non_truncated": 235, + "padded": 940, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-econometrics|5": { + "hashes": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "f6ba8e358bdb523e", + "hash_cont_tokens": "721ae6c5302c4bf2" + }, + "truncated": 0, + "non_truncated": 114, + "padded": 456, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hashes": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "b2459da4c5ca8590", + "hash_cont_tokens": "15a738960ed3e587" + }, + "truncated": 0, + "non_truncated": 145, + "padded": 575, + "non_padded": 5, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hashes": { + "hash_examples": "fc48208a5ac1c0ce", + 
"hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "0b969d9ad706a13a", + "hash_cont_tokens": "c96470462fc71683" + }, + "truncated": 0, + "non_truncated": 378, + "padded": 1512, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-formal_logic|5": { + "hashes": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "02bc3eb5f90da86e", + "hash_cont_tokens": "0e1ce025c9d6ee7e" + }, + "truncated": 0, + "non_truncated": 126, + "padded": 504, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-global_facts|5": { + "hashes": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "3d5106918bcbeb43", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_biology|5": { + "hashes": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "7b089392db2dabbd", + "hash_cont_tokens": "e34d57f7d3c4ca16" + }, + "truncated": 0, + "non_truncated": 310, + "padded": 1240, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hashes": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "ba90b2ffed1c067d", + "hash_cont_tokens": "e8482d44df4b3740" + }, + "truncated": 0, + "non_truncated": 203, + "padded": 812, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hashes": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "60eeec309ef0717f", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hashes": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "5e5e8bf3808e0ead", + "hash_cont_tokens": "d63e679a49418339" + }, + "truncated": 0, + "non_truncated": 165, + "padded": 656, + "non_padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_geography|5": { + "hashes": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "4da9b741d4e7ea78", + "hash_cont_tokens": "d78483e286d06f1a" + }, + "truncated": 0, + "non_truncated": 198, + "padded": 792, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hashes": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "acb4bc872ac86ed7", + "hash_cont_tokens": "691cdff71ff5fe57" + }, + "truncated": 0, + "non_truncated": 193, + "padded": 772, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hashes": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "840fc6403eb69ab0", + 
"hash_cont_tokens": "d5ad4c5bdca967ad" + }, + "truncated": 0, + "non_truncated": 390, + "padded": 1560, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hashes": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "3629a7f2cd17faeb", + "hash_cont_tokens": "8f631ca5687dd0d4" + }, + "truncated": 0, + "non_truncated": 270, + "padded": 1080, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hashes": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "6846f684260e3997", + "hash_cont_tokens": "7321048a28451473" + }, + "truncated": 0, + "non_truncated": 238, + "padded": 952, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_physics|5": { + "hashes": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "85aee25d6bdad94a", + "hash_cont_tokens": "bb137581f269861c" + }, + "truncated": 0, + "non_truncated": 151, + "padded": 604, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hashes": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "290b66d6d666a35f", + "hash_cont_tokens": "b455cab2675bd863" + }, + "truncated": 0, + "non_truncated": 545, + "padded": 2180, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hashes": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "a77a7668b437bc82", + "hash_cont_tokens": "1b3196fec7e58037" + }, + "truncated": 0, + "non_truncated": 216, + "padded": 864, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hashes": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "63548c7fa9ba7a78", + "hash_cont_tokens": "a331dedc2aa01b3e" + }, + "truncated": 0, + "non_truncated": 204, + "padded": 816, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hashes": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "83c5da18bfa50812", + "hash_cont_tokens": "d0fbe030b8c8c2bf" + }, + "truncated": 0, + "non_truncated": 237, + "padded": 948, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_aging|5": { + "hashes": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "bebbd11f22006685", + "hash_cont_tokens": "1dd29c3755494850" + }, + "truncated": 0, + "non_truncated": 223, + "padded": 892, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_sexuality|5": { + "hashes": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "7b85ee9b8ee54f4f", + "hash_cont_tokens": "c85573f663c10691" + }, + "truncated": 0, + "non_truncated": 131, + "padded": 524, + 
"non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-international_law|5": { + "hashes": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "7bfc55ab7065943e", + "hash_cont_tokens": "d263804ba918154f" + }, + "truncated": 0, + "non_truncated": 121, + "padded": 484, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-jurisprudence|5": { + "hashes": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "69573f1675e053c6", + "hash_cont_tokens": "581986691a84ece8" + }, + "truncated": 0, + "non_truncated": 108, + "padded": 432, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hashes": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "552324ef20094bdc", + "hash_cont_tokens": "55a858b28bbda458" + }, + "truncated": 0, + "non_truncated": 163, + "padded": 652, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-machine_learning|5": { + "hashes": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "96449357a7318905", + "hash_cont_tokens": "e99d3d3efd4ac7a3" + }, + "truncated": 0, + "non_truncated": 112, + "padded": 448, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-management|5": { + "hashes": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "3b849249168e3b88", + "hash_cont_tokens": "13d9dc56bca34726" + }, + "truncated": 0, + "non_truncated": 103, + "padded": 412, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-marketing|5": { + "hashes": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "af0e186f2756b70d", + "hash_cont_tokens": "2700ea26933916a2" + }, + "truncated": 0, + "non_truncated": 234, + "padded": 936, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-medical_genetics|5": { + "hashes": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "9f6a6de16509b6d9", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-miscellaneous|5": { + "hashes": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "9194406d589f7c10", + "hash_cont_tokens": "7bf4341c79587250" + }, + "truncated": 0, + "non_truncated": 783, + "padded": 3132, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_disputes|5": { + "hashes": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "769486efc74d9f8e", + "hash_cont_tokens": "38a48e9de6976f00" + }, + "truncated": 0, + "non_truncated": 346, + "padded": 1384, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hashes": { + "hash_examples": 
"9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "a90fd4dd90959dad", + "hash_cont_tokens": "761c4dc187689d89" + }, + "truncated": 0, + "non_truncated": 895, + "padded": 3580, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-nutrition|5": { + "hashes": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "1a3b843e66efd29b", + "hash_cont_tokens": "65005bd7d6f6012a" + }, + "truncated": 0, + "non_truncated": 306, + "padded": 1224, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-philosophy|5": { + "hashes": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "09820001a3d00013", + "hash_cont_tokens": "0b47934fb6314dec" + }, + "truncated": 0, + "non_truncated": 311, + "padded": 1244, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-prehistory|5": { + "hashes": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "7c4ec364ce2768c7", + "hash_cont_tokens": "3f20acd855ee0a29" + }, + "truncated": 0, + "non_truncated": 324, + "padded": 1296, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_accounting|5": { + "hashes": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "ced0534574d0ae3f", + "hash_cont_tokens": "8f122ba881355d4b" + }, + "truncated": 0, + "non_truncated": 282, + "padded": 1128, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_law|5": { + "hashes": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "bcbdbbde22ec73e3", + "hash_cont_tokens": "90d5df417c4d3fd3" + }, + "truncated": 0, + "non_truncated": 1534, + "padded": 6136, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_medicine|5": { + "hashes": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "c54d753563114d45", + "hash_cont_tokens": "4a2d2988884f7f70" + }, + "truncated": 0, + "non_truncated": 272, + "padded": 1088, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_psychology|5": { + "hashes": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "b75dc55c0e32fa52", + "hash_cont_tokens": "e0a952cb8a9c81de" + }, + "truncated": 0, + "non_truncated": 612, + "padded": 2448, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-public_relations|5": { + "hashes": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "5ccdc8ec8db99622", + "hash_cont_tokens": "1fa77a8dff3922b8" + }, + "truncated": 0, + "non_truncated": 110, + "padded": 440, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-security_studies|5": { + "hashes": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "ca8497342e5b1d57", + "hash_cont_tokens": "81fc9cb3cbdd52db" + }, 
+ "truncated": 0, + "non_truncated": 245, + "padded": 980, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-sociology|5": { + "hashes": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "069c76424fbd3dab", + "hash_cont_tokens": "2a0493252ed2cf43" + }, + "truncated": 0, + "non_truncated": 201, + "padded": 804, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hashes": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "a7e393a626169576", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-virology|5": { + "hashes": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "bf99dc973e3a650d", + "hash_cont_tokens": "5ab892d003b00c98" + }, + "truncated": 0, + "non_truncated": 166, + "padded": 664, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-world_religions|5": { + "hashes": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "1761cfaf21797065", + "hash_cont_tokens": "15a5e5dbdfbb8568" + }, + "truncated": 0, + "non_truncated": 171, + "padded": 684, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|truthfulqa:mc|0": { + "hashes": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "298b43914bbdf4ca", + "hash_cont_tokens": "5a8d4bb398b1c3c0" + }, + "truncated": 0, + "non_truncated": 817, + "padded": 9996, + "non_padded": 0, + "effective_few_shots": 0.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "31aa3477d959f771", + "hash_cont_tokens": "618558fb93c0f288" + }, + "truncated": 0, + "non_truncated": 1267, + "padded": 2534, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "6af0ae8cfe684f50", + "hash_cont_tokens": "0fdae49d009f035e" + }, + "truncated": 0, + "non_truncated": 1319, + "padded": 0, + "non_padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "3b7fa57a057f9415", + "hash_full_prompts": "63615fc50fc9417c", + "hash_input_tokens": "9c04e828ae29cacc", + "hash_cont_tokens": "6560808d7d71fa12" + }, + "truncated": 0, + "non_truncated": 28659, + "padded": 113460, + "non_padded": 1412, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/GreenNode/GreenNodeLM-7B-v4leo/results_2023-12-16T20-58-30.002770.json b/eval-results/GreenNode/GreenNodeLM-7B-v4leo/results_2023-12-16T20-58-30.002770.json new file mode 100644 index 0000000000000000000000000000000000000000..e08ef9f7aace4cf993207ecef1848098a2a0e81d --- /dev/null +++ b/eval-results/GreenNode/GreenNodeLM-7B-v4leo/results_2023-12-16T20-58-30.002770.json @@ -0,0 +1,1409 @@ +{ + "config_general": { + "lighteval_sha": 
"0e4607eff593f6f842aeaa0e5fa6760f58b9d1e9", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "", + "start_time": 384320.191319024, + "end_time": 391267.296963998, + "total_evaluation_time_secondes": "6947.105644973984", + "model_name": "GreenNode/GreenNodeLM-7B-v4leo", + "model_sha": "9286f6fac1df497203e110070322c93dab33fdd2", + "model_dtype": "torch.float16", + "model_size": "13.99 GB" + }, + "results": { + "harness|arc:challenge|25": { + "acc": 0.6825938566552902, + "acc_stderr": 0.013602239088038167, + "acc_norm": 0.712457337883959, + "acc_norm_stderr": 0.013226719056266127 + }, + "harness|hellaswag|10": { + "acc": 0.7104162517426807, + "acc_stderr": 0.004526422125860673, + "acc_norm": 0.8823939454291974, + "acc_norm_stderr": 0.0032148270694168255 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.35, + "acc_stderr": 0.0479372485441102, + "acc_norm": 0.35, + "acc_norm_stderr": 0.0479372485441102 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.6666666666666666, + "acc_stderr": 0.04072314811876837, + "acc_norm": 0.6666666666666666, + "acc_norm_stderr": 0.04072314811876837 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.6973684210526315, + "acc_stderr": 0.037385206761196686, + "acc_norm": 0.6973684210526315, + "acc_norm_stderr": 0.037385206761196686 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.61, + "acc_stderr": 0.04902071300001975, + "acc_norm": 0.61, + "acc_norm_stderr": 0.04902071300001975 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.720754716981132, + "acc_stderr": 0.027611163402399715, + "acc_norm": 0.720754716981132, + "acc_norm_stderr": 0.027611163402399715 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.7638888888888888, + "acc_stderr": 0.03551446610810826, + "acc_norm": 0.7638888888888888, + "acc_norm_stderr": 0.03551446610810826 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.46, + "acc_stderr": 0.05009082659620333, + "acc_norm": 0.46, + "acc_norm_stderr": 0.05009082659620333 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.55, + "acc_stderr": 0.05, + "acc_norm": 0.55, + "acc_norm_stderr": 0.05 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.29, + "acc_stderr": 0.04560480215720684, + "acc_norm": 0.29, + "acc_norm_stderr": 0.04560480215720684 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.6705202312138728, + "acc_stderr": 0.03583901754736412, + "acc_norm": 0.6705202312138728, + "acc_norm_stderr": 0.03583901754736412 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.43137254901960786, + "acc_stderr": 0.04928099597287534, + "acc_norm": 0.43137254901960786, + "acc_norm_stderr": 0.04928099597287534 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.78, + "acc_stderr": 0.04163331998932263, + "acc_norm": 0.78, + "acc_norm_stderr": 0.04163331998932263 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.574468085106383, + "acc_stderr": 0.03232146916224468, + "acc_norm": 0.574468085106383, + "acc_norm_stderr": 0.03232146916224468 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.49122807017543857, + "acc_stderr": 0.04702880432049615, + "acc_norm": 0.49122807017543857, + "acc_norm_stderr": 0.04702880432049615 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.5793103448275863, + "acc_stderr": 0.0411391498118926, + "acc_norm": 0.5793103448275863, + "acc_norm_stderr": 0.0411391498118926 + }, + 
"harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.42328042328042326, + "acc_stderr": 0.025446365634406783, + "acc_norm": 0.42328042328042326, + "acc_norm_stderr": 0.025446365634406783 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.46825396825396826, + "acc_stderr": 0.04463112720677172, + "acc_norm": 0.46825396825396826, + "acc_norm_stderr": 0.04463112720677172 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.32, + "acc_stderr": 0.04688261722621504, + "acc_norm": 0.32, + "acc_norm_stderr": 0.04688261722621504 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.7870967741935484, + "acc_stderr": 0.023287665127268542, + "acc_norm": 0.7870967741935484, + "acc_norm_stderr": 0.023287665127268542 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.5123152709359606, + "acc_stderr": 0.035169204442208966, + "acc_norm": 0.5123152709359606, + "acc_norm_stderr": 0.035169204442208966 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.72, + "acc_stderr": 0.04512608598542127, + "acc_norm": 0.72, + "acc_norm_stderr": 0.04512608598542127 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.7878787878787878, + "acc_stderr": 0.031922715695483, + "acc_norm": 0.7878787878787878, + "acc_norm_stderr": 0.031922715695483 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.7828282828282829, + "acc_stderr": 0.02937661648494563, + "acc_norm": 0.7828282828282829, + "acc_norm_stderr": 0.02937661648494563 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.8860103626943006, + "acc_stderr": 0.022935144053919436, + "acc_norm": 0.8860103626943006, + "acc_norm_stderr": 0.022935144053919436 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.6871794871794872, + "acc_stderr": 0.023507579020645358, + "acc_norm": 0.6871794871794872, + "acc_norm_stderr": 0.023507579020645358 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.337037037037037, + "acc_stderr": 0.02882088466625326, + "acc_norm": 0.337037037037037, + "acc_norm_stderr": 0.02882088466625326 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.6890756302521008, + "acc_stderr": 0.03006676158297793, + "acc_norm": 0.6890756302521008, + "acc_norm_stderr": 0.03006676158297793 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.3443708609271523, + "acc_stderr": 0.038796870240733264, + "acc_norm": 0.3443708609271523, + "acc_norm_stderr": 0.038796870240733264 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.8458715596330275, + "acc_stderr": 0.015480826865374303, + "acc_norm": 0.8458715596330275, + "acc_norm_stderr": 0.015480826865374303 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.5277777777777778, + "acc_stderr": 0.0340470532865388, + "acc_norm": 0.5277777777777778, + "acc_norm_stderr": 0.0340470532865388 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.8333333333333334, + "acc_stderr": 0.02615686752393104, + "acc_norm": 0.8333333333333334, + "acc_norm_stderr": 0.02615686752393104 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.8059071729957806, + "acc_stderr": 0.025744902532290913, + "acc_norm": 0.8059071729957806, + "acc_norm_stderr": 0.025744902532290913 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.6771300448430493, + "acc_stderr": 0.031381476375754995, + "acc_norm": 0.6771300448430493, + "acc_norm_stderr": 0.031381476375754995 + }, 
+ "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.8015267175572519, + "acc_stderr": 0.03498149385462472, + "acc_norm": 0.8015267175572519, + "acc_norm_stderr": 0.03498149385462472 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.7851239669421488, + "acc_stderr": 0.037494924487096966, + "acc_norm": 0.7851239669421488, + "acc_norm_stderr": 0.037494924487096966 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.7870370370370371, + "acc_stderr": 0.0395783547198098, + "acc_norm": 0.7870370370370371, + "acc_norm_stderr": 0.0395783547198098 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.7668711656441718, + "acc_stderr": 0.0332201579577674, + "acc_norm": 0.7668711656441718, + "acc_norm_stderr": 0.0332201579577674 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.4732142857142857, + "acc_stderr": 0.047389751192741546, + "acc_norm": 0.4732142857142857, + "acc_norm_stderr": 0.047389751192741546 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.7669902912621359, + "acc_stderr": 0.04185832598928315, + "acc_norm": 0.7669902912621359, + "acc_norm_stderr": 0.04185832598928315 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.8717948717948718, + "acc_stderr": 0.02190190511507333, + "acc_norm": 0.8717948717948718, + "acc_norm_stderr": 0.02190190511507333 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.71, + "acc_stderr": 0.045604802157206845, + "acc_norm": 0.71, + "acc_norm_stderr": 0.045604802157206845 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.8339719029374202, + "acc_stderr": 0.0133064782430663, + "acc_norm": 0.8339719029374202, + "acc_norm_stderr": 0.0133064782430663 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.7254335260115607, + "acc_stderr": 0.024027745155265026, + "acc_norm": 0.7254335260115607, + "acc_norm_stderr": 0.024027745155265026 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.4770949720670391, + "acc_stderr": 0.016704945740326185, + "acc_norm": 0.4770949720670391, + "acc_norm_stderr": 0.016704945740326185 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.7058823529411765, + "acc_stderr": 0.026090162504279053, + "acc_norm": 0.7058823529411765, + "acc_norm_stderr": 0.026090162504279053 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.7266881028938906, + "acc_stderr": 0.02531176597542612, + "acc_norm": 0.7266881028938906, + "acc_norm_stderr": 0.02531176597542612 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.7561728395061729, + "acc_stderr": 0.02389187954195961, + "acc_norm": 0.7561728395061729, + "acc_norm_stderr": 0.02389187954195961 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.475177304964539, + "acc_stderr": 0.029790719243829727, + "acc_norm": 0.475177304964539, + "acc_norm_stderr": 0.029790719243829727 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.46740547588005216, + "acc_stderr": 0.01274307294265335, + "acc_norm": 0.46740547588005216, + "acc_norm_stderr": 0.01274307294265335 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.6801470588235294, + "acc_stderr": 0.02833295951403121, + "acc_norm": 0.6801470588235294, + "acc_norm_stderr": 0.02833295951403121 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.6764705882352942, + "acc_stderr": 0.018926082916083383, + "acc_norm": 0.6764705882352942, + "acc_norm_stderr": 0.018926082916083383 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.7, + "acc_stderr": 0.04389311454644287, + 
"acc_norm": 0.7, + "acc_norm_stderr": 0.04389311454644287 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.7224489795918367, + "acc_stderr": 0.028666857790274648, + "acc_norm": 0.7224489795918367, + "acc_norm_stderr": 0.028666857790274648 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.8407960199004975, + "acc_stderr": 0.02587064676616913, + "acc_norm": 0.8407960199004975, + "acc_norm_stderr": 0.02587064676616913 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.86, + "acc_stderr": 0.03487350880197769, + "acc_norm": 0.86, + "acc_norm_stderr": 0.03487350880197769 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.536144578313253, + "acc_stderr": 0.038823108508905954, + "acc_norm": 0.536144578313253, + "acc_norm_stderr": 0.038823108508905954 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.8362573099415205, + "acc_stderr": 0.028380919596145866, + "acc_norm": 0.8362573099415205, + "acc_norm_stderr": 0.028380919596145866 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.5483476132190942, + "mc1_stderr": 0.01742148030027764, + "mc2": 0.6965131744948723, + "mc2_stderr": 0.01496885686799417 + }, + "harness|winogrande|5": { + "acc": 0.8232044198895028, + "acc_stderr": 0.010721923287918744 + }, + "harness|gsm8k|5": { + "acc": 0.686125852918878, + "acc_stderr": 0.012782681251053198 + }, + "all": { + "acc": 0.6550149914285618, + "acc_stderr": 0.03201980400748622, + "acc_norm": 0.6550580572754082, + "acc_norm_stderr": 0.03267824279371986, + "mc1": 0.5483476132190942, + "mc1_stderr": 0.01742148030027764, + "mc2": 0.6965131744948723, + "mc2_stderr": 0.01496885686799417 + } + }, + "versions": { + "all": 0, + "harness|arc:challenge|25": 0, + "harness|gsm8k|5": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + 
"harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "harness|winogrande|5": 0 + }, + "config_tasks": { + "harness|arc:challenge": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + 
"harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|arc:challenge|25": { + "hashes": { + "hash_examples": "17b0cae357c0259e", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "9bcd0d1d37471713", + "hash_cont_tokens": "289aa98c400841d8" + }, + "truncated": 0, + "non_truncated": 1172, + "padded": 4670, + "non_padded": 17, + "effective_few_shots": 25.0, + "num_truncated_few_shots": 0 + }, + "harness|hellaswag|10": { + "hashes": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "80b8c6d79740318e", + "hash_cont_tokens": "ac460260c3e6efc9" + }, + "truncated": 0, + "non_truncated": 10042, + "padded": 40101, + "non_padded": 67, + "effective_few_shots": 10.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hashes": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "b813d36287c6556c", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-anatomy|5": { + "hashes": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "09dc2380497f7a47", + "hash_cont_tokens": "a52a4f60d98cbe5c" + }, + "truncated": 0, + "non_truncated": 135, + "padded": 540, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-astronomy|5": { + "hashes": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "68ca3220b0fdd1f3", + "hash_cont_tokens": "10f7d8eeba97841d" + }, + "truncated": 0, + "non_truncated": 152, + "padded": 608, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-business_ethics|5": { + "hashes": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "bd14ef1320de241e", 
+ "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hashes": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "d96186ab98017c43", + "hash_cont_tokens": "edef9975ba9165b5" + }, + "truncated": 0, + "non_truncated": 265, + "padded": 1060, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_biology|5": { + "hashes": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "424136b34e95b200", + "hash_cont_tokens": "0aa103ec6602280b" + }, + "truncated": 0, + "non_truncated": 144, + "padded": 576, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_chemistry|5": { + "hashes": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "8dd8b80e336bbe54", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_computer_science|5": { + "hashes": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "145d4cef8ca2261d", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_mathematics|5": { + "hashes": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "561995d32d2b25c4", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_medicine|5": { + "hashes": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "6a258a9d4418599c", + "hash_cont_tokens": "1979021dbc698754" + }, + "truncated": 0, + "non_truncated": 173, + "padded": 692, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_physics|5": { + "hashes": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "fa5e0d5b5f97b66a", + "hash_cont_tokens": "7cf7fe2bab00acbd" + }, + "truncated": 0, + "non_truncated": 102, + "padded": 408, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-computer_security|5": { + "hashes": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "07d27397edfae492", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hashes": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "da5e6c3c8eb17da6", + "hash_cont_tokens": "903f64eed2b0d217" + }, + "truncated": 0, + "non_truncated": 235, + "padded": 940, + "non_padded": 0, + 
"effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-econometrics|5": { + "hashes": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "f6ba8e358bdb523e", + "hash_cont_tokens": "721ae6c5302c4bf2" + }, + "truncated": 0, + "non_truncated": 114, + "padded": 456, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hashes": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "b2459da4c5ca8590", + "hash_cont_tokens": "15a738960ed3e587" + }, + "truncated": 0, + "non_truncated": 145, + "padded": 575, + "non_padded": 5, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hashes": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "0b969d9ad706a13a", + "hash_cont_tokens": "c96470462fc71683" + }, + "truncated": 0, + "non_truncated": 378, + "padded": 1512, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-formal_logic|5": { + "hashes": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "02bc3eb5f90da86e", + "hash_cont_tokens": "0e1ce025c9d6ee7e" + }, + "truncated": 0, + "non_truncated": 126, + "padded": 504, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-global_facts|5": { + "hashes": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "3d5106918bcbeb43", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_biology|5": { + "hashes": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "7b089392db2dabbd", + "hash_cont_tokens": "e34d57f7d3c4ca16" + }, + "truncated": 0, + "non_truncated": 310, + "padded": 1240, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hashes": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "ba90b2ffed1c067d", + "hash_cont_tokens": "e8482d44df4b3740" + }, + "truncated": 0, + "non_truncated": 203, + "padded": 812, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hashes": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "60eeec309ef0717f", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hashes": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "5e5e8bf3808e0ead", + "hash_cont_tokens": "d63e679a49418339" + }, + "truncated": 0, + "non_truncated": 165, + "padded": 656, + "non_padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_geography|5": { + 
"hashes": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "4da9b741d4e7ea78", + "hash_cont_tokens": "d78483e286d06f1a" + }, + "truncated": 0, + "non_truncated": 198, + "padded": 792, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hashes": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "acb4bc872ac86ed7", + "hash_cont_tokens": "691cdff71ff5fe57" + }, + "truncated": 0, + "non_truncated": 193, + "padded": 772, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hashes": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "840fc6403eb69ab0", + "hash_cont_tokens": "d5ad4c5bdca967ad" + }, + "truncated": 0, + "non_truncated": 390, + "padded": 1560, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hashes": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "3629a7f2cd17faeb", + "hash_cont_tokens": "8f631ca5687dd0d4" + }, + "truncated": 0, + "non_truncated": 270, + "padded": 1080, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hashes": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "6846f684260e3997", + "hash_cont_tokens": "7321048a28451473" + }, + "truncated": 0, + "non_truncated": 238, + "padded": 952, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_physics|5": { + "hashes": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "85aee25d6bdad94a", + "hash_cont_tokens": "bb137581f269861c" + }, + "truncated": 0, + "non_truncated": 151, + "padded": 604, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hashes": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "290b66d6d666a35f", + "hash_cont_tokens": "b455cab2675bd863" + }, + "truncated": 0, + "non_truncated": 545, + "padded": 2180, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hashes": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "a77a7668b437bc82", + "hash_cont_tokens": "1b3196fec7e58037" + }, + "truncated": 0, + "non_truncated": 216, + "padded": 864, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hashes": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "63548c7fa9ba7a78", + "hash_cont_tokens": "a331dedc2aa01b3e" + }, + "truncated": 0, + "non_truncated": 204, + "padded": 816, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hashes": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": 
"11845057459afd72", + "hash_input_tokens": "83c5da18bfa50812", + "hash_cont_tokens": "d0fbe030b8c8c2bf" + }, + "truncated": 0, + "non_truncated": 237, + "padded": 948, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_aging|5": { + "hashes": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "bebbd11f22006685", + "hash_cont_tokens": "1dd29c3755494850" + }, + "truncated": 0, + "non_truncated": 223, + "padded": 892, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_sexuality|5": { + "hashes": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "7b85ee9b8ee54f4f", + "hash_cont_tokens": "c85573f663c10691" + }, + "truncated": 0, + "non_truncated": 131, + "padded": 524, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-international_law|5": { + "hashes": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "7bfc55ab7065943e", + "hash_cont_tokens": "d263804ba918154f" + }, + "truncated": 0, + "non_truncated": 121, + "padded": 484, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-jurisprudence|5": { + "hashes": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "69573f1675e053c6", + "hash_cont_tokens": "581986691a84ece8" + }, + "truncated": 0, + "non_truncated": 108, + "padded": 432, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hashes": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "552324ef20094bdc", + "hash_cont_tokens": "55a858b28bbda458" + }, + "truncated": 0, + "non_truncated": 163, + "padded": 652, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-machine_learning|5": { + "hashes": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "96449357a7318905", + "hash_cont_tokens": "e99d3d3efd4ac7a3" + }, + "truncated": 0, + "non_truncated": 112, + "padded": 448, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-management|5": { + "hashes": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "3b849249168e3b88", + "hash_cont_tokens": "13d9dc56bca34726" + }, + "truncated": 0, + "non_truncated": 103, + "padded": 412, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-marketing|5": { + "hashes": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "af0e186f2756b70d", + "hash_cont_tokens": "2700ea26933916a2" + }, + "truncated": 0, + "non_truncated": 234, + "padded": 936, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-medical_genetics|5": { + "hashes": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "9f6a6de16509b6d9", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + 
"non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-miscellaneous|5": { + "hashes": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "9194406d589f7c10", + "hash_cont_tokens": "7bf4341c79587250" + }, + "truncated": 0, + "non_truncated": 783, + "padded": 3132, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_disputes|5": { + "hashes": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "769486efc74d9f8e", + "hash_cont_tokens": "38a48e9de6976f00" + }, + "truncated": 0, + "non_truncated": 346, + "padded": 1384, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hashes": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "a90fd4dd90959dad", + "hash_cont_tokens": "761c4dc187689d89" + }, + "truncated": 0, + "non_truncated": 895, + "padded": 3580, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-nutrition|5": { + "hashes": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "1a3b843e66efd29b", + "hash_cont_tokens": "65005bd7d6f6012a" + }, + "truncated": 0, + "non_truncated": 306, + "padded": 1224, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-philosophy|5": { + "hashes": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "09820001a3d00013", + "hash_cont_tokens": "0b47934fb6314dec" + }, + "truncated": 0, + "non_truncated": 311, + "padded": 1244, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-prehistory|5": { + "hashes": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "7c4ec364ce2768c7", + "hash_cont_tokens": "3f20acd855ee0a29" + }, + "truncated": 0, + "non_truncated": 324, + "padded": 1296, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_accounting|5": { + "hashes": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "ced0534574d0ae3f", + "hash_cont_tokens": "8f122ba881355d4b" + }, + "truncated": 0, + "non_truncated": 282, + "padded": 1128, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_law|5": { + "hashes": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "bcbdbbde22ec73e3", + "hash_cont_tokens": "90d5df417c4d3fd3" + }, + "truncated": 0, + "non_truncated": 1534, + "padded": 6136, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_medicine|5": { + "hashes": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "c54d753563114d45", + "hash_cont_tokens": "4a2d2988884f7f70" + }, + "truncated": 0, + "non_truncated": 272, + "padded": 1088, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_psychology|5": { + "hashes": { + 
"hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "b75dc55c0e32fa52", + "hash_cont_tokens": "e0a952cb8a9c81de" + }, + "truncated": 0, + "non_truncated": 612, + "padded": 2448, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-public_relations|5": { + "hashes": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "5ccdc8ec8db99622", + "hash_cont_tokens": "1fa77a8dff3922b8" + }, + "truncated": 0, + "non_truncated": 110, + "padded": 440, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-security_studies|5": { + "hashes": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "ca8497342e5b1d57", + "hash_cont_tokens": "81fc9cb3cbdd52db" + }, + "truncated": 0, + "non_truncated": 245, + "padded": 980, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-sociology|5": { + "hashes": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "069c76424fbd3dab", + "hash_cont_tokens": "2a0493252ed2cf43" + }, + "truncated": 0, + "non_truncated": 201, + "padded": 804, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hashes": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "a7e393a626169576", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-virology|5": { + "hashes": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "bf99dc973e3a650d", + "hash_cont_tokens": "5ab892d003b00c98" + }, + "truncated": 0, + "non_truncated": 166, + "padded": 664, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-world_religions|5": { + "hashes": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "1761cfaf21797065", + "hash_cont_tokens": "15a5e5dbdfbb8568" + }, + "truncated": 0, + "non_truncated": 171, + "padded": 684, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|truthfulqa:mc|0": { + "hashes": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "298b43914bbdf4ca", + "hash_cont_tokens": "5a8d4bb398b1c3c0" + }, + "truncated": 0, + "non_truncated": 817, + "padded": 9996, + "non_padded": 0, + "effective_few_shots": 0.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "31aa3477d959f771", + "hash_cont_tokens": "618558fb93c0f288" + }, + "truncated": 0, + "non_truncated": 1267, + "padded": 2534, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "6af0ae8cfe684f50", + "hash_cont_tokens": "0fdae49d009f035e" + }, + "truncated": 0, + "non_truncated": 1319, + "padded": 0, + 
"non_padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "3b7fa57a057f9415", + "hash_full_prompts": "63615fc50fc9417c", + "hash_input_tokens": "9c04e828ae29cacc", + "hash_cont_tokens": "6560808d7d71fa12" + }, + "truncated": 0, + "non_truncated": 28659, + "padded": 113460, + "non_padded": 1412, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/GreenNode/GreenNodeLM-v3olet-7B/results_2023-12-16T20-49-02.259410.json b/eval-results/GreenNode/GreenNodeLM-v3olet-7B/results_2023-12-16T20-49-02.259410.json new file mode 100644 index 0000000000000000000000000000000000000000..8d8acee941927f72a9c594f4d284607427d19596 --- /dev/null +++ b/eval-results/GreenNode/GreenNodeLM-v3olet-7B/results_2023-12-16T20-49-02.259410.json @@ -0,0 +1,1409 @@ +{ + "config_general": { + "lighteval_sha": "0e4607eff593f6f842aeaa0e5fa6760f58b9d1e9", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "", + "start_time": 383629.474194233, + "end_time": 390708.234124646, + "total_evaluation_time_secondes": "7078.7599304129835", + "model_name": "GreenNode/GreenNodeLM-v3olet-7B", + "model_sha": "94b36a4573657d7815f55b917b204e6b73f7a634", + "model_dtype": "torch.float16", + "model_size": "13.99 GB" + }, + "results": { + "harness|arc:challenge|25": { + "acc": 0.6919795221843004, + "acc_stderr": 0.013491429517292038, + "acc_norm": 0.7226962457337884, + "acc_norm_stderr": 0.013082095839059376 + }, + "harness|hellaswag|10": { + "acc": 0.7102170882294364, + "acc_stderr": 0.004527343651130799, + "acc_norm": 0.8824935271858195, + "acc_norm_stderr": 0.0032136470410029463 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.34, + "acc_stderr": 0.04760952285695235, + "acc_norm": 0.34, + "acc_norm_stderr": 0.04760952285695235 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.6444444444444445, + "acc_stderr": 0.04135176749720386, + "acc_norm": 0.6444444444444445, + "acc_norm_stderr": 0.04135176749720386 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.6907894736842105, + "acc_stderr": 0.037610708698674805, + "acc_norm": 0.6907894736842105, + "acc_norm_stderr": 0.037610708698674805 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.62, + "acc_stderr": 0.048783173121456316, + "acc_norm": 0.62, + "acc_norm_stderr": 0.048783173121456316 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.7245283018867924, + "acc_stderr": 0.02749566368372406, + "acc_norm": 0.7245283018867924, + "acc_norm_stderr": 0.02749566368372406 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.7777777777777778, + "acc_stderr": 0.03476590104304134, + "acc_norm": 0.7777777777777778, + "acc_norm_stderr": 0.03476590104304134 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.47, + "acc_stderr": 0.050161355804659205, + "acc_norm": 0.47, + "acc_norm_stderr": 0.050161355804659205 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.55, + "acc_stderr": 0.05, + "acc_norm": 0.55, + "acc_norm_stderr": 0.05 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.32, + "acc_stderr": 0.046882617226215034, + "acc_norm": 0.32, + "acc_norm_stderr": 0.046882617226215034 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.6936416184971098, + "acc_stderr": 0.035149425512674394, + "acc_norm": 0.6936416184971098, + "acc_norm_stderr": 0.035149425512674394 + }, + 
"harness|hendrycksTest-college_physics|5": { + "acc": 0.46078431372549017, + "acc_stderr": 0.04959859966384181, + "acc_norm": 0.46078431372549017, + "acc_norm_stderr": 0.04959859966384181 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.77, + "acc_stderr": 0.042295258468165065, + "acc_norm": 0.77, + "acc_norm_stderr": 0.042295258468165065 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.5829787234042553, + "acc_stderr": 0.03223276266711712, + "acc_norm": 0.5829787234042553, + "acc_norm_stderr": 0.03223276266711712 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.5087719298245614, + "acc_stderr": 0.04702880432049615, + "acc_norm": 0.5087719298245614, + "acc_norm_stderr": 0.04702880432049615 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.5793103448275863, + "acc_stderr": 0.0411391498118926, + "acc_norm": 0.5793103448275863, + "acc_norm_stderr": 0.0411391498118926 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.42857142857142855, + "acc_stderr": 0.02548718714785938, + "acc_norm": 0.42857142857142855, + "acc_norm_stderr": 0.02548718714785938 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.46825396825396826, + "acc_stderr": 0.04463112720677172, + "acc_norm": 0.46825396825396826, + "acc_norm_stderr": 0.04463112720677172 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.34, + "acc_stderr": 0.04760952285695235, + "acc_norm": 0.34, + "acc_norm_stderr": 0.04760952285695235 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.7838709677419354, + "acc_stderr": 0.02341529343356853, + "acc_norm": 0.7838709677419354, + "acc_norm_stderr": 0.02341529343356853 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.5221674876847291, + "acc_stderr": 0.03514528562175007, + "acc_norm": 0.5221674876847291, + "acc_norm_stderr": 0.03514528562175007 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.71, + "acc_stderr": 0.045604802157206845, + "acc_norm": 0.71, + "acc_norm_stderr": 0.045604802157206845 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.7757575757575758, + "acc_stderr": 0.03256866661681102, + "acc_norm": 0.7757575757575758, + "acc_norm_stderr": 0.03256866661681102 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.7929292929292929, + "acc_stderr": 0.028869778460267042, + "acc_norm": 0.7929292929292929, + "acc_norm_stderr": 0.028869778460267042 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.8963730569948186, + "acc_stderr": 0.02199531196364424, + "acc_norm": 0.8963730569948186, + "acc_norm_stderr": 0.02199531196364424 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.6666666666666666, + "acc_stderr": 0.023901157979402538, + "acc_norm": 0.6666666666666666, + "acc_norm_stderr": 0.023901157979402538 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.32592592592592595, + "acc_stderr": 0.028578348365473082, + "acc_norm": 0.32592592592592595, + "acc_norm_stderr": 0.028578348365473082 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.6680672268907563, + "acc_stderr": 0.03058869701378364, + "acc_norm": 0.6680672268907563, + "acc_norm_stderr": 0.03058869701378364 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.3443708609271523, + "acc_stderr": 0.038796870240733264, + "acc_norm": 0.3443708609271523, + "acc_norm_stderr": 0.038796870240733264 + }, + 
"harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.8532110091743119, + "acc_stderr": 0.01517314184512625, + "acc_norm": 0.8532110091743119, + "acc_norm_stderr": 0.01517314184512625 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.5416666666666666, + "acc_stderr": 0.03398110890294636, + "acc_norm": 0.5416666666666666, + "acc_norm_stderr": 0.03398110890294636 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.8333333333333334, + "acc_stderr": 0.02615686752393104, + "acc_norm": 0.8333333333333334, + "acc_norm_stderr": 0.02615686752393104 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.8059071729957806, + "acc_stderr": 0.025744902532290916, + "acc_norm": 0.8059071729957806, + "acc_norm_stderr": 0.025744902532290916 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.6905829596412556, + "acc_stderr": 0.031024411740572213, + "acc_norm": 0.6905829596412556, + "acc_norm_stderr": 0.031024411740572213 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.8244274809160306, + "acc_stderr": 0.03336820338476074, + "acc_norm": 0.8244274809160306, + "acc_norm_stderr": 0.03336820338476074 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.8016528925619835, + "acc_stderr": 0.03640118271990947, + "acc_norm": 0.8016528925619835, + "acc_norm_stderr": 0.03640118271990947 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.7962962962962963, + "acc_stderr": 0.03893542518824847, + "acc_norm": 0.7962962962962963, + "acc_norm_stderr": 0.03893542518824847 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.7852760736196319, + "acc_stderr": 0.03226219377286775, + "acc_norm": 0.7852760736196319, + "acc_norm_stderr": 0.03226219377286775 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.42857142857142855, + "acc_stderr": 0.04697113923010212, + "acc_norm": 0.42857142857142855, + "acc_norm_stderr": 0.04697113923010212 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.7572815533980582, + "acc_stderr": 0.04245022486384495, + "acc_norm": 0.7572815533980582, + "acc_norm_stderr": 0.04245022486384495 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.8760683760683761, + "acc_stderr": 0.021586494001281376, + "acc_norm": 0.8760683760683761, + "acc_norm_stderr": 0.021586494001281376 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.71, + "acc_stderr": 0.045604802157206845, + "acc_norm": 0.71, + "acc_norm_stderr": 0.045604802157206845 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.8352490421455939, + "acc_stderr": 0.013265346261323797, + "acc_norm": 0.8352490421455939, + "acc_norm_stderr": 0.013265346261323797 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.7398843930635838, + "acc_stderr": 0.023618678310069356, + "acc_norm": 0.7398843930635838, + "acc_norm_stderr": 0.023618678310069356 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.4759776536312849, + "acc_stderr": 0.016703190189300186, + "acc_norm": 0.4759776536312849, + "acc_norm_stderr": 0.016703190189300186 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.7254901960784313, + "acc_stderr": 0.025553169991826528, + "acc_norm": 0.7254901960784313, + "acc_norm_stderr": 0.025553169991826528 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.7170418006430869, + "acc_stderr": 0.02558306248998481, + "acc_norm": 0.7170418006430869, + "acc_norm_stderr": 0.02558306248998481 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.7469135802469136, + "acc_stderr": 
0.024191808600712995, + "acc_norm": 0.7469135802469136, + "acc_norm_stderr": 0.024191808600712995 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.48936170212765956, + "acc_stderr": 0.029820747191422473, + "acc_norm": 0.48936170212765956, + "acc_norm_stderr": 0.029820747191422473 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.46936114732724904, + "acc_stderr": 0.012746237711716634, + "acc_norm": 0.46936114732724904, + "acc_norm_stderr": 0.012746237711716634 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.6838235294117647, + "acc_stderr": 0.02824568739146293, + "acc_norm": 0.6838235294117647, + "acc_norm_stderr": 0.02824568739146293 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.673202614379085, + "acc_stderr": 0.018975427920507208, + "acc_norm": 0.673202614379085, + "acc_norm_stderr": 0.018975427920507208 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.6909090909090909, + "acc_stderr": 0.044262946482000985, + "acc_norm": 0.6909090909090909, + "acc_norm_stderr": 0.044262946482000985 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.726530612244898, + "acc_stderr": 0.028535560337128438, + "acc_norm": 0.726530612244898, + "acc_norm_stderr": 0.028535560337128438 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.835820895522388, + "acc_stderr": 0.026193923544454115, + "acc_norm": 0.835820895522388, + "acc_norm_stderr": 0.026193923544454115 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.86, + "acc_stderr": 0.0348735088019777, + "acc_norm": 0.86, + "acc_norm_stderr": 0.0348735088019777 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.5481927710843374, + "acc_stderr": 0.03874371556587953, + "acc_norm": 0.5481927710843374, + "acc_norm_stderr": 0.03874371556587953 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.8245614035087719, + "acc_stderr": 0.029170885500727665, + "acc_norm": 0.8245614035087719, + "acc_norm_stderr": 0.029170885500727665 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.554467564259486, + "mc1_stderr": 0.017399335280140343, + "mc2": 0.695178465897982, + "mc2_stderr": 0.015007650690745592 + }, + "harness|winogrande|5": { + "acc": 0.824782951854775, + "acc_stderr": 0.010684179227706177 + }, + "harness|gsm8k|5": { + "acc": 0.7073540561031084, + "acc_stderr": 0.01253233436824289 + }, + "all": { + "acc": 0.657982142300908, + "acc_stderr": 0.03197829574357859, + "acc_norm": 0.6577587599136167, + "acc_norm_stderr": 0.032639601635152275, + "mc1": 0.554467564259486, + "mc1_stderr": 0.017399335280140343, + "mc2": 0.695178465897982, + "mc2_stderr": 0.015007650690745592 + } + }, + "versions": { + "all": 0, + "harness|arc:challenge|25": 0, + "harness|gsm8k|5": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + 
"harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "harness|winogrande|5": 0 + }, + "config_tasks": { + "harness|arc:challenge": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + 
"harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|arc:challenge|25": { + "hashes": { + "hash_examples": "17b0cae357c0259e", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "9bcd0d1d37471713", + "hash_cont_tokens": "289aa98c400841d8" + }, + "truncated": 0, + "non_truncated": 1172, + "padded": 4670, + "non_padded": 17, + "effective_few_shots": 25.0, + "num_truncated_few_shots": 0 + }, + "harness|hellaswag|10": { + "hashes": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "80b8c6d79740318e", + "hash_cont_tokens": "ac460260c3e6efc9" + }, + "truncated": 0, + "non_truncated": 10042, + "padded": 40101, + "non_padded": 67, + "effective_few_shots": 10.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hashes": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "b813d36287c6556c", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, 
+ "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-anatomy|5": { + "hashes": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "09dc2380497f7a47", + "hash_cont_tokens": "a52a4f60d98cbe5c" + }, + "truncated": 0, + "non_truncated": 135, + "padded": 540, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-astronomy|5": { + "hashes": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "68ca3220b0fdd1f3", + "hash_cont_tokens": "10f7d8eeba97841d" + }, + "truncated": 0, + "non_truncated": 152, + "padded": 608, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-business_ethics|5": { + "hashes": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "bd14ef1320de241e", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hashes": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "d96186ab98017c43", + "hash_cont_tokens": "edef9975ba9165b5" + }, + "truncated": 0, + "non_truncated": 265, + "padded": 1060, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_biology|5": { + "hashes": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "424136b34e95b200", + "hash_cont_tokens": "0aa103ec6602280b" + }, + "truncated": 0, + "non_truncated": 144, + "padded": 576, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_chemistry|5": { + "hashes": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "8dd8b80e336bbe54", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_computer_science|5": { + "hashes": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "145d4cef8ca2261d", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_mathematics|5": { + "hashes": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "561995d32d2b25c4", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_medicine|5": { + "hashes": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "6a258a9d4418599c", + "hash_cont_tokens": "1979021dbc698754" + }, + "truncated": 0, + "non_truncated": 173, + "padded": 692, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + 
"harness|hendrycksTest-college_physics|5": { + "hashes": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "fa5e0d5b5f97b66a", + "hash_cont_tokens": "7cf7fe2bab00acbd" + }, + "truncated": 0, + "non_truncated": 102, + "padded": 408, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-computer_security|5": { + "hashes": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "07d27397edfae492", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hashes": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "da5e6c3c8eb17da6", + "hash_cont_tokens": "903f64eed2b0d217" + }, + "truncated": 0, + "non_truncated": 235, + "padded": 940, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-econometrics|5": { + "hashes": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "f6ba8e358bdb523e", + "hash_cont_tokens": "721ae6c5302c4bf2" + }, + "truncated": 0, + "non_truncated": 114, + "padded": 456, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hashes": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "b2459da4c5ca8590", + "hash_cont_tokens": "15a738960ed3e587" + }, + "truncated": 0, + "non_truncated": 145, + "padded": 575, + "non_padded": 5, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hashes": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "0b969d9ad706a13a", + "hash_cont_tokens": "c96470462fc71683" + }, + "truncated": 0, + "non_truncated": 378, + "padded": 1512, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-formal_logic|5": { + "hashes": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "02bc3eb5f90da86e", + "hash_cont_tokens": "0e1ce025c9d6ee7e" + }, + "truncated": 0, + "non_truncated": 126, + "padded": 504, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-global_facts|5": { + "hashes": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "3d5106918bcbeb43", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_biology|5": { + "hashes": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "7b089392db2dabbd", + "hash_cont_tokens": "e34d57f7d3c4ca16" + }, + "truncated": 0, + "non_truncated": 310, + "padded": 1240, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hashes": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + 
"hash_input_tokens": "ba90b2ffed1c067d", + "hash_cont_tokens": "e8482d44df4b3740" + }, + "truncated": 0, + "non_truncated": 203, + "padded": 812, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hashes": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "60eeec309ef0717f", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hashes": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "5e5e8bf3808e0ead", + "hash_cont_tokens": "d63e679a49418339" + }, + "truncated": 0, + "non_truncated": 165, + "padded": 656, + "non_padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_geography|5": { + "hashes": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "4da9b741d4e7ea78", + "hash_cont_tokens": "d78483e286d06f1a" + }, + "truncated": 0, + "non_truncated": 198, + "padded": 792, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hashes": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "acb4bc872ac86ed7", + "hash_cont_tokens": "691cdff71ff5fe57" + }, + "truncated": 0, + "non_truncated": 193, + "padded": 772, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hashes": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "840fc6403eb69ab0", + "hash_cont_tokens": "d5ad4c5bdca967ad" + }, + "truncated": 0, + "non_truncated": 390, + "padded": 1560, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hashes": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "3629a7f2cd17faeb", + "hash_cont_tokens": "8f631ca5687dd0d4" + }, + "truncated": 0, + "non_truncated": 270, + "padded": 1080, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hashes": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "6846f684260e3997", + "hash_cont_tokens": "7321048a28451473" + }, + "truncated": 0, + "non_truncated": 238, + "padded": 952, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_physics|5": { + "hashes": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "85aee25d6bdad94a", + "hash_cont_tokens": "bb137581f269861c" + }, + "truncated": 0, + "non_truncated": 151, + "padded": 604, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hashes": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "290b66d6d666a35f", + "hash_cont_tokens": 
"b455cab2675bd863" + }, + "truncated": 0, + "non_truncated": 545, + "padded": 2180, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hashes": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "a77a7668b437bc82", + "hash_cont_tokens": "1b3196fec7e58037" + }, + "truncated": 0, + "non_truncated": 216, + "padded": 864, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hashes": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "63548c7fa9ba7a78", + "hash_cont_tokens": "a331dedc2aa01b3e" + }, + "truncated": 0, + "non_truncated": 204, + "padded": 816, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hashes": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "83c5da18bfa50812", + "hash_cont_tokens": "d0fbe030b8c8c2bf" + }, + "truncated": 0, + "non_truncated": 237, + "padded": 948, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_aging|5": { + "hashes": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "bebbd11f22006685", + "hash_cont_tokens": "1dd29c3755494850" + }, + "truncated": 0, + "non_truncated": 223, + "padded": 892, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_sexuality|5": { + "hashes": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "7b85ee9b8ee54f4f", + "hash_cont_tokens": "c85573f663c10691" + }, + "truncated": 0, + "non_truncated": 131, + "padded": 524, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-international_law|5": { + "hashes": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "7bfc55ab7065943e", + "hash_cont_tokens": "d263804ba918154f" + }, + "truncated": 0, + "non_truncated": 121, + "padded": 484, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-jurisprudence|5": { + "hashes": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "69573f1675e053c6", + "hash_cont_tokens": "581986691a84ece8" + }, + "truncated": 0, + "non_truncated": 108, + "padded": 432, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hashes": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "552324ef20094bdc", + "hash_cont_tokens": "55a858b28bbda458" + }, + "truncated": 0, + "non_truncated": 163, + "padded": 652, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-machine_learning|5": { + "hashes": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "96449357a7318905", + "hash_cont_tokens": "e99d3d3efd4ac7a3" + }, + "truncated": 0, + "non_truncated": 112, + "padded": 448, + "non_padded": 0, + "effective_few_shots": 5.0, + 
"num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-management|5": { + "hashes": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "3b849249168e3b88", + "hash_cont_tokens": "13d9dc56bca34726" + }, + "truncated": 0, + "non_truncated": 103, + "padded": 412, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-marketing|5": { + "hashes": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "af0e186f2756b70d", + "hash_cont_tokens": "2700ea26933916a2" + }, + "truncated": 0, + "non_truncated": 234, + "padded": 936, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-medical_genetics|5": { + "hashes": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "9f6a6de16509b6d9", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-miscellaneous|5": { + "hashes": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "9194406d589f7c10", + "hash_cont_tokens": "7bf4341c79587250" + }, + "truncated": 0, + "non_truncated": 783, + "padded": 3132, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_disputes|5": { + "hashes": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "769486efc74d9f8e", + "hash_cont_tokens": "38a48e9de6976f00" + }, + "truncated": 0, + "non_truncated": 346, + "padded": 1384, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hashes": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "a90fd4dd90959dad", + "hash_cont_tokens": "761c4dc187689d89" + }, + "truncated": 0, + "non_truncated": 895, + "padded": 3580, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-nutrition|5": { + "hashes": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "1a3b843e66efd29b", + "hash_cont_tokens": "65005bd7d6f6012a" + }, + "truncated": 0, + "non_truncated": 306, + "padded": 1224, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-philosophy|5": { + "hashes": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "09820001a3d00013", + "hash_cont_tokens": "0b47934fb6314dec" + }, + "truncated": 0, + "non_truncated": 311, + "padded": 1244, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-prehistory|5": { + "hashes": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "7c4ec364ce2768c7", + "hash_cont_tokens": "3f20acd855ee0a29" + }, + "truncated": 0, + "non_truncated": 324, + "padded": 1296, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_accounting|5": { + "hashes": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + 
"hash_input_tokens": "ced0534574d0ae3f", + "hash_cont_tokens": "8f122ba881355d4b" + }, + "truncated": 0, + "non_truncated": 282, + "padded": 1128, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_law|5": { + "hashes": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "bcbdbbde22ec73e3", + "hash_cont_tokens": "90d5df417c4d3fd3" + }, + "truncated": 0, + "non_truncated": 1534, + "padded": 6136, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_medicine|5": { + "hashes": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "c54d753563114d45", + "hash_cont_tokens": "4a2d2988884f7f70" + }, + "truncated": 0, + "non_truncated": 272, + "padded": 1088, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_psychology|5": { + "hashes": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "b75dc55c0e32fa52", + "hash_cont_tokens": "e0a952cb8a9c81de" + }, + "truncated": 0, + "non_truncated": 612, + "padded": 2448, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-public_relations|5": { + "hashes": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "5ccdc8ec8db99622", + "hash_cont_tokens": "1fa77a8dff3922b8" + }, + "truncated": 0, + "non_truncated": 110, + "padded": 440, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-security_studies|5": { + "hashes": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "ca8497342e5b1d57", + "hash_cont_tokens": "81fc9cb3cbdd52db" + }, + "truncated": 0, + "non_truncated": 245, + "padded": 980, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-sociology|5": { + "hashes": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "069c76424fbd3dab", + "hash_cont_tokens": "2a0493252ed2cf43" + }, + "truncated": 0, + "non_truncated": 201, + "padded": 804, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hashes": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "a7e393a626169576", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-virology|5": { + "hashes": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "bf99dc973e3a650d", + "hash_cont_tokens": "5ab892d003b00c98" + }, + "truncated": 0, + "non_truncated": 166, + "padded": 664, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-world_religions|5": { + "hashes": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "1761cfaf21797065", + "hash_cont_tokens": "15a5e5dbdfbb8568" + }, + "truncated": 0, + "non_truncated": 171, + "padded": 684, + 
"non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|truthfulqa:mc|0": { + "hashes": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "298b43914bbdf4ca", + "hash_cont_tokens": "5a8d4bb398b1c3c0" + }, + "truncated": 0, + "non_truncated": 817, + "padded": 9996, + "non_padded": 0, + "effective_few_shots": 0.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "31aa3477d959f771", + "hash_cont_tokens": "618558fb93c0f288" + }, + "truncated": 0, + "non_truncated": 1267, + "padded": 2534, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "6af0ae8cfe684f50", + "hash_cont_tokens": "d14ac7465ad5d595" + }, + "truncated": 0, + "non_truncated": 1319, + "padded": 0, + "non_padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "3b7fa57a057f9415", + "hash_full_prompts": "63615fc50fc9417c", + "hash_input_tokens": "9c04e828ae29cacc", + "hash_cont_tokens": "988000a99c1aa582" + }, + "truncated": 0, + "non_truncated": 28659, + "padded": 113460, + "non_padded": 1412, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/GreenNode/Merged-DPO-7B/results_2023-12-13T14-00-25.287195.json b/eval-results/GreenNode/Merged-DPO-7B/results_2023-12-13T14-00-25.287195.json new file mode 100644 index 0000000000000000000000000000000000000000..59f7a3e734569a157b6235c3c895d52dd63cadeb --- /dev/null +++ b/eval-results/GreenNode/Merged-DPO-7B/results_2023-12-13T14-00-25.287195.json @@ -0,0 +1,1409 @@ +{ + "config_general": { + "lighteval_sha": "0e4607eff593f6f842aeaa0e5fa6760f58b9d1e9", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "", + "start_time": 99852.001723124, + "end_time": 106984.380177586, + "total_evaluation_time_secondes": "7132.378454461999", + "model_name": "GreenNode/Merged-DPO-7B", + "model_sha": "1c0e61c7da6839fe4cc34433b899c5416fadbe18", + "model_dtype": "torch.float16", + "model_size": "13.99 GB" + }, + "results": { + "harness|arc:challenge|25": { + "acc": 0.6604095563139932, + "acc_stderr": 0.013839039762820164, + "acc_norm": 0.689419795221843, + "acc_norm_stderr": 0.013522292098053059 + }, + "harness|hellaswag|10": { + "acc": 0.7271459868552081, + "acc_stderr": 0.00444516099761836, + "acc_norm": 0.8775144393547102, + "acc_norm_stderr": 0.0032717574453291595 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.31, + "acc_stderr": 0.04648231987117316, + "acc_norm": 0.31, + "acc_norm_stderr": 0.04648231987117316 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.5481481481481482, + "acc_stderr": 0.042992689054808644, + "acc_norm": 0.5481481481481482, + "acc_norm_stderr": 0.042992689054808644 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.618421052631579, + "acc_stderr": 0.039531733777491945, + "acc_norm": 0.618421052631579, + "acc_norm_stderr": 0.039531733777491945 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.5, + "acc_stderr": 0.050251890762960605, + "acc_norm": 0.5, + "acc_norm_stderr": 0.050251890762960605 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 
0.6188679245283019, + "acc_stderr": 0.029890609686286634, + "acc_norm": 0.6188679245283019, + "acc_norm_stderr": 0.029890609686286634 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.6527777777777778, + "acc_stderr": 0.039812405437178615, + "acc_norm": 0.6527777777777778, + "acc_norm_stderr": 0.039812405437178615 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.45, + "acc_stderr": 0.05, + "acc_norm": 0.45, + "acc_norm_stderr": 0.05 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.38, + "acc_stderr": 0.04878317312145633, + "acc_norm": 0.38, + "acc_norm_stderr": 0.04878317312145633 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.25, + "acc_stderr": 0.04351941398892446, + "acc_norm": 0.25, + "acc_norm_stderr": 0.04351941398892446 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.5375722543352601, + "acc_stderr": 0.03801685104524458, + "acc_norm": 0.5375722543352601, + "acc_norm_stderr": 0.03801685104524458 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.3235294117647059, + "acc_stderr": 0.046550104113196177, + "acc_norm": 0.3235294117647059, + "acc_norm_stderr": 0.046550104113196177 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.66, + "acc_stderr": 0.04760952285695237, + "acc_norm": 0.66, + "acc_norm_stderr": 0.04760952285695237 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.4723404255319149, + "acc_stderr": 0.03263597118409769, + "acc_norm": 0.4723404255319149, + "acc_norm_stderr": 0.03263597118409769 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.4298245614035088, + "acc_stderr": 0.046570472605949625, + "acc_norm": 0.4298245614035088, + "acc_norm_stderr": 0.046570472605949625 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.47586206896551725, + "acc_stderr": 0.041618085035015295, + "acc_norm": 0.47586206896551725, + "acc_norm_stderr": 0.041618085035015295 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.4074074074074074, + "acc_stderr": 0.02530590624159063, + "acc_norm": 0.4074074074074074, + "acc_norm_stderr": 0.02530590624159063 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.36507936507936506, + "acc_stderr": 0.043062412591271526, + "acc_norm": 0.36507936507936506, + "acc_norm_stderr": 0.043062412591271526 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.36, + "acc_stderr": 0.04824181513244218, + "acc_norm": 0.36, + "acc_norm_stderr": 0.04824181513244218 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.6290322580645161, + "acc_stderr": 0.027480541887953593, + "acc_norm": 0.6290322580645161, + "acc_norm_stderr": 0.027480541887953593 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.4088669950738916, + "acc_stderr": 0.034590588158832314, + "acc_norm": 0.4088669950738916, + "acc_norm_stderr": 0.034590588158832314 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.6, + "acc_stderr": 0.049236596391733084, + "acc_norm": 0.6, + "acc_norm_stderr": 0.049236596391733084 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.6909090909090909, + "acc_stderr": 0.036085410115739666, + "acc_norm": 0.6909090909090909, + "acc_norm_stderr": 0.036085410115739666 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.7424242424242424, + "acc_stderr": 0.03115626951964684, + "acc_norm": 0.7424242424242424, + "acc_norm_stderr": 0.03115626951964684 + }, + 
"harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.8082901554404145, + "acc_stderr": 0.02840895362624529, + "acc_norm": 0.8082901554404145, + "acc_norm_stderr": 0.02840895362624529 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.6, + "acc_stderr": 0.02483881198803316, + "acc_norm": 0.6, + "acc_norm_stderr": 0.02483881198803316 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.337037037037037, + "acc_stderr": 0.028820884666253252, + "acc_norm": 0.337037037037037, + "acc_norm_stderr": 0.028820884666253252 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.5672268907563025, + "acc_stderr": 0.032183581077426124, + "acc_norm": 0.5672268907563025, + "acc_norm_stderr": 0.032183581077426124 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.3708609271523179, + "acc_stderr": 0.03943966699183629, + "acc_norm": 0.3708609271523179, + "acc_norm_stderr": 0.03943966699183629 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.7724770642201835, + "acc_stderr": 0.017974463578776502, + "acc_norm": 0.7724770642201835, + "acc_norm_stderr": 0.017974463578776502 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.5277777777777778, + "acc_stderr": 0.0340470532865388, + "acc_norm": 0.5277777777777778, + "acc_norm_stderr": 0.0340470532865388 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.7205882352941176, + "acc_stderr": 0.031493281045079556, + "acc_norm": 0.7205882352941176, + "acc_norm_stderr": 0.031493281045079556 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.7088607594936709, + "acc_stderr": 0.029571601065753374, + "acc_norm": 0.7088607594936709, + "acc_norm_stderr": 0.029571601065753374 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.6053811659192825, + "acc_stderr": 0.03280400504755291, + "acc_norm": 0.6053811659192825, + "acc_norm_stderr": 0.03280400504755291 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.5572519083969466, + "acc_stderr": 0.043564472026650695, + "acc_norm": 0.5572519083969466, + "acc_norm_stderr": 0.043564472026650695 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.628099173553719, + "acc_stderr": 0.04412015806624504, + "acc_norm": 0.628099173553719, + "acc_norm_stderr": 0.04412015806624504 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.5740740740740741, + "acc_stderr": 0.047803436269367894, + "acc_norm": 0.5740740740740741, + "acc_norm_stderr": 0.047803436269367894 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.6503067484662577, + "acc_stderr": 0.03746668325470021, + "acc_norm": 0.6503067484662577, + "acc_norm_stderr": 0.03746668325470021 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.375, + "acc_stderr": 0.04595091388086298, + "acc_norm": 0.375, + "acc_norm_stderr": 0.04595091388086298 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.7184466019417476, + "acc_stderr": 0.04453254836326469, + "acc_norm": 0.7184466019417476, + "acc_norm_stderr": 0.04453254836326469 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.7863247863247863, + "acc_stderr": 0.026853450377009182, + "acc_norm": 0.7863247863247863, + "acc_norm_stderr": 0.026853450377009182 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.59, + "acc_stderr": 0.04943110704237101, + "acc_norm": 0.59, + "acc_norm_stderr": 0.04943110704237101 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.7662835249042146, + 
"acc_stderr": 0.015133383278988827, + "acc_norm": 0.7662835249042146, + "acc_norm_stderr": 0.015133383278988827 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.5404624277456648, + "acc_stderr": 0.026830805998952236, + "acc_norm": 0.5404624277456648, + "acc_norm_stderr": 0.026830805998952236 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.3653631284916201, + "acc_stderr": 0.01610483388014229, + "acc_norm": 0.3653631284916201, + "acc_norm_stderr": 0.01610483388014229 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.5163398692810458, + "acc_stderr": 0.028614624752805434, + "acc_norm": 0.5163398692810458, + "acc_norm_stderr": 0.028614624752805434 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.6141479099678456, + "acc_stderr": 0.027648149599751464, + "acc_norm": 0.6141479099678456, + "acc_norm_stderr": 0.027648149599751464 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.6512345679012346, + "acc_stderr": 0.02651759772446501, + "acc_norm": 0.6512345679012346, + "acc_norm_stderr": 0.02651759772446501 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.4148936170212766, + "acc_stderr": 0.0293922365846125, + "acc_norm": 0.4148936170212766, + "acc_norm_stderr": 0.0293922365846125 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.4067796610169492, + "acc_stderr": 0.012546325596569536, + "acc_norm": 0.4067796610169492, + "acc_norm_stderr": 0.012546325596569536 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.5661764705882353, + "acc_stderr": 0.030105636570016636, + "acc_norm": 0.5661764705882353, + "acc_norm_stderr": 0.030105636570016636 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.5408496732026143, + "acc_stderr": 0.020160213617222516, + "acc_norm": 0.5408496732026143, + "acc_norm_stderr": 0.020160213617222516 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.5909090909090909, + "acc_stderr": 0.04709306978661895, + "acc_norm": 0.5909090909090909, + "acc_norm_stderr": 0.04709306978661895 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.5591836734693878, + "acc_stderr": 0.03178419114175363, + "acc_norm": 0.5591836734693878, + "acc_norm_stderr": 0.03178419114175363 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.7711442786069652, + "acc_stderr": 0.029705284056772432, + "acc_norm": 0.7711442786069652, + "acc_norm_stderr": 0.029705284056772432 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.75, + "acc_stderr": 0.04351941398892446, + "acc_norm": 0.75, + "acc_norm_stderr": 0.04351941398892446 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.41566265060240964, + "acc_stderr": 0.038367221765980515, + "acc_norm": 0.41566265060240964, + "acc_norm_stderr": 0.038367221765980515 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.7485380116959064, + "acc_stderr": 0.033275044238468436, + "acc_norm": 0.7485380116959064, + "acc_norm_stderr": 0.033275044238468436 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.5899632802937577, + "mc1_stderr": 0.01721784471744932, + "mc2": 0.7276047803006407, + "mc2_stderr": 0.014645147930666262 + }, + "harness|winogrande|5": { + "acc": 0.7837411207576953, + "acc_stderr": 0.011570614861409352 + }, + "harness|gsm8k|5": { + "acc": 0.4518574677786202, + "acc_stderr": 0.013708494995677641 + }, + "all": { + "acc": 0.560167032409899, + "acc_stderr": 0.034083462253007915, + "acc_norm": 0.5612537132182182, + "acc_norm_stderr": 0.034785117565412534, + "mc1": 0.5899632802937577, + 
"mc1_stderr": 0.01721784471744932, + "mc2": 0.7276047803006407, + "mc2_stderr": 0.014645147930666262 + } + }, + "versions": { + "all": 0, + "harness|arc:challenge|25": 0, + "harness|gsm8k|5": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "harness|winogrande|5": 0 + }, + "config_tasks": { + "harness|arc:challenge": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + 
"harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + 
"harness|arc:challenge|25": { + "hashes": { + "hash_examples": "17b0cae357c0259e", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "9bcd0d1d37471713", + "hash_cont_tokens": "289aa98c400841d8" + }, + "truncated": 0, + "non_truncated": 1172, + "padded": 4670, + "non_padded": 17, + "effective_few_shots": 25.0, + "num_truncated_few_shots": 0 + }, + "harness|hellaswag|10": { + "hashes": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "80b8c6d79740318e", + "hash_cont_tokens": "ac460260c3e6efc9" + }, + "truncated": 0, + "non_truncated": 10042, + "padded": 40101, + "non_padded": 67, + "effective_few_shots": 10.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hashes": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "b813d36287c6556c", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-anatomy|5": { + "hashes": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "09dc2380497f7a47", + "hash_cont_tokens": "a52a4f60d98cbe5c" + }, + "truncated": 0, + "non_truncated": 135, + "padded": 540, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-astronomy|5": { + "hashes": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "68ca3220b0fdd1f3", + "hash_cont_tokens": "10f7d8eeba97841d" + }, + "truncated": 0, + "non_truncated": 152, + "padded": 608, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-business_ethics|5": { + "hashes": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "bd14ef1320de241e", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hashes": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "d96186ab98017c43", + "hash_cont_tokens": "edef9975ba9165b5" + }, + "truncated": 0, + "non_truncated": 265, + "padded": 1060, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_biology|5": { + "hashes": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "424136b34e95b200", + "hash_cont_tokens": "0aa103ec6602280b" + }, + "truncated": 0, + "non_truncated": 144, + "padded": 576, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_chemistry|5": { + "hashes": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "8dd8b80e336bbe54", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_computer_science|5": { + "hashes": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "145d4cef8ca2261d", + 
"hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_mathematics|5": { + "hashes": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "561995d32d2b25c4", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_medicine|5": { + "hashes": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "6a258a9d4418599c", + "hash_cont_tokens": "1979021dbc698754" + }, + "truncated": 0, + "non_truncated": 173, + "padded": 692, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_physics|5": { + "hashes": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "fa5e0d5b5f97b66a", + "hash_cont_tokens": "7cf7fe2bab00acbd" + }, + "truncated": 0, + "non_truncated": 102, + "padded": 408, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-computer_security|5": { + "hashes": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "07d27397edfae492", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hashes": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "da5e6c3c8eb17da6", + "hash_cont_tokens": "903f64eed2b0d217" + }, + "truncated": 0, + "non_truncated": 235, + "padded": 940, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-econometrics|5": { + "hashes": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "f6ba8e358bdb523e", + "hash_cont_tokens": "721ae6c5302c4bf2" + }, + "truncated": 0, + "non_truncated": 114, + "padded": 456, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hashes": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "b2459da4c5ca8590", + "hash_cont_tokens": "15a738960ed3e587" + }, + "truncated": 0, + "non_truncated": 145, + "padded": 575, + "non_padded": 5, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hashes": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "0b969d9ad706a13a", + "hash_cont_tokens": "c96470462fc71683" + }, + "truncated": 0, + "non_truncated": 378, + "padded": 1512, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-formal_logic|5": { + "hashes": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "02bc3eb5f90da86e", + "hash_cont_tokens": "0e1ce025c9d6ee7e" + }, + "truncated": 0, + "non_truncated": 126, + "padded": 504, + "non_padded": 0, + "effective_few_shots": 5.0, 
+ "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-global_facts|5": { + "hashes": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "3d5106918bcbeb43", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_biology|5": { + "hashes": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "7b089392db2dabbd", + "hash_cont_tokens": "e34d57f7d3c4ca16" + }, + "truncated": 0, + "non_truncated": 310, + "padded": 1240, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hashes": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "ba90b2ffed1c067d", + "hash_cont_tokens": "e8482d44df4b3740" + }, + "truncated": 0, + "non_truncated": 203, + "padded": 812, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hashes": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "60eeec309ef0717f", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hashes": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "5e5e8bf3808e0ead", + "hash_cont_tokens": "d63e679a49418339" + }, + "truncated": 0, + "non_truncated": 165, + "padded": 656, + "non_padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_geography|5": { + "hashes": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "4da9b741d4e7ea78", + "hash_cont_tokens": "d78483e286d06f1a" + }, + "truncated": 0, + "non_truncated": 198, + "padded": 792, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hashes": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "acb4bc872ac86ed7", + "hash_cont_tokens": "691cdff71ff5fe57" + }, + "truncated": 0, + "non_truncated": 193, + "padded": 772, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hashes": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "840fc6403eb69ab0", + "hash_cont_tokens": "d5ad4c5bdca967ad" + }, + "truncated": 0, + "non_truncated": 390, + "padded": 1560, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hashes": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "3629a7f2cd17faeb", + "hash_cont_tokens": "8f631ca5687dd0d4" + }, + "truncated": 0, + "non_truncated": 270, + "padded": 1080, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + 
"harness|hendrycksTest-high_school_microeconomics|5": { + "hashes": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "6846f684260e3997", + "hash_cont_tokens": "7321048a28451473" + }, + "truncated": 0, + "non_truncated": 238, + "padded": 952, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_physics|5": { + "hashes": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "85aee25d6bdad94a", + "hash_cont_tokens": "bb137581f269861c" + }, + "truncated": 0, + "non_truncated": 151, + "padded": 604, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hashes": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "290b66d6d666a35f", + "hash_cont_tokens": "b455cab2675bd863" + }, + "truncated": 0, + "non_truncated": 545, + "padded": 2180, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hashes": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "a77a7668b437bc82", + "hash_cont_tokens": "1b3196fec7e58037" + }, + "truncated": 0, + "non_truncated": 216, + "padded": 864, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hashes": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "63548c7fa9ba7a78", + "hash_cont_tokens": "a331dedc2aa01b3e" + }, + "truncated": 0, + "non_truncated": 204, + "padded": 816, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hashes": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "83c5da18bfa50812", + "hash_cont_tokens": "d0fbe030b8c8c2bf" + }, + "truncated": 0, + "non_truncated": 237, + "padded": 948, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_aging|5": { + "hashes": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "bebbd11f22006685", + "hash_cont_tokens": "1dd29c3755494850" + }, + "truncated": 0, + "non_truncated": 223, + "padded": 892, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_sexuality|5": { + "hashes": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "7b85ee9b8ee54f4f", + "hash_cont_tokens": "c85573f663c10691" + }, + "truncated": 0, + "non_truncated": 131, + "padded": 524, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-international_law|5": { + "hashes": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "7bfc55ab7065943e", + "hash_cont_tokens": "d263804ba918154f" + }, + "truncated": 0, + "non_truncated": 121, + "padded": 484, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-jurisprudence|5": { + "hashes": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": 
"0f89ee3fe03d6a21", + "hash_input_tokens": "69573f1675e053c6", + "hash_cont_tokens": "581986691a84ece8" + }, + "truncated": 0, + "non_truncated": 108, + "padded": 432, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hashes": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "552324ef20094bdc", + "hash_cont_tokens": "55a858b28bbda458" + }, + "truncated": 0, + "non_truncated": 163, + "padded": 652, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-machine_learning|5": { + "hashes": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "96449357a7318905", + "hash_cont_tokens": "e99d3d3efd4ac7a3" + }, + "truncated": 0, + "non_truncated": 112, + "padded": 448, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-management|5": { + "hashes": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "3b849249168e3b88", + "hash_cont_tokens": "13d9dc56bca34726" + }, + "truncated": 0, + "non_truncated": 103, + "padded": 412, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-marketing|5": { + "hashes": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "af0e186f2756b70d", + "hash_cont_tokens": "2700ea26933916a2" + }, + "truncated": 0, + "non_truncated": 234, + "padded": 936, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-medical_genetics|5": { + "hashes": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "9f6a6de16509b6d9", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-miscellaneous|5": { + "hashes": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "9194406d589f7c10", + "hash_cont_tokens": "7bf4341c79587250" + }, + "truncated": 0, + "non_truncated": 783, + "padded": 3132, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_disputes|5": { + "hashes": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "769486efc74d9f8e", + "hash_cont_tokens": "38a48e9de6976f00" + }, + "truncated": 0, + "non_truncated": 346, + "padded": 1384, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hashes": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "a90fd4dd90959dad", + "hash_cont_tokens": "761c4dc187689d89" + }, + "truncated": 0, + "non_truncated": 895, + "padded": 3580, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-nutrition|5": { + "hashes": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "1a3b843e66efd29b", + "hash_cont_tokens": "65005bd7d6f6012a" + }, + "truncated": 0, + "non_truncated": 306, + "padded": 1224, + 
"non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-philosophy|5": { + "hashes": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "09820001a3d00013", + "hash_cont_tokens": "0b47934fb6314dec" + }, + "truncated": 0, + "non_truncated": 311, + "padded": 1244, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-prehistory|5": { + "hashes": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "7c4ec364ce2768c7", + "hash_cont_tokens": "3f20acd855ee0a29" + }, + "truncated": 0, + "non_truncated": 324, + "padded": 1296, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_accounting|5": { + "hashes": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "ced0534574d0ae3f", + "hash_cont_tokens": "8f122ba881355d4b" + }, + "truncated": 0, + "non_truncated": 282, + "padded": 1128, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_law|5": { + "hashes": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "bcbdbbde22ec73e3", + "hash_cont_tokens": "90d5df417c4d3fd3" + }, + "truncated": 0, + "non_truncated": 1534, + "padded": 6136, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_medicine|5": { + "hashes": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "c54d753563114d45", + "hash_cont_tokens": "4a2d2988884f7f70" + }, + "truncated": 0, + "non_truncated": 272, + "padded": 1088, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_psychology|5": { + "hashes": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "b75dc55c0e32fa52", + "hash_cont_tokens": "e0a952cb8a9c81de" + }, + "truncated": 0, + "non_truncated": 612, + "padded": 2448, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-public_relations|5": { + "hashes": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "5ccdc8ec8db99622", + "hash_cont_tokens": "1fa77a8dff3922b8" + }, + "truncated": 0, + "non_truncated": 110, + "padded": 440, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-security_studies|5": { + "hashes": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "ca8497342e5b1d57", + "hash_cont_tokens": "81fc9cb3cbdd52db" + }, + "truncated": 0, + "non_truncated": 245, + "padded": 980, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-sociology|5": { + "hashes": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "069c76424fbd3dab", + "hash_cont_tokens": "2a0493252ed2cf43" + }, + "truncated": 0, + "non_truncated": 201, + "padded": 804, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hashes": { + 
"hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "a7e393a626169576", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-virology|5": { + "hashes": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "bf99dc973e3a650d", + "hash_cont_tokens": "5ab892d003b00c98" + }, + "truncated": 0, + "non_truncated": 166, + "padded": 664, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-world_religions|5": { + "hashes": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "1761cfaf21797065", + "hash_cont_tokens": "15a5e5dbdfbb8568" + }, + "truncated": 0, + "non_truncated": 171, + "padded": 684, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|truthfulqa:mc|0": { + "hashes": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "298b43914bbdf4ca", + "hash_cont_tokens": "5a8d4bb398b1c3c0" + }, + "truncated": 0, + "non_truncated": 817, + "padded": 9996, + "non_padded": 0, + "effective_few_shots": 0.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "31aa3477d959f771", + "hash_cont_tokens": "618558fb93c0f288" + }, + "truncated": 0, + "non_truncated": 1267, + "padded": 2534, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "6af0ae8cfe684f50", + "hash_cont_tokens": "0556dfc52449460f" + }, + "truncated": 0, + "non_truncated": 1319, + "padded": 0, + "non_padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "3b7fa57a057f9415", + "hash_full_prompts": "63615fc50fc9417c", + "hash_input_tokens": "9c04e828ae29cacc", + "hash_cont_tokens": "268383d1836b5fc4" + }, + "truncated": 0, + "non_truncated": 28659, + "padded": 113460, + "non_padded": 1412, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/HyperbeeAI/Tulpar-7b-v0/results_2023-08-26T12-16-04.808575.json b/eval-results/HyperbeeAI/Tulpar-7b-v0/results_2023-08-26T12-16-04.808575.json new file mode 100644 index 0000000000000000000000000000000000000000..40548bfbee07384e411ed5b4a693083c39ad05b8 --- /dev/null +++ b/eval-results/HyperbeeAI/Tulpar-7b-v0/results_2023-08-26T12-16-04.808575.json @@ -0,0 +1,1366 @@ +{ + "config_general": { + "model_name": "HyperbeeAI/Tulpar-7b-v0", + "model_sha": "d7c2bc52a3ae13571357f51273ae948caf84400e", + "model_dtype": "torch.float16", + "lighteval_sha": "578835f70c499eaf870208de093513e08f864581", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|arc:challenge|25": { + "acc": 0.537542662116041, + "acc_stderr": 0.014570144495075581, + "acc_norm": 0.5631399317406144, + "acc_norm_stderr": 0.01449442158425652 + }, + "harness|hellaswag|10": { + "acc": 0.5981876120294762, + "acc_stderr": 0.0048926244909372205, + "acc_norm": 
0.7900816570404302, + "acc_norm_stderr": 0.004064177814209496 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.33, + "acc_stderr": 0.04725815626252605, + "acc_norm": 0.33, + "acc_norm_stderr": 0.04725815626252605 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.4888888888888889, + "acc_stderr": 0.04318275491977976, + "acc_norm": 0.4888888888888889, + "acc_norm_stderr": 0.04318275491977976 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.45394736842105265, + "acc_stderr": 0.04051646342874142, + "acc_norm": 0.45394736842105265, + "acc_norm_stderr": 0.04051646342874142 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.56, + "acc_stderr": 0.04988876515698589, + "acc_norm": 0.56, + "acc_norm_stderr": 0.04988876515698589 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.5962264150943396, + "acc_stderr": 0.03019761160019795, + "acc_norm": 0.5962264150943396, + "acc_norm_stderr": 0.03019761160019795 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.5416666666666666, + "acc_stderr": 0.04166666666666666, + "acc_norm": 0.5416666666666666, + "acc_norm_stderr": 0.04166666666666666 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.36, + "acc_stderr": 0.04824181513244218, + "acc_norm": 0.36, + "acc_norm_stderr": 0.04824181513244218 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.43, + "acc_stderr": 0.04975698519562428, + "acc_norm": 0.43, + "acc_norm_stderr": 0.04975698519562428 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.37, + "acc_stderr": 0.048523658709391, + "acc_norm": 0.37, + "acc_norm_stderr": 0.048523658709391 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.44508670520231214, + "acc_stderr": 0.03789401760283648, + "acc_norm": 0.44508670520231214, + "acc_norm_stderr": 0.03789401760283648 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.22549019607843138, + "acc_stderr": 0.041583075330832865, + "acc_norm": 0.22549019607843138, + "acc_norm_stderr": 0.041583075330832865 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.6, + "acc_stderr": 0.049236596391733084, + "acc_norm": 0.6, + "acc_norm_stderr": 0.049236596391733084 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.4553191489361702, + "acc_stderr": 0.03255525359340354, + "acc_norm": 0.4553191489361702, + "acc_norm_stderr": 0.03255525359340354 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.2807017543859649, + "acc_stderr": 0.042270544512322004, + "acc_norm": 0.2807017543859649, + "acc_norm_stderr": 0.042270544512322004 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.4689655172413793, + "acc_stderr": 0.04158632762097828, + "acc_norm": 0.4689655172413793, + "acc_norm_stderr": 0.04158632762097828 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.3306878306878307, + "acc_stderr": 0.024229965298425086, + "acc_norm": 0.3306878306878307, + "acc_norm_stderr": 0.024229965298425086 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.3253968253968254, + "acc_stderr": 0.041905964388711366, + "acc_norm": 0.3253968253968254, + "acc_norm_stderr": 0.041905964388711366 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.33, + "acc_stderr": 0.04725815626252605, + "acc_norm": 0.33, + "acc_norm_stderr": 0.04725815626252605 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.5741935483870968, + "acc_stderr": 0.0281291127091659, + "acc_norm": 0.5741935483870968, + "acc_norm_stderr": 
0.0281291127091659 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.3694581280788177, + "acc_stderr": 0.03395970381998573, + "acc_norm": 0.3694581280788177, + "acc_norm_stderr": 0.03395970381998573 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.46, + "acc_stderr": 0.05009082659620332, + "acc_norm": 0.46, + "acc_norm_stderr": 0.05009082659620332 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.7090909090909091, + "acc_stderr": 0.03546563019624336, + "acc_norm": 0.7090909090909091, + "acc_norm_stderr": 0.03546563019624336 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.6767676767676768, + "acc_stderr": 0.03332299921070644, + "acc_norm": 0.6767676767676768, + "acc_norm_stderr": 0.03332299921070644 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.7616580310880829, + "acc_stderr": 0.03074890536390989, + "acc_norm": 0.7616580310880829, + "acc_norm_stderr": 0.03074890536390989 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.5153846153846153, + "acc_stderr": 0.02533900301010651, + "acc_norm": 0.5153846153846153, + "acc_norm_stderr": 0.02533900301010651 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.25925925925925924, + "acc_stderr": 0.02671924078371217, + "acc_norm": 0.25925925925925924, + "acc_norm_stderr": 0.02671924078371217 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.5168067226890757, + "acc_stderr": 0.03246013680375308, + "acc_norm": 0.5168067226890757, + "acc_norm_stderr": 0.03246013680375308 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.2781456953642384, + "acc_stderr": 0.03658603262763744, + "acc_norm": 0.2781456953642384, + "acc_norm_stderr": 0.03658603262763744 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.7302752293577982, + "acc_stderr": 0.01902848671111544, + "acc_norm": 0.7302752293577982, + "acc_norm_stderr": 0.01902848671111544 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.42592592592592593, + "acc_stderr": 0.03372343271653063, + "acc_norm": 0.42592592592592593, + "acc_norm_stderr": 0.03372343271653063 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.7058823529411765, + "acc_stderr": 0.03198001660115072, + "acc_norm": 0.7058823529411765, + "acc_norm_stderr": 0.03198001660115072 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.729957805907173, + "acc_stderr": 0.028900721906293426, + "acc_norm": 0.729957805907173, + "acc_norm_stderr": 0.028900721906293426 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.6233183856502242, + "acc_stderr": 0.032521134899291884, + "acc_norm": 0.6233183856502242, + "acc_norm_stderr": 0.032521134899291884 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.6183206106870229, + "acc_stderr": 0.0426073515764456, + "acc_norm": 0.6183206106870229, + "acc_norm_stderr": 0.0426073515764456 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.6611570247933884, + "acc_stderr": 0.043207678075366705, + "acc_norm": 0.6611570247933884, + "acc_norm_stderr": 0.043207678075366705 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.6296296296296297, + "acc_stderr": 0.04668408033024931, + "acc_norm": 0.6296296296296297, + "acc_norm_stderr": 0.04668408033024931 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.6012269938650306, + "acc_stderr": 0.03847021420456023, + "acc_norm": 0.6012269938650306, + 
"acc_norm_stderr": 0.03847021420456023 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.42857142857142855, + "acc_stderr": 0.04697113923010212, + "acc_norm": 0.42857142857142855, + "acc_norm_stderr": 0.04697113923010212 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.7378640776699029, + "acc_stderr": 0.04354631077260595, + "acc_norm": 0.7378640776699029, + "acc_norm_stderr": 0.04354631077260595 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.7863247863247863, + "acc_stderr": 0.026853450377009168, + "acc_norm": 0.7863247863247863, + "acc_norm_stderr": 0.026853450377009168 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.71, + "acc_stderr": 0.045604802157206845, + "acc_norm": 0.71, + "acc_norm_stderr": 0.045604802157206845 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.7203065134099617, + "acc_stderr": 0.016050792148036532, + "acc_norm": 0.7203065134099617, + "acc_norm_stderr": 0.016050792148036532 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.5635838150289018, + "acc_stderr": 0.026700545424943677, + "acc_norm": 0.5635838150289018, + "acc_norm_stderr": 0.026700545424943677 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.23687150837988827, + "acc_stderr": 0.014219570788103986, + "acc_norm": 0.23687150837988827, + "acc_norm_stderr": 0.014219570788103986 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.5522875816993464, + "acc_stderr": 0.02847293847803353, + "acc_norm": 0.5522875816993464, + "acc_norm_stderr": 0.02847293847803353 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.6012861736334405, + "acc_stderr": 0.0278093225857745, + "acc_norm": 0.6012861736334405, + "acc_norm_stderr": 0.0278093225857745 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.5679012345679012, + "acc_stderr": 0.02756301097160668, + "acc_norm": 0.5679012345679012, + "acc_norm_stderr": 0.02756301097160668 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.3971631205673759, + "acc_stderr": 0.029189805673587102, + "acc_norm": 0.3971631205673759, + "acc_norm_stderr": 0.029189805673587102 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.3924380704041721, + "acc_stderr": 0.01247124366922911, + "acc_norm": 0.3924380704041721, + "acc_norm_stderr": 0.01247124366922911 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.4889705882352941, + "acc_stderr": 0.030365446477275668, + "acc_norm": 0.4889705882352941, + "acc_norm_stderr": 0.030365446477275668 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.5228758169934641, + "acc_stderr": 0.020206653187884786, + "acc_norm": 0.5228758169934641, + "acc_norm_stderr": 0.020206653187884786 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.6, + "acc_stderr": 0.0469237132203465, + "acc_norm": 0.6, + "acc_norm_stderr": 0.0469237132203465 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.6285714285714286, + "acc_stderr": 0.03093285879278984, + "acc_norm": 0.6285714285714286, + "acc_norm_stderr": 0.03093285879278984 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.7114427860696517, + "acc_stderr": 0.03203841040213322, + "acc_norm": 0.7114427860696517, + "acc_norm_stderr": 0.03203841040213322 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.74, + "acc_stderr": 0.044084400227680794, + "acc_norm": 0.74, + "acc_norm_stderr": 0.044084400227680794 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.41566265060240964, + "acc_stderr": 0.03836722176598052, + 
"acc_norm": 0.41566265060240964, + "acc_norm_stderr": 0.03836722176598052 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.7134502923976608, + "acc_stderr": 0.03467826685703826, + "acc_norm": 0.7134502923976608, + "acc_norm_stderr": 0.03467826685703826 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.35862913096695226, + "mc1_stderr": 0.016789289499502025, + "mc2": 0.5168382384839378, + "mc2_stderr": 0.015507413298617018 + }, + "all": { + "acc": 0.5269514916803312, + "acc_stderr": 0.03474932471881244, + "acc_norm": 0.53063778514873, + "acc_norm_stderr": 0.034733999810548935, + "mc1": 0.35862913096695226, + "mc1_stderr": 0.016789289499502025, + "mc2": 0.5168382384839378, + "mc2_stderr": 0.015507413298617018 + } + }, + "versions": { + "harness|arc:challenge|25": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + 
"harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "all": 0 + }, + "config_tasks": { + "harness|arc:challenge": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + 
"harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task" + }, + "summary_tasks": { + "harness|arc:challenge|25": { + "hashes": { + "hash_examples": "17b0cae357c0259e", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "3722289b79076c44", + "hash_cont_tokens": "8210decc6ff6f7df" + }, + "truncated": 0, + "non-truncated": 4687, + "padded": 4687, + "non-padded": 0, + "effective_few_shots": 25.0, + "num_truncated_few_shots": 0 + }, + "harness|hellaswag|10": { + "hashes": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "ececd684171f1ef2", + "hash_cont_tokens": "b3b9e9017afa63af" + }, + "truncated": 0, + "non-truncated": 40168, + "padded": 40113, + "non-padded": 55, + "effective_few_shots": 10.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hashes": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "c54ff61ad0273dd7", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-anatomy|5": { + "hashes": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "be31a1e22aef5f90", + "hash_cont_tokens": "f11971a765cb609f" + }, + "truncated": 0, + "non-truncated": 540, + "padded": 540, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-astronomy|5": { + "hashes": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "277a7b1fad566940", + "hash_cont_tokens": "bf30e5d3f48250cb" + }, + "truncated": 0, + "non-truncated": 608, + "padded": 608, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-business_ethics|5": { + "hashes": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "ba552605bc116de5", + "hash_cont_tokens": "bc1dd9b2d995eb61" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hashes": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "428c7563d0b98ab9", + "hash_cont_tokens": "890a119624b3b935" + }, + "truncated": 0, + "non-truncated": 1060, + "padded": 1060, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_biology|5": { + "hashes": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "da036601573942e2", + "hash_cont_tokens": "875cde3af7a0ee14" + }, + "truncated": 0, + "non-truncated": 576, + "padded": 576, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_chemistry|5": { + "hashes": { + 
"hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "94e0196d6aded13d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_computer_science|5": { + "hashes": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "6e4d0f4a8d36690b", + "hash_cont_tokens": "ffc0fe414cdc4a83" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_mathematics|5": { + "hashes": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "614054d17109a25d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_medicine|5": { + "hashes": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "081bb2b524defd1c", + "hash_cont_tokens": "1f88b00d41957d82" + }, + "truncated": 0, + "non-truncated": 692, + "padded": 692, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_physics|5": { + "hashes": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "5421d9a1af86cbd4", + "hash_cont_tokens": "f7b8097afc16a47c" + }, + "truncated": 0, + "non-truncated": 408, + "padded": 408, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-computer_security|5": { + "hashes": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "5e6b70ecb333cf18", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hashes": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "c2ef11a87264ceed", + "hash_cont_tokens": "aa0e8bc655f2f641" + }, + "truncated": 0, + "non-truncated": 940, + "padded": 940, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-econometrics|5": { + "hashes": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "ecaccd912a4c3978", + "hash_cont_tokens": "bfb7e3c3c88313f1" + }, + "truncated": 0, + "non-truncated": 456, + "padded": 456, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hashes": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "1590c84291399be8", + "hash_cont_tokens": "2425a3f084a591ef" + }, + "truncated": 0, + "non-truncated": 580, + "padded": 580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hashes": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "3269597f715b0da1", + 
"hash_cont_tokens": "f52691aef15a407b" + }, + "truncated": 0, + "non-truncated": 1512, + "padded": 1512, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-formal_logic|5": { + "hashes": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "a2800d20f3ab8d7c", + "hash_cont_tokens": "f515d598d9c21263" + }, + "truncated": 0, + "non-truncated": 504, + "padded": 504, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-global_facts|5": { + "hashes": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "94ed44b3772505ad", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_biology|5": { + "hashes": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "24423acb928db768", + "hash_cont_tokens": "bd85a4156a3613ee" + }, + "truncated": 0, + "non-truncated": 1240, + "padded": 1240, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hashes": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "831ff35c474e5cef", + "hash_cont_tokens": "a95c97af1c14e068" + }, + "truncated": 0, + "non-truncated": 812, + "padded": 812, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hashes": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "a20a96b44dcc5b30", + "hash_cont_tokens": "8abfedef914e33c9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hashes": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "5002f4ac8b1562ca", + "hash_cont_tokens": "674fc454bdc5ac93" + }, + "truncated": 0, + "non-truncated": 660, + "padded": 656, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_geography|5": { + "hashes": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "7c5547c7da5bc793", + "hash_cont_tokens": "03a5012b916274ea" + }, + "truncated": 0, + "non-truncated": 792, + "padded": 792, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hashes": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "f62991cb6a496b05", + "hash_cont_tokens": "a83effb8f76b7d7c" + }, + "truncated": 0, + "non-truncated": 772, + "padded": 772, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hashes": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "4cef2aff6e3d59ed", + "hash_cont_tokens": "c583432ad27fcfe0" + }, + "truncated": 0, + "non-truncated": 1560, + "padded": 
1560, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hashes": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "6e2577ea4082ed2b", + "hash_cont_tokens": "24f5dc613660300b" + }, + "truncated": 0, + "non-truncated": 1080, + "padded": 1080, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hashes": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "c5fc9aeb1079c8e4", + "hash_cont_tokens": "f47f041de50333b9" + }, + "truncated": 0, + "non-truncated": 952, + "padded": 952, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_physics|5": { + "hashes": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "555fc385cffa84ca", + "hash_cont_tokens": "ba2efcd283e938cc" + }, + "truncated": 0, + "non-truncated": 604, + "padded": 604, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hashes": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "febd23cbf9973b7f", + "hash_cont_tokens": "942069cd363844d9" + }, + "truncated": 0, + "non-truncated": 2180, + "padded": 2180, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hashes": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "400e55b56ee6fbd7", + "hash_cont_tokens": "955ed42b6f7fa019" + }, + "truncated": 0, + "non-truncated": 864, + "padded": 864, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hashes": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "c639cce12a46ebad", + "hash_cont_tokens": "cdd0b3dc06d933e5" + }, + "truncated": 0, + "non-truncated": 816, + "padded": 816, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hashes": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "b9762065cce6f3a6", + "hash_cont_tokens": "9a864184946033ac" + }, + "truncated": 0, + "non-truncated": 948, + "padded": 948, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_aging|5": { + "hashes": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "541a75f071dcf579", + "hash_cont_tokens": "142a4a8a1138a214" + }, + "truncated": 0, + "non-truncated": 892, + "padded": 892, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_sexuality|5": { + "hashes": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "04269e5c5a257dd9", + "hash_cont_tokens": "bc54813e809b796d" + }, + "truncated": 0, + "non-truncated": 524, + "padded": 524, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + 
"harness|hendrycksTest-international_law|5": { + "hashes": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "d93ba9d9d38e4397", + "hash_cont_tokens": "dc45b45fcda18e5d" + }, + "truncated": 0, + "non-truncated": 484, + "padded": 484, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-jurisprudence|5": { + "hashes": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "9eeaccd2698b4f5a", + "hash_cont_tokens": "e3a8cd951b6e3469" + }, + "truncated": 0, + "non-truncated": 432, + "padded": 432, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hashes": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "b4f08f544f2b7576", + "hash_cont_tokens": "1e80dbd30f6453d5" + }, + "truncated": 0, + "non-truncated": 652, + "padded": 648, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-machine_learning|5": { + "hashes": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "900c2a51f1174b9f", + "hash_cont_tokens": "9b37da7777378ca9" + }, + "truncated": 0, + "non-truncated": 448, + "padded": 448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-management|5": { + "hashes": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "6b36efb4689c6eca", + "hash_cont_tokens": "a01d6d39a83c4597" + }, + "truncated": 0, + "non-truncated": 412, + "padded": 412, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-marketing|5": { + "hashes": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "2aaac78a0cfed47a", + "hash_cont_tokens": "6aeaed4d823c98aa" + }, + "truncated": 0, + "non-truncated": 936, + "padded": 936, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-medical_genetics|5": { + "hashes": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "886ca823b41c094a", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-miscellaneous|5": { + "hashes": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "72fd71de7675e7d0", + "hash_cont_tokens": "9b0ab02a64603081" + }, + "truncated": 0, + "non-truncated": 3132, + "padded": 3132, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_disputes|5": { + "hashes": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "f3ca0dd8e7a1eb09", + "hash_cont_tokens": "8badf768f7b0467a" + }, + "truncated": 0, + "non-truncated": 1384, + "padded": 1354, + "non-padded": 30, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hashes": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": 
"3e793631e951f23c", + "hash_cont_tokens": "32ae620376b2bbba" + }, + "truncated": 0, + "non-truncated": 3580, + "padded": 3580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-nutrition|5": { + "hashes": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "59753c2144ea93af", + "hash_cont_tokens": "3071def75bacc404" + }, + "truncated": 0, + "non-truncated": 1224, + "padded": 1224, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-philosophy|5": { + "hashes": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "bd8d3dbed15a8c34", + "hash_cont_tokens": "9f6ff69d23a48783" + }, + "truncated": 0, + "non-truncated": 1244, + "padded": 1244, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-prehistory|5": { + "hashes": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "3573cd87facbb7c5", + "hash_cont_tokens": "de469d2b981e32a3" + }, + "truncated": 0, + "non-truncated": 1296, + "padded": 1296, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_accounting|5": { + "hashes": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "17e721bc1a7cbb47", + "hash_cont_tokens": "c46f74d2dfc7b13b" + }, + "truncated": 0, + "non-truncated": 1128, + "padded": 1128, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_law|5": { + "hashes": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "c9f7583fff66d361", + "hash_cont_tokens": "2e590029ef41fbcd" + }, + "truncated": 0, + "non-truncated": 6136, + "padded": 6136, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_medicine|5": { + "hashes": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "40a933f829116f8d", + "hash_cont_tokens": "fe35cfa9c6ca802e" + }, + "truncated": 0, + "non-truncated": 1088, + "padded": 1088, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_psychology|5": { + "hashes": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "0dfb73a8eb3f692c", + "hash_cont_tokens": "f020fbddf72c8652" + }, + "truncated": 0, + "non-truncated": 2448, + "padded": 2448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-public_relations|5": { + "hashes": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "1710c6ba4c9f3cbd", + "hash_cont_tokens": "568f585a259965c1" + }, + "truncated": 0, + "non-truncated": 440, + "padded": 440, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-security_studies|5": { + "hashes": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "32a03f1f22a6e103", + "hash_cont_tokens": "cc6fd7cccd64cd5d" + }, + "truncated": 0, + "non-truncated": 980, + "padded": 980, + "non-padded": 0, + 
"effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-sociology|5": { + "hashes": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "828999f7624cbe7e", + "hash_cont_tokens": "c3a3bdfd177eed5b" + }, + "truncated": 0, + "non-truncated": 804, + "padded": 804, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hashes": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "42054621e718dbee", + "hash_cont_tokens": "2568d0e8e36fa959" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-virology|5": { + "hashes": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "6c4f0aa4dc859c04", + "hash_cont_tokens": "926cf60b0891f374" + }, + "truncated": 0, + "non-truncated": 664, + "padded": 664, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-world_religions|5": { + "hashes": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "6c75d44e092ff24f", + "hash_cont_tokens": "c525a5de974c1ea3" + }, + "truncated": 0, + "non-truncated": 684, + "padded": 684, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|truthfulqa:mc|0": { + "hashes": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "2738d7ed7075faa7", + "hash_cont_tokens": "c014154380b74b9e" + }, + "truncated": 0, + "non-truncated": 9996, + "padded": 9996, + "non-padded": 0, + "effective_few_shots": 0.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "d84d18e9a963753d", + "hash_full_prompts": "12b540783521a8e6", + "hash_input_tokens": "5c73a7dce6ccf737", + "hash_cont_tokens": "fb1646e2bdd5fc38" + }, + "total_evaluation_time_secondes": "4167.701423883438", + "truncated": 0, + "non-truncated": 111019, + "padded": 110926, + "non-padded": 93, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/HyperbeeAI/Tulpar-7b-v0/results_2023-09-16T17-05-33.641696.json b/eval-results/HyperbeeAI/Tulpar-7b-v0/results_2023-09-16T17-05-33.641696.json new file mode 100644 index 0000000000000000000000000000000000000000..3c4db189d30f618bec8d4cba9c52926e7b46975d --- /dev/null +++ b/eval-results/HyperbeeAI/Tulpar-7b-v0/results_2023-09-16T17-05-33.641696.json @@ -0,0 +1,107 @@ +{ + "config_general": { + "model_name": "HyperbeeAI/Tulpar-7b-v0", + "model_sha": "7caa5fc7f6581d0f791b631c890682d73301b49c", + "model_size": "12.61 GB", + "model_dtype": "torch.float16", + "lighteval_sha": "0f318ecf002208468154899217b3ba7c6ae09374", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|drop|3": { + "em": 0.3200503355704698, + "em_stderr": 0.004777351284269766, + "f1": 0.39745910234899495, + "f1_stderr": 0.004660867839676267 + }, + "harness|gsm8k|5": { + "acc": 0.027293404094010616, + "acc_stderr": 0.004488095380209751 + }, + "harness|winogrande|5": { + "acc": 0.7387529597474349, + "acc_stderr": 0.012346914863415308 + }, + "all": { + "em": 0.3200503355704698, + "em_stderr": 0.004777351284269766, 
+ "f1": 0.39745910234899495, + "f1_stderr": 0.004660867839676267, + "acc": 0.38302318192072277, + "acc_stderr": 0.00841750512181253 + } + }, + "versions": { + "harness|drop|3": 1, + "harness|gsm8k|5": 0, + "harness|winogrande|5": 0, + "all": 0 + }, + "config_tasks": { + "harness|drop": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|drop|3": { + "hashes": { + "hash_examples": "1d27416e8324e9a3", + "hash_full_prompts": "a5513ff9a741b385", + "hash_input_tokens": "42076f0efbb50aa6", + "hash_cont_tokens": "d29773ce4a5260f0" + }, + "truncated": 3, + "non-truncated": 9533, + "padded": 0, + "non-padded": 9536, + "effective_few_shots": 3.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "bda342e47b5099b2", + "hash_cont_tokens": "b9156fa9f7d4edd9" + }, + "truncated": 0, + "non-truncated": 1319, + "padded": 0, + "non-padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "c0bedf98cb040854", + "hash_cont_tokens": "f08975ad6f2d5864" + }, + "truncated": 0, + "non-truncated": 2534, + "padded": 2432, + "non-padded": 102, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "9b4d8993161e637d", + "hash_full_prompts": "08215e527b7e60a5", + "hash_input_tokens": "a12f3e3c934bd78b", + "hash_cont_tokens": "1afd503bc6d73bc1" + }, + "total_evaluation_time_secondes": "6232.1419105529785", + "truncated": 3, + "non-truncated": 13386, + "padded": 2432, + "non-padded": 10957, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/HyperbeeAI/Tulpar-7b-v1/results_2023-09-14T07-59-39.326009.json b/eval-results/HyperbeeAI/Tulpar-7b-v1/results_2023-09-14T07-59-39.326009.json new file mode 100644 index 0000000000000000000000000000000000000000..00d73051db5f36fd7e4758d25a99b201da5047ca --- /dev/null +++ b/eval-results/HyperbeeAI/Tulpar-7b-v1/results_2023-09-14T07-59-39.326009.json @@ -0,0 +1,1367 @@ +{ + "config_general": { + "model_name": "HyperbeeAI/Tulpar-7b-v1", + "model_sha": "719d8e1eb4a820f01e0a92ef6220d041964bb472", + "model_size": "12.61 GB", + "model_dtype": "torch.float16", + "lighteval_sha": "ff467795ccc45b291b69333c263d5f16abd1fcd9", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|arc:challenge|25": { + "acc": 0.5418088737201365, + "acc_stderr": 0.014560220308714698, + "acc_norm": 0.5699658703071673, + "acc_norm_stderr": 0.014467631559137988 + }, + "harness|hellaswag|10": { + "acc": 0.6116311491734714, + "acc_stderr": 0.0048638313648480735, + "acc_norm": 0.7968532164907389, + "acc_norm_stderr": 0.004015185891482733 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.31, + "acc_stderr": 0.046482319871173156, + "acc_norm": 0.31, + "acc_norm_stderr": 0.046482319871173156 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.5185185185185185, + "acc_stderr": 0.043163785995113245, + "acc_norm": 0.5185185185185185, + "acc_norm_stderr": 0.043163785995113245 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.48026315789473684, + "acc_stderr": 0.040657710025626036, + "acc_norm": 0.48026315789473684, + 
"acc_norm_stderr": 0.040657710025626036 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.47, + "acc_stderr": 0.05016135580465919, + "acc_norm": 0.47, + "acc_norm_stderr": 0.05016135580465919 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.6, + "acc_stderr": 0.03015113445777628, + "acc_norm": 0.6, + "acc_norm_stderr": 0.03015113445777628 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.5555555555555556, + "acc_stderr": 0.04155319955593146, + "acc_norm": 0.5555555555555556, + "acc_norm_stderr": 0.04155319955593146 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.35, + "acc_stderr": 0.04793724854411019, + "acc_norm": 0.35, + "acc_norm_stderr": 0.04793724854411019 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.38, + "acc_stderr": 0.048783173121456316, + "acc_norm": 0.38, + "acc_norm_stderr": 0.048783173121456316 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.32, + "acc_stderr": 0.04688261722621505, + "acc_norm": 0.32, + "acc_norm_stderr": 0.04688261722621505 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.4797687861271676, + "acc_stderr": 0.03809342081273957, + "acc_norm": 0.4797687861271676, + "acc_norm_stderr": 0.03809342081273957 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.27450980392156865, + "acc_stderr": 0.04440521906179327, + "acc_norm": 0.27450980392156865, + "acc_norm_stderr": 0.04440521906179327 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.62, + "acc_stderr": 0.048783173121456316, + "acc_norm": 0.62, + "acc_norm_stderr": 0.048783173121456316 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.46808510638297873, + "acc_stderr": 0.03261936918467382, + "acc_norm": 0.46808510638297873, + "acc_norm_stderr": 0.03261936918467382 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.3333333333333333, + "acc_stderr": 0.044346007015849245, + "acc_norm": 0.3333333333333333, + "acc_norm_stderr": 0.044346007015849245 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.46206896551724136, + "acc_stderr": 0.041546596717075474, + "acc_norm": 0.46206896551724136, + "acc_norm_stderr": 0.041546596717075474 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.2857142857142857, + "acc_stderr": 0.023266512213730564, + "acc_norm": 0.2857142857142857, + "acc_norm_stderr": 0.023266512213730564 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.25396825396825395, + "acc_stderr": 0.03893259610604675, + "acc_norm": 0.25396825396825395, + "acc_norm_stderr": 0.03893259610604675 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.36, + "acc_stderr": 0.04824181513244218, + "acc_norm": 0.36, + "acc_norm_stderr": 0.04824181513244218 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.567741935483871, + "acc_stderr": 0.028181739720019413, + "acc_norm": 0.567741935483871, + "acc_norm_stderr": 0.028181739720019413 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.3645320197044335, + "acc_stderr": 0.0338640574606209, + "acc_norm": 0.3645320197044335, + "acc_norm_stderr": 0.0338640574606209 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.44, + "acc_stderr": 0.04988876515698589, + "acc_norm": 0.44, + "acc_norm_stderr": 0.04988876515698589 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.7151515151515152, + "acc_stderr": 0.035243908445117815, + "acc_norm": 0.7151515151515152, + "acc_norm_stderr": 
0.035243908445117815 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.6717171717171717, + "acc_stderr": 0.03345678422756775, + "acc_norm": 0.6717171717171717, + "acc_norm_stderr": 0.03345678422756775 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.7357512953367875, + "acc_stderr": 0.03182155050916646, + "acc_norm": 0.7357512953367875, + "acc_norm_stderr": 0.03182155050916646 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.47435897435897434, + "acc_stderr": 0.025317649726448656, + "acc_norm": 0.47435897435897434, + "acc_norm_stderr": 0.025317649726448656 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.24074074074074073, + "acc_stderr": 0.026067159222275815, + "acc_norm": 0.24074074074074073, + "acc_norm_stderr": 0.026067159222275815 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.5084033613445378, + "acc_stderr": 0.0324739027656967, + "acc_norm": 0.5084033613445378, + "acc_norm_stderr": 0.0324739027656967 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.32450331125827814, + "acc_stderr": 0.038227469376587525, + "acc_norm": 0.32450331125827814, + "acc_norm_stderr": 0.038227469376587525 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.726605504587156, + "acc_stderr": 0.01910929984609829, + "acc_norm": 0.726605504587156, + "acc_norm_stderr": 0.01910929984609829 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.375, + "acc_stderr": 0.033016908987210894, + "acc_norm": 0.375, + "acc_norm_stderr": 0.033016908987210894 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.6715686274509803, + "acc_stderr": 0.032962451101722294, + "acc_norm": 0.6715686274509803, + "acc_norm_stderr": 0.032962451101722294 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.7341772151898734, + "acc_stderr": 0.02875679962965834, + "acc_norm": 0.7341772151898734, + "acc_norm_stderr": 0.02875679962965834 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.5919282511210763, + "acc_stderr": 0.03298574607842822, + "acc_norm": 0.5919282511210763, + "acc_norm_stderr": 0.03298574607842822 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.5725190839694656, + "acc_stderr": 0.04338920305792401, + "acc_norm": 0.5725190839694656, + "acc_norm_stderr": 0.04338920305792401 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.6694214876033058, + "acc_stderr": 0.04294340845212094, + "acc_norm": 0.6694214876033058, + "acc_norm_stderr": 0.04294340845212094 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.6481481481481481, + "acc_stderr": 0.04616631111801713, + "acc_norm": 0.6481481481481481, + "acc_norm_stderr": 0.04616631111801713 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.558282208588957, + "acc_stderr": 0.03901591825836184, + "acc_norm": 0.558282208588957, + "acc_norm_stderr": 0.03901591825836184 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.39285714285714285, + "acc_stderr": 0.04635550135609976, + "acc_norm": 0.39285714285714285, + "acc_norm_stderr": 0.04635550135609976 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.6893203883495146, + "acc_stderr": 0.045821241601615506, + "acc_norm": 0.6893203883495146, + "acc_norm_stderr": 0.045821241601615506 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.782051282051282, + "acc_stderr": 0.027046857630716684, + "acc_norm": 0.782051282051282, + "acc_norm_stderr": 
0.027046857630716684 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.59, + "acc_stderr": 0.04943110704237102, + "acc_norm": 0.59, + "acc_norm_stderr": 0.04943110704237102 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.7088122605363985, + "acc_stderr": 0.016246087069701407, + "acc_norm": 0.7088122605363985, + "acc_norm_stderr": 0.016246087069701407 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.5867052023121387, + "acc_stderr": 0.02651126136940924, + "acc_norm": 0.5867052023121387, + "acc_norm_stderr": 0.02651126136940924 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.2558659217877095, + "acc_stderr": 0.014593620923210754, + "acc_norm": 0.2558659217877095, + "acc_norm_stderr": 0.014593620923210754 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.5163398692810458, + "acc_stderr": 0.02861462475280544, + "acc_norm": 0.5163398692810458, + "acc_norm_stderr": 0.02861462475280544 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.594855305466238, + "acc_stderr": 0.027882383791325953, + "acc_norm": 0.594855305466238, + "acc_norm_stderr": 0.027882383791325953 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.5617283950617284, + "acc_stderr": 0.027607914087400473, + "acc_norm": 0.5617283950617284, + "acc_norm_stderr": 0.027607914087400473 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.37943262411347517, + "acc_stderr": 0.028947338851614105, + "acc_norm": 0.37943262411347517, + "acc_norm_stderr": 0.028947338851614105 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.39113428943937417, + "acc_stderr": 0.012463861839982061, + "acc_norm": 0.39113428943937417, + "acc_norm_stderr": 0.012463861839982061 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.4889705882352941, + "acc_stderr": 0.030365446477275675, + "acc_norm": 0.4889705882352941, + "acc_norm_stderr": 0.030365446477275675 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.49673202614379086, + "acc_stderr": 0.020227402794434864, + "acc_norm": 0.49673202614379086, + "acc_norm_stderr": 0.020227402794434864 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.6181818181818182, + "acc_stderr": 0.046534298079135075, + "acc_norm": 0.6181818181818182, + "acc_norm_stderr": 0.046534298079135075 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.5714285714285714, + "acc_stderr": 0.031680911612338825, + "acc_norm": 0.5714285714285714, + "acc_norm_stderr": 0.031680911612338825 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.6716417910447762, + "acc_stderr": 0.033206858897443244, + "acc_norm": 0.6716417910447762, + "acc_norm_stderr": 0.033206858897443244 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.74, + "acc_stderr": 0.04408440022768079, + "acc_norm": 0.74, + "acc_norm_stderr": 0.04408440022768079 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.41566265060240964, + "acc_stderr": 0.03836722176598052, + "acc_norm": 0.41566265060240964, + "acc_norm_stderr": 0.03836722176598052 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.695906432748538, + "acc_stderr": 0.0352821125824523, + "acc_norm": 0.695906432748538, + "acc_norm_stderr": 0.0352821125824523 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.35128518971848227, + "mc1_stderr": 0.0167113581635444, + "mc2": 0.5182558368766056, + "mc2_stderr": 0.01571195010605344 + }, + "all": { + "acc": 0.5154814101902333, + "acc_stderr": 0.03490831850058395, + "acc_norm": 0.5190980044936959, + 
"acc_norm_stderr": 0.034892365378161205, + "mc1": 0.35128518971848227, + "mc1_stderr": 0.0167113581635444, + "mc2": 0.5182558368766056, + "mc2_stderr": 0.01571195010605344 + } + }, + "versions": { + "harness|arc:challenge|25": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "all": 0 + }, + "config_tasks": { + "harness|arc:challenge": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + 
"harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task" + }, + "summary_tasks": { + "harness|arc:challenge|25": { + "hashes": { + 
"hash_examples": "17b0cae357c0259e", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "3722289b79076c44", + "hash_cont_tokens": "e8abf848493b50f7" + }, + "truncated": 0, + "non-truncated": 4687, + "padded": 4687, + "non-padded": 0, + "effective_few_shots": 25.0, + "num_truncated_few_shots": 0 + }, + "harness|hellaswag|10": { + "hashes": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "ececd684171f1ef2", + "hash_cont_tokens": "9fe0a5c42e1532db" + }, + "truncated": 0, + "non-truncated": 40168, + "padded": 40113, + "non-padded": 55, + "effective_few_shots": 10.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hashes": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "c54ff61ad0273dd7", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-anatomy|5": { + "hashes": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "be31a1e22aef5f90", + "hash_cont_tokens": "f11971a765cb609f" + }, + "truncated": 0, + "non-truncated": 540, + "padded": 540, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-astronomy|5": { + "hashes": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "277a7b1fad566940", + "hash_cont_tokens": "440a970fadecdc7b" + }, + "truncated": 0, + "non-truncated": 608, + "padded": 608, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-business_ethics|5": { + "hashes": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "ba552605bc116de5", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hashes": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "428c7563d0b98ab9", + "hash_cont_tokens": "7ecd60c25b9bfe5b" + }, + "truncated": 0, + "non-truncated": 1060, + "padded": 1060, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_biology|5": { + "hashes": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "da036601573942e2", + "hash_cont_tokens": "875cde3af7a0ee14" + }, + "truncated": 0, + "non-truncated": 576, + "padded": 576, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_chemistry|5": { + "hashes": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "94e0196d6aded13d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_computer_science|5": { + "hashes": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "6e4d0f4a8d36690b", + "hash_cont_tokens": "50421e30bef398f9" + }, + 
"truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_mathematics|5": { + "hashes": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "614054d17109a25d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_medicine|5": { + "hashes": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "081bb2b524defd1c", + "hash_cont_tokens": "702fb6d82ff0d6ac" + }, + "truncated": 0, + "non-truncated": 692, + "padded": 692, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_physics|5": { + "hashes": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "5421d9a1af86cbd4", + "hash_cont_tokens": "f7b8097afc16a47c" + }, + "truncated": 0, + "non-truncated": 408, + "padded": 408, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-computer_security|5": { + "hashes": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "5e6b70ecb333cf18", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hashes": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "c2ef11a87264ceed", + "hash_cont_tokens": "aa0e8bc655f2f641" + }, + "truncated": 0, + "non-truncated": 940, + "padded": 940, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-econometrics|5": { + "hashes": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "ecaccd912a4c3978", + "hash_cont_tokens": "b1cc6e7e9fcd3827" + }, + "truncated": 0, + "non-truncated": 456, + "padded": 456, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hashes": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "1590c84291399be8", + "hash_cont_tokens": "2425a3f084a591ef" + }, + "truncated": 0, + "non-truncated": 580, + "padded": 580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hashes": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "3269597f715b0da1", + "hash_cont_tokens": "bd87bf0c060fd925" + }, + "truncated": 0, + "non-truncated": 1512, + "padded": 1512, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-formal_logic|5": { + "hashes": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "a2800d20f3ab8d7c", + "hash_cont_tokens": "eb8932890e0605db" + }, + "truncated": 0, + "non-truncated": 504, + "padded": 504, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + 
"harness|hendrycksTest-global_facts|5": { + "hashes": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "94ed44b3772505ad", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_biology|5": { + "hashes": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "24423acb928db768", + "hash_cont_tokens": "1ddcb86d28cde266" + }, + "truncated": 0, + "non-truncated": 1240, + "padded": 1240, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hashes": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "831ff35c474e5cef", + "hash_cont_tokens": "176c8dcff38c5f8f" + }, + "truncated": 0, + "non-truncated": 812, + "padded": 812, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hashes": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "a20a96b44dcc5b30", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hashes": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "5002f4ac8b1562ca", + "hash_cont_tokens": "674fc454bdc5ac93" + }, + "truncated": 0, + "non-truncated": 660, + "padded": 656, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_geography|5": { + "hashes": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "7c5547c7da5bc793", + "hash_cont_tokens": "03a5012b916274ea" + }, + "truncated": 0, + "non-truncated": 792, + "padded": 792, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hashes": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "f62991cb6a496b05", + "hash_cont_tokens": "873d2aab226ba1d8" + }, + "truncated": 0, + "non-truncated": 772, + "padded": 772, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hashes": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "4cef2aff6e3d59ed", + "hash_cont_tokens": "c583432ad27fcfe0" + }, + "truncated": 0, + "non-truncated": 1560, + "padded": 1560, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hashes": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "6e2577ea4082ed2b", + "hash_cont_tokens": "d7907b61bcb8c123" + }, + "truncated": 0, + "non-truncated": 1080, + "padded": 1080, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hashes": { + 
"hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "c5fc9aeb1079c8e4", + "hash_cont_tokens": "f47f041de50333b9" + }, + "truncated": 0, + "non-truncated": 952, + "padded": 952, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_physics|5": { + "hashes": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "555fc385cffa84ca", + "hash_cont_tokens": "0d56317b3e5eedb5" + }, + "truncated": 0, + "non-truncated": 604, + "padded": 604, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hashes": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "febd23cbf9973b7f", + "hash_cont_tokens": "09ba1243e7390c0f" + }, + "truncated": 0, + "non-truncated": 2180, + "padded": 2180, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hashes": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "400e55b56ee6fbd7", + "hash_cont_tokens": "9cc29889c3d3f77d" + }, + "truncated": 0, + "non-truncated": 864, + "padded": 864, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hashes": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "c639cce12a46ebad", + "hash_cont_tokens": "cdd0b3dc06d933e5" + }, + "truncated": 0, + "non-truncated": 816, + "padded": 816, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hashes": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "b9762065cce6f3a6", + "hash_cont_tokens": "e02816433ff28daf" + }, + "truncated": 0, + "non-truncated": 948, + "padded": 948, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_aging|5": { + "hashes": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "541a75f071dcf579", + "hash_cont_tokens": "142a4a8a1138a214" + }, + "truncated": 0, + "non-truncated": 892, + "padded": 892, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_sexuality|5": { + "hashes": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "04269e5c5a257dd9", + "hash_cont_tokens": "bc54813e809b796d" + }, + "truncated": 0, + "non-truncated": 524, + "padded": 524, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-international_law|5": { + "hashes": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "d93ba9d9d38e4397", + "hash_cont_tokens": "8ea8c5ff76a15bca" + }, + "truncated": 0, + "non-truncated": 484, + "padded": 484, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-jurisprudence|5": { + "hashes": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "9eeaccd2698b4f5a", + 
"hash_cont_tokens": "e3a8cd951b6e3469" + }, + "truncated": 0, + "non-truncated": 432, + "padded": 432, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hashes": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "b4f08f544f2b7576", + "hash_cont_tokens": "3e9e0bdc248fd88a" + }, + "truncated": 0, + "non-truncated": 652, + "padded": 648, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-machine_learning|5": { + "hashes": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "900c2a51f1174b9f", + "hash_cont_tokens": "55b12fb138c6a064" + }, + "truncated": 0, + "non-truncated": 448, + "padded": 448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-management|5": { + "hashes": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "6b36efb4689c6eca", + "hash_cont_tokens": "a01d6d39a83c4597" + }, + "truncated": 0, + "non-truncated": 412, + "padded": 412, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-marketing|5": { + "hashes": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "2aaac78a0cfed47a", + "hash_cont_tokens": "6aeaed4d823c98aa" + }, + "truncated": 0, + "non-truncated": 936, + "padded": 936, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-medical_genetics|5": { + "hashes": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "886ca823b41c094a", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-miscellaneous|5": { + "hashes": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "72fd71de7675e7d0", + "hash_cont_tokens": "9b0ab02a64603081" + }, + "truncated": 0, + "non-truncated": 3132, + "padded": 3132, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_disputes|5": { + "hashes": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "f3ca0dd8e7a1eb09", + "hash_cont_tokens": "3b8bbe9108e55ce9" + }, + "truncated": 0, + "non-truncated": 1384, + "padded": 1354, + "non-padded": 30, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hashes": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "3e793631e951f23c", + "hash_cont_tokens": "3e9bfc0362e97330" + }, + "truncated": 0, + "non-truncated": 3580, + "padded": 3580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-nutrition|5": { + "hashes": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "59753c2144ea93af", + "hash_cont_tokens": "23b2dc6ee2da4cfc" + }, + "truncated": 0, + "non-truncated": 1224, + "padded": 1224, + "non-padded": 0, + "effective_few_shots": 5.0, + 
"num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-philosophy|5": { + "hashes": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "bd8d3dbed15a8c34", + "hash_cont_tokens": "9f6ff69d23a48783" + }, + "truncated": 0, + "non-truncated": 1244, + "padded": 1244, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-prehistory|5": { + "hashes": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "3573cd87facbb7c5", + "hash_cont_tokens": "d6458d743d875837" + }, + "truncated": 0, + "non-truncated": 1296, + "padded": 1296, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_accounting|5": { + "hashes": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "17e721bc1a7cbb47", + "hash_cont_tokens": "922a195f53a35662" + }, + "truncated": 0, + "non-truncated": 1128, + "padded": 1128, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_law|5": { + "hashes": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "c9f7583fff66d361", + "hash_cont_tokens": "2e590029ef41fbcd" + }, + "truncated": 0, + "non-truncated": 6136, + "padded": 6136, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_medicine|5": { + "hashes": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "40a933f829116f8d", + "hash_cont_tokens": "7cfee54dbddd5a98" + }, + "truncated": 0, + "non-truncated": 1088, + "padded": 1088, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_psychology|5": { + "hashes": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "0dfb73a8eb3f692c", + "hash_cont_tokens": "a86677b2a45c20e1" + }, + "truncated": 0, + "non-truncated": 2448, + "padded": 2448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-public_relations|5": { + "hashes": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "1710c6ba4c9f3cbd", + "hash_cont_tokens": "0d756ccaae031757" + }, + "truncated": 0, + "non-truncated": 440, + "padded": 440, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-security_studies|5": { + "hashes": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "32a03f1f22a6e103", + "hash_cont_tokens": "b2229bc2cfbf594b" + }, + "truncated": 0, + "non-truncated": 980, + "padded": 980, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-sociology|5": { + "hashes": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "828999f7624cbe7e", + "hash_cont_tokens": "c3a3bdfd177eed5b" + }, + "truncated": 0, + "non-truncated": 804, + "padded": 804, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hashes": { + "hash_examples": "4a56a01ddca44dca", + 
"hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "42054621e718dbee", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-virology|5": { + "hashes": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "6c4f0aa4dc859c04", + "hash_cont_tokens": "af8b3658088cb37f" + }, + "truncated": 0, + "non-truncated": 664, + "padded": 664, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-world_religions|5": { + "hashes": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "6c75d44e092ff24f", + "hash_cont_tokens": "060118bef6de4e0a" + }, + "truncated": 0, + "non-truncated": 684, + "padded": 684, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|truthfulqa:mc|0": { + "hashes": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "2738d7ed7075faa7", + "hash_cont_tokens": "f5da56a132aab151" + }, + "truncated": 0, + "non-truncated": 9996, + "padded": 9996, + "non-padded": 0, + "effective_few_shots": 0.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "d84d18e9a963753d", + "hash_full_prompts": "12b540783521a8e6", + "hash_input_tokens": "5c73a7dce6ccf737", + "hash_cont_tokens": "71d56183130fecbd" + }, + "total_evaluation_time_secondes": "4269.299047708511", + "truncated": 0, + "non-truncated": 111019, + "padded": 110926, + "non-padded": 93, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/HyperbeeAI/Tulpar-7b-v1/results_2023-10-23T09-02-28.162757.json b/eval-results/HyperbeeAI/Tulpar-7b-v1/results_2023-10-23T09-02-28.162757.json new file mode 100644 index 0000000000000000000000000000000000000000..7f9d91baa940df469a4a42df0101191556134adb --- /dev/null +++ b/eval-results/HyperbeeAI/Tulpar-7b-v1/results_2023-10-23T09-02-28.162757.json @@ -0,0 +1,107 @@ +{ + "config_general": { + "model_name": "HyperbeeAI/Tulpar-7b-v1", + "model_sha": "719d8e1eb4a820f01e0a92ef6220d041964bb472", + "model_size": "12.61 GB", + "model_dtype": "torch.float16", + "lighteval_sha": "0f318ecf002208468154899217b3ba7c6ae09374", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|drop|3": { + "em": 0.2915268456375839, + "em_stderr": 0.004654152691335802, + "f1": 0.3658179530201351, + "f1_stderr": 0.004568137923093851 + }, + "harness|gsm8k|5": { + "acc": 0.006823351023502654, + "acc_stderr": 0.0022675371022544944 + }, + "harness|winogrande|5": { + "acc": 0.7245461720599842, + "acc_stderr": 0.01255569005570953 + }, + "all": { + "em": 0.2915268456375839, + "em_stderr": 0.004654152691335802, + "f1": 0.3658179530201351, + "f1_stderr": 0.004568137923093851, + "acc": 0.3656847615417434, + "acc_stderr": 0.0074116135789820126 + } + }, + "versions": { + "harness|drop|3": 1, + "harness|gsm8k|5": 0, + "harness|winogrande|5": 0, + "all": 0 + }, + "config_tasks": { + "harness|drop": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|drop|3": { + "hashes": { + "hash_examples": "1d27416e8324e9a3", + "hash_full_prompts": "a5513ff9a741b385", + 
"hash_input_tokens": "42076f0efbb50aa6", + "hash_cont_tokens": "cba87215932958ea" + }, + "truncated": 3, + "non-truncated": 9533, + "padded": 0, + "non-padded": 9536, + "effective_few_shots": 3.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "bda342e47b5099b2", + "hash_cont_tokens": "641242c560e29ae7" + }, + "truncated": 0, + "non-truncated": 1319, + "padded": 0, + "non-padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "c0bedf98cb040854", + "hash_cont_tokens": "f08975ad6f2d5864" + }, + "truncated": 0, + "non-truncated": 2534, + "padded": 2432, + "non-padded": 102, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "9b4d8993161e637d", + "hash_full_prompts": "08215e527b7e60a5", + "hash_input_tokens": "a12f3e3c934bd78b", + "hash_cont_tokens": "7914fccfc9477b26" + }, + "total_evaluation_time_secondes": "7387.397860527039", + "truncated": 3, + "non-truncated": 13386, + "padded": 2432, + "non-padded": 10957, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/HyperbeeAI/Tulpar-7b-v2/results_2023-12-08T00-34-52.885019.json b/eval-results/HyperbeeAI/Tulpar-7b-v2/results_2023-12-08T00-34-52.885019.json new file mode 100644 index 0000000000000000000000000000000000000000..705257584159b901dbed873e8b384034cc443bd1 --- /dev/null +++ b/eval-results/HyperbeeAI/Tulpar-7b-v2/results_2023-12-08T00-34-52.885019.json @@ -0,0 +1,1409 @@ +{ + "config_general": { + "lighteval_sha": "0e4607eff593f6f842aeaa0e5fa6760f58b9d1e9", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "", + "start_time": 443096.695976265, + "end_time": 450565.127492432, + "total_evaluation_time_secondes": "7468.431516167009", + "model_name": "HyperbeeAI/Tulpar-7b-v2", + "model_sha": "b466113c7726cfcd98ba602ec4000ae323f112fa", + "model_dtype": "torch.bfloat16", + "model_size": "13.99 GB" + }, + "results": { + "harness|arc:challenge|25": { + "acc": 0.6450511945392492, + "acc_stderr": 0.013983036904094095, + "acc_norm": 0.6749146757679181, + "acc_norm_stderr": 0.013688147309729124 + }, + "harness|hellaswag|10": { + "acc": 0.6728739294961164, + "acc_stderr": 0.0046820489066223174, + "acc_norm": 0.8489344752041426, + "acc_norm_stderr": 0.0035738085511685283 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.27, + "acc_stderr": 0.0446196043338474, + "acc_norm": 0.27, + "acc_norm_stderr": 0.0446196043338474 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.6222222222222222, + "acc_stderr": 0.04188307537595852, + "acc_norm": 0.6222222222222222, + "acc_norm_stderr": 0.04188307537595852 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.7302631578947368, + "acc_stderr": 0.03611780560284898, + "acc_norm": 0.7302631578947368, + "acc_norm_stderr": 0.03611780560284898 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.6, + "acc_stderr": 0.04923659639173309, + "acc_norm": 0.6, + "acc_norm_stderr": 0.04923659639173309 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.690566037735849, + "acc_stderr": 0.028450154794118637, + "acc_norm": 0.690566037735849, + "acc_norm_stderr": 0.028450154794118637 + }, + 
"harness|hendrycksTest-college_biology|5": { + "acc": 0.7152777777777778, + "acc_stderr": 0.03773809990686934, + "acc_norm": 0.7152777777777778, + "acc_norm_stderr": 0.03773809990686934 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.46, + "acc_stderr": 0.05009082659620333, + "acc_norm": 0.46, + "acc_norm_stderr": 0.05009082659620333 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.54, + "acc_stderr": 0.05009082659620333, + "acc_norm": 0.54, + "acc_norm_stderr": 0.05009082659620333 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.34, + "acc_stderr": 0.04760952285695235, + "acc_norm": 0.34, + "acc_norm_stderr": 0.04760952285695235 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.6416184971098265, + "acc_stderr": 0.03656343653353159, + "acc_norm": 0.6416184971098265, + "acc_norm_stderr": 0.03656343653353159 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.38235294117647056, + "acc_stderr": 0.04835503696107223, + "acc_norm": 0.38235294117647056, + "acc_norm_stderr": 0.04835503696107223 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.75, + "acc_stderr": 0.04351941398892446, + "acc_norm": 0.75, + "acc_norm_stderr": 0.04351941398892446 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.5787234042553191, + "acc_stderr": 0.03227834510146268, + "acc_norm": 0.5787234042553191, + "acc_norm_stderr": 0.03227834510146268 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.49122807017543857, + "acc_stderr": 0.04702880432049615, + "acc_norm": 0.49122807017543857, + "acc_norm_stderr": 0.04702880432049615 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.5379310344827586, + "acc_stderr": 0.04154659671707548, + "acc_norm": 0.5379310344827586, + "acc_norm_stderr": 0.04154659671707548 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.3862433862433862, + "acc_stderr": 0.025075981767601684, + "acc_norm": 0.3862433862433862, + "acc_norm_stderr": 0.025075981767601684 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.4126984126984127, + "acc_stderr": 0.04403438954768177, + "acc_norm": 0.4126984126984127, + "acc_norm_stderr": 0.04403438954768177 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.34, + "acc_stderr": 0.04760952285695235, + "acc_norm": 0.34, + "acc_norm_stderr": 0.04760952285695235 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.7741935483870968, + "acc_stderr": 0.023785577884181012, + "acc_norm": 0.7741935483870968, + "acc_norm_stderr": 0.023785577884181012 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.47783251231527096, + "acc_stderr": 0.03514528562175007, + "acc_norm": 0.47783251231527096, + "acc_norm_stderr": 0.03514528562175007 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.71, + "acc_stderr": 0.045604802157206845, + "acc_norm": 0.71, + "acc_norm_stderr": 0.045604802157206845 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.793939393939394, + "acc_stderr": 0.03158415324047711, + "acc_norm": 0.793939393939394, + "acc_norm_stderr": 0.03158415324047711 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.7929292929292929, + "acc_stderr": 0.02886977846026705, + "acc_norm": 0.7929292929292929, + "acc_norm_stderr": 0.02886977846026705 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.8704663212435233, + "acc_stderr": 0.024233532297758733, + "acc_norm": 0.8704663212435233, + 
"acc_norm_stderr": 0.024233532297758733 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.6384615384615384, + "acc_stderr": 0.024359581465396997, + "acc_norm": 0.6384615384615384, + "acc_norm_stderr": 0.024359581465396997 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.3333333333333333, + "acc_stderr": 0.028742040903948482, + "acc_norm": 0.3333333333333333, + "acc_norm_stderr": 0.028742040903948482 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.7016806722689075, + "acc_stderr": 0.029719142876342853, + "acc_norm": 0.7016806722689075, + "acc_norm_stderr": 0.029719142876342853 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.3509933774834437, + "acc_stderr": 0.03896981964257375, + "acc_norm": 0.3509933774834437, + "acc_norm_stderr": 0.03896981964257375 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.8440366972477065, + "acc_stderr": 0.015555802713590172, + "acc_norm": 0.8440366972477065, + "acc_norm_stderr": 0.015555802713590172 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.5092592592592593, + "acc_stderr": 0.034093869469927006, + "acc_norm": 0.5092592592592593, + "acc_norm_stderr": 0.034093869469927006 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.7941176470588235, + "acc_stderr": 0.028379449451588667, + "acc_norm": 0.7941176470588235, + "acc_norm_stderr": 0.028379449451588667 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.7848101265822784, + "acc_stderr": 0.026750826994676187, + "acc_norm": 0.7848101265822784, + "acc_norm_stderr": 0.026750826994676187 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.6905829596412556, + "acc_stderr": 0.03102441174057221, + "acc_norm": 0.6905829596412556, + "acc_norm_stderr": 0.03102441174057221 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.7175572519083969, + "acc_stderr": 0.03948406125768361, + "acc_norm": 0.7175572519083969, + "acc_norm_stderr": 0.03948406125768361 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.7851239669421488, + "acc_stderr": 0.037494924487096966, + "acc_norm": 0.7851239669421488, + "acc_norm_stderr": 0.037494924487096966 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.7592592592592593, + "acc_stderr": 0.04133119440243838, + "acc_norm": 0.7592592592592593, + "acc_norm_stderr": 0.04133119440243838 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.7239263803680982, + "acc_stderr": 0.03512385283705049, + "acc_norm": 0.7239263803680982, + "acc_norm_stderr": 0.03512385283705049 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.5267857142857143, + "acc_stderr": 0.047389751192741546, + "acc_norm": 0.5267857142857143, + "acc_norm_stderr": 0.047389751192741546 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.7961165048543689, + "acc_stderr": 0.039891398595317706, + "acc_norm": 0.7961165048543689, + "acc_norm_stderr": 0.039891398595317706 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.8675213675213675, + "acc_stderr": 0.022209309073165623, + "acc_norm": 0.8675213675213675, + "acc_norm_stderr": 0.022209309073165623 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.68, + "acc_stderr": 0.04688261722621504, + "acc_norm": 0.68, + "acc_norm_stderr": 0.04688261722621504 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.8237547892720306, + "acc_stderr": 0.01362555690799345, + "acc_norm": 0.8237547892720306, + "acc_norm_stderr": 
0.01362555690799345 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.684971098265896, + "acc_stderr": 0.025009313790069716, + "acc_norm": 0.684971098265896, + "acc_norm_stderr": 0.025009313790069716 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.40558659217877097, + "acc_stderr": 0.016421670506339185, + "acc_norm": 0.40558659217877097, + "acc_norm_stderr": 0.016421670506339185 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.6895424836601307, + "acc_stderr": 0.0264930332251459, + "acc_norm": 0.6895424836601307, + "acc_norm_stderr": 0.0264930332251459 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.684887459807074, + "acc_stderr": 0.026385273703464492, + "acc_norm": 0.684887459807074, + "acc_norm_stderr": 0.026385273703464492 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.7191358024691358, + "acc_stderr": 0.025006469755799208, + "acc_norm": 0.7191358024691358, + "acc_norm_stderr": 0.025006469755799208 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.45390070921985815, + "acc_stderr": 0.02970045324729147, + "acc_norm": 0.45390070921985815, + "acc_norm_stderr": 0.02970045324729147 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.4556714471968709, + "acc_stderr": 0.012719949543032212, + "acc_norm": 0.4556714471968709, + "acc_norm_stderr": 0.012719949543032212 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.6580882352941176, + "acc_stderr": 0.028814722422254184, + "acc_norm": 0.6580882352941176, + "acc_norm_stderr": 0.028814722422254184 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.6486928104575164, + "acc_stderr": 0.01931267606578655, + "acc_norm": 0.6486928104575164, + "acc_norm_stderr": 0.01931267606578655 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.6363636363636364, + "acc_stderr": 0.04607582090719976, + "acc_norm": 0.6363636363636364, + "acc_norm_stderr": 0.04607582090719976 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.7142857142857143, + "acc_stderr": 0.0289205832206756, + "acc_norm": 0.7142857142857143, + "acc_norm_stderr": 0.0289205832206756 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.8109452736318408, + "acc_stderr": 0.02768691358801302, + "acc_norm": 0.8109452736318408, + "acc_norm_stderr": 0.02768691358801302 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.83, + "acc_stderr": 0.0377525168068637, + "acc_norm": 0.83, + "acc_norm_stderr": 0.0377525168068637 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.4939759036144578, + "acc_stderr": 0.03892212195333045, + "acc_norm": 0.4939759036144578, + "acc_norm_stderr": 0.03892212195333045 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.8011695906432749, + "acc_stderr": 0.03061111655743253, + "acc_norm": 0.8011695906432749, + "acc_norm_stderr": 0.03061111655743253 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.47613219094247244, + "mc1_stderr": 0.017483547156961578, + "mc2": 0.6364510557327406, + "mc2_stderr": 0.015316470691061521 + }, + "harness|winogrande|5": { + "acc": 0.7947908445146015, + "acc_stderr": 0.01135031570746207 + }, + "harness|gsm8k|5": { + "acc": 0.6360879454131918, + "acc_stderr": 0.013252539227966185 + }, + "all": { + "acc": 0.6339643856948549, + "acc_stderr": 0.032544251757185845, + "acc_norm": 0.6346927587185602, + "acc_norm_stderr": 0.03320666732717101, + "mc1": 0.47613219094247244, + "mc1_stderr": 0.017483547156961578, + "mc2": 0.6364510557327406, + "mc2_stderr": 0.015316470691061521 + } + }, 
+ "versions": { + "all": 0, + "harness|arc:challenge|25": 0, + "harness|gsm8k|5": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "harness|winogrande|5": 0 + }, + "config_tasks": { + "harness|arc:challenge": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + 
"harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|arc:challenge|25": { + "hashes": { + "hash_examples": "17b0cae357c0259e", + "hash_full_prompts": "045cbb916e5145c6", + 
"hash_input_tokens": "9bcd0d1d37471713", + "hash_cont_tokens": "289aa98c400841d8" + }, + "truncated": 0, + "non_truncated": 1172, + "padded": 4670, + "non_padded": 17, + "effective_few_shots": 25.0, + "num_truncated_few_shots": 0 + }, + "harness|hellaswag|10": { + "hashes": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "80b8c6d79740318e", + "hash_cont_tokens": "ac460260c3e6efc9" + }, + "truncated": 0, + "non_truncated": 10042, + "padded": 40101, + "non_padded": 67, + "effective_few_shots": 10.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hashes": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "b813d36287c6556c", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-anatomy|5": { + "hashes": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "09dc2380497f7a47", + "hash_cont_tokens": "a52a4f60d98cbe5c" + }, + "truncated": 0, + "non_truncated": 135, + "padded": 540, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-astronomy|5": { + "hashes": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "68ca3220b0fdd1f3", + "hash_cont_tokens": "10f7d8eeba97841d" + }, + "truncated": 0, + "non_truncated": 152, + "padded": 608, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-business_ethics|5": { + "hashes": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "bd14ef1320de241e", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hashes": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "d96186ab98017c43", + "hash_cont_tokens": "edef9975ba9165b5" + }, + "truncated": 0, + "non_truncated": 265, + "padded": 1060, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_biology|5": { + "hashes": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "424136b34e95b200", + "hash_cont_tokens": "0aa103ec6602280b" + }, + "truncated": 0, + "non_truncated": 144, + "padded": 576, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_chemistry|5": { + "hashes": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "8dd8b80e336bbe54", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_computer_science|5": { + "hashes": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "145d4cef8ca2261d", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + 
"effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_mathematics|5": { + "hashes": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "561995d32d2b25c4", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_medicine|5": { + "hashes": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "6a258a9d4418599c", + "hash_cont_tokens": "1979021dbc698754" + }, + "truncated": 0, + "non_truncated": 173, + "padded": 692, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_physics|5": { + "hashes": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "fa5e0d5b5f97b66a", + "hash_cont_tokens": "7cf7fe2bab00acbd" + }, + "truncated": 0, + "non_truncated": 102, + "padded": 408, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-computer_security|5": { + "hashes": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "07d27397edfae492", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hashes": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "da5e6c3c8eb17da6", + "hash_cont_tokens": "903f64eed2b0d217" + }, + "truncated": 0, + "non_truncated": 235, + "padded": 940, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-econometrics|5": { + "hashes": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "f6ba8e358bdb523e", + "hash_cont_tokens": "721ae6c5302c4bf2" + }, + "truncated": 0, + "non_truncated": 114, + "padded": 456, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hashes": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "b2459da4c5ca8590", + "hash_cont_tokens": "15a738960ed3e587" + }, + "truncated": 0, + "non_truncated": 145, + "padded": 575, + "non_padded": 5, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hashes": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "0b969d9ad706a13a", + "hash_cont_tokens": "c96470462fc71683" + }, + "truncated": 0, + "non_truncated": 378, + "padded": 1512, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-formal_logic|5": { + "hashes": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "02bc3eb5f90da86e", + "hash_cont_tokens": "0e1ce025c9d6ee7e" + }, + "truncated": 0, + "non_truncated": 126, + "padded": 504, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-global_facts|5": { + "hashes": { + "hash_examples": 
"371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "3d5106918bcbeb43", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_biology|5": { + "hashes": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "7b089392db2dabbd", + "hash_cont_tokens": "e34d57f7d3c4ca16" + }, + "truncated": 0, + "non_truncated": 310, + "padded": 1240, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hashes": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "ba90b2ffed1c067d", + "hash_cont_tokens": "e8482d44df4b3740" + }, + "truncated": 0, + "non_truncated": 203, + "padded": 812, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hashes": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "60eeec309ef0717f", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hashes": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "5e5e8bf3808e0ead", + "hash_cont_tokens": "d63e679a49418339" + }, + "truncated": 0, + "non_truncated": 165, + "padded": 656, + "non_padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_geography|5": { + "hashes": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "4da9b741d4e7ea78", + "hash_cont_tokens": "d78483e286d06f1a" + }, + "truncated": 0, + "non_truncated": 198, + "padded": 792, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hashes": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "acb4bc872ac86ed7", + "hash_cont_tokens": "691cdff71ff5fe57" + }, + "truncated": 0, + "non_truncated": 193, + "padded": 772, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hashes": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "840fc6403eb69ab0", + "hash_cont_tokens": "d5ad4c5bdca967ad" + }, + "truncated": 0, + "non_truncated": 390, + "padded": 1560, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hashes": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "3629a7f2cd17faeb", + "hash_cont_tokens": "8f631ca5687dd0d4" + }, + "truncated": 0, + "non_truncated": 270, + "padded": 1080, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hashes": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + 
"hash_input_tokens": "6846f684260e3997", + "hash_cont_tokens": "7321048a28451473" + }, + "truncated": 0, + "non_truncated": 238, + "padded": 952, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_physics|5": { + "hashes": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "85aee25d6bdad94a", + "hash_cont_tokens": "bb137581f269861c" + }, + "truncated": 0, + "non_truncated": 151, + "padded": 604, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hashes": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "290b66d6d666a35f", + "hash_cont_tokens": "b455cab2675bd863" + }, + "truncated": 0, + "non_truncated": 545, + "padded": 2180, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hashes": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "a77a7668b437bc82", + "hash_cont_tokens": "1b3196fec7e58037" + }, + "truncated": 0, + "non_truncated": 216, + "padded": 864, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hashes": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "63548c7fa9ba7a78", + "hash_cont_tokens": "a331dedc2aa01b3e" + }, + "truncated": 0, + "non_truncated": 204, + "padded": 816, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hashes": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "83c5da18bfa50812", + "hash_cont_tokens": "d0fbe030b8c8c2bf" + }, + "truncated": 0, + "non_truncated": 237, + "padded": 948, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_aging|5": { + "hashes": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "bebbd11f22006685", + "hash_cont_tokens": "1dd29c3755494850" + }, + "truncated": 0, + "non_truncated": 223, + "padded": 892, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_sexuality|5": { + "hashes": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "7b85ee9b8ee54f4f", + "hash_cont_tokens": "c85573f663c10691" + }, + "truncated": 0, + "non_truncated": 131, + "padded": 524, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-international_law|5": { + "hashes": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "7bfc55ab7065943e", + "hash_cont_tokens": "d263804ba918154f" + }, + "truncated": 0, + "non_truncated": 121, + "padded": 484, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-jurisprudence|5": { + "hashes": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "69573f1675e053c6", + "hash_cont_tokens": "581986691a84ece8" + }, + "truncated": 0, + "non_truncated": 108, + 
"padded": 432, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hashes": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "552324ef20094bdc", + "hash_cont_tokens": "55a858b28bbda458" + }, + "truncated": 0, + "non_truncated": 163, + "padded": 652, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-machine_learning|5": { + "hashes": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "96449357a7318905", + "hash_cont_tokens": "e99d3d3efd4ac7a3" + }, + "truncated": 0, + "non_truncated": 112, + "padded": 448, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-management|5": { + "hashes": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "3b849249168e3b88", + "hash_cont_tokens": "13d9dc56bca34726" + }, + "truncated": 0, + "non_truncated": 103, + "padded": 412, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-marketing|5": { + "hashes": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "af0e186f2756b70d", + "hash_cont_tokens": "2700ea26933916a2" + }, + "truncated": 0, + "non_truncated": 234, + "padded": 936, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-medical_genetics|5": { + "hashes": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "9f6a6de16509b6d9", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-miscellaneous|5": { + "hashes": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "9194406d589f7c10", + "hash_cont_tokens": "7bf4341c79587250" + }, + "truncated": 0, + "non_truncated": 783, + "padded": 3132, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_disputes|5": { + "hashes": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "769486efc74d9f8e", + "hash_cont_tokens": "38a48e9de6976f00" + }, + "truncated": 0, + "non_truncated": 346, + "padded": 1384, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hashes": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "a90fd4dd90959dad", + "hash_cont_tokens": "761c4dc187689d89" + }, + "truncated": 0, + "non_truncated": 895, + "padded": 3580, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-nutrition|5": { + "hashes": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "1a3b843e66efd29b", + "hash_cont_tokens": "65005bd7d6f6012a" + }, + "truncated": 0, + "non_truncated": 306, + "padded": 1224, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-philosophy|5": { + "hashes": { + "hash_examples": 
"9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "09820001a3d00013", + "hash_cont_tokens": "0b47934fb6314dec" + }, + "truncated": 0, + "non_truncated": 311, + "padded": 1244, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-prehistory|5": { + "hashes": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "7c4ec364ce2768c7", + "hash_cont_tokens": "3f20acd855ee0a29" + }, + "truncated": 0, + "non_truncated": 324, + "padded": 1296, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_accounting|5": { + "hashes": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "ced0534574d0ae3f", + "hash_cont_tokens": "8f122ba881355d4b" + }, + "truncated": 0, + "non_truncated": 282, + "padded": 1128, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_law|5": { + "hashes": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "bcbdbbde22ec73e3", + "hash_cont_tokens": "90d5df417c4d3fd3" + }, + "truncated": 0, + "non_truncated": 1534, + "padded": 6136, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_medicine|5": { + "hashes": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "c54d753563114d45", + "hash_cont_tokens": "4a2d2988884f7f70" + }, + "truncated": 0, + "non_truncated": 272, + "padded": 1088, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_psychology|5": { + "hashes": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "b75dc55c0e32fa52", + "hash_cont_tokens": "e0a952cb8a9c81de" + }, + "truncated": 0, + "non_truncated": 612, + "padded": 2448, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-public_relations|5": { + "hashes": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "5ccdc8ec8db99622", + "hash_cont_tokens": "1fa77a8dff3922b8" + }, + "truncated": 0, + "non_truncated": 110, + "padded": 440, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-security_studies|5": { + "hashes": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "ca8497342e5b1d57", + "hash_cont_tokens": "81fc9cb3cbdd52db" + }, + "truncated": 0, + "non_truncated": 245, + "padded": 980, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-sociology|5": { + "hashes": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "069c76424fbd3dab", + "hash_cont_tokens": "2a0493252ed2cf43" + }, + "truncated": 0, + "non_truncated": 201, + "padded": 804, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hashes": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "a7e393a626169576", + "hash_cont_tokens": "17b868b63507f9a3" 
+ }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-virology|5": { + "hashes": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "bf99dc973e3a650d", + "hash_cont_tokens": "5ab892d003b00c98" + }, + "truncated": 0, + "non_truncated": 166, + "padded": 664, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-world_religions|5": { + "hashes": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "1761cfaf21797065", + "hash_cont_tokens": "15a5e5dbdfbb8568" + }, + "truncated": 0, + "non_truncated": 171, + "padded": 684, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|truthfulqa:mc|0": { + "hashes": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "298b43914bbdf4ca", + "hash_cont_tokens": "5a8d4bb398b1c3c0" + }, + "truncated": 0, + "non_truncated": 817, + "padded": 9996, + "non_padded": 0, + "effective_few_shots": 0.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "31aa3477d959f771", + "hash_cont_tokens": "618558fb93c0f288" + }, + "truncated": 0, + "non_truncated": 1267, + "padded": 2534, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "6af0ae8cfe684f50", + "hash_cont_tokens": "74810975b096f9e3" + }, + "truncated": 0, + "non_truncated": 1319, + "padded": 0, + "non_padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "3b7fa57a057f9415", + "hash_full_prompts": "63615fc50fc9417c", + "hash_input_tokens": "9c04e828ae29cacc", + "hash_cont_tokens": "f1bda73e39d57383" + }, + "truncated": 0, + "non_truncated": 28659, + "padded": 113460, + "non_padded": 1412, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/IDEA-CCNL/Ziya-LLaMA-13B-Pretrain-v1/results_2023-07-18T14-27-13.663491.json b/eval-results/IDEA-CCNL/Ziya-LLaMA-13B-Pretrain-v1/results_2023-07-18T14-27-13.663491.json new file mode 100644 index 0000000000000000000000000000000000000000..0c00bfeec43cf07848219ec0741ab107caae7019 --- /dev/null +++ b/eval-results/IDEA-CCNL/Ziya-LLaMA-13B-Pretrain-v1/results_2023-07-18T14-27-13.663491.json @@ -0,0 +1,871 @@ +{ + "results": { + "harness|arc:challenge|25": { + "acc": 0.20648464163822525, + "acc_stderr": 0.011828865619002316, + "acc_norm": 0.27986348122866894, + "acc_norm_stderr": 0.013119040897725923 + }, + "harness|hellaswag|10": { + "acc": 0.25492929695279826, + "acc_stderr": 0.004349307702735164, + "acc_norm": 0.26000796654052977, + "acc_norm_stderr": 0.004377421493297837 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.21, + "acc_stderr": 0.040936018074033256, + "acc_norm": 0.21, + "acc_norm_stderr": 0.040936018074033256 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.22962962962962963, + "acc_stderr": 0.03633384414073461, + "acc_norm": 0.22962962962962963, + "acc_norm_stderr": 0.03633384414073461 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 
0.3355263157894737, + "acc_stderr": 0.03842498559395268, + "acc_norm": 0.3355263157894737, + "acc_norm_stderr": 0.03842498559395268 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.21, + "acc_stderr": 0.040936018074033256, + "acc_norm": 0.21, + "acc_norm_stderr": 0.040936018074033256 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.2981132075471698, + "acc_stderr": 0.028152837942493857, + "acc_norm": 0.2981132075471698, + "acc_norm_stderr": 0.028152837942493857 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.2638888888888889, + "acc_stderr": 0.03685651095897532, + "acc_norm": 0.2638888888888889, + "acc_norm_stderr": 0.03685651095897532 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.41, + "acc_stderr": 0.049431107042371025, + "acc_norm": 0.41, + "acc_norm_stderr": 0.049431107042371025 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.33, + "acc_stderr": 0.04725815626252604, + "acc_norm": 0.33, + "acc_norm_stderr": 0.04725815626252604 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.31, + "acc_stderr": 0.04648231987117316, + "acc_norm": 0.31, + "acc_norm_stderr": 0.04648231987117316 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.3352601156069364, + "acc_stderr": 0.03599586301247078, + "acc_norm": 0.3352601156069364, + "acc_norm_stderr": 0.03599586301247078 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.37254901960784315, + "acc_stderr": 0.04810840148082633, + "acc_norm": 0.37254901960784315, + "acc_norm_stderr": 0.04810840148082633 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.18, + "acc_stderr": 0.038612291966536955, + "acc_norm": 0.18, + "acc_norm_stderr": 0.038612291966536955 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.20851063829787234, + "acc_stderr": 0.026556982117838728, + "acc_norm": 0.20851063829787234, + "acc_norm_stderr": 0.026556982117838728 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.23684210526315788, + "acc_stderr": 0.039994238792813344, + "acc_norm": 0.23684210526315788, + "acc_norm_stderr": 0.039994238792813344 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.2413793103448276, + "acc_stderr": 0.03565998174135302, + "acc_norm": 0.2413793103448276, + "acc_norm_stderr": 0.03565998174135302 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.2671957671957672, + "acc_stderr": 0.022789673145776564, + "acc_norm": 0.2671957671957672, + "acc_norm_stderr": 0.022789673145776564 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.36507936507936506, + "acc_stderr": 0.04306241259127153, + "acc_norm": 0.36507936507936506, + "acc_norm_stderr": 0.04306241259127153 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.18, + "acc_stderr": 0.03861229196653694, + "acc_norm": 0.18, + "acc_norm_stderr": 0.03861229196653694 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.3161290322580645, + "acc_stderr": 0.02645087448904277, + "acc_norm": 0.3161290322580645, + "acc_norm_stderr": 0.02645087448904277 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.28078817733990147, + "acc_stderr": 0.03161856335358609, + "acc_norm": 0.28078817733990147, + "acc_norm_stderr": 0.03161856335358609 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.19, + "acc_stderr": 0.039427724440366234, + "acc_norm": 0.19, + "acc_norm_stderr": 0.039427724440366234 + }, + "harness|hendrycksTest-high_school_european_history|5": { + 
"acc": 0.2545454545454545, + "acc_stderr": 0.03401506715249039, + "acc_norm": 0.2545454545454545, + "acc_norm_stderr": 0.03401506715249039 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.35353535353535354, + "acc_stderr": 0.03406086723547153, + "acc_norm": 0.35353535353535354, + "acc_norm_stderr": 0.03406086723547153 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.36787564766839376, + "acc_stderr": 0.03480175668466036, + "acc_norm": 0.36787564766839376, + "acc_norm_stderr": 0.03480175668466036 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.3641025641025641, + "acc_stderr": 0.02439667298509477, + "acc_norm": 0.3641025641025641, + "acc_norm_stderr": 0.02439667298509477 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.26296296296296295, + "acc_stderr": 0.026842057873833706, + "acc_norm": 0.26296296296296295, + "acc_norm_stderr": 0.026842057873833706 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.3487394957983193, + "acc_stderr": 0.03095663632856655, + "acc_norm": 0.3487394957983193, + "acc_norm_stderr": 0.03095663632856655 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.33112582781456956, + "acc_stderr": 0.038425817186598696, + "acc_norm": 0.33112582781456956, + "acc_norm_stderr": 0.038425817186598696 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.3486238532110092, + "acc_stderr": 0.020431254090714328, + "acc_norm": 0.3486238532110092, + "acc_norm_stderr": 0.020431254090714328 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.4722222222222222, + "acc_stderr": 0.0340470532865388, + "acc_norm": 0.4722222222222222, + "acc_norm_stderr": 0.0340470532865388 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.2549019607843137, + "acc_stderr": 0.030587591351604246, + "acc_norm": 0.2549019607843137, + "acc_norm_stderr": 0.030587591351604246 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.20253164556962025, + "acc_stderr": 0.026160568246601457, + "acc_norm": 0.20253164556962025, + "acc_norm_stderr": 0.026160568246601457 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.10762331838565023, + "acc_stderr": 0.020799400082879997, + "acc_norm": 0.10762331838565023, + "acc_norm_stderr": 0.020799400082879997 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.2824427480916031, + "acc_stderr": 0.03948406125768361, + "acc_norm": 0.2824427480916031, + "acc_norm_stderr": 0.03948406125768361 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.14049586776859505, + "acc_stderr": 0.03172233426002161, + "acc_norm": 0.14049586776859505, + "acc_norm_stderr": 0.03172233426002161 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.21296296296296297, + "acc_stderr": 0.0395783547198098, + "acc_norm": 0.21296296296296297, + "acc_norm_stderr": 0.0395783547198098 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.2331288343558282, + "acc_stderr": 0.033220157957767414, + "acc_norm": 0.2331288343558282, + "acc_norm_stderr": 0.033220157957767414 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.16071428571428573, + "acc_stderr": 0.03485946096475741, + "acc_norm": 0.16071428571428573, + "acc_norm_stderr": 0.03485946096475741 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.3786407766990291, + "acc_stderr": 0.04802694698258972, + "acc_norm": 0.3786407766990291, + "acc_norm_stderr": 0.04802694698258972 + }, + 
"harness|hendrycksTest-marketing|5": { + "acc": 0.19658119658119658, + "acc_stderr": 0.02603538609895129, + "acc_norm": 0.19658119658119658, + "acc_norm_stderr": 0.02603538609895129 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.24, + "acc_stderr": 0.04292346959909281, + "acc_norm": 0.24, + "acc_norm_stderr": 0.04292346959909281 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.20434227330779056, + "acc_stderr": 0.0144191239809319, + "acc_norm": 0.20434227330779056, + "acc_norm_stderr": 0.0144191239809319 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.2138728323699422, + "acc_stderr": 0.022075709251757183, + "acc_norm": 0.2138728323699422, + "acc_norm_stderr": 0.022075709251757183 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.27262569832402234, + "acc_stderr": 0.014893391735249588, + "acc_norm": 0.27262569832402234, + "acc_norm_stderr": 0.014893391735249588 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.29411764705882354, + "acc_stderr": 0.02609016250427905, + "acc_norm": 0.29411764705882354, + "acc_norm_stderr": 0.02609016250427905 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.24115755627009647, + "acc_stderr": 0.024296594034763426, + "acc_norm": 0.24115755627009647, + "acc_norm_stderr": 0.024296594034763426 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.22530864197530864, + "acc_stderr": 0.023246202647819746, + "acc_norm": 0.22530864197530864, + "acc_norm_stderr": 0.023246202647819746 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.24113475177304963, + "acc_stderr": 0.025518731049537762, + "acc_norm": 0.24113475177304963, + "acc_norm_stderr": 0.025518731049537762 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.24445893089960888, + "acc_stderr": 0.010976425013113886, + "acc_norm": 0.24445893089960888, + "acc_norm_stderr": 0.010976425013113886 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.4485294117647059, + "acc_stderr": 0.030211479609121593, + "acc_norm": 0.4485294117647059, + "acc_norm_stderr": 0.030211479609121593 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.2173202614379085, + "acc_stderr": 0.01668482092914859, + "acc_norm": 0.2173202614379085, + "acc_norm_stderr": 0.01668482092914859 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.22727272727272727, + "acc_stderr": 0.04013964554072774, + "acc_norm": 0.22727272727272727, + "acc_norm_stderr": 0.04013964554072774 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.4, + "acc_stderr": 0.031362502409358936, + "acc_norm": 0.4, + "acc_norm_stderr": 0.031362502409358936 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.26865671641791045, + "acc_stderr": 0.03134328358208954, + "acc_norm": 0.26865671641791045, + "acc_norm_stderr": 0.03134328358208954 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.26, + "acc_stderr": 0.04408440022768078, + "acc_norm": 0.26, + "acc_norm_stderr": 0.04408440022768078 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.1927710843373494, + "acc_stderr": 0.030709824050565274, + "acc_norm": 0.1927710843373494, + "acc_norm_stderr": 0.030709824050565274 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.17543859649122806, + "acc_stderr": 0.029170885500727654, + "acc_norm": 0.17543859649122806, + "acc_norm_stderr": 0.029170885500727654 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.26438188494492043, + "mc1_stderr": 0.015438211119522496, + "mc2": 0.48589836094381605, + 
"mc2_stderr": 0.015747574476903958 + }, + "all": { + "acc": 0.26903507830363266, + "acc_stderr": 0.03177080282700087, + "acc_norm": 0.2703648665947882, + "acc_norm_stderr": 0.03179314670953114, + "mc1": 0.26438188494492043, + "mc1_stderr": 0.015438211119522496, + "mc2": 0.48589836094381605, + "mc2_stderr": 0.015747574476903958 + } + }, + "versions": { + "harness|arc:challenge|25": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "all": 0 + }, + "config": { + "model_name": "IDEA-CCNL/Ziya-LLaMA-13B-Pretrain-v1", + "model_sha": "826e83e411df32f358893ab21f5eae680499ae9a", + "model_dtype": "torch.float16", + "lighteval_sha": "43cff840721bd0214adb4e29236a5e2ca1813937", + 
"num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null + }, + "task_config": { + "harness|arc:challenge": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + 
"harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task" + }, + "hashes": { + "harness|arc:challenge|25": { + "hash_examples": "fb8c51b1872daeda", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "61571bf68d6d89aa", + "hash_cont_tokens": "8210decc6ff6f7df" + }, + "harness|hellaswag|10": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "29906669b1c7054a", + "hash_cont_tokens": "b3b9e9017afa63af" + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "c54ff61ad0273dd7", + "hash_cont_tokens": "50421e30bef398f9" + }, + "harness|hendrycksTest-anatomy|5": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "be31a1e22aef5f90", + "hash_cont_tokens": "f11971a765cb609f" + }, + "harness|hendrycksTest-astronomy|5": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "277a7b1fad566940", + "hash_cont_tokens": "bf30e5d3f48250cb" + }, + "harness|hendrycksTest-business_ethics|5": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "ba552605bc116de5", + "hash_cont_tokens": "bc1dd9b2d995eb61" + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "428c7563d0b98ab9", + "hash_cont_tokens": "890a119624b3b935" + }, + "harness|hendrycksTest-college_biology|5": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "da036601573942e2", + "hash_cont_tokens": "875cde3af7a0ee14" + }, + "harness|hendrycksTest-college_chemistry|5": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "94e0196d6aded13d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "harness|hendrycksTest-college_computer_science|5": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "6e4d0f4a8d36690b", + "hash_cont_tokens": "ffc0fe414cdc4a83" + }, + "harness|hendrycksTest-college_mathematics|5": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "614054d17109a25d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "harness|hendrycksTest-college_medicine|5": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "1d633b3cc0524ba8", + "hash_cont_tokens": "1f88b00d41957d82" + }, + "harness|hendrycksTest-college_physics|5": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "5421d9a1af86cbd4", + "hash_cont_tokens": "f7b8097afc16a47c" + }, + "harness|hendrycksTest-computer_security|5": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "5e6b70ecb333cf18", + "hash_cont_tokens": "50421e30bef398f9" + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hash_examples": "8874ece872d2ca4c", + 
"hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "c2ef11a87264ceed", + "hash_cont_tokens": "aa0e8bc655f2f641" + }, + "harness|hendrycksTest-econometrics|5": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "ecaccd912a4c3978", + "hash_cont_tokens": "bfb7e3c3c88313f1" + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "1590c84291399be8", + "hash_cont_tokens": "2425a3f084a591ef" + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "3269597f715b0da1", + "hash_cont_tokens": "f52691aef15a407b" + }, + "harness|hendrycksTest-formal_logic|5": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "a2800d20f3ab8d7c", + "hash_cont_tokens": "f515d598d9c21263" + }, + "harness|hendrycksTest-global_facts|5": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "94ed44b3772505ad", + "hash_cont_tokens": "50421e30bef398f9" + }, + "harness|hendrycksTest-high_school_biology|5": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "24423acb928db768", + "hash_cont_tokens": "bd85a4156a3613ee" + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "831ff35c474e5cef", + "hash_cont_tokens": "a95c97af1c14e068" + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "8c34e0f2bda77358", + "hash_cont_tokens": "8abfedef914e33c9" + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "f1f73dd687da18d7", + "hash_cont_tokens": "674fc454bdc5ac93" + }, + "harness|hendrycksTest-high_school_geography|5": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "7c5547c7da5bc793", + "hash_cont_tokens": "03a5012b916274ea" + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "f62991cb6a496b05", + "hash_cont_tokens": "a83effb8f76b7d7c" + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "4cef2aff6e3d59ed", + "hash_cont_tokens": "c583432ad27fcfe0" + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "6e2577ea4082ed2b", + "hash_cont_tokens": "24f5dc613660300b" + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "c5fc9aeb1079c8e4", + "hash_cont_tokens": "f47f041de50333b9" + }, + "harness|hendrycksTest-high_school_physics|5": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "555fc385cffa84ca", + "hash_cont_tokens": "ba2efcd283e938cc" + }, + "harness|hendrycksTest-high_school_psychology|5": { 
+ "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "febd23cbf9973b7f", + "hash_cont_tokens": "942069cd363844d9" + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "424b02981230ee83", + "hash_cont_tokens": "955ed42b6f7fa019" + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "50c9ff438c85a69e", + "hash_cont_tokens": "cdd0b3dc06d933e5" + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "054824cc474caef5", + "hash_cont_tokens": "9a864184946033ac" + }, + "harness|hendrycksTest-human_aging|5": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "541a75f071dcf579", + "hash_cont_tokens": "142a4a8a1138a214" + }, + "harness|hendrycksTest-human_sexuality|5": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "04269e5c5a257dd9", + "hash_cont_tokens": "bc54813e809b796d" + }, + "harness|hendrycksTest-international_law|5": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "d93ba9d9d38e4397", + "hash_cont_tokens": "dc45b45fcda18e5d" + }, + "harness|hendrycksTest-jurisprudence|5": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "9eeaccd2698b4f5a", + "hash_cont_tokens": "e3a8cd951b6e3469" + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "b4f08f544f2b7576", + "hash_cont_tokens": "1e80dbd30f6453d5" + }, + "harness|hendrycksTest-machine_learning|5": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "900c2a51f1174b9f", + "hash_cont_tokens": "9b37da7777378ca9" + }, + "harness|hendrycksTest-management|5": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "6b36efb4689c6eca", + "hash_cont_tokens": "a01d6d39a83c4597" + }, + "harness|hendrycksTest-marketing|5": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "2aaac78a0cfed47a", + "hash_cont_tokens": "6aeaed4d823c98aa" + }, + "harness|hendrycksTest-medical_genetics|5": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "886ca823b41c094a", + "hash_cont_tokens": "50421e30bef398f9" + }, + "harness|hendrycksTest-miscellaneous|5": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "72fd71de7675e7d0", + "hash_cont_tokens": "9b0ab02a64603081" + }, + "harness|hendrycksTest-moral_disputes|5": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "f3ca0dd8e7a1eb09", + "hash_cont_tokens": "8badf768f7b0467a" + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "3e793631e951f23c", + "hash_cont_tokens": "32ae620376b2bbba" + }, + "harness|hendrycksTest-nutrition|5": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": 
"4c0e68e3586cb453", + "hash_input_tokens": "59753c2144ea93af", + "hash_cont_tokens": "3071def75bacc404" + }, + "harness|hendrycksTest-philosophy|5": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "bd8d3dbed15a8c34", + "hash_cont_tokens": "9f6ff69d23a48783" + }, + "harness|hendrycksTest-prehistory|5": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "3573cd87facbb7c5", + "hash_cont_tokens": "de469d2b981e32a3" + }, + "harness|hendrycksTest-professional_accounting|5": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "17e721bc1a7cbb47", + "hash_cont_tokens": "c46f74d2dfc7b13b" + }, + "harness|hendrycksTest-professional_law|5": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "9178e10bd0763ec4", + "hash_cont_tokens": "2e590029ef41fbcd" + }, + "harness|hendrycksTest-professional_medicine|5": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "f5a22012a54f70ea", + "hash_cont_tokens": "fe35cfa9c6ca802e" + }, + "harness|hendrycksTest-professional_psychology|5": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "0dfb73a8eb3f692c", + "hash_cont_tokens": "f020fbddf72c8652" + }, + "harness|hendrycksTest-public_relations|5": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "1710c6ba4c9f3cbd", + "hash_cont_tokens": "568f585a259965c1" + }, + "harness|hendrycksTest-security_studies|5": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "d49711415961ced7", + "hash_cont_tokens": "cc6fd7cccd64cd5d" + }, + "harness|hendrycksTest-sociology|5": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "828999f7624cbe7e", + "hash_cont_tokens": "c3a3bdfd177eed5b" + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "42054621e718dbee", + "hash_cont_tokens": "2568d0e8e36fa959" + }, + "harness|hendrycksTest-virology|5": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "6c4f0aa4dc859c04", + "hash_cont_tokens": "926cf60b0891f374" + }, + "harness|hendrycksTest-world_religions|5": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "6c75d44e092ff24f", + "hash_cont_tokens": "c525a5de974c1ea3" + }, + "harness|truthfulqa:mc|0": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "2738d7ed7075faa7", + "hash_cont_tokens": "c014154380b74b9e" + } + } +} \ No newline at end of file diff --git a/eval-results/IDEA-CCNL/Ziya-LLaMA-13B-Pretrain-v1/results_2023-10-13T07-31-46.021134.json b/eval-results/IDEA-CCNL/Ziya-LLaMA-13B-Pretrain-v1/results_2023-10-13T07-31-46.021134.json new file mode 100644 index 0000000000000000000000000000000000000000..4a130ea1e9d4cdb4f16d9623edaee937d299b03a --- /dev/null +++ b/eval-results/IDEA-CCNL/Ziya-LLaMA-13B-Pretrain-v1/results_2023-10-13T07-31-46.021134.json @@ -0,0 +1,107 @@ +{ + "config_general": { + "model_name": "IDEA-CCNL/Ziya-LLaMA-13B-Pretrain-v1", + "model_sha": "826e83e411df32f358893ab21f5eae680499ae9a", + "model_size": 
"24.42 GB", + "model_dtype": "torch.float16", + "lighteval_sha": "0f318ecf002208468154899217b3ba7c6ae09374", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|drop|3": { + "em": 0.0, + "em_stderr": 0.0, + "f1": 0.0, + "f1_stderr": 0.0 + }, + "harness|gsm8k|5": { + "acc": 0.0, + "acc_stderr": 0.0 + }, + "harness|winogrande|5": { + "acc": 0.5011838989739542, + "acc_stderr": 0.014052446290529019 + }, + "all": { + "em": 0.0, + "em_stderr": 0.0, + "f1": 0.0, + "f1_stderr": 0.0, + "acc": 0.2505919494869771, + "acc_stderr": 0.0070262231452645095 + } + }, + "versions": { + "harness|drop|3": 1, + "harness|gsm8k|5": 0, + "harness|winogrande|5": 0, + "all": 0 + }, + "config_tasks": { + "harness|drop": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|drop|3": { + "hashes": { + "hash_examples": "1d27416e8324e9a3", + "hash_full_prompts": "a5513ff9a741b385", + "hash_input_tokens": "db6b0c12edc29d2e", + "hash_cont_tokens": "17bd80b3e3064def" + }, + "truncated": 1263, + "non-truncated": 8273, + "padded": 0, + "non-padded": 9536, + "effective_few_shots": 3.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "bda342e47b5099b2", + "hash_cont_tokens": "182a4481db442175" + }, + "truncated": 0, + "non-truncated": 1319, + "padded": 0, + "non-padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "c0bedf98cb040854", + "hash_cont_tokens": "f08975ad6f2d5864" + }, + "truncated": 0, + "non-truncated": 2534, + "padded": 2432, + "non-padded": 102, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "9b4d8993161e637d", + "hash_full_prompts": "08215e527b7e60a5", + "hash_input_tokens": "af7926c4a4d6e10c", + "hash_cont_tokens": "a4ed15699136a5e8" + }, + "total_evaluation_time_secondes": "31295.868842601776", + "truncated": 1263, + "non-truncated": 12126, + "padded": 2432, + "non-padded": 10957, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/IDEA-CCNL/Ziya-LLaMA-13B-Pretrain-v1/results_2023-12-02T16-26-56.383238.json b/eval-results/IDEA-CCNL/Ziya-LLaMA-13B-Pretrain-v1/results_2023-12-02T16-26-56.383238.json new file mode 100644 index 0000000000000000000000000000000000000000..704c45f36c1e516c4bec7f73f3bc0b0b15cd1fd9 --- /dev/null +++ b/eval-results/IDEA-CCNL/Ziya-LLaMA-13B-Pretrain-v1/results_2023-12-02T16-26-56.383238.json @@ -0,0 +1,63 @@ +{ + "config_general": { + "lighteval_sha": "b35d4d84573be82d91c07ea46260f262f72cf69d", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "", + "start_time": 1418336.635651281, + "end_time": 1422719.533186423, + "total_evaluation_time_secondes": "4382.89753514179", + "model_name": "IDEA-CCNL/Ziya-LLaMA-13B-Pretrain-v1", + "model_sha": "826e83e411df32f358893ab21f5eae680499ae9a", + "model_dtype": "torch.float16", + "model_size": "24.42 GB" + }, + "results": { + "harness|gsm8k|5": { + "acc": 0.0, + "acc_stderr": 0.0 + }, + "all": { + "acc": 0.0, + "acc_stderr": 0.0 + } + }, + "versions": { + "all": 0, + "harness|gsm8k|5": 0 + 
}, + "config_tasks": { + "harness|gsm8k": "LM Harness task" + }, + "summary_tasks": { + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "bda342e47b5099b2", + "hash_cont_tokens": "182a4481db442175" + }, + "truncated": 0, + "non_truncated": 1319, + "padded": 0, + "non_padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "18b756b7813d1bdf", + "hash_full_prompts": "deb3b1dff10b95aa", + "hash_input_tokens": "42036645de5ac59d", + "hash_cont_tokens": "8b67ca37551bc775" + }, + "truncated": 0, + "non_truncated": 1319, + "padded": 0, + "non_padded": 1319, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/IDEA-CCNL/Ziya-LLaMA-13B-v1/results_2023-07-19T18-24-58.972667.json b/eval-results/IDEA-CCNL/Ziya-LLaMA-13B-v1/results_2023-07-19T18-24-58.972667.json new file mode 100644 index 0000000000000000000000000000000000000000..3d6d81152402cd2dd4adac012a6114d584f25b41 --- /dev/null +++ b/eval-results/IDEA-CCNL/Ziya-LLaMA-13B-v1/results_2023-07-19T18-24-58.972667.json @@ -0,0 +1,871 @@ +{ + "results": { + "harness|arc:challenge|25": { + "acc": 0.21416382252559726, + "acc_stderr": 0.0119883832059665, + "acc_norm": 0.2773037542662116, + "acc_norm_stderr": 0.013082095839059374 + }, + "harness|hellaswag|10": { + "acc": 0.2552280422226648, + "acc_stderr": 0.004350982826580602, + "acc_norm": 0.25960963951404104, + "acc_norm_stderr": 0.004375244237045139 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.21, + "acc_stderr": 0.040936018074033256, + "acc_norm": 0.21, + "acc_norm_stderr": 0.040936018074033256 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.22962962962962963, + "acc_stderr": 0.03633384414073461, + "acc_norm": 0.22962962962962963, + "acc_norm_stderr": 0.03633384414073461 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.3355263157894737, + "acc_stderr": 0.03842498559395268, + "acc_norm": 0.3355263157894737, + "acc_norm_stderr": 0.03842498559395268 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.21, + "acc_stderr": 0.040936018074033256, + "acc_norm": 0.21, + "acc_norm_stderr": 0.040936018074033256 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.2981132075471698, + "acc_stderr": 0.028152837942493857, + "acc_norm": 0.2981132075471698, + "acc_norm_stderr": 0.028152837942493857 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.2638888888888889, + "acc_stderr": 0.03685651095897532, + "acc_norm": 0.2638888888888889, + "acc_norm_stderr": 0.03685651095897532 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.41, + "acc_stderr": 0.049431107042371025, + "acc_norm": 0.41, + "acc_norm_stderr": 0.049431107042371025 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.33, + "acc_stderr": 0.04725815626252604, + "acc_norm": 0.33, + "acc_norm_stderr": 0.04725815626252604 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.31, + "acc_stderr": 0.04648231987117316, + "acc_norm": 0.31, + "acc_norm_stderr": 0.04648231987117316 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.3352601156069364, + "acc_stderr": 0.03599586301247078, + "acc_norm": 0.3352601156069364, + "acc_norm_stderr": 0.03599586301247078 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.37254901960784315, + "acc_stderr": 0.04810840148082633, + "acc_norm": 0.37254901960784315, + "acc_norm_stderr": 
0.04810840148082633 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.18, + "acc_stderr": 0.038612291966536955, + "acc_norm": 0.18, + "acc_norm_stderr": 0.038612291966536955 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.20851063829787234, + "acc_stderr": 0.026556982117838728, + "acc_norm": 0.20851063829787234, + "acc_norm_stderr": 0.026556982117838728 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.23684210526315788, + "acc_stderr": 0.039994238792813344, + "acc_norm": 0.23684210526315788, + "acc_norm_stderr": 0.039994238792813344 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.2413793103448276, + "acc_stderr": 0.03565998174135302, + "acc_norm": 0.2413793103448276, + "acc_norm_stderr": 0.03565998174135302 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.2671957671957672, + "acc_stderr": 0.022789673145776564, + "acc_norm": 0.2671957671957672, + "acc_norm_stderr": 0.022789673145776564 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.36507936507936506, + "acc_stderr": 0.04306241259127153, + "acc_norm": 0.36507936507936506, + "acc_norm_stderr": 0.04306241259127153 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.18, + "acc_stderr": 0.03861229196653694, + "acc_norm": 0.18, + "acc_norm_stderr": 0.03861229196653694 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.3161290322580645, + "acc_stderr": 0.02645087448904277, + "acc_norm": 0.3161290322580645, + "acc_norm_stderr": 0.02645087448904277 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.28078817733990147, + "acc_stderr": 0.03161856335358609, + "acc_norm": 0.28078817733990147, + "acc_norm_stderr": 0.03161856335358609 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.19, + "acc_stderr": 0.039427724440366234, + "acc_norm": 0.19, + "acc_norm_stderr": 0.039427724440366234 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.2545454545454545, + "acc_stderr": 0.03401506715249039, + "acc_norm": 0.2545454545454545, + "acc_norm_stderr": 0.03401506715249039 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.35353535353535354, + "acc_stderr": 0.03406086723547153, + "acc_norm": 0.35353535353535354, + "acc_norm_stderr": 0.03406086723547153 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.36787564766839376, + "acc_stderr": 0.03480175668466036, + "acc_norm": 0.36787564766839376, + "acc_norm_stderr": 0.03480175668466036 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.3641025641025641, + "acc_stderr": 0.02439667298509477, + "acc_norm": 0.3641025641025641, + "acc_norm_stderr": 0.02439667298509477 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.26296296296296295, + "acc_stderr": 0.026842057873833706, + "acc_norm": 0.26296296296296295, + "acc_norm_stderr": 0.026842057873833706 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.3487394957983193, + "acc_stderr": 0.03095663632856655, + "acc_norm": 0.3487394957983193, + "acc_norm_stderr": 0.03095663632856655 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.33112582781456956, + "acc_stderr": 0.038425817186598696, + "acc_norm": 0.33112582781456956, + "acc_norm_stderr": 0.038425817186598696 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.3486238532110092, + "acc_stderr": 0.020431254090714328, + "acc_norm": 0.3486238532110092, + "acc_norm_stderr": 0.020431254090714328 
+ }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.4722222222222222, + "acc_stderr": 0.0340470532865388, + "acc_norm": 0.4722222222222222, + "acc_norm_stderr": 0.0340470532865388 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.2549019607843137, + "acc_stderr": 0.030587591351604246, + "acc_norm": 0.2549019607843137, + "acc_norm_stderr": 0.030587591351604246 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.20253164556962025, + "acc_stderr": 0.026160568246601457, + "acc_norm": 0.20253164556962025, + "acc_norm_stderr": 0.026160568246601457 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.10762331838565023, + "acc_stderr": 0.020799400082879997, + "acc_norm": 0.10762331838565023, + "acc_norm_stderr": 0.020799400082879997 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.2824427480916031, + "acc_stderr": 0.03948406125768361, + "acc_norm": 0.2824427480916031, + "acc_norm_stderr": 0.03948406125768361 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.14049586776859505, + "acc_stderr": 0.03172233426002161, + "acc_norm": 0.14049586776859505, + "acc_norm_stderr": 0.03172233426002161 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.21296296296296297, + "acc_stderr": 0.0395783547198098, + "acc_norm": 0.21296296296296297, + "acc_norm_stderr": 0.0395783547198098 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.2331288343558282, + "acc_stderr": 0.033220157957767414, + "acc_norm": 0.2331288343558282, + "acc_norm_stderr": 0.033220157957767414 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.16071428571428573, + "acc_stderr": 0.03485946096475741, + "acc_norm": 0.16071428571428573, + "acc_norm_stderr": 0.03485946096475741 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.3786407766990291, + "acc_stderr": 0.04802694698258972, + "acc_norm": 0.3786407766990291, + "acc_norm_stderr": 0.04802694698258972 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.19658119658119658, + "acc_stderr": 0.02603538609895129, + "acc_norm": 0.19658119658119658, + "acc_norm_stderr": 0.02603538609895129 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.24, + "acc_stderr": 0.04292346959909281, + "acc_norm": 0.24, + "acc_norm_stderr": 0.04292346959909281 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.20434227330779056, + "acc_stderr": 0.0144191239809319, + "acc_norm": 0.20434227330779056, + "acc_norm_stderr": 0.0144191239809319 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.2138728323699422, + "acc_stderr": 0.022075709251757183, + "acc_norm": 0.2138728323699422, + "acc_norm_stderr": 0.022075709251757183 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.27262569832402234, + "acc_stderr": 0.014893391735249588, + "acc_norm": 0.27262569832402234, + "acc_norm_stderr": 0.014893391735249588 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.29411764705882354, + "acc_stderr": 0.02609016250427905, + "acc_norm": 0.29411764705882354, + "acc_norm_stderr": 0.02609016250427905 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.24115755627009647, + "acc_stderr": 0.024296594034763426, + "acc_norm": 0.24115755627009647, + "acc_norm_stderr": 0.024296594034763426 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.22530864197530864, + "acc_stderr": 0.023246202647819746, + "acc_norm": 0.22530864197530864, + "acc_norm_stderr": 0.023246202647819746 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 
0.24113475177304963, + "acc_stderr": 0.025518731049537762, + "acc_norm": 0.24113475177304963, + "acc_norm_stderr": 0.025518731049537762 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.24445893089960888, + "acc_stderr": 0.010976425013113886, + "acc_norm": 0.24445893089960888, + "acc_norm_stderr": 0.010976425013113886 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.4485294117647059, + "acc_stderr": 0.030211479609121593, + "acc_norm": 0.4485294117647059, + "acc_norm_stderr": 0.030211479609121593 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.2173202614379085, + "acc_stderr": 0.01668482092914859, + "acc_norm": 0.2173202614379085, + "acc_norm_stderr": 0.01668482092914859 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.22727272727272727, + "acc_stderr": 0.04013964554072774, + "acc_norm": 0.22727272727272727, + "acc_norm_stderr": 0.04013964554072774 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.4, + "acc_stderr": 0.031362502409358936, + "acc_norm": 0.4, + "acc_norm_stderr": 0.031362502409358936 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.26865671641791045, + "acc_stderr": 0.03134328358208954, + "acc_norm": 0.26865671641791045, + "acc_norm_stderr": 0.03134328358208954 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.26, + "acc_stderr": 0.04408440022768078, + "acc_norm": 0.26, + "acc_norm_stderr": 0.04408440022768078 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.1927710843373494, + "acc_stderr": 0.030709824050565274, + "acc_norm": 0.1927710843373494, + "acc_norm_stderr": 0.030709824050565274 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.17543859649122806, + "acc_stderr": 0.029170885500727654, + "acc_norm": 0.17543859649122806, + "acc_norm_stderr": 0.029170885500727654 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.2778457772337821, + "mc1_stderr": 0.015680929364024654, + "mc2": 0.4864939735546677, + "mc2_stderr": 0.015749143910650822 + }, + "all": { + "acc": 0.2691702973910435, + "acc_stderr": 0.0317735349068451, + "acc_norm": 0.2703147300865009, + "acc_norm_stderr": 0.03179248361944777, + "mc1": 0.2778457772337821, + "mc1_stderr": 0.015680929364024654, + "mc2": 0.4864939735546677, + "mc2_stderr": 0.015749143910650822 + } + }, + "versions": { + "harness|arc:challenge|25": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + 
"harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "all": 0 + }, + "config": { + "model_name": "IDEA-CCNL/Ziya-LLaMA-13B-v1", + "model_sha": "fccf34387d2c9f2f95ff59ae380e6de3718e41ff", + "model_dtype": "torch.float16", + "lighteval_sha": "43cff840721bd0214adb4e29236a5e2ca1813937", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null + }, + "task_config": { + "harness|arc:challenge": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + 
"harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task" + }, + "hashes": { + "harness|arc:challenge|25": { + "hash_examples": "fb8c51b1872daeda", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "61571bf68d6d89aa", + "hash_cont_tokens": "8210decc6ff6f7df" + }, + "harness|hellaswag|10": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "29906669b1c7054a", + "hash_cont_tokens": "b3b9e9017afa63af" + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "c54ff61ad0273dd7", + "hash_cont_tokens": "50421e30bef398f9" + }, + "harness|hendrycksTest-anatomy|5": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "be31a1e22aef5f90", + "hash_cont_tokens": "f11971a765cb609f" + }, + "harness|hendrycksTest-astronomy|5": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "277a7b1fad566940", + "hash_cont_tokens": "bf30e5d3f48250cb" + }, + "harness|hendrycksTest-business_ethics|5": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", 
+ "hash_input_tokens": "ba552605bc116de5", + "hash_cont_tokens": "bc1dd9b2d995eb61" + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "428c7563d0b98ab9", + "hash_cont_tokens": "890a119624b3b935" + }, + "harness|hendrycksTest-college_biology|5": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "da036601573942e2", + "hash_cont_tokens": "875cde3af7a0ee14" + }, + "harness|hendrycksTest-college_chemistry|5": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "94e0196d6aded13d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "harness|hendrycksTest-college_computer_science|5": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "6e4d0f4a8d36690b", + "hash_cont_tokens": "ffc0fe414cdc4a83" + }, + "harness|hendrycksTest-college_mathematics|5": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "614054d17109a25d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "harness|hendrycksTest-college_medicine|5": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "1d633b3cc0524ba8", + "hash_cont_tokens": "1f88b00d41957d82" + }, + "harness|hendrycksTest-college_physics|5": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "5421d9a1af86cbd4", + "hash_cont_tokens": "f7b8097afc16a47c" + }, + "harness|hendrycksTest-computer_security|5": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "5e6b70ecb333cf18", + "hash_cont_tokens": "50421e30bef398f9" + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "c2ef11a87264ceed", + "hash_cont_tokens": "aa0e8bc655f2f641" + }, + "harness|hendrycksTest-econometrics|5": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "ecaccd912a4c3978", + "hash_cont_tokens": "bfb7e3c3c88313f1" + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "1590c84291399be8", + "hash_cont_tokens": "2425a3f084a591ef" + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "3269597f715b0da1", + "hash_cont_tokens": "f52691aef15a407b" + }, + "harness|hendrycksTest-formal_logic|5": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "a2800d20f3ab8d7c", + "hash_cont_tokens": "f515d598d9c21263" + }, + "harness|hendrycksTest-global_facts|5": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "94ed44b3772505ad", + "hash_cont_tokens": "50421e30bef398f9" + }, + "harness|hendrycksTest-high_school_biology|5": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "24423acb928db768", + "hash_cont_tokens": "bd85a4156a3613ee" + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": 
"831ff35c474e5cef", + "hash_cont_tokens": "a95c97af1c14e068" + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "8c34e0f2bda77358", + "hash_cont_tokens": "8abfedef914e33c9" + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "f1f73dd687da18d7", + "hash_cont_tokens": "674fc454bdc5ac93" + }, + "harness|hendrycksTest-high_school_geography|5": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "7c5547c7da5bc793", + "hash_cont_tokens": "03a5012b916274ea" + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "f62991cb6a496b05", + "hash_cont_tokens": "a83effb8f76b7d7c" + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "4cef2aff6e3d59ed", + "hash_cont_tokens": "c583432ad27fcfe0" + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "6e2577ea4082ed2b", + "hash_cont_tokens": "24f5dc613660300b" + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "c5fc9aeb1079c8e4", + "hash_cont_tokens": "f47f041de50333b9" + }, + "harness|hendrycksTest-high_school_physics|5": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "555fc385cffa84ca", + "hash_cont_tokens": "ba2efcd283e938cc" + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "febd23cbf9973b7f", + "hash_cont_tokens": "942069cd363844d9" + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "424b02981230ee83", + "hash_cont_tokens": "955ed42b6f7fa019" + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "50c9ff438c85a69e", + "hash_cont_tokens": "cdd0b3dc06d933e5" + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "054824cc474caef5", + "hash_cont_tokens": "9a864184946033ac" + }, + "harness|hendrycksTest-human_aging|5": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "541a75f071dcf579", + "hash_cont_tokens": "142a4a8a1138a214" + }, + "harness|hendrycksTest-human_sexuality|5": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "04269e5c5a257dd9", + "hash_cont_tokens": "bc54813e809b796d" + }, + "harness|hendrycksTest-international_law|5": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "d93ba9d9d38e4397", + "hash_cont_tokens": "dc45b45fcda18e5d" + }, + "harness|hendrycksTest-jurisprudence|5": { + "hash_examples": "083b1e4904c48dc2", + 
"hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "9eeaccd2698b4f5a", + "hash_cont_tokens": "e3a8cd951b6e3469" + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "b4f08f544f2b7576", + "hash_cont_tokens": "1e80dbd30f6453d5" + }, + "harness|hendrycksTest-machine_learning|5": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "900c2a51f1174b9f", + "hash_cont_tokens": "9b37da7777378ca9" + }, + "harness|hendrycksTest-management|5": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "6b36efb4689c6eca", + "hash_cont_tokens": "a01d6d39a83c4597" + }, + "harness|hendrycksTest-marketing|5": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "2aaac78a0cfed47a", + "hash_cont_tokens": "6aeaed4d823c98aa" + }, + "harness|hendrycksTest-medical_genetics|5": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "886ca823b41c094a", + "hash_cont_tokens": "50421e30bef398f9" + }, + "harness|hendrycksTest-miscellaneous|5": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "72fd71de7675e7d0", + "hash_cont_tokens": "9b0ab02a64603081" + }, + "harness|hendrycksTest-moral_disputes|5": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "f3ca0dd8e7a1eb09", + "hash_cont_tokens": "8badf768f7b0467a" + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "3e793631e951f23c", + "hash_cont_tokens": "32ae620376b2bbba" + }, + "harness|hendrycksTest-nutrition|5": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "59753c2144ea93af", + "hash_cont_tokens": "3071def75bacc404" + }, + "harness|hendrycksTest-philosophy|5": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "bd8d3dbed15a8c34", + "hash_cont_tokens": "9f6ff69d23a48783" + }, + "harness|hendrycksTest-prehistory|5": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "3573cd87facbb7c5", + "hash_cont_tokens": "de469d2b981e32a3" + }, + "harness|hendrycksTest-professional_accounting|5": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "17e721bc1a7cbb47", + "hash_cont_tokens": "c46f74d2dfc7b13b" + }, + "harness|hendrycksTest-professional_law|5": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "9178e10bd0763ec4", + "hash_cont_tokens": "2e590029ef41fbcd" + }, + "harness|hendrycksTest-professional_medicine|5": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "f5a22012a54f70ea", + "hash_cont_tokens": "fe35cfa9c6ca802e" + }, + "harness|hendrycksTest-professional_psychology|5": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "0dfb73a8eb3f692c", + "hash_cont_tokens": "f020fbddf72c8652" + }, + "harness|hendrycksTest-public_relations|5": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": 
"1710c6ba4c9f3cbd", + "hash_cont_tokens": "568f585a259965c1" + }, + "harness|hendrycksTest-security_studies|5": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "d49711415961ced7", + "hash_cont_tokens": "cc6fd7cccd64cd5d" + }, + "harness|hendrycksTest-sociology|5": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "828999f7624cbe7e", + "hash_cont_tokens": "c3a3bdfd177eed5b" + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "42054621e718dbee", + "hash_cont_tokens": "2568d0e8e36fa959" + }, + "harness|hendrycksTest-virology|5": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "6c4f0aa4dc859c04", + "hash_cont_tokens": "926cf60b0891f374" + }, + "harness|hendrycksTest-world_religions|5": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "6c75d44e092ff24f", + "hash_cont_tokens": "c525a5de974c1ea3" + }, + "harness|truthfulqa:mc|0": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "2738d7ed7075faa7", + "hash_cont_tokens": "c014154380b74b9e" + } + } +} \ No newline at end of file diff --git a/eval-results/IDEA-CCNL/Ziya-LLaMA-13B-v1/results_2023-09-18T04-43-18.868497.json b/eval-results/IDEA-CCNL/Ziya-LLaMA-13B-v1/results_2023-09-18T04-43-18.868497.json new file mode 100644 index 0000000000000000000000000000000000000000..aa0f2c62dcd7dce20277214552e09a81441b1cb1 --- /dev/null +++ b/eval-results/IDEA-CCNL/Ziya-LLaMA-13B-v1/results_2023-09-18T04-43-18.868497.json @@ -0,0 +1,107 @@ +{ + "config_general": { + "model_name": "IDEA-CCNL/Ziya-LLaMA-13B-v1", + "model_sha": "64d931f346e1a49ea3bbca07a83137075bab1c66", + "model_size": "24.42 GB", + "model_dtype": "torch.float16", + "lighteval_sha": "0f318ecf002208468154899217b3ba7c6ae09374", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|drop|3": { + "em": 0.0, + "em_stderr": 0.0, + "f1": 3.145973154362416e-06, + "f1_stderr": 3.145973154362522e-06 + }, + "harness|gsm8k|5": { + "acc": 0.0, + "acc_stderr": 0.0 + }, + "harness|winogrande|5": { + "acc": 0.4956590370955012, + "acc_stderr": 0.014051956064076892 + }, + "all": { + "em": 0.0, + "em_stderr": 0.0, + "f1": 3.145973154362416e-06, + "f1_stderr": 3.145973154362522e-06, + "acc": 0.2478295185477506, + "acc_stderr": 0.007025978032038446 + } + }, + "versions": { + "harness|drop|3": 1, + "harness|gsm8k|5": 0, + "harness|winogrande|5": 0, + "all": 0 + }, + "config_tasks": { + "harness|drop": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|drop|3": { + "hashes": { + "hash_examples": "1d27416e8324e9a3", + "hash_full_prompts": "a5513ff9a741b385", + "hash_input_tokens": "db6b0c12edc29d2e", + "hash_cont_tokens": "18f159398ab5ee27" + }, + "truncated": 1263, + "non-truncated": 8273, + "padded": 0, + "non-padded": 9536, + "effective_few_shots": 3.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "bda342e47b5099b2", + "hash_cont_tokens": "68054e93505b7d63" + }, + "truncated": 0, + "non-truncated": 1319, + "padded": 0, 
+ "non-padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "c0bedf98cb040854", + "hash_cont_tokens": "f08975ad6f2d5864" + }, + "truncated": 0, + "non-truncated": 2534, + "padded": 2432, + "non-padded": 102, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "9b4d8993161e637d", + "hash_full_prompts": "08215e527b7e60a5", + "hash_input_tokens": "af7926c4a4d6e10c", + "hash_cont_tokens": "873966ef88715147" + }, + "total_evaluation_time_secondes": "33522.18334698677", + "truncated": 1263, + "non-truncated": 12126, + "padded": 2432, + "non-padded": 10957, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/Intel/neural-chat-7b-v3-1/results_2023-11-18T15-19-14.739909.json b/eval-results/Intel/neural-chat-7b-v3-1/results_2023-11-18T15-19-14.739909.json new file mode 100644 index 0000000000000000000000000000000000000000..437dcbbe8a052f953d904cb0e9a5a8cc3c13ce30 --- /dev/null +++ b/eval-results/Intel/neural-chat-7b-v3-1/results_2023-11-18T15-19-14.739909.json @@ -0,0 +1,1435 @@ +{ + "config_general": { + "lighteval_sha": "9ffc410f6c40b8cfefe7167cb47aefe69ced61e1", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "", + "start_time": 145237.069553293, + "end_time": 155718.702369908, + "total_evaluation_time_secondes": "10481.63281661499", + "model_name": "Intel/neural-chat-7b-v3-1", + "model_sha": "3995e9a13d54ce95f0ad55de2eaa92e2dc580174", + "model_dtype": "4bit", + "model_size": "4.24 GB" + }, + "results": { + "harness|arc:challenge|25": { + "acc": 0.6100682593856656, + "acc_stderr": 0.014252959848892893, + "acc_norm": 0.6424914675767918, + "acc_norm_stderr": 0.014005494275916576 + }, + "harness|hellaswag|10": { + "acc": 0.6345349531965744, + "acc_stderr": 0.004805761513803412, + "acc_norm": 0.8249352718581956, + "acc_norm_stderr": 0.0037924580005234305 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.32, + "acc_stderr": 0.046882617226215034, + "acc_norm": 0.32, + "acc_norm_stderr": 0.046882617226215034 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.6148148148148148, + "acc_stderr": 0.04203921040156279, + "acc_norm": 0.6148148148148148, + "acc_norm_stderr": 0.04203921040156279 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.6710526315789473, + "acc_stderr": 0.038234289699266046, + "acc_norm": 0.6710526315789473, + "acc_norm_stderr": 0.038234289699266046 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.53, + "acc_stderr": 0.05016135580465919, + "acc_norm": 0.53, + "acc_norm_stderr": 0.05016135580465919 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.6754716981132075, + "acc_stderr": 0.02881561571343211, + "acc_norm": 0.6754716981132075, + "acc_norm_stderr": 0.02881561571343211 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.7013888888888888, + "acc_stderr": 0.03827052357950756, + "acc_norm": 0.7013888888888888, + "acc_norm_stderr": 0.03827052357950756 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.43, + "acc_stderr": 0.04975698519562427, + "acc_norm": 0.43, + "acc_norm_stderr": 0.04975698519562427 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.51, + "acc_stderr": 0.05024183937956912, + "acc_norm": 0.51, + "acc_norm_stderr": 
0.05024183937956912 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.35, + "acc_stderr": 0.047937248544110196, + "acc_norm": 0.35, + "acc_norm_stderr": 0.047937248544110196 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.6069364161849711, + "acc_stderr": 0.03724249595817731, + "acc_norm": 0.6069364161849711, + "acc_norm_stderr": 0.03724249595817731 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.3627450980392157, + "acc_stderr": 0.04784060704105654, + "acc_norm": 0.3627450980392157, + "acc_norm_stderr": 0.04784060704105654 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.72, + "acc_stderr": 0.045126085985421296, + "acc_norm": 0.72, + "acc_norm_stderr": 0.045126085985421296 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.5319148936170213, + "acc_stderr": 0.03261936918467381, + "acc_norm": 0.5319148936170213, + "acc_norm_stderr": 0.03261936918467381 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.43859649122807015, + "acc_stderr": 0.04668000738510455, + "acc_norm": 0.43859649122807015, + "acc_norm_stderr": 0.04668000738510455 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.5172413793103449, + "acc_stderr": 0.04164188720169375, + "acc_norm": 0.5172413793103449, + "acc_norm_stderr": 0.04164188720169375 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.35978835978835977, + "acc_stderr": 0.024718075944129277, + "acc_norm": 0.35978835978835977, + "acc_norm_stderr": 0.024718075944129277 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.42857142857142855, + "acc_stderr": 0.0442626668137991, + "acc_norm": 0.42857142857142855, + "acc_norm_stderr": 0.0442626668137991 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.27, + "acc_stderr": 0.044619604333847394, + "acc_norm": 0.27, + "acc_norm_stderr": 0.044619604333847394 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.7354838709677419, + "acc_stderr": 0.02509189237885928, + "acc_norm": 0.7354838709677419, + "acc_norm_stderr": 0.02509189237885928 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.49261083743842365, + "acc_stderr": 0.035176035403610084, + "acc_norm": 0.49261083743842365, + "acc_norm_stderr": 0.035176035403610084 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.65, + "acc_stderr": 0.047937248544110196, + "acc_norm": 0.65, + "acc_norm_stderr": 0.047937248544110196 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.7696969696969697, + "acc_stderr": 0.0328766675860349, + "acc_norm": 0.7696969696969697, + "acc_norm_stderr": 0.0328766675860349 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.7727272727272727, + "acc_stderr": 0.02985751567338642, + "acc_norm": 0.7727272727272727, + "acc_norm_stderr": 0.02985751567338642 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.8549222797927462, + "acc_stderr": 0.025416343096306433, + "acc_norm": 0.8549222797927462, + "acc_norm_stderr": 0.025416343096306433 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.6153846153846154, + "acc_stderr": 0.02466674491518721, + "acc_norm": 0.6153846153846154, + "acc_norm_stderr": 0.02466674491518721 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.3037037037037037, + "acc_stderr": 0.028037929969114993, + "acc_norm": 0.3037037037037037, + "acc_norm_stderr": 0.028037929969114993 + }, + 
"harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.6554621848739496, + "acc_stderr": 0.030868682604121622, + "acc_norm": 0.6554621848739496, + "acc_norm_stderr": 0.030868682604121622 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.36423841059602646, + "acc_stderr": 0.03929111781242741, + "acc_norm": 0.36423841059602646, + "acc_norm_stderr": 0.03929111781242741 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.8238532110091743, + "acc_stderr": 0.016332882393431353, + "acc_norm": 0.8238532110091743, + "acc_norm_stderr": 0.016332882393431353 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.49074074074074076, + "acc_stderr": 0.034093869469927006, + "acc_norm": 0.49074074074074076, + "acc_norm_stderr": 0.034093869469927006 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.7647058823529411, + "acc_stderr": 0.029771775228145628, + "acc_norm": 0.7647058823529411, + "acc_norm_stderr": 0.029771775228145628 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.7637130801687764, + "acc_stderr": 0.02765215314415927, + "acc_norm": 0.7637130801687764, + "acc_norm_stderr": 0.02765215314415927 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.672645739910314, + "acc_stderr": 0.031493846709941306, + "acc_norm": 0.672645739910314, + "acc_norm_stderr": 0.031493846709941306 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.6946564885496184, + "acc_stderr": 0.04039314978724561, + "acc_norm": 0.6946564885496184, + "acc_norm_stderr": 0.04039314978724561 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.7768595041322314, + "acc_stderr": 0.03800754475228732, + "acc_norm": 0.7768595041322314, + "acc_norm_stderr": 0.03800754475228732 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.7129629629629629, + "acc_stderr": 0.043733130409147614, + "acc_norm": 0.7129629629629629, + "acc_norm_stderr": 0.043733130409147614 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.7177914110429447, + "acc_stderr": 0.03536117886664742, + "acc_norm": 0.7177914110429447, + "acc_norm_stderr": 0.03536117886664742 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.45535714285714285, + "acc_stderr": 0.047268355537191, + "acc_norm": 0.45535714285714285, + "acc_norm_stderr": 0.047268355537191 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.8155339805825242, + "acc_stderr": 0.03840423627288276, + "acc_norm": 0.8155339805825242, + "acc_norm_stderr": 0.03840423627288276 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.8547008547008547, + "acc_stderr": 0.023086635086841407, + "acc_norm": 0.8547008547008547, + "acc_norm_stderr": 0.023086635086841407 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.73, + "acc_stderr": 0.0446196043338474, + "acc_norm": 0.73, + "acc_norm_stderr": 0.0446196043338474 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.8033205619412516, + "acc_stderr": 0.01421413855691392, + "acc_norm": 0.8033205619412516, + "acc_norm_stderr": 0.01421413855691392 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.6445086705202312, + "acc_stderr": 0.025770292082977254, + "acc_norm": 0.6445086705202312, + "acc_norm_stderr": 0.025770292082977254 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.40893854748603353, + "acc_stderr": 0.01644283065471554, + "acc_norm": 0.40893854748603353, + "acc_norm_stderr": 0.01644283065471554 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 
0.6699346405228758, + "acc_stderr": 0.026925654653615697, + "acc_norm": 0.6699346405228758, + "acc_norm_stderr": 0.026925654653615697 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.6495176848874598, + "acc_stderr": 0.027098652621301757, + "acc_norm": 0.6495176848874598, + "acc_norm_stderr": 0.027098652621301757 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.6851851851851852, + "acc_stderr": 0.025842248700902168, + "acc_norm": 0.6851851851851852, + "acc_norm_stderr": 0.025842248700902168 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.40425531914893614, + "acc_stderr": 0.029275532159704725, + "acc_norm": 0.40425531914893614, + "acc_norm_stderr": 0.029275532159704725 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.4380704041720991, + "acc_stderr": 0.012671902782567645, + "acc_norm": 0.4380704041720991, + "acc_norm_stderr": 0.012671902782567645 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.5772058823529411, + "acc_stderr": 0.03000856284500348, + "acc_norm": 0.5772058823529411, + "acc_norm_stderr": 0.03000856284500348 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.6323529411764706, + "acc_stderr": 0.019506291693954843, + "acc_norm": 0.6323529411764706, + "acc_norm_stderr": 0.019506291693954843 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.6181818181818182, + "acc_stderr": 0.046534298079135075, + "acc_norm": 0.6181818181818182, + "acc_norm_stderr": 0.046534298079135075 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.673469387755102, + "acc_stderr": 0.030021056238440303, + "acc_norm": 0.673469387755102, + "acc_norm_stderr": 0.030021056238440303 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.8109452736318408, + "acc_stderr": 0.027686913588013024, + "acc_norm": 0.8109452736318408, + "acc_norm_stderr": 0.027686913588013024 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.79, + "acc_stderr": 0.040936018074033256, + "acc_norm": 0.79, + "acc_norm_stderr": 0.040936018074033256 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.5120481927710844, + "acc_stderr": 0.03891364495835817, + "acc_norm": 0.5120481927710844, + "acc_norm_stderr": 0.03891364495835817 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.8070175438596491, + "acc_stderr": 0.030267457554898458, + "acc_norm": 0.8070175438596491, + "acc_norm_stderr": 0.030267457554898458 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.408812729498164, + "mc1_stderr": 0.01720995215164173, + "mc2": 0.5639976168180671, + "mc2_stderr": 0.015520174282786663 + }, + "harness|winogrande|5": { + "acc": 0.7734806629834254, + "acc_stderr": 0.011764149054698332 + }, + "harness|drop|3": { + "em": 0.31658976510067116, + "em_stderr": 0.004763529136106869, + "f1": 0.43081690436241865, + "f1_stderr": 0.004535720326298335 + }, + "harness|gsm8k|5": { + "acc": 0.18119787717968158, + "acc_stderr": 0.010609827611527352 + }, + "all": { + "acc": 0.604106672961726, + "acc_stderr": 0.032952085551478036, + "acc_norm": 0.6121805430060155, + "acc_norm_stderr": 0.033668516489621665, + "mc1": 0.408812729498164, + "mc1_stderr": 0.01720995215164173, + "mc2": 0.5639976168180671, + "mc2_stderr": 0.015520174282786663, + "em": 0.31658976510067116, + "em_stderr": 0.004763529136106869, + "f1": 0.43081690436241865, + "f1_stderr": 0.004535720326298335 + } + }, + "versions": { + "all": 0, + "harness|arc:challenge|25": 0, + "harness|drop|3": 1, + "harness|gsm8k|5": 0, + "harness|hellaswag|10": 0, + 
"harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "harness|winogrande|5": 0 + }, + "config_tasks": { + "harness|arc:challenge": "LM Harness task", + "harness|drop": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + 
"harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|arc:challenge|25": { + "hashes": { + "hash_examples": "17b0cae357c0259e", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "9bcd0d1d37471713", + "hash_cont_tokens": 
"289aa98c400841d8" + }, + "truncated": 0, + "non_truncated": 1172, + "padded": 4670, + "non_padded": 17, + "effective_few_shots": 25.0, + "num_truncated_few_shots": 0 + }, + "harness|hellaswag|10": { + "hashes": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "80b8c6d79740318e", + "hash_cont_tokens": "ac460260c3e6efc9" + }, + "truncated": 0, + "non_truncated": 10042, + "padded": 40101, + "non_padded": 67, + "effective_few_shots": 10.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hashes": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "b813d36287c6556c", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-anatomy|5": { + "hashes": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "09dc2380497f7a47", + "hash_cont_tokens": "a52a4f60d98cbe5c" + }, + "truncated": 0, + "non_truncated": 135, + "padded": 540, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-astronomy|5": { + "hashes": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "68ca3220b0fdd1f3", + "hash_cont_tokens": "10f7d8eeba97841d" + }, + "truncated": 0, + "non_truncated": 152, + "padded": 608, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-business_ethics|5": { + "hashes": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "bd14ef1320de241e", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hashes": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "d96186ab98017c43", + "hash_cont_tokens": "edef9975ba9165b5" + }, + "truncated": 0, + "non_truncated": 265, + "padded": 1060, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_biology|5": { + "hashes": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "424136b34e95b200", + "hash_cont_tokens": "0aa103ec6602280b" + }, + "truncated": 0, + "non_truncated": 144, + "padded": 576, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_chemistry|5": { + "hashes": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "8dd8b80e336bbe54", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_computer_science|5": { + "hashes": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "145d4cef8ca2261d", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + 
"harness|hendrycksTest-college_mathematics|5": { + "hashes": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "561995d32d2b25c4", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_medicine|5": { + "hashes": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "6a258a9d4418599c", + "hash_cont_tokens": "1979021dbc698754" + }, + "truncated": 0, + "non_truncated": 173, + "padded": 692, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_physics|5": { + "hashes": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "fa5e0d5b5f97b66a", + "hash_cont_tokens": "7cf7fe2bab00acbd" + }, + "truncated": 0, + "non_truncated": 102, + "padded": 408, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-computer_security|5": { + "hashes": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "07d27397edfae492", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hashes": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "da5e6c3c8eb17da6", + "hash_cont_tokens": "903f64eed2b0d217" + }, + "truncated": 0, + "non_truncated": 235, + "padded": 940, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-econometrics|5": { + "hashes": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "f6ba8e358bdb523e", + "hash_cont_tokens": "721ae6c5302c4bf2" + }, + "truncated": 0, + "non_truncated": 114, + "padded": 456, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hashes": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "b2459da4c5ca8590", + "hash_cont_tokens": "15a738960ed3e587" + }, + "truncated": 0, + "non_truncated": 145, + "padded": 575, + "non_padded": 5, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hashes": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "0b969d9ad706a13a", + "hash_cont_tokens": "c96470462fc71683" + }, + "truncated": 0, + "non_truncated": 378, + "padded": 1512, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-formal_logic|5": { + "hashes": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "02bc3eb5f90da86e", + "hash_cont_tokens": "0e1ce025c9d6ee7e" + }, + "truncated": 0, + "non_truncated": 126, + "padded": 504, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-global_facts|5": { + "hashes": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + 
"hash_input_tokens": "3d5106918bcbeb43", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_biology|5": { + "hashes": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "7b089392db2dabbd", + "hash_cont_tokens": "e34d57f7d3c4ca16" + }, + "truncated": 0, + "non_truncated": 310, + "padded": 1240, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hashes": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "ba90b2ffed1c067d", + "hash_cont_tokens": "e8482d44df4b3740" + }, + "truncated": 0, + "non_truncated": 203, + "padded": 812, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hashes": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "60eeec309ef0717f", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hashes": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "5e5e8bf3808e0ead", + "hash_cont_tokens": "d63e679a49418339" + }, + "truncated": 0, + "non_truncated": 165, + "padded": 656, + "non_padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_geography|5": { + "hashes": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "4da9b741d4e7ea78", + "hash_cont_tokens": "d78483e286d06f1a" + }, + "truncated": 0, + "non_truncated": 198, + "padded": 792, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hashes": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "acb4bc872ac86ed7", + "hash_cont_tokens": "691cdff71ff5fe57" + }, + "truncated": 0, + "non_truncated": 193, + "padded": 772, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hashes": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "840fc6403eb69ab0", + "hash_cont_tokens": "d5ad4c5bdca967ad" + }, + "truncated": 0, + "non_truncated": 390, + "padded": 1560, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hashes": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "3629a7f2cd17faeb", + "hash_cont_tokens": "8f631ca5687dd0d4" + }, + "truncated": 0, + "non_truncated": 270, + "padded": 1080, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hashes": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "6846f684260e3997", + "hash_cont_tokens": 
"7321048a28451473" + }, + "truncated": 0, + "non_truncated": 238, + "padded": 952, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_physics|5": { + "hashes": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "85aee25d6bdad94a", + "hash_cont_tokens": "bb137581f269861c" + }, + "truncated": 0, + "non_truncated": 151, + "padded": 604, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hashes": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "290b66d6d666a35f", + "hash_cont_tokens": "b455cab2675bd863" + }, + "truncated": 0, + "non_truncated": 545, + "padded": 2180, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hashes": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "a77a7668b437bc82", + "hash_cont_tokens": "1b3196fec7e58037" + }, + "truncated": 0, + "non_truncated": 216, + "padded": 864, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hashes": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "63548c7fa9ba7a78", + "hash_cont_tokens": "a331dedc2aa01b3e" + }, + "truncated": 0, + "non_truncated": 204, + "padded": 816, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hashes": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "83c5da18bfa50812", + "hash_cont_tokens": "d0fbe030b8c8c2bf" + }, + "truncated": 0, + "non_truncated": 237, + "padded": 948, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_aging|5": { + "hashes": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "bebbd11f22006685", + "hash_cont_tokens": "1dd29c3755494850" + }, + "truncated": 0, + "non_truncated": 223, + "padded": 892, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_sexuality|5": { + "hashes": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "7b85ee9b8ee54f4f", + "hash_cont_tokens": "c85573f663c10691" + }, + "truncated": 0, + "non_truncated": 131, + "padded": 524, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-international_law|5": { + "hashes": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "7bfc55ab7065943e", + "hash_cont_tokens": "d263804ba918154f" + }, + "truncated": 0, + "non_truncated": 121, + "padded": 484, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-jurisprudence|5": { + "hashes": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "69573f1675e053c6", + "hash_cont_tokens": "581986691a84ece8" + }, + "truncated": 0, + "non_truncated": 108, + "padded": 432, + "non_padded": 0, + "effective_few_shots": 5.0, + 
"num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hashes": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "552324ef20094bdc", + "hash_cont_tokens": "55a858b28bbda458" + }, + "truncated": 0, + "non_truncated": 163, + "padded": 652, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-machine_learning|5": { + "hashes": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "96449357a7318905", + "hash_cont_tokens": "e99d3d3efd4ac7a3" + }, + "truncated": 0, + "non_truncated": 112, + "padded": 448, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-management|5": { + "hashes": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "3b849249168e3b88", + "hash_cont_tokens": "13d9dc56bca34726" + }, + "truncated": 0, + "non_truncated": 103, + "padded": 412, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-marketing|5": { + "hashes": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "af0e186f2756b70d", + "hash_cont_tokens": "2700ea26933916a2" + }, + "truncated": 0, + "non_truncated": 234, + "padded": 936, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-medical_genetics|5": { + "hashes": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "9f6a6de16509b6d9", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-miscellaneous|5": { + "hashes": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "9194406d589f7c10", + "hash_cont_tokens": "7bf4341c79587250" + }, + "truncated": 0, + "non_truncated": 783, + "padded": 3132, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_disputes|5": { + "hashes": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "769486efc74d9f8e", + "hash_cont_tokens": "38a48e9de6976f00" + }, + "truncated": 0, + "non_truncated": 346, + "padded": 1384, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hashes": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "a90fd4dd90959dad", + "hash_cont_tokens": "761c4dc187689d89" + }, + "truncated": 0, + "non_truncated": 895, + "padded": 3580, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-nutrition|5": { + "hashes": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "1a3b843e66efd29b", + "hash_cont_tokens": "65005bd7d6f6012a" + }, + "truncated": 0, + "non_truncated": 306, + "padded": 1224, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-philosophy|5": { + "hashes": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + 
"hash_input_tokens": "09820001a3d00013", + "hash_cont_tokens": "0b47934fb6314dec" + }, + "truncated": 0, + "non_truncated": 311, + "padded": 1244, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-prehistory|5": { + "hashes": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "7c4ec364ce2768c7", + "hash_cont_tokens": "3f20acd855ee0a29" + }, + "truncated": 0, + "non_truncated": 324, + "padded": 1296, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_accounting|5": { + "hashes": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "ced0534574d0ae3f", + "hash_cont_tokens": "8f122ba881355d4b" + }, + "truncated": 0, + "non_truncated": 282, + "padded": 1128, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_law|5": { + "hashes": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "bcbdbbde22ec73e3", + "hash_cont_tokens": "90d5df417c4d3fd3" + }, + "truncated": 0, + "non_truncated": 1534, + "padded": 6136, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_medicine|5": { + "hashes": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "c54d753563114d45", + "hash_cont_tokens": "4a2d2988884f7f70" + }, + "truncated": 0, + "non_truncated": 272, + "padded": 1088, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_psychology|5": { + "hashes": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "b75dc55c0e32fa52", + "hash_cont_tokens": "e0a952cb8a9c81de" + }, + "truncated": 0, + "non_truncated": 612, + "padded": 2448, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-public_relations|5": { + "hashes": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "5ccdc8ec8db99622", + "hash_cont_tokens": "1fa77a8dff3922b8" + }, + "truncated": 0, + "non_truncated": 110, + "padded": 440, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-security_studies|5": { + "hashes": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "ca8497342e5b1d57", + "hash_cont_tokens": "81fc9cb3cbdd52db" + }, + "truncated": 0, + "non_truncated": 245, + "padded": 980, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-sociology|5": { + "hashes": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "069c76424fbd3dab", + "hash_cont_tokens": "2a0493252ed2cf43" + }, + "truncated": 0, + "non_truncated": 201, + "padded": 804, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hashes": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "a7e393a626169576", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + 
"non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-virology|5": { + "hashes": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "bf99dc973e3a650d", + "hash_cont_tokens": "5ab892d003b00c98" + }, + "truncated": 0, + "non_truncated": 166, + "padded": 664, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-world_religions|5": { + "hashes": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "1761cfaf21797065", + "hash_cont_tokens": "15a5e5dbdfbb8568" + }, + "truncated": 0, + "non_truncated": 171, + "padded": 684, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|truthfulqa:mc|0": { + "hashes": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "298b43914bbdf4ca", + "hash_cont_tokens": "5a8d4bb398b1c3c0" + }, + "truncated": 0, + "non_truncated": 817, + "padded": 9996, + "non_padded": 0, + "effective_few_shots": 0.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "31aa3477d959f771", + "hash_cont_tokens": "618558fb93c0f288" + }, + "truncated": 0, + "non_truncated": 1267, + "padded": 2534, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|drop|3": { + "hashes": { + "hash_examples": "1d27416e8324e9a3", + "hash_full_prompts": "a5513ff9a741b385", + "hash_input_tokens": "a4fb946366902edf", + "hash_cont_tokens": "b70c8eee29a47250" + }, + "truncated": 0, + "non_truncated": 9536, + "padded": 0, + "non_padded": 9536, + "effective_few_shots": 3.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "6af0ae8cfe684f50", + "hash_cont_tokens": "123e86a560a2f836" + }, + "truncated": 0, + "non_truncated": 1319, + "padded": 0, + "non_padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "4eb459f19fc0f29d", + "hash_full_prompts": "21653ed56f202b4e", + "hash_input_tokens": "0ce409b3d436569d", + "hash_cont_tokens": "50a10f01ce62b1b1" + }, + "truncated": 0, + "non_truncated": 38195, + "padded": 113460, + "non_padded": 10948, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/Intel/neural-chat-7b-v3-1/results_2023-11-18T15-23-13.598780.json b/eval-results/Intel/neural-chat-7b-v3-1/results_2023-11-18T15-23-13.598780.json new file mode 100644 index 0000000000000000000000000000000000000000..f446b3bc97c374e8f98a51dcaf259f2345d3edbe --- /dev/null +++ b/eval-results/Intel/neural-chat-7b-v3-1/results_2023-11-18T15-23-13.598780.json @@ -0,0 +1,1435 @@ +{ + "config_general": { + "lighteval_sha": "9ffc410f6c40b8cfefe7167cb47aefe69ced61e1", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "", + "start_time": 169189.777311114, + "end_time": 181160.579365127, + "total_evaluation_time_secondes": "11970.80205401298", + "model_name": "Intel/neural-chat-7b-v3-1", + "model_sha": "3995e9a13d54ce95f0ad55de2eaa92e2dc580174", + "model_dtype": "torch.float16", + "model_size": "13.99 GB" + }, + "results": { + 
"harness|arc:challenge|25": { + "acc": 0.6356655290102389, + "acc_stderr": 0.014063260279882415, + "acc_norm": 0.6621160409556314, + "acc_norm_stderr": 0.013822047922283507 + }, + "harness|hellaswag|10": { + "acc": 0.6448914558852819, + "acc_stderr": 0.004775681871529863, + "acc_norm": 0.836387173869747, + "acc_norm_stderr": 0.003691678495767969 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.29, + "acc_stderr": 0.045604802157206845, + "acc_norm": 0.29, + "acc_norm_stderr": 0.045604802157206845 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.6222222222222222, + "acc_stderr": 0.04188307537595853, + "acc_norm": 0.6222222222222222, + "acc_norm_stderr": 0.04188307537595853 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.6710526315789473, + "acc_stderr": 0.03823428969926605, + "acc_norm": 0.6710526315789473, + "acc_norm_stderr": 0.03823428969926605 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.54, + "acc_stderr": 0.05009082659620332, + "acc_norm": 0.54, + "acc_norm_stderr": 0.05009082659620332 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.660377358490566, + "acc_stderr": 0.02914690474779834, + "acc_norm": 0.660377358490566, + "acc_norm_stderr": 0.02914690474779834 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.7291666666666666, + "acc_stderr": 0.03716177437566017, + "acc_norm": 0.7291666666666666, + "acc_norm_stderr": 0.03716177437566017 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.43, + "acc_stderr": 0.04975698519562428, + "acc_norm": 0.43, + "acc_norm_stderr": 0.04975698519562428 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.53, + "acc_stderr": 0.050161355804659205, + "acc_norm": 0.53, + "acc_norm_stderr": 0.050161355804659205 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.34, + "acc_stderr": 0.04760952285695235, + "acc_norm": 0.34, + "acc_norm_stderr": 0.04760952285695235 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.6242774566473989, + "acc_stderr": 0.036928207672648664, + "acc_norm": 0.6242774566473989, + "acc_norm_stderr": 0.036928207672648664 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.39215686274509803, + "acc_stderr": 0.04858083574266345, + "acc_norm": 0.39215686274509803, + "acc_norm_stderr": 0.04858083574266345 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.74, + "acc_stderr": 0.044084400227680794, + "acc_norm": 0.74, + "acc_norm_stderr": 0.044084400227680794 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.5234042553191489, + "acc_stderr": 0.032650194750335815, + "acc_norm": 0.5234042553191489, + "acc_norm_stderr": 0.032650194750335815 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.45614035087719296, + "acc_stderr": 0.046854730419077895, + "acc_norm": 0.45614035087719296, + "acc_norm_stderr": 0.046854730419077895 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.5310344827586206, + "acc_stderr": 0.04158632762097828, + "acc_norm": 0.5310344827586206, + "acc_norm_stderr": 0.04158632762097828 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.37566137566137564, + "acc_stderr": 0.024942368931159788, + "acc_norm": 0.37566137566137564, + "acc_norm_stderr": 0.024942368931159788 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.4365079365079365, + "acc_stderr": 0.04435932892851466, + "acc_norm": 0.4365079365079365, + "acc_norm_stderr": 0.04435932892851466 + }, + "harness|hendrycksTest-global_facts|5": { + 
"acc": 0.33, + "acc_stderr": 0.047258156262526045, + "acc_norm": 0.33, + "acc_norm_stderr": 0.047258156262526045 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.7677419354838709, + "acc_stderr": 0.024022256130308235, + "acc_norm": 0.7677419354838709, + "acc_norm_stderr": 0.024022256130308235 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.5172413793103449, + "acc_stderr": 0.035158955511656986, + "acc_norm": 0.5172413793103449, + "acc_norm_stderr": 0.035158955511656986 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.66, + "acc_stderr": 0.04760952285695237, + "acc_norm": 0.66, + "acc_norm_stderr": 0.04760952285695237 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.7636363636363637, + "acc_stderr": 0.03317505930009182, + "acc_norm": 0.7636363636363637, + "acc_norm_stderr": 0.03317505930009182 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.7626262626262627, + "acc_stderr": 0.030313710538198896, + "acc_norm": 0.7626262626262627, + "acc_norm_stderr": 0.030313710538198896 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.8963730569948186, + "acc_stderr": 0.02199531196364424, + "acc_norm": 0.8963730569948186, + "acc_norm_stderr": 0.02199531196364424 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.6076923076923076, + "acc_stderr": 0.024756000382130952, + "acc_norm": 0.6076923076923076, + "acc_norm_stderr": 0.024756000382130952 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.34444444444444444, + "acc_stderr": 0.02897264888484427, + "acc_norm": 0.34444444444444444, + "acc_norm_stderr": 0.02897264888484427 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.6722689075630253, + "acc_stderr": 0.03048991141767323, + "acc_norm": 0.6722689075630253, + "acc_norm_stderr": 0.03048991141767323 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.3576158940397351, + "acc_stderr": 0.03913453431177258, + "acc_norm": 0.3576158940397351, + "acc_norm_stderr": 0.03913453431177258 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.8330275229357799, + "acc_stderr": 0.01599015488507338, + "acc_norm": 0.8330275229357799, + "acc_norm_stderr": 0.01599015488507338 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.49537037037037035, + "acc_stderr": 0.03409825519163572, + "acc_norm": 0.49537037037037035, + "acc_norm_stderr": 0.03409825519163572 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.7990196078431373, + "acc_stderr": 0.02812597226565438, + "acc_norm": 0.7990196078431373, + "acc_norm_stderr": 0.02812597226565438 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.7890295358649789, + "acc_stderr": 0.02655837250266192, + "acc_norm": 0.7890295358649789, + "acc_norm_stderr": 0.02655837250266192 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.6816143497757847, + "acc_stderr": 0.03126580522513713, + "acc_norm": 0.6816143497757847, + "acc_norm_stderr": 0.03126580522513713 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.732824427480916, + "acc_stderr": 0.038808483010823944, + "acc_norm": 0.732824427480916, + "acc_norm_stderr": 0.038808483010823944 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.8016528925619835, + "acc_stderr": 0.03640118271990947, + "acc_norm": 0.8016528925619835, + "acc_norm_stderr": 0.03640118271990947 + }, + "harness|hendrycksTest-jurisprudence|5": { + 
"acc": 0.7592592592592593, + "acc_stderr": 0.04133119440243839, + "acc_norm": 0.7592592592592593, + "acc_norm_stderr": 0.04133119440243839 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.7177914110429447, + "acc_stderr": 0.03536117886664742, + "acc_norm": 0.7177914110429447, + "acc_norm_stderr": 0.03536117886664742 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.48214285714285715, + "acc_stderr": 0.047427623612430116, + "acc_norm": 0.48214285714285715, + "acc_norm_stderr": 0.047427623612430116 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.8155339805825242, + "acc_stderr": 0.03840423627288276, + "acc_norm": 0.8155339805825242, + "acc_norm_stderr": 0.03840423627288276 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.8589743589743589, + "acc_stderr": 0.022801382534597528, + "acc_norm": 0.8589743589743589, + "acc_norm_stderr": 0.022801382534597528 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.72, + "acc_stderr": 0.04512608598542128, + "acc_norm": 0.72, + "acc_norm_stderr": 0.04512608598542128 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.8122605363984674, + "acc_stderr": 0.013964393769899143, + "acc_norm": 0.8122605363984674, + "acc_norm_stderr": 0.013964393769899143 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.6763005780346821, + "acc_stderr": 0.025190181327608408, + "acc_norm": 0.6763005780346821, + "acc_norm_stderr": 0.025190181327608408 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.3865921787709497, + "acc_stderr": 0.016286674879101022, + "acc_norm": 0.3865921787709497, + "acc_norm_stderr": 0.016286674879101022 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.6928104575163399, + "acc_stderr": 0.026415601914388995, + "acc_norm": 0.6928104575163399, + "acc_norm_stderr": 0.026415601914388995 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.6752411575562701, + "acc_stderr": 0.026596782287697043, + "acc_norm": 0.6752411575562701, + "acc_norm_stderr": 0.026596782287697043 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.7222222222222222, + "acc_stderr": 0.024922001168886324, + "acc_norm": 0.7222222222222222, + "acc_norm_stderr": 0.024922001168886324 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.43617021276595747, + "acc_stderr": 0.029583452036284066, + "acc_norm": 0.43617021276595747, + "acc_norm_stderr": 0.029583452036284066 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.44784876140808344, + "acc_stderr": 0.012700582404768223, + "acc_norm": 0.44784876140808344, + "acc_norm_stderr": 0.012700582404768223 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.6580882352941176, + "acc_stderr": 0.028814722422254187, + "acc_norm": 0.6580882352941176, + "acc_norm_stderr": 0.028814722422254187 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.6405228758169934, + "acc_stderr": 0.01941253924203216, + "acc_norm": 0.6405228758169934, + "acc_norm_stderr": 0.01941253924203216 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.6545454545454545, + "acc_stderr": 0.04554619617541054, + "acc_norm": 0.6545454545454545, + "acc_norm_stderr": 0.04554619617541054 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.710204081632653, + "acc_stderr": 0.029043088683304328, + "acc_norm": 0.710204081632653, + "acc_norm_stderr": 0.029043088683304328 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.835820895522388, + "acc_stderr": 0.026193923544454142, + "acc_norm": 
0.835820895522388, + "acc_norm_stderr": 0.026193923544454142 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.81, + "acc_stderr": 0.03942772444036625, + "acc_norm": 0.81, + "acc_norm_stderr": 0.03942772444036625 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.5180722891566265, + "acc_stderr": 0.03889951252827216, + "acc_norm": 0.5180722891566265, + "acc_norm_stderr": 0.03889951252827216 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.8187134502923976, + "acc_stderr": 0.029547741687640038, + "acc_norm": 0.8187134502923976, + "acc_norm_stderr": 0.029547741687640038 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.44430844553243576, + "mc1_stderr": 0.01739458625074317, + "mc2": 0.596468573226102, + "mc2_stderr": 0.015337888566380171 + }, + "harness|winogrande|5": { + "acc": 0.7813733228097869, + "acc_stderr": 0.011616198215773236 + }, + "harness|drop|3": { + "em": 0.31512164429530204, + "em_stderr": 0.004757573308442557, + "f1": 0.43838401845637875, + "f1_stderr": 0.004511299753314001 + }, + "harness|gsm8k|5": { + "acc": 0.1956027293404094, + "acc_stderr": 0.010926096810556464 + }, + "all": { + "acc": 0.6198496581816488, + "acc_stderr": 0.03259259478405919, + "acc_norm": 0.627996598760343, + "acc_norm_stderr": 0.03329289442488, + "mc1": 0.44430844553243576, + "mc1_stderr": 0.01739458625074317, + "mc2": 0.596468573226102, + "mc2_stderr": 0.015337888566380171, + "em": 0.31512164429530204, + "em_stderr": 0.004757573308442557, + "f1": 0.43838401845637875, + "f1_stderr": 0.004511299753314001 + } + }, + "versions": { + "all": 0, + "harness|arc:challenge|25": 0, + "harness|drop|3": 1, + "harness|gsm8k|5": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + 
"harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "harness|winogrande|5": 0 + }, + "config_tasks": { + "harness|arc:challenge": "LM Harness task", + "harness|drop": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + 
"harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|arc:challenge|25": { + "hashes": { + "hash_examples": "17b0cae357c0259e", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "9bcd0d1d37471713", + "hash_cont_tokens": "289aa98c400841d8" + }, + "truncated": 0, + "non_truncated": 1172, + "padded": 4670, + "non_padded": 17, + "effective_few_shots": 25.0, + "num_truncated_few_shots": 0 + }, + "harness|hellaswag|10": { + "hashes": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "80b8c6d79740318e", + "hash_cont_tokens": "ac460260c3e6efc9" + }, + "truncated": 0, + "non_truncated": 10042, + "padded": 40101, + "non_padded": 67, + "effective_few_shots": 10.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hashes": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "b813d36287c6556c", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-anatomy|5": { + "hashes": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "09dc2380497f7a47", + "hash_cont_tokens": "a52a4f60d98cbe5c" + }, + "truncated": 0, + "non_truncated": 135, + "padded": 540, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-astronomy|5": { + "hashes": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "68ca3220b0fdd1f3", + "hash_cont_tokens": "10f7d8eeba97841d" + }, + "truncated": 0, + "non_truncated": 152, + "padded": 608, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-business_ethics|5": { + "hashes": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "bd14ef1320de241e", 
+ "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hashes": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "d96186ab98017c43", + "hash_cont_tokens": "edef9975ba9165b5" + }, + "truncated": 0, + "non_truncated": 265, + "padded": 1060, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_biology|5": { + "hashes": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "424136b34e95b200", + "hash_cont_tokens": "0aa103ec6602280b" + }, + "truncated": 0, + "non_truncated": 144, + "padded": 576, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_chemistry|5": { + "hashes": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "8dd8b80e336bbe54", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_computer_science|5": { + "hashes": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "145d4cef8ca2261d", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_mathematics|5": { + "hashes": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "561995d32d2b25c4", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_medicine|5": { + "hashes": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "6a258a9d4418599c", + "hash_cont_tokens": "1979021dbc698754" + }, + "truncated": 0, + "non_truncated": 173, + "padded": 692, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_physics|5": { + "hashes": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "fa5e0d5b5f97b66a", + "hash_cont_tokens": "7cf7fe2bab00acbd" + }, + "truncated": 0, + "non_truncated": 102, + "padded": 408, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-computer_security|5": { + "hashes": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "07d27397edfae492", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hashes": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "da5e6c3c8eb17da6", + "hash_cont_tokens": "903f64eed2b0d217" + }, + "truncated": 0, + "non_truncated": 235, + "padded": 940, + "non_padded": 0, + 
"effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-econometrics|5": { + "hashes": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "f6ba8e358bdb523e", + "hash_cont_tokens": "721ae6c5302c4bf2" + }, + "truncated": 0, + "non_truncated": 114, + "padded": 456, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hashes": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "b2459da4c5ca8590", + "hash_cont_tokens": "15a738960ed3e587" + }, + "truncated": 0, + "non_truncated": 145, + "padded": 575, + "non_padded": 5, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hashes": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "0b969d9ad706a13a", + "hash_cont_tokens": "c96470462fc71683" + }, + "truncated": 0, + "non_truncated": 378, + "padded": 1512, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-formal_logic|5": { + "hashes": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "02bc3eb5f90da86e", + "hash_cont_tokens": "0e1ce025c9d6ee7e" + }, + "truncated": 0, + "non_truncated": 126, + "padded": 504, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-global_facts|5": { + "hashes": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "3d5106918bcbeb43", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_biology|5": { + "hashes": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "7b089392db2dabbd", + "hash_cont_tokens": "e34d57f7d3c4ca16" + }, + "truncated": 0, + "non_truncated": 310, + "padded": 1240, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hashes": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "ba90b2ffed1c067d", + "hash_cont_tokens": "e8482d44df4b3740" + }, + "truncated": 0, + "non_truncated": 203, + "padded": 812, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hashes": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "60eeec309ef0717f", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hashes": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "5e5e8bf3808e0ead", + "hash_cont_tokens": "d63e679a49418339" + }, + "truncated": 0, + "non_truncated": 165, + "padded": 656, + "non_padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_geography|5": { + 
"hashes": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "4da9b741d4e7ea78", + "hash_cont_tokens": "d78483e286d06f1a" + }, + "truncated": 0, + "non_truncated": 198, + "padded": 792, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hashes": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "acb4bc872ac86ed7", + "hash_cont_tokens": "691cdff71ff5fe57" + }, + "truncated": 0, + "non_truncated": 193, + "padded": 772, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hashes": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "840fc6403eb69ab0", + "hash_cont_tokens": "d5ad4c5bdca967ad" + }, + "truncated": 0, + "non_truncated": 390, + "padded": 1560, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hashes": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "3629a7f2cd17faeb", + "hash_cont_tokens": "8f631ca5687dd0d4" + }, + "truncated": 0, + "non_truncated": 270, + "padded": 1080, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hashes": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "6846f684260e3997", + "hash_cont_tokens": "7321048a28451473" + }, + "truncated": 0, + "non_truncated": 238, + "padded": 952, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_physics|5": { + "hashes": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "85aee25d6bdad94a", + "hash_cont_tokens": "bb137581f269861c" + }, + "truncated": 0, + "non_truncated": 151, + "padded": 604, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hashes": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "290b66d6d666a35f", + "hash_cont_tokens": "b455cab2675bd863" + }, + "truncated": 0, + "non_truncated": 545, + "padded": 2180, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hashes": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "a77a7668b437bc82", + "hash_cont_tokens": "1b3196fec7e58037" + }, + "truncated": 0, + "non_truncated": 216, + "padded": 864, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hashes": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "63548c7fa9ba7a78", + "hash_cont_tokens": "a331dedc2aa01b3e" + }, + "truncated": 0, + "non_truncated": 204, + "padded": 816, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hashes": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": 
"11845057459afd72", + "hash_input_tokens": "83c5da18bfa50812", + "hash_cont_tokens": "d0fbe030b8c8c2bf" + }, + "truncated": 0, + "non_truncated": 237, + "padded": 948, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_aging|5": { + "hashes": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "bebbd11f22006685", + "hash_cont_tokens": "1dd29c3755494850" + }, + "truncated": 0, + "non_truncated": 223, + "padded": 892, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_sexuality|5": { + "hashes": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "7b85ee9b8ee54f4f", + "hash_cont_tokens": "c85573f663c10691" + }, + "truncated": 0, + "non_truncated": 131, + "padded": 524, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-international_law|5": { + "hashes": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "7bfc55ab7065943e", + "hash_cont_tokens": "d263804ba918154f" + }, + "truncated": 0, + "non_truncated": 121, + "padded": 484, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-jurisprudence|5": { + "hashes": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "69573f1675e053c6", + "hash_cont_tokens": "581986691a84ece8" + }, + "truncated": 0, + "non_truncated": 108, + "padded": 432, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hashes": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "552324ef20094bdc", + "hash_cont_tokens": "55a858b28bbda458" + }, + "truncated": 0, + "non_truncated": 163, + "padded": 652, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-machine_learning|5": { + "hashes": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "96449357a7318905", + "hash_cont_tokens": "e99d3d3efd4ac7a3" + }, + "truncated": 0, + "non_truncated": 112, + "padded": 448, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-management|5": { + "hashes": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "3b849249168e3b88", + "hash_cont_tokens": "13d9dc56bca34726" + }, + "truncated": 0, + "non_truncated": 103, + "padded": 412, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-marketing|5": { + "hashes": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "af0e186f2756b70d", + "hash_cont_tokens": "2700ea26933916a2" + }, + "truncated": 0, + "non_truncated": 234, + "padded": 936, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-medical_genetics|5": { + "hashes": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "9f6a6de16509b6d9", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + 
"non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-miscellaneous|5": { + "hashes": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "9194406d589f7c10", + "hash_cont_tokens": "7bf4341c79587250" + }, + "truncated": 0, + "non_truncated": 783, + "padded": 3132, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_disputes|5": { + "hashes": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "769486efc74d9f8e", + "hash_cont_tokens": "38a48e9de6976f00" + }, + "truncated": 0, + "non_truncated": 346, + "padded": 1384, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hashes": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "a90fd4dd90959dad", + "hash_cont_tokens": "761c4dc187689d89" + }, + "truncated": 0, + "non_truncated": 895, + "padded": 3580, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-nutrition|5": { + "hashes": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "1a3b843e66efd29b", + "hash_cont_tokens": "65005bd7d6f6012a" + }, + "truncated": 0, + "non_truncated": 306, + "padded": 1224, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-philosophy|5": { + "hashes": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "09820001a3d00013", + "hash_cont_tokens": "0b47934fb6314dec" + }, + "truncated": 0, + "non_truncated": 311, + "padded": 1244, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-prehistory|5": { + "hashes": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "7c4ec364ce2768c7", + "hash_cont_tokens": "3f20acd855ee0a29" + }, + "truncated": 0, + "non_truncated": 324, + "padded": 1296, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_accounting|5": { + "hashes": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "ced0534574d0ae3f", + "hash_cont_tokens": "8f122ba881355d4b" + }, + "truncated": 0, + "non_truncated": 282, + "padded": 1128, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_law|5": { + "hashes": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "bcbdbbde22ec73e3", + "hash_cont_tokens": "90d5df417c4d3fd3" + }, + "truncated": 0, + "non_truncated": 1534, + "padded": 6136, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_medicine|5": { + "hashes": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "c54d753563114d45", + "hash_cont_tokens": "4a2d2988884f7f70" + }, + "truncated": 0, + "non_truncated": 272, + "padded": 1088, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_psychology|5": { + "hashes": { + 
"hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "b75dc55c0e32fa52", + "hash_cont_tokens": "e0a952cb8a9c81de" + }, + "truncated": 0, + "non_truncated": 612, + "padded": 2448, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-public_relations|5": { + "hashes": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "5ccdc8ec8db99622", + "hash_cont_tokens": "1fa77a8dff3922b8" + }, + "truncated": 0, + "non_truncated": 110, + "padded": 440, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-security_studies|5": { + "hashes": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "ca8497342e5b1d57", + "hash_cont_tokens": "81fc9cb3cbdd52db" + }, + "truncated": 0, + "non_truncated": 245, + "padded": 980, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-sociology|5": { + "hashes": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "069c76424fbd3dab", + "hash_cont_tokens": "2a0493252ed2cf43" + }, + "truncated": 0, + "non_truncated": 201, + "padded": 804, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hashes": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "a7e393a626169576", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-virology|5": { + "hashes": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "bf99dc973e3a650d", + "hash_cont_tokens": "5ab892d003b00c98" + }, + "truncated": 0, + "non_truncated": 166, + "padded": 664, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-world_religions|5": { + "hashes": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "1761cfaf21797065", + "hash_cont_tokens": "15a5e5dbdfbb8568" + }, + "truncated": 0, + "non_truncated": 171, + "padded": 684, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|truthfulqa:mc|0": { + "hashes": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "298b43914bbdf4ca", + "hash_cont_tokens": "5a8d4bb398b1c3c0" + }, + "truncated": 0, + "non_truncated": 817, + "padded": 9996, + "non_padded": 0, + "effective_few_shots": 0.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "31aa3477d959f771", + "hash_cont_tokens": "618558fb93c0f288" + }, + "truncated": 0, + "non_truncated": 1267, + "padded": 2534, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|drop|3": { + "hashes": { + "hash_examples": "1d27416e8324e9a3", + "hash_full_prompts": "a5513ff9a741b385", + "hash_input_tokens": "a4fb946366902edf", + "hash_cont_tokens": "bcf5d8355d7bc598" + }, + "truncated": 0, + "non_truncated": 9536, + "padded": 0, + 
"non_padded": 9536, + "effective_few_shots": 3.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "6af0ae8cfe684f50", + "hash_cont_tokens": "41c9a0d921eef105" + }, + "truncated": 0, + "non_truncated": 1319, + "padded": 0, + "non_padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "4eb459f19fc0f29d", + "hash_full_prompts": "21653ed56f202b4e", + "hash_input_tokens": "0ce409b3d436569d", + "hash_cont_tokens": "9cffb40e714fb1d4" + }, + "truncated": 0, + "non_truncated": 38195, + "padded": 113460, + "non_padded": 10948, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/Intel/neural-chat-7b-v3-1/results_2023-11-18T15-42-45.444313.json b/eval-results/Intel/neural-chat-7b-v3-1/results_2023-11-18T15-42-45.444313.json new file mode 100644 index 0000000000000000000000000000000000000000..dbc36172f03e3ae4c387db50498b14652f0eb3c3 --- /dev/null +++ b/eval-results/Intel/neural-chat-7b-v3-1/results_2023-11-18T15-42-45.444313.json @@ -0,0 +1,1435 @@ +{ + "config_general": { + "lighteval_sha": "9ffc410f6c40b8cfefe7167cb47aefe69ced61e1", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "", + "start_time": 168761.472805058, + "end_time": 181135.466975231, + "total_evaluation_time_secondes": "12373.994170172984", + "model_name": "Intel/neural-chat-7b-v3-1", + "model_sha": "3995e9a13d54ce95f0ad55de2eaa92e2dc580174", + "model_dtype": "torch.bfloat16", + "model_size": "13.99 GB" + }, + "results": { + "harness|arc:challenge|25": { + "acc": 0.6322525597269625, + "acc_stderr": 0.01409099561816848, + "acc_norm": 0.6629692832764505, + "acc_norm_stderr": 0.013813476652902276 + }, + "harness|hellaswag|10": { + "acc": 0.6446922923720374, + "acc_stderr": 0.0047762832034680975, + "acc_norm": 0.8359888468432584, + "acc_norm_stderr": 0.003695289340514483 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.29, + "acc_stderr": 0.045604802157206845, + "acc_norm": 0.29, + "acc_norm_stderr": 0.045604802157206845 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.6148148148148148, + "acc_stderr": 0.04203921040156279, + "acc_norm": 0.6148148148148148, + "acc_norm_stderr": 0.04203921040156279 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.6842105263157895, + "acc_stderr": 0.0378272898086547, + "acc_norm": 0.6842105263157895, + "acc_norm_stderr": 0.0378272898086547 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.54, + "acc_stderr": 0.05009082659620332, + "acc_norm": 0.54, + "acc_norm_stderr": 0.05009082659620332 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.6716981132075471, + "acc_stderr": 0.02890159361241178, + "acc_norm": 0.6716981132075471, + "acc_norm_stderr": 0.02890159361241178 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.7222222222222222, + "acc_stderr": 0.037455547914624555, + "acc_norm": 0.7222222222222222, + "acc_norm_stderr": 0.037455547914624555 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.44, + "acc_stderr": 0.04988876515698589, + "acc_norm": 0.44, + "acc_norm_stderr": 0.04988876515698589 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.52, + "acc_stderr": 0.050211673156867795, + "acc_norm": 0.52, + "acc_norm_stderr": 0.050211673156867795 + }, + 
"harness|hendrycksTest-college_mathematics|5": { + "acc": 0.35, + "acc_stderr": 0.04793724854411019, + "acc_norm": 0.35, + "acc_norm_stderr": 0.04793724854411019 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.630057803468208, + "acc_stderr": 0.0368122963339432, + "acc_norm": 0.630057803468208, + "acc_norm_stderr": 0.0368122963339432 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.39215686274509803, + "acc_stderr": 0.04858083574266345, + "acc_norm": 0.39215686274509803, + "acc_norm_stderr": 0.04858083574266345 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.74, + "acc_stderr": 0.044084400227680794, + "acc_norm": 0.74, + "acc_norm_stderr": 0.044084400227680794 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.5148936170212766, + "acc_stderr": 0.032671518489247764, + "acc_norm": 0.5148936170212766, + "acc_norm_stderr": 0.032671518489247764 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.4649122807017544, + "acc_stderr": 0.046920083813689104, + "acc_norm": 0.4649122807017544, + "acc_norm_stderr": 0.046920083813689104 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.5241379310344828, + "acc_stderr": 0.0416180850350153, + "acc_norm": 0.5241379310344828, + "acc_norm_stderr": 0.0416180850350153 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.3862433862433862, + "acc_stderr": 0.025075981767601684, + "acc_norm": 0.3862433862433862, + "acc_norm_stderr": 0.025075981767601684 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.42857142857142855, + "acc_stderr": 0.0442626668137991, + "acc_norm": 0.42857142857142855, + "acc_norm_stderr": 0.0442626668137991 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.31, + "acc_stderr": 0.04648231987117316, + "acc_norm": 0.31, + "acc_norm_stderr": 0.04648231987117316 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.7709677419354839, + "acc_stderr": 0.023904914311782658, + "acc_norm": 0.7709677419354839, + "acc_norm_stderr": 0.023904914311782658 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.5221674876847291, + "acc_stderr": 0.03514528562175008, + "acc_norm": 0.5221674876847291, + "acc_norm_stderr": 0.03514528562175008 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.68, + "acc_stderr": 0.04688261722621505, + "acc_norm": 0.68, + "acc_norm_stderr": 0.04688261722621505 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.7636363636363637, + "acc_stderr": 0.03317505930009182, + "acc_norm": 0.7636363636363637, + "acc_norm_stderr": 0.03317505930009182 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.7575757575757576, + "acc_stderr": 0.03053289223393202, + "acc_norm": 0.7575757575757576, + "acc_norm_stderr": 0.03053289223393202 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.9015544041450777, + "acc_stderr": 0.021500249576033446, + "acc_norm": 0.9015544041450777, + "acc_norm_stderr": 0.021500249576033446 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.6051282051282051, + "acc_stderr": 0.024784316942156395, + "acc_norm": 0.6051282051282051, + "acc_norm_stderr": 0.024784316942156395 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.3333333333333333, + "acc_stderr": 0.028742040903948485, + "acc_norm": 0.3333333333333333, + "acc_norm_stderr": 0.028742040903948485 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.6764705882352942, 
+ "acc_stderr": 0.030388353551886793, + "acc_norm": 0.6764705882352942, + "acc_norm_stderr": 0.030388353551886793 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.36423841059602646, + "acc_stderr": 0.03929111781242741, + "acc_norm": 0.36423841059602646, + "acc_norm_stderr": 0.03929111781242741 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.8366972477064221, + "acc_stderr": 0.015848255806501562, + "acc_norm": 0.8366972477064221, + "acc_norm_stderr": 0.015848255806501562 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.4861111111111111, + "acc_stderr": 0.03408655867977749, + "acc_norm": 0.4861111111111111, + "acc_norm_stderr": 0.03408655867977749 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.803921568627451, + "acc_stderr": 0.027865942286639325, + "acc_norm": 0.803921568627451, + "acc_norm_stderr": 0.027865942286639325 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.7974683544303798, + "acc_stderr": 0.026160568246601446, + "acc_norm": 0.7974683544303798, + "acc_norm_stderr": 0.026160568246601446 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.6816143497757847, + "acc_stderr": 0.03126580522513713, + "acc_norm": 0.6816143497757847, + "acc_norm_stderr": 0.03126580522513713 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.732824427480916, + "acc_stderr": 0.038808483010823944, + "acc_norm": 0.732824427480916, + "acc_norm_stderr": 0.038808483010823944 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.7851239669421488, + "acc_stderr": 0.037494924487096966, + "acc_norm": 0.7851239669421488, + "acc_norm_stderr": 0.037494924487096966 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.7592592592592593, + "acc_stderr": 0.04133119440243839, + "acc_norm": 0.7592592592592593, + "acc_norm_stderr": 0.04133119440243839 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.7177914110429447, + "acc_stderr": 0.03536117886664742, + "acc_norm": 0.7177914110429447, + "acc_norm_stderr": 0.03536117886664742 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.48214285714285715, + "acc_stderr": 0.047427623612430116, + "acc_norm": 0.48214285714285715, + "acc_norm_stderr": 0.047427623612430116 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.8252427184466019, + "acc_stderr": 0.037601780060266196, + "acc_norm": 0.8252427184466019, + "acc_norm_stderr": 0.037601780060266196 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.8632478632478633, + "acc_stderr": 0.022509033937077805, + "acc_norm": 0.8632478632478633, + "acc_norm_stderr": 0.022509033937077805 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.72, + "acc_stderr": 0.04512608598542128, + "acc_norm": 0.72, + "acc_norm_stderr": 0.04512608598542128 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.8122605363984674, + "acc_stderr": 0.01396439376989914, + "acc_norm": 0.8122605363984674, + "acc_norm_stderr": 0.01396439376989914 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.6791907514450867, + "acc_stderr": 0.025131000233647893, + "acc_norm": 0.6791907514450867, + "acc_norm_stderr": 0.025131000233647893 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.38100558659217876, + "acc_stderr": 0.016242028834053627, + "acc_norm": 0.38100558659217876, + "acc_norm_stderr": 0.016242028834053627 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.696078431372549, + "acc_stderr": 0.026336613469046626, + "acc_norm": 0.696078431372549, 
+ "acc_norm_stderr": 0.026336613469046626 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.6752411575562701, + "acc_stderr": 0.026596782287697043, + "acc_norm": 0.6752411575562701, + "acc_norm_stderr": 0.026596782287697043 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.7191358024691358, + "acc_stderr": 0.02500646975579921, + "acc_norm": 0.7191358024691358, + "acc_norm_stderr": 0.02500646975579921 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.4432624113475177, + "acc_stderr": 0.029634838473766006, + "acc_norm": 0.4432624113475177, + "acc_norm_stderr": 0.029634838473766006 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.4439374185136897, + "acc_stderr": 0.012689708167787684, + "acc_norm": 0.4439374185136897, + "acc_norm_stderr": 0.012689708167787684 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.6544117647058824, + "acc_stderr": 0.02888819310398863, + "acc_norm": 0.6544117647058824, + "acc_norm_stderr": 0.02888819310398863 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.6421568627450981, + "acc_stderr": 0.019393058402355435, + "acc_norm": 0.6421568627450981, + "acc_norm_stderr": 0.019393058402355435 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.6545454545454545, + "acc_stderr": 0.04554619617541054, + "acc_norm": 0.6545454545454545, + "acc_norm_stderr": 0.04554619617541054 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.710204081632653, + "acc_stderr": 0.029043088683304328, + "acc_norm": 0.710204081632653, + "acc_norm_stderr": 0.029043088683304328 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.8407960199004975, + "acc_stderr": 0.02587064676616913, + "acc_norm": 0.8407960199004975, + "acc_norm_stderr": 0.02587064676616913 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.82, + "acc_stderr": 0.038612291966536934, + "acc_norm": 0.82, + "acc_norm_stderr": 0.038612291966536934 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.5120481927710844, + "acc_stderr": 0.03891364495835817, + "acc_norm": 0.5120481927710844, + "acc_norm_stderr": 0.03891364495835817 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.8187134502923976, + "acc_stderr": 0.029547741687640038, + "acc_norm": 0.8187134502923976, + "acc_norm_stderr": 0.029547741687640038 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.44063647490820074, + "mc1_stderr": 0.01737969755543745, + "mc2": 0.5953808732777186, + "mc2_stderr": 0.015347393503467649 + }, + "harness|winogrande|5": { + "acc": 0.7797947908445146, + "acc_stderr": 0.011646276755089691 + }, + "harness|drop|3": { + "em": 0.3183724832214765, + "em_stderr": 0.004770687516057205, + "f1": 0.44000419463087526, + "f1_stderr": 0.00452137107601273 + }, + "harness|gsm8k|5": { + "acc": 0.19408642911296436, + "acc_stderr": 0.01089391830819241 + }, + "all": { + "acc": 0.6203975476749912, + "acc_stderr": 0.03253317374017875, + "acc_norm": 0.6286844485803, + "acc_norm_stderr": 0.03323093034337969, + "mc1": 0.44063647490820074, + "mc1_stderr": 0.01737969755543745, + "mc2": 0.5953808732777186, + "mc2_stderr": 0.015347393503467649, + "em": 0.3183724832214765, + "em_stderr": 0.004770687516057205, + "f1": 0.44000419463087526, + "f1_stderr": 0.00452137107601273 + } + }, + "versions": { + "all": 0, + "harness|arc:challenge|25": 0, + "harness|drop|3": 1, + "harness|gsm8k|5": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + 
"harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "harness|winogrande|5": 0 + }, + "config_tasks": { + "harness|arc:challenge": "LM Harness task", + "harness|drop": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + 
"harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|arc:challenge|25": { + "hashes": { + "hash_examples": "17b0cae357c0259e", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "9bcd0d1d37471713", + "hash_cont_tokens": "289aa98c400841d8" + }, + "truncated": 0, + "non_truncated": 1172, + "padded": 4670, 
+ "non_padded": 17, + "effective_few_shots": 25.0, + "num_truncated_few_shots": 0 + }, + "harness|hellaswag|10": { + "hashes": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "80b8c6d79740318e", + "hash_cont_tokens": "ac460260c3e6efc9" + }, + "truncated": 0, + "non_truncated": 10042, + "padded": 40101, + "non_padded": 67, + "effective_few_shots": 10.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hashes": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "b813d36287c6556c", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-anatomy|5": { + "hashes": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "09dc2380497f7a47", + "hash_cont_tokens": "a52a4f60d98cbe5c" + }, + "truncated": 0, + "non_truncated": 135, + "padded": 540, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-astronomy|5": { + "hashes": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "68ca3220b0fdd1f3", + "hash_cont_tokens": "10f7d8eeba97841d" + }, + "truncated": 0, + "non_truncated": 152, + "padded": 608, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-business_ethics|5": { + "hashes": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "bd14ef1320de241e", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hashes": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "d96186ab98017c43", + "hash_cont_tokens": "edef9975ba9165b5" + }, + "truncated": 0, + "non_truncated": 265, + "padded": 1060, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_biology|5": { + "hashes": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "424136b34e95b200", + "hash_cont_tokens": "0aa103ec6602280b" + }, + "truncated": 0, + "non_truncated": 144, + "padded": 576, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_chemistry|5": { + "hashes": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "8dd8b80e336bbe54", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_computer_science|5": { + "hashes": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "145d4cef8ca2261d", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_mathematics|5": { + "hashes": { + "hash_examples": 
"4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "561995d32d2b25c4", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_medicine|5": { + "hashes": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "6a258a9d4418599c", + "hash_cont_tokens": "1979021dbc698754" + }, + "truncated": 0, + "non_truncated": 173, + "padded": 692, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_physics|5": { + "hashes": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "fa5e0d5b5f97b66a", + "hash_cont_tokens": "7cf7fe2bab00acbd" + }, + "truncated": 0, + "non_truncated": 102, + "padded": 408, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-computer_security|5": { + "hashes": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "07d27397edfae492", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hashes": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "da5e6c3c8eb17da6", + "hash_cont_tokens": "903f64eed2b0d217" + }, + "truncated": 0, + "non_truncated": 235, + "padded": 940, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-econometrics|5": { + "hashes": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "f6ba8e358bdb523e", + "hash_cont_tokens": "721ae6c5302c4bf2" + }, + "truncated": 0, + "non_truncated": 114, + "padded": 456, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hashes": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "b2459da4c5ca8590", + "hash_cont_tokens": "15a738960ed3e587" + }, + "truncated": 0, + "non_truncated": 145, + "padded": 575, + "non_padded": 5, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hashes": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "0b969d9ad706a13a", + "hash_cont_tokens": "c96470462fc71683" + }, + "truncated": 0, + "non_truncated": 378, + "padded": 1512, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-formal_logic|5": { + "hashes": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "02bc3eb5f90da86e", + "hash_cont_tokens": "0e1ce025c9d6ee7e" + }, + "truncated": 0, + "non_truncated": 126, + "padded": 504, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-global_facts|5": { + "hashes": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "3d5106918bcbeb43", + "hash_cont_tokens": "17b868b63507f9a3" + }, + 
"truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_biology|5": { + "hashes": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "7b089392db2dabbd", + "hash_cont_tokens": "e34d57f7d3c4ca16" + }, + "truncated": 0, + "non_truncated": 310, + "padded": 1240, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hashes": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "ba90b2ffed1c067d", + "hash_cont_tokens": "e8482d44df4b3740" + }, + "truncated": 0, + "non_truncated": 203, + "padded": 812, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hashes": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "60eeec309ef0717f", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hashes": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "5e5e8bf3808e0ead", + "hash_cont_tokens": "d63e679a49418339" + }, + "truncated": 0, + "non_truncated": 165, + "padded": 656, + "non_padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_geography|5": { + "hashes": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "4da9b741d4e7ea78", + "hash_cont_tokens": "d78483e286d06f1a" + }, + "truncated": 0, + "non_truncated": 198, + "padded": 792, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hashes": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "acb4bc872ac86ed7", + "hash_cont_tokens": "691cdff71ff5fe57" + }, + "truncated": 0, + "non_truncated": 193, + "padded": 772, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hashes": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "840fc6403eb69ab0", + "hash_cont_tokens": "d5ad4c5bdca967ad" + }, + "truncated": 0, + "non_truncated": 390, + "padded": 1560, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hashes": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "3629a7f2cd17faeb", + "hash_cont_tokens": "8f631ca5687dd0d4" + }, + "truncated": 0, + "non_truncated": 270, + "padded": 1080, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hashes": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "6846f684260e3997", + "hash_cont_tokens": "7321048a28451473" + }, + "truncated": 0, + "non_truncated": 238, + "padded": 952, + "non_padded": 0, 
+ "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_physics|5": { + "hashes": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "85aee25d6bdad94a", + "hash_cont_tokens": "bb137581f269861c" + }, + "truncated": 0, + "non_truncated": 151, + "padded": 604, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hashes": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "290b66d6d666a35f", + "hash_cont_tokens": "b455cab2675bd863" + }, + "truncated": 0, + "non_truncated": 545, + "padded": 2180, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hashes": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "a77a7668b437bc82", + "hash_cont_tokens": "1b3196fec7e58037" + }, + "truncated": 0, + "non_truncated": 216, + "padded": 864, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hashes": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "63548c7fa9ba7a78", + "hash_cont_tokens": "a331dedc2aa01b3e" + }, + "truncated": 0, + "non_truncated": 204, + "padded": 816, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hashes": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "83c5da18bfa50812", + "hash_cont_tokens": "d0fbe030b8c8c2bf" + }, + "truncated": 0, + "non_truncated": 237, + "padded": 948, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_aging|5": { + "hashes": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "bebbd11f22006685", + "hash_cont_tokens": "1dd29c3755494850" + }, + "truncated": 0, + "non_truncated": 223, + "padded": 892, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_sexuality|5": { + "hashes": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "7b85ee9b8ee54f4f", + "hash_cont_tokens": "c85573f663c10691" + }, + "truncated": 0, + "non_truncated": 131, + "padded": 524, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-international_law|5": { + "hashes": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "7bfc55ab7065943e", + "hash_cont_tokens": "d263804ba918154f" + }, + "truncated": 0, + "non_truncated": 121, + "padded": 484, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-jurisprudence|5": { + "hashes": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "69573f1675e053c6", + "hash_cont_tokens": "581986691a84ece8" + }, + "truncated": 0, + "non_truncated": 108, + "padded": 432, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hashes": { + 
"hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "552324ef20094bdc", + "hash_cont_tokens": "55a858b28bbda458" + }, + "truncated": 0, + "non_truncated": 163, + "padded": 652, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-machine_learning|5": { + "hashes": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "96449357a7318905", + "hash_cont_tokens": "e99d3d3efd4ac7a3" + }, + "truncated": 0, + "non_truncated": 112, + "padded": 448, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-management|5": { + "hashes": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "3b849249168e3b88", + "hash_cont_tokens": "13d9dc56bca34726" + }, + "truncated": 0, + "non_truncated": 103, + "padded": 412, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-marketing|5": { + "hashes": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "af0e186f2756b70d", + "hash_cont_tokens": "2700ea26933916a2" + }, + "truncated": 0, + "non_truncated": 234, + "padded": 936, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-medical_genetics|5": { + "hashes": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "9f6a6de16509b6d9", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-miscellaneous|5": { + "hashes": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "9194406d589f7c10", + "hash_cont_tokens": "7bf4341c79587250" + }, + "truncated": 0, + "non_truncated": 783, + "padded": 3132, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_disputes|5": { + "hashes": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "769486efc74d9f8e", + "hash_cont_tokens": "38a48e9de6976f00" + }, + "truncated": 0, + "non_truncated": 346, + "padded": 1384, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hashes": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "a90fd4dd90959dad", + "hash_cont_tokens": "761c4dc187689d89" + }, + "truncated": 0, + "non_truncated": 895, + "padded": 3580, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-nutrition|5": { + "hashes": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "1a3b843e66efd29b", + "hash_cont_tokens": "65005bd7d6f6012a" + }, + "truncated": 0, + "non_truncated": 306, + "padded": 1224, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-philosophy|5": { + "hashes": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "09820001a3d00013", + "hash_cont_tokens": "0b47934fb6314dec" + }, + "truncated": 0, + 
"non_truncated": 311, + "padded": 1244, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-prehistory|5": { + "hashes": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "7c4ec364ce2768c7", + "hash_cont_tokens": "3f20acd855ee0a29" + }, + "truncated": 0, + "non_truncated": 324, + "padded": 1296, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_accounting|5": { + "hashes": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "ced0534574d0ae3f", + "hash_cont_tokens": "8f122ba881355d4b" + }, + "truncated": 0, + "non_truncated": 282, + "padded": 1128, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_law|5": { + "hashes": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "bcbdbbde22ec73e3", + "hash_cont_tokens": "90d5df417c4d3fd3" + }, + "truncated": 0, + "non_truncated": 1534, + "padded": 6136, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_medicine|5": { + "hashes": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "c54d753563114d45", + "hash_cont_tokens": "4a2d2988884f7f70" + }, + "truncated": 0, + "non_truncated": 272, + "padded": 1088, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_psychology|5": { + "hashes": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "b75dc55c0e32fa52", + "hash_cont_tokens": "e0a952cb8a9c81de" + }, + "truncated": 0, + "non_truncated": 612, + "padded": 2448, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-public_relations|5": { + "hashes": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "5ccdc8ec8db99622", + "hash_cont_tokens": "1fa77a8dff3922b8" + }, + "truncated": 0, + "non_truncated": 110, + "padded": 440, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-security_studies|5": { + "hashes": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "ca8497342e5b1d57", + "hash_cont_tokens": "81fc9cb3cbdd52db" + }, + "truncated": 0, + "non_truncated": 245, + "padded": 980, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-sociology|5": { + "hashes": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "069c76424fbd3dab", + "hash_cont_tokens": "2a0493252ed2cf43" + }, + "truncated": 0, + "non_truncated": 201, + "padded": 804, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hashes": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "a7e393a626169576", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + 
"harness|hendrycksTest-virology|5": { + "hashes": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "bf99dc973e3a650d", + "hash_cont_tokens": "5ab892d003b00c98" + }, + "truncated": 0, + "non_truncated": 166, + "padded": 664, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-world_religions|5": { + "hashes": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "1761cfaf21797065", + "hash_cont_tokens": "15a5e5dbdfbb8568" + }, + "truncated": 0, + "non_truncated": 171, + "padded": 684, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|truthfulqa:mc|0": { + "hashes": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "298b43914bbdf4ca", + "hash_cont_tokens": "5a8d4bb398b1c3c0" + }, + "truncated": 0, + "non_truncated": 817, + "padded": 9996, + "non_padded": 0, + "effective_few_shots": 0.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "31aa3477d959f771", + "hash_cont_tokens": "618558fb93c0f288" + }, + "truncated": 0, + "non_truncated": 1267, + "padded": 2534, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|drop|3": { + "hashes": { + "hash_examples": "1d27416e8324e9a3", + "hash_full_prompts": "a5513ff9a741b385", + "hash_input_tokens": "a4fb946366902edf", + "hash_cont_tokens": "a0ad137843e9453a" + }, + "truncated": 0, + "non_truncated": 9536, + "padded": 0, + "non_padded": 9536, + "effective_few_shots": 3.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "6af0ae8cfe684f50", + "hash_cont_tokens": "2c1bee4987cc4550" + }, + "truncated": 0, + "non_truncated": 1319, + "padded": 0, + "non_padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "4eb459f19fc0f29d", + "hash_full_prompts": "21653ed56f202b4e", + "hash_input_tokens": "0ce409b3d436569d", + "hash_cont_tokens": "ae175c3bfe19e814" + }, + "truncated": 0, + "non_truncated": 38195, + "padded": 113460, + "non_padded": 10948, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/Intel/neural-chat-7b-v3-1/results_2023-11-27T01-54-15.914874.json b/eval-results/Intel/neural-chat-7b-v3-1/results_2023-11-27T01-54-15.914874.json new file mode 100644 index 0000000000000000000000000000000000000000..858486fc9a649cda937d671696f2ca5b32d715d4 --- /dev/null +++ b/eval-results/Intel/neural-chat-7b-v3-1/results_2023-11-27T01-54-15.914874.json @@ -0,0 +1,1435 @@ +{ + "config_general": { + "lighteval_sha": "9ffc410f6c40b8cfefe7167cb47aefe69ced61e1", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "", + "start_time": 861625.555103256, + "end_time": 897085.948783758, + "total_evaluation_time_secondes": "35460.39368050196", + "model_name": "Intel/neural-chat-7b-v3-1", + "model_sha": "af2489cde09e9d2c175622f651875e83824c4b10", + "model_dtype": "8bit", + "model_size": "7.49 GB" + }, + "results": { + "harness|arc:challenge|25": { + "acc": 0.636518771331058, + "acc_stderr": 0.014056207319068285, + 
"acc_norm": 0.6569965870307167, + "acc_norm_stderr": 0.013872423223718164 + }, + "harness|hellaswag|10": { + "acc": 0.6453893646683927, + "acc_stderr": 0.0047741745902051425, + "acc_norm": 0.8353913563035252, + "acc_norm_stderr": 0.0037006909956008908 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.33, + "acc_stderr": 0.04725815626252606, + "acc_norm": 0.33, + "acc_norm_stderr": 0.04725815626252606 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.6296296296296297, + "acc_stderr": 0.041716541613545426, + "acc_norm": 0.6296296296296297, + "acc_norm_stderr": 0.041716541613545426 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.6776315789473685, + "acc_stderr": 0.038035102483515854, + "acc_norm": 0.6776315789473685, + "acc_norm_stderr": 0.038035102483515854 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.56, + "acc_stderr": 0.04988876515698589, + "acc_norm": 0.56, + "acc_norm_stderr": 0.04988876515698589 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.6679245283018868, + "acc_stderr": 0.028985455652334388, + "acc_norm": 0.6679245283018868, + "acc_norm_stderr": 0.028985455652334388 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.7291666666666666, + "acc_stderr": 0.03716177437566017, + "acc_norm": 0.7291666666666666, + "acc_norm_stderr": 0.03716177437566017 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.43, + "acc_stderr": 0.04975698519562428, + "acc_norm": 0.43, + "acc_norm_stderr": 0.04975698519562428 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.49, + "acc_stderr": 0.05024183937956913, + "acc_norm": 0.49, + "acc_norm_stderr": 0.05024183937956913 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.37, + "acc_stderr": 0.048523658709391, + "acc_norm": 0.37, + "acc_norm_stderr": 0.048523658709391 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.630057803468208, + "acc_stderr": 0.0368122963339432, + "acc_norm": 0.630057803468208, + "acc_norm_stderr": 0.0368122963339432 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.39215686274509803, + "acc_stderr": 0.04858083574266345, + "acc_norm": 0.39215686274509803, + "acc_norm_stderr": 0.04858083574266345 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.74, + "acc_stderr": 0.044084400227680794, + "acc_norm": 0.74, + "acc_norm_stderr": 0.044084400227680794 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.5148936170212766, + "acc_stderr": 0.03267151848924777, + "acc_norm": 0.5148936170212766, + "acc_norm_stderr": 0.03267151848924777 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.4649122807017544, + "acc_stderr": 0.046920083813689104, + "acc_norm": 0.4649122807017544, + "acc_norm_stderr": 0.046920083813689104 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.503448275862069, + "acc_stderr": 0.04166567577101579, + "acc_norm": 0.503448275862069, + "acc_norm_stderr": 0.04166567577101579 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.37566137566137564, + "acc_stderr": 0.02494236893115978, + "acc_norm": 0.37566137566137564, + "acc_norm_stderr": 0.02494236893115978 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.42063492063492064, + "acc_stderr": 0.04415438226743744, + "acc_norm": 0.42063492063492064, + "acc_norm_stderr": 0.04415438226743744 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.33, + "acc_stderr": 0.04725815626252604, + "acc_norm": 0.33, + "acc_norm_stderr": 0.04725815626252604 + 
}, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.7612903225806451, + "acc_stderr": 0.02425107126220884, + "acc_norm": 0.7612903225806451, + "acc_norm_stderr": 0.02425107126220884 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.49261083743842365, + "acc_stderr": 0.03517603540361009, + "acc_norm": 0.49261083743842365, + "acc_norm_stderr": 0.03517603540361009 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.66, + "acc_stderr": 0.04760952285695237, + "acc_norm": 0.66, + "acc_norm_stderr": 0.04760952285695237 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.7878787878787878, + "acc_stderr": 0.031922715695483, + "acc_norm": 0.7878787878787878, + "acc_norm_stderr": 0.031922715695483 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.7575757575757576, + "acc_stderr": 0.03053289223393202, + "acc_norm": 0.7575757575757576, + "acc_norm_stderr": 0.03053289223393202 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.8860103626943006, + "acc_stderr": 0.022935144053919436, + "acc_norm": 0.8860103626943006, + "acc_norm_stderr": 0.022935144053919436 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.6, + "acc_stderr": 0.024838811988033165, + "acc_norm": 0.6, + "acc_norm_stderr": 0.024838811988033165 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.31851851851851853, + "acc_stderr": 0.02840653309060846, + "acc_norm": 0.31851851851851853, + "acc_norm_stderr": 0.02840653309060846 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.6596638655462185, + "acc_stderr": 0.03077805742293167, + "acc_norm": 0.6596638655462185, + "acc_norm_stderr": 0.03077805742293167 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.3443708609271523, + "acc_stderr": 0.03879687024073327, + "acc_norm": 0.3443708609271523, + "acc_norm_stderr": 0.03879687024073327 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.8275229357798165, + "acc_stderr": 0.016197807956848036, + "acc_norm": 0.8275229357798165, + "acc_norm_stderr": 0.016197807956848036 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.47685185185185186, + "acc_stderr": 0.03406315360711507, + "acc_norm": 0.47685185185185186, + "acc_norm_stderr": 0.03406315360711507 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.7843137254901961, + "acc_stderr": 0.028867431449849313, + "acc_norm": 0.7843137254901961, + "acc_norm_stderr": 0.028867431449849313 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.7848101265822784, + "acc_stderr": 0.026750826994676177, + "acc_norm": 0.7848101265822784, + "acc_norm_stderr": 0.026750826994676177 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.6816143497757847, + "acc_stderr": 0.03126580522513713, + "acc_norm": 0.6816143497757847, + "acc_norm_stderr": 0.03126580522513713 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.7251908396946565, + "acc_stderr": 0.03915345408847835, + "acc_norm": 0.7251908396946565, + "acc_norm_stderr": 0.03915345408847835 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.7851239669421488, + "acc_stderr": 0.037494924487096966, + "acc_norm": 0.7851239669421488, + "acc_norm_stderr": 0.037494924487096966 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.7314814814814815, + "acc_stderr": 0.042844679680521934, + "acc_norm": 0.7314814814814815, + "acc_norm_stderr": 0.042844679680521934 + 
}, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.7239263803680982, + "acc_stderr": 0.035123852837050475, + "acc_norm": 0.7239263803680982, + "acc_norm_stderr": 0.035123852837050475 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.48214285714285715, + "acc_stderr": 0.047427623612430116, + "acc_norm": 0.48214285714285715, + "acc_norm_stderr": 0.047427623612430116 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.8252427184466019, + "acc_stderr": 0.037601780060266196, + "acc_norm": 0.8252427184466019, + "acc_norm_stderr": 0.037601780060266196 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.8547008547008547, + "acc_stderr": 0.023086635086841407, + "acc_norm": 0.8547008547008547, + "acc_norm_stderr": 0.023086635086841407 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.72, + "acc_stderr": 0.04512608598542128, + "acc_norm": 0.72, + "acc_norm_stderr": 0.04512608598542128 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.80970625798212, + "acc_stderr": 0.014036945850381401, + "acc_norm": 0.80970625798212, + "acc_norm_stderr": 0.014036945850381401 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.6705202312138728, + "acc_stderr": 0.02530525813187971, + "acc_norm": 0.6705202312138728, + "acc_norm_stderr": 0.02530525813187971 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.38212290502793295, + "acc_stderr": 0.016251139711570772, + "acc_norm": 0.38212290502793295, + "acc_norm_stderr": 0.016251139711570772 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.6830065359477124, + "acc_stderr": 0.026643278474508758, + "acc_norm": 0.6830065359477124, + "acc_norm_stderr": 0.026643278474508758 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.6655948553054662, + "acc_stderr": 0.026795422327893934, + "acc_norm": 0.6655948553054662, + "acc_norm_stderr": 0.026795422327893934 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.7283950617283951, + "acc_stderr": 0.024748624490537368, + "acc_norm": 0.7283950617283951, + "acc_norm_stderr": 0.024748624490537368 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.4397163120567376, + "acc_stderr": 0.029609912075594113, + "acc_norm": 0.4397163120567376, + "acc_norm_stderr": 0.029609912075594113 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.4471968709256845, + "acc_stderr": 0.012698825252435113, + "acc_norm": 0.4471968709256845, + "acc_norm_stderr": 0.012698825252435113 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.6507352941176471, + "acc_stderr": 0.028959755196824876, + "acc_norm": 0.6507352941176471, + "acc_norm_stderr": 0.028959755196824876 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.6372549019607843, + "acc_stderr": 0.01945076843250551, + "acc_norm": 0.6372549019607843, + "acc_norm_stderr": 0.01945076843250551 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.6818181818181818, + "acc_stderr": 0.044612721759105085, + "acc_norm": 0.6818181818181818, + "acc_norm_stderr": 0.044612721759105085 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.7020408163265306, + "acc_stderr": 0.029279567411065674, + "acc_norm": 0.7020408163265306, + "acc_norm_stderr": 0.029279567411065674 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.835820895522388, + "acc_stderr": 0.026193923544454142, + "acc_norm": 0.835820895522388, + "acc_norm_stderr": 0.026193923544454142 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.81, + "acc_stderr": 
0.03942772444036625, + "acc_norm": 0.81, + "acc_norm_stderr": 0.03942772444036625 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.5120481927710844, + "acc_stderr": 0.03891364495835817, + "acc_norm": 0.5120481927710844, + "acc_norm_stderr": 0.03891364495835817 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.8245614035087719, + "acc_stderr": 0.02917088550072767, + "acc_norm": 0.8245614035087719, + "acc_norm_stderr": 0.02917088550072767 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.44063647490820074, + "mc1_stderr": 0.01737969755543745, + "mc2": 0.5948150094881657, + "mc2_stderr": 0.015368298346344834 + }, + "harness|winogrande|5": { + "acc": 0.7861089187056038, + "acc_stderr": 0.011524466954090254 + }, + "harness|drop|3": { + "em": 0.3104026845637584, + "em_stderr": 0.004738055752138192, + "f1": 0.4353963926174528, + "f1_stderr": 0.00449914777310532 + }, + "harness|gsm8k|5": { + "acc": 0.20090978013646701, + "acc_stderr": 0.011036738221872362 + }, + "all": { + "acc": 0.6176463031493693, + "acc_stderr": 0.032662290108723034, + "acc_norm": 0.62542178984075, + "acc_norm_stderr": 0.033365783453664206, + "mc1": 0.44063647490820074, + "mc1_stderr": 0.01737969755543745, + "mc2": 0.5948150094881657, + "mc2_stderr": 0.015368298346344834, + "em": 0.3104026845637584, + "em_stderr": 0.004738055752138192, + "f1": 0.4353963926174528, + "f1_stderr": 0.00449914777310532 + } + }, + "versions": { + "all": 0, + "harness|arc:challenge|25": 0, + "harness|drop|3": 1, + "harness|gsm8k|5": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, 
+ "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "harness|winogrande|5": 0 + }, + "config_tasks": { + "harness|arc:challenge": "LM Harness task", + "harness|drop": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + 
"harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|arc:challenge|25": { + "hashes": { + "hash_examples": "17b0cae357c0259e", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "9bcd0d1d37471713", + "hash_cont_tokens": "289aa98c400841d8" + }, + "truncated": 0, + "non_truncated": 1172, + "padded": 4670, + "non_padded": 17, + "effective_few_shots": 25.0, + "num_truncated_few_shots": 0 + }, + "harness|hellaswag|10": { + "hashes": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "80b8c6d79740318e", + "hash_cont_tokens": "ac460260c3e6efc9" + }, + "truncated": 0, + "non_truncated": 10042, + "padded": 40101, + "non_padded": 67, + "effective_few_shots": 10.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hashes": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "b813d36287c6556c", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-anatomy|5": { + "hashes": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "09dc2380497f7a47", + "hash_cont_tokens": "a52a4f60d98cbe5c" + }, + "truncated": 0, + "non_truncated": 135, + "padded": 540, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-astronomy|5": { + "hashes": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "68ca3220b0fdd1f3", + "hash_cont_tokens": "10f7d8eeba97841d" + }, + "truncated": 0, + "non_truncated": 152, + "padded": 608, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-business_ethics|5": { + "hashes": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "bd14ef1320de241e", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + 
}, + "harness|hendrycksTest-clinical_knowledge|5": { + "hashes": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "d96186ab98017c43", + "hash_cont_tokens": "edef9975ba9165b5" + }, + "truncated": 0, + "non_truncated": 265, + "padded": 1060, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_biology|5": { + "hashes": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "424136b34e95b200", + "hash_cont_tokens": "0aa103ec6602280b" + }, + "truncated": 0, + "non_truncated": 144, + "padded": 576, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_chemistry|5": { + "hashes": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "8dd8b80e336bbe54", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_computer_science|5": { + "hashes": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "145d4cef8ca2261d", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_mathematics|5": { + "hashes": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "561995d32d2b25c4", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_medicine|5": { + "hashes": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "6a258a9d4418599c", + "hash_cont_tokens": "1979021dbc698754" + }, + "truncated": 0, + "non_truncated": 173, + "padded": 692, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_physics|5": { + "hashes": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "fa5e0d5b5f97b66a", + "hash_cont_tokens": "7cf7fe2bab00acbd" + }, + "truncated": 0, + "non_truncated": 102, + "padded": 408, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-computer_security|5": { + "hashes": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "07d27397edfae492", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hashes": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "da5e6c3c8eb17da6", + "hash_cont_tokens": "903f64eed2b0d217" + }, + "truncated": 0, + "non_truncated": 235, + "padded": 940, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-econometrics|5": { + "hashes": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + 
"hash_input_tokens": "f6ba8e358bdb523e", + "hash_cont_tokens": "721ae6c5302c4bf2" + }, + "truncated": 0, + "non_truncated": 114, + "padded": 456, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hashes": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "b2459da4c5ca8590", + "hash_cont_tokens": "15a738960ed3e587" + }, + "truncated": 0, + "non_truncated": 145, + "padded": 575, + "non_padded": 5, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hashes": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "0b969d9ad706a13a", + "hash_cont_tokens": "c96470462fc71683" + }, + "truncated": 0, + "non_truncated": 378, + "padded": 1512, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-formal_logic|5": { + "hashes": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "02bc3eb5f90da86e", + "hash_cont_tokens": "0e1ce025c9d6ee7e" + }, + "truncated": 0, + "non_truncated": 126, + "padded": 504, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-global_facts|5": { + "hashes": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "3d5106918bcbeb43", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_biology|5": { + "hashes": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "7b089392db2dabbd", + "hash_cont_tokens": "e34d57f7d3c4ca16" + }, + "truncated": 0, + "non_truncated": 310, + "padded": 1240, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hashes": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "ba90b2ffed1c067d", + "hash_cont_tokens": "e8482d44df4b3740" + }, + "truncated": 0, + "non_truncated": 203, + "padded": 812, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hashes": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "60eeec309ef0717f", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hashes": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "5e5e8bf3808e0ead", + "hash_cont_tokens": "d63e679a49418339" + }, + "truncated": 0, + "non_truncated": 165, + "padded": 656, + "non_padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_geography|5": { + "hashes": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "4da9b741d4e7ea78", + "hash_cont_tokens": "d78483e286d06f1a" + }, + "truncated": 0, + 
"non_truncated": 198, + "padded": 792, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hashes": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "acb4bc872ac86ed7", + "hash_cont_tokens": "691cdff71ff5fe57" + }, + "truncated": 0, + "non_truncated": 193, + "padded": 772, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hashes": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "840fc6403eb69ab0", + "hash_cont_tokens": "d5ad4c5bdca967ad" + }, + "truncated": 0, + "non_truncated": 390, + "padded": 1560, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hashes": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "3629a7f2cd17faeb", + "hash_cont_tokens": "8f631ca5687dd0d4" + }, + "truncated": 0, + "non_truncated": 270, + "padded": 1080, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hashes": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "6846f684260e3997", + "hash_cont_tokens": "7321048a28451473" + }, + "truncated": 0, + "non_truncated": 238, + "padded": 952, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_physics|5": { + "hashes": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "85aee25d6bdad94a", + "hash_cont_tokens": "bb137581f269861c" + }, + "truncated": 0, + "non_truncated": 151, + "padded": 604, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hashes": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "290b66d6d666a35f", + "hash_cont_tokens": "b455cab2675bd863" + }, + "truncated": 0, + "non_truncated": 545, + "padded": 2180, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hashes": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "a77a7668b437bc82", + "hash_cont_tokens": "1b3196fec7e58037" + }, + "truncated": 0, + "non_truncated": 216, + "padded": 864, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hashes": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "63548c7fa9ba7a78", + "hash_cont_tokens": "a331dedc2aa01b3e" + }, + "truncated": 0, + "non_truncated": 204, + "padded": 816, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hashes": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "83c5da18bfa50812", + "hash_cont_tokens": "d0fbe030b8c8c2bf" + }, + "truncated": 0, + "non_truncated": 237, + "padded": 948, + "non_padded": 0, + "effective_few_shots": 
5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_aging|5": { + "hashes": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "bebbd11f22006685", + "hash_cont_tokens": "1dd29c3755494850" + }, + "truncated": 0, + "non_truncated": 223, + "padded": 892, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_sexuality|5": { + "hashes": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "7b85ee9b8ee54f4f", + "hash_cont_tokens": "c85573f663c10691" + }, + "truncated": 0, + "non_truncated": 131, + "padded": 524, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-international_law|5": { + "hashes": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "7bfc55ab7065943e", + "hash_cont_tokens": "d263804ba918154f" + }, + "truncated": 0, + "non_truncated": 121, + "padded": 484, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-jurisprudence|5": { + "hashes": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "69573f1675e053c6", + "hash_cont_tokens": "581986691a84ece8" + }, + "truncated": 0, + "non_truncated": 108, + "padded": 432, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hashes": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "552324ef20094bdc", + "hash_cont_tokens": "55a858b28bbda458" + }, + "truncated": 0, + "non_truncated": 163, + "padded": 652, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-machine_learning|5": { + "hashes": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "96449357a7318905", + "hash_cont_tokens": "e99d3d3efd4ac7a3" + }, + "truncated": 0, + "non_truncated": 112, + "padded": 448, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-management|5": { + "hashes": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "3b849249168e3b88", + "hash_cont_tokens": "13d9dc56bca34726" + }, + "truncated": 0, + "non_truncated": 103, + "padded": 412, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-marketing|5": { + "hashes": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "af0e186f2756b70d", + "hash_cont_tokens": "2700ea26933916a2" + }, + "truncated": 0, + "non_truncated": 234, + "padded": 936, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-medical_genetics|5": { + "hashes": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "9f6a6de16509b6d9", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-miscellaneous|5": { + "hashes": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", 
+ "hash_input_tokens": "9194406d589f7c10", + "hash_cont_tokens": "7bf4341c79587250" + }, + "truncated": 0, + "non_truncated": 783, + "padded": 3132, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_disputes|5": { + "hashes": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "769486efc74d9f8e", + "hash_cont_tokens": "38a48e9de6976f00" + }, + "truncated": 0, + "non_truncated": 346, + "padded": 1384, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hashes": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "a90fd4dd90959dad", + "hash_cont_tokens": "761c4dc187689d89" + }, + "truncated": 0, + "non_truncated": 895, + "padded": 3580, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-nutrition|5": { + "hashes": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "1a3b843e66efd29b", + "hash_cont_tokens": "65005bd7d6f6012a" + }, + "truncated": 0, + "non_truncated": 306, + "padded": 1224, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-philosophy|5": { + "hashes": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "09820001a3d00013", + "hash_cont_tokens": "0b47934fb6314dec" + }, + "truncated": 0, + "non_truncated": 311, + "padded": 1244, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-prehistory|5": { + "hashes": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "7c4ec364ce2768c7", + "hash_cont_tokens": "3f20acd855ee0a29" + }, + "truncated": 0, + "non_truncated": 324, + "padded": 1296, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_accounting|5": { + "hashes": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "ced0534574d0ae3f", + "hash_cont_tokens": "8f122ba881355d4b" + }, + "truncated": 0, + "non_truncated": 282, + "padded": 1128, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_law|5": { + "hashes": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "bcbdbbde22ec73e3", + "hash_cont_tokens": "90d5df417c4d3fd3" + }, + "truncated": 0, + "non_truncated": 1534, + "padded": 6136, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_medicine|5": { + "hashes": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "c54d753563114d45", + "hash_cont_tokens": "4a2d2988884f7f70" + }, + "truncated": 0, + "non_truncated": 272, + "padded": 1088, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_psychology|5": { + "hashes": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "b75dc55c0e32fa52", + "hash_cont_tokens": "e0a952cb8a9c81de" + }, + "truncated": 0, + "non_truncated": 612, + "padded": 2448, + 
"non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-public_relations|5": { + "hashes": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "5ccdc8ec8db99622", + "hash_cont_tokens": "1fa77a8dff3922b8" + }, + "truncated": 0, + "non_truncated": 110, + "padded": 440, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-security_studies|5": { + "hashes": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "ca8497342e5b1d57", + "hash_cont_tokens": "81fc9cb3cbdd52db" + }, + "truncated": 0, + "non_truncated": 245, + "padded": 980, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-sociology|5": { + "hashes": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "069c76424fbd3dab", + "hash_cont_tokens": "2a0493252ed2cf43" + }, + "truncated": 0, + "non_truncated": 201, + "padded": 804, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hashes": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "a7e393a626169576", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-virology|5": { + "hashes": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "bf99dc973e3a650d", + "hash_cont_tokens": "5ab892d003b00c98" + }, + "truncated": 0, + "non_truncated": 166, + "padded": 664, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-world_religions|5": { + "hashes": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "1761cfaf21797065", + "hash_cont_tokens": "15a5e5dbdfbb8568" + }, + "truncated": 0, + "non_truncated": 171, + "padded": 684, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|truthfulqa:mc|0": { + "hashes": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "298b43914bbdf4ca", + "hash_cont_tokens": "5a8d4bb398b1c3c0" + }, + "truncated": 0, + "non_truncated": 817, + "padded": 9996, + "non_padded": 0, + "effective_few_shots": 0.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "31aa3477d959f771", + "hash_cont_tokens": "618558fb93c0f288" + }, + "truncated": 0, + "non_truncated": 1267, + "padded": 2534, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|drop|3": { + "hashes": { + "hash_examples": "1d27416e8324e9a3", + "hash_full_prompts": "a5513ff9a741b385", + "hash_input_tokens": "a4fb946366902edf", + "hash_cont_tokens": "15989f43c993515c" + }, + "truncated": 0, + "non_truncated": 9536, + "padded": 0, + "non_padded": 9536, + "effective_few_shots": 3.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": 
"6af0ae8cfe684f50", + "hash_cont_tokens": "ef5b6f5a742d35f8" + }, + "truncated": 0, + "non_truncated": 1319, + "padded": 0, + "non_padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "4eb459f19fc0f29d", + "hash_full_prompts": "21653ed56f202b4e", + "hash_input_tokens": "0ce409b3d436569d", + "hash_cont_tokens": "b08561998071c028" + }, + "truncated": 0, + "non_truncated": 38195, + "padded": 113460, + "non_padded": 10948, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/Intel/neural-chat-7b-v3-2/results_2023-12-04T15-53-32.280845.json b/eval-results/Intel/neural-chat-7b-v3-2/results_2023-12-04T15-53-32.280845.json new file mode 100644 index 0000000000000000000000000000000000000000..11e2bc900fc0d3cb6da68a1446e67a7be86e3d4e --- /dev/null +++ b/eval-results/Intel/neural-chat-7b-v3-2/results_2023-12-04T15-53-32.280845.json @@ -0,0 +1,1409 @@ +{ + "config_general": { + "lighteval_sha": "0e4607eff593f6f842aeaa0e5fa6760f58b9d1e9", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "", + "start_time": 153091.393375818, + "end_time": 160083.333280671, + "total_evaluation_time_secondes": "6991.939904853003", + "model_name": "Intel/neural-chat-7b-v3-2", + "model_sha": "2ecaf100bcf63da6cf87dd7bfbea5732fa74c413", + "model_dtype": "torch.float16", + "model_size": "13.99 GB" + }, + "results": { + "harness|arc:challenge|25": { + "acc": 0.636518771331058, + "acc_stderr": 0.014056207319068283, + "acc_norm": 0.6749146757679181, + "acc_norm_stderr": 0.013688147309729124 + }, + "harness|hellaswag|10": { + "acc": 0.6414060944035053, + "acc_stderr": 0.004786075107572188, + "acc_norm": 0.8391754630551683, + "acc_norm_stderr": 0.0036661823284423437 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.27, + "acc_stderr": 0.0446196043338474, + "acc_norm": 0.27, + "acc_norm_stderr": 0.0446196043338474 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.6222222222222222, + "acc_stderr": 0.04188307537595852, + "acc_norm": 0.6222222222222222, + "acc_norm_stderr": 0.04188307537595852 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.6973684210526315, + "acc_stderr": 0.03738520676119668, + "acc_norm": 0.6973684210526315, + "acc_norm_stderr": 0.03738520676119668 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.62, + "acc_stderr": 0.048783173121456316, + "acc_norm": 0.62, + "acc_norm_stderr": 0.048783173121456316 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.6754716981132075, + "acc_stderr": 0.02881561571343211, + "acc_norm": 0.6754716981132075, + "acc_norm_stderr": 0.02881561571343211 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.7083333333333334, + "acc_stderr": 0.038009680605548594, + "acc_norm": 0.7083333333333334, + "acc_norm_stderr": 0.038009680605548594 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.48, + "acc_stderr": 0.050211673156867795, + "acc_norm": 0.48, + "acc_norm_stderr": 0.050211673156867795 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.53, + "acc_stderr": 0.05016135580465919, + "acc_norm": 0.53, + "acc_norm_stderr": 0.05016135580465919 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.37, + "acc_stderr": 0.048523658709391, + "acc_norm": 0.37, + "acc_norm_stderr": 0.048523658709391 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.6473988439306358, + "acc_stderr": 
0.036430371689585475, + "acc_norm": 0.6473988439306358, + "acc_norm_stderr": 0.036430371689585475 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.43137254901960786, + "acc_stderr": 0.04928099597287534, + "acc_norm": 0.43137254901960786, + "acc_norm_stderr": 0.04928099597287534 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.73, + "acc_stderr": 0.04461960433384739, + "acc_norm": 0.73, + "acc_norm_stderr": 0.04461960433384739 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.5787234042553191, + "acc_stderr": 0.03227834510146268, + "acc_norm": 0.5787234042553191, + "acc_norm_stderr": 0.03227834510146268 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.49122807017543857, + "acc_stderr": 0.04702880432049615, + "acc_norm": 0.49122807017543857, + "acc_norm_stderr": 0.04702880432049615 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.5379310344827586, + "acc_stderr": 0.04154659671707548, + "acc_norm": 0.5379310344827586, + "acc_norm_stderr": 0.04154659671707548 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.38095238095238093, + "acc_stderr": 0.025010749116137602, + "acc_norm": 0.38095238095238093, + "acc_norm_stderr": 0.025010749116137602 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.4365079365079365, + "acc_stderr": 0.04435932892851466, + "acc_norm": 0.4365079365079365, + "acc_norm_stderr": 0.04435932892851466 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.37, + "acc_stderr": 0.04852365870939099, + "acc_norm": 0.37, + "acc_norm_stderr": 0.04852365870939099 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.7774193548387097, + "acc_stderr": 0.02366421667164251, + "acc_norm": 0.7774193548387097, + "acc_norm_stderr": 0.02366421667164251 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.49261083743842365, + "acc_stderr": 0.035176035403610084, + "acc_norm": 0.49261083743842365, + "acc_norm_stderr": 0.035176035403610084 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.7, + "acc_stderr": 0.046056618647183814, + "acc_norm": 0.7, + "acc_norm_stderr": 0.046056618647183814 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.7696969696969697, + "acc_stderr": 0.0328766675860349, + "acc_norm": 0.7696969696969697, + "acc_norm_stderr": 0.0328766675860349 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.7575757575757576, + "acc_stderr": 0.03053289223393202, + "acc_norm": 0.7575757575757576, + "acc_norm_stderr": 0.03053289223393202 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.8497409326424871, + "acc_stderr": 0.025787723180723875, + "acc_norm": 0.8497409326424871, + "acc_norm_stderr": 0.025787723180723875 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.6615384615384615, + "acc_stderr": 0.023991500500313036, + "acc_norm": 0.6615384615384615, + "acc_norm_stderr": 0.023991500500313036 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.34074074074074073, + "acc_stderr": 0.028897748741131143, + "acc_norm": 0.34074074074074073, + "acc_norm_stderr": 0.028897748741131143 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.6932773109243697, + "acc_stderr": 0.029953823891887037, + "acc_norm": 0.6932773109243697, + "acc_norm_stderr": 0.029953823891887037 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.3841059602649007, + "acc_stderr": 0.03971301814719197, + "acc_norm": 
0.3841059602649007, + "acc_norm_stderr": 0.03971301814719197 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.8366972477064221, + "acc_stderr": 0.015848255806501534, + "acc_norm": 0.8366972477064221, + "acc_norm_stderr": 0.015848255806501534 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.5416666666666666, + "acc_stderr": 0.03398110890294636, + "acc_norm": 0.5416666666666666, + "acc_norm_stderr": 0.03398110890294636 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.803921568627451, + "acc_stderr": 0.027865942286639318, + "acc_norm": 0.803921568627451, + "acc_norm_stderr": 0.027865942286639318 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.7974683544303798, + "acc_stderr": 0.026160568246601457, + "acc_norm": 0.7974683544303798, + "acc_norm_stderr": 0.026160568246601457 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.6995515695067265, + "acc_stderr": 0.030769352008229146, + "acc_norm": 0.6995515695067265, + "acc_norm_stderr": 0.030769352008229146 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.7099236641221374, + "acc_stderr": 0.03980066246467766, + "acc_norm": 0.7099236641221374, + "acc_norm_stderr": 0.03980066246467766 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.7933884297520661, + "acc_stderr": 0.03695980128098824, + "acc_norm": 0.7933884297520661, + "acc_norm_stderr": 0.03695980128098824 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.75, + "acc_stderr": 0.04186091791394607, + "acc_norm": 0.75, + "acc_norm_stderr": 0.04186091791394607 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.7177914110429447, + "acc_stderr": 0.03536117886664742, + "acc_norm": 0.7177914110429447, + "acc_norm_stderr": 0.03536117886664742 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.5267857142857143, + "acc_stderr": 0.047389751192741546, + "acc_norm": 0.5267857142857143, + "acc_norm_stderr": 0.047389751192741546 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.8058252427184466, + "acc_stderr": 0.03916667762822584, + "acc_norm": 0.8058252427184466, + "acc_norm_stderr": 0.03916667762822584 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.8675213675213675, + "acc_stderr": 0.022209309073165623, + "acc_norm": 0.8675213675213675, + "acc_norm_stderr": 0.022209309073165623 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.7, + "acc_stderr": 0.046056618647183814, + "acc_norm": 0.7, + "acc_norm_stderr": 0.046056618647183814 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.8109833971902938, + "acc_stderr": 0.014000791294406999, + "acc_norm": 0.8109833971902938, + "acc_norm_stderr": 0.014000791294406999 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.6936416184971098, + "acc_stderr": 0.024818350129436596, + "acc_norm": 0.6936416184971098, + "acc_norm_stderr": 0.024818350129436596 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.37988826815642457, + "acc_stderr": 0.01623282681867849, + "acc_norm": 0.37988826815642457, + "acc_norm_stderr": 0.01623282681867849 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.7026143790849673, + "acc_stderr": 0.02617390850671858, + "acc_norm": 0.7026143790849673, + "acc_norm_stderr": 0.02617390850671858 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.7009646302250804, + "acc_stderr": 0.026003301117885142, + "acc_norm": 0.7009646302250804, + "acc_norm_stderr": 0.026003301117885142 + }, + "harness|hendrycksTest-prehistory|5": { + 
"acc": 0.7191358024691358, + "acc_stderr": 0.025006469755799208, + "acc_norm": 0.7191358024691358, + "acc_norm_stderr": 0.025006469755799208 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.48226950354609927, + "acc_stderr": 0.02980873964223777, + "acc_norm": 0.48226950354609927, + "acc_norm_stderr": 0.02980873964223777 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.43415906127770537, + "acc_stderr": 0.012659033237067248, + "acc_norm": 0.43415906127770537, + "acc_norm_stderr": 0.012659033237067248 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.6838235294117647, + "acc_stderr": 0.02824568739146292, + "acc_norm": 0.6838235294117647, + "acc_norm_stderr": 0.02824568739146292 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.6519607843137255, + "acc_stderr": 0.019270998708223974, + "acc_norm": 0.6519607843137255, + "acc_norm_stderr": 0.019270998708223974 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.6636363636363637, + "acc_stderr": 0.04525393596302506, + "acc_norm": 0.6636363636363637, + "acc_norm_stderr": 0.04525393596302506 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.7346938775510204, + "acc_stderr": 0.028263889943784603, + "acc_norm": 0.7346938775510204, + "acc_norm_stderr": 0.028263889943784603 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.835820895522388, + "acc_stderr": 0.026193923544454125, + "acc_norm": 0.835820895522388, + "acc_norm_stderr": 0.026193923544454125 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.82, + "acc_stderr": 0.038612291966536934, + "acc_norm": 0.82, + "acc_norm_stderr": 0.038612291966536934 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.5481927710843374, + "acc_stderr": 0.03874371556587953, + "acc_norm": 0.5481927710843374, + "acc_norm_stderr": 0.03874371556587953 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.8070175438596491, + "acc_stderr": 0.030267457554898458, + "acc_norm": 0.8070175438596491, + "acc_norm_stderr": 0.030267457554898458 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.43818849449204406, + "mc1_stderr": 0.017369236164404445, + "mc2": 0.596824313919398, + "mc2_stderr": 0.015111088211554574 + }, + "harness|winogrande|5": { + "acc": 0.7995264404104183, + "acc_stderr": 0.011251958281205083 + }, + "harness|gsm8k|5": { + "acc": 0.5511751326762699, + "acc_stderr": 0.013700157442788066 + }, + "all": { + "acc": 0.6369210872251465, + "acc_stderr": 0.0326377312636888, + "acc_norm": 0.6396211867921318, + "acc_norm_stderr": 0.03329595828089076, + "mc1": 0.43818849449204406, + "mc1_stderr": 0.017369236164404445, + "mc2": 0.596824313919398, + "mc2_stderr": 0.015111088211554574 + } + }, + "versions": { + "all": 0, + "harness|arc:challenge|25": 0, + "harness|gsm8k|5": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + 
"harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "harness|winogrande|5": 0 + }, + "config_tasks": { + "harness|arc:challenge": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + 
"harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|arc:challenge|25": { + "hashes": { + "hash_examples": "17b0cae357c0259e", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "9bcd0d1d37471713", + "hash_cont_tokens": "289aa98c400841d8" + }, + "truncated": 0, + "non_truncated": 1172, + "padded": 4670, + "non_padded": 17, + "effective_few_shots": 25.0, + "num_truncated_few_shots": 0 + }, + "harness|hellaswag|10": { + "hashes": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "80b8c6d79740318e", + "hash_cont_tokens": "ac460260c3e6efc9" + }, + "truncated": 0, + "non_truncated": 10042, + "padded": 40101, + "non_padded": 67, + "effective_few_shots": 10.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hashes": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": 
"b813d36287c6556c", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-anatomy|5": { + "hashes": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "09dc2380497f7a47", + "hash_cont_tokens": "a52a4f60d98cbe5c" + }, + "truncated": 0, + "non_truncated": 135, + "padded": 540, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-astronomy|5": { + "hashes": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "68ca3220b0fdd1f3", + "hash_cont_tokens": "10f7d8eeba97841d" + }, + "truncated": 0, + "non_truncated": 152, + "padded": 608, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-business_ethics|5": { + "hashes": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "bd14ef1320de241e", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hashes": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "d96186ab98017c43", + "hash_cont_tokens": "edef9975ba9165b5" + }, + "truncated": 0, + "non_truncated": 265, + "padded": 1060, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_biology|5": { + "hashes": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "424136b34e95b200", + "hash_cont_tokens": "0aa103ec6602280b" + }, + "truncated": 0, + "non_truncated": 144, + "padded": 576, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_chemistry|5": { + "hashes": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "8dd8b80e336bbe54", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_computer_science|5": { + "hashes": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "145d4cef8ca2261d", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_mathematics|5": { + "hashes": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "561995d32d2b25c4", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_medicine|5": { + "hashes": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "6a258a9d4418599c", + "hash_cont_tokens": "1979021dbc698754" + }, + "truncated": 0, + "non_truncated": 173, + "padded": 692, + "non_padded": 0, + 
"effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_physics|5": { + "hashes": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "fa5e0d5b5f97b66a", + "hash_cont_tokens": "7cf7fe2bab00acbd" + }, + "truncated": 0, + "non_truncated": 102, + "padded": 408, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-computer_security|5": { + "hashes": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "07d27397edfae492", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hashes": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "da5e6c3c8eb17da6", + "hash_cont_tokens": "903f64eed2b0d217" + }, + "truncated": 0, + "non_truncated": 235, + "padded": 940, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-econometrics|5": { + "hashes": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "f6ba8e358bdb523e", + "hash_cont_tokens": "721ae6c5302c4bf2" + }, + "truncated": 0, + "non_truncated": 114, + "padded": 456, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hashes": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "b2459da4c5ca8590", + "hash_cont_tokens": "15a738960ed3e587" + }, + "truncated": 0, + "non_truncated": 145, + "padded": 575, + "non_padded": 5, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hashes": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "0b969d9ad706a13a", + "hash_cont_tokens": "c96470462fc71683" + }, + "truncated": 0, + "non_truncated": 378, + "padded": 1512, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-formal_logic|5": { + "hashes": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "02bc3eb5f90da86e", + "hash_cont_tokens": "0e1ce025c9d6ee7e" + }, + "truncated": 0, + "non_truncated": 126, + "padded": 504, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-global_facts|5": { + "hashes": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "3d5106918bcbeb43", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_biology|5": { + "hashes": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "7b089392db2dabbd", + "hash_cont_tokens": "e34d57f7d3c4ca16" + }, + "truncated": 0, + "non_truncated": 310, + "padded": 1240, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hashes": { + "hash_examples": 
"44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "ba90b2ffed1c067d", + "hash_cont_tokens": "e8482d44df4b3740" + }, + "truncated": 0, + "non_truncated": 203, + "padded": 812, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hashes": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "60eeec309ef0717f", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hashes": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "5e5e8bf3808e0ead", + "hash_cont_tokens": "d63e679a49418339" + }, + "truncated": 0, + "non_truncated": 165, + "padded": 656, + "non_padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_geography|5": { + "hashes": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "4da9b741d4e7ea78", + "hash_cont_tokens": "d78483e286d06f1a" + }, + "truncated": 0, + "non_truncated": 198, + "padded": 792, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hashes": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "acb4bc872ac86ed7", + "hash_cont_tokens": "691cdff71ff5fe57" + }, + "truncated": 0, + "non_truncated": 193, + "padded": 772, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hashes": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "840fc6403eb69ab0", + "hash_cont_tokens": "d5ad4c5bdca967ad" + }, + "truncated": 0, + "non_truncated": 390, + "padded": 1560, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hashes": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "3629a7f2cd17faeb", + "hash_cont_tokens": "8f631ca5687dd0d4" + }, + "truncated": 0, + "non_truncated": 270, + "padded": 1080, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hashes": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "6846f684260e3997", + "hash_cont_tokens": "7321048a28451473" + }, + "truncated": 0, + "non_truncated": 238, + "padded": 952, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_physics|5": { + "hashes": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "85aee25d6bdad94a", + "hash_cont_tokens": "bb137581f269861c" + }, + "truncated": 0, + "non_truncated": 151, + "padded": 604, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hashes": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + 
"hash_input_tokens": "290b66d6d666a35f", + "hash_cont_tokens": "b455cab2675bd863" + }, + "truncated": 0, + "non_truncated": 545, + "padded": 2180, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hashes": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "a77a7668b437bc82", + "hash_cont_tokens": "1b3196fec7e58037" + }, + "truncated": 0, + "non_truncated": 216, + "padded": 864, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hashes": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "63548c7fa9ba7a78", + "hash_cont_tokens": "a331dedc2aa01b3e" + }, + "truncated": 0, + "non_truncated": 204, + "padded": 816, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hashes": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "83c5da18bfa50812", + "hash_cont_tokens": "d0fbe030b8c8c2bf" + }, + "truncated": 0, + "non_truncated": 237, + "padded": 948, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_aging|5": { + "hashes": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "bebbd11f22006685", + "hash_cont_tokens": "1dd29c3755494850" + }, + "truncated": 0, + "non_truncated": 223, + "padded": 892, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_sexuality|5": { + "hashes": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "7b85ee9b8ee54f4f", + "hash_cont_tokens": "c85573f663c10691" + }, + "truncated": 0, + "non_truncated": 131, + "padded": 524, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-international_law|5": { + "hashes": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "7bfc55ab7065943e", + "hash_cont_tokens": "d263804ba918154f" + }, + "truncated": 0, + "non_truncated": 121, + "padded": 484, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-jurisprudence|5": { + "hashes": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "69573f1675e053c6", + "hash_cont_tokens": "581986691a84ece8" + }, + "truncated": 0, + "non_truncated": 108, + "padded": 432, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hashes": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "552324ef20094bdc", + "hash_cont_tokens": "55a858b28bbda458" + }, + "truncated": 0, + "non_truncated": 163, + "padded": 652, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-machine_learning|5": { + "hashes": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "96449357a7318905", + "hash_cont_tokens": "e99d3d3efd4ac7a3" + }, + "truncated": 0, + "non_truncated": 112, + "padded": 448, 
+ "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-management|5": { + "hashes": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "3b849249168e3b88", + "hash_cont_tokens": "13d9dc56bca34726" + }, + "truncated": 0, + "non_truncated": 103, + "padded": 412, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-marketing|5": { + "hashes": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "af0e186f2756b70d", + "hash_cont_tokens": "2700ea26933916a2" + }, + "truncated": 0, + "non_truncated": 234, + "padded": 936, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-medical_genetics|5": { + "hashes": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "9f6a6de16509b6d9", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-miscellaneous|5": { + "hashes": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "9194406d589f7c10", + "hash_cont_tokens": "7bf4341c79587250" + }, + "truncated": 0, + "non_truncated": 783, + "padded": 3132, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_disputes|5": { + "hashes": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "769486efc74d9f8e", + "hash_cont_tokens": "38a48e9de6976f00" + }, + "truncated": 0, + "non_truncated": 346, + "padded": 1384, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hashes": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "a90fd4dd90959dad", + "hash_cont_tokens": "761c4dc187689d89" + }, + "truncated": 0, + "non_truncated": 895, + "padded": 3580, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-nutrition|5": { + "hashes": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "1a3b843e66efd29b", + "hash_cont_tokens": "65005bd7d6f6012a" + }, + "truncated": 0, + "non_truncated": 306, + "padded": 1224, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-philosophy|5": { + "hashes": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "09820001a3d00013", + "hash_cont_tokens": "0b47934fb6314dec" + }, + "truncated": 0, + "non_truncated": 311, + "padded": 1244, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-prehistory|5": { + "hashes": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "7c4ec364ce2768c7", + "hash_cont_tokens": "3f20acd855ee0a29" + }, + "truncated": 0, + "non_truncated": 324, + "padded": 1296, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_accounting|5": { + "hashes": { + "hash_examples": "8d377597916cd07e", 
+ "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "ced0534574d0ae3f", + "hash_cont_tokens": "8f122ba881355d4b" + }, + "truncated": 0, + "non_truncated": 282, + "padded": 1128, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_law|5": { + "hashes": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "bcbdbbde22ec73e3", + "hash_cont_tokens": "90d5df417c4d3fd3" + }, + "truncated": 0, + "non_truncated": 1534, + "padded": 6136, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_medicine|5": { + "hashes": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "c54d753563114d45", + "hash_cont_tokens": "4a2d2988884f7f70" + }, + "truncated": 0, + "non_truncated": 272, + "padded": 1088, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_psychology|5": { + "hashes": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "b75dc55c0e32fa52", + "hash_cont_tokens": "e0a952cb8a9c81de" + }, + "truncated": 0, + "non_truncated": 612, + "padded": 2448, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-public_relations|5": { + "hashes": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "5ccdc8ec8db99622", + "hash_cont_tokens": "1fa77a8dff3922b8" + }, + "truncated": 0, + "non_truncated": 110, + "padded": 440, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-security_studies|5": { + "hashes": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "ca8497342e5b1d57", + "hash_cont_tokens": "81fc9cb3cbdd52db" + }, + "truncated": 0, + "non_truncated": 245, + "padded": 980, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-sociology|5": { + "hashes": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "069c76424fbd3dab", + "hash_cont_tokens": "2a0493252ed2cf43" + }, + "truncated": 0, + "non_truncated": 201, + "padded": 804, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hashes": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "a7e393a626169576", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-virology|5": { + "hashes": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "bf99dc973e3a650d", + "hash_cont_tokens": "5ab892d003b00c98" + }, + "truncated": 0, + "non_truncated": 166, + "padded": 664, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-world_religions|5": { + "hashes": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "1761cfaf21797065", + "hash_cont_tokens": "15a5e5dbdfbb8568" + }, + "truncated": 0, + 
"non_truncated": 171, + "padded": 684, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|truthfulqa:mc|0": { + "hashes": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "298b43914bbdf4ca", + "hash_cont_tokens": "5a8d4bb398b1c3c0" + }, + "truncated": 0, + "non_truncated": 817, + "padded": 9996, + "non_padded": 0, + "effective_few_shots": 0.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "31aa3477d959f771", + "hash_cont_tokens": "618558fb93c0f288" + }, + "truncated": 0, + "non_truncated": 1267, + "padded": 2534, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "6af0ae8cfe684f50", + "hash_cont_tokens": "b8746785d71b605a" + }, + "truncated": 0, + "non_truncated": 1319, + "padded": 0, + "non_padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "3b7fa57a057f9415", + "hash_full_prompts": "63615fc50fc9417c", + "hash_input_tokens": "9c04e828ae29cacc", + "hash_cont_tokens": "d663f02da8ff62d5" + }, + "truncated": 0, + "non_truncated": 28659, + "padded": 113460, + "non_padded": 1412, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/Intel/neural-chat-7b-v3-3-Slerp/results_2023-12-10T17-57-49.451204.json b/eval-results/Intel/neural-chat-7b-v3-3-Slerp/results_2023-12-10T17-57-49.451204.json new file mode 100644 index 0000000000000000000000000000000000000000..aab9e2bbab10b55cd16fe2fc010025b65a67a8ab --- /dev/null +++ b/eval-results/Intel/neural-chat-7b-v3-3-Slerp/results_2023-12-10T17-57-49.451204.json @@ -0,0 +1,1409 @@ +{ + "config_general": { + "lighteval_sha": "0e4607eff593f6f842aeaa0e5fa6760f58b9d1e9", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "", + "start_time": 458575.758327981, + "end_time": 465978.60006467, + "total_evaluation_time_secondes": "7402.841736689035", + "model_name": "Intel/neural-chat-7b-v3-3-Slerp", + "model_sha": "cbd4f663365e40d50ed9834016bf840971b35db5", + "model_dtype": "torch.float16", + "model_size": "13.99 GB" + }, + "results": { + "harness|arc:challenge|25": { + "acc": 0.6467576791808873, + "acc_stderr": 0.013967822714840055, + "acc_norm": 0.6663822525597269, + "acc_norm_stderr": 0.013778687054176536 + }, + "harness|hellaswag|10": { + "acc": 0.6664011153156741, + "acc_stderr": 0.0047053471376996185, + "acc_norm": 0.8543118900617407, + "acc_norm_stderr": 0.003520722505332094 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.27, + "acc_stderr": 0.044619604333847394, + "acc_norm": 0.27, + "acc_norm_stderr": 0.044619604333847394 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.6074074074074074, + "acc_stderr": 0.0421850621536888, + "acc_norm": 0.6074074074074074, + "acc_norm_stderr": 0.0421850621536888 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.6513157894736842, + "acc_stderr": 0.03878139888797612, + "acc_norm": 0.6513157894736842, + "acc_norm_stderr": 0.03878139888797612 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.56, + "acc_stderr": 0.04988876515698589, + "acc_norm": 0.56, + "acc_norm_stderr": 
0.04988876515698589 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.6867924528301886, + "acc_stderr": 0.028544793319055326, + "acc_norm": 0.6867924528301886, + "acc_norm_stderr": 0.028544793319055326 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.7083333333333334, + "acc_stderr": 0.038009680605548594, + "acc_norm": 0.7083333333333334, + "acc_norm_stderr": 0.038009680605548594 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.48, + "acc_stderr": 0.050211673156867795, + "acc_norm": 0.48, + "acc_norm_stderr": 0.050211673156867795 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.57, + "acc_stderr": 0.04975698519562428, + "acc_norm": 0.57, + "acc_norm_stderr": 0.04975698519562428 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.36, + "acc_stderr": 0.048241815132442176, + "acc_norm": 0.36, + "acc_norm_stderr": 0.048241815132442176 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.6473988439306358, + "acc_stderr": 0.036430371689585475, + "acc_norm": 0.6473988439306358, + "acc_norm_stderr": 0.036430371689585475 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.4411764705882353, + "acc_stderr": 0.049406356306056595, + "acc_norm": 0.4411764705882353, + "acc_norm_stderr": 0.049406356306056595 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.77, + "acc_stderr": 0.04229525846816506, + "acc_norm": 0.77, + "acc_norm_stderr": 0.04229525846816506 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.5531914893617021, + "acc_stderr": 0.0325005368436584, + "acc_norm": 0.5531914893617021, + "acc_norm_stderr": 0.0325005368436584 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.43859649122807015, + "acc_stderr": 0.04668000738510455, + "acc_norm": 0.43859649122807015, + "acc_norm_stderr": 0.04668000738510455 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.5517241379310345, + "acc_stderr": 0.04144311810878152, + "acc_norm": 0.5517241379310345, + "acc_norm_stderr": 0.04144311810878152 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.37566137566137564, + "acc_stderr": 0.024942368931159798, + "acc_norm": 0.37566137566137564, + "acc_norm_stderr": 0.024942368931159798 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.3968253968253968, + "acc_stderr": 0.043758884927270605, + "acc_norm": 0.3968253968253968, + "acc_norm_stderr": 0.043758884927270605 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.35, + "acc_stderr": 0.04793724854411019, + "acc_norm": 0.35, + "acc_norm_stderr": 0.04793724854411019 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.7419354838709677, + "acc_stderr": 0.02489246917246283, + "acc_norm": 0.7419354838709677, + "acc_norm_stderr": 0.02489246917246283 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.4630541871921182, + "acc_stderr": 0.035083705204426656, + "acc_norm": 0.4630541871921182, + "acc_norm_stderr": 0.035083705204426656 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.66, + "acc_stderr": 0.04760952285695237, + "acc_norm": 0.66, + "acc_norm_stderr": 0.04760952285695237 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.7515151515151515, + "acc_stderr": 0.033744026441394036, + "acc_norm": 0.7515151515151515, + "acc_norm_stderr": 0.033744026441394036 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.7676767676767676, + "acc_stderr": 0.030088629490217487, + "acc_norm": 
0.7676767676767676, + "acc_norm_stderr": 0.030088629490217487 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.8704663212435233, + "acc_stderr": 0.024233532297758733, + "acc_norm": 0.8704663212435233, + "acc_norm_stderr": 0.024233532297758733 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.617948717948718, + "acc_stderr": 0.024635549163908234, + "acc_norm": 0.617948717948718, + "acc_norm_stderr": 0.024635549163908234 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.337037037037037, + "acc_stderr": 0.028820884666253255, + "acc_norm": 0.337037037037037, + "acc_norm_stderr": 0.028820884666253255 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.6386554621848739, + "acc_stderr": 0.03120469122515002, + "acc_norm": 0.6386554621848739, + "acc_norm_stderr": 0.03120469122515002 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.304635761589404, + "acc_stderr": 0.03757949922943343, + "acc_norm": 0.304635761589404, + "acc_norm_stderr": 0.03757949922943343 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.8293577981651377, + "acc_stderr": 0.016129271025099867, + "acc_norm": 0.8293577981651377, + "acc_norm_stderr": 0.016129271025099867 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.5416666666666666, + "acc_stderr": 0.03398110890294636, + "acc_norm": 0.5416666666666666, + "acc_norm_stderr": 0.03398110890294636 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.7843137254901961, + "acc_stderr": 0.028867431449849316, + "acc_norm": 0.7843137254901961, + "acc_norm_stderr": 0.028867431449849316 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.7805907172995781, + "acc_stderr": 0.026939106581553945, + "acc_norm": 0.7805907172995781, + "acc_norm_stderr": 0.026939106581553945 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.6591928251121076, + "acc_stderr": 0.0318114974705536, + "acc_norm": 0.6591928251121076, + "acc_norm_stderr": 0.0318114974705536 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.7175572519083969, + "acc_stderr": 0.03948406125768361, + "acc_norm": 0.7175572519083969, + "acc_norm_stderr": 0.03948406125768361 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.7768595041322314, + "acc_stderr": 0.03800754475228733, + "acc_norm": 0.7768595041322314, + "acc_norm_stderr": 0.03800754475228733 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.7314814814814815, + "acc_stderr": 0.042844679680521934, + "acc_norm": 0.7314814814814815, + "acc_norm_stderr": 0.042844679680521934 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.7055214723926381, + "acc_stderr": 0.03581165790474082, + "acc_norm": 0.7055214723926381, + "acc_norm_stderr": 0.03581165790474082 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.44642857142857145, + "acc_stderr": 0.04718471485219588, + "acc_norm": 0.44642857142857145, + "acc_norm_stderr": 0.04718471485219588 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.7572815533980582, + "acc_stderr": 0.04245022486384495, + "acc_norm": 0.7572815533980582, + "acc_norm_stderr": 0.04245022486384495 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.8632478632478633, + "acc_stderr": 0.022509033937077802, + "acc_norm": 0.8632478632478633, + "acc_norm_stderr": 0.022509033937077802 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.71, + "acc_stderr": 0.045604802157206845, + "acc_norm": 0.71, + 
"acc_norm_stderr": 0.045604802157206845 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.8020434227330779, + "acc_stderr": 0.014248873549217575, + "acc_norm": 0.8020434227330779, + "acc_norm_stderr": 0.014248873549217575 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.7052023121387283, + "acc_stderr": 0.02454761779480383, + "acc_norm": 0.7052023121387283, + "acc_norm_stderr": 0.02454761779480383 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.4491620111731844, + "acc_stderr": 0.01663583834163192, + "acc_norm": 0.4491620111731844, + "acc_norm_stderr": 0.01663583834163192 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.6928104575163399, + "acc_stderr": 0.02641560191438898, + "acc_norm": 0.6928104575163399, + "acc_norm_stderr": 0.02641560191438898 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.6784565916398714, + "acc_stderr": 0.026527724079528872, + "acc_norm": 0.6784565916398714, + "acc_norm_stderr": 0.026527724079528872 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.7006172839506173, + "acc_stderr": 0.025483115601195455, + "acc_norm": 0.7006172839506173, + "acc_norm_stderr": 0.025483115601195455 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.45390070921985815, + "acc_stderr": 0.02970045324729146, + "acc_norm": 0.45390070921985815, + "acc_norm_stderr": 0.02970045324729146 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.4256844850065189, + "acc_stderr": 0.012628393551811947, + "acc_norm": 0.4256844850065189, + "acc_norm_stderr": 0.012628393551811947 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.6323529411764706, + "acc_stderr": 0.029289413409403192, + "acc_norm": 0.6323529411764706, + "acc_norm_stderr": 0.029289413409403192 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.6290849673202614, + "acc_stderr": 0.019542101564854125, + "acc_norm": 0.6290849673202614, + "acc_norm_stderr": 0.019542101564854125 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.6727272727272727, + "acc_stderr": 0.0449429086625209, + "acc_norm": 0.6727272727272727, + "acc_norm_stderr": 0.0449429086625209 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.7346938775510204, + "acc_stderr": 0.028263889943784596, + "acc_norm": 0.7346938775510204, + "acc_norm_stderr": 0.028263889943784596 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.8258706467661692, + "acc_stderr": 0.026814951200421606, + "acc_norm": 0.8258706467661692, + "acc_norm_stderr": 0.026814951200421606 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.83, + "acc_stderr": 0.0377525168068637, + "acc_norm": 0.83, + "acc_norm_stderr": 0.0377525168068637 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.5180722891566265, + "acc_stderr": 0.03889951252827216, + "acc_norm": 0.5180722891566265, + "acc_norm_stderr": 0.03889951252827216 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.8245614035087719, + "acc_stderr": 0.02917088550072767, + "acc_norm": 0.8245614035087719, + "acc_norm_stderr": 0.02917088550072767 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.47368421052631576, + "mc1_stderr": 0.017479241161975526, + "mc2": 0.6319769000319811, + "mc2_stderr": 0.0150681826970418 + }, + "harness|winogrande|5": { + "acc": 0.7971586424625099, + "acc_stderr": 0.011301439925936662 + }, + "harness|gsm8k|5": { + "acc": 0.6997725549658832, + "acc_stderr": 0.01262542315228303 + }, + "all": { + "acc": 0.6272160356239721, + "acc_stderr": 0.03276418695667091, + 
"acc_norm": 0.6266234292162511, + "acc_norm_stderr": 0.03344601323704533, + "mc1": 0.47368421052631576, + "mc1_stderr": 0.017479241161975526, + "mc2": 0.6319769000319811, + "mc2_stderr": 0.0150681826970418 + } + }, + "versions": { + "all": 0, + "harness|arc:challenge|25": 0, + "harness|gsm8k|5": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "harness|winogrande|5": 0 + }, + "config_tasks": { + "harness|arc:challenge": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + 
"harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + 
"harness|truthfulqa:mc": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|arc:challenge|25": { + "hashes": { + "hash_examples": "17b0cae357c0259e", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "9bcd0d1d37471713", + "hash_cont_tokens": "289aa98c400841d8" + }, + "truncated": 0, + "non_truncated": 1172, + "padded": 4670, + "non_padded": 17, + "effective_few_shots": 25.0, + "num_truncated_few_shots": 0 + }, + "harness|hellaswag|10": { + "hashes": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "80b8c6d79740318e", + "hash_cont_tokens": "ac460260c3e6efc9" + }, + "truncated": 0, + "non_truncated": 10042, + "padded": 40101, + "non_padded": 67, + "effective_few_shots": 10.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hashes": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "b813d36287c6556c", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-anatomy|5": { + "hashes": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "09dc2380497f7a47", + "hash_cont_tokens": "a52a4f60d98cbe5c" + }, + "truncated": 0, + "non_truncated": 135, + "padded": 540, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-astronomy|5": { + "hashes": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "68ca3220b0fdd1f3", + "hash_cont_tokens": "10f7d8eeba97841d" + }, + "truncated": 0, + "non_truncated": 152, + "padded": 608, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-business_ethics|5": { + "hashes": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "bd14ef1320de241e", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hashes": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "d96186ab98017c43", + "hash_cont_tokens": "edef9975ba9165b5" + }, + "truncated": 0, + "non_truncated": 265, + "padded": 1060, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_biology|5": { + "hashes": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "424136b34e95b200", + "hash_cont_tokens": "0aa103ec6602280b" + }, + "truncated": 0, + "non_truncated": 144, + "padded": 576, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_chemistry|5": { + "hashes": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "8dd8b80e336bbe54", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_computer_science|5": { + "hashes": { + "hash_examples": 
"30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "145d4cef8ca2261d", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_mathematics|5": { + "hashes": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "561995d32d2b25c4", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_medicine|5": { + "hashes": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "6a258a9d4418599c", + "hash_cont_tokens": "1979021dbc698754" + }, + "truncated": 0, + "non_truncated": 173, + "padded": 692, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_physics|5": { + "hashes": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "fa5e0d5b5f97b66a", + "hash_cont_tokens": "7cf7fe2bab00acbd" + }, + "truncated": 0, + "non_truncated": 102, + "padded": 408, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-computer_security|5": { + "hashes": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "07d27397edfae492", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hashes": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "da5e6c3c8eb17da6", + "hash_cont_tokens": "903f64eed2b0d217" + }, + "truncated": 0, + "non_truncated": 235, + "padded": 940, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-econometrics|5": { + "hashes": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "f6ba8e358bdb523e", + "hash_cont_tokens": "721ae6c5302c4bf2" + }, + "truncated": 0, + "non_truncated": 114, + "padded": 456, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hashes": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "b2459da4c5ca8590", + "hash_cont_tokens": "15a738960ed3e587" + }, + "truncated": 0, + "non_truncated": 145, + "padded": 575, + "non_padded": 5, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hashes": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "0b969d9ad706a13a", + "hash_cont_tokens": "c96470462fc71683" + }, + "truncated": 0, + "non_truncated": 378, + "padded": 1512, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-formal_logic|5": { + "hashes": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "02bc3eb5f90da86e", + "hash_cont_tokens": "0e1ce025c9d6ee7e" + }, 
+ "truncated": 0, + "non_truncated": 126, + "padded": 504, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-global_facts|5": { + "hashes": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "3d5106918bcbeb43", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_biology|5": { + "hashes": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "7b089392db2dabbd", + "hash_cont_tokens": "e34d57f7d3c4ca16" + }, + "truncated": 0, + "non_truncated": 310, + "padded": 1240, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hashes": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "ba90b2ffed1c067d", + "hash_cont_tokens": "e8482d44df4b3740" + }, + "truncated": 0, + "non_truncated": 203, + "padded": 812, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hashes": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "60eeec309ef0717f", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hashes": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "5e5e8bf3808e0ead", + "hash_cont_tokens": "d63e679a49418339" + }, + "truncated": 0, + "non_truncated": 165, + "padded": 656, + "non_padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_geography|5": { + "hashes": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "4da9b741d4e7ea78", + "hash_cont_tokens": "d78483e286d06f1a" + }, + "truncated": 0, + "non_truncated": 198, + "padded": 792, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hashes": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "acb4bc872ac86ed7", + "hash_cont_tokens": "691cdff71ff5fe57" + }, + "truncated": 0, + "non_truncated": 193, + "padded": 772, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hashes": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "840fc6403eb69ab0", + "hash_cont_tokens": "d5ad4c5bdca967ad" + }, + "truncated": 0, + "non_truncated": 390, + "padded": 1560, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hashes": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "3629a7f2cd17faeb", + "hash_cont_tokens": "8f631ca5687dd0d4" + }, + "truncated": 0, + "non_truncated": 270, + "padded": 1080, + "non_padded": 0, + 
"effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hashes": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "6846f684260e3997", + "hash_cont_tokens": "7321048a28451473" + }, + "truncated": 0, + "non_truncated": 238, + "padded": 952, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_physics|5": { + "hashes": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "85aee25d6bdad94a", + "hash_cont_tokens": "bb137581f269861c" + }, + "truncated": 0, + "non_truncated": 151, + "padded": 604, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hashes": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "290b66d6d666a35f", + "hash_cont_tokens": "b455cab2675bd863" + }, + "truncated": 0, + "non_truncated": 545, + "padded": 2180, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hashes": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "a77a7668b437bc82", + "hash_cont_tokens": "1b3196fec7e58037" + }, + "truncated": 0, + "non_truncated": 216, + "padded": 864, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hashes": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "63548c7fa9ba7a78", + "hash_cont_tokens": "a331dedc2aa01b3e" + }, + "truncated": 0, + "non_truncated": 204, + "padded": 816, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hashes": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "83c5da18bfa50812", + "hash_cont_tokens": "d0fbe030b8c8c2bf" + }, + "truncated": 0, + "non_truncated": 237, + "padded": 948, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_aging|5": { + "hashes": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "bebbd11f22006685", + "hash_cont_tokens": "1dd29c3755494850" + }, + "truncated": 0, + "non_truncated": 223, + "padded": 892, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_sexuality|5": { + "hashes": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "7b85ee9b8ee54f4f", + "hash_cont_tokens": "c85573f663c10691" + }, + "truncated": 0, + "non_truncated": 131, + "padded": 524, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-international_law|5": { + "hashes": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "7bfc55ab7065943e", + "hash_cont_tokens": "d263804ba918154f" + }, + "truncated": 0, + "non_truncated": 121, + "padded": 484, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-jurisprudence|5": { + "hashes": { + 
"hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "69573f1675e053c6", + "hash_cont_tokens": "581986691a84ece8" + }, + "truncated": 0, + "non_truncated": 108, + "padded": 432, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hashes": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "552324ef20094bdc", + "hash_cont_tokens": "55a858b28bbda458" + }, + "truncated": 0, + "non_truncated": 163, + "padded": 652, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-machine_learning|5": { + "hashes": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "96449357a7318905", + "hash_cont_tokens": "e99d3d3efd4ac7a3" + }, + "truncated": 0, + "non_truncated": 112, + "padded": 448, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-management|5": { + "hashes": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "3b849249168e3b88", + "hash_cont_tokens": "13d9dc56bca34726" + }, + "truncated": 0, + "non_truncated": 103, + "padded": 412, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-marketing|5": { + "hashes": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "af0e186f2756b70d", + "hash_cont_tokens": "2700ea26933916a2" + }, + "truncated": 0, + "non_truncated": 234, + "padded": 936, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-medical_genetics|5": { + "hashes": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "9f6a6de16509b6d9", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-miscellaneous|5": { + "hashes": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "9194406d589f7c10", + "hash_cont_tokens": "7bf4341c79587250" + }, + "truncated": 0, + "non_truncated": 783, + "padded": 3132, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_disputes|5": { + "hashes": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "769486efc74d9f8e", + "hash_cont_tokens": "38a48e9de6976f00" + }, + "truncated": 0, + "non_truncated": 346, + "padded": 1384, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hashes": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "a90fd4dd90959dad", + "hash_cont_tokens": "761c4dc187689d89" + }, + "truncated": 0, + "non_truncated": 895, + "padded": 3580, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-nutrition|5": { + "hashes": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "1a3b843e66efd29b", + "hash_cont_tokens": "65005bd7d6f6012a" + }, + "truncated": 
0, + "non_truncated": 306, + "padded": 1224, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-philosophy|5": { + "hashes": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "09820001a3d00013", + "hash_cont_tokens": "0b47934fb6314dec" + }, + "truncated": 0, + "non_truncated": 311, + "padded": 1244, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-prehistory|5": { + "hashes": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "7c4ec364ce2768c7", + "hash_cont_tokens": "3f20acd855ee0a29" + }, + "truncated": 0, + "non_truncated": 324, + "padded": 1296, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_accounting|5": { + "hashes": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "ced0534574d0ae3f", + "hash_cont_tokens": "8f122ba881355d4b" + }, + "truncated": 0, + "non_truncated": 282, + "padded": 1128, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_law|5": { + "hashes": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "bcbdbbde22ec73e3", + "hash_cont_tokens": "90d5df417c4d3fd3" + }, + "truncated": 0, + "non_truncated": 1534, + "padded": 6136, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_medicine|5": { + "hashes": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "c54d753563114d45", + "hash_cont_tokens": "4a2d2988884f7f70" + }, + "truncated": 0, + "non_truncated": 272, + "padded": 1088, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_psychology|5": { + "hashes": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "b75dc55c0e32fa52", + "hash_cont_tokens": "e0a952cb8a9c81de" + }, + "truncated": 0, + "non_truncated": 612, + "padded": 2448, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-public_relations|5": { + "hashes": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "5ccdc8ec8db99622", + "hash_cont_tokens": "1fa77a8dff3922b8" + }, + "truncated": 0, + "non_truncated": 110, + "padded": 440, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-security_studies|5": { + "hashes": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "ca8497342e5b1d57", + "hash_cont_tokens": "81fc9cb3cbdd52db" + }, + "truncated": 0, + "non_truncated": 245, + "padded": 980, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-sociology|5": { + "hashes": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "069c76424fbd3dab", + "hash_cont_tokens": "2a0493252ed2cf43" + }, + "truncated": 0, + "non_truncated": 201, + "padded": 804, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + 
"harness|hendrycksTest-us_foreign_policy|5": { + "hashes": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "a7e393a626169576", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-virology|5": { + "hashes": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "bf99dc973e3a650d", + "hash_cont_tokens": "5ab892d003b00c98" + }, + "truncated": 0, + "non_truncated": 166, + "padded": 664, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-world_religions|5": { + "hashes": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "1761cfaf21797065", + "hash_cont_tokens": "15a5e5dbdfbb8568" + }, + "truncated": 0, + "non_truncated": 171, + "padded": 684, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|truthfulqa:mc|0": { + "hashes": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "298b43914bbdf4ca", + "hash_cont_tokens": "5a8d4bb398b1c3c0" + }, + "truncated": 0, + "non_truncated": 817, + "padded": 9996, + "non_padded": 0, + "effective_few_shots": 0.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "31aa3477d959f771", + "hash_cont_tokens": "618558fb93c0f288" + }, + "truncated": 0, + "non_truncated": 1267, + "padded": 2534, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "6af0ae8cfe684f50", + "hash_cont_tokens": "7e3b63f52ffea7be" + }, + "truncated": 0, + "non_truncated": 1319, + "padded": 0, + "non_padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "3b7fa57a057f9415", + "hash_full_prompts": "63615fc50fc9417c", + "hash_input_tokens": "9c04e828ae29cacc", + "hash_cont_tokens": "889941516934f8e7" + }, + "truncated": 0, + "non_truncated": 28659, + "padded": 113460, + "non_padded": 1412, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/Intel/neural-chat-7b-v3-3/results_2023-12-09T20-33-34.862293.json b/eval-results/Intel/neural-chat-7b-v3-3/results_2023-12-09T20-33-34.862293.json new file mode 100644 index 0000000000000000000000000000000000000000..0fc534e263e19061ec40c269ed834c594b347c74 --- /dev/null +++ b/eval-results/Intel/neural-chat-7b-v3-3/results_2023-12-09T20-33-34.862293.json @@ -0,0 +1,1409 @@ +{ + "config_general": { + "lighteval_sha": "0e4607eff593f6f842aeaa0e5fa6760f58b9d1e9", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "", + "start_time": 601538.359941734, + "end_time": 608885.033690497, + "total_evaluation_time_secondes": "7346.673748763045", + "model_name": "Intel/neural-chat-7b-v3-3", + "model_sha": "fac83ab297a1c9ecc8affd97c998d864c10b9ff4", + "model_dtype": "torch.float16", + "model_size": "13.99 GB" + }, + "results": { + "harness|arc:challenge|25": { + "acc": 0.6373720136518771, + "acc_stderr": 
0.014049106564955007, + "acc_norm": 0.6689419795221843, + "acc_norm_stderr": 0.013752062419817837 + }, + "harness|hellaswag|10": { + "acc": 0.6617207727544314, + "acc_stderr": 0.004721571443354415, + "acc_norm": 0.8526190001991635, + "acc_norm_stderr": 0.0035376085010691773 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.29, + "acc_stderr": 0.045604802157206845, + "acc_norm": 0.29, + "acc_norm_stderr": 0.045604802157206845 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.6148148148148148, + "acc_stderr": 0.0420392104015628, + "acc_norm": 0.6148148148148148, + "acc_norm_stderr": 0.0420392104015628 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.6578947368421053, + "acc_stderr": 0.03860731599316092, + "acc_norm": 0.6578947368421053, + "acc_norm_stderr": 0.03860731599316092 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.58, + "acc_stderr": 0.049604496374885836, + "acc_norm": 0.58, + "acc_norm_stderr": 0.049604496374885836 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.6679245283018868, + "acc_stderr": 0.02898545565233439, + "acc_norm": 0.6679245283018868, + "acc_norm_stderr": 0.02898545565233439 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.7291666666666666, + "acc_stderr": 0.03716177437566017, + "acc_norm": 0.7291666666666666, + "acc_norm_stderr": 0.03716177437566017 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.48, + "acc_stderr": 0.050211673156867795, + "acc_norm": 0.48, + "acc_norm_stderr": 0.050211673156867795 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.54, + "acc_stderr": 0.05009082659620333, + "acc_norm": 0.54, + "acc_norm_stderr": 0.05009082659620333 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.35, + "acc_stderr": 0.047937248544110196, + "acc_norm": 0.35, + "acc_norm_stderr": 0.047937248544110196 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.6473988439306358, + "acc_stderr": 0.03643037168958546, + "acc_norm": 0.6473988439306358, + "acc_norm_stderr": 0.03643037168958546 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.46078431372549017, + "acc_stderr": 0.04959859966384181, + "acc_norm": 0.46078431372549017, + "acc_norm_stderr": 0.04959859966384181 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.74, + "acc_stderr": 0.044084400227680794, + "acc_norm": 0.74, + "acc_norm_stderr": 0.044084400227680794 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.5531914893617021, + "acc_stderr": 0.032500536843658404, + "acc_norm": 0.5531914893617021, + "acc_norm_stderr": 0.032500536843658404 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.4298245614035088, + "acc_stderr": 0.04657047260594963, + "acc_norm": 0.4298245614035088, + "acc_norm_stderr": 0.04657047260594963 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.5655172413793104, + "acc_stderr": 0.04130740879555498, + "acc_norm": 0.5655172413793104, + "acc_norm_stderr": 0.04130740879555498 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.3835978835978836, + "acc_stderr": 0.025043757318520193, + "acc_norm": 0.3835978835978836, + "acc_norm_stderr": 0.025043757318520193 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.4523809523809524, + "acc_stderr": 0.044518079590553275, + "acc_norm": 0.4523809523809524, + "acc_norm_stderr": 0.044518079590553275 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.37, + "acc_stderr": 0.048523658709391, + "acc_norm": 0.37, + 
"acc_norm_stderr": 0.048523658709391 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.7483870967741936, + "acc_stderr": 0.02468597928623996, + "acc_norm": 0.7483870967741936, + "acc_norm_stderr": 0.02468597928623996 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.458128078817734, + "acc_stderr": 0.03505630140785741, + "acc_norm": 0.458128078817734, + "acc_norm_stderr": 0.03505630140785741 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.68, + "acc_stderr": 0.046882617226215034, + "acc_norm": 0.68, + "acc_norm_stderr": 0.046882617226215034 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.7636363636363637, + "acc_stderr": 0.033175059300091826, + "acc_norm": 0.7636363636363637, + "acc_norm_stderr": 0.033175059300091826 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.7878787878787878, + "acc_stderr": 0.02912652283458682, + "acc_norm": 0.7878787878787878, + "acc_norm_stderr": 0.02912652283458682 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.8704663212435233, + "acc_stderr": 0.02423353229775873, + "acc_norm": 0.8704663212435233, + "acc_norm_stderr": 0.02423353229775873 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.6333333333333333, + "acc_stderr": 0.02443301646605246, + "acc_norm": 0.6333333333333333, + "acc_norm_stderr": 0.02443301646605246 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.3592592592592593, + "acc_stderr": 0.029252905927251972, + "acc_norm": 0.3592592592592593, + "acc_norm_stderr": 0.029252905927251972 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.6596638655462185, + "acc_stderr": 0.030778057422931673, + "acc_norm": 0.6596638655462185, + "acc_norm_stderr": 0.030778057422931673 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.33112582781456956, + "acc_stderr": 0.038425817186598696, + "acc_norm": 0.33112582781456956, + "acc_norm_stderr": 0.038425817186598696 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.8348623853211009, + "acc_stderr": 0.015919557829976044, + "acc_norm": 0.8348623853211009, + "acc_norm_stderr": 0.015919557829976044 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.5046296296296297, + "acc_stderr": 0.03409825519163572, + "acc_norm": 0.5046296296296297, + "acc_norm_stderr": 0.03409825519163572 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.7892156862745098, + "acc_stderr": 0.028626547912437406, + "acc_norm": 0.7892156862745098, + "acc_norm_stderr": 0.028626547912437406 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.7805907172995781, + "acc_stderr": 0.026939106581553945, + "acc_norm": 0.7805907172995781, + "acc_norm_stderr": 0.026939106581553945 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.6860986547085202, + "acc_stderr": 0.031146796482972465, + "acc_norm": 0.6860986547085202, + "acc_norm_stderr": 0.031146796482972465 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.7709923664122137, + "acc_stderr": 0.036853466317118506, + "acc_norm": 0.7709923664122137, + "acc_norm_stderr": 0.036853466317118506 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.7933884297520661, + "acc_stderr": 0.03695980128098824, + "acc_norm": 0.7933884297520661, + "acc_norm_stderr": 0.03695980128098824 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.7222222222222222, + "acc_stderr": 0.04330043749650742, + 
"acc_norm": 0.7222222222222222, + "acc_norm_stderr": 0.04330043749650742 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.7361963190184049, + "acc_stderr": 0.03462419931615624, + "acc_norm": 0.7361963190184049, + "acc_norm_stderr": 0.03462419931615624 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.5, + "acc_stderr": 0.04745789978762494, + "acc_norm": 0.5, + "acc_norm_stderr": 0.04745789978762494 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.8058252427184466, + "acc_stderr": 0.039166677628225836, + "acc_norm": 0.8058252427184466, + "acc_norm_stderr": 0.039166677628225836 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.8717948717948718, + "acc_stderr": 0.02190190511507333, + "acc_norm": 0.8717948717948718, + "acc_norm_stderr": 0.02190190511507333 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.73, + "acc_stderr": 0.0446196043338474, + "acc_norm": 0.73, + "acc_norm_stderr": 0.0446196043338474 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.8199233716475096, + "acc_stderr": 0.013740797258579825, + "acc_norm": 0.8199233716475096, + "acc_norm_stderr": 0.013740797258579825 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.7109826589595376, + "acc_stderr": 0.02440517393578323, + "acc_norm": 0.7109826589595376, + "acc_norm_stderr": 0.02440517393578323 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.4, + "acc_stderr": 0.016384638410380823, + "acc_norm": 0.4, + "acc_norm_stderr": 0.016384638410380823 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.6993464052287581, + "acc_stderr": 0.02625605383571896, + "acc_norm": 0.6993464052287581, + "acc_norm_stderr": 0.02625605383571896 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.6784565916398714, + "acc_stderr": 0.026527724079528872, + "acc_norm": 0.6784565916398714, + "acc_norm_stderr": 0.026527724079528872 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.7345679012345679, + "acc_stderr": 0.024569223600460845, + "acc_norm": 0.7345679012345679, + "acc_norm_stderr": 0.024569223600460845 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.45390070921985815, + "acc_stderr": 0.02970045324729146, + "acc_norm": 0.45390070921985815, + "acc_norm_stderr": 0.02970045324729146 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.43546284224250326, + "acc_stderr": 0.01266341210124834, + "acc_norm": 0.43546284224250326, + "acc_norm_stderr": 0.01266341210124834 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.6691176470588235, + "acc_stderr": 0.028582709753898445, + "acc_norm": 0.6691176470588235, + "acc_norm_stderr": 0.028582709753898445 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.6584967320261438, + "acc_stderr": 0.019184639328092487, + "acc_norm": 0.6584967320261438, + "acc_norm_stderr": 0.019184639328092487 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.6454545454545455, + "acc_stderr": 0.045820048415054174, + "acc_norm": 0.6454545454545455, + "acc_norm_stderr": 0.045820048415054174 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.7142857142857143, + "acc_stderr": 0.0289205832206756, + "acc_norm": 0.7142857142857143, + "acc_norm_stderr": 0.0289205832206756 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.8109452736318408, + "acc_stderr": 0.02768691358801302, + "acc_norm": 0.8109452736318408, + "acc_norm_stderr": 0.02768691358801302 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.84, + "acc_stderr": 
0.03684529491774709, + "acc_norm": 0.84, + "acc_norm_stderr": 0.03684529491774709 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.5421686746987951, + "acc_stderr": 0.0387862677100236, + "acc_norm": 0.5421686746987951, + "acc_norm_stderr": 0.0387862677100236 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.8070175438596491, + "acc_stderr": 0.030267457554898458, + "acc_norm": 0.8070175438596491, + "acc_norm_stderr": 0.030267457554898458 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.4700122399020808, + "mc1_stderr": 0.017471992091697534, + "mc2": 0.6301479198844473, + "mc2_stderr": 0.015176409746133967 + }, + "harness|winogrande|5": { + "acc": 0.7963693764798737, + "acc_stderr": 0.011317798781626913 + }, + "harness|gsm8k|5": { + "acc": 0.6110689916603488, + "acc_stderr": 0.013428382481274231 + }, + "all": { + "acc": 0.633718840288445, + "acc_stderr": 0.03262856399270551, + "acc_norm": 0.6351165946232198, + "acc_norm_stderr": 0.03329008839330021, + "mc1": 0.4700122399020808, + "mc1_stderr": 0.017471992091697534, + "mc2": 0.6301479198844473, + "mc2_stderr": 0.015176409746133967 + } + }, + "versions": { + "all": 0, + "harness|arc:challenge|25": 0, + "harness|gsm8k|5": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + 
"harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "harness|winogrande|5": 0 + }, + "config_tasks": { + "harness|arc:challenge": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + 
"harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|arc:challenge|25": { + "hashes": { + "hash_examples": "17b0cae357c0259e", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "9bcd0d1d37471713", + "hash_cont_tokens": "289aa98c400841d8" + }, + "truncated": 0, + "non_truncated": 1172, + "padded": 4670, + "non_padded": 17, + "effective_few_shots": 25.0, + "num_truncated_few_shots": 0 + }, + "harness|hellaswag|10": { + "hashes": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "80b8c6d79740318e", + "hash_cont_tokens": "ac460260c3e6efc9" + }, + "truncated": 0, + "non_truncated": 10042, + "padded": 40101, + "non_padded": 67, + "effective_few_shots": 10.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hashes": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "b813d36287c6556c", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-anatomy|5": { + "hashes": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "09dc2380497f7a47", + "hash_cont_tokens": "a52a4f60d98cbe5c" + }, + "truncated": 0, + "non_truncated": 135, + "padded": 540, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-astronomy|5": { + "hashes": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "68ca3220b0fdd1f3", + "hash_cont_tokens": "10f7d8eeba97841d" + }, + "truncated": 0, + "non_truncated": 152, + "padded": 608, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-business_ethics|5": { + "hashes": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "bd14ef1320de241e", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hashes": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "d96186ab98017c43", + "hash_cont_tokens": "edef9975ba9165b5" + }, + "truncated": 0, + "non_truncated": 265, + "padded": 1060, + 
"non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_biology|5": { + "hashes": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "424136b34e95b200", + "hash_cont_tokens": "0aa103ec6602280b" + }, + "truncated": 0, + "non_truncated": 144, + "padded": 576, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_chemistry|5": { + "hashes": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "8dd8b80e336bbe54", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_computer_science|5": { + "hashes": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "145d4cef8ca2261d", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_mathematics|5": { + "hashes": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "561995d32d2b25c4", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_medicine|5": { + "hashes": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "6a258a9d4418599c", + "hash_cont_tokens": "1979021dbc698754" + }, + "truncated": 0, + "non_truncated": 173, + "padded": 692, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_physics|5": { + "hashes": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "fa5e0d5b5f97b66a", + "hash_cont_tokens": "7cf7fe2bab00acbd" + }, + "truncated": 0, + "non_truncated": 102, + "padded": 408, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-computer_security|5": { + "hashes": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "07d27397edfae492", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hashes": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "da5e6c3c8eb17da6", + "hash_cont_tokens": "903f64eed2b0d217" + }, + "truncated": 0, + "non_truncated": 235, + "padded": 940, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-econometrics|5": { + "hashes": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "f6ba8e358bdb523e", + "hash_cont_tokens": "721ae6c5302c4bf2" + }, + "truncated": 0, + "non_truncated": 114, + "padded": 456, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hashes": { + 
"hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "b2459da4c5ca8590", + "hash_cont_tokens": "15a738960ed3e587" + }, + "truncated": 0, + "non_truncated": 145, + "padded": 575, + "non_padded": 5, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hashes": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "0b969d9ad706a13a", + "hash_cont_tokens": "c96470462fc71683" + }, + "truncated": 0, + "non_truncated": 378, + "padded": 1512, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-formal_logic|5": { + "hashes": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "02bc3eb5f90da86e", + "hash_cont_tokens": "0e1ce025c9d6ee7e" + }, + "truncated": 0, + "non_truncated": 126, + "padded": 504, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-global_facts|5": { + "hashes": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "3d5106918bcbeb43", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_biology|5": { + "hashes": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "7b089392db2dabbd", + "hash_cont_tokens": "e34d57f7d3c4ca16" + }, + "truncated": 0, + "non_truncated": 310, + "padded": 1240, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hashes": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "ba90b2ffed1c067d", + "hash_cont_tokens": "e8482d44df4b3740" + }, + "truncated": 0, + "non_truncated": 203, + "padded": 812, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hashes": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "60eeec309ef0717f", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hashes": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "5e5e8bf3808e0ead", + "hash_cont_tokens": "d63e679a49418339" + }, + "truncated": 0, + "non_truncated": 165, + "padded": 656, + "non_padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_geography|5": { + "hashes": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "4da9b741d4e7ea78", + "hash_cont_tokens": "d78483e286d06f1a" + }, + "truncated": 0, + "non_truncated": 198, + "padded": 792, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hashes": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": 
"acb4bc872ac86ed7", + "hash_cont_tokens": "691cdff71ff5fe57" + }, + "truncated": 0, + "non_truncated": 193, + "padded": 772, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hashes": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "840fc6403eb69ab0", + "hash_cont_tokens": "d5ad4c5bdca967ad" + }, + "truncated": 0, + "non_truncated": 390, + "padded": 1560, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hashes": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "3629a7f2cd17faeb", + "hash_cont_tokens": "8f631ca5687dd0d4" + }, + "truncated": 0, + "non_truncated": 270, + "padded": 1080, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hashes": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "6846f684260e3997", + "hash_cont_tokens": "7321048a28451473" + }, + "truncated": 0, + "non_truncated": 238, + "padded": 952, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_physics|5": { + "hashes": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "85aee25d6bdad94a", + "hash_cont_tokens": "bb137581f269861c" + }, + "truncated": 0, + "non_truncated": 151, + "padded": 604, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hashes": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "290b66d6d666a35f", + "hash_cont_tokens": "b455cab2675bd863" + }, + "truncated": 0, + "non_truncated": 545, + "padded": 2180, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hashes": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "a77a7668b437bc82", + "hash_cont_tokens": "1b3196fec7e58037" + }, + "truncated": 0, + "non_truncated": 216, + "padded": 864, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hashes": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "63548c7fa9ba7a78", + "hash_cont_tokens": "a331dedc2aa01b3e" + }, + "truncated": 0, + "non_truncated": 204, + "padded": 816, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hashes": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "83c5da18bfa50812", + "hash_cont_tokens": "d0fbe030b8c8c2bf" + }, + "truncated": 0, + "non_truncated": 237, + "padded": 948, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_aging|5": { + "hashes": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "bebbd11f22006685", + "hash_cont_tokens": "1dd29c3755494850" + }, + "truncated": 0, + "non_truncated": 
223, + "padded": 892, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_sexuality|5": { + "hashes": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "7b85ee9b8ee54f4f", + "hash_cont_tokens": "c85573f663c10691" + }, + "truncated": 0, + "non_truncated": 131, + "padded": 524, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-international_law|5": { + "hashes": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "7bfc55ab7065943e", + "hash_cont_tokens": "d263804ba918154f" + }, + "truncated": 0, + "non_truncated": 121, + "padded": 484, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-jurisprudence|5": { + "hashes": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "69573f1675e053c6", + "hash_cont_tokens": "581986691a84ece8" + }, + "truncated": 0, + "non_truncated": 108, + "padded": 432, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hashes": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "552324ef20094bdc", + "hash_cont_tokens": "55a858b28bbda458" + }, + "truncated": 0, + "non_truncated": 163, + "padded": 652, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-machine_learning|5": { + "hashes": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "96449357a7318905", + "hash_cont_tokens": "e99d3d3efd4ac7a3" + }, + "truncated": 0, + "non_truncated": 112, + "padded": 448, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-management|5": { + "hashes": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "3b849249168e3b88", + "hash_cont_tokens": "13d9dc56bca34726" + }, + "truncated": 0, + "non_truncated": 103, + "padded": 412, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-marketing|5": { + "hashes": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "af0e186f2756b70d", + "hash_cont_tokens": "2700ea26933916a2" + }, + "truncated": 0, + "non_truncated": 234, + "padded": 936, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-medical_genetics|5": { + "hashes": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "9f6a6de16509b6d9", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-miscellaneous|5": { + "hashes": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "9194406d589f7c10", + "hash_cont_tokens": "7bf4341c79587250" + }, + "truncated": 0, + "non_truncated": 783, + "padded": 3132, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_disputes|5": { + "hashes": { + 
"hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "769486efc74d9f8e", + "hash_cont_tokens": "38a48e9de6976f00" + }, + "truncated": 0, + "non_truncated": 346, + "padded": 1384, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hashes": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "a90fd4dd90959dad", + "hash_cont_tokens": "761c4dc187689d89" + }, + "truncated": 0, + "non_truncated": 895, + "padded": 3580, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-nutrition|5": { + "hashes": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "1a3b843e66efd29b", + "hash_cont_tokens": "65005bd7d6f6012a" + }, + "truncated": 0, + "non_truncated": 306, + "padded": 1224, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-philosophy|5": { + "hashes": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "09820001a3d00013", + "hash_cont_tokens": "0b47934fb6314dec" + }, + "truncated": 0, + "non_truncated": 311, + "padded": 1244, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-prehistory|5": { + "hashes": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "7c4ec364ce2768c7", + "hash_cont_tokens": "3f20acd855ee0a29" + }, + "truncated": 0, + "non_truncated": 324, + "padded": 1296, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_accounting|5": { + "hashes": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "ced0534574d0ae3f", + "hash_cont_tokens": "8f122ba881355d4b" + }, + "truncated": 0, + "non_truncated": 282, + "padded": 1128, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_law|5": { + "hashes": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "bcbdbbde22ec73e3", + "hash_cont_tokens": "90d5df417c4d3fd3" + }, + "truncated": 0, + "non_truncated": 1534, + "padded": 6136, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_medicine|5": { + "hashes": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "c54d753563114d45", + "hash_cont_tokens": "4a2d2988884f7f70" + }, + "truncated": 0, + "non_truncated": 272, + "padded": 1088, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_psychology|5": { + "hashes": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "b75dc55c0e32fa52", + "hash_cont_tokens": "e0a952cb8a9c81de" + }, + "truncated": 0, + "non_truncated": 612, + "padded": 2448, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-public_relations|5": { + "hashes": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "5ccdc8ec8db99622", + "hash_cont_tokens": 
"1fa77a8dff3922b8" + }, + "truncated": 0, + "non_truncated": 110, + "padded": 440, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-security_studies|5": { + "hashes": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "ca8497342e5b1d57", + "hash_cont_tokens": "81fc9cb3cbdd52db" + }, + "truncated": 0, + "non_truncated": 245, + "padded": 980, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-sociology|5": { + "hashes": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "069c76424fbd3dab", + "hash_cont_tokens": "2a0493252ed2cf43" + }, + "truncated": 0, + "non_truncated": 201, + "padded": 804, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hashes": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "a7e393a626169576", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-virology|5": { + "hashes": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "bf99dc973e3a650d", + "hash_cont_tokens": "5ab892d003b00c98" + }, + "truncated": 0, + "non_truncated": 166, + "padded": 664, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-world_religions|5": { + "hashes": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "1761cfaf21797065", + "hash_cont_tokens": "15a5e5dbdfbb8568" + }, + "truncated": 0, + "non_truncated": 171, + "padded": 684, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|truthfulqa:mc|0": { + "hashes": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "298b43914bbdf4ca", + "hash_cont_tokens": "5a8d4bb398b1c3c0" + }, + "truncated": 0, + "non_truncated": 817, + "padded": 9996, + "non_padded": 0, + "effective_few_shots": 0.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "31aa3477d959f771", + "hash_cont_tokens": "618558fb93c0f288" + }, + "truncated": 0, + "non_truncated": 1267, + "padded": 2534, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "6af0ae8cfe684f50", + "hash_cont_tokens": "782f3bd8a5c15bb4" + }, + "truncated": 0, + "non_truncated": 1319, + "padded": 0, + "non_padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "3b7fa57a057f9415", + "hash_full_prompts": "63615fc50fc9417c", + "hash_input_tokens": "9c04e828ae29cacc", + "hash_cont_tokens": "58d73023ca41603e" + }, + "truncated": 0, + "non_truncated": 28659, + "padded": 113460, + "non_padded": 1412, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git 
a/eval-results/Intel/neural-chat-7b-v3/results_2023-11-14T07-40-49.387630.json b/eval-results/Intel/neural-chat-7b-v3/results_2023-11-14T07-40-49.387630.json new file mode 100644 index 0000000000000000000000000000000000000000..3ae6d0ebf14728e1a2b7179bf290fe9c92ce2b06 --- /dev/null +++ b/eval-results/Intel/neural-chat-7b-v3/results_2023-11-14T07-40-49.387630.json @@ -0,0 +1,1435 @@ +{ + "config_general": { + "lighteval_sha": "9ffc410f6c40b8cfefe7167cb47aefe69ced61e1", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "", + "start_time": 631669.723911664, + "end_time": 642348.313574379, + "total_evaluation_time_secondes": "10678.589662714978", + "model_name": "Intel/neural-chat-7b-v3", + "model_sha": "7a05c8a2151f7d32252d9ef5db10445c13ae1f20", + "model_dtype": "torch.float16", + "model_size": "13.99 GB" + }, + "results": { + "harness|arc:challenge|25": { + "acc": 0.6399317406143344, + "acc_stderr": 0.014027516814585188, + "acc_norm": 0.6715017064846417, + "acc_norm_stderr": 0.0137249784655373 + }, + "harness|hellaswag|10": { + "acc": 0.6532563234415455, + "acc_stderr": 0.004749606196363344, + "acc_norm": 0.8329018123879706, + "acc_norm_stderr": 0.0037230107458783917 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.37, + "acc_stderr": 0.048523658709391, + "acc_norm": 0.37, + "acc_norm_stderr": 0.048523658709391 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.5851851851851851, + "acc_stderr": 0.04256193767901408, + "acc_norm": 0.5851851851851851, + "acc_norm_stderr": 0.04256193767901408 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.6710526315789473, + "acc_stderr": 0.03823428969926604, + "acc_norm": 0.6710526315789473, + "acc_norm_stderr": 0.03823428969926604 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.58, + "acc_stderr": 0.04960449637488583, + "acc_norm": 0.58, + "acc_norm_stderr": 0.04960449637488583 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.6716981132075471, + "acc_stderr": 0.02890159361241178, + "acc_norm": 0.6716981132075471, + "acc_norm_stderr": 0.02890159361241178 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.7152777777777778, + "acc_stderr": 0.03773809990686934, + "acc_norm": 0.7152777777777778, + "acc_norm_stderr": 0.03773809990686934 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.39, + "acc_stderr": 0.04902071300001975, + "acc_norm": 0.39, + "acc_norm_stderr": 0.04902071300001975 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.47, + "acc_stderr": 0.050161355804659205, + "acc_norm": 0.47, + "acc_norm_stderr": 0.050161355804659205 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.33, + "acc_stderr": 0.04725815626252604, + "acc_norm": 0.33, + "acc_norm_stderr": 0.04725815626252604 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.630057803468208, + "acc_stderr": 0.0368122963339432, + "acc_norm": 0.630057803468208, + "acc_norm_stderr": 0.0368122963339432 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.38235294117647056, + "acc_stderr": 0.04835503696107223, + "acc_norm": 0.38235294117647056, + "acc_norm_stderr": 0.04835503696107223 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.71, + "acc_stderr": 0.045604802157206845, + "acc_norm": 0.71, + "acc_norm_stderr": 0.045604802157206845 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.5531914893617021, + "acc_stderr": 0.0325005368436584, + "acc_norm": 
0.5531914893617021, + "acc_norm_stderr": 0.0325005368436584 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.47368421052631576, + "acc_stderr": 0.046970851366478626, + "acc_norm": 0.47368421052631576, + "acc_norm_stderr": 0.046970851366478626 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.5586206896551724, + "acc_stderr": 0.04137931034482757, + "acc_norm": 0.5586206896551724, + "acc_norm_stderr": 0.04137931034482757 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.3994708994708995, + "acc_stderr": 0.02522545028406788, + "acc_norm": 0.3994708994708995, + "acc_norm_stderr": 0.02522545028406788 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.4365079365079365, + "acc_stderr": 0.04435932892851466, + "acc_norm": 0.4365079365079365, + "acc_norm_stderr": 0.04435932892851466 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.36, + "acc_stderr": 0.04824181513244218, + "acc_norm": 0.36, + "acc_norm_stderr": 0.04824181513244218 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.7645161290322581, + "acc_stderr": 0.024137632429337717, + "acc_norm": 0.7645161290322581, + "acc_norm_stderr": 0.024137632429337717 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.49261083743842365, + "acc_stderr": 0.035176035403610084, + "acc_norm": 0.49261083743842365, + "acc_norm_stderr": 0.035176035403610084 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.68, + "acc_stderr": 0.04688261722621504, + "acc_norm": 0.68, + "acc_norm_stderr": 0.04688261722621504 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.7636363636363637, + "acc_stderr": 0.03317505930009182, + "acc_norm": 0.7636363636363637, + "acc_norm_stderr": 0.03317505930009182 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.7727272727272727, + "acc_stderr": 0.029857515673386414, + "acc_norm": 0.7727272727272727, + "acc_norm_stderr": 0.029857515673386414 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.8601036269430051, + "acc_stderr": 0.025033870583015184, + "acc_norm": 0.8601036269430051, + "acc_norm_stderr": 0.025033870583015184 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.6205128205128205, + "acc_stderr": 0.024603626924097417, + "acc_norm": 0.6205128205128205, + "acc_norm_stderr": 0.024603626924097417 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.2962962962962963, + "acc_stderr": 0.027840811495871937, + "acc_norm": 0.2962962962962963, + "acc_norm_stderr": 0.027840811495871937 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.6764705882352942, + "acc_stderr": 0.03038835355188679, + "acc_norm": 0.6764705882352942, + "acc_norm_stderr": 0.03038835355188679 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.33112582781456956, + "acc_stderr": 0.038425817186598696, + "acc_norm": 0.33112582781456956, + "acc_norm_stderr": 0.038425817186598696 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.8165137614678899, + "acc_stderr": 0.01659525971039931, + "acc_norm": 0.8165137614678899, + "acc_norm_stderr": 0.01659525971039931 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.4305555555555556, + "acc_stderr": 0.03376922151252335, + "acc_norm": 0.4305555555555556, + "acc_norm_stderr": 0.03376922151252335 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.7990196078431373, + "acc_stderr": 0.02812597226565438, + 
"acc_norm": 0.7990196078431373, + "acc_norm_stderr": 0.02812597226565438 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.7932489451476793, + "acc_stderr": 0.0263616516683891, + "acc_norm": 0.7932489451476793, + "acc_norm_stderr": 0.0263616516683891 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.6816143497757847, + "acc_stderr": 0.03126580522513713, + "acc_norm": 0.6816143497757847, + "acc_norm_stderr": 0.03126580522513713 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.7251908396946565, + "acc_stderr": 0.03915345408847835, + "acc_norm": 0.7251908396946565, + "acc_norm_stderr": 0.03915345408847835 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.8264462809917356, + "acc_stderr": 0.0345727283691767, + "acc_norm": 0.8264462809917356, + "acc_norm_stderr": 0.0345727283691767 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.7685185185185185, + "acc_stderr": 0.04077494709252626, + "acc_norm": 0.7685185185185185, + "acc_norm_stderr": 0.04077494709252626 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.7300613496932515, + "acc_stderr": 0.03487825168497892, + "acc_norm": 0.7300613496932515, + "acc_norm_stderr": 0.03487825168497892 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.5267857142857143, + "acc_stderr": 0.047389751192741546, + "acc_norm": 0.5267857142857143, + "acc_norm_stderr": 0.047389751192741546 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.7961165048543689, + "acc_stderr": 0.039891398595317706, + "acc_norm": 0.7961165048543689, + "acc_norm_stderr": 0.039891398595317706 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.8376068376068376, + "acc_stderr": 0.02416161812798774, + "acc_norm": 0.8376068376068376, + "acc_norm_stderr": 0.02416161812798774 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.72, + "acc_stderr": 0.04512608598542128, + "acc_norm": 0.72, + "acc_norm_stderr": 0.04512608598542128 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.8263090676883781, + "acc_stderr": 0.01354741565866225, + "acc_norm": 0.8263090676883781, + "acc_norm_stderr": 0.01354741565866225 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.6907514450867052, + "acc_stderr": 0.02488314057007176, + "acc_norm": 0.6907514450867052, + "acc_norm_stderr": 0.02488314057007176 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.35083798882681566, + "acc_stderr": 0.01596103667523096, + "acc_norm": 0.35083798882681566, + "acc_norm_stderr": 0.01596103667523096 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.7189542483660131, + "acc_stderr": 0.025738854797818737, + "acc_norm": 0.7189542483660131, + "acc_norm_stderr": 0.025738854797818737 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.6977491961414791, + "acc_stderr": 0.02608270069539966, + "acc_norm": 0.6977491961414791, + "acc_norm_stderr": 0.02608270069539966 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.6975308641975309, + "acc_stderr": 0.02555765398186806, + "acc_norm": 0.6975308641975309, + "acc_norm_stderr": 0.02555765398186806 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.425531914893617, + "acc_stderr": 0.02949482760014437, + "acc_norm": 0.425531914893617, + "acc_norm_stderr": 0.02949482760014437 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.45371577574967403, + "acc_stderr": 0.012715404841277745, + "acc_norm": 0.45371577574967403, + "acc_norm_stderr": 0.012715404841277745 + }, + 
"harness|hendrycksTest-professional_medicine|5": { + "acc": 0.625, + "acc_stderr": 0.029408372932278746, + "acc_norm": 0.625, + "acc_norm_stderr": 0.029408372932278746 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.6421568627450981, + "acc_stderr": 0.01939305840235544, + "acc_norm": 0.6421568627450981, + "acc_norm_stderr": 0.01939305840235544 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.6636363636363637, + "acc_stderr": 0.04525393596302506, + "acc_norm": 0.6636363636363637, + "acc_norm_stderr": 0.04525393596302506 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.7020408163265306, + "acc_stderr": 0.02927956741106568, + "acc_norm": 0.7020408163265306, + "acc_norm_stderr": 0.02927956741106568 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.8308457711442786, + "acc_stderr": 0.02650859065623325, + "acc_norm": 0.8308457711442786, + "acc_norm_stderr": 0.02650859065623325 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.83, + "acc_stderr": 0.0377525168068637, + "acc_norm": 0.83, + "acc_norm_stderr": 0.0377525168068637 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.5240963855421686, + "acc_stderr": 0.03887971849597264, + "acc_norm": 0.5240963855421686, + "acc_norm_stderr": 0.03887971849597264 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.8128654970760234, + "acc_stderr": 0.02991312723236804, + "acc_norm": 0.8128654970760234, + "acc_norm_stderr": 0.02991312723236804 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.42472460220318237, + "mc1_stderr": 0.01730400095716748, + "mc2": 0.5876875540849609, + "mc2_stderr": 0.015599384707939359 + }, + "harness|winogrande|5": { + "acc": 0.7805840568271507, + "acc_stderr": 0.011631268360607778 + }, + "harness|drop|3": { + "em": 0.43435402684563756, + "em_stderr": 0.005076143925092938, + "f1": 0.5043477348993302, + "f1_stderr": 0.004800980590603821 + }, + "harness|gsm8k|5": { + "acc": 0.012130401819560273, + "acc_stderr": 0.0030152942428909512 + }, + "all": { + "acc": 0.6159787282968994, + "acc_stderr": 0.03250548890211772, + "acc_norm": 0.6270034478352691, + "acc_norm_stderr": 0.03333659536654491, + "mc1": 0.42472460220318237, + "mc1_stderr": 0.01730400095716748, + "mc2": 0.5876875540849609, + "mc2_stderr": 0.015599384707939359, + "em": 0.43435402684563756, + "em_stderr": 0.005076143925092938, + "f1": 0.5043477348993302, + "f1_stderr": 0.004800980590603821 + } + }, + "versions": { + "all": 0, + "harness|arc:challenge|25": 0, + "harness|drop|3": 1, + "harness|gsm8k|5": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + 
"harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "harness|winogrande|5": 0 + }, + "config_tasks": { + "harness|arc:challenge": "LM Harness task", + "harness|drop": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + 
"harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|arc:challenge|25": { + "hashes": { + "hash_examples": "17b0cae357c0259e", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "9bcd0d1d37471713", + "hash_cont_tokens": "289aa98c400841d8" + }, + "truncated": 0, + "non_truncated": 1172, + "padded": 4670, + "non_padded": 17, + "effective_few_shots": 25.0, + "num_truncated_few_shots": 0 + }, + "harness|hellaswag|10": { + "hashes": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "80b8c6d79740318e", + "hash_cont_tokens": "ac460260c3e6efc9" + }, + "truncated": 0, + "non_truncated": 10042, + "padded": 40101, + "non_padded": 67, + "effective_few_shots": 10.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hashes": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "b813d36287c6556c", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + 
"harness|hendrycksTest-anatomy|5": { + "hashes": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "09dc2380497f7a47", + "hash_cont_tokens": "a52a4f60d98cbe5c" + }, + "truncated": 0, + "non_truncated": 135, + "padded": 540, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-astronomy|5": { + "hashes": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "68ca3220b0fdd1f3", + "hash_cont_tokens": "10f7d8eeba97841d" + }, + "truncated": 0, + "non_truncated": 152, + "padded": 608, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-business_ethics|5": { + "hashes": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "bd14ef1320de241e", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hashes": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "d96186ab98017c43", + "hash_cont_tokens": "edef9975ba9165b5" + }, + "truncated": 0, + "non_truncated": 265, + "padded": 1060, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_biology|5": { + "hashes": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "424136b34e95b200", + "hash_cont_tokens": "0aa103ec6602280b" + }, + "truncated": 0, + "non_truncated": 144, + "padded": 576, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_chemistry|5": { + "hashes": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "8dd8b80e336bbe54", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_computer_science|5": { + "hashes": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "145d4cef8ca2261d", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_mathematics|5": { + "hashes": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "561995d32d2b25c4", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_medicine|5": { + "hashes": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "6a258a9d4418599c", + "hash_cont_tokens": "1979021dbc698754" + }, + "truncated": 0, + "non_truncated": 173, + "padded": 692, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_physics|5": { + "hashes": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": 
"fa5e0d5b5f97b66a", + "hash_cont_tokens": "7cf7fe2bab00acbd" + }, + "truncated": 0, + "non_truncated": 102, + "padded": 408, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-computer_security|5": { + "hashes": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "07d27397edfae492", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hashes": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "da5e6c3c8eb17da6", + "hash_cont_tokens": "903f64eed2b0d217" + }, + "truncated": 0, + "non_truncated": 235, + "padded": 940, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-econometrics|5": { + "hashes": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "f6ba8e358bdb523e", + "hash_cont_tokens": "721ae6c5302c4bf2" + }, + "truncated": 0, + "non_truncated": 114, + "padded": 456, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hashes": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "b2459da4c5ca8590", + "hash_cont_tokens": "15a738960ed3e587" + }, + "truncated": 0, + "non_truncated": 145, + "padded": 575, + "non_padded": 5, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hashes": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "0b969d9ad706a13a", + "hash_cont_tokens": "c96470462fc71683" + }, + "truncated": 0, + "non_truncated": 378, + "padded": 1512, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-formal_logic|5": { + "hashes": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "02bc3eb5f90da86e", + "hash_cont_tokens": "0e1ce025c9d6ee7e" + }, + "truncated": 0, + "non_truncated": 126, + "padded": 504, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-global_facts|5": { + "hashes": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "3d5106918bcbeb43", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_biology|5": { + "hashes": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "7b089392db2dabbd", + "hash_cont_tokens": "e34d57f7d3c4ca16" + }, + "truncated": 0, + "non_truncated": 310, + "padded": 1240, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hashes": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "ba90b2ffed1c067d", + "hash_cont_tokens": "e8482d44df4b3740" + }, + "truncated": 0, + "non_truncated": 203, + "padded": 812, + "non_padded": 0, + 
"effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hashes": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "60eeec309ef0717f", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hashes": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "5e5e8bf3808e0ead", + "hash_cont_tokens": "d63e679a49418339" + }, + "truncated": 0, + "non_truncated": 165, + "padded": 656, + "non_padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_geography|5": { + "hashes": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "4da9b741d4e7ea78", + "hash_cont_tokens": "d78483e286d06f1a" + }, + "truncated": 0, + "non_truncated": 198, + "padded": 792, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hashes": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "acb4bc872ac86ed7", + "hash_cont_tokens": "691cdff71ff5fe57" + }, + "truncated": 0, + "non_truncated": 193, + "padded": 772, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hashes": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "840fc6403eb69ab0", + "hash_cont_tokens": "d5ad4c5bdca967ad" + }, + "truncated": 0, + "non_truncated": 390, + "padded": 1560, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hashes": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "3629a7f2cd17faeb", + "hash_cont_tokens": "8f631ca5687dd0d4" + }, + "truncated": 0, + "non_truncated": 270, + "padded": 1080, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hashes": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "6846f684260e3997", + "hash_cont_tokens": "7321048a28451473" + }, + "truncated": 0, + "non_truncated": 238, + "padded": 952, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_physics|5": { + "hashes": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "85aee25d6bdad94a", + "hash_cont_tokens": "bb137581f269861c" + }, + "truncated": 0, + "non_truncated": 151, + "padded": 604, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hashes": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "290b66d6d666a35f", + "hash_cont_tokens": "b455cab2675bd863" + }, + "truncated": 0, + "non_truncated": 545, + "padded": 2180, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + 
"harness|hendrycksTest-high_school_statistics|5": { + "hashes": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "a77a7668b437bc82", + "hash_cont_tokens": "1b3196fec7e58037" + }, + "truncated": 0, + "non_truncated": 216, + "padded": 864, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hashes": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "63548c7fa9ba7a78", + "hash_cont_tokens": "a331dedc2aa01b3e" + }, + "truncated": 0, + "non_truncated": 204, + "padded": 816, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hashes": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "83c5da18bfa50812", + "hash_cont_tokens": "d0fbe030b8c8c2bf" + }, + "truncated": 0, + "non_truncated": 237, + "padded": 948, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_aging|5": { + "hashes": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "bebbd11f22006685", + "hash_cont_tokens": "1dd29c3755494850" + }, + "truncated": 0, + "non_truncated": 223, + "padded": 892, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_sexuality|5": { + "hashes": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "7b85ee9b8ee54f4f", + "hash_cont_tokens": "c85573f663c10691" + }, + "truncated": 0, + "non_truncated": 131, + "padded": 524, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-international_law|5": { + "hashes": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "7bfc55ab7065943e", + "hash_cont_tokens": "d263804ba918154f" + }, + "truncated": 0, + "non_truncated": 121, + "padded": 484, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-jurisprudence|5": { + "hashes": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "69573f1675e053c6", + "hash_cont_tokens": "581986691a84ece8" + }, + "truncated": 0, + "non_truncated": 108, + "padded": 432, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hashes": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "552324ef20094bdc", + "hash_cont_tokens": "55a858b28bbda458" + }, + "truncated": 0, + "non_truncated": 163, + "padded": 652, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-machine_learning|5": { + "hashes": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "96449357a7318905", + "hash_cont_tokens": "e99d3d3efd4ac7a3" + }, + "truncated": 0, + "non_truncated": 112, + "padded": 448, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-management|5": { + "hashes": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + 
"hash_input_tokens": "3b849249168e3b88", + "hash_cont_tokens": "13d9dc56bca34726" + }, + "truncated": 0, + "non_truncated": 103, + "padded": 412, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-marketing|5": { + "hashes": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "af0e186f2756b70d", + "hash_cont_tokens": "2700ea26933916a2" + }, + "truncated": 0, + "non_truncated": 234, + "padded": 936, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-medical_genetics|5": { + "hashes": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "9f6a6de16509b6d9", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-miscellaneous|5": { + "hashes": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "9194406d589f7c10", + "hash_cont_tokens": "7bf4341c79587250" + }, + "truncated": 0, + "non_truncated": 783, + "padded": 3132, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_disputes|5": { + "hashes": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "769486efc74d9f8e", + "hash_cont_tokens": "38a48e9de6976f00" + }, + "truncated": 0, + "non_truncated": 346, + "padded": 1384, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hashes": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "a90fd4dd90959dad", + "hash_cont_tokens": "761c4dc187689d89" + }, + "truncated": 0, + "non_truncated": 895, + "padded": 3580, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-nutrition|5": { + "hashes": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "1a3b843e66efd29b", + "hash_cont_tokens": "65005bd7d6f6012a" + }, + "truncated": 0, + "non_truncated": 306, + "padded": 1224, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-philosophy|5": { + "hashes": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "09820001a3d00013", + "hash_cont_tokens": "0b47934fb6314dec" + }, + "truncated": 0, + "non_truncated": 311, + "padded": 1244, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-prehistory|5": { + "hashes": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "7c4ec364ce2768c7", + "hash_cont_tokens": "3f20acd855ee0a29" + }, + "truncated": 0, + "non_truncated": 324, + "padded": 1296, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_accounting|5": { + "hashes": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "ced0534574d0ae3f", + "hash_cont_tokens": "8f122ba881355d4b" + }, + "truncated": 0, + "non_truncated": 282, + "padded": 1128, + "non_padded": 0, + 
"effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_law|5": { + "hashes": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "bcbdbbde22ec73e3", + "hash_cont_tokens": "90d5df417c4d3fd3" + }, + "truncated": 0, + "non_truncated": 1534, + "padded": 6136, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_medicine|5": { + "hashes": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "c54d753563114d45", + "hash_cont_tokens": "4a2d2988884f7f70" + }, + "truncated": 0, + "non_truncated": 272, + "padded": 1088, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_psychology|5": { + "hashes": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "b75dc55c0e32fa52", + "hash_cont_tokens": "e0a952cb8a9c81de" + }, + "truncated": 0, + "non_truncated": 612, + "padded": 2448, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-public_relations|5": { + "hashes": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "5ccdc8ec8db99622", + "hash_cont_tokens": "1fa77a8dff3922b8" + }, + "truncated": 0, + "non_truncated": 110, + "padded": 440, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-security_studies|5": { + "hashes": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "ca8497342e5b1d57", + "hash_cont_tokens": "81fc9cb3cbdd52db" + }, + "truncated": 0, + "non_truncated": 245, + "padded": 980, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-sociology|5": { + "hashes": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "069c76424fbd3dab", + "hash_cont_tokens": "2a0493252ed2cf43" + }, + "truncated": 0, + "non_truncated": 201, + "padded": 804, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hashes": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "a7e393a626169576", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-virology|5": { + "hashes": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "bf99dc973e3a650d", + "hash_cont_tokens": "5ab892d003b00c98" + }, + "truncated": 0, + "non_truncated": 166, + "padded": 664, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-world_religions|5": { + "hashes": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "1761cfaf21797065", + "hash_cont_tokens": "15a5e5dbdfbb8568" + }, + "truncated": 0, + "non_truncated": 171, + "padded": 684, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|truthfulqa:mc|0": { + "hashes": { + "hash_examples": "23176c0531c7b867", + 
"hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "298b43914bbdf4ca", + "hash_cont_tokens": "5a8d4bb398b1c3c0" + }, + "truncated": 0, + "non_truncated": 817, + "padded": 9996, + "non_padded": 0, + "effective_few_shots": 0.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "31aa3477d959f771", + "hash_cont_tokens": "618558fb93c0f288" + }, + "truncated": 0, + "non_truncated": 1267, + "padded": 2534, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|drop|3": { + "hashes": { + "hash_examples": "1d27416e8324e9a3", + "hash_full_prompts": "a5513ff9a741b385", + "hash_input_tokens": "a4fb946366902edf", + "hash_cont_tokens": "989e4cc6fe3b3673" + }, + "truncated": 0, + "non_truncated": 9536, + "padded": 0, + "non_padded": 9536, + "effective_few_shots": 3.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "6af0ae8cfe684f50", + "hash_cont_tokens": "59b8c60f99ac9009" + }, + "truncated": 0, + "non_truncated": 1319, + "padded": 0, + "non_padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "4eb459f19fc0f29d", + "hash_full_prompts": "21653ed56f202b4e", + "hash_input_tokens": "0ce409b3d436569d", + "hash_cont_tokens": "246eb6b1e594bdee" + }, + "truncated": 0, + "non_truncated": 38195, + "padded": 113460, + "non_padded": 10948, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/Jiayi-Pan/Tiny-Vicuna-1B/results_2023-11-28T00-25-42.048913.json b/eval-results/Jiayi-Pan/Tiny-Vicuna-1B/results_2023-11-28T00-25-42.048913.json new file mode 100644 index 0000000000000000000000000000000000000000..b96122278ee6f5b8de6ac4378324f13a0baa349f --- /dev/null +++ b/eval-results/Jiayi-Pan/Tiny-Vicuna-1B/results_2023-11-28T00-25-42.048913.json @@ -0,0 +1,1435 @@ +{ + "config_general": { + "lighteval_sha": "9ffc410f6c40b8cfefe7167cb47aefe69ced61e1", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "", + "start_time": 1007875.786959209, + "end_time": 1018242.963892403, + "total_evaluation_time_secondes": "10367.176933193929", + "model_name": "Jiayi-Pan/Tiny-Vicuna-1B", + "model_sha": "175336a0000f36b508575ef1a2da05755faf48c3", + "model_dtype": "torch.float16", + "model_size": "2.06 GB" + }, + "results": { + "harness|arc:challenge|25": { + "acc": 0.30119453924914674, + "acc_stderr": 0.013406741767847627, + "acc_norm": 0.33447098976109213, + "acc_norm_stderr": 0.013787460322441372 + }, + "harness|hellaswag|10": { + "acc": 0.43308105954989046, + "acc_stderr": 0.004944889545497948, + "acc_norm": 0.559151563433579, + "acc_norm_stderr": 0.004954740808837195 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.26, + "acc_stderr": 0.0440844002276808, + "acc_norm": 0.26, + "acc_norm_stderr": 0.0440844002276808 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.28888888888888886, + "acc_stderr": 0.03915450630414251, + "acc_norm": 0.28888888888888886, + "acc_norm_stderr": 0.03915450630414251 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.18421052631578946, + "acc_stderr": 0.0315469804508223, + "acc_norm": 0.18421052631578946, + "acc_norm_stderr": 0.0315469804508223 + }, + 
"harness|hendrycksTest-business_ethics|5": { + "acc": 0.22, + "acc_stderr": 0.0416333199893227, + "acc_norm": 0.22, + "acc_norm_stderr": 0.0416333199893227 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.2679245283018868, + "acc_stderr": 0.027257260322494845, + "acc_norm": 0.2679245283018868, + "acc_norm_stderr": 0.027257260322494845 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.2361111111111111, + "acc_stderr": 0.03551446610810826, + "acc_norm": 0.2361111111111111, + "acc_norm_stderr": 0.03551446610810826 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.21, + "acc_stderr": 0.040936018074033256, + "acc_norm": 0.21, + "acc_norm_stderr": 0.040936018074033256 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.23, + "acc_stderr": 0.04229525846816505, + "acc_norm": 0.23, + "acc_norm_stderr": 0.04229525846816505 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.24, + "acc_stderr": 0.042923469599092816, + "acc_norm": 0.24, + "acc_norm_stderr": 0.042923469599092816 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.18497109826589594, + "acc_stderr": 0.02960562398177123, + "acc_norm": 0.18497109826589594, + "acc_norm_stderr": 0.02960562398177123 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.20588235294117646, + "acc_stderr": 0.040233822736177476, + "acc_norm": 0.20588235294117646, + "acc_norm_stderr": 0.040233822736177476 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.22, + "acc_stderr": 0.04163331998932269, + "acc_norm": 0.22, + "acc_norm_stderr": 0.04163331998932269 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.28085106382978725, + "acc_stderr": 0.029379170464124825, + "acc_norm": 0.28085106382978725, + "acc_norm_stderr": 0.029379170464124825 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.2543859649122807, + "acc_stderr": 0.040969851398436716, + "acc_norm": 0.2543859649122807, + "acc_norm_stderr": 0.040969851398436716 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.2482758620689655, + "acc_stderr": 0.03600105692727771, + "acc_norm": 0.2482758620689655, + "acc_norm_stderr": 0.03600105692727771 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.28835978835978837, + "acc_stderr": 0.02333065405453589, + "acc_norm": 0.28835978835978837, + "acc_norm_stderr": 0.02333065405453589 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.19047619047619047, + "acc_stderr": 0.03512207412302054, + "acc_norm": 0.19047619047619047, + "acc_norm_stderr": 0.03512207412302054 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.31, + "acc_stderr": 0.04648231987117316, + "acc_norm": 0.31, + "acc_norm_stderr": 0.04648231987117316 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.24838709677419354, + "acc_stderr": 0.024580028921481003, + "acc_norm": 0.24838709677419354, + "acc_norm_stderr": 0.024580028921481003 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.21182266009852216, + "acc_stderr": 0.028748983689941054, + "acc_norm": 0.21182266009852216, + "acc_norm_stderr": 0.028748983689941054 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.25, + "acc_stderr": 0.04351941398892446, + "acc_norm": 0.25, + "acc_norm_stderr": 0.04351941398892446 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.23030303030303031, + "acc_stderr": 0.03287666758603489, + "acc_norm": 0.23030303030303031, + "acc_norm_stderr": 
0.03287666758603489 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.20707070707070707, + "acc_stderr": 0.02886977846026706, + "acc_norm": 0.20707070707070707, + "acc_norm_stderr": 0.02886977846026706 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.23316062176165803, + "acc_stderr": 0.03051611137147601, + "acc_norm": 0.23316062176165803, + "acc_norm_stderr": 0.03051611137147601 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.3076923076923077, + "acc_stderr": 0.023400928918310512, + "acc_norm": 0.3076923076923077, + "acc_norm_stderr": 0.023400928918310512 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.24074074074074073, + "acc_stderr": 0.026067159222275798, + "acc_norm": 0.24074074074074073, + "acc_norm_stderr": 0.026067159222275798 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.2647058823529412, + "acc_stderr": 0.02865749128507196, + "acc_norm": 0.2647058823529412, + "acc_norm_stderr": 0.02865749128507196 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.25165562913907286, + "acc_stderr": 0.03543304234389985, + "acc_norm": 0.25165562913907286, + "acc_norm_stderr": 0.03543304234389985 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.23853211009174313, + "acc_stderr": 0.01827257581023187, + "acc_norm": 0.23853211009174313, + "acc_norm_stderr": 0.01827257581023187 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.3287037037037037, + "acc_stderr": 0.03203614084670058, + "acc_norm": 0.3287037037037037, + "acc_norm_stderr": 0.03203614084670058 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.2647058823529412, + "acc_stderr": 0.030964517926923403, + "acc_norm": 0.2647058823529412, + "acc_norm_stderr": 0.030964517926923403 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.2616033755274262, + "acc_stderr": 0.028609516716994934, + "acc_norm": 0.2616033755274262, + "acc_norm_stderr": 0.028609516716994934 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.3632286995515695, + "acc_stderr": 0.03227790442850499, + "acc_norm": 0.3632286995515695, + "acc_norm_stderr": 0.03227790442850499 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.2748091603053435, + "acc_stderr": 0.03915345408847835, + "acc_norm": 0.2748091603053435, + "acc_norm_stderr": 0.03915345408847835 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.256198347107438, + "acc_stderr": 0.03984979653302872, + "acc_norm": 0.256198347107438, + "acc_norm_stderr": 0.03984979653302872 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.2777777777777778, + "acc_stderr": 0.04330043749650743, + "acc_norm": 0.2777777777777778, + "acc_norm_stderr": 0.04330043749650743 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.25153374233128833, + "acc_stderr": 0.03408997886857529, + "acc_norm": 0.25153374233128833, + "acc_norm_stderr": 0.03408997886857529 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.33035714285714285, + "acc_stderr": 0.04464285714285713, + "acc_norm": 0.33035714285714285, + "acc_norm_stderr": 0.04464285714285713 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.22330097087378642, + "acc_stderr": 0.04123553189891431, + "acc_norm": 0.22330097087378642, + "acc_norm_stderr": 0.04123553189891431 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.2692307692307692, + "acc_stderr": 0.029058588303748842, + "acc_norm": 0.2692307692307692, 
+ "acc_norm_stderr": 0.029058588303748842 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.28, + "acc_stderr": 0.04512608598542128, + "acc_norm": 0.28, + "acc_norm_stderr": 0.04512608598542128 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.2886334610472541, + "acc_stderr": 0.016203792703197797, + "acc_norm": 0.2886334610472541, + "acc_norm_stderr": 0.016203792703197797 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.2023121387283237, + "acc_stderr": 0.021628077380196137, + "acc_norm": 0.2023121387283237, + "acc_norm_stderr": 0.021628077380196137 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.2424581005586592, + "acc_stderr": 0.014333522059217889, + "acc_norm": 0.2424581005586592, + "acc_norm_stderr": 0.014333522059217889 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.21241830065359477, + "acc_stderr": 0.02342037547829613, + "acc_norm": 0.21241830065359477, + "acc_norm_stderr": 0.02342037547829613 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.28938906752411575, + "acc_stderr": 0.025755865922632935, + "acc_norm": 0.28938906752411575, + "acc_norm_stderr": 0.025755865922632935 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.2777777777777778, + "acc_stderr": 0.024922001168886335, + "acc_norm": 0.2777777777777778, + "acc_norm_stderr": 0.024922001168886335 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.24113475177304963, + "acc_stderr": 0.025518731049537773, + "acc_norm": 0.24113475177304963, + "acc_norm_stderr": 0.025518731049537773 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.24445893089960888, + "acc_stderr": 0.0109764250131139, + "acc_norm": 0.24445893089960888, + "acc_norm_stderr": 0.0109764250131139 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.4411764705882353, + "acc_stderr": 0.030161911930767102, + "acc_norm": 0.4411764705882353, + "acc_norm_stderr": 0.030161911930767102 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.24673202614379086, + "acc_stderr": 0.0174408203674025, + "acc_norm": 0.24673202614379086, + "acc_norm_stderr": 0.0174408203674025 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.2909090909090909, + "acc_stderr": 0.04350271442923243, + "acc_norm": 0.2909090909090909, + "acc_norm_stderr": 0.04350271442923243 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.1673469387755102, + "acc_stderr": 0.023897144768914524, + "acc_norm": 0.1673469387755102, + "acc_norm_stderr": 0.023897144768914524 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.23880597014925373, + "acc_stderr": 0.030147775935409224, + "acc_norm": 0.23880597014925373, + "acc_norm_stderr": 0.030147775935409224 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.2, + "acc_stderr": 0.040201512610368445, + "acc_norm": 0.2, + "acc_norm_stderr": 0.040201512610368445 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.3072289156626506, + "acc_stderr": 0.03591566797824663, + "acc_norm": 0.3072289156626506, + "acc_norm_stderr": 0.03591566797824663 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.22807017543859648, + "acc_stderr": 0.03218093795602357, + "acc_norm": 0.22807017543859648, + "acc_norm_stderr": 0.03218093795602357 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.204406364749082, + "mc1_stderr": 0.014117174337432618, + "mc2": 0.33821339956551233, + "mc2_stderr": 0.013513197627302775 + }, + "harness|winogrande|5": { + "acc": 0.5840568271507498, + "acc_stderr": 
0.013852485356798259 + }, + "harness|drop|3": { + "em": 0.0035654362416107383, + "em_stderr": 0.0006104082299890361, + "f1": 0.0542627936241611, + "f1_stderr": 0.0013577132939932642 + }, + "harness|gsm8k|5": { + "acc": 0.009855951478392721, + "acc_stderr": 0.0027210765770416586 + }, + "all": { + "acc": 0.25955570864218946, + "acc_stderr": 0.03082824647414592, + "acc_norm": 0.26098868479559406, + "acc_norm_stderr": 0.031598983776389734, + "mc1": 0.204406364749082, + "mc1_stderr": 0.014117174337432618, + "mc2": 0.33821339956551233, + "mc2_stderr": 0.013513197627302775, + "em": 0.0035654362416107383, + "em_stderr": 0.0006104082299890361, + "f1": 0.0542627936241611, + "f1_stderr": 0.0013577132939932642 + } + }, + "versions": { + "all": 0, + "harness|arc:challenge|25": 0, + "harness|drop|3": 1, + "harness|gsm8k|5": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + 
"harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "harness|winogrande|5": 0 + }, + "config_tasks": { + "harness|arc:challenge": "LM Harness task", + "harness|drop": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + 
"harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|arc:challenge|25": { + "hashes": { + "hash_examples": "17b0cae357c0259e", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "c2d55d68c4441c39", + "hash_cont_tokens": "e8abf848493b50f7" + }, + "truncated": 0, + "non_truncated": 1172, + "padded": 4687, + "non_padded": 0, + "effective_few_shots": 25.0, + "num_truncated_few_shots": 0 + }, + "harness|hellaswag|10": { + "hashes": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "38dc8458e001ab84", + "hash_cont_tokens": "9fe0a5c42e1532db" + }, + "truncated": 0, + "non_truncated": 10042, + "padded": 40019, + "non_padded": 149, + "effective_few_shots": 10.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hashes": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "8ff523ec326d5d55", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-anatomy|5": { + "hashes": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "742bd6a389a8ef40", + "hash_cont_tokens": "f11971a765cb609f" + }, + "truncated": 0, + "non_truncated": 135, + "padded": 540, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-astronomy|5": { + "hashes": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "aa9743839c83bd9f", + "hash_cont_tokens": "440a970fadecdc7b" + }, + "truncated": 0, + "non_truncated": 152, + "padded": 608, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-business_ethics|5": { + "hashes": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "60f6ed52e2a2987a", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hashes": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "6080d9f3c5930be0", + "hash_cont_tokens": "7ecd60c25b9bfe5b" + }, + "truncated": 0, + "non_truncated": 265, + "padded": 1060, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_biology|5": { + "hashes": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "873319724ad65589", + "hash_cont_tokens": "875cde3af7a0ee14" + }, + "truncated": 0, + "non_truncated": 144, + 
"padded": 564, + "non_padded": 12, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_chemistry|5": { + "hashes": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "8366d04d12b154a7", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_computer_science|5": { + "hashes": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "1724a282fb269fd7", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_mathematics|5": { + "hashes": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "b7aa815781eae172", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_medicine|5": { + "hashes": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "5e69bf9422c979cd", + "hash_cont_tokens": "702fb6d82ff0d6ac" + }, + "truncated": 0, + "non_truncated": 173, + "padded": 692, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_physics|5": { + "hashes": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "32b28762dd077c78", + "hash_cont_tokens": "f7b8097afc16a47c" + }, + "truncated": 0, + "non_truncated": 102, + "padded": 404, + "non_padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-computer_security|5": { + "hashes": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "19dd0e1895125d49", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hashes": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "761c7ce187b3338a", + "hash_cont_tokens": "aa0e8bc655f2f641" + }, + "truncated": 0, + "non_truncated": 235, + "padded": 940, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-econometrics|5": { + "hashes": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "dae74024ebc12b2b", + "hash_cont_tokens": "b1cc6e7e9fcd3827" + }, + "truncated": 0, + "non_truncated": 114, + "padded": 456, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hashes": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "5fa8050688a246ed", + "hash_cont_tokens": "2425a3f084a591ef" + }, + "truncated": 0, + "non_truncated": 145, + "padded": 580, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + 
"harness|hendrycksTest-elementary_mathematics|5": { + "hashes": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "2da3f8d7d1515cc6", + "hash_cont_tokens": "bd87bf0c060fd925" + }, + "truncated": 0, + "non_truncated": 378, + "padded": 1512, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-formal_logic|5": { + "hashes": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "907de61bbe46dada", + "hash_cont_tokens": "eb8932890e0605db" + }, + "truncated": 0, + "non_truncated": 126, + "padded": 504, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-global_facts|5": { + "hashes": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "d7549fe9ac133643", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_biology|5": { + "hashes": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "b449ae8cd622fb96", + "hash_cont_tokens": "1ddcb86d28cde266" + }, + "truncated": 0, + "non_truncated": 310, + "padded": 1240, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hashes": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "a447bd1574b5e26c", + "hash_cont_tokens": "176c8dcff38c5f8f" + }, + "truncated": 0, + "non_truncated": 203, + "padded": 812, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hashes": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "55065fe953492209", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hashes": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "f1f73dd687da18d7", + "hash_cont_tokens": "674fc454bdc5ac93" + }, + "truncated": 660, + "non_truncated": -495, + "padded": 0, + "non_padded": 660, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_geography|5": { + "hashes": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "b4f9efd054b0149d", + "hash_cont_tokens": "03a5012b916274ea" + }, + "truncated": 0, + "non_truncated": 198, + "padded": 792, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hashes": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "6e010d01707b5a01", + "hash_cont_tokens": "873d2aab226ba1d8" + }, + "truncated": 0, + "non_truncated": 193, + "padded": 772, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hashes": { + "hash_examples": 
"59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "fc1f6e824ba386d7", + "hash_cont_tokens": "c583432ad27fcfe0" + }, + "truncated": 0, + "non_truncated": 390, + "padded": 1560, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hashes": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "3a485a40c8432ece", + "hash_cont_tokens": "d7907b61bcb8c123" + }, + "truncated": 0, + "non_truncated": 270, + "padded": 1080, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hashes": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "a7dd9ca4bbda3752", + "hash_cont_tokens": "f47f041de50333b9" + }, + "truncated": 0, + "non_truncated": 238, + "padded": 952, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_physics|5": { + "hashes": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "d7ea631399a73865", + "hash_cont_tokens": "0d56317b3e5eedb5" + }, + "truncated": 0, + "non_truncated": 151, + "padded": 604, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hashes": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "d12816cf88146011", + "hash_cont_tokens": "09ba1243e7390c0f" + }, + "truncated": 0, + "non_truncated": 545, + "padded": 2180, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hashes": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "0903f3aba4ea094f", + "hash_cont_tokens": "9cc29889c3d3f77d" + }, + "truncated": 0, + "non_truncated": 216, + "padded": 864, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hashes": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "50c9ff438c85a69e", + "hash_cont_tokens": "cdd0b3dc06d933e5" + }, + "truncated": 816, + "non_truncated": -612, + "padded": 0, + "non_padded": 816, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hashes": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "054824cc474caef5", + "hash_cont_tokens": "e02816433ff28daf" + }, + "truncated": 8, + "non_truncated": 229, + "padded": 940, + "non_padded": 8, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_aging|5": { + "hashes": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "84157fee0b6d0f3c", + "hash_cont_tokens": "142a4a8a1138a214" + }, + "truncated": 0, + "non_truncated": 223, + "padded": 892, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_sexuality|5": { + "hashes": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "ade303e1ae3c016f", + 
"hash_cont_tokens": "bc54813e809b796d" + }, + "truncated": 0, + "non_truncated": 131, + "padded": 524, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-international_law|5": { + "hashes": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "e5482e1c23c23d35", + "hash_cont_tokens": "8ea8c5ff76a15bca" + }, + "truncated": 0, + "non_truncated": 121, + "padded": 484, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-jurisprudence|5": { + "hashes": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "4415eeb9bad0507b", + "hash_cont_tokens": "e3a8cd951b6e3469" + }, + "truncated": 0, + "non_truncated": 108, + "padded": 432, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hashes": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "e6b5271422ecbaa8", + "hash_cont_tokens": "3e9e0bdc248fd88a" + }, + "truncated": 0, + "non_truncated": 163, + "padded": 644, + "non_padded": 8, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-machine_learning|5": { + "hashes": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "e719cb83196977d8", + "hash_cont_tokens": "55b12fb138c6a064" + }, + "truncated": 0, + "non_truncated": 112, + "padded": 448, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-management|5": { + "hashes": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "155da0e62b39e804", + "hash_cont_tokens": "a01d6d39a83c4597" + }, + "truncated": 0, + "non_truncated": 103, + "padded": 412, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-marketing|5": { + "hashes": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "38466c242259e6d3", + "hash_cont_tokens": "6aeaed4d823c98aa" + }, + "truncated": 0, + "non_truncated": 234, + "padded": 932, + "non_padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-medical_genetics|5": { + "hashes": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "0dd129e92538a7f6", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-miscellaneous|5": { + "hashes": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "d108a883fc3e022f", + "hash_cont_tokens": "9b0ab02a64603081" + }, + "truncated": 0, + "non_truncated": 783, + "padded": 3132, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_disputes|5": { + "hashes": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "0e7b7df82884a2d5", + "hash_cont_tokens": "3b8bbe9108e55ce9" + }, + "truncated": 0, + "non_truncated": 346, + "padded": 1364, + "non_padded": 20, + "effective_few_shots": 5.0, + 
"num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hashes": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "7c220f5613cd8426", + "hash_cont_tokens": "3e9bfc0362e97330" + }, + "truncated": 0, + "non_truncated": 895, + "padded": 3580, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-nutrition|5": { + "hashes": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "35de1609a9a763a9", + "hash_cont_tokens": "23b2dc6ee2da4cfc" + }, + "truncated": 0, + "non_truncated": 306, + "padded": 1224, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-philosophy|5": { + "hashes": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "a1dcfa9c80490d06", + "hash_cont_tokens": "9f6ff69d23a48783" + }, + "truncated": 0, + "non_truncated": 311, + "padded": 1244, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-prehistory|5": { + "hashes": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "a091cf645d2415e0", + "hash_cont_tokens": "d6458d743d875837" + }, + "truncated": 0, + "non_truncated": 324, + "padded": 1296, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_accounting|5": { + "hashes": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "e9df32a33f85290c", + "hash_cont_tokens": "922a195f53a35662" + }, + "truncated": 0, + "non_truncated": 282, + "padded": 1128, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_law|5": { + "hashes": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "9178e10bd0763ec4", + "hash_cont_tokens": "2e590029ef41fbcd" + }, + "truncated": 604, + "non_truncated": 930, + "padded": 5524, + "non_padded": 612, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_medicine|5": { + "hashes": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "f5a22012a54f70ea", + "hash_cont_tokens": "7cfee54dbddd5a98" + }, + "truncated": 0, + "non_truncated": 272, + "padded": 1088, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_psychology|5": { + "hashes": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "0f6a92c3a2062b48", + "hash_cont_tokens": "a86677b2a45c20e1" + }, + "truncated": 0, + "non_truncated": 612, + "padded": 2448, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-public_relations|5": { + "hashes": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "29a08e9bfbe9b2f0", + "hash_cont_tokens": "0d756ccaae031757" + }, + "truncated": 0, + "non_truncated": 110, + "padded": 440, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-security_studies|5": { + "hashes": { + "hash_examples": "62bb8197e63d60d4", + 
"hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "d49711415961ced7", + "hash_cont_tokens": "b2229bc2cfbf594b" + }, + "truncated": 0, + "non_truncated": 245, + "padded": 980, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-sociology|5": { + "hashes": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "1de5c52d2b2831d7", + "hash_cont_tokens": "c3a3bdfd177eed5b" + }, + "truncated": 0, + "non_truncated": 201, + "padded": 800, + "non_padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hashes": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "add924961f7f4146", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-virology|5": { + "hashes": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "e0653601c466b1bc", + "hash_cont_tokens": "af8b3658088cb37f" + }, + "truncated": 0, + "non_truncated": 166, + "padded": 664, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-world_religions|5": { + "hashes": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "ac600d612445156d", + "hash_cont_tokens": "060118bef6de4e0a" + }, + "truncated": 0, + "non_truncated": 171, + "padded": 684, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|truthfulqa:mc|0": { + "hashes": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "a03ce28b7fd06aa7", + "hash_cont_tokens": "f5da56a132aab151" + }, + "truncated": 0, + "non_truncated": 817, + "padded": 9996, + "non_padded": 0, + "effective_few_shots": 0.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "72067255e368e24e", + "hash_cont_tokens": "f08975ad6f2d5864" + }, + "truncated": 0, + "non_truncated": 1267, + "padded": 2534, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|drop|3": { + "hashes": { + "hash_examples": "1d27416e8324e9a3", + "hash_full_prompts": "a5513ff9a741b385", + "hash_input_tokens": "61b608e0b5ceed76", + "hash_cont_tokens": "5beea5d67d4adeee" + }, + "truncated": 1263, + "non_truncated": 8273, + "padded": 0, + "non_padded": 9536, + "effective_few_shots": 3.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "bda342e47b5099b2", + "hash_cont_tokens": "0e022f01fc9470bc" + }, + "truncated": 0, + "non_truncated": 1319, + "padded": 0, + "non_padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "4eb459f19fc0f29d", + "hash_full_prompts": "21653ed56f202b4e", + "hash_input_tokens": "6c2529964ad5cacf", + "hash_cont_tokens": "7427008a0b03b306" + }, + "truncated": 3351, + "non_truncated": 34844, + "padded": 111256, + "non_padded": 13152, + "num_truncated_few_shots": 0 + } +} \ No newline at 
end of file diff --git a/eval-results/Jiayi-Pan/Tiny-Vicuna-1B/results_2023-12-03T18-54-15.372610.json b/eval-results/Jiayi-Pan/Tiny-Vicuna-1B/results_2023-12-03T18-54-15.372610.json new file mode 100644 index 0000000000000000000000000000000000000000..f06e0059c2a0040f90705b17f474172875b28bfd --- /dev/null +++ b/eval-results/Jiayi-Pan/Tiny-Vicuna-1B/results_2023-12-03T18-54-15.372610.json @@ -0,0 +1,63 @@ +{ + "config_general": { + "lighteval_sha": "b35d4d84573be82d91c07ea46260f262f72cf69d", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "", + "start_time": 82191.45622114, + "end_time": 84534.608990056, + "total_evaluation_time_secondes": "2343.1527689159993", + "model_name": "Jiayi-Pan/Tiny-Vicuna-1B", + "model_sha": "4b23c3c6a5860b5f5bd3ca27521dc1fb875794aa", + "model_dtype": "torch.float16", + "model_size": "2.06 GB" + }, + "results": { + "harness|gsm8k|5": { + "acc": 0.015163002274450341, + "acc_stderr": 0.003366022949726332 + }, + "all": { + "acc": 0.015163002274450341, + "acc_stderr": 0.003366022949726332 + } + }, + "versions": { + "all": 0, + "harness|gsm8k|5": 0 + }, + "config_tasks": { + "harness|gsm8k": "LM Harness task" + }, + "summary_tasks": { + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "bda342e47b5099b2", + "hash_cont_tokens": "0e022f01fc9470bc" + }, + "truncated": 0, + "non_truncated": 1319, + "padded": 0, + "non_padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "18b756b7813d1bdf", + "hash_full_prompts": "deb3b1dff10b95aa", + "hash_input_tokens": "42036645de5ac59d", + "hash_cont_tokens": "16923eff5e3c220b" + }, + "truncated": 0, + "non_truncated": 1319, + "padded": 0, + "non_padded": 1319, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/JosephusCheung/Guanaco/results_2023-08-12T08-51-35.036959.json b/eval-results/JosephusCheung/Guanaco/results_2023-08-12T08-51-35.036959.json new file mode 100644 index 0000000000000000000000000000000000000000..d61aee61eb4f2b4eea3a4b3f7ec99fef03c3cd02 --- /dev/null +++ b/eval-results/JosephusCheung/Guanaco/results_2023-08-12T08-51-35.036959.json @@ -0,0 +1,1365 @@ +{ + "results": { + "harness|arc:challenge|25": { + "acc": 0.46075085324232085, + "acc_stderr": 0.014566303676636584, + "acc_norm": 0.5017064846416383, + "acc_norm_stderr": 0.014611305705056983 + }, + "harness|hellaswag|10": { + "acc": 0.548496315475005, + "acc_stderr": 0.004966255089212419, + "acc_norm": 0.7269468233419637, + "acc_norm_stderr": 0.004446173999993614 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.26, + "acc_stderr": 0.0440844002276808, + "acc_norm": 0.26, + "acc_norm_stderr": 0.0440844002276808 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.3037037037037037, + "acc_stderr": 0.03972552884785137, + "acc_norm": 0.3037037037037037, + "acc_norm_stderr": 0.03972552884785137 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.24342105263157895, + "acc_stderr": 0.034923496688842384, + "acc_norm": 0.24342105263157895, + "acc_norm_stderr": 0.034923496688842384 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.37, + "acc_stderr": 0.04852365870939099, + "acc_norm": 0.37, + "acc_norm_stderr": 0.04852365870939099 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.27169811320754716, + "acc_stderr": 0.027377706624670713, + 
"acc_norm": 0.27169811320754716, + "acc_norm_stderr": 0.027377706624670713 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.2916666666666667, + "acc_stderr": 0.03800968060554859, + "acc_norm": 0.2916666666666667, + "acc_norm_stderr": 0.03800968060554859 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.21, + "acc_stderr": 0.04093601807403326, + "acc_norm": 0.21, + "acc_norm_stderr": 0.04093601807403326 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.31, + "acc_stderr": 0.04648231987117316, + "acc_norm": 0.31, + "acc_norm_stderr": 0.04648231987117316 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.32, + "acc_stderr": 0.04688261722621504, + "acc_norm": 0.32, + "acc_norm_stderr": 0.04688261722621504 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.2254335260115607, + "acc_stderr": 0.03186209851641145, + "acc_norm": 0.2254335260115607, + "acc_norm_stderr": 0.03186209851641145 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.21568627450980393, + "acc_stderr": 0.04092563958237654, + "acc_norm": 0.21568627450980393, + "acc_norm_stderr": 0.04092563958237654 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.45, + "acc_stderr": 0.05, + "acc_norm": 0.45, + "acc_norm_stderr": 0.05 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.28085106382978725, + "acc_stderr": 0.029379170464124825, + "acc_norm": 0.28085106382978725, + "acc_norm_stderr": 0.029379170464124825 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.21052631578947367, + "acc_stderr": 0.038351539543994194, + "acc_norm": 0.21052631578947367, + "acc_norm_stderr": 0.038351539543994194 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.2689655172413793, + "acc_stderr": 0.03695183311650232, + "acc_norm": 0.2689655172413793, + "acc_norm_stderr": 0.03695183311650232 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.23015873015873015, + "acc_stderr": 0.02167921966369313, + "acc_norm": 0.23015873015873015, + "acc_norm_stderr": 0.02167921966369313 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.25396825396825395, + "acc_stderr": 0.038932596106046734, + "acc_norm": 0.25396825396825395, + "acc_norm_stderr": 0.038932596106046734 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.18, + "acc_stderr": 0.03861229196653694, + "acc_norm": 0.18, + "acc_norm_stderr": 0.03861229196653694 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.267741935483871, + "acc_stderr": 0.025189006660212385, + "acc_norm": 0.267741935483871, + "acc_norm_stderr": 0.025189006660212385 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.21674876847290642, + "acc_stderr": 0.028990331252516235, + "acc_norm": 0.21674876847290642, + "acc_norm_stderr": 0.028990331252516235 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.29, + "acc_stderr": 0.045604802157206845, + "acc_norm": 0.29, + "acc_norm_stderr": 0.045604802157206845 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.3333333333333333, + "acc_stderr": 0.036810508691615486, + "acc_norm": 0.3333333333333333, + "acc_norm_stderr": 0.036810508691615486 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.30808080808080807, + "acc_stderr": 0.03289477330098617, + "acc_norm": 0.30808080808080807, + "acc_norm_stderr": 0.03289477330098617 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.31088082901554404, + 
"acc_stderr": 0.03340361906276586, + "acc_norm": 0.31088082901554404, + "acc_norm_stderr": 0.03340361906276586 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.2794871794871795, + "acc_stderr": 0.022752388839776823, + "acc_norm": 0.2794871794871795, + "acc_norm_stderr": 0.022752388839776823 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.22592592592592592, + "acc_stderr": 0.025497532639609553, + "acc_norm": 0.22592592592592592, + "acc_norm_stderr": 0.025497532639609553 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.29831932773109243, + "acc_stderr": 0.02971914287634285, + "acc_norm": 0.29831932773109243, + "acc_norm_stderr": 0.02971914287634285 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.24503311258278146, + "acc_stderr": 0.03511807571804725, + "acc_norm": 0.24503311258278146, + "acc_norm_stderr": 0.03511807571804725 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.3486238532110092, + "acc_stderr": 0.02043125409071433, + "acc_norm": 0.3486238532110092, + "acc_norm_stderr": 0.02043125409071433 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.20833333333333334, + "acc_stderr": 0.027696910713093936, + "acc_norm": 0.20833333333333334, + "acc_norm_stderr": 0.027696910713093936 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.30392156862745096, + "acc_stderr": 0.032282103870378914, + "acc_norm": 0.30392156862745096, + "acc_norm_stderr": 0.032282103870378914 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.3628691983122363, + "acc_stderr": 0.031299208255302136, + "acc_norm": 0.3628691983122363, + "acc_norm_stderr": 0.031299208255302136 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.42152466367713004, + "acc_stderr": 0.033141902221106564, + "acc_norm": 0.42152466367713004, + "acc_norm_stderr": 0.033141902221106564 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.3511450381679389, + "acc_stderr": 0.04186445163013751, + "acc_norm": 0.3511450381679389, + "acc_norm_stderr": 0.04186445163013751 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.4049586776859504, + "acc_stderr": 0.044811377559424694, + "acc_norm": 0.4049586776859504, + "acc_norm_stderr": 0.044811377559424694 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.3425925925925926, + "acc_stderr": 0.045879047413018084, + "acc_norm": 0.3425925925925926, + "acc_norm_stderr": 0.045879047413018084 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.2883435582822086, + "acc_stderr": 0.03559039531617342, + "acc_norm": 0.2883435582822086, + "acc_norm_stderr": 0.03559039531617342 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.375, + "acc_stderr": 0.04595091388086298, + "acc_norm": 0.375, + "acc_norm_stderr": 0.04595091388086298 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.3106796116504854, + "acc_stderr": 0.045821241601615506, + "acc_norm": 0.3106796116504854, + "acc_norm_stderr": 0.045821241601615506 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.49572649572649574, + "acc_stderr": 0.03275489264382132, + "acc_norm": 0.49572649572649574, + "acc_norm_stderr": 0.03275489264382132 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.32, + "acc_stderr": 0.046882617226215034, + "acc_norm": 0.32, + "acc_norm_stderr": 0.046882617226215034 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.4648786717752235, + "acc_stderr": 0.01783579880629064, + 
"acc_norm": 0.4648786717752235, + "acc_norm_stderr": 0.01783579880629064 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.2976878612716763, + "acc_stderr": 0.024617055388676992, + "acc_norm": 0.2976878612716763, + "acc_norm_stderr": 0.024617055388676992 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.25027932960893856, + "acc_stderr": 0.014487500852850407, + "acc_norm": 0.25027932960893856, + "acc_norm_stderr": 0.014487500852850407 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.29411764705882354, + "acc_stderr": 0.02609016250427904, + "acc_norm": 0.29411764705882354, + "acc_norm_stderr": 0.02609016250427904 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.27009646302250806, + "acc_stderr": 0.02521804037341061, + "acc_norm": 0.27009646302250806, + "acc_norm_stderr": 0.02521804037341061 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.31790123456790126, + "acc_stderr": 0.02591006352824086, + "acc_norm": 0.31790123456790126, + "acc_norm_stderr": 0.02591006352824086 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.2872340425531915, + "acc_stderr": 0.026992199173064356, + "acc_norm": 0.2872340425531915, + "acc_norm_stderr": 0.026992199173064356 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.27249022164276404, + "acc_stderr": 0.011371658294311525, + "acc_norm": 0.27249022164276404, + "acc_norm_stderr": 0.011371658294311525 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.22794117647058823, + "acc_stderr": 0.025483081468029804, + "acc_norm": 0.22794117647058823, + "acc_norm_stderr": 0.025483081468029804 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.3284313725490196, + "acc_stderr": 0.018999707383162666, + "acc_norm": 0.3284313725490196, + "acc_norm_stderr": 0.018999707383162666 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.37272727272727274, + "acc_stderr": 0.046313813194254635, + "acc_norm": 0.37272727272727274, + "acc_norm_stderr": 0.046313813194254635 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.20408163265306123, + "acc_stderr": 0.025801283475090506, + "acc_norm": 0.20408163265306123, + "acc_norm_stderr": 0.025801283475090506 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.31840796019900497, + "acc_stderr": 0.032941184790540944, + "acc_norm": 0.31840796019900497, + "acc_norm_stderr": 0.032941184790540944 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.41, + "acc_stderr": 0.04943110704237102, + "acc_norm": 0.41, + "acc_norm_stderr": 0.04943110704237102 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.29518072289156627, + "acc_stderr": 0.03550920185689629, + "acc_norm": 0.29518072289156627, + "acc_norm_stderr": 0.03550920185689629 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.45614035087719296, + "acc_stderr": 0.03820042586602967, + "acc_norm": 0.45614035087719296, + "acc_norm_stderr": 0.03820042586602967 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.24969400244798043, + "mc1_stderr": 0.015152286907148128, + "mc2": 0.37636042536499775, + "mc2_stderr": 0.014793997053722314 + }, + "all": { + "acc": 0.30986766368957314, + "acc_stderr": 0.03319934163083704, + "acc_norm": 0.31358641181273034, + "acc_norm_stderr": 0.033191289443365884, + "mc1": 0.24969400244798043, + "mc1_stderr": 0.015152286907148128, + "mc2": 0.37636042536499775, + "mc2_stderr": 0.014793997053722314 + } + }, + "versions": { + "harness|arc:challenge|25": 0, + "harness|hellaswag|10": 0, + 
"harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "all": 0 + }, + "config_general": { + "model_name": "JosephusCheung/Guanaco", + "model_sha": "bed6f3bd18f07a4a379525645cbd86d622b12836", + "model_dtype": "torch.float16", + "lighteval_sha": "efe93333f9f25e7d48cc67a6bf362e6d576f727b", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null + }, + "config_tasks": { + "harness|arc:challenge": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + 
"harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task" + }, + "summary_tasks": 
{ + "harness|arc:challenge|25": { + "hashes": { + "hash_examples": "17b0cae357c0259e", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "61571bf68d6d89aa", + "hash_cont_tokens": "ede2b335438f08e9" + }, + "truncated": 0, + "non-truncated": 4687, + "padded": 4687, + "non-padded": 0, + "effective_few_shots": 25.0, + "num_truncated_few_shots": 0 + }, + "harness|hellaswag|10": { + "hashes": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "29906669b1c7054a", + "hash_cont_tokens": "b41cf1ad182d68d5" + }, + "truncated": 0, + "non-truncated": 40168, + "padded": 40113, + "non-padded": 55, + "effective_few_shots": 10.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hashes": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "c54ff61ad0273dd7", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-anatomy|5": { + "hashes": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "be31a1e22aef5f90", + "hash_cont_tokens": "f11971a765cb609f" + }, + "truncated": 0, + "non-truncated": 540, + "padded": 540, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-astronomy|5": { + "hashes": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "277a7b1fad566940", + "hash_cont_tokens": "238bd86950544b29" + }, + "truncated": 0, + "non-truncated": 608, + "padded": 608, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-business_ethics|5": { + "hashes": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "ba552605bc116de5", + "hash_cont_tokens": "f9d6d2a7d7e9a041" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hashes": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "428c7563d0b98ab9", + "hash_cont_tokens": "6af58623d0d5fbcd" + }, + "truncated": 0, + "non-truncated": 1060, + "padded": 1060, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_biology|5": { + "hashes": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "da036601573942e2", + "hash_cont_tokens": "875cde3af7a0ee14" + }, + "truncated": 0, + "non-truncated": 576, + "padded": 576, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_chemistry|5": { + "hashes": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "94e0196d6aded13d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_computer_science|5": { + "hashes": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "6e4d0f4a8d36690b", + 
"hash_cont_tokens": "1ba0c71186b1505e" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_mathematics|5": { + "hashes": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "614054d17109a25d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_medicine|5": { + "hashes": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "1d633b3cc0524ba8", + "hash_cont_tokens": "702fb6d82ff0d6ac" + }, + "truncated": 0, + "non-truncated": 692, + "padded": 692, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_physics|5": { + "hashes": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "5421d9a1af86cbd4", + "hash_cont_tokens": "f7b8097afc16a47c" + }, + "truncated": 0, + "non-truncated": 408, + "padded": 408, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-computer_security|5": { + "hashes": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "5e6b70ecb333cf18", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hashes": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "c2ef11a87264ceed", + "hash_cont_tokens": "aa0e8bc655f2f641" + }, + "truncated": 0, + "non-truncated": 940, + "padded": 940, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-econometrics|5": { + "hashes": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "ecaccd912a4c3978", + "hash_cont_tokens": "a9b1f761089f6acc" + }, + "truncated": 0, + "non-truncated": 456, + "padded": 456, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hashes": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "1590c84291399be8", + "hash_cont_tokens": "2425a3f084a591ef" + }, + "truncated": 0, + "non-truncated": 580, + "padded": 580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hashes": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "3269597f715b0da1", + "hash_cont_tokens": "eb2d5002052b5bc5" + }, + "truncated": 0, + "non-truncated": 1512, + "padded": 1512, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-formal_logic|5": { + "hashes": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "a2800d20f3ab8d7c", + "hash_cont_tokens": "9b30dc19c9b62f60" + }, + "truncated": 0, + "non-truncated": 504, + "padded": 504, + "non-padded": 0, + "effective_few_shots": 
5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-global_facts|5": { + "hashes": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "94ed44b3772505ad", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_biology|5": { + "hashes": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "24423acb928db768", + "hash_cont_tokens": "74217a4e2868536f" + }, + "truncated": 0, + "non-truncated": 1240, + "padded": 1240, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hashes": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "831ff35c474e5cef", + "hash_cont_tokens": "bf39544be0ebf000" + }, + "truncated": 0, + "non-truncated": 812, + "padded": 812, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hashes": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "8c34e0f2bda77358", + "hash_cont_tokens": "43570b3948564b64" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hashes": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "f1f73dd687da18d7", + "hash_cont_tokens": "674fc454bdc5ac93" + }, + "truncated": 660, + "non-truncated": 0, + "padded": 0, + "non-padded": 660, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_geography|5": { + "hashes": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "7c5547c7da5bc793", + "hash_cont_tokens": "03a5012b916274ea" + }, + "truncated": 0, + "non-truncated": 792, + "padded": 792, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hashes": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "f62991cb6a496b05", + "hash_cont_tokens": "50ab225c2f535210" + }, + "truncated": 0, + "non-truncated": 772, + "padded": 772, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hashes": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "4cef2aff6e3d59ed", + "hash_cont_tokens": "c583432ad27fcfe0" + }, + "truncated": 0, + "non-truncated": 1560, + "padded": 1560, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hashes": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "6e2577ea4082ed2b", + "hash_cont_tokens": "1194078d4e38c984" + }, + "truncated": 0, + "non-truncated": 1080, + "padded": 1080, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + 
"harness|hendrycksTest-high_school_microeconomics|5": { + "hashes": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "c5fc9aeb1079c8e4", + "hash_cont_tokens": "f47f041de50333b9" + }, + "truncated": 0, + "non-truncated": 952, + "padded": 952, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_physics|5": { + "hashes": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "555fc385cffa84ca", + "hash_cont_tokens": "6296151cf7fee15c" + }, + "truncated": 0, + "non-truncated": 604, + "padded": 604, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hashes": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "febd23cbf9973b7f", + "hash_cont_tokens": "a490d3db0ea5935a" + }, + "truncated": 0, + "non-truncated": 2180, + "padded": 2180, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hashes": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "424b02981230ee83", + "hash_cont_tokens": "6830ef7d0325d7ef" + }, + "truncated": 0, + "non-truncated": 864, + "padded": 864, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hashes": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "50c9ff438c85a69e", + "hash_cont_tokens": "cdd0b3dc06d933e5" + }, + "truncated": 816, + "non-truncated": 0, + "padded": 0, + "non-padded": 816, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hashes": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "054824cc474caef5", + "hash_cont_tokens": "e0203e3fc1bb0500" + }, + "truncated": 8, + "non-truncated": 940, + "padded": 940, + "non-padded": 8, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_aging|5": { + "hashes": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "541a75f071dcf579", + "hash_cont_tokens": "142a4a8a1138a214" + }, + "truncated": 0, + "non-truncated": 892, + "padded": 892, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_sexuality|5": { + "hashes": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "04269e5c5a257dd9", + "hash_cont_tokens": "bc54813e809b796d" + }, + "truncated": 0, + "non-truncated": 524, + "padded": 524, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-international_law|5": { + "hashes": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "d93ba9d9d38e4397", + "hash_cont_tokens": "63435df622d5437b" + }, + "truncated": 0, + "non-truncated": 484, + "padded": 484, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-jurisprudence|5": { + "hashes": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": 
"0f89ee3fe03d6a21", + "hash_input_tokens": "9eeaccd2698b4f5a", + "hash_cont_tokens": "e3a8cd951b6e3469" + }, + "truncated": 0, + "non-truncated": 432, + "padded": 432, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hashes": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "b4f08f544f2b7576", + "hash_cont_tokens": "5e6ee2ff0404f23c" + }, + "truncated": 0, + "non-truncated": 652, + "padded": 648, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-machine_learning|5": { + "hashes": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "900c2a51f1174b9f", + "hash_cont_tokens": "c81919424db3b267" + }, + "truncated": 0, + "non-truncated": 448, + "padded": 448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-management|5": { + "hashes": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "6b36efb4689c6eca", + "hash_cont_tokens": "a01d6d39a83c4597" + }, + "truncated": 0, + "non-truncated": 412, + "padded": 412, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-marketing|5": { + "hashes": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "2aaac78a0cfed47a", + "hash_cont_tokens": "6aeaed4d823c98aa" + }, + "truncated": 0, + "non-truncated": 936, + "padded": 936, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-medical_genetics|5": { + "hashes": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "886ca823b41c094a", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-miscellaneous|5": { + "hashes": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "72fd71de7675e7d0", + "hash_cont_tokens": "9b0ab02a64603081" + }, + "truncated": 0, + "non-truncated": 3132, + "padded": 3132, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_disputes|5": { + "hashes": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "f3ca0dd8e7a1eb09", + "hash_cont_tokens": "3b8bbe9108e55ce9" + }, + "truncated": 0, + "non-truncated": 1384, + "padded": 1354, + "non-padded": 30, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hashes": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "3e793631e951f23c", + "hash_cont_tokens": "2eae753a177d5460" + }, + "truncated": 0, + "non-truncated": 3580, + "padded": 3580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-nutrition|5": { + "hashes": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "59753c2144ea93af", + "hash_cont_tokens": "29771089bd3c65c6" + }, + "truncated": 0, + "non-truncated": 1224, + "padded": 1224, + 
"non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-philosophy|5": { + "hashes": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "bd8d3dbed15a8c34", + "hash_cont_tokens": "9f6ff69d23a48783" + }, + "truncated": 0, + "non-truncated": 1244, + "padded": 1244, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-prehistory|5": { + "hashes": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "3573cd87facbb7c5", + "hash_cont_tokens": "a789a13af22308bf" + }, + "truncated": 0, + "non-truncated": 1296, + "padded": 1296, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_accounting|5": { + "hashes": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "17e721bc1a7cbb47", + "hash_cont_tokens": "5129a9cfb30c5239" + }, + "truncated": 0, + "non-truncated": 1128, + "padded": 1128, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_law|5": { + "hashes": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "9178e10bd0763ec4", + "hash_cont_tokens": "2e590029ef41fbcd" + }, + "truncated": 604, + "non-truncated": 5532, + "padded": 5524, + "non-padded": 612, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_medicine|5": { + "hashes": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "f5a22012a54f70ea", + "hash_cont_tokens": "cd82e108370cece8" + }, + "truncated": 0, + "non-truncated": 1088, + "padded": 1088, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_psychology|5": { + "hashes": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "0dfb73a8eb3f692c", + "hash_cont_tokens": "61ef0c8a87f9c92d" + }, + "truncated": 0, + "non-truncated": 2448, + "padded": 2448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-public_relations|5": { + "hashes": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "1710c6ba4c9f3cbd", + "hash_cont_tokens": "568f585a259965c1" + }, + "truncated": 0, + "non-truncated": 440, + "padded": 440, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-security_studies|5": { + "hashes": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "d49711415961ced7", + "hash_cont_tokens": "d70cfe096d4fb7bd" + }, + "truncated": 0, + "non-truncated": 980, + "padded": 980, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-sociology|5": { + "hashes": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "828999f7624cbe7e", + "hash_cont_tokens": "c3a3bdfd177eed5b" + }, + "truncated": 0, + "non-truncated": 804, + "padded": 804, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hashes": { 
+ "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "42054621e718dbee", + "hash_cont_tokens": "2568d0e8e36fa959" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-virology|5": { + "hashes": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "6c4f0aa4dc859c04", + "hash_cont_tokens": "c178cccd753d9bc5" + }, + "truncated": 0, + "non-truncated": 664, + "padded": 664, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-world_religions|5": { + "hashes": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "6c75d44e092ff24f", + "hash_cont_tokens": "0a3a3ea5ef49d19c" + }, + "truncated": 0, + "non-truncated": 684, + "padded": 684, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|truthfulqa:mc|0": { + "hashes": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "2738d7ed7075faa7", + "hash_cont_tokens": "6d1691881e252df0" + }, + "truncated": 0, + "non-truncated": 9996, + "padded": 9996, + "non-padded": 0, + "effective_few_shots": 0.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "d84d18e9a963753d", + "hash_full_prompts": "12b540783521a8e6", + "hash_input_tokens": "6fecf578c508db6a", + "hash_cont_tokens": "f4b7b7f3a2788768" + }, + "total_evaluation_time_secondes": "4449.53001499176", + "truncated": 2088, + "non-truncated": 108931, + "padded": 108834, + "non-padded": 2185, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/JosephusCheung/Guanaco/results_2023-09-23T06-44-02.813633.json b/eval-results/JosephusCheung/Guanaco/results_2023-09-23T06-44-02.813633.json new file mode 100644 index 0000000000000000000000000000000000000000..9a675f21b94cb8ea4534d04563d3fa5b566ec2ba --- /dev/null +++ b/eval-results/JosephusCheung/Guanaco/results_2023-09-23T06-44-02.813633.json @@ -0,0 +1,107 @@ +{ + "config_general": { + "model_name": "JosephusCheung/Guanaco", + "model_sha": "bed6f3bd18f07a4a379525645cbd86d622b12836", + "model_size": "12.58 GB", + "model_dtype": "torch.float16", + "lighteval_sha": "0f318ecf002208468154899217b3ba7c6ae09374", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|drop|3": { + "em": 0.23343120805369127, + "em_stderr": 0.004332062137833453, + "f1": 0.2960843120805377, + "f1_stderr": 0.004351433413685765 + }, + "harness|gsm8k|5": { + "acc": 0.0, + "acc_stderr": 0.0 + }, + "harness|winogrande|5": { + "acc": 0.6866614048934491, + "acc_stderr": 0.013036512096747976 + }, + "all": { + "em": 0.23343120805369127, + "em_stderr": 0.004332062137833453, + "f1": 0.2960843120805377, + "f1_stderr": 0.004351433413685765, + "acc": 0.34333070244672453, + "acc_stderr": 0.006518256048373988 + } + }, + "versions": { + "harness|drop|3": 1, + "harness|gsm8k|5": 0, + "harness|winogrande|5": 0, + "all": 0 + }, + "config_tasks": { + "harness|drop": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|drop|3": { + "hashes": { + "hash_examples": "1d27416e8324e9a3", + "hash_full_prompts": "a5513ff9a741b385", 
+ "hash_input_tokens": "61b608e0b5ceed76", + "hash_cont_tokens": "e0bcc89214ecc76a" + }, + "truncated": 1263, + "non-truncated": 8273, + "padded": 0, + "non-padded": 9536, + "effective_few_shots": 3.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "bda342e47b5099b2", + "hash_cont_tokens": "c37e69fd0667fa03" + }, + "truncated": 0, + "non-truncated": 1319, + "padded": 0, + "non-padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "c0bedf98cb040854", + "hash_cont_tokens": "f08975ad6f2d5864" + }, + "truncated": 0, + "non-truncated": 2534, + "padded": 2432, + "non-padded": 102, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "9b4d8993161e637d", + "hash_full_prompts": "08215e527b7e60a5", + "hash_input_tokens": "80afe720f936f8d2", + "hash_cont_tokens": "41cc9c18fd204644" + }, + "total_evaluation_time_secondes": "13844.575718641281", + "truncated": 1263, + "non-truncated": 12126, + "padded": 2432, + "non-padded": 10957, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/JosephusCheung/LL7M/results_2023-10-10T15-26-54.562937.json b/eval-results/JosephusCheung/LL7M/results_2023-10-10T15-26-54.562937.json new file mode 100644 index 0000000000000000000000000000000000000000..6fc116b5b0467ca781ca74e38eec14d8fa4b20b1 --- /dev/null +++ b/eval-results/JosephusCheung/LL7M/results_2023-10-10T15-26-54.562937.json @@ -0,0 +1,1367 @@ +{ + "config_general": { + "model_name": "JosephusCheung/LL7M", + "model_sha": "9b31bbf38a43d41eaf166fb3573f706b23cb1c13", + "model_size": "12.7 GB", + "model_dtype": "torch.bfloat16", + "lighteval_sha": "0f318ecf002208468154899217b3ba7c6ae09374", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|arc:challenge|25": { + "acc": 0.4121160409556314, + "acc_stderr": 0.014383915302225393, + "acc_norm": 0.4496587030716723, + "acc_norm_stderr": 0.014537144444284738 + }, + "harness|hellaswag|10": { + "acc": 0.5034853614817766, + "acc_stderr": 0.004989660180792182, + "acc_norm": 0.6881099382593109, + "acc_norm_stderr": 0.004623184227344774 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.26, + "acc_stderr": 0.04408440022768078, + "acc_norm": 0.26, + "acc_norm_stderr": 0.04408440022768078 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.37777777777777777, + "acc_stderr": 0.04188307537595853, + "acc_norm": 0.37777777777777777, + "acc_norm_stderr": 0.04188307537595853 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.29605263157894735, + "acc_stderr": 0.03715062154998904, + "acc_norm": 0.29605263157894735, + "acc_norm_stderr": 0.03715062154998904 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.35, + "acc_stderr": 0.047937248544110196, + "acc_norm": 0.35, + "acc_norm_stderr": 0.047937248544110196 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.35094339622641507, + "acc_stderr": 0.029373646253234686, + "acc_norm": 0.35094339622641507, + "acc_norm_stderr": 0.029373646253234686 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.3402777777777778, + "acc_stderr": 0.03962135573486219, + "acc_norm": 
0.3402777777777778, + "acc_norm_stderr": 0.03962135573486219 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.26, + "acc_stderr": 0.0440844002276808, + "acc_norm": 0.26, + "acc_norm_stderr": 0.0440844002276808 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.28, + "acc_stderr": 0.045126085985421276, + "acc_norm": 0.28, + "acc_norm_stderr": 0.045126085985421276 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.31, + "acc_stderr": 0.04648231987117316, + "acc_norm": 0.31, + "acc_norm_stderr": 0.04648231987117316 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.34104046242774566, + "acc_stderr": 0.036146654241808254, + "acc_norm": 0.34104046242774566, + "acc_norm_stderr": 0.036146654241808254 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.19607843137254902, + "acc_stderr": 0.03950581861179963, + "acc_norm": 0.19607843137254902, + "acc_norm_stderr": 0.03950581861179963 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.43, + "acc_stderr": 0.049756985195624284, + "acc_norm": 0.43, + "acc_norm_stderr": 0.049756985195624284 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.3446808510638298, + "acc_stderr": 0.03106898596312215, + "acc_norm": 0.3446808510638298, + "acc_norm_stderr": 0.03106898596312215 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.24561403508771928, + "acc_stderr": 0.04049339297748142, + "acc_norm": 0.24561403508771928, + "acc_norm_stderr": 0.04049339297748142 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.4, + "acc_stderr": 0.04082482904638629, + "acc_norm": 0.4, + "acc_norm_stderr": 0.04082482904638629 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.25396825396825395, + "acc_stderr": 0.022418042891113946, + "acc_norm": 0.25396825396825395, + "acc_norm_stderr": 0.022418042891113946 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.2777777777777778, + "acc_stderr": 0.040061680838488774, + "acc_norm": 0.2777777777777778, + "acc_norm_stderr": 0.040061680838488774 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.29, + "acc_stderr": 0.04560480215720684, + "acc_norm": 0.29, + "acc_norm_stderr": 0.04560480215720684 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.3096774193548387, + "acc_stderr": 0.026302774983517418, + "acc_norm": 0.3096774193548387, + "acc_norm_stderr": 0.026302774983517418 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.22660098522167488, + "acc_stderr": 0.02945486383529297, + "acc_norm": 0.22660098522167488, + "acc_norm_stderr": 0.02945486383529297 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.36, + "acc_stderr": 0.04824181513244218, + "acc_norm": 0.36, + "acc_norm_stderr": 0.04824181513244218 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.37575757575757573, + "acc_stderr": 0.03781887353205982, + "acc_norm": 0.37575757575757573, + "acc_norm_stderr": 0.03781887353205982 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.3282828282828283, + "acc_stderr": 0.03345678422756779, + "acc_norm": 0.3282828282828283, + "acc_norm_stderr": 0.03345678422756779 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.41968911917098445, + "acc_stderr": 0.035615873276858834, + "acc_norm": 0.41968911917098445, + "acc_norm_stderr": 0.035615873276858834 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.31025641025641026, + 
"acc_stderr": 0.023454674889404288, + "acc_norm": 0.31025641025641026, + "acc_norm_stderr": 0.023454674889404288 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.2518518518518518, + "acc_stderr": 0.026466117538959912, + "acc_norm": 0.2518518518518518, + "acc_norm_stderr": 0.026466117538959912 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.2857142857142857, + "acc_stderr": 0.02934457250063434, + "acc_norm": 0.2857142857142857, + "acc_norm_stderr": 0.02934457250063434 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.23178807947019867, + "acc_stderr": 0.034454062719870546, + "acc_norm": 0.23178807947019867, + "acc_norm_stderr": 0.034454062719870546 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.4073394495412844, + "acc_stderr": 0.021065986244412877, + "acc_norm": 0.4073394495412844, + "acc_norm_stderr": 0.021065986244412877 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.26851851851851855, + "acc_stderr": 0.030225226160012414, + "acc_norm": 0.26851851851851855, + "acc_norm_stderr": 0.030225226160012414 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.29901960784313725, + "acc_stderr": 0.03213325717373616, + "acc_norm": 0.29901960784313725, + "acc_norm_stderr": 0.03213325717373616 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.4936708860759494, + "acc_stderr": 0.03254462010767859, + "acc_norm": 0.4936708860759494, + "acc_norm_stderr": 0.03254462010767859 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.4304932735426009, + "acc_stderr": 0.033231973029429394, + "acc_norm": 0.4304932735426009, + "acc_norm_stderr": 0.033231973029429394 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.3282442748091603, + "acc_stderr": 0.04118438565806298, + "acc_norm": 0.3282442748091603, + "acc_norm_stderr": 0.04118438565806298 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.3884297520661157, + "acc_stderr": 0.04449270350068382, + "acc_norm": 0.3884297520661157, + "acc_norm_stderr": 0.04449270350068382 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.37037037037037035, + "acc_stderr": 0.04668408033024932, + "acc_norm": 0.37037037037037035, + "acc_norm_stderr": 0.04668408033024932 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.3374233128834356, + "acc_stderr": 0.03714908409935575, + "acc_norm": 0.3374233128834356, + "acc_norm_stderr": 0.03714908409935575 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.38392857142857145, + "acc_stderr": 0.04616143075028547, + "acc_norm": 0.38392857142857145, + "acc_norm_stderr": 0.04616143075028547 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.4563106796116505, + "acc_stderr": 0.049318019942204146, + "acc_norm": 0.4563106796116505, + "acc_norm_stderr": 0.049318019942204146 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.405982905982906, + "acc_stderr": 0.03217180182641086, + "acc_norm": 0.405982905982906, + "acc_norm_stderr": 0.03217180182641086 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.35, + "acc_stderr": 0.047937248544110196, + "acc_norm": 0.35, + "acc_norm_stderr": 0.047937248544110196 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.454661558109834, + "acc_stderr": 0.017806304585052602, + "acc_norm": 0.454661558109834, + "acc_norm_stderr": 0.017806304585052602 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.36127167630057805, + "acc_stderr": 0.025862201852277875, + 
"acc_norm": 0.36127167630057805, + "acc_norm_stderr": 0.025862201852277875 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.2346368715083799, + "acc_stderr": 0.01417304409830368, + "acc_norm": 0.2346368715083799, + "acc_norm_stderr": 0.01417304409830368 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.35947712418300654, + "acc_stderr": 0.027475969910660952, + "acc_norm": 0.35947712418300654, + "acc_norm_stderr": 0.027475969910660952 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.3440514469453376, + "acc_stderr": 0.026981478043648036, + "acc_norm": 0.3440514469453376, + "acc_norm_stderr": 0.026981478043648036 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.3734567901234568, + "acc_stderr": 0.026915003011380157, + "acc_norm": 0.3734567901234568, + "acc_norm_stderr": 0.026915003011380157 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.28368794326241137, + "acc_stderr": 0.026891709428343957, + "acc_norm": 0.28368794326241137, + "acc_norm_stderr": 0.026891709428343957 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.2926988265971317, + "acc_stderr": 0.01162094919584953, + "acc_norm": 0.2926988265971317, + "acc_norm_stderr": 0.01162094919584953 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.38235294117647056, + "acc_stderr": 0.02952009569768777, + "acc_norm": 0.38235294117647056, + "acc_norm_stderr": 0.02952009569768777 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.31699346405228757, + "acc_stderr": 0.018824219512706204, + "acc_norm": 0.31699346405228757, + "acc_norm_stderr": 0.018824219512706204 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.4090909090909091, + "acc_stderr": 0.04709306978661895, + "acc_norm": 0.4090909090909091, + "acc_norm_stderr": 0.04709306978661895 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.42857142857142855, + "acc_stderr": 0.03168091161233882, + "acc_norm": 0.42857142857142855, + "acc_norm_stderr": 0.03168091161233882 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.43283582089552236, + "acc_stderr": 0.03503490923673282, + "acc_norm": 0.43283582089552236, + "acc_norm_stderr": 0.03503490923673282 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.54, + "acc_stderr": 0.05009082659620333, + "acc_norm": 0.54, + "acc_norm_stderr": 0.05009082659620333 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.3674698795180723, + "acc_stderr": 0.03753267402120574, + "acc_norm": 0.3674698795180723, + "acc_norm_stderr": 0.03753267402120574 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.4269005847953216, + "acc_stderr": 0.03793620616529916, + "acc_norm": 0.4269005847953216, + "acc_norm_stderr": 0.03793620616529916 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.26560587515299877, + "mc1_stderr": 0.015461027627253595, + "mc2": 0.41389661402155314, + "mc2_stderr": 0.014667249870313126 + }, + "all": { + "acc": 0.34825966475221526, + "acc_stderr": 0.03432792752430016, + "acc_norm": 0.35202521117414026, + "acc_norm_stderr": 0.03432431317156477, + "mc1": 0.26560587515299877, + "mc1_stderr": 0.015461027627253595, + "mc2": 0.41389661402155314, + "mc2_stderr": 0.014667249870313126 + } + }, + "versions": { + "harness|arc:challenge|25": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + 
"harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "all": 0 + }, + "config_tasks": { + "harness|arc:challenge": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + 
"harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task" + }, + "summary_tasks": { + "harness|arc:challenge|25": { + "hashes": { + "hash_examples": "17b0cae357c0259e", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "61571bf68d6d89aa", + "hash_cont_tokens": "e8abf848493b50f7" + }, + "truncated": 0, + "non-truncated": 4687, + "padded": 4687, + "non-padded": 0, + "effective_few_shots": 25.0, + "num_truncated_few_shots": 0 + }, + "harness|hellaswag|10": { + "hashes": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "29906669b1c7054a", + "hash_cont_tokens": "9fe0a5c42e1532db" + }, 
+ "truncated": 0, + "non-truncated": 40168, + "padded": 40113, + "non-padded": 55, + "effective_few_shots": 10.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hashes": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "c54ff61ad0273dd7", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-anatomy|5": { + "hashes": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "be31a1e22aef5f90", + "hash_cont_tokens": "f11971a765cb609f" + }, + "truncated": 0, + "non-truncated": 540, + "padded": 540, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-astronomy|5": { + "hashes": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "277a7b1fad566940", + "hash_cont_tokens": "440a970fadecdc7b" + }, + "truncated": 0, + "non-truncated": 608, + "padded": 608, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-business_ethics|5": { + "hashes": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "ba552605bc116de5", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hashes": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "428c7563d0b98ab9", + "hash_cont_tokens": "7ecd60c25b9bfe5b" + }, + "truncated": 0, + "non-truncated": 1060, + "padded": 1060, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_biology|5": { + "hashes": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "da036601573942e2", + "hash_cont_tokens": "875cde3af7a0ee14" + }, + "truncated": 0, + "non-truncated": 576, + "padded": 576, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_chemistry|5": { + "hashes": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "94e0196d6aded13d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_computer_science|5": { + "hashes": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "6e4d0f4a8d36690b", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_mathematics|5": { + "hashes": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "614054d17109a25d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + 
"harness|hendrycksTest-college_medicine|5": { + "hashes": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "1d633b3cc0524ba8", + "hash_cont_tokens": "702fb6d82ff0d6ac" + }, + "truncated": 0, + "non-truncated": 692, + "padded": 692, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_physics|5": { + "hashes": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "5421d9a1af86cbd4", + "hash_cont_tokens": "f7b8097afc16a47c" + }, + "truncated": 0, + "non-truncated": 408, + "padded": 408, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-computer_security|5": { + "hashes": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "5e6b70ecb333cf18", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hashes": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "c2ef11a87264ceed", + "hash_cont_tokens": "aa0e8bc655f2f641" + }, + "truncated": 0, + "non-truncated": 940, + "padded": 940, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-econometrics|5": { + "hashes": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "ecaccd912a4c3978", + "hash_cont_tokens": "b1cc6e7e9fcd3827" + }, + "truncated": 0, + "non-truncated": 456, + "padded": 456, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hashes": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "1590c84291399be8", + "hash_cont_tokens": "2425a3f084a591ef" + }, + "truncated": 0, + "non-truncated": 580, + "padded": 580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hashes": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "3269597f715b0da1", + "hash_cont_tokens": "bd87bf0c060fd925" + }, + "truncated": 0, + "non-truncated": 1512, + "padded": 1512, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-formal_logic|5": { + "hashes": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "a2800d20f3ab8d7c", + "hash_cont_tokens": "eb8932890e0605db" + }, + "truncated": 0, + "non-truncated": 504, + "padded": 504, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-global_facts|5": { + "hashes": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "94ed44b3772505ad", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_biology|5": { + "hashes": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + 
"hash_input_tokens": "24423acb928db768", + "hash_cont_tokens": "1ddcb86d28cde266" + }, + "truncated": 0, + "non-truncated": 1240, + "padded": 1240, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hashes": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "831ff35c474e5cef", + "hash_cont_tokens": "176c8dcff38c5f8f" + }, + "truncated": 0, + "non-truncated": 812, + "padded": 812, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hashes": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "8c34e0f2bda77358", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hashes": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "f1f73dd687da18d7", + "hash_cont_tokens": "674fc454bdc5ac93" + }, + "truncated": 660, + "non-truncated": 0, + "padded": 0, + "non-padded": 660, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_geography|5": { + "hashes": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "7c5547c7da5bc793", + "hash_cont_tokens": "03a5012b916274ea" + }, + "truncated": 0, + "non-truncated": 792, + "padded": 792, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hashes": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "f62991cb6a496b05", + "hash_cont_tokens": "873d2aab226ba1d8" + }, + "truncated": 0, + "non-truncated": 772, + "padded": 772, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hashes": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "4cef2aff6e3d59ed", + "hash_cont_tokens": "c583432ad27fcfe0" + }, + "truncated": 0, + "non-truncated": 1560, + "padded": 1560, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hashes": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "6e2577ea4082ed2b", + "hash_cont_tokens": "d7907b61bcb8c123" + }, + "truncated": 0, + "non-truncated": 1080, + "padded": 1080, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hashes": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "c5fc9aeb1079c8e4", + "hash_cont_tokens": "f47f041de50333b9" + }, + "truncated": 0, + "non-truncated": 952, + "padded": 952, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_physics|5": { + "hashes": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "555fc385cffa84ca", + "hash_cont_tokens": 
"0d56317b3e5eedb5" + }, + "truncated": 0, + "non-truncated": 604, + "padded": 604, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hashes": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "febd23cbf9973b7f", + "hash_cont_tokens": "09ba1243e7390c0f" + }, + "truncated": 0, + "non-truncated": 2180, + "padded": 2180, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hashes": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "424b02981230ee83", + "hash_cont_tokens": "9cc29889c3d3f77d" + }, + "truncated": 0, + "non-truncated": 864, + "padded": 864, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hashes": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "50c9ff438c85a69e", + "hash_cont_tokens": "cdd0b3dc06d933e5" + }, + "truncated": 816, + "non-truncated": 0, + "padded": 0, + "non-padded": 816, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hashes": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "054824cc474caef5", + "hash_cont_tokens": "e02816433ff28daf" + }, + "truncated": 8, + "non-truncated": 940, + "padded": 940, + "non-padded": 8, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_aging|5": { + "hashes": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "541a75f071dcf579", + "hash_cont_tokens": "142a4a8a1138a214" + }, + "truncated": 0, + "non-truncated": 892, + "padded": 892, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_sexuality|5": { + "hashes": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "04269e5c5a257dd9", + "hash_cont_tokens": "bc54813e809b796d" + }, + "truncated": 0, + "non-truncated": 524, + "padded": 524, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-international_law|5": { + "hashes": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "d93ba9d9d38e4397", + "hash_cont_tokens": "8ea8c5ff76a15bca" + }, + "truncated": 0, + "non-truncated": 484, + "padded": 484, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-jurisprudence|5": { + "hashes": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "9eeaccd2698b4f5a", + "hash_cont_tokens": "e3a8cd951b6e3469" + }, + "truncated": 0, + "non-truncated": 432, + "padded": 432, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hashes": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "b4f08f544f2b7576", + "hash_cont_tokens": "3e9e0bdc248fd88a" + }, + "truncated": 0, + "non-truncated": 652, + "padded": 648, + "non-padded": 4, + "effective_few_shots": 5.0, + 
"num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-machine_learning|5": { + "hashes": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "900c2a51f1174b9f", + "hash_cont_tokens": "55b12fb138c6a064" + }, + "truncated": 0, + "non-truncated": 448, + "padded": 448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-management|5": { + "hashes": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "6b36efb4689c6eca", + "hash_cont_tokens": "a01d6d39a83c4597" + }, + "truncated": 0, + "non-truncated": 412, + "padded": 412, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-marketing|5": { + "hashes": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "2aaac78a0cfed47a", + "hash_cont_tokens": "6aeaed4d823c98aa" + }, + "truncated": 0, + "non-truncated": 936, + "padded": 936, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-medical_genetics|5": { + "hashes": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "886ca823b41c094a", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-miscellaneous|5": { + "hashes": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "72fd71de7675e7d0", + "hash_cont_tokens": "9b0ab02a64603081" + }, + "truncated": 0, + "non-truncated": 3132, + "padded": 3132, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_disputes|5": { + "hashes": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "f3ca0dd8e7a1eb09", + "hash_cont_tokens": "3b8bbe9108e55ce9" + }, + "truncated": 0, + "non-truncated": 1384, + "padded": 1354, + "non-padded": 30, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hashes": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "3e793631e951f23c", + "hash_cont_tokens": "3e9bfc0362e97330" + }, + "truncated": 0, + "non-truncated": 3580, + "padded": 3580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-nutrition|5": { + "hashes": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "59753c2144ea93af", + "hash_cont_tokens": "23b2dc6ee2da4cfc" + }, + "truncated": 0, + "non-truncated": 1224, + "padded": 1224, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-philosophy|5": { + "hashes": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "bd8d3dbed15a8c34", + "hash_cont_tokens": "9f6ff69d23a48783" + }, + "truncated": 0, + "non-truncated": 1244, + "padded": 1244, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-prehistory|5": { + "hashes": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + 
"hash_input_tokens": "3573cd87facbb7c5", + "hash_cont_tokens": "d6458d743d875837" + }, + "truncated": 0, + "non-truncated": 1296, + "padded": 1296, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_accounting|5": { + "hashes": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "17e721bc1a7cbb47", + "hash_cont_tokens": "922a195f53a35662" + }, + "truncated": 0, + "non-truncated": 1128, + "padded": 1128, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_law|5": { + "hashes": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "9178e10bd0763ec4", + "hash_cont_tokens": "2e590029ef41fbcd" + }, + "truncated": 604, + "non-truncated": 5532, + "padded": 5524, + "non-padded": 612, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_medicine|5": { + "hashes": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "f5a22012a54f70ea", + "hash_cont_tokens": "7cfee54dbddd5a98" + }, + "truncated": 0, + "non-truncated": 1088, + "padded": 1088, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_psychology|5": { + "hashes": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "0dfb73a8eb3f692c", + "hash_cont_tokens": "a86677b2a45c20e1" + }, + "truncated": 0, + "non-truncated": 2448, + "padded": 2448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-public_relations|5": { + "hashes": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "1710c6ba4c9f3cbd", + "hash_cont_tokens": "0d756ccaae031757" + }, + "truncated": 0, + "non-truncated": 440, + "padded": 440, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-security_studies|5": { + "hashes": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "d49711415961ced7", + "hash_cont_tokens": "b2229bc2cfbf594b" + }, + "truncated": 0, + "non-truncated": 980, + "padded": 980, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-sociology|5": { + "hashes": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "828999f7624cbe7e", + "hash_cont_tokens": "c3a3bdfd177eed5b" + }, + "truncated": 0, + "non-truncated": 804, + "padded": 804, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hashes": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "42054621e718dbee", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-virology|5": { + "hashes": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "6c4f0aa4dc859c04", + "hash_cont_tokens": "af8b3658088cb37f" + }, + "truncated": 0, + "non-truncated": 664, + "padded": 
664, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-world_religions|5": { + "hashes": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "6c75d44e092ff24f", + "hash_cont_tokens": "060118bef6de4e0a" + }, + "truncated": 0, + "non-truncated": 684, + "padded": 684, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|truthfulqa:mc|0": { + "hashes": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "2738d7ed7075faa7", + "hash_cont_tokens": "f5da56a132aab151" + }, + "truncated": 0, + "non-truncated": 9996, + "padded": 9996, + "non-padded": 0, + "effective_few_shots": 0.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "d84d18e9a963753d", + "hash_full_prompts": "12b540783521a8e6", + "hash_input_tokens": "6fecf578c508db6a", + "hash_cont_tokens": "71d56183130fecbd" + }, + "total_evaluation_time_secondes": "2875.524067878723", + "truncated": 2088, + "non-truncated": 108931, + "padded": 108834, + "non-padded": 2185, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/JosephusCheung/LL7M/results_2023-10-24T03-13-24.379539.json b/eval-results/JosephusCheung/LL7M/results_2023-10-24T03-13-24.379539.json new file mode 100644 index 0000000000000000000000000000000000000000..85ff93d2431f9971127f61a30e9142d9752bbe72 --- /dev/null +++ b/eval-results/JosephusCheung/LL7M/results_2023-10-24T03-13-24.379539.json @@ -0,0 +1,107 @@ +{ + "config_general": { + "model_name": "JosephusCheung/LL7M", + "model_sha": "9b31bbf38a43d41eaf166fb3573f706b23cb1c13", + "model_size": "12.7 GB", + "model_dtype": "torch.bfloat16", + "lighteval_sha": "0f318ecf002208468154899217b3ba7c6ae09374", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|drop|3": { + "em": 0.019819630872483222, + "em_stderr": 0.0014273827117586067, + "f1": 0.07556312919463118, + "f1_stderr": 0.0018868261588306972 + }, + "harness|gsm8k|5": { + "acc": 0.006065200909780136, + "acc_stderr": 0.00213867030146044 + }, + "harness|winogrande|5": { + "acc": 0.6408839779005525, + "acc_stderr": 0.013483115202120236 + }, + "all": { + "em": 0.019819630872483222, + "em_stderr": 0.0014273827117586067, + "f1": 0.07556312919463118, + "f1_stderr": 0.0018868261588306972, + "acc": 0.3234745894051663, + "acc_stderr": 0.007810892751790338 + } + }, + "versions": { + "harness|drop|3": 1, + "harness|gsm8k|5": 0, + "harness|winogrande|5": 0, + "all": 0 + }, + "config_tasks": { + "harness|drop": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|drop|3": { + "hashes": { + "hash_examples": "1d27416e8324e9a3", + "hash_full_prompts": "a5513ff9a741b385", + "hash_input_tokens": "4f38515ddd508802", + "hash_cont_tokens": "3be4a8a13d73b09f" + }, + "truncated": 1263, + "non-truncated": 8273, + "padded": 0, + "non-padded": 9536, + "effective_few_shots": 3.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "5c1882801e34a45b", + "hash_cont_tokens": "5953d5df12e66c55" + }, + "truncated": 0, + "non-truncated": 1319, + "padded": 0, + "non-padded": 1319, + "effective_few_shots": 5.0, + 
"num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "c0bedf98cb040854", + "hash_cont_tokens": "f08975ad6f2d5864" + }, + "truncated": 0, + "non-truncated": 2534, + "padded": 2432, + "non-padded": 102, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "9b4d8993161e637d", + "hash_full_prompts": "08215e527b7e60a5", + "hash_input_tokens": "b7d2ede05bc9cad7", + "hash_cont_tokens": "624fc8f48375913b" + }, + "total_evaluation_time_secondes": "28160.855958223343", + "truncated": 1263, + "non-truncated": 12126, + "padded": 2432, + "non-padded": 10957, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/JosephusCheung/Pwen-14B-Chat-20_30/results_2023-10-08T18-25-24.586385.json b/eval-results/JosephusCheung/Pwen-14B-Chat-20_30/results_2023-10-08T18-25-24.586385.json new file mode 100644 index 0000000000000000000000000000000000000000..7436f616ee380ea202dabe9ca6fb66333a799b66 --- /dev/null +++ b/eval-results/JosephusCheung/Pwen-14B-Chat-20_30/results_2023-10-08T18-25-24.586385.json @@ -0,0 +1,1367 @@ +{ + "config_general": { + "model_name": "JosephusCheung/Pwen-14B-Chat-20_30", + "model_sha": "e878e1f1f7b533c32beb8e06ebcf0cfa23f3fe9b", + "model_size": "26.54 GB", + "model_dtype": "torch.bfloat16", + "lighteval_sha": "0f318ecf002208468154899217b3ba7c6ae09374", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|arc:challenge|25": { + "acc": 0.5290102389078498, + "acc_stderr": 0.014586776355294321, + "acc_norm": 0.5614334470989761, + "acc_norm_stderr": 0.014500682618212864 + }, + "harness|hellaswag|10": { + "acc": 0.611929894443338, + "acc_stderr": 0.004863147544177516, + "acc_norm": 0.7978490340569607, + "acc_norm_stderr": 0.004007834585541846 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.32, + "acc_stderr": 0.04688261722621505, + "acc_norm": 0.32, + "acc_norm_stderr": 0.04688261722621505 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.562962962962963, + "acc_stderr": 0.04284958639753401, + "acc_norm": 0.562962962962963, + "acc_norm_stderr": 0.04284958639753401 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.6973684210526315, + "acc_stderr": 0.03738520676119668, + "acc_norm": 0.6973684210526315, + "acc_norm_stderr": 0.03738520676119668 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.68, + "acc_stderr": 0.046882617226215034, + "acc_norm": 0.68, + "acc_norm_stderr": 0.046882617226215034 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.6528301886792452, + "acc_stderr": 0.029300101705549652, + "acc_norm": 0.6528301886792452, + "acc_norm_stderr": 0.029300101705549652 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.7083333333333334, + "acc_stderr": 0.038009680605548594, + "acc_norm": 0.7083333333333334, + "acc_norm_stderr": 0.038009680605548594 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.41, + "acc_stderr": 0.049431107042371025, + "acc_norm": 0.41, + "acc_norm_stderr": 0.049431107042371025 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.53, + "acc_stderr": 0.050161355804659205, + "acc_norm": 0.53, + "acc_norm_stderr": 0.050161355804659205 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.41, + "acc_stderr": 0.049431107042371025, + 
"acc_norm": 0.41, + "acc_norm_stderr": 0.049431107042371025 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.6184971098265896, + "acc_stderr": 0.03703851193099522, + "acc_norm": 0.6184971098265896, + "acc_norm_stderr": 0.03703851193099522 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.4117647058823529, + "acc_stderr": 0.048971049527263666, + "acc_norm": 0.4117647058823529, + "acc_norm_stderr": 0.048971049527263666 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.7, + "acc_stderr": 0.046056618647183814, + "acc_norm": 0.7, + "acc_norm_stderr": 0.046056618647183814 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.43829787234042555, + "acc_stderr": 0.03243618636108101, + "acc_norm": 0.43829787234042555, + "acc_norm_stderr": 0.03243618636108101 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.4473684210526316, + "acc_stderr": 0.04677473004491199, + "acc_norm": 0.4473684210526316, + "acc_norm_stderr": 0.04677473004491199 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.5586206896551724, + "acc_stderr": 0.04137931034482757, + "acc_norm": 0.5586206896551724, + "acc_norm_stderr": 0.04137931034482757 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.4947089947089947, + "acc_stderr": 0.02574986828855657, + "acc_norm": 0.4947089947089947, + "acc_norm_stderr": 0.02574986828855657 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.3968253968253968, + "acc_stderr": 0.0437588849272706, + "acc_norm": 0.3968253968253968, + "acc_norm_stderr": 0.0437588849272706 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.41, + "acc_stderr": 0.049431107042371025, + "acc_norm": 0.41, + "acc_norm_stderr": 0.049431107042371025 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.7032258064516129, + "acc_stderr": 0.025988500792411905, + "acc_norm": 0.7032258064516129, + "acc_norm_stderr": 0.025988500792411905 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.541871921182266, + "acc_stderr": 0.03505630140785741, + "acc_norm": 0.541871921182266, + "acc_norm_stderr": 0.03505630140785741 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.64, + "acc_stderr": 0.04824181513244218, + "acc_norm": 0.64, + "acc_norm_stderr": 0.04824181513244218 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.4909090909090909, + "acc_stderr": 0.039036986477484395, + "acc_norm": 0.4909090909090909, + "acc_norm_stderr": 0.039036986477484395 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.797979797979798, + "acc_stderr": 0.02860620428922987, + "acc_norm": 0.797979797979798, + "acc_norm_stderr": 0.02860620428922987 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.7927461139896373, + "acc_stderr": 0.02925282329180363, + "acc_norm": 0.7927461139896373, + "acc_norm_stderr": 0.02925282329180363 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.6076923076923076, + "acc_stderr": 0.02475600038213095, + "acc_norm": 0.6076923076923076, + "acc_norm_stderr": 0.02475600038213095 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.362962962962963, + "acc_stderr": 0.02931820364520686, + "acc_norm": 0.362962962962963, + "acc_norm_stderr": 0.02931820364520686 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.6680672268907563, + "acc_stderr": 0.03058869701378364, + "acc_norm": 0.6680672268907563, + "acc_norm_stderr": 0.03058869701378364 
+ }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.423841059602649, + "acc_stderr": 0.040348466786033974, + "acc_norm": 0.423841059602649, + "acc_norm_stderr": 0.040348466786033974 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.7834862385321101, + "acc_stderr": 0.01765871059444313, + "acc_norm": 0.7834862385321101, + "acc_norm_stderr": 0.01765871059444313 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.5046296296296297, + "acc_stderr": 0.03409825519163572, + "acc_norm": 0.5046296296296297, + "acc_norm_stderr": 0.03409825519163572 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.6176470588235294, + "acc_stderr": 0.03410785338904719, + "acc_norm": 0.6176470588235294, + "acc_norm_stderr": 0.03410785338904719 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.7637130801687764, + "acc_stderr": 0.02765215314415927, + "acc_norm": 0.7637130801687764, + "acc_norm_stderr": 0.02765215314415927 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.5874439461883408, + "acc_stderr": 0.03304062175449297, + "acc_norm": 0.5874439461883408, + "acc_norm_stderr": 0.03304062175449297 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.6946564885496184, + "acc_stderr": 0.0403931497872456, + "acc_norm": 0.6946564885496184, + "acc_norm_stderr": 0.0403931497872456 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.7768595041322314, + "acc_stderr": 0.03800754475228733, + "acc_norm": 0.7768595041322314, + "acc_norm_stderr": 0.03800754475228733 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.6388888888888888, + "acc_stderr": 0.04643454608906275, + "acc_norm": 0.6388888888888888, + "acc_norm_stderr": 0.04643454608906275 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.6748466257668712, + "acc_stderr": 0.03680350371286461, + "acc_norm": 0.6748466257668712, + "acc_norm_stderr": 0.03680350371286461 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.4017857142857143, + "acc_stderr": 0.04653333146973646, + "acc_norm": 0.4017857142857143, + "acc_norm_stderr": 0.04653333146973646 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.7864077669902912, + "acc_stderr": 0.040580420156460344, + "acc_norm": 0.7864077669902912, + "acc_norm_stderr": 0.040580420156460344 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.8076923076923077, + "acc_stderr": 0.02581923325648372, + "acc_norm": 0.8076923076923077, + "acc_norm_stderr": 0.02581923325648372 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.69, + "acc_stderr": 0.04648231987117316, + "acc_norm": 0.69, + "acc_norm_stderr": 0.04648231987117316 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.7496807151979565, + "acc_stderr": 0.015491088951494583, + "acc_norm": 0.7496807151979565, + "acc_norm_stderr": 0.015491088951494583 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.6271676300578035, + "acc_stderr": 0.026033890613576277, + "acc_norm": 0.6271676300578035, + "acc_norm_stderr": 0.026033890613576277 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.38212290502793295, + "acc_stderr": 0.016251139711570765, + "acc_norm": 0.38212290502793295, + "acc_norm_stderr": 0.016251139711570765 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.6928104575163399, + "acc_stderr": 0.026415601914388992, + "acc_norm": 0.6928104575163399, + "acc_norm_stderr": 0.026415601914388992 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.6527331189710611, + 
"acc_stderr": 0.027040745502307336, + "acc_norm": 0.6527331189710611, + "acc_norm_stderr": 0.027040745502307336 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.6388888888888888, + "acc_stderr": 0.026725868809100793, + "acc_norm": 0.6388888888888888, + "acc_norm_stderr": 0.026725868809100793 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.46099290780141844, + "acc_stderr": 0.02973659252642444, + "acc_norm": 0.46099290780141844, + "acc_norm_stderr": 0.02973659252642444 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.423728813559322, + "acc_stderr": 0.012620785155885996, + "acc_norm": 0.423728813559322, + "acc_norm_stderr": 0.012620785155885996 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.6286764705882353, + "acc_stderr": 0.02934980313976587, + "acc_norm": 0.6286764705882353, + "acc_norm_stderr": 0.02934980313976587 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.6062091503267973, + "acc_stderr": 0.019766211991073056, + "acc_norm": 0.6062091503267973, + "acc_norm_stderr": 0.019766211991073056 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.6090909090909091, + "acc_stderr": 0.04673752333670239, + "acc_norm": 0.6090909090909091, + "acc_norm_stderr": 0.04673752333670239 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.6857142857142857, + "acc_stderr": 0.02971932942241748, + "acc_norm": 0.6857142857142857, + "acc_norm_stderr": 0.02971932942241748 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.7810945273631841, + "acc_stderr": 0.029239174636647, + "acc_norm": 0.7810945273631841, + "acc_norm_stderr": 0.029239174636647 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.82, + "acc_stderr": 0.038612291966536934, + "acc_norm": 0.82, + "acc_norm_stderr": 0.038612291966536934 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.4578313253012048, + "acc_stderr": 0.0387862677100236, + "acc_norm": 0.4578313253012048, + "acc_norm_stderr": 0.0387862677100236 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.783625730994152, + "acc_stderr": 0.03158149539338734, + "acc_norm": 0.783625730994152, + "acc_norm_stderr": 0.03158149539338734 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.3268053855569155, + "mc1_stderr": 0.01641987473113503, + "mc2": 0.4701781470729953, + "mc2_stderr": 0.014777434418052576 + }, + "all": { + "acc": 0.5990888068369461, + "acc_stderr": 0.03431005125414193, + "acc_norm": 0.6027895245963486, + "acc_norm_stderr": 0.03429409520845181, + "mc1": 0.3268053855569155, + "mc1_stderr": 0.01641987473113503, + "mc2": 0.4701781470729953, + "mc2_stderr": 0.014777434418052576 + } + }, + "versions": { + "harness|arc:challenge|25": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + 
"harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "all": 0 + }, + "config_tasks": { + "harness|arc:challenge": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM 
Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task" + }, + "summary_tasks": { + "harness|arc:challenge|25": { + "hashes": { + "hash_examples": "17b0cae357c0259e", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "c991f8a5814f8d2f", + "hash_cont_tokens": "bc6e686b575268af" + }, + "truncated": 0, + "non-truncated": 4687, + "padded": 4687, + "non-padded": 0, + "effective_few_shots": 25.0, + "num_truncated_few_shots": 0 + }, + "harness|hellaswag|10": { + "hashes": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "9d221d28a199a09c", + "hash_cont_tokens": "e7e52367a92daa27" + }, + "truncated": 0, + "non-truncated": 40168, + "padded": 40052, + "non-padded": 116, + "effective_few_shots": 10.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hashes": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "5afce491c120616a", + "hash_cont_tokens": "bc75e4dffef3dc0e" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-anatomy|5": { + 
"hashes": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "f59f8967e61fde18", + "hash_cont_tokens": "f9dae0f98ef7c0f2" + }, + "truncated": 0, + "non-truncated": 540, + "padded": 540, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-astronomy|5": { + "hashes": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "2efbd578c3185755", + "hash_cont_tokens": "dff84e206d2f1e0d" + }, + "truncated": 0, + "non-truncated": 608, + "padded": 608, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-business_ethics|5": { + "hashes": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "e328ff0ca8fc7890", + "hash_cont_tokens": "bc75e4dffef3dc0e" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hashes": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "145fa357c13fe43c", + "hash_cont_tokens": "b81dd170f83789d1" + }, + "truncated": 0, + "non-truncated": 1060, + "padded": 1060, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_biology|5": { + "hashes": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "888579887c9a665a", + "hash_cont_tokens": "85c3400292af3bb8" + }, + "truncated": 0, + "non-truncated": 576, + "padded": 569, + "non-padded": 7, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_chemistry|5": { + "hashes": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "e2ca7bc279c63b09", + "hash_cont_tokens": "bc75e4dffef3dc0e" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_computer_science|5": { + "hashes": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "1671195b9f861e25", + "hash_cont_tokens": "bc75e4dffef3dc0e" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_mathematics|5": { + "hashes": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "47ed680b9caddd90", + "hash_cont_tokens": "bc75e4dffef3dc0e" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_medicine|5": { + "hashes": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "f71f719c1032180b", + "hash_cont_tokens": "e5cb48f872b79ee7" + }, + "truncated": 0, + "non-truncated": 692, + "padded": 692, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_physics|5": { + "hashes": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "5bde7875f9f1d5dd", + 
"hash_cont_tokens": "40862171591ad909" + }, + "truncated": 0, + "non-truncated": 408, + "padded": 408, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-computer_security|5": { + "hashes": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "6de5a5feab854eed", + "hash_cont_tokens": "bc75e4dffef3dc0e" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hashes": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "9a95e6bc66294b33", + "hash_cont_tokens": "36bb2a47e8ff1bd8" + }, + "truncated": 0, + "non-truncated": 940, + "padded": 940, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-econometrics|5": { + "hashes": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "b581c488a50d149d", + "hash_cont_tokens": "433685e9aa542c2d" + }, + "truncated": 0, + "non-truncated": 456, + "padded": 456, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hashes": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "0afd8e37a73e499b", + "hash_cont_tokens": "f086b291b3aa0628" + }, + "truncated": 0, + "non-truncated": 580, + "padded": 560, + "non-padded": 20, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hashes": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "6300cbea203e27e1", + "hash_cont_tokens": "4f402da407619e4d" + }, + "truncated": 0, + "non-truncated": 1512, + "padded": 1512, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-formal_logic|5": { + "hashes": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "dff3e10f0162548b", + "hash_cont_tokens": "80d8e3e54d900608" + }, + "truncated": 0, + "non-truncated": 504, + "padded": 504, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-global_facts|5": { + "hashes": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "133115320d06c025", + "hash_cont_tokens": "bc75e4dffef3dc0e" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_biology|5": { + "hashes": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "b9c0577c9c2daf4b", + "hash_cont_tokens": "e07819899bd63630" + }, + "truncated": 0, + "non-truncated": 1240, + "padded": 1240, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hashes": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "154d573ba30378ad", + "hash_cont_tokens": "eb6259a94d61e372" + }, + "truncated": 0, + "non-truncated": 812, + "padded": 812, + "non-padded": 0, + 
"effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hashes": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "91754fb26290a162", + "hash_cont_tokens": "bc75e4dffef3dc0e" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hashes": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "2e32e47bd2233827", + "hash_cont_tokens": "c3336566c025bc59" + }, + "truncated": 0, + "non-truncated": 660, + "padded": 656, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_geography|5": { + "hashes": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "23f9a0b07be2ba2e", + "hash_cont_tokens": "999a32d098465441" + }, + "truncated": 0, + "non-truncated": 792, + "padded": 792, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hashes": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "de99699b0f5b162d", + "hash_cont_tokens": "361410848e01f8ed" + }, + "truncated": 0, + "non-truncated": 772, + "padded": 772, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hashes": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "c96ba9fc2d1deb87", + "hash_cont_tokens": "18f9ae57b2444806" + }, + "truncated": 0, + "non-truncated": 1560, + "padded": 1560, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hashes": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "00509312373e95f1", + "hash_cont_tokens": "a13496e646060699" + }, + "truncated": 0, + "non-truncated": 1080, + "padded": 1080, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hashes": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "56e5bf80535561ec", + "hash_cont_tokens": "791a7a25f0571e59" + }, + "truncated": 0, + "non-truncated": 952, + "padded": 952, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_physics|5": { + "hashes": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "c9b689b4034de87c", + "hash_cont_tokens": "9677b0687811cf73" + }, + "truncated": 0, + "non-truncated": 604, + "padded": 604, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hashes": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "ccecbb5539c34c08", + "hash_cont_tokens": "6393201d9136920e" + }, + "truncated": 0, + "non-truncated": 2180, + "padded": 2180, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + 
"harness|hendrycksTest-high_school_statistics|5": { + "hashes": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "3f75abf85d2b9fe9", + "hash_cont_tokens": "17caccbb3a38c7bf" + }, + "truncated": 0, + "non-truncated": 864, + "padded": 864, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hashes": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "f52124b61354d42e", + "hash_cont_tokens": "7128e2eeb930d3b3" + }, + "truncated": 0, + "non-truncated": 816, + "padded": 816, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hashes": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "b5b75910265dc2ff", + "hash_cont_tokens": "48e22ae63ee54721" + }, + "truncated": 0, + "non-truncated": 948, + "padded": 948, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_aging|5": { + "hashes": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "a26fe13fa58cbbed", + "hash_cont_tokens": "0f40704815d5b3f6" + }, + "truncated": 0, + "non-truncated": 892, + "padded": 892, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_sexuality|5": { + "hashes": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "ad768773a7782c0c", + "hash_cont_tokens": "a9fdf5917bdddc9b" + }, + "truncated": 0, + "non-truncated": 524, + "padded": 524, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-international_law|5": { + "hashes": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "5e16e7eb92789a03", + "hash_cont_tokens": "c63e45a81fbe97b2" + }, + "truncated": 0, + "non-truncated": 484, + "padded": 484, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-jurisprudence|5": { + "hashes": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "6346e2bce86e76fe", + "hash_cont_tokens": "9df89edb95ea3c08" + }, + "truncated": 0, + "non-truncated": 432, + "padded": 432, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hashes": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "76581e704996be9d", + "hash_cont_tokens": "5b4f21454680a984" + }, + "truncated": 0, + "non-truncated": 652, + "padded": 648, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-machine_learning|5": { + "hashes": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "0425f5feb26f8c3f", + "hash_cont_tokens": "0c2fc7f9e9101fbb" + }, + "truncated": 0, + "non-truncated": 448, + "padded": 448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-management|5": { + "hashes": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + 
"hash_input_tokens": "fb4ebd06a3a58fd2", + "hash_cont_tokens": "1279a23b3bc7b32c" + }, + "truncated": 0, + "non-truncated": 412, + "padded": 412, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-marketing|5": { + "hashes": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "33ff7687ba4867f3", + "hash_cont_tokens": "be76778b3b861344" + }, + "truncated": 0, + "non-truncated": 936, + "padded": 936, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-medical_genetics|5": { + "hashes": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "e6e5c037eb26a498", + "hash_cont_tokens": "bc75e4dffef3dc0e" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-miscellaneous|5": { + "hashes": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "6db91a99fee03712", + "hash_cont_tokens": "c61a0f86b50f0556" + }, + "truncated": 0, + "non-truncated": 3132, + "padded": 3132, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_disputes|5": { + "hashes": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "11bf0e6ef564edfb", + "hash_cont_tokens": "a208a34c74088f6c" + }, + "truncated": 0, + "non-truncated": 1384, + "padded": 1380, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hashes": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "42af46cc9aa77d99", + "hash_cont_tokens": "996ce7a5b6c4aef1" + }, + "truncated": 0, + "non-truncated": 3580, + "padded": 3580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-nutrition|5": { + "hashes": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "484b72f626c82f6b", + "hash_cont_tokens": "9d4280b06a73f2ad" + }, + "truncated": 0, + "non-truncated": 1224, + "padded": 1224, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-philosophy|5": { + "hashes": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "d3b8f0fb55346a71", + "hash_cont_tokens": "9a708d21688a0b16" + }, + "truncated": 0, + "non-truncated": 1244, + "padded": 1244, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-prehistory|5": { + "hashes": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "f8bbc40534f54a72", + "hash_cont_tokens": "ed0ff6b6c4caf978" + }, + "truncated": 0, + "non-truncated": 1296, + "padded": 1296, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_accounting|5": { + "hashes": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "53f58b4e8af11f6e", + "hash_cont_tokens": "4fd1a023ef90b43a" + }, + "truncated": 0, + "non-truncated": 1128, + "padded": 1127, + "non-padded": 1, + 
"effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_law|5": { + "hashes": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "a95688e641cf31f1", + "hash_cont_tokens": "d2c1c75d7c0e6ec5" + }, + "truncated": 0, + "non-truncated": 6136, + "padded": 6136, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_medicine|5": { + "hashes": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "fc49c75113daa07a", + "hash_cont_tokens": "ff4c3ef8a56efe40" + }, + "truncated": 0, + "non-truncated": 1088, + "padded": 1088, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_psychology|5": { + "hashes": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "92d9588dfc6ac3f9", + "hash_cont_tokens": "b4566ef91a66db7d" + }, + "truncated": 0, + "non-truncated": 2448, + "padded": 2448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-public_relations|5": { + "hashes": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "6a093aeebb63f500", + "hash_cont_tokens": "b713ae56c89df822" + }, + "truncated": 0, + "non-truncated": 440, + "padded": 440, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-security_studies|5": { + "hashes": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "2fef5cbd88ee376f", + "hash_cont_tokens": "89baef8c4b642ed0" + }, + "truncated": 0, + "non-truncated": 980, + "padded": 980, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-sociology|5": { + "hashes": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "35a984bdddcb71dc", + "hash_cont_tokens": "b92ed9d8dde61395" + }, + "truncated": 0, + "non-truncated": 804, + "padded": 796, + "non-padded": 8, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hashes": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "64e26afac44fd84d", + "hash_cont_tokens": "bc75e4dffef3dc0e" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-virology|5": { + "hashes": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "3bce3760b179a55c", + "hash_cont_tokens": "1c1bf88d7c979ef5" + }, + "truncated": 0, + "non-truncated": 664, + "padded": 664, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-world_religions|5": { + "hashes": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "6554c1be40513fa9", + "hash_cont_tokens": "9fbfaba067301be2" + }, + "truncated": 0, + "non-truncated": 684, + "padded": 684, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|truthfulqa:mc|0": { + "hashes": { + "hash_examples": "23176c0531c7b867", + 
"hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "c1ed17b2cce8daea", + "hash_cont_tokens": "ad4c4cfcbb927635" + }, + "truncated": 0, + "non-truncated": 9996, + "padded": 9996, + "non-padded": 0, + "effective_few_shots": 0.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "d84d18e9a963753d", + "hash_full_prompts": "12b540783521a8e6", + "hash_input_tokens": "41fb26e769733d20", + "hash_cont_tokens": "d6b023af5cbcb9cf" + }, + "total_evaluation_time_secondes": "6007.52667593956", + "truncated": 0, + "non-truncated": 111019, + "padded": 110855, + "non-padded": 164, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/JosephusCheung/Pwen-14B-Chat-20_30/results_2023-10-27T13-38-56.103845.json b/eval-results/JosephusCheung/Pwen-14B-Chat-20_30/results_2023-10-27T13-38-56.103845.json new file mode 100644 index 0000000000000000000000000000000000000000..f5d66e3b4f5df8362490483aad3960545fd9a466 --- /dev/null +++ b/eval-results/JosephusCheung/Pwen-14B-Chat-20_30/results_2023-10-27T13-38-56.103845.json @@ -0,0 +1,107 @@ +{ + "config_general": { + "model_name": "JosephusCheung/Pwen-14B-Chat-20_30", + "model_sha": "0a832caa54d17623a18ecc42d4901118a4620a59", + "model_size": "26.54 GB", + "model_dtype": "torch.bfloat16", + "lighteval_sha": "0f318ecf002208468154899217b3ba7c6ae09374", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|drop|3": { + "em": 0.2828229865771812, + "em_stderr": 0.004612221798127954, + "f1": 0.3398972315436241, + "f1_stderr": 0.004521141568402689 + }, + "harness|gsm8k|5": { + "acc": 0.2699014404852161, + "acc_stderr": 0.012227442856468897 + }, + "harness|winogrande|5": { + "acc": 0.7647987371744278, + "acc_stderr": 0.011920008163650872 + }, + "all": { + "em": 0.2828229865771812, + "em_stderr": 0.004612221798127954, + "f1": 0.3398972315436241, + "f1_stderr": 0.004521141568402689, + "acc": 0.5173500888298219, + "acc_stderr": 0.012073725510059884 + } + }, + "versions": { + "harness|drop|3": 1, + "harness|gsm8k|5": 0, + "harness|winogrande|5": 0, + "all": 0 + }, + "config_tasks": { + "harness|drop": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|drop|3": { + "hashes": { + "hash_examples": "1d27416e8324e9a3", + "hash_full_prompts": "a5513ff9a741b385", + "hash_input_tokens": "e4d9d658ccb42fc3", + "hash_cont_tokens": "3849d7a299278cad" + }, + "truncated": 0, + "non-truncated": 9536, + "padded": 0, + "non-padded": 9536, + "effective_few_shots": 3.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "2282d6efefcc7579", + "hash_cont_tokens": "46ceeb3798361342" + }, + "truncated": 0, + "non-truncated": 1319, + "padded": 0, + "non-padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "288ed7294cb59f7d", + "hash_cont_tokens": "f4a307afe0c47a4a" + }, + "truncated": 0, + "non-truncated": 2534, + "padded": 2429, + "non-padded": 105, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "9b4d8993161e637d", + "hash_full_prompts": "08215e527b7e60a5", 
+ "hash_input_tokens": "329083a90a12723b", + "hash_cont_tokens": "70aa25123a1c5b8a" + }, + "total_evaluation_time_secondes": "12143.512230157852", + "truncated": 0, + "non-truncated": 13389, + "padded": 2429, + "non-padded": 10960, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/JosephusCheung/Pwen-7B-Chat-20_30/results_2023-10-10T07-01-15.573690.json b/eval-results/JosephusCheung/Pwen-7B-Chat-20_30/results_2023-10-10T07-01-15.573690.json new file mode 100644 index 0000000000000000000000000000000000000000..0b043ce3da883318c6d765895579559ff1ffe2d7 --- /dev/null +++ b/eval-results/JosephusCheung/Pwen-7B-Chat-20_30/results_2023-10-10T07-01-15.573690.json @@ -0,0 +1,1367 @@ +{ + "config_general": { + "model_name": "JosephusCheung/Pwen-7B-Chat-20_30", + "model_sha": "e6c38a7d2f4ba7b867fff421c08c02ba1908224e", + "model_size": "14.51 GB", + "model_dtype": "torch.bfloat16", + "lighteval_sha": "0f318ecf002208468154899217b3ba7c6ae09374", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|arc:challenge|25": { + "acc": 0.4854948805460751, + "acc_stderr": 0.014605241081370053, + "acc_norm": 0.514505119453925, + "acc_norm_stderr": 0.014605241081370056 + }, + "harness|hellaswag|10": { + "acc": 0.5524795857398924, + "acc_stderr": 0.0049622205125483525, + "acc_norm": 0.739892451702848, + "acc_norm_stderr": 0.004377965074211627 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.22, + "acc_stderr": 0.041633319989322695, + "acc_norm": 0.22, + "acc_norm_stderr": 0.041633319989322695 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.5555555555555556, + "acc_stderr": 0.04292596718256981, + "acc_norm": 0.5555555555555556, + "acc_norm_stderr": 0.04292596718256981 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.6513157894736842, + "acc_stderr": 0.038781398887976104, + "acc_norm": 0.6513157894736842, + "acc_norm_stderr": 0.038781398887976104 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.71, + "acc_stderr": 0.045604802157206845, + "acc_norm": 0.71, + "acc_norm_stderr": 0.045604802157206845 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.690566037735849, + "acc_stderr": 0.028450154794118634, + "acc_norm": 0.690566037735849, + "acc_norm_stderr": 0.028450154794118634 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.6944444444444444, + "acc_stderr": 0.03852084696008534, + "acc_norm": 0.6944444444444444, + "acc_norm_stderr": 0.03852084696008534 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.47, + "acc_stderr": 0.050161355804659205, + "acc_norm": 0.47, + "acc_norm_stderr": 0.050161355804659205 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.51, + "acc_stderr": 0.05024183937956911, + "acc_norm": 0.51, + "acc_norm_stderr": 0.05024183937956911 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.32, + "acc_stderr": 0.046882617226215034, + "acc_norm": 0.32, + "acc_norm_stderr": 0.046882617226215034 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.6705202312138728, + "acc_stderr": 0.03583901754736412, + "acc_norm": 0.6705202312138728, + "acc_norm_stderr": 0.03583901754736412 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.37254901960784315, + "acc_stderr": 0.048108401480826346, + "acc_norm": 0.37254901960784315, + "acc_norm_stderr": 0.048108401480826346 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.74, + 
"acc_stderr": 0.04408440022768078, + "acc_norm": 0.74, + "acc_norm_stderr": 0.04408440022768078 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.5787234042553191, + "acc_stderr": 0.03227834510146268, + "acc_norm": 0.5787234042553191, + "acc_norm_stderr": 0.03227834510146268 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.39473684210526316, + "acc_stderr": 0.045981880578165414, + "acc_norm": 0.39473684210526316, + "acc_norm_stderr": 0.045981880578165414 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.5241379310344828, + "acc_stderr": 0.0416180850350153, + "acc_norm": 0.5241379310344828, + "acc_norm_stderr": 0.0416180850350153 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.4444444444444444, + "acc_stderr": 0.025591857761382175, + "acc_norm": 0.4444444444444444, + "acc_norm_stderr": 0.025591857761382175 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.48412698412698413, + "acc_stderr": 0.04469881854072606, + "acc_norm": 0.48412698412698413, + "acc_norm_stderr": 0.04469881854072606 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.38, + "acc_stderr": 0.048783173121456316, + "acc_norm": 0.38, + "acc_norm_stderr": 0.048783173121456316 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.7516129032258064, + "acc_stderr": 0.024580028921481003, + "acc_norm": 0.7516129032258064, + "acc_norm_stderr": 0.024580028921481003 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.541871921182266, + "acc_stderr": 0.03505630140785741, + "acc_norm": 0.541871921182266, + "acc_norm_stderr": 0.03505630140785741 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.6, + "acc_stderr": 0.049236596391733084, + "acc_norm": 0.6, + "acc_norm_stderr": 0.049236596391733084 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.7272727272727273, + "acc_stderr": 0.0347769116216366, + "acc_norm": 0.7272727272727273, + "acc_norm_stderr": 0.0347769116216366 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.7929292929292929, + "acc_stderr": 0.02886977846026704, + "acc_norm": 0.7929292929292929, + "acc_norm_stderr": 0.02886977846026704 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.8911917098445595, + "acc_stderr": 0.02247325333276877, + "acc_norm": 0.8911917098445595, + "acc_norm_stderr": 0.02247325333276877 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.6051282051282051, + "acc_stderr": 0.02478431694215639, + "acc_norm": 0.6051282051282051, + "acc_norm_stderr": 0.02478431694215639 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.32222222222222224, + "acc_stderr": 0.028493465091028597, + "acc_norm": 0.32222222222222224, + "acc_norm_stderr": 0.028493465091028597 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.6428571428571429, + "acc_stderr": 0.031124619309328177, + "acc_norm": 0.6428571428571429, + "acc_norm_stderr": 0.031124619309328177 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.37748344370860926, + "acc_stderr": 0.0395802723112157, + "acc_norm": 0.37748344370860926, + "acc_norm_stderr": 0.0395802723112157 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.8275229357798165, + "acc_stderr": 0.016197807956848036, + "acc_norm": 0.8275229357798165, + "acc_norm_stderr": 0.016197807956848036 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.5046296296296297, + "acc_stderr": 
0.03409825519163572, + "acc_norm": 0.5046296296296297, + "acc_norm_stderr": 0.03409825519163572 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.7892156862745098, + "acc_stderr": 0.0286265479124374, + "acc_norm": 0.7892156862745098, + "acc_norm_stderr": 0.0286265479124374 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.7848101265822784, + "acc_stderr": 0.026750826994676177, + "acc_norm": 0.7848101265822784, + "acc_norm_stderr": 0.026750826994676177 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.6905829596412556, + "acc_stderr": 0.031024411740572203, + "acc_norm": 0.6905829596412556, + "acc_norm_stderr": 0.031024411740572203 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.6717557251908397, + "acc_stderr": 0.04118438565806298, + "acc_norm": 0.6717557251908397, + "acc_norm_stderr": 0.04118438565806298 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.7933884297520661, + "acc_stderr": 0.03695980128098824, + "acc_norm": 0.7933884297520661, + "acc_norm_stderr": 0.03695980128098824 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.7407407407407407, + "acc_stderr": 0.042365112580946315, + "acc_norm": 0.7407407407407407, + "acc_norm_stderr": 0.042365112580946315 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.7361963190184049, + "acc_stderr": 0.03462419931615623, + "acc_norm": 0.7361963190184049, + "acc_norm_stderr": 0.03462419931615623 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.45535714285714285, + "acc_stderr": 0.04726835553719099, + "acc_norm": 0.45535714285714285, + "acc_norm_stderr": 0.04726835553719099 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.7572815533980582, + "acc_stderr": 0.042450224863844956, + "acc_norm": 0.7572815533980582, + "acc_norm_stderr": 0.042450224863844956 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.8547008547008547, + "acc_stderr": 0.023086635086841403, + "acc_norm": 0.8547008547008547, + "acc_norm_stderr": 0.023086635086841403 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.73, + "acc_stderr": 0.044619604333847394, + "acc_norm": 0.73, + "acc_norm_stderr": 0.044619604333847394 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.8058748403575989, + "acc_stderr": 0.01414397027665757, + "acc_norm": 0.8058748403575989, + "acc_norm_stderr": 0.01414397027665757 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.6676300578034682, + "acc_stderr": 0.025361168749688218, + "acc_norm": 0.6676300578034682, + "acc_norm_stderr": 0.025361168749688218 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.30837988826815643, + "acc_stderr": 0.015445716910998874, + "acc_norm": 0.30837988826815643, + "acc_norm_stderr": 0.015445716910998874 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.7254901960784313, + "acc_stderr": 0.025553169991826514, + "acc_norm": 0.7254901960784313, + "acc_norm_stderr": 0.025553169991826514 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.7170418006430869, + "acc_stderr": 0.025583062489984824, + "acc_norm": 0.7170418006430869, + "acc_norm_stderr": 0.025583062489984824 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.6820987654320988, + "acc_stderr": 0.02591006352824088, + "acc_norm": 0.6820987654320988, + "acc_norm_stderr": 0.02591006352824088 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.4574468085106383, + "acc_stderr": 0.029719281272236837, + "acc_norm": 0.4574468085106383, + "acc_norm_stderr": 
0.029719281272236837 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.49282920469361147, + "acc_stderr": 0.012768922739553313, + "acc_norm": 0.49282920469361147, + "acc_norm_stderr": 0.012768922739553313 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.6139705882352942, + "acc_stderr": 0.029573269134411124, + "acc_norm": 0.6139705882352942, + "acc_norm_stderr": 0.029573269134411124 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.6209150326797386, + "acc_stderr": 0.019627444748412243, + "acc_norm": 0.6209150326797386, + "acc_norm_stderr": 0.019627444748412243 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.6454545454545455, + "acc_stderr": 0.04582004841505416, + "acc_norm": 0.6454545454545455, + "acc_norm_stderr": 0.04582004841505416 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.6979591836734694, + "acc_stderr": 0.0293936093198798, + "acc_norm": 0.6979591836734694, + "acc_norm_stderr": 0.0293936093198798 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.8159203980099502, + "acc_stderr": 0.027403859410786848, + "acc_norm": 0.8159203980099502, + "acc_norm_stderr": 0.027403859410786848 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.86, + "acc_stderr": 0.03487350880197768, + "acc_norm": 0.86, + "acc_norm_stderr": 0.03487350880197768 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.4879518072289157, + "acc_stderr": 0.03891364495835821, + "acc_norm": 0.4879518072289157, + "acc_norm_stderr": 0.03891364495835821 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.7894736842105263, + "acc_stderr": 0.0312678171466318, + "acc_norm": 0.7894736842105263, + "acc_norm_stderr": 0.0312678171466318 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.31334149326805383, + "mc1_stderr": 0.0162380650690596, + "mc2": 0.47014420938426915, + "mc2_stderr": 0.014571966148559557 + }, + "all": { + "acc": 0.6173946376864352, + "acc_stderr": 0.03337871209335492, + "acc_norm": 0.6210628259045844, + "acc_norm_stderr": 0.03336880945880684, + "mc1": 0.31334149326805383, + "mc1_stderr": 0.0162380650690596, + "mc2": 0.47014420938426915, + "mc2_stderr": 0.014571966148559557 + } + }, + "versions": { + "harness|arc:challenge|25": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + 
"harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "all": 0 + }, + "config_tasks": { + "harness|arc:challenge": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM 
Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task" + }, + "summary_tasks": { + "harness|arc:challenge|25": { + "hashes": { + "hash_examples": "17b0cae357c0259e", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "c991f8a5814f8d2f", + "hash_cont_tokens": "bc6e686b575268af" + }, + "truncated": 0, + "non-truncated": 4687, + "padded": 4687, + "non-padded": 0, + "effective_few_shots": 25.0, + "num_truncated_few_shots": 0 + }, + "harness|hellaswag|10": { + "hashes": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "9d221d28a199a09c", + "hash_cont_tokens": "e7e52367a92daa27" + }, + "truncated": 0, + "non-truncated": 40168, + "padded": 40052, + "non-padded": 116, + "effective_few_shots": 10.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hashes": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "5afce491c120616a", + "hash_cont_tokens": "bc75e4dffef3dc0e" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-anatomy|5": { + "hashes": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "f59f8967e61fde18", + "hash_cont_tokens": "f9dae0f98ef7c0f2" + }, + "truncated": 0, + "non-truncated": 540, + "padded": 540, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-astronomy|5": { + "hashes": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": 
"faf4e80f65de93ca", + "hash_input_tokens": "2efbd578c3185755", + "hash_cont_tokens": "dff84e206d2f1e0d" + }, + "truncated": 0, + "non-truncated": 608, + "padded": 608, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-business_ethics|5": { + "hashes": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "e328ff0ca8fc7890", + "hash_cont_tokens": "bc75e4dffef3dc0e" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hashes": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "145fa357c13fe43c", + "hash_cont_tokens": "b81dd170f83789d1" + }, + "truncated": 0, + "non-truncated": 1060, + "padded": 1060, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_biology|5": { + "hashes": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "888579887c9a665a", + "hash_cont_tokens": "85c3400292af3bb8" + }, + "truncated": 0, + "non-truncated": 576, + "padded": 569, + "non-padded": 7, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_chemistry|5": { + "hashes": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "e2ca7bc279c63b09", + "hash_cont_tokens": "bc75e4dffef3dc0e" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_computer_science|5": { + "hashes": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "1671195b9f861e25", + "hash_cont_tokens": "bc75e4dffef3dc0e" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_mathematics|5": { + "hashes": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "47ed680b9caddd90", + "hash_cont_tokens": "bc75e4dffef3dc0e" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_medicine|5": { + "hashes": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "f71f719c1032180b", + "hash_cont_tokens": "e5cb48f872b79ee7" + }, + "truncated": 0, + "non-truncated": 692, + "padded": 692, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_physics|5": { + "hashes": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "5bde7875f9f1d5dd", + "hash_cont_tokens": "40862171591ad909" + }, + "truncated": 0, + "non-truncated": 408, + "padded": 408, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-computer_security|5": { + "hashes": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "6de5a5feab854eed", + "hash_cont_tokens": "bc75e4dffef3dc0e" + }, + "truncated": 0, + "non-truncated": 
400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hashes": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "9a95e6bc66294b33", + "hash_cont_tokens": "36bb2a47e8ff1bd8" + }, + "truncated": 0, + "non-truncated": 940, + "padded": 940, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-econometrics|5": { + "hashes": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "b581c488a50d149d", + "hash_cont_tokens": "433685e9aa542c2d" + }, + "truncated": 0, + "non-truncated": 456, + "padded": 456, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hashes": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "0afd8e37a73e499b", + "hash_cont_tokens": "f086b291b3aa0628" + }, + "truncated": 0, + "non-truncated": 580, + "padded": 560, + "non-padded": 20, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hashes": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "6300cbea203e27e1", + "hash_cont_tokens": "4f402da407619e4d" + }, + "truncated": 0, + "non-truncated": 1512, + "padded": 1512, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-formal_logic|5": { + "hashes": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "dff3e10f0162548b", + "hash_cont_tokens": "80d8e3e54d900608" + }, + "truncated": 0, + "non-truncated": 504, + "padded": 504, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-global_facts|5": { + "hashes": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "133115320d06c025", + "hash_cont_tokens": "bc75e4dffef3dc0e" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_biology|5": { + "hashes": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "b9c0577c9c2daf4b", + "hash_cont_tokens": "e07819899bd63630" + }, + "truncated": 0, + "non-truncated": 1240, + "padded": 1240, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hashes": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "154d573ba30378ad", + "hash_cont_tokens": "eb6259a94d61e372" + }, + "truncated": 0, + "non-truncated": 812, + "padded": 812, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hashes": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "91754fb26290a162", + "hash_cont_tokens": "bc75e4dffef3dc0e" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + 
"harness|hendrycksTest-high_school_european_history|5": { + "hashes": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "2e32e47bd2233827", + "hash_cont_tokens": "c3336566c025bc59" + }, + "truncated": 0, + "non-truncated": 660, + "padded": 656, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_geography|5": { + "hashes": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "23f9a0b07be2ba2e", + "hash_cont_tokens": "999a32d098465441" + }, + "truncated": 0, + "non-truncated": 792, + "padded": 792, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hashes": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "de99699b0f5b162d", + "hash_cont_tokens": "361410848e01f8ed" + }, + "truncated": 0, + "non-truncated": 772, + "padded": 772, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hashes": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "c96ba9fc2d1deb87", + "hash_cont_tokens": "18f9ae57b2444806" + }, + "truncated": 0, + "non-truncated": 1560, + "padded": 1560, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hashes": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "00509312373e95f1", + "hash_cont_tokens": "a13496e646060699" + }, + "truncated": 0, + "non-truncated": 1080, + "padded": 1080, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hashes": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "56e5bf80535561ec", + "hash_cont_tokens": "791a7a25f0571e59" + }, + "truncated": 0, + "non-truncated": 952, + "padded": 952, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_physics|5": { + "hashes": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "c9b689b4034de87c", + "hash_cont_tokens": "9677b0687811cf73" + }, + "truncated": 0, + "non-truncated": 604, + "padded": 604, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hashes": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "ccecbb5539c34c08", + "hash_cont_tokens": "6393201d9136920e" + }, + "truncated": 0, + "non-truncated": 2180, + "padded": 2180, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hashes": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "3f75abf85d2b9fe9", + "hash_cont_tokens": "17caccbb3a38c7bf" + }, + "truncated": 0, + "non-truncated": 864, + "padded": 864, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hashes": { + 
"hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "f52124b61354d42e", + "hash_cont_tokens": "7128e2eeb930d3b3" + }, + "truncated": 0, + "non-truncated": 816, + "padded": 816, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hashes": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "b5b75910265dc2ff", + "hash_cont_tokens": "48e22ae63ee54721" + }, + "truncated": 0, + "non-truncated": 948, + "padded": 948, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_aging|5": { + "hashes": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "a26fe13fa58cbbed", + "hash_cont_tokens": "0f40704815d5b3f6" + }, + "truncated": 0, + "non-truncated": 892, + "padded": 892, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_sexuality|5": { + "hashes": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "ad768773a7782c0c", + "hash_cont_tokens": "a9fdf5917bdddc9b" + }, + "truncated": 0, + "non-truncated": 524, + "padded": 524, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-international_law|5": { + "hashes": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "5e16e7eb92789a03", + "hash_cont_tokens": "c63e45a81fbe97b2" + }, + "truncated": 0, + "non-truncated": 484, + "padded": 484, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-jurisprudence|5": { + "hashes": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "6346e2bce86e76fe", + "hash_cont_tokens": "9df89edb95ea3c08" + }, + "truncated": 0, + "non-truncated": 432, + "padded": 432, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hashes": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "76581e704996be9d", + "hash_cont_tokens": "5b4f21454680a984" + }, + "truncated": 0, + "non-truncated": 652, + "padded": 648, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-machine_learning|5": { + "hashes": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "0425f5feb26f8c3f", + "hash_cont_tokens": "0c2fc7f9e9101fbb" + }, + "truncated": 0, + "non-truncated": 448, + "padded": 448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-management|5": { + "hashes": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "fb4ebd06a3a58fd2", + "hash_cont_tokens": "1279a23b3bc7b32c" + }, + "truncated": 0, + "non-truncated": 412, + "padded": 412, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-marketing|5": { + "hashes": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "33ff7687ba4867f3", + "hash_cont_tokens": "be76778b3b861344" + }, + 
"truncated": 0, + "non-truncated": 936, + "padded": 936, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-medical_genetics|5": { + "hashes": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "e6e5c037eb26a498", + "hash_cont_tokens": "bc75e4dffef3dc0e" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-miscellaneous|5": { + "hashes": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "6db91a99fee03712", + "hash_cont_tokens": "c61a0f86b50f0556" + }, + "truncated": 0, + "non-truncated": 3132, + "padded": 3132, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_disputes|5": { + "hashes": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "11bf0e6ef564edfb", + "hash_cont_tokens": "a208a34c74088f6c" + }, + "truncated": 0, + "non-truncated": 1384, + "padded": 1380, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hashes": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "42af46cc9aa77d99", + "hash_cont_tokens": "996ce7a5b6c4aef1" + }, + "truncated": 0, + "non-truncated": 3580, + "padded": 3580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-nutrition|5": { + "hashes": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "484b72f626c82f6b", + "hash_cont_tokens": "9d4280b06a73f2ad" + }, + "truncated": 0, + "non-truncated": 1224, + "padded": 1224, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-philosophy|5": { + "hashes": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "d3b8f0fb55346a71", + "hash_cont_tokens": "9a708d21688a0b16" + }, + "truncated": 0, + "non-truncated": 1244, + "padded": 1244, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-prehistory|5": { + "hashes": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "f8bbc40534f54a72", + "hash_cont_tokens": "ed0ff6b6c4caf978" + }, + "truncated": 0, + "non-truncated": 1296, + "padded": 1296, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_accounting|5": { + "hashes": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "53f58b4e8af11f6e", + "hash_cont_tokens": "4fd1a023ef90b43a" + }, + "truncated": 0, + "non-truncated": 1128, + "padded": 1127, + "non-padded": 1, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_law|5": { + "hashes": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "a95688e641cf31f1", + "hash_cont_tokens": "d2c1c75d7c0e6ec5" + }, + "truncated": 0, + "non-truncated": 6136, + "padded": 6136, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + 
"harness|hendrycksTest-professional_medicine|5": { + "hashes": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "fc49c75113daa07a", + "hash_cont_tokens": "ff4c3ef8a56efe40" + }, + "truncated": 0, + "non-truncated": 1088, + "padded": 1088, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_psychology|5": { + "hashes": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "92d9588dfc6ac3f9", + "hash_cont_tokens": "b4566ef91a66db7d" + }, + "truncated": 0, + "non-truncated": 2448, + "padded": 2448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-public_relations|5": { + "hashes": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "6a093aeebb63f500", + "hash_cont_tokens": "b713ae56c89df822" + }, + "truncated": 0, + "non-truncated": 440, + "padded": 440, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-security_studies|5": { + "hashes": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "2fef5cbd88ee376f", + "hash_cont_tokens": "89baef8c4b642ed0" + }, + "truncated": 0, + "non-truncated": 980, + "padded": 980, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-sociology|5": { + "hashes": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "35a984bdddcb71dc", + "hash_cont_tokens": "b92ed9d8dde61395" + }, + "truncated": 0, + "non-truncated": 804, + "padded": 796, + "non-padded": 8, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hashes": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "64e26afac44fd84d", + "hash_cont_tokens": "bc75e4dffef3dc0e" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-virology|5": { + "hashes": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "3bce3760b179a55c", + "hash_cont_tokens": "1c1bf88d7c979ef5" + }, + "truncated": 0, + "non-truncated": 664, + "padded": 664, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-world_religions|5": { + "hashes": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "6554c1be40513fa9", + "hash_cont_tokens": "9fbfaba067301be2" + }, + "truncated": 0, + "non-truncated": 684, + "padded": 684, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|truthfulqa:mc|0": { + "hashes": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "c1ed17b2cce8daea", + "hash_cont_tokens": "ad4c4cfcbb927635" + }, + "truncated": 0, + "non-truncated": 9996, + "padded": 9996, + "non-padded": 0, + "effective_few_shots": 0.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "d84d18e9a963753d", + "hash_full_prompts": "12b540783521a8e6", + "hash_input_tokens": "41fb26e769733d20", + 
"hash_cont_tokens": "d6b023af5cbcb9cf" + }, + "total_evaluation_time_secondes": "4008.6308782100677", + "truncated": 0, + "non-truncated": 111019, + "padded": 110855, + "non-padded": 164, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/JosephusCheung/Pwen-7B-Chat-20_30/results_2023-10-26T02-42-36.258115.json b/eval-results/JosephusCheung/Pwen-7B-Chat-20_30/results_2023-10-26T02-42-36.258115.json new file mode 100644 index 0000000000000000000000000000000000000000..529fe45dabd48a56a5cdd518e151bc463fa22cd5 --- /dev/null +++ b/eval-results/JosephusCheung/Pwen-7B-Chat-20_30/results_2023-10-26T02-42-36.258115.json @@ -0,0 +1,107 @@ +{ + "config_general": { + "model_name": "JosephusCheung/Pwen-7B-Chat-20_30", + "model_sha": "da8f573dfae6cc36e89b8bc40a036feb411978d9", + "model_size": "14.51 GB", + "model_dtype": "torch.bfloat16", + "lighteval_sha": "0f318ecf002208468154899217b3ba7c6ae09374", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|drop|3": { + "em": 0.2954068791946309, + "em_stderr": 0.004672175556184236, + "f1": 0.3814209312080561, + "f1_stderr": 0.004573085663083055 + }, + "harness|gsm8k|5": { + "acc": 0.20621683093252463, + "acc_stderr": 0.011144364089781436 + }, + "harness|winogrande|5": { + "acc": 0.6842936069455406, + "acc_stderr": 0.01306309474300081 + }, + "all": { + "em": 0.2954068791946309, + "em_stderr": 0.004672175556184236, + "f1": 0.3814209312080561, + "f1_stderr": 0.004573085663083055, + "acc": 0.44525521893903264, + "acc_stderr": 0.012103729416391124 + } + }, + "versions": { + "harness|drop|3": 1, + "harness|gsm8k|5": 0, + "harness|winogrande|5": 0, + "all": 0 + }, + "config_tasks": { + "harness|drop": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|drop|3": { + "hashes": { + "hash_examples": "1d27416e8324e9a3", + "hash_full_prompts": "a5513ff9a741b385", + "hash_input_tokens": "e4d9d658ccb42fc3", + "hash_cont_tokens": "3e5fda34e012b148" + }, + "truncated": 0, + "non-truncated": 9536, + "padded": 0, + "non-padded": 9536, + "effective_few_shots": 3.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "2282d6efefcc7579", + "hash_cont_tokens": "3ca497ba221bb53d" + }, + "truncated": 0, + "non-truncated": 1319, + "padded": 0, + "non-padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "288ed7294cb59f7d", + "hash_cont_tokens": "f4a307afe0c47a4a" + }, + "truncated": 0, + "non-truncated": 2534, + "padded": 2429, + "non-padded": 105, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "9b4d8993161e637d", + "hash_full_prompts": "08215e527b7e60a5", + "hash_input_tokens": "329083a90a12723b", + "hash_cont_tokens": "817b8222844b59c1" + }, + "total_evaluation_time_secondes": "7674.348122358322", + "truncated": 0, + "non-truncated": 13389, + "padded": 2429, + "non-padded": 10960, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/JosephusCheung/Pwen-VL-Chat-20_30/results_2023-10-10T08-17-20.929764.json 
b/eval-results/JosephusCheung/Pwen-VL-Chat-20_30/results_2023-10-10T08-17-20.929764.json new file mode 100644 index 0000000000000000000000000000000000000000..8bc1a0a7534ce27c6956b91220ca8194fa06281b --- /dev/null +++ b/eval-results/JosephusCheung/Pwen-VL-Chat-20_30/results_2023-10-10T08-17-20.929764.json @@ -0,0 +1,1367 @@ +{ + "config_general": { + "model_name": "JosephusCheung/Pwen-VL-Chat-20_30", + "model_sha": "64a9b89fb18140fc1af1f11471dc9fe34ebc7446", + "model_size": "14.51 GB", + "model_dtype": "torch.bfloat16", + "lighteval_sha": "0f318ecf002208468154899217b3ba7c6ae09374", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|arc:challenge|25": { + "acc": 0.4709897610921502, + "acc_stderr": 0.014586776355294316, + "acc_norm": 0.5017064846416383, + "acc_norm_stderr": 0.01461130570505699 + }, + "harness|hellaswag|10": { + "acc": 0.5382393945429197, + "acc_stderr": 0.004975167382061832, + "acc_norm": 0.7220673172674766, + "acc_norm_stderr": 0.004470644845242893 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.32, + "acc_stderr": 0.04688261722621504, + "acc_norm": 0.32, + "acc_norm_stderr": 0.04688261722621504 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.5037037037037037, + "acc_stderr": 0.043192236258113324, + "acc_norm": 0.5037037037037037, + "acc_norm_stderr": 0.043192236258113324 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.5789473684210527, + "acc_stderr": 0.04017901275981748, + "acc_norm": 0.5789473684210527, + "acc_norm_stderr": 0.04017901275981748 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.65, + "acc_stderr": 0.04793724854411019, + "acc_norm": 0.65, + "acc_norm_stderr": 0.04793724854411019 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.6415094339622641, + "acc_stderr": 0.02951470358398177, + "acc_norm": 0.6415094339622641, + "acc_norm_stderr": 0.02951470358398177 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.6111111111111112, + "acc_stderr": 0.04076663253918567, + "acc_norm": 0.6111111111111112, + "acc_norm_stderr": 0.04076663253918567 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.45, + "acc_stderr": 0.05, + "acc_norm": 0.45, + "acc_norm_stderr": 0.05 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.5, + "acc_stderr": 0.050251890762960605, + "acc_norm": 0.5, + "acc_norm_stderr": 0.050251890762960605 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.38, + "acc_stderr": 0.048783173121456316, + "acc_norm": 0.38, + "acc_norm_stderr": 0.048783173121456316 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.6184971098265896, + "acc_stderr": 0.03703851193099521, + "acc_norm": 0.6184971098265896, + "acc_norm_stderr": 0.03703851193099521 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.30392156862745096, + "acc_stderr": 0.04576665403207762, + "acc_norm": 0.30392156862745096, + "acc_norm_stderr": 0.04576665403207762 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.71, + "acc_stderr": 0.045604802157206845, + "acc_norm": 0.71, + "acc_norm_stderr": 0.045604802157206845 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.502127659574468, + "acc_stderr": 0.032685726586674915, + "acc_norm": 0.502127659574468, + "acc_norm_stderr": 0.032685726586674915 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.4473684210526316, + "acc_stderr": 0.04677473004491199, + "acc_norm": 
0.4473684210526316, + "acc_norm_stderr": 0.04677473004491199 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.47586206896551725, + "acc_stderr": 0.0416180850350153, + "acc_norm": 0.47586206896551725, + "acc_norm_stderr": 0.0416180850350153 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.3412698412698413, + "acc_stderr": 0.024419234966819067, + "acc_norm": 0.3412698412698413, + "acc_norm_stderr": 0.024419234966819067 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.35714285714285715, + "acc_stderr": 0.04285714285714281, + "acc_norm": 0.35714285714285715, + "acc_norm_stderr": 0.04285714285714281 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.38, + "acc_stderr": 0.04878317312145632, + "acc_norm": 0.38, + "acc_norm_stderr": 0.04878317312145632 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.6774193548387096, + "acc_stderr": 0.026593084516572284, + "acc_norm": 0.6774193548387096, + "acc_norm_stderr": 0.026593084516572284 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.4187192118226601, + "acc_stderr": 0.03471192860518468, + "acc_norm": 0.4187192118226601, + "acc_norm_stderr": 0.03471192860518468 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.6, + "acc_stderr": 0.049236596391733084, + "acc_norm": 0.6, + "acc_norm_stderr": 0.049236596391733084 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.2606060606060606, + "acc_stderr": 0.03427743175816524, + "acc_norm": 0.2606060606060606, + "acc_norm_stderr": 0.03427743175816524 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.7373737373737373, + "acc_stderr": 0.03135305009533084, + "acc_norm": 0.7373737373737373, + "acc_norm_stderr": 0.03135305009533084 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.7979274611398963, + "acc_stderr": 0.02897908979429673, + "acc_norm": 0.7979274611398963, + "acc_norm_stderr": 0.02897908979429673 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.5128205128205128, + "acc_stderr": 0.025342671293807257, + "acc_norm": 0.5128205128205128, + "acc_norm_stderr": 0.025342671293807257 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.26666666666666666, + "acc_stderr": 0.02696242432507382, + "acc_norm": 0.26666666666666666, + "acc_norm_stderr": 0.02696242432507382 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.5462184873949579, + "acc_stderr": 0.03233943468182088, + "acc_norm": 0.5462184873949579, + "acc_norm_stderr": 0.03233943468182088 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.3443708609271523, + "acc_stderr": 0.038796870240733264, + "acc_norm": 0.3443708609271523, + "acc_norm_stderr": 0.038796870240733264 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.7541284403669725, + "acc_stderr": 0.018461940968708443, + "acc_norm": 0.7541284403669725, + "acc_norm_stderr": 0.018461940968708443 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.41203703703703703, + "acc_stderr": 0.03356787758160835, + "acc_norm": 0.41203703703703703, + "acc_norm_stderr": 0.03356787758160835 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.6029411764705882, + "acc_stderr": 0.03434131164719129, + "acc_norm": 0.6029411764705882, + "acc_norm_stderr": 0.03434131164719129 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.7637130801687764, + "acc_stderr": 0.02765215314415927, 
+ "acc_norm": 0.7637130801687764, + "acc_norm_stderr": 0.02765215314415927 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.6905829596412556, + "acc_stderr": 0.03102441174057221, + "acc_norm": 0.6905829596412556, + "acc_norm_stderr": 0.03102441174057221 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.648854961832061, + "acc_stderr": 0.04186445163013751, + "acc_norm": 0.648854961832061, + "acc_norm_stderr": 0.04186445163013751 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.7520661157024794, + "acc_stderr": 0.03941897526516302, + "acc_norm": 0.7520661157024794, + "acc_norm_stderr": 0.03941897526516302 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.6944444444444444, + "acc_stderr": 0.04453197507374984, + "acc_norm": 0.6944444444444444, + "acc_norm_stderr": 0.04453197507374984 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.6441717791411042, + "acc_stderr": 0.03761521380046734, + "acc_norm": 0.6441717791411042, + "acc_norm_stderr": 0.03761521380046734 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.44642857142857145, + "acc_stderr": 0.047184714852195886, + "acc_norm": 0.44642857142857145, + "acc_norm_stderr": 0.047184714852195886 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.6990291262135923, + "acc_stderr": 0.045416094465039476, + "acc_norm": 0.6990291262135923, + "acc_norm_stderr": 0.045416094465039476 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.8290598290598291, + "acc_stderr": 0.02466249684520982, + "acc_norm": 0.8290598290598291, + "acc_norm_stderr": 0.02466249684520982 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.69, + "acc_stderr": 0.04648231987117316, + "acc_norm": 0.69, + "acc_norm_stderr": 0.04648231987117316 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.7624521072796935, + "acc_stderr": 0.015218733046150193, + "acc_norm": 0.7624521072796935, + "acc_norm_stderr": 0.015218733046150193 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.6069364161849711, + "acc_stderr": 0.02629622791561367, + "acc_norm": 0.6069364161849711, + "acc_norm_stderr": 0.02629622791561367 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.3027932960893855, + "acc_stderr": 0.01536686038639711, + "acc_norm": 0.3027932960893855, + "acc_norm_stderr": 0.01536686038639711 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.6339869281045751, + "acc_stderr": 0.02758281141515961, + "acc_norm": 0.6339869281045751, + "acc_norm_stderr": 0.02758281141515961 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.6559485530546624, + "acc_stderr": 0.02698147804364804, + "acc_norm": 0.6559485530546624, + "acc_norm_stderr": 0.02698147804364804 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.6080246913580247, + "acc_stderr": 0.027163686038271146, + "acc_norm": 0.6080246913580247, + "acc_norm_stderr": 0.027163686038271146 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.41134751773049644, + "acc_stderr": 0.02935491115994098, + "acc_norm": 0.41134751773049644, + "acc_norm_stderr": 0.02935491115994098 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.4517601043024772, + "acc_stderr": 0.012710662233660247, + "acc_norm": 0.4517601043024772, + "acc_norm_stderr": 0.012710662233660247 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.5183823529411765, + "acc_stderr": 0.030352303395351964, + "acc_norm": 0.5183823529411765, + "acc_norm_stderr": 0.030352303395351964 + }, + 
"harness|hendrycksTest-professional_psychology|5": { + "acc": 0.5718954248366013, + "acc_stderr": 0.0200176292142131, + "acc_norm": 0.5718954248366013, + "acc_norm_stderr": 0.0200176292142131 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.5818181818181818, + "acc_stderr": 0.04724577405731572, + "acc_norm": 0.5818181818181818, + "acc_norm_stderr": 0.04724577405731572 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.6326530612244898, + "acc_stderr": 0.030862144921087548, + "acc_norm": 0.6326530612244898, + "acc_norm_stderr": 0.030862144921087548 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.7810945273631841, + "acc_stderr": 0.029239174636647, + "acc_norm": 0.7810945273631841, + "acc_norm_stderr": 0.029239174636647 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.78, + "acc_stderr": 0.04163331998932261, + "acc_norm": 0.78, + "acc_norm_stderr": 0.04163331998932261 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.46987951807228917, + "acc_stderr": 0.03885425420866766, + "acc_norm": 0.46987951807228917, + "acc_norm_stderr": 0.03885425420866766 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.783625730994152, + "acc_stderr": 0.03158149539338734, + "acc_norm": 0.783625730994152, + "acc_norm_stderr": 0.03158149539338734 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.2913096695226438, + "mc1_stderr": 0.015905987048184828, + "mc2": 0.42517178573631115, + "mc2_stderr": 0.01461529390566251 + }, + "all": { + "acc": 0.5614045523007456, + "acc_stderr": 0.034472805150990236, + "acc_norm": 0.5650409022375938, + "acc_norm_stderr": 0.03446466967324352, + "mc1": 0.2913096695226438, + "mc1_stderr": 0.015905987048184828, + "mc2": 0.42517178573631115, + "mc2_stderr": 0.01461529390566251 + } + }, + "versions": { + "harness|arc:challenge|25": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + 
"harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "all": 0 + }, + "config_tasks": { + "harness|arc:challenge": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + 
"harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task" + }, + "summary_tasks": { + "harness|arc:challenge|25": { + "hashes": { + "hash_examples": "17b0cae357c0259e", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "c991f8a5814f8d2f", + "hash_cont_tokens": "bc6e686b575268af" + }, + "truncated": 0, + "non-truncated": 4687, + "padded": 4687, + "non-padded": 0, + "effective_few_shots": 25.0, + "num_truncated_few_shots": 0 + }, + "harness|hellaswag|10": { + "hashes": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "9d221d28a199a09c", + "hash_cont_tokens": "e7e52367a92daa27" + }, + "truncated": 0, + "non-truncated": 40168, + "padded": 40052, + "non-padded": 116, + "effective_few_shots": 10.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hashes": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "5afce491c120616a", + "hash_cont_tokens": "bc75e4dffef3dc0e" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-anatomy|5": { + "hashes": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "f59f8967e61fde18", + "hash_cont_tokens": "f9dae0f98ef7c0f2" + }, + "truncated": 0, + "non-truncated": 540, + "padded": 540, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-astronomy|5": { + "hashes": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "2efbd578c3185755", + "hash_cont_tokens": "dff84e206d2f1e0d" + }, + "truncated": 0, + "non-truncated": 608, + "padded": 608, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-business_ethics|5": { + "hashes": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": 
"e328ff0ca8fc7890", + "hash_cont_tokens": "bc75e4dffef3dc0e" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hashes": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "145fa357c13fe43c", + "hash_cont_tokens": "b81dd170f83789d1" + }, + "truncated": 0, + "non-truncated": 1060, + "padded": 1060, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_biology|5": { + "hashes": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "888579887c9a665a", + "hash_cont_tokens": "85c3400292af3bb8" + }, + "truncated": 0, + "non-truncated": 576, + "padded": 569, + "non-padded": 7, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_chemistry|5": { + "hashes": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "e2ca7bc279c63b09", + "hash_cont_tokens": "bc75e4dffef3dc0e" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_computer_science|5": { + "hashes": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "1671195b9f861e25", + "hash_cont_tokens": "bc75e4dffef3dc0e" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_mathematics|5": { + "hashes": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "47ed680b9caddd90", + "hash_cont_tokens": "bc75e4dffef3dc0e" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_medicine|5": { + "hashes": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "f71f719c1032180b", + "hash_cont_tokens": "e5cb48f872b79ee7" + }, + "truncated": 0, + "non-truncated": 692, + "padded": 692, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_physics|5": { + "hashes": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "5bde7875f9f1d5dd", + "hash_cont_tokens": "40862171591ad909" + }, + "truncated": 0, + "non-truncated": 408, + "padded": 408, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-computer_security|5": { + "hashes": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "6de5a5feab854eed", + "hash_cont_tokens": "bc75e4dffef3dc0e" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hashes": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "9a95e6bc66294b33", + "hash_cont_tokens": "36bb2a47e8ff1bd8" + }, + "truncated": 0, + "non-truncated": 940, + "padded": 940, + "non-padded": 0, 
+ "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-econometrics|5": { + "hashes": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "b581c488a50d149d", + "hash_cont_tokens": "433685e9aa542c2d" + }, + "truncated": 0, + "non-truncated": 456, + "padded": 456, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hashes": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "0afd8e37a73e499b", + "hash_cont_tokens": "f086b291b3aa0628" + }, + "truncated": 0, + "non-truncated": 580, + "padded": 560, + "non-padded": 20, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hashes": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "6300cbea203e27e1", + "hash_cont_tokens": "4f402da407619e4d" + }, + "truncated": 0, + "non-truncated": 1512, + "padded": 1512, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-formal_logic|5": { + "hashes": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "dff3e10f0162548b", + "hash_cont_tokens": "80d8e3e54d900608" + }, + "truncated": 0, + "non-truncated": 504, + "padded": 504, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-global_facts|5": { + "hashes": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "133115320d06c025", + "hash_cont_tokens": "bc75e4dffef3dc0e" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_biology|5": { + "hashes": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "b9c0577c9c2daf4b", + "hash_cont_tokens": "e07819899bd63630" + }, + "truncated": 0, + "non-truncated": 1240, + "padded": 1240, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hashes": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "154d573ba30378ad", + "hash_cont_tokens": "eb6259a94d61e372" + }, + "truncated": 0, + "non-truncated": 812, + "padded": 812, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hashes": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "91754fb26290a162", + "hash_cont_tokens": "bc75e4dffef3dc0e" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hashes": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "2e32e47bd2233827", + "hash_cont_tokens": "c3336566c025bc59" + }, + "truncated": 0, + "non-truncated": 660, + "padded": 656, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_geography|5": { + 
"hashes": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "23f9a0b07be2ba2e", + "hash_cont_tokens": "999a32d098465441" + }, + "truncated": 0, + "non-truncated": 792, + "padded": 792, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hashes": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "de99699b0f5b162d", + "hash_cont_tokens": "361410848e01f8ed" + }, + "truncated": 0, + "non-truncated": 772, + "padded": 772, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hashes": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "c96ba9fc2d1deb87", + "hash_cont_tokens": "18f9ae57b2444806" + }, + "truncated": 0, + "non-truncated": 1560, + "padded": 1560, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hashes": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "00509312373e95f1", + "hash_cont_tokens": "a13496e646060699" + }, + "truncated": 0, + "non-truncated": 1080, + "padded": 1080, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hashes": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "56e5bf80535561ec", + "hash_cont_tokens": "791a7a25f0571e59" + }, + "truncated": 0, + "non-truncated": 952, + "padded": 952, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_physics|5": { + "hashes": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "c9b689b4034de87c", + "hash_cont_tokens": "9677b0687811cf73" + }, + "truncated": 0, + "non-truncated": 604, + "padded": 604, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hashes": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "ccecbb5539c34c08", + "hash_cont_tokens": "6393201d9136920e" + }, + "truncated": 0, + "non-truncated": 2180, + "padded": 2180, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hashes": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "3f75abf85d2b9fe9", + "hash_cont_tokens": "17caccbb3a38c7bf" + }, + "truncated": 0, + "non-truncated": 864, + "padded": 864, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hashes": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "f52124b61354d42e", + "hash_cont_tokens": "7128e2eeb930d3b3" + }, + "truncated": 0, + "non-truncated": 816, + "padded": 816, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hashes": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": 
"11845057459afd72", + "hash_input_tokens": "b5b75910265dc2ff", + "hash_cont_tokens": "48e22ae63ee54721" + }, + "truncated": 0, + "non-truncated": 948, + "padded": 948, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_aging|5": { + "hashes": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "a26fe13fa58cbbed", + "hash_cont_tokens": "0f40704815d5b3f6" + }, + "truncated": 0, + "non-truncated": 892, + "padded": 892, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_sexuality|5": { + "hashes": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "ad768773a7782c0c", + "hash_cont_tokens": "a9fdf5917bdddc9b" + }, + "truncated": 0, + "non-truncated": 524, + "padded": 524, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-international_law|5": { + "hashes": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "5e16e7eb92789a03", + "hash_cont_tokens": "c63e45a81fbe97b2" + }, + "truncated": 0, + "non-truncated": 484, + "padded": 484, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-jurisprudence|5": { + "hashes": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "6346e2bce86e76fe", + "hash_cont_tokens": "9df89edb95ea3c08" + }, + "truncated": 0, + "non-truncated": 432, + "padded": 432, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hashes": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "76581e704996be9d", + "hash_cont_tokens": "5b4f21454680a984" + }, + "truncated": 0, + "non-truncated": 652, + "padded": 648, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-machine_learning|5": { + "hashes": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "0425f5feb26f8c3f", + "hash_cont_tokens": "0c2fc7f9e9101fbb" + }, + "truncated": 0, + "non-truncated": 448, + "padded": 448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-management|5": { + "hashes": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "fb4ebd06a3a58fd2", + "hash_cont_tokens": "1279a23b3bc7b32c" + }, + "truncated": 0, + "non-truncated": 412, + "padded": 412, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-marketing|5": { + "hashes": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "33ff7687ba4867f3", + "hash_cont_tokens": "be76778b3b861344" + }, + "truncated": 0, + "non-truncated": 936, + "padded": 936, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-medical_genetics|5": { + "hashes": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "e6e5c037eb26a498", + "hash_cont_tokens": "bc75e4dffef3dc0e" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + 
"non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-miscellaneous|5": { + "hashes": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "6db91a99fee03712", + "hash_cont_tokens": "c61a0f86b50f0556" + }, + "truncated": 0, + "non-truncated": 3132, + "padded": 3132, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_disputes|5": { + "hashes": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "11bf0e6ef564edfb", + "hash_cont_tokens": "a208a34c74088f6c" + }, + "truncated": 0, + "non-truncated": 1384, + "padded": 1380, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hashes": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "42af46cc9aa77d99", + "hash_cont_tokens": "996ce7a5b6c4aef1" + }, + "truncated": 0, + "non-truncated": 3580, + "padded": 3580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-nutrition|5": { + "hashes": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "484b72f626c82f6b", + "hash_cont_tokens": "9d4280b06a73f2ad" + }, + "truncated": 0, + "non-truncated": 1224, + "padded": 1224, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-philosophy|5": { + "hashes": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "d3b8f0fb55346a71", + "hash_cont_tokens": "9a708d21688a0b16" + }, + "truncated": 0, + "non-truncated": 1244, + "padded": 1244, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-prehistory|5": { + "hashes": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "f8bbc40534f54a72", + "hash_cont_tokens": "ed0ff6b6c4caf978" + }, + "truncated": 0, + "non-truncated": 1296, + "padded": 1296, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_accounting|5": { + "hashes": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "53f58b4e8af11f6e", + "hash_cont_tokens": "4fd1a023ef90b43a" + }, + "truncated": 0, + "non-truncated": 1128, + "padded": 1127, + "non-padded": 1, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_law|5": { + "hashes": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "a95688e641cf31f1", + "hash_cont_tokens": "d2c1c75d7c0e6ec5" + }, + "truncated": 0, + "non-truncated": 6136, + "padded": 6136, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_medicine|5": { + "hashes": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "fc49c75113daa07a", + "hash_cont_tokens": "ff4c3ef8a56efe40" + }, + "truncated": 0, + "non-truncated": 1088, + "padded": 1088, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_psychology|5": { + "hashes": { + 
"hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "92d9588dfc6ac3f9", + "hash_cont_tokens": "b4566ef91a66db7d" + }, + "truncated": 0, + "non-truncated": 2448, + "padded": 2448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-public_relations|5": { + "hashes": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "6a093aeebb63f500", + "hash_cont_tokens": "b713ae56c89df822" + }, + "truncated": 0, + "non-truncated": 440, + "padded": 440, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-security_studies|5": { + "hashes": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "2fef5cbd88ee376f", + "hash_cont_tokens": "89baef8c4b642ed0" + }, + "truncated": 0, + "non-truncated": 980, + "padded": 980, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-sociology|5": { + "hashes": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "35a984bdddcb71dc", + "hash_cont_tokens": "b92ed9d8dde61395" + }, + "truncated": 0, + "non-truncated": 804, + "padded": 796, + "non-padded": 8, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hashes": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "64e26afac44fd84d", + "hash_cont_tokens": "bc75e4dffef3dc0e" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-virology|5": { + "hashes": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "3bce3760b179a55c", + "hash_cont_tokens": "1c1bf88d7c979ef5" + }, + "truncated": 0, + "non-truncated": 664, + "padded": 664, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-world_religions|5": { + "hashes": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "6554c1be40513fa9", + "hash_cont_tokens": "9fbfaba067301be2" + }, + "truncated": 0, + "non-truncated": 684, + "padded": 684, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|truthfulqa:mc|0": { + "hashes": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "c1ed17b2cce8daea", + "hash_cont_tokens": "ad4c4cfcbb927635" + }, + "truncated": 0, + "non-truncated": 9996, + "padded": 9996, + "non-padded": 0, + "effective_few_shots": 0.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "d84d18e9a963753d", + "hash_full_prompts": "12b540783521a8e6", + "hash_input_tokens": "41fb26e769733d20", + "hash_cont_tokens": "d6b023af5cbcb9cf" + }, + "total_evaluation_time_secondes": "7424.131828784943", + "truncated": 0, + "non-truncated": 111019, + "padded": 110855, + "non-padded": 164, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/JosephusCheung/Pwen-VL-Chat-20_30/results_2023-11-04T15-47-38.506264.json b/eval-results/JosephusCheung/Pwen-VL-Chat-20_30/results_2023-11-04T15-47-38.506264.json new file mode 100644 
index 0000000000000000000000000000000000000000..116ba6f68efaaa1b0d6a3d1185fad1d9aff5989f --- /dev/null +++ b/eval-results/JosephusCheung/Pwen-VL-Chat-20_30/results_2023-11-04T15-47-38.506264.json @@ -0,0 +1,107 @@ +{ + "config_general": { + "lighteval_sha": "167773f1d5d1647c60dadc31c9e731ab7dbcbbad", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "", + "model_name": "JosephusCheung/Pwen-VL-Chat-20_30", + "model_sha": "64a9b89fb18140fc1af1f11471dc9fe34ebc7446", + "model_dtype": "torch.bfloat16", + "model_size": "14.51 GB" + }, + "results": { + "harness|drop|3": { + "em": 0.3234060402684564, + "em_stderr": 0.004790466119380845, + "f1": 0.3795564177852361, + "f1_stderr": 0.004705234681743664 + }, + "harness|gsm8k|5": { + "acc": 0.1910538286580743, + "acc_stderr": 0.010828791191755175 + }, + "harness|winogrande|5": { + "acc": 0.6835043409629045, + "acc_stderr": 0.013071868328051477 + }, + "all": { + "em": 0.3234060402684564, + "em_stderr": 0.004790466119380845, + "f1": 0.3795564177852361, + "f1_stderr": 0.004705234681743664, + "acc": 0.4372790848104894, + "acc_stderr": 0.011950329759903327 + } + }, + "versions": { + "all": 0, + "harness|drop|3": 1, + "harness|gsm8k|5": 0, + "harness|winogrande|5": 0 + }, + "config_tasks": { + "harness|drop": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|drop|3": { + "hashes": { + "hash_examples": "1d27416e8324e9a3", + "hash_full_prompts": "a5513ff9a741b385", + "hash_input_tokens": "e4d9d658ccb42fc3", + "hash_cont_tokens": "3acffef0fcdaa041" + }, + "truncated": 0, + "non_truncated": 9536, + "padded": 0, + "non_padded": 9536, + "effective_few_shots": 3.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "2282d6efefcc7579", + "hash_cont_tokens": "dcabb1d9463db6f1" + }, + "truncated": 0, + "non_truncated": 1319, + "padded": 0, + "non_padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "288ed7294cb59f7d", + "hash_cont_tokens": "f4a307afe0c47a4a" + }, + "truncated": 0, + "non_truncated": 1267, + "padded": 2429, + "non_padded": 105, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "9b4d8993161e637d", + "hash_full_prompts": "08215e527b7e60a5", + "hash_input_tokens": "329083a90a12723b", + "hash_cont_tokens": "1578bed2931bae1e" + }, + "truncated": 0, + "non_truncated": 12122, + "padded": 2429, + "non_padded": 10960, + "num_truncated_few_shots": 0, + "total_evaluation_time_secondes": 0 + } +} \ No newline at end of file diff --git a/eval-results/JosephusCheung/Pwen-VL-Chat-20_30/results_2023-11-06T13-45-28.201357.json b/eval-results/JosephusCheung/Pwen-VL-Chat-20_30/results_2023-11-06T13-45-28.201357.json new file mode 100644 index 0000000000000000000000000000000000000000..116ba6f68efaaa1b0d6a3d1185fad1d9aff5989f --- /dev/null +++ b/eval-results/JosephusCheung/Pwen-VL-Chat-20_30/results_2023-11-06T13-45-28.201357.json @@ -0,0 +1,107 @@ +{ + "config_general": { + "lighteval_sha": "167773f1d5d1647c60dadc31c9e731ab7dbcbbad", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": 
"", + "model_name": "JosephusCheung/Pwen-VL-Chat-20_30", + "model_sha": "64a9b89fb18140fc1af1f11471dc9fe34ebc7446", + "model_dtype": "torch.bfloat16", + "model_size": "14.51 GB" + }, + "results": { + "harness|drop|3": { + "em": 0.3234060402684564, + "em_stderr": 0.004790466119380845, + "f1": 0.3795564177852361, + "f1_stderr": 0.004705234681743664 + }, + "harness|gsm8k|5": { + "acc": 0.1910538286580743, + "acc_stderr": 0.010828791191755175 + }, + "harness|winogrande|5": { + "acc": 0.6835043409629045, + "acc_stderr": 0.013071868328051477 + }, + "all": { + "em": 0.3234060402684564, + "em_stderr": 0.004790466119380845, + "f1": 0.3795564177852361, + "f1_stderr": 0.004705234681743664, + "acc": 0.4372790848104894, + "acc_stderr": 0.011950329759903327 + } + }, + "versions": { + "all": 0, + "harness|drop|3": 1, + "harness|gsm8k|5": 0, + "harness|winogrande|5": 0 + }, + "config_tasks": { + "harness|drop": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|drop|3": { + "hashes": { + "hash_examples": "1d27416e8324e9a3", + "hash_full_prompts": "a5513ff9a741b385", + "hash_input_tokens": "e4d9d658ccb42fc3", + "hash_cont_tokens": "3acffef0fcdaa041" + }, + "truncated": 0, + "non_truncated": 9536, + "padded": 0, + "non_padded": 9536, + "effective_few_shots": 3.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "2282d6efefcc7579", + "hash_cont_tokens": "dcabb1d9463db6f1" + }, + "truncated": 0, + "non_truncated": 1319, + "padded": 0, + "non_padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "288ed7294cb59f7d", + "hash_cont_tokens": "f4a307afe0c47a4a" + }, + "truncated": 0, + "non_truncated": 1267, + "padded": 2429, + "non_padded": 105, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "9b4d8993161e637d", + "hash_full_prompts": "08215e527b7e60a5", + "hash_input_tokens": "329083a90a12723b", + "hash_cont_tokens": "1578bed2931bae1e" + }, + "truncated": 0, + "non_truncated": 12122, + "padded": 2429, + "non_padded": 10960, + "num_truncated_few_shots": 0, + "total_evaluation_time_secondes": 0 + } +} \ No newline at end of file diff --git a/eval-results/JosephusCheung/Qwen-LLaMAfied-7B-Chat/results_2023-09-12T19-56-23.146408.json b/eval-results/JosephusCheung/Qwen-LLaMAfied-7B-Chat/results_2023-09-12T19-56-23.146408.json new file mode 100644 index 0000000000000000000000000000000000000000..bc997c87e84fc9ac963a0e664487f8707dddefc9 --- /dev/null +++ b/eval-results/JosephusCheung/Qwen-LLaMAfied-7B-Chat/results_2023-09-12T19-56-23.146408.json @@ -0,0 +1,1367 @@ +{ + "config_general": { + "model_name": "JosephusCheung/Qwen-LLaMAfied-7B-Chat", + "model_sha": "4d70cf0047a7a5cd2c864bc2606e81f0830e4405", + "model_size": "14.48 GB", + "model_dtype": "torch.float16", + "lighteval_sha": "ff467795ccc45b291b69333c263d5f16abd1fcd9", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|arc:challenge|25": { + "acc": 0.46501706484641636, + "acc_stderr": 0.014575583922019667, + "acc_norm": 0.5093856655290102, + "acc_norm_stderr": 0.014608816322065003 + }, + "harness|hellaswag|10": 
{ + "acc": 0.6408086038637721, + "acc_stderr": 0.0047878291682556555, + "acc_norm": 0.8346942840071699, + "acc_norm_stderr": 0.0037069708564109647 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.32, + "acc_stderr": 0.046882617226215034, + "acc_norm": 0.32, + "acc_norm_stderr": 0.046882617226215034 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.4740740740740741, + "acc_stderr": 0.04313531696750574, + "acc_norm": 0.4740740740740741, + "acc_norm_stderr": 0.04313531696750574 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.5986842105263158, + "acc_stderr": 0.039889037033362836, + "acc_norm": 0.5986842105263158, + "acc_norm_stderr": 0.039889037033362836 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.61, + "acc_stderr": 0.04902071300001975, + "acc_norm": 0.61, + "acc_norm_stderr": 0.04902071300001975 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.6150943396226415, + "acc_stderr": 0.02994649856769995, + "acc_norm": 0.6150943396226415, + "acc_norm_stderr": 0.02994649856769995 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.5763888888888888, + "acc_stderr": 0.04132125019723369, + "acc_norm": 0.5763888888888888, + "acc_norm_stderr": 0.04132125019723369 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.35, + "acc_stderr": 0.047937248544110196, + "acc_norm": 0.35, + "acc_norm_stderr": 0.047937248544110196 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.5, + "acc_stderr": 0.050251890762960605, + "acc_norm": 0.5, + "acc_norm_stderr": 0.050251890762960605 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.31, + "acc_stderr": 0.04648231987117316, + "acc_norm": 0.31, + "acc_norm_stderr": 0.04648231987117316 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.5260115606936416, + "acc_stderr": 0.03807301726504511, + "acc_norm": 0.5260115606936416, + "acc_norm_stderr": 0.03807301726504511 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.2647058823529412, + "acc_stderr": 0.04389869956808777, + "acc_norm": 0.2647058823529412, + "acc_norm_stderr": 0.04389869956808777 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.7, + "acc_stderr": 0.046056618647183814, + "acc_norm": 0.7, + "acc_norm_stderr": 0.046056618647183814 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.4425531914893617, + "acc_stderr": 0.03246956919789958, + "acc_norm": 0.4425531914893617, + "acc_norm_stderr": 0.03246956919789958 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.38596491228070173, + "acc_stderr": 0.04579639422070434, + "acc_norm": 0.38596491228070173, + "acc_norm_stderr": 0.04579639422070434 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.5172413793103449, + "acc_stderr": 0.04164188720169375, + "acc_norm": 0.5172413793103449, + "acc_norm_stderr": 0.04164188720169375 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.35185185185185186, + "acc_stderr": 0.024594975128920945, + "acc_norm": 0.35185185185185186, + "acc_norm_stderr": 0.024594975128920945 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.373015873015873, + "acc_stderr": 0.04325506042017086, + "acc_norm": 0.373015873015873, + "acc_norm_stderr": 0.04325506042017086 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.27, + "acc_stderr": 0.0446196043338474, + "acc_norm": 0.27, + "acc_norm_stderr": 0.0446196043338474 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.6419354838709678, + "acc_stderr": 
0.027273890594300645, + "acc_norm": 0.6419354838709678, + "acc_norm_stderr": 0.027273890594300645 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.45320197044334976, + "acc_stderr": 0.03502544650845872, + "acc_norm": 0.45320197044334976, + "acc_norm_stderr": 0.03502544650845872 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.58, + "acc_stderr": 0.049604496374885836, + "acc_norm": 0.58, + "acc_norm_stderr": 0.049604496374885836 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.21818181818181817, + "acc_stderr": 0.03225078108306289, + "acc_norm": 0.21818181818181817, + "acc_norm_stderr": 0.03225078108306289 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.696969696969697, + "acc_stderr": 0.03274287914026868, + "acc_norm": 0.696969696969697, + "acc_norm_stderr": 0.03274287914026868 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.7875647668393783, + "acc_stderr": 0.029519282616817244, + "acc_norm": 0.7875647668393783, + "acc_norm_stderr": 0.029519282616817244 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.5128205128205128, + "acc_stderr": 0.02534267129380725, + "acc_norm": 0.5128205128205128, + "acc_norm_stderr": 0.02534267129380725 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.25925925925925924, + "acc_stderr": 0.02671924078371215, + "acc_norm": 0.25925925925925924, + "acc_norm_stderr": 0.02671924078371215 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.5210084033613446, + "acc_stderr": 0.03244980849990029, + "acc_norm": 0.5210084033613446, + "acc_norm_stderr": 0.03244980849990029 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.2781456953642384, + "acc_stderr": 0.03658603262763743, + "acc_norm": 0.2781456953642384, + "acc_norm_stderr": 0.03658603262763743 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.7339449541284404, + "acc_stderr": 0.018946022322225604, + "acc_norm": 0.7339449541284404, + "acc_norm_stderr": 0.018946022322225604 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.38425925925925924, + "acc_stderr": 0.03317354514310742, + "acc_norm": 0.38425925925925924, + "acc_norm_stderr": 0.03317354514310742 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.5931372549019608, + "acc_stderr": 0.03447891136353382, + "acc_norm": 0.5931372549019608, + "acc_norm_stderr": 0.03447891136353382 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.7510548523206751, + "acc_stderr": 0.028146970599422644, + "acc_norm": 0.7510548523206751, + "acc_norm_stderr": 0.028146970599422644 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.600896860986547, + "acc_stderr": 0.032867453125679603, + "acc_norm": 0.600896860986547, + "acc_norm_stderr": 0.032867453125679603 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.6106870229007634, + "acc_stderr": 0.04276486542814591, + "acc_norm": 0.6106870229007634, + "acc_norm_stderr": 0.04276486542814591 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.743801652892562, + "acc_stderr": 0.03984979653302871, + "acc_norm": 0.743801652892562, + "acc_norm_stderr": 0.03984979653302871 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.6851851851851852, + "acc_stderr": 0.04489931073591312, + "acc_norm": 0.6851851851851852, + "acc_norm_stderr": 0.04489931073591312 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 
0.6196319018404908, + "acc_stderr": 0.038142698932618374, + "acc_norm": 0.6196319018404908, + "acc_norm_stderr": 0.038142698932618374 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.35714285714285715, + "acc_stderr": 0.04547960999764377, + "acc_norm": 0.35714285714285715, + "acc_norm_stderr": 0.04547960999764377 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.6407766990291263, + "acc_stderr": 0.04750458399041697, + "acc_norm": 0.6407766990291263, + "acc_norm_stderr": 0.04750458399041697 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.7863247863247863, + "acc_stderr": 0.026853450377009154, + "acc_norm": 0.7863247863247863, + "acc_norm_stderr": 0.026853450377009154 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.57, + "acc_stderr": 0.049756985195624284, + "acc_norm": 0.57, + "acc_norm_stderr": 0.049756985195624284 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.7330779054916986, + "acc_stderr": 0.01581845089477757, + "acc_norm": 0.7330779054916986, + "acc_norm_stderr": 0.01581845089477757 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.615606936416185, + "acc_stderr": 0.02618966696627204, + "acc_norm": 0.615606936416185, + "acc_norm_stderr": 0.02618966696627204 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.26033519553072626, + "acc_stderr": 0.014676252009319476, + "acc_norm": 0.26033519553072626, + "acc_norm_stderr": 0.014676252009319476 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.6241830065359477, + "acc_stderr": 0.027732834353363933, + "acc_norm": 0.6241830065359477, + "acc_norm_stderr": 0.027732834353363933 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.6463022508038585, + "acc_stderr": 0.027155208103200865, + "acc_norm": 0.6463022508038585, + "acc_norm_stderr": 0.027155208103200865 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.6419753086419753, + "acc_stderr": 0.026675611926037086, + "acc_norm": 0.6419753086419753, + "acc_norm_stderr": 0.026675611926037086 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.3723404255319149, + "acc_stderr": 0.028838921471251455, + "acc_norm": 0.3723404255319149, + "acc_norm_stderr": 0.028838921471251455 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.4132985658409387, + "acc_stderr": 0.012576779494860083, + "acc_norm": 0.4132985658409387, + "acc_norm_stderr": 0.012576779494860083 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.5220588235294118, + "acc_stderr": 0.03034326422421352, + "acc_norm": 0.5220588235294118, + "acc_norm_stderr": 0.03034326422421352 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.5310457516339869, + "acc_stderr": 0.020188804456361887, + "acc_norm": 0.5310457516339869, + "acc_norm_stderr": 0.020188804456361887 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.6, + "acc_stderr": 0.0469237132203465, + "acc_norm": 0.6, + "acc_norm_stderr": 0.0469237132203465 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.6448979591836734, + "acc_stderr": 0.030635655150387638, + "acc_norm": 0.6448979591836734, + "acc_norm_stderr": 0.030635655150387638 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.7661691542288557, + "acc_stderr": 0.029929415408348384, + "acc_norm": 0.7661691542288557, + "acc_norm_stderr": 0.029929415408348384 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.78, + "acc_stderr": 0.04163331998932262, + "acc_norm": 0.78, + "acc_norm_stderr": 0.04163331998932262 + }, + 
"harness|hendrycksTest-virology|5": { + "acc": 0.4578313253012048, + "acc_stderr": 0.0387862677100236, + "acc_norm": 0.4578313253012048, + "acc_norm_stderr": 0.0387862677100236 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.6842105263157895, + "acc_stderr": 0.03565079670708312, + "acc_norm": 0.6842105263157895, + "acc_norm_stderr": 0.03565079670708312 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.3084455324357405, + "mc1_stderr": 0.01616803938315687, + "mc2": 0.4608515013805907, + "mc2_stderr": 0.015086475930316952 + }, + "all": { + "acc": 0.5357741671495968, + "acc_stderr": 0.034555420789771196, + "acc_norm": 0.5398123752991899, + "acc_norm_stderr": 0.03453766441838476, + "mc1": 0.3084455324357405, + "mc1_stderr": 0.01616803938315687, + "mc2": 0.4608515013805907, + "mc2_stderr": 0.015086475930316952 + } + }, + "versions": { + "harness|arc:challenge|25": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + 
"harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "all": 0 + }, + "config_tasks": { + "harness|arc:challenge": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + 
"harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task" + }, + "summary_tasks": { + "harness|arc:challenge|25": { + "hashes": { + "hash_examples": "17b0cae357c0259e", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "c991f8a5814f8d2f", + "hash_cont_tokens": "bc6e686b575268af" + }, + "truncated": 0, + "non-truncated": 4687, + "padded": 4687, + "non-padded": 0, + "effective_few_shots": 25.0, + "num_truncated_few_shots": 0 + }, + "harness|hellaswag|10": { + "hashes": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "9d221d28a199a09c", + "hash_cont_tokens": "e7e52367a92daa27" + }, + "truncated": 0, + "non-truncated": 40168, + "padded": 40052, + "non-padded": 116, + "effective_few_shots": 10.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hashes": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "5afce491c120616a", + "hash_cont_tokens": "bc75e4dffef3dc0e" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-anatomy|5": { + "hashes": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "f59f8967e61fde18", + "hash_cont_tokens": "f9dae0f98ef7c0f2" + }, + "truncated": 0, + "non-truncated": 540, + "padded": 540, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-astronomy|5": { + "hashes": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "2efbd578c3185755", + "hash_cont_tokens": "dff84e206d2f1e0d" + }, + "truncated": 0, + "non-truncated": 608, + "padded": 608, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-business_ethics|5": { + "hashes": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "e328ff0ca8fc7890", + "hash_cont_tokens": "bc75e4dffef3dc0e" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hashes": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "145fa357c13fe43c", + "hash_cont_tokens": "b81dd170f83789d1" + }, + "truncated": 0, + "non-truncated": 1060, + "padded": 1060, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_biology|5": { + "hashes": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "888579887c9a665a", + "hash_cont_tokens": "85c3400292af3bb8" + }, + "truncated": 0, + "non-truncated": 576, + "padded": 569, + "non-padded": 7, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + 
"harness|hendrycksTest-college_chemistry|5": { + "hashes": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "e2ca7bc279c63b09", + "hash_cont_tokens": "bc75e4dffef3dc0e" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_computer_science|5": { + "hashes": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "1671195b9f861e25", + "hash_cont_tokens": "bc75e4dffef3dc0e" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_mathematics|5": { + "hashes": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "47ed680b9caddd90", + "hash_cont_tokens": "bc75e4dffef3dc0e" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_medicine|5": { + "hashes": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "f71f719c1032180b", + "hash_cont_tokens": "e5cb48f872b79ee7" + }, + "truncated": 0, + "non-truncated": 692, + "padded": 692, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_physics|5": { + "hashes": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "5bde7875f9f1d5dd", + "hash_cont_tokens": "40862171591ad909" + }, + "truncated": 0, + "non-truncated": 408, + "padded": 408, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-computer_security|5": { + "hashes": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "6de5a5feab854eed", + "hash_cont_tokens": "bc75e4dffef3dc0e" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hashes": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "9a95e6bc66294b33", + "hash_cont_tokens": "36bb2a47e8ff1bd8" + }, + "truncated": 0, + "non-truncated": 940, + "padded": 940, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-econometrics|5": { + "hashes": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "b581c488a50d149d", + "hash_cont_tokens": "433685e9aa542c2d" + }, + "truncated": 0, + "non-truncated": 456, + "padded": 456, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hashes": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "0afd8e37a73e499b", + "hash_cont_tokens": "f086b291b3aa0628" + }, + "truncated": 0, + "non-truncated": 580, + "padded": 560, + "non-padded": 20, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hashes": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": 
"5ec274c6c82aca23", + "hash_input_tokens": "6300cbea203e27e1", + "hash_cont_tokens": "4f402da407619e4d" + }, + "truncated": 0, + "non-truncated": 1512, + "padded": 1512, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-formal_logic|5": { + "hashes": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "dff3e10f0162548b", + "hash_cont_tokens": "80d8e3e54d900608" + }, + "truncated": 0, + "non-truncated": 504, + "padded": 504, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-global_facts|5": { + "hashes": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "133115320d06c025", + "hash_cont_tokens": "bc75e4dffef3dc0e" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_biology|5": { + "hashes": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "b9c0577c9c2daf4b", + "hash_cont_tokens": "e07819899bd63630" + }, + "truncated": 0, + "non-truncated": 1240, + "padded": 1240, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hashes": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "154d573ba30378ad", + "hash_cont_tokens": "eb6259a94d61e372" + }, + "truncated": 0, + "non-truncated": 812, + "padded": 812, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hashes": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "91754fb26290a162", + "hash_cont_tokens": "bc75e4dffef3dc0e" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hashes": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "2e32e47bd2233827", + "hash_cont_tokens": "c3336566c025bc59" + }, + "truncated": 0, + "non-truncated": 660, + "padded": 656, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_geography|5": { + "hashes": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "23f9a0b07be2ba2e", + "hash_cont_tokens": "999a32d098465441" + }, + "truncated": 0, + "non-truncated": 792, + "padded": 792, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hashes": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "de99699b0f5b162d", + "hash_cont_tokens": "361410848e01f8ed" + }, + "truncated": 0, + "non-truncated": 772, + "padded": 772, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hashes": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "c96ba9fc2d1deb87", + "hash_cont_tokens": 
"18f9ae57b2444806" + }, + "truncated": 0, + "non-truncated": 1560, + "padded": 1560, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hashes": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "00509312373e95f1", + "hash_cont_tokens": "a13496e646060699" + }, + "truncated": 0, + "non-truncated": 1080, + "padded": 1080, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hashes": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "56e5bf80535561ec", + "hash_cont_tokens": "791a7a25f0571e59" + }, + "truncated": 0, + "non-truncated": 952, + "padded": 952, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_physics|5": { + "hashes": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "c9b689b4034de87c", + "hash_cont_tokens": "9677b0687811cf73" + }, + "truncated": 0, + "non-truncated": 604, + "padded": 604, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hashes": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "ccecbb5539c34c08", + "hash_cont_tokens": "6393201d9136920e" + }, + "truncated": 0, + "non-truncated": 2180, + "padded": 2180, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hashes": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "3f75abf85d2b9fe9", + "hash_cont_tokens": "17caccbb3a38c7bf" + }, + "truncated": 0, + "non-truncated": 864, + "padded": 864, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hashes": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "f52124b61354d42e", + "hash_cont_tokens": "7128e2eeb930d3b3" + }, + "truncated": 0, + "non-truncated": 816, + "padded": 816, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hashes": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "b5b75910265dc2ff", + "hash_cont_tokens": "48e22ae63ee54721" + }, + "truncated": 0, + "non-truncated": 948, + "padded": 948, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_aging|5": { + "hashes": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "a26fe13fa58cbbed", + "hash_cont_tokens": "0f40704815d5b3f6" + }, + "truncated": 0, + "non-truncated": 892, + "padded": 892, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_sexuality|5": { + "hashes": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "ad768773a7782c0c", + "hash_cont_tokens": "a9fdf5917bdddc9b" + }, + "truncated": 0, + "non-truncated": 524, + "padded": 524, + "non-padded": 0, + 
"effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-international_law|5": { + "hashes": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "5e16e7eb92789a03", + "hash_cont_tokens": "c63e45a81fbe97b2" + }, + "truncated": 0, + "non-truncated": 484, + "padded": 484, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-jurisprudence|5": { + "hashes": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "6346e2bce86e76fe", + "hash_cont_tokens": "9df89edb95ea3c08" + }, + "truncated": 0, + "non-truncated": 432, + "padded": 432, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hashes": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "76581e704996be9d", + "hash_cont_tokens": "5b4f21454680a984" + }, + "truncated": 0, + "non-truncated": 652, + "padded": 648, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-machine_learning|5": { + "hashes": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "0425f5feb26f8c3f", + "hash_cont_tokens": "0c2fc7f9e9101fbb" + }, + "truncated": 0, + "non-truncated": 448, + "padded": 448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-management|5": { + "hashes": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "fb4ebd06a3a58fd2", + "hash_cont_tokens": "1279a23b3bc7b32c" + }, + "truncated": 0, + "non-truncated": 412, + "padded": 412, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-marketing|5": { + "hashes": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "33ff7687ba4867f3", + "hash_cont_tokens": "be76778b3b861344" + }, + "truncated": 0, + "non-truncated": 936, + "padded": 936, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-medical_genetics|5": { + "hashes": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "e6e5c037eb26a498", + "hash_cont_tokens": "bc75e4dffef3dc0e" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-miscellaneous|5": { + "hashes": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "6db91a99fee03712", + "hash_cont_tokens": "c61a0f86b50f0556" + }, + "truncated": 0, + "non-truncated": 3132, + "padded": 3132, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_disputes|5": { + "hashes": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "11bf0e6ef564edfb", + "hash_cont_tokens": "a208a34c74088f6c" + }, + "truncated": 0, + "non-truncated": 1384, + "padded": 1380, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hashes": { + "hash_examples": "9873e077e83e0546", + 
"hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "42af46cc9aa77d99", + "hash_cont_tokens": "996ce7a5b6c4aef1" + }, + "truncated": 0, + "non-truncated": 3580, + "padded": 3580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-nutrition|5": { + "hashes": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "484b72f626c82f6b", + "hash_cont_tokens": "9d4280b06a73f2ad" + }, + "truncated": 0, + "non-truncated": 1224, + "padded": 1224, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-philosophy|5": { + "hashes": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "d3b8f0fb55346a71", + "hash_cont_tokens": "9a708d21688a0b16" + }, + "truncated": 0, + "non-truncated": 1244, + "padded": 1244, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-prehistory|5": { + "hashes": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "f8bbc40534f54a72", + "hash_cont_tokens": "ed0ff6b6c4caf978" + }, + "truncated": 0, + "non-truncated": 1296, + "padded": 1296, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_accounting|5": { + "hashes": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "53f58b4e8af11f6e", + "hash_cont_tokens": "4fd1a023ef90b43a" + }, + "truncated": 0, + "non-truncated": 1128, + "padded": 1127, + "non-padded": 1, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_law|5": { + "hashes": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "a95688e641cf31f1", + "hash_cont_tokens": "d2c1c75d7c0e6ec5" + }, + "truncated": 0, + "non-truncated": 6136, + "padded": 6136, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_medicine|5": { + "hashes": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "fc49c75113daa07a", + "hash_cont_tokens": "ff4c3ef8a56efe40" + }, + "truncated": 0, + "non-truncated": 1088, + "padded": 1088, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_psychology|5": { + "hashes": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "92d9588dfc6ac3f9", + "hash_cont_tokens": "b4566ef91a66db7d" + }, + "truncated": 0, + "non-truncated": 2448, + "padded": 2448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-public_relations|5": { + "hashes": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "6a093aeebb63f500", + "hash_cont_tokens": "b713ae56c89df822" + }, + "truncated": 0, + "non-truncated": 440, + "padded": 440, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-security_studies|5": { + "hashes": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "2fef5cbd88ee376f", + "hash_cont_tokens": "89baef8c4b642ed0" + }, + "truncated": 
0, + "non-truncated": 980, + "padded": 980, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-sociology|5": { + "hashes": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "35a984bdddcb71dc", + "hash_cont_tokens": "b92ed9d8dde61395" + }, + "truncated": 0, + "non-truncated": 804, + "padded": 796, + "non-padded": 8, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hashes": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "64e26afac44fd84d", + "hash_cont_tokens": "bc75e4dffef3dc0e" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-virology|5": { + "hashes": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "3bce3760b179a55c", + "hash_cont_tokens": "1c1bf88d7c979ef5" + }, + "truncated": 0, + "non-truncated": 664, + "padded": 664, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-world_religions|5": { + "hashes": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "6554c1be40513fa9", + "hash_cont_tokens": "9fbfaba067301be2" + }, + "truncated": 0, + "non-truncated": 684, + "padded": 684, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|truthfulqa:mc|0": { + "hashes": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "c1ed17b2cce8daea", + "hash_cont_tokens": "ad4c4cfcbb927635" + }, + "truncated": 0, + "non-truncated": 9996, + "padded": 9996, + "non-padded": 0, + "effective_few_shots": 0.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "d84d18e9a963753d", + "hash_full_prompts": "12b540783521a8e6", + "hash_input_tokens": "41fb26e769733d20", + "hash_cont_tokens": "d6b023af5cbcb9cf" + }, + "total_evaluation_time_secondes": "3782.70871925354", + "truncated": 0, + "non-truncated": 111019, + "padded": 110855, + "non-padded": 164, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/JosephusCheung/Qwen-LLaMAfied-7B-Chat/results_2023-10-29T05-54-59.935248.json b/eval-results/JosephusCheung/Qwen-LLaMAfied-7B-Chat/results_2023-10-29T05-54-59.935248.json new file mode 100644 index 0000000000000000000000000000000000000000..f6a26e41b4d3f810ba57773de93b1f304fbd190d --- /dev/null +++ b/eval-results/JosephusCheung/Qwen-LLaMAfied-7B-Chat/results_2023-10-29T05-54-59.935248.json @@ -0,0 +1,107 @@ +{ + "config_general": { + "model_name": "JosephusCheung/Qwen-LLaMAfied-7B-Chat", + "model_sha": "3809bc39e7cc555da86840c11a47fe19bd82e40b", + "model_size": "14.48 GB", + "model_dtype": "torch.float16", + "lighteval_sha": "0f318ecf002208468154899217b3ba7c6ae09374", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|drop|3": { + "em": 0.29425335570469796, + "em_stderr": 0.004666860017033486, + "f1": 0.3722158137583904, + "f1_stderr": 0.004557451176367578 + }, + "harness|gsm8k|5": { + "acc": 0.047763457164518575, + "acc_stderr": 0.00587438753622931 + }, + "harness|winogrande|5": { + "acc": 
0.7316495659037096, + "acc_stderr": 0.012453340359561195 + }, + "all": { + "em": 0.29425335570469796, + "em_stderr": 0.004666860017033486, + "f1": 0.3722158137583904, + "f1_stderr": 0.004557451176367578, + "acc": 0.38970651153411406, + "acc_stderr": 0.009163863947895253 + } + }, + "versions": { + "harness|drop|3": 1, + "harness|gsm8k|5": 0, + "harness|winogrande|5": 0, + "all": 0 + }, + "config_tasks": { + "harness|drop": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|drop|3": { + "hashes": { + "hash_examples": "1d27416e8324e9a3", + "hash_full_prompts": "a5513ff9a741b385", + "hash_input_tokens": "e4d9d658ccb42fc3", + "hash_cont_tokens": "b54f779b97b2bf91" + }, + "truncated": 0, + "non-truncated": 9536, + "padded": 0, + "non-padded": 9536, + "effective_few_shots": 3.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "2282d6efefcc7579", + "hash_cont_tokens": "8dadfe42772ec7f1" + }, + "truncated": 0, + "non-truncated": 1319, + "padded": 0, + "non-padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "288ed7294cb59f7d", + "hash_cont_tokens": "f4a307afe0c47a4a" + }, + "truncated": 0, + "non-truncated": 2534, + "padded": 2429, + "non-padded": 105, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "9b4d8993161e637d", + "hash_full_prompts": "08215e527b7e60a5", + "hash_input_tokens": "329083a90a12723b", + "hash_cont_tokens": "2cd1160e04461fa1" + }, + "total_evaluation_time_secondes": "7637.678814411163", + "truncated": 0, + "non-truncated": 13389, + "padded": 2429, + "non-padded": 10960, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/JosephusCheung/Qwen-VL-LLaMAfied-7B-Chat/results_2023-10-10T07-39-47.100914.json b/eval-results/JosephusCheung/Qwen-VL-LLaMAfied-7B-Chat/results_2023-10-10T07-39-47.100914.json new file mode 100644 index 0000000000000000000000000000000000000000..21036acbef0c92cbb793ca5c974e7e56a573e8f8 --- /dev/null +++ b/eval-results/JosephusCheung/Qwen-VL-LLaMAfied-7B-Chat/results_2023-10-10T07-39-47.100914.json @@ -0,0 +1,1367 @@ +{ + "config_general": { + "model_name": "JosephusCheung/Qwen-VL-LLaMAfied-7B-Chat", + "model_sha": "ccbd599ac46bcfbf7020be393afeecef404bce2b", + "model_size": "14.48 GB", + "model_dtype": "torch.bfloat16", + "lighteval_sha": "0f318ecf002208468154899217b3ba7c6ae09374", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|arc:challenge|25": { + "acc": 0.45733788395904434, + "acc_stderr": 0.014558106543924068, + "acc_norm": 0.4735494880546075, + "acc_norm_stderr": 0.014590931358120172 + }, + "harness|hellaswag|10": { + "acc": 0.5238996215893248, + "acc_stderr": 0.004984077906216098, + "acc_norm": 0.6996614220274846, + "acc_norm_stderr": 0.004574683373821049 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.27, + "acc_stderr": 0.0446196043338474, + "acc_norm": 0.27, + "acc_norm_stderr": 0.0446196043338474 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.4074074074074074, + "acc_stderr": 0.04244633238353228, + "acc_norm": 
0.4074074074074074, + "acc_norm_stderr": 0.04244633238353228 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.42105263157894735, + "acc_stderr": 0.04017901275981749, + "acc_norm": 0.42105263157894735, + "acc_norm_stderr": 0.04017901275981749 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.44, + "acc_stderr": 0.04988876515698589, + "acc_norm": 0.44, + "acc_norm_stderr": 0.04988876515698589 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.4679245283018868, + "acc_stderr": 0.03070948699255655, + "acc_norm": 0.4679245283018868, + "acc_norm_stderr": 0.03070948699255655 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.3472222222222222, + "acc_stderr": 0.039812405437178615, + "acc_norm": 0.3472222222222222, + "acc_norm_stderr": 0.039812405437178615 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.26, + "acc_stderr": 0.04408440022768078, + "acc_norm": 0.26, + "acc_norm_stderr": 0.04408440022768078 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.42, + "acc_stderr": 0.049604496374885836, + "acc_norm": 0.42, + "acc_norm_stderr": 0.049604496374885836 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.28, + "acc_stderr": 0.045126085985421255, + "acc_norm": 0.28, + "acc_norm_stderr": 0.045126085985421255 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.3872832369942196, + "acc_stderr": 0.037143259063020656, + "acc_norm": 0.3872832369942196, + "acc_norm_stderr": 0.037143259063020656 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.23529411764705882, + "acc_stderr": 0.04220773659171453, + "acc_norm": 0.23529411764705882, + "acc_norm_stderr": 0.04220773659171453 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.49, + "acc_stderr": 0.05024183937956912, + "acc_norm": 0.49, + "acc_norm_stderr": 0.05024183937956912 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.3829787234042553, + "acc_stderr": 0.03177821250236922, + "acc_norm": 0.3829787234042553, + "acc_norm_stderr": 0.03177821250236922 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.2719298245614035, + "acc_stderr": 0.04185774424022056, + "acc_norm": 0.2719298245614035, + "acc_norm_stderr": 0.04185774424022056 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.4482758620689655, + "acc_stderr": 0.04144311810878151, + "acc_norm": 0.4482758620689655, + "acc_norm_stderr": 0.04144311810878151 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.24867724867724866, + "acc_stderr": 0.022261817692400175, + "acc_norm": 0.24867724867724866, + "acc_norm_stderr": 0.022261817692400175 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.29365079365079366, + "acc_stderr": 0.040735243221471255, + "acc_norm": 0.29365079365079366, + "acc_norm_stderr": 0.040735243221471255 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.39, + "acc_stderr": 0.04902071300001974, + "acc_norm": 0.39, + "acc_norm_stderr": 0.04902071300001974 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.5387096774193548, + "acc_stderr": 0.028358634859836935, + "acc_norm": 0.5387096774193548, + "acc_norm_stderr": 0.028358634859836935 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.33497536945812806, + "acc_stderr": 0.033208527423483104, + "acc_norm": 0.33497536945812806, + "acc_norm_stderr": 0.033208527423483104 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.47, + "acc_stderr": 0.05016135580465919, + 
"acc_norm": 0.47, + "acc_norm_stderr": 0.05016135580465919 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.42424242424242425, + "acc_stderr": 0.038592681420702615, + "acc_norm": 0.42424242424242425, + "acc_norm_stderr": 0.038592681420702615 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.5656565656565656, + "acc_stderr": 0.035315058793591834, + "acc_norm": 0.5656565656565656, + "acc_norm_stderr": 0.035315058793591834 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.6269430051813472, + "acc_stderr": 0.03490205592048574, + "acc_norm": 0.6269430051813472, + "acc_norm_stderr": 0.03490205592048574 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.4205128205128205, + "acc_stderr": 0.025028610276710855, + "acc_norm": 0.4205128205128205, + "acc_norm_stderr": 0.025028610276710855 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.23703703703703705, + "acc_stderr": 0.025928876132766118, + "acc_norm": 0.23703703703703705, + "acc_norm_stderr": 0.025928876132766118 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.44537815126050423, + "acc_stderr": 0.032284106267163895, + "acc_norm": 0.44537815126050423, + "acc_norm_stderr": 0.032284106267163895 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.23178807947019867, + "acc_stderr": 0.03445406271987053, + "acc_norm": 0.23178807947019867, + "acc_norm_stderr": 0.03445406271987053 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.5724770642201835, + "acc_stderr": 0.021210910204300437, + "acc_norm": 0.5724770642201835, + "acc_norm_stderr": 0.021210910204300437 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.3055555555555556, + "acc_stderr": 0.031415546294025445, + "acc_norm": 0.3055555555555556, + "acc_norm_stderr": 0.031415546294025445 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.5490196078431373, + "acc_stderr": 0.034924061041636124, + "acc_norm": 0.5490196078431373, + "acc_norm_stderr": 0.034924061041636124 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.5949367088607594, + "acc_stderr": 0.03195514741370672, + "acc_norm": 0.5949367088607594, + "acc_norm_stderr": 0.03195514741370672 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.57847533632287, + "acc_stderr": 0.033141902221106564, + "acc_norm": 0.57847533632287, + "acc_norm_stderr": 0.033141902221106564 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.4351145038167939, + "acc_stderr": 0.04348208051644858, + "acc_norm": 0.4351145038167939, + "acc_norm_stderr": 0.04348208051644858 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.6611570247933884, + "acc_stderr": 0.043207678075366705, + "acc_norm": 0.6611570247933884, + "acc_norm_stderr": 0.043207678075366705 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.5370370370370371, + "acc_stderr": 0.04820403072760627, + "acc_norm": 0.5370370370370371, + "acc_norm_stderr": 0.04820403072760627 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.4785276073619632, + "acc_stderr": 0.0392474687675113, + "acc_norm": 0.4785276073619632, + "acc_norm_stderr": 0.0392474687675113 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.4107142857142857, + "acc_stderr": 0.04669510663875191, + "acc_norm": 0.4107142857142857, + "acc_norm_stderr": 0.04669510663875191 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.5048543689320388, + 
"acc_stderr": 0.049505043821289195, + "acc_norm": 0.5048543689320388, + "acc_norm_stderr": 0.049505043821289195 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.7521367521367521, + "acc_stderr": 0.028286324075564393, + "acc_norm": 0.7521367521367521, + "acc_norm_stderr": 0.028286324075564393 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.47, + "acc_stderr": 0.050161355804659205, + "acc_norm": 0.47, + "acc_norm_stderr": 0.050161355804659205 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.6143039591315453, + "acc_stderr": 0.017406476619212904, + "acc_norm": 0.6143039591315453, + "acc_norm_stderr": 0.017406476619212904 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.5260115606936416, + "acc_stderr": 0.026882643434022885, + "acc_norm": 0.5260115606936416, + "acc_norm_stderr": 0.026882643434022885 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.27150837988826815, + "acc_stderr": 0.014874252168095268, + "acc_norm": 0.27150837988826815, + "acc_norm_stderr": 0.014874252168095268 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.43137254901960786, + "acc_stderr": 0.028358956313423545, + "acc_norm": 0.43137254901960786, + "acc_norm_stderr": 0.028358956313423545 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.4758842443729904, + "acc_stderr": 0.028365041542564577, + "acc_norm": 0.4758842443729904, + "acc_norm_stderr": 0.028365041542564577 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.4876543209876543, + "acc_stderr": 0.027812262269327242, + "acc_norm": 0.4876543209876543, + "acc_norm_stderr": 0.027812262269327242 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.31560283687943264, + "acc_stderr": 0.027724989449509317, + "acc_norm": 0.31560283687943264, + "acc_norm_stderr": 0.027724989449509317 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.3500651890482399, + "acc_stderr": 0.01218255231321517, + "acc_norm": 0.3500651890482399, + "acc_norm_stderr": 0.01218255231321517 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.31985294117647056, + "acc_stderr": 0.028332959514031236, + "acc_norm": 0.31985294117647056, + "acc_norm_stderr": 0.028332959514031236 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.42483660130718953, + "acc_stderr": 0.019997973035458336, + "acc_norm": 0.42483660130718953, + "acc_norm_stderr": 0.019997973035458336 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.5272727272727272, + "acc_stderr": 0.04782001791380061, + "acc_norm": 0.5272727272727272, + "acc_norm_stderr": 0.04782001791380061 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.5551020408163265, + "acc_stderr": 0.031814251181977865, + "acc_norm": 0.5551020408163265, + "acc_norm_stderr": 0.031814251181977865 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.582089552238806, + "acc_stderr": 0.03487558640462064, + "acc_norm": 0.582089552238806, + "acc_norm_stderr": 0.03487558640462064 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.65, + "acc_stderr": 0.047937248544110196, + "acc_norm": 0.65, + "acc_norm_stderr": 0.047937248544110196 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.41566265060240964, + "acc_stderr": 0.03836722176598052, + "acc_norm": 0.41566265060240964, + "acc_norm_stderr": 0.03836722176598052 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.5964912280701754, + "acc_stderr": 0.037627386999170565, + "acc_norm": 0.5964912280701754, + "acc_norm_stderr": 
0.037627386999170565 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.2864137086903305, + "mc1_stderr": 0.015826142439502346, + "mc2": 0.428667116433953, + "mc2_stderr": 0.015095774970188642 + }, + "all": { + "acc": 0.44291345536273574, + "acc_stderr": 0.03513140512866742, + "acc_norm": 0.4461672418802564, + "acc_norm_stderr": 0.03512502259107083, + "mc1": 0.2864137086903305, + "mc1_stderr": 0.015826142439502346, + "mc2": 0.428667116433953, + "mc2_stderr": 0.015095774970188642 + } + }, + "versions": { + "harness|arc:challenge|25": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "all": 0 + }, + "config_tasks": { + "harness|arc:challenge": "LM Harness task", + 
"harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + 
"harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task" + }, + "summary_tasks": { + "harness|arc:challenge|25": { + "hashes": { + "hash_examples": "17b0cae357c0259e", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "c991f8a5814f8d2f", + "hash_cont_tokens": "bc6e686b575268af" + }, + "truncated": 0, + "non-truncated": 4687, + "padded": 4687, + "non-padded": 0, + "effective_few_shots": 25.0, + "num_truncated_few_shots": 0 + }, + "harness|hellaswag|10": { + "hashes": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "9d221d28a199a09c", + "hash_cont_tokens": "e7e52367a92daa27" + }, + "truncated": 0, + "non-truncated": 40168, + "padded": 40052, + "non-padded": 116, + "effective_few_shots": 10.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hashes": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "5afce491c120616a", + "hash_cont_tokens": "bc75e4dffef3dc0e" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-anatomy|5": { + "hashes": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "f59f8967e61fde18", + "hash_cont_tokens": "f9dae0f98ef7c0f2" + }, + "truncated": 0, + "non-truncated": 540, + "padded": 540, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-astronomy|5": { + "hashes": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "2efbd578c3185755", + "hash_cont_tokens": "dff84e206d2f1e0d" + }, + "truncated": 0, + "non-truncated": 608, + "padded": 608, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-business_ethics|5": { + "hashes": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "e328ff0ca8fc7890", + "hash_cont_tokens": "bc75e4dffef3dc0e" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hashes": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "145fa357c13fe43c", + "hash_cont_tokens": "b81dd170f83789d1" + }, + "truncated": 0, + "non-truncated": 1060, + "padded": 1060, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_biology|5": { + "hashes": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "888579887c9a665a", + "hash_cont_tokens": "85c3400292af3bb8" + }, + "truncated": 0, + "non-truncated": 576, + "padded": 569, + "non-padded": 7, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_chemistry|5": { + "hashes": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "e2ca7bc279c63b09", + "hash_cont_tokens": "bc75e4dffef3dc0e" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + 
"effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_computer_science|5": { + "hashes": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "1671195b9f861e25", + "hash_cont_tokens": "bc75e4dffef3dc0e" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_mathematics|5": { + "hashes": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "47ed680b9caddd90", + "hash_cont_tokens": "bc75e4dffef3dc0e" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_medicine|5": { + "hashes": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "f71f719c1032180b", + "hash_cont_tokens": "e5cb48f872b79ee7" + }, + "truncated": 0, + "non-truncated": 692, + "padded": 692, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_physics|5": { + "hashes": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "5bde7875f9f1d5dd", + "hash_cont_tokens": "40862171591ad909" + }, + "truncated": 0, + "non-truncated": 408, + "padded": 408, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-computer_security|5": { + "hashes": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "6de5a5feab854eed", + "hash_cont_tokens": "bc75e4dffef3dc0e" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hashes": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "9a95e6bc66294b33", + "hash_cont_tokens": "36bb2a47e8ff1bd8" + }, + "truncated": 0, + "non-truncated": 940, + "padded": 940, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-econometrics|5": { + "hashes": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "b581c488a50d149d", + "hash_cont_tokens": "433685e9aa542c2d" + }, + "truncated": 0, + "non-truncated": 456, + "padded": 456, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hashes": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "0afd8e37a73e499b", + "hash_cont_tokens": "f086b291b3aa0628" + }, + "truncated": 0, + "non-truncated": 580, + "padded": 560, + "non-padded": 20, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hashes": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "6300cbea203e27e1", + "hash_cont_tokens": "4f402da407619e4d" + }, + "truncated": 0, + "non-truncated": 1512, + "padded": 1512, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-formal_logic|5": { + "hashes": { + 
"hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "dff3e10f0162548b", + "hash_cont_tokens": "80d8e3e54d900608" + }, + "truncated": 0, + "non-truncated": 504, + "padded": 504, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-global_facts|5": { + "hashes": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "133115320d06c025", + "hash_cont_tokens": "bc75e4dffef3dc0e" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_biology|5": { + "hashes": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "b9c0577c9c2daf4b", + "hash_cont_tokens": "e07819899bd63630" + }, + "truncated": 0, + "non-truncated": 1240, + "padded": 1240, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hashes": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "154d573ba30378ad", + "hash_cont_tokens": "eb6259a94d61e372" + }, + "truncated": 0, + "non-truncated": 812, + "padded": 812, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hashes": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "91754fb26290a162", + "hash_cont_tokens": "bc75e4dffef3dc0e" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hashes": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "2e32e47bd2233827", + "hash_cont_tokens": "c3336566c025bc59" + }, + "truncated": 0, + "non-truncated": 660, + "padded": 656, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_geography|5": { + "hashes": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "23f9a0b07be2ba2e", + "hash_cont_tokens": "999a32d098465441" + }, + "truncated": 0, + "non-truncated": 792, + "padded": 792, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hashes": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "de99699b0f5b162d", + "hash_cont_tokens": "361410848e01f8ed" + }, + "truncated": 0, + "non-truncated": 772, + "padded": 772, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hashes": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "c96ba9fc2d1deb87", + "hash_cont_tokens": "18f9ae57b2444806" + }, + "truncated": 0, + "non-truncated": 1560, + "padded": 1560, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hashes": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + 
"hash_input_tokens": "00509312373e95f1", + "hash_cont_tokens": "a13496e646060699" + }, + "truncated": 0, + "non-truncated": 1080, + "padded": 1080, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hashes": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "56e5bf80535561ec", + "hash_cont_tokens": "791a7a25f0571e59" + }, + "truncated": 0, + "non-truncated": 952, + "padded": 952, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_physics|5": { + "hashes": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "c9b689b4034de87c", + "hash_cont_tokens": "9677b0687811cf73" + }, + "truncated": 0, + "non-truncated": 604, + "padded": 604, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hashes": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "ccecbb5539c34c08", + "hash_cont_tokens": "6393201d9136920e" + }, + "truncated": 0, + "non-truncated": 2180, + "padded": 2180, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hashes": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "3f75abf85d2b9fe9", + "hash_cont_tokens": "17caccbb3a38c7bf" + }, + "truncated": 0, + "non-truncated": 864, + "padded": 864, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hashes": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "f52124b61354d42e", + "hash_cont_tokens": "7128e2eeb930d3b3" + }, + "truncated": 0, + "non-truncated": 816, + "padded": 816, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hashes": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "b5b75910265dc2ff", + "hash_cont_tokens": "48e22ae63ee54721" + }, + "truncated": 0, + "non-truncated": 948, + "padded": 948, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_aging|5": { + "hashes": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "a26fe13fa58cbbed", + "hash_cont_tokens": "0f40704815d5b3f6" + }, + "truncated": 0, + "non-truncated": 892, + "padded": 892, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_sexuality|5": { + "hashes": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "ad768773a7782c0c", + "hash_cont_tokens": "a9fdf5917bdddc9b" + }, + "truncated": 0, + "non-truncated": 524, + "padded": 524, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-international_law|5": { + "hashes": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "5e16e7eb92789a03", + "hash_cont_tokens": "c63e45a81fbe97b2" + }, + "truncated": 0, + 
"non-truncated": 484, + "padded": 484, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-jurisprudence|5": { + "hashes": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "6346e2bce86e76fe", + "hash_cont_tokens": "9df89edb95ea3c08" + }, + "truncated": 0, + "non-truncated": 432, + "padded": 432, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hashes": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "76581e704996be9d", + "hash_cont_tokens": "5b4f21454680a984" + }, + "truncated": 0, + "non-truncated": 652, + "padded": 648, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-machine_learning|5": { + "hashes": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "0425f5feb26f8c3f", + "hash_cont_tokens": "0c2fc7f9e9101fbb" + }, + "truncated": 0, + "non-truncated": 448, + "padded": 448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-management|5": { + "hashes": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "fb4ebd06a3a58fd2", + "hash_cont_tokens": "1279a23b3bc7b32c" + }, + "truncated": 0, + "non-truncated": 412, + "padded": 412, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-marketing|5": { + "hashes": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "33ff7687ba4867f3", + "hash_cont_tokens": "be76778b3b861344" + }, + "truncated": 0, + "non-truncated": 936, + "padded": 936, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-medical_genetics|5": { + "hashes": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "e6e5c037eb26a498", + "hash_cont_tokens": "bc75e4dffef3dc0e" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-miscellaneous|5": { + "hashes": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "6db91a99fee03712", + "hash_cont_tokens": "c61a0f86b50f0556" + }, + "truncated": 0, + "non-truncated": 3132, + "padded": 3132, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_disputes|5": { + "hashes": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "11bf0e6ef564edfb", + "hash_cont_tokens": "a208a34c74088f6c" + }, + "truncated": 0, + "non-truncated": 1384, + "padded": 1380, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hashes": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "42af46cc9aa77d99", + "hash_cont_tokens": "996ce7a5b6c4aef1" + }, + "truncated": 0, + "non-truncated": 3580, + "padded": 3580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-nutrition|5": { + "hashes": 
{ + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "484b72f626c82f6b", + "hash_cont_tokens": "9d4280b06a73f2ad" + }, + "truncated": 0, + "non-truncated": 1224, + "padded": 1224, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-philosophy|5": { + "hashes": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "d3b8f0fb55346a71", + "hash_cont_tokens": "9a708d21688a0b16" + }, + "truncated": 0, + "non-truncated": 1244, + "padded": 1244, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-prehistory|5": { + "hashes": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "f8bbc40534f54a72", + "hash_cont_tokens": "ed0ff6b6c4caf978" + }, + "truncated": 0, + "non-truncated": 1296, + "padded": 1296, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_accounting|5": { + "hashes": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "53f58b4e8af11f6e", + "hash_cont_tokens": "4fd1a023ef90b43a" + }, + "truncated": 0, + "non-truncated": 1128, + "padded": 1127, + "non-padded": 1, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_law|5": { + "hashes": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "a95688e641cf31f1", + "hash_cont_tokens": "d2c1c75d7c0e6ec5" + }, + "truncated": 0, + "non-truncated": 6136, + "padded": 6136, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_medicine|5": { + "hashes": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "fc49c75113daa07a", + "hash_cont_tokens": "ff4c3ef8a56efe40" + }, + "truncated": 0, + "non-truncated": 1088, + "padded": 1088, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_psychology|5": { + "hashes": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "92d9588dfc6ac3f9", + "hash_cont_tokens": "b4566ef91a66db7d" + }, + "truncated": 0, + "non-truncated": 2448, + "padded": 2448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-public_relations|5": { + "hashes": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "6a093aeebb63f500", + "hash_cont_tokens": "b713ae56c89df822" + }, + "truncated": 0, + "non-truncated": 440, + "padded": 440, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-security_studies|5": { + "hashes": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "2fef5cbd88ee376f", + "hash_cont_tokens": "89baef8c4b642ed0" + }, + "truncated": 0, + "non-truncated": 980, + "padded": 980, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-sociology|5": { + "hashes": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "35a984bdddcb71dc", + 
"hash_cont_tokens": "b92ed9d8dde61395" + }, + "truncated": 0, + "non-truncated": 804, + "padded": 796, + "non-padded": 8, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hashes": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "64e26afac44fd84d", + "hash_cont_tokens": "bc75e4dffef3dc0e" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-virology|5": { + "hashes": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "3bce3760b179a55c", + "hash_cont_tokens": "1c1bf88d7c979ef5" + }, + "truncated": 0, + "non-truncated": 664, + "padded": 664, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-world_religions|5": { + "hashes": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "6554c1be40513fa9", + "hash_cont_tokens": "9fbfaba067301be2" + }, + "truncated": 0, + "non-truncated": 684, + "padded": 684, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|truthfulqa:mc|0": { + "hashes": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "c1ed17b2cce8daea", + "hash_cont_tokens": "ad4c4cfcbb927635" + }, + "truncated": 0, + "non-truncated": 9996, + "padded": 9996, + "non-padded": 0, + "effective_few_shots": 0.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "d84d18e9a963753d", + "hash_full_prompts": "12b540783521a8e6", + "hash_input_tokens": "41fb26e769733d20", + "hash_cont_tokens": "d6b023af5cbcb9cf" + }, + "total_evaluation_time_secondes": "4015.9910237789154", + "truncated": 0, + "non-truncated": 111019, + "padded": 110855, + "non-padded": 164, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/JosephusCheung/Qwen-VL-LLaMAfied-7B-Chat/results_2023-11-04T23-03-04.341481.json b/eval-results/JosephusCheung/Qwen-VL-LLaMAfied-7B-Chat/results_2023-11-04T23-03-04.341481.json new file mode 100644 index 0000000000000000000000000000000000000000..390f383728ac06f34e31364aafd79106ae0371a6 --- /dev/null +++ b/eval-results/JosephusCheung/Qwen-VL-LLaMAfied-7B-Chat/results_2023-11-04T23-03-04.341481.json @@ -0,0 +1,107 @@ +{ + "config_general": { + "lighteval_sha": "167773f1d5d1647c60dadc31c9e731ab7dbcbbad", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "", + "model_name": "JosephusCheung/Qwen-VL-LLaMAfied-7B-Chat", + "model_sha": "ccbd599ac46bcfbf7020be393afeecef404bce2b", + "model_dtype": "torch.bfloat16", + "model_size": "14.48 GB" + }, + "results": { + "harness|drop|3": { + "em": 0.2633179530201342, + "em_stderr": 0.004510450588757744, + "f1": 0.33739093959731714, + "f1_stderr": 0.00449081714733481 + }, + "harness|gsm8k|5": { + "acc": 0.0, + "acc_stderr": 0.0 + }, + "harness|winogrande|5": { + "acc": 0.6566692975532754, + "acc_stderr": 0.01334482318535801 + }, + "all": { + "em": 0.2633179530201342, + "em_stderr": 0.004510450588757744, + "f1": 0.33739093959731714, + "f1_stderr": 0.00449081714733481, + "acc": 0.3283346487766377, + "acc_stderr": 0.006672411592679005 + } + }, + "versions": { + "all": 0, + "harness|drop|3": 1, + "harness|gsm8k|5": 
0, + "harness|winogrande|5": 0 + }, + "config_tasks": { + "harness|drop": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|drop|3": { + "hashes": { + "hash_examples": "1d27416e8324e9a3", + "hash_full_prompts": "a5513ff9a741b385", + "hash_input_tokens": "e4d9d658ccb42fc3", + "hash_cont_tokens": "94aa2000fae5273c" + }, + "truncated": 0, + "non_truncated": 9536, + "padded": 0, + "non_padded": 9536, + "effective_few_shots": 3.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "2282d6efefcc7579", + "hash_cont_tokens": "699821e92053335c" + }, + "truncated": 0, + "non_truncated": 1319, + "padded": 0, + "non_padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "288ed7294cb59f7d", + "hash_cont_tokens": "f4a307afe0c47a4a" + }, + "truncated": 0, + "non_truncated": 1267, + "padded": 2429, + "non_padded": 105, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "9b4d8993161e637d", + "hash_full_prompts": "08215e527b7e60a5", + "hash_input_tokens": "329083a90a12723b", + "hash_cont_tokens": "526714f92a64e65f" + }, + "truncated": 0, + "non_truncated": 12122, + "padded": 2429, + "non_padded": 10960, + "num_truncated_few_shots": 0, + "total_evaluation_time_secondes": 0 + } +} \ No newline at end of file diff --git a/eval-results/JosephusCheung/Qwen-VL-LLaMAfied-7B-Chat/results_2023-11-06T17-59-10.856732.json b/eval-results/JosephusCheung/Qwen-VL-LLaMAfied-7B-Chat/results_2023-11-06T17-59-10.856732.json new file mode 100644 index 0000000000000000000000000000000000000000..390f383728ac06f34e31364aafd79106ae0371a6 --- /dev/null +++ b/eval-results/JosephusCheung/Qwen-VL-LLaMAfied-7B-Chat/results_2023-11-06T17-59-10.856732.json @@ -0,0 +1,107 @@ +{ + "config_general": { + "lighteval_sha": "167773f1d5d1647c60dadc31c9e731ab7dbcbbad", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "", + "model_name": "JosephusCheung/Qwen-VL-LLaMAfied-7B-Chat", + "model_sha": "ccbd599ac46bcfbf7020be393afeecef404bce2b", + "model_dtype": "torch.bfloat16", + "model_size": "14.48 GB" + }, + "results": { + "harness|drop|3": { + "em": 0.2633179530201342, + "em_stderr": 0.004510450588757744, + "f1": 0.33739093959731714, + "f1_stderr": 0.00449081714733481 + }, + "harness|gsm8k|5": { + "acc": 0.0, + "acc_stderr": 0.0 + }, + "harness|winogrande|5": { + "acc": 0.6566692975532754, + "acc_stderr": 0.01334482318535801 + }, + "all": { + "em": 0.2633179530201342, + "em_stderr": 0.004510450588757744, + "f1": 0.33739093959731714, + "f1_stderr": 0.00449081714733481, + "acc": 0.3283346487766377, + "acc_stderr": 0.006672411592679005 + } + }, + "versions": { + "all": 0, + "harness|drop|3": 1, + "harness|gsm8k|5": 0, + "harness|winogrande|5": 0 + }, + "config_tasks": { + "harness|drop": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|drop|3": { + "hashes": { + "hash_examples": "1d27416e8324e9a3", + "hash_full_prompts": "a5513ff9a741b385", + "hash_input_tokens": "e4d9d658ccb42fc3", + "hash_cont_tokens": "94aa2000fae5273c" + }, + "truncated": 0, + 
"non_truncated": 9536, + "padded": 0, + "non_padded": 9536, + "effective_few_shots": 3.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "2282d6efefcc7579", + "hash_cont_tokens": "699821e92053335c" + }, + "truncated": 0, + "non_truncated": 1319, + "padded": 0, + "non_padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "288ed7294cb59f7d", + "hash_cont_tokens": "f4a307afe0c47a4a" + }, + "truncated": 0, + "non_truncated": 1267, + "padded": 2429, + "non_padded": 105, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "9b4d8993161e637d", + "hash_full_prompts": "08215e527b7e60a5", + "hash_input_tokens": "329083a90a12723b", + "hash_cont_tokens": "526714f92a64e65f" + }, + "truncated": 0, + "non_truncated": 12122, + "padded": 2429, + "non_padded": 10960, + "num_truncated_few_shots": 0, + "total_evaluation_time_secondes": 0 + } +} \ No newline at end of file diff --git a/eval-results/JosephusCheung/Yee-34B-200K-Chat/results_2023-12-05T04-15-54.776905.json b/eval-results/JosephusCheung/Yee-34B-200K-Chat/results_2023-12-05T04-15-54.776905.json new file mode 100644 index 0000000000000000000000000000000000000000..a869c813824bfc9293223700a998e2288c554036 --- /dev/null +++ b/eval-results/JosephusCheung/Yee-34B-200K-Chat/results_2023-12-05T04-15-54.776905.json @@ -0,0 +1,1409 @@ +{ + "config_general": { + "lighteval_sha": "0e4607eff593f6f842aeaa0e5fa6760f58b9d1e9", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "", + "start_time": 169672.104131814, + "end_time": 204633.837438729, + "total_evaluation_time_secondes": "34961.733306915005", + "model_name": "JosephusCheung/Yee-34B-200K-Chat", + "model_sha": "94bc30449e41628f59dd965cb7d9a8eb53ce9a45", + "model_dtype": "torch.bfloat16", + "model_size": "64.17 GB" + }, + "results": { + "harness|arc:challenge|25": { + "acc": 0.6254266211604096, + "acc_stderr": 0.014144193471893446, + "acc_norm": 0.6561433447098977, + "acc_norm_stderr": 0.013880644570156218 + }, + "harness|hellaswag|10": { + "acc": 0.6506671977693687, + "acc_stderr": 0.0047578490234119605, + "acc_norm": 0.8432583150766779, + "acc_norm_stderr": 0.003628140427399768 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.44, + "acc_stderr": 0.04988876515698589, + "acc_norm": 0.44, + "acc_norm_stderr": 0.04988876515698589 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.7333333333333333, + "acc_stderr": 0.038201699145179055, + "acc_norm": 0.7333333333333333, + "acc_norm_stderr": 0.038201699145179055 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.875, + "acc_stderr": 0.026913523521537846, + "acc_norm": 0.875, + "acc_norm_stderr": 0.026913523521537846 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.79, + "acc_stderr": 0.040936018074033256, + "acc_norm": 0.79, + "acc_norm_stderr": 0.040936018074033256 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.8301886792452831, + "acc_stderr": 0.023108393799841326, + "acc_norm": 0.8301886792452831, + "acc_norm_stderr": 0.023108393799841326 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.875, + "acc_stderr": 0.02765610492929436, + "acc_norm": 0.875, + 
"acc_norm_stderr": 0.02765610492929436 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.5, + "acc_stderr": 0.050251890762960605, + "acc_norm": 0.5, + "acc_norm_stderr": 0.050251890762960605 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.58, + "acc_stderr": 0.049604496374885836, + "acc_norm": 0.58, + "acc_norm_stderr": 0.049604496374885836 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.37, + "acc_stderr": 0.04852365870939098, + "acc_norm": 0.37, + "acc_norm_stderr": 0.04852365870939098 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.6878612716763006, + "acc_stderr": 0.03533133389323657, + "acc_norm": 0.6878612716763006, + "acc_norm_stderr": 0.03533133389323657 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.4411764705882353, + "acc_stderr": 0.04940635630605659, + "acc_norm": 0.4411764705882353, + "acc_norm_stderr": 0.04940635630605659 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.81, + "acc_stderr": 0.039427724440366234, + "acc_norm": 0.81, + "acc_norm_stderr": 0.039427724440366234 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.7617021276595745, + "acc_stderr": 0.027851252973889774, + "acc_norm": 0.7617021276595745, + "acc_norm_stderr": 0.027851252973889774 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.5526315789473685, + "acc_stderr": 0.04677473004491199, + "acc_norm": 0.5526315789473685, + "acc_norm_stderr": 0.04677473004491199 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.7517241379310344, + "acc_stderr": 0.03600105692727771, + "acc_norm": 0.7517241379310344, + "acc_norm_stderr": 0.03600105692727771 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.6375661375661376, + "acc_stderr": 0.024757473902752045, + "acc_norm": 0.6375661375661376, + "acc_norm_stderr": 0.024757473902752045 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.5158730158730159, + "acc_stderr": 0.044698818540726076, + "acc_norm": 0.5158730158730159, + "acc_norm_stderr": 0.044698818540726076 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.56, + "acc_stderr": 0.049888765156985884, + "acc_norm": 0.56, + "acc_norm_stderr": 0.049888765156985884 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.8612903225806452, + "acc_stderr": 0.019662961321414027, + "acc_norm": 0.8612903225806452, + "acc_norm_stderr": 0.019662961321414027 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.6206896551724138, + "acc_stderr": 0.034139638059062345, + "acc_norm": 0.6206896551724138, + "acc_norm_stderr": 0.034139638059062345 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.79, + "acc_stderr": 0.040936018074033256, + "acc_norm": 0.79, + "acc_norm_stderr": 0.040936018074033256 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.8787878787878788, + "acc_stderr": 0.02548549837334323, + "acc_norm": 0.8787878787878788, + "acc_norm_stderr": 0.02548549837334323 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.9040404040404041, + "acc_stderr": 0.020984808610047926, + "acc_norm": 0.9040404040404041, + "acc_norm_stderr": 0.020984808610047926 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.9689119170984456, + "acc_stderr": 0.012525310625527046, + "acc_norm": 0.9689119170984456, + "acc_norm_stderr": 0.012525310625527046 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.7794871794871795, + 
"acc_stderr": 0.0210206726808279, + "acc_norm": 0.7794871794871795, + "acc_norm_stderr": 0.0210206726808279 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.37037037037037035, + "acc_stderr": 0.02944316932303154, + "acc_norm": 0.37037037037037035, + "acc_norm_stderr": 0.02944316932303154 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.819327731092437, + "acc_stderr": 0.02499196496660077, + "acc_norm": 0.819327731092437, + "acc_norm_stderr": 0.02499196496660077 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.48344370860927155, + "acc_stderr": 0.0408024418562897, + "acc_norm": 0.48344370860927155, + "acc_norm_stderr": 0.0408024418562897 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.9137614678899083, + "acc_stderr": 0.012035597300116245, + "acc_norm": 0.9137614678899083, + "acc_norm_stderr": 0.012035597300116245 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.625, + "acc_stderr": 0.033016908987210894, + "acc_norm": 0.625, + "acc_norm_stderr": 0.033016908987210894 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.9117647058823529, + "acc_stderr": 0.019907399791316945, + "acc_norm": 0.9117647058823529, + "acc_norm_stderr": 0.019907399791316945 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.9156118143459916, + "acc_stderr": 0.01809424711647332, + "acc_norm": 0.9156118143459916, + "acc_norm_stderr": 0.01809424711647332 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.8116591928251121, + "acc_stderr": 0.026241132996407256, + "acc_norm": 0.8116591928251121, + "acc_norm_stderr": 0.026241132996407256 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.9007633587786259, + "acc_stderr": 0.026222235171477374, + "acc_norm": 0.9007633587786259, + "acc_norm_stderr": 0.026222235171477374 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.9008264462809917, + "acc_stderr": 0.02728524631275896, + "acc_norm": 0.9008264462809917, + "acc_norm_stderr": 0.02728524631275896 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.8888888888888888, + "acc_stderr": 0.03038159675665167, + "acc_norm": 0.8888888888888888, + "acc_norm_stderr": 0.03038159675665167 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.8650306748466258, + "acc_stderr": 0.02684576505455386, + "acc_norm": 0.8650306748466258, + "acc_norm_stderr": 0.02684576505455386 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.6160714285714286, + "acc_stderr": 0.04616143075028546, + "acc_norm": 0.6160714285714286, + "acc_norm_stderr": 0.04616143075028546 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.8640776699029126, + "acc_stderr": 0.033932957297610096, + "acc_norm": 0.8640776699029126, + "acc_norm_stderr": 0.033932957297610096 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.9145299145299145, + "acc_stderr": 0.01831589168562586, + "acc_norm": 0.9145299145299145, + "acc_norm_stderr": 0.01831589168562586 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.89, + "acc_stderr": 0.03144660377352203, + "acc_norm": 0.89, + "acc_norm_stderr": 0.03144660377352203 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.8978288633461047, + "acc_stderr": 0.010830724713134182, + "acc_norm": 0.8978288633461047, + "acc_norm_stderr": 0.010830724713134182 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.8092485549132948, + "acc_stderr": 0.02115267696657528, + "acc_norm": 0.8092485549132948, + 
"acc_norm_stderr": 0.02115267696657528 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.7195530726256983, + "acc_stderr": 0.015024083883322895, + "acc_norm": 0.7195530726256983, + "acc_norm_stderr": 0.015024083883322895 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.8300653594771242, + "acc_stderr": 0.02150538312123138, + "acc_norm": 0.8300653594771242, + "acc_norm_stderr": 0.02150538312123138 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.8006430868167203, + "acc_stderr": 0.022691033780549656, + "acc_norm": 0.8006430868167203, + "acc_norm_stderr": 0.022691033780549656 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.8827160493827161, + "acc_stderr": 0.017903112615281123, + "acc_norm": 0.8827160493827161, + "acc_norm_stderr": 0.017903112615281123 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.6170212765957447, + "acc_stderr": 0.02899908090480618, + "acc_norm": 0.6170212765957447, + "acc_norm_stderr": 0.02899908090480618 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.5560625814863103, + "acc_stderr": 0.012689708167787679, + "acc_norm": 0.5560625814863103, + "acc_norm_stderr": 0.012689708167787679 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.8014705882352942, + "acc_stderr": 0.02423101337054109, + "acc_norm": 0.8014705882352942, + "acc_norm_stderr": 0.02423101337054109 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.8218954248366013, + "acc_stderr": 0.015478369653108568, + "acc_norm": 0.8218954248366013, + "acc_norm_stderr": 0.015478369653108568 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.7090909090909091, + "acc_stderr": 0.04350271442923243, + "acc_norm": 0.7090909090909091, + "acc_norm_stderr": 0.04350271442923243 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.8367346938775511, + "acc_stderr": 0.023661699177098615, + "acc_norm": 0.8367346938775511, + "acc_norm_stderr": 0.023661699177098615 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.8756218905472637, + "acc_stderr": 0.023335401790166327, + "acc_norm": 0.8756218905472637, + "acc_norm_stderr": 0.023335401790166327 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.88, + "acc_stderr": 0.032659863237109066, + "acc_norm": 0.88, + "acc_norm_stderr": 0.032659863237109066 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.5903614457831325, + "acc_stderr": 0.038284011150790206, + "acc_norm": 0.5903614457831325, + "acc_norm_stderr": 0.038284011150790206 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.8654970760233918, + "acc_stderr": 0.026168221344662297, + "acc_norm": 0.8654970760233918, + "acc_norm_stderr": 0.026168221344662297 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.379436964504284, + "mc1_stderr": 0.01698703926614299, + "mc2": 0.538842608150276, + "mc2_stderr": 0.015448158590971197 + }, + "harness|winogrande|5": { + "acc": 0.797947908445146, + "acc_stderr": 0.01128501375404745 + }, + "harness|gsm8k|5": { + "acc": 0.3479909021986353, + "acc_stderr": 0.013120581030382132 + }, + "all": { + "acc": 0.7397087702526806, + "acc_stderr": 0.028697152379174293, + "acc_norm": 0.749145830773331, + "acc_norm_stderr": 0.029232668522838182, + "mc1": 0.379436964504284, + "mc1_stderr": 0.01698703926614299, + "mc2": 0.538842608150276, + "mc2_stderr": 0.015448158590971197 + } + }, + "versions": { + "all": 0, + "harness|arc:challenge|25": 0, + "harness|gsm8k|5": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + 
"harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "harness|winogrande|5": 0 + }, + "config_tasks": { + "harness|arc:challenge": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + 
"harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|arc:challenge|25": { + "hashes": { + "hash_examples": "17b0cae357c0259e", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "f52f7134dd4e8235", + "hash_cont_tokens": "e23c779c4c2dd1ec" + }, + "truncated": 0, + "non_truncated": 1172, + "padded": 4682, 
+ "non_padded": 5, + "effective_few_shots": 25.0, + "num_truncated_few_shots": 0 + }, + "harness|hellaswag|10": { + "hashes": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "8380af90422a117e", + "hash_cont_tokens": "55da5ba61989a8fe" + }, + "truncated": 0, + "non_truncated": 10042, + "padded": 40097, + "non_padded": 71, + "effective_few_shots": 10.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hashes": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "9185dc38dcc328ea", + "hash_cont_tokens": "bcc22fd85dcc85e9" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-anatomy|5": { + "hashes": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "90fdbbaaf0213cec", + "hash_cont_tokens": "5cc800feae9fa1ad" + }, + "truncated": 0, + "non_truncated": 135, + "padded": 540, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-astronomy|5": { + "hashes": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "cbe1c711494076b6", + "hash_cont_tokens": "655dbb90034f484a" + }, + "truncated": 0, + "non_truncated": 152, + "padded": 608, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-business_ethics|5": { + "hashes": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "09397035a4a73e5f", + "hash_cont_tokens": "bcc22fd85dcc85e9" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hashes": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "90c311de52544438", + "hash_cont_tokens": "f77b74d946d7fc02" + }, + "truncated": 0, + "non_truncated": 265, + "padded": 1060, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_biology|5": { + "hashes": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "d8fd4e3af4ae46c3", + "hash_cont_tokens": "1ba4b1a158d8bf3f" + }, + "truncated": 0, + "non_truncated": 144, + "padded": 576, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_chemistry|5": { + "hashes": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "da514a10083e8e97", + "hash_cont_tokens": "bcc22fd85dcc85e9" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_computer_science|5": { + "hashes": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "7ccea65975bb46d4", + "hash_cont_tokens": "bcc22fd85dcc85e9" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_mathematics|5": { + "hashes": { + "hash_examples": 
"4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "8ea8585f6adc2650", + "hash_cont_tokens": "bcc22fd85dcc85e9" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_medicine|5": { + "hashes": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "9d07c6e852253252", + "hash_cont_tokens": "78a0ebf66d91c5cf" + }, + "truncated": 0, + "non_truncated": 173, + "padded": 692, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_physics|5": { + "hashes": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "0d3d540477f9eddb", + "hash_cont_tokens": "5a030c95824fdbe5" + }, + "truncated": 0, + "non_truncated": 102, + "padded": 408, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-computer_security|5": { + "hashes": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "5ebc754afaa1fac8", + "hash_cont_tokens": "bcc22fd85dcc85e9" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hashes": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "7780b9cde8badacb", + "hash_cont_tokens": "2326dc60d0bc41b6" + }, + "truncated": 0, + "non_truncated": 235, + "padded": 940, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-econometrics|5": { + "hashes": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "8acec1576892f7ab", + "hash_cont_tokens": "be908364b6f14dd6" + }, + "truncated": 0, + "non_truncated": 114, + "padded": 456, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hashes": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "e0321889f63f18d7", + "hash_cont_tokens": "179280ef597fe1bf" + }, + "truncated": 0, + "non_truncated": 145, + "padded": 564, + "non_padded": 16, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hashes": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "60e497887b9e2608", + "hash_cont_tokens": "95cdcdaf1abd0bd2" + }, + "truncated": 0, + "non_truncated": 378, + "padded": 1512, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-formal_logic|5": { + "hashes": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "53adc0607e358206", + "hash_cont_tokens": "6a4818f3c307c346" + }, + "truncated": 0, + "non_truncated": 126, + "padded": 504, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-global_facts|5": { + "hashes": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "34682f752c1a1ac4", + "hash_cont_tokens": "bcc22fd85dcc85e9" + }, + 
"truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_biology|5": { + "hashes": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "bb5cc287970e5c14", + "hash_cont_tokens": "36d0d84455f0bdba" + }, + "truncated": 0, + "non_truncated": 310, + "padded": 1240, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hashes": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "b12197fdbc9a45f0", + "hash_cont_tokens": "c678f794a9b8ee74" + }, + "truncated": 0, + "non_truncated": 203, + "padded": 812, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hashes": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "36408b638d9d7a8d", + "hash_cont_tokens": "bcc22fd85dcc85e9" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hashes": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "652bd20e505a2826", + "hash_cont_tokens": "e9c94304326d875c" + }, + "truncated": 0, + "non_truncated": 165, + "padded": 656, + "non_padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_geography|5": { + "hashes": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "8f4cd01faf05c6f1", + "hash_cont_tokens": "f937a1349eb483eb" + }, + "truncated": 0, + "non_truncated": 198, + "padded": 792, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hashes": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "217861435fcb5576", + "hash_cont_tokens": "8b27dd3907d25b4e" + }, + "truncated": 0, + "non_truncated": 193, + "padded": 772, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hashes": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "bcedb3cf953f812f", + "hash_cont_tokens": "3763cae29e2f938c" + }, + "truncated": 0, + "non_truncated": 390, + "padded": 1560, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hashes": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "52affce916d66c97", + "hash_cont_tokens": "fd7b555352d765a4" + }, + "truncated": 0, + "non_truncated": 270, + "padded": 1080, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hashes": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "b9d29201856d353d", + "hash_cont_tokens": "61f46d4a209b9aa2" + }, + "truncated": 0, + "non_truncated": 238, + "padded": 952, + "non_padded": 0, 
+ "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_physics|5": { + "hashes": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "9c27af329cb41097", + "hash_cont_tokens": "4e7053e7c19d680d" + }, + "truncated": 0, + "non_truncated": 151, + "padded": 604, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hashes": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "192aef17a8956826", + "hash_cont_tokens": "84d19ae8790476bb" + }, + "truncated": 0, + "non_truncated": 545, + "padded": 2180, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hashes": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "a9bc6c02c6f83983", + "hash_cont_tokens": "b119c7b668213a4e" + }, + "truncated": 0, + "non_truncated": 216, + "padded": 864, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hashes": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "14741fa2bd2a4414", + "hash_cont_tokens": "a3b126bc622d571f" + }, + "truncated": 0, + "non_truncated": 204, + "padded": 816, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hashes": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "67f306eb2bf3d2cb", + "hash_cont_tokens": "9abf19ceb76331ff" + }, + "truncated": 0, + "non_truncated": 237, + "padded": 948, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_aging|5": { + "hashes": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "e5cc30c46358588f", + "hash_cont_tokens": "0e2e725ae9a898da" + }, + "truncated": 0, + "non_truncated": 223, + "padded": 892, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_sexuality|5": { + "hashes": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "10a6536adeac8632", + "hash_cont_tokens": "a94c1dea6d775249" + }, + "truncated": 0, + "non_truncated": 131, + "padded": 524, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-international_law|5": { + "hashes": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "d9015aba41ce0d5c", + "hash_cont_tokens": "3832f860859bb86b" + }, + "truncated": 0, + "non_truncated": 121, + "padded": 484, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-jurisprudence|5": { + "hashes": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "d5f2109de63c3402", + "hash_cont_tokens": "9fac5a0c364fca8a" + }, + "truncated": 0, + "non_truncated": 108, + "padded": 432, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hashes": { + 
"hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "e0b39eb7c9788cfe", + "hash_cont_tokens": "dc53ed31134ddf3a" + }, + "truncated": 0, + "non_truncated": 163, + "padded": 644, + "non_padded": 8, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-machine_learning|5": { + "hashes": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "643a872ad0f99bb0", + "hash_cont_tokens": "e272b5456d5552d6" + }, + "truncated": 0, + "non_truncated": 112, + "padded": 448, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-management|5": { + "hashes": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "1232c5b0f524b151", + "hash_cont_tokens": "7119d4642957b1f0" + }, + "truncated": 0, + "non_truncated": 103, + "padded": 412, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-marketing|5": { + "hashes": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "f1d76d4a1e08e901", + "hash_cont_tokens": "099d58c66ece3f11" + }, + "truncated": 0, + "non_truncated": 234, + "padded": 936, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-medical_genetics|5": { + "hashes": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "cd181ff20fe83b83", + "hash_cont_tokens": "bcc22fd85dcc85e9" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-miscellaneous|5": { + "hashes": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "a3d90d10e2efc569", + "hash_cont_tokens": "bae342d4e82ba8f7" + }, + "truncated": 0, + "non_truncated": 783, + "padded": 3132, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_disputes|5": { + "hashes": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "4b35576715cc147a", + "hash_cont_tokens": "578c64cbdbb1e0d4" + }, + "truncated": 0, + "non_truncated": 346, + "padded": 1384, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hashes": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "1b93703ae85294ee", + "hash_cont_tokens": "79b25f42b3fce0f9" + }, + "truncated": 0, + "non_truncated": 895, + "padded": 3580, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-nutrition|5": { + "hashes": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "6741a26253bd4258", + "hash_cont_tokens": "9d1f3b976417156c" + }, + "truncated": 0, + "non_truncated": 306, + "padded": 1224, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-philosophy|5": { + "hashes": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "730a52e273f8fcf5", + "hash_cont_tokens": "88dab560e1e06d97" + }, + "truncated": 0, + 
"non_truncated": 311, + "padded": 1244, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-prehistory|5": { + "hashes": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "9e211e939e14b414", + "hash_cont_tokens": "04ea847139fe9393" + }, + "truncated": 0, + "non_truncated": 324, + "padded": 1296, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_accounting|5": { + "hashes": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "d5761e6be99ed835", + "hash_cont_tokens": "0435ff692ad17e68" + }, + "truncated": 0, + "non_truncated": 282, + "padded": 1124, + "non_padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_law|5": { + "hashes": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "fcbc59834dbaa06c", + "hash_cont_tokens": "b852c74e9f8801bd" + }, + "truncated": 0, + "non_truncated": 1534, + "padded": 6136, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_medicine|5": { + "hashes": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "ba5999ee85a41b08", + "hash_cont_tokens": "5db0f6460652d063" + }, + "truncated": 0, + "non_truncated": 272, + "padded": 1088, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_psychology|5": { + "hashes": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "35652463c3b2d9c6", + "hash_cont_tokens": "c960676ef7f3dbe5" + }, + "truncated": 0, + "non_truncated": 612, + "padded": 2448, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-public_relations|5": { + "hashes": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "af501bc2c58d000f", + "hash_cont_tokens": "3320565f412c4b01" + }, + "truncated": 0, + "non_truncated": 110, + "padded": 440, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-security_studies|5": { + "hashes": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "5df7af45226ffc3a", + "hash_cont_tokens": "218ed775ef60aab9" + }, + "truncated": 0, + "non_truncated": 245, + "padded": 980, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-sociology|5": { + "hashes": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "5dc2e3734f4dd402", + "hash_cont_tokens": "20babf5cc4cc7f3d" + }, + "truncated": 0, + "non_truncated": 201, + "padded": 804, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hashes": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "ed972b660c40d1e4", + "hash_cont_tokens": "bcc22fd85dcc85e9" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + 
"harness|hendrycksTest-virology|5": { + "hashes": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "ed703c55cc114c98", + "hash_cont_tokens": "dc6d57296bea0882" + }, + "truncated": 0, + "non_truncated": 166, + "padded": 664, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-world_religions|5": { + "hashes": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "00cf9f5943b1480b", + "hash_cont_tokens": "37f53444db289ed3" + }, + "truncated": 0, + "non_truncated": 171, + "padded": 684, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|truthfulqa:mc|0": { + "hashes": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "5e931dfc6ab75011", + "hash_cont_tokens": "71a67034827cd30e" + }, + "truncated": 0, + "non_truncated": 817, + "padded": 9996, + "non_padded": 0, + "effective_few_shots": 0.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "bd055e8ba456ab4a", + "hash_cont_tokens": "c93e9c22fa3077a0" + }, + "truncated": 0, + "non_truncated": 1267, + "padded": 2534, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "5cae6c4034435931", + "hash_cont_tokens": "f9475f22afa2fdc5" + }, + "truncated": 0, + "non_truncated": 1319, + "padded": 0, + "non_padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "3b7fa57a057f9415", + "hash_full_prompts": "63615fc50fc9417c", + "hash_input_tokens": "2f7ca631fba4ce39", + "hash_cont_tokens": "252cc31b34422063" + }, + "truncated": 0, + "non_truncated": 28659, + "padded": 113445, + "non_padded": 1427, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/Korabbit/Llama-2-7b-chat-hf-afr-100step-flan-v2/results_2023-12-06T16-35-02.254725.json b/eval-results/Korabbit/Llama-2-7b-chat-hf-afr-100step-flan-v2/results_2023-12-06T16-35-02.254725.json new file mode 100644 index 0000000000000000000000000000000000000000..8db9d4dfc88dd3822dd81c6dec917297a3b96766 --- /dev/null +++ b/eval-results/Korabbit/Llama-2-7b-chat-hf-afr-100step-flan-v2/results_2023-12-06T16-35-02.254725.json @@ -0,0 +1,1409 @@ +{ + "config_general": { + "lighteval_sha": "0e4607eff593f6f842aeaa0e5fa6760f58b9d1e9", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "", + "start_time": 328745.472559064, + "end_time": 335373.639240252, + "total_evaluation_time_secondes": "6628.166681188042", + "model_name": "Korabbit/Llama-2-7b-chat-hf-afr-100step-flan-v2", + "model_sha": "0f1873b505a5f32ca429c164a229bab663eaf617", + "model_dtype": "torch.float16", + "model_size": "12.61 GB" + }, + "results": { + "harness|arc:challenge|25": { + "acc": 0.4948805460750853, + "acc_stderr": 0.01461062489030916, + "acc_norm": 0.5324232081911263, + "acc_norm_stderr": 0.014580637569995421 + }, + "harness|hellaswag|10": { + "acc": 0.5962955586536547, + "acc_stderr": 0.004896368185765231, + "acc_norm": 0.7843059151563434, + "acc_norm_stderr": 0.0041046239918463645 + }, + 
"harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.28, + "acc_stderr": 0.04512608598542129, + "acc_norm": 0.28, + "acc_norm_stderr": 0.04512608598542129 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.4222222222222222, + "acc_stderr": 0.04266763404099582, + "acc_norm": 0.4222222222222222, + "acc_norm_stderr": 0.04266763404099582 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.48026315789473684, + "acc_stderr": 0.040657710025626036, + "acc_norm": 0.48026315789473684, + "acc_norm_stderr": 0.040657710025626036 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.53, + "acc_stderr": 0.050161355804659205, + "acc_norm": 0.53, + "acc_norm_stderr": 0.050161355804659205 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.5433962264150943, + "acc_stderr": 0.03065674869673943, + "acc_norm": 0.5433962264150943, + "acc_norm_stderr": 0.03065674869673943 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.5208333333333334, + "acc_stderr": 0.041775789507399935, + "acc_norm": 0.5208333333333334, + "acc_norm_stderr": 0.041775789507399935 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.28, + "acc_stderr": 0.04512608598542127, + "acc_norm": 0.28, + "acc_norm_stderr": 0.04512608598542127 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.39, + "acc_stderr": 0.04902071300001975, + "acc_norm": 0.39, + "acc_norm_stderr": 0.04902071300001975 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.36, + "acc_stderr": 0.04824181513244218, + "acc_norm": 0.36, + "acc_norm_stderr": 0.04824181513244218 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.4046242774566474, + "acc_stderr": 0.03742461193887248, + "acc_norm": 0.4046242774566474, + "acc_norm_stderr": 0.03742461193887248 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.22549019607843138, + "acc_stderr": 0.041583075330832865, + "acc_norm": 0.22549019607843138, + "acc_norm_stderr": 0.041583075330832865 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.58, + "acc_stderr": 0.049604496374885836, + "acc_norm": 0.58, + "acc_norm_stderr": 0.049604496374885836 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.4085106382978723, + "acc_stderr": 0.03213418026701576, + "acc_norm": 0.4085106382978723, + "acc_norm_stderr": 0.03213418026701576 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.37719298245614036, + "acc_stderr": 0.045595221419582166, + "acc_norm": 0.37719298245614036, + "acc_norm_stderr": 0.045595221419582166 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.503448275862069, + "acc_stderr": 0.041665675771015785, + "acc_norm": 0.503448275862069, + "acc_norm_stderr": 0.041665675771015785 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.29365079365079366, + "acc_stderr": 0.023456037383982026, + "acc_norm": 0.29365079365079366, + "acc_norm_stderr": 0.023456037383982026 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.24603174603174602, + "acc_stderr": 0.03852273364924314, + "acc_norm": 0.24603174603174602, + "acc_norm_stderr": 0.03852273364924314 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.36, + "acc_stderr": 0.048241815132442176, + "acc_norm": 0.36, + "acc_norm_stderr": 0.048241815132442176 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.5225806451612903, + "acc_stderr": 0.02841498501970786, + "acc_norm": 0.5225806451612903, + "acc_norm_stderr": 0.02841498501970786 + }, + 
"harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.3694581280788177, + "acc_stderr": 0.033959703819985726, + "acc_norm": 0.3694581280788177, + "acc_norm_stderr": 0.033959703819985726 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.41, + "acc_stderr": 0.04943110704237102, + "acc_norm": 0.41, + "acc_norm_stderr": 0.04943110704237102 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.5878787878787879, + "acc_stderr": 0.03843566993588717, + "acc_norm": 0.5878787878787879, + "acc_norm_stderr": 0.03843566993588717 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.6060606060606061, + "acc_stderr": 0.034812853382329624, + "acc_norm": 0.6060606060606061, + "acc_norm_stderr": 0.034812853382329624 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.7150259067357513, + "acc_stderr": 0.032577140777096614, + "acc_norm": 0.7150259067357513, + "acc_norm_stderr": 0.032577140777096614 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.4256410256410256, + "acc_stderr": 0.02506909438729654, + "acc_norm": 0.4256410256410256, + "acc_norm_stderr": 0.02506909438729654 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.26296296296296295, + "acc_stderr": 0.02684205787383371, + "acc_norm": 0.26296296296296295, + "acc_norm_stderr": 0.02684205787383371 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.42016806722689076, + "acc_stderr": 0.03206183783236152, + "acc_norm": 0.42016806722689076, + "acc_norm_stderr": 0.03206183783236152 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.2913907284768212, + "acc_stderr": 0.03710185726119995, + "acc_norm": 0.2913907284768212, + "acc_norm_stderr": 0.03710185726119995 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.6770642201834862, + "acc_stderr": 0.02004811592341532, + "acc_norm": 0.6770642201834862, + "acc_norm_stderr": 0.02004811592341532 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.3287037037037037, + "acc_stderr": 0.032036140846700596, + "acc_norm": 0.3287037037037037, + "acc_norm_stderr": 0.032036140846700596 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.6666666666666666, + "acc_stderr": 0.03308611113236434, + "acc_norm": 0.6666666666666666, + "acc_norm_stderr": 0.03308611113236434 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.679324894514768, + "acc_stderr": 0.030381931949990403, + "acc_norm": 0.679324894514768, + "acc_norm_stderr": 0.030381931949990403 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.5695067264573991, + "acc_stderr": 0.033231973029429394, + "acc_norm": 0.5695067264573991, + "acc_norm_stderr": 0.033231973029429394 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.5725190839694656, + "acc_stderr": 0.04338920305792401, + "acc_norm": 0.5725190839694656, + "acc_norm_stderr": 0.04338920305792401 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.6363636363636364, + "acc_stderr": 0.043913262867240704, + "acc_norm": 0.6363636363636364, + "acc_norm_stderr": 0.043913262867240704 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.5925925925925926, + "acc_stderr": 0.04750077341199984, + "acc_norm": 0.5925925925925926, + "acc_norm_stderr": 0.04750077341199984 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.558282208588957, + "acc_stderr": 0.03901591825836184, + "acc_norm": 0.558282208588957, + "acc_norm_stderr": 
0.03901591825836184 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.3125, + "acc_stderr": 0.043994650575715215, + "acc_norm": 0.3125, + "acc_norm_stderr": 0.043994650575715215 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.6796116504854369, + "acc_stderr": 0.04620284082280041, + "acc_norm": 0.6796116504854369, + "acc_norm_stderr": 0.04620284082280041 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.717948717948718, + "acc_stderr": 0.029480360549541194, + "acc_norm": 0.717948717948718, + "acc_norm_stderr": 0.029480360549541194 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.49, + "acc_stderr": 0.05024183937956911, + "acc_norm": 0.49, + "acc_norm_stderr": 0.05024183937956911 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.6768837803320562, + "acc_stderr": 0.016723726512343048, + "acc_norm": 0.6768837803320562, + "acc_norm_stderr": 0.016723726512343048 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.5202312138728323, + "acc_stderr": 0.026897049996382875, + "acc_norm": 0.5202312138728323, + "acc_norm_stderr": 0.026897049996382875 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.2223463687150838, + "acc_stderr": 0.013907189208156881, + "acc_norm": 0.2223463687150838, + "acc_norm_stderr": 0.013907189208156881 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.5130718954248366, + "acc_stderr": 0.028620130800700246, + "acc_norm": 0.5130718954248366, + "acc_norm_stderr": 0.028620130800700246 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.5691318327974276, + "acc_stderr": 0.028125340983972714, + "acc_norm": 0.5691318327974276, + "acc_norm_stderr": 0.028125340983972714 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.5679012345679012, + "acc_stderr": 0.027563010971606676, + "acc_norm": 0.5679012345679012, + "acc_norm_stderr": 0.027563010971606676 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.36879432624113473, + "acc_stderr": 0.02878222756134724, + "acc_norm": 0.36879432624113473, + "acc_norm_stderr": 0.02878222756134724 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.3500651890482399, + "acc_stderr": 0.012182552313215175, + "acc_norm": 0.3500651890482399, + "acc_norm_stderr": 0.012182552313215175 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.45588235294117646, + "acc_stderr": 0.03025437257397668, + "acc_norm": 0.45588235294117646, + "acc_norm_stderr": 0.03025437257397668 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.48366013071895425, + "acc_stderr": 0.020217030653186453, + "acc_norm": 0.48366013071895425, + "acc_norm_stderr": 0.020217030653186453 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.5363636363636364, + "acc_stderr": 0.04776449162396197, + "acc_norm": 0.5363636363636364, + "acc_norm_stderr": 0.04776449162396197 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.5224489795918368, + "acc_stderr": 0.031976941187136725, + "acc_norm": 0.5224489795918368, + "acc_norm_stderr": 0.031976941187136725 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.6467661691542289, + "acc_stderr": 0.03379790611796777, + "acc_norm": 0.6467661691542289, + "acc_norm_stderr": 0.03379790611796777 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.72, + "acc_stderr": 0.045126085985421276, + "acc_norm": 0.72, + "acc_norm_stderr": 0.045126085985421276 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.43373493975903615, + "acc_stderr": 0.03858158940685517, + 
"acc_norm": 0.43373493975903615, + "acc_norm_stderr": 0.03858158940685517 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.7192982456140351, + "acc_stderr": 0.03446296217088427, + "acc_norm": 0.7192982456140351, + "acc_norm_stderr": 0.03446296217088427 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.30354957160342716, + "mc1_stderr": 0.016095884155386847, + "mc2": 0.45656512530226173, + "mc2_stderr": 0.01564502150544874 + }, + "harness|winogrande|5": { + "acc": 0.7229676400947119, + "acc_stderr": 0.012577891015342414 + }, + "harness|gsm8k|5": { + "acc": 0.19484457922668688, + "acc_stderr": 0.010910039409578768 + }, + "all": { + "acc": 0.4854669414522855, + "acc_stderr": 0.03427653682216115, + "acc_norm": 0.4901902411506228, + "acc_norm_stderr": 0.03502642515614706, + "mc1": 0.30354957160342716, + "mc1_stderr": 0.016095884155386847, + "mc2": 0.45656512530226173, + "mc2_stderr": 0.01564502150544874 + } + }, + "versions": { + "all": 0, + "harness|arc:challenge|25": 0, + "harness|gsm8k|5": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + 
"harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "harness|winogrande|5": 0 + }, + "config_tasks": { + "harness|arc:challenge": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM 
Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|arc:challenge|25": { + "hashes": { + "hash_examples": "17b0cae357c0259e", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "ca48d52265c0051f", + "hash_cont_tokens": "e8abf848493b50f7" + }, + "truncated": 0, + "non_truncated": 1172, + "padded": 4687, + "non_padded": 0, + "effective_few_shots": 25.0, + "num_truncated_few_shots": 0 + }, + "harness|hellaswag|10": { + "hashes": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "4975ded0ed31f702", + "hash_cont_tokens": "9fe0a5c42e1532db" + }, + "truncated": 0, + "non_truncated": 10042, + "padded": 40019, + "non_padded": 149, + "effective_few_shots": 10.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hashes": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "8ff523ec326d5d55", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-anatomy|5": { + "hashes": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "742bd6a389a8ef40", + "hash_cont_tokens": "f11971a765cb609f" + }, + "truncated": 0, + "non_truncated": 135, + "padded": 540, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-astronomy|5": { + "hashes": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "aa9743839c83bd9f", + "hash_cont_tokens": "440a970fadecdc7b" + }, + "truncated": 0, + "non_truncated": 152, + "padded": 608, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-business_ethics|5": { + "hashes": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "60f6ed52e2a2987a", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hashes": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "6080d9f3c5930be0", + "hash_cont_tokens": "7ecd60c25b9bfe5b" + }, + "truncated": 0, + "non_truncated": 265, + "padded": 1060, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_biology|5": { + "hashes": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": 
"2b460b75f1fdfefd", + "hash_input_tokens": "873319724ad65589", + "hash_cont_tokens": "875cde3af7a0ee14" + }, + "truncated": 0, + "non_truncated": 144, + "padded": 564, + "non_padded": 12, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_chemistry|5": { + "hashes": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "8366d04d12b154a7", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_computer_science|5": { + "hashes": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "1724a282fb269fd7", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_mathematics|5": { + "hashes": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "b7aa815781eae172", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_medicine|5": { + "hashes": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "0003d13e86bc8c1a", + "hash_cont_tokens": "702fb6d82ff0d6ac" + }, + "truncated": 0, + "non_truncated": 173, + "padded": 692, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_physics|5": { + "hashes": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "32b28762dd077c78", + "hash_cont_tokens": "f7b8097afc16a47c" + }, + "truncated": 0, + "non_truncated": 102, + "padded": 404, + "non_padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-computer_security|5": { + "hashes": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "19dd0e1895125d49", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hashes": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "761c7ce187b3338a", + "hash_cont_tokens": "aa0e8bc655f2f641" + }, + "truncated": 0, + "non_truncated": 235, + "padded": 940, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-econometrics|5": { + "hashes": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "dae74024ebc12b2b", + "hash_cont_tokens": "b1cc6e7e9fcd3827" + }, + "truncated": 0, + "non_truncated": 114, + "padded": 456, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hashes": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "5fa8050688a246ed", + "hash_cont_tokens": "2425a3f084a591ef" + }, + "truncated": 0, + 
"non_truncated": 145, + "padded": 580, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hashes": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "2da3f8d7d1515cc6", + "hash_cont_tokens": "bd87bf0c060fd925" + }, + "truncated": 0, + "non_truncated": 378, + "padded": 1512, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-formal_logic|5": { + "hashes": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "907de61bbe46dada", + "hash_cont_tokens": "eb8932890e0605db" + }, + "truncated": 0, + "non_truncated": 126, + "padded": 504, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-global_facts|5": { + "hashes": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "d7549fe9ac133643", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_biology|5": { + "hashes": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "b449ae8cd622fb96", + "hash_cont_tokens": "1ddcb86d28cde266" + }, + "truncated": 0, + "non_truncated": 310, + "padded": 1240, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hashes": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "a447bd1574b5e26c", + "hash_cont_tokens": "176c8dcff38c5f8f" + }, + "truncated": 0, + "non_truncated": 203, + "padded": 812, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hashes": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "56312a0c3d85ae90", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hashes": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "5002f4ac8b1562ca", + "hash_cont_tokens": "674fc454bdc5ac93" + }, + "truncated": 0, + "non_truncated": 165, + "padded": 656, + "non_padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_geography|5": { + "hashes": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "b4f9efd054b0149d", + "hash_cont_tokens": "03a5012b916274ea" + }, + "truncated": 0, + "non_truncated": 198, + "padded": 792, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hashes": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "6e010d01707b5a01", + "hash_cont_tokens": "873d2aab226ba1d8" + }, + "truncated": 0, + "non_truncated": 193, + "padded": 772, + "non_padded": 0, + "effective_few_shots": 5.0, + 
"num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hashes": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "fc1f6e824ba386d7", + "hash_cont_tokens": "c583432ad27fcfe0" + }, + "truncated": 0, + "non_truncated": 390, + "padded": 1560, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hashes": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "3a485a40c8432ece", + "hash_cont_tokens": "d7907b61bcb8c123" + }, + "truncated": 0, + "non_truncated": 270, + "padded": 1080, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hashes": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "a7dd9ca4bbda3752", + "hash_cont_tokens": "f47f041de50333b9" + }, + "truncated": 0, + "non_truncated": 238, + "padded": 952, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_physics|5": { + "hashes": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "d7ea631399a73865", + "hash_cont_tokens": "0d56317b3e5eedb5" + }, + "truncated": 0, + "non_truncated": 151, + "padded": 604, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hashes": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "d12816cf88146011", + "hash_cont_tokens": "09ba1243e7390c0f" + }, + "truncated": 0, + "non_truncated": 545, + "padded": 2180, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hashes": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "9763ecaef4814c21", + "hash_cont_tokens": "9cc29889c3d3f77d" + }, + "truncated": 0, + "non_truncated": 216, + "padded": 864, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hashes": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "c639cce12a46ebad", + "hash_cont_tokens": "cdd0b3dc06d933e5" + }, + "truncated": 0, + "non_truncated": 204, + "padded": 816, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hashes": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "b9762065cce6f3a6", + "hash_cont_tokens": "e02816433ff28daf" + }, + "truncated": 0, + "non_truncated": 237, + "padded": 948, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_aging|5": { + "hashes": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "84157fee0b6d0f3c", + "hash_cont_tokens": "142a4a8a1138a214" + }, + "truncated": 0, + "non_truncated": 223, + "padded": 892, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_sexuality|5": { + "hashes": { + 
"hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "ade303e1ae3c016f", + "hash_cont_tokens": "bc54813e809b796d" + }, + "truncated": 0, + "non_truncated": 131, + "padded": 524, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-international_law|5": { + "hashes": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "e5482e1c23c23d35", + "hash_cont_tokens": "8ea8c5ff76a15bca" + }, + "truncated": 0, + "non_truncated": 121, + "padded": 484, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-jurisprudence|5": { + "hashes": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "4415eeb9bad0507b", + "hash_cont_tokens": "e3a8cd951b6e3469" + }, + "truncated": 0, + "non_truncated": 108, + "padded": 432, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hashes": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "e6b5271422ecbaa8", + "hash_cont_tokens": "3e9e0bdc248fd88a" + }, + "truncated": 0, + "non_truncated": 163, + "padded": 644, + "non_padded": 8, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-machine_learning|5": { + "hashes": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "e719cb83196977d8", + "hash_cont_tokens": "55b12fb138c6a064" + }, + "truncated": 0, + "non_truncated": 112, + "padded": 448, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-management|5": { + "hashes": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "155da0e62b39e804", + "hash_cont_tokens": "a01d6d39a83c4597" + }, + "truncated": 0, + "non_truncated": 103, + "padded": 412, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-marketing|5": { + "hashes": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "38466c242259e6d3", + "hash_cont_tokens": "6aeaed4d823c98aa" + }, + "truncated": 0, + "non_truncated": 234, + "padded": 932, + "non_padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-medical_genetics|5": { + "hashes": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "0dd129e92538a7f6", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-miscellaneous|5": { + "hashes": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "d108a883fc3e022f", + "hash_cont_tokens": "9b0ab02a64603081" + }, + "truncated": 0, + "non_truncated": 783, + "padded": 3132, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_disputes|5": { + "hashes": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "0e7b7df82884a2d5", + "hash_cont_tokens": "3b8bbe9108e55ce9" + }, + 
"truncated": 0, + "non_truncated": 346, + "padded": 1364, + "non_padded": 20, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hashes": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "7c220f5613cd8426", + "hash_cont_tokens": "3e9bfc0362e97330" + }, + "truncated": 0, + "non_truncated": 895, + "padded": 3580, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-nutrition|5": { + "hashes": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "35de1609a9a763a9", + "hash_cont_tokens": "23b2dc6ee2da4cfc" + }, + "truncated": 0, + "non_truncated": 306, + "padded": 1224, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-philosophy|5": { + "hashes": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "a1dcfa9c80490d06", + "hash_cont_tokens": "9f6ff69d23a48783" + }, + "truncated": 0, + "non_truncated": 311, + "padded": 1244, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-prehistory|5": { + "hashes": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "a091cf645d2415e0", + "hash_cont_tokens": "d6458d743d875837" + }, + "truncated": 0, + "non_truncated": 324, + "padded": 1296, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_accounting|5": { + "hashes": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "e9df32a33f85290c", + "hash_cont_tokens": "922a195f53a35662" + }, + "truncated": 0, + "non_truncated": 282, + "padded": 1128, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_law|5": { + "hashes": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "c9f7583fff66d361", + "hash_cont_tokens": "2e590029ef41fbcd" + }, + "truncated": 0, + "non_truncated": 1534, + "padded": 6136, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_medicine|5": { + "hashes": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "40a933f829116f8d", + "hash_cont_tokens": "7cfee54dbddd5a98" + }, + "truncated": 0, + "non_truncated": 272, + "padded": 1088, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_psychology|5": { + "hashes": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "0f6a92c3a2062b48", + "hash_cont_tokens": "a86677b2a45c20e1" + }, + "truncated": 0, + "non_truncated": 612, + "padded": 2448, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-public_relations|5": { + "hashes": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "29a08e9bfbe9b2f0", + "hash_cont_tokens": "0d756ccaae031757" + }, + "truncated": 0, + "non_truncated": 110, + "padded": 440, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + 
"harness|hendrycksTest-security_studies|5": { + "hashes": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "32a03f1f22a6e103", + "hash_cont_tokens": "b2229bc2cfbf594b" + }, + "truncated": 0, + "non_truncated": 245, + "padded": 980, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-sociology|5": { + "hashes": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "1de5c52d2b2831d7", + "hash_cont_tokens": "c3a3bdfd177eed5b" + }, + "truncated": 0, + "non_truncated": 201, + "padded": 800, + "non_padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hashes": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "add924961f7f4146", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-virology|5": { + "hashes": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "e0653601c466b1bc", + "hash_cont_tokens": "af8b3658088cb37f" + }, + "truncated": 0, + "non_truncated": 166, + "padded": 664, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-world_religions|5": { + "hashes": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "ac600d612445156d", + "hash_cont_tokens": "060118bef6de4e0a" + }, + "truncated": 0, + "non_truncated": 171, + "padded": 684, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|truthfulqa:mc|0": { + "hashes": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "a03ce28b7fd06aa7", + "hash_cont_tokens": "f5da56a132aab151" + }, + "truncated": 0, + "non_truncated": 817, + "padded": 9996, + "non_padded": 0, + "effective_few_shots": 0.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "72067255e368e24e", + "hash_cont_tokens": "f08975ad6f2d5864" + }, + "truncated": 0, + "non_truncated": 1267, + "padded": 2534, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "bda342e47b5099b2", + "hash_cont_tokens": "7432e80af384007c" + }, + "truncated": 0, + "non_truncated": 1319, + "padded": 0, + "non_padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "3b7fa57a057f9415", + "hash_full_prompts": "63615fc50fc9417c", + "hash_input_tokens": "a8fa53915153e1db", + "hash_cont_tokens": "b24849461d402299" + }, + "truncated": 0, + "non_truncated": 28659, + "padded": 113348, + "non_padded": 1524, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/Korabbit/Llama-2-7b-chat-hf-afr-100step-flan/results_2023-12-04T11-18-09.449875.json b/eval-results/Korabbit/Llama-2-7b-chat-hf-afr-100step-flan/results_2023-12-04T11-18-09.449875.json new file mode 100644 index 
0000000000000000000000000000000000000000..cbc43e7ae9c5e0026aba2589f3279b3c6ecbeeac --- /dev/null +++ b/eval-results/Korabbit/Llama-2-7b-chat-hf-afr-100step-flan/results_2023-12-04T11-18-09.449875.json @@ -0,0 +1,1409 @@ +{ + "config_general": { + "lighteval_sha": "0e4607eff593f6f842aeaa0e5fa6760f58b9d1e9", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "", + "start_time": 131228.421979807, + "end_time": 137828.454550048, + "total_evaluation_time_secondes": "6600.03257024099", + "model_name": "Korabbit/Llama-2-7b-chat-hf-afr-100step-flan", + "model_sha": "1d502ae9a15c38118baa5ae55e048a080cb05c89", + "model_dtype": "torch.float16", + "model_size": "12.61 GB" + }, + "results": { + "harness|arc:challenge|25": { + "acc": 0.4948805460750853, + "acc_stderr": 0.01461062489030916, + "acc_norm": 0.5290102389078498, + "acc_norm_stderr": 0.014586776355294323 + }, + "harness|hellaswag|10": { + "acc": 0.5961959768970324, + "acc_stderr": 0.004896563126116811, + "acc_norm": 0.7844054969129656, + "acc_norm_stderr": 0.004103936879526262 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.28, + "acc_stderr": 0.04512608598542129, + "acc_norm": 0.28, + "acc_norm_stderr": 0.04512608598542129 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.4222222222222222, + "acc_stderr": 0.04266763404099582, + "acc_norm": 0.4222222222222222, + "acc_norm_stderr": 0.04266763404099582 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.48026315789473684, + "acc_stderr": 0.040657710025626036, + "acc_norm": 0.48026315789473684, + "acc_norm_stderr": 0.040657710025626036 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.53, + "acc_stderr": 0.050161355804659205, + "acc_norm": 0.53, + "acc_norm_stderr": 0.050161355804659205 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.5433962264150943, + "acc_stderr": 0.03065674869673943, + "acc_norm": 0.5433962264150943, + "acc_norm_stderr": 0.03065674869673943 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.5208333333333334, + "acc_stderr": 0.041775789507399935, + "acc_norm": 0.5208333333333334, + "acc_norm_stderr": 0.041775789507399935 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.27, + "acc_stderr": 0.044619604333847394, + "acc_norm": 0.27, + "acc_norm_stderr": 0.044619604333847394 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.39, + "acc_stderr": 0.04902071300001975, + "acc_norm": 0.39, + "acc_norm_stderr": 0.04902071300001975 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.36, + "acc_stderr": 0.04824181513244218, + "acc_norm": 0.36, + "acc_norm_stderr": 0.04824181513244218 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.4046242774566474, + "acc_stderr": 0.03742461193887248, + "acc_norm": 0.4046242774566474, + "acc_norm_stderr": 0.03742461193887248 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.22549019607843138, + "acc_stderr": 0.041583075330832865, + "acc_norm": 0.22549019607843138, + "acc_norm_stderr": 0.041583075330832865 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.58, + "acc_stderr": 0.049604496374885836, + "acc_norm": 0.58, + "acc_norm_stderr": 0.049604496374885836 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.4085106382978723, + "acc_stderr": 0.03213418026701576, + "acc_norm": 0.4085106382978723, + "acc_norm_stderr": 0.03213418026701576 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.37719298245614036, + 
"acc_stderr": 0.045595221419582166, + "acc_norm": 0.37719298245614036, + "acc_norm_stderr": 0.045595221419582166 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.503448275862069, + "acc_stderr": 0.041665675771015785, + "acc_norm": 0.503448275862069, + "acc_norm_stderr": 0.041665675771015785 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.2962962962962963, + "acc_stderr": 0.023517294335963286, + "acc_norm": 0.2962962962962963, + "acc_norm_stderr": 0.023517294335963286 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.24603174603174602, + "acc_stderr": 0.03852273364924314, + "acc_norm": 0.24603174603174602, + "acc_norm_stderr": 0.03852273364924314 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.36, + "acc_stderr": 0.048241815132442176, + "acc_norm": 0.36, + "acc_norm_stderr": 0.048241815132442176 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.5225806451612903, + "acc_stderr": 0.02841498501970786, + "acc_norm": 0.5225806451612903, + "acc_norm_stderr": 0.02841498501970786 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.3645320197044335, + "acc_stderr": 0.033864057460620905, + "acc_norm": 0.3645320197044335, + "acc_norm_stderr": 0.033864057460620905 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.41, + "acc_stderr": 0.04943110704237102, + "acc_norm": 0.41, + "acc_norm_stderr": 0.04943110704237102 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.5878787878787879, + "acc_stderr": 0.03843566993588717, + "acc_norm": 0.5878787878787879, + "acc_norm_stderr": 0.03843566993588717 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.6060606060606061, + "acc_stderr": 0.034812853382329624, + "acc_norm": 0.6060606060606061, + "acc_norm_stderr": 0.034812853382329624 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.7150259067357513, + "acc_stderr": 0.032577140777096614, + "acc_norm": 0.7150259067357513, + "acc_norm_stderr": 0.032577140777096614 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.4256410256410256, + "acc_stderr": 0.02506909438729654, + "acc_norm": 0.4256410256410256, + "acc_norm_stderr": 0.02506909438729654 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.26296296296296295, + "acc_stderr": 0.02684205787383371, + "acc_norm": 0.26296296296296295, + "acc_norm_stderr": 0.02684205787383371 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.42436974789915966, + "acc_stderr": 0.03210479051015776, + "acc_norm": 0.42436974789915966, + "acc_norm_stderr": 0.03210479051015776 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.2847682119205298, + "acc_stderr": 0.036848815213890225, + "acc_norm": 0.2847682119205298, + "acc_norm_stderr": 0.036848815213890225 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.6770642201834862, + "acc_stderr": 0.02004811592341532, + "acc_norm": 0.6770642201834862, + "acc_norm_stderr": 0.02004811592341532 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.3333333333333333, + "acc_stderr": 0.0321495214780275, + "acc_norm": 0.3333333333333333, + "acc_norm_stderr": 0.0321495214780275 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.6666666666666666, + "acc_stderr": 0.03308611113236434, + "acc_norm": 0.6666666666666666, + "acc_norm_stderr": 0.03308611113236434 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 
0.679324894514768, + "acc_stderr": 0.030381931949990403, + "acc_norm": 0.679324894514768, + "acc_norm_stderr": 0.030381931949990403 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.5695067264573991, + "acc_stderr": 0.033231973029429394, + "acc_norm": 0.5695067264573991, + "acc_norm_stderr": 0.033231973029429394 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.5725190839694656, + "acc_stderr": 0.04338920305792401, + "acc_norm": 0.5725190839694656, + "acc_norm_stderr": 0.04338920305792401 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.6363636363636364, + "acc_stderr": 0.043913262867240704, + "acc_norm": 0.6363636363636364, + "acc_norm_stderr": 0.043913262867240704 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.6018518518518519, + "acc_stderr": 0.04732332615978813, + "acc_norm": 0.6018518518518519, + "acc_norm_stderr": 0.04732332615978813 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.558282208588957, + "acc_stderr": 0.03901591825836184, + "acc_norm": 0.558282208588957, + "acc_norm_stderr": 0.03901591825836184 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.3125, + "acc_stderr": 0.043994650575715215, + "acc_norm": 0.3125, + "acc_norm_stderr": 0.043994650575715215 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.6699029126213593, + "acc_stderr": 0.04656147110012351, + "acc_norm": 0.6699029126213593, + "acc_norm_stderr": 0.04656147110012351 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.717948717948718, + "acc_stderr": 0.029480360549541194, + "acc_norm": 0.717948717948718, + "acc_norm_stderr": 0.029480360549541194 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.48, + "acc_stderr": 0.050211673156867795, + "acc_norm": 0.48, + "acc_norm_stderr": 0.050211673156867795 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.6768837803320562, + "acc_stderr": 0.016723726512343048, + "acc_norm": 0.6768837803320562, + "acc_norm_stderr": 0.016723726512343048 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.5173410404624278, + "acc_stderr": 0.02690290045866664, + "acc_norm": 0.5173410404624278, + "acc_norm_stderr": 0.02690290045866664 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.2212290502793296, + "acc_stderr": 0.013882164598887275, + "acc_norm": 0.2212290502793296, + "acc_norm_stderr": 0.013882164598887275 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.5163398692810458, + "acc_stderr": 0.02861462475280544, + "acc_norm": 0.5163398692810458, + "acc_norm_stderr": 0.02861462475280544 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.5659163987138264, + "acc_stderr": 0.02815023224453559, + "acc_norm": 0.5659163987138264, + "acc_norm_stderr": 0.02815023224453559 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.5648148148148148, + "acc_stderr": 0.0275860062216077, + "acc_norm": 0.5648148148148148, + "acc_norm_stderr": 0.0275860062216077 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.36879432624113473, + "acc_stderr": 0.02878222756134724, + "acc_norm": 0.36879432624113473, + "acc_norm_stderr": 0.02878222756134724 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.34876140808344197, + "acc_stderr": 0.01217203515712712, + "acc_norm": 0.34876140808344197, + "acc_norm_stderr": 0.01217203515712712 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.45955882352941174, + "acc_stderr": 0.03027332507734576, + "acc_norm": 0.45955882352941174, + "acc_norm_stderr": 0.03027332507734576 + 
}, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.48366013071895425, + "acc_stderr": 0.020217030653186453, + "acc_norm": 0.48366013071895425, + "acc_norm_stderr": 0.020217030653186453 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.5363636363636364, + "acc_stderr": 0.04776449162396197, + "acc_norm": 0.5363636363636364, + "acc_norm_stderr": 0.04776449162396197 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.5224489795918368, + "acc_stderr": 0.031976941187136725, + "acc_norm": 0.5224489795918368, + "acc_norm_stderr": 0.031976941187136725 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.6467661691542289, + "acc_stderr": 0.03379790611796777, + "acc_norm": 0.6467661691542289, + "acc_norm_stderr": 0.03379790611796777 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.72, + "acc_stderr": 0.045126085985421276, + "acc_norm": 0.72, + "acc_norm_stderr": 0.045126085985421276 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.43373493975903615, + "acc_stderr": 0.03858158940685517, + "acc_norm": 0.43373493975903615, + "acc_norm_stderr": 0.03858158940685517 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.7251461988304093, + "acc_stderr": 0.034240429246915824, + "acc_norm": 0.7251461988304093, + "acc_norm_stderr": 0.034240429246915824 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.30354957160342716, + "mc1_stderr": 0.016095884155386847, + "mc2": 0.4566539974272441, + "mc2_stderr": 0.01564839684478776 + }, + "harness|winogrande|5": { + "acc": 0.7237569060773481, + "acc_stderr": 0.012566815015698158 + }, + "harness|gsm8k|5": { + "acc": 0.19484457922668688, + "acc_stderr": 0.010910039409578768 + }, + "all": { + "acc": 0.4851611654629593, + "acc_stderr": 0.03426525263666884, + "acc_norm": 0.4898062511997487, + "acc_norm_stderr": 0.03501503528186299, + "mc1": 0.30354957160342716, + "mc1_stderr": 0.016095884155386847, + "mc2": 0.4566539974272441, + "mc2_stderr": 0.01564839684478776 + } + }, + "versions": { + "all": 0, + "harness|arc:challenge|25": 0, + "harness|gsm8k|5": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + 
"harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "harness|winogrande|5": 0 + }, + "config_tasks": { + "harness|arc:challenge": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + 
"harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|arc:challenge|25": { + "hashes": { + "hash_examples": "17b0cae357c0259e", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "ca48d52265c0051f", + "hash_cont_tokens": "e8abf848493b50f7" + }, + "truncated": 0, + "non_truncated": 1172, + "padded": 4687, + "non_padded": 0, + "effective_few_shots": 25.0, + "num_truncated_few_shots": 0 + }, + "harness|hellaswag|10": { + "hashes": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "4975ded0ed31f702", + "hash_cont_tokens": "9fe0a5c42e1532db" + }, + "truncated": 0, + "non_truncated": 10042, + "padded": 40019, + "non_padded": 149, + "effective_few_shots": 10.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hashes": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "8ff523ec326d5d55", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-anatomy|5": { + "hashes": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "742bd6a389a8ef40", + "hash_cont_tokens": "f11971a765cb609f" + }, + "truncated": 0, + "non_truncated": 135, + "padded": 540, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-astronomy|5": { + "hashes": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + 
"hash_input_tokens": "aa9743839c83bd9f", + "hash_cont_tokens": "440a970fadecdc7b" + }, + "truncated": 0, + "non_truncated": 152, + "padded": 608, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-business_ethics|5": { + "hashes": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "60f6ed52e2a2987a", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hashes": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "6080d9f3c5930be0", + "hash_cont_tokens": "7ecd60c25b9bfe5b" + }, + "truncated": 0, + "non_truncated": 265, + "padded": 1060, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_biology|5": { + "hashes": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "873319724ad65589", + "hash_cont_tokens": "875cde3af7a0ee14" + }, + "truncated": 0, + "non_truncated": 144, + "padded": 564, + "non_padded": 12, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_chemistry|5": { + "hashes": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "8366d04d12b154a7", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_computer_science|5": { + "hashes": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "1724a282fb269fd7", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_mathematics|5": { + "hashes": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "b7aa815781eae172", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_medicine|5": { + "hashes": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "0003d13e86bc8c1a", + "hash_cont_tokens": "702fb6d82ff0d6ac" + }, + "truncated": 0, + "non_truncated": 173, + "padded": 692, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_physics|5": { + "hashes": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "32b28762dd077c78", + "hash_cont_tokens": "f7b8097afc16a47c" + }, + "truncated": 0, + "non_truncated": 102, + "padded": 404, + "non_padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-computer_security|5": { + "hashes": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "19dd0e1895125d49", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + 
"non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hashes": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "761c7ce187b3338a", + "hash_cont_tokens": "aa0e8bc655f2f641" + }, + "truncated": 0, + "non_truncated": 235, + "padded": 940, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-econometrics|5": { + "hashes": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "dae74024ebc12b2b", + "hash_cont_tokens": "b1cc6e7e9fcd3827" + }, + "truncated": 0, + "non_truncated": 114, + "padded": 456, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hashes": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "5fa8050688a246ed", + "hash_cont_tokens": "2425a3f084a591ef" + }, + "truncated": 0, + "non_truncated": 145, + "padded": 580, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hashes": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "2da3f8d7d1515cc6", + "hash_cont_tokens": "bd87bf0c060fd925" + }, + "truncated": 0, + "non_truncated": 378, + "padded": 1512, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-formal_logic|5": { + "hashes": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "907de61bbe46dada", + "hash_cont_tokens": "eb8932890e0605db" + }, + "truncated": 0, + "non_truncated": 126, + "padded": 504, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-global_facts|5": { + "hashes": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "d7549fe9ac133643", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_biology|5": { + "hashes": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "b449ae8cd622fb96", + "hash_cont_tokens": "1ddcb86d28cde266" + }, + "truncated": 0, + "non_truncated": 310, + "padded": 1240, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hashes": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "a447bd1574b5e26c", + "hash_cont_tokens": "176c8dcff38c5f8f" + }, + "truncated": 0, + "non_truncated": 203, + "padded": 812, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hashes": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "56312a0c3d85ae90", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + 
"harness|hendrycksTest-high_school_european_history|5": { + "hashes": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "5002f4ac8b1562ca", + "hash_cont_tokens": "674fc454bdc5ac93" + }, + "truncated": 0, + "non_truncated": 165, + "padded": 656, + "non_padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_geography|5": { + "hashes": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "b4f9efd054b0149d", + "hash_cont_tokens": "03a5012b916274ea" + }, + "truncated": 0, + "non_truncated": 198, + "padded": 792, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hashes": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "6e010d01707b5a01", + "hash_cont_tokens": "873d2aab226ba1d8" + }, + "truncated": 0, + "non_truncated": 193, + "padded": 772, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hashes": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "fc1f6e824ba386d7", + "hash_cont_tokens": "c583432ad27fcfe0" + }, + "truncated": 0, + "non_truncated": 390, + "padded": 1560, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hashes": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "3a485a40c8432ece", + "hash_cont_tokens": "d7907b61bcb8c123" + }, + "truncated": 0, + "non_truncated": 270, + "padded": 1080, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hashes": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "a7dd9ca4bbda3752", + "hash_cont_tokens": "f47f041de50333b9" + }, + "truncated": 0, + "non_truncated": 238, + "padded": 952, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_physics|5": { + "hashes": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "d7ea631399a73865", + "hash_cont_tokens": "0d56317b3e5eedb5" + }, + "truncated": 0, + "non_truncated": 151, + "padded": 604, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hashes": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "d12816cf88146011", + "hash_cont_tokens": "09ba1243e7390c0f" + }, + "truncated": 0, + "non_truncated": 545, + "padded": 2180, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hashes": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "9763ecaef4814c21", + "hash_cont_tokens": "9cc29889c3d3f77d" + }, + "truncated": 0, + "non_truncated": 216, + "padded": 864, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hashes": { + 
"hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "c639cce12a46ebad", + "hash_cont_tokens": "cdd0b3dc06d933e5" + }, + "truncated": 0, + "non_truncated": 204, + "padded": 816, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hashes": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "b9762065cce6f3a6", + "hash_cont_tokens": "e02816433ff28daf" + }, + "truncated": 0, + "non_truncated": 237, + "padded": 948, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_aging|5": { + "hashes": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "84157fee0b6d0f3c", + "hash_cont_tokens": "142a4a8a1138a214" + }, + "truncated": 0, + "non_truncated": 223, + "padded": 892, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_sexuality|5": { + "hashes": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "ade303e1ae3c016f", + "hash_cont_tokens": "bc54813e809b796d" + }, + "truncated": 0, + "non_truncated": 131, + "padded": 524, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-international_law|5": { + "hashes": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "e5482e1c23c23d35", + "hash_cont_tokens": "8ea8c5ff76a15bca" + }, + "truncated": 0, + "non_truncated": 121, + "padded": 484, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-jurisprudence|5": { + "hashes": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "4415eeb9bad0507b", + "hash_cont_tokens": "e3a8cd951b6e3469" + }, + "truncated": 0, + "non_truncated": 108, + "padded": 432, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hashes": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "e6b5271422ecbaa8", + "hash_cont_tokens": "3e9e0bdc248fd88a" + }, + "truncated": 0, + "non_truncated": 163, + "padded": 644, + "non_padded": 8, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-machine_learning|5": { + "hashes": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "e719cb83196977d8", + "hash_cont_tokens": "55b12fb138c6a064" + }, + "truncated": 0, + "non_truncated": 112, + "padded": 448, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-management|5": { + "hashes": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "155da0e62b39e804", + "hash_cont_tokens": "a01d6d39a83c4597" + }, + "truncated": 0, + "non_truncated": 103, + "padded": 412, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-marketing|5": { + "hashes": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "38466c242259e6d3", + "hash_cont_tokens": "6aeaed4d823c98aa" + }, + 
"truncated": 0, + "non_truncated": 234, + "padded": 932, + "non_padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-medical_genetics|5": { + "hashes": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "0dd129e92538a7f6", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-miscellaneous|5": { + "hashes": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "d108a883fc3e022f", + "hash_cont_tokens": "9b0ab02a64603081" + }, + "truncated": 0, + "non_truncated": 783, + "padded": 3132, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_disputes|5": { + "hashes": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "0e7b7df82884a2d5", + "hash_cont_tokens": "3b8bbe9108e55ce9" + }, + "truncated": 0, + "non_truncated": 346, + "padded": 1364, + "non_padded": 20, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hashes": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "7c220f5613cd8426", + "hash_cont_tokens": "3e9bfc0362e97330" + }, + "truncated": 0, + "non_truncated": 895, + "padded": 3580, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-nutrition|5": { + "hashes": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "35de1609a9a763a9", + "hash_cont_tokens": "23b2dc6ee2da4cfc" + }, + "truncated": 0, + "non_truncated": 306, + "padded": 1224, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-philosophy|5": { + "hashes": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "a1dcfa9c80490d06", + "hash_cont_tokens": "9f6ff69d23a48783" + }, + "truncated": 0, + "non_truncated": 311, + "padded": 1244, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-prehistory|5": { + "hashes": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "a091cf645d2415e0", + "hash_cont_tokens": "d6458d743d875837" + }, + "truncated": 0, + "non_truncated": 324, + "padded": 1296, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_accounting|5": { + "hashes": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "e9df32a33f85290c", + "hash_cont_tokens": "922a195f53a35662" + }, + "truncated": 0, + "non_truncated": 282, + "padded": 1128, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_law|5": { + "hashes": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "c9f7583fff66d361", + "hash_cont_tokens": "2e590029ef41fbcd" + }, + "truncated": 0, + "non_truncated": 1534, + "padded": 6136, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + 
"harness|hendrycksTest-professional_medicine|5": { + "hashes": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "40a933f829116f8d", + "hash_cont_tokens": "7cfee54dbddd5a98" + }, + "truncated": 0, + "non_truncated": 272, + "padded": 1088, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_psychology|5": { + "hashes": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "0f6a92c3a2062b48", + "hash_cont_tokens": "a86677b2a45c20e1" + }, + "truncated": 0, + "non_truncated": 612, + "padded": 2448, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-public_relations|5": { + "hashes": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "29a08e9bfbe9b2f0", + "hash_cont_tokens": "0d756ccaae031757" + }, + "truncated": 0, + "non_truncated": 110, + "padded": 440, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-security_studies|5": { + "hashes": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "32a03f1f22a6e103", + "hash_cont_tokens": "b2229bc2cfbf594b" + }, + "truncated": 0, + "non_truncated": 245, + "padded": 980, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-sociology|5": { + "hashes": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "1de5c52d2b2831d7", + "hash_cont_tokens": "c3a3bdfd177eed5b" + }, + "truncated": 0, + "non_truncated": 201, + "padded": 800, + "non_padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hashes": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "add924961f7f4146", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-virology|5": { + "hashes": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "e0653601c466b1bc", + "hash_cont_tokens": "af8b3658088cb37f" + }, + "truncated": 0, + "non_truncated": 166, + "padded": 664, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-world_religions|5": { + "hashes": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "ac600d612445156d", + "hash_cont_tokens": "060118bef6de4e0a" + }, + "truncated": 0, + "non_truncated": 171, + "padded": 684, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|truthfulqa:mc|0": { + "hashes": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "a03ce28b7fd06aa7", + "hash_cont_tokens": "f5da56a132aab151" + }, + "truncated": 0, + "non_truncated": 817, + "padded": 9996, + "non_padded": 0, + "effective_few_shots": 0.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "72067255e368e24e", + 
"hash_cont_tokens": "f08975ad6f2d5864" + }, + "truncated": 0, + "non_truncated": 1267, + "padded": 2534, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "bda342e47b5099b2", + "hash_cont_tokens": "5525e387f80c8366" + }, + "truncated": 0, + "non_truncated": 1319, + "padded": 0, + "non_padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "3b7fa57a057f9415", + "hash_full_prompts": "63615fc50fc9417c", + "hash_input_tokens": "a8fa53915153e1db", + "hash_cont_tokens": "11a54eb9dba9ed57" + }, + "truncated": 0, + "non_truncated": 28659, + "padded": 113348, + "non_padded": 1524, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/Korabbit/Llama-2-7b-chat-hf-afr-100step-v2/results_2023-11-23T18-49-40.471713.json b/eval-results/Korabbit/Llama-2-7b-chat-hf-afr-100step-v2/results_2023-11-23T18-49-40.471713.json new file mode 100644 index 0000000000000000000000000000000000000000..3abe9244bd10f0fe15773bd6a65f8ad4245cb24f --- /dev/null +++ b/eval-results/Korabbit/Llama-2-7b-chat-hf-afr-100step-v2/results_2023-11-23T18-49-40.471713.json @@ -0,0 +1,1435 @@ +{ + "config_general": { + "lighteval_sha": "9ffc410f6c40b8cfefe7167cb47aefe69ced61e1", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "", + "start_time": 530299.248459213, + "end_time": 543281.156336294, + "total_evaluation_time_secondes": "12981.907877081074", + "model_name": "Korabbit/Llama-2-7b-chat-hf-afr-100step-v2", + "model_sha": "4ee3182f614473f9ea3b6e429b01872bc90e89f1", + "model_dtype": "torch.float16", + "model_size": "12.61 GB" + }, + "results": { + "harness|arc:challenge|25": { + "acc": 0.492320819112628, + "acc_stderr": 0.01460966744089257, + "acc_norm": 0.5264505119453925, + "acc_norm_stderr": 0.014590931358120167 + }, + "harness|hellaswag|10": { + "acc": 0.5955984863572994, + "acc_stderr": 0.004897728370737241, + "acc_norm": 0.782513443537144, + "acc_norm_stderr": 0.0041169313831573495 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.28, + "acc_stderr": 0.04512608598542129, + "acc_norm": 0.28, + "acc_norm_stderr": 0.04512608598542129 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.42962962962962964, + "acc_stderr": 0.04276349494376599, + "acc_norm": 0.42962962962962964, + "acc_norm_stderr": 0.04276349494376599 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.47368421052631576, + "acc_stderr": 0.04063302731486671, + "acc_norm": 0.47368421052631576, + "acc_norm_stderr": 0.04063302731486671 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.53, + "acc_stderr": 0.050161355804659205, + "acc_norm": 0.53, + "acc_norm_stderr": 0.050161355804659205 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.5471698113207547, + "acc_stderr": 0.03063562795796182, + "acc_norm": 0.5471698113207547, + "acc_norm_stderr": 0.03063562795796182 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.5208333333333334, + "acc_stderr": 0.041775789507399935, + "acc_norm": 0.5208333333333334, + "acc_norm_stderr": 0.041775789507399935 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.28, + "acc_stderr": 0.04512608598542127, + "acc_norm": 0.28, + "acc_norm_stderr": 0.04512608598542127 + }, + 
"harness|hendrycksTest-college_computer_science|5": { + "acc": 0.39, + "acc_stderr": 0.04902071300001975, + "acc_norm": 0.39, + "acc_norm_stderr": 0.04902071300001975 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.36, + "acc_stderr": 0.04824181513244218, + "acc_norm": 0.36, + "acc_norm_stderr": 0.04824181513244218 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.4046242774566474, + "acc_stderr": 0.03742461193887248, + "acc_norm": 0.4046242774566474, + "acc_norm_stderr": 0.03742461193887248 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.22549019607843138, + "acc_stderr": 0.041583075330832865, + "acc_norm": 0.22549019607843138, + "acc_norm_stderr": 0.041583075330832865 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.58, + "acc_stderr": 0.049604496374885836, + "acc_norm": 0.58, + "acc_norm_stderr": 0.049604496374885836 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.4085106382978723, + "acc_stderr": 0.03213418026701576, + "acc_norm": 0.4085106382978723, + "acc_norm_stderr": 0.03213418026701576 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.37719298245614036, + "acc_stderr": 0.045595221419582166, + "acc_norm": 0.37719298245614036, + "acc_norm_stderr": 0.045595221419582166 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.503448275862069, + "acc_stderr": 0.041665675771015785, + "acc_norm": 0.503448275862069, + "acc_norm_stderr": 0.041665675771015785 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.30158730158730157, + "acc_stderr": 0.023636975996101806, + "acc_norm": 0.30158730158730157, + "acc_norm_stderr": 0.023636975996101806 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.24603174603174602, + "acc_stderr": 0.03852273364924314, + "acc_norm": 0.24603174603174602, + "acc_norm_stderr": 0.03852273364924314 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.36, + "acc_stderr": 0.048241815132442176, + "acc_norm": 0.36, + "acc_norm_stderr": 0.048241815132442176 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.5225806451612903, + "acc_stderr": 0.02841498501970786, + "acc_norm": 0.5225806451612903, + "acc_norm_stderr": 0.02841498501970786 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.35960591133004927, + "acc_stderr": 0.033764582465095665, + "acc_norm": 0.35960591133004927, + "acc_norm_stderr": 0.033764582465095665 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.43, + "acc_stderr": 0.049756985195624284, + "acc_norm": 0.43, + "acc_norm_stderr": 0.049756985195624284 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.5878787878787879, + "acc_stderr": 0.03843566993588717, + "acc_norm": 0.5878787878787879, + "acc_norm_stderr": 0.03843566993588717 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.6060606060606061, + "acc_stderr": 0.034812853382329624, + "acc_norm": 0.6060606060606061, + "acc_norm_stderr": 0.034812853382329624 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.7150259067357513, + "acc_stderr": 0.032577140777096614, + "acc_norm": 0.7150259067357513, + "acc_norm_stderr": 0.032577140777096614 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.4256410256410256, + "acc_stderr": 0.02506909438729654, + "acc_norm": 0.4256410256410256, + "acc_norm_stderr": 0.02506909438729654 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.26296296296296295, + 
"acc_stderr": 0.02684205787383371, + "acc_norm": 0.26296296296296295, + "acc_norm_stderr": 0.02684205787383371 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.42016806722689076, + "acc_stderr": 0.03206183783236152, + "acc_norm": 0.42016806722689076, + "acc_norm_stderr": 0.03206183783236152 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.2847682119205298, + "acc_stderr": 0.036848815213890225, + "acc_norm": 0.2847682119205298, + "acc_norm_stderr": 0.036848815213890225 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.6788990825688074, + "acc_stderr": 0.02001814977273375, + "acc_norm": 0.6788990825688074, + "acc_norm_stderr": 0.02001814977273375 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.3333333333333333, + "acc_stderr": 0.0321495214780275, + "acc_norm": 0.3333333333333333, + "acc_norm_stderr": 0.0321495214780275 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.6666666666666666, + "acc_stderr": 0.03308611113236434, + "acc_norm": 0.6666666666666666, + "acc_norm_stderr": 0.03308611113236434 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.6751054852320675, + "acc_stderr": 0.03048603938910529, + "acc_norm": 0.6751054852320675, + "acc_norm_stderr": 0.03048603938910529 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.5695067264573991, + "acc_stderr": 0.033231973029429394, + "acc_norm": 0.5695067264573991, + "acc_norm_stderr": 0.033231973029429394 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.5725190839694656, + "acc_stderr": 0.04338920305792401, + "acc_norm": 0.5725190839694656, + "acc_norm_stderr": 0.04338920305792401 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.6363636363636364, + "acc_stderr": 0.043913262867240704, + "acc_norm": 0.6363636363636364, + "acc_norm_stderr": 0.043913262867240704 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.5925925925925926, + "acc_stderr": 0.04750077341199984, + "acc_norm": 0.5925925925925926, + "acc_norm_stderr": 0.04750077341199984 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.558282208588957, + "acc_stderr": 0.03901591825836184, + "acc_norm": 0.558282208588957, + "acc_norm_stderr": 0.03901591825836184 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.3125, + "acc_stderr": 0.043994650575715215, + "acc_norm": 0.3125, + "acc_norm_stderr": 0.043994650575715215 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.6699029126213593, + "acc_stderr": 0.04656147110012351, + "acc_norm": 0.6699029126213593, + "acc_norm_stderr": 0.04656147110012351 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.717948717948718, + "acc_stderr": 0.029480360549541194, + "acc_norm": 0.717948717948718, + "acc_norm_stderr": 0.029480360549541194 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.49, + "acc_stderr": 0.05024183937956911, + "acc_norm": 0.49, + "acc_norm_stderr": 0.05024183937956911 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.6781609195402298, + "acc_stderr": 0.0167063814150579, + "acc_norm": 0.6781609195402298, + "acc_norm_stderr": 0.0167063814150579 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.5144508670520231, + "acc_stderr": 0.026907849856282542, + "acc_norm": 0.5144508670520231, + "acc_norm_stderr": 0.026907849856282542 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.2212290502793296, + "acc_stderr": 0.013882164598887275, + "acc_norm": 0.2212290502793296, + "acc_norm_stderr": 
0.013882164598887275 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.5163398692810458, + "acc_stderr": 0.02861462475280544, + "acc_norm": 0.5163398692810458, + "acc_norm_stderr": 0.02861462475280544 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.5755627009646302, + "acc_stderr": 0.028071928247946215, + "acc_norm": 0.5755627009646302, + "acc_norm_stderr": 0.028071928247946215 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.5679012345679012, + "acc_stderr": 0.027563010971606676, + "acc_norm": 0.5679012345679012, + "acc_norm_stderr": 0.027563010971606676 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.3723404255319149, + "acc_stderr": 0.028838921471251458, + "acc_norm": 0.3723404255319149, + "acc_norm_stderr": 0.028838921471251458 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.3494132985658409, + "acc_stderr": 0.012177306252786686, + "acc_norm": 0.3494132985658409, + "acc_norm_stderr": 0.012177306252786686 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.45588235294117646, + "acc_stderr": 0.03025437257397668, + "acc_norm": 0.45588235294117646, + "acc_norm_stderr": 0.03025437257397668 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.4852941176470588, + "acc_stderr": 0.020219083895133924, + "acc_norm": 0.4852941176470588, + "acc_norm_stderr": 0.020219083895133924 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.5363636363636364, + "acc_stderr": 0.04776449162396197, + "acc_norm": 0.5363636363636364, + "acc_norm_stderr": 0.04776449162396197 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.5265306122448979, + "acc_stderr": 0.03196412734523272, + "acc_norm": 0.5265306122448979, + "acc_norm_stderr": 0.03196412734523272 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.6467661691542289, + "acc_stderr": 0.03379790611796777, + "acc_norm": 0.6467661691542289, + "acc_norm_stderr": 0.03379790611796777 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.72, + "acc_stderr": 0.045126085985421276, + "acc_norm": 0.72, + "acc_norm_stderr": 0.045126085985421276 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.43373493975903615, + "acc_stderr": 0.03858158940685517, + "acc_norm": 0.43373493975903615, + "acc_norm_stderr": 0.03858158940685517 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.7192982456140351, + "acc_stderr": 0.03446296217088427, + "acc_norm": 0.7192982456140351, + "acc_norm_stderr": 0.03446296217088427 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.2974296205630355, + "mc1_stderr": 0.016002651487361005, + "mc2": 0.4518194385088943, + "mc2_stderr": 0.01565368058265292 + }, + "harness|winogrande|5": { + "acc": 0.7229676400947119, + "acc_stderr": 0.012577891015342412 + }, + "harness|drop|3": { + "em": 0.04016359060402685, + "em_stderr": 0.002010733562468151, + "f1": 0.10108536073825498, + "f1_stderr": 0.0024087765856211545 + }, + "harness|gsm8k|5": { + "acc": 0.08491281273692192, + "acc_stderr": 0.007678212824450797 + }, + "all": { + "acc": 0.4839603798631096, + "acc_stderr": 0.034233481703847386, + "acc_norm": 0.4904194469293332, + "acc_norm_stderr": 0.0350370635088906, + "mc1": 0.2974296205630355, + "mc1_stderr": 0.016002651487361005, + "mc2": 0.4518194385088943, + "mc2_stderr": 0.01565368058265292, + "em": 0.04016359060402685, + "em_stderr": 0.002010733562468151, + "f1": 0.10108536073825498, + "f1_stderr": 0.0024087765856211545 + } + }, + "versions": { + "all": 0, + "harness|arc:challenge|25": 0, + "harness|drop|3": 1, + 
"harness|gsm8k|5": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "harness|winogrande|5": 0 + }, + "config_tasks": { + "harness|arc:challenge": "LM Harness task", + "harness|drop": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + 
"harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|arc:challenge|25": { + "hashes": { + "hash_examples": "17b0cae357c0259e", + "hash_full_prompts": "045cbb916e5145c6", + 
"hash_input_tokens": "ca48d52265c0051f", + "hash_cont_tokens": "e8abf848493b50f7" + }, + "truncated": 0, + "non_truncated": 1172, + "padded": 4687, + "non_padded": 0, + "effective_few_shots": 25.0, + "num_truncated_few_shots": 0 + }, + "harness|hellaswag|10": { + "hashes": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "4975ded0ed31f702", + "hash_cont_tokens": "9fe0a5c42e1532db" + }, + "truncated": 0, + "non_truncated": 10042, + "padded": 40019, + "non_padded": 149, + "effective_few_shots": 10.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hashes": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "8ff523ec326d5d55", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-anatomy|5": { + "hashes": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "742bd6a389a8ef40", + "hash_cont_tokens": "f11971a765cb609f" + }, + "truncated": 0, + "non_truncated": 135, + "padded": 540, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-astronomy|5": { + "hashes": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "aa9743839c83bd9f", + "hash_cont_tokens": "440a970fadecdc7b" + }, + "truncated": 0, + "non_truncated": 152, + "padded": 608, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-business_ethics|5": { + "hashes": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "60f6ed52e2a2987a", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hashes": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "6080d9f3c5930be0", + "hash_cont_tokens": "7ecd60c25b9bfe5b" + }, + "truncated": 0, + "non_truncated": 265, + "padded": 1060, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_biology|5": { + "hashes": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "873319724ad65589", + "hash_cont_tokens": "875cde3af7a0ee14" + }, + "truncated": 0, + "non_truncated": 144, + "padded": 564, + "non_padded": 12, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_chemistry|5": { + "hashes": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "8366d04d12b154a7", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_computer_science|5": { + "hashes": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "1724a282fb269fd7", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + 
"effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_mathematics|5": { + "hashes": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "b7aa815781eae172", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_medicine|5": { + "hashes": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "0003d13e86bc8c1a", + "hash_cont_tokens": "702fb6d82ff0d6ac" + }, + "truncated": 0, + "non_truncated": 173, + "padded": 692, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_physics|5": { + "hashes": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "32b28762dd077c78", + "hash_cont_tokens": "f7b8097afc16a47c" + }, + "truncated": 0, + "non_truncated": 102, + "padded": 404, + "non_padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-computer_security|5": { + "hashes": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "19dd0e1895125d49", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hashes": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "761c7ce187b3338a", + "hash_cont_tokens": "aa0e8bc655f2f641" + }, + "truncated": 0, + "non_truncated": 235, + "padded": 940, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-econometrics|5": { + "hashes": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "dae74024ebc12b2b", + "hash_cont_tokens": "b1cc6e7e9fcd3827" + }, + "truncated": 0, + "non_truncated": 114, + "padded": 456, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hashes": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "5fa8050688a246ed", + "hash_cont_tokens": "2425a3f084a591ef" + }, + "truncated": 0, + "non_truncated": 145, + "padded": 580, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hashes": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "2da3f8d7d1515cc6", + "hash_cont_tokens": "bd87bf0c060fd925" + }, + "truncated": 0, + "non_truncated": 378, + "padded": 1512, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-formal_logic|5": { + "hashes": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "907de61bbe46dada", + "hash_cont_tokens": "eb8932890e0605db" + }, + "truncated": 0, + "non_truncated": 126, + "padded": 504, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-global_facts|5": { + "hashes": { + "hash_examples": 
"371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "d7549fe9ac133643", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_biology|5": { + "hashes": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "b449ae8cd622fb96", + "hash_cont_tokens": "1ddcb86d28cde266" + }, + "truncated": 0, + "non_truncated": 310, + "padded": 1240, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hashes": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "a447bd1574b5e26c", + "hash_cont_tokens": "176c8dcff38c5f8f" + }, + "truncated": 0, + "non_truncated": 203, + "padded": 812, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hashes": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "56312a0c3d85ae90", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hashes": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "5002f4ac8b1562ca", + "hash_cont_tokens": "674fc454bdc5ac93" + }, + "truncated": 0, + "non_truncated": 165, + "padded": 656, + "non_padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_geography|5": { + "hashes": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "b4f9efd054b0149d", + "hash_cont_tokens": "03a5012b916274ea" + }, + "truncated": 0, + "non_truncated": 198, + "padded": 792, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hashes": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "6e010d01707b5a01", + "hash_cont_tokens": "873d2aab226ba1d8" + }, + "truncated": 0, + "non_truncated": 193, + "padded": 772, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hashes": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "fc1f6e824ba386d7", + "hash_cont_tokens": "c583432ad27fcfe0" + }, + "truncated": 0, + "non_truncated": 390, + "padded": 1560, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hashes": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "3a485a40c8432ece", + "hash_cont_tokens": "d7907b61bcb8c123" + }, + "truncated": 0, + "non_truncated": 270, + "padded": 1080, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hashes": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + 
"hash_input_tokens": "a7dd9ca4bbda3752", + "hash_cont_tokens": "f47f041de50333b9" + }, + "truncated": 0, + "non_truncated": 238, + "padded": 952, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_physics|5": { + "hashes": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "d7ea631399a73865", + "hash_cont_tokens": "0d56317b3e5eedb5" + }, + "truncated": 0, + "non_truncated": 151, + "padded": 604, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hashes": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "d12816cf88146011", + "hash_cont_tokens": "09ba1243e7390c0f" + }, + "truncated": 0, + "non_truncated": 545, + "padded": 2180, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hashes": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "9763ecaef4814c21", + "hash_cont_tokens": "9cc29889c3d3f77d" + }, + "truncated": 0, + "non_truncated": 216, + "padded": 864, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hashes": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "c639cce12a46ebad", + "hash_cont_tokens": "cdd0b3dc06d933e5" + }, + "truncated": 0, + "non_truncated": 204, + "padded": 816, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hashes": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "b9762065cce6f3a6", + "hash_cont_tokens": "e02816433ff28daf" + }, + "truncated": 0, + "non_truncated": 237, + "padded": 948, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_aging|5": { + "hashes": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "84157fee0b6d0f3c", + "hash_cont_tokens": "142a4a8a1138a214" + }, + "truncated": 0, + "non_truncated": 223, + "padded": 892, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_sexuality|5": { + "hashes": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "ade303e1ae3c016f", + "hash_cont_tokens": "bc54813e809b796d" + }, + "truncated": 0, + "non_truncated": 131, + "padded": 524, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-international_law|5": { + "hashes": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "e5482e1c23c23d35", + "hash_cont_tokens": "8ea8c5ff76a15bca" + }, + "truncated": 0, + "non_truncated": 121, + "padded": 484, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-jurisprudence|5": { + "hashes": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "4415eeb9bad0507b", + "hash_cont_tokens": "e3a8cd951b6e3469" + }, + "truncated": 0, + "non_truncated": 108, + 
"padded": 432, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hashes": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "e6b5271422ecbaa8", + "hash_cont_tokens": "3e9e0bdc248fd88a" + }, + "truncated": 0, + "non_truncated": 163, + "padded": 644, + "non_padded": 8, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-machine_learning|5": { + "hashes": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "e719cb83196977d8", + "hash_cont_tokens": "55b12fb138c6a064" + }, + "truncated": 0, + "non_truncated": 112, + "padded": 448, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-management|5": { + "hashes": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "155da0e62b39e804", + "hash_cont_tokens": "a01d6d39a83c4597" + }, + "truncated": 0, + "non_truncated": 103, + "padded": 412, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-marketing|5": { + "hashes": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "38466c242259e6d3", + "hash_cont_tokens": "6aeaed4d823c98aa" + }, + "truncated": 0, + "non_truncated": 234, + "padded": 932, + "non_padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-medical_genetics|5": { + "hashes": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "0dd129e92538a7f6", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-miscellaneous|5": { + "hashes": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "d108a883fc3e022f", + "hash_cont_tokens": "9b0ab02a64603081" + }, + "truncated": 0, + "non_truncated": 783, + "padded": 3132, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_disputes|5": { + "hashes": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "0e7b7df82884a2d5", + "hash_cont_tokens": "3b8bbe9108e55ce9" + }, + "truncated": 0, + "non_truncated": 346, + "padded": 1364, + "non_padded": 20, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hashes": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "7c220f5613cd8426", + "hash_cont_tokens": "3e9bfc0362e97330" + }, + "truncated": 0, + "non_truncated": 895, + "padded": 3580, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-nutrition|5": { + "hashes": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "35de1609a9a763a9", + "hash_cont_tokens": "23b2dc6ee2da4cfc" + }, + "truncated": 0, + "non_truncated": 306, + "padded": 1224, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-philosophy|5": { + "hashes": { + "hash_examples": 
"9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "a1dcfa9c80490d06", + "hash_cont_tokens": "9f6ff69d23a48783" + }, + "truncated": 0, + "non_truncated": 311, + "padded": 1244, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-prehistory|5": { + "hashes": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "a091cf645d2415e0", + "hash_cont_tokens": "d6458d743d875837" + }, + "truncated": 0, + "non_truncated": 324, + "padded": 1296, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_accounting|5": { + "hashes": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "e9df32a33f85290c", + "hash_cont_tokens": "922a195f53a35662" + }, + "truncated": 0, + "non_truncated": 282, + "padded": 1128, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_law|5": { + "hashes": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "c9f7583fff66d361", + "hash_cont_tokens": "2e590029ef41fbcd" + }, + "truncated": 0, + "non_truncated": 1534, + "padded": 6136, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_medicine|5": { + "hashes": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "40a933f829116f8d", + "hash_cont_tokens": "7cfee54dbddd5a98" + }, + "truncated": 0, + "non_truncated": 272, + "padded": 1088, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_psychology|5": { + "hashes": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "0f6a92c3a2062b48", + "hash_cont_tokens": "a86677b2a45c20e1" + }, + "truncated": 0, + "non_truncated": 612, + "padded": 2448, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-public_relations|5": { + "hashes": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "29a08e9bfbe9b2f0", + "hash_cont_tokens": "0d756ccaae031757" + }, + "truncated": 0, + "non_truncated": 110, + "padded": 440, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-security_studies|5": { + "hashes": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "32a03f1f22a6e103", + "hash_cont_tokens": "b2229bc2cfbf594b" + }, + "truncated": 0, + "non_truncated": 245, + "padded": 980, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-sociology|5": { + "hashes": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "1de5c52d2b2831d7", + "hash_cont_tokens": "c3a3bdfd177eed5b" + }, + "truncated": 0, + "non_truncated": 201, + "padded": 800, + "non_padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hashes": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "add924961f7f4146", + "hash_cont_tokens": "50421e30bef398f9" 
+ }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-virology|5": { + "hashes": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "e0653601c466b1bc", + "hash_cont_tokens": "af8b3658088cb37f" + }, + "truncated": 0, + "non_truncated": 166, + "padded": 664, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-world_religions|5": { + "hashes": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "ac600d612445156d", + "hash_cont_tokens": "060118bef6de4e0a" + }, + "truncated": 0, + "non_truncated": 171, + "padded": 684, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|truthfulqa:mc|0": { + "hashes": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "a03ce28b7fd06aa7", + "hash_cont_tokens": "f5da56a132aab151" + }, + "truncated": 0, + "non_truncated": 817, + "padded": 9996, + "non_padded": 0, + "effective_few_shots": 0.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "72067255e368e24e", + "hash_cont_tokens": "f08975ad6f2d5864" + }, + "truncated": 0, + "non_truncated": 1267, + "padded": 2534, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|drop|3": { + "hashes": { + "hash_examples": "1d27416e8324e9a3", + "hash_full_prompts": "a5513ff9a741b385", + "hash_input_tokens": "42076f0efbb50aa6", + "hash_cont_tokens": "0ff126522b306a55" + }, + "truncated": 3, + "non_truncated": 9533, + "padded": 0, + "non_padded": 9536, + "effective_few_shots": 3.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "bda342e47b5099b2", + "hash_cont_tokens": "eb63d9512d91fdd7" + }, + "truncated": 0, + "non_truncated": 1319, + "padded": 0, + "non_padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "4eb459f19fc0f29d", + "hash_full_prompts": "21653ed56f202b4e", + "hash_input_tokens": "379266f3a5365f9d", + "hash_cont_tokens": "f5d26d3684875a18" + }, + "truncated": 3, + "non_truncated": 38192, + "padded": 113348, + "non_padded": 11060, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/Korabbit/Llama-2-7b-chat-hf-afr-200step-flan-v2/results_2023-12-06T16-38-37.454092.json b/eval-results/Korabbit/Llama-2-7b-chat-hf-afr-200step-flan-v2/results_2023-12-06T16-38-37.454092.json new file mode 100644 index 0000000000000000000000000000000000000000..53420ce6f0394972df274ff081337132ef9b16b4 --- /dev/null +++ b/eval-results/Korabbit/Llama-2-7b-chat-hf-afr-200step-flan-v2/results_2023-12-06T16-38-37.454092.json @@ -0,0 +1,1409 @@ +{ + "config_general": { + "lighteval_sha": "0e4607eff593f6f842aeaa0e5fa6760f58b9d1e9", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "", + "start_time": 328832.415475407, + "end_time": 335596.50115752, + "total_evaluation_time_secondes": "6764.085682113015", + "model_name": "Korabbit/Llama-2-7b-chat-hf-afr-200step-flan-v2", + 
"model_sha": "35e4747656b719af659625092174f188584934c1", + "model_dtype": "torch.float16", + "model_size": "12.61 GB" + }, + "results": { + "harness|arc:challenge|25": { + "acc": 0.49402730375426623, + "acc_stderr": 0.014610348300255795, + "acc_norm": 0.5264505119453925, + "acc_norm_stderr": 0.014590931358120169 + }, + "harness|hellaswag|10": { + "acc": 0.5935072694682334, + "acc_stderr": 0.004901747426331731, + "acc_norm": 0.780422226648078, + "acc_norm_stderr": 0.0041311457711038875 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.29, + "acc_stderr": 0.04560480215720684, + "acc_norm": 0.29, + "acc_norm_stderr": 0.04560480215720684 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.4222222222222222, + "acc_stderr": 0.04266763404099582, + "acc_norm": 0.4222222222222222, + "acc_norm_stderr": 0.04266763404099582 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.48026315789473684, + "acc_stderr": 0.040657710025626036, + "acc_norm": 0.48026315789473684, + "acc_norm_stderr": 0.040657710025626036 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.53, + "acc_stderr": 0.050161355804659205, + "acc_norm": 0.53, + "acc_norm_stderr": 0.050161355804659205 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.5433962264150943, + "acc_stderr": 0.03065674869673943, + "acc_norm": 0.5433962264150943, + "acc_norm_stderr": 0.03065674869673943 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.5347222222222222, + "acc_stderr": 0.04171115858181618, + "acc_norm": 0.5347222222222222, + "acc_norm_stderr": 0.04171115858181618 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.28, + "acc_stderr": 0.04512608598542127, + "acc_norm": 0.28, + "acc_norm_stderr": 0.04512608598542127 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.4, + "acc_stderr": 0.049236596391733084, + "acc_norm": 0.4, + "acc_norm_stderr": 0.049236596391733084 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.38, + "acc_stderr": 0.04878317312145632, + "acc_norm": 0.38, + "acc_norm_stderr": 0.04878317312145632 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.4046242774566474, + "acc_stderr": 0.03742461193887248, + "acc_norm": 0.4046242774566474, + "acc_norm_stderr": 0.03742461193887248 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.22549019607843138, + "acc_stderr": 0.041583075330832865, + "acc_norm": 0.22549019607843138, + "acc_norm_stderr": 0.041583075330832865 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.57, + "acc_stderr": 0.04975698519562428, + "acc_norm": 0.57, + "acc_norm_stderr": 0.04975698519562428 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.4085106382978723, + "acc_stderr": 0.03213418026701576, + "acc_norm": 0.4085106382978723, + "acc_norm_stderr": 0.03213418026701576 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.3684210526315789, + "acc_stderr": 0.04537815354939392, + "acc_norm": 0.3684210526315789, + "acc_norm_stderr": 0.04537815354939392 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.5172413793103449, + "acc_stderr": 0.04164188720169375, + "acc_norm": 0.5172413793103449, + "acc_norm_stderr": 0.04164188720169375 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.29894179894179895, + "acc_stderr": 0.023577604791655802, + "acc_norm": 0.29894179894179895, + "acc_norm_stderr": 0.023577604791655802 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.24603174603174602, + "acc_stderr": 
0.03852273364924314, + "acc_norm": 0.24603174603174602, + "acc_norm_stderr": 0.03852273364924314 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.36, + "acc_stderr": 0.048241815132442176, + "acc_norm": 0.36, + "acc_norm_stderr": 0.048241815132442176 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.5290322580645161, + "acc_stderr": 0.028396016402761005, + "acc_norm": 0.5290322580645161, + "acc_norm_stderr": 0.028396016402761005 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.3694581280788177, + "acc_stderr": 0.033959703819985726, + "acc_norm": 0.3694581280788177, + "acc_norm_stderr": 0.033959703819985726 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.41, + "acc_stderr": 0.04943110704237102, + "acc_norm": 0.41, + "acc_norm_stderr": 0.04943110704237102 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.5878787878787879, + "acc_stderr": 0.03843566993588717, + "acc_norm": 0.5878787878787879, + "acc_norm_stderr": 0.03843566993588717 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.6060606060606061, + "acc_stderr": 0.03481285338232963, + "acc_norm": 0.6060606060606061, + "acc_norm_stderr": 0.03481285338232963 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.7253886010362695, + "acc_stderr": 0.03221024508041153, + "acc_norm": 0.7253886010362695, + "acc_norm_stderr": 0.03221024508041153 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.4282051282051282, + "acc_stderr": 0.025088301454694834, + "acc_norm": 0.4282051282051282, + "acc_norm_stderr": 0.025088301454694834 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.26296296296296295, + "acc_stderr": 0.02684205787383371, + "acc_norm": 0.26296296296296295, + "acc_norm_stderr": 0.02684205787383371 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.42016806722689076, + "acc_stderr": 0.03206183783236152, + "acc_norm": 0.42016806722689076, + "acc_norm_stderr": 0.03206183783236152 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.2913907284768212, + "acc_stderr": 0.03710185726119995, + "acc_norm": 0.2913907284768212, + "acc_norm_stderr": 0.03710185726119995 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.6770642201834862, + "acc_stderr": 0.02004811592341532, + "acc_norm": 0.6770642201834862, + "acc_norm_stderr": 0.02004811592341532 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.3287037037037037, + "acc_stderr": 0.032036140846700596, + "acc_norm": 0.3287037037037037, + "acc_norm_stderr": 0.032036140846700596 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.6715686274509803, + "acc_stderr": 0.03296245110172229, + "acc_norm": 0.6715686274509803, + "acc_norm_stderr": 0.03296245110172229 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.6624472573839663, + "acc_stderr": 0.030781549102026226, + "acc_norm": 0.6624472573839663, + "acc_norm_stderr": 0.030781549102026226 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.5695067264573991, + "acc_stderr": 0.033231973029429394, + "acc_norm": 0.5695067264573991, + "acc_norm_stderr": 0.033231973029429394 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.5648854961832062, + "acc_stderr": 0.04348208051644858, + "acc_norm": 0.5648854961832062, + "acc_norm_stderr": 0.04348208051644858 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.6363636363636364, + 
"acc_stderr": 0.043913262867240704, + "acc_norm": 0.6363636363636364, + "acc_norm_stderr": 0.043913262867240704 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.6018518518518519, + "acc_stderr": 0.04732332615978813, + "acc_norm": 0.6018518518518519, + "acc_norm_stderr": 0.04732332615978813 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.558282208588957, + "acc_stderr": 0.03901591825836184, + "acc_norm": 0.558282208588957, + "acc_norm_stderr": 0.03901591825836184 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.3125, + "acc_stderr": 0.043994650575715215, + "acc_norm": 0.3125, + "acc_norm_stderr": 0.043994650575715215 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.6796116504854369, + "acc_stderr": 0.04620284082280041, + "acc_norm": 0.6796116504854369, + "acc_norm_stderr": 0.04620284082280041 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.717948717948718, + "acc_stderr": 0.029480360549541194, + "acc_norm": 0.717948717948718, + "acc_norm_stderr": 0.029480360549541194 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.47, + "acc_stderr": 0.050161355804659205, + "acc_norm": 0.47, + "acc_norm_stderr": 0.050161355804659205 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.6768837803320562, + "acc_stderr": 0.016723726512343048, + "acc_norm": 0.6768837803320562, + "acc_norm_stderr": 0.016723726512343048 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.5115606936416185, + "acc_stderr": 0.02691189868637793, + "acc_norm": 0.5115606936416185, + "acc_norm_stderr": 0.02691189868637793 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.22793296089385476, + "acc_stderr": 0.014030149950805097, + "acc_norm": 0.22793296089385476, + "acc_norm_stderr": 0.014030149950805097 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.5130718954248366, + "acc_stderr": 0.028620130800700246, + "acc_norm": 0.5130718954248366, + "acc_norm_stderr": 0.028620130800700246 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.5627009646302251, + "acc_stderr": 0.02817391776176289, + "acc_norm": 0.5627009646302251, + "acc_norm_stderr": 0.02817391776176289 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.5740740740740741, + "acc_stderr": 0.027513747284379428, + "acc_norm": 0.5740740740740741, + "acc_norm_stderr": 0.027513747284379428 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.3723404255319149, + "acc_stderr": 0.028838921471251458, + "acc_norm": 0.3723404255319149, + "acc_norm_stderr": 0.028838921471251458 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.34876140808344197, + "acc_stderr": 0.01217203515712712, + "acc_norm": 0.34876140808344197, + "acc_norm_stderr": 0.01217203515712712 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.45588235294117646, + "acc_stderr": 0.03025437257397668, + "acc_norm": 0.45588235294117646, + "acc_norm_stderr": 0.03025437257397668 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.48856209150326796, + "acc_stderr": 0.02022254151561087, + "acc_norm": 0.48856209150326796, + "acc_norm_stderr": 0.02022254151561087 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.5363636363636364, + "acc_stderr": 0.04776449162396197, + "acc_norm": 0.5363636363636364, + "acc_norm_stderr": 0.04776449162396197 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.5265306122448979, + "acc_stderr": 0.03196412734523272, + "acc_norm": 0.5265306122448979, + "acc_norm_stderr": 0.03196412734523272 + 
}, + "harness|hendrycksTest-sociology|5": { + "acc": 0.6467661691542289, + "acc_stderr": 0.03379790611796777, + "acc_norm": 0.6467661691542289, + "acc_norm_stderr": 0.03379790611796777 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.72, + "acc_stderr": 0.045126085985421276, + "acc_norm": 0.72, + "acc_norm_stderr": 0.045126085985421276 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.42771084337349397, + "acc_stderr": 0.038515976837185335, + "acc_norm": 0.42771084337349397, + "acc_norm_stderr": 0.038515976837185335 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.7192982456140351, + "acc_stderr": 0.03446296217088427, + "acc_norm": 0.7192982456140351, + "acc_norm_stderr": 0.03446296217088427 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.3011015911872705, + "mc1_stderr": 0.016058999026100616, + "mc2": 0.4541594429045071, + "mc2_stderr": 0.015593505840237026 + }, + "harness|winogrande|5": { + "acc": 0.7292817679558011, + "acc_stderr": 0.012487904760626304 + }, + "harness|gsm8k|5": { + "acc": 0.18953752843062927, + "acc_stderr": 0.010795837931896377 + }, + "all": { + "acc": 0.48616292304115616, + "acc_stderr": 0.03429063635606904, + "acc_norm": 0.49078740973720447, + "acc_norm_stderr": 0.03504500095644619, + "mc1": 0.3011015911872705, + "mc1_stderr": 0.016058999026100616, + "mc2": 0.4541594429045071, + "mc2_stderr": 0.015593505840237026 + } + }, + "versions": { + "all": 0, + "harness|arc:challenge|25": 0, + "harness|gsm8k|5": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + 
"harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "harness|winogrande|5": 0 + }, + "config_tasks": { + "harness|arc:challenge": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness 
task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|arc:challenge|25": { + "hashes": { + "hash_examples": "17b0cae357c0259e", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "ca48d52265c0051f", + "hash_cont_tokens": "e8abf848493b50f7" + }, + "truncated": 0, + "non_truncated": 1172, + "padded": 4687, + "non_padded": 0, + "effective_few_shots": 25.0, + "num_truncated_few_shots": 0 + }, + "harness|hellaswag|10": { + "hashes": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "4975ded0ed31f702", + "hash_cont_tokens": "9fe0a5c42e1532db" + }, + "truncated": 0, + "non_truncated": 10042, + "padded": 40019, + "non_padded": 149, + "effective_few_shots": 10.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hashes": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "8ff523ec326d5d55", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-anatomy|5": { + "hashes": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "742bd6a389a8ef40", + "hash_cont_tokens": "f11971a765cb609f" + }, + "truncated": 0, + "non_truncated": 135, + "padded": 540, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-astronomy|5": { + "hashes": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "aa9743839c83bd9f", + "hash_cont_tokens": "440a970fadecdc7b" + }, + "truncated": 0, + "non_truncated": 152, + "padded": 608, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-business_ethics|5": { + "hashes": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "60f6ed52e2a2987a", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + 
"hashes": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "6080d9f3c5930be0", + "hash_cont_tokens": "7ecd60c25b9bfe5b" + }, + "truncated": 0, + "non_truncated": 265, + "padded": 1060, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_biology|5": { + "hashes": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "873319724ad65589", + "hash_cont_tokens": "875cde3af7a0ee14" + }, + "truncated": 0, + "non_truncated": 144, + "padded": 564, + "non_padded": 12, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_chemistry|5": { + "hashes": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "8366d04d12b154a7", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_computer_science|5": { + "hashes": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "1724a282fb269fd7", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_mathematics|5": { + "hashes": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "b7aa815781eae172", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_medicine|5": { + "hashes": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "0003d13e86bc8c1a", + "hash_cont_tokens": "702fb6d82ff0d6ac" + }, + "truncated": 0, + "non_truncated": 173, + "padded": 692, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_physics|5": { + "hashes": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "32b28762dd077c78", + "hash_cont_tokens": "f7b8097afc16a47c" + }, + "truncated": 0, + "non_truncated": 102, + "padded": 404, + "non_padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-computer_security|5": { + "hashes": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "19dd0e1895125d49", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hashes": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "761c7ce187b3338a", + "hash_cont_tokens": "aa0e8bc655f2f641" + }, + "truncated": 0, + "non_truncated": 235, + "padded": 940, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-econometrics|5": { + "hashes": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "dae74024ebc12b2b", + 
"hash_cont_tokens": "b1cc6e7e9fcd3827" + }, + "truncated": 0, + "non_truncated": 114, + "padded": 456, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hashes": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "5fa8050688a246ed", + "hash_cont_tokens": "2425a3f084a591ef" + }, + "truncated": 0, + "non_truncated": 145, + "padded": 580, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hashes": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "2da3f8d7d1515cc6", + "hash_cont_tokens": "bd87bf0c060fd925" + }, + "truncated": 0, + "non_truncated": 378, + "padded": 1512, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-formal_logic|5": { + "hashes": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "907de61bbe46dada", + "hash_cont_tokens": "eb8932890e0605db" + }, + "truncated": 0, + "non_truncated": 126, + "padded": 504, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-global_facts|5": { + "hashes": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "d7549fe9ac133643", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_biology|5": { + "hashes": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "b449ae8cd622fb96", + "hash_cont_tokens": "1ddcb86d28cde266" + }, + "truncated": 0, + "non_truncated": 310, + "padded": 1240, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hashes": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "a447bd1574b5e26c", + "hash_cont_tokens": "176c8dcff38c5f8f" + }, + "truncated": 0, + "non_truncated": 203, + "padded": 812, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hashes": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "56312a0c3d85ae90", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hashes": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "5002f4ac8b1562ca", + "hash_cont_tokens": "674fc454bdc5ac93" + }, + "truncated": 0, + "non_truncated": 165, + "padded": 656, + "non_padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_geography|5": { + "hashes": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "b4f9efd054b0149d", + "hash_cont_tokens": "03a5012b916274ea" + }, + "truncated": 0, + "non_truncated": 198, + "padded": 792, + "non_padded": 
0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hashes": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "6e010d01707b5a01", + "hash_cont_tokens": "873d2aab226ba1d8" + }, + "truncated": 0, + "non_truncated": 193, + "padded": 772, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hashes": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "fc1f6e824ba386d7", + "hash_cont_tokens": "c583432ad27fcfe0" + }, + "truncated": 0, + "non_truncated": 390, + "padded": 1560, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hashes": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "3a485a40c8432ece", + "hash_cont_tokens": "d7907b61bcb8c123" + }, + "truncated": 0, + "non_truncated": 270, + "padded": 1080, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hashes": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "a7dd9ca4bbda3752", + "hash_cont_tokens": "f47f041de50333b9" + }, + "truncated": 0, + "non_truncated": 238, + "padded": 952, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_physics|5": { + "hashes": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "d7ea631399a73865", + "hash_cont_tokens": "0d56317b3e5eedb5" + }, + "truncated": 0, + "non_truncated": 151, + "padded": 604, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hashes": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "d12816cf88146011", + "hash_cont_tokens": "09ba1243e7390c0f" + }, + "truncated": 0, + "non_truncated": 545, + "padded": 2180, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hashes": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "9763ecaef4814c21", + "hash_cont_tokens": "9cc29889c3d3f77d" + }, + "truncated": 0, + "non_truncated": 216, + "padded": 864, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hashes": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "c639cce12a46ebad", + "hash_cont_tokens": "cdd0b3dc06d933e5" + }, + "truncated": 0, + "non_truncated": 204, + "padded": 816, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hashes": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "b9762065cce6f3a6", + "hash_cont_tokens": "e02816433ff28daf" + }, + "truncated": 0, + "non_truncated": 237, + "padded": 948, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + 
"harness|hendrycksTest-human_aging|5": { + "hashes": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "84157fee0b6d0f3c", + "hash_cont_tokens": "142a4a8a1138a214" + }, + "truncated": 0, + "non_truncated": 223, + "padded": 892, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_sexuality|5": { + "hashes": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "ade303e1ae3c016f", + "hash_cont_tokens": "bc54813e809b796d" + }, + "truncated": 0, + "non_truncated": 131, + "padded": 524, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-international_law|5": { + "hashes": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "e5482e1c23c23d35", + "hash_cont_tokens": "8ea8c5ff76a15bca" + }, + "truncated": 0, + "non_truncated": 121, + "padded": 484, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-jurisprudence|5": { + "hashes": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "4415eeb9bad0507b", + "hash_cont_tokens": "e3a8cd951b6e3469" + }, + "truncated": 0, + "non_truncated": 108, + "padded": 432, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hashes": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "e6b5271422ecbaa8", + "hash_cont_tokens": "3e9e0bdc248fd88a" + }, + "truncated": 0, + "non_truncated": 163, + "padded": 644, + "non_padded": 8, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-machine_learning|5": { + "hashes": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "e719cb83196977d8", + "hash_cont_tokens": "55b12fb138c6a064" + }, + "truncated": 0, + "non_truncated": 112, + "padded": 448, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-management|5": { + "hashes": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "155da0e62b39e804", + "hash_cont_tokens": "a01d6d39a83c4597" + }, + "truncated": 0, + "non_truncated": 103, + "padded": 412, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-marketing|5": { + "hashes": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "38466c242259e6d3", + "hash_cont_tokens": "6aeaed4d823c98aa" + }, + "truncated": 0, + "non_truncated": 234, + "padded": 932, + "non_padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-medical_genetics|5": { + "hashes": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "0dd129e92538a7f6", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-miscellaneous|5": { + "hashes": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "d108a883fc3e022f", 
+ "hash_cont_tokens": "9b0ab02a64603081" + }, + "truncated": 0, + "non_truncated": 783, + "padded": 3132, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_disputes|5": { + "hashes": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "0e7b7df82884a2d5", + "hash_cont_tokens": "3b8bbe9108e55ce9" + }, + "truncated": 0, + "non_truncated": 346, + "padded": 1364, + "non_padded": 20, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hashes": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "7c220f5613cd8426", + "hash_cont_tokens": "3e9bfc0362e97330" + }, + "truncated": 0, + "non_truncated": 895, + "padded": 3580, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-nutrition|5": { + "hashes": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "35de1609a9a763a9", + "hash_cont_tokens": "23b2dc6ee2da4cfc" + }, + "truncated": 0, + "non_truncated": 306, + "padded": 1224, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-philosophy|5": { + "hashes": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "a1dcfa9c80490d06", + "hash_cont_tokens": "9f6ff69d23a48783" + }, + "truncated": 0, + "non_truncated": 311, + "padded": 1244, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-prehistory|5": { + "hashes": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "a091cf645d2415e0", + "hash_cont_tokens": "d6458d743d875837" + }, + "truncated": 0, + "non_truncated": 324, + "padded": 1296, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_accounting|5": { + "hashes": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "e9df32a33f85290c", + "hash_cont_tokens": "922a195f53a35662" + }, + "truncated": 0, + "non_truncated": 282, + "padded": 1128, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_law|5": { + "hashes": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "c9f7583fff66d361", + "hash_cont_tokens": "2e590029ef41fbcd" + }, + "truncated": 0, + "non_truncated": 1534, + "padded": 6136, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_medicine|5": { + "hashes": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "40a933f829116f8d", + "hash_cont_tokens": "7cfee54dbddd5a98" + }, + "truncated": 0, + "non_truncated": 272, + "padded": 1088, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_psychology|5": { + "hashes": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "0f6a92c3a2062b48", + "hash_cont_tokens": "a86677b2a45c20e1" + }, + "truncated": 0, + "non_truncated": 612, + "padded": 2448, + "non_padded": 0, + "effective_few_shots": 
5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-public_relations|5": { + "hashes": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "29a08e9bfbe9b2f0", + "hash_cont_tokens": "0d756ccaae031757" + }, + "truncated": 0, + "non_truncated": 110, + "padded": 440, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-security_studies|5": { + "hashes": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "32a03f1f22a6e103", + "hash_cont_tokens": "b2229bc2cfbf594b" + }, + "truncated": 0, + "non_truncated": 245, + "padded": 980, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-sociology|5": { + "hashes": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "1de5c52d2b2831d7", + "hash_cont_tokens": "c3a3bdfd177eed5b" + }, + "truncated": 0, + "non_truncated": 201, + "padded": 800, + "non_padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hashes": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "add924961f7f4146", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-virology|5": { + "hashes": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "e0653601c466b1bc", + "hash_cont_tokens": "af8b3658088cb37f" + }, + "truncated": 0, + "non_truncated": 166, + "padded": 664, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-world_religions|5": { + "hashes": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "ac600d612445156d", + "hash_cont_tokens": "060118bef6de4e0a" + }, + "truncated": 0, + "non_truncated": 171, + "padded": 684, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|truthfulqa:mc|0": { + "hashes": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "a03ce28b7fd06aa7", + "hash_cont_tokens": "f5da56a132aab151" + }, + "truncated": 0, + "non_truncated": 817, + "padded": 9996, + "non_padded": 0, + "effective_few_shots": 0.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "72067255e368e24e", + "hash_cont_tokens": "f08975ad6f2d5864" + }, + "truncated": 0, + "non_truncated": 1267, + "padded": 2534, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "bda342e47b5099b2", + "hash_cont_tokens": "6810cac395527f3e" + }, + "truncated": 0, + "non_truncated": 1319, + "padded": 0, + "non_padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "3b7fa57a057f9415", + "hash_full_prompts": "63615fc50fc9417c", + "hash_input_tokens": "a8fa53915153e1db", + "hash_cont_tokens": 
"d80698e206b50368" + }, + "truncated": 0, + "non_truncated": 28659, + "padded": 113348, + "non_padded": 1524, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/Korabbit/Llama-2-7b-chat-hf-afr-200step-flan/results_2023-12-04T15-41-52.746166.json b/eval-results/Korabbit/Llama-2-7b-chat-hf-afr-200step-flan/results_2023-12-04T15-41-52.746166.json new file mode 100644 index 0000000000000000000000000000000000000000..9d3b54a17a25f3d75730eb6e478f723e77e44c6b --- /dev/null +++ b/eval-results/Korabbit/Llama-2-7b-chat-hf-afr-200step-flan/results_2023-12-04T15-41-52.746166.json @@ -0,0 +1,1409 @@ +{ + "config_general": { + "lighteval_sha": "0e4607eff593f6f842aeaa0e5fa6760f58b9d1e9", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "", + "start_time": 152737.150808779, + "end_time": 159383.683219843, + "total_evaluation_time_secondes": "6646.532411063992", + "model_name": "Korabbit/Llama-2-7b-chat-hf-afr-200step-flan", + "model_sha": "03550d05aac147dde6d70b7b63f4a1661ecf5cb3", + "model_dtype": "torch.float16", + "model_size": "12.61 GB" + }, + "results": { + "harness|arc:challenge|25": { + "acc": 0.492320819112628, + "acc_stderr": 0.014609667440892574, + "acc_norm": 0.5247440273037542, + "acc_norm_stderr": 0.01459348769493774 + }, + "harness|hellaswag|10": { + "acc": 0.5925114519020116, + "acc_stderr": 0.004903628887264536, + "acc_norm": 0.7802230631348337, + "acc_norm_stderr": 0.0041324914757278775 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.29, + "acc_stderr": 0.04560480215720684, + "acc_norm": 0.29, + "acc_norm_stderr": 0.04560480215720684 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.4222222222222222, + "acc_stderr": 0.04266763404099582, + "acc_norm": 0.4222222222222222, + "acc_norm_stderr": 0.04266763404099582 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.46710526315789475, + "acc_stderr": 0.040601270352363966, + "acc_norm": 0.46710526315789475, + "acc_norm_stderr": 0.040601270352363966 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.53, + "acc_stderr": 0.050161355804659205, + "acc_norm": 0.53, + "acc_norm_stderr": 0.050161355804659205 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.5433962264150943, + "acc_stderr": 0.03065674869673943, + "acc_norm": 0.5433962264150943, + "acc_norm_stderr": 0.03065674869673943 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.5208333333333334, + "acc_stderr": 0.041775789507399935, + "acc_norm": 0.5208333333333334, + "acc_norm_stderr": 0.041775789507399935 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.28, + "acc_stderr": 0.04512608598542127, + "acc_norm": 0.28, + "acc_norm_stderr": 0.04512608598542127 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.4, + "acc_stderr": 0.049236596391733084, + "acc_norm": 0.4, + "acc_norm_stderr": 0.049236596391733084 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.37, + "acc_stderr": 0.04852365870939098, + "acc_norm": 0.37, + "acc_norm_stderr": 0.04852365870939098 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.4046242774566474, + "acc_stderr": 0.03742461193887248, + "acc_norm": 0.4046242774566474, + "acc_norm_stderr": 0.03742461193887248 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.22549019607843138, + "acc_stderr": 0.041583075330832865, + "acc_norm": 0.22549019607843138, + "acc_norm_stderr": 0.041583075330832865 + }, + 
"harness|hendrycksTest-computer_security|5": { + "acc": 0.57, + "acc_stderr": 0.04975698519562428, + "acc_norm": 0.57, + "acc_norm_stderr": 0.04975698519562428 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.4127659574468085, + "acc_stderr": 0.03218471141400351, + "acc_norm": 0.4127659574468085, + "acc_norm_stderr": 0.03218471141400351 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.3684210526315789, + "acc_stderr": 0.04537815354939392, + "acc_norm": 0.3684210526315789, + "acc_norm_stderr": 0.04537815354939392 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.5172413793103449, + "acc_stderr": 0.04164188720169375, + "acc_norm": 0.5172413793103449, + "acc_norm_stderr": 0.04164188720169375 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.29894179894179895, + "acc_stderr": 0.023577604791655805, + "acc_norm": 0.29894179894179895, + "acc_norm_stderr": 0.023577604791655805 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.24603174603174602, + "acc_stderr": 0.03852273364924314, + "acc_norm": 0.24603174603174602, + "acc_norm_stderr": 0.03852273364924314 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.37, + "acc_stderr": 0.048523658709391, + "acc_norm": 0.37, + "acc_norm_stderr": 0.048523658709391 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.5290322580645161, + "acc_stderr": 0.028396016402761005, + "acc_norm": 0.5290322580645161, + "acc_norm_stderr": 0.028396016402761005 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.3694581280788177, + "acc_stderr": 0.033959703819985726, + "acc_norm": 0.3694581280788177, + "acc_norm_stderr": 0.033959703819985726 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.41, + "acc_stderr": 0.04943110704237102, + "acc_norm": 0.41, + "acc_norm_stderr": 0.04943110704237102 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.5878787878787879, + "acc_stderr": 0.03843566993588717, + "acc_norm": 0.5878787878787879, + "acc_norm_stderr": 0.03843566993588717 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.6060606060606061, + "acc_stderr": 0.03481285338232963, + "acc_norm": 0.6060606060606061, + "acc_norm_stderr": 0.03481285338232963 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.7253886010362695, + "acc_stderr": 0.03221024508041153, + "acc_norm": 0.7253886010362695, + "acc_norm_stderr": 0.03221024508041153 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.4230769230769231, + "acc_stderr": 0.02504919787604234, + "acc_norm": 0.4230769230769231, + "acc_norm_stderr": 0.02504919787604234 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.26296296296296295, + "acc_stderr": 0.02684205787383371, + "acc_norm": 0.26296296296296295, + "acc_norm_stderr": 0.02684205787383371 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.42016806722689076, + "acc_stderr": 0.03206183783236152, + "acc_norm": 0.42016806722689076, + "acc_norm_stderr": 0.03206183783236152 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.2847682119205298, + "acc_stderr": 0.036848815213890225, + "acc_norm": 0.2847682119205298, + "acc_norm_stderr": 0.036848815213890225 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.6770642201834862, + "acc_stderr": 0.02004811592341532, + "acc_norm": 0.6770642201834862, + "acc_norm_stderr": 0.02004811592341532 + }, + 
"harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.3287037037037037, + "acc_stderr": 0.032036140846700596, + "acc_norm": 0.3287037037037037, + "acc_norm_stderr": 0.032036140846700596 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.6617647058823529, + "acc_stderr": 0.033205746129454324, + "acc_norm": 0.6617647058823529, + "acc_norm_stderr": 0.033205746129454324 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.6624472573839663, + "acc_stderr": 0.030781549102026226, + "acc_norm": 0.6624472573839663, + "acc_norm_stderr": 0.030781549102026226 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.5695067264573991, + "acc_stderr": 0.033231973029429394, + "acc_norm": 0.5695067264573991, + "acc_norm_stderr": 0.033231973029429394 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.5648854961832062, + "acc_stderr": 0.04348208051644858, + "acc_norm": 0.5648854961832062, + "acc_norm_stderr": 0.04348208051644858 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.6363636363636364, + "acc_stderr": 0.043913262867240704, + "acc_norm": 0.6363636363636364, + "acc_norm_stderr": 0.043913262867240704 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.6018518518518519, + "acc_stderr": 0.04732332615978813, + "acc_norm": 0.6018518518518519, + "acc_norm_stderr": 0.04732332615978813 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.5644171779141104, + "acc_stderr": 0.03895632464138937, + "acc_norm": 0.5644171779141104, + "acc_norm_stderr": 0.03895632464138937 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.3125, + "acc_stderr": 0.043994650575715215, + "acc_norm": 0.3125, + "acc_norm_stderr": 0.043994650575715215 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.6796116504854369, + "acc_stderr": 0.04620284082280041, + "acc_norm": 0.6796116504854369, + "acc_norm_stderr": 0.04620284082280041 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.7222222222222222, + "acc_stderr": 0.02934311479809446, + "acc_norm": 0.7222222222222222, + "acc_norm_stderr": 0.02934311479809446 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.47, + "acc_stderr": 0.050161355804659205, + "acc_norm": 0.47, + "acc_norm_stderr": 0.050161355804659205 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.6768837803320562, + "acc_stderr": 0.016723726512343048, + "acc_norm": 0.6768837803320562, + "acc_norm_stderr": 0.016723726512343048 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.5173410404624278, + "acc_stderr": 0.02690290045866664, + "acc_norm": 0.5173410404624278, + "acc_norm_stderr": 0.02690290045866664 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.22681564245810057, + "acc_stderr": 0.014005843570897899, + "acc_norm": 0.22681564245810057, + "acc_norm_stderr": 0.014005843570897899 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.5130718954248366, + "acc_stderr": 0.028620130800700246, + "acc_norm": 0.5130718954248366, + "acc_norm_stderr": 0.028620130800700246 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.5659163987138264, + "acc_stderr": 0.02815023224453559, + "acc_norm": 0.5659163987138264, + "acc_norm_stderr": 0.02815023224453559 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.5679012345679012, + "acc_stderr": 0.02756301097160668, + "acc_norm": 0.5679012345679012, + "acc_norm_stderr": 0.02756301097160668 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.3723404255319149, + "acc_stderr": 
0.028838921471251458, + "acc_norm": 0.3723404255319149, + "acc_norm_stderr": 0.028838921471251458 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.3494132985658409, + "acc_stderr": 0.012177306252786688, + "acc_norm": 0.3494132985658409, + "acc_norm_stderr": 0.012177306252786688 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.45588235294117646, + "acc_stderr": 0.03025437257397668, + "acc_norm": 0.45588235294117646, + "acc_norm_stderr": 0.03025437257397668 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.4918300653594771, + "acc_stderr": 0.020225134343057265, + "acc_norm": 0.4918300653594771, + "acc_norm_stderr": 0.020225134343057265 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.5363636363636364, + "acc_stderr": 0.04776449162396197, + "acc_norm": 0.5363636363636364, + "acc_norm_stderr": 0.04776449162396197 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.5102040816326531, + "acc_stderr": 0.03200255347893782, + "acc_norm": 0.5102040816326531, + "acc_norm_stderr": 0.03200255347893782 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.6467661691542289, + "acc_stderr": 0.03379790611796777, + "acc_norm": 0.6467661691542289, + "acc_norm_stderr": 0.03379790611796777 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.72, + "acc_stderr": 0.045126085985421276, + "acc_norm": 0.72, + "acc_norm_stderr": 0.045126085985421276 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.42168674698795183, + "acc_stderr": 0.03844453181770917, + "acc_norm": 0.42168674698795183, + "acc_norm_stderr": 0.03844453181770917 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.7192982456140351, + "acc_stderr": 0.03446296217088427, + "acc_norm": 0.7192982456140351, + "acc_norm_stderr": 0.03446296217088427 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.29865361077111385, + "mc1_stderr": 0.016021570613768542, + "mc2": 0.45466126872222284, + "mc2_stderr": 0.015592630900447884 + }, + "harness|winogrande|5": { + "acc": 0.7269139700078927, + "acc_stderr": 0.012522020105869456 + }, + "harness|gsm8k|5": { + "acc": 0.1865049279757392, + "acc_stderr": 0.010729140039689897 + }, + "all": { + "acc": 0.4851995546077363, + "acc_stderr": 0.034286883868394735, + "acc_norm": 0.4898964195341055, + "acc_norm_stderr": 0.03504171929947504, + "mc1": 0.29865361077111385, + "mc1_stderr": 0.016021570613768542, + "mc2": 0.45466126872222284, + "mc2_stderr": 0.015592630900447884 + } + }, + "versions": { + "all": 0, + "harness|arc:challenge|25": 0, + "harness|gsm8k|5": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + 
"harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "harness|winogrande|5": 0 + }, + "config_tasks": { + "harness|arc:challenge": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness 
task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|arc:challenge|25": { + "hashes": { + "hash_examples": "17b0cae357c0259e", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "ca48d52265c0051f", + "hash_cont_tokens": "e8abf848493b50f7" + }, + "truncated": 0, + "non_truncated": 1172, + "padded": 4687, + "non_padded": 0, + "effective_few_shots": 25.0, + "num_truncated_few_shots": 0 + }, + "harness|hellaswag|10": { + "hashes": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "4975ded0ed31f702", + "hash_cont_tokens": "9fe0a5c42e1532db" + }, + "truncated": 0, + "non_truncated": 10042, + "padded": 40019, + "non_padded": 149, + "effective_few_shots": 10.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hashes": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "8ff523ec326d5d55", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-anatomy|5": { + "hashes": { + "hash_examples": 
"2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "742bd6a389a8ef40", + "hash_cont_tokens": "f11971a765cb609f" + }, + "truncated": 0, + "non_truncated": 135, + "padded": 540, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-astronomy|5": { + "hashes": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "aa9743839c83bd9f", + "hash_cont_tokens": "440a970fadecdc7b" + }, + "truncated": 0, + "non_truncated": 152, + "padded": 608, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-business_ethics|5": { + "hashes": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "60f6ed52e2a2987a", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hashes": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "6080d9f3c5930be0", + "hash_cont_tokens": "7ecd60c25b9bfe5b" + }, + "truncated": 0, + "non_truncated": 265, + "padded": 1060, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_biology|5": { + "hashes": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "873319724ad65589", + "hash_cont_tokens": "875cde3af7a0ee14" + }, + "truncated": 0, + "non_truncated": 144, + "padded": 564, + "non_padded": 12, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_chemistry|5": { + "hashes": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "8366d04d12b154a7", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_computer_science|5": { + "hashes": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "1724a282fb269fd7", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_mathematics|5": { + "hashes": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "b7aa815781eae172", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_medicine|5": { + "hashes": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "0003d13e86bc8c1a", + "hash_cont_tokens": "702fb6d82ff0d6ac" + }, + "truncated": 0, + "non_truncated": 173, + "padded": 692, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_physics|5": { + "hashes": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "32b28762dd077c78", + "hash_cont_tokens": "f7b8097afc16a47c" + }, + 
"truncated": 0, + "non_truncated": 102, + "padded": 404, + "non_padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-computer_security|5": { + "hashes": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "19dd0e1895125d49", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hashes": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "761c7ce187b3338a", + "hash_cont_tokens": "aa0e8bc655f2f641" + }, + "truncated": 0, + "non_truncated": 235, + "padded": 940, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-econometrics|5": { + "hashes": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "dae74024ebc12b2b", + "hash_cont_tokens": "b1cc6e7e9fcd3827" + }, + "truncated": 0, + "non_truncated": 114, + "padded": 456, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hashes": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "5fa8050688a246ed", + "hash_cont_tokens": "2425a3f084a591ef" + }, + "truncated": 0, + "non_truncated": 145, + "padded": 580, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hashes": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "2da3f8d7d1515cc6", + "hash_cont_tokens": "bd87bf0c060fd925" + }, + "truncated": 0, + "non_truncated": 378, + "padded": 1512, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-formal_logic|5": { + "hashes": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "907de61bbe46dada", + "hash_cont_tokens": "eb8932890e0605db" + }, + "truncated": 0, + "non_truncated": 126, + "padded": 504, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-global_facts|5": { + "hashes": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "d7549fe9ac133643", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_biology|5": { + "hashes": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "b449ae8cd622fb96", + "hash_cont_tokens": "1ddcb86d28cde266" + }, + "truncated": 0, + "non_truncated": 310, + "padded": 1240, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hashes": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "a447bd1574b5e26c", + "hash_cont_tokens": "176c8dcff38c5f8f" + }, + "truncated": 0, + "non_truncated": 203, + "padded": 812, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + 
"harness|hendrycksTest-high_school_computer_science|5": { + "hashes": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "56312a0c3d85ae90", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hashes": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "5002f4ac8b1562ca", + "hash_cont_tokens": "674fc454bdc5ac93" + }, + "truncated": 0, + "non_truncated": 165, + "padded": 656, + "non_padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_geography|5": { + "hashes": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "b4f9efd054b0149d", + "hash_cont_tokens": "03a5012b916274ea" + }, + "truncated": 0, + "non_truncated": 198, + "padded": 792, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hashes": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "6e010d01707b5a01", + "hash_cont_tokens": "873d2aab226ba1d8" + }, + "truncated": 0, + "non_truncated": 193, + "padded": 772, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hashes": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "fc1f6e824ba386d7", + "hash_cont_tokens": "c583432ad27fcfe0" + }, + "truncated": 0, + "non_truncated": 390, + "padded": 1560, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hashes": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "3a485a40c8432ece", + "hash_cont_tokens": "d7907b61bcb8c123" + }, + "truncated": 0, + "non_truncated": 270, + "padded": 1080, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hashes": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "a7dd9ca4bbda3752", + "hash_cont_tokens": "f47f041de50333b9" + }, + "truncated": 0, + "non_truncated": 238, + "padded": 952, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_physics|5": { + "hashes": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "d7ea631399a73865", + "hash_cont_tokens": "0d56317b3e5eedb5" + }, + "truncated": 0, + "non_truncated": 151, + "padded": 604, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hashes": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "d12816cf88146011", + "hash_cont_tokens": "09ba1243e7390c0f" + }, + "truncated": 0, + "non_truncated": 545, + "padded": 2180, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hashes": { + 
"hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "9763ecaef4814c21", + "hash_cont_tokens": "9cc29889c3d3f77d" + }, + "truncated": 0, + "non_truncated": 216, + "padded": 864, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hashes": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "c639cce12a46ebad", + "hash_cont_tokens": "cdd0b3dc06d933e5" + }, + "truncated": 0, + "non_truncated": 204, + "padded": 816, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hashes": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "b9762065cce6f3a6", + "hash_cont_tokens": "e02816433ff28daf" + }, + "truncated": 0, + "non_truncated": 237, + "padded": 948, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_aging|5": { + "hashes": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "84157fee0b6d0f3c", + "hash_cont_tokens": "142a4a8a1138a214" + }, + "truncated": 0, + "non_truncated": 223, + "padded": 892, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_sexuality|5": { + "hashes": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "ade303e1ae3c016f", + "hash_cont_tokens": "bc54813e809b796d" + }, + "truncated": 0, + "non_truncated": 131, + "padded": 524, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-international_law|5": { + "hashes": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "e5482e1c23c23d35", + "hash_cont_tokens": "8ea8c5ff76a15bca" + }, + "truncated": 0, + "non_truncated": 121, + "padded": 484, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-jurisprudence|5": { + "hashes": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "4415eeb9bad0507b", + "hash_cont_tokens": "e3a8cd951b6e3469" + }, + "truncated": 0, + "non_truncated": 108, + "padded": 432, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hashes": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "e6b5271422ecbaa8", + "hash_cont_tokens": "3e9e0bdc248fd88a" + }, + "truncated": 0, + "non_truncated": 163, + "padded": 644, + "non_padded": 8, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-machine_learning|5": { + "hashes": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "e719cb83196977d8", + "hash_cont_tokens": "55b12fb138c6a064" + }, + "truncated": 0, + "non_truncated": 112, + "padded": 448, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-management|5": { + "hashes": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "155da0e62b39e804", + "hash_cont_tokens": 
"a01d6d39a83c4597" + }, + "truncated": 0, + "non_truncated": 103, + "padded": 412, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-marketing|5": { + "hashes": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "38466c242259e6d3", + "hash_cont_tokens": "6aeaed4d823c98aa" + }, + "truncated": 0, + "non_truncated": 234, + "padded": 932, + "non_padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-medical_genetics|5": { + "hashes": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "0dd129e92538a7f6", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-miscellaneous|5": { + "hashes": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "d108a883fc3e022f", + "hash_cont_tokens": "9b0ab02a64603081" + }, + "truncated": 0, + "non_truncated": 783, + "padded": 3132, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_disputes|5": { + "hashes": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "0e7b7df82884a2d5", + "hash_cont_tokens": "3b8bbe9108e55ce9" + }, + "truncated": 0, + "non_truncated": 346, + "padded": 1364, + "non_padded": 20, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hashes": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "7c220f5613cd8426", + "hash_cont_tokens": "3e9bfc0362e97330" + }, + "truncated": 0, + "non_truncated": 895, + "padded": 3580, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-nutrition|5": { + "hashes": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "35de1609a9a763a9", + "hash_cont_tokens": "23b2dc6ee2da4cfc" + }, + "truncated": 0, + "non_truncated": 306, + "padded": 1224, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-philosophy|5": { + "hashes": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "a1dcfa9c80490d06", + "hash_cont_tokens": "9f6ff69d23a48783" + }, + "truncated": 0, + "non_truncated": 311, + "padded": 1244, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-prehistory|5": { + "hashes": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "a091cf645d2415e0", + "hash_cont_tokens": "d6458d743d875837" + }, + "truncated": 0, + "non_truncated": 324, + "padded": 1296, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_accounting|5": { + "hashes": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "e9df32a33f85290c", + "hash_cont_tokens": "922a195f53a35662" + }, + "truncated": 0, + "non_truncated": 282, + "padded": 1128, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + 
"harness|hendrycksTest-professional_law|5": { + "hashes": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "c9f7583fff66d361", + "hash_cont_tokens": "2e590029ef41fbcd" + }, + "truncated": 0, + "non_truncated": 1534, + "padded": 6136, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_medicine|5": { + "hashes": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "40a933f829116f8d", + "hash_cont_tokens": "7cfee54dbddd5a98" + }, + "truncated": 0, + "non_truncated": 272, + "padded": 1088, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_psychology|5": { + "hashes": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "0f6a92c3a2062b48", + "hash_cont_tokens": "a86677b2a45c20e1" + }, + "truncated": 0, + "non_truncated": 612, + "padded": 2448, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-public_relations|5": { + "hashes": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "29a08e9bfbe9b2f0", + "hash_cont_tokens": "0d756ccaae031757" + }, + "truncated": 0, + "non_truncated": 110, + "padded": 440, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-security_studies|5": { + "hashes": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "32a03f1f22a6e103", + "hash_cont_tokens": "b2229bc2cfbf594b" + }, + "truncated": 0, + "non_truncated": 245, + "padded": 980, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-sociology|5": { + "hashes": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "1de5c52d2b2831d7", + "hash_cont_tokens": "c3a3bdfd177eed5b" + }, + "truncated": 0, + "non_truncated": 201, + "padded": 800, + "non_padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hashes": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "add924961f7f4146", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-virology|5": { + "hashes": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "e0653601c466b1bc", + "hash_cont_tokens": "af8b3658088cb37f" + }, + "truncated": 0, + "non_truncated": 166, + "padded": 664, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-world_religions|5": { + "hashes": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "ac600d612445156d", + "hash_cont_tokens": "060118bef6de4e0a" + }, + "truncated": 0, + "non_truncated": 171, + "padded": 684, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|truthfulqa:mc|0": { + "hashes": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": 
"a03ce28b7fd06aa7", + "hash_cont_tokens": "f5da56a132aab151" + }, + "truncated": 0, + "non_truncated": 817, + "padded": 9996, + "non_padded": 0, + "effective_few_shots": 0.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "72067255e368e24e", + "hash_cont_tokens": "f08975ad6f2d5864" + }, + "truncated": 0, + "non_truncated": 1267, + "padded": 2534, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "bda342e47b5099b2", + "hash_cont_tokens": "68d701cbd40d3284" + }, + "truncated": 0, + "non_truncated": 1319, + "padded": 0, + "non_padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "3b7fa57a057f9415", + "hash_full_prompts": "63615fc50fc9417c", + "hash_input_tokens": "a8fa53915153e1db", + "hash_cont_tokens": "9ff994b1908e5a66" + }, + "truncated": 0, + "non_truncated": 28659, + "padded": 113348, + "non_padded": 1524, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/Korabbit/Llama-2-7b-chat-hf-afr-200step-merged/results_2023-11-21T15-01-37.754437.json b/eval-results/Korabbit/Llama-2-7b-chat-hf-afr-200step-merged/results_2023-11-21T15-01-37.754437.json new file mode 100644 index 0000000000000000000000000000000000000000..476f4a223b3f92678ffb15f408e050ac52f02983 --- /dev/null +++ b/eval-results/Korabbit/Llama-2-7b-chat-hf-afr-200step-merged/results_2023-11-21T15-01-37.754437.json @@ -0,0 +1,1435 @@ +{ + "config_general": { + "lighteval_sha": "9ffc410f6c40b8cfefe7167cb47aefe69ced61e1", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "", + "start_time": 473212.778736086, + "end_time": 486271.482015462, + "total_evaluation_time_secondes": "13058.70327937603", + "model_name": "Korabbit/Llama-2-7b-chat-hf-afr-200step-merged", + "model_sha": "858de1c14854e55d5141b8d1b3954b335044669e", + "model_dtype": "torch.float16", + "model_size": "12.61 GB" + }, + "results": { + "harness|arc:challenge|25": { + "acc": 0.48890784982935154, + "acc_stderr": 0.01460779491401306, + "acc_norm": 0.5204778156996587, + "acc_norm_stderr": 0.014599131353035005 + }, + "harness|hellaswag|10": { + "acc": 0.58743278231428, + "acc_stderr": 0.004912900450370838, + "acc_norm": 0.7738498307110138, + "acc_norm_stderr": 0.004174825437724602 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.29, + "acc_stderr": 0.04560480215720684, + "acc_norm": 0.29, + "acc_norm_stderr": 0.04560480215720684 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.42962962962962964, + "acc_stderr": 0.04276349494376599, + "acc_norm": 0.42962962962962964, + "acc_norm_stderr": 0.04276349494376599 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.46710526315789475, + "acc_stderr": 0.040601270352363966, + "acc_norm": 0.46710526315789475, + "acc_norm_stderr": 0.040601270352363966 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.54, + "acc_stderr": 0.05009082659620333, + "acc_norm": 0.54, + "acc_norm_stderr": 0.05009082659620333 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.5433962264150943, + "acc_stderr": 0.03065674869673943, + "acc_norm": 0.5433962264150943, + "acc_norm_stderr": 0.03065674869673943 + }, + 
"harness|hendrycksTest-college_biology|5": { + "acc": 0.5208333333333334, + "acc_stderr": 0.041775789507399935, + "acc_norm": 0.5208333333333334, + "acc_norm_stderr": 0.041775789507399935 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.29, + "acc_stderr": 0.04560480215720684, + "acc_norm": 0.29, + "acc_norm_stderr": 0.04560480215720684 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.41, + "acc_stderr": 0.049431107042371025, + "acc_norm": 0.41, + "acc_norm_stderr": 0.049431107042371025 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.35, + "acc_stderr": 0.047937248544110196, + "acc_norm": 0.35, + "acc_norm_stderr": 0.047937248544110196 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.3988439306358382, + "acc_stderr": 0.037336266553835096, + "acc_norm": 0.3988439306358382, + "acc_norm_stderr": 0.037336266553835096 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.22549019607843138, + "acc_stderr": 0.041583075330832865, + "acc_norm": 0.22549019607843138, + "acc_norm_stderr": 0.041583075330832865 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.57, + "acc_stderr": 0.049756985195624284, + "acc_norm": 0.57, + "acc_norm_stderr": 0.049756985195624284 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.4, + "acc_stderr": 0.03202563076101735, + "acc_norm": 0.4, + "acc_norm_stderr": 0.03202563076101735 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.3684210526315789, + "acc_stderr": 0.04537815354939392, + "acc_norm": 0.3684210526315789, + "acc_norm_stderr": 0.04537815354939392 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.5103448275862069, + "acc_stderr": 0.041657747757287644, + "acc_norm": 0.5103448275862069, + "acc_norm_stderr": 0.041657747757287644 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.30158730158730157, + "acc_stderr": 0.023636975996101806, + "acc_norm": 0.30158730158730157, + "acc_norm_stderr": 0.023636975996101806 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.23809523809523808, + "acc_stderr": 0.038095238095238126, + "acc_norm": 0.23809523809523808, + "acc_norm_stderr": 0.038095238095238126 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.37, + "acc_stderr": 0.048523658709391, + "acc_norm": 0.37, + "acc_norm_stderr": 0.048523658709391 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.532258064516129, + "acc_stderr": 0.028384747788813332, + "acc_norm": 0.532258064516129, + "acc_norm_stderr": 0.028384747788813332 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.3645320197044335, + "acc_stderr": 0.033864057460620905, + "acc_norm": 0.3645320197044335, + "acc_norm_stderr": 0.033864057460620905 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.44, + "acc_stderr": 0.04988876515698589, + "acc_norm": 0.44, + "acc_norm_stderr": 0.04988876515698589 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.6, + "acc_stderr": 0.03825460278380025, + "acc_norm": 0.6, + "acc_norm_stderr": 0.03825460278380025 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.6060606060606061, + "acc_stderr": 0.034812853382329624, + "acc_norm": 0.6060606060606061, + "acc_norm_stderr": 0.034812853382329624 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.7305699481865285, + "acc_stderr": 0.032018671228777947, + "acc_norm": 0.7305699481865285, + "acc_norm_stderr": 0.032018671228777947 + }, + 
"harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.4282051282051282, + "acc_stderr": 0.025088301454694834, + "acc_norm": 0.4282051282051282, + "acc_norm_stderr": 0.025088301454694834 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.26296296296296295, + "acc_stderr": 0.026842057873833706, + "acc_norm": 0.26296296296296295, + "acc_norm_stderr": 0.026842057873833706 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.42016806722689076, + "acc_stderr": 0.03206183783236152, + "acc_norm": 0.42016806722689076, + "acc_norm_stderr": 0.03206183783236152 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.2980132450331126, + "acc_stderr": 0.037345356767871984, + "acc_norm": 0.2980132450331126, + "acc_norm_stderr": 0.037345356767871984 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.673394495412844, + "acc_stderr": 0.020106990889937303, + "acc_norm": 0.673394495412844, + "acc_norm_stderr": 0.020106990889937303 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.32407407407407407, + "acc_stderr": 0.03191923445686185, + "acc_norm": 0.32407407407407407, + "acc_norm_stderr": 0.03191923445686185 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.6617647058823529, + "acc_stderr": 0.033205746129454324, + "acc_norm": 0.6617647058823529, + "acc_norm_stderr": 0.033205746129454324 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.6666666666666666, + "acc_stderr": 0.03068582059661079, + "acc_norm": 0.6666666666666666, + "acc_norm_stderr": 0.03068582059661079 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.5739910313901345, + "acc_stderr": 0.03318833286217281, + "acc_norm": 0.5739910313901345, + "acc_norm_stderr": 0.03318833286217281 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.5725190839694656, + "acc_stderr": 0.04338920305792401, + "acc_norm": 0.5725190839694656, + "acc_norm_stderr": 0.04338920305792401 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.6363636363636364, + "acc_stderr": 0.043913262867240704, + "acc_norm": 0.6363636363636364, + "acc_norm_stderr": 0.043913262867240704 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.6018518518518519, + "acc_stderr": 0.04732332615978813, + "acc_norm": 0.6018518518518519, + "acc_norm_stderr": 0.04732332615978813 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.558282208588957, + "acc_stderr": 0.03901591825836184, + "acc_norm": 0.558282208588957, + "acc_norm_stderr": 0.03901591825836184 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.3125, + "acc_stderr": 0.043994650575715215, + "acc_norm": 0.3125, + "acc_norm_stderr": 0.043994650575715215 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.6893203883495146, + "acc_stderr": 0.04582124160161551, + "acc_norm": 0.6893203883495146, + "acc_norm_stderr": 0.04582124160161551 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.7264957264957265, + "acc_stderr": 0.029202540153431183, + "acc_norm": 0.7264957264957265, + "acc_norm_stderr": 0.029202540153431183 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.49, + "acc_stderr": 0.05024183937956911, + "acc_norm": 0.49, + "acc_norm_stderr": 0.05024183937956911 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.6781609195402298, + "acc_stderr": 0.0167063814150579, + "acc_norm": 0.6781609195402298, + "acc_norm_stderr": 0.0167063814150579 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 
0.5202312138728323, + "acc_stderr": 0.026897049996382875, + "acc_norm": 0.5202312138728323, + "acc_norm_stderr": 0.026897049996382875 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.23128491620111732, + "acc_stderr": 0.01410222362315258, + "acc_norm": 0.23128491620111732, + "acc_norm_stderr": 0.01410222362315258 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.5065359477124183, + "acc_stderr": 0.028627470550556054, + "acc_norm": 0.5065359477124183, + "acc_norm_stderr": 0.028627470550556054 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.5755627009646302, + "acc_stderr": 0.02807192824794621, + "acc_norm": 0.5755627009646302, + "acc_norm_stderr": 0.02807192824794621 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.5679012345679012, + "acc_stderr": 0.027563010971606676, + "acc_norm": 0.5679012345679012, + "acc_norm_stderr": 0.027563010971606676 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.36524822695035464, + "acc_stderr": 0.02872386385328128, + "acc_norm": 0.36524822695035464, + "acc_norm_stderr": 0.02872386385328128 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.3533246414602347, + "acc_stderr": 0.01220840821108243, + "acc_norm": 0.3533246414602347, + "acc_norm_stderr": 0.01220840821108243 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.45955882352941174, + "acc_stderr": 0.03027332507734576, + "acc_norm": 0.45955882352941174, + "acc_norm_stderr": 0.03027332507734576 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.48856209150326796, + "acc_stderr": 0.02022254151561087, + "acc_norm": 0.48856209150326796, + "acc_norm_stderr": 0.02022254151561087 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.5363636363636364, + "acc_stderr": 0.04776449162396197, + "acc_norm": 0.5363636363636364, + "acc_norm_stderr": 0.04776449162396197 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.5306122448979592, + "acc_stderr": 0.031949171367580624, + "acc_norm": 0.5306122448979592, + "acc_norm_stderr": 0.031949171367580624 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.6467661691542289, + "acc_stderr": 0.03379790611796777, + "acc_norm": 0.6467661691542289, + "acc_norm_stderr": 0.03379790611796777 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.73, + "acc_stderr": 0.044619604333847394, + "acc_norm": 0.73, + "acc_norm_stderr": 0.044619604333847394 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.42771084337349397, + "acc_stderr": 0.038515976837185335, + "acc_norm": 0.42771084337349397, + "acc_norm_stderr": 0.038515976837185335 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.7192982456140351, + "acc_stderr": 0.03446296217088427, + "acc_norm": 0.7192982456140351, + "acc_norm_stderr": 0.03446296217088427 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.2937576499388005, + "mc1_stderr": 0.015945068581236614, + "mc2": 0.4459944089782322, + "mc2_stderr": 0.01559600987989602 + }, + "harness|winogrande|5": { + "acc": 0.7190213101815311, + "acc_stderr": 0.012632541095875824 + }, + "harness|drop|3": { + "em": 0.01373741610738255, + "em_stderr": 0.0011920334890960778, + "f1": 0.07324769295302042, + "f1_stderr": 0.0018226775245231439 + }, + "harness|gsm8k|5": { + "acc": 0.08491281273692192, + "acc_stderr": 0.007678212824450797 + }, + "all": { + "acc": 0.48542848394851473, + "acc_stderr": 0.03421913026004955, + "acc_norm": 0.49195237986793194, + "acc_norm_stderr": 0.03502219412489953, + "mc1": 0.2937576499388005, + "mc1_stderr": 
0.015945068581236614, + "mc2": 0.4459944089782322, + "mc2_stderr": 0.01559600987989602, + "em": 0.01373741610738255, + "em_stderr": 0.0011920334890960778, + "f1": 0.07324769295302042, + "f1_stderr": 0.0018226775245231439 + } + }, + "versions": { + "all": 0, + "harness|arc:challenge|25": 0, + "harness|drop|3": 1, + "harness|gsm8k|5": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "harness|winogrande|5": 0 + }, + "config_tasks": { + "harness|arc:challenge": "LM Harness task", + "harness|drop": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + 
"harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + 
"harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|arc:challenge|25": { + "hashes": { + "hash_examples": "17b0cae357c0259e", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "ca48d52265c0051f", + "hash_cont_tokens": "e8abf848493b50f7" + }, + "truncated": 0, + "non_truncated": 1172, + "padded": 4687, + "non_padded": 0, + "effective_few_shots": 25.0, + "num_truncated_few_shots": 0 + }, + "harness|hellaswag|10": { + "hashes": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "4975ded0ed31f702", + "hash_cont_tokens": "9fe0a5c42e1532db" + }, + "truncated": 0, + "non_truncated": 10042, + "padded": 40019, + "non_padded": 149, + "effective_few_shots": 10.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hashes": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "8ff523ec326d5d55", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-anatomy|5": { + "hashes": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "742bd6a389a8ef40", + "hash_cont_tokens": "f11971a765cb609f" + }, + "truncated": 0, + "non_truncated": 135, + "padded": 540, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-astronomy|5": { + "hashes": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "aa9743839c83bd9f", + "hash_cont_tokens": "440a970fadecdc7b" + }, + "truncated": 0, + "non_truncated": 152, + "padded": 608, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-business_ethics|5": { + "hashes": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "60f6ed52e2a2987a", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hashes": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "6080d9f3c5930be0", + "hash_cont_tokens": "7ecd60c25b9bfe5b" + }, + "truncated": 0, + "non_truncated": 265, + "padded": 1060, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_biology|5": { + "hashes": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "873319724ad65589", + "hash_cont_tokens": "875cde3af7a0ee14" + }, + "truncated": 0, + "non_truncated": 144, + "padded": 564, + "non_padded": 12, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_chemistry|5": { + "hashes": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "8366d04d12b154a7", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + 
"harness|hendrycksTest-college_computer_science|5": { + "hashes": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "1724a282fb269fd7", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_mathematics|5": { + "hashes": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "b7aa815781eae172", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_medicine|5": { + "hashes": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "0003d13e86bc8c1a", + "hash_cont_tokens": "702fb6d82ff0d6ac" + }, + "truncated": 0, + "non_truncated": 173, + "padded": 692, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_physics|5": { + "hashes": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "32b28762dd077c78", + "hash_cont_tokens": "f7b8097afc16a47c" + }, + "truncated": 0, + "non_truncated": 102, + "padded": 404, + "non_padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-computer_security|5": { + "hashes": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "19dd0e1895125d49", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hashes": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "761c7ce187b3338a", + "hash_cont_tokens": "aa0e8bc655f2f641" + }, + "truncated": 0, + "non_truncated": 235, + "padded": 940, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-econometrics|5": { + "hashes": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "dae74024ebc12b2b", + "hash_cont_tokens": "b1cc6e7e9fcd3827" + }, + "truncated": 0, + "non_truncated": 114, + "padded": 456, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hashes": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "5fa8050688a246ed", + "hash_cont_tokens": "2425a3f084a591ef" + }, + "truncated": 0, + "non_truncated": 145, + "padded": 580, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hashes": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "2da3f8d7d1515cc6", + "hash_cont_tokens": "bd87bf0c060fd925" + }, + "truncated": 0, + "non_truncated": 378, + "padded": 1512, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-formal_logic|5": { + "hashes": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + 
"hash_input_tokens": "907de61bbe46dada", + "hash_cont_tokens": "eb8932890e0605db" + }, + "truncated": 0, + "non_truncated": 126, + "padded": 504, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-global_facts|5": { + "hashes": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "d7549fe9ac133643", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_biology|5": { + "hashes": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "b449ae8cd622fb96", + "hash_cont_tokens": "1ddcb86d28cde266" + }, + "truncated": 0, + "non_truncated": 310, + "padded": 1240, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hashes": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "a447bd1574b5e26c", + "hash_cont_tokens": "176c8dcff38c5f8f" + }, + "truncated": 0, + "non_truncated": 203, + "padded": 812, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hashes": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "56312a0c3d85ae90", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hashes": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "5002f4ac8b1562ca", + "hash_cont_tokens": "674fc454bdc5ac93" + }, + "truncated": 0, + "non_truncated": 165, + "padded": 656, + "non_padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_geography|5": { + "hashes": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "b4f9efd054b0149d", + "hash_cont_tokens": "03a5012b916274ea" + }, + "truncated": 0, + "non_truncated": 198, + "padded": 792, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hashes": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "6e010d01707b5a01", + "hash_cont_tokens": "873d2aab226ba1d8" + }, + "truncated": 0, + "non_truncated": 193, + "padded": 772, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hashes": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "fc1f6e824ba386d7", + "hash_cont_tokens": "c583432ad27fcfe0" + }, + "truncated": 0, + "non_truncated": 390, + "padded": 1560, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hashes": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "3a485a40c8432ece", + "hash_cont_tokens": "d7907b61bcb8c123" + }, + 
"truncated": 0, + "non_truncated": 270, + "padded": 1080, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hashes": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "a7dd9ca4bbda3752", + "hash_cont_tokens": "f47f041de50333b9" + }, + "truncated": 0, + "non_truncated": 238, + "padded": 952, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_physics|5": { + "hashes": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "d7ea631399a73865", + "hash_cont_tokens": "0d56317b3e5eedb5" + }, + "truncated": 0, + "non_truncated": 151, + "padded": 604, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hashes": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "d12816cf88146011", + "hash_cont_tokens": "09ba1243e7390c0f" + }, + "truncated": 0, + "non_truncated": 545, + "padded": 2180, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hashes": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "9763ecaef4814c21", + "hash_cont_tokens": "9cc29889c3d3f77d" + }, + "truncated": 0, + "non_truncated": 216, + "padded": 864, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hashes": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "c639cce12a46ebad", + "hash_cont_tokens": "cdd0b3dc06d933e5" + }, + "truncated": 0, + "non_truncated": 204, + "padded": 816, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hashes": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "b9762065cce6f3a6", + "hash_cont_tokens": "e02816433ff28daf" + }, + "truncated": 0, + "non_truncated": 237, + "padded": 948, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_aging|5": { + "hashes": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "84157fee0b6d0f3c", + "hash_cont_tokens": "142a4a8a1138a214" + }, + "truncated": 0, + "non_truncated": 223, + "padded": 892, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_sexuality|5": { + "hashes": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "ade303e1ae3c016f", + "hash_cont_tokens": "bc54813e809b796d" + }, + "truncated": 0, + "non_truncated": 131, + "padded": 524, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-international_law|5": { + "hashes": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "e5482e1c23c23d35", + "hash_cont_tokens": "8ea8c5ff76a15bca" + }, + "truncated": 0, + "non_truncated": 121, + "padded": 484, + "non_padded": 0, + "effective_few_shots": 5.0, + 
"num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-jurisprudence|5": { + "hashes": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "4415eeb9bad0507b", + "hash_cont_tokens": "e3a8cd951b6e3469" + }, + "truncated": 0, + "non_truncated": 108, + "padded": 432, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hashes": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "e6b5271422ecbaa8", + "hash_cont_tokens": "3e9e0bdc248fd88a" + }, + "truncated": 0, + "non_truncated": 163, + "padded": 644, + "non_padded": 8, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-machine_learning|5": { + "hashes": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "e719cb83196977d8", + "hash_cont_tokens": "55b12fb138c6a064" + }, + "truncated": 0, + "non_truncated": 112, + "padded": 448, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-management|5": { + "hashes": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "155da0e62b39e804", + "hash_cont_tokens": "a01d6d39a83c4597" + }, + "truncated": 0, + "non_truncated": 103, + "padded": 412, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-marketing|5": { + "hashes": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "38466c242259e6d3", + "hash_cont_tokens": "6aeaed4d823c98aa" + }, + "truncated": 0, + "non_truncated": 234, + "padded": 932, + "non_padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-medical_genetics|5": { + "hashes": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "0dd129e92538a7f6", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-miscellaneous|5": { + "hashes": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "d108a883fc3e022f", + "hash_cont_tokens": "9b0ab02a64603081" + }, + "truncated": 0, + "non_truncated": 783, + "padded": 3132, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_disputes|5": { + "hashes": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "0e7b7df82884a2d5", + "hash_cont_tokens": "3b8bbe9108e55ce9" + }, + "truncated": 0, + "non_truncated": 346, + "padded": 1364, + "non_padded": 20, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hashes": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "7c220f5613cd8426", + "hash_cont_tokens": "3e9bfc0362e97330" + }, + "truncated": 0, + "non_truncated": 895, + "padded": 3580, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-nutrition|5": { + "hashes": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + 
"hash_input_tokens": "35de1609a9a763a9", + "hash_cont_tokens": "23b2dc6ee2da4cfc" + }, + "truncated": 0, + "non_truncated": 306, + "padded": 1224, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-philosophy|5": { + "hashes": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "a1dcfa9c80490d06", + "hash_cont_tokens": "9f6ff69d23a48783" + }, + "truncated": 0, + "non_truncated": 311, + "padded": 1244, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-prehistory|5": { + "hashes": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "a091cf645d2415e0", + "hash_cont_tokens": "d6458d743d875837" + }, + "truncated": 0, + "non_truncated": 324, + "padded": 1296, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_accounting|5": { + "hashes": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "e9df32a33f85290c", + "hash_cont_tokens": "922a195f53a35662" + }, + "truncated": 0, + "non_truncated": 282, + "padded": 1128, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_law|5": { + "hashes": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "c9f7583fff66d361", + "hash_cont_tokens": "2e590029ef41fbcd" + }, + "truncated": 0, + "non_truncated": 1534, + "padded": 6136, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_medicine|5": { + "hashes": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "40a933f829116f8d", + "hash_cont_tokens": "7cfee54dbddd5a98" + }, + "truncated": 0, + "non_truncated": 272, + "padded": 1088, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_psychology|5": { + "hashes": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "0f6a92c3a2062b48", + "hash_cont_tokens": "a86677b2a45c20e1" + }, + "truncated": 0, + "non_truncated": 612, + "padded": 2448, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-public_relations|5": { + "hashes": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "29a08e9bfbe9b2f0", + "hash_cont_tokens": "0d756ccaae031757" + }, + "truncated": 0, + "non_truncated": 110, + "padded": 440, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-security_studies|5": { + "hashes": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "32a03f1f22a6e103", + "hash_cont_tokens": "b2229bc2cfbf594b" + }, + "truncated": 0, + "non_truncated": 245, + "padded": 980, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-sociology|5": { + "hashes": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "1de5c52d2b2831d7", + "hash_cont_tokens": "c3a3bdfd177eed5b" + }, + "truncated": 0, + "non_truncated": 201, + "padded": 800, + 
"non_padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hashes": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "add924961f7f4146", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-virology|5": { + "hashes": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "e0653601c466b1bc", + "hash_cont_tokens": "af8b3658088cb37f" + }, + "truncated": 0, + "non_truncated": 166, + "padded": 664, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-world_religions|5": { + "hashes": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "ac600d612445156d", + "hash_cont_tokens": "060118bef6de4e0a" + }, + "truncated": 0, + "non_truncated": 171, + "padded": 684, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|truthfulqa:mc|0": { + "hashes": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "a03ce28b7fd06aa7", + "hash_cont_tokens": "f5da56a132aab151" + }, + "truncated": 0, + "non_truncated": 817, + "padded": 9996, + "non_padded": 0, + "effective_few_shots": 0.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "72067255e368e24e", + "hash_cont_tokens": "f08975ad6f2d5864" + }, + "truncated": 0, + "non_truncated": 1267, + "padded": 2534, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|drop|3": { + "hashes": { + "hash_examples": "1d27416e8324e9a3", + "hash_full_prompts": "a5513ff9a741b385", + "hash_input_tokens": "42076f0efbb50aa6", + "hash_cont_tokens": "aa380e3cce527184" + }, + "truncated": 3, + "non_truncated": 9533, + "padded": 0, + "non_padded": 9536, + "effective_few_shots": 3.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "bda342e47b5099b2", + "hash_cont_tokens": "3ef477b77b267bc2" + }, + "truncated": 0, + "non_truncated": 1319, + "padded": 0, + "non_padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "4eb459f19fc0f29d", + "hash_full_prompts": "21653ed56f202b4e", + "hash_input_tokens": "379266f3a5365f9d", + "hash_cont_tokens": "c72993d942d0d9f0" + }, + "truncated": 3, + "non_truncated": 38192, + "padded": 113348, + "non_padded": 11060, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/Korabbit/Llama-2-7b-chat-hf-afr-200step-merged/results_2023-12-02T13-52-27.757521.json b/eval-results/Korabbit/Llama-2-7b-chat-hf-afr-200step-merged/results_2023-12-02T13-52-27.757521.json new file mode 100644 index 0000000000000000000000000000000000000000..6c92b57bbea8f742d3ea177116279ecdc7e7a427 --- /dev/null +++ b/eval-results/Korabbit/Llama-2-7b-chat-hf-afr-200step-merged/results_2023-12-02T13-52-27.757521.json @@ -0,0 +1,63 @@ +{ + "config_general": { + "lighteval_sha": "b35d4d84573be82d91c07ea46260f262f72cf69d", + 
"num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "", + "start_time": 1402717.147816929, + "end_time": 1405048.817950383, + "total_evaluation_time_secondes": "2331.6701334540267", + "model_name": "Korabbit/Llama-2-7b-chat-hf-afr-200step-merged", + "model_sha": "858de1c14854e55d5141b8d1b3954b335044669e", + "model_dtype": "torch.float16", + "model_size": "12.61 GB" + }, + "results": { + "harness|gsm8k|5": { + "acc": 0.18953752843062927, + "acc_stderr": 0.010795837931896377 + }, + "all": { + "acc": 0.18953752843062927, + "acc_stderr": 0.010795837931896377 + } + }, + "versions": { + "all": 0, + "harness|gsm8k|5": 0 + }, + "config_tasks": { + "harness|gsm8k": "LM Harness task" + }, + "summary_tasks": { + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "bda342e47b5099b2", + "hash_cont_tokens": "3ef477b77b267bc2" + }, + "truncated": 0, + "non_truncated": 1319, + "padded": 0, + "non_padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "18b756b7813d1bdf", + "hash_full_prompts": "deb3b1dff10b95aa", + "hash_input_tokens": "42036645de5ac59d", + "hash_cont_tokens": "8180b5409044e302" + }, + "truncated": 0, + "non_truncated": 1319, + "padded": 0, + "non_padded": 1319, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/Korabbit/Llama-2-7b-chat-hf-afr-200step-v2/results_2023-11-23T18-39-46.756166.json b/eval-results/Korabbit/Llama-2-7b-chat-hf-afr-200step-v2/results_2023-11-23T18-39-46.756166.json new file mode 100644 index 0000000000000000000000000000000000000000..040457f547ba9bdba30b6f8961f5b04ff83a582c --- /dev/null +++ b/eval-results/Korabbit/Llama-2-7b-chat-hf-afr-200step-v2/results_2023-11-23T18-39-46.756166.json @@ -0,0 +1,1435 @@ +{ + "config_general": { + "lighteval_sha": "9ffc410f6c40b8cfefe7167cb47aefe69ced61e1", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "", + "start_time": 238684.442555479, + "end_time": 251691.766869398, + "total_evaluation_time_secondes": "13007.324313919002", + "model_name": "Korabbit/Llama-2-7b-chat-hf-afr-200step-v2", + "model_sha": "a3575a542e1dc3db4a7794b8f36b104c93b39875", + "model_dtype": "torch.float16", + "model_size": "12.61 GB" + }, + "results": { + "harness|arc:challenge|25": { + "acc": 0.4880546075085324, + "acc_stderr": 0.014607220340597171, + "acc_norm": 0.5179180887372014, + "acc_norm_stderr": 0.014602005585490978 + }, + "harness|hellaswag|10": { + "acc": 0.5889265086636128, + "acc_stderr": 0.004910229643262741, + "acc_norm": 0.7741485759808803, + "acc_norm_stderr": 0.004172872282984212 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.27, + "acc_stderr": 0.04461960433384741, + "acc_norm": 0.27, + "acc_norm_stderr": 0.04461960433384741 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.42962962962962964, + "acc_stderr": 0.04276349494376599, + "acc_norm": 0.42962962962962964, + "acc_norm_stderr": 0.04276349494376599 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.4605263157894737, + "acc_stderr": 0.04056242252249034, + "acc_norm": 0.4605263157894737, + "acc_norm_stderr": 0.04056242252249034 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.54, + "acc_stderr": 0.05009082659620333, + "acc_norm": 0.54, + "acc_norm_stderr": 0.05009082659620333 + }, + 
"harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.539622641509434, + "acc_stderr": 0.030676096599389184, + "acc_norm": 0.539622641509434, + "acc_norm_stderr": 0.030676096599389184 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.5208333333333334, + "acc_stderr": 0.041775789507399935, + "acc_norm": 0.5208333333333334, + "acc_norm_stderr": 0.041775789507399935 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.28, + "acc_stderr": 0.04512608598542127, + "acc_norm": 0.28, + "acc_norm_stderr": 0.04512608598542127 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.41, + "acc_stderr": 0.049431107042371025, + "acc_norm": 0.41, + "acc_norm_stderr": 0.049431107042371025 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.35, + "acc_stderr": 0.047937248544110196, + "acc_norm": 0.35, + "acc_norm_stderr": 0.047937248544110196 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.4046242774566474, + "acc_stderr": 0.03742461193887248, + "acc_norm": 0.4046242774566474, + "acc_norm_stderr": 0.03742461193887248 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.22549019607843138, + "acc_stderr": 0.041583075330832865, + "acc_norm": 0.22549019607843138, + "acc_norm_stderr": 0.041583075330832865 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.58, + "acc_stderr": 0.049604496374885836, + "acc_norm": 0.58, + "acc_norm_stderr": 0.049604496374885836 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.40425531914893614, + "acc_stderr": 0.032081157507886836, + "acc_norm": 0.40425531914893614, + "acc_norm_stderr": 0.032081157507886836 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.3684210526315789, + "acc_stderr": 0.04537815354939392, + "acc_norm": 0.3684210526315789, + "acc_norm_stderr": 0.04537815354939392 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.5103448275862069, + "acc_stderr": 0.041657747757287644, + "acc_norm": 0.5103448275862069, + "acc_norm_stderr": 0.041657747757287644 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.30423280423280424, + "acc_stderr": 0.023695415009463087, + "acc_norm": 0.30423280423280424, + "acc_norm_stderr": 0.023695415009463087 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.24603174603174602, + "acc_stderr": 0.03852273364924314, + "acc_norm": 0.24603174603174602, + "acc_norm_stderr": 0.03852273364924314 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.38, + "acc_stderr": 0.04878317312145633, + "acc_norm": 0.38, + "acc_norm_stderr": 0.04878317312145633 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.5290322580645161, + "acc_stderr": 0.028396016402761005, + "acc_norm": 0.5290322580645161, + "acc_norm_stderr": 0.028396016402761005 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.3793103448275862, + "acc_stderr": 0.03413963805906235, + "acc_norm": 0.3793103448275862, + "acc_norm_stderr": 0.03413963805906235 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.43, + "acc_stderr": 0.049756985195624284, + "acc_norm": 0.43, + "acc_norm_stderr": 0.049756985195624284 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.593939393939394, + "acc_stderr": 0.03834816355401181, + "acc_norm": 0.593939393939394, + "acc_norm_stderr": 0.03834816355401181 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.601010101010101, + "acc_stderr": 0.034889016168527326, + "acc_norm": 0.601010101010101, + 
"acc_norm_stderr": 0.034889016168527326 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.7150259067357513, + "acc_stderr": 0.032577140777096614, + "acc_norm": 0.7150259067357513, + "acc_norm_stderr": 0.032577140777096614 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.4358974358974359, + "acc_stderr": 0.02514180151117749, + "acc_norm": 0.4358974358974359, + "acc_norm_stderr": 0.02514180151117749 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.26666666666666666, + "acc_stderr": 0.026962424325073835, + "acc_norm": 0.26666666666666666, + "acc_norm_stderr": 0.026962424325073835 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.42016806722689076, + "acc_stderr": 0.03206183783236152, + "acc_norm": 0.42016806722689076, + "acc_norm_stderr": 0.03206183783236152 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.2913907284768212, + "acc_stderr": 0.03710185726119995, + "acc_norm": 0.2913907284768212, + "acc_norm_stderr": 0.03710185726119995 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.6770642201834862, + "acc_stderr": 0.02004811592341532, + "acc_norm": 0.6770642201834862, + "acc_norm_stderr": 0.02004811592341532 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.3287037037037037, + "acc_stderr": 0.032036140846700596, + "acc_norm": 0.3287037037037037, + "acc_norm_stderr": 0.032036140846700596 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.6568627450980392, + "acc_stderr": 0.033321399446680854, + "acc_norm": 0.6568627450980392, + "acc_norm_stderr": 0.033321399446680854 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.6666666666666666, + "acc_stderr": 0.03068582059661079, + "acc_norm": 0.6666666666666666, + "acc_norm_stderr": 0.03068582059661079 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.5739910313901345, + "acc_stderr": 0.03318833286217281, + "acc_norm": 0.5739910313901345, + "acc_norm_stderr": 0.03318833286217281 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.5648854961832062, + "acc_stderr": 0.04348208051644858, + "acc_norm": 0.5648854961832062, + "acc_norm_stderr": 0.04348208051644858 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.6363636363636364, + "acc_stderr": 0.043913262867240704, + "acc_norm": 0.6363636363636364, + "acc_norm_stderr": 0.043913262867240704 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.5925925925925926, + "acc_stderr": 0.04750077341199984, + "acc_norm": 0.5925925925925926, + "acc_norm_stderr": 0.04750077341199984 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.558282208588957, + "acc_stderr": 0.03901591825836184, + "acc_norm": 0.558282208588957, + "acc_norm_stderr": 0.03901591825836184 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.3125, + "acc_stderr": 0.043994650575715215, + "acc_norm": 0.3125, + "acc_norm_stderr": 0.043994650575715215 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.6893203883495146, + "acc_stderr": 0.04582124160161551, + "acc_norm": 0.6893203883495146, + "acc_norm_stderr": 0.04582124160161551 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.7136752136752137, + "acc_stderr": 0.02961432369045665, + "acc_norm": 0.7136752136752137, + "acc_norm_stderr": 0.02961432369045665 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.49, + "acc_stderr": 0.05024183937956911, + "acc_norm": 0.49, + "acc_norm_stderr": 0.05024183937956911 + }, 
+ "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.6819923371647509, + "acc_stderr": 0.01665348627561539, + "acc_norm": 0.6819923371647509, + "acc_norm_stderr": 0.01665348627561539 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.5173410404624278, + "acc_stderr": 0.02690290045866664, + "acc_norm": 0.5173410404624278, + "acc_norm_stderr": 0.02690290045866664 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.2223463687150838, + "acc_stderr": 0.013907189208156881, + "acc_norm": 0.2223463687150838, + "acc_norm_stderr": 0.013907189208156881 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.5130718954248366, + "acc_stderr": 0.028620130800700246, + "acc_norm": 0.5130718954248366, + "acc_norm_stderr": 0.028620130800700246 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.5755627009646302, + "acc_stderr": 0.028071928247946215, + "acc_norm": 0.5755627009646302, + "acc_norm_stderr": 0.028071928247946215 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.5709876543209876, + "acc_stderr": 0.027538925613470863, + "acc_norm": 0.5709876543209876, + "acc_norm_stderr": 0.027538925613470863 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.36524822695035464, + "acc_stderr": 0.02872386385328128, + "acc_norm": 0.36524822695035464, + "acc_norm_stderr": 0.02872386385328128 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.34876140808344197, + "acc_stderr": 0.01217203515712712, + "acc_norm": 0.34876140808344197, + "acc_norm_stderr": 0.01217203515712712 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.45588235294117646, + "acc_stderr": 0.030254372573976684, + "acc_norm": 0.45588235294117646, + "acc_norm_stderr": 0.030254372573976684 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.4934640522875817, + "acc_stderr": 0.020226106567657807, + "acc_norm": 0.4934640522875817, + "acc_norm_stderr": 0.020226106567657807 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.5363636363636364, + "acc_stderr": 0.04776449162396197, + "acc_norm": 0.5363636363636364, + "acc_norm_stderr": 0.04776449162396197 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.5224489795918368, + "acc_stderr": 0.031976941187136725, + "acc_norm": 0.5224489795918368, + "acc_norm_stderr": 0.031976941187136725 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.6467661691542289, + "acc_stderr": 0.03379790611796777, + "acc_norm": 0.6467661691542289, + "acc_norm_stderr": 0.03379790611796777 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.73, + "acc_stderr": 0.044619604333847394, + "acc_norm": 0.73, + "acc_norm_stderr": 0.044619604333847394 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.42771084337349397, + "acc_stderr": 0.038515976837185335, + "acc_norm": 0.42771084337349397, + "acc_norm_stderr": 0.038515976837185335 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.7192982456140351, + "acc_stderr": 0.03446296217088427, + "acc_norm": 0.7192982456140351, + "acc_norm_stderr": 0.03446296217088427 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.29008567931456547, + "mc1_stderr": 0.01588623687420952, + "mc2": 0.43685304669032105, + "mc2_stderr": 0.015582536589566296 + }, + "harness|winogrande|5": { + "acc": 0.7190213101815311, + "acc_stderr": 0.012632541095875824 + }, + "harness|drop|3": { + "em": 0.02160234899328859, + "em_stderr": 0.0014888393578850528, + "f1": 0.08137164429530211, + "f1_stderr": 0.0020119444875776374 + }, + "harness|gsm8k|5": { + "acc": 
0.07884761182714177, + "acc_stderr": 0.007423390519873241 + }, + "all": { + "acc": 0.4844173262075713, + "acc_stderr": 0.03422515121320321, + "acc_norm": 0.49096056822371376, + "acc_norm_stderr": 0.03503280881820784, + "mc1": 0.29008567931456547, + "mc1_stderr": 0.01588623687420952, + "mc2": 0.43685304669032105, + "mc2_stderr": 0.015582536589566296, + "em": 0.02160234899328859, + "em_stderr": 0.0014888393578850528, + "f1": 0.08137164429530211, + "f1_stderr": 0.0020119444875776374 + } + }, + "versions": { + "all": 0, + "harness|arc:challenge|25": 0, + "harness|drop|3": 1, + "harness|gsm8k|5": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "harness|winogrande|5": 0 
+ }, + "config_tasks": { + "harness|arc:challenge": "LM Harness task", + "harness|drop": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + 
"harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|arc:challenge|25": { + "hashes": { + "hash_examples": "17b0cae357c0259e", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "ca48d52265c0051f", + "hash_cont_tokens": "e8abf848493b50f7" + }, + "truncated": 0, + "non_truncated": 1172, + "padded": 4687, + "non_padded": 0, + "effective_few_shots": 25.0, + "num_truncated_few_shots": 0 + }, + "harness|hellaswag|10": { + "hashes": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "4975ded0ed31f702", + "hash_cont_tokens": "9fe0a5c42e1532db" + }, + "truncated": 0, + "non_truncated": 10042, + "padded": 40019, + "non_padded": 149, + "effective_few_shots": 10.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hashes": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "8ff523ec326d5d55", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-anatomy|5": { + "hashes": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "742bd6a389a8ef40", + "hash_cont_tokens": "f11971a765cb609f" + }, + "truncated": 0, + "non_truncated": 135, + "padded": 540, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-astronomy|5": { + "hashes": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "aa9743839c83bd9f", + "hash_cont_tokens": "440a970fadecdc7b" + }, + "truncated": 0, + "non_truncated": 152, + "padded": 608, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-business_ethics|5": { + "hashes": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "60f6ed52e2a2987a", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hashes": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "6080d9f3c5930be0", + "hash_cont_tokens": "7ecd60c25b9bfe5b" + }, + "truncated": 0, + "non_truncated": 265, + "padded": 1060, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_biology|5": { + "hashes": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "873319724ad65589", + "hash_cont_tokens": "875cde3af7a0ee14" + }, + "truncated": 0, + "non_truncated": 144, + "padded": 564, + "non_padded": 12, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_chemistry|5": { + "hashes": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + 
"hash_input_tokens": "8366d04d12b154a7", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_computer_science|5": { + "hashes": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "1724a282fb269fd7", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_mathematics|5": { + "hashes": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "b7aa815781eae172", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_medicine|5": { + "hashes": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "0003d13e86bc8c1a", + "hash_cont_tokens": "702fb6d82ff0d6ac" + }, + "truncated": 0, + "non_truncated": 173, + "padded": 692, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_physics|5": { + "hashes": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "32b28762dd077c78", + "hash_cont_tokens": "f7b8097afc16a47c" + }, + "truncated": 0, + "non_truncated": 102, + "padded": 404, + "non_padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-computer_security|5": { + "hashes": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "19dd0e1895125d49", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hashes": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "761c7ce187b3338a", + "hash_cont_tokens": "aa0e8bc655f2f641" + }, + "truncated": 0, + "non_truncated": 235, + "padded": 940, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-econometrics|5": { + "hashes": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "dae74024ebc12b2b", + "hash_cont_tokens": "b1cc6e7e9fcd3827" + }, + "truncated": 0, + "non_truncated": 114, + "padded": 456, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hashes": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "5fa8050688a246ed", + "hash_cont_tokens": "2425a3f084a591ef" + }, + "truncated": 0, + "non_truncated": 145, + "padded": 580, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hashes": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "2da3f8d7d1515cc6", + "hash_cont_tokens": "bd87bf0c060fd925" + }, + "truncated": 0, + "non_truncated": 378, + "padded": 
1512, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-formal_logic|5": { + "hashes": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "907de61bbe46dada", + "hash_cont_tokens": "eb8932890e0605db" + }, + "truncated": 0, + "non_truncated": 126, + "padded": 504, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-global_facts|5": { + "hashes": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "d7549fe9ac133643", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_biology|5": { + "hashes": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "b449ae8cd622fb96", + "hash_cont_tokens": "1ddcb86d28cde266" + }, + "truncated": 0, + "non_truncated": 310, + "padded": 1240, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hashes": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "a447bd1574b5e26c", + "hash_cont_tokens": "176c8dcff38c5f8f" + }, + "truncated": 0, + "non_truncated": 203, + "padded": 812, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hashes": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "56312a0c3d85ae90", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hashes": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "5002f4ac8b1562ca", + "hash_cont_tokens": "674fc454bdc5ac93" + }, + "truncated": 0, + "non_truncated": 165, + "padded": 656, + "non_padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_geography|5": { + "hashes": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "b4f9efd054b0149d", + "hash_cont_tokens": "03a5012b916274ea" + }, + "truncated": 0, + "non_truncated": 198, + "padded": 792, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hashes": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "6e010d01707b5a01", + "hash_cont_tokens": "873d2aab226ba1d8" + }, + "truncated": 0, + "non_truncated": 193, + "padded": 772, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hashes": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "fc1f6e824ba386d7", + "hash_cont_tokens": "c583432ad27fcfe0" + }, + "truncated": 0, + "non_truncated": 390, + "padded": 1560, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + 
"harness|hendrycksTest-high_school_mathematics|5": { + "hashes": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "3a485a40c8432ece", + "hash_cont_tokens": "d7907b61bcb8c123" + }, + "truncated": 0, + "non_truncated": 270, + "padded": 1080, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hashes": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "a7dd9ca4bbda3752", + "hash_cont_tokens": "f47f041de50333b9" + }, + "truncated": 0, + "non_truncated": 238, + "padded": 952, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_physics|5": { + "hashes": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "d7ea631399a73865", + "hash_cont_tokens": "0d56317b3e5eedb5" + }, + "truncated": 0, + "non_truncated": 151, + "padded": 604, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hashes": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "d12816cf88146011", + "hash_cont_tokens": "09ba1243e7390c0f" + }, + "truncated": 0, + "non_truncated": 545, + "padded": 2180, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hashes": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "9763ecaef4814c21", + "hash_cont_tokens": "9cc29889c3d3f77d" + }, + "truncated": 0, + "non_truncated": 216, + "padded": 864, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hashes": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "c639cce12a46ebad", + "hash_cont_tokens": "cdd0b3dc06d933e5" + }, + "truncated": 0, + "non_truncated": 204, + "padded": 816, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hashes": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "b9762065cce6f3a6", + "hash_cont_tokens": "e02816433ff28daf" + }, + "truncated": 0, + "non_truncated": 237, + "padded": 948, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_aging|5": { + "hashes": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "84157fee0b6d0f3c", + "hash_cont_tokens": "142a4a8a1138a214" + }, + "truncated": 0, + "non_truncated": 223, + "padded": 892, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_sexuality|5": { + "hashes": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "ade303e1ae3c016f", + "hash_cont_tokens": "bc54813e809b796d" + }, + "truncated": 0, + "non_truncated": 131, + "padded": 524, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-international_law|5": { + "hashes": { + "hash_examples": "1300bfd0dfc59114", + 
"hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "e5482e1c23c23d35", + "hash_cont_tokens": "8ea8c5ff76a15bca" + }, + "truncated": 0, + "non_truncated": 121, + "padded": 484, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-jurisprudence|5": { + "hashes": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "4415eeb9bad0507b", + "hash_cont_tokens": "e3a8cd951b6e3469" + }, + "truncated": 0, + "non_truncated": 108, + "padded": 432, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hashes": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "e6b5271422ecbaa8", + "hash_cont_tokens": "3e9e0bdc248fd88a" + }, + "truncated": 0, + "non_truncated": 163, + "padded": 644, + "non_padded": 8, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-machine_learning|5": { + "hashes": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "e719cb83196977d8", + "hash_cont_tokens": "55b12fb138c6a064" + }, + "truncated": 0, + "non_truncated": 112, + "padded": 448, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-management|5": { + "hashes": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "155da0e62b39e804", + "hash_cont_tokens": "a01d6d39a83c4597" + }, + "truncated": 0, + "non_truncated": 103, + "padded": 412, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-marketing|5": { + "hashes": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "38466c242259e6d3", + "hash_cont_tokens": "6aeaed4d823c98aa" + }, + "truncated": 0, + "non_truncated": 234, + "padded": 932, + "non_padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-medical_genetics|5": { + "hashes": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "0dd129e92538a7f6", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-miscellaneous|5": { + "hashes": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "d108a883fc3e022f", + "hash_cont_tokens": "9b0ab02a64603081" + }, + "truncated": 0, + "non_truncated": 783, + "padded": 3132, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_disputes|5": { + "hashes": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "0e7b7df82884a2d5", + "hash_cont_tokens": "3b8bbe9108e55ce9" + }, + "truncated": 0, + "non_truncated": 346, + "padded": 1364, + "non_padded": 20, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hashes": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "7c220f5613cd8426", + "hash_cont_tokens": "3e9bfc0362e97330" + }, + "truncated": 0, + "non_truncated": 895, + 
"padded": 3580, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-nutrition|5": { + "hashes": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "35de1609a9a763a9", + "hash_cont_tokens": "23b2dc6ee2da4cfc" + }, + "truncated": 0, + "non_truncated": 306, + "padded": 1224, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-philosophy|5": { + "hashes": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "a1dcfa9c80490d06", + "hash_cont_tokens": "9f6ff69d23a48783" + }, + "truncated": 0, + "non_truncated": 311, + "padded": 1244, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-prehistory|5": { + "hashes": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "a091cf645d2415e0", + "hash_cont_tokens": "d6458d743d875837" + }, + "truncated": 0, + "non_truncated": 324, + "padded": 1296, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_accounting|5": { + "hashes": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "e9df32a33f85290c", + "hash_cont_tokens": "922a195f53a35662" + }, + "truncated": 0, + "non_truncated": 282, + "padded": 1128, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_law|5": { + "hashes": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "c9f7583fff66d361", + "hash_cont_tokens": "2e590029ef41fbcd" + }, + "truncated": 0, + "non_truncated": 1534, + "padded": 6136, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_medicine|5": { + "hashes": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "40a933f829116f8d", + "hash_cont_tokens": "7cfee54dbddd5a98" + }, + "truncated": 0, + "non_truncated": 272, + "padded": 1088, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_psychology|5": { + "hashes": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "0f6a92c3a2062b48", + "hash_cont_tokens": "a86677b2a45c20e1" + }, + "truncated": 0, + "non_truncated": 612, + "padded": 2448, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-public_relations|5": { + "hashes": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "29a08e9bfbe9b2f0", + "hash_cont_tokens": "0d756ccaae031757" + }, + "truncated": 0, + "non_truncated": 110, + "padded": 440, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-security_studies|5": { + "hashes": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "32a03f1f22a6e103", + "hash_cont_tokens": "b2229bc2cfbf594b" + }, + "truncated": 0, + "non_truncated": 245, + "padded": 980, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-sociology|5": { + "hashes": 
{ + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "1de5c52d2b2831d7", + "hash_cont_tokens": "c3a3bdfd177eed5b" + }, + "truncated": 0, + "non_truncated": 201, + "padded": 800, + "non_padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hashes": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "add924961f7f4146", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-virology|5": { + "hashes": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "e0653601c466b1bc", + "hash_cont_tokens": "af8b3658088cb37f" + }, + "truncated": 0, + "non_truncated": 166, + "padded": 664, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-world_religions|5": { + "hashes": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "ac600d612445156d", + "hash_cont_tokens": "060118bef6de4e0a" + }, + "truncated": 0, + "non_truncated": 171, + "padded": 684, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|truthfulqa:mc|0": { + "hashes": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "a03ce28b7fd06aa7", + "hash_cont_tokens": "f5da56a132aab151" + }, + "truncated": 0, + "non_truncated": 817, + "padded": 9996, + "non_padded": 0, + "effective_few_shots": 0.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "72067255e368e24e", + "hash_cont_tokens": "f08975ad6f2d5864" + }, + "truncated": 0, + "non_truncated": 1267, + "padded": 2534, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|drop|3": { + "hashes": { + "hash_examples": "1d27416e8324e9a3", + "hash_full_prompts": "a5513ff9a741b385", + "hash_input_tokens": "42076f0efbb50aa6", + "hash_cont_tokens": "4880a1343476a947" + }, + "truncated": 3, + "non_truncated": 9533, + "padded": 0, + "non_padded": 9536, + "effective_few_shots": 3.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "bda342e47b5099b2", + "hash_cont_tokens": "113915fd20710b57" + }, + "truncated": 0, + "non_truncated": 1319, + "padded": 0, + "non_padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "4eb459f19fc0f29d", + "hash_full_prompts": "21653ed56f202b4e", + "hash_input_tokens": "379266f3a5365f9d", + "hash_cont_tokens": "12f04fc982333044" + }, + "truncated": 3, + "non_truncated": 38192, + "padded": 113348, + "non_padded": 11060, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/Korabbit/Llama-2-7b-chat-hf-afr-300step-flan-v2/results_2023-12-06T16-40-21.068162.json b/eval-results/Korabbit/Llama-2-7b-chat-hf-afr-300step-flan-v2/results_2023-12-06T16-40-21.068162.json new file mode 100644 index 
0000000000000000000000000000000000000000..3233c76bfa41855d4be3ca6c90a6726b6ba975f2 --- /dev/null +++ b/eval-results/Korabbit/Llama-2-7b-chat-hf-afr-300step-flan-v2/results_2023-12-06T16-40-21.068162.json @@ -0,0 +1,1409 @@ +{ + "config_general": { + "lighteval_sha": "0e4607eff593f6f842aeaa0e5fa6760f58b9d1e9", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "", + "start_time": 328920.805046399, + "end_time": 335688.081162878, + "total_evaluation_time_secondes": "6767.276116479014", + "model_name": "Korabbit/Llama-2-7b-chat-hf-afr-300step-flan-v2", + "model_sha": "a2191bd90b04396016b7420dd14675916056f44a", + "model_dtype": "torch.float16", + "model_size": "12.61 GB" + }, + "results": { + "harness|arc:challenge|25": { + "acc": 0.49146757679180886, + "acc_stderr": 0.014609263165632191, + "acc_norm": 0.5255972696245734, + "acc_norm_stderr": 0.014592230885298964 + }, + "harness|hellaswag|10": { + "acc": 0.5911173073093009, + "acc_stderr": 0.004906227902850758, + "acc_norm": 0.7776339374626569, + "acc_norm_stderr": 0.004149859300604911 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.28, + "acc_stderr": 0.04512608598542129, + "acc_norm": 0.28, + "acc_norm_stderr": 0.04512608598542129 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.4222222222222222, + "acc_stderr": 0.04266763404099582, + "acc_norm": 0.4222222222222222, + "acc_norm_stderr": 0.04266763404099582 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.46710526315789475, + "acc_stderr": 0.040601270352363966, + "acc_norm": 0.46710526315789475, + "acc_norm_stderr": 0.040601270352363966 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.53, + "acc_stderr": 0.050161355804659205, + "acc_norm": 0.53, + "acc_norm_stderr": 0.050161355804659205 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.539622641509434, + "acc_stderr": 0.030676096599389184, + "acc_norm": 0.539622641509434, + "acc_norm_stderr": 0.030676096599389184 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.5277777777777778, + "acc_stderr": 0.04174752578923185, + "acc_norm": 0.5277777777777778, + "acc_norm_stderr": 0.04174752578923185 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.28, + "acc_stderr": 0.04512608598542127, + "acc_norm": 0.28, + "acc_norm_stderr": 0.04512608598542127 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.41, + "acc_stderr": 0.049431107042371025, + "acc_norm": 0.41, + "acc_norm_stderr": 0.049431107042371025 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.37, + "acc_stderr": 0.04852365870939098, + "acc_norm": 0.37, + "acc_norm_stderr": 0.04852365870939098 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.4046242774566474, + "acc_stderr": 0.03742461193887248, + "acc_norm": 0.4046242774566474, + "acc_norm_stderr": 0.03742461193887248 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.22549019607843138, + "acc_stderr": 0.041583075330832865, + "acc_norm": 0.22549019607843138, + "acc_norm_stderr": 0.041583075330832865 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.57, + "acc_stderr": 0.04975698519562428, + "acc_norm": 0.57, + "acc_norm_stderr": 0.04975698519562428 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.4127659574468085, + "acc_stderr": 0.03218471141400351, + "acc_norm": 0.4127659574468085, + "acc_norm_stderr": 0.03218471141400351 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 
0.35964912280701755, + "acc_stderr": 0.045144961328736334, + "acc_norm": 0.35964912280701755, + "acc_norm_stderr": 0.045144961328736334 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.5172413793103449, + "acc_stderr": 0.04164188720169375, + "acc_norm": 0.5172413793103449, + "acc_norm_stderr": 0.04164188720169375 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.29365079365079366, + "acc_stderr": 0.02345603738398203, + "acc_norm": 0.29365079365079366, + "acc_norm_stderr": 0.02345603738398203 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.23809523809523808, + "acc_stderr": 0.038095238095238126, + "acc_norm": 0.23809523809523808, + "acc_norm_stderr": 0.038095238095238126 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.38, + "acc_stderr": 0.04878317312145633, + "acc_norm": 0.38, + "acc_norm_stderr": 0.04878317312145633 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.5258064516129032, + "acc_stderr": 0.02840609505765332, + "acc_norm": 0.5258064516129032, + "acc_norm_stderr": 0.02840609505765332 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.3694581280788177, + "acc_stderr": 0.03395970381998573, + "acc_norm": 0.3694581280788177, + "acc_norm_stderr": 0.03395970381998573 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.43, + "acc_stderr": 0.049756985195624284, + "acc_norm": 0.43, + "acc_norm_stderr": 0.049756985195624284 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.6060606060606061, + "acc_stderr": 0.0381549430868893, + "acc_norm": 0.6060606060606061, + "acc_norm_stderr": 0.0381549430868893 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.601010101010101, + "acc_stderr": 0.03488901616852732, + "acc_norm": 0.601010101010101, + "acc_norm_stderr": 0.03488901616852732 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.7253886010362695, + "acc_stderr": 0.03221024508041153, + "acc_norm": 0.7253886010362695, + "acc_norm_stderr": 0.03221024508041153 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.4282051282051282, + "acc_stderr": 0.025088301454694834, + "acc_norm": 0.4282051282051282, + "acc_norm_stderr": 0.025088301454694834 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.25555555555555554, + "acc_stderr": 0.026593939101844082, + "acc_norm": 0.25555555555555554, + "acc_norm_stderr": 0.026593939101844082 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.42436974789915966, + "acc_stderr": 0.03210479051015776, + "acc_norm": 0.42436974789915966, + "acc_norm_stderr": 0.03210479051015776 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.2980132450331126, + "acc_stderr": 0.037345356767871984, + "acc_norm": 0.2980132450331126, + "acc_norm_stderr": 0.037345356767871984 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.671559633027523, + "acc_stderr": 0.02013590279729841, + "acc_norm": 0.671559633027523, + "acc_norm_stderr": 0.02013590279729841 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.3287037037037037, + "acc_stderr": 0.032036140846700596, + "acc_norm": 0.3287037037037037, + "acc_norm_stderr": 0.032036140846700596 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.6666666666666666, + "acc_stderr": 0.033086111132364336, + "acc_norm": 0.6666666666666666, + "acc_norm_stderr": 0.033086111132364336 + }, + 
"harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.6624472573839663, + "acc_stderr": 0.030781549102026226, + "acc_norm": 0.6624472573839663, + "acc_norm_stderr": 0.030781549102026226 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.5650224215246636, + "acc_stderr": 0.033272833702713445, + "acc_norm": 0.5650224215246636, + "acc_norm_stderr": 0.033272833702713445 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.5648854961832062, + "acc_stderr": 0.04348208051644858, + "acc_norm": 0.5648854961832062, + "acc_norm_stderr": 0.04348208051644858 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.628099173553719, + "acc_stderr": 0.04412015806624504, + "acc_norm": 0.628099173553719, + "acc_norm_stderr": 0.04412015806624504 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.6018518518518519, + "acc_stderr": 0.04732332615978813, + "acc_norm": 0.6018518518518519, + "acc_norm_stderr": 0.04732332615978813 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.5644171779141104, + "acc_stderr": 0.03895632464138937, + "acc_norm": 0.5644171779141104, + "acc_norm_stderr": 0.03895632464138937 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.32142857142857145, + "acc_stderr": 0.044328040552915185, + "acc_norm": 0.32142857142857145, + "acc_norm_stderr": 0.044328040552915185 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.6796116504854369, + "acc_stderr": 0.04620284082280041, + "acc_norm": 0.6796116504854369, + "acc_norm_stderr": 0.04620284082280041 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.7222222222222222, + "acc_stderr": 0.02934311479809446, + "acc_norm": 0.7222222222222222, + "acc_norm_stderr": 0.02934311479809446 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.47, + "acc_stderr": 0.050161355804659205, + "acc_norm": 0.47, + "acc_norm_stderr": 0.050161355804659205 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.6768837803320562, + "acc_stderr": 0.016723726512343048, + "acc_norm": 0.6768837803320562, + "acc_norm_stderr": 0.016723726512343048 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.5144508670520231, + "acc_stderr": 0.026907849856282542, + "acc_norm": 0.5144508670520231, + "acc_norm_stderr": 0.026907849856282542 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.22681564245810057, + "acc_stderr": 0.014005843570897899, + "acc_norm": 0.22681564245810057, + "acc_norm_stderr": 0.014005843570897899 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.5098039215686274, + "acc_stderr": 0.028624412550167958, + "acc_norm": 0.5098039215686274, + "acc_norm_stderr": 0.028624412550167958 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.572347266881029, + "acc_stderr": 0.028099240775809553, + "acc_norm": 0.572347266881029, + "acc_norm_stderr": 0.028099240775809553 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.5679012345679012, + "acc_stderr": 0.02756301097160668, + "acc_norm": 0.5679012345679012, + "acc_norm_stderr": 0.02756301097160668 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.36879432624113473, + "acc_stderr": 0.02878222756134724, + "acc_norm": 0.36879432624113473, + "acc_norm_stderr": 0.02878222756134724 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.3500651890482399, + "acc_stderr": 0.012182552313215172, + "acc_norm": 0.3500651890482399, + "acc_norm_stderr": 0.012182552313215172 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.4742647058823529, + "acc_stderr": 
0.03033257809455504, + "acc_norm": 0.4742647058823529, + "acc_norm_stderr": 0.03033257809455504 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.4934640522875817, + "acc_stderr": 0.020226106567657807, + "acc_norm": 0.4934640522875817, + "acc_norm_stderr": 0.020226106567657807 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.5363636363636364, + "acc_stderr": 0.04776449162396197, + "acc_norm": 0.5363636363636364, + "acc_norm_stderr": 0.04776449162396197 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.5102040816326531, + "acc_stderr": 0.03200255347893782, + "acc_norm": 0.5102040816326531, + "acc_norm_stderr": 0.03200255347893782 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.6467661691542289, + "acc_stderr": 0.03379790611796777, + "acc_norm": 0.6467661691542289, + "acc_norm_stderr": 0.03379790611796777 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.72, + "acc_stderr": 0.045126085985421276, + "acc_norm": 0.72, + "acc_norm_stderr": 0.045126085985421276 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.42168674698795183, + "acc_stderr": 0.03844453181770917, + "acc_norm": 0.42168674698795183, + "acc_norm_stderr": 0.03844453181770917 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.7192982456140351, + "acc_stderr": 0.03446296217088427, + "acc_norm": 0.7192982456140351, + "acc_norm_stderr": 0.03446296217088427 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.29865361077111385, + "mc1_stderr": 0.016021570613768545, + "mc2": 0.45138129313940284, + "mc2_stderr": 0.015562220951147801 + }, + "harness|winogrande|5": { + "acc": 0.7253354380426204, + "acc_stderr": 0.012544516005117187 + }, + "harness|gsm8k|5": { + "acc": 0.17968157695223655, + "acc_stderr": 0.01057511996424224 + }, + "all": { + "acc": 0.4858318036904494, + "acc_stderr": 0.03428773546743271, + "acc_norm": 0.4907011751374352, + "acc_norm_stderr": 0.03504506485866877, + "mc1": 0.29865361077111385, + "mc1_stderr": 0.016021570613768545, + "mc2": 0.45138129313940284, + "mc2_stderr": 0.015562220951147801 + } + }, + "versions": { + "all": 0, + "harness|arc:challenge|25": 0, + "harness|gsm8k|5": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + 
"harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "harness|winogrande|5": 0 + }, + "config_tasks": { + "harness|arc:challenge": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + 
"harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|arc:challenge|25": { + "hashes": { + "hash_examples": "17b0cae357c0259e", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "ca48d52265c0051f", + "hash_cont_tokens": "e8abf848493b50f7" + }, + "truncated": 0, + "non_truncated": 1172, + "padded": 4687, + "non_padded": 0, + "effective_few_shots": 25.0, + "num_truncated_few_shots": 0 + }, + "harness|hellaswag|10": { + "hashes": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "4975ded0ed31f702", + "hash_cont_tokens": "9fe0a5c42e1532db" + }, + "truncated": 0, + "non_truncated": 10042, + "padded": 40019, + "non_padded": 149, + "effective_few_shots": 10.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hashes": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "8ff523ec326d5d55", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-anatomy|5": { + "hashes": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "742bd6a389a8ef40", + "hash_cont_tokens": "f11971a765cb609f" + }, + "truncated": 0, + "non_truncated": 135, + "padded": 540, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-astronomy|5": { + "hashes": { + "hash_examples": 
"7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "aa9743839c83bd9f", + "hash_cont_tokens": "440a970fadecdc7b" + }, + "truncated": 0, + "non_truncated": 152, + "padded": 608, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-business_ethics|5": { + "hashes": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "60f6ed52e2a2987a", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hashes": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "6080d9f3c5930be0", + "hash_cont_tokens": "7ecd60c25b9bfe5b" + }, + "truncated": 0, + "non_truncated": 265, + "padded": 1060, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_biology|5": { + "hashes": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "873319724ad65589", + "hash_cont_tokens": "875cde3af7a0ee14" + }, + "truncated": 0, + "non_truncated": 144, + "padded": 564, + "non_padded": 12, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_chemistry|5": { + "hashes": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "8366d04d12b154a7", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_computer_science|5": { + "hashes": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "1724a282fb269fd7", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_mathematics|5": { + "hashes": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "b7aa815781eae172", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_medicine|5": { + "hashes": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "0003d13e86bc8c1a", + "hash_cont_tokens": "702fb6d82ff0d6ac" + }, + "truncated": 0, + "non_truncated": 173, + "padded": 692, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_physics|5": { + "hashes": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "32b28762dd077c78", + "hash_cont_tokens": "f7b8097afc16a47c" + }, + "truncated": 0, + "non_truncated": 102, + "padded": 404, + "non_padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-computer_security|5": { + "hashes": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "19dd0e1895125d49", + "hash_cont_tokens": "50421e30bef398f9" 
+ }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hashes": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "761c7ce187b3338a", + "hash_cont_tokens": "aa0e8bc655f2f641" + }, + "truncated": 0, + "non_truncated": 235, + "padded": 940, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-econometrics|5": { + "hashes": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "dae74024ebc12b2b", + "hash_cont_tokens": "b1cc6e7e9fcd3827" + }, + "truncated": 0, + "non_truncated": 114, + "padded": 456, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hashes": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "5fa8050688a246ed", + "hash_cont_tokens": "2425a3f084a591ef" + }, + "truncated": 0, + "non_truncated": 145, + "padded": 580, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hashes": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "2da3f8d7d1515cc6", + "hash_cont_tokens": "bd87bf0c060fd925" + }, + "truncated": 0, + "non_truncated": 378, + "padded": 1512, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-formal_logic|5": { + "hashes": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "907de61bbe46dada", + "hash_cont_tokens": "eb8932890e0605db" + }, + "truncated": 0, + "non_truncated": 126, + "padded": 504, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-global_facts|5": { + "hashes": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "d7549fe9ac133643", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_biology|5": { + "hashes": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "b449ae8cd622fb96", + "hash_cont_tokens": "1ddcb86d28cde266" + }, + "truncated": 0, + "non_truncated": 310, + "padded": 1240, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hashes": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "a447bd1574b5e26c", + "hash_cont_tokens": "176c8dcff38c5f8f" + }, + "truncated": 0, + "non_truncated": 203, + "padded": 812, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hashes": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "56312a0c3d85ae90", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + 
"num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hashes": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "5002f4ac8b1562ca", + "hash_cont_tokens": "674fc454bdc5ac93" + }, + "truncated": 0, + "non_truncated": 165, + "padded": 656, + "non_padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_geography|5": { + "hashes": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "b4f9efd054b0149d", + "hash_cont_tokens": "03a5012b916274ea" + }, + "truncated": 0, + "non_truncated": 198, + "padded": 792, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hashes": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "6e010d01707b5a01", + "hash_cont_tokens": "873d2aab226ba1d8" + }, + "truncated": 0, + "non_truncated": 193, + "padded": 772, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hashes": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "fc1f6e824ba386d7", + "hash_cont_tokens": "c583432ad27fcfe0" + }, + "truncated": 0, + "non_truncated": 390, + "padded": 1560, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hashes": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "3a485a40c8432ece", + "hash_cont_tokens": "d7907b61bcb8c123" + }, + "truncated": 0, + "non_truncated": 270, + "padded": 1080, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hashes": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "a7dd9ca4bbda3752", + "hash_cont_tokens": "f47f041de50333b9" + }, + "truncated": 0, + "non_truncated": 238, + "padded": 952, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_physics|5": { + "hashes": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "d7ea631399a73865", + "hash_cont_tokens": "0d56317b3e5eedb5" + }, + "truncated": 0, + "non_truncated": 151, + "padded": 604, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hashes": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "d12816cf88146011", + "hash_cont_tokens": "09ba1243e7390c0f" + }, + "truncated": 0, + "non_truncated": 545, + "padded": 2180, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hashes": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "9763ecaef4814c21", + "hash_cont_tokens": "9cc29889c3d3f77d" + }, + "truncated": 0, + "non_truncated": 216, + "padded": 864, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + 
"harness|hendrycksTest-high_school_us_history|5": { + "hashes": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "c639cce12a46ebad", + "hash_cont_tokens": "cdd0b3dc06d933e5" + }, + "truncated": 0, + "non_truncated": 204, + "padded": 816, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hashes": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "b9762065cce6f3a6", + "hash_cont_tokens": "e02816433ff28daf" + }, + "truncated": 0, + "non_truncated": 237, + "padded": 948, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_aging|5": { + "hashes": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "84157fee0b6d0f3c", + "hash_cont_tokens": "142a4a8a1138a214" + }, + "truncated": 0, + "non_truncated": 223, + "padded": 892, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_sexuality|5": { + "hashes": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "ade303e1ae3c016f", + "hash_cont_tokens": "bc54813e809b796d" + }, + "truncated": 0, + "non_truncated": 131, + "padded": 524, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-international_law|5": { + "hashes": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "e5482e1c23c23d35", + "hash_cont_tokens": "8ea8c5ff76a15bca" + }, + "truncated": 0, + "non_truncated": 121, + "padded": 484, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-jurisprudence|5": { + "hashes": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "4415eeb9bad0507b", + "hash_cont_tokens": "e3a8cd951b6e3469" + }, + "truncated": 0, + "non_truncated": 108, + "padded": 432, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hashes": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "e6b5271422ecbaa8", + "hash_cont_tokens": "3e9e0bdc248fd88a" + }, + "truncated": 0, + "non_truncated": 163, + "padded": 644, + "non_padded": 8, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-machine_learning|5": { + "hashes": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "e719cb83196977d8", + "hash_cont_tokens": "55b12fb138c6a064" + }, + "truncated": 0, + "non_truncated": 112, + "padded": 448, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-management|5": { + "hashes": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "155da0e62b39e804", + "hash_cont_tokens": "a01d6d39a83c4597" + }, + "truncated": 0, + "non_truncated": 103, + "padded": 412, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-marketing|5": { + "hashes": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": 
"38466c242259e6d3", + "hash_cont_tokens": "6aeaed4d823c98aa" + }, + "truncated": 0, + "non_truncated": 234, + "padded": 932, + "non_padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-medical_genetics|5": { + "hashes": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "0dd129e92538a7f6", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-miscellaneous|5": { + "hashes": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "d108a883fc3e022f", + "hash_cont_tokens": "9b0ab02a64603081" + }, + "truncated": 0, + "non_truncated": 783, + "padded": 3132, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_disputes|5": { + "hashes": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "0e7b7df82884a2d5", + "hash_cont_tokens": "3b8bbe9108e55ce9" + }, + "truncated": 0, + "non_truncated": 346, + "padded": 1364, + "non_padded": 20, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hashes": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "7c220f5613cd8426", + "hash_cont_tokens": "3e9bfc0362e97330" + }, + "truncated": 0, + "non_truncated": 895, + "padded": 3580, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-nutrition|5": { + "hashes": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "35de1609a9a763a9", + "hash_cont_tokens": "23b2dc6ee2da4cfc" + }, + "truncated": 0, + "non_truncated": 306, + "padded": 1224, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-philosophy|5": { + "hashes": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "a1dcfa9c80490d06", + "hash_cont_tokens": "9f6ff69d23a48783" + }, + "truncated": 0, + "non_truncated": 311, + "padded": 1244, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-prehistory|5": { + "hashes": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "a091cf645d2415e0", + "hash_cont_tokens": "d6458d743d875837" + }, + "truncated": 0, + "non_truncated": 324, + "padded": 1296, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_accounting|5": { + "hashes": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "e9df32a33f85290c", + "hash_cont_tokens": "922a195f53a35662" + }, + "truncated": 0, + "non_truncated": 282, + "padded": 1128, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_law|5": { + "hashes": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "c9f7583fff66d361", + "hash_cont_tokens": "2e590029ef41fbcd" + }, + "truncated": 0, + "non_truncated": 1534, + "padded": 6136, + "non_padded": 0, + "effective_few_shots": 
5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_medicine|5": { + "hashes": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "40a933f829116f8d", + "hash_cont_tokens": "7cfee54dbddd5a98" + }, + "truncated": 0, + "non_truncated": 272, + "padded": 1088, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_psychology|5": { + "hashes": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "0f6a92c3a2062b48", + "hash_cont_tokens": "a86677b2a45c20e1" + }, + "truncated": 0, + "non_truncated": 612, + "padded": 2448, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-public_relations|5": { + "hashes": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "29a08e9bfbe9b2f0", + "hash_cont_tokens": "0d756ccaae031757" + }, + "truncated": 0, + "non_truncated": 110, + "padded": 440, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-security_studies|5": { + "hashes": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "32a03f1f22a6e103", + "hash_cont_tokens": "b2229bc2cfbf594b" + }, + "truncated": 0, + "non_truncated": 245, + "padded": 980, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-sociology|5": { + "hashes": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "1de5c52d2b2831d7", + "hash_cont_tokens": "c3a3bdfd177eed5b" + }, + "truncated": 0, + "non_truncated": 201, + "padded": 800, + "non_padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hashes": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "add924961f7f4146", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-virology|5": { + "hashes": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "e0653601c466b1bc", + "hash_cont_tokens": "af8b3658088cb37f" + }, + "truncated": 0, + "non_truncated": 166, + "padded": 664, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-world_religions|5": { + "hashes": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "ac600d612445156d", + "hash_cont_tokens": "060118bef6de4e0a" + }, + "truncated": 0, + "non_truncated": 171, + "padded": 684, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|truthfulqa:mc|0": { + "hashes": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "a03ce28b7fd06aa7", + "hash_cont_tokens": "f5da56a132aab151" + }, + "truncated": 0, + "non_truncated": 817, + "padded": 9996, + "non_padded": 0, + "effective_few_shots": 0.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + 
"hash_input_tokens": "72067255e368e24e", + "hash_cont_tokens": "f08975ad6f2d5864" + }, + "truncated": 0, + "non_truncated": 1267, + "padded": 2534, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "bda342e47b5099b2", + "hash_cont_tokens": "12b72d0fdfc5f8e1" + }, + "truncated": 0, + "non_truncated": 1319, + "padded": 0, + "non_padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "3b7fa57a057f9415", + "hash_full_prompts": "63615fc50fc9417c", + "hash_input_tokens": "a8fa53915153e1db", + "hash_cont_tokens": "33f3152983db0bc2" + }, + "truncated": 0, + "non_truncated": 28659, + "padded": 113348, + "non_padded": 1524, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/Korabbit/Llama-2-7b-chat-hf-afr-441step-flan-v2/results_2023-12-08T00-30-10.216270.json b/eval-results/Korabbit/Llama-2-7b-chat-hf-afr-441step-flan-v2/results_2023-12-08T00-30-10.216270.json new file mode 100644 index 0000000000000000000000000000000000000000..ea5baa61361cbd10f6359b9f2cf68fb6892645b6 --- /dev/null +++ b/eval-results/Korabbit/Llama-2-7b-chat-hf-afr-441step-flan-v2/results_2023-12-08T00-30-10.216270.json @@ -0,0 +1,1409 @@ +{ + "config_general": { + "lighteval_sha": "0e4607eff593f6f842aeaa0e5fa6760f58b9d1e9", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "", + "start_time": 443625.224644885, + "end_time": 450275.10773866, + "total_evaluation_time_secondes": "6649.883093775017", + "model_name": "Korabbit/Llama-2-7b-chat-hf-afr-441step-flan-v2", + "model_sha": "daede60607179be05b5d6e90b4c6777806b10fb8", + "model_dtype": "torch.float16", + "model_size": "12.61 GB" + }, + "results": { + "harness|arc:challenge|25": { + "acc": 0.4906143344709898, + "acc_stderr": 0.014608816322065, + "acc_norm": 0.5213310580204779, + "acc_norm_stderr": 0.014598087973127106 + }, + "harness|hellaswag|10": { + "acc": 0.5900219079864569, + "acc_stderr": 0.004908241354310213, + "acc_norm": 0.7763393746265684, + "acc_norm_stderr": 0.004158455808204937 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.28, + "acc_stderr": 0.04512608598542129, + "acc_norm": 0.28, + "acc_norm_stderr": 0.04512608598542129 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.4222222222222222, + "acc_stderr": 0.04266763404099582, + "acc_norm": 0.4222222222222222, + "acc_norm_stderr": 0.04266763404099582 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.46710526315789475, + "acc_stderr": 0.040601270352363966, + "acc_norm": 0.46710526315789475, + "acc_norm_stderr": 0.040601270352363966 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.53, + "acc_stderr": 0.050161355804659205, + "acc_norm": 0.53, + "acc_norm_stderr": 0.050161355804659205 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.539622641509434, + "acc_stderr": 0.030676096599389184, + "acc_norm": 0.539622641509434, + "acc_norm_stderr": 0.030676096599389184 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.5208333333333334, + "acc_stderr": 0.041775789507399935, + "acc_norm": 0.5208333333333334, + "acc_norm_stderr": 0.041775789507399935 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.28, + "acc_stderr": 0.04512608598542127, + "acc_norm": 0.28, + "acc_norm_stderr": 
0.04512608598542127 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.41, + "acc_stderr": 0.049431107042371025, + "acc_norm": 0.41, + "acc_norm_stderr": 0.049431107042371025 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.36, + "acc_stderr": 0.048241815132442176, + "acc_norm": 0.36, + "acc_norm_stderr": 0.048241815132442176 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.3988439306358382, + "acc_stderr": 0.037336266553835096, + "acc_norm": 0.3988439306358382, + "acc_norm_stderr": 0.037336266553835096 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.23529411764705882, + "acc_stderr": 0.04220773659171453, + "acc_norm": 0.23529411764705882, + "acc_norm_stderr": 0.04220773659171453 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.57, + "acc_stderr": 0.04975698519562428, + "acc_norm": 0.57, + "acc_norm_stderr": 0.04975698519562428 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.41702127659574467, + "acc_stderr": 0.03223276266711712, + "acc_norm": 0.41702127659574467, + "acc_norm_stderr": 0.03223276266711712 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.35964912280701755, + "acc_stderr": 0.045144961328736334, + "acc_norm": 0.35964912280701755, + "acc_norm_stderr": 0.045144961328736334 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.5241379310344828, + "acc_stderr": 0.041618085035015295, + "acc_norm": 0.5241379310344828, + "acc_norm_stderr": 0.041618085035015295 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.29365079365079366, + "acc_stderr": 0.02345603738398203, + "acc_norm": 0.29365079365079366, + "acc_norm_stderr": 0.02345603738398203 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.23809523809523808, + "acc_stderr": 0.038095238095238126, + "acc_norm": 0.23809523809523808, + "acc_norm_stderr": 0.038095238095238126 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.38, + "acc_stderr": 0.04878317312145633, + "acc_norm": 0.38, + "acc_norm_stderr": 0.04878317312145633 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.5290322580645161, + "acc_stderr": 0.028396016402761, + "acc_norm": 0.5290322580645161, + "acc_norm_stderr": 0.028396016402761 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.3694581280788177, + "acc_stderr": 0.03395970381998573, + "acc_norm": 0.3694581280788177, + "acc_norm_stderr": 0.03395970381998573 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.43, + "acc_stderr": 0.049756985195624284, + "acc_norm": 0.43, + "acc_norm_stderr": 0.049756985195624284 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.6060606060606061, + "acc_stderr": 0.0381549430868893, + "acc_norm": 0.6060606060606061, + "acc_norm_stderr": 0.0381549430868893 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.601010101010101, + "acc_stderr": 0.03488901616852732, + "acc_norm": 0.601010101010101, + "acc_norm_stderr": 0.03488901616852732 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.7253886010362695, + "acc_stderr": 0.03221024508041153, + "acc_norm": 0.7253886010362695, + "acc_norm_stderr": 0.03221024508041153 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.4256410256410256, + "acc_stderr": 0.02506909438729654, + "acc_norm": 0.4256410256410256, + "acc_norm_stderr": 0.02506909438729654 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 
0.25925925925925924, + "acc_stderr": 0.026719240783712173, + "acc_norm": 0.25925925925925924, + "acc_norm_stderr": 0.026719240783712173 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.42436974789915966, + "acc_stderr": 0.03210479051015776, + "acc_norm": 0.42436974789915966, + "acc_norm_stderr": 0.03210479051015776 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.2980132450331126, + "acc_stderr": 0.037345356767871984, + "acc_norm": 0.2980132450331126, + "acc_norm_stderr": 0.037345356767871984 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.671559633027523, + "acc_stderr": 0.02013590279729841, + "acc_norm": 0.671559633027523, + "acc_norm_stderr": 0.02013590279729841 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.3287037037037037, + "acc_stderr": 0.032036140846700596, + "acc_norm": 0.3287037037037037, + "acc_norm_stderr": 0.032036140846700596 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.6666666666666666, + "acc_stderr": 0.033086111132364336, + "acc_norm": 0.6666666666666666, + "acc_norm_stderr": 0.033086111132364336 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.6582278481012658, + "acc_stderr": 0.03087453753755362, + "acc_norm": 0.6582278481012658, + "acc_norm_stderr": 0.03087453753755362 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.5695067264573991, + "acc_stderr": 0.033231973029429394, + "acc_norm": 0.5695067264573991, + "acc_norm_stderr": 0.033231973029429394 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.5572519083969466, + "acc_stderr": 0.04356447202665069, + "acc_norm": 0.5572519083969466, + "acc_norm_stderr": 0.04356447202665069 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.628099173553719, + "acc_stderr": 0.04412015806624504, + "acc_norm": 0.628099173553719, + "acc_norm_stderr": 0.04412015806624504 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.6018518518518519, + "acc_stderr": 0.04732332615978813, + "acc_norm": 0.6018518518518519, + "acc_norm_stderr": 0.04732332615978813 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.5644171779141104, + "acc_stderr": 0.03895632464138937, + "acc_norm": 0.5644171779141104, + "acc_norm_stderr": 0.03895632464138937 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.32142857142857145, + "acc_stderr": 0.044328040552915185, + "acc_norm": 0.32142857142857145, + "acc_norm_stderr": 0.044328040552915185 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.6796116504854369, + "acc_stderr": 0.04620284082280041, + "acc_norm": 0.6796116504854369, + "acc_norm_stderr": 0.04620284082280041 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.7222222222222222, + "acc_stderr": 0.02934311479809446, + "acc_norm": 0.7222222222222222, + "acc_norm_stderr": 0.02934311479809446 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.47, + "acc_stderr": 0.050161355804659205, + "acc_norm": 0.47, + "acc_norm_stderr": 0.050161355804659205 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.6794380587484036, + "acc_stderr": 0.016688893310803764, + "acc_norm": 0.6794380587484036, + "acc_norm_stderr": 0.016688893310803764 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.5115606936416185, + "acc_stderr": 0.02691189868637793, + "acc_norm": 0.5115606936416185, + "acc_norm_stderr": 0.02691189868637793 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.23016759776536314, + "acc_stderr": 0.014078339253425817, 
+ "acc_norm": 0.23016759776536314, + "acc_norm_stderr": 0.014078339253425817 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.5130718954248366, + "acc_stderr": 0.028620130800700246, + "acc_norm": 0.5130718954248366, + "acc_norm_stderr": 0.028620130800700246 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.572347266881029, + "acc_stderr": 0.028099240775809553, + "acc_norm": 0.572347266881029, + "acc_norm_stderr": 0.028099240775809553 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.5648148148148148, + "acc_stderr": 0.0275860062216077, + "acc_norm": 0.5648148148148148, + "acc_norm_stderr": 0.0275860062216077 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.36879432624113473, + "acc_stderr": 0.02878222756134724, + "acc_norm": 0.36879432624113473, + "acc_norm_stderr": 0.02878222756134724 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.3500651890482399, + "acc_stderr": 0.012182552313215174, + "acc_norm": 0.3500651890482399, + "acc_norm_stderr": 0.012182552313215174 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.4742647058823529, + "acc_stderr": 0.03033257809455504, + "acc_norm": 0.4742647058823529, + "acc_norm_stderr": 0.03033257809455504 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.4934640522875817, + "acc_stderr": 0.020226106567657807, + "acc_norm": 0.4934640522875817, + "acc_norm_stderr": 0.020226106567657807 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.5363636363636364, + "acc_stderr": 0.04776449162396197, + "acc_norm": 0.5363636363636364, + "acc_norm_stderr": 0.04776449162396197 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.5142857142857142, + "acc_stderr": 0.03199615232806287, + "acc_norm": 0.5142857142857142, + "acc_norm_stderr": 0.03199615232806287 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.6467661691542289, + "acc_stderr": 0.03379790611796777, + "acc_norm": 0.6467661691542289, + "acc_norm_stderr": 0.03379790611796777 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.72, + "acc_stderr": 0.045126085985421276, + "acc_norm": 0.72, + "acc_norm_stderr": 0.045126085985421276 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.42168674698795183, + "acc_stderr": 0.03844453181770917, + "acc_norm": 0.42168674698795183, + "acc_norm_stderr": 0.03844453181770917 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.7251461988304093, + "acc_stderr": 0.034240429246915824, + "acc_norm": 0.7251461988304093, + "acc_norm_stderr": 0.034240429246915824 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.2974296205630355, + "mc1_stderr": 0.016002651487361002, + "mc2": 0.4502487926207983, + "mc2_stderr": 0.01555198529323624 + }, + "harness|winogrande|5": { + "acc": 0.7253354380426204, + "acc_stderr": 0.012544516005117187 + }, + "harness|gsm8k|5": { + "acc": 0.17816527672479152, + "acc_stderr": 0.01054013252754947 + }, + "all": { + "acc": 0.4859119672157952, + "acc_stderr": 0.03429323694683427, + "acc_norm": 0.49074853348450337, + "acc_norm_stderr": 0.03505156425981663, + "mc1": 0.2974296205630355, + "mc1_stderr": 0.016002651487361002, + "mc2": 0.4502487926207983, + "mc2_stderr": 0.01555198529323624 + } + }, + "versions": { + "all": 0, + "harness|arc:challenge|25": 0, + "harness|gsm8k|5": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + 
"harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "harness|winogrande|5": 0 + }, + "config_tasks": { + "harness|arc:challenge": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + 
"harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|arc:challenge|25": { + "hashes": { + "hash_examples": "17b0cae357c0259e", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "ca48d52265c0051f", + "hash_cont_tokens": "e8abf848493b50f7" + }, + "truncated": 0, + "non_truncated": 1172, + "padded": 4687, + "non_padded": 0, + "effective_few_shots": 25.0, + "num_truncated_few_shots": 0 + }, + "harness|hellaswag|10": { + "hashes": { + 
"hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "4975ded0ed31f702", + "hash_cont_tokens": "9fe0a5c42e1532db" + }, + "truncated": 0, + "non_truncated": 10042, + "padded": 40019, + "non_padded": 149, + "effective_few_shots": 10.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hashes": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "8ff523ec326d5d55", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-anatomy|5": { + "hashes": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "742bd6a389a8ef40", + "hash_cont_tokens": "f11971a765cb609f" + }, + "truncated": 0, + "non_truncated": 135, + "padded": 540, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-astronomy|5": { + "hashes": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "aa9743839c83bd9f", + "hash_cont_tokens": "440a970fadecdc7b" + }, + "truncated": 0, + "non_truncated": 152, + "padded": 608, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-business_ethics|5": { + "hashes": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "60f6ed52e2a2987a", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hashes": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "6080d9f3c5930be0", + "hash_cont_tokens": "7ecd60c25b9bfe5b" + }, + "truncated": 0, + "non_truncated": 265, + "padded": 1060, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_biology|5": { + "hashes": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "873319724ad65589", + "hash_cont_tokens": "875cde3af7a0ee14" + }, + "truncated": 0, + "non_truncated": 144, + "padded": 564, + "non_padded": 12, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_chemistry|5": { + "hashes": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "8366d04d12b154a7", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_computer_science|5": { + "hashes": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "1724a282fb269fd7", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_mathematics|5": { + "hashes": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "b7aa815781eae172", + "hash_cont_tokens": 
"50421e30bef398f9" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_medicine|5": { + "hashes": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "0003d13e86bc8c1a", + "hash_cont_tokens": "702fb6d82ff0d6ac" + }, + "truncated": 0, + "non_truncated": 173, + "padded": 692, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_physics|5": { + "hashes": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "32b28762dd077c78", + "hash_cont_tokens": "f7b8097afc16a47c" + }, + "truncated": 0, + "non_truncated": 102, + "padded": 404, + "non_padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-computer_security|5": { + "hashes": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "19dd0e1895125d49", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hashes": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "761c7ce187b3338a", + "hash_cont_tokens": "aa0e8bc655f2f641" + }, + "truncated": 0, + "non_truncated": 235, + "padded": 940, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-econometrics|5": { + "hashes": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "dae74024ebc12b2b", + "hash_cont_tokens": "b1cc6e7e9fcd3827" + }, + "truncated": 0, + "non_truncated": 114, + "padded": 456, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hashes": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "5fa8050688a246ed", + "hash_cont_tokens": "2425a3f084a591ef" + }, + "truncated": 0, + "non_truncated": 145, + "padded": 580, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hashes": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "2da3f8d7d1515cc6", + "hash_cont_tokens": "bd87bf0c060fd925" + }, + "truncated": 0, + "non_truncated": 378, + "padded": 1512, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-formal_logic|5": { + "hashes": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "907de61bbe46dada", + "hash_cont_tokens": "eb8932890e0605db" + }, + "truncated": 0, + "non_truncated": 126, + "padded": 504, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-global_facts|5": { + "hashes": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "d7549fe9ac133643", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + 
"num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_biology|5": { + "hashes": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "b449ae8cd622fb96", + "hash_cont_tokens": "1ddcb86d28cde266" + }, + "truncated": 0, + "non_truncated": 310, + "padded": 1240, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hashes": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "a447bd1574b5e26c", + "hash_cont_tokens": "176c8dcff38c5f8f" + }, + "truncated": 0, + "non_truncated": 203, + "padded": 812, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hashes": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "56312a0c3d85ae90", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hashes": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "5002f4ac8b1562ca", + "hash_cont_tokens": "674fc454bdc5ac93" + }, + "truncated": 0, + "non_truncated": 165, + "padded": 656, + "non_padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_geography|5": { + "hashes": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "b4f9efd054b0149d", + "hash_cont_tokens": "03a5012b916274ea" + }, + "truncated": 0, + "non_truncated": 198, + "padded": 792, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hashes": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "6e010d01707b5a01", + "hash_cont_tokens": "873d2aab226ba1d8" + }, + "truncated": 0, + "non_truncated": 193, + "padded": 772, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hashes": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "fc1f6e824ba386d7", + "hash_cont_tokens": "c583432ad27fcfe0" + }, + "truncated": 0, + "non_truncated": 390, + "padded": 1560, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hashes": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "3a485a40c8432ece", + "hash_cont_tokens": "d7907b61bcb8c123" + }, + "truncated": 0, + "non_truncated": 270, + "padded": 1080, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hashes": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "a7dd9ca4bbda3752", + "hash_cont_tokens": "f47f041de50333b9" + }, + "truncated": 0, + "non_truncated": 238, + "padded": 952, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + 
"harness|hendrycksTest-high_school_physics|5": { + "hashes": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "d7ea631399a73865", + "hash_cont_tokens": "0d56317b3e5eedb5" + }, + "truncated": 0, + "non_truncated": 151, + "padded": 604, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hashes": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "d12816cf88146011", + "hash_cont_tokens": "09ba1243e7390c0f" + }, + "truncated": 0, + "non_truncated": 545, + "padded": 2180, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hashes": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "9763ecaef4814c21", + "hash_cont_tokens": "9cc29889c3d3f77d" + }, + "truncated": 0, + "non_truncated": 216, + "padded": 864, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hashes": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "c639cce12a46ebad", + "hash_cont_tokens": "cdd0b3dc06d933e5" + }, + "truncated": 0, + "non_truncated": 204, + "padded": 816, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hashes": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "b9762065cce6f3a6", + "hash_cont_tokens": "e02816433ff28daf" + }, + "truncated": 0, + "non_truncated": 237, + "padded": 948, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_aging|5": { + "hashes": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "84157fee0b6d0f3c", + "hash_cont_tokens": "142a4a8a1138a214" + }, + "truncated": 0, + "non_truncated": 223, + "padded": 892, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_sexuality|5": { + "hashes": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "ade303e1ae3c016f", + "hash_cont_tokens": "bc54813e809b796d" + }, + "truncated": 0, + "non_truncated": 131, + "padded": 524, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-international_law|5": { + "hashes": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "e5482e1c23c23d35", + "hash_cont_tokens": "8ea8c5ff76a15bca" + }, + "truncated": 0, + "non_truncated": 121, + "padded": 484, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-jurisprudence|5": { + "hashes": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "4415eeb9bad0507b", + "hash_cont_tokens": "e3a8cd951b6e3469" + }, + "truncated": 0, + "non_truncated": 108, + "padded": 432, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hashes": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": 
"98a04b1f8f841069", + "hash_input_tokens": "e6b5271422ecbaa8", + "hash_cont_tokens": "3e9e0bdc248fd88a" + }, + "truncated": 0, + "non_truncated": 163, + "padded": 644, + "non_padded": 8, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-machine_learning|5": { + "hashes": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "e719cb83196977d8", + "hash_cont_tokens": "55b12fb138c6a064" + }, + "truncated": 0, + "non_truncated": 112, + "padded": 448, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-management|5": { + "hashes": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "155da0e62b39e804", + "hash_cont_tokens": "a01d6d39a83c4597" + }, + "truncated": 0, + "non_truncated": 103, + "padded": 412, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-marketing|5": { + "hashes": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "38466c242259e6d3", + "hash_cont_tokens": "6aeaed4d823c98aa" + }, + "truncated": 0, + "non_truncated": 234, + "padded": 932, + "non_padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-medical_genetics|5": { + "hashes": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "0dd129e92538a7f6", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-miscellaneous|5": { + "hashes": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "d108a883fc3e022f", + "hash_cont_tokens": "9b0ab02a64603081" + }, + "truncated": 0, + "non_truncated": 783, + "padded": 3132, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_disputes|5": { + "hashes": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "0e7b7df82884a2d5", + "hash_cont_tokens": "3b8bbe9108e55ce9" + }, + "truncated": 0, + "non_truncated": 346, + "padded": 1364, + "non_padded": 20, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hashes": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "7c220f5613cd8426", + "hash_cont_tokens": "3e9bfc0362e97330" + }, + "truncated": 0, + "non_truncated": 895, + "padded": 3580, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-nutrition|5": { + "hashes": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "35de1609a9a763a9", + "hash_cont_tokens": "23b2dc6ee2da4cfc" + }, + "truncated": 0, + "non_truncated": 306, + "padded": 1224, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-philosophy|5": { + "hashes": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "a1dcfa9c80490d06", + "hash_cont_tokens": "9f6ff69d23a48783" + }, + "truncated": 0, + "non_truncated": 311, + "padded": 1244, + "non_padded": 0, 
+ "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-prehistory|5": { + "hashes": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "a091cf645d2415e0", + "hash_cont_tokens": "d6458d743d875837" + }, + "truncated": 0, + "non_truncated": 324, + "padded": 1296, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_accounting|5": { + "hashes": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "e9df32a33f85290c", + "hash_cont_tokens": "922a195f53a35662" + }, + "truncated": 0, + "non_truncated": 282, + "padded": 1128, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_law|5": { + "hashes": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "c9f7583fff66d361", + "hash_cont_tokens": "2e590029ef41fbcd" + }, + "truncated": 0, + "non_truncated": 1534, + "padded": 6136, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_medicine|5": { + "hashes": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "40a933f829116f8d", + "hash_cont_tokens": "7cfee54dbddd5a98" + }, + "truncated": 0, + "non_truncated": 272, + "padded": 1088, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_psychology|5": { + "hashes": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "0f6a92c3a2062b48", + "hash_cont_tokens": "a86677b2a45c20e1" + }, + "truncated": 0, + "non_truncated": 612, + "padded": 2448, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-public_relations|5": { + "hashes": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "29a08e9bfbe9b2f0", + "hash_cont_tokens": "0d756ccaae031757" + }, + "truncated": 0, + "non_truncated": 110, + "padded": 440, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-security_studies|5": { + "hashes": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "32a03f1f22a6e103", + "hash_cont_tokens": "b2229bc2cfbf594b" + }, + "truncated": 0, + "non_truncated": 245, + "padded": 980, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-sociology|5": { + "hashes": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "1de5c52d2b2831d7", + "hash_cont_tokens": "c3a3bdfd177eed5b" + }, + "truncated": 0, + "non_truncated": 201, + "padded": 800, + "non_padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hashes": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "add924961f7f4146", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-virology|5": { + "hashes": { + "hash_examples": 
"451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "e0653601c466b1bc", + "hash_cont_tokens": "af8b3658088cb37f" + }, + "truncated": 0, + "non_truncated": 166, + "padded": 664, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-world_religions|5": { + "hashes": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "ac600d612445156d", + "hash_cont_tokens": "060118bef6de4e0a" + }, + "truncated": 0, + "non_truncated": 171, + "padded": 684, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|truthfulqa:mc|0": { + "hashes": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "a03ce28b7fd06aa7", + "hash_cont_tokens": "f5da56a132aab151" + }, + "truncated": 0, + "non_truncated": 817, + "padded": 9996, + "non_padded": 0, + "effective_few_shots": 0.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "72067255e368e24e", + "hash_cont_tokens": "f08975ad6f2d5864" + }, + "truncated": 0, + "non_truncated": 1267, + "padded": 2534, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "bda342e47b5099b2", + "hash_cont_tokens": "3aafd5f23fc3aac9" + }, + "truncated": 0, + "non_truncated": 1319, + "padded": 0, + "non_padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "3b7fa57a057f9415", + "hash_full_prompts": "63615fc50fc9417c", + "hash_input_tokens": "a8fa53915153e1db", + "hash_cont_tokens": "ea8d8463898c44b9" + }, + "truncated": 0, + "non_truncated": 28659, + "padded": 113348, + "non_padded": 1524, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/Lazycuber/Janemalion-6B/results_2023-07-24T11-00-29.262151.json b/eval-results/Lazycuber/Janemalion-6B/results_2023-07-24T11-00-29.262151.json new file mode 100644 index 0000000000000000000000000000000000000000..e4b0eff6d0a3a32b74009463aeb692310a956edd --- /dev/null +++ b/eval-results/Lazycuber/Janemalion-6B/results_2023-07-24T11-00-29.262151.json @@ -0,0 +1,1365 @@ +{ + "results": { + "harness|arc:challenge|25": { + "acc": 0.386518771331058, + "acc_stderr": 0.01423008476191048, + "acc_norm": 0.42406143344709896, + "acc_norm_stderr": 0.014441889627464398 + }, + "harness|hellaswag|10": { + "acc": 0.5052778331009758, + "acc_stderr": 0.004989503417767287, + "acc_norm": 0.6840270862378013, + "acc_norm_stderr": 0.00463952045344403 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.26, + "acc_stderr": 0.0440844002276808, + "acc_norm": 0.26, + "acc_norm_stderr": 0.0440844002276808 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.2962962962962963, + "acc_stderr": 0.03944624162501116, + "acc_norm": 0.2962962962962963, + "acc_norm_stderr": 0.03944624162501116 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.3157894736842105, + "acc_stderr": 0.03782728980865469, + "acc_norm": 0.3157894736842105, + "acc_norm_stderr": 0.03782728980865469 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.26, + "acc_stderr": 0.04408440022768077, + "acc_norm": 0.26, + "acc_norm_stderr": 
0.04408440022768077 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.30943396226415093, + "acc_stderr": 0.028450154794118627, + "acc_norm": 0.30943396226415093, + "acc_norm_stderr": 0.028450154794118627 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.2638888888888889, + "acc_stderr": 0.03685651095897532, + "acc_norm": 0.2638888888888889, + "acc_norm_stderr": 0.03685651095897532 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.17, + "acc_stderr": 0.0377525168068637, + "acc_norm": 0.17, + "acc_norm_stderr": 0.0377525168068637 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.23, + "acc_stderr": 0.04229525846816507, + "acc_norm": 0.23, + "acc_norm_stderr": 0.04229525846816507 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.3, + "acc_stderr": 0.046056618647183814, + "acc_norm": 0.3, + "acc_norm_stderr": 0.046056618647183814 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.2658959537572254, + "acc_stderr": 0.033687629322594316, + "acc_norm": 0.2658959537572254, + "acc_norm_stderr": 0.033687629322594316 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.21568627450980393, + "acc_stderr": 0.04092563958237655, + "acc_norm": 0.21568627450980393, + "acc_norm_stderr": 0.04092563958237655 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.36, + "acc_stderr": 0.048241815132442176, + "acc_norm": 0.36, + "acc_norm_stderr": 0.048241815132442176 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.3446808510638298, + "acc_stderr": 0.03106898596312215, + "acc_norm": 0.3446808510638298, + "acc_norm_stderr": 0.03106898596312215 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.3333333333333333, + "acc_stderr": 0.044346007015849245, + "acc_norm": 0.3333333333333333, + "acc_norm_stderr": 0.044346007015849245 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.2482758620689655, + "acc_stderr": 0.036001056927277716, + "acc_norm": 0.2482758620689655, + "acc_norm_stderr": 0.036001056927277716 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.25132275132275134, + "acc_stderr": 0.022340482339643898, + "acc_norm": 0.25132275132275134, + "acc_norm_stderr": 0.022340482339643898 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.1984126984126984, + "acc_stderr": 0.03567016675276864, + "acc_norm": 0.1984126984126984, + "acc_norm_stderr": 0.03567016675276864 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.28, + "acc_stderr": 0.04512608598542128, + "acc_norm": 0.28, + "acc_norm_stderr": 0.04512608598542128 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.25483870967741934, + "acc_stderr": 0.024790118459332204, + "acc_norm": 0.25483870967741934, + "acc_norm_stderr": 0.024790118459332204 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.2660098522167488, + "acc_stderr": 0.031089826002937523, + "acc_norm": 0.2660098522167488, + "acc_norm_stderr": 0.031089826002937523 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.23, + "acc_stderr": 0.04229525846816506, + "acc_norm": 0.23, + "acc_norm_stderr": 0.04229525846816506 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.2727272727272727, + "acc_stderr": 0.03477691162163659, + "acc_norm": 0.2727272727272727, + "acc_norm_stderr": 0.03477691162163659 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.21717171717171718, + "acc_stderr": 0.029376616484945633, + "acc_norm": 
0.21717171717171718, + "acc_norm_stderr": 0.029376616484945633 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.2538860103626943, + "acc_stderr": 0.03141024780565319, + "acc_norm": 0.2538860103626943, + "acc_norm_stderr": 0.03141024780565319 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.28974358974358977, + "acc_stderr": 0.023000628243687968, + "acc_norm": 0.28974358974358977, + "acc_norm_stderr": 0.023000628243687968 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.26666666666666666, + "acc_stderr": 0.026962424325073838, + "acc_norm": 0.26666666666666666, + "acc_norm_stderr": 0.026962424325073838 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.23949579831932774, + "acc_stderr": 0.02772206549336128, + "acc_norm": 0.23949579831932774, + "acc_norm_stderr": 0.02772206549336128 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.2582781456953642, + "acc_stderr": 0.035737053147634576, + "acc_norm": 0.2582781456953642, + "acc_norm_stderr": 0.035737053147634576 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.26055045871559634, + "acc_stderr": 0.018819182034850068, + "acc_norm": 0.26055045871559634, + "acc_norm_stderr": 0.018819182034850068 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.18055555555555555, + "acc_stderr": 0.026232878971491656, + "acc_norm": 0.18055555555555555, + "acc_norm_stderr": 0.026232878971491656 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.29411764705882354, + "acc_stderr": 0.031980016601150726, + "acc_norm": 0.29411764705882354, + "acc_norm_stderr": 0.031980016601150726 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.3037974683544304, + "acc_stderr": 0.029936696387138594, + "acc_norm": 0.3037974683544304, + "acc_norm_stderr": 0.029936696387138594 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.34080717488789236, + "acc_stderr": 0.0318114974705536, + "acc_norm": 0.34080717488789236, + "acc_norm_stderr": 0.0318114974705536 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.25190839694656486, + "acc_stderr": 0.03807387116306086, + "acc_norm": 0.25190839694656486, + "acc_norm_stderr": 0.03807387116306086 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.36363636363636365, + "acc_stderr": 0.04391326286724071, + "acc_norm": 0.36363636363636365, + "acc_norm_stderr": 0.04391326286724071 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.32407407407407407, + "acc_stderr": 0.04524596007030049, + "acc_norm": 0.32407407407407407, + "acc_norm_stderr": 0.04524596007030049 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.25153374233128833, + "acc_stderr": 0.034089978868575295, + "acc_norm": 0.25153374233128833, + "acc_norm_stderr": 0.034089978868575295 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.35714285714285715, + "acc_stderr": 0.04547960999764376, + "acc_norm": 0.35714285714285715, + "acc_norm_stderr": 0.04547960999764376 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.17475728155339806, + "acc_stderr": 0.037601780060266224, + "acc_norm": 0.17475728155339806, + "acc_norm_stderr": 0.037601780060266224 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.27350427350427353, + "acc_stderr": 0.029202540153431177, + "acc_norm": 0.27350427350427353, + "acc_norm_stderr": 0.029202540153431177 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.26, + "acc_stderr": 
0.0440844002276808, + "acc_norm": 0.26, + "acc_norm_stderr": 0.0440844002276808 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.3243933588761175, + "acc_stderr": 0.016740929047162706, + "acc_norm": 0.3243933588761175, + "acc_norm_stderr": 0.016740929047162706 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.2947976878612717, + "acc_stderr": 0.02454761779480383, + "acc_norm": 0.2947976878612717, + "acc_norm_stderr": 0.02454761779480383 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.24134078212290502, + "acc_stderr": 0.014310999547961459, + "acc_norm": 0.24134078212290502, + "acc_norm_stderr": 0.014310999547961459 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.3431372549019608, + "acc_stderr": 0.02718449890994162, + "acc_norm": 0.3431372549019608, + "acc_norm_stderr": 0.02718449890994162 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.29260450160771706, + "acc_stderr": 0.02583989833487798, + "acc_norm": 0.29260450160771706, + "acc_norm_stderr": 0.02583989833487798 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.3148148148148148, + "acc_stderr": 0.025842248700902168, + "acc_norm": 0.3148148148148148, + "acc_norm_stderr": 0.025842248700902168 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.2978723404255319, + "acc_stderr": 0.027281608344469414, + "acc_norm": 0.2978723404255319, + "acc_norm_stderr": 0.027281608344469414 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.31681877444589307, + "acc_stderr": 0.01188234995472301, + "acc_norm": 0.31681877444589307, + "acc_norm_stderr": 0.01188234995472301 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.23529411764705882, + "acc_stderr": 0.02576725201085598, + "acc_norm": 0.23529411764705882, + "acc_norm_stderr": 0.02576725201085598 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.27450980392156865, + "acc_stderr": 0.018054027458815198, + "acc_norm": 0.27450980392156865, + "acc_norm_stderr": 0.018054027458815198 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.35454545454545455, + "acc_stderr": 0.04582004841505416, + "acc_norm": 0.35454545454545455, + "acc_norm_stderr": 0.04582004841505416 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.42857142857142855, + "acc_stderr": 0.03168091161233882, + "acc_norm": 0.42857142857142855, + "acc_norm_stderr": 0.03168091161233882 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.36318407960199006, + "acc_stderr": 0.03400598505599014, + "acc_norm": 0.36318407960199006, + "acc_norm_stderr": 0.03400598505599014 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.34, + "acc_stderr": 0.04760952285695235, + "acc_norm": 0.34, + "acc_norm_stderr": 0.04760952285695235 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.29518072289156627, + "acc_stderr": 0.035509201856896294, + "acc_norm": 0.29518072289156627, + "acc_norm_stderr": 0.035509201856896294 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.3157894736842105, + "acc_stderr": 0.03565079670708311, + "acc_norm": 0.3157894736842105, + "acc_norm_stderr": 0.03565079670708311 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.20930232558139536, + "mc1_stderr": 0.01424121943478583, + "mc2": 0.34587723098212036, + "mc2_stderr": 0.01348699014658101 + }, + "all": { + "acc": 0.28835373902195927, + "acc_stderr": 0.03262443339491777, + "acc_norm": 0.29201970368726204, + "acc_norm_stderr": 0.032622091393243714, + "mc1": 0.20930232558139536, + "mc1_stderr": 
0.01424121943478583, + "mc2": 0.34587723098212036, + "mc2_stderr": 0.01348699014658101 + } + }, + "versions": { + "harness|arc:challenge|25": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "all": 0 + }, + "config_general": { + "model_name": "Lazycuber/Janemalion-6B", + "model_sha": "e72ae3ec110121115b1ae6c2e5fb3995997a2d96", + "model_dtype": "torch.float16", + "lighteval_sha": "03c2fad20ff7f5334c33cfee459024b8d7e4a109", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null + }, + "config_tasks": { + "harness|arc:challenge": "LM Harness task", + "harness|hellaswag": "LM Harness task", + 
"harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + 
"harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task" + }, + "summary_tasks": { + "harness|arc:challenge|25": { + "hashes": { + "hash_examples": "17b0cae357c0259e", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "1b78325b154497a6", + "hash_cont_tokens": "c6e2e25e2b25a621" + }, + "truncated": 0, + "non-truncated": 4687, + "padded": 4685, + "non-padded": 2, + "effective_few_shots": 25.0, + "num_truncated_few_shots": 0 + }, + "harness|hellaswag|10": { + "hashes": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "97de5fb5652ec7fa", + "hash_cont_tokens": "8ad5f1a3e4068f36" + }, + "truncated": 0, + "non-truncated": 40168, + "padded": 40045, + "non-padded": 123, + "effective_few_shots": 10.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hashes": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "38f6980885e34dfd", + "hash_cont_tokens": "844bd0bf669e8136" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-anatomy|5": { + "hashes": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "3ed9431cd09b2a53", + "hash_cont_tokens": "aa3ffb1a6e4356f5" + }, + "truncated": 0, + "non-truncated": 540, + "padded": 540, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-astronomy|5": { + "hashes": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "a79fd75ecff4dacc", + "hash_cont_tokens": "ca7527d5bdfd389a" + }, + "truncated": 0, + "non-truncated": 608, + "padded": 608, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-business_ethics|5": { + "hashes": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "178d5666661bf5e1", + "hash_cont_tokens": "08a1fa6c8dde9a82" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hashes": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "c926698f7ff06973", + "hash_cont_tokens": "cd61f7de0830a75a" + }, + "truncated": 0, + "non-truncated": 1060, + "padded": 1060, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_biology|5": { + "hashes": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "242f772c5e78312a", + "hash_cont_tokens": "b0c14ed86adbcb56" + }, + "truncated": 0, + "non-truncated": 576, + "padded": 568, + "non-padded": 8, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_chemistry|5": { + "hashes": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "8502d8627d2d7aad", + "hash_cont_tokens": "844bd0bf669e8136" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + 
"harness|hendrycksTest-college_computer_science|5": { + "hashes": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "8bf46ce3a98e6e3f", + "hash_cont_tokens": "3cf1924b14cbf906" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_mathematics|5": { + "hashes": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "ff09ef7f164943cd", + "hash_cont_tokens": "d09bf08193410dfa" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_medicine|5": { + "hashes": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "af38d1bbc0517ac5", + "hash_cont_tokens": "62bb469d2a319d91" + }, + "truncated": 0, + "non-truncated": 692, + "padded": 680, + "non-padded": 12, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_physics|5": { + "hashes": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "c4240f372187f487", + "hash_cont_tokens": "bf103c9a1f61ec12" + }, + "truncated": 0, + "non-truncated": 408, + "padded": 404, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-computer_security|5": { + "hashes": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "70a866a1c6ae11ae", + "hash_cont_tokens": "844bd0bf669e8136" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hashes": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "29b68a5b3f3afa5f", + "hash_cont_tokens": "ff5ca3d84bb47a0b" + }, + "truncated": 0, + "non-truncated": 940, + "padded": 940, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-econometrics|5": { + "hashes": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "a4a0fc579875cdf9", + "hash_cont_tokens": "f3ed369e135c0e74" + }, + "truncated": 0, + "non-truncated": 456, + "padded": 456, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hashes": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "e1c0ec634eb17ebd", + "hash_cont_tokens": "35bf6c0c1a7ee403" + }, + "truncated": 0, + "non-truncated": 580, + "padded": 580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hashes": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "542453ad0f99dacf", + "hash_cont_tokens": "e69647d0f0359a4e" + }, + "truncated": 0, + "non-truncated": 1512, + "padded": 1488, + "non-padded": 24, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-formal_logic|5": { + "hashes": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": 
"07b92638c4a6b500", + "hash_input_tokens": "dacff0458f665ef2", + "hash_cont_tokens": "2ef491ecaa0b411b" + }, + "truncated": 0, + "non-truncated": 504, + "padded": 504, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-global_facts|5": { + "hashes": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "61dec75d557c2e93", + "hash_cont_tokens": "844bd0bf669e8136" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_biology|5": { + "hashes": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "d0afdf91820cacc8", + "hash_cont_tokens": "2f65e8345a68d860" + }, + "truncated": 0, + "non-truncated": 1240, + "padded": 1240, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hashes": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "75cd47b5490da17b", + "hash_cont_tokens": "c3deabee1deab3a3" + }, + "truncated": 0, + "non-truncated": 812, + "padded": 796, + "non-padded": 16, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hashes": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "6c6256000dbf914a", + "hash_cont_tokens": "ec161287ac6222f4" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hashes": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "3e24478a8854bd77", + "hash_cont_tokens": "c4f2565ca36881d5" + }, + "truncated": 660, + "non-truncated": 0, + "padded": 0, + "non-padded": 660, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_geography|5": { + "hashes": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "a4866b51f8a7a60e", + "hash_cont_tokens": "780e569058de22be" + }, + "truncated": 0, + "non-truncated": 792, + "padded": 792, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hashes": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "90f755f89d9fdf5e", + "hash_cont_tokens": "9da45062757ae791" + }, + "truncated": 0, + "non-truncated": 772, + "padded": 772, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hashes": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "fb590ff6d9d11883", + "hash_cont_tokens": "8f5c8baf02161f10" + }, + "truncated": 0, + "non-truncated": 1560, + "padded": 1560, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hashes": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "551dbc75535ad2b8", + "hash_cont_tokens": 
"fdea101837ab4409" + }, + "truncated": 0, + "non-truncated": 1080, + "padded": 1080, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hashes": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "d86fdf5706ec717c", + "hash_cont_tokens": "985403b262df21a4" + }, + "truncated": 0, + "non-truncated": 952, + "padded": 940, + "non-padded": 12, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_physics|5": { + "hashes": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "a81bca26abd92c41", + "hash_cont_tokens": "56be0c12b78c81a3" + }, + "truncated": 0, + "non-truncated": 604, + "padded": 604, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hashes": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "9c10077b5cda495b", + "hash_cont_tokens": "f524cf6fe64b2a7f" + }, + "truncated": 0, + "non-truncated": 2180, + "padded": 2180, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hashes": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "092923836e135996", + "hash_cont_tokens": "421b3dc903711e3d" + }, + "truncated": 0, + "non-truncated": 864, + "padded": 864, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hashes": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "4ab213491f557f31", + "hash_cont_tokens": "eab825cf8fbdd085" + }, + "truncated": 816, + "non-truncated": 0, + "padded": 0, + "non-padded": 816, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hashes": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "2a04fb615e6717ea", + "hash_cont_tokens": "e1610a0b694e7b3a" + }, + "truncated": 0, + "non-truncated": 948, + "padded": 948, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_aging|5": { + "hashes": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "39da19ee58ce07e6", + "hash_cont_tokens": "38eafdb22e9fca11" + }, + "truncated": 0, + "non-truncated": 892, + "padded": 892, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_sexuality|5": { + "hashes": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "f7e0441ab1c223e0", + "hash_cont_tokens": "11de075f88fc7cd2" + }, + "truncated": 0, + "non-truncated": 524, + "padded": 524, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-international_law|5": { + "hashes": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "119859c5b8103d0b", + "hash_cont_tokens": "0229c63f045574c2" + }, + "truncated": 0, + "non-truncated": 484, + "padded": 484, + "non-padded": 0, + 
"effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-jurisprudence|5": { + "hashes": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "6ec4910e741606cb", + "hash_cont_tokens": "5c77c6f472688075" + }, + "truncated": 0, + "non-truncated": 432, + "padded": 432, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hashes": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "96d8b2554f777e3a", + "hash_cont_tokens": "25a46284b3589e0d" + }, + "truncated": 0, + "non-truncated": 652, + "padded": 636, + "non-padded": 16, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-machine_learning|5": { + "hashes": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "249811a7d891a411", + "hash_cont_tokens": "d11f2c877fe691dc" + }, + "truncated": 0, + "non-truncated": 448, + "padded": 448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-management|5": { + "hashes": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "e54df495ffeb4f92", + "hash_cont_tokens": "d37808f586a9e9b5" + }, + "truncated": 0, + "non-truncated": 412, + "padded": 412, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-marketing|5": { + "hashes": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "e9110fe64f420eb5", + "hash_cont_tokens": "95faf210efa02f90" + }, + "truncated": 0, + "non-truncated": 936, + "padded": 936, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-medical_genetics|5": { + "hashes": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "743df5701590c1c5", + "hash_cont_tokens": "844bd0bf669e8136" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-miscellaneous|5": { + "hashes": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "4a20a40ea36bad2d", + "hash_cont_tokens": "ef1ae838a09a7521" + }, + "truncated": 0, + "non-truncated": 3132, + "padded": 3132, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_disputes|5": { + "hashes": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "10886977e5516586", + "hash_cont_tokens": "05c35d0e7dd2c7d4" + }, + "truncated": 0, + "non-truncated": 1384, + "padded": 1372, + "non-padded": 12, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hashes": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "66f56ab7c3b9d662", + "hash_cont_tokens": "f1e9e326e9540108" + }, + "truncated": 0, + "non-truncated": 3580, + "padded": 3580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-nutrition|5": { + "hashes": { + "hash_examples": "7db1d8142ec14323", + 
"hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "c05c54560499ea35", + "hash_cont_tokens": "027ac34198453c9e" + }, + "truncated": 0, + "non-truncated": 1224, + "padded": 1224, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-philosophy|5": { + "hashes": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "9639c3d92ff98a28", + "hash_cont_tokens": "dddff9925c9b675a" + }, + "truncated": 0, + "non-truncated": 1244, + "padded": 1244, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-prehistory|5": { + "hashes": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "91e98834c3a8d8d9", + "hash_cont_tokens": "030e5bb46551865c" + }, + "truncated": 0, + "non-truncated": 1296, + "padded": 1296, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_accounting|5": { + "hashes": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "569fa47691c73088", + "hash_cont_tokens": "42b23299e8bae480" + }, + "truncated": 0, + "non-truncated": 1128, + "padded": 1124, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_law|5": { + "hashes": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "999e8c7cf55b590c", + "hash_cont_tokens": "a2de48df0afbaff7" + }, + "truncated": 16, + "non-truncated": 6120, + "padded": 6120, + "non-padded": 16, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_medicine|5": { + "hashes": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "cb68733b835e69f0", + "hash_cont_tokens": "33dc7eccd5de31ae" + }, + "truncated": 0, + "non-truncated": 1088, + "padded": 1088, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_psychology|5": { + "hashes": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "3aa766c029099569", + "hash_cont_tokens": "2a666dc39f1f52ac" + }, + "truncated": 0, + "non-truncated": 2448, + "padded": 2448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-public_relations|5": { + "hashes": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "87b924f88832986f", + "hash_cont_tokens": "cf3600a50782c6c5" + }, + "truncated": 0, + "non-truncated": 440, + "padded": 440, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-security_studies|5": { + "hashes": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "c2b75c24a925a416", + "hash_cont_tokens": "2e9916279a4cae95" + }, + "truncated": 0, + "non-truncated": 980, + "padded": 980, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-sociology|5": { + "hashes": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "fb555df6139eb2c8", + "hash_cont_tokens": "555f7a55738bbf37" + }, + "truncated": 
0, + "non-truncated": 804, + "padded": 800, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hashes": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "56cf1eebb25eccb1", + "hash_cont_tokens": "844bd0bf669e8136" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-virology|5": { + "hashes": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "c6affac16ec860be", + "hash_cont_tokens": "30d4fa4828c5468f" + }, + "truncated": 0, + "non-truncated": 664, + "padded": 664, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-world_religions|5": { + "hashes": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "d2c5da5a69a6312e", + "hash_cont_tokens": "984061eb58124367" + }, + "truncated": 0, + "non-truncated": 684, + "padded": 684, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|truthfulqa:mc|0": { + "hashes": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "21ee2f46c9c3649e", + "hash_cont_tokens": "f41d0880e9a23f4e" + }, + "truncated": 0, + "non-truncated": 9996, + "padded": 9996, + "non-padded": 0, + "effective_few_shots": 0.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "d84d18e9a963753d", + "hash_full_prompts": "12b540783521a8e6", + "hash_input_tokens": "0893dfcb83435e7d", + "hash_cont_tokens": "6159bf1904a8c8fb" + }, + "total_evaluation_time_secondes": "2472.055052280426", + "truncated": 1492, + "non-truncated": 109527, + "padded": 109290, + "non-padded": 1729, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/Lazycuber/L2-7b-Base-Guanaco-Uncensored/results_2023-09-21T20-58-37.445412.json b/eval-results/Lazycuber/L2-7b-Base-Guanaco-Uncensored/results_2023-09-21T20-58-37.445412.json new file mode 100644 index 0000000000000000000000000000000000000000..298bf882e3dd03f857a453903b4514ba76c54ce4 --- /dev/null +++ b/eval-results/Lazycuber/L2-7b-Base-Guanaco-Uncensored/results_2023-09-21T20-58-37.445412.json @@ -0,0 +1,1367 @@ +{ + "config_general": { + "model_name": "Lazycuber/L2-7b-Base-Guanaco-Uncensored", + "model_sha": "dd51a3b26ad378e2953c947a1e4c2f8febe0cb52", + "model_size": "12.61 GB", + "model_dtype": "torch.float16", + "lighteval_sha": "0f318ecf002208468154899217b3ba7c6ae09374", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|arc:challenge|25": { + "acc": 0.49402730375426623, + "acc_stderr": 0.014610348300255795, + "acc_norm": 0.5221843003412969, + "acc_norm_stderr": 0.014597001927076136 + }, + "harness|hellaswag|10": { + "acc": 0.5916152160924119, + "acc_stderr": 0.00490530437109087, + "acc_norm": 0.7907787293367855, + "acc_norm_stderr": 0.004059213774735547 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.3, + "acc_stderr": 0.046056618647183814, + "acc_norm": 0.3, + "acc_norm_stderr": 0.046056618647183814 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.45925925925925926, + "acc_stderr": 0.04304979692464242, + "acc_norm": 0.45925925925925926, 
+ "acc_norm_stderr": 0.04304979692464242 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.40789473684210525, + "acc_stderr": 0.03999309712777471, + "acc_norm": 0.40789473684210525, + "acc_norm_stderr": 0.03999309712777471 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.51, + "acc_stderr": 0.05024183937956912, + "acc_norm": 0.51, + "acc_norm_stderr": 0.05024183937956912 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.44528301886792454, + "acc_stderr": 0.030588052974270658, + "acc_norm": 0.44528301886792454, + "acc_norm_stderr": 0.030588052974270658 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.4375, + "acc_stderr": 0.04148415739394154, + "acc_norm": 0.4375, + "acc_norm_stderr": 0.04148415739394154 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.34, + "acc_stderr": 0.04760952285695235, + "acc_norm": 0.34, + "acc_norm_stderr": 0.04760952285695235 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.4, + "acc_stderr": 0.049236596391733084, + "acc_norm": 0.4, + "acc_norm_stderr": 0.049236596391733084 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.3, + "acc_stderr": 0.046056618647183814, + "acc_norm": 0.3, + "acc_norm_stderr": 0.046056618647183814 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.4161849710982659, + "acc_stderr": 0.03758517775404948, + "acc_norm": 0.4161849710982659, + "acc_norm_stderr": 0.03758517775404948 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.21568627450980393, + "acc_stderr": 0.040925639582376536, + "acc_norm": 0.21568627450980393, + "acc_norm_stderr": 0.040925639582376536 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.59, + "acc_stderr": 0.049431107042371025, + "acc_norm": 0.59, + "acc_norm_stderr": 0.049431107042371025 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.4297872340425532, + "acc_stderr": 0.03236214467715563, + "acc_norm": 0.4297872340425532, + "acc_norm_stderr": 0.03236214467715563 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.34210526315789475, + "acc_stderr": 0.04462917535336936, + "acc_norm": 0.34210526315789475, + "acc_norm_stderr": 0.04462917535336936 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.42758620689655175, + "acc_stderr": 0.04122737111370331, + "acc_norm": 0.42758620689655175, + "acc_norm_stderr": 0.04122737111370331 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.291005291005291, + "acc_stderr": 0.023393826500484865, + "acc_norm": 0.291005291005291, + "acc_norm_stderr": 0.023393826500484865 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.2857142857142857, + "acc_stderr": 0.0404061017820884, + "acc_norm": 0.2857142857142857, + "acc_norm_stderr": 0.0404061017820884 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.32, + "acc_stderr": 0.046882617226215034, + "acc_norm": 0.32, + "acc_norm_stderr": 0.046882617226215034 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.47419354838709676, + "acc_stderr": 0.02840609505765332, + "acc_norm": 0.47419354838709676, + "acc_norm_stderr": 0.02840609505765332 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.35960591133004927, + "acc_stderr": 0.033764582465095665, + "acc_norm": 0.35960591133004927, + "acc_norm_stderr": 0.033764582465095665 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.44, + "acc_stderr": 0.04988876515698589, + "acc_norm": 0.44, + "acc_norm_stderr": 
0.04988876515698589 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.593939393939394, + "acc_stderr": 0.03834816355401181, + "acc_norm": 0.593939393939394, + "acc_norm_stderr": 0.03834816355401181 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.48484848484848486, + "acc_stderr": 0.0356071651653106, + "acc_norm": 0.48484848484848486, + "acc_norm_stderr": 0.0356071651653106 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.689119170984456, + "acc_stderr": 0.03340361906276586, + "acc_norm": 0.689119170984456, + "acc_norm_stderr": 0.03340361906276586 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.441025641025641, + "acc_stderr": 0.025174048384000756, + "acc_norm": 0.441025641025641, + "acc_norm_stderr": 0.025174048384000756 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.28888888888888886, + "acc_stderr": 0.027634907264178544, + "acc_norm": 0.28888888888888886, + "acc_norm_stderr": 0.027634907264178544 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.40756302521008403, + "acc_stderr": 0.03191863374478465, + "acc_norm": 0.40756302521008403, + "acc_norm_stderr": 0.03191863374478465 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.31788079470198677, + "acc_stderr": 0.03802039760107903, + "acc_norm": 0.31788079470198677, + "acc_norm_stderr": 0.03802039760107903 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.6256880733944954, + "acc_stderr": 0.020748959408988306, + "acc_norm": 0.6256880733944954, + "acc_norm_stderr": 0.020748959408988306 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.25, + "acc_stderr": 0.029531221160930918, + "acc_norm": 0.25, + "acc_norm_stderr": 0.029531221160930918 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.553921568627451, + "acc_stderr": 0.03488845451304974, + "acc_norm": 0.553921568627451, + "acc_norm_stderr": 0.03488845451304974 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.620253164556962, + "acc_stderr": 0.031591887529658504, + "acc_norm": 0.620253164556962, + "acc_norm_stderr": 0.031591887529658504 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.5695067264573991, + "acc_stderr": 0.033231973029429394, + "acc_norm": 0.5695067264573991, + "acc_norm_stderr": 0.033231973029429394 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.5572519083969466, + "acc_stderr": 0.04356447202665069, + "acc_norm": 0.5572519083969466, + "acc_norm_stderr": 0.04356447202665069 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.6446280991735537, + "acc_stderr": 0.0436923632657398, + "acc_norm": 0.6446280991735537, + "acc_norm_stderr": 0.0436923632657398 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.5555555555555556, + "acc_stderr": 0.04803752235190192, + "acc_norm": 0.5555555555555556, + "acc_norm_stderr": 0.04803752235190192 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.49079754601226994, + "acc_stderr": 0.03927705600787443, + "acc_norm": 0.49079754601226994, + "acc_norm_stderr": 0.03927705600787443 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.39285714285714285, + "acc_stderr": 0.04635550135609976, + "acc_norm": 0.39285714285714285, + "acc_norm_stderr": 0.04635550135609976 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.5631067961165048, + "acc_stderr": 0.049111471073657764, + "acc_norm": 0.5631067961165048, + "acc_norm_stderr": 
0.049111471073657764 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.7136752136752137, + "acc_stderr": 0.02961432369045665, + "acc_norm": 0.7136752136752137, + "acc_norm_stderr": 0.02961432369045665 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.52, + "acc_stderr": 0.05021167315686779, + "acc_norm": 0.52, + "acc_norm_stderr": 0.05021167315686779 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.6385696040868455, + "acc_stderr": 0.017179601328900732, + "acc_norm": 0.6385696040868455, + "acc_norm_stderr": 0.017179601328900732 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.5202312138728323, + "acc_stderr": 0.026897049996382875, + "acc_norm": 0.5202312138728323, + "acc_norm_stderr": 0.026897049996382875 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.23798882681564246, + "acc_stderr": 0.014242630070574915, + "acc_norm": 0.23798882681564246, + "acc_norm_stderr": 0.014242630070574915 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.48366013071895425, + "acc_stderr": 0.028614624752805413, + "acc_norm": 0.48366013071895425, + "acc_norm_stderr": 0.028614624752805413 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.5884244372990354, + "acc_stderr": 0.02795048149440127, + "acc_norm": 0.5884244372990354, + "acc_norm_stderr": 0.02795048149440127 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.5092592592592593, + "acc_stderr": 0.027815973433878014, + "acc_norm": 0.5092592592592593, + "acc_norm_stderr": 0.027815973433878014 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.35815602836879434, + "acc_stderr": 0.028602085862759426, + "acc_norm": 0.35815602836879434, + "acc_norm_stderr": 0.028602085862759426 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.3559322033898305, + "acc_stderr": 0.012228645537277568, + "acc_norm": 0.3559322033898305, + "acc_norm_stderr": 0.012228645537277568 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.5404411764705882, + "acc_stderr": 0.03027332507734576, + "acc_norm": 0.5404411764705882, + "acc_norm_stderr": 0.03027332507734576 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.4542483660130719, + "acc_stderr": 0.020142974553795195, + "acc_norm": 0.4542483660130719, + "acc_norm_stderr": 0.020142974553795195 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.5727272727272728, + "acc_stderr": 0.047381987035454834, + "acc_norm": 0.5727272727272728, + "acc_norm_stderr": 0.047381987035454834 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.42448979591836733, + "acc_stderr": 0.031642094879429414, + "acc_norm": 0.42448979591836733, + "acc_norm_stderr": 0.031642094879429414 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.6467661691542289, + "acc_stderr": 0.03379790611796777, + "acc_norm": 0.6467661691542289, + "acc_norm_stderr": 0.03379790611796777 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.66, + "acc_stderr": 0.04760952285695237, + "acc_norm": 0.66, + "acc_norm_stderr": 0.04760952285695237 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.41566265060240964, + "acc_stderr": 0.038367221765980515, + "acc_norm": 0.41566265060240964, + "acc_norm_stderr": 0.038367221765980515 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.7017543859649122, + "acc_stderr": 0.03508771929824564, + "acc_norm": 0.7017543859649122, + "acc_norm_stderr": 0.03508771929824564 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.2913096695226438, + "mc1_stderr": 
0.015905987048184828, + "mc2": 0.42967422660946836, + "mc2_stderr": 0.014075137629554195 + }, + "all": { + "acc": 0.46891978366173287, + "acc_stderr": 0.03526325786670738, + "acc_norm": 0.47277267365887526, + "acc_norm_stderr": 0.03524869113841017, + "mc1": 0.2913096695226438, + "mc1_stderr": 0.015905987048184828, + "mc2": 0.42967422660946836, + "mc2_stderr": 0.014075137629554195 + } + }, + "versions": { + "harness|arc:challenge|25": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "all": 0 + }, + "config_tasks": { + "harness|arc:challenge": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + 
"harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + 
"harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task" + }, + "summary_tasks": { + "harness|arc:challenge|25": { + "hashes": { + "hash_examples": "17b0cae357c0259e", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "3722289b79076c44", + "hash_cont_tokens": "e8abf848493b50f7" + }, + "truncated": 0, + "non-truncated": 4687, + "padded": 4687, + "non-padded": 0, + "effective_few_shots": 25.0, + "num_truncated_few_shots": 0 + }, + "harness|hellaswag|10": { + "hashes": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "ececd684171f1ef2", + "hash_cont_tokens": "9fe0a5c42e1532db" + }, + "truncated": 0, + "non-truncated": 40168, + "padded": 40113, + "non-padded": 55, + "effective_few_shots": 10.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hashes": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "c54ff61ad0273dd7", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-anatomy|5": { + "hashes": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "be31a1e22aef5f90", + "hash_cont_tokens": "f11971a765cb609f" + }, + "truncated": 0, + "non-truncated": 540, + "padded": 540, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-astronomy|5": { + "hashes": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "277a7b1fad566940", + "hash_cont_tokens": "440a970fadecdc7b" + }, + "truncated": 0, + "non-truncated": 608, + "padded": 608, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-business_ethics|5": { + "hashes": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "ba552605bc116de5", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hashes": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "428c7563d0b98ab9", + "hash_cont_tokens": "7ecd60c25b9bfe5b" + }, + "truncated": 0, + "non-truncated": 1060, + "padded": 1060, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_biology|5": { + "hashes": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "da036601573942e2", + "hash_cont_tokens": "875cde3af7a0ee14" + }, + "truncated": 0, + "non-truncated": 576, + "padded": 576, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_chemistry|5": { + "hashes": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "94e0196d6aded13d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_computer_science|5": { + "hashes": { 
+ "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "6e4d0f4a8d36690b", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_mathematics|5": { + "hashes": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "614054d17109a25d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_medicine|5": { + "hashes": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "081bb2b524defd1c", + "hash_cont_tokens": "702fb6d82ff0d6ac" + }, + "truncated": 0, + "non-truncated": 692, + "padded": 692, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_physics|5": { + "hashes": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "5421d9a1af86cbd4", + "hash_cont_tokens": "f7b8097afc16a47c" + }, + "truncated": 0, + "non-truncated": 408, + "padded": 408, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-computer_security|5": { + "hashes": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "5e6b70ecb333cf18", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hashes": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "c2ef11a87264ceed", + "hash_cont_tokens": "aa0e8bc655f2f641" + }, + "truncated": 0, + "non-truncated": 940, + "padded": 940, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-econometrics|5": { + "hashes": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "ecaccd912a4c3978", + "hash_cont_tokens": "b1cc6e7e9fcd3827" + }, + "truncated": 0, + "non-truncated": 456, + "padded": 456, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hashes": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "1590c84291399be8", + "hash_cont_tokens": "2425a3f084a591ef" + }, + "truncated": 0, + "non-truncated": 580, + "padded": 580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hashes": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "3269597f715b0da1", + "hash_cont_tokens": "bd87bf0c060fd925" + }, + "truncated": 0, + "non-truncated": 1512, + "padded": 1512, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-formal_logic|5": { + "hashes": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "a2800d20f3ab8d7c", + "hash_cont_tokens": 
"eb8932890e0605db" + }, + "truncated": 0, + "non-truncated": 504, + "padded": 504, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-global_facts|5": { + "hashes": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "94ed44b3772505ad", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_biology|5": { + "hashes": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "24423acb928db768", + "hash_cont_tokens": "1ddcb86d28cde266" + }, + "truncated": 0, + "non-truncated": 1240, + "padded": 1240, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hashes": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "831ff35c474e5cef", + "hash_cont_tokens": "176c8dcff38c5f8f" + }, + "truncated": 0, + "non-truncated": 812, + "padded": 812, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hashes": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "a20a96b44dcc5b30", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hashes": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "5002f4ac8b1562ca", + "hash_cont_tokens": "674fc454bdc5ac93" + }, + "truncated": 0, + "non-truncated": 660, + "padded": 656, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_geography|5": { + "hashes": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "7c5547c7da5bc793", + "hash_cont_tokens": "03a5012b916274ea" + }, + "truncated": 0, + "non-truncated": 792, + "padded": 792, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hashes": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "f62991cb6a496b05", + "hash_cont_tokens": "873d2aab226ba1d8" + }, + "truncated": 0, + "non-truncated": 772, + "padded": 772, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hashes": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "4cef2aff6e3d59ed", + "hash_cont_tokens": "c583432ad27fcfe0" + }, + "truncated": 0, + "non-truncated": 1560, + "padded": 1560, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hashes": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "6e2577ea4082ed2b", + "hash_cont_tokens": "d7907b61bcb8c123" + }, + "truncated": 0, + "non-truncated": 1080, + "padded": 1080, + 
"non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hashes": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "c5fc9aeb1079c8e4", + "hash_cont_tokens": "f47f041de50333b9" + }, + "truncated": 0, + "non-truncated": 952, + "padded": 952, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_physics|5": { + "hashes": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "555fc385cffa84ca", + "hash_cont_tokens": "0d56317b3e5eedb5" + }, + "truncated": 0, + "non-truncated": 604, + "padded": 604, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hashes": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "febd23cbf9973b7f", + "hash_cont_tokens": "09ba1243e7390c0f" + }, + "truncated": 0, + "non-truncated": 2180, + "padded": 2180, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hashes": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "400e55b56ee6fbd7", + "hash_cont_tokens": "9cc29889c3d3f77d" + }, + "truncated": 0, + "non-truncated": 864, + "padded": 864, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hashes": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "c639cce12a46ebad", + "hash_cont_tokens": "cdd0b3dc06d933e5" + }, + "truncated": 0, + "non-truncated": 816, + "padded": 816, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hashes": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "b9762065cce6f3a6", + "hash_cont_tokens": "e02816433ff28daf" + }, + "truncated": 0, + "non-truncated": 948, + "padded": 948, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_aging|5": { + "hashes": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "541a75f071dcf579", + "hash_cont_tokens": "142a4a8a1138a214" + }, + "truncated": 0, + "non-truncated": 892, + "padded": 892, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_sexuality|5": { + "hashes": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "04269e5c5a257dd9", + "hash_cont_tokens": "bc54813e809b796d" + }, + "truncated": 0, + "non-truncated": 524, + "padded": 524, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-international_law|5": { + "hashes": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "d93ba9d9d38e4397", + "hash_cont_tokens": "8ea8c5ff76a15bca" + }, + "truncated": 0, + "non-truncated": 484, + "padded": 484, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + 
"harness|hendrycksTest-jurisprudence|5": { + "hashes": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "9eeaccd2698b4f5a", + "hash_cont_tokens": "e3a8cd951b6e3469" + }, + "truncated": 0, + "non-truncated": 432, + "padded": 432, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hashes": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "b4f08f544f2b7576", + "hash_cont_tokens": "3e9e0bdc248fd88a" + }, + "truncated": 0, + "non-truncated": 652, + "padded": 648, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-machine_learning|5": { + "hashes": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "900c2a51f1174b9f", + "hash_cont_tokens": "55b12fb138c6a064" + }, + "truncated": 0, + "non-truncated": 448, + "padded": 448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-management|5": { + "hashes": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "6b36efb4689c6eca", + "hash_cont_tokens": "a01d6d39a83c4597" + }, + "truncated": 0, + "non-truncated": 412, + "padded": 412, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-marketing|5": { + "hashes": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "2aaac78a0cfed47a", + "hash_cont_tokens": "6aeaed4d823c98aa" + }, + "truncated": 0, + "non-truncated": 936, + "padded": 936, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-medical_genetics|5": { + "hashes": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "886ca823b41c094a", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-miscellaneous|5": { + "hashes": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "72fd71de7675e7d0", + "hash_cont_tokens": "9b0ab02a64603081" + }, + "truncated": 0, + "non-truncated": 3132, + "padded": 3132, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_disputes|5": { + "hashes": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "f3ca0dd8e7a1eb09", + "hash_cont_tokens": "3b8bbe9108e55ce9" + }, + "truncated": 0, + "non-truncated": 1384, + "padded": 1354, + "non-padded": 30, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hashes": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "3e793631e951f23c", + "hash_cont_tokens": "3e9bfc0362e97330" + }, + "truncated": 0, + "non-truncated": 3580, + "padded": 3580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-nutrition|5": { + "hashes": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": 
"59753c2144ea93af", + "hash_cont_tokens": "23b2dc6ee2da4cfc" + }, + "truncated": 0, + "non-truncated": 1224, + "padded": 1224, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-philosophy|5": { + "hashes": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "bd8d3dbed15a8c34", + "hash_cont_tokens": "9f6ff69d23a48783" + }, + "truncated": 0, + "non-truncated": 1244, + "padded": 1244, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-prehistory|5": { + "hashes": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "3573cd87facbb7c5", + "hash_cont_tokens": "d6458d743d875837" + }, + "truncated": 0, + "non-truncated": 1296, + "padded": 1296, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_accounting|5": { + "hashes": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "17e721bc1a7cbb47", + "hash_cont_tokens": "922a195f53a35662" + }, + "truncated": 0, + "non-truncated": 1128, + "padded": 1128, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_law|5": { + "hashes": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "c9f7583fff66d361", + "hash_cont_tokens": "2e590029ef41fbcd" + }, + "truncated": 0, + "non-truncated": 6136, + "padded": 6136, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_medicine|5": { + "hashes": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "40a933f829116f8d", + "hash_cont_tokens": "7cfee54dbddd5a98" + }, + "truncated": 0, + "non-truncated": 1088, + "padded": 1088, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_psychology|5": { + "hashes": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "0dfb73a8eb3f692c", + "hash_cont_tokens": "a86677b2a45c20e1" + }, + "truncated": 0, + "non-truncated": 2448, + "padded": 2448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-public_relations|5": { + "hashes": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "1710c6ba4c9f3cbd", + "hash_cont_tokens": "0d756ccaae031757" + }, + "truncated": 0, + "non-truncated": 440, + "padded": 440, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-security_studies|5": { + "hashes": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "32a03f1f22a6e103", + "hash_cont_tokens": "b2229bc2cfbf594b" + }, + "truncated": 0, + "non-truncated": 980, + "padded": 980, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-sociology|5": { + "hashes": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "828999f7624cbe7e", + "hash_cont_tokens": "c3a3bdfd177eed5b" + }, + "truncated": 0, + "non-truncated": 804, + "padded": 804, + "non-padded": 0, + 
"effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hashes": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "42054621e718dbee", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-virology|5": { + "hashes": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "6c4f0aa4dc859c04", + "hash_cont_tokens": "af8b3658088cb37f" + }, + "truncated": 0, + "non-truncated": 664, + "padded": 664, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-world_religions|5": { + "hashes": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "6c75d44e092ff24f", + "hash_cont_tokens": "060118bef6de4e0a" + }, + "truncated": 0, + "non-truncated": 684, + "padded": 684, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|truthfulqa:mc|0": { + "hashes": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "2738d7ed7075faa7", + "hash_cont_tokens": "f5da56a132aab151" + }, + "truncated": 0, + "non-truncated": 9996, + "padded": 9996, + "non-padded": 0, + "effective_few_shots": 0.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "d84d18e9a963753d", + "hash_full_prompts": "12b540783521a8e6", + "hash_input_tokens": "5c73a7dce6ccf737", + "hash_cont_tokens": "71d56183130fecbd" + }, + "total_evaluation_time_secondes": "4223.796432971954", + "truncated": 0, + "non-truncated": 111019, + "padded": 110926, + "non-padded": 93, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/Lazycuber/L2-7b-Base-Guanaco-Uncensored/results_2023-10-25T18-03-37.956652.json b/eval-results/Lazycuber/L2-7b-Base-Guanaco-Uncensored/results_2023-10-25T18-03-37.956652.json new file mode 100644 index 0000000000000000000000000000000000000000..73ce2397152e3f06543df37aaf54114b5c11dc15 --- /dev/null +++ b/eval-results/Lazycuber/L2-7b-Base-Guanaco-Uncensored/results_2023-10-25T18-03-37.956652.json @@ -0,0 +1,107 @@ +{ + "config_general": { + "model_name": "Lazycuber/L2-7b-Base-Guanaco-Uncensored", + "model_sha": "dd51a3b26ad378e2953c947a1e4c2f8febe0cb52", + "model_size": "12.61 GB", + "model_dtype": "torch.float16", + "lighteval_sha": "0f318ecf002208468154899217b3ba7c6ae09374", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|drop|3": { + "em": 0.0010486577181208054, + "em_stderr": 0.0003314581465219032, + "f1": 0.05746119966442964, + "f1_stderr": 0.0013225129443672397 + }, + "harness|gsm8k|5": { + "acc": 0.07278241091736164, + "acc_stderr": 0.007155604761167465 + }, + "harness|winogrande|5": { + "acc": 0.745067087608524, + "acc_stderr": 0.012248806969376422 + }, + "all": { + "em": 0.0010486577181208054, + "em_stderr": 0.0003314581465219032, + "f1": 0.05746119966442964, + "f1_stderr": 0.0013225129443672397, + "acc": 0.4089247492629428, + "acc_stderr": 0.009702205865271943 + } + }, + "versions": { + "harness|drop|3": 1, + "harness|gsm8k|5": 0, + "harness|winogrande|5": 0, + "all": 0 + }, + "config_tasks": { + "harness|drop": "LM Harness 
task", + "harness|gsm8k": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|drop|3": { + "hashes": { + "hash_examples": "1d27416e8324e9a3", + "hash_full_prompts": "a5513ff9a741b385", + "hash_input_tokens": "42076f0efbb50aa6", + "hash_cont_tokens": "39c90632a7b4a33c" + }, + "truncated": 3, + "non-truncated": 9533, + "padded": 0, + "non-padded": 9536, + "effective_few_shots": 3.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "bda342e47b5099b2", + "hash_cont_tokens": "77beb811d1bc8fee" + }, + "truncated": 0, + "non-truncated": 1319, + "padded": 0, + "non-padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "c0bedf98cb040854", + "hash_cont_tokens": "f08975ad6f2d5864" + }, + "truncated": 0, + "non-truncated": 2534, + "padded": 2432, + "non-padded": 102, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "9b4d8993161e637d", + "hash_full_prompts": "08215e527b7e60a5", + "hash_input_tokens": "a12f3e3c934bd78b", + "hash_cont_tokens": "ae549fd1282257c2" + }, + "total_evaluation_time_secondes": "10553.8526866436", + "truncated": 3, + "non-truncated": 13386, + "padded": 2432, + "non-padded": 10957, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/Lazycuber/L2-7b-Guanaco-Random-Test/results_2023-10-08T18-13-47.081600.json b/eval-results/Lazycuber/L2-7b-Guanaco-Random-Test/results_2023-10-08T18-13-47.081600.json new file mode 100644 index 0000000000000000000000000000000000000000..73ebfb6b0dd6606bc309e6314c9d6a8e97102b6f --- /dev/null +++ b/eval-results/Lazycuber/L2-7b-Guanaco-Random-Test/results_2023-10-08T18-13-47.081600.json @@ -0,0 +1,1367 @@ +{ + "config_general": { + "model_name": "Lazycuber/L2-7b-Guanaco-Random-Test", + "model_sha": "9ffff7d0f58ba1de5e5fc59a61b7dc6ca571c9bf", + "model_size": "12.61 GB", + "model_dtype": "torch.float16", + "lighteval_sha": "0f318ecf002208468154899217b3ba7c6ae09374", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|arc:challenge|25": { + "acc": 0.4761092150170648, + "acc_stderr": 0.014594701798071654, + "acc_norm": 0.5059726962457338, + "acc_norm_stderr": 0.014610348300255795 + }, + "harness|hellaswag|10": { + "acc": 0.5723959370643298, + "acc_stderr": 0.004937199759947679, + "acc_norm": 0.7720573590918144, + "acc_norm_stderr": 0.004186480645315568 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.31, + "acc_stderr": 0.04648231987117316, + "acc_norm": 0.31, + "acc_norm_stderr": 0.04648231987117316 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.43703703703703706, + "acc_stderr": 0.042849586397533994, + "acc_norm": 0.43703703703703706, + "acc_norm_stderr": 0.042849586397533994 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.5131578947368421, + "acc_stderr": 0.04067533136309173, + "acc_norm": 0.5131578947368421, + "acc_norm_stderr": 0.04067533136309173 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.46, + "acc_stderr": 0.05009082659620332, + "acc_norm": 0.46, + "acc_norm_stderr": 0.05009082659620332 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + 
"acc": 0.5169811320754717, + "acc_stderr": 0.030755120364119905, + "acc_norm": 0.5169811320754717, + "acc_norm_stderr": 0.030755120364119905 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.5138888888888888, + "acc_stderr": 0.041795966175810016, + "acc_norm": 0.5138888888888888, + "acc_norm_stderr": 0.041795966175810016 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.28, + "acc_stderr": 0.04512608598542127, + "acc_norm": 0.28, + "acc_norm_stderr": 0.04512608598542127 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.38, + "acc_stderr": 0.048783173121456316, + "acc_norm": 0.38, + "acc_norm_stderr": 0.048783173121456316 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.31, + "acc_stderr": 0.04648231987117316, + "acc_norm": 0.31, + "acc_norm_stderr": 0.04648231987117316 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.3699421965317919, + "acc_stderr": 0.036812296333943194, + "acc_norm": 0.3699421965317919, + "acc_norm_stderr": 0.036812296333943194 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.24509803921568626, + "acc_stderr": 0.042801058373643966, + "acc_norm": 0.24509803921568626, + "acc_norm_stderr": 0.042801058373643966 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.57, + "acc_stderr": 0.049756985195624284, + "acc_norm": 0.57, + "acc_norm_stderr": 0.049756985195624284 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.425531914893617, + "acc_stderr": 0.03232146916224469, + "acc_norm": 0.425531914893617, + "acc_norm_stderr": 0.03232146916224469 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.38596491228070173, + "acc_stderr": 0.04579639422070434, + "acc_norm": 0.38596491228070173, + "acc_norm_stderr": 0.04579639422070434 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.503448275862069, + "acc_stderr": 0.041665675771015785, + "acc_norm": 0.503448275862069, + "acc_norm_stderr": 0.041665675771015785 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.31216931216931215, + "acc_stderr": 0.0238652068369726, + "acc_norm": 0.31216931216931215, + "acc_norm_stderr": 0.0238652068369726 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.24603174603174602, + "acc_stderr": 0.038522733649243156, + "acc_norm": 0.24603174603174602, + "acc_norm_stderr": 0.038522733649243156 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.34, + "acc_stderr": 0.04760952285695235, + "acc_norm": 0.34, + "acc_norm_stderr": 0.04760952285695235 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.5290322580645161, + "acc_stderr": 0.028396016402761005, + "acc_norm": 0.5290322580645161, + "acc_norm_stderr": 0.028396016402761005 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.3793103448275862, + "acc_stderr": 0.03413963805906235, + "acc_norm": 0.3793103448275862, + "acc_norm_stderr": 0.03413963805906235 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.46, + "acc_stderr": 0.05009082659620332, + "acc_norm": 0.46, + "acc_norm_stderr": 0.05009082659620332 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.5818181818181818, + "acc_stderr": 0.03851716319398395, + "acc_norm": 0.5818181818181818, + "acc_norm_stderr": 0.03851716319398395 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.5909090909090909, + "acc_stderr": 0.03502975799413007, + "acc_norm": 0.5909090909090909, + "acc_norm_stderr": 0.03502975799413007 + }, + 
"harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.6683937823834197, + "acc_stderr": 0.03397636541089118, + "acc_norm": 0.6683937823834197, + "acc_norm_stderr": 0.03397636541089118 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.4128205128205128, + "acc_stderr": 0.024962683564331803, + "acc_norm": 0.4128205128205128, + "acc_norm_stderr": 0.024962683564331803 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.2740740740740741, + "acc_stderr": 0.027195934804085626, + "acc_norm": 0.2740740740740741, + "acc_norm_stderr": 0.027195934804085626 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.3907563025210084, + "acc_stderr": 0.031693802357129965, + "acc_norm": 0.3907563025210084, + "acc_norm_stderr": 0.031693802357129965 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.31788079470198677, + "acc_stderr": 0.03802039760107903, + "acc_norm": 0.31788079470198677, + "acc_norm_stderr": 0.03802039760107903 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.6642201834862386, + "acc_stderr": 0.020248081396752927, + "acc_norm": 0.6642201834862386, + "acc_norm_stderr": 0.020248081396752927 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.2962962962962963, + "acc_stderr": 0.031141447823536016, + "acc_norm": 0.2962962962962963, + "acc_norm_stderr": 0.031141447823536016 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.6421568627450981, + "acc_stderr": 0.03364487286088298, + "acc_norm": 0.6421568627450981, + "acc_norm_stderr": 0.03364487286088298 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.6286919831223629, + "acc_stderr": 0.0314506860074486, + "acc_norm": 0.6286919831223629, + "acc_norm_stderr": 0.0314506860074486 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.5560538116591929, + "acc_stderr": 0.03334625674242728, + "acc_norm": 0.5560538116591929, + "acc_norm_stderr": 0.03334625674242728 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.5572519083969466, + "acc_stderr": 0.04356447202665069, + "acc_norm": 0.5572519083969466, + "acc_norm_stderr": 0.04356447202665069 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.6694214876033058, + "acc_stderr": 0.04294340845212093, + "acc_norm": 0.6694214876033058, + "acc_norm_stderr": 0.04294340845212093 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.5925925925925926, + "acc_stderr": 0.04750077341199984, + "acc_norm": 0.5925925925925926, + "acc_norm_stderr": 0.04750077341199984 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.5153374233128835, + "acc_stderr": 0.03926522378708843, + "acc_norm": 0.5153374233128835, + "acc_norm_stderr": 0.03926522378708843 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.35714285714285715, + "acc_stderr": 0.04547960999764376, + "acc_norm": 0.35714285714285715, + "acc_norm_stderr": 0.04547960999764376 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.6407766990291263, + "acc_stderr": 0.047504583990416946, + "acc_norm": 0.6407766990291263, + "acc_norm_stderr": 0.047504583990416946 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.7307692307692307, + "acc_stderr": 0.029058588303748842, + "acc_norm": 0.7307692307692307, + "acc_norm_stderr": 0.029058588303748842 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.51, + "acc_stderr": 0.05024183937956913, + "acc_norm": 0.51, + "acc_norm_stderr": 0.05024183937956913 + }, + 
"harness|hendrycksTest-miscellaneous|5": { + "acc": 0.6845466155810983, + "acc_stderr": 0.016617501738763387, + "acc_norm": 0.6845466155810983, + "acc_norm_stderr": 0.016617501738763387 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.5260115606936416, + "acc_stderr": 0.02688264343402289, + "acc_norm": 0.5260115606936416, + "acc_norm_stderr": 0.02688264343402289 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.22681564245810057, + "acc_stderr": 0.014005843570897895, + "acc_norm": 0.22681564245810057, + "acc_norm_stderr": 0.014005843570897895 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.5392156862745098, + "acc_stderr": 0.028541722692618874, + "acc_norm": 0.5392156862745098, + "acc_norm_stderr": 0.028541722692618874 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.5466237942122186, + "acc_stderr": 0.02827435985489426, + "acc_norm": 0.5466237942122186, + "acc_norm_stderr": 0.02827435985489426 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.558641975308642, + "acc_stderr": 0.027628737155668763, + "acc_norm": 0.558641975308642, + "acc_norm_stderr": 0.027628737155668763 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.3617021276595745, + "acc_stderr": 0.028663820147199495, + "acc_norm": 0.3617021276595745, + "acc_norm_stderr": 0.028663820147199495 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.32790091264667537, + "acc_stderr": 0.011989936640666525, + "acc_norm": 0.32790091264667537, + "acc_norm_stderr": 0.011989936640666525 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.39705882352941174, + "acc_stderr": 0.029722152099280065, + "acc_norm": 0.39705882352941174, + "acc_norm_stderr": 0.029722152099280065 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.46895424836601307, + "acc_stderr": 0.020188804456361883, + "acc_norm": 0.46895424836601307, + "acc_norm_stderr": 0.020188804456361883 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.509090909090909, + "acc_stderr": 0.0478833976870286, + "acc_norm": 0.509090909090909, + "acc_norm_stderr": 0.0478833976870286 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.5224489795918368, + "acc_stderr": 0.03197694118713672, + "acc_norm": 0.5224489795918368, + "acc_norm_stderr": 0.03197694118713672 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.6218905472636815, + "acc_stderr": 0.034288678487786564, + "acc_norm": 0.6218905472636815, + "acc_norm_stderr": 0.034288678487786564 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.66, + "acc_stderr": 0.04760952285695237, + "acc_norm": 0.66, + "acc_norm_stderr": 0.04760952285695237 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.43373493975903615, + "acc_stderr": 0.03858158940685517, + "acc_norm": 0.43373493975903615, + "acc_norm_stderr": 0.03858158940685517 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.695906432748538, + "acc_stderr": 0.0352821125824523, + "acc_norm": 0.695906432748538, + "acc_norm_stderr": 0.0352821125824523 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.27906976744186046, + "mc1_stderr": 0.0157021070906279, + "mc2": 0.4232640996589444, + "mc2_stderr": 0.01477991946603906 + }, + "all": { + "acc": 0.47820349788584665, + "acc_stderr": 0.03520803674350638, + "acc_norm": 0.4820937504834085, + "acc_norm_stderr": 0.03519557788566828, + "mc1": 0.27906976744186046, + "mc1_stderr": 0.0157021070906279, + "mc2": 0.4232640996589444, + "mc2_stderr": 0.01477991946603906 + } + }, + "versions": { + 
"harness|arc:challenge|25": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "all": 0 + }, + "config_tasks": { + "harness|arc:challenge": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + 
"harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task" + }, + "summary_tasks": { + "harness|arc:challenge|25": { + "hashes": { + "hash_examples": "17b0cae357c0259e", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "3722289b79076c44", + "hash_cont_tokens": "e8abf848493b50f7" + }, + "truncated": 0, + 
"non-truncated": 4687, + "padded": 4687, + "non-padded": 0, + "effective_few_shots": 25.0, + "num_truncated_few_shots": 0 + }, + "harness|hellaswag|10": { + "hashes": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "ececd684171f1ef2", + "hash_cont_tokens": "9fe0a5c42e1532db" + }, + "truncated": 0, + "non-truncated": 40168, + "padded": 40113, + "non-padded": 55, + "effective_few_shots": 10.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hashes": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "c54ff61ad0273dd7", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-anatomy|5": { + "hashes": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "be31a1e22aef5f90", + "hash_cont_tokens": "f11971a765cb609f" + }, + "truncated": 0, + "non-truncated": 540, + "padded": 540, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-astronomy|5": { + "hashes": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "277a7b1fad566940", + "hash_cont_tokens": "440a970fadecdc7b" + }, + "truncated": 0, + "non-truncated": 608, + "padded": 608, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-business_ethics|5": { + "hashes": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "ba552605bc116de5", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hashes": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "428c7563d0b98ab9", + "hash_cont_tokens": "7ecd60c25b9bfe5b" + }, + "truncated": 0, + "non-truncated": 1060, + "padded": 1060, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_biology|5": { + "hashes": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "da036601573942e2", + "hash_cont_tokens": "875cde3af7a0ee14" + }, + "truncated": 0, + "non-truncated": 576, + "padded": 576, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_chemistry|5": { + "hashes": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "94e0196d6aded13d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_computer_science|5": { + "hashes": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "6e4d0f4a8d36690b", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_mathematics|5": { + 
"hashes": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "614054d17109a25d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_medicine|5": { + "hashes": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "081bb2b524defd1c", + "hash_cont_tokens": "702fb6d82ff0d6ac" + }, + "truncated": 0, + "non-truncated": 692, + "padded": 692, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_physics|5": { + "hashes": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "5421d9a1af86cbd4", + "hash_cont_tokens": "f7b8097afc16a47c" + }, + "truncated": 0, + "non-truncated": 408, + "padded": 408, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-computer_security|5": { + "hashes": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "5e6b70ecb333cf18", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hashes": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "c2ef11a87264ceed", + "hash_cont_tokens": "aa0e8bc655f2f641" + }, + "truncated": 0, + "non-truncated": 940, + "padded": 940, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-econometrics|5": { + "hashes": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "ecaccd912a4c3978", + "hash_cont_tokens": "b1cc6e7e9fcd3827" + }, + "truncated": 0, + "non-truncated": 456, + "padded": 456, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hashes": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "1590c84291399be8", + "hash_cont_tokens": "2425a3f084a591ef" + }, + "truncated": 0, + "non-truncated": 580, + "padded": 580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hashes": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "3269597f715b0da1", + "hash_cont_tokens": "bd87bf0c060fd925" + }, + "truncated": 0, + "non-truncated": 1512, + "padded": 1512, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-formal_logic|5": { + "hashes": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "a2800d20f3ab8d7c", + "hash_cont_tokens": "eb8932890e0605db" + }, + "truncated": 0, + "non-truncated": 504, + "padded": 504, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-global_facts|5": { + "hashes": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "94ed44b3772505ad", + 
"hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_biology|5": { + "hashes": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "24423acb928db768", + "hash_cont_tokens": "1ddcb86d28cde266" + }, + "truncated": 0, + "non-truncated": 1240, + "padded": 1240, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hashes": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "831ff35c474e5cef", + "hash_cont_tokens": "176c8dcff38c5f8f" + }, + "truncated": 0, + "non-truncated": 812, + "padded": 812, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hashes": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "a20a96b44dcc5b30", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hashes": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "5002f4ac8b1562ca", + "hash_cont_tokens": "674fc454bdc5ac93" + }, + "truncated": 0, + "non-truncated": 660, + "padded": 656, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_geography|5": { + "hashes": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "7c5547c7da5bc793", + "hash_cont_tokens": "03a5012b916274ea" + }, + "truncated": 0, + "non-truncated": 792, + "padded": 792, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hashes": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "f62991cb6a496b05", + "hash_cont_tokens": "873d2aab226ba1d8" + }, + "truncated": 0, + "non-truncated": 772, + "padded": 772, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hashes": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "4cef2aff6e3d59ed", + "hash_cont_tokens": "c583432ad27fcfe0" + }, + "truncated": 0, + "non-truncated": 1560, + "padded": 1560, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hashes": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "6e2577ea4082ed2b", + "hash_cont_tokens": "d7907b61bcb8c123" + }, + "truncated": 0, + "non-truncated": 1080, + "padded": 1080, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hashes": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "c5fc9aeb1079c8e4", + "hash_cont_tokens": "f47f041de50333b9" + }, + "truncated": 0, + 
"non-truncated": 952, + "padded": 952, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_physics|5": { + "hashes": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "555fc385cffa84ca", + "hash_cont_tokens": "0d56317b3e5eedb5" + }, + "truncated": 0, + "non-truncated": 604, + "padded": 604, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hashes": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "febd23cbf9973b7f", + "hash_cont_tokens": "09ba1243e7390c0f" + }, + "truncated": 0, + "non-truncated": 2180, + "padded": 2180, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hashes": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "400e55b56ee6fbd7", + "hash_cont_tokens": "9cc29889c3d3f77d" + }, + "truncated": 0, + "non-truncated": 864, + "padded": 864, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hashes": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "c639cce12a46ebad", + "hash_cont_tokens": "cdd0b3dc06d933e5" + }, + "truncated": 0, + "non-truncated": 816, + "padded": 816, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hashes": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "b9762065cce6f3a6", + "hash_cont_tokens": "e02816433ff28daf" + }, + "truncated": 0, + "non-truncated": 948, + "padded": 948, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_aging|5": { + "hashes": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "541a75f071dcf579", + "hash_cont_tokens": "142a4a8a1138a214" + }, + "truncated": 0, + "non-truncated": 892, + "padded": 892, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_sexuality|5": { + "hashes": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "04269e5c5a257dd9", + "hash_cont_tokens": "bc54813e809b796d" + }, + "truncated": 0, + "non-truncated": 524, + "padded": 524, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-international_law|5": { + "hashes": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "d93ba9d9d38e4397", + "hash_cont_tokens": "8ea8c5ff76a15bca" + }, + "truncated": 0, + "non-truncated": 484, + "padded": 484, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-jurisprudence|5": { + "hashes": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "9eeaccd2698b4f5a", + "hash_cont_tokens": "e3a8cd951b6e3469" + }, + "truncated": 0, + "non-truncated": 432, + "padded": 432, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + 
"harness|hendrycksTest-logical_fallacies|5": { + "hashes": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "b4f08f544f2b7576", + "hash_cont_tokens": "3e9e0bdc248fd88a" + }, + "truncated": 0, + "non-truncated": 652, + "padded": 648, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-machine_learning|5": { + "hashes": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "900c2a51f1174b9f", + "hash_cont_tokens": "55b12fb138c6a064" + }, + "truncated": 0, + "non-truncated": 448, + "padded": 448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-management|5": { + "hashes": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "6b36efb4689c6eca", + "hash_cont_tokens": "a01d6d39a83c4597" + }, + "truncated": 0, + "non-truncated": 412, + "padded": 412, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-marketing|5": { + "hashes": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "2aaac78a0cfed47a", + "hash_cont_tokens": "6aeaed4d823c98aa" + }, + "truncated": 0, + "non-truncated": 936, + "padded": 936, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-medical_genetics|5": { + "hashes": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "886ca823b41c094a", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-miscellaneous|5": { + "hashes": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "72fd71de7675e7d0", + "hash_cont_tokens": "9b0ab02a64603081" + }, + "truncated": 0, + "non-truncated": 3132, + "padded": 3132, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_disputes|5": { + "hashes": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "f3ca0dd8e7a1eb09", + "hash_cont_tokens": "3b8bbe9108e55ce9" + }, + "truncated": 0, + "non-truncated": 1384, + "padded": 1354, + "non-padded": 30, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hashes": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "3e793631e951f23c", + "hash_cont_tokens": "3e9bfc0362e97330" + }, + "truncated": 0, + "non-truncated": 3580, + "padded": 3580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-nutrition|5": { + "hashes": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "59753c2144ea93af", + "hash_cont_tokens": "23b2dc6ee2da4cfc" + }, + "truncated": 0, + "non-truncated": 1224, + "padded": 1224, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-philosophy|5": { + "hashes": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": 
"bd8d3dbed15a8c34", + "hash_cont_tokens": "9f6ff69d23a48783" + }, + "truncated": 0, + "non-truncated": 1244, + "padded": 1244, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-prehistory|5": { + "hashes": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "3573cd87facbb7c5", + "hash_cont_tokens": "d6458d743d875837" + }, + "truncated": 0, + "non-truncated": 1296, + "padded": 1296, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_accounting|5": { + "hashes": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "17e721bc1a7cbb47", + "hash_cont_tokens": "922a195f53a35662" + }, + "truncated": 0, + "non-truncated": 1128, + "padded": 1128, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_law|5": { + "hashes": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "c9f7583fff66d361", + "hash_cont_tokens": "2e590029ef41fbcd" + }, + "truncated": 0, + "non-truncated": 6136, + "padded": 6136, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_medicine|5": { + "hashes": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "40a933f829116f8d", + "hash_cont_tokens": "7cfee54dbddd5a98" + }, + "truncated": 0, + "non-truncated": 1088, + "padded": 1088, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_psychology|5": { + "hashes": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "0dfb73a8eb3f692c", + "hash_cont_tokens": "a86677b2a45c20e1" + }, + "truncated": 0, + "non-truncated": 2448, + "padded": 2448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-public_relations|5": { + "hashes": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "1710c6ba4c9f3cbd", + "hash_cont_tokens": "0d756ccaae031757" + }, + "truncated": 0, + "non-truncated": 440, + "padded": 440, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-security_studies|5": { + "hashes": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "32a03f1f22a6e103", + "hash_cont_tokens": "b2229bc2cfbf594b" + }, + "truncated": 0, + "non-truncated": 980, + "padded": 980, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-sociology|5": { + "hashes": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "828999f7624cbe7e", + "hash_cont_tokens": "c3a3bdfd177eed5b" + }, + "truncated": 0, + "non-truncated": 804, + "padded": 804, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hashes": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "42054621e718dbee", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 
0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-virology|5": { + "hashes": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "6c4f0aa4dc859c04", + "hash_cont_tokens": "af8b3658088cb37f" + }, + "truncated": 0, + "non-truncated": 664, + "padded": 664, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-world_religions|5": { + "hashes": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "6c75d44e092ff24f", + "hash_cont_tokens": "060118bef6de4e0a" + }, + "truncated": 0, + "non-truncated": 684, + "padded": 684, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|truthfulqa:mc|0": { + "hashes": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "2738d7ed7075faa7", + "hash_cont_tokens": "f5da56a132aab151" + }, + "truncated": 0, + "non-truncated": 9996, + "padded": 9996, + "non-padded": 0, + "effective_few_shots": 0.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "d84d18e9a963753d", + "hash_full_prompts": "12b540783521a8e6", + "hash_input_tokens": "5c73a7dce6ccf737", + "hash_cont_tokens": "71d56183130fecbd" + }, + "total_evaluation_time_secondes": "4167.49320435524", + "truncated": 0, + "non-truncated": 111019, + "padded": 110926, + "non-padded": 93, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/Lazycuber/L2-7b-Guanaco-Uncensored/results_2023-09-18T14-24-41.596109.json b/eval-results/Lazycuber/L2-7b-Guanaco-Uncensored/results_2023-09-18T14-24-41.596109.json new file mode 100644 index 0000000000000000000000000000000000000000..96157389f8411b554cb989fb408a7452d78f4439 --- /dev/null +++ b/eval-results/Lazycuber/L2-7b-Guanaco-Uncensored/results_2023-09-18T14-24-41.596109.json @@ -0,0 +1,1367 @@ +{ + "config_general": { + "model_name": "Lazycuber/L2-7b-Guanaco-Uncensored", + "model_sha": "9d49378c69c00113cf7f6e66d1ddb9d9b003dddc", + "model_size": "12.61 GB", + "model_dtype": "torch.float16", + "lighteval_sha": "0f318ecf002208468154899217b3ba7c6ae09374", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|arc:challenge|25": { + "acc": 0.4667235494880546, + "acc_stderr": 0.01457899585960581, + "acc_norm": 0.5059726962457338, + "acc_norm_stderr": 0.014610348300255793 + }, + "harness|hellaswag|10": { + "acc": 0.5754829715196176, + "acc_stderr": 0.004932593348813629, + "acc_norm": 0.7698665604461262, + "acc_norm_stderr": 0.004200578535056531 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.32, + "acc_stderr": 0.04688261722621502, + "acc_norm": 0.32, + "acc_norm_stderr": 0.04688261722621502 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.4222222222222222, + "acc_stderr": 0.04266763404099582, + "acc_norm": 0.4222222222222222, + "acc_norm_stderr": 0.04266763404099582 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.4605263157894737, + "acc_stderr": 0.04056242252249034, + "acc_norm": 0.4605263157894737, + "acc_norm_stderr": 0.04056242252249034 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.47, + "acc_stderr": 0.05016135580465919, + "acc_norm": 0.47, + "acc_norm_stderr": 0.05016135580465919 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 
0.539622641509434, + "acc_stderr": 0.03067609659938918, + "acc_norm": 0.539622641509434, + "acc_norm_stderr": 0.03067609659938918 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.5069444444444444, + "acc_stderr": 0.04180806750294938, + "acc_norm": 0.5069444444444444, + "acc_norm_stderr": 0.04180806750294938 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.32, + "acc_stderr": 0.046882617226215034, + "acc_norm": 0.32, + "acc_norm_stderr": 0.046882617226215034 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.4, + "acc_stderr": 0.049236596391733084, + "acc_norm": 0.4, + "acc_norm_stderr": 0.049236596391733084 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.35, + "acc_stderr": 0.0479372485441102, + "acc_norm": 0.35, + "acc_norm_stderr": 0.0479372485441102 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.3988439306358382, + "acc_stderr": 0.037336266553835096, + "acc_norm": 0.3988439306358382, + "acc_norm_stderr": 0.037336266553835096 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.27450980392156865, + "acc_stderr": 0.04440521906179328, + "acc_norm": 0.27450980392156865, + "acc_norm_stderr": 0.04440521906179328 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.57, + "acc_stderr": 0.049756985195624284, + "acc_norm": 0.57, + "acc_norm_stderr": 0.049756985195624284 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.425531914893617, + "acc_stderr": 0.03232146916224469, + "acc_norm": 0.425531914893617, + "acc_norm_stderr": 0.03232146916224469 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.37719298245614036, + "acc_stderr": 0.045595221419582166, + "acc_norm": 0.37719298245614036, + "acc_norm_stderr": 0.045595221419582166 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.5241379310344828, + "acc_stderr": 0.041618085035015295, + "acc_norm": 0.5241379310344828, + "acc_norm_stderr": 0.041618085035015295 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.29365079365079366, + "acc_stderr": 0.02345603738398203, + "acc_norm": 0.29365079365079366, + "acc_norm_stderr": 0.02345603738398203 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.23809523809523808, + "acc_stderr": 0.03809523809523811, + "acc_norm": 0.23809523809523808, + "acc_norm_stderr": 0.03809523809523811 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.41, + "acc_stderr": 0.049431107042371025, + "acc_norm": 0.41, + "acc_norm_stderr": 0.049431107042371025 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.532258064516129, + "acc_stderr": 0.028384747788813332, + "acc_norm": 0.532258064516129, + "acc_norm_stderr": 0.028384747788813332 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.39901477832512317, + "acc_stderr": 0.03445487686264716, + "acc_norm": 0.39901477832512317, + "acc_norm_stderr": 0.03445487686264716 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.43, + "acc_stderr": 0.049756985195624284, + "acc_norm": 0.43, + "acc_norm_stderr": 0.049756985195624284 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.593939393939394, + "acc_stderr": 0.03834816355401181, + "acc_norm": 0.593939393939394, + "acc_norm_stderr": 0.03834816355401181 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.6161616161616161, + "acc_stderr": 0.03464881675016341, + "acc_norm": 0.6161616161616161, + "acc_norm_stderr": 0.03464881675016341 + }, + 
"harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.7098445595854922, + "acc_stderr": 0.03275264467791516, + "acc_norm": 0.7098445595854922, + "acc_norm_stderr": 0.03275264467791516 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.441025641025641, + "acc_stderr": 0.025174048384000756, + "acc_norm": 0.441025641025641, + "acc_norm_stderr": 0.025174048384000756 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.2777777777777778, + "acc_stderr": 0.02730914058823018, + "acc_norm": 0.2777777777777778, + "acc_norm_stderr": 0.02730914058823018 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.4411764705882353, + "acc_stderr": 0.0322529423239964, + "acc_norm": 0.4411764705882353, + "acc_norm_stderr": 0.0322529423239964 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.31788079470198677, + "acc_stderr": 0.03802039760107903, + "acc_norm": 0.31788079470198677, + "acc_norm_stderr": 0.03802039760107903 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.673394495412844, + "acc_stderr": 0.020106990889937303, + "acc_norm": 0.673394495412844, + "acc_norm_stderr": 0.020106990889937303 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.3148148148148148, + "acc_stderr": 0.03167468706828978, + "acc_norm": 0.3148148148148148, + "acc_norm_stderr": 0.03167468706828978 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.6666666666666666, + "acc_stderr": 0.03308611113236434, + "acc_norm": 0.6666666666666666, + "acc_norm_stderr": 0.03308611113236434 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.6582278481012658, + "acc_stderr": 0.03087453753755362, + "acc_norm": 0.6582278481012658, + "acc_norm_stderr": 0.03087453753755362 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.5650224215246636, + "acc_stderr": 0.03327283370271344, + "acc_norm": 0.5650224215246636, + "acc_norm_stderr": 0.03327283370271344 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.5725190839694656, + "acc_stderr": 0.04338920305792401, + "acc_norm": 0.5725190839694656, + "acc_norm_stderr": 0.04338920305792401 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.6363636363636364, + "acc_stderr": 0.043913262867240704, + "acc_norm": 0.6363636363636364, + "acc_norm_stderr": 0.043913262867240704 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.6203703703703703, + "acc_stderr": 0.04691521224077742, + "acc_norm": 0.6203703703703703, + "acc_norm_stderr": 0.04691521224077742 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.5460122699386503, + "acc_stderr": 0.0391170190467718, + "acc_norm": 0.5460122699386503, + "acc_norm_stderr": 0.0391170190467718 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.36607142857142855, + "acc_stderr": 0.0457237235873743, + "acc_norm": 0.36607142857142855, + "acc_norm_stderr": 0.0457237235873743 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.6990291262135923, + "acc_stderr": 0.045416094465039476, + "acc_norm": 0.6990291262135923, + "acc_norm_stderr": 0.045416094465039476 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.7307692307692307, + "acc_stderr": 0.029058588303748842, + "acc_norm": 0.7307692307692307, + "acc_norm_stderr": 0.029058588303748842 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.52, + "acc_stderr": 0.05021167315686779, + "acc_norm": 0.52, + "acc_norm_stderr": 0.05021167315686779 + }, + 
"harness|hendrycksTest-miscellaneous|5": { + "acc": 0.6871008939974457, + "acc_stderr": 0.01658093594030406, + "acc_norm": 0.6871008939974457, + "acc_norm_stderr": 0.01658093594030406 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.5260115606936416, + "acc_stderr": 0.02688264343402289, + "acc_norm": 0.5260115606936416, + "acc_norm_stderr": 0.02688264343402289 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.2122905027932961, + "acc_stderr": 0.013676644685831726, + "acc_norm": 0.2122905027932961, + "acc_norm_stderr": 0.013676644685831726 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.5228758169934641, + "acc_stderr": 0.028599936776089782, + "acc_norm": 0.5228758169934641, + "acc_norm_stderr": 0.028599936776089782 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.5562700964630225, + "acc_stderr": 0.028217683556652315, + "acc_norm": 0.5562700964630225, + "acc_norm_stderr": 0.028217683556652315 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.5524691358024691, + "acc_stderr": 0.0276671385694227, + "acc_norm": 0.5524691358024691, + "acc_norm_stderr": 0.0276671385694227 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.37943262411347517, + "acc_stderr": 0.028947338851614105, + "acc_norm": 0.37943262411347517, + "acc_norm_stderr": 0.028947338851614105 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.3455019556714472, + "acc_stderr": 0.012145303004087206, + "acc_norm": 0.3455019556714472, + "acc_norm_stderr": 0.012145303004087206 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.4852941176470588, + "acc_stderr": 0.03035969707904611, + "acc_norm": 0.4852941176470588, + "acc_norm_stderr": 0.03035969707904611 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.4869281045751634, + "acc_stderr": 0.020220920829626923, + "acc_norm": 0.4869281045751634, + "acc_norm_stderr": 0.020220920829626923 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.5272727272727272, + "acc_stderr": 0.04782001791380061, + "acc_norm": 0.5272727272727272, + "acc_norm_stderr": 0.04782001791380061 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.5795918367346938, + "acc_stderr": 0.03160106993449601, + "acc_norm": 0.5795918367346938, + "acc_norm_stderr": 0.03160106993449601 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.6318407960199005, + "acc_stderr": 0.03410410565495302, + "acc_norm": 0.6318407960199005, + "acc_norm_stderr": 0.03410410565495302 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.65, + "acc_stderr": 0.047937248544110196, + "acc_norm": 0.65, + "acc_norm_stderr": 0.047937248544110196 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.39759036144578314, + "acc_stderr": 0.038099730845402184, + "acc_norm": 0.39759036144578314, + "acc_norm_stderr": 0.038099730845402184 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.7192982456140351, + "acc_stderr": 0.03446296217088427, + "acc_norm": 0.7192982456140351, + "acc_norm_stderr": 0.03446296217088427 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.2778457772337821, + "mc1_stderr": 0.015680929364024637, + "mc2": 0.43423608772141165, + "mc2_stderr": 0.01468977817324311 + }, + "all": { + "acc": 0.49036600008197573, + "acc_stderr": 0.03524623648458121, + "acc_norm": 0.49432587695797897, + "acc_norm_stderr": 0.035234360851138714, + "mc1": 0.2778457772337821, + "mc1_stderr": 0.015680929364024637, + "mc2": 0.43423608772141165, + "mc2_stderr": 0.01468977817324311 + } + }, + "versions": { 
+ "harness|arc:challenge|25": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "all": 0 + }, + "config_tasks": { + "harness|arc:challenge": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + 
"harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task" + }, + "summary_tasks": { + "harness|arc:challenge|25": { + "hashes": { + "hash_examples": "17b0cae357c0259e", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "3722289b79076c44", + "hash_cont_tokens": "e8abf848493b50f7" + }, + "truncated": 0, + 
"non-truncated": 4687, + "padded": 4687, + "non-padded": 0, + "effective_few_shots": 25.0, + "num_truncated_few_shots": 0 + }, + "harness|hellaswag|10": { + "hashes": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "ececd684171f1ef2", + "hash_cont_tokens": "9fe0a5c42e1532db" + }, + "truncated": 0, + "non-truncated": 40168, + "padded": 40113, + "non-padded": 55, + "effective_few_shots": 10.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hashes": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "c54ff61ad0273dd7", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-anatomy|5": { + "hashes": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "be31a1e22aef5f90", + "hash_cont_tokens": "f11971a765cb609f" + }, + "truncated": 0, + "non-truncated": 540, + "padded": 540, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-astronomy|5": { + "hashes": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "277a7b1fad566940", + "hash_cont_tokens": "440a970fadecdc7b" + }, + "truncated": 0, + "non-truncated": 608, + "padded": 608, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-business_ethics|5": { + "hashes": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "ba552605bc116de5", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hashes": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "428c7563d0b98ab9", + "hash_cont_tokens": "7ecd60c25b9bfe5b" + }, + "truncated": 0, + "non-truncated": 1060, + "padded": 1060, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_biology|5": { + "hashes": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "da036601573942e2", + "hash_cont_tokens": "875cde3af7a0ee14" + }, + "truncated": 0, + "non-truncated": 576, + "padded": 576, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_chemistry|5": { + "hashes": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "94e0196d6aded13d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_computer_science|5": { + "hashes": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "6e4d0f4a8d36690b", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_mathematics|5": { + 
"hashes": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "614054d17109a25d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_medicine|5": { + "hashes": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "081bb2b524defd1c", + "hash_cont_tokens": "702fb6d82ff0d6ac" + }, + "truncated": 0, + "non-truncated": 692, + "padded": 692, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_physics|5": { + "hashes": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "5421d9a1af86cbd4", + "hash_cont_tokens": "f7b8097afc16a47c" + }, + "truncated": 0, + "non-truncated": 408, + "padded": 408, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-computer_security|5": { + "hashes": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "5e6b70ecb333cf18", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hashes": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "c2ef11a87264ceed", + "hash_cont_tokens": "aa0e8bc655f2f641" + }, + "truncated": 0, + "non-truncated": 940, + "padded": 940, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-econometrics|5": { + "hashes": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "ecaccd912a4c3978", + "hash_cont_tokens": "b1cc6e7e9fcd3827" + }, + "truncated": 0, + "non-truncated": 456, + "padded": 456, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hashes": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "1590c84291399be8", + "hash_cont_tokens": "2425a3f084a591ef" + }, + "truncated": 0, + "non-truncated": 580, + "padded": 580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hashes": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "3269597f715b0da1", + "hash_cont_tokens": "bd87bf0c060fd925" + }, + "truncated": 0, + "non-truncated": 1512, + "padded": 1512, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-formal_logic|5": { + "hashes": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "a2800d20f3ab8d7c", + "hash_cont_tokens": "eb8932890e0605db" + }, + "truncated": 0, + "non-truncated": 504, + "padded": 504, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-global_facts|5": { + "hashes": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "94ed44b3772505ad", + 
"hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_biology|5": { + "hashes": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "24423acb928db768", + "hash_cont_tokens": "1ddcb86d28cde266" + }, + "truncated": 0, + "non-truncated": 1240, + "padded": 1240, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hashes": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "831ff35c474e5cef", + "hash_cont_tokens": "176c8dcff38c5f8f" + }, + "truncated": 0, + "non-truncated": 812, + "padded": 812, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hashes": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "a20a96b44dcc5b30", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hashes": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "5002f4ac8b1562ca", + "hash_cont_tokens": "674fc454bdc5ac93" + }, + "truncated": 0, + "non-truncated": 660, + "padded": 656, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_geography|5": { + "hashes": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "7c5547c7da5bc793", + "hash_cont_tokens": "03a5012b916274ea" + }, + "truncated": 0, + "non-truncated": 792, + "padded": 792, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hashes": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "f62991cb6a496b05", + "hash_cont_tokens": "873d2aab226ba1d8" + }, + "truncated": 0, + "non-truncated": 772, + "padded": 772, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hashes": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "4cef2aff6e3d59ed", + "hash_cont_tokens": "c583432ad27fcfe0" + }, + "truncated": 0, + "non-truncated": 1560, + "padded": 1560, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hashes": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "6e2577ea4082ed2b", + "hash_cont_tokens": "d7907b61bcb8c123" + }, + "truncated": 0, + "non-truncated": 1080, + "padded": 1080, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hashes": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "c5fc9aeb1079c8e4", + "hash_cont_tokens": "f47f041de50333b9" + }, + "truncated": 0, + 
"non-truncated": 952, + "padded": 952, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_physics|5": { + "hashes": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "555fc385cffa84ca", + "hash_cont_tokens": "0d56317b3e5eedb5" + }, + "truncated": 0, + "non-truncated": 604, + "padded": 604, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hashes": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "febd23cbf9973b7f", + "hash_cont_tokens": "09ba1243e7390c0f" + }, + "truncated": 0, + "non-truncated": 2180, + "padded": 2180, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hashes": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "400e55b56ee6fbd7", + "hash_cont_tokens": "9cc29889c3d3f77d" + }, + "truncated": 0, + "non-truncated": 864, + "padded": 864, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hashes": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "c639cce12a46ebad", + "hash_cont_tokens": "cdd0b3dc06d933e5" + }, + "truncated": 0, + "non-truncated": 816, + "padded": 816, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hashes": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "b9762065cce6f3a6", + "hash_cont_tokens": "e02816433ff28daf" + }, + "truncated": 0, + "non-truncated": 948, + "padded": 948, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_aging|5": { + "hashes": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "541a75f071dcf579", + "hash_cont_tokens": "142a4a8a1138a214" + }, + "truncated": 0, + "non-truncated": 892, + "padded": 892, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_sexuality|5": { + "hashes": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "04269e5c5a257dd9", + "hash_cont_tokens": "bc54813e809b796d" + }, + "truncated": 0, + "non-truncated": 524, + "padded": 524, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-international_law|5": { + "hashes": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "d93ba9d9d38e4397", + "hash_cont_tokens": "8ea8c5ff76a15bca" + }, + "truncated": 0, + "non-truncated": 484, + "padded": 484, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-jurisprudence|5": { + "hashes": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "9eeaccd2698b4f5a", + "hash_cont_tokens": "e3a8cd951b6e3469" + }, + "truncated": 0, + "non-truncated": 432, + "padded": 432, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + 
"harness|hendrycksTest-logical_fallacies|5": { + "hashes": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "b4f08f544f2b7576", + "hash_cont_tokens": "3e9e0bdc248fd88a" + }, + "truncated": 0, + "non-truncated": 652, + "padded": 648, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-machine_learning|5": { + "hashes": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "900c2a51f1174b9f", + "hash_cont_tokens": "55b12fb138c6a064" + }, + "truncated": 0, + "non-truncated": 448, + "padded": 448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-management|5": { + "hashes": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "6b36efb4689c6eca", + "hash_cont_tokens": "a01d6d39a83c4597" + }, + "truncated": 0, + "non-truncated": 412, + "padded": 412, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-marketing|5": { + "hashes": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "2aaac78a0cfed47a", + "hash_cont_tokens": "6aeaed4d823c98aa" + }, + "truncated": 0, + "non-truncated": 936, + "padded": 936, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-medical_genetics|5": { + "hashes": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "886ca823b41c094a", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-miscellaneous|5": { + "hashes": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "72fd71de7675e7d0", + "hash_cont_tokens": "9b0ab02a64603081" + }, + "truncated": 0, + "non-truncated": 3132, + "padded": 3132, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_disputes|5": { + "hashes": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "f3ca0dd8e7a1eb09", + "hash_cont_tokens": "3b8bbe9108e55ce9" + }, + "truncated": 0, + "non-truncated": 1384, + "padded": 1354, + "non-padded": 30, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hashes": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "3e793631e951f23c", + "hash_cont_tokens": "3e9bfc0362e97330" + }, + "truncated": 0, + "non-truncated": 3580, + "padded": 3580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-nutrition|5": { + "hashes": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "59753c2144ea93af", + "hash_cont_tokens": "23b2dc6ee2da4cfc" + }, + "truncated": 0, + "non-truncated": 1224, + "padded": 1224, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-philosophy|5": { + "hashes": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": 
"bd8d3dbed15a8c34", + "hash_cont_tokens": "9f6ff69d23a48783" + }, + "truncated": 0, + "non-truncated": 1244, + "padded": 1244, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-prehistory|5": { + "hashes": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "3573cd87facbb7c5", + "hash_cont_tokens": "d6458d743d875837" + }, + "truncated": 0, + "non-truncated": 1296, + "padded": 1296, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_accounting|5": { + "hashes": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "17e721bc1a7cbb47", + "hash_cont_tokens": "922a195f53a35662" + }, + "truncated": 0, + "non-truncated": 1128, + "padded": 1128, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_law|5": { + "hashes": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "c9f7583fff66d361", + "hash_cont_tokens": "2e590029ef41fbcd" + }, + "truncated": 0, + "non-truncated": 6136, + "padded": 6136, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_medicine|5": { + "hashes": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "40a933f829116f8d", + "hash_cont_tokens": "7cfee54dbddd5a98" + }, + "truncated": 0, + "non-truncated": 1088, + "padded": 1088, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_psychology|5": { + "hashes": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "0dfb73a8eb3f692c", + "hash_cont_tokens": "a86677b2a45c20e1" + }, + "truncated": 0, + "non-truncated": 2448, + "padded": 2448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-public_relations|5": { + "hashes": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "1710c6ba4c9f3cbd", + "hash_cont_tokens": "0d756ccaae031757" + }, + "truncated": 0, + "non-truncated": 440, + "padded": 440, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-security_studies|5": { + "hashes": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "32a03f1f22a6e103", + "hash_cont_tokens": "b2229bc2cfbf594b" + }, + "truncated": 0, + "non-truncated": 980, + "padded": 980, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-sociology|5": { + "hashes": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "828999f7624cbe7e", + "hash_cont_tokens": "c3a3bdfd177eed5b" + }, + "truncated": 0, + "non-truncated": 804, + "padded": 804, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hashes": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "42054621e718dbee", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 
0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-virology|5": { + "hashes": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "6c4f0aa4dc859c04", + "hash_cont_tokens": "af8b3658088cb37f" + }, + "truncated": 0, + "non-truncated": 664, + "padded": 664, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-world_religions|5": { + "hashes": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "6c75d44e092ff24f", + "hash_cont_tokens": "060118bef6de4e0a" + }, + "truncated": 0, + "non-truncated": 684, + "padded": 684, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|truthfulqa:mc|0": { + "hashes": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "2738d7ed7075faa7", + "hash_cont_tokens": "f5da56a132aab151" + }, + "truncated": 0, + "non-truncated": 9996, + "padded": 9996, + "non-padded": 0, + "effective_few_shots": 0.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "d84d18e9a963753d", + "hash_full_prompts": "12b540783521a8e6", + "hash_input_tokens": "5c73a7dce6ccf737", + "hash_cont_tokens": "71d56183130fecbd" + }, + "total_evaluation_time_secondes": "4166.326728820801", + "truncated": 0, + "non-truncated": 111019, + "padded": 110926, + "non-padded": 93, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/Lazycuber/L2-7b-Guanaco-Uncensored/results_2023-10-25T07-21-20.584203.json b/eval-results/Lazycuber/L2-7b-Guanaco-Uncensored/results_2023-10-25T07-21-20.584203.json new file mode 100644 index 0000000000000000000000000000000000000000..be12b2b15fbd525d0a477201f87ae8892466cd44 --- /dev/null +++ b/eval-results/Lazycuber/L2-7b-Guanaco-Uncensored/results_2023-10-25T07-21-20.584203.json @@ -0,0 +1,107 @@ +{ + "config_general": { + "model_name": "Lazycuber/L2-7b-Guanaco-Uncensored", + "model_sha": "0dd5d093a206aa7930c3e13b0cf4e5a4c7945b98", + "model_size": "12.61 GB", + "model_dtype": "torch.float16", + "lighteval_sha": "0f318ecf002208468154899217b3ba7c6ae09374", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|drop|3": { + "em": 0.0014681208053691276, + "em_stderr": 0.00039210421902985767, + "f1": 0.056828859060402796, + "f1_stderr": 0.0013179206618636607 + }, + "harness|gsm8k|5": { + "acc": 0.07960576194086429, + "acc_stderr": 0.007455924338676278 + }, + "harness|winogrande|5": { + "acc": 0.7537490134175217, + "acc_stderr": 0.012108365307437523 + }, + "all": { + "em": 0.0014681208053691276, + "em_stderr": 0.00039210421902985767, + "f1": 0.056828859060402796, + "f1_stderr": 0.0013179206618636607, + "acc": 0.416677387679193, + "acc_stderr": 0.0097821448230569 + } + }, + "versions": { + "harness|drop|3": 1, + "harness|gsm8k|5": 0, + "harness|winogrande|5": 0, + "all": 0 + }, + "config_tasks": { + "harness|drop": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|drop|3": { + "hashes": { + "hash_examples": "1d27416e8324e9a3", + "hash_full_prompts": "a5513ff9a741b385", + "hash_input_tokens": "42076f0efbb50aa6", + "hash_cont_tokens": "6c4c0a15a547258d" + }, + "truncated": 3, + "non-truncated": 9533, + "padded": 0, + 
"non-padded": 9536, + "effective_few_shots": 3.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "bda342e47b5099b2", + "hash_cont_tokens": "dde6454510196e7c" + }, + "truncated": 0, + "non-truncated": 1319, + "padded": 0, + "non-padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "c0bedf98cb040854", + "hash_cont_tokens": "f08975ad6f2d5864" + }, + "truncated": 0, + "non-truncated": 2534, + "padded": 2432, + "non-padded": 102, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "9b4d8993161e637d", + "hash_full_prompts": "08215e527b7e60a5", + "hash_input_tokens": "a12f3e3c934bd78b", + "hash_cont_tokens": "21dd8d5546892d1c" + }, + "total_evaluation_time_secondes": "10531.30201625824", + "truncated": 3, + "non-truncated": 13386, + "padded": 2432, + "non-padded": 10957, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/Lazycuber/L2-7b-Orca-WVG-Test/results_2023-10-10T15-39-37.735727.json b/eval-results/Lazycuber/L2-7b-Orca-WVG-Test/results_2023-10-10T15-39-37.735727.json new file mode 100644 index 0000000000000000000000000000000000000000..81b23ff8c095b0c307229b3c047321768d9c878b --- /dev/null +++ b/eval-results/Lazycuber/L2-7b-Orca-WVG-Test/results_2023-10-10T15-39-37.735727.json @@ -0,0 +1,1367 @@ +{ + "config_general": { + "model_name": "Lazycuber/L2-7b-Orca-WVG-Test", + "model_sha": "6073a87872eb36149404bfb7d60e0108074ee1c3", + "model_size": "12.61 GB", + "model_dtype": "torch.float16", + "lighteval_sha": "0f318ecf002208468154899217b3ba7c6ae09374", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|arc:challenge|25": { + "acc": 0.5162116040955631, + "acc_stderr": 0.014603708567414938, + "acc_norm": 0.5486348122866894, + "acc_norm_stderr": 0.014542104569955267 + }, + "harness|hellaswag|10": { + "acc": 0.5910177255526787, + "acc_stderr": 0.00490641198447679, + "acc_norm": 0.782513443537144, + "acc_norm_stderr": 0.004116931383157353 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.28, + "acc_stderr": 0.04512608598542129, + "acc_norm": 0.28, + "acc_norm_stderr": 0.04512608598542129 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.5259259259259259, + "acc_stderr": 0.04313531696750575, + "acc_norm": 0.5259259259259259, + "acc_norm_stderr": 0.04313531696750575 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.4473684210526316, + "acc_stderr": 0.04046336883978251, + "acc_norm": 0.4473684210526316, + "acc_norm_stderr": 0.04046336883978251 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.52, + "acc_stderr": 0.050211673156867795, + "acc_norm": 0.52, + "acc_norm_stderr": 0.050211673156867795 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.6150943396226415, + "acc_stderr": 0.02994649856769995, + "acc_norm": 0.6150943396226415, + "acc_norm_stderr": 0.02994649856769995 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.5347222222222222, + "acc_stderr": 0.041711158581816184, + "acc_norm": 0.5347222222222222, + "acc_norm_stderr": 0.041711158581816184 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.36, + 
"acc_stderr": 0.048241815132442176, + "acc_norm": 0.36, + "acc_norm_stderr": 0.048241815132442176 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.42, + "acc_stderr": 0.04960449637488584, + "acc_norm": 0.42, + "acc_norm_stderr": 0.04960449637488584 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.31, + "acc_stderr": 0.04648231987117316, + "acc_norm": 0.31, + "acc_norm_stderr": 0.04648231987117316 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.4797687861271676, + "acc_stderr": 0.03809342081273957, + "acc_norm": 0.4797687861271676, + "acc_norm_stderr": 0.03809342081273957 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.27450980392156865, + "acc_stderr": 0.044405219061793275, + "acc_norm": 0.27450980392156865, + "acc_norm_stderr": 0.044405219061793275 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.6, + "acc_stderr": 0.049236596391733084, + "acc_norm": 0.6, + "acc_norm_stderr": 0.049236596391733084 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.4723404255319149, + "acc_stderr": 0.03263597118409769, + "acc_norm": 0.4723404255319149, + "acc_norm_stderr": 0.03263597118409769 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.30701754385964913, + "acc_stderr": 0.043391383225798615, + "acc_norm": 0.30701754385964913, + "acc_norm_stderr": 0.043391383225798615 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.4482758620689655, + "acc_stderr": 0.04144311810878152, + "acc_norm": 0.4482758620689655, + "acc_norm_stderr": 0.04144311810878152 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.2857142857142857, + "acc_stderr": 0.02326651221373057, + "acc_norm": 0.2857142857142857, + "acc_norm_stderr": 0.02326651221373057 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.2698412698412698, + "acc_stderr": 0.039701582732351734, + "acc_norm": 0.2698412698412698, + "acc_norm_stderr": 0.039701582732351734 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.38, + "acc_stderr": 0.048783173121456316, + "acc_norm": 0.38, + "acc_norm_stderr": 0.048783173121456316 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.5387096774193548, + "acc_stderr": 0.028358634859836935, + "acc_norm": 0.5387096774193548, + "acc_norm_stderr": 0.028358634859836935 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.3842364532019704, + "acc_stderr": 0.034223985656575494, + "acc_norm": 0.3842364532019704, + "acc_norm_stderr": 0.034223985656575494 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.46, + "acc_stderr": 0.05009082659620332, + "acc_norm": 0.46, + "acc_norm_stderr": 0.05009082659620332 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.6727272727272727, + "acc_stderr": 0.03663974994391245, + "acc_norm": 0.6727272727272727, + "acc_norm_stderr": 0.03663974994391245 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.6616161616161617, + "acc_stderr": 0.033711241426263014, + "acc_norm": 0.6616161616161617, + "acc_norm_stderr": 0.033711241426263014 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.7098445595854922, + "acc_stderr": 0.032752644677915166, + "acc_norm": 0.7098445595854922, + "acc_norm_stderr": 0.032752644677915166 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.44871794871794873, + "acc_stderr": 0.025217315184846482, + "acc_norm": 0.44871794871794873, + "acc_norm_stderr": 0.025217315184846482 + 
}, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.24814814814814815, + "acc_stderr": 0.0263357394040558, + "acc_norm": 0.24814814814814815, + "acc_norm_stderr": 0.0263357394040558 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.47478991596638653, + "acc_stderr": 0.0324371805513741, + "acc_norm": 0.47478991596638653, + "acc_norm_stderr": 0.0324371805513741 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.31125827814569534, + "acc_stderr": 0.03780445850526733, + "acc_norm": 0.31125827814569534, + "acc_norm_stderr": 0.03780445850526733 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.708256880733945, + "acc_stderr": 0.01948930096887651, + "acc_norm": 0.708256880733945, + "acc_norm_stderr": 0.01948930096887651 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.37962962962962965, + "acc_stderr": 0.03309682581119035, + "acc_norm": 0.37962962962962965, + "acc_norm_stderr": 0.03309682581119035 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.7107843137254902, + "acc_stderr": 0.031822318676475544, + "acc_norm": 0.7107843137254902, + "acc_norm_stderr": 0.031822318676475544 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.7510548523206751, + "acc_stderr": 0.028146970599422644, + "acc_norm": 0.7510548523206751, + "acc_norm_stderr": 0.028146970599422644 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.5964125560538116, + "acc_stderr": 0.03292802819330314, + "acc_norm": 0.5964125560538116, + "acc_norm_stderr": 0.03292802819330314 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.6259541984732825, + "acc_stderr": 0.04243869242230524, + "acc_norm": 0.6259541984732825, + "acc_norm_stderr": 0.04243869242230524 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.6611570247933884, + "acc_stderr": 0.043207678075366705, + "acc_norm": 0.6611570247933884, + "acc_norm_stderr": 0.043207678075366705 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.6481481481481481, + "acc_stderr": 0.046166311118017146, + "acc_norm": 0.6481481481481481, + "acc_norm_stderr": 0.046166311118017146 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.5705521472392638, + "acc_stderr": 0.03889066619112722, + "acc_norm": 0.5705521472392638, + "acc_norm_stderr": 0.03889066619112722 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.36607142857142855, + "acc_stderr": 0.0457237235873743, + "acc_norm": 0.36607142857142855, + "acc_norm_stderr": 0.0457237235873743 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.7281553398058253, + "acc_stderr": 0.044052680241409216, + "acc_norm": 0.7281553398058253, + "acc_norm_stderr": 0.044052680241409216 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.7435897435897436, + "acc_stderr": 0.028605953702004257, + "acc_norm": 0.7435897435897436, + "acc_norm_stderr": 0.028605953702004257 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.6, + "acc_stderr": 0.04923659639173309, + "acc_norm": 0.6, + "acc_norm_stderr": 0.04923659639173309 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.7164750957854407, + "acc_stderr": 0.016117318166832265, + "acc_norm": 0.7164750957854407, + "acc_norm_stderr": 0.016117318166832265 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.5606936416184971, + "acc_stderr": 0.026720034380514995, + "acc_norm": 0.5606936416184971, + "acc_norm_stderr": 0.026720034380514995 + }, + "harness|hendrycksTest-moral_scenarios|5": { + 
"acc": 0.2837988826815642, + "acc_stderr": 0.015078358970751752, + "acc_norm": 0.2837988826815642, + "acc_norm_stderr": 0.015078358970751752 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.5326797385620915, + "acc_stderr": 0.02856869975222587, + "acc_norm": 0.5326797385620915, + "acc_norm_stderr": 0.02856869975222587 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.6012861736334405, + "acc_stderr": 0.0278093225857745, + "acc_norm": 0.6012861736334405, + "acc_norm_stderr": 0.0278093225857745 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.558641975308642, + "acc_stderr": 0.027628737155668777, + "acc_norm": 0.558641975308642, + "acc_norm_stderr": 0.027628737155668777 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.3900709219858156, + "acc_stderr": 0.02909767559946393, + "acc_norm": 0.3900709219858156, + "acc_norm_stderr": 0.02909767559946393 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.379400260756193, + "acc_stderr": 0.012393202029825402, + "acc_norm": 0.379400260756193, + "acc_norm_stderr": 0.012393202029825402 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.47058823529411764, + "acc_stderr": 0.030320243265004137, + "acc_norm": 0.47058823529411764, + "acc_norm_stderr": 0.030320243265004137 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.4852941176470588, + "acc_stderr": 0.020219083895133924, + "acc_norm": 0.4852941176470588, + "acc_norm_stderr": 0.020219083895133924 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.5818181818181818, + "acc_stderr": 0.04724577405731572, + "acc_norm": 0.5818181818181818, + "acc_norm_stderr": 0.04724577405731572 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.5591836734693878, + "acc_stderr": 0.03178419114175363, + "acc_norm": 0.5591836734693878, + "acc_norm_stderr": 0.03178419114175363 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.6268656716417911, + "acc_stderr": 0.03419832608176007, + "acc_norm": 0.6268656716417911, + "acc_norm_stderr": 0.03419832608176007 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.73, + "acc_stderr": 0.044619604333847394, + "acc_norm": 0.73, + "acc_norm_stderr": 0.044619604333847394 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.41566265060240964, + "acc_stderr": 0.038367221765980515, + "acc_norm": 0.41566265060240964, + "acc_norm_stderr": 0.038367221765980515 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.7192982456140351, + "acc_stderr": 0.034462962170884265, + "acc_norm": 0.7192982456140351, + "acc_norm_stderr": 0.034462962170884265 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.3023255813953488, + "mc1_stderr": 0.016077509266133026, + "mc2": 0.436784676933468, + "mc2_stderr": 0.014891030280754473 + }, + "all": { + "acc": 0.512701362043529, + "acc_stderr": 0.03493895100033154, + "acc_norm": 0.5164965980804035, + "acc_norm_stderr": 0.03492452583764037, + "mc1": 0.3023255813953488, + "mc1_stderr": 0.016077509266133026, + "mc2": 0.436784676933468, + "mc2_stderr": 0.014891030280754473 + } + }, + "versions": { + "harness|arc:challenge|25": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + 
"harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "all": 0 + }, + "config_tasks": { + "harness|arc:challenge": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM 
Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task" + }, + "summary_tasks": { + "harness|arc:challenge|25": { + "hashes": { + "hash_examples": "17b0cae357c0259e", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "3722289b79076c44", + "hash_cont_tokens": "e8abf848493b50f7" + }, + "truncated": 0, + "non-truncated": 4687, + "padded": 4687, + "non-padded": 0, + "effective_few_shots": 25.0, + "num_truncated_few_shots": 0 + }, + "harness|hellaswag|10": { + "hashes": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "ececd684171f1ef2", + "hash_cont_tokens": "9fe0a5c42e1532db" + }, + "truncated": 0, + "non-truncated": 40168, + "padded": 40113, + "non-padded": 55, + 
"effective_few_shots": 10.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hashes": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "c54ff61ad0273dd7", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-anatomy|5": { + "hashes": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "be31a1e22aef5f90", + "hash_cont_tokens": "f11971a765cb609f" + }, + "truncated": 0, + "non-truncated": 540, + "padded": 540, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-astronomy|5": { + "hashes": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "277a7b1fad566940", + "hash_cont_tokens": "440a970fadecdc7b" + }, + "truncated": 0, + "non-truncated": 608, + "padded": 608, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-business_ethics|5": { + "hashes": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "ba552605bc116de5", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hashes": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "428c7563d0b98ab9", + "hash_cont_tokens": "7ecd60c25b9bfe5b" + }, + "truncated": 0, + "non-truncated": 1060, + "padded": 1060, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_biology|5": { + "hashes": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "da036601573942e2", + "hash_cont_tokens": "875cde3af7a0ee14" + }, + "truncated": 0, + "non-truncated": 576, + "padded": 576, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_chemistry|5": { + "hashes": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "94e0196d6aded13d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_computer_science|5": { + "hashes": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "6e4d0f4a8d36690b", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_mathematics|5": { + "hashes": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "614054d17109a25d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_medicine|5": { + "hashes": { + "hash_examples": 
"dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "081bb2b524defd1c", + "hash_cont_tokens": "702fb6d82ff0d6ac" + }, + "truncated": 0, + "non-truncated": 692, + "padded": 692, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_physics|5": { + "hashes": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "5421d9a1af86cbd4", + "hash_cont_tokens": "f7b8097afc16a47c" + }, + "truncated": 0, + "non-truncated": 408, + "padded": 408, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-computer_security|5": { + "hashes": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "5e6b70ecb333cf18", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hashes": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "c2ef11a87264ceed", + "hash_cont_tokens": "aa0e8bc655f2f641" + }, + "truncated": 0, + "non-truncated": 940, + "padded": 940, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-econometrics|5": { + "hashes": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "ecaccd912a4c3978", + "hash_cont_tokens": "b1cc6e7e9fcd3827" + }, + "truncated": 0, + "non-truncated": 456, + "padded": 456, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hashes": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "1590c84291399be8", + "hash_cont_tokens": "2425a3f084a591ef" + }, + "truncated": 0, + "non-truncated": 580, + "padded": 580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hashes": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "3269597f715b0da1", + "hash_cont_tokens": "bd87bf0c060fd925" + }, + "truncated": 0, + "non-truncated": 1512, + "padded": 1512, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-formal_logic|5": { + "hashes": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "a2800d20f3ab8d7c", + "hash_cont_tokens": "eb8932890e0605db" + }, + "truncated": 0, + "non-truncated": 504, + "padded": 504, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-global_facts|5": { + "hashes": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "94ed44b3772505ad", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_biology|5": { + "hashes": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "24423acb928db768", + "hash_cont_tokens": "1ddcb86d28cde266" + }, + 
"truncated": 0, + "non-truncated": 1240, + "padded": 1240, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hashes": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "831ff35c474e5cef", + "hash_cont_tokens": "176c8dcff38c5f8f" + }, + "truncated": 0, + "non-truncated": 812, + "padded": 812, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hashes": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "a20a96b44dcc5b30", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hashes": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "5002f4ac8b1562ca", + "hash_cont_tokens": "674fc454bdc5ac93" + }, + "truncated": 0, + "non-truncated": 660, + "padded": 656, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_geography|5": { + "hashes": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "7c5547c7da5bc793", + "hash_cont_tokens": "03a5012b916274ea" + }, + "truncated": 0, + "non-truncated": 792, + "padded": 792, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hashes": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "f62991cb6a496b05", + "hash_cont_tokens": "873d2aab226ba1d8" + }, + "truncated": 0, + "non-truncated": 772, + "padded": 772, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hashes": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "4cef2aff6e3d59ed", + "hash_cont_tokens": "c583432ad27fcfe0" + }, + "truncated": 0, + "non-truncated": 1560, + "padded": 1560, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hashes": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "6e2577ea4082ed2b", + "hash_cont_tokens": "d7907b61bcb8c123" + }, + "truncated": 0, + "non-truncated": 1080, + "padded": 1080, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hashes": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "c5fc9aeb1079c8e4", + "hash_cont_tokens": "f47f041de50333b9" + }, + "truncated": 0, + "non-truncated": 952, + "padded": 952, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_physics|5": { + "hashes": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "555fc385cffa84ca", + "hash_cont_tokens": "0d56317b3e5eedb5" + }, + "truncated": 0, + "non-truncated": 604, + "padded": 604, + "non-padded": 
0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hashes": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "febd23cbf9973b7f", + "hash_cont_tokens": "09ba1243e7390c0f" + }, + "truncated": 0, + "non-truncated": 2180, + "padded": 2180, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hashes": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "400e55b56ee6fbd7", + "hash_cont_tokens": "9cc29889c3d3f77d" + }, + "truncated": 0, + "non-truncated": 864, + "padded": 864, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hashes": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "c639cce12a46ebad", + "hash_cont_tokens": "cdd0b3dc06d933e5" + }, + "truncated": 0, + "non-truncated": 816, + "padded": 816, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hashes": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "b9762065cce6f3a6", + "hash_cont_tokens": "e02816433ff28daf" + }, + "truncated": 0, + "non-truncated": 948, + "padded": 948, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_aging|5": { + "hashes": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "541a75f071dcf579", + "hash_cont_tokens": "142a4a8a1138a214" + }, + "truncated": 0, + "non-truncated": 892, + "padded": 892, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_sexuality|5": { + "hashes": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "04269e5c5a257dd9", + "hash_cont_tokens": "bc54813e809b796d" + }, + "truncated": 0, + "non-truncated": 524, + "padded": 524, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-international_law|5": { + "hashes": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "d93ba9d9d38e4397", + "hash_cont_tokens": "8ea8c5ff76a15bca" + }, + "truncated": 0, + "non-truncated": 484, + "padded": 484, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-jurisprudence|5": { + "hashes": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "9eeaccd2698b4f5a", + "hash_cont_tokens": "e3a8cd951b6e3469" + }, + "truncated": 0, + "non-truncated": 432, + "padded": 432, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hashes": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "b4f08f544f2b7576", + "hash_cont_tokens": "3e9e0bdc248fd88a" + }, + "truncated": 0, + "non-truncated": 652, + "padded": 648, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-machine_learning|5": { + "hashes": { + 
"hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "900c2a51f1174b9f", + "hash_cont_tokens": "55b12fb138c6a064" + }, + "truncated": 0, + "non-truncated": 448, + "padded": 448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-management|5": { + "hashes": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "6b36efb4689c6eca", + "hash_cont_tokens": "a01d6d39a83c4597" + }, + "truncated": 0, + "non-truncated": 412, + "padded": 412, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-marketing|5": { + "hashes": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "2aaac78a0cfed47a", + "hash_cont_tokens": "6aeaed4d823c98aa" + }, + "truncated": 0, + "non-truncated": 936, + "padded": 936, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-medical_genetics|5": { + "hashes": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "886ca823b41c094a", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-miscellaneous|5": { + "hashes": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "72fd71de7675e7d0", + "hash_cont_tokens": "9b0ab02a64603081" + }, + "truncated": 0, + "non-truncated": 3132, + "padded": 3132, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_disputes|5": { + "hashes": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "f3ca0dd8e7a1eb09", + "hash_cont_tokens": "3b8bbe9108e55ce9" + }, + "truncated": 0, + "non-truncated": 1384, + "padded": 1354, + "non-padded": 30, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hashes": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "3e793631e951f23c", + "hash_cont_tokens": "3e9bfc0362e97330" + }, + "truncated": 0, + "non-truncated": 3580, + "padded": 3580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-nutrition|5": { + "hashes": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "59753c2144ea93af", + "hash_cont_tokens": "23b2dc6ee2da4cfc" + }, + "truncated": 0, + "non-truncated": 1224, + "padded": 1224, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-philosophy|5": { + "hashes": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "bd8d3dbed15a8c34", + "hash_cont_tokens": "9f6ff69d23a48783" + }, + "truncated": 0, + "non-truncated": 1244, + "padded": 1244, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-prehistory|5": { + "hashes": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "3573cd87facbb7c5", + "hash_cont_tokens": "d6458d743d875837" + }, + "truncated": 0, + 
"non-truncated": 1296, + "padded": 1296, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_accounting|5": { + "hashes": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "17e721bc1a7cbb47", + "hash_cont_tokens": "922a195f53a35662" + }, + "truncated": 0, + "non-truncated": 1128, + "padded": 1128, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_law|5": { + "hashes": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "c9f7583fff66d361", + "hash_cont_tokens": "2e590029ef41fbcd" + }, + "truncated": 0, + "non-truncated": 6136, + "padded": 6136, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_medicine|5": { + "hashes": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "40a933f829116f8d", + "hash_cont_tokens": "7cfee54dbddd5a98" + }, + "truncated": 0, + "non-truncated": 1088, + "padded": 1088, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_psychology|5": { + "hashes": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "0dfb73a8eb3f692c", + "hash_cont_tokens": "a86677b2a45c20e1" + }, + "truncated": 0, + "non-truncated": 2448, + "padded": 2448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-public_relations|5": { + "hashes": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "1710c6ba4c9f3cbd", + "hash_cont_tokens": "0d756ccaae031757" + }, + "truncated": 0, + "non-truncated": 440, + "padded": 440, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-security_studies|5": { + "hashes": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "32a03f1f22a6e103", + "hash_cont_tokens": "b2229bc2cfbf594b" + }, + "truncated": 0, + "non-truncated": 980, + "padded": 980, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-sociology|5": { + "hashes": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "828999f7624cbe7e", + "hash_cont_tokens": "c3a3bdfd177eed5b" + }, + "truncated": 0, + "non-truncated": 804, + "padded": 804, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hashes": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "42054621e718dbee", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-virology|5": { + "hashes": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "6c4f0aa4dc859c04", + "hash_cont_tokens": "af8b3658088cb37f" + }, + "truncated": 0, + "non-truncated": 664, + "padded": 664, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + 
"harness|hendrycksTest-world_religions|5": { + "hashes": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "6c75d44e092ff24f", + "hash_cont_tokens": "060118bef6de4e0a" + }, + "truncated": 0, + "non-truncated": 684, + "padded": 684, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|truthfulqa:mc|0": { + "hashes": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "2738d7ed7075faa7", + "hash_cont_tokens": "f5da56a132aab151" + }, + "truncated": 0, + "non-truncated": 9996, + "padded": 9996, + "non-padded": 0, + "effective_few_shots": 0.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "d84d18e9a963753d", + "hash_full_prompts": "12b540783521a8e6", + "hash_input_tokens": "5c73a7dce6ccf737", + "hash_cont_tokens": "71d56183130fecbd" + }, + "total_evaluation_time_secondes": "4410.121661186218", + "truncated": 0, + "non-truncated": 111019, + "padded": 110926, + "non-padded": 93, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/Lazycuber/L2-7b-Orca-WVG-Test/results_2023-10-26T20-44-34.027885.json b/eval-results/Lazycuber/L2-7b-Orca-WVG-Test/results_2023-10-26T20-44-34.027885.json new file mode 100644 index 0000000000000000000000000000000000000000..a95e101887f34c061e94f1b5a934283bc489ae55 --- /dev/null +++ b/eval-results/Lazycuber/L2-7b-Orca-WVG-Test/results_2023-10-26T20-44-34.027885.json @@ -0,0 +1,107 @@ +{ + "config_general": { + "model_name": "Lazycuber/L2-7b-Orca-WVG-Test", + "model_sha": "6073a87872eb36149404bfb7d60e0108074ee1c3", + "model_size": "12.61 GB", + "model_dtype": "torch.float16", + "lighteval_sha": "0f318ecf002208468154899217b3ba7c6ae09374", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|drop|3": { + "em": 0.002202181208053691, + "em_stderr": 0.00048005108166191996, + "f1": 0.07443687080536941, + "f1_stderr": 0.0016342523738966323 + }, + "harness|gsm8k|5": { + "acc": 0.0803639120545868, + "acc_stderr": 0.007488258573239077 + }, + "harness|winogrande|5": { + "acc": 0.7434885556432518, + "acc_stderr": 0.01227364800875999 + }, + "all": { + "em": 0.002202181208053691, + "em_stderr": 0.00048005108166191996, + "f1": 0.07443687080536941, + "f1_stderr": 0.0016342523738966323, + "acc": 0.4119262338489193, + "acc_stderr": 0.009880953290999535 + } + }, + "versions": { + "harness|drop|3": 1, + "harness|gsm8k|5": 0, + "harness|winogrande|5": 0, + "all": 0 + }, + "config_tasks": { + "harness|drop": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|drop|3": { + "hashes": { + "hash_examples": "1d27416e8324e9a3", + "hash_full_prompts": "a5513ff9a741b385", + "hash_input_tokens": "42076f0efbb50aa6", + "hash_cont_tokens": "9e4735573309f5bc" + }, + "truncated": 3, + "non-truncated": 9533, + "padded": 0, + "non-padded": 9536, + "effective_few_shots": 3.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "bda342e47b5099b2", + "hash_cont_tokens": "b5be616981ff04b4" + }, + "truncated": 0, + "non-truncated": 1319, + "padded": 0, + "non-padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + 
"hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "c0bedf98cb040854", + "hash_cont_tokens": "f08975ad6f2d5864" + }, + "truncated": 0, + "non-truncated": 2534, + "padded": 2432, + "non-padded": 102, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "9b4d8993161e637d", + "hash_full_prompts": "08215e527b7e60a5", + "hash_input_tokens": "a12f3e3c934bd78b", + "hash_cont_tokens": "80c168d15a30dd26" + }, + "total_evaluation_time_secondes": "9802.565352678299", + "truncated": 3, + "non-truncated": 13386, + "padded": 2432, + "non-padded": 10957, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/Lazycuber/pyg-instruct-wizardlm/results_2023-07-24T15-30-39.317119.json b/eval-results/Lazycuber/pyg-instruct-wizardlm/results_2023-07-24T15-30-39.317119.json new file mode 100644 index 0000000000000000000000000000000000000000..5276c4c95c93d8b4bc89e1b76c4ce9f67fb8b51a --- /dev/null +++ b/eval-results/Lazycuber/pyg-instruct-wizardlm/results_2023-07-24T15-30-39.317119.json @@ -0,0 +1,1365 @@ +{ + "results": { + "harness|arc:challenge|25": { + "acc": 0.3703071672354949, + "acc_stderr": 0.01411129875167495, + "acc_norm": 0.40955631399317405, + "acc_norm_stderr": 0.014370358632472442 + }, + "harness|hellaswag|10": { + "acc": 0.4907388966341366, + "acc_stderr": 0.004988925410522775, + "acc_norm": 0.6670981876120294, + "acc_norm_stderr": 0.004702886273189401 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.26, + "acc_stderr": 0.04408440022768081, + "acc_norm": 0.26, + "acc_norm_stderr": 0.04408440022768081 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.24444444444444444, + "acc_stderr": 0.03712537833614865, + "acc_norm": 0.24444444444444444, + "acc_norm_stderr": 0.03712537833614865 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.2631578947368421, + "acc_stderr": 0.035834961763610625, + "acc_norm": 0.2631578947368421, + "acc_norm_stderr": 0.035834961763610625 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.31, + "acc_stderr": 0.04648231987117316, + "acc_norm": 0.31, + "acc_norm_stderr": 0.04648231987117316 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.2641509433962264, + "acc_stderr": 0.027134291628741727, + "acc_norm": 0.2641509433962264, + "acc_norm_stderr": 0.027134291628741727 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.2638888888888889, + "acc_stderr": 0.03685651095897532, + "acc_norm": 0.2638888888888889, + "acc_norm_stderr": 0.03685651095897532 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.19, + "acc_stderr": 0.039427724440366234, + "acc_norm": 0.19, + "acc_norm_stderr": 0.039427724440366234 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.18, + "acc_stderr": 0.03861229196653695, + "acc_norm": 0.18, + "acc_norm_stderr": 0.03861229196653695 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.23, + "acc_stderr": 0.04229525846816506, + "acc_norm": 0.23, + "acc_norm_stderr": 0.04229525846816506 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.23121387283236994, + "acc_stderr": 0.03214737302029471, + "acc_norm": 0.23121387283236994, + "acc_norm_stderr": 0.03214737302029471 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.19607843137254902, + "acc_stderr": 0.03950581861179961, + "acc_norm": 0.19607843137254902, + "acc_norm_stderr": 0.03950581861179961 + }, + 
"harness|hendrycksTest-computer_security|5": { + "acc": 0.29, + "acc_stderr": 0.045604802157206845, + "acc_norm": 0.29, + "acc_norm_stderr": 0.045604802157206845 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.32340425531914896, + "acc_stderr": 0.030579442773610334, + "acc_norm": 0.32340425531914896, + "acc_norm_stderr": 0.030579442773610334 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.2719298245614035, + "acc_stderr": 0.04185774424022056, + "acc_norm": 0.2719298245614035, + "acc_norm_stderr": 0.04185774424022056 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.25517241379310346, + "acc_stderr": 0.03632984052707842, + "acc_norm": 0.25517241379310346, + "acc_norm_stderr": 0.03632984052707842 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.24867724867724866, + "acc_stderr": 0.022261817692400168, + "acc_norm": 0.24867724867724866, + "acc_norm_stderr": 0.022261817692400168 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.2222222222222222, + "acc_stderr": 0.037184890068181146, + "acc_norm": 0.2222222222222222, + "acc_norm_stderr": 0.037184890068181146 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.36, + "acc_stderr": 0.04824181513244218, + "acc_norm": 0.36, + "acc_norm_stderr": 0.04824181513244218 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.26129032258064516, + "acc_stderr": 0.02499305339776483, + "acc_norm": 0.26129032258064516, + "acc_norm_stderr": 0.02499305339776483 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.26108374384236455, + "acc_stderr": 0.0309037969521145, + "acc_norm": 0.26108374384236455, + "acc_norm_stderr": 0.0309037969521145 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.26, + "acc_stderr": 0.0440844002276808, + "acc_norm": 0.26, + "acc_norm_stderr": 0.0440844002276808 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.2909090909090909, + "acc_stderr": 0.03546563019624336, + "acc_norm": 0.2909090909090909, + "acc_norm_stderr": 0.03546563019624336 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.21717171717171718, + "acc_stderr": 0.02937661648494563, + "acc_norm": 0.21717171717171718, + "acc_norm_stderr": 0.02937661648494563 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.22279792746113988, + "acc_stderr": 0.03003114797764155, + "acc_norm": 0.22279792746113988, + "acc_norm_stderr": 0.03003114797764155 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.2512820512820513, + "acc_stderr": 0.021992016662370547, + "acc_norm": 0.2512820512820513, + "acc_norm_stderr": 0.021992016662370547 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.26666666666666666, + "acc_stderr": 0.02696242432507384, + "acc_norm": 0.26666666666666666, + "acc_norm_stderr": 0.02696242432507384 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.2184873949579832, + "acc_stderr": 0.02684151432295894, + "acc_norm": 0.2184873949579832, + "acc_norm_stderr": 0.02684151432295894 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.2052980132450331, + "acc_stderr": 0.03297986648473835, + "acc_norm": 0.2052980132450331, + "acc_norm_stderr": 0.03297986648473835 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.25871559633027524, + "acc_stderr": 0.018776052319619624, + "acc_norm": 0.25871559633027524, + "acc_norm_stderr": 0.018776052319619624 + }, + 
"harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.16666666666666666, + "acc_stderr": 0.025416428388767478, + "acc_norm": 0.16666666666666666, + "acc_norm_stderr": 0.025416428388767478 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.27941176470588236, + "acc_stderr": 0.03149328104507955, + "acc_norm": 0.27941176470588236, + "acc_norm_stderr": 0.03149328104507955 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.26582278481012656, + "acc_stderr": 0.028756799629658335, + "acc_norm": 0.26582278481012656, + "acc_norm_stderr": 0.028756799629658335 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.34080717488789236, + "acc_stderr": 0.0318114974705536, + "acc_norm": 0.34080717488789236, + "acc_norm_stderr": 0.0318114974705536 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.1984732824427481, + "acc_stderr": 0.03498149385462473, + "acc_norm": 0.1984732824427481, + "acc_norm_stderr": 0.03498149385462473 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.35537190082644626, + "acc_stderr": 0.04369236326573981, + "acc_norm": 0.35537190082644626, + "acc_norm_stderr": 0.04369236326573981 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.39814814814814814, + "acc_stderr": 0.04732332615978815, + "acc_norm": 0.39814814814814814, + "acc_norm_stderr": 0.04732332615978815 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.26380368098159507, + "acc_stderr": 0.03462419931615623, + "acc_norm": 0.26380368098159507, + "acc_norm_stderr": 0.03462419931615623 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.26785714285714285, + "acc_stderr": 0.04203277291467764, + "acc_norm": 0.26785714285714285, + "acc_norm_stderr": 0.04203277291467764 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.21359223300970873, + "acc_stderr": 0.04058042015646034, + "acc_norm": 0.21359223300970873, + "acc_norm_stderr": 0.04058042015646034 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.28205128205128205, + "acc_stderr": 0.02948036054954119, + "acc_norm": 0.28205128205128205, + "acc_norm_stderr": 0.02948036054954119 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.25, + "acc_stderr": 0.04351941398892446, + "acc_norm": 0.25, + "acc_norm_stderr": 0.04351941398892446 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.30779054916985954, + "acc_stderr": 0.016506045045155637, + "acc_norm": 0.30779054916985954, + "acc_norm_stderr": 0.016506045045155637 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.2630057803468208, + "acc_stderr": 0.02370309952525817, + "acc_norm": 0.2630057803468208, + "acc_norm_stderr": 0.02370309952525817 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.27039106145251396, + "acc_stderr": 0.01485499393801009, + "acc_norm": 0.27039106145251396, + "acc_norm_stderr": 0.01485499393801009 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.28431372549019607, + "acc_stderr": 0.025829163272757482, + "acc_norm": 0.28431372549019607, + "acc_norm_stderr": 0.025829163272757482 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.28938906752411575, + "acc_stderr": 0.025755865922632938, + "acc_norm": 0.28938906752411575, + "acc_norm_stderr": 0.025755865922632938 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.30246913580246915, + "acc_stderr": 0.025557653981868052, + "acc_norm": 0.30246913580246915, + "acc_norm_stderr": 0.025557653981868052 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 
0.2730496453900709, + "acc_stderr": 0.026577860943307847, + "acc_norm": 0.2730496453900709, + "acc_norm_stderr": 0.026577860943307847 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.27249022164276404, + "acc_stderr": 0.011371658294311538, + "acc_norm": 0.27249022164276404, + "acc_norm_stderr": 0.011371658294311538 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.1801470588235294, + "acc_stderr": 0.023345163616544873, + "acc_norm": 0.1801470588235294, + "acc_norm_stderr": 0.023345163616544873 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.272875816993464, + "acc_stderr": 0.01802047414839358, + "acc_norm": 0.272875816993464, + "acc_norm_stderr": 0.01802047414839358 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.2909090909090909, + "acc_stderr": 0.04350271442923243, + "acc_norm": 0.2909090909090909, + "acc_norm_stderr": 0.04350271442923243 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.23673469387755103, + "acc_stderr": 0.02721283588407315, + "acc_norm": 0.23673469387755103, + "acc_norm_stderr": 0.02721283588407315 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.26865671641791045, + "acc_stderr": 0.03134328358208955, + "acc_norm": 0.26865671641791045, + "acc_norm_stderr": 0.03134328358208955 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.25, + "acc_stderr": 0.04351941398892446, + "acc_norm": 0.25, + "acc_norm_stderr": 0.04351941398892446 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.3313253012048193, + "acc_stderr": 0.03664314777288085, + "acc_norm": 0.3313253012048193, + "acc_norm_stderr": 0.03664314777288085 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.3157894736842105, + "acc_stderr": 0.03565079670708311, + "acc_norm": 0.3157894736842105, + "acc_norm_stderr": 0.03565079670708311 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.19583843329253367, + "mc1_stderr": 0.013892344367742086, + "mc2": 0.3193114626837606, + "mc2_stderr": 0.013554866615873247 + }, + "all": { + "acc": 0.268990380147086, + "acc_stderr": 0.03196915174899114, + "acc_norm": 0.27264476044768887, + "acc_norm_stderr": 0.031968694473456634, + "mc1": 0.19583843329253367, + "mc1_stderr": 0.013892344367742086, + "mc2": 0.3193114626837606, + "mc2_stderr": 0.013554866615873247 + } + }, + "versions": { + "harness|arc:challenge|25": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + 
"harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "all": 0 + }, + "config_general": { + "model_name": "Lazycuber/pyg-instruct-wizardlm", + "model_sha": "f00ef7a7b0cc6f02af2a11ac764270dfd61b9e2f", + "model_dtype": "torch.float16", + "lighteval_sha": "03c2fad20ff7f5334c33cfee459024b8d7e4a109", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null + }, + "config_tasks": { + "harness|arc:challenge": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + 
"harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task" + }, + "summary_tasks": { + "harness|arc:challenge|25": { + "hashes": { + "hash_examples": "17b0cae357c0259e", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "1b78325b154497a6", + "hash_cont_tokens": "c6e2e25e2b25a621" + }, + "truncated": 0, + "non-truncated": 4687, + "padded": 4685, + "non-padded": 2, + "effective_few_shots": 25.0, + "num_truncated_few_shots": 0 + }, + "harness|hellaswag|10": { + "hashes": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "97de5fb5652ec7fa", + "hash_cont_tokens": "8ad5f1a3e4068f36" + }, + "truncated": 0, + "non-truncated": 40168, + "padded": 40045, + "non-padded": 123, + "effective_few_shots": 10.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hashes": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "38f6980885e34dfd", + "hash_cont_tokens": "844bd0bf669e8136" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + 
"num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-anatomy|5": { + "hashes": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "3ed9431cd09b2a53", + "hash_cont_tokens": "aa3ffb1a6e4356f5" + }, + "truncated": 0, + "non-truncated": 540, + "padded": 540, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-astronomy|5": { + "hashes": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "a79fd75ecff4dacc", + "hash_cont_tokens": "ca7527d5bdfd389a" + }, + "truncated": 0, + "non-truncated": 608, + "padded": 608, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-business_ethics|5": { + "hashes": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "178d5666661bf5e1", + "hash_cont_tokens": "08a1fa6c8dde9a82" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hashes": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "c926698f7ff06973", + "hash_cont_tokens": "cd61f7de0830a75a" + }, + "truncated": 0, + "non-truncated": 1060, + "padded": 1060, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_biology|5": { + "hashes": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "242f772c5e78312a", + "hash_cont_tokens": "b0c14ed86adbcb56" + }, + "truncated": 0, + "non-truncated": 576, + "padded": 568, + "non-padded": 8, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_chemistry|5": { + "hashes": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "8502d8627d2d7aad", + "hash_cont_tokens": "844bd0bf669e8136" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_computer_science|5": { + "hashes": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "8bf46ce3a98e6e3f", + "hash_cont_tokens": "3cf1924b14cbf906" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_mathematics|5": { + "hashes": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "ff09ef7f164943cd", + "hash_cont_tokens": "d09bf08193410dfa" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_medicine|5": { + "hashes": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "af38d1bbc0517ac5", + "hash_cont_tokens": "62bb469d2a319d91" + }, + "truncated": 0, + "non-truncated": 692, + "padded": 680, + "non-padded": 12, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_physics|5": { + "hashes": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": 
"833a0d7b55aed500", + "hash_input_tokens": "c4240f372187f487", + "hash_cont_tokens": "bf103c9a1f61ec12" + }, + "truncated": 0, + "non-truncated": 408, + "padded": 404, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-computer_security|5": { + "hashes": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "70a866a1c6ae11ae", + "hash_cont_tokens": "844bd0bf669e8136" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hashes": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "29b68a5b3f3afa5f", + "hash_cont_tokens": "ff5ca3d84bb47a0b" + }, + "truncated": 0, + "non-truncated": 940, + "padded": 940, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-econometrics|5": { + "hashes": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "a4a0fc579875cdf9", + "hash_cont_tokens": "f3ed369e135c0e74" + }, + "truncated": 0, + "non-truncated": 456, + "padded": 456, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hashes": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "e1c0ec634eb17ebd", + "hash_cont_tokens": "35bf6c0c1a7ee403" + }, + "truncated": 0, + "non-truncated": 580, + "padded": 580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hashes": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "542453ad0f99dacf", + "hash_cont_tokens": "e69647d0f0359a4e" + }, + "truncated": 0, + "non-truncated": 1512, + "padded": 1488, + "non-padded": 24, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-formal_logic|5": { + "hashes": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "dacff0458f665ef2", + "hash_cont_tokens": "2ef491ecaa0b411b" + }, + "truncated": 0, + "non-truncated": 504, + "padded": 504, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-global_facts|5": { + "hashes": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "61dec75d557c2e93", + "hash_cont_tokens": "844bd0bf669e8136" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_biology|5": { + "hashes": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "d0afdf91820cacc8", + "hash_cont_tokens": "2f65e8345a68d860" + }, + "truncated": 0, + "non-truncated": 1240, + "padded": 1240, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hashes": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "75cd47b5490da17b", + "hash_cont_tokens": "c3deabee1deab3a3" + }, + "truncated": 0, + "non-truncated": 
812, + "padded": 796, + "non-padded": 16, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hashes": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "6c6256000dbf914a", + "hash_cont_tokens": "ec161287ac6222f4" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hashes": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "3e24478a8854bd77", + "hash_cont_tokens": "c4f2565ca36881d5" + }, + "truncated": 660, + "non-truncated": 0, + "padded": 0, + "non-padded": 660, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_geography|5": { + "hashes": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "a4866b51f8a7a60e", + "hash_cont_tokens": "780e569058de22be" + }, + "truncated": 0, + "non-truncated": 792, + "padded": 792, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hashes": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "90f755f89d9fdf5e", + "hash_cont_tokens": "9da45062757ae791" + }, + "truncated": 0, + "non-truncated": 772, + "padded": 772, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hashes": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "fb590ff6d9d11883", + "hash_cont_tokens": "8f5c8baf02161f10" + }, + "truncated": 0, + "non-truncated": 1560, + "padded": 1560, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hashes": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "551dbc75535ad2b8", + "hash_cont_tokens": "fdea101837ab4409" + }, + "truncated": 0, + "non-truncated": 1080, + "padded": 1080, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hashes": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "d86fdf5706ec717c", + "hash_cont_tokens": "985403b262df21a4" + }, + "truncated": 0, + "non-truncated": 952, + "padded": 940, + "non-padded": 12, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_physics|5": { + "hashes": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "a81bca26abd92c41", + "hash_cont_tokens": "56be0c12b78c81a3" + }, + "truncated": 0, + "non-truncated": 604, + "padded": 604, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hashes": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "9c10077b5cda495b", + "hash_cont_tokens": "f524cf6fe64b2a7f" + }, + "truncated": 0, + "non-truncated": 2180, + "padded": 2180, + "non-padded": 0, + "effective_few_shots": 
5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hashes": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "092923836e135996", + "hash_cont_tokens": "421b3dc903711e3d" + }, + "truncated": 0, + "non-truncated": 864, + "padded": 864, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hashes": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "4ab213491f557f31", + "hash_cont_tokens": "eab825cf8fbdd085" + }, + "truncated": 816, + "non-truncated": 0, + "padded": 0, + "non-padded": 816, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hashes": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "2a04fb615e6717ea", + "hash_cont_tokens": "e1610a0b694e7b3a" + }, + "truncated": 0, + "non-truncated": 948, + "padded": 948, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_aging|5": { + "hashes": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "39da19ee58ce07e6", + "hash_cont_tokens": "38eafdb22e9fca11" + }, + "truncated": 0, + "non-truncated": 892, + "padded": 892, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_sexuality|5": { + "hashes": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "f7e0441ab1c223e0", + "hash_cont_tokens": "11de075f88fc7cd2" + }, + "truncated": 0, + "non-truncated": 524, + "padded": 524, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-international_law|5": { + "hashes": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "119859c5b8103d0b", + "hash_cont_tokens": "0229c63f045574c2" + }, + "truncated": 0, + "non-truncated": 484, + "padded": 484, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-jurisprudence|5": { + "hashes": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "6ec4910e741606cb", + "hash_cont_tokens": "5c77c6f472688075" + }, + "truncated": 0, + "non-truncated": 432, + "padded": 432, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hashes": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "96d8b2554f777e3a", + "hash_cont_tokens": "25a46284b3589e0d" + }, + "truncated": 0, + "non-truncated": 652, + "padded": 636, + "non-padded": 16, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-machine_learning|5": { + "hashes": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "249811a7d891a411", + "hash_cont_tokens": "d11f2c877fe691dc" + }, + "truncated": 0, + "non-truncated": 448, + "padded": 448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-management|5": { + "hashes": { + "hash_examples": "8c8a1e07a2151dca", + 
"hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "e54df495ffeb4f92", + "hash_cont_tokens": "d37808f586a9e9b5" + }, + "truncated": 0, + "non-truncated": 412, + "padded": 412, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-marketing|5": { + "hashes": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "e9110fe64f420eb5", + "hash_cont_tokens": "95faf210efa02f90" + }, + "truncated": 0, + "non-truncated": 936, + "padded": 936, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-medical_genetics|5": { + "hashes": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "743df5701590c1c5", + "hash_cont_tokens": "844bd0bf669e8136" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-miscellaneous|5": { + "hashes": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "4a20a40ea36bad2d", + "hash_cont_tokens": "ef1ae838a09a7521" + }, + "truncated": 0, + "non-truncated": 3132, + "padded": 3132, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_disputes|5": { + "hashes": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "10886977e5516586", + "hash_cont_tokens": "05c35d0e7dd2c7d4" + }, + "truncated": 0, + "non-truncated": 1384, + "padded": 1372, + "non-padded": 12, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hashes": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "66f56ab7c3b9d662", + "hash_cont_tokens": "f1e9e326e9540108" + }, + "truncated": 0, + "non-truncated": 3580, + "padded": 3580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-nutrition|5": { + "hashes": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "c05c54560499ea35", + "hash_cont_tokens": "027ac34198453c9e" + }, + "truncated": 0, + "non-truncated": 1224, + "padded": 1224, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-philosophy|5": { + "hashes": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "9639c3d92ff98a28", + "hash_cont_tokens": "dddff9925c9b675a" + }, + "truncated": 0, + "non-truncated": 1244, + "padded": 1244, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-prehistory|5": { + "hashes": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "91e98834c3a8d8d9", + "hash_cont_tokens": "030e5bb46551865c" + }, + "truncated": 0, + "non-truncated": 1296, + "padded": 1296, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_accounting|5": { + "hashes": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "569fa47691c73088", + "hash_cont_tokens": "42b23299e8bae480" + }, + "truncated": 0, + "non-truncated": 1128, 
+ "padded": 1124, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_law|5": { + "hashes": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "999e8c7cf55b590c", + "hash_cont_tokens": "a2de48df0afbaff7" + }, + "truncated": 16, + "non-truncated": 6120, + "padded": 6120, + "non-padded": 16, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_medicine|5": { + "hashes": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "cb68733b835e69f0", + "hash_cont_tokens": "33dc7eccd5de31ae" + }, + "truncated": 0, + "non-truncated": 1088, + "padded": 1088, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_psychology|5": { + "hashes": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "3aa766c029099569", + "hash_cont_tokens": "2a666dc39f1f52ac" + }, + "truncated": 0, + "non-truncated": 2448, + "padded": 2448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-public_relations|5": { + "hashes": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "87b924f88832986f", + "hash_cont_tokens": "cf3600a50782c6c5" + }, + "truncated": 0, + "non-truncated": 440, + "padded": 440, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-security_studies|5": { + "hashes": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "c2b75c24a925a416", + "hash_cont_tokens": "2e9916279a4cae95" + }, + "truncated": 0, + "non-truncated": 980, + "padded": 980, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-sociology|5": { + "hashes": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "fb555df6139eb2c8", + "hash_cont_tokens": "555f7a55738bbf37" + }, + "truncated": 0, + "non-truncated": 804, + "padded": 800, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hashes": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "56cf1eebb25eccb1", + "hash_cont_tokens": "844bd0bf669e8136" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-virology|5": { + "hashes": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "c6affac16ec860be", + "hash_cont_tokens": "30d4fa4828c5468f" + }, + "truncated": 0, + "non-truncated": 664, + "padded": 664, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-world_religions|5": { + "hashes": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "d2c5da5a69a6312e", + "hash_cont_tokens": "984061eb58124367" + }, + "truncated": 0, + "non-truncated": 684, + "padded": 684, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|truthfulqa:mc|0": { + "hashes": { + 
"hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "21ee2f46c9c3649e", + "hash_cont_tokens": "f41d0880e9a23f4e" + }, + "truncated": 0, + "non-truncated": 9996, + "padded": 9996, + "non-padded": 0, + "effective_few_shots": 0.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "d84d18e9a963753d", + "hash_full_prompts": "12b540783521a8e6", + "hash_input_tokens": "0893dfcb83435e7d", + "hash_cont_tokens": "6159bf1904a8c8fb" + }, + "total_evaluation_time_secondes": "2386.1140110492706", + "truncated": 1492, + "non-truncated": 109527, + "padded": 109290, + "non-padded": 1729, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/Lazycuber/pyg-instruct-wizardlm/results_2023-10-28T08-03-29.005419.json b/eval-results/Lazycuber/pyg-instruct-wizardlm/results_2023-10-28T08-03-29.005419.json new file mode 100644 index 0000000000000000000000000000000000000000..64f1336a61d3193ae9e9293112a392f7c4873dc7 --- /dev/null +++ b/eval-results/Lazycuber/pyg-instruct-wizardlm/results_2023-10-28T08-03-29.005419.json @@ -0,0 +1,107 @@ +{ + "config_general": { + "model_name": "Lazycuber/pyg-instruct-wizardlm", + "model_sha": "f00ef7a7b0cc6f02af2a11ac764270dfd61b9e2f", + "model_size": "11.28 GB", + "model_dtype": "torch.float16", + "lighteval_sha": "0f318ecf002208468154899217b3ba7c6ae09374", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|drop|3": { + "em": 0.01971476510067114, + "em_stderr": 0.0014236777096831824, + "f1": 0.07215394295302006, + "f1_stderr": 0.001870662901719372 + }, + "harness|gsm8k|5": { + "acc": 0.01592115238817286, + "acc_stderr": 0.0034478192723889907 + }, + "harness|winogrande|5": { + "acc": 0.6369376479873717, + "acc_stderr": 0.01351519186647922 + }, + "all": { + "em": 0.01971476510067114, + "em_stderr": 0.0014236777096831824, + "f1": 0.07215394295302006, + "f1_stderr": 0.001870662901719372, + "acc": 0.3264294001877723, + "acc_stderr": 0.008481505569434104 + } + }, + "versions": { + "harness|drop|3": 1, + "harness|gsm8k|5": 0, + "harness|winogrande|5": 0, + "all": 0 + }, + "config_tasks": { + "harness|drop": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|drop|3": { + "hashes": { + "hash_examples": "1d27416e8324e9a3", + "hash_full_prompts": "a5513ff9a741b385", + "hash_input_tokens": "f21277d2c2d2e06c", + "hash_cont_tokens": "70057140906f3dc9" + }, + "truncated": 382, + "non-truncated": 9154, + "padded": 0, + "non-padded": 9536, + "effective_few_shots": 3.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "3ab9b4c5105492a3", + "hash_cont_tokens": "43b2d6a71a9b8778" + }, + "truncated": 0, + "non-truncated": 1319, + "padded": 0, + "non-padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "84cacac1590bb0a5", + "hash_cont_tokens": "64ca3ed9b5dacc6e" + }, + "truncated": 0, + "non-truncated": 2534, + "padded": 2426, + "non-padded": 108, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "9b4d8993161e637d", + 
"hash_full_prompts": "08215e527b7e60a5", + "hash_input_tokens": "26cd3631535039d0", + "hash_cont_tokens": "11f1514d6ff8dabd" + }, + "total_evaluation_time_secondes": "8594.77758193016", + "truncated": 382, + "non-truncated": 13007, + "padded": 2426, + "non-padded": 10963, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/LoupGarou/WizardCoder-Guanaco-15B-V1.0/results_2023-07-24T16-55-06.473074.json b/eval-results/LoupGarou/WizardCoder-Guanaco-15B-V1.0/results_2023-07-24T16-55-06.473074.json new file mode 100644 index 0000000000000000000000000000000000000000..f5841ced4aa10e35ffa3c2ccbb6a5191bace73b4 --- /dev/null +++ b/eval-results/LoupGarou/WizardCoder-Guanaco-15B-V1.0/results_2023-07-24T16-55-06.473074.json @@ -0,0 +1,1365 @@ +{ + "results": { + "harness|arc:challenge|25": { + "acc": 0.2883959044368601, + "acc_stderr": 0.013238394422428182, + "acc_norm": 0.3046075085324232, + "acc_norm_stderr": 0.013449522109932492 + }, + "harness|hellaswag|10": { + "acc": 0.37273451503684524, + "acc_stderr": 0.004825441080261174, + "acc_norm": 0.4558852818163712, + "acc_norm_stderr": 0.0049703221569979425 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.19, + "acc_stderr": 0.03942772444036624, + "acc_norm": 0.19, + "acc_norm_stderr": 0.03942772444036624 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.24444444444444444, + "acc_stderr": 0.037125378336148665, + "acc_norm": 0.24444444444444444, + "acc_norm_stderr": 0.037125378336148665 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.2565789473684211, + "acc_stderr": 0.0355418036802569, + "acc_norm": 0.2565789473684211, + "acc_norm_stderr": 0.0355418036802569 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.33, + "acc_stderr": 0.04725815626252604, + "acc_norm": 0.33, + "acc_norm_stderr": 0.04725815626252604 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.2528301886792453, + "acc_stderr": 0.026749899771241235, + "acc_norm": 0.2528301886792453, + "acc_norm_stderr": 0.026749899771241235 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.2569444444444444, + "acc_stderr": 0.03653946969442099, + "acc_norm": 0.2569444444444444, + "acc_norm_stderr": 0.03653946969442099 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.31, + "acc_stderr": 0.04648231987117316, + "acc_norm": 0.31, + "acc_norm_stderr": 0.04648231987117316 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.27, + "acc_stderr": 0.04461960433384741, + "acc_norm": 0.27, + "acc_norm_stderr": 0.04461960433384741 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.24, + "acc_stderr": 0.04292346959909282, + "acc_norm": 0.24, + "acc_norm_stderr": 0.04292346959909282 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.3179190751445087, + "acc_stderr": 0.03550683989165581, + "acc_norm": 0.3179190751445087, + "acc_norm_stderr": 0.03550683989165581 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.2549019607843137, + "acc_stderr": 0.043364327079931785, + "acc_norm": 0.2549019607843137, + "acc_norm_stderr": 0.043364327079931785 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.37, + "acc_stderr": 0.04852365870939098, + "acc_norm": 0.37, + "acc_norm_stderr": 0.04852365870939098 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.32340425531914896, + "acc_stderr": 0.030579442773610334, + "acc_norm": 0.32340425531914896, + "acc_norm_stderr": 0.030579442773610334 + }, + 
"harness|hendrycksTest-econometrics|5": { + "acc": 0.30701754385964913, + "acc_stderr": 0.04339138322579859, + "acc_norm": 0.30701754385964913, + "acc_norm_stderr": 0.04339138322579859 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.3793103448275862, + "acc_stderr": 0.04043461861916747, + "acc_norm": 0.3793103448275862, + "acc_norm_stderr": 0.04043461861916747 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.20634920634920634, + "acc_stderr": 0.020842290930114683, + "acc_norm": 0.20634920634920634, + "acc_norm_stderr": 0.020842290930114683 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.2857142857142857, + "acc_stderr": 0.040406101782088394, + "acc_norm": 0.2857142857142857, + "acc_norm_stderr": 0.040406101782088394 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.19, + "acc_stderr": 0.039427724440366255, + "acc_norm": 0.19, + "acc_norm_stderr": 0.039427724440366255 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.22580645161290322, + "acc_stderr": 0.023785577884181012, + "acc_norm": 0.22580645161290322, + "acc_norm_stderr": 0.023785577884181012 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.21674876847290642, + "acc_stderr": 0.028990331252516235, + "acc_norm": 0.21674876847290642, + "acc_norm_stderr": 0.028990331252516235 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.33, + "acc_stderr": 0.04725815626252604, + "acc_norm": 0.33, + "acc_norm_stderr": 0.04725815626252604 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.24848484848484848, + "acc_stderr": 0.033744026441394036, + "acc_norm": 0.24848484848484848, + "acc_norm_stderr": 0.033744026441394036 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.1919191919191919, + "acc_stderr": 0.028057791672989017, + "acc_norm": 0.1919191919191919, + "acc_norm_stderr": 0.028057791672989017 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.21243523316062177, + "acc_stderr": 0.029519282616817244, + "acc_norm": 0.21243523316062177, + "acc_norm_stderr": 0.029519282616817244 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.2128205128205128, + "acc_stderr": 0.020752423722128006, + "acc_norm": 0.2128205128205128, + "acc_norm_stderr": 0.020752423722128006 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.2740740740740741, + "acc_stderr": 0.027195934804085626, + "acc_norm": 0.2740740740740741, + "acc_norm_stderr": 0.027195934804085626 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.226890756302521, + "acc_stderr": 0.02720537153827946, + "acc_norm": 0.226890756302521, + "acc_norm_stderr": 0.02720537153827946 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.23178807947019867, + "acc_stderr": 0.034454062719870546, + "acc_norm": 0.23178807947019867, + "acc_norm_stderr": 0.034454062719870546 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.20550458715596331, + "acc_stderr": 0.01732435232501601, + "acc_norm": 0.20550458715596331, + "acc_norm_stderr": 0.01732435232501601 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.16203703703703703, + "acc_stderr": 0.02513045365226846, + "acc_norm": 0.16203703703703703, + "acc_norm_stderr": 0.02513045365226846 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.2647058823529412, + "acc_stderr": 0.0309645179269234, + "acc_norm": 0.2647058823529412, + "acc_norm_stderr": 
0.0309645179269234 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.2742616033755274, + "acc_stderr": 0.029041333510598035, + "acc_norm": 0.2742616033755274, + "acc_norm_stderr": 0.029041333510598035 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.3273542600896861, + "acc_stderr": 0.031493846709941306, + "acc_norm": 0.3273542600896861, + "acc_norm_stderr": 0.031493846709941306 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.26717557251908397, + "acc_stderr": 0.038808483010823944, + "acc_norm": 0.26717557251908397, + "acc_norm_stderr": 0.038808483010823944 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.2975206611570248, + "acc_stderr": 0.04173349148083498, + "acc_norm": 0.2975206611570248, + "acc_norm_stderr": 0.04173349148083498 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.26851851851851855, + "acc_stderr": 0.04284467968052192, + "acc_norm": 0.26851851851851855, + "acc_norm_stderr": 0.04284467968052192 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.22085889570552147, + "acc_stderr": 0.03259177392742177, + "acc_norm": 0.22085889570552147, + "acc_norm_stderr": 0.03259177392742177 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.30357142857142855, + "acc_stderr": 0.04364226155841044, + "acc_norm": 0.30357142857142855, + "acc_norm_stderr": 0.04364226155841044 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.2524271844660194, + "acc_stderr": 0.04301250399690877, + "acc_norm": 0.2524271844660194, + "acc_norm_stderr": 0.04301250399690877 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.33760683760683763, + "acc_stderr": 0.03098029699261856, + "acc_norm": 0.33760683760683763, + "acc_norm_stderr": 0.03098029699261856 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.31, + "acc_stderr": 0.04648231987117316, + "acc_norm": 0.31, + "acc_norm_stderr": 0.04648231987117316 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.2413793103448276, + "acc_stderr": 0.015302380123542077, + "acc_norm": 0.2413793103448276, + "acc_norm_stderr": 0.015302380123542077 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.2832369942196532, + "acc_stderr": 0.02425790170532337, + "acc_norm": 0.2832369942196532, + "acc_norm_stderr": 0.02425790170532337 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.2346368715083799, + "acc_stderr": 0.014173044098303682, + "acc_norm": 0.2346368715083799, + "acc_norm_stderr": 0.014173044098303682 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.27450980392156865, + "acc_stderr": 0.025553169991826503, + "acc_norm": 0.27450980392156865, + "acc_norm_stderr": 0.025553169991826503 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.3054662379421222, + "acc_stderr": 0.026160584450140474, + "acc_norm": 0.3054662379421222, + "acc_norm_stderr": 0.026160584450140474 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.25925925925925924, + "acc_stderr": 0.02438366553103545, + "acc_norm": 0.25925925925925924, + "acc_norm_stderr": 0.02438366553103545 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.25177304964539005, + "acc_stderr": 0.0258921511567094, + "acc_norm": 0.25177304964539005, + "acc_norm_stderr": 0.0258921511567094 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.2737940026075619, + "acc_stderr": 0.011388612167979399, + "acc_norm": 0.2737940026075619, + "acc_norm_stderr": 0.011388612167979399 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 
0.19117647058823528, + "acc_stderr": 0.023886881922440345, + "acc_norm": 0.19117647058823528, + "acc_norm_stderr": 0.023886881922440345 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.2826797385620915, + "acc_stderr": 0.018217269552053435, + "acc_norm": 0.2826797385620915, + "acc_norm_stderr": 0.018217269552053435 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.3090909090909091, + "acc_stderr": 0.044262946482000985, + "acc_norm": 0.3090909090909091, + "acc_norm_stderr": 0.044262946482000985 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.2163265306122449, + "acc_stderr": 0.026358916334904038, + "acc_norm": 0.2163265306122449, + "acc_norm_stderr": 0.026358916334904038 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.3283582089552239, + "acc_stderr": 0.033206858897443244, + "acc_norm": 0.3283582089552239, + "acc_norm_stderr": 0.033206858897443244 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.34, + "acc_stderr": 0.04760952285695236, + "acc_norm": 0.34, + "acc_norm_stderr": 0.04760952285695236 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.3072289156626506, + "acc_stderr": 0.03591566797824663, + "acc_norm": 0.3072289156626506, + "acc_norm_stderr": 0.03591566797824663 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.3216374269005848, + "acc_stderr": 0.03582529442573122, + "acc_norm": 0.3216374269005848, + "acc_norm_stderr": 0.03582529442573122 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.28518971848225216, + "mc1_stderr": 0.015805827874454892, + "mc2": 0.46393246777342, + "mc2_stderr": 0.015782611429494386 + }, + "all": { + "acc": 0.2699933767983565, + "acc_stderr": 0.03228163034197912, + "acc_norm": 0.2716774847792902, + "acc_norm_stderr": 0.032287664388830656, + "mc1": 0.28518971848225216, + "mc1_stderr": 0.015805827874454892, + "mc2": 0.46393246777342, + "mc2_stderr": 0.015782611429494386 + } + }, + "versions": { + "harness|arc:challenge|25": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + 
"harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "all": 0 + }, + "config_general": { + "model_name": "LoupGarou/WizardCoder-Guanaco-15B-V1.0", + "model_sha": "ab5ea678d63eb2324658dcc8cfae267eabc366ef", + "model_dtype": "torch.float16", + "lighteval_sha": "03c2fad20ff7f5334c33cfee459024b8d7e4a109", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null + }, + "config_tasks": { + "harness|arc:challenge": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness 
task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task" + }, + "summary_tasks": { + "harness|arc:challenge|25": { + "hashes": { + "hash_examples": "17b0cae357c0259e", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "cf3b9ea33612d0f6", + "hash_cont_tokens": "dba03e8607e55660" + }, + "truncated": 0, + "non-truncated": 4687, + "padded": 4682, + "non-padded": 5, + "effective_few_shots": 25.0, + "num_truncated_few_shots": 0 + }, + "harness|hellaswag|10": { + "hashes": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "b8dce70b49a73f68", + "hash_cont_tokens": "aeeb90422dddad80" + }, + "truncated": 0, + "non-truncated": 40168, + "padded": 40091, + "non-padded": 77, + "effective_few_shots": 10.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hashes": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "4b2bc3b45f5e8d5a", + "hash_cont_tokens": "9fa5425796526efd" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-anatomy|5": { + "hashes": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "23440e62a7b2d5e5", + "hash_cont_tokens": "7e728d69ef1ac7c2" + }, + "truncated": 0, + "non-truncated": 540, + "padded": 540, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-astronomy|5": { + "hashes": 
{ + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "2c8582eea410db6d", + "hash_cont_tokens": "7f6d24030197b9e2" + }, + "truncated": 0, + "non-truncated": 608, + "padded": 608, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-business_ethics|5": { + "hashes": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "ab8203714dc10365", + "hash_cont_tokens": "3234d906bf828aeb" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hashes": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "dcdd9f537894fd7c", + "hash_cont_tokens": "a45a52e29ce33788" + }, + "truncated": 0, + "non-truncated": 1060, + "padded": 1060, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_biology|5": { + "hashes": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "4ab9828f8337c32a", + "hash_cont_tokens": "4701c1c4bc65eb13" + }, + "truncated": 0, + "non-truncated": 576, + "padded": 576, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_chemistry|5": { + "hashes": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "f20c5e5a97d442cf", + "hash_cont_tokens": "529a4b6152efd969" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_computer_science|5": { + "hashes": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "62e11834f72f2b55", + "hash_cont_tokens": "dfd59a537f965fd8" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_mathematics|5": { + "hashes": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "2c01899c41f52655", + "hash_cont_tokens": "3ad3bfeaaf9eecf0" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_medicine|5": { + "hashes": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "687d69e77693d01f", + "hash_cont_tokens": "9ecf3f9ea7be6f36" + }, + "truncated": 0, + "non-truncated": 692, + "padded": 692, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_physics|5": { + "hashes": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "bbee94c43d00994c", + "hash_cont_tokens": "8c85887d4e9340ba" + }, + "truncated": 0, + "non-truncated": 408, + "padded": 408, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-computer_security|5": { + "hashes": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "4a0e2507d76b04f8", + 
"hash_cont_tokens": "1649dc79ba4b8f6e" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hashes": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "fe4297e5ffc256e6", + "hash_cont_tokens": "7298714f69bd0499" + }, + "truncated": 0, + "non-truncated": 940, + "padded": 940, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-econometrics|5": { + "hashes": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "0b8aeb415ff5b1bf", + "hash_cont_tokens": "7742165a7d63434d" + }, + "truncated": 0, + "non-truncated": 456, + "padded": 456, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hashes": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "ae2b0b3b228d2bd7", + "hash_cont_tokens": "0c5900159c5a4fd3" + }, + "truncated": 0, + "non-truncated": 580, + "padded": 576, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hashes": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "204bbc9261b4c917", + "hash_cont_tokens": "85dba81265bfac4f" + }, + "truncated": 0, + "non-truncated": 1512, + "padded": 1512, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-formal_logic|5": { + "hashes": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "bb22a354e68640e5", + "hash_cont_tokens": "4fac8819ecb0c824" + }, + "truncated": 0, + "non-truncated": 504, + "padded": 504, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-global_facts|5": { + "hashes": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "acef09e091664c4b", + "hash_cont_tokens": "9fa5425796526efd" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_biology|5": { + "hashes": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "499e4eb6954d6714", + "hash_cont_tokens": "64e65812857526be" + }, + "truncated": 0, + "non-truncated": 1240, + "padded": 1240, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hashes": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "375b75d804deeb76", + "hash_cont_tokens": "840f9e33c065c1fc" + }, + "truncated": 0, + "non-truncated": 812, + "padded": 812, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hashes": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "05e46151c35bc285", + "hash_cont_tokens": "e87da720c8dfb2e6" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + 
"effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hashes": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "26bb245aa51490db", + "hash_cont_tokens": "4b0bc7b1752bc971" + }, + "truncated": 0, + "non-truncated": 660, + "padded": 656, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_geography|5": { + "hashes": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "b8d579e213ab9575", + "hash_cont_tokens": "6c2d8b7c6c68cbbc" + }, + "truncated": 0, + "non-truncated": 792, + "padded": 792, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hashes": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "168a05c631f917a7", + "hash_cont_tokens": "c9efd564121e5f64" + }, + "truncated": 0, + "non-truncated": 772, + "padded": 760, + "non-padded": 12, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hashes": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "7ca8fc7f8b137705", + "hash_cont_tokens": "87da7bbe91d71f4d" + }, + "truncated": 0, + "non-truncated": 1560, + "padded": 1560, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hashes": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "76004a62919b5475", + "hash_cont_tokens": "58c8fc9e4f01ea37" + }, + "truncated": 0, + "non-truncated": 1080, + "padded": 1080, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hashes": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "8a088c3c95b3232c", + "hash_cont_tokens": "bf01fea58a024ce9" + }, + "truncated": 0, + "non-truncated": 952, + "padded": 952, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_physics|5": { + "hashes": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "3da5c88422b8c8ad", + "hash_cont_tokens": "ec32025d30c749dc" + }, + "truncated": 0, + "non-truncated": 604, + "padded": 604, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hashes": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "0a717a90a22cae35", + "hash_cont_tokens": "8351234d979af350" + }, + "truncated": 0, + "non-truncated": 2180, + "padded": 2163, + "non-padded": 17, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hashes": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "03aff3d5599d02d4", + "hash_cont_tokens": "2e400437e3cc54c1" + }, + "truncated": 0, + "non-truncated": 864, + "padded": 864, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + 
"harness|hendrycksTest-high_school_us_history|5": { + "hashes": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "e2f8f15480dadba0", + "hash_cont_tokens": "387cbb636c5830db" + }, + "truncated": 0, + "non-truncated": 816, + "padded": 816, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hashes": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "492531338247cb56", + "hash_cont_tokens": "14ddc4fd6077ac5a" + }, + "truncated": 0, + "non-truncated": 948, + "padded": 948, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_aging|5": { + "hashes": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "06196a73e0b3420e", + "hash_cont_tokens": "818f4cc0988a9629" + }, + "truncated": 0, + "non-truncated": 892, + "padded": 892, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_sexuality|5": { + "hashes": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "b894c90f2c51f84d", + "hash_cont_tokens": "f0c8aa96ca0f46c5" + }, + "truncated": 0, + "non-truncated": 524, + "padded": 524, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-international_law|5": { + "hashes": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "51c05430b0627344", + "hash_cont_tokens": "765ce9c1c62775d6" + }, + "truncated": 0, + "non-truncated": 484, + "padded": 484, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-jurisprudence|5": { + "hashes": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "d29333c2738ac7c6", + "hash_cont_tokens": "c44e316003399c46" + }, + "truncated": 0, + "non-truncated": 432, + "padded": 432, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hashes": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "657052c8baa9fc85", + "hash_cont_tokens": "0de9fa48ae9a71a6" + }, + "truncated": 0, + "non-truncated": 652, + "padded": 652, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-machine_learning|5": { + "hashes": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "fc5326db9435de85", + "hash_cont_tokens": "c3a384869692ee19" + }, + "truncated": 0, + "non-truncated": 448, + "padded": 448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-management|5": { + "hashes": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "801adc9549ba3fe7", + "hash_cont_tokens": "fcaa29ac112cadd6" + }, + "truncated": 0, + "non-truncated": 412, + "padded": 412, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-marketing|5": { + "hashes": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": 
"aebea8ad4ffaaefb", + "hash_cont_tokens": "04f508fb6568b7ff" + }, + "truncated": 0, + "non-truncated": 936, + "padded": 936, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-medical_genetics|5": { + "hashes": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "780bbe312a283237", + "hash_cont_tokens": "9fa5425796526efd" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-miscellaneous|5": { + "hashes": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "848518bff6ee3db7", + "hash_cont_tokens": "5d41818685e2c793" + }, + "truncated": 0, + "non-truncated": 3132, + "padded": 3132, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_disputes|5": { + "hashes": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "aa30023bcdf5d58a", + "hash_cont_tokens": "42efb90e49c9e088" + }, + "truncated": 0, + "non-truncated": 1384, + "padded": 1384, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hashes": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "15b8b76535d7d82f", + "hash_cont_tokens": "c57656133741226a" + }, + "truncated": 0, + "non-truncated": 3580, + "padded": 3580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-nutrition|5": { + "hashes": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "a8ad40f91d76e471", + "hash_cont_tokens": "090fb54fec61071e" + }, + "truncated": 0, + "non-truncated": 1224, + "padded": 1224, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-philosophy|5": { + "hashes": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "71fa220f44351832", + "hash_cont_tokens": "59b76a89cd4887cf" + }, + "truncated": 0, + "non-truncated": 1244, + "padded": 1244, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-prehistory|5": { + "hashes": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "a526ab409be4a2d9", + "hash_cont_tokens": "d8cc4d411aa4e0e3" + }, + "truncated": 0, + "non-truncated": 1296, + "padded": 1296, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_accounting|5": { + "hashes": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "e4323a17f81e7316", + "hash_cont_tokens": "4f32dbf6bf9c9f8b" + }, + "truncated": 0, + "non-truncated": 1128, + "padded": 1125, + "non-padded": 3, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_law|5": { + "hashes": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "06bc9d84c69c192f", + "hash_cont_tokens": "5f160f615d97ebd7" + }, + "truncated": 0, + "non-truncated": 6136, + "padded": 6136, + "non-padded": 0, + 
"effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_medicine|5": { + "hashes": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "4aa4134a856a253b", + "hash_cont_tokens": "021921d6cce5ec05" + }, + "truncated": 0, + "non-truncated": 1088, + "padded": 1088, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_psychology|5": { + "hashes": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "39df5733563dc07a", + "hash_cont_tokens": "9bc4fc87f3f2befa" + }, + "truncated": 0, + "non-truncated": 2448, + "padded": 2448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-public_relations|5": { + "hashes": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "c34769e3c38bdede", + "hash_cont_tokens": "477d51b5c5ce3cce" + }, + "truncated": 0, + "non-truncated": 440, + "padded": 440, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-security_studies|5": { + "hashes": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "c25c9c1dd04e971f", + "hash_cont_tokens": "15a4b8e5410af0b6" + }, + "truncated": 0, + "non-truncated": 980, + "padded": 980, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-sociology|5": { + "hashes": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "027e8aac00f3aa38", + "hash_cont_tokens": "0d5aa3fb85683feb" + }, + "truncated": 0, + "non-truncated": 804, + "padded": 788, + "non-padded": 16, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hashes": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "ba778c17daa0fcb9", + "hash_cont_tokens": "9fa5425796526efd" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-virology|5": { + "hashes": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "8cdca6c47be4526a", + "hash_cont_tokens": "d57828ff29ba6543" + }, + "truncated": 0, + "non-truncated": 664, + "padded": 664, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-world_religions|5": { + "hashes": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "8f888321da976097", + "hash_cont_tokens": "3a4ad2460809839b" + }, + "truncated": 0, + "non-truncated": 684, + "padded": 684, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|truthfulqa:mc|0": { + "hashes": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "7c47a57b24ee9f01", + "hash_cont_tokens": "79eed6ee3736b1b3" + }, + "truncated": 0, + "non-truncated": 9996, + "padded": 9996, + "non-padded": 0, + "effective_few_shots": 0.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "d84d18e9a963753d", + "hash_full_prompts": 
"12b540783521a8e6", + "hash_input_tokens": "5611b71cb9b5b279", + "hash_cont_tokens": "10db92063c7fe655" + }, + "total_evaluation_time_secondes": "6598.046590805054", + "truncated": 0, + "non-truncated": 111019, + "padded": 110881, + "non-padded": 138, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/LoupGarou/WizardCoder-Guanaco-15B-V1.0/results_2023-09-23T04-03-10.692358.json b/eval-results/LoupGarou/WizardCoder-Guanaco-15B-V1.0/results_2023-09-23T04-03-10.692358.json new file mode 100644 index 0000000000000000000000000000000000000000..e18472906ecc81c992490a8f8be7a8903beac5a9 --- /dev/null +++ b/eval-results/LoupGarou/WizardCoder-Guanaco-15B-V1.0/results_2023-09-23T04-03-10.692358.json @@ -0,0 +1,107 @@ +{ + "config_general": { + "model_name": "LoupGarou/WizardCoder-Guanaco-15B-V1.0", + "model_sha": "ab5ea678d63eb2324658dcc8cfae267eabc366ef", + "model_size": "28.91 GB", + "model_dtype": "torch.float16", + "lighteval_sha": "0f318ecf002208468154899217b3ba7c6ae09374", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|drop|3": { + "em": 0.04089765100671141, + "em_stderr": 0.0020282491887764946, + "f1": 0.08708682885906038, + "f1_stderr": 0.002301893268858503 + }, + "harness|gsm8k|5": { + "acc": 0.014404852160727824, + "acc_stderr": 0.0032820559171369513 + }, + "harness|winogrande|5": { + "acc": 0.5311760063141279, + "acc_stderr": 0.014025142640639511 + }, + "all": { + "em": 0.04089765100671141, + "em_stderr": 0.0020282491887764946, + "f1": 0.08708682885906038, + "f1_stderr": 0.002301893268858503, + "acc": 0.27279042923742786, + "acc_stderr": 0.008653599278888232 + } + }, + "versions": { + "harness|drop|3": 1, + "harness|gsm8k|5": 0, + "harness|winogrande|5": 0, + "all": 0 + }, + "config_tasks": { + "harness|drop": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|drop|3": { + "hashes": { + "hash_examples": "1d27416e8324e9a3", + "hash_full_prompts": "a5513ff9a741b385", + "hash_input_tokens": "cf16f9000278e518", + "hash_cont_tokens": "25bd0fd8c7d95c59" + }, + "truncated": 0, + "non-truncated": 9536, + "padded": 0, + "non-padded": 9536, + "effective_few_shots": 3.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "4aa68a375f093a16", + "hash_cont_tokens": "501e2c659913aaaa" + }, + "truncated": 0, + "non-truncated": 1319, + "padded": 0, + "non-padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "c3465b5dfa001847", + "hash_cont_tokens": "b1180cd01713a559" + }, + "truncated": 0, + "non-truncated": 2534, + "padded": 2451, + "non-padded": 83, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "9b4d8993161e637d", + "hash_full_prompts": "08215e527b7e60a5", + "hash_input_tokens": "c5e37076f5feb8db", + "hash_cont_tokens": "28496e50db59672b" + }, + "total_evaluation_time_secondes": "7677.245581150055", + "truncated": 0, + "non-truncated": 13389, + "padded": 2451, + "non-padded": 10938, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git 
a/eval-results/LoupGarou/WizardCoder-Guanaco-15B-V1.1/results_2023-07-19T21-04-47.997241.json b/eval-results/LoupGarou/WizardCoder-Guanaco-15B-V1.1/results_2023-07-19T21-04-47.997241.json new file mode 100644 index 0000000000000000000000000000000000000000..514b04c43c3610b0fcd0167135d71cae0418e3c8 --- /dev/null +++ b/eval-results/LoupGarou/WizardCoder-Guanaco-15B-V1.1/results_2023-07-19T21-04-47.997241.json @@ -0,0 +1,871 @@ +{ + "results": { + "harness|arc:challenge|25": { + "acc": 0.2883959044368601, + "acc_stderr": 0.013238394422428175, + "acc_norm": 0.32593856655290104, + "acc_norm_stderr": 0.013697432466693252 + }, + "harness|hellaswag|10": { + "acc": 0.37203744274048994, + "acc_stderr": 0.0048236047750159, + "acc_norm": 0.4541923919537941, + "acc_norm_stderr": 0.004968796800410414 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.33, + "acc_stderr": 0.04725815626252604, + "acc_norm": 0.33, + "acc_norm_stderr": 0.04725815626252604 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.24444444444444444, + "acc_stderr": 0.03712537833614866, + "acc_norm": 0.24444444444444444, + "acc_norm_stderr": 0.03712537833614866 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.17763157894736842, + "acc_stderr": 0.031103182383123398, + "acc_norm": 0.17763157894736842, + "acc_norm_stderr": 0.031103182383123398 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.3, + "acc_stderr": 0.046056618647183814, + "acc_norm": 0.3, + "acc_norm_stderr": 0.046056618647183814 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.2339622641509434, + "acc_stderr": 0.026055296901152915, + "acc_norm": 0.2339622641509434, + "acc_norm_stderr": 0.026055296901152915 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.25, + "acc_stderr": 0.03621034121889507, + "acc_norm": 0.25, + "acc_norm_stderr": 0.03621034121889507 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.21, + "acc_stderr": 0.040936018074033256, + "acc_norm": 0.21, + "acc_norm_stderr": 0.040936018074033256 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.3, + "acc_stderr": 0.046056618647183814, + "acc_norm": 0.3, + "acc_norm_stderr": 0.046056618647183814 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.26, + "acc_stderr": 0.04408440022768079, + "acc_norm": 0.26, + "acc_norm_stderr": 0.04408440022768079 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.20809248554913296, + "acc_stderr": 0.030952890217749884, + "acc_norm": 0.20809248554913296, + "acc_norm_stderr": 0.030952890217749884 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.19607843137254902, + "acc_stderr": 0.03950581861179962, + "acc_norm": 0.19607843137254902, + "acc_norm_stderr": 0.03950581861179962 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.39, + "acc_stderr": 0.04902071300001975, + "acc_norm": 0.39, + "acc_norm_stderr": 0.04902071300001975 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.3021276595744681, + "acc_stderr": 0.030017554471880557, + "acc_norm": 0.3021276595744681, + "acc_norm_stderr": 0.030017554471880557 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.24561403508771928, + "acc_stderr": 0.040493392977481404, + "acc_norm": 0.24561403508771928, + "acc_norm_stderr": 0.040493392977481404 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.3448275862068966, + "acc_stderr": 0.03960933549451208, + "acc_norm": 0.3448275862068966, + "acc_norm_stderr": 0.03960933549451208 + }, + 
"harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.2275132275132275, + "acc_stderr": 0.021591269407823785, + "acc_norm": 0.2275132275132275, + "acc_norm_stderr": 0.021591269407823785 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.29365079365079366, + "acc_stderr": 0.04073524322147126, + "acc_norm": 0.29365079365079366, + "acc_norm_stderr": 0.04073524322147126 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.23, + "acc_stderr": 0.042295258468165065, + "acc_norm": 0.23, + "acc_norm_stderr": 0.042295258468165065 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.2161290322580645, + "acc_stderr": 0.02341529343356853, + "acc_norm": 0.2161290322580645, + "acc_norm_stderr": 0.02341529343356853 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.1724137931034483, + "acc_stderr": 0.02657767218303658, + "acc_norm": 0.1724137931034483, + "acc_norm_stderr": 0.02657767218303658 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.32, + "acc_stderr": 0.046882617226215034, + "acc_norm": 0.32, + "acc_norm_stderr": 0.046882617226215034 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.21212121212121213, + "acc_stderr": 0.03192271569548299, + "acc_norm": 0.21212121212121213, + "acc_norm_stderr": 0.03192271569548299 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.20202020202020202, + "acc_stderr": 0.028606204289229872, + "acc_norm": 0.20202020202020202, + "acc_norm_stderr": 0.028606204289229872 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.20725388601036268, + "acc_stderr": 0.029252823291803613, + "acc_norm": 0.20725388601036268, + "acc_norm_stderr": 0.029252823291803613 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.2153846153846154, + "acc_stderr": 0.020843034557462878, + "acc_norm": 0.2153846153846154, + "acc_norm_stderr": 0.020843034557462878 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.2518518518518518, + "acc_stderr": 0.026466117538959916, + "acc_norm": 0.2518518518518518, + "acc_norm_stderr": 0.026466117538959916 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.24369747899159663, + "acc_stderr": 0.027886828078380548, + "acc_norm": 0.24369747899159663, + "acc_norm_stderr": 0.027886828078380548 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.24503311258278146, + "acc_stderr": 0.03511807571804725, + "acc_norm": 0.24503311258278146, + "acc_norm_stderr": 0.03511807571804725 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.2036697247706422, + "acc_stderr": 0.017266742087630797, + "acc_norm": 0.2036697247706422, + "acc_norm_stderr": 0.017266742087630797 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.17592592592592593, + "acc_stderr": 0.025967420958258533, + "acc_norm": 0.17592592592592593, + "acc_norm_stderr": 0.025967420958258533 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.24509803921568626, + "acc_stderr": 0.03019028245350195, + "acc_norm": 0.24509803921568626, + "acc_norm_stderr": 0.03019028245350195 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.28270042194092826, + "acc_stderr": 0.02931281415395592, + "acc_norm": 0.28270042194092826, + "acc_norm_stderr": 0.02931281415395592 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.336322869955157, + "acc_stderr": 0.031708824268455005, + "acc_norm": 0.336322869955157, + "acc_norm_stderr": 
0.031708824268455005 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.2595419847328244, + "acc_stderr": 0.03844876139785271, + "acc_norm": 0.2595419847328244, + "acc_norm_stderr": 0.03844876139785271 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.2892561983471074, + "acc_stderr": 0.04139112727635464, + "acc_norm": 0.2892561983471074, + "acc_norm_stderr": 0.04139112727635464 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.24074074074074073, + "acc_stderr": 0.0413311944024384, + "acc_norm": 0.24074074074074073, + "acc_norm_stderr": 0.0413311944024384 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.22085889570552147, + "acc_stderr": 0.03259177392742177, + "acc_norm": 0.22085889570552147, + "acc_norm_stderr": 0.03259177392742177 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.33035714285714285, + "acc_stderr": 0.04464285714285713, + "acc_norm": 0.33035714285714285, + "acc_norm_stderr": 0.04464285714285713 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.22330097087378642, + "acc_stderr": 0.04123553189891431, + "acc_norm": 0.22330097087378642, + "acc_norm_stderr": 0.04123553189891431 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.3418803418803419, + "acc_stderr": 0.03107502852650776, + "acc_norm": 0.3418803418803419, + "acc_norm_stderr": 0.03107502852650776 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.36, + "acc_stderr": 0.048241815132442176, + "acc_norm": 0.36, + "acc_norm_stderr": 0.048241815132442176 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.2988505747126437, + "acc_stderr": 0.01636925681509313, + "acc_norm": 0.2988505747126437, + "acc_norm_stderr": 0.01636925681509313 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.2630057803468208, + "acc_stderr": 0.023703099525258172, + "acc_norm": 0.2630057803468208, + "acc_norm_stderr": 0.023703099525258172 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.25139664804469275, + "acc_stderr": 0.014508979453553983, + "acc_norm": 0.25139664804469275, + "acc_norm_stderr": 0.014508979453553983 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.24836601307189543, + "acc_stderr": 0.024739981355113596, + "acc_norm": 0.24836601307189543, + "acc_norm_stderr": 0.024739981355113596 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.24758842443729903, + "acc_stderr": 0.024513879973621967, + "acc_norm": 0.24758842443729903, + "acc_norm_stderr": 0.024513879973621967 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.24074074074074073, + "acc_stderr": 0.023788583551658533, + "acc_norm": 0.24074074074074073, + "acc_norm_stderr": 0.023788583551658533 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.2730496453900709, + "acc_stderr": 0.026577860943307854, + "acc_norm": 0.2730496453900709, + "acc_norm_stderr": 0.026577860943307854 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.2516297262059974, + "acc_stderr": 0.011083276280441917, + "acc_norm": 0.2516297262059974, + "acc_norm_stderr": 0.011083276280441917 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.20220588235294118, + "acc_stderr": 0.024398192986654924, + "acc_norm": 0.20220588235294118, + "acc_norm_stderr": 0.024398192986654924 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.272875816993464, + "acc_stderr": 0.018020474148393577, + "acc_norm": 0.272875816993464, + "acc_norm_stderr": 0.018020474148393577 + }, + "harness|hendrycksTest-public_relations|5": { + 
"acc": 0.3, + "acc_stderr": 0.04389311454644286, + "acc_norm": 0.3, + "acc_norm_stderr": 0.04389311454644286 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.18775510204081633, + "acc_stderr": 0.025000256039546212, + "acc_norm": 0.18775510204081633, + "acc_norm_stderr": 0.025000256039546212 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.26865671641791045, + "acc_stderr": 0.031343283582089536, + "acc_norm": 0.26865671641791045, + "acc_norm_stderr": 0.031343283582089536 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.29, + "acc_stderr": 0.045604802157206845, + "acc_norm": 0.29, + "acc_norm_stderr": 0.045604802157206845 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.30120481927710846, + "acc_stderr": 0.03571609230053481, + "acc_norm": 0.30120481927710846, + "acc_norm_stderr": 0.03571609230053481 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.3157894736842105, + "acc_stderr": 0.035650796707083106, + "acc_norm": 0.3157894736842105, + "acc_norm_stderr": 0.035650796707083106 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.26438188494492043, + "mc1_stderr": 0.015438211119522505, + "mc2": 0.42331934318918635, + "mc2_stderr": 0.015523372351238341 + }, + "all": { + "acc": 0.26123812980782113, + "acc_stderr": 0.031906050369000925, + "acc_norm": 0.26326690288119986, + "acc_norm_stderr": 0.03191629155662227, + "mc1": 0.26438188494492043, + "mc1_stderr": 0.015438211119522505, + "mc2": 0.42331934318918635, + "mc2_stderr": 0.015523372351238341 + } + }, + "versions": { + "harness|arc:challenge|25": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + 
"harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "all": 0 + }, + "config": { + "model_name": "LoupGarou/WizardCoder-Guanaco-15B-V1.1", + "model_sha": "979531c84ec0b4e1712d6a5cec6907126a21e605", + "model_dtype": "torch.float16", + "lighteval_sha": "43cff840721bd0214adb4e29236a5e2ca1813937", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null + }, + "task_config": { + "harness|arc:challenge": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM 
Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task" + }, + "hashes": { + "harness|arc:challenge|25": { + "hash_examples": "fb8c51b1872daeda", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "cf3b9ea33612d0f6", + "hash_cont_tokens": "dba03e8607e55660" + }, + "harness|hellaswag|10": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "b8dce70b49a73f68", + "hash_cont_tokens": "aeeb90422dddad80" + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "4b2bc3b45f5e8d5a", + "hash_cont_tokens": "9fa5425796526efd" + }, + "harness|hendrycksTest-anatomy|5": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "23440e62a7b2d5e5", + "hash_cont_tokens": "7e728d69ef1ac7c2" + }, + "harness|hendrycksTest-astronomy|5": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "2c8582eea410db6d", + "hash_cont_tokens": "7f6d24030197b9e2" + }, + "harness|hendrycksTest-business_ethics|5": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "ab8203714dc10365", + "hash_cont_tokens": "3234d906bf828aeb" + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "dcdd9f537894fd7c", + "hash_cont_tokens": "a45a52e29ce33788" + }, + "harness|hendrycksTest-college_biology|5": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "4ab9828f8337c32a", + "hash_cont_tokens": "4701c1c4bc65eb13" + }, + "harness|hendrycksTest-college_chemistry|5": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "f20c5e5a97d442cf", + "hash_cont_tokens": "529a4b6152efd969" + }, + 
"harness|hendrycksTest-college_computer_science|5": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "62e11834f72f2b55", + "hash_cont_tokens": "dfd59a537f965fd8" + }, + "harness|hendrycksTest-college_mathematics|5": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "2c01899c41f52655", + "hash_cont_tokens": "3ad3bfeaaf9eecf0" + }, + "harness|hendrycksTest-college_medicine|5": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "687d69e77693d01f", + "hash_cont_tokens": "9ecf3f9ea7be6f36" + }, + "harness|hendrycksTest-college_physics|5": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "bbee94c43d00994c", + "hash_cont_tokens": "8c85887d4e9340ba" + }, + "harness|hendrycksTest-computer_security|5": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "4a0e2507d76b04f8", + "hash_cont_tokens": "1649dc79ba4b8f6e" + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "fe4297e5ffc256e6", + "hash_cont_tokens": "7298714f69bd0499" + }, + "harness|hendrycksTest-econometrics|5": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "0b8aeb415ff5b1bf", + "hash_cont_tokens": "7742165a7d63434d" + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "ae2b0b3b228d2bd7", + "hash_cont_tokens": "0c5900159c5a4fd3" + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "204bbc9261b4c917", + "hash_cont_tokens": "85dba81265bfac4f" + }, + "harness|hendrycksTest-formal_logic|5": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "bb22a354e68640e5", + "hash_cont_tokens": "4fac8819ecb0c824" + }, + "harness|hendrycksTest-global_facts|5": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "acef09e091664c4b", + "hash_cont_tokens": "9fa5425796526efd" + }, + "harness|hendrycksTest-high_school_biology|5": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "499e4eb6954d6714", + "hash_cont_tokens": "64e65812857526be" + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "375b75d804deeb76", + "hash_cont_tokens": "840f9e33c065c1fc" + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "05e46151c35bc285", + "hash_cont_tokens": "e87da720c8dfb2e6" + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "26bb245aa51490db", + "hash_cont_tokens": "4b0bc7b1752bc971" + }, + "harness|hendrycksTest-high_school_geography|5": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "b8d579e213ab9575", + "hash_cont_tokens": "6c2d8b7c6c68cbbc" + }, + 
"harness|hendrycksTest-high_school_government_and_politics|5": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "168a05c631f917a7", + "hash_cont_tokens": "c9efd564121e5f64" + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "7ca8fc7f8b137705", + "hash_cont_tokens": "87da7bbe91d71f4d" + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "76004a62919b5475", + "hash_cont_tokens": "58c8fc9e4f01ea37" + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "8a088c3c95b3232c", + "hash_cont_tokens": "bf01fea58a024ce9" + }, + "harness|hendrycksTest-high_school_physics|5": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "3da5c88422b8c8ad", + "hash_cont_tokens": "ec32025d30c749dc" + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "0a717a90a22cae35", + "hash_cont_tokens": "8351234d979af350" + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "03aff3d5599d02d4", + "hash_cont_tokens": "2e400437e3cc54c1" + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "e2f8f15480dadba0", + "hash_cont_tokens": "387cbb636c5830db" + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "492531338247cb56", + "hash_cont_tokens": "14ddc4fd6077ac5a" + }, + "harness|hendrycksTest-human_aging|5": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "06196a73e0b3420e", + "hash_cont_tokens": "818f4cc0988a9629" + }, + "harness|hendrycksTest-human_sexuality|5": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "b894c90f2c51f84d", + "hash_cont_tokens": "f0c8aa96ca0f46c5" + }, + "harness|hendrycksTest-international_law|5": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "51c05430b0627344", + "hash_cont_tokens": "765ce9c1c62775d6" + }, + "harness|hendrycksTest-jurisprudence|5": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "d29333c2738ac7c6", + "hash_cont_tokens": "c44e316003399c46" + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "657052c8baa9fc85", + "hash_cont_tokens": "0de9fa48ae9a71a6" + }, + "harness|hendrycksTest-machine_learning|5": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "fc5326db9435de85", + "hash_cont_tokens": "c3a384869692ee19" + }, + "harness|hendrycksTest-management|5": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "801adc9549ba3fe7", + "hash_cont_tokens": 
"fcaa29ac112cadd6" + }, + "harness|hendrycksTest-marketing|5": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "aebea8ad4ffaaefb", + "hash_cont_tokens": "04f508fb6568b7ff" + }, + "harness|hendrycksTest-medical_genetics|5": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "780bbe312a283237", + "hash_cont_tokens": "9fa5425796526efd" + }, + "harness|hendrycksTest-miscellaneous|5": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "848518bff6ee3db7", + "hash_cont_tokens": "5d41818685e2c793" + }, + "harness|hendrycksTest-moral_disputes|5": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "aa30023bcdf5d58a", + "hash_cont_tokens": "42efb90e49c9e088" + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "15b8b76535d7d82f", + "hash_cont_tokens": "c57656133741226a" + }, + "harness|hendrycksTest-nutrition|5": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "a8ad40f91d76e471", + "hash_cont_tokens": "090fb54fec61071e" + }, + "harness|hendrycksTest-philosophy|5": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "71fa220f44351832", + "hash_cont_tokens": "59b76a89cd4887cf" + }, + "harness|hendrycksTest-prehistory|5": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "a526ab409be4a2d9", + "hash_cont_tokens": "d8cc4d411aa4e0e3" + }, + "harness|hendrycksTest-professional_accounting|5": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "e4323a17f81e7316", + "hash_cont_tokens": "4f32dbf6bf9c9f8b" + }, + "harness|hendrycksTest-professional_law|5": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "06bc9d84c69c192f", + "hash_cont_tokens": "5f160f615d97ebd7" + }, + "harness|hendrycksTest-professional_medicine|5": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "4aa4134a856a253b", + "hash_cont_tokens": "021921d6cce5ec05" + }, + "harness|hendrycksTest-professional_psychology|5": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "39df5733563dc07a", + "hash_cont_tokens": "9bc4fc87f3f2befa" + }, + "harness|hendrycksTest-public_relations|5": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "c34769e3c38bdede", + "hash_cont_tokens": "477d51b5c5ce3cce" + }, + "harness|hendrycksTest-security_studies|5": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "c25c9c1dd04e971f", + "hash_cont_tokens": "15a4b8e5410af0b6" + }, + "harness|hendrycksTest-sociology|5": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "027e8aac00f3aa38", + "hash_cont_tokens": "0d5aa3fb85683feb" + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "ba778c17daa0fcb9", + "hash_cont_tokens": "9fa5425796526efd" + }, + "harness|hendrycksTest-virology|5": { + 
"hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "8cdca6c47be4526a", + "hash_cont_tokens": "d57828ff29ba6543" + }, + "harness|hendrycksTest-world_religions|5": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "8f888321da976097", + "hash_cont_tokens": "3a4ad2460809839b" + }, + "harness|truthfulqa:mc|0": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "7c47a57b24ee9f01", + "hash_cont_tokens": "79eed6ee3736b1b3" + } + } +} \ No newline at end of file diff --git a/eval-results/LoupGarou/WizardCoder-Guanaco-15B-V1.1/results_2023-09-22T15-52-30.106380.json b/eval-results/LoupGarou/WizardCoder-Guanaco-15B-V1.1/results_2023-09-22T15-52-30.106380.json new file mode 100644 index 0000000000000000000000000000000000000000..d6b3c6dbb8b3d2adc97769ac4849c13c92ce8eca --- /dev/null +++ b/eval-results/LoupGarou/WizardCoder-Guanaco-15B-V1.1/results_2023-09-22T15-52-30.106380.json @@ -0,0 +1,107 @@ +{ + "config_general": { + "model_name": "LoupGarou/WizardCoder-Guanaco-15B-V1.1", + "model_sha": "979531c84ec0b4e1712d6a5cec6907126a21e605", + "model_size": "28.91 GB", + "model_dtype": "torch.float16", + "lighteval_sha": "0f318ecf002208468154899217b3ba7c6ae09374", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|drop|3": { + "em": 0.10580956375838926, + "em_stderr": 0.003150047651575815, + "f1": 0.16983640939597303, + "f1_stderr": 0.0033726302998826852 + }, + "harness|gsm8k|5": { + "acc": 0.02880970432145565, + "acc_stderr": 0.004607484283767452 + }, + "harness|winogrande|5": { + "acc": 0.5603788476716653, + "acc_stderr": 0.013949649776015703 + }, + "all": { + "em": 0.10580956375838926, + "em_stderr": 0.003150047651575815, + "f1": 0.16983640939597303, + "f1_stderr": 0.0033726302998826852, + "acc": 0.2945942759965605, + "acc_stderr": 0.009278567029891577 + } + }, + "versions": { + "harness|drop|3": 1, + "harness|gsm8k|5": 0, + "harness|winogrande|5": 0, + "all": 0 + }, + "config_tasks": { + "harness|drop": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|drop|3": { + "hashes": { + "hash_examples": "1d27416e8324e9a3", + "hash_full_prompts": "a5513ff9a741b385", + "hash_input_tokens": "cf16f9000278e518", + "hash_cont_tokens": "b66d196df714667c" + }, + "truncated": 0, + "non-truncated": 9536, + "padded": 0, + "non-padded": 9536, + "effective_few_shots": 3.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "4aa68a375f093a16", + "hash_cont_tokens": "eec5606bbf442d02" + }, + "truncated": 0, + "non-truncated": 1319, + "padded": 0, + "non-padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "c3465b5dfa001847", + "hash_cont_tokens": "b1180cd01713a559" + }, + "truncated": 0, + "non-truncated": 2534, + "padded": 2451, + "non-padded": 83, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "9b4d8993161e637d", + "hash_full_prompts": "08215e527b7e60a5", + "hash_input_tokens": "c5e37076f5feb8db", + 
"hash_cont_tokens": "660ceb4764adf874" + }, + "total_evaluation_time_secondes": "7089.204674005508", + "truncated": 0, + "non-truncated": 13389, + "padded": 2451, + "non-padded": 10938, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/Minirecord/Mini_DPO_test02/results_2023-12-04T11-56-14.353700.json b/eval-results/Minirecord/Mini_DPO_test02/results_2023-12-04T11-56-14.353700.json new file mode 100644 index 0000000000000000000000000000000000000000..6ec784518b9fe6d95a472b9dd42a9d4183cbbcd3 --- /dev/null +++ b/eval-results/Minirecord/Mini_DPO_test02/results_2023-12-04T11-56-14.353700.json @@ -0,0 +1,1409 @@ +{ + "config_general": { + "lighteval_sha": "0e4607eff593f6f842aeaa0e5fa6760f58b9d1e9", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "", + "start_time": 136673.371738193, + "end_time": 145844.337919076, + "total_evaluation_time_secondes": "9170.966180882999", + "model_name": "Minirecord/Mini_DPO_test02", + "model_sha": "cd417467644c4178100083e342bad88a3f968be6", + "model_dtype": "torch.float16", + "model_size": "13.99 GB" + }, + "results": { + "harness|arc:challenge|25": { + "acc": 0.5699658703071673, + "acc_stderr": 0.014467631559137994, + "acc_norm": 0.5972696245733788, + "acc_norm_stderr": 0.01433223630679014 + }, + "harness|hellaswag|10": { + "acc": 0.6425014937263493, + "acc_stderr": 0.00478283835222253, + "acc_norm": 0.8388767177853017, + "acc_norm_stderr": 0.0036689326296725643 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.33, + "acc_stderr": 0.04725815626252605, + "acc_norm": 0.33, + "acc_norm_stderr": 0.04725815626252605 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.6074074074074074, + "acc_stderr": 0.0421850621536888, + "acc_norm": 0.6074074074074074, + "acc_norm_stderr": 0.0421850621536888 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.6513157894736842, + "acc_stderr": 0.0387813988879761, + "acc_norm": 0.6513157894736842, + "acc_norm_stderr": 0.0387813988879761 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.57, + "acc_stderr": 0.049756985195624284, + "acc_norm": 0.57, + "acc_norm_stderr": 0.049756985195624284 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.6830188679245283, + "acc_stderr": 0.028637235639800886, + "acc_norm": 0.6830188679245283, + "acc_norm_stderr": 0.028637235639800886 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.6875, + "acc_stderr": 0.038760854559127644, + "acc_norm": 0.6875, + "acc_norm_stderr": 0.038760854559127644 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.42, + "acc_stderr": 0.049604496374885836, + "acc_norm": 0.42, + "acc_norm_stderr": 0.049604496374885836 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.54, + "acc_stderr": 0.05009082659620333, + "acc_norm": 0.54, + "acc_norm_stderr": 0.05009082659620333 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.35, + "acc_stderr": 0.047937248544110196, + "acc_norm": 0.35, + "acc_norm_stderr": 0.047937248544110196 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.6127167630057804, + "acc_stderr": 0.03714325906302065, + "acc_norm": 0.6127167630057804, + "acc_norm_stderr": 0.03714325906302065 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.3235294117647059, + "acc_stderr": 0.04655010411319616, + "acc_norm": 0.3235294117647059, + "acc_norm_stderr": 0.04655010411319616 + }, + "harness|hendrycksTest-computer_security|5": { + 
"acc": 0.79, + "acc_stderr": 0.04093601807403326, + "acc_norm": 0.79, + "acc_norm_stderr": 0.04093601807403326 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.5574468085106383, + "acc_stderr": 0.032469569197899575, + "acc_norm": 0.5574468085106383, + "acc_norm_stderr": 0.032469569197899575 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.5087719298245614, + "acc_stderr": 0.04702880432049615, + "acc_norm": 0.5087719298245614, + "acc_norm_stderr": 0.04702880432049615 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.5586206896551724, + "acc_stderr": 0.04137931034482757, + "acc_norm": 0.5586206896551724, + "acc_norm_stderr": 0.04137931034482757 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.3835978835978836, + "acc_stderr": 0.0250437573185202, + "acc_norm": 0.3835978835978836, + "acc_norm_stderr": 0.0250437573185202 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.373015873015873, + "acc_stderr": 0.04325506042017086, + "acc_norm": 0.373015873015873, + "acc_norm_stderr": 0.04325506042017086 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.45, + "acc_stderr": 0.049999999999999996, + "acc_norm": 0.45, + "acc_norm_stderr": 0.049999999999999996 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.7129032258064516, + "acc_stderr": 0.025736542745594528, + "acc_norm": 0.7129032258064516, + "acc_norm_stderr": 0.025736542745594528 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.4876847290640394, + "acc_stderr": 0.035169204442208966, + "acc_norm": 0.4876847290640394, + "acc_norm_stderr": 0.035169204442208966 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.72, + "acc_stderr": 0.04512608598542127, + "acc_norm": 0.72, + "acc_norm_stderr": 0.04512608598542127 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.7333333333333333, + "acc_stderr": 0.03453131801885417, + "acc_norm": 0.7333333333333333, + "acc_norm_stderr": 0.03453131801885417 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.7777777777777778, + "acc_stderr": 0.029620227874790482, + "acc_norm": 0.7777777777777778, + "acc_norm_stderr": 0.029620227874790482 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.8704663212435233, + "acc_stderr": 0.02423353229775873, + "acc_norm": 0.8704663212435233, + "acc_norm_stderr": 0.02423353229775873 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.6230769230769231, + "acc_stderr": 0.024570975364225995, + "acc_norm": 0.6230769230769231, + "acc_norm_stderr": 0.024570975364225995 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.3148148148148148, + "acc_stderr": 0.028317533496066468, + "acc_norm": 0.3148148148148148, + "acc_norm_stderr": 0.028317533496066468 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.6218487394957983, + "acc_stderr": 0.03149930577784906, + "acc_norm": 0.6218487394957983, + "acc_norm_stderr": 0.03149930577784906 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.3443708609271523, + "acc_stderr": 0.038796870240733264, + "acc_norm": 0.3443708609271523, + "acc_norm_stderr": 0.038796870240733264 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.8055045871559633, + "acc_stderr": 0.01697028909045804, + "acc_norm": 0.8055045871559633, + "acc_norm_stderr": 0.01697028909045804 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.44907407407407407, + 
"acc_stderr": 0.03392238405321617, + "acc_norm": 0.44907407407407407, + "acc_norm_stderr": 0.03392238405321617 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.7892156862745098, + "acc_stderr": 0.028626547912437406, + "acc_norm": 0.7892156862745098, + "acc_norm_stderr": 0.028626547912437406 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.759493670886076, + "acc_stderr": 0.02782078198114968, + "acc_norm": 0.759493670886076, + "acc_norm_stderr": 0.02782078198114968 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.6771300448430493, + "acc_stderr": 0.03138147637575499, + "acc_norm": 0.6771300448430493, + "acc_norm_stderr": 0.03138147637575499 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.7404580152671756, + "acc_stderr": 0.038448761397852714, + "acc_norm": 0.7404580152671756, + "acc_norm_stderr": 0.038448761397852714 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.7768595041322314, + "acc_stderr": 0.03800754475228732, + "acc_norm": 0.7768595041322314, + "acc_norm_stderr": 0.03800754475228732 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.7592592592592593, + "acc_stderr": 0.04133119440243838, + "acc_norm": 0.7592592592592593, + "acc_norm_stderr": 0.04133119440243838 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.7668711656441718, + "acc_stderr": 0.0332201579577674, + "acc_norm": 0.7668711656441718, + "acc_norm_stderr": 0.0332201579577674 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.5089285714285714, + "acc_stderr": 0.04745033255489122, + "acc_norm": 0.5089285714285714, + "acc_norm_stderr": 0.04745033255489122 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.7669902912621359, + "acc_stderr": 0.04185832598928315, + "acc_norm": 0.7669902912621359, + "acc_norm_stderr": 0.04185832598928315 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.8504273504273504, + "acc_stderr": 0.023365051491753715, + "acc_norm": 0.8504273504273504, + "acc_norm_stderr": 0.023365051491753715 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.67, + "acc_stderr": 0.047258156262526094, + "acc_norm": 0.67, + "acc_norm_stderr": 0.047258156262526094 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.8084291187739464, + "acc_stderr": 0.014072859310451949, + "acc_norm": 0.8084291187739464, + "acc_norm_stderr": 0.014072859310451949 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.6763005780346821, + "acc_stderr": 0.025190181327608405, + "acc_norm": 0.6763005780346821, + "acc_norm_stderr": 0.025190181327608405 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.2737430167597765, + "acc_stderr": 0.014912413096372434, + "acc_norm": 0.2737430167597765, + "acc_norm_stderr": 0.014912413096372434 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.6895424836601307, + "acc_stderr": 0.026493033225145898, + "acc_norm": 0.6895424836601307, + "acc_norm_stderr": 0.026493033225145898 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.6945337620578779, + "acc_stderr": 0.026160584450140446, + "acc_norm": 0.6945337620578779, + "acc_norm_stderr": 0.026160584450140446 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.7160493827160493, + "acc_stderr": 0.025089478523765127, + "acc_norm": 0.7160493827160493, + "acc_norm_stderr": 0.025089478523765127 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.43617021276595747, + "acc_stderr": 0.02958345203628407, + "acc_norm": 0.43617021276595747, + "acc_norm_stderr": 
0.02958345203628407 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.44784876140808344, + "acc_stderr": 0.012700582404768221, + "acc_norm": 0.44784876140808344, + "acc_norm_stderr": 0.012700582404768221 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.6397058823529411, + "acc_stderr": 0.02916312857067073, + "acc_norm": 0.6397058823529411, + "acc_norm_stderr": 0.02916312857067073 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.630718954248366, + "acc_stderr": 0.019524316744866353, + "acc_norm": 0.630718954248366, + "acc_norm_stderr": 0.019524316744866353 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.6272727272727273, + "acc_stderr": 0.04631381319425465, + "acc_norm": 0.6272727272727273, + "acc_norm_stderr": 0.04631381319425465 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.7224489795918367, + "acc_stderr": 0.028666857790274648, + "acc_norm": 0.7224489795918367, + "acc_norm_stderr": 0.028666857790274648 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.835820895522388, + "acc_stderr": 0.026193923544454132, + "acc_norm": 0.835820895522388, + "acc_norm_stderr": 0.026193923544454132 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.83, + "acc_stderr": 0.03775251680686371, + "acc_norm": 0.83, + "acc_norm_stderr": 0.03775251680686371 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.5120481927710844, + "acc_stderr": 0.03891364495835817, + "acc_norm": 0.5120481927710844, + "acc_norm_stderr": 0.03891364495835817 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.8187134502923976, + "acc_stderr": 0.02954774168764004, + "acc_norm": 0.8187134502923976, + "acc_norm_stderr": 0.02954774168764004 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.31946144430844553, + "mc1_stderr": 0.016322644182960498, + "mc2": 0.48469236116826936, + "mc2_stderr": 0.014679613330954367 + }, + "harness|winogrande|5": { + "acc": 0.7837411207576953, + "acc_stderr": 0.01157061486140935 + }, + "harness|gsm8k|5": { + "acc": 0.3502653525398029, + "acc_stderr": 0.013140409455571276 + }, + "all": { + "acc": 0.6168729607530143, + "acc_stderr": 0.03275935687592929, + "acc_norm": 0.6223546289993482, + "acc_norm_stderr": 0.033429838036267936, + "mc1": 0.31946144430844553, + "mc1_stderr": 0.016322644182960498, + "mc2": 0.48469236116826936, + "mc2_stderr": 0.014679613330954367 + } + }, + "versions": { + "all": 0, + "harness|arc:challenge|25": 0, + "harness|gsm8k|5": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + 
"harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "harness|winogrande|5": 0 + }, + "config_tasks": { + "harness|arc:challenge": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM 
Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|arc:challenge|25": { + "hashes": { + "hash_examples": "17b0cae357c0259e", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "9bcd0d1d37471713", + "hash_cont_tokens": "289aa98c400841d8" + }, + "truncated": 0, + "non_truncated": 1172, + "padded": 4670, + "non_padded": 17, + "effective_few_shots": 25.0, + "num_truncated_few_shots": 0 + }, + "harness|hellaswag|10": { + "hashes": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "80b8c6d79740318e", + "hash_cont_tokens": "ac460260c3e6efc9" + }, + "truncated": 0, + "non_truncated": 10042, + "padded": 40101, + "non_padded": 67, + "effective_few_shots": 10.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hashes": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "b813d36287c6556c", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-anatomy|5": { + "hashes": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + 
"hash_input_tokens": "09dc2380497f7a47", + "hash_cont_tokens": "a52a4f60d98cbe5c" + }, + "truncated": 0, + "non_truncated": 135, + "padded": 540, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-astronomy|5": { + "hashes": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "68ca3220b0fdd1f3", + "hash_cont_tokens": "10f7d8eeba97841d" + }, + "truncated": 0, + "non_truncated": 152, + "padded": 608, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-business_ethics|5": { + "hashes": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "bd14ef1320de241e", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hashes": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "d96186ab98017c43", + "hash_cont_tokens": "edef9975ba9165b5" + }, + "truncated": 0, + "non_truncated": 265, + "padded": 1060, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_biology|5": { + "hashes": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "424136b34e95b200", + "hash_cont_tokens": "0aa103ec6602280b" + }, + "truncated": 0, + "non_truncated": 144, + "padded": 576, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_chemistry|5": { + "hashes": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "8dd8b80e336bbe54", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_computer_science|5": { + "hashes": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "145d4cef8ca2261d", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_mathematics|5": { + "hashes": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "561995d32d2b25c4", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_medicine|5": { + "hashes": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "6a258a9d4418599c", + "hash_cont_tokens": "1979021dbc698754" + }, + "truncated": 0, + "non_truncated": 173, + "padded": 692, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_physics|5": { + "hashes": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "fa5e0d5b5f97b66a", + "hash_cont_tokens": "7cf7fe2bab00acbd" + }, + "truncated": 0, + "non_truncated": 102, + "padded": 408, + 
"non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-computer_security|5": { + "hashes": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "07d27397edfae492", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hashes": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "da5e6c3c8eb17da6", + "hash_cont_tokens": "903f64eed2b0d217" + }, + "truncated": 0, + "non_truncated": 235, + "padded": 940, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-econometrics|5": { + "hashes": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "f6ba8e358bdb523e", + "hash_cont_tokens": "721ae6c5302c4bf2" + }, + "truncated": 0, + "non_truncated": 114, + "padded": 456, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hashes": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "b2459da4c5ca8590", + "hash_cont_tokens": "15a738960ed3e587" + }, + "truncated": 0, + "non_truncated": 145, + "padded": 575, + "non_padded": 5, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hashes": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "0b969d9ad706a13a", + "hash_cont_tokens": "c96470462fc71683" + }, + "truncated": 0, + "non_truncated": 378, + "padded": 1512, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-formal_logic|5": { + "hashes": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "02bc3eb5f90da86e", + "hash_cont_tokens": "0e1ce025c9d6ee7e" + }, + "truncated": 0, + "non_truncated": 126, + "padded": 504, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-global_facts|5": { + "hashes": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "3d5106918bcbeb43", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_biology|5": { + "hashes": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "7b089392db2dabbd", + "hash_cont_tokens": "e34d57f7d3c4ca16" + }, + "truncated": 0, + "non_truncated": 310, + "padded": 1240, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hashes": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "ba90b2ffed1c067d", + "hash_cont_tokens": "e8482d44df4b3740" + }, + "truncated": 0, + "non_truncated": 203, + "padded": 812, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + 
"hashes": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "60eeec309ef0717f", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hashes": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "5e5e8bf3808e0ead", + "hash_cont_tokens": "d63e679a49418339" + }, + "truncated": 0, + "non_truncated": 165, + "padded": 656, + "non_padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_geography|5": { + "hashes": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "4da9b741d4e7ea78", + "hash_cont_tokens": "d78483e286d06f1a" + }, + "truncated": 0, + "non_truncated": 198, + "padded": 792, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hashes": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "acb4bc872ac86ed7", + "hash_cont_tokens": "691cdff71ff5fe57" + }, + "truncated": 0, + "non_truncated": 193, + "padded": 772, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hashes": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "840fc6403eb69ab0", + "hash_cont_tokens": "d5ad4c5bdca967ad" + }, + "truncated": 0, + "non_truncated": 390, + "padded": 1560, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hashes": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "3629a7f2cd17faeb", + "hash_cont_tokens": "8f631ca5687dd0d4" + }, + "truncated": 0, + "non_truncated": 270, + "padded": 1080, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hashes": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "6846f684260e3997", + "hash_cont_tokens": "7321048a28451473" + }, + "truncated": 0, + "non_truncated": 238, + "padded": 952, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_physics|5": { + "hashes": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "85aee25d6bdad94a", + "hash_cont_tokens": "bb137581f269861c" + }, + "truncated": 0, + "non_truncated": 151, + "padded": 604, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hashes": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "290b66d6d666a35f", + "hash_cont_tokens": "b455cab2675bd863" + }, + "truncated": 0, + "non_truncated": 545, + "padded": 2180, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hashes": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": 
"4c5c8be5aafac432", + "hash_input_tokens": "a77a7668b437bc82", + "hash_cont_tokens": "1b3196fec7e58037" + }, + "truncated": 0, + "non_truncated": 216, + "padded": 864, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hashes": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "63548c7fa9ba7a78", + "hash_cont_tokens": "a331dedc2aa01b3e" + }, + "truncated": 0, + "non_truncated": 204, + "padded": 816, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hashes": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "83c5da18bfa50812", + "hash_cont_tokens": "d0fbe030b8c8c2bf" + }, + "truncated": 0, + "non_truncated": 237, + "padded": 948, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_aging|5": { + "hashes": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "bebbd11f22006685", + "hash_cont_tokens": "1dd29c3755494850" + }, + "truncated": 0, + "non_truncated": 223, + "padded": 892, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_sexuality|5": { + "hashes": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "7b85ee9b8ee54f4f", + "hash_cont_tokens": "c85573f663c10691" + }, + "truncated": 0, + "non_truncated": 131, + "padded": 524, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-international_law|5": { + "hashes": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "7bfc55ab7065943e", + "hash_cont_tokens": "d263804ba918154f" + }, + "truncated": 0, + "non_truncated": 121, + "padded": 484, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-jurisprudence|5": { + "hashes": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "69573f1675e053c6", + "hash_cont_tokens": "581986691a84ece8" + }, + "truncated": 0, + "non_truncated": 108, + "padded": 432, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hashes": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "552324ef20094bdc", + "hash_cont_tokens": "55a858b28bbda458" + }, + "truncated": 0, + "non_truncated": 163, + "padded": 652, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-machine_learning|5": { + "hashes": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "96449357a7318905", + "hash_cont_tokens": "e99d3d3efd4ac7a3" + }, + "truncated": 0, + "non_truncated": 112, + "padded": 448, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-management|5": { + "hashes": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "3b849249168e3b88", + "hash_cont_tokens": "13d9dc56bca34726" + }, + "truncated": 0, + "non_truncated": 103, + 
"padded": 412, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-marketing|5": { + "hashes": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "af0e186f2756b70d", + "hash_cont_tokens": "2700ea26933916a2" + }, + "truncated": 0, + "non_truncated": 234, + "padded": 936, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-medical_genetics|5": { + "hashes": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "9f6a6de16509b6d9", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-miscellaneous|5": { + "hashes": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "9194406d589f7c10", + "hash_cont_tokens": "7bf4341c79587250" + }, + "truncated": 0, + "non_truncated": 783, + "padded": 3132, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_disputes|5": { + "hashes": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "769486efc74d9f8e", + "hash_cont_tokens": "38a48e9de6976f00" + }, + "truncated": 0, + "non_truncated": 346, + "padded": 1384, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hashes": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "a90fd4dd90959dad", + "hash_cont_tokens": "761c4dc187689d89" + }, + "truncated": 0, + "non_truncated": 895, + "padded": 3580, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-nutrition|5": { + "hashes": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "1a3b843e66efd29b", + "hash_cont_tokens": "65005bd7d6f6012a" + }, + "truncated": 0, + "non_truncated": 306, + "padded": 1224, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-philosophy|5": { + "hashes": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "09820001a3d00013", + "hash_cont_tokens": "0b47934fb6314dec" + }, + "truncated": 0, + "non_truncated": 311, + "padded": 1244, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-prehistory|5": { + "hashes": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "7c4ec364ce2768c7", + "hash_cont_tokens": "3f20acd855ee0a29" + }, + "truncated": 0, + "non_truncated": 324, + "padded": 1296, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_accounting|5": { + "hashes": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "ced0534574d0ae3f", + "hash_cont_tokens": "8f122ba881355d4b" + }, + "truncated": 0, + "non_truncated": 282, + "padded": 1128, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_law|5": { + "hashes": { + 
"hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "bcbdbbde22ec73e3", + "hash_cont_tokens": "90d5df417c4d3fd3" + }, + "truncated": 0, + "non_truncated": 1534, + "padded": 6136, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_medicine|5": { + "hashes": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "c54d753563114d45", + "hash_cont_tokens": "4a2d2988884f7f70" + }, + "truncated": 0, + "non_truncated": 272, + "padded": 1088, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_psychology|5": { + "hashes": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "b75dc55c0e32fa52", + "hash_cont_tokens": "e0a952cb8a9c81de" + }, + "truncated": 0, + "non_truncated": 612, + "padded": 2448, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-public_relations|5": { + "hashes": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "5ccdc8ec8db99622", + "hash_cont_tokens": "1fa77a8dff3922b8" + }, + "truncated": 0, + "non_truncated": 110, + "padded": 440, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-security_studies|5": { + "hashes": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "ca8497342e5b1d57", + "hash_cont_tokens": "81fc9cb3cbdd52db" + }, + "truncated": 0, + "non_truncated": 245, + "padded": 980, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-sociology|5": { + "hashes": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "069c76424fbd3dab", + "hash_cont_tokens": "2a0493252ed2cf43" + }, + "truncated": 0, + "non_truncated": 201, + "padded": 804, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hashes": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "a7e393a626169576", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-virology|5": { + "hashes": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "bf99dc973e3a650d", + "hash_cont_tokens": "5ab892d003b00c98" + }, + "truncated": 0, + "non_truncated": 166, + "padded": 664, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-world_religions|5": { + "hashes": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "1761cfaf21797065", + "hash_cont_tokens": "15a5e5dbdfbb8568" + }, + "truncated": 0, + "non_truncated": 171, + "padded": 684, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|truthfulqa:mc|0": { + "hashes": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "298b43914bbdf4ca", + "hash_cont_tokens": "5a8d4bb398b1c3c0" + }, + 
"truncated": 0, + "non_truncated": 817, + "padded": 9996, + "non_padded": 0, + "effective_few_shots": 0.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "31aa3477d959f771", + "hash_cont_tokens": "618558fb93c0f288" + }, + "truncated": 0, + "non_truncated": 1267, + "padded": 2534, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "6af0ae8cfe684f50", + "hash_cont_tokens": "655285b7bc0c41cd" + }, + "truncated": 0, + "non_truncated": 1319, + "padded": 0, + "non_padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "3b7fa57a057f9415", + "hash_full_prompts": "63615fc50fc9417c", + "hash_input_tokens": "9c04e828ae29cacc", + "hash_cont_tokens": "b5283ba83792d647" + }, + "truncated": 0, + "non_truncated": 28659, + "padded": 113460, + "non_padded": 1412, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/MrNJK/gpt2-xl-sft/results_2023-08-09T09-21-02.216696.json b/eval-results/MrNJK/gpt2-xl-sft/results_2023-08-09T09-21-02.216696.json new file mode 100644 index 0000000000000000000000000000000000000000..1ee9ff5fae3a9fa6b0e5287574db638dcdc19fa1 --- /dev/null +++ b/eval-results/MrNJK/gpt2-xl-sft/results_2023-08-09T09-21-02.216696.json @@ -0,0 +1,1365 @@ +{ + "results": { + "harness|arc:challenge|25": { + "acc": 0.26535836177474403, + "acc_stderr": 0.012902554762313962, + "acc_norm": 0.3003412969283277, + "acc_norm_stderr": 0.013395909309957 + }, + "harness|hellaswag|10": { + "acc": 0.3895638319059948, + "acc_stderr": 0.004866547422355562, + "acc_norm": 0.49173471420035847, + "acc_norm_stderr": 0.004989099611536817 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.26, + "acc_stderr": 0.0440844002276808, + "acc_norm": 0.26, + "acc_norm_stderr": 0.0440844002276808 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.25925925925925924, + "acc_stderr": 0.03785714465066653, + "acc_norm": 0.25925925925925924, + "acc_norm_stderr": 0.03785714465066653 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.17105263157894737, + "acc_stderr": 0.030643607071677088, + "acc_norm": 0.17105263157894737, + "acc_norm_stderr": 0.030643607071677088 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.26, + "acc_stderr": 0.044084400227680794, + "acc_norm": 0.26, + "acc_norm_stderr": 0.044084400227680794 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.27169811320754716, + "acc_stderr": 0.027377706624670713, + "acc_norm": 0.27169811320754716, + "acc_norm_stderr": 0.027377706624670713 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.3402777777777778, + "acc_stderr": 0.03962135573486219, + "acc_norm": 0.3402777777777778, + "acc_norm_stderr": 0.03962135573486219 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.23, + "acc_stderr": 0.04229525846816505, + "acc_norm": 0.23, + "acc_norm_stderr": 0.04229525846816505 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.31, + "acc_stderr": 0.04648231987117316, + "acc_norm": 0.31, + "acc_norm_stderr": 0.04648231987117316 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.31, + "acc_stderr": 0.04648231987117316, + "acc_norm": 0.31, + 
"acc_norm_stderr": 0.04648231987117316 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.24855491329479767, + "acc_stderr": 0.03295304696818317, + "acc_norm": 0.24855491329479767, + "acc_norm_stderr": 0.03295304696818317 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.24509803921568626, + "acc_stderr": 0.04280105837364395, + "acc_norm": 0.24509803921568626, + "acc_norm_stderr": 0.04280105837364395 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.29, + "acc_stderr": 0.04560480215720684, + "acc_norm": 0.29, + "acc_norm_stderr": 0.04560480215720684 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.26382978723404255, + "acc_stderr": 0.028809989854102967, + "acc_norm": 0.26382978723404255, + "acc_norm_stderr": 0.028809989854102967 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.24561403508771928, + "acc_stderr": 0.04049339297748141, + "acc_norm": 0.24561403508771928, + "acc_norm_stderr": 0.04049339297748141 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.3103448275862069, + "acc_stderr": 0.03855289616378947, + "acc_norm": 0.3103448275862069, + "acc_norm_stderr": 0.03855289616378947 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.25396825396825395, + "acc_stderr": 0.022418042891113946, + "acc_norm": 0.25396825396825395, + "acc_norm_stderr": 0.022418042891113946 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.30158730158730157, + "acc_stderr": 0.04104947269903394, + "acc_norm": 0.30158730158730157, + "acc_norm_stderr": 0.04104947269903394 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.14, + "acc_stderr": 0.03487350880197771, + "acc_norm": 0.14, + "acc_norm_stderr": 0.03487350880197771 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.25483870967741934, + "acc_stderr": 0.024790118459332208, + "acc_norm": 0.25483870967741934, + "acc_norm_stderr": 0.024790118459332208 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.22167487684729065, + "acc_stderr": 0.029225575892489607, + "acc_norm": 0.22167487684729065, + "acc_norm_stderr": 0.029225575892489607 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.33, + "acc_stderr": 0.04725815626252605, + "acc_norm": 0.33, + "acc_norm_stderr": 0.04725815626252605 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.30303030303030304, + "acc_stderr": 0.03588624800091709, + "acc_norm": 0.30303030303030304, + "acc_norm_stderr": 0.03588624800091709 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.18686868686868688, + "acc_stderr": 0.02777253333421898, + "acc_norm": 0.18686868686868688, + "acc_norm_stderr": 0.02777253333421898 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.21761658031088082, + "acc_stderr": 0.029778663037752954, + "acc_norm": 0.21761658031088082, + "acc_norm_stderr": 0.029778663037752954 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.2794871794871795, + "acc_stderr": 0.022752388839776823, + "acc_norm": 0.2794871794871795, + "acc_norm_stderr": 0.022752388839776823 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.26296296296296295, + "acc_stderr": 0.02684205787383371, + "acc_norm": 0.26296296296296295, + "acc_norm_stderr": 0.02684205787383371 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.15966386554621848, + "acc_stderr": 0.023793353997528802, + "acc_norm": 0.15966386554621848, + "acc_norm_stderr": 
0.023793353997528802 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.26490066225165565, + "acc_stderr": 0.03603038545360384, + "acc_norm": 0.26490066225165565, + "acc_norm_stderr": 0.03603038545360384 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.23853211009174313, + "acc_stderr": 0.018272575810231857, + "acc_norm": 0.23853211009174313, + "acc_norm_stderr": 0.018272575810231857 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.27314814814814814, + "acc_stderr": 0.030388051301678116, + "acc_norm": 0.27314814814814814, + "acc_norm_stderr": 0.030388051301678116 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.25, + "acc_stderr": 0.03039153369274154, + "acc_norm": 0.25, + "acc_norm_stderr": 0.03039153369274154 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.2616033755274262, + "acc_stderr": 0.028609516716994934, + "acc_norm": 0.2616033755274262, + "acc_norm_stderr": 0.028609516716994934 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.22869955156950672, + "acc_stderr": 0.028188240046929196, + "acc_norm": 0.22869955156950672, + "acc_norm_stderr": 0.028188240046929196 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.2595419847328244, + "acc_stderr": 0.0384487613978527, + "acc_norm": 0.2595419847328244, + "acc_norm_stderr": 0.0384487613978527 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.36363636363636365, + "acc_stderr": 0.04391326286724071, + "acc_norm": 0.36363636363636365, + "acc_norm_stderr": 0.04391326286724071 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.28703703703703703, + "acc_stderr": 0.043733130409147614, + "acc_norm": 0.28703703703703703, + "acc_norm_stderr": 0.043733130409147614 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.27607361963190186, + "acc_stderr": 0.0351238528370505, + "acc_norm": 0.27607361963190186, + "acc_norm_stderr": 0.0351238528370505 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.26785714285714285, + "acc_stderr": 0.04203277291467762, + "acc_norm": 0.26785714285714285, + "acc_norm_stderr": 0.04203277291467762 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.24271844660194175, + "acc_stderr": 0.04245022486384493, + "acc_norm": 0.24271844660194175, + "acc_norm_stderr": 0.04245022486384493 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.2606837606837607, + "acc_stderr": 0.028760348956523414, + "acc_norm": 0.2606837606837607, + "acc_norm_stderr": 0.028760348956523414 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.24, + "acc_stderr": 0.04292346959909283, + "acc_norm": 0.24, + "acc_norm_stderr": 0.04292346959909283 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.2848020434227331, + "acc_stderr": 0.016139174096522595, + "acc_norm": 0.2848020434227331, + "acc_norm_stderr": 0.016139174096522595 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.2976878612716763, + "acc_stderr": 0.024617055388677006, + "acc_norm": 0.2976878612716763, + "acc_norm_stderr": 0.024617055388677006 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.2424581005586592, + "acc_stderr": 0.014333522059217889, + "acc_norm": 0.2424581005586592, + "acc_norm_stderr": 0.014333522059217889 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.2647058823529412, + "acc_stderr": 0.025261691219729484, + "acc_norm": 0.2647058823529412, + "acc_norm_stderr": 0.025261691219729484 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 
0.1864951768488746, + "acc_stderr": 0.022122439772480764, + "acc_norm": 0.1864951768488746, + "acc_norm_stderr": 0.022122439772480764 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.25617283950617287, + "acc_stderr": 0.0242885336377261, + "acc_norm": 0.25617283950617287, + "acc_norm_stderr": 0.0242885336377261 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.2695035460992908, + "acc_stderr": 0.026469036818590638, + "acc_norm": 0.2695035460992908, + "acc_norm_stderr": 0.026469036818590638 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.2620599739243807, + "acc_stderr": 0.011231552795890394, + "acc_norm": 0.2620599739243807, + "acc_norm_stderr": 0.011231552795890394 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.17647058823529413, + "acc_stderr": 0.023157468308559373, + "acc_norm": 0.17647058823529413, + "acc_norm_stderr": 0.023157468308559373 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.272875816993464, + "acc_stderr": 0.01802047414839358, + "acc_norm": 0.272875816993464, + "acc_norm_stderr": 0.01802047414839358 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.33636363636363636, + "acc_stderr": 0.04525393596302505, + "acc_norm": 0.33636363636363636, + "acc_norm_stderr": 0.04525393596302505 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.16326530612244897, + "acc_stderr": 0.023661699177098622, + "acc_norm": 0.16326530612244897, + "acc_norm_stderr": 0.023661699177098622 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.20398009950248755, + "acc_stderr": 0.02849317624532609, + "acc_norm": 0.20398009950248755, + "acc_norm_stderr": 0.02849317624532609 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.22, + "acc_stderr": 0.041633319989322695, + "acc_norm": 0.22, + "acc_norm_stderr": 0.041633319989322695 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.22289156626506024, + "acc_stderr": 0.032400048255946876, + "acc_norm": 0.22289156626506024, + "acc_norm_stderr": 0.032400048255946876 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.26900584795321636, + "acc_stderr": 0.0340105262010409, + "acc_norm": 0.26900584795321636, + "acc_norm_stderr": 0.0340105262010409 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.24112607099143207, + "mc1_stderr": 0.01497482727975233, + "mc2": 0.3878303037389204, + "mc2_stderr": 0.014192799891575568 + }, + "all": { + "acc": 0.2580596743457459, + "acc_stderr": 0.03174089326207604, + "acc_norm": 0.2603843153194399, + "acc_norm_stderr": 0.03175133235931035, + "mc1": 0.24112607099143207, + "mc1_stderr": 0.01497482727975233, + "mc2": 0.3878303037389204, + "mc2_stderr": 0.014192799891575568 + } + }, + "versions": { + "harness|arc:challenge|25": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + 
"harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "all": 0 + }, + "config_general": { + "model_name": "MrNJK/gpt2-xl-sft", + "model_sha": "53250831436460254b7ee9afc4014d4d3156b372", + "model_dtype": "torch.float16", + "lighteval_sha": "da839e70121267a9bf55a0fbea4fb2fae2948337", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null + }, + "config_tasks": { + "harness|arc:challenge": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + 
"harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task" + }, + "summary_tasks": { + "harness|arc:challenge|25": { + "hashes": { + "hash_examples": "17b0cae357c0259e", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "e641be907f06d33d", + "hash_cont_tokens": "d57e59a4130853e0" + }, + "truncated": 1568, + "non-truncated": 3119, + "padded": 3087, + "non-padded": 1600, + "effective_few_shots": 25.0, + "num_truncated_few_shots": 0 + }, + "harness|hellaswag|10": { + "hashes": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "faab28c8a52792fc", + "hash_cont_tokens": "d8973ec3a510d4bc" + }, + "truncated": 1975, + "non-truncated": 38193, + "padded": 38021, + "non-padded": 2147, + "effective_few_shots": 10.0, + "num_truncated_few_shots": 0 + }, + 
"harness|hendrycksTest-abstract_algebra|5": { + "hashes": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "38f6980885e34dfd", + "hash_cont_tokens": "844bd0bf669e8136" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-anatomy|5": { + "hashes": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "3ed9431cd09b2a53", + "hash_cont_tokens": "aa3ffb1a6e4356f5" + }, + "truncated": 0, + "non-truncated": 540, + "padded": 540, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-astronomy|5": { + "hashes": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "a79fd75ecff4dacc", + "hash_cont_tokens": "4a75531cbfd07f95" + }, + "truncated": 0, + "non-truncated": 608, + "padded": 608, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-business_ethics|5": { + "hashes": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "178d5666661bf5e1", + "hash_cont_tokens": "accb7cef363cf18e" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hashes": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "c926698f7ff06973", + "hash_cont_tokens": "cd61f7de0830a75a" + }, + "truncated": 0, + "non-truncated": 1060, + "padded": 1060, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_biology|5": { + "hashes": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "242f772c5e78312a", + "hash_cont_tokens": "16b3626c8a5e3797" + }, + "truncated": 0, + "non-truncated": 576, + "padded": 568, + "non-padded": 8, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_chemistry|5": { + "hashes": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "8502d8627d2d7aad", + "hash_cont_tokens": "844bd0bf669e8136" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_computer_science|5": { + "hashes": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "a0d705ea2c235707", + "hash_cont_tokens": "14362f67beb028ba" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_mathematics|5": { + "hashes": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "ff09ef7f164943cd", + "hash_cont_tokens": "69d91a3fd2e4511e" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_medicine|5": { + "hashes": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + 
"hash_input_tokens": "aca3949388066394", + "hash_cont_tokens": "62bb469d2a319d91" + }, + "truncated": 20, + "non-truncated": 672, + "padded": 660, + "non-padded": 32, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_physics|5": { + "hashes": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "c4240f372187f487", + "hash_cont_tokens": "bf103c9a1f61ec12" + }, + "truncated": 0, + "non-truncated": 408, + "padded": 404, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-computer_security|5": { + "hashes": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "70a866a1c6ae11ae", + "hash_cont_tokens": "844bd0bf669e8136" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hashes": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "29b68a5b3f3afa5f", + "hash_cont_tokens": "ff5ca3d84bb47a0b" + }, + "truncated": 0, + "non-truncated": 940, + "padded": 940, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-econometrics|5": { + "hashes": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "a4a0fc579875cdf9", + "hash_cont_tokens": "4468714c283b10f9" + }, + "truncated": 0, + "non-truncated": 456, + "padded": 456, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hashes": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "e1c0ec634eb17ebd", + "hash_cont_tokens": "35bf6c0c1a7ee403" + }, + "truncated": 0, + "non-truncated": 580, + "padded": 580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hashes": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "542453ad0f99dacf", + "hash_cont_tokens": "8d66c298f1a52c46" + }, + "truncated": 0, + "non-truncated": 1512, + "padded": 1488, + "non-padded": 24, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-formal_logic|5": { + "hashes": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "dacff0458f665ef2", + "hash_cont_tokens": "f23c2d0723d2f830" + }, + "truncated": 0, + "non-truncated": 504, + "padded": 504, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-global_facts|5": { + "hashes": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "61dec75d557c2e93", + "hash_cont_tokens": "844bd0bf669e8136" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_biology|5": { + "hashes": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "d0afdf91820cacc8", + "hash_cont_tokens": "9cf4df701a8e97ca" + }, + "truncated": 0, + "non-truncated": 1240, + "padded": 1240, + 
"non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hashes": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "75cd47b5490da17b", + "hash_cont_tokens": "c3deabee1deab3a3" + }, + "truncated": 0, + "non-truncated": 812, + "padded": 796, + "non-padded": 16, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hashes": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "e369e98a1d0a7424", + "hash_cont_tokens": "120b77ffae8b0591" + }, + "truncated": 16, + "non-truncated": 384, + "padded": 384, + "non-padded": 16, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hashes": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "502376958174bf81", + "hash_cont_tokens": "c4f2565ca36881d5" + }, + "truncated": 660, + "non-truncated": 0, + "padded": 0, + "non-padded": 660, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_geography|5": { + "hashes": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "a4866b51f8a7a60e", + "hash_cont_tokens": "780e569058de22be" + }, + "truncated": 0, + "non-truncated": 792, + "padded": 792, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hashes": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "90f755f89d9fdf5e", + "hash_cont_tokens": "1ba11ec0fba0a4bb" + }, + "truncated": 0, + "non-truncated": 772, + "padded": 772, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hashes": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "fb590ff6d9d11883", + "hash_cont_tokens": "8f5c8baf02161f10" + }, + "truncated": 0, + "non-truncated": 1560, + "padded": 1560, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hashes": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "551dbc75535ad2b8", + "hash_cont_tokens": "822c5217a581c95f" + }, + "truncated": 0, + "non-truncated": 1080, + "padded": 1080, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hashes": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "d86fdf5706ec717c", + "hash_cont_tokens": "985403b262df21a4" + }, + "truncated": 0, + "non-truncated": 952, + "padded": 940, + "non-padded": 12, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_physics|5": { + "hashes": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "a81bca26abd92c41", + "hash_cont_tokens": "a745b56725d20832" + }, + "truncated": 0, + "non-truncated": 604, + "padded": 604, + "non-padded": 0, + "effective_few_shots": 5.0, + 
"num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hashes": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "9c10077b5cda495b", + "hash_cont_tokens": "969464bbd6828346" + }, + "truncated": 0, + "non-truncated": 2180, + "padded": 2180, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hashes": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "da0c215d66d16d3e", + "hash_cont_tokens": "f00cfc03022d559a" + }, + "truncated": 4, + "non-truncated": 860, + "padded": 860, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hashes": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "4885a382517deebf", + "hash_cont_tokens": "eab825cf8fbdd085" + }, + "truncated": 816, + "non-truncated": 0, + "padded": 0, + "non-padded": 816, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hashes": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "c1d80e899c4c8872", + "hash_cont_tokens": "f6dd7cf291429cd9" + }, + "truncated": 948, + "non-truncated": 0, + "padded": 0, + "non-padded": 948, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_aging|5": { + "hashes": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "39da19ee58ce07e6", + "hash_cont_tokens": "38eafdb22e9fca11" + }, + "truncated": 0, + "non-truncated": 892, + "padded": 892, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_sexuality|5": { + "hashes": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "f7e0441ab1c223e0", + "hash_cont_tokens": "11de075f88fc7cd2" + }, + "truncated": 0, + "non-truncated": 524, + "padded": 524, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-international_law|5": { + "hashes": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "119859c5b8103d0b", + "hash_cont_tokens": "ad79993e5e453770" + }, + "truncated": 0, + "non-truncated": 484, + "padded": 484, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-jurisprudence|5": { + "hashes": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "6ec4910e741606cb", + "hash_cont_tokens": "5c77c6f472688075" + }, + "truncated": 0, + "non-truncated": 432, + "padded": 432, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hashes": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "96d8b2554f777e3a", + "hash_cont_tokens": "25a46284b3589e0d" + }, + "truncated": 0, + "non-truncated": 652, + "padded": 636, + "non-padded": 16, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-machine_learning|5": { + "hashes": { + "hash_examples": "88f22a636029ae47", + 
"hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "249811a7d891a411", + "hash_cont_tokens": "5904fef477924132" + }, + "truncated": 0, + "non-truncated": 448, + "padded": 448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-management|5": { + "hashes": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "e54df495ffeb4f92", + "hash_cont_tokens": "d37808f586a9e9b5" + }, + "truncated": 0, + "non-truncated": 412, + "padded": 412, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-marketing|5": { + "hashes": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "e9110fe64f420eb5", + "hash_cont_tokens": "95faf210efa02f90" + }, + "truncated": 0, + "non-truncated": 936, + "padded": 936, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-medical_genetics|5": { + "hashes": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "743df5701590c1c5", + "hash_cont_tokens": "844bd0bf669e8136" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-miscellaneous|5": { + "hashes": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "4a20a40ea36bad2d", + "hash_cont_tokens": "ef1ae838a09a7521" + }, + "truncated": 0, + "non-truncated": 3132, + "padded": 3132, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_disputes|5": { + "hashes": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "10886977e5516586", + "hash_cont_tokens": "201895f1be790f02" + }, + "truncated": 0, + "non-truncated": 1384, + "padded": 1372, + "non-padded": 12, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hashes": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "66f56ab7c3b9d662", + "hash_cont_tokens": "38fadc6201499c0e" + }, + "truncated": 0, + "non-truncated": 3580, + "padded": 3580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-nutrition|5": { + "hashes": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "c05c54560499ea35", + "hash_cont_tokens": "dcdd301556b5df9e" + }, + "truncated": 0, + "non-truncated": 1224, + "padded": 1224, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-philosophy|5": { + "hashes": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "9639c3d92ff98a28", + "hash_cont_tokens": "dddff9925c9b675a" + }, + "truncated": 0, + "non-truncated": 1244, + "padded": 1244, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-prehistory|5": { + "hashes": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "91e98834c3a8d8d9", + "hash_cont_tokens": "67c525ef797587ce" + }, + "truncated": 0, + "non-truncated": 1296, + "padded": 
1296, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_accounting|5": { + "hashes": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "569fa47691c73088", + "hash_cont_tokens": "0d9fbe99f871c5c5" + }, + "truncated": 0, + "non-truncated": 1128, + "padded": 1124, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_law|5": { + "hashes": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "d93d397bd5db1db6", + "hash_cont_tokens": "a2de48df0afbaff7" + }, + "truncated": 6136, + "non-truncated": 0, + "padded": 0, + "non-padded": 6136, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_medicine|5": { + "hashes": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "7f8acbbde12cfb6b", + "hash_cont_tokens": "01ddc79c7e1f2f6d" + }, + "truncated": 1032, + "non-truncated": 56, + "padded": 48, + "non-padded": 1040, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_psychology|5": { + "hashes": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "3aa766c029099569", + "hash_cont_tokens": "fa0fc10c4bdd757c" + }, + "truncated": 0, + "non-truncated": 2448, + "padded": 2448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-public_relations|5": { + "hashes": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "87b924f88832986f", + "hash_cont_tokens": "cf3600a50782c6c5" + }, + "truncated": 0, + "non-truncated": 440, + "padded": 440, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-security_studies|5": { + "hashes": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "1aaa84da588878a6", + "hash_cont_tokens": "6483ae9688e0a0d6" + }, + "truncated": 980, + "non-truncated": 0, + "padded": 0, + "non-padded": 980, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-sociology|5": { + "hashes": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "fb555df6139eb2c8", + "hash_cont_tokens": "9ec52ea7962c54f5" + }, + "truncated": 0, + "non-truncated": 804, + "padded": 800, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hashes": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "56cf1eebb25eccb1", + "hash_cont_tokens": "844bd0bf669e8136" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-virology|5": { + "hashes": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "c6affac16ec860be", + "hash_cont_tokens": "30d4fa4828c5468f" + }, + "truncated": 0, + "non-truncated": 664, + "padded": 664, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-world_religions|5": { + 
"hashes": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "d2c5da5a69a6312e", + "hash_cont_tokens": "bc42db2c568e27d6" + }, + "truncated": 0, + "non-truncated": 684, + "padded": 684, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|truthfulqa:mc|0": { + "hashes": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "21ee2f46c9c3649e", + "hash_cont_tokens": "c8f2395107c4b82b" + }, + "truncated": 0, + "non-truncated": 9996, + "padded": 9996, + "non-padded": 0, + "effective_few_shots": 0.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "d84d18e9a963753d", + "hash_full_prompts": "12b540783521a8e6", + "hash_input_tokens": "18a3fbefef0c4910", + "hash_cont_tokens": "f1f2fb65023f2668" + }, + "total_evaluation_time_secondes": "2108.427546262741", + "truncated": 14155, + "non-truncated": 96864, + "padded": 96540, + "non-padded": 14479, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/MrNJK/gpt2-xl-sft/results_2023-08-11T16-51-50.845308.json b/eval-results/MrNJK/gpt2-xl-sft/results_2023-08-11T16-51-50.845308.json new file mode 100644 index 0000000000000000000000000000000000000000..38632d77ed8607247305d0b76c4bf85b335f2338 --- /dev/null +++ b/eval-results/MrNJK/gpt2-xl-sft/results_2023-08-11T16-51-50.845308.json @@ -0,0 +1,1365 @@ +{ + "results": { + "harness|arc:challenge|25": { + "acc": 0.26535836177474403, + "acc_stderr": 0.012902554762313962, + "acc_norm": 0.3003412969283277, + "acc_norm_stderr": 0.013395909309957 + }, + "harness|hellaswag|10": { + "acc": 0.3895638319059948, + "acc_stderr": 0.004866547422355562, + "acc_norm": 0.49173471420035847, + "acc_norm_stderr": 0.004989099611536817 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.26, + "acc_stderr": 0.0440844002276808, + "acc_norm": 0.26, + "acc_norm_stderr": 0.0440844002276808 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.25925925925925924, + "acc_stderr": 0.03785714465066653, + "acc_norm": 0.25925925925925924, + "acc_norm_stderr": 0.03785714465066653 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.17105263157894737, + "acc_stderr": 0.030643607071677088, + "acc_norm": 0.17105263157894737, + "acc_norm_stderr": 0.030643607071677088 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.26, + "acc_stderr": 0.044084400227680794, + "acc_norm": 0.26, + "acc_norm_stderr": 0.044084400227680794 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.27169811320754716, + "acc_stderr": 0.027377706624670713, + "acc_norm": 0.27169811320754716, + "acc_norm_stderr": 0.027377706624670713 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.3402777777777778, + "acc_stderr": 0.03962135573486219, + "acc_norm": 0.3402777777777778, + "acc_norm_stderr": 0.03962135573486219 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.23, + "acc_stderr": 0.04229525846816505, + "acc_norm": 0.23, + "acc_norm_stderr": 0.04229525846816505 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.31, + "acc_stderr": 0.04648231987117316, + "acc_norm": 0.31, + "acc_norm_stderr": 0.04648231987117316 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.31, + "acc_stderr": 0.04648231987117316, + "acc_norm": 0.31, + "acc_norm_stderr": 0.04648231987117316 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.24855491329479767, 
+ "acc_stderr": 0.03295304696818317, + "acc_norm": 0.24855491329479767, + "acc_norm_stderr": 0.03295304696818317 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.24509803921568626, + "acc_stderr": 0.04280105837364395, + "acc_norm": 0.24509803921568626, + "acc_norm_stderr": 0.04280105837364395 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.29, + "acc_stderr": 0.04560480215720684, + "acc_norm": 0.29, + "acc_norm_stderr": 0.04560480215720684 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.26382978723404255, + "acc_stderr": 0.028809989854102967, + "acc_norm": 0.26382978723404255, + "acc_norm_stderr": 0.028809989854102967 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.24561403508771928, + "acc_stderr": 0.04049339297748141, + "acc_norm": 0.24561403508771928, + "acc_norm_stderr": 0.04049339297748141 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.3103448275862069, + "acc_stderr": 0.03855289616378947, + "acc_norm": 0.3103448275862069, + "acc_norm_stderr": 0.03855289616378947 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.25396825396825395, + "acc_stderr": 0.022418042891113946, + "acc_norm": 0.25396825396825395, + "acc_norm_stderr": 0.022418042891113946 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.30158730158730157, + "acc_stderr": 0.04104947269903394, + "acc_norm": 0.30158730158730157, + "acc_norm_stderr": 0.04104947269903394 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.14, + "acc_stderr": 0.03487350880197771, + "acc_norm": 0.14, + "acc_norm_stderr": 0.03487350880197771 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.25483870967741934, + "acc_stderr": 0.024790118459332208, + "acc_norm": 0.25483870967741934, + "acc_norm_stderr": 0.024790118459332208 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.22167487684729065, + "acc_stderr": 0.029225575892489607, + "acc_norm": 0.22167487684729065, + "acc_norm_stderr": 0.029225575892489607 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.33, + "acc_stderr": 0.04725815626252605, + "acc_norm": 0.33, + "acc_norm_stderr": 0.04725815626252605 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.30303030303030304, + "acc_stderr": 0.03588624800091709, + "acc_norm": 0.30303030303030304, + "acc_norm_stderr": 0.03588624800091709 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.18686868686868688, + "acc_stderr": 0.02777253333421898, + "acc_norm": 0.18686868686868688, + "acc_norm_stderr": 0.02777253333421898 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.21761658031088082, + "acc_stderr": 0.029778663037752954, + "acc_norm": 0.21761658031088082, + "acc_norm_stderr": 0.029778663037752954 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.2794871794871795, + "acc_stderr": 0.022752388839776823, + "acc_norm": 0.2794871794871795, + "acc_norm_stderr": 0.022752388839776823 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.26296296296296295, + "acc_stderr": 0.02684205787383371, + "acc_norm": 0.26296296296296295, + "acc_norm_stderr": 0.02684205787383371 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.15966386554621848, + "acc_stderr": 0.023793353997528802, + "acc_norm": 0.15966386554621848, + "acc_norm_stderr": 0.023793353997528802 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.26490066225165565, + "acc_stderr": 
0.03603038545360384, + "acc_norm": 0.26490066225165565, + "acc_norm_stderr": 0.03603038545360384 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.23853211009174313, + "acc_stderr": 0.018272575810231857, + "acc_norm": 0.23853211009174313, + "acc_norm_stderr": 0.018272575810231857 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.27314814814814814, + "acc_stderr": 0.030388051301678116, + "acc_norm": 0.27314814814814814, + "acc_norm_stderr": 0.030388051301678116 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.25, + "acc_stderr": 0.03039153369274154, + "acc_norm": 0.25, + "acc_norm_stderr": 0.03039153369274154 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.2616033755274262, + "acc_stderr": 0.028609516716994934, + "acc_norm": 0.2616033755274262, + "acc_norm_stderr": 0.028609516716994934 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.22869955156950672, + "acc_stderr": 0.028188240046929196, + "acc_norm": 0.22869955156950672, + "acc_norm_stderr": 0.028188240046929196 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.2595419847328244, + "acc_stderr": 0.0384487613978527, + "acc_norm": 0.2595419847328244, + "acc_norm_stderr": 0.0384487613978527 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.36363636363636365, + "acc_stderr": 0.04391326286724071, + "acc_norm": 0.36363636363636365, + "acc_norm_stderr": 0.04391326286724071 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.28703703703703703, + "acc_stderr": 0.043733130409147614, + "acc_norm": 0.28703703703703703, + "acc_norm_stderr": 0.043733130409147614 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.27607361963190186, + "acc_stderr": 0.0351238528370505, + "acc_norm": 0.27607361963190186, + "acc_norm_stderr": 0.0351238528370505 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.26785714285714285, + "acc_stderr": 0.04203277291467762, + "acc_norm": 0.26785714285714285, + "acc_norm_stderr": 0.04203277291467762 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.24271844660194175, + "acc_stderr": 0.04245022486384493, + "acc_norm": 0.24271844660194175, + "acc_norm_stderr": 0.04245022486384493 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.2606837606837607, + "acc_stderr": 0.028760348956523414, + "acc_norm": 0.2606837606837607, + "acc_norm_stderr": 0.028760348956523414 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.24, + "acc_stderr": 0.04292346959909283, + "acc_norm": 0.24, + "acc_norm_stderr": 0.04292346959909283 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.2848020434227331, + "acc_stderr": 0.016139174096522595, + "acc_norm": 0.2848020434227331, + "acc_norm_stderr": 0.016139174096522595 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.2976878612716763, + "acc_stderr": 0.024617055388677006, + "acc_norm": 0.2976878612716763, + "acc_norm_stderr": 0.024617055388677006 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.2424581005586592, + "acc_stderr": 0.014333522059217889, + "acc_norm": 0.2424581005586592, + "acc_norm_stderr": 0.014333522059217889 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.2647058823529412, + "acc_stderr": 0.025261691219729484, + "acc_norm": 0.2647058823529412, + "acc_norm_stderr": 0.025261691219729484 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.1864951768488746, + "acc_stderr": 0.022122439772480764, + "acc_norm": 0.1864951768488746, + "acc_norm_stderr": 0.022122439772480764 
+ }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.25617283950617287, + "acc_stderr": 0.0242885336377261, + "acc_norm": 0.25617283950617287, + "acc_norm_stderr": 0.0242885336377261 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.2695035460992908, + "acc_stderr": 0.026469036818590638, + "acc_norm": 0.2695035460992908, + "acc_norm_stderr": 0.026469036818590638 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.2620599739243807, + "acc_stderr": 0.011231552795890394, + "acc_norm": 0.2620599739243807, + "acc_norm_stderr": 0.011231552795890394 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.17647058823529413, + "acc_stderr": 0.023157468308559373, + "acc_norm": 0.17647058823529413, + "acc_norm_stderr": 0.023157468308559373 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.272875816993464, + "acc_stderr": 0.01802047414839358, + "acc_norm": 0.272875816993464, + "acc_norm_stderr": 0.01802047414839358 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.33636363636363636, + "acc_stderr": 0.04525393596302505, + "acc_norm": 0.33636363636363636, + "acc_norm_stderr": 0.04525393596302505 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.16326530612244897, + "acc_stderr": 0.023661699177098622, + "acc_norm": 0.16326530612244897, + "acc_norm_stderr": 0.023661699177098622 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.20398009950248755, + "acc_stderr": 0.02849317624532609, + "acc_norm": 0.20398009950248755, + "acc_norm_stderr": 0.02849317624532609 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.22, + "acc_stderr": 0.041633319989322695, + "acc_norm": 0.22, + "acc_norm_stderr": 0.041633319989322695 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.22289156626506024, + "acc_stderr": 0.032400048255946876, + "acc_norm": 0.22289156626506024, + "acc_norm_stderr": 0.032400048255946876 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.26900584795321636, + "acc_stderr": 0.0340105262010409, + "acc_norm": 0.26900584795321636, + "acc_norm_stderr": 0.0340105262010409 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.24112607099143207, + "mc1_stderr": 0.01497482727975233, + "mc2": 0.3878303037389204, + "mc2_stderr": 0.014192799891575568 + }, + "all": { + "acc": 0.2580596743457459, + "acc_stderr": 0.03174089326207604, + "acc_norm": 0.2603843153194399, + "acc_norm_stderr": 0.03175133235931035, + "mc1": 0.24112607099143207, + "mc1_stderr": 0.01497482727975233, + "mc2": 0.3878303037389204, + "mc2_stderr": 0.014192799891575568 + } + }, + "versions": { + "harness|arc:challenge|25": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + 
"harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "all": 0 + }, + "config_general": { + "model_name": "MrNJK/gpt2-xl-sft", + "model_sha": "53250831436460254b7ee9afc4014d4d3156b372", + "model_dtype": "torch.float16", + "lighteval_sha": "efe93333f9f25e7d48cc67a6bf362e6d576f727b", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null + }, + "config_tasks": { + "harness|arc:challenge": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + 
"harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task" + }, + "summary_tasks": { + "harness|arc:challenge|25": { + "hashes": { + "hash_examples": "17b0cae357c0259e", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "e641be907f06d33d", + "hash_cont_tokens": "d57e59a4130853e0" + }, + "truncated": 1568, + "non-truncated": 3119, + "padded": 3087, + "non-padded": 1600, + "effective_few_shots": 25.0, + "num_truncated_few_shots": 0 + }, + "harness|hellaswag|10": { + "hashes": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "faab28c8a52792fc", + "hash_cont_tokens": "d8973ec3a510d4bc" + }, + "truncated": 1975, + "non-truncated": 38193, + "padded": 38021, + "non-padded": 2147, + "effective_few_shots": 10.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hashes": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + 
"hash_input_tokens": "38f6980885e34dfd", + "hash_cont_tokens": "844bd0bf669e8136" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-anatomy|5": { + "hashes": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "3ed9431cd09b2a53", + "hash_cont_tokens": "aa3ffb1a6e4356f5" + }, + "truncated": 0, + "non-truncated": 540, + "padded": 540, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-astronomy|5": { + "hashes": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "a79fd75ecff4dacc", + "hash_cont_tokens": "4a75531cbfd07f95" + }, + "truncated": 0, + "non-truncated": 608, + "padded": 608, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-business_ethics|5": { + "hashes": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "178d5666661bf5e1", + "hash_cont_tokens": "accb7cef363cf18e" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hashes": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "c926698f7ff06973", + "hash_cont_tokens": "cd61f7de0830a75a" + }, + "truncated": 0, + "non-truncated": 1060, + "padded": 1060, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_biology|5": { + "hashes": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "242f772c5e78312a", + "hash_cont_tokens": "16b3626c8a5e3797" + }, + "truncated": 0, + "non-truncated": 576, + "padded": 568, + "non-padded": 8, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_chemistry|5": { + "hashes": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "8502d8627d2d7aad", + "hash_cont_tokens": "844bd0bf669e8136" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_computer_science|5": { + "hashes": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "a0d705ea2c235707", + "hash_cont_tokens": "14362f67beb028ba" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_mathematics|5": { + "hashes": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "ff09ef7f164943cd", + "hash_cont_tokens": "69d91a3fd2e4511e" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_medicine|5": { + "hashes": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "aca3949388066394", + "hash_cont_tokens": "62bb469d2a319d91" + }, + "truncated": 20, + "non-truncated": 672, + "padded": 660, + "non-padded": 
32, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_physics|5": { + "hashes": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "c4240f372187f487", + "hash_cont_tokens": "bf103c9a1f61ec12" + }, + "truncated": 0, + "non-truncated": 408, + "padded": 404, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-computer_security|5": { + "hashes": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "70a866a1c6ae11ae", + "hash_cont_tokens": "844bd0bf669e8136" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hashes": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "29b68a5b3f3afa5f", + "hash_cont_tokens": "ff5ca3d84bb47a0b" + }, + "truncated": 0, + "non-truncated": 940, + "padded": 940, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-econometrics|5": { + "hashes": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "a4a0fc579875cdf9", + "hash_cont_tokens": "4468714c283b10f9" + }, + "truncated": 0, + "non-truncated": 456, + "padded": 456, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hashes": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "e1c0ec634eb17ebd", + "hash_cont_tokens": "35bf6c0c1a7ee403" + }, + "truncated": 0, + "non-truncated": 580, + "padded": 580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hashes": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "542453ad0f99dacf", + "hash_cont_tokens": "8d66c298f1a52c46" + }, + "truncated": 0, + "non-truncated": 1512, + "padded": 1488, + "non-padded": 24, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-formal_logic|5": { + "hashes": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "dacff0458f665ef2", + "hash_cont_tokens": "f23c2d0723d2f830" + }, + "truncated": 0, + "non-truncated": 504, + "padded": 504, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-global_facts|5": { + "hashes": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "61dec75d557c2e93", + "hash_cont_tokens": "844bd0bf669e8136" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_biology|5": { + "hashes": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "d0afdf91820cacc8", + "hash_cont_tokens": "9cf4df701a8e97ca" + }, + "truncated": 0, + "non-truncated": 1240, + "padded": 1240, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hashes": { + 
"hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "75cd47b5490da17b", + "hash_cont_tokens": "c3deabee1deab3a3" + }, + "truncated": 0, + "non-truncated": 812, + "padded": 796, + "non-padded": 16, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hashes": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "e369e98a1d0a7424", + "hash_cont_tokens": "120b77ffae8b0591" + }, + "truncated": 16, + "non-truncated": 384, + "padded": 384, + "non-padded": 16, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hashes": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "502376958174bf81", + "hash_cont_tokens": "c4f2565ca36881d5" + }, + "truncated": 660, + "non-truncated": 0, + "padded": 0, + "non-padded": 660, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_geography|5": { + "hashes": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "a4866b51f8a7a60e", + "hash_cont_tokens": "780e569058de22be" + }, + "truncated": 0, + "non-truncated": 792, + "padded": 792, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hashes": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "90f755f89d9fdf5e", + "hash_cont_tokens": "1ba11ec0fba0a4bb" + }, + "truncated": 0, + "non-truncated": 772, + "padded": 772, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hashes": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "fb590ff6d9d11883", + "hash_cont_tokens": "8f5c8baf02161f10" + }, + "truncated": 0, + "non-truncated": 1560, + "padded": 1560, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hashes": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "551dbc75535ad2b8", + "hash_cont_tokens": "822c5217a581c95f" + }, + "truncated": 0, + "non-truncated": 1080, + "padded": 1080, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hashes": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "d86fdf5706ec717c", + "hash_cont_tokens": "985403b262df21a4" + }, + "truncated": 0, + "non-truncated": 952, + "padded": 940, + "non-padded": 12, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_physics|5": { + "hashes": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "a81bca26abd92c41", + "hash_cont_tokens": "a745b56725d20832" + }, + "truncated": 0, + "non-truncated": 604, + "padded": 604, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hashes": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": 
"d5c76aa40b9dbc43", + "hash_input_tokens": "9c10077b5cda495b", + "hash_cont_tokens": "969464bbd6828346" + }, + "truncated": 0, + "non-truncated": 2180, + "padded": 2180, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hashes": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "da0c215d66d16d3e", + "hash_cont_tokens": "f00cfc03022d559a" + }, + "truncated": 4, + "non-truncated": 860, + "padded": 860, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hashes": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "4885a382517deebf", + "hash_cont_tokens": "eab825cf8fbdd085" + }, + "truncated": 816, + "non-truncated": 0, + "padded": 0, + "non-padded": 816, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hashes": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "c1d80e899c4c8872", + "hash_cont_tokens": "f6dd7cf291429cd9" + }, + "truncated": 948, + "non-truncated": 0, + "padded": 0, + "non-padded": 948, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_aging|5": { + "hashes": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "39da19ee58ce07e6", + "hash_cont_tokens": "38eafdb22e9fca11" + }, + "truncated": 0, + "non-truncated": 892, + "padded": 892, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_sexuality|5": { + "hashes": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "f7e0441ab1c223e0", + "hash_cont_tokens": "11de075f88fc7cd2" + }, + "truncated": 0, + "non-truncated": 524, + "padded": 524, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-international_law|5": { + "hashes": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "119859c5b8103d0b", + "hash_cont_tokens": "ad79993e5e453770" + }, + "truncated": 0, + "non-truncated": 484, + "padded": 484, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-jurisprudence|5": { + "hashes": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "6ec4910e741606cb", + "hash_cont_tokens": "5c77c6f472688075" + }, + "truncated": 0, + "non-truncated": 432, + "padded": 432, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hashes": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "96d8b2554f777e3a", + "hash_cont_tokens": "25a46284b3589e0d" + }, + "truncated": 0, + "non-truncated": 652, + "padded": 636, + "non-padded": 16, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-machine_learning|5": { + "hashes": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "249811a7d891a411", + "hash_cont_tokens": "5904fef477924132" + }, + "truncated": 0, + 
"non-truncated": 448, + "padded": 448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-management|5": { + "hashes": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "e54df495ffeb4f92", + "hash_cont_tokens": "d37808f586a9e9b5" + }, + "truncated": 0, + "non-truncated": 412, + "padded": 412, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-marketing|5": { + "hashes": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "e9110fe64f420eb5", + "hash_cont_tokens": "95faf210efa02f90" + }, + "truncated": 0, + "non-truncated": 936, + "padded": 936, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-medical_genetics|5": { + "hashes": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "743df5701590c1c5", + "hash_cont_tokens": "844bd0bf669e8136" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-miscellaneous|5": { + "hashes": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "4a20a40ea36bad2d", + "hash_cont_tokens": "ef1ae838a09a7521" + }, + "truncated": 0, + "non-truncated": 3132, + "padded": 3132, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_disputes|5": { + "hashes": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "10886977e5516586", + "hash_cont_tokens": "201895f1be790f02" + }, + "truncated": 0, + "non-truncated": 1384, + "padded": 1372, + "non-padded": 12, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hashes": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "66f56ab7c3b9d662", + "hash_cont_tokens": "38fadc6201499c0e" + }, + "truncated": 0, + "non-truncated": 3580, + "padded": 3580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-nutrition|5": { + "hashes": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "c05c54560499ea35", + "hash_cont_tokens": "dcdd301556b5df9e" + }, + "truncated": 0, + "non-truncated": 1224, + "padded": 1224, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-philosophy|5": { + "hashes": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "9639c3d92ff98a28", + "hash_cont_tokens": "dddff9925c9b675a" + }, + "truncated": 0, + "non-truncated": 1244, + "padded": 1244, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-prehistory|5": { + "hashes": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "91e98834c3a8d8d9", + "hash_cont_tokens": "67c525ef797587ce" + }, + "truncated": 0, + "non-truncated": 1296, + "padded": 1296, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_accounting|5": { + 
"hashes": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "569fa47691c73088", + "hash_cont_tokens": "0d9fbe99f871c5c5" + }, + "truncated": 0, + "non-truncated": 1128, + "padded": 1124, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_law|5": { + "hashes": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "d93d397bd5db1db6", + "hash_cont_tokens": "a2de48df0afbaff7" + }, + "truncated": 6136, + "non-truncated": 0, + "padded": 0, + "non-padded": 6136, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_medicine|5": { + "hashes": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "7f8acbbde12cfb6b", + "hash_cont_tokens": "01ddc79c7e1f2f6d" + }, + "truncated": 1032, + "non-truncated": 56, + "padded": 48, + "non-padded": 1040, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_psychology|5": { + "hashes": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "3aa766c029099569", + "hash_cont_tokens": "fa0fc10c4bdd757c" + }, + "truncated": 0, + "non-truncated": 2448, + "padded": 2448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-public_relations|5": { + "hashes": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "87b924f88832986f", + "hash_cont_tokens": "cf3600a50782c6c5" + }, + "truncated": 0, + "non-truncated": 440, + "padded": 440, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-security_studies|5": { + "hashes": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "1aaa84da588878a6", + "hash_cont_tokens": "6483ae9688e0a0d6" + }, + "truncated": 980, + "non-truncated": 0, + "padded": 0, + "non-padded": 980, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-sociology|5": { + "hashes": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "fb555df6139eb2c8", + "hash_cont_tokens": "9ec52ea7962c54f5" + }, + "truncated": 0, + "non-truncated": 804, + "padded": 800, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hashes": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "56cf1eebb25eccb1", + "hash_cont_tokens": "844bd0bf669e8136" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-virology|5": { + "hashes": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "c6affac16ec860be", + "hash_cont_tokens": "30d4fa4828c5468f" + }, + "truncated": 0, + "non-truncated": 664, + "padded": 664, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-world_religions|5": { + "hashes": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "d2c5da5a69a6312e", + 
"hash_cont_tokens": "bc42db2c568e27d6" + }, + "truncated": 0, + "non-truncated": 684, + "padded": 684, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|truthfulqa:mc|0": { + "hashes": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "21ee2f46c9c3649e", + "hash_cont_tokens": "c8f2395107c4b82b" + }, + "truncated": 0, + "non-truncated": 9996, + "padded": 9996, + "non-padded": 0, + "effective_few_shots": 0.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "d84d18e9a963753d", + "hash_full_prompts": "12b540783521a8e6", + "hash_input_tokens": "18a3fbefef0c4910", + "hash_cont_tokens": "f1f2fb65023f2668" + }, + "total_evaluation_time_secondes": "2115.4670326709747", + "truncated": 14155, + "non-truncated": 96864, + "padded": 96540, + "non-padded": 14479, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/MrNJK/gpt2-xl-sft/results_2023-09-17T20-10-52.677287.json b/eval-results/MrNJK/gpt2-xl-sft/results_2023-09-17T20-10-52.677287.json new file mode 100644 index 0000000000000000000000000000000000000000..d55412a56b05bd8e8e01bb14c439804fe48d1e9f --- /dev/null +++ b/eval-results/MrNJK/gpt2-xl-sft/results_2023-09-17T20-10-52.677287.json @@ -0,0 +1,107 @@ +{ + "config_general": { + "model_name": "MrNJK/gpt2-xl-sft", + "model_sha": "9f663fc8007db838eda45282df4b06f581c3c899", + "model_size": "2.91 GB", + "model_dtype": "torch.float16", + "lighteval_sha": "0f318ecf002208468154899217b3ba7c6ae09374", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|drop|3": { + "em": 0.001572986577181208, + "em_stderr": 0.000405845113241776, + "f1": 0.053466862416107416, + "f1_stderr": 0.0012595479932490756 + }, + "harness|gsm8k|5": { + "acc": 0.0075815011372251705, + "acc_stderr": 0.0023892815120772075 + }, + "harness|winogrande|5": { + "acc": 0.5556432517758485, + "acc_stderr": 0.013965196769083553 + }, + "all": { + "em": 0.001572986577181208, + "em_stderr": 0.000405845113241776, + "f1": 0.053466862416107416, + "f1_stderr": 0.0012595479932490756, + "acc": 0.28161237645653686, + "acc_stderr": 0.00817723914058038 + } + }, + "versions": { + "harness|drop|3": 1, + "harness|gsm8k|5": 0, + "harness|winogrande|5": 0, + "all": 0 + }, + "config_tasks": { + "harness|drop": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|drop|3": { + "hashes": { + "hash_examples": "1d27416e8324e9a3", + "hash_full_prompts": "a5513ff9a741b385", + "hash_input_tokens": "fa7a2e45b0104bc4", + "hash_cont_tokens": "71e6a66aa8a3e22f" + }, + "truncated": 9290, + "non-truncated": 246, + "padded": 0, + "non-padded": 9536, + "effective_few_shots": 3.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "52733972d41ebb11", + "hash_cont_tokens": "f5df04897b69c782" + }, + "truncated": 917, + "non-truncated": 402, + "padded": 0, + "non-padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "84cacac1590bb0a5", + "hash_cont_tokens": "64ca3ed9b5dacc6e" + }, + "truncated": 0, + "non-truncated": 2534, 
+ "padded": 2426, + "non-padded": 108, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "9b4d8993161e637d", + "hash_full_prompts": "08215e527b7e60a5", + "hash_input_tokens": "6b3de69e2f87348c", + "hash_cont_tokens": "08512f0909d37e76" + }, + "total_evaluation_time_secondes": "20920.67697763443", + "truncated": 10207, + "non-truncated": 3182, + "padded": 2426, + "non-padded": 10963, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/OpenLemur/lemur-70b-chat-v1/results_2023-08-24T04-11-57.870589.json b/eval-results/OpenLemur/lemur-70b-chat-v1/results_2023-08-24T04-11-57.870589.json new file mode 100644 index 0000000000000000000000000000000000000000..a6611511b45a75467e08f891c756accd011585fd --- /dev/null +++ b/eval-results/OpenLemur/lemur-70b-chat-v1/results_2023-08-24T04-11-57.870589.json @@ -0,0 +1,1365 @@ +{ + "results": { + "harness|arc:challenge|25": { + "acc": 0.6356655290102389, + "acc_stderr": 0.014063260279882419, + "acc_norm": 0.6697952218430034, + "acc_norm_stderr": 0.01374308560376042 + }, + "harness|hellaswag|10": { + "acc": 0.6613224457279426, + "acc_stderr": 0.004722928332834054, + "acc_norm": 0.8572993427604063, + "acc_norm_stderr": 0.0034905249650619163 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.31, + "acc_stderr": 0.04648231987117316, + "acc_norm": 0.31, + "acc_norm_stderr": 0.04648231987117316 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.562962962962963, + "acc_stderr": 0.04284958639753401, + "acc_norm": 0.562962962962963, + "acc_norm_stderr": 0.04284958639753401 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.7236842105263158, + "acc_stderr": 0.03639057569952929, + "acc_norm": 0.7236842105263158, + "acc_norm_stderr": 0.03639057569952929 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.64, + "acc_stderr": 0.04824181513244218, + "acc_norm": 0.64, + "acc_norm_stderr": 0.04824181513244218 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.6943396226415094, + "acc_stderr": 0.028353298073322666, + "acc_norm": 0.6943396226415094, + "acc_norm_stderr": 0.028353298073322666 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.7430555555555556, + "acc_stderr": 0.03653946969442099, + "acc_norm": 0.7430555555555556, + "acc_norm_stderr": 0.03653946969442099 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.51, + "acc_stderr": 0.05024183937956912, + "acc_norm": 0.51, + "acc_norm_stderr": 0.05024183937956912 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.52, + "acc_stderr": 0.050211673156867795, + "acc_norm": 0.52, + "acc_norm_stderr": 0.050211673156867795 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.32, + "acc_stderr": 0.04688261722621504, + "acc_norm": 0.32, + "acc_norm_stderr": 0.04688261722621504 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.6647398843930635, + "acc_stderr": 0.03599586301247077, + "acc_norm": 0.6647398843930635, + "acc_norm_stderr": 0.03599586301247077 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.3627450980392157, + "acc_stderr": 0.04784060704105654, + "acc_norm": 0.3627450980392157, + "acc_norm_stderr": 0.04784060704105654 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.76, + "acc_stderr": 0.042923469599092816, + "acc_norm": 0.76, + "acc_norm_stderr": 0.042923469599092816 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.5872340425531914, + 
"acc_stderr": 0.03218471141400351, + "acc_norm": 0.5872340425531914, + "acc_norm_stderr": 0.03218471141400351 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.37719298245614036, + "acc_stderr": 0.04559522141958216, + "acc_norm": 0.37719298245614036, + "acc_norm_stderr": 0.04559522141958216 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.5517241379310345, + "acc_stderr": 0.04144311810878152, + "acc_norm": 0.5517241379310345, + "acc_norm_stderr": 0.04144311810878152 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.4470899470899471, + "acc_stderr": 0.025606723995777025, + "acc_norm": 0.4470899470899471, + "acc_norm_stderr": 0.025606723995777025 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.5317460317460317, + "acc_stderr": 0.04463112720677173, + "acc_norm": 0.5317460317460317, + "acc_norm_stderr": 0.04463112720677173 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.5, + "acc_stderr": 0.050251890762960605, + "acc_norm": 0.5, + "acc_norm_stderr": 0.050251890762960605 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.7677419354838709, + "acc_stderr": 0.02402225613030823, + "acc_norm": 0.7677419354838709, + "acc_norm_stderr": 0.02402225613030823 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.5073891625615764, + "acc_stderr": 0.035176035403610105, + "acc_norm": 0.5073891625615764, + "acc_norm_stderr": 0.035176035403610105 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.77, + "acc_stderr": 0.04229525846816505, + "acc_norm": 0.77, + "acc_norm_stderr": 0.04229525846816505 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.793939393939394, + "acc_stderr": 0.03158415324047711, + "acc_norm": 0.793939393939394, + "acc_norm_stderr": 0.03158415324047711 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.8484848484848485, + "acc_stderr": 0.025545650426603627, + "acc_norm": 0.8484848484848485, + "acc_norm_stderr": 0.025545650426603627 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.9067357512953368, + "acc_stderr": 0.02098685459328974, + "acc_norm": 0.9067357512953368, + "acc_norm_stderr": 0.02098685459328974 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.6641025641025641, + "acc_stderr": 0.023946724741563973, + "acc_norm": 0.6641025641025641, + "acc_norm_stderr": 0.023946724741563973 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.26296296296296295, + "acc_stderr": 0.02684205787383371, + "acc_norm": 0.26296296296296295, + "acc_norm_stderr": 0.02684205787383371 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.7016806722689075, + "acc_stderr": 0.02971914287634286, + "acc_norm": 0.7016806722689075, + "acc_norm_stderr": 0.02971914287634286 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.4370860927152318, + "acc_stderr": 0.04050035722230636, + "acc_norm": 0.4370860927152318, + "acc_norm_stderr": 0.04050035722230636 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.8366972477064221, + "acc_stderr": 0.015848255806501534, + "acc_norm": 0.8366972477064221, + "acc_norm_stderr": 0.015848255806501534 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.5509259259259259, + "acc_stderr": 0.03392238405321617, + "acc_norm": 0.5509259259259259, + "acc_norm_stderr": 0.03392238405321617 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.8578431372549019, + 
"acc_stderr": 0.024509803921568603, + "acc_norm": 0.8578431372549019, + "acc_norm_stderr": 0.024509803921568603 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.8438818565400844, + "acc_stderr": 0.023627159460318674, + "acc_norm": 0.8438818565400844, + "acc_norm_stderr": 0.023627159460318674 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.7443946188340808, + "acc_stderr": 0.029275891003969927, + "acc_norm": 0.7443946188340808, + "acc_norm_stderr": 0.029275891003969927 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.7786259541984732, + "acc_stderr": 0.03641297081313729, + "acc_norm": 0.7786259541984732, + "acc_norm_stderr": 0.03641297081313729 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.859504132231405, + "acc_stderr": 0.031722334260021585, + "acc_norm": 0.859504132231405, + "acc_norm_stderr": 0.031722334260021585 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.7962962962962963, + "acc_stderr": 0.03893542518824847, + "acc_norm": 0.7962962962962963, + "acc_norm_stderr": 0.03893542518824847 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.8159509202453987, + "acc_stderr": 0.030446777687971726, + "acc_norm": 0.8159509202453987, + "acc_norm_stderr": 0.030446777687971726 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.5357142857142857, + "acc_stderr": 0.04733667890053756, + "acc_norm": 0.5357142857142857, + "acc_norm_stderr": 0.04733667890053756 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.7766990291262136, + "acc_stderr": 0.04123553189891431, + "acc_norm": 0.7766990291262136, + "acc_norm_stderr": 0.04123553189891431 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.8675213675213675, + "acc_stderr": 0.022209309073165616, + "acc_norm": 0.8675213675213675, + "acc_norm_stderr": 0.022209309073165616 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.64, + "acc_stderr": 0.04824181513244218, + "acc_norm": 0.64, + "acc_norm_stderr": 0.04824181513244218 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.8109833971902938, + "acc_stderr": 0.014000791294406999, + "acc_norm": 0.8109833971902938, + "acc_norm_stderr": 0.014000791294406999 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.7572254335260116, + "acc_stderr": 0.023083658586984204, + "acc_norm": 0.7572254335260116, + "acc_norm_stderr": 0.023083658586984204 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.5217877094972067, + "acc_stderr": 0.016706617522176136, + "acc_norm": 0.5217877094972067, + "acc_norm_stderr": 0.016706617522176136 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.7320261437908496, + "acc_stderr": 0.025360603796242557, + "acc_norm": 0.7320261437908496, + "acc_norm_stderr": 0.025360603796242557 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.7234726688102894, + "acc_stderr": 0.02540383297817961, + "acc_norm": 0.7234726688102894, + "acc_norm_stderr": 0.02540383297817961 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.75, + "acc_stderr": 0.02409347123262133, + "acc_norm": 0.75, + "acc_norm_stderr": 0.02409347123262133 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.5035460992907801, + "acc_stderr": 0.02982674915328092, + "acc_norm": 0.5035460992907801, + "acc_norm_stderr": 0.02982674915328092 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.5182529335071708, + "acc_stderr": 0.012761723960595474, + "acc_norm": 0.5182529335071708, + "acc_norm_stderr": 0.012761723960595474 + }, + 
"harness|hendrycksTest-professional_medicine|5": { + "acc": 0.6764705882352942, + "acc_stderr": 0.028418208619406762, + "acc_norm": 0.6764705882352942, + "acc_norm_stderr": 0.028418208619406762 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.704248366013072, + "acc_stderr": 0.018463154132632806, + "acc_norm": 0.704248366013072, + "acc_norm_stderr": 0.018463154132632806 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.7181818181818181, + "acc_stderr": 0.043091187099464585, + "acc_norm": 0.7181818181818181, + "acc_norm_stderr": 0.043091187099464585 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.7755102040816326, + "acc_stderr": 0.0267114305555384, + "acc_norm": 0.7755102040816326, + "acc_norm_stderr": 0.0267114305555384 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.835820895522388, + "acc_stderr": 0.026193923544454132, + "acc_norm": 0.835820895522388, + "acc_norm_stderr": 0.026193923544454132 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.88, + "acc_stderr": 0.03265986323710906, + "acc_norm": 0.88, + "acc_norm_stderr": 0.03265986323710906 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.5240963855421686, + "acc_stderr": 0.03887971849597264, + "acc_norm": 0.5240963855421686, + "acc_norm_stderr": 0.03887971849597264 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.8128654970760234, + "acc_stderr": 0.02991312723236804, + "acc_norm": 0.8128654970760234, + "acc_norm_stderr": 0.02991312723236804 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.37821297429620565, + "mc1_stderr": 0.01697633590754687, + "mc2": 0.5657669903989726, + "mc2_stderr": 0.015172114047787783 + }, + "all": { + "acc": 0.6595282838357237, + "acc_stderr": 0.03239598314274976, + "acc_norm": 0.6634283955283548, + "acc_norm_stderr": 0.03236966826064986, + "mc1": 0.37821297429620565, + "mc1_stderr": 0.01697633590754687, + "mc2": 0.5657669903989726, + "mc2_stderr": 0.015172114047787783 + } + }, + "versions": { + "harness|arc:challenge|25": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + 
"harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "all": 0 + }, + "config_general": { + "model_name": "OpenLemur/lemur-70b-chat-v1", + "model_sha": "33da87ba6d90662c6a00535bd628e5b39b3afd3b", + "model_dtype": "torch.float16", + "lighteval_sha": "e8904188e4f5b1d33bff41c604e7bba0dfa25e14", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null + }, + "config_tasks": { + "harness|arc:challenge": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + 
"harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task" + }, + "summary_tasks": { + "harness|arc:challenge|25": { + "hashes": { + "hash_examples": "17b0cae357c0259e", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "3722289b79076c44", + "hash_cont_tokens": "ede2b335438f08e9" + }, + "truncated": 0, + "non-truncated": 4687, + "padded": 4687, + "non-padded": 0, + "effective_few_shots": 25.0, + "num_truncated_few_shots": 0 + }, + "harness|hellaswag|10": { + "hashes": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "ececd684171f1ef2", + "hash_cont_tokens": "b41cf1ad182d68d5" + }, + "truncated": 0, + "non-truncated": 40168, + "padded": 40113, + "non-padded": 55, + "effective_few_shots": 10.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hashes": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "c54ff61ad0273dd7", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-anatomy|5": { + "hashes": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "be31a1e22aef5f90", + "hash_cont_tokens": "f11971a765cb609f" + }, + "truncated": 0, + "non-truncated": 540, + "padded": 540, + "non-padded": 0, + "effective_few_shots": 5.0, + 
"num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-astronomy|5": { + "hashes": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "277a7b1fad566940", + "hash_cont_tokens": "238bd86950544b29" + }, + "truncated": 0, + "non-truncated": 608, + "padded": 608, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-business_ethics|5": { + "hashes": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "ba552605bc116de5", + "hash_cont_tokens": "f9d6d2a7d7e9a041" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hashes": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "428c7563d0b98ab9", + "hash_cont_tokens": "6af58623d0d5fbcd" + }, + "truncated": 0, + "non-truncated": 1060, + "padded": 1060, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_biology|5": { + "hashes": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "da036601573942e2", + "hash_cont_tokens": "875cde3af7a0ee14" + }, + "truncated": 0, + "non-truncated": 576, + "padded": 576, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_chemistry|5": { + "hashes": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "94e0196d6aded13d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_computer_science|5": { + "hashes": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "6e4d0f4a8d36690b", + "hash_cont_tokens": "1ba0c71186b1505e" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_mathematics|5": { + "hashes": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "614054d17109a25d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_medicine|5": { + "hashes": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "081bb2b524defd1c", + "hash_cont_tokens": "702fb6d82ff0d6ac" + }, + "truncated": 0, + "non-truncated": 692, + "padded": 692, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_physics|5": { + "hashes": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "5421d9a1af86cbd4", + "hash_cont_tokens": "f7b8097afc16a47c" + }, + "truncated": 0, + "non-truncated": 408, + "padded": 408, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-computer_security|5": { + "hashes": { + "hash_examples": "006451eedc0ededb", + 
"hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "5e6b70ecb333cf18", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hashes": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "c2ef11a87264ceed", + "hash_cont_tokens": "aa0e8bc655f2f641" + }, + "truncated": 0, + "non-truncated": 940, + "padded": 940, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-econometrics|5": { + "hashes": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "ecaccd912a4c3978", + "hash_cont_tokens": "a9b1f761089f6acc" + }, + "truncated": 0, + "non-truncated": 456, + "padded": 456, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hashes": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "1590c84291399be8", + "hash_cont_tokens": "2425a3f084a591ef" + }, + "truncated": 0, + "non-truncated": 580, + "padded": 580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hashes": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "3269597f715b0da1", + "hash_cont_tokens": "eb2d5002052b5bc5" + }, + "truncated": 0, + "non-truncated": 1512, + "padded": 1512, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-formal_logic|5": { + "hashes": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "a2800d20f3ab8d7c", + "hash_cont_tokens": "9b30dc19c9b62f60" + }, + "truncated": 0, + "non-truncated": 504, + "padded": 504, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-global_facts|5": { + "hashes": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "94ed44b3772505ad", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_biology|5": { + "hashes": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "24423acb928db768", + "hash_cont_tokens": "74217a4e2868536f" + }, + "truncated": 0, + "non-truncated": 1240, + "padded": 1240, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hashes": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "831ff35c474e5cef", + "hash_cont_tokens": "bf39544be0ebf000" + }, + "truncated": 0, + "non-truncated": 812, + "padded": 812, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hashes": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "a20a96b44dcc5b30", + "hash_cont_tokens": "43570b3948564b64" + }, + 
"truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hashes": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "5002f4ac8b1562ca", + "hash_cont_tokens": "674fc454bdc5ac93" + }, + "truncated": 0, + "non-truncated": 660, + "padded": 656, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_geography|5": { + "hashes": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "7c5547c7da5bc793", + "hash_cont_tokens": "03a5012b916274ea" + }, + "truncated": 0, + "non-truncated": 792, + "padded": 792, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hashes": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "f62991cb6a496b05", + "hash_cont_tokens": "50ab225c2f535210" + }, + "truncated": 0, + "non-truncated": 772, + "padded": 772, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hashes": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "4cef2aff6e3d59ed", + "hash_cont_tokens": "c583432ad27fcfe0" + }, + "truncated": 0, + "non-truncated": 1560, + "padded": 1560, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hashes": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "6e2577ea4082ed2b", + "hash_cont_tokens": "1194078d4e38c984" + }, + "truncated": 0, + "non-truncated": 1080, + "padded": 1080, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hashes": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "c5fc9aeb1079c8e4", + "hash_cont_tokens": "f47f041de50333b9" + }, + "truncated": 0, + "non-truncated": 952, + "padded": 952, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_physics|5": { + "hashes": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "555fc385cffa84ca", + "hash_cont_tokens": "6296151cf7fee15c" + }, + "truncated": 0, + "non-truncated": 604, + "padded": 604, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hashes": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "febd23cbf9973b7f", + "hash_cont_tokens": "a490d3db0ea5935a" + }, + "truncated": 0, + "non-truncated": 2180, + "padded": 2180, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hashes": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "400e55b56ee6fbd7", + "hash_cont_tokens": "6830ef7d0325d7ef" + }, + "truncated": 0, + "non-truncated": 864, + "padded": 864, + "non-padded": 0, + 
"effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hashes": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "c639cce12a46ebad", + "hash_cont_tokens": "cdd0b3dc06d933e5" + }, + "truncated": 0, + "non-truncated": 816, + "padded": 816, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hashes": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "b9762065cce6f3a6", + "hash_cont_tokens": "e0203e3fc1bb0500" + }, + "truncated": 0, + "non-truncated": 948, + "padded": 948, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_aging|5": { + "hashes": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "541a75f071dcf579", + "hash_cont_tokens": "142a4a8a1138a214" + }, + "truncated": 0, + "non-truncated": 892, + "padded": 892, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_sexuality|5": { + "hashes": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "04269e5c5a257dd9", + "hash_cont_tokens": "bc54813e809b796d" + }, + "truncated": 0, + "non-truncated": 524, + "padded": 524, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-international_law|5": { + "hashes": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "d93ba9d9d38e4397", + "hash_cont_tokens": "63435df622d5437b" + }, + "truncated": 0, + "non-truncated": 484, + "padded": 484, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-jurisprudence|5": { + "hashes": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "9eeaccd2698b4f5a", + "hash_cont_tokens": "e3a8cd951b6e3469" + }, + "truncated": 0, + "non-truncated": 432, + "padded": 432, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hashes": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "b4f08f544f2b7576", + "hash_cont_tokens": "5e6ee2ff0404f23c" + }, + "truncated": 0, + "non-truncated": 652, + "padded": 648, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-machine_learning|5": { + "hashes": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "900c2a51f1174b9f", + "hash_cont_tokens": "c81919424db3b267" + }, + "truncated": 0, + "non-truncated": 448, + "padded": 448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-management|5": { + "hashes": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "6b36efb4689c6eca", + "hash_cont_tokens": "a01d6d39a83c4597" + }, + "truncated": 0, + "non-truncated": 412, + "padded": 412, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-marketing|5": { + "hashes": { + "hash_examples": "2668953431f91e96", + 
"hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "2aaac78a0cfed47a", + "hash_cont_tokens": "6aeaed4d823c98aa" + }, + "truncated": 0, + "non-truncated": 936, + "padded": 936, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-medical_genetics|5": { + "hashes": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "886ca823b41c094a", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-miscellaneous|5": { + "hashes": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "72fd71de7675e7d0", + "hash_cont_tokens": "9b0ab02a64603081" + }, + "truncated": 0, + "non-truncated": 3132, + "padded": 3132, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_disputes|5": { + "hashes": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "f3ca0dd8e7a1eb09", + "hash_cont_tokens": "3b8bbe9108e55ce9" + }, + "truncated": 0, + "non-truncated": 1384, + "padded": 1354, + "non-padded": 30, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hashes": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "3e793631e951f23c", + "hash_cont_tokens": "2eae753a177d5460" + }, + "truncated": 0, + "non-truncated": 3580, + "padded": 3580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-nutrition|5": { + "hashes": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "59753c2144ea93af", + "hash_cont_tokens": "29771089bd3c65c6" + }, + "truncated": 0, + "non-truncated": 1224, + "padded": 1224, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-philosophy|5": { + "hashes": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "bd8d3dbed15a8c34", + "hash_cont_tokens": "9f6ff69d23a48783" + }, + "truncated": 0, + "non-truncated": 1244, + "padded": 1244, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-prehistory|5": { + "hashes": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "3573cd87facbb7c5", + "hash_cont_tokens": "a789a13af22308bf" + }, + "truncated": 0, + "non-truncated": 1296, + "padded": 1296, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_accounting|5": { + "hashes": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "17e721bc1a7cbb47", + "hash_cont_tokens": "5129a9cfb30c5239" + }, + "truncated": 0, + "non-truncated": 1128, + "padded": 1128, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_law|5": { + "hashes": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "c9f7583fff66d361", + "hash_cont_tokens": "2e590029ef41fbcd" + }, + "truncated": 0, + 
"non-truncated": 6136, + "padded": 6136, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_medicine|5": { + "hashes": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "40a933f829116f8d", + "hash_cont_tokens": "cd82e108370cece8" + }, + "truncated": 0, + "non-truncated": 1088, + "padded": 1088, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_psychology|5": { + "hashes": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "0dfb73a8eb3f692c", + "hash_cont_tokens": "61ef0c8a87f9c92d" + }, + "truncated": 0, + "non-truncated": 2448, + "padded": 2448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-public_relations|5": { + "hashes": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "1710c6ba4c9f3cbd", + "hash_cont_tokens": "568f585a259965c1" + }, + "truncated": 0, + "non-truncated": 440, + "padded": 440, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-security_studies|5": { + "hashes": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "32a03f1f22a6e103", + "hash_cont_tokens": "d70cfe096d4fb7bd" + }, + "truncated": 0, + "non-truncated": 980, + "padded": 980, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-sociology|5": { + "hashes": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "828999f7624cbe7e", + "hash_cont_tokens": "c3a3bdfd177eed5b" + }, + "truncated": 0, + "non-truncated": 804, + "padded": 804, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hashes": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "42054621e718dbee", + "hash_cont_tokens": "2568d0e8e36fa959" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-virology|5": { + "hashes": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "6c4f0aa4dc859c04", + "hash_cont_tokens": "c178cccd753d9bc5" + }, + "truncated": 0, + "non-truncated": 664, + "padded": 664, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-world_religions|5": { + "hashes": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "6c75d44e092ff24f", + "hash_cont_tokens": "0a3a3ea5ef49d19c" + }, + "truncated": 0, + "non-truncated": 684, + "padded": 684, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|truthfulqa:mc|0": { + "hashes": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "2738d7ed7075faa7", + "hash_cont_tokens": "6d1691881e252df0" + }, + "truncated": 0, + "non-truncated": 9996, + "padded": 9996, + "non-padded": 0, + "effective_few_shots": 0.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + 
"hash_examples": "d84d18e9a963753d", + "hash_full_prompts": "12b540783521a8e6", + "hash_input_tokens": "5c73a7dce6ccf737", + "hash_cont_tokens": "f4b7b7f3a2788768" + }, + "total_evaluation_time_secondes": "44149.28886651993", + "truncated": 0, + "non-truncated": 111019, + "padded": 110926, + "non-padded": 93, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/OpenLemur/lemur-70b-chat-v1/results_2023-09-17T13-31-04.707005.json b/eval-results/OpenLemur/lemur-70b-chat-v1/results_2023-09-17T13-31-04.707005.json new file mode 100644 index 0000000000000000000000000000000000000000..f383ea5c6de7c34a416244efa13f0161fb0b5fe9 --- /dev/null +++ b/eval-results/OpenLemur/lemur-70b-chat-v1/results_2023-09-17T13-31-04.707005.json @@ -0,0 +1,107 @@ +{ + "config_general": { + "model_name": "OpenLemur/lemur-70b-chat-v1", + "model_sha": "8bc10a845b99906fc00a16322023546a83a039a4", + "model_size": "128.64 GB", + "model_dtype": "torch.float16", + "lighteval_sha": "0f318ecf002208468154899217b3ba7c6ae09374", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|drop|3": { + "em": 0.006711409395973154, + "em_stderr": 0.0008361500895152445, + "f1": 0.0739702181208053, + "f1_stderr": 0.001585201628872726 + }, + "harness|gsm8k|5": { + "acc": 0.35329795299469297, + "acc_stderr": 0.013166337192115683 + }, + "harness|winogrande|5": { + "acc": 0.8168902920284136, + "acc_stderr": 0.010869778633168358 + }, + "all": { + "em": 0.006711409395973154, + "em_stderr": 0.0008361500895152445, + "f1": 0.0739702181208053, + "f1_stderr": 0.001585201628872726, + "acc": 0.5850941225115532, + "acc_stderr": 0.01201805791264202 + } + }, + "versions": { + "harness|drop|3": 1, + "harness|gsm8k|5": 0, + "harness|winogrande|5": 0, + "all": 0 + }, + "config_tasks": { + "harness|drop": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|drop|3": { + "hashes": { + "hash_examples": "1d27416e8324e9a3", + "hash_full_prompts": "a5513ff9a741b385", + "hash_input_tokens": "42076f0efbb50aa6", + "hash_cont_tokens": "722868830bb85586" + }, + "truncated": 3, + "non-truncated": 9533, + "padded": 0, + "non-padded": 9536, + "effective_few_shots": 3.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "bda342e47b5099b2", + "hash_cont_tokens": "5cf542397f7cd972" + }, + "truncated": 0, + "non-truncated": 1319, + "padded": 0, + "non-padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "c0bedf98cb040854", + "hash_cont_tokens": "f08975ad6f2d5864" + }, + "truncated": 0, + "non-truncated": 2534, + "padded": 2432, + "non-padded": 102, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "9b4d8993161e637d", + "hash_full_prompts": "08215e527b7e60a5", + "hash_input_tokens": "a12f3e3c934bd78b", + "hash_cont_tokens": "c1e0b2ba226688af" + }, + "total_evaluation_time_secondes": "49917.6910071373", + "truncated": 3, + "non-truncated": 13386, + "padded": 2432, + "non-padded": 10957, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git 
a/eval-results/OpenLemur/lemur-70b-v1/results_2023-08-24T09-13-21.689197.json b/eval-results/OpenLemur/lemur-70b-v1/results_2023-08-24T09-13-21.689197.json new file mode 100644 index 0000000000000000000000000000000000000000..2c3cf27c507e0aabda22c221da7405e880e5c67b --- /dev/null +++ b/eval-results/OpenLemur/lemur-70b-v1/results_2023-08-24T09-13-21.689197.json @@ -0,0 +1,1365 @@ +{ + "results": { + "harness|arc:challenge|25": { + "acc": 0.590443686006826, + "acc_stderr": 0.014370358632472446, + "acc_norm": 0.643344709897611, + "acc_norm_stderr": 0.013998056902620194 + }, + "harness|hellaswag|10": { + "acc": 0.6538538139812786, + "acc_stderr": 0.004747682003491471, + "acc_norm": 0.8571997610037841, + "acc_norm_stderr": 0.0034915398589272883 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.27, + "acc_stderr": 0.044619604333847394, + "acc_norm": 0.27, + "acc_norm_stderr": 0.044619604333847394 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.5259259259259259, + "acc_stderr": 0.04313531696750575, + "acc_norm": 0.5259259259259259, + "acc_norm_stderr": 0.04313531696750575 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.7302631578947368, + "acc_stderr": 0.03611780560284898, + "acc_norm": 0.7302631578947368, + "acc_norm_stderr": 0.03611780560284898 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.7, + "acc_stderr": 0.046056618647183814, + "acc_norm": 0.7, + "acc_norm_stderr": 0.046056618647183814 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.6754716981132075, + "acc_stderr": 0.02881561571343211, + "acc_norm": 0.6754716981132075, + "acc_norm_stderr": 0.02881561571343211 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.7708333333333334, + "acc_stderr": 0.035146974678623884, + "acc_norm": 0.7708333333333334, + "acc_norm_stderr": 0.035146974678623884 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.45, + "acc_stderr": 0.049999999999999996, + "acc_norm": 0.45, + "acc_norm_stderr": 0.049999999999999996 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.47, + "acc_stderr": 0.050161355804659205, + "acc_norm": 0.47, + "acc_norm_stderr": 0.050161355804659205 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.39, + "acc_stderr": 0.04902071300001974, + "acc_norm": 0.39, + "acc_norm_stderr": 0.04902071300001974 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.653179190751445, + "acc_stderr": 0.036291466701596636, + "acc_norm": 0.653179190751445, + "acc_norm_stderr": 0.036291466701596636 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.4019607843137255, + "acc_stderr": 0.048786087144669955, + "acc_norm": 0.4019607843137255, + "acc_norm_stderr": 0.048786087144669955 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.71, + "acc_stderr": 0.045604802157206845, + "acc_norm": 0.71, + "acc_norm_stderr": 0.045604802157206845 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.574468085106383, + "acc_stderr": 0.03232146916224468, + "acc_norm": 0.574468085106383, + "acc_norm_stderr": 0.03232146916224468 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.40350877192982454, + "acc_stderr": 0.04615186962583703, + "acc_norm": 0.40350877192982454, + "acc_norm_stderr": 0.04615186962583703 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.6137931034482759, + "acc_stderr": 0.04057324734419036, + "acc_norm": 0.6137931034482759, + "acc_norm_stderr": 0.04057324734419036 + }, + 
"harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.4444444444444444, + "acc_stderr": 0.025591857761382182, + "acc_norm": 0.4444444444444444, + "acc_norm_stderr": 0.025591857761382182 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.5396825396825397, + "acc_stderr": 0.04458029125470973, + "acc_norm": 0.5396825396825397, + "acc_norm_stderr": 0.04458029125470973 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.47, + "acc_stderr": 0.05016135580465919, + "acc_norm": 0.47, + "acc_norm_stderr": 0.05016135580465919 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.7903225806451613, + "acc_stderr": 0.023157879349083525, + "acc_norm": 0.7903225806451613, + "acc_norm_stderr": 0.023157879349083525 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.4975369458128079, + "acc_stderr": 0.03517945038691063, + "acc_norm": 0.4975369458128079, + "acc_norm_stderr": 0.03517945038691063 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.81, + "acc_stderr": 0.03942772444036624, + "acc_norm": 0.81, + "acc_norm_stderr": 0.03942772444036624 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.793939393939394, + "acc_stderr": 0.03158415324047709, + "acc_norm": 0.793939393939394, + "acc_norm_stderr": 0.03158415324047709 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.8686868686868687, + "acc_stderr": 0.024063156416822523, + "acc_norm": 0.8686868686868687, + "acc_norm_stderr": 0.024063156416822523 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.8911917098445595, + "acc_stderr": 0.022473253332768766, + "acc_norm": 0.8911917098445595, + "acc_norm_stderr": 0.022473253332768766 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.7076923076923077, + "acc_stderr": 0.023060438380857754, + "acc_norm": 0.7076923076923077, + "acc_norm_stderr": 0.023060438380857754 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.3111111111111111, + "acc_stderr": 0.028226446749683515, + "acc_norm": 0.3111111111111111, + "acc_norm_stderr": 0.028226446749683515 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.7184873949579832, + "acc_stderr": 0.029213549414372167, + "acc_norm": 0.7184873949579832, + "acc_norm_stderr": 0.029213549414372167 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.44370860927152317, + "acc_stderr": 0.04056527902281732, + "acc_norm": 0.44370860927152317, + "acc_norm_stderr": 0.04056527902281732 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.8458715596330275, + "acc_stderr": 0.015480826865374303, + "acc_norm": 0.8458715596330275, + "acc_norm_stderr": 0.015480826865374303 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.5740740740740741, + "acc_stderr": 0.03372343271653063, + "acc_norm": 0.5740740740740741, + "acc_norm_stderr": 0.03372343271653063 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.8774509803921569, + "acc_stderr": 0.023015389732458254, + "acc_norm": 0.8774509803921569, + "acc_norm_stderr": 0.023015389732458254 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.8396624472573839, + "acc_stderr": 0.023884380925965665, + "acc_norm": 0.8396624472573839, + "acc_norm_stderr": 0.023884380925965665 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.7937219730941704, + "acc_stderr": 0.02715715047956382, + "acc_norm": 0.7937219730941704, + "acc_norm_stderr": 0.02715715047956382 
+ }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.7633587786259542, + "acc_stderr": 0.03727673575596915, + "acc_norm": 0.7633587786259542, + "acc_norm_stderr": 0.03727673575596915 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.8760330578512396, + "acc_stderr": 0.03008309871603521, + "acc_norm": 0.8760330578512396, + "acc_norm_stderr": 0.03008309871603521 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.75, + "acc_stderr": 0.04186091791394607, + "acc_norm": 0.75, + "acc_norm_stderr": 0.04186091791394607 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.7791411042944786, + "acc_stderr": 0.03259177392742179, + "acc_norm": 0.7791411042944786, + "acc_norm_stderr": 0.03259177392742179 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.5178571428571429, + "acc_stderr": 0.047427623612430116, + "acc_norm": 0.5178571428571429, + "acc_norm_stderr": 0.047427623612430116 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.7961165048543689, + "acc_stderr": 0.039891398595317706, + "acc_norm": 0.7961165048543689, + "acc_norm_stderr": 0.039891398595317706 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.8675213675213675, + "acc_stderr": 0.022209309073165616, + "acc_norm": 0.8675213675213675, + "acc_norm_stderr": 0.022209309073165616 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.62, + "acc_stderr": 0.048783173121456316, + "acc_norm": 0.62, + "acc_norm_stderr": 0.048783173121456316 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.8109833971902938, + "acc_stderr": 0.014000791294406994, + "acc_norm": 0.8109833971902938, + "acc_norm_stderr": 0.014000791294406994 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.7427745664739884, + "acc_stderr": 0.02353292543104428, + "acc_norm": 0.7427745664739884, + "acc_norm_stderr": 0.02353292543104428 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.27262569832402234, + "acc_stderr": 0.014893391735249615, + "acc_norm": 0.27262569832402234, + "acc_norm_stderr": 0.014893391735249615 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.7222222222222222, + "acc_stderr": 0.02564686309713791, + "acc_norm": 0.7222222222222222, + "acc_norm_stderr": 0.02564686309713791 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.729903536977492, + "acc_stderr": 0.025218040373410622, + "acc_norm": 0.729903536977492, + "acc_norm_stderr": 0.025218040373410622 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.7592592592592593, + "acc_stderr": 0.023788583551658544, + "acc_norm": 0.7592592592592593, + "acc_norm_stderr": 0.023788583551658544 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.5, + "acc_stderr": 0.029827499313594685, + "acc_norm": 0.5, + "acc_norm_stderr": 0.029827499313594685 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.5176010430247718, + "acc_stderr": 0.012762321298823648, + "acc_norm": 0.5176010430247718, + "acc_norm_stderr": 0.012762321298823648 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.6838235294117647, + "acc_stderr": 0.028245687391462923, + "acc_norm": 0.6838235294117647, + "acc_norm_stderr": 0.028245687391462923 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.6862745098039216, + "acc_stderr": 0.01877168389352818, + "acc_norm": 0.6862745098039216, + "acc_norm_stderr": 0.01877168389352818 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.6818181818181818, + "acc_stderr": 0.04461272175910509, + "acc_norm": 
0.6818181818181818, + "acc_norm_stderr": 0.04461272175910509 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.7836734693877551, + "acc_stderr": 0.026358916334904017, + "acc_norm": 0.7836734693877551, + "acc_norm_stderr": 0.026358916334904017 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.845771144278607, + "acc_stderr": 0.025538433368578334, + "acc_norm": 0.845771144278607, + "acc_norm_stderr": 0.025538433368578334 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.92, + "acc_stderr": 0.0272659924344291, + "acc_norm": 0.92, + "acc_norm_stderr": 0.0272659924344291 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.5240963855421686, + "acc_stderr": 0.03887971849597264, + "acc_norm": 0.5240963855421686, + "acc_norm_stderr": 0.03887971849597264 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.8245614035087719, + "acc_stderr": 0.02917088550072767, + "acc_norm": 0.8245614035087719, + "acc_norm_stderr": 0.02917088550072767 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.29498164014687883, + "mc1_stderr": 0.015964400965589667, + "mc2": 0.44779995636228015, + "mc2_stderr": 0.014856454376420808 + }, + "all": { + "acc": 0.6572322507554619, + "acc_stderr": 0.03222212694549847, + "acc_norm": 0.6615754197539924, + "acc_norm_stderr": 0.03219452620186429, + "mc1": 0.29498164014687883, + "mc1_stderr": 0.015964400965589667, + "mc2": 0.44779995636228015, + "mc2_stderr": 0.014856454376420808 + } + }, + "versions": { + "harness|arc:challenge|25": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + 
"harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "all": 0 + }, + "config_general": { + "model_name": "OpenLemur/lemur-70b-v1", + "model_sha": "74432ae16ef50207fe17fb88b2f1c1d32ef3b481", + "model_dtype": "torch.float16", + "lighteval_sha": "e8904188e4f5b1d33bff41c604e7bba0dfa25e14", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null + }, + "config_tasks": { + "harness|arc:challenge": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + 
"harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task" + }, + "summary_tasks": { + "harness|arc:challenge|25": { + "hashes": { + "hash_examples": "17b0cae357c0259e", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "61571bf68d6d89aa", + "hash_cont_tokens": "ede2b335438f08e9" + }, + "truncated": 0, + "non-truncated": 4687, + "padded": 4687, + "non-padded": 0, + "effective_few_shots": 25.0, + "num_truncated_few_shots": 0 + }, + "harness|hellaswag|10": { + "hashes": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "29906669b1c7054a", + "hash_cont_tokens": "b41cf1ad182d68d5" + }, + "truncated": 0, + "non-truncated": 40168, + "padded": 40113, + "non-padded": 55, + "effective_few_shots": 10.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hashes": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "c54ff61ad0273dd7", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-anatomy|5": { + "hashes": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "be31a1e22aef5f90", + "hash_cont_tokens": "f11971a765cb609f" + }, + "truncated": 0, + "non-truncated": 540, + "padded": 540, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-astronomy|5": { + "hashes": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "277a7b1fad566940", + "hash_cont_tokens": "238bd86950544b29" + }, + "truncated": 0, + "non-truncated": 608, + "padded": 608, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-business_ethics|5": { + "hashes": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": 
"ba552605bc116de5", + "hash_cont_tokens": "f9d6d2a7d7e9a041" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hashes": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "428c7563d0b98ab9", + "hash_cont_tokens": "6af58623d0d5fbcd" + }, + "truncated": 0, + "non-truncated": 1060, + "padded": 1060, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_biology|5": { + "hashes": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "da036601573942e2", + "hash_cont_tokens": "875cde3af7a0ee14" + }, + "truncated": 0, + "non-truncated": 576, + "padded": 576, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_chemistry|5": { + "hashes": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "94e0196d6aded13d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_computer_science|5": { + "hashes": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "6e4d0f4a8d36690b", + "hash_cont_tokens": "1ba0c71186b1505e" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_mathematics|5": { + "hashes": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "614054d17109a25d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_medicine|5": { + "hashes": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "1d633b3cc0524ba8", + "hash_cont_tokens": "702fb6d82ff0d6ac" + }, + "truncated": 0, + "non-truncated": 692, + "padded": 692, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_physics|5": { + "hashes": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "5421d9a1af86cbd4", + "hash_cont_tokens": "f7b8097afc16a47c" + }, + "truncated": 0, + "non-truncated": 408, + "padded": 408, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-computer_security|5": { + "hashes": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "5e6b70ecb333cf18", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hashes": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "c2ef11a87264ceed", + "hash_cont_tokens": "aa0e8bc655f2f641" + }, + "truncated": 0, + "non-truncated": 940, + "padded": 940, + "non-padded": 0, 
+ "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-econometrics|5": { + "hashes": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "ecaccd912a4c3978", + "hash_cont_tokens": "a9b1f761089f6acc" + }, + "truncated": 0, + "non-truncated": 456, + "padded": 456, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hashes": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "1590c84291399be8", + "hash_cont_tokens": "2425a3f084a591ef" + }, + "truncated": 0, + "non-truncated": 580, + "padded": 580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hashes": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "3269597f715b0da1", + "hash_cont_tokens": "eb2d5002052b5bc5" + }, + "truncated": 0, + "non-truncated": 1512, + "padded": 1512, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-formal_logic|5": { + "hashes": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "a2800d20f3ab8d7c", + "hash_cont_tokens": "9b30dc19c9b62f60" + }, + "truncated": 0, + "non-truncated": 504, + "padded": 504, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-global_facts|5": { + "hashes": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "94ed44b3772505ad", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_biology|5": { + "hashes": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "24423acb928db768", + "hash_cont_tokens": "74217a4e2868536f" + }, + "truncated": 0, + "non-truncated": 1240, + "padded": 1240, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hashes": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "831ff35c474e5cef", + "hash_cont_tokens": "bf39544be0ebf000" + }, + "truncated": 0, + "non-truncated": 812, + "padded": 812, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hashes": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "8c34e0f2bda77358", + "hash_cont_tokens": "43570b3948564b64" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hashes": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "f1f73dd687da18d7", + "hash_cont_tokens": "674fc454bdc5ac93" + }, + "truncated": 660, + "non-truncated": 0, + "padded": 0, + "non-padded": 660, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_geography|5": { + 
"hashes": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "7c5547c7da5bc793", + "hash_cont_tokens": "03a5012b916274ea" + }, + "truncated": 0, + "non-truncated": 792, + "padded": 792, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hashes": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "f62991cb6a496b05", + "hash_cont_tokens": "50ab225c2f535210" + }, + "truncated": 0, + "non-truncated": 772, + "padded": 772, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hashes": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "4cef2aff6e3d59ed", + "hash_cont_tokens": "c583432ad27fcfe0" + }, + "truncated": 0, + "non-truncated": 1560, + "padded": 1560, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hashes": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "6e2577ea4082ed2b", + "hash_cont_tokens": "1194078d4e38c984" + }, + "truncated": 0, + "non-truncated": 1080, + "padded": 1080, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hashes": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "c5fc9aeb1079c8e4", + "hash_cont_tokens": "f47f041de50333b9" + }, + "truncated": 0, + "non-truncated": 952, + "padded": 952, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_physics|5": { + "hashes": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "555fc385cffa84ca", + "hash_cont_tokens": "6296151cf7fee15c" + }, + "truncated": 0, + "non-truncated": 604, + "padded": 604, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hashes": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "febd23cbf9973b7f", + "hash_cont_tokens": "a490d3db0ea5935a" + }, + "truncated": 0, + "non-truncated": 2180, + "padded": 2180, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hashes": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "424b02981230ee83", + "hash_cont_tokens": "6830ef7d0325d7ef" + }, + "truncated": 0, + "non-truncated": 864, + "padded": 864, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hashes": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "50c9ff438c85a69e", + "hash_cont_tokens": "cdd0b3dc06d933e5" + }, + "truncated": 816, + "non-truncated": 0, + "padded": 0, + "non-padded": 816, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hashes": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": 
"11845057459afd72", + "hash_input_tokens": "054824cc474caef5", + "hash_cont_tokens": "e0203e3fc1bb0500" + }, + "truncated": 8, + "non-truncated": 940, + "padded": 940, + "non-padded": 8, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_aging|5": { + "hashes": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "541a75f071dcf579", + "hash_cont_tokens": "142a4a8a1138a214" + }, + "truncated": 0, + "non-truncated": 892, + "padded": 892, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_sexuality|5": { + "hashes": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "04269e5c5a257dd9", + "hash_cont_tokens": "bc54813e809b796d" + }, + "truncated": 0, + "non-truncated": 524, + "padded": 524, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-international_law|5": { + "hashes": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "d93ba9d9d38e4397", + "hash_cont_tokens": "63435df622d5437b" + }, + "truncated": 0, + "non-truncated": 484, + "padded": 484, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-jurisprudence|5": { + "hashes": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "9eeaccd2698b4f5a", + "hash_cont_tokens": "e3a8cd951b6e3469" + }, + "truncated": 0, + "non-truncated": 432, + "padded": 432, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hashes": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "b4f08f544f2b7576", + "hash_cont_tokens": "5e6ee2ff0404f23c" + }, + "truncated": 0, + "non-truncated": 652, + "padded": 648, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-machine_learning|5": { + "hashes": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "900c2a51f1174b9f", + "hash_cont_tokens": "c81919424db3b267" + }, + "truncated": 0, + "non-truncated": 448, + "padded": 448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-management|5": { + "hashes": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "6b36efb4689c6eca", + "hash_cont_tokens": "a01d6d39a83c4597" + }, + "truncated": 0, + "non-truncated": 412, + "padded": 412, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-marketing|5": { + "hashes": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "2aaac78a0cfed47a", + "hash_cont_tokens": "6aeaed4d823c98aa" + }, + "truncated": 0, + "non-truncated": 936, + "padded": 936, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-medical_genetics|5": { + "hashes": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "886ca823b41c094a", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + 
"non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-miscellaneous|5": { + "hashes": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "72fd71de7675e7d0", + "hash_cont_tokens": "9b0ab02a64603081" + }, + "truncated": 0, + "non-truncated": 3132, + "padded": 3132, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_disputes|5": { + "hashes": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "f3ca0dd8e7a1eb09", + "hash_cont_tokens": "3b8bbe9108e55ce9" + }, + "truncated": 0, + "non-truncated": 1384, + "padded": 1354, + "non-padded": 30, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hashes": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "3e793631e951f23c", + "hash_cont_tokens": "2eae753a177d5460" + }, + "truncated": 0, + "non-truncated": 3580, + "padded": 3580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-nutrition|5": { + "hashes": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "59753c2144ea93af", + "hash_cont_tokens": "29771089bd3c65c6" + }, + "truncated": 0, + "non-truncated": 1224, + "padded": 1224, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-philosophy|5": { + "hashes": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "bd8d3dbed15a8c34", + "hash_cont_tokens": "9f6ff69d23a48783" + }, + "truncated": 0, + "non-truncated": 1244, + "padded": 1244, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-prehistory|5": { + "hashes": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "3573cd87facbb7c5", + "hash_cont_tokens": "a789a13af22308bf" + }, + "truncated": 0, + "non-truncated": 1296, + "padded": 1296, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_accounting|5": { + "hashes": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "17e721bc1a7cbb47", + "hash_cont_tokens": "5129a9cfb30c5239" + }, + "truncated": 0, + "non-truncated": 1128, + "padded": 1128, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_law|5": { + "hashes": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "9178e10bd0763ec4", + "hash_cont_tokens": "2e590029ef41fbcd" + }, + "truncated": 604, + "non-truncated": 5532, + "padded": 5524, + "non-padded": 612, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_medicine|5": { + "hashes": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "f5a22012a54f70ea", + "hash_cont_tokens": "cd82e108370cece8" + }, + "truncated": 0, + "non-truncated": 1088, + "padded": 1088, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_psychology|5": { + "hashes": { 
+ "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "0dfb73a8eb3f692c", + "hash_cont_tokens": "61ef0c8a87f9c92d" + }, + "truncated": 0, + "non-truncated": 2448, + "padded": 2448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-public_relations|5": { + "hashes": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "1710c6ba4c9f3cbd", + "hash_cont_tokens": "568f585a259965c1" + }, + "truncated": 0, + "non-truncated": 440, + "padded": 440, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-security_studies|5": { + "hashes": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "d49711415961ced7", + "hash_cont_tokens": "d70cfe096d4fb7bd" + }, + "truncated": 0, + "non-truncated": 980, + "padded": 980, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-sociology|5": { + "hashes": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "828999f7624cbe7e", + "hash_cont_tokens": "c3a3bdfd177eed5b" + }, + "truncated": 0, + "non-truncated": 804, + "padded": 804, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hashes": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "42054621e718dbee", + "hash_cont_tokens": "2568d0e8e36fa959" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-virology|5": { + "hashes": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "6c4f0aa4dc859c04", + "hash_cont_tokens": "c178cccd753d9bc5" + }, + "truncated": 0, + "non-truncated": 664, + "padded": 664, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-world_religions|5": { + "hashes": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "6c75d44e092ff24f", + "hash_cont_tokens": "0a3a3ea5ef49d19c" + }, + "truncated": 0, + "non-truncated": 684, + "padded": 684, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|truthfulqa:mc|0": { + "hashes": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "2738d7ed7075faa7", + "hash_cont_tokens": "6d1691881e252df0" + }, + "truncated": 0, + "non-truncated": 9996, + "padded": 9996, + "non-padded": 0, + "effective_few_shots": 0.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "d84d18e9a963753d", + "hash_full_prompts": "12b540783521a8e6", + "hash_input_tokens": "6fecf578c508db6a", + "hash_cont_tokens": "f4b7b7f3a2788768" + }, + "total_evaluation_time_secondes": "26301.346576690674", + "truncated": 2088, + "non-truncated": 108931, + "padded": 108834, + "non-padded": 2185, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/OpenLemur/lemur-70b-v1/results_2023-09-18T14-30-20.780139.json b/eval-results/OpenLemur/lemur-70b-v1/results_2023-09-18T14-30-20.780139.json new file mode 100644 index 
0000000000000000000000000000000000000000..98f0ca0df64a301883f2a440b4a0f0df951c2131 --- /dev/null +++ b/eval-results/OpenLemur/lemur-70b-v1/results_2023-09-18T14-30-20.780139.json @@ -0,0 +1,107 @@ +{ + "config_general": { + "model_name": "OpenLemur/lemur-70b-v1", + "model_sha": "8bb4e71ccbd3a8cce8c10c4a56bbe253d91a5f85", + "model_size": "128.64 GB", + "model_dtype": "torch.float16", + "lighteval_sha": "0f318ecf002208468154899217b3ba7c6ae09374", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|drop|3": { + "em": 0.0026216442953020135, + "em_stderr": 0.0005236685642965847, + "f1": 0.057400377516778664, + "f1_stderr": 0.001295669399059679 + }, + "harness|gsm8k|5": { + "acc": 0.287338893100834, + "acc_stderr": 0.01246467706010708 + }, + "harness|winogrande|5": { + "acc": 0.8303078137332282, + "acc_stderr": 0.010549542647363692 + }, + "all": { + "em": 0.0026216442953020135, + "em_stderr": 0.0005236685642965847, + "f1": 0.057400377516778664, + "f1_stderr": 0.001295669399059679, + "acc": 0.558823353417031, + "acc_stderr": 0.011507109853735386 + } + }, + "versions": { + "harness|drop|3": 1, + "harness|gsm8k|5": 0, + "harness|winogrande|5": 0, + "all": 0 + }, + "config_tasks": { + "harness|drop": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|drop|3": { + "hashes": { + "hash_examples": "1d27416e8324e9a3", + "hash_full_prompts": "a5513ff9a741b385", + "hash_input_tokens": "42076f0efbb50aa6", + "hash_cont_tokens": "31f8bf954759ab38" + }, + "truncated": 3, + "non-truncated": 9533, + "padded": 0, + "non-padded": 9536, + "effective_few_shots": 3.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "bda342e47b5099b2", + "hash_cont_tokens": "5e41fb12854028c2" + }, + "truncated": 0, + "non-truncated": 1319, + "padded": 0, + "non-padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "c0bedf98cb040854", + "hash_cont_tokens": "f08975ad6f2d5864" + }, + "truncated": 0, + "non-truncated": 2534, + "padded": 2432, + "non-padded": 102, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "9b4d8993161e637d", + "hash_full_prompts": "08215e527b7e60a5", + "hash_input_tokens": "a12f3e3c934bd78b", + "hash_cont_tokens": "984f4f30c2bccb19" + }, + "total_evaluation_time_secondes": "52274.17562413216", + "truncated": 3, + "non-truncated": 13386, + "padded": 2432, + "non-padded": 10957, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/PulsarAI/2x-LoRA-Assemble-13B/results_2023-10-03T14-58-01.778055.json b/eval-results/PulsarAI/2x-LoRA-Assemble-13B/results_2023-10-03T14-58-01.778055.json new file mode 100644 index 0000000000000000000000000000000000000000..fd912458000c809b56b6ee89752135b4475db394 --- /dev/null +++ b/eval-results/PulsarAI/2x-LoRA-Assemble-13B/results_2023-10-03T14-58-01.778055.json @@ -0,0 +1,1367 @@ +{ + "config_general": { + "model_name": "PulsarAI/2x-LoRA-Assemble-13B", + "model_sha": "1aca45d37eade21eb381aaefc9245b58ec3b7b26", + "model_size": "24.32 GB", + "model_dtype": "torch.float16", + "lighteval_sha": 
"0f318ecf002208468154899217b3ba7c6ae09374", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|arc:challenge|25": { + "acc": 0.6040955631399317, + "acc_stderr": 0.014291228393536587, + "acc_norm": 0.636518771331058, + "acc_norm_stderr": 0.014056207319068283 + }, + "harness|hellaswag|10": { + "acc": 0.6358295160326628, + "acc_stderr": 0.004802133511654241, + "acc_norm": 0.8346942840071699, + "acc_norm_stderr": 0.003706970856410953 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.3, + "acc_stderr": 0.046056618647183814, + "acc_norm": 0.3, + "acc_norm_stderr": 0.046056618647183814 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.5185185185185185, + "acc_stderr": 0.043163785995113245, + "acc_norm": 0.5185185185185185, + "acc_norm_stderr": 0.043163785995113245 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.618421052631579, + "acc_stderr": 0.03953173377749194, + "acc_norm": 0.618421052631579, + "acc_norm_stderr": 0.03953173377749194 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.57, + "acc_stderr": 0.049756985195624284, + "acc_norm": 0.57, + "acc_norm_stderr": 0.049756985195624284 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.6188679245283019, + "acc_stderr": 0.029890609686286637, + "acc_norm": 0.6188679245283019, + "acc_norm_stderr": 0.029890609686286637 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.6597222222222222, + "acc_stderr": 0.039621355734862175, + "acc_norm": 0.6597222222222222, + "acc_norm_stderr": 0.039621355734862175 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.41, + "acc_stderr": 0.049431107042371025, + "acc_norm": 0.41, + "acc_norm_stderr": 0.049431107042371025 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.54, + "acc_stderr": 0.05009082659620333, + "acc_norm": 0.54, + "acc_norm_stderr": 0.05009082659620333 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.35, + "acc_stderr": 0.047937248544110196, + "acc_norm": 0.35, + "acc_norm_stderr": 0.047937248544110196 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.6011560693641619, + "acc_stderr": 0.037336266553835096, + "acc_norm": 0.6011560693641619, + "acc_norm_stderr": 0.037336266553835096 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.3333333333333333, + "acc_stderr": 0.04690650298201942, + "acc_norm": 0.3333333333333333, + "acc_norm_stderr": 0.04690650298201942 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.69, + "acc_stderr": 0.04648231987117316, + "acc_norm": 0.69, + "acc_norm_stderr": 0.04648231987117316 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.4978723404255319, + "acc_stderr": 0.03268572658667492, + "acc_norm": 0.4978723404255319, + "acc_norm_stderr": 0.03268572658667492 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.35964912280701755, + "acc_stderr": 0.04514496132873634, + "acc_norm": 0.35964912280701755, + "acc_norm_stderr": 0.04514496132873634 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.5724137931034483, + "acc_stderr": 0.04122737111370333, + "acc_norm": 0.5724137931034483, + "acc_norm_stderr": 0.04122737111370333 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.3492063492063492, + "acc_stderr": 0.024552292209342658, + "acc_norm": 0.3492063492063492, + "acc_norm_stderr": 0.024552292209342658 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 
0.36507936507936506, + "acc_stderr": 0.043062412591271526, + "acc_norm": 0.36507936507936506, + "acc_norm_stderr": 0.043062412591271526 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.39, + "acc_stderr": 0.04902071300001975, + "acc_norm": 0.39, + "acc_norm_stderr": 0.04902071300001975 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.667741935483871, + "acc_stderr": 0.0267955608481228, + "acc_norm": 0.667741935483871, + "acc_norm_stderr": 0.0267955608481228 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.4876847290640394, + "acc_stderr": 0.035169204442208966, + "acc_norm": 0.4876847290640394, + "acc_norm_stderr": 0.035169204442208966 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.62, + "acc_stderr": 0.04878317312145632, + "acc_norm": 0.62, + "acc_norm_stderr": 0.04878317312145632 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.7272727272727273, + "acc_stderr": 0.0347769116216366, + "acc_norm": 0.7272727272727273, + "acc_norm_stderr": 0.0347769116216366 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.7777777777777778, + "acc_stderr": 0.029620227874790482, + "acc_norm": 0.7777777777777778, + "acc_norm_stderr": 0.029620227874790482 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.8808290155440415, + "acc_stderr": 0.02338193534812143, + "acc_norm": 0.8808290155440415, + "acc_norm_stderr": 0.02338193534812143 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.6205128205128205, + "acc_stderr": 0.02460362692409742, + "acc_norm": 0.6205128205128205, + "acc_norm_stderr": 0.02460362692409742 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.32222222222222224, + "acc_stderr": 0.028493465091028597, + "acc_norm": 0.32222222222222224, + "acc_norm_stderr": 0.028493465091028597 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.5966386554621849, + "acc_stderr": 0.031866081214088314, + "acc_norm": 0.5966386554621849, + "acc_norm_stderr": 0.031866081214088314 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.304635761589404, + "acc_stderr": 0.03757949922943342, + "acc_norm": 0.304635761589404, + "acc_norm_stderr": 0.03757949922943342 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.7944954128440367, + "acc_stderr": 0.01732435232501602, + "acc_norm": 0.7944954128440367, + "acc_norm_stderr": 0.01732435232501602 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.42592592592592593, + "acc_stderr": 0.033723432716530645, + "acc_norm": 0.42592592592592593, + "acc_norm_stderr": 0.033723432716530645 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.8333333333333334, + "acc_stderr": 0.02615686752393104, + "acc_norm": 0.8333333333333334, + "acc_norm_stderr": 0.02615686752393104 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.7848101265822784, + "acc_stderr": 0.02675082699467617, + "acc_norm": 0.7848101265822784, + "acc_norm_stderr": 0.02675082699467617 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.6860986547085202, + "acc_stderr": 0.03114679648297246, + "acc_norm": 0.6860986547085202, + "acc_norm_stderr": 0.03114679648297246 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.6793893129770993, + "acc_stderr": 0.04093329229834278, + "acc_norm": 0.6793893129770993, + "acc_norm_stderr": 0.04093329229834278 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 
0.7272727272727273, + "acc_stderr": 0.04065578140908706, + "acc_norm": 0.7272727272727273, + "acc_norm_stderr": 0.04065578140908706 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.7685185185185185, + "acc_stderr": 0.04077494709252627, + "acc_norm": 0.7685185185185185, + "acc_norm_stderr": 0.04077494709252627 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.6993865030674846, + "acc_stderr": 0.03602511318806771, + "acc_norm": 0.6993865030674846, + "acc_norm_stderr": 0.03602511318806771 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.4017857142857143, + "acc_stderr": 0.04653333146973646, + "acc_norm": 0.4017857142857143, + "acc_norm_stderr": 0.04653333146973646 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.7572815533980582, + "acc_stderr": 0.04245022486384495, + "acc_norm": 0.7572815533980582, + "acc_norm_stderr": 0.04245022486384495 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.8461538461538461, + "acc_stderr": 0.023636873317489294, + "acc_norm": 0.8461538461538461, + "acc_norm_stderr": 0.023636873317489294 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.59, + "acc_stderr": 0.049431107042371025, + "acc_norm": 0.59, + "acc_norm_stderr": 0.049431107042371025 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.7956577266922095, + "acc_stderr": 0.0144191239809319, + "acc_norm": 0.7956577266922095, + "acc_norm_stderr": 0.0144191239809319 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.661849710982659, + "acc_stderr": 0.025469770149400172, + "acc_norm": 0.661849710982659, + "acc_norm_stderr": 0.025469770149400172 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.48044692737430167, + "acc_stderr": 0.016709709877661995, + "acc_norm": 0.48044692737430167, + "acc_norm_stderr": 0.016709709877661995 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.6699346405228758, + "acc_stderr": 0.026925654653615693, + "acc_norm": 0.6699346405228758, + "acc_norm_stderr": 0.026925654653615693 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.6881028938906752, + "acc_stderr": 0.026311858071854155, + "acc_norm": 0.6881028938906752, + "acc_norm_stderr": 0.026311858071854155 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.7067901234567902, + "acc_stderr": 0.025329888171900922, + "acc_norm": 0.7067901234567902, + "acc_norm_stderr": 0.025329888171900922 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.46808510638297873, + "acc_stderr": 0.029766675075873866, + "acc_norm": 0.46808510638297873, + "acc_norm_stderr": 0.029766675075873866 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.46088657105606257, + "acc_stderr": 0.012731102790504526, + "acc_norm": 0.46088657105606257, + "acc_norm_stderr": 0.012731102790504526 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.6102941176470589, + "acc_stderr": 0.0296246635811597, + "acc_norm": 0.6102941176470589, + "acc_norm_stderr": 0.0296246635811597 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.5898692810457516, + "acc_stderr": 0.019898412717635903, + "acc_norm": 0.5898692810457516, + "acc_norm_stderr": 0.019898412717635903 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.6636363636363637, + "acc_stderr": 0.04525393596302505, + "acc_norm": 0.6636363636363637, + "acc_norm_stderr": 0.04525393596302505 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.6775510204081633, + "acc_stderr": 0.02992310056368391, + "acc_norm": 0.6775510204081633, 
+ "acc_norm_stderr": 0.02992310056368391 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.7611940298507462, + "acc_stderr": 0.03014777593540922, + "acc_norm": 0.7611940298507462, + "acc_norm_stderr": 0.03014777593540922 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.83, + "acc_stderr": 0.0377525168068637, + "acc_norm": 0.83, + "acc_norm_stderr": 0.0377525168068637 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.4879518072289157, + "acc_stderr": 0.03891364495835821, + "acc_norm": 0.4879518072289157, + "acc_norm_stderr": 0.03891364495835821 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.8011695906432749, + "acc_stderr": 0.030611116557432528, + "acc_norm": 0.8011695906432749, + "acc_norm_stderr": 0.030611116557432528 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.40514075887392903, + "mc1_stderr": 0.01718561172775337, + "mc2": 0.5594181501740189, + "mc2_stderr": 0.015699414732693026 + }, + "all": { + "acc": 0.598938175511998, + "acc_stderr": 0.03385413189247629, + "acc_norm": 0.6028583107012461, + "acc_norm_stderr": 0.03383158640553202, + "mc1": 0.40514075887392903, + "mc1_stderr": 0.01718561172775337, + "mc2": 0.5594181501740189, + "mc2_stderr": 0.015699414732693026 + } + }, + "versions": { + "harness|arc:challenge|25": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + 
"harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "all": 0 + }, + "config_tasks": { + "harness|arc:challenge": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness 
task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task" + }, + "summary_tasks": { + "harness|arc:challenge|25": { + "hashes": { + "hash_examples": "17b0cae357c0259e", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "3722289b79076c44", + "hash_cont_tokens": "e8abf848493b50f7" + }, + "truncated": 0, + "non-truncated": 4687, + "padded": 4687, + "non-padded": 0, + "effective_few_shots": 25.0, + "num_truncated_few_shots": 0 + }, + "harness|hellaswag|10": { + "hashes": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "ececd684171f1ef2", + "hash_cont_tokens": "9fe0a5c42e1532db" + }, + "truncated": 0, + "non-truncated": 40168, + "padded": 40113, + "non-padded": 55, + "effective_few_shots": 10.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hashes": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "c54ff61ad0273dd7", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-anatomy|5": { + "hashes": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "be31a1e22aef5f90", + "hash_cont_tokens": "f11971a765cb609f" + }, + "truncated": 0, + "non-truncated": 540, + "padded": 540, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-astronomy|5": { + "hashes": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "277a7b1fad566940", + "hash_cont_tokens": "440a970fadecdc7b" + }, + "truncated": 0, + "non-truncated": 608, + "padded": 608, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-business_ethics|5": { + "hashes": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "ba552605bc116de5", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hashes": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "428c7563d0b98ab9", + "hash_cont_tokens": "7ecd60c25b9bfe5b" + }, + "truncated": 0, + "non-truncated": 1060, + "padded": 1060, + "non-padded": 0, + 
"effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_biology|5": { + "hashes": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "da036601573942e2", + "hash_cont_tokens": "875cde3af7a0ee14" + }, + "truncated": 0, + "non-truncated": 576, + "padded": 576, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_chemistry|5": { + "hashes": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "94e0196d6aded13d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_computer_science|5": { + "hashes": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "6e4d0f4a8d36690b", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_mathematics|5": { + "hashes": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "614054d17109a25d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_medicine|5": { + "hashes": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "081bb2b524defd1c", + "hash_cont_tokens": "702fb6d82ff0d6ac" + }, + "truncated": 0, + "non-truncated": 692, + "padded": 692, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_physics|5": { + "hashes": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "5421d9a1af86cbd4", + "hash_cont_tokens": "f7b8097afc16a47c" + }, + "truncated": 0, + "non-truncated": 408, + "padded": 408, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-computer_security|5": { + "hashes": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "5e6b70ecb333cf18", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hashes": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "c2ef11a87264ceed", + "hash_cont_tokens": "aa0e8bc655f2f641" + }, + "truncated": 0, + "non-truncated": 940, + "padded": 940, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-econometrics|5": { + "hashes": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "ecaccd912a4c3978", + "hash_cont_tokens": "b1cc6e7e9fcd3827" + }, + "truncated": 0, + "non-truncated": 456, + "padded": 456, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hashes": { + "hash_examples": 
"e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "1590c84291399be8", + "hash_cont_tokens": "2425a3f084a591ef" + }, + "truncated": 0, + "non-truncated": 580, + "padded": 580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hashes": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "3269597f715b0da1", + "hash_cont_tokens": "bd87bf0c060fd925" + }, + "truncated": 0, + "non-truncated": 1512, + "padded": 1512, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-formal_logic|5": { + "hashes": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "a2800d20f3ab8d7c", + "hash_cont_tokens": "eb8932890e0605db" + }, + "truncated": 0, + "non-truncated": 504, + "padded": 504, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-global_facts|5": { + "hashes": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "94ed44b3772505ad", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_biology|5": { + "hashes": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "24423acb928db768", + "hash_cont_tokens": "1ddcb86d28cde266" + }, + "truncated": 0, + "non-truncated": 1240, + "padded": 1240, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hashes": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "831ff35c474e5cef", + "hash_cont_tokens": "176c8dcff38c5f8f" + }, + "truncated": 0, + "non-truncated": 812, + "padded": 812, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hashes": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "a20a96b44dcc5b30", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hashes": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "5002f4ac8b1562ca", + "hash_cont_tokens": "674fc454bdc5ac93" + }, + "truncated": 0, + "non-truncated": 660, + "padded": 656, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_geography|5": { + "hashes": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "7c5547c7da5bc793", + "hash_cont_tokens": "03a5012b916274ea" + }, + "truncated": 0, + "non-truncated": 792, + "padded": 792, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hashes": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": 
"f62991cb6a496b05", + "hash_cont_tokens": "873d2aab226ba1d8" + }, + "truncated": 0, + "non-truncated": 772, + "padded": 772, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hashes": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "4cef2aff6e3d59ed", + "hash_cont_tokens": "c583432ad27fcfe0" + }, + "truncated": 0, + "non-truncated": 1560, + "padded": 1560, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hashes": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "6e2577ea4082ed2b", + "hash_cont_tokens": "d7907b61bcb8c123" + }, + "truncated": 0, + "non-truncated": 1080, + "padded": 1080, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hashes": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "c5fc9aeb1079c8e4", + "hash_cont_tokens": "f47f041de50333b9" + }, + "truncated": 0, + "non-truncated": 952, + "padded": 952, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_physics|5": { + "hashes": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "555fc385cffa84ca", + "hash_cont_tokens": "0d56317b3e5eedb5" + }, + "truncated": 0, + "non-truncated": 604, + "padded": 604, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hashes": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "febd23cbf9973b7f", + "hash_cont_tokens": "09ba1243e7390c0f" + }, + "truncated": 0, + "non-truncated": 2180, + "padded": 2180, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hashes": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "400e55b56ee6fbd7", + "hash_cont_tokens": "9cc29889c3d3f77d" + }, + "truncated": 0, + "non-truncated": 864, + "padded": 864, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hashes": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "c639cce12a46ebad", + "hash_cont_tokens": "cdd0b3dc06d933e5" + }, + "truncated": 0, + "non-truncated": 816, + "padded": 816, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hashes": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "b9762065cce6f3a6", + "hash_cont_tokens": "e02816433ff28daf" + }, + "truncated": 0, + "non-truncated": 948, + "padded": 948, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_aging|5": { + "hashes": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "541a75f071dcf579", + "hash_cont_tokens": "142a4a8a1138a214" + }, + "truncated": 0, + "non-truncated": 
892, + "padded": 892, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_sexuality|5": { + "hashes": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "04269e5c5a257dd9", + "hash_cont_tokens": "bc54813e809b796d" + }, + "truncated": 0, + "non-truncated": 524, + "padded": 524, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-international_law|5": { + "hashes": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "d93ba9d9d38e4397", + "hash_cont_tokens": "8ea8c5ff76a15bca" + }, + "truncated": 0, + "non-truncated": 484, + "padded": 484, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-jurisprudence|5": { + "hashes": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "9eeaccd2698b4f5a", + "hash_cont_tokens": "e3a8cd951b6e3469" + }, + "truncated": 0, + "non-truncated": 432, + "padded": 432, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hashes": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "b4f08f544f2b7576", + "hash_cont_tokens": "3e9e0bdc248fd88a" + }, + "truncated": 0, + "non-truncated": 652, + "padded": 648, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-machine_learning|5": { + "hashes": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "900c2a51f1174b9f", + "hash_cont_tokens": "55b12fb138c6a064" + }, + "truncated": 0, + "non-truncated": 448, + "padded": 448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-management|5": { + "hashes": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "6b36efb4689c6eca", + "hash_cont_tokens": "a01d6d39a83c4597" + }, + "truncated": 0, + "non-truncated": 412, + "padded": 412, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-marketing|5": { + "hashes": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "2aaac78a0cfed47a", + "hash_cont_tokens": "6aeaed4d823c98aa" + }, + "truncated": 0, + "non-truncated": 936, + "padded": 936, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-medical_genetics|5": { + "hashes": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "886ca823b41c094a", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-miscellaneous|5": { + "hashes": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "72fd71de7675e7d0", + "hash_cont_tokens": "9b0ab02a64603081" + }, + "truncated": 0, + "non-truncated": 3132, + "padded": 3132, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_disputes|5": { + "hashes": { + 
"hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "f3ca0dd8e7a1eb09", + "hash_cont_tokens": "3b8bbe9108e55ce9" + }, + "truncated": 0, + "non-truncated": 1384, + "padded": 1354, + "non-padded": 30, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hashes": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "3e793631e951f23c", + "hash_cont_tokens": "3e9bfc0362e97330" + }, + "truncated": 0, + "non-truncated": 3580, + "padded": 3580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-nutrition|5": { + "hashes": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "59753c2144ea93af", + "hash_cont_tokens": "23b2dc6ee2da4cfc" + }, + "truncated": 0, + "non-truncated": 1224, + "padded": 1224, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-philosophy|5": { + "hashes": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "bd8d3dbed15a8c34", + "hash_cont_tokens": "9f6ff69d23a48783" + }, + "truncated": 0, + "non-truncated": 1244, + "padded": 1244, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-prehistory|5": { + "hashes": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "3573cd87facbb7c5", + "hash_cont_tokens": "d6458d743d875837" + }, + "truncated": 0, + "non-truncated": 1296, + "padded": 1296, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_accounting|5": { + "hashes": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "17e721bc1a7cbb47", + "hash_cont_tokens": "922a195f53a35662" + }, + "truncated": 0, + "non-truncated": 1128, + "padded": 1128, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_law|5": { + "hashes": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "c9f7583fff66d361", + "hash_cont_tokens": "2e590029ef41fbcd" + }, + "truncated": 0, + "non-truncated": 6136, + "padded": 6136, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_medicine|5": { + "hashes": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "40a933f829116f8d", + "hash_cont_tokens": "7cfee54dbddd5a98" + }, + "truncated": 0, + "non-truncated": 1088, + "padded": 1088, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_psychology|5": { + "hashes": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "0dfb73a8eb3f692c", + "hash_cont_tokens": "a86677b2a45c20e1" + }, + "truncated": 0, + "non-truncated": 2448, + "padded": 2448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-public_relations|5": { + "hashes": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "1710c6ba4c9f3cbd", + 
"hash_cont_tokens": "0d756ccaae031757" + }, + "truncated": 0, + "non-truncated": 440, + "padded": 440, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-security_studies|5": { + "hashes": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "32a03f1f22a6e103", + "hash_cont_tokens": "b2229bc2cfbf594b" + }, + "truncated": 0, + "non-truncated": 980, + "padded": 980, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-sociology|5": { + "hashes": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "828999f7624cbe7e", + "hash_cont_tokens": "c3a3bdfd177eed5b" + }, + "truncated": 0, + "non-truncated": 804, + "padded": 804, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hashes": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "42054621e718dbee", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-virology|5": { + "hashes": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "6c4f0aa4dc859c04", + "hash_cont_tokens": "af8b3658088cb37f" + }, + "truncated": 0, + "non-truncated": 664, + "padded": 664, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-world_religions|5": { + "hashes": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "6c75d44e092ff24f", + "hash_cont_tokens": "060118bef6de4e0a" + }, + "truncated": 0, + "non-truncated": 684, + "padded": 684, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|truthfulqa:mc|0": { + "hashes": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "2738d7ed7075faa7", + "hash_cont_tokens": "f5da56a132aab151" + }, + "truncated": 0, + "non-truncated": 9996, + "padded": 9996, + "non-padded": 0, + "effective_few_shots": 0.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "d84d18e9a963753d", + "hash_full_prompts": "12b540783521a8e6", + "hash_input_tokens": "5c73a7dce6ccf737", + "hash_cont_tokens": "71d56183130fecbd" + }, + "total_evaluation_time_secondes": "6369.136570453644", + "truncated": 0, + "non-truncated": 111019, + "padded": 110926, + "non-padded": 93, + "num_truncated_few_shots": 0 + } +} diff --git a/eval-results/PulsarAI/2x-LoRA-Assemble-13B/results_2023-10-28T12-51-33.520951.json b/eval-results/PulsarAI/2x-LoRA-Assemble-13B/results_2023-10-28T12-51-33.520951.json new file mode 100644 index 0000000000000000000000000000000000000000..ff9c7f9ca080dc699c16988a2118fc9fdfedcda3 --- /dev/null +++ b/eval-results/PulsarAI/2x-LoRA-Assemble-13B/results_2023-10-28T12-51-33.520951.json @@ -0,0 +1,107 @@ +{ + "config_general": { + "model_name": "PulsarAI/2x-LoRA-Assemble-13B", + "model_sha": "32df907518898cf87f8212ce5e64018d9ca66c8d", + "model_size": "24.32 GB", + "model_dtype": "torch.float16", + "lighteval_sha": "0f318ecf002208468154899217b3ba7c6ae09374", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + 
"override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|drop|3": { + "em": 0.01740771812080537, + "em_stderr": 0.0013393597649753585, + "f1": 0.12005662751677883, + "f1_stderr": 0.002244767452564408 + }, + "harness|gsm8k|5": { + "acc": 0.09249431387414708, + "acc_stderr": 0.007980396874560173 + }, + "harness|winogrande|5": { + "acc": 0.7647987371744278, + "acc_stderr": 0.011920008163650865 + }, + "all": { + "em": 0.01740771812080537, + "em_stderr": 0.0013393597649753585, + "f1": 0.12005662751677883, + "f1_stderr": 0.002244767452564408, + "acc": 0.42864652552428745, + "acc_stderr": 0.009950202519105519 + } + }, + "versions": { + "harness|drop|3": 1, + "harness|gsm8k|5": 0, + "harness|winogrande|5": 0, + "all": 0 + }, + "config_tasks": { + "harness|drop": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|drop|3": { + "hashes": { + "hash_examples": "1d27416e8324e9a3", + "hash_full_prompts": "a5513ff9a741b385", + "hash_input_tokens": "42076f0efbb50aa6", + "hash_cont_tokens": "2b2b40a621edc8d1" + }, + "truncated": 3, + "non-truncated": 9533, + "padded": 0, + "non-padded": 9536, + "effective_few_shots": 3.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "bda342e47b5099b2", + "hash_cont_tokens": "faaeab813cafde05" + }, + "truncated": 0, + "non-truncated": 1319, + "padded": 0, + "non-padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "c0bedf98cb040854", + "hash_cont_tokens": "f08975ad6f2d5864" + }, + "truncated": 0, + "non-truncated": 2534, + "padded": 2432, + "non-padded": 102, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "9b4d8993161e637d", + "hash_full_prompts": "08215e527b7e60a5", + "hash_input_tokens": "a12f3e3c934bd78b", + "hash_cont_tokens": "76f7183d55970bc8" + }, + "total_evaluation_time_secondes": "11530.844326257706", + "truncated": 3, + "non-truncated": 13386, + "padded": 2432, + "non-padded": 10957, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/PulsarAI/2x-LoRA-Assemble-Nova-13B/results_2023-10-08T14-51-09.823341.json b/eval-results/PulsarAI/2x-LoRA-Assemble-Nova-13B/results_2023-10-08T14-51-09.823341.json new file mode 100644 index 0000000000000000000000000000000000000000..1729aa48a2ddd6ab6a3481d8a1b4e23b686cf8a9 --- /dev/null +++ b/eval-results/PulsarAI/2x-LoRA-Assemble-Nova-13B/results_2023-10-08T14-51-09.823341.json @@ -0,0 +1,1367 @@ +{ + "config_general": { + "model_name": "PulsarAI/2x-LoRA-Assemble-Nova-13B", + "model_sha": "2a344b91b28ce4d0bd48b9b5a6cc87b71123eab5", + "model_size": "24.32 GB", + "model_dtype": "torch.float16", + "lighteval_sha": "0f318ecf002208468154899217b3ba7c6ae09374", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|arc:challenge|25": { + "acc": 0.5861774744027304, + "acc_stderr": 0.014392730009221009, + "acc_norm": 0.6262798634812287, + "acc_norm_stderr": 0.01413770860175909 + }, + "harness|hellaswag|10": { + "acc": 0.6335391356303525, + "acc_stderr": 0.004808526802718585, + "acc_norm": 
0.8324039036048596, + "acc_norm_stderr": 0.0037274387865133944 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.32, + "acc_stderr": 0.046882617226215034, + "acc_norm": 0.32, + "acc_norm_stderr": 0.046882617226215034 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.5111111111111111, + "acc_stderr": 0.04318275491977976, + "acc_norm": 0.5111111111111111, + "acc_norm_stderr": 0.04318275491977976 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.5789473684210527, + "acc_stderr": 0.04017901275981749, + "acc_norm": 0.5789473684210527, + "acc_norm_stderr": 0.04017901275981749 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.59, + "acc_stderr": 0.04943110704237102, + "acc_norm": 0.59, + "acc_norm_stderr": 0.04943110704237102 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.6075471698113207, + "acc_stderr": 0.030052580579557845, + "acc_norm": 0.6075471698113207, + "acc_norm_stderr": 0.030052580579557845 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.625, + "acc_stderr": 0.04048439222695598, + "acc_norm": 0.625, + "acc_norm_stderr": 0.04048439222695598 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.47, + "acc_stderr": 0.050161355804659205, + "acc_norm": 0.47, + "acc_norm_stderr": 0.050161355804659205 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.42, + "acc_stderr": 0.049604496374885836, + "acc_norm": 0.42, + "acc_norm_stderr": 0.049604496374885836 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.36, + "acc_stderr": 0.04824181513244218, + "acc_norm": 0.36, + "acc_norm_stderr": 0.04824181513244218 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.5722543352601156, + "acc_stderr": 0.03772446857518027, + "acc_norm": 0.5722543352601156, + "acc_norm_stderr": 0.03772446857518027 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.38235294117647056, + "acc_stderr": 0.04835503696107223, + "acc_norm": 0.38235294117647056, + "acc_norm_stderr": 0.04835503696107223 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.67, + "acc_stderr": 0.04725815626252609, + "acc_norm": 0.67, + "acc_norm_stderr": 0.04725815626252609 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.4765957446808511, + "acc_stderr": 0.032650194750335815, + "acc_norm": 0.4765957446808511, + "acc_norm_stderr": 0.032650194750335815 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.30701754385964913, + "acc_stderr": 0.043391383225798615, + "acc_norm": 0.30701754385964913, + "acc_norm_stderr": 0.043391383225798615 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.5103448275862069, + "acc_stderr": 0.04165774775728762, + "acc_norm": 0.5103448275862069, + "acc_norm_stderr": 0.04165774775728762 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.3333333333333333, + "acc_stderr": 0.0242785680243077, + "acc_norm": 0.3333333333333333, + "acc_norm_stderr": 0.0242785680243077 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.46825396825396826, + "acc_stderr": 0.04463112720677173, + "acc_norm": 0.46825396825396826, + "acc_norm_stderr": 0.04463112720677173 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.31, + "acc_stderr": 0.04648231987117316, + "acc_norm": 0.31, + "acc_norm_stderr": 0.04648231987117316 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.6612903225806451, + "acc_stderr": 0.02692344605930284, + "acc_norm": 0.6612903225806451, + "acc_norm_stderr": 0.02692344605930284 + }, + 
"harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.4236453201970443, + "acc_stderr": 0.03476725747649038, + "acc_norm": 0.4236453201970443, + "acc_norm_stderr": 0.03476725747649038 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.61, + "acc_stderr": 0.04902071300001975, + "acc_norm": 0.61, + "acc_norm_stderr": 0.04902071300001975 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.7333333333333333, + "acc_stderr": 0.03453131801885415, + "acc_norm": 0.7333333333333333, + "acc_norm_stderr": 0.03453131801885415 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.7727272727272727, + "acc_stderr": 0.02985751567338642, + "acc_norm": 0.7727272727272727, + "acc_norm_stderr": 0.02985751567338642 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.844559585492228, + "acc_stderr": 0.02614848346915331, + "acc_norm": 0.844559585492228, + "acc_norm_stderr": 0.02614848346915331 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.6256410256410256, + "acc_stderr": 0.024537591572830503, + "acc_norm": 0.6256410256410256, + "acc_norm_stderr": 0.024537591572830503 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.337037037037037, + "acc_stderr": 0.028820884666253252, + "acc_norm": 0.337037037037037, + "acc_norm_stderr": 0.028820884666253252 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.6680672268907563, + "acc_stderr": 0.03058869701378364, + "acc_norm": 0.6680672268907563, + "acc_norm_stderr": 0.03058869701378364 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.31788079470198677, + "acc_stderr": 0.038020397601079024, + "acc_norm": 0.31788079470198677, + "acc_norm_stderr": 0.038020397601079024 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.8036697247706422, + "acc_stderr": 0.017030719339154336, + "acc_norm": 0.8036697247706422, + "acc_norm_stderr": 0.017030719339154336 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.5648148148148148, + "acc_stderr": 0.03381200005643526, + "acc_norm": 0.5648148148148148, + "acc_norm_stderr": 0.03381200005643526 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.8235294117647058, + "acc_stderr": 0.026756401538078962, + "acc_norm": 0.8235294117647058, + "acc_norm_stderr": 0.026756401538078962 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.7848101265822784, + "acc_stderr": 0.026750826994676177, + "acc_norm": 0.7848101265822784, + "acc_norm_stderr": 0.026750826994676177 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.672645739910314, + "acc_stderr": 0.031493846709941306, + "acc_norm": 0.672645739910314, + "acc_norm_stderr": 0.031493846709941306 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.6564885496183206, + "acc_stderr": 0.041649760719448786, + "acc_norm": 0.6564885496183206, + "acc_norm_stderr": 0.041649760719448786 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.6859504132231405, + "acc_stderr": 0.042369647530410184, + "acc_norm": 0.6859504132231405, + "acc_norm_stderr": 0.042369647530410184 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.7222222222222222, + "acc_stderr": 0.04330043749650741, + "acc_norm": 0.7222222222222222, + "acc_norm_stderr": 0.04330043749650741 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.7116564417177914, + "acc_stderr": 0.03559039531617342, + "acc_norm": 0.7116564417177914, + "acc_norm_stderr": 
0.03559039531617342 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.38392857142857145, + "acc_stderr": 0.04616143075028547, + "acc_norm": 0.38392857142857145, + "acc_norm_stderr": 0.04616143075028547 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.7475728155339806, + "acc_stderr": 0.04301250399690878, + "acc_norm": 0.7475728155339806, + "acc_norm_stderr": 0.04301250399690878 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.8247863247863247, + "acc_stderr": 0.024904439098918228, + "acc_norm": 0.8247863247863247, + "acc_norm_stderr": 0.024904439098918228 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.61, + "acc_stderr": 0.04902071300001975, + "acc_norm": 0.61, + "acc_norm_stderr": 0.04902071300001975 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.7624521072796935, + "acc_stderr": 0.015218733046150191, + "acc_norm": 0.7624521072796935, + "acc_norm_stderr": 0.015218733046150191 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.6676300578034682, + "acc_stderr": 0.02536116874968824, + "acc_norm": 0.6676300578034682, + "acc_norm_stderr": 0.02536116874968824 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.39776536312849164, + "acc_stderr": 0.01636920497126298, + "acc_norm": 0.39776536312849164, + "acc_norm_stderr": 0.01636920497126298 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.6111111111111112, + "acc_stderr": 0.027914055510468008, + "acc_norm": 0.6111111111111112, + "acc_norm_stderr": 0.027914055510468008 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.6688102893890675, + "acc_stderr": 0.02673062072800491, + "acc_norm": 0.6688102893890675, + "acc_norm_stderr": 0.02673062072800491 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.6944444444444444, + "acc_stderr": 0.025630824975621344, + "acc_norm": 0.6944444444444444, + "acc_norm_stderr": 0.025630824975621344 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.46099290780141844, + "acc_stderr": 0.029736592526424434, + "acc_norm": 0.46099290780141844, + "acc_norm_stderr": 0.029736592526424434 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.44328552803129073, + "acc_stderr": 0.01268781841959992, + "acc_norm": 0.44328552803129073, + "acc_norm_stderr": 0.01268781841959992 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.6176470588235294, + "acc_stderr": 0.029520095697687765, + "acc_norm": 0.6176470588235294, + "acc_norm_stderr": 0.029520095697687765 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.5718954248366013, + "acc_stderr": 0.0200176292142131, + "acc_norm": 0.5718954248366013, + "acc_norm_stderr": 0.0200176292142131 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.6636363636363637, + "acc_stderr": 0.04525393596302505, + "acc_norm": 0.6636363636363637, + "acc_norm_stderr": 0.04525393596302505 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.6081632653061224, + "acc_stderr": 0.031251275910891656, + "acc_norm": 0.6081632653061224, + "acc_norm_stderr": 0.031251275910891656 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.7263681592039801, + "acc_stderr": 0.031524391865554016, + "acc_norm": 0.7263681592039801, + "acc_norm_stderr": 0.031524391865554016 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.79, + "acc_stderr": 0.040936018074033256, + "acc_norm": 0.79, + "acc_norm_stderr": 0.040936018074033256 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.46987951807228917, + "acc_stderr": 
0.03885425420866766, + "acc_norm": 0.46987951807228917, + "acc_norm_stderr": 0.03885425420866766 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.7719298245614035, + "acc_stderr": 0.032180937956023566, + "acc_norm": 0.7719298245614035, + "acc_norm_stderr": 0.032180937956023566 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.3574051407588739, + "mc1_stderr": 0.0167765996767294, + "mc2": 0.5188449816339029, + "mc2_stderr": 0.015388101762026653 + }, + "all": { + "acc": 0.5871990415581504, + "acc_stderr": 0.034205209769974654, + "acc_norm": 0.5912493323556589, + "acc_norm_stderr": 0.034182563847539614, + "mc1": 0.3574051407588739, + "mc1_stderr": 0.0167765996767294, + "mc2": 0.5188449816339029, + "mc2_stderr": 0.015388101762026653 + } + }, + "versions": { + "harness|arc:challenge|25": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + 
"harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "all": 0 + }, + "config_tasks": { + "harness|arc:challenge": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM 
Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task" + }, + "summary_tasks": { + "harness|arc:challenge|25": { + "hashes": { + "hash_examples": "17b0cae357c0259e", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "3722289b79076c44", + "hash_cont_tokens": "e8abf848493b50f7" + }, + "truncated": 0, + "non-truncated": 4687, + "padded": 4687, + "non-padded": 0, + "effective_few_shots": 25.0, + "num_truncated_few_shots": 0 + }, + "harness|hellaswag|10": { + "hashes": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "ececd684171f1ef2", + "hash_cont_tokens": "9fe0a5c42e1532db" + }, + "truncated": 0, + "non-truncated": 40168, + "padded": 40113, + "non-padded": 55, + "effective_few_shots": 10.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hashes": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "c54ff61ad0273dd7", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-anatomy|5": { + "hashes": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "be31a1e22aef5f90", + "hash_cont_tokens": "f11971a765cb609f" + }, + "truncated": 0, + "non-truncated": 540, + "padded": 540, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-astronomy|5": { + "hashes": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "277a7b1fad566940", + "hash_cont_tokens": "440a970fadecdc7b" + }, + "truncated": 0, + "non-truncated": 608, + "padded": 608, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-business_ethics|5": { + "hashes": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "ba552605bc116de5", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hashes": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "428c7563d0b98ab9", + "hash_cont_tokens": "7ecd60c25b9bfe5b" + }, + "truncated": 0, + "non-truncated": 1060, + "padded": 1060, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_biology|5": { + "hashes": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "da036601573942e2", + "hash_cont_tokens": "875cde3af7a0ee14" + }, + "truncated": 0, + "non-truncated": 576, + "padded": 576, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_chemistry|5": { + "hashes": 
{ + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "94e0196d6aded13d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_computer_science|5": { + "hashes": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "6e4d0f4a8d36690b", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_mathematics|5": { + "hashes": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "614054d17109a25d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_medicine|5": { + "hashes": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "081bb2b524defd1c", + "hash_cont_tokens": "702fb6d82ff0d6ac" + }, + "truncated": 0, + "non-truncated": 692, + "padded": 692, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_physics|5": { + "hashes": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "5421d9a1af86cbd4", + "hash_cont_tokens": "f7b8097afc16a47c" + }, + "truncated": 0, + "non-truncated": 408, + "padded": 408, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-computer_security|5": { + "hashes": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "5e6b70ecb333cf18", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hashes": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "c2ef11a87264ceed", + "hash_cont_tokens": "aa0e8bc655f2f641" + }, + "truncated": 0, + "non-truncated": 940, + "padded": 940, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-econometrics|5": { + "hashes": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "ecaccd912a4c3978", + "hash_cont_tokens": "b1cc6e7e9fcd3827" + }, + "truncated": 0, + "non-truncated": 456, + "padded": 456, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hashes": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "1590c84291399be8", + "hash_cont_tokens": "2425a3f084a591ef" + }, + "truncated": 0, + "non-truncated": 580, + "padded": 580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hashes": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "3269597f715b0da1", + 
"hash_cont_tokens": "bd87bf0c060fd925" + }, + "truncated": 0, + "non-truncated": 1512, + "padded": 1512, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-formal_logic|5": { + "hashes": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "a2800d20f3ab8d7c", + "hash_cont_tokens": "eb8932890e0605db" + }, + "truncated": 0, + "non-truncated": 504, + "padded": 504, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-global_facts|5": { + "hashes": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "94ed44b3772505ad", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_biology|5": { + "hashes": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "24423acb928db768", + "hash_cont_tokens": "1ddcb86d28cde266" + }, + "truncated": 0, + "non-truncated": 1240, + "padded": 1240, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hashes": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "831ff35c474e5cef", + "hash_cont_tokens": "176c8dcff38c5f8f" + }, + "truncated": 0, + "non-truncated": 812, + "padded": 812, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hashes": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "a20a96b44dcc5b30", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hashes": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "5002f4ac8b1562ca", + "hash_cont_tokens": "674fc454bdc5ac93" + }, + "truncated": 0, + "non-truncated": 660, + "padded": 656, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_geography|5": { + "hashes": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "7c5547c7da5bc793", + "hash_cont_tokens": "03a5012b916274ea" + }, + "truncated": 0, + "non-truncated": 792, + "padded": 792, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hashes": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "f62991cb6a496b05", + "hash_cont_tokens": "873d2aab226ba1d8" + }, + "truncated": 0, + "non-truncated": 772, + "padded": 772, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hashes": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "4cef2aff6e3d59ed", + "hash_cont_tokens": "c583432ad27fcfe0" + }, + "truncated": 0, + "non-truncated": 1560, + "padded": 
1560, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hashes": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "6e2577ea4082ed2b", + "hash_cont_tokens": "d7907b61bcb8c123" + }, + "truncated": 0, + "non-truncated": 1080, + "padded": 1080, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hashes": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "c5fc9aeb1079c8e4", + "hash_cont_tokens": "f47f041de50333b9" + }, + "truncated": 0, + "non-truncated": 952, + "padded": 952, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_physics|5": { + "hashes": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "555fc385cffa84ca", + "hash_cont_tokens": "0d56317b3e5eedb5" + }, + "truncated": 0, + "non-truncated": 604, + "padded": 604, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hashes": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "febd23cbf9973b7f", + "hash_cont_tokens": "09ba1243e7390c0f" + }, + "truncated": 0, + "non-truncated": 2180, + "padded": 2180, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hashes": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "400e55b56ee6fbd7", + "hash_cont_tokens": "9cc29889c3d3f77d" + }, + "truncated": 0, + "non-truncated": 864, + "padded": 864, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hashes": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "c639cce12a46ebad", + "hash_cont_tokens": "cdd0b3dc06d933e5" + }, + "truncated": 0, + "non-truncated": 816, + "padded": 816, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hashes": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "b9762065cce6f3a6", + "hash_cont_tokens": "e02816433ff28daf" + }, + "truncated": 0, + "non-truncated": 948, + "padded": 948, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_aging|5": { + "hashes": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "541a75f071dcf579", + "hash_cont_tokens": "142a4a8a1138a214" + }, + "truncated": 0, + "non-truncated": 892, + "padded": 892, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_sexuality|5": { + "hashes": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "04269e5c5a257dd9", + "hash_cont_tokens": "bc54813e809b796d" + }, + "truncated": 0, + "non-truncated": 524, + "padded": 524, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + 
"harness|hendrycksTest-international_law|5": { + "hashes": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "d93ba9d9d38e4397", + "hash_cont_tokens": "8ea8c5ff76a15bca" + }, + "truncated": 0, + "non-truncated": 484, + "padded": 484, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-jurisprudence|5": { + "hashes": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "9eeaccd2698b4f5a", + "hash_cont_tokens": "e3a8cd951b6e3469" + }, + "truncated": 0, + "non-truncated": 432, + "padded": 432, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hashes": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "b4f08f544f2b7576", + "hash_cont_tokens": "3e9e0bdc248fd88a" + }, + "truncated": 0, + "non-truncated": 652, + "padded": 648, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-machine_learning|5": { + "hashes": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "900c2a51f1174b9f", + "hash_cont_tokens": "55b12fb138c6a064" + }, + "truncated": 0, + "non-truncated": 448, + "padded": 448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-management|5": { + "hashes": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "6b36efb4689c6eca", + "hash_cont_tokens": "a01d6d39a83c4597" + }, + "truncated": 0, + "non-truncated": 412, + "padded": 412, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-marketing|5": { + "hashes": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "2aaac78a0cfed47a", + "hash_cont_tokens": "6aeaed4d823c98aa" + }, + "truncated": 0, + "non-truncated": 936, + "padded": 936, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-medical_genetics|5": { + "hashes": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "886ca823b41c094a", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-miscellaneous|5": { + "hashes": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "72fd71de7675e7d0", + "hash_cont_tokens": "9b0ab02a64603081" + }, + "truncated": 0, + "non-truncated": 3132, + "padded": 3132, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_disputes|5": { + "hashes": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "f3ca0dd8e7a1eb09", + "hash_cont_tokens": "3b8bbe9108e55ce9" + }, + "truncated": 0, + "non-truncated": 1384, + "padded": 1354, + "non-padded": 30, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hashes": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": 
"3e793631e951f23c", + "hash_cont_tokens": "3e9bfc0362e97330" + }, + "truncated": 0, + "non-truncated": 3580, + "padded": 3580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-nutrition|5": { + "hashes": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "59753c2144ea93af", + "hash_cont_tokens": "23b2dc6ee2da4cfc" + }, + "truncated": 0, + "non-truncated": 1224, + "padded": 1224, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-philosophy|5": { + "hashes": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "bd8d3dbed15a8c34", + "hash_cont_tokens": "9f6ff69d23a48783" + }, + "truncated": 0, + "non-truncated": 1244, + "padded": 1244, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-prehistory|5": { + "hashes": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "3573cd87facbb7c5", + "hash_cont_tokens": "d6458d743d875837" + }, + "truncated": 0, + "non-truncated": 1296, + "padded": 1296, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_accounting|5": { + "hashes": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "17e721bc1a7cbb47", + "hash_cont_tokens": "922a195f53a35662" + }, + "truncated": 0, + "non-truncated": 1128, + "padded": 1128, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_law|5": { + "hashes": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "c9f7583fff66d361", + "hash_cont_tokens": "2e590029ef41fbcd" + }, + "truncated": 0, + "non-truncated": 6136, + "padded": 6136, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_medicine|5": { + "hashes": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "40a933f829116f8d", + "hash_cont_tokens": "7cfee54dbddd5a98" + }, + "truncated": 0, + "non-truncated": 1088, + "padded": 1088, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_psychology|5": { + "hashes": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "0dfb73a8eb3f692c", + "hash_cont_tokens": "a86677b2a45c20e1" + }, + "truncated": 0, + "non-truncated": 2448, + "padded": 2448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-public_relations|5": { + "hashes": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "1710c6ba4c9f3cbd", + "hash_cont_tokens": "0d756ccaae031757" + }, + "truncated": 0, + "non-truncated": 440, + "padded": 440, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-security_studies|5": { + "hashes": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "32a03f1f22a6e103", + "hash_cont_tokens": "b2229bc2cfbf594b" + }, + "truncated": 0, + "non-truncated": 980, + "padded": 980, + "non-padded": 0, + 
"effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-sociology|5": { + "hashes": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "828999f7624cbe7e", + "hash_cont_tokens": "c3a3bdfd177eed5b" + }, + "truncated": 0, + "non-truncated": 804, + "padded": 804, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hashes": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "42054621e718dbee", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-virology|5": { + "hashes": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "6c4f0aa4dc859c04", + "hash_cont_tokens": "af8b3658088cb37f" + }, + "truncated": 0, + "non-truncated": 664, + "padded": 664, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-world_religions|5": { + "hashes": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "6c75d44e092ff24f", + "hash_cont_tokens": "060118bef6de4e0a" + }, + "truncated": 0, + "non-truncated": 684, + "padded": 684, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|truthfulqa:mc|0": { + "hashes": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "2738d7ed7075faa7", + "hash_cont_tokens": "f5da56a132aab151" + }, + "truncated": 0, + "non-truncated": 9996, + "padded": 9996, + "non-padded": 0, + "effective_few_shots": 0.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "d84d18e9a963753d", + "hash_full_prompts": "12b540783521a8e6", + "hash_input_tokens": "5c73a7dce6ccf737", + "hash_cont_tokens": "71d56183130fecbd" + }, + "total_evaluation_time_secondes": "6351.361641407013", + "truncated": 0, + "non-truncated": 111019, + "padded": 110926, + "non-padded": 93, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/PulsarAI/2x-LoRA-Assemble-Nova-13B/results_2023-10-26T09-15-27.308196.json b/eval-results/PulsarAI/2x-LoRA-Assemble-Nova-13B/results_2023-10-26T09-15-27.308196.json new file mode 100644 index 0000000000000000000000000000000000000000..8cbbff8436e9f0fe353fc220900b9c8d24c84a55 --- /dev/null +++ b/eval-results/PulsarAI/2x-LoRA-Assemble-Nova-13B/results_2023-10-26T09-15-27.308196.json @@ -0,0 +1,107 @@ +{ + "config_general": { + "model_name": "PulsarAI/2x-LoRA-Assemble-Nova-13B", + "model_sha": "2a344b91b28ce4d0bd48b9b5a6cc87b71123eab5", + "model_size": "24.32 GB", + "model_dtype": "torch.float16", + "lighteval_sha": "0f318ecf002208468154899217b3ba7c6ae09374", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|drop|3": { + "em": 0.005243288590604027, + "em_stderr": 0.0007396052260778, + "f1": 0.08796455536912774, + "f1_stderr": 0.0018271669211415338 + }, + "harness|gsm8k|5": { + "acc": 0.1023502653525398, + "acc_stderr": 0.008349110996208824 + }, + "harness|winogrande|5": { + "acc": 0.7695343330702447, + "acc_stderr": 0.01183587216483667 + }, + "all": { + "em": 
0.005243288590604027, + "em_stderr": 0.0007396052260778, + "f1": 0.08796455536912774, + "f1_stderr": 0.0018271669211415338, + "acc": 0.4359422992113922, + "acc_stderr": 0.010092491580522747 + } + }, + "versions": { + "harness|drop|3": 1, + "harness|gsm8k|5": 0, + "harness|winogrande|5": 0, + "all": 0 + }, + "config_tasks": { + "harness|drop": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|drop|3": { + "hashes": { + "hash_examples": "1d27416e8324e9a3", + "hash_full_prompts": "a5513ff9a741b385", + "hash_input_tokens": "42076f0efbb50aa6", + "hash_cont_tokens": "b81f8b9ea858a4f7" + }, + "truncated": 3, + "non-truncated": 9533, + "padded": 0, + "non-padded": 9536, + "effective_few_shots": 3.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "bda342e47b5099b2", + "hash_cont_tokens": "335ac0c063d69e0d" + }, + "truncated": 0, + "non-truncated": 1319, + "padded": 0, + "non-padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "c0bedf98cb040854", + "hash_cont_tokens": "f08975ad6f2d5864" + }, + "truncated": 0, + "non-truncated": 2534, + "padded": 2432, + "non-padded": 102, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "9b4d8993161e637d", + "hash_full_prompts": "08215e527b7e60a5", + "hash_input_tokens": "a12f3e3c934bd78b", + "hash_cont_tokens": "47cdf6b8d9107ceb" + }, + "total_evaluation_time_secondes": "13407.532195806503", + "truncated": 3, + "non-truncated": 13386, + "padded": 2432, + "non-padded": 10957, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/PulsarAI/2x-LoRA-Assemble-Platypus2-13B/results_2023-10-08T14-58-33.553023.json b/eval-results/PulsarAI/2x-LoRA-Assemble-Platypus2-13B/results_2023-10-08T14-58-33.553023.json new file mode 100644 index 0000000000000000000000000000000000000000..edfb6c5fc649188d7861fbfec5dc3f6dcde03444 --- /dev/null +++ b/eval-results/PulsarAI/2x-LoRA-Assemble-Platypus2-13B/results_2023-10-08T14-58-33.553023.json @@ -0,0 +1,1367 @@ +{ + "config_general": { + "model_name": "PulsarAI/2x-LoRA-Assemble-Platypus2-13B", + "model_sha": "f147bf8428c174d1dc0332da626d4b039690ceab", + "model_size": "24.32 GB", + "model_dtype": "torch.float16", + "lighteval_sha": "0f318ecf002208468154899217b3ba7c6ae09374", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|arc:challenge|25": { + "acc": 0.5742320819112628, + "acc_stderr": 0.014449464278868807, + "acc_norm": 0.60580204778157, + "acc_norm_stderr": 0.01428052266746732 + }, + "harness|hellaswag|10": { + "acc": 0.6287592113124876, + "acc_stderr": 0.004821492994082128, + "acc_norm": 0.8256323441545509, + "acc_norm_stderr": 0.00378649885676912 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.31, + "acc_stderr": 0.04648231987117316, + "acc_norm": 0.31, + "acc_norm_stderr": 0.04648231987117316 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.5111111111111111, + "acc_stderr": 0.04318275491977976, + "acc_norm": 0.5111111111111111, + "acc_norm_stderr": 0.04318275491977976 + }, + "harness|hendrycksTest-astronomy|5": { 
+ "acc": 0.6118421052631579, + "acc_stderr": 0.03965842097512744, + "acc_norm": 0.6118421052631579, + "acc_norm_stderr": 0.03965842097512744 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.58, + "acc_stderr": 0.049604496374885836, + "acc_norm": 0.58, + "acc_norm_stderr": 0.049604496374885836 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.5811320754716981, + "acc_stderr": 0.030365050829115208, + "acc_norm": 0.5811320754716981, + "acc_norm_stderr": 0.030365050829115208 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.6319444444444444, + "acc_stderr": 0.04032999053960719, + "acc_norm": 0.6319444444444444, + "acc_norm_stderr": 0.04032999053960719 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.38, + "acc_stderr": 0.04878317312145633, + "acc_norm": 0.38, + "acc_norm_stderr": 0.04878317312145633 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.49, + "acc_stderr": 0.05024183937956912, + "acc_norm": 0.49, + "acc_norm_stderr": 0.05024183937956912 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.35, + "acc_stderr": 0.047937248544110196, + "acc_norm": 0.35, + "acc_norm_stderr": 0.047937248544110196 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.5664739884393064, + "acc_stderr": 0.03778621079092055, + "acc_norm": 0.5664739884393064, + "acc_norm_stderr": 0.03778621079092055 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.29411764705882354, + "acc_stderr": 0.04533838195929777, + "acc_norm": 0.29411764705882354, + "acc_norm_stderr": 0.04533838195929777 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.67, + "acc_stderr": 0.047258156262526094, + "acc_norm": 0.67, + "acc_norm_stderr": 0.047258156262526094 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.4765957446808511, + "acc_stderr": 0.032650194750335815, + "acc_norm": 0.4765957446808511, + "acc_norm_stderr": 0.032650194750335815 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.3508771929824561, + "acc_stderr": 0.044895393502706986, + "acc_norm": 0.3508771929824561, + "acc_norm_stderr": 0.044895393502706986 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.5310344827586206, + "acc_stderr": 0.04158632762097828, + "acc_norm": 0.5310344827586206, + "acc_norm_stderr": 0.04158632762097828 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.34656084656084657, + "acc_stderr": 0.02450877752102842, + "acc_norm": 0.34656084656084657, + "acc_norm_stderr": 0.02450877752102842 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.4365079365079365, + "acc_stderr": 0.04435932892851466, + "acc_norm": 0.4365079365079365, + "acc_norm_stderr": 0.04435932892851466 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.39, + "acc_stderr": 0.04902071300001975, + "acc_norm": 0.39, + "acc_norm_stderr": 0.04902071300001975 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.6258064516129033, + "acc_stderr": 0.027528904299845697, + "acc_norm": 0.6258064516129033, + "acc_norm_stderr": 0.027528904299845697 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.39408866995073893, + "acc_stderr": 0.03438157967036545, + "acc_norm": 0.39408866995073893, + "acc_norm_stderr": 0.03438157967036545 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.59, + "acc_stderr": 0.04943110704237101, + "acc_norm": 0.59, + "acc_norm_stderr": 0.04943110704237101 + }, + "harness|hendrycksTest-high_school_european_history|5": 
{ + "acc": 0.7393939393939394, + "acc_stderr": 0.03427743175816524, + "acc_norm": 0.7393939393939394, + "acc_norm_stderr": 0.03427743175816524 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.7424242424242424, + "acc_stderr": 0.03115626951964683, + "acc_norm": 0.7424242424242424, + "acc_norm_stderr": 0.03115626951964683 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.8704663212435233, + "acc_stderr": 0.024233532297758723, + "acc_norm": 0.8704663212435233, + "acc_norm_stderr": 0.024233532297758723 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.617948717948718, + "acc_stderr": 0.02463554916390823, + "acc_norm": 0.617948717948718, + "acc_norm_stderr": 0.02463554916390823 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.3333333333333333, + "acc_stderr": 0.028742040903948496, + "acc_norm": 0.3333333333333333, + "acc_norm_stderr": 0.028742040903948496 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.5714285714285714, + "acc_stderr": 0.032145368597886394, + "acc_norm": 0.5714285714285714, + "acc_norm_stderr": 0.032145368597886394 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.31788079470198677, + "acc_stderr": 0.038020397601079024, + "acc_norm": 0.31788079470198677, + "acc_norm_stderr": 0.038020397601079024 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.781651376146789, + "acc_stderr": 0.017712600528722727, + "acc_norm": 0.781651376146789, + "acc_norm_stderr": 0.017712600528722727 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.4398148148148148, + "acc_stderr": 0.03385177976044812, + "acc_norm": 0.4398148148148148, + "acc_norm_stderr": 0.03385177976044812 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.8186274509803921, + "acc_stderr": 0.02704462171947408, + "acc_norm": 0.8186274509803921, + "acc_norm_stderr": 0.02704462171947408 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.7552742616033755, + "acc_stderr": 0.027985699387036416, + "acc_norm": 0.7552742616033755, + "acc_norm_stderr": 0.027985699387036416 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.6771300448430493, + "acc_stderr": 0.03138147637575499, + "acc_norm": 0.6771300448430493, + "acc_norm_stderr": 0.03138147637575499 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.6564885496183206, + "acc_stderr": 0.041649760719448786, + "acc_norm": 0.6564885496183206, + "acc_norm_stderr": 0.041649760719448786 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.7355371900826446, + "acc_stderr": 0.040261875275912073, + "acc_norm": 0.7355371900826446, + "acc_norm_stderr": 0.040261875275912073 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.7407407407407407, + "acc_stderr": 0.042365112580946336, + "acc_norm": 0.7407407407407407, + "acc_norm_stderr": 0.042365112580946336 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.6932515337423313, + "acc_stderr": 0.03623089915724146, + "acc_norm": 0.6932515337423313, + "acc_norm_stderr": 0.03623089915724146 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.375, + "acc_stderr": 0.04595091388086298, + "acc_norm": 0.375, + "acc_norm_stderr": 0.04595091388086298 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.7378640776699029, + "acc_stderr": 0.043546310772605956, + "acc_norm": 0.7378640776699029, + "acc_norm_stderr": 0.043546310772605956 + }, + "harness|hendrycksTest-marketing|5": { + 
"acc": 0.8333333333333334, + "acc_stderr": 0.02441494730454368, + "acc_norm": 0.8333333333333334, + "acc_norm_stderr": 0.02441494730454368 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.61, + "acc_stderr": 0.04902071300001975, + "acc_norm": 0.61, + "acc_norm_stderr": 0.04902071300001975 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.7828863346104725, + "acc_stderr": 0.014743125394823298, + "acc_norm": 0.7828863346104725, + "acc_norm_stderr": 0.014743125394823298 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.638728323699422, + "acc_stderr": 0.02586220185227789, + "acc_norm": 0.638728323699422, + "acc_norm_stderr": 0.02586220185227789 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.48156424581005586, + "acc_stderr": 0.016711130497782813, + "acc_norm": 0.48156424581005586, + "acc_norm_stderr": 0.016711130497782813 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.6372549019607843, + "acc_stderr": 0.027530078447110307, + "acc_norm": 0.6372549019607843, + "acc_norm_stderr": 0.027530078447110307 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.6591639871382636, + "acc_stderr": 0.026920841260776162, + "acc_norm": 0.6591639871382636, + "acc_norm_stderr": 0.026920841260776162 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.6944444444444444, + "acc_stderr": 0.025630824975621344, + "acc_norm": 0.6944444444444444, + "acc_norm_stderr": 0.025630824975621344 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.450354609929078, + "acc_stderr": 0.029680105565029036, + "acc_norm": 0.450354609929078, + "acc_norm_stderr": 0.029680105565029036 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.45241199478487615, + "acc_stderr": 0.012712265105889133, + "acc_norm": 0.45241199478487615, + "acc_norm_stderr": 0.012712265105889133 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.5992647058823529, + "acc_stderr": 0.029768263528933105, + "acc_norm": 0.5992647058823529, + "acc_norm_stderr": 0.029768263528933105 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.5931372549019608, + "acc_stderr": 0.019873802005061177, + "acc_norm": 0.5931372549019608, + "acc_norm_stderr": 0.019873802005061177 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.6363636363636364, + "acc_stderr": 0.04607582090719976, + "acc_norm": 0.6363636363636364, + "acc_norm_stderr": 0.04607582090719976 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.6204081632653061, + "acc_stderr": 0.031067211262872475, + "acc_norm": 0.6204081632653061, + "acc_norm_stderr": 0.031067211262872475 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.6965174129353234, + "acc_stderr": 0.03251006816458618, + "acc_norm": 0.6965174129353234, + "acc_norm_stderr": 0.03251006816458618 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.8, + "acc_stderr": 0.040201512610368466, + "acc_norm": 0.8, + "acc_norm_stderr": 0.040201512610368466 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.4819277108433735, + "acc_stderr": 0.038899512528272166, + "acc_norm": 0.4819277108433735, + "acc_norm_stderr": 0.038899512528272166 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.8128654970760234, + "acc_stderr": 0.02991312723236804, + "acc_norm": 0.8128654970760234, + "acc_norm_stderr": 0.02991312723236804 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.386780905752754, + "mc1_stderr": 0.017048857010515107, + "mc2": 0.5477106072151011, + "mc2_stderr": 0.015859389336255567 + }, 
+ "all": { + "acc": 0.5831519747078309, + "acc_stderr": 0.034192008793577415, + "acc_norm": 0.5870238916351592, + "acc_norm_stderr": 0.03417160310292123, + "mc1": 0.386780905752754, + "mc1_stderr": 0.017048857010515107, + "mc2": 0.5477106072151011, + "mc2_stderr": 0.015859389336255567 + } + }, + "versions": { + "harness|arc:challenge|25": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "all": 0 + }, + "config_tasks": { + "harness|arc:challenge": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM 
Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task" + }, + 
"summary_tasks": { + "harness|arc:challenge|25": { + "hashes": { + "hash_examples": "17b0cae357c0259e", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "3722289b79076c44", + "hash_cont_tokens": "e8abf848493b50f7" + }, + "truncated": 0, + "non-truncated": 4687, + "padded": 4687, + "non-padded": 0, + "effective_few_shots": 25.0, + "num_truncated_few_shots": 0 + }, + "harness|hellaswag|10": { + "hashes": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "ececd684171f1ef2", + "hash_cont_tokens": "9fe0a5c42e1532db" + }, + "truncated": 0, + "non-truncated": 40168, + "padded": 40113, + "non-padded": 55, + "effective_few_shots": 10.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hashes": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "c54ff61ad0273dd7", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-anatomy|5": { + "hashes": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "be31a1e22aef5f90", + "hash_cont_tokens": "f11971a765cb609f" + }, + "truncated": 0, + "non-truncated": 540, + "padded": 540, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-astronomy|5": { + "hashes": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "277a7b1fad566940", + "hash_cont_tokens": "440a970fadecdc7b" + }, + "truncated": 0, + "non-truncated": 608, + "padded": 608, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-business_ethics|5": { + "hashes": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "ba552605bc116de5", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hashes": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "428c7563d0b98ab9", + "hash_cont_tokens": "7ecd60c25b9bfe5b" + }, + "truncated": 0, + "non-truncated": 1060, + "padded": 1060, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_biology|5": { + "hashes": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "da036601573942e2", + "hash_cont_tokens": "875cde3af7a0ee14" + }, + "truncated": 0, + "non-truncated": 576, + "padded": 576, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_chemistry|5": { + "hashes": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "94e0196d6aded13d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_computer_science|5": { + "hashes": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": 
"6e4d0f4a8d36690b", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_mathematics|5": { + "hashes": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "614054d17109a25d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_medicine|5": { + "hashes": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "081bb2b524defd1c", + "hash_cont_tokens": "702fb6d82ff0d6ac" + }, + "truncated": 0, + "non-truncated": 692, + "padded": 692, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_physics|5": { + "hashes": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "5421d9a1af86cbd4", + "hash_cont_tokens": "f7b8097afc16a47c" + }, + "truncated": 0, + "non-truncated": 408, + "padded": 408, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-computer_security|5": { + "hashes": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "5e6b70ecb333cf18", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hashes": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "c2ef11a87264ceed", + "hash_cont_tokens": "aa0e8bc655f2f641" + }, + "truncated": 0, + "non-truncated": 940, + "padded": 940, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-econometrics|5": { + "hashes": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "ecaccd912a4c3978", + "hash_cont_tokens": "b1cc6e7e9fcd3827" + }, + "truncated": 0, + "non-truncated": 456, + "padded": 456, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hashes": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "1590c84291399be8", + "hash_cont_tokens": "2425a3f084a591ef" + }, + "truncated": 0, + "non-truncated": 580, + "padded": 580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hashes": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "3269597f715b0da1", + "hash_cont_tokens": "bd87bf0c060fd925" + }, + "truncated": 0, + "non-truncated": 1512, + "padded": 1512, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-formal_logic|5": { + "hashes": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "a2800d20f3ab8d7c", + "hash_cont_tokens": "eb8932890e0605db" + }, + "truncated": 0, + "non-truncated": 504, + "padded": 504, + "non-padded": 0, + 
"effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-global_facts|5": { + "hashes": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "94ed44b3772505ad", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_biology|5": { + "hashes": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "24423acb928db768", + "hash_cont_tokens": "1ddcb86d28cde266" + }, + "truncated": 0, + "non-truncated": 1240, + "padded": 1240, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hashes": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "831ff35c474e5cef", + "hash_cont_tokens": "176c8dcff38c5f8f" + }, + "truncated": 0, + "non-truncated": 812, + "padded": 812, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hashes": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "a20a96b44dcc5b30", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hashes": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "5002f4ac8b1562ca", + "hash_cont_tokens": "674fc454bdc5ac93" + }, + "truncated": 0, + "non-truncated": 660, + "padded": 656, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_geography|5": { + "hashes": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "7c5547c7da5bc793", + "hash_cont_tokens": "03a5012b916274ea" + }, + "truncated": 0, + "non-truncated": 792, + "padded": 792, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hashes": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "f62991cb6a496b05", + "hash_cont_tokens": "873d2aab226ba1d8" + }, + "truncated": 0, + "non-truncated": 772, + "padded": 772, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hashes": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "4cef2aff6e3d59ed", + "hash_cont_tokens": "c583432ad27fcfe0" + }, + "truncated": 0, + "non-truncated": 1560, + "padded": 1560, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hashes": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "6e2577ea4082ed2b", + "hash_cont_tokens": "d7907b61bcb8c123" + }, + "truncated": 0, + "non-truncated": 1080, + "padded": 1080, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + 
"harness|hendrycksTest-high_school_microeconomics|5": { + "hashes": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "c5fc9aeb1079c8e4", + "hash_cont_tokens": "f47f041de50333b9" + }, + "truncated": 0, + "non-truncated": 952, + "padded": 952, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_physics|5": { + "hashes": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "555fc385cffa84ca", + "hash_cont_tokens": "0d56317b3e5eedb5" + }, + "truncated": 0, + "non-truncated": 604, + "padded": 604, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hashes": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "febd23cbf9973b7f", + "hash_cont_tokens": "09ba1243e7390c0f" + }, + "truncated": 0, + "non-truncated": 2180, + "padded": 2180, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hashes": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "400e55b56ee6fbd7", + "hash_cont_tokens": "9cc29889c3d3f77d" + }, + "truncated": 0, + "non-truncated": 864, + "padded": 864, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hashes": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "c639cce12a46ebad", + "hash_cont_tokens": "cdd0b3dc06d933e5" + }, + "truncated": 0, + "non-truncated": 816, + "padded": 816, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hashes": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "b9762065cce6f3a6", + "hash_cont_tokens": "e02816433ff28daf" + }, + "truncated": 0, + "non-truncated": 948, + "padded": 948, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_aging|5": { + "hashes": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "541a75f071dcf579", + "hash_cont_tokens": "142a4a8a1138a214" + }, + "truncated": 0, + "non-truncated": 892, + "padded": 892, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_sexuality|5": { + "hashes": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "04269e5c5a257dd9", + "hash_cont_tokens": "bc54813e809b796d" + }, + "truncated": 0, + "non-truncated": 524, + "padded": 524, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-international_law|5": { + "hashes": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "d93ba9d9d38e4397", + "hash_cont_tokens": "8ea8c5ff76a15bca" + }, + "truncated": 0, + "non-truncated": 484, + "padded": 484, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-jurisprudence|5": { + "hashes": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": 
"0f89ee3fe03d6a21", + "hash_input_tokens": "9eeaccd2698b4f5a", + "hash_cont_tokens": "e3a8cd951b6e3469" + }, + "truncated": 0, + "non-truncated": 432, + "padded": 432, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hashes": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "b4f08f544f2b7576", + "hash_cont_tokens": "3e9e0bdc248fd88a" + }, + "truncated": 0, + "non-truncated": 652, + "padded": 648, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-machine_learning|5": { + "hashes": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "900c2a51f1174b9f", + "hash_cont_tokens": "55b12fb138c6a064" + }, + "truncated": 0, + "non-truncated": 448, + "padded": 448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-management|5": { + "hashes": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "6b36efb4689c6eca", + "hash_cont_tokens": "a01d6d39a83c4597" + }, + "truncated": 0, + "non-truncated": 412, + "padded": 412, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-marketing|5": { + "hashes": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "2aaac78a0cfed47a", + "hash_cont_tokens": "6aeaed4d823c98aa" + }, + "truncated": 0, + "non-truncated": 936, + "padded": 936, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-medical_genetics|5": { + "hashes": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "886ca823b41c094a", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-miscellaneous|5": { + "hashes": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "72fd71de7675e7d0", + "hash_cont_tokens": "9b0ab02a64603081" + }, + "truncated": 0, + "non-truncated": 3132, + "padded": 3132, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_disputes|5": { + "hashes": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "f3ca0dd8e7a1eb09", + "hash_cont_tokens": "3b8bbe9108e55ce9" + }, + "truncated": 0, + "non-truncated": 1384, + "padded": 1354, + "non-padded": 30, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hashes": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "3e793631e951f23c", + "hash_cont_tokens": "3e9bfc0362e97330" + }, + "truncated": 0, + "non-truncated": 3580, + "padded": 3580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-nutrition|5": { + "hashes": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "59753c2144ea93af", + "hash_cont_tokens": "23b2dc6ee2da4cfc" + }, + "truncated": 0, + "non-truncated": 1224, + "padded": 1224, + 
"non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-philosophy|5": { + "hashes": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "bd8d3dbed15a8c34", + "hash_cont_tokens": "9f6ff69d23a48783" + }, + "truncated": 0, + "non-truncated": 1244, + "padded": 1244, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-prehistory|5": { + "hashes": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "3573cd87facbb7c5", + "hash_cont_tokens": "d6458d743d875837" + }, + "truncated": 0, + "non-truncated": 1296, + "padded": 1296, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_accounting|5": { + "hashes": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "17e721bc1a7cbb47", + "hash_cont_tokens": "922a195f53a35662" + }, + "truncated": 0, + "non-truncated": 1128, + "padded": 1128, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_law|5": { + "hashes": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "c9f7583fff66d361", + "hash_cont_tokens": "2e590029ef41fbcd" + }, + "truncated": 0, + "non-truncated": 6136, + "padded": 6136, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_medicine|5": { + "hashes": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "40a933f829116f8d", + "hash_cont_tokens": "7cfee54dbddd5a98" + }, + "truncated": 0, + "non-truncated": 1088, + "padded": 1088, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_psychology|5": { + "hashes": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "0dfb73a8eb3f692c", + "hash_cont_tokens": "a86677b2a45c20e1" + }, + "truncated": 0, + "non-truncated": 2448, + "padded": 2448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-public_relations|5": { + "hashes": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "1710c6ba4c9f3cbd", + "hash_cont_tokens": "0d756ccaae031757" + }, + "truncated": 0, + "non-truncated": 440, + "padded": 440, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-security_studies|5": { + "hashes": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "32a03f1f22a6e103", + "hash_cont_tokens": "b2229bc2cfbf594b" + }, + "truncated": 0, + "non-truncated": 980, + "padded": 980, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-sociology|5": { + "hashes": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "828999f7624cbe7e", + "hash_cont_tokens": "c3a3bdfd177eed5b" + }, + "truncated": 0, + "non-truncated": 804, + "padded": 804, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hashes": { + 
"hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "42054621e718dbee", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-virology|5": { + "hashes": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "6c4f0aa4dc859c04", + "hash_cont_tokens": "af8b3658088cb37f" + }, + "truncated": 0, + "non-truncated": 664, + "padded": 664, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-world_religions|5": { + "hashes": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "6c75d44e092ff24f", + "hash_cont_tokens": "060118bef6de4e0a" + }, + "truncated": 0, + "non-truncated": 684, + "padded": 684, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|truthfulqa:mc|0": { + "hashes": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "2738d7ed7075faa7", + "hash_cont_tokens": "f5da56a132aab151" + }, + "truncated": 0, + "non-truncated": 9996, + "padded": 9996, + "non-padded": 0, + "effective_few_shots": 0.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "d84d18e9a963753d", + "hash_full_prompts": "12b540783521a8e6", + "hash_input_tokens": "5c73a7dce6ccf737", + "hash_cont_tokens": "71d56183130fecbd" + }, + "total_evaluation_time_secondes": "6348.060599327087", + "truncated": 0, + "non-truncated": 111019, + "padded": 110926, + "non-padded": 93, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/PulsarAI/2x-LoRA-Assemble-Platypus2-13B/results_2023-10-26T04-49-09.510505.json b/eval-results/PulsarAI/2x-LoRA-Assemble-Platypus2-13B/results_2023-10-26T04-49-09.510505.json new file mode 100644 index 0000000000000000000000000000000000000000..b437971deb2240b5b70f2e729338daab8cc4e78f --- /dev/null +++ b/eval-results/PulsarAI/2x-LoRA-Assemble-Platypus2-13B/results_2023-10-26T04-49-09.510505.json @@ -0,0 +1,107 @@ +{ + "config_general": { + "model_name": "PulsarAI/2x-LoRA-Assemble-Platypus2-13B", + "model_sha": "f147bf8428c174d1dc0332da626d4b039690ceab", + "model_size": "24.32 GB", + "model_dtype": "torch.float16", + "lighteval_sha": "0f318ecf002208468154899217b3ba7c6ae09374", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|drop|3": { + "em": 0.14020553691275167, + "em_stderr": 0.003555654511760366, + "f1": 0.25958473154362444, + "f1_stderr": 0.003697673494004961 + }, + "harness|gsm8k|5": { + "acc": 0.009097801364670205, + "acc_stderr": 0.002615326510775672 + }, + "harness|winogrande|5": { + "acc": 0.7490134175217048, + "acc_stderr": 0.012185776220516161 + }, + "all": { + "em": 0.14020553691275167, + "em_stderr": 0.003555654511760366, + "f1": 0.25958473154362444, + "f1_stderr": 0.003697673494004961, + "acc": 0.3790556094431875, + "acc_stderr": 0.007400551365645916 + } + }, + "versions": { + "harness|drop|3": 1, + "harness|gsm8k|5": 0, + "harness|winogrande|5": 0, + "all": 0 + }, + "config_tasks": { + "harness|drop": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|drop|3": { 
+ "hashes": { + "hash_examples": "1d27416e8324e9a3", + "hash_full_prompts": "a5513ff9a741b385", + "hash_input_tokens": "42076f0efbb50aa6", + "hash_cont_tokens": "aae80228bd0c48ff" + }, + "truncated": 3, + "non-truncated": 9533, + "padded": 0, + "non-padded": 9536, + "effective_few_shots": 3.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "bda342e47b5099b2", + "hash_cont_tokens": "0d7deb00f47153c2" + }, + "truncated": 0, + "non-truncated": 1319, + "padded": 0, + "non-padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "c0bedf98cb040854", + "hash_cont_tokens": "f08975ad6f2d5864" + }, + "truncated": 0, + "non-truncated": 2534, + "padded": 2432, + "non-padded": 102, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "9b4d8993161e637d", + "hash_full_prompts": "08215e527b7e60a5", + "hash_input_tokens": "a12f3e3c934bd78b", + "hash_cont_tokens": "6cfa8048410f0ddd" + }, + "total_evaluation_time_secondes": "10950.83692908287", + "truncated": 3, + "non-truncated": 13386, + "padded": 2432, + "non-padded": 10957, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/PulsarAI/Chat-AYB-Nova-13B/results_2023-10-08T14-44-32.660445.json b/eval-results/PulsarAI/Chat-AYB-Nova-13B/results_2023-10-08T14-44-32.660445.json new file mode 100644 index 0000000000000000000000000000000000000000..ca566aa9eef1390caf828da0d37eb47dc22279e2 --- /dev/null +++ b/eval-results/PulsarAI/Chat-AYB-Nova-13B/results_2023-10-08T14-44-32.660445.json @@ -0,0 +1,1367 @@ +{ + "config_general": { + "model_name": "PulsarAI/Chat-AYB-Nova-13B", + "model_sha": "942af4d59533af09cf9ba13d1e369b8e871a0a4b", + "model_size": "24.32 GB", + "model_dtype": "torch.float16", + "lighteval_sha": "0f318ecf002208468154899217b3ba7c6ae09374", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|arc:challenge|25": { + "acc": 0.5836177474402731, + "acc_stderr": 0.01440561827943617, + "acc_norm": 0.6296928327645052, + "acc_norm_stderr": 0.01411129875167495 + }, + "harness|hellaswag|10": { + "acc": 0.6440948018323043, + "acc_stderr": 0.004778081784542406, + "acc_norm": 0.842760406293567, + "acc_norm_stderr": 0.0036328254791285954 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.27, + "acc_stderr": 0.0446196043338474, + "acc_norm": 0.27, + "acc_norm_stderr": 0.0446196043338474 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.4962962962962963, + "acc_stderr": 0.04319223625811331, + "acc_norm": 0.4962962962962963, + "acc_norm_stderr": 0.04319223625811331 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.5789473684210527, + "acc_stderr": 0.04017901275981749, + "acc_norm": 0.5789473684210527, + "acc_norm_stderr": 0.04017901275981749 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.57, + "acc_stderr": 0.049756985195624284, + "acc_norm": 0.57, + "acc_norm_stderr": 0.049756985195624284 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.6150943396226415, + "acc_stderr": 0.02994649856769995, + "acc_norm": 0.6150943396226415, + "acc_norm_stderr": 0.02994649856769995 + }, + 
"harness|hendrycksTest-college_biology|5": { + "acc": 0.6527777777777778, + "acc_stderr": 0.0398124054371786, + "acc_norm": 0.6527777777777778, + "acc_norm_stderr": 0.0398124054371786 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.5, + "acc_stderr": 0.050251890762960605, + "acc_norm": 0.5, + "acc_norm_stderr": 0.050251890762960605 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.44, + "acc_stderr": 0.04988876515698589, + "acc_norm": 0.44, + "acc_norm_stderr": 0.04988876515698589 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.35, + "acc_stderr": 0.0479372485441102, + "acc_norm": 0.35, + "acc_norm_stderr": 0.0479372485441102 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.5606936416184971, + "acc_stderr": 0.037842719328874674, + "acc_norm": 0.5606936416184971, + "acc_norm_stderr": 0.037842719328874674 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.4215686274509804, + "acc_stderr": 0.04913595201274498, + "acc_norm": 0.4215686274509804, + "acc_norm_stderr": 0.04913595201274498 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.67, + "acc_stderr": 0.047258156262526094, + "acc_norm": 0.67, + "acc_norm_stderr": 0.047258156262526094 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.49361702127659574, + "acc_stderr": 0.032683358999363366, + "acc_norm": 0.49361702127659574, + "acc_norm_stderr": 0.032683358999363366 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.30701754385964913, + "acc_stderr": 0.04339138322579861, + "acc_norm": 0.30701754385964913, + "acc_norm_stderr": 0.04339138322579861 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.46206896551724136, + "acc_stderr": 0.04154659671707546, + "acc_norm": 0.46206896551724136, + "acc_norm_stderr": 0.04154659671707546 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.3333333333333333, + "acc_stderr": 0.024278568024307706, + "acc_norm": 0.3333333333333333, + "acc_norm_stderr": 0.024278568024307706 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.4523809523809524, + "acc_stderr": 0.044518079590553275, + "acc_norm": 0.4523809523809524, + "acc_norm_stderr": 0.044518079590553275 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.38, + "acc_stderr": 0.04878317312145633, + "acc_norm": 0.38, + "acc_norm_stderr": 0.04878317312145633 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.6774193548387096, + "acc_stderr": 0.026593084516572277, + "acc_norm": 0.6774193548387096, + "acc_norm_stderr": 0.026593084516572277 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.4729064039408867, + "acc_stderr": 0.03512819077876106, + "acc_norm": 0.4729064039408867, + "acc_norm_stderr": 0.03512819077876106 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.6, + "acc_stderr": 0.049236596391733084, + "acc_norm": 0.6, + "acc_norm_stderr": 0.049236596391733084 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.7151515151515152, + "acc_stderr": 0.03524390844511781, + "acc_norm": 0.7151515151515152, + "acc_norm_stderr": 0.03524390844511781 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.7525252525252525, + "acc_stderr": 0.030746300742124498, + "acc_norm": 0.7525252525252525, + "acc_norm_stderr": 0.030746300742124498 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.8290155440414507, + "acc_stderr": 0.027171213683164552, + "acc_norm": 
0.8290155440414507, + "acc_norm_stderr": 0.027171213683164552 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.6076923076923076, + "acc_stderr": 0.024756000382130952, + "acc_norm": 0.6076923076923076, + "acc_norm_stderr": 0.024756000382130952 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.3111111111111111, + "acc_stderr": 0.028226446749683515, + "acc_norm": 0.3111111111111111, + "acc_norm_stderr": 0.028226446749683515 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.6386554621848739, + "acc_stderr": 0.031204691225150013, + "acc_norm": 0.6386554621848739, + "acc_norm_stderr": 0.031204691225150013 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.33112582781456956, + "acc_stderr": 0.038425817186598696, + "acc_norm": 0.33112582781456956, + "acc_norm_stderr": 0.038425817186598696 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.7926605504587156, + "acc_stderr": 0.01738141556360868, + "acc_norm": 0.7926605504587156, + "acc_norm_stderr": 0.01738141556360868 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.5462962962962963, + "acc_stderr": 0.033953227263757976, + "acc_norm": 0.5462962962962963, + "acc_norm_stderr": 0.033953227263757976 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.803921568627451, + "acc_stderr": 0.027865942286639325, + "acc_norm": 0.803921568627451, + "acc_norm_stderr": 0.027865942286639325 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.7763713080168776, + "acc_stderr": 0.027123298205229966, + "acc_norm": 0.7763713080168776, + "acc_norm_stderr": 0.027123298205229966 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.6816143497757847, + "acc_stderr": 0.03126580522513713, + "acc_norm": 0.6816143497757847, + "acc_norm_stderr": 0.03126580522513713 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.648854961832061, + "acc_stderr": 0.04186445163013751, + "acc_norm": 0.648854961832061, + "acc_norm_stderr": 0.04186445163013751 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.6942148760330579, + "acc_stderr": 0.04205953933884122, + "acc_norm": 0.6942148760330579, + "acc_norm_stderr": 0.04205953933884122 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.7777777777777778, + "acc_stderr": 0.040191074725573483, + "acc_norm": 0.7777777777777778, + "acc_norm_stderr": 0.040191074725573483 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.7239263803680982, + "acc_stderr": 0.035123852837050475, + "acc_norm": 0.7239263803680982, + "acc_norm_stderr": 0.035123852837050475 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.33035714285714285, + "acc_stderr": 0.04464285714285712, + "acc_norm": 0.33035714285714285, + "acc_norm_stderr": 0.04464285714285712 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.7766990291262136, + "acc_stderr": 0.04123553189891431, + "acc_norm": 0.7766990291262136, + "acc_norm_stderr": 0.04123553189891431 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.8247863247863247, + "acc_stderr": 0.02490443909891823, + "acc_norm": 0.8247863247863247, + "acc_norm_stderr": 0.02490443909891823 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.6, + "acc_stderr": 0.049236596391733084, + "acc_norm": 0.6, + "acc_norm_stderr": 0.049236596391733084 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.7624521072796935, + "acc_stderr": 0.015218733046150191, + "acc_norm": 0.7624521072796935, + 
"acc_norm_stderr": 0.015218733046150191 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.6416184971098265, + "acc_stderr": 0.025816756791584187, + "acc_norm": 0.6416184971098265, + "acc_norm_stderr": 0.025816756791584187 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.38212290502793295, + "acc_stderr": 0.01625113971157077, + "acc_norm": 0.38212290502793295, + "acc_norm_stderr": 0.01625113971157077 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.6143790849673203, + "acc_stderr": 0.02787074527829028, + "acc_norm": 0.6143790849673203, + "acc_norm_stderr": 0.02787074527829028 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.6655948553054662, + "acc_stderr": 0.026795422327893937, + "acc_norm": 0.6655948553054662, + "acc_norm_stderr": 0.026795422327893937 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.6820987654320988, + "acc_stderr": 0.02591006352824089, + "acc_norm": 0.6820987654320988, + "acc_norm_stderr": 0.02591006352824089 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.475177304964539, + "acc_stderr": 0.029790719243829727, + "acc_norm": 0.475177304964539, + "acc_norm_stderr": 0.029790719243829727 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.43415906127770537, + "acc_stderr": 0.012659033237067248, + "acc_norm": 0.43415906127770537, + "acc_norm_stderr": 0.012659033237067248 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.5772058823529411, + "acc_stderr": 0.030008562845003486, + "acc_norm": 0.5772058823529411, + "acc_norm_stderr": 0.030008562845003486 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.5735294117647058, + "acc_stderr": 0.020007912739359365, + "acc_norm": 0.5735294117647058, + "acc_norm_stderr": 0.020007912739359365 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.6545454545454545, + "acc_stderr": 0.04554619617541054, + "acc_norm": 0.6545454545454545, + "acc_norm_stderr": 0.04554619617541054 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.6244897959183674, + "acc_stderr": 0.03100120903989484, + "acc_norm": 0.6244897959183674, + "acc_norm_stderr": 0.03100120903989484 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.7512437810945274, + "acc_stderr": 0.030567675938916714, + "acc_norm": 0.7512437810945274, + "acc_norm_stderr": 0.030567675938916714 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.85, + "acc_stderr": 0.035887028128263686, + "acc_norm": 0.85, + "acc_norm_stderr": 0.035887028128263686 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.46987951807228917, + "acc_stderr": 0.03885425420866766, + "acc_norm": 0.46987951807228917, + "acc_norm_stderr": 0.03885425420866766 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.7660818713450293, + "acc_stderr": 0.03246721765117826, + "acc_norm": 0.7660818713450293, + "acc_norm_stderr": 0.03246721765117826 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.3561811505507956, + "mc1_stderr": 0.016763790728446335, + "mc2": 0.5128475358249373, + "mc2_stderr": 0.015464651537836402 + }, + "all": { + "acc": 0.586765594212576, + "acc_stderr": 0.03410914044006396, + "acc_norm": 0.5909137414970759, + "acc_norm_stderr": 0.03408474084967116, + "mc1": 0.3561811505507956, + "mc1_stderr": 0.016763790728446335, + "mc2": 0.5128475358249373, + "mc2_stderr": 0.015464651537836402 + } + }, + "versions": { + "harness|arc:challenge|25": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + 
"harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "all": 0 + }, + "config_tasks": { + "harness|arc:challenge": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + 
"harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task" + }, + "summary_tasks": { + "harness|arc:challenge|25": { + "hashes": { + "hash_examples": "17b0cae357c0259e", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "3722289b79076c44", + "hash_cont_tokens": "e8abf848493b50f7" + }, + "truncated": 0, + "non-truncated": 4687, + "padded": 4687, + "non-padded": 0, + "effective_few_shots": 25.0, + "num_truncated_few_shots": 0 + }, + 
"harness|hellaswag|10": { + "hashes": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "ececd684171f1ef2", + "hash_cont_tokens": "9fe0a5c42e1532db" + }, + "truncated": 0, + "non-truncated": 40168, + "padded": 40113, + "non-padded": 55, + "effective_few_shots": 10.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hashes": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "c54ff61ad0273dd7", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-anatomy|5": { + "hashes": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "be31a1e22aef5f90", + "hash_cont_tokens": "f11971a765cb609f" + }, + "truncated": 0, + "non-truncated": 540, + "padded": 540, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-astronomy|5": { + "hashes": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "277a7b1fad566940", + "hash_cont_tokens": "440a970fadecdc7b" + }, + "truncated": 0, + "non-truncated": 608, + "padded": 608, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-business_ethics|5": { + "hashes": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "ba552605bc116de5", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hashes": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "428c7563d0b98ab9", + "hash_cont_tokens": "7ecd60c25b9bfe5b" + }, + "truncated": 0, + "non-truncated": 1060, + "padded": 1060, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_biology|5": { + "hashes": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "da036601573942e2", + "hash_cont_tokens": "875cde3af7a0ee14" + }, + "truncated": 0, + "non-truncated": 576, + "padded": 576, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_chemistry|5": { + "hashes": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "94e0196d6aded13d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_computer_science|5": { + "hashes": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "6e4d0f4a8d36690b", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_mathematics|5": { + "hashes": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": 
"614054d17109a25d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_medicine|5": { + "hashes": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "081bb2b524defd1c", + "hash_cont_tokens": "702fb6d82ff0d6ac" + }, + "truncated": 0, + "non-truncated": 692, + "padded": 692, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_physics|5": { + "hashes": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "5421d9a1af86cbd4", + "hash_cont_tokens": "f7b8097afc16a47c" + }, + "truncated": 0, + "non-truncated": 408, + "padded": 408, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-computer_security|5": { + "hashes": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "5e6b70ecb333cf18", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hashes": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "c2ef11a87264ceed", + "hash_cont_tokens": "aa0e8bc655f2f641" + }, + "truncated": 0, + "non-truncated": 940, + "padded": 940, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-econometrics|5": { + "hashes": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "ecaccd912a4c3978", + "hash_cont_tokens": "b1cc6e7e9fcd3827" + }, + "truncated": 0, + "non-truncated": 456, + "padded": 456, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hashes": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "1590c84291399be8", + "hash_cont_tokens": "2425a3f084a591ef" + }, + "truncated": 0, + "non-truncated": 580, + "padded": 580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hashes": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "3269597f715b0da1", + "hash_cont_tokens": "bd87bf0c060fd925" + }, + "truncated": 0, + "non-truncated": 1512, + "padded": 1512, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-formal_logic|5": { + "hashes": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "a2800d20f3ab8d7c", + "hash_cont_tokens": "eb8932890e0605db" + }, + "truncated": 0, + "non-truncated": 504, + "padded": 504, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-global_facts|5": { + "hashes": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "94ed44b3772505ad", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + 
"effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_biology|5": { + "hashes": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "24423acb928db768", + "hash_cont_tokens": "1ddcb86d28cde266" + }, + "truncated": 0, + "non-truncated": 1240, + "padded": 1240, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hashes": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "831ff35c474e5cef", + "hash_cont_tokens": "176c8dcff38c5f8f" + }, + "truncated": 0, + "non-truncated": 812, + "padded": 812, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hashes": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "a20a96b44dcc5b30", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hashes": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "5002f4ac8b1562ca", + "hash_cont_tokens": "674fc454bdc5ac93" + }, + "truncated": 0, + "non-truncated": 660, + "padded": 656, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_geography|5": { + "hashes": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "7c5547c7da5bc793", + "hash_cont_tokens": "03a5012b916274ea" + }, + "truncated": 0, + "non-truncated": 792, + "padded": 792, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hashes": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "f62991cb6a496b05", + "hash_cont_tokens": "873d2aab226ba1d8" + }, + "truncated": 0, + "non-truncated": 772, + "padded": 772, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hashes": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "4cef2aff6e3d59ed", + "hash_cont_tokens": "c583432ad27fcfe0" + }, + "truncated": 0, + "non-truncated": 1560, + "padded": 1560, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hashes": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "6e2577ea4082ed2b", + "hash_cont_tokens": "d7907b61bcb8c123" + }, + "truncated": 0, + "non-truncated": 1080, + "padded": 1080, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hashes": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "c5fc9aeb1079c8e4", + "hash_cont_tokens": "f47f041de50333b9" + }, + "truncated": 0, + "non-truncated": 952, + "padded": 952, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + 
"harness|hendrycksTest-high_school_physics|5": { + "hashes": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "555fc385cffa84ca", + "hash_cont_tokens": "0d56317b3e5eedb5" + }, + "truncated": 0, + "non-truncated": 604, + "padded": 604, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hashes": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "febd23cbf9973b7f", + "hash_cont_tokens": "09ba1243e7390c0f" + }, + "truncated": 0, + "non-truncated": 2180, + "padded": 2180, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hashes": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "400e55b56ee6fbd7", + "hash_cont_tokens": "9cc29889c3d3f77d" + }, + "truncated": 0, + "non-truncated": 864, + "padded": 864, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hashes": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "c639cce12a46ebad", + "hash_cont_tokens": "cdd0b3dc06d933e5" + }, + "truncated": 0, + "non-truncated": 816, + "padded": 816, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hashes": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "b9762065cce6f3a6", + "hash_cont_tokens": "e02816433ff28daf" + }, + "truncated": 0, + "non-truncated": 948, + "padded": 948, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_aging|5": { + "hashes": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "541a75f071dcf579", + "hash_cont_tokens": "142a4a8a1138a214" + }, + "truncated": 0, + "non-truncated": 892, + "padded": 892, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_sexuality|5": { + "hashes": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "04269e5c5a257dd9", + "hash_cont_tokens": "bc54813e809b796d" + }, + "truncated": 0, + "non-truncated": 524, + "padded": 524, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-international_law|5": { + "hashes": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "d93ba9d9d38e4397", + "hash_cont_tokens": "8ea8c5ff76a15bca" + }, + "truncated": 0, + "non-truncated": 484, + "padded": 484, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-jurisprudence|5": { + "hashes": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "9eeaccd2698b4f5a", + "hash_cont_tokens": "e3a8cd951b6e3469" + }, + "truncated": 0, + "non-truncated": 432, + "padded": 432, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hashes": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": 
"98a04b1f8f841069", + "hash_input_tokens": "b4f08f544f2b7576", + "hash_cont_tokens": "3e9e0bdc248fd88a" + }, + "truncated": 0, + "non-truncated": 652, + "padded": 648, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-machine_learning|5": { + "hashes": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "900c2a51f1174b9f", + "hash_cont_tokens": "55b12fb138c6a064" + }, + "truncated": 0, + "non-truncated": 448, + "padded": 448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-management|5": { + "hashes": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "6b36efb4689c6eca", + "hash_cont_tokens": "a01d6d39a83c4597" + }, + "truncated": 0, + "non-truncated": 412, + "padded": 412, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-marketing|5": { + "hashes": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "2aaac78a0cfed47a", + "hash_cont_tokens": "6aeaed4d823c98aa" + }, + "truncated": 0, + "non-truncated": 936, + "padded": 936, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-medical_genetics|5": { + "hashes": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "886ca823b41c094a", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-miscellaneous|5": { + "hashes": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "72fd71de7675e7d0", + "hash_cont_tokens": "9b0ab02a64603081" + }, + "truncated": 0, + "non-truncated": 3132, + "padded": 3132, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_disputes|5": { + "hashes": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "f3ca0dd8e7a1eb09", + "hash_cont_tokens": "3b8bbe9108e55ce9" + }, + "truncated": 0, + "non-truncated": 1384, + "padded": 1354, + "non-padded": 30, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hashes": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "3e793631e951f23c", + "hash_cont_tokens": "3e9bfc0362e97330" + }, + "truncated": 0, + "non-truncated": 3580, + "padded": 3580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-nutrition|5": { + "hashes": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "59753c2144ea93af", + "hash_cont_tokens": "23b2dc6ee2da4cfc" + }, + "truncated": 0, + "non-truncated": 1224, + "padded": 1224, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-philosophy|5": { + "hashes": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "bd8d3dbed15a8c34", + "hash_cont_tokens": "9f6ff69d23a48783" + }, + "truncated": 0, + "non-truncated": 1244, + "padded": 1244, + 
"non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-prehistory|5": { + "hashes": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "3573cd87facbb7c5", + "hash_cont_tokens": "d6458d743d875837" + }, + "truncated": 0, + "non-truncated": 1296, + "padded": 1296, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_accounting|5": { + "hashes": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "17e721bc1a7cbb47", + "hash_cont_tokens": "922a195f53a35662" + }, + "truncated": 0, + "non-truncated": 1128, + "padded": 1128, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_law|5": { + "hashes": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "c9f7583fff66d361", + "hash_cont_tokens": "2e590029ef41fbcd" + }, + "truncated": 0, + "non-truncated": 6136, + "padded": 6136, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_medicine|5": { + "hashes": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "40a933f829116f8d", + "hash_cont_tokens": "7cfee54dbddd5a98" + }, + "truncated": 0, + "non-truncated": 1088, + "padded": 1088, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_psychology|5": { + "hashes": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "0dfb73a8eb3f692c", + "hash_cont_tokens": "a86677b2a45c20e1" + }, + "truncated": 0, + "non-truncated": 2448, + "padded": 2448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-public_relations|5": { + "hashes": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "1710c6ba4c9f3cbd", + "hash_cont_tokens": "0d756ccaae031757" + }, + "truncated": 0, + "non-truncated": 440, + "padded": 440, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-security_studies|5": { + "hashes": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "32a03f1f22a6e103", + "hash_cont_tokens": "b2229bc2cfbf594b" + }, + "truncated": 0, + "non-truncated": 980, + "padded": 980, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-sociology|5": { + "hashes": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "828999f7624cbe7e", + "hash_cont_tokens": "c3a3bdfd177eed5b" + }, + "truncated": 0, + "non-truncated": 804, + "padded": 804, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hashes": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "42054621e718dbee", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-virology|5": { + "hashes": { + 
"hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "6c4f0aa4dc859c04", + "hash_cont_tokens": "af8b3658088cb37f" + }, + "truncated": 0, + "non-truncated": 664, + "padded": 664, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-world_religions|5": { + "hashes": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "6c75d44e092ff24f", + "hash_cont_tokens": "060118bef6de4e0a" + }, + "truncated": 0, + "non-truncated": 684, + "padded": 684, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|truthfulqa:mc|0": { + "hashes": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "2738d7ed7075faa7", + "hash_cont_tokens": "f5da56a132aab151" + }, + "truncated": 0, + "non-truncated": 9996, + "padded": 9996, + "non-padded": 0, + "effective_few_shots": 0.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "d84d18e9a963753d", + "hash_full_prompts": "12b540783521a8e6", + "hash_input_tokens": "5c73a7dce6ccf737", + "hash_cont_tokens": "71d56183130fecbd" + }, + "total_evaluation_time_secondes": "6688.776878595352", + "truncated": 0, + "non-truncated": 111019, + "padded": 110926, + "non-padded": 93, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/PulsarAI/Chat-AYB-Nova-13B/results_2023-10-27T20-18-17.450635.json b/eval-results/PulsarAI/Chat-AYB-Nova-13B/results_2023-10-27T20-18-17.450635.json new file mode 100644 index 0000000000000000000000000000000000000000..6e94d2244fe730409756bc72fd52284a68d7ae67 --- /dev/null +++ b/eval-results/PulsarAI/Chat-AYB-Nova-13B/results_2023-10-27T20-18-17.450635.json @@ -0,0 +1,107 @@ +{ + "config_general": { + "model_name": "PulsarAI/Chat-AYB-Nova-13B", + "model_sha": "942af4d59533af09cf9ba13d1e369b8e871a0a4b", + "model_size": "24.32 GB", + "model_dtype": "torch.float16", + "lighteval_sha": "0f318ecf002208468154899217b3ba7c6ae09374", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|drop|3": { + "em": 0.0041946308724832215, + "em_stderr": 0.0006618716168266419, + "f1": 0.0802946728187919, + "f1_stderr": 0.0016873252068220475 + }, + "harness|gsm8k|5": { + "acc": 0.12357846853677028, + "acc_stderr": 0.009065050306776921 + }, + "harness|winogrande|5": { + "acc": 0.7758484609313339, + "acc_stderr": 0.011720400740774104 + }, + "all": { + "em": 0.0041946308724832215, + "em_stderr": 0.0006618716168266419, + "f1": 0.0802946728187919, + "f1_stderr": 0.0016873252068220475, + "acc": 0.44971346473405205, + "acc_stderr": 0.010392725523775513 + } + }, + "versions": { + "harness|drop|3": 1, + "harness|gsm8k|5": 0, + "harness|winogrande|5": 0, + "all": 0 + }, + "config_tasks": { + "harness|drop": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|drop|3": { + "hashes": { + "hash_examples": "1d27416e8324e9a3", + "hash_full_prompts": "a5513ff9a741b385", + "hash_input_tokens": "42076f0efbb50aa6", + "hash_cont_tokens": "0ace44a61596c447" + }, + "truncated": 3, + "non-truncated": 9533, + "padded": 0, + "non-padded": 9536, + "effective_few_shots": 3.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + 
"hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "bda342e47b5099b2", + "hash_cont_tokens": "6cfe7d2f59ab16ae" + }, + "truncated": 0, + "non-truncated": 1319, + "padded": 0, + "non-padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "c0bedf98cb040854", + "hash_cont_tokens": "f08975ad6f2d5864" + }, + "truncated": 0, + "non-truncated": 2534, + "padded": 2432, + "non-padded": 102, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "9b4d8993161e637d", + "hash_full_prompts": "08215e527b7e60a5", + "hash_input_tokens": "a12f3e3c934bd78b", + "hash_cont_tokens": "7aa905c41221f26b" + }, + "total_evaluation_time_secondes": "12773.737463474274", + "truncated": 3, + "non-truncated": 13386, + "padded": 2432, + "non-padded": 10957, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/PulsarAI/Chat-AYB-Platypus2-13B/results_2023-10-08T14-46-05.202813.json b/eval-results/PulsarAI/Chat-AYB-Platypus2-13B/results_2023-10-08T14-46-05.202813.json new file mode 100644 index 0000000000000000000000000000000000000000..efc1e147d4ad6d84edcaca30d5b0662f6f75a7b3 --- /dev/null +++ b/eval-results/PulsarAI/Chat-AYB-Platypus2-13B/results_2023-10-08T14-46-05.202813.json @@ -0,0 +1,1367 @@ +{ + "config_general": { + "model_name": "PulsarAI/Chat-AYB-Platypus2-13B", + "model_sha": "5a54eb9d5a66df4720ec52422f5627ccd94d5fd6", + "model_size": "24.32 GB", + "model_dtype": "torch.float16", + "lighteval_sha": "0f318ecf002208468154899217b3ba7c6ae09374", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|arc:challenge|25": { + "acc": 0.5750853242320819, + "acc_stderr": 0.014445698968520765, + "acc_norm": 0.6049488054607508, + "acc_norm_stderr": 0.014285898292938163 + }, + "harness|hellaswag|10": { + "acc": 0.6450906193985262, + "acc_stderr": 0.0047750796365670966, + "acc_norm": 0.8402708623780123, + "acc_norm_stderr": 0.0036560593900501065 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.3, + "acc_stderr": 0.04605661864718381, + "acc_norm": 0.3, + "acc_norm_stderr": 0.04605661864718381 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.5037037037037037, + "acc_stderr": 0.04319223625811331, + "acc_norm": 0.5037037037037037, + "acc_norm_stderr": 0.04319223625811331 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.6118421052631579, + "acc_stderr": 0.03965842097512744, + "acc_norm": 0.6118421052631579, + "acc_norm_stderr": 0.03965842097512744 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.59, + "acc_stderr": 0.049431107042371025, + "acc_norm": 0.59, + "acc_norm_stderr": 0.049431107042371025 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.5660377358490566, + "acc_stderr": 0.03050329201334259, + "acc_norm": 0.5660377358490566, + "acc_norm_stderr": 0.03050329201334259 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.625, + "acc_stderr": 0.04048439222695598, + "acc_norm": 0.625, + "acc_norm_stderr": 0.04048439222695598 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.41, + "acc_stderr": 0.049431107042371025, + "acc_norm": 0.41, + "acc_norm_stderr": 0.049431107042371025 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.49, + 
"acc_stderr": 0.05024183937956912, + "acc_norm": 0.49, + "acc_norm_stderr": 0.05024183937956912 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.37, + "acc_stderr": 0.048523658709391, + "acc_norm": 0.37, + "acc_norm_stderr": 0.048523658709391 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.5375722543352601, + "acc_stderr": 0.0380168510452446, + "acc_norm": 0.5375722543352601, + "acc_norm_stderr": 0.0380168510452446 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.28431372549019607, + "acc_stderr": 0.04488482852329017, + "acc_norm": 0.28431372549019607, + "acc_norm_stderr": 0.04488482852329017 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.66, + "acc_stderr": 0.04760952285695237, + "acc_norm": 0.66, + "acc_norm_stderr": 0.04760952285695237 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.4808510638297872, + "acc_stderr": 0.032662042990646775, + "acc_norm": 0.4808510638297872, + "acc_norm_stderr": 0.032662042990646775 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.3333333333333333, + "acc_stderr": 0.044346007015849245, + "acc_norm": 0.3333333333333333, + "acc_norm_stderr": 0.044346007015849245 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.5379310344827586, + "acc_stderr": 0.04154659671707548, + "acc_norm": 0.5379310344827586, + "acc_norm_stderr": 0.04154659671707548 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.3201058201058201, + "acc_stderr": 0.0240268463928735, + "acc_norm": 0.3201058201058201, + "acc_norm_stderr": 0.0240268463928735 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.38095238095238093, + "acc_stderr": 0.043435254289490965, + "acc_norm": 0.38095238095238093, + "acc_norm_stderr": 0.043435254289490965 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.35, + "acc_stderr": 0.0479372485441102, + "acc_norm": 0.35, + "acc_norm_stderr": 0.0479372485441102 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.6258064516129033, + "acc_stderr": 0.027528904299845697, + "acc_norm": 0.6258064516129033, + "acc_norm_stderr": 0.027528904299845697 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.42857142857142855, + "acc_stderr": 0.034819048444388045, + "acc_norm": 0.42857142857142855, + "acc_norm_stderr": 0.034819048444388045 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.6, + "acc_stderr": 0.049236596391733084, + "acc_norm": 0.6, + "acc_norm_stderr": 0.049236596391733084 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.7393939393939394, + "acc_stderr": 0.03427743175816524, + "acc_norm": 0.7393939393939394, + "acc_norm_stderr": 0.03427743175816524 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.7373737373737373, + "acc_stderr": 0.031353050095330855, + "acc_norm": 0.7373737373737373, + "acc_norm_stderr": 0.031353050095330855 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.8652849740932642, + "acc_stderr": 0.024639789097709443, + "acc_norm": 0.8652849740932642, + "acc_norm_stderr": 0.024639789097709443 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.5897435897435898, + "acc_stderr": 0.024939313906940784, + "acc_norm": 0.5897435897435898, + "acc_norm_stderr": 0.024939313906940784 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.3333333333333333, + "acc_stderr": 0.028742040903948496, + "acc_norm": 0.3333333333333333, + "acc_norm_stderr": 
0.028742040903948496 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.5588235294117647, + "acc_stderr": 0.0322529423239964, + "acc_norm": 0.5588235294117647, + "acc_norm_stderr": 0.0322529423239964 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.33112582781456956, + "acc_stderr": 0.038425817186598696, + "acc_norm": 0.33112582781456956, + "acc_norm_stderr": 0.038425817186598696 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.7908256880733945, + "acc_stderr": 0.017437937173343233, + "acc_norm": 0.7908256880733945, + "acc_norm_stderr": 0.017437937173343233 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.4398148148148148, + "acc_stderr": 0.03385177976044812, + "acc_norm": 0.4398148148148148, + "acc_norm_stderr": 0.03385177976044812 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.7990196078431373, + "acc_stderr": 0.028125972265654366, + "acc_norm": 0.7990196078431373, + "acc_norm_stderr": 0.028125972265654366 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.759493670886076, + "acc_stderr": 0.027820781981149685, + "acc_norm": 0.759493670886076, + "acc_norm_stderr": 0.027820781981149685 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.6681614349775785, + "acc_stderr": 0.03160295143776678, + "acc_norm": 0.6681614349775785, + "acc_norm_stderr": 0.03160295143776678 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.6412213740458015, + "acc_stderr": 0.04206739313864908, + "acc_norm": 0.6412213740458015, + "acc_norm_stderr": 0.04206739313864908 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.7272727272727273, + "acc_stderr": 0.04065578140908707, + "acc_norm": 0.7272727272727273, + "acc_norm_stderr": 0.04065578140908707 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.75, + "acc_stderr": 0.04186091791394607, + "acc_norm": 0.75, + "acc_norm_stderr": 0.04186091791394607 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.6625766871165644, + "acc_stderr": 0.03714908409935574, + "acc_norm": 0.6625766871165644, + "acc_norm_stderr": 0.03714908409935574 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.3482142857142857, + "acc_stderr": 0.04521829902833585, + "acc_norm": 0.3482142857142857, + "acc_norm_stderr": 0.04521829902833585 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.7087378640776699, + "acc_stderr": 0.044986763205729224, + "acc_norm": 0.7087378640776699, + "acc_norm_stderr": 0.044986763205729224 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.8376068376068376, + "acc_stderr": 0.02416161812798774, + "acc_norm": 0.8376068376068376, + "acc_norm_stderr": 0.02416161812798774 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.63, + "acc_stderr": 0.04852365870939099, + "acc_norm": 0.63, + "acc_norm_stderr": 0.04852365870939099 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.7905491698595147, + "acc_stderr": 0.014551310568143693, + "acc_norm": 0.7905491698595147, + "acc_norm_stderr": 0.014551310568143693 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.6184971098265896, + "acc_stderr": 0.026152198619726803, + "acc_norm": 0.6184971098265896, + "acc_norm_stderr": 0.026152198619726803 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.4681564245810056, + "acc_stderr": 0.016688553415612206, + "acc_norm": 0.4681564245810056, + "acc_norm_stderr": 0.016688553415612206 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 
0.630718954248366, + "acc_stderr": 0.027634176689602653, + "acc_norm": 0.630718954248366, + "acc_norm_stderr": 0.027634176689602653 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.6559485530546624, + "acc_stderr": 0.02698147804364803, + "acc_norm": 0.6559485530546624, + "acc_norm_stderr": 0.02698147804364803 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.6790123456790124, + "acc_stderr": 0.025976566010862744, + "acc_norm": 0.6790123456790124, + "acc_norm_stderr": 0.025976566010862744 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.4716312056737589, + "acc_stderr": 0.029779450957303055, + "acc_norm": 0.4716312056737589, + "acc_norm_stderr": 0.029779450957303055 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.45436766623207303, + "acc_stderr": 0.012716941720734815, + "acc_norm": 0.45436766623207303, + "acc_norm_stderr": 0.012716941720734815 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.5882352941176471, + "acc_stderr": 0.02989616303312547, + "acc_norm": 0.5882352941176471, + "acc_norm_stderr": 0.02989616303312547 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.5735294117647058, + "acc_stderr": 0.020007912739359375, + "acc_norm": 0.5735294117647058, + "acc_norm_stderr": 0.020007912739359375 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.6454545454545455, + "acc_stderr": 0.045820048415054174, + "acc_norm": 0.6454545454545455, + "acc_norm_stderr": 0.045820048415054174 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.6530612244897959, + "acc_stderr": 0.030472526026726496, + "acc_norm": 0.6530612244897959, + "acc_norm_stderr": 0.030472526026726496 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.6716417910447762, + "acc_stderr": 0.033206858897443244, + "acc_norm": 0.6716417910447762, + "acc_norm_stderr": 0.033206858897443244 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.83, + "acc_stderr": 0.03775251680686371, + "acc_norm": 0.83, + "acc_norm_stderr": 0.03775251680686371 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.4879518072289157, + "acc_stderr": 0.03891364495835821, + "acc_norm": 0.4879518072289157, + "acc_norm_stderr": 0.03891364495835821 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.8187134502923976, + "acc_stderr": 0.029547741687640038, + "acc_norm": 0.8187134502923976, + "acc_norm_stderr": 0.029547741687640038 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.3806609547123623, + "mc1_stderr": 0.01699762787190793, + "mc2": 0.5451670851918026, + "mc2_stderr": 0.01582581744184166 + }, + "all": { + "acc": 0.5793506755644279, + "acc_stderr": 0.03418617983940258, + "acc_norm": 0.5831649759747356, + "acc_norm_stderr": 0.03416450490851953, + "mc1": 0.3806609547123623, + "mc1_stderr": 0.01699762787190793, + "mc2": 0.5451670851918026, + "mc2_stderr": 0.01582581744184166 + } + }, + "versions": { + "harness|arc:challenge|25": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + 
"harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "all": 0 + }, + "config_tasks": { + "harness|arc:challenge": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness 
task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task" + }, + "summary_tasks": { + "harness|arc:challenge|25": { + "hashes": { + "hash_examples": "17b0cae357c0259e", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "3722289b79076c44", + "hash_cont_tokens": "e8abf848493b50f7" + }, + "truncated": 0, + "non-truncated": 4687, + "padded": 4687, + "non-padded": 0, + "effective_few_shots": 25.0, + "num_truncated_few_shots": 0 + }, + "harness|hellaswag|10": { + "hashes": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "ececd684171f1ef2", + "hash_cont_tokens": "9fe0a5c42e1532db" + }, + "truncated": 0, + "non-truncated": 40168, + "padded": 40113, + "non-padded": 55, + "effective_few_shots": 10.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hashes": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + 
"hash_input_tokens": "c54ff61ad0273dd7", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-anatomy|5": { + "hashes": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "be31a1e22aef5f90", + "hash_cont_tokens": "f11971a765cb609f" + }, + "truncated": 0, + "non-truncated": 540, + "padded": 540, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-astronomy|5": { + "hashes": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "277a7b1fad566940", + "hash_cont_tokens": "440a970fadecdc7b" + }, + "truncated": 0, + "non-truncated": 608, + "padded": 608, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-business_ethics|5": { + "hashes": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "ba552605bc116de5", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hashes": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "428c7563d0b98ab9", + "hash_cont_tokens": "7ecd60c25b9bfe5b" + }, + "truncated": 0, + "non-truncated": 1060, + "padded": 1060, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_biology|5": { + "hashes": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "da036601573942e2", + "hash_cont_tokens": "875cde3af7a0ee14" + }, + "truncated": 0, + "non-truncated": 576, + "padded": 576, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_chemistry|5": { + "hashes": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "94e0196d6aded13d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_computer_science|5": { + "hashes": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "6e4d0f4a8d36690b", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_mathematics|5": { + "hashes": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "614054d17109a25d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_medicine|5": { + "hashes": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "081bb2b524defd1c", + "hash_cont_tokens": "702fb6d82ff0d6ac" + }, + "truncated": 0, + "non-truncated": 692, + "padded": 692, + "non-padded": 
0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_physics|5": { + "hashes": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "5421d9a1af86cbd4", + "hash_cont_tokens": "f7b8097afc16a47c" + }, + "truncated": 0, + "non-truncated": 408, + "padded": 408, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-computer_security|5": { + "hashes": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "5e6b70ecb333cf18", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hashes": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "c2ef11a87264ceed", + "hash_cont_tokens": "aa0e8bc655f2f641" + }, + "truncated": 0, + "non-truncated": 940, + "padded": 940, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-econometrics|5": { + "hashes": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "ecaccd912a4c3978", + "hash_cont_tokens": "b1cc6e7e9fcd3827" + }, + "truncated": 0, + "non-truncated": 456, + "padded": 456, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hashes": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "1590c84291399be8", + "hash_cont_tokens": "2425a3f084a591ef" + }, + "truncated": 0, + "non-truncated": 580, + "padded": 580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hashes": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "3269597f715b0da1", + "hash_cont_tokens": "bd87bf0c060fd925" + }, + "truncated": 0, + "non-truncated": 1512, + "padded": 1512, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-formal_logic|5": { + "hashes": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "a2800d20f3ab8d7c", + "hash_cont_tokens": "eb8932890e0605db" + }, + "truncated": 0, + "non-truncated": 504, + "padded": 504, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-global_facts|5": { + "hashes": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "94ed44b3772505ad", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_biology|5": { + "hashes": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "24423acb928db768", + "hash_cont_tokens": "1ddcb86d28cde266" + }, + "truncated": 0, + "non-truncated": 1240, + "padded": 1240, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hashes": { + 
"hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "831ff35c474e5cef", + "hash_cont_tokens": "176c8dcff38c5f8f" + }, + "truncated": 0, + "non-truncated": 812, + "padded": 812, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hashes": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "a20a96b44dcc5b30", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hashes": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "5002f4ac8b1562ca", + "hash_cont_tokens": "674fc454bdc5ac93" + }, + "truncated": 0, + "non-truncated": 660, + "padded": 656, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_geography|5": { + "hashes": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "7c5547c7da5bc793", + "hash_cont_tokens": "03a5012b916274ea" + }, + "truncated": 0, + "non-truncated": 792, + "padded": 792, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hashes": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "f62991cb6a496b05", + "hash_cont_tokens": "873d2aab226ba1d8" + }, + "truncated": 0, + "non-truncated": 772, + "padded": 772, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hashes": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "4cef2aff6e3d59ed", + "hash_cont_tokens": "c583432ad27fcfe0" + }, + "truncated": 0, + "non-truncated": 1560, + "padded": 1560, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hashes": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "6e2577ea4082ed2b", + "hash_cont_tokens": "d7907b61bcb8c123" + }, + "truncated": 0, + "non-truncated": 1080, + "padded": 1080, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hashes": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "c5fc9aeb1079c8e4", + "hash_cont_tokens": "f47f041de50333b9" + }, + "truncated": 0, + "non-truncated": 952, + "padded": 952, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_physics|5": { + "hashes": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "555fc385cffa84ca", + "hash_cont_tokens": "0d56317b3e5eedb5" + }, + "truncated": 0, + "non-truncated": 604, + "padded": 604, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hashes": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": 
"d5c76aa40b9dbc43", + "hash_input_tokens": "febd23cbf9973b7f", + "hash_cont_tokens": "09ba1243e7390c0f" + }, + "truncated": 0, + "non-truncated": 2180, + "padded": 2180, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hashes": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "400e55b56ee6fbd7", + "hash_cont_tokens": "9cc29889c3d3f77d" + }, + "truncated": 0, + "non-truncated": 864, + "padded": 864, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hashes": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "c639cce12a46ebad", + "hash_cont_tokens": "cdd0b3dc06d933e5" + }, + "truncated": 0, + "non-truncated": 816, + "padded": 816, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hashes": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "b9762065cce6f3a6", + "hash_cont_tokens": "e02816433ff28daf" + }, + "truncated": 0, + "non-truncated": 948, + "padded": 948, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_aging|5": { + "hashes": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "541a75f071dcf579", + "hash_cont_tokens": "142a4a8a1138a214" + }, + "truncated": 0, + "non-truncated": 892, + "padded": 892, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_sexuality|5": { + "hashes": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "04269e5c5a257dd9", + "hash_cont_tokens": "bc54813e809b796d" + }, + "truncated": 0, + "non-truncated": 524, + "padded": 524, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-international_law|5": { + "hashes": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "d93ba9d9d38e4397", + "hash_cont_tokens": "8ea8c5ff76a15bca" + }, + "truncated": 0, + "non-truncated": 484, + "padded": 484, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-jurisprudence|5": { + "hashes": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "9eeaccd2698b4f5a", + "hash_cont_tokens": "e3a8cd951b6e3469" + }, + "truncated": 0, + "non-truncated": 432, + "padded": 432, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hashes": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "b4f08f544f2b7576", + "hash_cont_tokens": "3e9e0bdc248fd88a" + }, + "truncated": 0, + "non-truncated": 652, + "padded": 648, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-machine_learning|5": { + "hashes": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "900c2a51f1174b9f", + "hash_cont_tokens": "55b12fb138c6a064" + }, + "truncated": 0, + "non-truncated": 
448, + "padded": 448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-management|5": { + "hashes": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "6b36efb4689c6eca", + "hash_cont_tokens": "a01d6d39a83c4597" + }, + "truncated": 0, + "non-truncated": 412, + "padded": 412, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-marketing|5": { + "hashes": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "2aaac78a0cfed47a", + "hash_cont_tokens": "6aeaed4d823c98aa" + }, + "truncated": 0, + "non-truncated": 936, + "padded": 936, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-medical_genetics|5": { + "hashes": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "886ca823b41c094a", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-miscellaneous|5": { + "hashes": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "72fd71de7675e7d0", + "hash_cont_tokens": "9b0ab02a64603081" + }, + "truncated": 0, + "non-truncated": 3132, + "padded": 3132, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_disputes|5": { + "hashes": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "f3ca0dd8e7a1eb09", + "hash_cont_tokens": "3b8bbe9108e55ce9" + }, + "truncated": 0, + "non-truncated": 1384, + "padded": 1354, + "non-padded": 30, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hashes": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "3e793631e951f23c", + "hash_cont_tokens": "3e9bfc0362e97330" + }, + "truncated": 0, + "non-truncated": 3580, + "padded": 3580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-nutrition|5": { + "hashes": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "59753c2144ea93af", + "hash_cont_tokens": "23b2dc6ee2da4cfc" + }, + "truncated": 0, + "non-truncated": 1224, + "padded": 1224, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-philosophy|5": { + "hashes": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "bd8d3dbed15a8c34", + "hash_cont_tokens": "9f6ff69d23a48783" + }, + "truncated": 0, + "non-truncated": 1244, + "padded": 1244, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-prehistory|5": { + "hashes": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "3573cd87facbb7c5", + "hash_cont_tokens": "d6458d743d875837" + }, + "truncated": 0, + "non-truncated": 1296, + "padded": 1296, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_accounting|5": { + "hashes": { + 
"hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "17e721bc1a7cbb47", + "hash_cont_tokens": "922a195f53a35662" + }, + "truncated": 0, + "non-truncated": 1128, + "padded": 1128, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_law|5": { + "hashes": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "c9f7583fff66d361", + "hash_cont_tokens": "2e590029ef41fbcd" + }, + "truncated": 0, + "non-truncated": 6136, + "padded": 6136, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_medicine|5": { + "hashes": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "40a933f829116f8d", + "hash_cont_tokens": "7cfee54dbddd5a98" + }, + "truncated": 0, + "non-truncated": 1088, + "padded": 1088, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_psychology|5": { + "hashes": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "0dfb73a8eb3f692c", + "hash_cont_tokens": "a86677b2a45c20e1" + }, + "truncated": 0, + "non-truncated": 2448, + "padded": 2448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-public_relations|5": { + "hashes": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "1710c6ba4c9f3cbd", + "hash_cont_tokens": "0d756ccaae031757" + }, + "truncated": 0, + "non-truncated": 440, + "padded": 440, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-security_studies|5": { + "hashes": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "32a03f1f22a6e103", + "hash_cont_tokens": "b2229bc2cfbf594b" + }, + "truncated": 0, + "non-truncated": 980, + "padded": 980, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-sociology|5": { + "hashes": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "828999f7624cbe7e", + "hash_cont_tokens": "c3a3bdfd177eed5b" + }, + "truncated": 0, + "non-truncated": 804, + "padded": 804, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hashes": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "42054621e718dbee", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-virology|5": { + "hashes": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "6c4f0aa4dc859c04", + "hash_cont_tokens": "af8b3658088cb37f" + }, + "truncated": 0, + "non-truncated": 664, + "padded": 664, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-world_religions|5": { + "hashes": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "6c75d44e092ff24f", + "hash_cont_tokens": 
"060118bef6de4e0a" + }, + "truncated": 0, + "non-truncated": 684, + "padded": 684, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|truthfulqa:mc|0": { + "hashes": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "2738d7ed7075faa7", + "hash_cont_tokens": "f5da56a132aab151" + }, + "truncated": 0, + "non-truncated": 9996, + "padded": 9996, + "non-padded": 0, + "effective_few_shots": 0.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "d84d18e9a963753d", + "hash_full_prompts": "12b540783521a8e6", + "hash_input_tokens": "5c73a7dce6ccf737", + "hash_cont_tokens": "71d56183130fecbd" + }, + "total_evaluation_time_secondes": "6380.6676704883575", + "truncated": 0, + "non-truncated": 111019, + "padded": 110926, + "non-padded": 93, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/PulsarAI/Chat-AYB-Platypus2-13B/results_2023-10-28T16-53-41.047162.json b/eval-results/PulsarAI/Chat-AYB-Platypus2-13B/results_2023-10-28T16-53-41.047162.json new file mode 100644 index 0000000000000000000000000000000000000000..b6305448e009cdd9480b396a0bf058230ea6041c --- /dev/null +++ b/eval-results/PulsarAI/Chat-AYB-Platypus2-13B/results_2023-10-28T16-53-41.047162.json @@ -0,0 +1,107 @@ +{ + "config_general": { + "model_name": "PulsarAI/Chat-AYB-Platypus2-13B", + "model_sha": "5a54eb9d5a66df4720ec52422f5627ccd94d5fd6", + "model_size": "24.32 GB", + "model_dtype": "torch.float16", + "lighteval_sha": "0f318ecf002208468154899217b3ba7c6ae09374", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|drop|3": { + "em": 0.2752726510067114, + "em_stderr": 0.0045741300617909856, + "f1": 0.38116505872483314, + "f1_stderr": 0.004403649120675284 + }, + "harness|gsm8k|5": { + "acc": 0.029567854435178165, + "acc_stderr": 0.004665893134220814 + }, + "harness|winogrande|5": { + "acc": 0.7576953433307024, + "acc_stderr": 0.012042352526174785 + }, + "all": { + "em": 0.2752726510067114, + "em_stderr": 0.0045741300617909856, + "f1": 0.38116505872483314, + "f1_stderr": 0.004403649120675284, + "acc": 0.3936315988829403, + "acc_stderr": 0.0083541228301978 + } + }, + "versions": { + "harness|drop|3": 1, + "harness|gsm8k|5": 0, + "harness|winogrande|5": 0, + "all": 0 + }, + "config_tasks": { + "harness|drop": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|drop|3": { + "hashes": { + "hash_examples": "1d27416e8324e9a3", + "hash_full_prompts": "a5513ff9a741b385", + "hash_input_tokens": "42076f0efbb50aa6", + "hash_cont_tokens": "7174c6f72bd44fc0" + }, + "truncated": 3, + "non-truncated": 9533, + "padded": 0, + "non-padded": 9536, + "effective_few_shots": 3.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "bda342e47b5099b2", + "hash_cont_tokens": "c0aeaffab9b9ac51" + }, + "truncated": 0, + "non-truncated": 1319, + "padded": 0, + "non-padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "c0bedf98cb040854", + "hash_cont_tokens": "f08975ad6f2d5864" + }, + "truncated": 0, + 
"non-truncated": 2534, + "padded": 2432, + "non-padded": 102, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "9b4d8993161e637d", + "hash_full_prompts": "08215e527b7e60a5", + "hash_input_tokens": "a12f3e3c934bd78b", + "hash_cont_tokens": "13cd6cb9d2263bcb" + }, + "total_evaluation_time_secondes": "9837.45160651207", + "truncated": 3, + "non-truncated": 13386, + "padded": 2432, + "non-padded": 10957, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/PulsarAI/CollectiveCognition-v1.1-Nebula-7B/results_2023-11-12T21-42-17.063541.json b/eval-results/PulsarAI/CollectiveCognition-v1.1-Nebula-7B/results_2023-11-12T21-42-17.063541.json new file mode 100644 index 0000000000000000000000000000000000000000..929f0658a1682d075a79117b3bc7793759d257ab --- /dev/null +++ b/eval-results/PulsarAI/CollectiveCognition-v1.1-Nebula-7B/results_2023-11-12T21-42-17.063541.json @@ -0,0 +1,1435 @@ +{ + "config_general": { + "lighteval_sha": "167773f1d5d1647c60dadc31c9e731ab7dbcbbad", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "", + "start_time": 252513.982240914, + "end_time": null, + "total_evaluation_time_secondes": null, + "model_name": "PulsarAI/CollectiveCognition-v1.1-Nebula-7B", + "model_sha": "c41d373a2d49b79236d6c4d0dfc4086e709c07eb", + "model_dtype": "torch.float16", + "model_size": "13.99 GB" + }, + "results": { + "harness|arc:challenge|25": { + "acc": 0.5324232081911263, + "acc_stderr": 0.014580637569995421, + "acc_norm": 0.5810580204778157, + "acc_norm_stderr": 0.014418106953639013 + }, + "harness|hellaswag|10": { + "acc": 0.6309500099581756, + "acc_stderr": 0.004815613144385404, + "acc_norm": 0.8239394542919737, + "acc_norm_stderr": 0.0038009327705977565 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.37, + "acc_stderr": 0.04852365870939099, + "acc_norm": 0.37, + "acc_norm_stderr": 0.04852365870939099 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.5555555555555556, + "acc_stderr": 0.04292596718256981, + "acc_norm": 0.5555555555555556, + "acc_norm_stderr": 0.04292596718256981 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.5986842105263158, + "acc_stderr": 0.03988903703336284, + "acc_norm": 0.5986842105263158, + "acc_norm_stderr": 0.03988903703336284 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.43, + "acc_stderr": 0.04975698519562428, + "acc_norm": 0.43, + "acc_norm_stderr": 0.04975698519562428 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.6188679245283019, + "acc_stderr": 0.029890609686286623, + "acc_norm": 0.6188679245283019, + "acc_norm_stderr": 0.029890609686286623 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.6319444444444444, + "acc_stderr": 0.040329990539607175, + "acc_norm": 0.6319444444444444, + "acc_norm_stderr": 0.040329990539607175 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.38, + "acc_stderr": 0.04878317312145632, + "acc_norm": 0.38, + "acc_norm_stderr": 0.04878317312145632 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.45, + "acc_stderr": 0.049999999999999996, + "acc_norm": 0.45, + "acc_norm_stderr": 0.049999999999999996 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.31, + "acc_stderr": 0.04648231987117316, + "acc_norm": 0.31, + "acc_norm_stderr": 0.04648231987117316 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.5433526011560693, + 
"acc_stderr": 0.03798106566014498, + "acc_norm": 0.5433526011560693, + "acc_norm_stderr": 0.03798106566014498 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.30392156862745096, + "acc_stderr": 0.04576665403207763, + "acc_norm": 0.30392156862745096, + "acc_norm_stderr": 0.04576665403207763 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.68, + "acc_stderr": 0.04688261722621505, + "acc_norm": 0.68, + "acc_norm_stderr": 0.04688261722621505 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.49361702127659574, + "acc_stderr": 0.03268335899936337, + "acc_norm": 0.49361702127659574, + "acc_norm_stderr": 0.03268335899936337 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.4473684210526316, + "acc_stderr": 0.04677473004491199, + "acc_norm": 0.4473684210526316, + "acc_norm_stderr": 0.04677473004491199 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.5448275862068965, + "acc_stderr": 0.04149886942192117, + "acc_norm": 0.5448275862068965, + "acc_norm_stderr": 0.04149886942192117 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.3915343915343915, + "acc_stderr": 0.02513809138885108, + "acc_norm": 0.3915343915343915, + "acc_norm_stderr": 0.02513809138885108 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.38095238095238093, + "acc_stderr": 0.04343525428949098, + "acc_norm": 0.38095238095238093, + "acc_norm_stderr": 0.04343525428949098 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.34, + "acc_stderr": 0.04760952285695236, + "acc_norm": 0.34, + "acc_norm_stderr": 0.04760952285695236 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.6483870967741936, + "acc_stderr": 0.027162537826948458, + "acc_norm": 0.6483870967741936, + "acc_norm_stderr": 0.027162537826948458 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.45320197044334976, + "acc_stderr": 0.03502544650845872, + "acc_norm": 0.45320197044334976, + "acc_norm_stderr": 0.03502544650845872 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.57, + "acc_stderr": 0.04975698519562428, + "acc_norm": 0.57, + "acc_norm_stderr": 0.04975698519562428 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.7333333333333333, + "acc_stderr": 0.03453131801885417, + "acc_norm": 0.7333333333333333, + "acc_norm_stderr": 0.03453131801885417 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.7676767676767676, + "acc_stderr": 0.030088629490217487, + "acc_norm": 0.7676767676767676, + "acc_norm_stderr": 0.030088629490217487 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.8238341968911918, + "acc_stderr": 0.02749350424454806, + "acc_norm": 0.8238341968911918, + "acc_norm_stderr": 0.02749350424454806 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.5615384615384615, + "acc_stderr": 0.025158266016868592, + "acc_norm": 0.5615384615384615, + "acc_norm_stderr": 0.025158266016868592 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.2740740740740741, + "acc_stderr": 0.027195934804085626, + "acc_norm": 0.2740740740740741, + "acc_norm_stderr": 0.027195934804085626 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.5588235294117647, + "acc_stderr": 0.0322529423239964, + "acc_norm": 0.5588235294117647, + "acc_norm_stderr": 0.0322529423239964 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.3443708609271523, + "acc_stderr": 0.038796870240733264, + 
"acc_norm": 0.3443708609271523, + "acc_norm_stderr": 0.038796870240733264 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.7614678899082569, + "acc_stderr": 0.018272575810231867, + "acc_norm": 0.7614678899082569, + "acc_norm_stderr": 0.018272575810231867 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.39351851851851855, + "acc_stderr": 0.03331747876370312, + "acc_norm": 0.39351851851851855, + "acc_norm_stderr": 0.03331747876370312 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.7205882352941176, + "acc_stderr": 0.03149328104507957, + "acc_norm": 0.7205882352941176, + "acc_norm_stderr": 0.03149328104507957 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.729957805907173, + "acc_stderr": 0.028900721906293426, + "acc_norm": 0.729957805907173, + "acc_norm_stderr": 0.028900721906293426 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.6681614349775785, + "acc_stderr": 0.03160295143776679, + "acc_norm": 0.6681614349775785, + "acc_norm_stderr": 0.03160295143776679 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.6564885496183206, + "acc_stderr": 0.041649760719448786, + "acc_norm": 0.6564885496183206, + "acc_norm_stderr": 0.041649760719448786 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.7603305785123967, + "acc_stderr": 0.03896878985070417, + "acc_norm": 0.7603305785123967, + "acc_norm_stderr": 0.03896878985070417 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.6944444444444444, + "acc_stderr": 0.044531975073749834, + "acc_norm": 0.6944444444444444, + "acc_norm_stderr": 0.044531975073749834 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.6871165644171779, + "acc_stderr": 0.036429145782924055, + "acc_norm": 0.6871165644171779, + "acc_norm_stderr": 0.036429145782924055 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.36607142857142855, + "acc_stderr": 0.0457237235873743, + "acc_norm": 0.36607142857142855, + "acc_norm_stderr": 0.0457237235873743 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.7378640776699029, + "acc_stderr": 0.04354631077260597, + "acc_norm": 0.7378640776699029, + "acc_norm_stderr": 0.04354631077260597 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.811965811965812, + "acc_stderr": 0.025598193686652265, + "acc_norm": 0.811965811965812, + "acc_norm_stderr": 0.025598193686652265 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.66, + "acc_stderr": 0.04760952285695237, + "acc_norm": 0.66, + "acc_norm_stderr": 0.04760952285695237 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.7713920817369093, + "acc_stderr": 0.015016884698539892, + "acc_norm": 0.7713920817369093, + "acc_norm_stderr": 0.015016884698539892 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.6184971098265896, + "acc_stderr": 0.0261521986197268, + "acc_norm": 0.6184971098265896, + "acc_norm_stderr": 0.0261521986197268 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.22793296089385476, + "acc_stderr": 0.014030149950805098, + "acc_norm": 0.22793296089385476, + "acc_norm_stderr": 0.014030149950805098 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.6405228758169934, + "acc_stderr": 0.027475969910660952, + "acc_norm": 0.6405228758169934, + "acc_norm_stderr": 0.027475969910660952 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.6334405144694534, + "acc_stderr": 0.027368078243971646, + "acc_norm": 0.6334405144694534, + "acc_norm_stderr": 0.027368078243971646 + }, + 
"harness|hendrycksTest-prehistory|5": { + "acc": 0.6820987654320988, + "acc_stderr": 0.02591006352824088, + "acc_norm": 0.6820987654320988, + "acc_norm_stderr": 0.02591006352824088 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.42907801418439717, + "acc_stderr": 0.02952591430255856, + "acc_norm": 0.42907801418439717, + "acc_norm_stderr": 0.02952591430255856 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.4315514993481095, + "acc_stderr": 0.012650007999463888, + "acc_norm": 0.4315514993481095, + "acc_norm_stderr": 0.012650007999463888 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.5257352941176471, + "acc_stderr": 0.030332578094555033, + "acc_norm": 0.5257352941176471, + "acc_norm_stderr": 0.030332578094555033 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.6078431372549019, + "acc_stderr": 0.019751726508762637, + "acc_norm": 0.6078431372549019, + "acc_norm_stderr": 0.019751726508762637 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.6545454545454545, + "acc_stderr": 0.04554619617541054, + "acc_norm": 0.6545454545454545, + "acc_norm_stderr": 0.04554619617541054 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.5755102040816327, + "acc_stderr": 0.031642094879429414, + "acc_norm": 0.5755102040816327, + "acc_norm_stderr": 0.031642094879429414 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.7512437810945274, + "acc_stderr": 0.030567675938916718, + "acc_norm": 0.7512437810945274, + "acc_norm_stderr": 0.030567675938916718 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.8, + "acc_stderr": 0.04020151261036845, + "acc_norm": 0.8, + "acc_norm_stderr": 0.04020151261036845 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.5180722891566265, + "acc_stderr": 0.038899512528272166, + "acc_norm": 0.5180722891566265, + "acc_norm_stderr": 0.038899512528272166 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.783625730994152, + "acc_stderr": 0.03158149539338734, + "acc_norm": 0.783625730994152, + "acc_norm_stderr": 0.03158149539338734 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.38555691554467564, + "mc1_stderr": 0.01703883901059167, + "mc2": 0.5353024010333743, + "mc2_stderr": 0.015743888224866397 + }, + "harness|winogrande|5": { + "acc": 0.7371744277821626, + "acc_stderr": 0.012370922527262008 + }, + "harness|drop|3": { + "em": 0.35675335570469796, + "em_stderr": 0.004905829488253491, + "f1": 0.4216977768456382, + "f1_stderr": 0.0047367493845716785 + }, + "harness|gsm8k|5": { + "acc": 0.09552691432903715, + "acc_stderr": 0.008096605771155759 + }, + "all": { + "acc": 0.5655902624582015, + "acc_stderr": 0.033540567370804734, + "acc_norm": 0.5747445580416879, + "acc_norm_stderr": 0.03431067576831402, + "mc1": 0.38555691554467564, + "mc1_stderr": 0.01703883901059167, + "mc2": 0.5353024010333743, + "mc2_stderr": 0.015743888224866397, + "em": 0.35675335570469796, + "em_stderr": 0.004905829488253491, + "f1": 0.4216977768456382, + "f1_stderr": 0.0047367493845716785 + } + }, + "versions": { + "all": 0, + "harness|arc:challenge|25": 0, + "harness|drop|3": 1, + "harness|gsm8k|5": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + 
"harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "harness|winogrande|5": 0 + }, + "config_tasks": { + "harness|arc:challenge": "LM Harness task", + "harness|drop": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + 
"harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|arc:challenge|25": { + "hashes": { + "hash_examples": "17b0cae357c0259e", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "9bcd0d1d37471713", + "hash_cont_tokens": "289aa98c400841d8" + }, + "truncated": 0, + "non_truncated": 1172, + "padded": 4670, + "non_padded": 17, + "effective_few_shots": 25.0, + "num_truncated_few_shots": 0 + }, + "harness|hellaswag|10": { + "hashes": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "80b8c6d79740318e", + 
"hash_cont_tokens": "ac460260c3e6efc9" + }, + "truncated": 0, + "non_truncated": 10042, + "padded": 40101, + "non_padded": 67, + "effective_few_shots": 10.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hashes": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "b813d36287c6556c", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-anatomy|5": { + "hashes": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "09dc2380497f7a47", + "hash_cont_tokens": "a52a4f60d98cbe5c" + }, + "truncated": 0, + "non_truncated": 135, + "padded": 540, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-astronomy|5": { + "hashes": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "68ca3220b0fdd1f3", + "hash_cont_tokens": "10f7d8eeba97841d" + }, + "truncated": 0, + "non_truncated": 152, + "padded": 608, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-business_ethics|5": { + "hashes": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "bd14ef1320de241e", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hashes": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "d96186ab98017c43", + "hash_cont_tokens": "edef9975ba9165b5" + }, + "truncated": 0, + "non_truncated": 265, + "padded": 1060, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_biology|5": { + "hashes": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "424136b34e95b200", + "hash_cont_tokens": "0aa103ec6602280b" + }, + "truncated": 0, + "non_truncated": 144, + "padded": 576, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_chemistry|5": { + "hashes": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "8dd8b80e336bbe54", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_computer_science|5": { + "hashes": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "145d4cef8ca2261d", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_mathematics|5": { + "hashes": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "561995d32d2b25c4", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + 
"num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_medicine|5": { + "hashes": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "6a258a9d4418599c", + "hash_cont_tokens": "1979021dbc698754" + }, + "truncated": 0, + "non_truncated": 173, + "padded": 692, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_physics|5": { + "hashes": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "fa5e0d5b5f97b66a", + "hash_cont_tokens": "7cf7fe2bab00acbd" + }, + "truncated": 0, + "non_truncated": 102, + "padded": 408, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-computer_security|5": { + "hashes": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "07d27397edfae492", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hashes": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "da5e6c3c8eb17da6", + "hash_cont_tokens": "903f64eed2b0d217" + }, + "truncated": 0, + "non_truncated": 235, + "padded": 940, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-econometrics|5": { + "hashes": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "f6ba8e358bdb523e", + "hash_cont_tokens": "721ae6c5302c4bf2" + }, + "truncated": 0, + "non_truncated": 114, + "padded": 456, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hashes": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "b2459da4c5ca8590", + "hash_cont_tokens": "15a738960ed3e587" + }, + "truncated": 0, + "non_truncated": 145, + "padded": 575, + "non_padded": 5, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hashes": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "0b969d9ad706a13a", + "hash_cont_tokens": "c96470462fc71683" + }, + "truncated": 0, + "non_truncated": 378, + "padded": 1512, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-formal_logic|5": { + "hashes": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "02bc3eb5f90da86e", + "hash_cont_tokens": "0e1ce025c9d6ee7e" + }, + "truncated": 0, + "non_truncated": 126, + "padded": 504, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-global_facts|5": { + "hashes": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "3d5106918bcbeb43", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_biology|5": { + "hashes": { + "hash_examples": "a79e1018b1674052", + 
"hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "7b089392db2dabbd", + "hash_cont_tokens": "e34d57f7d3c4ca16" + }, + "truncated": 0, + "non_truncated": 310, + "padded": 1240, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hashes": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "ba90b2ffed1c067d", + "hash_cont_tokens": "e8482d44df4b3740" + }, + "truncated": 0, + "non_truncated": 203, + "padded": 812, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hashes": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "60eeec309ef0717f", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hashes": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "5e5e8bf3808e0ead", + "hash_cont_tokens": "d63e679a49418339" + }, + "truncated": 0, + "non_truncated": 165, + "padded": 656, + "non_padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_geography|5": { + "hashes": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "4da9b741d4e7ea78", + "hash_cont_tokens": "d78483e286d06f1a" + }, + "truncated": 0, + "non_truncated": 198, + "padded": 792, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hashes": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "acb4bc872ac86ed7", + "hash_cont_tokens": "691cdff71ff5fe57" + }, + "truncated": 0, + "non_truncated": 193, + "padded": 772, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hashes": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "840fc6403eb69ab0", + "hash_cont_tokens": "d5ad4c5bdca967ad" + }, + "truncated": 0, + "non_truncated": 390, + "padded": 1560, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hashes": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "3629a7f2cd17faeb", + "hash_cont_tokens": "8f631ca5687dd0d4" + }, + "truncated": 0, + "non_truncated": 270, + "padded": 1080, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hashes": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "6846f684260e3997", + "hash_cont_tokens": "7321048a28451473" + }, + "truncated": 0, + "non_truncated": 238, + "padded": 952, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_physics|5": { + "hashes": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": 
"85aee25d6bdad94a", + "hash_cont_tokens": "bb137581f269861c" + }, + "truncated": 0, + "non_truncated": 151, + "padded": 604, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hashes": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "290b66d6d666a35f", + "hash_cont_tokens": "b455cab2675bd863" + }, + "truncated": 0, + "non_truncated": 545, + "padded": 2180, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hashes": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "a77a7668b437bc82", + "hash_cont_tokens": "1b3196fec7e58037" + }, + "truncated": 0, + "non_truncated": 216, + "padded": 864, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hashes": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "63548c7fa9ba7a78", + "hash_cont_tokens": "a331dedc2aa01b3e" + }, + "truncated": 0, + "non_truncated": 204, + "padded": 816, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hashes": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "83c5da18bfa50812", + "hash_cont_tokens": "d0fbe030b8c8c2bf" + }, + "truncated": 0, + "non_truncated": 237, + "padded": 948, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_aging|5": { + "hashes": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "bebbd11f22006685", + "hash_cont_tokens": "1dd29c3755494850" + }, + "truncated": 0, + "non_truncated": 223, + "padded": 892, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_sexuality|5": { + "hashes": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "7b85ee9b8ee54f4f", + "hash_cont_tokens": "c85573f663c10691" + }, + "truncated": 0, + "non_truncated": 131, + "padded": 524, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-international_law|5": { + "hashes": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "7bfc55ab7065943e", + "hash_cont_tokens": "d263804ba918154f" + }, + "truncated": 0, + "non_truncated": 121, + "padded": 484, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-jurisprudence|5": { + "hashes": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "69573f1675e053c6", + "hash_cont_tokens": "581986691a84ece8" + }, + "truncated": 0, + "non_truncated": 108, + "padded": 432, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hashes": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "552324ef20094bdc", + "hash_cont_tokens": "55a858b28bbda458" + }, + "truncated": 0, + "non_truncated": 163, + "padded": 652, + "non_padded": 
0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-machine_learning|5": { + "hashes": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "96449357a7318905", + "hash_cont_tokens": "e99d3d3efd4ac7a3" + }, + "truncated": 0, + "non_truncated": 112, + "padded": 448, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-management|5": { + "hashes": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "3b849249168e3b88", + "hash_cont_tokens": "13d9dc56bca34726" + }, + "truncated": 0, + "non_truncated": 103, + "padded": 412, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-marketing|5": { + "hashes": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "af0e186f2756b70d", + "hash_cont_tokens": "2700ea26933916a2" + }, + "truncated": 0, + "non_truncated": 234, + "padded": 936, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-medical_genetics|5": { + "hashes": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "9f6a6de16509b6d9", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-miscellaneous|5": { + "hashes": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "9194406d589f7c10", + "hash_cont_tokens": "7bf4341c79587250" + }, + "truncated": 0, + "non_truncated": 783, + "padded": 3132, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_disputes|5": { + "hashes": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "769486efc74d9f8e", + "hash_cont_tokens": "38a48e9de6976f00" + }, + "truncated": 0, + "non_truncated": 346, + "padded": 1384, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hashes": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "a90fd4dd90959dad", + "hash_cont_tokens": "761c4dc187689d89" + }, + "truncated": 0, + "non_truncated": 895, + "padded": 3580, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-nutrition|5": { + "hashes": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "1a3b843e66efd29b", + "hash_cont_tokens": "65005bd7d6f6012a" + }, + "truncated": 0, + "non_truncated": 306, + "padded": 1224, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-philosophy|5": { + "hashes": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "09820001a3d00013", + "hash_cont_tokens": "0b47934fb6314dec" + }, + "truncated": 0, + "non_truncated": 311, + "padded": 1244, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-prehistory|5": { + "hashes": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": 
"152187949bcd0921", + "hash_input_tokens": "7c4ec364ce2768c7", + "hash_cont_tokens": "3f20acd855ee0a29" + }, + "truncated": 0, + "non_truncated": 324, + "padded": 1296, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_accounting|5": { + "hashes": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "ced0534574d0ae3f", + "hash_cont_tokens": "8f122ba881355d4b" + }, + "truncated": 0, + "non_truncated": 282, + "padded": 1128, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_law|5": { + "hashes": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "bcbdbbde22ec73e3", + "hash_cont_tokens": "90d5df417c4d3fd3" + }, + "truncated": 0, + "non_truncated": 1534, + "padded": 6136, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_medicine|5": { + "hashes": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "c54d753563114d45", + "hash_cont_tokens": "4a2d2988884f7f70" + }, + "truncated": 0, + "non_truncated": 272, + "padded": 1088, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_psychology|5": { + "hashes": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "b75dc55c0e32fa52", + "hash_cont_tokens": "e0a952cb8a9c81de" + }, + "truncated": 0, + "non_truncated": 612, + "padded": 2448, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-public_relations|5": { + "hashes": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "5ccdc8ec8db99622", + "hash_cont_tokens": "1fa77a8dff3922b8" + }, + "truncated": 0, + "non_truncated": 110, + "padded": 440, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-security_studies|5": { + "hashes": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "ca8497342e5b1d57", + "hash_cont_tokens": "81fc9cb3cbdd52db" + }, + "truncated": 0, + "non_truncated": 245, + "padded": 980, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-sociology|5": { + "hashes": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "069c76424fbd3dab", + "hash_cont_tokens": "2a0493252ed2cf43" + }, + "truncated": 0, + "non_truncated": 201, + "padded": 804, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hashes": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "a7e393a626169576", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-virology|5": { + "hashes": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "bf99dc973e3a650d", + "hash_cont_tokens": "5ab892d003b00c98" + }, + "truncated": 0, + "non_truncated": 166, 
+ "padded": 664, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-world_religions|5": { + "hashes": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "1761cfaf21797065", + "hash_cont_tokens": "15a5e5dbdfbb8568" + }, + "truncated": 0, + "non_truncated": 171, + "padded": 684, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|truthfulqa:mc|0": { + "hashes": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "298b43914bbdf4ca", + "hash_cont_tokens": "5a8d4bb398b1c3c0" + }, + "truncated": 0, + "non_truncated": 817, + "padded": 9996, + "non_padded": 0, + "effective_few_shots": 0.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "31aa3477d959f771", + "hash_cont_tokens": "618558fb93c0f288" + }, + "truncated": 0, + "non_truncated": 1267, + "padded": 2534, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|drop|3": { + "hashes": { + "hash_examples": "1d27416e8324e9a3", + "hash_full_prompts": "a5513ff9a741b385", + "hash_input_tokens": "a4fb946366902edf", + "hash_cont_tokens": "ce2cb37be1ed6238" + }, + "truncated": 0, + "non_truncated": 9536, + "padded": 0, + "non_padded": 9536, + "effective_few_shots": 3.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "6af0ae8cfe684f50", + "hash_cont_tokens": "18ef8f316843a5c9" + }, + "truncated": 0, + "non_truncated": 1319, + "padded": 0, + "non_padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "4eb459f19fc0f29d", + "hash_full_prompts": "21653ed56f202b4e", + "hash_input_tokens": "0ce409b3d436569d", + "hash_cont_tokens": "a1df794fa70d75ca" + }, + "truncated": 0, + "non_truncated": 38195, + "padded": 113460, + "non_padded": 10948, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/PulsarAI/EnsembleV5-Nova-13B/results_2023-10-03T19-22-59.151966.json b/eval-results/PulsarAI/EnsembleV5-Nova-13B/results_2023-10-03T19-22-59.151966.json new file mode 100644 index 0000000000000000000000000000000000000000..eec54c2a67e3c6a3dd2b527f3c6971bec32b4153 --- /dev/null +++ b/eval-results/PulsarAI/EnsembleV5-Nova-13B/results_2023-10-03T19-22-59.151966.json @@ -0,0 +1,1367 @@ +{ + "config_general": { + "model_name": "PulsarAI/EnsembleV5-Nova-13B", + "model_sha": "3e25556187ba576082a85c270d2d4b4ea6ea9f6f", + "model_size": "24.32 GB", + "model_dtype": "torch.float16", + "lighteval_sha": "0f318ecf002208468154899217b3ba7c6ae09374", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|arc:challenge|25": { + "acc": 0.5784982935153583, + "acc_stderr": 0.014430197069326023, + "acc_norm": 0.6271331058020477, + "acc_norm_stderr": 0.01413117676013117 + }, + "harness|hellaswag|10": { + "acc": 0.6217884883489345, + "acc_stderr": 0.004839497020536615, + "acc_norm": 0.8255327623979287, + "acc_norm_stderr": 0.0037873515193708063 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.29, + "acc_stderr": 0.045604802157206845, + "acc_norm": 0.29, 
+ "acc_norm_stderr": 0.045604802157206845 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.48148148148148145, + "acc_stderr": 0.043163785995113245, + "acc_norm": 0.48148148148148145, + "acc_norm_stderr": 0.043163785995113245 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.5789473684210527, + "acc_stderr": 0.04017901275981749, + "acc_norm": 0.5789473684210527, + "acc_norm_stderr": 0.04017901275981749 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.58, + "acc_stderr": 0.049604496374885836, + "acc_norm": 0.58, + "acc_norm_stderr": 0.049604496374885836 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.6037735849056604, + "acc_stderr": 0.030102793781791197, + "acc_norm": 0.6037735849056604, + "acc_norm_stderr": 0.030102793781791197 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.6527777777777778, + "acc_stderr": 0.039812405437178615, + "acc_norm": 0.6527777777777778, + "acc_norm_stderr": 0.039812405437178615 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.44, + "acc_stderr": 0.04988876515698589, + "acc_norm": 0.44, + "acc_norm_stderr": 0.04988876515698589 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.44, + "acc_stderr": 0.04988876515698589, + "acc_norm": 0.44, + "acc_norm_stderr": 0.04988876515698589 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.33, + "acc_stderr": 0.047258156262526045, + "acc_norm": 0.33, + "acc_norm_stderr": 0.047258156262526045 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.5433526011560693, + "acc_stderr": 0.03798106566014498, + "acc_norm": 0.5433526011560693, + "acc_norm_stderr": 0.03798106566014498 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.38235294117647056, + "acc_stderr": 0.04835503696107223, + "acc_norm": 0.38235294117647056, + "acc_norm_stderr": 0.04835503696107223 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.63, + "acc_stderr": 0.04852365870939099, + "acc_norm": 0.63, + "acc_norm_stderr": 0.04852365870939099 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.46382978723404256, + "acc_stderr": 0.03260038511835771, + "acc_norm": 0.46382978723404256, + "acc_norm_stderr": 0.03260038511835771 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.30701754385964913, + "acc_stderr": 0.04339138322579861, + "acc_norm": 0.30701754385964913, + "acc_norm_stderr": 0.04339138322579861 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.4413793103448276, + "acc_stderr": 0.04137931034482758, + "acc_norm": 0.4413793103448276, + "acc_norm_stderr": 0.04137931034482758 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.3783068783068783, + "acc_stderr": 0.024976954053155243, + "acc_norm": 0.3783068783068783, + "acc_norm_stderr": 0.024976954053155243 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.40476190476190477, + "acc_stderr": 0.04390259265377562, + "acc_norm": 0.40476190476190477, + "acc_norm_stderr": 0.04390259265377562 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.33, + "acc_stderr": 0.04725815626252605, + "acc_norm": 0.33, + "acc_norm_stderr": 0.04725815626252605 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.6580645161290323, + "acc_stderr": 0.02698528957655274, + "acc_norm": 0.6580645161290323, + "acc_norm_stderr": 0.02698528957655274 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.43349753694581283, + "acc_stderr": 0.03486731727419872, + "acc_norm": 0.43349753694581283, + 
"acc_norm_stderr": 0.03486731727419872 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.63, + "acc_stderr": 0.04852365870939099, + "acc_norm": 0.63, + "acc_norm_stderr": 0.04852365870939099 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.7212121212121212, + "acc_stderr": 0.035014387062967806, + "acc_norm": 0.7212121212121212, + "acc_norm_stderr": 0.035014387062967806 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.7626262626262627, + "acc_stderr": 0.030313710538198913, + "acc_norm": 0.7626262626262627, + "acc_norm_stderr": 0.030313710538198913 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.8393782383419689, + "acc_stderr": 0.02649905770139746, + "acc_norm": 0.8393782383419689, + "acc_norm_stderr": 0.02649905770139746 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.6153846153846154, + "acc_stderr": 0.02466674491518722, + "acc_norm": 0.6153846153846154, + "acc_norm_stderr": 0.02466674491518722 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.3296296296296296, + "acc_stderr": 0.028661201116524586, + "acc_norm": 0.3296296296296296, + "acc_norm_stderr": 0.028661201116524586 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.6302521008403361, + "acc_stderr": 0.03135709599613591, + "acc_norm": 0.6302521008403361, + "acc_norm_stderr": 0.03135709599613591 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.31125827814569534, + "acc_stderr": 0.03780445850526733, + "acc_norm": 0.31125827814569534, + "acc_norm_stderr": 0.03780445850526733 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.8, + "acc_stderr": 0.017149858514250955, + "acc_norm": 0.8, + "acc_norm_stderr": 0.017149858514250955 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.5138888888888888, + "acc_stderr": 0.03408655867977749, + "acc_norm": 0.5138888888888888, + "acc_norm_stderr": 0.03408655867977749 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.8137254901960784, + "acc_stderr": 0.02732547096671632, + "acc_norm": 0.8137254901960784, + "acc_norm_stderr": 0.02732547096671632 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.7763713080168776, + "acc_stderr": 0.027123298205229962, + "acc_norm": 0.7763713080168776, + "acc_norm_stderr": 0.027123298205229962 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.6278026905829597, + "acc_stderr": 0.03244305283008731, + "acc_norm": 0.6278026905829597, + "acc_norm_stderr": 0.03244305283008731 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.6335877862595419, + "acc_stderr": 0.04225875451969637, + "acc_norm": 0.6335877862595419, + "acc_norm_stderr": 0.04225875451969637 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.6528925619834711, + "acc_stderr": 0.043457245702925335, + "acc_norm": 0.6528925619834711, + "acc_norm_stderr": 0.043457245702925335 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.6203703703703703, + "acc_stderr": 0.04691521224077742, + "acc_norm": 0.6203703703703703, + "acc_norm_stderr": 0.04691521224077742 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.6809815950920245, + "acc_stderr": 0.03661997551073836, + "acc_norm": 0.6809815950920245, + "acc_norm_stderr": 0.03661997551073836 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.32142857142857145, + "acc_stderr": 0.044328040552915185, + "acc_norm": 0.32142857142857145, + 
"acc_norm_stderr": 0.044328040552915185 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.7572815533980582, + "acc_stderr": 0.04245022486384495, + "acc_norm": 0.7572815533980582, + "acc_norm_stderr": 0.04245022486384495 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.7991452991452992, + "acc_stderr": 0.02624677294689048, + "acc_norm": 0.7991452991452992, + "acc_norm_stderr": 0.02624677294689048 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.59, + "acc_stderr": 0.04943110704237102, + "acc_norm": 0.59, + "acc_norm_stderr": 0.04943110704237102 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.7586206896551724, + "acc_stderr": 0.01530238012354209, + "acc_norm": 0.7586206896551724, + "acc_norm_stderr": 0.01530238012354209 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.6271676300578035, + "acc_stderr": 0.02603389061357628, + "acc_norm": 0.6271676300578035, + "acc_norm_stderr": 0.02603389061357628 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.3843575418994413, + "acc_stderr": 0.016269088663959402, + "acc_norm": 0.3843575418994413, + "acc_norm_stderr": 0.016269088663959402 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.6045751633986928, + "acc_stderr": 0.027996723180631445, + "acc_norm": 0.6045751633986928, + "acc_norm_stderr": 0.027996723180631445 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.6463022508038585, + "acc_stderr": 0.027155208103200865, + "acc_norm": 0.6463022508038585, + "acc_norm_stderr": 0.027155208103200865 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.6419753086419753, + "acc_stderr": 0.026675611926037086, + "acc_norm": 0.6419753086419753, + "acc_norm_stderr": 0.026675611926037086 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.4432624113475177, + "acc_stderr": 0.029634838473766002, + "acc_norm": 0.4432624113475177, + "acc_norm_stderr": 0.029634838473766002 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.4439374185136897, + "acc_stderr": 0.012689708167787682, + "acc_norm": 0.4439374185136897, + "acc_norm_stderr": 0.012689708167787682 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.5919117647058824, + "acc_stderr": 0.029855261393483924, + "acc_norm": 0.5919117647058824, + "acc_norm_stderr": 0.029855261393483924 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.5751633986928104, + "acc_stderr": 0.01999797303545833, + "acc_norm": 0.5751633986928104, + "acc_norm_stderr": 0.01999797303545833 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.6454545454545455, + "acc_stderr": 0.04582004841505417, + "acc_norm": 0.6454545454545455, + "acc_norm_stderr": 0.04582004841505417 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.5714285714285714, + "acc_stderr": 0.031680911612338825, + "acc_norm": 0.5714285714285714, + "acc_norm_stderr": 0.031680911612338825 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.6766169154228856, + "acc_stderr": 0.03307615947979033, + "acc_norm": 0.6766169154228856, + "acc_norm_stderr": 0.03307615947979033 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.75, + "acc_stderr": 0.04351941398892446, + "acc_norm": 0.75, + "acc_norm_stderr": 0.04351941398892446 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.4457831325301205, + "acc_stderr": 0.03869543323472101, + "acc_norm": 0.4457831325301205, + "acc_norm_stderr": 0.03869543323472101 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.7660818713450293, + 
"acc_stderr": 0.03246721765117826, + "acc_norm": 0.7660818713450293, + "acc_norm_stderr": 0.03246721765117826 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.3427172582619339, + "mc1_stderr": 0.016614949385347036, + "mc2": 0.4985854685041301, + "mc2_stderr": 0.015160720709708817 + }, + "all": { + "acc": 0.5689777286407749, + "acc_stderr": 0.03448372173215078, + "acc_norm": 0.5732553402735832, + "acc_norm_stderr": 0.03446082061672094, + "mc1": 0.3427172582619339, + "mc1_stderr": 0.016614949385347036, + "mc2": 0.4985854685041301, + "mc2_stderr": 0.015160720709708817 + } + }, + "versions": { + "harness|arc:challenge|25": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + 
"all": 0 + }, + "config_tasks": { + "harness|arc:challenge": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + 
"harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task" + }, + "summary_tasks": { + "harness|arc:challenge|25": { + "hashes": { + "hash_examples": "17b0cae357c0259e", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "3722289b79076c44", + "hash_cont_tokens": "e8abf848493b50f7" + }, + "truncated": 0, + "non-truncated": 4687, + "padded": 4687, + "non-padded": 0, + "effective_few_shots": 25.0, + "num_truncated_few_shots": 0 + }, + "harness|hellaswag|10": { + "hashes": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "ececd684171f1ef2", + "hash_cont_tokens": "9fe0a5c42e1532db" + }, + "truncated": 0, + "non-truncated": 40168, + "padded": 40113, + "non-padded": 55, + "effective_few_shots": 10.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hashes": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "c54ff61ad0273dd7", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-anatomy|5": { + "hashes": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "be31a1e22aef5f90", + "hash_cont_tokens": "f11971a765cb609f" + }, + "truncated": 0, + "non-truncated": 540, + "padded": 540, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-astronomy|5": { + "hashes": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "277a7b1fad566940", + "hash_cont_tokens": "440a970fadecdc7b" + }, + "truncated": 0, + "non-truncated": 608, + "padded": 608, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-business_ethics|5": { + "hashes": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "ba552605bc116de5", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hashes": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "428c7563d0b98ab9", + "hash_cont_tokens": "7ecd60c25b9bfe5b" + }, + "truncated": 0, + "non-truncated": 1060, + "padded": 1060, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_biology|5": { + "hashes": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "da036601573942e2", + "hash_cont_tokens": "875cde3af7a0ee14" + }, + "truncated": 0, + "non-truncated": 576, + "padded": 576, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_chemistry|5": { + "hashes": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "94e0196d6aded13d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 
400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_computer_science|5": { + "hashes": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "6e4d0f4a8d36690b", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_mathematics|5": { + "hashes": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "614054d17109a25d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_medicine|5": { + "hashes": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "081bb2b524defd1c", + "hash_cont_tokens": "702fb6d82ff0d6ac" + }, + "truncated": 0, + "non-truncated": 692, + "padded": 692, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_physics|5": { + "hashes": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "5421d9a1af86cbd4", + "hash_cont_tokens": "f7b8097afc16a47c" + }, + "truncated": 0, + "non-truncated": 408, + "padded": 408, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-computer_security|5": { + "hashes": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "5e6b70ecb333cf18", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hashes": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "c2ef11a87264ceed", + "hash_cont_tokens": "aa0e8bc655f2f641" + }, + "truncated": 0, + "non-truncated": 940, + "padded": 940, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-econometrics|5": { + "hashes": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "ecaccd912a4c3978", + "hash_cont_tokens": "b1cc6e7e9fcd3827" + }, + "truncated": 0, + "non-truncated": 456, + "padded": 456, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hashes": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "1590c84291399be8", + "hash_cont_tokens": "2425a3f084a591ef" + }, + "truncated": 0, + "non-truncated": 580, + "padded": 580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hashes": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "3269597f715b0da1", + "hash_cont_tokens": "bd87bf0c060fd925" + }, + "truncated": 0, + "non-truncated": 1512, + "padded": 1512, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + 
"harness|hendrycksTest-formal_logic|5": { + "hashes": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "a2800d20f3ab8d7c", + "hash_cont_tokens": "eb8932890e0605db" + }, + "truncated": 0, + "non-truncated": 504, + "padded": 504, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-global_facts|5": { + "hashes": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "94ed44b3772505ad", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_biology|5": { + "hashes": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "24423acb928db768", + "hash_cont_tokens": "1ddcb86d28cde266" + }, + "truncated": 0, + "non-truncated": 1240, + "padded": 1240, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hashes": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "831ff35c474e5cef", + "hash_cont_tokens": "176c8dcff38c5f8f" + }, + "truncated": 0, + "non-truncated": 812, + "padded": 812, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hashes": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "a20a96b44dcc5b30", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hashes": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "5002f4ac8b1562ca", + "hash_cont_tokens": "674fc454bdc5ac93" + }, + "truncated": 0, + "non-truncated": 660, + "padded": 656, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_geography|5": { + "hashes": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "7c5547c7da5bc793", + "hash_cont_tokens": "03a5012b916274ea" + }, + "truncated": 0, + "non-truncated": 792, + "padded": 792, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hashes": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "f62991cb6a496b05", + "hash_cont_tokens": "873d2aab226ba1d8" + }, + "truncated": 0, + "non-truncated": 772, + "padded": 772, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hashes": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "4cef2aff6e3d59ed", + "hash_cont_tokens": "c583432ad27fcfe0" + }, + "truncated": 0, + "non-truncated": 1560, + "padded": 1560, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hashes": { + "hash_examples": 
"1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "6e2577ea4082ed2b", + "hash_cont_tokens": "d7907b61bcb8c123" + }, + "truncated": 0, + "non-truncated": 1080, + "padded": 1080, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hashes": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "c5fc9aeb1079c8e4", + "hash_cont_tokens": "f47f041de50333b9" + }, + "truncated": 0, + "non-truncated": 952, + "padded": 952, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_physics|5": { + "hashes": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "555fc385cffa84ca", + "hash_cont_tokens": "0d56317b3e5eedb5" + }, + "truncated": 0, + "non-truncated": 604, + "padded": 604, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hashes": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "febd23cbf9973b7f", + "hash_cont_tokens": "09ba1243e7390c0f" + }, + "truncated": 0, + "non-truncated": 2180, + "padded": 2180, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hashes": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "400e55b56ee6fbd7", + "hash_cont_tokens": "9cc29889c3d3f77d" + }, + "truncated": 0, + "non-truncated": 864, + "padded": 864, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hashes": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "c639cce12a46ebad", + "hash_cont_tokens": "cdd0b3dc06d933e5" + }, + "truncated": 0, + "non-truncated": 816, + "padded": 816, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hashes": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "b9762065cce6f3a6", + "hash_cont_tokens": "e02816433ff28daf" + }, + "truncated": 0, + "non-truncated": 948, + "padded": 948, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_aging|5": { + "hashes": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "541a75f071dcf579", + "hash_cont_tokens": "142a4a8a1138a214" + }, + "truncated": 0, + "non-truncated": 892, + "padded": 892, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_sexuality|5": { + "hashes": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "04269e5c5a257dd9", + "hash_cont_tokens": "bc54813e809b796d" + }, + "truncated": 0, + "non-truncated": 524, + "padded": 524, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-international_law|5": { + "hashes": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "d93ba9d9d38e4397", + 
"hash_cont_tokens": "8ea8c5ff76a15bca" + }, + "truncated": 0, + "non-truncated": 484, + "padded": 484, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-jurisprudence|5": { + "hashes": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "9eeaccd2698b4f5a", + "hash_cont_tokens": "e3a8cd951b6e3469" + }, + "truncated": 0, + "non-truncated": 432, + "padded": 432, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hashes": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "b4f08f544f2b7576", + "hash_cont_tokens": "3e9e0bdc248fd88a" + }, + "truncated": 0, + "non-truncated": 652, + "padded": 648, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-machine_learning|5": { + "hashes": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "900c2a51f1174b9f", + "hash_cont_tokens": "55b12fb138c6a064" + }, + "truncated": 0, + "non-truncated": 448, + "padded": 448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-management|5": { + "hashes": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "6b36efb4689c6eca", + "hash_cont_tokens": "a01d6d39a83c4597" + }, + "truncated": 0, + "non-truncated": 412, + "padded": 412, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-marketing|5": { + "hashes": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "2aaac78a0cfed47a", + "hash_cont_tokens": "6aeaed4d823c98aa" + }, + "truncated": 0, + "non-truncated": 936, + "padded": 936, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-medical_genetics|5": { + "hashes": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "886ca823b41c094a", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-miscellaneous|5": { + "hashes": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "72fd71de7675e7d0", + "hash_cont_tokens": "9b0ab02a64603081" + }, + "truncated": 0, + "non-truncated": 3132, + "padded": 3132, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_disputes|5": { + "hashes": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "f3ca0dd8e7a1eb09", + "hash_cont_tokens": "3b8bbe9108e55ce9" + }, + "truncated": 0, + "non-truncated": 1384, + "padded": 1354, + "non-padded": 30, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hashes": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "3e793631e951f23c", + "hash_cont_tokens": "3e9bfc0362e97330" + }, + "truncated": 0, + "non-truncated": 3580, + "padded": 3580, + "non-padded": 0, + "effective_few_shots": 5.0, + 
"num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-nutrition|5": { + "hashes": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "59753c2144ea93af", + "hash_cont_tokens": "23b2dc6ee2da4cfc" + }, + "truncated": 0, + "non-truncated": 1224, + "padded": 1224, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-philosophy|5": { + "hashes": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "bd8d3dbed15a8c34", + "hash_cont_tokens": "9f6ff69d23a48783" + }, + "truncated": 0, + "non-truncated": 1244, + "padded": 1244, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-prehistory|5": { + "hashes": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "3573cd87facbb7c5", + "hash_cont_tokens": "d6458d743d875837" + }, + "truncated": 0, + "non-truncated": 1296, + "padded": 1296, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_accounting|5": { + "hashes": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "17e721bc1a7cbb47", + "hash_cont_tokens": "922a195f53a35662" + }, + "truncated": 0, + "non-truncated": 1128, + "padded": 1128, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_law|5": { + "hashes": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "c9f7583fff66d361", + "hash_cont_tokens": "2e590029ef41fbcd" + }, + "truncated": 0, + "non-truncated": 6136, + "padded": 6136, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_medicine|5": { + "hashes": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "40a933f829116f8d", + "hash_cont_tokens": "7cfee54dbddd5a98" + }, + "truncated": 0, + "non-truncated": 1088, + "padded": 1088, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_psychology|5": { + "hashes": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "0dfb73a8eb3f692c", + "hash_cont_tokens": "a86677b2a45c20e1" + }, + "truncated": 0, + "non-truncated": 2448, + "padded": 2448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-public_relations|5": { + "hashes": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "1710c6ba4c9f3cbd", + "hash_cont_tokens": "0d756ccaae031757" + }, + "truncated": 0, + "non-truncated": 440, + "padded": 440, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-security_studies|5": { + "hashes": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "32a03f1f22a6e103", + "hash_cont_tokens": "b2229bc2cfbf594b" + }, + "truncated": 0, + "non-truncated": 980, + "padded": 980, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-sociology|5": { + "hashes": { + "hash_examples": "e7959df87dea8672", + 
"hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "828999f7624cbe7e", + "hash_cont_tokens": "c3a3bdfd177eed5b" + }, + "truncated": 0, + "non-truncated": 804, + "padded": 804, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hashes": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "42054621e718dbee", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-virology|5": { + "hashes": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "6c4f0aa4dc859c04", + "hash_cont_tokens": "af8b3658088cb37f" + }, + "truncated": 0, + "non-truncated": 664, + "padded": 664, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-world_religions|5": { + "hashes": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "6c75d44e092ff24f", + "hash_cont_tokens": "060118bef6de4e0a" + }, + "truncated": 0, + "non-truncated": 684, + "padded": 684, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|truthfulqa:mc|0": { + "hashes": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "2738d7ed7075faa7", + "hash_cont_tokens": "f5da56a132aab151" + }, + "truncated": 0, + "non-truncated": 9996, + "padded": 9996, + "non-padded": 0, + "effective_few_shots": 0.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "d84d18e9a963753d", + "hash_full_prompts": "12b540783521a8e6", + "hash_input_tokens": "5c73a7dce6ccf737", + "hash_cont_tokens": "71d56183130fecbd" + }, + "total_evaluation_time_secondes": "6405.03169631958", + "truncated": 0, + "non-truncated": 111019, + "padded": 110926, + "non-padded": 93, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/PulsarAI/EnsembleV5-Nova-13B/results_2023-10-23T15-24-00.966689.json b/eval-results/PulsarAI/EnsembleV5-Nova-13B/results_2023-10-23T15-24-00.966689.json new file mode 100644 index 0000000000000000000000000000000000000000..97155c7072725e471e5da3b8415e09359949dfe0 --- /dev/null +++ b/eval-results/PulsarAI/EnsembleV5-Nova-13B/results_2023-10-23T15-24-00.966689.json @@ -0,0 +1,107 @@ +{ + "config_general": { + "model_name": "PulsarAI/EnsembleV5-Nova-13B", + "model_sha": "b2b03e4a3714d5c738bb49767c1945da4f4d98d6", + "model_size": "24.32 GB", + "model_dtype": "torch.float16", + "lighteval_sha": "0f318ecf002208468154899217b3ba7c6ae09374", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|drop|3": { + "em": 0.007445469798657718, + "em_stderr": 0.0008803652515899855, + "f1": 0.08636220637583875, + "f1_stderr": 0.0018310737230495444 + }, + "harness|gsm8k|5": { + "acc": 0.10765731614859743, + "acc_stderr": 0.008537484003023352 + }, + "harness|winogrande|5": { + "acc": 0.7624309392265194, + "acc_stderr": 0.011961298905803157 + }, + "all": { + "em": 0.007445469798657718, + "em_stderr": 0.0008803652515899855, + "f1": 0.08636220637583875, + "f1_stderr": 0.0018310737230495444, + "acc": 0.4350441276875584, + "acc_stderr": 0.010249391454413254 + 
} + }, + "versions": { + "harness|drop|3": 1, + "harness|gsm8k|5": 0, + "harness|winogrande|5": 0, + "all": 0 + }, + "config_tasks": { + "harness|drop": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|drop|3": { + "hashes": { + "hash_examples": "1d27416e8324e9a3", + "hash_full_prompts": "a5513ff9a741b385", + "hash_input_tokens": "42076f0efbb50aa6", + "hash_cont_tokens": "3cf8d80a0a153ca6" + }, + "truncated": 3, + "non-truncated": 9533, + "padded": 0, + "non-padded": 9536, + "effective_few_shots": 3.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "bda342e47b5099b2", + "hash_cont_tokens": "e4063189c38e1b54" + }, + "truncated": 0, + "non-truncated": 1319, + "padded": 0, + "non-padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "c0bedf98cb040854", + "hash_cont_tokens": "f08975ad6f2d5864" + }, + "truncated": 0, + "non-truncated": 2534, + "padded": 2432, + "non-padded": 102, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "9b4d8993161e637d", + "hash_full_prompts": "08215e527b7e60a5", + "hash_input_tokens": "a12f3e3c934bd78b", + "hash_cont_tokens": "0852126b435f5e70" + }, + "total_evaluation_time_secondes": "42982.43106532097", + "truncated": 3, + "non-truncated": 13386, + "padded": 2432, + "non-padded": 10957, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/PulsarAI/GenAI-Nova-13B/results_2023-10-08T15-05-19.512883.json b/eval-results/PulsarAI/GenAI-Nova-13B/results_2023-10-08T15-05-19.512883.json new file mode 100644 index 0000000000000000000000000000000000000000..e8216694daf29a5f4529469dd0fb913230638bfd --- /dev/null +++ b/eval-results/PulsarAI/GenAI-Nova-13B/results_2023-10-08T15-05-19.512883.json @@ -0,0 +1,1367 @@ +{ + "config_general": { + "model_name": "PulsarAI/GenAI-Nova-13B", + "model_sha": "0ce62a64ca53cd5feb18f523a96dd3be86e6513d", + "model_size": "24.32 GB", + "model_dtype": "torch.float16", + "lighteval_sha": "0f318ecf002208468154899217b3ba7c6ae09374", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|arc:challenge|25": { + "acc": 0.5776450511945392, + "acc_stderr": 0.014434138713379981, + "acc_norm": 0.6228668941979523, + "acc_norm_stderr": 0.0141633668961926 + }, + "harness|hellaswag|10": { + "acc": 0.6317466640111532, + "acc_stderr": 0.0048134486154044346, + "acc_norm": 0.8327026488747261, + "acc_norm_stderr": 0.003724783389253327 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.26, + "acc_stderr": 0.0440844002276808, + "acc_norm": 0.26, + "acc_norm_stderr": 0.0440844002276808 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.4888888888888889, + "acc_stderr": 0.04318275491977976, + "acc_norm": 0.4888888888888889, + "acc_norm_stderr": 0.04318275491977976 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.6118421052631579, + "acc_stderr": 0.03965842097512744, + "acc_norm": 0.6118421052631579, + "acc_norm_stderr": 0.03965842097512744 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.57, + "acc_stderr": 0.04975698519562428, + 
"acc_norm": 0.57, + "acc_norm_stderr": 0.04975698519562428 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.6113207547169811, + "acc_stderr": 0.030000485448675986, + "acc_norm": 0.6113207547169811, + "acc_norm_stderr": 0.030000485448675986 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.6666666666666666, + "acc_stderr": 0.03942082639927213, + "acc_norm": 0.6666666666666666, + "acc_norm_stderr": 0.03942082639927213 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.49, + "acc_stderr": 0.05024183937956911, + "acc_norm": 0.49, + "acc_norm_stderr": 0.05024183937956911 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.49, + "acc_stderr": 0.05024183937956912, + "acc_norm": 0.49, + "acc_norm_stderr": 0.05024183937956912 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.37, + "acc_stderr": 0.04852365870939099, + "acc_norm": 0.37, + "acc_norm_stderr": 0.04852365870939099 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.5895953757225434, + "acc_stderr": 0.03750757044895537, + "acc_norm": 0.5895953757225434, + "acc_norm_stderr": 0.03750757044895537 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.37254901960784315, + "acc_stderr": 0.04810840148082635, + "acc_norm": 0.37254901960784315, + "acc_norm_stderr": 0.04810840148082635 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.7, + "acc_stderr": 0.046056618647183814, + "acc_norm": 0.7, + "acc_norm_stderr": 0.046056618647183814 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.4808510638297872, + "acc_stderr": 0.03266204299064678, + "acc_norm": 0.4808510638297872, + "acc_norm_stderr": 0.03266204299064678 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.2982456140350877, + "acc_stderr": 0.043036840335373146, + "acc_norm": 0.2982456140350877, + "acc_norm_stderr": 0.043036840335373146 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.4827586206896552, + "acc_stderr": 0.041641887201693775, + "acc_norm": 0.4827586206896552, + "acc_norm_stderr": 0.041641887201693775 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.3492063492063492, + "acc_stderr": 0.024552292209342654, + "acc_norm": 0.3492063492063492, + "acc_norm_stderr": 0.024552292209342654 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.48412698412698413, + "acc_stderr": 0.04469881854072606, + "acc_norm": 0.48412698412698413, + "acc_norm_stderr": 0.04469881854072606 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.39, + "acc_stderr": 0.04902071300001975, + "acc_norm": 0.39, + "acc_norm_stderr": 0.04902071300001975 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.6806451612903226, + "acc_stderr": 0.02652270967466777, + "acc_norm": 0.6806451612903226, + "acc_norm_stderr": 0.02652270967466777 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.4975369458128079, + "acc_stderr": 0.03517945038691063, + "acc_norm": 0.4975369458128079, + "acc_norm_stderr": 0.03517945038691063 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.61, + "acc_stderr": 0.04902071300001975, + "acc_norm": 0.61, + "acc_norm_stderr": 0.04902071300001975 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.7272727272727273, + "acc_stderr": 0.0347769116216366, + "acc_norm": 0.7272727272727273, + "acc_norm_stderr": 0.0347769116216366 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.7777777777777778, + "acc_stderr": 
0.029620227874790482, + "acc_norm": 0.7777777777777778, + "acc_norm_stderr": 0.029620227874790482 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.8134715025906736, + "acc_stderr": 0.028112091210117467, + "acc_norm": 0.8134715025906736, + "acc_norm_stderr": 0.028112091210117467 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.6256410256410256, + "acc_stderr": 0.024537591572830506, + "acc_norm": 0.6256410256410256, + "acc_norm_stderr": 0.024537591572830506 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.337037037037037, + "acc_stderr": 0.028820884666253255, + "acc_norm": 0.337037037037037, + "acc_norm_stderr": 0.028820884666253255 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.6554621848739496, + "acc_stderr": 0.030868682604121626, + "acc_norm": 0.6554621848739496, + "acc_norm_stderr": 0.030868682604121626 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.31125827814569534, + "acc_stderr": 0.03780445850526733, + "acc_norm": 0.31125827814569534, + "acc_norm_stderr": 0.03780445850526733 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.8018348623853211, + "acc_stderr": 0.01709057380421791, + "acc_norm": 0.8018348623853211, + "acc_norm_stderr": 0.01709057380421791 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.5138888888888888, + "acc_stderr": 0.034086558679777494, + "acc_norm": 0.5138888888888888, + "acc_norm_stderr": 0.034086558679777494 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.8235294117647058, + "acc_stderr": 0.026756401538078962, + "acc_norm": 0.8235294117647058, + "acc_norm_stderr": 0.026756401538078962 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.7763713080168776, + "acc_stderr": 0.027123298205229966, + "acc_norm": 0.7763713080168776, + "acc_norm_stderr": 0.027123298205229966 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.6905829596412556, + "acc_stderr": 0.03102441174057222, + "acc_norm": 0.6905829596412556, + "acc_norm_stderr": 0.03102441174057222 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.6564885496183206, + "acc_stderr": 0.041649760719448786, + "acc_norm": 0.6564885496183206, + "acc_norm_stderr": 0.041649760719448786 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.71900826446281, + "acc_stderr": 0.04103203830514512, + "acc_norm": 0.71900826446281, + "acc_norm_stderr": 0.04103203830514512 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.7777777777777778, + "acc_stderr": 0.040191074725573483, + "acc_norm": 0.7777777777777778, + "acc_norm_stderr": 0.040191074725573483 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.7239263803680982, + "acc_stderr": 0.035123852837050475, + "acc_norm": 0.7239263803680982, + "acc_norm_stderr": 0.035123852837050475 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.35714285714285715, + "acc_stderr": 0.04547960999764376, + "acc_norm": 0.35714285714285715, + "acc_norm_stderr": 0.04547960999764376 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.7572815533980582, + "acc_stderr": 0.04245022486384495, + "acc_norm": 0.7572815533980582, + "acc_norm_stderr": 0.04245022486384495 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.8376068376068376, + "acc_stderr": 0.02416161812798774, + "acc_norm": 0.8376068376068376, + "acc_norm_stderr": 0.02416161812798774 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.62, + "acc_stderr": 
0.04878317312145633, + "acc_norm": 0.62, + "acc_norm_stderr": 0.04878317312145633 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.7867177522349936, + "acc_stderr": 0.014648172749593518, + "acc_norm": 0.7867177522349936, + "acc_norm_stderr": 0.014648172749593518 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.6560693641618497, + "acc_stderr": 0.02557412378654667, + "acc_norm": 0.6560693641618497, + "acc_norm_stderr": 0.02557412378654667 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.4558659217877095, + "acc_stderr": 0.01665722942458631, + "acc_norm": 0.4558659217877095, + "acc_norm_stderr": 0.01665722942458631 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.6241830065359477, + "acc_stderr": 0.027732834353363947, + "acc_norm": 0.6241830065359477, + "acc_norm_stderr": 0.027732834353363947 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.6720257234726688, + "acc_stderr": 0.026664410886937617, + "acc_norm": 0.6720257234726688, + "acc_norm_stderr": 0.026664410886937617 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.6882716049382716, + "acc_stderr": 0.025773111169630433, + "acc_norm": 0.6882716049382716, + "acc_norm_stderr": 0.025773111169630433 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.4645390070921986, + "acc_stderr": 0.029752389657427047, + "acc_norm": 0.4645390070921986, + "acc_norm_stderr": 0.029752389657427047 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.4498044328552803, + "acc_stderr": 0.012705721498565106, + "acc_norm": 0.4498044328552803, + "acc_norm_stderr": 0.012705721498565106 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.6102941176470589, + "acc_stderr": 0.0296246635811597, + "acc_norm": 0.6102941176470589, + "acc_norm_stderr": 0.0296246635811597 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.5915032679738562, + "acc_stderr": 0.019886221037501862, + "acc_norm": 0.5915032679738562, + "acc_norm_stderr": 0.019886221037501862 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.6909090909090909, + "acc_stderr": 0.044262946482000985, + "acc_norm": 0.6909090909090909, + "acc_norm_stderr": 0.044262946482000985 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.6163265306122448, + "acc_stderr": 0.031130880396235936, + "acc_norm": 0.6163265306122448, + "acc_norm_stderr": 0.031130880396235936 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.736318407960199, + "acc_stderr": 0.03115715086935557, + "acc_norm": 0.736318407960199, + "acc_norm_stderr": 0.03115715086935557 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.81, + "acc_stderr": 0.03942772444036625, + "acc_norm": 0.81, + "acc_norm_stderr": 0.03942772444036625 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.4819277108433735, + "acc_stderr": 0.03889951252827217, + "acc_norm": 0.4819277108433735, + "acc_norm_stderr": 0.03889951252827217 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.7660818713450293, + "acc_stderr": 0.03246721765117825, + "acc_norm": 0.7660818713450293, + "acc_norm_stderr": 0.03246721765117825 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.3623011015911873, + "mc1_stderr": 0.016826646897262255, + "mc2": 0.5179019852720109, + "mc2_stderr": 0.015198745057161709 + }, + "all": { + "acc": 0.5950478518617498, + "acc_stderr": 0.03403770527311197, + "acc_norm": 0.5992203574188173, + "acc_norm_stderr": 0.03401466396729267, + "mc1": 0.3623011015911873, + "mc1_stderr": 0.016826646897262255, + "mc2": 
0.5179019852720109, + "mc2_stderr": 0.015198745057161709 + } + }, + "versions": { + "harness|arc:challenge|25": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "all": 0 + }, + "config_tasks": { + "harness|arc:challenge": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + 
"harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task" + }, + "summary_tasks": { + "harness|arc:challenge|25": { + "hashes": { + "hash_examples": "17b0cae357c0259e", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "3722289b79076c44", + 
"hash_cont_tokens": "e8abf848493b50f7" + }, + "truncated": 0, + "non-truncated": 4687, + "padded": 4687, + "non-padded": 0, + "effective_few_shots": 25.0, + "num_truncated_few_shots": 0 + }, + "harness|hellaswag|10": { + "hashes": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "ececd684171f1ef2", + "hash_cont_tokens": "9fe0a5c42e1532db" + }, + "truncated": 0, + "non-truncated": 40168, + "padded": 40113, + "non-padded": 55, + "effective_few_shots": 10.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hashes": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "c54ff61ad0273dd7", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-anatomy|5": { + "hashes": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "be31a1e22aef5f90", + "hash_cont_tokens": "f11971a765cb609f" + }, + "truncated": 0, + "non-truncated": 540, + "padded": 540, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-astronomy|5": { + "hashes": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "277a7b1fad566940", + "hash_cont_tokens": "440a970fadecdc7b" + }, + "truncated": 0, + "non-truncated": 608, + "padded": 608, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-business_ethics|5": { + "hashes": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "ba552605bc116de5", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hashes": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "428c7563d0b98ab9", + "hash_cont_tokens": "7ecd60c25b9bfe5b" + }, + "truncated": 0, + "non-truncated": 1060, + "padded": 1060, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_biology|5": { + "hashes": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "da036601573942e2", + "hash_cont_tokens": "875cde3af7a0ee14" + }, + "truncated": 0, + "non-truncated": 576, + "padded": 576, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_chemistry|5": { + "hashes": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "94e0196d6aded13d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_computer_science|5": { + "hashes": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "6e4d0f4a8d36690b", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + 
"num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_mathematics|5": { + "hashes": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "614054d17109a25d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_medicine|5": { + "hashes": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "081bb2b524defd1c", + "hash_cont_tokens": "702fb6d82ff0d6ac" + }, + "truncated": 0, + "non-truncated": 692, + "padded": 692, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_physics|5": { + "hashes": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "5421d9a1af86cbd4", + "hash_cont_tokens": "f7b8097afc16a47c" + }, + "truncated": 0, + "non-truncated": 408, + "padded": 408, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-computer_security|5": { + "hashes": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "5e6b70ecb333cf18", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hashes": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "c2ef11a87264ceed", + "hash_cont_tokens": "aa0e8bc655f2f641" + }, + "truncated": 0, + "non-truncated": 940, + "padded": 940, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-econometrics|5": { + "hashes": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "ecaccd912a4c3978", + "hash_cont_tokens": "b1cc6e7e9fcd3827" + }, + "truncated": 0, + "non-truncated": 456, + "padded": 456, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hashes": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "1590c84291399be8", + "hash_cont_tokens": "2425a3f084a591ef" + }, + "truncated": 0, + "non-truncated": 580, + "padded": 580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hashes": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "3269597f715b0da1", + "hash_cont_tokens": "bd87bf0c060fd925" + }, + "truncated": 0, + "non-truncated": 1512, + "padded": 1512, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-formal_logic|5": { + "hashes": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "a2800d20f3ab8d7c", + "hash_cont_tokens": "eb8932890e0605db" + }, + "truncated": 0, + "non-truncated": 504, + "padded": 504, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-global_facts|5": { + "hashes": { + "hash_examples": "371d70d743b2b89b", + 
"hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "94ed44b3772505ad", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_biology|5": { + "hashes": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "24423acb928db768", + "hash_cont_tokens": "1ddcb86d28cde266" + }, + "truncated": 0, + "non-truncated": 1240, + "padded": 1240, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hashes": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "831ff35c474e5cef", + "hash_cont_tokens": "176c8dcff38c5f8f" + }, + "truncated": 0, + "non-truncated": 812, + "padded": 812, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hashes": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "a20a96b44dcc5b30", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hashes": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "5002f4ac8b1562ca", + "hash_cont_tokens": "674fc454bdc5ac93" + }, + "truncated": 0, + "non-truncated": 660, + "padded": 656, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_geography|5": { + "hashes": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "7c5547c7da5bc793", + "hash_cont_tokens": "03a5012b916274ea" + }, + "truncated": 0, + "non-truncated": 792, + "padded": 792, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hashes": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "f62991cb6a496b05", + "hash_cont_tokens": "873d2aab226ba1d8" + }, + "truncated": 0, + "non-truncated": 772, + "padded": 772, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hashes": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "4cef2aff6e3d59ed", + "hash_cont_tokens": "c583432ad27fcfe0" + }, + "truncated": 0, + "non-truncated": 1560, + "padded": 1560, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hashes": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "6e2577ea4082ed2b", + "hash_cont_tokens": "d7907b61bcb8c123" + }, + "truncated": 0, + "non-truncated": 1080, + "padded": 1080, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hashes": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": 
"c5fc9aeb1079c8e4", + "hash_cont_tokens": "f47f041de50333b9" + }, + "truncated": 0, + "non-truncated": 952, + "padded": 952, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_physics|5": { + "hashes": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "555fc385cffa84ca", + "hash_cont_tokens": "0d56317b3e5eedb5" + }, + "truncated": 0, + "non-truncated": 604, + "padded": 604, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hashes": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "febd23cbf9973b7f", + "hash_cont_tokens": "09ba1243e7390c0f" + }, + "truncated": 0, + "non-truncated": 2180, + "padded": 2180, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hashes": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "400e55b56ee6fbd7", + "hash_cont_tokens": "9cc29889c3d3f77d" + }, + "truncated": 0, + "non-truncated": 864, + "padded": 864, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hashes": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "c639cce12a46ebad", + "hash_cont_tokens": "cdd0b3dc06d933e5" + }, + "truncated": 0, + "non-truncated": 816, + "padded": 816, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hashes": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "b9762065cce6f3a6", + "hash_cont_tokens": "e02816433ff28daf" + }, + "truncated": 0, + "non-truncated": 948, + "padded": 948, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_aging|5": { + "hashes": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "541a75f071dcf579", + "hash_cont_tokens": "142a4a8a1138a214" + }, + "truncated": 0, + "non-truncated": 892, + "padded": 892, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_sexuality|5": { + "hashes": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "04269e5c5a257dd9", + "hash_cont_tokens": "bc54813e809b796d" + }, + "truncated": 0, + "non-truncated": 524, + "padded": 524, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-international_law|5": { + "hashes": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "d93ba9d9d38e4397", + "hash_cont_tokens": "8ea8c5ff76a15bca" + }, + "truncated": 0, + "non-truncated": 484, + "padded": 484, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-jurisprudence|5": { + "hashes": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "9eeaccd2698b4f5a", + "hash_cont_tokens": "e3a8cd951b6e3469" + }, + "truncated": 0, + "non-truncated": 432, + "padded": 432, + 
"non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hashes": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "b4f08f544f2b7576", + "hash_cont_tokens": "3e9e0bdc248fd88a" + }, + "truncated": 0, + "non-truncated": 652, + "padded": 648, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-machine_learning|5": { + "hashes": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "900c2a51f1174b9f", + "hash_cont_tokens": "55b12fb138c6a064" + }, + "truncated": 0, + "non-truncated": 448, + "padded": 448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-management|5": { + "hashes": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "6b36efb4689c6eca", + "hash_cont_tokens": "a01d6d39a83c4597" + }, + "truncated": 0, + "non-truncated": 412, + "padded": 412, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-marketing|5": { + "hashes": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "2aaac78a0cfed47a", + "hash_cont_tokens": "6aeaed4d823c98aa" + }, + "truncated": 0, + "non-truncated": 936, + "padded": 936, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-medical_genetics|5": { + "hashes": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "886ca823b41c094a", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-miscellaneous|5": { + "hashes": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "72fd71de7675e7d0", + "hash_cont_tokens": "9b0ab02a64603081" + }, + "truncated": 0, + "non-truncated": 3132, + "padded": 3132, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_disputes|5": { + "hashes": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "f3ca0dd8e7a1eb09", + "hash_cont_tokens": "3b8bbe9108e55ce9" + }, + "truncated": 0, + "non-truncated": 1384, + "padded": 1354, + "non-padded": 30, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hashes": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "3e793631e951f23c", + "hash_cont_tokens": "3e9bfc0362e97330" + }, + "truncated": 0, + "non-truncated": 3580, + "padded": 3580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-nutrition|5": { + "hashes": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "59753c2144ea93af", + "hash_cont_tokens": "23b2dc6ee2da4cfc" + }, + "truncated": 0, + "non-truncated": 1224, + "padded": 1224, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-philosophy|5": { + "hashes": { + "hash_examples": "9b455b7d72811cc8", 
+ "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "bd8d3dbed15a8c34", + "hash_cont_tokens": "9f6ff69d23a48783" + }, + "truncated": 0, + "non-truncated": 1244, + "padded": 1244, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-prehistory|5": { + "hashes": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "3573cd87facbb7c5", + "hash_cont_tokens": "d6458d743d875837" + }, + "truncated": 0, + "non-truncated": 1296, + "padded": 1296, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_accounting|5": { + "hashes": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "17e721bc1a7cbb47", + "hash_cont_tokens": "922a195f53a35662" + }, + "truncated": 0, + "non-truncated": 1128, + "padded": 1128, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_law|5": { + "hashes": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "c9f7583fff66d361", + "hash_cont_tokens": "2e590029ef41fbcd" + }, + "truncated": 0, + "non-truncated": 6136, + "padded": 6136, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_medicine|5": { + "hashes": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "40a933f829116f8d", + "hash_cont_tokens": "7cfee54dbddd5a98" + }, + "truncated": 0, + "non-truncated": 1088, + "padded": 1088, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_psychology|5": { + "hashes": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "0dfb73a8eb3f692c", + "hash_cont_tokens": "a86677b2a45c20e1" + }, + "truncated": 0, + "non-truncated": 2448, + "padded": 2448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-public_relations|5": { + "hashes": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "1710c6ba4c9f3cbd", + "hash_cont_tokens": "0d756ccaae031757" + }, + "truncated": 0, + "non-truncated": 440, + "padded": 440, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-security_studies|5": { + "hashes": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "32a03f1f22a6e103", + "hash_cont_tokens": "b2229bc2cfbf594b" + }, + "truncated": 0, + "non-truncated": 980, + "padded": 980, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-sociology|5": { + "hashes": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "828999f7624cbe7e", + "hash_cont_tokens": "c3a3bdfd177eed5b" + }, + "truncated": 0, + "non-truncated": 804, + "padded": 804, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hashes": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "42054621e718dbee", + "hash_cont_tokens": "50421e30bef398f9" + }, + 
"truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-virology|5": { + "hashes": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "6c4f0aa4dc859c04", + "hash_cont_tokens": "af8b3658088cb37f" + }, + "truncated": 0, + "non-truncated": 664, + "padded": 664, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-world_religions|5": { + "hashes": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "6c75d44e092ff24f", + "hash_cont_tokens": "060118bef6de4e0a" + }, + "truncated": 0, + "non-truncated": 684, + "padded": 684, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|truthfulqa:mc|0": { + "hashes": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "2738d7ed7075faa7", + "hash_cont_tokens": "f5da56a132aab151" + }, + "truncated": 0, + "non-truncated": 9996, + "padded": 9996, + "non-padded": 0, + "effective_few_shots": 0.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "d84d18e9a963753d", + "hash_full_prompts": "12b540783521a8e6", + "hash_input_tokens": "5c73a7dce6ccf737", + "hash_cont_tokens": "71d56183130fecbd" + }, + "total_evaluation_time_secondes": "6409.4479167461395", + "truncated": 0, + "non-truncated": 111019, + "padded": 110926, + "non-padded": 93, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/PulsarAI/GenAI-Nova-13B/results_2023-10-29T14-58-59.300779.json b/eval-results/PulsarAI/GenAI-Nova-13B/results_2023-10-29T14-58-59.300779.json new file mode 100644 index 0000000000000000000000000000000000000000..d7af8769efc685d9bc0db9b059efdee8bb7e3d3c --- /dev/null +++ b/eval-results/PulsarAI/GenAI-Nova-13B/results_2023-10-29T14-58-59.300779.json @@ -0,0 +1,107 @@ +{ + "config_general": { + "model_name": "PulsarAI/GenAI-Nova-13B", + "model_sha": "0ce62a64ca53cd5feb18f523a96dd3be86e6513d", + "model_size": "24.32 GB", + "model_dtype": "torch.float16", + "lighteval_sha": "0f318ecf002208468154899217b3ba7c6ae09374", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|drop|3": { + "em": 0.10769714765100671, + "em_stderr": 0.003174664916131534, + "f1": 0.18815016778523358, + "f1_stderr": 0.0033317211011039192 + }, + "harness|gsm8k|5": { + "acc": 0.07733131159969674, + "acc_stderr": 0.007357713523222347 + }, + "harness|winogrande|5": { + "acc": 0.7734806629834254, + "acc_stderr": 0.01176414905469833 + }, + "all": { + "em": 0.10769714765100671, + "em_stderr": 0.003174664916131534, + "f1": 0.18815016778523358, + "f1_stderr": 0.0033317211011039192, + "acc": 0.4254059872915611, + "acc_stderr": 0.009560931288960338 + } + }, + "versions": { + "harness|drop|3": 1, + "harness|gsm8k|5": 0, + "harness|winogrande|5": 0, + "all": 0 + }, + "config_tasks": { + "harness|drop": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|drop|3": { + "hashes": { + "hash_examples": "1d27416e8324e9a3", + "hash_full_prompts": "a5513ff9a741b385", + "hash_input_tokens": "42076f0efbb50aa6", + "hash_cont_tokens": "8beb407e8c335a59" + }, + "truncated": 3, + "non-truncated": 9533, + 
"padded": 0, + "non-padded": 9536, + "effective_few_shots": 3.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "bda342e47b5099b2", + "hash_cont_tokens": "cb1018565bc7009b" + }, + "truncated": 0, + "non-truncated": 1319, + "padded": 0, + "non-padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "c0bedf98cb040854", + "hash_cont_tokens": "f08975ad6f2d5864" + }, + "truncated": 0, + "non-truncated": 2534, + "padded": 2432, + "non-padded": 102, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "9b4d8993161e637d", + "hash_full_prompts": "08215e527b7e60a5", + "hash_input_tokens": "a12f3e3c934bd78b", + "hash_cont_tokens": "cbfdfe700fee9f7a" + }, + "total_evaluation_time_secondes": "40884.74162721634", + "truncated": 3, + "non-truncated": 13386, + "padded": 2432, + "non-padded": 10957, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/PulsarAI/MetaMath-Chupacabra-7B-v2.01-Slerp/results_2023-12-09T17-58-17.272756.json b/eval-results/PulsarAI/MetaMath-Chupacabra-7B-v2.01-Slerp/results_2023-12-09T17-58-17.272756.json new file mode 100644 index 0000000000000000000000000000000000000000..7e29c78dfda2c9fa7569416848ba934740bb2acd --- /dev/null +++ b/eval-results/PulsarAI/MetaMath-Chupacabra-7B-v2.01-Slerp/results_2023-12-09T17-58-17.272756.json @@ -0,0 +1,1409 @@ +{ + "config_general": { + "lighteval_sha": "0e4607eff593f6f842aeaa0e5fa6760f58b9d1e9", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "", + "start_time": 592472.455497929, + "end_time": 599575.533745683, + "total_evaluation_time_secondes": "7103.07824775402", + "model_name": "PulsarAI/MetaMath-Chupacabra-7B-v2.01-Slerp", + "model_sha": "dcc6fff61bfd608d8e14a040dff22cd8dae78b1e", + "model_dtype": "torch.float16", + "model_size": "13.99 GB" + }, + "results": { + "harness|arc:challenge|25": { + "acc": 0.6271331058020477, + "acc_stderr": 0.014131176760131169, + "acc_norm": 0.6612627986348123, + "acc_norm_stderr": 0.013830568927974332 + }, + "harness|hellaswag|10": { + "acc": 0.6669986058554073, + "acc_stderr": 0.004703238534045804, + "acc_norm": 0.8546106353316073, + "acc_norm_stderr": 0.0035177257870177433 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.26, + "acc_stderr": 0.0440844002276808, + "acc_norm": 0.26, + "acc_norm_stderr": 0.0440844002276808 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.6370370370370371, + "acc_stderr": 0.04153948404742398, + "acc_norm": 0.6370370370370371, + "acc_norm_stderr": 0.04153948404742398 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.6842105263157895, + "acc_stderr": 0.0378272898086547, + "acc_norm": 0.6842105263157895, + "acc_norm_stderr": 0.0378272898086547 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.6, + "acc_stderr": 0.04923659639173309, + "acc_norm": 0.6, + "acc_norm_stderr": 0.04923659639173309 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.6943396226415094, + "acc_stderr": 0.028353298073322663, + "acc_norm": 0.6943396226415094, + "acc_norm_stderr": 0.028353298073322663 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.7291666666666666, 
+ "acc_stderr": 0.03716177437566017, + "acc_norm": 0.7291666666666666, + "acc_norm_stderr": 0.03716177437566017 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.45, + "acc_stderr": 0.05, + "acc_norm": 0.45, + "acc_norm_stderr": 0.05 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.52, + "acc_stderr": 0.050211673156867795, + "acc_norm": 0.52, + "acc_norm_stderr": 0.050211673156867795 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.34, + "acc_stderr": 0.04760952285695235, + "acc_norm": 0.34, + "acc_norm_stderr": 0.04760952285695235 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.6473988439306358, + "acc_stderr": 0.036430371689585475, + "acc_norm": 0.6473988439306358, + "acc_norm_stderr": 0.036430371689585475 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.4117647058823529, + "acc_stderr": 0.04897104952726366, + "acc_norm": 0.4117647058823529, + "acc_norm_stderr": 0.04897104952726366 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.78, + "acc_stderr": 0.04163331998932261, + "acc_norm": 0.78, + "acc_norm_stderr": 0.04163331998932261 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.5829787234042553, + "acc_stderr": 0.03223276266711712, + "acc_norm": 0.5829787234042553, + "acc_norm_stderr": 0.03223276266711712 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.49122807017543857, + "acc_stderr": 0.04702880432049615, + "acc_norm": 0.49122807017543857, + "acc_norm_stderr": 0.04702880432049615 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.5379310344827586, + "acc_stderr": 0.04154659671707548, + "acc_norm": 0.5379310344827586, + "acc_norm_stderr": 0.04154659671707548 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.41534391534391535, + "acc_stderr": 0.025379524910778408, + "acc_norm": 0.41534391534391535, + "acc_norm_stderr": 0.025379524910778408 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.40476190476190477, + "acc_stderr": 0.04390259265377562, + "acc_norm": 0.40476190476190477, + "acc_norm_stderr": 0.04390259265377562 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.38, + "acc_stderr": 0.04878317312145633, + "acc_norm": 0.38, + "acc_norm_stderr": 0.04878317312145633 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.7677419354838709, + "acc_stderr": 0.024022256130308235, + "acc_norm": 0.7677419354838709, + "acc_norm_stderr": 0.024022256130308235 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.4827586206896552, + "acc_stderr": 0.035158955511656986, + "acc_norm": 0.4827586206896552, + "acc_norm_stderr": 0.035158955511656986 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.68, + "acc_stderr": 0.04688261722621504, + "acc_norm": 0.68, + "acc_norm_stderr": 0.04688261722621504 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.7757575757575758, + "acc_stderr": 0.032568666616811015, + "acc_norm": 0.7757575757575758, + "acc_norm_stderr": 0.032568666616811015 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.7878787878787878, + "acc_stderr": 0.029126522834586818, + "acc_norm": 0.7878787878787878, + "acc_norm_stderr": 0.029126522834586818 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.8860103626943006, + "acc_stderr": 0.022935144053919446, + "acc_norm": 0.8860103626943006, + "acc_norm_stderr": 0.022935144053919446 + }, + 
"harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.6564102564102564, + "acc_stderr": 0.024078696580635477, + "acc_norm": 0.6564102564102564, + "acc_norm_stderr": 0.024078696580635477 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.3333333333333333, + "acc_stderr": 0.02874204090394848, + "acc_norm": 0.3333333333333333, + "acc_norm_stderr": 0.02874204090394848 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.6974789915966386, + "acc_stderr": 0.02983796238829194, + "acc_norm": 0.6974789915966386, + "acc_norm_stderr": 0.02983796238829194 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.33112582781456956, + "acc_stderr": 0.038425817186598696, + "acc_norm": 0.33112582781456956, + "acc_norm_stderr": 0.038425817186598696 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.8440366972477065, + "acc_stderr": 0.01555580271359017, + "acc_norm": 0.8440366972477065, + "acc_norm_stderr": 0.01555580271359017 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.5509259259259259, + "acc_stderr": 0.03392238405321617, + "acc_norm": 0.5509259259259259, + "acc_norm_stderr": 0.03392238405321617 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.803921568627451, + "acc_stderr": 0.027865942286639325, + "acc_norm": 0.803921568627451, + "acc_norm_stderr": 0.027865942286639325 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.7974683544303798, + "acc_stderr": 0.026160568246601446, + "acc_norm": 0.7974683544303798, + "acc_norm_stderr": 0.026160568246601446 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.6816143497757847, + "acc_stderr": 0.03126580522513713, + "acc_norm": 0.6816143497757847, + "acc_norm_stderr": 0.03126580522513713 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.7557251908396947, + "acc_stderr": 0.037683359597287434, + "acc_norm": 0.7557251908396947, + "acc_norm_stderr": 0.037683359597287434 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.7851239669421488, + "acc_stderr": 0.037494924487096966, + "acc_norm": 0.7851239669421488, + "acc_norm_stderr": 0.037494924487096966 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.8148148148148148, + "acc_stderr": 0.03755265865037182, + "acc_norm": 0.8148148148148148, + "acc_norm_stderr": 0.03755265865037182 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.754601226993865, + "acc_stderr": 0.03380939813943354, + "acc_norm": 0.754601226993865, + "acc_norm_stderr": 0.03380939813943354 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.5, + "acc_stderr": 0.04745789978762494, + "acc_norm": 0.5, + "acc_norm_stderr": 0.04745789978762494 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.7572815533980582, + "acc_stderr": 0.04245022486384495, + "acc_norm": 0.7572815533980582, + "acc_norm_stderr": 0.04245022486384495 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.8760683760683761, + "acc_stderr": 0.02158649400128138, + "acc_norm": 0.8760683760683761, + "acc_norm_stderr": 0.02158649400128138 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.71, + "acc_stderr": 0.045604802157206845, + "acc_norm": 0.71, + "acc_norm_stderr": 0.045604802157206845 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.8199233716475096, + "acc_stderr": 0.013740797258579828, + "acc_norm": 0.8199233716475096, + "acc_norm_stderr": 0.013740797258579828 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.7283236994219653, 
+ "acc_stderr": 0.023948512905468365, + "acc_norm": 0.7283236994219653, + "acc_norm_stderr": 0.023948512905468365 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.41787709497206704, + "acc_stderr": 0.016495400635820084, + "acc_norm": 0.41787709497206704, + "acc_norm_stderr": 0.016495400635820084 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.7156862745098039, + "acc_stderr": 0.025829163272757485, + "acc_norm": 0.7156862745098039, + "acc_norm_stderr": 0.025829163272757485 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.7170418006430869, + "acc_stderr": 0.02558306248998481, + "acc_norm": 0.7170418006430869, + "acc_norm_stderr": 0.02558306248998481 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.7283950617283951, + "acc_stderr": 0.02474862449053737, + "acc_norm": 0.7283950617283951, + "acc_norm_stderr": 0.02474862449053737 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.48936170212765956, + "acc_stderr": 0.029820747191422473, + "acc_norm": 0.48936170212765956, + "acc_norm_stderr": 0.029820747191422473 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.45045632333767927, + "acc_stderr": 0.012707390438502346, + "acc_norm": 0.45045632333767927, + "acc_norm_stderr": 0.012707390438502346 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.6544117647058824, + "acc_stderr": 0.028888193103988633, + "acc_norm": 0.6544117647058824, + "acc_norm_stderr": 0.028888193103988633 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.6715686274509803, + "acc_stderr": 0.018999707383162673, + "acc_norm": 0.6715686274509803, + "acc_norm_stderr": 0.018999707383162673 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.6636363636363637, + "acc_stderr": 0.04525393596302506, + "acc_norm": 0.6636363636363637, + "acc_norm_stderr": 0.04525393596302506 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.7346938775510204, + "acc_stderr": 0.028263889943784603, + "acc_norm": 0.7346938775510204, + "acc_norm_stderr": 0.028263889943784603 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.8407960199004975, + "acc_stderr": 0.025870646766169143, + "acc_norm": 0.8407960199004975, + "acc_norm_stderr": 0.025870646766169143 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.87, + "acc_stderr": 0.033799766898963086, + "acc_norm": 0.87, + "acc_norm_stderr": 0.033799766898963086 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.5180722891566265, + "acc_stderr": 0.03889951252827216, + "acc_norm": 0.5180722891566265, + "acc_norm_stderr": 0.03889951252827216 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.8187134502923976, + "acc_stderr": 0.029547741687640044, + "acc_norm": 0.8187134502923976, + "acc_norm_stderr": 0.029547741687640044 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.39412484700122397, + "mc1_stderr": 0.017106588140700322, + "mc2": 0.5614591813728808, + "mc2_stderr": 0.015408154626799953 + }, + "harness|winogrande|5": { + "acc": 0.7947908445146015, + "acc_stderr": 0.01135031570746206 + }, + "harness|gsm8k|5": { + "acc": 0.7012888551933283, + "acc_stderr": 0.012607137125693625 + }, + "all": { + "acc": 0.6430394737674227, + "acc_stderr": 0.03225098588955544, + "acc_norm": 0.643238473261251, + "acc_norm_stderr": 0.03291299264153459, + "mc1": 0.39412484700122397, + "mc1_stderr": 0.017106588140700322, + "mc2": 0.5614591813728808, + "mc2_stderr": 0.015408154626799953 + } + }, + "versions": { + "all": 0, + "harness|arc:challenge|25": 0, + 
"harness|gsm8k|5": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "harness|winogrande|5": 0 + }, + "config_tasks": { + "harness|arc:challenge": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + 
"harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|arc:challenge|25": { + "hashes": { + "hash_examples": "17b0cae357c0259e", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "9bcd0d1d37471713", + "hash_cont_tokens": 
"289aa98c400841d8" + }, + "truncated": 0, + "non_truncated": 1172, + "padded": 4670, + "non_padded": 17, + "effective_few_shots": 25.0, + "num_truncated_few_shots": 0 + }, + "harness|hellaswag|10": { + "hashes": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "80b8c6d79740318e", + "hash_cont_tokens": "ac460260c3e6efc9" + }, + "truncated": 0, + "non_truncated": 10042, + "padded": 40101, + "non_padded": 67, + "effective_few_shots": 10.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hashes": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "b813d36287c6556c", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-anatomy|5": { + "hashes": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "09dc2380497f7a47", + "hash_cont_tokens": "a52a4f60d98cbe5c" + }, + "truncated": 0, + "non_truncated": 135, + "padded": 540, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-astronomy|5": { + "hashes": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "68ca3220b0fdd1f3", + "hash_cont_tokens": "10f7d8eeba97841d" + }, + "truncated": 0, + "non_truncated": 152, + "padded": 608, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-business_ethics|5": { + "hashes": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "bd14ef1320de241e", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hashes": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "d96186ab98017c43", + "hash_cont_tokens": "edef9975ba9165b5" + }, + "truncated": 0, + "non_truncated": 265, + "padded": 1060, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_biology|5": { + "hashes": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "424136b34e95b200", + "hash_cont_tokens": "0aa103ec6602280b" + }, + "truncated": 0, + "non_truncated": 144, + "padded": 576, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_chemistry|5": { + "hashes": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "8dd8b80e336bbe54", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_computer_science|5": { + "hashes": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "145d4cef8ca2261d", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + 
"harness|hendrycksTest-college_mathematics|5": { + "hashes": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "561995d32d2b25c4", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_medicine|5": { + "hashes": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "6a258a9d4418599c", + "hash_cont_tokens": "1979021dbc698754" + }, + "truncated": 0, + "non_truncated": 173, + "padded": 692, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_physics|5": { + "hashes": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "fa5e0d5b5f97b66a", + "hash_cont_tokens": "7cf7fe2bab00acbd" + }, + "truncated": 0, + "non_truncated": 102, + "padded": 408, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-computer_security|5": { + "hashes": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "07d27397edfae492", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hashes": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "da5e6c3c8eb17da6", + "hash_cont_tokens": "903f64eed2b0d217" + }, + "truncated": 0, + "non_truncated": 235, + "padded": 940, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-econometrics|5": { + "hashes": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "f6ba8e358bdb523e", + "hash_cont_tokens": "721ae6c5302c4bf2" + }, + "truncated": 0, + "non_truncated": 114, + "padded": 456, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hashes": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "b2459da4c5ca8590", + "hash_cont_tokens": "15a738960ed3e587" + }, + "truncated": 0, + "non_truncated": 145, + "padded": 575, + "non_padded": 5, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hashes": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "0b969d9ad706a13a", + "hash_cont_tokens": "c96470462fc71683" + }, + "truncated": 0, + "non_truncated": 378, + "padded": 1512, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-formal_logic|5": { + "hashes": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "02bc3eb5f90da86e", + "hash_cont_tokens": "0e1ce025c9d6ee7e" + }, + "truncated": 0, + "non_truncated": 126, + "padded": 504, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-global_facts|5": { + "hashes": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + 
"hash_input_tokens": "3d5106918bcbeb43", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_biology|5": { + "hashes": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "7b089392db2dabbd", + "hash_cont_tokens": "e34d57f7d3c4ca16" + }, + "truncated": 0, + "non_truncated": 310, + "padded": 1240, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hashes": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "ba90b2ffed1c067d", + "hash_cont_tokens": "e8482d44df4b3740" + }, + "truncated": 0, + "non_truncated": 203, + "padded": 812, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hashes": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "60eeec309ef0717f", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hashes": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "5e5e8bf3808e0ead", + "hash_cont_tokens": "d63e679a49418339" + }, + "truncated": 0, + "non_truncated": 165, + "padded": 656, + "non_padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_geography|5": { + "hashes": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "4da9b741d4e7ea78", + "hash_cont_tokens": "d78483e286d06f1a" + }, + "truncated": 0, + "non_truncated": 198, + "padded": 792, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hashes": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "acb4bc872ac86ed7", + "hash_cont_tokens": "691cdff71ff5fe57" + }, + "truncated": 0, + "non_truncated": 193, + "padded": 772, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hashes": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "840fc6403eb69ab0", + "hash_cont_tokens": "d5ad4c5bdca967ad" + }, + "truncated": 0, + "non_truncated": 390, + "padded": 1560, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hashes": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "3629a7f2cd17faeb", + "hash_cont_tokens": "8f631ca5687dd0d4" + }, + "truncated": 0, + "non_truncated": 270, + "padded": 1080, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hashes": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "6846f684260e3997", + "hash_cont_tokens": 
"7321048a28451473" + }, + "truncated": 0, + "non_truncated": 238, + "padded": 952, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_physics|5": { + "hashes": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "85aee25d6bdad94a", + "hash_cont_tokens": "bb137581f269861c" + }, + "truncated": 0, + "non_truncated": 151, + "padded": 604, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hashes": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "290b66d6d666a35f", + "hash_cont_tokens": "b455cab2675bd863" + }, + "truncated": 0, + "non_truncated": 545, + "padded": 2180, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hashes": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "a77a7668b437bc82", + "hash_cont_tokens": "1b3196fec7e58037" + }, + "truncated": 0, + "non_truncated": 216, + "padded": 864, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hashes": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "63548c7fa9ba7a78", + "hash_cont_tokens": "a331dedc2aa01b3e" + }, + "truncated": 0, + "non_truncated": 204, + "padded": 816, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hashes": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "83c5da18bfa50812", + "hash_cont_tokens": "d0fbe030b8c8c2bf" + }, + "truncated": 0, + "non_truncated": 237, + "padded": 948, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_aging|5": { + "hashes": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "bebbd11f22006685", + "hash_cont_tokens": "1dd29c3755494850" + }, + "truncated": 0, + "non_truncated": 223, + "padded": 892, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_sexuality|5": { + "hashes": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "7b85ee9b8ee54f4f", + "hash_cont_tokens": "c85573f663c10691" + }, + "truncated": 0, + "non_truncated": 131, + "padded": 524, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-international_law|5": { + "hashes": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "7bfc55ab7065943e", + "hash_cont_tokens": "d263804ba918154f" + }, + "truncated": 0, + "non_truncated": 121, + "padded": 484, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-jurisprudence|5": { + "hashes": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "69573f1675e053c6", + "hash_cont_tokens": "581986691a84ece8" + }, + "truncated": 0, + "non_truncated": 108, + "padded": 432, + "non_padded": 0, + "effective_few_shots": 5.0, + 
"num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hashes": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "552324ef20094bdc", + "hash_cont_tokens": "55a858b28bbda458" + }, + "truncated": 0, + "non_truncated": 163, + "padded": 652, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-machine_learning|5": { + "hashes": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "96449357a7318905", + "hash_cont_tokens": "e99d3d3efd4ac7a3" + }, + "truncated": 0, + "non_truncated": 112, + "padded": 448, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-management|5": { + "hashes": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "3b849249168e3b88", + "hash_cont_tokens": "13d9dc56bca34726" + }, + "truncated": 0, + "non_truncated": 103, + "padded": 412, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-marketing|5": { + "hashes": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "af0e186f2756b70d", + "hash_cont_tokens": "2700ea26933916a2" + }, + "truncated": 0, + "non_truncated": 234, + "padded": 936, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-medical_genetics|5": { + "hashes": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "9f6a6de16509b6d9", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-miscellaneous|5": { + "hashes": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "9194406d589f7c10", + "hash_cont_tokens": "7bf4341c79587250" + }, + "truncated": 0, + "non_truncated": 783, + "padded": 3132, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_disputes|5": { + "hashes": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "769486efc74d9f8e", + "hash_cont_tokens": "38a48e9de6976f00" + }, + "truncated": 0, + "non_truncated": 346, + "padded": 1384, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hashes": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "a90fd4dd90959dad", + "hash_cont_tokens": "761c4dc187689d89" + }, + "truncated": 0, + "non_truncated": 895, + "padded": 3580, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-nutrition|5": { + "hashes": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "1a3b843e66efd29b", + "hash_cont_tokens": "65005bd7d6f6012a" + }, + "truncated": 0, + "non_truncated": 306, + "padded": 1224, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-philosophy|5": { + "hashes": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + 
"hash_input_tokens": "09820001a3d00013", + "hash_cont_tokens": "0b47934fb6314dec" + }, + "truncated": 0, + "non_truncated": 311, + "padded": 1244, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-prehistory|5": { + "hashes": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "7c4ec364ce2768c7", + "hash_cont_tokens": "3f20acd855ee0a29" + }, + "truncated": 0, + "non_truncated": 324, + "padded": 1296, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_accounting|5": { + "hashes": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "ced0534574d0ae3f", + "hash_cont_tokens": "8f122ba881355d4b" + }, + "truncated": 0, + "non_truncated": 282, + "padded": 1128, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_law|5": { + "hashes": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "bcbdbbde22ec73e3", + "hash_cont_tokens": "90d5df417c4d3fd3" + }, + "truncated": 0, + "non_truncated": 1534, + "padded": 6136, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_medicine|5": { + "hashes": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "c54d753563114d45", + "hash_cont_tokens": "4a2d2988884f7f70" + }, + "truncated": 0, + "non_truncated": 272, + "padded": 1088, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_psychology|5": { + "hashes": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "b75dc55c0e32fa52", + "hash_cont_tokens": "e0a952cb8a9c81de" + }, + "truncated": 0, + "non_truncated": 612, + "padded": 2448, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-public_relations|5": { + "hashes": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "5ccdc8ec8db99622", + "hash_cont_tokens": "1fa77a8dff3922b8" + }, + "truncated": 0, + "non_truncated": 110, + "padded": 440, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-security_studies|5": { + "hashes": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "ca8497342e5b1d57", + "hash_cont_tokens": "81fc9cb3cbdd52db" + }, + "truncated": 0, + "non_truncated": 245, + "padded": 980, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-sociology|5": { + "hashes": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "069c76424fbd3dab", + "hash_cont_tokens": "2a0493252ed2cf43" + }, + "truncated": 0, + "non_truncated": 201, + "padded": 804, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hashes": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "a7e393a626169576", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + 
"non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-virology|5": { + "hashes": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "bf99dc973e3a650d", + "hash_cont_tokens": "5ab892d003b00c98" + }, + "truncated": 0, + "non_truncated": 166, + "padded": 664, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-world_religions|5": { + "hashes": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "1761cfaf21797065", + "hash_cont_tokens": "15a5e5dbdfbb8568" + }, + "truncated": 0, + "non_truncated": 171, + "padded": 684, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|truthfulqa:mc|0": { + "hashes": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "298b43914bbdf4ca", + "hash_cont_tokens": "5a8d4bb398b1c3c0" + }, + "truncated": 0, + "non_truncated": 817, + "padded": 9996, + "non_padded": 0, + "effective_few_shots": 0.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "31aa3477d959f771", + "hash_cont_tokens": "618558fb93c0f288" + }, + "truncated": 0, + "non_truncated": 1267, + "padded": 2534, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "6af0ae8cfe684f50", + "hash_cont_tokens": "4c35cae7a90717e4" + }, + "truncated": 0, + "non_truncated": 1319, + "padded": 0, + "non_padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "3b7fa57a057f9415", + "hash_full_prompts": "63615fc50fc9417c", + "hash_input_tokens": "9c04e828ae29cacc", + "hash_cont_tokens": "f29e2b3ff84b2f5a" + }, + "truncated": 0, + "non_truncated": 28659, + "padded": 113460, + "non_padded": 1412, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/PulsarAI/MetaMath-OpenHermes-2.5-neural-chat-v3-3-Slerp/results_2023-12-10T02-45-05.724710.json b/eval-results/PulsarAI/MetaMath-OpenHermes-2.5-neural-chat-v3-3-Slerp/results_2023-12-10T02-45-05.724710.json new file mode 100644 index 0000000000000000000000000000000000000000..e270f9fad706c782a15f36be8d6baf76e3124aeb --- /dev/null +++ b/eval-results/PulsarAI/MetaMath-OpenHermes-2.5-neural-chat-v3-3-Slerp/results_2023-12-10T02-45-05.724710.json @@ -0,0 +1,1409 @@ +{ + "config_general": { + "lighteval_sha": "0e4607eff593f6f842aeaa0e5fa6760f58b9d1e9", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "", + "start_time": 623610.860985178, + "end_time": 631181.661367698, + "total_evaluation_time_secondes": "7570.800382519956", + "model_name": "PulsarAI/MetaMath-OpenHermes-2.5-neural-chat-v3-3-Slerp", + "model_sha": "111ae8b3fb38d550a32f04dbd977f8cd447a3a92", + "model_dtype": "torch.bfloat16", + "model_size": "13.99 GB" + }, + "results": { + "harness|arc:challenge|25": { + "acc": 0.6220136518771331, + "acc_stderr": 0.014169664520303098, + "acc_norm": 0.6459044368600683, + "acc_norm_stderr": 0.013975454122756564 + }, + "harness|hellaswag|10": { + "acc": 0.6632144991037642, + 
"acc_stderr": 0.004716449792353795, + "acc_norm": 0.8539135630352519, + "acc_norm_stderr": 0.003524710243768616 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.3, + "acc_stderr": 0.04605661864718381, + "acc_norm": 0.3, + "acc_norm_stderr": 0.04605661864718381 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.6296296296296297, + "acc_stderr": 0.041716541613545426, + "acc_norm": 0.6296296296296297, + "acc_norm_stderr": 0.041716541613545426 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.6907894736842105, + "acc_stderr": 0.037610708698674805, + "acc_norm": 0.6907894736842105, + "acc_norm_stderr": 0.037610708698674805 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.56, + "acc_stderr": 0.04988876515698589, + "acc_norm": 0.56, + "acc_norm_stderr": 0.04988876515698589 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.7169811320754716, + "acc_stderr": 0.027724236492700918, + "acc_norm": 0.7169811320754716, + "acc_norm_stderr": 0.027724236492700918 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.75, + "acc_stderr": 0.03621034121889507, + "acc_norm": 0.75, + "acc_norm_stderr": 0.03621034121889507 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.45, + "acc_stderr": 0.05, + "acc_norm": 0.45, + "acc_norm_stderr": 0.05 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.55, + "acc_stderr": 0.05, + "acc_norm": 0.55, + "acc_norm_stderr": 0.05 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.36, + "acc_stderr": 0.048241815132442176, + "acc_norm": 0.36, + "acc_norm_stderr": 0.048241815132442176 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.6416184971098265, + "acc_stderr": 0.03656343653353159, + "acc_norm": 0.6416184971098265, + "acc_norm_stderr": 0.03656343653353159 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.4117647058823529, + "acc_stderr": 0.048971049527263666, + "acc_norm": 0.4117647058823529, + "acc_norm_stderr": 0.048971049527263666 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.8, + "acc_stderr": 0.04020151261036845, + "acc_norm": 0.8, + "acc_norm_stderr": 0.04020151261036845 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.5914893617021276, + "acc_stderr": 0.032134180267015755, + "acc_norm": 0.5914893617021276, + "acc_norm_stderr": 0.032134180267015755 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.49122807017543857, + "acc_stderr": 0.04702880432049615, + "acc_norm": 0.49122807017543857, + "acc_norm_stderr": 0.04702880432049615 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.5448275862068965, + "acc_stderr": 0.04149886942192117, + "acc_norm": 0.5448275862068965, + "acc_norm_stderr": 0.04149886942192117 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.41005291005291006, + "acc_stderr": 0.025331202438944447, + "acc_norm": 0.41005291005291006, + "acc_norm_stderr": 0.025331202438944447 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.42063492063492064, + "acc_stderr": 0.04415438226743744, + "acc_norm": 0.42063492063492064, + "acc_norm_stderr": 0.04415438226743744 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.35, + "acc_stderr": 0.047937248544110196, + "acc_norm": 0.35, + "acc_norm_stderr": 0.047937248544110196 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.7806451612903226, + "acc_stderr": 0.023540799358723292, + "acc_norm": 0.7806451612903226, + "acc_norm_stderr": 0.023540799358723292 + }, + 
"harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.5123152709359606, + "acc_stderr": 0.035169204442208966, + "acc_norm": 0.5123152709359606, + "acc_norm_stderr": 0.035169204442208966 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.67, + "acc_stderr": 0.04725815626252609, + "acc_norm": 0.67, + "acc_norm_stderr": 0.04725815626252609 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.7454545454545455, + "acc_stderr": 0.03401506715249039, + "acc_norm": 0.7454545454545455, + "acc_norm_stderr": 0.03401506715249039 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.7828282828282829, + "acc_stderr": 0.02937661648494563, + "acc_norm": 0.7828282828282829, + "acc_norm_stderr": 0.02937661648494563 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.8756476683937824, + "acc_stderr": 0.02381447708659355, + "acc_norm": 0.8756476683937824, + "acc_norm_stderr": 0.02381447708659355 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.6615384615384615, + "acc_stderr": 0.023991500500313036, + "acc_norm": 0.6615384615384615, + "acc_norm_stderr": 0.023991500500313036 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.3592592592592593, + "acc_stderr": 0.029252905927251972, + "acc_norm": 0.3592592592592593, + "acc_norm_stderr": 0.029252905927251972 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.680672268907563, + "acc_stderr": 0.030283995525884396, + "acc_norm": 0.680672268907563, + "acc_norm_stderr": 0.030283995525884396 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.31788079470198677, + "acc_stderr": 0.038020397601079024, + "acc_norm": 0.31788079470198677, + "acc_norm_stderr": 0.038020397601079024 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.8495412844036697, + "acc_stderr": 0.015328563932669237, + "acc_norm": 0.8495412844036697, + "acc_norm_stderr": 0.015328563932669237 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.5231481481481481, + "acc_stderr": 0.03406315360711507, + "acc_norm": 0.5231481481481481, + "acc_norm_stderr": 0.03406315360711507 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.7892156862745098, + "acc_stderr": 0.028626547912437406, + "acc_norm": 0.7892156862745098, + "acc_norm_stderr": 0.028626547912437406 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.7974683544303798, + "acc_stderr": 0.026160568246601443, + "acc_norm": 0.7974683544303798, + "acc_norm_stderr": 0.026160568246601443 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.6860986547085202, + "acc_stderr": 0.031146796482972465, + "acc_norm": 0.6860986547085202, + "acc_norm_stderr": 0.031146796482972465 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.7709923664122137, + "acc_stderr": 0.036853466317118506, + "acc_norm": 0.7709923664122137, + "acc_norm_stderr": 0.036853466317118506 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.8099173553719008, + "acc_stderr": 0.03581796951709282, + "acc_norm": 0.8099173553719008, + "acc_norm_stderr": 0.03581796951709282 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.8055555555555556, + "acc_stderr": 0.038260763248848646, + "acc_norm": 0.8055555555555556, + "acc_norm_stderr": 0.038260763248848646 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.7668711656441718, + "acc_stderr": 0.0332201579577674, + "acc_norm": 0.7668711656441718, + 
"acc_norm_stderr": 0.0332201579577674 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.5, + "acc_stderr": 0.04745789978762494, + "acc_norm": 0.5, + "acc_norm_stderr": 0.04745789978762494 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.7669902912621359, + "acc_stderr": 0.04185832598928315, + "acc_norm": 0.7669902912621359, + "acc_norm_stderr": 0.04185832598928315 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.8717948717948718, + "acc_stderr": 0.02190190511507333, + "acc_norm": 0.8717948717948718, + "acc_norm_stderr": 0.02190190511507333 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.75, + "acc_stderr": 0.04351941398892446, + "acc_norm": 0.75, + "acc_norm_stderr": 0.04351941398892446 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.8237547892720306, + "acc_stderr": 0.013625556907993452, + "acc_norm": 0.8237547892720306, + "acc_norm_stderr": 0.013625556907993452 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.7225433526011561, + "acc_stderr": 0.02410571260775431, + "acc_norm": 0.7225433526011561, + "acc_norm_stderr": 0.02410571260775431 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.3877094972067039, + "acc_stderr": 0.01629533232815581, + "acc_norm": 0.3877094972067039, + "acc_norm_stderr": 0.01629533232815581 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.7254901960784313, + "acc_stderr": 0.025553169991826524, + "acc_norm": 0.7254901960784313, + "acc_norm_stderr": 0.025553169991826524 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.6977491961414791, + "acc_stderr": 0.026082700695399665, + "acc_norm": 0.6977491961414791, + "acc_norm_stderr": 0.026082700695399665 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.7376543209876543, + "acc_stderr": 0.024477222856135118, + "acc_norm": 0.7376543209876543, + "acc_norm_stderr": 0.024477222856135118 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.48936170212765956, + "acc_stderr": 0.029820747191422466, + "acc_norm": 0.48936170212765956, + "acc_norm_stderr": 0.029820747191422466 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.45697522816166886, + "acc_stderr": 0.012722869501611419, + "acc_norm": 0.45697522816166886, + "acc_norm_stderr": 0.012722869501611419 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.6617647058823529, + "acc_stderr": 0.028739328513983572, + "acc_norm": 0.6617647058823529, + "acc_norm_stderr": 0.028739328513983572 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.6683006535947712, + "acc_stderr": 0.019047485239360378, + "acc_norm": 0.6683006535947712, + "acc_norm_stderr": 0.019047485239360378 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.6727272727272727, + "acc_stderr": 0.0449429086625209, + "acc_norm": 0.6727272727272727, + "acc_norm_stderr": 0.0449429086625209 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.7428571428571429, + "acc_stderr": 0.02797982353874455, + "acc_norm": 0.7428571428571429, + "acc_norm_stderr": 0.02797982353874455 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.8606965174129353, + "acc_stderr": 0.024484487162913973, + "acc_norm": 0.8606965174129353, + "acc_norm_stderr": 0.024484487162913973 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.85, + "acc_stderr": 0.0358870281282637, + "acc_norm": 0.85, + "acc_norm_stderr": 0.0358870281282637 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.5421686746987951, + "acc_stderr": 0.0387862677100236, + 
"acc_norm": 0.5421686746987951, + "acc_norm_stderr": 0.0387862677100236 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.8538011695906432, + "acc_stderr": 0.02709729011807082, + "acc_norm": 0.8538011695906432, + "acc_norm_stderr": 0.02709729011807082 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.39167686658506734, + "mc1_stderr": 0.01708779588176963, + "mc2": 0.5514034273421413, + "mc2_stderr": 0.015341235748555455 + }, + "harness|winogrande|5": { + "acc": 0.7963693764798737, + "acc_stderr": 0.011317798781626915 + }, + "harness|gsm8k|5": { + "acc": 0.7164518574677786, + "acc_stderr": 0.012415070917508124 + }, + "all": { + "acc": 0.6464664842416276, + "acc_stderr": 0.03217172590988582, + "acc_norm": 0.646376680571289, + "acc_norm_stderr": 0.032836550184029964, + "mc1": 0.39167686658506734, + "mc1_stderr": 0.01708779588176963, + "mc2": 0.5514034273421413, + "mc2_stderr": 0.015341235748555455 + } + }, + "versions": { + "all": 0, + "harness|arc:challenge|25": 0, + "harness|gsm8k|5": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + 
"harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "harness|winogrande|5": 0 + }, + "config_tasks": { + "harness|arc:challenge": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM 
Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|arc:challenge|25": { + "hashes": { + "hash_examples": "17b0cae357c0259e", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "9bcd0d1d37471713", + "hash_cont_tokens": "289aa98c400841d8" + }, + "truncated": 0, + "non_truncated": 1172, + "padded": 4670, + "non_padded": 17, + "effective_few_shots": 25.0, + "num_truncated_few_shots": 0 + }, + "harness|hellaswag|10": { + "hashes": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "80b8c6d79740318e", + "hash_cont_tokens": "ac460260c3e6efc9" + }, + "truncated": 0, + "non_truncated": 10042, + "padded": 40101, + "non_padded": 67, + "effective_few_shots": 10.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hashes": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "b813d36287c6556c", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-anatomy|5": { + "hashes": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "09dc2380497f7a47", + "hash_cont_tokens": "a52a4f60d98cbe5c" + }, + "truncated": 0, + "non_truncated": 135, + "padded": 540, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-astronomy|5": { + "hashes": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "68ca3220b0fdd1f3", + "hash_cont_tokens": "10f7d8eeba97841d" + }, + "truncated": 0, + "non_truncated": 152, + "padded": 608, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-business_ethics|5": { + "hashes": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "bd14ef1320de241e", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hashes": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "d96186ab98017c43", + "hash_cont_tokens": "edef9975ba9165b5" + }, + "truncated": 0, + "non_truncated": 265, + "padded": 1060, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_biology|5": { + "hashes": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": 
"2b460b75f1fdfefd", + "hash_input_tokens": "424136b34e95b200", + "hash_cont_tokens": "0aa103ec6602280b" + }, + "truncated": 0, + "non_truncated": 144, + "padded": 576, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_chemistry|5": { + "hashes": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "8dd8b80e336bbe54", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_computer_science|5": { + "hashes": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "145d4cef8ca2261d", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_mathematics|5": { + "hashes": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "561995d32d2b25c4", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_medicine|5": { + "hashes": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "6a258a9d4418599c", + "hash_cont_tokens": "1979021dbc698754" + }, + "truncated": 0, + "non_truncated": 173, + "padded": 692, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_physics|5": { + "hashes": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "fa5e0d5b5f97b66a", + "hash_cont_tokens": "7cf7fe2bab00acbd" + }, + "truncated": 0, + "non_truncated": 102, + "padded": 408, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-computer_security|5": { + "hashes": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "07d27397edfae492", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hashes": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "da5e6c3c8eb17da6", + "hash_cont_tokens": "903f64eed2b0d217" + }, + "truncated": 0, + "non_truncated": 235, + "padded": 940, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-econometrics|5": { + "hashes": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "f6ba8e358bdb523e", + "hash_cont_tokens": "721ae6c5302c4bf2" + }, + "truncated": 0, + "non_truncated": 114, + "padded": 456, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hashes": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "b2459da4c5ca8590", + "hash_cont_tokens": "15a738960ed3e587" + }, + "truncated": 0, + "non_truncated": 
145, + "padded": 575, + "non_padded": 5, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hashes": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "0b969d9ad706a13a", + "hash_cont_tokens": "c96470462fc71683" + }, + "truncated": 0, + "non_truncated": 378, + "padded": 1512, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-formal_logic|5": { + "hashes": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "02bc3eb5f90da86e", + "hash_cont_tokens": "0e1ce025c9d6ee7e" + }, + "truncated": 0, + "non_truncated": 126, + "padded": 504, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-global_facts|5": { + "hashes": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "3d5106918bcbeb43", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_biology|5": { + "hashes": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "7b089392db2dabbd", + "hash_cont_tokens": "e34d57f7d3c4ca16" + }, + "truncated": 0, + "non_truncated": 310, + "padded": 1240, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hashes": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "ba90b2ffed1c067d", + "hash_cont_tokens": "e8482d44df4b3740" + }, + "truncated": 0, + "non_truncated": 203, + "padded": 812, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hashes": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "60eeec309ef0717f", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hashes": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "5e5e8bf3808e0ead", + "hash_cont_tokens": "d63e679a49418339" + }, + "truncated": 0, + "non_truncated": 165, + "padded": 656, + "non_padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_geography|5": { + "hashes": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "4da9b741d4e7ea78", + "hash_cont_tokens": "d78483e286d06f1a" + }, + "truncated": 0, + "non_truncated": 198, + "padded": 792, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hashes": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "acb4bc872ac86ed7", + "hash_cont_tokens": "691cdff71ff5fe57" + }, + "truncated": 0, + "non_truncated": 193, + "padded": 772, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + 
}, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hashes": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "840fc6403eb69ab0", + "hash_cont_tokens": "d5ad4c5bdca967ad" + }, + "truncated": 0, + "non_truncated": 390, + "padded": 1560, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hashes": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "3629a7f2cd17faeb", + "hash_cont_tokens": "8f631ca5687dd0d4" + }, + "truncated": 0, + "non_truncated": 270, + "padded": 1080, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hashes": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "6846f684260e3997", + "hash_cont_tokens": "7321048a28451473" + }, + "truncated": 0, + "non_truncated": 238, + "padded": 952, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_physics|5": { + "hashes": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "85aee25d6bdad94a", + "hash_cont_tokens": "bb137581f269861c" + }, + "truncated": 0, + "non_truncated": 151, + "padded": 604, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hashes": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "290b66d6d666a35f", + "hash_cont_tokens": "b455cab2675bd863" + }, + "truncated": 0, + "non_truncated": 545, + "padded": 2180, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hashes": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "a77a7668b437bc82", + "hash_cont_tokens": "1b3196fec7e58037" + }, + "truncated": 0, + "non_truncated": 216, + "padded": 864, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hashes": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "63548c7fa9ba7a78", + "hash_cont_tokens": "a331dedc2aa01b3e" + }, + "truncated": 0, + "non_truncated": 204, + "padded": 816, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hashes": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "83c5da18bfa50812", + "hash_cont_tokens": "d0fbe030b8c8c2bf" + }, + "truncated": 0, + "non_truncated": 237, + "padded": 948, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_aging|5": { + "hashes": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "bebbd11f22006685", + "hash_cont_tokens": "1dd29c3755494850" + }, + "truncated": 0, + "non_truncated": 223, + "padded": 892, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_sexuality|5": { + "hashes": { + "hash_examples": "7acb8fdad97f88a6", + 
"hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "7b85ee9b8ee54f4f", + "hash_cont_tokens": "c85573f663c10691" + }, + "truncated": 0, + "non_truncated": 131, + "padded": 524, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-international_law|5": { + "hashes": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "7bfc55ab7065943e", + "hash_cont_tokens": "d263804ba918154f" + }, + "truncated": 0, + "non_truncated": 121, + "padded": 484, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-jurisprudence|5": { + "hashes": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "69573f1675e053c6", + "hash_cont_tokens": "581986691a84ece8" + }, + "truncated": 0, + "non_truncated": 108, + "padded": 432, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hashes": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "552324ef20094bdc", + "hash_cont_tokens": "55a858b28bbda458" + }, + "truncated": 0, + "non_truncated": 163, + "padded": 652, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-machine_learning|5": { + "hashes": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "96449357a7318905", + "hash_cont_tokens": "e99d3d3efd4ac7a3" + }, + "truncated": 0, + "non_truncated": 112, + "padded": 448, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-management|5": { + "hashes": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "3b849249168e3b88", + "hash_cont_tokens": "13d9dc56bca34726" + }, + "truncated": 0, + "non_truncated": 103, + "padded": 412, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-marketing|5": { + "hashes": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "af0e186f2756b70d", + "hash_cont_tokens": "2700ea26933916a2" + }, + "truncated": 0, + "non_truncated": 234, + "padded": 936, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-medical_genetics|5": { + "hashes": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "9f6a6de16509b6d9", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-miscellaneous|5": { + "hashes": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "9194406d589f7c10", + "hash_cont_tokens": "7bf4341c79587250" + }, + "truncated": 0, + "non_truncated": 783, + "padded": 3132, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_disputes|5": { + "hashes": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "769486efc74d9f8e", + "hash_cont_tokens": "38a48e9de6976f00" + }, + "truncated": 0, + "non_truncated": 346, + 
"padded": 1384, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hashes": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "a90fd4dd90959dad", + "hash_cont_tokens": "761c4dc187689d89" + }, + "truncated": 0, + "non_truncated": 895, + "padded": 3580, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-nutrition|5": { + "hashes": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "1a3b843e66efd29b", + "hash_cont_tokens": "65005bd7d6f6012a" + }, + "truncated": 0, + "non_truncated": 306, + "padded": 1224, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-philosophy|5": { + "hashes": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "09820001a3d00013", + "hash_cont_tokens": "0b47934fb6314dec" + }, + "truncated": 0, + "non_truncated": 311, + "padded": 1244, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-prehistory|5": { + "hashes": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "7c4ec364ce2768c7", + "hash_cont_tokens": "3f20acd855ee0a29" + }, + "truncated": 0, + "non_truncated": 324, + "padded": 1296, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_accounting|5": { + "hashes": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "ced0534574d0ae3f", + "hash_cont_tokens": "8f122ba881355d4b" + }, + "truncated": 0, + "non_truncated": 282, + "padded": 1128, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_law|5": { + "hashes": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "bcbdbbde22ec73e3", + "hash_cont_tokens": "90d5df417c4d3fd3" + }, + "truncated": 0, + "non_truncated": 1534, + "padded": 6136, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_medicine|5": { + "hashes": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "c54d753563114d45", + "hash_cont_tokens": "4a2d2988884f7f70" + }, + "truncated": 0, + "non_truncated": 272, + "padded": 1088, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_psychology|5": { + "hashes": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "b75dc55c0e32fa52", + "hash_cont_tokens": "e0a952cb8a9c81de" + }, + "truncated": 0, + "non_truncated": 612, + "padded": 2448, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-public_relations|5": { + "hashes": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "5ccdc8ec8db99622", + "hash_cont_tokens": "1fa77a8dff3922b8" + }, + "truncated": 0, + "non_truncated": 110, + "padded": 440, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-security_studies|5": { + 
"hashes": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "ca8497342e5b1d57", + "hash_cont_tokens": "81fc9cb3cbdd52db" + }, + "truncated": 0, + "non_truncated": 245, + "padded": 980, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-sociology|5": { + "hashes": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "069c76424fbd3dab", + "hash_cont_tokens": "2a0493252ed2cf43" + }, + "truncated": 0, + "non_truncated": 201, + "padded": 804, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hashes": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "a7e393a626169576", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-virology|5": { + "hashes": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "bf99dc973e3a650d", + "hash_cont_tokens": "5ab892d003b00c98" + }, + "truncated": 0, + "non_truncated": 166, + "padded": 664, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-world_religions|5": { + "hashes": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "1761cfaf21797065", + "hash_cont_tokens": "15a5e5dbdfbb8568" + }, + "truncated": 0, + "non_truncated": 171, + "padded": 684, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|truthfulqa:mc|0": { + "hashes": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "298b43914bbdf4ca", + "hash_cont_tokens": "5a8d4bb398b1c3c0" + }, + "truncated": 0, + "non_truncated": 817, + "padded": 9996, + "non_padded": 0, + "effective_few_shots": 0.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "31aa3477d959f771", + "hash_cont_tokens": "618558fb93c0f288" + }, + "truncated": 0, + "non_truncated": 1267, + "padded": 2534, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "6af0ae8cfe684f50", + "hash_cont_tokens": "79b7c2c107372a4c" + }, + "truncated": 0, + "non_truncated": 1319, + "padded": 0, + "non_padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "3b7fa57a057f9415", + "hash_full_prompts": "63615fc50fc9417c", + "hash_input_tokens": "9c04e828ae29cacc", + "hash_cont_tokens": "56ad15f0326db087" + }, + "truncated": 0, + "non_truncated": 28659, + "padded": 113460, + "non_padded": 1412, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/PulsarAI/MetaMath-Tulpar-7b-v2-Slerp/results_2023-12-09T17-55-14.434225.json b/eval-results/PulsarAI/MetaMath-Tulpar-7b-v2-Slerp/results_2023-12-09T17-55-14.434225.json new file mode 100644 index 
0000000000000000000000000000000000000000..0a2e2016139c5dda88be47cdc015de2dc67ac620 --- /dev/null +++ b/eval-results/PulsarAI/MetaMath-Tulpar-7b-v2-Slerp/results_2023-12-09T17-55-14.434225.json @@ -0,0 +1,1409 @@ +{ + "config_general": { + "lighteval_sha": "0e4607eff593f6f842aeaa0e5fa6760f58b9d1e9", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "", + "start_time": 372291.106887652, + "end_time": 379423.588036934, + "total_evaluation_time_secondes": "7132.481149281957", + "model_name": "PulsarAI/MetaMath-Tulpar-7b-v2-Slerp", + "model_sha": "41612eecf338ae2b1cbb63a3729ce7b125c6ca3c", + "model_dtype": "torch.float16", + "model_size": "13.99 GB" + }, + "results": { + "harness|arc:challenge|25": { + "acc": 0.6313993174061433, + "acc_stderr": 0.014097810678042194, + "acc_norm": 0.6561433447098977, + "acc_norm_stderr": 0.013880644570156213 + }, + "harness|hellaswag|10": { + "acc": 0.6677952599083847, + "acc_stderr": 0.004700413824942566, + "acc_norm": 0.8516231826329417, + "acc_norm_stderr": 0.0035474663103253973 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.24, + "acc_stderr": 0.042923469599092816, + "acc_norm": 0.24, + "acc_norm_stderr": 0.042923469599092816 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.6296296296296297, + "acc_stderr": 0.041716541613545426, + "acc_norm": 0.6296296296296297, + "acc_norm_stderr": 0.041716541613545426 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.7171052631578947, + "acc_stderr": 0.03665349695640767, + "acc_norm": 0.7171052631578947, + "acc_norm_stderr": 0.03665349695640767 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.62, + "acc_stderr": 0.048783173121456316, + "acc_norm": 0.62, + "acc_norm_stderr": 0.048783173121456316 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.7018867924528301, + "acc_stderr": 0.028152837942493864, + "acc_norm": 0.7018867924528301, + "acc_norm_stderr": 0.028152837942493864 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.7222222222222222, + "acc_stderr": 0.037455547914624555, + "acc_norm": 0.7222222222222222, + "acc_norm_stderr": 0.037455547914624555 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.43, + "acc_stderr": 0.04975698519562428, + "acc_norm": 0.43, + "acc_norm_stderr": 0.04975698519562428 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.5, + "acc_stderr": 0.050251890762960605, + "acc_norm": 0.5, + "acc_norm_stderr": 0.050251890762960605 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.31, + "acc_stderr": 0.04648231987117316, + "acc_norm": 0.31, + "acc_norm_stderr": 0.04648231987117316 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.653179190751445, + "acc_stderr": 0.036291466701596636, + "acc_norm": 0.653179190751445, + "acc_norm_stderr": 0.036291466701596636 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.4019607843137255, + "acc_stderr": 0.048786087144669955, + "acc_norm": 0.4019607843137255, + "acc_norm_stderr": 0.048786087144669955 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.79, + "acc_stderr": 0.040936018074033256, + "acc_norm": 0.79, + "acc_norm_stderr": 0.040936018074033256 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.5872340425531914, + "acc_stderr": 0.03218471141400351, + "acc_norm": 0.5872340425531914, + "acc_norm_stderr": 0.03218471141400351 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.4649122807017544, + "acc_stderr": 
0.046920083813689104, + "acc_norm": 0.4649122807017544, + "acc_norm_stderr": 0.046920083813689104 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.5310344827586206, + "acc_stderr": 0.04158632762097828, + "acc_norm": 0.5310344827586206, + "acc_norm_stderr": 0.04158632762097828 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.41798941798941797, + "acc_stderr": 0.02540255550326091, + "acc_norm": 0.41798941798941797, + "acc_norm_stderr": 0.02540255550326091 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.4365079365079365, + "acc_stderr": 0.04435932892851466, + "acc_norm": 0.4365079365079365, + "acc_norm_stderr": 0.04435932892851466 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.34, + "acc_stderr": 0.04760952285695235, + "acc_norm": 0.34, + "acc_norm_stderr": 0.04760952285695235 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.7838709677419354, + "acc_stderr": 0.02341529343356852, + "acc_norm": 0.7838709677419354, + "acc_norm_stderr": 0.02341529343356852 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.4729064039408867, + "acc_stderr": 0.03512819077876106, + "acc_norm": 0.4729064039408867, + "acc_norm_stderr": 0.03512819077876106 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.67, + "acc_stderr": 0.04725815626252607, + "acc_norm": 0.67, + "acc_norm_stderr": 0.04725815626252607 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.7818181818181819, + "acc_stderr": 0.03225078108306289, + "acc_norm": 0.7818181818181819, + "acc_norm_stderr": 0.03225078108306289 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.7828282828282829, + "acc_stderr": 0.02937661648494563, + "acc_norm": 0.7828282828282829, + "acc_norm_stderr": 0.02937661648494563 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.8808290155440415, + "acc_stderr": 0.02338193534812143, + "acc_norm": 0.8808290155440415, + "acc_norm_stderr": 0.02338193534812143 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.6384615384615384, + "acc_stderr": 0.024359581465396997, + "acc_norm": 0.6384615384615384, + "acc_norm_stderr": 0.024359581465396997 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.34074074074074073, + "acc_stderr": 0.02889774874113115, + "acc_norm": 0.34074074074074073, + "acc_norm_stderr": 0.02889774874113115 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.6764705882352942, + "acc_stderr": 0.030388353551886786, + "acc_norm": 0.6764705882352942, + "acc_norm_stderr": 0.030388353551886786 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.304635761589404, + "acc_stderr": 0.03757949922943343, + "acc_norm": 0.304635761589404, + "acc_norm_stderr": 0.03757949922943343 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.8422018348623853, + "acc_stderr": 0.01563002297009244, + "acc_norm": 0.8422018348623853, + "acc_norm_stderr": 0.01563002297009244 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.5185185185185185, + "acc_stderr": 0.03407632093854051, + "acc_norm": 0.5185185185185185, + "acc_norm_stderr": 0.03407632093854051 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.7892156862745098, + "acc_stderr": 0.028626547912437406, + "acc_norm": 0.7892156862745098, + "acc_norm_stderr": 0.028626547912437406 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.7890295358649789, + 
"acc_stderr": 0.02655837250266192, + "acc_norm": 0.7890295358649789, + "acc_norm_stderr": 0.02655837250266192 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.6905829596412556, + "acc_stderr": 0.03102441174057221, + "acc_norm": 0.6905829596412556, + "acc_norm_stderr": 0.03102441174057221 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.7480916030534351, + "acc_stderr": 0.03807387116306085, + "acc_norm": 0.7480916030534351, + "acc_norm_stderr": 0.03807387116306085 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.8016528925619835, + "acc_stderr": 0.036401182719909456, + "acc_norm": 0.8016528925619835, + "acc_norm_stderr": 0.036401182719909456 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.8055555555555556, + "acc_stderr": 0.03826076324884866, + "acc_norm": 0.8055555555555556, + "acc_norm_stderr": 0.03826076324884866 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.7423312883435583, + "acc_stderr": 0.03436150827846917, + "acc_norm": 0.7423312883435583, + "acc_norm_stderr": 0.03436150827846917 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.5089285714285714, + "acc_stderr": 0.04745033255489123, + "acc_norm": 0.5089285714285714, + "acc_norm_stderr": 0.04745033255489123 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.7669902912621359, + "acc_stderr": 0.04185832598928315, + "acc_norm": 0.7669902912621359, + "acc_norm_stderr": 0.04185832598928315 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.8717948717948718, + "acc_stderr": 0.02190190511507333, + "acc_norm": 0.8717948717948718, + "acc_norm_stderr": 0.02190190511507333 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.7, + "acc_stderr": 0.046056618647183814, + "acc_norm": 0.7, + "acc_norm_stderr": 0.046056618647183814 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.8275862068965517, + "acc_stderr": 0.013507943909371803, + "acc_norm": 0.8275862068965517, + "acc_norm_stderr": 0.013507943909371803 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.7225433526011561, + "acc_stderr": 0.024105712607754307, + "acc_norm": 0.7225433526011561, + "acc_norm_stderr": 0.024105712607754307 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.41787709497206704, + "acc_stderr": 0.016495400635820084, + "acc_norm": 0.41787709497206704, + "acc_norm_stderr": 0.016495400635820084 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.7189542483660131, + "acc_stderr": 0.02573885479781874, + "acc_norm": 0.7189542483660131, + "acc_norm_stderr": 0.02573885479781874 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.7138263665594855, + "acc_stderr": 0.02567025924218893, + "acc_norm": 0.7138263665594855, + "acc_norm_stderr": 0.02567025924218893 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.7283950617283951, + "acc_stderr": 0.02474862449053737, + "acc_norm": 0.7283950617283951, + "acc_norm_stderr": 0.02474862449053737 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.4716312056737589, + "acc_stderr": 0.029779450957303062, + "acc_norm": 0.4716312056737589, + "acc_norm_stderr": 0.029779450957303062 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.4602346805736636, + "acc_stderr": 0.012729785386598559, + "acc_norm": 0.4602346805736636, + "acc_norm_stderr": 0.012729785386598559 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.6507352941176471, + "acc_stderr": 0.02895975519682487, + "acc_norm": 0.6507352941176471, + "acc_norm_stderr": 0.02895975519682487 
+ }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.6617647058823529, + "acc_stderr": 0.019139943748487043, + "acc_norm": 0.6617647058823529, + "acc_norm_stderr": 0.019139943748487043 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.6545454545454545, + "acc_stderr": 0.04554619617541054, + "acc_norm": 0.6545454545454545, + "acc_norm_stderr": 0.04554619617541054 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.7306122448979592, + "acc_stderr": 0.02840125202902294, + "acc_norm": 0.7306122448979592, + "acc_norm_stderr": 0.02840125202902294 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.835820895522388, + "acc_stderr": 0.026193923544454125, + "acc_norm": 0.835820895522388, + "acc_norm_stderr": 0.026193923544454125 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.87, + "acc_stderr": 0.03379976689896309, + "acc_norm": 0.87, + "acc_norm_stderr": 0.03379976689896309 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.5180722891566265, + "acc_stderr": 0.03889951252827216, + "acc_norm": 0.5180722891566265, + "acc_norm_stderr": 0.03889951252827216 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.8070175438596491, + "acc_stderr": 0.030267457554898458, + "acc_norm": 0.8070175438596491, + "acc_norm_stderr": 0.030267457554898458 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.401468788249694, + "mc1_stderr": 0.017160273901693654, + "mc2": 0.564970662967412, + "mc2_stderr": 0.015518503176886996 + }, + "harness|winogrande|5": { + "acc": 0.7947908445146015, + "acc_stderr": 0.011350315707462063 + }, + "harness|gsm8k|5": { + "acc": 0.709628506444276, + "acc_stderr": 0.012503592481818948 + }, + "all": { + "acc": 0.639251601749628, + "acc_stderr": 0.03221647012444142, + "acc_norm": 0.6389576323016398, + "acc_norm_stderr": 0.03288102806405326, + "mc1": 0.401468788249694, + "mc1_stderr": 0.017160273901693654, + "mc2": 0.564970662967412, + "mc2_stderr": 0.015518503176886996 + } + }, + "versions": { + "all": 0, + "harness|arc:challenge|25": 0, + "harness|gsm8k|5": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + 
"harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "harness|winogrande|5": 0 + }, + "config_tasks": { + "harness|arc:challenge": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + 
"harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|arc:challenge|25": { + "hashes": { + "hash_examples": "17b0cae357c0259e", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "9bcd0d1d37471713", + "hash_cont_tokens": "289aa98c400841d8" + }, + "truncated": 0, + "non_truncated": 1172, + "padded": 4670, + "non_padded": 17, + "effective_few_shots": 25.0, + "num_truncated_few_shots": 0 + }, + "harness|hellaswag|10": { + "hashes": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "80b8c6d79740318e", + "hash_cont_tokens": "ac460260c3e6efc9" + }, + "truncated": 0, + "non_truncated": 10042, + "padded": 40101, + "non_padded": 67, + "effective_few_shots": 10.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hashes": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "b813d36287c6556c", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-anatomy|5": { + "hashes": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "09dc2380497f7a47", + "hash_cont_tokens": "a52a4f60d98cbe5c" + }, + "truncated": 0, + "non_truncated": 135, + "padded": 540, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-astronomy|5": { + "hashes": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "68ca3220b0fdd1f3", + "hash_cont_tokens": "10f7d8eeba97841d" + }, 
+ "truncated": 0, + "non_truncated": 152, + "padded": 608, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-business_ethics|5": { + "hashes": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "bd14ef1320de241e", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hashes": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "d96186ab98017c43", + "hash_cont_tokens": "edef9975ba9165b5" + }, + "truncated": 0, + "non_truncated": 265, + "padded": 1060, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_biology|5": { + "hashes": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "424136b34e95b200", + "hash_cont_tokens": "0aa103ec6602280b" + }, + "truncated": 0, + "non_truncated": 144, + "padded": 576, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_chemistry|5": { + "hashes": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "8dd8b80e336bbe54", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_computer_science|5": { + "hashes": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "145d4cef8ca2261d", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_mathematics|5": { + "hashes": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "561995d32d2b25c4", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_medicine|5": { + "hashes": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "6a258a9d4418599c", + "hash_cont_tokens": "1979021dbc698754" + }, + "truncated": 0, + "non_truncated": 173, + "padded": 692, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_physics|5": { + "hashes": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "fa5e0d5b5f97b66a", + "hash_cont_tokens": "7cf7fe2bab00acbd" + }, + "truncated": 0, + "non_truncated": 102, + "padded": 408, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-computer_security|5": { + "hashes": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "07d27397edfae492", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + 
"harness|hendrycksTest-conceptual_physics|5": { + "hashes": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "da5e6c3c8eb17da6", + "hash_cont_tokens": "903f64eed2b0d217" + }, + "truncated": 0, + "non_truncated": 235, + "padded": 940, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-econometrics|5": { + "hashes": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "f6ba8e358bdb523e", + "hash_cont_tokens": "721ae6c5302c4bf2" + }, + "truncated": 0, + "non_truncated": 114, + "padded": 456, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hashes": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "b2459da4c5ca8590", + "hash_cont_tokens": "15a738960ed3e587" + }, + "truncated": 0, + "non_truncated": 145, + "padded": 575, + "non_padded": 5, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hashes": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "0b969d9ad706a13a", + "hash_cont_tokens": "c96470462fc71683" + }, + "truncated": 0, + "non_truncated": 378, + "padded": 1512, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-formal_logic|5": { + "hashes": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "02bc3eb5f90da86e", + "hash_cont_tokens": "0e1ce025c9d6ee7e" + }, + "truncated": 0, + "non_truncated": 126, + "padded": 504, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-global_facts|5": { + "hashes": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "3d5106918bcbeb43", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_biology|5": { + "hashes": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "7b089392db2dabbd", + "hash_cont_tokens": "e34d57f7d3c4ca16" + }, + "truncated": 0, + "non_truncated": 310, + "padded": 1240, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hashes": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "ba90b2ffed1c067d", + "hash_cont_tokens": "e8482d44df4b3740" + }, + "truncated": 0, + "non_truncated": 203, + "padded": 812, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hashes": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "60eeec309ef0717f", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hashes": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": 
"318f4513c537c6bf", + "hash_input_tokens": "5e5e8bf3808e0ead", + "hash_cont_tokens": "d63e679a49418339" + }, + "truncated": 0, + "non_truncated": 165, + "padded": 656, + "non_padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_geography|5": { + "hashes": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "4da9b741d4e7ea78", + "hash_cont_tokens": "d78483e286d06f1a" + }, + "truncated": 0, + "non_truncated": 198, + "padded": 792, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hashes": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "acb4bc872ac86ed7", + "hash_cont_tokens": "691cdff71ff5fe57" + }, + "truncated": 0, + "non_truncated": 193, + "padded": 772, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hashes": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "840fc6403eb69ab0", + "hash_cont_tokens": "d5ad4c5bdca967ad" + }, + "truncated": 0, + "non_truncated": 390, + "padded": 1560, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hashes": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "3629a7f2cd17faeb", + "hash_cont_tokens": "8f631ca5687dd0d4" + }, + "truncated": 0, + "non_truncated": 270, + "padded": 1080, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hashes": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "6846f684260e3997", + "hash_cont_tokens": "7321048a28451473" + }, + "truncated": 0, + "non_truncated": 238, + "padded": 952, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_physics|5": { + "hashes": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "85aee25d6bdad94a", + "hash_cont_tokens": "bb137581f269861c" + }, + "truncated": 0, + "non_truncated": 151, + "padded": 604, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hashes": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "290b66d6d666a35f", + "hash_cont_tokens": "b455cab2675bd863" + }, + "truncated": 0, + "non_truncated": 545, + "padded": 2180, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hashes": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "a77a7668b437bc82", + "hash_cont_tokens": "1b3196fec7e58037" + }, + "truncated": 0, + "non_truncated": 216, + "padded": 864, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hashes": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "63548c7fa9ba7a78", + "hash_cont_tokens": 
"a331dedc2aa01b3e" + }, + "truncated": 0, + "non_truncated": 204, + "padded": 816, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hashes": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "83c5da18bfa50812", + "hash_cont_tokens": "d0fbe030b8c8c2bf" + }, + "truncated": 0, + "non_truncated": 237, + "padded": 948, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_aging|5": { + "hashes": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "bebbd11f22006685", + "hash_cont_tokens": "1dd29c3755494850" + }, + "truncated": 0, + "non_truncated": 223, + "padded": 892, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_sexuality|5": { + "hashes": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "7b85ee9b8ee54f4f", + "hash_cont_tokens": "c85573f663c10691" + }, + "truncated": 0, + "non_truncated": 131, + "padded": 524, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-international_law|5": { + "hashes": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "7bfc55ab7065943e", + "hash_cont_tokens": "d263804ba918154f" + }, + "truncated": 0, + "non_truncated": 121, + "padded": 484, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-jurisprudence|5": { + "hashes": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "69573f1675e053c6", + "hash_cont_tokens": "581986691a84ece8" + }, + "truncated": 0, + "non_truncated": 108, + "padded": 432, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hashes": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "552324ef20094bdc", + "hash_cont_tokens": "55a858b28bbda458" + }, + "truncated": 0, + "non_truncated": 163, + "padded": 652, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-machine_learning|5": { + "hashes": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "96449357a7318905", + "hash_cont_tokens": "e99d3d3efd4ac7a3" + }, + "truncated": 0, + "non_truncated": 112, + "padded": 448, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-management|5": { + "hashes": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "3b849249168e3b88", + "hash_cont_tokens": "13d9dc56bca34726" + }, + "truncated": 0, + "non_truncated": 103, + "padded": 412, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-marketing|5": { + "hashes": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "af0e186f2756b70d", + "hash_cont_tokens": "2700ea26933916a2" + }, + "truncated": 0, + "non_truncated": 234, + "padded": 936, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + 
"harness|hendrycksTest-medical_genetics|5": { + "hashes": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "9f6a6de16509b6d9", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-miscellaneous|5": { + "hashes": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "9194406d589f7c10", + "hash_cont_tokens": "7bf4341c79587250" + }, + "truncated": 0, + "non_truncated": 783, + "padded": 3132, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_disputes|5": { + "hashes": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "769486efc74d9f8e", + "hash_cont_tokens": "38a48e9de6976f00" + }, + "truncated": 0, + "non_truncated": 346, + "padded": 1384, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hashes": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "a90fd4dd90959dad", + "hash_cont_tokens": "761c4dc187689d89" + }, + "truncated": 0, + "non_truncated": 895, + "padded": 3580, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-nutrition|5": { + "hashes": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "1a3b843e66efd29b", + "hash_cont_tokens": "65005bd7d6f6012a" + }, + "truncated": 0, + "non_truncated": 306, + "padded": 1224, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-philosophy|5": { + "hashes": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "09820001a3d00013", + "hash_cont_tokens": "0b47934fb6314dec" + }, + "truncated": 0, + "non_truncated": 311, + "padded": 1244, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-prehistory|5": { + "hashes": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "7c4ec364ce2768c7", + "hash_cont_tokens": "3f20acd855ee0a29" + }, + "truncated": 0, + "non_truncated": 324, + "padded": 1296, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_accounting|5": { + "hashes": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "ced0534574d0ae3f", + "hash_cont_tokens": "8f122ba881355d4b" + }, + "truncated": 0, + "non_truncated": 282, + "padded": 1128, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_law|5": { + "hashes": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "bcbdbbde22ec73e3", + "hash_cont_tokens": "90d5df417c4d3fd3" + }, + "truncated": 0, + "non_truncated": 1534, + "padded": 6136, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_medicine|5": { + "hashes": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": 
"c54d753563114d45", + "hash_cont_tokens": "4a2d2988884f7f70" + }, + "truncated": 0, + "non_truncated": 272, + "padded": 1088, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_psychology|5": { + "hashes": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "b75dc55c0e32fa52", + "hash_cont_tokens": "e0a952cb8a9c81de" + }, + "truncated": 0, + "non_truncated": 612, + "padded": 2448, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-public_relations|5": { + "hashes": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "5ccdc8ec8db99622", + "hash_cont_tokens": "1fa77a8dff3922b8" + }, + "truncated": 0, + "non_truncated": 110, + "padded": 440, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-security_studies|5": { + "hashes": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "ca8497342e5b1d57", + "hash_cont_tokens": "81fc9cb3cbdd52db" + }, + "truncated": 0, + "non_truncated": 245, + "padded": 980, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-sociology|5": { + "hashes": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "069c76424fbd3dab", + "hash_cont_tokens": "2a0493252ed2cf43" + }, + "truncated": 0, + "non_truncated": 201, + "padded": 804, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hashes": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "a7e393a626169576", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-virology|5": { + "hashes": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "bf99dc973e3a650d", + "hash_cont_tokens": "5ab892d003b00c98" + }, + "truncated": 0, + "non_truncated": 166, + "padded": 664, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-world_religions|5": { + "hashes": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "1761cfaf21797065", + "hash_cont_tokens": "15a5e5dbdfbb8568" + }, + "truncated": 0, + "non_truncated": 171, + "padded": 684, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|truthfulqa:mc|0": { + "hashes": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "298b43914bbdf4ca", + "hash_cont_tokens": "5a8d4bb398b1c3c0" + }, + "truncated": 0, + "non_truncated": 817, + "padded": 9996, + "non_padded": 0, + "effective_few_shots": 0.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "31aa3477d959f771", + "hash_cont_tokens": "618558fb93c0f288" + }, + "truncated": 0, + "non_truncated": 1267, + "padded": 2534, + "non_padded": 0, + "effective_few_shots": 5.0, + 
"num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "6af0ae8cfe684f50", + "hash_cont_tokens": "33482ce5dbc4cf89" + }, + "truncated": 0, + "non_truncated": 1319, + "padded": 0, + "non_padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "3b7fa57a057f9415", + "hash_full_prompts": "63615fc50fc9417c", + "hash_input_tokens": "9c04e828ae29cacc", + "hash_cont_tokens": "07b014df5c321fd1" + }, + "truncated": 0, + "non_truncated": 28659, + "padded": 113460, + "non_padded": 1412, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/PulsarAI/Nebula-7B/results_2023-10-09T12-29-36.965037.json b/eval-results/PulsarAI/Nebula-7B/results_2023-10-09T12-29-36.965037.json new file mode 100644 index 0000000000000000000000000000000000000000..a9384a31949613d72a9b57c84dc840c1f9aa7286 --- /dev/null +++ b/eval-results/PulsarAI/Nebula-7B/results_2023-10-09T12-29-36.965037.json @@ -0,0 +1,1367 @@ +{ + "config_general": { + "model_name": "PulsarAI/Nebula-7B", + "model_sha": "569f848698a468fb03d37033c67f3734bbaec127", + "model_size": "13.99 GB", + "model_dtype": "torch.float16", + "lighteval_sha": "0f318ecf002208468154899217b3ba7c6ae09374", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|arc:challenge|25": { + "acc": 0.5418088737201365, + "acc_stderr": 0.0145602203087147, + "acc_norm": 0.5930034129692833, + "acc_norm_stderr": 0.014356399418009121 + }, + "harness|hellaswag|10": { + "acc": 0.6342362079267079, + "acc_stderr": 0.004806593424942265, + "acc_norm": 0.8345947022505477, + "acc_norm_stderr": 0.0037078660457296048 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.34, + "acc_stderr": 0.04760952285695235, + "acc_norm": 0.34, + "acc_norm_stderr": 0.04760952285695235 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.5555555555555556, + "acc_stderr": 0.04292596718256981, + "acc_norm": 0.5555555555555556, + "acc_norm_stderr": 0.04292596718256981 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.5986842105263158, + "acc_stderr": 0.039889037033362836, + "acc_norm": 0.5986842105263158, + "acc_norm_stderr": 0.039889037033362836 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.43, + "acc_stderr": 0.04975698519562428, + "acc_norm": 0.43, + "acc_norm_stderr": 0.04975698519562428 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.5773584905660377, + "acc_stderr": 0.03040233144576954, + "acc_norm": 0.5773584905660377, + "acc_norm_stderr": 0.03040233144576954 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.6944444444444444, + "acc_stderr": 0.03852084696008534, + "acc_norm": 0.6944444444444444, + "acc_norm_stderr": 0.03852084696008534 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.43, + "acc_stderr": 0.049756985195624284, + "acc_norm": 0.43, + "acc_norm_stderr": 0.049756985195624284 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.44, + "acc_stderr": 0.04988876515698589, + "acc_norm": 0.44, + "acc_norm_stderr": 0.04988876515698589 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.29, + "acc_stderr": 0.045604802157206845, + "acc_norm": 0.29, + "acc_norm_stderr": 0.045604802157206845 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 
0.5202312138728323, + "acc_stderr": 0.03809342081273957, + "acc_norm": 0.5202312138728323, + "acc_norm_stderr": 0.03809342081273957 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.3137254901960784, + "acc_stderr": 0.04617034827006715, + "acc_norm": 0.3137254901960784, + "acc_norm_stderr": 0.04617034827006715 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.68, + "acc_stderr": 0.046882617226215034, + "acc_norm": 0.68, + "acc_norm_stderr": 0.046882617226215034 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.5063829787234042, + "acc_stderr": 0.03268335899936336, + "acc_norm": 0.5063829787234042, + "acc_norm_stderr": 0.03268335899936336 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.4649122807017544, + "acc_stderr": 0.04692008381368909, + "acc_norm": 0.4649122807017544, + "acc_norm_stderr": 0.04692008381368909 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.503448275862069, + "acc_stderr": 0.041665675771015785, + "acc_norm": 0.503448275862069, + "acc_norm_stderr": 0.041665675771015785 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.38095238095238093, + "acc_stderr": 0.02501074911613759, + "acc_norm": 0.38095238095238093, + "acc_norm_stderr": 0.02501074911613759 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.40476190476190477, + "acc_stderr": 0.043902592653775635, + "acc_norm": 0.40476190476190477, + "acc_norm_stderr": 0.043902592653775635 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.34, + "acc_stderr": 0.04760952285695235, + "acc_norm": 0.34, + "acc_norm_stderr": 0.04760952285695235 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.6516129032258065, + "acc_stderr": 0.027104826328100944, + "acc_norm": 0.6516129032258065, + "acc_norm_stderr": 0.027104826328100944 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.47783251231527096, + "acc_stderr": 0.035145285621750094, + "acc_norm": 0.47783251231527096, + "acc_norm_stderr": 0.035145285621750094 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.56, + "acc_stderr": 0.049888765156985884, + "acc_norm": 0.56, + "acc_norm_stderr": 0.049888765156985884 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.696969696969697, + "acc_stderr": 0.03588624800091707, + "acc_norm": 0.696969696969697, + "acc_norm_stderr": 0.03588624800091707 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.7525252525252525, + "acc_stderr": 0.030746300742124495, + "acc_norm": 0.7525252525252525, + "acc_norm_stderr": 0.030746300742124495 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.8082901554404145, + "acc_stderr": 0.028408953626245282, + "acc_norm": 0.8082901554404145, + "acc_norm_stderr": 0.028408953626245282 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.5333333333333333, + "acc_stderr": 0.02529460802398647, + "acc_norm": 0.5333333333333333, + "acc_norm_stderr": 0.02529460802398647 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.2814814814814815, + "acc_stderr": 0.02742001935094527, + "acc_norm": 0.2814814814814815, + "acc_norm_stderr": 0.02742001935094527 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.5588235294117647, + "acc_stderr": 0.032252942323996406, + "acc_norm": 0.5588235294117647, + "acc_norm_stderr": 0.032252942323996406 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.3443708609271523, + "acc_stderr": 
0.03879687024073327, + "acc_norm": 0.3443708609271523, + "acc_norm_stderr": 0.03879687024073327 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.7743119266055046, + "acc_stderr": 0.017923087667803064, + "acc_norm": 0.7743119266055046, + "acc_norm_stderr": 0.017923087667803064 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.39351851851851855, + "acc_stderr": 0.03331747876370312, + "acc_norm": 0.39351851851851855, + "acc_norm_stderr": 0.03331747876370312 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.7156862745098039, + "acc_stderr": 0.031660096793998116, + "acc_norm": 0.7156862745098039, + "acc_norm_stderr": 0.031660096793998116 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.7510548523206751, + "acc_stderr": 0.028146970599422644, + "acc_norm": 0.7510548523206751, + "acc_norm_stderr": 0.028146970599422644 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.6681614349775785, + "acc_stderr": 0.03160295143776679, + "acc_norm": 0.6681614349775785, + "acc_norm_stderr": 0.03160295143776679 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.6793893129770993, + "acc_stderr": 0.04093329229834278, + "acc_norm": 0.6793893129770993, + "acc_norm_stderr": 0.04093329229834278 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.7355371900826446, + "acc_stderr": 0.040261875275912046, + "acc_norm": 0.7355371900826446, + "acc_norm_stderr": 0.040261875275912046 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.6851851851851852, + "acc_stderr": 0.04489931073591311, + "acc_norm": 0.6851851851851852, + "acc_norm_stderr": 0.04489931073591311 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.6932515337423313, + "acc_stderr": 0.03623089915724146, + "acc_norm": 0.6932515337423313, + "acc_norm_stderr": 0.03623089915724146 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.38392857142857145, + "acc_stderr": 0.04616143075028547, + "acc_norm": 0.38392857142857145, + "acc_norm_stderr": 0.04616143075028547 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.6990291262135923, + "acc_stderr": 0.04541609446503947, + "acc_norm": 0.6990291262135923, + "acc_norm_stderr": 0.04541609446503947 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.8205128205128205, + "acc_stderr": 0.025140935950335442, + "acc_norm": 0.8205128205128205, + "acc_norm_stderr": 0.025140935950335442 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.66, + "acc_stderr": 0.04760952285695237, + "acc_norm": 0.66, + "acc_norm_stderr": 0.04760952285695237 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.8007662835249042, + "acc_stderr": 0.014283378044296422, + "acc_norm": 0.8007662835249042, + "acc_norm_stderr": 0.014283378044296422 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.6242774566473989, + "acc_stderr": 0.02607431485165708, + "acc_norm": 0.6242774566473989, + "acc_norm_stderr": 0.02607431485165708 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.2659217877094972, + "acc_stderr": 0.014776765066438885, + "acc_norm": 0.2659217877094972, + "acc_norm_stderr": 0.014776765066438885 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.6274509803921569, + "acc_stderr": 0.027684181883302888, + "acc_norm": 0.6274509803921569, + "acc_norm_stderr": 0.027684181883302888 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.662379421221865, + "acc_stderr": 0.026858825879488544, + "acc_norm": 0.662379421221865, + "acc_norm_stderr": 
0.026858825879488544 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.7067901234567902, + "acc_stderr": 0.025329888171900926, + "acc_norm": 0.7067901234567902, + "acc_norm_stderr": 0.025329888171900926 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.44680851063829785, + "acc_stderr": 0.02965823509766691, + "acc_norm": 0.44680851063829785, + "acc_norm_stderr": 0.02965823509766691 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.4406779661016949, + "acc_stderr": 0.012680037994097065, + "acc_norm": 0.4406779661016949, + "acc_norm_stderr": 0.012680037994097065 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.5441176470588235, + "acc_stderr": 0.03025437257397671, + "acc_norm": 0.5441176470588235, + "acc_norm_stderr": 0.03025437257397671 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.5980392156862745, + "acc_stderr": 0.019835176484375383, + "acc_norm": 0.5980392156862745, + "acc_norm_stderr": 0.019835176484375383 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.6181818181818182, + "acc_stderr": 0.046534298079135075, + "acc_norm": 0.6181818181818182, + "acc_norm_stderr": 0.046534298079135075 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.5142857142857142, + "acc_stderr": 0.03199615232806286, + "acc_norm": 0.5142857142857142, + "acc_norm_stderr": 0.03199615232806286 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.7661691542288557, + "acc_stderr": 0.029929415408348377, + "acc_norm": 0.7661691542288557, + "acc_norm_stderr": 0.029929415408348377 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.83, + "acc_stderr": 0.0377525168068637, + "acc_norm": 0.83, + "acc_norm_stderr": 0.0377525168068637 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.4819277108433735, + "acc_stderr": 0.03889951252827216, + "acc_norm": 0.4819277108433735, + "acc_norm_stderr": 0.03889951252827216 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.7660818713450293, + "acc_stderr": 0.03246721765117826, + "acc_norm": 0.7660818713450293, + "acc_norm_stderr": 0.03246721765117826 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.31334149326805383, + "mc1_stderr": 0.0162380650690596, + "mc2": 0.45561649492894496, + "mc2_stderr": 0.014644899277894422 + }, + "all": { + "acc": 0.570596346471807, + "acc_stderr": 0.034371584431446715, + "acc_norm": 0.5748599572103322, + "acc_norm_stderr": 0.03434950734212607, + "mc1": 0.31334149326805383, + "mc1_stderr": 0.0162380650690596, + "mc2": 0.45561649492894496, + "mc2_stderr": 0.014644899277894422 + } + }, + "versions": { + "harness|arc:challenge|25": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + 
"harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "all": 0 + }, + "config_tasks": { + "harness|arc:challenge": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + 
"harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task" + }, + "summary_tasks": { + "harness|arc:challenge|25": { + "hashes": { + "hash_examples": "17b0cae357c0259e", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "e43adcaa871b1364", + "hash_cont_tokens": "289aa98c400841d8" + }, + "truncated": 0, + "non-truncated": 4687, + "padded": 4684, + "non-padded": 3, + "effective_few_shots": 25.0, + "num_truncated_few_shots": 0 + }, + "harness|hellaswag|10": { + "hashes": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "08da6b3d0798f3e5", + "hash_cont_tokens": "ac460260c3e6efc9" + }, + "truncated": 0, + "non-truncated": 40168, + "padded": 40039, + "non-padded": 129, + "effective_few_shots": 10.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hashes": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "5e2b26eb9b4d08bf", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-anatomy|5": { + "hashes": { + 
"hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "d33cda9df28030eb", + "hash_cont_tokens": "a52a4f60d98cbe5c" + }, + "truncated": 0, + "non-truncated": 540, + "padded": 540, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-astronomy|5": { + "hashes": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "0dd50c500d64c57d", + "hash_cont_tokens": "10f7d8eeba97841d" + }, + "truncated": 0, + "non-truncated": 608, + "padded": 608, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-business_ethics|5": { + "hashes": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "40b524d0df3defc2", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hashes": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "1f87d12d677e0dfd", + "hash_cont_tokens": "edef9975ba9165b5" + }, + "truncated": 0, + "non-truncated": 1060, + "padded": 1056, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_biology|5": { + "hashes": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "dd6d69d8b13afbeb", + "hash_cont_tokens": "0aa103ec6602280b" + }, + "truncated": 0, + "non-truncated": 576, + "padded": 572, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_chemistry|5": { + "hashes": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "d45f3c401a00e97e", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_computer_science|5": { + "hashes": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "c04f21d954ae67b2", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_mathematics|5": { + "hashes": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "e7de03b4e1a407d8", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_medicine|5": { + "hashes": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "9ce9516475f0b09c", + "hash_cont_tokens": "1979021dbc698754" + }, + "truncated": 0, + "non-truncated": 692, + "padded": 684, + "non-padded": 8, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_physics|5": { + "hashes": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "f749592a0d6c967d", + "hash_cont_tokens": 
"7cf7fe2bab00acbd" + }, + "truncated": 0, + "non-truncated": 408, + "padded": 404, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-computer_security|5": { + "hashes": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "1a6dccf2066f3598", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hashes": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "6ce98c8aec8e7514", + "hash_cont_tokens": "903f64eed2b0d217" + }, + "truncated": 0, + "non-truncated": 940, + "padded": 940, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-econometrics|5": { + "hashes": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "7794b03bf6b9bb11", + "hash_cont_tokens": "721ae6c5302c4bf2" + }, + "truncated": 0, + "non-truncated": 456, + "padded": 456, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hashes": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "e47ff85e05850517", + "hash_cont_tokens": "15a738960ed3e587" + }, + "truncated": 0, + "non-truncated": 580, + "padded": 580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hashes": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "2ce6901704311790", + "hash_cont_tokens": "c96470462fc71683" + }, + "truncated": 0, + "non-truncated": 1512, + "padded": 1512, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-formal_logic|5": { + "hashes": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "fa49c3faa72a3955", + "hash_cont_tokens": "0e1ce025c9d6ee7e" + }, + "truncated": 0, + "non-truncated": 504, + "padded": 504, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-global_facts|5": { + "hashes": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "38992a391c7040d5", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 396, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_biology|5": { + "hashes": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "4944fad6e0578120", + "hash_cont_tokens": "e34d57f7d3c4ca16" + }, + "truncated": 0, + "non-truncated": 1240, + "padded": 1240, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hashes": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "bec955dfccee0331", + "hash_cont_tokens": "e8482d44df4b3740" + }, + "truncated": 0, + "non-truncated": 812, + "padded": 796, + "non-padded": 16, + "effective_few_shots": 5.0, + 
"num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hashes": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "2ccfe020e0a8e824", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hashes": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "5e5e8bf3808e0ead", + "hash_cont_tokens": "d63e679a49418339" + }, + "truncated": 0, + "non-truncated": 660, + "padded": 656, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_geography|5": { + "hashes": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "6a624d76e1b40f9d", + "hash_cont_tokens": "d78483e286d06f1a" + }, + "truncated": 0, + "non-truncated": 792, + "padded": 792, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hashes": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "8340aed0285230f4", + "hash_cont_tokens": "691cdff71ff5fe57" + }, + "truncated": 0, + "non-truncated": 772, + "padded": 772, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hashes": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "ca47137b1f3a769c", + "hash_cont_tokens": "d5ad4c5bdca967ad" + }, + "truncated": 0, + "non-truncated": 1560, + "padded": 1560, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hashes": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "c9d341ab62890f30", + "hash_cont_tokens": "8f631ca5687dd0d4" + }, + "truncated": 0, + "non-truncated": 1080, + "padded": 1080, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hashes": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "62573d06618ae7df", + "hash_cont_tokens": "7321048a28451473" + }, + "truncated": 0, + "non-truncated": 952, + "padded": 952, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_physics|5": { + "hashes": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "ddddcaae96263221", + "hash_cont_tokens": "bb137581f269861c" + }, + "truncated": 0, + "non-truncated": 604, + "padded": 604, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hashes": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "ef9c1ae343139fdd", + "hash_cont_tokens": "b455cab2675bd863" + }, + "truncated": 0, + "non-truncated": 2180, + "padded": 2161, + "non-padded": 19, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + 
"harness|hendrycksTest-high_school_statistics|5": { + "hashes": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "eb4abd87b0e863cc", + "hash_cont_tokens": "1b3196fec7e58037" + }, + "truncated": 0, + "non-truncated": 864, + "padded": 864, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hashes": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "63548c7fa9ba7a78", + "hash_cont_tokens": "a331dedc2aa01b3e" + }, + "truncated": 0, + "non-truncated": 816, + "padded": 816, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hashes": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "83c5da18bfa50812", + "hash_cont_tokens": "d0fbe030b8c8c2bf" + }, + "truncated": 0, + "non-truncated": 948, + "padded": 948, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_aging|5": { + "hashes": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "c93c778cb8c58a32", + "hash_cont_tokens": "1dd29c3755494850" + }, + "truncated": 0, + "non-truncated": 892, + "padded": 892, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_sexuality|5": { + "hashes": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "1daed91f54b42f7d", + "hash_cont_tokens": "c85573f663c10691" + }, + "truncated": 0, + "non-truncated": 524, + "padded": 524, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-international_law|5": { + "hashes": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "cfdae69f75ee8670", + "hash_cont_tokens": "d263804ba918154f" + }, + "truncated": 0, + "non-truncated": 484, + "padded": 484, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-jurisprudence|5": { + "hashes": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "173979adbb5ab44e", + "hash_cont_tokens": "581986691a84ece8" + }, + "truncated": 0, + "non-truncated": 432, + "padded": 432, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hashes": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "7b7d06271aff55ff", + "hash_cont_tokens": "55a858b28bbda458" + }, + "truncated": 0, + "non-truncated": 652, + "padded": 652, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-machine_learning|5": { + "hashes": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "ca062cfd7c7fddcb", + "hash_cont_tokens": "e99d3d3efd4ac7a3" + }, + "truncated": 0, + "non-truncated": 448, + "padded": 445, + "non-padded": 3, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-management|5": { + "hashes": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + 
"hash_input_tokens": "fc47171ffb714da3", + "hash_cont_tokens": "13d9dc56bca34726" + }, + "truncated": 0, + "non-truncated": 412, + "padded": 412, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-marketing|5": { + "hashes": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "aa29e9d883670c8f", + "hash_cont_tokens": "2700ea26933916a2" + }, + "truncated": 0, + "non-truncated": 936, + "padded": 936, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-medical_genetics|5": { + "hashes": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "88ad044b653ecaa5", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-miscellaneous|5": { + "hashes": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "f9e7e01573277484", + "hash_cont_tokens": "7bf4341c79587250" + }, + "truncated": 0, + "non-truncated": 3132, + "padded": 3132, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_disputes|5": { + "hashes": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "03728b9e48594c28", + "hash_cont_tokens": "38a48e9de6976f00" + }, + "truncated": 0, + "non-truncated": 1384, + "padded": 1360, + "non-padded": 24, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hashes": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "04a903966514d177", + "hash_cont_tokens": "761c4dc187689d89" + }, + "truncated": 0, + "non-truncated": 3580, + "padded": 3580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-nutrition|5": { + "hashes": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "a2176d3ac6f01cf0", + "hash_cont_tokens": "65005bd7d6f6012a" + }, + "truncated": 0, + "non-truncated": 1224, + "padded": 1224, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-philosophy|5": { + "hashes": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "a96dc872948245a8", + "hash_cont_tokens": "0b47934fb6314dec" + }, + "truncated": 0, + "non-truncated": 1244, + "padded": 1244, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-prehistory|5": { + "hashes": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "e0b03637947e9efa", + "hash_cont_tokens": "3f20acd855ee0a29" + }, + "truncated": 0, + "non-truncated": 1296, + "padded": 1296, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_accounting|5": { + "hashes": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "0b4c6d0e49c47ab4", + "hash_cont_tokens": "8f122ba881355d4b" + }, + "truncated": 0, + "non-truncated": 1128, + "padded": 1128, + "non-padded": 0, + 
"effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_law|5": { + "hashes": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "bcbdbbde22ec73e3", + "hash_cont_tokens": "90d5df417c4d3fd3" + }, + "truncated": 0, + "non-truncated": 6136, + "padded": 6136, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_medicine|5": { + "hashes": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "c54d753563114d45", + "hash_cont_tokens": "4a2d2988884f7f70" + }, + "truncated": 0, + "non-truncated": 1088, + "padded": 1088, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_psychology|5": { + "hashes": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "9e6e34f48034edc0", + "hash_cont_tokens": "e0a952cb8a9c81de" + }, + "truncated": 0, + "non-truncated": 2448, + "padded": 2448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-public_relations|5": { + "hashes": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "634feb3f97d1064d", + "hash_cont_tokens": "1fa77a8dff3922b8" + }, + "truncated": 0, + "non-truncated": 440, + "padded": 440, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-security_studies|5": { + "hashes": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "ca8497342e5b1d57", + "hash_cont_tokens": "81fc9cb3cbdd52db" + }, + "truncated": 0, + "non-truncated": 980, + "padded": 980, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-sociology|5": { + "hashes": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "ae361375c940a0fb", + "hash_cont_tokens": "2a0493252ed2cf43" + }, + "truncated": 0, + "non-truncated": 804, + "padded": 800, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hashes": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "e8bdf33cf82d89f5", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-virology|5": { + "hashes": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "32ce831e0ba2d2e2", + "hash_cont_tokens": "5ab892d003b00c98" + }, + "truncated": 0, + "non-truncated": 664, + "padded": 664, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-world_religions|5": { + "hashes": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "4ed9b68c5694211b", + "hash_cont_tokens": "15a5e5dbdfbb8568" + }, + "truncated": 0, + "non-truncated": 684, + "padded": 684, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|truthfulqa:mc|0": { + "hashes": { + "hash_examples": "23176c0531c7b867", + 
"hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "a30fbd9af05d717a", + "hash_cont_tokens": "5a8d4bb398b1c3c0" + }, + "truncated": 0, + "non-truncated": 9996, + "padded": 9996, + "non-padded": 0, + "effective_few_shots": 0.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "d84d18e9a963753d", + "hash_full_prompts": "12b540783521a8e6", + "hash_input_tokens": "3d86ffeb7677bd9d", + "hash_cont_tokens": "35527140510ee91a" + }, + "total_evaluation_time_secondes": "4129.603856563568", + "truncated": 0, + "non-truncated": 111019, + "padded": 110793, + "non-padded": 226, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/PulsarAI/Nebula-7B/results_2023-10-23T05-54-57.990759.json b/eval-results/PulsarAI/Nebula-7B/results_2023-10-23T05-54-57.990759.json new file mode 100644 index 0000000000000000000000000000000000000000..13da1d46881b73623519bf681f29d457b3c4c781 --- /dev/null +++ b/eval-results/PulsarAI/Nebula-7B/results_2023-10-23T05-54-57.990759.json @@ -0,0 +1,107 @@ +{ + "config_general": { + "model_name": "PulsarAI/Nebula-7B", + "model_sha": "569f848698a468fb03d37033c67f3734bbaec127", + "model_size": "13.99 GB", + "model_dtype": "torch.float16", + "lighteval_sha": "0f318ecf002208468154899217b3ba7c6ae09374", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|drop|3": { + "em": 0.3613674496644295, + "em_stderr": 0.004919712134554973, + "f1": 0.4096088506711411, + "f1_stderr": 0.00477602953566436 + }, + "harness|gsm8k|5": { + "acc": 0.14859742228961334, + "acc_stderr": 0.009797503180527892 + }, + "harness|winogrande|5": { + "acc": 0.7640094711917916, + "acc_stderr": 0.011933828850275625 + }, + "all": { + "em": 0.3613674496644295, + "em_stderr": 0.004919712134554973, + "f1": 0.4096088506711411, + "f1_stderr": 0.00477602953566436, + "acc": 0.4563034467407025, + "acc_stderr": 0.01086566601540176 + } + }, + "versions": { + "harness|drop|3": 1, + "harness|gsm8k|5": 0, + "harness|winogrande|5": 0, + "all": 0 + }, + "config_tasks": { + "harness|drop": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|drop|3": { + "hashes": { + "hash_examples": "1d27416e8324e9a3", + "hash_full_prompts": "a5513ff9a741b385", + "hash_input_tokens": "a4fb946366902edf", + "hash_cont_tokens": "61c7df901b4fdf27" + }, + "truncated": 0, + "non-truncated": 9536, + "padded": 0, + "non-padded": 9536, + "effective_few_shots": 3.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "6af0ae8cfe684f50", + "hash_cont_tokens": "09cbbba8ed15d954" + }, + "truncated": 0, + "non-truncated": 1319, + "padded": 0, + "non-padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "6bf335f26fed6442", + "hash_cont_tokens": "618558fb93c0f288" + }, + "truncated": 0, + "non-truncated": 2534, + "padded": 2397, + "non-padded": 137, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "9b4d8993161e637d", + "hash_full_prompts": "08215e527b7e60a5", + "hash_input_tokens": "7c6dc027ca91c1a8", + "hash_cont_tokens": 
"f458f977041c5f7c" + }, + "total_evaluation_time_secondes": "7752.515452384949", + "truncated": 0, + "non-truncated": 13389, + "padded": 2397, + "non-padded": 10992, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/PulsarAI/Nebula-v2-7B/results_2023-11-21T14-05-56.848413.json b/eval-results/PulsarAI/Nebula-v2-7B/results_2023-11-21T14-05-56.848413.json new file mode 100644 index 0000000000000000000000000000000000000000..f2ec9dd05a929b48271a8f0f67fa2bd16390f17e --- /dev/null +++ b/eval-results/PulsarAI/Nebula-v2-7B/results_2023-11-21T14-05-56.848413.json @@ -0,0 +1,1435 @@ +{ + "config_general": { + "lighteval_sha": "9ffc410f6c40b8cfefe7167cb47aefe69ced61e1", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "", + "start_time": 402433.221833314, + "end_time": 413461.844564697, + "total_evaluation_time_secondes": "11028.622731383017", + "model_name": "PulsarAI/Nebula-v2-7B", + "model_sha": "d2a5611f7d7c37bfa2270d1823bceef01c0be383", + "model_dtype": "torch.float16", + "model_size": "13.99 GB" + }, + "results": { + "harness|arc:challenge|25": { + "acc": 0.5452218430034129, + "acc_stderr": 0.014551507060836355, + "acc_norm": 0.5870307167235495, + "acc_norm_stderr": 0.014388344935398324 + }, + "harness|hellaswag|10": { + "acc": 0.6362278430591516, + "acc_stderr": 0.00480100965769044, + "acc_norm": 0.8306114319856602, + "acc_norm_stderr": 0.0037432817493736267 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.3, + "acc_stderr": 0.046056618647183814, + "acc_norm": 0.3, + "acc_norm_stderr": 0.046056618647183814 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.5703703703703704, + "acc_stderr": 0.042763494943766, + "acc_norm": 0.5703703703703704, + "acc_norm_stderr": 0.042763494943766 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.5855263157894737, + "acc_stderr": 0.04008973785779206, + "acc_norm": 0.5855263157894737, + "acc_norm_stderr": 0.04008973785779206 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.54, + "acc_stderr": 0.05009082659620332, + "acc_norm": 0.54, + "acc_norm_stderr": 0.05009082659620332 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.5849056603773585, + "acc_stderr": 0.03032594578928611, + "acc_norm": 0.5849056603773585, + "acc_norm_stderr": 0.03032594578928611 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.6597222222222222, + "acc_stderr": 0.039621355734862175, + "acc_norm": 0.6597222222222222, + "acc_norm_stderr": 0.039621355734862175 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.4, + "acc_stderr": 0.04923659639173309, + "acc_norm": 0.4, + "acc_norm_stderr": 0.04923659639173309 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.42, + "acc_stderr": 0.049604496374885836, + "acc_norm": 0.42, + "acc_norm_stderr": 0.049604496374885836 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.27, + "acc_stderr": 0.044619604333847394, + "acc_norm": 0.27, + "acc_norm_stderr": 0.044619604333847394 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.5664739884393064, + "acc_stderr": 0.03778621079092056, + "acc_norm": 0.5664739884393064, + "acc_norm_stderr": 0.03778621079092056 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.38235294117647056, + "acc_stderr": 0.04835503696107223, + "acc_norm": 0.38235294117647056, + "acc_norm_stderr": 0.04835503696107223 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.72, + 
"acc_stderr": 0.045126085985421296, + "acc_norm": 0.72, + "acc_norm_stderr": 0.045126085985421296 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.5319148936170213, + "acc_stderr": 0.03261936918467382, + "acc_norm": 0.5319148936170213, + "acc_norm_stderr": 0.03261936918467382 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.39473684210526316, + "acc_stderr": 0.045981880578165414, + "acc_norm": 0.39473684210526316, + "acc_norm_stderr": 0.045981880578165414 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.5241379310344828, + "acc_stderr": 0.04161808503501531, + "acc_norm": 0.5241379310344828, + "acc_norm_stderr": 0.04161808503501531 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.373015873015873, + "acc_stderr": 0.02490699045899257, + "acc_norm": 0.373015873015873, + "acc_norm_stderr": 0.02490699045899257 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.42063492063492064, + "acc_stderr": 0.04415438226743744, + "acc_norm": 0.42063492063492064, + "acc_norm_stderr": 0.04415438226743744 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.32, + "acc_stderr": 0.046882617226215034, + "acc_norm": 0.32, + "acc_norm_stderr": 0.046882617226215034 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.6741935483870968, + "acc_stderr": 0.026662010578567104, + "acc_norm": 0.6741935483870968, + "acc_norm_stderr": 0.026662010578567104 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.458128078817734, + "acc_stderr": 0.03505630140785741, + "acc_norm": 0.458128078817734, + "acc_norm_stderr": 0.03505630140785741 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.58, + "acc_stderr": 0.049604496374885836, + "acc_norm": 0.58, + "acc_norm_stderr": 0.049604496374885836 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.7212121212121212, + "acc_stderr": 0.03501438706296781, + "acc_norm": 0.7212121212121212, + "acc_norm_stderr": 0.03501438706296781 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.696969696969697, + "acc_stderr": 0.03274287914026867, + "acc_norm": 0.696969696969697, + "acc_norm_stderr": 0.03274287914026867 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.7564766839378239, + "acc_stderr": 0.030975436386845436, + "acc_norm": 0.7564766839378239, + "acc_norm_stderr": 0.030975436386845436 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.5384615384615384, + "acc_stderr": 0.025275892070240644, + "acc_norm": 0.5384615384615384, + "acc_norm_stderr": 0.025275892070240644 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.3037037037037037, + "acc_stderr": 0.028037929969114993, + "acc_norm": 0.3037037037037037, + "acc_norm_stderr": 0.028037929969114993 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.5714285714285714, + "acc_stderr": 0.03214536859788639, + "acc_norm": 0.5714285714285714, + "acc_norm_stderr": 0.03214536859788639 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.3443708609271523, + "acc_stderr": 0.038796870240733264, + "acc_norm": 0.3443708609271523, + "acc_norm_stderr": 0.038796870240733264 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.8, + "acc_stderr": 0.01714985851425095, + "acc_norm": 0.8, + "acc_norm_stderr": 0.01714985851425095 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.42592592592592593, + "acc_stderr": 0.03372343271653063, + 
"acc_norm": 0.42592592592592593, + "acc_norm_stderr": 0.03372343271653063 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.7352941176470589, + "acc_stderr": 0.030964517926923393, + "acc_norm": 0.7352941176470589, + "acc_norm_stderr": 0.030964517926923393 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.7383966244725738, + "acc_stderr": 0.028609516716994934, + "acc_norm": 0.7383966244725738, + "acc_norm_stderr": 0.028609516716994934 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.6591928251121076, + "acc_stderr": 0.03181149747055359, + "acc_norm": 0.6591928251121076, + "acc_norm_stderr": 0.03181149747055359 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.6641221374045801, + "acc_stderr": 0.04142313771996664, + "acc_norm": 0.6641221374045801, + "acc_norm_stderr": 0.04142313771996664 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.7520661157024794, + "acc_stderr": 0.03941897526516302, + "acc_norm": 0.7520661157024794, + "acc_norm_stderr": 0.03941897526516302 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.6666666666666666, + "acc_stderr": 0.04557239513497752, + "acc_norm": 0.6666666666666666, + "acc_norm_stderr": 0.04557239513497752 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.7177914110429447, + "acc_stderr": 0.03536117886664742, + "acc_norm": 0.7177914110429447, + "acc_norm_stderr": 0.03536117886664742 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.375, + "acc_stderr": 0.04595091388086298, + "acc_norm": 0.375, + "acc_norm_stderr": 0.04595091388086298 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.7572815533980582, + "acc_stderr": 0.042450224863844935, + "acc_norm": 0.7572815533980582, + "acc_norm_stderr": 0.042450224863844935 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.8247863247863247, + "acc_stderr": 0.02490443909891822, + "acc_norm": 0.8247863247863247, + "acc_norm_stderr": 0.02490443909891822 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.66, + "acc_stderr": 0.04760952285695237, + "acc_norm": 0.66, + "acc_norm_stderr": 0.04760952285695237 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.7969348659003831, + "acc_stderr": 0.014385525076611573, + "acc_norm": 0.7969348659003831, + "acc_norm_stderr": 0.014385525076611573 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.6242774566473989, + "acc_stderr": 0.02607431485165708, + "acc_norm": 0.6242774566473989, + "acc_norm_stderr": 0.02607431485165708 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.3307262569832402, + "acc_stderr": 0.01573502625896612, + "acc_norm": 0.3307262569832402, + "acc_norm_stderr": 0.01573502625896612 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.6405228758169934, + "acc_stderr": 0.027475969910660956, + "acc_norm": 0.6405228758169934, + "acc_norm_stderr": 0.027475969910660956 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.7170418006430869, + "acc_stderr": 0.025583062489984817, + "acc_norm": 0.7170418006430869, + "acc_norm_stderr": 0.025583062489984817 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.7067901234567902, + "acc_stderr": 0.025329888171900922, + "acc_norm": 0.7067901234567902, + "acc_norm_stderr": 0.025329888171900922 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.44680851063829785, + "acc_stderr": 0.029658235097666907, + "acc_norm": 0.44680851063829785, + "acc_norm_stderr": 0.029658235097666907 + }, + 
"harness|hendrycksTest-professional_law|5": { + "acc": 0.4367666232073012, + "acc_stderr": 0.012667701919603654, + "acc_norm": 0.4367666232073012, + "acc_norm_stderr": 0.012667701919603654 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.6029411764705882, + "acc_stderr": 0.029722152099280065, + "acc_norm": 0.6029411764705882, + "acc_norm_stderr": 0.029722152099280065 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.5833333333333334, + "acc_stderr": 0.019944914136873583, + "acc_norm": 0.5833333333333334, + "acc_norm_stderr": 0.019944914136873583 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.6545454545454545, + "acc_stderr": 0.04554619617541054, + "acc_norm": 0.6545454545454545, + "acc_norm_stderr": 0.04554619617541054 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.5020408163265306, + "acc_stderr": 0.0320089533497105, + "acc_norm": 0.5020408163265306, + "acc_norm_stderr": 0.0320089533497105 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.746268656716418, + "acc_stderr": 0.030769444967296014, + "acc_norm": 0.746268656716418, + "acc_norm_stderr": 0.030769444967296014 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.83, + "acc_stderr": 0.03775251680686371, + "acc_norm": 0.83, + "acc_norm_stderr": 0.03775251680686371 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.46987951807228917, + "acc_stderr": 0.03885425420866766, + "acc_norm": 0.46987951807228917, + "acc_norm_stderr": 0.03885425420866766 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.7660818713450293, + "acc_stderr": 0.03246721765117825, + "acc_norm": 0.7660818713450293, + "acc_norm_stderr": 0.03246721765117825 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.31701346389228885, + "mc1_stderr": 0.016289203374403382, + "mc2": 0.46717875687520055, + "mc2_stderr": 0.014898776473320212 + }, + "harness|winogrande|5": { + "acc": 0.7513812154696132, + "acc_stderr": 0.012147314713403105 + }, + "harness|drop|3": { + "em": 0.39481963087248323, + "em_stderr": 0.005005891546798788, + "f1": 0.4411262583892629, + "f1_stderr": 0.004868374360691444 + }, + "harness|gsm8k|5": { + "acc": 0.1288855193328279, + "acc_stderr": 0.009229580761400286 + }, + "all": { + "acc": 0.5721628889468044, + "acc_stderr": 0.03350500497352606, + "acc_norm": 0.5806417280271063, + "acc_norm_stderr": 0.03425775454028019, + "mc1": 0.31701346389228885, + "mc1_stderr": 0.016289203374403382, + "mc2": 0.46717875687520055, + "mc2_stderr": 0.014898776473320212, + "em": 0.39481963087248323, + "em_stderr": 0.005005891546798788, + "f1": 0.4411262583892629, + "f1_stderr": 0.004868374360691444 + } + }, + "versions": { + "all": 0, + "harness|arc:challenge|25": 0, + "harness|drop|3": 1, + "harness|gsm8k|5": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + 
"harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "harness|winogrande|5": 0 + }, + "config_tasks": { + "harness|arc:challenge": "LM Harness task", + "harness|drop": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": 
"LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|arc:challenge|25": { + "hashes": { + "hash_examples": "17b0cae357c0259e", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "9bcd0d1d37471713", + "hash_cont_tokens": "289aa98c400841d8" + }, + "truncated": 0, + "non_truncated": 1172, + "padded": 4670, + "non_padded": 17, + "effective_few_shots": 25.0, + "num_truncated_few_shots": 0 + }, + "harness|hellaswag|10": { + "hashes": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "80b8c6d79740318e", + "hash_cont_tokens": "ac460260c3e6efc9" + }, + "truncated": 0, + "non_truncated": 10042, + "padded": 40101, + "non_padded": 67, + "effective_few_shots": 10.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hashes": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "b813d36287c6556c", + "hash_cont_tokens": "17b868b63507f9a3" + 
}, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-anatomy|5": { + "hashes": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "09dc2380497f7a47", + "hash_cont_tokens": "a52a4f60d98cbe5c" + }, + "truncated": 0, + "non_truncated": 135, + "padded": 540, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-astronomy|5": { + "hashes": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "68ca3220b0fdd1f3", + "hash_cont_tokens": "10f7d8eeba97841d" + }, + "truncated": 0, + "non_truncated": 152, + "padded": 608, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-business_ethics|5": { + "hashes": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "bd14ef1320de241e", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hashes": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "d96186ab98017c43", + "hash_cont_tokens": "edef9975ba9165b5" + }, + "truncated": 0, + "non_truncated": 265, + "padded": 1060, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_biology|5": { + "hashes": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "424136b34e95b200", + "hash_cont_tokens": "0aa103ec6602280b" + }, + "truncated": 0, + "non_truncated": 144, + "padded": 576, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_chemistry|5": { + "hashes": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "8dd8b80e336bbe54", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_computer_science|5": { + "hashes": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "145d4cef8ca2261d", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_mathematics|5": { + "hashes": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "561995d32d2b25c4", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_medicine|5": { + "hashes": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "6a258a9d4418599c", + "hash_cont_tokens": "1979021dbc698754" + }, + "truncated": 0, + "non_truncated": 173, + "padded": 692, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + 
"harness|hendrycksTest-college_physics|5": { + "hashes": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "fa5e0d5b5f97b66a", + "hash_cont_tokens": "7cf7fe2bab00acbd" + }, + "truncated": 0, + "non_truncated": 102, + "padded": 408, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-computer_security|5": { + "hashes": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "07d27397edfae492", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hashes": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "da5e6c3c8eb17da6", + "hash_cont_tokens": "903f64eed2b0d217" + }, + "truncated": 0, + "non_truncated": 235, + "padded": 940, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-econometrics|5": { + "hashes": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "f6ba8e358bdb523e", + "hash_cont_tokens": "721ae6c5302c4bf2" + }, + "truncated": 0, + "non_truncated": 114, + "padded": 456, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hashes": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "b2459da4c5ca8590", + "hash_cont_tokens": "15a738960ed3e587" + }, + "truncated": 0, + "non_truncated": 145, + "padded": 575, + "non_padded": 5, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hashes": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "0b969d9ad706a13a", + "hash_cont_tokens": "c96470462fc71683" + }, + "truncated": 0, + "non_truncated": 378, + "padded": 1512, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-formal_logic|5": { + "hashes": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "02bc3eb5f90da86e", + "hash_cont_tokens": "0e1ce025c9d6ee7e" + }, + "truncated": 0, + "non_truncated": 126, + "padded": 504, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-global_facts|5": { + "hashes": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "3d5106918bcbeb43", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_biology|5": { + "hashes": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "7b089392db2dabbd", + "hash_cont_tokens": "e34d57f7d3c4ca16" + }, + "truncated": 0, + "non_truncated": 310, + "padded": 1240, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hashes": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + 
"hash_input_tokens": "ba90b2ffed1c067d", + "hash_cont_tokens": "e8482d44df4b3740" + }, + "truncated": 0, + "non_truncated": 203, + "padded": 812, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hashes": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "60eeec309ef0717f", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hashes": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "5e5e8bf3808e0ead", + "hash_cont_tokens": "d63e679a49418339" + }, + "truncated": 0, + "non_truncated": 165, + "padded": 656, + "non_padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_geography|5": { + "hashes": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "4da9b741d4e7ea78", + "hash_cont_tokens": "d78483e286d06f1a" + }, + "truncated": 0, + "non_truncated": 198, + "padded": 792, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hashes": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "acb4bc872ac86ed7", + "hash_cont_tokens": "691cdff71ff5fe57" + }, + "truncated": 0, + "non_truncated": 193, + "padded": 772, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hashes": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "840fc6403eb69ab0", + "hash_cont_tokens": "d5ad4c5bdca967ad" + }, + "truncated": 0, + "non_truncated": 390, + "padded": 1560, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hashes": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "3629a7f2cd17faeb", + "hash_cont_tokens": "8f631ca5687dd0d4" + }, + "truncated": 0, + "non_truncated": 270, + "padded": 1080, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hashes": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "6846f684260e3997", + "hash_cont_tokens": "7321048a28451473" + }, + "truncated": 0, + "non_truncated": 238, + "padded": 952, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_physics|5": { + "hashes": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "85aee25d6bdad94a", + "hash_cont_tokens": "bb137581f269861c" + }, + "truncated": 0, + "non_truncated": 151, + "padded": 604, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hashes": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "290b66d6d666a35f", + "hash_cont_tokens": 
"b455cab2675bd863" + }, + "truncated": 0, + "non_truncated": 545, + "padded": 2180, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hashes": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "a77a7668b437bc82", + "hash_cont_tokens": "1b3196fec7e58037" + }, + "truncated": 0, + "non_truncated": 216, + "padded": 864, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hashes": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "63548c7fa9ba7a78", + "hash_cont_tokens": "a331dedc2aa01b3e" + }, + "truncated": 0, + "non_truncated": 204, + "padded": 816, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hashes": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "83c5da18bfa50812", + "hash_cont_tokens": "d0fbe030b8c8c2bf" + }, + "truncated": 0, + "non_truncated": 237, + "padded": 948, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_aging|5": { + "hashes": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "bebbd11f22006685", + "hash_cont_tokens": "1dd29c3755494850" + }, + "truncated": 0, + "non_truncated": 223, + "padded": 892, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_sexuality|5": { + "hashes": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "7b85ee9b8ee54f4f", + "hash_cont_tokens": "c85573f663c10691" + }, + "truncated": 0, + "non_truncated": 131, + "padded": 524, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-international_law|5": { + "hashes": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "7bfc55ab7065943e", + "hash_cont_tokens": "d263804ba918154f" + }, + "truncated": 0, + "non_truncated": 121, + "padded": 484, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-jurisprudence|5": { + "hashes": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "69573f1675e053c6", + "hash_cont_tokens": "581986691a84ece8" + }, + "truncated": 0, + "non_truncated": 108, + "padded": 432, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hashes": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "552324ef20094bdc", + "hash_cont_tokens": "55a858b28bbda458" + }, + "truncated": 0, + "non_truncated": 163, + "padded": 652, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-machine_learning|5": { + "hashes": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "96449357a7318905", + "hash_cont_tokens": "e99d3d3efd4ac7a3" + }, + "truncated": 0, + "non_truncated": 112, + "padded": 448, + "non_padded": 0, + "effective_few_shots": 5.0, + 
"num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-management|5": { + "hashes": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "3b849249168e3b88", + "hash_cont_tokens": "13d9dc56bca34726" + }, + "truncated": 0, + "non_truncated": 103, + "padded": 412, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-marketing|5": { + "hashes": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "af0e186f2756b70d", + "hash_cont_tokens": "2700ea26933916a2" + }, + "truncated": 0, + "non_truncated": 234, + "padded": 936, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-medical_genetics|5": { + "hashes": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "9f6a6de16509b6d9", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-miscellaneous|5": { + "hashes": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "9194406d589f7c10", + "hash_cont_tokens": "7bf4341c79587250" + }, + "truncated": 0, + "non_truncated": 783, + "padded": 3132, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_disputes|5": { + "hashes": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "769486efc74d9f8e", + "hash_cont_tokens": "38a48e9de6976f00" + }, + "truncated": 0, + "non_truncated": 346, + "padded": 1384, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hashes": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "a90fd4dd90959dad", + "hash_cont_tokens": "761c4dc187689d89" + }, + "truncated": 0, + "non_truncated": 895, + "padded": 3580, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-nutrition|5": { + "hashes": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "1a3b843e66efd29b", + "hash_cont_tokens": "65005bd7d6f6012a" + }, + "truncated": 0, + "non_truncated": 306, + "padded": 1224, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-philosophy|5": { + "hashes": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "09820001a3d00013", + "hash_cont_tokens": "0b47934fb6314dec" + }, + "truncated": 0, + "non_truncated": 311, + "padded": 1244, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-prehistory|5": { + "hashes": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "7c4ec364ce2768c7", + "hash_cont_tokens": "3f20acd855ee0a29" + }, + "truncated": 0, + "non_truncated": 324, + "padded": 1296, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_accounting|5": { + "hashes": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + 
"hash_input_tokens": "ced0534574d0ae3f", + "hash_cont_tokens": "8f122ba881355d4b" + }, + "truncated": 0, + "non_truncated": 282, + "padded": 1128, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_law|5": { + "hashes": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "bcbdbbde22ec73e3", + "hash_cont_tokens": "90d5df417c4d3fd3" + }, + "truncated": 0, + "non_truncated": 1534, + "padded": 6136, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_medicine|5": { + "hashes": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "c54d753563114d45", + "hash_cont_tokens": "4a2d2988884f7f70" + }, + "truncated": 0, + "non_truncated": 272, + "padded": 1088, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_psychology|5": { + "hashes": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "b75dc55c0e32fa52", + "hash_cont_tokens": "e0a952cb8a9c81de" + }, + "truncated": 0, + "non_truncated": 612, + "padded": 2448, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-public_relations|5": { + "hashes": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "5ccdc8ec8db99622", + "hash_cont_tokens": "1fa77a8dff3922b8" + }, + "truncated": 0, + "non_truncated": 110, + "padded": 440, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-security_studies|5": { + "hashes": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "ca8497342e5b1d57", + "hash_cont_tokens": "81fc9cb3cbdd52db" + }, + "truncated": 0, + "non_truncated": 245, + "padded": 980, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-sociology|5": { + "hashes": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "069c76424fbd3dab", + "hash_cont_tokens": "2a0493252ed2cf43" + }, + "truncated": 0, + "non_truncated": 201, + "padded": 804, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hashes": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "a7e393a626169576", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-virology|5": { + "hashes": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "bf99dc973e3a650d", + "hash_cont_tokens": "5ab892d003b00c98" + }, + "truncated": 0, + "non_truncated": 166, + "padded": 664, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-world_religions|5": { + "hashes": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "1761cfaf21797065", + "hash_cont_tokens": "15a5e5dbdfbb8568" + }, + "truncated": 0, + "non_truncated": 171, + "padded": 684, + 
"non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|truthfulqa:mc|0": { + "hashes": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "298b43914bbdf4ca", + "hash_cont_tokens": "5a8d4bb398b1c3c0" + }, + "truncated": 0, + "non_truncated": 817, + "padded": 9996, + "non_padded": 0, + "effective_few_shots": 0.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "31aa3477d959f771", + "hash_cont_tokens": "618558fb93c0f288" + }, + "truncated": 0, + "non_truncated": 1267, + "padded": 2534, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|drop|3": { + "hashes": { + "hash_examples": "1d27416e8324e9a3", + "hash_full_prompts": "a5513ff9a741b385", + "hash_input_tokens": "a4fb946366902edf", + "hash_cont_tokens": "f82ca99501d50e81" + }, + "truncated": 0, + "non_truncated": 9536, + "padded": 0, + "non_padded": 9536, + "effective_few_shots": 3.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "6af0ae8cfe684f50", + "hash_cont_tokens": "6feb4390e9bbcdf4" + }, + "truncated": 0, + "non_truncated": 1319, + "padded": 0, + "non_padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "4eb459f19fc0f29d", + "hash_full_prompts": "21653ed56f202b4e", + "hash_input_tokens": "0ce409b3d436569d", + "hash_cont_tokens": "7353b1d2002c8701" + }, + "truncated": 0, + "non_truncated": 38195, + "padded": 113460, + "non_padded": 10948, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/PulsarAI/Nebula-v2-7B/results_2023-12-02T13-58-09.073163.json b/eval-results/PulsarAI/Nebula-v2-7B/results_2023-12-02T13-58-09.073163.json new file mode 100644 index 0000000000000000000000000000000000000000..aaf61e8630b7169064f9aed58f0957609009fb14 --- /dev/null +++ b/eval-results/PulsarAI/Nebula-v2-7B/results_2023-12-02T13-58-09.073163.json @@ -0,0 +1,63 @@ +{ + "config_general": { + "lighteval_sha": "b35d4d84573be82d91c07ea46260f262f72cf69d", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "", + "start_time": 1372246.775753896, + "end_time": 1374916.609388898, + "total_evaluation_time_secondes": "2669.833635001909", + "model_name": "PulsarAI/Nebula-v2-7B", + "model_sha": "309def6f8346de2f87e50f9cff2940eac2c62808", + "model_dtype": "torch.float16", + "model_size": "13.99 GB" + }, + "results": { + "harness|gsm8k|5": { + "acc": 0.3169067475360121, + "acc_stderr": 0.012815868296721373 + }, + "all": { + "acc": 0.3169067475360121, + "acc_stderr": 0.012815868296721373 + } + }, + "versions": { + "all": 0, + "harness|gsm8k|5": 0 + }, + "config_tasks": { + "harness|gsm8k": "LM Harness task" + }, + "summary_tasks": { + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "6af0ae8cfe684f50", + "hash_cont_tokens": "6feb4390e9bbcdf4" + }, + "truncated": 0, + "non_truncated": 1319, + "padded": 0, + "non_padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "18b756b7813d1bdf", + "hash_full_prompts": 
"deb3b1dff10b95aa", + "hash_input_tokens": "f17391d49d33b9c0", + "hash_cont_tokens": "b0ac033dd1654a10" + }, + "truncated": 0, + "non_truncated": 1319, + "padded": 0, + "non_padded": 1319, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/PulsarAI/Neural-una-cybertron-7b/results_2023-12-09T19-49-04.690282.json b/eval-results/PulsarAI/Neural-una-cybertron-7b/results_2023-12-09T19-49-04.690282.json new file mode 100644 index 0000000000000000000000000000000000000000..48ee632ede872b239acb4cf713a59173cb5ce40a --- /dev/null +++ b/eval-results/PulsarAI/Neural-una-cybertron-7b/results_2023-12-09T19-49-04.690282.json @@ -0,0 +1,1409 @@ +{ + "config_general": { + "lighteval_sha": "0e4607eff593f6f842aeaa0e5fa6760f58b9d1e9", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "", + "start_time": 598899.245047054, + "end_time": 606213.97993682, + "total_evaluation_time_secondes": "7314.734889765969", + "model_name": "PulsarAI/Neural-una-cybertron-7b", + "model_sha": "66dae63f92cac0c99b1b162383506b60ac060225", + "model_dtype": "torch.bfloat16", + "model_size": "13.99 GB" + }, + "results": { + "harness|arc:challenge|25": { + "acc": 0.6604095563139932, + "acc_stderr": 0.013839039762820164, + "acc_norm": 0.6902730375426621, + "acc_norm_stderr": 0.013512058415238363 + }, + "harness|hellaswag|10": { + "acc": 0.6704839673371839, + "acc_stderr": 0.004690768393854475, + "acc_norm": 0.8450507866958773, + "acc_norm_stderr": 0.0036111673029597625 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.25, + "acc_stderr": 0.04351941398892446, + "acc_norm": 0.25, + "acc_norm_stderr": 0.04351941398892446 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.6296296296296297, + "acc_stderr": 0.041716541613545426, + "acc_norm": 0.6296296296296297, + "acc_norm_stderr": 0.041716541613545426 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.7302631578947368, + "acc_stderr": 0.03611780560284898, + "acc_norm": 0.7302631578947368, + "acc_norm_stderr": 0.03611780560284898 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.63, + "acc_stderr": 0.04852365870939099, + "acc_norm": 0.63, + "acc_norm_stderr": 0.04852365870939099 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.6830188679245283, + "acc_stderr": 0.02863723563980089, + "acc_norm": 0.6830188679245283, + "acc_norm_stderr": 0.02863723563980089 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.6875, + "acc_stderr": 0.038760854559127644, + "acc_norm": 0.6875, + "acc_norm_stderr": 0.038760854559127644 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.46, + "acc_stderr": 0.05009082659620333, + "acc_norm": 0.46, + "acc_norm_stderr": 0.05009082659620333 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.57, + "acc_stderr": 0.049756985195624284, + "acc_norm": 0.57, + "acc_norm_stderr": 0.049756985195624284 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.35, + "acc_stderr": 0.0479372485441102, + "acc_norm": 0.35, + "acc_norm_stderr": 0.0479372485441102 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.6705202312138728, + "acc_stderr": 0.03583901754736412, + "acc_norm": 0.6705202312138728, + "acc_norm_stderr": 0.03583901754736412 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.4117647058823529, + "acc_stderr": 0.04897104952726366, + "acc_norm": 0.4117647058823529, + "acc_norm_stderr": 0.04897104952726366 + }, + 
"harness|hendrycksTest-computer_security|5": { + "acc": 0.72, + "acc_stderr": 0.045126085985421276, + "acc_norm": 0.72, + "acc_norm_stderr": 0.045126085985421276 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.5787234042553191, + "acc_stderr": 0.03227834510146268, + "acc_norm": 0.5787234042553191, + "acc_norm_stderr": 0.03227834510146268 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.49122807017543857, + "acc_stderr": 0.04702880432049615, + "acc_norm": 0.49122807017543857, + "acc_norm_stderr": 0.04702880432049615 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.5448275862068965, + "acc_stderr": 0.04149886942192117, + "acc_norm": 0.5448275862068965, + "acc_norm_stderr": 0.04149886942192117 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.3783068783068783, + "acc_stderr": 0.024976954053155247, + "acc_norm": 0.3783068783068783, + "acc_norm_stderr": 0.024976954053155247 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.4444444444444444, + "acc_stderr": 0.04444444444444449, + "acc_norm": 0.4444444444444444, + "acc_norm_stderr": 0.04444444444444449 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.35, + "acc_stderr": 0.047937248544110196, + "acc_norm": 0.35, + "acc_norm_stderr": 0.047937248544110196 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.7612903225806451, + "acc_stderr": 0.02425107126220884, + "acc_norm": 0.7612903225806451, + "acc_norm_stderr": 0.02425107126220884 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.47783251231527096, + "acc_stderr": 0.03514528562175008, + "acc_norm": 0.47783251231527096, + "acc_norm_stderr": 0.03514528562175008 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.73, + "acc_stderr": 0.0446196043338474, + "acc_norm": 0.73, + "acc_norm_stderr": 0.0446196043338474 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.7696969696969697, + "acc_stderr": 0.032876667586034906, + "acc_norm": 0.7696969696969697, + "acc_norm_stderr": 0.032876667586034906 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.7777777777777778, + "acc_stderr": 0.029620227874790492, + "acc_norm": 0.7777777777777778, + "acc_norm_stderr": 0.029620227874790492 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.8497409326424871, + "acc_stderr": 0.025787723180723875, + "acc_norm": 0.8497409326424871, + "acc_norm_stderr": 0.025787723180723875 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.6564102564102564, + "acc_stderr": 0.024078696580635474, + "acc_norm": 0.6564102564102564, + "acc_norm_stderr": 0.024078696580635474 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.3037037037037037, + "acc_stderr": 0.028037929969114993, + "acc_norm": 0.3037037037037037, + "acc_norm_stderr": 0.028037929969114993 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.6932773109243697, + "acc_stderr": 0.029953823891887034, + "acc_norm": 0.6932773109243697, + "acc_norm_stderr": 0.029953823891887034 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.33774834437086093, + "acc_stderr": 0.038615575462551684, + "acc_norm": 0.33774834437086093, + "acc_norm_stderr": 0.038615575462551684 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.8311926605504587, + "acc_stderr": 0.016060056268530343, + "acc_norm": 0.8311926605504587, + "acc_norm_stderr": 0.016060056268530343 + }, + 
"harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.5185185185185185, + "acc_stderr": 0.03407632093854051, + "acc_norm": 0.5185185185185185, + "acc_norm_stderr": 0.03407632093854051 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.7941176470588235, + "acc_stderr": 0.028379449451588667, + "acc_norm": 0.7941176470588235, + "acc_norm_stderr": 0.028379449451588667 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.7763713080168776, + "acc_stderr": 0.027123298205229966, + "acc_norm": 0.7763713080168776, + "acc_norm_stderr": 0.027123298205229966 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.672645739910314, + "acc_stderr": 0.03149384670994131, + "acc_norm": 0.672645739910314, + "acc_norm_stderr": 0.03149384670994131 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.7251908396946565, + "acc_stderr": 0.03915345408847835, + "acc_norm": 0.7251908396946565, + "acc_norm_stderr": 0.03915345408847835 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.7520661157024794, + "acc_stderr": 0.03941897526516302, + "acc_norm": 0.7520661157024794, + "acc_norm_stderr": 0.03941897526516302 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.7314814814814815, + "acc_stderr": 0.042844679680521934, + "acc_norm": 0.7314814814814815, + "acc_norm_stderr": 0.042844679680521934 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.7177914110429447, + "acc_stderr": 0.03536117886664742, + "acc_norm": 0.7177914110429447, + "acc_norm_stderr": 0.03536117886664742 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.49107142857142855, + "acc_stderr": 0.04745033255489123, + "acc_norm": 0.49107142857142855, + "acc_norm_stderr": 0.04745033255489123 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.8155339805825242, + "acc_stderr": 0.03840423627288276, + "acc_norm": 0.8155339805825242, + "acc_norm_stderr": 0.03840423627288276 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.8461538461538461, + "acc_stderr": 0.023636873317489277, + "acc_norm": 0.8461538461538461, + "acc_norm_stderr": 0.023636873317489277 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.68, + "acc_stderr": 0.04688261722621504, + "acc_norm": 0.68, + "acc_norm_stderr": 0.04688261722621504 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.80970625798212, + "acc_stderr": 0.014036945850381398, + "acc_norm": 0.80970625798212, + "acc_norm_stderr": 0.014036945850381398 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.6965317919075145, + "acc_stderr": 0.024752411960917205, + "acc_norm": 0.6965317919075145, + "acc_norm_stderr": 0.024752411960917205 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.3787709497206704, + "acc_stderr": 0.016223533510365113, + "acc_norm": 0.3787709497206704, + "acc_norm_stderr": 0.016223533510365113 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.6601307189542484, + "acc_stderr": 0.027121956071388856, + "acc_norm": 0.6601307189542484, + "acc_norm_stderr": 0.027121956071388856 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.707395498392283, + "acc_stderr": 0.02583989833487798, + "acc_norm": 0.707395498392283, + "acc_norm_stderr": 0.02583989833487798 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.7160493827160493, + "acc_stderr": 0.025089478523765137, + "acc_norm": 0.7160493827160493, + "acc_norm_stderr": 0.025089478523765137 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.475177304964539, + "acc_stderr": 
0.02979071924382972, + "acc_norm": 0.475177304964539, + "acc_norm_stderr": 0.02979071924382972 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.44784876140808344, + "acc_stderr": 0.01270058240476822, + "acc_norm": 0.44784876140808344, + "acc_norm_stderr": 0.01270058240476822 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.6470588235294118, + "acc_stderr": 0.0290294228156814, + "acc_norm": 0.6470588235294118, + "acc_norm_stderr": 0.0290294228156814 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.6552287581699346, + "acc_stderr": 0.019228322018696647, + "acc_norm": 0.6552287581699346, + "acc_norm_stderr": 0.019228322018696647 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.6454545454545455, + "acc_stderr": 0.045820048415054174, + "acc_norm": 0.6454545454545455, + "acc_norm_stderr": 0.045820048415054174 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.7020408163265306, + "acc_stderr": 0.029279567411065677, + "acc_norm": 0.7020408163265306, + "acc_norm_stderr": 0.029279567411065677 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.8308457711442786, + "acc_stderr": 0.02650859065623325, + "acc_norm": 0.8308457711442786, + "acc_norm_stderr": 0.02650859065623325 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.81, + "acc_stderr": 0.03942772444036625, + "acc_norm": 0.81, + "acc_norm_stderr": 0.03942772444036625 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.5301204819277109, + "acc_stderr": 0.03885425420866767, + "acc_norm": 0.5301204819277109, + "acc_norm_stderr": 0.03885425420866767 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.7894736842105263, + "acc_stderr": 0.0312678171466318, + "acc_norm": 0.7894736842105263, + "acc_norm_stderr": 0.0312678171466318 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.49938800489596086, + "mc1_stderr": 0.01750348793889251, + "mc2": 0.6498823682901811, + "mc2_stderr": 0.01528184743332698 + }, + "harness|winogrande|5": { + "acc": 0.8066298342541437, + "acc_stderr": 0.011099796645920524 + }, + "harness|gsm8k|5": { + "acc": 0.5231235784685367, + "acc_stderr": 0.013757748544245336 + }, + "all": { + "acc": 0.6303659109315263, + "acc_stderr": 0.032701507219088696, + "acc_norm": 0.6326609738082676, + "acc_norm_stderr": 0.033364878181962175, + "mc1": 0.49938800489596086, + "mc1_stderr": 0.01750348793889251, + "mc2": 0.6498823682901811, + "mc2_stderr": 0.01528184743332698 + } + }, + "versions": { + "all": 0, + "harness|arc:challenge|25": 0, + "harness|gsm8k|5": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + 
"harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "harness|winogrande|5": 0 + }, + "config_tasks": { + "harness|arc:challenge": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness 
task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|arc:challenge|25": { + "hashes": { + "hash_examples": "17b0cae357c0259e", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "9bcd0d1d37471713", + "hash_cont_tokens": "289aa98c400841d8" + }, + "truncated": 0, + "non_truncated": 1172, + "padded": 4670, + "non_padded": 17, + "effective_few_shots": 25.0, + "num_truncated_few_shots": 0 + }, + "harness|hellaswag|10": { + "hashes": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "80b8c6d79740318e", + "hash_cont_tokens": "ac460260c3e6efc9" + }, + "truncated": 0, + "non_truncated": 10042, + "padded": 40101, + "non_padded": 67, + "effective_few_shots": 10.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hashes": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "b813d36287c6556c", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-anatomy|5": { + "hashes": { + "hash_examples": 
"2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "09dc2380497f7a47", + "hash_cont_tokens": "a52a4f60d98cbe5c" + }, + "truncated": 0, + "non_truncated": 135, + "padded": 540, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-astronomy|5": { + "hashes": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "68ca3220b0fdd1f3", + "hash_cont_tokens": "10f7d8eeba97841d" + }, + "truncated": 0, + "non_truncated": 152, + "padded": 608, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-business_ethics|5": { + "hashes": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "bd14ef1320de241e", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hashes": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "d96186ab98017c43", + "hash_cont_tokens": "edef9975ba9165b5" + }, + "truncated": 0, + "non_truncated": 265, + "padded": 1060, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_biology|5": { + "hashes": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "424136b34e95b200", + "hash_cont_tokens": "0aa103ec6602280b" + }, + "truncated": 0, + "non_truncated": 144, + "padded": 576, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_chemistry|5": { + "hashes": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "8dd8b80e336bbe54", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_computer_science|5": { + "hashes": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "145d4cef8ca2261d", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_mathematics|5": { + "hashes": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "561995d32d2b25c4", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_medicine|5": { + "hashes": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "6a258a9d4418599c", + "hash_cont_tokens": "1979021dbc698754" + }, + "truncated": 0, + "non_truncated": 173, + "padded": 692, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_physics|5": { + "hashes": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "fa5e0d5b5f97b66a", + "hash_cont_tokens": "7cf7fe2bab00acbd" + }, + 
"truncated": 0, + "non_truncated": 102, + "padded": 408, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-computer_security|5": { + "hashes": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "07d27397edfae492", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hashes": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "da5e6c3c8eb17da6", + "hash_cont_tokens": "903f64eed2b0d217" + }, + "truncated": 0, + "non_truncated": 235, + "padded": 940, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-econometrics|5": { + "hashes": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "f6ba8e358bdb523e", + "hash_cont_tokens": "721ae6c5302c4bf2" + }, + "truncated": 0, + "non_truncated": 114, + "padded": 456, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hashes": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "b2459da4c5ca8590", + "hash_cont_tokens": "15a738960ed3e587" + }, + "truncated": 0, + "non_truncated": 145, + "padded": 575, + "non_padded": 5, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hashes": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "0b969d9ad706a13a", + "hash_cont_tokens": "c96470462fc71683" + }, + "truncated": 0, + "non_truncated": 378, + "padded": 1512, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-formal_logic|5": { + "hashes": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "02bc3eb5f90da86e", + "hash_cont_tokens": "0e1ce025c9d6ee7e" + }, + "truncated": 0, + "non_truncated": 126, + "padded": 504, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-global_facts|5": { + "hashes": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "3d5106918bcbeb43", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_biology|5": { + "hashes": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "7b089392db2dabbd", + "hash_cont_tokens": "e34d57f7d3c4ca16" + }, + "truncated": 0, + "non_truncated": 310, + "padded": 1240, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hashes": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "ba90b2ffed1c067d", + "hash_cont_tokens": "e8482d44df4b3740" + }, + "truncated": 0, + "non_truncated": 203, + "padded": 812, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + 
"harness|hendrycksTest-high_school_computer_science|5": { + "hashes": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "60eeec309ef0717f", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hashes": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "5e5e8bf3808e0ead", + "hash_cont_tokens": "d63e679a49418339" + }, + "truncated": 0, + "non_truncated": 165, + "padded": 656, + "non_padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_geography|5": { + "hashes": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "4da9b741d4e7ea78", + "hash_cont_tokens": "d78483e286d06f1a" + }, + "truncated": 0, + "non_truncated": 198, + "padded": 792, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hashes": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "acb4bc872ac86ed7", + "hash_cont_tokens": "691cdff71ff5fe57" + }, + "truncated": 0, + "non_truncated": 193, + "padded": 772, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hashes": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "840fc6403eb69ab0", + "hash_cont_tokens": "d5ad4c5bdca967ad" + }, + "truncated": 0, + "non_truncated": 390, + "padded": 1560, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hashes": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "3629a7f2cd17faeb", + "hash_cont_tokens": "8f631ca5687dd0d4" + }, + "truncated": 0, + "non_truncated": 270, + "padded": 1080, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hashes": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "6846f684260e3997", + "hash_cont_tokens": "7321048a28451473" + }, + "truncated": 0, + "non_truncated": 238, + "padded": 952, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_physics|5": { + "hashes": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "85aee25d6bdad94a", + "hash_cont_tokens": "bb137581f269861c" + }, + "truncated": 0, + "non_truncated": 151, + "padded": 604, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hashes": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "290b66d6d666a35f", + "hash_cont_tokens": "b455cab2675bd863" + }, + "truncated": 0, + "non_truncated": 545, + "padded": 2180, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hashes": { + 
"hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "a77a7668b437bc82", + "hash_cont_tokens": "1b3196fec7e58037" + }, + "truncated": 0, + "non_truncated": 216, + "padded": 864, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hashes": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "63548c7fa9ba7a78", + "hash_cont_tokens": "a331dedc2aa01b3e" + }, + "truncated": 0, + "non_truncated": 204, + "padded": 816, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hashes": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "83c5da18bfa50812", + "hash_cont_tokens": "d0fbe030b8c8c2bf" + }, + "truncated": 0, + "non_truncated": 237, + "padded": 948, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_aging|5": { + "hashes": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "bebbd11f22006685", + "hash_cont_tokens": "1dd29c3755494850" + }, + "truncated": 0, + "non_truncated": 223, + "padded": 892, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_sexuality|5": { + "hashes": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "7b85ee9b8ee54f4f", + "hash_cont_tokens": "c85573f663c10691" + }, + "truncated": 0, + "non_truncated": 131, + "padded": 524, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-international_law|5": { + "hashes": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "7bfc55ab7065943e", + "hash_cont_tokens": "d263804ba918154f" + }, + "truncated": 0, + "non_truncated": 121, + "padded": 484, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-jurisprudence|5": { + "hashes": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "69573f1675e053c6", + "hash_cont_tokens": "581986691a84ece8" + }, + "truncated": 0, + "non_truncated": 108, + "padded": 432, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hashes": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "552324ef20094bdc", + "hash_cont_tokens": "55a858b28bbda458" + }, + "truncated": 0, + "non_truncated": 163, + "padded": 652, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-machine_learning|5": { + "hashes": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "96449357a7318905", + "hash_cont_tokens": "e99d3d3efd4ac7a3" + }, + "truncated": 0, + "non_truncated": 112, + "padded": 448, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-management|5": { + "hashes": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "3b849249168e3b88", + "hash_cont_tokens": 
"13d9dc56bca34726" + }, + "truncated": 0, + "non_truncated": 103, + "padded": 412, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-marketing|5": { + "hashes": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "af0e186f2756b70d", + "hash_cont_tokens": "2700ea26933916a2" + }, + "truncated": 0, + "non_truncated": 234, + "padded": 936, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-medical_genetics|5": { + "hashes": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "9f6a6de16509b6d9", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-miscellaneous|5": { + "hashes": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "9194406d589f7c10", + "hash_cont_tokens": "7bf4341c79587250" + }, + "truncated": 0, + "non_truncated": 783, + "padded": 3132, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_disputes|5": { + "hashes": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "769486efc74d9f8e", + "hash_cont_tokens": "38a48e9de6976f00" + }, + "truncated": 0, + "non_truncated": 346, + "padded": 1384, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hashes": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "a90fd4dd90959dad", + "hash_cont_tokens": "761c4dc187689d89" + }, + "truncated": 0, + "non_truncated": 895, + "padded": 3580, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-nutrition|5": { + "hashes": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "1a3b843e66efd29b", + "hash_cont_tokens": "65005bd7d6f6012a" + }, + "truncated": 0, + "non_truncated": 306, + "padded": 1224, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-philosophy|5": { + "hashes": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "09820001a3d00013", + "hash_cont_tokens": "0b47934fb6314dec" + }, + "truncated": 0, + "non_truncated": 311, + "padded": 1244, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-prehistory|5": { + "hashes": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "7c4ec364ce2768c7", + "hash_cont_tokens": "3f20acd855ee0a29" + }, + "truncated": 0, + "non_truncated": 324, + "padded": 1296, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_accounting|5": { + "hashes": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "ced0534574d0ae3f", + "hash_cont_tokens": "8f122ba881355d4b" + }, + "truncated": 0, + "non_truncated": 282, + "padded": 1128, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + 
"harness|hendrycksTest-professional_law|5": { + "hashes": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "bcbdbbde22ec73e3", + "hash_cont_tokens": "90d5df417c4d3fd3" + }, + "truncated": 0, + "non_truncated": 1534, + "padded": 6136, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_medicine|5": { + "hashes": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "c54d753563114d45", + "hash_cont_tokens": "4a2d2988884f7f70" + }, + "truncated": 0, + "non_truncated": 272, + "padded": 1088, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_psychology|5": { + "hashes": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "b75dc55c0e32fa52", + "hash_cont_tokens": "e0a952cb8a9c81de" + }, + "truncated": 0, + "non_truncated": 612, + "padded": 2448, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-public_relations|5": { + "hashes": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "5ccdc8ec8db99622", + "hash_cont_tokens": "1fa77a8dff3922b8" + }, + "truncated": 0, + "non_truncated": 110, + "padded": 440, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-security_studies|5": { + "hashes": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "ca8497342e5b1d57", + "hash_cont_tokens": "81fc9cb3cbdd52db" + }, + "truncated": 0, + "non_truncated": 245, + "padded": 980, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-sociology|5": { + "hashes": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "069c76424fbd3dab", + "hash_cont_tokens": "2a0493252ed2cf43" + }, + "truncated": 0, + "non_truncated": 201, + "padded": 804, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hashes": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "a7e393a626169576", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-virology|5": { + "hashes": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "bf99dc973e3a650d", + "hash_cont_tokens": "5ab892d003b00c98" + }, + "truncated": 0, + "non_truncated": 166, + "padded": 664, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-world_religions|5": { + "hashes": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "1761cfaf21797065", + "hash_cont_tokens": "15a5e5dbdfbb8568" + }, + "truncated": 0, + "non_truncated": 171, + "padded": 684, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|truthfulqa:mc|0": { + "hashes": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": 
"298b43914bbdf4ca", + "hash_cont_tokens": "5a8d4bb398b1c3c0" + }, + "truncated": 0, + "non_truncated": 817, + "padded": 9996, + "non_padded": 0, + "effective_few_shots": 0.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "31aa3477d959f771", + "hash_cont_tokens": "618558fb93c0f288" + }, + "truncated": 0, + "non_truncated": 1267, + "padded": 2534, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "6af0ae8cfe684f50", + "hash_cont_tokens": "b71a9efa60b088cf" + }, + "truncated": 0, + "non_truncated": 1319, + "padded": 0, + "non_padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "3b7fa57a057f9415", + "hash_full_prompts": "63615fc50fc9417c", + "hash_input_tokens": "9c04e828ae29cacc", + "hash_cont_tokens": "8c35520a5a048acb" + }, + "truncated": 0, + "non_truncated": 28659, + "padded": 113460, + "non_padded": 1412, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/PulsarAI/OpenHermes-2.5-neural-chat-v3-2-Slerp/results_2023-12-09T18-04-51.228408.json b/eval-results/PulsarAI/OpenHermes-2.5-neural-chat-v3-2-Slerp/results_2023-12-09T18-04-51.228408.json new file mode 100644 index 0000000000000000000000000000000000000000..dbdafd65bc3557e530d2570422fdf096bb71a427 --- /dev/null +++ b/eval-results/PulsarAI/OpenHermes-2.5-neural-chat-v3-2-Slerp/results_2023-12-09T18-04-51.228408.json @@ -0,0 +1,1409 @@ +{ + "config_general": { + "lighteval_sha": "0e4607eff593f6f842aeaa0e5fa6760f58b9d1e9", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "", + "start_time": 592818.78162626, + "end_time": 599962.361295504, + "total_evaluation_time_secondes": "7143.579669244005", + "model_name": "PulsarAI/OpenHermes-2.5-neural-chat-v3-2-Slerp", + "model_sha": "bf9ef6df7732dbef3cd0001d9e5cba846cb47306", + "model_dtype": "torch.float16", + "model_size": "13.99 GB" + }, + "results": { + "harness|arc:challenge|25": { + "acc": 0.6459044368600683, + "acc_stderr": 0.013975454122756557, + "acc_norm": 0.6749146757679181, + "acc_norm_stderr": 0.013688147309729124 + }, + "harness|hellaswag|10": { + "acc": 0.6569408484365664, + "acc_stderr": 0.0047376083401634, + "acc_norm": 0.8542123083051185, + "acc_norm_stderr": 0.0035217202839105555 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.26, + "acc_stderr": 0.04408440022768078, + "acc_norm": 0.26, + "acc_norm_stderr": 0.04408440022768078 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.6370370370370371, + "acc_stderr": 0.04153948404742398, + "acc_norm": 0.6370370370370371, + "acc_norm_stderr": 0.04153948404742398 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.7171052631578947, + "acc_stderr": 0.03665349695640767, + "acc_norm": 0.7171052631578947, + "acc_norm_stderr": 0.03665349695640767 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.63, + "acc_stderr": 0.04852365870939099, + "acc_norm": 0.63, + "acc_norm_stderr": 0.04852365870939099 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.6830188679245283, + "acc_stderr": 0.028637235639800886, + "acc_norm": 0.6830188679245283, + "acc_norm_stderr": 0.028637235639800886 + }, + 
"harness|hendrycksTest-college_biology|5": { + "acc": 0.75, + "acc_stderr": 0.03621034121889507, + "acc_norm": 0.75, + "acc_norm_stderr": 0.03621034121889507 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.48, + "acc_stderr": 0.050211673156867795, + "acc_norm": 0.48, + "acc_norm_stderr": 0.050211673156867795 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.53, + "acc_stderr": 0.05016135580465919, + "acc_norm": 0.53, + "acc_norm_stderr": 0.05016135580465919 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.29, + "acc_stderr": 0.04560480215720684, + "acc_norm": 0.29, + "acc_norm_stderr": 0.04560480215720684 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.6416184971098265, + "acc_stderr": 0.036563436533531585, + "acc_norm": 0.6416184971098265, + "acc_norm_stderr": 0.036563436533531585 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.4019607843137255, + "acc_stderr": 0.04878608714466996, + "acc_norm": 0.4019607843137255, + "acc_norm_stderr": 0.04878608714466996 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.74, + "acc_stderr": 0.04408440022768078, + "acc_norm": 0.74, + "acc_norm_stderr": 0.04408440022768078 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.574468085106383, + "acc_stderr": 0.03232146916224468, + "acc_norm": 0.574468085106383, + "acc_norm_stderr": 0.03232146916224468 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.47368421052631576, + "acc_stderr": 0.04697085136647863, + "acc_norm": 0.47368421052631576, + "acc_norm_stderr": 0.04697085136647863 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.5724137931034483, + "acc_stderr": 0.04122737111370333, + "acc_norm": 0.5724137931034483, + "acc_norm_stderr": 0.04122737111370333 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.4074074074074074, + "acc_stderr": 0.02530590624159063, + "acc_norm": 0.4074074074074074, + "acc_norm_stderr": 0.02530590624159063 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.4603174603174603, + "acc_stderr": 0.04458029125470973, + "acc_norm": 0.4603174603174603, + "acc_norm_stderr": 0.04458029125470973 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.36, + "acc_stderr": 0.04824181513244218, + "acc_norm": 0.36, + "acc_norm_stderr": 0.04824181513244218 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.7838709677419354, + "acc_stderr": 0.02341529343356853, + "acc_norm": 0.7838709677419354, + "acc_norm_stderr": 0.02341529343356853 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.4975369458128079, + "acc_stderr": 0.03517945038691063, + "acc_norm": 0.4975369458128079, + "acc_norm_stderr": 0.03517945038691063 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.7, + "acc_stderr": 0.046056618647183814, + "acc_norm": 0.7, + "acc_norm_stderr": 0.046056618647183814 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.7757575757575758, + "acc_stderr": 0.032568666616811015, + "acc_norm": 0.7757575757575758, + "acc_norm_stderr": 0.032568666616811015 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.7878787878787878, + "acc_stderr": 0.029126522834586815, + "acc_norm": 0.7878787878787878, + "acc_norm_stderr": 0.029126522834586815 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.8911917098445595, + "acc_stderr": 0.022473253332768776, + "acc_norm": 0.8911917098445595, + "acc_norm_stderr": 
0.022473253332768776 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.6641025641025641, + "acc_stderr": 0.023946724741563976, + "acc_norm": 0.6641025641025641, + "acc_norm_stderr": 0.023946724741563976 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.3296296296296296, + "acc_stderr": 0.02866120111652457, + "acc_norm": 0.3296296296296296, + "acc_norm_stderr": 0.02866120111652457 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.7016806722689075, + "acc_stderr": 0.02971914287634286, + "acc_norm": 0.7016806722689075, + "acc_norm_stderr": 0.02971914287634286 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.33112582781456956, + "acc_stderr": 0.038425817186598696, + "acc_norm": 0.33112582781456956, + "acc_norm_stderr": 0.038425817186598696 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.8477064220183487, + "acc_stderr": 0.015405084393157074, + "acc_norm": 0.8477064220183487, + "acc_norm_stderr": 0.015405084393157074 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.5416666666666666, + "acc_stderr": 0.03398110890294636, + "acc_norm": 0.5416666666666666, + "acc_norm_stderr": 0.03398110890294636 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.7990196078431373, + "acc_stderr": 0.028125972265654373, + "acc_norm": 0.7990196078431373, + "acc_norm_stderr": 0.028125972265654373 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.7974683544303798, + "acc_stderr": 0.026160568246601443, + "acc_norm": 0.7974683544303798, + "acc_norm_stderr": 0.026160568246601443 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.6816143497757847, + "acc_stderr": 0.03126580522513713, + "acc_norm": 0.6816143497757847, + "acc_norm_stderr": 0.03126580522513713 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.8091603053435115, + "acc_stderr": 0.03446513350752599, + "acc_norm": 0.8091603053435115, + "acc_norm_stderr": 0.03446513350752599 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.7933884297520661, + "acc_stderr": 0.03695980128098824, + "acc_norm": 0.7933884297520661, + "acc_norm_stderr": 0.03695980128098824 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.7870370370370371, + "acc_stderr": 0.0395783547198098, + "acc_norm": 0.7870370370370371, + "acc_norm_stderr": 0.0395783547198098 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.7607361963190185, + "acc_stderr": 0.0335195387952127, + "acc_norm": 0.7607361963190185, + "acc_norm_stderr": 0.0335195387952127 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.4732142857142857, + "acc_stderr": 0.047389751192741546, + "acc_norm": 0.4732142857142857, + "acc_norm_stderr": 0.047389751192741546 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.7669902912621359, + "acc_stderr": 0.04185832598928315, + "acc_norm": 0.7669902912621359, + "acc_norm_stderr": 0.04185832598928315 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.8675213675213675, + "acc_stderr": 0.022209309073165616, + "acc_norm": 0.8675213675213675, + "acc_norm_stderr": 0.022209309073165616 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.69, + "acc_stderr": 0.04648231987117316, + "acc_norm": 0.69, + "acc_norm_stderr": 0.04648231987117316 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.8173690932311622, + "acc_stderr": 0.013816335389973138, + "acc_norm": 0.8173690932311622, + "acc_norm_stderr": 0.013816335389973138 + }, + 
"harness|hendrycksTest-moral_disputes|5": { + "acc": 0.7196531791907514, + "acc_stderr": 0.024182427496577615, + "acc_norm": 0.7196531791907514, + "acc_norm_stderr": 0.024182427496577615 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.4, + "acc_stderr": 0.01638463841038082, + "acc_norm": 0.4, + "acc_norm_stderr": 0.01638463841038082 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.7549019607843137, + "acc_stderr": 0.024630048979824782, + "acc_norm": 0.7549019607843137, + "acc_norm_stderr": 0.024630048979824782 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.7041800643086816, + "acc_stderr": 0.025922371788818763, + "acc_norm": 0.7041800643086816, + "acc_norm_stderr": 0.025922371788818763 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.7530864197530864, + "acc_stderr": 0.023993501709042103, + "acc_norm": 0.7530864197530864, + "acc_norm_stderr": 0.023993501709042103 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.4858156028368794, + "acc_stderr": 0.02981549448368206, + "acc_norm": 0.4858156028368794, + "acc_norm_stderr": 0.02981549448368206 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.4602346805736636, + "acc_stderr": 0.012729785386598568, + "acc_norm": 0.4602346805736636, + "acc_norm_stderr": 0.012729785386598568 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.6838235294117647, + "acc_stderr": 0.02824568739146293, + "acc_norm": 0.6838235294117647, + "acc_norm_stderr": 0.02824568739146293 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.6666666666666666, + "acc_stderr": 0.0190709855896875, + "acc_norm": 0.6666666666666666, + "acc_norm_stderr": 0.0190709855896875 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.6545454545454545, + "acc_stderr": 0.04554619617541054, + "acc_norm": 0.6545454545454545, + "acc_norm_stderr": 0.04554619617541054 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.7306122448979592, + "acc_stderr": 0.02840125202902294, + "acc_norm": 0.7306122448979592, + "acc_norm_stderr": 0.02840125202902294 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.8507462686567164, + "acc_stderr": 0.025196929874827075, + "acc_norm": 0.8507462686567164, + "acc_norm_stderr": 0.025196929874827075 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.83, + "acc_stderr": 0.0377525168068637, + "acc_norm": 0.83, + "acc_norm_stderr": 0.0377525168068637 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.5240963855421686, + "acc_stderr": 0.03887971849597264, + "acc_norm": 0.5240963855421686, + "acc_norm_stderr": 0.03887971849597264 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.8304093567251462, + "acc_stderr": 0.02878210810540171, + "acc_norm": 0.8304093567251462, + "acc_norm_stderr": 0.02878210810540171 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.4504283965728274, + "mc1_stderr": 0.017417264371967646, + "mc2": 0.6104827225746667, + "mc2_stderr": 0.014972794318436832 + }, + "harness|winogrande|5": { + "acc": 0.8003157063930545, + "acc_stderr": 0.011235328382625849 + }, + "harness|gsm8k|5": { + "acc": 0.6307808946171342, + "acc_stderr": 0.013293019538066244 + }, + "all": { + "acc": 0.644055937606071, + "acc_stderr": 0.032184807364406556, + "acc_norm": 0.6454677507073991, + "acc_norm_stderr": 0.03283460519387843, + "mc1": 0.4504283965728274, + "mc1_stderr": 0.017417264371967646, + "mc2": 0.6104827225746667, + "mc2_stderr": 0.014972794318436832 + } + }, + "versions": { + "all": 0, + "harness|arc:challenge|25": 
0, + "harness|gsm8k|5": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "harness|winogrande|5": 0 + }, + "config_tasks": { + "harness|arc:challenge": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness 
task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|arc:challenge|25": { + "hashes": { + "hash_examples": "17b0cae357c0259e", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "9bcd0d1d37471713", + "hash_cont_tokens": 
"289aa98c400841d8" + }, + "truncated": 0, + "non_truncated": 1172, + "padded": 4670, + "non_padded": 17, + "effective_few_shots": 25.0, + "num_truncated_few_shots": 0 + }, + "harness|hellaswag|10": { + "hashes": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "80b8c6d79740318e", + "hash_cont_tokens": "ac460260c3e6efc9" + }, + "truncated": 0, + "non_truncated": 10042, + "padded": 40101, + "non_padded": 67, + "effective_few_shots": 10.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hashes": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "b813d36287c6556c", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-anatomy|5": { + "hashes": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "09dc2380497f7a47", + "hash_cont_tokens": "a52a4f60d98cbe5c" + }, + "truncated": 0, + "non_truncated": 135, + "padded": 540, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-astronomy|5": { + "hashes": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "68ca3220b0fdd1f3", + "hash_cont_tokens": "10f7d8eeba97841d" + }, + "truncated": 0, + "non_truncated": 152, + "padded": 608, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-business_ethics|5": { + "hashes": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "bd14ef1320de241e", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hashes": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "d96186ab98017c43", + "hash_cont_tokens": "edef9975ba9165b5" + }, + "truncated": 0, + "non_truncated": 265, + "padded": 1060, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_biology|5": { + "hashes": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "424136b34e95b200", + "hash_cont_tokens": "0aa103ec6602280b" + }, + "truncated": 0, + "non_truncated": 144, + "padded": 576, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_chemistry|5": { + "hashes": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "8dd8b80e336bbe54", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_computer_science|5": { + "hashes": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "145d4cef8ca2261d", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + 
"harness|hendrycksTest-college_mathematics|5": { + "hashes": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "561995d32d2b25c4", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_medicine|5": { + "hashes": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "6a258a9d4418599c", + "hash_cont_tokens": "1979021dbc698754" + }, + "truncated": 0, + "non_truncated": 173, + "padded": 692, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_physics|5": { + "hashes": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "fa5e0d5b5f97b66a", + "hash_cont_tokens": "7cf7fe2bab00acbd" + }, + "truncated": 0, + "non_truncated": 102, + "padded": 408, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-computer_security|5": { + "hashes": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "07d27397edfae492", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hashes": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "da5e6c3c8eb17da6", + "hash_cont_tokens": "903f64eed2b0d217" + }, + "truncated": 0, + "non_truncated": 235, + "padded": 940, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-econometrics|5": { + "hashes": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "f6ba8e358bdb523e", + "hash_cont_tokens": "721ae6c5302c4bf2" + }, + "truncated": 0, + "non_truncated": 114, + "padded": 456, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hashes": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "b2459da4c5ca8590", + "hash_cont_tokens": "15a738960ed3e587" + }, + "truncated": 0, + "non_truncated": 145, + "padded": 575, + "non_padded": 5, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hashes": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "0b969d9ad706a13a", + "hash_cont_tokens": "c96470462fc71683" + }, + "truncated": 0, + "non_truncated": 378, + "padded": 1512, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-formal_logic|5": { + "hashes": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "02bc3eb5f90da86e", + "hash_cont_tokens": "0e1ce025c9d6ee7e" + }, + "truncated": 0, + "non_truncated": 126, + "padded": 504, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-global_facts|5": { + "hashes": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + 
"hash_input_tokens": "3d5106918bcbeb43", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_biology|5": { + "hashes": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "7b089392db2dabbd", + "hash_cont_tokens": "e34d57f7d3c4ca16" + }, + "truncated": 0, + "non_truncated": 310, + "padded": 1240, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hashes": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "ba90b2ffed1c067d", + "hash_cont_tokens": "e8482d44df4b3740" + }, + "truncated": 0, + "non_truncated": 203, + "padded": 812, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hashes": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "60eeec309ef0717f", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hashes": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "5e5e8bf3808e0ead", + "hash_cont_tokens": "d63e679a49418339" + }, + "truncated": 0, + "non_truncated": 165, + "padded": 656, + "non_padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_geography|5": { + "hashes": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "4da9b741d4e7ea78", + "hash_cont_tokens": "d78483e286d06f1a" + }, + "truncated": 0, + "non_truncated": 198, + "padded": 792, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hashes": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "acb4bc872ac86ed7", + "hash_cont_tokens": "691cdff71ff5fe57" + }, + "truncated": 0, + "non_truncated": 193, + "padded": 772, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hashes": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "840fc6403eb69ab0", + "hash_cont_tokens": "d5ad4c5bdca967ad" + }, + "truncated": 0, + "non_truncated": 390, + "padded": 1560, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hashes": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "3629a7f2cd17faeb", + "hash_cont_tokens": "8f631ca5687dd0d4" + }, + "truncated": 0, + "non_truncated": 270, + "padded": 1080, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hashes": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "6846f684260e3997", + "hash_cont_tokens": 
"7321048a28451473" + }, + "truncated": 0, + "non_truncated": 238, + "padded": 952, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_physics|5": { + "hashes": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "85aee25d6bdad94a", + "hash_cont_tokens": "bb137581f269861c" + }, + "truncated": 0, + "non_truncated": 151, + "padded": 604, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hashes": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "290b66d6d666a35f", + "hash_cont_tokens": "b455cab2675bd863" + }, + "truncated": 0, + "non_truncated": 545, + "padded": 2180, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hashes": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "a77a7668b437bc82", + "hash_cont_tokens": "1b3196fec7e58037" + }, + "truncated": 0, + "non_truncated": 216, + "padded": 864, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hashes": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "63548c7fa9ba7a78", + "hash_cont_tokens": "a331dedc2aa01b3e" + }, + "truncated": 0, + "non_truncated": 204, + "padded": 816, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hashes": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "83c5da18bfa50812", + "hash_cont_tokens": "d0fbe030b8c8c2bf" + }, + "truncated": 0, + "non_truncated": 237, + "padded": 948, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_aging|5": { + "hashes": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "bebbd11f22006685", + "hash_cont_tokens": "1dd29c3755494850" + }, + "truncated": 0, + "non_truncated": 223, + "padded": 892, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_sexuality|5": { + "hashes": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "7b85ee9b8ee54f4f", + "hash_cont_tokens": "c85573f663c10691" + }, + "truncated": 0, + "non_truncated": 131, + "padded": 524, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-international_law|5": { + "hashes": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "7bfc55ab7065943e", + "hash_cont_tokens": "d263804ba918154f" + }, + "truncated": 0, + "non_truncated": 121, + "padded": 484, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-jurisprudence|5": { + "hashes": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "69573f1675e053c6", + "hash_cont_tokens": "581986691a84ece8" + }, + "truncated": 0, + "non_truncated": 108, + "padded": 432, + "non_padded": 0, + "effective_few_shots": 5.0, + 
"num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hashes": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "552324ef20094bdc", + "hash_cont_tokens": "55a858b28bbda458" + }, + "truncated": 0, + "non_truncated": 163, + "padded": 652, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-machine_learning|5": { + "hashes": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "96449357a7318905", + "hash_cont_tokens": "e99d3d3efd4ac7a3" + }, + "truncated": 0, + "non_truncated": 112, + "padded": 448, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-management|5": { + "hashes": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "3b849249168e3b88", + "hash_cont_tokens": "13d9dc56bca34726" + }, + "truncated": 0, + "non_truncated": 103, + "padded": 412, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-marketing|5": { + "hashes": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "af0e186f2756b70d", + "hash_cont_tokens": "2700ea26933916a2" + }, + "truncated": 0, + "non_truncated": 234, + "padded": 936, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-medical_genetics|5": { + "hashes": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "9f6a6de16509b6d9", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-miscellaneous|5": { + "hashes": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "9194406d589f7c10", + "hash_cont_tokens": "7bf4341c79587250" + }, + "truncated": 0, + "non_truncated": 783, + "padded": 3132, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_disputes|5": { + "hashes": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "769486efc74d9f8e", + "hash_cont_tokens": "38a48e9de6976f00" + }, + "truncated": 0, + "non_truncated": 346, + "padded": 1384, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hashes": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "a90fd4dd90959dad", + "hash_cont_tokens": "761c4dc187689d89" + }, + "truncated": 0, + "non_truncated": 895, + "padded": 3580, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-nutrition|5": { + "hashes": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "1a3b843e66efd29b", + "hash_cont_tokens": "65005bd7d6f6012a" + }, + "truncated": 0, + "non_truncated": 306, + "padded": 1224, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-philosophy|5": { + "hashes": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + 
"hash_input_tokens": "09820001a3d00013", + "hash_cont_tokens": "0b47934fb6314dec" + }, + "truncated": 0, + "non_truncated": 311, + "padded": 1244, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-prehistory|5": { + "hashes": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "7c4ec364ce2768c7", + "hash_cont_tokens": "3f20acd855ee0a29" + }, + "truncated": 0, + "non_truncated": 324, + "padded": 1296, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_accounting|5": { + "hashes": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "ced0534574d0ae3f", + "hash_cont_tokens": "8f122ba881355d4b" + }, + "truncated": 0, + "non_truncated": 282, + "padded": 1128, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_law|5": { + "hashes": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "bcbdbbde22ec73e3", + "hash_cont_tokens": "90d5df417c4d3fd3" + }, + "truncated": 0, + "non_truncated": 1534, + "padded": 6136, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_medicine|5": { + "hashes": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "c54d753563114d45", + "hash_cont_tokens": "4a2d2988884f7f70" + }, + "truncated": 0, + "non_truncated": 272, + "padded": 1088, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_psychology|5": { + "hashes": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "b75dc55c0e32fa52", + "hash_cont_tokens": "e0a952cb8a9c81de" + }, + "truncated": 0, + "non_truncated": 612, + "padded": 2448, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-public_relations|5": { + "hashes": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "5ccdc8ec8db99622", + "hash_cont_tokens": "1fa77a8dff3922b8" + }, + "truncated": 0, + "non_truncated": 110, + "padded": 440, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-security_studies|5": { + "hashes": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "ca8497342e5b1d57", + "hash_cont_tokens": "81fc9cb3cbdd52db" + }, + "truncated": 0, + "non_truncated": 245, + "padded": 980, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-sociology|5": { + "hashes": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "069c76424fbd3dab", + "hash_cont_tokens": "2a0493252ed2cf43" + }, + "truncated": 0, + "non_truncated": 201, + "padded": 804, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hashes": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "a7e393a626169576", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + 
"non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-virology|5": { + "hashes": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "bf99dc973e3a650d", + "hash_cont_tokens": "5ab892d003b00c98" + }, + "truncated": 0, + "non_truncated": 166, + "padded": 664, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-world_religions|5": { + "hashes": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "1761cfaf21797065", + "hash_cont_tokens": "15a5e5dbdfbb8568" + }, + "truncated": 0, + "non_truncated": 171, + "padded": 684, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|truthfulqa:mc|0": { + "hashes": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "298b43914bbdf4ca", + "hash_cont_tokens": "5a8d4bb398b1c3c0" + }, + "truncated": 0, + "non_truncated": 817, + "padded": 9996, + "non_padded": 0, + "effective_few_shots": 0.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "31aa3477d959f771", + "hash_cont_tokens": "618558fb93c0f288" + }, + "truncated": 0, + "non_truncated": 1267, + "padded": 2534, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "6af0ae8cfe684f50", + "hash_cont_tokens": "5dc10707efcbc6d6" + }, + "truncated": 0, + "non_truncated": 1319, + "padded": 0, + "non_padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "3b7fa57a057f9415", + "hash_full_prompts": "63615fc50fc9417c", + "hash_input_tokens": "9c04e828ae29cacc", + "hash_cont_tokens": "cc4ffdafcc128c14" + }, + "truncated": 0, + "non_truncated": 28659, + "padded": 113460, + "non_padded": 1412, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/PulsarAI/OpenHermes-2.5-neural-chat-v3-3-Slerp/results_2023-12-10T01-51-52.298552.json b/eval-results/PulsarAI/OpenHermes-2.5-neural-chat-v3-3-Slerp/results_2023-12-10T01-51-52.298552.json new file mode 100644 index 0000000000000000000000000000000000000000..aa292cfeaa5a60540dfa1d10c467627e241be24c --- /dev/null +++ b/eval-results/PulsarAI/OpenHermes-2.5-neural-chat-v3-3-Slerp/results_2023-12-10T01-51-52.298552.json @@ -0,0 +1,1409 @@ +{ + "config_general": { + "lighteval_sha": "0e4607eff593f6f842aeaa0e5fa6760f58b9d1e9", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "", + "start_time": 620421.133237178, + "end_time": 627982.465024655, + "total_evaluation_time_secondes": "7561.331787477015", + "model_name": "PulsarAI/OpenHermes-2.5-neural-chat-v3-3-Slerp", + "model_sha": "91f18df3f5c3d36f1293086113f810f662970449", + "model_dtype": "torch.bfloat16", + "model_size": "13.99 GB" + }, + "results": { + "harness|arc:challenge|25": { + "acc": 0.6450511945392492, + "acc_stderr": 0.013983036904094092, + "acc_norm": 0.6808873720136519, + "acc_norm_stderr": 0.013621696119173311 + }, + "harness|hellaswag|10": { + "acc": 0.667894841665007, + "acc_stderr": 0.00470005967137464, + 
"acc_norm": 0.861979685321649, + "acc_norm_stderr": 0.0034421638433628794 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.31, + "acc_stderr": 0.04648231987117316, + "acc_norm": 0.31, + "acc_norm_stderr": 0.04648231987117316 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.6370370370370371, + "acc_stderr": 0.04153948404742398, + "acc_norm": 0.6370370370370371, + "acc_norm_stderr": 0.04153948404742398 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.6776315789473685, + "acc_stderr": 0.03803510248351585, + "acc_norm": 0.6776315789473685, + "acc_norm_stderr": 0.03803510248351585 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.61, + "acc_stderr": 0.04902071300001975, + "acc_norm": 0.61, + "acc_norm_stderr": 0.04902071300001975 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.7094339622641509, + "acc_stderr": 0.027943219989337142, + "acc_norm": 0.7094339622641509, + "acc_norm_stderr": 0.027943219989337142 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.7708333333333334, + "acc_stderr": 0.03514697467862388, + "acc_norm": 0.7708333333333334, + "acc_norm_stderr": 0.03514697467862388 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.44, + "acc_stderr": 0.04988876515698589, + "acc_norm": 0.44, + "acc_norm_stderr": 0.04988876515698589 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.54, + "acc_stderr": 0.05009082659620332, + "acc_norm": 0.54, + "acc_norm_stderr": 0.05009082659620332 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.28, + "acc_stderr": 0.04512608598542127, + "acc_norm": 0.28, + "acc_norm_stderr": 0.04512608598542127 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.6242774566473989, + "acc_stderr": 0.036928207672648664, + "acc_norm": 0.6242774566473989, + "acc_norm_stderr": 0.036928207672648664 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.4215686274509804, + "acc_stderr": 0.04913595201274498, + "acc_norm": 0.4215686274509804, + "acc_norm_stderr": 0.04913595201274498 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.74, + "acc_stderr": 0.044084400227680794, + "acc_norm": 0.74, + "acc_norm_stderr": 0.044084400227680794 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.5531914893617021, + "acc_stderr": 0.0325005368436584, + "acc_norm": 0.5531914893617021, + "acc_norm_stderr": 0.0325005368436584 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.47368421052631576, + "acc_stderr": 0.046970851366478626, + "acc_norm": 0.47368421052631576, + "acc_norm_stderr": 0.046970851366478626 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.5724137931034483, + "acc_stderr": 0.04122737111370333, + "acc_norm": 0.5724137931034483, + "acc_norm_stderr": 0.04122737111370333 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.42328042328042326, + "acc_stderr": 0.025446365634406796, + "acc_norm": 0.42328042328042326, + "acc_norm_stderr": 0.025446365634406796 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.42063492063492064, + "acc_stderr": 0.04415438226743744, + "acc_norm": 0.42063492063492064, + "acc_norm_stderr": 0.04415438226743744 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.34, + "acc_stderr": 0.04760952285695235, + "acc_norm": 0.34, + "acc_norm_stderr": 0.04760952285695235 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.7806451612903226, + "acc_stderr": 0.023540799358723285, + "acc_norm": 0.7806451612903226, + 
"acc_norm_stderr": 0.023540799358723285 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.49261083743842365, + "acc_stderr": 0.035176035403610084, + "acc_norm": 0.49261083743842365, + "acc_norm_stderr": 0.035176035403610084 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.71, + "acc_stderr": 0.045604802157206845, + "acc_norm": 0.71, + "acc_norm_stderr": 0.045604802157206845 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.7818181818181819, + "acc_stderr": 0.03225078108306289, + "acc_norm": 0.7818181818181819, + "acc_norm_stderr": 0.03225078108306289 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.7878787878787878, + "acc_stderr": 0.02912652283458682, + "acc_norm": 0.7878787878787878, + "acc_norm_stderr": 0.02912652283458682 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.9067357512953368, + "acc_stderr": 0.02098685459328973, + "acc_norm": 0.9067357512953368, + "acc_norm_stderr": 0.02098685459328973 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.6487179487179487, + "acc_stderr": 0.024203665177902803, + "acc_norm": 0.6487179487179487, + "acc_norm_stderr": 0.024203665177902803 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.35185185185185186, + "acc_stderr": 0.02911661760608301, + "acc_norm": 0.35185185185185186, + "acc_norm_stderr": 0.02911661760608301 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.6890756302521008, + "acc_stderr": 0.03006676158297793, + "acc_norm": 0.6890756302521008, + "acc_norm_stderr": 0.03006676158297793 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.33112582781456956, + "acc_stderr": 0.038425817186598696, + "acc_norm": 0.33112582781456956, + "acc_norm_stderr": 0.038425817186598696 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.8495412844036697, + "acc_stderr": 0.015328563932669237, + "acc_norm": 0.8495412844036697, + "acc_norm_stderr": 0.015328563932669237 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.5277777777777778, + "acc_stderr": 0.0340470532865388, + "acc_norm": 0.5277777777777778, + "acc_norm_stderr": 0.0340470532865388 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.7941176470588235, + "acc_stderr": 0.028379449451588663, + "acc_norm": 0.7941176470588235, + "acc_norm_stderr": 0.028379449451588663 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.7974683544303798, + "acc_stderr": 0.026160568246601432, + "acc_norm": 0.7974683544303798, + "acc_norm_stderr": 0.026160568246601432 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.6860986547085202, + "acc_stderr": 0.031146796482972465, + "acc_norm": 0.6860986547085202, + "acc_norm_stderr": 0.031146796482972465 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.8091603053435115, + "acc_stderr": 0.03446513350752599, + "acc_norm": 0.8091603053435115, + "acc_norm_stderr": 0.03446513350752599 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.8181818181818182, + "acc_stderr": 0.03520893951097654, + "acc_norm": 0.8181818181818182, + "acc_norm_stderr": 0.03520893951097654 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.7592592592592593, + "acc_stderr": 0.04133119440243839, + "acc_norm": 0.7592592592592593, + "acc_norm_stderr": 0.04133119440243839 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.7607361963190185, + "acc_stderr": 0.0335195387952127, + 
"acc_norm": 0.7607361963190185, + "acc_norm_stderr": 0.0335195387952127 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.48214285714285715, + "acc_stderr": 0.047427623612430116, + "acc_norm": 0.48214285714285715, + "acc_norm_stderr": 0.047427623612430116 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.7669902912621359, + "acc_stderr": 0.04185832598928315, + "acc_norm": 0.7669902912621359, + "acc_norm_stderr": 0.04185832598928315 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.8803418803418803, + "acc_stderr": 0.021262719400406964, + "acc_norm": 0.8803418803418803, + "acc_norm_stderr": 0.021262719400406964 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.74, + "acc_stderr": 0.04408440022768079, + "acc_norm": 0.74, + "acc_norm_stderr": 0.04408440022768079 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.8275862068965517, + "acc_stderr": 0.013507943909371803, + "acc_norm": 0.8275862068965517, + "acc_norm_stderr": 0.013507943909371803 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.7225433526011561, + "acc_stderr": 0.02410571260775431, + "acc_norm": 0.7225433526011561, + "acc_norm_stderr": 0.02410571260775431 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.3675977653631285, + "acc_stderr": 0.016125543823552954, + "acc_norm": 0.3675977653631285, + "acc_norm_stderr": 0.016125543823552954 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.7483660130718954, + "acc_stderr": 0.024848018263875192, + "acc_norm": 0.7483660130718954, + "acc_norm_stderr": 0.024848018263875192 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.6881028938906752, + "acc_stderr": 0.02631185807185416, + "acc_norm": 0.6881028938906752, + "acc_norm_stderr": 0.02631185807185416 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.7469135802469136, + "acc_stderr": 0.024191808600712995, + "acc_norm": 0.7469135802469136, + "acc_norm_stderr": 0.024191808600712995 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.48226950354609927, + "acc_stderr": 0.02980873964223777, + "acc_norm": 0.48226950354609927, + "acc_norm_stderr": 0.02980873964223777 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.4556714471968709, + "acc_stderr": 0.012719949543032197, + "acc_norm": 0.4556714471968709, + "acc_norm_stderr": 0.012719949543032197 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.6875, + "acc_stderr": 0.02815637344037142, + "acc_norm": 0.6875, + "acc_norm_stderr": 0.02815637344037142 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.6715686274509803, + "acc_stderr": 0.01899970738316268, + "acc_norm": 0.6715686274509803, + "acc_norm_stderr": 0.01899970738316268 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.6909090909090909, + "acc_stderr": 0.044262946482000985, + "acc_norm": 0.6909090909090909, + "acc_norm_stderr": 0.044262946482000985 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.7224489795918367, + "acc_stderr": 0.028666857790274645, + "acc_norm": 0.7224489795918367, + "acc_norm_stderr": 0.028666857790274645 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.8756218905472637, + "acc_stderr": 0.023335401790166327, + "acc_norm": 0.8756218905472637, + "acc_norm_stderr": 0.023335401790166327 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.87, + "acc_stderr": 0.03379976689896309, + "acc_norm": 0.87, + "acc_norm_stderr": 0.03379976689896309 + }, + "harness|hendrycksTest-virology|5": { + "acc": 
0.5542168674698795, + "acc_stderr": 0.03869543323472101, + "acc_norm": 0.5542168674698795, + "acc_norm_stderr": 0.03869543323472101 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.8187134502923976, + "acc_stderr": 0.029547741687640044, + "acc_norm": 0.8187134502923976, + "acc_norm_stderr": 0.029547741687640044 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.4602203182374541, + "mc1_stderr": 0.017448017223960884, + "mc2": 0.627788323256757, + "mc2_stderr": 0.014997858897015229 + }, + "harness|winogrande|5": { + "acc": 0.7916337805840569, + "acc_stderr": 0.011414554399987726 + }, + "harness|gsm8k|5": { + "acc": 0.6777862016679302, + "acc_stderr": 0.012872435481188778 + }, + "all": { + "acc": 0.6460435872902499, + "acc_stderr": 0.03203449074198557, + "acc_norm": 0.6469349129421068, + "acc_norm_stderr": 0.032681317097745945, + "mc1": 0.4602203182374541, + "mc1_stderr": 0.017448017223960884, + "mc2": 0.627788323256757, + "mc2_stderr": 0.014997858897015229 + } + }, + "versions": { + "all": 0, + "harness|arc:challenge|25": 0, + "harness|gsm8k|5": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + 
"harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "harness|winogrande|5": 0 + }, + "config_tasks": { + "harness|arc:challenge": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness 
task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|arc:challenge|25": { + "hashes": { + "hash_examples": "17b0cae357c0259e", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "9bcd0d1d37471713", + "hash_cont_tokens": "289aa98c400841d8" + }, + "truncated": 0, + "non_truncated": 1172, + "padded": 4670, + "non_padded": 17, + "effective_few_shots": 25.0, + "num_truncated_few_shots": 0 + }, + "harness|hellaswag|10": { + "hashes": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "80b8c6d79740318e", + "hash_cont_tokens": "ac460260c3e6efc9" + }, + "truncated": 0, + "non_truncated": 10042, + "padded": 40101, + "non_padded": 67, + "effective_few_shots": 10.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hashes": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "b813d36287c6556c", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-anatomy|5": { + "hashes": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "09dc2380497f7a47", + "hash_cont_tokens": "a52a4f60d98cbe5c" + }, + "truncated": 0, + "non_truncated": 135, + "padded": 540, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-astronomy|5": { + "hashes": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "68ca3220b0fdd1f3", + "hash_cont_tokens": "10f7d8eeba97841d" + }, + "truncated": 0, + "non_truncated": 152, + "padded": 608, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-business_ethics|5": { + "hashes": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "bd14ef1320de241e", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hashes": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "d96186ab98017c43", + "hash_cont_tokens": "edef9975ba9165b5" + }, + "truncated": 0, + "non_truncated": 265, + "padded": 1060, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_biology|5": { + "hashes": { + "hash_examples": 
"ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "424136b34e95b200", + "hash_cont_tokens": "0aa103ec6602280b" + }, + "truncated": 0, + "non_truncated": 144, + "padded": 576, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_chemistry|5": { + "hashes": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "8dd8b80e336bbe54", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_computer_science|5": { + "hashes": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "145d4cef8ca2261d", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_mathematics|5": { + "hashes": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "561995d32d2b25c4", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_medicine|5": { + "hashes": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "6a258a9d4418599c", + "hash_cont_tokens": "1979021dbc698754" + }, + "truncated": 0, + "non_truncated": 173, + "padded": 692, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_physics|5": { + "hashes": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "fa5e0d5b5f97b66a", + "hash_cont_tokens": "7cf7fe2bab00acbd" + }, + "truncated": 0, + "non_truncated": 102, + "padded": 408, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-computer_security|5": { + "hashes": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "07d27397edfae492", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hashes": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "da5e6c3c8eb17da6", + "hash_cont_tokens": "903f64eed2b0d217" + }, + "truncated": 0, + "non_truncated": 235, + "padded": 940, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-econometrics|5": { + "hashes": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "f6ba8e358bdb523e", + "hash_cont_tokens": "721ae6c5302c4bf2" + }, + "truncated": 0, + "non_truncated": 114, + "padded": 456, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hashes": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "b2459da4c5ca8590", + "hash_cont_tokens": 
"15a738960ed3e587" + }, + "truncated": 0, + "non_truncated": 145, + "padded": 575, + "non_padded": 5, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hashes": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "0b969d9ad706a13a", + "hash_cont_tokens": "c96470462fc71683" + }, + "truncated": 0, + "non_truncated": 378, + "padded": 1512, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-formal_logic|5": { + "hashes": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "02bc3eb5f90da86e", + "hash_cont_tokens": "0e1ce025c9d6ee7e" + }, + "truncated": 0, + "non_truncated": 126, + "padded": 504, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-global_facts|5": { + "hashes": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "3d5106918bcbeb43", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_biology|5": { + "hashes": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "7b089392db2dabbd", + "hash_cont_tokens": "e34d57f7d3c4ca16" + }, + "truncated": 0, + "non_truncated": 310, + "padded": 1240, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hashes": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "ba90b2ffed1c067d", + "hash_cont_tokens": "e8482d44df4b3740" + }, + "truncated": 0, + "non_truncated": 203, + "padded": 812, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hashes": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "60eeec309ef0717f", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hashes": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "5e5e8bf3808e0ead", + "hash_cont_tokens": "d63e679a49418339" + }, + "truncated": 0, + "non_truncated": 165, + "padded": 656, + "non_padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_geography|5": { + "hashes": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "4da9b741d4e7ea78", + "hash_cont_tokens": "d78483e286d06f1a" + }, + "truncated": 0, + "non_truncated": 198, + "padded": 792, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hashes": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "acb4bc872ac86ed7", + "hash_cont_tokens": "691cdff71ff5fe57" + }, + "truncated": 0, + "non_truncated": 193, + "padded": 772, + "non_padded": 0, + 
"effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hashes": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "840fc6403eb69ab0", + "hash_cont_tokens": "d5ad4c5bdca967ad" + }, + "truncated": 0, + "non_truncated": 390, + "padded": 1560, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hashes": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "3629a7f2cd17faeb", + "hash_cont_tokens": "8f631ca5687dd0d4" + }, + "truncated": 0, + "non_truncated": 270, + "padded": 1080, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hashes": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "6846f684260e3997", + "hash_cont_tokens": "7321048a28451473" + }, + "truncated": 0, + "non_truncated": 238, + "padded": 952, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_physics|5": { + "hashes": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "85aee25d6bdad94a", + "hash_cont_tokens": "bb137581f269861c" + }, + "truncated": 0, + "non_truncated": 151, + "padded": 604, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hashes": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "290b66d6d666a35f", + "hash_cont_tokens": "b455cab2675bd863" + }, + "truncated": 0, + "non_truncated": 545, + "padded": 2180, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hashes": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "a77a7668b437bc82", + "hash_cont_tokens": "1b3196fec7e58037" + }, + "truncated": 0, + "non_truncated": 216, + "padded": 864, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hashes": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "63548c7fa9ba7a78", + "hash_cont_tokens": "a331dedc2aa01b3e" + }, + "truncated": 0, + "non_truncated": 204, + "padded": 816, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hashes": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "83c5da18bfa50812", + "hash_cont_tokens": "d0fbe030b8c8c2bf" + }, + "truncated": 0, + "non_truncated": 237, + "padded": 948, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_aging|5": { + "hashes": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "bebbd11f22006685", + "hash_cont_tokens": "1dd29c3755494850" + }, + "truncated": 0, + "non_truncated": 223, + "padded": 892, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + 
"harness|hendrycksTest-human_sexuality|5": { + "hashes": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "7b85ee9b8ee54f4f", + "hash_cont_tokens": "c85573f663c10691" + }, + "truncated": 0, + "non_truncated": 131, + "padded": 524, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-international_law|5": { + "hashes": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "7bfc55ab7065943e", + "hash_cont_tokens": "d263804ba918154f" + }, + "truncated": 0, + "non_truncated": 121, + "padded": 484, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-jurisprudence|5": { + "hashes": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "69573f1675e053c6", + "hash_cont_tokens": "581986691a84ece8" + }, + "truncated": 0, + "non_truncated": 108, + "padded": 432, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hashes": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "552324ef20094bdc", + "hash_cont_tokens": "55a858b28bbda458" + }, + "truncated": 0, + "non_truncated": 163, + "padded": 652, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-machine_learning|5": { + "hashes": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "96449357a7318905", + "hash_cont_tokens": "e99d3d3efd4ac7a3" + }, + "truncated": 0, + "non_truncated": 112, + "padded": 448, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-management|5": { + "hashes": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "3b849249168e3b88", + "hash_cont_tokens": "13d9dc56bca34726" + }, + "truncated": 0, + "non_truncated": 103, + "padded": 412, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-marketing|5": { + "hashes": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "af0e186f2756b70d", + "hash_cont_tokens": "2700ea26933916a2" + }, + "truncated": 0, + "non_truncated": 234, + "padded": 936, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-medical_genetics|5": { + "hashes": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "9f6a6de16509b6d9", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-miscellaneous|5": { + "hashes": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "9194406d589f7c10", + "hash_cont_tokens": "7bf4341c79587250" + }, + "truncated": 0, + "non_truncated": 783, + "padded": 3132, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_disputes|5": { + "hashes": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": 
"769486efc74d9f8e", + "hash_cont_tokens": "38a48e9de6976f00" + }, + "truncated": 0, + "non_truncated": 346, + "padded": 1384, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hashes": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "a90fd4dd90959dad", + "hash_cont_tokens": "761c4dc187689d89" + }, + "truncated": 0, + "non_truncated": 895, + "padded": 3580, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-nutrition|5": { + "hashes": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "1a3b843e66efd29b", + "hash_cont_tokens": "65005bd7d6f6012a" + }, + "truncated": 0, + "non_truncated": 306, + "padded": 1224, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-philosophy|5": { + "hashes": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "09820001a3d00013", + "hash_cont_tokens": "0b47934fb6314dec" + }, + "truncated": 0, + "non_truncated": 311, + "padded": 1244, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-prehistory|5": { + "hashes": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "7c4ec364ce2768c7", + "hash_cont_tokens": "3f20acd855ee0a29" + }, + "truncated": 0, + "non_truncated": 324, + "padded": 1296, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_accounting|5": { + "hashes": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "ced0534574d0ae3f", + "hash_cont_tokens": "8f122ba881355d4b" + }, + "truncated": 0, + "non_truncated": 282, + "padded": 1128, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_law|5": { + "hashes": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "bcbdbbde22ec73e3", + "hash_cont_tokens": "90d5df417c4d3fd3" + }, + "truncated": 0, + "non_truncated": 1534, + "padded": 6136, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_medicine|5": { + "hashes": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "c54d753563114d45", + "hash_cont_tokens": "4a2d2988884f7f70" + }, + "truncated": 0, + "non_truncated": 272, + "padded": 1088, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_psychology|5": { + "hashes": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "b75dc55c0e32fa52", + "hash_cont_tokens": "e0a952cb8a9c81de" + }, + "truncated": 0, + "non_truncated": 612, + "padded": 2448, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-public_relations|5": { + "hashes": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "5ccdc8ec8db99622", + "hash_cont_tokens": "1fa77a8dff3922b8" + }, + "truncated": 0, + "non_truncated": 110, + "padded": 440, + "non_padded": 0, + 
"effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-security_studies|5": { + "hashes": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "ca8497342e5b1d57", + "hash_cont_tokens": "81fc9cb3cbdd52db" + }, + "truncated": 0, + "non_truncated": 245, + "padded": 980, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-sociology|5": { + "hashes": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "069c76424fbd3dab", + "hash_cont_tokens": "2a0493252ed2cf43" + }, + "truncated": 0, + "non_truncated": 201, + "padded": 804, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hashes": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "a7e393a626169576", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-virology|5": { + "hashes": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "bf99dc973e3a650d", + "hash_cont_tokens": "5ab892d003b00c98" + }, + "truncated": 0, + "non_truncated": 166, + "padded": 664, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-world_religions|5": { + "hashes": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "1761cfaf21797065", + "hash_cont_tokens": "15a5e5dbdfbb8568" + }, + "truncated": 0, + "non_truncated": 171, + "padded": 684, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|truthfulqa:mc|0": { + "hashes": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "298b43914bbdf4ca", + "hash_cont_tokens": "5a8d4bb398b1c3c0" + }, + "truncated": 0, + "non_truncated": 817, + "padded": 9996, + "non_padded": 0, + "effective_few_shots": 0.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "31aa3477d959f771", + "hash_cont_tokens": "618558fb93c0f288" + }, + "truncated": 0, + "non_truncated": 1267, + "padded": 2534, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "6af0ae8cfe684f50", + "hash_cont_tokens": "0c6678e9a50265aa" + }, + "truncated": 0, + "non_truncated": 1319, + "padded": 0, + "non_padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "3b7fa57a057f9415", + "hash_full_prompts": "63615fc50fc9417c", + "hash_input_tokens": "9c04e828ae29cacc", + "hash_cont_tokens": "dabd7ede2f85a8dd" + }, + "truncated": 0, + "non_truncated": 28659, + "padded": 113460, + "non_padded": 1412, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/PulsarAI/SlimOpenOrca-Mistral-7B-v2/results_2023-11-12T18-15-51.369317.json 
b/eval-results/PulsarAI/SlimOpenOrca-Mistral-7B-v2/results_2023-11-12T18-15-51.369317.json new file mode 100644 index 0000000000000000000000000000000000000000..e2ea4211a9908a5924f0c23ce58118ff8f802b77 --- /dev/null +++ b/eval-results/PulsarAI/SlimOpenOrca-Mistral-7B-v2/results_2023-11-12T18-15-51.369317.json @@ -0,0 +1,1436 @@ +{ + "config_general": { + "lighteval_sha": "167773f1d5d1647c60dadc31c9e731ab7dbcbbad", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "", + "start_time": 244465.032520747, + "end_time": null, + "total_evaluation_time_secondes": null, + "model_name": "PulsarAI/SlimOpenOrca-Mistral-7B-v2", + "model_sha": "7cd030ccdb169c2685fe028bb4380b91ad74920f", + "model_dtype": "torch.float16", + "model_size": "13.99 GB" + }, + "results": { + "harness|arc:challenge|25": { + "acc": 0.5938566552901023, + "acc_stderr": 0.014351656690097858, + "acc_norm": 0.628839590443686, + "acc_norm_stderr": 0.014117971901142824 + }, + "harness|hellaswag|10": { + "acc": 0.6448914558852819, + "acc_stderr": 0.004775681871529862, + "acc_norm": 0.8340967934674368, + "acc_norm_stderr": 0.003712334763856884 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.28, + "acc_stderr": 0.04512608598542128, + "acc_norm": 0.28, + "acc_norm_stderr": 0.04512608598542128 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.5851851851851851, + "acc_stderr": 0.04256193767901408, + "acc_norm": 0.5851851851851851, + "acc_norm_stderr": 0.04256193767901408 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.6907894736842105, + "acc_stderr": 0.037610708698674805, + "acc_norm": 0.6907894736842105, + "acc_norm_stderr": 0.037610708698674805 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.61, + "acc_stderr": 0.04902071300001975, + "acc_norm": 0.61, + "acc_norm_stderr": 0.04902071300001975 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.6754716981132075, + "acc_stderr": 0.028815615713432108, + "acc_norm": 0.6754716981132075, + "acc_norm_stderr": 0.028815615713432108 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.7361111111111112, + "acc_stderr": 0.03685651095897532, + "acc_norm": 0.7361111111111112, + "acc_norm_stderr": 0.03685651095897532 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.51, + "acc_stderr": 0.05024183937956912, + "acc_norm": 0.51, + "acc_norm_stderr": 0.05024183937956912 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.53, + "acc_stderr": 0.05016135580465919, + "acc_norm": 0.53, + "acc_norm_stderr": 0.05016135580465919 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.37, + "acc_stderr": 0.04852365870939099, + "acc_norm": 0.37, + "acc_norm_stderr": 0.04852365870939099 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.5549132947976878, + "acc_stderr": 0.03789401760283648, + "acc_norm": 0.5549132947976878, + "acc_norm_stderr": 0.03789401760283648 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.38235294117647056, + "acc_stderr": 0.04835503696107223, + "acc_norm": 0.38235294117647056, + "acc_norm_stderr": 0.04835503696107223 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.77, + "acc_stderr": 0.04229525846816505, + "acc_norm": 0.77, + "acc_norm_stderr": 0.04229525846816505 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.5319148936170213, + "acc_stderr": 0.03261936918467381, + "acc_norm": 0.5319148936170213, + "acc_norm_stderr": 0.03261936918467381 + }, + 
"harness|hendrycksTest-econometrics|5": { + "acc": 0.42105263157894735, + "acc_stderr": 0.046446020912223177, + "acc_norm": 0.42105263157894735, + "acc_norm_stderr": 0.046446020912223177 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.5586206896551724, + "acc_stderr": 0.04137931034482758, + "acc_norm": 0.5586206896551724, + "acc_norm_stderr": 0.04137931034482758 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.41534391534391535, + "acc_stderr": 0.02537952491077839, + "acc_norm": 0.41534391534391535, + "acc_norm_stderr": 0.02537952491077839 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.4523809523809524, + "acc_stderr": 0.044518079590553275, + "acc_norm": 0.4523809523809524, + "acc_norm_stderr": 0.044518079590553275 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.27, + "acc_stderr": 0.0446196043338474, + "acc_norm": 0.27, + "acc_norm_stderr": 0.0446196043338474 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.7483870967741936, + "acc_stderr": 0.024685979286239963, + "acc_norm": 0.7483870967741936, + "acc_norm_stderr": 0.024685979286239963 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.4630541871921182, + "acc_stderr": 0.035083705204426656, + "acc_norm": 0.4630541871921182, + "acc_norm_stderr": 0.035083705204426656 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.72, + "acc_stderr": 0.04512608598542127, + "acc_norm": 0.72, + "acc_norm_stderr": 0.04512608598542127 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.7575757575757576, + "acc_stderr": 0.03346409881055953, + "acc_norm": 0.7575757575757576, + "acc_norm_stderr": 0.03346409881055953 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.7575757575757576, + "acc_stderr": 0.030532892233932022, + "acc_norm": 0.7575757575757576, + "acc_norm_stderr": 0.030532892233932022 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.8549222797927462, + "acc_stderr": 0.02541634309630645, + "acc_norm": 0.8549222797927462, + "acc_norm_stderr": 0.02541634309630645 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.5974358974358974, + "acc_stderr": 0.02486499515976775, + "acc_norm": 0.5974358974358974, + "acc_norm_stderr": 0.02486499515976775 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.35555555555555557, + "acc_stderr": 0.029185714949857413, + "acc_norm": 0.35555555555555557, + "acc_norm_stderr": 0.029185714949857413 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.6386554621848739, + "acc_stderr": 0.031204691225150016, + "acc_norm": 0.6386554621848739, + "acc_norm_stderr": 0.031204691225150016 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.271523178807947, + "acc_stderr": 0.03631329803969653, + "acc_norm": 0.271523178807947, + "acc_norm_stderr": 0.03631329803969653 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.8311926605504587, + "acc_stderr": 0.01606005626853035, + "acc_norm": 0.8311926605504587, + "acc_norm_stderr": 0.01606005626853035 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.49074074074074076, + "acc_stderr": 0.034093869469927006, + "acc_norm": 0.49074074074074076, + "acc_norm_stderr": 0.034093869469927006 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.8137254901960784, + "acc_stderr": 0.027325470966716312, + "acc_norm": 0.8137254901960784, + "acc_norm_stderr": 0.027325470966716312 
+ }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.7890295358649789, + "acc_stderr": 0.02655837250266192, + "acc_norm": 0.7890295358649789, + "acc_norm_stderr": 0.02655837250266192 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.6547085201793722, + "acc_stderr": 0.03191100192835794, + "acc_norm": 0.6547085201793722, + "acc_norm_stderr": 0.03191100192835794 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.7633587786259542, + "acc_stderr": 0.03727673575596915, + "acc_norm": 0.7633587786259542, + "acc_norm_stderr": 0.03727673575596915 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.7933884297520661, + "acc_stderr": 0.036959801280988226, + "acc_norm": 0.7933884297520661, + "acc_norm_stderr": 0.036959801280988226 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.7314814814814815, + "acc_stderr": 0.042844679680521934, + "acc_norm": 0.7314814814814815, + "acc_norm_stderr": 0.042844679680521934 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.7423312883435583, + "acc_stderr": 0.03436150827846917, + "acc_norm": 0.7423312883435583, + "acc_norm_stderr": 0.03436150827846917 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.5089285714285714, + "acc_stderr": 0.04745033255489123, + "acc_norm": 0.5089285714285714, + "acc_norm_stderr": 0.04745033255489123 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.7961165048543689, + "acc_stderr": 0.039891398595317706, + "acc_norm": 0.7961165048543689, + "acc_norm_stderr": 0.039891398595317706 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.8547008547008547, + "acc_stderr": 0.023086635086841407, + "acc_norm": 0.8547008547008547, + "acc_norm_stderr": 0.023086635086841407 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.74, + "acc_stderr": 0.04408440022768078, + "acc_norm": 0.74, + "acc_norm_stderr": 0.04408440022768078 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.80970625798212, + "acc_stderr": 0.014036945850381401, + "acc_norm": 0.80970625798212, + "acc_norm_stderr": 0.014036945850381401 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.6878612716763006, + "acc_stderr": 0.024946792225272314, + "acc_norm": 0.6878612716763006, + "acc_norm_stderr": 0.024946792225272314 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.3474860335195531, + "acc_stderr": 0.01592556406020815, + "acc_norm": 0.3474860335195531, + "acc_norm_stderr": 0.01592556406020815 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.6993464052287581, + "acc_stderr": 0.026256053835718964, + "acc_norm": 0.6993464052287581, + "acc_norm_stderr": 0.026256053835718964 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.6655948553054662, + "acc_stderr": 0.026795422327893937, + "acc_norm": 0.6655948553054662, + "acc_norm_stderr": 0.026795422327893937 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.7160493827160493, + "acc_stderr": 0.025089478523765134, + "acc_norm": 0.7160493827160493, + "acc_norm_stderr": 0.025089478523765134 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.43617021276595747, + "acc_stderr": 0.02958345203628407, + "acc_norm": 0.43617021276595747, + "acc_norm_stderr": 0.02958345203628407 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.4530638852672751, + "acc_stderr": 0.012713845972358978, + "acc_norm": 0.4530638852672751, + "acc_norm_stderr": 0.012713845972358978 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.6066176470588235, + 
"acc_stderr": 0.029674288281311155, + "acc_norm": 0.6066176470588235, + "acc_norm_stderr": 0.029674288281311155 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.6421568627450981, + "acc_stderr": 0.019393058402355442, + "acc_norm": 0.6421568627450981, + "acc_norm_stderr": 0.019393058402355442 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.6363636363636364, + "acc_stderr": 0.04607582090719976, + "acc_norm": 0.6363636363636364, + "acc_norm_stderr": 0.04607582090719976 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.7142857142857143, + "acc_stderr": 0.028920583220675606, + "acc_norm": 0.7142857142857143, + "acc_norm_stderr": 0.028920583220675606 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.8258706467661692, + "acc_stderr": 0.026814951200421603, + "acc_norm": 0.8258706467661692, + "acc_norm_stderr": 0.026814951200421603 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.79, + "acc_stderr": 0.040936018074033256, + "acc_norm": 0.79, + "acc_norm_stderr": 0.040936018074033256 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.5120481927710844, + "acc_stderr": 0.03891364495835817, + "acc_norm": 0.5120481927710844, + "acc_norm_stderr": 0.03891364495835817 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.8070175438596491, + "acc_stderr": 0.030267457554898458, + "acc_norm": 0.8070175438596491, + "acc_norm_stderr": 0.030267457554898458 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.3929008567931457, + "mc1_stderr": 0.017097248285233065, + "mc2": 0.5664808334981362, + "mc2_stderr": 0.015491636686254535 + }, + "harness|winogrande|5": { + "acc": 0.7758484609313339, + "acc_stderr": 0.011720400740774099 + }, + "harness|drop|3": { + "em": 0.004718959731543624, + "em_stderr": 0.0007018360183131115, + "f1": 0.09190750838926176, + "f1_stderr": 0.0018302287340192876 + }, + "harness|gsm8k|5": { + "acc": 0.18953752843062927, + "acc_stderr": 0.010795837931896387 + }, + "all": { + "acc": 0.6159393027066592, + "acc_stderr": 0.032593338844127864, + "acc_norm": 0.6242559279403389, + "acc_norm_stderr": 0.03329458303258477, + "mc1": 0.3929008567931457, + "mc1_stderr": 0.017097248285233065, + "mc2": 0.5664808334981362, + "mc2_stderr": 0.015491636686254535, + "em": 0.004718959731543624, + "em_stderr": 0.0007018360183131115, + "f1": 0.09190750838926176, + "f1_stderr": 0.0018302287340192876 + } + }, + "versions": { + "all": 0, + "harness|arc:challenge|25": 0, + "harness|drop|3": 1, + "harness|gsm8k|5": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + 
"harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "harness|winogrande|5": 0 + }, + "config_tasks": { + "harness|arc:challenge": "LM Harness task", + "harness|drop": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + 
"harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|arc:challenge|25": { + "hashes": { + "hash_examples": "17b0cae357c0259e", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "9bcd0d1d37471713", + "hash_cont_tokens": "289aa98c400841d8" + }, + "truncated": 0, + "non_truncated": 1172, + "padded": 4670, + "non_padded": 17, + "effective_few_shots": 25.0, + "num_truncated_few_shots": 0 + }, + "harness|hellaswag|10": { + "hashes": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "80b8c6d79740318e", + "hash_cont_tokens": "ac460260c3e6efc9" + }, + "truncated": 0, + "non_truncated": 10042, + "padded": 40101, + "non_padded": 67, + "effective_few_shots": 10.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hashes": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "b813d36287c6556c", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-anatomy|5": { + "hashes": { + "hash_examples": "2f83a4f1cab4ba18", 
+ "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "09dc2380497f7a47", + "hash_cont_tokens": "a52a4f60d98cbe5c" + }, + "truncated": 0, + "non_truncated": 135, + "padded": 540, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-astronomy|5": { + "hashes": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "68ca3220b0fdd1f3", + "hash_cont_tokens": "10f7d8eeba97841d" + }, + "truncated": 0, + "non_truncated": 152, + "padded": 608, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-business_ethics|5": { + "hashes": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "bd14ef1320de241e", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hashes": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "d96186ab98017c43", + "hash_cont_tokens": "edef9975ba9165b5" + }, + "truncated": 0, + "non_truncated": 265, + "padded": 1060, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_biology|5": { + "hashes": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "424136b34e95b200", + "hash_cont_tokens": "0aa103ec6602280b" + }, + "truncated": 0, + "non_truncated": 144, + "padded": 576, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_chemistry|5": { + "hashes": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "8dd8b80e336bbe54", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_computer_science|5": { + "hashes": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "145d4cef8ca2261d", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_mathematics|5": { + "hashes": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "561995d32d2b25c4", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_medicine|5": { + "hashes": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "6a258a9d4418599c", + "hash_cont_tokens": "1979021dbc698754" + }, + "truncated": 0, + "non_truncated": 173, + "padded": 692, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_physics|5": { + "hashes": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "fa5e0d5b5f97b66a", + "hash_cont_tokens": "7cf7fe2bab00acbd" + }, + "truncated": 0, + 
"non_truncated": 102, + "padded": 408, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-computer_security|5": { + "hashes": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "07d27397edfae492", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hashes": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "da5e6c3c8eb17da6", + "hash_cont_tokens": "903f64eed2b0d217" + }, + "truncated": 0, + "non_truncated": 235, + "padded": 940, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-econometrics|5": { + "hashes": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "f6ba8e358bdb523e", + "hash_cont_tokens": "721ae6c5302c4bf2" + }, + "truncated": 0, + "non_truncated": 114, + "padded": 456, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hashes": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "b2459da4c5ca8590", + "hash_cont_tokens": "15a738960ed3e587" + }, + "truncated": 0, + "non_truncated": 145, + "padded": 575, + "non_padded": 5, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hashes": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "0b969d9ad706a13a", + "hash_cont_tokens": "c96470462fc71683" + }, + "truncated": 0, + "non_truncated": 378, + "padded": 1512, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-formal_logic|5": { + "hashes": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "02bc3eb5f90da86e", + "hash_cont_tokens": "0e1ce025c9d6ee7e" + }, + "truncated": 0, + "non_truncated": 126, + "padded": 504, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-global_facts|5": { + "hashes": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "3d5106918bcbeb43", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_biology|5": { + "hashes": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "7b089392db2dabbd", + "hash_cont_tokens": "e34d57f7d3c4ca16" + }, + "truncated": 0, + "non_truncated": 310, + "padded": 1240, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hashes": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "ba90b2ffed1c067d", + "hash_cont_tokens": "e8482d44df4b3740" + }, + "truncated": 0, + "non_truncated": 203, + "padded": 812, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + 
"harness|hendrycksTest-high_school_computer_science|5": { + "hashes": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "60eeec309ef0717f", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hashes": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "5e5e8bf3808e0ead", + "hash_cont_tokens": "d63e679a49418339" + }, + "truncated": 0, + "non_truncated": 165, + "padded": 656, + "non_padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_geography|5": { + "hashes": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "4da9b741d4e7ea78", + "hash_cont_tokens": "d78483e286d06f1a" + }, + "truncated": 0, + "non_truncated": 198, + "padded": 792, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hashes": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "acb4bc872ac86ed7", + "hash_cont_tokens": "691cdff71ff5fe57" + }, + "truncated": 0, + "non_truncated": 193, + "padded": 772, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hashes": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "840fc6403eb69ab0", + "hash_cont_tokens": "d5ad4c5bdca967ad" + }, + "truncated": 0, + "non_truncated": 390, + "padded": 1560, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hashes": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "3629a7f2cd17faeb", + "hash_cont_tokens": "8f631ca5687dd0d4" + }, + "truncated": 0, + "non_truncated": 270, + "padded": 1080, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hashes": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "6846f684260e3997", + "hash_cont_tokens": "7321048a28451473" + }, + "truncated": 0, + "non_truncated": 238, + "padded": 952, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_physics|5": { + "hashes": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "85aee25d6bdad94a", + "hash_cont_tokens": "bb137581f269861c" + }, + "truncated": 0, + "non_truncated": 151, + "padded": 604, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hashes": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "290b66d6d666a35f", + "hash_cont_tokens": "b455cab2675bd863" + }, + "truncated": 0, + "non_truncated": 545, + "padded": 2180, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hashes": { + 
"hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "a77a7668b437bc82", + "hash_cont_tokens": "1b3196fec7e58037" + }, + "truncated": 0, + "non_truncated": 216, + "padded": 864, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hashes": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "63548c7fa9ba7a78", + "hash_cont_tokens": "a331dedc2aa01b3e" + }, + "truncated": 0, + "non_truncated": 204, + "padded": 816, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hashes": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "83c5da18bfa50812", + "hash_cont_tokens": "d0fbe030b8c8c2bf" + }, + "truncated": 0, + "non_truncated": 237, + "padded": 948, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_aging|5": { + "hashes": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "bebbd11f22006685", + "hash_cont_tokens": "1dd29c3755494850" + }, + "truncated": 0, + "non_truncated": 223, + "padded": 892, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_sexuality|5": { + "hashes": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "7b85ee9b8ee54f4f", + "hash_cont_tokens": "c85573f663c10691" + }, + "truncated": 0, + "non_truncated": 131, + "padded": 524, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-international_law|5": { + "hashes": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "7bfc55ab7065943e", + "hash_cont_tokens": "d263804ba918154f" + }, + "truncated": 0, + "non_truncated": 121, + "padded": 484, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-jurisprudence|5": { + "hashes": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "69573f1675e053c6", + "hash_cont_tokens": "581986691a84ece8" + }, + "truncated": 0, + "non_truncated": 108, + "padded": 432, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hashes": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "552324ef20094bdc", + "hash_cont_tokens": "55a858b28bbda458" + }, + "truncated": 0, + "non_truncated": 163, + "padded": 652, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-machine_learning|5": { + "hashes": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "96449357a7318905", + "hash_cont_tokens": "e99d3d3efd4ac7a3" + }, + "truncated": 0, + "non_truncated": 112, + "padded": 448, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-management|5": { + "hashes": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "3b849249168e3b88", + "hash_cont_tokens": 
"13d9dc56bca34726" + }, + "truncated": 0, + "non_truncated": 103, + "padded": 412, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-marketing|5": { + "hashes": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "af0e186f2756b70d", + "hash_cont_tokens": "2700ea26933916a2" + }, + "truncated": 0, + "non_truncated": 234, + "padded": 936, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-medical_genetics|5": { + "hashes": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "9f6a6de16509b6d9", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-miscellaneous|5": { + "hashes": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "9194406d589f7c10", + "hash_cont_tokens": "7bf4341c79587250" + }, + "truncated": 0, + "non_truncated": 783, + "padded": 3132, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_disputes|5": { + "hashes": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "769486efc74d9f8e", + "hash_cont_tokens": "38a48e9de6976f00" + }, + "truncated": 0, + "non_truncated": 346, + "padded": 1384, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hashes": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "a90fd4dd90959dad", + "hash_cont_tokens": "761c4dc187689d89" + }, + "truncated": 0, + "non_truncated": 895, + "padded": 3580, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-nutrition|5": { + "hashes": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "1a3b843e66efd29b", + "hash_cont_tokens": "65005bd7d6f6012a" + }, + "truncated": 0, + "non_truncated": 306, + "padded": 1224, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-philosophy|5": { + "hashes": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "09820001a3d00013", + "hash_cont_tokens": "0b47934fb6314dec" + }, + "truncated": 0, + "non_truncated": 311, + "padded": 1244, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-prehistory|5": { + "hashes": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "7c4ec364ce2768c7", + "hash_cont_tokens": "3f20acd855ee0a29" + }, + "truncated": 0, + "non_truncated": 324, + "padded": 1296, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_accounting|5": { + "hashes": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "ced0534574d0ae3f", + "hash_cont_tokens": "8f122ba881355d4b" + }, + "truncated": 0, + "non_truncated": 282, + "padded": 1128, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + 
"harness|hendrycksTest-professional_law|5": { + "hashes": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "bcbdbbde22ec73e3", + "hash_cont_tokens": "90d5df417c4d3fd3" + }, + "truncated": 0, + "non_truncated": 1534, + "padded": 6136, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_medicine|5": { + "hashes": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "c54d753563114d45", + "hash_cont_tokens": "4a2d2988884f7f70" + }, + "truncated": 0, + "non_truncated": 272, + "padded": 1088, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_psychology|5": { + "hashes": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "b75dc55c0e32fa52", + "hash_cont_tokens": "e0a952cb8a9c81de" + }, + "truncated": 0, + "non_truncated": 612, + "padded": 2448, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-public_relations|5": { + "hashes": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "5ccdc8ec8db99622", + "hash_cont_tokens": "1fa77a8dff3922b8" + }, + "truncated": 0, + "non_truncated": 110, + "padded": 440, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-security_studies|5": { + "hashes": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "ca8497342e5b1d57", + "hash_cont_tokens": "81fc9cb3cbdd52db" + }, + "truncated": 0, + "non_truncated": 245, + "padded": 980, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-sociology|5": { + "hashes": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "069c76424fbd3dab", + "hash_cont_tokens": "2a0493252ed2cf43" + }, + "truncated": 0, + "non_truncated": 201, + "padded": 804, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hashes": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "a7e393a626169576", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-virology|5": { + "hashes": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "bf99dc973e3a650d", + "hash_cont_tokens": "5ab892d003b00c98" + }, + "truncated": 0, + "non_truncated": 166, + "padded": 664, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-world_religions|5": { + "hashes": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "1761cfaf21797065", + "hash_cont_tokens": "15a5e5dbdfbb8568" + }, + "truncated": 0, + "non_truncated": 171, + "padded": 684, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|truthfulqa:mc|0": { + "hashes": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": 
"298b43914bbdf4ca", + "hash_cont_tokens": "5a8d4bb398b1c3c0" + }, + "truncated": 0, + "non_truncated": 817, + "padded": 9996, + "non_padded": 0, + "effective_few_shots": 0.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "31aa3477d959f771", + "hash_cont_tokens": "618558fb93c0f288" + }, + "truncated": 0, + "non_truncated": 1267, + "padded": 2534, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|drop|3": { + "hashes": { + "hash_examples": "1d27416e8324e9a3", + "hash_full_prompts": "a5513ff9a741b385", + "hash_input_tokens": "a4fb946366902edf", + "hash_cont_tokens": "b99e115003f44360" + }, + "truncated": 0, + "non_truncated": 9536, + "padded": 0, + "non_padded": 9536, + "effective_few_shots": 3.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "6af0ae8cfe684f50", + "hash_cont_tokens": "cfb19cb746d72852" + }, + "truncated": 0, + "non_truncated": 1319, + "padded": 0, + "non_padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "4eb459f19fc0f29d", + "hash_full_prompts": "21653ed56f202b4e", + "hash_input_tokens": "0ce409b3d436569d", + "hash_cont_tokens": "1478f95685cdbbd4" + }, + "truncated": 0, + "non_truncated": 38195, + "padded": 113460, + "non_padded": 10948, + "num_truncated_few_shots": 0, + "total_evaluation_time_secondes": 0 + } +} \ No newline at end of file diff --git a/eval-results/PygmalionAI/metharme-1.3b/results_2023-07-19T14-50-43.188696.json b/eval-results/PygmalionAI/metharme-1.3b/results_2023-07-19T14-50-43.188696.json new file mode 100644 index 0000000000000000000000000000000000000000..4b2e8d1a2429d9cea6a8f3b0267e6aea76201040 --- /dev/null +++ b/eval-results/PygmalionAI/metharme-1.3b/results_2023-07-19T14-50-43.188696.json @@ -0,0 +1,871 @@ +{ + "results": { + "harness|arc:challenge|25": { + "acc": 0.3165529010238908, + "acc_stderr": 0.013592431519068079, + "acc_norm": 0.3438566552901024, + "acc_norm_stderr": 0.01388064457015621 + }, + "harness|hellaswag|10": { + "acc": 0.4297948615813583, + "acc_stderr": 0.00494034967676932, + "acc_norm": 0.5593507269468233, + "acc_norm_stderr": 0.00495450360647161 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.26, + "acc_stderr": 0.04408440022768081, + "acc_norm": 0.26, + "acc_norm_stderr": 0.04408440022768081 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.1925925925925926, + "acc_stderr": 0.03406542058502652, + "acc_norm": 0.1925925925925926, + "acc_norm_stderr": 0.03406542058502652 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.16447368421052633, + "acc_stderr": 0.03016753346863271, + "acc_norm": 0.16447368421052633, + "acc_norm_stderr": 0.03016753346863271 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.31, + "acc_stderr": 0.04648231987117316, + "acc_norm": 0.31, + "acc_norm_stderr": 0.04648231987117316 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.27547169811320754, + "acc_stderr": 0.02749566368372406, + "acc_norm": 0.27547169811320754, + "acc_norm_stderr": 0.02749566368372406 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.3263888888888889, + "acc_stderr": 0.03921067198982266, + "acc_norm": 0.3263888888888889, + "acc_norm_stderr": 0.03921067198982266 + }, + 
"harness|hendrycksTest-college_chemistry|5": { + "acc": 0.22, + "acc_stderr": 0.041633319989322695, + "acc_norm": 0.22, + "acc_norm_stderr": 0.041633319989322695 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.31, + "acc_stderr": 0.04648231987117316, + "acc_norm": 0.31, + "acc_norm_stderr": 0.04648231987117316 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.26, + "acc_stderr": 0.0440844002276808, + "acc_norm": 0.26, + "acc_norm_stderr": 0.0440844002276808 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.1907514450867052, + "acc_stderr": 0.029957851329869337, + "acc_norm": 0.1907514450867052, + "acc_norm_stderr": 0.029957851329869337 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.23529411764705882, + "acc_stderr": 0.04220773659171453, + "acc_norm": 0.23529411764705882, + "acc_norm_stderr": 0.04220773659171453 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.28, + "acc_stderr": 0.04512608598542128, + "acc_norm": 0.28, + "acc_norm_stderr": 0.04512608598542128 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.26382978723404255, + "acc_stderr": 0.028809989854102967, + "acc_norm": 0.26382978723404255, + "acc_norm_stderr": 0.028809989854102967 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.21052631578947367, + "acc_stderr": 0.0383515395439942, + "acc_norm": 0.21052631578947367, + "acc_norm_stderr": 0.0383515395439942 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.27586206896551724, + "acc_stderr": 0.037245636197746325, + "acc_norm": 0.27586206896551724, + "acc_norm_stderr": 0.037245636197746325 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.2671957671957672, + "acc_stderr": 0.02278967314577656, + "acc_norm": 0.2671957671957672, + "acc_norm_stderr": 0.02278967314577656 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.20634920634920634, + "acc_stderr": 0.036196045241242515, + "acc_norm": 0.20634920634920634, + "acc_norm_stderr": 0.036196045241242515 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.39, + "acc_stderr": 0.04902071300001975, + "acc_norm": 0.39, + "acc_norm_stderr": 0.04902071300001975 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.23548387096774193, + "acc_stderr": 0.024137632429337714, + "acc_norm": 0.23548387096774193, + "acc_norm_stderr": 0.024137632429337714 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.2019704433497537, + "acc_stderr": 0.028247350122180246, + "acc_norm": 0.2019704433497537, + "acc_norm_stderr": 0.028247350122180246 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.21, + "acc_stderr": 0.040936018074033256, + "acc_norm": 0.21, + "acc_norm_stderr": 0.040936018074033256 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.18181818181818182, + "acc_stderr": 0.03011768892950357, + "acc_norm": 0.18181818181818182, + "acc_norm_stderr": 0.03011768892950357 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.15151515151515152, + "acc_stderr": 0.025545650426603627, + "acc_norm": 0.15151515151515152, + "acc_norm_stderr": 0.025545650426603627 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.21243523316062177, + "acc_stderr": 0.029519282616817244, + "acc_norm": 0.21243523316062177, + "acc_norm_stderr": 0.029519282616817244 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.23846153846153847, + "acc_stderr": 0.021606294494647727, + 
"acc_norm": 0.23846153846153847, + "acc_norm_stderr": 0.021606294494647727 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.24444444444444444, + "acc_stderr": 0.02620276653465215, + "acc_norm": 0.24444444444444444, + "acc_norm_stderr": 0.02620276653465215 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.21008403361344538, + "acc_stderr": 0.026461398717471874, + "acc_norm": 0.21008403361344538, + "acc_norm_stderr": 0.026461398717471874 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.271523178807947, + "acc_stderr": 0.03631329803969653, + "acc_norm": 0.271523178807947, + "acc_norm_stderr": 0.03631329803969653 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.1926605504587156, + "acc_stderr": 0.016909276884936087, + "acc_norm": 0.1926605504587156, + "acc_norm_stderr": 0.016909276884936087 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.4398148148148148, + "acc_stderr": 0.03385177976044811, + "acc_norm": 0.4398148148148148, + "acc_norm_stderr": 0.03385177976044811 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.2647058823529412, + "acc_stderr": 0.030964517926923413, + "acc_norm": 0.2647058823529412, + "acc_norm_stderr": 0.030964517926923413 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.2742616033755274, + "acc_stderr": 0.029041333510598042, + "acc_norm": 0.2742616033755274, + "acc_norm_stderr": 0.029041333510598042 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.31390134529147984, + "acc_stderr": 0.031146796482972465, + "acc_norm": 0.31390134529147984, + "acc_norm_stderr": 0.031146796482972465 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.21374045801526717, + "acc_stderr": 0.0359546161177469, + "acc_norm": 0.21374045801526717, + "acc_norm_stderr": 0.0359546161177469 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.2644628099173554, + "acc_stderr": 0.04026187527591205, + "acc_norm": 0.2644628099173554, + "acc_norm_stderr": 0.04026187527591205 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.25, + "acc_stderr": 0.04186091791394607, + "acc_norm": 0.25, + "acc_norm_stderr": 0.04186091791394607 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.25153374233128833, + "acc_stderr": 0.034089978868575295, + "acc_norm": 0.25153374233128833, + "acc_norm_stderr": 0.034089978868575295 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.30357142857142855, + "acc_stderr": 0.04364226155841044, + "acc_norm": 0.30357142857142855, + "acc_norm_stderr": 0.04364226155841044 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.20388349514563106, + "acc_stderr": 0.03989139859531772, + "acc_norm": 0.20388349514563106, + "acc_norm_stderr": 0.03989139859531772 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.25213675213675213, + "acc_stderr": 0.02844796547623101, + "acc_norm": 0.25213675213675213, + "acc_norm_stderr": 0.02844796547623101 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.29, + "acc_stderr": 0.04560480215720684, + "acc_norm": 0.29, + "acc_norm_stderr": 0.04560480215720684 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.2503192848020434, + "acc_stderr": 0.015491088951494583, + "acc_norm": 0.2503192848020434, + "acc_norm_stderr": 0.015491088951494583 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.27167630057803466, + "acc_stderr": 0.023948512905468348, + "acc_norm": 0.27167630057803466, + "acc_norm_stderr": 
0.023948512905468348 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.23798882681564246, + "acc_stderr": 0.014242630070574915, + "acc_norm": 0.23798882681564246, + "acc_norm_stderr": 0.014242630070574915 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.25163398692810457, + "acc_stderr": 0.024848018263875195, + "acc_norm": 0.25163398692810457, + "acc_norm_stderr": 0.024848018263875195 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.24115755627009647, + "acc_stderr": 0.02429659403476343, + "acc_norm": 0.24115755627009647, + "acc_norm_stderr": 0.02429659403476343 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.23148148148148148, + "acc_stderr": 0.023468429832451163, + "acc_norm": 0.23148148148148148, + "acc_norm_stderr": 0.023468429832451163 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.2765957446808511, + "acc_stderr": 0.026684564340460994, + "acc_norm": 0.2765957446808511, + "acc_norm_stderr": 0.026684564340460994 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.2457627118644068, + "acc_stderr": 0.010996156635142692, + "acc_norm": 0.2457627118644068, + "acc_norm_stderr": 0.010996156635142692 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.19852941176470587, + "acc_stderr": 0.0242310133705411, + "acc_norm": 0.19852941176470587, + "acc_norm_stderr": 0.0242310133705411 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.2565359477124183, + "acc_stderr": 0.017667841612378977, + "acc_norm": 0.2565359477124183, + "acc_norm_stderr": 0.017667841612378977 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.19090909090909092, + "acc_stderr": 0.03764425585984927, + "acc_norm": 0.19090909090909092, + "acc_norm_stderr": 0.03764425585984927 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.22448979591836735, + "acc_stderr": 0.02671143055553841, + "acc_norm": 0.22448979591836735, + "acc_norm_stderr": 0.02671143055553841 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.23383084577114427, + "acc_stderr": 0.02992941540834838, + "acc_norm": 0.23383084577114427, + "acc_norm_stderr": 0.02992941540834838 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.28, + "acc_stderr": 0.04512608598542129, + "acc_norm": 0.28, + "acc_norm_stderr": 0.04512608598542129 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.2710843373493976, + "acc_stderr": 0.03460579907553027, + "acc_norm": 0.2710843373493976, + "acc_norm_stderr": 0.03460579907553027 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.3157894736842105, + "acc_stderr": 0.035650796707083106, + "acc_norm": 0.3157894736842105, + "acc_norm_stderr": 0.035650796707083106 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.23990208078335373, + "mc1_stderr": 0.014948812679062133, + "mc2": 0.37681475766360784, + "mc2_stderr": 0.014150246336114062 + }, + "all": { + "acc": 0.25483520491115697, + "acc_stderr": 0.031495620958017834, + "acc_norm": 0.25749384253203284, + "acc_norm_stderr": 0.031500745822099024, + "mc1": 0.23990208078335373, + "mc1_stderr": 0.014948812679062133, + "mc2": 0.37681475766360784, + "mc2_stderr": 0.014150246336114062 + } + }, + "versions": { + "harness|arc:challenge|25": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, 
+ "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "all": 0 + }, + "config": { + "model_name": "PygmalionAI/metharme-1.3b", + "model_sha": "62ec4ff53042f692ef0661e54f371747214707a4", + "model_dtype": "torch.float16", + "lighteval_sha": "43cff840721bd0214adb4e29236a5e2ca1813937", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null + }, + "task_config": { + "harness|arc:challenge": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness 
task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task" + }, + "hashes": { + "harness|arc:challenge|25": { + "hash_examples": "fb8c51b1872daeda", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "573b1b078b6e9deb", + "hash_cont_tokens": "22424bcffb42ecdf" + }, + "harness|hellaswag|10": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + 
"hash_input_tokens": "f0fd0caf4d4c1110", + "hash_cont_tokens": "62a15ef112ea07d6" + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "f076ac6b177ca28c", + "hash_cont_tokens": "74c639e56bb475af" + }, + "harness|hendrycksTest-anatomy|5": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "059827606e6b0780", + "hash_cont_tokens": "ec7e2288ab5f1ce9" + }, + "harness|hendrycksTest-astronomy|5": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "1dd0dab88aa9e4b2", + "hash_cont_tokens": "d7e922da5bc6d1bf" + }, + "harness|hendrycksTest-business_ethics|5": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "d51eb5246cbe2173", + "hash_cont_tokens": "08933598b321179c" + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "2337a7f17800c6ec", + "hash_cont_tokens": "bc82b3cc5072f164" + }, + "harness|hendrycksTest-college_biology|5": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "e394ebbb8ceace76", + "hash_cont_tokens": "3bc45e0c4b6d612d" + }, + "harness|hendrycksTest-college_chemistry|5": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "9221fbdf710a6f67", + "hash_cont_tokens": "74c639e56bb475af" + }, + "harness|hendrycksTest-college_computer_science|5": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "ebe2748d21b2ba41", + "hash_cont_tokens": "d839b8186e0f3d94" + }, + "harness|hendrycksTest-college_mathematics|5": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "bfecefb08ffb7faa", + "hash_cont_tokens": "3c16f9c45a7a7272" + }, + "harness|hendrycksTest-college_medicine|5": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "2ac8aec9025dc58b", + "hash_cont_tokens": "16f654508cdc19c4" + }, + "harness|hendrycksTest-college_physics|5": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "faf44c77f43368ef", + "hash_cont_tokens": "a3a24586c7218684" + }, + "harness|hendrycksTest-computer_security|5": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "280c7f12abde10a5", + "hash_cont_tokens": "74c639e56bb475af" + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "217a841c86d2d992", + "hash_cont_tokens": "43818b3dc0c7496f" + }, + "harness|hendrycksTest-econometrics|5": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "354267c0f98aad3b", + "hash_cont_tokens": "4f0a3e41169314a8" + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "4f5e8d051d04dde0", + "hash_cont_tokens": "7e14ccd1e2688bb8" + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "cd12bec1d5448dda", + 
"hash_cont_tokens": "317e29ee6bba387d" + }, + "harness|hendrycksTest-formal_logic|5": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "c549e395850984fe", + "hash_cont_tokens": "c01a9b75f55e32e0" + }, + "harness|hendrycksTest-global_facts|5": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "81b06f5caa221f97", + "hash_cont_tokens": "74c639e56bb475af" + }, + "harness|hendrycksTest-high_school_biology|5": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "ad626d781102fe51", + "hash_cont_tokens": "edb2063e955bd5ca" + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "2c0d3f2eacc6bbd5", + "hash_cont_tokens": "8000de09bc1dc113" + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "aada51d0571db37b", + "hash_cont_tokens": "dcd6a0ada4ab8e0b" + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "6e47d696116edd01", + "hash_cont_tokens": "47a5e5973f50fe17" + }, + "harness|hendrycksTest-high_school_geography|5": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "0e8ee6c9e572e3c4", + "hash_cont_tokens": "812f79117b9593de" + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "8fa2bf90de3b07e7", + "hash_cont_tokens": "b4c405890ebd3ee1" + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "fabb8f176276af2f", + "hash_cont_tokens": "8d468d84a686647d" + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "3e86d13ef021476a", + "hash_cont_tokens": "e5d02f8f1c5dcf31" + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "a132b5e9c9531b36", + "hash_cont_tokens": "4c32e38c066727bc" + }, + "harness|hendrycksTest-high_school_physics|5": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "f8f6fe5143776cb4", + "hash_cont_tokens": "9416ad85fd6f4a2c" + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "e28121967b27a315", + "hash_cont_tokens": "57cc212706ddcdf4" + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "bdbe90efb4a1c4ce", + "hash_cont_tokens": "8c5c954092a64343" + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "b8f58f05dc082011", + "hash_cont_tokens": "e5ab34a54e3f5b7c" + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": 
"11845057459afd72", + "hash_input_tokens": "3af911bf93093a85", + "hash_cont_tokens": "f3276c80ce1b205b" + }, + "harness|hendrycksTest-human_aging|5": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "1dd2240eb90b9a70", + "hash_cont_tokens": "7982edf99219e1b0" + }, + "harness|hendrycksTest-human_sexuality|5": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "f3de2f8181824a79", + "hash_cont_tokens": "ed73d516c5552dd0" + }, + "harness|hendrycksTest-international_law|5": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "0c2a1dd63cc74137", + "hash_cont_tokens": "549d9b32b8a90e4e" + }, + "harness|hendrycksTest-jurisprudence|5": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "08e3527985f33aab", + "hash_cont_tokens": "ddf5241e450210d6" + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "bf7216a648529f68", + "hash_cont_tokens": "eb791fcbee9e0682" + }, + "harness|hendrycksTest-machine_learning|5": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "28f5891c956afd65", + "hash_cont_tokens": "c66b1f3b46001b09" + }, + "harness|hendrycksTest-management|5": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "6de88b824d4f64c3", + "hash_cont_tokens": "27795e9c98bdeda8" + }, + "harness|hendrycksTest-marketing|5": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "5ef855d01044fd83", + "hash_cont_tokens": "874c5b0b496cbe8a" + }, + "harness|hendrycksTest-medical_genetics|5": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "1840e0b96d7e619e", + "hash_cont_tokens": "74c639e56bb475af" + }, + "harness|hendrycksTest-miscellaneous|5": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "02483f6b53dc13ac", + "hash_cont_tokens": "313ee361fbdbab3c" + }, + "harness|hendrycksTest-moral_disputes|5": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "93202e79d594dde4", + "hash_cont_tokens": "fe7747dc69c4909e" + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "41c03f41d2ba9fe7", + "hash_cont_tokens": "e0d0ad58a3f1ff22" + }, + "harness|hendrycksTest-nutrition|5": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "d83bcb6dd08809ac", + "hash_cont_tokens": "c55a10a018de0228" + }, + "harness|hendrycksTest-philosophy|5": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "65c70474c8a5d205", + "hash_cont_tokens": "7916d26928435f1a" + }, + "harness|hendrycksTest-prehistory|5": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "4d4126ac9a91ac47", + "hash_cont_tokens": "81836c52a10e6ffd" + }, + "harness|hendrycksTest-professional_accounting|5": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "592f80ad364d686a", + "hash_cont_tokens": 
"f5d669014a273483" + }, + "harness|hendrycksTest-professional_law|5": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "7f837322b1b62ac1", + "hash_cont_tokens": "6b31cf265df9b81b" + }, + "harness|hendrycksTest-professional_medicine|5": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "05a8ef0dd10b4bba", + "hash_cont_tokens": "4b3ac60441ad14ec" + }, + "harness|hendrycksTest-professional_psychology|5": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "3c7944f0b2c49f64", + "hash_cont_tokens": "f139af481f2a9e74" + }, + "harness|hendrycksTest-public_relations|5": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "637e934bb716d5ec", + "hash_cont_tokens": "ca79966b90cda0ea" + }, + "harness|hendrycksTest-security_studies|5": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "3bad229573ed6a9c", + "hash_cont_tokens": "952a2e479fc3a83e" + }, + "harness|hendrycksTest-sociology|5": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "70a479e96d02d5d8", + "hash_cont_tokens": "f49476cf49b37d7c" + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "0d690fc0db462440", + "hash_cont_tokens": "74c639e56bb475af" + }, + "harness|hendrycksTest-virology|5": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "4b0fdf8e692dd640", + "hash_cont_tokens": "0065c4bbe6134c1c" + }, + "harness|hendrycksTest-world_religions|5": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "cfd7092dc8aacd96", + "hash_cont_tokens": "9a178e9ec050bf3e" + }, + "harness|truthfulqa:mc|0": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "e820abadeb7ebfb3", + "hash_cont_tokens": "7f48ddfffa64eb41" + } + } +} \ No newline at end of file diff --git a/eval-results/PygmalionAI/metharme-1.3b/results_2023-09-22T18-39-45.920651.json b/eval-results/PygmalionAI/metharme-1.3b/results_2023-09-22T18-39-45.920651.json new file mode 100644 index 0000000000000000000000000000000000000000..5316cc76206aea3702eed4022bf593ccf985575a --- /dev/null +++ b/eval-results/PygmalionAI/metharme-1.3b/results_2023-09-22T18-39-45.920651.json @@ -0,0 +1,107 @@ +{ + "config_general": { + "model_name": "PygmalionAI/metharme-1.3b", + "model_sha": "62ec4ff53042f692ef0661e54f371747214707a4", + "model_size": "2.65 GB", + "model_dtype": "torch.float16", + "lighteval_sha": "0f318ecf002208468154899217b3ba7c6ae09374", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|drop|3": { + "em": 0.001572986577181208, + "em_stderr": 0.00040584511324177333, + "f1": 0.04728187919463099, + "f1_stderr": 0.0012123660755283244 + }, + "harness|gsm8k|5": { + "acc": 0.0075815011372251705, + "acc_stderr": 0.002389281512077243 + }, + "harness|winogrande|5": { + "acc": 0.5643251775848461, + "acc_stderr": 0.01393570973961571 + }, + "all": { + "em": 0.001572986577181208, + "em_stderr": 0.00040584511324177333, + "f1": 0.04728187919463099, + "f1_stderr": 0.0012123660755283244, + "acc": 0.2859533393610357, 
+ "acc_stderr": 0.008162495625846476 + } + }, + "versions": { + "harness|drop|3": 1, + "harness|gsm8k|5": 0, + "harness|winogrande|5": 0, + "all": 0 + }, + "config_tasks": { + "harness|drop": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|drop|3": { + "hashes": { + "hash_examples": "1d27416e8324e9a3", + "hash_full_prompts": "a5513ff9a741b385", + "hash_input_tokens": "4bf3f6ba1bae765a", + "hash_cont_tokens": "6af6d9639d4092d4" + }, + "truncated": 439, + "non-truncated": 9097, + "padded": 0, + "non-padded": 9536, + "effective_few_shots": 3.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "ef516f9ffbe76423", + "hash_cont_tokens": "371295175b7a0ec0" + }, + "truncated": 0, + "non-truncated": 1319, + "padded": 0, + "non-padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "c469718508f43cab", + "hash_cont_tokens": "87eeb79172195781" + }, + "truncated": 0, + "non-truncated": 2534, + "padded": 2456, + "non-padded": 78, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "9b4d8993161e637d", + "hash_full_prompts": "08215e527b7e60a5", + "hash_input_tokens": "401c6c49053f17ab", + "hash_cont_tokens": "559fffe259ac30e8" + }, + "total_evaluation_time_secondes": "6578.710257053375", + "truncated": 439, + "non-truncated": 12950, + "padded": 2456, + "non-padded": 10933, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/PygmalionAI/mythalion-13b/results_2023-09-13T15-43-56.959580.json b/eval-results/PygmalionAI/mythalion-13b/results_2023-09-13T15-43-56.959580.json new file mode 100644 index 0000000000000000000000000000000000000000..23caa8ea1bf607af9da081a94ec2577041954b9c --- /dev/null +++ b/eval-results/PygmalionAI/mythalion-13b/results_2023-09-13T15-43-56.959580.json @@ -0,0 +1,1367 @@ +{ + "config_general": { + "model_name": "PygmalionAI/mythalion-13b", + "model_sha": "24916f62b8243a7e4646ea53eeb45d890cbd308f", + "model_size": "24.32 GB", + "model_dtype": "torch.float16", + "lighteval_sha": "ff467795ccc45b291b69333c263d5f16abd1fcd9", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|arc:challenge|25": { + "acc": 0.5767918088737202, + "acc_stderr": 0.014438036220848032, + "acc_norm": 0.6126279863481229, + "acc_norm_stderr": 0.014235872487909869 + }, + "harness|hellaswag|10": { + "acc": 0.644991037641904, + "acc_stderr": 0.004775380866948015, + "acc_norm": 0.8380800637323242, + "acc_norm_stderr": 0.0036762448867232646 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.33, + "acc_stderr": 0.047258156262526066, + "acc_norm": 0.33, + "acc_norm_stderr": 0.047258156262526066 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.4888888888888889, + "acc_stderr": 0.04318275491977976, + "acc_norm": 0.4888888888888889, + "acc_norm_stderr": 0.04318275491977976 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.5460526315789473, + "acc_stderr": 0.04051646342874142, + "acc_norm": 0.5460526315789473, + "acc_norm_stderr": 0.04051646342874142 + }, + "harness|hendrycksTest-business_ethics|5": { + 
"acc": 0.55, + "acc_stderr": 0.049999999999999996, + "acc_norm": 0.55, + "acc_norm_stderr": 0.049999999999999996 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.6150943396226415, + "acc_stderr": 0.02994649856769995, + "acc_norm": 0.6150943396226415, + "acc_norm_stderr": 0.02994649856769995 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.6111111111111112, + "acc_stderr": 0.04076663253918567, + "acc_norm": 0.6111111111111112, + "acc_norm_stderr": 0.04076663253918567 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.34, + "acc_stderr": 0.04760952285695235, + "acc_norm": 0.34, + "acc_norm_stderr": 0.04760952285695235 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.46, + "acc_stderr": 0.05009082659620332, + "acc_norm": 0.46, + "acc_norm_stderr": 0.05009082659620332 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.33, + "acc_stderr": 0.047258156262526045, + "acc_norm": 0.33, + "acc_norm_stderr": 0.047258156262526045 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.5491329479768786, + "acc_stderr": 0.0379401267469703, + "acc_norm": 0.5491329479768786, + "acc_norm_stderr": 0.0379401267469703 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.27450980392156865, + "acc_stderr": 0.04440521906179328, + "acc_norm": 0.27450980392156865, + "acc_norm_stderr": 0.04440521906179328 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.7, + "acc_stderr": 0.046056618647183814, + "acc_norm": 0.7, + "acc_norm_stderr": 0.046056618647183814 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.4340425531914894, + "acc_stderr": 0.032400380867927465, + "acc_norm": 0.4340425531914894, + "acc_norm_stderr": 0.032400380867927465 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.30701754385964913, + "acc_stderr": 0.04339138322579861, + "acc_norm": 0.30701754385964913, + "acc_norm_stderr": 0.04339138322579861 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.5172413793103449, + "acc_stderr": 0.04164188720169375, + "acc_norm": 0.5172413793103449, + "acc_norm_stderr": 0.04164188720169375 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.3306878306878307, + "acc_stderr": 0.024229965298425075, + "acc_norm": 0.3306878306878307, + "acc_norm_stderr": 0.024229965298425075 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.3968253968253968, + "acc_stderr": 0.04375888492727061, + "acc_norm": 0.3968253968253968, + "acc_norm_stderr": 0.04375888492727061 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.37, + "acc_stderr": 0.04852365870939099, + "acc_norm": 0.37, + "acc_norm_stderr": 0.04852365870939099 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.6387096774193548, + "acc_stderr": 0.02732754844795754, + "acc_norm": 0.6387096774193548, + "acc_norm_stderr": 0.02732754844795754 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.4482758620689655, + "acc_stderr": 0.03499113137676744, + "acc_norm": 0.4482758620689655, + "acc_norm_stderr": 0.03499113137676744 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.55, + "acc_stderr": 0.05, + "acc_norm": 0.55, + "acc_norm_stderr": 0.05 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.6909090909090909, + "acc_stderr": 0.036085410115739666, + "acc_norm": 0.6909090909090909, + "acc_norm_stderr": 0.036085410115739666 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.7121212121212122, + 
"acc_stderr": 0.03225883512300992, + "acc_norm": 0.7121212121212122, + "acc_norm_stderr": 0.03225883512300992 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.8031088082901554, + "acc_stderr": 0.028697873971860688, + "acc_norm": 0.8031088082901554, + "acc_norm_stderr": 0.028697873971860688 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.517948717948718, + "acc_stderr": 0.025334667080954925, + "acc_norm": 0.517948717948718, + "acc_norm_stderr": 0.025334667080954925 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.3, + "acc_stderr": 0.027940457136228416, + "acc_norm": 0.3, + "acc_norm_stderr": 0.027940457136228416 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.5756302521008403, + "acc_stderr": 0.032104790510157764, + "acc_norm": 0.5756302521008403, + "acc_norm_stderr": 0.032104790510157764 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.32450331125827814, + "acc_stderr": 0.03822746937658753, + "acc_norm": 0.32450331125827814, + "acc_norm_stderr": 0.03822746937658753 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.7394495412844037, + "acc_stderr": 0.01881918203485007, + "acc_norm": 0.7394495412844037, + "acc_norm_stderr": 0.01881918203485007 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.3888888888888889, + "acc_stderr": 0.033247089118091176, + "acc_norm": 0.3888888888888889, + "acc_norm_stderr": 0.033247089118091176 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.7696078431372549, + "acc_stderr": 0.029554292605695066, + "acc_norm": 0.7696078431372549, + "acc_norm_stderr": 0.029554292605695066 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.7763713080168776, + "acc_stderr": 0.027123298205229966, + "acc_norm": 0.7763713080168776, + "acc_norm_stderr": 0.027123298205229966 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.672645739910314, + "acc_stderr": 0.03149384670994131, + "acc_norm": 0.672645739910314, + "acc_norm_stderr": 0.03149384670994131 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.6412213740458015, + "acc_stderr": 0.04206739313864908, + "acc_norm": 0.6412213740458015, + "acc_norm_stderr": 0.04206739313864908 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.7272727272727273, + "acc_stderr": 0.04065578140908705, + "acc_norm": 0.7272727272727273, + "acc_norm_stderr": 0.04065578140908705 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.7129629629629629, + "acc_stderr": 0.043733130409147614, + "acc_norm": 0.7129629629629629, + "acc_norm_stderr": 0.043733130409147614 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.6871165644171779, + "acc_stderr": 0.036429145782924055, + "acc_norm": 0.6871165644171779, + "acc_norm_stderr": 0.036429145782924055 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.36607142857142855, + "acc_stderr": 0.0457237235873743, + "acc_norm": 0.36607142857142855, + "acc_norm_stderr": 0.0457237235873743 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.7184466019417476, + "acc_stderr": 0.04453254836326467, + "acc_norm": 0.7184466019417476, + "acc_norm_stderr": 0.04453254836326467 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.8205128205128205, + "acc_stderr": 0.025140935950335435, + "acc_norm": 0.8205128205128205, + "acc_norm_stderr": 0.025140935950335435 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.64, + "acc_stderr": 
0.04824181513244218, + "acc_norm": 0.64, + "acc_norm_stderr": 0.04824181513244218 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.7713920817369093, + "acc_stderr": 0.015016884698539866, + "acc_norm": 0.7713920817369093, + "acc_norm_stderr": 0.015016884698539866 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.6416184971098265, + "acc_stderr": 0.025816756791584194, + "acc_norm": 0.6416184971098265, + "acc_norm_stderr": 0.025816756791584194 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.4581005586592179, + "acc_stderr": 0.016663683295020527, + "acc_norm": 0.4581005586592179, + "acc_norm_stderr": 0.016663683295020527 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.6372549019607843, + "acc_stderr": 0.027530078447110303, + "acc_norm": 0.6372549019607843, + "acc_norm_stderr": 0.027530078447110303 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.639871382636656, + "acc_stderr": 0.027264297599804012, + "acc_norm": 0.639871382636656, + "acc_norm_stderr": 0.027264297599804012 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.6265432098765432, + "acc_stderr": 0.026915003011380154, + "acc_norm": 0.6265432098765432, + "acc_norm_stderr": 0.026915003011380154 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.4148936170212766, + "acc_stderr": 0.0293922365846125, + "acc_norm": 0.4148936170212766, + "acc_norm_stderr": 0.0293922365846125 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.43089960886571055, + "acc_stderr": 0.012647695889547228, + "acc_norm": 0.43089960886571055, + "acc_norm_stderr": 0.012647695889547228 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.5294117647058824, + "acc_stderr": 0.03032024326500413, + "acc_norm": 0.5294117647058824, + "acc_norm_stderr": 0.03032024326500413 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.5915032679738562, + "acc_stderr": 0.019886221037501862, + "acc_norm": 0.5915032679738562, + "acc_norm_stderr": 0.019886221037501862 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.6363636363636364, + "acc_stderr": 0.04607582090719976, + "acc_norm": 0.6363636363636364, + "acc_norm_stderr": 0.04607582090719976 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.6530612244897959, + "acc_stderr": 0.030472526026726496, + "acc_norm": 0.6530612244897959, + "acc_norm_stderr": 0.030472526026726496 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.7213930348258707, + "acc_stderr": 0.031700561834973086, + "acc_norm": 0.7213930348258707, + "acc_norm_stderr": 0.031700561834973086 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.83, + "acc_stderr": 0.03775251680686371, + "acc_norm": 0.83, + "acc_norm_stderr": 0.03775251680686371 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.4819277108433735, + "acc_stderr": 0.038899512528272166, + "acc_norm": 0.4819277108433735, + "acc_norm_stderr": 0.038899512528272166 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.783625730994152, + "acc_stderr": 0.031581495393387324, + "acc_norm": 0.783625730994152, + "acc_norm_stderr": 0.031581495393387324 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.3219094247246022, + "mc1_stderr": 0.016355567611960404, + "mc2": 0.46562168990109065, + "mc2_stderr": 0.015291610692060842 + }, + "all": { + "acc": 0.5668139361802185, + "acc_stderr": 0.03433655004935063, + "acc_norm": 0.5706940243762324, + "acc_norm_stderr": 0.03431449412200888, + "mc1": 0.3219094247246022, + "mc1_stderr": 0.016355567611960404, + 
"mc2": 0.46562168990109065, + "mc2_stderr": 0.015291610692060842 + } + }, + "versions": { + "harness|arc:challenge|25": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "all": 0 + }, + "config_tasks": { + "harness|arc:challenge": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + 
"harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task" + }, + "summary_tasks": { + "harness|arc:challenge|25": { + "hashes": { + "hash_examples": "17b0cae357c0259e", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "3722289b79076c44", + 
"hash_cont_tokens": "e8abf848493b50f7" + }, + "truncated": 0, + "non-truncated": 4687, + "padded": 4687, + "non-padded": 0, + "effective_few_shots": 25.0, + "num_truncated_few_shots": 0 + }, + "harness|hellaswag|10": { + "hashes": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "ececd684171f1ef2", + "hash_cont_tokens": "9fe0a5c42e1532db" + }, + "truncated": 0, + "non-truncated": 40168, + "padded": 40113, + "non-padded": 55, + "effective_few_shots": 10.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hashes": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "c54ff61ad0273dd7", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-anatomy|5": { + "hashes": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "be31a1e22aef5f90", + "hash_cont_tokens": "f11971a765cb609f" + }, + "truncated": 0, + "non-truncated": 540, + "padded": 540, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-astronomy|5": { + "hashes": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "277a7b1fad566940", + "hash_cont_tokens": "440a970fadecdc7b" + }, + "truncated": 0, + "non-truncated": 608, + "padded": 608, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-business_ethics|5": { + "hashes": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "ba552605bc116de5", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hashes": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "428c7563d0b98ab9", + "hash_cont_tokens": "7ecd60c25b9bfe5b" + }, + "truncated": 0, + "non-truncated": 1060, + "padded": 1060, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_biology|5": { + "hashes": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "da036601573942e2", + "hash_cont_tokens": "875cde3af7a0ee14" + }, + "truncated": 0, + "non-truncated": 576, + "padded": 576, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_chemistry|5": { + "hashes": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "94e0196d6aded13d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_computer_science|5": { + "hashes": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "6e4d0f4a8d36690b", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + 
"num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_mathematics|5": { + "hashes": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "614054d17109a25d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_medicine|5": { + "hashes": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "081bb2b524defd1c", + "hash_cont_tokens": "702fb6d82ff0d6ac" + }, + "truncated": 0, + "non-truncated": 692, + "padded": 692, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_physics|5": { + "hashes": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "5421d9a1af86cbd4", + "hash_cont_tokens": "f7b8097afc16a47c" + }, + "truncated": 0, + "non-truncated": 408, + "padded": 408, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-computer_security|5": { + "hashes": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "5e6b70ecb333cf18", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hashes": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "c2ef11a87264ceed", + "hash_cont_tokens": "aa0e8bc655f2f641" + }, + "truncated": 0, + "non-truncated": 940, + "padded": 940, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-econometrics|5": { + "hashes": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "ecaccd912a4c3978", + "hash_cont_tokens": "b1cc6e7e9fcd3827" + }, + "truncated": 0, + "non-truncated": 456, + "padded": 456, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hashes": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "1590c84291399be8", + "hash_cont_tokens": "2425a3f084a591ef" + }, + "truncated": 0, + "non-truncated": 580, + "padded": 580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hashes": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "3269597f715b0da1", + "hash_cont_tokens": "bd87bf0c060fd925" + }, + "truncated": 0, + "non-truncated": 1512, + "padded": 1512, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-formal_logic|5": { + "hashes": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "a2800d20f3ab8d7c", + "hash_cont_tokens": "eb8932890e0605db" + }, + "truncated": 0, + "non-truncated": 504, + "padded": 504, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-global_facts|5": { + "hashes": { + "hash_examples": "371d70d743b2b89b", + 
"hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "94ed44b3772505ad", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_biology|5": { + "hashes": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "24423acb928db768", + "hash_cont_tokens": "1ddcb86d28cde266" + }, + "truncated": 0, + "non-truncated": 1240, + "padded": 1240, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hashes": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "831ff35c474e5cef", + "hash_cont_tokens": "176c8dcff38c5f8f" + }, + "truncated": 0, + "non-truncated": 812, + "padded": 812, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hashes": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "a20a96b44dcc5b30", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hashes": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "5002f4ac8b1562ca", + "hash_cont_tokens": "674fc454bdc5ac93" + }, + "truncated": 0, + "non-truncated": 660, + "padded": 656, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_geography|5": { + "hashes": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "7c5547c7da5bc793", + "hash_cont_tokens": "03a5012b916274ea" + }, + "truncated": 0, + "non-truncated": 792, + "padded": 792, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hashes": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "f62991cb6a496b05", + "hash_cont_tokens": "873d2aab226ba1d8" + }, + "truncated": 0, + "non-truncated": 772, + "padded": 772, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hashes": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "4cef2aff6e3d59ed", + "hash_cont_tokens": "c583432ad27fcfe0" + }, + "truncated": 0, + "non-truncated": 1560, + "padded": 1560, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hashes": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "6e2577ea4082ed2b", + "hash_cont_tokens": "d7907b61bcb8c123" + }, + "truncated": 0, + "non-truncated": 1080, + "padded": 1080, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hashes": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": 
"c5fc9aeb1079c8e4", + "hash_cont_tokens": "f47f041de50333b9" + }, + "truncated": 0, + "non-truncated": 952, + "padded": 952, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_physics|5": { + "hashes": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "555fc385cffa84ca", + "hash_cont_tokens": "0d56317b3e5eedb5" + }, + "truncated": 0, + "non-truncated": 604, + "padded": 604, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hashes": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "febd23cbf9973b7f", + "hash_cont_tokens": "09ba1243e7390c0f" + }, + "truncated": 0, + "non-truncated": 2180, + "padded": 2180, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hashes": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "400e55b56ee6fbd7", + "hash_cont_tokens": "9cc29889c3d3f77d" + }, + "truncated": 0, + "non-truncated": 864, + "padded": 864, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hashes": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "c639cce12a46ebad", + "hash_cont_tokens": "cdd0b3dc06d933e5" + }, + "truncated": 0, + "non-truncated": 816, + "padded": 816, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hashes": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "b9762065cce6f3a6", + "hash_cont_tokens": "e02816433ff28daf" + }, + "truncated": 0, + "non-truncated": 948, + "padded": 948, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_aging|5": { + "hashes": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "541a75f071dcf579", + "hash_cont_tokens": "142a4a8a1138a214" + }, + "truncated": 0, + "non-truncated": 892, + "padded": 892, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_sexuality|5": { + "hashes": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "04269e5c5a257dd9", + "hash_cont_tokens": "bc54813e809b796d" + }, + "truncated": 0, + "non-truncated": 524, + "padded": 524, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-international_law|5": { + "hashes": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "d93ba9d9d38e4397", + "hash_cont_tokens": "8ea8c5ff76a15bca" + }, + "truncated": 0, + "non-truncated": 484, + "padded": 484, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-jurisprudence|5": { + "hashes": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "9eeaccd2698b4f5a", + "hash_cont_tokens": "e3a8cd951b6e3469" + }, + "truncated": 0, + "non-truncated": 432, + "padded": 432, + 
"non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hashes": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "b4f08f544f2b7576", + "hash_cont_tokens": "3e9e0bdc248fd88a" + }, + "truncated": 0, + "non-truncated": 652, + "padded": 648, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-machine_learning|5": { + "hashes": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "900c2a51f1174b9f", + "hash_cont_tokens": "55b12fb138c6a064" + }, + "truncated": 0, + "non-truncated": 448, + "padded": 448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-management|5": { + "hashes": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "6b36efb4689c6eca", + "hash_cont_tokens": "a01d6d39a83c4597" + }, + "truncated": 0, + "non-truncated": 412, + "padded": 412, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-marketing|5": { + "hashes": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "2aaac78a0cfed47a", + "hash_cont_tokens": "6aeaed4d823c98aa" + }, + "truncated": 0, + "non-truncated": 936, + "padded": 936, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-medical_genetics|5": { + "hashes": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "886ca823b41c094a", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-miscellaneous|5": { + "hashes": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "72fd71de7675e7d0", + "hash_cont_tokens": "9b0ab02a64603081" + }, + "truncated": 0, + "non-truncated": 3132, + "padded": 3132, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_disputes|5": { + "hashes": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "f3ca0dd8e7a1eb09", + "hash_cont_tokens": "3b8bbe9108e55ce9" + }, + "truncated": 0, + "non-truncated": 1384, + "padded": 1354, + "non-padded": 30, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hashes": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "3e793631e951f23c", + "hash_cont_tokens": "3e9bfc0362e97330" + }, + "truncated": 0, + "non-truncated": 3580, + "padded": 3580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-nutrition|5": { + "hashes": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "59753c2144ea93af", + "hash_cont_tokens": "23b2dc6ee2da4cfc" + }, + "truncated": 0, + "non-truncated": 1224, + "padded": 1224, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-philosophy|5": { + "hashes": { + "hash_examples": "9b455b7d72811cc8", 
+ "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "bd8d3dbed15a8c34", + "hash_cont_tokens": "9f6ff69d23a48783" + }, + "truncated": 0, + "non-truncated": 1244, + "padded": 1244, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-prehistory|5": { + "hashes": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "3573cd87facbb7c5", + "hash_cont_tokens": "d6458d743d875837" + }, + "truncated": 0, + "non-truncated": 1296, + "padded": 1296, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_accounting|5": { + "hashes": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "17e721bc1a7cbb47", + "hash_cont_tokens": "922a195f53a35662" + }, + "truncated": 0, + "non-truncated": 1128, + "padded": 1128, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_law|5": { + "hashes": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "c9f7583fff66d361", + "hash_cont_tokens": "2e590029ef41fbcd" + }, + "truncated": 0, + "non-truncated": 6136, + "padded": 6136, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_medicine|5": { + "hashes": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "40a933f829116f8d", + "hash_cont_tokens": "7cfee54dbddd5a98" + }, + "truncated": 0, + "non-truncated": 1088, + "padded": 1088, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_psychology|5": { + "hashes": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "0dfb73a8eb3f692c", + "hash_cont_tokens": "a86677b2a45c20e1" + }, + "truncated": 0, + "non-truncated": 2448, + "padded": 2448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-public_relations|5": { + "hashes": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "1710c6ba4c9f3cbd", + "hash_cont_tokens": "0d756ccaae031757" + }, + "truncated": 0, + "non-truncated": 440, + "padded": 440, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-security_studies|5": { + "hashes": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "32a03f1f22a6e103", + "hash_cont_tokens": "b2229bc2cfbf594b" + }, + "truncated": 0, + "non-truncated": 980, + "padded": 980, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-sociology|5": { + "hashes": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "828999f7624cbe7e", + "hash_cont_tokens": "c3a3bdfd177eed5b" + }, + "truncated": 0, + "non-truncated": 804, + "padded": 804, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hashes": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "42054621e718dbee", + "hash_cont_tokens": "50421e30bef398f9" + }, + 
"truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-virology|5": { + "hashes": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "6c4f0aa4dc859c04", + "hash_cont_tokens": "af8b3658088cb37f" + }, + "truncated": 0, + "non-truncated": 664, + "padded": 664, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-world_religions|5": { + "hashes": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "6c75d44e092ff24f", + "hash_cont_tokens": "060118bef6de4e0a" + }, + "truncated": 0, + "non-truncated": 684, + "padded": 684, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|truthfulqa:mc|0": { + "hashes": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "2738d7ed7075faa7", + "hash_cont_tokens": "f5da56a132aab151" + }, + "truncated": 0, + "non-truncated": 9996, + "padded": 9996, + "non-padded": 0, + "effective_few_shots": 0.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "d84d18e9a963753d", + "hash_full_prompts": "12b540783521a8e6", + "hash_input_tokens": "5c73a7dce6ccf737", + "hash_cont_tokens": "71d56183130fecbd" + }, + "total_evaluation_time_secondes": "6340.482659339905", + "truncated": 0, + "non-truncated": 111019, + "padded": 110926, + "non-padded": 93, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/PygmalionAI/mythalion-13b/results_2023-10-26T08-48-40.818758.json b/eval-results/PygmalionAI/mythalion-13b/results_2023-10-26T08-48-40.818758.json new file mode 100644 index 0000000000000000000000000000000000000000..f0dc6e617c92a7899e521e0cd8961970ae827271 --- /dev/null +++ b/eval-results/PygmalionAI/mythalion-13b/results_2023-10-26T08-48-40.818758.json @@ -0,0 +1,107 @@ +{ + "config_general": { + "model_name": "PygmalionAI/mythalion-13b", + "model_sha": "69b215c5aedd1d7601d06119e674b28e7754b569", + "model_size": "24.32 GB", + "model_dtype": "torch.float16", + "lighteval_sha": "0f318ecf002208468154899217b3ba7c6ae09374", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|drop|3": { + "em": 0.005243288590604027, + "em_stderr": 0.0007396052260778182, + "f1": 0.07011430369127479, + "f1_stderr": 0.0015312669887699872 + }, + "harness|gsm8k|5": { + "acc": 0.1326762699014405, + "acc_stderr": 0.009343929131442217 + }, + "harness|winogrande|5": { + "acc": 0.7742699289660616, + "acc_stderr": 0.011749626260902552 + }, + "all": { + "em": 0.005243288590604027, + "em_stderr": 0.0007396052260778182, + "f1": 0.07011430369127479, + "f1_stderr": 0.0015312669887699872, + "acc": 0.453473099433751, + "acc_stderr": 0.010546777696172384 + } + }, + "versions": { + "harness|drop|3": 1, + "harness|gsm8k|5": 0, + "harness|winogrande|5": 0, + "all": 0 + }, + "config_tasks": { + "harness|drop": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|drop|3": { + "hashes": { + "hash_examples": "1d27416e8324e9a3", + "hash_full_prompts": "a5513ff9a741b385", + "hash_input_tokens": "42076f0efbb50aa6", + "hash_cont_tokens": "a2e75d73b560624f" + }, + "truncated": 3, + "non-truncated": 
9533, + "padded": 0, + "non-padded": 9536, + "effective_few_shots": 3.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "bda342e47b5099b2", + "hash_cont_tokens": "0715dbc6a3029f9a" + }, + "truncated": 0, + "non-truncated": 1319, + "padded": 0, + "non-padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "c0bedf98cb040854", + "hash_cont_tokens": "f08975ad6f2d5864" + }, + "truncated": 0, + "non-truncated": 2534, + "padded": 2432, + "non-padded": 102, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "9b4d8993161e637d", + "hash_full_prompts": "08215e527b7e60a5", + "hash_input_tokens": "a12f3e3c934bd78b", + "hash_cont_tokens": "50f0c35c8835b7a8" + }, + "total_evaluation_time_secondes": "12331.343548059464", + "truncated": 3, + "non-truncated": 13386, + "padded": 2432, + "non-padded": 10957, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/PygmalionAI/pygmalion-1.3b/results_2023-07-19T14-47-14.842065.json b/eval-results/PygmalionAI/pygmalion-1.3b/results_2023-07-19T14-47-14.842065.json new file mode 100644 index 0000000000000000000000000000000000000000..adf632905c8a7abc29088852330b5beeddb3e061 --- /dev/null +++ b/eval-results/PygmalionAI/pygmalion-1.3b/results_2023-07-19T14-47-14.842065.json @@ -0,0 +1,871 @@ +{ + "results": { + "harness|arc:challenge|25": { + "acc": 0.2525597269624573, + "acc_stderr": 0.01269672898020771, + "acc_norm": 0.28071672354948807, + "acc_norm_stderr": 0.013131238126975572 + }, + "harness|hellaswag|10": { + "acc": 0.38458474407488547, + "acc_stderr": 0.0048550272483981445, + "acc_norm": 0.469627564230233, + "acc_norm_stderr": 0.004980566907790454 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.19, + "acc_stderr": 0.03942772444036625, + "acc_norm": 0.19, + "acc_norm_stderr": 0.03942772444036625 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.2222222222222222, + "acc_stderr": 0.035914440841969694, + "acc_norm": 0.2222222222222222, + "acc_norm_stderr": 0.035914440841969694 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.21710526315789475, + "acc_stderr": 0.03355045304882921, + "acc_norm": 0.21710526315789475, + "acc_norm_stderr": 0.03355045304882921 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.3, + "acc_stderr": 0.046056618647183814, + "acc_norm": 0.3, + "acc_norm_stderr": 0.046056618647183814 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.22264150943396227, + "acc_stderr": 0.025604233470899095, + "acc_norm": 0.22264150943396227, + "acc_norm_stderr": 0.025604233470899095 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.25, + "acc_stderr": 0.03621034121889507, + "acc_norm": 0.25, + "acc_norm_stderr": 0.03621034121889507 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.27, + "acc_stderr": 0.044619604333847394, + "acc_norm": 0.27, + "acc_norm_stderr": 0.044619604333847394 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.25, + "acc_stderr": 0.04351941398892446, + "acc_norm": 0.25, + "acc_norm_stderr": 0.04351941398892446 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.29, + "acc_stderr": 0.045604802157206845, + 
"acc_norm": 0.29, + "acc_norm_stderr": 0.045604802157206845 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.2658959537572254, + "acc_stderr": 0.0336876293225943, + "acc_norm": 0.2658959537572254, + "acc_norm_stderr": 0.0336876293225943 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.23529411764705882, + "acc_stderr": 0.04220773659171453, + "acc_norm": 0.23529411764705882, + "acc_norm_stderr": 0.04220773659171453 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.24, + "acc_stderr": 0.042923469599092816, + "acc_norm": 0.24, + "acc_norm_stderr": 0.042923469599092816 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.251063829787234, + "acc_stderr": 0.028346963777162452, + "acc_norm": 0.251063829787234, + "acc_norm_stderr": 0.028346963777162452 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.20175438596491227, + "acc_stderr": 0.037752050135836386, + "acc_norm": 0.20175438596491227, + "acc_norm_stderr": 0.037752050135836386 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.2620689655172414, + "acc_stderr": 0.036646663372252565, + "acc_norm": 0.2620689655172414, + "acc_norm_stderr": 0.036646663372252565 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.24338624338624337, + "acc_stderr": 0.02210112878741543, + "acc_norm": 0.24338624338624337, + "acc_norm_stderr": 0.02210112878741543 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.23809523809523808, + "acc_stderr": 0.03809523809523811, + "acc_norm": 0.23809523809523808, + "acc_norm_stderr": 0.03809523809523811 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.24, + "acc_stderr": 0.042923469599092816, + "acc_norm": 0.24, + "acc_norm_stderr": 0.042923469599092816 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.1935483870967742, + "acc_stderr": 0.02247525852553606, + "acc_norm": 0.1935483870967742, + "acc_norm_stderr": 0.02247525852553606 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.17733990147783252, + "acc_stderr": 0.026874337276808345, + "acc_norm": 0.17733990147783252, + "acc_norm_stderr": 0.026874337276808345 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.19, + "acc_stderr": 0.03942772444036622, + "acc_norm": 0.19, + "acc_norm_stderr": 0.03942772444036622 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.21818181818181817, + "acc_stderr": 0.032250781083062896, + "acc_norm": 0.21818181818181817, + "acc_norm_stderr": 0.032250781083062896 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.17676767676767677, + "acc_stderr": 0.027178752639044915, + "acc_norm": 0.17676767676767677, + "acc_norm_stderr": 0.027178752639044915 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.18134715025906736, + "acc_stderr": 0.02780703236068609, + "acc_norm": 0.18134715025906736, + "acc_norm_stderr": 0.02780703236068609 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.2205128205128205, + "acc_stderr": 0.02102067268082791, + "acc_norm": 0.2205128205128205, + "acc_norm_stderr": 0.02102067268082791 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.24814814814814815, + "acc_stderr": 0.0263357394040558, + "acc_norm": 0.24814814814814815, + "acc_norm_stderr": 0.0263357394040558 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.25630252100840334, + "acc_stderr": 0.02835962087053395, + "acc_norm": 0.25630252100840334, + 
"acc_norm_stderr": 0.02835962087053395 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.3576158940397351, + "acc_stderr": 0.03913453431177258, + "acc_norm": 0.3576158940397351, + "acc_norm_stderr": 0.03913453431177258 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.1981651376146789, + "acc_stderr": 0.017090573804217888, + "acc_norm": 0.1981651376146789, + "acc_norm_stderr": 0.017090573804217888 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.1712962962962963, + "acc_stderr": 0.025695341643824685, + "acc_norm": 0.1712962962962963, + "acc_norm_stderr": 0.025695341643824685 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.25, + "acc_stderr": 0.03039153369274154, + "acc_norm": 0.25, + "acc_norm_stderr": 0.03039153369274154 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.28270042194092826, + "acc_stderr": 0.02931281415395592, + "acc_norm": 0.28270042194092826, + "acc_norm_stderr": 0.02931281415395592 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.31390134529147984, + "acc_stderr": 0.031146796482972465, + "acc_norm": 0.31390134529147984, + "acc_norm_stderr": 0.031146796482972465 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.2595419847328244, + "acc_stderr": 0.03844876139785271, + "acc_norm": 0.2595419847328244, + "acc_norm_stderr": 0.03844876139785271 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.2644628099173554, + "acc_stderr": 0.040261875275912046, + "acc_norm": 0.2644628099173554, + "acc_norm_stderr": 0.040261875275912046 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.25925925925925924, + "acc_stderr": 0.042365112580946336, + "acc_norm": 0.25925925925925924, + "acc_norm_stderr": 0.042365112580946336 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.19631901840490798, + "acc_stderr": 0.031207970394709218, + "acc_norm": 0.19631901840490798, + "acc_norm_stderr": 0.031207970394709218 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.32142857142857145, + "acc_stderr": 0.04432804055291519, + "acc_norm": 0.32142857142857145, + "acc_norm_stderr": 0.04432804055291519 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.18446601941747573, + "acc_stderr": 0.03840423627288276, + "acc_norm": 0.18446601941747573, + "acc_norm_stderr": 0.03840423627288276 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.2863247863247863, + "acc_stderr": 0.029614323690456648, + "acc_norm": 0.2863247863247863, + "acc_norm_stderr": 0.029614323690456648 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.31, + "acc_stderr": 0.04648231987117316, + "acc_norm": 0.31, + "acc_norm_stderr": 0.04648231987117316 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.23754789272030652, + "acc_stderr": 0.015218733046150191, + "acc_norm": 0.23754789272030652, + "acc_norm_stderr": 0.015218733046150191 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.2398843930635838, + "acc_stderr": 0.022989592543123567, + "acc_norm": 0.2398843930635838, + "acc_norm_stderr": 0.022989592543123567 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.23687150837988827, + "acc_stderr": 0.014219570788103987, + "acc_norm": 0.23687150837988827, + "acc_norm_stderr": 0.014219570788103987 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.22875816993464052, + "acc_stderr": 0.024051029739912255, + "acc_norm": 0.22875816993464052, + "acc_norm_stderr": 0.024051029739912255 + }, + 
"harness|hendrycksTest-philosophy|5": { + "acc": 0.19935691318327975, + "acc_stderr": 0.022691033780549656, + "acc_norm": 0.19935691318327975, + "acc_norm_stderr": 0.022691033780549656 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.21296296296296297, + "acc_stderr": 0.0227797190887334, + "acc_norm": 0.21296296296296297, + "acc_norm_stderr": 0.0227797190887334 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.2198581560283688, + "acc_stderr": 0.024706141070705477, + "acc_norm": 0.2198581560283688, + "acc_norm_stderr": 0.024706141070705477 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.2503259452411995, + "acc_stderr": 0.01106415102716544, + "acc_norm": 0.2503259452411995, + "acc_norm_stderr": 0.01106415102716544 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.19117647058823528, + "acc_stderr": 0.02388688192244034, + "acc_norm": 0.19117647058823528, + "acc_norm_stderr": 0.02388688192244034 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.22712418300653595, + "acc_stderr": 0.016949853279212383, + "acc_norm": 0.22712418300653595, + "acc_norm_stderr": 0.016949853279212383 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.2545454545454545, + "acc_stderr": 0.041723430387053825, + "acc_norm": 0.2545454545454545, + "acc_norm_stderr": 0.041723430387053825 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.24489795918367346, + "acc_stderr": 0.027529637440174917, + "acc_norm": 0.24489795918367346, + "acc_norm_stderr": 0.027529637440174917 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.21890547263681592, + "acc_stderr": 0.029239174636647, + "acc_norm": 0.21890547263681592, + "acc_norm_stderr": 0.029239174636647 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.31, + "acc_stderr": 0.04648231987117316, + "acc_norm": 0.31, + "acc_norm_stderr": 0.04648231987117316 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.26506024096385544, + "acc_stderr": 0.03436024037944967, + "acc_norm": 0.26506024096385544, + "acc_norm_stderr": 0.03436024037944967 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.30409356725146197, + "acc_stderr": 0.03528211258245232, + "acc_norm": 0.30409356725146197, + "acc_norm_stderr": 0.03528211258245232 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.21542227662178703, + "mc1_stderr": 0.014391902652427674, + "mc2": 0.37636128873514013, + "mc2_stderr": 0.015435916407328693 + }, + "all": { + "acc": 0.24382493996301155, + "acc_stderr": 0.03121239851944796, + "acc_norm": 0.24574358092474674, + "acc_norm_stderr": 0.031221890872094742, + "mc1": 0.21542227662178703, + "mc1_stderr": 0.014391902652427674, + "mc2": 0.37636128873514013, + "mc2_stderr": 0.015435916407328693 + } + }, + "versions": { + "harness|arc:challenge|25": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + 
"harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "all": 0 + }, + "config": { + "model_name": "PygmalionAI/pygmalion-1.3b", + "model_sha": "bef2c90128c00ff6f16c0f397463423b7d988e17", + "model_dtype": "torch.float16", + "lighteval_sha": "43cff840721bd0214adb4e29236a5e2ca1813937", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null + }, + "task_config": { + "harness|arc:challenge": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + 
"harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task" + }, + "hashes": { + "harness|arc:challenge|25": { + "hash_examples": "fb8c51b1872daeda", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "573b1b078b6e9deb", + "hash_cont_tokens": "22424bcffb42ecdf" + }, + "harness|hellaswag|10": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "f0fd0caf4d4c1110", + "hash_cont_tokens": "62a15ef112ea07d6" + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "f076ac6b177ca28c", + "hash_cont_tokens": "74c639e56bb475af" + }, + "harness|hendrycksTest-anatomy|5": { + "hash_examples": "2f83a4f1cab4ba18", + 
"hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "059827606e6b0780", + "hash_cont_tokens": "ec7e2288ab5f1ce9" + }, + "harness|hendrycksTest-astronomy|5": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "1dd0dab88aa9e4b2", + "hash_cont_tokens": "d7e922da5bc6d1bf" + }, + "harness|hendrycksTest-business_ethics|5": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "d51eb5246cbe2173", + "hash_cont_tokens": "08933598b321179c" + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "2337a7f17800c6ec", + "hash_cont_tokens": "bc82b3cc5072f164" + }, + "harness|hendrycksTest-college_biology|5": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "e394ebbb8ceace76", + "hash_cont_tokens": "3bc45e0c4b6d612d" + }, + "harness|hendrycksTest-college_chemistry|5": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "9221fbdf710a6f67", + "hash_cont_tokens": "74c639e56bb475af" + }, + "harness|hendrycksTest-college_computer_science|5": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "ebe2748d21b2ba41", + "hash_cont_tokens": "d839b8186e0f3d94" + }, + "harness|hendrycksTest-college_mathematics|5": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "bfecefb08ffb7faa", + "hash_cont_tokens": "3c16f9c45a7a7272" + }, + "harness|hendrycksTest-college_medicine|5": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "2ac8aec9025dc58b", + "hash_cont_tokens": "16f654508cdc19c4" + }, + "harness|hendrycksTest-college_physics|5": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "faf44c77f43368ef", + "hash_cont_tokens": "a3a24586c7218684" + }, + "harness|hendrycksTest-computer_security|5": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "280c7f12abde10a5", + "hash_cont_tokens": "74c639e56bb475af" + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "217a841c86d2d992", + "hash_cont_tokens": "43818b3dc0c7496f" + }, + "harness|hendrycksTest-econometrics|5": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "354267c0f98aad3b", + "hash_cont_tokens": "4f0a3e41169314a8" + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "4f5e8d051d04dde0", + "hash_cont_tokens": "7e14ccd1e2688bb8" + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "cd12bec1d5448dda", + "hash_cont_tokens": "317e29ee6bba387d" + }, + "harness|hendrycksTest-formal_logic|5": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "c549e395850984fe", + "hash_cont_tokens": "c01a9b75f55e32e0" + }, + "harness|hendrycksTest-global_facts|5": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + 
"hash_input_tokens": "81b06f5caa221f97", + "hash_cont_tokens": "74c639e56bb475af" + }, + "harness|hendrycksTest-high_school_biology|5": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "ad626d781102fe51", + "hash_cont_tokens": "edb2063e955bd5ca" + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "2c0d3f2eacc6bbd5", + "hash_cont_tokens": "8000de09bc1dc113" + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "aada51d0571db37b", + "hash_cont_tokens": "dcd6a0ada4ab8e0b" + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "6e47d696116edd01", + "hash_cont_tokens": "47a5e5973f50fe17" + }, + "harness|hendrycksTest-high_school_geography|5": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "0e8ee6c9e572e3c4", + "hash_cont_tokens": "812f79117b9593de" + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "8fa2bf90de3b07e7", + "hash_cont_tokens": "b4c405890ebd3ee1" + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "fabb8f176276af2f", + "hash_cont_tokens": "8d468d84a686647d" + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "3e86d13ef021476a", + "hash_cont_tokens": "e5d02f8f1c5dcf31" + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "a132b5e9c9531b36", + "hash_cont_tokens": "4c32e38c066727bc" + }, + "harness|hendrycksTest-high_school_physics|5": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "f8f6fe5143776cb4", + "hash_cont_tokens": "9416ad85fd6f4a2c" + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "e28121967b27a315", + "hash_cont_tokens": "57cc212706ddcdf4" + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "bdbe90efb4a1c4ce", + "hash_cont_tokens": "8c5c954092a64343" + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "b8f58f05dc082011", + "hash_cont_tokens": "e5ab34a54e3f5b7c" + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "3af911bf93093a85", + "hash_cont_tokens": "f3276c80ce1b205b" + }, + "harness|hendrycksTest-human_aging|5": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "1dd2240eb90b9a70", + "hash_cont_tokens": "7982edf99219e1b0" + }, + "harness|hendrycksTest-human_sexuality|5": { + "hash_examples": 
"7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "f3de2f8181824a79", + "hash_cont_tokens": "ed73d516c5552dd0" + }, + "harness|hendrycksTest-international_law|5": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "0c2a1dd63cc74137", + "hash_cont_tokens": "549d9b32b8a90e4e" + }, + "harness|hendrycksTest-jurisprudence|5": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "08e3527985f33aab", + "hash_cont_tokens": "ddf5241e450210d6" + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "bf7216a648529f68", + "hash_cont_tokens": "eb791fcbee9e0682" + }, + "harness|hendrycksTest-machine_learning|5": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "28f5891c956afd65", + "hash_cont_tokens": "c66b1f3b46001b09" + }, + "harness|hendrycksTest-management|5": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "6de88b824d4f64c3", + "hash_cont_tokens": "27795e9c98bdeda8" + }, + "harness|hendrycksTest-marketing|5": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "5ef855d01044fd83", + "hash_cont_tokens": "874c5b0b496cbe8a" + }, + "harness|hendrycksTest-medical_genetics|5": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "1840e0b96d7e619e", + "hash_cont_tokens": "74c639e56bb475af" + }, + "harness|hendrycksTest-miscellaneous|5": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "02483f6b53dc13ac", + "hash_cont_tokens": "313ee361fbdbab3c" + }, + "harness|hendrycksTest-moral_disputes|5": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "93202e79d594dde4", + "hash_cont_tokens": "fe7747dc69c4909e" + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "41c03f41d2ba9fe7", + "hash_cont_tokens": "e0d0ad58a3f1ff22" + }, + "harness|hendrycksTest-nutrition|5": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "d83bcb6dd08809ac", + "hash_cont_tokens": "c55a10a018de0228" + }, + "harness|hendrycksTest-philosophy|5": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "65c70474c8a5d205", + "hash_cont_tokens": "7916d26928435f1a" + }, + "harness|hendrycksTest-prehistory|5": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "4d4126ac9a91ac47", + "hash_cont_tokens": "81836c52a10e6ffd" + }, + "harness|hendrycksTest-professional_accounting|5": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "592f80ad364d686a", + "hash_cont_tokens": "f5d669014a273483" + }, + "harness|hendrycksTest-professional_law|5": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "7f837322b1b62ac1", + "hash_cont_tokens": "6b31cf265df9b81b" + }, + "harness|hendrycksTest-professional_medicine|5": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + 
"hash_input_tokens": "05a8ef0dd10b4bba", + "hash_cont_tokens": "4b3ac60441ad14ec" + }, + "harness|hendrycksTest-professional_psychology|5": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "3c7944f0b2c49f64", + "hash_cont_tokens": "f139af481f2a9e74" + }, + "harness|hendrycksTest-public_relations|5": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "637e934bb716d5ec", + "hash_cont_tokens": "ca79966b90cda0ea" + }, + "harness|hendrycksTest-security_studies|5": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "3bad229573ed6a9c", + "hash_cont_tokens": "952a2e479fc3a83e" + }, + "harness|hendrycksTest-sociology|5": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "70a479e96d02d5d8", + "hash_cont_tokens": "f49476cf49b37d7c" + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "0d690fc0db462440", + "hash_cont_tokens": "74c639e56bb475af" + }, + "harness|hendrycksTest-virology|5": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "4b0fdf8e692dd640", + "hash_cont_tokens": "0065c4bbe6134c1c" + }, + "harness|hendrycksTest-world_religions|5": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "cfd7092dc8aacd96", + "hash_cont_tokens": "9a178e9ec050bf3e" + }, + "harness|truthfulqa:mc|0": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "e820abadeb7ebfb3", + "hash_cont_tokens": "7f48ddfffa64eb41" + } + } +} \ No newline at end of file diff --git a/eval-results/PygmalionAI/pygmalion-1.3b/results_2023-10-15T07-13-21.177207.json b/eval-results/PygmalionAI/pygmalion-1.3b/results_2023-10-15T07-13-21.177207.json new file mode 100644 index 0000000000000000000000000000000000000000..8d4a135be6ea6482946a4edbd957ca5e6517fcfb --- /dev/null +++ b/eval-results/PygmalionAI/pygmalion-1.3b/results_2023-10-15T07-13-21.177207.json @@ -0,0 +1,107 @@ +{ + "config_general": { + "model_name": "PygmalionAI/pygmalion-1.3b", + "model_sha": "bef2c90128c00ff6f16c0f397463423b7d988e17", + "model_size": "2.65 GB", + "model_dtype": "torch.float16", + "lighteval_sha": "0f318ecf002208468154899217b3ba7c6ae09374", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|drop|3": { + "em": 0.02946728187919463, + "em_stderr": 0.0017318679706719317, + "f1": 0.06647021812080542, + "f1_stderr": 0.002046982940584873 + }, + "harness|gsm8k|5": { + "acc": 0.0, + "acc_stderr": 0.0 + }, + "harness|winogrande|5": { + "acc": 0.500394632991318, + "acc_stderr": 0.014052481306049516 + }, + "all": { + "em": 0.02946728187919463, + "em_stderr": 0.0017318679706719317, + "f1": 0.06647021812080542, + "f1_stderr": 0.002046982940584873, + "acc": 0.250197316495659, + "acc_stderr": 0.007026240653024758 + } + }, + "versions": { + "harness|drop|3": 1, + "harness|gsm8k|5": 0, + "harness|winogrande|5": 0, + "all": 0 + }, + "config_tasks": { + "harness|drop": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|drop|3": { + "hashes": { + "hash_examples": "1d27416e8324e9a3", + "hash_full_prompts": 
"a5513ff9a741b385", + "hash_input_tokens": "4bf3f6ba1bae765a", + "hash_cont_tokens": "77c022d5743695ee" + }, + "truncated": 439, + "non-truncated": 9097, + "padded": 0, + "non-padded": 9536, + "effective_few_shots": 3.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "ef516f9ffbe76423", + "hash_cont_tokens": "7a889e46f3226053" + }, + "truncated": 0, + "non-truncated": 1319, + "padded": 0, + "non-padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "c469718508f43cab", + "hash_cont_tokens": "87eeb79172195781" + }, + "truncated": 0, + "non-truncated": 2534, + "padded": 2456, + "non-padded": 78, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "9b4d8993161e637d", + "hash_full_prompts": "08215e527b7e60a5", + "hash_input_tokens": "401c6c49053f17ab", + "hash_cont_tokens": "4113f538b15422a0" + }, + "total_evaluation_time_secondes": "4024.7635774612427", + "truncated": 439, + "non-truncated": 12950, + "padded": 2456, + "non-padded": 10933, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/PygmalionAI/pygmalion-2-13b/results_2023-10-03T17-49-20.721820.json b/eval-results/PygmalionAI/pygmalion-2-13b/results_2023-10-03T17-49-20.721820.json new file mode 100644 index 0000000000000000000000000000000000000000..335f755c73cf34906abf63a9b3602504413bc364 --- /dev/null +++ b/eval-results/PygmalionAI/pygmalion-2-13b/results_2023-10-03T17-49-20.721820.json @@ -0,0 +1,1367 @@ +{ + "config_general": { + "model_name": "PygmalionAI/pygmalion-2-13b", + "model_sha": "3cdc103995ccd5fc7fd2cb5f51f71b510466f5fc", + "model_size": "24.32 GB", + "model_dtype": "torch.float16", + "lighteval_sha": "0f318ecf002208468154899217b3ba7c6ae09374", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|arc:challenge|25": { + "acc": 0.568259385665529, + "acc_stderr": 0.014474591427196206, + "acc_norm": 0.6032423208191127, + "acc_norm_stderr": 0.014296513020180646 + }, + "harness|hellaswag|10": { + "acc": 0.6139215295757817, + "acc_stderr": 0.004858539527872461, + "acc_norm": 0.8237402907787293, + "acc_norm_stderr": 0.0038026223415290133 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.33, + "acc_stderr": 0.04725815626252606, + "acc_norm": 0.33, + "acc_norm_stderr": 0.04725815626252606 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.48148148148148145, + "acc_stderr": 0.043163785995113245, + "acc_norm": 0.48148148148148145, + "acc_norm_stderr": 0.043163785995113245 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.5328947368421053, + "acc_stderr": 0.040601270352363966, + "acc_norm": 0.5328947368421053, + "acc_norm_stderr": 0.040601270352363966 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.53, + "acc_stderr": 0.05016135580465919, + "acc_norm": 0.53, + "acc_norm_stderr": 0.05016135580465919 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.6415094339622641, + "acc_stderr": 0.029514703583981762, + "acc_norm": 0.6415094339622641, + "acc_norm_stderr": 0.029514703583981762 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.6041666666666666, + "acc_stderr": 
0.04089465449325582, + "acc_norm": 0.6041666666666666, + "acc_norm_stderr": 0.04089465449325582 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.37, + "acc_stderr": 0.048523658709391, + "acc_norm": 0.37, + "acc_norm_stderr": 0.048523658709391 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.5, + "acc_stderr": 0.050251890762960605, + "acc_norm": 0.5, + "acc_norm_stderr": 0.050251890762960605 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.36, + "acc_stderr": 0.04824181513244218, + "acc_norm": 0.36, + "acc_norm_stderr": 0.04824181513244218 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.5433526011560693, + "acc_stderr": 0.03798106566014498, + "acc_norm": 0.5433526011560693, + "acc_norm_stderr": 0.03798106566014498 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.27450980392156865, + "acc_stderr": 0.04440521906179327, + "acc_norm": 0.27450980392156865, + "acc_norm_stderr": 0.04440521906179327 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.69, + "acc_stderr": 0.04648231987117316, + "acc_norm": 0.69, + "acc_norm_stderr": 0.04648231987117316 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.4127659574468085, + "acc_stderr": 0.03218471141400351, + "acc_norm": 0.4127659574468085, + "acc_norm_stderr": 0.03218471141400351 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.3157894736842105, + "acc_stderr": 0.043727482902780064, + "acc_norm": 0.3157894736842105, + "acc_norm_stderr": 0.043727482902780064 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.5172413793103449, + "acc_stderr": 0.04164188720169375, + "acc_norm": 0.5172413793103449, + "acc_norm_stderr": 0.04164188720169375 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.3306878306878307, + "acc_stderr": 0.024229965298425072, + "acc_norm": 0.3306878306878307, + "acc_norm_stderr": 0.024229965298425072 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.38095238095238093, + "acc_stderr": 0.043435254289490965, + "acc_norm": 0.38095238095238093, + "acc_norm_stderr": 0.043435254289490965 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.35, + "acc_stderr": 0.047937248544110196, + "acc_norm": 0.35, + "acc_norm_stderr": 0.047937248544110196 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.6516129032258065, + "acc_stderr": 0.027104826328100944, + "acc_norm": 0.6516129032258065, + "acc_norm_stderr": 0.027104826328100944 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.47783251231527096, + "acc_stderr": 0.03514528562175008, + "acc_norm": 0.47783251231527096, + "acc_norm_stderr": 0.03514528562175008 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.54, + "acc_stderr": 0.05009082659620333, + "acc_norm": 0.54, + "acc_norm_stderr": 0.05009082659620333 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.6787878787878788, + "acc_stderr": 0.0364620496325381, + "acc_norm": 0.6787878787878788, + "acc_norm_stderr": 0.0364620496325381 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.7070707070707071, + "acc_stderr": 0.032424979581788166, + "acc_norm": 0.7070707070707071, + "acc_norm_stderr": 0.032424979581788166 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.8031088082901554, + "acc_stderr": 0.028697873971860688, + "acc_norm": 0.8031088082901554, + "acc_norm_stderr": 0.028697873971860688 + }, + 
"harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.5153846153846153, + "acc_stderr": 0.025339003010106515, + "acc_norm": 0.5153846153846153, + "acc_norm_stderr": 0.025339003010106515 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.27037037037037037, + "acc_stderr": 0.02708037281514565, + "acc_norm": 0.27037037037037037, + "acc_norm_stderr": 0.02708037281514565 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.5588235294117647, + "acc_stderr": 0.0322529423239964, + "acc_norm": 0.5588235294117647, + "acc_norm_stderr": 0.0322529423239964 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.31788079470198677, + "acc_stderr": 0.038020397601079024, + "acc_norm": 0.31788079470198677, + "acc_norm_stderr": 0.038020397601079024 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.7504587155963303, + "acc_stderr": 0.018553897629501628, + "acc_norm": 0.7504587155963303, + "acc_norm_stderr": 0.018553897629501628 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.4027777777777778, + "acc_stderr": 0.033448873829978666, + "acc_norm": 0.4027777777777778, + "acc_norm_stderr": 0.033448873829978666 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.75, + "acc_stderr": 0.03039153369274154, + "acc_norm": 0.75, + "acc_norm_stderr": 0.03039153369274154 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.759493670886076, + "acc_stderr": 0.027820781981149685, + "acc_norm": 0.759493670886076, + "acc_norm_stderr": 0.027820781981149685 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.6547085201793722, + "acc_stderr": 0.03191100192835794, + "acc_norm": 0.6547085201793722, + "acc_norm_stderr": 0.03191100192835794 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.6259541984732825, + "acc_stderr": 0.042438692422305246, + "acc_norm": 0.6259541984732825, + "acc_norm_stderr": 0.042438692422305246 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.743801652892562, + "acc_stderr": 0.03984979653302872, + "acc_norm": 0.743801652892562, + "acc_norm_stderr": 0.03984979653302872 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.7129629629629629, + "acc_stderr": 0.043733130409147614, + "acc_norm": 0.7129629629629629, + "acc_norm_stderr": 0.043733130409147614 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.6993865030674846, + "acc_stderr": 0.03602511318806771, + "acc_norm": 0.6993865030674846, + "acc_norm_stderr": 0.03602511318806771 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.3392857142857143, + "acc_stderr": 0.04493949068613539, + "acc_norm": 0.3392857142857143, + "acc_norm_stderr": 0.04493949068613539 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.7087378640776699, + "acc_stderr": 0.044986763205729224, + "acc_norm": 0.7087378640776699, + "acc_norm_stderr": 0.044986763205729224 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.8333333333333334, + "acc_stderr": 0.024414947304543678, + "acc_norm": 0.8333333333333334, + "acc_norm_stderr": 0.024414947304543678 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.58, + "acc_stderr": 0.049604496374885836, + "acc_norm": 0.58, + "acc_norm_stderr": 0.049604496374885836 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.7637292464878672, + "acc_stderr": 0.01519047371703751, + "acc_norm": 0.7637292464878672, + "acc_norm_stderr": 0.01519047371703751 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 
0.638728323699422, + "acc_stderr": 0.025862201852277895, + "acc_norm": 0.638728323699422, + "acc_norm_stderr": 0.025862201852277895 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.4122905027932961, + "acc_stderr": 0.01646320023811452, + "acc_norm": 0.4122905027932961, + "acc_norm_stderr": 0.01646320023811452 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.6274509803921569, + "acc_stderr": 0.027684181883302898, + "acc_norm": 0.6274509803921569, + "acc_norm_stderr": 0.027684181883302898 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.6463022508038585, + "acc_stderr": 0.02715520810320086, + "acc_norm": 0.6463022508038585, + "acc_norm_stderr": 0.02715520810320086 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.6358024691358025, + "acc_stderr": 0.026774929899722327, + "acc_norm": 0.6358024691358025, + "acc_norm_stderr": 0.026774929899722327 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.3900709219858156, + "acc_stderr": 0.029097675599463926, + "acc_norm": 0.3900709219858156, + "acc_norm_stderr": 0.029097675599463926 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.41199478487614083, + "acc_stderr": 0.01257087103214607, + "acc_norm": 0.41199478487614083, + "acc_norm_stderr": 0.01257087103214607 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.5514705882352942, + "acc_stderr": 0.030211479609121596, + "acc_norm": 0.5514705882352942, + "acc_norm_stderr": 0.030211479609121596 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.565359477124183, + "acc_stderr": 0.02005426920072646, + "acc_norm": 0.565359477124183, + "acc_norm_stderr": 0.02005426920072646 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.6181818181818182, + "acc_stderr": 0.046534298079135075, + "acc_norm": 0.6181818181818182, + "acc_norm_stderr": 0.046534298079135075 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.6408163265306123, + "acc_stderr": 0.030713560455108493, + "acc_norm": 0.6408163265306123, + "acc_norm_stderr": 0.030713560455108493 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.736318407960199, + "acc_stderr": 0.031157150869355558, + "acc_norm": 0.736318407960199, + "acc_norm_stderr": 0.031157150869355558 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.81, + "acc_stderr": 0.039427724440366255, + "acc_norm": 0.81, + "acc_norm_stderr": 0.039427724440366255 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.4578313253012048, + "acc_stderr": 0.0387862677100236, + "acc_norm": 0.4578313253012048, + "acc_norm_stderr": 0.0387862677100236 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.7777777777777778, + "acc_stderr": 0.03188578017686398, + "acc_norm": 0.7777777777777778, + "acc_norm_stderr": 0.03188578017686398 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.27906976744186046, + "mc1_stderr": 0.015702107090627904, + "mc2": 0.4222097304728649, + "mc2_stderr": 0.014406750015914481 + }, + "all": { + "acc": 0.5612445914530442, + "acc_stderr": 0.034411456369991295, + "acc_norm": 0.5653937727472228, + "acc_norm_stderr": 0.03439054119044284, + "mc1": 0.27906976744186046, + "mc1_stderr": 0.015702107090627904, + "mc2": 0.4222097304728649, + "mc2_stderr": 0.014406750015914481 + } + }, + "versions": { + "harness|arc:challenge|25": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + 
"harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "all": 0 + }, + "config_tasks": { + "harness|arc:challenge": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + 
"harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task" + }, + "summary_tasks": { + "harness|arc:challenge|25": { + "hashes": { + "hash_examples": "17b0cae357c0259e", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "3722289b79076c44", + "hash_cont_tokens": "e8abf848493b50f7" + }, + "truncated": 0, + "non-truncated": 4687, + "padded": 4687, + "non-padded": 0, + "effective_few_shots": 25.0, + "num_truncated_few_shots": 0 + }, + "harness|hellaswag|10": { + "hashes": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": 
"ececd684171f1ef2", + "hash_cont_tokens": "9fe0a5c42e1532db" + }, + "truncated": 0, + "non-truncated": 40168, + "padded": 40113, + "non-padded": 55, + "effective_few_shots": 10.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hashes": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "c54ff61ad0273dd7", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-anatomy|5": { + "hashes": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "be31a1e22aef5f90", + "hash_cont_tokens": "f11971a765cb609f" + }, + "truncated": 0, + "non-truncated": 540, + "padded": 540, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-astronomy|5": { + "hashes": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "277a7b1fad566940", + "hash_cont_tokens": "440a970fadecdc7b" + }, + "truncated": 0, + "non-truncated": 608, + "padded": 608, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-business_ethics|5": { + "hashes": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "ba552605bc116de5", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hashes": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "428c7563d0b98ab9", + "hash_cont_tokens": "7ecd60c25b9bfe5b" + }, + "truncated": 0, + "non-truncated": 1060, + "padded": 1060, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_biology|5": { + "hashes": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "da036601573942e2", + "hash_cont_tokens": "875cde3af7a0ee14" + }, + "truncated": 0, + "non-truncated": 576, + "padded": 576, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_chemistry|5": { + "hashes": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "94e0196d6aded13d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_computer_science|5": { + "hashes": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "6e4d0f4a8d36690b", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_mathematics|5": { + "hashes": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "614054d17109a25d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + 
"effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_medicine|5": { + "hashes": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "081bb2b524defd1c", + "hash_cont_tokens": "702fb6d82ff0d6ac" + }, + "truncated": 0, + "non-truncated": 692, + "padded": 692, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_physics|5": { + "hashes": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "5421d9a1af86cbd4", + "hash_cont_tokens": "f7b8097afc16a47c" + }, + "truncated": 0, + "non-truncated": 408, + "padded": 408, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-computer_security|5": { + "hashes": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "5e6b70ecb333cf18", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hashes": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "c2ef11a87264ceed", + "hash_cont_tokens": "aa0e8bc655f2f641" + }, + "truncated": 0, + "non-truncated": 940, + "padded": 940, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-econometrics|5": { + "hashes": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "ecaccd912a4c3978", + "hash_cont_tokens": "b1cc6e7e9fcd3827" + }, + "truncated": 0, + "non-truncated": 456, + "padded": 456, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hashes": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "1590c84291399be8", + "hash_cont_tokens": "2425a3f084a591ef" + }, + "truncated": 0, + "non-truncated": 580, + "padded": 580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hashes": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "3269597f715b0da1", + "hash_cont_tokens": "bd87bf0c060fd925" + }, + "truncated": 0, + "non-truncated": 1512, + "padded": 1512, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-formal_logic|5": { + "hashes": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "a2800d20f3ab8d7c", + "hash_cont_tokens": "eb8932890e0605db" + }, + "truncated": 0, + "non-truncated": 504, + "padded": 504, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-global_facts|5": { + "hashes": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "94ed44b3772505ad", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_biology|5": { + "hashes": { + "hash_examples": 
"a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "24423acb928db768", + "hash_cont_tokens": "1ddcb86d28cde266" + }, + "truncated": 0, + "non-truncated": 1240, + "padded": 1240, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hashes": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "831ff35c474e5cef", + "hash_cont_tokens": "176c8dcff38c5f8f" + }, + "truncated": 0, + "non-truncated": 812, + "padded": 812, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hashes": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "a20a96b44dcc5b30", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hashes": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "5002f4ac8b1562ca", + "hash_cont_tokens": "674fc454bdc5ac93" + }, + "truncated": 0, + "non-truncated": 660, + "padded": 656, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_geography|5": { + "hashes": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "7c5547c7da5bc793", + "hash_cont_tokens": "03a5012b916274ea" + }, + "truncated": 0, + "non-truncated": 792, + "padded": 792, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hashes": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "f62991cb6a496b05", + "hash_cont_tokens": "873d2aab226ba1d8" + }, + "truncated": 0, + "non-truncated": 772, + "padded": 772, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hashes": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "4cef2aff6e3d59ed", + "hash_cont_tokens": "c583432ad27fcfe0" + }, + "truncated": 0, + "non-truncated": 1560, + "padded": 1560, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hashes": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "6e2577ea4082ed2b", + "hash_cont_tokens": "d7907b61bcb8c123" + }, + "truncated": 0, + "non-truncated": 1080, + "padded": 1080, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hashes": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "c5fc9aeb1079c8e4", + "hash_cont_tokens": "f47f041de50333b9" + }, + "truncated": 0, + "non-truncated": 952, + "padded": 952, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_physics|5": { + "hashes": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + 
"hash_input_tokens": "555fc385cffa84ca", + "hash_cont_tokens": "0d56317b3e5eedb5" + }, + "truncated": 0, + "non-truncated": 604, + "padded": 604, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hashes": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "febd23cbf9973b7f", + "hash_cont_tokens": "09ba1243e7390c0f" + }, + "truncated": 0, + "non-truncated": 2180, + "padded": 2180, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hashes": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "400e55b56ee6fbd7", + "hash_cont_tokens": "9cc29889c3d3f77d" + }, + "truncated": 0, + "non-truncated": 864, + "padded": 864, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hashes": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "c639cce12a46ebad", + "hash_cont_tokens": "cdd0b3dc06d933e5" + }, + "truncated": 0, + "non-truncated": 816, + "padded": 816, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hashes": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "b9762065cce6f3a6", + "hash_cont_tokens": "e02816433ff28daf" + }, + "truncated": 0, + "non-truncated": 948, + "padded": 948, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_aging|5": { + "hashes": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "541a75f071dcf579", + "hash_cont_tokens": "142a4a8a1138a214" + }, + "truncated": 0, + "non-truncated": 892, + "padded": 892, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_sexuality|5": { + "hashes": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "04269e5c5a257dd9", + "hash_cont_tokens": "bc54813e809b796d" + }, + "truncated": 0, + "non-truncated": 524, + "padded": 524, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-international_law|5": { + "hashes": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "d93ba9d9d38e4397", + "hash_cont_tokens": "8ea8c5ff76a15bca" + }, + "truncated": 0, + "non-truncated": 484, + "padded": 484, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-jurisprudence|5": { + "hashes": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "9eeaccd2698b4f5a", + "hash_cont_tokens": "e3a8cd951b6e3469" + }, + "truncated": 0, + "non-truncated": 432, + "padded": 432, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hashes": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "b4f08f544f2b7576", + "hash_cont_tokens": "3e9e0bdc248fd88a" + }, + "truncated": 0, + "non-truncated": 652, + 
"padded": 648, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-machine_learning|5": { + "hashes": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "900c2a51f1174b9f", + "hash_cont_tokens": "55b12fb138c6a064" + }, + "truncated": 0, + "non-truncated": 448, + "padded": 448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-management|5": { + "hashes": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "6b36efb4689c6eca", + "hash_cont_tokens": "a01d6d39a83c4597" + }, + "truncated": 0, + "non-truncated": 412, + "padded": 412, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-marketing|5": { + "hashes": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "2aaac78a0cfed47a", + "hash_cont_tokens": "6aeaed4d823c98aa" + }, + "truncated": 0, + "non-truncated": 936, + "padded": 936, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-medical_genetics|5": { + "hashes": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "886ca823b41c094a", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-miscellaneous|5": { + "hashes": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "72fd71de7675e7d0", + "hash_cont_tokens": "9b0ab02a64603081" + }, + "truncated": 0, + "non-truncated": 3132, + "padded": 3132, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_disputes|5": { + "hashes": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "f3ca0dd8e7a1eb09", + "hash_cont_tokens": "3b8bbe9108e55ce9" + }, + "truncated": 0, + "non-truncated": 1384, + "padded": 1354, + "non-padded": 30, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hashes": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "3e793631e951f23c", + "hash_cont_tokens": "3e9bfc0362e97330" + }, + "truncated": 0, + "non-truncated": 3580, + "padded": 3580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-nutrition|5": { + "hashes": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "59753c2144ea93af", + "hash_cont_tokens": "23b2dc6ee2da4cfc" + }, + "truncated": 0, + "non-truncated": 1224, + "padded": 1224, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-philosophy|5": { + "hashes": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "bd8d3dbed15a8c34", + "hash_cont_tokens": "9f6ff69d23a48783" + }, + "truncated": 0, + "non-truncated": 1244, + "padded": 1244, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-prehistory|5": { + "hashes": { + "hash_examples": 
"8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "3573cd87facbb7c5", + "hash_cont_tokens": "d6458d743d875837" + }, + "truncated": 0, + "non-truncated": 1296, + "padded": 1296, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_accounting|5": { + "hashes": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "17e721bc1a7cbb47", + "hash_cont_tokens": "922a195f53a35662" + }, + "truncated": 0, + "non-truncated": 1128, + "padded": 1128, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_law|5": { + "hashes": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "c9f7583fff66d361", + "hash_cont_tokens": "2e590029ef41fbcd" + }, + "truncated": 0, + "non-truncated": 6136, + "padded": 6136, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_medicine|5": { + "hashes": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "40a933f829116f8d", + "hash_cont_tokens": "7cfee54dbddd5a98" + }, + "truncated": 0, + "non-truncated": 1088, + "padded": 1088, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_psychology|5": { + "hashes": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "0dfb73a8eb3f692c", + "hash_cont_tokens": "a86677b2a45c20e1" + }, + "truncated": 0, + "non-truncated": 2448, + "padded": 2448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-public_relations|5": { + "hashes": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "1710c6ba4c9f3cbd", + "hash_cont_tokens": "0d756ccaae031757" + }, + "truncated": 0, + "non-truncated": 440, + "padded": 440, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-security_studies|5": { + "hashes": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "32a03f1f22a6e103", + "hash_cont_tokens": "b2229bc2cfbf594b" + }, + "truncated": 0, + "non-truncated": 980, + "padded": 980, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-sociology|5": { + "hashes": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "828999f7624cbe7e", + "hash_cont_tokens": "c3a3bdfd177eed5b" + }, + "truncated": 0, + "non-truncated": 804, + "padded": 804, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hashes": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "42054621e718dbee", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-virology|5": { + "hashes": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "6c4f0aa4dc859c04", + "hash_cont_tokens": "af8b3658088cb37f" 
+ }, + "truncated": 0, + "non-truncated": 664, + "padded": 664, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-world_religions|5": { + "hashes": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "6c75d44e092ff24f", + "hash_cont_tokens": "060118bef6de4e0a" + }, + "truncated": 0, + "non-truncated": 684, + "padded": 684, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|truthfulqa:mc|0": { + "hashes": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "2738d7ed7075faa7", + "hash_cont_tokens": "f5da56a132aab151" + }, + "truncated": 0, + "non-truncated": 9996, + "padded": 9996, + "non-padded": 0, + "effective_few_shots": 0.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "d84d18e9a963753d", + "hash_full_prompts": "12b540783521a8e6", + "hash_input_tokens": "5c73a7dce6ccf737", + "hash_cont_tokens": "71d56183130fecbd" + }, + "total_evaluation_time_secondes": "6366.528665304184", + "truncated": 0, + "non-truncated": 111019, + "padded": 110926, + "non-padded": 93, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/PygmalionAI/pygmalion-2-13b/results_2023-10-23T10-43-41.239191.json b/eval-results/PygmalionAI/pygmalion-2-13b/results_2023-10-23T10-43-41.239191.json new file mode 100644 index 0000000000000000000000000000000000000000..f5a802766296685fcb528ae154130e9d5650dc68 --- /dev/null +++ b/eval-results/PygmalionAI/pygmalion-2-13b/results_2023-10-23T10-43-41.239191.json @@ -0,0 +1,107 @@ +{ + "config_general": { + "model_name": "PygmalionAI/pygmalion-2-13b", + "model_sha": "3cdc103995ccd5fc7fd2cb5f51f71b510466f5fc", + "model_size": "24.32 GB", + "model_dtype": "torch.float16", + "lighteval_sha": "0f318ecf002208468154899217b3ba7c6ae09374", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|drop|3": { + "em": 0.0016778523489932886, + "em_stderr": 0.0004191330178826867, + "f1": 0.06134752516778516, + "f1_stderr": 0.0013751962272216705 + }, + "harness|gsm8k|5": { + "acc": 0.11751326762699014, + "acc_stderr": 0.008870331256489975 + }, + "harness|winogrande|5": { + "acc": 0.7805840568271507, + "acc_stderr": 0.01163126836060778 + }, + "all": { + "em": 0.0016778523489932886, + "em_stderr": 0.0004191330178826867, + "f1": 0.06134752516778516, + "f1_stderr": 0.0013751962272216705, + "acc": 0.4490486622270704, + "acc_stderr": 0.010250799808548879 + } + }, + "versions": { + "harness|drop|3": 1, + "harness|gsm8k|5": 0, + "harness|winogrande|5": 0, + "all": 0 + }, + "config_tasks": { + "harness|drop": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|drop|3": { + "hashes": { + "hash_examples": "1d27416e8324e9a3", + "hash_full_prompts": "a5513ff9a741b385", + "hash_input_tokens": "42076f0efbb50aa6", + "hash_cont_tokens": "37acf9420f65e289" + }, + "truncated": 3, + "non-truncated": 9533, + "padded": 0, + "non-padded": 9536, + "effective_few_shots": 3.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "bda342e47b5099b2", + "hash_cont_tokens": "fd823af388cba8de" + }, + "truncated": 0, + 
"non-truncated": 1319, + "padded": 0, + "non-padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "c0bedf98cb040854", + "hash_cont_tokens": "f08975ad6f2d5864" + }, + "truncated": 0, + "non-truncated": 2534, + "padded": 2432, + "non-padded": 102, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "9b4d8993161e637d", + "hash_full_prompts": "08215e527b7e60a5", + "hash_input_tokens": "a12f3e3c934bd78b", + "hash_cont_tokens": "0914e13e00a54703" + }, + "total_evaluation_time_secondes": "12393.247506856918", + "truncated": 3, + "non-truncated": 13386, + "padded": 2432, + "non-padded": 10957, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/PygmalionAI/pygmalion-2-7b/results_2023-10-08T20-22-41.887829.json b/eval-results/PygmalionAI/pygmalion-2-7b/results_2023-10-08T20-22-41.887829.json new file mode 100644 index 0000000000000000000000000000000000000000..3b647990e07a66f9960f7869ef671378986d7dd4 --- /dev/null +++ b/eval-results/PygmalionAI/pygmalion-2-7b/results_2023-10-08T20-22-41.887829.json @@ -0,0 +1,1367 @@ +{ + "config_general": { + "model_name": "PygmalionAI/pygmalion-2-7b", + "model_sha": "983f8ad5c156f4a0e4d2b7b5f1146981ad2e8a8b", + "model_size": "12.61 GB", + "model_dtype": "torch.float16", + "lighteval_sha": "0f318ecf002208468154899217b3ba7c6ae09374", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|arc:challenge|25": { + "acc": 0.5025597269624573, + "acc_stderr": 0.01461119932984378, + "acc_norm": 0.5401023890784983, + "acc_norm_stderr": 0.01456431885692485 + }, + "harness|hellaswag|10": { + "acc": 0.5846444931288588, + "acc_stderr": 0.004917761181740162, + "acc_norm": 0.7823142800238996, + "acc_norm_stderr": 0.004118291804519085 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.28, + "acc_stderr": 0.04512608598542128, + "acc_norm": 0.28, + "acc_norm_stderr": 0.04512608598542128 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.5259259259259259, + "acc_stderr": 0.04313531696750575, + "acc_norm": 0.5259259259259259, + "acc_norm_stderr": 0.04313531696750575 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.4144736842105263, + "acc_stderr": 0.04008973785779206, + "acc_norm": 0.4144736842105263, + "acc_norm_stderr": 0.04008973785779206 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.52, + "acc_stderr": 0.050211673156867795, + "acc_norm": 0.52, + "acc_norm_stderr": 0.050211673156867795 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.4981132075471698, + "acc_stderr": 0.030772653642075664, + "acc_norm": 0.4981132075471698, + "acc_norm_stderr": 0.030772653642075664 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.4930555555555556, + "acc_stderr": 0.04180806750294938, + "acc_norm": 0.4930555555555556, + "acc_norm_stderr": 0.04180806750294938 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.29, + "acc_stderr": 0.045604802157206845, + "acc_norm": 0.29, + "acc_norm_stderr": 0.045604802157206845 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.37, + "acc_stderr": 0.048523658709391, + "acc_norm": 0.37, + "acc_norm_stderr": 0.048523658709391 + }, + "harness|hendrycksTest-college_mathematics|5": { + 
"acc": 0.36, + "acc_stderr": 0.048241815132442176, + "acc_norm": 0.36, + "acc_norm_stderr": 0.048241815132442176 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.4161849710982659, + "acc_stderr": 0.03758517775404947, + "acc_norm": 0.4161849710982659, + "acc_norm_stderr": 0.03758517775404947 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.13725490196078433, + "acc_stderr": 0.03424084669891521, + "acc_norm": 0.13725490196078433, + "acc_norm_stderr": 0.03424084669891521 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.61, + "acc_stderr": 0.04902071300001975, + "acc_norm": 0.61, + "acc_norm_stderr": 0.04902071300001975 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.425531914893617, + "acc_stderr": 0.03232146916224468, + "acc_norm": 0.425531914893617, + "acc_norm_stderr": 0.03232146916224468 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.2807017543859649, + "acc_stderr": 0.042270544512322004, + "acc_norm": 0.2807017543859649, + "acc_norm_stderr": 0.042270544512322004 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.5172413793103449, + "acc_stderr": 0.04164188720169375, + "acc_norm": 0.5172413793103449, + "acc_norm_stderr": 0.04164188720169375 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.30687830687830686, + "acc_stderr": 0.023752928712112143, + "acc_norm": 0.30687830687830686, + "acc_norm_stderr": 0.023752928712112143 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.2857142857142857, + "acc_stderr": 0.040406101782088394, + "acc_norm": 0.2857142857142857, + "acc_norm_stderr": 0.040406101782088394 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.38, + "acc_stderr": 0.048783173121456316, + "acc_norm": 0.38, + "acc_norm_stderr": 0.048783173121456316 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.5419354838709678, + "acc_stderr": 0.028343787250540618, + "acc_norm": 0.5419354838709678, + "acc_norm_stderr": 0.028343787250540618 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.3448275862068966, + "acc_stderr": 0.03344283744280459, + "acc_norm": 0.3448275862068966, + "acc_norm_stderr": 0.03344283744280459 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.46, + "acc_stderr": 0.05009082659620332, + "acc_norm": 0.46, + "acc_norm_stderr": 0.05009082659620332 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.6303030303030303, + "acc_stderr": 0.03769430314512567, + "acc_norm": 0.6303030303030303, + "acc_norm_stderr": 0.03769430314512567 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.6060606060606061, + "acc_stderr": 0.03481285338232963, + "acc_norm": 0.6060606060606061, + "acc_norm_stderr": 0.03481285338232963 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.7202072538860104, + "acc_stderr": 0.032396370467357036, + "acc_norm": 0.7202072538860104, + "acc_norm_stderr": 0.032396370467357036 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.45384615384615384, + "acc_stderr": 0.025242770987126177, + "acc_norm": 0.45384615384615384, + "acc_norm_stderr": 0.025242770987126177 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.25925925925925924, + "acc_stderr": 0.026719240783712166, + "acc_norm": 0.25925925925925924, + "acc_norm_stderr": 0.026719240783712166 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.4369747899159664, + "acc_stderr": 0.03221943636566196, + 
"acc_norm": 0.4369747899159664, + "acc_norm_stderr": 0.03221943636566196 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.304635761589404, + "acc_stderr": 0.03757949922943342, + "acc_norm": 0.304635761589404, + "acc_norm_stderr": 0.03757949922943342 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.6587155963302752, + "acc_stderr": 0.020328612816592446, + "acc_norm": 0.6587155963302752, + "acc_norm_stderr": 0.020328612816592446 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.38425925925925924, + "acc_stderr": 0.03317354514310742, + "acc_norm": 0.38425925925925924, + "acc_norm_stderr": 0.03317354514310742 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.6617647058823529, + "acc_stderr": 0.0332057461294543, + "acc_norm": 0.6617647058823529, + "acc_norm_stderr": 0.0332057461294543 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.7172995780590717, + "acc_stderr": 0.029312814153955924, + "acc_norm": 0.7172995780590717, + "acc_norm_stderr": 0.029312814153955924 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.5560538116591929, + "acc_stderr": 0.03334625674242728, + "acc_norm": 0.5560538116591929, + "acc_norm_stderr": 0.03334625674242728 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.6030534351145038, + "acc_stderr": 0.04291135671009224, + "acc_norm": 0.6030534351145038, + "acc_norm_stderr": 0.04291135671009224 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.6115702479338843, + "acc_stderr": 0.04449270350068382, + "acc_norm": 0.6115702479338843, + "acc_norm_stderr": 0.04449270350068382 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.5462962962962963, + "acc_stderr": 0.04812917324536823, + "acc_norm": 0.5462962962962963, + "acc_norm_stderr": 0.04812917324536823 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.5644171779141104, + "acc_stderr": 0.03895632464138936, + "acc_norm": 0.5644171779141104, + "acc_norm_stderr": 0.03895632464138936 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.38392857142857145, + "acc_stderr": 0.04616143075028547, + "acc_norm": 0.38392857142857145, + "acc_norm_stderr": 0.04616143075028547 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.6310679611650486, + "acc_stderr": 0.0477761518115674, + "acc_norm": 0.6310679611650486, + "acc_norm_stderr": 0.0477761518115674 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.7264957264957265, + "acc_stderr": 0.02920254015343117, + "acc_norm": 0.7264957264957265, + "acc_norm_stderr": 0.02920254015343117 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.56, + "acc_stderr": 0.04988876515698589, + "acc_norm": 0.56, + "acc_norm_stderr": 0.04988876515698589 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.6832694763729247, + "acc_stderr": 0.016635566427712568, + "acc_norm": 0.6832694763729247, + "acc_norm_stderr": 0.016635566427712568 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.5289017341040463, + "acc_stderr": 0.026874085883518348, + "acc_norm": 0.5289017341040463, + "acc_norm_stderr": 0.026874085883518348 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.2536312849162011, + "acc_stderr": 0.014551553659369916, + "acc_norm": 0.2536312849162011, + "acc_norm_stderr": 0.014551553659369916 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.5490196078431373, + "acc_stderr": 0.028491993586171566, + "acc_norm": 0.5490196078431373, + "acc_norm_stderr": 0.028491993586171566 + }, + 
"harness|hendrycksTest-philosophy|5": { + "acc": 0.617363344051447, + "acc_stderr": 0.02760468902858199, + "acc_norm": 0.617363344051447, + "acc_norm_stderr": 0.02760468902858199 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.5185185185185185, + "acc_stderr": 0.02780165621232366, + "acc_norm": 0.5185185185185185, + "acc_norm_stderr": 0.02780165621232366 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.3900709219858156, + "acc_stderr": 0.029097675599463926, + "acc_norm": 0.3900709219858156, + "acc_norm_stderr": 0.029097675599463926 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.3670143415906128, + "acc_stderr": 0.012310264244842132, + "acc_norm": 0.3670143415906128, + "acc_norm_stderr": 0.012310264244842132 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.5294117647058824, + "acc_stderr": 0.030320243265004137, + "acc_norm": 0.5294117647058824, + "acc_norm_stderr": 0.030320243265004137 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.46405228758169936, + "acc_stderr": 0.020175488765484043, + "acc_norm": 0.46405228758169936, + "acc_norm_stderr": 0.020175488765484043 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.5363636363636364, + "acc_stderr": 0.04776449162396197, + "acc_norm": 0.5363636363636364, + "acc_norm_stderr": 0.04776449162396197 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.5836734693877551, + "acc_stderr": 0.03155782816556165, + "acc_norm": 0.5836734693877551, + "acc_norm_stderr": 0.03155782816556165 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.6915422885572139, + "acc_stderr": 0.03265819588512699, + "acc_norm": 0.6915422885572139, + "acc_norm_stderr": 0.03265819588512699 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.7, + "acc_stderr": 0.046056618647183814, + "acc_norm": 0.7, + "acc_norm_stderr": 0.046056618647183814 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.40963855421686746, + "acc_stderr": 0.03828401115079022, + "acc_norm": 0.40963855421686746, + "acc_norm_stderr": 0.03828401115079022 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.695906432748538, + "acc_stderr": 0.0352821125824523, + "acc_norm": 0.695906432748538, + "acc_norm_stderr": 0.0352821125824523 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.2864137086903305, + "mc1_stderr": 0.01582614243950235, + "mc2": 0.4378491944225627, + "mc2_stderr": 0.014339426796993134 + }, + "all": { + "acc": 0.49287508581343986, + "acc_stderr": 0.03498236330341224, + "acc_norm": 0.4968617374915938, + "acc_norm_stderr": 0.03496801839069801, + "mc1": 0.2864137086903305, + "mc1_stderr": 0.01582614243950235, + "mc2": 0.4378491944225627, + "mc2_stderr": 0.014339426796993134 + } + }, + "versions": { + "harness|arc:challenge|25": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + 
"harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "all": 0 + }, + "config_tasks": { + "harness|arc:challenge": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + 
"harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task" + }, + "summary_tasks": { + "harness|arc:challenge|25": { + "hashes": { + "hash_examples": "17b0cae357c0259e", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "3722289b79076c44", + "hash_cont_tokens": "e8abf848493b50f7" + }, + "truncated": 0, + "non-truncated": 4687, + "padded": 4687, + "non-padded": 0, + "effective_few_shots": 25.0, + "num_truncated_few_shots": 0 + }, + "harness|hellaswag|10": { + "hashes": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "ececd684171f1ef2", + "hash_cont_tokens": "9fe0a5c42e1532db" + }, + "truncated": 0, + "non-truncated": 40168, + "padded": 40113, + "non-padded": 55, + "effective_few_shots": 10.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hashes": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "c54ff61ad0273dd7", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + 
"non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-anatomy|5": { + "hashes": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "be31a1e22aef5f90", + "hash_cont_tokens": "f11971a765cb609f" + }, + "truncated": 0, + "non-truncated": 540, + "padded": 540, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-astronomy|5": { + "hashes": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "277a7b1fad566940", + "hash_cont_tokens": "440a970fadecdc7b" + }, + "truncated": 0, + "non-truncated": 608, + "padded": 608, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-business_ethics|5": { + "hashes": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "ba552605bc116de5", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hashes": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "428c7563d0b98ab9", + "hash_cont_tokens": "7ecd60c25b9bfe5b" + }, + "truncated": 0, + "non-truncated": 1060, + "padded": 1060, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_biology|5": { + "hashes": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "da036601573942e2", + "hash_cont_tokens": "875cde3af7a0ee14" + }, + "truncated": 0, + "non-truncated": 576, + "padded": 576, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_chemistry|5": { + "hashes": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "94e0196d6aded13d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_computer_science|5": { + "hashes": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "6e4d0f4a8d36690b", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_mathematics|5": { + "hashes": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "614054d17109a25d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_medicine|5": { + "hashes": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "081bb2b524defd1c", + "hash_cont_tokens": "702fb6d82ff0d6ac" + }, + "truncated": 0, + "non-truncated": 692, + "padded": 692, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_physics|5": { + "hashes": { + "hash_examples": 
"875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "5421d9a1af86cbd4", + "hash_cont_tokens": "f7b8097afc16a47c" + }, + "truncated": 0, + "non-truncated": 408, + "padded": 408, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-computer_security|5": { + "hashes": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "5e6b70ecb333cf18", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hashes": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "c2ef11a87264ceed", + "hash_cont_tokens": "aa0e8bc655f2f641" + }, + "truncated": 0, + "non-truncated": 940, + "padded": 940, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-econometrics|5": { + "hashes": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "ecaccd912a4c3978", + "hash_cont_tokens": "b1cc6e7e9fcd3827" + }, + "truncated": 0, + "non-truncated": 456, + "padded": 456, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hashes": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "1590c84291399be8", + "hash_cont_tokens": "2425a3f084a591ef" + }, + "truncated": 0, + "non-truncated": 580, + "padded": 580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hashes": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "3269597f715b0da1", + "hash_cont_tokens": "bd87bf0c060fd925" + }, + "truncated": 0, + "non-truncated": 1512, + "padded": 1512, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-formal_logic|5": { + "hashes": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "a2800d20f3ab8d7c", + "hash_cont_tokens": "eb8932890e0605db" + }, + "truncated": 0, + "non-truncated": 504, + "padded": 504, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-global_facts|5": { + "hashes": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "94ed44b3772505ad", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_biology|5": { + "hashes": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "24423acb928db768", + "hash_cont_tokens": "1ddcb86d28cde266" + }, + "truncated": 0, + "non-truncated": 1240, + "padded": 1240, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hashes": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "831ff35c474e5cef", + "hash_cont_tokens": "176c8dcff38c5f8f" 
+ }, + "truncated": 0, + "non-truncated": 812, + "padded": 812, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hashes": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "a20a96b44dcc5b30", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hashes": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "5002f4ac8b1562ca", + "hash_cont_tokens": "674fc454bdc5ac93" + }, + "truncated": 0, + "non-truncated": 660, + "padded": 656, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_geography|5": { + "hashes": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "7c5547c7da5bc793", + "hash_cont_tokens": "03a5012b916274ea" + }, + "truncated": 0, + "non-truncated": 792, + "padded": 792, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hashes": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "f62991cb6a496b05", + "hash_cont_tokens": "873d2aab226ba1d8" + }, + "truncated": 0, + "non-truncated": 772, + "padded": 772, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hashes": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "4cef2aff6e3d59ed", + "hash_cont_tokens": "c583432ad27fcfe0" + }, + "truncated": 0, + "non-truncated": 1560, + "padded": 1560, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hashes": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "6e2577ea4082ed2b", + "hash_cont_tokens": "d7907b61bcb8c123" + }, + "truncated": 0, + "non-truncated": 1080, + "padded": 1080, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hashes": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "c5fc9aeb1079c8e4", + "hash_cont_tokens": "f47f041de50333b9" + }, + "truncated": 0, + "non-truncated": 952, + "padded": 952, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_physics|5": { + "hashes": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "555fc385cffa84ca", + "hash_cont_tokens": "0d56317b3e5eedb5" + }, + "truncated": 0, + "non-truncated": 604, + "padded": 604, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hashes": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "febd23cbf9973b7f", + "hash_cont_tokens": "09ba1243e7390c0f" + }, + "truncated": 0, + "non-truncated": 2180, + "padded": 2180, + 
"non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hashes": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "400e55b56ee6fbd7", + "hash_cont_tokens": "9cc29889c3d3f77d" + }, + "truncated": 0, + "non-truncated": 864, + "padded": 864, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hashes": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "c639cce12a46ebad", + "hash_cont_tokens": "cdd0b3dc06d933e5" + }, + "truncated": 0, + "non-truncated": 816, + "padded": 816, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hashes": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "b9762065cce6f3a6", + "hash_cont_tokens": "e02816433ff28daf" + }, + "truncated": 0, + "non-truncated": 948, + "padded": 948, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_aging|5": { + "hashes": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "541a75f071dcf579", + "hash_cont_tokens": "142a4a8a1138a214" + }, + "truncated": 0, + "non-truncated": 892, + "padded": 892, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_sexuality|5": { + "hashes": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "04269e5c5a257dd9", + "hash_cont_tokens": "bc54813e809b796d" + }, + "truncated": 0, + "non-truncated": 524, + "padded": 524, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-international_law|5": { + "hashes": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "d93ba9d9d38e4397", + "hash_cont_tokens": "8ea8c5ff76a15bca" + }, + "truncated": 0, + "non-truncated": 484, + "padded": 484, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-jurisprudence|5": { + "hashes": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "9eeaccd2698b4f5a", + "hash_cont_tokens": "e3a8cd951b6e3469" + }, + "truncated": 0, + "non-truncated": 432, + "padded": 432, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hashes": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "b4f08f544f2b7576", + "hash_cont_tokens": "3e9e0bdc248fd88a" + }, + "truncated": 0, + "non-truncated": 652, + "padded": 648, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-machine_learning|5": { + "hashes": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "900c2a51f1174b9f", + "hash_cont_tokens": "55b12fb138c6a064" + }, + "truncated": 0, + "non-truncated": 448, + "padded": 448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-management|5": { + "hashes": { + 
"hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "6b36efb4689c6eca", + "hash_cont_tokens": "a01d6d39a83c4597" + }, + "truncated": 0, + "non-truncated": 412, + "padded": 412, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-marketing|5": { + "hashes": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "2aaac78a0cfed47a", + "hash_cont_tokens": "6aeaed4d823c98aa" + }, + "truncated": 0, + "non-truncated": 936, + "padded": 936, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-medical_genetics|5": { + "hashes": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "886ca823b41c094a", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-miscellaneous|5": { + "hashes": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "72fd71de7675e7d0", + "hash_cont_tokens": "9b0ab02a64603081" + }, + "truncated": 0, + "non-truncated": 3132, + "padded": 3132, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_disputes|5": { + "hashes": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "f3ca0dd8e7a1eb09", + "hash_cont_tokens": "3b8bbe9108e55ce9" + }, + "truncated": 0, + "non-truncated": 1384, + "padded": 1354, + "non-padded": 30, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hashes": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "3e793631e951f23c", + "hash_cont_tokens": "3e9bfc0362e97330" + }, + "truncated": 0, + "non-truncated": 3580, + "padded": 3580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-nutrition|5": { + "hashes": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "59753c2144ea93af", + "hash_cont_tokens": "23b2dc6ee2da4cfc" + }, + "truncated": 0, + "non-truncated": 1224, + "padded": 1224, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-philosophy|5": { + "hashes": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "bd8d3dbed15a8c34", + "hash_cont_tokens": "9f6ff69d23a48783" + }, + "truncated": 0, + "non-truncated": 1244, + "padded": 1244, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-prehistory|5": { + "hashes": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "3573cd87facbb7c5", + "hash_cont_tokens": "d6458d743d875837" + }, + "truncated": 0, + "non-truncated": 1296, + "padded": 1296, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_accounting|5": { + "hashes": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "17e721bc1a7cbb47", + "hash_cont_tokens": "922a195f53a35662" + }, + 
"truncated": 0, + "non-truncated": 1128, + "padded": 1128, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_law|5": { + "hashes": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "c9f7583fff66d361", + "hash_cont_tokens": "2e590029ef41fbcd" + }, + "truncated": 0, + "non-truncated": 6136, + "padded": 6136, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_medicine|5": { + "hashes": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "40a933f829116f8d", + "hash_cont_tokens": "7cfee54dbddd5a98" + }, + "truncated": 0, + "non-truncated": 1088, + "padded": 1088, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_psychology|5": { + "hashes": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "0dfb73a8eb3f692c", + "hash_cont_tokens": "a86677b2a45c20e1" + }, + "truncated": 0, + "non-truncated": 2448, + "padded": 2448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-public_relations|5": { + "hashes": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "1710c6ba4c9f3cbd", + "hash_cont_tokens": "0d756ccaae031757" + }, + "truncated": 0, + "non-truncated": 440, + "padded": 440, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-security_studies|5": { + "hashes": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "32a03f1f22a6e103", + "hash_cont_tokens": "b2229bc2cfbf594b" + }, + "truncated": 0, + "non-truncated": 980, + "padded": 980, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-sociology|5": { + "hashes": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "828999f7624cbe7e", + "hash_cont_tokens": "c3a3bdfd177eed5b" + }, + "truncated": 0, + "non-truncated": 804, + "padded": 804, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hashes": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "42054621e718dbee", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-virology|5": { + "hashes": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "6c4f0aa4dc859c04", + "hash_cont_tokens": "af8b3658088cb37f" + }, + "truncated": 0, + "non-truncated": 664, + "padded": 664, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-world_religions|5": { + "hashes": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "6c75d44e092ff24f", + "hash_cont_tokens": "060118bef6de4e0a" + }, + "truncated": 0, + "non-truncated": 684, + "padded": 684, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + 
"harness|truthfulqa:mc|0": { + "hashes": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "2738d7ed7075faa7", + "hash_cont_tokens": "f5da56a132aab151" + }, + "truncated": 0, + "non-truncated": 9996, + "padded": 9996, + "non-padded": 0, + "effective_few_shots": 0.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "d84d18e9a963753d", + "hash_full_prompts": "12b540783521a8e6", + "hash_input_tokens": "5c73a7dce6ccf737", + "hash_cont_tokens": "71d56183130fecbd" + }, + "total_evaluation_time_secondes": "4223.705117940903", + "truncated": 0, + "non-truncated": 111019, + "padded": 110926, + "non-padded": 93, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/PygmalionAI/pygmalion-2-7b/results_2023-10-28T09-33-13.706982.json b/eval-results/PygmalionAI/pygmalion-2-7b/results_2023-10-28T09-33-13.706982.json new file mode 100644 index 0000000000000000000000000000000000000000..d6b720219fafd998508aba1f2036470aa83d3c1e --- /dev/null +++ b/eval-results/PygmalionAI/pygmalion-2-7b/results_2023-10-28T09-33-13.706982.json @@ -0,0 +1,107 @@ +{ + "config_general": { + "model_name": "PygmalionAI/pygmalion-2-7b", + "model_sha": "983f8ad5c156f4a0e4d2b7b5f1146981ad2e8a8b", + "model_size": "12.61 GB", + "model_dtype": "torch.float16", + "lighteval_sha": "0f318ecf002208468154899217b3ba7c6ae09374", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|drop|3": { + "em": 0.001153523489932886, + "em_stderr": 0.00034761798968571027, + "f1": 0.05976614932885909, + "f1_stderr": 0.0013611207374076375 + }, + "harness|gsm8k|5": { + "acc": 0.06368460955269144, + "acc_stderr": 0.006726213078805692 + }, + "harness|winogrande|5": { + "acc": 0.7513812154696132, + "acc_stderr": 0.012147314713403105 + }, + "all": { + "em": 0.001153523489932886, + "em_stderr": 0.00034761798968571027, + "f1": 0.05976614932885909, + "f1_stderr": 0.0013611207374076375, + "acc": 0.4075329125111523, + "acc_stderr": 0.009436763896104398 + } + }, + "versions": { + "harness|drop|3": 1, + "harness|gsm8k|5": 0, + "harness|winogrande|5": 0, + "all": 0 + }, + "config_tasks": { + "harness|drop": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|drop|3": { + "hashes": { + "hash_examples": "1d27416e8324e9a3", + "hash_full_prompts": "a5513ff9a741b385", + "hash_input_tokens": "42076f0efbb50aa6", + "hash_cont_tokens": "2167ae9f02da0f70" + }, + "truncated": 3, + "non-truncated": 9533, + "padded": 0, + "non-padded": 9536, + "effective_few_shots": 3.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "bda342e47b5099b2", + "hash_cont_tokens": "89a8bc3a6f5b5b44" + }, + "truncated": 0, + "non-truncated": 1319, + "padded": 0, + "non-padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "c0bedf98cb040854", + "hash_cont_tokens": "f08975ad6f2d5864" + }, + "truncated": 0, + "non-truncated": 2534, + "padded": 2432, + "non-padded": 102, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + 
"hash_examples": "9b4d8993161e637d", + "hash_full_prompts": "08215e527b7e60a5", + "hash_input_tokens": "a12f3e3c934bd78b", + "hash_cont_tokens": "55396e315d79d7d7" + }, + "total_evaluation_time_secondes": "9649.295262098312", + "truncated": 3, + "non-truncated": 13386, + "padded": 2432, + "non-padded": 10957, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/PygmalionAI/pygmalion-2.7b/results_2023-07-19T16-36-05.422128.json b/eval-results/PygmalionAI/pygmalion-2.7b/results_2023-07-19T16-36-05.422128.json new file mode 100644 index 0000000000000000000000000000000000000000..df2cd8f9fbe6b288b339c2b15b4dedab1f187f16 --- /dev/null +++ b/eval-results/PygmalionAI/pygmalion-2.7b/results_2023-07-19T16-36-05.422128.json @@ -0,0 +1,871 @@ +{ + "results": { + "harness|arc:challenge|25": { + "acc": 0.2909556313993174, + "acc_stderr": 0.013273077865907581, + "acc_norm": 0.32764505119453924, + "acc_norm_stderr": 0.013715847940719342 + }, + "harness|hellaswag|10": { + "acc": 0.4191396136227843, + "acc_stderr": 0.0049240987118645725, + "acc_norm": 0.5413264289982075, + "acc_norm_stderr": 0.004972708369656545 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.22, + "acc_stderr": 0.04163331998932268, + "acc_norm": 0.22, + "acc_norm_stderr": 0.04163331998932268 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.18518518518518517, + "acc_stderr": 0.03355677216313142, + "acc_norm": 0.18518518518518517, + "acc_norm_stderr": 0.03355677216313142 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.17763157894736842, + "acc_stderr": 0.031103182383123398, + "acc_norm": 0.17763157894736842, + "acc_norm_stderr": 0.031103182383123398 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.3, + "acc_stderr": 0.046056618647183814, + "acc_norm": 0.3, + "acc_norm_stderr": 0.046056618647183814 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.2188679245283019, + "acc_stderr": 0.02544786382510863, + "acc_norm": 0.2188679245283019, + "acc_norm_stderr": 0.02544786382510863 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.2569444444444444, + "acc_stderr": 0.03653946969442099, + "acc_norm": 0.2569444444444444, + "acc_norm_stderr": 0.03653946969442099 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.18, + "acc_stderr": 0.038612291966536955, + "acc_norm": 0.18, + "acc_norm_stderr": 0.038612291966536955 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.26, + "acc_stderr": 0.0440844002276808, + "acc_norm": 0.26, + "acc_norm_stderr": 0.0440844002276808 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.2, + "acc_stderr": 0.04020151261036845, + "acc_norm": 0.2, + "acc_norm_stderr": 0.04020151261036845 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.21965317919075145, + "acc_stderr": 0.031568093627031744, + "acc_norm": 0.21965317919075145, + "acc_norm_stderr": 0.031568093627031744 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.21568627450980393, + "acc_stderr": 0.04092563958237655, + "acc_norm": 0.21568627450980393, + "acc_norm_stderr": 0.04092563958237655 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.32, + "acc_stderr": 0.04688261722621504, + "acc_norm": 0.32, + "acc_norm_stderr": 0.04688261722621504 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.26382978723404255, + "acc_stderr": 0.028809989854102973, + "acc_norm": 0.26382978723404255, + "acc_norm_stderr": 0.028809989854102973 + }, + 
"harness|hendrycksTest-econometrics|5": { + "acc": 0.24561403508771928, + "acc_stderr": 0.04049339297748141, + "acc_norm": 0.24561403508771928, + "acc_norm_stderr": 0.04049339297748141 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.2206896551724138, + "acc_stderr": 0.03455930201924811, + "acc_norm": 0.2206896551724138, + "acc_norm_stderr": 0.03455930201924811 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.25925925925925924, + "acc_stderr": 0.022569897074918407, + "acc_norm": 0.25925925925925924, + "acc_norm_stderr": 0.022569897074918407 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.1984126984126984, + "acc_stderr": 0.035670166752768614, + "acc_norm": 0.1984126984126984, + "acc_norm_stderr": 0.035670166752768614 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.18, + "acc_stderr": 0.038612291966536934, + "acc_norm": 0.18, + "acc_norm_stderr": 0.038612291966536934 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.18064516129032257, + "acc_stderr": 0.021886178567172544, + "acc_norm": 0.18064516129032257, + "acc_norm_stderr": 0.021886178567172544 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.16748768472906403, + "acc_stderr": 0.026273086047535418, + "acc_norm": 0.16748768472906403, + "acc_norm_stderr": 0.026273086047535418 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.24, + "acc_stderr": 0.042923469599092816, + "acc_norm": 0.24, + "acc_norm_stderr": 0.042923469599092816 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.21818181818181817, + "acc_stderr": 0.03225078108306289, + "acc_norm": 0.21818181818181817, + "acc_norm_stderr": 0.03225078108306289 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.17676767676767677, + "acc_stderr": 0.027178752639044915, + "acc_norm": 0.17676767676767677, + "acc_norm_stderr": 0.027178752639044915 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.19689119170984457, + "acc_stderr": 0.028697873971860674, + "acc_norm": 0.19689119170984457, + "acc_norm_stderr": 0.028697873971860674 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.20256410256410257, + "acc_stderr": 0.02037766097037138, + "acc_norm": 0.20256410256410257, + "acc_norm_stderr": 0.02037766097037138 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.22962962962962963, + "acc_stderr": 0.02564410863926764, + "acc_norm": 0.22962962962962963, + "acc_norm_stderr": 0.02564410863926764 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.21008403361344538, + "acc_stderr": 0.026461398717471874, + "acc_norm": 0.21008403361344538, + "acc_norm_stderr": 0.026461398717471874 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.1986754966887417, + "acc_stderr": 0.03257847384436776, + "acc_norm": 0.1986754966887417, + "acc_norm_stderr": 0.03257847384436776 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.21467889908256882, + "acc_stderr": 0.017604304149256483, + "acc_norm": 0.21467889908256882, + "acc_norm_stderr": 0.017604304149256483 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.16203703703703703, + "acc_stderr": 0.02513045365226846, + "acc_norm": 0.16203703703703703, + "acc_norm_stderr": 0.02513045365226846 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.25980392156862747, + "acc_stderr": 0.030778554678693268, + "acc_norm": 0.25980392156862747, + 
"acc_norm_stderr": 0.030778554678693268 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.270042194092827, + "acc_stderr": 0.028900721906293426, + "acc_norm": 0.270042194092827, + "acc_norm_stderr": 0.028900721906293426 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.2825112107623318, + "acc_stderr": 0.03021683101150877, + "acc_norm": 0.2825112107623318, + "acc_norm_stderr": 0.03021683101150877 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.24427480916030533, + "acc_stderr": 0.03768335959728742, + "acc_norm": 0.24427480916030533, + "acc_norm_stderr": 0.03768335959728742 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.2396694214876033, + "acc_stderr": 0.03896878985070417, + "acc_norm": 0.2396694214876033, + "acc_norm_stderr": 0.03896878985070417 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.25925925925925924, + "acc_stderr": 0.042365112580946336, + "acc_norm": 0.25925925925925924, + "acc_norm_stderr": 0.042365112580946336 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.22085889570552147, + "acc_stderr": 0.032591773927421776, + "acc_norm": 0.22085889570552147, + "acc_norm_stderr": 0.032591773927421776 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.3125, + "acc_stderr": 0.043994650575715215, + "acc_norm": 0.3125, + "acc_norm_stderr": 0.043994650575715215 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.1941747572815534, + "acc_stderr": 0.03916667762822584, + "acc_norm": 0.1941747572815534, + "acc_norm_stderr": 0.03916667762822584 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.2905982905982906, + "acc_stderr": 0.02974504857267404, + "acc_norm": 0.2905982905982906, + "acc_norm_stderr": 0.02974504857267404 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.31, + "acc_stderr": 0.04648231987117316, + "acc_norm": 0.31, + "acc_norm_stderr": 0.04648231987117316 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.2567049808429119, + "acc_stderr": 0.015620480263064536, + "acc_norm": 0.2567049808429119, + "acc_norm_stderr": 0.015620480263064536 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.2514450867052023, + "acc_stderr": 0.02335736578587404, + "acc_norm": 0.2514450867052023, + "acc_norm_stderr": 0.02335736578587404 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.23798882681564246, + "acc_stderr": 0.014242630070574915, + "acc_norm": 0.23798882681564246, + "acc_norm_stderr": 0.014242630070574915 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.23529411764705882, + "acc_stderr": 0.02428861946604611, + "acc_norm": 0.23529411764705882, + "acc_norm_stderr": 0.02428861946604611 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.1864951768488746, + "acc_stderr": 0.02212243977248077, + "acc_norm": 0.1864951768488746, + "acc_norm_stderr": 0.02212243977248077 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.2716049382716049, + "acc_stderr": 0.02474862449053737, + "acc_norm": 0.2716049382716049, + "acc_norm_stderr": 0.02474862449053737 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.2198581560283688, + "acc_stderr": 0.024706141070705477, + "acc_norm": 0.2198581560283688, + "acc_norm_stderr": 0.024706141070705477 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.24511082138200782, + "acc_stderr": 0.010986307870045514, + "acc_norm": 0.24511082138200782, + "acc_norm_stderr": 0.010986307870045514 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 
0.18382352941176472, + "acc_stderr": 0.02352924218519311, + "acc_norm": 0.18382352941176472, + "acc_norm_stderr": 0.02352924218519311 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.25163398692810457, + "acc_stderr": 0.017555818091322256, + "acc_norm": 0.25163398692810457, + "acc_norm_stderr": 0.017555818091322256 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.23636363636363636, + "acc_stderr": 0.04069306319721377, + "acc_norm": 0.23636363636363636, + "acc_norm_stderr": 0.04069306319721377 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.19183673469387755, + "acc_stderr": 0.025206963154225392, + "acc_norm": 0.19183673469387755, + "acc_norm_stderr": 0.025206963154225392 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.25870646766169153, + "acc_stderr": 0.030965903123573037, + "acc_norm": 0.25870646766169153, + "acc_norm_stderr": 0.030965903123573037 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.26, + "acc_stderr": 0.044084400227680794, + "acc_norm": 0.26, + "acc_norm_stderr": 0.044084400227680794 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.2710843373493976, + "acc_stderr": 0.03460579907553027, + "acc_norm": 0.2710843373493976, + "acc_norm_stderr": 0.03460579907553027 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.30994152046783624, + "acc_stderr": 0.03546976959393163, + "acc_norm": 0.30994152046783624, + "acc_norm_stderr": 0.03546976959393163 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.21542227662178703, + "mc1_stderr": 0.014391902652427678, + "mc2": 0.3717154965847934, + "mc2_stderr": 0.014343379743869154 + }, + "all": { + "acc": 0.23696774704445156, + "acc_stderr": 0.03077301607940244, + "acc_norm": 0.23966056458971674, + "acc_norm_stderr": 0.03078134454944657, + "mc1": 0.21542227662178703, + "mc1_stderr": 0.014391902652427678, + "mc2": 0.3717154965847934, + "mc2_stderr": 0.014343379743869154 + } + }, + "versions": { + "harness|arc:challenge|25": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + 
"harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "all": 0 + }, + "config": { + "model_name": "PygmalionAI/pygmalion-2.7b", + "model_sha": "9533805293bc48e8ddfe9dc1940d8cbc5662113e", + "model_dtype": "torch.float16", + "lighteval_sha": "43cff840721bd0214adb4e29236a5e2ca1813937", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null + }, + "task_config": { + "harness|arc:challenge": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + 
"harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task" + }, + "hashes": { + "harness|arc:challenge|25": { + "hash_examples": "fb8c51b1872daeda", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "1b78325b154497a6", + "hash_cont_tokens": "c6e2e25e2b25a621" + }, + "harness|hellaswag|10": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "97de5fb5652ec7fa", + "hash_cont_tokens": "8ad5f1a3e4068f36" + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "38f6980885e34dfd", + "hash_cont_tokens": "844bd0bf669e8136" + }, + "harness|hendrycksTest-anatomy|5": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "3ed9431cd09b2a53", + "hash_cont_tokens": "aa3ffb1a6e4356f5" + }, + "harness|hendrycksTest-astronomy|5": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "a79fd75ecff4dacc", + "hash_cont_tokens": "ca7527d5bdfd389a" + }, + "harness|hendrycksTest-business_ethics|5": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "178d5666661bf5e1", + "hash_cont_tokens": "08a1fa6c8dde9a82" + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "c926698f7ff06973", + "hash_cont_tokens": "cd61f7de0830a75a" + }, + "harness|hendrycksTest-college_biology|5": { + 
"hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "242f772c5e78312a", + "hash_cont_tokens": "b0c14ed86adbcb56" + }, + "harness|hendrycksTest-college_chemistry|5": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "8502d8627d2d7aad", + "hash_cont_tokens": "844bd0bf669e8136" + }, + "harness|hendrycksTest-college_computer_science|5": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "8bf46ce3a98e6e3f", + "hash_cont_tokens": "3cf1924b14cbf906" + }, + "harness|hendrycksTest-college_mathematics|5": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "ff09ef7f164943cd", + "hash_cont_tokens": "d09bf08193410dfa" + }, + "harness|hendrycksTest-college_medicine|5": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "af38d1bbc0517ac5", + "hash_cont_tokens": "62bb469d2a319d91" + }, + "harness|hendrycksTest-college_physics|5": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "c4240f372187f487", + "hash_cont_tokens": "bf103c9a1f61ec12" + }, + "harness|hendrycksTest-computer_security|5": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "70a866a1c6ae11ae", + "hash_cont_tokens": "844bd0bf669e8136" + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "29b68a5b3f3afa5f", + "hash_cont_tokens": "ff5ca3d84bb47a0b" + }, + "harness|hendrycksTest-econometrics|5": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "a4a0fc579875cdf9", + "hash_cont_tokens": "f3ed369e135c0e74" + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "e1c0ec634eb17ebd", + "hash_cont_tokens": "35bf6c0c1a7ee403" + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "542453ad0f99dacf", + "hash_cont_tokens": "e69647d0f0359a4e" + }, + "harness|hendrycksTest-formal_logic|5": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "dacff0458f665ef2", + "hash_cont_tokens": "2ef491ecaa0b411b" + }, + "harness|hendrycksTest-global_facts|5": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "61dec75d557c2e93", + "hash_cont_tokens": "844bd0bf669e8136" + }, + "harness|hendrycksTest-high_school_biology|5": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "d0afdf91820cacc8", + "hash_cont_tokens": "2f65e8345a68d860" + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "75cd47b5490da17b", + "hash_cont_tokens": "c3deabee1deab3a3" + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "6c6256000dbf914a", + "hash_cont_tokens": "ec161287ac6222f4" + }, + "harness|hendrycksTest-high_school_european_history|5": { + 
"hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "3e24478a8854bd77", + "hash_cont_tokens": "c4f2565ca36881d5" + }, + "harness|hendrycksTest-high_school_geography|5": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "a4866b51f8a7a60e", + "hash_cont_tokens": "780e569058de22be" + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "90f755f89d9fdf5e", + "hash_cont_tokens": "9da45062757ae791" + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "fb590ff6d9d11883", + "hash_cont_tokens": "8f5c8baf02161f10" + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "551dbc75535ad2b8", + "hash_cont_tokens": "fdea101837ab4409" + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "d86fdf5706ec717c", + "hash_cont_tokens": "985403b262df21a4" + }, + "harness|hendrycksTest-high_school_physics|5": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "a81bca26abd92c41", + "hash_cont_tokens": "56be0c12b78c81a3" + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "9c10077b5cda495b", + "hash_cont_tokens": "f524cf6fe64b2a7f" + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "092923836e135996", + "hash_cont_tokens": "421b3dc903711e3d" + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "4ab213491f557f31", + "hash_cont_tokens": "eab825cf8fbdd085" + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "2a04fb615e6717ea", + "hash_cont_tokens": "e1610a0b694e7b3a" + }, + "harness|hendrycksTest-human_aging|5": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "39da19ee58ce07e6", + "hash_cont_tokens": "38eafdb22e9fca11" + }, + "harness|hendrycksTest-human_sexuality|5": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "f7e0441ab1c223e0", + "hash_cont_tokens": "11de075f88fc7cd2" + }, + "harness|hendrycksTest-international_law|5": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "119859c5b8103d0b", + "hash_cont_tokens": "0229c63f045574c2" + }, + "harness|hendrycksTest-jurisprudence|5": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "6ec4910e741606cb", + "hash_cont_tokens": "5c77c6f472688075" + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "96d8b2554f777e3a", + "hash_cont_tokens": "25a46284b3589e0d" + }, + 
"harness|hendrycksTest-machine_learning|5": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "249811a7d891a411", + "hash_cont_tokens": "d11f2c877fe691dc" + }, + "harness|hendrycksTest-management|5": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "e54df495ffeb4f92", + "hash_cont_tokens": "d37808f586a9e9b5" + }, + "harness|hendrycksTest-marketing|5": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "e9110fe64f420eb5", + "hash_cont_tokens": "95faf210efa02f90" + }, + "harness|hendrycksTest-medical_genetics|5": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "743df5701590c1c5", + "hash_cont_tokens": "844bd0bf669e8136" + }, + "harness|hendrycksTest-miscellaneous|5": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "4a20a40ea36bad2d", + "hash_cont_tokens": "ef1ae838a09a7521" + }, + "harness|hendrycksTest-moral_disputes|5": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "10886977e5516586", + "hash_cont_tokens": "05c35d0e7dd2c7d4" + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "66f56ab7c3b9d662", + "hash_cont_tokens": "f1e9e326e9540108" + }, + "harness|hendrycksTest-nutrition|5": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "c05c54560499ea35", + "hash_cont_tokens": "027ac34198453c9e" + }, + "harness|hendrycksTest-philosophy|5": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "9639c3d92ff98a28", + "hash_cont_tokens": "dddff9925c9b675a" + }, + "harness|hendrycksTest-prehistory|5": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "91e98834c3a8d8d9", + "hash_cont_tokens": "030e5bb46551865c" + }, + "harness|hendrycksTest-professional_accounting|5": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "569fa47691c73088", + "hash_cont_tokens": "42b23299e8bae480" + }, + "harness|hendrycksTest-professional_law|5": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "999e8c7cf55b590c", + "hash_cont_tokens": "a2de48df0afbaff7" + }, + "harness|hendrycksTest-professional_medicine|5": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "cb68733b835e69f0", + "hash_cont_tokens": "33dc7eccd5de31ae" + }, + "harness|hendrycksTest-professional_psychology|5": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "3aa766c029099569", + "hash_cont_tokens": "2a666dc39f1f52ac" + }, + "harness|hendrycksTest-public_relations|5": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "87b924f88832986f", + "hash_cont_tokens": "cf3600a50782c6c5" + }, + "harness|hendrycksTest-security_studies|5": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "c2b75c24a925a416", + "hash_cont_tokens": "2e9916279a4cae95" + }, + "harness|hendrycksTest-sociology|5": { + "hash_examples": 
"e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "fb555df6139eb2c8", + "hash_cont_tokens": "555f7a55738bbf37" + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "56cf1eebb25eccb1", + "hash_cont_tokens": "844bd0bf669e8136" + }, + "harness|hendrycksTest-virology|5": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "c6affac16ec860be", + "hash_cont_tokens": "30d4fa4828c5468f" + }, + "harness|hendrycksTest-world_religions|5": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "d2c5da5a69a6312e", + "hash_cont_tokens": "984061eb58124367" + }, + "harness|truthfulqa:mc|0": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "21ee2f46c9c3649e", + "hash_cont_tokens": "f41d0880e9a23f4e" + } + } +} \ No newline at end of file diff --git a/eval-results/PygmalionAI/pygmalion-2.7b/results_2023-09-22T20-17-59.683847.json b/eval-results/PygmalionAI/pygmalion-2.7b/results_2023-09-22T20-17-59.683847.json new file mode 100644 index 0000000000000000000000000000000000000000..d0d4cd566bfd83edfd166457bf10839d4ad79506 --- /dev/null +++ b/eval-results/PygmalionAI/pygmalion-2.7b/results_2023-09-22T20-17-59.683847.json @@ -0,0 +1,107 @@ +{ + "config_general": { + "model_name": "PygmalionAI/pygmalion-2.7b", + "model_sha": "9533805293bc48e8ddfe9dc1940d8cbc5662113e", + "model_size": "4.95 GB", + "model_dtype": "torch.float16", + "lighteval_sha": "0f318ecf002208468154899217b3ba7c6ae09374", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|drop|3": { + "em": 0.04320469798657718, + "em_stderr": 0.0020821626664430564, + "f1": 0.08408347315436249, + "f1_stderr": 0.0023636579014392274 + }, + "harness|gsm8k|5": { + "acc": 0.0, + "acc_stderr": 0.0 + }, + "harness|winogrande|5": { + "acc": 0.5651144435674822, + "acc_stderr": 0.013932814110418024 + }, + "all": { + "em": 0.04320469798657718, + "em_stderr": 0.0020821626664430564, + "f1": 0.08408347315436249, + "f1_stderr": 0.0023636579014392274, + "acc": 0.2825572217837411, + "acc_stderr": 0.006966407055209012 + } + }, + "versions": { + "harness|drop|3": 1, + "harness|gsm8k|5": 0, + "harness|winogrande|5": 0, + "all": 0 + }, + "config_tasks": { + "harness|drop": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|drop|3": { + "hashes": { + "hash_examples": "1d27416e8324e9a3", + "hash_full_prompts": "a5513ff9a741b385", + "hash_input_tokens": "f21277d2c2d2e06c", + "hash_cont_tokens": "1e706169a5b110bf" + }, + "truncated": 382, + "non-truncated": 9154, + "padded": 0, + "non-padded": 9536, + "effective_few_shots": 3.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "3ab9b4c5105492a3", + "hash_cont_tokens": "1b3169001fceb827" + }, + "truncated": 0, + "non-truncated": 1319, + "padded": 0, + "non-padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "84cacac1590bb0a5", + "hash_cont_tokens": 
"64ca3ed9b5dacc6e" + }, + "truncated": 0, + "non-truncated": 2534, + "padded": 2426, + "non-padded": 108, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "9b4d8993161e637d", + "hash_full_prompts": "08215e527b7e60a5", + "hash_input_tokens": "26cd3631535039d0", + "hash_cont_tokens": "7ab80d09846e6d84" + }, + "total_evaluation_time_secondes": "14273.363525390625", + "truncated": 382, + "non-truncated": 13007, + "padded": 2426, + "non-padded": 10963, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/PygmalionAI/pygmalion-350m/results_2023-07-19T14-13-12.933882.json b/eval-results/PygmalionAI/pygmalion-350m/results_2023-07-19T14-13-12.933882.json new file mode 100644 index 0000000000000000000000000000000000000000..a155ae68e125c78493c078a20b4f346d5011dc4b --- /dev/null +++ b/eval-results/PygmalionAI/pygmalion-350m/results_2023-07-19T14-13-12.933882.json @@ -0,0 +1,871 @@ +{ + "results": { + "harness|arc:challenge|25": { + "acc": 0.20819112627986347, + "acc_stderr": 0.01186486611844807, + "acc_norm": 0.25, + "acc_norm_stderr": 0.012653835621466646 + }, + "harness|hellaswag|10": { + "acc": 0.32901812387970525, + "acc_stderr": 0.004688963175758139, + "acc_norm": 0.37801234813782114, + "acc_norm_stderr": 0.004838997427699765 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.22, + "acc_stderr": 0.04163331998932268, + "acc_norm": 0.22, + "acc_norm_stderr": 0.04163331998932268 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.2518518518518518, + "acc_stderr": 0.03749850709174023, + "acc_norm": 0.2518518518518518, + "acc_norm_stderr": 0.03749850709174023 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.17105263157894737, + "acc_stderr": 0.030643607071677088, + "acc_norm": 0.17105263157894737, + "acc_norm_stderr": 0.030643607071677088 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.31, + "acc_stderr": 0.04648231987117316, + "acc_norm": 0.31, + "acc_norm_stderr": 0.04648231987117316 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.2037735849056604, + "acc_stderr": 0.0247907845017754, + "acc_norm": 0.2037735849056604, + "acc_norm_stderr": 0.0247907845017754 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.2361111111111111, + "acc_stderr": 0.03551446610810826, + "acc_norm": 0.2361111111111111, + "acc_norm_stderr": 0.03551446610810826 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.28, + "acc_stderr": 0.04512608598542127, + "acc_norm": 0.28, + "acc_norm_stderr": 0.04512608598542127 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.23, + "acc_stderr": 0.04229525846816505, + "acc_norm": 0.23, + "acc_norm_stderr": 0.04229525846816505 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.24, + "acc_stderr": 0.04292346959909281, + "acc_norm": 0.24, + "acc_norm_stderr": 0.04292346959909281 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.2023121387283237, + "acc_stderr": 0.03063114553919882, + "acc_norm": 0.2023121387283237, + "acc_norm_stderr": 0.03063114553919882 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.19607843137254902, + "acc_stderr": 0.03950581861179964, + "acc_norm": 0.19607843137254902, + "acc_norm_stderr": 0.03950581861179964 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.28, + "acc_stderr": 0.045126085985421276, + "acc_norm": 0.28, + "acc_norm_stderr": 0.045126085985421276 + }, + 
"harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.3446808510638298, + "acc_stderr": 0.03106898596312215, + "acc_norm": 0.3446808510638298, + "acc_norm_stderr": 0.03106898596312215 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.21052631578947367, + "acc_stderr": 0.038351539543994194, + "acc_norm": 0.21052631578947367, + "acc_norm_stderr": 0.038351539543994194 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.21379310344827587, + "acc_stderr": 0.034165204477475494, + "acc_norm": 0.21379310344827587, + "acc_norm_stderr": 0.034165204477475494 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.25396825396825395, + "acc_stderr": 0.022418042891113942, + "acc_norm": 0.25396825396825395, + "acc_norm_stderr": 0.022418042891113942 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.1746031746031746, + "acc_stderr": 0.033954900208561116, + "acc_norm": 0.1746031746031746, + "acc_norm_stderr": 0.033954900208561116 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.19, + "acc_stderr": 0.03942772444036624, + "acc_norm": 0.19, + "acc_norm_stderr": 0.03942772444036624 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.25161290322580643, + "acc_stderr": 0.024685979286239956, + "acc_norm": 0.25161290322580643, + "acc_norm_stderr": 0.024685979286239956 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.2660098522167488, + "acc_stderr": 0.03108982600293752, + "acc_norm": 0.2660098522167488, + "acc_norm_stderr": 0.03108982600293752 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.2, + "acc_stderr": 0.04020151261036845, + "acc_norm": 0.2, + "acc_norm_stderr": 0.04020151261036845 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.3393939393939394, + "acc_stderr": 0.036974422050315967, + "acc_norm": 0.3393939393939394, + "acc_norm_stderr": 0.036974422050315967 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.3434343434343434, + "acc_stderr": 0.03383201223244442, + "acc_norm": 0.3434343434343434, + "acc_norm_stderr": 0.03383201223244442 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.3160621761658031, + "acc_stderr": 0.033553973696861736, + "acc_norm": 0.3160621761658031, + "acc_norm_stderr": 0.033553973696861736 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.31025641025641026, + "acc_stderr": 0.023454674889404288, + "acc_norm": 0.31025641025641026, + "acc_norm_stderr": 0.023454674889404288 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.23703703703703705, + "acc_stderr": 0.02592887613276611, + "acc_norm": 0.23703703703703705, + "acc_norm_stderr": 0.02592887613276611 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.24369747899159663, + "acc_stderr": 0.02788682807838057, + "acc_norm": 0.24369747899159663, + "acc_norm_stderr": 0.02788682807838057 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.23178807947019867, + "acc_stderr": 0.034454062719870546, + "acc_norm": 0.23178807947019867, + "acc_norm_stderr": 0.034454062719870546 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.27155963302752295, + "acc_stderr": 0.019069098363191435, + "acc_norm": 0.27155963302752295, + "acc_norm_stderr": 0.019069098363191435 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.4722222222222222, + "acc_stderr": 0.0340470532865388, + "acc_norm": 0.4722222222222222, + "acc_norm_stderr": 
0.0340470532865388 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.27450980392156865, + "acc_stderr": 0.031321798030832904, + "acc_norm": 0.27450980392156865, + "acc_norm_stderr": 0.031321798030832904 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.2489451476793249, + "acc_stderr": 0.028146970599422644, + "acc_norm": 0.2489451476793249, + "acc_norm_stderr": 0.028146970599422644 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.23318385650224216, + "acc_stderr": 0.028380391147094716, + "acc_norm": 0.23318385650224216, + "acc_norm_stderr": 0.028380391147094716 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.24427480916030533, + "acc_stderr": 0.03768335959728742, + "acc_norm": 0.24427480916030533, + "acc_norm_stderr": 0.03768335959728742 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.3305785123966942, + "acc_stderr": 0.042943408452120954, + "acc_norm": 0.3305785123966942, + "acc_norm_stderr": 0.042943408452120954 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.2222222222222222, + "acc_stderr": 0.040191074725573483, + "acc_norm": 0.2222222222222222, + "acc_norm_stderr": 0.040191074725573483 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.25766871165644173, + "acc_stderr": 0.03436150827846917, + "acc_norm": 0.25766871165644173, + "acc_norm_stderr": 0.03436150827846917 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.21428571428571427, + "acc_stderr": 0.038946411200447915, + "acc_norm": 0.21428571428571427, + "acc_norm_stderr": 0.038946411200447915 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.1650485436893204, + "acc_stderr": 0.036756688322331886, + "acc_norm": 0.1650485436893204, + "acc_norm_stderr": 0.036756688322331886 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.24786324786324787, + "acc_stderr": 0.028286324075564393, + "acc_norm": 0.24786324786324787, + "acc_norm_stderr": 0.028286324075564393 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.28, + "acc_stderr": 0.045126085985421276, + "acc_norm": 0.28, + "acc_norm_stderr": 0.045126085985421276 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.2260536398467433, + "acc_stderr": 0.014957458504335825, + "acc_norm": 0.2260536398467433, + "acc_norm_stderr": 0.014957458504335825 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.30057803468208094, + "acc_stderr": 0.024685316867257796, + "acc_norm": 0.30057803468208094, + "acc_norm_stderr": 0.024685316867257796 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.23798882681564246, + "acc_stderr": 0.014242630070574915, + "acc_norm": 0.23798882681564246, + "acc_norm_stderr": 0.014242630070574915 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.25163398692810457, + "acc_stderr": 0.024848018263875195, + "acc_norm": 0.25163398692810457, + "acc_norm_stderr": 0.024848018263875195 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.2347266881028939, + "acc_stderr": 0.02407180588767704, + "acc_norm": 0.2347266881028939, + "acc_norm_stderr": 0.02407180588767704 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.23148148148148148, + "acc_stderr": 0.023468429832451163, + "acc_norm": 0.23148148148148148, + "acc_norm_stderr": 0.023468429832451163 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.25177304964539005, + "acc_stderr": 0.0258921511567094, + "acc_norm": 0.25177304964539005, + "acc_norm_stderr": 0.0258921511567094 + }, + 
"harness|hendrycksTest-professional_law|5": { + "acc": 0.2379400260756193, + "acc_stderr": 0.01087570078769422, + "acc_norm": 0.2379400260756193, + "acc_norm_stderr": 0.01087570078769422 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.44485294117647056, + "acc_stderr": 0.030187532060329376, + "acc_norm": 0.44485294117647056, + "acc_norm_stderr": 0.030187532060329376 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.23039215686274508, + "acc_stderr": 0.01703522925803404, + "acc_norm": 0.23039215686274508, + "acc_norm_stderr": 0.01703522925803404 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.24545454545454545, + "acc_stderr": 0.04122066502878284, + "acc_norm": 0.24545454545454545, + "acc_norm_stderr": 0.04122066502878284 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.37142857142857144, + "acc_stderr": 0.03093285879278986, + "acc_norm": 0.37142857142857144, + "acc_norm_stderr": 0.03093285879278986 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.27860696517412936, + "acc_stderr": 0.031700561834973086, + "acc_norm": 0.27860696517412936, + "acc_norm_stderr": 0.031700561834973086 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.27, + "acc_stderr": 0.044619604333847394, + "acc_norm": 0.27, + "acc_norm_stderr": 0.044619604333847394 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.2469879518072289, + "acc_stderr": 0.03357351982064537, + "acc_norm": 0.2469879518072289, + "acc_norm_stderr": 0.03357351982064537 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.1695906432748538, + "acc_stderr": 0.028782108105401712, + "acc_norm": 0.1695906432748538, + "acc_norm_stderr": 0.028782108105401712 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.2386780905752754, + "mc1_stderr": 0.014922629695456416, + "mc2": 0.4040948097037574, + "mc2_stderr": 0.014932250520436091 + }, + "all": { + "acc": 0.2572390652907836, + "acc_stderr": 0.03146655976289831, + "acc_norm": 0.258778100849737, + "acc_norm_stderr": 0.031482475080779, + "mc1": 0.2386780905752754, + "mc1_stderr": 0.014922629695456416, + "mc2": 0.4040948097037574, + "mc2_stderr": 0.014932250520436091 + } + }, + "versions": { + "harness|arc:challenge|25": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + 
"harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "all": 0 + }, + "config": { + "model_name": "PygmalionAI/pygmalion-350m", + "model_sha": "d65832d913f6b396e2ffb64c373d9383c9da9303", + "model_dtype": "torch.float16", + "lighteval_sha": "43cff840721bd0214adb4e29236a5e2ca1813937", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null + }, + "task_config": { + "harness|arc:challenge": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + 
"harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task" + }, + "hashes": { + "harness|arc:challenge|25": { + "hash_examples": "fb8c51b1872daeda", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "2e52476df896898b", + "hash_cont_tokens": "28e2701291693338" + }, + "harness|hellaswag|10": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "a5079f2e8402bdc3", + "hash_cont_tokens": "30e348bce778fa10" + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "094c3a171105c12e", + "hash_cont_tokens": "65115fc130126941" + }, + "harness|hendrycksTest-anatomy|5": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "fe68bfcf91b9075e", + "hash_cont_tokens": "705516ff46ec26dc" + }, + "harness|hendrycksTest-astronomy|5": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "4d77ecaf04a26dfe", + "hash_cont_tokens": "881af7bd65854d45" + }, + "harness|hendrycksTest-business_ethics|5": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "7353edcfcf72d221", + "hash_cont_tokens": 
"e760cc7be5ddbe71" + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "162bb9f7b3cd706e", + "hash_cont_tokens": "37477257cf9eeb0a" + }, + "harness|hendrycksTest-college_biology|5": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "63d442b13b5d85b6", + "hash_cont_tokens": "3f04694ac6f92548" + }, + "harness|hendrycksTest-college_chemistry|5": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "99db48cd6b077b68", + "hash_cont_tokens": "65115fc130126941" + }, + "harness|hendrycksTest-college_computer_science|5": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "4bc7d55623070a07", + "hash_cont_tokens": "15b2112308ef7b2b" + }, + "harness|hendrycksTest-college_mathematics|5": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "e83395ed75fa03d5", + "hash_cont_tokens": "a67ba9facbae0268" + }, + "harness|hendrycksTest-college_medicine|5": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "7f508f7828fe5ba6", + "hash_cont_tokens": "40630b2e3e33ca08" + }, + "harness|hendrycksTest-college_physics|5": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "0fb01b8731db8d81", + "hash_cont_tokens": "4085a0ba4a98cf79" + }, + "harness|hendrycksTest-computer_security|5": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "8c8460fe570b556e", + "hash_cont_tokens": "65115fc130126941" + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "16e0aa20b920aa11", + "hash_cont_tokens": "f15de85dda56bf9a" + }, + "harness|hendrycksTest-econometrics|5": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "bc236ab739e1c15b", + "hash_cont_tokens": "35b673589f562c55" + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "eec634c59e67082e", + "hash_cont_tokens": "1fec337497bf988f" + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "551d76303aaf3f4e", + "hash_cont_tokens": "85d6a2e58f1aa799" + }, + "harness|hendrycksTest-formal_logic|5": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "532728846623b114", + "hash_cont_tokens": "6a362d8f09b66319" + }, + "harness|hendrycksTest-global_facts|5": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "8aaecba1a0475c64", + "hash_cont_tokens": "65115fc130126941" + }, + "harness|hendrycksTest-high_school_biology|5": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "2afe2320ca29933a", + "hash_cont_tokens": "7186426999d40201" + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "2ba3b67fb2446a06", + "hash_cont_tokens": "97e729fbed631d26" + }, + 
"harness|hendrycksTest-high_school_computer_science|5": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "10e55771dbb42b2c", + "hash_cont_tokens": "2d5af91609bd4d0d" + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "6d8596e5edbe236d", + "hash_cont_tokens": "2553c38072fe59e9" + }, + "harness|hendrycksTest-high_school_geography|5": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "3fb9fd43f1792a28", + "hash_cont_tokens": "967f1a6377c5dada" + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "51f21e325fe493bc", + "hash_cont_tokens": "5cbe4530fc364ed8" + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "78a8e9b40bc5418c", + "hash_cont_tokens": "3c15870aa9a751c8" + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "44525d3009ded4a4", + "hash_cont_tokens": "75f6aa84e7959e70" + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "76e98460e3320e1c", + "hash_cont_tokens": "7bfc49a85b0e6b0f" + }, + "harness|hendrycksTest-high_school_physics|5": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "f47dbaece0632444", + "hash_cont_tokens": "5ced294bf867b6fa" + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "d685add8792a69d2", + "hash_cont_tokens": "9ffbe637167399d6" + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "10fa751069aea803", + "hash_cont_tokens": "25c58237091f9ea7" + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "2b245a8312dd0ee8", + "hash_cont_tokens": "19500e048c94127a" + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "fa3b5b3bf631cd40", + "hash_cont_tokens": "0135bf601685a8b0" + }, + "harness|hendrycksTest-human_aging|5": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "a7cc14eb97a963c1", + "hash_cont_tokens": "350bc807db8602e4" + }, + "harness|hendrycksTest-human_sexuality|5": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "5a27a3a18e11300c", + "hash_cont_tokens": "944bf06e08c9e841" + }, + "harness|hendrycksTest-international_law|5": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "5355beafda861ea0", + "hash_cont_tokens": "a9ec061d9a865f49" + }, + "harness|hendrycksTest-jurisprudence|5": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "85bf654d3221129b", + 
"hash_cont_tokens": "3813b356ad4675eb" + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "5f8c6e6a21145296", + "hash_cont_tokens": "4250ef4e0ecec581" + }, + "harness|hendrycksTest-machine_learning|5": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "1cf278ba4dac7b93", + "hash_cont_tokens": "c4fb7cc44b48985a" + }, + "harness|hendrycksTest-management|5": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "67df50e49cb50049", + "hash_cont_tokens": "f6301f26d3421bfe" + }, + "harness|hendrycksTest-marketing|5": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "e254e479a1dd95e6", + "hash_cont_tokens": "4bea1308c2dedd32" + }, + "harness|hendrycksTest-medical_genetics|5": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "836b977dd80307df", + "hash_cont_tokens": "65115fc130126941" + }, + "harness|hendrycksTest-miscellaneous|5": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "3d9d2c0b97a586f9", + "hash_cont_tokens": "d87f2c7e8fda82f9" + }, + "harness|hendrycksTest-moral_disputes|5": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "b354e905172e9a92", + "hash_cont_tokens": "098675117a7f6f77" + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "e0f5580d6e0bd639", + "hash_cont_tokens": "bd59c34597b05651" + }, + "harness|hendrycksTest-nutrition|5": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "e66c2273b0b50f8a", + "hash_cont_tokens": "03bcb0a0f9d4f331" + }, + "harness|hendrycksTest-philosophy|5": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "72c74dca625bae21", + "hash_cont_tokens": "4b9e620ce1055d4a" + }, + "harness|hendrycksTest-prehistory|5": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "139ea332c437abef", + "hash_cont_tokens": "3f04832c8adc4e0a" + }, + "harness|hendrycksTest-professional_accounting|5": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "9e4929005482ae10", + "hash_cont_tokens": "767ed1231cb8e258" + }, + "harness|hendrycksTest-professional_law|5": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "7105767805e28747", + "hash_cont_tokens": "f0b059007537e041" + }, + "harness|hendrycksTest-professional_medicine|5": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "f04f0a03ea895b5b", + "hash_cont_tokens": "3bc5fb58666e5e8b" + }, + "harness|hendrycksTest-professional_psychology|5": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "46fbbd942e3b6db5", + "hash_cont_tokens": "190e8f92d03650fe" + }, + "harness|hendrycksTest-public_relations|5": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "4b9217ec408da4d4", + "hash_cont_tokens": "1bda889eaab363c0" + }, + 
"harness|hendrycksTest-security_studies|5": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "9eadb993a592c2bf", + "hash_cont_tokens": "859ddf07f8d0ab66" + }, + "harness|hendrycksTest-sociology|5": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "18f0e119974d9136", + "hash_cont_tokens": "7fdcb74bc758e7bd" + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "9a26a58deec29cba", + "hash_cont_tokens": "65115fc130126941" + }, + "harness|hendrycksTest-virology|5": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "4b0d85cf3b0bf65b", + "hash_cont_tokens": "456a90466d8efd2a" + }, + "harness|hendrycksTest-world_religions|5": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "b0e8f149dfd2fa76", + "hash_cont_tokens": "6d21235f853c8d4b" + }, + "harness|truthfulqa:mc|0": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "6e0e57e58e2d03ff", + "hash_cont_tokens": "a67a79a7e9449644" + } + } +} \ No newline at end of file diff --git a/eval-results/PygmalionAI/pygmalion-350m/results_2023-10-14T15-33-29.542088.json b/eval-results/PygmalionAI/pygmalion-350m/results_2023-10-14T15-33-29.542088.json new file mode 100644 index 0000000000000000000000000000000000000000..385be3d61d1a0aa276197742f19a3181515c9846 --- /dev/null +++ b/eval-results/PygmalionAI/pygmalion-350m/results_2023-10-14T15-33-29.542088.json @@ -0,0 +1,107 @@ +{ + "config_general": { + "model_name": "PygmalionAI/pygmalion-350m", + "model_sha": "d65832d913f6b396e2ffb64c373d9383c9da9303", + "model_size": "631.71 MB", + "model_dtype": "torch.float16", + "lighteval_sha": "0f318ecf002208468154899217b3ba7c6ae09374", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|drop|3": { + "em": 0.001363255033557047, + "em_stderr": 0.0003778609196460963, + "f1": 0.03894609899328867, + "f1_stderr": 0.0011582048286439316 + }, + "harness|gsm8k|5": { + "acc": 0.00530705079605762, + "acc_stderr": 0.002001305720948079 + }, + "harness|winogrande|5": { + "acc": 0.5027624309392266, + "acc_stderr": 0.014052271211616433 + }, + "all": { + "em": 0.001363255033557047, + "em_stderr": 0.0003778609196460963, + "f1": 0.03894609899328867, + "f1_stderr": 0.0011582048286439316, + "acc": 0.2540347408676421, + "acc_stderr": 0.008026788466282256 + } + }, + "versions": { + "harness|drop|3": 1, + "harness|gsm8k|5": 0, + "harness|winogrande|5": 0, + "all": 0 + }, + "config_tasks": { + "harness|drop": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|drop|3": { + "hashes": { + "hash_examples": "1d27416e8324e9a3", + "hash_full_prompts": "a5513ff9a741b385", + "hash_input_tokens": "e74b23fd6ab24722", + "hash_cont_tokens": "7f99bc2d39ad8858" + }, + "truncated": 384, + "non-truncated": 9152, + "padded": 0, + "non-padded": 9536, + "effective_few_shots": 3.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "a2243014cab6a7a0", + "hash_cont_tokens": "8244846f7d563110" + }, + "truncated": 0, + 
"non-truncated": 1319, + "padded": 0, + "non-padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "0a8020a0b9bd626c", + "hash_cont_tokens": "d75b4039559457e2" + }, + "truncated": 0, + "non-truncated": 2534, + "padded": 2426, + "non-padded": 108, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "9b4d8993161e637d", + "hash_full_prompts": "08215e527b7e60a5", + "hash_input_tokens": "409bf3c4619f5fc0", + "hash_cont_tokens": "39ab949cf8d8be00" + }, + "total_evaluation_time_secondes": "7005.662182807922", + "truncated": 384, + "non-truncated": 13005, + "padded": 2426, + "non-padded": 10963, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/PygmalionAI/pygmalion-6b/results_2023-07-18T11-25-58.847315.json b/eval-results/PygmalionAI/pygmalion-6b/results_2023-07-18T11-25-58.847315.json new file mode 100644 index 0000000000000000000000000000000000000000..d1f8b9c60d5bc5c4ce78859c43aec30f3955d9bb --- /dev/null +++ b/eval-results/PygmalionAI/pygmalion-6b/results_2023-07-18T11-25-58.847315.json @@ -0,0 +1,871 @@ +{ + "results": { + "harness|arc:challenge|25": { + "acc": 0.35921501706484643, + "acc_stderr": 0.01402022415583914, + "acc_norm": 0.3890784982935154, + "acc_norm_stderr": 0.014247309976045607 + }, + "harness|hellaswag|10": { + "acc": 0.4736108344951205, + "acc_stderr": 0.00498282691668715, + "acc_norm": 0.6483768173670583, + "acc_norm_stderr": 0.004765012078929368 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.29, + "acc_stderr": 0.045604802157206845, + "acc_norm": 0.29, + "acc_norm_stderr": 0.045604802157206845 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.2074074074074074, + "acc_stderr": 0.03502553170678318, + "acc_norm": 0.2074074074074074, + "acc_norm_stderr": 0.03502553170678318 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.20394736842105263, + "acc_stderr": 0.03279000406310052, + "acc_norm": 0.20394736842105263, + "acc_norm_stderr": 0.03279000406310052 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.25, + "acc_stderr": 0.04351941398892446, + "acc_norm": 0.25, + "acc_norm_stderr": 0.04351941398892446 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.25660377358490566, + "acc_stderr": 0.02688064788905197, + "acc_norm": 0.25660377358490566, + "acc_norm_stderr": 0.02688064788905197 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.25, + "acc_stderr": 0.03621034121889507, + "acc_norm": 0.25, + "acc_norm_stderr": 0.03621034121889507 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.29, + "acc_stderr": 0.04560480215720683, + "acc_norm": 0.29, + "acc_norm_stderr": 0.04560480215720683 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.31, + "acc_stderr": 0.04648231987117316, + "acc_norm": 0.31, + "acc_norm_stderr": 0.04648231987117316 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.25, + "acc_stderr": 0.04351941398892446, + "acc_norm": 0.25, + "acc_norm_stderr": 0.04351941398892446 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.2832369942196532, + "acc_stderr": 0.034355680560478746, + "acc_norm": 0.2832369942196532, + "acc_norm_stderr": 0.034355680560478746 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.22549019607843138, + "acc_stderr": 
0.041583075330832865, + "acc_norm": 0.22549019607843138, + "acc_norm_stderr": 0.041583075330832865 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.36, + "acc_stderr": 0.048241815132442176, + "acc_norm": 0.36, + "acc_norm_stderr": 0.048241815132442176 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.2765957446808511, + "acc_stderr": 0.02924188386962882, + "acc_norm": 0.2765957446808511, + "acc_norm_stderr": 0.02924188386962882 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.24561403508771928, + "acc_stderr": 0.04049339297748141, + "acc_norm": 0.24561403508771928, + "acc_norm_stderr": 0.04049339297748141 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.33793103448275863, + "acc_stderr": 0.039417076320648906, + "acc_norm": 0.33793103448275863, + "acc_norm_stderr": 0.039417076320648906 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.24338624338624337, + "acc_stderr": 0.02210112878741544, + "acc_norm": 0.24338624338624337, + "acc_norm_stderr": 0.02210112878741544 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.25396825396825395, + "acc_stderr": 0.038932596106046734, + "acc_norm": 0.25396825396825395, + "acc_norm_stderr": 0.038932596106046734 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.18, + "acc_stderr": 0.038612291966536934, + "acc_norm": 0.18, + "acc_norm_stderr": 0.038612291966536934 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.27419354838709675, + "acc_stderr": 0.025378139970885196, + "acc_norm": 0.27419354838709675, + "acc_norm_stderr": 0.025378139970885196 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.24630541871921183, + "acc_stderr": 0.030315099285617715, + "acc_norm": 0.24630541871921183, + "acc_norm_stderr": 0.030315099285617715 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.3, + "acc_stderr": 0.046056618647183814, + "acc_norm": 0.3, + "acc_norm_stderr": 0.046056618647183814 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.32727272727272727, + "acc_stderr": 0.03663974994391242, + "acc_norm": 0.32727272727272727, + "acc_norm_stderr": 0.03663974994391242 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.2474747474747475, + "acc_stderr": 0.030746300742124484, + "acc_norm": 0.2474747474747475, + "acc_norm_stderr": 0.030746300742124484 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.3316062176165803, + "acc_stderr": 0.03397636541089116, + "acc_norm": 0.3316062176165803, + "acc_norm_stderr": 0.03397636541089116 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.28974358974358977, + "acc_stderr": 0.023000628243687964, + "acc_norm": 0.28974358974358977, + "acc_norm_stderr": 0.023000628243687964 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.2777777777777778, + "acc_stderr": 0.02730914058823018, + "acc_norm": 0.2777777777777778, + "acc_norm_stderr": 0.02730914058823018 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.3067226890756303, + "acc_stderr": 0.029953823891887037, + "acc_norm": 0.3067226890756303, + "acc_norm_stderr": 0.029953823891887037 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.2251655629139073, + "acc_stderr": 0.03410435282008936, + "acc_norm": 0.2251655629139073, + "acc_norm_stderr": 0.03410435282008936 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.26422018348623855, + "acc_stderr": 
0.01890416417151019, + "acc_norm": 0.26422018348623855, + "acc_norm_stderr": 0.01890416417151019 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.18055555555555555, + "acc_stderr": 0.026232878971491652, + "acc_norm": 0.18055555555555555, + "acc_norm_stderr": 0.026232878971491652 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.3235294117647059, + "acc_stderr": 0.03283472056108566, + "acc_norm": 0.3235294117647059, + "acc_norm_stderr": 0.03283472056108566 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.29957805907172996, + "acc_stderr": 0.029818024749753095, + "acc_norm": 0.29957805907172996, + "acc_norm_stderr": 0.029818024749753095 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.336322869955157, + "acc_stderr": 0.031708824268455, + "acc_norm": 0.336322869955157, + "acc_norm_stderr": 0.031708824268455 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.25190839694656486, + "acc_stderr": 0.038073871163060866, + "acc_norm": 0.25190839694656486, + "acc_norm_stderr": 0.038073871163060866 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.3884297520661157, + "acc_stderr": 0.04449270350068382, + "acc_norm": 0.3884297520661157, + "acc_norm_stderr": 0.04449270350068382 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.3055555555555556, + "acc_stderr": 0.044531975073749834, + "acc_norm": 0.3055555555555556, + "acc_norm_stderr": 0.044531975073749834 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.2331288343558282, + "acc_stderr": 0.0332201579577674, + "acc_norm": 0.2331288343558282, + "acc_norm_stderr": 0.0332201579577674 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.33035714285714285, + "acc_stderr": 0.04464285714285713, + "acc_norm": 0.33035714285714285, + "acc_norm_stderr": 0.04464285714285713 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.20388349514563106, + "acc_stderr": 0.03989139859531773, + "acc_norm": 0.20388349514563106, + "acc_norm_stderr": 0.03989139859531773 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.2948717948717949, + "acc_stderr": 0.029872577708891148, + "acc_norm": 0.2948717948717949, + "acc_norm_stderr": 0.029872577708891148 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.3, + "acc_stderr": 0.046056618647183814, + "acc_norm": 0.3, + "acc_norm_stderr": 0.046056618647183814 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.29757343550446996, + "acc_stderr": 0.01634911191290942, + "acc_norm": 0.29757343550446996, + "acc_norm_stderr": 0.01634911191290942 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.3092485549132948, + "acc_stderr": 0.02488314057007176, + "acc_norm": 0.3092485549132948, + "acc_norm_stderr": 0.02488314057007176 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.23687150837988827, + "acc_stderr": 0.01421957078810399, + "acc_norm": 0.23687150837988827, + "acc_norm_stderr": 0.01421957078810399 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.3954248366013072, + "acc_stderr": 0.027996723180631435, + "acc_norm": 0.3954248366013072, + "acc_norm_stderr": 0.027996723180631435 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.24115755627009647, + "acc_stderr": 0.024296594034763426, + "acc_norm": 0.24115755627009647, + "acc_norm_stderr": 0.024296594034763426 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.2932098765432099, + "acc_stderr": 0.025329888171900922, + "acc_norm": 0.2932098765432099, + "acc_norm_stderr": 
0.025329888171900922 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.2801418439716312, + "acc_stderr": 0.026789172351140242, + "acc_norm": 0.2801418439716312, + "acc_norm_stderr": 0.026789172351140242 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.2842242503259452, + "acc_stderr": 0.01151988059651607, + "acc_norm": 0.2842242503259452, + "acc_norm_stderr": 0.01151988059651607 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.2647058823529412, + "acc_stderr": 0.026799562024887667, + "acc_norm": 0.2647058823529412, + "acc_norm_stderr": 0.026799562024887667 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.2777777777777778, + "acc_stderr": 0.018120224251484587, + "acc_norm": 0.2777777777777778, + "acc_norm_stderr": 0.018120224251484587 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.2636363636363636, + "acc_stderr": 0.04220224692971987, + "acc_norm": 0.2636363636363636, + "acc_norm_stderr": 0.04220224692971987 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.30612244897959184, + "acc_stderr": 0.02950489645459596, + "acc_norm": 0.30612244897959184, + "acc_norm_stderr": 0.02950489645459596 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.32338308457711445, + "acc_stderr": 0.03307615947979033, + "acc_norm": 0.32338308457711445, + "acc_norm_stderr": 0.03307615947979033 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.34, + "acc_stderr": 0.04760952285695235, + "acc_norm": 0.34, + "acc_norm_stderr": 0.04760952285695235 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.25903614457831325, + "acc_stderr": 0.034106466140718564, + "acc_norm": 0.25903614457831325, + "acc_norm_stderr": 0.034106466140718564 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.3333333333333333, + "acc_stderr": 0.036155076303109344, + "acc_norm": 0.3333333333333333, + "acc_norm_stderr": 0.036155076303109344 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.2558139534883721, + "mc1_stderr": 0.015274176219283361, + "mc2": 0.40384428426429253, + "mc2_stderr": 0.013994112647339067 + }, + "all": { + "acc": 0.2846009683260307, + "acc_stderr": 0.03261590673337149, + "acc_norm": 0.28806926432773594, + "acc_norm_stderr": 0.03261606386866723, + "mc1": 0.2558139534883721, + "mc1_stderr": 0.015274176219283361, + "mc2": 0.40384428426429253, + "mc2_stderr": 0.013994112647339067 + } + }, + "versions": { + "harness|arc:challenge|25": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + 
"harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "all": 0 + }, + "config": { + "model_name": "PygmalionAI/pygmalion-6b", + "model_sha": "30e2405100eac6bd53f75964cc7345eeafd19f7d", + "model_dtype": "torch.float16", + "lighteval_sha": "43cff840721bd0214adb4e29236a5e2ca1813937", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null + }, + "task_config": { + "harness|arc:challenge": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness 
task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task" + }, + "hashes": { + "harness|arc:challenge|25": { + "hash_examples": "fb8c51b1872daeda", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "1b78325b154497a6", + "hash_cont_tokens": "c6e2e25e2b25a621" + }, + "harness|hellaswag|10": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "97de5fb5652ec7fa", + "hash_cont_tokens": "8ad5f1a3e4068f36" + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "38f6980885e34dfd", + "hash_cont_tokens": "844bd0bf669e8136" + }, + "harness|hendrycksTest-anatomy|5": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "3ed9431cd09b2a53", + "hash_cont_tokens": "aa3ffb1a6e4356f5" + }, + "harness|hendrycksTest-astronomy|5": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "a79fd75ecff4dacc", + "hash_cont_tokens": "ca7527d5bdfd389a" + }, + "harness|hendrycksTest-business_ethics|5": 
{ + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "178d5666661bf5e1", + "hash_cont_tokens": "08a1fa6c8dde9a82" + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "c926698f7ff06973", + "hash_cont_tokens": "cd61f7de0830a75a" + }, + "harness|hendrycksTest-college_biology|5": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "242f772c5e78312a", + "hash_cont_tokens": "b0c14ed86adbcb56" + }, + "harness|hendrycksTest-college_chemistry|5": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "8502d8627d2d7aad", + "hash_cont_tokens": "844bd0bf669e8136" + }, + "harness|hendrycksTest-college_computer_science|5": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "8bf46ce3a98e6e3f", + "hash_cont_tokens": "3cf1924b14cbf906" + }, + "harness|hendrycksTest-college_mathematics|5": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "ff09ef7f164943cd", + "hash_cont_tokens": "d09bf08193410dfa" + }, + "harness|hendrycksTest-college_medicine|5": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "af38d1bbc0517ac5", + "hash_cont_tokens": "62bb469d2a319d91" + }, + "harness|hendrycksTest-college_physics|5": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "c4240f372187f487", + "hash_cont_tokens": "bf103c9a1f61ec12" + }, + "harness|hendrycksTest-computer_security|5": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "70a866a1c6ae11ae", + "hash_cont_tokens": "844bd0bf669e8136" + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "29b68a5b3f3afa5f", + "hash_cont_tokens": "ff5ca3d84bb47a0b" + }, + "harness|hendrycksTest-econometrics|5": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "a4a0fc579875cdf9", + "hash_cont_tokens": "f3ed369e135c0e74" + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "e1c0ec634eb17ebd", + "hash_cont_tokens": "35bf6c0c1a7ee403" + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "542453ad0f99dacf", + "hash_cont_tokens": "e69647d0f0359a4e" + }, + "harness|hendrycksTest-formal_logic|5": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "dacff0458f665ef2", + "hash_cont_tokens": "2ef491ecaa0b411b" + }, + "harness|hendrycksTest-global_facts|5": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "61dec75d557c2e93", + "hash_cont_tokens": "844bd0bf669e8136" + }, + "harness|hendrycksTest-high_school_biology|5": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "d0afdf91820cacc8", + "hash_cont_tokens": "2f65e8345a68d860" + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hash_examples": 
"44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "75cd47b5490da17b", + "hash_cont_tokens": "c3deabee1deab3a3" + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "6c6256000dbf914a", + "hash_cont_tokens": "ec161287ac6222f4" + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "3e24478a8854bd77", + "hash_cont_tokens": "c4f2565ca36881d5" + }, + "harness|hendrycksTest-high_school_geography|5": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "a4866b51f8a7a60e", + "hash_cont_tokens": "780e569058de22be" + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "90f755f89d9fdf5e", + "hash_cont_tokens": "9da45062757ae791" + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "fb590ff6d9d11883", + "hash_cont_tokens": "8f5c8baf02161f10" + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "551dbc75535ad2b8", + "hash_cont_tokens": "fdea101837ab4409" + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "d86fdf5706ec717c", + "hash_cont_tokens": "985403b262df21a4" + }, + "harness|hendrycksTest-high_school_physics|5": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "a81bca26abd92c41", + "hash_cont_tokens": "56be0c12b78c81a3" + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "9c10077b5cda495b", + "hash_cont_tokens": "f524cf6fe64b2a7f" + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "092923836e135996", + "hash_cont_tokens": "421b3dc903711e3d" + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "4ab213491f557f31", + "hash_cont_tokens": "eab825cf8fbdd085" + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "2a04fb615e6717ea", + "hash_cont_tokens": "e1610a0b694e7b3a" + }, + "harness|hendrycksTest-human_aging|5": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "39da19ee58ce07e6", + "hash_cont_tokens": "38eafdb22e9fca11" + }, + "harness|hendrycksTest-human_sexuality|5": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "f7e0441ab1c223e0", + "hash_cont_tokens": "11de075f88fc7cd2" + }, + "harness|hendrycksTest-international_law|5": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "119859c5b8103d0b", + "hash_cont_tokens": "0229c63f045574c2" + }, + 
"harness|hendrycksTest-jurisprudence|5": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "6ec4910e741606cb", + "hash_cont_tokens": "5c77c6f472688075" + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "96d8b2554f777e3a", + "hash_cont_tokens": "25a46284b3589e0d" + }, + "harness|hendrycksTest-machine_learning|5": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "249811a7d891a411", + "hash_cont_tokens": "d11f2c877fe691dc" + }, + "harness|hendrycksTest-management|5": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "e54df495ffeb4f92", + "hash_cont_tokens": "d37808f586a9e9b5" + }, + "harness|hendrycksTest-marketing|5": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "e9110fe64f420eb5", + "hash_cont_tokens": "95faf210efa02f90" + }, + "harness|hendrycksTest-medical_genetics|5": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "743df5701590c1c5", + "hash_cont_tokens": "844bd0bf669e8136" + }, + "harness|hendrycksTest-miscellaneous|5": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "4a20a40ea36bad2d", + "hash_cont_tokens": "ef1ae838a09a7521" + }, + "harness|hendrycksTest-moral_disputes|5": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "10886977e5516586", + "hash_cont_tokens": "05c35d0e7dd2c7d4" + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "66f56ab7c3b9d662", + "hash_cont_tokens": "f1e9e326e9540108" + }, + "harness|hendrycksTest-nutrition|5": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "c05c54560499ea35", + "hash_cont_tokens": "027ac34198453c9e" + }, + "harness|hendrycksTest-philosophy|5": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "9639c3d92ff98a28", + "hash_cont_tokens": "dddff9925c9b675a" + }, + "harness|hendrycksTest-prehistory|5": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "91e98834c3a8d8d9", + "hash_cont_tokens": "030e5bb46551865c" + }, + "harness|hendrycksTest-professional_accounting|5": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "569fa47691c73088", + "hash_cont_tokens": "42b23299e8bae480" + }, + "harness|hendrycksTest-professional_law|5": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "999e8c7cf55b590c", + "hash_cont_tokens": "a2de48df0afbaff7" + }, + "harness|hendrycksTest-professional_medicine|5": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "cb68733b835e69f0", + "hash_cont_tokens": "33dc7eccd5de31ae" + }, + "harness|hendrycksTest-professional_psychology|5": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "3aa766c029099569", + "hash_cont_tokens": "2a666dc39f1f52ac" + }, + "harness|hendrycksTest-public_relations|5": { + "hash_examples": 
"0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "87b924f88832986f", + "hash_cont_tokens": "cf3600a50782c6c5" + }, + "harness|hendrycksTest-security_studies|5": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "c2b75c24a925a416", + "hash_cont_tokens": "2e9916279a4cae95" + }, + "harness|hendrycksTest-sociology|5": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "fb555df6139eb2c8", + "hash_cont_tokens": "555f7a55738bbf37" + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "56cf1eebb25eccb1", + "hash_cont_tokens": "844bd0bf669e8136" + }, + "harness|hendrycksTest-virology|5": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "c6affac16ec860be", + "hash_cont_tokens": "30d4fa4828c5468f" + }, + "harness|hendrycksTest-world_religions|5": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "d2c5da5a69a6312e", + "hash_cont_tokens": "984061eb58124367" + }, + "harness|truthfulqa:mc|0": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "21ee2f46c9c3649e", + "hash_cont_tokens": "f41d0880e9a23f4e" + } + } +} \ No newline at end of file diff --git a/eval-results/PygmalionAI/pygmalion-6b/results_2023-09-17T16-08-36.166689.json b/eval-results/PygmalionAI/pygmalion-6b/results_2023-09-17T16-08-36.166689.json new file mode 100644 index 0000000000000000000000000000000000000000..738e2baf2f23b41e172a7540836f2d70a1c154fd --- /dev/null +++ b/eval-results/PygmalionAI/pygmalion-6b/results_2023-09-17T16-08-36.166689.json @@ -0,0 +1,107 @@ +{ + "config_general": { + "model_name": "PygmalionAI/pygmalion-6b", + "model_sha": "30e2405100eac6bd53f75964cc7345eeafd19f7d", + "model_size": "11.28 GB", + "model_dtype": "torch.float16", + "lighteval_sha": "0f318ecf002208468154899217b3ba7c6ae09374", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|drop|3": { + "em": 0.1787961409395973, + "em_stderr": 0.003924137464801004, + "f1": 0.2375230704697985, + "f1_stderr": 0.003994427199624895 + }, + "harness|gsm8k|5": { + "acc": 0.02047005307050796, + "acc_stderr": 0.003900413385915718 + }, + "harness|winogrande|5": { + "acc": 0.6250986582478295, + "acc_stderr": 0.013605544523788012 + }, + "all": { + "em": 0.1787961409395973, + "em_stderr": 0.003924137464801004, + "f1": 0.2375230704697985, + "f1_stderr": 0.003994427199624895, + "acc": 0.32278435565916874, + "acc_stderr": 0.008752978954851866 + } + }, + "versions": { + "harness|drop|3": 1, + "harness|gsm8k|5": 0, + "harness|winogrande|5": 0, + "all": 0 + }, + "config_tasks": { + "harness|drop": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|drop|3": { + "hashes": { + "hash_examples": "1d27416e8324e9a3", + "hash_full_prompts": "a5513ff9a741b385", + "hash_input_tokens": "f21277d2c2d2e06c", + "hash_cont_tokens": "912d9cbb35a382d9" + }, + "truncated": 382, + "non-truncated": 9154, + "padded": 0, + "non-padded": 9536, + "effective_few_shots": 3.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": 
"41d55e83abc0e02d", + "hash_input_tokens": "3ab9b4c5105492a3", + "hash_cont_tokens": "ec82a0c6a2e35d98" + }, + "truncated": 0, + "non-truncated": 1319, + "padded": 0, + "non-padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "84cacac1590bb0a5", + "hash_cont_tokens": "64ca3ed9b5dacc6e" + }, + "truncated": 0, + "non-truncated": 2534, + "padded": 2426, + "non-padded": 108, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "9b4d8993161e637d", + "hash_full_prompts": "08215e527b7e60a5", + "hash_input_tokens": "26cd3631535039d0", + "hash_cont_tokens": "68e1e6296d7b0ffe" + }, + "total_evaluation_time_secondes": "8665.233153820038", + "truncated": 382, + "non-truncated": 13007, + "padded": 2426, + "non-padded": 10963, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/PygmalionAI/pygmalion-6b/results_2023-10-08T20-04-23.834964.json b/eval-results/PygmalionAI/pygmalion-6b/results_2023-10-08T20-04-23.834964.json new file mode 100644 index 0000000000000000000000000000000000000000..98116ddd6fe3c241c4dad0f55251068cb699b52c --- /dev/null +++ b/eval-results/PygmalionAI/pygmalion-6b/results_2023-10-08T20-04-23.834964.json @@ -0,0 +1,1367 @@ +{ + "config_general": { + "model_name": "PygmalionAI/pygmalion-6b", + "model_sha": "2a0d74449c8fbf0378194e95f64aa92e16297294", + "model_size": "11.28 GB", + "model_dtype": "torch.float16", + "lighteval_sha": "0f318ecf002208468154899217b3ba7c6ae09374", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|arc:challenge|25": { + "acc": 0.3728668941979522, + "acc_stderr": 0.014131176760131165, + "acc_norm": 0.4052901023890785, + "acc_norm_stderr": 0.014346869060229323 + }, + "harness|hellaswag|10": { + "acc": 0.5053774148575981, + "acc_stderr": 0.004989492828168535, + "acc_norm": 0.6746664011153157, + "acc_norm_stderr": 0.004675418774314239 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.26, + "acc_stderr": 0.04408440022768081, + "acc_norm": 0.26, + "acc_norm_stderr": 0.04408440022768081 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.25925925925925924, + "acc_stderr": 0.03785714465066653, + "acc_norm": 0.25925925925925924, + "acc_norm_stderr": 0.03785714465066653 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.3092105263157895, + "acc_stderr": 0.037610708698674805, + "acc_norm": 0.3092105263157895, + "acc_norm_stderr": 0.037610708698674805 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.33, + "acc_stderr": 0.04725815626252604, + "acc_norm": 0.33, + "acc_norm_stderr": 0.04725815626252604 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.23773584905660378, + "acc_stderr": 0.0261998088075619, + "acc_norm": 0.23773584905660378, + "acc_norm_stderr": 0.0261998088075619 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.20833333333333334, + "acc_stderr": 0.03396116205845333, + "acc_norm": 0.20833333333333334, + "acc_norm_stderr": 0.03396116205845333 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.13, + "acc_stderr": 0.03379976689896308, + "acc_norm": 0.13, + "acc_norm_stderr": 0.03379976689896308 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.2, + "acc_stderr": 
0.04020151261036846, + "acc_norm": 0.2, + "acc_norm_stderr": 0.04020151261036846 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.23, + "acc_stderr": 0.042295258468165065, + "acc_norm": 0.23, + "acc_norm_stderr": 0.042295258468165065 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.2543352601156069, + "acc_stderr": 0.0332055644308557, + "acc_norm": 0.2543352601156069, + "acc_norm_stderr": 0.0332055644308557 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.18627450980392157, + "acc_stderr": 0.03873958714149351, + "acc_norm": 0.18627450980392157, + "acc_norm_stderr": 0.03873958714149351 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.32, + "acc_stderr": 0.046882617226215034, + "acc_norm": 0.32, + "acc_norm_stderr": 0.046882617226215034 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.3404255319148936, + "acc_stderr": 0.030976692998534436, + "acc_norm": 0.3404255319148936, + "acc_norm_stderr": 0.030976692998534436 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.24561403508771928, + "acc_stderr": 0.040493392977481425, + "acc_norm": 0.24561403508771928, + "acc_norm_stderr": 0.040493392977481425 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.2689655172413793, + "acc_stderr": 0.03695183311650232, + "acc_norm": 0.2689655172413793, + "acc_norm_stderr": 0.03695183311650232 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.24603174603174602, + "acc_stderr": 0.022182037202948368, + "acc_norm": 0.24603174603174602, + "acc_norm_stderr": 0.022182037202948368 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.23809523809523808, + "acc_stderr": 0.03809523809523811, + "acc_norm": 0.23809523809523808, + "acc_norm_stderr": 0.03809523809523811 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.25, + "acc_stderr": 0.04351941398892446, + "acc_norm": 0.25, + "acc_norm_stderr": 0.04351941398892446 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.23548387096774193, + "acc_stderr": 0.02413763242933771, + "acc_norm": 0.23548387096774193, + "acc_norm_stderr": 0.02413763242933771 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.23645320197044334, + "acc_stderr": 0.029896114291733552, + "acc_norm": 0.23645320197044334, + "acc_norm_stderr": 0.029896114291733552 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.2, + "acc_stderr": 0.04020151261036846, + "acc_norm": 0.2, + "acc_norm_stderr": 0.04020151261036846 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.2545454545454545, + "acc_stderr": 0.03401506715249039, + "acc_norm": 0.2545454545454545, + "acc_norm_stderr": 0.03401506715249039 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.20707070707070707, + "acc_stderr": 0.028869778460267042, + "acc_norm": 0.20707070707070707, + "acc_norm_stderr": 0.028869778460267042 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.22797927461139897, + "acc_stderr": 0.03027690994517826, + "acc_norm": 0.22797927461139897, + "acc_norm_stderr": 0.03027690994517826 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.2512820512820513, + "acc_stderr": 0.021992016662370526, + "acc_norm": 0.2512820512820513, + "acc_norm_stderr": 0.021992016662370526 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.26666666666666666, + "acc_stderr": 0.02696242432507383, + "acc_norm": 0.26666666666666666, + "acc_norm_stderr": 
0.02696242432507383 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.2184873949579832, + "acc_stderr": 0.02684151432295894, + "acc_norm": 0.2184873949579832, + "acc_norm_stderr": 0.02684151432295894 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.2185430463576159, + "acc_stderr": 0.03374235550425694, + "acc_norm": 0.2185430463576159, + "acc_norm_stderr": 0.03374235550425694 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.26788990825688075, + "acc_stderr": 0.018987462257978652, + "acc_norm": 0.26788990825688075, + "acc_norm_stderr": 0.018987462257978652 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.1574074074074074, + "acc_stderr": 0.02483717351824239, + "acc_norm": 0.1574074074074074, + "acc_norm_stderr": 0.02483717351824239 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.3088235294117647, + "acc_stderr": 0.03242661719827218, + "acc_norm": 0.3088235294117647, + "acc_norm_stderr": 0.03242661719827218 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.2616033755274262, + "acc_stderr": 0.028609516716994934, + "acc_norm": 0.2616033755274262, + "acc_norm_stderr": 0.028609516716994934 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.3542600896860987, + "acc_stderr": 0.032100621541349864, + "acc_norm": 0.3542600896860987, + "acc_norm_stderr": 0.032100621541349864 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.20610687022900764, + "acc_stderr": 0.03547771004159464, + "acc_norm": 0.20610687022900764, + "acc_norm_stderr": 0.03547771004159464 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.3305785123966942, + "acc_stderr": 0.04294340845212094, + "acc_norm": 0.3305785123966942, + "acc_norm_stderr": 0.04294340845212094 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.3425925925925926, + "acc_stderr": 0.04587904741301811, + "acc_norm": 0.3425925925925926, + "acc_norm_stderr": 0.04587904741301811 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.2331288343558282, + "acc_stderr": 0.033220157957767414, + "acc_norm": 0.2331288343558282, + "acc_norm_stderr": 0.033220157957767414 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.26785714285714285, + "acc_stderr": 0.04203277291467764, + "acc_norm": 0.26785714285714285, + "acc_norm_stderr": 0.04203277291467764 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.21359223300970873, + "acc_stderr": 0.040580420156460344, + "acc_norm": 0.21359223300970873, + "acc_norm_stderr": 0.040580420156460344 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.32905982905982906, + "acc_stderr": 0.03078232157768816, + "acc_norm": 0.32905982905982906, + "acc_norm_stderr": 0.03078232157768816 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.21, + "acc_stderr": 0.040936018074033256, + "acc_norm": 0.21, + "acc_norm_stderr": 0.040936018074033256 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.2822477650063857, + "acc_stderr": 0.016095302969878555, + "acc_norm": 0.2822477650063857, + "acc_norm_stderr": 0.016095302969878555 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.27167630057803466, + "acc_stderr": 0.023948512905468365, + "acc_norm": 0.27167630057803466, + "acc_norm_stderr": 0.023948512905468365 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.2346368715083799, + "acc_stderr": 0.014173044098303667, + "acc_norm": 0.2346368715083799, + "acc_norm_stderr": 0.014173044098303667 + }, + 
"harness|hendrycksTest-nutrition|5": { + "acc": 0.29411764705882354, + "acc_stderr": 0.026090162504279053, + "acc_norm": 0.29411764705882354, + "acc_norm_stderr": 0.026090162504279053 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.2604501607717042, + "acc_stderr": 0.024926723224845557, + "acc_norm": 0.2604501607717042, + "acc_norm_stderr": 0.024926723224845557 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.28703703703703703, + "acc_stderr": 0.025171041915309684, + "acc_norm": 0.28703703703703703, + "acc_norm_stderr": 0.025171041915309684 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.3049645390070922, + "acc_stderr": 0.027464708442022128, + "acc_norm": 0.3049645390070922, + "acc_norm_stderr": 0.027464708442022128 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.27444589308996087, + "acc_stderr": 0.011397043163078154, + "acc_norm": 0.27444589308996087, + "acc_norm_stderr": 0.011397043163078154 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.16911764705882354, + "acc_stderr": 0.02277086801011301, + "acc_norm": 0.16911764705882354, + "acc_norm_stderr": 0.02277086801011301 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.27941176470588236, + "acc_stderr": 0.018152871051538816, + "acc_norm": 0.27941176470588236, + "acc_norm_stderr": 0.018152871051538816 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.3, + "acc_stderr": 0.04389311454644287, + "acc_norm": 0.3, + "acc_norm_stderr": 0.04389311454644287 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.3142857142857143, + "acc_stderr": 0.029719329422417465, + "acc_norm": 0.3142857142857143, + "acc_norm_stderr": 0.029719329422417465 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.25870646766169153, + "acc_stderr": 0.030965903123573037, + "acc_norm": 0.25870646766169153, + "acc_norm_stderr": 0.030965903123573037 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.27, + "acc_stderr": 0.044619604333847394, + "acc_norm": 0.27, + "acc_norm_stderr": 0.044619604333847394 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.2710843373493976, + "acc_stderr": 0.03460579907553026, + "acc_norm": 0.2710843373493976, + "acc_norm_stderr": 0.03460579907553026 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.2807017543859649, + "acc_stderr": 0.034462962170884265, + "acc_norm": 0.2807017543859649, + "acc_norm_stderr": 0.034462962170884265 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.20195838433292534, + "mc1_stderr": 0.014053957441512359, + "mc2": 0.3253448533993895, + "mc2_stderr": 0.013862486209403098 + }, + "all": { + "acc": 0.26347154250909116, + "acc_stderr": 0.03165492423612406, + "acc_norm": 0.26689039326246145, + "acc_norm_stderr": 0.03165325674877226, + "mc1": 0.20195838433292534, + "mc1_stderr": 0.014053957441512359, + "mc2": 0.3253448533993895, + "mc2_stderr": 0.013862486209403098 + } + }, + "versions": { + "harness|arc:challenge|25": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + 
"harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "all": 0 + }, + "config_tasks": { + "harness|arc:challenge": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + 
"harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task" + }, + "summary_tasks": { + "harness|arc:challenge|25": { + "hashes": { + "hash_examples": "17b0cae357c0259e", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "1b78325b154497a6", + "hash_cont_tokens": "ed17e576dbafa5da" + }, + "truncated": 0, + "non-truncated": 4687, + "padded": 4685, + "non-padded": 2, + "effective_few_shots": 25.0, + "num_truncated_few_shots": 0 + }, + "harness|hellaswag|10": { + "hashes": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "97de5fb5652ec7fa", + "hash_cont_tokens": "0875c25c8fc0a94d" + }, + "truncated": 0, + "non-truncated": 40168, + "padded": 40045, + "non-padded": 123, + "effective_few_shots": 10.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hashes": { + "hash_examples": "280f9f325b40559a", + 
"hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "38f6980885e34dfd", + "hash_cont_tokens": "844bd0bf669e8136" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-anatomy|5": { + "hashes": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "3ed9431cd09b2a53", + "hash_cont_tokens": "aa3ffb1a6e4356f5" + }, + "truncated": 0, + "non-truncated": 540, + "padded": 540, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-astronomy|5": { + "hashes": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "a79fd75ecff4dacc", + "hash_cont_tokens": "18cfffb76bc8f0d1" + }, + "truncated": 0, + "non-truncated": 608, + "padded": 608, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-business_ethics|5": { + "hashes": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "178d5666661bf5e1", + "hash_cont_tokens": "844bd0bf669e8136" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hashes": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "c926698f7ff06973", + "hash_cont_tokens": "cd61f7de0830a75a" + }, + "truncated": 0, + "non-truncated": 1060, + "padded": 1060, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_biology|5": { + "hashes": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "242f772c5e78312a", + "hash_cont_tokens": "16b3626c8a5e3797" + }, + "truncated": 0, + "non-truncated": 576, + "padded": 568, + "non-padded": 8, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_chemistry|5": { + "hashes": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "8502d8627d2d7aad", + "hash_cont_tokens": "844bd0bf669e8136" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_computer_science|5": { + "hashes": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "8bf46ce3a98e6e3f", + "hash_cont_tokens": "844bd0bf669e8136" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_mathematics|5": { + "hashes": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "ff09ef7f164943cd", + "hash_cont_tokens": "844bd0bf669e8136" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_medicine|5": { + "hashes": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "af38d1bbc0517ac5", + "hash_cont_tokens": "62bb469d2a319d91" + }, + "truncated": 0, + 
"non-truncated": 692, + "padded": 680, + "non-padded": 12, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_physics|5": { + "hashes": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "c4240f372187f487", + "hash_cont_tokens": "bf103c9a1f61ec12" + }, + "truncated": 0, + "non-truncated": 408, + "padded": 404, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-computer_security|5": { + "hashes": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "70a866a1c6ae11ae", + "hash_cont_tokens": "844bd0bf669e8136" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hashes": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "29b68a5b3f3afa5f", + "hash_cont_tokens": "ff5ca3d84bb47a0b" + }, + "truncated": 0, + "non-truncated": 940, + "padded": 940, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-econometrics|5": { + "hashes": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "a4a0fc579875cdf9", + "hash_cont_tokens": "21f0989f5760198a" + }, + "truncated": 0, + "non-truncated": 456, + "padded": 456, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hashes": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "e1c0ec634eb17ebd", + "hash_cont_tokens": "35bf6c0c1a7ee403" + }, + "truncated": 0, + "non-truncated": 580, + "padded": 580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hashes": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "542453ad0f99dacf", + "hash_cont_tokens": "f7d801bfd913884d" + }, + "truncated": 0, + "non-truncated": 1512, + "padded": 1488, + "non-padded": 24, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-formal_logic|5": { + "hashes": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "dacff0458f665ef2", + "hash_cont_tokens": "23f9089575432d5a" + }, + "truncated": 0, + "non-truncated": 504, + "padded": 504, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-global_facts|5": { + "hashes": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "61dec75d557c2e93", + "hash_cont_tokens": "844bd0bf669e8136" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_biology|5": { + "hashes": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "d0afdf91820cacc8", + "hash_cont_tokens": "04b8293f2ab7fbbf" + }, + "truncated": 0, + "non-truncated": 1240, + "padded": 1240, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + 
"harness|hendrycksTest-high_school_chemistry|5": { + "hashes": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "75cd47b5490da17b", + "hash_cont_tokens": "c3deabee1deab3a3" + }, + "truncated": 0, + "non-truncated": 812, + "padded": 796, + "non-padded": 16, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hashes": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "6c6256000dbf914a", + "hash_cont_tokens": "844bd0bf669e8136" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hashes": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "3e24478a8854bd77", + "hash_cont_tokens": "c4f2565ca36881d5" + }, + "truncated": 660, + "non-truncated": 0, + "padded": 0, + "non-padded": 660, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_geography|5": { + "hashes": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "a4866b51f8a7a60e", + "hash_cont_tokens": "780e569058de22be" + }, + "truncated": 0, + "non-truncated": 792, + "padded": 792, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hashes": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "90f755f89d9fdf5e", + "hash_cont_tokens": "7994d94bfa36d003" + }, + "truncated": 0, + "non-truncated": 772, + "padded": 772, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hashes": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "fb590ff6d9d11883", + "hash_cont_tokens": "8f5c8baf02161f10" + }, + "truncated": 0, + "non-truncated": 1560, + "padded": 1560, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hashes": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "551dbc75535ad2b8", + "hash_cont_tokens": "a2c91752be5b1798" + }, + "truncated": 0, + "non-truncated": 1080, + "padded": 1080, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hashes": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "d86fdf5706ec717c", + "hash_cont_tokens": "985403b262df21a4" + }, + "truncated": 0, + "non-truncated": 952, + "padded": 940, + "non-padded": 12, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_physics|5": { + "hashes": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "a81bca26abd92c41", + "hash_cont_tokens": "db71da66ed82b921" + }, + "truncated": 0, + "non-truncated": 604, + "padded": 604, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hashes": { + 
"hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "9c10077b5cda495b", + "hash_cont_tokens": "e81cf9738ad7e157" + }, + "truncated": 0, + "non-truncated": 2180, + "padded": 2180, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hashes": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "092923836e135996", + "hash_cont_tokens": "4a2d5f00cb00d9b7" + }, + "truncated": 0, + "non-truncated": 864, + "padded": 864, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hashes": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "4ab213491f557f31", + "hash_cont_tokens": "eab825cf8fbdd085" + }, + "truncated": 816, + "non-truncated": 0, + "padded": 0, + "non-padded": 816, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hashes": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "2a04fb615e6717ea", + "hash_cont_tokens": "e9bcfaa6beefb456" + }, + "truncated": 0, + "non-truncated": 948, + "padded": 948, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_aging|5": { + "hashes": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "39da19ee58ce07e6", + "hash_cont_tokens": "38eafdb22e9fca11" + }, + "truncated": 0, + "non-truncated": 892, + "padded": 892, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_sexuality|5": { + "hashes": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "f7e0441ab1c223e0", + "hash_cont_tokens": "11de075f88fc7cd2" + }, + "truncated": 0, + "non-truncated": 524, + "padded": 524, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-international_law|5": { + "hashes": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "119859c5b8103d0b", + "hash_cont_tokens": "6f8215a3de7eebd1" + }, + "truncated": 0, + "non-truncated": 484, + "padded": 484, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-jurisprudence|5": { + "hashes": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "6ec4910e741606cb", + "hash_cont_tokens": "5c77c6f472688075" + }, + "truncated": 0, + "non-truncated": 432, + "padded": 432, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hashes": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "96d8b2554f777e3a", + "hash_cont_tokens": "25a46284b3589e0d" + }, + "truncated": 0, + "non-truncated": 652, + "padded": 636, + "non-padded": 16, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-machine_learning|5": { + "hashes": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "249811a7d891a411", + "hash_cont_tokens": 
"aacac708cd4c5a61" + }, + "truncated": 0, + "non-truncated": 448, + "padded": 448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-management|5": { + "hashes": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "e54df495ffeb4f92", + "hash_cont_tokens": "d37808f586a9e9b5" + }, + "truncated": 0, + "non-truncated": 412, + "padded": 412, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-marketing|5": { + "hashes": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "e9110fe64f420eb5", + "hash_cont_tokens": "95faf210efa02f90" + }, + "truncated": 0, + "non-truncated": 936, + "padded": 936, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-medical_genetics|5": { + "hashes": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "743df5701590c1c5", + "hash_cont_tokens": "844bd0bf669e8136" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-miscellaneous|5": { + "hashes": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "4a20a40ea36bad2d", + "hash_cont_tokens": "ef1ae838a09a7521" + }, + "truncated": 0, + "non-truncated": 3132, + "padded": 3132, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_disputes|5": { + "hashes": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "10886977e5516586", + "hash_cont_tokens": "16b6c6e390eb7cea" + }, + "truncated": 0, + "non-truncated": 1384, + "padded": 1372, + "non-padded": 12, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hashes": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "66f56ab7c3b9d662", + "hash_cont_tokens": "4130880a19c4edb0" + }, + "truncated": 0, + "non-truncated": 3580, + "padded": 3580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-nutrition|5": { + "hashes": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "c05c54560499ea35", + "hash_cont_tokens": "96b81f570a84328b" + }, + "truncated": 0, + "non-truncated": 1224, + "padded": 1224, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-philosophy|5": { + "hashes": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "9639c3d92ff98a28", + "hash_cont_tokens": "dddff9925c9b675a" + }, + "truncated": 0, + "non-truncated": 1244, + "padded": 1244, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-prehistory|5": { + "hashes": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "91e98834c3a8d8d9", + "hash_cont_tokens": "e3a7592f84b44888" + }, + "truncated": 0, + "non-truncated": 1296, + "padded": 1296, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + 
"harness|hendrycksTest-professional_accounting|5": { + "hashes": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "569fa47691c73088", + "hash_cont_tokens": "f9edf462e8201551" + }, + "truncated": 0, + "non-truncated": 1128, + "padded": 1124, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_law|5": { + "hashes": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "999e8c7cf55b590c", + "hash_cont_tokens": "a2de48df0afbaff7" + }, + "truncated": 16, + "non-truncated": 6120, + "padded": 6120, + "non-padded": 16, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_medicine|5": { + "hashes": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "cb68733b835e69f0", + "hash_cont_tokens": "ecf7754754c2bb76" + }, + "truncated": 0, + "non-truncated": 1088, + "padded": 1088, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_psychology|5": { + "hashes": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "3aa766c029099569", + "hash_cont_tokens": "30b07e31cf9b5c6f" + }, + "truncated": 0, + "non-truncated": 2448, + "padded": 2448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-public_relations|5": { + "hashes": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "87b924f88832986f", + "hash_cont_tokens": "cf3600a50782c6c5" + }, + "truncated": 0, + "non-truncated": 440, + "padded": 440, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-security_studies|5": { + "hashes": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "c2b75c24a925a416", + "hash_cont_tokens": "4d1dc7c4ad251829" + }, + "truncated": 0, + "non-truncated": 980, + "padded": 980, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-sociology|5": { + "hashes": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "fb555df6139eb2c8", + "hash_cont_tokens": "d36b9d9f0f4424fe" + }, + "truncated": 0, + "non-truncated": 804, + "padded": 800, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hashes": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "56cf1eebb25eccb1", + "hash_cont_tokens": "844bd0bf669e8136" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-virology|5": { + "hashes": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "c6affac16ec860be", + "hash_cont_tokens": "30d4fa4828c5468f" + }, + "truncated": 0, + "non-truncated": 664, + "padded": 664, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-world_religions|5": { + "hashes": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + 
"hash_input_tokens": "d2c5da5a69a6312e", + "hash_cont_tokens": "a0a7af55ac7ae037" + }, + "truncated": 0, + "non-truncated": 684, + "padded": 684, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|truthfulqa:mc|0": { + "hashes": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "21ee2f46c9c3649e", + "hash_cont_tokens": "84fd36aa004c8578" + }, + "truncated": 0, + "non-truncated": 9996, + "padded": 9996, + "non-padded": 0, + "effective_few_shots": 0.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "d84d18e9a963753d", + "hash_full_prompts": "12b540783521a8e6", + "hash_input_tokens": "0893dfcb83435e7d", + "hash_cont_tokens": "24012b7d40528568" + }, + "total_evaluation_time_secondes": "2765.3237206935883", + "truncated": 1492, + "non-truncated": 109527, + "padded": 109290, + "non-padded": 1729, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/Rardilit/Panther_v1/results_2023-08-04T12-58-09.538898.json b/eval-results/Rardilit/Panther_v1/results_2023-08-04T12-58-09.538898.json new file mode 100644 index 0000000000000000000000000000000000000000..33f73b57d66839a3259c52a574da62b5f59db8aa --- /dev/null +++ b/eval-results/Rardilit/Panther_v1/results_2023-08-04T12-58-09.538898.json @@ -0,0 +1,1365 @@ +{ + "results": { + "harness|arc:challenge|25": { + "acc": 0.22696245733788395, + "acc_stderr": 0.012240491536132861, + "acc_norm": 0.22696245733788395, + "acc_norm_stderr": 0.012240491536132861 + }, + "harness|hellaswag|10": { + "acc": 0.2504481179047998, + "acc_stderr": 0.004323856300539177, + "acc_norm": 0.2504481179047998, + "acc_norm_stderr": 0.004323856300539177 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.22, + "acc_stderr": 0.04163331998932268, + "acc_norm": 0.22, + "acc_norm_stderr": 0.04163331998932268 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.18518518518518517, + "acc_stderr": 0.03355677216313142, + "acc_norm": 0.18518518518518517, + "acc_norm_stderr": 0.03355677216313142 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.17763157894736842, + "acc_stderr": 0.031103182383123398, + "acc_norm": 0.17763157894736842, + "acc_norm_stderr": 0.031103182383123398 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.3, + "acc_stderr": 0.046056618647183814, + "acc_norm": 0.3, + "acc_norm_stderr": 0.046056618647183814 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.21509433962264152, + "acc_stderr": 0.02528839450289137, + "acc_norm": 0.21509433962264152, + "acc_norm_stderr": 0.02528839450289137 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.2569444444444444, + "acc_stderr": 0.03653946969442099, + "acc_norm": 0.2569444444444444, + "acc_norm_stderr": 0.03653946969442099 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.2, + "acc_stderr": 0.04020151261036845, + "acc_norm": 0.2, + "acc_norm_stderr": 0.04020151261036845 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.26, + "acc_stderr": 0.0440844002276808, + "acc_norm": 0.26, + "acc_norm_stderr": 0.0440844002276808 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.21, + "acc_stderr": 0.040936018074033256, + "acc_norm": 0.21, + "acc_norm_stderr": 0.040936018074033256 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.20809248554913296, + "acc_stderr": 0.030952890217749874, + "acc_norm": 0.20809248554913296, + 
"acc_norm_stderr": 0.030952890217749874 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.21568627450980393, + "acc_stderr": 0.04092563958237654, + "acc_norm": 0.21568627450980393, + "acc_norm_stderr": 0.04092563958237654 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.28, + "acc_stderr": 0.045126085985421276, + "acc_norm": 0.28, + "acc_norm_stderr": 0.045126085985421276 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.26382978723404255, + "acc_stderr": 0.028809989854102973, + "acc_norm": 0.26382978723404255, + "acc_norm_stderr": 0.028809989854102973 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.23684210526315788, + "acc_stderr": 0.039994238792813365, + "acc_norm": 0.23684210526315788, + "acc_norm_stderr": 0.039994238792813365 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.2413793103448276, + "acc_stderr": 0.03565998174135302, + "acc_norm": 0.2413793103448276, + "acc_norm_stderr": 0.03565998174135302 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.20899470899470898, + "acc_stderr": 0.02094048156533486, + "acc_norm": 0.20899470899470898, + "acc_norm_stderr": 0.02094048156533486 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.2857142857142857, + "acc_stderr": 0.04040610178208841, + "acc_norm": 0.2857142857142857, + "acc_norm_stderr": 0.04040610178208841 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.18, + "acc_stderr": 0.038612291966536934, + "acc_norm": 0.18, + "acc_norm_stderr": 0.038612291966536934 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.1774193548387097, + "acc_stderr": 0.02173254068932927, + "acc_norm": 0.1774193548387097, + "acc_norm_stderr": 0.02173254068932927 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.15270935960591134, + "acc_stderr": 0.02530890453938063, + "acc_norm": 0.15270935960591134, + "acc_norm_stderr": 0.02530890453938063 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.25, + "acc_stderr": 0.04351941398892446, + "acc_norm": 0.25, + "acc_norm_stderr": 0.04351941398892446 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.21818181818181817, + "acc_stderr": 0.03225078108306289, + "acc_norm": 0.21818181818181817, + "acc_norm_stderr": 0.03225078108306289 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.17676767676767677, + "acc_stderr": 0.027178752639044915, + "acc_norm": 0.17676767676767677, + "acc_norm_stderr": 0.027178752639044915 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.19689119170984457, + "acc_stderr": 0.028697873971860664, + "acc_norm": 0.19689119170984457, + "acc_norm_stderr": 0.028697873971860664 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.20256410256410257, + "acc_stderr": 0.020377660970371372, + "acc_norm": 0.20256410256410257, + "acc_norm_stderr": 0.020377660970371372 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.2111111111111111, + "acc_stderr": 0.024882116857655075, + "acc_norm": 0.2111111111111111, + "acc_norm_stderr": 0.024882116857655075 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.21008403361344538, + "acc_stderr": 0.026461398717471874, + "acc_norm": 0.21008403361344538, + "acc_norm_stderr": 0.026461398717471874 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.1986754966887417, + "acc_stderr": 0.03257847384436776, + "acc_norm": 0.1986754966887417, + "acc_norm_stderr": 
0.03257847384436776 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.1926605504587156, + "acc_stderr": 0.016909276884936094, + "acc_norm": 0.1926605504587156, + "acc_norm_stderr": 0.016909276884936094 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.1527777777777778, + "acc_stderr": 0.024536326026134224, + "acc_norm": 0.1527777777777778, + "acc_norm_stderr": 0.024536326026134224 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.25, + "acc_stderr": 0.03039153369274154, + "acc_norm": 0.25, + "acc_norm_stderr": 0.03039153369274154 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.270042194092827, + "acc_stderr": 0.028900721906293426, + "acc_norm": 0.270042194092827, + "acc_norm_stderr": 0.028900721906293426 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.31390134529147984, + "acc_stderr": 0.031146796482972465, + "acc_norm": 0.31390134529147984, + "acc_norm_stderr": 0.031146796482972465 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.2595419847328244, + "acc_stderr": 0.03844876139785271, + "acc_norm": 0.2595419847328244, + "acc_norm_stderr": 0.03844876139785271 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.2396694214876033, + "acc_stderr": 0.03896878985070417, + "acc_norm": 0.2396694214876033, + "acc_norm_stderr": 0.03896878985070417 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.25925925925925924, + "acc_stderr": 0.042365112580946336, + "acc_norm": 0.25925925925925924, + "acc_norm_stderr": 0.042365112580946336 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.22085889570552147, + "acc_stderr": 0.032591773927421776, + "acc_norm": 0.22085889570552147, + "acc_norm_stderr": 0.032591773927421776 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.3125, + "acc_stderr": 0.043994650575715215, + "acc_norm": 0.3125, + "acc_norm_stderr": 0.043994650575715215 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.17475728155339806, + "acc_stderr": 0.037601780060266224, + "acc_norm": 0.17475728155339806, + "acc_norm_stderr": 0.037601780060266224 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.2905982905982906, + "acc_stderr": 0.02974504857267404, + "acc_norm": 0.2905982905982906, + "acc_norm_stderr": 0.02974504857267404 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.3, + "acc_stderr": 0.046056618647183814, + "acc_norm": 0.3, + "acc_norm_stderr": 0.046056618647183814 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.23754789272030652, + "acc_stderr": 0.015218733046150193, + "acc_norm": 0.23754789272030652, + "acc_norm_stderr": 0.015218733046150193 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.24855491329479767, + "acc_stderr": 0.023267528432100174, + "acc_norm": 0.24855491329479767, + "acc_norm_stderr": 0.023267528432100174 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.23798882681564246, + "acc_stderr": 0.014242630070574915, + "acc_norm": 0.23798882681564246, + "acc_norm_stderr": 0.014242630070574915 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.22549019607843138, + "acc_stderr": 0.023929155517351284, + "acc_norm": 0.22549019607843138, + "acc_norm_stderr": 0.023929155517351284 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.1864951768488746, + "acc_stderr": 0.02212243977248077, + "acc_norm": 0.1864951768488746, + "acc_norm_stderr": 0.02212243977248077 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.21604938271604937, + "acc_stderr": 
0.022899162918445806, + "acc_norm": 0.21604938271604937, + "acc_norm_stderr": 0.022899162918445806 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.23404255319148937, + "acc_stderr": 0.025257861359432417, + "acc_norm": 0.23404255319148937, + "acc_norm_stderr": 0.025257861359432417 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.2457627118644068, + "acc_stderr": 0.010996156635142692, + "acc_norm": 0.2457627118644068, + "acc_norm_stderr": 0.010996156635142692 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.18382352941176472, + "acc_stderr": 0.023529242185193106, + "acc_norm": 0.18382352941176472, + "acc_norm_stderr": 0.023529242185193106 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.25, + "acc_stderr": 0.01751781884501444, + "acc_norm": 0.25, + "acc_norm_stderr": 0.01751781884501444 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.21818181818181817, + "acc_stderr": 0.03955932861795833, + "acc_norm": 0.21818181818181817, + "acc_norm_stderr": 0.03955932861795833 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.18775510204081633, + "acc_stderr": 0.02500025603954621, + "acc_norm": 0.18775510204081633, + "acc_norm_stderr": 0.02500025603954621 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.24378109452736318, + "acc_stderr": 0.03036049015401465, + "acc_norm": 0.24378109452736318, + "acc_norm_stderr": 0.03036049015401465 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.28, + "acc_stderr": 0.04512608598542128, + "acc_norm": 0.28, + "acc_norm_stderr": 0.04512608598542128 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.28313253012048195, + "acc_stderr": 0.03507295431370518, + "acc_norm": 0.28313253012048195, + "acc_norm_stderr": 0.03507295431370518 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.3216374269005848, + "acc_stderr": 0.03582529442573122, + "acc_norm": 0.3216374269005848, + "acc_norm_stderr": 0.03582529442573122 + }, + "harness|truthfulqa:mc|0": { + "mc1": 1.0, + "mc1_stderr": 0.0, + "mc2": NaN, + "mc2_stderr": NaN + }, + "all": { + "acc": 0.2314240573187148, + "acc_stderr": 0.03071122006512167, + "acc_norm": 0.2314240573187148, + "acc_norm_stderr": 0.03071122006512167, + "mc1": 1.0, + "mc1_stderr": 0.0, + "mc2": NaN, + "mc2_stderr": NaN + } + }, + "versions": { + "harness|arc:challenge|25": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + 
"harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "all": 0 + }, + "config_general": { + "model_name": "Rardilit/Panther_v1", + "model_sha": "", + "model_dtype": "torch.float16", + "lighteval_sha": "5f779c2b88600e81a25d5dd5a059c8902022e8fd", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null + }, + "config_tasks": { + "harness|arc:challenge": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + 
"harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task" + }, + "summary_tasks": { + "harness|arc:challenge|25": { + "hashes": { + "hash_examples": "17b0cae357c0259e", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "61571bf68d6d89aa", + "hash_cont_tokens": "ede2b335438f08e9" + }, + "truncated": 0, + "non-truncated": 4687, + "padded": 4687, + "non-padded": 0, + "effective_few_shots": 25.0, + "num_truncated_few_shots": 0 + }, + "harness|hellaswag|10": { + "hashes": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "29906669b1c7054a", + "hash_cont_tokens": "b41cf1ad182d68d5" + }, + "truncated": 0, + "non-truncated": 40168, + "padded": 40113, + "non-padded": 55, + "effective_few_shots": 10.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hashes": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "c54ff61ad0273dd7", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-anatomy|5": { + "hashes": { + 
"hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "be31a1e22aef5f90", + "hash_cont_tokens": "f11971a765cb609f" + }, + "truncated": 0, + "non-truncated": 540, + "padded": 540, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-astronomy|5": { + "hashes": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "277a7b1fad566940", + "hash_cont_tokens": "238bd86950544b29" + }, + "truncated": 0, + "non-truncated": 608, + "padded": 608, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-business_ethics|5": { + "hashes": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "ba552605bc116de5", + "hash_cont_tokens": "f9d6d2a7d7e9a041" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hashes": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "428c7563d0b98ab9", + "hash_cont_tokens": "6af58623d0d5fbcd" + }, + "truncated": 0, + "non-truncated": 1060, + "padded": 1060, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_biology|5": { + "hashes": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "da036601573942e2", + "hash_cont_tokens": "875cde3af7a0ee14" + }, + "truncated": 0, + "non-truncated": 576, + "padded": 576, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_chemistry|5": { + "hashes": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "94e0196d6aded13d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_computer_science|5": { + "hashes": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "6e4d0f4a8d36690b", + "hash_cont_tokens": "1ba0c71186b1505e" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_mathematics|5": { + "hashes": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "614054d17109a25d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_medicine|5": { + "hashes": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "1d633b3cc0524ba8", + "hash_cont_tokens": "702fb6d82ff0d6ac" + }, + "truncated": 0, + "non-truncated": 692, + "padded": 692, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_physics|5": { + "hashes": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "5421d9a1af86cbd4", + "hash_cont_tokens": 
"f7b8097afc16a47c" + }, + "truncated": 0, + "non-truncated": 408, + "padded": 408, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-computer_security|5": { + "hashes": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "5e6b70ecb333cf18", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hashes": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "c2ef11a87264ceed", + "hash_cont_tokens": "aa0e8bc655f2f641" + }, + "truncated": 0, + "non-truncated": 940, + "padded": 940, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-econometrics|5": { + "hashes": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "ecaccd912a4c3978", + "hash_cont_tokens": "a9b1f761089f6acc" + }, + "truncated": 0, + "non-truncated": 456, + "padded": 456, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hashes": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "1590c84291399be8", + "hash_cont_tokens": "2425a3f084a591ef" + }, + "truncated": 0, + "non-truncated": 580, + "padded": 580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hashes": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "3269597f715b0da1", + "hash_cont_tokens": "eb2d5002052b5bc5" + }, + "truncated": 0, + "non-truncated": 1512, + "padded": 1512, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-formal_logic|5": { + "hashes": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "a2800d20f3ab8d7c", + "hash_cont_tokens": "9b30dc19c9b62f60" + }, + "truncated": 0, + "non-truncated": 504, + "padded": 504, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-global_facts|5": { + "hashes": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "94ed44b3772505ad", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_biology|5": { + "hashes": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "24423acb928db768", + "hash_cont_tokens": "74217a4e2868536f" + }, + "truncated": 0, + "non-truncated": 1240, + "padded": 1240, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hashes": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "831ff35c474e5cef", + "hash_cont_tokens": "bf39544be0ebf000" + }, + "truncated": 0, + "non-truncated": 812, + "padded": 812, + "non-padded": 0, + "effective_few_shots": 5.0, + 
"num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hashes": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "8c34e0f2bda77358", + "hash_cont_tokens": "43570b3948564b64" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hashes": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "f1f73dd687da18d7", + "hash_cont_tokens": "674fc454bdc5ac93" + }, + "truncated": 660, + "non-truncated": 0, + "padded": 0, + "non-padded": 660, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_geography|5": { + "hashes": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "7c5547c7da5bc793", + "hash_cont_tokens": "03a5012b916274ea" + }, + "truncated": 0, + "non-truncated": 792, + "padded": 792, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hashes": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "f62991cb6a496b05", + "hash_cont_tokens": "50ab225c2f535210" + }, + "truncated": 0, + "non-truncated": 772, + "padded": 772, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hashes": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "4cef2aff6e3d59ed", + "hash_cont_tokens": "c583432ad27fcfe0" + }, + "truncated": 0, + "non-truncated": 1560, + "padded": 1560, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hashes": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "6e2577ea4082ed2b", + "hash_cont_tokens": "1194078d4e38c984" + }, + "truncated": 0, + "non-truncated": 1080, + "padded": 1080, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hashes": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "c5fc9aeb1079c8e4", + "hash_cont_tokens": "f47f041de50333b9" + }, + "truncated": 0, + "non-truncated": 952, + "padded": 952, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_physics|5": { + "hashes": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "555fc385cffa84ca", + "hash_cont_tokens": "6296151cf7fee15c" + }, + "truncated": 0, + "non-truncated": 604, + "padded": 604, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hashes": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "febd23cbf9973b7f", + "hash_cont_tokens": "a490d3db0ea5935a" + }, + "truncated": 0, + "non-truncated": 2180, + "padded": 2180, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + 
"harness|hendrycksTest-high_school_statistics|5": { + "hashes": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "424b02981230ee83", + "hash_cont_tokens": "6830ef7d0325d7ef" + }, + "truncated": 0, + "non-truncated": 864, + "padded": 864, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hashes": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "50c9ff438c85a69e", + "hash_cont_tokens": "cdd0b3dc06d933e5" + }, + "truncated": 816, + "non-truncated": 0, + "padded": 0, + "non-padded": 816, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hashes": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "054824cc474caef5", + "hash_cont_tokens": "e0203e3fc1bb0500" + }, + "truncated": 8, + "non-truncated": 940, + "padded": 940, + "non-padded": 8, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_aging|5": { + "hashes": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "541a75f071dcf579", + "hash_cont_tokens": "142a4a8a1138a214" + }, + "truncated": 0, + "non-truncated": 892, + "padded": 892, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_sexuality|5": { + "hashes": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "04269e5c5a257dd9", + "hash_cont_tokens": "bc54813e809b796d" + }, + "truncated": 0, + "non-truncated": 524, + "padded": 524, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-international_law|5": { + "hashes": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "d93ba9d9d38e4397", + "hash_cont_tokens": "63435df622d5437b" + }, + "truncated": 0, + "non-truncated": 484, + "padded": 484, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-jurisprudence|5": { + "hashes": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "9eeaccd2698b4f5a", + "hash_cont_tokens": "e3a8cd951b6e3469" + }, + "truncated": 0, + "non-truncated": 432, + "padded": 432, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hashes": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "b4f08f544f2b7576", + "hash_cont_tokens": "5e6ee2ff0404f23c" + }, + "truncated": 0, + "non-truncated": 652, + "padded": 648, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-machine_learning|5": { + "hashes": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "900c2a51f1174b9f", + "hash_cont_tokens": "c81919424db3b267" + }, + "truncated": 0, + "non-truncated": 448, + "padded": 448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-management|5": { + "hashes": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + 
"hash_input_tokens": "6b36efb4689c6eca", + "hash_cont_tokens": "a01d6d39a83c4597" + }, + "truncated": 0, + "non-truncated": 412, + "padded": 412, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-marketing|5": { + "hashes": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "2aaac78a0cfed47a", + "hash_cont_tokens": "6aeaed4d823c98aa" + }, + "truncated": 0, + "non-truncated": 936, + "padded": 936, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-medical_genetics|5": { + "hashes": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "886ca823b41c094a", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-miscellaneous|5": { + "hashes": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "72fd71de7675e7d0", + "hash_cont_tokens": "9b0ab02a64603081" + }, + "truncated": 0, + "non-truncated": 3132, + "padded": 3132, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_disputes|5": { + "hashes": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "f3ca0dd8e7a1eb09", + "hash_cont_tokens": "3b8bbe9108e55ce9" + }, + "truncated": 0, + "non-truncated": 1384, + "padded": 1354, + "non-padded": 30, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hashes": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "3e793631e951f23c", + "hash_cont_tokens": "2eae753a177d5460" + }, + "truncated": 0, + "non-truncated": 3580, + "padded": 3580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-nutrition|5": { + "hashes": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "59753c2144ea93af", + "hash_cont_tokens": "29771089bd3c65c6" + }, + "truncated": 0, + "non-truncated": 1224, + "padded": 1224, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-philosophy|5": { + "hashes": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "bd8d3dbed15a8c34", + "hash_cont_tokens": "9f6ff69d23a48783" + }, + "truncated": 0, + "non-truncated": 1244, + "padded": 1244, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-prehistory|5": { + "hashes": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "3573cd87facbb7c5", + "hash_cont_tokens": "a789a13af22308bf" + }, + "truncated": 0, + "non-truncated": 1296, + "padded": 1296, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_accounting|5": { + "hashes": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "17e721bc1a7cbb47", + "hash_cont_tokens": "5129a9cfb30c5239" + }, + "truncated": 0, + "non-truncated": 1128, + "padded": 1128, + "non-padded": 0, + 
"effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_law|5": { + "hashes": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "9178e10bd0763ec4", + "hash_cont_tokens": "2e590029ef41fbcd" + }, + "truncated": 604, + "non-truncated": 5532, + "padded": 5524, + "non-padded": 612, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_medicine|5": { + "hashes": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "f5a22012a54f70ea", + "hash_cont_tokens": "cd82e108370cece8" + }, + "truncated": 0, + "non-truncated": 1088, + "padded": 1088, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_psychology|5": { + "hashes": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "0dfb73a8eb3f692c", + "hash_cont_tokens": "61ef0c8a87f9c92d" + }, + "truncated": 0, + "non-truncated": 2448, + "padded": 2448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-public_relations|5": { + "hashes": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "1710c6ba4c9f3cbd", + "hash_cont_tokens": "568f585a259965c1" + }, + "truncated": 0, + "non-truncated": 440, + "padded": 440, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-security_studies|5": { + "hashes": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "d49711415961ced7", + "hash_cont_tokens": "d70cfe096d4fb7bd" + }, + "truncated": 0, + "non-truncated": 980, + "padded": 980, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-sociology|5": { + "hashes": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "828999f7624cbe7e", + "hash_cont_tokens": "c3a3bdfd177eed5b" + }, + "truncated": 0, + "non-truncated": 804, + "padded": 804, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hashes": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "42054621e718dbee", + "hash_cont_tokens": "2568d0e8e36fa959" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-virology|5": { + "hashes": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "6c4f0aa4dc859c04", + "hash_cont_tokens": "c178cccd753d9bc5" + }, + "truncated": 0, + "non-truncated": 664, + "padded": 664, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-world_religions|5": { + "hashes": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "6c75d44e092ff24f", + "hash_cont_tokens": "0a3a3ea5ef49d19c" + }, + "truncated": 0, + "non-truncated": 684, + "padded": 684, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|truthfulqa:mc|0": { + "hashes": { + "hash_examples": "23176c0531c7b867", + 
"hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "2738d7ed7075faa7", + "hash_cont_tokens": "6d1691881e252df0" + }, + "truncated": 0, + "non-truncated": 9996, + "padded": 9996, + "non-padded": 0, + "effective_few_shots": 0.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "d84d18e9a963753d", + "hash_full_prompts": "12b540783521a8e6", + "hash_input_tokens": "6fecf578c508db6a", + "hash_cont_tokens": "f4b7b7f3a2788768" + }, + "total_evaluation_time_secondes": "5146.653201818466", + "truncated": 2088, + "non-truncated": 108931, + "padded": 108834, + "non-padded": 2185, + "num_truncated_few_shots": 0 + } +} diff --git a/eval-results/Rardilit/Panther_v1/results_2023-08-12T09-09-59.978775.json b/eval-results/Rardilit/Panther_v1/results_2023-08-12T09-09-59.978775.json new file mode 100644 index 0000000000000000000000000000000000000000..50ec9de4d42b2d287e57ae1714758fdbde153cf8 --- /dev/null +++ b/eval-results/Rardilit/Panther_v1/results_2023-08-12T09-09-59.978775.json @@ -0,0 +1,1365 @@ +{ + "results": { + "harness|arc:challenge|25": { + "acc": 0.22696245733788395, + "acc_stderr": 0.012240491536132861, + "acc_norm": 0.22696245733788395, + "acc_norm_stderr": 0.012240491536132861 + }, + "harness|hellaswag|10": { + "acc": 0.2504481179047998, + "acc_stderr": 0.004323856300539177, + "acc_norm": 0.2504481179047998, + "acc_norm_stderr": 0.004323856300539177 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.22, + "acc_stderr": 0.04163331998932268, + "acc_norm": 0.22, + "acc_norm_stderr": 0.04163331998932268 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.18518518518518517, + "acc_stderr": 0.03355677216313142, + "acc_norm": 0.18518518518518517, + "acc_norm_stderr": 0.03355677216313142 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.17763157894736842, + "acc_stderr": 0.031103182383123398, + "acc_norm": 0.17763157894736842, + "acc_norm_stderr": 0.031103182383123398 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.3, + "acc_stderr": 0.046056618647183814, + "acc_norm": 0.3, + "acc_norm_stderr": 0.046056618647183814 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.21509433962264152, + "acc_stderr": 0.02528839450289137, + "acc_norm": 0.21509433962264152, + "acc_norm_stderr": 0.02528839450289137 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.2569444444444444, + "acc_stderr": 0.03653946969442099, + "acc_norm": 0.2569444444444444, + "acc_norm_stderr": 0.03653946969442099 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.2, + "acc_stderr": 0.04020151261036845, + "acc_norm": 0.2, + "acc_norm_stderr": 0.04020151261036845 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.26, + "acc_stderr": 0.0440844002276808, + "acc_norm": 0.26, + "acc_norm_stderr": 0.0440844002276808 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.21, + "acc_stderr": 0.040936018074033256, + "acc_norm": 0.21, + "acc_norm_stderr": 0.040936018074033256 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.20809248554913296, + "acc_stderr": 0.030952890217749874, + "acc_norm": 0.20809248554913296, + "acc_norm_stderr": 0.030952890217749874 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.21568627450980393, + "acc_stderr": 0.04092563958237654, + "acc_norm": 0.21568627450980393, + "acc_norm_stderr": 0.04092563958237654 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.28, + "acc_stderr": 0.045126085985421276, + "acc_norm": 0.28, 
+ "acc_norm_stderr": 0.045126085985421276 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.26382978723404255, + "acc_stderr": 0.028809989854102973, + "acc_norm": 0.26382978723404255, + "acc_norm_stderr": 0.028809989854102973 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.23684210526315788, + "acc_stderr": 0.039994238792813365, + "acc_norm": 0.23684210526315788, + "acc_norm_stderr": 0.039994238792813365 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.2413793103448276, + "acc_stderr": 0.03565998174135302, + "acc_norm": 0.2413793103448276, + "acc_norm_stderr": 0.03565998174135302 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.20899470899470898, + "acc_stderr": 0.02094048156533486, + "acc_norm": 0.20899470899470898, + "acc_norm_stderr": 0.02094048156533486 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.2857142857142857, + "acc_stderr": 0.04040610178208841, + "acc_norm": 0.2857142857142857, + "acc_norm_stderr": 0.04040610178208841 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.18, + "acc_stderr": 0.038612291966536934, + "acc_norm": 0.18, + "acc_norm_stderr": 0.038612291966536934 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.1774193548387097, + "acc_stderr": 0.02173254068932927, + "acc_norm": 0.1774193548387097, + "acc_norm_stderr": 0.02173254068932927 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.15270935960591134, + "acc_stderr": 0.02530890453938063, + "acc_norm": 0.15270935960591134, + "acc_norm_stderr": 0.02530890453938063 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.25, + "acc_stderr": 0.04351941398892446, + "acc_norm": 0.25, + "acc_norm_stderr": 0.04351941398892446 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.21818181818181817, + "acc_stderr": 0.03225078108306289, + "acc_norm": 0.21818181818181817, + "acc_norm_stderr": 0.03225078108306289 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.17676767676767677, + "acc_stderr": 0.027178752639044915, + "acc_norm": 0.17676767676767677, + "acc_norm_stderr": 0.027178752639044915 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.19689119170984457, + "acc_stderr": 0.028697873971860664, + "acc_norm": 0.19689119170984457, + "acc_norm_stderr": 0.028697873971860664 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.20256410256410257, + "acc_stderr": 0.020377660970371372, + "acc_norm": 0.20256410256410257, + "acc_norm_stderr": 0.020377660970371372 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.2111111111111111, + "acc_stderr": 0.024882116857655075, + "acc_norm": 0.2111111111111111, + "acc_norm_stderr": 0.024882116857655075 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.21008403361344538, + "acc_stderr": 0.026461398717471874, + "acc_norm": 0.21008403361344538, + "acc_norm_stderr": 0.026461398717471874 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.1986754966887417, + "acc_stderr": 0.03257847384436776, + "acc_norm": 0.1986754966887417, + "acc_norm_stderr": 0.03257847384436776 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.1926605504587156, + "acc_stderr": 0.016909276884936094, + "acc_norm": 0.1926605504587156, + "acc_norm_stderr": 0.016909276884936094 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.1527777777777778, + "acc_stderr": 0.024536326026134224, + "acc_norm": 
0.1527777777777778, + "acc_norm_stderr": 0.024536326026134224 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.25, + "acc_stderr": 0.03039153369274154, + "acc_norm": 0.25, + "acc_norm_stderr": 0.03039153369274154 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.270042194092827, + "acc_stderr": 0.028900721906293426, + "acc_norm": 0.270042194092827, + "acc_norm_stderr": 0.028900721906293426 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.31390134529147984, + "acc_stderr": 0.031146796482972465, + "acc_norm": 0.31390134529147984, + "acc_norm_stderr": 0.031146796482972465 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.2595419847328244, + "acc_stderr": 0.03844876139785271, + "acc_norm": 0.2595419847328244, + "acc_norm_stderr": 0.03844876139785271 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.2396694214876033, + "acc_stderr": 0.03896878985070417, + "acc_norm": 0.2396694214876033, + "acc_norm_stderr": 0.03896878985070417 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.25925925925925924, + "acc_stderr": 0.042365112580946336, + "acc_norm": 0.25925925925925924, + "acc_norm_stderr": 0.042365112580946336 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.22085889570552147, + "acc_stderr": 0.032591773927421776, + "acc_norm": 0.22085889570552147, + "acc_norm_stderr": 0.032591773927421776 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.3125, + "acc_stderr": 0.043994650575715215, + "acc_norm": 0.3125, + "acc_norm_stderr": 0.043994650575715215 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.17475728155339806, + "acc_stderr": 0.037601780060266224, + "acc_norm": 0.17475728155339806, + "acc_norm_stderr": 0.037601780060266224 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.2905982905982906, + "acc_stderr": 0.02974504857267404, + "acc_norm": 0.2905982905982906, + "acc_norm_stderr": 0.02974504857267404 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.3, + "acc_stderr": 0.046056618647183814, + "acc_norm": 0.3, + "acc_norm_stderr": 0.046056618647183814 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.23754789272030652, + "acc_stderr": 0.015218733046150193, + "acc_norm": 0.23754789272030652, + "acc_norm_stderr": 0.015218733046150193 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.24855491329479767, + "acc_stderr": 0.023267528432100174, + "acc_norm": 0.24855491329479767, + "acc_norm_stderr": 0.023267528432100174 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.23798882681564246, + "acc_stderr": 0.014242630070574915, + "acc_norm": 0.23798882681564246, + "acc_norm_stderr": 0.014242630070574915 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.22549019607843138, + "acc_stderr": 0.023929155517351284, + "acc_norm": 0.22549019607843138, + "acc_norm_stderr": 0.023929155517351284 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.1864951768488746, + "acc_stderr": 0.02212243977248077, + "acc_norm": 0.1864951768488746, + "acc_norm_stderr": 0.02212243977248077 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.21604938271604937, + "acc_stderr": 0.022899162918445806, + "acc_norm": 0.21604938271604937, + "acc_norm_stderr": 0.022899162918445806 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.23404255319148937, + "acc_stderr": 0.025257861359432417, + "acc_norm": 0.23404255319148937, + "acc_norm_stderr": 0.025257861359432417 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 
0.2457627118644068, + "acc_stderr": 0.010996156635142692, + "acc_norm": 0.2457627118644068, + "acc_norm_stderr": 0.010996156635142692 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.18382352941176472, + "acc_stderr": 0.023529242185193106, + "acc_norm": 0.18382352941176472, + "acc_norm_stderr": 0.023529242185193106 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.25, + "acc_stderr": 0.01751781884501444, + "acc_norm": 0.25, + "acc_norm_stderr": 0.01751781884501444 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.21818181818181817, + "acc_stderr": 0.03955932861795833, + "acc_norm": 0.21818181818181817, + "acc_norm_stderr": 0.03955932861795833 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.18775510204081633, + "acc_stderr": 0.02500025603954621, + "acc_norm": 0.18775510204081633, + "acc_norm_stderr": 0.02500025603954621 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.24378109452736318, + "acc_stderr": 0.03036049015401465, + "acc_norm": 0.24378109452736318, + "acc_norm_stderr": 0.03036049015401465 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.28, + "acc_stderr": 0.04512608598542128, + "acc_norm": 0.28, + "acc_norm_stderr": 0.04512608598542128 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.28313253012048195, + "acc_stderr": 0.03507295431370518, + "acc_norm": 0.28313253012048195, + "acc_norm_stderr": 0.03507295431370518 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.3216374269005848, + "acc_stderr": 0.03582529442573122, + "acc_norm": 0.3216374269005848, + "acc_norm_stderr": 0.03582529442573122 + }, + "harness|truthfulqa:mc|0": { + "mc1": 1.0, + "mc1_stderr": 0.0, + "mc2": NaN, + "mc2_stderr": NaN + }, + "all": { + "acc": 0.2314240573187148, + "acc_stderr": 0.03071122006512167, + "acc_norm": 0.2314240573187148, + "acc_norm_stderr": 0.03071122006512167, + "mc1": 1.0, + "mc1_stderr": 0.0, + "mc2": NaN, + "mc2_stderr": NaN + } + }, + "versions": { + "harness|arc:challenge|25": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + 
"harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "all": 0 + }, + "config_general": { + "model_name": "Rardilit/Panther_v1", + "model_sha": "c47493294aa5154feb72bcba31d7e99cbe02d4fa", + "model_dtype": "torch.float16", + "lighteval_sha": "efe93333f9f25e7d48cc67a6bf362e6d576f727b", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null + }, + "config_tasks": { + "harness|arc:challenge": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + 
"harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task" + }, + "summary_tasks": { + "harness|arc:challenge|25": { + "hashes": { + "hash_examples": "17b0cae357c0259e", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "61571bf68d6d89aa", + "hash_cont_tokens": "ede2b335438f08e9" + }, + "truncated": 0, + "non-truncated": 4687, + "padded": 4687, + "non-padded": 0, + "effective_few_shots": 25.0, + "num_truncated_few_shots": 0 + }, + "harness|hellaswag|10": { + "hashes": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "29906669b1c7054a", + "hash_cont_tokens": "b41cf1ad182d68d5" + }, + "truncated": 0, + "non-truncated": 40168, + "padded": 40113, + "non-padded": 55, + "effective_few_shots": 10.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hashes": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "c54ff61ad0273dd7", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-anatomy|5": { + "hashes": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "be31a1e22aef5f90", + "hash_cont_tokens": "f11971a765cb609f" + }, + "truncated": 0, + "non-truncated": 540, + "padded": 540, + "non-padded": 0, + "effective_few_shots": 5.0, + 
"num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-astronomy|5": { + "hashes": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "277a7b1fad566940", + "hash_cont_tokens": "238bd86950544b29" + }, + "truncated": 0, + "non-truncated": 608, + "padded": 608, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-business_ethics|5": { + "hashes": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "ba552605bc116de5", + "hash_cont_tokens": "f9d6d2a7d7e9a041" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hashes": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "428c7563d0b98ab9", + "hash_cont_tokens": "6af58623d0d5fbcd" + }, + "truncated": 0, + "non-truncated": 1060, + "padded": 1060, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_biology|5": { + "hashes": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "da036601573942e2", + "hash_cont_tokens": "875cde3af7a0ee14" + }, + "truncated": 0, + "non-truncated": 576, + "padded": 576, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_chemistry|5": { + "hashes": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "94e0196d6aded13d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_computer_science|5": { + "hashes": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "6e4d0f4a8d36690b", + "hash_cont_tokens": "1ba0c71186b1505e" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_mathematics|5": { + "hashes": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "614054d17109a25d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_medicine|5": { + "hashes": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "1d633b3cc0524ba8", + "hash_cont_tokens": "702fb6d82ff0d6ac" + }, + "truncated": 0, + "non-truncated": 692, + "padded": 692, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_physics|5": { + "hashes": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "5421d9a1af86cbd4", + "hash_cont_tokens": "f7b8097afc16a47c" + }, + "truncated": 0, + "non-truncated": 408, + "padded": 408, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-computer_security|5": { + "hashes": { + "hash_examples": "006451eedc0ededb", + 
"hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "5e6b70ecb333cf18", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hashes": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "c2ef11a87264ceed", + "hash_cont_tokens": "aa0e8bc655f2f641" + }, + "truncated": 0, + "non-truncated": 940, + "padded": 940, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-econometrics|5": { + "hashes": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "ecaccd912a4c3978", + "hash_cont_tokens": "a9b1f761089f6acc" + }, + "truncated": 0, + "non-truncated": 456, + "padded": 456, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hashes": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "1590c84291399be8", + "hash_cont_tokens": "2425a3f084a591ef" + }, + "truncated": 0, + "non-truncated": 580, + "padded": 580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hashes": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "3269597f715b0da1", + "hash_cont_tokens": "eb2d5002052b5bc5" + }, + "truncated": 0, + "non-truncated": 1512, + "padded": 1512, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-formal_logic|5": { + "hashes": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "a2800d20f3ab8d7c", + "hash_cont_tokens": "9b30dc19c9b62f60" + }, + "truncated": 0, + "non-truncated": 504, + "padded": 504, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-global_facts|5": { + "hashes": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "94ed44b3772505ad", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_biology|5": { + "hashes": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "24423acb928db768", + "hash_cont_tokens": "74217a4e2868536f" + }, + "truncated": 0, + "non-truncated": 1240, + "padded": 1240, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hashes": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "831ff35c474e5cef", + "hash_cont_tokens": "bf39544be0ebf000" + }, + "truncated": 0, + "non-truncated": 812, + "padded": 812, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hashes": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "8c34e0f2bda77358", + "hash_cont_tokens": "43570b3948564b64" + }, + 
"truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hashes": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "f1f73dd687da18d7", + "hash_cont_tokens": "674fc454bdc5ac93" + }, + "truncated": 660, + "non-truncated": 0, + "padded": 0, + "non-padded": 660, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_geography|5": { + "hashes": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "7c5547c7da5bc793", + "hash_cont_tokens": "03a5012b916274ea" + }, + "truncated": 0, + "non-truncated": 792, + "padded": 792, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hashes": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "f62991cb6a496b05", + "hash_cont_tokens": "50ab225c2f535210" + }, + "truncated": 0, + "non-truncated": 772, + "padded": 772, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hashes": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "4cef2aff6e3d59ed", + "hash_cont_tokens": "c583432ad27fcfe0" + }, + "truncated": 0, + "non-truncated": 1560, + "padded": 1560, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hashes": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "6e2577ea4082ed2b", + "hash_cont_tokens": "1194078d4e38c984" + }, + "truncated": 0, + "non-truncated": 1080, + "padded": 1080, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hashes": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "c5fc9aeb1079c8e4", + "hash_cont_tokens": "f47f041de50333b9" + }, + "truncated": 0, + "non-truncated": 952, + "padded": 952, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_physics|5": { + "hashes": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "555fc385cffa84ca", + "hash_cont_tokens": "6296151cf7fee15c" + }, + "truncated": 0, + "non-truncated": 604, + "padded": 604, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hashes": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "febd23cbf9973b7f", + "hash_cont_tokens": "a490d3db0ea5935a" + }, + "truncated": 0, + "non-truncated": 2180, + "padded": 2180, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hashes": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "424b02981230ee83", + "hash_cont_tokens": "6830ef7d0325d7ef" + }, + "truncated": 0, + "non-truncated": 864, + "padded": 864, + "non-padded": 0, + 
"effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hashes": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "50c9ff438c85a69e", + "hash_cont_tokens": "cdd0b3dc06d933e5" + }, + "truncated": 816, + "non-truncated": 0, + "padded": 0, + "non-padded": 816, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hashes": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "054824cc474caef5", + "hash_cont_tokens": "e0203e3fc1bb0500" + }, + "truncated": 8, + "non-truncated": 940, + "padded": 940, + "non-padded": 8, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_aging|5": { + "hashes": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "541a75f071dcf579", + "hash_cont_tokens": "142a4a8a1138a214" + }, + "truncated": 0, + "non-truncated": 892, + "padded": 892, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_sexuality|5": { + "hashes": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "04269e5c5a257dd9", + "hash_cont_tokens": "bc54813e809b796d" + }, + "truncated": 0, + "non-truncated": 524, + "padded": 524, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-international_law|5": { + "hashes": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "d93ba9d9d38e4397", + "hash_cont_tokens": "63435df622d5437b" + }, + "truncated": 0, + "non-truncated": 484, + "padded": 484, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-jurisprudence|5": { + "hashes": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "9eeaccd2698b4f5a", + "hash_cont_tokens": "e3a8cd951b6e3469" + }, + "truncated": 0, + "non-truncated": 432, + "padded": 432, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hashes": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "b4f08f544f2b7576", + "hash_cont_tokens": "5e6ee2ff0404f23c" + }, + "truncated": 0, + "non-truncated": 652, + "padded": 648, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-machine_learning|5": { + "hashes": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "900c2a51f1174b9f", + "hash_cont_tokens": "c81919424db3b267" + }, + "truncated": 0, + "non-truncated": 448, + "padded": 448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-management|5": { + "hashes": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "6b36efb4689c6eca", + "hash_cont_tokens": "a01d6d39a83c4597" + }, + "truncated": 0, + "non-truncated": 412, + "padded": 412, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-marketing|5": { + "hashes": { + "hash_examples": "2668953431f91e96", + 
"hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "2aaac78a0cfed47a", + "hash_cont_tokens": "6aeaed4d823c98aa" + }, + "truncated": 0, + "non-truncated": 936, + "padded": 936, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-medical_genetics|5": { + "hashes": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "886ca823b41c094a", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-miscellaneous|5": { + "hashes": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "72fd71de7675e7d0", + "hash_cont_tokens": "9b0ab02a64603081" + }, + "truncated": 0, + "non-truncated": 3132, + "padded": 3132, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_disputes|5": { + "hashes": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "f3ca0dd8e7a1eb09", + "hash_cont_tokens": "3b8bbe9108e55ce9" + }, + "truncated": 0, + "non-truncated": 1384, + "padded": 1354, + "non-padded": 30, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hashes": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "3e793631e951f23c", + "hash_cont_tokens": "2eae753a177d5460" + }, + "truncated": 0, + "non-truncated": 3580, + "padded": 3580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-nutrition|5": { + "hashes": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "59753c2144ea93af", + "hash_cont_tokens": "29771089bd3c65c6" + }, + "truncated": 0, + "non-truncated": 1224, + "padded": 1224, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-philosophy|5": { + "hashes": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "bd8d3dbed15a8c34", + "hash_cont_tokens": "9f6ff69d23a48783" + }, + "truncated": 0, + "non-truncated": 1244, + "padded": 1244, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-prehistory|5": { + "hashes": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "3573cd87facbb7c5", + "hash_cont_tokens": "a789a13af22308bf" + }, + "truncated": 0, + "non-truncated": 1296, + "padded": 1296, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_accounting|5": { + "hashes": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "17e721bc1a7cbb47", + "hash_cont_tokens": "5129a9cfb30c5239" + }, + "truncated": 0, + "non-truncated": 1128, + "padded": 1128, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_law|5": { + "hashes": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "9178e10bd0763ec4", + "hash_cont_tokens": "2e590029ef41fbcd" + }, + "truncated": 604, + 
"non-truncated": 5532, + "padded": 5524, + "non-padded": 612, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_medicine|5": { + "hashes": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "f5a22012a54f70ea", + "hash_cont_tokens": "cd82e108370cece8" + }, + "truncated": 0, + "non-truncated": 1088, + "padded": 1088, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_psychology|5": { + "hashes": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "0dfb73a8eb3f692c", + "hash_cont_tokens": "61ef0c8a87f9c92d" + }, + "truncated": 0, + "non-truncated": 2448, + "padded": 2448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-public_relations|5": { + "hashes": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "1710c6ba4c9f3cbd", + "hash_cont_tokens": "568f585a259965c1" + }, + "truncated": 0, + "non-truncated": 440, + "padded": 440, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-security_studies|5": { + "hashes": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "d49711415961ced7", + "hash_cont_tokens": "d70cfe096d4fb7bd" + }, + "truncated": 0, + "non-truncated": 980, + "padded": 980, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-sociology|5": { + "hashes": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "828999f7624cbe7e", + "hash_cont_tokens": "c3a3bdfd177eed5b" + }, + "truncated": 0, + "non-truncated": 804, + "padded": 804, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hashes": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "42054621e718dbee", + "hash_cont_tokens": "2568d0e8e36fa959" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-virology|5": { + "hashes": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "6c4f0aa4dc859c04", + "hash_cont_tokens": "c178cccd753d9bc5" + }, + "truncated": 0, + "non-truncated": 664, + "padded": 664, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-world_religions|5": { + "hashes": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "6c75d44e092ff24f", + "hash_cont_tokens": "0a3a3ea5ef49d19c" + }, + "truncated": 0, + "non-truncated": 684, + "padded": 684, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|truthfulqa:mc|0": { + "hashes": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "2738d7ed7075faa7", + "hash_cont_tokens": "6d1691881e252df0" + }, + "truncated": 0, + "non-truncated": 9996, + "padded": 9996, + "non-padded": 0, + "effective_few_shots": 0.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + 
"hash_examples": "d84d18e9a963753d", + "hash_full_prompts": "12b540783521a8e6", + "hash_input_tokens": "6fecf578c508db6a", + "hash_cont_tokens": "f4b7b7f3a2788768" + }, + "total_evaluation_time_secondes": "4702.244509458542", + "truncated": 2088, + "non-truncated": 108931, + "padded": 108834, + "non-padded": 2185, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/Rardilit/Panther_v1/results_2023-09-17T07-57-01.737780.json b/eval-results/Rardilit/Panther_v1/results_2023-09-17T07-57-01.737780.json new file mode 100644 index 0000000000000000000000000000000000000000..aaf71ba73550ba63a052585156e4b6f4d191ab90 --- /dev/null +++ b/eval-results/Rardilit/Panther_v1/results_2023-09-17T07-57-01.737780.json @@ -0,0 +1,107 @@ +{ + "config_general": { + "model_name": "Rardilit/Panther_v1", + "model_sha": "c47493294aa5154feb72bcba31d7e99cbe02d4fa", + "model_size": "12.58 GB", + "model_dtype": "torch.float16", + "lighteval_sha": "0f318ecf002208468154899217b3ba7c6ae09374", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|drop|3": { + "em": 0.0, + "em_stderr": 0.0, + "f1": 0.0, + "f1_stderr": 0.0 + }, + "harness|gsm8k|5": { + "acc": 0.0, + "acc_stderr": 0.0 + }, + "harness|winogrande|5": { + "acc": 0.4956590370955012, + "acc_stderr": 0.014051956064076911 + }, + "all": { + "em": 0.0, + "em_stderr": 0.0, + "f1": 0.0, + "f1_stderr": 0.0, + "acc": 0.2478295185477506, + "acc_stderr": 0.007025978032038456 + } + }, + "versions": { + "harness|drop|3": 1, + "harness|gsm8k|5": 0, + "harness|winogrande|5": 0, + "all": 0 + }, + "config_tasks": { + "harness|drop": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|drop|3": { + "hashes": { + "hash_examples": "1d27416e8324e9a3", + "hash_full_prompts": "a5513ff9a741b385", + "hash_input_tokens": "61b608e0b5ceed76", + "hash_cont_tokens": "d62a3b26770557a9" + }, + "truncated": 1263, + "non-truncated": 8273, + "padded": 0, + "non-padded": 9536, + "effective_few_shots": 3.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "bda342e47b5099b2", + "hash_cont_tokens": "8401e6188d643544" + }, + "truncated": 0, + "non-truncated": 1319, + "padded": 0, + "non-padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "c0bedf98cb040854", + "hash_cont_tokens": "f08975ad6f2d5864" + }, + "truncated": 0, + "non-truncated": 2534, + "padded": 2432, + "non-padded": 102, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "9b4d8993161e637d", + "hash_full_prompts": "08215e527b7e60a5", + "hash_input_tokens": "80afe720f936f8d2", + "hash_cont_tokens": "f150732b0323f26d" + }, + "total_evaluation_time_secondes": "58607.05862283707", + "truncated": 1263, + "non-truncated": 12126, + "padded": 2432, + "non-padded": 10957, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/RobbeD/OpenLlama-Platypus-3B/results_2023-08-29T10-12-53.419020.json b/eval-results/RobbeD/OpenLlama-Platypus-3B/results_2023-08-29T10-12-53.419020.json new file mode 100644 index 
0000000000000000000000000000000000000000..540fd94aa207ee80695827ed9eff959f7327db60 --- /dev/null +++ b/eval-results/RobbeD/OpenLlama-Platypus-3B/results_2023-08-29T10-12-53.419020.json @@ -0,0 +1,1366 @@ +{ + "config_general": { + "model_name": "RobbeD/OpenLlama-Platypus-3B", + "model_sha": "d3a0bf8e1181be02cc9c4c4cdfedaedacaefbfac", + "model_dtype": "torch.float16", + "lighteval_sha": "50335bc85f38603883267af91727d6de99f3ee04", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|arc:challenge|25": { + "acc": 0.39334470989761094, + "acc_stderr": 0.014275101465693024, + "acc_norm": 0.4121160409556314, + "acc_norm_stderr": 0.0143839153022254 + }, + "harness|hellaswag|10": { + "acc": 0.5363473411670981, + "acc_stderr": 0.004976579655169282, + "acc_norm": 0.7166899024098785, + "acc_norm_stderr": 0.0044968477732506415 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.21, + "acc_stderr": 0.040936018074033256, + "acc_norm": 0.21, + "acc_norm_stderr": 0.040936018074033256 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.2814814814814815, + "acc_stderr": 0.03885004245800254, + "acc_norm": 0.2814814814814815, + "acc_norm_stderr": 0.03885004245800254 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.3223684210526316, + "acc_stderr": 0.03803510248351585, + "acc_norm": 0.3223684210526316, + "acc_norm_stderr": 0.03803510248351585 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.34, + "acc_stderr": 0.04760952285695236, + "acc_norm": 0.34, + "acc_norm_stderr": 0.04760952285695236 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.3018867924528302, + "acc_stderr": 0.02825420034443865, + "acc_norm": 0.3018867924528302, + "acc_norm_stderr": 0.02825420034443865 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.2986111111111111, + "acc_stderr": 0.03827052357950756, + "acc_norm": 0.2986111111111111, + "acc_norm_stderr": 0.03827052357950756 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.26, + "acc_stderr": 0.0440844002276808, + "acc_norm": 0.26, + "acc_norm_stderr": 0.0440844002276808 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.33, + "acc_stderr": 0.047258156262526045, + "acc_norm": 0.33, + "acc_norm_stderr": 0.047258156262526045 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.34, + "acc_stderr": 0.047609522856952344, + "acc_norm": 0.34, + "acc_norm_stderr": 0.047609522856952344 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.27167630057803466, + "acc_stderr": 0.03391750322321658, + "acc_norm": 0.27167630057803466, + "acc_norm_stderr": 0.03391750322321658 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.2647058823529412, + "acc_stderr": 0.04389869956808778, + "acc_norm": 0.2647058823529412, + "acc_norm_stderr": 0.04389869956808778 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.34, + "acc_stderr": 0.04760952285695235, + "acc_norm": 0.34, + "acc_norm_stderr": 0.04760952285695235 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.2765957446808511, + "acc_stderr": 0.029241883869628827, + "acc_norm": 0.2765957446808511, + "acc_norm_stderr": 0.029241883869628827 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.24561403508771928, + "acc_stderr": 0.04049339297748142, + "acc_norm": 0.24561403508771928, + "acc_norm_stderr": 0.04049339297748142 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 
0.23448275862068965, + "acc_stderr": 0.035306258743465914, + "acc_norm": 0.23448275862068965, + "acc_norm_stderr": 0.035306258743465914 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.2804232804232804, + "acc_stderr": 0.02313528797432565, + "acc_norm": 0.2804232804232804, + "acc_norm_stderr": 0.02313528797432565 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.2619047619047619, + "acc_stderr": 0.03932537680392871, + "acc_norm": 0.2619047619047619, + "acc_norm_stderr": 0.03932537680392871 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.3, + "acc_stderr": 0.046056618647183814, + "acc_norm": 0.3, + "acc_norm_stderr": 0.046056618647183814 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.25483870967741934, + "acc_stderr": 0.024790118459332208, + "acc_norm": 0.25483870967741934, + "acc_norm_stderr": 0.024790118459332208 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.23645320197044334, + "acc_stderr": 0.029896114291733545, + "acc_norm": 0.23645320197044334, + "acc_norm_stderr": 0.029896114291733545 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.28, + "acc_stderr": 0.045126085985421276, + "acc_norm": 0.28, + "acc_norm_stderr": 0.045126085985421276 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.38181818181818183, + "acc_stderr": 0.03793713171165634, + "acc_norm": 0.38181818181818183, + "acc_norm_stderr": 0.03793713171165634 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.21717171717171718, + "acc_stderr": 0.029376616484945633, + "acc_norm": 0.21717171717171718, + "acc_norm_stderr": 0.029376616484945633 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.31088082901554404, + "acc_stderr": 0.03340361906276585, + "acc_norm": 0.31088082901554404, + "acc_norm_stderr": 0.03340361906276585 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.3, + "acc_stderr": 0.02323458108842849, + "acc_norm": 0.3, + "acc_norm_stderr": 0.02323458108842849 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.2777777777777778, + "acc_stderr": 0.027309140588230175, + "acc_norm": 0.2777777777777778, + "acc_norm_stderr": 0.027309140588230175 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.29411764705882354, + "acc_stderr": 0.029597329730978075, + "acc_norm": 0.29411764705882354, + "acc_norm_stderr": 0.029597329730978075 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.33112582781456956, + "acc_stderr": 0.038425817186598696, + "acc_norm": 0.33112582781456956, + "acc_norm_stderr": 0.038425817186598696 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.29357798165137616, + "acc_stderr": 0.01952515112263966, + "acc_norm": 0.29357798165137616, + "acc_norm_stderr": 0.01952515112263966 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.3148148148148148, + "acc_stderr": 0.03167468706828977, + "acc_norm": 0.3148148148148148, + "acc_norm_stderr": 0.03167468706828977 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.3137254901960784, + "acc_stderr": 0.03256685484460388, + "acc_norm": 0.3137254901960784, + "acc_norm_stderr": 0.03256685484460388 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.3670886075949367, + "acc_stderr": 0.03137624072561619, + "acc_norm": 0.3670886075949367, + "acc_norm_stderr": 0.03137624072561619 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 
0.37668161434977576, + "acc_stderr": 0.032521134899291884, + "acc_norm": 0.37668161434977576, + "acc_norm_stderr": 0.032521134899291884 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.32061068702290074, + "acc_stderr": 0.040933292298342784, + "acc_norm": 0.32061068702290074, + "acc_norm_stderr": 0.040933292298342784 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.35537190082644626, + "acc_stderr": 0.04369236326573981, + "acc_norm": 0.35537190082644626, + "acc_norm_stderr": 0.04369236326573981 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.28703703703703703, + "acc_stderr": 0.043733130409147614, + "acc_norm": 0.28703703703703703, + "acc_norm_stderr": 0.043733130409147614 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.3067484662576687, + "acc_stderr": 0.036230899157241474, + "acc_norm": 0.3067484662576687, + "acc_norm_stderr": 0.036230899157241474 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.29464285714285715, + "acc_stderr": 0.0432704093257873, + "acc_norm": 0.29464285714285715, + "acc_norm_stderr": 0.0432704093257873 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.21359223300970873, + "acc_stderr": 0.04058042015646036, + "acc_norm": 0.21359223300970873, + "acc_norm_stderr": 0.04058042015646036 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.3333333333333333, + "acc_stderr": 0.030882736974138646, + "acc_norm": 0.3333333333333333, + "acc_norm_stderr": 0.030882736974138646 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.33, + "acc_stderr": 0.047258156262526045, + "acc_norm": 0.33, + "acc_norm_stderr": 0.047258156262526045 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.3231162196679438, + "acc_stderr": 0.016723726512343048, + "acc_norm": 0.3231162196679438, + "acc_norm_stderr": 0.016723726512343048 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.29190751445086704, + "acc_stderr": 0.02447699407624732, + "acc_norm": 0.29190751445086704, + "acc_norm_stderr": 0.02447699407624732 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.25139664804469275, + "acc_stderr": 0.014508979453553969, + "acc_norm": 0.25139664804469275, + "acc_norm_stderr": 0.014508979453553969 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.28104575163398693, + "acc_stderr": 0.02573885479781873, + "acc_norm": 0.28104575163398693, + "acc_norm_stderr": 0.02573885479781873 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.3279742765273312, + "acc_stderr": 0.02666441088693761, + "acc_norm": 0.3279742765273312, + "acc_norm_stderr": 0.02666441088693761 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.2993827160493827, + "acc_stderr": 0.02548311560119547, + "acc_norm": 0.2993827160493827, + "acc_norm_stderr": 0.02548311560119547 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.28368794326241137, + "acc_stderr": 0.026891709428343954, + "acc_norm": 0.28368794326241137, + "acc_norm_stderr": 0.026891709428343954 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.3044328552803129, + "acc_stderr": 0.011752877592597568, + "acc_norm": 0.3044328552803129, + "acc_norm_stderr": 0.011752877592597568 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.36764705882352944, + "acc_stderr": 0.029289413409403192, + "acc_norm": 0.36764705882352944, + "acc_norm_stderr": 0.029289413409403192 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.27941176470588236, + "acc_stderr": 0.018152871051538812, + 
"acc_norm": 0.27941176470588236, + "acc_norm_stderr": 0.018152871051538812 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.2818181818181818, + "acc_stderr": 0.04309118709946458, + "acc_norm": 0.2818181818181818, + "acc_norm_stderr": 0.04309118709946458 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.20816326530612245, + "acc_stderr": 0.025991117672813292, + "acc_norm": 0.20816326530612245, + "acc_norm_stderr": 0.025991117672813292 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.3333333333333333, + "acc_stderr": 0.03333333333333334, + "acc_norm": 0.3333333333333333, + "acc_norm_stderr": 0.03333333333333334 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.34, + "acc_stderr": 0.04760952285695236, + "acc_norm": 0.34, + "acc_norm_stderr": 0.04760952285695236 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.3192771084337349, + "acc_stderr": 0.036293353299478595, + "acc_norm": 0.3192771084337349, + "acc_norm_stderr": 0.036293353299478595 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.3742690058479532, + "acc_stderr": 0.03711601185389481, + "acc_norm": 0.3742690058479532, + "acc_norm_stderr": 0.03711601185389481 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.2215422276621787, + "mc1_stderr": 0.014537867601301139, + "mc2": 0.36454492667971283, + "mc2_stderr": 0.013858874132673827 + }, + "all": { + "acc": 0.30419856236505294, + "acc_stderr": 0.03331530889753519, + "acc_norm": 0.3075733740989648, + "acc_norm_stderr": 0.03330902215100321, + "mc1": 0.2215422276621787, + "mc1_stderr": 0.014537867601301139, + "mc2": 0.36454492667971283, + "mc2_stderr": 0.013858874132673827 + } + }, + "versions": { + "harness|arc:challenge|25": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + 
"harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "all": 0 + }, + "config_tasks": { + "harness|arc:challenge": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness 
task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task" + }, + "summary_tasks": { + "harness|arc:challenge|25": { + "hashes": { + "hash_examples": "17b0cae357c0259e", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "7cefb32e2563a8e3", + "hash_cont_tokens": "69111ccf8c982ca3" + }, + "truncated": 0, + "non-truncated": 4687, + "padded": 4687, + "non-padded": 0, + "effective_few_shots": 25.0, + "num_truncated_few_shots": 0 + }, + "harness|hellaswag|10": { + "hashes": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "e4a72fc2bbea66ff", + "hash_cont_tokens": "95e9e7b994fc9459" + }, + "truncated": 0, + "non-truncated": 40168, + "padded": 40144, + "non-padded": 24, + "effective_few_shots": 10.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hashes": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "1430bf2cb1d054e2", + "hash_cont_tokens": "ce26aac83e938006" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-anatomy|5": { + "hashes": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "c4f45f8ebf944893", + "hash_cont_tokens": "1d81fa80e3039a08" + }, + "truncated": 0, + "non-truncated": 540, + "padded": 540, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-astronomy|5": { + "hashes": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "7b6c0659a104d6af", + "hash_cont_tokens": "66af3c333e2e33b4" + }, + "truncated": 0, + "non-truncated": 608, + "padded": 608, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-business_ethics|5": { + "hashes": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "ca33ffee63980ac1", + "hash_cont_tokens": "aaaffbddbbdeecf6" + }, + "truncated": 0, + "non-truncated": 400, + 
"padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hashes": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "a6aba95384c46b37", + "hash_cont_tokens": "26e3b69d5fb27bb2" + }, + "truncated": 0, + "non-truncated": 1060, + "padded": 1060, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_biology|5": { + "hashes": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "95d92a1a2c158e2c", + "hash_cont_tokens": "439194ce25a22be1" + }, + "truncated": 0, + "non-truncated": 576, + "padded": 576, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_chemistry|5": { + "hashes": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "70284e3c06933186", + "hash_cont_tokens": "61d2a6a419b64891" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_computer_science|5": { + "hashes": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "028608b4301fcfd2", + "hash_cont_tokens": "c6e8af4875843f62" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_mathematics|5": { + "hashes": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "02619f96ae20cf1e", + "hash_cont_tokens": "16dc0a68339e577b" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_medicine|5": { + "hashes": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "0282a73e02cf4b34", + "hash_cont_tokens": "0002f8908e2c5604" + }, + "truncated": 0, + "non-truncated": 692, + "padded": 692, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_physics|5": { + "hashes": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "5d0425cf2abddd51", + "hash_cont_tokens": "e76629783418737c" + }, + "truncated": 0, + "non-truncated": 408, + "padded": 408, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-computer_security|5": { + "hashes": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "560574f683641143", + "hash_cont_tokens": "ce26aac83e938006" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hashes": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "dc3987c35bc329e5", + "hash_cont_tokens": "29089b8b7020611e" + }, + "truncated": 0, + "non-truncated": 940, + "padded": 940, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-econometrics|5": { 
+ "hashes": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "be83fdd674b48356", + "hash_cont_tokens": "1a48dc73e5858180" + }, + "truncated": 0, + "non-truncated": 456, + "padded": 456, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hashes": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "00155bf1a1a1ebc7", + "hash_cont_tokens": "70817a7ac9f44af2" + }, + "truncated": 0, + "non-truncated": 580, + "padded": 580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hashes": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "ce05b52b00498cf6", + "hash_cont_tokens": "5f0fe4a20633fc93" + }, + "truncated": 0, + "non-truncated": 1512, + "padded": 1512, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-formal_logic|5": { + "hashes": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "728bd41242158358", + "hash_cont_tokens": "f6e9cfb72237b427" + }, + "truncated": 0, + "non-truncated": 504, + "padded": 504, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-global_facts|5": { + "hashes": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "190511206bf21530", + "hash_cont_tokens": "ce26aac83e938006" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_biology|5": { + "hashes": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "2bc219567947ac68", + "hash_cont_tokens": "b433f62158dd2580" + }, + "truncated": 0, + "non-truncated": 1240, + "padded": 1240, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hashes": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "8477b93b8643d23f", + "hash_cont_tokens": "684af197bf78c021" + }, + "truncated": 0, + "non-truncated": 812, + "padded": 812, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hashes": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "0e15ea7b43890b3c", + "hash_cont_tokens": "54a0f1c97373f6fc" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hashes": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "142b719c7d7d4fe0", + "hash_cont_tokens": "91dc522e4e4e91c3" + }, + "truncated": 660, + "non-truncated": 0, + "padded": 0, + "non-padded": 660, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_geography|5": { + "hashes": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + 
"hash_input_tokens": "4bf76efe7796945e", + "hash_cont_tokens": "f275c901b3d285f9" + }, + "truncated": 0, + "non-truncated": 792, + "padded": 792, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hashes": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "e3a453e5fb044f52", + "hash_cont_tokens": "0bd598173199fc25" + }, + "truncated": 0, + "non-truncated": 772, + "padded": 772, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hashes": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "f47a1c2b0c018aff", + "hash_cont_tokens": "39a93706184f896b" + }, + "truncated": 0, + "non-truncated": 1560, + "padded": 1560, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hashes": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "35bc9ee85a563c15", + "hash_cont_tokens": "f0399631229c4bbe" + }, + "truncated": 0, + "non-truncated": 1080, + "padded": 1080, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hashes": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "62a083d4ceb83864", + "hash_cont_tokens": "28c1f7c11bf85409" + }, + "truncated": 0, + "non-truncated": 952, + "padded": 952, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_physics|5": { + "hashes": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "cd96d409604783e4", + "hash_cont_tokens": "8c47901880333cb3" + }, + "truncated": 0, + "non-truncated": 604, + "padded": 604, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hashes": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "3c716ffc27f83e15", + "hash_cont_tokens": "f249c949ec94fca0" + }, + "truncated": 0, + "non-truncated": 2180, + "padded": 2180, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hashes": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "fd8217f7edf722f8", + "hash_cont_tokens": "ddd1c111a92fc7bb" + }, + "truncated": 0, + "non-truncated": 864, + "padded": 864, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hashes": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "a54112084a848a44", + "hash_cont_tokens": "2529d55ec490f81f" + }, + "truncated": 816, + "non-truncated": 0, + "padded": 0, + "non-padded": 816, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hashes": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "89cf33fb840f27be", + "hash_cont_tokens": 
"b34590804e071493" + }, + "truncated": 0, + "non-truncated": 948, + "padded": 948, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_aging|5": { + "hashes": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "0a2b6ab3ae0e3b7c", + "hash_cont_tokens": "92acdd467ed943e1" + }, + "truncated": 0, + "non-truncated": 892, + "padded": 892, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_sexuality|5": { + "hashes": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "f28777a6fdce1d2b", + "hash_cont_tokens": "a6034ed95a124315" + }, + "truncated": 0, + "non-truncated": 524, + "padded": 524, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-international_law|5": { + "hashes": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "8282921a7a07bd5a", + "hash_cont_tokens": "74ff4b135356f4df" + }, + "truncated": 0, + "non-truncated": 484, + "padded": 484, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-jurisprudence|5": { + "hashes": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "3aa62568b80ee7ca", + "hash_cont_tokens": "7c8e30f486ff156a" + }, + "truncated": 0, + "non-truncated": 432, + "padded": 432, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hashes": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "731b1d04f2da3d9a", + "hash_cont_tokens": "a457f0c06facf520" + }, + "truncated": 0, + "non-truncated": 652, + "padded": 652, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-machine_learning|5": { + "hashes": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "96e1af14c8358ac2", + "hash_cont_tokens": "64c3774d71dc7eb8" + }, + "truncated": 0, + "non-truncated": 448, + "padded": 448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-management|5": { + "hashes": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "bc2e4bf4e7cf5c39", + "hash_cont_tokens": "66b726b356a02feb" + }, + "truncated": 0, + "non-truncated": 412, + "padded": 412, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-marketing|5": { + "hashes": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "abed130d5c3867a4", + "hash_cont_tokens": "f08457005b652d25" + }, + "truncated": 0, + "non-truncated": 936, + "padded": 936, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-medical_genetics|5": { + "hashes": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "83d7d50bc2ebab43", + "hash_cont_tokens": "ce26aac83e938006" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + 
"harness|hendrycksTest-miscellaneous|5": { + "hashes": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "57004a232a08258a", + "hash_cont_tokens": "647bcbd68f292558" + }, + "truncated": 0, + "non-truncated": 3132, + "padded": 3132, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_disputes|5": { + "hashes": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "bb9518d436087f70", + "hash_cont_tokens": "5a7b498edf3beb7f" + }, + "truncated": 0, + "non-truncated": 1384, + "padded": 1365, + "non-padded": 19, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hashes": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "3edebd0b46a85682", + "hash_cont_tokens": "1999ef9e9c46608f" + }, + "truncated": 0, + "non-truncated": 3580, + "padded": 3580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-nutrition|5": { + "hashes": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "815607301732a13f", + "hash_cont_tokens": "6017425ca4648660" + }, + "truncated": 0, + "non-truncated": 1224, + "padded": 1224, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-philosophy|5": { + "hashes": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "952254859587db3e", + "hash_cont_tokens": "6e39384b9c0a8cc2" + }, + "truncated": 0, + "non-truncated": 1244, + "padded": 1244, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-prehistory|5": { + "hashes": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "1429d150f124f76e", + "hash_cont_tokens": "87b66d935a56bb5e" + }, + "truncated": 0, + "non-truncated": 1296, + "padded": 1296, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_accounting|5": { + "hashes": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "9f8bfa3b87b58a38", + "hash_cont_tokens": "e7d0d323ac74ab59" + }, + "truncated": 0, + "non-truncated": 1128, + "padded": 1128, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_law|5": { + "hashes": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "f638aace411a0bd9", + "hash_cont_tokens": "0ff990d9cc38024d" + }, + "truncated": 168, + "non-truncated": 5968, + "padded": 5968, + "non-padded": 168, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_medicine|5": { + "hashes": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "c0f160879d378d4d", + "hash_cont_tokens": "a271b36d0db8278e" + }, + "truncated": 0, + "non-truncated": 1088, + "padded": 1088, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_psychology|5": { + "hashes": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + 
"hash_input_tokens": "548450e483004f15", + "hash_cont_tokens": "defde1e859d464f7" + }, + "truncated": 0, + "non-truncated": 2448, + "padded": 2448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-public_relations|5": { + "hashes": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "47f43ebfaa773712", + "hash_cont_tokens": "14bc759bc8de7252" + }, + "truncated": 0, + "non-truncated": 440, + "padded": 440, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-security_studies|5": { + "hashes": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "0350ab02a3d50c5f", + "hash_cont_tokens": "b708a77b01f2529c" + }, + "truncated": 0, + "non-truncated": 980, + "padded": 980, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-sociology|5": { + "hashes": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "e010003b38f6d86a", + "hash_cont_tokens": "b4962d9e583b12c0" + }, + "truncated": 0, + "non-truncated": 804, + "padded": 804, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hashes": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "99959731e92e9eb1", + "hash_cont_tokens": "e19f8e17c9c18790" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-virology|5": { + "hashes": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "841a69043fcd7645", + "hash_cont_tokens": "397a75462a9735e3" + }, + "truncated": 0, + "non-truncated": 664, + "padded": 664, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-world_religions|5": { + "hashes": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "6faa0998b440e497", + "hash_cont_tokens": "6e5059a6697f3e71" + }, + "truncated": 0, + "non-truncated": 684, + "padded": 684, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|truthfulqa:mc|0": { + "hashes": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "fe347abbeff2a4c1", + "hash_cont_tokens": "a48530ac09baa92c" + }, + "truncated": 0, + "non-truncated": 9996, + "padded": 9996, + "non-padded": 0, + "effective_few_shots": 0.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "d84d18e9a963753d", + "hash_full_prompts": "12b540783521a8e6", + "hash_input_tokens": "3f79e8edf26f0efd", + "hash_cont_tokens": "ce4faf0c896cc73e" + }, + "total_evaluation_time_secondes": "9316.381895780563", + "truncated": 1644, + "non-truncated": 109375, + "padded": 109332, + "non-padded": 1687, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/RobbeD/OpenLlama-Platypus-3B/results_2023-09-23T06-28-14.000432.json b/eval-results/RobbeD/OpenLlama-Platypus-3B/results_2023-09-23T06-28-14.000432.json new file mode 100644 index 
0000000000000000000000000000000000000000..26f7f4b1e33731a1c2624a377ead73e8cd7d617e --- /dev/null +++ b/eval-results/RobbeD/OpenLlama-Platypus-3B/results_2023-09-23T06-28-14.000432.json @@ -0,0 +1,107 @@ +{ + "config_general": { + "model_name": "RobbeD/OpenLlama-Platypus-3B", + "model_sha": "d3a0bf8e1181be02cc9c4c4cdfedaedacaefbfac", + "model_size": "6.4 GB", + "model_dtype": "torch.float16", + "lighteval_sha": "0f318ecf002208468154899217b3ba7c6ae09374", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|drop|3": { + "em": 0.06145134228187919, + "em_stderr": 0.002459425856611146, + "f1": 0.11012269295302003, + "f1_stderr": 0.002656818706713483 + }, + "harness|gsm8k|5": { + "acc": 0.011372251705837756, + "acc_stderr": 0.0029206661987887473 + }, + "harness|winogrande|5": { + "acc": 0.65982636148382, + "acc_stderr": 0.013315218762417397 + }, + "all": { + "em": 0.06145134228187919, + "em_stderr": 0.002459425856611146, + "f1": 0.11012269295302003, + "f1_stderr": 0.002656818706713483, + "acc": 0.3355993065948289, + "acc_stderr": 0.008117942480603072 + } + }, + "versions": { + "harness|drop|3": 1, + "harness|gsm8k|5": 0, + "harness|winogrande|5": 0, + "all": 0 + }, + "config_tasks": { + "harness|drop": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|drop|3": { + "hashes": { + "hash_examples": "1d27416e8324e9a3", + "hash_full_prompts": "a5513ff9a741b385", + "hash_input_tokens": "a65c9eacad86ea52", + "hash_cont_tokens": "6d9f865eb4331a9d" + }, + "truncated": 980, + "non-truncated": 8556, + "padded": 0, + "non-padded": 9536, + "effective_few_shots": 3.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "bf7d8c6b5e4f7948", + "hash_cont_tokens": "90f50c9d9fc8215e" + }, + "truncated": 0, + "non-truncated": 1319, + "padded": 0, + "non-padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "647d8b2cafc100bc", + "hash_cont_tokens": "828897df1f4f08a1" + }, + "truncated": 0, + "non-truncated": 2534, + "padded": 2433, + "non-padded": 101, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "9b4d8993161e637d", + "hash_full_prompts": "08215e527b7e60a5", + "hash_input_tokens": "a65e1c92b9137d17", + "hash_cont_tokens": "ebc8747c646a4043" + }, + "total_evaluation_time_secondes": "9155.799797058105", + "truncated": 980, + "non-truncated": 12409, + "padded": 2433, + "non-padded": 10956, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/RobbeD/Orca-Platypus-3B/results_2023-08-29T10-07-29.426848.json b/eval-results/RobbeD/Orca-Platypus-3B/results_2023-08-29T10-07-29.426848.json new file mode 100644 index 0000000000000000000000000000000000000000..df9d3d8a00b284aab32e34a34c59796dab607a43 --- /dev/null +++ b/eval-results/RobbeD/Orca-Platypus-3B/results_2023-08-29T10-07-29.426848.json @@ -0,0 +1,1366 @@ +{ + "config_general": { + "model_name": "RobbeD/Orca-Platypus-3B", + "model_sha": "243f51d75ed6d425addde839740f6fd5bcc4630f", + "model_dtype": "torch.float16", + "lighteval_sha": "2a8c20b0e409b2fa39ec86fd1fe08f884b0ac2c8", + 
"num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|arc:challenge|25": { + "acc": 0.3993174061433447, + "acc_stderr": 0.014312094557946707, + "acc_norm": 0.4308873720136519, + "acc_norm_stderr": 0.014471133392642476 + }, + "harness|hellaswag|10": { + "acc": 0.4967138020314678, + "acc_stderr": 0.004989673640014264, + "acc_norm": 0.6532563234415455, + "acc_norm_stderr": 0.004749606196363324 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.37, + "acc_stderr": 0.048523658709391, + "acc_norm": 0.37, + "acc_norm_stderr": 0.048523658709391 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.34074074074074073, + "acc_stderr": 0.040943762699967926, + "acc_norm": 0.34074074074074073, + "acc_norm_stderr": 0.040943762699967926 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.28289473684210525, + "acc_stderr": 0.03665349695640767, + "acc_norm": 0.28289473684210525, + "acc_norm_stderr": 0.03665349695640767 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.27, + "acc_stderr": 0.04461960433384741, + "acc_norm": 0.27, + "acc_norm_stderr": 0.04461960433384741 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.2792452830188679, + "acc_stderr": 0.027611163402399715, + "acc_norm": 0.2792452830188679, + "acc_norm_stderr": 0.027611163402399715 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.3055555555555556, + "acc_stderr": 0.03852084696008534, + "acc_norm": 0.3055555555555556, + "acc_norm_stderr": 0.03852084696008534 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.15, + "acc_stderr": 0.035887028128263714, + "acc_norm": 0.15, + "acc_norm_stderr": 0.035887028128263714 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.22, + "acc_stderr": 0.041633319989322695, + "acc_norm": 0.22, + "acc_norm_stderr": 0.041633319989322695 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.27, + "acc_stderr": 0.04461960433384741, + "acc_norm": 0.27, + "acc_norm_stderr": 0.04461960433384741 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.23699421965317918, + "acc_stderr": 0.03242414757483098, + "acc_norm": 0.23699421965317918, + "acc_norm_stderr": 0.03242414757483098 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.19607843137254902, + "acc_stderr": 0.03950581861179963, + "acc_norm": 0.19607843137254902, + "acc_norm_stderr": 0.03950581861179963 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.29, + "acc_stderr": 0.04560480215720684, + "acc_norm": 0.29, + "acc_norm_stderr": 0.04560480215720684 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.25957446808510637, + "acc_stderr": 0.028659179374292323, + "acc_norm": 0.25957446808510637, + "acc_norm_stderr": 0.028659179374292323 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.21929824561403508, + "acc_stderr": 0.03892431106518754, + "acc_norm": 0.21929824561403508, + "acc_norm_stderr": 0.03892431106518754 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.2689655172413793, + "acc_stderr": 0.036951833116502325, + "acc_norm": 0.2689655172413793, + "acc_norm_stderr": 0.036951833116502325 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.2566137566137566, + "acc_stderr": 0.022494510767503154, + "acc_norm": 0.2566137566137566, + "acc_norm_stderr": 0.022494510767503154 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.19047619047619047, + "acc_stderr": 
0.03512207412302052, + "acc_norm": 0.19047619047619047, + "acc_norm_stderr": 0.03512207412302052 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.31, + "acc_stderr": 0.04648231987117316, + "acc_norm": 0.31, + "acc_norm_stderr": 0.04648231987117316 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.22258064516129034, + "acc_stderr": 0.023664216671642535, + "acc_norm": 0.22258064516129034, + "acc_norm_stderr": 0.023664216671642535 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.2315270935960591, + "acc_stderr": 0.029678333141444437, + "acc_norm": 0.2315270935960591, + "acc_norm_stderr": 0.029678333141444437 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.33, + "acc_stderr": 0.04725815626252606, + "acc_norm": 0.33, + "acc_norm_stderr": 0.04725815626252606 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.30303030303030304, + "acc_stderr": 0.035886248000917075, + "acc_norm": 0.30303030303030304, + "acc_norm_stderr": 0.035886248000917075 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.2727272727272727, + "acc_stderr": 0.03173071239071724, + "acc_norm": 0.2727272727272727, + "acc_norm_stderr": 0.03173071239071724 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.25906735751295334, + "acc_stderr": 0.031618779179354094, + "acc_norm": 0.25906735751295334, + "acc_norm_stderr": 0.031618779179354094 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.23846153846153847, + "acc_stderr": 0.021606294494647727, + "acc_norm": 0.23846153846153847, + "acc_norm_stderr": 0.021606294494647727 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.25925925925925924, + "acc_stderr": 0.026719240783712163, + "acc_norm": 0.25925925925925924, + "acc_norm_stderr": 0.026719240783712163 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.22268907563025211, + "acc_stderr": 0.027025433498882364, + "acc_norm": 0.22268907563025211, + "acc_norm_stderr": 0.027025433498882364 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.2781456953642384, + "acc_stderr": 0.03658603262763743, + "acc_norm": 0.2781456953642384, + "acc_norm_stderr": 0.03658603262763743 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.24403669724770644, + "acc_stderr": 0.01841528635141641, + "acc_norm": 0.24403669724770644, + "acc_norm_stderr": 0.01841528635141641 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.20833333333333334, + "acc_stderr": 0.027696910713093936, + "acc_norm": 0.20833333333333334, + "acc_norm_stderr": 0.027696910713093936 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.24019607843137256, + "acc_stderr": 0.02998373305591361, + "acc_norm": 0.24019607843137256, + "acc_norm_stderr": 0.02998373305591361 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.26582278481012656, + "acc_stderr": 0.028756799629658342, + "acc_norm": 0.26582278481012656, + "acc_norm_stderr": 0.028756799629658342 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.2600896860986547, + "acc_stderr": 0.029442495585857476, + "acc_norm": 0.2600896860986547, + "acc_norm_stderr": 0.029442495585857476 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.20610687022900764, + "acc_stderr": 0.035477710041594626, + "acc_norm": 0.20610687022900764, + "acc_norm_stderr": 0.035477710041594626 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 
0.4049586776859504, + "acc_stderr": 0.044811377559424694, + "acc_norm": 0.4049586776859504, + "acc_norm_stderr": 0.044811377559424694 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.3055555555555556, + "acc_stderr": 0.044531975073749834, + "acc_norm": 0.3055555555555556, + "acc_norm_stderr": 0.044531975073749834 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.3128834355828221, + "acc_stderr": 0.036429145782924055, + "acc_norm": 0.3128834355828221, + "acc_norm_stderr": 0.036429145782924055 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.19642857142857142, + "acc_stderr": 0.03770970049347018, + "acc_norm": 0.19642857142857142, + "acc_norm_stderr": 0.03770970049347018 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.2524271844660194, + "acc_stderr": 0.04301250399690877, + "acc_norm": 0.2524271844660194, + "acc_norm_stderr": 0.04301250399690877 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.3076923076923077, + "acc_stderr": 0.03023638994217309, + "acc_norm": 0.3076923076923077, + "acc_norm_stderr": 0.03023638994217309 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.26, + "acc_stderr": 0.044084400227680794, + "acc_norm": 0.26, + "acc_norm_stderr": 0.044084400227680794 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.34738186462324394, + "acc_stderr": 0.01702667174865574, + "acc_norm": 0.34738186462324394, + "acc_norm_stderr": 0.01702667174865574 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.30346820809248554, + "acc_stderr": 0.02475241196091721, + "acc_norm": 0.30346820809248554, + "acc_norm_stderr": 0.02475241196091721 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.2446927374301676, + "acc_stderr": 0.014378169884098447, + "acc_norm": 0.2446927374301676, + "acc_norm_stderr": 0.014378169884098447 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.28104575163398693, + "acc_stderr": 0.025738854797818723, + "acc_norm": 0.28104575163398693, + "acc_norm_stderr": 0.025738854797818723 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.3633440514469453, + "acc_stderr": 0.027316847674192707, + "acc_norm": 0.3633440514469453, + "acc_norm_stderr": 0.027316847674192707 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.2716049382716049, + "acc_stderr": 0.024748624490537365, + "acc_norm": 0.2716049382716049, + "acc_norm_stderr": 0.024748624490537365 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.2553191489361702, + "acc_stderr": 0.026011992930902006, + "acc_norm": 0.2553191489361702, + "acc_norm_stderr": 0.026011992930902006 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.2457627118644068, + "acc_stderr": 0.010996156635142692, + "acc_norm": 0.2457627118644068, + "acc_norm_stderr": 0.010996156635142692 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.16176470588235295, + "acc_stderr": 0.02236867256288675, + "acc_norm": 0.16176470588235295, + "acc_norm_stderr": 0.02236867256288675 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.29411764705882354, + "acc_stderr": 0.018433427649401892, + "acc_norm": 0.29411764705882354, + "acc_norm_stderr": 0.018433427649401892 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.19090909090909092, + "acc_stderr": 0.03764425585984926, + "acc_norm": 0.19090909090909092, + "acc_norm_stderr": 0.03764425585984926 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.1836734693877551, + "acc_stderr": 0.02478907133200767, + "acc_norm": 
0.1836734693877551, + "acc_norm_stderr": 0.02478907133200767 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.31840796019900497, + "acc_stderr": 0.032941184790540944, + "acc_norm": 0.31840796019900497, + "acc_norm_stderr": 0.032941184790540944 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.35, + "acc_stderr": 0.047937248544110175, + "acc_norm": 0.35, + "acc_norm_stderr": 0.047937248544110175 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.30120481927710846, + "acc_stderr": 0.035716092300534796, + "acc_norm": 0.30120481927710846, + "acc_norm_stderr": 0.035716092300534796 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.3391812865497076, + "acc_stderr": 0.03631053496488905, + "acc_norm": 0.3391812865497076, + "acc_norm_stderr": 0.03631053496488905 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.27539779681762544, + "mc1_stderr": 0.01563813566777552, + "mc2": 0.41928517905056045, + "mc2_stderr": 0.0152672030417133 + }, + "all": { + "acc": 0.27366722319077513, + "acc_stderr": 0.03210093803398038, + "acc_norm": 0.2768555704328155, + "acc_norm_stderr": 0.0320995646677269, + "mc1": 0.27539779681762544, + "mc1_stderr": 0.01563813566777552, + "mc2": 0.41928517905056045, + "mc2_stderr": 0.0152672030417133 + } + }, + "versions": { + "harness|arc:challenge|25": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + 
"harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "all": 0 + }, + "config_tasks": { + "harness|arc:challenge": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness 
task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task" + }, + "summary_tasks": { + "harness|arc:challenge|25": { + "hashes": { + "hash_examples": "17b0cae357c0259e", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "99ff49c78917d666", + "hash_cont_tokens": "568988b9c3bfc83c" + }, + "truncated": 0, + "non-truncated": 4687, + "padded": 4687, + "non-padded": 0, + "effective_few_shots": 25.0, + "num_truncated_few_shots": 0 + }, + "harness|hellaswag|10": { + "hashes": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "27b384658a4b826e", + "hash_cont_tokens": "5966c7ceee7144f8" + }, + "truncated": 0, + "non-truncated": 40168, + "padded": 40153, + "non-padded": 15, + "effective_few_shots": 10.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hashes": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "dac91b437d631599", + "hash_cont_tokens": "adad8c87d9018d3a" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-anatomy|5": { + "hashes": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "06cd9a69af842291", + "hash_cont_tokens": "b408913f391dc598" + }, + "truncated": 0, + "non-truncated": 540, + "padded": 540, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-astronomy|5": { + "hashes": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "7e0363633bd4c661", + "hash_cont_tokens": "4ab285fa2a75c029" + }, + "truncated": 0, + "non-truncated": 608, + "padded": 608, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-business_ethics|5": { + "hashes": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "a1b916a7277078b4", + "hash_cont_tokens": "15baabbd71328cbe" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hashes": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "af46942ff5deb21d", + "hash_cont_tokens": "96c880c9478a4037" + }, + "truncated": 0, + "non-truncated": 1060, + "padded": 1060, + "non-padded": 0, + 
"effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_biology|5": { + "hashes": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "5882d6931ded2237", + "hash_cont_tokens": "6268ee610a672867" + }, + "truncated": 0, + "non-truncated": 576, + "padded": 576, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_chemistry|5": { + "hashes": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "b24180b880da9cdc", + "hash_cont_tokens": "7b194ff8e7e390ce" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_computer_science|5": { + "hashes": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "9bc1d680b14c82ee", + "hash_cont_tokens": "2fe5eee1df1b81bb" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_mathematics|5": { + "hashes": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "79aced2bcafe02e4", + "hash_cont_tokens": "499ffd87e7a60146" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_medicine|5": { + "hashes": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "3e657aa09cc216ff", + "hash_cont_tokens": "e5df51bb12073b7b" + }, + "truncated": 0, + "non-truncated": 692, + "padded": 692, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_physics|5": { + "hashes": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "5f521206bd8121ad", + "hash_cont_tokens": "4abfe03c09581bce" + }, + "truncated": 0, + "non-truncated": 408, + "padded": 408, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-computer_security|5": { + "hashes": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "b12ce1e36c118558", + "hash_cont_tokens": "adad8c87d9018d3a" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hashes": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "221bbd7b0d39e269", + "hash_cont_tokens": "4dc3a1c45702aea2" + }, + "truncated": 0, + "non-truncated": 940, + "padded": 940, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-econometrics|5": { + "hashes": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "d475018fde7b68bf", + "hash_cont_tokens": "abfc7c631218ed32" + }, + "truncated": 0, + "non-truncated": 456, + "padded": 456, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hashes": { + "hash_examples": 
"e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "964e79b20780ee59", + "hash_cont_tokens": "195db06c037d7c81" + }, + "truncated": 0, + "non-truncated": 580, + "padded": 569, + "non-padded": 11, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hashes": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "829b84905d5794d7", + "hash_cont_tokens": "4274dfcea97c4e27" + }, + "truncated": 0, + "non-truncated": 1512, + "padded": 1512, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-formal_logic|5": { + "hashes": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "83233577e0f66071", + "hash_cont_tokens": "aadc96b61f4bea54" + }, + "truncated": 0, + "non-truncated": 504, + "padded": 504, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-global_facts|5": { + "hashes": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "b45c36cf0fc38f67", + "hash_cont_tokens": "adad8c87d9018d3a" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_biology|5": { + "hashes": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "47f5c034c56e090f", + "hash_cont_tokens": "6ea5c6b690913b0f" + }, + "truncated": 0, + "non-truncated": 1240, + "padded": 1240, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hashes": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "13286ca334f1e8e7", + "hash_cont_tokens": "befe57dcb5a5a7d3" + }, + "truncated": 0, + "non-truncated": 812, + "padded": 812, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hashes": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "e3a3351b698e7311", + "hash_cont_tokens": "8da78e4005b8faf9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hashes": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "6639a9e4f4eb57c5", + "hash_cont_tokens": "ff5ae57ff23b53d1" + }, + "truncated": 660, + "non-truncated": 0, + "padded": 0, + "non-padded": 660, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_geography|5": { + "hashes": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "cfe8f73d53615fc7", + "hash_cont_tokens": "db85309de1591035" + }, + "truncated": 0, + "non-truncated": 792, + "padded": 792, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hashes": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": 
"1f8541aadce8b236", + "hash_cont_tokens": "6890e2bc35a602ef" + }, + "truncated": 0, + "non-truncated": 772, + "padded": 772, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hashes": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "8da2d7f4edfdafd5", + "hash_cont_tokens": "6132e48ff0edea66" + }, + "truncated": 0, + "non-truncated": 1560, + "padded": 1560, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hashes": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "52328f9dec1844ed", + "hash_cont_tokens": "d201a0126c9a530c" + }, + "truncated": 0, + "non-truncated": 1080, + "padded": 1080, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hashes": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "04d97c91eee4e141", + "hash_cont_tokens": "596c4f1066a38e91" + }, + "truncated": 0, + "non-truncated": 952, + "padded": 952, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_physics|5": { + "hashes": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "d8d05cf169bd7639", + "hash_cont_tokens": "fcefc753d295e446" + }, + "truncated": 0, + "non-truncated": 604, + "padded": 604, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hashes": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "03f858b330d55fed", + "hash_cont_tokens": "a4a552f563078902" + }, + "truncated": 0, + "non-truncated": 2180, + "padded": 2180, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hashes": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "ce2ca0558b9a5f27", + "hash_cont_tokens": "85dbbdba6017eaec" + }, + "truncated": 0, + "non-truncated": 864, + "padded": 864, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hashes": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "a3884e14c3c038b5", + "hash_cont_tokens": "7d705edd113a3d4d" + }, + "truncated": 816, + "non-truncated": 0, + "padded": 0, + "non-padded": 816, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hashes": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "b3f5f4615f906023", + "hash_cont_tokens": "211397dca1d04c0a" + }, + "truncated": 0, + "non-truncated": 948, + "padded": 948, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_aging|5": { + "hashes": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "0d806b9b33c54432", + "hash_cont_tokens": "b196c68db4825727" + }, + "truncated": 0, + "non-truncated": 
892, + "padded": 892, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_sexuality|5": { + "hashes": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "4c9f4c1de8d94adf", + "hash_cont_tokens": "ffc3b70128684ad0" + }, + "truncated": 0, + "non-truncated": 524, + "padded": 524, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-international_law|5": { + "hashes": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "4e565cd482620bbe", + "hash_cont_tokens": "bcaed810d47c62aa" + }, + "truncated": 0, + "non-truncated": 484, + "padded": 484, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-jurisprudence|5": { + "hashes": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "13cbfca1b5b84f78", + "hash_cont_tokens": "ea7ff206c4da6f57" + }, + "truncated": 0, + "non-truncated": 432, + "padded": 432, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hashes": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "bf707bcaadcd1b7f", + "hash_cont_tokens": "4a853cb5874d2adc" + }, + "truncated": 0, + "non-truncated": 652, + "padded": 652, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-machine_learning|5": { + "hashes": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "78808255dea01f83", + "hash_cont_tokens": "9e40b162dc928ce5" + }, + "truncated": 0, + "non-truncated": 448, + "padded": 448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-management|5": { + "hashes": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "6bab60a3ce133e17", + "hash_cont_tokens": "c93d7596aa2246ea" + }, + "truncated": 0, + "non-truncated": 412, + "padded": 412, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-marketing|5": { + "hashes": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "d0fcde4d547d9832", + "hash_cont_tokens": "af4b0ee8ee2bb07f" + }, + "truncated": 0, + "non-truncated": 936, + "padded": 936, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-medical_genetics|5": { + "hashes": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "78c8a1b611a22020", + "hash_cont_tokens": "adad8c87d9018d3a" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-miscellaneous|5": { + "hashes": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "690c7a1333c1030b", + "hash_cont_tokens": "5b068e21debc566e" + }, + "truncated": 0, + "non-truncated": 3132, + "padded": 3132, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_disputes|5": { + "hashes": { + 
"hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "de74e3025a1cd4e3", + "hash_cont_tokens": "8d79c8c8d3b1fa75" + }, + "truncated": 0, + "non-truncated": 1384, + "padded": 1384, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hashes": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "77cf2aceb27a9b48", + "hash_cont_tokens": "30d3a442342e5f19" + }, + "truncated": 0, + "non-truncated": 3580, + "padded": 3580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-nutrition|5": { + "hashes": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "c149e4bfa0bd49e2", + "hash_cont_tokens": "231f307b052cc303" + }, + "truncated": 0, + "non-truncated": 1224, + "padded": 1224, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-philosophy|5": { + "hashes": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "8e8dd2f09979a669", + "hash_cont_tokens": "faaa18e05a96eb91" + }, + "truncated": 0, + "non-truncated": 1244, + "padded": 1244, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-prehistory|5": { + "hashes": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "beb7b4488967bf13", + "hash_cont_tokens": "3fa5ef4207c2fae2" + }, + "truncated": 0, + "non-truncated": 1296, + "padded": 1296, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_accounting|5": { + "hashes": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "6dead6c7a78a877e", + "hash_cont_tokens": "711398f4a1641e99" + }, + "truncated": 0, + "non-truncated": 1128, + "padded": 1128, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_law|5": { + "hashes": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "a3cf3a06ebd3a4c2", + "hash_cont_tokens": "5c9515fd601cb0d7" + }, + "truncated": 92, + "non-truncated": 6044, + "padded": 6032, + "non-padded": 104, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_medicine|5": { + "hashes": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "8ef46fa5025f8036", + "hash_cont_tokens": "bb99427ea7c63f48" + }, + "truncated": 0, + "non-truncated": 1088, + "padded": 1088, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_psychology|5": { + "hashes": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "592938a865df4169", + "hash_cont_tokens": "cdbe1515e8c6e3ce" + }, + "truncated": 0, + "non-truncated": 2448, + "padded": 2448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-public_relations|5": { + "hashes": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "6708e93b0c611917", + 
"hash_cont_tokens": "c54f38d507746b57" + }, + "truncated": 0, + "non-truncated": 440, + "padded": 440, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-security_studies|5": { + "hashes": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "d9c3e621c2145453", + "hash_cont_tokens": "16d346d36b44190b" + }, + "truncated": 0, + "non-truncated": 980, + "padded": 980, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-sociology|5": { + "hashes": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "862a1d43b0709cc8", + "hash_cont_tokens": "e329121c50bb2b96" + }, + "truncated": 0, + "non-truncated": 804, + "padded": 804, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hashes": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "0f8b3d09b9f523d6", + "hash_cont_tokens": "446207f22323db3e" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-virology|5": { + "hashes": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "543430e3d6af520f", + "hash_cont_tokens": "30dcb20b1aeaf10b" + }, + "truncated": 0, + "non-truncated": 664, + "padded": 664, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-world_religions|5": { + "hashes": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "a9f37ee284fec309", + "hash_cont_tokens": "f8476c0c6f07dff2" + }, + "truncated": 0, + "non-truncated": 684, + "padded": 684, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|truthfulqa:mc|0": { + "hashes": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "bc9ef61861cd1b47", + "hash_cont_tokens": "d07001d4d0214aa3" + }, + "truncated": 0, + "non-truncated": 9996, + "padded": 9996, + "non-padded": 0, + "effective_few_shots": 0.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "d84d18e9a963753d", + "hash_full_prompts": "12b540783521a8e6", + "hash_input_tokens": "5718915646c336d4", + "hash_cont_tokens": "be8494d5ebf3309a" + }, + "total_evaluation_time_secondes": "9171.633256673813", + "truncated": 1568, + "non-truncated": 109451, + "padded": 109413, + "non-padded": 1606, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/RoversX/llama-2-7b-hf-small-shards-Samantha-V1-SFT/results_2023-08-12T09-30-09.236602.json b/eval-results/RoversX/llama-2-7b-hf-small-shards-Samantha-V1-SFT/results_2023-08-12T09-30-09.236602.json new file mode 100644 index 0000000000000000000000000000000000000000..97d43ead3566f8b595d2cec3c96de9fa5615d4c1 --- /dev/null +++ b/eval-results/RoversX/llama-2-7b-hf-small-shards-Samantha-V1-SFT/results_2023-08-12T09-30-09.236602.json @@ -0,0 +1,1365 @@ +{ + "results": { + "harness|arc:challenge|25": { + "acc": 0.4863481228668942, + "acc_stderr": 0.01460594342986095, + "acc_norm": 0.5315699658703071, + "acc_norm_stderr": 0.014582236460866977 + }, + "harness|hellaswag|10": { + "acc": 
0.5819557857000598, + "acc_stderr": 0.004922294797766665, + "acc_norm": 0.7771360286795459, + "acc_norm_stderr": 0.004153172511339343 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.3, + "acc_stderr": 0.046056618647183814, + "acc_norm": 0.3, + "acc_norm_stderr": 0.046056618647183814 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.43703703703703706, + "acc_stderr": 0.04284958639753399, + "acc_norm": 0.43703703703703706, + "acc_norm_stderr": 0.04284958639753399 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.4276315789473684, + "acc_stderr": 0.04026097083296559, + "acc_norm": 0.4276315789473684, + "acc_norm_stderr": 0.04026097083296559 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.45, + "acc_stderr": 0.05, + "acc_norm": 0.45, + "acc_norm_stderr": 0.05 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.4528301886792453, + "acc_stderr": 0.03063562795796182, + "acc_norm": 0.4528301886792453, + "acc_norm_stderr": 0.03063562795796182 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.4444444444444444, + "acc_stderr": 0.04155319955593146, + "acc_norm": 0.4444444444444444, + "acc_norm_stderr": 0.04155319955593146 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.33, + "acc_stderr": 0.047258156262526045, + "acc_norm": 0.33, + "acc_norm_stderr": 0.047258156262526045 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.41, + "acc_stderr": 0.049431107042371025, + "acc_norm": 0.41, + "acc_norm_stderr": 0.049431107042371025 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.3, + "acc_stderr": 0.046056618647183814, + "acc_norm": 0.3, + "acc_norm_stderr": 0.046056618647183814 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.41040462427745666, + "acc_stderr": 0.03750757044895537, + "acc_norm": 0.41040462427745666, + "acc_norm_stderr": 0.03750757044895537 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.17647058823529413, + "acc_stderr": 0.0379328118530781, + "acc_norm": 0.17647058823529413, + "acc_norm_stderr": 0.0379328118530781 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.59, + "acc_stderr": 0.04943110704237102, + "acc_norm": 0.59, + "acc_norm_stderr": 0.04943110704237102 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.40425531914893614, + "acc_stderr": 0.032081157507886836, + "acc_norm": 0.40425531914893614, + "acc_norm_stderr": 0.032081157507886836 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.2807017543859649, + "acc_stderr": 0.042270544512322004, + "acc_norm": 0.2807017543859649, + "acc_norm_stderr": 0.042270544512322004 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.47586206896551725, + "acc_stderr": 0.041618085035015295, + "acc_norm": 0.47586206896551725, + "acc_norm_stderr": 0.041618085035015295 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.25396825396825395, + "acc_stderr": 0.022418042891113946, + "acc_norm": 0.25396825396825395, + "acc_norm_stderr": 0.022418042891113946 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.30158730158730157, + "acc_stderr": 0.04104947269903394, + "acc_norm": 0.30158730158730157, + "acc_norm_stderr": 0.04104947269903394 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.29, + "acc_stderr": 0.045604802157206845, + "acc_norm": 0.29, + "acc_norm_stderr": 0.045604802157206845 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.41935483870967744, + "acc_stderr": 0.028071588901091855, + 
"acc_norm": 0.41935483870967744, + "acc_norm_stderr": 0.028071588901091855 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.33497536945812806, + "acc_stderr": 0.033208527423483104, + "acc_norm": 0.33497536945812806, + "acc_norm_stderr": 0.033208527423483104 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.46, + "acc_stderr": 0.05009082659620333, + "acc_norm": 0.46, + "acc_norm_stderr": 0.05009082659620333 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.5393939393939394, + "acc_stderr": 0.03892207016552012, + "acc_norm": 0.5393939393939394, + "acc_norm_stderr": 0.03892207016552012 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.5050505050505051, + "acc_stderr": 0.035621707606254015, + "acc_norm": 0.5050505050505051, + "acc_norm_stderr": 0.035621707606254015 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.6373056994818653, + "acc_stderr": 0.03469713791704372, + "acc_norm": 0.6373056994818653, + "acc_norm_stderr": 0.03469713791704372 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.40512820512820513, + "acc_stderr": 0.024890471769938145, + "acc_norm": 0.40512820512820513, + "acc_norm_stderr": 0.024890471769938145 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.27037037037037037, + "acc_stderr": 0.027080372815145668, + "acc_norm": 0.27037037037037037, + "acc_norm_stderr": 0.027080372815145668 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.3907563025210084, + "acc_stderr": 0.03169380235712997, + "acc_norm": 0.3907563025210084, + "acc_norm_stderr": 0.03169380235712997 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.2980132450331126, + "acc_stderr": 0.03734535676787198, + "acc_norm": 0.2980132450331126, + "acc_norm_stderr": 0.03734535676787198 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.5779816513761468, + "acc_stderr": 0.02117499140776317, + "acc_norm": 0.5779816513761468, + "acc_norm_stderr": 0.02117499140776317 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.2175925925925926, + "acc_stderr": 0.028139689444859683, + "acc_norm": 0.2175925925925926, + "acc_norm_stderr": 0.028139689444859683 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.47549019607843135, + "acc_stderr": 0.035050931943487976, + "acc_norm": 0.47549019607843135, + "acc_norm_stderr": 0.035050931943487976 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.5316455696202531, + "acc_stderr": 0.03248197400511075, + "acc_norm": 0.5316455696202531, + "acc_norm_stderr": 0.03248197400511075 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.5246636771300448, + "acc_stderr": 0.03351695167652628, + "acc_norm": 0.5246636771300448, + "acc_norm_stderr": 0.03351695167652628 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.42748091603053434, + "acc_stderr": 0.04338920305792401, + "acc_norm": 0.42748091603053434, + "acc_norm_stderr": 0.04338920305792401 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.6198347107438017, + "acc_stderr": 0.04431324501968432, + "acc_norm": 0.6198347107438017, + "acc_norm_stderr": 0.04431324501968432 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.48148148148148145, + "acc_stderr": 0.04830366024635331, + "acc_norm": 0.48148148148148145, + "acc_norm_stderr": 0.04830366024635331 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.4294478527607362, 
+ "acc_stderr": 0.038890666191127216, + "acc_norm": 0.4294478527607362, + "acc_norm_stderr": 0.038890666191127216 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.375, + "acc_stderr": 0.04595091388086298, + "acc_norm": 0.375, + "acc_norm_stderr": 0.04595091388086298 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.5048543689320388, + "acc_stderr": 0.04950504382128921, + "acc_norm": 0.5048543689320388, + "acc_norm_stderr": 0.04950504382128921 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.6495726495726496, + "acc_stderr": 0.031256108244218796, + "acc_norm": 0.6495726495726496, + "acc_norm_stderr": 0.031256108244218796 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.52, + "acc_stderr": 0.050211673156867795, + "acc_norm": 0.52, + "acc_norm_stderr": 0.050211673156867795 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.5925925925925926, + "acc_stderr": 0.017570705239256558, + "acc_norm": 0.5925925925925926, + "acc_norm_stderr": 0.017570705239256558 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.48554913294797686, + "acc_stderr": 0.02690784985628254, + "acc_norm": 0.48554913294797686, + "acc_norm_stderr": 0.02690784985628254 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.23798882681564246, + "acc_stderr": 0.014242630070574915, + "acc_norm": 0.23798882681564246, + "acc_norm_stderr": 0.014242630070574915 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.4673202614379085, + "acc_stderr": 0.028568699752225875, + "acc_norm": 0.4673202614379085, + "acc_norm_stderr": 0.028568699752225875 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.5498392282958199, + "acc_stderr": 0.028256660723360177, + "acc_norm": 0.5498392282958199, + "acc_norm_stderr": 0.028256660723360177 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.4876543209876543, + "acc_stderr": 0.027812262269327228, + "acc_norm": 0.4876543209876543, + "acc_norm_stderr": 0.027812262269327228 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.3723404255319149, + "acc_stderr": 0.028838921471251455, + "acc_norm": 0.3723404255319149, + "acc_norm_stderr": 0.028838921471251455 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.32920469361147325, + "acc_stderr": 0.012002091666902295, + "acc_norm": 0.32920469361147325, + "acc_norm_stderr": 0.012002091666902295 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.4375, + "acc_stderr": 0.030134614954403924, + "acc_norm": 0.4375, + "acc_norm_stderr": 0.030134614954403924 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.434640522875817, + "acc_stderr": 0.02005426920072646, + "acc_norm": 0.434640522875817, + "acc_norm_stderr": 0.02005426920072646 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.4636363636363636, + "acc_stderr": 0.047764491623961985, + "acc_norm": 0.4636363636363636, + "acc_norm_stderr": 0.047764491623961985 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.35918367346938773, + "acc_stderr": 0.030713560455108493, + "acc_norm": 0.35918367346938773, + "acc_norm_stderr": 0.030713560455108493 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.6019900497512438, + "acc_stderr": 0.03461199429040013, + "acc_norm": 0.6019900497512438, + "acc_norm_stderr": 0.03461199429040013 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.61, + "acc_stderr": 0.04902071300001974, + "acc_norm": 0.61, + "acc_norm_stderr": 0.04902071300001974 + }, + "harness|hendrycksTest-virology|5": { + 
"acc": 0.37349397590361444, + "acc_stderr": 0.03765845117168861, + "acc_norm": 0.37349397590361444, + "acc_norm_stderr": 0.03765845117168861 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.6432748538011696, + "acc_stderr": 0.03674013002860954, + "acc_norm": 0.6432748538011696, + "acc_norm_stderr": 0.03674013002860954 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.2998776009791922, + "mc1_stderr": 0.01604035296671363, + "mc2": 0.452817098826332, + "mc2_stderr": 0.014565353994630556 + }, + "all": { + "acc": 0.4380525445994556, + "acc_stderr": 0.03522455386284959, + "acc_norm": 0.44212715622628435, + "acc_norm_stderr": 0.0352111160788594, + "mc1": 0.2998776009791922, + "mc1_stderr": 0.01604035296671363, + "mc2": 0.452817098826332, + "mc2_stderr": 0.014565353994630556 + } + }, + "versions": { + "harness|arc:challenge|25": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + 
"harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "all": 0 + }, + "config_general": { + "model_name": "RoversX/llama-2-7b-hf-small-shards-Samantha-V1-SFT", + "model_sha": "c39cee3821269e7fdffa690c2d0836c74dfebd25", + "model_dtype": "4bit", + "lighteval_sha": "efe93333f9f25e7d48cc67a6bf362e6d576f727b", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null + }, + "config_tasks": { + "harness|arc:challenge": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + 
"harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task" + }, + "summary_tasks": { + "harness|arc:challenge|25": { + "hashes": { + "hash_examples": "17b0cae357c0259e", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "3722289b79076c44", + "hash_cont_tokens": "8210decc6ff6f7df" + }, + "truncated": 0, + "non-truncated": 4687, + "padded": 4687, + "non-padded": 0, + "effective_few_shots": 25.0, + "num_truncated_few_shots": 0 + }, + "harness|hellaswag|10": { + "hashes": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "ececd684171f1ef2", + "hash_cont_tokens": "b3b9e9017afa63af" + }, + "truncated": 0, + "non-truncated": 40168, + "padded": 40113, + "non-padded": 55, + "effective_few_shots": 10.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hashes": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "c54ff61ad0273dd7", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-anatomy|5": { + "hashes": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "be31a1e22aef5f90", + "hash_cont_tokens": "f11971a765cb609f" + }, + "truncated": 0, + "non-truncated": 540, + "padded": 540, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-astronomy|5": { + "hashes": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "277a7b1fad566940", + "hash_cont_tokens": "bf30e5d3f48250cb" + }, + "truncated": 0, + "non-truncated": 608, + "padded": 608, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-business_ethics|5": { + "hashes": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "ba552605bc116de5", + "hash_cont_tokens": "bc1dd9b2d995eb61" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hashes": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "428c7563d0b98ab9", + "hash_cont_tokens": "890a119624b3b935" + }, + "truncated": 0, + "non-truncated": 1060, + "padded": 1060, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + 
"harness|hendrycksTest-college_biology|5": { + "hashes": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "da036601573942e2", + "hash_cont_tokens": "875cde3af7a0ee14" + }, + "truncated": 0, + "non-truncated": 576, + "padded": 576, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_chemistry|5": { + "hashes": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "94e0196d6aded13d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_computer_science|5": { + "hashes": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "6e4d0f4a8d36690b", + "hash_cont_tokens": "ffc0fe414cdc4a83" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_mathematics|5": { + "hashes": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "614054d17109a25d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_medicine|5": { + "hashes": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "081bb2b524defd1c", + "hash_cont_tokens": "1f88b00d41957d82" + }, + "truncated": 0, + "non-truncated": 692, + "padded": 692, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_physics|5": { + "hashes": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "5421d9a1af86cbd4", + "hash_cont_tokens": "f7b8097afc16a47c" + }, + "truncated": 0, + "non-truncated": 408, + "padded": 408, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-computer_security|5": { + "hashes": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "5e6b70ecb333cf18", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hashes": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "c2ef11a87264ceed", + "hash_cont_tokens": "aa0e8bc655f2f641" + }, + "truncated": 0, + "non-truncated": 940, + "padded": 940, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-econometrics|5": { + "hashes": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "ecaccd912a4c3978", + "hash_cont_tokens": "bfb7e3c3c88313f1" + }, + "truncated": 0, + "non-truncated": 456, + "padded": 456, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hashes": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + 
"hash_input_tokens": "1590c84291399be8", + "hash_cont_tokens": "2425a3f084a591ef" + }, + "truncated": 0, + "non-truncated": 580, + "padded": 580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hashes": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "3269597f715b0da1", + "hash_cont_tokens": "f52691aef15a407b" + }, + "truncated": 0, + "non-truncated": 1512, + "padded": 1512, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-formal_logic|5": { + "hashes": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "a2800d20f3ab8d7c", + "hash_cont_tokens": "f515d598d9c21263" + }, + "truncated": 0, + "non-truncated": 504, + "padded": 504, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-global_facts|5": { + "hashes": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "94ed44b3772505ad", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_biology|5": { + "hashes": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "24423acb928db768", + "hash_cont_tokens": "bd85a4156a3613ee" + }, + "truncated": 0, + "non-truncated": 1240, + "padded": 1240, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hashes": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "831ff35c474e5cef", + "hash_cont_tokens": "a95c97af1c14e068" + }, + "truncated": 0, + "non-truncated": 812, + "padded": 812, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hashes": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "a20a96b44dcc5b30", + "hash_cont_tokens": "8abfedef914e33c9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hashes": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "5002f4ac8b1562ca", + "hash_cont_tokens": "674fc454bdc5ac93" + }, + "truncated": 0, + "non-truncated": 660, + "padded": 656, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_geography|5": { + "hashes": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "7c5547c7da5bc793", + "hash_cont_tokens": "03a5012b916274ea" + }, + "truncated": 0, + "non-truncated": 792, + "padded": 792, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hashes": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "f62991cb6a496b05", + "hash_cont_tokens": "a83effb8f76b7d7c" + }, + "truncated": 
0, + "non-truncated": 772, + "padded": 772, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hashes": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "4cef2aff6e3d59ed", + "hash_cont_tokens": "c583432ad27fcfe0" + }, + "truncated": 0, + "non-truncated": 1560, + "padded": 1560, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hashes": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "6e2577ea4082ed2b", + "hash_cont_tokens": "24f5dc613660300b" + }, + "truncated": 0, + "non-truncated": 1080, + "padded": 1080, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hashes": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "c5fc9aeb1079c8e4", + "hash_cont_tokens": "f47f041de50333b9" + }, + "truncated": 0, + "non-truncated": 952, + "padded": 952, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_physics|5": { + "hashes": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "555fc385cffa84ca", + "hash_cont_tokens": "ba2efcd283e938cc" + }, + "truncated": 0, + "non-truncated": 604, + "padded": 604, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hashes": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "febd23cbf9973b7f", + "hash_cont_tokens": "942069cd363844d9" + }, + "truncated": 0, + "non-truncated": 2180, + "padded": 2180, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hashes": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "400e55b56ee6fbd7", + "hash_cont_tokens": "955ed42b6f7fa019" + }, + "truncated": 0, + "non-truncated": 864, + "padded": 864, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hashes": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "c639cce12a46ebad", + "hash_cont_tokens": "cdd0b3dc06d933e5" + }, + "truncated": 0, + "non-truncated": 816, + "padded": 816, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hashes": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "b9762065cce6f3a6", + "hash_cont_tokens": "9a864184946033ac" + }, + "truncated": 0, + "non-truncated": 948, + "padded": 948, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_aging|5": { + "hashes": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "541a75f071dcf579", + "hash_cont_tokens": "142a4a8a1138a214" + }, + "truncated": 0, + "non-truncated": 892, + "padded": 892, + "non-padded": 0, + "effective_few_shots": 5.0, + 
"num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_sexuality|5": { + "hashes": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "04269e5c5a257dd9", + "hash_cont_tokens": "bc54813e809b796d" + }, + "truncated": 0, + "non-truncated": 524, + "padded": 524, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-international_law|5": { + "hashes": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "d93ba9d9d38e4397", + "hash_cont_tokens": "dc45b45fcda18e5d" + }, + "truncated": 0, + "non-truncated": 484, + "padded": 484, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-jurisprudence|5": { + "hashes": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "9eeaccd2698b4f5a", + "hash_cont_tokens": "e3a8cd951b6e3469" + }, + "truncated": 0, + "non-truncated": 432, + "padded": 432, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hashes": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "b4f08f544f2b7576", + "hash_cont_tokens": "1e80dbd30f6453d5" + }, + "truncated": 0, + "non-truncated": 652, + "padded": 648, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-machine_learning|5": { + "hashes": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "900c2a51f1174b9f", + "hash_cont_tokens": "9b37da7777378ca9" + }, + "truncated": 0, + "non-truncated": 448, + "padded": 448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-management|5": { + "hashes": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "6b36efb4689c6eca", + "hash_cont_tokens": "a01d6d39a83c4597" + }, + "truncated": 0, + "non-truncated": 412, + "padded": 412, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-marketing|5": { + "hashes": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "2aaac78a0cfed47a", + "hash_cont_tokens": "6aeaed4d823c98aa" + }, + "truncated": 0, + "non-truncated": 936, + "padded": 936, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-medical_genetics|5": { + "hashes": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "886ca823b41c094a", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-miscellaneous|5": { + "hashes": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "72fd71de7675e7d0", + "hash_cont_tokens": "9b0ab02a64603081" + }, + "truncated": 0, + "non-truncated": 3132, + "padded": 3132, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_disputes|5": { + "hashes": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + 
"hash_input_tokens": "f3ca0dd8e7a1eb09", + "hash_cont_tokens": "8badf768f7b0467a" + }, + "truncated": 0, + "non-truncated": 1384, + "padded": 1354, + "non-padded": 30, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hashes": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "3e793631e951f23c", + "hash_cont_tokens": "32ae620376b2bbba" + }, + "truncated": 0, + "non-truncated": 3580, + "padded": 3580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-nutrition|5": { + "hashes": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "59753c2144ea93af", + "hash_cont_tokens": "3071def75bacc404" + }, + "truncated": 0, + "non-truncated": 1224, + "padded": 1224, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-philosophy|5": { + "hashes": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "bd8d3dbed15a8c34", + "hash_cont_tokens": "9f6ff69d23a48783" + }, + "truncated": 0, + "non-truncated": 1244, + "padded": 1244, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-prehistory|5": { + "hashes": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "3573cd87facbb7c5", + "hash_cont_tokens": "de469d2b981e32a3" + }, + "truncated": 0, + "non-truncated": 1296, + "padded": 1296, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_accounting|5": { + "hashes": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "17e721bc1a7cbb47", + "hash_cont_tokens": "c46f74d2dfc7b13b" + }, + "truncated": 0, + "non-truncated": 1128, + "padded": 1128, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_law|5": { + "hashes": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "c9f7583fff66d361", + "hash_cont_tokens": "2e590029ef41fbcd" + }, + "truncated": 0, + "non-truncated": 6136, + "padded": 6136, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_medicine|5": { + "hashes": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "40a933f829116f8d", + "hash_cont_tokens": "fe35cfa9c6ca802e" + }, + "truncated": 0, + "non-truncated": 1088, + "padded": 1088, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_psychology|5": { + "hashes": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "0dfb73a8eb3f692c", + "hash_cont_tokens": "f020fbddf72c8652" + }, + "truncated": 0, + "non-truncated": 2448, + "padded": 2448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-public_relations|5": { + "hashes": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "1710c6ba4c9f3cbd", + "hash_cont_tokens": "568f585a259965c1" + }, + "truncated": 0, + "non-truncated": 440, + "padded": 
440, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-security_studies|5": { + "hashes": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "32a03f1f22a6e103", + "hash_cont_tokens": "cc6fd7cccd64cd5d" + }, + "truncated": 0, + "non-truncated": 980, + "padded": 980, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-sociology|5": { + "hashes": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "828999f7624cbe7e", + "hash_cont_tokens": "c3a3bdfd177eed5b" + }, + "truncated": 0, + "non-truncated": 804, + "padded": 804, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hashes": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "42054621e718dbee", + "hash_cont_tokens": "2568d0e8e36fa959" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-virology|5": { + "hashes": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "6c4f0aa4dc859c04", + "hash_cont_tokens": "926cf60b0891f374" + }, + "truncated": 0, + "non-truncated": 664, + "padded": 664, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-world_religions|5": { + "hashes": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "6c75d44e092ff24f", + "hash_cont_tokens": "c525a5de974c1ea3" + }, + "truncated": 0, + "non-truncated": 684, + "padded": 684, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|truthfulqa:mc|0": { + "hashes": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "2738d7ed7075faa7", + "hash_cont_tokens": "c014154380b74b9e" + }, + "truncated": 0, + "non-truncated": 9996, + "padded": 9996, + "non-padded": 0, + "effective_few_shots": 0.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "d84d18e9a963753d", + "hash_full_prompts": "12b540783521a8e6", + "hash_input_tokens": "5c73a7dce6ccf737", + "hash_cont_tokens": "fb1646e2bdd5fc38" + }, + "total_evaluation_time_secondes": "19882.240529060364", + "truncated": 0, + "non-truncated": 111019, + "padded": 110926, + "non-padded": 93, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/RoversX/llama-2-7b-hf-small-shards-Samantha-V1-SFT/results_2023-10-13T04-33-28.538192.json b/eval-results/RoversX/llama-2-7b-hf-small-shards-Samantha-V1-SFT/results_2023-10-13T04-33-28.538192.json new file mode 100644 index 0000000000000000000000000000000000000000..c4043913d7565069e83e94ca82c77835ac5dc52d --- /dev/null +++ b/eval-results/RoversX/llama-2-7b-hf-small-shards-Samantha-V1-SFT/results_2023-10-13T04-33-28.538192.json @@ -0,0 +1,107 @@ +{ + "config_general": { + "model_name": "RoversX/llama-2-7b-hf-small-shards-Samantha-V1-SFT", + "model_sha": "c39cee3821269e7fdffa690c2d0836c74dfebd25", + "model_size": "3.57 GB", + "model_dtype": "4bit", + "lighteval_sha": "0f318ecf002208468154899217b3ba7c6ae09374", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + 
"override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|drop|3": { + "em": 0.0012583892617449664, + "em_stderr": 0.00036305608931188796, + "f1": 0.052196937919463185, + "f1_stderr": 0.0012732861194066877 + }, + "harness|gsm8k|5": { + "acc": 0.06368460955269144, + "acc_stderr": 0.006726213078805692 + }, + "harness|winogrande|5": { + "acc": 0.7379636937647988, + "acc_stderr": 0.012358944431637557 + }, + "all": { + "em": 0.0012583892617449664, + "em_stderr": 0.00036305608931188796, + "f1": 0.052196937919463185, + "f1_stderr": 0.0012732861194066877, + "acc": 0.4008241516587451, + "acc_stderr": 0.009542578755221624 + } + }, + "versions": { + "harness|drop|3": 1, + "harness|gsm8k|5": 0, + "harness|winogrande|5": 0, + "all": 0 + }, + "config_tasks": { + "harness|drop": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|drop|3": { + "hashes": { + "hash_examples": "1d27416e8324e9a3", + "hash_full_prompts": "a5513ff9a741b385", + "hash_input_tokens": "42076f0efbb50aa6", + "hash_cont_tokens": "4407ac68d4b5a4a2" + }, + "truncated": 3, + "non-truncated": 9533, + "padded": 0, + "non-padded": 9536, + "effective_few_shots": 3.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "bda342e47b5099b2", + "hash_cont_tokens": "bcc0174fa16e79d4" + }, + "truncated": 0, + "non-truncated": 1319, + "padded": 0, + "non-padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "c0bedf98cb040854", + "hash_cont_tokens": "f08975ad6f2d5864" + }, + "truncated": 0, + "non-truncated": 2534, + "padded": 2432, + "non-padded": 102, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "9b4d8993161e637d", + "hash_full_prompts": "08215e527b7e60a5", + "hash_input_tokens": "a12f3e3c934bd78b", + "hash_cont_tokens": "60c8258398819585" + }, + "total_evaluation_time_secondes": "10260.226199150085", + "truncated": 3, + "non-truncated": 13386, + "padded": 2432, + "non-padded": 10957, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/S4sch/zephyr-neural-chat-frankenmerge11b/results_2023-12-04T17-40-46.451568.json b/eval-results/S4sch/zephyr-neural-chat-frankenmerge11b/results_2023-12-04T17-40-46.451568.json new file mode 100644 index 0000000000000000000000000000000000000000..8557e0108259bc726bdbf3ffcd980d8fed68cdf5 --- /dev/null +++ b/eval-results/S4sch/zephyr-neural-chat-frankenmerge11b/results_2023-12-04T17-40-46.451568.json @@ -0,0 +1,1409 @@ +{ + "config_general": { + "lighteval_sha": "0e4607eff593f6f842aeaa0e5fa6760f58b9d1e9", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "", + "start_time": 155398.176782479, + "end_time": 166511.344114317, + "total_evaluation_time_secondes": "11113.16733183802", + "model_name": "S4sch/zephyr-neural-chat-frankenmerge11b", + "model_sha": "f915831e904e0dcda760873aa16a35daf5ac9e6d", + "model_dtype": "torch.float16", + "model_size": "22.0 GB" + }, + "results": { + "harness|arc:challenge|25": { + "acc": 0.5930034129692833, + "acc_stderr": 0.014356399418009124, + "acc_norm": 0.6151877133105802, + 
"acc_norm_stderr": 0.014218371065251109 + }, + "harness|hellaswag|10": { + "acc": 0.6596295558653654, + "acc_stderr": 0.004728653488866922, + "acc_norm": 0.8408683529177454, + "acc_norm_stderr": 0.0036505121583062794 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.25, + "acc_stderr": 0.04351941398892446, + "acc_norm": 0.25, + "acc_norm_stderr": 0.04351941398892446 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.6, + "acc_stderr": 0.04232073695151589, + "acc_norm": 0.6, + "acc_norm_stderr": 0.04232073695151589 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.6644736842105263, + "acc_stderr": 0.03842498559395269, + "acc_norm": 0.6644736842105263, + "acc_norm_stderr": 0.03842498559395269 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.54, + "acc_stderr": 0.05009082659620332, + "acc_norm": 0.54, + "acc_norm_stderr": 0.05009082659620332 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.6188679245283019, + "acc_stderr": 0.02989060968628665, + "acc_norm": 0.6188679245283019, + "acc_norm_stderr": 0.02989060968628665 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.7430555555555556, + "acc_stderr": 0.03653946969442099, + "acc_norm": 0.7430555555555556, + "acc_norm_stderr": 0.03653946969442099 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.44, + "acc_stderr": 0.04988876515698589, + "acc_norm": 0.44, + "acc_norm_stderr": 0.04988876515698589 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.49, + "acc_stderr": 0.05024183937956912, + "acc_norm": 0.49, + "acc_norm_stderr": 0.05024183937956912 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.31, + "acc_stderr": 0.04648231987117316, + "acc_norm": 0.31, + "acc_norm_stderr": 0.04648231987117316 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.6242774566473989, + "acc_stderr": 0.03692820767264866, + "acc_norm": 0.6242774566473989, + "acc_norm_stderr": 0.03692820767264866 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.46078431372549017, + "acc_stderr": 0.04959859966384181, + "acc_norm": 0.46078431372549017, + "acc_norm_stderr": 0.04959859966384181 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.73, + "acc_stderr": 0.04461960433384741, + "acc_norm": 0.73, + "acc_norm_stderr": 0.04461960433384741 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.548936170212766, + "acc_stderr": 0.032529096196131965, + "acc_norm": 0.548936170212766, + "acc_norm_stderr": 0.032529096196131965 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.42105263157894735, + "acc_stderr": 0.046446020912223177, + "acc_norm": 0.42105263157894735, + "acc_norm_stderr": 0.046446020912223177 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.4689655172413793, + "acc_stderr": 0.04158632762097828, + "acc_norm": 0.4689655172413793, + "acc_norm_stderr": 0.04158632762097828 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.4021164021164021, + "acc_stderr": 0.02525303255499769, + "acc_norm": 0.4021164021164021, + "acc_norm_stderr": 0.02525303255499769 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.4523809523809524, + "acc_stderr": 0.044518079590553275, + "acc_norm": 0.4523809523809524, + "acc_norm_stderr": 0.044518079590553275 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.44, + "acc_stderr": 0.049888765156985884, + "acc_norm": 0.44, + "acc_norm_stderr": 0.049888765156985884 + }, + "harness|hendrycksTest-high_school_biology|5": { + 
"acc": 0.7225806451612903, + "acc_stderr": 0.025470196835900055, + "acc_norm": 0.7225806451612903, + "acc_norm_stderr": 0.025470196835900055 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.4630541871921182, + "acc_stderr": 0.035083705204426656, + "acc_norm": 0.4630541871921182, + "acc_norm_stderr": 0.035083705204426656 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.72, + "acc_stderr": 0.04512608598542127, + "acc_norm": 0.72, + "acc_norm_stderr": 0.04512608598542127 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.7515151515151515, + "acc_stderr": 0.033744026441394036, + "acc_norm": 0.7515151515151515, + "acc_norm_stderr": 0.033744026441394036 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.7828282828282829, + "acc_stderr": 0.02937661648494563, + "acc_norm": 0.7828282828282829, + "acc_norm_stderr": 0.02937661648494563 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.8652849740932642, + "acc_stderr": 0.024639789097709443, + "acc_norm": 0.8652849740932642, + "acc_norm_stderr": 0.024639789097709443 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.6153846153846154, + "acc_stderr": 0.02466674491518721, + "acc_norm": 0.6153846153846154, + "acc_norm_stderr": 0.02466674491518721 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.3592592592592593, + "acc_stderr": 0.029252905927251976, + "acc_norm": 0.3592592592592593, + "acc_norm_stderr": 0.029252905927251976 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.6638655462184874, + "acc_stderr": 0.03068473711513536, + "acc_norm": 0.6638655462184874, + "acc_norm_stderr": 0.03068473711513536 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.33774834437086093, + "acc_stderr": 0.0386155754625517, + "acc_norm": 0.33774834437086093, + "acc_norm_stderr": 0.0386155754625517 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.8238532110091743, + "acc_stderr": 0.01633288239343136, + "acc_norm": 0.8238532110091743, + "acc_norm_stderr": 0.01633288239343136 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.4583333333333333, + "acc_stderr": 0.03398110890294636, + "acc_norm": 0.4583333333333333, + "acc_norm_stderr": 0.03398110890294636 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.7941176470588235, + "acc_stderr": 0.028379449451588667, + "acc_norm": 0.7941176470588235, + "acc_norm_stderr": 0.028379449451588667 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.7763713080168776, + "acc_stderr": 0.027123298205229966, + "acc_norm": 0.7763713080168776, + "acc_norm_stderr": 0.027123298205229966 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.7040358744394619, + "acc_stderr": 0.030636591348699803, + "acc_norm": 0.7040358744394619, + "acc_norm_stderr": 0.030636591348699803 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.6717557251908397, + "acc_stderr": 0.04118438565806298, + "acc_norm": 0.6717557251908397, + "acc_norm_stderr": 0.04118438565806298 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.7768595041322314, + "acc_stderr": 0.03800754475228733, + "acc_norm": 0.7768595041322314, + "acc_norm_stderr": 0.03800754475228733 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.7592592592592593, + "acc_stderr": 0.04133119440243839, + "acc_norm": 0.7592592592592593, + "acc_norm_stderr": 0.04133119440243839 + }, + 
"harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.7177914110429447, + "acc_stderr": 0.03536117886664742, + "acc_norm": 0.7177914110429447, + "acc_norm_stderr": 0.03536117886664742 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.4375, + "acc_stderr": 0.04708567521880525, + "acc_norm": 0.4375, + "acc_norm_stderr": 0.04708567521880525 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.7766990291262136, + "acc_stderr": 0.04123553189891431, + "acc_norm": 0.7766990291262136, + "acc_norm_stderr": 0.04123553189891431 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.8589743589743589, + "acc_stderr": 0.022801382534597552, + "acc_norm": 0.8589743589743589, + "acc_norm_stderr": 0.022801382534597552 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.72, + "acc_stderr": 0.04512608598542128, + "acc_norm": 0.72, + "acc_norm_stderr": 0.04512608598542128 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.789272030651341, + "acc_stderr": 0.014583812465862538, + "acc_norm": 0.789272030651341, + "acc_norm_stderr": 0.014583812465862538 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.661849710982659, + "acc_stderr": 0.025469770149400175, + "acc_norm": 0.661849710982659, + "acc_norm_stderr": 0.025469770149400175 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.3843575418994413, + "acc_stderr": 0.016269088663959402, + "acc_norm": 0.3843575418994413, + "acc_norm_stderr": 0.016269088663959402 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.6503267973856209, + "acc_stderr": 0.027305308076274695, + "acc_norm": 0.6503267973856209, + "acc_norm_stderr": 0.027305308076274695 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.6784565916398714, + "acc_stderr": 0.026527724079528872, + "acc_norm": 0.6784565916398714, + "acc_norm_stderr": 0.026527724079528872 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.7006172839506173, + "acc_stderr": 0.025483115601195455, + "acc_norm": 0.7006172839506173, + "acc_norm_stderr": 0.025483115601195455 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.48936170212765956, + "acc_stderr": 0.02982074719142248, + "acc_norm": 0.48936170212765956, + "acc_norm_stderr": 0.02982074719142248 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.4556714471968709, + "acc_stderr": 0.01271994954303221, + "acc_norm": 0.4556714471968709, + "acc_norm_stderr": 0.01271994954303221 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.6213235294117647, + "acc_stderr": 0.029465133639776132, + "acc_norm": 0.6213235294117647, + "acc_norm_stderr": 0.029465133639776132 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.6421568627450981, + "acc_stderr": 0.019393058402355435, + "acc_norm": 0.6421568627450981, + "acc_norm_stderr": 0.019393058402355435 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.6363636363636364, + "acc_stderr": 0.04607582090719976, + "acc_norm": 0.6363636363636364, + "acc_norm_stderr": 0.04607582090719976 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.6448979591836734, + "acc_stderr": 0.03063565515038764, + "acc_norm": 0.6448979591836734, + "acc_norm_stderr": 0.03063565515038764 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.8308457711442786, + "acc_stderr": 0.02650859065623328, + "acc_norm": 0.8308457711442786, + "acc_norm_stderr": 0.02650859065623328 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.83, + "acc_stderr": 0.03775251680686371, + "acc_norm": 0.83, + 
"acc_norm_stderr": 0.03775251680686371 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.5120481927710844, + "acc_stderr": 0.03891364495835816, + "acc_norm": 0.5120481927710844, + "acc_norm_stderr": 0.03891364495835816 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.7719298245614035, + "acc_stderr": 0.032180937956023566, + "acc_norm": 0.7719298245614035, + "acc_norm_stderr": 0.032180937956023566 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.4418604651162791, + "mc1_stderr": 0.017384767478986218, + "mc2": 0.6062876441761156, + "mc2_stderr": 0.0158161206163554 + }, + "harness|winogrande|5": { + "acc": 0.7624309392265194, + "acc_stderr": 0.011961298905803159 + }, + "harness|gsm8k|5": { + "acc": 0.07429871114480667, + "acc_stderr": 0.007223844172845574 + }, + "all": { + "acc": 0.6090298979840253, + "acc_stderr": 0.032809646949895625, + "acc_norm": 0.618940969899117, + "acc_norm_stderr": 0.033576053409858746, + "mc1": 0.4418604651162791, + "mc1_stderr": 0.017384767478986218, + "mc2": 0.6062876441761156, + "mc2_stderr": 0.0158161206163554 + } + }, + "versions": { + "all": 0, + "harness|arc:challenge|25": 0, + "harness|gsm8k|5": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + 
"harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "harness|winogrande|5": 0 + }, + "config_tasks": { + "harness|arc:challenge": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + 
"harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|arc:challenge|25": { + "hashes": { + "hash_examples": "17b0cae357c0259e", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "9bcd0d1d37471713", + "hash_cont_tokens": "289aa98c400841d8" + }, + "truncated": 0, + "non_truncated": 1172, + "padded": 4670, + "non_padded": 17, + "effective_few_shots": 25.0, + "num_truncated_few_shots": 0 + }, + "harness|hellaswag|10": { + "hashes": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "80b8c6d79740318e", + "hash_cont_tokens": "ac460260c3e6efc9" + }, + "truncated": 0, + "non_truncated": 10042, + "padded": 40101, + "non_padded": 67, + "effective_few_shots": 10.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hashes": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "b813d36287c6556c", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-anatomy|5": { + "hashes": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "09dc2380497f7a47", + "hash_cont_tokens": "a52a4f60d98cbe5c" + }, + "truncated": 0, + "non_truncated": 135, + "padded": 540, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-astronomy|5": { + "hashes": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "68ca3220b0fdd1f3", + "hash_cont_tokens": "10f7d8eeba97841d" + }, + "truncated": 0, + "non_truncated": 152, + "padded": 608, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-business_ethics|5": { + "hashes": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "bd14ef1320de241e", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hashes": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "d96186ab98017c43", + "hash_cont_tokens": "edef9975ba9165b5" + }, + "truncated": 0, + "non_truncated": 265, + "padded": 1060, + "non_padded": 0, + "effective_few_shots": 5.0, + 
"num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_biology|5": { + "hashes": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "424136b34e95b200", + "hash_cont_tokens": "0aa103ec6602280b" + }, + "truncated": 0, + "non_truncated": 144, + "padded": 576, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_chemistry|5": { + "hashes": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "8dd8b80e336bbe54", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_computer_science|5": { + "hashes": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "145d4cef8ca2261d", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_mathematics|5": { + "hashes": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "561995d32d2b25c4", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_medicine|5": { + "hashes": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "6a258a9d4418599c", + "hash_cont_tokens": "1979021dbc698754" + }, + "truncated": 0, + "non_truncated": 173, + "padded": 692, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_physics|5": { + "hashes": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "fa5e0d5b5f97b66a", + "hash_cont_tokens": "7cf7fe2bab00acbd" + }, + "truncated": 0, + "non_truncated": 102, + "padded": 408, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-computer_security|5": { + "hashes": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "07d27397edfae492", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hashes": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "da5e6c3c8eb17da6", + "hash_cont_tokens": "903f64eed2b0d217" + }, + "truncated": 0, + "non_truncated": 235, + "padded": 940, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-econometrics|5": { + "hashes": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "f6ba8e358bdb523e", + "hash_cont_tokens": "721ae6c5302c4bf2" + }, + "truncated": 0, + "non_truncated": 114, + "padded": 456, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hashes": { + "hash_examples": "e98f51780c674d7e", + 
"hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "b2459da4c5ca8590", + "hash_cont_tokens": "15a738960ed3e587" + }, + "truncated": 0, + "non_truncated": 145, + "padded": 575, + "non_padded": 5, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hashes": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "0b969d9ad706a13a", + "hash_cont_tokens": "c96470462fc71683" + }, + "truncated": 0, + "non_truncated": 378, + "padded": 1512, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-formal_logic|5": { + "hashes": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "02bc3eb5f90da86e", + "hash_cont_tokens": "0e1ce025c9d6ee7e" + }, + "truncated": 0, + "non_truncated": 126, + "padded": 504, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-global_facts|5": { + "hashes": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "3d5106918bcbeb43", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_biology|5": { + "hashes": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "7b089392db2dabbd", + "hash_cont_tokens": "e34d57f7d3c4ca16" + }, + "truncated": 0, + "non_truncated": 310, + "padded": 1240, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hashes": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "ba90b2ffed1c067d", + "hash_cont_tokens": "e8482d44df4b3740" + }, + "truncated": 0, + "non_truncated": 203, + "padded": 812, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hashes": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "60eeec309ef0717f", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hashes": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "5e5e8bf3808e0ead", + "hash_cont_tokens": "d63e679a49418339" + }, + "truncated": 0, + "non_truncated": 165, + "padded": 656, + "non_padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_geography|5": { + "hashes": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "4da9b741d4e7ea78", + "hash_cont_tokens": "d78483e286d06f1a" + }, + "truncated": 0, + "non_truncated": 198, + "padded": 792, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hashes": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "acb4bc872ac86ed7", + "hash_cont_tokens": 
"691cdff71ff5fe57" + }, + "truncated": 0, + "non_truncated": 193, + "padded": 772, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hashes": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "840fc6403eb69ab0", + "hash_cont_tokens": "d5ad4c5bdca967ad" + }, + "truncated": 0, + "non_truncated": 390, + "padded": 1560, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hashes": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "3629a7f2cd17faeb", + "hash_cont_tokens": "8f631ca5687dd0d4" + }, + "truncated": 0, + "non_truncated": 270, + "padded": 1080, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hashes": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "6846f684260e3997", + "hash_cont_tokens": "7321048a28451473" + }, + "truncated": 0, + "non_truncated": 238, + "padded": 952, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_physics|5": { + "hashes": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "85aee25d6bdad94a", + "hash_cont_tokens": "bb137581f269861c" + }, + "truncated": 0, + "non_truncated": 151, + "padded": 604, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hashes": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "290b66d6d666a35f", + "hash_cont_tokens": "b455cab2675bd863" + }, + "truncated": 0, + "non_truncated": 545, + "padded": 2180, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hashes": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "a77a7668b437bc82", + "hash_cont_tokens": "1b3196fec7e58037" + }, + "truncated": 0, + "non_truncated": 216, + "padded": 864, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hashes": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "63548c7fa9ba7a78", + "hash_cont_tokens": "a331dedc2aa01b3e" + }, + "truncated": 0, + "non_truncated": 204, + "padded": 816, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hashes": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "83c5da18bfa50812", + "hash_cont_tokens": "d0fbe030b8c8c2bf" + }, + "truncated": 0, + "non_truncated": 237, + "padded": 948, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_aging|5": { + "hashes": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "bebbd11f22006685", + "hash_cont_tokens": "1dd29c3755494850" + }, + "truncated": 0, + "non_truncated": 223, + "padded": 892, + "non_padded": 0, + 
"effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_sexuality|5": { + "hashes": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "7b85ee9b8ee54f4f", + "hash_cont_tokens": "c85573f663c10691" + }, + "truncated": 0, + "non_truncated": 131, + "padded": 524, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-international_law|5": { + "hashes": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "7bfc55ab7065943e", + "hash_cont_tokens": "d263804ba918154f" + }, + "truncated": 0, + "non_truncated": 121, + "padded": 484, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-jurisprudence|5": { + "hashes": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "69573f1675e053c6", + "hash_cont_tokens": "581986691a84ece8" + }, + "truncated": 0, + "non_truncated": 108, + "padded": 432, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hashes": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "552324ef20094bdc", + "hash_cont_tokens": "55a858b28bbda458" + }, + "truncated": 0, + "non_truncated": 163, + "padded": 652, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-machine_learning|5": { + "hashes": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "96449357a7318905", + "hash_cont_tokens": "e99d3d3efd4ac7a3" + }, + "truncated": 0, + "non_truncated": 112, + "padded": 448, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-management|5": { + "hashes": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "3b849249168e3b88", + "hash_cont_tokens": "13d9dc56bca34726" + }, + "truncated": 0, + "non_truncated": 103, + "padded": 412, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-marketing|5": { + "hashes": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "af0e186f2756b70d", + "hash_cont_tokens": "2700ea26933916a2" + }, + "truncated": 0, + "non_truncated": 234, + "padded": 936, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-medical_genetics|5": { + "hashes": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "9f6a6de16509b6d9", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-miscellaneous|5": { + "hashes": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "9194406d589f7c10", + "hash_cont_tokens": "7bf4341c79587250" + }, + "truncated": 0, + "non_truncated": 783, + "padded": 3132, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_disputes|5": { + "hashes": { + "hash_examples": "3171c13ba3c594c4", + 
"hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "769486efc74d9f8e", + "hash_cont_tokens": "38a48e9de6976f00" + }, + "truncated": 0, + "non_truncated": 346, + "padded": 1384, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hashes": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "a90fd4dd90959dad", + "hash_cont_tokens": "761c4dc187689d89" + }, + "truncated": 0, + "non_truncated": 895, + "padded": 3580, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-nutrition|5": { + "hashes": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "1a3b843e66efd29b", + "hash_cont_tokens": "65005bd7d6f6012a" + }, + "truncated": 0, + "non_truncated": 306, + "padded": 1224, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-philosophy|5": { + "hashes": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "09820001a3d00013", + "hash_cont_tokens": "0b47934fb6314dec" + }, + "truncated": 0, + "non_truncated": 311, + "padded": 1244, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-prehistory|5": { + "hashes": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "7c4ec364ce2768c7", + "hash_cont_tokens": "3f20acd855ee0a29" + }, + "truncated": 0, + "non_truncated": 324, + "padded": 1296, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_accounting|5": { + "hashes": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "ced0534574d0ae3f", + "hash_cont_tokens": "8f122ba881355d4b" + }, + "truncated": 0, + "non_truncated": 282, + "padded": 1128, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_law|5": { + "hashes": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "bcbdbbde22ec73e3", + "hash_cont_tokens": "90d5df417c4d3fd3" + }, + "truncated": 0, + "non_truncated": 1534, + "padded": 6136, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_medicine|5": { + "hashes": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "c54d753563114d45", + "hash_cont_tokens": "4a2d2988884f7f70" + }, + "truncated": 0, + "non_truncated": 272, + "padded": 1088, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_psychology|5": { + "hashes": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "b75dc55c0e32fa52", + "hash_cont_tokens": "e0a952cb8a9c81de" + }, + "truncated": 0, + "non_truncated": 612, + "padded": 2448, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-public_relations|5": { + "hashes": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "5ccdc8ec8db99622", + "hash_cont_tokens": "1fa77a8dff3922b8" + }, + "truncated": 0, + 
"non_truncated": 110, + "padded": 440, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-security_studies|5": { + "hashes": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "ca8497342e5b1d57", + "hash_cont_tokens": "81fc9cb3cbdd52db" + }, + "truncated": 0, + "non_truncated": 245, + "padded": 980, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-sociology|5": { + "hashes": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "069c76424fbd3dab", + "hash_cont_tokens": "2a0493252ed2cf43" + }, + "truncated": 0, + "non_truncated": 201, + "padded": 804, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hashes": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "a7e393a626169576", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-virology|5": { + "hashes": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "bf99dc973e3a650d", + "hash_cont_tokens": "5ab892d003b00c98" + }, + "truncated": 0, + "non_truncated": 166, + "padded": 664, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-world_religions|5": { + "hashes": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "1761cfaf21797065", + "hash_cont_tokens": "15a5e5dbdfbb8568" + }, + "truncated": 0, + "non_truncated": 171, + "padded": 684, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|truthfulqa:mc|0": { + "hashes": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "298b43914bbdf4ca", + "hash_cont_tokens": "5a8d4bb398b1c3c0" + }, + "truncated": 0, + "non_truncated": 817, + "padded": 9996, + "non_padded": 0, + "effective_few_shots": 0.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "31aa3477d959f771", + "hash_cont_tokens": "618558fb93c0f288" + }, + "truncated": 0, + "non_truncated": 1267, + "padded": 2534, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "6af0ae8cfe684f50", + "hash_cont_tokens": "5b3d8e3112fb3bc3" + }, + "truncated": 0, + "non_truncated": 1319, + "padded": 0, + "non_padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "3b7fa57a057f9415", + "hash_full_prompts": "63615fc50fc9417c", + "hash_input_tokens": "9c04e828ae29cacc", + "hash_cont_tokens": "2ab4d9dd35e6654e" + }, + "truncated": 0, + "non_truncated": 28659, + "padded": 113460, + "non_padded": 1412, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/SLAM-group/NewHope/results_2023-08-02T16-20-26.294433.json 
b/eval-results/SLAM-group/NewHope/results_2023-08-02T16-20-26.294433.json new file mode 100644 index 0000000000000000000000000000000000000000..0a14c202f75824a25720c80257d132753a3d6bdc --- /dev/null +++ b/eval-results/SLAM-group/NewHope/results_2023-08-02T16-20-26.294433.json @@ -0,0 +1,1365 @@ +{ + "results": { + "harness|arc:challenge|25": { + "acc": 0.5767918088737202, + "acc_stderr": 0.014438036220848022, + "acc_norm": 0.6092150170648464, + "acc_norm_stderr": 0.014258563880513782 + }, + "harness|hellaswag|10": { + "acc": 0.6366261700856403, + "acc_stderr": 0.004799882248494812, + "acc_norm": 0.8399721171081458, + "acc_norm_stderr": 0.003658826208101608 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.36, + "acc_stderr": 0.04824181513244218, + "acc_norm": 0.36, + "acc_norm_stderr": 0.04824181513244218 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.43703703703703706, + "acc_stderr": 0.04284958639753399, + "acc_norm": 0.43703703703703706, + "acc_norm_stderr": 0.04284958639753399 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.5328947368421053, + "acc_stderr": 0.040601270352363966, + "acc_norm": 0.5328947368421053, + "acc_norm_stderr": 0.040601270352363966 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.54, + "acc_stderr": 0.05009082659620332, + "acc_norm": 0.54, + "acc_norm_stderr": 0.05009082659620332 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.5811320754716981, + "acc_stderr": 0.03036505082911521, + "acc_norm": 0.5811320754716981, + "acc_norm_stderr": 0.03036505082911521 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.5763888888888888, + "acc_stderr": 0.04132125019723369, + "acc_norm": 0.5763888888888888, + "acc_norm_stderr": 0.04132125019723369 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.43, + "acc_stderr": 0.04975698519562428, + "acc_norm": 0.43, + "acc_norm_stderr": 0.04975698519562428 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.44, + "acc_stderr": 0.04988876515698589, + "acc_norm": 0.44, + "acc_norm_stderr": 0.04988876515698589 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.34, + "acc_stderr": 0.047609522856952344, + "acc_norm": 0.34, + "acc_norm_stderr": 0.047609522856952344 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.5260115606936416, + "acc_stderr": 0.038073017265045125, + "acc_norm": 0.5260115606936416, + "acc_norm_stderr": 0.038073017265045125 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.2549019607843137, + "acc_stderr": 0.0433643270799318, + "acc_norm": 0.2549019607843137, + "acc_norm_stderr": 0.0433643270799318 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.67, + "acc_stderr": 0.047258156262526094, + "acc_norm": 0.67, + "acc_norm_stderr": 0.047258156262526094 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.4595744680851064, + "acc_stderr": 0.03257901482099835, + "acc_norm": 0.4595744680851064, + "acc_norm_stderr": 0.03257901482099835 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.2631578947368421, + "acc_stderr": 0.04142439719489361, + "acc_norm": 0.2631578947368421, + "acc_norm_stderr": 0.04142439719489361 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.4689655172413793, + "acc_stderr": 0.04158632762097828, + "acc_norm": 0.4689655172413793, + "acc_norm_stderr": 0.04158632762097828 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.3148148148148148, + "acc_stderr": 0.023919984164047732, + "acc_norm": 
0.3148148148148148, + "acc_norm_stderr": 0.023919984164047732 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.35714285714285715, + "acc_stderr": 0.042857142857142816, + "acc_norm": 0.35714285714285715, + "acc_norm_stderr": 0.042857142857142816 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.38, + "acc_stderr": 0.04878317312145633, + "acc_norm": 0.38, + "acc_norm_stderr": 0.04878317312145633 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.6870967741935484, + "acc_stderr": 0.02637756702864586, + "acc_norm": 0.6870967741935484, + "acc_norm_stderr": 0.02637756702864586 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.39901477832512317, + "acc_stderr": 0.03445487686264715, + "acc_norm": 0.39901477832512317, + "acc_norm_stderr": 0.03445487686264715 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.54, + "acc_stderr": 0.05009082659620332, + "acc_norm": 0.54, + "acc_norm_stderr": 0.05009082659620332 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.6606060606060606, + "acc_stderr": 0.03697442205031595, + "acc_norm": 0.6606060606060606, + "acc_norm_stderr": 0.03697442205031595 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.7070707070707071, + "acc_stderr": 0.032424979581788166, + "acc_norm": 0.7070707070707071, + "acc_norm_stderr": 0.032424979581788166 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.8341968911917098, + "acc_stderr": 0.026839845022314415, + "acc_norm": 0.8341968911917098, + "acc_norm_stderr": 0.026839845022314415 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.5256410256410257, + "acc_stderr": 0.025317649726448656, + "acc_norm": 0.5256410256410257, + "acc_norm_stderr": 0.025317649726448656 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.32592592592592595, + "acc_stderr": 0.02857834836547307, + "acc_norm": 0.32592592592592595, + "acc_norm_stderr": 0.02857834836547307 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.6134453781512605, + "acc_stderr": 0.03163145807552379, + "acc_norm": 0.6134453781512605, + "acc_norm_stderr": 0.03163145807552379 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.32450331125827814, + "acc_stderr": 0.03822746937658753, + "acc_norm": 0.32450331125827814, + "acc_norm_stderr": 0.03822746937658753 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.7211009174311926, + "acc_stderr": 0.01922746887646351, + "acc_norm": 0.7211009174311926, + "acc_norm_stderr": 0.01922746887646351 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.4583333333333333, + "acc_stderr": 0.03398110890294636, + "acc_norm": 0.4583333333333333, + "acc_norm_stderr": 0.03398110890294636 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.7745098039215687, + "acc_stderr": 0.029331162294251735, + "acc_norm": 0.7745098039215687, + "acc_norm_stderr": 0.029331162294251735 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.7426160337552743, + "acc_stderr": 0.028458820991460295, + "acc_norm": 0.7426160337552743, + "acc_norm_stderr": 0.028458820991460295 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.6547085201793722, + "acc_stderr": 0.03191100192835794, + "acc_norm": 0.6547085201793722, + "acc_norm_stderr": 0.03191100192835794 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.6259541984732825, + "acc_stderr": 0.042438692422305246, + 
"acc_norm": 0.6259541984732825, + "acc_norm_stderr": 0.042438692422305246 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.7107438016528925, + "acc_stderr": 0.04139112727635463, + "acc_norm": 0.7107438016528925, + "acc_norm_stderr": 0.04139112727635463 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.6944444444444444, + "acc_stderr": 0.04453197507374983, + "acc_norm": 0.6944444444444444, + "acc_norm_stderr": 0.04453197507374983 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.6380368098159509, + "acc_stderr": 0.037757007291414416, + "acc_norm": 0.6380368098159509, + "acc_norm_stderr": 0.037757007291414416 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.2767857142857143, + "acc_stderr": 0.04246624336697625, + "acc_norm": 0.2767857142857143, + "acc_norm_stderr": 0.04246624336697625 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.7766990291262136, + "acc_stderr": 0.04123553189891431, + "acc_norm": 0.7766990291262136, + "acc_norm_stderr": 0.04123553189891431 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.7991452991452992, + "acc_stderr": 0.026246772946890467, + "acc_norm": 0.7991452991452992, + "acc_norm_stderr": 0.026246772946890467 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.53, + "acc_stderr": 0.05016135580465919, + "acc_norm": 0.53, + "acc_norm_stderr": 0.05016135580465919 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.7496807151979565, + "acc_stderr": 0.015491088951494569, + "acc_norm": 0.7496807151979565, + "acc_norm_stderr": 0.015491088951494569 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.6358381502890174, + "acc_stderr": 0.025906632631016127, + "acc_norm": 0.6358381502890174, + "acc_norm_stderr": 0.025906632631016127 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.38100558659217876, + "acc_stderr": 0.016242028834053613, + "acc_norm": 0.38100558659217876, + "acc_norm_stderr": 0.016242028834053613 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.6405228758169934, + "acc_stderr": 0.027475969910660952, + "acc_norm": 0.6405228758169934, + "acc_norm_stderr": 0.027475969910660952 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.6366559485530546, + "acc_stderr": 0.027316847674192714, + "acc_norm": 0.6366559485530546, + "acc_norm_stderr": 0.027316847674192714 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.6388888888888888, + "acc_stderr": 0.026725868809100793, + "acc_norm": 0.6388888888888888, + "acc_norm_stderr": 0.026725868809100793 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.425531914893617, + "acc_stderr": 0.02949482760014438, + "acc_norm": 0.425531914893617, + "acc_norm_stderr": 0.02949482760014438 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.44589308996088656, + "acc_stderr": 0.012695244711379776, + "acc_norm": 0.44589308996088656, + "acc_norm_stderr": 0.012695244711379776 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.5588235294117647, + "acc_stderr": 0.03016191193076711, + "acc_norm": 0.5588235294117647, + "acc_norm_stderr": 0.03016191193076711 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.5473856209150327, + "acc_stderr": 0.020136790918492523, + "acc_norm": 0.5473856209150327, + "acc_norm_stderr": 0.020136790918492523 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.6545454545454545, + "acc_stderr": 0.04554619617541054, + "acc_norm": 0.6545454545454545, + "acc_norm_stderr": 0.04554619617541054 + }, + 
"harness|hendrycksTest-security_studies|5": { + "acc": 0.6285714285714286, + "acc_stderr": 0.03093285879278985, + "acc_norm": 0.6285714285714286, + "acc_norm_stderr": 0.03093285879278985 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.736318407960199, + "acc_stderr": 0.031157150869355558, + "acc_norm": 0.736318407960199, + "acc_norm_stderr": 0.031157150869355558 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.85, + "acc_stderr": 0.035887028128263686, + "acc_norm": 0.85, + "acc_norm_stderr": 0.035887028128263686 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.4939759036144578, + "acc_stderr": 0.03892212195333045, + "acc_norm": 0.4939759036144578, + "acc_norm_stderr": 0.03892212195333045 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.783625730994152, + "acc_stderr": 0.03158149539338734, + "acc_norm": 0.783625730994152, + "acc_norm_stderr": 0.03158149539338734 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.3243574051407589, + "mc1_stderr": 0.016387976779647935, + "mc2": 0.44868368066946906, + "mc2_stderr": 0.015140951474620613 + }, + "all": { + "acc": 0.5588691829632426, + "acc_stderr": 0.03433115773924322, + "acc_norm": 0.5628652703397449, + "acc_norm_stderr": 0.03430877590228174, + "mc1": 0.3243574051407589, + "mc1_stderr": 0.016387976779647935, + "mc2": 0.44868368066946906, + "mc2_stderr": 0.015140951474620613 + } + }, + "versions": { + "harness|arc:challenge|25": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + 
"harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "all": 0 + }, + "config_general": { + "model_name": "SLAM-group/NewHope", + "model_sha": "560ca6df8335d6d2998ac8f079218816a5742b02", + "model_dtype": "torch.bfloat16", + "lighteval_sha": "03c2fad20ff7f5334c33cfee459024b8d7e4a109", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null + }, + "config_tasks": { + "harness|arc:challenge": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + 
"harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task" + }, + "summary_tasks": { + "harness|arc:challenge|25": { + "hashes": { + "hash_examples": "17b0cae357c0259e", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "61571bf68d6d89aa", + "hash_cont_tokens": "ede2b335438f08e9" + }, + "truncated": 0, + "non-truncated": 4687, + "padded": 4687, + "non-padded": 0, + "effective_few_shots": 25.0, + "num_truncated_few_shots": 0 + }, + "harness|hellaswag|10": { + "hashes": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "29906669b1c7054a", + "hash_cont_tokens": "b41cf1ad182d68d5" + }, + "truncated": 0, + "non-truncated": 40168, + "padded": 40113, + "non-padded": 55, + "effective_few_shots": 10.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hashes": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "c54ff61ad0273dd7", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-anatomy|5": { + "hashes": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "be31a1e22aef5f90", + "hash_cont_tokens": "f11971a765cb609f" + }, + "truncated": 0, + "non-truncated": 540, + "padded": 540, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-astronomy|5": { + "hashes": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "277a7b1fad566940", + "hash_cont_tokens": "238bd86950544b29" + }, + "truncated": 0, + "non-truncated": 608, + "padded": 608, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-business_ethics|5": { + "hashes": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "ba552605bc116de5", + "hash_cont_tokens": "f9d6d2a7d7e9a041" + 
}, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hashes": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "428c7563d0b98ab9", + "hash_cont_tokens": "6af58623d0d5fbcd" + }, + "truncated": 0, + "non-truncated": 1060, + "padded": 1060, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_biology|5": { + "hashes": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "da036601573942e2", + "hash_cont_tokens": "875cde3af7a0ee14" + }, + "truncated": 0, + "non-truncated": 576, + "padded": 576, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_chemistry|5": { + "hashes": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "94e0196d6aded13d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_computer_science|5": { + "hashes": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "6e4d0f4a8d36690b", + "hash_cont_tokens": "1ba0c71186b1505e" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_mathematics|5": { + "hashes": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "614054d17109a25d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_medicine|5": { + "hashes": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "1d633b3cc0524ba8", + "hash_cont_tokens": "702fb6d82ff0d6ac" + }, + "truncated": 0, + "non-truncated": 692, + "padded": 692, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_physics|5": { + "hashes": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "5421d9a1af86cbd4", + "hash_cont_tokens": "f7b8097afc16a47c" + }, + "truncated": 0, + "non-truncated": 408, + "padded": 408, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-computer_security|5": { + "hashes": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "5e6b70ecb333cf18", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hashes": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "c2ef11a87264ceed", + "hash_cont_tokens": "aa0e8bc655f2f641" + }, + "truncated": 0, + "non-truncated": 940, + "padded": 940, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + 
}, + "harness|hendrycksTest-econometrics|5": { + "hashes": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "ecaccd912a4c3978", + "hash_cont_tokens": "a9b1f761089f6acc" + }, + "truncated": 0, + "non-truncated": 456, + "padded": 456, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hashes": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "1590c84291399be8", + "hash_cont_tokens": "2425a3f084a591ef" + }, + "truncated": 0, + "non-truncated": 580, + "padded": 580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hashes": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "3269597f715b0da1", + "hash_cont_tokens": "eb2d5002052b5bc5" + }, + "truncated": 0, + "non-truncated": 1512, + "padded": 1512, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-formal_logic|5": { + "hashes": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "a2800d20f3ab8d7c", + "hash_cont_tokens": "9b30dc19c9b62f60" + }, + "truncated": 0, + "non-truncated": 504, + "padded": 504, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-global_facts|5": { + "hashes": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "94ed44b3772505ad", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_biology|5": { + "hashes": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "24423acb928db768", + "hash_cont_tokens": "74217a4e2868536f" + }, + "truncated": 0, + "non-truncated": 1240, + "padded": 1240, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hashes": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "831ff35c474e5cef", + "hash_cont_tokens": "bf39544be0ebf000" + }, + "truncated": 0, + "non-truncated": 812, + "padded": 812, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hashes": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "8c34e0f2bda77358", + "hash_cont_tokens": "43570b3948564b64" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hashes": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "f1f73dd687da18d7", + "hash_cont_tokens": "674fc454bdc5ac93" + }, + "truncated": 660, + "non-truncated": 0, + "padded": 0, + "non-padded": 660, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_geography|5": { + "hashes": { + "hash_examples": "b60019b9e80b642f", + 
"hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "7c5547c7da5bc793", + "hash_cont_tokens": "03a5012b916274ea" + }, + "truncated": 0, + "non-truncated": 792, + "padded": 792, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hashes": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "f62991cb6a496b05", + "hash_cont_tokens": "50ab225c2f535210" + }, + "truncated": 0, + "non-truncated": 772, + "padded": 772, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hashes": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "4cef2aff6e3d59ed", + "hash_cont_tokens": "c583432ad27fcfe0" + }, + "truncated": 0, + "non-truncated": 1560, + "padded": 1560, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hashes": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "6e2577ea4082ed2b", + "hash_cont_tokens": "1194078d4e38c984" + }, + "truncated": 0, + "non-truncated": 1080, + "padded": 1080, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hashes": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "c5fc9aeb1079c8e4", + "hash_cont_tokens": "f47f041de50333b9" + }, + "truncated": 0, + "non-truncated": 952, + "padded": 952, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_physics|5": { + "hashes": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "555fc385cffa84ca", + "hash_cont_tokens": "6296151cf7fee15c" + }, + "truncated": 0, + "non-truncated": 604, + "padded": 604, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hashes": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "febd23cbf9973b7f", + "hash_cont_tokens": "a490d3db0ea5935a" + }, + "truncated": 0, + "non-truncated": 2180, + "padded": 2180, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hashes": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "424b02981230ee83", + "hash_cont_tokens": "6830ef7d0325d7ef" + }, + "truncated": 0, + "non-truncated": 864, + "padded": 864, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hashes": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "50c9ff438c85a69e", + "hash_cont_tokens": "cdd0b3dc06d933e5" + }, + "truncated": 816, + "non-truncated": 0, + "padded": 0, + "non-padded": 816, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hashes": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": 
"054824cc474caef5", + "hash_cont_tokens": "e0203e3fc1bb0500" + }, + "truncated": 8, + "non-truncated": 940, + "padded": 940, + "non-padded": 8, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_aging|5": { + "hashes": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "541a75f071dcf579", + "hash_cont_tokens": "142a4a8a1138a214" + }, + "truncated": 0, + "non-truncated": 892, + "padded": 892, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_sexuality|5": { + "hashes": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "04269e5c5a257dd9", + "hash_cont_tokens": "bc54813e809b796d" + }, + "truncated": 0, + "non-truncated": 524, + "padded": 524, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-international_law|5": { + "hashes": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "d93ba9d9d38e4397", + "hash_cont_tokens": "63435df622d5437b" + }, + "truncated": 0, + "non-truncated": 484, + "padded": 484, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-jurisprudence|5": { + "hashes": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "9eeaccd2698b4f5a", + "hash_cont_tokens": "e3a8cd951b6e3469" + }, + "truncated": 0, + "non-truncated": 432, + "padded": 432, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hashes": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "b4f08f544f2b7576", + "hash_cont_tokens": "5e6ee2ff0404f23c" + }, + "truncated": 0, + "non-truncated": 652, + "padded": 648, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-machine_learning|5": { + "hashes": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "900c2a51f1174b9f", + "hash_cont_tokens": "c81919424db3b267" + }, + "truncated": 0, + "non-truncated": 448, + "padded": 448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-management|5": { + "hashes": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "6b36efb4689c6eca", + "hash_cont_tokens": "a01d6d39a83c4597" + }, + "truncated": 0, + "non-truncated": 412, + "padded": 412, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-marketing|5": { + "hashes": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "2aaac78a0cfed47a", + "hash_cont_tokens": "6aeaed4d823c98aa" + }, + "truncated": 0, + "non-truncated": 936, + "padded": 936, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-medical_genetics|5": { + "hashes": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "886ca823b41c094a", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + 
"num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-miscellaneous|5": { + "hashes": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "72fd71de7675e7d0", + "hash_cont_tokens": "9b0ab02a64603081" + }, + "truncated": 0, + "non-truncated": 3132, + "padded": 3132, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_disputes|5": { + "hashes": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "f3ca0dd8e7a1eb09", + "hash_cont_tokens": "3b8bbe9108e55ce9" + }, + "truncated": 0, + "non-truncated": 1384, + "padded": 1354, + "non-padded": 30, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hashes": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "3e793631e951f23c", + "hash_cont_tokens": "2eae753a177d5460" + }, + "truncated": 0, + "non-truncated": 3580, + "padded": 3580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-nutrition|5": { + "hashes": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "59753c2144ea93af", + "hash_cont_tokens": "29771089bd3c65c6" + }, + "truncated": 0, + "non-truncated": 1224, + "padded": 1224, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-philosophy|5": { + "hashes": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "bd8d3dbed15a8c34", + "hash_cont_tokens": "9f6ff69d23a48783" + }, + "truncated": 0, + "non-truncated": 1244, + "padded": 1244, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-prehistory|5": { + "hashes": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "3573cd87facbb7c5", + "hash_cont_tokens": "a789a13af22308bf" + }, + "truncated": 0, + "non-truncated": 1296, + "padded": 1296, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_accounting|5": { + "hashes": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "17e721bc1a7cbb47", + "hash_cont_tokens": "5129a9cfb30c5239" + }, + "truncated": 0, + "non-truncated": 1128, + "padded": 1128, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_law|5": { + "hashes": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "9178e10bd0763ec4", + "hash_cont_tokens": "2e590029ef41fbcd" + }, + "truncated": 604, + "non-truncated": 5532, + "padded": 5524, + "non-padded": 612, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_medicine|5": { + "hashes": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "f5a22012a54f70ea", + "hash_cont_tokens": "cd82e108370cece8" + }, + "truncated": 0, + "non-truncated": 1088, + "padded": 1088, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_psychology|5": { + "hashes": { + "hash_examples": "d45b73b22f9cc039", + 
"hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "0dfb73a8eb3f692c", + "hash_cont_tokens": "61ef0c8a87f9c92d" + }, + "truncated": 0, + "non-truncated": 2448, + "padded": 2448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-public_relations|5": { + "hashes": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "1710c6ba4c9f3cbd", + "hash_cont_tokens": "568f585a259965c1" + }, + "truncated": 0, + "non-truncated": 440, + "padded": 440, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-security_studies|5": { + "hashes": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "d49711415961ced7", + "hash_cont_tokens": "d70cfe096d4fb7bd" + }, + "truncated": 0, + "non-truncated": 980, + "padded": 980, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-sociology|5": { + "hashes": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "828999f7624cbe7e", + "hash_cont_tokens": "c3a3bdfd177eed5b" + }, + "truncated": 0, + "non-truncated": 804, + "padded": 804, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hashes": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "42054621e718dbee", + "hash_cont_tokens": "2568d0e8e36fa959" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-virology|5": { + "hashes": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "6c4f0aa4dc859c04", + "hash_cont_tokens": "c178cccd753d9bc5" + }, + "truncated": 0, + "non-truncated": 664, + "padded": 664, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-world_religions|5": { + "hashes": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "6c75d44e092ff24f", + "hash_cont_tokens": "0a3a3ea5ef49d19c" + }, + "truncated": 0, + "non-truncated": 684, + "padded": 684, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|truthfulqa:mc|0": { + "hashes": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "2738d7ed7075faa7", + "hash_cont_tokens": "6d1691881e252df0" + }, + "truncated": 0, + "non-truncated": 9996, + "padded": 9996, + "non-padded": 0, + "effective_few_shots": 0.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "d84d18e9a963753d", + "hash_full_prompts": "12b540783521a8e6", + "hash_input_tokens": "6fecf578c508db6a", + "hash_cont_tokens": "f4b7b7f3a2788768" + }, + "total_evaluation_time_secondes": "7229.0650951862335", + "truncated": 2088, + "non-truncated": 108931, + "padded": 108834, + "non-padded": 2185, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/SaylorTwift/gpt2_test/results_2023-07-19T19-08-58.298962.json b/eval-results/SaylorTwift/gpt2_test/results_2023-07-19T19-08-58.298962.json new file mode 100644 index 
0000000000000000000000000000000000000000..c3af158d11bac462af44a0e65d2cb740b78983dd --- /dev/null +++ b/eval-results/SaylorTwift/gpt2_test/results_2023-07-19T19-08-58.298962.json @@ -0,0 +1,871 @@ +{ + "results": { + "harness|arc:challenge|25": { + "acc": 0.19965870307167236, + "acc_stderr": 0.011681625756888676, + "acc_norm": 0.21843003412969283, + "acc_norm_stderr": 0.012074291605700978 + }, + "harness|hellaswag|10": { + "acc": 0.2922724556861183, + "acc_stderr": 0.0045387734937465595, + "acc_norm": 0.31597291376219877, + "acc_norm_stderr": 0.00463952045344403 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.21, + "acc_stderr": 0.040936018074033256, + "acc_norm": 0.21, + "acc_norm_stderr": 0.040936018074033256 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.24444444444444444, + "acc_stderr": 0.03712537833614867, + "acc_norm": 0.24444444444444444, + "acc_norm_stderr": 0.03712537833614867 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.16447368421052633, + "acc_stderr": 0.0301675334686327, + "acc_norm": 0.16447368421052633, + "acc_norm_stderr": 0.0301675334686327 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.17, + "acc_stderr": 0.0377525168068637, + "acc_norm": 0.17, + "acc_norm_stderr": 0.0377525168068637 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.23018867924528302, + "acc_stderr": 0.02590789712240817, + "acc_norm": 0.23018867924528302, + "acc_norm_stderr": 0.02590789712240817 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.2222222222222222, + "acc_stderr": 0.03476590104304134, + "acc_norm": 0.2222222222222222, + "acc_norm_stderr": 0.03476590104304134 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.2, + "acc_stderr": 0.04020151261036846, + "acc_norm": 0.2, + "acc_norm_stderr": 0.04020151261036846 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.3, + "acc_stderr": 0.046056618647183814, + "acc_norm": 0.3, + "acc_norm_stderr": 0.046056618647183814 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.29, + "acc_stderr": 0.04560480215720685, + "acc_norm": 0.29, + "acc_norm_stderr": 0.04560480215720685 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.23699421965317918, + "acc_stderr": 0.03242414757483098, + "acc_norm": 0.23699421965317918, + "acc_norm_stderr": 0.03242414757483098 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.2549019607843137, + "acc_stderr": 0.043364327079931785, + "acc_norm": 0.2549019607843137, + "acc_norm_stderr": 0.043364327079931785 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.17, + "acc_stderr": 0.03775251680686371, + "acc_norm": 0.17, + "acc_norm_stderr": 0.03775251680686371 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.2680851063829787, + "acc_stderr": 0.028957342788342347, + "acc_norm": 0.2680851063829787, + "acc_norm_stderr": 0.028957342788342347 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.2807017543859649, + "acc_stderr": 0.042270544512322, + "acc_norm": 0.2807017543859649, + "acc_norm_stderr": 0.042270544512322 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.2413793103448276, + "acc_stderr": 0.03565998174135302, + "acc_norm": 0.2413793103448276, + "acc_norm_stderr": 0.03565998174135302 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.25396825396825395, + "acc_stderr": 0.022418042891113942, + "acc_norm": 0.25396825396825395, + "acc_norm_stderr": 0.022418042891113942 + }, + 
"harness|hendrycksTest-formal_logic|5": { + "acc": 0.14285714285714285, + "acc_stderr": 0.0312984318574381, + "acc_norm": 0.14285714285714285, + "acc_norm_stderr": 0.0312984318574381 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.15, + "acc_stderr": 0.035887028128263686, + "acc_norm": 0.15, + "acc_norm_stderr": 0.035887028128263686 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.3, + "acc_stderr": 0.026069362295335137, + "acc_norm": 0.3, + "acc_norm_stderr": 0.026069362295335137 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.27586206896551724, + "acc_stderr": 0.03144712581678242, + "acc_norm": 0.27586206896551724, + "acc_norm_stderr": 0.03144712581678242 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.26, + "acc_stderr": 0.04408440022768079, + "acc_norm": 0.26, + "acc_norm_stderr": 0.04408440022768079 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.21818181818181817, + "acc_stderr": 0.03225078108306289, + "acc_norm": 0.21818181818181817, + "acc_norm_stderr": 0.03225078108306289 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.35353535353535354, + "acc_stderr": 0.03406086723547153, + "acc_norm": 0.35353535353535354, + "acc_norm_stderr": 0.03406086723547153 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.36787564766839376, + "acc_stderr": 0.03480175668466036, + "acc_norm": 0.36787564766839376, + "acc_norm_stderr": 0.03480175668466036 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.2743589743589744, + "acc_stderr": 0.022622765767493225, + "acc_norm": 0.2743589743589744, + "acc_norm_stderr": 0.022622765767493225 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.25555555555555554, + "acc_stderr": 0.02659393910184408, + "acc_norm": 0.25555555555555554, + "acc_norm_stderr": 0.02659393910184408 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.28991596638655465, + "acc_stderr": 0.029472485833136098, + "acc_norm": 0.28991596638655465, + "acc_norm_stderr": 0.029472485833136098 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.271523178807947, + "acc_stderr": 0.036313298039696545, + "acc_norm": 0.271523178807947, + "acc_norm_stderr": 0.036313298039696545 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.3467889908256881, + "acc_stderr": 0.020406097104093027, + "acc_norm": 0.3467889908256881, + "acc_norm_stderr": 0.020406097104093027 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.4722222222222222, + "acc_stderr": 0.0340470532865388, + "acc_norm": 0.4722222222222222, + "acc_norm_stderr": 0.0340470532865388 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.2549019607843137, + "acc_stderr": 0.030587591351604243, + "acc_norm": 0.2549019607843137, + "acc_norm_stderr": 0.030587591351604243 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.2489451476793249, + "acc_stderr": 0.028146970599422644, + "acc_norm": 0.2489451476793249, + "acc_norm_stderr": 0.028146970599422644 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.2914798206278027, + "acc_stderr": 0.030500283176545923, + "acc_norm": 0.2914798206278027, + "acc_norm_stderr": 0.030500283176545923 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.26717557251908397, + "acc_stderr": 0.038808483010823944, + "acc_norm": 0.26717557251908397, + "acc_norm_stderr": 0.038808483010823944 + }, + 
"harness|hendrycksTest-international_law|5": { + "acc": 0.3305785123966942, + "acc_stderr": 0.04294340845212094, + "acc_norm": 0.3305785123966942, + "acc_norm_stderr": 0.04294340845212094 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.21296296296296297, + "acc_stderr": 0.03957835471980981, + "acc_norm": 0.21296296296296297, + "acc_norm_stderr": 0.03957835471980981 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.25766871165644173, + "acc_stderr": 0.03436150827846917, + "acc_norm": 0.25766871165644173, + "acc_norm_stderr": 0.03436150827846917 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.24107142857142858, + "acc_stderr": 0.04059867246952688, + "acc_norm": 0.24107142857142858, + "acc_norm_stderr": 0.04059867246952688 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.33980582524271846, + "acc_stderr": 0.046897659372781356, + "acc_norm": 0.33980582524271846, + "acc_norm_stderr": 0.046897659372781356 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.1794871794871795, + "acc_stderr": 0.025140935950335418, + "acc_norm": 0.1794871794871795, + "acc_norm_stderr": 0.025140935950335418 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.27, + "acc_stderr": 0.044619604333847394, + "acc_norm": 0.27, + "acc_norm_stderr": 0.044619604333847394 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.21839080459770116, + "acc_stderr": 0.014774358319934488, + "acc_norm": 0.21839080459770116, + "acc_norm_stderr": 0.014774358319934488 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.2398843930635838, + "acc_stderr": 0.02298959254312357, + "acc_norm": 0.2398843930635838, + "acc_norm_stderr": 0.02298959254312357 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.2424581005586592, + "acc_stderr": 0.014333522059217889, + "acc_norm": 0.2424581005586592, + "acc_norm_stderr": 0.014333522059217889 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.21895424836601307, + "acc_stderr": 0.02367908986180772, + "acc_norm": 0.21895424836601307, + "acc_norm_stderr": 0.02367908986180772 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.2508038585209003, + "acc_stderr": 0.024619771956697165, + "acc_norm": 0.2508038585209003, + "acc_norm_stderr": 0.024619771956697165 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.22530864197530864, + "acc_stderr": 0.023246202647819746, + "acc_norm": 0.22530864197530864, + "acc_norm_stderr": 0.023246202647819746 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.2695035460992908, + "acc_stderr": 0.02646903681859063, + "acc_norm": 0.2695035460992908, + "acc_norm_stderr": 0.02646903681859063 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.24771838331160365, + "acc_stderr": 0.011025499291443737, + "acc_norm": 0.24771838331160365, + "acc_norm_stderr": 0.011025499291443737 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.4411764705882353, + "acc_stderr": 0.030161911930767102, + "acc_norm": 0.4411764705882353, + "acc_norm_stderr": 0.030161911930767102 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.2647058823529412, + "acc_stderr": 0.017848089574913222, + "acc_norm": 0.2647058823529412, + "acc_norm_stderr": 0.017848089574913222 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.21818181818181817, + "acc_stderr": 0.03955932861795833, + "acc_norm": 0.21818181818181817, + "acc_norm_stderr": 0.03955932861795833 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.4, + 
"acc_stderr": 0.031362502409358936, + "acc_norm": 0.4, + "acc_norm_stderr": 0.031362502409358936 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.22885572139303484, + "acc_stderr": 0.029705284056772426, + "acc_norm": 0.22885572139303484, + "acc_norm_stderr": 0.029705284056772426 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.26, + "acc_stderr": 0.04408440022768079, + "acc_norm": 0.26, + "acc_norm_stderr": 0.04408440022768079 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.1927710843373494, + "acc_stderr": 0.030709824050565274, + "acc_norm": 0.1927710843373494, + "acc_norm_stderr": 0.030709824050565274 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.21052631578947367, + "acc_stderr": 0.0312678171466318, + "acc_norm": 0.21052631578947367, + "acc_norm_stderr": 0.0312678171466318 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.22766217870257038, + "mc1_stderr": 0.01467925503211107, + "mc2": 0.4067393224175315, + "mc2_stderr": 0.014921031198907243 + }, + "all": { + "acc": 0.25815735770896303, + "acc_stderr": 0.03144000977290419, + "acc_norm": 0.2588772185417444, + "acc_norm_stderr": 0.03144837270186198, + "mc1": 0.22766217870257038, + "mc1_stderr": 0.01467925503211107, + "mc2": 0.4067393224175315, + "mc2_stderr": 0.014921031198907243 + } + }, + "versions": { + "harness|arc:challenge|25": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + 
"harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "all": 0 + }, + "config": { + "model_name": "SaylorTwift/gpt2_test", + "model_sha": "ef61310a16ffda93bf8f6132e02658482ffc2bcc", + "model_dtype": "torch.float16", + "lighteval_sha": "43cff840721bd0214adb4e29236a5e2ca1813937", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null + }, + "task_config": { + "harness|arc:challenge": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness 
task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task" + }, + "hashes": { + "harness|arc:challenge|25": { + "hash_examples": "fb8c51b1872daeda", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "e641be907f06d33d", + "hash_cont_tokens": "c6e2e25e2b25a621" + }, + "harness|hellaswag|10": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "faab28c8a52792fc", + "hash_cont_tokens": "8ad5f1a3e4068f36" + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "38f6980885e34dfd", + "hash_cont_tokens": "844bd0bf669e8136" + }, + "harness|hendrycksTest-anatomy|5": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "3ed9431cd09b2a53", + "hash_cont_tokens": "aa3ffb1a6e4356f5" + }, + "harness|hendrycksTest-astronomy|5": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "a79fd75ecff4dacc", + "hash_cont_tokens": "ca7527d5bdfd389a" + }, + "harness|hendrycksTest-business_ethics|5": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "178d5666661bf5e1", + "hash_cont_tokens": "08a1fa6c8dde9a82" + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "c926698f7ff06973", + "hash_cont_tokens": "cd61f7de0830a75a" + }, + "harness|hendrycksTest-college_biology|5": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "242f772c5e78312a", + "hash_cont_tokens": "b0c14ed86adbcb56" + }, + "harness|hendrycksTest-college_chemistry|5": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "8502d8627d2d7aad", + "hash_cont_tokens": "844bd0bf669e8136" + }, + "harness|hendrycksTest-college_computer_science|5": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "a0d705ea2c235707", + "hash_cont_tokens": "3cf1924b14cbf906" + }, + 
"harness|hendrycksTest-college_mathematics|5": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "ff09ef7f164943cd", + "hash_cont_tokens": "d09bf08193410dfa" + }, + "harness|hendrycksTest-college_medicine|5": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "aca3949388066394", + "hash_cont_tokens": "62bb469d2a319d91" + }, + "harness|hendrycksTest-college_physics|5": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "c4240f372187f487", + "hash_cont_tokens": "bf103c9a1f61ec12" + }, + "harness|hendrycksTest-computer_security|5": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "70a866a1c6ae11ae", + "hash_cont_tokens": "844bd0bf669e8136" + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "29b68a5b3f3afa5f", + "hash_cont_tokens": "ff5ca3d84bb47a0b" + }, + "harness|hendrycksTest-econometrics|5": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "a4a0fc579875cdf9", + "hash_cont_tokens": "f3ed369e135c0e74" + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "e1c0ec634eb17ebd", + "hash_cont_tokens": "35bf6c0c1a7ee403" + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "542453ad0f99dacf", + "hash_cont_tokens": "e69647d0f0359a4e" + }, + "harness|hendrycksTest-formal_logic|5": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "dacff0458f665ef2", + "hash_cont_tokens": "2ef491ecaa0b411b" + }, + "harness|hendrycksTest-global_facts|5": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "61dec75d557c2e93", + "hash_cont_tokens": "844bd0bf669e8136" + }, + "harness|hendrycksTest-high_school_biology|5": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "d0afdf91820cacc8", + "hash_cont_tokens": "2f65e8345a68d860" + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "75cd47b5490da17b", + "hash_cont_tokens": "c3deabee1deab3a3" + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "e369e98a1d0a7424", + "hash_cont_tokens": "ec161287ac6222f4" + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "502376958174bf81", + "hash_cont_tokens": "c4f2565ca36881d5" + }, + "harness|hendrycksTest-high_school_geography|5": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "a4866b51f8a7a60e", + "hash_cont_tokens": "780e569058de22be" + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "90f755f89d9fdf5e", + "hash_cont_tokens": 
"9da45062757ae791" + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "fb590ff6d9d11883", + "hash_cont_tokens": "8f5c8baf02161f10" + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "551dbc75535ad2b8", + "hash_cont_tokens": "fdea101837ab4409" + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "d86fdf5706ec717c", + "hash_cont_tokens": "985403b262df21a4" + }, + "harness|hendrycksTest-high_school_physics|5": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "a81bca26abd92c41", + "hash_cont_tokens": "56be0c12b78c81a3" + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "9c10077b5cda495b", + "hash_cont_tokens": "f524cf6fe64b2a7f" + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "da0c215d66d16d3e", + "hash_cont_tokens": "421b3dc903711e3d" + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "4885a382517deebf", + "hash_cont_tokens": "eab825cf8fbdd085" + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "c1d80e899c4c8872", + "hash_cont_tokens": "e1610a0b694e7b3a" + }, + "harness|hendrycksTest-human_aging|5": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "39da19ee58ce07e6", + "hash_cont_tokens": "38eafdb22e9fca11" + }, + "harness|hendrycksTest-human_sexuality|5": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "f7e0441ab1c223e0", + "hash_cont_tokens": "11de075f88fc7cd2" + }, + "harness|hendrycksTest-international_law|5": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "119859c5b8103d0b", + "hash_cont_tokens": "0229c63f045574c2" + }, + "harness|hendrycksTest-jurisprudence|5": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "6ec4910e741606cb", + "hash_cont_tokens": "5c77c6f472688075" + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "96d8b2554f777e3a", + "hash_cont_tokens": "25a46284b3589e0d" + }, + "harness|hendrycksTest-machine_learning|5": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "249811a7d891a411", + "hash_cont_tokens": "d11f2c877fe691dc" + }, + "harness|hendrycksTest-management|5": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "e54df495ffeb4f92", + "hash_cont_tokens": "d37808f586a9e9b5" + }, + "harness|hendrycksTest-marketing|5": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "e9110fe64f420eb5", + "hash_cont_tokens": 
"95faf210efa02f90" + }, + "harness|hendrycksTest-medical_genetics|5": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "743df5701590c1c5", + "hash_cont_tokens": "844bd0bf669e8136" + }, + "harness|hendrycksTest-miscellaneous|5": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "4a20a40ea36bad2d", + "hash_cont_tokens": "ef1ae838a09a7521" + }, + "harness|hendrycksTest-moral_disputes|5": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "10886977e5516586", + "hash_cont_tokens": "05c35d0e7dd2c7d4" + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "66f56ab7c3b9d662", + "hash_cont_tokens": "f1e9e326e9540108" + }, + "harness|hendrycksTest-nutrition|5": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "c05c54560499ea35", + "hash_cont_tokens": "027ac34198453c9e" + }, + "harness|hendrycksTest-philosophy|5": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "9639c3d92ff98a28", + "hash_cont_tokens": "dddff9925c9b675a" + }, + "harness|hendrycksTest-prehistory|5": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "91e98834c3a8d8d9", + "hash_cont_tokens": "030e5bb46551865c" + }, + "harness|hendrycksTest-professional_accounting|5": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "569fa47691c73088", + "hash_cont_tokens": "42b23299e8bae480" + }, + "harness|hendrycksTest-professional_law|5": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "d93d397bd5db1db6", + "hash_cont_tokens": "a2de48df0afbaff7" + }, + "harness|hendrycksTest-professional_medicine|5": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "7f8acbbde12cfb6b", + "hash_cont_tokens": "33dc7eccd5de31ae" + }, + "harness|hendrycksTest-professional_psychology|5": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "3aa766c029099569", + "hash_cont_tokens": "2a666dc39f1f52ac" + }, + "harness|hendrycksTest-public_relations|5": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "87b924f88832986f", + "hash_cont_tokens": "cf3600a50782c6c5" + }, + "harness|hendrycksTest-security_studies|5": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "1aaa84da588878a6", + "hash_cont_tokens": "2e9916279a4cae95" + }, + "harness|hendrycksTest-sociology|5": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "fb555df6139eb2c8", + "hash_cont_tokens": "555f7a55738bbf37" + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "56cf1eebb25eccb1", + "hash_cont_tokens": "844bd0bf669e8136" + }, + "harness|hendrycksTest-virology|5": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "c6affac16ec860be", + "hash_cont_tokens": "30d4fa4828c5468f" + }, + "harness|hendrycksTest-world_religions|5": { 
+ "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "d2c5da5a69a6312e", + "hash_cont_tokens": "984061eb58124367" + }, + "harness|truthfulqa:mc|0": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "21ee2f46c9c3649e", + "hash_cont_tokens": "f41d0880e9a23f4e" + } + } +} \ No newline at end of file diff --git a/eval-results/SaylorTwift/gpt2_test/results_2023-09-22T16-48-41.866587.json b/eval-results/SaylorTwift/gpt2_test/results_2023-09-22T16-48-41.866587.json new file mode 100644 index 0000000000000000000000000000000000000000..f86b4b35ebbe548413b5a65edeb25b376e5639b9 --- /dev/null +++ b/eval-results/SaylorTwift/gpt2_test/results_2023-09-22T16-48-41.866587.json @@ -0,0 +1,107 @@ +{ + "config_general": { + "model_name": "SaylorTwift/gpt2_test", + "model_sha": "ef61310a16ffda93bf8f6132e02658482ffc2bcc", + "model_size": "238.85 MB", + "model_dtype": "torch.float16", + "lighteval_sha": "0f318ecf002208468154899217b3ba7c6ae09374", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|drop|3": { + "em": 0.0025167785234899327, + "em_stderr": 0.0005131152834514814, + "f1": 0.04780411073825513, + "f1_stderr": 0.0013732412097489425 + }, + "harness|gsm8k|5": { + "acc": 0.003032600454890068, + "acc_stderr": 0.0015145735612245488 + }, + "harness|winogrande|5": { + "acc": 0.5011838989739542, + "acc_stderr": 0.014052446290529009 + }, + "all": { + "em": 0.0025167785234899327, + "em_stderr": 0.0005131152834514814, + "f1": 0.04780411073825513, + "f1_stderr": 0.0013732412097489425, + "acc": 0.25210824971442214, + "acc_stderr": 0.007783509925876779 + } + }, + "versions": { + "harness|drop|3": 1, + "harness|gsm8k|5": 0, + "harness|winogrande|5": 0, + "all": 0 + }, + "config_tasks": { + "harness|drop": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|drop|3": { + "hashes": { + "hash_examples": "1d27416e8324e9a3", + "hash_full_prompts": "a5513ff9a741b385", + "hash_input_tokens": "fa7a2e45b0104bc4", + "hash_cont_tokens": "8427c14235d3ac2f" + }, + "truncated": 9290, + "non-truncated": 246, + "padded": 0, + "non-padded": 9536, + "effective_few_shots": 3.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "52733972d41ebb11", + "hash_cont_tokens": "245aacececb56efb" + }, + "truncated": 917, + "non-truncated": 402, + "padded": 0, + "non-padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "84cacac1590bb0a5", + "hash_cont_tokens": "64ca3ed9b5dacc6e" + }, + "truncated": 0, + "non-truncated": 2534, + "padded": 2426, + "non-padded": 108, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "9b4d8993161e637d", + "hash_full_prompts": "08215e527b7e60a5", + "hash_input_tokens": "6b3de69e2f87348c", + "hash_cont_tokens": "71d4959875f0b473" + }, + "total_evaluation_time_secondes": "4861.729343414307", + "truncated": 10207, + "non-truncated": 3182, + "padded": 2426, + "non-padded": 10963, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git 
a/eval-results/Taekyoon/llama2-ko-7b-test/results_2023-08-28T09-09-02.494936.json b/eval-results/Taekyoon/llama2-ko-7b-test/results_2023-08-28T09-09-02.494936.json new file mode 100644 index 0000000000000000000000000000000000000000..5b600386e1c4774cb88ec10cf45e654c7d51df22 --- /dev/null +++ b/eval-results/Taekyoon/llama2-ko-7b-test/results_2023-08-28T09-09-02.494936.json @@ -0,0 +1,1366 @@ +{ + "config_general": { + "model_name": "Taekyoon/llama2-ko-7b-test", + "model_sha": "1d9b52cc5832ae0ea37514330d38193b737e1d07", + "model_dtype": "torch.float16", + "lighteval_sha": "2ad22f242d6beb9b9777ea661e05832132fa2939", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|arc:challenge|25": { + "acc": 0.3506825938566553, + "acc_stderr": 0.013944635930726089, + "acc_norm": 0.3779863481228669, + "acc_norm_stderr": 0.014169664520303105 + }, + "harness|hellaswag|10": { + "acc": 0.4697271459868552, + "acc_stderr": 0.004980627287147587, + "acc_norm": 0.6303525194184425, + "acc_norm_stderr": 0.004817227292240292 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.23, + "acc_stderr": 0.04229525846816505, + "acc_norm": 0.23, + "acc_norm_stderr": 0.04229525846816505 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.2518518518518518, + "acc_stderr": 0.03749850709174022, + "acc_norm": 0.2518518518518518, + "acc_norm_stderr": 0.03749850709174022 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.3223684210526316, + "acc_stderr": 0.03803510248351585, + "acc_norm": 0.3223684210526316, + "acc_norm_stderr": 0.03803510248351585 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.31, + "acc_stderr": 0.04648231987117316, + "acc_norm": 0.31, + "acc_norm_stderr": 0.04648231987117316 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.3320754716981132, + "acc_stderr": 0.02898545565233439, + "acc_norm": 0.3320754716981132, + "acc_norm_stderr": 0.02898545565233439 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.24305555555555555, + "acc_stderr": 0.03586879280080341, + "acc_norm": 0.24305555555555555, + "acc_norm_stderr": 0.03586879280080341 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.33, + "acc_stderr": 0.047258156262526045, + "acc_norm": 0.33, + "acc_norm_stderr": 0.047258156262526045 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.29, + "acc_stderr": 0.045604802157206845, + "acc_norm": 0.29, + "acc_norm_stderr": 0.045604802157206845 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.3, + "acc_stderr": 0.046056618647183814, + "acc_norm": 0.3, + "acc_norm_stderr": 0.046056618647183814 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.2774566473988439, + "acc_stderr": 0.034140140070440354, + "acc_norm": 0.2774566473988439, + "acc_norm_stderr": 0.034140140070440354 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.21568627450980393, + "acc_stderr": 0.04092563958237657, + "acc_norm": 0.21568627450980393, + "acc_norm_stderr": 0.04092563958237657 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.34, + "acc_stderr": 0.047609522856952344, + "acc_norm": 0.34, + "acc_norm_stderr": 0.047609522856952344 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.32340425531914896, + "acc_stderr": 0.030579442773610334, + "acc_norm": 0.32340425531914896, + "acc_norm_stderr": 0.030579442773610334 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 
0.3333333333333333, + "acc_stderr": 0.044346007015849245, + "acc_norm": 0.3333333333333333, + "acc_norm_stderr": 0.044346007015849245 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.2413793103448276, + "acc_stderr": 0.03565998174135303, + "acc_norm": 0.2413793103448276, + "acc_norm_stderr": 0.03565998174135303 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.25925925925925924, + "acc_stderr": 0.022569897074918424, + "acc_norm": 0.25925925925925924, + "acc_norm_stderr": 0.022569897074918424 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.15079365079365079, + "acc_stderr": 0.032006864972873916, + "acc_norm": 0.15079365079365079, + "acc_norm_stderr": 0.032006864972873916 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.25, + "acc_stderr": 0.04351941398892446, + "acc_norm": 0.25, + "acc_norm_stderr": 0.04351941398892446 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.23225806451612904, + "acc_stderr": 0.02402225613030824, + "acc_norm": 0.23225806451612904, + "acc_norm_stderr": 0.02402225613030824 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.26108374384236455, + "acc_stderr": 0.030903796952114485, + "acc_norm": 0.26108374384236455, + "acc_norm_stderr": 0.030903796952114485 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.27, + "acc_stderr": 0.044619604333847394, + "acc_norm": 0.27, + "acc_norm_stderr": 0.044619604333847394 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.2787878787878788, + "acc_stderr": 0.03501438706296781, + "acc_norm": 0.2787878787878788, + "acc_norm_stderr": 0.03501438706296781 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.37373737373737376, + "acc_stderr": 0.034468977386593325, + "acc_norm": 0.37373737373737376, + "acc_norm_stderr": 0.034468977386593325 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.3471502590673575, + "acc_stderr": 0.03435696168361355, + "acc_norm": 0.3471502590673575, + "acc_norm_stderr": 0.03435696168361355 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.32051282051282054, + "acc_stderr": 0.023661296393964273, + "acc_norm": 0.32051282051282054, + "acc_norm_stderr": 0.023661296393964273 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.26666666666666666, + "acc_stderr": 0.026962424325073835, + "acc_norm": 0.26666666666666666, + "acc_norm_stderr": 0.026962424325073835 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.29411764705882354, + "acc_stderr": 0.029597329730978086, + "acc_norm": 0.29411764705882354, + "acc_norm_stderr": 0.029597329730978086 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.39072847682119205, + "acc_stderr": 0.039837983066598096, + "acc_norm": 0.39072847682119205, + "acc_norm_stderr": 0.039837983066598096 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.3357798165137615, + "acc_stderr": 0.020248081396752937, + "acc_norm": 0.3357798165137615, + "acc_norm_stderr": 0.020248081396752937 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.4675925925925926, + "acc_stderr": 0.03402801581358966, + "acc_norm": 0.4675925925925926, + "acc_norm_stderr": 0.03402801581358966 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.24509803921568626, + "acc_stderr": 0.030190282453501954, + "acc_norm": 0.24509803921568626, + "acc_norm_stderr": 0.030190282453501954 + }, + 
"harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.270042194092827, + "acc_stderr": 0.028900721906293426, + "acc_norm": 0.270042194092827, + "acc_norm_stderr": 0.028900721906293426 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.34080717488789236, + "acc_stderr": 0.0318114974705536, + "acc_norm": 0.34080717488789236, + "acc_norm_stderr": 0.0318114974705536 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.3053435114503817, + "acc_stderr": 0.040393149787245626, + "acc_norm": 0.3053435114503817, + "acc_norm_stderr": 0.040393149787245626 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.3305785123966942, + "acc_stderr": 0.04294340845212094, + "acc_norm": 0.3305785123966942, + "acc_norm_stderr": 0.04294340845212094 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.2777777777777778, + "acc_stderr": 0.043300437496507416, + "acc_norm": 0.2777777777777778, + "acc_norm_stderr": 0.043300437496507416 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.22085889570552147, + "acc_stderr": 0.032591773927421776, + "acc_norm": 0.22085889570552147, + "acc_norm_stderr": 0.032591773927421776 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.20535714285714285, + "acc_stderr": 0.038342410214190735, + "acc_norm": 0.20535714285714285, + "acc_norm_stderr": 0.038342410214190735 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.36893203883495146, + "acc_stderr": 0.04777615181156739, + "acc_norm": 0.36893203883495146, + "acc_norm_stderr": 0.04777615181156739 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.32051282051282054, + "acc_stderr": 0.03057281131029961, + "acc_norm": 0.32051282051282054, + "acc_norm_stderr": 0.03057281131029961 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.27, + "acc_stderr": 0.044619604333847394, + "acc_norm": 0.27, + "acc_norm_stderr": 0.044619604333847394 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.29246487867177523, + "acc_stderr": 0.016267000684598645, + "acc_norm": 0.29246487867177523, + "acc_norm_stderr": 0.016267000684598645 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.2832369942196532, + "acc_stderr": 0.024257901705323378, + "acc_norm": 0.2832369942196532, + "acc_norm_stderr": 0.024257901705323378 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.24692737430167597, + "acc_stderr": 0.014422292204808836, + "acc_norm": 0.24692737430167597, + "acc_norm_stderr": 0.014422292204808836 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.28431372549019607, + "acc_stderr": 0.025829163272757482, + "acc_norm": 0.28431372549019607, + "acc_norm_stderr": 0.025829163272757482 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.3408360128617363, + "acc_stderr": 0.02692084126077616, + "acc_norm": 0.3408360128617363, + "acc_norm_stderr": 0.02692084126077616 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.30246913580246915, + "acc_stderr": 0.025557653981868055, + "acc_norm": 0.30246913580246915, + "acc_norm_stderr": 0.025557653981868055 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.24822695035460993, + "acc_stderr": 0.025770015644290396, + "acc_norm": 0.24822695035460993, + "acc_norm_stderr": 0.025770015644290396 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.288135593220339, + "acc_stderr": 0.011567140661324563, + "acc_norm": 0.288135593220339, + "acc_norm_stderr": 0.011567140661324563 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.4485294117647059, 
+ "acc_stderr": 0.030211479609121593, + "acc_norm": 0.4485294117647059, + "acc_norm_stderr": 0.030211479609121593 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.25, + "acc_stderr": 0.01751781884501444, + "acc_norm": 0.25, + "acc_norm_stderr": 0.01751781884501444 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.35454545454545455, + "acc_stderr": 0.04582004841505416, + "acc_norm": 0.35454545454545455, + "acc_norm_stderr": 0.04582004841505416 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.40408163265306124, + "acc_stderr": 0.031414708025865885, + "acc_norm": 0.40408163265306124, + "acc_norm_stderr": 0.031414708025865885 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.24875621890547264, + "acc_stderr": 0.030567675938916714, + "acc_norm": 0.24875621890547264, + "acc_norm_stderr": 0.030567675938916714 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.24, + "acc_stderr": 0.04292346959909281, + "acc_norm": 0.24, + "acc_norm_stderr": 0.04292346959909281 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.30120481927710846, + "acc_stderr": 0.0357160923005348, + "acc_norm": 0.30120481927710846, + "acc_norm_stderr": 0.0357160923005348 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.3157894736842105, + "acc_stderr": 0.03565079670708312, + "acc_norm": 0.3157894736842105, + "acc_norm_stderr": 0.03565079670708312 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.2252141982864137, + "mc1_stderr": 0.014623240768023505, + "mc2": 0.35997350772493825, + "mc2_stderr": 0.014042588070598837 + }, + "all": { + "acc": 0.29939516770443486, + "acc_stderr": 0.03301601100044412, + "acc_norm": 0.3025804071569399, + "acc_norm_stderr": 0.03301705555289615, + "mc1": 0.2252141982864137, + "mc1_stderr": 0.014623240768023505, + "mc2": 0.35997350772493825, + "mc2_stderr": 0.014042588070598837 + } + }, + "versions": { + "harness|arc:challenge|25": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + 
"harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "all": 0 + }, + "config_tasks": { + "harness|arc:challenge": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + 
"harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task" + }, + "summary_tasks": { + "harness|arc:challenge|25": { + "hashes": { + "hash_examples": "17b0cae357c0259e", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "3722289b79076c44", + "hash_cont_tokens": "8210decc6ff6f7df" + }, + "truncated": 0, + "non-truncated": 4687, + "padded": 4687, + "non-padded": 0, + "effective_few_shots": 25.0, + "num_truncated_few_shots": 0 + }, + "harness|hellaswag|10": { + "hashes": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "ececd684171f1ef2", + "hash_cont_tokens": "b3b9e9017afa63af" + }, + "truncated": 0, + "non-truncated": 40168, + "padded": 40113, + "non-padded": 55, + "effective_few_shots": 10.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hashes": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "c54ff61ad0273dd7", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-anatomy|5": { + "hashes": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "be31a1e22aef5f90", + "hash_cont_tokens": "f11971a765cb609f" + }, + "truncated": 0, + "non-truncated": 540, + "padded": 540, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-astronomy|5": { + "hashes": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "277a7b1fad566940", + "hash_cont_tokens": "bf30e5d3f48250cb" + }, + "truncated": 0, + "non-truncated": 608, + "padded": 608, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + 
"harness|hendrycksTest-business_ethics|5": { + "hashes": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "ba552605bc116de5", + "hash_cont_tokens": "bc1dd9b2d995eb61" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hashes": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "428c7563d0b98ab9", + "hash_cont_tokens": "890a119624b3b935" + }, + "truncated": 0, + "non-truncated": 1060, + "padded": 1060, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_biology|5": { + "hashes": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "da036601573942e2", + "hash_cont_tokens": "875cde3af7a0ee14" + }, + "truncated": 0, + "non-truncated": 576, + "padded": 576, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_chemistry|5": { + "hashes": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "94e0196d6aded13d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_computer_science|5": { + "hashes": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "6e4d0f4a8d36690b", + "hash_cont_tokens": "ffc0fe414cdc4a83" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_mathematics|5": { + "hashes": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "614054d17109a25d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_medicine|5": { + "hashes": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "081bb2b524defd1c", + "hash_cont_tokens": "1f88b00d41957d82" + }, + "truncated": 0, + "non-truncated": 692, + "padded": 692, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_physics|5": { + "hashes": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "5421d9a1af86cbd4", + "hash_cont_tokens": "f7b8097afc16a47c" + }, + "truncated": 0, + "non-truncated": 408, + "padded": 408, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-computer_security|5": { + "hashes": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "5e6b70ecb333cf18", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hashes": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + 
"hash_input_tokens": "c2ef11a87264ceed", + "hash_cont_tokens": "aa0e8bc655f2f641" + }, + "truncated": 0, + "non-truncated": 940, + "padded": 940, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-econometrics|5": { + "hashes": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "ecaccd912a4c3978", + "hash_cont_tokens": "bfb7e3c3c88313f1" + }, + "truncated": 0, + "non-truncated": 456, + "padded": 456, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hashes": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "1590c84291399be8", + "hash_cont_tokens": "2425a3f084a591ef" + }, + "truncated": 0, + "non-truncated": 580, + "padded": 580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hashes": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "3269597f715b0da1", + "hash_cont_tokens": "f52691aef15a407b" + }, + "truncated": 0, + "non-truncated": 1512, + "padded": 1512, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-formal_logic|5": { + "hashes": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "a2800d20f3ab8d7c", + "hash_cont_tokens": "f515d598d9c21263" + }, + "truncated": 0, + "non-truncated": 504, + "padded": 504, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-global_facts|5": { + "hashes": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "94ed44b3772505ad", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_biology|5": { + "hashes": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "24423acb928db768", + "hash_cont_tokens": "bd85a4156a3613ee" + }, + "truncated": 0, + "non-truncated": 1240, + "padded": 1240, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hashes": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "831ff35c474e5cef", + "hash_cont_tokens": "a95c97af1c14e068" + }, + "truncated": 0, + "non-truncated": 812, + "padded": 812, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hashes": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "a20a96b44dcc5b30", + "hash_cont_tokens": "8abfedef914e33c9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hashes": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "5002f4ac8b1562ca", + "hash_cont_tokens": "674fc454bdc5ac93" + }, + "truncated": 0, + "non-truncated": 
660, + "padded": 656, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_geography|5": { + "hashes": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "7c5547c7da5bc793", + "hash_cont_tokens": "03a5012b916274ea" + }, + "truncated": 0, + "non-truncated": 792, + "padded": 792, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hashes": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "f62991cb6a496b05", + "hash_cont_tokens": "a83effb8f76b7d7c" + }, + "truncated": 0, + "non-truncated": 772, + "padded": 772, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hashes": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "4cef2aff6e3d59ed", + "hash_cont_tokens": "c583432ad27fcfe0" + }, + "truncated": 0, + "non-truncated": 1560, + "padded": 1560, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hashes": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "6e2577ea4082ed2b", + "hash_cont_tokens": "24f5dc613660300b" + }, + "truncated": 0, + "non-truncated": 1080, + "padded": 1080, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hashes": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "c5fc9aeb1079c8e4", + "hash_cont_tokens": "f47f041de50333b9" + }, + "truncated": 0, + "non-truncated": 952, + "padded": 952, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_physics|5": { + "hashes": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "555fc385cffa84ca", + "hash_cont_tokens": "ba2efcd283e938cc" + }, + "truncated": 0, + "non-truncated": 604, + "padded": 604, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hashes": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "febd23cbf9973b7f", + "hash_cont_tokens": "942069cd363844d9" + }, + "truncated": 0, + "non-truncated": 2180, + "padded": 2180, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hashes": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "400e55b56ee6fbd7", + "hash_cont_tokens": "955ed42b6f7fa019" + }, + "truncated": 0, + "non-truncated": 864, + "padded": 864, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hashes": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "c639cce12a46ebad", + "hash_cont_tokens": "cdd0b3dc06d933e5" + }, + "truncated": 0, + "non-truncated": 816, + "padded": 816, + "non-padded": 0, + "effective_few_shots": 5.0, + 
"num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hashes": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "b9762065cce6f3a6", + "hash_cont_tokens": "9a864184946033ac" + }, + "truncated": 0, + "non-truncated": 948, + "padded": 948, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_aging|5": { + "hashes": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "541a75f071dcf579", + "hash_cont_tokens": "142a4a8a1138a214" + }, + "truncated": 0, + "non-truncated": 892, + "padded": 892, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_sexuality|5": { + "hashes": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "04269e5c5a257dd9", + "hash_cont_tokens": "bc54813e809b796d" + }, + "truncated": 0, + "non-truncated": 524, + "padded": 524, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-international_law|5": { + "hashes": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "d93ba9d9d38e4397", + "hash_cont_tokens": "dc45b45fcda18e5d" + }, + "truncated": 0, + "non-truncated": 484, + "padded": 484, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-jurisprudence|5": { + "hashes": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "9eeaccd2698b4f5a", + "hash_cont_tokens": "e3a8cd951b6e3469" + }, + "truncated": 0, + "non-truncated": 432, + "padded": 432, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hashes": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "b4f08f544f2b7576", + "hash_cont_tokens": "1e80dbd30f6453d5" + }, + "truncated": 0, + "non-truncated": 652, + "padded": 648, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-machine_learning|5": { + "hashes": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "900c2a51f1174b9f", + "hash_cont_tokens": "9b37da7777378ca9" + }, + "truncated": 0, + "non-truncated": 448, + "padded": 448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-management|5": { + "hashes": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "6b36efb4689c6eca", + "hash_cont_tokens": "a01d6d39a83c4597" + }, + "truncated": 0, + "non-truncated": 412, + "padded": 412, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-marketing|5": { + "hashes": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "2aaac78a0cfed47a", + "hash_cont_tokens": "6aeaed4d823c98aa" + }, + "truncated": 0, + "non-truncated": 936, + "padded": 936, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-medical_genetics|5": { + "hashes": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": 
"202139046daa118f", + "hash_input_tokens": "886ca823b41c094a", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-miscellaneous|5": { + "hashes": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "72fd71de7675e7d0", + "hash_cont_tokens": "9b0ab02a64603081" + }, + "truncated": 0, + "non-truncated": 3132, + "padded": 3132, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_disputes|5": { + "hashes": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "f3ca0dd8e7a1eb09", + "hash_cont_tokens": "8badf768f7b0467a" + }, + "truncated": 0, + "non-truncated": 1384, + "padded": 1354, + "non-padded": 30, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hashes": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "3e793631e951f23c", + "hash_cont_tokens": "32ae620376b2bbba" + }, + "truncated": 0, + "non-truncated": 3580, + "padded": 3580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-nutrition|5": { + "hashes": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "59753c2144ea93af", + "hash_cont_tokens": "3071def75bacc404" + }, + "truncated": 0, + "non-truncated": 1224, + "padded": 1224, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-philosophy|5": { + "hashes": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "bd8d3dbed15a8c34", + "hash_cont_tokens": "9f6ff69d23a48783" + }, + "truncated": 0, + "non-truncated": 1244, + "padded": 1244, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-prehistory|5": { + "hashes": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "3573cd87facbb7c5", + "hash_cont_tokens": "de469d2b981e32a3" + }, + "truncated": 0, + "non-truncated": 1296, + "padded": 1296, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_accounting|5": { + "hashes": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "17e721bc1a7cbb47", + "hash_cont_tokens": "c46f74d2dfc7b13b" + }, + "truncated": 0, + "non-truncated": 1128, + "padded": 1128, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_law|5": { + "hashes": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "c9f7583fff66d361", + "hash_cont_tokens": "2e590029ef41fbcd" + }, + "truncated": 0, + "non-truncated": 6136, + "padded": 6136, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_medicine|5": { + "hashes": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "40a933f829116f8d", + "hash_cont_tokens": "fe35cfa9c6ca802e" + }, + "truncated": 0, + "non-truncated": 1088, + 
"padded": 1088, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_psychology|5": { + "hashes": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "0dfb73a8eb3f692c", + "hash_cont_tokens": "f020fbddf72c8652" + }, + "truncated": 0, + "non-truncated": 2448, + "padded": 2448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-public_relations|5": { + "hashes": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "1710c6ba4c9f3cbd", + "hash_cont_tokens": "568f585a259965c1" + }, + "truncated": 0, + "non-truncated": 440, + "padded": 440, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-security_studies|5": { + "hashes": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "32a03f1f22a6e103", + "hash_cont_tokens": "cc6fd7cccd64cd5d" + }, + "truncated": 0, + "non-truncated": 980, + "padded": 980, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-sociology|5": { + "hashes": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "828999f7624cbe7e", + "hash_cont_tokens": "c3a3bdfd177eed5b" + }, + "truncated": 0, + "non-truncated": 804, + "padded": 804, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hashes": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "42054621e718dbee", + "hash_cont_tokens": "2568d0e8e36fa959" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-virology|5": { + "hashes": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "6c4f0aa4dc859c04", + "hash_cont_tokens": "926cf60b0891f374" + }, + "truncated": 0, + "non-truncated": 664, + "padded": 664, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-world_religions|5": { + "hashes": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "6c75d44e092ff24f", + "hash_cont_tokens": "c525a5de974c1ea3" + }, + "truncated": 0, + "non-truncated": 684, + "padded": 684, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|truthfulqa:mc|0": { + "hashes": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "2738d7ed7075faa7", + "hash_cont_tokens": "c014154380b74b9e" + }, + "truncated": 0, + "non-truncated": 9996, + "padded": 9996, + "non-padded": 0, + "effective_few_shots": 0.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "d84d18e9a963753d", + "hash_full_prompts": "12b540783521a8e6", + "hash_input_tokens": "5c73a7dce6ccf737", + "hash_cont_tokens": "fb1646e2bdd5fc38" + }, + "total_evaluation_time_secondes": "4139.476498603821", + "truncated": 0, + "non-truncated": 111019, + "padded": 110926, + "non-padded": 93, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git 
a/eval-results/TheBloke/Airoboros-L2-13B-2.1-GPTQ/results_2023-08-30T18-43-07.011974.json b/eval-results/TheBloke/Airoboros-L2-13B-2.1-GPTQ/results_2023-08-30T18-43-07.011974.json new file mode 100644 index 0000000000000000000000000000000000000000..37c80dab3457df7aa4279cf9331341b98547f15b --- /dev/null +++ b/eval-results/TheBloke/Airoboros-L2-13B-2.1-GPTQ/results_2023-08-30T18-43-07.011974.json @@ -0,0 +1,1366 @@ +{ + "config_general": { + "model_name": "TheBloke/Airoboros-L2-13B-2.1-GPTQ", + "model_sha": "d90d96e40b9359cb5c35e6b6c8f0eb24896e827b", + "model_dtype": "None", + "lighteval_sha": "4b9a38c2102259daeb17d1d0294fc2b4ce7f4e63", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|arc:challenge|25": { + "acc": 0.5401023890784983, + "acc_stderr": 0.01456431885692485, + "acc_norm": 0.5895904436860068, + "acc_norm_stderr": 0.014374922192642664 + }, + "harness|hellaswag|10": { + "acc": 0.6171081457876917, + "acc_stderr": 0.004850988215167537, + "acc_norm": 0.817167894841665, + "acc_norm_stderr": 0.0038573886135330965 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.29, + "acc_stderr": 0.045604802157206845, + "acc_norm": 0.29, + "acc_norm_stderr": 0.045604802157206845 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.5111111111111111, + "acc_stderr": 0.04318275491977976, + "acc_norm": 0.5111111111111111, + "acc_norm_stderr": 0.04318275491977976 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.5197368421052632, + "acc_stderr": 0.04065771002562605, + "acc_norm": 0.5197368421052632, + "acc_norm_stderr": 0.04065771002562605 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.56, + "acc_stderr": 0.04988876515698589, + "acc_norm": 0.56, + "acc_norm_stderr": 0.04988876515698589 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.569811320754717, + "acc_stderr": 0.03047144586718324, + "acc_norm": 0.569811320754717, + "acc_norm_stderr": 0.03047144586718324 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.5763888888888888, + "acc_stderr": 0.0413212501972337, + "acc_norm": 0.5763888888888888, + "acc_norm_stderr": 0.0413212501972337 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.35, + "acc_stderr": 0.047937248544110196, + "acc_norm": 0.35, + "acc_norm_stderr": 0.047937248544110196 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.47, + "acc_stderr": 0.050161355804659205, + "acc_norm": 0.47, + "acc_norm_stderr": 0.050161355804659205 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.3, + "acc_stderr": 0.046056618647183814, + "acc_norm": 0.3, + "acc_norm_stderr": 0.046056618647183814 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.5028901734104047, + "acc_stderr": 0.038124005659748335, + "acc_norm": 0.5028901734104047, + "acc_norm_stderr": 0.038124005659748335 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.29411764705882354, + "acc_stderr": 0.04533838195929776, + "acc_norm": 0.29411764705882354, + "acc_norm_stderr": 0.04533838195929776 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.7, + "acc_stderr": 0.046056618647183814, + "acc_norm": 0.7, + "acc_norm_stderr": 0.046056618647183814 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.39574468085106385, + "acc_stderr": 0.031967586978353627, + "acc_norm": 0.39574468085106385, + "acc_norm_stderr": 0.031967586978353627 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 
0.30701754385964913, + "acc_stderr": 0.0433913832257986, + "acc_norm": 0.30701754385964913, + "acc_norm_stderr": 0.0433913832257986 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.5517241379310345, + "acc_stderr": 0.04144311810878152, + "acc_norm": 0.5517241379310345, + "acc_norm_stderr": 0.04144311810878152 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.3201058201058201, + "acc_stderr": 0.024026846392873506, + "acc_norm": 0.3201058201058201, + "acc_norm_stderr": 0.024026846392873506 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.30158730158730157, + "acc_stderr": 0.04104947269903394, + "acc_norm": 0.30158730158730157, + "acc_norm_stderr": 0.04104947269903394 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.37, + "acc_stderr": 0.048523658709391, + "acc_norm": 0.37, + "acc_norm_stderr": 0.048523658709391 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.6193548387096774, + "acc_stderr": 0.027621717832907036, + "acc_norm": 0.6193548387096774, + "acc_norm_stderr": 0.027621717832907036 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.43842364532019706, + "acc_stderr": 0.03491207857486518, + "acc_norm": 0.43842364532019706, + "acc_norm_stderr": 0.03491207857486518 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.54, + "acc_stderr": 0.05009082659620332, + "acc_norm": 0.54, + "acc_norm_stderr": 0.05009082659620332 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.6787878787878788, + "acc_stderr": 0.0364620496325381, + "acc_norm": 0.6787878787878788, + "acc_norm_stderr": 0.0364620496325381 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.6868686868686869, + "acc_stderr": 0.033042050878136525, + "acc_norm": 0.6868686868686869, + "acc_norm_stderr": 0.033042050878136525 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.7772020725388601, + "acc_stderr": 0.030031147977641538, + "acc_norm": 0.7772020725388601, + "acc_norm_stderr": 0.030031147977641538 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.441025641025641, + "acc_stderr": 0.02517404838400074, + "acc_norm": 0.441025641025641, + "acc_norm_stderr": 0.02517404838400074 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.2740740740740741, + "acc_stderr": 0.027195934804085622, + "acc_norm": 0.2740740740740741, + "acc_norm_stderr": 0.027195934804085622 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.44537815126050423, + "acc_stderr": 0.0322841062671639, + "acc_norm": 0.44537815126050423, + "acc_norm_stderr": 0.0322841062671639 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.33112582781456956, + "acc_stderr": 0.038425817186598696, + "acc_norm": 0.33112582781456956, + "acc_norm_stderr": 0.038425817186598696 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.6880733944954128, + "acc_stderr": 0.019862967976707245, + "acc_norm": 0.6880733944954128, + "acc_norm_stderr": 0.019862967976707245 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.4027777777777778, + "acc_stderr": 0.033448873829978666, + "acc_norm": 0.4027777777777778, + "acc_norm_stderr": 0.033448873829978666 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.7352941176470589, + "acc_stderr": 0.030964517926923403, + "acc_norm": 0.7352941176470589, + "acc_norm_stderr": 0.030964517926923403 + }, + 
"harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.6877637130801688, + "acc_stderr": 0.030165137867847008, + "acc_norm": 0.6877637130801688, + "acc_norm_stderr": 0.030165137867847008 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.6278026905829597, + "acc_stderr": 0.03244305283008731, + "acc_norm": 0.6278026905829597, + "acc_norm_stderr": 0.03244305283008731 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.6030534351145038, + "acc_stderr": 0.04291135671009224, + "acc_norm": 0.6030534351145038, + "acc_norm_stderr": 0.04291135671009224 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.743801652892562, + "acc_stderr": 0.03984979653302872, + "acc_norm": 0.743801652892562, + "acc_norm_stderr": 0.03984979653302872 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.6851851851851852, + "acc_stderr": 0.04489931073591312, + "acc_norm": 0.6851851851851852, + "acc_norm_stderr": 0.04489931073591312 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.6319018404907976, + "acc_stderr": 0.03789213935838396, + "acc_norm": 0.6319018404907976, + "acc_norm_stderr": 0.03789213935838396 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.33035714285714285, + "acc_stderr": 0.04464285714285714, + "acc_norm": 0.33035714285714285, + "acc_norm_stderr": 0.04464285714285714 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.6407766990291263, + "acc_stderr": 0.047504583990416946, + "acc_norm": 0.6407766990291263, + "acc_norm_stderr": 0.047504583990416946 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.7350427350427351, + "acc_stderr": 0.028911208802749486, + "acc_norm": 0.7350427350427351, + "acc_norm_stderr": 0.028911208802749486 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.52, + "acc_stderr": 0.05021167315686779, + "acc_norm": 0.52, + "acc_norm_stderr": 0.05021167315686779 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.7305236270753512, + "acc_stderr": 0.01586624307321508, + "acc_norm": 0.7305236270753512, + "acc_norm_stderr": 0.01586624307321508 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.5982658959537572, + "acc_stderr": 0.026394104177643634, + "acc_norm": 0.5982658959537572, + "acc_norm_stderr": 0.026394104177643634 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.3217877094972067, + "acc_stderr": 0.015624236160792575, + "acc_norm": 0.3217877094972067, + "acc_norm_stderr": 0.015624236160792575 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.5718954248366013, + "acc_stderr": 0.028332397483664278, + "acc_norm": 0.5718954248366013, + "acc_norm_stderr": 0.028332397483664278 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.6205787781350482, + "acc_stderr": 0.027559949802347824, + "acc_norm": 0.6205787781350482, + "acc_norm_stderr": 0.027559949802347824 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.6419753086419753, + "acc_stderr": 0.0266756119260371, + "acc_norm": 0.6419753086419753, + "acc_norm_stderr": 0.0266756119260371 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.38652482269503546, + "acc_stderr": 0.029049190342543465, + "acc_norm": 0.38652482269503546, + "acc_norm_stderr": 0.029049190342543465 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.38722294654498046, + "acc_stderr": 0.012441155326854922, + "acc_norm": 0.38722294654498046, + "acc_norm_stderr": 0.012441155326854922 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.4852941176470588, + "acc_stderr": 
0.03035969707904611, + "acc_norm": 0.4852941176470588, + "acc_norm_stderr": 0.03035969707904611 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.5392156862745098, + "acc_stderr": 0.020165523313907904, + "acc_norm": 0.5392156862745098, + "acc_norm_stderr": 0.020165523313907904 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.6272727272727273, + "acc_stderr": 0.04631381319425465, + "acc_norm": 0.6272727272727273, + "acc_norm_stderr": 0.04631381319425465 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.5795918367346938, + "acc_stderr": 0.03160106993449601, + "acc_norm": 0.5795918367346938, + "acc_norm_stderr": 0.03160106993449601 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.6865671641791045, + "acc_stderr": 0.03280188205348643, + "acc_norm": 0.6865671641791045, + "acc_norm_stderr": 0.03280188205348643 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.78, + "acc_stderr": 0.04163331998932263, + "acc_norm": 0.78, + "acc_norm_stderr": 0.04163331998932263 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.46987951807228917, + "acc_stderr": 0.03885425420866767, + "acc_norm": 0.46987951807228917, + "acc_norm_stderr": 0.03885425420866767 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.7192982456140351, + "acc_stderr": 0.034462962170884265, + "acc_norm": 0.7192982456140351, + "acc_norm_stderr": 0.034462962170884265 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.3157894736842105, + "mc1_stderr": 0.016272287957916912, + "mc2": 0.446804769719196, + "mc2_stderr": 0.015255669220773822 + }, + "all": { + "acc": 0.5331764578355961, + "acc_stderr": 0.034774593152638665, + "acc_norm": 0.5374060816264686, + "acc_norm_stderr": 0.03475454236847059, + "mc1": 0.3157894736842105, + "mc1_stderr": 0.016272287957916912, + "mc2": 0.446804769719196, + "mc2_stderr": 0.015255669220773822 + } + }, + "versions": { + "harness|arc:challenge|25": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + 
"harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "all": 0 + }, + "config_tasks": { + "harness|arc:challenge": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + 
"harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task" + }, + "summary_tasks": { + "harness|arc:challenge|25": { + "hashes": { + "hash_examples": "17b0cae357c0259e", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "3722289b79076c44", + "hash_cont_tokens": "8210decc6ff6f7df" + }, + "truncated": 0, + "non-truncated": 4687, + "padded": 4687, + "non-padded": 0, + "effective_few_shots": 25.0, + "num_truncated_few_shots": 0 + }, + "harness|hellaswag|10": { + "hashes": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "ececd684171f1ef2", + "hash_cont_tokens": "b3b9e9017afa63af" + }, + "truncated": 0, + "non-truncated": 40168, + "padded": 40113, + "non-padded": 55, + "effective_few_shots": 10.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hashes": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "c54ff61ad0273dd7", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-anatomy|5": { + "hashes": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "be31a1e22aef5f90", + "hash_cont_tokens": "f11971a765cb609f" + }, + "truncated": 0, + "non-truncated": 540, + "padded": 540, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-astronomy|5": { + "hashes": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "277a7b1fad566940", + "hash_cont_tokens": "bf30e5d3f48250cb" + }, + "truncated": 0, + "non-truncated": 608, + "padded": 608, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + 
"harness|hendrycksTest-business_ethics|5": { + "hashes": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "ba552605bc116de5", + "hash_cont_tokens": "bc1dd9b2d995eb61" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hashes": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "428c7563d0b98ab9", + "hash_cont_tokens": "890a119624b3b935" + }, + "truncated": 0, + "non-truncated": 1060, + "padded": 1060, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_biology|5": { + "hashes": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "da036601573942e2", + "hash_cont_tokens": "875cde3af7a0ee14" + }, + "truncated": 0, + "non-truncated": 576, + "padded": 576, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_chemistry|5": { + "hashes": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "94e0196d6aded13d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_computer_science|5": { + "hashes": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "6e4d0f4a8d36690b", + "hash_cont_tokens": "ffc0fe414cdc4a83" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_mathematics|5": { + "hashes": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "614054d17109a25d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_medicine|5": { + "hashes": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "081bb2b524defd1c", + "hash_cont_tokens": "1f88b00d41957d82" + }, + "truncated": 0, + "non-truncated": 692, + "padded": 692, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_physics|5": { + "hashes": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "5421d9a1af86cbd4", + "hash_cont_tokens": "f7b8097afc16a47c" + }, + "truncated": 0, + "non-truncated": 408, + "padded": 408, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-computer_security|5": { + "hashes": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "5e6b70ecb333cf18", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hashes": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + 
"hash_input_tokens": "c2ef11a87264ceed", + "hash_cont_tokens": "aa0e8bc655f2f641" + }, + "truncated": 0, + "non-truncated": 940, + "padded": 940, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-econometrics|5": { + "hashes": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "ecaccd912a4c3978", + "hash_cont_tokens": "bfb7e3c3c88313f1" + }, + "truncated": 0, + "non-truncated": 456, + "padded": 456, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hashes": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "1590c84291399be8", + "hash_cont_tokens": "2425a3f084a591ef" + }, + "truncated": 0, + "non-truncated": 580, + "padded": 580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hashes": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "3269597f715b0da1", + "hash_cont_tokens": "f52691aef15a407b" + }, + "truncated": 0, + "non-truncated": 1512, + "padded": 1512, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-formal_logic|5": { + "hashes": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "a2800d20f3ab8d7c", + "hash_cont_tokens": "f515d598d9c21263" + }, + "truncated": 0, + "non-truncated": 504, + "padded": 504, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-global_facts|5": { + "hashes": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "94ed44b3772505ad", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_biology|5": { + "hashes": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "24423acb928db768", + "hash_cont_tokens": "bd85a4156a3613ee" + }, + "truncated": 0, + "non-truncated": 1240, + "padded": 1240, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hashes": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "831ff35c474e5cef", + "hash_cont_tokens": "a95c97af1c14e068" + }, + "truncated": 0, + "non-truncated": 812, + "padded": 812, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hashes": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "a20a96b44dcc5b30", + "hash_cont_tokens": "8abfedef914e33c9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hashes": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "5002f4ac8b1562ca", + "hash_cont_tokens": "674fc454bdc5ac93" + }, + "truncated": 0, + "non-truncated": 
660, + "padded": 656, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_geography|5": { + "hashes": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "7c5547c7da5bc793", + "hash_cont_tokens": "03a5012b916274ea" + }, + "truncated": 0, + "non-truncated": 792, + "padded": 792, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hashes": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "f62991cb6a496b05", + "hash_cont_tokens": "a83effb8f76b7d7c" + }, + "truncated": 0, + "non-truncated": 772, + "padded": 772, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hashes": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "4cef2aff6e3d59ed", + "hash_cont_tokens": "c583432ad27fcfe0" + }, + "truncated": 0, + "non-truncated": 1560, + "padded": 1560, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hashes": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "6e2577ea4082ed2b", + "hash_cont_tokens": "24f5dc613660300b" + }, + "truncated": 0, + "non-truncated": 1080, + "padded": 1080, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hashes": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "c5fc9aeb1079c8e4", + "hash_cont_tokens": "f47f041de50333b9" + }, + "truncated": 0, + "non-truncated": 952, + "padded": 952, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_physics|5": { + "hashes": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "555fc385cffa84ca", + "hash_cont_tokens": "ba2efcd283e938cc" + }, + "truncated": 0, + "non-truncated": 604, + "padded": 604, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hashes": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "febd23cbf9973b7f", + "hash_cont_tokens": "942069cd363844d9" + }, + "truncated": 0, + "non-truncated": 2180, + "padded": 2180, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hashes": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "400e55b56ee6fbd7", + "hash_cont_tokens": "955ed42b6f7fa019" + }, + "truncated": 0, + "non-truncated": 864, + "padded": 864, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hashes": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "c639cce12a46ebad", + "hash_cont_tokens": "cdd0b3dc06d933e5" + }, + "truncated": 0, + "non-truncated": 816, + "padded": 816, + "non-padded": 0, + "effective_few_shots": 5.0, + 
"num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hashes": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "b9762065cce6f3a6", + "hash_cont_tokens": "9a864184946033ac" + }, + "truncated": 0, + "non-truncated": 948, + "padded": 948, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_aging|5": { + "hashes": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "541a75f071dcf579", + "hash_cont_tokens": "142a4a8a1138a214" + }, + "truncated": 0, + "non-truncated": 892, + "padded": 892, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_sexuality|5": { + "hashes": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "04269e5c5a257dd9", + "hash_cont_tokens": "bc54813e809b796d" + }, + "truncated": 0, + "non-truncated": 524, + "padded": 524, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-international_law|5": { + "hashes": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "d93ba9d9d38e4397", + "hash_cont_tokens": "dc45b45fcda18e5d" + }, + "truncated": 0, + "non-truncated": 484, + "padded": 484, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-jurisprudence|5": { + "hashes": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "9eeaccd2698b4f5a", + "hash_cont_tokens": "e3a8cd951b6e3469" + }, + "truncated": 0, + "non-truncated": 432, + "padded": 432, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hashes": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "b4f08f544f2b7576", + "hash_cont_tokens": "1e80dbd30f6453d5" + }, + "truncated": 0, + "non-truncated": 652, + "padded": 648, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-machine_learning|5": { + "hashes": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "900c2a51f1174b9f", + "hash_cont_tokens": "9b37da7777378ca9" + }, + "truncated": 0, + "non-truncated": 448, + "padded": 448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-management|5": { + "hashes": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "6b36efb4689c6eca", + "hash_cont_tokens": "a01d6d39a83c4597" + }, + "truncated": 0, + "non-truncated": 412, + "padded": 412, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-marketing|5": { + "hashes": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "2aaac78a0cfed47a", + "hash_cont_tokens": "6aeaed4d823c98aa" + }, + "truncated": 0, + "non-truncated": 936, + "padded": 936, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-medical_genetics|5": { + "hashes": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": 
"202139046daa118f", + "hash_input_tokens": "886ca823b41c094a", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-miscellaneous|5": { + "hashes": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "72fd71de7675e7d0", + "hash_cont_tokens": "9b0ab02a64603081" + }, + "truncated": 0, + "non-truncated": 3132, + "padded": 3132, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_disputes|5": { + "hashes": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "f3ca0dd8e7a1eb09", + "hash_cont_tokens": "8badf768f7b0467a" + }, + "truncated": 0, + "non-truncated": 1384, + "padded": 1354, + "non-padded": 30, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hashes": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "3e793631e951f23c", + "hash_cont_tokens": "32ae620376b2bbba" + }, + "truncated": 0, + "non-truncated": 3580, + "padded": 3580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-nutrition|5": { + "hashes": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "59753c2144ea93af", + "hash_cont_tokens": "3071def75bacc404" + }, + "truncated": 0, + "non-truncated": 1224, + "padded": 1224, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-philosophy|5": { + "hashes": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "bd8d3dbed15a8c34", + "hash_cont_tokens": "9f6ff69d23a48783" + }, + "truncated": 0, + "non-truncated": 1244, + "padded": 1244, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-prehistory|5": { + "hashes": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "3573cd87facbb7c5", + "hash_cont_tokens": "de469d2b981e32a3" + }, + "truncated": 0, + "non-truncated": 1296, + "padded": 1296, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_accounting|5": { + "hashes": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "17e721bc1a7cbb47", + "hash_cont_tokens": "c46f74d2dfc7b13b" + }, + "truncated": 0, + "non-truncated": 1128, + "padded": 1128, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_law|5": { + "hashes": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "c9f7583fff66d361", + "hash_cont_tokens": "2e590029ef41fbcd" + }, + "truncated": 0, + "non-truncated": 6136, + "padded": 6136, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_medicine|5": { + "hashes": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "40a933f829116f8d", + "hash_cont_tokens": "fe35cfa9c6ca802e" + }, + "truncated": 0, + "non-truncated": 1088, + 
"padded": 1088, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_psychology|5": { + "hashes": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "0dfb73a8eb3f692c", + "hash_cont_tokens": "f020fbddf72c8652" + }, + "truncated": 0, + "non-truncated": 2448, + "padded": 2448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-public_relations|5": { + "hashes": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "1710c6ba4c9f3cbd", + "hash_cont_tokens": "568f585a259965c1" + }, + "truncated": 0, + "non-truncated": 440, + "padded": 440, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-security_studies|5": { + "hashes": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "32a03f1f22a6e103", + "hash_cont_tokens": "cc6fd7cccd64cd5d" + }, + "truncated": 0, + "non-truncated": 980, + "padded": 980, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-sociology|5": { + "hashes": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "828999f7624cbe7e", + "hash_cont_tokens": "c3a3bdfd177eed5b" + }, + "truncated": 0, + "non-truncated": 804, + "padded": 804, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hashes": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "42054621e718dbee", + "hash_cont_tokens": "2568d0e8e36fa959" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-virology|5": { + "hashes": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "6c4f0aa4dc859c04", + "hash_cont_tokens": "926cf60b0891f374" + }, + "truncated": 0, + "non-truncated": 664, + "padded": 664, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-world_religions|5": { + "hashes": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "6c75d44e092ff24f", + "hash_cont_tokens": "c525a5de974c1ea3" + }, + "truncated": 0, + "non-truncated": 684, + "padded": 684, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|truthfulqa:mc|0": { + "hashes": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "2738d7ed7075faa7", + "hash_cont_tokens": "c014154380b74b9e" + }, + "truncated": 0, + "non-truncated": 9996, + "padded": 9996, + "non-padded": 0, + "effective_few_shots": 0.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "d84d18e9a963753d", + "hash_full_prompts": "12b540783521a8e6", + "hash_input_tokens": "5c73a7dce6ccf737", + "hash_cont_tokens": "fb1646e2bdd5fc38" + }, + "total_evaluation_time_secondes": "9943.63177227974", + "truncated": 0, + "non-truncated": 111019, + "padded": 110926, + "non-padded": 93, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git 
a/eval-results/TheBloke/Airoboros-L2-13B-2.1-GPTQ/results_2023-09-22T16-15-21.953879.json b/eval-results/TheBloke/Airoboros-L2-13B-2.1-GPTQ/results_2023-09-22T16-15-21.953879.json new file mode 100644 index 0000000000000000000000000000000000000000..d8ccda9ecba455833e1abb2c4288ef1a1934f877 --- /dev/null +++ b/eval-results/TheBloke/Airoboros-L2-13B-2.1-GPTQ/results_2023-09-22T16-15-21.953879.json @@ -0,0 +1,107 @@ +{ + "config_general": { + "model_name": "TheBloke/Airoboros-L2-13B-2.1-GPTQ", + "model_sha": "8c8bfd00adea48d93966549357607a07816251b3", + "model_size": "6.84 GB", + "model_dtype": "None", + "lighteval_sha": "0f318ecf002208468154899217b3ba7c6ae09374", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|drop|3": { + "em": 0.36325503355704697, + "em_stderr": 0.004925249459609538, + "f1": 0.4313957634228207, + "f1_stderr": 0.004768180025704384 + }, + "harness|gsm8k|5": { + "acc": 0.05989385898407885, + "acc_stderr": 0.006536148151288716 + }, + "harness|winogrande|5": { + "acc": 0.7434885556432518, + "acc_stderr": 0.012273648008759982 + }, + "all": { + "em": 0.36325503355704697, + "em_stderr": 0.004925249459609538, + "f1": 0.4313957634228207, + "f1_stderr": 0.004768180025704384, + "acc": 0.4016912073136653, + "acc_stderr": 0.00940489808002435 + } + }, + "versions": { + "harness|drop|3": 1, + "harness|gsm8k|5": 0, + "harness|winogrande|5": 0, + "all": 0 + }, + "config_tasks": { + "harness|drop": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|drop|3": { + "hashes": { + "hash_examples": "1d27416e8324e9a3", + "hash_full_prompts": "a5513ff9a741b385", + "hash_input_tokens": "42076f0efbb50aa6", + "hash_cont_tokens": "7ef5b9e36fc17173" + }, + "truncated": 3, + "non-truncated": 9533, + "padded": 0, + "non-padded": 9536, + "effective_few_shots": 3.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "bda342e47b5099b2", + "hash_cont_tokens": "c337b4538b40177a" + }, + "truncated": 0, + "non-truncated": 1319, + "padded": 0, + "non-padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "c0bedf98cb040854", + "hash_cont_tokens": "f08975ad6f2d5864" + }, + "truncated": 0, + "non-truncated": 2534, + "padded": 2432, + "non-padded": 102, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "9b4d8993161e637d", + "hash_full_prompts": "08215e527b7e60a5", + "hash_input_tokens": "a12f3e3c934bd78b", + "hash_cont_tokens": "abdea30e68618ba5" + }, + "total_evaluation_time_secondes": "4567.207172393799", + "truncated": 3, + "non-truncated": 13386, + "padded": 2432, + "non-padded": 10957, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/TheBloke/Airoboros-L2-70B-2.1-GPTQ/results_2023-09-01T04-12-47.380452.json b/eval-results/TheBloke/Airoboros-L2-70B-2.1-GPTQ/results_2023-09-01T04-12-47.380452.json new file mode 100644 index 0000000000000000000000000000000000000000..9dfbb7f9fd398a7fafcd4d7a29ab91f535bd14ad --- /dev/null +++ b/eval-results/TheBloke/Airoboros-L2-70B-2.1-GPTQ/results_2023-09-01T04-12-47.380452.json @@ -0,0 
+1,1366 @@ +{ + "config_general": { + "model_name": "TheBloke/Airoboros-L2-70B-2.1-GPTQ", + "model_sha": "23ed580cb77ebaee49ea11eb4538fd3ab3795b76", + "model_dtype": "None", + "lighteval_sha": "4b9a38c2102259daeb17d1d0294fc2b4ce7f4e63", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|arc:challenge|25": { + "acc": 0.6697952218430034, + "acc_stderr": 0.013743085603760427, + "acc_norm": 0.7039249146757679, + "acc_norm_stderr": 0.013340916085246261 + }, + "harness|hellaswag|10": { + "acc": 0.6748655646285601, + "acc_stderr": 0.004674677287148612, + "acc_norm": 0.8653654650468035, + "acc_norm_stderr": 0.003406352071341718 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.3, + "acc_stderr": 0.046056618647183814, + "acc_norm": 0.3, + "acc_norm_stderr": 0.046056618647183814 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.562962962962963, + "acc_stderr": 0.042849586397534015, + "acc_norm": 0.562962962962963, + "acc_norm_stderr": 0.042849586397534015 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.8223684210526315, + "acc_stderr": 0.031103182383123387, + "acc_norm": 0.8223684210526315, + "acc_norm_stderr": 0.031103182383123387 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.75, + "acc_stderr": 0.04351941398892446, + "acc_norm": 0.75, + "acc_norm_stderr": 0.04351941398892446 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.7283018867924528, + "acc_stderr": 0.027377706624670713, + "acc_norm": 0.7283018867924528, + "acc_norm_stderr": 0.027377706624670713 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.8402777777777778, + "acc_stderr": 0.030635578972093274, + "acc_norm": 0.8402777777777778, + "acc_norm_stderr": 0.030635578972093274 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.47, + "acc_stderr": 0.05016135580465919, + "acc_norm": 0.47, + "acc_norm_stderr": 0.05016135580465919 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.54, + "acc_stderr": 0.05009082659620332, + "acc_norm": 0.54, + "acc_norm_stderr": 0.05009082659620332 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.38, + "acc_stderr": 0.04878317312145633, + "acc_norm": 0.38, + "acc_norm_stderr": 0.04878317312145633 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.6763005780346821, + "acc_stderr": 0.0356760379963917, + "acc_norm": 0.6763005780346821, + "acc_norm_stderr": 0.0356760379963917 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.38235294117647056, + "acc_stderr": 0.04835503696107223, + "acc_norm": 0.38235294117647056, + "acc_norm_stderr": 0.04835503696107223 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.71, + "acc_stderr": 0.04560480215720685, + "acc_norm": 0.71, + "acc_norm_stderr": 0.04560480215720685 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.6851063829787234, + "acc_stderr": 0.030363582197238167, + "acc_norm": 0.6851063829787234, + "acc_norm_stderr": 0.030363582197238167 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.41228070175438597, + "acc_stderr": 0.046306532033665956, + "acc_norm": 0.41228070175438597, + "acc_norm_stderr": 0.046306532033665956 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.5862068965517241, + "acc_stderr": 0.04104269211806232, + "acc_norm": 0.5862068965517241, + "acc_norm_stderr": 0.04104269211806232 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 
0.41798941798941797, + "acc_stderr": 0.025402555503260912, + "acc_norm": 0.41798941798941797, + "acc_norm_stderr": 0.025402555503260912 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.5, + "acc_stderr": 0.04472135954999579, + "acc_norm": 0.5, + "acc_norm_stderr": 0.04472135954999579 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.48, + "acc_stderr": 0.050211673156867795, + "acc_norm": 0.48, + "acc_norm_stderr": 0.050211673156867795 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.8451612903225807, + "acc_stderr": 0.020579287326583227, + "acc_norm": 0.8451612903225807, + "acc_norm_stderr": 0.020579287326583227 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.4975369458128079, + "acc_stderr": 0.03517945038691063, + "acc_norm": 0.4975369458128079, + "acc_norm_stderr": 0.03517945038691063 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.74, + "acc_stderr": 0.0440844002276808, + "acc_norm": 0.74, + "acc_norm_stderr": 0.0440844002276808 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.806060606060606, + "acc_stderr": 0.030874145136562094, + "acc_norm": 0.806060606060606, + "acc_norm_stderr": 0.030874145136562094 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.8585858585858586, + "acc_stderr": 0.02482590979334334, + "acc_norm": 0.8585858585858586, + "acc_norm_stderr": 0.02482590979334334 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.9378238341968912, + "acc_stderr": 0.01742697415424053, + "acc_norm": 0.9378238341968912, + "acc_norm_stderr": 0.01742697415424053 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.7333333333333333, + "acc_stderr": 0.02242127361292371, + "acc_norm": 0.7333333333333333, + "acc_norm_stderr": 0.02242127361292371 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.36666666666666664, + "acc_stderr": 0.029381620726465076, + "acc_norm": 0.36666666666666664, + "acc_norm_stderr": 0.029381620726465076 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.7563025210084033, + "acc_stderr": 0.027886828078380572, + "acc_norm": 0.7563025210084033, + "acc_norm_stderr": 0.027886828078380572 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.4304635761589404, + "acc_stderr": 0.04042809961395634, + "acc_norm": 0.4304635761589404, + "acc_norm_stderr": 0.04042809961395634 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.8862385321100917, + "acc_stderr": 0.013613614800232801, + "acc_norm": 0.8862385321100917, + "acc_norm_stderr": 0.013613614800232801 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.5833333333333334, + "acc_stderr": 0.03362277436608043, + "acc_norm": 0.5833333333333334, + "acc_norm_stderr": 0.03362277436608043 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.9019607843137255, + "acc_stderr": 0.020871118455552104, + "acc_norm": 0.9019607843137255, + "acc_norm_stderr": 0.020871118455552104 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.8818565400843882, + "acc_stderr": 0.021011052659878456, + "acc_norm": 0.8818565400843882, + "acc_norm_stderr": 0.021011052659878456 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.7847533632286996, + "acc_stderr": 0.027584066602208277, + "acc_norm": 0.7847533632286996, + "acc_norm_stderr": 0.027584066602208277 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.8320610687022901, + 
"acc_stderr": 0.032785485373431386, + "acc_norm": 0.8320610687022901, + "acc_norm_stderr": 0.032785485373431386 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.8842975206611571, + "acc_stderr": 0.029199802455622804, + "acc_norm": 0.8842975206611571, + "acc_norm_stderr": 0.029199802455622804 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.8333333333333334, + "acc_stderr": 0.03602814176392645, + "acc_norm": 0.8333333333333334, + "acc_norm_stderr": 0.03602814176392645 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.803680981595092, + "acc_stderr": 0.031207970394709218, + "acc_norm": 0.803680981595092, + "acc_norm_stderr": 0.031207970394709218 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.5, + "acc_stderr": 0.04745789978762494, + "acc_norm": 0.5, + "acc_norm_stderr": 0.04745789978762494 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.7961165048543689, + "acc_stderr": 0.0398913985953177, + "acc_norm": 0.7961165048543689, + "acc_norm_stderr": 0.0398913985953177 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.8717948717948718, + "acc_stderr": 0.021901905115073325, + "acc_norm": 0.8717948717948718, + "acc_norm_stderr": 0.021901905115073325 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.69, + "acc_stderr": 0.04648231987117316, + "acc_norm": 0.69, + "acc_norm_stderr": 0.04648231987117316 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.8480204342273308, + "acc_stderr": 0.012837852506645214, + "acc_norm": 0.8480204342273308, + "acc_norm_stderr": 0.012837852506645214 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.7687861271676301, + "acc_stderr": 0.022698657167855713, + "acc_norm": 0.7687861271676301, + "acc_norm_stderr": 0.022698657167855713 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.4960893854748603, + "acc_stderr": 0.016721990073156657, + "acc_norm": 0.4960893854748603, + "acc_norm_stderr": 0.016721990073156657 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.7679738562091504, + "acc_stderr": 0.024170840879340873, + "acc_norm": 0.7679738562091504, + "acc_norm_stderr": 0.024170840879340873 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.7813504823151125, + "acc_stderr": 0.02347558141786111, + "acc_norm": 0.7813504823151125, + "acc_norm_stderr": 0.02347558141786111 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.8055555555555556, + "acc_stderr": 0.022021366100220194, + "acc_norm": 0.8055555555555556, + "acc_norm_stderr": 0.022021366100220194 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.5567375886524822, + "acc_stderr": 0.02963483847376601, + "acc_norm": 0.5567375886524822, + "acc_norm_stderr": 0.02963483847376601 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.5521512385919165, + "acc_stderr": 0.012700582404768233, + "acc_norm": 0.5521512385919165, + "acc_norm_stderr": 0.012700582404768233 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.7352941176470589, + "acc_stderr": 0.02679956202488766, + "acc_norm": 0.7352941176470589, + "acc_norm_stderr": 0.02679956202488766 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.7581699346405228, + "acc_stderr": 0.017322789207784326, + "acc_norm": 0.7581699346405228, + "acc_norm_stderr": 0.017322789207784326 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.7181818181818181, + "acc_stderr": 0.04309118709946458, + "acc_norm": 0.7181818181818181, + "acc_norm_stderr": 0.04309118709946458 + }, + 
"harness|hendrycksTest-security_studies|5": { + "acc": 0.7959183673469388, + "acc_stderr": 0.02580128347509049, + "acc_norm": 0.7959183673469388, + "acc_norm_stderr": 0.02580128347509049 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.900497512437811, + "acc_stderr": 0.021166216304659393, + "acc_norm": 0.900497512437811, + "acc_norm_stderr": 0.021166216304659393 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.94, + "acc_stderr": 0.023868325657594197, + "acc_norm": 0.94, + "acc_norm_stderr": 0.023868325657594197 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.5421686746987951, + "acc_stderr": 0.0387862677100236, + "acc_norm": 0.5421686746987951, + "acc_norm_stderr": 0.0387862677100236 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.8362573099415205, + "acc_stderr": 0.028380919596145866, + "acc_norm": 0.8362573099415205, + "acc_norm_stderr": 0.028380919596145866 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.3880048959608323, + "mc1_stderr": 0.017058761501347972, + "mc2": 0.5554952598764649, + "mc2_stderr": 0.014960518140869628 + }, + "all": { + "acc": 0.6883614003862496, + "acc_stderr": 0.03120175044911289, + "acc_norm": 0.6921686816277922, + "acc_norm_stderr": 0.03117343697903965, + "mc1": 0.3880048959608323, + "mc1_stderr": 0.017058761501347972, + "mc2": 0.5554952598764649, + "mc2_stderr": 0.014960518140869628 + } + }, + "versions": { + "harness|arc:challenge|25": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + 
"harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "all": 0 + }, + "config_tasks": { + "harness|arc:challenge": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + 
"harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task" + }, + "summary_tasks": { + "harness|arc:challenge|25": { + "hashes": { + "hash_examples": "17b0cae357c0259e", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "3722289b79076c44", + "hash_cont_tokens": "8210decc6ff6f7df" + }, + "truncated": 0, + "non-truncated": 4687, + "padded": 4687, + "non-padded": 0, + "effective_few_shots": 25.0, + "num_truncated_few_shots": 0 + }, + "harness|hellaswag|10": { + "hashes": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "ececd684171f1ef2", + "hash_cont_tokens": "b3b9e9017afa63af" + }, + "truncated": 0, + "non-truncated": 40168, + "padded": 40113, + "non-padded": 55, + "effective_few_shots": 10.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hashes": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "c54ff61ad0273dd7", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-anatomy|5": { + "hashes": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "be31a1e22aef5f90", + "hash_cont_tokens": "f11971a765cb609f" + }, + "truncated": 0, + "non-truncated": 540, + "padded": 540, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-astronomy|5": { + "hashes": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "277a7b1fad566940", + "hash_cont_tokens": "bf30e5d3f48250cb" + }, + "truncated": 0, + "non-truncated": 608, + "padded": 608, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-business_ethics|5": { + "hashes": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "ba552605bc116de5", + "hash_cont_tokens": "bc1dd9b2d995eb61" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hashes": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + 
"hash_input_tokens": "428c7563d0b98ab9", + "hash_cont_tokens": "890a119624b3b935" + }, + "truncated": 0, + "non-truncated": 1060, + "padded": 1060, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_biology|5": { + "hashes": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "da036601573942e2", + "hash_cont_tokens": "875cde3af7a0ee14" + }, + "truncated": 0, + "non-truncated": 576, + "padded": 576, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_chemistry|5": { + "hashes": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "94e0196d6aded13d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_computer_science|5": { + "hashes": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "6e4d0f4a8d36690b", + "hash_cont_tokens": "ffc0fe414cdc4a83" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_mathematics|5": { + "hashes": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "614054d17109a25d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_medicine|5": { + "hashes": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "081bb2b524defd1c", + "hash_cont_tokens": "1f88b00d41957d82" + }, + "truncated": 0, + "non-truncated": 692, + "padded": 692, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_physics|5": { + "hashes": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "5421d9a1af86cbd4", + "hash_cont_tokens": "f7b8097afc16a47c" + }, + "truncated": 0, + "non-truncated": 408, + "padded": 408, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-computer_security|5": { + "hashes": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "5e6b70ecb333cf18", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hashes": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "c2ef11a87264ceed", + "hash_cont_tokens": "aa0e8bc655f2f641" + }, + "truncated": 0, + "non-truncated": 940, + "padded": 940, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-econometrics|5": { + "hashes": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "ecaccd912a4c3978", + "hash_cont_tokens": "bfb7e3c3c88313f1" + }, + "truncated": 0, + "non-truncated": 456, + "padded": 456, + 
"non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hashes": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "1590c84291399be8", + "hash_cont_tokens": "2425a3f084a591ef" + }, + "truncated": 0, + "non-truncated": 580, + "padded": 580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hashes": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "3269597f715b0da1", + "hash_cont_tokens": "f52691aef15a407b" + }, + "truncated": 0, + "non-truncated": 1512, + "padded": 1512, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-formal_logic|5": { + "hashes": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "a2800d20f3ab8d7c", + "hash_cont_tokens": "f515d598d9c21263" + }, + "truncated": 0, + "non-truncated": 504, + "padded": 504, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-global_facts|5": { + "hashes": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "94ed44b3772505ad", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_biology|5": { + "hashes": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "24423acb928db768", + "hash_cont_tokens": "bd85a4156a3613ee" + }, + "truncated": 0, + "non-truncated": 1240, + "padded": 1240, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hashes": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "831ff35c474e5cef", + "hash_cont_tokens": "a95c97af1c14e068" + }, + "truncated": 0, + "non-truncated": 812, + "padded": 812, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hashes": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "a20a96b44dcc5b30", + "hash_cont_tokens": "8abfedef914e33c9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hashes": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "5002f4ac8b1562ca", + "hash_cont_tokens": "674fc454bdc5ac93" + }, + "truncated": 0, + "non-truncated": 660, + "padded": 656, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_geography|5": { + "hashes": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "7c5547c7da5bc793", + "hash_cont_tokens": "03a5012b916274ea" + }, + "truncated": 0, + "non-truncated": 792, + "padded": 792, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + 
"harness|hendrycksTest-high_school_government_and_politics|5": { + "hashes": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "f62991cb6a496b05", + "hash_cont_tokens": "a83effb8f76b7d7c" + }, + "truncated": 0, + "non-truncated": 772, + "padded": 772, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hashes": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "4cef2aff6e3d59ed", + "hash_cont_tokens": "c583432ad27fcfe0" + }, + "truncated": 0, + "non-truncated": 1560, + "padded": 1560, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hashes": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "6e2577ea4082ed2b", + "hash_cont_tokens": "24f5dc613660300b" + }, + "truncated": 0, + "non-truncated": 1080, + "padded": 1080, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hashes": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "c5fc9aeb1079c8e4", + "hash_cont_tokens": "f47f041de50333b9" + }, + "truncated": 0, + "non-truncated": 952, + "padded": 952, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_physics|5": { + "hashes": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "555fc385cffa84ca", + "hash_cont_tokens": "ba2efcd283e938cc" + }, + "truncated": 0, + "non-truncated": 604, + "padded": 604, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hashes": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "febd23cbf9973b7f", + "hash_cont_tokens": "942069cd363844d9" + }, + "truncated": 0, + "non-truncated": 2180, + "padded": 2180, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hashes": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "400e55b56ee6fbd7", + "hash_cont_tokens": "955ed42b6f7fa019" + }, + "truncated": 0, + "non-truncated": 864, + "padded": 864, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hashes": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "c639cce12a46ebad", + "hash_cont_tokens": "cdd0b3dc06d933e5" + }, + "truncated": 0, + "non-truncated": 816, + "padded": 816, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hashes": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "b9762065cce6f3a6", + "hash_cont_tokens": "9a864184946033ac" + }, + "truncated": 0, + "non-truncated": 948, + "padded": 948, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_aging|5": { + "hashes": { + "hash_examples": 
"1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "541a75f071dcf579", + "hash_cont_tokens": "142a4a8a1138a214" + }, + "truncated": 0, + "non-truncated": 892, + "padded": 892, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_sexuality|5": { + "hashes": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "04269e5c5a257dd9", + "hash_cont_tokens": "bc54813e809b796d" + }, + "truncated": 0, + "non-truncated": 524, + "padded": 524, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-international_law|5": { + "hashes": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "d93ba9d9d38e4397", + "hash_cont_tokens": "dc45b45fcda18e5d" + }, + "truncated": 0, + "non-truncated": 484, + "padded": 484, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-jurisprudence|5": { + "hashes": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "9eeaccd2698b4f5a", + "hash_cont_tokens": "e3a8cd951b6e3469" + }, + "truncated": 0, + "non-truncated": 432, + "padded": 432, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hashes": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "b4f08f544f2b7576", + "hash_cont_tokens": "1e80dbd30f6453d5" + }, + "truncated": 0, + "non-truncated": 652, + "padded": 648, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-machine_learning|5": { + "hashes": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "900c2a51f1174b9f", + "hash_cont_tokens": "9b37da7777378ca9" + }, + "truncated": 0, + "non-truncated": 448, + "padded": 448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-management|5": { + "hashes": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "6b36efb4689c6eca", + "hash_cont_tokens": "a01d6d39a83c4597" + }, + "truncated": 0, + "non-truncated": 412, + "padded": 412, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-marketing|5": { + "hashes": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "2aaac78a0cfed47a", + "hash_cont_tokens": "6aeaed4d823c98aa" + }, + "truncated": 0, + "non-truncated": 936, + "padded": 936, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-medical_genetics|5": { + "hashes": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "886ca823b41c094a", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-miscellaneous|5": { + "hashes": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "72fd71de7675e7d0", + "hash_cont_tokens": "9b0ab02a64603081" + }, + "truncated": 0, + 
"non-truncated": 3132, + "padded": 3132, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_disputes|5": { + "hashes": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "f3ca0dd8e7a1eb09", + "hash_cont_tokens": "8badf768f7b0467a" + }, + "truncated": 0, + "non-truncated": 1384, + "padded": 1354, + "non-padded": 30, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hashes": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "3e793631e951f23c", + "hash_cont_tokens": "32ae620376b2bbba" + }, + "truncated": 0, + "non-truncated": 3580, + "padded": 3580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-nutrition|5": { + "hashes": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "59753c2144ea93af", + "hash_cont_tokens": "3071def75bacc404" + }, + "truncated": 0, + "non-truncated": 1224, + "padded": 1224, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-philosophy|5": { + "hashes": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "bd8d3dbed15a8c34", + "hash_cont_tokens": "9f6ff69d23a48783" + }, + "truncated": 0, + "non-truncated": 1244, + "padded": 1244, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-prehistory|5": { + "hashes": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "3573cd87facbb7c5", + "hash_cont_tokens": "de469d2b981e32a3" + }, + "truncated": 0, + "non-truncated": 1296, + "padded": 1296, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_accounting|5": { + "hashes": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "17e721bc1a7cbb47", + "hash_cont_tokens": "c46f74d2dfc7b13b" + }, + "truncated": 0, + "non-truncated": 1128, + "padded": 1128, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_law|5": { + "hashes": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "c9f7583fff66d361", + "hash_cont_tokens": "2e590029ef41fbcd" + }, + "truncated": 0, + "non-truncated": 6136, + "padded": 6136, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_medicine|5": { + "hashes": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "40a933f829116f8d", + "hash_cont_tokens": "fe35cfa9c6ca802e" + }, + "truncated": 0, + "non-truncated": 1088, + "padded": 1088, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_psychology|5": { + "hashes": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "0dfb73a8eb3f692c", + "hash_cont_tokens": "f020fbddf72c8652" + }, + "truncated": 0, + "non-truncated": 2448, + "padded": 2448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + 
"harness|hendrycksTest-public_relations|5": { + "hashes": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "1710c6ba4c9f3cbd", + "hash_cont_tokens": "568f585a259965c1" + }, + "truncated": 0, + "non-truncated": 440, + "padded": 440, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-security_studies|5": { + "hashes": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "32a03f1f22a6e103", + "hash_cont_tokens": "cc6fd7cccd64cd5d" + }, + "truncated": 0, + "non-truncated": 980, + "padded": 980, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-sociology|5": { + "hashes": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "828999f7624cbe7e", + "hash_cont_tokens": "c3a3bdfd177eed5b" + }, + "truncated": 0, + "non-truncated": 804, + "padded": 804, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hashes": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "42054621e718dbee", + "hash_cont_tokens": "2568d0e8e36fa959" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-virology|5": { + "hashes": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "6c4f0aa4dc859c04", + "hash_cont_tokens": "926cf60b0891f374" + }, + "truncated": 0, + "non-truncated": 664, + "padded": 664, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-world_religions|5": { + "hashes": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "6c75d44e092ff24f", + "hash_cont_tokens": "c525a5de974c1ea3" + }, + "truncated": 0, + "non-truncated": 684, + "padded": 684, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|truthfulqa:mc|0": { + "hashes": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "2738d7ed7075faa7", + "hash_cont_tokens": "c014154380b74b9e" + }, + "truncated": 0, + "non-truncated": 9996, + "padded": 9996, + "non-padded": 0, + "effective_few_shots": 0.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "d84d18e9a963753d", + "hash_full_prompts": "12b540783521a8e6", + "hash_input_tokens": "5c73a7dce6ccf737", + "hash_cont_tokens": "fb1646e2bdd5fc38" + }, + "total_evaluation_time_secondes": "50898.54520988464", + "truncated": 0, + "non-truncated": 111019, + "padded": 110926, + "non-padded": 93, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/TheBloke/Airoboros-L2-70B-2.1-GPTQ/results_2023-11-08T02-26-46.433766.json b/eval-results/TheBloke/Airoboros-L2-70B-2.1-GPTQ/results_2023-11-08T02-26-46.433766.json new file mode 100644 index 0000000000000000000000000000000000000000..8633480e92b58b1de2e447318c5be49f03fa29c4 --- /dev/null +++ b/eval-results/TheBloke/Airoboros-L2-70B-2.1-GPTQ/results_2023-11-08T02-26-46.433766.json @@ -0,0 +1,107 @@ +{ + "config_general": { + "lighteval_sha": "167773f1d5d1647c60dadc31c9e731ab7dbcbbad", + 
"num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "", + "model_name": "TheBloke/Airoboros-L2-70B-2.1-GPTQ", + "model_sha": "b2d813bbe0c0784d7c2ff2abd5aed89452f79eca", + "model_dtype": "None", + "model_size": "38.03 GB" + }, + "results": { + "harness|drop|3": { + "em": 0.4241820469798658, + "em_stderr": 0.0050612570385902955, + "f1": 0.5410476090604083, + "f1_stderr": 0.004613044422574753 + }, + "harness|gsm8k|5": { + "acc": 0.15238817285822592, + "acc_stderr": 0.009899572254794198 + }, + "harness|winogrande|5": { + "acc": 0.8161010260457774, + "acc_stderr": 0.010887916013305896 + }, + "all": { + "em": 0.4241820469798658, + "em_stderr": 0.0050612570385902955, + "f1": 0.5410476090604083, + "f1_stderr": 0.004613044422574753, + "acc": 0.48424459945200166, + "acc_stderr": 0.010393744134050047 + } + }, + "versions": { + "all": 0, + "harness|drop|3": 1, + "harness|gsm8k|5": 0, + "harness|winogrande|5": 0 + }, + "config_tasks": { + "harness|drop": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|drop|3": { + "hashes": { + "hash_examples": "1d27416e8324e9a3", + "hash_full_prompts": "a5513ff9a741b385", + "hash_input_tokens": "42076f0efbb50aa6", + "hash_cont_tokens": "7ccaadbdb7707c60" + }, + "truncated": 3, + "non_truncated": 9533, + "padded": 0, + "non_padded": 9536, + "effective_few_shots": 3.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "bda342e47b5099b2", + "hash_cont_tokens": "00fbbfa8820279e4" + }, + "truncated": 0, + "non_truncated": 1319, + "padded": 0, + "non_padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "c0bedf98cb040854", + "hash_cont_tokens": "f08975ad6f2d5864" + }, + "truncated": 0, + "non_truncated": 1267, + "padded": 2432, + "non_padded": 102, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "9b4d8993161e637d", + "hash_full_prompts": "08215e527b7e60a5", + "hash_input_tokens": "a12f3e3c934bd78b", + "hash_cont_tokens": "1d9d42ff18b0a09c" + }, + "truncated": 3, + "non_truncated": 12119, + "padded": 2432, + "non_padded": 10957, + "num_truncated_few_shots": 0, + "total_evaluation_time_secondes": 0 + } +} \ No newline at end of file diff --git a/eval-results/TheBloke/BigTranslate-13B-GPTQ/results_2023-10-03T16-49-43.978313.json b/eval-results/TheBloke/BigTranslate-13B-GPTQ/results_2023-10-03T16-49-43.978313.json new file mode 100644 index 0000000000000000000000000000000000000000..d54d39021656cdaf2984d99d3437d7dcce7f2234 --- /dev/null +++ b/eval-results/TheBloke/BigTranslate-13B-GPTQ/results_2023-10-03T16-49-43.978313.json @@ -0,0 +1,1367 @@ +{ + "config_general": { + "model_name": "TheBloke/BigTranslate-13B-GPTQ", + "model_sha": "f2968552d2f522023f3289747234aea5508980e2", + "model_size": "7.21 GB", + "model_dtype": "torch.float16", + "lighteval_sha": "0f318ecf002208468154899217b3ba7c6ae09374", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|arc:challenge|25": { + "acc": 0.4308873720136519, + "acc_stderr": 0.01447113339264247, + "acc_norm": 
0.45307167235494883, + "acc_norm_stderr": 0.01454689205200563 + }, + "harness|hellaswag|10": { + "acc": 0.5786695877315275, + "acc_stderr": 0.004927631806477558, + "acc_norm": 0.7510456084445329, + "acc_norm_stderr": 0.004315236154543959 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.29, + "acc_stderr": 0.045604802157206845, + "acc_norm": 0.29, + "acc_norm_stderr": 0.045604802157206845 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.2962962962962963, + "acc_stderr": 0.03944624162501117, + "acc_norm": 0.2962962962962963, + "acc_norm_stderr": 0.03944624162501117 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.23026315789473684, + "acc_stderr": 0.03426059424403165, + "acc_norm": 0.23026315789473684, + "acc_norm_stderr": 0.03426059424403165 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.4, + "acc_stderr": 0.04923659639173309, + "acc_norm": 0.4, + "acc_norm_stderr": 0.04923659639173309 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.2830188679245283, + "acc_stderr": 0.027724236492700904, + "acc_norm": 0.2830188679245283, + "acc_norm_stderr": 0.027724236492700904 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.3263888888888889, + "acc_stderr": 0.03921067198982266, + "acc_norm": 0.3263888888888889, + "acc_norm_stderr": 0.03921067198982266 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.3, + "acc_stderr": 0.046056618647183814, + "acc_norm": 0.3, + "acc_norm_stderr": 0.046056618647183814 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.35, + "acc_stderr": 0.0479372485441102, + "acc_norm": 0.35, + "acc_norm_stderr": 0.0479372485441102 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.24, + "acc_stderr": 0.04292346959909284, + "acc_norm": 0.24, + "acc_norm_stderr": 0.04292346959909284 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.2254335260115607, + "acc_stderr": 0.03186209851641144, + "acc_norm": 0.2254335260115607, + "acc_norm_stderr": 0.03186209851641144 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.23529411764705882, + "acc_stderr": 0.04220773659171452, + "acc_norm": 0.23529411764705882, + "acc_norm_stderr": 0.04220773659171452 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.37, + "acc_stderr": 0.04852365870939099, + "acc_norm": 0.37, + "acc_norm_stderr": 0.04852365870939099 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.3446808510638298, + "acc_stderr": 0.03106898596312215, + "acc_norm": 0.3446808510638298, + "acc_norm_stderr": 0.03106898596312215 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.21929824561403508, + "acc_stderr": 0.03892431106518753, + "acc_norm": 0.21929824561403508, + "acc_norm_stderr": 0.03892431106518753 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.2, + "acc_stderr": 0.03333333333333329, + "acc_norm": 0.2, + "acc_norm_stderr": 0.03333333333333329 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.23544973544973544, + "acc_stderr": 0.02185150982203172, + "acc_norm": 0.23544973544973544, + "acc_norm_stderr": 0.02185150982203172 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.2619047619047619, + "acc_stderr": 0.039325376803928724, + "acc_norm": 0.2619047619047619, + "acc_norm_stderr": 0.039325376803928724 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.23, + "acc_stderr": 0.04229525846816506, + "acc_norm": 0.23, + "acc_norm_stderr": 0.04229525846816506 + }, + 
"harness|hendrycksTest-high_school_biology|5": { + "acc": 0.26129032258064516, + "acc_stderr": 0.024993053397764812, + "acc_norm": 0.26129032258064516, + "acc_norm_stderr": 0.024993053397764812 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.1625615763546798, + "acc_stderr": 0.025960300064605576, + "acc_norm": 0.1625615763546798, + "acc_norm_stderr": 0.025960300064605576 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.33, + "acc_stderr": 0.047258156262526045, + "acc_norm": 0.33, + "acc_norm_stderr": 0.047258156262526045 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.3515151515151515, + "acc_stderr": 0.037282069986826503, + "acc_norm": 0.3515151515151515, + "acc_norm_stderr": 0.037282069986826503 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.23232323232323232, + "acc_stderr": 0.030088629490217487, + "acc_norm": 0.23232323232323232, + "acc_norm_stderr": 0.030088629490217487 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.43523316062176165, + "acc_stderr": 0.03578038165008585, + "acc_norm": 0.43523316062176165, + "acc_norm_stderr": 0.03578038165008585 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.3487179487179487, + "acc_stderr": 0.02416278028401772, + "acc_norm": 0.3487179487179487, + "acc_norm_stderr": 0.02416278028401772 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.2962962962962963, + "acc_stderr": 0.027840811495871927, + "acc_norm": 0.2962962962962963, + "acc_norm_stderr": 0.027840811495871927 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.3277310924369748, + "acc_stderr": 0.030489911417673227, + "acc_norm": 0.3277310924369748, + "acc_norm_stderr": 0.030489911417673227 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.25165562913907286, + "acc_stderr": 0.035433042343899844, + "acc_norm": 0.25165562913907286, + "acc_norm_stderr": 0.035433042343899844 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.3559633027522936, + "acc_stderr": 0.02052855927824422, + "acc_norm": 0.3559633027522936, + "acc_norm_stderr": 0.02052855927824422 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.3472222222222222, + "acc_stderr": 0.03246887243637649, + "acc_norm": 0.3472222222222222, + "acc_norm_stderr": 0.03246887243637649 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.3480392156862745, + "acc_stderr": 0.03343311240488419, + "acc_norm": 0.3480392156862745, + "acc_norm_stderr": 0.03343311240488419 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.37130801687763715, + "acc_stderr": 0.03145068600744859, + "acc_norm": 0.37130801687763715, + "acc_norm_stderr": 0.03145068600744859 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.42152466367713004, + "acc_stderr": 0.033141902221106564, + "acc_norm": 0.42152466367713004, + "acc_norm_stderr": 0.033141902221106564 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.29770992366412213, + "acc_stderr": 0.04010358942462202, + "acc_norm": 0.29770992366412213, + "acc_norm_stderr": 0.04010358942462202 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.49586776859504134, + "acc_stderr": 0.045641987674327526, + "acc_norm": 0.49586776859504134, + "acc_norm_stderr": 0.045641987674327526 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.32407407407407407, + "acc_stderr": 0.04524596007030048, + "acc_norm": 
0.32407407407407407, + "acc_norm_stderr": 0.04524596007030048 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.2147239263803681, + "acc_stderr": 0.03226219377286774, + "acc_norm": 0.2147239263803681, + "acc_norm_stderr": 0.03226219377286774 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.3392857142857143, + "acc_stderr": 0.04493949068613539, + "acc_norm": 0.3392857142857143, + "acc_norm_stderr": 0.04493949068613539 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.30097087378640774, + "acc_stderr": 0.04541609446503947, + "acc_norm": 0.30097087378640774, + "acc_norm_stderr": 0.04541609446503947 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.3803418803418803, + "acc_stderr": 0.031804252043840985, + "acc_norm": 0.3803418803418803, + "acc_norm_stderr": 0.031804252043840985 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.32, + "acc_stderr": 0.046882617226215034, + "acc_norm": 0.32, + "acc_norm_stderr": 0.046882617226215034 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.4342273307790549, + "acc_stderr": 0.01772458938967779, + "acc_norm": 0.4342273307790549, + "acc_norm_stderr": 0.01772458938967779 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.2774566473988439, + "acc_stderr": 0.024105712607754307, + "acc_norm": 0.2774566473988439, + "acc_norm_stderr": 0.024105712607754307 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.2245810055865922, + "acc_stderr": 0.01395680366654464, + "acc_norm": 0.2245810055865922, + "acc_norm_stderr": 0.01395680366654464 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.2908496732026144, + "acc_stderr": 0.026004800363952113, + "acc_norm": 0.2908496732026144, + "acc_norm_stderr": 0.026004800363952113 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.3762057877813505, + "acc_stderr": 0.027513925683549427, + "acc_norm": 0.3762057877813505, + "acc_norm_stderr": 0.027513925683549427 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.2839506172839506, + "acc_stderr": 0.025089478523765127, + "acc_norm": 0.2839506172839506, + "acc_norm_stderr": 0.025089478523765127 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.24468085106382978, + "acc_stderr": 0.02564555362226673, + "acc_norm": 0.24468085106382978, + "acc_norm_stderr": 0.02564555362226673 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.27183833116036504, + "acc_stderr": 0.011363135278651414, + "acc_norm": 0.27183833116036504, + "acc_norm_stderr": 0.011363135278651414 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.4264705882352941, + "acc_stderr": 0.030042615832714857, + "acc_norm": 0.4264705882352941, + "acc_norm_stderr": 0.030042615832714857 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.31209150326797386, + "acc_stderr": 0.01874501120127766, + "acc_norm": 0.31209150326797386, + "acc_norm_stderr": 0.01874501120127766 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.32727272727272727, + "acc_stderr": 0.0449429086625209, + "acc_norm": 0.32727272727272727, + "acc_norm_stderr": 0.0449429086625209 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.2693877551020408, + "acc_stderr": 0.02840125202902294, + "acc_norm": 0.2693877551020408, + "acc_norm_stderr": 0.02840125202902294 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.373134328358209, + "acc_stderr": 0.034198326081760065, + "acc_norm": 0.373134328358209, + "acc_norm_stderr": 0.034198326081760065 + }, + 
"harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.42, + "acc_stderr": 0.049604496374885836, + "acc_norm": 0.42, + "acc_norm_stderr": 0.049604496374885836 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.29518072289156627, + "acc_stderr": 0.03550920185689629, + "acc_norm": 0.29518072289156627, + "acc_norm_stderr": 0.03550920185689629 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.391812865497076, + "acc_stderr": 0.037439798259264016, + "acc_norm": 0.391812865497076, + "acc_norm_stderr": 0.037439798259264016 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.2558139534883721, + "mc1_stderr": 0.015274176219283352, + "mc2": 0.40595324855240134, + "mc2_stderr": 0.014913531416143539 + }, + "all": { + "acc": 0.3183234260279837, + "acc_stderr": 0.03345911148648646, + "acc_norm": 0.32162105858822604, + "acc_norm_stderr": 0.0334500159442395, + "mc1": 0.2558139534883721, + "mc1_stderr": 0.015274176219283352, + "mc2": 0.40595324855240134, + "mc2_stderr": 0.014913531416143539 + } + }, + "versions": { + "harness|arc:challenge|25": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + 
"harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "all": 0 + }, + "config_tasks": { + "harness|arc:challenge": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness 
task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task" + }, + "summary_tasks": { + "harness|arc:challenge|25": { + "hashes": { + "hash_examples": "17b0cae357c0259e", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "61571bf68d6d89aa", + "hash_cont_tokens": "e8abf848493b50f7" + }, + "truncated": 0, + "non-truncated": 4687, + "padded": 4687, + "non-padded": 0, + "effective_few_shots": 25.0, + "num_truncated_few_shots": 0 + }, + "harness|hellaswag|10": { + "hashes": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "29906669b1c7054a", + "hash_cont_tokens": "9fe0a5c42e1532db" + }, + "truncated": 0, + "non-truncated": 40168, + "padded": 40113, + "non-padded": 55, + "effective_few_shots": 10.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hashes": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "c54ff61ad0273dd7", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-anatomy|5": { + "hashes": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "be31a1e22aef5f90", + "hash_cont_tokens": "f11971a765cb609f" + }, + "truncated": 0, + "non-truncated": 540, + "padded": 540, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-astronomy|5": { + "hashes": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "277a7b1fad566940", + "hash_cont_tokens": "440a970fadecdc7b" + }, + "truncated": 0, + "non-truncated": 608, + "padded": 608, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-business_ethics|5": { + "hashes": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "ba552605bc116de5", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hashes": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "428c7563d0b98ab9", + "hash_cont_tokens": "7ecd60c25b9bfe5b" + }, + "truncated": 0, + "non-truncated": 1060, + "padded": 1060, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_biology|5": { + "hashes": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "da036601573942e2", 
+ "hash_cont_tokens": "875cde3af7a0ee14" + }, + "truncated": 0, + "non-truncated": 576, + "padded": 576, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_chemistry|5": { + "hashes": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "94e0196d6aded13d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_computer_science|5": { + "hashes": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "6e4d0f4a8d36690b", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_mathematics|5": { + "hashes": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "614054d17109a25d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_medicine|5": { + "hashes": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "1d633b3cc0524ba8", + "hash_cont_tokens": "702fb6d82ff0d6ac" + }, + "truncated": 0, + "non-truncated": 692, + "padded": 692, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_physics|5": { + "hashes": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "5421d9a1af86cbd4", + "hash_cont_tokens": "f7b8097afc16a47c" + }, + "truncated": 0, + "non-truncated": 408, + "padded": 408, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-computer_security|5": { + "hashes": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "5e6b70ecb333cf18", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hashes": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "c2ef11a87264ceed", + "hash_cont_tokens": "aa0e8bc655f2f641" + }, + "truncated": 0, + "non-truncated": 940, + "padded": 940, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-econometrics|5": { + "hashes": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "ecaccd912a4c3978", + "hash_cont_tokens": "b1cc6e7e9fcd3827" + }, + "truncated": 0, + "non-truncated": 456, + "padded": 456, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hashes": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "1590c84291399be8", + "hash_cont_tokens": "2425a3f084a591ef" + }, + "truncated": 0, + "non-truncated": 580, + "padded": 580, + "non-padded": 0, + 
"effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hashes": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "3269597f715b0da1", + "hash_cont_tokens": "bd87bf0c060fd925" + }, + "truncated": 0, + "non-truncated": 1512, + "padded": 1512, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-formal_logic|5": { + "hashes": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "a2800d20f3ab8d7c", + "hash_cont_tokens": "eb8932890e0605db" + }, + "truncated": 0, + "non-truncated": 504, + "padded": 504, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-global_facts|5": { + "hashes": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "94ed44b3772505ad", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_biology|5": { + "hashes": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "24423acb928db768", + "hash_cont_tokens": "1ddcb86d28cde266" + }, + "truncated": 0, + "non-truncated": 1240, + "padded": 1240, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hashes": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "831ff35c474e5cef", + "hash_cont_tokens": "176c8dcff38c5f8f" + }, + "truncated": 0, + "non-truncated": 812, + "padded": 812, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hashes": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "8c34e0f2bda77358", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hashes": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "f1f73dd687da18d7", + "hash_cont_tokens": "674fc454bdc5ac93" + }, + "truncated": 660, + "non-truncated": 0, + "padded": 0, + "non-padded": 660, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_geography|5": { + "hashes": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "7c5547c7da5bc793", + "hash_cont_tokens": "03a5012b916274ea" + }, + "truncated": 0, + "non-truncated": 792, + "padded": 792, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hashes": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "f62991cb6a496b05", + "hash_cont_tokens": "873d2aab226ba1d8" + }, + "truncated": 0, + "non-truncated": 772, + "padded": 772, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + 
"harness|hendrycksTest-high_school_macroeconomics|5": { + "hashes": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "4cef2aff6e3d59ed", + "hash_cont_tokens": "c583432ad27fcfe0" + }, + "truncated": 0, + "non-truncated": 1560, + "padded": 1560, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hashes": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "6e2577ea4082ed2b", + "hash_cont_tokens": "d7907b61bcb8c123" + }, + "truncated": 0, + "non-truncated": 1080, + "padded": 1080, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hashes": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "c5fc9aeb1079c8e4", + "hash_cont_tokens": "f47f041de50333b9" + }, + "truncated": 0, + "non-truncated": 952, + "padded": 952, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_physics|5": { + "hashes": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "555fc385cffa84ca", + "hash_cont_tokens": "0d56317b3e5eedb5" + }, + "truncated": 0, + "non-truncated": 604, + "padded": 604, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hashes": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "febd23cbf9973b7f", + "hash_cont_tokens": "09ba1243e7390c0f" + }, + "truncated": 0, + "non-truncated": 2180, + "padded": 2180, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hashes": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "424b02981230ee83", + "hash_cont_tokens": "9cc29889c3d3f77d" + }, + "truncated": 0, + "non-truncated": 864, + "padded": 864, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hashes": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "50c9ff438c85a69e", + "hash_cont_tokens": "cdd0b3dc06d933e5" + }, + "truncated": 816, + "non-truncated": 0, + "padded": 0, + "non-padded": 816, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hashes": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "054824cc474caef5", + "hash_cont_tokens": "e02816433ff28daf" + }, + "truncated": 8, + "non-truncated": 940, + "padded": 940, + "non-padded": 8, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_aging|5": { + "hashes": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "541a75f071dcf579", + "hash_cont_tokens": "142a4a8a1138a214" + }, + "truncated": 0, + "non-truncated": 892, + "padded": 892, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_sexuality|5": { + "hashes": { + "hash_examples": "7acb8fdad97f88a6", + 
"hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "04269e5c5a257dd9", + "hash_cont_tokens": "bc54813e809b796d" + }, + "truncated": 0, + "non-truncated": 524, + "padded": 524, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-international_law|5": { + "hashes": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "d93ba9d9d38e4397", + "hash_cont_tokens": "8ea8c5ff76a15bca" + }, + "truncated": 0, + "non-truncated": 484, + "padded": 484, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-jurisprudence|5": { + "hashes": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "9eeaccd2698b4f5a", + "hash_cont_tokens": "e3a8cd951b6e3469" + }, + "truncated": 0, + "non-truncated": 432, + "padded": 432, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hashes": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "b4f08f544f2b7576", + "hash_cont_tokens": "3e9e0bdc248fd88a" + }, + "truncated": 0, + "non-truncated": 652, + "padded": 648, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-machine_learning|5": { + "hashes": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "900c2a51f1174b9f", + "hash_cont_tokens": "55b12fb138c6a064" + }, + "truncated": 0, + "non-truncated": 448, + "padded": 448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-management|5": { + "hashes": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "3905c11f73465545", + "hash_cont_tokens": "a01d6d39a83c4597" + }, + "truncated": 0, + "non-truncated": 412, + "padded": 412, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-marketing|5": { + "hashes": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "2aaac78a0cfed47a", + "hash_cont_tokens": "6aeaed4d823c98aa" + }, + "truncated": 0, + "non-truncated": 936, + "padded": 936, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-medical_genetics|5": { + "hashes": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "886ca823b41c094a", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-miscellaneous|5": { + "hashes": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "72fd71de7675e7d0", + "hash_cont_tokens": "9b0ab02a64603081" + }, + "truncated": 0, + "non-truncated": 3132, + "padded": 3132, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_disputes|5": { + "hashes": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "f3ca0dd8e7a1eb09", + "hash_cont_tokens": "3b8bbe9108e55ce9" + }, + "truncated": 0, + "non-truncated": 1384, + 
"padded": 1354, + "non-padded": 30, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hashes": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "3e793631e951f23c", + "hash_cont_tokens": "3e9bfc0362e97330" + }, + "truncated": 0, + "non-truncated": 3580, + "padded": 3580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-nutrition|5": { + "hashes": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "59753c2144ea93af", + "hash_cont_tokens": "23b2dc6ee2da4cfc" + }, + "truncated": 0, + "non-truncated": 1224, + "padded": 1224, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-philosophy|5": { + "hashes": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "bd8d3dbed15a8c34", + "hash_cont_tokens": "9f6ff69d23a48783" + }, + "truncated": 0, + "non-truncated": 1244, + "padded": 1244, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-prehistory|5": { + "hashes": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "3573cd87facbb7c5", + "hash_cont_tokens": "d6458d743d875837" + }, + "truncated": 0, + "non-truncated": 1296, + "padded": 1296, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_accounting|5": { + "hashes": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "17e721bc1a7cbb47", + "hash_cont_tokens": "922a195f53a35662" + }, + "truncated": 0, + "non-truncated": 1128, + "padded": 1128, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_law|5": { + "hashes": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "4e494979b9c955ea", + "hash_cont_tokens": "2e590029ef41fbcd" + }, + "truncated": 604, + "non-truncated": 5532, + "padded": 5524, + "non-padded": 612, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_medicine|5": { + "hashes": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "f5a22012a54f70ea", + "hash_cont_tokens": "7cfee54dbddd5a98" + }, + "truncated": 0, + "non-truncated": 1088, + "padded": 1088, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_psychology|5": { + "hashes": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "0dfb73a8eb3f692c", + "hash_cont_tokens": "a86677b2a45c20e1" + }, + "truncated": 0, + "non-truncated": 2448, + "padded": 2448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-public_relations|5": { + "hashes": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "1710c6ba4c9f3cbd", + "hash_cont_tokens": "0d756ccaae031757" + }, + "truncated": 0, + "non-truncated": 440, + "padded": 440, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + 
"harness|hendrycksTest-security_studies|5": { + "hashes": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "d49711415961ced7", + "hash_cont_tokens": "b2229bc2cfbf594b" + }, + "truncated": 0, + "non-truncated": 980, + "padded": 980, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-sociology|5": { + "hashes": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "828999f7624cbe7e", + "hash_cont_tokens": "c3a3bdfd177eed5b" + }, + "truncated": 0, + "non-truncated": 804, + "padded": 804, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hashes": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "42054621e718dbee", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-virology|5": { + "hashes": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "6c4f0aa4dc859c04", + "hash_cont_tokens": "af8b3658088cb37f" + }, + "truncated": 0, + "non-truncated": 664, + "padded": 664, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-world_religions|5": { + "hashes": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "6c75d44e092ff24f", + "hash_cont_tokens": "060118bef6de4e0a" + }, + "truncated": 0, + "non-truncated": 684, + "padded": 684, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|truthfulqa:mc|0": { + "hashes": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "2738d7ed7075faa7", + "hash_cont_tokens": "f5da56a132aab151" + }, + "truncated": 0, + "non-truncated": 9996, + "padded": 9996, + "non-padded": 0, + "effective_few_shots": 0.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "d84d18e9a963753d", + "hash_full_prompts": "12b540783521a8e6", + "hash_input_tokens": "aa0e9c402d060087", + "hash_cont_tokens": "71d56183130fecbd" + }, + "total_evaluation_time_secondes": "4338.809689998627", + "truncated": 2088, + "non-truncated": 108931, + "padded": 108834, + "non-padded": 2185, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/TheBloke/BigTranslate-13B-GPTQ/results_2023-11-05T05-22-44.746843.json b/eval-results/TheBloke/BigTranslate-13B-GPTQ/results_2023-11-05T05-22-44.746843.json new file mode 100644 index 0000000000000000000000000000000000000000..eb4e3e37c3b0cdad06eb4008e85637fd6f2f5751 --- /dev/null +++ b/eval-results/TheBloke/BigTranslate-13B-GPTQ/results_2023-11-05T05-22-44.746843.json @@ -0,0 +1,107 @@ +{ + "config_general": { + "lighteval_sha": "167773f1d5d1647c60dadc31c9e731ab7dbcbbad", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "", + "model_name": "TheBloke/BigTranslate-13B-GPTQ", + "model_sha": "f2968552d2f522023f3289747234aea5508980e2", + "model_dtype": "torch.float16", + "model_size": "7.21 GB" + }, + "results": { + "harness|drop|3": { + "em": 0.17533557046979867, + "em_stderr": 
0.0038941555740368405, + "f1": 0.2203764681208055, + "f1_stderr": 0.003964834981357511 + }, + "harness|gsm8k|5": { + "acc": 0.0, + "acc_stderr": 0.0 + }, + "harness|winogrande|5": { + "acc": 0.7095501183898973, + "acc_stderr": 0.0127588134480646 + }, + "all": { + "em": 0.17533557046979867, + "em_stderr": 0.0038941555740368405, + "f1": 0.2203764681208055, + "f1_stderr": 0.003964834981357511, + "acc": 0.35477505919494867, + "acc_stderr": 0.0063794067240323 + } + }, + "versions": { + "all": 0, + "harness|drop|3": 1, + "harness|gsm8k|5": 0, + "harness|winogrande|5": 0 + }, + "config_tasks": { + "harness|drop": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|drop|3": { + "hashes": { + "hash_examples": "1d27416e8324e9a3", + "hash_full_prompts": "a5513ff9a741b385", + "hash_input_tokens": "4cb8f984afb75347", + "hash_cont_tokens": "b1c52ba4ffcc9553" + }, + "truncated": 1263, + "non_truncated": 8273, + "padded": 0, + "non_padded": 9536, + "effective_few_shots": 3.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "bda342e47b5099b2", + "hash_cont_tokens": "a0a680e3ad75758f" + }, + "truncated": 0, + "non_truncated": 1319, + "padded": 0, + "non_padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "c0bedf98cb040854", + "hash_cont_tokens": "f08975ad6f2d5864" + }, + "truncated": 0, + "non_truncated": 1267, + "padded": 2432, + "non_padded": 102, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "9b4d8993161e637d", + "hash_full_prompts": "08215e527b7e60a5", + "hash_input_tokens": "ac62dfeb9414e21a", + "hash_cont_tokens": "9cec69c7044130fd" + }, + "truncated": 1263, + "non_truncated": 10859, + "padded": 2432, + "non_padded": 10957, + "num_truncated_few_shots": 0, + "total_evaluation_time_secondes": 0 + } +} \ No newline at end of file diff --git a/eval-results/TheBloke/BigTranslate-13B-GPTQ/results_2023-11-07T08-36-33.722457.json b/eval-results/TheBloke/BigTranslate-13B-GPTQ/results_2023-11-07T08-36-33.722457.json new file mode 100644 index 0000000000000000000000000000000000000000..88e2a130507c24951eadb3522f32004a95e5b4a9 --- /dev/null +++ b/eval-results/TheBloke/BigTranslate-13B-GPTQ/results_2023-11-07T08-36-33.722457.json @@ -0,0 +1,107 @@ +{ + "config_general": { + "lighteval_sha": "167773f1d5d1647c60dadc31c9e731ab7dbcbbad", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "", + "model_name": "TheBloke/BigTranslate-13B-GPTQ", + "model_sha": "f2968552d2f522023f3289747234aea5508980e2", + "model_dtype": "torch.float16", + "model_size": "7.21 GB" + }, + "results": { + "harness|drop|3": { + "em": 0.1751258389261745, + "em_stderr": 0.0038923206966426068, + "f1": 0.22030725671140952, + "f1_stderr": 0.003963217826978471 + }, + "harness|gsm8k|5": { + "acc": 0.0, + "acc_stderr": 0.0 + }, + "harness|winogrande|5": { + "acc": 0.7095501183898973, + "acc_stderr": 0.0127588134480646 + }, + "all": { + "em": 0.1751258389261745, + "em_stderr": 0.0038923206966426068, + "f1": 0.22030725671140952, + "f1_stderr": 0.003963217826978471, + "acc": 0.35477505919494867, + "acc_stderr": 0.0063794067240323 
+ } + }, + "versions": { + "all": 0, + "harness|drop|3": 1, + "harness|gsm8k|5": 0, + "harness|winogrande|5": 0 + }, + "config_tasks": { + "harness|drop": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|drop|3": { + "hashes": { + "hash_examples": "1d27416e8324e9a3", + "hash_full_prompts": "a5513ff9a741b385", + "hash_input_tokens": "4cb8f984afb75347", + "hash_cont_tokens": "552e77f98f300380" + }, + "truncated": 1263, + "non_truncated": 8273, + "padded": 0, + "non_padded": 9536, + "effective_few_shots": 3.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "bda342e47b5099b2", + "hash_cont_tokens": "aebca38d680223cc" + }, + "truncated": 0, + "non_truncated": 1319, + "padded": 0, + "non_padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "c0bedf98cb040854", + "hash_cont_tokens": "f08975ad6f2d5864" + }, + "truncated": 0, + "non_truncated": 1267, + "padded": 2432, + "non_padded": 102, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "9b4d8993161e637d", + "hash_full_prompts": "08215e527b7e60a5", + "hash_input_tokens": "ac62dfeb9414e21a", + "hash_cont_tokens": "31ff35fb555cfa9a" + }, + "truncated": 1263, + "non_truncated": 10859, + "padded": 2432, + "non_padded": 10957, + "num_truncated_few_shots": 0, + "total_evaluation_time_secondes": 0 + } +} \ No newline at end of file diff --git a/eval-results/TheBloke/CAMEL-33B-Combined-Data-SuperHOT-8K-fp16/results_2023-08-01T15-49-11.962689.json b/eval-results/TheBloke/CAMEL-33B-Combined-Data-SuperHOT-8K-fp16/results_2023-08-01T15-49-11.962689.json new file mode 100644 index 0000000000000000000000000000000000000000..75d192fb02d86b496ced13c30b1ea32d714b9d95 --- /dev/null +++ b/eval-results/TheBloke/CAMEL-33B-Combined-Data-SuperHOT-8K-fp16/results_2023-08-01T15-49-11.962689.json @@ -0,0 +1,1365 @@ +{ + "results": { + "harness|arc:challenge|25": { + "acc": 0.22781569965870307, + "acc_stderr": 0.012256708602326916, + "acc_norm": 0.25853242320819114, + "acc_norm_stderr": 0.012794553754288679 + }, + "harness|hellaswag|10": { + "acc": 0.2765385381398128, + "acc_stderr": 0.004463721071319092, + "acc_norm": 0.3156741684923322, + "acc_norm_stderr": 0.004638339207348899 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.22, + "acc_stderr": 0.04163331998932268, + "acc_norm": 0.22, + "acc_norm_stderr": 0.04163331998932268 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.21481481481481482, + "acc_stderr": 0.035478541985608236, + "acc_norm": 0.21481481481481482, + "acc_norm_stderr": 0.035478541985608236 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.19736842105263158, + "acc_stderr": 0.03238981601699397, + "acc_norm": 0.19736842105263158, + "acc_norm_stderr": 0.03238981601699397 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.33, + "acc_stderr": 0.047258156262526045, + "acc_norm": 0.33, + "acc_norm_stderr": 0.047258156262526045 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.2037735849056604, + "acc_stderr": 0.0247907845017754, + "acc_norm": 0.2037735849056604, + "acc_norm_stderr": 0.0247907845017754 + }, + "harness|hendrycksTest-college_biology|5": 
{ + "acc": 0.25, + "acc_stderr": 0.03621034121889507, + "acc_norm": 0.25, + "acc_norm_stderr": 0.03621034121889507 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.22, + "acc_stderr": 0.04163331998932269, + "acc_norm": 0.22, + "acc_norm_stderr": 0.04163331998932269 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.26, + "acc_stderr": 0.0440844002276808, + "acc_norm": 0.26, + "acc_norm_stderr": 0.0440844002276808 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.21, + "acc_stderr": 0.040936018074033256, + "acc_norm": 0.21, + "acc_norm_stderr": 0.040936018074033256 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.2138728323699422, + "acc_stderr": 0.03126511206173044, + "acc_norm": 0.2138728323699422, + "acc_norm_stderr": 0.03126511206173044 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.21568627450980393, + "acc_stderr": 0.04092563958237654, + "acc_norm": 0.21568627450980393, + "acc_norm_stderr": 0.04092563958237654 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.33, + "acc_stderr": 0.04725815626252605, + "acc_norm": 0.33, + "acc_norm_stderr": 0.04725815626252605 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.26382978723404255, + "acc_stderr": 0.028809989854102973, + "acc_norm": 0.26382978723404255, + "acc_norm_stderr": 0.028809989854102973 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.23684210526315788, + "acc_stderr": 0.039994238792813365, + "acc_norm": 0.23684210526315788, + "acc_norm_stderr": 0.039994238792813365 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.2413793103448276, + "acc_stderr": 0.03565998174135302, + "acc_norm": 0.2413793103448276, + "acc_norm_stderr": 0.03565998174135302 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.20899470899470898, + "acc_stderr": 0.02094048156533486, + "acc_norm": 0.20899470899470898, + "acc_norm_stderr": 0.02094048156533486 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.2777777777777778, + "acc_stderr": 0.04006168083848876, + "acc_norm": 0.2777777777777778, + "acc_norm_stderr": 0.04006168083848876 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.22, + "acc_stderr": 0.04163331998932269, + "acc_norm": 0.22, + "acc_norm_stderr": 0.04163331998932269 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.1870967741935484, + "acc_stderr": 0.022185710092252266, + "acc_norm": 0.1870967741935484, + "acc_norm_stderr": 0.022185710092252266 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.15270935960591134, + "acc_stderr": 0.02530890453938063, + "acc_norm": 0.15270935960591134, + "acc_norm_stderr": 0.02530890453938063 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.27, + "acc_stderr": 0.04461960433384741, + "acc_norm": 0.27, + "acc_norm_stderr": 0.04461960433384741 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.22424242424242424, + "acc_stderr": 0.032568666616811015, + "acc_norm": 0.22424242424242424, + "acc_norm_stderr": 0.032568666616811015 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.17676767676767677, + "acc_stderr": 0.027178752639044915, + "acc_norm": 0.17676767676767677, + "acc_norm_stderr": 0.027178752639044915 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.20207253886010362, + "acc_stderr": 0.02897908979429673, + "acc_norm": 0.20207253886010362, + "acc_norm_stderr": 0.02897908979429673 + }, + 
"harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.2076923076923077, + "acc_stderr": 0.020567539567246797, + "acc_norm": 0.2076923076923077, + "acc_norm_stderr": 0.020567539567246797 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.24444444444444444, + "acc_stderr": 0.026202766534652148, + "acc_norm": 0.24444444444444444, + "acc_norm_stderr": 0.026202766534652148 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.19327731092436976, + "acc_stderr": 0.0256494702658892, + "acc_norm": 0.19327731092436976, + "acc_norm_stderr": 0.0256494702658892 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.1986754966887417, + "acc_stderr": 0.03257847384436776, + "acc_norm": 0.1986754966887417, + "acc_norm_stderr": 0.03257847384436776 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.1908256880733945, + "acc_stderr": 0.016847676400091112, + "acc_norm": 0.1908256880733945, + "acc_norm_stderr": 0.016847676400091112 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.16203703703703703, + "acc_stderr": 0.02513045365226846, + "acc_norm": 0.16203703703703703, + "acc_norm_stderr": 0.02513045365226846 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.24509803921568626, + "acc_stderr": 0.03019028245350195, + "acc_norm": 0.24509803921568626, + "acc_norm_stderr": 0.03019028245350195 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.26582278481012656, + "acc_stderr": 0.02875679962965834, + "acc_norm": 0.26582278481012656, + "acc_norm_stderr": 0.02875679962965834 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.30493273542600896, + "acc_stderr": 0.030898610882477515, + "acc_norm": 0.30493273542600896, + "acc_norm_stderr": 0.030898610882477515 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.25190839694656486, + "acc_stderr": 0.03807387116306086, + "acc_norm": 0.25190839694656486, + "acc_norm_stderr": 0.03807387116306086 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.256198347107438, + "acc_stderr": 0.03984979653302872, + "acc_norm": 0.256198347107438, + "acc_norm_stderr": 0.03984979653302872 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.26851851851851855, + "acc_stderr": 0.04284467968052192, + "acc_norm": 0.26851851851851855, + "acc_norm_stderr": 0.04284467968052192 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.22085889570552147, + "acc_stderr": 0.032591773927421776, + "acc_norm": 0.22085889570552147, + "acc_norm_stderr": 0.032591773927421776 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.3125, + "acc_stderr": 0.043994650575715215, + "acc_norm": 0.3125, + "acc_norm_stderr": 0.043994650575715215 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.18446601941747573, + "acc_stderr": 0.03840423627288276, + "acc_norm": 0.18446601941747573, + "acc_norm_stderr": 0.03840423627288276 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.27350427350427353, + "acc_stderr": 0.029202540153431163, + "acc_norm": 0.27350427350427353, + "acc_norm_stderr": 0.029202540153431163 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.3, + "acc_stderr": 0.046056618647183814, + "acc_norm": 0.3, + "acc_norm_stderr": 0.046056618647183814 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.25287356321839083, + "acc_stderr": 0.015543377313719681, + "acc_norm": 0.25287356321839083, + "acc_norm_stderr": 0.015543377313719681 + }, + "harness|hendrycksTest-moral_disputes|5": { 
+ "acc": 0.24855491329479767, + "acc_stderr": 0.023267528432100174, + "acc_norm": 0.24855491329479767, + "acc_norm_stderr": 0.023267528432100174 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.24804469273743016, + "acc_stderr": 0.014444157808261427, + "acc_norm": 0.24804469273743016, + "acc_norm_stderr": 0.014444157808261427 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.21568627450980393, + "acc_stderr": 0.02355083135199509, + "acc_norm": 0.21568627450980393, + "acc_norm_stderr": 0.02355083135199509 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.1832797427652733, + "acc_stderr": 0.02197419884826581, + "acc_norm": 0.1832797427652733, + "acc_norm_stderr": 0.02197419884826581 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.21296296296296297, + "acc_stderr": 0.0227797190887334, + "acc_norm": 0.21296296296296297, + "acc_norm_stderr": 0.0227797190887334 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.2872340425531915, + "acc_stderr": 0.026992199173064356, + "acc_norm": 0.2872340425531915, + "acc_norm_stderr": 0.026992199173064356 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.2457627118644068, + "acc_stderr": 0.010996156635142692, + "acc_norm": 0.2457627118644068, + "acc_norm_stderr": 0.010996156635142692 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.18382352941176472, + "acc_stderr": 0.023529242185193106, + "acc_norm": 0.18382352941176472, + "acc_norm_stderr": 0.023529242185193106 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.24673202614379086, + "acc_stderr": 0.017440820367402497, + "acc_norm": 0.24673202614379086, + "acc_norm_stderr": 0.017440820367402497 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.21818181818181817, + "acc_stderr": 0.03955932861795833, + "acc_norm": 0.21818181818181817, + "acc_norm_stderr": 0.03955932861795833 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.2571428571428571, + "acc_stderr": 0.027979823538744546, + "acc_norm": 0.2571428571428571, + "acc_norm_stderr": 0.027979823538744546 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.24378109452736318, + "acc_stderr": 0.03036049015401465, + "acc_norm": 0.24378109452736318, + "acc_norm_stderr": 0.03036049015401465 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.28, + "acc_stderr": 0.04512608598542129, + "acc_norm": 0.28, + "acc_norm_stderr": 0.04512608598542129 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.26506024096385544, + "acc_stderr": 0.03436024037944967, + "acc_norm": 0.26506024096385544, + "acc_norm_stderr": 0.03436024037944967 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.30409356725146197, + "acc_stderr": 0.03528211258245233, + "acc_norm": 0.30409356725146197, + "acc_norm_stderr": 0.03528211258245233 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.22766217870257038, + "mc1_stderr": 0.01467925503211107, + "mc2": 0.4812732167278406, + "mc2_stderr": 0.016794155618723362 + }, + "all": { + "acc": 0.2373823213860445, + "acc_stderr": 0.0310093391404882, + "acc_norm": 0.23856625958777344, + "acc_norm_stderr": 0.03102141478943721, + "mc1": 0.22766217870257038, + "mc1_stderr": 0.01467925503211107, + "mc2": 0.4812732167278406, + "mc2_stderr": 0.016794155618723362 + } + }, + "versions": { + "harness|arc:challenge|25": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + 
"harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "all": 0 + }, + "config_general": { + "model_name": "TheBloke/CAMEL-33B-Combined-Data-SuperHOT-8K-fp16", + "model_sha": "14744d11eab7028c5c845f89db2edc9c6fe2becb", + "model_dtype": "torch.float16", + "lighteval_sha": "03c2fad20ff7f5334c33cfee459024b8d7e4a109", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null + }, + "config_tasks": { + "harness|arc:challenge": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + 
"harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task" + }, + "summary_tasks": { + "harness|arc:challenge|25": { + "hashes": { + "hash_examples": "17b0cae357c0259e", + "hash_full_prompts": 
"045cbb916e5145c6", + "hash_input_tokens": "3722289b79076c44", + "hash_cont_tokens": "8210decc6ff6f7df" + }, + "truncated": 0, + "non-truncated": 4687, + "padded": 4687, + "non-padded": 0, + "effective_few_shots": 25.0, + "num_truncated_few_shots": 0 + }, + "harness|hellaswag|10": { + "hashes": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "ececd684171f1ef2", + "hash_cont_tokens": "b3b9e9017afa63af" + }, + "truncated": 0, + "non-truncated": 40168, + "padded": 40113, + "non-padded": 55, + "effective_few_shots": 10.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hashes": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "c54ff61ad0273dd7", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-anatomy|5": { + "hashes": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "be31a1e22aef5f90", + "hash_cont_tokens": "f11971a765cb609f" + }, + "truncated": 0, + "non-truncated": 540, + "padded": 540, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-astronomy|5": { + "hashes": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "277a7b1fad566940", + "hash_cont_tokens": "bf30e5d3f48250cb" + }, + "truncated": 0, + "non-truncated": 608, + "padded": 608, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-business_ethics|5": { + "hashes": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "ba552605bc116de5", + "hash_cont_tokens": "bc1dd9b2d995eb61" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hashes": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "428c7563d0b98ab9", + "hash_cont_tokens": "890a119624b3b935" + }, + "truncated": 0, + "non-truncated": 1060, + "padded": 1060, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_biology|5": { + "hashes": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "da036601573942e2", + "hash_cont_tokens": "875cde3af7a0ee14" + }, + "truncated": 0, + "non-truncated": 576, + "padded": 576, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_chemistry|5": { + "hashes": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "94e0196d6aded13d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_computer_science|5": { + "hashes": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "6e4d0f4a8d36690b", + "hash_cont_tokens": "ffc0fe414cdc4a83" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + 
"non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_mathematics|5": { + "hashes": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "614054d17109a25d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_medicine|5": { + "hashes": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "081bb2b524defd1c", + "hash_cont_tokens": "1f88b00d41957d82" + }, + "truncated": 0, + "non-truncated": 692, + "padded": 692, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_physics|5": { + "hashes": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "5421d9a1af86cbd4", + "hash_cont_tokens": "f7b8097afc16a47c" + }, + "truncated": 0, + "non-truncated": 408, + "padded": 408, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-computer_security|5": { + "hashes": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "5e6b70ecb333cf18", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hashes": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "c2ef11a87264ceed", + "hash_cont_tokens": "aa0e8bc655f2f641" + }, + "truncated": 0, + "non-truncated": 940, + "padded": 940, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-econometrics|5": { + "hashes": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "ecaccd912a4c3978", + "hash_cont_tokens": "bfb7e3c3c88313f1" + }, + "truncated": 0, + "non-truncated": 456, + "padded": 456, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hashes": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "1590c84291399be8", + "hash_cont_tokens": "2425a3f084a591ef" + }, + "truncated": 0, + "non-truncated": 580, + "padded": 580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hashes": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "3269597f715b0da1", + "hash_cont_tokens": "f52691aef15a407b" + }, + "truncated": 0, + "non-truncated": 1512, + "padded": 1512, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-formal_logic|5": { + "hashes": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "a2800d20f3ab8d7c", + "hash_cont_tokens": "f515d598d9c21263" + }, + "truncated": 0, + "non-truncated": 504, + "padded": 504, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-global_facts|5": { + "hashes": { + 
"hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "94ed44b3772505ad", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_biology|5": { + "hashes": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "24423acb928db768", + "hash_cont_tokens": "bd85a4156a3613ee" + }, + "truncated": 0, + "non-truncated": 1240, + "padded": 1240, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hashes": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "831ff35c474e5cef", + "hash_cont_tokens": "a95c97af1c14e068" + }, + "truncated": 0, + "non-truncated": 812, + "padded": 812, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hashes": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "a20a96b44dcc5b30", + "hash_cont_tokens": "8abfedef914e33c9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hashes": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "5002f4ac8b1562ca", + "hash_cont_tokens": "674fc454bdc5ac93" + }, + "truncated": 0, + "non-truncated": 660, + "padded": 656, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_geography|5": { + "hashes": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "7c5547c7da5bc793", + "hash_cont_tokens": "03a5012b916274ea" + }, + "truncated": 0, + "non-truncated": 792, + "padded": 792, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hashes": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "f62991cb6a496b05", + "hash_cont_tokens": "a83effb8f76b7d7c" + }, + "truncated": 0, + "non-truncated": 772, + "padded": 772, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hashes": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "4cef2aff6e3d59ed", + "hash_cont_tokens": "c583432ad27fcfe0" + }, + "truncated": 0, + "non-truncated": 1560, + "padded": 1560, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hashes": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "6e2577ea4082ed2b", + "hash_cont_tokens": "24f5dc613660300b" + }, + "truncated": 0, + "non-truncated": 1080, + "padded": 1080, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hashes": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": 
"bfc393381298609e", + "hash_input_tokens": "c5fc9aeb1079c8e4", + "hash_cont_tokens": "f47f041de50333b9" + }, + "truncated": 0, + "non-truncated": 952, + "padded": 952, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_physics|5": { + "hashes": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "555fc385cffa84ca", + "hash_cont_tokens": "ba2efcd283e938cc" + }, + "truncated": 0, + "non-truncated": 604, + "padded": 604, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hashes": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "febd23cbf9973b7f", + "hash_cont_tokens": "942069cd363844d9" + }, + "truncated": 0, + "non-truncated": 2180, + "padded": 2180, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hashes": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "400e55b56ee6fbd7", + "hash_cont_tokens": "955ed42b6f7fa019" + }, + "truncated": 0, + "non-truncated": 864, + "padded": 864, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hashes": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "c639cce12a46ebad", + "hash_cont_tokens": "cdd0b3dc06d933e5" + }, + "truncated": 0, + "non-truncated": 816, + "padded": 816, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hashes": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "b9762065cce6f3a6", + "hash_cont_tokens": "9a864184946033ac" + }, + "truncated": 0, + "non-truncated": 948, + "padded": 948, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_aging|5": { + "hashes": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "541a75f071dcf579", + "hash_cont_tokens": "142a4a8a1138a214" + }, + "truncated": 0, + "non-truncated": 892, + "padded": 892, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_sexuality|5": { + "hashes": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "04269e5c5a257dd9", + "hash_cont_tokens": "bc54813e809b796d" + }, + "truncated": 0, + "non-truncated": 524, + "padded": 524, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-international_law|5": { + "hashes": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "d93ba9d9d38e4397", + "hash_cont_tokens": "dc45b45fcda18e5d" + }, + "truncated": 0, + "non-truncated": 484, + "padded": 484, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-jurisprudence|5": { + "hashes": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "9eeaccd2698b4f5a", + "hash_cont_tokens": "e3a8cd951b6e3469" + }, + "truncated": 0, + 
"non-truncated": 432, + "padded": 432, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hashes": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "b4f08f544f2b7576", + "hash_cont_tokens": "1e80dbd30f6453d5" + }, + "truncated": 0, + "non-truncated": 652, + "padded": 648, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-machine_learning|5": { + "hashes": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "900c2a51f1174b9f", + "hash_cont_tokens": "9b37da7777378ca9" + }, + "truncated": 0, + "non-truncated": 448, + "padded": 448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-management|5": { + "hashes": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "6b36efb4689c6eca", + "hash_cont_tokens": "a01d6d39a83c4597" + }, + "truncated": 0, + "non-truncated": 412, + "padded": 412, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-marketing|5": { + "hashes": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "2aaac78a0cfed47a", + "hash_cont_tokens": "6aeaed4d823c98aa" + }, + "truncated": 0, + "non-truncated": 936, + "padded": 936, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-medical_genetics|5": { + "hashes": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "886ca823b41c094a", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-miscellaneous|5": { + "hashes": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "72fd71de7675e7d0", + "hash_cont_tokens": "9b0ab02a64603081" + }, + "truncated": 0, + "non-truncated": 3132, + "padded": 3132, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_disputes|5": { + "hashes": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "f3ca0dd8e7a1eb09", + "hash_cont_tokens": "8badf768f7b0467a" + }, + "truncated": 0, + "non-truncated": 1384, + "padded": 1354, + "non-padded": 30, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hashes": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "3e793631e951f23c", + "hash_cont_tokens": "32ae620376b2bbba" + }, + "truncated": 0, + "non-truncated": 3580, + "padded": 3580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-nutrition|5": { + "hashes": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "59753c2144ea93af", + "hash_cont_tokens": "3071def75bacc404" + }, + "truncated": 0, + "non-truncated": 1224, + "padded": 1224, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-philosophy|5": { + "hashes": 
{ + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "bd8d3dbed15a8c34", + "hash_cont_tokens": "9f6ff69d23a48783" + }, + "truncated": 0, + "non-truncated": 1244, + "padded": 1244, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-prehistory|5": { + "hashes": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "3573cd87facbb7c5", + "hash_cont_tokens": "de469d2b981e32a3" + }, + "truncated": 0, + "non-truncated": 1296, + "padded": 1296, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_accounting|5": { + "hashes": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "17e721bc1a7cbb47", + "hash_cont_tokens": "c46f74d2dfc7b13b" + }, + "truncated": 0, + "non-truncated": 1128, + "padded": 1128, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_law|5": { + "hashes": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "c9f7583fff66d361", + "hash_cont_tokens": "2e590029ef41fbcd" + }, + "truncated": 0, + "non-truncated": 6136, + "padded": 6136, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_medicine|5": { + "hashes": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "40a933f829116f8d", + "hash_cont_tokens": "fe35cfa9c6ca802e" + }, + "truncated": 0, + "non-truncated": 1088, + "padded": 1088, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_psychology|5": { + "hashes": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "0dfb73a8eb3f692c", + "hash_cont_tokens": "f020fbddf72c8652" + }, + "truncated": 0, + "non-truncated": 2448, + "padded": 2448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-public_relations|5": { + "hashes": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "1710c6ba4c9f3cbd", + "hash_cont_tokens": "568f585a259965c1" + }, + "truncated": 0, + "non-truncated": 440, + "padded": 440, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-security_studies|5": { + "hashes": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "32a03f1f22a6e103", + "hash_cont_tokens": "cc6fd7cccd64cd5d" + }, + "truncated": 0, + "non-truncated": 980, + "padded": 980, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-sociology|5": { + "hashes": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "828999f7624cbe7e", + "hash_cont_tokens": "c3a3bdfd177eed5b" + }, + "truncated": 0, + "non-truncated": 804, + "padded": 804, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hashes": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "42054621e718dbee", + 
"hash_cont_tokens": "2568d0e8e36fa959" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-virology|5": { + "hashes": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "6c4f0aa4dc859c04", + "hash_cont_tokens": "926cf60b0891f374" + }, + "truncated": 0, + "non-truncated": 664, + "padded": 664, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-world_religions|5": { + "hashes": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "6c75d44e092ff24f", + "hash_cont_tokens": "c525a5de974c1ea3" + }, + "truncated": 0, + "non-truncated": 684, + "padded": 684, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|truthfulqa:mc|0": { + "hashes": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "2738d7ed7075faa7", + "hash_cont_tokens": "c014154380b74b9e" + }, + "truncated": 0, + "non-truncated": 9996, + "padded": 9996, + "non-padded": 0, + "effective_few_shots": 0.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "d84d18e9a963753d", + "hash_full_prompts": "12b540783521a8e6", + "hash_input_tokens": "5c73a7dce6ccf737", + "hash_cont_tokens": "fb1646e2bdd5fc38" + }, + "total_evaluation_time_secondes": "13040.743820905685", + "truncated": 0, + "non-truncated": 111019, + "padded": 110926, + "non-padded": 93, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/TheBloke/Chinese-Alpaca-33B-SuperHOT-8K-fp16/results_2023-07-31T19-21-09.032023.json b/eval-results/TheBloke/Chinese-Alpaca-33B-SuperHOT-8K-fp16/results_2023-07-31T19-21-09.032023.json new file mode 100644 index 0000000000000000000000000000000000000000..a429a3450e8019974ccd92bdc4d69b674f9d0813 --- /dev/null +++ b/eval-results/TheBloke/Chinese-Alpaca-33B-SuperHOT-8K-fp16/results_2023-07-31T19-21-09.032023.json @@ -0,0 +1,1365 @@ +{ + "results": { + "harness|arc:challenge|25": { + "acc": 0.2175767918088737, + "acc_stderr": 0.0120572620209725, + "acc_norm": 0.26791808873720135, + "acc_norm_stderr": 0.012942030195136426 + }, + "harness|hellaswag|10": { + "acc": 0.26926906990639315, + "acc_stderr": 0.004426734718808876, + "acc_norm": 0.29555865365465045, + "acc_norm_stderr": 0.004553609405747228 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.22, + "acc_stderr": 0.04163331998932268, + "acc_norm": 0.22, + "acc_norm_stderr": 0.04163331998932268 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.2222222222222222, + "acc_stderr": 0.035914440841969694, + "acc_norm": 0.2222222222222222, + "acc_norm_stderr": 0.035914440841969694 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.17763157894736842, + "acc_stderr": 0.031103182383123398, + "acc_norm": 0.17763157894736842, + "acc_norm_stderr": 0.031103182383123398 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.31, + "acc_stderr": 0.04648231987117316, + "acc_norm": 0.31, + "acc_norm_stderr": 0.04648231987117316 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.2188679245283019, + "acc_stderr": 0.025447863825108608, + "acc_norm": 0.2188679245283019, + "acc_norm_stderr": 0.025447863825108608 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.2222222222222222, + "acc_stderr": 
0.03476590104304134, + "acc_norm": 0.2222222222222222, + "acc_norm_stderr": 0.03476590104304134 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.2, + "acc_stderr": 0.04020151261036845, + "acc_norm": 0.2, + "acc_norm_stderr": 0.04020151261036845 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.15, + "acc_stderr": 0.03588702812826372, + "acc_norm": 0.15, + "acc_norm_stderr": 0.03588702812826372 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.23, + "acc_stderr": 0.04229525846816506, + "acc_norm": 0.23, + "acc_norm_stderr": 0.04229525846816506 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.20809248554913296, + "acc_stderr": 0.030952890217749874, + "acc_norm": 0.20809248554913296, + "acc_norm_stderr": 0.030952890217749874 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.21568627450980393, + "acc_stderr": 0.04092563958237654, + "acc_norm": 0.21568627450980393, + "acc_norm_stderr": 0.04092563958237654 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.24, + "acc_stderr": 0.04292346959909283, + "acc_norm": 0.24, + "acc_norm_stderr": 0.04292346959909283 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.26382978723404255, + "acc_stderr": 0.028809989854102973, + "acc_norm": 0.26382978723404255, + "acc_norm_stderr": 0.028809989854102973 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.2631578947368421, + "acc_stderr": 0.0414243971948936, + "acc_norm": 0.2631578947368421, + "acc_norm_stderr": 0.0414243971948936 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.2413793103448276, + "acc_stderr": 0.03565998174135302, + "acc_norm": 0.2413793103448276, + "acc_norm_stderr": 0.03565998174135302 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.2275132275132275, + "acc_stderr": 0.021591269407823778, + "acc_norm": 0.2275132275132275, + "acc_norm_stderr": 0.021591269407823778 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.2222222222222222, + "acc_stderr": 0.037184890068181146, + "acc_norm": 0.2222222222222222, + "acc_norm_stderr": 0.037184890068181146 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.18, + "acc_stderr": 0.038612291966536934, + "acc_norm": 0.18, + "acc_norm_stderr": 0.038612291966536934 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.25161290322580643, + "acc_stderr": 0.02468597928623997, + "acc_norm": 0.25161290322580643, + "acc_norm_stderr": 0.02468597928623997 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.23645320197044334, + "acc_stderr": 0.029896114291733552, + "acc_norm": 0.23645320197044334, + "acc_norm_stderr": 0.029896114291733552 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.19, + "acc_stderr": 0.039427724440366234, + "acc_norm": 0.19, + "acc_norm_stderr": 0.039427724440366234 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.2727272727272727, + "acc_stderr": 0.0347769116216366, + "acc_norm": 0.2727272727272727, + "acc_norm_stderr": 0.0347769116216366 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.21212121212121213, + "acc_stderr": 0.029126522834586804, + "acc_norm": 0.21212121212121213, + "acc_norm_stderr": 0.029126522834586804 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.21761658031088082, + "acc_stderr": 0.02977866303775296, + "acc_norm": 0.21761658031088082, + "acc_norm_stderr": 0.02977866303775296 + }, + 
"harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.2128205128205128, + "acc_stderr": 0.020752423722128013, + "acc_norm": 0.2128205128205128, + "acc_norm_stderr": 0.020752423722128013 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.26666666666666666, + "acc_stderr": 0.026962424325073828, + "acc_norm": 0.26666666666666666, + "acc_norm_stderr": 0.026962424325073828 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.21008403361344538, + "acc_stderr": 0.026461398717471874, + "acc_norm": 0.21008403361344538, + "acc_norm_stderr": 0.026461398717471874 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.1986754966887417, + "acc_stderr": 0.03257847384436776, + "acc_norm": 0.1986754966887417, + "acc_norm_stderr": 0.03257847384436776 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.1926605504587156, + "acc_stderr": 0.016909276884936087, + "acc_norm": 0.1926605504587156, + "acc_norm_stderr": 0.016909276884936087 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.4722222222222222, + "acc_stderr": 0.0340470532865388, + "acc_norm": 0.4722222222222222, + "acc_norm_stderr": 0.0340470532865388 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.2549019607843137, + "acc_stderr": 0.03058759135160425, + "acc_norm": 0.2549019607843137, + "acc_norm_stderr": 0.03058759135160425 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.28270042194092826, + "acc_stderr": 0.02931281415395592, + "acc_norm": 0.28270042194092826, + "acc_norm_stderr": 0.02931281415395592 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.28699551569506726, + "acc_stderr": 0.030360379710291947, + "acc_norm": 0.28699551569506726, + "acc_norm_stderr": 0.030360379710291947 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.2595419847328244, + "acc_stderr": 0.03844876139785271, + "acc_norm": 0.2595419847328244, + "acc_norm_stderr": 0.03844876139785271 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.2396694214876033, + "acc_stderr": 0.03896878985070417, + "acc_norm": 0.2396694214876033, + "acc_norm_stderr": 0.03896878985070417 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.25, + "acc_stderr": 0.04186091791394607, + "acc_norm": 0.25, + "acc_norm_stderr": 0.04186091791394607 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.22085889570552147, + "acc_stderr": 0.032591773927421776, + "acc_norm": 0.22085889570552147, + "acc_norm_stderr": 0.032591773927421776 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.3125, + "acc_stderr": 0.043994650575715215, + "acc_norm": 0.3125, + "acc_norm_stderr": 0.043994650575715215 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.17475728155339806, + "acc_stderr": 0.037601780060266224, + "acc_norm": 0.17475728155339806, + "acc_norm_stderr": 0.037601780060266224 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.29914529914529914, + "acc_stderr": 0.029996951858349497, + "acc_norm": 0.29914529914529914, + "acc_norm_stderr": 0.029996951858349497 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.3, + "acc_stderr": 0.046056618647183814, + "acc_norm": 0.3, + "acc_norm_stderr": 0.046056618647183814 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.26947637292464877, + "acc_stderr": 0.01586624307321506, + "acc_norm": 0.26947637292464877, + "acc_norm_stderr": 0.01586624307321506 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.26878612716763006, + 
"acc_stderr": 0.023868003262500114, + "acc_norm": 0.26878612716763006, + "acc_norm_stderr": 0.023868003262500114 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.23798882681564246, + "acc_stderr": 0.014242630070574915, + "acc_norm": 0.23798882681564246, + "acc_norm_stderr": 0.014242630070574915 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.21568627450980393, + "acc_stderr": 0.02355083135199509, + "acc_norm": 0.21568627450980393, + "acc_norm_stderr": 0.02355083135199509 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.1864951768488746, + "acc_stderr": 0.02212243977248077, + "acc_norm": 0.1864951768488746, + "acc_norm_stderr": 0.02212243977248077 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.21604938271604937, + "acc_stderr": 0.022899162918445806, + "acc_norm": 0.21604938271604937, + "acc_norm_stderr": 0.022899162918445806 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.22340425531914893, + "acc_stderr": 0.024847921358063962, + "acc_norm": 0.22340425531914893, + "acc_norm_stderr": 0.024847921358063962 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.2457627118644068, + "acc_stderr": 0.010996156635142692, + "acc_norm": 0.2457627118644068, + "acc_norm_stderr": 0.010996156635142692 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.20588235294117646, + "acc_stderr": 0.024562204314142314, + "acc_norm": 0.20588235294117646, + "acc_norm_stderr": 0.024562204314142314 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.25, + "acc_stderr": 0.01751781884501444, + "acc_norm": 0.25, + "acc_norm_stderr": 0.01751781884501444 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.21818181818181817, + "acc_stderr": 0.03955932861795833, + "acc_norm": 0.21818181818181817, + "acc_norm_stderr": 0.03955932861795833 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.20408163265306123, + "acc_stderr": 0.02580128347509051, + "acc_norm": 0.20408163265306123, + "acc_norm_stderr": 0.02580128347509051 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.24378109452736318, + "acc_stderr": 0.03036049015401465, + "acc_norm": 0.24378109452736318, + "acc_norm_stderr": 0.03036049015401465 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.32, + "acc_stderr": 0.04688261722621504, + "acc_norm": 0.32, + "acc_norm_stderr": 0.04688261722621504 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.26506024096385544, + "acc_stderr": 0.03436024037944967, + "acc_norm": 0.26506024096385544, + "acc_norm_stderr": 0.03436024037944967 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.3216374269005848, + "acc_stderr": 0.03582529442573122, + "acc_norm": 0.3216374269005848, + "acc_norm_stderr": 0.03582529442573122 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.23378212974296206, + "mc1_stderr": 0.014816195991931588, + "mc2": 0.4774590793334822, + "mc2_stderr": 0.01691343346185639 + }, + "all": { + "acc": 0.24079112101610886, + "acc_stderr": 0.030961801782247226, + "acc_norm": 0.24208994950215265, + "acc_norm_stderr": 0.03097894827141845, + "mc1": 0.23378212974296206, + "mc1_stderr": 0.014816195991931588, + "mc2": 0.4774590793334822, + "mc2_stderr": 0.01691343346185639 + } + }, + "versions": { + "harness|arc:challenge|25": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + 
"harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "all": 0 + }, + "config_general": { + "model_name": "TheBloke/Chinese-Alpaca-33B-SuperHOT-8K-fp16", + "model_sha": "a55ce761bace8be6d17c357c57ef927751afd40c", + "model_dtype": "torch.float16", + "lighteval_sha": "03c2fad20ff7f5334c33cfee459024b8d7e4a109", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null + }, + "config_tasks": { + "harness|arc:challenge": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + 
"harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task" + }, + "summary_tasks": { + "harness|arc:challenge|25": { + "hashes": { + "hash_examples": "17b0cae357c0259e", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "3722289b79076c44", + 
"hash_cont_tokens": "8210decc6ff6f7df" + }, + "truncated": 0, + "non-truncated": 4687, + "padded": 4687, + "non-padded": 0, + "effective_few_shots": 25.0, + "num_truncated_few_shots": 0 + }, + "harness|hellaswag|10": { + "hashes": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "ececd684171f1ef2", + "hash_cont_tokens": "b3b9e9017afa63af" + }, + "truncated": 0, + "non-truncated": 40168, + "padded": 40113, + "non-padded": 55, + "effective_few_shots": 10.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hashes": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "c54ff61ad0273dd7", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-anatomy|5": { + "hashes": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "be31a1e22aef5f90", + "hash_cont_tokens": "f11971a765cb609f" + }, + "truncated": 0, + "non-truncated": 540, + "padded": 540, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-astronomy|5": { + "hashes": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "277a7b1fad566940", + "hash_cont_tokens": "bf30e5d3f48250cb" + }, + "truncated": 0, + "non-truncated": 608, + "padded": 608, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-business_ethics|5": { + "hashes": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "ba552605bc116de5", + "hash_cont_tokens": "bc1dd9b2d995eb61" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hashes": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "428c7563d0b98ab9", + "hash_cont_tokens": "890a119624b3b935" + }, + "truncated": 0, + "non-truncated": 1060, + "padded": 1060, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_biology|5": { + "hashes": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "da036601573942e2", + "hash_cont_tokens": "875cde3af7a0ee14" + }, + "truncated": 0, + "non-truncated": 576, + "padded": 576, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_chemistry|5": { + "hashes": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "94e0196d6aded13d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_computer_science|5": { + "hashes": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "6e4d0f4a8d36690b", + "hash_cont_tokens": "ffc0fe414cdc4a83" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + 
"num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_mathematics|5": { + "hashes": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "614054d17109a25d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_medicine|5": { + "hashes": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "081bb2b524defd1c", + "hash_cont_tokens": "1f88b00d41957d82" + }, + "truncated": 0, + "non-truncated": 692, + "padded": 692, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_physics|5": { + "hashes": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "5421d9a1af86cbd4", + "hash_cont_tokens": "f7b8097afc16a47c" + }, + "truncated": 0, + "non-truncated": 408, + "padded": 408, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-computer_security|5": { + "hashes": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "5e6b70ecb333cf18", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hashes": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "c2ef11a87264ceed", + "hash_cont_tokens": "aa0e8bc655f2f641" + }, + "truncated": 0, + "non-truncated": 940, + "padded": 940, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-econometrics|5": { + "hashes": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "ecaccd912a4c3978", + "hash_cont_tokens": "bfb7e3c3c88313f1" + }, + "truncated": 0, + "non-truncated": 456, + "padded": 456, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hashes": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "1590c84291399be8", + "hash_cont_tokens": "2425a3f084a591ef" + }, + "truncated": 0, + "non-truncated": 580, + "padded": 580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hashes": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "3269597f715b0da1", + "hash_cont_tokens": "f52691aef15a407b" + }, + "truncated": 0, + "non-truncated": 1512, + "padded": 1512, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-formal_logic|5": { + "hashes": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "a2800d20f3ab8d7c", + "hash_cont_tokens": "f515d598d9c21263" + }, + "truncated": 0, + "non-truncated": 504, + "padded": 504, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-global_facts|5": { + "hashes": { + "hash_examples": "371d70d743b2b89b", + 
"hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "94ed44b3772505ad", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_biology|5": { + "hashes": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "24423acb928db768", + "hash_cont_tokens": "bd85a4156a3613ee" + }, + "truncated": 0, + "non-truncated": 1240, + "padded": 1240, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hashes": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "831ff35c474e5cef", + "hash_cont_tokens": "a95c97af1c14e068" + }, + "truncated": 0, + "non-truncated": 812, + "padded": 812, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hashes": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "a20a96b44dcc5b30", + "hash_cont_tokens": "8abfedef914e33c9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hashes": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "5002f4ac8b1562ca", + "hash_cont_tokens": "674fc454bdc5ac93" + }, + "truncated": 0, + "non-truncated": 660, + "padded": 656, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_geography|5": { + "hashes": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "7c5547c7da5bc793", + "hash_cont_tokens": "03a5012b916274ea" + }, + "truncated": 0, + "non-truncated": 792, + "padded": 792, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hashes": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "f62991cb6a496b05", + "hash_cont_tokens": "a83effb8f76b7d7c" + }, + "truncated": 0, + "non-truncated": 772, + "padded": 772, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hashes": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "4cef2aff6e3d59ed", + "hash_cont_tokens": "c583432ad27fcfe0" + }, + "truncated": 0, + "non-truncated": 1560, + "padded": 1560, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hashes": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "6e2577ea4082ed2b", + "hash_cont_tokens": "24f5dc613660300b" + }, + "truncated": 0, + "non-truncated": 1080, + "padded": 1080, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hashes": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": 
"c5fc9aeb1079c8e4", + "hash_cont_tokens": "f47f041de50333b9" + }, + "truncated": 0, + "non-truncated": 952, + "padded": 952, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_physics|5": { + "hashes": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "555fc385cffa84ca", + "hash_cont_tokens": "ba2efcd283e938cc" + }, + "truncated": 0, + "non-truncated": 604, + "padded": 604, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hashes": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "febd23cbf9973b7f", + "hash_cont_tokens": "942069cd363844d9" + }, + "truncated": 0, + "non-truncated": 2180, + "padded": 2180, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hashes": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "400e55b56ee6fbd7", + "hash_cont_tokens": "955ed42b6f7fa019" + }, + "truncated": 0, + "non-truncated": 864, + "padded": 864, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hashes": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "c639cce12a46ebad", + "hash_cont_tokens": "cdd0b3dc06d933e5" + }, + "truncated": 0, + "non-truncated": 816, + "padded": 816, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hashes": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "b9762065cce6f3a6", + "hash_cont_tokens": "9a864184946033ac" + }, + "truncated": 0, + "non-truncated": 948, + "padded": 948, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_aging|5": { + "hashes": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "541a75f071dcf579", + "hash_cont_tokens": "142a4a8a1138a214" + }, + "truncated": 0, + "non-truncated": 892, + "padded": 892, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_sexuality|5": { + "hashes": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "04269e5c5a257dd9", + "hash_cont_tokens": "bc54813e809b796d" + }, + "truncated": 0, + "non-truncated": 524, + "padded": 524, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-international_law|5": { + "hashes": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "d93ba9d9d38e4397", + "hash_cont_tokens": "dc45b45fcda18e5d" + }, + "truncated": 0, + "non-truncated": 484, + "padded": 484, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-jurisprudence|5": { + "hashes": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "9eeaccd2698b4f5a", + "hash_cont_tokens": "e3a8cd951b6e3469" + }, + "truncated": 0, + "non-truncated": 432, + "padded": 432, + 
"non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hashes": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "b4f08f544f2b7576", + "hash_cont_tokens": "1e80dbd30f6453d5" + }, + "truncated": 0, + "non-truncated": 652, + "padded": 648, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-machine_learning|5": { + "hashes": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "900c2a51f1174b9f", + "hash_cont_tokens": "9b37da7777378ca9" + }, + "truncated": 0, + "non-truncated": 448, + "padded": 448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-management|5": { + "hashes": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "6b36efb4689c6eca", + "hash_cont_tokens": "a01d6d39a83c4597" + }, + "truncated": 0, + "non-truncated": 412, + "padded": 412, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-marketing|5": { + "hashes": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "2aaac78a0cfed47a", + "hash_cont_tokens": "6aeaed4d823c98aa" + }, + "truncated": 0, + "non-truncated": 936, + "padded": 936, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-medical_genetics|5": { + "hashes": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "886ca823b41c094a", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-miscellaneous|5": { + "hashes": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "72fd71de7675e7d0", + "hash_cont_tokens": "9b0ab02a64603081" + }, + "truncated": 0, + "non-truncated": 3132, + "padded": 3132, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_disputes|5": { + "hashes": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "f3ca0dd8e7a1eb09", + "hash_cont_tokens": "8badf768f7b0467a" + }, + "truncated": 0, + "non-truncated": 1384, + "padded": 1354, + "non-padded": 30, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hashes": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "3e793631e951f23c", + "hash_cont_tokens": "32ae620376b2bbba" + }, + "truncated": 0, + "non-truncated": 3580, + "padded": 3580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-nutrition|5": { + "hashes": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "59753c2144ea93af", + "hash_cont_tokens": "3071def75bacc404" + }, + "truncated": 0, + "non-truncated": 1224, + "padded": 1224, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-philosophy|5": { + "hashes": { + "hash_examples": "9b455b7d72811cc8", 
+ "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "bd8d3dbed15a8c34", + "hash_cont_tokens": "9f6ff69d23a48783" + }, + "truncated": 0, + "non-truncated": 1244, + "padded": 1244, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-prehistory|5": { + "hashes": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "3573cd87facbb7c5", + "hash_cont_tokens": "de469d2b981e32a3" + }, + "truncated": 0, + "non-truncated": 1296, + "padded": 1296, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_accounting|5": { + "hashes": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "17e721bc1a7cbb47", + "hash_cont_tokens": "c46f74d2dfc7b13b" + }, + "truncated": 0, + "non-truncated": 1128, + "padded": 1128, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_law|5": { + "hashes": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "c9f7583fff66d361", + "hash_cont_tokens": "2e590029ef41fbcd" + }, + "truncated": 0, + "non-truncated": 6136, + "padded": 6136, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_medicine|5": { + "hashes": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "40a933f829116f8d", + "hash_cont_tokens": "fe35cfa9c6ca802e" + }, + "truncated": 0, + "non-truncated": 1088, + "padded": 1088, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_psychology|5": { + "hashes": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "0dfb73a8eb3f692c", + "hash_cont_tokens": "f020fbddf72c8652" + }, + "truncated": 0, + "non-truncated": 2448, + "padded": 2448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-public_relations|5": { + "hashes": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "1710c6ba4c9f3cbd", + "hash_cont_tokens": "568f585a259965c1" + }, + "truncated": 0, + "non-truncated": 440, + "padded": 440, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-security_studies|5": { + "hashes": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "32a03f1f22a6e103", + "hash_cont_tokens": "cc6fd7cccd64cd5d" + }, + "truncated": 0, + "non-truncated": 980, + "padded": 980, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-sociology|5": { + "hashes": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "828999f7624cbe7e", + "hash_cont_tokens": "c3a3bdfd177eed5b" + }, + "truncated": 0, + "non-truncated": 804, + "padded": 804, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hashes": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "42054621e718dbee", + "hash_cont_tokens": "2568d0e8e36fa959" + }, + 
"truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-virology|5": { + "hashes": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "6c4f0aa4dc859c04", + "hash_cont_tokens": "926cf60b0891f374" + }, + "truncated": 0, + "non-truncated": 664, + "padded": 664, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-world_religions|5": { + "hashes": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "6c75d44e092ff24f", + "hash_cont_tokens": "c525a5de974c1ea3" + }, + "truncated": 0, + "non-truncated": 684, + "padded": 684, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|truthfulqa:mc|0": { + "hashes": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "2738d7ed7075faa7", + "hash_cont_tokens": "c014154380b74b9e" + }, + "truncated": 0, + "non-truncated": 9996, + "padded": 9996, + "non-padded": 0, + "effective_few_shots": 0.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "d84d18e9a963753d", + "hash_full_prompts": "12b540783521a8e6", + "hash_input_tokens": "5c73a7dce6ccf737", + "hash_cont_tokens": "fb1646e2bdd5fc38" + }, + "total_evaluation_time_secondes": "13152.962387084961", + "truncated": 0, + "non-truncated": 111019, + "padded": 110926, + "non-padded": 93, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/TheBloke/CodeLlama-13B-Instruct-fp16/results_2023-08-25T23-11-55.664382.json b/eval-results/TheBloke/CodeLlama-13B-Instruct-fp16/results_2023-08-25T23-11-55.664382.json new file mode 100644 index 0000000000000000000000000000000000000000..d98246bd230e2e43dec4c2f5dad6752cdce06571 --- /dev/null +++ b/eval-results/TheBloke/CodeLlama-13B-Instruct-fp16/results_2023-08-25T23-11-55.664382.json @@ -0,0 +1,1366 @@ +{ + "config_general": { + "model_name": "TheBloke/CodeLlama-13B-Instruct-fp16", + "model_sha": "521c208c7251ccd3e44ccd9500b6bed419bca565", + "model_dtype": "torch.float16", + "lighteval_sha": "578835f70c499eaf870208de093513e08f864581", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|arc:challenge|25": { + "acc": 0.4087030716723549, + "acc_stderr": 0.014365750345427006, + "acc_norm": 0.4462457337883959, + "acc_norm_stderr": 0.014526705548539982 + }, + "harness|hellaswag|10": { + "acc": 0.4812786297550289, + "acc_stderr": 0.004986282450647317, + "acc_norm": 0.6493726349332802, + "acc_norm_stderr": 0.004761912511707506 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.31, + "acc_stderr": 0.04648231987117316, + "acc_norm": 0.31, + "acc_norm_stderr": 0.04648231987117316 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.32592592592592595, + "acc_stderr": 0.040491220417025055, + "acc_norm": 0.32592592592592595, + "acc_norm_stderr": 0.040491220417025055 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.4144736842105263, + "acc_stderr": 0.04008973785779206, + "acc_norm": 0.4144736842105263, + "acc_norm_stderr": 0.04008973785779206 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.37, + "acc_stderr": 0.048523658709391, + "acc_norm": 0.37, + "acc_norm_stderr": 0.048523658709391 + }, + 
"harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.38113207547169814, + "acc_stderr": 0.029890609686286637, + "acc_norm": 0.38113207547169814, + "acc_norm_stderr": 0.029890609686286637 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.3611111111111111, + "acc_stderr": 0.04016660030451233, + "acc_norm": 0.3611111111111111, + "acc_norm_stderr": 0.04016660030451233 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.36, + "acc_stderr": 0.04824181513244218, + "acc_norm": 0.36, + "acc_norm_stderr": 0.04824181513244218 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.32, + "acc_stderr": 0.04688261722621504, + "acc_norm": 0.32, + "acc_norm_stderr": 0.04688261722621504 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.31, + "acc_stderr": 0.04648231987117316, + "acc_norm": 0.31, + "acc_norm_stderr": 0.04648231987117316 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.3236994219653179, + "acc_stderr": 0.0356760379963917, + "acc_norm": 0.3236994219653179, + "acc_norm_stderr": 0.0356760379963917 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.3235294117647059, + "acc_stderr": 0.046550104113196177, + "acc_norm": 0.3235294117647059, + "acc_norm_stderr": 0.046550104113196177 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.49, + "acc_stderr": 0.05024183937956912, + "acc_norm": 0.49, + "acc_norm_stderr": 0.05024183937956912 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.3617021276595745, + "acc_stderr": 0.031410821975962414, + "acc_norm": 0.3617021276595745, + "acc_norm_stderr": 0.031410821975962414 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.2894736842105263, + "acc_stderr": 0.04266339443159394, + "acc_norm": 0.2894736842105263, + "acc_norm_stderr": 0.04266339443159394 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.4206896551724138, + "acc_stderr": 0.0411391498118926, + "acc_norm": 0.4206896551724138, + "acc_norm_stderr": 0.0411391498118926 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.2751322751322751, + "acc_stderr": 0.02300008685906866, + "acc_norm": 0.2751322751322751, + "acc_norm_stderr": 0.02300008685906866 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.29365079365079366, + "acc_stderr": 0.040735243221471255, + "acc_norm": 0.29365079365079366, + "acc_norm_stderr": 0.040735243221471255 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.32, + "acc_stderr": 0.046882617226215034, + "acc_norm": 0.32, + "acc_norm_stderr": 0.046882617226215034 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.4064516129032258, + "acc_stderr": 0.0279417273462563, + "acc_norm": 0.4064516129032258, + "acc_norm_stderr": 0.0279417273462563 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.3054187192118227, + "acc_stderr": 0.03240661565868408, + "acc_norm": 0.3054187192118227, + "acc_norm_stderr": 0.03240661565868408 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.49, + "acc_stderr": 0.05024183937956911, + "acc_norm": 0.49, + "acc_norm_stderr": 0.05024183937956911 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.3575757575757576, + "acc_stderr": 0.03742597043806585, + "acc_norm": 0.3575757575757576, + "acc_norm_stderr": 0.03742597043806585 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.5151515151515151, + "acc_stderr": 0.03560716516531061, + "acc_norm": 0.5151515151515151, + "acc_norm_stderr": 
0.03560716516531061 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.5077720207253886, + "acc_stderr": 0.03608003225569654, + "acc_norm": 0.5077720207253886, + "acc_norm_stderr": 0.03608003225569654 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.36923076923076925, + "acc_stderr": 0.02446861524147892, + "acc_norm": 0.36923076923076925, + "acc_norm_stderr": 0.02446861524147892 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.2222222222222222, + "acc_stderr": 0.025348097468097863, + "acc_norm": 0.2222222222222222, + "acc_norm_stderr": 0.025348097468097863 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.37815126050420167, + "acc_stderr": 0.03149930577784906, + "acc_norm": 0.37815126050420167, + "acc_norm_stderr": 0.03149930577784906 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.31125827814569534, + "acc_stderr": 0.03780445850526733, + "acc_norm": 0.31125827814569534, + "acc_norm_stderr": 0.03780445850526733 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.4990825688073395, + "acc_stderr": 0.021437287056051215, + "acc_norm": 0.4990825688073395, + "acc_norm_stderr": 0.021437287056051215 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.37962962962962965, + "acc_stderr": 0.03309682581119035, + "acc_norm": 0.37962962962962965, + "acc_norm_stderr": 0.03309682581119035 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.38235294117647056, + "acc_stderr": 0.03410785338904719, + "acc_norm": 0.38235294117647056, + "acc_norm_stderr": 0.03410785338904719 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.38396624472573837, + "acc_stderr": 0.031658678064106674, + "acc_norm": 0.38396624472573837, + "acc_norm_stderr": 0.031658678064106674 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.4260089686098655, + "acc_stderr": 0.033188332862172806, + "acc_norm": 0.4260089686098655, + "acc_norm_stderr": 0.033188332862172806 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.4198473282442748, + "acc_stderr": 0.04328577215262972, + "acc_norm": 0.4198473282442748, + "acc_norm_stderr": 0.04328577215262972 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.38016528925619836, + "acc_stderr": 0.04431324501968431, + "acc_norm": 0.38016528925619836, + "acc_norm_stderr": 0.04431324501968431 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.4537037037037037, + "acc_stderr": 0.04812917324536823, + "acc_norm": 0.4537037037037037, + "acc_norm_stderr": 0.04812917324536823 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.4049079754601227, + "acc_stderr": 0.038566721635489125, + "acc_norm": 0.4049079754601227, + "acc_norm_stderr": 0.038566721635489125 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.30357142857142855, + "acc_stderr": 0.04364226155841044, + "acc_norm": 0.30357142857142855, + "acc_norm_stderr": 0.04364226155841044 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.4563106796116505, + "acc_stderr": 0.049318019942204146, + "acc_norm": 0.4563106796116505, + "acc_norm_stderr": 0.049318019942204146 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.7051282051282052, + "acc_stderr": 0.02987257770889117, + "acc_norm": 0.7051282051282052, + "acc_norm_stderr": 0.02987257770889117 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.43, + "acc_stderr": 0.04975698519562427, + "acc_norm": 0.43, + "acc_norm_stderr": 
0.04975698519562427 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.4955300127713921, + "acc_stderr": 0.017879248970584377, + "acc_norm": 0.4955300127713921, + "acc_norm_stderr": 0.017879248970584377 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.3468208092485549, + "acc_stderr": 0.025624723994030457, + "acc_norm": 0.3468208092485549, + "acc_norm_stderr": 0.025624723994030457 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.2681564245810056, + "acc_stderr": 0.014816119635317003, + "acc_norm": 0.2681564245810056, + "acc_norm_stderr": 0.014816119635317003 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.39215686274509803, + "acc_stderr": 0.027956046165424516, + "acc_norm": 0.39215686274509803, + "acc_norm_stderr": 0.027956046165424516 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.40836012861736337, + "acc_stderr": 0.027917050748484634, + "acc_norm": 0.40836012861736337, + "acc_norm_stderr": 0.027917050748484634 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.3888888888888889, + "acc_stderr": 0.02712511551316686, + "acc_norm": 0.3888888888888889, + "acc_norm_stderr": 0.02712511551316686 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.3049645390070922, + "acc_stderr": 0.027464708442022128, + "acc_norm": 0.3049645390070922, + "acc_norm_stderr": 0.027464708442022128 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.2926988265971317, + "acc_stderr": 0.011620949195849528, + "acc_norm": 0.2926988265971317, + "acc_norm_stderr": 0.011620949195849528 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.34558823529411764, + "acc_stderr": 0.028888193103988647, + "acc_norm": 0.34558823529411764, + "acc_norm_stderr": 0.028888193103988647 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.3284313725490196, + "acc_stderr": 0.01899970738316267, + "acc_norm": 0.3284313725490196, + "acc_norm_stderr": 0.01899970738316267 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.4727272727272727, + "acc_stderr": 0.04782001791380063, + "acc_norm": 0.4727272727272727, + "acc_norm_stderr": 0.04782001791380063 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.4326530612244898, + "acc_stderr": 0.031717528240626645, + "acc_norm": 0.4326530612244898, + "acc_norm_stderr": 0.031717528240626645 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.4577114427860697, + "acc_stderr": 0.035228658640995975, + "acc_norm": 0.4577114427860697, + "acc_norm_stderr": 0.035228658640995975 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.56, + "acc_stderr": 0.049888765156985884, + "acc_norm": 0.56, + "acc_norm_stderr": 0.049888765156985884 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.39759036144578314, + "acc_stderr": 0.038099730845402184, + "acc_norm": 0.39759036144578314, + "acc_norm_stderr": 0.038099730845402184 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.4678362573099415, + "acc_stderr": 0.038268824176603676, + "acc_norm": 0.4678362573099415, + "acc_norm_stderr": 0.038268824176603676 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.2937576499388005, + "mc1_stderr": 0.015945068581236614, + "mc2": 0.45878663529563757, + "mc2_stderr": 0.014860043549181953 + }, + "all": { + "acc": 0.3896359912218762, + "acc_stderr": 0.03514263009984807, + "acc_norm": 0.39312135846415236, + "acc_norm_stderr": 0.03514155527381711, + "mc1": 0.2937576499388005, + "mc1_stderr": 0.015945068581236614, + "mc2": 0.45878663529563757, + "mc2_stderr": 
0.014860043549181953 + } + }, + "versions": { + "harness|arc:challenge|25": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "all": 0 + }, + "config_tasks": { + "harness|arc:challenge": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + 
"harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task" + }, + "summary_tasks": { + "harness|arc:challenge|25": { + "hashes": { + "hash_examples": "17b0cae357c0259e", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "3722289b79076c44", + "hash_cont_tokens": "8210decc6ff6f7df" + }, + "truncated": 0, + 
"non-truncated": 4687, + "padded": 4687, + "non-padded": 0, + "effective_few_shots": 25.0, + "num_truncated_few_shots": 0 + }, + "harness|hellaswag|10": { + "hashes": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "ececd684171f1ef2", + "hash_cont_tokens": "b3b9e9017afa63af" + }, + "truncated": 0, + "non-truncated": 40168, + "padded": 40113, + "non-padded": 55, + "effective_few_shots": 10.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hashes": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "c54ff61ad0273dd7", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-anatomy|5": { + "hashes": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "be31a1e22aef5f90", + "hash_cont_tokens": "f11971a765cb609f" + }, + "truncated": 0, + "non-truncated": 540, + "padded": 540, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-astronomy|5": { + "hashes": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "277a7b1fad566940", + "hash_cont_tokens": "bf30e5d3f48250cb" + }, + "truncated": 0, + "non-truncated": 608, + "padded": 608, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-business_ethics|5": { + "hashes": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "ba552605bc116de5", + "hash_cont_tokens": "bc1dd9b2d995eb61" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hashes": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "428c7563d0b98ab9", + "hash_cont_tokens": "890a119624b3b935" + }, + "truncated": 0, + "non-truncated": 1060, + "padded": 1060, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_biology|5": { + "hashes": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "da036601573942e2", + "hash_cont_tokens": "875cde3af7a0ee14" + }, + "truncated": 0, + "non-truncated": 576, + "padded": 576, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_chemistry|5": { + "hashes": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "94e0196d6aded13d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_computer_science|5": { + "hashes": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "6e4d0f4a8d36690b", + "hash_cont_tokens": "ffc0fe414cdc4a83" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_mathematics|5": { + 
"hashes": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "614054d17109a25d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_medicine|5": { + "hashes": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "081bb2b524defd1c", + "hash_cont_tokens": "1f88b00d41957d82" + }, + "truncated": 0, + "non-truncated": 692, + "padded": 692, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_physics|5": { + "hashes": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "5421d9a1af86cbd4", + "hash_cont_tokens": "f7b8097afc16a47c" + }, + "truncated": 0, + "non-truncated": 408, + "padded": 408, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-computer_security|5": { + "hashes": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "5e6b70ecb333cf18", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hashes": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "c2ef11a87264ceed", + "hash_cont_tokens": "aa0e8bc655f2f641" + }, + "truncated": 0, + "non-truncated": 940, + "padded": 940, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-econometrics|5": { + "hashes": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "ecaccd912a4c3978", + "hash_cont_tokens": "bfb7e3c3c88313f1" + }, + "truncated": 0, + "non-truncated": 456, + "padded": 456, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hashes": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "1590c84291399be8", + "hash_cont_tokens": "2425a3f084a591ef" + }, + "truncated": 0, + "non-truncated": 580, + "padded": 580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hashes": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "3269597f715b0da1", + "hash_cont_tokens": "f52691aef15a407b" + }, + "truncated": 0, + "non-truncated": 1512, + "padded": 1512, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-formal_logic|5": { + "hashes": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "a2800d20f3ab8d7c", + "hash_cont_tokens": "f515d598d9c21263" + }, + "truncated": 0, + "non-truncated": 504, + "padded": 504, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-global_facts|5": { + "hashes": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "94ed44b3772505ad", + 
"hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_biology|5": { + "hashes": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "24423acb928db768", + "hash_cont_tokens": "bd85a4156a3613ee" + }, + "truncated": 0, + "non-truncated": 1240, + "padded": 1240, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hashes": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "831ff35c474e5cef", + "hash_cont_tokens": "a95c97af1c14e068" + }, + "truncated": 0, + "non-truncated": 812, + "padded": 812, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hashes": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "a20a96b44dcc5b30", + "hash_cont_tokens": "8abfedef914e33c9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hashes": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "5002f4ac8b1562ca", + "hash_cont_tokens": "674fc454bdc5ac93" + }, + "truncated": 0, + "non-truncated": 660, + "padded": 656, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_geography|5": { + "hashes": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "7c5547c7da5bc793", + "hash_cont_tokens": "03a5012b916274ea" + }, + "truncated": 0, + "non-truncated": 792, + "padded": 792, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hashes": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "f62991cb6a496b05", + "hash_cont_tokens": "a83effb8f76b7d7c" + }, + "truncated": 0, + "non-truncated": 772, + "padded": 772, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hashes": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "4cef2aff6e3d59ed", + "hash_cont_tokens": "c583432ad27fcfe0" + }, + "truncated": 0, + "non-truncated": 1560, + "padded": 1560, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hashes": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "6e2577ea4082ed2b", + "hash_cont_tokens": "24f5dc613660300b" + }, + "truncated": 0, + "non-truncated": 1080, + "padded": 1080, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hashes": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "c5fc9aeb1079c8e4", + "hash_cont_tokens": "f47f041de50333b9" + }, + "truncated": 0, + 
"non-truncated": 952, + "padded": 952, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_physics|5": { + "hashes": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "555fc385cffa84ca", + "hash_cont_tokens": "ba2efcd283e938cc" + }, + "truncated": 0, + "non-truncated": 604, + "padded": 604, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hashes": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "febd23cbf9973b7f", + "hash_cont_tokens": "942069cd363844d9" + }, + "truncated": 0, + "non-truncated": 2180, + "padded": 2180, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hashes": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "400e55b56ee6fbd7", + "hash_cont_tokens": "955ed42b6f7fa019" + }, + "truncated": 0, + "non-truncated": 864, + "padded": 864, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hashes": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "c639cce12a46ebad", + "hash_cont_tokens": "cdd0b3dc06d933e5" + }, + "truncated": 0, + "non-truncated": 816, + "padded": 816, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hashes": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "b9762065cce6f3a6", + "hash_cont_tokens": "9a864184946033ac" + }, + "truncated": 0, + "non-truncated": 948, + "padded": 948, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_aging|5": { + "hashes": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "541a75f071dcf579", + "hash_cont_tokens": "142a4a8a1138a214" + }, + "truncated": 0, + "non-truncated": 892, + "padded": 892, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_sexuality|5": { + "hashes": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "04269e5c5a257dd9", + "hash_cont_tokens": "bc54813e809b796d" + }, + "truncated": 0, + "non-truncated": 524, + "padded": 524, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-international_law|5": { + "hashes": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "d93ba9d9d38e4397", + "hash_cont_tokens": "dc45b45fcda18e5d" + }, + "truncated": 0, + "non-truncated": 484, + "padded": 484, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-jurisprudence|5": { + "hashes": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "9eeaccd2698b4f5a", + "hash_cont_tokens": "e3a8cd951b6e3469" + }, + "truncated": 0, + "non-truncated": 432, + "padded": 432, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + 
"harness|hendrycksTest-logical_fallacies|5": { + "hashes": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "b4f08f544f2b7576", + "hash_cont_tokens": "1e80dbd30f6453d5" + }, + "truncated": 0, + "non-truncated": 652, + "padded": 648, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-machine_learning|5": { + "hashes": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "900c2a51f1174b9f", + "hash_cont_tokens": "9b37da7777378ca9" + }, + "truncated": 0, + "non-truncated": 448, + "padded": 448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-management|5": { + "hashes": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "6b36efb4689c6eca", + "hash_cont_tokens": "a01d6d39a83c4597" + }, + "truncated": 0, + "non-truncated": 412, + "padded": 412, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-marketing|5": { + "hashes": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "2aaac78a0cfed47a", + "hash_cont_tokens": "6aeaed4d823c98aa" + }, + "truncated": 0, + "non-truncated": 936, + "padded": 936, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-medical_genetics|5": { + "hashes": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "886ca823b41c094a", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-miscellaneous|5": { + "hashes": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "72fd71de7675e7d0", + "hash_cont_tokens": "9b0ab02a64603081" + }, + "truncated": 0, + "non-truncated": 3132, + "padded": 3132, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_disputes|5": { + "hashes": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "f3ca0dd8e7a1eb09", + "hash_cont_tokens": "8badf768f7b0467a" + }, + "truncated": 0, + "non-truncated": 1384, + "padded": 1354, + "non-padded": 30, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hashes": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "3e793631e951f23c", + "hash_cont_tokens": "32ae620376b2bbba" + }, + "truncated": 0, + "non-truncated": 3580, + "padded": 3580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-nutrition|5": { + "hashes": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "59753c2144ea93af", + "hash_cont_tokens": "3071def75bacc404" + }, + "truncated": 0, + "non-truncated": 1224, + "padded": 1224, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-philosophy|5": { + "hashes": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": 
"bd8d3dbed15a8c34", + "hash_cont_tokens": "9f6ff69d23a48783" + }, + "truncated": 0, + "non-truncated": 1244, + "padded": 1244, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-prehistory|5": { + "hashes": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "3573cd87facbb7c5", + "hash_cont_tokens": "de469d2b981e32a3" + }, + "truncated": 0, + "non-truncated": 1296, + "padded": 1296, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_accounting|5": { + "hashes": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "17e721bc1a7cbb47", + "hash_cont_tokens": "c46f74d2dfc7b13b" + }, + "truncated": 0, + "non-truncated": 1128, + "padded": 1128, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_law|5": { + "hashes": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "c9f7583fff66d361", + "hash_cont_tokens": "2e590029ef41fbcd" + }, + "truncated": 0, + "non-truncated": 6136, + "padded": 6136, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_medicine|5": { + "hashes": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "40a933f829116f8d", + "hash_cont_tokens": "fe35cfa9c6ca802e" + }, + "truncated": 0, + "non-truncated": 1088, + "padded": 1088, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_psychology|5": { + "hashes": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "0dfb73a8eb3f692c", + "hash_cont_tokens": "f020fbddf72c8652" + }, + "truncated": 0, + "non-truncated": 2448, + "padded": 2448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-public_relations|5": { + "hashes": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "1710c6ba4c9f3cbd", + "hash_cont_tokens": "568f585a259965c1" + }, + "truncated": 0, + "non-truncated": 440, + "padded": 440, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-security_studies|5": { + "hashes": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "32a03f1f22a6e103", + "hash_cont_tokens": "cc6fd7cccd64cd5d" + }, + "truncated": 0, + "non-truncated": 980, + "padded": 980, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-sociology|5": { + "hashes": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "828999f7624cbe7e", + "hash_cont_tokens": "c3a3bdfd177eed5b" + }, + "truncated": 0, + "non-truncated": 804, + "padded": 804, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hashes": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "42054621e718dbee", + "hash_cont_tokens": "2568d0e8e36fa959" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 
0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-virology|5": { + "hashes": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "6c4f0aa4dc859c04", + "hash_cont_tokens": "926cf60b0891f374" + }, + "truncated": 0, + "non-truncated": 664, + "padded": 664, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-world_religions|5": { + "hashes": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "6c75d44e092ff24f", + "hash_cont_tokens": "c525a5de974c1ea3" + }, + "truncated": 0, + "non-truncated": 684, + "padded": 684, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|truthfulqa:mc|0": { + "hashes": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "2738d7ed7075faa7", + "hash_cont_tokens": "c014154380b74b9e" + }, + "truncated": 0, + "non-truncated": 9996, + "padded": 9996, + "non-padded": 0, + "effective_few_shots": 0.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "d84d18e9a963753d", + "hash_full_prompts": "12b540783521a8e6", + "hash_input_tokens": "5c73a7dce6ccf737", + "hash_cont_tokens": "fb1646e2bdd5fc38" + }, + "total_evaluation_time_secondes": "6286.013152122498", + "truncated": 0, + "non-truncated": 111019, + "padded": 110926, + "non-padded": 93, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/TheBloke/CodeLlama-13B-Instruct-fp16/results_2023-10-22T11-46-33.264561.json b/eval-results/TheBloke/CodeLlama-13B-Instruct-fp16/results_2023-10-22T11-46-33.264561.json new file mode 100644 index 0000000000000000000000000000000000000000..eaaec78bdbd0997c92bd4c6e7072ae3a72f4f65a --- /dev/null +++ b/eval-results/TheBloke/CodeLlama-13B-Instruct-fp16/results_2023-10-22T11-46-33.264561.json @@ -0,0 +1,107 @@ +{ + "config_general": { + "model_name": "TheBloke/CodeLlama-13B-Instruct-fp16", + "model_sha": "521c208c7251ccd3e44ccd9500b6bed419bca565", + "model_size": "24.56 GB", + "model_dtype": "torch.float16", + "lighteval_sha": "0f318ecf002208468154899217b3ba7c6ae09374", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|drop|3": { + "em": 0.0009437919463087249, + "em_stderr": 0.0003144653119413506, + "f1": 0.05136010906040279, + "f1_stderr": 0.001238131643997091 + }, + "harness|gsm8k|5": { + "acc": 0.12661106899166036, + "acc_stderr": 0.009159715283081094 + }, + "harness|winogrande|5": { + "acc": 0.6803472770323599, + "acc_stderr": 0.013106528517665136 + }, + "all": { + "em": 0.0009437919463087249, + "em_stderr": 0.0003144653119413506, + "f1": 0.05136010906040279, + "f1_stderr": 0.001238131643997091, + "acc": 0.4034791730120101, + "acc_stderr": 0.011133121900373116 + } + }, + "versions": { + "harness|drop|3": 1, + "harness|gsm8k|5": 0, + "harness|winogrande|5": 0, + "all": 0 + }, + "config_tasks": { + "harness|drop": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|drop|3": { + "hashes": { + "hash_examples": "1d27416e8324e9a3", + "hash_full_prompts": "a5513ff9a741b385", + "hash_input_tokens": "fbe785c62d941897", + "hash_cont_tokens": "85c0cfc873d5a6b5" + }, + "truncated": 0, + "non-truncated": 9536, + "padded": 0, + 
"non-padded": 9536, + "effective_few_shots": 3.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "bda342e47b5099b2", + "hash_cont_tokens": "21ecf868b75ddc8d" + }, + "truncated": 0, + "non-truncated": 1319, + "padded": 0, + "non-padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "c0bedf98cb040854", + "hash_cont_tokens": "f08975ad6f2d5864" + }, + "truncated": 0, + "non-truncated": 2534, + "padded": 2432, + "non-padded": 102, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "9b4d8993161e637d", + "hash_full_prompts": "08215e527b7e60a5", + "hash_input_tokens": "9f72a6bc6743d18f", + "hash_cont_tokens": "da96969fe4f590ee" + }, + "total_evaluation_time_secondes": "11798.901111125946", + "truncated": 0, + "non-truncated": 13389, + "padded": 2432, + "non-padded": 10957, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/TheBloke/CodeLlama-13B-Python-fp16/results_2023-08-25T19-26-38.056569.json b/eval-results/TheBloke/CodeLlama-13B-Python-fp16/results_2023-08-25T19-26-38.056569.json new file mode 100644 index 0000000000000000000000000000000000000000..aaffa80c301d13410ac43bcff5bcb29b0e9c0dae --- /dev/null +++ b/eval-results/TheBloke/CodeLlama-13B-Python-fp16/results_2023-08-25T19-26-38.056569.json @@ -0,0 +1,1366 @@ +{ + "config_general": { + "model_name": "TheBloke/CodeLlama-13B-Python-fp16", + "model_sha": "442282f4207442b828953a72c51a919c332cba5c", + "model_dtype": "torch.float16", + "lighteval_sha": "578835f70c499eaf870208de093513e08f864581", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|arc:challenge|25": { + "acc": 0.2960750853242321, + "acc_stderr": 0.013340916085246252, + "acc_norm": 0.3319112627986348, + "acc_norm_stderr": 0.013760988200880543 + }, + "harness|hellaswag|10": { + "acc": 0.35769766978689504, + "acc_stderr": 0.0047834288742735764, + "acc_norm": 0.44503087034455285, + "acc_norm_stderr": 0.0049595354431706175 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.23, + "acc_stderr": 0.04229525846816507, + "acc_norm": 0.23, + "acc_norm_stderr": 0.04229525846816507 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.3333333333333333, + "acc_stderr": 0.04072314811876837, + "acc_norm": 0.3333333333333333, + "acc_norm_stderr": 0.04072314811876837 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.20394736842105263, + "acc_stderr": 0.03279000406310052, + "acc_norm": 0.20394736842105263, + "acc_norm_stderr": 0.03279000406310052 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.21, + "acc_stderr": 0.040936018074033256, + "acc_norm": 0.21, + "acc_norm_stderr": 0.040936018074033256 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.2188679245283019, + "acc_stderr": 0.02544786382510861, + "acc_norm": 0.2188679245283019, + "acc_norm_stderr": 0.02544786382510861 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.2916666666666667, + "acc_stderr": 0.038009680605548574, + "acc_norm": 0.2916666666666667, + "acc_norm_stderr": 0.038009680605548574 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.18, + 
"acc_stderr": 0.038612291966536955, + "acc_norm": 0.18, + "acc_norm_stderr": 0.038612291966536955 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.29, + "acc_stderr": 0.04560480215720684, + "acc_norm": 0.29, + "acc_norm_stderr": 0.04560480215720684 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.3, + "acc_stderr": 0.046056618647183814, + "acc_norm": 0.3, + "acc_norm_stderr": 0.046056618647183814 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.26011560693641617, + "acc_stderr": 0.03345036916788992, + "acc_norm": 0.26011560693641617, + "acc_norm_stderr": 0.03345036916788992 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.16666666666666666, + "acc_stderr": 0.037082846624165444, + "acc_norm": 0.16666666666666666, + "acc_norm_stderr": 0.037082846624165444 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.21, + "acc_stderr": 0.04093601807403326, + "acc_norm": 0.21, + "acc_norm_stderr": 0.04093601807403326 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.2680851063829787, + "acc_stderr": 0.028957342788342347, + "acc_norm": 0.2680851063829787, + "acc_norm_stderr": 0.028957342788342347 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.2719298245614035, + "acc_stderr": 0.04185774424022056, + "acc_norm": 0.2719298245614035, + "acc_norm_stderr": 0.04185774424022056 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.2689655172413793, + "acc_stderr": 0.03695183311650232, + "acc_norm": 0.2689655172413793, + "acc_norm_stderr": 0.03695183311650232 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.2566137566137566, + "acc_stderr": 0.022494510767503154, + "acc_norm": 0.2566137566137566, + "acc_norm_stderr": 0.022494510767503154 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.1746031746031746, + "acc_stderr": 0.03395490020856112, + "acc_norm": 0.1746031746031746, + "acc_norm_stderr": 0.03395490020856112 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.24, + "acc_stderr": 0.042923469599092816, + "acc_norm": 0.24, + "acc_norm_stderr": 0.042923469599092816 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.31290322580645163, + "acc_stderr": 0.026377567028645858, + "acc_norm": 0.31290322580645163, + "acc_norm_stderr": 0.026377567028645858 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.270935960591133, + "acc_stderr": 0.03127090713297698, + "acc_norm": 0.270935960591133, + "acc_norm_stderr": 0.03127090713297698 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.31, + "acc_stderr": 0.04648231987117316, + "acc_norm": 0.31, + "acc_norm_stderr": 0.04648231987117316 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.2606060606060606, + "acc_stderr": 0.03427743175816524, + "acc_norm": 0.2606060606060606, + "acc_norm_stderr": 0.03427743175816524 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.2828282828282828, + "acc_stderr": 0.0320877955878675, + "acc_norm": 0.2828282828282828, + "acc_norm_stderr": 0.0320877955878675 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.35233160621761656, + "acc_stderr": 0.03447478286414359, + "acc_norm": 0.35233160621761656, + "acc_norm_stderr": 0.03447478286414359 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.3282051282051282, + "acc_stderr": 0.02380763319865726, + "acc_norm": 0.3282051282051282, + "acc_norm_stderr": 0.02380763319865726 + }, + 
"harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.26296296296296295, + "acc_stderr": 0.026842057873833706, + "acc_norm": 0.26296296296296295, + "acc_norm_stderr": 0.026842057873833706 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.2605042016806723, + "acc_stderr": 0.028510251512341937, + "acc_norm": 0.2605042016806723, + "acc_norm_stderr": 0.028510251512341937 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.2980132450331126, + "acc_stderr": 0.037345356767871984, + "acc_norm": 0.2980132450331126, + "acc_norm_stderr": 0.037345356767871984 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.28990825688073396, + "acc_stderr": 0.0194530666092016, + "acc_norm": 0.28990825688073396, + "acc_norm_stderr": 0.0194530666092016 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.39814814814814814, + "acc_stderr": 0.033384734032074016, + "acc_norm": 0.39814814814814814, + "acc_norm_stderr": 0.033384734032074016 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.25980392156862747, + "acc_stderr": 0.03077855467869326, + "acc_norm": 0.25980392156862747, + "acc_norm_stderr": 0.03077855467869326 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.29957805907172996, + "acc_stderr": 0.029818024749753095, + "acc_norm": 0.29957805907172996, + "acc_norm_stderr": 0.029818024749753095 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.242152466367713, + "acc_stderr": 0.028751392398694755, + "acc_norm": 0.242152466367713, + "acc_norm_stderr": 0.028751392398694755 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.29770992366412213, + "acc_stderr": 0.04010358942462202, + "acc_norm": 0.29770992366412213, + "acc_norm_stderr": 0.04010358942462202 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.256198347107438, + "acc_stderr": 0.03984979653302872, + "acc_norm": 0.256198347107438, + "acc_norm_stderr": 0.03984979653302872 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.24074074074074073, + "acc_stderr": 0.041331194402438376, + "acc_norm": 0.24074074074074073, + "acc_norm_stderr": 0.041331194402438376 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.22085889570552147, + "acc_stderr": 0.03259177392742177, + "acc_norm": 0.22085889570552147, + "acc_norm_stderr": 0.03259177392742177 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.23214285714285715, + "acc_stderr": 0.04007341809755806, + "acc_norm": 0.23214285714285715, + "acc_norm_stderr": 0.04007341809755806 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.17475728155339806, + "acc_stderr": 0.0376017800602662, + "acc_norm": 0.17475728155339806, + "acc_norm_stderr": 0.0376017800602662 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.1752136752136752, + "acc_stderr": 0.024904439098918214, + "acc_norm": 0.1752136752136752, + "acc_norm_stderr": 0.024904439098918214 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.25, + "acc_stderr": 0.04351941398892446, + "acc_norm": 0.25, + "acc_norm_stderr": 0.04351941398892446 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.31800766283524906, + "acc_stderr": 0.016653486275615394, + "acc_norm": 0.31800766283524906, + "acc_norm_stderr": 0.016653486275615394 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.20809248554913296, + "acc_stderr": 0.021855255263421795, + "acc_norm": 0.20809248554913296, + "acc_norm_stderr": 0.021855255263421795 + }, + 
"harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.27150837988826815, + "acc_stderr": 0.014874252168095261, + "acc_norm": 0.27150837988826815, + "acc_norm_stderr": 0.014874252168095261 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.2581699346405229, + "acc_stderr": 0.025058503316958147, + "acc_norm": 0.2581699346405229, + "acc_norm_stderr": 0.025058503316958147 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.24758842443729903, + "acc_stderr": 0.024513879973621967, + "acc_norm": 0.24758842443729903, + "acc_norm_stderr": 0.024513879973621967 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.22839506172839505, + "acc_stderr": 0.023358211840626267, + "acc_norm": 0.22839506172839505, + "acc_norm_stderr": 0.023358211840626267 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.28368794326241137, + "acc_stderr": 0.026891709428343957, + "acc_norm": 0.28368794326241137, + "acc_norm_stderr": 0.026891709428343957 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.2438070404172099, + "acc_stderr": 0.010966507972178475, + "acc_norm": 0.2438070404172099, + "acc_norm_stderr": 0.010966507972178475 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.21691176470588236, + "acc_stderr": 0.025035845227711233, + "acc_norm": 0.21691176470588236, + "acc_norm_stderr": 0.025035845227711233 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.24673202614379086, + "acc_stderr": 0.0174408203674025, + "acc_norm": 0.24673202614379086, + "acc_norm_stderr": 0.0174408203674025 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.20909090909090908, + "acc_stderr": 0.038950910157241364, + "acc_norm": 0.20909090909090908, + "acc_norm_stderr": 0.038950910157241364 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.3877551020408163, + "acc_stderr": 0.031192230726795656, + "acc_norm": 0.3877551020408163, + "acc_norm_stderr": 0.031192230726795656 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.21393034825870647, + "acc_stderr": 0.028996909693328923, + "acc_norm": 0.21393034825870647, + "acc_norm_stderr": 0.028996909693328923 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.27, + "acc_stderr": 0.0446196043338474, + "acc_norm": 0.27, + "acc_norm_stderr": 0.0446196043338474 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.21686746987951808, + "acc_stderr": 0.03208284450356365, + "acc_norm": 0.21686746987951808, + "acc_norm_stderr": 0.03208284450356365 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.3157894736842105, + "acc_stderr": 0.035650796707083106, + "acc_norm": 0.3157894736842105, + "acc_norm_stderr": 0.035650796707083106 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.26560587515299877, + "mc1_stderr": 0.015461027627253595, + "mc2": 0.43989219144943836, + "mc2_stderr": 0.014690020723528612 + }, + "all": { + "acc": 0.26171872838516447, + "acc_stderr": 0.03167776466143373, + "acc_norm": 0.2638063449619791, + "acc_norm_stderr": 0.03168786938490037, + "mc1": 0.26560587515299877, + "mc1_stderr": 0.015461027627253595, + "mc2": 0.43989219144943836, + "mc2_stderr": 0.014690020723528612 + } + }, + "versions": { + "harness|arc:challenge|25": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + 
"harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "all": 0 + }, + "config_tasks": { + "harness|arc:challenge": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness 
task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task" + }, + "summary_tasks": { + "harness|arc:challenge|25": { + "hashes": { + "hash_examples": "17b0cae357c0259e", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "3722289b79076c44", + "hash_cont_tokens": "8210decc6ff6f7df" + }, + "truncated": 0, + "non-truncated": 4687, + "padded": 4687, + "non-padded": 0, + "effective_few_shots": 25.0, + "num_truncated_few_shots": 0 + }, + "harness|hellaswag|10": { + "hashes": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "ececd684171f1ef2", + "hash_cont_tokens": "b3b9e9017afa63af" + }, + "truncated": 0, + "non-truncated": 40168, + "padded": 
40113, + "non-padded": 55, + "effective_few_shots": 10.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hashes": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "c54ff61ad0273dd7", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-anatomy|5": { + "hashes": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "be31a1e22aef5f90", + "hash_cont_tokens": "f11971a765cb609f" + }, + "truncated": 0, + "non-truncated": 540, + "padded": 540, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-astronomy|5": { + "hashes": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "277a7b1fad566940", + "hash_cont_tokens": "bf30e5d3f48250cb" + }, + "truncated": 0, + "non-truncated": 608, + "padded": 608, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-business_ethics|5": { + "hashes": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "ba552605bc116de5", + "hash_cont_tokens": "bc1dd9b2d995eb61" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hashes": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "428c7563d0b98ab9", + "hash_cont_tokens": "890a119624b3b935" + }, + "truncated": 0, + "non-truncated": 1060, + "padded": 1060, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_biology|5": { + "hashes": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "da036601573942e2", + "hash_cont_tokens": "875cde3af7a0ee14" + }, + "truncated": 0, + "non-truncated": 576, + "padded": 576, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_chemistry|5": { + "hashes": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "94e0196d6aded13d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_computer_science|5": { + "hashes": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "6e4d0f4a8d36690b", + "hash_cont_tokens": "ffc0fe414cdc4a83" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_mathematics|5": { + "hashes": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "614054d17109a25d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_medicine|5": { + "hashes": { + 
"hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "081bb2b524defd1c", + "hash_cont_tokens": "1f88b00d41957d82" + }, + "truncated": 0, + "non-truncated": 692, + "padded": 692, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_physics|5": { + "hashes": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "5421d9a1af86cbd4", + "hash_cont_tokens": "f7b8097afc16a47c" + }, + "truncated": 0, + "non-truncated": 408, + "padded": 408, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-computer_security|5": { + "hashes": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "5e6b70ecb333cf18", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hashes": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "c2ef11a87264ceed", + "hash_cont_tokens": "aa0e8bc655f2f641" + }, + "truncated": 0, + "non-truncated": 940, + "padded": 940, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-econometrics|5": { + "hashes": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "ecaccd912a4c3978", + "hash_cont_tokens": "bfb7e3c3c88313f1" + }, + "truncated": 0, + "non-truncated": 456, + "padded": 456, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hashes": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "1590c84291399be8", + "hash_cont_tokens": "2425a3f084a591ef" + }, + "truncated": 0, + "non-truncated": 580, + "padded": 580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hashes": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "3269597f715b0da1", + "hash_cont_tokens": "f52691aef15a407b" + }, + "truncated": 0, + "non-truncated": 1512, + "padded": 1512, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-formal_logic|5": { + "hashes": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "a2800d20f3ab8d7c", + "hash_cont_tokens": "f515d598d9c21263" + }, + "truncated": 0, + "non-truncated": 504, + "padded": 504, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-global_facts|5": { + "hashes": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "94ed44b3772505ad", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_biology|5": { + "hashes": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "24423acb928db768", + "hash_cont_tokens": 
"bd85a4156a3613ee" + }, + "truncated": 0, + "non-truncated": 1240, + "padded": 1240, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hashes": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "831ff35c474e5cef", + "hash_cont_tokens": "a95c97af1c14e068" + }, + "truncated": 0, + "non-truncated": 812, + "padded": 812, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hashes": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "a20a96b44dcc5b30", + "hash_cont_tokens": "8abfedef914e33c9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hashes": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "5002f4ac8b1562ca", + "hash_cont_tokens": "674fc454bdc5ac93" + }, + "truncated": 0, + "non-truncated": 660, + "padded": 656, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_geography|5": { + "hashes": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "7c5547c7da5bc793", + "hash_cont_tokens": "03a5012b916274ea" + }, + "truncated": 0, + "non-truncated": 792, + "padded": 792, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hashes": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "f62991cb6a496b05", + "hash_cont_tokens": "a83effb8f76b7d7c" + }, + "truncated": 0, + "non-truncated": 772, + "padded": 772, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hashes": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "4cef2aff6e3d59ed", + "hash_cont_tokens": "c583432ad27fcfe0" + }, + "truncated": 0, + "non-truncated": 1560, + "padded": 1560, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hashes": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "6e2577ea4082ed2b", + "hash_cont_tokens": "24f5dc613660300b" + }, + "truncated": 0, + "non-truncated": 1080, + "padded": 1080, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hashes": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "c5fc9aeb1079c8e4", + "hash_cont_tokens": "f47f041de50333b9" + }, + "truncated": 0, + "non-truncated": 952, + "padded": 952, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_physics|5": { + "hashes": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "555fc385cffa84ca", + "hash_cont_tokens": "ba2efcd283e938cc" + }, + "truncated": 0, + "non-truncated": 604, + 
"padded": 604, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hashes": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "febd23cbf9973b7f", + "hash_cont_tokens": "942069cd363844d9" + }, + "truncated": 0, + "non-truncated": 2180, + "padded": 2180, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hashes": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "400e55b56ee6fbd7", + "hash_cont_tokens": "955ed42b6f7fa019" + }, + "truncated": 0, + "non-truncated": 864, + "padded": 864, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hashes": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "c639cce12a46ebad", + "hash_cont_tokens": "cdd0b3dc06d933e5" + }, + "truncated": 0, + "non-truncated": 816, + "padded": 816, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hashes": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "b9762065cce6f3a6", + "hash_cont_tokens": "9a864184946033ac" + }, + "truncated": 0, + "non-truncated": 948, + "padded": 948, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_aging|5": { + "hashes": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "541a75f071dcf579", + "hash_cont_tokens": "142a4a8a1138a214" + }, + "truncated": 0, + "non-truncated": 892, + "padded": 892, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_sexuality|5": { + "hashes": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "04269e5c5a257dd9", + "hash_cont_tokens": "bc54813e809b796d" + }, + "truncated": 0, + "non-truncated": 524, + "padded": 524, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-international_law|5": { + "hashes": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "d93ba9d9d38e4397", + "hash_cont_tokens": "dc45b45fcda18e5d" + }, + "truncated": 0, + "non-truncated": 484, + "padded": 484, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-jurisprudence|5": { + "hashes": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "9eeaccd2698b4f5a", + "hash_cont_tokens": "e3a8cd951b6e3469" + }, + "truncated": 0, + "non-truncated": 432, + "padded": 432, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hashes": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "b4f08f544f2b7576", + "hash_cont_tokens": "1e80dbd30f6453d5" + }, + "truncated": 0, + "non-truncated": 652, + "padded": 648, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + 
"harness|hendrycksTest-machine_learning|5": { + "hashes": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "900c2a51f1174b9f", + "hash_cont_tokens": "9b37da7777378ca9" + }, + "truncated": 0, + "non-truncated": 448, + "padded": 448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-management|5": { + "hashes": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "6b36efb4689c6eca", + "hash_cont_tokens": "a01d6d39a83c4597" + }, + "truncated": 0, + "non-truncated": 412, + "padded": 412, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-marketing|5": { + "hashes": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "2aaac78a0cfed47a", + "hash_cont_tokens": "6aeaed4d823c98aa" + }, + "truncated": 0, + "non-truncated": 936, + "padded": 936, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-medical_genetics|5": { + "hashes": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "886ca823b41c094a", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-miscellaneous|5": { + "hashes": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "72fd71de7675e7d0", + "hash_cont_tokens": "9b0ab02a64603081" + }, + "truncated": 0, + "non-truncated": 3132, + "padded": 3132, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_disputes|5": { + "hashes": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "f3ca0dd8e7a1eb09", + "hash_cont_tokens": "8badf768f7b0467a" + }, + "truncated": 0, + "non-truncated": 1384, + "padded": 1354, + "non-padded": 30, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hashes": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "3e793631e951f23c", + "hash_cont_tokens": "32ae620376b2bbba" + }, + "truncated": 0, + "non-truncated": 3580, + "padded": 3580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-nutrition|5": { + "hashes": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "59753c2144ea93af", + "hash_cont_tokens": "3071def75bacc404" + }, + "truncated": 0, + "non-truncated": 1224, + "padded": 1224, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-philosophy|5": { + "hashes": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "bd8d3dbed15a8c34", + "hash_cont_tokens": "9f6ff69d23a48783" + }, + "truncated": 0, + "non-truncated": 1244, + "padded": 1244, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-prehistory|5": { + "hashes": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "3573cd87facbb7c5", + 
"hash_cont_tokens": "de469d2b981e32a3" + }, + "truncated": 0, + "non-truncated": 1296, + "padded": 1296, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_accounting|5": { + "hashes": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "17e721bc1a7cbb47", + "hash_cont_tokens": "c46f74d2dfc7b13b" + }, + "truncated": 0, + "non-truncated": 1128, + "padded": 1128, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_law|5": { + "hashes": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "c9f7583fff66d361", + "hash_cont_tokens": "2e590029ef41fbcd" + }, + "truncated": 0, + "non-truncated": 6136, + "padded": 6136, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_medicine|5": { + "hashes": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "40a933f829116f8d", + "hash_cont_tokens": "fe35cfa9c6ca802e" + }, + "truncated": 0, + "non-truncated": 1088, + "padded": 1088, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_psychology|5": { + "hashes": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "0dfb73a8eb3f692c", + "hash_cont_tokens": "f020fbddf72c8652" + }, + "truncated": 0, + "non-truncated": 2448, + "padded": 2448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-public_relations|5": { + "hashes": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "1710c6ba4c9f3cbd", + "hash_cont_tokens": "568f585a259965c1" + }, + "truncated": 0, + "non-truncated": 440, + "padded": 440, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-security_studies|5": { + "hashes": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "32a03f1f22a6e103", + "hash_cont_tokens": "cc6fd7cccd64cd5d" + }, + "truncated": 0, + "non-truncated": 980, + "padded": 980, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-sociology|5": { + "hashes": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "828999f7624cbe7e", + "hash_cont_tokens": "c3a3bdfd177eed5b" + }, + "truncated": 0, + "non-truncated": 804, + "padded": 804, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hashes": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "42054621e718dbee", + "hash_cont_tokens": "2568d0e8e36fa959" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-virology|5": { + "hashes": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "6c4f0aa4dc859c04", + "hash_cont_tokens": "926cf60b0891f374" + }, + "truncated": 0, + "non-truncated": 664, + "padded": 664, + "non-padded": 0, + "effective_few_shots": 
5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-world_religions|5": { + "hashes": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "6c75d44e092ff24f", + "hash_cont_tokens": "c525a5de974c1ea3" + }, + "truncated": 0, + "non-truncated": 684, + "padded": 684, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|truthfulqa:mc|0": { + "hashes": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "2738d7ed7075faa7", + "hash_cont_tokens": "c014154380b74b9e" + }, + "truncated": 0, + "non-truncated": 9996, + "padded": 9996, + "non-padded": 0, + "effective_few_shots": 0.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "d84d18e9a963753d", + "hash_full_prompts": "12b540783521a8e6", + "hash_input_tokens": "5c73a7dce6ccf737", + "hash_cont_tokens": "fb1646e2bdd5fc38" + }, + "total_evaluation_time_secondes": "6302.621599912643", + "truncated": 0, + "non-truncated": 111019, + "padded": 110926, + "non-padded": 93, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/TheBloke/CodeLlama-13B-Python-fp16/results_2023-10-22T10-58-59.562452.json b/eval-results/TheBloke/CodeLlama-13B-Python-fp16/results_2023-10-22T10-58-59.562452.json new file mode 100644 index 0000000000000000000000000000000000000000..b9e0362fbac642fdad5939d955edf3d5b8b8ca12 --- /dev/null +++ b/eval-results/TheBloke/CodeLlama-13B-Python-fp16/results_2023-10-22T10-58-59.562452.json @@ -0,0 +1,107 @@ +{ + "config_general": { + "model_name": "TheBloke/CodeLlama-13B-Python-fp16", + "model_sha": "442282f4207442b828953a72c51a919c332cba5c", + "model_size": "24.56 GB", + "model_dtype": "torch.float16", + "lighteval_sha": "0f318ecf002208468154899217b3ba7c6ae09374", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|drop|3": { + "em": 0.001153523489932886, + "em_stderr": 0.0003476179896857104, + "f1": 0.04942743288590626, + "f1_stderr": 0.001208970062104149 + }, + "harness|gsm8k|5": { + "acc": 0.10083396512509477, + "acc_stderr": 0.008294031192126591 + }, + "harness|winogrande|5": { + "acc": 0.6740331491712708, + "acc_stderr": 0.013173782636922189 + }, + "all": { + "em": 0.001153523489932886, + "em_stderr": 0.0003476179896857104, + "f1": 0.04942743288590626, + "f1_stderr": 0.001208970062104149, + "acc": 0.3874335571481828, + "acc_stderr": 0.01073390691452439 + } + }, + "versions": { + "harness|drop|3": 1, + "harness|gsm8k|5": 0, + "harness|winogrande|5": 0, + "all": 0 + }, + "config_tasks": { + "harness|drop": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|drop|3": { + "hashes": { + "hash_examples": "1d27416e8324e9a3", + "hash_full_prompts": "a5513ff9a741b385", + "hash_input_tokens": "fbe785c62d941897", + "hash_cont_tokens": "d090f7d4f9bda42a" + }, + "truncated": 0, + "non-truncated": 9536, + "padded": 0, + "non-padded": 9536, + "effective_few_shots": 3.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "bda342e47b5099b2", + "hash_cont_tokens": "d42c2f1aaa804f68" + }, + "truncated": 0, + "non-truncated": 1319, + "padded": 0, + "non-padded": 1319, + "effective_few_shots": 5.0, + 
"num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "c0bedf98cb040854", + "hash_cont_tokens": "f08975ad6f2d5864" + }, + "truncated": 0, + "non-truncated": 2534, + "padded": 2432, + "non-padded": 102, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "9b4d8993161e637d", + "hash_full_prompts": "08215e527b7e60a5", + "hash_input_tokens": "9f72a6bc6743d18f", + "hash_cont_tokens": "39edfaaac5c5f2a9" + }, + "total_evaluation_time_secondes": "13279.365068674088", + "truncated": 0, + "non-truncated": 13389, + "padded": 2432, + "non-padded": 10957, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/TheBloke/CodeLlama-34B-Instruct-fp16/results_2023-08-26T01-22-34.444520.json b/eval-results/TheBloke/CodeLlama-34B-Instruct-fp16/results_2023-08-26T01-22-34.444520.json new file mode 100644 index 0000000000000000000000000000000000000000..94a171213bf86b0cc9369f2a377a857b3863c52d --- /dev/null +++ b/eval-results/TheBloke/CodeLlama-34B-Instruct-fp16/results_2023-08-26T01-22-34.444520.json @@ -0,0 +1,1366 @@ +{ + "config_general": { + "model_name": "TheBloke/CodeLlama-34B-Instruct-fp16", + "model_sha": "a4d0ce949de4d5b5f74691641efb5b70736a32a8", + "model_dtype": "torch.float16", + "lighteval_sha": "578835f70c499eaf870208de093513e08f864581", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|arc:challenge|25": { + "acc": 0.3796928327645051, + "acc_stderr": 0.014182119866974876, + "acc_norm": 0.40784982935153585, + "acc_norm_stderr": 0.014361097288449708 + }, + "harness|hellaswag|10": { + "acc": 0.2998406691894045, + "acc_stderr": 0.004572515919210699, + "acc_norm": 0.35660227046405096, + "acc_norm_stderr": 0.00478016987333286 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.36, + "acc_stderr": 0.04824181513244218, + "acc_norm": 0.36, + "acc_norm_stderr": 0.04824181513244218 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.35555555555555557, + "acc_stderr": 0.04135176749720386, + "acc_norm": 0.35555555555555557, + "acc_norm_stderr": 0.04135176749720386 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.3684210526315789, + "acc_stderr": 0.03925523381052932, + "acc_norm": 0.3684210526315789, + "acc_norm_stderr": 0.03925523381052932 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.44, + "acc_stderr": 0.04988876515698589, + "acc_norm": 0.44, + "acc_norm_stderr": 0.04988876515698589 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.4037735849056604, + "acc_stderr": 0.030197611600197953, + "acc_norm": 0.4037735849056604, + "acc_norm_stderr": 0.030197611600197953 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.3680555555555556, + "acc_stderr": 0.04032999053960718, + "acc_norm": 0.3680555555555556, + "acc_norm_stderr": 0.04032999053960718 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.29, + "acc_stderr": 0.045604802157206845, + "acc_norm": 0.29, + "acc_norm_stderr": 0.045604802157206845 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.33, + "acc_stderr": 0.047258156262526045, + "acc_norm": 0.33, + "acc_norm_stderr": 0.047258156262526045 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.41, + "acc_stderr": 0.04943110704237102, + "acc_norm": 0.41, + 
"acc_norm_stderr": 0.04943110704237102 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.3583815028901734, + "acc_stderr": 0.036563436533531585, + "acc_norm": 0.3583815028901734, + "acc_norm_stderr": 0.036563436533531585 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.22549019607843138, + "acc_stderr": 0.041583075330832865, + "acc_norm": 0.22549019607843138, + "acc_norm_stderr": 0.041583075330832865 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.54, + "acc_stderr": 0.05009082659620332, + "acc_norm": 0.54, + "acc_norm_stderr": 0.05009082659620332 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.3872340425531915, + "acc_stderr": 0.03184389265339525, + "acc_norm": 0.3872340425531915, + "acc_norm_stderr": 0.03184389265339525 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.2543859649122807, + "acc_stderr": 0.04096985139843672, + "acc_norm": 0.2543859649122807, + "acc_norm_stderr": 0.04096985139843672 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.3448275862068966, + "acc_stderr": 0.03960933549451208, + "acc_norm": 0.3448275862068966, + "acc_norm_stderr": 0.03960933549451208 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.30423280423280424, + "acc_stderr": 0.02369541500946309, + "acc_norm": 0.30423280423280424, + "acc_norm_stderr": 0.02369541500946309 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.30952380952380953, + "acc_stderr": 0.04134913018303316, + "acc_norm": 0.30952380952380953, + "acc_norm_stderr": 0.04134913018303316 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.32, + "acc_stderr": 0.046882617226215034, + "acc_norm": 0.32, + "acc_norm_stderr": 0.046882617226215034 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.44193548387096776, + "acc_stderr": 0.02825155790684974, + "acc_norm": 0.44193548387096776, + "acc_norm_stderr": 0.02825155790684974 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.3645320197044335, + "acc_stderr": 0.033864057460620905, + "acc_norm": 0.3645320197044335, + "acc_norm_stderr": 0.033864057460620905 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.42, + "acc_stderr": 0.049604496374885836, + "acc_norm": 0.42, + "acc_norm_stderr": 0.049604496374885836 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.26666666666666666, + "acc_stderr": 0.03453131801885415, + "acc_norm": 0.26666666666666666, + "acc_norm_stderr": 0.03453131801885415 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.5202020202020202, + "acc_stderr": 0.03559443565563918, + "acc_norm": 0.5202020202020202, + "acc_norm_stderr": 0.03559443565563918 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.533678756476684, + "acc_stderr": 0.036002440698671784, + "acc_norm": 0.533678756476684, + "acc_norm_stderr": 0.036002440698671784 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.3564102564102564, + "acc_stderr": 0.024283140529467295, + "acc_norm": 0.3564102564102564, + "acc_norm_stderr": 0.024283140529467295 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.3111111111111111, + "acc_stderr": 0.028226446749683515, + "acc_norm": 0.3111111111111111, + "acc_norm_stderr": 0.028226446749683515 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.39915966386554624, + "acc_stderr": 0.031811100324139245, + "acc_norm": 0.39915966386554624, + "acc_norm_stderr": 
0.031811100324139245 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.31125827814569534, + "acc_stderr": 0.03780445850526733, + "acc_norm": 0.31125827814569534, + "acc_norm_stderr": 0.03780445850526733 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.45321100917431195, + "acc_stderr": 0.021343255165546037, + "acc_norm": 0.45321100917431195, + "acc_norm_stderr": 0.021343255165546037 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.24074074074074073, + "acc_stderr": 0.029157522184605596, + "acc_norm": 0.24074074074074073, + "acc_norm_stderr": 0.029157522184605596 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.28431372549019607, + "acc_stderr": 0.031660096793998116, + "acc_norm": 0.28431372549019607, + "acc_norm_stderr": 0.031660096793998116 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.4092827004219409, + "acc_stderr": 0.032007041833595914, + "acc_norm": 0.4092827004219409, + "acc_norm_stderr": 0.032007041833595914 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.4260089686098655, + "acc_stderr": 0.0331883328621728, + "acc_norm": 0.4260089686098655, + "acc_norm_stderr": 0.0331883328621728 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.4351145038167939, + "acc_stderr": 0.04348208051644858, + "acc_norm": 0.4351145038167939, + "acc_norm_stderr": 0.04348208051644858 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.5702479338842975, + "acc_stderr": 0.04519082021319773, + "acc_norm": 0.5702479338842975, + "acc_norm_stderr": 0.04519082021319773 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.5370370370370371, + "acc_stderr": 0.04820403072760628, + "acc_norm": 0.5370370370370371, + "acc_norm_stderr": 0.04820403072760628 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.3803680981595092, + "acc_stderr": 0.03814269893261837, + "acc_norm": 0.3803680981595092, + "acc_norm_stderr": 0.03814269893261837 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.26785714285714285, + "acc_stderr": 0.04203277291467762, + "acc_norm": 0.26785714285714285, + "acc_norm_stderr": 0.04203277291467762 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.5436893203883495, + "acc_stderr": 0.049318019942204146, + "acc_norm": 0.5436893203883495, + "acc_norm_stderr": 0.049318019942204146 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.6196581196581197, + "acc_stderr": 0.03180425204384099, + "acc_norm": 0.6196581196581197, + "acc_norm_stderr": 0.03180425204384099 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.45, + "acc_stderr": 0.05, + "acc_norm": 0.45, + "acc_norm_stderr": 0.05 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.565772669220945, + "acc_stderr": 0.017724589389677785, + "acc_norm": 0.565772669220945, + "acc_norm_stderr": 0.017724589389677785 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.41040462427745666, + "acc_stderr": 0.02648339204209818, + "acc_norm": 0.41040462427745666, + "acc_norm_stderr": 0.02648339204209818 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.20446927374301677, + "acc_stderr": 0.013488813404711917, + "acc_norm": 0.20446927374301677, + "acc_norm_stderr": 0.013488813404711917 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.4117647058823529, + "acc_stderr": 0.02818059632825929, + "acc_norm": 0.4117647058823529, + "acc_norm_stderr": 0.02818059632825929 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.5048231511254019, + 
"acc_stderr": 0.028396770444111298, + "acc_norm": 0.5048231511254019, + "acc_norm_stderr": 0.028396770444111298 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.4567901234567901, + "acc_stderr": 0.02771666165019404, + "acc_norm": 0.4567901234567901, + "acc_norm_stderr": 0.02771666165019404 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.31560283687943264, + "acc_stderr": 0.027724989449509314, + "acc_norm": 0.31560283687943264, + "acc_norm_stderr": 0.027724989449509314 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.27835723598435463, + "acc_stderr": 0.011446990197380985, + "acc_norm": 0.27835723598435463, + "acc_norm_stderr": 0.011446990197380985 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.3272058823529412, + "acc_stderr": 0.028501452860396563, + "acc_norm": 0.3272058823529412, + "acc_norm_stderr": 0.028501452860396563 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.3431372549019608, + "acc_stderr": 0.019206606848825365, + "acc_norm": 0.3431372549019608, + "acc_norm_stderr": 0.019206606848825365 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.5272727272727272, + "acc_stderr": 0.04782001791380061, + "acc_norm": 0.5272727272727272, + "acc_norm_stderr": 0.04782001791380061 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.2857142857142857, + "acc_stderr": 0.0289205832206756, + "acc_norm": 0.2857142857142857, + "acc_norm_stderr": 0.0289205832206756 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.48258706467661694, + "acc_stderr": 0.03533389234739244, + "acc_norm": 0.48258706467661694, + "acc_norm_stderr": 0.03533389234739244 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.57, + "acc_stderr": 0.04975698519562428, + "acc_norm": 0.57, + "acc_norm_stderr": 0.04975698519562428 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.4036144578313253, + "acc_stderr": 0.03819486140758398, + "acc_norm": 0.4036144578313253, + "acc_norm_stderr": 0.03819486140758398 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.6432748538011696, + "acc_stderr": 0.03674013002860954, + "acc_norm": 0.6432748538011696, + "acc_norm_stderr": 0.03674013002860954 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.29008567931456547, + "mc1_stderr": 0.01588623687420952, + "mc2": 0.4428923144531004, + "mc2_stderr": 0.014810370517699043 + }, + "all": { + "acc": 0.39529982814936127, + "acc_stderr": 0.03498378261854782, + "acc_norm": 0.39673912641820325, + "acc_norm_stderr": 0.03499033569271049, + "mc1": 0.29008567931456547, + "mc1_stderr": 0.01588623687420952, + "mc2": 0.4428923144531004, + "mc2_stderr": 0.014810370517699043 + } + }, + "versions": { + "harness|arc:challenge|25": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + 
"harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "all": 0 + }, + "config_tasks": { + "harness|arc:challenge": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + 
"harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task" + }, + "summary_tasks": { + "harness|arc:challenge|25": { + "hashes": { + "hash_examples": "17b0cae357c0259e", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "3722289b79076c44", + "hash_cont_tokens": "ede2b335438f08e9" + }, + "truncated": 0, + "non-truncated": 4687, + "padded": 4687, + "non-padded": 0, + "effective_few_shots": 25.0, + "num_truncated_few_shots": 0 + }, + "harness|hellaswag|10": { + "hashes": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "ececd684171f1ef2", + "hash_cont_tokens": "b41cf1ad182d68d5" + }, + "truncated": 0, + "non-truncated": 40168, + "padded": 40113, + "non-padded": 55, + "effective_few_shots": 10.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hashes": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "c54ff61ad0273dd7", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + 
"num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-anatomy|5": { + "hashes": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "be31a1e22aef5f90", + "hash_cont_tokens": "f11971a765cb609f" + }, + "truncated": 0, + "non-truncated": 540, + "padded": 540, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-astronomy|5": { + "hashes": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "277a7b1fad566940", + "hash_cont_tokens": "238bd86950544b29" + }, + "truncated": 0, + "non-truncated": 608, + "padded": 608, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-business_ethics|5": { + "hashes": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "ba552605bc116de5", + "hash_cont_tokens": "f9d6d2a7d7e9a041" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hashes": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "428c7563d0b98ab9", + "hash_cont_tokens": "6af58623d0d5fbcd" + }, + "truncated": 0, + "non-truncated": 1060, + "padded": 1060, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_biology|5": { + "hashes": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "da036601573942e2", + "hash_cont_tokens": "875cde3af7a0ee14" + }, + "truncated": 0, + "non-truncated": 576, + "padded": 576, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_chemistry|5": { + "hashes": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "94e0196d6aded13d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_computer_science|5": { + "hashes": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "6e4d0f4a8d36690b", + "hash_cont_tokens": "1ba0c71186b1505e" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_mathematics|5": { + "hashes": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "614054d17109a25d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_medicine|5": { + "hashes": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "081bb2b524defd1c", + "hash_cont_tokens": "702fb6d82ff0d6ac" + }, + "truncated": 0, + "non-truncated": 692, + "padded": 692, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_physics|5": { + "hashes": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": 
"833a0d7b55aed500", + "hash_input_tokens": "5421d9a1af86cbd4", + "hash_cont_tokens": "f7b8097afc16a47c" + }, + "truncated": 0, + "non-truncated": 408, + "padded": 408, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-computer_security|5": { + "hashes": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "5e6b70ecb333cf18", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hashes": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "c2ef11a87264ceed", + "hash_cont_tokens": "aa0e8bc655f2f641" + }, + "truncated": 0, + "non-truncated": 940, + "padded": 940, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-econometrics|5": { + "hashes": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "ecaccd912a4c3978", + "hash_cont_tokens": "a9b1f761089f6acc" + }, + "truncated": 0, + "non-truncated": 456, + "padded": 456, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hashes": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "1590c84291399be8", + "hash_cont_tokens": "2425a3f084a591ef" + }, + "truncated": 0, + "non-truncated": 580, + "padded": 580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hashes": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "3269597f715b0da1", + "hash_cont_tokens": "eb2d5002052b5bc5" + }, + "truncated": 0, + "non-truncated": 1512, + "padded": 1512, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-formal_logic|5": { + "hashes": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "a2800d20f3ab8d7c", + "hash_cont_tokens": "9b30dc19c9b62f60" + }, + "truncated": 0, + "non-truncated": 504, + "padded": 504, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-global_facts|5": { + "hashes": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "94ed44b3772505ad", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_biology|5": { + "hashes": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "24423acb928db768", + "hash_cont_tokens": "74217a4e2868536f" + }, + "truncated": 0, + "non-truncated": 1240, + "padded": 1240, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hashes": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "831ff35c474e5cef", + "hash_cont_tokens": "bf39544be0ebf000" + }, + "truncated": 0, + "non-truncated": 
812, + "padded": 812, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hashes": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "a20a96b44dcc5b30", + "hash_cont_tokens": "43570b3948564b64" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hashes": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "5002f4ac8b1562ca", + "hash_cont_tokens": "674fc454bdc5ac93" + }, + "truncated": 0, + "non-truncated": 660, + "padded": 656, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_geography|5": { + "hashes": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "7c5547c7da5bc793", + "hash_cont_tokens": "03a5012b916274ea" + }, + "truncated": 0, + "non-truncated": 792, + "padded": 792, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hashes": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "f62991cb6a496b05", + "hash_cont_tokens": "50ab225c2f535210" + }, + "truncated": 0, + "non-truncated": 772, + "padded": 772, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hashes": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "4cef2aff6e3d59ed", + "hash_cont_tokens": "c583432ad27fcfe0" + }, + "truncated": 0, + "non-truncated": 1560, + "padded": 1560, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hashes": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "6e2577ea4082ed2b", + "hash_cont_tokens": "1194078d4e38c984" + }, + "truncated": 0, + "non-truncated": 1080, + "padded": 1080, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hashes": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "c5fc9aeb1079c8e4", + "hash_cont_tokens": "f47f041de50333b9" + }, + "truncated": 0, + "non-truncated": 952, + "padded": 952, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_physics|5": { + "hashes": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "555fc385cffa84ca", + "hash_cont_tokens": "6296151cf7fee15c" + }, + "truncated": 0, + "non-truncated": 604, + "padded": 604, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hashes": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "febd23cbf9973b7f", + "hash_cont_tokens": "a490d3db0ea5935a" + }, + "truncated": 0, + "non-truncated": 2180, + "padded": 2180, + "non-padded": 0, + "effective_few_shots": 5.0, 
+ "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hashes": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "400e55b56ee6fbd7", + "hash_cont_tokens": "6830ef7d0325d7ef" + }, + "truncated": 0, + "non-truncated": 864, + "padded": 864, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hashes": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "c639cce12a46ebad", + "hash_cont_tokens": "cdd0b3dc06d933e5" + }, + "truncated": 0, + "non-truncated": 816, + "padded": 816, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hashes": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "b9762065cce6f3a6", + "hash_cont_tokens": "e0203e3fc1bb0500" + }, + "truncated": 0, + "non-truncated": 948, + "padded": 948, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_aging|5": { + "hashes": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "541a75f071dcf579", + "hash_cont_tokens": "142a4a8a1138a214" + }, + "truncated": 0, + "non-truncated": 892, + "padded": 892, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_sexuality|5": { + "hashes": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "04269e5c5a257dd9", + "hash_cont_tokens": "bc54813e809b796d" + }, + "truncated": 0, + "non-truncated": 524, + "padded": 524, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-international_law|5": { + "hashes": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "d93ba9d9d38e4397", + "hash_cont_tokens": "63435df622d5437b" + }, + "truncated": 0, + "non-truncated": 484, + "padded": 484, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-jurisprudence|5": { + "hashes": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "9eeaccd2698b4f5a", + "hash_cont_tokens": "e3a8cd951b6e3469" + }, + "truncated": 0, + "non-truncated": 432, + "padded": 432, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hashes": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "b4f08f544f2b7576", + "hash_cont_tokens": "5e6ee2ff0404f23c" + }, + "truncated": 0, + "non-truncated": 652, + "padded": 648, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-machine_learning|5": { + "hashes": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "900c2a51f1174b9f", + "hash_cont_tokens": "c81919424db3b267" + }, + "truncated": 0, + "non-truncated": 448, + "padded": 448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-management|5": { + "hashes": { + "hash_examples": "8c8a1e07a2151dca", + 
"hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "6b36efb4689c6eca", + "hash_cont_tokens": "a01d6d39a83c4597" + }, + "truncated": 0, + "non-truncated": 412, + "padded": 412, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-marketing|5": { + "hashes": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "2aaac78a0cfed47a", + "hash_cont_tokens": "6aeaed4d823c98aa" + }, + "truncated": 0, + "non-truncated": 936, + "padded": 936, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-medical_genetics|5": { + "hashes": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "886ca823b41c094a", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-miscellaneous|5": { + "hashes": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "72fd71de7675e7d0", + "hash_cont_tokens": "9b0ab02a64603081" + }, + "truncated": 0, + "non-truncated": 3132, + "padded": 3132, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_disputes|5": { + "hashes": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "f3ca0dd8e7a1eb09", + "hash_cont_tokens": "3b8bbe9108e55ce9" + }, + "truncated": 0, + "non-truncated": 1384, + "padded": 1354, + "non-padded": 30, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hashes": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "3e793631e951f23c", + "hash_cont_tokens": "2eae753a177d5460" + }, + "truncated": 0, + "non-truncated": 3580, + "padded": 3580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-nutrition|5": { + "hashes": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "59753c2144ea93af", + "hash_cont_tokens": "29771089bd3c65c6" + }, + "truncated": 0, + "non-truncated": 1224, + "padded": 1224, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-philosophy|5": { + "hashes": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "bd8d3dbed15a8c34", + "hash_cont_tokens": "9f6ff69d23a48783" + }, + "truncated": 0, + "non-truncated": 1244, + "padded": 1244, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-prehistory|5": { + "hashes": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "3573cd87facbb7c5", + "hash_cont_tokens": "a789a13af22308bf" + }, + "truncated": 0, + "non-truncated": 1296, + "padded": 1296, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_accounting|5": { + "hashes": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "17e721bc1a7cbb47", + "hash_cont_tokens": "5129a9cfb30c5239" + }, + "truncated": 0, + "non-truncated": 1128, 
+ "padded": 1128, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_law|5": { + "hashes": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "c9f7583fff66d361", + "hash_cont_tokens": "2e590029ef41fbcd" + }, + "truncated": 0, + "non-truncated": 6136, + "padded": 6136, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_medicine|5": { + "hashes": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "40a933f829116f8d", + "hash_cont_tokens": "cd82e108370cece8" + }, + "truncated": 0, + "non-truncated": 1088, + "padded": 1088, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_psychology|5": { + "hashes": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "0dfb73a8eb3f692c", + "hash_cont_tokens": "61ef0c8a87f9c92d" + }, + "truncated": 0, + "non-truncated": 2448, + "padded": 2448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-public_relations|5": { + "hashes": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "1710c6ba4c9f3cbd", + "hash_cont_tokens": "568f585a259965c1" + }, + "truncated": 0, + "non-truncated": 440, + "padded": 440, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-security_studies|5": { + "hashes": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "32a03f1f22a6e103", + "hash_cont_tokens": "d70cfe096d4fb7bd" + }, + "truncated": 0, + "non-truncated": 980, + "padded": 980, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-sociology|5": { + "hashes": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "828999f7624cbe7e", + "hash_cont_tokens": "c3a3bdfd177eed5b" + }, + "truncated": 0, + "non-truncated": 804, + "padded": 804, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hashes": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "42054621e718dbee", + "hash_cont_tokens": "2568d0e8e36fa959" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-virology|5": { + "hashes": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "6c4f0aa4dc859c04", + "hash_cont_tokens": "c178cccd753d9bc5" + }, + "truncated": 0, + "non-truncated": 664, + "padded": 664, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-world_religions|5": { + "hashes": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "6c75d44e092ff24f", + "hash_cont_tokens": "0a3a3ea5ef49d19c" + }, + "truncated": 0, + "non-truncated": 684, + "padded": 684, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|truthfulqa:mc|0": { + "hashes": { + 
"hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "2738d7ed7075faa7", + "hash_cont_tokens": "6d1691881e252df0" + }, + "truncated": 0, + "non-truncated": 9996, + "padded": 9996, + "non-padded": 0, + "effective_few_shots": 0.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "d84d18e9a963753d", + "hash_full_prompts": "12b540783521a8e6", + "hash_input_tokens": "5c73a7dce6ccf737", + "hash_cont_tokens": "f4b7b7f3a2788768" + }, + "total_evaluation_time_secondes": "23883.632033586502", + "truncated": 0, + "non-truncated": 111019, + "padded": 110926, + "non-padded": 93, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/TheBloke/CodeLlama-34B-Instruct-fp16/results_2023-10-22T08-36-03.546774.json b/eval-results/TheBloke/CodeLlama-34B-Instruct-fp16/results_2023-10-22T08-36-03.546774.json new file mode 100644 index 0000000000000000000000000000000000000000..e4a55ef247ec0d704ce00d040440756dbe85f12b --- /dev/null +++ b/eval-results/TheBloke/CodeLlama-34B-Instruct-fp16/results_2023-10-22T08-36-03.546774.json @@ -0,0 +1,107 @@ +{ + "config_general": { + "model_name": "TheBloke/CodeLlama-34B-Instruct-fp16", + "model_sha": "a4d0ce949de4d5b5f74691641efb5b70736a32a8", + "model_size": "63.23 GB", + "model_dtype": "torch.float16", + "lighteval_sha": "0f318ecf002208468154899217b3ba7c6ae09374", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|drop|3": { + "em": 0.0014681208053691276, + "em_stderr": 0.00039210421902985756, + "f1": 0.057836619127516906, + "f1_stderr": 0.0012992524934897988 + }, + "harness|gsm8k|5": { + "acc": 0.2304776345716452, + "acc_stderr": 0.011600249020595822 + }, + "harness|winogrande|5": { + "acc": 0.745067087608524, + "acc_stderr": 0.012248806969376422 + }, + "all": { + "em": 0.0014681208053691276, + "em_stderr": 0.00039210421902985756, + "f1": 0.057836619127516906, + "f1_stderr": 0.0012992524934897988, + "acc": 0.4877723610900846, + "acc_stderr": 0.011924527994986122 + } + }, + "versions": { + "harness|drop|3": 1, + "harness|gsm8k|5": 0, + "harness|winogrande|5": 0, + "all": 0 + }, + "config_tasks": { + "harness|drop": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|drop|3": { + "hashes": { + "hash_examples": "1d27416e8324e9a3", + "hash_full_prompts": "a5513ff9a741b385", + "hash_input_tokens": "fbe785c62d941897", + "hash_cont_tokens": "4ddcbefffe1b7d85" + }, + "truncated": 0, + "non-truncated": 9536, + "padded": 0, + "non-padded": 9536, + "effective_few_shots": 3.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "bda342e47b5099b2", + "hash_cont_tokens": "696c198058cd00d3" + }, + "truncated": 0, + "non-truncated": 1319, + "padded": 0, + "non-padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "c0bedf98cb040854", + "hash_cont_tokens": "f08975ad6f2d5864" + }, + "truncated": 0, + "non-truncated": 2534, + "padded": 2432, + "non-padded": 102, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": 
"9b4d8993161e637d", + "hash_full_prompts": "08215e527b7e60a5", + "hash_input_tokens": "9f72a6bc6743d18f", + "hash_cont_tokens": "83852f382f93e5eb" + }, + "total_evaluation_time_secondes": "28410.408175468445", + "truncated": 0, + "non-truncated": 13389, + "padded": 2432, + "non-padded": 10957, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/TheBloke/CodeLlama-34B-Python-fp16/results_2023-08-26T02-33-13.745130.json b/eval-results/TheBloke/CodeLlama-34B-Python-fp16/results_2023-08-26T02-33-13.745130.json new file mode 100644 index 0000000000000000000000000000000000000000..04099b09741625ad8dcb361b9a5de58b55384c81 --- /dev/null +++ b/eval-results/TheBloke/CodeLlama-34B-Python-fp16/results_2023-08-26T02-33-13.745130.json @@ -0,0 +1,1366 @@ +{ + "config_general": { + "model_name": "TheBloke/CodeLlama-34B-Python-fp16", + "model_sha": "875f9d97fb6c9619d8867887dd1d80918ff0f593", + "model_dtype": "torch.float16", + "lighteval_sha": "578835f70c499eaf870208de093513e08f864581", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|arc:challenge|25": { + "acc": 0.3575085324232082, + "acc_stderr": 0.01400549427591657, + "acc_norm": 0.38139931740614336, + "acc_norm_stderr": 0.014194389086685268 + }, + "harness|hellaswag|10": { + "acc": 0.29924317864967137, + "acc_stderr": 0.004569906485090286, + "acc_norm": 0.3480382393945429, + "acc_norm_stderr": 0.004753746951620155 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.3, + "acc_stderr": 0.046056618647183814, + "acc_norm": 0.3, + "acc_norm_stderr": 0.046056618647183814 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.32592592592592595, + "acc_stderr": 0.040491220417025055, + "acc_norm": 0.32592592592592595, + "acc_norm_stderr": 0.040491220417025055 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.3223684210526316, + "acc_stderr": 0.03803510248351585, + "acc_norm": 0.3223684210526316, + "acc_norm_stderr": 0.03803510248351585 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.27, + "acc_stderr": 0.04461960433384741, + "acc_norm": 0.27, + "acc_norm_stderr": 0.04461960433384741 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.3660377358490566, + "acc_stderr": 0.02964781353936523, + "acc_norm": 0.3660377358490566, + "acc_norm_stderr": 0.02964781353936523 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.3819444444444444, + "acc_stderr": 0.040629907841466674, + "acc_norm": 0.3819444444444444, + "acc_norm_stderr": 0.040629907841466674 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.42, + "acc_stderr": 0.049604496374885836, + "acc_norm": 0.42, + "acc_norm_stderr": 0.049604496374885836 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.34, + "acc_stderr": 0.04760952285695235, + "acc_norm": 0.34, + "acc_norm_stderr": 0.04760952285695235 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.32, + "acc_stderr": 0.04688261722621504, + "acc_norm": 0.32, + "acc_norm_stderr": 0.04688261722621504 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.3236994219653179, + "acc_stderr": 0.035676037996391685, + "acc_norm": 0.3236994219653179, + "acc_norm_stderr": 0.035676037996391685 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.28431372549019607, + "acc_stderr": 0.04488482852329017, + "acc_norm": 0.28431372549019607, + "acc_norm_stderr": 0.04488482852329017 + }, + 
"harness|hendrycksTest-computer_security|5": { + "acc": 0.41, + "acc_stderr": 0.049431107042371025, + "acc_norm": 0.41, + "acc_norm_stderr": 0.049431107042371025 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.3191489361702128, + "acc_stderr": 0.03047297336338005, + "acc_norm": 0.3191489361702128, + "acc_norm_stderr": 0.03047297336338005 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.22807017543859648, + "acc_stderr": 0.03947152782669415, + "acc_norm": 0.22807017543859648, + "acc_norm_stderr": 0.03947152782669415 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.27586206896551724, + "acc_stderr": 0.037245636197746325, + "acc_norm": 0.27586206896551724, + "acc_norm_stderr": 0.037245636197746325 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.25925925925925924, + "acc_stderr": 0.022569897074918417, + "acc_norm": 0.25925925925925924, + "acc_norm_stderr": 0.022569897074918417 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.3253968253968254, + "acc_stderr": 0.04190596438871136, + "acc_norm": 0.3253968253968254, + "acc_norm_stderr": 0.04190596438871136 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.21, + "acc_stderr": 0.040936018074033256, + "acc_norm": 0.21, + "acc_norm_stderr": 0.040936018074033256 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.3580645161290323, + "acc_stderr": 0.027273890594300642, + "acc_norm": 0.3580645161290323, + "acc_norm_stderr": 0.027273890594300642 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.2019704433497537, + "acc_stderr": 0.028247350122180284, + "acc_norm": 0.2019704433497537, + "acc_norm_stderr": 0.028247350122180284 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.37, + "acc_stderr": 0.04852365870939098, + "acc_norm": 0.37, + "acc_norm_stderr": 0.04852365870939098 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.2545454545454545, + "acc_stderr": 0.03401506715249039, + "acc_norm": 0.2545454545454545, + "acc_norm_stderr": 0.03401506715249039 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.398989898989899, + "acc_stderr": 0.0348890161685273, + "acc_norm": 0.398989898989899, + "acc_norm_stderr": 0.0348890161685273 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.47668393782383417, + "acc_stderr": 0.03604513672442206, + "acc_norm": 0.47668393782383417, + "acc_norm_stderr": 0.03604513672442206 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.3435897435897436, + "acc_stderr": 0.02407869658063547, + "acc_norm": 0.3435897435897436, + "acc_norm_stderr": 0.02407869658063547 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.2074074074074074, + "acc_stderr": 0.024720713193952158, + "acc_norm": 0.2074074074074074, + "acc_norm_stderr": 0.024720713193952158 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.36134453781512604, + "acc_stderr": 0.031204691225150016, + "acc_norm": 0.36134453781512604, + "acc_norm_stderr": 0.031204691225150016 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.25165562913907286, + "acc_stderr": 0.035433042343899844, + "acc_norm": 0.25165562913907286, + "acc_norm_stderr": 0.035433042343899844 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.3596330275229358, + "acc_stderr": 0.020575234660123783, + "acc_norm": 0.3596330275229358, + "acc_norm_stderr": 0.020575234660123783 + }, + 
"harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.30092592592592593, + "acc_stderr": 0.031280390843298804, + "acc_norm": 0.30092592592592593, + "acc_norm_stderr": 0.031280390843298804 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.27450980392156865, + "acc_stderr": 0.03132179803083292, + "acc_norm": 0.27450980392156865, + "acc_norm_stderr": 0.03132179803083292 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.39662447257383965, + "acc_stderr": 0.03184399873811224, + "acc_norm": 0.39662447257383965, + "acc_norm_stderr": 0.03184399873811224 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.32286995515695066, + "acc_stderr": 0.031381476375754995, + "acc_norm": 0.32286995515695066, + "acc_norm_stderr": 0.031381476375754995 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.29770992366412213, + "acc_stderr": 0.04010358942462203, + "acc_norm": 0.29770992366412213, + "acc_norm_stderr": 0.04010358942462203 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.33884297520661155, + "acc_stderr": 0.043207678075366705, + "acc_norm": 0.33884297520661155, + "acc_norm_stderr": 0.043207678075366705 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.3055555555555556, + "acc_stderr": 0.044531975073749834, + "acc_norm": 0.3055555555555556, + "acc_norm_stderr": 0.044531975073749834 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.22699386503067484, + "acc_stderr": 0.032910995786157686, + "acc_norm": 0.22699386503067484, + "acc_norm_stderr": 0.032910995786157686 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.24107142857142858, + "acc_stderr": 0.04059867246952687, + "acc_norm": 0.24107142857142858, + "acc_norm_stderr": 0.04059867246952687 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.46601941747572817, + "acc_stderr": 0.04939291447273481, + "acc_norm": 0.46601941747572817, + "acc_norm_stderr": 0.04939291447273481 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.44871794871794873, + "acc_stderr": 0.0325833464938688, + "acc_norm": 0.44871794871794873, + "acc_norm_stderr": 0.0325833464938688 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.31, + "acc_stderr": 0.04648231987117316, + "acc_norm": 0.31, + "acc_norm_stderr": 0.04648231987117316 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.4508301404853129, + "acc_stderr": 0.01779329757269904, + "acc_norm": 0.4508301404853129, + "acc_norm_stderr": 0.01779329757269904 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.34104046242774566, + "acc_stderr": 0.025522474632121615, + "acc_norm": 0.34104046242774566, + "acc_norm_stderr": 0.025522474632121615 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.2636871508379888, + "acc_stderr": 0.014736926383761964, + "acc_norm": 0.2636871508379888, + "acc_norm_stderr": 0.014736926383761964 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.38235294117647056, + "acc_stderr": 0.027826109307283686, + "acc_norm": 0.38235294117647056, + "acc_norm_stderr": 0.027826109307283686 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.43086816720257237, + "acc_stderr": 0.028125340983972718, + "acc_norm": 0.43086816720257237, + "acc_norm_stderr": 0.028125340983972718 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.3055555555555556, + "acc_stderr": 0.025630824975621344, + "acc_norm": 0.3055555555555556, + "acc_norm_stderr": 0.025630824975621344 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 
0.25886524822695034, + "acc_stderr": 0.026129572527180848, + "acc_norm": 0.25886524822695034, + "acc_norm_stderr": 0.026129572527180848 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.26401564537157757, + "acc_stderr": 0.011258435537723818, + "acc_norm": 0.26401564537157757, + "acc_norm_stderr": 0.011258435537723818 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.4007352941176471, + "acc_stderr": 0.029768263528933102, + "acc_norm": 0.4007352941176471, + "acc_norm_stderr": 0.029768263528933102 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.2875816993464052, + "acc_stderr": 0.018311653053648222, + "acc_norm": 0.2875816993464052, + "acc_norm_stderr": 0.018311653053648222 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.34545454545454546, + "acc_stderr": 0.04554619617541054, + "acc_norm": 0.34545454545454546, + "acc_norm_stderr": 0.04554619617541054 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.37142857142857144, + "acc_stderr": 0.030932858792789855, + "acc_norm": 0.37142857142857144, + "acc_norm_stderr": 0.030932858792789855 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.2736318407960199, + "acc_stderr": 0.03152439186555401, + "acc_norm": 0.2736318407960199, + "acc_norm_stderr": 0.03152439186555401 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.34, + "acc_stderr": 0.04760952285695235, + "acc_norm": 0.34, + "acc_norm_stderr": 0.04760952285695235 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.3132530120481928, + "acc_stderr": 0.03610805018031023, + "acc_norm": 0.3132530120481928, + "acc_norm_stderr": 0.03610805018031023 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.5555555555555556, + "acc_stderr": 0.0381107966983353, + "acc_norm": 0.5555555555555556, + "acc_norm_stderr": 0.0381107966983353 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.2778457772337821, + "mc1_stderr": 0.015680929364024626, + "mc2": 0.43567105267740514, + "mc2_stderr": 0.014685884652076228 + }, + "all": { + "acc": 0.32944678557923035, + "acc_stderr": 0.0339038417486707, + "acc_norm": 0.3306787490661423, + "acc_norm_stderr": 0.03391015929574356, + "mc1": 0.2778457772337821, + "mc1_stderr": 0.015680929364024626, + "mc2": 0.43567105267740514, + "mc2_stderr": 0.014685884652076228 + } + }, + "versions": { + "harness|arc:challenge|25": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + 
"harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "all": 0 + }, + "config_tasks": { + "harness|arc:challenge": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + 
"harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task" + }, + "summary_tasks": { + "harness|arc:challenge|25": { + "hashes": { + "hash_examples": "17b0cae357c0259e", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "3722289b79076c44", + "hash_cont_tokens": "ede2b335438f08e9" + }, + "truncated": 0, + "non-truncated": 4687, + "padded": 4687, + "non-padded": 0, + "effective_few_shots": 25.0, + "num_truncated_few_shots": 0 + }, + "harness|hellaswag|10": { + "hashes": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "ececd684171f1ef2", + "hash_cont_tokens": "b41cf1ad182d68d5" + }, + "truncated": 0, + "non-truncated": 40168, + "padded": 40113, + "non-padded": 55, + "effective_few_shots": 10.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hashes": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "c54ff61ad0273dd7", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-anatomy|5": { + "hashes": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "be31a1e22aef5f90", + "hash_cont_tokens": "f11971a765cb609f" + }, + "truncated": 0, + "non-truncated": 540, + "padded": 540, + "non-padded": 0, + "effective_few_shots": 5.0, + 
"num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-astronomy|5": { + "hashes": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "277a7b1fad566940", + "hash_cont_tokens": "238bd86950544b29" + }, + "truncated": 0, + "non-truncated": 608, + "padded": 608, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-business_ethics|5": { + "hashes": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "ba552605bc116de5", + "hash_cont_tokens": "f9d6d2a7d7e9a041" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hashes": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "428c7563d0b98ab9", + "hash_cont_tokens": "6af58623d0d5fbcd" + }, + "truncated": 0, + "non-truncated": 1060, + "padded": 1060, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_biology|5": { + "hashes": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "da036601573942e2", + "hash_cont_tokens": "875cde3af7a0ee14" + }, + "truncated": 0, + "non-truncated": 576, + "padded": 576, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_chemistry|5": { + "hashes": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "94e0196d6aded13d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_computer_science|5": { + "hashes": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "6e4d0f4a8d36690b", + "hash_cont_tokens": "1ba0c71186b1505e" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_mathematics|5": { + "hashes": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "614054d17109a25d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_medicine|5": { + "hashes": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "081bb2b524defd1c", + "hash_cont_tokens": "702fb6d82ff0d6ac" + }, + "truncated": 0, + "non-truncated": 692, + "padded": 692, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_physics|5": { + "hashes": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "5421d9a1af86cbd4", + "hash_cont_tokens": "f7b8097afc16a47c" + }, + "truncated": 0, + "non-truncated": 408, + "padded": 408, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-computer_security|5": { + "hashes": { + "hash_examples": "006451eedc0ededb", + 
"hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "5e6b70ecb333cf18", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hashes": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "c2ef11a87264ceed", + "hash_cont_tokens": "aa0e8bc655f2f641" + }, + "truncated": 0, + "non-truncated": 940, + "padded": 940, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-econometrics|5": { + "hashes": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "ecaccd912a4c3978", + "hash_cont_tokens": "a9b1f761089f6acc" + }, + "truncated": 0, + "non-truncated": 456, + "padded": 456, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hashes": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "1590c84291399be8", + "hash_cont_tokens": "2425a3f084a591ef" + }, + "truncated": 0, + "non-truncated": 580, + "padded": 580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hashes": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "3269597f715b0da1", + "hash_cont_tokens": "eb2d5002052b5bc5" + }, + "truncated": 0, + "non-truncated": 1512, + "padded": 1512, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-formal_logic|5": { + "hashes": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "a2800d20f3ab8d7c", + "hash_cont_tokens": "9b30dc19c9b62f60" + }, + "truncated": 0, + "non-truncated": 504, + "padded": 504, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-global_facts|5": { + "hashes": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "94ed44b3772505ad", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_biology|5": { + "hashes": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "24423acb928db768", + "hash_cont_tokens": "74217a4e2868536f" + }, + "truncated": 0, + "non-truncated": 1240, + "padded": 1240, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hashes": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "831ff35c474e5cef", + "hash_cont_tokens": "bf39544be0ebf000" + }, + "truncated": 0, + "non-truncated": 812, + "padded": 812, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hashes": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "a20a96b44dcc5b30", + "hash_cont_tokens": "43570b3948564b64" + }, + 
"truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hashes": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "5002f4ac8b1562ca", + "hash_cont_tokens": "674fc454bdc5ac93" + }, + "truncated": 0, + "non-truncated": 660, + "padded": 656, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_geography|5": { + "hashes": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "7c5547c7da5bc793", + "hash_cont_tokens": "03a5012b916274ea" + }, + "truncated": 0, + "non-truncated": 792, + "padded": 792, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hashes": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "f62991cb6a496b05", + "hash_cont_tokens": "50ab225c2f535210" + }, + "truncated": 0, + "non-truncated": 772, + "padded": 772, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hashes": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "4cef2aff6e3d59ed", + "hash_cont_tokens": "c583432ad27fcfe0" + }, + "truncated": 0, + "non-truncated": 1560, + "padded": 1560, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hashes": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "6e2577ea4082ed2b", + "hash_cont_tokens": "1194078d4e38c984" + }, + "truncated": 0, + "non-truncated": 1080, + "padded": 1080, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hashes": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "c5fc9aeb1079c8e4", + "hash_cont_tokens": "f47f041de50333b9" + }, + "truncated": 0, + "non-truncated": 952, + "padded": 952, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_physics|5": { + "hashes": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "555fc385cffa84ca", + "hash_cont_tokens": "6296151cf7fee15c" + }, + "truncated": 0, + "non-truncated": 604, + "padded": 604, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hashes": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "febd23cbf9973b7f", + "hash_cont_tokens": "a490d3db0ea5935a" + }, + "truncated": 0, + "non-truncated": 2180, + "padded": 2180, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hashes": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "400e55b56ee6fbd7", + "hash_cont_tokens": "6830ef7d0325d7ef" + }, + "truncated": 0, + "non-truncated": 864, + "padded": 864, + "non-padded": 0, + 
"effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hashes": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "c639cce12a46ebad", + "hash_cont_tokens": "cdd0b3dc06d933e5" + }, + "truncated": 0, + "non-truncated": 816, + "padded": 816, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hashes": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "b9762065cce6f3a6", + "hash_cont_tokens": "e0203e3fc1bb0500" + }, + "truncated": 0, + "non-truncated": 948, + "padded": 948, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_aging|5": { + "hashes": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "541a75f071dcf579", + "hash_cont_tokens": "142a4a8a1138a214" + }, + "truncated": 0, + "non-truncated": 892, + "padded": 892, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_sexuality|5": { + "hashes": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "04269e5c5a257dd9", + "hash_cont_tokens": "bc54813e809b796d" + }, + "truncated": 0, + "non-truncated": 524, + "padded": 524, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-international_law|5": { + "hashes": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "d93ba9d9d38e4397", + "hash_cont_tokens": "63435df622d5437b" + }, + "truncated": 0, + "non-truncated": 484, + "padded": 484, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-jurisprudence|5": { + "hashes": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "9eeaccd2698b4f5a", + "hash_cont_tokens": "e3a8cd951b6e3469" + }, + "truncated": 0, + "non-truncated": 432, + "padded": 432, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hashes": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "b4f08f544f2b7576", + "hash_cont_tokens": "5e6ee2ff0404f23c" + }, + "truncated": 0, + "non-truncated": 652, + "padded": 648, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-machine_learning|5": { + "hashes": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "900c2a51f1174b9f", + "hash_cont_tokens": "c81919424db3b267" + }, + "truncated": 0, + "non-truncated": 448, + "padded": 448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-management|5": { + "hashes": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "6b36efb4689c6eca", + "hash_cont_tokens": "a01d6d39a83c4597" + }, + "truncated": 0, + "non-truncated": 412, + "padded": 412, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-marketing|5": { + "hashes": { + "hash_examples": "2668953431f91e96", + 
"hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "2aaac78a0cfed47a", + "hash_cont_tokens": "6aeaed4d823c98aa" + }, + "truncated": 0, + "non-truncated": 936, + "padded": 936, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-medical_genetics|5": { + "hashes": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "886ca823b41c094a", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-miscellaneous|5": { + "hashes": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "72fd71de7675e7d0", + "hash_cont_tokens": "9b0ab02a64603081" + }, + "truncated": 0, + "non-truncated": 3132, + "padded": 3132, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_disputes|5": { + "hashes": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "f3ca0dd8e7a1eb09", + "hash_cont_tokens": "3b8bbe9108e55ce9" + }, + "truncated": 0, + "non-truncated": 1384, + "padded": 1354, + "non-padded": 30, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hashes": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "3e793631e951f23c", + "hash_cont_tokens": "2eae753a177d5460" + }, + "truncated": 0, + "non-truncated": 3580, + "padded": 3580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-nutrition|5": { + "hashes": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "59753c2144ea93af", + "hash_cont_tokens": "29771089bd3c65c6" + }, + "truncated": 0, + "non-truncated": 1224, + "padded": 1224, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-philosophy|5": { + "hashes": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "bd8d3dbed15a8c34", + "hash_cont_tokens": "9f6ff69d23a48783" + }, + "truncated": 0, + "non-truncated": 1244, + "padded": 1244, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-prehistory|5": { + "hashes": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "3573cd87facbb7c5", + "hash_cont_tokens": "a789a13af22308bf" + }, + "truncated": 0, + "non-truncated": 1296, + "padded": 1296, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_accounting|5": { + "hashes": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "17e721bc1a7cbb47", + "hash_cont_tokens": "5129a9cfb30c5239" + }, + "truncated": 0, + "non-truncated": 1128, + "padded": 1128, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_law|5": { + "hashes": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "c9f7583fff66d361", + "hash_cont_tokens": "2e590029ef41fbcd" + }, + "truncated": 0, + 
"non-truncated": 6136, + "padded": 6136, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_medicine|5": { + "hashes": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "40a933f829116f8d", + "hash_cont_tokens": "cd82e108370cece8" + }, + "truncated": 0, + "non-truncated": 1088, + "padded": 1088, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_psychology|5": { + "hashes": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "0dfb73a8eb3f692c", + "hash_cont_tokens": "61ef0c8a87f9c92d" + }, + "truncated": 0, + "non-truncated": 2448, + "padded": 2448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-public_relations|5": { + "hashes": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "1710c6ba4c9f3cbd", + "hash_cont_tokens": "568f585a259965c1" + }, + "truncated": 0, + "non-truncated": 440, + "padded": 440, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-security_studies|5": { + "hashes": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "32a03f1f22a6e103", + "hash_cont_tokens": "d70cfe096d4fb7bd" + }, + "truncated": 0, + "non-truncated": 980, + "padded": 980, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-sociology|5": { + "hashes": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "828999f7624cbe7e", + "hash_cont_tokens": "c3a3bdfd177eed5b" + }, + "truncated": 0, + "non-truncated": 804, + "padded": 804, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hashes": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "42054621e718dbee", + "hash_cont_tokens": "2568d0e8e36fa959" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-virology|5": { + "hashes": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "6c4f0aa4dc859c04", + "hash_cont_tokens": "c178cccd753d9bc5" + }, + "truncated": 0, + "non-truncated": 664, + "padded": 664, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-world_religions|5": { + "hashes": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "6c75d44e092ff24f", + "hash_cont_tokens": "0a3a3ea5ef49d19c" + }, + "truncated": 0, + "non-truncated": 684, + "padded": 684, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|truthfulqa:mc|0": { + "hashes": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "2738d7ed7075faa7", + "hash_cont_tokens": "6d1691881e252df0" + }, + "truncated": 0, + "non-truncated": 9996, + "padded": 9996, + "non-padded": 0, + "effective_few_shots": 0.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + 
"hash_examples": "d84d18e9a963753d", + "hash_full_prompts": "12b540783521a8e6", + "hash_input_tokens": "5c73a7dce6ccf737", + "hash_cont_tokens": "f4b7b7f3a2788768" + }, + "total_evaluation_time_secondes": "23838.72892189026", + "truncated": 0, + "non-truncated": 111019, + "padded": 110926, + "non-padded": 93, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/TheBloke/CodeLlama-34B-Python-fp16/results_2023-10-22T22-16-27.646288.json b/eval-results/TheBloke/CodeLlama-34B-Python-fp16/results_2023-10-22T22-16-27.646288.json new file mode 100644 index 0000000000000000000000000000000000000000..9e3040dff6af6d1b14a82a50b076b1ba5b281f9c --- /dev/null +++ b/eval-results/TheBloke/CodeLlama-34B-Python-fp16/results_2023-10-22T22-16-27.646288.json @@ -0,0 +1,107 @@ +{ + "config_general": { + "model_name": "TheBloke/CodeLlama-34B-Python-fp16", + "model_sha": "875f9d97fb6c9619d8867887dd1d80918ff0f593", + "model_size": "63.23 GB", + "model_dtype": "torch.float16", + "lighteval_sha": "0f318ecf002208468154899217b3ba7c6ae09374", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|drop|3": { + "em": 0.0014681208053691276, + "em_stderr": 0.0003921042190298454, + "f1": 0.047479026845637595, + "f1_stderr": 0.0011836496363564649 + }, + "harness|gsm8k|5": { + "acc": 0.2001516300227445, + "acc_stderr": 0.011021119022510191 + }, + "harness|winogrande|5": { + "acc": 0.7213891081294396, + "acc_stderr": 0.012599896649493876 + }, + "all": { + "em": 0.0014681208053691276, + "em_stderr": 0.0003921042190298454, + "f1": 0.047479026845637595, + "f1_stderr": 0.0011836496363564649, + "acc": 0.46077036907609203, + "acc_stderr": 0.011810507836002033 + } + }, + "versions": { + "harness|drop|3": 1, + "harness|gsm8k|5": 0, + "harness|winogrande|5": 0, + "all": 0 + }, + "config_tasks": { + "harness|drop": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|drop|3": { + "hashes": { + "hash_examples": "1d27416e8324e9a3", + "hash_full_prompts": "a5513ff9a741b385", + "hash_input_tokens": "fbe785c62d941897", + "hash_cont_tokens": "0308f391a4cc9a38" + }, + "truncated": 0, + "non-truncated": 9536, + "padded": 0, + "non-padded": 9536, + "effective_few_shots": 3.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "bda342e47b5099b2", + "hash_cont_tokens": "a9cdbfdfef685e6a" + }, + "truncated": 0, + "non-truncated": 1319, + "padded": 0, + "non-padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "c0bedf98cb040854", + "hash_cont_tokens": "f08975ad6f2d5864" + }, + "truncated": 0, + "non-truncated": 2534, + "padded": 2432, + "non-padded": 102, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "9b4d8993161e637d", + "hash_full_prompts": "08215e527b7e60a5", + "hash_input_tokens": "9f72a6bc6743d18f", + "hash_cont_tokens": "b4b22ac632721de3" + }, + "total_evaluation_time_secondes": "28901.26035094261", + "truncated": 0, + "non-truncated": 13389, + "padded": 2432, + "non-padded": 10957, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff 
--git a/eval-results/TheBloke/EverythingLM-13B-16K-GPTQ/results_2023-08-21T17-43-17.754973.json b/eval-results/TheBloke/EverythingLM-13B-16K-GPTQ/results_2023-08-21T17-43-17.754973.json new file mode 100644 index 0000000000000000000000000000000000000000..c75ead9c46fae756cd637bba5983f3389d552979 --- /dev/null +++ b/eval-results/TheBloke/EverythingLM-13B-16K-GPTQ/results_2023-08-21T17-43-17.754973.json @@ -0,0 +1,1365 @@ +{ + "results": { + "harness|arc:challenge|25": { + "acc": 0.23720136518771331, + "acc_stderr": 0.012430399829260847, + "acc_norm": 0.29266211604095566, + "acc_norm_stderr": 0.01329591610361941 + }, + "harness|hellaswag|10": { + "acc": 0.25801633140808605, + "acc_stderr": 0.004366488167386392, + "acc_norm": 0.26239792869946227, + "acc_norm_stderr": 0.004390386775400534 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.25, + "acc_stderr": 0.04351941398892446, + "acc_norm": 0.25, + "acc_norm_stderr": 0.04351941398892446 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.25925925925925924, + "acc_stderr": 0.03785714465066653, + "acc_norm": 0.25925925925925924, + "acc_norm_stderr": 0.03785714465066653 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.2894736842105263, + "acc_stderr": 0.03690677986137282, + "acc_norm": 0.2894736842105263, + "acc_norm_stderr": 0.03690677986137282 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.2, + "acc_stderr": 0.04020151261036847, + "acc_norm": 0.2, + "acc_norm_stderr": 0.04020151261036847 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.26037735849056604, + "acc_stderr": 0.027008766090708087, + "acc_norm": 0.26037735849056604, + "acc_norm_stderr": 0.027008766090708087 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.24305555555555555, + "acc_stderr": 0.03586879280080342, + "acc_norm": 0.24305555555555555, + "acc_norm_stderr": 0.03586879280080342 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.27, + "acc_stderr": 0.0446196043338474, + "acc_norm": 0.27, + "acc_norm_stderr": 0.0446196043338474 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.27, + "acc_stderr": 0.0446196043338474, + "acc_norm": 0.27, + "acc_norm_stderr": 0.0446196043338474 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.26, + "acc_stderr": 0.04408440022768078, + "acc_norm": 0.26, + "acc_norm_stderr": 0.04408440022768078 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.26011560693641617, + "acc_stderr": 0.033450369167889904, + "acc_norm": 0.26011560693641617, + "acc_norm_stderr": 0.033450369167889904 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.20588235294117646, + "acc_stderr": 0.04023382273617747, + "acc_norm": 0.20588235294117646, + "acc_norm_stderr": 0.04023382273617747 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.21, + "acc_stderr": 0.040936018074033256, + "acc_norm": 0.21, + "acc_norm_stderr": 0.040936018074033256 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.2297872340425532, + "acc_stderr": 0.027501752944412417, + "acc_norm": 0.2297872340425532, + "acc_norm_stderr": 0.027501752944412417 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.22807017543859648, + "acc_stderr": 0.03947152782669415, + "acc_norm": 0.22807017543859648, + "acc_norm_stderr": 0.03947152782669415 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.30344827586206896, + "acc_stderr": 0.038312260488503336, + "acc_norm": 0.30344827586206896, + "acc_norm_stderr": 0.038312260488503336 
+ }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.24074074074074073, + "acc_stderr": 0.0220190800122179, + "acc_norm": 0.24074074074074073, + "acc_norm_stderr": 0.0220190800122179 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.24603174603174602, + "acc_stderr": 0.03852273364924315, + "acc_norm": 0.24603174603174602, + "acc_norm_stderr": 0.03852273364924315 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.28, + "acc_stderr": 0.04512608598542128, + "acc_norm": 0.28, + "acc_norm_stderr": 0.04512608598542128 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.3064516129032258, + "acc_stderr": 0.026226485652553883, + "acc_norm": 0.3064516129032258, + "acc_norm_stderr": 0.026226485652553883 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.3399014778325123, + "acc_stderr": 0.033327690684107895, + "acc_norm": 0.3399014778325123, + "acc_norm_stderr": 0.033327690684107895 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.22, + "acc_stderr": 0.041633319989322695, + "acc_norm": 0.22, + "acc_norm_stderr": 0.041633319989322695 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.24242424242424243, + "acc_stderr": 0.03346409881055953, + "acc_norm": 0.24242424242424243, + "acc_norm_stderr": 0.03346409881055953 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.29292929292929293, + "acc_stderr": 0.03242497958178815, + "acc_norm": 0.29292929292929293, + "acc_norm_stderr": 0.03242497958178815 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.35751295336787564, + "acc_stderr": 0.03458816042181005, + "acc_norm": 0.35751295336787564, + "acc_norm_stderr": 0.03458816042181005 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.2743589743589744, + "acc_stderr": 0.0226227657674932, + "acc_norm": 0.2743589743589744, + "acc_norm_stderr": 0.0226227657674932 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.22592592592592592, + "acc_stderr": 0.02549753263960955, + "acc_norm": 0.22592592592592592, + "acc_norm_stderr": 0.02549753263960955 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.2689075630252101, + "acc_stderr": 0.028801392193631276, + "acc_norm": 0.2689075630252101, + "acc_norm_stderr": 0.028801392193631276 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.2913907284768212, + "acc_stderr": 0.037101857261199946, + "acc_norm": 0.2913907284768212, + "acc_norm_stderr": 0.037101857261199946 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.26422018348623855, + "acc_stderr": 0.01890416417151018, + "acc_norm": 0.26422018348623855, + "acc_norm_stderr": 0.01890416417151018 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.3333333333333333, + "acc_stderr": 0.032149521478027486, + "acc_norm": 0.3333333333333333, + "acc_norm_stderr": 0.032149521478027486 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.23529411764705882, + "acc_stderr": 0.029771775228145628, + "acc_norm": 0.23529411764705882, + "acc_norm_stderr": 0.029771775228145628 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.26582278481012656, + "acc_stderr": 0.02875679962965834, + "acc_norm": 0.26582278481012656, + "acc_norm_stderr": 0.02875679962965834 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.14349775784753363, + "acc_stderr": 0.023529371269618186, + "acc_norm": 0.14349775784753363, + "acc_norm_stderr": 
0.023529371269618186 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.2748091603053435, + "acc_stderr": 0.03915345408847836, + "acc_norm": 0.2748091603053435, + "acc_norm_stderr": 0.03915345408847836 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.256198347107438, + "acc_stderr": 0.039849796533028725, + "acc_norm": 0.256198347107438, + "acc_norm_stderr": 0.039849796533028725 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.19444444444444445, + "acc_stderr": 0.038260763248848646, + "acc_norm": 0.19444444444444445, + "acc_norm_stderr": 0.038260763248848646 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.2822085889570552, + "acc_stderr": 0.03536117886664742, + "acc_norm": 0.2822085889570552, + "acc_norm_stderr": 0.03536117886664742 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.16964285714285715, + "acc_stderr": 0.0356236785009539, + "acc_norm": 0.16964285714285715, + "acc_norm_stderr": 0.0356236785009539 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.2912621359223301, + "acc_stderr": 0.044986763205729245, + "acc_norm": 0.2912621359223301, + "acc_norm_stderr": 0.044986763205729245 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.23931623931623933, + "acc_stderr": 0.027951826808924333, + "acc_norm": 0.23931623931623933, + "acc_norm_stderr": 0.027951826808924333 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.27, + "acc_stderr": 0.044619604333847394, + "acc_norm": 0.27, + "acc_norm_stderr": 0.044619604333847394 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.23627075351213284, + "acc_stderr": 0.015190473717037497, + "acc_norm": 0.23627075351213284, + "acc_norm_stderr": 0.015190473717037497 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.2138728323699422, + "acc_stderr": 0.022075709251757187, + "acc_norm": 0.2138728323699422, + "acc_norm_stderr": 0.022075709251757187 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.25139664804469275, + "acc_stderr": 0.014508979453553991, + "acc_norm": 0.25139664804469275, + "acc_norm_stderr": 0.014508979453553991 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.28104575163398693, + "acc_stderr": 0.025738854797818733, + "acc_norm": 0.28104575163398693, + "acc_norm_stderr": 0.025738854797818733 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.27009646302250806, + "acc_stderr": 0.025218040373410622, + "acc_norm": 0.27009646302250806, + "acc_norm_stderr": 0.025218040373410622 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.25308641975308643, + "acc_stderr": 0.024191808600713002, + "acc_norm": 0.25308641975308643, + "acc_norm_stderr": 0.024191808600713002 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.2765957446808511, + "acc_stderr": 0.026684564340461004, + "acc_norm": 0.2765957446808511, + "acc_norm_stderr": 0.026684564340461004 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.2392438070404172, + "acc_stderr": 0.010896123652676651, + "acc_norm": 0.2392438070404172, + "acc_norm_stderr": 0.010896123652676651 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.22426470588235295, + "acc_stderr": 0.025336848563332355, + "acc_norm": 0.22426470588235295, + "acc_norm_stderr": 0.025336848563332355 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.24836601307189543, + "acc_stderr": 0.017479487001364764, + "acc_norm": 0.24836601307189543, + "acc_norm_stderr": 0.017479487001364764 + }, + 
"harness|hendrycksTest-public_relations|5": { + "acc": 0.20909090909090908, + "acc_stderr": 0.038950910157241364, + "acc_norm": 0.20909090909090908, + "acc_norm_stderr": 0.038950910157241364 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.23265306122448978, + "acc_stderr": 0.02704925791589618, + "acc_norm": 0.23265306122448978, + "acc_norm_stderr": 0.02704925791589618 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.23880597014925373, + "acc_stderr": 0.030147775935409217, + "acc_norm": 0.23880597014925373, + "acc_norm_stderr": 0.030147775935409217 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.24, + "acc_stderr": 0.042923469599092816, + "acc_norm": 0.24, + "acc_norm_stderr": 0.042923469599092816 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.22289156626506024, + "acc_stderr": 0.03240004825594689, + "acc_norm": 0.22289156626506024, + "acc_norm_stderr": 0.03240004825594689 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.29239766081871343, + "acc_stderr": 0.034886477134579215, + "acc_norm": 0.29239766081871343, + "acc_norm_stderr": 0.034886477134579215 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.23133414932680538, + "mc1_stderr": 0.014761945174862668, + "mc2": 0.4858437813036855, + "mc2_stderr": 0.016982636970661793 + }, + "all": { + "acc": 0.2537523849869386, + "acc_stderr": 0.03161597232025838, + "acc_norm": 0.2547666620741355, + "acc_norm_stderr": 0.03163104714877318, + "mc1": 0.23133414932680538, + "mc1_stderr": 0.014761945174862668, + "mc2": 0.4858437813036855, + "mc2_stderr": 0.016982636970661793 + } + }, + "versions": { + "harness|arc:challenge|25": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + 
"harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "all": 0 + }, + "config_general": { + "model_name": "TheBloke/EverythingLM-13B-16K-GPTQ", + "model_sha": "f14d3df05577f3e1ac35e2c4ec32ce0d39b97508", + "model_dtype": "torch.float16", + "lighteval_sha": "9a6ba3212080b87510982c30fdec55b87dcab0c7", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null + }, + "config_tasks": { + "harness|arc:challenge": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + 
"harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task" + }, + "summary_tasks": { + "harness|arc:challenge|25": { + "hashes": { + "hash_examples": "17b0cae357c0259e", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "3722289b79076c44", + "hash_cont_tokens": "8210decc6ff6f7df" + }, + "truncated": 0, + "non-truncated": 4687, + "padded": 4687, + "non-padded": 0, + "effective_few_shots": 25.0, + "num_truncated_few_shots": 0 + }, + "harness|hellaswag|10": { + "hashes": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "ececd684171f1ef2", + "hash_cont_tokens": "b3b9e9017afa63af" + }, + "truncated": 0, + "non-truncated": 40168, + "padded": 40113, + "non-padded": 55, + "effective_few_shots": 10.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hashes": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "c54ff61ad0273dd7", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-anatomy|5": { + "hashes": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "be31a1e22aef5f90", + "hash_cont_tokens": "f11971a765cb609f" + }, + "truncated": 0, + "non-truncated": 540, + "padded": 540, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-astronomy|5": { + "hashes": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "277a7b1fad566940", + "hash_cont_tokens": "bf30e5d3f48250cb" + }, + "truncated": 0, + "non-truncated": 608, + "padded": 608, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + 
"harness|hendrycksTest-business_ethics|5": { + "hashes": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "ba552605bc116de5", + "hash_cont_tokens": "bc1dd9b2d995eb61" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hashes": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "428c7563d0b98ab9", + "hash_cont_tokens": "890a119624b3b935" + }, + "truncated": 0, + "non-truncated": 1060, + "padded": 1060, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_biology|5": { + "hashes": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "da036601573942e2", + "hash_cont_tokens": "875cde3af7a0ee14" + }, + "truncated": 0, + "non-truncated": 576, + "padded": 576, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_chemistry|5": { + "hashes": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "94e0196d6aded13d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_computer_science|5": { + "hashes": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "6e4d0f4a8d36690b", + "hash_cont_tokens": "ffc0fe414cdc4a83" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_mathematics|5": { + "hashes": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "614054d17109a25d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_medicine|5": { + "hashes": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "081bb2b524defd1c", + "hash_cont_tokens": "1f88b00d41957d82" + }, + "truncated": 0, + "non-truncated": 692, + "padded": 692, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_physics|5": { + "hashes": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "5421d9a1af86cbd4", + "hash_cont_tokens": "f7b8097afc16a47c" + }, + "truncated": 0, + "non-truncated": 408, + "padded": 408, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-computer_security|5": { + "hashes": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "5e6b70ecb333cf18", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hashes": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + 
"hash_input_tokens": "c2ef11a87264ceed", + "hash_cont_tokens": "aa0e8bc655f2f641" + }, + "truncated": 0, + "non-truncated": 940, + "padded": 940, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-econometrics|5": { + "hashes": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "ecaccd912a4c3978", + "hash_cont_tokens": "bfb7e3c3c88313f1" + }, + "truncated": 0, + "non-truncated": 456, + "padded": 456, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hashes": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "1590c84291399be8", + "hash_cont_tokens": "2425a3f084a591ef" + }, + "truncated": 0, + "non-truncated": 580, + "padded": 580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hashes": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "3269597f715b0da1", + "hash_cont_tokens": "f52691aef15a407b" + }, + "truncated": 0, + "non-truncated": 1512, + "padded": 1512, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-formal_logic|5": { + "hashes": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "a2800d20f3ab8d7c", + "hash_cont_tokens": "f515d598d9c21263" + }, + "truncated": 0, + "non-truncated": 504, + "padded": 504, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-global_facts|5": { + "hashes": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "94ed44b3772505ad", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_biology|5": { + "hashes": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "24423acb928db768", + "hash_cont_tokens": "bd85a4156a3613ee" + }, + "truncated": 0, + "non-truncated": 1240, + "padded": 1240, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hashes": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "831ff35c474e5cef", + "hash_cont_tokens": "a95c97af1c14e068" + }, + "truncated": 0, + "non-truncated": 812, + "padded": 812, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hashes": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "a20a96b44dcc5b30", + "hash_cont_tokens": "8abfedef914e33c9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hashes": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "5002f4ac8b1562ca", + "hash_cont_tokens": "674fc454bdc5ac93" + }, + "truncated": 0, + "non-truncated": 
660, + "padded": 656, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_geography|5": { + "hashes": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "7c5547c7da5bc793", + "hash_cont_tokens": "03a5012b916274ea" + }, + "truncated": 0, + "non-truncated": 792, + "padded": 792, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hashes": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "f62991cb6a496b05", + "hash_cont_tokens": "a83effb8f76b7d7c" + }, + "truncated": 0, + "non-truncated": 772, + "padded": 772, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hashes": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "4cef2aff6e3d59ed", + "hash_cont_tokens": "c583432ad27fcfe0" + }, + "truncated": 0, + "non-truncated": 1560, + "padded": 1560, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hashes": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "6e2577ea4082ed2b", + "hash_cont_tokens": "24f5dc613660300b" + }, + "truncated": 0, + "non-truncated": 1080, + "padded": 1080, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hashes": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "c5fc9aeb1079c8e4", + "hash_cont_tokens": "f47f041de50333b9" + }, + "truncated": 0, + "non-truncated": 952, + "padded": 952, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_physics|5": { + "hashes": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "555fc385cffa84ca", + "hash_cont_tokens": "ba2efcd283e938cc" + }, + "truncated": 0, + "non-truncated": 604, + "padded": 604, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hashes": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "febd23cbf9973b7f", + "hash_cont_tokens": "942069cd363844d9" + }, + "truncated": 0, + "non-truncated": 2180, + "padded": 2180, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hashes": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "400e55b56ee6fbd7", + "hash_cont_tokens": "955ed42b6f7fa019" + }, + "truncated": 0, + "non-truncated": 864, + "padded": 864, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hashes": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "c639cce12a46ebad", + "hash_cont_tokens": "cdd0b3dc06d933e5" + }, + "truncated": 0, + "non-truncated": 816, + "padded": 816, + "non-padded": 0, + "effective_few_shots": 5.0, + 
"num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hashes": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "b9762065cce6f3a6", + "hash_cont_tokens": "9a864184946033ac" + }, + "truncated": 0, + "non-truncated": 948, + "padded": 948, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_aging|5": { + "hashes": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "541a75f071dcf579", + "hash_cont_tokens": "142a4a8a1138a214" + }, + "truncated": 0, + "non-truncated": 892, + "padded": 892, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_sexuality|5": { + "hashes": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "04269e5c5a257dd9", + "hash_cont_tokens": "bc54813e809b796d" + }, + "truncated": 0, + "non-truncated": 524, + "padded": 524, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-international_law|5": { + "hashes": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "d93ba9d9d38e4397", + "hash_cont_tokens": "dc45b45fcda18e5d" + }, + "truncated": 0, + "non-truncated": 484, + "padded": 484, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-jurisprudence|5": { + "hashes": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "9eeaccd2698b4f5a", + "hash_cont_tokens": "e3a8cd951b6e3469" + }, + "truncated": 0, + "non-truncated": 432, + "padded": 432, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hashes": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "b4f08f544f2b7576", + "hash_cont_tokens": "1e80dbd30f6453d5" + }, + "truncated": 0, + "non-truncated": 652, + "padded": 648, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-machine_learning|5": { + "hashes": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "900c2a51f1174b9f", + "hash_cont_tokens": "9b37da7777378ca9" + }, + "truncated": 0, + "non-truncated": 448, + "padded": 448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-management|5": { + "hashes": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "6b36efb4689c6eca", + "hash_cont_tokens": "a01d6d39a83c4597" + }, + "truncated": 0, + "non-truncated": 412, + "padded": 412, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-marketing|5": { + "hashes": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "2aaac78a0cfed47a", + "hash_cont_tokens": "6aeaed4d823c98aa" + }, + "truncated": 0, + "non-truncated": 936, + "padded": 936, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-medical_genetics|5": { + "hashes": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": 
"202139046daa118f", + "hash_input_tokens": "886ca823b41c094a", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-miscellaneous|5": { + "hashes": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "72fd71de7675e7d0", + "hash_cont_tokens": "9b0ab02a64603081" + }, + "truncated": 0, + "non-truncated": 3132, + "padded": 3132, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_disputes|5": { + "hashes": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "f3ca0dd8e7a1eb09", + "hash_cont_tokens": "8badf768f7b0467a" + }, + "truncated": 0, + "non-truncated": 1384, + "padded": 1354, + "non-padded": 30, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hashes": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "3e793631e951f23c", + "hash_cont_tokens": "32ae620376b2bbba" + }, + "truncated": 0, + "non-truncated": 3580, + "padded": 3580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-nutrition|5": { + "hashes": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "59753c2144ea93af", + "hash_cont_tokens": "3071def75bacc404" + }, + "truncated": 0, + "non-truncated": 1224, + "padded": 1224, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-philosophy|5": { + "hashes": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "bd8d3dbed15a8c34", + "hash_cont_tokens": "9f6ff69d23a48783" + }, + "truncated": 0, + "non-truncated": 1244, + "padded": 1244, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-prehistory|5": { + "hashes": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "3573cd87facbb7c5", + "hash_cont_tokens": "de469d2b981e32a3" + }, + "truncated": 0, + "non-truncated": 1296, + "padded": 1296, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_accounting|5": { + "hashes": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "17e721bc1a7cbb47", + "hash_cont_tokens": "c46f74d2dfc7b13b" + }, + "truncated": 0, + "non-truncated": 1128, + "padded": 1128, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_law|5": { + "hashes": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "c9f7583fff66d361", + "hash_cont_tokens": "2e590029ef41fbcd" + }, + "truncated": 0, + "non-truncated": 6136, + "padded": 6136, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_medicine|5": { + "hashes": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "40a933f829116f8d", + "hash_cont_tokens": "fe35cfa9c6ca802e" + }, + "truncated": 0, + "non-truncated": 1088, + 
"padded": 1088, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_psychology|5": { + "hashes": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "0dfb73a8eb3f692c", + "hash_cont_tokens": "f020fbddf72c8652" + }, + "truncated": 0, + "non-truncated": 2448, + "padded": 2448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-public_relations|5": { + "hashes": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "1710c6ba4c9f3cbd", + "hash_cont_tokens": "568f585a259965c1" + }, + "truncated": 0, + "non-truncated": 440, + "padded": 440, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-security_studies|5": { + "hashes": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "32a03f1f22a6e103", + "hash_cont_tokens": "cc6fd7cccd64cd5d" + }, + "truncated": 0, + "non-truncated": 980, + "padded": 980, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-sociology|5": { + "hashes": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "828999f7624cbe7e", + "hash_cont_tokens": "c3a3bdfd177eed5b" + }, + "truncated": 0, + "non-truncated": 804, + "padded": 804, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hashes": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "42054621e718dbee", + "hash_cont_tokens": "2568d0e8e36fa959" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-virology|5": { + "hashes": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "6c4f0aa4dc859c04", + "hash_cont_tokens": "926cf60b0891f374" + }, + "truncated": 0, + "non-truncated": 664, + "padded": 664, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-world_religions|5": { + "hashes": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "6c75d44e092ff24f", + "hash_cont_tokens": "c525a5de974c1ea3" + }, + "truncated": 0, + "non-truncated": 684, + "padded": 684, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|truthfulqa:mc|0": { + "hashes": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "2738d7ed7075faa7", + "hash_cont_tokens": "c014154380b74b9e" + }, + "truncated": 0, + "non-truncated": 9996, + "padded": 9996, + "non-padded": 0, + "effective_few_shots": 0.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "d84d18e9a963753d", + "hash_full_prompts": "12b540783521a8e6", + "hash_input_tokens": "5c73a7dce6ccf737", + "hash_cont_tokens": "fb1646e2bdd5fc38" + }, + "total_evaluation_time_secondes": "7102.397312164307", + "truncated": 0, + "non-truncated": 111019, + "padded": 110926, + "non-padded": 93, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git 
a/eval-results/TheBloke/EverythingLM-13B-16K-GPTQ/results_2023-11-05T10-45-48.960213.json b/eval-results/TheBloke/EverythingLM-13B-16K-GPTQ/results_2023-11-05T10-45-48.960213.json new file mode 100644 index 0000000000000000000000000000000000000000..b18888c37adef3c094ea024eee9f1740f46419b7 --- /dev/null +++ b/eval-results/TheBloke/EverythingLM-13B-16K-GPTQ/results_2023-11-05T10-45-48.960213.json @@ -0,0 +1,107 @@ +{ + "config_general": { + "lighteval_sha": "167773f1d5d1647c60dadc31c9e731ab7dbcbbad", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "", + "model_name": "TheBloke/EverythingLM-13B-16K-GPTQ", + "model_sha": "43e5f4e4be93c953e40418e5bbee66061f7e5c21", + "model_dtype": "torch.float16", + "model_size": "7.07 GB" + }, + "results": { + "harness|drop|3": { + "em": 0.002307046979865772, + "em_stderr": 0.0004913221265094551, + "f1": 0.05822147651006729, + "f1_stderr": 0.0013554863715247699 + }, + "harness|gsm8k|5": { + "acc": 0.056103108415466264, + "acc_stderr": 0.006338668431321867 + }, + "harness|winogrande|5": { + "acc": 0.7134964483030781, + "acc_stderr": 0.012707030139960381 + }, + "all": { + "em": 0.002307046979865772, + "em_stderr": 0.0004913221265094551, + "f1": 0.05822147651006729, + "f1_stderr": 0.0013554863715247699, + "acc": 0.3847997783592722, + "acc_stderr": 0.009522849285641123 + } + }, + "versions": { + "all": 0, + "harness|drop|3": 1, + "harness|gsm8k|5": 0, + "harness|winogrande|5": 0 + }, + "config_tasks": { + "harness|drop": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|drop|3": { + "hashes": { + "hash_examples": "1d27416e8324e9a3", + "hash_full_prompts": "a5513ff9a741b385", + "hash_input_tokens": "fbe785c62d941897", + "hash_cont_tokens": "5c02ca302d0a9170" + }, + "truncated": 0, + "non_truncated": 9536, + "padded": 0, + "non_padded": 9536, + "effective_few_shots": 3.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "bda342e47b5099b2", + "hash_cont_tokens": "b86eb5f8dbfc46e2" + }, + "truncated": 0, + "non_truncated": 1319, + "padded": 0, + "non_padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "c0bedf98cb040854", + "hash_cont_tokens": "f08975ad6f2d5864" + }, + "truncated": 0, + "non_truncated": 1267, + "padded": 2432, + "non_padded": 102, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "9b4d8993161e637d", + "hash_full_prompts": "08215e527b7e60a5", + "hash_input_tokens": "9f72a6bc6743d18f", + "hash_cont_tokens": "3f3af27dccdd73d0" + }, + "truncated": 0, + "non_truncated": 12122, + "padded": 2432, + "non_padded": 10957, + "num_truncated_few_shots": 0, + "total_evaluation_time_secondes": 0 + } +} \ No newline at end of file diff --git a/eval-results/TheBloke/EverythingLM-13B-16K-GPTQ/results_2023-11-07T12-26-38.184269.json b/eval-results/TheBloke/EverythingLM-13B-16K-GPTQ/results_2023-11-07T12-26-38.184269.json new file mode 100644 index 0000000000000000000000000000000000000000..3b98a659078e5491746a1874dd60df0d1f3b29a6 --- /dev/null +++ b/eval-results/TheBloke/EverythingLM-13B-16K-GPTQ/results_2023-11-07T12-26-38.184269.json @@ -0,0 
+1,107 @@ +{ + "config_general": { + "lighteval_sha": "167773f1d5d1647c60dadc31c9e731ab7dbcbbad", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "", + "model_name": "TheBloke/EverythingLM-13B-16K-GPTQ", + "model_sha": "43e5f4e4be93c953e40418e5bbee66061f7e5c21", + "model_dtype": "torch.float16", + "model_size": "7.07 GB" + }, + "results": { + "harness|drop|3": { + "em": 0.002307046979865772, + "em_stderr": 0.0004913221265094551, + "f1": 0.05827705536912766, + "f1_stderr": 0.0013555316279792778 + }, + "harness|gsm8k|5": { + "acc": 0.053828658074298714, + "acc_stderr": 0.0062163286402381465 + }, + "harness|winogrande|5": { + "acc": 0.7134964483030781, + "acc_stderr": 0.012707030139960381 + }, + "all": { + "em": 0.002307046979865772, + "em_stderr": 0.0004913221265094551, + "f1": 0.05827705536912766, + "f1_stderr": 0.0013555316279792778, + "acc": 0.3836625531886884, + "acc_stderr": 0.009461679390099264 + } + }, + "versions": { + "all": 0, + "harness|drop|3": 1, + "harness|gsm8k|5": 0, + "harness|winogrande|5": 0 + }, + "config_tasks": { + "harness|drop": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|drop|3": { + "hashes": { + "hash_examples": "1d27416e8324e9a3", + "hash_full_prompts": "a5513ff9a741b385", + "hash_input_tokens": "fbe785c62d941897", + "hash_cont_tokens": "944cdbf96c29895f" + }, + "truncated": 0, + "non_truncated": 9536, + "padded": 0, + "non_padded": 9536, + "effective_few_shots": 3.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "bda342e47b5099b2", + "hash_cont_tokens": "8a2fef2dc3026219" + }, + "truncated": 0, + "non_truncated": 1319, + "padded": 0, + "non_padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "c0bedf98cb040854", + "hash_cont_tokens": "f08975ad6f2d5864" + }, + "truncated": 0, + "non_truncated": 1267, + "padded": 2432, + "non_padded": 102, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "9b4d8993161e637d", + "hash_full_prompts": "08215e527b7e60a5", + "hash_input_tokens": "9f72a6bc6743d18f", + "hash_cont_tokens": "f9a690bd035b614e" + }, + "truncated": 0, + "non_truncated": 12122, + "padded": 2432, + "non_padded": 10957, + "num_truncated_few_shots": 0, + "total_evaluation_time_secondes": 0 + } +} \ No newline at end of file diff --git a/eval-results/TheBloke/GPlatty-30B-SuperHOT-8K-fp16/results_2023-08-01T15-51-23.628970.json b/eval-results/TheBloke/GPlatty-30B-SuperHOT-8K-fp16/results_2023-08-01T15-51-23.628970.json new file mode 100644 index 0000000000000000000000000000000000000000..c4d04a6f723b75b8d6b03c7e5fac998dfdea269d --- /dev/null +++ b/eval-results/TheBloke/GPlatty-30B-SuperHOT-8K-fp16/results_2023-08-01T15-51-23.628970.json @@ -0,0 +1,1365 @@ +{ + "results": { + "harness|arc:challenge|25": { + "acc": 0.22696245733788395, + "acc_stderr": 0.012240491536132868, + "acc_norm": 0.2832764505119454, + "acc_norm_stderr": 0.013167478735134576 + }, + "harness|hellaswag|10": { + "acc": 0.28450507866958774, + "acc_stderr": 0.004502563079349398, + "acc_norm": 0.33479386576379205, + "acc_norm_stderr": 0.0047095388649163105 + }, + 
"harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.22, + "acc_stderr": 0.04163331998932268, + "acc_norm": 0.22, + "acc_norm_stderr": 0.04163331998932268 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.2518518518518518, + "acc_stderr": 0.037498507091740206, + "acc_norm": 0.2518518518518518, + "acc_norm_stderr": 0.037498507091740206 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.19736842105263158, + "acc_stderr": 0.03238981601699397, + "acc_norm": 0.19736842105263158, + "acc_norm_stderr": 0.03238981601699397 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.31, + "acc_stderr": 0.04648231987117316, + "acc_norm": 0.31, + "acc_norm_stderr": 0.04648231987117316 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.22641509433962265, + "acc_stderr": 0.02575755989310675, + "acc_norm": 0.22641509433962265, + "acc_norm_stderr": 0.02575755989310675 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.2361111111111111, + "acc_stderr": 0.03551446610810826, + "acc_norm": 0.2361111111111111, + "acc_norm_stderr": 0.03551446610810826 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.24, + "acc_stderr": 0.04292346959909282, + "acc_norm": 0.24, + "acc_norm_stderr": 0.04292346959909282 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.29, + "acc_stderr": 0.045604802157206845, + "acc_norm": 0.29, + "acc_norm_stderr": 0.045604802157206845 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.19, + "acc_stderr": 0.039427724440366234, + "acc_norm": 0.19, + "acc_norm_stderr": 0.039427724440366234 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.20809248554913296, + "acc_stderr": 0.030952890217749874, + "acc_norm": 0.20809248554913296, + "acc_norm_stderr": 0.030952890217749874 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.2647058823529412, + "acc_stderr": 0.04389869956808777, + "acc_norm": 0.2647058823529412, + "acc_norm_stderr": 0.04389869956808777 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.31, + "acc_stderr": 0.04648231987117316, + "acc_norm": 0.31, + "acc_norm_stderr": 0.04648231987117316 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.2851063829787234, + "acc_stderr": 0.029513196625539355, + "acc_norm": 0.2851063829787234, + "acc_norm_stderr": 0.029513196625539355 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.21052631578947367, + "acc_stderr": 0.03835153954399421, + "acc_norm": 0.21052631578947367, + "acc_norm_stderr": 0.03835153954399421 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.27586206896551724, + "acc_stderr": 0.037245636197746325, + "acc_norm": 0.27586206896551724, + "acc_norm_stderr": 0.037245636197746325 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.21428571428571427, + "acc_stderr": 0.02113285918275444, + "acc_norm": 0.21428571428571427, + "acc_norm_stderr": 0.02113285918275444 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.2857142857142857, + "acc_stderr": 0.04040610178208841, + "acc_norm": 0.2857142857142857, + "acc_norm_stderr": 0.04040610178208841 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.18, + "acc_stderr": 0.038612291966536934, + "acc_norm": 0.18, + "acc_norm_stderr": 0.038612291966536934 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.2645161290322581, + "acc_stderr": 0.025091892378859275, + "acc_norm": 0.2645161290322581, + "acc_norm_stderr": 0.025091892378859275 + }, + 
"harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.22167487684729065, + "acc_stderr": 0.029225575892489624, + "acc_norm": 0.22167487684729065, + "acc_norm_stderr": 0.029225575892489624 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.29, + "acc_stderr": 0.04560480215720683, + "acc_norm": 0.29, + "acc_norm_stderr": 0.04560480215720683 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.21818181818181817, + "acc_stderr": 0.03225078108306289, + "acc_norm": 0.21818181818181817, + "acc_norm_stderr": 0.03225078108306289 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.21717171717171718, + "acc_stderr": 0.029376616484945633, + "acc_norm": 0.21717171717171718, + "acc_norm_stderr": 0.029376616484945633 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.18134715025906736, + "acc_stderr": 0.02780703236068609, + "acc_norm": 0.18134715025906736, + "acc_norm_stderr": 0.02780703236068609 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.2717948717948718, + "acc_stderr": 0.022556551010132354, + "acc_norm": 0.2717948717948718, + "acc_norm_stderr": 0.022556551010132354 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.2037037037037037, + "acc_stderr": 0.024556172219141265, + "acc_norm": 0.2037037037037037, + "acc_norm_stderr": 0.024556172219141265 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.22268907563025211, + "acc_stderr": 0.027025433498882385, + "acc_norm": 0.22268907563025211, + "acc_norm_stderr": 0.027025433498882385 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.19205298013245034, + "acc_stderr": 0.032162984205936135, + "acc_norm": 0.19205298013245034, + "acc_norm_stderr": 0.032162984205936135 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.22568807339449543, + "acc_stderr": 0.017923087667803053, + "acc_norm": 0.22568807339449543, + "acc_norm_stderr": 0.017923087667803053 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.26851851851851855, + "acc_stderr": 0.030225226160012397, + "acc_norm": 0.26851851851851855, + "acc_norm_stderr": 0.030225226160012397 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.24509803921568626, + "acc_stderr": 0.03019028245350195, + "acc_norm": 0.24509803921568626, + "acc_norm_stderr": 0.03019028245350195 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.2869198312236287, + "acc_stderr": 0.029443773022594693, + "acc_norm": 0.2869198312236287, + "acc_norm_stderr": 0.029443773022594693 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.2914798206278027, + "acc_stderr": 0.030500283176545902, + "acc_norm": 0.2914798206278027, + "acc_norm_stderr": 0.030500283176545902 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.2748091603053435, + "acc_stderr": 0.03915345408847835, + "acc_norm": 0.2748091603053435, + "acc_norm_stderr": 0.03915345408847835 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.2727272727272727, + "acc_stderr": 0.04065578140908705, + "acc_norm": 0.2727272727272727, + "acc_norm_stderr": 0.04065578140908705 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.26851851851851855, + "acc_stderr": 0.04284467968052191, + "acc_norm": 0.26851851851851855, + "acc_norm_stderr": 0.04284467968052191 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.26993865030674846, + "acc_stderr": 0.03487825168497892, + "acc_norm": 
0.26993865030674846, + "acc_norm_stderr": 0.03487825168497892 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.3125, + "acc_stderr": 0.043994650575715215, + "acc_norm": 0.3125, + "acc_norm_stderr": 0.043994650575715215 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.17475728155339806, + "acc_stderr": 0.037601780060266224, + "acc_norm": 0.17475728155339806, + "acc_norm_stderr": 0.037601780060266224 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.2863247863247863, + "acc_stderr": 0.029614323690456648, + "acc_norm": 0.2863247863247863, + "acc_norm_stderr": 0.029614323690456648 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.28, + "acc_stderr": 0.04512608598542127, + "acc_norm": 0.28, + "acc_norm_stderr": 0.04512608598542127 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.3065134099616858, + "acc_stderr": 0.01648695289304151, + "acc_norm": 0.3065134099616858, + "acc_norm_stderr": 0.01648695289304151 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.23121387283236994, + "acc_stderr": 0.022698657167855716, + "acc_norm": 0.23121387283236994, + "acc_norm_stderr": 0.022698657167855716 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.24581005586592178, + "acc_stderr": 0.014400296429225629, + "acc_norm": 0.24581005586592178, + "acc_norm_stderr": 0.014400296429225629 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.3104575163398693, + "acc_stderr": 0.026493033225145894, + "acc_norm": 0.3104575163398693, + "acc_norm_stderr": 0.026493033225145894 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.27009646302250806, + "acc_stderr": 0.025218040373410612, + "acc_norm": 0.27009646302250806, + "acc_norm_stderr": 0.025218040373410612 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.2345679012345679, + "acc_stderr": 0.023576881744005716, + "acc_norm": 0.2345679012345679, + "acc_norm_stderr": 0.023576881744005716 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.25177304964539005, + "acc_stderr": 0.025892151156709405, + "acc_norm": 0.25177304964539005, + "acc_norm_stderr": 0.025892151156709405 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.26597131681877445, + "acc_stderr": 0.011285033165551274, + "acc_norm": 0.26597131681877445, + "acc_norm_stderr": 0.011285033165551274 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.17647058823529413, + "acc_stderr": 0.02315746830855934, + "acc_norm": 0.17647058823529413, + "acc_norm_stderr": 0.02315746830855934 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.27450980392156865, + "acc_stderr": 0.018054027458815198, + "acc_norm": 0.27450980392156865, + "acc_norm_stderr": 0.018054027458815198 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.2, + "acc_stderr": 0.03831305140884601, + "acc_norm": 0.2, + "acc_norm_stderr": 0.03831305140884601 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.20816326530612245, + "acc_stderr": 0.025991117672813292, + "acc_norm": 0.20816326530612245, + "acc_norm_stderr": 0.025991117672813292 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.26865671641791045, + "acc_stderr": 0.03134328358208954, + "acc_norm": 0.26865671641791045, + "acc_norm_stderr": 0.03134328358208954 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.26, + "acc_stderr": 0.0440844002276808, + "acc_norm": 0.26, + "acc_norm_stderr": 0.0440844002276808 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.2710843373493976, + "acc_stderr": 
0.03460579907553027, + "acc_norm": 0.2710843373493976, + "acc_norm_stderr": 0.03460579907553027 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.29239766081871343, + "acc_stderr": 0.034886477134579215, + "acc_norm": 0.29239766081871343, + "acc_norm_stderr": 0.034886477134579215 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.22888616891064872, + "mc1_stderr": 0.014706994909055027, + "mc2": 0.46272712607124966, + "mc2_stderr": 0.016702158477967525 + }, + "all": { + "acc": 0.24941704039386783, + "acc_stderr": 0.0314384194357432, + "acc_norm": 0.2512238671780757, + "acc_norm_stderr": 0.03145763914734606, + "mc1": 0.22888616891064872, + "mc1_stderr": 0.014706994909055027, + "mc2": 0.46272712607124966, + "mc2_stderr": 0.016702158477967525 + } + }, + "versions": { + "harness|arc:challenge|25": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + 
"harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "all": 0 + }, + "config_general": { + "model_name": "TheBloke/GPlatty-30B-SuperHOT-8K-fp16", + "model_sha": "e2103a424c1700756df1c0c0b334195f37efe17b", + "model_dtype": "torch.float16", + "lighteval_sha": "03c2fad20ff7f5334c33cfee459024b8d7e4a109", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null + }, + "config_tasks": { + "harness|arc:challenge": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + 
"harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task" + }, + "summary_tasks": { + "harness|arc:challenge|25": { + "hashes": { + "hash_examples": "17b0cae357c0259e", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "3722289b79076c44", + "hash_cont_tokens": "8210decc6ff6f7df" + }, + "truncated": 0, + "non-truncated": 4687, + "padded": 4687, + "non-padded": 0, + "effective_few_shots": 25.0, + "num_truncated_few_shots": 0 + }, + "harness|hellaswag|10": { + "hashes": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "ececd684171f1ef2", + "hash_cont_tokens": "b3b9e9017afa63af" + }, + "truncated": 0, + "non-truncated": 40168, + "padded": 40113, + "non-padded": 55, + "effective_few_shots": 10.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hashes": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "c54ff61ad0273dd7", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-anatomy|5": { + "hashes": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "be31a1e22aef5f90", + "hash_cont_tokens": "f11971a765cb609f" + }, + "truncated": 0, + "non-truncated": 540, + "padded": 540, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-astronomy|5": { + "hashes": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "277a7b1fad566940", + "hash_cont_tokens": "bf30e5d3f48250cb" + }, + "truncated": 0, + "non-truncated": 608, + "padded": 608, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-business_ethics|5": { + "hashes": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "ba552605bc116de5", + "hash_cont_tokens": "bc1dd9b2d995eb61" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hashes": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "428c7563d0b98ab9", + "hash_cont_tokens": "890a119624b3b935" + }, + "truncated": 0, + "non-truncated": 1060, + "padded": 1060, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_biology|5": { + "hashes": { + "hash_examples": 
"ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "da036601573942e2", + "hash_cont_tokens": "875cde3af7a0ee14" + }, + "truncated": 0, + "non-truncated": 576, + "padded": 576, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_chemistry|5": { + "hashes": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "94e0196d6aded13d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_computer_science|5": { + "hashes": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "6e4d0f4a8d36690b", + "hash_cont_tokens": "ffc0fe414cdc4a83" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_mathematics|5": { + "hashes": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "614054d17109a25d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_medicine|5": { + "hashes": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "081bb2b524defd1c", + "hash_cont_tokens": "1f88b00d41957d82" + }, + "truncated": 0, + "non-truncated": 692, + "padded": 692, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_physics|5": { + "hashes": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "5421d9a1af86cbd4", + "hash_cont_tokens": "f7b8097afc16a47c" + }, + "truncated": 0, + "non-truncated": 408, + "padded": 408, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-computer_security|5": { + "hashes": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "5e6b70ecb333cf18", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hashes": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "c2ef11a87264ceed", + "hash_cont_tokens": "aa0e8bc655f2f641" + }, + "truncated": 0, + "non-truncated": 940, + "padded": 940, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-econometrics|5": { + "hashes": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "ecaccd912a4c3978", + "hash_cont_tokens": "bfb7e3c3c88313f1" + }, + "truncated": 0, + "non-truncated": 456, + "padded": 456, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hashes": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "1590c84291399be8", + "hash_cont_tokens": 
"2425a3f084a591ef" + }, + "truncated": 0, + "non-truncated": 580, + "padded": 580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hashes": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "3269597f715b0da1", + "hash_cont_tokens": "f52691aef15a407b" + }, + "truncated": 0, + "non-truncated": 1512, + "padded": 1512, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-formal_logic|5": { + "hashes": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "a2800d20f3ab8d7c", + "hash_cont_tokens": "f515d598d9c21263" + }, + "truncated": 0, + "non-truncated": 504, + "padded": 504, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-global_facts|5": { + "hashes": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "94ed44b3772505ad", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_biology|5": { + "hashes": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "24423acb928db768", + "hash_cont_tokens": "bd85a4156a3613ee" + }, + "truncated": 0, + "non-truncated": 1240, + "padded": 1240, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hashes": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "831ff35c474e5cef", + "hash_cont_tokens": "a95c97af1c14e068" + }, + "truncated": 0, + "non-truncated": 812, + "padded": 812, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hashes": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "a20a96b44dcc5b30", + "hash_cont_tokens": "8abfedef914e33c9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hashes": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "5002f4ac8b1562ca", + "hash_cont_tokens": "674fc454bdc5ac93" + }, + "truncated": 0, + "non-truncated": 660, + "padded": 656, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_geography|5": { + "hashes": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "7c5547c7da5bc793", + "hash_cont_tokens": "03a5012b916274ea" + }, + "truncated": 0, + "non-truncated": 792, + "padded": 792, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hashes": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "f62991cb6a496b05", + "hash_cont_tokens": "a83effb8f76b7d7c" + }, + "truncated": 0, + "non-truncated": 772, + "padded": 772, + "non-padded": 0, + 
"effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hashes": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "4cef2aff6e3d59ed", + "hash_cont_tokens": "c583432ad27fcfe0" + }, + "truncated": 0, + "non-truncated": 1560, + "padded": 1560, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hashes": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "6e2577ea4082ed2b", + "hash_cont_tokens": "24f5dc613660300b" + }, + "truncated": 0, + "non-truncated": 1080, + "padded": 1080, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hashes": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "c5fc9aeb1079c8e4", + "hash_cont_tokens": "f47f041de50333b9" + }, + "truncated": 0, + "non-truncated": 952, + "padded": 952, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_physics|5": { + "hashes": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "555fc385cffa84ca", + "hash_cont_tokens": "ba2efcd283e938cc" + }, + "truncated": 0, + "non-truncated": 604, + "padded": 604, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hashes": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "febd23cbf9973b7f", + "hash_cont_tokens": "942069cd363844d9" + }, + "truncated": 0, + "non-truncated": 2180, + "padded": 2180, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hashes": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "400e55b56ee6fbd7", + "hash_cont_tokens": "955ed42b6f7fa019" + }, + "truncated": 0, + "non-truncated": 864, + "padded": 864, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hashes": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "c639cce12a46ebad", + "hash_cont_tokens": "cdd0b3dc06d933e5" + }, + "truncated": 0, + "non-truncated": 816, + "padded": 816, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hashes": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "b9762065cce6f3a6", + "hash_cont_tokens": "9a864184946033ac" + }, + "truncated": 0, + "non-truncated": 948, + "padded": 948, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_aging|5": { + "hashes": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "541a75f071dcf579", + "hash_cont_tokens": "142a4a8a1138a214" + }, + "truncated": 0, + "non-truncated": 892, + "padded": 892, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + 
"harness|hendrycksTest-human_sexuality|5": { + "hashes": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "04269e5c5a257dd9", + "hash_cont_tokens": "bc54813e809b796d" + }, + "truncated": 0, + "non-truncated": 524, + "padded": 524, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-international_law|5": { + "hashes": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "d93ba9d9d38e4397", + "hash_cont_tokens": "dc45b45fcda18e5d" + }, + "truncated": 0, + "non-truncated": 484, + "padded": 484, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-jurisprudence|5": { + "hashes": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "9eeaccd2698b4f5a", + "hash_cont_tokens": "e3a8cd951b6e3469" + }, + "truncated": 0, + "non-truncated": 432, + "padded": 432, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hashes": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "b4f08f544f2b7576", + "hash_cont_tokens": "1e80dbd30f6453d5" + }, + "truncated": 0, + "non-truncated": 652, + "padded": 648, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-machine_learning|5": { + "hashes": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "900c2a51f1174b9f", + "hash_cont_tokens": "9b37da7777378ca9" + }, + "truncated": 0, + "non-truncated": 448, + "padded": 448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-management|5": { + "hashes": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "6b36efb4689c6eca", + "hash_cont_tokens": "a01d6d39a83c4597" + }, + "truncated": 0, + "non-truncated": 412, + "padded": 412, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-marketing|5": { + "hashes": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "2aaac78a0cfed47a", + "hash_cont_tokens": "6aeaed4d823c98aa" + }, + "truncated": 0, + "non-truncated": 936, + "padded": 936, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-medical_genetics|5": { + "hashes": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "886ca823b41c094a", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-miscellaneous|5": { + "hashes": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "72fd71de7675e7d0", + "hash_cont_tokens": "9b0ab02a64603081" + }, + "truncated": 0, + "non-truncated": 3132, + "padded": 3132, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_disputes|5": { + "hashes": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": 
"f3ca0dd8e7a1eb09", + "hash_cont_tokens": "8badf768f7b0467a" + }, + "truncated": 0, + "non-truncated": 1384, + "padded": 1354, + "non-padded": 30, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hashes": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "3e793631e951f23c", + "hash_cont_tokens": "32ae620376b2bbba" + }, + "truncated": 0, + "non-truncated": 3580, + "padded": 3580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-nutrition|5": { + "hashes": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "59753c2144ea93af", + "hash_cont_tokens": "3071def75bacc404" + }, + "truncated": 0, + "non-truncated": 1224, + "padded": 1224, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-philosophy|5": { + "hashes": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "bd8d3dbed15a8c34", + "hash_cont_tokens": "9f6ff69d23a48783" + }, + "truncated": 0, + "non-truncated": 1244, + "padded": 1244, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-prehistory|5": { + "hashes": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "3573cd87facbb7c5", + "hash_cont_tokens": "de469d2b981e32a3" + }, + "truncated": 0, + "non-truncated": 1296, + "padded": 1296, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_accounting|5": { + "hashes": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "17e721bc1a7cbb47", + "hash_cont_tokens": "c46f74d2dfc7b13b" + }, + "truncated": 0, + "non-truncated": 1128, + "padded": 1128, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_law|5": { + "hashes": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "c9f7583fff66d361", + "hash_cont_tokens": "2e590029ef41fbcd" + }, + "truncated": 0, + "non-truncated": 6136, + "padded": 6136, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_medicine|5": { + "hashes": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "40a933f829116f8d", + "hash_cont_tokens": "fe35cfa9c6ca802e" + }, + "truncated": 0, + "non-truncated": 1088, + "padded": 1088, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_psychology|5": { + "hashes": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "0dfb73a8eb3f692c", + "hash_cont_tokens": "f020fbddf72c8652" + }, + "truncated": 0, + "non-truncated": 2448, + "padded": 2448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-public_relations|5": { + "hashes": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "1710c6ba4c9f3cbd", + "hash_cont_tokens": "568f585a259965c1" + }, + "truncated": 0, + "non-truncated": 440, + "padded": 440, + "non-padded": 0, 
+ "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-security_studies|5": { + "hashes": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "32a03f1f22a6e103", + "hash_cont_tokens": "cc6fd7cccd64cd5d" + }, + "truncated": 0, + "non-truncated": 980, + "padded": 980, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-sociology|5": { + "hashes": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "828999f7624cbe7e", + "hash_cont_tokens": "c3a3bdfd177eed5b" + }, + "truncated": 0, + "non-truncated": 804, + "padded": 804, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hashes": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "42054621e718dbee", + "hash_cont_tokens": "2568d0e8e36fa959" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-virology|5": { + "hashes": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "6c4f0aa4dc859c04", + "hash_cont_tokens": "926cf60b0891f374" + }, + "truncated": 0, + "non-truncated": 664, + "padded": 664, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-world_religions|5": { + "hashes": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "6c75d44e092ff24f", + "hash_cont_tokens": "c525a5de974c1ea3" + }, + "truncated": 0, + "non-truncated": 684, + "padded": 684, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|truthfulqa:mc|0": { + "hashes": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "2738d7ed7075faa7", + "hash_cont_tokens": "c014154380b74b9e" + }, + "truncated": 0, + "non-truncated": 9996, + "padded": 9996, + "non-padded": 0, + "effective_few_shots": 0.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "d84d18e9a963753d", + "hash_full_prompts": "12b540783521a8e6", + "hash_input_tokens": "5c73a7dce6ccf737", + "hash_cont_tokens": "fb1646e2bdd5fc38" + }, + "total_evaluation_time_secondes": "13162.618493795395", + "truncated": 0, + "non-truncated": 111019, + "padded": 110926, + "non-padded": 93, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/TheBloke/Genz-70b-GPTQ/results_2023-08-31T00-30-34.342002.json b/eval-results/TheBloke/Genz-70b-GPTQ/results_2023-08-31T00-30-34.342002.json new file mode 100644 index 0000000000000000000000000000000000000000..033839f876f5cd98cfb060e704093ff0c923ee57 --- /dev/null +++ b/eval-results/TheBloke/Genz-70b-GPTQ/results_2023-08-31T00-30-34.342002.json @@ -0,0 +1,1366 @@ +{ + "config_general": { + "model_name": "TheBloke/Genz-70b-GPTQ", + "model_sha": "7d38987a43d2445b193db99a029a264b39dc6c8e", + "model_dtype": "torch.float16", + "lighteval_sha": "4b9a38c2102259daeb17d1d0294fc2b4ce7f4e63", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|arc:challenge|25": { + "acc": 0.6638225255972696, + "acc_stderr": 
0.013804855026205763, + "acc_norm": 0.7107508532423208, + "acc_norm_stderr": 0.013250012579393443 + }, + "harness|hellaswag|10": { + "acc": 0.689205337582155, + "acc_stderr": 0.004618730353217047, + "acc_norm": 0.8764190400318662, + "acc_norm_stderr": 0.0032843028764223 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.34, + "acc_stderr": 0.04760952285695235, + "acc_norm": 0.34, + "acc_norm_stderr": 0.04760952285695235 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.6370370370370371, + "acc_stderr": 0.04153948404742398, + "acc_norm": 0.6370370370370371, + "acc_norm_stderr": 0.04153948404742398 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.8223684210526315, + "acc_stderr": 0.03110318238312338, + "acc_norm": 0.8223684210526315, + "acc_norm_stderr": 0.03110318238312338 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.76, + "acc_stderr": 0.04292346959909284, + "acc_norm": 0.76, + "acc_norm_stderr": 0.04292346959909284 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.720754716981132, + "acc_stderr": 0.027611163402399715, + "acc_norm": 0.720754716981132, + "acc_norm_stderr": 0.027611163402399715 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.8194444444444444, + "acc_stderr": 0.032166008088022675, + "acc_norm": 0.8194444444444444, + "acc_norm_stderr": 0.032166008088022675 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.51, + "acc_stderr": 0.05024183937956911, + "acc_norm": 0.51, + "acc_norm_stderr": 0.05024183937956911 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.58, + "acc_stderr": 0.049604496374885836, + "acc_norm": 0.58, + "acc_norm_stderr": 0.049604496374885836 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.35, + "acc_stderr": 0.0479372485441102, + "acc_norm": 0.35, + "acc_norm_stderr": 0.0479372485441102 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.6473988439306358, + "acc_stderr": 0.036430371689585475, + "acc_norm": 0.6473988439306358, + "acc_norm_stderr": 0.036430371689585475 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.4019607843137255, + "acc_stderr": 0.04878608714466996, + "acc_norm": 0.4019607843137255, + "acc_norm_stderr": 0.04878608714466996 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.77, + "acc_stderr": 0.04229525846816506, + "acc_norm": 0.77, + "acc_norm_stderr": 0.04229525846816506 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.676595744680851, + "acc_stderr": 0.030579442773610337, + "acc_norm": 0.676595744680851, + "acc_norm_stderr": 0.030579442773610337 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.45614035087719296, + "acc_stderr": 0.04685473041907789, + "acc_norm": 0.45614035087719296, + "acc_norm_stderr": 0.04685473041907789 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.6344827586206897, + "acc_stderr": 0.04013124195424386, + "acc_norm": 0.6344827586206897, + "acc_norm_stderr": 0.04013124195424386 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.4417989417989418, + "acc_stderr": 0.02557625706125384, + "acc_norm": 0.4417989417989418, + "acc_norm_stderr": 0.02557625706125384 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.46825396825396826, + "acc_stderr": 0.04463112720677172, + "acc_norm": 0.46825396825396826, + "acc_norm_stderr": 0.04463112720677172 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.49, + "acc_stderr": 0.05024183937956912, + "acc_norm": 0.49, + "acc_norm_stderr": 
0.05024183937956912 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.8290322580645161, + "acc_stderr": 0.02141724293632159, + "acc_norm": 0.8290322580645161, + "acc_norm_stderr": 0.02141724293632159 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.5270935960591133, + "acc_stderr": 0.03512819077876106, + "acc_norm": 0.5270935960591133, + "acc_norm_stderr": 0.03512819077876106 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.8, + "acc_stderr": 0.04020151261036846, + "acc_norm": 0.8, + "acc_norm_stderr": 0.04020151261036846 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.8545454545454545, + "acc_stderr": 0.027530196355066573, + "acc_norm": 0.8545454545454545, + "acc_norm_stderr": 0.027530196355066573 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.8686868686868687, + "acc_stderr": 0.024063156416822523, + "acc_norm": 0.8686868686868687, + "acc_norm_stderr": 0.024063156416822523 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.9378238341968912, + "acc_stderr": 0.017426974154240528, + "acc_norm": 0.9378238341968912, + "acc_norm_stderr": 0.017426974154240528 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.7051282051282052, + "acc_stderr": 0.023119362758232294, + "acc_norm": 0.7051282051282052, + "acc_norm_stderr": 0.023119362758232294 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.31851851851851853, + "acc_stderr": 0.028406533090608463, + "acc_norm": 0.31851851851851853, + "acc_norm_stderr": 0.028406533090608463 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.773109243697479, + "acc_stderr": 0.027205371538279472, + "acc_norm": 0.773109243697479, + "acc_norm_stderr": 0.027205371538279472 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.5099337748344371, + "acc_stderr": 0.04081677107248437, + "acc_norm": 0.5099337748344371, + "acc_norm_stderr": 0.04081677107248437 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.8917431192660551, + "acc_stderr": 0.013321348447611769, + "acc_norm": 0.8917431192660551, + "acc_norm_stderr": 0.013321348447611769 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.6018518518518519, + "acc_stderr": 0.033384734032074016, + "acc_norm": 0.6018518518518519, + "acc_norm_stderr": 0.033384734032074016 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.9313725490196079, + "acc_stderr": 0.017744453647073312, + "acc_norm": 0.9313725490196079, + "acc_norm_stderr": 0.017744453647073312 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.9029535864978903, + "acc_stderr": 0.019269323025640262, + "acc_norm": 0.9029535864978903, + "acc_norm_stderr": 0.019269323025640262 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.8116591928251121, + "acc_stderr": 0.026241132996407252, + "acc_norm": 0.8116591928251121, + "acc_norm_stderr": 0.026241132996407252 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.8396946564885496, + "acc_stderr": 0.03217829420744633, + "acc_norm": 0.8396946564885496, + "acc_norm_stderr": 0.03217829420744633 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.859504132231405, + "acc_stderr": 0.03172233426002157, + "acc_norm": 0.859504132231405, + "acc_norm_stderr": 0.03172233426002157 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.8240740740740741, + "acc_stderr": 0.036809181416738807, + "acc_norm": 
0.8240740740740741, + "acc_norm_stderr": 0.036809181416738807 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.8159509202453987, + "acc_stderr": 0.030446777687971726, + "acc_norm": 0.8159509202453987, + "acc_norm_stderr": 0.030446777687971726 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.5, + "acc_stderr": 0.04745789978762494, + "acc_norm": 0.5, + "acc_norm_stderr": 0.04745789978762494 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.8446601941747572, + "acc_stderr": 0.03586594738573974, + "acc_norm": 0.8446601941747572, + "acc_norm_stderr": 0.03586594738573974 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.9102564102564102, + "acc_stderr": 0.018724301741941642, + "acc_norm": 0.9102564102564102, + "acc_norm_stderr": 0.018724301741941642 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.71, + "acc_stderr": 0.045604802157206845, + "acc_norm": 0.71, + "acc_norm_stderr": 0.045604802157206845 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.8722860791826309, + "acc_stderr": 0.011935626313999876, + "acc_norm": 0.8722860791826309, + "acc_norm_stderr": 0.011935626313999876 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.8005780346820809, + "acc_stderr": 0.021511900654252562, + "acc_norm": 0.8005780346820809, + "acc_norm_stderr": 0.021511900654252562 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.5754189944134078, + "acc_stderr": 0.01653117099327888, + "acc_norm": 0.5754189944134078, + "acc_norm_stderr": 0.01653117099327888 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.7745098039215687, + "acc_stderr": 0.02392915551735129, + "acc_norm": 0.7745098039215687, + "acc_norm_stderr": 0.02392915551735129 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.7717041800643086, + "acc_stderr": 0.023839303311398205, + "acc_norm": 0.7717041800643086, + "acc_norm_stderr": 0.023839303311398205 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.8487654320987654, + "acc_stderr": 0.019935086092149897, + "acc_norm": 0.8487654320987654, + "acc_norm_stderr": 0.019935086092149897 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.5638297872340425, + "acc_stderr": 0.029583452036284076, + "acc_norm": 0.5638297872340425, + "acc_norm_stderr": 0.029583452036284076 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.5534550195567145, + "acc_stderr": 0.012697046024399654, + "acc_norm": 0.5534550195567145, + "acc_norm_stderr": 0.012697046024399654 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.7389705882352942, + "acc_stderr": 0.026679252270103135, + "acc_norm": 0.7389705882352942, + "acc_norm_stderr": 0.026679252270103135 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.7630718954248366, + "acc_stderr": 0.017201662169789772, + "acc_norm": 0.7630718954248366, + "acc_norm_stderr": 0.017201662169789772 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.7363636363636363, + "acc_stderr": 0.04220224692971987, + "acc_norm": 0.7363636363636363, + "acc_norm_stderr": 0.04220224692971987 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.8, + "acc_stderr": 0.02560737598657916, + "acc_norm": 0.8, + "acc_norm_stderr": 0.02560737598657916 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.8606965174129353, + "acc_stderr": 0.024484487162913973, + "acc_norm": 0.8606965174129353, + "acc_norm_stderr": 0.024484487162913973 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.9, + "acc_stderr": 
0.030151134457776334, + "acc_norm": 0.9, + "acc_norm_stderr": 0.030151134457776334 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.5180722891566265, + "acc_stderr": 0.038899512528272166, + "acc_norm": 0.5180722891566265, + "acc_norm_stderr": 0.038899512528272166 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.8771929824561403, + "acc_stderr": 0.025172984350155754, + "acc_norm": 0.8771929824561403, + "acc_norm_stderr": 0.025172984350155754 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.4320685434516524, + "mc1_stderr": 0.01734120239498826, + "mc2": 0.6228267270427654, + "mc2_stderr": 0.014836432877772263 + }, + "all": { + "acc": 0.7017249416277331, + "acc_stderr": 0.030832772804323012, + "acc_norm": 0.70569345061239, + "acc_norm_stderr": 0.03080075128019408, + "mc1": 0.4320685434516524, + "mc1_stderr": 0.01734120239498826, + "mc2": 0.6228267270427654, + "mc2_stderr": 0.014836432877772263 + } + }, + "versions": { + "harness|arc:challenge|25": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + 
"harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "all": 0 + }, + "config_tasks": { + "harness|arc:challenge": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + 
"harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task" + }, + "summary_tasks": { + "harness|arc:challenge|25": { + "hashes": { + "hash_examples": "17b0cae357c0259e", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "3722289b79076c44", + "hash_cont_tokens": "8210decc6ff6f7df" + }, + "truncated": 0, + "non-truncated": 4687, + "padded": 4687, + "non-padded": 0, + "effective_few_shots": 25.0, + "num_truncated_few_shots": 0 + }, + "harness|hellaswag|10": { + "hashes": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "ececd684171f1ef2", + "hash_cont_tokens": "b3b9e9017afa63af" + }, + "truncated": 0, + "non-truncated": 40168, + "padded": 40113, + "non-padded": 55, + "effective_few_shots": 10.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hashes": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "c54ff61ad0273dd7", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-anatomy|5": { + "hashes": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "be31a1e22aef5f90", + "hash_cont_tokens": "f11971a765cb609f" + }, + "truncated": 0, + "non-truncated": 540, + "padded": 540, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-astronomy|5": { + "hashes": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "277a7b1fad566940", + "hash_cont_tokens": "bf30e5d3f48250cb" + }, + "truncated": 0, + "non-truncated": 608, + "padded": 608, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-business_ethics|5": { + "hashes": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "ba552605bc116de5", + "hash_cont_tokens": "bc1dd9b2d995eb61" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hashes": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "428c7563d0b98ab9", + "hash_cont_tokens": "890a119624b3b935" + }, + "truncated": 0, + "non-truncated": 1060, + "padded": 1060, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_biology|5": { + "hashes": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "da036601573942e2", + "hash_cont_tokens": "875cde3af7a0ee14" + }, + "truncated": 0, + 
"non-truncated": 576, + "padded": 576, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_chemistry|5": { + "hashes": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "94e0196d6aded13d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_computer_science|5": { + "hashes": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "6e4d0f4a8d36690b", + "hash_cont_tokens": "ffc0fe414cdc4a83" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_mathematics|5": { + "hashes": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "614054d17109a25d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_medicine|5": { + "hashes": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "081bb2b524defd1c", + "hash_cont_tokens": "1f88b00d41957d82" + }, + "truncated": 0, + "non-truncated": 692, + "padded": 692, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_physics|5": { + "hashes": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "5421d9a1af86cbd4", + "hash_cont_tokens": "f7b8097afc16a47c" + }, + "truncated": 0, + "non-truncated": 408, + "padded": 408, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-computer_security|5": { + "hashes": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "5e6b70ecb333cf18", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hashes": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "c2ef11a87264ceed", + "hash_cont_tokens": "aa0e8bc655f2f641" + }, + "truncated": 0, + "non-truncated": 940, + "padded": 940, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-econometrics|5": { + "hashes": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "ecaccd912a4c3978", + "hash_cont_tokens": "bfb7e3c3c88313f1" + }, + "truncated": 0, + "non-truncated": 456, + "padded": 456, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hashes": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "1590c84291399be8", + "hash_cont_tokens": "2425a3f084a591ef" + }, + "truncated": 0, + "non-truncated": 580, + "padded": 580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + 
"harness|hendrycksTest-elementary_mathematics|5": { + "hashes": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "3269597f715b0da1", + "hash_cont_tokens": "f52691aef15a407b" + }, + "truncated": 0, + "non-truncated": 1512, + "padded": 1512, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-formal_logic|5": { + "hashes": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "a2800d20f3ab8d7c", + "hash_cont_tokens": "f515d598d9c21263" + }, + "truncated": 0, + "non-truncated": 504, + "padded": 504, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-global_facts|5": { + "hashes": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "94ed44b3772505ad", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_biology|5": { + "hashes": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "24423acb928db768", + "hash_cont_tokens": "bd85a4156a3613ee" + }, + "truncated": 0, + "non-truncated": 1240, + "padded": 1240, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hashes": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "831ff35c474e5cef", + "hash_cont_tokens": "a95c97af1c14e068" + }, + "truncated": 0, + "non-truncated": 812, + "padded": 812, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hashes": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "a20a96b44dcc5b30", + "hash_cont_tokens": "8abfedef914e33c9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hashes": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "5002f4ac8b1562ca", + "hash_cont_tokens": "674fc454bdc5ac93" + }, + "truncated": 0, + "non-truncated": 660, + "padded": 656, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_geography|5": { + "hashes": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "7c5547c7da5bc793", + "hash_cont_tokens": "03a5012b916274ea" + }, + "truncated": 0, + "non-truncated": 792, + "padded": 792, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hashes": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "f62991cb6a496b05", + "hash_cont_tokens": "a83effb8f76b7d7c" + }, + "truncated": 0, + "non-truncated": 772, + "padded": 772, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hashes": { + "hash_examples": 
"59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "4cef2aff6e3d59ed", + "hash_cont_tokens": "c583432ad27fcfe0" + }, + "truncated": 0, + "non-truncated": 1560, + "padded": 1560, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hashes": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "6e2577ea4082ed2b", + "hash_cont_tokens": "24f5dc613660300b" + }, + "truncated": 0, + "non-truncated": 1080, + "padded": 1080, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hashes": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "c5fc9aeb1079c8e4", + "hash_cont_tokens": "f47f041de50333b9" + }, + "truncated": 0, + "non-truncated": 952, + "padded": 952, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_physics|5": { + "hashes": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "555fc385cffa84ca", + "hash_cont_tokens": "ba2efcd283e938cc" + }, + "truncated": 0, + "non-truncated": 604, + "padded": 604, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hashes": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "febd23cbf9973b7f", + "hash_cont_tokens": "942069cd363844d9" + }, + "truncated": 0, + "non-truncated": 2180, + "padded": 2180, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hashes": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "400e55b56ee6fbd7", + "hash_cont_tokens": "955ed42b6f7fa019" + }, + "truncated": 0, + "non-truncated": 864, + "padded": 864, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hashes": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "c639cce12a46ebad", + "hash_cont_tokens": "cdd0b3dc06d933e5" + }, + "truncated": 0, + "non-truncated": 816, + "padded": 816, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hashes": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "b9762065cce6f3a6", + "hash_cont_tokens": "9a864184946033ac" + }, + "truncated": 0, + "non-truncated": 948, + "padded": 948, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_aging|5": { + "hashes": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "541a75f071dcf579", + "hash_cont_tokens": "142a4a8a1138a214" + }, + "truncated": 0, + "non-truncated": 892, + "padded": 892, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_sexuality|5": { + "hashes": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "04269e5c5a257dd9", + 
"hash_cont_tokens": "bc54813e809b796d" + }, + "truncated": 0, + "non-truncated": 524, + "padded": 524, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-international_law|5": { + "hashes": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "d93ba9d9d38e4397", + "hash_cont_tokens": "dc45b45fcda18e5d" + }, + "truncated": 0, + "non-truncated": 484, + "padded": 484, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-jurisprudence|5": { + "hashes": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "9eeaccd2698b4f5a", + "hash_cont_tokens": "e3a8cd951b6e3469" + }, + "truncated": 0, + "non-truncated": 432, + "padded": 432, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hashes": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "b4f08f544f2b7576", + "hash_cont_tokens": "1e80dbd30f6453d5" + }, + "truncated": 0, + "non-truncated": 652, + "padded": 648, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-machine_learning|5": { + "hashes": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "900c2a51f1174b9f", + "hash_cont_tokens": "9b37da7777378ca9" + }, + "truncated": 0, + "non-truncated": 448, + "padded": 448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-management|5": { + "hashes": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "6b36efb4689c6eca", + "hash_cont_tokens": "a01d6d39a83c4597" + }, + "truncated": 0, + "non-truncated": 412, + "padded": 412, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-marketing|5": { + "hashes": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "2aaac78a0cfed47a", + "hash_cont_tokens": "6aeaed4d823c98aa" + }, + "truncated": 0, + "non-truncated": 936, + "padded": 936, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-medical_genetics|5": { + "hashes": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "886ca823b41c094a", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-miscellaneous|5": { + "hashes": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "72fd71de7675e7d0", + "hash_cont_tokens": "9b0ab02a64603081" + }, + "truncated": 0, + "non-truncated": 3132, + "padded": 3132, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_disputes|5": { + "hashes": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "f3ca0dd8e7a1eb09", + "hash_cont_tokens": "8badf768f7b0467a" + }, + "truncated": 0, + "non-truncated": 1384, + "padded": 1354, + "non-padded": 30, + "effective_few_shots": 5.0, + 
"num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hashes": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "3e793631e951f23c", + "hash_cont_tokens": "32ae620376b2bbba" + }, + "truncated": 0, + "non-truncated": 3580, + "padded": 3580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-nutrition|5": { + "hashes": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "59753c2144ea93af", + "hash_cont_tokens": "3071def75bacc404" + }, + "truncated": 0, + "non-truncated": 1224, + "padded": 1224, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-philosophy|5": { + "hashes": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "bd8d3dbed15a8c34", + "hash_cont_tokens": "9f6ff69d23a48783" + }, + "truncated": 0, + "non-truncated": 1244, + "padded": 1244, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-prehistory|5": { + "hashes": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "3573cd87facbb7c5", + "hash_cont_tokens": "de469d2b981e32a3" + }, + "truncated": 0, + "non-truncated": 1296, + "padded": 1296, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_accounting|5": { + "hashes": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "17e721bc1a7cbb47", + "hash_cont_tokens": "c46f74d2dfc7b13b" + }, + "truncated": 0, + "non-truncated": 1128, + "padded": 1128, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_law|5": { + "hashes": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "c9f7583fff66d361", + "hash_cont_tokens": "2e590029ef41fbcd" + }, + "truncated": 0, + "non-truncated": 6136, + "padded": 6136, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_medicine|5": { + "hashes": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "40a933f829116f8d", + "hash_cont_tokens": "fe35cfa9c6ca802e" + }, + "truncated": 0, + "non-truncated": 1088, + "padded": 1088, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_psychology|5": { + "hashes": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "0dfb73a8eb3f692c", + "hash_cont_tokens": "f020fbddf72c8652" + }, + "truncated": 0, + "non-truncated": 2448, + "padded": 2448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-public_relations|5": { + "hashes": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "1710c6ba4c9f3cbd", + "hash_cont_tokens": "568f585a259965c1" + }, + "truncated": 0, + "non-truncated": 440, + "padded": 440, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-security_studies|5": { + "hashes": { + "hash_examples": "62bb8197e63d60d4", + 
"hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "32a03f1f22a6e103", + "hash_cont_tokens": "cc6fd7cccd64cd5d" + }, + "truncated": 0, + "non-truncated": 980, + "padded": 980, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-sociology|5": { + "hashes": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "828999f7624cbe7e", + "hash_cont_tokens": "c3a3bdfd177eed5b" + }, + "truncated": 0, + "non-truncated": 804, + "padded": 804, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hashes": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "42054621e718dbee", + "hash_cont_tokens": "2568d0e8e36fa959" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-virology|5": { + "hashes": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "6c4f0aa4dc859c04", + "hash_cont_tokens": "926cf60b0891f374" + }, + "truncated": 0, + "non-truncated": 664, + "padded": 664, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-world_religions|5": { + "hashes": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "6c75d44e092ff24f", + "hash_cont_tokens": "c525a5de974c1ea3" + }, + "truncated": 0, + "non-truncated": 684, + "padded": 684, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|truthfulqa:mc|0": { + "hashes": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "2738d7ed7075faa7", + "hash_cont_tokens": "c014154380b74b9e" + }, + "truncated": 0, + "non-truncated": 9996, + "padded": 9996, + "non-padded": 0, + "effective_few_shots": 0.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "d84d18e9a963753d", + "hash_full_prompts": "12b540783521a8e6", + "hash_input_tokens": "5c73a7dce6ccf737", + "hash_cont_tokens": "fb1646e2bdd5fc38" + }, + "total_evaluation_time_secondes": "49238.55923986435", + "truncated": 0, + "non-truncated": 111019, + "padded": 110926, + "non-padded": 93, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/TheBloke/Guanaco-3B-Uncensored-v2-GPTQ/results_2023-10-03T21-39-11.409465.json b/eval-results/TheBloke/Guanaco-3B-Uncensored-v2-GPTQ/results_2023-10-03T21-39-11.409465.json new file mode 100644 index 0000000000000000000000000000000000000000..0d25f4b2671baa415b7854856f3e0ba523727637 --- /dev/null +++ b/eval-results/TheBloke/Guanaco-3B-Uncensored-v2-GPTQ/results_2023-10-03T21-39-11.409465.json @@ -0,0 +1,1367 @@ +{ + "config_general": { + "model_name": "TheBloke/Guanaco-3B-Uncensored-v2-GPTQ", + "model_sha": "c80e2f01377d551ad17c8c9bac3f52578c38d653", + "model_size": "1.72 GB", + "model_dtype": "None", + "lighteval_sha": "0f318ecf002208468154899217b3ba7c6ae09374", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|arc:challenge|25": { + "acc": 0.3703071672354949, + "acc_stderr": 0.014111298751674948, + "acc_norm": 0.41638225255972694, + "acc_norm_stderr": 
0.014405618279436172 + }, + "harness|hellaswag|10": { + "acc": 0.4772953594901414, + "acc_stderr": 0.00498463428510162, + "acc_norm": 0.6475801633140809, + "acc_norm_stderr": 0.004767475366689784 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.28, + "acc_stderr": 0.04512608598542129, + "acc_norm": 0.28, + "acc_norm_stderr": 0.04512608598542129 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.24444444444444444, + "acc_stderr": 0.037125378336148665, + "acc_norm": 0.24444444444444444, + "acc_norm_stderr": 0.037125378336148665 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.34868421052631576, + "acc_stderr": 0.0387813988879761, + "acc_norm": 0.34868421052631576, + "acc_norm_stderr": 0.0387813988879761 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.24, + "acc_stderr": 0.04292346959909283, + "acc_norm": 0.24, + "acc_norm_stderr": 0.04292346959909283 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.2943396226415094, + "acc_stderr": 0.028049186315695248, + "acc_norm": 0.2943396226415094, + "acc_norm_stderr": 0.028049186315695248 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.24305555555555555, + "acc_stderr": 0.03586879280080341, + "acc_norm": 0.24305555555555555, + "acc_norm_stderr": 0.03586879280080341 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.27, + "acc_stderr": 0.044619604333847394, + "acc_norm": 0.27, + "acc_norm_stderr": 0.044619604333847394 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.23, + "acc_stderr": 0.04229525846816506, + "acc_norm": 0.23, + "acc_norm_stderr": 0.04229525846816506 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.31, + "acc_stderr": 0.04648231987117316, + "acc_norm": 0.31, + "acc_norm_stderr": 0.04648231987117316 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.2543352601156069, + "acc_stderr": 0.0332055644308557, + "acc_norm": 0.2543352601156069, + "acc_norm_stderr": 0.0332055644308557 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.13725490196078433, + "acc_stderr": 0.034240846698915216, + "acc_norm": 0.13725490196078433, + "acc_norm_stderr": 0.034240846698915216 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.29, + "acc_stderr": 0.045604802157206845, + "acc_norm": 0.29, + "acc_norm_stderr": 0.045604802157206845 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.19574468085106383, + "acc_stderr": 0.025937853139977148, + "acc_norm": 0.19574468085106383, + "acc_norm_stderr": 0.025937853139977148 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.2631578947368421, + "acc_stderr": 0.04142439719489361, + "acc_norm": 0.2631578947368421, + "acc_norm_stderr": 0.04142439719489361 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.2620689655172414, + "acc_stderr": 0.03664666337225256, + "acc_norm": 0.2620689655172414, + "acc_norm_stderr": 0.03664666337225256 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.2619047619047619, + "acc_stderr": 0.022644212615525214, + "acc_norm": 0.2619047619047619, + "acc_norm_stderr": 0.022644212615525214 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.2698412698412698, + "acc_stderr": 0.03970158273235172, + "acc_norm": 0.2698412698412698, + "acc_norm_stderr": 0.03970158273235172 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.3, + "acc_stderr": 0.046056618647183814, + "acc_norm": 0.3, + "acc_norm_stderr": 0.046056618647183814 + }, + 
"harness|hendrycksTest-high_school_biology|5": { + "acc": 0.2645161290322581, + "acc_stderr": 0.02509189237885928, + "acc_norm": 0.2645161290322581, + "acc_norm_stderr": 0.02509189237885928 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.26108374384236455, + "acc_stderr": 0.03090379695211449, + "acc_norm": 0.26108374384236455, + "acc_norm_stderr": 0.03090379695211449 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.25, + "acc_stderr": 0.04351941398892446, + "acc_norm": 0.25, + "acc_norm_stderr": 0.04351941398892446 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.26666666666666666, + "acc_stderr": 0.03453131801885415, + "acc_norm": 0.26666666666666666, + "acc_norm_stderr": 0.03453131801885415 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.3383838383838384, + "acc_stderr": 0.03371124142626302, + "acc_norm": 0.3383838383838384, + "acc_norm_stderr": 0.03371124142626302 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.23316062176165803, + "acc_stderr": 0.03051611137147601, + "acc_norm": 0.23316062176165803, + "acc_norm_stderr": 0.03051611137147601 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.2794871794871795, + "acc_stderr": 0.02275238883977683, + "acc_norm": 0.2794871794871795, + "acc_norm_stderr": 0.02275238883977683 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.26296296296296295, + "acc_stderr": 0.02684205787383371, + "acc_norm": 0.26296296296296295, + "acc_norm_stderr": 0.02684205787383371 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.24789915966386555, + "acc_stderr": 0.028047967224176892, + "acc_norm": 0.24789915966386555, + "acc_norm_stderr": 0.028047967224176892 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.304635761589404, + "acc_stderr": 0.03757949922943343, + "acc_norm": 0.304635761589404, + "acc_norm_stderr": 0.03757949922943343 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.26788990825688075, + "acc_stderr": 0.01898746225797865, + "acc_norm": 0.26788990825688075, + "acc_norm_stderr": 0.01898746225797865 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.20833333333333334, + "acc_stderr": 0.027696910713093936, + "acc_norm": 0.20833333333333334, + "acc_norm_stderr": 0.027696910713093936 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.25980392156862747, + "acc_stderr": 0.03077855467869326, + "acc_norm": 0.25980392156862747, + "acc_norm_stderr": 0.03077855467869326 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.2616033755274262, + "acc_stderr": 0.028609516716994934, + "acc_norm": 0.2616033755274262, + "acc_norm_stderr": 0.028609516716994934 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.18834080717488788, + "acc_stderr": 0.026241132996407252, + "acc_norm": 0.18834080717488788, + "acc_norm_stderr": 0.026241132996407252 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.2366412213740458, + "acc_stderr": 0.03727673575596918, + "acc_norm": 0.2366412213740458, + "acc_norm_stderr": 0.03727673575596918 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.38016528925619836, + "acc_stderr": 0.04431324501968432, + "acc_norm": 0.38016528925619836, + "acc_norm_stderr": 0.04431324501968432 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.2777777777777778, + "acc_stderr": 0.04330043749650741, + "acc_norm": 0.2777777777777778, + 
"acc_norm_stderr": 0.04330043749650741 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.2883435582822086, + "acc_stderr": 0.035590395316173425, + "acc_norm": 0.2883435582822086, + "acc_norm_stderr": 0.035590395316173425 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.23214285714285715, + "acc_stderr": 0.04007341809755806, + "acc_norm": 0.23214285714285715, + "acc_norm_stderr": 0.04007341809755806 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.32038834951456313, + "acc_stderr": 0.0462028408228004, + "acc_norm": 0.32038834951456313, + "acc_norm_stderr": 0.0462028408228004 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.2777777777777778, + "acc_stderr": 0.029343114798094455, + "acc_norm": 0.2777777777777778, + "acc_norm_stderr": 0.029343114798094455 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.23, + "acc_stderr": 0.04229525846816506, + "acc_norm": 0.23, + "acc_norm_stderr": 0.04229525846816506 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.2503192848020434, + "acc_stderr": 0.015491088951494576, + "acc_norm": 0.2503192848020434, + "acc_norm_stderr": 0.015491088951494576 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.28034682080924855, + "acc_stderr": 0.024182427496577605, + "acc_norm": 0.28034682080924855, + "acc_norm_stderr": 0.024182427496577605 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.24804469273743016, + "acc_stderr": 0.014444157808261466, + "acc_norm": 0.24804469273743016, + "acc_norm_stderr": 0.014444157808261466 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.2549019607843137, + "acc_stderr": 0.024954184324879905, + "acc_norm": 0.2549019607843137, + "acc_norm_stderr": 0.024954184324879905 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.2797427652733119, + "acc_stderr": 0.02549425935069491, + "acc_norm": 0.2797427652733119, + "acc_norm_stderr": 0.02549425935069491 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.25308641975308643, + "acc_stderr": 0.024191808600713002, + "acc_norm": 0.25308641975308643, + "acc_norm_stderr": 0.024191808600713002 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.2695035460992908, + "acc_stderr": 0.026469036818590627, + "acc_norm": 0.2695035460992908, + "acc_norm_stderr": 0.026469036818590627 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.2711864406779661, + "acc_stderr": 0.011354581451622985, + "acc_norm": 0.2711864406779661, + "acc_norm_stderr": 0.011354581451622985 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.16911764705882354, + "acc_stderr": 0.022770868010113025, + "acc_norm": 0.16911764705882354, + "acc_norm_stderr": 0.022770868010113025 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.23202614379084968, + "acc_stderr": 0.017077373377857006, + "acc_norm": 0.23202614379084968, + "acc_norm_stderr": 0.017077373377857006 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.3090909090909091, + "acc_stderr": 0.044262946482000985, + "acc_norm": 0.3090909090909091, + "acc_norm_stderr": 0.044262946482000985 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.2571428571428571, + "acc_stderr": 0.02797982353874455, + "acc_norm": 0.2571428571428571, + "acc_norm_stderr": 0.02797982353874455 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.24378109452736318, + "acc_stderr": 0.030360490154014652, + "acc_norm": 0.24378109452736318, + "acc_norm_stderr": 0.030360490154014652 + }, + 
"harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.23, + "acc_stderr": 0.04229525846816506, + "acc_norm": 0.23, + "acc_norm_stderr": 0.04229525846816506 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.2710843373493976, + "acc_stderr": 0.03460579907553026, + "acc_norm": 0.2710843373493976, + "acc_norm_stderr": 0.03460579907553026 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.30994152046783624, + "acc_stderr": 0.035469769593931624, + "acc_norm": 0.30994152046783624, + "acc_norm_stderr": 0.035469769593931624 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.21664626682986537, + "mc1_stderr": 0.014421468452506978, + "mc2": 0.3658408497762684, + "mc2_stderr": 0.013884287044021056 + }, + "all": { + "acc": 0.26796194534851764, + "acc_stderr": 0.0320582805520218, + "acc_norm": 0.27162906211374094, + "acc_norm_stderr": 0.032059588358959924, + "mc1": 0.21664626682986537, + "mc1_stderr": 0.014421468452506978, + "mc2": 0.3658408497762684, + "mc2_stderr": 0.013884287044021056 + } + }, + "versions": { + "harness|arc:challenge|25": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + 
"harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "all": 0 + }, + "config_tasks": { + "harness|arc:challenge": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness 
task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task" + }, + "summary_tasks": { + "harness|arc:challenge|25": { + "hashes": { + "hash_examples": "17b0cae357c0259e", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "573b1b078b6e9deb", + "hash_cont_tokens": "d9940905d0c552c9" + }, + "truncated": 0, + "non-truncated": 4687, + "padded": 4687, + "non-padded": 0, + "effective_few_shots": 25.0, + "num_truncated_few_shots": 0 + }, + "harness|hellaswag|10": { + "hashes": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "f0fd0caf4d4c1110", + "hash_cont_tokens": "5a151675bb24bc7e" + }, + "truncated": 0, + "non-truncated": 40168, + "padded": 40123, + "non-padded": 45, + "effective_few_shots": 10.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hashes": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "f076ac6b177ca28c", + "hash_cont_tokens": "74c639e56bb475af" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-anatomy|5": { + "hashes": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "059827606e6b0780", + "hash_cont_tokens": "ec7e2288ab5f1ce9" + }, + "truncated": 0, + "non-truncated": 540, + "padded": 540, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-astronomy|5": { + "hashes": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "1dd0dab88aa9e4b2", + "hash_cont_tokens": "044d83cac9e59cbb" + }, + "truncated": 0, + "non-truncated": 608, + "padded": 608, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-business_ethics|5": { + "hashes": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "d51eb5246cbe2173", + "hash_cont_tokens": "74c639e56bb475af" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hashes": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "2337a7f17800c6ec", + "hash_cont_tokens": "bc82b3cc5072f164" + }, + "truncated": 0, + "non-truncated": 1060, + "padded": 1060, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_biology|5": { + "hashes": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "e394ebbb8ceace76", 
+ "hash_cont_tokens": "3bc45e0c4b6d612d" + }, + "truncated": 0, + "non-truncated": 576, + "padded": 576, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_chemistry|5": { + "hashes": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "9221fbdf710a6f67", + "hash_cont_tokens": "74c639e56bb475af" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_computer_science|5": { + "hashes": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "ebe2748d21b2ba41", + "hash_cont_tokens": "74c639e56bb475af" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_mathematics|5": { + "hashes": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "bfecefb08ffb7faa", + "hash_cont_tokens": "74c639e56bb475af" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_medicine|5": { + "hashes": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "2ac8aec9025dc58b", + "hash_cont_tokens": "16f654508cdc19c4" + }, + "truncated": 0, + "non-truncated": 692, + "padded": 680, + "non-padded": 12, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_physics|5": { + "hashes": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "faf44c77f43368ef", + "hash_cont_tokens": "a3a24586c7218684" + }, + "truncated": 0, + "non-truncated": 408, + "padded": 408, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-computer_security|5": { + "hashes": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "280c7f12abde10a5", + "hash_cont_tokens": "74c639e56bb475af" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hashes": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "217a841c86d2d992", + "hash_cont_tokens": "43818b3dc0c7496f" + }, + "truncated": 0, + "non-truncated": 940, + "padded": 940, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-econometrics|5": { + "hashes": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "354267c0f98aad3b", + "hash_cont_tokens": "cff195e157be949a" + }, + "truncated": 0, + "non-truncated": 456, + "padded": 456, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hashes": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "4f5e8d051d04dde0", + "hash_cont_tokens": "7e14ccd1e2688bb8" + }, + "truncated": 0, + "non-truncated": 580, + "padded": 580, + "non-padded": 0, + 
"effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hashes": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "cd12bec1d5448dda", + "hash_cont_tokens": "62f751399492015f" + }, + "truncated": 0, + "non-truncated": 1512, + "padded": 1488, + "non-padded": 24, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-formal_logic|5": { + "hashes": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "c549e395850984fe", + "hash_cont_tokens": "961939aeb671801f" + }, + "truncated": 0, + "non-truncated": 504, + "padded": 504, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-global_facts|5": { + "hashes": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "81b06f5caa221f97", + "hash_cont_tokens": "74c639e56bb475af" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_biology|5": { + "hashes": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "ad626d781102fe51", + "hash_cont_tokens": "d7a3b149f7e83a27" + }, + "truncated": 0, + "non-truncated": 1240, + "padded": 1240, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hashes": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "2c0d3f2eacc6bbd5", + "hash_cont_tokens": "b2579ba9c4c7423e" + }, + "truncated": 0, + "non-truncated": 812, + "padded": 812, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hashes": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "aada51d0571db37b", + "hash_cont_tokens": "74c639e56bb475af" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hashes": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "6e47d696116edd01", + "hash_cont_tokens": "47a5e5973f50fe17" + }, + "truncated": 660, + "non-truncated": 0, + "padded": 0, + "non-padded": 660, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_geography|5": { + "hashes": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "0e8ee6c9e572e3c4", + "hash_cont_tokens": "812f79117b9593de" + }, + "truncated": 0, + "non-truncated": 792, + "padded": 792, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hashes": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "8fa2bf90de3b07e7", + "hash_cont_tokens": "5d4317e7acbf10e5" + }, + "truncated": 0, + "non-truncated": 772, + "padded": 772, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + 
"harness|hendrycksTest-high_school_macroeconomics|5": { + "hashes": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "fabb8f176276af2f", + "hash_cont_tokens": "8d468d84a686647d" + }, + "truncated": 0, + "non-truncated": 1560, + "padded": 1560, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hashes": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "3e86d13ef021476a", + "hash_cont_tokens": "5ef6ef9328ef5238" + }, + "truncated": 0, + "non-truncated": 1080, + "padded": 1069, + "non-padded": 11, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hashes": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "a132b5e9c9531b36", + "hash_cont_tokens": "4c32e38c066727bc" + }, + "truncated": 0, + "non-truncated": 952, + "padded": 952, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_physics|5": { + "hashes": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "f8f6fe5143776cb4", + "hash_cont_tokens": "bf29d47c925caba6" + }, + "truncated": 0, + "non-truncated": 604, + "padded": 604, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hashes": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "e28121967b27a315", + "hash_cont_tokens": "45f02bc4af60f027" + }, + "truncated": 0, + "non-truncated": 2180, + "padded": 2180, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hashes": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "bdbe90efb4a1c4ce", + "hash_cont_tokens": "b15e06c7557a0ca1" + }, + "truncated": 0, + "non-truncated": 864, + "padded": 864, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hashes": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "b8f58f05dc082011", + "hash_cont_tokens": "e5ab34a54e3f5b7c" + }, + "truncated": 816, + "non-truncated": 0, + "padded": 0, + "non-padded": 816, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hashes": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "3af911bf93093a85", + "hash_cont_tokens": "3b99b36f60960908" + }, + "truncated": 0, + "non-truncated": 948, + "padded": 948, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_aging|5": { + "hashes": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "1dd2240eb90b9a70", + "hash_cont_tokens": "7982edf99219e1b0" + }, + "truncated": 0, + "non-truncated": 892, + "padded": 892, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_sexuality|5": { + "hashes": { + "hash_examples": "7acb8fdad97f88a6", + 
"hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "f3de2f8181824a79", + "hash_cont_tokens": "ed73d516c5552dd0" + }, + "truncated": 0, + "non-truncated": 524, + "padded": 524, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-international_law|5": { + "hashes": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "0c2a1dd63cc74137", + "hash_cont_tokens": "6b17b0774106ed83" + }, + "truncated": 0, + "non-truncated": 484, + "padded": 484, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-jurisprudence|5": { + "hashes": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "08e3527985f33aab", + "hash_cont_tokens": "ddf5241e450210d6" + }, + "truncated": 0, + "non-truncated": 432, + "padded": 432, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hashes": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "bf7216a648529f68", + "hash_cont_tokens": "eb791fcbee9e0682" + }, + "truncated": 0, + "non-truncated": 652, + "padded": 648, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-machine_learning|5": { + "hashes": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "28f5891c956afd65", + "hash_cont_tokens": "ed6f21d7fec8cbab" + }, + "truncated": 0, + "non-truncated": 448, + "padded": 448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-management|5": { + "hashes": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "6de88b824d4f64c3", + "hash_cont_tokens": "27795e9c98bdeda8" + }, + "truncated": 0, + "non-truncated": 412, + "padded": 412, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-marketing|5": { + "hashes": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "5ef855d01044fd83", + "hash_cont_tokens": "874c5b0b496cbe8a" + }, + "truncated": 0, + "non-truncated": 936, + "padded": 936, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-medical_genetics|5": { + "hashes": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "1840e0b96d7e619e", + "hash_cont_tokens": "74c639e56bb475af" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-miscellaneous|5": { + "hashes": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "02483f6b53dc13ac", + "hash_cont_tokens": "313ee361fbdbab3c" + }, + "truncated": 0, + "non-truncated": 3132, + "padded": 3132, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_disputes|5": { + "hashes": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "93202e79d594dde4", + "hash_cont_tokens": "bfc9a5db80e5bba3" + }, + "truncated": 0, + "non-truncated": 1384, + 
"padded": 1356, + "non-padded": 28, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hashes": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "41c03f41d2ba9fe7", + "hash_cont_tokens": "b6b5d477136351d3" + }, + "truncated": 0, + "non-truncated": 3580, + "padded": 3580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-nutrition|5": { + "hashes": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "d83bcb6dd08809ac", + "hash_cont_tokens": "497c8d5896f280f6" + }, + "truncated": 0, + "non-truncated": 1224, + "padded": 1224, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-philosophy|5": { + "hashes": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "65c70474c8a5d205", + "hash_cont_tokens": "7916d26928435f1a" + }, + "truncated": 0, + "non-truncated": 1244, + "padded": 1244, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-prehistory|5": { + "hashes": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "4d4126ac9a91ac47", + "hash_cont_tokens": "88542052394953bd" + }, + "truncated": 0, + "non-truncated": 1296, + "padded": 1296, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_accounting|5": { + "hashes": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "592f80ad364d686a", + "hash_cont_tokens": "316cf4c387aa53e3" + }, + "truncated": 0, + "non-truncated": 1128, + "padded": 1128, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_law|5": { + "hashes": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "7f837322b1b62ac1", + "hash_cont_tokens": "6b31cf265df9b81b" + }, + "truncated": 16, + "non-truncated": 6120, + "padded": 6120, + "non-padded": 16, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_medicine|5": { + "hashes": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "05a8ef0dd10b4bba", + "hash_cont_tokens": "ce95c9ee454fdf64" + }, + "truncated": 0, + "non-truncated": 1088, + "padded": 1088, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_psychology|5": { + "hashes": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "3c7944f0b2c49f64", + "hash_cont_tokens": "0782e6576a3a8785" + }, + "truncated": 0, + "non-truncated": 2448, + "padded": 2448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-public_relations|5": { + "hashes": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "637e934bb716d5ec", + "hash_cont_tokens": "ca79966b90cda0ea" + }, + "truncated": 0, + "non-truncated": 440, + "padded": 440, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + 
"harness|hendrycksTest-security_studies|5": { + "hashes": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "3bad229573ed6a9c", + "hash_cont_tokens": "5e8fd3201be1a1f4" + }, + "truncated": 0, + "non-truncated": 980, + "padded": 980, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-sociology|5": { + "hashes": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "70a479e96d02d5d8", + "hash_cont_tokens": "f49476cf49b37d7c" + }, + "truncated": 0, + "non-truncated": 804, + "padded": 804, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hashes": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "0d690fc0db462440", + "hash_cont_tokens": "74c639e56bb475af" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-virology|5": { + "hashes": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "4b0fdf8e692dd640", + "hash_cont_tokens": "0065c4bbe6134c1c" + }, + "truncated": 0, + "non-truncated": 664, + "padded": 664, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-world_religions|5": { + "hashes": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "cfd7092dc8aacd96", + "hash_cont_tokens": "a111a36329479373" + }, + "truncated": 0, + "non-truncated": 684, + "padded": 684, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|truthfulqa:mc|0": { + "hashes": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "e820abadeb7ebfb3", + "hash_cont_tokens": "87e1c2b162b3e4c6" + }, + "truncated": 0, + "non-truncated": 9996, + "padded": 9996, + "non-padded": 0, + "effective_few_shots": 0.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "d84d18e9a963753d", + "hash_full_prompts": "12b540783521a8e6", + "hash_input_tokens": "c86f5765cd1e9dab", + "hash_cont_tokens": "70be634de3673b78" + }, + "total_evaluation_time_secondes": "2640.6203553676605", + "truncated": 1492, + "non-truncated": 109527, + "padded": 109403, + "non-padded": 1616, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/TheBloke/Guanaco-3B-Uncensored-v2-GPTQ/results_2023-10-29T01-04-16.242483.json b/eval-results/TheBloke/Guanaco-3B-Uncensored-v2-GPTQ/results_2023-10-29T01-04-16.242483.json new file mode 100644 index 0000000000000000000000000000000000000000..92d8f685255efdbb65464cd9862956942afbefc4 --- /dev/null +++ b/eval-results/TheBloke/Guanaco-3B-Uncensored-v2-GPTQ/results_2023-10-29T01-04-16.242483.json @@ -0,0 +1,107 @@ +{ + "config_general": { + "model_name": "TheBloke/Guanaco-3B-Uncensored-v2-GPTQ", + "model_sha": "c80e2f01377d551ad17c8c9bac3f52578c38d653", + "model_size": "1.72 GB", + "model_dtype": "None", + "lighteval_sha": "0f318ecf002208468154899217b3ba7c6ae09374", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|drop|3": { + "em": 0.0045092281879194635, + 
"em_stderr": 0.0006861346899095007, + "f1": 0.06708368288590627, + "f1_stderr": 0.0016014292768729186 + }, + "harness|gsm8k|5": { + "acc": 0.001516300227445034, + "acc_stderr": 0.0010717793485492612 + }, + "harness|winogrande|5": { + "acc": 0.6432517758484609, + "acc_stderr": 0.013463393958028728 + }, + "all": { + "em": 0.0045092281879194635, + "em_stderr": 0.0006861346899095007, + "f1": 0.06708368288590627, + "f1_stderr": 0.0016014292768729186, + "acc": 0.322384038037953, + "acc_stderr": 0.0072675866532889944 + } + }, + "versions": { + "harness|drop|3": 1, + "harness|gsm8k|5": 0, + "harness|winogrande|5": 0, + "all": 0 + }, + "config_tasks": { + "harness|drop": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|drop|3": { + "hashes": { + "hash_examples": "1d27416e8324e9a3", + "hash_full_prompts": "a5513ff9a741b385", + "hash_input_tokens": "4bf3f6ba1bae765a", + "hash_cont_tokens": "5b9d6675c4e216c1" + }, + "truncated": 439, + "non-truncated": 9097, + "padded": 0, + "non-padded": 9536, + "effective_few_shots": 3.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "ef516f9ffbe76423", + "hash_cont_tokens": "cb99d9d085450bef" + }, + "truncated": 0, + "non-truncated": 1319, + "padded": 0, + "non-padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "c469718508f43cab", + "hash_cont_tokens": "87eeb79172195781" + }, + "truncated": 0, + "non-truncated": 2534, + "padded": 2456, + "non-padded": 78, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "9b4d8993161e637d", + "hash_full_prompts": "08215e527b7e60a5", + "hash_input_tokens": "401c6c49053f17ab", + "hash_cont_tokens": "38bee1edc4b2ef9b" + }, + "total_evaluation_time_secondes": "7495.541535377502", + "truncated": 439, + "non-truncated": 12950, + "padded": 2456, + "non-padded": 10933, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/TheBloke/Kimiko-13B-fp16/results_2023-08-21T21-08-02.539395.json b/eval-results/TheBloke/Kimiko-13B-fp16/results_2023-08-21T21-08-02.539395.json new file mode 100644 index 0000000000000000000000000000000000000000..2a5c8cf86e6c1eef5bb46899ade875097c8ebdb2 --- /dev/null +++ b/eval-results/TheBloke/Kimiko-13B-fp16/results_2023-08-21T21-08-02.539395.json @@ -0,0 +1,1365 @@ +{ + "results": { + "harness|arc:challenge|25": { + "acc": 0.552901023890785, + "acc_stderr": 0.014529380160526843, + "acc_norm": 0.5921501706484642, + "acc_norm_stderr": 0.0143610972884497 + }, + "harness|hellaswag|10": { + "acc": 0.617307309300936, + "acc_stderr": 0.004850508945116088, + "acc_norm": 0.823541127265485, + "acc_norm_stderr": 0.003804310123682778 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.32, + "acc_stderr": 0.04688261722621505, + "acc_norm": 0.32, + "acc_norm_stderr": 0.04688261722621505 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.4888888888888889, + "acc_stderr": 0.04318275491977976, + "acc_norm": 0.4888888888888889, + "acc_norm_stderr": 0.04318275491977976 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.5657894736842105, + "acc_stderr": 0.04033565667848319, + "acc_norm": 0.5657894736842105, + 
"acc_norm_stderr": 0.04033565667848319 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.56, + "acc_stderr": 0.04988876515698589, + "acc_norm": 0.56, + "acc_norm_stderr": 0.04988876515698589 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.630188679245283, + "acc_stderr": 0.029711421880107933, + "acc_norm": 0.630188679245283, + "acc_norm_stderr": 0.029711421880107933 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.5833333333333334, + "acc_stderr": 0.041227287076512825, + "acc_norm": 0.5833333333333334, + "acc_norm_stderr": 0.041227287076512825 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.4, + "acc_stderr": 0.04923659639173309, + "acc_norm": 0.4, + "acc_norm_stderr": 0.04923659639173309 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.44, + "acc_stderr": 0.04988876515698589, + "acc_norm": 0.44, + "acc_norm_stderr": 0.04988876515698589 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.34, + "acc_stderr": 0.04760952285695235, + "acc_norm": 0.34, + "acc_norm_stderr": 0.04760952285695235 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.5202312138728323, + "acc_stderr": 0.03809342081273957, + "acc_norm": 0.5202312138728323, + "acc_norm_stderr": 0.03809342081273957 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.28431372549019607, + "acc_stderr": 0.04488482852329017, + "acc_norm": 0.28431372549019607, + "acc_norm_stderr": 0.04488482852329017 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.69, + "acc_stderr": 0.04648231987117316, + "acc_norm": 0.69, + "acc_norm_stderr": 0.04648231987117316 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.4085106382978723, + "acc_stderr": 0.03213418026701576, + "acc_norm": 0.4085106382978723, + "acc_norm_stderr": 0.03213418026701576 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.2631578947368421, + "acc_stderr": 0.04142439719489361, + "acc_norm": 0.2631578947368421, + "acc_norm_stderr": 0.04142439719489361 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.5172413793103449, + "acc_stderr": 0.04164188720169375, + "acc_norm": 0.5172413793103449, + "acc_norm_stderr": 0.04164188720169375 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.3148148148148148, + "acc_stderr": 0.023919984164047732, + "acc_norm": 0.3148148148148148, + "acc_norm_stderr": 0.023919984164047732 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.35714285714285715, + "acc_stderr": 0.042857142857142816, + "acc_norm": 0.35714285714285715, + "acc_norm_stderr": 0.042857142857142816 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.33, + "acc_stderr": 0.047258156262526045, + "acc_norm": 0.33, + "acc_norm_stderr": 0.047258156262526045 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.667741935483871, + "acc_stderr": 0.0267955608481228, + "acc_norm": 0.667741935483871, + "acc_norm_stderr": 0.0267955608481228 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.4236453201970443, + "acc_stderr": 0.03476725747649038, + "acc_norm": 0.4236453201970443, + "acc_norm_stderr": 0.03476725747649038 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.58, + "acc_stderr": 0.049604496374885836, + "acc_norm": 0.58, + "acc_norm_stderr": 0.049604496374885836 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.6606060606060606, + "acc_stderr": 0.03697442205031596, + "acc_norm": 0.6606060606060606, + 
"acc_norm_stderr": 0.03697442205031596 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.6919191919191919, + "acc_stderr": 0.032894773300986155, + "acc_norm": 0.6919191919191919, + "acc_norm_stderr": 0.032894773300986155 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.8290155440414507, + "acc_stderr": 0.02717121368316455, + "acc_norm": 0.8290155440414507, + "acc_norm_stderr": 0.02717121368316455 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.5205128205128206, + "acc_stderr": 0.02532966316348994, + "acc_norm": 0.5205128205128206, + "acc_norm_stderr": 0.02532966316348994 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.3037037037037037, + "acc_stderr": 0.028037929969114986, + "acc_norm": 0.3037037037037037, + "acc_norm_stderr": 0.028037929969114986 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.5798319327731093, + "acc_stderr": 0.03206183783236152, + "acc_norm": 0.5798319327731093, + "acc_norm_stderr": 0.03206183783236152 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.3443708609271523, + "acc_stderr": 0.03879687024073327, + "acc_norm": 0.3443708609271523, + "acc_norm_stderr": 0.03879687024073327 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.7596330275229358, + "acc_stderr": 0.01832060732096407, + "acc_norm": 0.7596330275229358, + "acc_norm_stderr": 0.01832060732096407 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.4861111111111111, + "acc_stderr": 0.03408655867977748, + "acc_norm": 0.4861111111111111, + "acc_norm_stderr": 0.03408655867977748 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.7450980392156863, + "acc_stderr": 0.030587591351604257, + "acc_norm": 0.7450980392156863, + "acc_norm_stderr": 0.030587591351604257 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.7341772151898734, + "acc_stderr": 0.028756799629658342, + "acc_norm": 0.7341772151898734, + "acc_norm_stderr": 0.028756799629658342 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.6322869955156951, + "acc_stderr": 0.03236198350928276, + "acc_norm": 0.6322869955156951, + "acc_norm_stderr": 0.03236198350928276 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.6717557251908397, + "acc_stderr": 0.041184385658062976, + "acc_norm": 0.6717557251908397, + "acc_norm_stderr": 0.041184385658062976 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.6942148760330579, + "acc_stderr": 0.042059539338841226, + "acc_norm": 0.6942148760330579, + "acc_norm_stderr": 0.042059539338841226 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.7222222222222222, + "acc_stderr": 0.04330043749650742, + "acc_norm": 0.7222222222222222, + "acc_norm_stderr": 0.04330043749650742 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.6932515337423313, + "acc_stderr": 0.03623089915724146, + "acc_norm": 0.6932515337423313, + "acc_norm_stderr": 0.03623089915724146 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.2857142857142857, + "acc_stderr": 0.042878587513404565, + "acc_norm": 0.2857142857142857, + "acc_norm_stderr": 0.042878587513404565 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.7961165048543689, + "acc_stderr": 0.03989139859531771, + "acc_norm": 0.7961165048543689, + "acc_norm_stderr": 0.03989139859531771 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.7863247863247863, + "acc_stderr": 0.026853450377009168, + "acc_norm": 
0.7863247863247863, + "acc_norm_stderr": 0.026853450377009168 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.57, + "acc_stderr": 0.049756985195624284, + "acc_norm": 0.57, + "acc_norm_stderr": 0.049756985195624284 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.7432950191570882, + "acc_stderr": 0.015620480263064512, + "acc_norm": 0.7432950191570882, + "acc_norm_stderr": 0.015620480263064512 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.6445086705202312, + "acc_stderr": 0.025770292082977257, + "acc_norm": 0.6445086705202312, + "acc_norm_stderr": 0.025770292082977257 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.3396648044692737, + "acc_stderr": 0.015839400406212494, + "acc_norm": 0.3396648044692737, + "acc_norm_stderr": 0.015839400406212494 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.6437908496732027, + "acc_stderr": 0.02742047766262923, + "acc_norm": 0.6437908496732027, + "acc_norm_stderr": 0.02742047766262923 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.6334405144694534, + "acc_stderr": 0.027368078243971635, + "acc_norm": 0.6334405144694534, + "acc_norm_stderr": 0.027368078243971635 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.6327160493827161, + "acc_stderr": 0.026822801759507894, + "acc_norm": 0.6327160493827161, + "acc_norm_stderr": 0.026822801759507894 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.4078014184397163, + "acc_stderr": 0.029316011776343555, + "acc_norm": 0.4078014184397163, + "acc_norm_stderr": 0.029316011776343555 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.42633637548891784, + "acc_stderr": 0.012630884771599698, + "acc_norm": 0.42633637548891784, + "acc_norm_stderr": 0.012630884771599698 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.5404411764705882, + "acc_stderr": 0.03027332507734576, + "acc_norm": 0.5404411764705882, + "acc_norm_stderr": 0.03027332507734576 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.5490196078431373, + "acc_stderr": 0.020130388312904528, + "acc_norm": 0.5490196078431373, + "acc_norm_stderr": 0.020130388312904528 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.6272727272727273, + "acc_stderr": 0.04631381319425465, + "acc_norm": 0.6272727272727273, + "acc_norm_stderr": 0.04631381319425465 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.6448979591836734, + "acc_stderr": 0.030635655150387638, + "acc_norm": 0.6448979591836734, + "acc_norm_stderr": 0.030635655150387638 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.7512437810945274, + "acc_stderr": 0.030567675938916714, + "acc_norm": 0.7512437810945274, + "acc_norm_stderr": 0.030567675938916714 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.82, + "acc_stderr": 0.038612291966536934, + "acc_norm": 0.82, + "acc_norm_stderr": 0.038612291966536934 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.463855421686747, + "acc_stderr": 0.03882310850890593, + "acc_norm": 0.463855421686747, + "acc_norm_stderr": 0.03882310850890593 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.7719298245614035, + "acc_stderr": 0.032180937956023566, + "acc_norm": 0.7719298245614035, + "acc_norm_stderr": 0.032180937956023566 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.2766217870257038, + "mc1_stderr": 0.015659605755326923, + "mc2": 0.3955278979709314, + "mc2_stderr": 0.01443420009313445 + }, + "all": { + "acc": 0.5593608326190542, + "acc_stderr": 
0.03428929602438075, + "acc_norm": 0.5635215608346852, + "acc_norm_stderr": 0.03426871158889753, + "mc1": 0.2766217870257038, + "mc1_stderr": 0.015659605755326923, + "mc2": 0.3955278979709314, + "mc2_stderr": 0.01443420009313445 + } + }, + "versions": { + "harness|arc:challenge|25": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "all": 0 + }, + "config_general": { + "model_name": "TheBloke/Kimiko-13B-fp16", + "model_sha": "27868769e2d6b1af46337f0997c71b0577952a3d", + "model_dtype": "torch.float16", + "lighteval_sha": "9a6ba3212080b87510982c30fdec55b87dcab0c7", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null 
+ }, + "config_tasks": { + "harness|arc:challenge": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + 
"harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task" + }, + "summary_tasks": { + "harness|arc:challenge|25": { + "hashes": { + "hash_examples": "17b0cae357c0259e", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "3722289b79076c44", + "hash_cont_tokens": "8210decc6ff6f7df" + }, + "truncated": 0, + "non-truncated": 4687, + "padded": 4687, + "non-padded": 0, + "effective_few_shots": 25.0, + "num_truncated_few_shots": 0 + }, + "harness|hellaswag|10": { + "hashes": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "ececd684171f1ef2", + "hash_cont_tokens": "b3b9e9017afa63af" + }, + "truncated": 0, + "non-truncated": 40168, + "padded": 40113, + "non-padded": 55, + "effective_few_shots": 10.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hashes": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "c54ff61ad0273dd7", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-anatomy|5": { + "hashes": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "be31a1e22aef5f90", + "hash_cont_tokens": "f11971a765cb609f" + }, + "truncated": 0, + "non-truncated": 540, + "padded": 540, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-astronomy|5": { + "hashes": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "277a7b1fad566940", + "hash_cont_tokens": "bf30e5d3f48250cb" + }, + "truncated": 0, + "non-truncated": 608, + "padded": 608, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-business_ethics|5": { + "hashes": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "ba552605bc116de5", + "hash_cont_tokens": "bc1dd9b2d995eb61" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hashes": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "428c7563d0b98ab9", + "hash_cont_tokens": "890a119624b3b935" + }, + "truncated": 0, + "non-truncated": 1060, + "padded": 1060, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_biology|5": { + "hashes": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "da036601573942e2", + "hash_cont_tokens": "875cde3af7a0ee14" + }, + "truncated": 0, + "non-truncated": 576, + "padded": 576, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_chemistry|5": { + "hashes": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "94e0196d6aded13d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 
400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_computer_science|5": { + "hashes": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "6e4d0f4a8d36690b", + "hash_cont_tokens": "ffc0fe414cdc4a83" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_mathematics|5": { + "hashes": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "614054d17109a25d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_medicine|5": { + "hashes": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "081bb2b524defd1c", + "hash_cont_tokens": "1f88b00d41957d82" + }, + "truncated": 0, + "non-truncated": 692, + "padded": 692, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_physics|5": { + "hashes": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "5421d9a1af86cbd4", + "hash_cont_tokens": "f7b8097afc16a47c" + }, + "truncated": 0, + "non-truncated": 408, + "padded": 408, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-computer_security|5": { + "hashes": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "5e6b70ecb333cf18", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hashes": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "c2ef11a87264ceed", + "hash_cont_tokens": "aa0e8bc655f2f641" + }, + "truncated": 0, + "non-truncated": 940, + "padded": 940, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-econometrics|5": { + "hashes": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "ecaccd912a4c3978", + "hash_cont_tokens": "bfb7e3c3c88313f1" + }, + "truncated": 0, + "non-truncated": 456, + "padded": 456, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hashes": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "1590c84291399be8", + "hash_cont_tokens": "2425a3f084a591ef" + }, + "truncated": 0, + "non-truncated": 580, + "padded": 580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hashes": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "3269597f715b0da1", + "hash_cont_tokens": "f52691aef15a407b" + }, + "truncated": 0, + "non-truncated": 1512, + "padded": 1512, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + 
"harness|hendrycksTest-formal_logic|5": { + "hashes": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "a2800d20f3ab8d7c", + "hash_cont_tokens": "f515d598d9c21263" + }, + "truncated": 0, + "non-truncated": 504, + "padded": 504, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-global_facts|5": { + "hashes": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "94ed44b3772505ad", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_biology|5": { + "hashes": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "24423acb928db768", + "hash_cont_tokens": "bd85a4156a3613ee" + }, + "truncated": 0, + "non-truncated": 1240, + "padded": 1240, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hashes": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "831ff35c474e5cef", + "hash_cont_tokens": "a95c97af1c14e068" + }, + "truncated": 0, + "non-truncated": 812, + "padded": 812, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hashes": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "a20a96b44dcc5b30", + "hash_cont_tokens": "8abfedef914e33c9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hashes": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "5002f4ac8b1562ca", + "hash_cont_tokens": "674fc454bdc5ac93" + }, + "truncated": 0, + "non-truncated": 660, + "padded": 656, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_geography|5": { + "hashes": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "7c5547c7da5bc793", + "hash_cont_tokens": "03a5012b916274ea" + }, + "truncated": 0, + "non-truncated": 792, + "padded": 792, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hashes": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "f62991cb6a496b05", + "hash_cont_tokens": "a83effb8f76b7d7c" + }, + "truncated": 0, + "non-truncated": 772, + "padded": 772, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hashes": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "4cef2aff6e3d59ed", + "hash_cont_tokens": "c583432ad27fcfe0" + }, + "truncated": 0, + "non-truncated": 1560, + "padded": 1560, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hashes": { + "hash_examples": 
"1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "6e2577ea4082ed2b", + "hash_cont_tokens": "24f5dc613660300b" + }, + "truncated": 0, + "non-truncated": 1080, + "padded": 1080, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hashes": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "c5fc9aeb1079c8e4", + "hash_cont_tokens": "f47f041de50333b9" + }, + "truncated": 0, + "non-truncated": 952, + "padded": 952, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_physics|5": { + "hashes": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "555fc385cffa84ca", + "hash_cont_tokens": "ba2efcd283e938cc" + }, + "truncated": 0, + "non-truncated": 604, + "padded": 604, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hashes": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "febd23cbf9973b7f", + "hash_cont_tokens": "942069cd363844d9" + }, + "truncated": 0, + "non-truncated": 2180, + "padded": 2180, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hashes": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "400e55b56ee6fbd7", + "hash_cont_tokens": "955ed42b6f7fa019" + }, + "truncated": 0, + "non-truncated": 864, + "padded": 864, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hashes": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "c639cce12a46ebad", + "hash_cont_tokens": "cdd0b3dc06d933e5" + }, + "truncated": 0, + "non-truncated": 816, + "padded": 816, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hashes": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "b9762065cce6f3a6", + "hash_cont_tokens": "9a864184946033ac" + }, + "truncated": 0, + "non-truncated": 948, + "padded": 948, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_aging|5": { + "hashes": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "541a75f071dcf579", + "hash_cont_tokens": "142a4a8a1138a214" + }, + "truncated": 0, + "non-truncated": 892, + "padded": 892, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_sexuality|5": { + "hashes": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "04269e5c5a257dd9", + "hash_cont_tokens": "bc54813e809b796d" + }, + "truncated": 0, + "non-truncated": 524, + "padded": 524, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-international_law|5": { + "hashes": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "d93ba9d9d38e4397", + 
"hash_cont_tokens": "dc45b45fcda18e5d" + }, + "truncated": 0, + "non-truncated": 484, + "padded": 484, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-jurisprudence|5": { + "hashes": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "9eeaccd2698b4f5a", + "hash_cont_tokens": "e3a8cd951b6e3469" + }, + "truncated": 0, + "non-truncated": 432, + "padded": 432, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hashes": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "b4f08f544f2b7576", + "hash_cont_tokens": "1e80dbd30f6453d5" + }, + "truncated": 0, + "non-truncated": 652, + "padded": 648, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-machine_learning|5": { + "hashes": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "900c2a51f1174b9f", + "hash_cont_tokens": "9b37da7777378ca9" + }, + "truncated": 0, + "non-truncated": 448, + "padded": 448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-management|5": { + "hashes": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "6b36efb4689c6eca", + "hash_cont_tokens": "a01d6d39a83c4597" + }, + "truncated": 0, + "non-truncated": 412, + "padded": 412, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-marketing|5": { + "hashes": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "2aaac78a0cfed47a", + "hash_cont_tokens": "6aeaed4d823c98aa" + }, + "truncated": 0, + "non-truncated": 936, + "padded": 936, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-medical_genetics|5": { + "hashes": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "886ca823b41c094a", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-miscellaneous|5": { + "hashes": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "72fd71de7675e7d0", + "hash_cont_tokens": "9b0ab02a64603081" + }, + "truncated": 0, + "non-truncated": 3132, + "padded": 3132, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_disputes|5": { + "hashes": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "f3ca0dd8e7a1eb09", + "hash_cont_tokens": "8badf768f7b0467a" + }, + "truncated": 0, + "non-truncated": 1384, + "padded": 1354, + "non-padded": 30, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hashes": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "3e793631e951f23c", + "hash_cont_tokens": "32ae620376b2bbba" + }, + "truncated": 0, + "non-truncated": 3580, + "padded": 3580, + "non-padded": 0, + "effective_few_shots": 5.0, + 
"num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-nutrition|5": { + "hashes": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "59753c2144ea93af", + "hash_cont_tokens": "3071def75bacc404" + }, + "truncated": 0, + "non-truncated": 1224, + "padded": 1224, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-philosophy|5": { + "hashes": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "bd8d3dbed15a8c34", + "hash_cont_tokens": "9f6ff69d23a48783" + }, + "truncated": 0, + "non-truncated": 1244, + "padded": 1244, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-prehistory|5": { + "hashes": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "3573cd87facbb7c5", + "hash_cont_tokens": "de469d2b981e32a3" + }, + "truncated": 0, + "non-truncated": 1296, + "padded": 1296, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_accounting|5": { + "hashes": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "17e721bc1a7cbb47", + "hash_cont_tokens": "c46f74d2dfc7b13b" + }, + "truncated": 0, + "non-truncated": 1128, + "padded": 1128, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_law|5": { + "hashes": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "c9f7583fff66d361", + "hash_cont_tokens": "2e590029ef41fbcd" + }, + "truncated": 0, + "non-truncated": 6136, + "padded": 6136, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_medicine|5": { + "hashes": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "40a933f829116f8d", + "hash_cont_tokens": "fe35cfa9c6ca802e" + }, + "truncated": 0, + "non-truncated": 1088, + "padded": 1088, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_psychology|5": { + "hashes": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "0dfb73a8eb3f692c", + "hash_cont_tokens": "f020fbddf72c8652" + }, + "truncated": 0, + "non-truncated": 2448, + "padded": 2448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-public_relations|5": { + "hashes": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "1710c6ba4c9f3cbd", + "hash_cont_tokens": "568f585a259965c1" + }, + "truncated": 0, + "non-truncated": 440, + "padded": 440, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-security_studies|5": { + "hashes": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "32a03f1f22a6e103", + "hash_cont_tokens": "cc6fd7cccd64cd5d" + }, + "truncated": 0, + "non-truncated": 980, + "padded": 980, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-sociology|5": { + "hashes": { + "hash_examples": "e7959df87dea8672", + 
"hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "828999f7624cbe7e", + "hash_cont_tokens": "c3a3bdfd177eed5b" + }, + "truncated": 0, + "non-truncated": 804, + "padded": 804, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hashes": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "42054621e718dbee", + "hash_cont_tokens": "2568d0e8e36fa959" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-virology|5": { + "hashes": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "6c4f0aa4dc859c04", + "hash_cont_tokens": "926cf60b0891f374" + }, + "truncated": 0, + "non-truncated": 664, + "padded": 664, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-world_religions|5": { + "hashes": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "6c75d44e092ff24f", + "hash_cont_tokens": "c525a5de974c1ea3" + }, + "truncated": 0, + "non-truncated": 684, + "padded": 684, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|truthfulqa:mc|0": { + "hashes": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "2738d7ed7075faa7", + "hash_cont_tokens": "c014154380b74b9e" + }, + "truncated": 0, + "non-truncated": 9996, + "padded": 9996, + "non-padded": 0, + "effective_few_shots": 0.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "d84d18e9a963753d", + "hash_full_prompts": "12b540783521a8e6", + "hash_input_tokens": "5c73a7dce6ccf737", + "hash_cont_tokens": "fb1646e2bdd5fc38" + }, + "total_evaluation_time_secondes": "6288.64780497551", + "truncated": 0, + "non-truncated": 111019, + "padded": 110926, + "non-padded": 93, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/TheBloke/Kimiko-13B-fp16/results_2023-10-22T20-29-03.807457.json b/eval-results/TheBloke/Kimiko-13B-fp16/results_2023-10-22T20-29-03.807457.json new file mode 100644 index 0000000000000000000000000000000000000000..911324a78e3852c8d7ce971e9baa1f57140c9527 --- /dev/null +++ b/eval-results/TheBloke/Kimiko-13B-fp16/results_2023-10-22T20-29-03.807457.json @@ -0,0 +1,107 @@ +{ + "config_general": { + "model_name": "TheBloke/Kimiko-13B-fp16", + "model_sha": "39b09fedc95d71a83c50c4d01cd9fb06f644880c", + "model_size": "24.32 GB", + "model_dtype": "torch.float16", + "lighteval_sha": "0f318ecf002208468154899217b3ba7c6ae09374", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|drop|3": { + "em": 0.0017827181208053692, + "em_stderr": 0.00043200973460388425, + "f1": 0.06370176174496635, + "f1_stderr": 0.0013821226935642709 + }, + "harness|gsm8k|5": { + "acc": 0.08794541319181198, + "acc_stderr": 0.007801162197487721 + }, + "harness|winogrande|5": { + "acc": 0.7671665351223362, + "acc_stderr": 0.011878201073856539 + }, + "all": { + "em": 0.0017827181208053692, + "em_stderr": 0.00043200973460388425, + "f1": 0.06370176174496635, + "f1_stderr": 0.0013821226935642709, + "acc": 0.42755597415707414, + "acc_stderr": 0.009839681635672129 + } + }, + 
"versions": { + "harness|drop|3": 1, + "harness|gsm8k|5": 0, + "harness|winogrande|5": 0, + "all": 0 + }, + "config_tasks": { + "harness|drop": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|drop|3": { + "hashes": { + "hash_examples": "1d27416e8324e9a3", + "hash_full_prompts": "a5513ff9a741b385", + "hash_input_tokens": "42076f0efbb50aa6", + "hash_cont_tokens": "297b73985285273f" + }, + "truncated": 3, + "non-truncated": 9533, + "padded": 0, + "non-padded": 9536, + "effective_few_shots": 3.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "bda342e47b5099b2", + "hash_cont_tokens": "dbec9329a723234d" + }, + "truncated": 0, + "non-truncated": 1319, + "padded": 0, + "non-padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "c0bedf98cb040854", + "hash_cont_tokens": "f08975ad6f2d5864" + }, + "truncated": 0, + "non-truncated": 2534, + "padded": 2432, + "non-padded": 102, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "9b4d8993161e637d", + "hash_full_prompts": "08215e527b7e60a5", + "hash_input_tokens": "a12f3e3c934bd78b", + "hash_cont_tokens": "ce63d3cb76586abd" + }, + "total_evaluation_time_secondes": "12292.558754444122", + "truncated": 3, + "non-truncated": 13386, + "padded": 2432, + "non-padded": 10957, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/TheBloke/Kimiko-v2-13B-fp16/results_2023-08-31T10-23-07.841871.json b/eval-results/TheBloke/Kimiko-v2-13B-fp16/results_2023-08-31T10-23-07.841871.json new file mode 100644 index 0000000000000000000000000000000000000000..40b8af4e909b0c5bdb525b4f49209b264f1c231e --- /dev/null +++ b/eval-results/TheBloke/Kimiko-v2-13B-fp16/results_2023-08-31T10-23-07.841871.json @@ -0,0 +1,1366 @@ +{ + "config_general": { + "model_name": "TheBloke/Kimiko-v2-13B-fp16", + "model_sha": "0fed305667508e50330e71a2d43e9cee5ea73783", + "model_dtype": "torch.float16", + "lighteval_sha": "4b9a38c2102259daeb17d1d0294fc2b4ce7f4e63", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|arc:challenge|25": { + "acc": 0.5750853242320819, + "acc_stderr": 0.014445698968520769, + "acc_norm": 0.6100682593856656, + "acc_norm_stderr": 0.014252959848892889 + }, + "harness|hellaswag|10": { + "acc": 0.6241784505078669, + "acc_stderr": 0.00483344455633862, + "acc_norm": 0.8332005576578371, + "acc_norm_stderr": 0.0037203482062127006 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.33, + "acc_stderr": 0.04725815626252606, + "acc_norm": 0.33, + "acc_norm_stderr": 0.04725815626252606 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.48148148148148145, + "acc_stderr": 0.043163785995113245, + "acc_norm": 0.48148148148148145, + "acc_norm_stderr": 0.043163785995113245 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.5394736842105263, + "acc_stderr": 0.04056242252249034, + "acc_norm": 0.5394736842105263, + "acc_norm_stderr": 0.04056242252249034 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.53, + "acc_stderr": 0.05016135580465919, + "acc_norm": 0.53, + 
"acc_norm_stderr": 0.05016135580465919 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.5773584905660377, + "acc_stderr": 0.03040233144576954, + "acc_norm": 0.5773584905660377, + "acc_norm_stderr": 0.03040233144576954 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.5694444444444444, + "acc_stderr": 0.04140685639111503, + "acc_norm": 0.5694444444444444, + "acc_norm_stderr": 0.04140685639111503 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.37, + "acc_stderr": 0.048523658709391, + "acc_norm": 0.37, + "acc_norm_stderr": 0.048523658709391 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.48, + "acc_stderr": 0.050211673156867795, + "acc_norm": 0.48, + "acc_norm_stderr": 0.050211673156867795 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.33, + "acc_stderr": 0.047258156262526045, + "acc_norm": 0.33, + "acc_norm_stderr": 0.047258156262526045 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.5375722543352601, + "acc_stderr": 0.0380168510452446, + "acc_norm": 0.5375722543352601, + "acc_norm_stderr": 0.0380168510452446 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.23529411764705882, + "acc_stderr": 0.04220773659171453, + "acc_norm": 0.23529411764705882, + "acc_norm_stderr": 0.04220773659171453 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.68, + "acc_stderr": 0.04688261722621505, + "acc_norm": 0.68, + "acc_norm_stderr": 0.04688261722621505 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.4297872340425532, + "acc_stderr": 0.03236214467715564, + "acc_norm": 0.4297872340425532, + "acc_norm_stderr": 0.03236214467715564 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.2719298245614035, + "acc_stderr": 0.04185774424022056, + "acc_norm": 0.2719298245614035, + "acc_norm_stderr": 0.04185774424022056 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.503448275862069, + "acc_stderr": 0.04166567577101579, + "acc_norm": 0.503448275862069, + "acc_norm_stderr": 0.04166567577101579 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.31216931216931215, + "acc_stderr": 0.02386520683697261, + "acc_norm": 0.31216931216931215, + "acc_norm_stderr": 0.02386520683697261 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.36507936507936506, + "acc_stderr": 0.04306241259127153, + "acc_norm": 0.36507936507936506, + "acc_norm_stderr": 0.04306241259127153 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.34, + "acc_stderr": 0.04760952285695235, + "acc_norm": 0.34, + "acc_norm_stderr": 0.04760952285695235 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.667741935483871, + "acc_stderr": 0.026795560848122804, + "acc_norm": 0.667741935483871, + "acc_norm_stderr": 0.026795560848122804 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.4039408866995074, + "acc_stderr": 0.0345245390382204, + "acc_norm": 0.4039408866995074, + "acc_norm_stderr": 0.0345245390382204 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.6, + "acc_stderr": 0.049236596391733084, + "acc_norm": 0.6, + "acc_norm_stderr": 0.049236596391733084 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.6848484848484848, + "acc_stderr": 0.0362773057502241, + "acc_norm": 0.6848484848484848, + "acc_norm_stderr": 0.0362773057502241 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.6666666666666666, + "acc_stderr": 0.033586181457325226, + "acc_norm": 
0.6666666666666666, + "acc_norm_stderr": 0.033586181457325226 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.8290155440414507, + "acc_stderr": 0.027171213683164542, + "acc_norm": 0.8290155440414507, + "acc_norm_stderr": 0.027171213683164542 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.5076923076923077, + "acc_stderr": 0.02534800603153477, + "acc_norm": 0.5076923076923077, + "acc_norm_stderr": 0.02534800603153477 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.2777777777777778, + "acc_stderr": 0.027309140588230182, + "acc_norm": 0.2777777777777778, + "acc_norm_stderr": 0.027309140588230182 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.5588235294117647, + "acc_stderr": 0.032252942323996406, + "acc_norm": 0.5588235294117647, + "acc_norm_stderr": 0.032252942323996406 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.2913907284768212, + "acc_stderr": 0.037101857261199946, + "acc_norm": 0.2913907284768212, + "acc_norm_stderr": 0.037101857261199946 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.7504587155963303, + "acc_stderr": 0.018553897629501628, + "acc_norm": 0.7504587155963303, + "acc_norm_stderr": 0.018553897629501628 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.46296296296296297, + "acc_stderr": 0.03400603625538271, + "acc_norm": 0.46296296296296297, + "acc_norm_stderr": 0.03400603625538271 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.7598039215686274, + "acc_stderr": 0.02998373305591362, + "acc_norm": 0.7598039215686274, + "acc_norm_stderr": 0.02998373305591362 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.7510548523206751, + "acc_stderr": 0.028146970599422644, + "acc_norm": 0.7510548523206751, + "acc_norm_stderr": 0.028146970599422644 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.6502242152466368, + "acc_stderr": 0.03200736719484503, + "acc_norm": 0.6502242152466368, + "acc_norm_stderr": 0.03200736719484503 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.6259541984732825, + "acc_stderr": 0.042438692422305246, + "acc_norm": 0.6259541984732825, + "acc_norm_stderr": 0.042438692422305246 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.7355371900826446, + "acc_stderr": 0.04026187527591207, + "acc_norm": 0.7355371900826446, + "acc_norm_stderr": 0.04026187527591207 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.6481481481481481, + "acc_stderr": 0.04616631111801713, + "acc_norm": 0.6481481481481481, + "acc_norm_stderr": 0.04616631111801713 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.6625766871165644, + "acc_stderr": 0.037149084099355745, + "acc_norm": 0.6625766871165644, + "acc_norm_stderr": 0.037149084099355745 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.2857142857142857, + "acc_stderr": 0.042878587513404565, + "acc_norm": 0.2857142857142857, + "acc_norm_stderr": 0.042878587513404565 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.7378640776699029, + "acc_stderr": 0.04354631077260594, + "acc_norm": 0.7378640776699029, + "acc_norm_stderr": 0.04354631077260594 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.7863247863247863, + "acc_stderr": 0.026853450377009168, + "acc_norm": 0.7863247863247863, + "acc_norm_stderr": 0.026853450377009168 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.58, + "acc_stderr": 0.04960449637488583, + 
"acc_norm": 0.58, + "acc_norm_stderr": 0.04960449637488583 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.756066411238825, + "acc_stderr": 0.015357212665829465, + "acc_norm": 0.756066411238825, + "acc_norm_stderr": 0.015357212665829465 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.6242774566473989, + "acc_stderr": 0.02607431485165708, + "acc_norm": 0.6242774566473989, + "acc_norm_stderr": 0.02607431485165708 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.3106145251396648, + "acc_stderr": 0.015476515438005566, + "acc_norm": 0.3106145251396648, + "acc_norm_stderr": 0.015476515438005566 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.630718954248366, + "acc_stderr": 0.02763417668960266, + "acc_norm": 0.630718954248366, + "acc_norm_stderr": 0.02763417668960266 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.6495176848874598, + "acc_stderr": 0.027098652621301754, + "acc_norm": 0.6495176848874598, + "acc_norm_stderr": 0.027098652621301754 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.6604938271604939, + "acc_stderr": 0.026348564412011624, + "acc_norm": 0.6604938271604939, + "acc_norm_stderr": 0.026348564412011624 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.3971631205673759, + "acc_stderr": 0.029189805673587095, + "acc_norm": 0.3971631205673759, + "acc_norm_stderr": 0.029189805673587095 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.42242503259452413, + "acc_stderr": 0.012615600475734921, + "acc_norm": 0.42242503259452413, + "acc_norm_stderr": 0.012615600475734921 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.5404411764705882, + "acc_stderr": 0.030273325077345755, + "acc_norm": 0.5404411764705882, + "acc_norm_stderr": 0.030273325077345755 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.5490196078431373, + "acc_stderr": 0.020130388312904528, + "acc_norm": 0.5490196078431373, + "acc_norm_stderr": 0.020130388312904528 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.6181818181818182, + "acc_stderr": 0.046534298079135075, + "acc_norm": 0.6181818181818182, + "acc_norm_stderr": 0.046534298079135075 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.636734693877551, + "acc_stderr": 0.030789051139030806, + "acc_norm": 0.636734693877551, + "acc_norm_stderr": 0.030789051139030806 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.7512437810945274, + "acc_stderr": 0.030567675938916714, + "acc_norm": 0.7512437810945274, + "acc_norm_stderr": 0.030567675938916714 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.85, + "acc_stderr": 0.035887028128263686, + "acc_norm": 0.85, + "acc_norm_stderr": 0.035887028128263686 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.4819277108433735, + "acc_stderr": 0.03889951252827217, + "acc_norm": 0.4819277108433735, + "acc_norm_stderr": 0.03889951252827217 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.7602339181286549, + "acc_stderr": 0.03274485211946956, + "acc_norm": 0.7602339181286549, + "acc_norm_stderr": 0.03274485211946956 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.26805385556915545, + "mc1_stderr": 0.015506204722834557, + "mc2": 0.4065291125077462, + "mc2_stderr": 0.014264280736472443 + }, + "all": { + "acc": 0.5533200657692665, + "acc_stderr": 0.0342809929173807, + "acc_norm": 0.5574557444523777, + "acc_norm_stderr": 0.034258860112808605, + "mc1": 0.26805385556915545, + "mc1_stderr": 0.015506204722834557, + "mc2": 
0.4065291125077462, + "mc2_stderr": 0.014264280736472443 + } + }, + "versions": { + "harness|arc:challenge|25": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "all": 0 + }, + "config_tasks": { + "harness|arc:challenge": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + 
"harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task" + }, + "summary_tasks": { + "harness|arc:challenge|25": { + "hashes": { + "hash_examples": "17b0cae357c0259e", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "3722289b79076c44", + 
"hash_cont_tokens": "8210decc6ff6f7df" + }, + "truncated": 0, + "non-truncated": 4687, + "padded": 4687, + "non-padded": 0, + "effective_few_shots": 25.0, + "num_truncated_few_shots": 0 + }, + "harness|hellaswag|10": { + "hashes": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "ececd684171f1ef2", + "hash_cont_tokens": "b3b9e9017afa63af" + }, + "truncated": 0, + "non-truncated": 40168, + "padded": 40113, + "non-padded": 55, + "effective_few_shots": 10.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hashes": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "c54ff61ad0273dd7", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-anatomy|5": { + "hashes": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "be31a1e22aef5f90", + "hash_cont_tokens": "f11971a765cb609f" + }, + "truncated": 0, + "non-truncated": 540, + "padded": 540, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-astronomy|5": { + "hashes": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "277a7b1fad566940", + "hash_cont_tokens": "bf30e5d3f48250cb" + }, + "truncated": 0, + "non-truncated": 608, + "padded": 608, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-business_ethics|5": { + "hashes": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "ba552605bc116de5", + "hash_cont_tokens": "bc1dd9b2d995eb61" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hashes": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "428c7563d0b98ab9", + "hash_cont_tokens": "890a119624b3b935" + }, + "truncated": 0, + "non-truncated": 1060, + "padded": 1060, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_biology|5": { + "hashes": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "da036601573942e2", + "hash_cont_tokens": "875cde3af7a0ee14" + }, + "truncated": 0, + "non-truncated": 576, + "padded": 576, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_chemistry|5": { + "hashes": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "94e0196d6aded13d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_computer_science|5": { + "hashes": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "6e4d0f4a8d36690b", + "hash_cont_tokens": "ffc0fe414cdc4a83" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + 
"num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_mathematics|5": { + "hashes": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "614054d17109a25d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_medicine|5": { + "hashes": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "081bb2b524defd1c", + "hash_cont_tokens": "1f88b00d41957d82" + }, + "truncated": 0, + "non-truncated": 692, + "padded": 692, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_physics|5": { + "hashes": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "5421d9a1af86cbd4", + "hash_cont_tokens": "f7b8097afc16a47c" + }, + "truncated": 0, + "non-truncated": 408, + "padded": 408, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-computer_security|5": { + "hashes": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "5e6b70ecb333cf18", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hashes": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "c2ef11a87264ceed", + "hash_cont_tokens": "aa0e8bc655f2f641" + }, + "truncated": 0, + "non-truncated": 940, + "padded": 940, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-econometrics|5": { + "hashes": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "ecaccd912a4c3978", + "hash_cont_tokens": "bfb7e3c3c88313f1" + }, + "truncated": 0, + "non-truncated": 456, + "padded": 456, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hashes": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "1590c84291399be8", + "hash_cont_tokens": "2425a3f084a591ef" + }, + "truncated": 0, + "non-truncated": 580, + "padded": 580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hashes": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "3269597f715b0da1", + "hash_cont_tokens": "f52691aef15a407b" + }, + "truncated": 0, + "non-truncated": 1512, + "padded": 1512, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-formal_logic|5": { + "hashes": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "a2800d20f3ab8d7c", + "hash_cont_tokens": "f515d598d9c21263" + }, + "truncated": 0, + "non-truncated": 504, + "padded": 504, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-global_facts|5": { + "hashes": { + "hash_examples": "371d70d743b2b89b", + 
"hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "94ed44b3772505ad", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_biology|5": { + "hashes": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "24423acb928db768", + "hash_cont_tokens": "bd85a4156a3613ee" + }, + "truncated": 0, + "non-truncated": 1240, + "padded": 1240, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hashes": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "831ff35c474e5cef", + "hash_cont_tokens": "a95c97af1c14e068" + }, + "truncated": 0, + "non-truncated": 812, + "padded": 812, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hashes": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "a20a96b44dcc5b30", + "hash_cont_tokens": "8abfedef914e33c9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hashes": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "5002f4ac8b1562ca", + "hash_cont_tokens": "674fc454bdc5ac93" + }, + "truncated": 0, + "non-truncated": 660, + "padded": 656, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_geography|5": { + "hashes": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "7c5547c7da5bc793", + "hash_cont_tokens": "03a5012b916274ea" + }, + "truncated": 0, + "non-truncated": 792, + "padded": 792, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hashes": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "f62991cb6a496b05", + "hash_cont_tokens": "a83effb8f76b7d7c" + }, + "truncated": 0, + "non-truncated": 772, + "padded": 772, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hashes": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "4cef2aff6e3d59ed", + "hash_cont_tokens": "c583432ad27fcfe0" + }, + "truncated": 0, + "non-truncated": 1560, + "padded": 1560, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hashes": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "6e2577ea4082ed2b", + "hash_cont_tokens": "24f5dc613660300b" + }, + "truncated": 0, + "non-truncated": 1080, + "padded": 1080, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hashes": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": 
"c5fc9aeb1079c8e4", + "hash_cont_tokens": "f47f041de50333b9" + }, + "truncated": 0, + "non-truncated": 952, + "padded": 952, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_physics|5": { + "hashes": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "555fc385cffa84ca", + "hash_cont_tokens": "ba2efcd283e938cc" + }, + "truncated": 0, + "non-truncated": 604, + "padded": 604, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hashes": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "febd23cbf9973b7f", + "hash_cont_tokens": "942069cd363844d9" + }, + "truncated": 0, + "non-truncated": 2180, + "padded": 2180, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hashes": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "400e55b56ee6fbd7", + "hash_cont_tokens": "955ed42b6f7fa019" + }, + "truncated": 0, + "non-truncated": 864, + "padded": 864, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hashes": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "c639cce12a46ebad", + "hash_cont_tokens": "cdd0b3dc06d933e5" + }, + "truncated": 0, + "non-truncated": 816, + "padded": 816, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hashes": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "b9762065cce6f3a6", + "hash_cont_tokens": "9a864184946033ac" + }, + "truncated": 0, + "non-truncated": 948, + "padded": 948, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_aging|5": { + "hashes": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "541a75f071dcf579", + "hash_cont_tokens": "142a4a8a1138a214" + }, + "truncated": 0, + "non-truncated": 892, + "padded": 892, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_sexuality|5": { + "hashes": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "04269e5c5a257dd9", + "hash_cont_tokens": "bc54813e809b796d" + }, + "truncated": 0, + "non-truncated": 524, + "padded": 524, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-international_law|5": { + "hashes": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "d93ba9d9d38e4397", + "hash_cont_tokens": "dc45b45fcda18e5d" + }, + "truncated": 0, + "non-truncated": 484, + "padded": 484, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-jurisprudence|5": { + "hashes": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "9eeaccd2698b4f5a", + "hash_cont_tokens": "e3a8cd951b6e3469" + }, + "truncated": 0, + "non-truncated": 432, + "padded": 432, + 
"non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hashes": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "b4f08f544f2b7576", + "hash_cont_tokens": "1e80dbd30f6453d5" + }, + "truncated": 0, + "non-truncated": 652, + "padded": 648, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-machine_learning|5": { + "hashes": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "900c2a51f1174b9f", + "hash_cont_tokens": "9b37da7777378ca9" + }, + "truncated": 0, + "non-truncated": 448, + "padded": 448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-management|5": { + "hashes": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "6b36efb4689c6eca", + "hash_cont_tokens": "a01d6d39a83c4597" + }, + "truncated": 0, + "non-truncated": 412, + "padded": 412, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-marketing|5": { + "hashes": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "2aaac78a0cfed47a", + "hash_cont_tokens": "6aeaed4d823c98aa" + }, + "truncated": 0, + "non-truncated": 936, + "padded": 936, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-medical_genetics|5": { + "hashes": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "886ca823b41c094a", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-miscellaneous|5": { + "hashes": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "72fd71de7675e7d0", + "hash_cont_tokens": "9b0ab02a64603081" + }, + "truncated": 0, + "non-truncated": 3132, + "padded": 3132, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_disputes|5": { + "hashes": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "f3ca0dd8e7a1eb09", + "hash_cont_tokens": "8badf768f7b0467a" + }, + "truncated": 0, + "non-truncated": 1384, + "padded": 1354, + "non-padded": 30, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hashes": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "3e793631e951f23c", + "hash_cont_tokens": "32ae620376b2bbba" + }, + "truncated": 0, + "non-truncated": 3580, + "padded": 3580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-nutrition|5": { + "hashes": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "59753c2144ea93af", + "hash_cont_tokens": "3071def75bacc404" + }, + "truncated": 0, + "non-truncated": 1224, + "padded": 1224, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-philosophy|5": { + "hashes": { + "hash_examples": "9b455b7d72811cc8", 
+ "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "bd8d3dbed15a8c34", + "hash_cont_tokens": "9f6ff69d23a48783" + }, + "truncated": 0, + "non-truncated": 1244, + "padded": 1244, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-prehistory|5": { + "hashes": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "3573cd87facbb7c5", + "hash_cont_tokens": "de469d2b981e32a3" + }, + "truncated": 0, + "non-truncated": 1296, + "padded": 1296, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_accounting|5": { + "hashes": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "17e721bc1a7cbb47", + "hash_cont_tokens": "c46f74d2dfc7b13b" + }, + "truncated": 0, + "non-truncated": 1128, + "padded": 1128, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_law|5": { + "hashes": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "c9f7583fff66d361", + "hash_cont_tokens": "2e590029ef41fbcd" + }, + "truncated": 0, + "non-truncated": 6136, + "padded": 6136, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_medicine|5": { + "hashes": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "40a933f829116f8d", + "hash_cont_tokens": "fe35cfa9c6ca802e" + }, + "truncated": 0, + "non-truncated": 1088, + "padded": 1088, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_psychology|5": { + "hashes": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "0dfb73a8eb3f692c", + "hash_cont_tokens": "f020fbddf72c8652" + }, + "truncated": 0, + "non-truncated": 2448, + "padded": 2448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-public_relations|5": { + "hashes": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "1710c6ba4c9f3cbd", + "hash_cont_tokens": "568f585a259965c1" + }, + "truncated": 0, + "non-truncated": 440, + "padded": 440, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-security_studies|5": { + "hashes": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "32a03f1f22a6e103", + "hash_cont_tokens": "cc6fd7cccd64cd5d" + }, + "truncated": 0, + "non-truncated": 980, + "padded": 980, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-sociology|5": { + "hashes": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "828999f7624cbe7e", + "hash_cont_tokens": "c3a3bdfd177eed5b" + }, + "truncated": 0, + "non-truncated": 804, + "padded": 804, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hashes": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "42054621e718dbee", + "hash_cont_tokens": "2568d0e8e36fa959" + }, + 
"truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-virology|5": { + "hashes": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "6c4f0aa4dc859c04", + "hash_cont_tokens": "926cf60b0891f374" + }, + "truncated": 0, + "non-truncated": 664, + "padded": 664, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-world_religions|5": { + "hashes": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "6c75d44e092ff24f", + "hash_cont_tokens": "c525a5de974c1ea3" + }, + "truncated": 0, + "non-truncated": 684, + "padded": 684, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|truthfulqa:mc|0": { + "hashes": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "2738d7ed7075faa7", + "hash_cont_tokens": "c014154380b74b9e" + }, + "truncated": 0, + "non-truncated": 9996, + "padded": 9996, + "non-padded": 0, + "effective_few_shots": 0.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "d84d18e9a963753d", + "hash_full_prompts": "12b540783521a8e6", + "hash_input_tokens": "5c73a7dce6ccf737", + "hash_cont_tokens": "fb1646e2bdd5fc38" + }, + "total_evaluation_time_secondes": "6358.395143032074", + "truncated": 0, + "non-truncated": 111019, + "padded": 110926, + "non-padded": 93, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/TheBloke/Kimiko-v2-13B-fp16/results_2023-10-22T17-23-39.395223.json b/eval-results/TheBloke/Kimiko-v2-13B-fp16/results_2023-10-22T17-23-39.395223.json new file mode 100644 index 0000000000000000000000000000000000000000..5a4f517851e10629c0820aa9cc4307c74122a824 --- /dev/null +++ b/eval-results/TheBloke/Kimiko-v2-13B-fp16/results_2023-10-22T17-23-39.395223.json @@ -0,0 +1,107 @@ +{ + "config_general": { + "model_name": "TheBloke/Kimiko-v2-13B-fp16", + "model_sha": "ab77260e813b75bad666f4c87de97515cebde966", + "model_size": "24.32 GB", + "model_dtype": "torch.float16", + "lighteval_sha": "0f318ecf002208468154899217b3ba7c6ae09374", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|drop|3": { + "em": 0.0017827181208053692, + "em_stderr": 0.00043200973460388544, + "f1": 0.06393351510067083, + "f1_stderr": 0.001389281752742565 + }, + "harness|gsm8k|5": { + "acc": 0.12509476876421532, + "acc_stderr": 0.009112601439849618 + }, + "harness|winogrande|5": { + "acc": 0.7679558011049724, + "acc_stderr": 0.01186414969182794 + }, + "all": { + "em": 0.0017827181208053692, + "em_stderr": 0.00043200973460388544, + "f1": 0.06393351510067083, + "f1_stderr": 0.001389281752742565, + "acc": 0.44652528493459387, + "acc_stderr": 0.01048837556583878 + } + }, + "versions": { + "harness|drop|3": 1, + "harness|gsm8k|5": 0, + "harness|winogrande|5": 0, + "all": 0 + }, + "config_tasks": { + "harness|drop": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|drop|3": { + "hashes": { + "hash_examples": "1d27416e8324e9a3", + "hash_full_prompts": "a5513ff9a741b385", + "hash_input_tokens": "42076f0efbb50aa6", + "hash_cont_tokens": "261246a799e45169" + }, + "truncated": 3, + 
"non-truncated": 9533, + "padded": 0, + "non-padded": 9536, + "effective_few_shots": 3.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "bda342e47b5099b2", + "hash_cont_tokens": "69b0614e03f1a265" + }, + "truncated": 0, + "non-truncated": 1319, + "padded": 0, + "non-padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "c0bedf98cb040854", + "hash_cont_tokens": "f08975ad6f2d5864" + }, + "truncated": 0, + "non-truncated": 2534, + "padded": 2432, + "non-padded": 102, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "9b4d8993161e637d", + "hash_full_prompts": "08215e527b7e60a5", + "hash_input_tokens": "a12f3e3c934bd78b", + "hash_cont_tokens": "857b7ba6cad728d9" + }, + "total_evaluation_time_secondes": "12390.529589891434", + "truncated": 3, + "non-truncated": 13386, + "padded": 2432, + "non-padded": 10957, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/TheBloke/Lemur-70B-Chat-v1-GPTQ/results_2023-08-31T06-46-13.725525.json b/eval-results/TheBloke/Lemur-70B-Chat-v1-GPTQ/results_2023-08-31T06-46-13.725525.json new file mode 100644 index 0000000000000000000000000000000000000000..8fc709c163fc42c50388e5c4b599293fd5d5373a --- /dev/null +++ b/eval-results/TheBloke/Lemur-70B-Chat-v1-GPTQ/results_2023-08-31T06-46-13.725525.json @@ -0,0 +1,1366 @@ +{ + "config_general": { + "model_name": "TheBloke/Lemur-70B-Chat-v1-GPTQ", + "model_sha": "12499165df1785f50df3e95940406032776401ea", + "model_dtype": "None", + "lighteval_sha": "4b9a38c2102259daeb17d1d0294fc2b4ce7f4e63", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|arc:challenge|25": { + "acc": 0.6075085324232082, + "acc_stderr": 0.014269634635670724, + "acc_norm": 0.6527303754266212, + "acc_norm_stderr": 0.013913034529620446 + }, + "harness|hellaswag|10": { + "acc": 0.6475801633140809, + "acc_stderr": 0.00476747536668976, + "acc_norm": 0.8440549691296555, + "acc_norm_stderr": 0.003620617550747387 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.29, + "acc_stderr": 0.045604802157206845, + "acc_norm": 0.29, + "acc_norm_stderr": 0.045604802157206845 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.5407407407407407, + "acc_stderr": 0.04304979692464242, + "acc_norm": 0.5407407407407407, + "acc_norm_stderr": 0.04304979692464242 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.6973684210526315, + "acc_stderr": 0.03738520676119667, + "acc_norm": 0.6973684210526315, + "acc_norm_stderr": 0.03738520676119667 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.67, + "acc_stderr": 0.04725815626252609, + "acc_norm": 0.67, + "acc_norm_stderr": 0.04725815626252609 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.6754716981132075, + "acc_stderr": 0.02881561571343211, + "acc_norm": 0.6754716981132075, + "acc_norm_stderr": 0.02881561571343211 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.7430555555555556, + "acc_stderr": 0.03653946969442099, + "acc_norm": 0.7430555555555556, + "acc_norm_stderr": 0.03653946969442099 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 
0.47, + "acc_stderr": 0.050161355804659205, + "acc_norm": 0.47, + "acc_norm_stderr": 0.050161355804659205 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.49, + "acc_stderr": 0.05024183937956912, + "acc_norm": 0.49, + "acc_norm_stderr": 0.05024183937956912 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.3, + "acc_stderr": 0.046056618647183814, + "acc_norm": 0.3, + "acc_norm_stderr": 0.046056618647183814 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.6358381502890174, + "acc_stderr": 0.03669072477416906, + "acc_norm": 0.6358381502890174, + "acc_norm_stderr": 0.03669072477416906 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.4117647058823529, + "acc_stderr": 0.04897104952726366, + "acc_norm": 0.4117647058823529, + "acc_norm_stderr": 0.04897104952726366 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.78, + "acc_stderr": 0.04163331998932263, + "acc_norm": 0.78, + "acc_norm_stderr": 0.04163331998932263 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.5914893617021276, + "acc_stderr": 0.032134180267015755, + "acc_norm": 0.5914893617021276, + "acc_norm_stderr": 0.032134180267015755 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.4298245614035088, + "acc_stderr": 0.046570472605949625, + "acc_norm": 0.4298245614035088, + "acc_norm_stderr": 0.046570472605949625 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.5448275862068965, + "acc_stderr": 0.04149886942192118, + "acc_norm": 0.5448275862068965, + "acc_norm_stderr": 0.04149886942192118 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.4603174603174603, + "acc_stderr": 0.025670080636909186, + "acc_norm": 0.4603174603174603, + "acc_norm_stderr": 0.025670080636909186 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.47619047619047616, + "acc_stderr": 0.04467062628403273, + "acc_norm": 0.47619047619047616, + "acc_norm_stderr": 0.04467062628403273 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.5, + "acc_stderr": 0.050251890762960605, + "acc_norm": 0.5, + "acc_norm_stderr": 0.050251890762960605 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.7612903225806451, + "acc_stderr": 0.02425107126220884, + "acc_norm": 0.7612903225806451, + "acc_norm_stderr": 0.02425107126220884 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.4630541871921182, + "acc_stderr": 0.035083705204426656, + "acc_norm": 0.4630541871921182, + "acc_norm_stderr": 0.035083705204426656 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.72, + "acc_stderr": 0.04512608598542128, + "acc_norm": 0.72, + "acc_norm_stderr": 0.04512608598542128 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.7878787878787878, + "acc_stderr": 0.03192271569548301, + "acc_norm": 0.7878787878787878, + "acc_norm_stderr": 0.03192271569548301 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.8131313131313131, + "acc_stderr": 0.027772533334218967, + "acc_norm": 0.8131313131313131, + "acc_norm_stderr": 0.027772533334218967 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.8963730569948186, + "acc_stderr": 0.02199531196364424, + "acc_norm": 0.8963730569948186, + "acc_norm_stderr": 0.02199531196364424 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.6615384615384615, + "acc_stderr": 0.023991500500313036, + "acc_norm": 0.6615384615384615, + "acc_norm_stderr": 0.023991500500313036 + 
}, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.3, + "acc_stderr": 0.027940457136228426, + "acc_norm": 0.3, + "acc_norm_stderr": 0.027940457136228426 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.6596638655462185, + "acc_stderr": 0.030778057422931673, + "acc_norm": 0.6596638655462185, + "acc_norm_stderr": 0.030778057422931673 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.41721854304635764, + "acc_stderr": 0.04026141497634611, + "acc_norm": 0.41721854304635764, + "acc_norm_stderr": 0.04026141497634611 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.8348623853211009, + "acc_stderr": 0.015919557829976064, + "acc_norm": 0.8348623853211009, + "acc_norm_stderr": 0.015919557829976064 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.5231481481481481, + "acc_stderr": 0.03406315360711507, + "acc_norm": 0.5231481481481481, + "acc_norm_stderr": 0.03406315360711507 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.8382352941176471, + "acc_stderr": 0.02584501798692692, + "acc_norm": 0.8382352941176471, + "acc_norm_stderr": 0.02584501798692692 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.8354430379746836, + "acc_stderr": 0.024135736240566932, + "acc_norm": 0.8354430379746836, + "acc_norm_stderr": 0.024135736240566932 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.7309417040358744, + "acc_stderr": 0.029763779406874972, + "acc_norm": 0.7309417040358744, + "acc_norm_stderr": 0.029763779406874972 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.7404580152671756, + "acc_stderr": 0.03844876139785271, + "acc_norm": 0.7404580152671756, + "acc_norm_stderr": 0.03844876139785271 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.8429752066115702, + "acc_stderr": 0.03321244842547128, + "acc_norm": 0.8429752066115702, + "acc_norm_stderr": 0.03321244842547128 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.7777777777777778, + "acc_stderr": 0.0401910747255735, + "acc_norm": 0.7777777777777778, + "acc_norm_stderr": 0.0401910747255735 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.7852760736196319, + "acc_stderr": 0.03226219377286775, + "acc_norm": 0.7852760736196319, + "acc_norm_stderr": 0.03226219377286775 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.49107142857142855, + "acc_stderr": 0.04745033255489123, + "acc_norm": 0.49107142857142855, + "acc_norm_stderr": 0.04745033255489123 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.7864077669902912, + "acc_stderr": 0.040580420156460344, + "acc_norm": 0.7864077669902912, + "acc_norm_stderr": 0.040580420156460344 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.8589743589743589, + "acc_stderr": 0.02280138253459753, + "acc_norm": 0.8589743589743589, + "acc_norm_stderr": 0.02280138253459753 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.62, + "acc_stderr": 0.048783173121456316, + "acc_norm": 0.62, + "acc_norm_stderr": 0.048783173121456316 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.8135376756066411, + "acc_stderr": 0.013927751372001512, + "acc_norm": 0.8135376756066411, + "acc_norm_stderr": 0.013927751372001512 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.7427745664739884, + "acc_stderr": 0.023532925431044287, + "acc_norm": 0.7427745664739884, + "acc_norm_stderr": 0.023532925431044287 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.5072625698324023, 
+ "acc_stderr": 0.0167207374051795, + "acc_norm": 0.5072625698324023, + "acc_norm_stderr": 0.0167207374051795 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.7058823529411765, + "acc_stderr": 0.026090162504279046, + "acc_norm": 0.7058823529411765, + "acc_norm_stderr": 0.026090162504279046 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.707395498392283, + "acc_stderr": 0.025839898334877983, + "acc_norm": 0.707395498392283, + "acc_norm_stderr": 0.025839898334877983 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.7469135802469136, + "acc_stderr": 0.024191808600712992, + "acc_norm": 0.7469135802469136, + "acc_norm_stderr": 0.024191808600712992 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.49645390070921985, + "acc_stderr": 0.02982674915328092, + "acc_norm": 0.49645390070921985, + "acc_norm_stderr": 0.02982674915328092 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.49934810951760106, + "acc_stderr": 0.012770225252255534, + "acc_norm": 0.49934810951760106, + "acc_norm_stderr": 0.012770225252255534 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.6580882352941176, + "acc_stderr": 0.028814722422254184, + "acc_norm": 0.6580882352941176, + "acc_norm_stderr": 0.028814722422254184 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.6830065359477124, + "acc_stderr": 0.01882421951270621, + "acc_norm": 0.6830065359477124, + "acc_norm_stderr": 0.01882421951270621 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.7, + "acc_stderr": 0.04389311454644287, + "acc_norm": 0.7, + "acc_norm_stderr": 0.04389311454644287 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.7918367346938775, + "acc_stderr": 0.025991117672813296, + "acc_norm": 0.7918367346938775, + "acc_norm_stderr": 0.025991117672813296 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.845771144278607, + "acc_stderr": 0.025538433368578334, + "acc_norm": 0.845771144278607, + "acc_norm_stderr": 0.025538433368578334 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.86, + "acc_stderr": 0.034873508801977704, + "acc_norm": 0.86, + "acc_norm_stderr": 0.034873508801977704 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.5120481927710844, + "acc_stderr": 0.038913644958358175, + "acc_norm": 0.5120481927710844, + "acc_norm_stderr": 0.038913644958358175 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.783625730994152, + "acc_stderr": 0.03158149539338734, + "acc_norm": 0.783625730994152, + "acc_norm_stderr": 0.03158149539338734 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.3818849449204406, + "mc1_stderr": 0.01700810193916349, + "mc2": 0.5711470281396481, + "mc2_stderr": 0.015283087726691595 + }, + "all": { + "acc": 0.6468074911221942, + "acc_stderr": 0.03281612856930076, + "acc_norm": 0.6509040444920074, + "acc_norm_stderr": 0.032790646231639874, + "mc1": 0.3818849449204406, + "mc1_stderr": 0.01700810193916349, + "mc2": 0.5711470281396481, + "mc2_stderr": 0.015283087726691595 + } + }, + "versions": { + "harness|arc:challenge|25": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 
1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "all": 0 + }, + "config_tasks": { + "harness|arc:challenge": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + 
"harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task" + }, + "summary_tasks": { + "harness|arc:challenge|25": { + "hashes": { + "hash_examples": "17b0cae357c0259e", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "3722289b79076c44", + "hash_cont_tokens": "8210decc6ff6f7df" + }, + "truncated": 0, + "non-truncated": 4687, + "padded": 4687, + "non-padded": 0, + "effective_few_shots": 25.0, + "num_truncated_few_shots": 0 + }, + "harness|hellaswag|10": { + "hashes": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "ececd684171f1ef2", + "hash_cont_tokens": "b3b9e9017afa63af" + }, + "truncated": 0, + "non-truncated": 40168, + "padded": 40113, + "non-padded": 55, + "effective_few_shots": 10.0, + "num_truncated_few_shots": 0 + }, + 
"harness|hendrycksTest-abstract_algebra|5": { + "hashes": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "c54ff61ad0273dd7", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-anatomy|5": { + "hashes": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "be31a1e22aef5f90", + "hash_cont_tokens": "f11971a765cb609f" + }, + "truncated": 0, + "non-truncated": 540, + "padded": 540, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-astronomy|5": { + "hashes": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "277a7b1fad566940", + "hash_cont_tokens": "bf30e5d3f48250cb" + }, + "truncated": 0, + "non-truncated": 608, + "padded": 608, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-business_ethics|5": { + "hashes": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "ba552605bc116de5", + "hash_cont_tokens": "bc1dd9b2d995eb61" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hashes": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "428c7563d0b98ab9", + "hash_cont_tokens": "890a119624b3b935" + }, + "truncated": 0, + "non-truncated": 1060, + "padded": 1060, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_biology|5": { + "hashes": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "da036601573942e2", + "hash_cont_tokens": "875cde3af7a0ee14" + }, + "truncated": 0, + "non-truncated": 576, + "padded": 576, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_chemistry|5": { + "hashes": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "94e0196d6aded13d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_computer_science|5": { + "hashes": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "6e4d0f4a8d36690b", + "hash_cont_tokens": "ffc0fe414cdc4a83" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_mathematics|5": { + "hashes": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "614054d17109a25d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_medicine|5": { + "hashes": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + 
"hash_input_tokens": "081bb2b524defd1c", + "hash_cont_tokens": "1f88b00d41957d82" + }, + "truncated": 0, + "non-truncated": 692, + "padded": 692, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_physics|5": { + "hashes": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "5421d9a1af86cbd4", + "hash_cont_tokens": "f7b8097afc16a47c" + }, + "truncated": 0, + "non-truncated": 408, + "padded": 408, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-computer_security|5": { + "hashes": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "5e6b70ecb333cf18", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hashes": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "c2ef11a87264ceed", + "hash_cont_tokens": "aa0e8bc655f2f641" + }, + "truncated": 0, + "non-truncated": 940, + "padded": 940, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-econometrics|5": { + "hashes": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "ecaccd912a4c3978", + "hash_cont_tokens": "bfb7e3c3c88313f1" + }, + "truncated": 0, + "non-truncated": 456, + "padded": 456, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hashes": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "1590c84291399be8", + "hash_cont_tokens": "2425a3f084a591ef" + }, + "truncated": 0, + "non-truncated": 580, + "padded": 580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hashes": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "3269597f715b0da1", + "hash_cont_tokens": "f52691aef15a407b" + }, + "truncated": 0, + "non-truncated": 1512, + "padded": 1512, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-formal_logic|5": { + "hashes": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "a2800d20f3ab8d7c", + "hash_cont_tokens": "f515d598d9c21263" + }, + "truncated": 0, + "non-truncated": 504, + "padded": 504, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-global_facts|5": { + "hashes": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "94ed44b3772505ad", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_biology|5": { + "hashes": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "24423acb928db768", + "hash_cont_tokens": "bd85a4156a3613ee" + }, + "truncated": 0, + "non-truncated": 1240, + "padded": 1240, + 
"non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hashes": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "831ff35c474e5cef", + "hash_cont_tokens": "a95c97af1c14e068" + }, + "truncated": 0, + "non-truncated": 812, + "padded": 812, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hashes": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "a20a96b44dcc5b30", + "hash_cont_tokens": "8abfedef914e33c9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hashes": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "5002f4ac8b1562ca", + "hash_cont_tokens": "674fc454bdc5ac93" + }, + "truncated": 0, + "non-truncated": 660, + "padded": 656, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_geography|5": { + "hashes": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "7c5547c7da5bc793", + "hash_cont_tokens": "03a5012b916274ea" + }, + "truncated": 0, + "non-truncated": 792, + "padded": 792, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hashes": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "f62991cb6a496b05", + "hash_cont_tokens": "a83effb8f76b7d7c" + }, + "truncated": 0, + "non-truncated": 772, + "padded": 772, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hashes": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "4cef2aff6e3d59ed", + "hash_cont_tokens": "c583432ad27fcfe0" + }, + "truncated": 0, + "non-truncated": 1560, + "padded": 1560, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hashes": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "6e2577ea4082ed2b", + "hash_cont_tokens": "24f5dc613660300b" + }, + "truncated": 0, + "non-truncated": 1080, + "padded": 1080, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hashes": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "c5fc9aeb1079c8e4", + "hash_cont_tokens": "f47f041de50333b9" + }, + "truncated": 0, + "non-truncated": 952, + "padded": 952, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_physics|5": { + "hashes": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "555fc385cffa84ca", + "hash_cont_tokens": "ba2efcd283e938cc" + }, + "truncated": 0, + "non-truncated": 604, + "padded": 604, + "non-padded": 0, + "effective_few_shots": 5.0, + 
"num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hashes": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "febd23cbf9973b7f", + "hash_cont_tokens": "942069cd363844d9" + }, + "truncated": 0, + "non-truncated": 2180, + "padded": 2180, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hashes": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "400e55b56ee6fbd7", + "hash_cont_tokens": "955ed42b6f7fa019" + }, + "truncated": 0, + "non-truncated": 864, + "padded": 864, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hashes": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "c639cce12a46ebad", + "hash_cont_tokens": "cdd0b3dc06d933e5" + }, + "truncated": 0, + "non-truncated": 816, + "padded": 816, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hashes": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "b9762065cce6f3a6", + "hash_cont_tokens": "9a864184946033ac" + }, + "truncated": 0, + "non-truncated": 948, + "padded": 948, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_aging|5": { + "hashes": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "541a75f071dcf579", + "hash_cont_tokens": "142a4a8a1138a214" + }, + "truncated": 0, + "non-truncated": 892, + "padded": 892, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_sexuality|5": { + "hashes": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "04269e5c5a257dd9", + "hash_cont_tokens": "bc54813e809b796d" + }, + "truncated": 0, + "non-truncated": 524, + "padded": 524, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-international_law|5": { + "hashes": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "d93ba9d9d38e4397", + "hash_cont_tokens": "dc45b45fcda18e5d" + }, + "truncated": 0, + "non-truncated": 484, + "padded": 484, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-jurisprudence|5": { + "hashes": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "9eeaccd2698b4f5a", + "hash_cont_tokens": "e3a8cd951b6e3469" + }, + "truncated": 0, + "non-truncated": 432, + "padded": 432, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hashes": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "b4f08f544f2b7576", + "hash_cont_tokens": "1e80dbd30f6453d5" + }, + "truncated": 0, + "non-truncated": 652, + "padded": 648, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-machine_learning|5": { + "hashes": { + "hash_examples": "88f22a636029ae47", + 
"hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "900c2a51f1174b9f", + "hash_cont_tokens": "9b37da7777378ca9" + }, + "truncated": 0, + "non-truncated": 448, + "padded": 448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-management|5": { + "hashes": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "6b36efb4689c6eca", + "hash_cont_tokens": "a01d6d39a83c4597" + }, + "truncated": 0, + "non-truncated": 412, + "padded": 412, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-marketing|5": { + "hashes": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "2aaac78a0cfed47a", + "hash_cont_tokens": "6aeaed4d823c98aa" + }, + "truncated": 0, + "non-truncated": 936, + "padded": 936, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-medical_genetics|5": { + "hashes": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "886ca823b41c094a", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-miscellaneous|5": { + "hashes": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "72fd71de7675e7d0", + "hash_cont_tokens": "9b0ab02a64603081" + }, + "truncated": 0, + "non-truncated": 3132, + "padded": 3132, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_disputes|5": { + "hashes": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "f3ca0dd8e7a1eb09", + "hash_cont_tokens": "8badf768f7b0467a" + }, + "truncated": 0, + "non-truncated": 1384, + "padded": 1354, + "non-padded": 30, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hashes": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "3e793631e951f23c", + "hash_cont_tokens": "32ae620376b2bbba" + }, + "truncated": 0, + "non-truncated": 3580, + "padded": 3580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-nutrition|5": { + "hashes": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "59753c2144ea93af", + "hash_cont_tokens": "3071def75bacc404" + }, + "truncated": 0, + "non-truncated": 1224, + "padded": 1224, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-philosophy|5": { + "hashes": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "bd8d3dbed15a8c34", + "hash_cont_tokens": "9f6ff69d23a48783" + }, + "truncated": 0, + "non-truncated": 1244, + "padded": 1244, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-prehistory|5": { + "hashes": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "3573cd87facbb7c5", + "hash_cont_tokens": "de469d2b981e32a3" + }, + "truncated": 0, + "non-truncated": 1296, + "padded": 
1296, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_accounting|5": { + "hashes": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "17e721bc1a7cbb47", + "hash_cont_tokens": "c46f74d2dfc7b13b" + }, + "truncated": 0, + "non-truncated": 1128, + "padded": 1128, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_law|5": { + "hashes": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "c9f7583fff66d361", + "hash_cont_tokens": "2e590029ef41fbcd" + }, + "truncated": 0, + "non-truncated": 6136, + "padded": 6136, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_medicine|5": { + "hashes": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "40a933f829116f8d", + "hash_cont_tokens": "fe35cfa9c6ca802e" + }, + "truncated": 0, + "non-truncated": 1088, + "padded": 1088, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_psychology|5": { + "hashes": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "0dfb73a8eb3f692c", + "hash_cont_tokens": "f020fbddf72c8652" + }, + "truncated": 0, + "non-truncated": 2448, + "padded": 2448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-public_relations|5": { + "hashes": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "1710c6ba4c9f3cbd", + "hash_cont_tokens": "568f585a259965c1" + }, + "truncated": 0, + "non-truncated": 440, + "padded": 440, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-security_studies|5": { + "hashes": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "32a03f1f22a6e103", + "hash_cont_tokens": "cc6fd7cccd64cd5d" + }, + "truncated": 0, + "non-truncated": 980, + "padded": 980, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-sociology|5": { + "hashes": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "828999f7624cbe7e", + "hash_cont_tokens": "c3a3bdfd177eed5b" + }, + "truncated": 0, + "non-truncated": 804, + "padded": 804, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hashes": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "42054621e718dbee", + "hash_cont_tokens": "2568d0e8e36fa959" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-virology|5": { + "hashes": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "6c4f0aa4dc859c04", + "hash_cont_tokens": "926cf60b0891f374" + }, + "truncated": 0, + "non-truncated": 664, + "padded": 664, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-world_religions|5": { + 
"hashes": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "6c75d44e092ff24f", + "hash_cont_tokens": "c525a5de974c1ea3" + }, + "truncated": 0, + "non-truncated": 684, + "padded": 684, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|truthfulqa:mc|0": { + "hashes": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "2738d7ed7075faa7", + "hash_cont_tokens": "c014154380b74b9e" + }, + "truncated": 0, + "non-truncated": 9996, + "padded": 9996, + "non-padded": 0, + "effective_few_shots": 0.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "d84d18e9a963753d", + "hash_full_prompts": "12b540783521a8e6", + "hash_input_tokens": "5c73a7dce6ccf737", + "hash_cont_tokens": "fb1646e2bdd5fc38" + }, + "total_evaluation_time_secondes": "40634.87911391258", + "truncated": 0, + "non-truncated": 111019, + "padded": 110926, + "non-padded": 93, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/TheBloke/Llama-2-13B-GPTQ/results_2023-08-29T15-04-20.709230.json b/eval-results/TheBloke/Llama-2-13B-GPTQ/results_2023-08-29T15-04-20.709230.json new file mode 100644 index 0000000000000000000000000000000000000000..11e63179e5e120c897ee81f7e7af7df302cd6750 --- /dev/null +++ b/eval-results/TheBloke/Llama-2-13B-GPTQ/results_2023-08-29T15-04-20.709230.json @@ -0,0 +1,1366 @@ +{ + "config_general": { + "model_name": "TheBloke/Llama-2-13B-GPTQ", + "model_sha": "b7db471d1789802a3a8e3b93cdd66a9f046f17c3", + "model_dtype": "torch.float16", + "lighteval_sha": "2108e6d7ff766a8df132a73d138d42a559e21d18", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|arc:challenge|25": { + "acc": 0.53839590443686, + "acc_stderr": 0.014568245550296358, + "acc_norm": 0.5725255972696246, + "acc_norm_stderr": 0.01445686294465065 + }, + "harness|hellaswag|10": { + "acc": 0.6082453694483171, + "acc_stderr": 0.004871447106554929, + "acc_norm": 0.8163712407886875, + "acc_norm_stderr": 0.003863898546941601 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.36, + "acc_stderr": 0.04824181513244219, + "acc_norm": 0.36, + "acc_norm_stderr": 0.04824181513244219 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.5111111111111111, + "acc_stderr": 0.04318275491977976, + "acc_norm": 0.5111111111111111, + "acc_norm_stderr": 0.04318275491977976 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.5328947368421053, + "acc_stderr": 0.040601270352363966, + "acc_norm": 0.5328947368421053, + "acc_norm_stderr": 0.040601270352363966 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.57, + "acc_stderr": 0.049756985195624284, + "acc_norm": 0.57, + "acc_norm_stderr": 0.049756985195624284 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.6226415094339622, + "acc_stderr": 0.029832808114796, + "acc_norm": 0.6226415094339622, + "acc_norm_stderr": 0.029832808114796 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.6041666666666666, + "acc_stderr": 0.04089465449325582, + "acc_norm": 0.6041666666666666, + "acc_norm_stderr": 0.04089465449325582 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.39, + "acc_stderr": 0.04902071300001975, + "acc_norm": 0.39, + "acc_norm_stderr": 0.04902071300001975 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 
0.43, + "acc_stderr": 0.049756985195624284, + "acc_norm": 0.43, + "acc_norm_stderr": 0.049756985195624284 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.31, + "acc_stderr": 0.04648231987117316, + "acc_norm": 0.31, + "acc_norm_stderr": 0.04648231987117316 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.5202312138728323, + "acc_stderr": 0.03809342081273957, + "acc_norm": 0.5202312138728323, + "acc_norm_stderr": 0.03809342081273957 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.22549019607843138, + "acc_stderr": 0.04158307533083286, + "acc_norm": 0.22549019607843138, + "acc_norm_stderr": 0.04158307533083286 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.68, + "acc_stderr": 0.04688261722621505, + "acc_norm": 0.68, + "acc_norm_stderr": 0.04688261722621505 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.42127659574468085, + "acc_stderr": 0.03227834510146268, + "acc_norm": 0.42127659574468085, + "acc_norm_stderr": 0.03227834510146268 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.3508771929824561, + "acc_stderr": 0.04489539350270699, + "acc_norm": 0.3508771929824561, + "acc_norm_stderr": 0.04489539350270699 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.5310344827586206, + "acc_stderr": 0.04158632762097828, + "acc_norm": 0.5310344827586206, + "acc_norm_stderr": 0.04158632762097828 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.3148148148148148, + "acc_stderr": 0.023919984164047732, + "acc_norm": 0.3148148148148148, + "acc_norm_stderr": 0.023919984164047732 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.30158730158730157, + "acc_stderr": 0.04104947269903394, + "acc_norm": 0.30158730158730157, + "acc_norm_stderr": 0.04104947269903394 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.36, + "acc_stderr": 0.04824181513244218, + "acc_norm": 0.36, + "acc_norm_stderr": 0.04824181513244218 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.632258064516129, + "acc_stderr": 0.027430866579973467, + "acc_norm": 0.632258064516129, + "acc_norm_stderr": 0.027430866579973467 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.43349753694581283, + "acc_stderr": 0.03486731727419872, + "acc_norm": 0.43349753694581283, + "acc_norm_stderr": 0.03486731727419872 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.55, + "acc_stderr": 0.05, + "acc_norm": 0.55, + "acc_norm_stderr": 0.05 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.6303030303030303, + "acc_stderr": 0.03769430314512567, + "acc_norm": 0.6303030303030303, + "acc_norm_stderr": 0.03769430314512567 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.6666666666666666, + "acc_stderr": 0.033586181457325226, + "acc_norm": 0.6666666666666666, + "acc_norm_stderr": 0.033586181457325226 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.7875647668393783, + "acc_stderr": 0.029519282616817234, + "acc_norm": 0.7875647668393783, + "acc_norm_stderr": 0.029519282616817234 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.5102564102564102, + "acc_stderr": 0.025345672221942374, + "acc_norm": 0.5102564102564102, + "acc_norm_stderr": 0.025345672221942374 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.2851851851851852, + "acc_stderr": 0.027528599210340496, + "acc_norm": 0.2851851851851852, + "acc_norm_stderr": 0.027528599210340496 + 
}, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.5378151260504201, + "acc_stderr": 0.03238546948758979, + "acc_norm": 0.5378151260504201, + "acc_norm_stderr": 0.03238546948758979 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.32450331125827814, + "acc_stderr": 0.03822746937658754, + "acc_norm": 0.32450331125827814, + "acc_norm_stderr": 0.03822746937658754 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.7357798165137615, + "acc_stderr": 0.0189041641715102, + "acc_norm": 0.7357798165137615, + "acc_norm_stderr": 0.0189041641715102 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.4537037037037037, + "acc_stderr": 0.033953227263757976, + "acc_norm": 0.4537037037037037, + "acc_norm_stderr": 0.033953227263757976 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.7254901960784313, + "acc_stderr": 0.031321798030832904, + "acc_norm": 0.7254901960784313, + "acc_norm_stderr": 0.031321798030832904 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.70042194092827, + "acc_stderr": 0.02981802474975309, + "acc_norm": 0.70042194092827, + "acc_norm_stderr": 0.02981802474975309 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.6547085201793722, + "acc_stderr": 0.03191100192835794, + "acc_norm": 0.6547085201793722, + "acc_norm_stderr": 0.03191100192835794 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.6259541984732825, + "acc_stderr": 0.042438692422305246, + "acc_norm": 0.6259541984732825, + "acc_norm_stderr": 0.042438692422305246 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.7107438016528925, + "acc_stderr": 0.04139112727635463, + "acc_norm": 0.7107438016528925, + "acc_norm_stderr": 0.04139112727635463 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.6759259259259259, + "acc_stderr": 0.045245960070300476, + "acc_norm": 0.6759259259259259, + "acc_norm_stderr": 0.045245960070300476 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.6625766871165644, + "acc_stderr": 0.037149084099355745, + "acc_norm": 0.6625766871165644, + "acc_norm_stderr": 0.037149084099355745 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.2857142857142857, + "acc_stderr": 0.04287858751340456, + "acc_norm": 0.2857142857142857, + "acc_norm_stderr": 0.04287858751340456 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.7475728155339806, + "acc_stderr": 0.04301250399690878, + "acc_norm": 0.7475728155339806, + "acc_norm_stderr": 0.04301250399690878 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.8034188034188035, + "acc_stderr": 0.02603538609895129, + "acc_norm": 0.8034188034188035, + "acc_norm_stderr": 0.02603538609895129 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.55, + "acc_stderr": 0.049999999999999996, + "acc_norm": 0.55, + "acc_norm_stderr": 0.049999999999999996 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.7573435504469987, + "acc_stderr": 0.01532988894089986, + "acc_norm": 0.7573435504469987, + "acc_norm_stderr": 0.01532988894089986 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.615606936416185, + "acc_stderr": 0.026189666966272035, + "acc_norm": 0.615606936416185, + "acc_norm_stderr": 0.026189666966272035 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.36089385474860336, + "acc_stderr": 0.01606229067111047, + "acc_norm": 0.36089385474860336, + "acc_norm_stderr": 0.01606229067111047 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 
0.5947712418300654, + "acc_stderr": 0.02811092849280907, + "acc_norm": 0.5947712418300654, + "acc_norm_stderr": 0.02811092849280907 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.6237942122186495, + "acc_stderr": 0.02751392568354943, + "acc_norm": 0.6237942122186495, + "acc_norm_stderr": 0.02751392568354943 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.6358024691358025, + "acc_stderr": 0.026774929899722334, + "acc_norm": 0.6358024691358025, + "acc_norm_stderr": 0.026774929899722334 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.39361702127659576, + "acc_stderr": 0.02914454478159614, + "acc_norm": 0.39361702127659576, + "acc_norm_stderr": 0.02914454478159614 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.4256844850065189, + "acc_stderr": 0.012628393551811943, + "acc_norm": 0.4256844850065189, + "acc_norm_stderr": 0.012628393551811943 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.5294117647058824, + "acc_stderr": 0.030320243265004137, + "acc_norm": 0.5294117647058824, + "acc_norm_stderr": 0.030320243265004137 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.565359477124183, + "acc_stderr": 0.02005426920072646, + "acc_norm": 0.565359477124183, + "acc_norm_stderr": 0.02005426920072646 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.6454545454545455, + "acc_stderr": 0.045820048415054174, + "acc_norm": 0.6454545454545455, + "acc_norm_stderr": 0.045820048415054174 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.6285714285714286, + "acc_stderr": 0.030932858792789855, + "acc_norm": 0.6285714285714286, + "acc_norm_stderr": 0.030932858792789855 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.7064676616915423, + "acc_stderr": 0.03220024104534205, + "acc_norm": 0.7064676616915423, + "acc_norm_stderr": 0.03220024104534205 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.82, + "acc_stderr": 0.038612291966536934, + "acc_norm": 0.82, + "acc_norm_stderr": 0.038612291966536934 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.463855421686747, + "acc_stderr": 0.03882310850890593, + "acc_norm": 0.463855421686747, + "acc_norm_stderr": 0.03882310850890593 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.7485380116959064, + "acc_stderr": 0.033275044238468436, + "acc_norm": 0.7485380116959064, + "acc_norm_stderr": 0.033275044238468436 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.25703794369645044, + "mc1_stderr": 0.01529807750948508, + "mc2": 0.36558532676669275, + "mc2_stderr": 0.01367643152298662 + }, + "all": { + "acc": 0.548949152051668, + "acc_stderr": 0.03454538718533361, + "acc_norm": 0.5530551785630772, + "acc_norm_stderr": 0.034526422250329224, + "mc1": 0.25703794369645044, + "mc1_stderr": 0.01529807750948508, + "mc2": 0.36558532676669275, + "mc2_stderr": 0.01367643152298662 + } + }, + "versions": { + "harness|arc:challenge|25": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + 
"harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "all": 0 + }, + "config_tasks": { + "harness|arc:challenge": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness 
task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task" + }, + "summary_tasks": { + "harness|arc:challenge|25": { + "hashes": { + "hash_examples": "17b0cae357c0259e", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "3722289b79076c44", + "hash_cont_tokens": "8210decc6ff6f7df" + }, + "truncated": 0, + "non-truncated": 4687, + "padded": 4687, + "non-padded": 0, + "effective_few_shots": 25.0, + "num_truncated_few_shots": 0 + }, + "harness|hellaswag|10": { + "hashes": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "ececd684171f1ef2", + "hash_cont_tokens": "b3b9e9017afa63af" + }, + "truncated": 0, + "non-truncated": 40168, + "padded": 40113, + "non-padded": 55, + "effective_few_shots": 10.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hashes": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + 
"hash_input_tokens": "c54ff61ad0273dd7", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-anatomy|5": { + "hashes": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "be31a1e22aef5f90", + "hash_cont_tokens": "f11971a765cb609f" + }, + "truncated": 0, + "non-truncated": 540, + "padded": 540, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-astronomy|5": { + "hashes": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "277a7b1fad566940", + "hash_cont_tokens": "bf30e5d3f48250cb" + }, + "truncated": 0, + "non-truncated": 608, + "padded": 608, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-business_ethics|5": { + "hashes": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "ba552605bc116de5", + "hash_cont_tokens": "bc1dd9b2d995eb61" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hashes": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "428c7563d0b98ab9", + "hash_cont_tokens": "890a119624b3b935" + }, + "truncated": 0, + "non-truncated": 1060, + "padded": 1060, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_biology|5": { + "hashes": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "da036601573942e2", + "hash_cont_tokens": "875cde3af7a0ee14" + }, + "truncated": 0, + "non-truncated": 576, + "padded": 576, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_chemistry|5": { + "hashes": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "94e0196d6aded13d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_computer_science|5": { + "hashes": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "6e4d0f4a8d36690b", + "hash_cont_tokens": "ffc0fe414cdc4a83" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_mathematics|5": { + "hashes": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "614054d17109a25d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_medicine|5": { + "hashes": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "081bb2b524defd1c", + "hash_cont_tokens": "1f88b00d41957d82" + }, + "truncated": 0, + "non-truncated": 692, + "padded": 692, + "non-padded": 
0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_physics|5": { + "hashes": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "5421d9a1af86cbd4", + "hash_cont_tokens": "f7b8097afc16a47c" + }, + "truncated": 0, + "non-truncated": 408, + "padded": 408, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-computer_security|5": { + "hashes": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "5e6b70ecb333cf18", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hashes": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "c2ef11a87264ceed", + "hash_cont_tokens": "aa0e8bc655f2f641" + }, + "truncated": 0, + "non-truncated": 940, + "padded": 940, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-econometrics|5": { + "hashes": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "ecaccd912a4c3978", + "hash_cont_tokens": "bfb7e3c3c88313f1" + }, + "truncated": 0, + "non-truncated": 456, + "padded": 456, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hashes": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "1590c84291399be8", + "hash_cont_tokens": "2425a3f084a591ef" + }, + "truncated": 0, + "non-truncated": 580, + "padded": 580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hashes": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "3269597f715b0da1", + "hash_cont_tokens": "f52691aef15a407b" + }, + "truncated": 0, + "non-truncated": 1512, + "padded": 1512, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-formal_logic|5": { + "hashes": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "a2800d20f3ab8d7c", + "hash_cont_tokens": "f515d598d9c21263" + }, + "truncated": 0, + "non-truncated": 504, + "padded": 504, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-global_facts|5": { + "hashes": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "94ed44b3772505ad", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_biology|5": { + "hashes": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "24423acb928db768", + "hash_cont_tokens": "bd85a4156a3613ee" + }, + "truncated": 0, + "non-truncated": 1240, + "padded": 1240, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hashes": { + 
"hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "831ff35c474e5cef", + "hash_cont_tokens": "a95c97af1c14e068" + }, + "truncated": 0, + "non-truncated": 812, + "padded": 812, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hashes": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "a20a96b44dcc5b30", + "hash_cont_tokens": "8abfedef914e33c9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hashes": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "5002f4ac8b1562ca", + "hash_cont_tokens": "674fc454bdc5ac93" + }, + "truncated": 0, + "non-truncated": 660, + "padded": 656, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_geography|5": { + "hashes": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "7c5547c7da5bc793", + "hash_cont_tokens": "03a5012b916274ea" + }, + "truncated": 0, + "non-truncated": 792, + "padded": 792, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hashes": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "f62991cb6a496b05", + "hash_cont_tokens": "a83effb8f76b7d7c" + }, + "truncated": 0, + "non-truncated": 772, + "padded": 772, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hashes": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "4cef2aff6e3d59ed", + "hash_cont_tokens": "c583432ad27fcfe0" + }, + "truncated": 0, + "non-truncated": 1560, + "padded": 1560, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hashes": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "6e2577ea4082ed2b", + "hash_cont_tokens": "24f5dc613660300b" + }, + "truncated": 0, + "non-truncated": 1080, + "padded": 1080, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hashes": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "c5fc9aeb1079c8e4", + "hash_cont_tokens": "f47f041de50333b9" + }, + "truncated": 0, + "non-truncated": 952, + "padded": 952, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_physics|5": { + "hashes": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "555fc385cffa84ca", + "hash_cont_tokens": "ba2efcd283e938cc" + }, + "truncated": 0, + "non-truncated": 604, + "padded": 604, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hashes": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": 
"d5c76aa40b9dbc43", + "hash_input_tokens": "febd23cbf9973b7f", + "hash_cont_tokens": "942069cd363844d9" + }, + "truncated": 0, + "non-truncated": 2180, + "padded": 2180, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hashes": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "400e55b56ee6fbd7", + "hash_cont_tokens": "955ed42b6f7fa019" + }, + "truncated": 0, + "non-truncated": 864, + "padded": 864, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hashes": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "c639cce12a46ebad", + "hash_cont_tokens": "cdd0b3dc06d933e5" + }, + "truncated": 0, + "non-truncated": 816, + "padded": 816, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hashes": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "b9762065cce6f3a6", + "hash_cont_tokens": "9a864184946033ac" + }, + "truncated": 0, + "non-truncated": 948, + "padded": 948, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_aging|5": { + "hashes": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "541a75f071dcf579", + "hash_cont_tokens": "142a4a8a1138a214" + }, + "truncated": 0, + "non-truncated": 892, + "padded": 892, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_sexuality|5": { + "hashes": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "04269e5c5a257dd9", + "hash_cont_tokens": "bc54813e809b796d" + }, + "truncated": 0, + "non-truncated": 524, + "padded": 524, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-international_law|5": { + "hashes": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "d93ba9d9d38e4397", + "hash_cont_tokens": "dc45b45fcda18e5d" + }, + "truncated": 0, + "non-truncated": 484, + "padded": 484, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-jurisprudence|5": { + "hashes": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "9eeaccd2698b4f5a", + "hash_cont_tokens": "e3a8cd951b6e3469" + }, + "truncated": 0, + "non-truncated": 432, + "padded": 432, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hashes": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "b4f08f544f2b7576", + "hash_cont_tokens": "1e80dbd30f6453d5" + }, + "truncated": 0, + "non-truncated": 652, + "padded": 648, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-machine_learning|5": { + "hashes": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "900c2a51f1174b9f", + "hash_cont_tokens": "9b37da7777378ca9" + }, + "truncated": 0, + "non-truncated": 
448, + "padded": 448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-management|5": { + "hashes": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "6b36efb4689c6eca", + "hash_cont_tokens": "a01d6d39a83c4597" + }, + "truncated": 0, + "non-truncated": 412, + "padded": 412, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-marketing|5": { + "hashes": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "2aaac78a0cfed47a", + "hash_cont_tokens": "6aeaed4d823c98aa" + }, + "truncated": 0, + "non-truncated": 936, + "padded": 936, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-medical_genetics|5": { + "hashes": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "886ca823b41c094a", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-miscellaneous|5": { + "hashes": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "72fd71de7675e7d0", + "hash_cont_tokens": "9b0ab02a64603081" + }, + "truncated": 0, + "non-truncated": 3132, + "padded": 3132, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_disputes|5": { + "hashes": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "f3ca0dd8e7a1eb09", + "hash_cont_tokens": "8badf768f7b0467a" + }, + "truncated": 0, + "non-truncated": 1384, + "padded": 1354, + "non-padded": 30, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hashes": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "3e793631e951f23c", + "hash_cont_tokens": "32ae620376b2bbba" + }, + "truncated": 0, + "non-truncated": 3580, + "padded": 3580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-nutrition|5": { + "hashes": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "59753c2144ea93af", + "hash_cont_tokens": "3071def75bacc404" + }, + "truncated": 0, + "non-truncated": 1224, + "padded": 1224, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-philosophy|5": { + "hashes": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "bd8d3dbed15a8c34", + "hash_cont_tokens": "9f6ff69d23a48783" + }, + "truncated": 0, + "non-truncated": 1244, + "padded": 1244, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-prehistory|5": { + "hashes": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "3573cd87facbb7c5", + "hash_cont_tokens": "de469d2b981e32a3" + }, + "truncated": 0, + "non-truncated": 1296, + "padded": 1296, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_accounting|5": { + "hashes": { + 
"hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "17e721bc1a7cbb47", + "hash_cont_tokens": "c46f74d2dfc7b13b" + }, + "truncated": 0, + "non-truncated": 1128, + "padded": 1128, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_law|5": { + "hashes": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "c9f7583fff66d361", + "hash_cont_tokens": "2e590029ef41fbcd" + }, + "truncated": 0, + "non-truncated": 6136, + "padded": 6136, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_medicine|5": { + "hashes": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "40a933f829116f8d", + "hash_cont_tokens": "fe35cfa9c6ca802e" + }, + "truncated": 0, + "non-truncated": 1088, + "padded": 1088, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_psychology|5": { + "hashes": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "0dfb73a8eb3f692c", + "hash_cont_tokens": "f020fbddf72c8652" + }, + "truncated": 0, + "non-truncated": 2448, + "padded": 2448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-public_relations|5": { + "hashes": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "1710c6ba4c9f3cbd", + "hash_cont_tokens": "568f585a259965c1" + }, + "truncated": 0, + "non-truncated": 440, + "padded": 440, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-security_studies|5": { + "hashes": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "32a03f1f22a6e103", + "hash_cont_tokens": "cc6fd7cccd64cd5d" + }, + "truncated": 0, + "non-truncated": 980, + "padded": 980, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-sociology|5": { + "hashes": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "828999f7624cbe7e", + "hash_cont_tokens": "c3a3bdfd177eed5b" + }, + "truncated": 0, + "non-truncated": 804, + "padded": 804, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hashes": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "42054621e718dbee", + "hash_cont_tokens": "2568d0e8e36fa959" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-virology|5": { + "hashes": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "6c4f0aa4dc859c04", + "hash_cont_tokens": "926cf60b0891f374" + }, + "truncated": 0, + "non-truncated": 664, + "padded": 664, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-world_religions|5": { + "hashes": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "6c75d44e092ff24f", + "hash_cont_tokens": 
"c525a5de974c1ea3" + }, + "truncated": 0, + "non-truncated": 684, + "padded": 684, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|truthfulqa:mc|0": { + "hashes": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "2738d7ed7075faa7", + "hash_cont_tokens": "c014154380b74b9e" + }, + "truncated": 0, + "non-truncated": 9996, + "padded": 9996, + "non-padded": 0, + "effective_few_shots": 0.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "d84d18e9a963753d", + "hash_full_prompts": "12b540783521a8e6", + "hash_input_tokens": "5c73a7dce6ccf737", + "hash_cont_tokens": "fb1646e2bdd5fc38" + }, + "total_evaluation_time_secondes": "9918.596225738525", + "truncated": 0, + "non-truncated": 111019, + "padded": 110926, + "non-padded": 93, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/TheBloke/Llama-2-13B-GPTQ/results_2023-08-30T10-42-39.395336.json b/eval-results/TheBloke/Llama-2-13B-GPTQ/results_2023-08-30T10-42-39.395336.json new file mode 100644 index 0000000000000000000000000000000000000000..043e5e0dcb4ba86f02fa51b57d730168d019b01f --- /dev/null +++ b/eval-results/TheBloke/Llama-2-13B-GPTQ/results_2023-08-30T10-42-39.395336.json @@ -0,0 +1,1366 @@ +{ + "config_general": { + "model_name": "TheBloke/Llama-2-13B-GPTQ", + "model_sha": "b7db471d1789802a3a8e3b93cdd66a9f046f17c3", + "model_dtype": "None", + "lighteval_sha": "4b9a38c2102259daeb17d1d0294fc2b4ce7f4e63", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|arc:challenge|25": { + "acc": 0.53839590443686, + "acc_stderr": 0.014568245550296358, + "acc_norm": 0.5725255972696246, + "acc_norm_stderr": 0.01445686294465065 + }, + "harness|hellaswag|10": { + "acc": 0.6082453694483171, + "acc_stderr": 0.004871447106554929, + "acc_norm": 0.8163712407886875, + "acc_norm_stderr": 0.003863898546941601 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.36, + "acc_stderr": 0.04824181513244219, + "acc_norm": 0.36, + "acc_norm_stderr": 0.04824181513244219 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.5111111111111111, + "acc_stderr": 0.04318275491977976, + "acc_norm": 0.5111111111111111, + "acc_norm_stderr": 0.04318275491977976 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.5328947368421053, + "acc_stderr": 0.040601270352363966, + "acc_norm": 0.5328947368421053, + "acc_norm_stderr": 0.040601270352363966 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.57, + "acc_stderr": 0.049756985195624284, + "acc_norm": 0.57, + "acc_norm_stderr": 0.049756985195624284 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.6226415094339622, + "acc_stderr": 0.029832808114796, + "acc_norm": 0.6226415094339622, + "acc_norm_stderr": 0.029832808114796 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.6041666666666666, + "acc_stderr": 0.04089465449325582, + "acc_norm": 0.6041666666666666, + "acc_norm_stderr": 0.04089465449325582 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.39, + "acc_stderr": 0.04902071300001975, + "acc_norm": 0.39, + "acc_norm_stderr": 0.04902071300001975 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.43, + "acc_stderr": 0.049756985195624284, + "acc_norm": 0.43, + "acc_norm_stderr": 0.049756985195624284 + }, + "harness|hendrycksTest-college_mathematics|5": { + 
"acc": 0.31, + "acc_stderr": 0.04648231987117316, + "acc_norm": 0.31, + "acc_norm_stderr": 0.04648231987117316 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.5202312138728323, + "acc_stderr": 0.03809342081273957, + "acc_norm": 0.5202312138728323, + "acc_norm_stderr": 0.03809342081273957 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.22549019607843138, + "acc_stderr": 0.04158307533083286, + "acc_norm": 0.22549019607843138, + "acc_norm_stderr": 0.04158307533083286 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.68, + "acc_stderr": 0.04688261722621505, + "acc_norm": 0.68, + "acc_norm_stderr": 0.04688261722621505 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.42127659574468085, + "acc_stderr": 0.03227834510146268, + "acc_norm": 0.42127659574468085, + "acc_norm_stderr": 0.03227834510146268 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.3508771929824561, + "acc_stderr": 0.04489539350270699, + "acc_norm": 0.3508771929824561, + "acc_norm_stderr": 0.04489539350270699 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.5310344827586206, + "acc_stderr": 0.04158632762097828, + "acc_norm": 0.5310344827586206, + "acc_norm_stderr": 0.04158632762097828 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.3148148148148148, + "acc_stderr": 0.023919984164047732, + "acc_norm": 0.3148148148148148, + "acc_norm_stderr": 0.023919984164047732 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.30158730158730157, + "acc_stderr": 0.04104947269903394, + "acc_norm": 0.30158730158730157, + "acc_norm_stderr": 0.04104947269903394 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.36, + "acc_stderr": 0.04824181513244218, + "acc_norm": 0.36, + "acc_norm_stderr": 0.04824181513244218 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.632258064516129, + "acc_stderr": 0.027430866579973467, + "acc_norm": 0.632258064516129, + "acc_norm_stderr": 0.027430866579973467 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.43349753694581283, + "acc_stderr": 0.03486731727419872, + "acc_norm": 0.43349753694581283, + "acc_norm_stderr": 0.03486731727419872 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.55, + "acc_stderr": 0.05, + "acc_norm": 0.55, + "acc_norm_stderr": 0.05 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.6303030303030303, + "acc_stderr": 0.03769430314512567, + "acc_norm": 0.6303030303030303, + "acc_norm_stderr": 0.03769430314512567 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.6666666666666666, + "acc_stderr": 0.033586181457325226, + "acc_norm": 0.6666666666666666, + "acc_norm_stderr": 0.033586181457325226 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.7875647668393783, + "acc_stderr": 0.029519282616817234, + "acc_norm": 0.7875647668393783, + "acc_norm_stderr": 0.029519282616817234 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.5102564102564102, + "acc_stderr": 0.025345672221942374, + "acc_norm": 0.5102564102564102, + "acc_norm_stderr": 0.025345672221942374 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.2851851851851852, + "acc_stderr": 0.027528599210340496, + "acc_norm": 0.2851851851851852, + "acc_norm_stderr": 0.027528599210340496 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.5378151260504201, + "acc_stderr": 0.03238546948758979, + "acc_norm": 0.5378151260504201, + 
"acc_norm_stderr": 0.03238546948758979 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.32450331125827814, + "acc_stderr": 0.03822746937658754, + "acc_norm": 0.32450331125827814, + "acc_norm_stderr": 0.03822746937658754 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.7357798165137615, + "acc_stderr": 0.0189041641715102, + "acc_norm": 0.7357798165137615, + "acc_norm_stderr": 0.0189041641715102 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.4537037037037037, + "acc_stderr": 0.033953227263757976, + "acc_norm": 0.4537037037037037, + "acc_norm_stderr": 0.033953227263757976 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.7254901960784313, + "acc_stderr": 0.031321798030832904, + "acc_norm": 0.7254901960784313, + "acc_norm_stderr": 0.031321798030832904 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.70042194092827, + "acc_stderr": 0.02981802474975309, + "acc_norm": 0.70042194092827, + "acc_norm_stderr": 0.02981802474975309 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.6547085201793722, + "acc_stderr": 0.03191100192835794, + "acc_norm": 0.6547085201793722, + "acc_norm_stderr": 0.03191100192835794 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.6259541984732825, + "acc_stderr": 0.042438692422305246, + "acc_norm": 0.6259541984732825, + "acc_norm_stderr": 0.042438692422305246 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.7107438016528925, + "acc_stderr": 0.04139112727635463, + "acc_norm": 0.7107438016528925, + "acc_norm_stderr": 0.04139112727635463 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.6759259259259259, + "acc_stderr": 0.045245960070300476, + "acc_norm": 0.6759259259259259, + "acc_norm_stderr": 0.045245960070300476 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.6625766871165644, + "acc_stderr": 0.037149084099355745, + "acc_norm": 0.6625766871165644, + "acc_norm_stderr": 0.037149084099355745 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.2857142857142857, + "acc_stderr": 0.04287858751340456, + "acc_norm": 0.2857142857142857, + "acc_norm_stderr": 0.04287858751340456 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.7475728155339806, + "acc_stderr": 0.04301250399690878, + "acc_norm": 0.7475728155339806, + "acc_norm_stderr": 0.04301250399690878 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.8034188034188035, + "acc_stderr": 0.02603538609895129, + "acc_norm": 0.8034188034188035, + "acc_norm_stderr": 0.02603538609895129 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.55, + "acc_stderr": 0.049999999999999996, + "acc_norm": 0.55, + "acc_norm_stderr": 0.049999999999999996 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.7573435504469987, + "acc_stderr": 0.01532988894089986, + "acc_norm": 0.7573435504469987, + "acc_norm_stderr": 0.01532988894089986 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.615606936416185, + "acc_stderr": 0.026189666966272035, + "acc_norm": 0.615606936416185, + "acc_norm_stderr": 0.026189666966272035 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.36089385474860336, + "acc_stderr": 0.01606229067111047, + "acc_norm": 0.36089385474860336, + "acc_norm_stderr": 0.01606229067111047 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.5947712418300654, + "acc_stderr": 0.02811092849280907, + "acc_norm": 0.5947712418300654, + "acc_norm_stderr": 0.02811092849280907 + }, + "harness|hendrycksTest-philosophy|5": 
{ + "acc": 0.6237942122186495, + "acc_stderr": 0.02751392568354943, + "acc_norm": 0.6237942122186495, + "acc_norm_stderr": 0.02751392568354943 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.6358024691358025, + "acc_stderr": 0.026774929899722334, + "acc_norm": 0.6358024691358025, + "acc_norm_stderr": 0.026774929899722334 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.39361702127659576, + "acc_stderr": 0.02914454478159614, + "acc_norm": 0.39361702127659576, + "acc_norm_stderr": 0.02914454478159614 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.4256844850065189, + "acc_stderr": 0.012628393551811943, + "acc_norm": 0.4256844850065189, + "acc_norm_stderr": 0.012628393551811943 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.5294117647058824, + "acc_stderr": 0.030320243265004137, + "acc_norm": 0.5294117647058824, + "acc_norm_stderr": 0.030320243265004137 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.565359477124183, + "acc_stderr": 0.02005426920072646, + "acc_norm": 0.565359477124183, + "acc_norm_stderr": 0.02005426920072646 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.6454545454545455, + "acc_stderr": 0.045820048415054174, + "acc_norm": 0.6454545454545455, + "acc_norm_stderr": 0.045820048415054174 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.6285714285714286, + "acc_stderr": 0.030932858792789855, + "acc_norm": 0.6285714285714286, + "acc_norm_stderr": 0.030932858792789855 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.7064676616915423, + "acc_stderr": 0.03220024104534205, + "acc_norm": 0.7064676616915423, + "acc_norm_stderr": 0.03220024104534205 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.82, + "acc_stderr": 0.038612291966536934, + "acc_norm": 0.82, + "acc_norm_stderr": 0.038612291966536934 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.463855421686747, + "acc_stderr": 0.03882310850890593, + "acc_norm": 0.463855421686747, + "acc_norm_stderr": 0.03882310850890593 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.7485380116959064, + "acc_stderr": 0.033275044238468436, + "acc_norm": 0.7485380116959064, + "acc_norm_stderr": 0.033275044238468436 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.25703794369645044, + "mc1_stderr": 0.01529807750948508, + "mc2": 0.36558532676669275, + "mc2_stderr": 0.01367643152298662 + }, + "all": { + "acc": 0.548949152051668, + "acc_stderr": 0.03454538718533361, + "acc_norm": 0.5530551785630772, + "acc_norm_stderr": 0.034526422250329224, + "mc1": 0.25703794369645044, + "mc1_stderr": 0.01529807750948508, + "mc2": 0.36558532676669275, + "mc2_stderr": 0.01367643152298662 + } + }, + "versions": { + "harness|arc:challenge|25": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + 
"harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "all": 0 + }, + "config_tasks": { + "harness|arc:challenge": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + 
"harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task" + }, + "summary_tasks": { + "harness|arc:challenge|25": { + "hashes": { + "hash_examples": "17b0cae357c0259e", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "3722289b79076c44", + "hash_cont_tokens": "8210decc6ff6f7df" + }, + "truncated": 0, + "non-truncated": 4687, + "padded": 4687, + "non-padded": 0, + "effective_few_shots": 25.0, + "num_truncated_few_shots": 0 + }, + "harness|hellaswag|10": { + "hashes": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "ececd684171f1ef2", + "hash_cont_tokens": "b3b9e9017afa63af" + }, + "truncated": 0, + "non-truncated": 40168, + "padded": 40113, + "non-padded": 55, + "effective_few_shots": 10.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hashes": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "c54ff61ad0273dd7", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + 
"num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-anatomy|5": { + "hashes": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "be31a1e22aef5f90", + "hash_cont_tokens": "f11971a765cb609f" + }, + "truncated": 0, + "non-truncated": 540, + "padded": 540, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-astronomy|5": { + "hashes": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "277a7b1fad566940", + "hash_cont_tokens": "bf30e5d3f48250cb" + }, + "truncated": 0, + "non-truncated": 608, + "padded": 608, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-business_ethics|5": { + "hashes": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "ba552605bc116de5", + "hash_cont_tokens": "bc1dd9b2d995eb61" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hashes": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "428c7563d0b98ab9", + "hash_cont_tokens": "890a119624b3b935" + }, + "truncated": 0, + "non-truncated": 1060, + "padded": 1060, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_biology|5": { + "hashes": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "da036601573942e2", + "hash_cont_tokens": "875cde3af7a0ee14" + }, + "truncated": 0, + "non-truncated": 576, + "padded": 576, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_chemistry|5": { + "hashes": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "94e0196d6aded13d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_computer_science|5": { + "hashes": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "6e4d0f4a8d36690b", + "hash_cont_tokens": "ffc0fe414cdc4a83" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_mathematics|5": { + "hashes": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "614054d17109a25d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_medicine|5": { + "hashes": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "081bb2b524defd1c", + "hash_cont_tokens": "1f88b00d41957d82" + }, + "truncated": 0, + "non-truncated": 692, + "padded": 692, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_physics|5": { + "hashes": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": 
"833a0d7b55aed500", + "hash_input_tokens": "5421d9a1af86cbd4", + "hash_cont_tokens": "f7b8097afc16a47c" + }, + "truncated": 0, + "non-truncated": 408, + "padded": 408, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-computer_security|5": { + "hashes": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "5e6b70ecb333cf18", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hashes": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "c2ef11a87264ceed", + "hash_cont_tokens": "aa0e8bc655f2f641" + }, + "truncated": 0, + "non-truncated": 940, + "padded": 940, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-econometrics|5": { + "hashes": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "ecaccd912a4c3978", + "hash_cont_tokens": "bfb7e3c3c88313f1" + }, + "truncated": 0, + "non-truncated": 456, + "padded": 456, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hashes": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "1590c84291399be8", + "hash_cont_tokens": "2425a3f084a591ef" + }, + "truncated": 0, + "non-truncated": 580, + "padded": 580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hashes": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "3269597f715b0da1", + "hash_cont_tokens": "f52691aef15a407b" + }, + "truncated": 0, + "non-truncated": 1512, + "padded": 1512, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-formal_logic|5": { + "hashes": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "a2800d20f3ab8d7c", + "hash_cont_tokens": "f515d598d9c21263" + }, + "truncated": 0, + "non-truncated": 504, + "padded": 504, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-global_facts|5": { + "hashes": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "94ed44b3772505ad", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_biology|5": { + "hashes": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "24423acb928db768", + "hash_cont_tokens": "bd85a4156a3613ee" + }, + "truncated": 0, + "non-truncated": 1240, + "padded": 1240, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hashes": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "831ff35c474e5cef", + "hash_cont_tokens": "a95c97af1c14e068" + }, + "truncated": 0, + "non-truncated": 
812, + "padded": 812, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hashes": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "a20a96b44dcc5b30", + "hash_cont_tokens": "8abfedef914e33c9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hashes": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "5002f4ac8b1562ca", + "hash_cont_tokens": "674fc454bdc5ac93" + }, + "truncated": 0, + "non-truncated": 660, + "padded": 656, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_geography|5": { + "hashes": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "7c5547c7da5bc793", + "hash_cont_tokens": "03a5012b916274ea" + }, + "truncated": 0, + "non-truncated": 792, + "padded": 792, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hashes": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "f62991cb6a496b05", + "hash_cont_tokens": "a83effb8f76b7d7c" + }, + "truncated": 0, + "non-truncated": 772, + "padded": 772, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hashes": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "4cef2aff6e3d59ed", + "hash_cont_tokens": "c583432ad27fcfe0" + }, + "truncated": 0, + "non-truncated": 1560, + "padded": 1560, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hashes": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "6e2577ea4082ed2b", + "hash_cont_tokens": "24f5dc613660300b" + }, + "truncated": 0, + "non-truncated": 1080, + "padded": 1080, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hashes": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "c5fc9aeb1079c8e4", + "hash_cont_tokens": "f47f041de50333b9" + }, + "truncated": 0, + "non-truncated": 952, + "padded": 952, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_physics|5": { + "hashes": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "555fc385cffa84ca", + "hash_cont_tokens": "ba2efcd283e938cc" + }, + "truncated": 0, + "non-truncated": 604, + "padded": 604, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hashes": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "febd23cbf9973b7f", + "hash_cont_tokens": "942069cd363844d9" + }, + "truncated": 0, + "non-truncated": 2180, + "padded": 2180, + "non-padded": 0, + "effective_few_shots": 5.0, 
+ "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hashes": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "400e55b56ee6fbd7", + "hash_cont_tokens": "955ed42b6f7fa019" + }, + "truncated": 0, + "non-truncated": 864, + "padded": 864, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hashes": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "c639cce12a46ebad", + "hash_cont_tokens": "cdd0b3dc06d933e5" + }, + "truncated": 0, + "non-truncated": 816, + "padded": 816, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hashes": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "b9762065cce6f3a6", + "hash_cont_tokens": "9a864184946033ac" + }, + "truncated": 0, + "non-truncated": 948, + "padded": 948, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_aging|5": { + "hashes": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "541a75f071dcf579", + "hash_cont_tokens": "142a4a8a1138a214" + }, + "truncated": 0, + "non-truncated": 892, + "padded": 892, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_sexuality|5": { + "hashes": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "04269e5c5a257dd9", + "hash_cont_tokens": "bc54813e809b796d" + }, + "truncated": 0, + "non-truncated": 524, + "padded": 524, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-international_law|5": { + "hashes": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "d93ba9d9d38e4397", + "hash_cont_tokens": "dc45b45fcda18e5d" + }, + "truncated": 0, + "non-truncated": 484, + "padded": 484, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-jurisprudence|5": { + "hashes": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "9eeaccd2698b4f5a", + "hash_cont_tokens": "e3a8cd951b6e3469" + }, + "truncated": 0, + "non-truncated": 432, + "padded": 432, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hashes": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "b4f08f544f2b7576", + "hash_cont_tokens": "1e80dbd30f6453d5" + }, + "truncated": 0, + "non-truncated": 652, + "padded": 648, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-machine_learning|5": { + "hashes": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "900c2a51f1174b9f", + "hash_cont_tokens": "9b37da7777378ca9" + }, + "truncated": 0, + "non-truncated": 448, + "padded": 448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-management|5": { + "hashes": { + "hash_examples": "8c8a1e07a2151dca", + 
"hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "6b36efb4689c6eca", + "hash_cont_tokens": "a01d6d39a83c4597" + }, + "truncated": 0, + "non-truncated": 412, + "padded": 412, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-marketing|5": { + "hashes": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "2aaac78a0cfed47a", + "hash_cont_tokens": "6aeaed4d823c98aa" + }, + "truncated": 0, + "non-truncated": 936, + "padded": 936, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-medical_genetics|5": { + "hashes": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "886ca823b41c094a", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-miscellaneous|5": { + "hashes": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "72fd71de7675e7d0", + "hash_cont_tokens": "9b0ab02a64603081" + }, + "truncated": 0, + "non-truncated": 3132, + "padded": 3132, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_disputes|5": { + "hashes": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "f3ca0dd8e7a1eb09", + "hash_cont_tokens": "8badf768f7b0467a" + }, + "truncated": 0, + "non-truncated": 1384, + "padded": 1354, + "non-padded": 30, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hashes": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "3e793631e951f23c", + "hash_cont_tokens": "32ae620376b2bbba" + }, + "truncated": 0, + "non-truncated": 3580, + "padded": 3580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-nutrition|5": { + "hashes": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "59753c2144ea93af", + "hash_cont_tokens": "3071def75bacc404" + }, + "truncated": 0, + "non-truncated": 1224, + "padded": 1224, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-philosophy|5": { + "hashes": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "bd8d3dbed15a8c34", + "hash_cont_tokens": "9f6ff69d23a48783" + }, + "truncated": 0, + "non-truncated": 1244, + "padded": 1244, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-prehistory|5": { + "hashes": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "3573cd87facbb7c5", + "hash_cont_tokens": "de469d2b981e32a3" + }, + "truncated": 0, + "non-truncated": 1296, + "padded": 1296, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_accounting|5": { + "hashes": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "17e721bc1a7cbb47", + "hash_cont_tokens": "c46f74d2dfc7b13b" + }, + "truncated": 0, + "non-truncated": 1128, 
+ "padded": 1128, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_law|5": { + "hashes": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "c9f7583fff66d361", + "hash_cont_tokens": "2e590029ef41fbcd" + }, + "truncated": 0, + "non-truncated": 6136, + "padded": 6136, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_medicine|5": { + "hashes": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "40a933f829116f8d", + "hash_cont_tokens": "fe35cfa9c6ca802e" + }, + "truncated": 0, + "non-truncated": 1088, + "padded": 1088, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_psychology|5": { + "hashes": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "0dfb73a8eb3f692c", + "hash_cont_tokens": "f020fbddf72c8652" + }, + "truncated": 0, + "non-truncated": 2448, + "padded": 2448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-public_relations|5": { + "hashes": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "1710c6ba4c9f3cbd", + "hash_cont_tokens": "568f585a259965c1" + }, + "truncated": 0, + "non-truncated": 440, + "padded": 440, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-security_studies|5": { + "hashes": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "32a03f1f22a6e103", + "hash_cont_tokens": "cc6fd7cccd64cd5d" + }, + "truncated": 0, + "non-truncated": 980, + "padded": 980, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-sociology|5": { + "hashes": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "828999f7624cbe7e", + "hash_cont_tokens": "c3a3bdfd177eed5b" + }, + "truncated": 0, + "non-truncated": 804, + "padded": 804, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hashes": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "42054621e718dbee", + "hash_cont_tokens": "2568d0e8e36fa959" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-virology|5": { + "hashes": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "6c4f0aa4dc859c04", + "hash_cont_tokens": "926cf60b0891f374" + }, + "truncated": 0, + "non-truncated": 664, + "padded": 664, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-world_religions|5": { + "hashes": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "6c75d44e092ff24f", + "hash_cont_tokens": "c525a5de974c1ea3" + }, + "truncated": 0, + "non-truncated": 684, + "padded": 684, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|truthfulqa:mc|0": { + "hashes": { + 
"hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "2738d7ed7075faa7", + "hash_cont_tokens": "c014154380b74b9e" + }, + "truncated": 0, + "non-truncated": 9996, + "padded": 9996, + "non-padded": 0, + "effective_few_shots": 0.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "d84d18e9a963753d", + "hash_full_prompts": "12b540783521a8e6", + "hash_input_tokens": "5c73a7dce6ccf737", + "hash_cont_tokens": "fb1646e2bdd5fc38" + }, + "total_evaluation_time_secondes": "9928.712373018265", + "truncated": 0, + "non-truncated": 111019, + "padded": 110926, + "non-padded": 93, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/TheBloke/Llama-2-13B-GPTQ/results_2023-08-31T11-12-42.998068.json b/eval-results/TheBloke/Llama-2-13B-GPTQ/results_2023-08-31T11-12-42.998068.json new file mode 100644 index 0000000000000000000000000000000000000000..55f0f14a4556afcacf4614d99c40b489825d0ab7 --- /dev/null +++ b/eval-results/TheBloke/Llama-2-13B-GPTQ/results_2023-08-31T11-12-42.998068.json @@ -0,0 +1,1366 @@ +{ + "config_general": { + "model_name": "TheBloke/Llama-2-13B-GPTQ", + "model_sha": "52126dd33ced924387215154c1415b4b2bb85e0b", + "model_dtype": "None", + "lighteval_sha": "4b9a38c2102259daeb17d1d0294fc2b4ce7f4e63", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|arc:challenge|25": { + "acc": 0.5443686006825939, + "acc_stderr": 0.014553749939306861, + "acc_norm": 0.591296928327645, + "acc_norm_stderr": 0.014365750345427 + }, + "harness|hellaswag|10": { + "acc": 0.6070503883688508, + "acc_stderr": 0.00487407625052158, + "acc_norm": 0.8147779326827326, + "acc_norm_stderr": 0.003876836709461133 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.29, + "acc_stderr": 0.045604802157206845, + "acc_norm": 0.29, + "acc_norm_stderr": 0.045604802157206845 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.5333333333333333, + "acc_stderr": 0.043097329010363554, + "acc_norm": 0.5333333333333333, + "acc_norm_stderr": 0.043097329010363554 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.5526315789473685, + "acc_stderr": 0.04046336883978251, + "acc_norm": 0.5526315789473685, + "acc_norm_stderr": 0.04046336883978251 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.55, + "acc_stderr": 0.049999999999999996, + "acc_norm": 0.55, + "acc_norm_stderr": 0.049999999999999996 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.5773584905660377, + "acc_stderr": 0.030402331445769544, + "acc_norm": 0.5773584905660377, + "acc_norm_stderr": 0.030402331445769544 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.5625, + "acc_stderr": 0.04148415739394154, + "acc_norm": 0.5625, + "acc_norm_stderr": 0.04148415739394154 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.44, + "acc_stderr": 0.04988876515698589, + "acc_norm": 0.44, + "acc_norm_stderr": 0.04988876515698589 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.47, + "acc_stderr": 0.05016135580465919, + "acc_norm": 0.47, + "acc_norm_stderr": 0.05016135580465919 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.27, + "acc_stderr": 0.044619604333847394, + "acc_norm": 0.27, + "acc_norm_stderr": 0.044619604333847394 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.5317919075144508, + "acc_stderr": 0.03804749744364764, + 
"acc_norm": 0.5317919075144508, + "acc_norm_stderr": 0.03804749744364764 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.23529411764705882, + "acc_stderr": 0.04220773659171453, + "acc_norm": 0.23529411764705882, + "acc_norm_stderr": 0.04220773659171453 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.67, + "acc_stderr": 0.04725815626252609, + "acc_norm": 0.67, + "acc_norm_stderr": 0.04725815626252609 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.4425531914893617, + "acc_stderr": 0.03246956919789958, + "acc_norm": 0.4425531914893617, + "acc_norm_stderr": 0.03246956919789958 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.2719298245614035, + "acc_stderr": 0.04185774424022056, + "acc_norm": 0.2719298245614035, + "acc_norm_stderr": 0.04185774424022056 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.503448275862069, + "acc_stderr": 0.04166567577101579, + "acc_norm": 0.503448275862069, + "acc_norm_stderr": 0.04166567577101579 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.30158730158730157, + "acc_stderr": 0.023636975996101806, + "acc_norm": 0.30158730158730157, + "acc_norm_stderr": 0.023636975996101806 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.3412698412698413, + "acc_stderr": 0.04240799327574924, + "acc_norm": 0.3412698412698413, + "acc_norm_stderr": 0.04240799327574924 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.39, + "acc_stderr": 0.04902071300001975, + "acc_norm": 0.39, + "acc_norm_stderr": 0.04902071300001975 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.6548387096774193, + "acc_stderr": 0.02704574657353433, + "acc_norm": 0.6548387096774193, + "acc_norm_stderr": 0.02704574657353433 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.4039408866995074, + "acc_stderr": 0.0345245390382204, + "acc_norm": 0.4039408866995074, + "acc_norm_stderr": 0.0345245390382204 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.59, + "acc_stderr": 0.04943110704237102, + "acc_norm": 0.59, + "acc_norm_stderr": 0.04943110704237102 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.6363636363636364, + "acc_stderr": 0.037563357751878974, + "acc_norm": 0.6363636363636364, + "acc_norm_stderr": 0.037563357751878974 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.6666666666666666, + "acc_stderr": 0.03358618145732522, + "acc_norm": 0.6666666666666666, + "acc_norm_stderr": 0.03358618145732522 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.7823834196891192, + "acc_stderr": 0.029778663037752954, + "acc_norm": 0.7823834196891192, + "acc_norm_stderr": 0.029778663037752954 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.4897435897435897, + "acc_stderr": 0.025345672221942374, + "acc_norm": 0.4897435897435897, + "acc_norm_stderr": 0.025345672221942374 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.3111111111111111, + "acc_stderr": 0.028226446749683522, + "acc_norm": 0.3111111111111111, + "acc_norm_stderr": 0.028226446749683522 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.5546218487394958, + "acc_stderr": 0.032284106267163895, + "acc_norm": 0.5546218487394958, + "acc_norm_stderr": 0.032284106267163895 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.33112582781456956, + "acc_stderr": 0.038425817186598696, + "acc_norm": 0.33112582781456956, + 
"acc_norm_stderr": 0.038425817186598696 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.7357798165137615, + "acc_stderr": 0.018904164171510193, + "acc_norm": 0.7357798165137615, + "acc_norm_stderr": 0.018904164171510193 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.4537037037037037, + "acc_stderr": 0.03395322726375797, + "acc_norm": 0.4537037037037037, + "acc_norm_stderr": 0.03395322726375797 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.7107843137254902, + "acc_stderr": 0.031822318676475544, + "acc_norm": 0.7107843137254902, + "acc_norm_stderr": 0.031822318676475544 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.7172995780590717, + "acc_stderr": 0.02931281415395593, + "acc_norm": 0.7172995780590717, + "acc_norm_stderr": 0.02931281415395593 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.6591928251121076, + "acc_stderr": 0.0318114974705536, + "acc_norm": 0.6591928251121076, + "acc_norm_stderr": 0.0318114974705536 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.5877862595419847, + "acc_stderr": 0.04317171194870254, + "acc_norm": 0.5877862595419847, + "acc_norm_stderr": 0.04317171194870254 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.6694214876033058, + "acc_stderr": 0.04294340845212094, + "acc_norm": 0.6694214876033058, + "acc_norm_stderr": 0.04294340845212094 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.6759259259259259, + "acc_stderr": 0.04524596007030048, + "acc_norm": 0.6759259259259259, + "acc_norm_stderr": 0.04524596007030048 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.6625766871165644, + "acc_stderr": 0.03714908409935575, + "acc_norm": 0.6625766871165644, + "acc_norm_stderr": 0.03714908409935575 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.2767857142857143, + "acc_stderr": 0.04246624336697625, + "acc_norm": 0.2767857142857143, + "acc_norm_stderr": 0.04246624336697625 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.7378640776699029, + "acc_stderr": 0.04354631077260595, + "acc_norm": 0.7378640776699029, + "acc_norm_stderr": 0.04354631077260595 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.7905982905982906, + "acc_stderr": 0.026655699653922726, + "acc_norm": 0.7905982905982906, + "acc_norm_stderr": 0.026655699653922726 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.52, + "acc_stderr": 0.050211673156867795, + "acc_norm": 0.52, + "acc_norm_stderr": 0.050211673156867795 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.7343550446998723, + "acc_stderr": 0.01579430248788873, + "acc_norm": 0.7343550446998723, + "acc_norm_stderr": 0.01579430248788873 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.630057803468208, + "acc_stderr": 0.025992472029306393, + "acc_norm": 0.630057803468208, + "acc_norm_stderr": 0.025992472029306393 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.31731843575418994, + "acc_stderr": 0.01556639263005703, + "acc_norm": 0.31731843575418994, + "acc_norm_stderr": 0.01556639263005703 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.6078431372549019, + "acc_stderr": 0.027956046165424516, + "acc_norm": 0.6078431372549019, + "acc_norm_stderr": 0.027956046165424516 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.6591639871382636, + "acc_stderr": 0.026920841260776162, + "acc_norm": 0.6591639871382636, + "acc_norm_stderr": 0.026920841260776162 + }, + "harness|hendrycksTest-prehistory|5": { + 
"acc": 0.6358024691358025, + "acc_stderr": 0.026774929899722327, + "acc_norm": 0.6358024691358025, + "acc_norm_stderr": 0.026774929899722327 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.3971631205673759, + "acc_stderr": 0.0291898056735871, + "acc_norm": 0.3971631205673759, + "acc_norm_stderr": 0.0291898056735871 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.4178617992177314, + "acc_stderr": 0.01259674410899856, + "acc_norm": 0.4178617992177314, + "acc_norm_stderr": 0.01259674410899856 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.4742647058823529, + "acc_stderr": 0.030332578094555033, + "acc_norm": 0.4742647058823529, + "acc_norm_stderr": 0.030332578094555033 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.5571895424836601, + "acc_stderr": 0.02009508315457734, + "acc_norm": 0.5571895424836601, + "acc_norm_stderr": 0.02009508315457734 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.6727272727272727, + "acc_stderr": 0.04494290866252091, + "acc_norm": 0.6727272727272727, + "acc_norm_stderr": 0.04494290866252091 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.6408163265306123, + "acc_stderr": 0.03071356045510849, + "acc_norm": 0.6408163265306123, + "acc_norm_stderr": 0.03071356045510849 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.7213930348258707, + "acc_stderr": 0.031700561834973086, + "acc_norm": 0.7213930348258707, + "acc_norm_stderr": 0.031700561834973086 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.81, + "acc_stderr": 0.039427724440366255, + "acc_norm": 0.81, + "acc_norm_stderr": 0.039427724440366255 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.45180722891566266, + "acc_stderr": 0.03874371556587953, + "acc_norm": 0.45180722891566266, + "acc_norm_stderr": 0.03874371556587953 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.7543859649122807, + "acc_stderr": 0.033014059469872487, + "acc_norm": 0.7543859649122807, + "acc_norm_stderr": 0.033014059469872487 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.25703794369645044, + "mc1_stderr": 0.01529807750948508, + "mc2": 0.37071042385732017, + "mc2_stderr": 0.01376534132094419 + }, + "all": { + "acc": 0.5455217135882148, + "acc_stderr": 0.03452354370556732, + "acc_norm": 0.5498379148078576, + "acc_norm_stderr": 0.03450345490667003, + "mc1": 0.25703794369645044, + "mc1_stderr": 0.01529807750948508, + "mc2": 0.37071042385732017, + "mc2_stderr": 0.01376534132094419 + } + }, + "versions": { + "harness|arc:challenge|25": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + 
"harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "all": 0 + }, + "config_tasks": { + "harness|arc:challenge": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM 
Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task" + }, + "summary_tasks": { + "harness|arc:challenge|25": { + "hashes": { + "hash_examples": "17b0cae357c0259e", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "61571bf68d6d89aa", + "hash_cont_tokens": "8210decc6ff6f7df" + }, + "truncated": 0, + "non-truncated": 4687, + "padded": 4687, + "non-padded": 0, + "effective_few_shots": 25.0, + "num_truncated_few_shots": 0 + }, + "harness|hellaswag|10": { + "hashes": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "29906669b1c7054a", + "hash_cont_tokens": "b3b9e9017afa63af" + }, + "truncated": 0, + "non-truncated": 40168, + "padded": 40113, + "non-padded": 55, + "effective_few_shots": 10.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hashes": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "c54ff61ad0273dd7", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-anatomy|5": { + "hashes": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "be31a1e22aef5f90", + 
"hash_cont_tokens": "f11971a765cb609f" + }, + "truncated": 0, + "non-truncated": 540, + "padded": 540, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-astronomy|5": { + "hashes": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "277a7b1fad566940", + "hash_cont_tokens": "bf30e5d3f48250cb" + }, + "truncated": 0, + "non-truncated": 608, + "padded": 608, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-business_ethics|5": { + "hashes": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "ba552605bc116de5", + "hash_cont_tokens": "bc1dd9b2d995eb61" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hashes": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "428c7563d0b98ab9", + "hash_cont_tokens": "890a119624b3b935" + }, + "truncated": 0, + "non-truncated": 1060, + "padded": 1060, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_biology|5": { + "hashes": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "da036601573942e2", + "hash_cont_tokens": "875cde3af7a0ee14" + }, + "truncated": 0, + "non-truncated": 576, + "padded": 576, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_chemistry|5": { + "hashes": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "94e0196d6aded13d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_computer_science|5": { + "hashes": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "6e4d0f4a8d36690b", + "hash_cont_tokens": "ffc0fe414cdc4a83" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_mathematics|5": { + "hashes": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "614054d17109a25d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_medicine|5": { + "hashes": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "1d633b3cc0524ba8", + "hash_cont_tokens": "1f88b00d41957d82" + }, + "truncated": 0, + "non-truncated": 692, + "padded": 692, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_physics|5": { + "hashes": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "5421d9a1af86cbd4", + "hash_cont_tokens": "f7b8097afc16a47c" + }, + "truncated": 0, + "non-truncated": 408, + "padded": 408, + "non-padded": 0, + "effective_few_shots": 5.0, + 
"num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-computer_security|5": { + "hashes": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "5e6b70ecb333cf18", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hashes": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "c2ef11a87264ceed", + "hash_cont_tokens": "aa0e8bc655f2f641" + }, + "truncated": 0, + "non-truncated": 940, + "padded": 940, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-econometrics|5": { + "hashes": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "ecaccd912a4c3978", + "hash_cont_tokens": "bfb7e3c3c88313f1" + }, + "truncated": 0, + "non-truncated": 456, + "padded": 456, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hashes": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "1590c84291399be8", + "hash_cont_tokens": "2425a3f084a591ef" + }, + "truncated": 0, + "non-truncated": 580, + "padded": 580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hashes": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "3269597f715b0da1", + "hash_cont_tokens": "f52691aef15a407b" + }, + "truncated": 0, + "non-truncated": 1512, + "padded": 1512, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-formal_logic|5": { + "hashes": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "a2800d20f3ab8d7c", + "hash_cont_tokens": "f515d598d9c21263" + }, + "truncated": 0, + "non-truncated": 504, + "padded": 504, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-global_facts|5": { + "hashes": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "94ed44b3772505ad", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_biology|5": { + "hashes": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "24423acb928db768", + "hash_cont_tokens": "bd85a4156a3613ee" + }, + "truncated": 0, + "non-truncated": 1240, + "padded": 1240, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hashes": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "831ff35c474e5cef", + "hash_cont_tokens": "a95c97af1c14e068" + }, + "truncated": 0, + "non-truncated": 812, + "padded": 812, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hashes": { + "hash_examples": "8b8cdb1084f24169", 
+ "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "8c34e0f2bda77358", + "hash_cont_tokens": "8abfedef914e33c9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hashes": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "f1f73dd687da18d7", + "hash_cont_tokens": "674fc454bdc5ac93" + }, + "truncated": 660, + "non-truncated": 0, + "padded": 0, + "non-padded": 660, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_geography|5": { + "hashes": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "7c5547c7da5bc793", + "hash_cont_tokens": "03a5012b916274ea" + }, + "truncated": 0, + "non-truncated": 792, + "padded": 792, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hashes": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "f62991cb6a496b05", + "hash_cont_tokens": "a83effb8f76b7d7c" + }, + "truncated": 0, + "non-truncated": 772, + "padded": 772, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hashes": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "4cef2aff6e3d59ed", + "hash_cont_tokens": "c583432ad27fcfe0" + }, + "truncated": 0, + "non-truncated": 1560, + "padded": 1560, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hashes": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "6e2577ea4082ed2b", + "hash_cont_tokens": "24f5dc613660300b" + }, + "truncated": 0, + "non-truncated": 1080, + "padded": 1080, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hashes": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "c5fc9aeb1079c8e4", + "hash_cont_tokens": "f47f041de50333b9" + }, + "truncated": 0, + "non-truncated": 952, + "padded": 952, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_physics|5": { + "hashes": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "555fc385cffa84ca", + "hash_cont_tokens": "ba2efcd283e938cc" + }, + "truncated": 0, + "non-truncated": 604, + "padded": 604, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hashes": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "febd23cbf9973b7f", + "hash_cont_tokens": "942069cd363844d9" + }, + "truncated": 0, + "non-truncated": 2180, + "padded": 2180, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hashes": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": 
"424b02981230ee83", + "hash_cont_tokens": "955ed42b6f7fa019" + }, + "truncated": 0, + "non-truncated": 864, + "padded": 864, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hashes": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "50c9ff438c85a69e", + "hash_cont_tokens": "cdd0b3dc06d933e5" + }, + "truncated": 816, + "non-truncated": 0, + "padded": 0, + "non-padded": 816, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hashes": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "054824cc474caef5", + "hash_cont_tokens": "9a864184946033ac" + }, + "truncated": 8, + "non-truncated": 940, + "padded": 940, + "non-padded": 8, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_aging|5": { + "hashes": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "541a75f071dcf579", + "hash_cont_tokens": "142a4a8a1138a214" + }, + "truncated": 0, + "non-truncated": 892, + "padded": 892, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_sexuality|5": { + "hashes": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "04269e5c5a257dd9", + "hash_cont_tokens": "bc54813e809b796d" + }, + "truncated": 0, + "non-truncated": 524, + "padded": 524, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-international_law|5": { + "hashes": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "d93ba9d9d38e4397", + "hash_cont_tokens": "dc45b45fcda18e5d" + }, + "truncated": 0, + "non-truncated": 484, + "padded": 484, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-jurisprudence|5": { + "hashes": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "9eeaccd2698b4f5a", + "hash_cont_tokens": "e3a8cd951b6e3469" + }, + "truncated": 0, + "non-truncated": 432, + "padded": 432, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hashes": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "b4f08f544f2b7576", + "hash_cont_tokens": "1e80dbd30f6453d5" + }, + "truncated": 0, + "non-truncated": 652, + "padded": 648, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-machine_learning|5": { + "hashes": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "900c2a51f1174b9f", + "hash_cont_tokens": "9b37da7777378ca9" + }, + "truncated": 0, + "non-truncated": 448, + "padded": 448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-management|5": { + "hashes": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "6b36efb4689c6eca", + "hash_cont_tokens": "a01d6d39a83c4597" + }, + "truncated": 0, + "non-truncated": 412, + "padded": 412, + "non-padded": 0, + 
"effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-marketing|5": { + "hashes": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "2aaac78a0cfed47a", + "hash_cont_tokens": "6aeaed4d823c98aa" + }, + "truncated": 0, + "non-truncated": 936, + "padded": 936, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-medical_genetics|5": { + "hashes": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "886ca823b41c094a", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-miscellaneous|5": { + "hashes": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "72fd71de7675e7d0", + "hash_cont_tokens": "9b0ab02a64603081" + }, + "truncated": 0, + "non-truncated": 3132, + "padded": 3132, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_disputes|5": { + "hashes": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "f3ca0dd8e7a1eb09", + "hash_cont_tokens": "8badf768f7b0467a" + }, + "truncated": 0, + "non-truncated": 1384, + "padded": 1354, + "non-padded": 30, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hashes": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "3e793631e951f23c", + "hash_cont_tokens": "32ae620376b2bbba" + }, + "truncated": 0, + "non-truncated": 3580, + "padded": 3580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-nutrition|5": { + "hashes": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "59753c2144ea93af", + "hash_cont_tokens": "3071def75bacc404" + }, + "truncated": 0, + "non-truncated": 1224, + "padded": 1224, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-philosophy|5": { + "hashes": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "bd8d3dbed15a8c34", + "hash_cont_tokens": "9f6ff69d23a48783" + }, + "truncated": 0, + "non-truncated": 1244, + "padded": 1244, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-prehistory|5": { + "hashes": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "3573cd87facbb7c5", + "hash_cont_tokens": "de469d2b981e32a3" + }, + "truncated": 0, + "non-truncated": 1296, + "padded": 1296, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_accounting|5": { + "hashes": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "17e721bc1a7cbb47", + "hash_cont_tokens": "c46f74d2dfc7b13b" + }, + "truncated": 0, + "non-truncated": 1128, + "padded": 1128, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_law|5": { + "hashes": { + "hash_examples": "cd9dbc52b3c932d6", + 
"hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "9178e10bd0763ec4", + "hash_cont_tokens": "2e590029ef41fbcd" + }, + "truncated": 604, + "non-truncated": 5532, + "padded": 5524, + "non-padded": 612, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_medicine|5": { + "hashes": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "f5a22012a54f70ea", + "hash_cont_tokens": "fe35cfa9c6ca802e" + }, + "truncated": 0, + "non-truncated": 1088, + "padded": 1088, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_psychology|5": { + "hashes": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "0dfb73a8eb3f692c", + "hash_cont_tokens": "f020fbddf72c8652" + }, + "truncated": 0, + "non-truncated": 2448, + "padded": 2448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-public_relations|5": { + "hashes": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "1710c6ba4c9f3cbd", + "hash_cont_tokens": "568f585a259965c1" + }, + "truncated": 0, + "non-truncated": 440, + "padded": 440, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-security_studies|5": { + "hashes": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "d49711415961ced7", + "hash_cont_tokens": "cc6fd7cccd64cd5d" + }, + "truncated": 0, + "non-truncated": 980, + "padded": 980, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-sociology|5": { + "hashes": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "828999f7624cbe7e", + "hash_cont_tokens": "c3a3bdfd177eed5b" + }, + "truncated": 0, + "non-truncated": 804, + "padded": 804, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hashes": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "42054621e718dbee", + "hash_cont_tokens": "2568d0e8e36fa959" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-virology|5": { + "hashes": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "6c4f0aa4dc859c04", + "hash_cont_tokens": "926cf60b0891f374" + }, + "truncated": 0, + "non-truncated": 664, + "padded": 664, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-world_religions|5": { + "hashes": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "6c75d44e092ff24f", + "hash_cont_tokens": "c525a5de974c1ea3" + }, + "truncated": 0, + "non-truncated": 684, + "padded": 684, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|truthfulqa:mc|0": { + "hashes": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "2738d7ed7075faa7", + "hash_cont_tokens": "c014154380b74b9e" + }, + "truncated": 0, + "non-truncated": 
9996, + "padded": 9996, + "non-padded": 0, + "effective_few_shots": 0.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "d84d18e9a963753d", + "hash_full_prompts": "12b540783521a8e6", + "hash_input_tokens": "6fecf578c508db6a", + "hash_cont_tokens": "fb1646e2bdd5fc38" + }, + "total_evaluation_time_secondes": "8995.525546312332", + "truncated": 2088, + "non-truncated": 108931, + "padded": 108834, + "non-padded": 2185, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/TheBloke/Llama-2-13B-GPTQ/results_2023-10-27T16-26-14.370378.json b/eval-results/TheBloke/Llama-2-13B-GPTQ/results_2023-10-27T16-26-14.370378.json new file mode 100644 index 0000000000000000000000000000000000000000..a167021354732a95a4dfef6aba3b9e33e13e279f --- /dev/null +++ b/eval-results/TheBloke/Llama-2-13B-GPTQ/results_2023-10-27T16-26-14.370378.json @@ -0,0 +1,107 @@ +{ + "config_general": { + "model_name": "TheBloke/Llama-2-13B-GPTQ", + "model_sha": "cacb45a9b6052bef68f6f50b6f00aa40f7a799d9", + "model_size": "7.49 GB", + "model_dtype": "None", + "lighteval_sha": "0f318ecf002208468154899217b3ba7c6ae09374", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|drop|3": { + "em": 0.0020973154362416107, + "em_stderr": 0.0004685065030368251, + "f1": 0.06011535234899329, + "f1_stderr": 0.0013639179977941345 + }, + "harness|gsm8k|5": { + "acc": 0.11296436694465505, + "acc_stderr": 0.00871933902883308 + }, + "harness|winogrande|5": { + "acc": 0.7616416732438832, + "acc_stderr": 0.011974948667702316 + }, + "all": { + "em": 0.0020973154362416107, + "em_stderr": 0.0004685065030368251, + "f1": 0.06011535234899329, + "f1_stderr": 0.0013639179977941345, + "acc": 0.43730302009426913, + "acc_stderr": 0.010347143848267699 + } + }, + "versions": { + "harness|drop|3": 1, + "harness|gsm8k|5": 0, + "harness|winogrande|5": 0, + "all": 0 + }, + "config_tasks": { + "harness|drop": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|drop|3": { + "hashes": { + "hash_examples": "1d27416e8324e9a3", + "hash_full_prompts": "a5513ff9a741b385", + "hash_input_tokens": "61b608e0b5ceed76", + "hash_cont_tokens": "abdf329dae00bafa" + }, + "truncated": 1263, + "non-truncated": 8273, + "padded": 0, + "non-padded": 9536, + "effective_few_shots": 3.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "bda342e47b5099b2", + "hash_cont_tokens": "2a233218f26d2a57" + }, + "truncated": 0, + "non-truncated": 1319, + "padded": 0, + "non-padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "c0bedf98cb040854", + "hash_cont_tokens": "f08975ad6f2d5864" + }, + "truncated": 0, + "non-truncated": 2534, + "padded": 2432, + "non-padded": 102, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "9b4d8993161e637d", + "hash_full_prompts": "08215e527b7e60a5", + "hash_input_tokens": "80afe720f936f8d2", + "hash_cont_tokens": "c24e2609fa7f6293" + }, + "total_evaluation_time_secondes": "7033.646796941757", + "truncated": 1263, + "non-truncated": 12126, 
+ "padded": 2432, + "non-padded": 10957, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/TheBloke/Llama-2-13B-fp16/results_2023-07-24T15-08-39.202746.json b/eval-results/TheBloke/Llama-2-13B-fp16/results_2023-07-24T15-08-39.202746.json new file mode 100644 index 0000000000000000000000000000000000000000..0848df876c104c889fe91b70852b4f13f74d5e83 --- /dev/null +++ b/eval-results/TheBloke/Llama-2-13B-fp16/results_2023-07-24T15-08-39.202746.json @@ -0,0 +1,1365 @@ +{ + "results": { + "harness|arc:challenge|25": { + "acc": 0.5477815699658704, + "acc_stderr": 0.014544519880633829, + "acc_norm": 0.5930034129692833, + "acc_norm_stderr": 0.014356399418009121 + }, + "harness|hellaswag|10": { + "acc": 0.614618601872137, + "acc_stderr": 0.004856906473719381, + "acc_norm": 0.8215494921330412, + "acc_norm_stderr": 0.003821090082721709 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.35, + "acc_stderr": 0.04793724854411022, + "acc_norm": 0.35, + "acc_norm_stderr": 0.04793724854411022 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.4666666666666667, + "acc_stderr": 0.043097329010363554, + "acc_norm": 0.4666666666666667, + "acc_norm_stderr": 0.043097329010363554 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.5328947368421053, + "acc_stderr": 0.040601270352363966, + "acc_norm": 0.5328947368421053, + "acc_norm_stderr": 0.040601270352363966 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.55, + "acc_stderr": 0.049999999999999996, + "acc_norm": 0.55, + "acc_norm_stderr": 0.049999999999999996 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.6037735849056604, + "acc_stderr": 0.030102793781791197, + "acc_norm": 0.6037735849056604, + "acc_norm_stderr": 0.030102793781791197 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.6111111111111112, + "acc_stderr": 0.04076663253918567, + "acc_norm": 0.6111111111111112, + "acc_norm_stderr": 0.04076663253918567 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.44, + "acc_stderr": 0.04988876515698589, + "acc_norm": 0.44, + "acc_norm_stderr": 0.04988876515698589 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.47, + "acc_stderr": 0.05016135580465919, + "acc_norm": 0.47, + "acc_norm_stderr": 0.05016135580465919 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.3, + "acc_stderr": 0.046056618647183814, + "acc_norm": 0.3, + "acc_norm_stderr": 0.046056618647183814 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.5317919075144508, + "acc_stderr": 0.03804749744364764, + "acc_norm": 0.5317919075144508, + "acc_norm_stderr": 0.03804749744364764 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.24509803921568626, + "acc_stderr": 0.04280105837364395, + "acc_norm": 0.24509803921568626, + "acc_norm_stderr": 0.04280105837364395 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.7, + "acc_stderr": 0.046056618647183814, + "acc_norm": 0.7, + "acc_norm_stderr": 0.046056618647183814 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.425531914893617, + "acc_stderr": 0.032321469162244675, + "acc_norm": 0.425531914893617, + "acc_norm_stderr": 0.032321469162244675 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.32456140350877194, + "acc_stderr": 0.04404556157374768, + "acc_norm": 0.32456140350877194, + "acc_norm_stderr": 0.04404556157374768 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.5103448275862069, + "acc_stderr": 0.04165774775728762, 
+ "acc_norm": 0.5103448275862069, + "acc_norm_stderr": 0.04165774775728762 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.3386243386243386, + "acc_stderr": 0.02437319786798306, + "acc_norm": 0.3386243386243386, + "acc_norm_stderr": 0.02437319786798306 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.38095238095238093, + "acc_stderr": 0.04343525428949097, + "acc_norm": 0.38095238095238093, + "acc_norm_stderr": 0.04343525428949097 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.33, + "acc_stderr": 0.04725815626252604, + "acc_norm": 0.33, + "acc_norm_stderr": 0.04725815626252604 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.6741935483870968, + "acc_stderr": 0.026662010578567107, + "acc_norm": 0.6741935483870968, + "acc_norm_stderr": 0.026662010578567107 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.4482758620689655, + "acc_stderr": 0.034991131376767445, + "acc_norm": 0.4482758620689655, + "acc_norm_stderr": 0.034991131376767445 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.57, + "acc_stderr": 0.04975698519562427, + "acc_norm": 0.57, + "acc_norm_stderr": 0.04975698519562427 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.6545454545454545, + "acc_stderr": 0.03713158067481913, + "acc_norm": 0.6545454545454545, + "acc_norm_stderr": 0.03713158067481913 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.6868686868686869, + "acc_stderr": 0.033042050878136525, + "acc_norm": 0.6868686868686869, + "acc_norm_stderr": 0.033042050878136525 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.8186528497409327, + "acc_stderr": 0.02780703236068609, + "acc_norm": 0.8186528497409327, + "acc_norm_stderr": 0.02780703236068609 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.5102564102564102, + "acc_stderr": 0.025345672221942374, + "acc_norm": 0.5102564102564102, + "acc_norm_stderr": 0.025345672221942374 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.2740740740740741, + "acc_stderr": 0.027195934804085622, + "acc_norm": 0.2740740740740741, + "acc_norm_stderr": 0.027195934804085622 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.5840336134453782, + "acc_stderr": 0.032016501007396114, + "acc_norm": 0.5840336134453782, + "acc_norm_stderr": 0.032016501007396114 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.3708609271523179, + "acc_stderr": 0.03943966699183629, + "acc_norm": 0.3708609271523179, + "acc_norm_stderr": 0.03943966699183629 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.7596330275229358, + "acc_stderr": 0.01832060732096407, + "acc_norm": 0.7596330275229358, + "acc_norm_stderr": 0.01832060732096407 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.44907407407407407, + "acc_stderr": 0.03392238405321616, + "acc_norm": 0.44907407407407407, + "acc_norm_stderr": 0.03392238405321616 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.7450980392156863, + "acc_stderr": 0.030587591351604246, + "acc_norm": 0.7450980392156863, + "acc_norm_stderr": 0.030587591351604246 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.7215189873417721, + "acc_stderr": 0.029178682304842538, + "acc_norm": 0.7215189873417721, + "acc_norm_stderr": 0.029178682304842538 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.6367713004484304, + "acc_stderr": 
0.03227790442850499, + "acc_norm": 0.6367713004484304, + "acc_norm_stderr": 0.03227790442850499 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.6183206106870229, + "acc_stderr": 0.04260735157644559, + "acc_norm": 0.6183206106870229, + "acc_norm_stderr": 0.04260735157644559 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.7355371900826446, + "acc_stderr": 0.04026187527591207, + "acc_norm": 0.7355371900826446, + "acc_norm_stderr": 0.04026187527591207 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.7407407407407407, + "acc_stderr": 0.04236511258094633, + "acc_norm": 0.7407407407407407, + "acc_norm_stderr": 0.04236511258094633 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.6687116564417178, + "acc_stderr": 0.03697983910025588, + "acc_norm": 0.6687116564417178, + "acc_norm_stderr": 0.03697983910025588 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.26785714285714285, + "acc_stderr": 0.04203277291467762, + "acc_norm": 0.26785714285714285, + "acc_norm_stderr": 0.04203277291467762 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.7281553398058253, + "acc_stderr": 0.044052680241409216, + "acc_norm": 0.7281553398058253, + "acc_norm_stderr": 0.044052680241409216 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.7948717948717948, + "acc_stderr": 0.02645350805404032, + "acc_norm": 0.7948717948717948, + "acc_norm_stderr": 0.02645350805404032 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.54, + "acc_stderr": 0.05009082659620332, + "acc_norm": 0.54, + "acc_norm_stderr": 0.05009082659620332 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.7471264367816092, + "acc_stderr": 0.015543377313719681, + "acc_norm": 0.7471264367816092, + "acc_norm_stderr": 0.015543377313719681 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.6473988439306358, + "acc_stderr": 0.025722802200895803, + "acc_norm": 0.6473988439306358, + "acc_norm_stderr": 0.025722802200895803 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.394413407821229, + "acc_stderr": 0.01634538676210397, + "acc_norm": 0.394413407821229, + "acc_norm_stderr": 0.01634538676210397 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.6241830065359477, + "acc_stderr": 0.027732834353363947, + "acc_norm": 0.6241830065359477, + "acc_norm_stderr": 0.027732834353363947 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.6430868167202572, + "acc_stderr": 0.027210420375934023, + "acc_norm": 0.6430868167202572, + "acc_norm_stderr": 0.027210420375934023 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.6512345679012346, + "acc_stderr": 0.026517597724465013, + "acc_norm": 0.6512345679012346, + "acc_norm_stderr": 0.026517597724465013 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.3900709219858156, + "acc_stderr": 0.029097675599463926, + "acc_norm": 0.3900709219858156, + "acc_norm_stderr": 0.029097675599463926 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.423728813559322, + "acc_stderr": 0.012620785155885994, + "acc_norm": 0.423728813559322, + "acc_norm_stderr": 0.012620785155885994 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.5367647058823529, + "acc_stderr": 0.03029061918048569, + "acc_norm": 0.5367647058823529, + "acc_norm_stderr": 0.03029061918048569 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.5506535947712419, + "acc_stderr": 0.020123766528027266, + "acc_norm": 0.5506535947712419, + "acc_norm_stderr": 0.020123766528027266 
+ }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.6090909090909091, + "acc_stderr": 0.04673752333670239, + "acc_norm": 0.6090909090909091, + "acc_norm_stderr": 0.04673752333670239 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.636734693877551, + "acc_stderr": 0.030789051139030806, + "acc_norm": 0.636734693877551, + "acc_norm_stderr": 0.030789051139030806 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.7263681592039801, + "acc_stderr": 0.031524391865554016, + "acc_norm": 0.7263681592039801, + "acc_norm_stderr": 0.031524391865554016 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.82, + "acc_stderr": 0.038612291966536934, + "acc_norm": 0.82, + "acc_norm_stderr": 0.038612291966536934 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.463855421686747, + "acc_stderr": 0.03882310850890593, + "acc_norm": 0.463855421686747, + "acc_norm_stderr": 0.03882310850890593 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.7602339181286549, + "acc_stderr": 0.03274485211946956, + "acc_norm": 0.7602339181286549, + "acc_norm_stderr": 0.03274485211946956 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.26805385556915545, + "mc1_stderr": 0.01550620472283456, + "mc2": 0.3738783761432801, + "mc2_stderr": 0.013688879517868343 + }, + "all": { + "acc": 0.5575719768163659, + "acc_stderr": 0.034423155721833555, + "acc_norm": 0.561845751956439, + "acc_norm_stderr": 0.03440241102939928, + "mc1": 0.26805385556915545, + "mc1_stderr": 0.01550620472283456, + "mc2": 0.3738783761432801, + "mc2_stderr": 0.013688879517868343 + } + }, + "versions": { + "harness|arc:challenge|25": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + 
"harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "all": 0 + }, + "config_general": { + "model_name": "TheBloke/Llama-2-13B-fp16", + "model_sha": "b2e65e8ad4bb35e5abaee0170ebd5fc2134a50bb", + "model_dtype": "torch.float16", + "lighteval_sha": "03c2fad20ff7f5334c33cfee459024b8d7e4a109", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null + }, + "config_tasks": { + "harness|arc:challenge": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + 
"harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task" + }, + "summary_tasks": { + "harness|arc:challenge|25": { + "hashes": { + "hash_examples": "17b0cae357c0259e", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "3722289b79076c44", + "hash_cont_tokens": "8210decc6ff6f7df" + }, + "truncated": 0, + "non-truncated": 4687, + "padded": 4687, + "non-padded": 0, + "effective_few_shots": 25.0, + "num_truncated_few_shots": 0 + }, + "harness|hellaswag|10": { + "hashes": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "ececd684171f1ef2", + "hash_cont_tokens": "b3b9e9017afa63af" + }, + "truncated": 0, + "non-truncated": 40168, + "padded": 40113, + "non-padded": 55, + "effective_few_shots": 10.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hashes": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "c54ff61ad0273dd7", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-anatomy|5": { + "hashes": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "be31a1e22aef5f90", + "hash_cont_tokens": "f11971a765cb609f" + }, + "truncated": 0, + "non-truncated": 540, + "padded": 540, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-astronomy|5": { + "hashes": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "277a7b1fad566940", + "hash_cont_tokens": "bf30e5d3f48250cb" + }, + "truncated": 0, + "non-truncated": 608, + "padded": 608, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + 
"harness|hendrycksTest-business_ethics|5": { + "hashes": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "ba552605bc116de5", + "hash_cont_tokens": "bc1dd9b2d995eb61" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hashes": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "428c7563d0b98ab9", + "hash_cont_tokens": "890a119624b3b935" + }, + "truncated": 0, + "non-truncated": 1060, + "padded": 1060, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_biology|5": { + "hashes": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "da036601573942e2", + "hash_cont_tokens": "875cde3af7a0ee14" + }, + "truncated": 0, + "non-truncated": 576, + "padded": 576, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_chemistry|5": { + "hashes": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "94e0196d6aded13d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_computer_science|5": { + "hashes": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "6e4d0f4a8d36690b", + "hash_cont_tokens": "ffc0fe414cdc4a83" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_mathematics|5": { + "hashes": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "614054d17109a25d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_medicine|5": { + "hashes": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "081bb2b524defd1c", + "hash_cont_tokens": "1f88b00d41957d82" + }, + "truncated": 0, + "non-truncated": 692, + "padded": 692, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_physics|5": { + "hashes": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "5421d9a1af86cbd4", + "hash_cont_tokens": "f7b8097afc16a47c" + }, + "truncated": 0, + "non-truncated": 408, + "padded": 408, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-computer_security|5": { + "hashes": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "5e6b70ecb333cf18", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hashes": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + 
"hash_input_tokens": "c2ef11a87264ceed", + "hash_cont_tokens": "aa0e8bc655f2f641" + }, + "truncated": 0, + "non-truncated": 940, + "padded": 940, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-econometrics|5": { + "hashes": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "ecaccd912a4c3978", + "hash_cont_tokens": "bfb7e3c3c88313f1" + }, + "truncated": 0, + "non-truncated": 456, + "padded": 456, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hashes": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "1590c84291399be8", + "hash_cont_tokens": "2425a3f084a591ef" + }, + "truncated": 0, + "non-truncated": 580, + "padded": 580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hashes": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "3269597f715b0da1", + "hash_cont_tokens": "f52691aef15a407b" + }, + "truncated": 0, + "non-truncated": 1512, + "padded": 1512, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-formal_logic|5": { + "hashes": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "a2800d20f3ab8d7c", + "hash_cont_tokens": "f515d598d9c21263" + }, + "truncated": 0, + "non-truncated": 504, + "padded": 504, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-global_facts|5": { + "hashes": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "94ed44b3772505ad", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_biology|5": { + "hashes": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "24423acb928db768", + "hash_cont_tokens": "bd85a4156a3613ee" + }, + "truncated": 0, + "non-truncated": 1240, + "padded": 1240, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hashes": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "831ff35c474e5cef", + "hash_cont_tokens": "a95c97af1c14e068" + }, + "truncated": 0, + "non-truncated": 812, + "padded": 812, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hashes": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "a20a96b44dcc5b30", + "hash_cont_tokens": "8abfedef914e33c9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hashes": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "5002f4ac8b1562ca", + "hash_cont_tokens": "674fc454bdc5ac93" + }, + "truncated": 0, + "non-truncated": 
660, + "padded": 656, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_geography|5": { + "hashes": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "7c5547c7da5bc793", + "hash_cont_tokens": "03a5012b916274ea" + }, + "truncated": 0, + "non-truncated": 792, + "padded": 792, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hashes": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "f62991cb6a496b05", + "hash_cont_tokens": "a83effb8f76b7d7c" + }, + "truncated": 0, + "non-truncated": 772, + "padded": 772, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hashes": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "4cef2aff6e3d59ed", + "hash_cont_tokens": "c583432ad27fcfe0" + }, + "truncated": 0, + "non-truncated": 1560, + "padded": 1560, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hashes": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "6e2577ea4082ed2b", + "hash_cont_tokens": "24f5dc613660300b" + }, + "truncated": 0, + "non-truncated": 1080, + "padded": 1080, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hashes": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "c5fc9aeb1079c8e4", + "hash_cont_tokens": "f47f041de50333b9" + }, + "truncated": 0, + "non-truncated": 952, + "padded": 952, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_physics|5": { + "hashes": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "555fc385cffa84ca", + "hash_cont_tokens": "ba2efcd283e938cc" + }, + "truncated": 0, + "non-truncated": 604, + "padded": 604, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hashes": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "febd23cbf9973b7f", + "hash_cont_tokens": "942069cd363844d9" + }, + "truncated": 0, + "non-truncated": 2180, + "padded": 2180, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hashes": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "400e55b56ee6fbd7", + "hash_cont_tokens": "955ed42b6f7fa019" + }, + "truncated": 0, + "non-truncated": 864, + "padded": 864, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hashes": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "c639cce12a46ebad", + "hash_cont_tokens": "cdd0b3dc06d933e5" + }, + "truncated": 0, + "non-truncated": 816, + "padded": 816, + "non-padded": 0, + "effective_few_shots": 5.0, + 
"num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hashes": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "b9762065cce6f3a6", + "hash_cont_tokens": "9a864184946033ac" + }, + "truncated": 0, + "non-truncated": 948, + "padded": 948, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_aging|5": { + "hashes": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "541a75f071dcf579", + "hash_cont_tokens": "142a4a8a1138a214" + }, + "truncated": 0, + "non-truncated": 892, + "padded": 892, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_sexuality|5": { + "hashes": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "04269e5c5a257dd9", + "hash_cont_tokens": "bc54813e809b796d" + }, + "truncated": 0, + "non-truncated": 524, + "padded": 524, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-international_law|5": { + "hashes": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "d93ba9d9d38e4397", + "hash_cont_tokens": "dc45b45fcda18e5d" + }, + "truncated": 0, + "non-truncated": 484, + "padded": 484, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-jurisprudence|5": { + "hashes": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "9eeaccd2698b4f5a", + "hash_cont_tokens": "e3a8cd951b6e3469" + }, + "truncated": 0, + "non-truncated": 432, + "padded": 432, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hashes": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "b4f08f544f2b7576", + "hash_cont_tokens": "1e80dbd30f6453d5" + }, + "truncated": 0, + "non-truncated": 652, + "padded": 648, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-machine_learning|5": { + "hashes": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "900c2a51f1174b9f", + "hash_cont_tokens": "9b37da7777378ca9" + }, + "truncated": 0, + "non-truncated": 448, + "padded": 448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-management|5": { + "hashes": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "6b36efb4689c6eca", + "hash_cont_tokens": "a01d6d39a83c4597" + }, + "truncated": 0, + "non-truncated": 412, + "padded": 412, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-marketing|5": { + "hashes": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "2aaac78a0cfed47a", + "hash_cont_tokens": "6aeaed4d823c98aa" + }, + "truncated": 0, + "non-truncated": 936, + "padded": 936, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-medical_genetics|5": { + "hashes": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": 
"202139046daa118f", + "hash_input_tokens": "886ca823b41c094a", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-miscellaneous|5": { + "hashes": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "72fd71de7675e7d0", + "hash_cont_tokens": "9b0ab02a64603081" + }, + "truncated": 0, + "non-truncated": 3132, + "padded": 3132, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_disputes|5": { + "hashes": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "f3ca0dd8e7a1eb09", + "hash_cont_tokens": "8badf768f7b0467a" + }, + "truncated": 0, + "non-truncated": 1384, + "padded": 1354, + "non-padded": 30, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hashes": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "3e793631e951f23c", + "hash_cont_tokens": "32ae620376b2bbba" + }, + "truncated": 0, + "non-truncated": 3580, + "padded": 3580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-nutrition|5": { + "hashes": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "59753c2144ea93af", + "hash_cont_tokens": "3071def75bacc404" + }, + "truncated": 0, + "non-truncated": 1224, + "padded": 1224, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-philosophy|5": { + "hashes": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "bd8d3dbed15a8c34", + "hash_cont_tokens": "9f6ff69d23a48783" + }, + "truncated": 0, + "non-truncated": 1244, + "padded": 1244, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-prehistory|5": { + "hashes": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "3573cd87facbb7c5", + "hash_cont_tokens": "de469d2b981e32a3" + }, + "truncated": 0, + "non-truncated": 1296, + "padded": 1296, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_accounting|5": { + "hashes": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "17e721bc1a7cbb47", + "hash_cont_tokens": "c46f74d2dfc7b13b" + }, + "truncated": 0, + "non-truncated": 1128, + "padded": 1128, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_law|5": { + "hashes": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "c9f7583fff66d361", + "hash_cont_tokens": "2e590029ef41fbcd" + }, + "truncated": 0, + "non-truncated": 6136, + "padded": 6136, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_medicine|5": { + "hashes": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "40a933f829116f8d", + "hash_cont_tokens": "fe35cfa9c6ca802e" + }, + "truncated": 0, + "non-truncated": 1088, + 
"padded": 1088, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_psychology|5": { + "hashes": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "0dfb73a8eb3f692c", + "hash_cont_tokens": "f020fbddf72c8652" + }, + "truncated": 0, + "non-truncated": 2448, + "padded": 2448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-public_relations|5": { + "hashes": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "1710c6ba4c9f3cbd", + "hash_cont_tokens": "568f585a259965c1" + }, + "truncated": 0, + "non-truncated": 440, + "padded": 440, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-security_studies|5": { + "hashes": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "32a03f1f22a6e103", + "hash_cont_tokens": "cc6fd7cccd64cd5d" + }, + "truncated": 0, + "non-truncated": 980, + "padded": 980, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-sociology|5": { + "hashes": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "828999f7624cbe7e", + "hash_cont_tokens": "c3a3bdfd177eed5b" + }, + "truncated": 0, + "non-truncated": 804, + "padded": 804, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hashes": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "42054621e718dbee", + "hash_cont_tokens": "2568d0e8e36fa959" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-virology|5": { + "hashes": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "6c4f0aa4dc859c04", + "hash_cont_tokens": "926cf60b0891f374" + }, + "truncated": 0, + "non-truncated": 664, + "padded": 664, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-world_religions|5": { + "hashes": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "6c75d44e092ff24f", + "hash_cont_tokens": "c525a5de974c1ea3" + }, + "truncated": 0, + "non-truncated": 684, + "padded": 684, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|truthfulqa:mc|0": { + "hashes": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "2738d7ed7075faa7", + "hash_cont_tokens": "c014154380b74b9e" + }, + "truncated": 0, + "non-truncated": 9996, + "padded": 9996, + "non-padded": 0, + "effective_few_shots": 0.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "d84d18e9a963753d", + "hash_full_prompts": "12b540783521a8e6", + "hash_input_tokens": "5c73a7dce6ccf737", + "hash_cont_tokens": "fb1646e2bdd5fc38" + }, + "total_evaluation_time_secondes": "6181.031751394272", + "truncated": 0, + "non-truncated": 111019, + "padded": 110926, + "non-padded": 93, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git 
a/eval-results/TheBloke/Llama-2-13B-fp16/results_2023-10-22T22-53-07.629534.json b/eval-results/TheBloke/Llama-2-13B-fp16/results_2023-10-22T22-53-07.629534.json new file mode 100644 index 0000000000000000000000000000000000000000..a4ae189202cc3ec345edd2c3cefd42079e9725dd --- /dev/null +++ b/eval-results/TheBloke/Llama-2-13B-fp16/results_2023-10-22T22-53-07.629534.json @@ -0,0 +1,107 @@ +{ + "config_general": { + "model_name": "TheBloke/Llama-2-13B-fp16", + "model_sha": "b2e65e8ad4bb35e5abaee0170ebd5fc2134a50bb", + "model_size": "24.32 GB", + "model_dtype": "torch.float16", + "lighteval_sha": "0f318ecf002208468154899217b3ba7c6ae09374", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|drop|3": { + "em": 0.0014681208053691276, + "em_stderr": 0.00039210421902982666, + "f1": 0.0607822986577181, + "f1_stderr": 0.0013583957676382913 + }, + "harness|gsm8k|5": { + "acc": 0.10841546626231995, + "acc_stderr": 0.008563852506627487 + }, + "harness|winogrande|5": { + "acc": 0.7663772691397001, + "acc_stderr": 0.011892194477183524 + }, + "all": { + "em": 0.0014681208053691276, + "em_stderr": 0.00039210421902982666, + "f1": 0.0607822986577181, + "f1_stderr": 0.0013583957676382913, + "acc": 0.43739636770101, + "acc_stderr": 0.010228023491905505 + } + }, + "versions": { + "harness|drop|3": 1, + "harness|gsm8k|5": 0, + "harness|winogrande|5": 0, + "all": 0 + }, + "config_tasks": { + "harness|drop": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|drop|3": { + "hashes": { + "hash_examples": "1d27416e8324e9a3", + "hash_full_prompts": "a5513ff9a741b385", + "hash_input_tokens": "42076f0efbb50aa6", + "hash_cont_tokens": "c9346ec21b7560de" + }, + "truncated": 3, + "non-truncated": 9533, + "padded": 0, + "non-padded": 9536, + "effective_few_shots": 3.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "bda342e47b5099b2", + "hash_cont_tokens": "32cafa77d8a3f04e" + }, + "truncated": 0, + "non-truncated": 1319, + "padded": 0, + "non-padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "c0bedf98cb040854", + "hash_cont_tokens": "f08975ad6f2d5864" + }, + "truncated": 0, + "non-truncated": 2534, + "padded": 2432, + "non-padded": 102, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "9b4d8993161e637d", + "hash_full_prompts": "08215e527b7e60a5", + "hash_input_tokens": "a12f3e3c934bd78b", + "hash_cont_tokens": "4d8f1e04b1d56e40" + }, + "total_evaluation_time_secondes": "12350.423827171326", + "truncated": 3, + "non-truncated": 13386, + "padded": 2432, + "non-padded": 10957, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/TheBloke/Llama-2-70B-chat-GPTQ/results_2023-08-31T06-34-53.347292.json b/eval-results/TheBloke/Llama-2-70B-chat-GPTQ/results_2023-08-31T06-34-53.347292.json new file mode 100644 index 0000000000000000000000000000000000000000..7f4d269e45e7723f075d9bf3a3fdc71cc33ae336 --- /dev/null +++ b/eval-results/TheBloke/Llama-2-70B-chat-GPTQ/results_2023-08-31T06-34-53.347292.json @@ -0,0 +1,1366 @@ +{ + 
"config_general": { + "model_name": "TheBloke/Llama-2-70B-chat-GPTQ", + "model_sha": "054fbf6f65e7ab7691ec07ec9ad366acf2dd90bf", + "model_dtype": "None", + "lighteval_sha": "4b9a38c2102259daeb17d1d0294fc2b4ce7f4e63", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|arc:challenge|25": { + "acc": 0.5810580204778157, + "acc_stderr": 0.014418106953639011, + "acc_norm": 0.6262798634812287, + "acc_norm_stderr": 0.014137708601759093 + }, + "harness|hellaswag|10": { + "acc": 0.6557458673571002, + "acc_stderr": 0.0047415341064702835, + "acc_norm": 0.8481378211511651, + "acc_norm_stderr": 0.0035815378475817965 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.3, + "acc_stderr": 0.046056618647183814, + "acc_norm": 0.3, + "acc_norm_stderr": 0.046056618647183814 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.5111111111111111, + "acc_stderr": 0.04318275491977976, + "acc_norm": 0.5111111111111111, + "acc_norm_stderr": 0.04318275491977976 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.7302631578947368, + "acc_stderr": 0.03611780560284898, + "acc_norm": 0.7302631578947368, + "acc_norm_stderr": 0.03611780560284898 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.64, + "acc_stderr": 0.04824181513244218, + "acc_norm": 0.64, + "acc_norm_stderr": 0.04824181513244218 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.6377358490566037, + "acc_stderr": 0.029582245128384303, + "acc_norm": 0.6377358490566037, + "acc_norm_stderr": 0.029582245128384303 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.7291666666666666, + "acc_stderr": 0.03716177437566017, + "acc_norm": 0.7291666666666666, + "acc_norm_stderr": 0.03716177437566017 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.47, + "acc_stderr": 0.05016135580465919, + "acc_norm": 0.47, + "acc_norm_stderr": 0.05016135580465919 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.6, + "acc_stderr": 0.04923659639173309, + "acc_norm": 0.6, + "acc_norm_stderr": 0.04923659639173309 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.36, + "acc_stderr": 0.048241815132442176, + "acc_norm": 0.36, + "acc_norm_stderr": 0.048241815132442176 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.6011560693641619, + "acc_stderr": 0.0373362665538351, + "acc_norm": 0.6011560693641619, + "acc_norm_stderr": 0.0373362665538351 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.3333333333333333, + "acc_stderr": 0.04690650298201943, + "acc_norm": 0.3333333333333333, + "acc_norm_stderr": 0.04690650298201943 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.64, + "acc_stderr": 0.04824181513244218, + "acc_norm": 0.64, + "acc_norm_stderr": 0.04824181513244218 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.5361702127659574, + "acc_stderr": 0.032600385118357715, + "acc_norm": 0.5361702127659574, + "acc_norm_stderr": 0.032600385118357715 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.3684210526315789, + "acc_stderr": 0.04537815354939392, + "acc_norm": 0.3684210526315789, + "acc_norm_stderr": 0.04537815354939392 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.5793103448275863, + "acc_stderr": 0.0411391498118926, + "acc_norm": 0.5793103448275863, + "acc_norm_stderr": 0.0411391498118926 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.3994708994708995, + 
"acc_stderr": 0.025225450284067877, + "acc_norm": 0.3994708994708995, + "acc_norm_stderr": 0.025225450284067877 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.40476190476190477, + "acc_stderr": 0.04390259265377562, + "acc_norm": 0.40476190476190477, + "acc_norm_stderr": 0.04390259265377562 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.42, + "acc_stderr": 0.049604496374885836, + "acc_norm": 0.42, + "acc_norm_stderr": 0.049604496374885836 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.7612903225806451, + "acc_stderr": 0.024251071262208837, + "acc_norm": 0.7612903225806451, + "acc_norm_stderr": 0.024251071262208837 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.46798029556650245, + "acc_stderr": 0.035107665979592154, + "acc_norm": 0.46798029556650245, + "acc_norm_stderr": 0.035107665979592154 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.65, + "acc_stderr": 0.047937248544110196, + "acc_norm": 0.65, + "acc_norm_stderr": 0.047937248544110196 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.7878787878787878, + "acc_stderr": 0.031922715695483, + "acc_norm": 0.7878787878787878, + "acc_norm_stderr": 0.031922715695483 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.803030303030303, + "acc_stderr": 0.028335609732463355, + "acc_norm": 0.803030303030303, + "acc_norm_stderr": 0.028335609732463355 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.8860103626943006, + "acc_stderr": 0.022935144053919443, + "acc_norm": 0.8860103626943006, + "acc_norm_stderr": 0.022935144053919443 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.6282051282051282, + "acc_stderr": 0.024503472557110936, + "acc_norm": 0.6282051282051282, + "acc_norm_stderr": 0.024503472557110936 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.2962962962962963, + "acc_stderr": 0.02784081149587192, + "acc_norm": 0.2962962962962963, + "acc_norm_stderr": 0.02784081149587192 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.6596638655462185, + "acc_stderr": 0.030778057422931673, + "acc_norm": 0.6596638655462185, + "acc_norm_stderr": 0.030778057422931673 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.3973509933774834, + "acc_stderr": 0.039955240076816806, + "acc_norm": 0.3973509933774834, + "acc_norm_stderr": 0.039955240076816806 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.8366972477064221, + "acc_stderr": 0.01584825580650154, + "acc_norm": 0.8366972477064221, + "acc_norm_stderr": 0.01584825580650154 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.4861111111111111, + "acc_stderr": 0.03408655867977749, + "acc_norm": 0.4861111111111111, + "acc_norm_stderr": 0.03408655867977749 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.8627450980392157, + "acc_stderr": 0.024152225962801588, + "acc_norm": 0.8627450980392157, + "acc_norm_stderr": 0.024152225962801588 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.8396624472573839, + "acc_stderr": 0.02388438092596567, + "acc_norm": 0.8396624472573839, + "acc_norm_stderr": 0.02388438092596567 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.7040358744394619, + "acc_stderr": 0.030636591348699813, + "acc_norm": 0.7040358744394619, + "acc_norm_stderr": 0.030636591348699813 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.732824427480916, 
+ "acc_stderr": 0.038808483010823944, + "acc_norm": 0.732824427480916, + "acc_norm_stderr": 0.038808483010823944 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.7768595041322314, + "acc_stderr": 0.03800754475228733, + "acc_norm": 0.7768595041322314, + "acc_norm_stderr": 0.03800754475228733 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.8333333333333334, + "acc_stderr": 0.03602814176392645, + "acc_norm": 0.8333333333333334, + "acc_norm_stderr": 0.03602814176392645 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.7361963190184049, + "acc_stderr": 0.03462419931615623, + "acc_norm": 0.7361963190184049, + "acc_norm_stderr": 0.03462419931615623 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.4732142857142857, + "acc_stderr": 0.047389751192741546, + "acc_norm": 0.4732142857142857, + "acc_norm_stderr": 0.047389751192741546 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.8155339805825242, + "acc_stderr": 0.03840423627288276, + "acc_norm": 0.8155339805825242, + "acc_norm_stderr": 0.03840423627288276 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.8632478632478633, + "acc_stderr": 0.022509033937077812, + "acc_norm": 0.8632478632478633, + "acc_norm_stderr": 0.022509033937077812 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.6, + "acc_stderr": 0.049236596391733084, + "acc_norm": 0.6, + "acc_norm_stderr": 0.049236596391733084 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.80970625798212, + "acc_stderr": 0.01403694585038138, + "acc_norm": 0.80970625798212, + "acc_norm_stderr": 0.01403694585038138 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.7167630057803468, + "acc_stderr": 0.02425790170532338, + "acc_norm": 0.7167630057803468, + "acc_norm_stderr": 0.02425790170532338 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.329608938547486, + "acc_stderr": 0.015721531075183873, + "acc_norm": 0.329608938547486, + "acc_norm_stderr": 0.015721531075183873 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.6895424836601307, + "acc_stderr": 0.026493033225145905, + "acc_norm": 0.6895424836601307, + "acc_norm_stderr": 0.026493033225145905 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.7170418006430869, + "acc_stderr": 0.02558306248998482, + "acc_norm": 0.7170418006430869, + "acc_norm_stderr": 0.02558306248998482 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.7037037037037037, + "acc_stderr": 0.025407197798890162, + "acc_norm": 0.7037037037037037, + "acc_norm_stderr": 0.025407197798890162 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.4432624113475177, + "acc_stderr": 0.029634838473766002, + "acc_norm": 0.4432624113475177, + "acc_norm_stderr": 0.029634838473766002 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.4680573663624511, + "acc_stderr": 0.012744149704869647, + "acc_norm": 0.4680573663624511, + "acc_norm_stderr": 0.012744149704869647 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.5698529411764706, + "acc_stderr": 0.030074971917302875, + "acc_norm": 0.5698529411764706, + "acc_norm_stderr": 0.030074971917302875 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.6601307189542484, + "acc_stderr": 0.019162418588623553, + "acc_norm": 0.6601307189542484, + "acc_norm_stderr": 0.019162418588623553 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.7, + "acc_stderr": 0.04389311454644287, + "acc_norm": 0.7, + "acc_norm_stderr": 0.04389311454644287 + }, + 
"harness|hendrycksTest-security_studies|5": { + "acc": 0.7591836734693878, + "acc_stderr": 0.027372942201788156, + "acc_norm": 0.7591836734693878, + "acc_norm_stderr": 0.027372942201788156 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.845771144278607, + "acc_stderr": 0.02553843336857833, + "acc_norm": 0.845771144278607, + "acc_norm_stderr": 0.02553843336857833 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.88, + "acc_stderr": 0.03265986323710906, + "acc_norm": 0.88, + "acc_norm_stderr": 0.03265986323710906 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.5, + "acc_stderr": 0.03892494720807614, + "acc_norm": 0.5, + "acc_norm_stderr": 0.03892494720807614 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.8128654970760234, + "acc_stderr": 0.029913127232368036, + "acc_norm": 0.8128654970760234, + "acc_norm_stderr": 0.029913127232368036 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.3463892288861689, + "mc1_stderr": 0.01665699710912514, + "mc2": 0.509833074268794, + "mc2_stderr": 0.015705295947458175 + }, + "all": { + "acc": 0.6271417335583028, + "acc_stderr": 0.032984385584095444, + "acc_norm": 0.6311690860463957, + "acc_norm_stderr": 0.032959972116116315, + "mc1": 0.3463892288861689, + "mc1_stderr": 0.01665699710912514, + "mc2": 0.509833074268794, + "mc2_stderr": 0.015705295947458175 + } + }, + "versions": { + "harness|arc:challenge|25": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + 
"harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "all": 0 + }, + "config_tasks": { + "harness|arc:challenge": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + 
"harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task" + }, + "summary_tasks": { + "harness|arc:challenge|25": { + "hashes": { + "hash_examples": "17b0cae357c0259e", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "3722289b79076c44", + "hash_cont_tokens": "8210decc6ff6f7df" + }, + "truncated": 0, + "non-truncated": 4687, + "padded": 4687, + "non-padded": 0, + "effective_few_shots": 25.0, + "num_truncated_few_shots": 0 + }, + "harness|hellaswag|10": { + "hashes": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "ececd684171f1ef2", + "hash_cont_tokens": "b3b9e9017afa63af" + }, + "truncated": 0, + "non-truncated": 40168, + "padded": 40113, + "non-padded": 55, + "effective_few_shots": 10.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hashes": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "c54ff61ad0273dd7", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-anatomy|5": { + "hashes": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "be31a1e22aef5f90", + "hash_cont_tokens": "f11971a765cb609f" + }, + "truncated": 0, + "non-truncated": 540, + "padded": 540, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-astronomy|5": { + "hashes": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "277a7b1fad566940", + "hash_cont_tokens": "bf30e5d3f48250cb" + }, + "truncated": 0, + "non-truncated": 608, + "padded": 608, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-business_ethics|5": { + "hashes": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "ba552605bc116de5", + "hash_cont_tokens": "bc1dd9b2d995eb61" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hashes": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "428c7563d0b98ab9", + "hash_cont_tokens": 
"890a119624b3b935" + }, + "truncated": 0, + "non-truncated": 1060, + "padded": 1060, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_biology|5": { + "hashes": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "da036601573942e2", + "hash_cont_tokens": "875cde3af7a0ee14" + }, + "truncated": 0, + "non-truncated": 576, + "padded": 576, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_chemistry|5": { + "hashes": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "94e0196d6aded13d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_computer_science|5": { + "hashes": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "6e4d0f4a8d36690b", + "hash_cont_tokens": "ffc0fe414cdc4a83" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_mathematics|5": { + "hashes": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "614054d17109a25d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_medicine|5": { + "hashes": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "081bb2b524defd1c", + "hash_cont_tokens": "1f88b00d41957d82" + }, + "truncated": 0, + "non-truncated": 692, + "padded": 692, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_physics|5": { + "hashes": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "5421d9a1af86cbd4", + "hash_cont_tokens": "f7b8097afc16a47c" + }, + "truncated": 0, + "non-truncated": 408, + "padded": 408, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-computer_security|5": { + "hashes": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "5e6b70ecb333cf18", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hashes": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "c2ef11a87264ceed", + "hash_cont_tokens": "aa0e8bc655f2f641" + }, + "truncated": 0, + "non-truncated": 940, + "padded": 940, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-econometrics|5": { + "hashes": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "ecaccd912a4c3978", + "hash_cont_tokens": "bfb7e3c3c88313f1" + }, + "truncated": 0, + "non-truncated": 456, + "padded": 456, + "non-padded": 0, + "effective_few_shots": 5.0, + 
"num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hashes": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "1590c84291399be8", + "hash_cont_tokens": "2425a3f084a591ef" + }, + "truncated": 0, + "non-truncated": 580, + "padded": 580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hashes": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "3269597f715b0da1", + "hash_cont_tokens": "f52691aef15a407b" + }, + "truncated": 0, + "non-truncated": 1512, + "padded": 1512, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-formal_logic|5": { + "hashes": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "a2800d20f3ab8d7c", + "hash_cont_tokens": "f515d598d9c21263" + }, + "truncated": 0, + "non-truncated": 504, + "padded": 504, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-global_facts|5": { + "hashes": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "94ed44b3772505ad", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_biology|5": { + "hashes": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "24423acb928db768", + "hash_cont_tokens": "bd85a4156a3613ee" + }, + "truncated": 0, + "non-truncated": 1240, + "padded": 1240, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hashes": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "831ff35c474e5cef", + "hash_cont_tokens": "a95c97af1c14e068" + }, + "truncated": 0, + "non-truncated": 812, + "padded": 812, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hashes": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "a20a96b44dcc5b30", + "hash_cont_tokens": "8abfedef914e33c9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hashes": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "5002f4ac8b1562ca", + "hash_cont_tokens": "674fc454bdc5ac93" + }, + "truncated": 0, + "non-truncated": 660, + "padded": 656, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_geography|5": { + "hashes": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "7c5547c7da5bc793", + "hash_cont_tokens": "03a5012b916274ea" + }, + "truncated": 0, + "non-truncated": 792, + "padded": 792, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hashes": { + 
"hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "f62991cb6a496b05", + "hash_cont_tokens": "a83effb8f76b7d7c" + }, + "truncated": 0, + "non-truncated": 772, + "padded": 772, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hashes": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "4cef2aff6e3d59ed", + "hash_cont_tokens": "c583432ad27fcfe0" + }, + "truncated": 0, + "non-truncated": 1560, + "padded": 1560, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hashes": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "6e2577ea4082ed2b", + "hash_cont_tokens": "24f5dc613660300b" + }, + "truncated": 0, + "non-truncated": 1080, + "padded": 1080, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hashes": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "c5fc9aeb1079c8e4", + "hash_cont_tokens": "f47f041de50333b9" + }, + "truncated": 0, + "non-truncated": 952, + "padded": 952, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_physics|5": { + "hashes": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "555fc385cffa84ca", + "hash_cont_tokens": "ba2efcd283e938cc" + }, + "truncated": 0, + "non-truncated": 604, + "padded": 604, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hashes": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "febd23cbf9973b7f", + "hash_cont_tokens": "942069cd363844d9" + }, + "truncated": 0, + "non-truncated": 2180, + "padded": 2180, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hashes": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "400e55b56ee6fbd7", + "hash_cont_tokens": "955ed42b6f7fa019" + }, + "truncated": 0, + "non-truncated": 864, + "padded": 864, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hashes": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "c639cce12a46ebad", + "hash_cont_tokens": "cdd0b3dc06d933e5" + }, + "truncated": 0, + "non-truncated": 816, + "padded": 816, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hashes": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "b9762065cce6f3a6", + "hash_cont_tokens": "9a864184946033ac" + }, + "truncated": 0, + "non-truncated": 948, + "padded": 948, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_aging|5": { + "hashes": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": 
"541a75f071dcf579", + "hash_cont_tokens": "142a4a8a1138a214" + }, + "truncated": 0, + "non-truncated": 892, + "padded": 892, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_sexuality|5": { + "hashes": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "04269e5c5a257dd9", + "hash_cont_tokens": "bc54813e809b796d" + }, + "truncated": 0, + "non-truncated": 524, + "padded": 524, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-international_law|5": { + "hashes": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "d93ba9d9d38e4397", + "hash_cont_tokens": "dc45b45fcda18e5d" + }, + "truncated": 0, + "non-truncated": 484, + "padded": 484, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-jurisprudence|5": { + "hashes": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "9eeaccd2698b4f5a", + "hash_cont_tokens": "e3a8cd951b6e3469" + }, + "truncated": 0, + "non-truncated": 432, + "padded": 432, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hashes": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "b4f08f544f2b7576", + "hash_cont_tokens": "1e80dbd30f6453d5" + }, + "truncated": 0, + "non-truncated": 652, + "padded": 648, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-machine_learning|5": { + "hashes": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "900c2a51f1174b9f", + "hash_cont_tokens": "9b37da7777378ca9" + }, + "truncated": 0, + "non-truncated": 448, + "padded": 448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-management|5": { + "hashes": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "6b36efb4689c6eca", + "hash_cont_tokens": "a01d6d39a83c4597" + }, + "truncated": 0, + "non-truncated": 412, + "padded": 412, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-marketing|5": { + "hashes": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "2aaac78a0cfed47a", + "hash_cont_tokens": "6aeaed4d823c98aa" + }, + "truncated": 0, + "non-truncated": 936, + "padded": 936, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-medical_genetics|5": { + "hashes": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "886ca823b41c094a", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-miscellaneous|5": { + "hashes": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "72fd71de7675e7d0", + "hash_cont_tokens": "9b0ab02a64603081" + }, + "truncated": 0, + "non-truncated": 3132, + "padded": 3132, + "non-padded": 0, + "effective_few_shots": 5.0, + 
"num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_disputes|5": { + "hashes": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "f3ca0dd8e7a1eb09", + "hash_cont_tokens": "8badf768f7b0467a" + }, + "truncated": 0, + "non-truncated": 1384, + "padded": 1354, + "non-padded": 30, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hashes": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "3e793631e951f23c", + "hash_cont_tokens": "32ae620376b2bbba" + }, + "truncated": 0, + "non-truncated": 3580, + "padded": 3580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-nutrition|5": { + "hashes": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "59753c2144ea93af", + "hash_cont_tokens": "3071def75bacc404" + }, + "truncated": 0, + "non-truncated": 1224, + "padded": 1224, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-philosophy|5": { + "hashes": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "bd8d3dbed15a8c34", + "hash_cont_tokens": "9f6ff69d23a48783" + }, + "truncated": 0, + "non-truncated": 1244, + "padded": 1244, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-prehistory|5": { + "hashes": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "3573cd87facbb7c5", + "hash_cont_tokens": "de469d2b981e32a3" + }, + "truncated": 0, + "non-truncated": 1296, + "padded": 1296, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_accounting|5": { + "hashes": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "17e721bc1a7cbb47", + "hash_cont_tokens": "c46f74d2dfc7b13b" + }, + "truncated": 0, + "non-truncated": 1128, + "padded": 1128, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_law|5": { + "hashes": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "c9f7583fff66d361", + "hash_cont_tokens": "2e590029ef41fbcd" + }, + "truncated": 0, + "non-truncated": 6136, + "padded": 6136, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_medicine|5": { + "hashes": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "40a933f829116f8d", + "hash_cont_tokens": "fe35cfa9c6ca802e" + }, + "truncated": 0, + "non-truncated": 1088, + "padded": 1088, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_psychology|5": { + "hashes": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "0dfb73a8eb3f692c", + "hash_cont_tokens": "f020fbddf72c8652" + }, + "truncated": 0, + "non-truncated": 2448, + "padded": 2448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-public_relations|5": { + "hashes": { + "hash_examples": "0d25072e1761652a", + 
"hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "1710c6ba4c9f3cbd", + "hash_cont_tokens": "568f585a259965c1" + }, + "truncated": 0, + "non-truncated": 440, + "padded": 440, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-security_studies|5": { + "hashes": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "32a03f1f22a6e103", + "hash_cont_tokens": "cc6fd7cccd64cd5d" + }, + "truncated": 0, + "non-truncated": 980, + "padded": 980, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-sociology|5": { + "hashes": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "828999f7624cbe7e", + "hash_cont_tokens": "c3a3bdfd177eed5b" + }, + "truncated": 0, + "non-truncated": 804, + "padded": 804, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hashes": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "42054621e718dbee", + "hash_cont_tokens": "2568d0e8e36fa959" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-virology|5": { + "hashes": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "6c4f0aa4dc859c04", + "hash_cont_tokens": "926cf60b0891f374" + }, + "truncated": 0, + "non-truncated": 664, + "padded": 664, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-world_religions|5": { + "hashes": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "6c75d44e092ff24f", + "hash_cont_tokens": "c525a5de974c1ea3" + }, + "truncated": 0, + "non-truncated": 684, + "padded": 684, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|truthfulqa:mc|0": { + "hashes": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "2738d7ed7075faa7", + "hash_cont_tokens": "c014154380b74b9e" + }, + "truncated": 0, + "non-truncated": 9996, + "padded": 9996, + "non-padded": 0, + "effective_few_shots": 0.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "d84d18e9a963753d", + "hash_full_prompts": "12b540783521a8e6", + "hash_input_tokens": "5c73a7dce6ccf737", + "hash_cont_tokens": "fb1646e2bdd5fc38" + }, + "total_evaluation_time_secondes": "40458.150785684586", + "truncated": 0, + "non-truncated": 111019, + "padded": 110926, + "non-padded": 93, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/TheBloke/Llama-2-70B-chat-GPTQ/results_2023-11-07T22-45-40.943285.json b/eval-results/TheBloke/Llama-2-70B-chat-GPTQ/results_2023-11-07T22-45-40.943285.json new file mode 100644 index 0000000000000000000000000000000000000000..2a7d2ebc2ef59df14a0a46b45536fd2abd19d4ad --- /dev/null +++ b/eval-results/TheBloke/Llama-2-70B-chat-GPTQ/results_2023-11-07T22-45-40.943285.json @@ -0,0 +1,107 @@ +{ + "config_general": { + "lighteval_sha": "167773f1d5d1647c60dadc31c9e731ab7dbcbbad", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "", + 
"model_name": "TheBloke/Llama-2-70B-chat-GPTQ", + "model_sha": "ac53ed5e8ebdbbe6db22ecf816e21f55905418df", + "model_dtype": "None", + "model_size": "33.06 GB" + }, + "results": { + "harness|drop|3": { + "em": 0.03208892617449664, + "em_stderr": 0.0018048244787816476, + "f1": 0.09399328859060421, + "f1_stderr": 0.002230499103816446 + }, + "harness|gsm8k|5": { + "acc": 0.1865049279757392, + "acc_stderr": 0.010729140039689904 + }, + "harness|winogrande|5": { + "acc": 0.7868981846882399, + "acc_stderr": 0.011508957690722752 + }, + "all": { + "em": 0.03208892617449664, + "em_stderr": 0.0018048244787816476, + "f1": 0.09399328859060421, + "f1_stderr": 0.002230499103816446, + "acc": 0.4867015563319895, + "acc_stderr": 0.011119048865206328 + } + }, + "versions": { + "all": 0, + "harness|drop|3": 1, + "harness|gsm8k|5": 0, + "harness|winogrande|5": 0 + }, + "config_tasks": { + "harness|drop": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|drop|3": { + "hashes": { + "hash_examples": "1d27416e8324e9a3", + "hash_full_prompts": "a5513ff9a741b385", + "hash_input_tokens": "42076f0efbb50aa6", + "hash_cont_tokens": "568ded3e83f86bfe" + }, + "truncated": 3, + "non_truncated": 9533, + "padded": 0, + "non_padded": 9536, + "effective_few_shots": 3.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "bda342e47b5099b2", + "hash_cont_tokens": "eda21f1fcb47a27d" + }, + "truncated": 0, + "non_truncated": 1319, + "padded": 0, + "non_padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "c0bedf98cb040854", + "hash_cont_tokens": "f08975ad6f2d5864" + }, + "truncated": 0, + "non_truncated": 1267, + "padded": 2432, + "non_padded": 102, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "9b4d8993161e637d", + "hash_full_prompts": "08215e527b7e60a5", + "hash_input_tokens": "a12f3e3c934bd78b", + "hash_cont_tokens": "a001ce48bfddaa9a" + }, + "truncated": 3, + "non_truncated": 12119, + "padded": 2432, + "non_padded": 10957, + "num_truncated_few_shots": 0, + "total_evaluation_time_secondes": 0 + } +} \ No newline at end of file diff --git a/eval-results/TheBloke/Llama-2-70B-fp16/results_2023-07-31T16-40-00.231770.json b/eval-results/TheBloke/Llama-2-70B-fp16/results_2023-07-31T16-40-00.231770.json new file mode 100644 index 0000000000000000000000000000000000000000..5fca3b44eed2e91a4bfcf90878d598f60285781c --- /dev/null +++ b/eval-results/TheBloke/Llama-2-70B-fp16/results_2023-07-31T16-40-00.231770.json @@ -0,0 +1,1365 @@ +{ + "results": { + "harness|arc:challenge|25": { + "acc": 0.6262798634812287, + "acc_stderr": 0.014137708601759091, + "acc_norm": 0.6732081911262798, + "acc_norm_stderr": 0.013706665975587333 + }, + "harness|hellaswag|10": { + "acc": 0.6760605457080263, + "acc_stderr": 0.00467020812857923, + "acc_norm": 0.8733320055765784, + "acc_norm_stderr": 0.0033192094001351187 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.33, + "acc_stderr": 0.04725815626252605, + "acc_norm": 0.33, + "acc_norm_stderr": 0.04725815626252605 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.6296296296296297, + "acc_stderr": 0.04171654161354544, + "acc_norm": 
0.6296296296296297, + "acc_norm_stderr": 0.04171654161354544 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.8092105263157895, + "acc_stderr": 0.031975658210325, + "acc_norm": 0.8092105263157895, + "acc_norm_stderr": 0.031975658210325 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.72, + "acc_stderr": 0.04512608598542127, + "acc_norm": 0.72, + "acc_norm_stderr": 0.04512608598542127 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.7169811320754716, + "acc_stderr": 0.027724236492700918, + "acc_norm": 0.7169811320754716, + "acc_norm_stderr": 0.027724236492700918 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.8472222222222222, + "acc_stderr": 0.030085743248565666, + "acc_norm": 0.8472222222222222, + "acc_norm_stderr": 0.030085743248565666 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.51, + "acc_stderr": 0.05024183937956912, + "acc_norm": 0.51, + "acc_norm_stderr": 0.05024183937956912 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.6, + "acc_stderr": 0.049236596391733084, + "acc_norm": 0.6, + "acc_norm_stderr": 0.049236596391733084 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.37, + "acc_stderr": 0.048523658709391, + "acc_norm": 0.37, + "acc_norm_stderr": 0.048523658709391 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.6416184971098265, + "acc_stderr": 0.03656343653353159, + "acc_norm": 0.6416184971098265, + "acc_norm_stderr": 0.03656343653353159 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.37254901960784315, + "acc_stderr": 0.04810840148082635, + "acc_norm": 0.37254901960784315, + "acc_norm_stderr": 0.04810840148082635 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.77, + "acc_stderr": 0.04229525846816506, + "acc_norm": 0.77, + "acc_norm_stderr": 0.04229525846816506 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.6638297872340425, + "acc_stderr": 0.030881618520676942, + "acc_norm": 0.6638297872340425, + "acc_norm_stderr": 0.030881618520676942 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.4473684210526316, + "acc_stderr": 0.04677473004491199, + "acc_norm": 0.4473684210526316, + "acc_norm_stderr": 0.04677473004491199 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.6551724137931034, + "acc_stderr": 0.03960933549451207, + "acc_norm": 0.6551724137931034, + "acc_norm_stderr": 0.03960933549451207 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.43386243386243384, + "acc_stderr": 0.025525034382474894, + "acc_norm": 0.43386243386243384, + "acc_norm_stderr": 0.025525034382474894 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.47619047619047616, + "acc_stderr": 0.04467062628403273, + "acc_norm": 0.47619047619047616, + "acc_norm_stderr": 0.04467062628403273 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.46, + "acc_stderr": 0.05009082659620332, + "acc_norm": 0.46, + "acc_norm_stderr": 0.05009082659620332 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.8193548387096774, + "acc_stderr": 0.02188617856717253, + "acc_norm": 0.8193548387096774, + "acc_norm_stderr": 0.02188617856717253 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.5123152709359606, + "acc_stderr": 0.035169204442208966, + "acc_norm": 0.5123152709359606, + "acc_norm_stderr": 0.035169204442208966 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.79, + "acc_stderr": 0.040936018074033256, + "acc_norm": 0.79, + 
"acc_norm_stderr": 0.040936018074033256 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.8303030303030303, + "acc_stderr": 0.029311188674983134, + "acc_norm": 0.8303030303030303, + "acc_norm_stderr": 0.029311188674983134 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.8787878787878788, + "acc_stderr": 0.023253157951942084, + "acc_norm": 0.8787878787878788, + "acc_norm_stderr": 0.023253157951942084 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.9430051813471503, + "acc_stderr": 0.016731085293607555, + "acc_norm": 0.9430051813471503, + "acc_norm_stderr": 0.016731085293607555 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.7410256410256411, + "acc_stderr": 0.02221110681006167, + "acc_norm": 0.7410256410256411, + "acc_norm_stderr": 0.02221110681006167 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.35555555555555557, + "acc_stderr": 0.029185714949857403, + "acc_norm": 0.35555555555555557, + "acc_norm_stderr": 0.029185714949857403 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.7647058823529411, + "acc_stderr": 0.02755361446786381, + "acc_norm": 0.7647058823529411, + "acc_norm_stderr": 0.02755361446786381 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.4304635761589404, + "acc_stderr": 0.04042809961395634, + "acc_norm": 0.4304635761589404, + "acc_norm_stderr": 0.04042809961395634 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.8733944954128441, + "acc_stderr": 0.014257128686165169, + "acc_norm": 0.8733944954128441, + "acc_norm_stderr": 0.014257128686165169 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.6342592592592593, + "acc_stderr": 0.032847388576472056, + "acc_norm": 0.6342592592592593, + "acc_norm_stderr": 0.032847388576472056 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.8970588235294118, + "acc_stderr": 0.02132833757080437, + "acc_norm": 0.8970588235294118, + "acc_norm_stderr": 0.02132833757080437 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.8776371308016878, + "acc_stderr": 0.021331741829746786, + "acc_norm": 0.8776371308016878, + "acc_norm_stderr": 0.021331741829746786 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.8026905829596412, + "acc_stderr": 0.02670985334496796, + "acc_norm": 0.8026905829596412, + "acc_norm_stderr": 0.02670985334496796 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.8778625954198473, + "acc_stderr": 0.028718776889342344, + "acc_norm": 0.8778625954198473, + "acc_norm_stderr": 0.028718776889342344 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.8760330578512396, + "acc_stderr": 0.03008309871603521, + "acc_norm": 0.8760330578512396, + "acc_norm_stderr": 0.03008309871603521 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.8333333333333334, + "acc_stderr": 0.03602814176392645, + "acc_norm": 0.8333333333333334, + "acc_norm_stderr": 0.03602814176392645 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.803680981595092, + "acc_stderr": 0.031207970394709218, + "acc_norm": 0.803680981595092, + "acc_norm_stderr": 0.031207970394709218 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.5357142857142857, + "acc_stderr": 0.04733667890053756, + "acc_norm": 0.5357142857142857, + "acc_norm_stderr": 0.04733667890053756 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.8349514563106796, + "acc_stderr": 0.03675668832233188, 
+ "acc_norm": 0.8349514563106796, + "acc_norm_stderr": 0.03675668832233188 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.905982905982906, + "acc_stderr": 0.01911989279892498, + "acc_norm": 0.905982905982906, + "acc_norm_stderr": 0.01911989279892498 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.74, + "acc_stderr": 0.04408440022768077, + "acc_norm": 0.74, + "acc_norm_stderr": 0.04408440022768077 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.8620689655172413, + "acc_stderr": 0.012331009307795656, + "acc_norm": 0.8620689655172413, + "acc_norm_stderr": 0.012331009307795656 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.7774566473988439, + "acc_stderr": 0.02239421566194282, + "acc_norm": 0.7774566473988439, + "acc_norm_stderr": 0.02239421566194282 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.4547486033519553, + "acc_stderr": 0.016653875777524012, + "acc_norm": 0.4547486033519553, + "acc_norm_stderr": 0.016653875777524012 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.7810457516339869, + "acc_stderr": 0.02367908986180772, + "acc_norm": 0.7810457516339869, + "acc_norm_stderr": 0.02367908986180772 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.7877813504823151, + "acc_stderr": 0.023222756797435115, + "acc_norm": 0.7877813504823151, + "acc_norm_stderr": 0.023222756797435115 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.8364197530864198, + "acc_stderr": 0.020581466138257114, + "acc_norm": 0.8364197530864198, + "acc_norm_stderr": 0.020581466138257114 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.5673758865248227, + "acc_stderr": 0.02955545423677884, + "acc_norm": 0.5673758865248227, + "acc_norm_stderr": 0.02955545423677884 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.5319426336375489, + "acc_stderr": 0.012744149704869645, + "acc_norm": 0.5319426336375489, + "acc_norm_stderr": 0.012744149704869645 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.75, + "acc_stderr": 0.026303648393696036, + "acc_norm": 0.75, + "acc_norm_stderr": 0.026303648393696036 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.7565359477124183, + "acc_stderr": 0.01736247376214662, + "acc_norm": 0.7565359477124183, + "acc_norm_stderr": 0.01736247376214662 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.6909090909090909, + "acc_stderr": 0.044262946482000985, + "acc_norm": 0.6909090909090909, + "acc_norm_stderr": 0.044262946482000985 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.7918367346938775, + "acc_stderr": 0.0259911176728133, + "acc_norm": 0.7918367346938775, + "acc_norm_stderr": 0.0259911176728133 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.900497512437811, + "acc_stderr": 0.021166216304659393, + "acc_norm": 0.900497512437811, + "acc_norm_stderr": 0.021166216304659393 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.92, + "acc_stderr": 0.0272659924344291, + "acc_norm": 0.92, + "acc_norm_stderr": 0.0272659924344291 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.5301204819277109, + "acc_stderr": 0.03885425420866767, + "acc_norm": 0.5301204819277109, + "acc_norm_stderr": 0.03885425420866767 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.8538011695906432, + "acc_stderr": 0.027097290118070806, + "acc_norm": 0.8538011695906432, + "acc_norm_stderr": 0.027097290118070806 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.3108935128518972, + "mc1_stderr": 
0.016203316673559696, + "mc2": 0.44923493721887353, + "mc2_stderr": 0.01390226410719232 + }, + "all": { + "acc": 0.6967225637378714, + "acc_stderr": 0.030867069907791145, + "acc_norm": 0.7008615431872544, + "acc_norm_stderr": 0.030836865817034945, + "mc1": 0.3108935128518972, + "mc1_stderr": 0.016203316673559696, + "mc2": 0.44923493721887353, + "mc2_stderr": 0.01390226410719232 + } + }, + "versions": { + "harness|arc:challenge|25": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "all": 0 + }, + "config_general": { + "model_name": "TheBloke/Llama-2-70B-fp16", + "model_sha": "b25061ef1b440e970d15d4ac99bc42937cd442a2", + "model_dtype": "torch.float16", + 
"lighteval_sha": "03c2fad20ff7f5334c33cfee459024b8d7e4a109", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null + }, + "config_tasks": { + "harness|arc:challenge": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + 
"harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task" + }, + "summary_tasks": { + "harness|arc:challenge|25": { + "hashes": { + "hash_examples": "17b0cae357c0259e", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "61571bf68d6d89aa", + "hash_cont_tokens": "ede2b335438f08e9" + }, + "truncated": 0, + "non-truncated": 4687, + "padded": 4687, + "non-padded": 0, + "effective_few_shots": 25.0, + "num_truncated_few_shots": 0 + }, + "harness|hellaswag|10": { + "hashes": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "29906669b1c7054a", + "hash_cont_tokens": "b41cf1ad182d68d5" + }, + "truncated": 0, + "non-truncated": 40168, + "padded": 40113, + "non-padded": 55, + "effective_few_shots": 10.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hashes": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "c54ff61ad0273dd7", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-anatomy|5": { + "hashes": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "be31a1e22aef5f90", + "hash_cont_tokens": "f11971a765cb609f" + }, + "truncated": 0, + "non-truncated": 540, + "padded": 540, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-astronomy|5": { + "hashes": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "277a7b1fad566940", + "hash_cont_tokens": "238bd86950544b29" + }, + "truncated": 0, + "non-truncated": 608, + "padded": 608, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-business_ethics|5": { + "hashes": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "ba552605bc116de5", + "hash_cont_tokens": "f9d6d2a7d7e9a041" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hashes": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "428c7563d0b98ab9", + "hash_cont_tokens": "6af58623d0d5fbcd" + }, + "truncated": 0, + "non-truncated": 1060, + "padded": 1060, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_biology|5": { + "hashes": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "da036601573942e2", + "hash_cont_tokens": "875cde3af7a0ee14" + }, + "truncated": 0, + "non-truncated": 576, + "padded": 576, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_chemistry|5": { + "hashes": { + 
"hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "94e0196d6aded13d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_computer_science|5": { + "hashes": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "6e4d0f4a8d36690b", + "hash_cont_tokens": "1ba0c71186b1505e" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_mathematics|5": { + "hashes": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "614054d17109a25d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_medicine|5": { + "hashes": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "1d633b3cc0524ba8", + "hash_cont_tokens": "702fb6d82ff0d6ac" + }, + "truncated": 0, + "non-truncated": 692, + "padded": 692, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_physics|5": { + "hashes": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "5421d9a1af86cbd4", + "hash_cont_tokens": "f7b8097afc16a47c" + }, + "truncated": 0, + "non-truncated": 408, + "padded": 408, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-computer_security|5": { + "hashes": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "5e6b70ecb333cf18", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hashes": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "c2ef11a87264ceed", + "hash_cont_tokens": "aa0e8bc655f2f641" + }, + "truncated": 0, + "non-truncated": 940, + "padded": 940, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-econometrics|5": { + "hashes": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "ecaccd912a4c3978", + "hash_cont_tokens": "a9b1f761089f6acc" + }, + "truncated": 0, + "non-truncated": 456, + "padded": 456, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hashes": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "1590c84291399be8", + "hash_cont_tokens": "2425a3f084a591ef" + }, + "truncated": 0, + "non-truncated": 580, + "padded": 580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hashes": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "3269597f715b0da1", + 
"hash_cont_tokens": "eb2d5002052b5bc5" + }, + "truncated": 0, + "non-truncated": 1512, + "padded": 1512, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-formal_logic|5": { + "hashes": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "a2800d20f3ab8d7c", + "hash_cont_tokens": "9b30dc19c9b62f60" + }, + "truncated": 0, + "non-truncated": 504, + "padded": 504, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-global_facts|5": { + "hashes": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "94ed44b3772505ad", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_biology|5": { + "hashes": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "24423acb928db768", + "hash_cont_tokens": "74217a4e2868536f" + }, + "truncated": 0, + "non-truncated": 1240, + "padded": 1240, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hashes": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "831ff35c474e5cef", + "hash_cont_tokens": "bf39544be0ebf000" + }, + "truncated": 0, + "non-truncated": 812, + "padded": 812, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hashes": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "8c34e0f2bda77358", + "hash_cont_tokens": "43570b3948564b64" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hashes": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "f1f73dd687da18d7", + "hash_cont_tokens": "674fc454bdc5ac93" + }, + "truncated": 660, + "non-truncated": 0, + "padded": 0, + "non-padded": 660, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_geography|5": { + "hashes": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "7c5547c7da5bc793", + "hash_cont_tokens": "03a5012b916274ea" + }, + "truncated": 0, + "non-truncated": 792, + "padded": 792, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hashes": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "f62991cb6a496b05", + "hash_cont_tokens": "50ab225c2f535210" + }, + "truncated": 0, + "non-truncated": 772, + "padded": 772, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hashes": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "4cef2aff6e3d59ed", + "hash_cont_tokens": "c583432ad27fcfe0" + }, + "truncated": 0, + "non-truncated": 1560, + "padded": 
1560, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hashes": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "6e2577ea4082ed2b", + "hash_cont_tokens": "1194078d4e38c984" + }, + "truncated": 0, + "non-truncated": 1080, + "padded": 1080, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hashes": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "c5fc9aeb1079c8e4", + "hash_cont_tokens": "f47f041de50333b9" + }, + "truncated": 0, + "non-truncated": 952, + "padded": 952, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_physics|5": { + "hashes": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "555fc385cffa84ca", + "hash_cont_tokens": "6296151cf7fee15c" + }, + "truncated": 0, + "non-truncated": 604, + "padded": 604, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hashes": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "febd23cbf9973b7f", + "hash_cont_tokens": "a490d3db0ea5935a" + }, + "truncated": 0, + "non-truncated": 2180, + "padded": 2180, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hashes": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "424b02981230ee83", + "hash_cont_tokens": "6830ef7d0325d7ef" + }, + "truncated": 0, + "non-truncated": 864, + "padded": 864, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hashes": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "50c9ff438c85a69e", + "hash_cont_tokens": "cdd0b3dc06d933e5" + }, + "truncated": 816, + "non-truncated": 0, + "padded": 0, + "non-padded": 816, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hashes": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "054824cc474caef5", + "hash_cont_tokens": "e0203e3fc1bb0500" + }, + "truncated": 8, + "non-truncated": 940, + "padded": 940, + "non-padded": 8, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_aging|5": { + "hashes": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "541a75f071dcf579", + "hash_cont_tokens": "142a4a8a1138a214" + }, + "truncated": 0, + "non-truncated": 892, + "padded": 892, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_sexuality|5": { + "hashes": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "04269e5c5a257dd9", + "hash_cont_tokens": "bc54813e809b796d" + }, + "truncated": 0, + "non-truncated": 524, + "padded": 524, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + 
"harness|hendrycksTest-international_law|5": { + "hashes": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "d93ba9d9d38e4397", + "hash_cont_tokens": "63435df622d5437b" + }, + "truncated": 0, + "non-truncated": 484, + "padded": 484, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-jurisprudence|5": { + "hashes": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "9eeaccd2698b4f5a", + "hash_cont_tokens": "e3a8cd951b6e3469" + }, + "truncated": 0, + "non-truncated": 432, + "padded": 432, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hashes": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "b4f08f544f2b7576", + "hash_cont_tokens": "5e6ee2ff0404f23c" + }, + "truncated": 0, + "non-truncated": 652, + "padded": 648, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-machine_learning|5": { + "hashes": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "900c2a51f1174b9f", + "hash_cont_tokens": "c81919424db3b267" + }, + "truncated": 0, + "non-truncated": 448, + "padded": 448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-management|5": { + "hashes": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "6b36efb4689c6eca", + "hash_cont_tokens": "a01d6d39a83c4597" + }, + "truncated": 0, + "non-truncated": 412, + "padded": 412, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-marketing|5": { + "hashes": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "2aaac78a0cfed47a", + "hash_cont_tokens": "6aeaed4d823c98aa" + }, + "truncated": 0, + "non-truncated": 936, + "padded": 936, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-medical_genetics|5": { + "hashes": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "886ca823b41c094a", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-miscellaneous|5": { + "hashes": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "72fd71de7675e7d0", + "hash_cont_tokens": "9b0ab02a64603081" + }, + "truncated": 0, + "non-truncated": 3132, + "padded": 3132, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_disputes|5": { + "hashes": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "f3ca0dd8e7a1eb09", + "hash_cont_tokens": "3b8bbe9108e55ce9" + }, + "truncated": 0, + "non-truncated": 1384, + "padded": 1354, + "non-padded": 30, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hashes": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": 
"3e793631e951f23c", + "hash_cont_tokens": "2eae753a177d5460" + }, + "truncated": 0, + "non-truncated": 3580, + "padded": 3580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-nutrition|5": { + "hashes": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "59753c2144ea93af", + "hash_cont_tokens": "29771089bd3c65c6" + }, + "truncated": 0, + "non-truncated": 1224, + "padded": 1224, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-philosophy|5": { + "hashes": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "bd8d3dbed15a8c34", + "hash_cont_tokens": "9f6ff69d23a48783" + }, + "truncated": 0, + "non-truncated": 1244, + "padded": 1244, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-prehistory|5": { + "hashes": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "3573cd87facbb7c5", + "hash_cont_tokens": "a789a13af22308bf" + }, + "truncated": 0, + "non-truncated": 1296, + "padded": 1296, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_accounting|5": { + "hashes": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "17e721bc1a7cbb47", + "hash_cont_tokens": "5129a9cfb30c5239" + }, + "truncated": 0, + "non-truncated": 1128, + "padded": 1128, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_law|5": { + "hashes": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "9178e10bd0763ec4", + "hash_cont_tokens": "2e590029ef41fbcd" + }, + "truncated": 604, + "non-truncated": 5532, + "padded": 5524, + "non-padded": 612, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_medicine|5": { + "hashes": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "f5a22012a54f70ea", + "hash_cont_tokens": "cd82e108370cece8" + }, + "truncated": 0, + "non-truncated": 1088, + "padded": 1088, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_psychology|5": { + "hashes": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "0dfb73a8eb3f692c", + "hash_cont_tokens": "61ef0c8a87f9c92d" + }, + "truncated": 0, + "non-truncated": 2448, + "padded": 2448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-public_relations|5": { + "hashes": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "1710c6ba4c9f3cbd", + "hash_cont_tokens": "568f585a259965c1" + }, + "truncated": 0, + "non-truncated": 440, + "padded": 440, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-security_studies|5": { + "hashes": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "d49711415961ced7", + "hash_cont_tokens": "d70cfe096d4fb7bd" + }, + "truncated": 0, + "non-truncated": 980, + "padded": 980, + "non-padded": 
0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-sociology|5": { + "hashes": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "828999f7624cbe7e", + "hash_cont_tokens": "c3a3bdfd177eed5b" + }, + "truncated": 0, + "non-truncated": 804, + "padded": 804, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hashes": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "42054621e718dbee", + "hash_cont_tokens": "2568d0e8e36fa959" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-virology|5": { + "hashes": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "6c4f0aa4dc859c04", + "hash_cont_tokens": "c178cccd753d9bc5" + }, + "truncated": 0, + "non-truncated": 664, + "padded": 664, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-world_religions|5": { + "hashes": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "6c75d44e092ff24f", + "hash_cont_tokens": "0a3a3ea5ef49d19c" + }, + "truncated": 0, + "non-truncated": 684, + "padded": 684, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|truthfulqa:mc|0": { + "hashes": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "2738d7ed7075faa7", + "hash_cont_tokens": "6d1691881e252df0" + }, + "truncated": 0, + "non-truncated": 9996, + "padded": 9996, + "non-padded": 0, + "effective_few_shots": 0.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "d84d18e9a963753d", + "hash_full_prompts": "12b540783521a8e6", + "hash_input_tokens": "6fecf578c508db6a", + "hash_cont_tokens": "f4b7b7f3a2788768" + }, + "total_evaluation_time_secondes": "26263.30303668976", + "truncated": 2088, + "non-truncated": 108931, + "padded": 108834, + "non-padded": 2185, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/TheBloke/Llama-2-70B-fp16/results_2023-10-23T03-18-37.286787.json b/eval-results/TheBloke/Llama-2-70B-fp16/results_2023-10-23T03-18-37.286787.json new file mode 100644 index 0000000000000000000000000000000000000000..b9c2bbd5c21b97dcb8e5e7e18dcb55de8975a3e8 --- /dev/null +++ b/eval-results/TheBloke/Llama-2-70B-fp16/results_2023-10-23T03-18-37.286787.json @@ -0,0 +1,107 @@ +{ + "config_general": { + "model_name": "TheBloke/Llama-2-70B-fp16", + "model_sha": "e9a40d73f8b9c160cf11b04644f6d3fcecf46687", + "model_size": "128.64 GB", + "model_dtype": "torch.float16", + "lighteval_sha": "0f318ecf002208468154899217b3ba7c6ae09374", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|drop|3": { + "em": 0.0017827181208053692, + "em_stderr": 0.00043200973460388544, + "f1": 0.06615562080536916, + "f1_stderr": 0.0013739852117668813 + }, + "harness|gsm8k|5": { + "acc": 0.33965125094768767, + "acc_stderr": 0.01304504506766526 + }, + "harness|winogrande|5": { + "acc": 0.8374112075769534, + "acc_stderr": 0.010370455551343326 + }, + "all": { + "em": 0.0017827181208053692, + 
"em_stderr": 0.00043200973460388544, + "f1": 0.06615562080536916, + "f1_stderr": 0.0013739852117668813, + "acc": 0.5885312292623206, + "acc_stderr": 0.011707750309504293 + } + }, + "versions": { + "harness|drop|3": 1, + "harness|gsm8k|5": 0, + "harness|winogrande|5": 0, + "all": 0 + }, + "config_tasks": { + "harness|drop": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|drop|3": { + "hashes": { + "hash_examples": "1d27416e8324e9a3", + "hash_full_prompts": "a5513ff9a741b385", + "hash_input_tokens": "42076f0efbb50aa6", + "hash_cont_tokens": "32bc149506251e60" + }, + "truncated": 3, + "non-truncated": 9533, + "padded": 0, + "non-padded": 9536, + "effective_few_shots": 3.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "bda342e47b5099b2", + "hash_cont_tokens": "a95ce63226eb9a2d" + }, + "truncated": 0, + "non-truncated": 1319, + "padded": 0, + "non-padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "c0bedf98cb040854", + "hash_cont_tokens": "f08975ad6f2d5864" + }, + "truncated": 0, + "non-truncated": 2534, + "padded": 2432, + "non-padded": 102, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "9b4d8993161e637d", + "hash_full_prompts": "08215e527b7e60a5", + "hash_input_tokens": "a12f3e3c934bd78b", + "hash_cont_tokens": "dff37de5e6c9aeb7" + }, + "total_evaluation_time_secondes": "45414.71742939949", + "truncated": 3, + "non-truncated": 13386, + "padded": 2432, + "non-padded": 10957, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/TheBloke/Llama-2-7B-GPTQ/results_2023-08-29T12-13-30.420278.json b/eval-results/TheBloke/Llama-2-7B-GPTQ/results_2023-08-29T12-13-30.420278.json new file mode 100644 index 0000000000000000000000000000000000000000..25eaea643dbb89e5208c4d9ec2701d76e592de7f --- /dev/null +++ b/eval-results/TheBloke/Llama-2-7B-GPTQ/results_2023-08-29T12-13-30.420278.json @@ -0,0 +1,1366 @@ +{ + "config_general": { + "model_name": "TheBloke/Llama-2-7B-GPTQ", + "model_sha": "ecd7ab9f6adc36ecbe0d751eeea0d90ae1863c3b", + "model_dtype": "torch.float16", + "lighteval_sha": "2108e6d7ff766a8df132a73d138d42a559e21d18", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|arc:challenge|25": { + "acc": 0.4786689419795222, + "acc_stderr": 0.014598087973127108, + "acc_norm": 0.5204778156996587, + "acc_norm_stderr": 0.014599131353035012 + }, + "harness|hellaswag|10": { + "acc": 0.5760804620593507, + "acc_stderr": 0.004931679059919375, + "acc_norm": 0.7759410476000796, + "acc_norm_stderr": 0.004161089244867776 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.33, + "acc_stderr": 0.047258156262526045, + "acc_norm": 0.33, + "acc_norm_stderr": 0.047258156262526045 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.45925925925925926, + "acc_stderr": 0.04304979692464242, + "acc_norm": 0.45925925925925926, + "acc_norm_stderr": 0.04304979692464242 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.39473684210526316, + "acc_stderr": 0.039777499346220734, + "acc_norm": 
0.39473684210526316, + "acc_norm_stderr": 0.039777499346220734 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.46, + "acc_stderr": 0.05009082659620332, + "acc_norm": 0.46, + "acc_norm_stderr": 0.05009082659620332 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.4188679245283019, + "acc_stderr": 0.030365050829115208, + "acc_norm": 0.4188679245283019, + "acc_norm_stderr": 0.030365050829115208 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.4097222222222222, + "acc_stderr": 0.04112490974670787, + "acc_norm": 0.4097222222222222, + "acc_norm_stderr": 0.04112490974670787 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.23, + "acc_stderr": 0.04229525846816508, + "acc_norm": 0.23, + "acc_norm_stderr": 0.04229525846816508 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.36, + "acc_stderr": 0.048241815132442176, + "acc_norm": 0.36, + "acc_norm_stderr": 0.048241815132442176 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.34, + "acc_stderr": 0.047609522856952344, + "acc_norm": 0.34, + "acc_norm_stderr": 0.047609522856952344 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.37572254335260113, + "acc_stderr": 0.036928207672648664, + "acc_norm": 0.37572254335260113, + "acc_norm_stderr": 0.036928207672648664 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.20588235294117646, + "acc_stderr": 0.04023382273617746, + "acc_norm": 0.20588235294117646, + "acc_norm_stderr": 0.04023382273617746 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.57, + "acc_stderr": 0.049756985195624284, + "acc_norm": 0.57, + "acc_norm_stderr": 0.049756985195624284 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.4085106382978723, + "acc_stderr": 0.03213418026701576, + "acc_norm": 0.4085106382978723, + "acc_norm_stderr": 0.03213418026701576 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.32456140350877194, + "acc_stderr": 0.04404556157374768, + "acc_norm": 0.32456140350877194, + "acc_norm_stderr": 0.04404556157374768 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.46206896551724136, + "acc_stderr": 0.041546596717075474, + "acc_norm": 0.46206896551724136, + "acc_norm_stderr": 0.041546596717075474 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.24338624338624337, + "acc_stderr": 0.022101128787415433, + "acc_norm": 0.24338624338624337, + "acc_norm_stderr": 0.022101128787415433 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.3412698412698413, + "acc_stderr": 0.04240799327574924, + "acc_norm": 0.3412698412698413, + "acc_norm_stderr": 0.04240799327574924 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.35, + "acc_stderr": 0.0479372485441102, + "acc_norm": 0.35, + "acc_norm_stderr": 0.0479372485441102 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.45483870967741935, + "acc_stderr": 0.02832774309156107, + "acc_norm": 0.45483870967741935, + "acc_norm_stderr": 0.02832774309156107 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.3054187192118227, + "acc_stderr": 0.03240661565868408, + "acc_norm": 0.3054187192118227, + "acc_norm_stderr": 0.03240661565868408 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.4, + "acc_stderr": 0.04923659639173309, + "acc_norm": 0.4, + "acc_norm_stderr": 0.04923659639173309 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.5393939393939394, + "acc_stderr": 0.03892207016552013, + 
"acc_norm": 0.5393939393939394, + "acc_norm_stderr": 0.03892207016552013 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.4797979797979798, + "acc_stderr": 0.035594435655639196, + "acc_norm": 0.4797979797979798, + "acc_norm_stderr": 0.035594435655639196 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.6476683937823834, + "acc_stderr": 0.03447478286414357, + "acc_norm": 0.6476683937823834, + "acc_norm_stderr": 0.03447478286414357 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.37948717948717947, + "acc_stderr": 0.02460362692409742, + "acc_norm": 0.37948717948717947, + "acc_norm_stderr": 0.02460362692409742 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.29259259259259257, + "acc_stderr": 0.02773896963217609, + "acc_norm": 0.29259259259259257, + "acc_norm_stderr": 0.02773896963217609 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.38235294117647056, + "acc_stderr": 0.031566630992154156, + "acc_norm": 0.38235294117647056, + "acc_norm_stderr": 0.031566630992154156 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.2847682119205298, + "acc_stderr": 0.03684881521389023, + "acc_norm": 0.2847682119205298, + "acc_norm_stderr": 0.03684881521389023 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.5834862385321101, + "acc_stderr": 0.021136376504030874, + "acc_norm": 0.5834862385321101, + "acc_norm_stderr": 0.021136376504030874 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.2222222222222222, + "acc_stderr": 0.02835321286686342, + "acc_norm": 0.2222222222222222, + "acc_norm_stderr": 0.02835321286686342 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.49019607843137253, + "acc_stderr": 0.03508637358630572, + "acc_norm": 0.49019607843137253, + "acc_norm_stderr": 0.03508637358630572 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.5021097046413502, + "acc_stderr": 0.032546938018020076, + "acc_norm": 0.5021097046413502, + "acc_norm_stderr": 0.032546938018020076 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.5291479820627802, + "acc_stderr": 0.03350073248773404, + "acc_norm": 0.5291479820627802, + "acc_norm_stderr": 0.03350073248773404 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.5114503816793893, + "acc_stderr": 0.04384140024078016, + "acc_norm": 0.5114503816793893, + "acc_norm_stderr": 0.04384140024078016 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.6115702479338843, + "acc_stderr": 0.044492703500683836, + "acc_norm": 0.6115702479338843, + "acc_norm_stderr": 0.044492703500683836 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.49074074074074076, + "acc_stderr": 0.04832853553437055, + "acc_norm": 0.49074074074074076, + "acc_norm_stderr": 0.04832853553437055 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.49079754601226994, + "acc_stderr": 0.039277056007874414, + "acc_norm": 0.49079754601226994, + "acc_norm_stderr": 0.039277056007874414 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.35714285714285715, + "acc_stderr": 0.04547960999764376, + "acc_norm": 0.35714285714285715, + "acc_norm_stderr": 0.04547960999764376 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.4854368932038835, + "acc_stderr": 0.049486373240266376, + "acc_norm": 0.4854368932038835, + "acc_norm_stderr": 0.049486373240266376 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.6666666666666666, + 
"acc_stderr": 0.030882736974138666, + "acc_norm": 0.6666666666666666, + "acc_norm_stderr": 0.030882736974138666 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.54, + "acc_stderr": 0.05009082659620332, + "acc_norm": 0.54, + "acc_norm_stderr": 0.05009082659620332 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.5810983397190294, + "acc_stderr": 0.017643205052377188, + "acc_norm": 0.5810983397190294, + "acc_norm_stderr": 0.017643205052377188 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.49710982658959535, + "acc_stderr": 0.02691864538323901, + "acc_norm": 0.49710982658959535, + "acc_norm_stderr": 0.02691864538323901 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.23910614525139665, + "acc_stderr": 0.014265554192331144, + "acc_norm": 0.23910614525139665, + "acc_norm_stderr": 0.014265554192331144 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.47058823529411764, + "acc_stderr": 0.02858034106513829, + "acc_norm": 0.47058823529411764, + "acc_norm_stderr": 0.02858034106513829 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.5659163987138264, + "acc_stderr": 0.0281502322445356, + "acc_norm": 0.5659163987138264, + "acc_norm_stderr": 0.0281502322445356 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.49074074074074076, + "acc_stderr": 0.027815973433878014, + "acc_norm": 0.49074074074074076, + "acc_norm_stderr": 0.027815973433878014 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.32978723404255317, + "acc_stderr": 0.028045946942042405, + "acc_norm": 0.32978723404255317, + "acc_norm_stderr": 0.028045946942042405 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.3578878748370274, + "acc_stderr": 0.012243563850490309, + "acc_norm": 0.3578878748370274, + "acc_norm_stderr": 0.012243563850490309 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.4264705882352941, + "acc_stderr": 0.030042615832714874, + "acc_norm": 0.4264705882352941, + "acc_norm_stderr": 0.030042615832714874 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.4411764705882353, + "acc_stderr": 0.020087362076702857, + "acc_norm": 0.4411764705882353, + "acc_norm_stderr": 0.020087362076702857 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.5, + "acc_stderr": 0.04789131426105757, + "acc_norm": 0.5, + "acc_norm_stderr": 0.04789131426105757 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.49387755102040815, + "acc_stderr": 0.03200682020163908, + "acc_norm": 0.49387755102040815, + "acc_norm_stderr": 0.03200682020163908 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.6069651741293532, + "acc_stderr": 0.0345368246603156, + "acc_norm": 0.6069651741293532, + "acc_norm_stderr": 0.0345368246603156 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.64, + "acc_stderr": 0.04824181513244218, + "acc_norm": 0.64, + "acc_norm_stderr": 0.04824181513244218 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.42168674698795183, + "acc_stderr": 0.03844453181770917, + "acc_norm": 0.42168674698795183, + "acc_norm_stderr": 0.03844453181770917 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.6783625730994152, + "acc_stderr": 0.03582529442573122, + "acc_norm": 0.6783625730994152, + "acc_norm_stderr": 0.03582529442573122 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.2484700122399021, + "mc1_stderr": 0.015127427096520697, + "mc2": 0.39318347243467555, + "mc2_stderr": 0.013670242009997141 + }, + "all": { + "acc": 0.44282708077943145, + 
"acc_stderr": 0.03524402594377024, + "acc_norm": 0.44692317330927667, + "acc_norm_stderr": 0.03523098278385255, + "mc1": 0.2484700122399021, + "mc1_stderr": 0.015127427096520697, + "mc2": 0.39318347243467555, + "mc2_stderr": 0.013670242009997141 + } + }, + "versions": { + "harness|arc:challenge|25": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "all": 0 + }, + "config_tasks": { + "harness|arc:challenge": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + 
"harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task" + }, + "summary_tasks": 
{ + "harness|arc:challenge|25": { + "hashes": { + "hash_examples": "17b0cae357c0259e", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "3722289b79076c44", + "hash_cont_tokens": "8210decc6ff6f7df" + }, + "truncated": 0, + "non-truncated": 4687, + "padded": 4687, + "non-padded": 0, + "effective_few_shots": 25.0, + "num_truncated_few_shots": 0 + }, + "harness|hellaswag|10": { + "hashes": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "ececd684171f1ef2", + "hash_cont_tokens": "b3b9e9017afa63af" + }, + "truncated": 0, + "non-truncated": 40168, + "padded": 40113, + "non-padded": 55, + "effective_few_shots": 10.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hashes": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "c54ff61ad0273dd7", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-anatomy|5": { + "hashes": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "be31a1e22aef5f90", + "hash_cont_tokens": "f11971a765cb609f" + }, + "truncated": 0, + "non-truncated": 540, + "padded": 540, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-astronomy|5": { + "hashes": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "277a7b1fad566940", + "hash_cont_tokens": "bf30e5d3f48250cb" + }, + "truncated": 0, + "non-truncated": 608, + "padded": 608, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-business_ethics|5": { + "hashes": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "ba552605bc116de5", + "hash_cont_tokens": "bc1dd9b2d995eb61" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hashes": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "428c7563d0b98ab9", + "hash_cont_tokens": "890a119624b3b935" + }, + "truncated": 0, + "non-truncated": 1060, + "padded": 1060, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_biology|5": { + "hashes": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "da036601573942e2", + "hash_cont_tokens": "875cde3af7a0ee14" + }, + "truncated": 0, + "non-truncated": 576, + "padded": 576, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_chemistry|5": { + "hashes": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "94e0196d6aded13d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_computer_science|5": { + "hashes": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "6e4d0f4a8d36690b", + 
"hash_cont_tokens": "ffc0fe414cdc4a83" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_mathematics|5": { + "hashes": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "614054d17109a25d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_medicine|5": { + "hashes": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "081bb2b524defd1c", + "hash_cont_tokens": "1f88b00d41957d82" + }, + "truncated": 0, + "non-truncated": 692, + "padded": 692, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_physics|5": { + "hashes": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "5421d9a1af86cbd4", + "hash_cont_tokens": "f7b8097afc16a47c" + }, + "truncated": 0, + "non-truncated": 408, + "padded": 408, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-computer_security|5": { + "hashes": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "5e6b70ecb333cf18", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hashes": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "c2ef11a87264ceed", + "hash_cont_tokens": "aa0e8bc655f2f641" + }, + "truncated": 0, + "non-truncated": 940, + "padded": 940, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-econometrics|5": { + "hashes": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "ecaccd912a4c3978", + "hash_cont_tokens": "bfb7e3c3c88313f1" + }, + "truncated": 0, + "non-truncated": 456, + "padded": 456, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hashes": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "1590c84291399be8", + "hash_cont_tokens": "2425a3f084a591ef" + }, + "truncated": 0, + "non-truncated": 580, + "padded": 580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hashes": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "3269597f715b0da1", + "hash_cont_tokens": "f52691aef15a407b" + }, + "truncated": 0, + "non-truncated": 1512, + "padded": 1512, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-formal_logic|5": { + "hashes": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "a2800d20f3ab8d7c", + "hash_cont_tokens": "f515d598d9c21263" + }, + "truncated": 0, + "non-truncated": 504, + "padded": 504, + "non-padded": 0, + "effective_few_shots": 
5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-global_facts|5": { + "hashes": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "94ed44b3772505ad", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_biology|5": { + "hashes": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "24423acb928db768", + "hash_cont_tokens": "bd85a4156a3613ee" + }, + "truncated": 0, + "non-truncated": 1240, + "padded": 1240, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hashes": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "831ff35c474e5cef", + "hash_cont_tokens": "a95c97af1c14e068" + }, + "truncated": 0, + "non-truncated": 812, + "padded": 812, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hashes": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "a20a96b44dcc5b30", + "hash_cont_tokens": "8abfedef914e33c9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hashes": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "5002f4ac8b1562ca", + "hash_cont_tokens": "674fc454bdc5ac93" + }, + "truncated": 0, + "non-truncated": 660, + "padded": 656, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_geography|5": { + "hashes": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "7c5547c7da5bc793", + "hash_cont_tokens": "03a5012b916274ea" + }, + "truncated": 0, + "non-truncated": 792, + "padded": 792, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hashes": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "f62991cb6a496b05", + "hash_cont_tokens": "a83effb8f76b7d7c" + }, + "truncated": 0, + "non-truncated": 772, + "padded": 772, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hashes": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "4cef2aff6e3d59ed", + "hash_cont_tokens": "c583432ad27fcfe0" + }, + "truncated": 0, + "non-truncated": 1560, + "padded": 1560, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hashes": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "6e2577ea4082ed2b", + "hash_cont_tokens": "24f5dc613660300b" + }, + "truncated": 0, + "non-truncated": 1080, + "padded": 1080, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + 
"harness|hendrycksTest-high_school_microeconomics|5": { + "hashes": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "c5fc9aeb1079c8e4", + "hash_cont_tokens": "f47f041de50333b9" + }, + "truncated": 0, + "non-truncated": 952, + "padded": 952, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_physics|5": { + "hashes": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "555fc385cffa84ca", + "hash_cont_tokens": "ba2efcd283e938cc" + }, + "truncated": 0, + "non-truncated": 604, + "padded": 604, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hashes": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "febd23cbf9973b7f", + "hash_cont_tokens": "942069cd363844d9" + }, + "truncated": 0, + "non-truncated": 2180, + "padded": 2180, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hashes": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "400e55b56ee6fbd7", + "hash_cont_tokens": "955ed42b6f7fa019" + }, + "truncated": 0, + "non-truncated": 864, + "padded": 864, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hashes": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "c639cce12a46ebad", + "hash_cont_tokens": "cdd0b3dc06d933e5" + }, + "truncated": 0, + "non-truncated": 816, + "padded": 816, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hashes": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "b9762065cce6f3a6", + "hash_cont_tokens": "9a864184946033ac" + }, + "truncated": 0, + "non-truncated": 948, + "padded": 948, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_aging|5": { + "hashes": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "541a75f071dcf579", + "hash_cont_tokens": "142a4a8a1138a214" + }, + "truncated": 0, + "non-truncated": 892, + "padded": 892, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_sexuality|5": { + "hashes": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "04269e5c5a257dd9", + "hash_cont_tokens": "bc54813e809b796d" + }, + "truncated": 0, + "non-truncated": 524, + "padded": 524, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-international_law|5": { + "hashes": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "d93ba9d9d38e4397", + "hash_cont_tokens": "dc45b45fcda18e5d" + }, + "truncated": 0, + "non-truncated": 484, + "padded": 484, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-jurisprudence|5": { + "hashes": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": 
"0f89ee3fe03d6a21", + "hash_input_tokens": "9eeaccd2698b4f5a", + "hash_cont_tokens": "e3a8cd951b6e3469" + }, + "truncated": 0, + "non-truncated": 432, + "padded": 432, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hashes": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "b4f08f544f2b7576", + "hash_cont_tokens": "1e80dbd30f6453d5" + }, + "truncated": 0, + "non-truncated": 652, + "padded": 648, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-machine_learning|5": { + "hashes": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "900c2a51f1174b9f", + "hash_cont_tokens": "9b37da7777378ca9" + }, + "truncated": 0, + "non-truncated": 448, + "padded": 448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-management|5": { + "hashes": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "6b36efb4689c6eca", + "hash_cont_tokens": "a01d6d39a83c4597" + }, + "truncated": 0, + "non-truncated": 412, + "padded": 412, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-marketing|5": { + "hashes": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "2aaac78a0cfed47a", + "hash_cont_tokens": "6aeaed4d823c98aa" + }, + "truncated": 0, + "non-truncated": 936, + "padded": 936, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-medical_genetics|5": { + "hashes": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "886ca823b41c094a", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-miscellaneous|5": { + "hashes": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "72fd71de7675e7d0", + "hash_cont_tokens": "9b0ab02a64603081" + }, + "truncated": 0, + "non-truncated": 3132, + "padded": 3132, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_disputes|5": { + "hashes": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "f3ca0dd8e7a1eb09", + "hash_cont_tokens": "8badf768f7b0467a" + }, + "truncated": 0, + "non-truncated": 1384, + "padded": 1354, + "non-padded": 30, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hashes": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "3e793631e951f23c", + "hash_cont_tokens": "32ae620376b2bbba" + }, + "truncated": 0, + "non-truncated": 3580, + "padded": 3580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-nutrition|5": { + "hashes": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "59753c2144ea93af", + "hash_cont_tokens": "3071def75bacc404" + }, + "truncated": 0, + "non-truncated": 1224, + "padded": 1224, + 
"non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-philosophy|5": { + "hashes": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "bd8d3dbed15a8c34", + "hash_cont_tokens": "9f6ff69d23a48783" + }, + "truncated": 0, + "non-truncated": 1244, + "padded": 1244, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-prehistory|5": { + "hashes": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "3573cd87facbb7c5", + "hash_cont_tokens": "de469d2b981e32a3" + }, + "truncated": 0, + "non-truncated": 1296, + "padded": 1296, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_accounting|5": { + "hashes": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "17e721bc1a7cbb47", + "hash_cont_tokens": "c46f74d2dfc7b13b" + }, + "truncated": 0, + "non-truncated": 1128, + "padded": 1128, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_law|5": { + "hashes": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "c9f7583fff66d361", + "hash_cont_tokens": "2e590029ef41fbcd" + }, + "truncated": 0, + "non-truncated": 6136, + "padded": 6136, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_medicine|5": { + "hashes": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "40a933f829116f8d", + "hash_cont_tokens": "fe35cfa9c6ca802e" + }, + "truncated": 0, + "non-truncated": 1088, + "padded": 1088, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_psychology|5": { + "hashes": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "0dfb73a8eb3f692c", + "hash_cont_tokens": "f020fbddf72c8652" + }, + "truncated": 0, + "non-truncated": 2448, + "padded": 2448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-public_relations|5": { + "hashes": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "1710c6ba4c9f3cbd", + "hash_cont_tokens": "568f585a259965c1" + }, + "truncated": 0, + "non-truncated": 440, + "padded": 440, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-security_studies|5": { + "hashes": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "32a03f1f22a6e103", + "hash_cont_tokens": "cc6fd7cccd64cd5d" + }, + "truncated": 0, + "non-truncated": 980, + "padded": 980, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-sociology|5": { + "hashes": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "828999f7624cbe7e", + "hash_cont_tokens": "c3a3bdfd177eed5b" + }, + "truncated": 0, + "non-truncated": 804, + "padded": 804, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hashes": { + 
"hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "42054621e718dbee", + "hash_cont_tokens": "2568d0e8e36fa959" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-virology|5": { + "hashes": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "6c4f0aa4dc859c04", + "hash_cont_tokens": "926cf60b0891f374" + }, + "truncated": 0, + "non-truncated": 664, + "padded": 664, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-world_religions|5": { + "hashes": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "6c75d44e092ff24f", + "hash_cont_tokens": "c525a5de974c1ea3" + }, + "truncated": 0, + "non-truncated": 684, + "padded": 684, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|truthfulqa:mc|0": { + "hashes": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "2738d7ed7075faa7", + "hash_cont_tokens": "c014154380b74b9e" + }, + "truncated": 0, + "non-truncated": 9996, + "padded": 9996, + "non-padded": 0, + "effective_few_shots": 0.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "d84d18e9a963753d", + "hash_full_prompts": "12b540783521a8e6", + "hash_input_tokens": "5c73a7dce6ccf737", + "hash_cont_tokens": "fb1646e2bdd5fc38" + }, + "total_evaluation_time_secondes": "6017.317279577255", + "truncated": 0, + "non-truncated": 111019, + "padded": 110926, + "non-padded": 93, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/TheBloke/Llama-2-7B-GPTQ/results_2023-08-30T09-33-50.119005.json b/eval-results/TheBloke/Llama-2-7B-GPTQ/results_2023-08-30T09-33-50.119005.json new file mode 100644 index 0000000000000000000000000000000000000000..cafed05e1894d8721edf8de5d4034d7b219d46df --- /dev/null +++ b/eval-results/TheBloke/Llama-2-7B-GPTQ/results_2023-08-30T09-33-50.119005.json @@ -0,0 +1,1366 @@ +{ + "config_general": { + "model_name": "TheBloke/Llama-2-7B-GPTQ", + "model_sha": "ecd7ab9f6adc36ecbe0d751eeea0d90ae1863c3b", + "model_dtype": "None", + "lighteval_sha": "4b9a38c2102259daeb17d1d0294fc2b4ce7f4e63", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|arc:challenge|25": { + "acc": 0.4786689419795222, + "acc_stderr": 0.014598087973127108, + "acc_norm": 0.5204778156996587, + "acc_norm_stderr": 0.014599131353035012 + }, + "harness|hellaswag|10": { + "acc": 0.5760804620593507, + "acc_stderr": 0.004931679059919375, + "acc_norm": 0.7759410476000796, + "acc_norm_stderr": 0.004161089244867776 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.33, + "acc_stderr": 0.047258156262526045, + "acc_norm": 0.33, + "acc_norm_stderr": 0.047258156262526045 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.45925925925925926, + "acc_stderr": 0.04304979692464242, + "acc_norm": 0.45925925925925926, + "acc_norm_stderr": 0.04304979692464242 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.39473684210526316, + "acc_stderr": 0.039777499346220734, + "acc_norm": 0.39473684210526316, + "acc_norm_stderr": 0.039777499346220734 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 
0.46, + "acc_stderr": 0.05009082659620332, + "acc_norm": 0.46, + "acc_norm_stderr": 0.05009082659620332 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.4188679245283019, + "acc_stderr": 0.030365050829115208, + "acc_norm": 0.4188679245283019, + "acc_norm_stderr": 0.030365050829115208 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.4097222222222222, + "acc_stderr": 0.04112490974670787, + "acc_norm": 0.4097222222222222, + "acc_norm_stderr": 0.04112490974670787 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.23, + "acc_stderr": 0.04229525846816508, + "acc_norm": 0.23, + "acc_norm_stderr": 0.04229525846816508 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.36, + "acc_stderr": 0.048241815132442176, + "acc_norm": 0.36, + "acc_norm_stderr": 0.048241815132442176 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.34, + "acc_stderr": 0.047609522856952344, + "acc_norm": 0.34, + "acc_norm_stderr": 0.047609522856952344 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.37572254335260113, + "acc_stderr": 0.036928207672648664, + "acc_norm": 0.37572254335260113, + "acc_norm_stderr": 0.036928207672648664 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.20588235294117646, + "acc_stderr": 0.04023382273617746, + "acc_norm": 0.20588235294117646, + "acc_norm_stderr": 0.04023382273617746 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.57, + "acc_stderr": 0.049756985195624284, + "acc_norm": 0.57, + "acc_norm_stderr": 0.049756985195624284 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.4085106382978723, + "acc_stderr": 0.03213418026701576, + "acc_norm": 0.4085106382978723, + "acc_norm_stderr": 0.03213418026701576 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.32456140350877194, + "acc_stderr": 0.04404556157374768, + "acc_norm": 0.32456140350877194, + "acc_norm_stderr": 0.04404556157374768 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.46206896551724136, + "acc_stderr": 0.041546596717075474, + "acc_norm": 0.46206896551724136, + "acc_norm_stderr": 0.041546596717075474 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.24338624338624337, + "acc_stderr": 0.022101128787415433, + "acc_norm": 0.24338624338624337, + "acc_norm_stderr": 0.022101128787415433 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.3412698412698413, + "acc_stderr": 0.04240799327574924, + "acc_norm": 0.3412698412698413, + "acc_norm_stderr": 0.04240799327574924 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.35, + "acc_stderr": 0.0479372485441102, + "acc_norm": 0.35, + "acc_norm_stderr": 0.0479372485441102 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.45483870967741935, + "acc_stderr": 0.02832774309156107, + "acc_norm": 0.45483870967741935, + "acc_norm_stderr": 0.02832774309156107 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.3054187192118227, + "acc_stderr": 0.03240661565868408, + "acc_norm": 0.3054187192118227, + "acc_norm_stderr": 0.03240661565868408 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.4, + "acc_stderr": 0.04923659639173309, + "acc_norm": 0.4, + "acc_norm_stderr": 0.04923659639173309 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.5393939393939394, + "acc_stderr": 0.03892207016552013, + "acc_norm": 0.5393939393939394, + "acc_norm_stderr": 0.03892207016552013 + }, + "harness|hendrycksTest-high_school_geography|5": { 
+ "acc": 0.4797979797979798, + "acc_stderr": 0.035594435655639196, + "acc_norm": 0.4797979797979798, + "acc_norm_stderr": 0.035594435655639196 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.6476683937823834, + "acc_stderr": 0.03447478286414357, + "acc_norm": 0.6476683937823834, + "acc_norm_stderr": 0.03447478286414357 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.37948717948717947, + "acc_stderr": 0.02460362692409742, + "acc_norm": 0.37948717948717947, + "acc_norm_stderr": 0.02460362692409742 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.29259259259259257, + "acc_stderr": 0.02773896963217609, + "acc_norm": 0.29259259259259257, + "acc_norm_stderr": 0.02773896963217609 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.38235294117647056, + "acc_stderr": 0.031566630992154156, + "acc_norm": 0.38235294117647056, + "acc_norm_stderr": 0.031566630992154156 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.2847682119205298, + "acc_stderr": 0.03684881521389023, + "acc_norm": 0.2847682119205298, + "acc_norm_stderr": 0.03684881521389023 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.5834862385321101, + "acc_stderr": 0.021136376504030874, + "acc_norm": 0.5834862385321101, + "acc_norm_stderr": 0.021136376504030874 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.2222222222222222, + "acc_stderr": 0.02835321286686342, + "acc_norm": 0.2222222222222222, + "acc_norm_stderr": 0.02835321286686342 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.49019607843137253, + "acc_stderr": 0.03508637358630572, + "acc_norm": 0.49019607843137253, + "acc_norm_stderr": 0.03508637358630572 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.5021097046413502, + "acc_stderr": 0.032546938018020076, + "acc_norm": 0.5021097046413502, + "acc_norm_stderr": 0.032546938018020076 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.5291479820627802, + "acc_stderr": 0.03350073248773404, + "acc_norm": 0.5291479820627802, + "acc_norm_stderr": 0.03350073248773404 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.5114503816793893, + "acc_stderr": 0.04384140024078016, + "acc_norm": 0.5114503816793893, + "acc_norm_stderr": 0.04384140024078016 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.6115702479338843, + "acc_stderr": 0.044492703500683836, + "acc_norm": 0.6115702479338843, + "acc_norm_stderr": 0.044492703500683836 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.49074074074074076, + "acc_stderr": 0.04832853553437055, + "acc_norm": 0.49074074074074076, + "acc_norm_stderr": 0.04832853553437055 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.49079754601226994, + "acc_stderr": 0.039277056007874414, + "acc_norm": 0.49079754601226994, + "acc_norm_stderr": 0.039277056007874414 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.35714285714285715, + "acc_stderr": 0.04547960999764376, + "acc_norm": 0.35714285714285715, + "acc_norm_stderr": 0.04547960999764376 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.4854368932038835, + "acc_stderr": 0.049486373240266376, + "acc_norm": 0.4854368932038835, + "acc_norm_stderr": 0.049486373240266376 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.6666666666666666, + "acc_stderr": 0.030882736974138666, + "acc_norm": 0.6666666666666666, + "acc_norm_stderr": 0.030882736974138666 + }, + 
"harness|hendrycksTest-medical_genetics|5": { + "acc": 0.54, + "acc_stderr": 0.05009082659620332, + "acc_norm": 0.54, + "acc_norm_stderr": 0.05009082659620332 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.5810983397190294, + "acc_stderr": 0.017643205052377188, + "acc_norm": 0.5810983397190294, + "acc_norm_stderr": 0.017643205052377188 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.49710982658959535, + "acc_stderr": 0.02691864538323901, + "acc_norm": 0.49710982658959535, + "acc_norm_stderr": 0.02691864538323901 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.23910614525139665, + "acc_stderr": 0.014265554192331144, + "acc_norm": 0.23910614525139665, + "acc_norm_stderr": 0.014265554192331144 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.47058823529411764, + "acc_stderr": 0.02858034106513829, + "acc_norm": 0.47058823529411764, + "acc_norm_stderr": 0.02858034106513829 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.5659163987138264, + "acc_stderr": 0.0281502322445356, + "acc_norm": 0.5659163987138264, + "acc_norm_stderr": 0.0281502322445356 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.49074074074074076, + "acc_stderr": 0.027815973433878014, + "acc_norm": 0.49074074074074076, + "acc_norm_stderr": 0.027815973433878014 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.32978723404255317, + "acc_stderr": 0.028045946942042405, + "acc_norm": 0.32978723404255317, + "acc_norm_stderr": 0.028045946942042405 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.3578878748370274, + "acc_stderr": 0.012243563850490309, + "acc_norm": 0.3578878748370274, + "acc_norm_stderr": 0.012243563850490309 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.4264705882352941, + "acc_stderr": 0.030042615832714874, + "acc_norm": 0.4264705882352941, + "acc_norm_stderr": 0.030042615832714874 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.4411764705882353, + "acc_stderr": 0.020087362076702857, + "acc_norm": 0.4411764705882353, + "acc_norm_stderr": 0.020087362076702857 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.5, + "acc_stderr": 0.04789131426105757, + "acc_norm": 0.5, + "acc_norm_stderr": 0.04789131426105757 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.49387755102040815, + "acc_stderr": 0.03200682020163908, + "acc_norm": 0.49387755102040815, + "acc_norm_stderr": 0.03200682020163908 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.6069651741293532, + "acc_stderr": 0.0345368246603156, + "acc_norm": 0.6069651741293532, + "acc_norm_stderr": 0.0345368246603156 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.64, + "acc_stderr": 0.04824181513244218, + "acc_norm": 0.64, + "acc_norm_stderr": 0.04824181513244218 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.42168674698795183, + "acc_stderr": 0.03844453181770917, + "acc_norm": 0.42168674698795183, + "acc_norm_stderr": 0.03844453181770917 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.6783625730994152, + "acc_stderr": 0.03582529442573122, + "acc_norm": 0.6783625730994152, + "acc_norm_stderr": 0.03582529442573122 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.2484700122399021, + "mc1_stderr": 0.015127427096520697, + "mc2": 0.39318347243467555, + "mc2_stderr": 0.013670242009997141 + }, + "all": { + "acc": 0.44282708077943145, + "acc_stderr": 0.03524402594377024, + "acc_norm": 0.44692317330927667, + "acc_norm_stderr": 0.03523098278385255, + "mc1": 
0.2484700122399021, + "mc1_stderr": 0.015127427096520697, + "mc2": 0.39318347243467555, + "mc2_stderr": 0.013670242009997141 + } + }, + "versions": { + "harness|arc:challenge|25": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "all": 0 + }, + "config_tasks": { + "harness|arc:challenge": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + 
"harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task" + }, + "summary_tasks": { + "harness|arc:challenge|25": { + "hashes": { + "hash_examples": "17b0cae357c0259e", + "hash_full_prompts": 
"045cbb916e5145c6", + "hash_input_tokens": "3722289b79076c44", + "hash_cont_tokens": "8210decc6ff6f7df" + }, + "truncated": 0, + "non-truncated": 4687, + "padded": 4687, + "non-padded": 0, + "effective_few_shots": 25.0, + "num_truncated_few_shots": 0 + }, + "harness|hellaswag|10": { + "hashes": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "ececd684171f1ef2", + "hash_cont_tokens": "b3b9e9017afa63af" + }, + "truncated": 0, + "non-truncated": 40168, + "padded": 40113, + "non-padded": 55, + "effective_few_shots": 10.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hashes": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "c54ff61ad0273dd7", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-anatomy|5": { + "hashes": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "be31a1e22aef5f90", + "hash_cont_tokens": "f11971a765cb609f" + }, + "truncated": 0, + "non-truncated": 540, + "padded": 540, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-astronomy|5": { + "hashes": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "277a7b1fad566940", + "hash_cont_tokens": "bf30e5d3f48250cb" + }, + "truncated": 0, + "non-truncated": 608, + "padded": 608, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-business_ethics|5": { + "hashes": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "ba552605bc116de5", + "hash_cont_tokens": "bc1dd9b2d995eb61" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hashes": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "428c7563d0b98ab9", + "hash_cont_tokens": "890a119624b3b935" + }, + "truncated": 0, + "non-truncated": 1060, + "padded": 1060, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_biology|5": { + "hashes": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "da036601573942e2", + "hash_cont_tokens": "875cde3af7a0ee14" + }, + "truncated": 0, + "non-truncated": 576, + "padded": 576, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_chemistry|5": { + "hashes": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "94e0196d6aded13d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_computer_science|5": { + "hashes": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "6e4d0f4a8d36690b", + "hash_cont_tokens": "ffc0fe414cdc4a83" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + 
"non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_mathematics|5": { + "hashes": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "614054d17109a25d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_medicine|5": { + "hashes": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "081bb2b524defd1c", + "hash_cont_tokens": "1f88b00d41957d82" + }, + "truncated": 0, + "non-truncated": 692, + "padded": 692, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_physics|5": { + "hashes": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "5421d9a1af86cbd4", + "hash_cont_tokens": "f7b8097afc16a47c" + }, + "truncated": 0, + "non-truncated": 408, + "padded": 408, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-computer_security|5": { + "hashes": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "5e6b70ecb333cf18", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hashes": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "c2ef11a87264ceed", + "hash_cont_tokens": "aa0e8bc655f2f641" + }, + "truncated": 0, + "non-truncated": 940, + "padded": 940, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-econometrics|5": { + "hashes": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "ecaccd912a4c3978", + "hash_cont_tokens": "bfb7e3c3c88313f1" + }, + "truncated": 0, + "non-truncated": 456, + "padded": 456, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hashes": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "1590c84291399be8", + "hash_cont_tokens": "2425a3f084a591ef" + }, + "truncated": 0, + "non-truncated": 580, + "padded": 580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hashes": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "3269597f715b0da1", + "hash_cont_tokens": "f52691aef15a407b" + }, + "truncated": 0, + "non-truncated": 1512, + "padded": 1512, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-formal_logic|5": { + "hashes": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "a2800d20f3ab8d7c", + "hash_cont_tokens": "f515d598d9c21263" + }, + "truncated": 0, + "non-truncated": 504, + "padded": 504, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-global_facts|5": { + "hashes": { + 
"hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "94ed44b3772505ad", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_biology|5": { + "hashes": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "24423acb928db768", + "hash_cont_tokens": "bd85a4156a3613ee" + }, + "truncated": 0, + "non-truncated": 1240, + "padded": 1240, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hashes": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "831ff35c474e5cef", + "hash_cont_tokens": "a95c97af1c14e068" + }, + "truncated": 0, + "non-truncated": 812, + "padded": 812, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hashes": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "a20a96b44dcc5b30", + "hash_cont_tokens": "8abfedef914e33c9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hashes": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "5002f4ac8b1562ca", + "hash_cont_tokens": "674fc454bdc5ac93" + }, + "truncated": 0, + "non-truncated": 660, + "padded": 656, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_geography|5": { + "hashes": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "7c5547c7da5bc793", + "hash_cont_tokens": "03a5012b916274ea" + }, + "truncated": 0, + "non-truncated": 792, + "padded": 792, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hashes": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "f62991cb6a496b05", + "hash_cont_tokens": "a83effb8f76b7d7c" + }, + "truncated": 0, + "non-truncated": 772, + "padded": 772, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hashes": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "4cef2aff6e3d59ed", + "hash_cont_tokens": "c583432ad27fcfe0" + }, + "truncated": 0, + "non-truncated": 1560, + "padded": 1560, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hashes": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "6e2577ea4082ed2b", + "hash_cont_tokens": "24f5dc613660300b" + }, + "truncated": 0, + "non-truncated": 1080, + "padded": 1080, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hashes": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": 
"bfc393381298609e", + "hash_input_tokens": "c5fc9aeb1079c8e4", + "hash_cont_tokens": "f47f041de50333b9" + }, + "truncated": 0, + "non-truncated": 952, + "padded": 952, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_physics|5": { + "hashes": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "555fc385cffa84ca", + "hash_cont_tokens": "ba2efcd283e938cc" + }, + "truncated": 0, + "non-truncated": 604, + "padded": 604, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hashes": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "febd23cbf9973b7f", + "hash_cont_tokens": "942069cd363844d9" + }, + "truncated": 0, + "non-truncated": 2180, + "padded": 2180, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hashes": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "400e55b56ee6fbd7", + "hash_cont_tokens": "955ed42b6f7fa019" + }, + "truncated": 0, + "non-truncated": 864, + "padded": 864, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hashes": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "c639cce12a46ebad", + "hash_cont_tokens": "cdd0b3dc06d933e5" + }, + "truncated": 0, + "non-truncated": 816, + "padded": 816, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hashes": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "b9762065cce6f3a6", + "hash_cont_tokens": "9a864184946033ac" + }, + "truncated": 0, + "non-truncated": 948, + "padded": 948, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_aging|5": { + "hashes": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "541a75f071dcf579", + "hash_cont_tokens": "142a4a8a1138a214" + }, + "truncated": 0, + "non-truncated": 892, + "padded": 892, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_sexuality|5": { + "hashes": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "04269e5c5a257dd9", + "hash_cont_tokens": "bc54813e809b796d" + }, + "truncated": 0, + "non-truncated": 524, + "padded": 524, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-international_law|5": { + "hashes": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "d93ba9d9d38e4397", + "hash_cont_tokens": "dc45b45fcda18e5d" + }, + "truncated": 0, + "non-truncated": 484, + "padded": 484, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-jurisprudence|5": { + "hashes": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "9eeaccd2698b4f5a", + "hash_cont_tokens": "e3a8cd951b6e3469" + }, + "truncated": 0, + 
"non-truncated": 432, + "padded": 432, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hashes": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "b4f08f544f2b7576", + "hash_cont_tokens": "1e80dbd30f6453d5" + }, + "truncated": 0, + "non-truncated": 652, + "padded": 648, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-machine_learning|5": { + "hashes": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "900c2a51f1174b9f", + "hash_cont_tokens": "9b37da7777378ca9" + }, + "truncated": 0, + "non-truncated": 448, + "padded": 448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-management|5": { + "hashes": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "6b36efb4689c6eca", + "hash_cont_tokens": "a01d6d39a83c4597" + }, + "truncated": 0, + "non-truncated": 412, + "padded": 412, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-marketing|5": { + "hashes": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "2aaac78a0cfed47a", + "hash_cont_tokens": "6aeaed4d823c98aa" + }, + "truncated": 0, + "non-truncated": 936, + "padded": 936, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-medical_genetics|5": { + "hashes": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "886ca823b41c094a", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-miscellaneous|5": { + "hashes": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "72fd71de7675e7d0", + "hash_cont_tokens": "9b0ab02a64603081" + }, + "truncated": 0, + "non-truncated": 3132, + "padded": 3132, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_disputes|5": { + "hashes": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "f3ca0dd8e7a1eb09", + "hash_cont_tokens": "8badf768f7b0467a" + }, + "truncated": 0, + "non-truncated": 1384, + "padded": 1354, + "non-padded": 30, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hashes": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "3e793631e951f23c", + "hash_cont_tokens": "32ae620376b2bbba" + }, + "truncated": 0, + "non-truncated": 3580, + "padded": 3580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-nutrition|5": { + "hashes": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "59753c2144ea93af", + "hash_cont_tokens": "3071def75bacc404" + }, + "truncated": 0, + "non-truncated": 1224, + "padded": 1224, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-philosophy|5": { + "hashes": 
{ + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "bd8d3dbed15a8c34", + "hash_cont_tokens": "9f6ff69d23a48783" + }, + "truncated": 0, + "non-truncated": 1244, + "padded": 1244, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-prehistory|5": { + "hashes": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "3573cd87facbb7c5", + "hash_cont_tokens": "de469d2b981e32a3" + }, + "truncated": 0, + "non-truncated": 1296, + "padded": 1296, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_accounting|5": { + "hashes": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "17e721bc1a7cbb47", + "hash_cont_tokens": "c46f74d2dfc7b13b" + }, + "truncated": 0, + "non-truncated": 1128, + "padded": 1128, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_law|5": { + "hashes": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "c9f7583fff66d361", + "hash_cont_tokens": "2e590029ef41fbcd" + }, + "truncated": 0, + "non-truncated": 6136, + "padded": 6136, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_medicine|5": { + "hashes": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "40a933f829116f8d", + "hash_cont_tokens": "fe35cfa9c6ca802e" + }, + "truncated": 0, + "non-truncated": 1088, + "padded": 1088, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_psychology|5": { + "hashes": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "0dfb73a8eb3f692c", + "hash_cont_tokens": "f020fbddf72c8652" + }, + "truncated": 0, + "non-truncated": 2448, + "padded": 2448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-public_relations|5": { + "hashes": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "1710c6ba4c9f3cbd", + "hash_cont_tokens": "568f585a259965c1" + }, + "truncated": 0, + "non-truncated": 440, + "padded": 440, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-security_studies|5": { + "hashes": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "32a03f1f22a6e103", + "hash_cont_tokens": "cc6fd7cccd64cd5d" + }, + "truncated": 0, + "non-truncated": 980, + "padded": 980, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-sociology|5": { + "hashes": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "828999f7624cbe7e", + "hash_cont_tokens": "c3a3bdfd177eed5b" + }, + "truncated": 0, + "non-truncated": 804, + "padded": 804, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hashes": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "42054621e718dbee", + 
"hash_cont_tokens": "2568d0e8e36fa959" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-virology|5": { + "hashes": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "6c4f0aa4dc859c04", + "hash_cont_tokens": "926cf60b0891f374" + }, + "truncated": 0, + "non-truncated": 664, + "padded": 664, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-world_religions|5": { + "hashes": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "6c75d44e092ff24f", + "hash_cont_tokens": "c525a5de974c1ea3" + }, + "truncated": 0, + "non-truncated": 684, + "padded": 684, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|truthfulqa:mc|0": { + "hashes": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "2738d7ed7075faa7", + "hash_cont_tokens": "c014154380b74b9e" + }, + "truncated": 0, + "non-truncated": 9996, + "padded": 9996, + "non-padded": 0, + "effective_few_shots": 0.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "d84d18e9a963753d", + "hash_full_prompts": "12b540783521a8e6", + "hash_input_tokens": "5c73a7dce6ccf737", + "hash_cont_tokens": "fb1646e2bdd5fc38" + }, + "total_evaluation_time_secondes": "6014.452301263809", + "truncated": 0, + "non-truncated": 111019, + "padded": 110926, + "non-padded": 93, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/TheBloke/Llama-2-7B-GPTQ/results_2023-10-21T20-13-14.412039.json b/eval-results/TheBloke/Llama-2-7B-GPTQ/results_2023-10-21T20-13-14.412039.json new file mode 100644 index 0000000000000000000000000000000000000000..a4461c797447b80bdafb869489d2ceb4771c9a3d --- /dev/null +++ b/eval-results/TheBloke/Llama-2-7B-GPTQ/results_2023-10-21T20-13-14.412039.json @@ -0,0 +1,107 @@ +{ + "config_general": { + "model_name": "TheBloke/Llama-2-7B-GPTQ", + "model_sha": "b76c9c666726b4461f5c888648cdca9a364fbd7b", + "model_size": "3.66 GB", + "model_dtype": "None", + "lighteval_sha": "0f318ecf002208468154899217b3ba7c6ae09374", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|drop|3": { + "em": 0.0007340604026845638, + "em_stderr": 0.00027736144573356367, + "f1": 0.054487206375839085, + "f1_stderr": 0.001279202944739141 + }, + "harness|gsm8k|5": { + "acc": 0.050037907505686124, + "acc_stderr": 0.006005442354577735 + }, + "harness|winogrande|5": { + "acc": 0.7292817679558011, + "acc_stderr": 0.012487904760626303 + }, + "all": { + "em": 0.0007340604026845638, + "em_stderr": 0.00027736144573356367, + "f1": 0.054487206375839085, + "f1_stderr": 0.001279202944739141, + "acc": 0.38965983773074364, + "acc_stderr": 0.009246673557602019 + } + }, + "versions": { + "harness|drop|3": 1, + "harness|gsm8k|5": 0, + "harness|winogrande|5": 0, + "all": 0 + }, + "config_tasks": { + "harness|drop": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|drop|3": { + "hashes": { + "hash_examples": "1d27416e8324e9a3", + "hash_full_prompts": "a5513ff9a741b385", + "hash_input_tokens": "61b608e0b5ceed76", + "hash_cont_tokens": "b0c19eb5f3fc8a74" + 
}, + "truncated": 1263, + "non-truncated": 8273, + "padded": 0, + "non-padded": 9536, + "effective_few_shots": 3.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "bda342e47b5099b2", + "hash_cont_tokens": "be9a689cc5192036" + }, + "truncated": 0, + "non-truncated": 1319, + "padded": 0, + "non-padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "c0bedf98cb040854", + "hash_cont_tokens": "f08975ad6f2d5864" + }, + "truncated": 0, + "non-truncated": 2534, + "padded": 2432, + "non-padded": 102, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "9b4d8993161e637d", + "hash_full_prompts": "08215e527b7e60a5", + "hash_input_tokens": "80afe720f936f8d2", + "hash_cont_tokens": "000c64af2fff295b" + }, + "total_evaluation_time_secondes": "5659.167922735214", + "truncated": 1263, + "non-truncated": 12126, + "padded": 2432, + "non-padded": 10957, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/TheBloke/Llama-2-7b-Chat-AWQ/results_2023-10-03T10-54-21.847398.json b/eval-results/TheBloke/Llama-2-7b-Chat-AWQ/results_2023-10-03T10-54-21.847398.json new file mode 100644 index 0000000000000000000000000000000000000000..88b74811c0732a75f92723581c27b2cccf4196fc --- /dev/null +++ b/eval-results/TheBloke/Llama-2-7b-Chat-AWQ/results_2023-10-03T10-54-21.847398.json @@ -0,0 +1,1367 @@ +{ + "config_general": { + "model_name": "TheBloke/Llama-2-7b-Chat-AWQ", + "model_sha": "a065961fd627aa3b3e6dde21e77fd5e20f712189", + "model_size": "12.61 GB", + "model_dtype": "torch.float16", + "lighteval_sha": "0f318ecf002208468154899217b3ba7c6ae09374", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|arc:challenge|25": { + "acc": 0.22866894197952217, + "acc_stderr": 0.012272853582540799, + "acc_norm": 0.2721843003412969, + "acc_norm_stderr": 0.013006600406423707 + }, + "harness|hellaswag|10": { + "acc": 0.2551284604660426, + "acc_stderr": 0.004350424750646203, + "acc_norm": 0.2548297151961761, + "acc_norm_stderr": 0.004348748730529938 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.28, + "acc_stderr": 0.045126085985421276, + "acc_norm": 0.28, + "acc_norm_stderr": 0.045126085985421276 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.2518518518518518, + "acc_stderr": 0.03749850709174022, + "acc_norm": 0.2518518518518518, + "acc_norm_stderr": 0.03749850709174022 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.2631578947368421, + "acc_stderr": 0.035834961763610645, + "acc_norm": 0.2631578947368421, + "acc_norm_stderr": 0.035834961763610645 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.25, + "acc_stderr": 0.04351941398892446, + "acc_norm": 0.25, + "acc_norm_stderr": 0.04351941398892446 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.20754716981132076, + "acc_stderr": 0.02495991802891127, + "acc_norm": 0.20754716981132076, + "acc_norm_stderr": 0.02495991802891127 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.2569444444444444, + "acc_stderr": 0.03653946969442099, + "acc_norm": 0.2569444444444444, + "acc_norm_stderr": 0.03653946969442099 + }, + 
"harness|hendrycksTest-college_chemistry|5": { + "acc": 0.19, + "acc_stderr": 0.03942772444036625, + "acc_norm": 0.19, + "acc_norm_stderr": 0.03942772444036625 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.19, + "acc_stderr": 0.039427724440366234, + "acc_norm": 0.19, + "acc_norm_stderr": 0.039427724440366234 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.25, + "acc_stderr": 0.04351941398892446, + "acc_norm": 0.25, + "acc_norm_stderr": 0.04351941398892446 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.26011560693641617, + "acc_stderr": 0.03345036916788992, + "acc_norm": 0.26011560693641617, + "acc_norm_stderr": 0.03345036916788992 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.23529411764705882, + "acc_stderr": 0.04220773659171452, + "acc_norm": 0.23529411764705882, + "acc_norm_stderr": 0.04220773659171452 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.31, + "acc_stderr": 0.04648231987117316, + "acc_norm": 0.31, + "acc_norm_stderr": 0.04648231987117316 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.25957446808510637, + "acc_stderr": 0.02865917937429232, + "acc_norm": 0.25957446808510637, + "acc_norm_stderr": 0.02865917937429232 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.24561403508771928, + "acc_stderr": 0.04049339297748142, + "acc_norm": 0.24561403508771928, + "acc_norm_stderr": 0.04049339297748142 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.2689655172413793, + "acc_stderr": 0.03695183311650232, + "acc_norm": 0.2689655172413793, + "acc_norm_stderr": 0.03695183311650232 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.23015873015873015, + "acc_stderr": 0.021679219663693135, + "acc_norm": 0.23015873015873015, + "acc_norm_stderr": 0.021679219663693135 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.19047619047619047, + "acc_stderr": 0.03512207412302054, + "acc_norm": 0.19047619047619047, + "acc_norm_stderr": 0.03512207412302054 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.26, + "acc_stderr": 0.04408440022768077, + "acc_norm": 0.26, + "acc_norm_stderr": 0.04408440022768077 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.22258064516129034, + "acc_stderr": 0.023664216671642518, + "acc_norm": 0.22258064516129034, + "acc_norm_stderr": 0.023664216671642518 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.20689655172413793, + "acc_stderr": 0.02850137816789395, + "acc_norm": 0.20689655172413793, + "acc_norm_stderr": 0.02850137816789395 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.31, + "acc_stderr": 0.04648231987117316, + "acc_norm": 0.31, + "acc_norm_stderr": 0.04648231987117316 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.30303030303030304, + "acc_stderr": 0.03588624800091709, + "acc_norm": 0.30303030303030304, + "acc_norm_stderr": 0.03588624800091709 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.2474747474747475, + "acc_stderr": 0.03074630074212451, + "acc_norm": 0.2474747474747475, + "acc_norm_stderr": 0.03074630074212451 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.18134715025906736, + "acc_stderr": 0.02780703236068609, + "acc_norm": 0.18134715025906736, + "acc_norm_stderr": 0.02780703236068609 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.21794871794871795, + "acc_stderr": 0.02093244577446318, + 
"acc_norm": 0.21794871794871795, + "acc_norm_stderr": 0.02093244577446318 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.27037037037037037, + "acc_stderr": 0.027080372815145668, + "acc_norm": 0.27037037037037037, + "acc_norm_stderr": 0.027080372815145668 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.20588235294117646, + "acc_stderr": 0.02626502460827589, + "acc_norm": 0.20588235294117646, + "acc_norm_stderr": 0.02626502460827589 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.25165562913907286, + "acc_stderr": 0.035433042343899844, + "acc_norm": 0.25165562913907286, + "acc_norm_stderr": 0.035433042343899844 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.22752293577981653, + "acc_stderr": 0.017974463578776502, + "acc_norm": 0.22752293577981653, + "acc_norm_stderr": 0.017974463578776502 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.24074074074074073, + "acc_stderr": 0.029157522184605607, + "acc_norm": 0.24074074074074073, + "acc_norm_stderr": 0.029157522184605607 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.20098039215686275, + "acc_stderr": 0.028125972265654373, + "acc_norm": 0.20098039215686275, + "acc_norm_stderr": 0.028125972265654373 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.24050632911392406, + "acc_stderr": 0.027820781981149685, + "acc_norm": 0.24050632911392406, + "acc_norm_stderr": 0.027820781981149685 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.28699551569506726, + "acc_stderr": 0.030360379710291954, + "acc_norm": 0.28699551569506726, + "acc_norm_stderr": 0.030360379710291954 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.21374045801526717, + "acc_stderr": 0.0359546161177469, + "acc_norm": 0.21374045801526717, + "acc_norm_stderr": 0.0359546161177469 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.2727272727272727, + "acc_stderr": 0.04065578140908705, + "acc_norm": 0.2727272727272727, + "acc_norm_stderr": 0.04065578140908705 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.25925925925925924, + "acc_stderr": 0.04236511258094633, + "acc_norm": 0.25925925925925924, + "acc_norm_stderr": 0.04236511258094633 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.25766871165644173, + "acc_stderr": 0.03436150827846917, + "acc_norm": 0.25766871165644173, + "acc_norm_stderr": 0.03436150827846917 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.25, + "acc_stderr": 0.04109974682633932, + "acc_norm": 0.25, + "acc_norm_stderr": 0.04109974682633932 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.21359223300970873, + "acc_stderr": 0.040580420156460344, + "acc_norm": 0.21359223300970873, + "acc_norm_stderr": 0.040580420156460344 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.2692307692307692, + "acc_stderr": 0.029058588303748842, + "acc_norm": 0.2692307692307692, + "acc_norm_stderr": 0.029058588303748842 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.22, + "acc_stderr": 0.041633319989322695, + "acc_norm": 0.22, + "acc_norm_stderr": 0.041633319989322695 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.26181353767560667, + "acc_stderr": 0.01572083867844526, + "acc_norm": 0.26181353767560667, + "acc_norm_stderr": 0.01572083867844526 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.2947976878612717, + "acc_stderr": 0.02454761779480383, + "acc_norm": 0.2947976878612717, + 
"acc_norm_stderr": 0.02454761779480383 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.2547486033519553, + "acc_stderr": 0.014572650383409158, + "acc_norm": 0.2547486033519553, + "acc_norm_stderr": 0.014572650383409158 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.26143790849673204, + "acc_stderr": 0.025160998214292456, + "acc_norm": 0.26143790849673204, + "acc_norm_stderr": 0.025160998214292456 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.24115755627009647, + "acc_stderr": 0.024296594034763426, + "acc_norm": 0.24115755627009647, + "acc_norm_stderr": 0.024296594034763426 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.25, + "acc_stderr": 0.02409347123262133, + "acc_norm": 0.25, + "acc_norm_stderr": 0.02409347123262133 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.25886524822695034, + "acc_stderr": 0.026129572527180848, + "acc_norm": 0.25886524822695034, + "acc_norm_stderr": 0.026129572527180848 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.2627118644067797, + "acc_stderr": 0.011240545514995674, + "acc_norm": 0.2627118644067797, + "acc_norm_stderr": 0.011240545514995674 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.20955882352941177, + "acc_stderr": 0.024723110407677048, + "acc_norm": 0.20955882352941177, + "acc_norm_stderr": 0.024723110407677048 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.2565359477124183, + "acc_stderr": 0.017667841612379, + "acc_norm": 0.2565359477124183, + "acc_norm_stderr": 0.017667841612379 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.24545454545454545, + "acc_stderr": 0.041220665028782855, + "acc_norm": 0.24545454545454545, + "acc_norm_stderr": 0.041220665028782855 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.23673469387755103, + "acc_stderr": 0.027212835884073153, + "acc_norm": 0.23673469387755103, + "acc_norm_stderr": 0.027212835884073153 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.2885572139303483, + "acc_stderr": 0.03203841040213321, + "acc_norm": 0.2885572139303483, + "acc_norm_stderr": 0.03203841040213321 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.27, + "acc_stderr": 0.04461960433384741, + "acc_norm": 0.27, + "acc_norm_stderr": 0.04461960433384741 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.21084337349397592, + "acc_stderr": 0.0317555478662992, + "acc_norm": 0.21084337349397592, + "acc_norm_stderr": 0.0317555478662992 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.28654970760233917, + "acc_stderr": 0.034678266857038266, + "acc_norm": 0.28654970760233917, + "acc_norm_stderr": 0.034678266857038266 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.2423500611995104, + "mc1_stderr": 0.01500067437357034, + "mc2": 0.499548040569627, + "mc2_stderr": 0.017139623909179967 + }, + "all": { + "acc": 0.24649856315672244, + "acc_stderr": 0.03141071505730311, + "acc_norm": 0.2472310481243819, + "acc_norm_stderr": 0.031423123037027975, + "mc1": 0.2423500611995104, + "mc1_stderr": 0.01500067437357034, + "mc2": 0.499548040569627, + "mc2_stderr": 0.017139623909179967 + } + }, + "versions": { + "harness|arc:challenge|25": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + 
"harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "all": 0 + }, + "config_tasks": { + "harness|arc:challenge": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness 
task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task" + }, + "summary_tasks": { + "harness|arc:challenge|25": { + "hashes": { + "hash_examples": "17b0cae357c0259e", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "3722289b79076c44", + "hash_cont_tokens": "e8abf848493b50f7" + }, + "truncated": 0, + "non-truncated": 4687, + "padded": 4687, + "non-padded": 0, + "effective_few_shots": 25.0, + "num_truncated_few_shots": 0 + }, + "harness|hellaswag|10": { + "hashes": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "ececd684171f1ef2", + "hash_cont_tokens": "9fe0a5c42e1532db" + }, + "truncated": 0, + "non-truncated": 40168, + "padded": 
40113, + "non-padded": 55, + "effective_few_shots": 10.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hashes": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "c54ff61ad0273dd7", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-anatomy|5": { + "hashes": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "be31a1e22aef5f90", + "hash_cont_tokens": "f11971a765cb609f" + }, + "truncated": 0, + "non-truncated": 540, + "padded": 540, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-astronomy|5": { + "hashes": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "277a7b1fad566940", + "hash_cont_tokens": "440a970fadecdc7b" + }, + "truncated": 0, + "non-truncated": 608, + "padded": 608, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-business_ethics|5": { + "hashes": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "ba552605bc116de5", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hashes": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "428c7563d0b98ab9", + "hash_cont_tokens": "7ecd60c25b9bfe5b" + }, + "truncated": 0, + "non-truncated": 1060, + "padded": 1060, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_biology|5": { + "hashes": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "da036601573942e2", + "hash_cont_tokens": "875cde3af7a0ee14" + }, + "truncated": 0, + "non-truncated": 576, + "padded": 576, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_chemistry|5": { + "hashes": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "94e0196d6aded13d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_computer_science|5": { + "hashes": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "6e4d0f4a8d36690b", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_mathematics|5": { + "hashes": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "614054d17109a25d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_medicine|5": { + "hashes": { + 
"hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "081bb2b524defd1c", + "hash_cont_tokens": "702fb6d82ff0d6ac" + }, + "truncated": 0, + "non-truncated": 692, + "padded": 692, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_physics|5": { + "hashes": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "5421d9a1af86cbd4", + "hash_cont_tokens": "f7b8097afc16a47c" + }, + "truncated": 0, + "non-truncated": 408, + "padded": 408, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-computer_security|5": { + "hashes": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "5e6b70ecb333cf18", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hashes": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "c2ef11a87264ceed", + "hash_cont_tokens": "aa0e8bc655f2f641" + }, + "truncated": 0, + "non-truncated": 940, + "padded": 940, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-econometrics|5": { + "hashes": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "ecaccd912a4c3978", + "hash_cont_tokens": "b1cc6e7e9fcd3827" + }, + "truncated": 0, + "non-truncated": 456, + "padded": 456, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hashes": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "1590c84291399be8", + "hash_cont_tokens": "2425a3f084a591ef" + }, + "truncated": 0, + "non-truncated": 580, + "padded": 580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hashes": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "3269597f715b0da1", + "hash_cont_tokens": "bd87bf0c060fd925" + }, + "truncated": 0, + "non-truncated": 1512, + "padded": 1512, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-formal_logic|5": { + "hashes": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "a2800d20f3ab8d7c", + "hash_cont_tokens": "eb8932890e0605db" + }, + "truncated": 0, + "non-truncated": 504, + "padded": 504, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-global_facts|5": { + "hashes": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "94ed44b3772505ad", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_biology|5": { + "hashes": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "24423acb928db768", + "hash_cont_tokens": 
"1ddcb86d28cde266" + }, + "truncated": 0, + "non-truncated": 1240, + "padded": 1240, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hashes": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "831ff35c474e5cef", + "hash_cont_tokens": "176c8dcff38c5f8f" + }, + "truncated": 0, + "non-truncated": 812, + "padded": 812, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hashes": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "a20a96b44dcc5b30", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hashes": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "5002f4ac8b1562ca", + "hash_cont_tokens": "674fc454bdc5ac93" + }, + "truncated": 0, + "non-truncated": 660, + "padded": 656, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_geography|5": { + "hashes": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "7c5547c7da5bc793", + "hash_cont_tokens": "03a5012b916274ea" + }, + "truncated": 0, + "non-truncated": 792, + "padded": 792, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hashes": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "f62991cb6a496b05", + "hash_cont_tokens": "873d2aab226ba1d8" + }, + "truncated": 0, + "non-truncated": 772, + "padded": 772, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hashes": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "4cef2aff6e3d59ed", + "hash_cont_tokens": "c583432ad27fcfe0" + }, + "truncated": 0, + "non-truncated": 1560, + "padded": 1560, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hashes": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "6e2577ea4082ed2b", + "hash_cont_tokens": "d7907b61bcb8c123" + }, + "truncated": 0, + "non-truncated": 1080, + "padded": 1080, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hashes": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "c5fc9aeb1079c8e4", + "hash_cont_tokens": "f47f041de50333b9" + }, + "truncated": 0, + "non-truncated": 952, + "padded": 952, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_physics|5": { + "hashes": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "555fc385cffa84ca", + "hash_cont_tokens": "0d56317b3e5eedb5" + }, + "truncated": 0, + "non-truncated": 604, + 
"padded": 604, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hashes": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "febd23cbf9973b7f", + "hash_cont_tokens": "09ba1243e7390c0f" + }, + "truncated": 0, + "non-truncated": 2180, + "padded": 2180, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hashes": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "400e55b56ee6fbd7", + "hash_cont_tokens": "9cc29889c3d3f77d" + }, + "truncated": 0, + "non-truncated": 864, + "padded": 864, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hashes": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "c639cce12a46ebad", + "hash_cont_tokens": "cdd0b3dc06d933e5" + }, + "truncated": 0, + "non-truncated": 816, + "padded": 816, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hashes": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "b9762065cce6f3a6", + "hash_cont_tokens": "e02816433ff28daf" + }, + "truncated": 0, + "non-truncated": 948, + "padded": 948, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_aging|5": { + "hashes": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "541a75f071dcf579", + "hash_cont_tokens": "142a4a8a1138a214" + }, + "truncated": 0, + "non-truncated": 892, + "padded": 892, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_sexuality|5": { + "hashes": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "04269e5c5a257dd9", + "hash_cont_tokens": "bc54813e809b796d" + }, + "truncated": 0, + "non-truncated": 524, + "padded": 524, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-international_law|5": { + "hashes": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "d93ba9d9d38e4397", + "hash_cont_tokens": "8ea8c5ff76a15bca" + }, + "truncated": 0, + "non-truncated": 484, + "padded": 484, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-jurisprudence|5": { + "hashes": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "9eeaccd2698b4f5a", + "hash_cont_tokens": "e3a8cd951b6e3469" + }, + "truncated": 0, + "non-truncated": 432, + "padded": 432, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hashes": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "b4f08f544f2b7576", + "hash_cont_tokens": "3e9e0bdc248fd88a" + }, + "truncated": 0, + "non-truncated": 652, + "padded": 648, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + 
"harness|hendrycksTest-machine_learning|5": { + "hashes": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "900c2a51f1174b9f", + "hash_cont_tokens": "55b12fb138c6a064" + }, + "truncated": 0, + "non-truncated": 448, + "padded": 448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-management|5": { + "hashes": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "6b36efb4689c6eca", + "hash_cont_tokens": "a01d6d39a83c4597" + }, + "truncated": 0, + "non-truncated": 412, + "padded": 412, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-marketing|5": { + "hashes": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "2aaac78a0cfed47a", + "hash_cont_tokens": "6aeaed4d823c98aa" + }, + "truncated": 0, + "non-truncated": 936, + "padded": 936, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-medical_genetics|5": { + "hashes": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "886ca823b41c094a", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-miscellaneous|5": { + "hashes": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "72fd71de7675e7d0", + "hash_cont_tokens": "9b0ab02a64603081" + }, + "truncated": 0, + "non-truncated": 3132, + "padded": 3132, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_disputes|5": { + "hashes": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "f3ca0dd8e7a1eb09", + "hash_cont_tokens": "3b8bbe9108e55ce9" + }, + "truncated": 0, + "non-truncated": 1384, + "padded": 1354, + "non-padded": 30, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hashes": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "3e793631e951f23c", + "hash_cont_tokens": "3e9bfc0362e97330" + }, + "truncated": 0, + "non-truncated": 3580, + "padded": 3580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-nutrition|5": { + "hashes": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "59753c2144ea93af", + "hash_cont_tokens": "23b2dc6ee2da4cfc" + }, + "truncated": 0, + "non-truncated": 1224, + "padded": 1224, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-philosophy|5": { + "hashes": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "bd8d3dbed15a8c34", + "hash_cont_tokens": "9f6ff69d23a48783" + }, + "truncated": 0, + "non-truncated": 1244, + "padded": 1244, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-prehistory|5": { + "hashes": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "3573cd87facbb7c5", + 
"hash_cont_tokens": "d6458d743d875837" + }, + "truncated": 0, + "non-truncated": 1296, + "padded": 1296, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_accounting|5": { + "hashes": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "17e721bc1a7cbb47", + "hash_cont_tokens": "922a195f53a35662" + }, + "truncated": 0, + "non-truncated": 1128, + "padded": 1128, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_law|5": { + "hashes": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "c9f7583fff66d361", + "hash_cont_tokens": "2e590029ef41fbcd" + }, + "truncated": 0, + "non-truncated": 6136, + "padded": 6136, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_medicine|5": { + "hashes": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "40a933f829116f8d", + "hash_cont_tokens": "7cfee54dbddd5a98" + }, + "truncated": 0, + "non-truncated": 1088, + "padded": 1088, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_psychology|5": { + "hashes": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "0dfb73a8eb3f692c", + "hash_cont_tokens": "a86677b2a45c20e1" + }, + "truncated": 0, + "non-truncated": 2448, + "padded": 2448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-public_relations|5": { + "hashes": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "1710c6ba4c9f3cbd", + "hash_cont_tokens": "0d756ccaae031757" + }, + "truncated": 0, + "non-truncated": 440, + "padded": 440, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-security_studies|5": { + "hashes": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "32a03f1f22a6e103", + "hash_cont_tokens": "b2229bc2cfbf594b" + }, + "truncated": 0, + "non-truncated": 980, + "padded": 980, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-sociology|5": { + "hashes": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "828999f7624cbe7e", + "hash_cont_tokens": "c3a3bdfd177eed5b" + }, + "truncated": 0, + "non-truncated": 804, + "padded": 804, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hashes": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "42054621e718dbee", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-virology|5": { + "hashes": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "6c4f0aa4dc859c04", + "hash_cont_tokens": "af8b3658088cb37f" + }, + "truncated": 0, + "non-truncated": 664, + "padded": 664, + "non-padded": 0, + "effective_few_shots": 
5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-world_religions|5": { + "hashes": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "6c75d44e092ff24f", + "hash_cont_tokens": "060118bef6de4e0a" + }, + "truncated": 0, + "non-truncated": 684, + "padded": 684, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|truthfulqa:mc|0": { + "hashes": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "2738d7ed7075faa7", + "hash_cont_tokens": "f5da56a132aab151" + }, + "truncated": 0, + "non-truncated": 9996, + "padded": 9996, + "non-padded": 0, + "effective_few_shots": 0.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "d84d18e9a963753d", + "hash_full_prompts": "12b540783521a8e6", + "hash_input_tokens": "5c73a7dce6ccf737", + "hash_cont_tokens": "71d56183130fecbd" + }, + "total_evaluation_time_secondes": "4794.976619720459", + "truncated": 0, + "non-truncated": 111019, + "padded": 110926, + "non-padded": 93, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/TheBloke/Llama-2-7b-Chat-AWQ/results_2023-10-24T01-23-20.549960.json b/eval-results/TheBloke/Llama-2-7b-Chat-AWQ/results_2023-10-24T01-23-20.549960.json new file mode 100644 index 0000000000000000000000000000000000000000..dc8cee73a48459e1908ecbb3916172b06df997b2 --- /dev/null +++ b/eval-results/TheBloke/Llama-2-7b-Chat-AWQ/results_2023-10-24T01-23-20.549960.json @@ -0,0 +1,107 @@ +{ + "config_general": { + "model_name": "TheBloke/Llama-2-7b-Chat-AWQ", + "model_sha": "a065961fd627aa3b3e6dde21e77fd5e20f712189", + "model_size": "12.61 GB", + "model_dtype": "torch.float16", + "lighteval_sha": "0f318ecf002208468154899217b3ba7c6ae09374", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|drop|3": { + "em": 0.0, + "em_stderr": 0.0, + "f1": 0.0, + "f1_stderr": 0.0 + }, + "harness|gsm8k|5": { + "acc": 0.0, + "acc_stderr": 0.0 + }, + "harness|winogrande|5": { + "acc": 0.47513812154696133, + "acc_stderr": 0.01403510288362775 + }, + "all": { + "em": 0.0, + "em_stderr": 0.0, + "f1": 0.0, + "f1_stderr": 0.0, + "acc": 0.23756906077348067, + "acc_stderr": 0.007017551441813875 + } + }, + "versions": { + "harness|drop|3": 1, + "harness|gsm8k|5": 0, + "harness|winogrande|5": 0, + "all": 0 + }, + "config_tasks": { + "harness|drop": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|drop|3": { + "hashes": { + "hash_examples": "1d27416e8324e9a3", + "hash_full_prompts": "a5513ff9a741b385", + "hash_input_tokens": "42076f0efbb50aa6", + "hash_cont_tokens": "517e0bedbda845a8" + }, + "truncated": 3, + "non-truncated": 9533, + "padded": 0, + "non-padded": 9536, + "effective_few_shots": 3.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "bda342e47b5099b2", + "hash_cont_tokens": "f59f339e29093d92" + }, + "truncated": 0, + "non-truncated": 1319, + "padded": 0, + "non-padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": 
"c0bedf98cb040854", + "hash_cont_tokens": "f08975ad6f2d5864" + }, + "truncated": 0, + "non-truncated": 2534, + "padded": 2432, + "non-padded": 102, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "9b4d8993161e637d", + "hash_full_prompts": "08215e527b7e60a5", + "hash_input_tokens": "a12f3e3c934bd78b", + "hash_cont_tokens": "422cb9b8c19718a3" + }, + "total_evaluation_time_secondes": "24847.293817043304", + "truncated": 3, + "non-truncated": 13386, + "padded": 2432, + "non-padded": 10957, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/TheBloke/LongChat-13B-GPTQ/results_2023-08-22T02-02-56.447076.json b/eval-results/TheBloke/LongChat-13B-GPTQ/results_2023-08-22T02-02-56.447076.json new file mode 100644 index 0000000000000000000000000000000000000000..33b9074c517e569ad6711abb49e805d8344d21ab --- /dev/null +++ b/eval-results/TheBloke/LongChat-13B-GPTQ/results_2023-08-22T02-02-56.447076.json @@ -0,0 +1,1365 @@ +{ + "results": { + "harness|arc:challenge|25": { + "acc": 0.24744027303754265, + "acc_stderr": 0.01261035266329267, + "acc_norm": 0.2832764505119454, + "acc_norm_stderr": 0.013167478735134576 + }, + "harness|hellaswag|10": { + "acc": 0.2513443537143995, + "acc_stderr": 0.004328995510312591, + "acc_norm": 0.261202947619996, + "acc_norm_stderr": 0.004383925147478738 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.29, + "acc_stderr": 0.04560480215720683, + "acc_norm": 0.29, + "acc_norm_stderr": 0.04560480215720683 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.3333333333333333, + "acc_stderr": 0.04072314811876837, + "acc_norm": 0.3333333333333333, + "acc_norm_stderr": 0.04072314811876837 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.27631578947368424, + "acc_stderr": 0.03639057569952925, + "acc_norm": 0.27631578947368424, + "acc_norm_stderr": 0.03639057569952925 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.29, + "acc_stderr": 0.045604802157206845, + "acc_norm": 0.29, + "acc_norm_stderr": 0.045604802157206845 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.2188679245283019, + "acc_stderr": 0.02544786382510861, + "acc_norm": 0.2188679245283019, + "acc_norm_stderr": 0.02544786382510861 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.2986111111111111, + "acc_stderr": 0.038270523579507554, + "acc_norm": 0.2986111111111111, + "acc_norm_stderr": 0.038270523579507554 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.27, + "acc_stderr": 0.04461960433384739, + "acc_norm": 0.27, + "acc_norm_stderr": 0.04461960433384739 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.24, + "acc_stderr": 0.04292346959909283, + "acc_norm": 0.24, + "acc_norm_stderr": 0.04292346959909283 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.27, + "acc_stderr": 0.0446196043338474, + "acc_norm": 0.27, + "acc_norm_stderr": 0.0446196043338474 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.23121387283236994, + "acc_stderr": 0.03214737302029469, + "acc_norm": 0.23121387283236994, + "acc_norm_stderr": 0.03214737302029469 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.22549019607843138, + "acc_stderr": 0.041583075330832865, + "acc_norm": 0.22549019607843138, + "acc_norm_stderr": 0.041583075330832865 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.28, + "acc_stderr": 0.045126085985421276, + "acc_norm": 0.28, + "acc_norm_stderr": 
0.045126085985421276 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.20425531914893616, + "acc_stderr": 0.026355158413349424, + "acc_norm": 0.20425531914893616, + "acc_norm_stderr": 0.026355158413349424 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.23684210526315788, + "acc_stderr": 0.039994238792813365, + "acc_norm": 0.23684210526315788, + "acc_norm_stderr": 0.039994238792813365 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.296551724137931, + "acc_stderr": 0.03806142687309993, + "acc_norm": 0.296551724137931, + "acc_norm_stderr": 0.03806142687309993 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.2275132275132275, + "acc_stderr": 0.021591269407823778, + "acc_norm": 0.2275132275132275, + "acc_norm_stderr": 0.021591269407823778 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.29365079365079366, + "acc_stderr": 0.040735243221471276, + "acc_norm": 0.29365079365079366, + "acc_norm_stderr": 0.040735243221471276 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.33, + "acc_stderr": 0.04725815626252604, + "acc_norm": 0.33, + "acc_norm_stderr": 0.04725815626252604 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.24193548387096775, + "acc_stderr": 0.024362599693031093, + "acc_norm": 0.24193548387096775, + "acc_norm_stderr": 0.024362599693031093 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.2857142857142857, + "acc_stderr": 0.03178529710642752, + "acc_norm": 0.2857142857142857, + "acc_norm_stderr": 0.03178529710642752 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.2, + "acc_stderr": 0.04020151261036844, + "acc_norm": 0.2, + "acc_norm_stderr": 0.04020151261036844 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.2545454545454545, + "acc_stderr": 0.03401506715249039, + "acc_norm": 0.2545454545454545, + "acc_norm_stderr": 0.03401506715249039 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.25252525252525254, + "acc_stderr": 0.030954055470365897, + "acc_norm": 0.25252525252525254, + "acc_norm_stderr": 0.030954055470365897 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.22797927461139897, + "acc_stderr": 0.03027690994517826, + "acc_norm": 0.22797927461139897, + "acc_norm_stderr": 0.03027690994517826 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.2128205128205128, + "acc_stderr": 0.020752423722128013, + "acc_norm": 0.2128205128205128, + "acc_norm_stderr": 0.020752423722128013 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.26296296296296295, + "acc_stderr": 0.026842057873833706, + "acc_norm": 0.26296296296296295, + "acc_norm_stderr": 0.026842057873833706 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.20588235294117646, + "acc_stderr": 0.026265024608275886, + "acc_norm": 0.20588235294117646, + "acc_norm_stderr": 0.026265024608275886 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.2582781456953642, + "acc_stderr": 0.035737053147634576, + "acc_norm": 0.2582781456953642, + "acc_norm_stderr": 0.035737053147634576 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.1981651376146789, + "acc_stderr": 0.017090573804217874, + "acc_norm": 0.1981651376146789, + "acc_norm_stderr": 0.017090573804217874 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.18518518518518517, + "acc_stderr": 0.026491914727355157, + "acc_norm": 0.18518518518518517, + 
"acc_norm_stderr": 0.026491914727355157 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.2549019607843137, + "acc_stderr": 0.030587591351604246, + "acc_norm": 0.2549019607843137, + "acc_norm_stderr": 0.030587591351604246 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.2109704641350211, + "acc_stderr": 0.02655837250266192, + "acc_norm": 0.2109704641350211, + "acc_norm_stderr": 0.02655837250266192 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.20179372197309417, + "acc_stderr": 0.026936111912802273, + "acc_norm": 0.20179372197309417, + "acc_norm_stderr": 0.026936111912802273 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.22900763358778625, + "acc_stderr": 0.036853466317118506, + "acc_norm": 0.22900763358778625, + "acc_norm_stderr": 0.036853466317118506 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.2809917355371901, + "acc_stderr": 0.04103203830514512, + "acc_norm": 0.2809917355371901, + "acc_norm_stderr": 0.04103203830514512 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.25, + "acc_stderr": 0.04186091791394607, + "acc_norm": 0.25, + "acc_norm_stderr": 0.04186091791394607 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.26993865030674846, + "acc_stderr": 0.034878251684978906, + "acc_norm": 0.26993865030674846, + "acc_norm_stderr": 0.034878251684978906 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.33035714285714285, + "acc_stderr": 0.04464285714285712, + "acc_norm": 0.33035714285714285, + "acc_norm_stderr": 0.04464285714285712 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.1941747572815534, + "acc_stderr": 0.039166677628225836, + "acc_norm": 0.1941747572815534, + "acc_norm_stderr": 0.039166677628225836 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.23931623931623933, + "acc_stderr": 0.027951826808924333, + "acc_norm": 0.23931623931623933, + "acc_norm_stderr": 0.027951826808924333 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.2, + "acc_stderr": 0.040201512610368445, + "acc_norm": 0.2, + "acc_norm_stderr": 0.040201512610368445 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.2707535121328225, + "acc_stderr": 0.015889888362560486, + "acc_norm": 0.2707535121328225, + "acc_norm_stderr": 0.015889888362560486 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.2861271676300578, + "acc_stderr": 0.024332146779134124, + "acc_norm": 0.2861271676300578, + "acc_norm_stderr": 0.024332146779134124 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.2759776536312849, + "acc_stderr": 0.014950103002475363, + "acc_norm": 0.2759776536312849, + "acc_norm_stderr": 0.014950103002475363 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.27124183006535946, + "acc_stderr": 0.02545775669666787, + "acc_norm": 0.27124183006535946, + "acc_norm_stderr": 0.02545775669666787 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.2958199356913183, + "acc_stderr": 0.025922371788818798, + "acc_norm": 0.2958199356913183, + "acc_norm_stderr": 0.025922371788818798 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.30246913580246915, + "acc_stderr": 0.02555765398186806, + "acc_norm": 0.30246913580246915, + "acc_norm_stderr": 0.02555765398186806 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.2695035460992908, + "acc_stderr": 0.02646903681859063, + "acc_norm": 0.2695035460992908, + "acc_norm_stderr": 0.02646903681859063 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 
0.24445893089960888, + "acc_stderr": 0.010976425013113886, + "acc_norm": 0.24445893089960888, + "acc_norm_stderr": 0.010976425013113886 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.2610294117647059, + "acc_stderr": 0.02667925227010312, + "acc_norm": 0.2610294117647059, + "acc_norm_stderr": 0.02667925227010312 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.2630718954248366, + "acc_stderr": 0.017812676542320657, + "acc_norm": 0.2630718954248366, + "acc_norm_stderr": 0.017812676542320657 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.23636363636363636, + "acc_stderr": 0.04069306319721376, + "acc_norm": 0.23636363636363636, + "acc_norm_stderr": 0.04069306319721376 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.3183673469387755, + "acc_stderr": 0.029822533793982062, + "acc_norm": 0.3183673469387755, + "acc_norm_stderr": 0.029822533793982062 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.24875621890547264, + "acc_stderr": 0.030567675938916707, + "acc_norm": 0.24875621890547264, + "acc_norm_stderr": 0.030567675938916707 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.26, + "acc_stderr": 0.04408440022768078, + "acc_norm": 0.26, + "acc_norm_stderr": 0.04408440022768078 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.21084337349397592, + "acc_stderr": 0.0317555478662992, + "acc_norm": 0.21084337349397592, + "acc_norm_stderr": 0.0317555478662992 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.29239766081871343, + "acc_stderr": 0.034886477134579215, + "acc_norm": 0.29239766081871343, + "acc_norm_stderr": 0.034886477134579215 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.23133414932680538, + "mc1_stderr": 0.014761945174862668, + "mc2": 0.4827158174737024, + "mc2_stderr": 0.017135796039690687 + }, + "all": { + "acc": 0.25535690904664204, + "acc_stderr": 0.03178469309728165, + "acc_norm": 0.2561313966971505, + "acc_norm_stderr": 0.03179506692285806, + "mc1": 0.23133414932680538, + "mc1_stderr": 0.014761945174862668, + "mc2": 0.4827158174737024, + "mc2_stderr": 0.017135796039690687 + } + }, + "versions": { + "harness|arc:challenge|25": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + 
"harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "all": 0 + }, + "config_general": { + "model_name": "TheBloke/LongChat-13B-GPTQ", + "model_sha": "8ec25a29033b7be5daeafa26f08e1ea7cf232b98", + "model_dtype": "4bit", + "lighteval_sha": "9a6ba3212080b87510982c30fdec55b87dcab0c7", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null + }, + "config_tasks": { + "harness|arc:challenge": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + 
"harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task" + }, + "summary_tasks": { + "harness|arc:challenge|25": { + "hashes": { + "hash_examples": "17b0cae357c0259e", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "3722289b79076c44", + "hash_cont_tokens": "8210decc6ff6f7df" + }, + "truncated": 0, + "non-truncated": 4687, + "padded": 4687, + "non-padded": 0, + "effective_few_shots": 25.0, + "num_truncated_few_shots": 0 + }, + "harness|hellaswag|10": { + "hashes": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "ececd684171f1ef2", + "hash_cont_tokens": "b3b9e9017afa63af" + }, + "truncated": 0, + "non-truncated": 40168, + "padded": 40113, + "non-padded": 55, + "effective_few_shots": 10.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hashes": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "c54ff61ad0273dd7", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-anatomy|5": { + "hashes": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "be31a1e22aef5f90", + "hash_cont_tokens": 
"f11971a765cb609f" + }, + "truncated": 0, + "non-truncated": 540, + "padded": 540, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-astronomy|5": { + "hashes": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "277a7b1fad566940", + "hash_cont_tokens": "bf30e5d3f48250cb" + }, + "truncated": 0, + "non-truncated": 608, + "padded": 608, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-business_ethics|5": { + "hashes": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "ba552605bc116de5", + "hash_cont_tokens": "bc1dd9b2d995eb61" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hashes": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "428c7563d0b98ab9", + "hash_cont_tokens": "890a119624b3b935" + }, + "truncated": 0, + "non-truncated": 1060, + "padded": 1060, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_biology|5": { + "hashes": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "da036601573942e2", + "hash_cont_tokens": "875cde3af7a0ee14" + }, + "truncated": 0, + "non-truncated": 576, + "padded": 576, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_chemistry|5": { + "hashes": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "94e0196d6aded13d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_computer_science|5": { + "hashes": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "6e4d0f4a8d36690b", + "hash_cont_tokens": "ffc0fe414cdc4a83" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_mathematics|5": { + "hashes": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "614054d17109a25d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_medicine|5": { + "hashes": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "081bb2b524defd1c", + "hash_cont_tokens": "1f88b00d41957d82" + }, + "truncated": 0, + "non-truncated": 692, + "padded": 692, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_physics|5": { + "hashes": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "5421d9a1af86cbd4", + "hash_cont_tokens": "f7b8097afc16a47c" + }, + "truncated": 0, + "non-truncated": 408, + "padded": 408, + "non-padded": 0, + "effective_few_shots": 5.0, + 
"num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-computer_security|5": { + "hashes": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "5e6b70ecb333cf18", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hashes": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "c2ef11a87264ceed", + "hash_cont_tokens": "aa0e8bc655f2f641" + }, + "truncated": 0, + "non-truncated": 940, + "padded": 940, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-econometrics|5": { + "hashes": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "ecaccd912a4c3978", + "hash_cont_tokens": "bfb7e3c3c88313f1" + }, + "truncated": 0, + "non-truncated": 456, + "padded": 456, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hashes": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "1590c84291399be8", + "hash_cont_tokens": "2425a3f084a591ef" + }, + "truncated": 0, + "non-truncated": 580, + "padded": 580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hashes": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "3269597f715b0da1", + "hash_cont_tokens": "f52691aef15a407b" + }, + "truncated": 0, + "non-truncated": 1512, + "padded": 1512, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-formal_logic|5": { + "hashes": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "a2800d20f3ab8d7c", + "hash_cont_tokens": "f515d598d9c21263" + }, + "truncated": 0, + "non-truncated": 504, + "padded": 504, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-global_facts|5": { + "hashes": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "94ed44b3772505ad", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_biology|5": { + "hashes": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "24423acb928db768", + "hash_cont_tokens": "bd85a4156a3613ee" + }, + "truncated": 0, + "non-truncated": 1240, + "padded": 1240, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hashes": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "831ff35c474e5cef", + "hash_cont_tokens": "a95c97af1c14e068" + }, + "truncated": 0, + "non-truncated": 812, + "padded": 812, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hashes": { + "hash_examples": "8b8cdb1084f24169", 
+ "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "a20a96b44dcc5b30", + "hash_cont_tokens": "8abfedef914e33c9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hashes": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "5002f4ac8b1562ca", + "hash_cont_tokens": "674fc454bdc5ac93" + }, + "truncated": 0, + "non-truncated": 660, + "padded": 656, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_geography|5": { + "hashes": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "7c5547c7da5bc793", + "hash_cont_tokens": "03a5012b916274ea" + }, + "truncated": 0, + "non-truncated": 792, + "padded": 792, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hashes": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "f62991cb6a496b05", + "hash_cont_tokens": "a83effb8f76b7d7c" + }, + "truncated": 0, + "non-truncated": 772, + "padded": 772, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hashes": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "4cef2aff6e3d59ed", + "hash_cont_tokens": "c583432ad27fcfe0" + }, + "truncated": 0, + "non-truncated": 1560, + "padded": 1560, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hashes": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "6e2577ea4082ed2b", + "hash_cont_tokens": "24f5dc613660300b" + }, + "truncated": 0, + "non-truncated": 1080, + "padded": 1080, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hashes": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "c5fc9aeb1079c8e4", + "hash_cont_tokens": "f47f041de50333b9" + }, + "truncated": 0, + "non-truncated": 952, + "padded": 952, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_physics|5": { + "hashes": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "555fc385cffa84ca", + "hash_cont_tokens": "ba2efcd283e938cc" + }, + "truncated": 0, + "non-truncated": 604, + "padded": 604, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hashes": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "febd23cbf9973b7f", + "hash_cont_tokens": "942069cd363844d9" + }, + "truncated": 0, + "non-truncated": 2180, + "padded": 2180, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hashes": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": 
"400e55b56ee6fbd7", + "hash_cont_tokens": "955ed42b6f7fa019" + }, + "truncated": 0, + "non-truncated": 864, + "padded": 864, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hashes": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "c639cce12a46ebad", + "hash_cont_tokens": "cdd0b3dc06d933e5" + }, + "truncated": 0, + "non-truncated": 816, + "padded": 816, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hashes": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "b9762065cce6f3a6", + "hash_cont_tokens": "9a864184946033ac" + }, + "truncated": 0, + "non-truncated": 948, + "padded": 948, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_aging|5": { + "hashes": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "541a75f071dcf579", + "hash_cont_tokens": "142a4a8a1138a214" + }, + "truncated": 0, + "non-truncated": 892, + "padded": 892, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_sexuality|5": { + "hashes": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "04269e5c5a257dd9", + "hash_cont_tokens": "bc54813e809b796d" + }, + "truncated": 0, + "non-truncated": 524, + "padded": 524, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-international_law|5": { + "hashes": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "d93ba9d9d38e4397", + "hash_cont_tokens": "dc45b45fcda18e5d" + }, + "truncated": 0, + "non-truncated": 484, + "padded": 484, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-jurisprudence|5": { + "hashes": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "9eeaccd2698b4f5a", + "hash_cont_tokens": "e3a8cd951b6e3469" + }, + "truncated": 0, + "non-truncated": 432, + "padded": 432, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hashes": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "b4f08f544f2b7576", + "hash_cont_tokens": "1e80dbd30f6453d5" + }, + "truncated": 0, + "non-truncated": 652, + "padded": 648, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-machine_learning|5": { + "hashes": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "900c2a51f1174b9f", + "hash_cont_tokens": "9b37da7777378ca9" + }, + "truncated": 0, + "non-truncated": 448, + "padded": 448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-management|5": { + "hashes": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "6b36efb4689c6eca", + "hash_cont_tokens": "a01d6d39a83c4597" + }, + "truncated": 0, + "non-truncated": 412, + "padded": 412, + "non-padded": 0, + 
"effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-marketing|5": { + "hashes": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "2aaac78a0cfed47a", + "hash_cont_tokens": "6aeaed4d823c98aa" + }, + "truncated": 0, + "non-truncated": 936, + "padded": 936, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-medical_genetics|5": { + "hashes": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "886ca823b41c094a", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-miscellaneous|5": { + "hashes": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "72fd71de7675e7d0", + "hash_cont_tokens": "9b0ab02a64603081" + }, + "truncated": 0, + "non-truncated": 3132, + "padded": 3132, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_disputes|5": { + "hashes": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "f3ca0dd8e7a1eb09", + "hash_cont_tokens": "8badf768f7b0467a" + }, + "truncated": 0, + "non-truncated": 1384, + "padded": 1354, + "non-padded": 30, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hashes": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "3e793631e951f23c", + "hash_cont_tokens": "32ae620376b2bbba" + }, + "truncated": 0, + "non-truncated": 3580, + "padded": 3580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-nutrition|5": { + "hashes": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "59753c2144ea93af", + "hash_cont_tokens": "3071def75bacc404" + }, + "truncated": 0, + "non-truncated": 1224, + "padded": 1224, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-philosophy|5": { + "hashes": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "bd8d3dbed15a8c34", + "hash_cont_tokens": "9f6ff69d23a48783" + }, + "truncated": 0, + "non-truncated": 1244, + "padded": 1244, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-prehistory|5": { + "hashes": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "3573cd87facbb7c5", + "hash_cont_tokens": "de469d2b981e32a3" + }, + "truncated": 0, + "non-truncated": 1296, + "padded": 1296, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_accounting|5": { + "hashes": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "17e721bc1a7cbb47", + "hash_cont_tokens": "c46f74d2dfc7b13b" + }, + "truncated": 0, + "non-truncated": 1128, + "padded": 1128, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_law|5": { + "hashes": { + "hash_examples": "cd9dbc52b3c932d6", + 
"hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "c9f7583fff66d361", + "hash_cont_tokens": "2e590029ef41fbcd" + }, + "truncated": 0, + "non-truncated": 6136, + "padded": 6136, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_medicine|5": { + "hashes": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "40a933f829116f8d", + "hash_cont_tokens": "fe35cfa9c6ca802e" + }, + "truncated": 0, + "non-truncated": 1088, + "padded": 1088, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_psychology|5": { + "hashes": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "0dfb73a8eb3f692c", + "hash_cont_tokens": "f020fbddf72c8652" + }, + "truncated": 0, + "non-truncated": 2448, + "padded": 2448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-public_relations|5": { + "hashes": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "1710c6ba4c9f3cbd", + "hash_cont_tokens": "568f585a259965c1" + }, + "truncated": 0, + "non-truncated": 440, + "padded": 440, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-security_studies|5": { + "hashes": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "32a03f1f22a6e103", + "hash_cont_tokens": "cc6fd7cccd64cd5d" + }, + "truncated": 0, + "non-truncated": 980, + "padded": 980, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-sociology|5": { + "hashes": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "828999f7624cbe7e", + "hash_cont_tokens": "c3a3bdfd177eed5b" + }, + "truncated": 0, + "non-truncated": 804, + "padded": 804, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hashes": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "42054621e718dbee", + "hash_cont_tokens": "2568d0e8e36fa959" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-virology|5": { + "hashes": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "6c4f0aa4dc859c04", + "hash_cont_tokens": "926cf60b0891f374" + }, + "truncated": 0, + "non-truncated": 664, + "padded": 664, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-world_religions|5": { + "hashes": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "6c75d44e092ff24f", + "hash_cont_tokens": "c525a5de974c1ea3" + }, + "truncated": 0, + "non-truncated": 684, + "padded": 684, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|truthfulqa:mc|0": { + "hashes": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "2738d7ed7075faa7", + "hash_cont_tokens": "c014154380b74b9e" + }, + "truncated": 0, + "non-truncated": 9996, + 
"padded": 9996, + "non-padded": 0, + "effective_few_shots": 0.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "d84d18e9a963753d", + "hash_full_prompts": "12b540783521a8e6", + "hash_input_tokens": "5c73a7dce6ccf737", + "hash_cont_tokens": "fb1646e2bdd5fc38" + }, + "total_evaluation_time_secondes": "36935.77130699158", + "truncated": 0, + "non-truncated": 111019, + "padded": 110926, + "non-padded": 93, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/TheBloke/LongChat-13B-GPTQ/results_2023-11-04T20-06-00.633401.json b/eval-results/TheBloke/LongChat-13B-GPTQ/results_2023-11-04T20-06-00.633401.json new file mode 100644 index 0000000000000000000000000000000000000000..d7ac799e5cf599da12bc8bded2f7510dc5e2ed53 --- /dev/null +++ b/eval-results/TheBloke/LongChat-13B-GPTQ/results_2023-11-04T20-06-00.633401.json @@ -0,0 +1,107 @@ +{ + "config_general": { + "lighteval_sha": "167773f1d5d1647c60dadc31c9e731ab7dbcbbad", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "", + "model_name": "TheBloke/LongChat-13B-GPTQ", + "model_sha": "8ec25a29033b7be5daeafa26f08e1ea7cf232b98", + "model_dtype": "None", + "model_size": "6.92 GB" + }, + "results": { + "harness|drop|3": { + "em": 0.0, + "em_stderr": 0.0, + "f1": 5.243288590604027e-05, + "f1_stderr": 5.243288590604094e-05 + }, + "harness|gsm8k|5": { + "acc": 0.0, + "acc_stderr": 0.0 + }, + "harness|winogrande|5": { + "acc": 0.5114443567482242, + "acc_stderr": 0.014048804199859329 + }, + "all": { + "em": 0.0, + "em_stderr": 0.0, + "f1": 5.243288590604027e-05, + "f1_stderr": 5.243288590604094e-05, + "acc": 0.2557221783741121, + "acc_stderr": 0.007024402099929664 + } + }, + "versions": { + "all": 0, + "harness|drop|3": 1, + "harness|gsm8k|5": 0, + "harness|winogrande|5": 0 + }, + "config_tasks": { + "harness|drop": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|drop|3": { + "hashes": { + "hash_examples": "1d27416e8324e9a3", + "hash_full_prompts": "a5513ff9a741b385", + "hash_input_tokens": "fbe785c62d941897", + "hash_cont_tokens": "e72093e065ea3712" + }, + "truncated": 0, + "non_truncated": 9536, + "padded": 0, + "non_padded": 9536, + "effective_few_shots": 3.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "bda342e47b5099b2", + "hash_cont_tokens": "a16b4401961bdbd8" + }, + "truncated": 0, + "non_truncated": 1319, + "padded": 0, + "non_padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "c0bedf98cb040854", + "hash_cont_tokens": "f08975ad6f2d5864" + }, + "truncated": 0, + "non_truncated": 1267, + "padded": 2432, + "non_padded": 102, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "9b4d8993161e637d", + "hash_full_prompts": "08215e527b7e60a5", + "hash_input_tokens": "9f72a6bc6743d18f", + "hash_cont_tokens": "c6390464e333ce13" + }, + "truncated": 0, + "non_truncated": 12122, + "padded": 2432, + "non_padded": 10957, + "num_truncated_few_shots": 0, + "total_evaluation_time_secondes": 0 + } +} \ No newline at end of file diff --git 
a/eval-results/TheBloke/Manticore-13B-Chat-Pyg-Guanaco-SuperHOT-8K-GPTQ/results_2023-08-29T20-55-05.081055.json b/eval-results/TheBloke/Manticore-13B-Chat-Pyg-Guanaco-SuperHOT-8K-GPTQ/results_2023-08-29T20-55-05.081055.json new file mode 100644 index 0000000000000000000000000000000000000000..b79764f61cebc99825aa9cecc2531eb65e39a602 --- /dev/null +++ b/eval-results/TheBloke/Manticore-13B-Chat-Pyg-Guanaco-SuperHOT-8K-GPTQ/results_2023-08-29T20-55-05.081055.json @@ -0,0 +1,1366 @@ +{ + "config_general": { + "model_name": "TheBloke/Manticore-13B-Chat-Pyg-Guanaco-SuperHOT-8K-GPTQ", + "model_sha": "bd3c66e626c81de4977f197e1534bd3dfa2f569d", + "model_dtype": "torch.float16", + "lighteval_sha": "4b9a38c2102259daeb17d1d0294fc2b4ce7f4e63", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|arc:challenge|25": { + "acc": 0.5051194539249146, + "acc_stderr": 0.014610624890309157, + "acc_norm": 0.5281569965870307, + "acc_norm_stderr": 0.014588204105102203 + }, + "harness|hellaswag|10": { + "acc": 0.5962955586536547, + "acc_stderr": 0.00489636818576524, + "acc_norm": 0.7962557259510058, + "acc_norm_stderr": 0.004019578428155064 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.24, + "acc_stderr": 0.04292346959909284, + "acc_norm": 0.24, + "acc_norm_stderr": 0.04292346959909284 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.37037037037037035, + "acc_stderr": 0.04171654161354543, + "acc_norm": 0.37037037037037035, + "acc_norm_stderr": 0.04171654161354543 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.42105263157894735, + "acc_stderr": 0.040179012759817494, + "acc_norm": 0.42105263157894735, + "acc_norm_stderr": 0.040179012759817494 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.47, + "acc_stderr": 0.05016135580465919, + "acc_norm": 0.47, + "acc_norm_stderr": 0.05016135580465919 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.4226415094339623, + "acc_stderr": 0.03040233144576954, + "acc_norm": 0.4226415094339623, + "acc_norm_stderr": 0.03040233144576954 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.4305555555555556, + "acc_stderr": 0.04140685639111503, + "acc_norm": 0.4305555555555556, + "acc_norm_stderr": 0.04140685639111503 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.28, + "acc_stderr": 0.04512608598542127, + "acc_norm": 0.28, + "acc_norm_stderr": 0.04512608598542127 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.27, + "acc_stderr": 0.04461960433384741, + "acc_norm": 0.27, + "acc_norm_stderr": 0.04461960433384741 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.23, + "acc_stderr": 0.04229525846816506, + "acc_norm": 0.23, + "acc_norm_stderr": 0.04229525846816506 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.32947976878612717, + "acc_stderr": 0.03583901754736413, + "acc_norm": 0.32947976878612717, + "acc_norm_stderr": 0.03583901754736413 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.22549019607843138, + "acc_stderr": 0.041583075330832865, + "acc_norm": 0.22549019607843138, + "acc_norm_stderr": 0.041583075330832865 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.52, + "acc_stderr": 0.050211673156867795, + "acc_norm": 0.52, + "acc_norm_stderr": 0.050211673156867795 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.3404255319148936, + "acc_stderr": 0.03097669299853442, + "acc_norm": 
0.3404255319148936, + "acc_norm_stderr": 0.03097669299853442 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.3157894736842105, + "acc_stderr": 0.04372748290278006, + "acc_norm": 0.3157894736842105, + "acc_norm_stderr": 0.04372748290278006 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.33793103448275863, + "acc_stderr": 0.039417076320648906, + "acc_norm": 0.33793103448275863, + "acc_norm_stderr": 0.039417076320648906 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.25925925925925924, + "acc_stderr": 0.022569897074918417, + "acc_norm": 0.25925925925925924, + "acc_norm_stderr": 0.022569897074918417 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.2619047619047619, + "acc_stderr": 0.0393253768039287, + "acc_norm": 0.2619047619047619, + "acc_norm_stderr": 0.0393253768039287 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.32, + "acc_stderr": 0.046882617226215034, + "acc_norm": 0.32, + "acc_norm_stderr": 0.046882617226215034 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.42258064516129035, + "acc_stderr": 0.02810096472427264, + "acc_norm": 0.42258064516129035, + "acc_norm_stderr": 0.02810096472427264 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.30049261083743845, + "acc_stderr": 0.032257994762334846, + "acc_norm": 0.30049261083743845, + "acc_norm_stderr": 0.032257994762334846 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.48, + "acc_stderr": 0.050211673156867795, + "acc_norm": 0.48, + "acc_norm_stderr": 0.050211673156867795 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.22424242424242424, + "acc_stderr": 0.032568666616811015, + "acc_norm": 0.22424242424242424, + "acc_norm_stderr": 0.032568666616811015 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.494949494949495, + "acc_stderr": 0.035621707606254015, + "acc_norm": 0.494949494949495, + "acc_norm_stderr": 0.035621707606254015 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.5751295336787565, + "acc_stderr": 0.035674713352125395, + "acc_norm": 0.5751295336787565, + "acc_norm_stderr": 0.035674713352125395 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.3871794871794872, + "acc_stderr": 0.02469721693087894, + "acc_norm": 0.3871794871794872, + "acc_norm_stderr": 0.02469721693087894 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.24814814814814815, + "acc_stderr": 0.0263357394040558, + "acc_norm": 0.24814814814814815, + "acc_norm_stderr": 0.0263357394040558 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.36134453781512604, + "acc_stderr": 0.03120469122515002, + "acc_norm": 0.36134453781512604, + "acc_norm_stderr": 0.03120469122515002 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.23841059602649006, + "acc_stderr": 0.03479185572599661, + "acc_norm": 0.23841059602649006, + "acc_norm_stderr": 0.03479185572599661 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.5302752293577981, + "acc_stderr": 0.021397988604936965, + "acc_norm": 0.5302752293577981, + "acc_norm_stderr": 0.021397988604936965 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.27314814814814814, + "acc_stderr": 0.03038805130167812, + "acc_norm": 0.27314814814814814, + "acc_norm_stderr": 0.03038805130167812 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.25980392156862747, + "acc_stderr": 
0.030778554678693264, + "acc_norm": 0.25980392156862747, + "acc_norm_stderr": 0.030778554678693264 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.5611814345991561, + "acc_stderr": 0.032302649315470375, + "acc_norm": 0.5611814345991561, + "acc_norm_stderr": 0.032302649315470375 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.5381165919282511, + "acc_stderr": 0.033460150119732274, + "acc_norm": 0.5381165919282511, + "acc_norm_stderr": 0.033460150119732274 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.4580152671755725, + "acc_stderr": 0.04369802690578757, + "acc_norm": 0.4580152671755725, + "acc_norm_stderr": 0.04369802690578757 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.48760330578512395, + "acc_stderr": 0.04562951548180765, + "acc_norm": 0.48760330578512395, + "acc_norm_stderr": 0.04562951548180765 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.5185185185185185, + "acc_stderr": 0.04830366024635331, + "acc_norm": 0.5185185185185185, + "acc_norm_stderr": 0.04830366024635331 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.44785276073619634, + "acc_stderr": 0.03906947479456602, + "acc_norm": 0.44785276073619634, + "acc_norm_stderr": 0.03906947479456602 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.26785714285714285, + "acc_stderr": 0.04203277291467762, + "acc_norm": 0.26785714285714285, + "acc_norm_stderr": 0.04203277291467762 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.49514563106796117, + "acc_stderr": 0.049505043821289195, + "acc_norm": 0.49514563106796117, + "acc_norm_stderr": 0.049505043821289195 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.6752136752136753, + "acc_stderr": 0.03067902276549883, + "acc_norm": 0.6752136752136753, + "acc_norm_stderr": 0.03067902276549883 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.39, + "acc_stderr": 0.04902071300001975, + "acc_norm": 0.39, + "acc_norm_stderr": 0.04902071300001975 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.5708812260536399, + "acc_stderr": 0.01769938848312679, + "acc_norm": 0.5708812260536399, + "acc_norm_stderr": 0.01769938848312679 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.43352601156069365, + "acc_stderr": 0.026680134761679217, + "acc_norm": 0.43352601156069365, + "acc_norm_stderr": 0.026680134761679217 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.2536312849162011, + "acc_stderr": 0.014551553659369922, + "acc_norm": 0.2536312849162011, + "acc_norm_stderr": 0.014551553659369922 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.4019607843137255, + "acc_stderr": 0.028074158947600653, + "acc_norm": 0.4019607843137255, + "acc_norm_stderr": 0.028074158947600653 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.45980707395498394, + "acc_stderr": 0.028306190403305693, + "acc_norm": 0.45980707395498394, + "acc_norm_stderr": 0.028306190403305693 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.4660493827160494, + "acc_stderr": 0.02775653525734767, + "acc_norm": 0.4660493827160494, + "acc_norm_stderr": 0.02775653525734767 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.32269503546099293, + "acc_stderr": 0.02788913930053479, + "acc_norm": 0.32269503546099293, + "acc_norm_stderr": 0.02788913930053479 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.35658409387222945, + "acc_stderr": 0.012233642989273891, + "acc_norm": 0.35658409387222945, + "acc_norm_stderr": 
0.012233642989273891 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.39338235294117646, + "acc_stderr": 0.02967428828131118, + "acc_norm": 0.39338235294117646, + "acc_norm_stderr": 0.02967428828131118 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.4150326797385621, + "acc_stderr": 0.019933627776857425, + "acc_norm": 0.4150326797385621, + "acc_norm_stderr": 0.019933627776857425 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.44545454545454544, + "acc_stderr": 0.047605488214603246, + "acc_norm": 0.44545454545454544, + "acc_norm_stderr": 0.047605488214603246 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.42857142857142855, + "acc_stderr": 0.031680911612338825, + "acc_norm": 0.42857142857142855, + "acc_norm_stderr": 0.031680911612338825 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.5024875621890548, + "acc_stderr": 0.03535490150137289, + "acc_norm": 0.5024875621890548, + "acc_norm_stderr": 0.03535490150137289 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.68, + "acc_stderr": 0.046882617226215034, + "acc_norm": 0.68, + "acc_norm_stderr": 0.046882617226215034 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.3433734939759036, + "acc_stderr": 0.03696584317010601, + "acc_norm": 0.3433734939759036, + "acc_norm_stderr": 0.03696584317010601 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.5497076023391813, + "acc_stderr": 0.038158273659132366, + "acc_norm": 0.5497076023391813, + "acc_norm_stderr": 0.038158273659132366 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.35006119951040393, + "mc1_stderr": 0.016697949420151032, + "mc2": 0.5254779040118085, + "mc2_stderr": 0.01594442535756773 + }, + "all": { + "acc": 0.4034689621810347, + "acc_stderr": 0.03467877900913612, + "acc_norm": 0.4072485843837375, + "acc_norm_stderr": 0.03466353815247821, + "mc1": 0.35006119951040393, + "mc1_stderr": 0.016697949420151032, + "mc2": 0.5254779040118085, + "mc2_stderr": 0.01594442535756773 + } + }, + "versions": { + "harness|arc:challenge|25": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + 
"harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "all": 0 + }, + "config_tasks": { + "harness|arc:challenge": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + 
"harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task" + }, + "summary_tasks": { + "harness|arc:challenge|25": { + "hashes": { + "hash_examples": "17b0cae357c0259e", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "3722289b79076c44", + "hash_cont_tokens": "8210decc6ff6f7df" + }, + "truncated": 0, + "non-truncated": 4687, + "padded": 4687, + "non-padded": 0, + "effective_few_shots": 25.0, + "num_truncated_few_shots": 0 + }, + "harness|hellaswag|10": { + "hashes": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "ececd684171f1ef2", + "hash_cont_tokens": "b3b9e9017afa63af" + }, + "truncated": 0, + "non-truncated": 40168, + "padded": 40113, + "non-padded": 55, + "effective_few_shots": 10.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hashes": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "c54ff61ad0273dd7", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-anatomy|5": { + "hashes": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "be31a1e22aef5f90", + "hash_cont_tokens": "f11971a765cb609f" + }, + "truncated": 0, + "non-truncated": 540, + "padded": 540, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-astronomy|5": { + "hashes": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "277a7b1fad566940", + "hash_cont_tokens": "bf30e5d3f48250cb" + }, + "truncated": 0, + "non-truncated": 608, + 
"padded": 608, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-business_ethics|5": { + "hashes": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "ba552605bc116de5", + "hash_cont_tokens": "bc1dd9b2d995eb61" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hashes": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "428c7563d0b98ab9", + "hash_cont_tokens": "890a119624b3b935" + }, + "truncated": 0, + "non-truncated": 1060, + "padded": 1060, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_biology|5": { + "hashes": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "da036601573942e2", + "hash_cont_tokens": "875cde3af7a0ee14" + }, + "truncated": 0, + "non-truncated": 576, + "padded": 576, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_chemistry|5": { + "hashes": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "94e0196d6aded13d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_computer_science|5": { + "hashes": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "6e4d0f4a8d36690b", + "hash_cont_tokens": "ffc0fe414cdc4a83" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_mathematics|5": { + "hashes": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "614054d17109a25d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_medicine|5": { + "hashes": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "081bb2b524defd1c", + "hash_cont_tokens": "1f88b00d41957d82" + }, + "truncated": 0, + "non-truncated": 692, + "padded": 692, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_physics|5": { + "hashes": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "5421d9a1af86cbd4", + "hash_cont_tokens": "f7b8097afc16a47c" + }, + "truncated": 0, + "non-truncated": 408, + "padded": 408, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-computer_security|5": { + "hashes": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "5e6b70ecb333cf18", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-conceptual_physics|5": 
{ + "hashes": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "c2ef11a87264ceed", + "hash_cont_tokens": "aa0e8bc655f2f641" + }, + "truncated": 0, + "non-truncated": 940, + "padded": 940, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-econometrics|5": { + "hashes": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "ecaccd912a4c3978", + "hash_cont_tokens": "bfb7e3c3c88313f1" + }, + "truncated": 0, + "non-truncated": 456, + "padded": 456, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hashes": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "1590c84291399be8", + "hash_cont_tokens": "2425a3f084a591ef" + }, + "truncated": 0, + "non-truncated": 580, + "padded": 580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hashes": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "3269597f715b0da1", + "hash_cont_tokens": "f52691aef15a407b" + }, + "truncated": 0, + "non-truncated": 1512, + "padded": 1512, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-formal_logic|5": { + "hashes": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "a2800d20f3ab8d7c", + "hash_cont_tokens": "f515d598d9c21263" + }, + "truncated": 0, + "non-truncated": 504, + "padded": 504, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-global_facts|5": { + "hashes": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "94ed44b3772505ad", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_biology|5": { + "hashes": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "24423acb928db768", + "hash_cont_tokens": "bd85a4156a3613ee" + }, + "truncated": 0, + "non-truncated": 1240, + "padded": 1240, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hashes": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "831ff35c474e5cef", + "hash_cont_tokens": "a95c97af1c14e068" + }, + "truncated": 0, + "non-truncated": 812, + "padded": 812, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hashes": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "a20a96b44dcc5b30", + "hash_cont_tokens": "8abfedef914e33c9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hashes": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": 
"5002f4ac8b1562ca", + "hash_cont_tokens": "674fc454bdc5ac93" + }, + "truncated": 0, + "non-truncated": 660, + "padded": 656, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_geography|5": { + "hashes": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "7c5547c7da5bc793", + "hash_cont_tokens": "03a5012b916274ea" + }, + "truncated": 0, + "non-truncated": 792, + "padded": 792, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hashes": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "f62991cb6a496b05", + "hash_cont_tokens": "a83effb8f76b7d7c" + }, + "truncated": 0, + "non-truncated": 772, + "padded": 772, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hashes": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "4cef2aff6e3d59ed", + "hash_cont_tokens": "c583432ad27fcfe0" + }, + "truncated": 0, + "non-truncated": 1560, + "padded": 1560, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hashes": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "6e2577ea4082ed2b", + "hash_cont_tokens": "24f5dc613660300b" + }, + "truncated": 0, + "non-truncated": 1080, + "padded": 1080, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hashes": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "c5fc9aeb1079c8e4", + "hash_cont_tokens": "f47f041de50333b9" + }, + "truncated": 0, + "non-truncated": 952, + "padded": 952, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_physics|5": { + "hashes": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "555fc385cffa84ca", + "hash_cont_tokens": "ba2efcd283e938cc" + }, + "truncated": 0, + "non-truncated": 604, + "padded": 604, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hashes": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "febd23cbf9973b7f", + "hash_cont_tokens": "942069cd363844d9" + }, + "truncated": 0, + "non-truncated": 2180, + "padded": 2180, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hashes": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "400e55b56ee6fbd7", + "hash_cont_tokens": "955ed42b6f7fa019" + }, + "truncated": 0, + "non-truncated": 864, + "padded": 864, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hashes": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "c639cce12a46ebad", + "hash_cont_tokens": "cdd0b3dc06d933e5" + }, + "truncated": 0, 
+ "non-truncated": 816, + "padded": 816, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hashes": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "b9762065cce6f3a6", + "hash_cont_tokens": "9a864184946033ac" + }, + "truncated": 0, + "non-truncated": 948, + "padded": 948, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_aging|5": { + "hashes": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "541a75f071dcf579", + "hash_cont_tokens": "142a4a8a1138a214" + }, + "truncated": 0, + "non-truncated": 892, + "padded": 892, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_sexuality|5": { + "hashes": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "04269e5c5a257dd9", + "hash_cont_tokens": "bc54813e809b796d" + }, + "truncated": 0, + "non-truncated": 524, + "padded": 524, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-international_law|5": { + "hashes": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "d93ba9d9d38e4397", + "hash_cont_tokens": "dc45b45fcda18e5d" + }, + "truncated": 0, + "non-truncated": 484, + "padded": 484, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-jurisprudence|5": { + "hashes": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "9eeaccd2698b4f5a", + "hash_cont_tokens": "e3a8cd951b6e3469" + }, + "truncated": 0, + "non-truncated": 432, + "padded": 432, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hashes": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "b4f08f544f2b7576", + "hash_cont_tokens": "1e80dbd30f6453d5" + }, + "truncated": 0, + "non-truncated": 652, + "padded": 648, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-machine_learning|5": { + "hashes": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "900c2a51f1174b9f", + "hash_cont_tokens": "9b37da7777378ca9" + }, + "truncated": 0, + "non-truncated": 448, + "padded": 448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-management|5": { + "hashes": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "6b36efb4689c6eca", + "hash_cont_tokens": "a01d6d39a83c4597" + }, + "truncated": 0, + "non-truncated": 412, + "padded": 412, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-marketing|5": { + "hashes": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "2aaac78a0cfed47a", + "hash_cont_tokens": "6aeaed4d823c98aa" + }, + "truncated": 0, + "non-truncated": 936, + "padded": 936, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-medical_genetics|5": 
{ + "hashes": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "886ca823b41c094a", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-miscellaneous|5": { + "hashes": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "72fd71de7675e7d0", + "hash_cont_tokens": "9b0ab02a64603081" + }, + "truncated": 0, + "non-truncated": 3132, + "padded": 3132, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_disputes|5": { + "hashes": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "f3ca0dd8e7a1eb09", + "hash_cont_tokens": "8badf768f7b0467a" + }, + "truncated": 0, + "non-truncated": 1384, + "padded": 1354, + "non-padded": 30, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hashes": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "3e793631e951f23c", + "hash_cont_tokens": "32ae620376b2bbba" + }, + "truncated": 0, + "non-truncated": 3580, + "padded": 3580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-nutrition|5": { + "hashes": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "59753c2144ea93af", + "hash_cont_tokens": "3071def75bacc404" + }, + "truncated": 0, + "non-truncated": 1224, + "padded": 1224, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-philosophy|5": { + "hashes": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "bd8d3dbed15a8c34", + "hash_cont_tokens": "9f6ff69d23a48783" + }, + "truncated": 0, + "non-truncated": 1244, + "padded": 1244, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-prehistory|5": { + "hashes": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "3573cd87facbb7c5", + "hash_cont_tokens": "de469d2b981e32a3" + }, + "truncated": 0, + "non-truncated": 1296, + "padded": 1296, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_accounting|5": { + "hashes": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "17e721bc1a7cbb47", + "hash_cont_tokens": "c46f74d2dfc7b13b" + }, + "truncated": 0, + "non-truncated": 1128, + "padded": 1128, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_law|5": { + "hashes": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "c9f7583fff66d361", + "hash_cont_tokens": "2e590029ef41fbcd" + }, + "truncated": 0, + "non-truncated": 6136, + "padded": 6136, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_medicine|5": { + "hashes": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "40a933f829116f8d", + 
"hash_cont_tokens": "fe35cfa9c6ca802e" + }, + "truncated": 0, + "non-truncated": 1088, + "padded": 1088, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_psychology|5": { + "hashes": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "0dfb73a8eb3f692c", + "hash_cont_tokens": "f020fbddf72c8652" + }, + "truncated": 0, + "non-truncated": 2448, + "padded": 2448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-public_relations|5": { + "hashes": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "1710c6ba4c9f3cbd", + "hash_cont_tokens": "568f585a259965c1" + }, + "truncated": 0, + "non-truncated": 440, + "padded": 440, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-security_studies|5": { + "hashes": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "32a03f1f22a6e103", + "hash_cont_tokens": "cc6fd7cccd64cd5d" + }, + "truncated": 0, + "non-truncated": 980, + "padded": 980, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-sociology|5": { + "hashes": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "828999f7624cbe7e", + "hash_cont_tokens": "c3a3bdfd177eed5b" + }, + "truncated": 0, + "non-truncated": 804, + "padded": 804, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hashes": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "42054621e718dbee", + "hash_cont_tokens": "2568d0e8e36fa959" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-virology|5": { + "hashes": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "6c4f0aa4dc859c04", + "hash_cont_tokens": "926cf60b0891f374" + }, + "truncated": 0, + "non-truncated": 664, + "padded": 664, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-world_religions|5": { + "hashes": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "6c75d44e092ff24f", + "hash_cont_tokens": "c525a5de974c1ea3" + }, + "truncated": 0, + "non-truncated": 684, + "padded": 684, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|truthfulqa:mc|0": { + "hashes": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "2738d7ed7075faa7", + "hash_cont_tokens": "c014154380b74b9e" + }, + "truncated": 0, + "non-truncated": 9996, + "padded": 9996, + "non-padded": 0, + "effective_few_shots": 0.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "d84d18e9a963753d", + "hash_full_prompts": "12b540783521a8e6", + "hash_input_tokens": "5c73a7dce6ccf737", + "hash_cont_tokens": "fb1646e2bdd5fc38" + }, + "total_evaluation_time_secondes": "9928.875678539276", + "truncated": 0, + "non-truncated": 111019, + "padded": 110926, + "non-padded": 93, + 
"num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/TheBloke/Manticore-13B-Chat-Pyg-Guanaco-SuperHOT-8K-GPTQ/results_2023-11-05T10-35-18.406812.json b/eval-results/TheBloke/Manticore-13B-Chat-Pyg-Guanaco-SuperHOT-8K-GPTQ/results_2023-11-05T10-35-18.406812.json new file mode 100644 index 0000000000000000000000000000000000000000..c12592d453bb78e065198e11615799d7918a3368 --- /dev/null +++ b/eval-results/TheBloke/Manticore-13B-Chat-Pyg-Guanaco-SuperHOT-8K-GPTQ/results_2023-11-05T10-35-18.406812.json @@ -0,0 +1,107 @@ +{ + "config_general": { + "lighteval_sha": "167773f1d5d1647c60dadc31c9e731ab7dbcbbad", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "", + "model_name": "TheBloke/Manticore-13B-Chat-Pyg-Guanaco-SuperHOT-8K-GPTQ", + "model_sha": "bd3c66e626c81de4977f197e1534bd3dfa2f569d", + "model_dtype": "torch.float16", + "model_size": "6.92 GB" + }, + "results": { + "harness|drop|3": { + "em": 0.13443791946308725, + "em_stderr": 0.0034934107615894167, + "f1": 0.21990876677852436, + "f1_stderr": 0.00371472104125086 + }, + "harness|gsm8k|5": { + "acc": 0.001516300227445034, + "acc_stderr": 0.0010717793485492606 + }, + "harness|winogrande|5": { + "acc": 0.7182320441988951, + "acc_stderr": 0.012643326011852944 + }, + "all": { + "em": 0.13443791946308725, + "em_stderr": 0.0034934107615894167, + "f1": 0.21990876677852436, + "f1_stderr": 0.00371472104125086, + "acc": 0.3598741722131701, + "acc_stderr": 0.006857552680201102 + } + }, + "versions": { + "all": 0, + "harness|drop|3": 1, + "harness|gsm8k|5": 0, + "harness|winogrande|5": 0 + }, + "config_tasks": { + "harness|drop": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|drop|3": { + "hashes": { + "hash_examples": "1d27416e8324e9a3", + "hash_full_prompts": "a5513ff9a741b385", + "hash_input_tokens": "fbe785c62d941897", + "hash_cont_tokens": "790d59a245bcebad" + }, + "truncated": 0, + "non_truncated": 9536, + "padded": 0, + "non_padded": 9536, + "effective_few_shots": 3.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "bda342e47b5099b2", + "hash_cont_tokens": "8978e4d96d9aafd5" + }, + "truncated": 0, + "non_truncated": 1319, + "padded": 0, + "non_padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "c0bedf98cb040854", + "hash_cont_tokens": "f08975ad6f2d5864" + }, + "truncated": 0, + "non_truncated": 1267, + "padded": 2432, + "non_padded": 102, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "9b4d8993161e637d", + "hash_full_prompts": "08215e527b7e60a5", + "hash_input_tokens": "9f72a6bc6743d18f", + "hash_cont_tokens": "b4aaa521df5228ab" + }, + "truncated": 0, + "non_truncated": 12122, + "padded": 2432, + "non_padded": 10957, + "num_truncated_few_shots": 0, + "total_evaluation_time_secondes": 0 + } +} \ No newline at end of file diff --git a/eval-results/TheBloke/Manticore-13B-Chat-Pyg-Guanaco-SuperHOT-8K-GPTQ/results_2023-11-07T10-50-58.801361.json b/eval-results/TheBloke/Manticore-13B-Chat-Pyg-Guanaco-SuperHOT-8K-GPTQ/results_2023-11-07T10-50-58.801361.json new file mode 100644 
index 0000000000000000000000000000000000000000..454223d9218ec79484a35cb21d6547515766aee4 --- /dev/null +++ b/eval-results/TheBloke/Manticore-13B-Chat-Pyg-Guanaco-SuperHOT-8K-GPTQ/results_2023-11-07T10-50-58.801361.json @@ -0,0 +1,107 @@ +{ + "config_general": { + "lighteval_sha": "167773f1d5d1647c60dadc31c9e731ab7dbcbbad", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "", + "model_name": "TheBloke/Manticore-13B-Chat-Pyg-Guanaco-SuperHOT-8K-GPTQ", + "model_sha": "bd3c66e626c81de4977f197e1534bd3dfa2f569d", + "model_dtype": "torch.float16", + "model_size": "6.92 GB" + }, + "results": { + "harness|drop|3": { + "em": 0.13485738255033558, + "em_stderr": 0.003498008556560615, + "f1": 0.2201814177852358, + "f1_stderr": 0.003718008519979711 + }, + "harness|gsm8k|5": { + "acc": 0.001516300227445034, + "acc_stderr": 0.0010717793485492606 + }, + "harness|winogrande|5": { + "acc": 0.7182320441988951, + "acc_stderr": 0.012643326011852944 + }, + "all": { + "em": 0.13485738255033558, + "em_stderr": 0.003498008556560615, + "f1": 0.2201814177852358, + "f1_stderr": 0.003718008519979711, + "acc": 0.3598741722131701, + "acc_stderr": 0.006857552680201102 + } + }, + "versions": { + "all": 0, + "harness|drop|3": 1, + "harness|gsm8k|5": 0, + "harness|winogrande|5": 0 + }, + "config_tasks": { + "harness|drop": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|drop|3": { + "hashes": { + "hash_examples": "1d27416e8324e9a3", + "hash_full_prompts": "a5513ff9a741b385", + "hash_input_tokens": "fbe785c62d941897", + "hash_cont_tokens": "30c7582e92d7d1e9" + }, + "truncated": 0, + "non_truncated": 9536, + "padded": 0, + "non_padded": 9536, + "effective_few_shots": 3.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "bda342e47b5099b2", + "hash_cont_tokens": "2418e1fe0d1c450b" + }, + "truncated": 0, + "non_truncated": 1319, + "padded": 0, + "non_padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "c0bedf98cb040854", + "hash_cont_tokens": "f08975ad6f2d5864" + }, + "truncated": 0, + "non_truncated": 1267, + "padded": 2432, + "non_padded": 102, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "9b4d8993161e637d", + "hash_full_prompts": "08215e527b7e60a5", + "hash_input_tokens": "9f72a6bc6743d18f", + "hash_cont_tokens": "e67af21147a06099" + }, + "truncated": 0, + "non_truncated": 12122, + "padded": 2432, + "non_padded": 10957, + "num_truncated_few_shots": 0, + "total_evaluation_time_secondes": 0 + } +} \ No newline at end of file diff --git a/eval-results/TheBloke/Mixtral-8x7B-v0.1-GPTQ/results_2023-12-16T08-25-08.853393.json b/eval-results/TheBloke/Mixtral-8x7B-v0.1-GPTQ/results_2023-12-16T08-25-08.853393.json new file mode 100644 index 0000000000000000000000000000000000000000..b8cf80f01308bbe49cfa53aeebde9ab15d8589a4 --- /dev/null +++ b/eval-results/TheBloke/Mixtral-8x7B-v0.1-GPTQ/results_2023-12-16T08-25-08.853393.json @@ -0,0 +1,1409 @@ +{ + "config_general": { + "lighteval_sha": "0e4607eff593f6f842aeaa0e5fa6760f58b9d1e9", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + 
"override_batch_size": 1, + "max_samples": null, + "job_id": "", + "start_time": 311946.199044127, + "end_time": 346061.60772738, + "total_evaluation_time_secondes": "34115.40868325299", + "model_name": "TheBloke/Mixtral-8x7B-v0.1-GPTQ", + "model_sha": "7d1eb57b65f823458e27509cd0aac7172f54a260", + "model_dtype": "None", + "model_size": "22.68 GB" + }, + "results": { + "harness|arc:challenge|25": { + "acc": 0.6168941979522184, + "acc_stderr": 0.014206472661672883, + "acc_norm": 0.6518771331058021, + "acc_norm_stderr": 0.013921008595179347 + }, + "harness|hellaswag|10": { + "acc": 0.6413065126468831, + "acc_stderr": 0.004786368011500458, + "acc_norm": 0.8472415853415655, + "acc_norm_stderr": 0.0035901923719696637 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.35, + "acc_stderr": 0.047937248544110196, + "acc_norm": 0.35, + "acc_norm_stderr": 0.047937248544110196 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.6666666666666666, + "acc_stderr": 0.04072314811876837, + "acc_norm": 0.6666666666666666, + "acc_norm_stderr": 0.04072314811876837 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.7631578947368421, + "acc_stderr": 0.034597776068105365, + "acc_norm": 0.7631578947368421, + "acc_norm_stderr": 0.034597776068105365 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.68, + "acc_stderr": 0.046882617226215034, + "acc_norm": 0.68, + "acc_norm_stderr": 0.046882617226215034 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.7547169811320755, + "acc_stderr": 0.0264803571798957, + "acc_norm": 0.7547169811320755, + "acc_norm_stderr": 0.0264803571798957 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.8194444444444444, + "acc_stderr": 0.03216600808802269, + "acc_norm": 0.8194444444444444, + "acc_norm_stderr": 0.03216600808802269 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.49, + "acc_stderr": 0.05024183937956911, + "acc_norm": 0.49, + "acc_norm_stderr": 0.05024183937956911 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.53, + "acc_stderr": 0.050161355804659205, + "acc_norm": 0.53, + "acc_norm_stderr": 0.050161355804659205 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.55, + "acc_stderr": 0.05, + "acc_norm": 0.55, + "acc_norm_stderr": 0.05 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.6647398843930635, + "acc_stderr": 0.03599586301247077, + "acc_norm": 0.6647398843930635, + "acc_norm_stderr": 0.03599586301247077 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.49019607843137253, + "acc_stderr": 0.04974229460422817, + "acc_norm": 0.49019607843137253, + "acc_norm_stderr": 0.04974229460422817 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.79, + "acc_stderr": 0.040936018074033256, + "acc_norm": 0.79, + "acc_norm_stderr": 0.040936018074033256 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.6425531914893617, + "acc_stderr": 0.031329417894764254, + "acc_norm": 0.6425531914893617, + "acc_norm_stderr": 0.031329417894764254 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.543859649122807, + "acc_stderr": 0.046854730419077895, + "acc_norm": 0.543859649122807, + "acc_norm_stderr": 0.046854730419077895 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.6620689655172414, + "acc_stderr": 0.039417076320648906, + "acc_norm": 0.6620689655172414, + "acc_norm_stderr": 0.039417076320648906 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.4576719576719577, + "acc_stderr": 
0.02565886886205832, + "acc_norm": 0.4576719576719577, + "acc_norm_stderr": 0.02565886886205832 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.4603174603174603, + "acc_stderr": 0.04458029125470973, + "acc_norm": 0.4603174603174603, + "acc_norm_stderr": 0.04458029125470973 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.44, + "acc_stderr": 0.04988876515698589, + "acc_norm": 0.44, + "acc_norm_stderr": 0.04988876515698589 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.8290322580645161, + "acc_stderr": 0.021417242936321582, + "acc_norm": 0.8290322580645161, + "acc_norm_stderr": 0.021417242936321582 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.5960591133004927, + "acc_stderr": 0.03452453903822033, + "acc_norm": 0.5960591133004927, + "acc_norm_stderr": 0.03452453903822033 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.72, + "acc_stderr": 0.045126085985421276, + "acc_norm": 0.72, + "acc_norm_stderr": 0.045126085985421276 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.7878787878787878, + "acc_stderr": 0.031922715695482995, + "acc_norm": 0.7878787878787878, + "acc_norm_stderr": 0.031922715695482995 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.8383838383838383, + "acc_stderr": 0.02622591986362928, + "acc_norm": 0.8383838383838383, + "acc_norm_stderr": 0.02622591986362928 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.9067357512953368, + "acc_stderr": 0.02098685459328972, + "acc_norm": 0.9067357512953368, + "acc_norm_stderr": 0.02098685459328972 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.6666666666666666, + "acc_stderr": 0.023901157979402534, + "acc_norm": 0.6666666666666666, + "acc_norm_stderr": 0.023901157979402534 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.34074074074074073, + "acc_stderr": 0.02889774874113114, + "acc_norm": 0.34074074074074073, + "acc_norm_stderr": 0.02889774874113114 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.7563025210084033, + "acc_stderr": 0.027886828078380558, + "acc_norm": 0.7563025210084033, + "acc_norm_stderr": 0.027886828078380558 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.44370860927152317, + "acc_stderr": 0.04056527902281732, + "acc_norm": 0.44370860927152317, + "acc_norm_stderr": 0.04056527902281732 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.8752293577981651, + "acc_stderr": 0.014168298359156327, + "acc_norm": 0.8752293577981651, + "acc_norm_stderr": 0.014168298359156327 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.625, + "acc_stderr": 0.033016908987210894, + "acc_norm": 0.625, + "acc_norm_stderr": 0.033016908987210894 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.8480392156862745, + "acc_stderr": 0.025195658428931785, + "acc_norm": 0.8480392156862745, + "acc_norm_stderr": 0.025195658428931785 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.8481012658227848, + "acc_stderr": 0.02336387809663245, + "acc_norm": 0.8481012658227848, + "acc_norm_stderr": 0.02336387809663245 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.726457399103139, + "acc_stderr": 0.029918586707798824, + "acc_norm": 0.726457399103139, + "acc_norm_stderr": 0.029918586707798824 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.8473282442748091, + "acc_stderr": 0.031545216720054725, + 
"acc_norm": 0.8473282442748091, + "acc_norm_stderr": 0.031545216720054725 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.8842975206611571, + "acc_stderr": 0.02919980245562281, + "acc_norm": 0.8842975206611571, + "acc_norm_stderr": 0.02919980245562281 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.8425925925925926, + "acc_stderr": 0.03520703990517965, + "acc_norm": 0.8425925925925926, + "acc_norm_stderr": 0.03520703990517965 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.7607361963190185, + "acc_stderr": 0.0335195387952127, + "acc_norm": 0.7607361963190185, + "acc_norm_stderr": 0.0335195387952127 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.5, + "acc_stderr": 0.04745789978762494, + "acc_norm": 0.5, + "acc_norm_stderr": 0.04745789978762494 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.883495145631068, + "acc_stderr": 0.031766839486404075, + "acc_norm": 0.883495145631068, + "acc_norm_stderr": 0.031766839486404075 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.9230769230769231, + "acc_stderr": 0.017456987872436183, + "acc_norm": 0.9230769230769231, + "acc_norm_stderr": 0.017456987872436183 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.79, + "acc_stderr": 0.040936018074033256, + "acc_norm": 0.79, + "acc_norm_stderr": 0.040936018074033256 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.8786717752234994, + "acc_stderr": 0.011675913883906723, + "acc_norm": 0.8786717752234994, + "acc_norm_stderr": 0.011675913883906723 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.7601156069364162, + "acc_stderr": 0.022989592543123563, + "acc_norm": 0.7601156069364162, + "acc_norm_stderr": 0.022989592543123563 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.3743016759776536, + "acc_stderr": 0.01618544417945717, + "acc_norm": 0.3743016759776536, + "acc_norm_stderr": 0.01618544417945717 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.8104575163398693, + "acc_stderr": 0.02244235826333621, + "acc_norm": 0.8104575163398693, + "acc_norm_stderr": 0.02244235826333621 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.7717041800643086, + "acc_stderr": 0.023839303311398195, + "acc_norm": 0.7717041800643086, + "acc_norm_stderr": 0.023839303311398195 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.8148148148148148, + "acc_stderr": 0.021613809395224805, + "acc_norm": 0.8148148148148148, + "acc_norm_stderr": 0.021613809395224805 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.5709219858156028, + "acc_stderr": 0.029525914302558562, + "acc_norm": 0.5709219858156028, + "acc_norm_stderr": 0.029525914302558562 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.5071707953063885, + "acc_stderr": 0.012768922739553304, + "acc_norm": 0.5071707953063885, + "acc_norm_stderr": 0.012768922739553304 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.7683823529411765, + "acc_stderr": 0.025626533803777562, + "acc_norm": 0.7683823529411765, + "acc_norm_stderr": 0.025626533803777562 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.7418300653594772, + "acc_stderr": 0.01770453165325007, + "acc_norm": 0.7418300653594772, + "acc_norm_stderr": 0.01770453165325007 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.6818181818181818, + "acc_stderr": 0.044612721759105085, + "acc_norm": 0.6818181818181818, + "acc_norm_stderr": 0.044612721759105085 + }, + 
"harness|hendrycksTest-security_studies|5": { + "acc": 0.763265306122449, + "acc_stderr": 0.027212835884073142, + "acc_norm": 0.763265306122449, + "acc_norm_stderr": 0.027212835884073142 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.8756218905472637, + "acc_stderr": 0.023335401790166327, + "acc_norm": 0.8756218905472637, + "acc_norm_stderr": 0.023335401790166327 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.9, + "acc_stderr": 0.030151134457776334, + "acc_norm": 0.9, + "acc_norm_stderr": 0.030151134457776334 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.4939759036144578, + "acc_stderr": 0.03892212195333045, + "acc_norm": 0.4939759036144578, + "acc_norm_stderr": 0.03892212195333045 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.8654970760233918, + "acc_stderr": 0.026168221344662297, + "acc_norm": 0.8654970760233918, + "acc_norm_stderr": 0.026168221344662297 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.2998776009791922, + "mc1_stderr": 0.016040352966713616, + "mc2": 0.4543006274006171, + "mc2_stderr": 0.014099486144642947 + }, + "harness|winogrande|5": { + "acc": 0.8113654301499605, + "acc_stderr": 0.010995172318019785 + }, + "harness|gsm8k|5": { + "acc": 0.4829416224412434, + "acc_stderr": 0.013764467123761316 + }, + "all": { + "acc": 0.6905952821572917, + "acc_stderr": 0.030759474803252005, + "acc_norm": 0.696151240116133, + "acc_norm_stderr": 0.03135740142119605, + "mc1": 0.2998776009791922, + "mc1_stderr": 0.016040352966713616, + "mc2": 0.4543006274006171, + "mc2_stderr": 0.014099486144642947 + } + }, + "versions": { + "all": 0, + "harness|arc:challenge|25": 0, + "harness|gsm8k|5": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + 
"harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "harness|winogrande|5": 0 + }, + "config_tasks": { + "harness|arc:challenge": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness 
task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|arc:challenge|25": { + "hashes": { + "hash_examples": "17b0cae357c0259e", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "9bcd0d1d37471713", + "hash_cont_tokens": "289aa98c400841d8" + }, + "truncated": 0, + "non_truncated": 1172, + "padded": 4670, + "non_padded": 17, + "effective_few_shots": 25.0, + "num_truncated_few_shots": 0 + }, + "harness|hellaswag|10": { + "hashes": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "80b8c6d79740318e", + "hash_cont_tokens": "ac460260c3e6efc9" + }, + "truncated": 0, + "non_truncated": 10042, + "padded": 40101, + "non_padded": 67, + "effective_few_shots": 10.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hashes": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "b813d36287c6556c", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-anatomy|5": { + "hashes": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "09dc2380497f7a47", + "hash_cont_tokens": "a52a4f60d98cbe5c" + }, + "truncated": 0, + "non_truncated": 135, + "padded": 540, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-astronomy|5": { + "hashes": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "68ca3220b0fdd1f3", + "hash_cont_tokens": "10f7d8eeba97841d" + }, + "truncated": 0, + "non_truncated": 152, + "padded": 608, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-business_ethics|5": { + "hashes": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "bd14ef1320de241e", + "hash_cont_tokens": "17b868b63507f9a3" + }, + 
"truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hashes": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "d96186ab98017c43", + "hash_cont_tokens": "edef9975ba9165b5" + }, + "truncated": 0, + "non_truncated": 265, + "padded": 1060, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_biology|5": { + "hashes": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "424136b34e95b200", + "hash_cont_tokens": "0aa103ec6602280b" + }, + "truncated": 0, + "non_truncated": 144, + "padded": 576, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_chemistry|5": { + "hashes": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "8dd8b80e336bbe54", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_computer_science|5": { + "hashes": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "145d4cef8ca2261d", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_mathematics|5": { + "hashes": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "561995d32d2b25c4", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_medicine|5": { + "hashes": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "6a258a9d4418599c", + "hash_cont_tokens": "1979021dbc698754" + }, + "truncated": 0, + "non_truncated": 173, + "padded": 692, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_physics|5": { + "hashes": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "fa5e0d5b5f97b66a", + "hash_cont_tokens": "7cf7fe2bab00acbd" + }, + "truncated": 0, + "non_truncated": 102, + "padded": 408, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-computer_security|5": { + "hashes": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "07d27397edfae492", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hashes": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "da5e6c3c8eb17da6", + "hash_cont_tokens": "903f64eed2b0d217" + }, + "truncated": 0, + "non_truncated": 235, + "padded": 940, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + 
"harness|hendrycksTest-econometrics|5": { + "hashes": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "f6ba8e358bdb523e", + "hash_cont_tokens": "721ae6c5302c4bf2" + }, + "truncated": 0, + "non_truncated": 114, + "padded": 456, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hashes": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "b2459da4c5ca8590", + "hash_cont_tokens": "15a738960ed3e587" + }, + "truncated": 0, + "non_truncated": 145, + "padded": 575, + "non_padded": 5, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hashes": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "0b969d9ad706a13a", + "hash_cont_tokens": "c96470462fc71683" + }, + "truncated": 0, + "non_truncated": 378, + "padded": 1512, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-formal_logic|5": { + "hashes": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "02bc3eb5f90da86e", + "hash_cont_tokens": "0e1ce025c9d6ee7e" + }, + "truncated": 0, + "non_truncated": 126, + "padded": 504, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-global_facts|5": { + "hashes": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "3d5106918bcbeb43", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_biology|5": { + "hashes": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "7b089392db2dabbd", + "hash_cont_tokens": "e34d57f7d3c4ca16" + }, + "truncated": 0, + "non_truncated": 310, + "padded": 1240, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hashes": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "ba90b2ffed1c067d", + "hash_cont_tokens": "e8482d44df4b3740" + }, + "truncated": 0, + "non_truncated": 203, + "padded": 812, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hashes": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "60eeec309ef0717f", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hashes": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "5e5e8bf3808e0ead", + "hash_cont_tokens": "d63e679a49418339" + }, + "truncated": 0, + "non_truncated": 165, + "padded": 656, + "non_padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_geography|5": { + "hashes": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": 
"ee5789fcc1a81b1e", + "hash_input_tokens": "4da9b741d4e7ea78", + "hash_cont_tokens": "d78483e286d06f1a" + }, + "truncated": 0, + "non_truncated": 198, + "padded": 792, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hashes": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "acb4bc872ac86ed7", + "hash_cont_tokens": "691cdff71ff5fe57" + }, + "truncated": 0, + "non_truncated": 193, + "padded": 772, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hashes": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "840fc6403eb69ab0", + "hash_cont_tokens": "d5ad4c5bdca967ad" + }, + "truncated": 0, + "non_truncated": 390, + "padded": 1560, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hashes": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "3629a7f2cd17faeb", + "hash_cont_tokens": "8f631ca5687dd0d4" + }, + "truncated": 0, + "non_truncated": 270, + "padded": 1080, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hashes": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "6846f684260e3997", + "hash_cont_tokens": "7321048a28451473" + }, + "truncated": 0, + "non_truncated": 238, + "padded": 952, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_physics|5": { + "hashes": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "85aee25d6bdad94a", + "hash_cont_tokens": "bb137581f269861c" + }, + "truncated": 0, + "non_truncated": 151, + "padded": 604, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hashes": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "290b66d6d666a35f", + "hash_cont_tokens": "b455cab2675bd863" + }, + "truncated": 0, + "non_truncated": 545, + "padded": 2180, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hashes": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "a77a7668b437bc82", + "hash_cont_tokens": "1b3196fec7e58037" + }, + "truncated": 0, + "non_truncated": 216, + "padded": 864, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hashes": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "63548c7fa9ba7a78", + "hash_cont_tokens": "a331dedc2aa01b3e" + }, + "truncated": 0, + "non_truncated": 204, + "padded": 816, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hashes": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "83c5da18bfa50812", + 
"hash_cont_tokens": "d0fbe030b8c8c2bf" + }, + "truncated": 0, + "non_truncated": 237, + "padded": 948, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_aging|5": { + "hashes": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "bebbd11f22006685", + "hash_cont_tokens": "1dd29c3755494850" + }, + "truncated": 0, + "non_truncated": 223, + "padded": 892, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_sexuality|5": { + "hashes": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "7b85ee9b8ee54f4f", + "hash_cont_tokens": "c85573f663c10691" + }, + "truncated": 0, + "non_truncated": 131, + "padded": 524, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-international_law|5": { + "hashes": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "7bfc55ab7065943e", + "hash_cont_tokens": "d263804ba918154f" + }, + "truncated": 0, + "non_truncated": 121, + "padded": 484, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-jurisprudence|5": { + "hashes": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "69573f1675e053c6", + "hash_cont_tokens": "581986691a84ece8" + }, + "truncated": 0, + "non_truncated": 108, + "padded": 432, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hashes": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "552324ef20094bdc", + "hash_cont_tokens": "55a858b28bbda458" + }, + "truncated": 0, + "non_truncated": 163, + "padded": 652, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-machine_learning|5": { + "hashes": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "96449357a7318905", + "hash_cont_tokens": "e99d3d3efd4ac7a3" + }, + "truncated": 0, + "non_truncated": 112, + "padded": 448, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-management|5": { + "hashes": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "3b849249168e3b88", + "hash_cont_tokens": "13d9dc56bca34726" + }, + "truncated": 0, + "non_truncated": 103, + "padded": 412, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-marketing|5": { + "hashes": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "af0e186f2756b70d", + "hash_cont_tokens": "2700ea26933916a2" + }, + "truncated": 0, + "non_truncated": 234, + "padded": 936, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-medical_genetics|5": { + "hashes": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "9f6a6de16509b6d9", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 
0 + }, + "harness|hendrycksTest-miscellaneous|5": { + "hashes": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "9194406d589f7c10", + "hash_cont_tokens": "7bf4341c79587250" + }, + "truncated": 0, + "non_truncated": 783, + "padded": 3132, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_disputes|5": { + "hashes": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "769486efc74d9f8e", + "hash_cont_tokens": "38a48e9de6976f00" + }, + "truncated": 0, + "non_truncated": 346, + "padded": 1384, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hashes": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "a90fd4dd90959dad", + "hash_cont_tokens": "761c4dc187689d89" + }, + "truncated": 0, + "non_truncated": 895, + "padded": 3580, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-nutrition|5": { + "hashes": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "1a3b843e66efd29b", + "hash_cont_tokens": "65005bd7d6f6012a" + }, + "truncated": 0, + "non_truncated": 306, + "padded": 1224, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-philosophy|5": { + "hashes": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "09820001a3d00013", + "hash_cont_tokens": "0b47934fb6314dec" + }, + "truncated": 0, + "non_truncated": 311, + "padded": 1244, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-prehistory|5": { + "hashes": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "7c4ec364ce2768c7", + "hash_cont_tokens": "3f20acd855ee0a29" + }, + "truncated": 0, + "non_truncated": 324, + "padded": 1296, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_accounting|5": { + "hashes": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "ced0534574d0ae3f", + "hash_cont_tokens": "8f122ba881355d4b" + }, + "truncated": 0, + "non_truncated": 282, + "padded": 1128, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_law|5": { + "hashes": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "bcbdbbde22ec73e3", + "hash_cont_tokens": "90d5df417c4d3fd3" + }, + "truncated": 0, + "non_truncated": 1534, + "padded": 6136, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_medicine|5": { + "hashes": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "c54d753563114d45", + "hash_cont_tokens": "4a2d2988884f7f70" + }, + "truncated": 0, + "non_truncated": 272, + "padded": 1088, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_psychology|5": { + "hashes": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + 
"hash_input_tokens": "b75dc55c0e32fa52", + "hash_cont_tokens": "e0a952cb8a9c81de" + }, + "truncated": 0, + "non_truncated": 612, + "padded": 2448, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-public_relations|5": { + "hashes": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "5ccdc8ec8db99622", + "hash_cont_tokens": "1fa77a8dff3922b8" + }, + "truncated": 0, + "non_truncated": 110, + "padded": 440, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-security_studies|5": { + "hashes": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "ca8497342e5b1d57", + "hash_cont_tokens": "81fc9cb3cbdd52db" + }, + "truncated": 0, + "non_truncated": 245, + "padded": 980, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-sociology|5": { + "hashes": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "069c76424fbd3dab", + "hash_cont_tokens": "2a0493252ed2cf43" + }, + "truncated": 0, + "non_truncated": 201, + "padded": 804, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hashes": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "a7e393a626169576", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-virology|5": { + "hashes": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "bf99dc973e3a650d", + "hash_cont_tokens": "5ab892d003b00c98" + }, + "truncated": 0, + "non_truncated": 166, + "padded": 664, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-world_religions|5": { + "hashes": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "1761cfaf21797065", + "hash_cont_tokens": "15a5e5dbdfbb8568" + }, + "truncated": 0, + "non_truncated": 171, + "padded": 684, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|truthfulqa:mc|0": { + "hashes": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "298b43914bbdf4ca", + "hash_cont_tokens": "5a8d4bb398b1c3c0" + }, + "truncated": 0, + "non_truncated": 817, + "padded": 9996, + "non_padded": 0, + "effective_few_shots": 0.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "31aa3477d959f771", + "hash_cont_tokens": "618558fb93c0f288" + }, + "truncated": 0, + "non_truncated": 1267, + "padded": 2534, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "6af0ae8cfe684f50", + "hash_cont_tokens": "25f9ceda4b8caf7e" + }, + "truncated": 0, + "non_truncated": 1319, + "padded": 0, + "non_padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + 
}, + "summary_general": { + "hashes": { + "hash_examples": "3b7fa57a057f9415", + "hash_full_prompts": "63615fc50fc9417c", + "hash_input_tokens": "9c04e828ae29cacc", + "hash_cont_tokens": "62631ae16ec183aa" + }, + "truncated": 0, + "non_truncated": 28659, + "padded": 113460, + "non_padded": 1412, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/TheBloke/Nous-Hermes-13B-SuperHOT-8K-fp16/results_2023-08-01T13-07-54.585648.json b/eval-results/TheBloke/Nous-Hermes-13B-SuperHOT-8K-fp16/results_2023-08-01T13-07-54.585648.json new file mode 100644 index 0000000000000000000000000000000000000000..06c403e3fb3c0d69f8a8fb67e714f459e0ffa690 --- /dev/null +++ b/eval-results/TheBloke/Nous-Hermes-13B-SuperHOT-8K-fp16/results_2023-08-01T13-07-54.585648.json @@ -0,0 +1,1365 @@ +{ + "results": { + "harness|arc:challenge|25": { + "acc": 0.5401023890784983, + "acc_stderr": 0.01456431885692485, + "acc_norm": 0.552901023890785, + "acc_norm_stderr": 0.014529380160526842 + }, + "harness|hellaswag|10": { + "acc": 0.6229834694284008, + "acc_stderr": 0.004836486437527262, + "acc_norm": 0.8186616211909978, + "acc_norm_stderr": 0.0038451084764013045 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.31, + "acc_stderr": 0.04648231987117316, + "acc_norm": 0.31, + "acc_norm_stderr": 0.04648231987117316 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.43703703703703706, + "acc_stderr": 0.04284958639753399, + "acc_norm": 0.43703703703703706, + "acc_norm_stderr": 0.04284958639753399 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.5, + "acc_stderr": 0.04068942293855797, + "acc_norm": 0.5, + "acc_norm_stderr": 0.04068942293855797 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.45, + "acc_stderr": 0.05, + "acc_norm": 0.45, + "acc_norm_stderr": 0.05 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.5283018867924528, + "acc_stderr": 0.030723535249006107, + "acc_norm": 0.5283018867924528, + "acc_norm_stderr": 0.030723535249006107 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.4722222222222222, + "acc_stderr": 0.04174752578923185, + "acc_norm": 0.4722222222222222, + "acc_norm_stderr": 0.04174752578923185 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.32, + "acc_stderr": 0.04688261722621505, + "acc_norm": 0.32, + "acc_norm_stderr": 0.04688261722621505 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.46, + "acc_stderr": 0.05009082659620332, + "acc_norm": 0.46, + "acc_norm_stderr": 0.05009082659620332 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.29, + "acc_stderr": 0.045604802157206845, + "acc_norm": 0.29, + "acc_norm_stderr": 0.045604802157206845 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.4508670520231214, + "acc_stderr": 0.03794012674697029, + "acc_norm": 0.4508670520231214, + "acc_norm_stderr": 0.03794012674697029 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.22549019607843138, + "acc_stderr": 0.041583075330832865, + "acc_norm": 0.22549019607843138, + "acc_norm_stderr": 0.041583075330832865 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.65, + "acc_stderr": 0.0479372485441102, + "acc_norm": 0.65, + "acc_norm_stderr": 0.0479372485441102 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.4127659574468085, + "acc_stderr": 0.03218471141400351, + "acc_norm": 0.4127659574468085, + "acc_norm_stderr": 0.03218471141400351 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 
0.2807017543859649, + "acc_stderr": 0.042270544512322004, + "acc_norm": 0.2807017543859649, + "acc_norm_stderr": 0.042270544512322004 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.41379310344827586, + "acc_stderr": 0.041042692118062316, + "acc_norm": 0.41379310344827586, + "acc_norm_stderr": 0.041042692118062316 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.2751322751322751, + "acc_stderr": 0.02300008685906864, + "acc_norm": 0.2751322751322751, + "acc_norm_stderr": 0.02300008685906864 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.36507936507936506, + "acc_stderr": 0.04306241259127153, + "acc_norm": 0.36507936507936506, + "acc_norm_stderr": 0.04306241259127153 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.31, + "acc_stderr": 0.04648231987117316, + "acc_norm": 0.31, + "acc_norm_stderr": 0.04648231987117316 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.5483870967741935, + "acc_stderr": 0.028310500348568385, + "acc_norm": 0.5483870967741935, + "acc_norm_stderr": 0.028310500348568385 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.35960591133004927, + "acc_stderr": 0.03376458246509567, + "acc_norm": 0.35960591133004927, + "acc_norm_stderr": 0.03376458246509567 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.52, + "acc_stderr": 0.050211673156867795, + "acc_norm": 0.52, + "acc_norm_stderr": 0.050211673156867795 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.23030303030303031, + "acc_stderr": 0.0328766675860349, + "acc_norm": 0.23030303030303031, + "acc_norm_stderr": 0.0328766675860349 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.6616161616161617, + "acc_stderr": 0.033711241426263014, + "acc_norm": 0.6616161616161617, + "acc_norm_stderr": 0.033711241426263014 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.694300518134715, + "acc_stderr": 0.033248379397581594, + "acc_norm": 0.694300518134715, + "acc_norm_stderr": 0.033248379397581594 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.43333333333333335, + "acc_stderr": 0.025124653525885127, + "acc_norm": 0.43333333333333335, + "acc_norm_stderr": 0.025124653525885127 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.23703703703703705, + "acc_stderr": 0.0259288761327661, + "acc_norm": 0.23703703703703705, + "acc_norm_stderr": 0.0259288761327661 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.4957983193277311, + "acc_stderr": 0.03247734334448111, + "acc_norm": 0.4957983193277311, + "acc_norm_stderr": 0.03247734334448111 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.304635761589404, + "acc_stderr": 0.03757949922943342, + "acc_norm": 0.304635761589404, + "acc_norm_stderr": 0.03757949922943342 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.691743119266055, + "acc_stderr": 0.019798366698367237, + "acc_norm": 0.691743119266055, + "acc_norm_stderr": 0.019798366698367237 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.3194444444444444, + "acc_stderr": 0.03179876342176851, + "acc_norm": 0.3194444444444444, + "acc_norm_stderr": 0.03179876342176851 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.24509803921568626, + "acc_stderr": 0.03019028245350195, + "acc_norm": 0.24509803921568626, + "acc_norm_stderr": 0.03019028245350195 + }, + 
"harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.679324894514768, + "acc_stderr": 0.030381931949990407, + "acc_norm": 0.679324894514768, + "acc_norm_stderr": 0.030381931949990407 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.5919282511210763, + "acc_stderr": 0.03298574607842821, + "acc_norm": 0.5919282511210763, + "acc_norm_stderr": 0.03298574607842821 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.6030534351145038, + "acc_stderr": 0.04291135671009225, + "acc_norm": 0.6030534351145038, + "acc_norm_stderr": 0.04291135671009225 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.6363636363636364, + "acc_stderr": 0.043913262867240704, + "acc_norm": 0.6363636363636364, + "acc_norm_stderr": 0.043913262867240704 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.5925925925925926, + "acc_stderr": 0.04750077341199984, + "acc_norm": 0.5925925925925926, + "acc_norm_stderr": 0.04750077341199984 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.5644171779141104, + "acc_stderr": 0.03895632464138938, + "acc_norm": 0.5644171779141104, + "acc_norm_stderr": 0.03895632464138938 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.41964285714285715, + "acc_stderr": 0.04684099321077106, + "acc_norm": 0.41964285714285715, + "acc_norm_stderr": 0.04684099321077106 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.6893203883495146, + "acc_stderr": 0.0458212416016155, + "acc_norm": 0.6893203883495146, + "acc_norm_stderr": 0.0458212416016155 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.7606837606837606, + "acc_stderr": 0.027951826808924333, + "acc_norm": 0.7606837606837606, + "acc_norm_stderr": 0.027951826808924333 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.56, + "acc_stderr": 0.04988876515698589, + "acc_norm": 0.56, + "acc_norm_stderr": 0.04988876515698589 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.6832694763729247, + "acc_stderr": 0.01663556642771256, + "acc_norm": 0.6832694763729247, + "acc_norm_stderr": 0.01663556642771256 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.546242774566474, + "acc_stderr": 0.026803720583206174, + "acc_norm": 0.546242774566474, + "acc_norm_stderr": 0.026803720583206174 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.23910614525139665, + "acc_stderr": 0.014265554192331146, + "acc_norm": 0.23910614525139665, + "acc_norm_stderr": 0.014265554192331146 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.5490196078431373, + "acc_stderr": 0.02849199358617157, + "acc_norm": 0.5490196078431373, + "acc_norm_stderr": 0.02849199358617157 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.5241157556270096, + "acc_stderr": 0.028365041542564577, + "acc_norm": 0.5241157556270096, + "acc_norm_stderr": 0.028365041542564577 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.5308641975308642, + "acc_stderr": 0.027767689606833932, + "acc_norm": 0.5308641975308642, + "acc_norm_stderr": 0.027767689606833932 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.36524822695035464, + "acc_stderr": 0.028723863853281278, + "acc_norm": 0.36524822695035464, + "acc_norm_stderr": 0.028723863853281278 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.38005215123859193, + "acc_stderr": 0.012397328205137812, + "acc_norm": 0.38005215123859193, + "acc_norm_stderr": 0.012397328205137812 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.4485294117647059, + "acc_stderr": 
0.030211479609121596, + "acc_norm": 0.4485294117647059, + "acc_norm_stderr": 0.030211479609121596 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.49673202614379086, + "acc_stderr": 0.020227402794434867, + "acc_norm": 0.49673202614379086, + "acc_norm_stderr": 0.020227402794434867 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.5636363636363636, + "acc_stderr": 0.04750185058907296, + "acc_norm": 0.5636363636363636, + "acc_norm_stderr": 0.04750185058907296 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.5428571428571428, + "acc_stderr": 0.03189141832421396, + "acc_norm": 0.5428571428571428, + "acc_norm_stderr": 0.03189141832421396 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.7114427860696517, + "acc_stderr": 0.03203841040213321, + "acc_norm": 0.7114427860696517, + "acc_norm_stderr": 0.03203841040213321 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.78, + "acc_stderr": 0.041633319989322626, + "acc_norm": 0.78, + "acc_norm_stderr": 0.041633319989322626 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.43373493975903615, + "acc_stderr": 0.03858158940685517, + "acc_norm": 0.43373493975903615, + "acc_norm_stderr": 0.03858158940685517 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.7076023391812866, + "acc_stderr": 0.03488647713457923, + "acc_norm": 0.7076023391812866, + "acc_norm_stderr": 0.03488647713457923 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.35128518971848227, + "mc1_stderr": 0.016711358163544403, + "mc2": 0.5119241591172863, + "mc2_stderr": 0.01572002603242522 + }, + "all": { + "acc": 0.48568745456125834, + "acc_stderr": 0.03478932910810881, + "acc_norm": 0.4892209594184598, + "acc_norm_stderr": 0.034771933910523654, + "mc1": 0.35128518971848227, + "mc1_stderr": 0.016711358163544403, + "mc2": 0.5119241591172863, + "mc2_stderr": 0.01572002603242522 + } + }, + "versions": { + "harness|arc:challenge|25": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + 
"harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "all": 0 + }, + "config_general": { + "model_name": "TheBloke/Nous-Hermes-13B-SuperHOT-8K-fp16", + "model_sha": "b407c1ece029ad5693d38e6e0931e9482962ed15", + "model_dtype": "torch.float16", + "lighteval_sha": "03c2fad20ff7f5334c33cfee459024b8d7e4a109", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null + }, + "config_tasks": { + "harness|arc:challenge": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + 
"harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task" + }, + "summary_tasks": { + "harness|arc:challenge|25": { + "hashes": { + "hash_examples": "17b0cae357c0259e", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "3722289b79076c44", + "hash_cont_tokens": "8210decc6ff6f7df" + }, + "truncated": 0, + "non-truncated": 4687, + "padded": 4687, + "non-padded": 0, + "effective_few_shots": 25.0, + "num_truncated_few_shots": 0 + }, + "harness|hellaswag|10": { + "hashes": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "ececd684171f1ef2", + "hash_cont_tokens": "b3b9e9017afa63af" + }, + "truncated": 0, + "non-truncated": 40168, + "padded": 40113, + "non-padded": 55, + "effective_few_shots": 10.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hashes": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "c54ff61ad0273dd7", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-anatomy|5": { + "hashes": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "be31a1e22aef5f90", + "hash_cont_tokens": "f11971a765cb609f" + }, + "truncated": 0, + "non-truncated": 540, + "padded": 540, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-astronomy|5": { + "hashes": { + 
"hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "277a7b1fad566940", + "hash_cont_tokens": "bf30e5d3f48250cb" + }, + "truncated": 0, + "non-truncated": 608, + "padded": 608, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-business_ethics|5": { + "hashes": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "ba552605bc116de5", + "hash_cont_tokens": "bc1dd9b2d995eb61" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hashes": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "428c7563d0b98ab9", + "hash_cont_tokens": "890a119624b3b935" + }, + "truncated": 0, + "non-truncated": 1060, + "padded": 1060, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_biology|5": { + "hashes": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "da036601573942e2", + "hash_cont_tokens": "875cde3af7a0ee14" + }, + "truncated": 0, + "non-truncated": 576, + "padded": 576, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_chemistry|5": { + "hashes": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "94e0196d6aded13d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_computer_science|5": { + "hashes": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "6e4d0f4a8d36690b", + "hash_cont_tokens": "ffc0fe414cdc4a83" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_mathematics|5": { + "hashes": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "614054d17109a25d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_medicine|5": { + "hashes": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "081bb2b524defd1c", + "hash_cont_tokens": "1f88b00d41957d82" + }, + "truncated": 0, + "non-truncated": 692, + "padded": 692, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_physics|5": { + "hashes": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "5421d9a1af86cbd4", + "hash_cont_tokens": "f7b8097afc16a47c" + }, + "truncated": 0, + "non-truncated": 408, + "padded": 408, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-computer_security|5": { + "hashes": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "5e6b70ecb333cf18", + "hash_cont_tokens": 
"50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hashes": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "c2ef11a87264ceed", + "hash_cont_tokens": "aa0e8bc655f2f641" + }, + "truncated": 0, + "non-truncated": 940, + "padded": 940, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-econometrics|5": { + "hashes": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "ecaccd912a4c3978", + "hash_cont_tokens": "bfb7e3c3c88313f1" + }, + "truncated": 0, + "non-truncated": 456, + "padded": 456, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hashes": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "1590c84291399be8", + "hash_cont_tokens": "2425a3f084a591ef" + }, + "truncated": 0, + "non-truncated": 580, + "padded": 580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hashes": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "3269597f715b0da1", + "hash_cont_tokens": "f52691aef15a407b" + }, + "truncated": 0, + "non-truncated": 1512, + "padded": 1512, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-formal_logic|5": { + "hashes": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "a2800d20f3ab8d7c", + "hash_cont_tokens": "f515d598d9c21263" + }, + "truncated": 0, + "non-truncated": 504, + "padded": 504, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-global_facts|5": { + "hashes": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "94ed44b3772505ad", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_biology|5": { + "hashes": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "24423acb928db768", + "hash_cont_tokens": "bd85a4156a3613ee" + }, + "truncated": 0, + "non-truncated": 1240, + "padded": 1240, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hashes": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "831ff35c474e5cef", + "hash_cont_tokens": "a95c97af1c14e068" + }, + "truncated": 0, + "non-truncated": 812, + "padded": 812, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hashes": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "a20a96b44dcc5b30", + "hash_cont_tokens": "8abfedef914e33c9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + 
"num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hashes": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "5002f4ac8b1562ca", + "hash_cont_tokens": "674fc454bdc5ac93" + }, + "truncated": 0, + "non-truncated": 660, + "padded": 656, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_geography|5": { + "hashes": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "7c5547c7da5bc793", + "hash_cont_tokens": "03a5012b916274ea" + }, + "truncated": 0, + "non-truncated": 792, + "padded": 792, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hashes": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "f62991cb6a496b05", + "hash_cont_tokens": "a83effb8f76b7d7c" + }, + "truncated": 0, + "non-truncated": 772, + "padded": 772, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hashes": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "4cef2aff6e3d59ed", + "hash_cont_tokens": "c583432ad27fcfe0" + }, + "truncated": 0, + "non-truncated": 1560, + "padded": 1560, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hashes": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "6e2577ea4082ed2b", + "hash_cont_tokens": "24f5dc613660300b" + }, + "truncated": 0, + "non-truncated": 1080, + "padded": 1080, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hashes": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "c5fc9aeb1079c8e4", + "hash_cont_tokens": "f47f041de50333b9" + }, + "truncated": 0, + "non-truncated": 952, + "padded": 952, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_physics|5": { + "hashes": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "555fc385cffa84ca", + "hash_cont_tokens": "ba2efcd283e938cc" + }, + "truncated": 0, + "non-truncated": 604, + "padded": 604, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hashes": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "febd23cbf9973b7f", + "hash_cont_tokens": "942069cd363844d9" + }, + "truncated": 0, + "non-truncated": 2180, + "padded": 2180, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hashes": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "400e55b56ee6fbd7", + "hash_cont_tokens": "955ed42b6f7fa019" + }, + "truncated": 0, + "non-truncated": 864, + "padded": 864, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + 
"harness|hendrycksTest-high_school_us_history|5": { + "hashes": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "c639cce12a46ebad", + "hash_cont_tokens": "cdd0b3dc06d933e5" + }, + "truncated": 0, + "non-truncated": 816, + "padded": 816, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hashes": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "b9762065cce6f3a6", + "hash_cont_tokens": "9a864184946033ac" + }, + "truncated": 0, + "non-truncated": 948, + "padded": 948, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_aging|5": { + "hashes": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "541a75f071dcf579", + "hash_cont_tokens": "142a4a8a1138a214" + }, + "truncated": 0, + "non-truncated": 892, + "padded": 892, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_sexuality|5": { + "hashes": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "04269e5c5a257dd9", + "hash_cont_tokens": "bc54813e809b796d" + }, + "truncated": 0, + "non-truncated": 524, + "padded": 524, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-international_law|5": { + "hashes": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "d93ba9d9d38e4397", + "hash_cont_tokens": "dc45b45fcda18e5d" + }, + "truncated": 0, + "non-truncated": 484, + "padded": 484, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-jurisprudence|5": { + "hashes": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "9eeaccd2698b4f5a", + "hash_cont_tokens": "e3a8cd951b6e3469" + }, + "truncated": 0, + "non-truncated": 432, + "padded": 432, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hashes": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "b4f08f544f2b7576", + "hash_cont_tokens": "1e80dbd30f6453d5" + }, + "truncated": 0, + "non-truncated": 652, + "padded": 648, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-machine_learning|5": { + "hashes": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "900c2a51f1174b9f", + "hash_cont_tokens": "9b37da7777378ca9" + }, + "truncated": 0, + "non-truncated": 448, + "padded": 448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-management|5": { + "hashes": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "6b36efb4689c6eca", + "hash_cont_tokens": "a01d6d39a83c4597" + }, + "truncated": 0, + "non-truncated": 412, + "padded": 412, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-marketing|5": { + "hashes": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": 
"2aaac78a0cfed47a", + "hash_cont_tokens": "6aeaed4d823c98aa" + }, + "truncated": 0, + "non-truncated": 936, + "padded": 936, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-medical_genetics|5": { + "hashes": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "886ca823b41c094a", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-miscellaneous|5": { + "hashes": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "72fd71de7675e7d0", + "hash_cont_tokens": "9b0ab02a64603081" + }, + "truncated": 0, + "non-truncated": 3132, + "padded": 3132, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_disputes|5": { + "hashes": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "f3ca0dd8e7a1eb09", + "hash_cont_tokens": "8badf768f7b0467a" + }, + "truncated": 0, + "non-truncated": 1384, + "padded": 1354, + "non-padded": 30, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hashes": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "3e793631e951f23c", + "hash_cont_tokens": "32ae620376b2bbba" + }, + "truncated": 0, + "non-truncated": 3580, + "padded": 3580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-nutrition|5": { + "hashes": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "59753c2144ea93af", + "hash_cont_tokens": "3071def75bacc404" + }, + "truncated": 0, + "non-truncated": 1224, + "padded": 1224, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-philosophy|5": { + "hashes": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "bd8d3dbed15a8c34", + "hash_cont_tokens": "9f6ff69d23a48783" + }, + "truncated": 0, + "non-truncated": 1244, + "padded": 1244, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-prehistory|5": { + "hashes": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "3573cd87facbb7c5", + "hash_cont_tokens": "de469d2b981e32a3" + }, + "truncated": 0, + "non-truncated": 1296, + "padded": 1296, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_accounting|5": { + "hashes": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "17e721bc1a7cbb47", + "hash_cont_tokens": "c46f74d2dfc7b13b" + }, + "truncated": 0, + "non-truncated": 1128, + "padded": 1128, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_law|5": { + "hashes": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "c9f7583fff66d361", + "hash_cont_tokens": "2e590029ef41fbcd" + }, + "truncated": 0, + "non-truncated": 6136, + "padded": 6136, + "non-padded": 0, + 
"effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_medicine|5": { + "hashes": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "40a933f829116f8d", + "hash_cont_tokens": "fe35cfa9c6ca802e" + }, + "truncated": 0, + "non-truncated": 1088, + "padded": 1088, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_psychology|5": { + "hashes": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "0dfb73a8eb3f692c", + "hash_cont_tokens": "f020fbddf72c8652" + }, + "truncated": 0, + "non-truncated": 2448, + "padded": 2448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-public_relations|5": { + "hashes": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "1710c6ba4c9f3cbd", + "hash_cont_tokens": "568f585a259965c1" + }, + "truncated": 0, + "non-truncated": 440, + "padded": 440, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-security_studies|5": { + "hashes": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "32a03f1f22a6e103", + "hash_cont_tokens": "cc6fd7cccd64cd5d" + }, + "truncated": 0, + "non-truncated": 980, + "padded": 980, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-sociology|5": { + "hashes": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "828999f7624cbe7e", + "hash_cont_tokens": "c3a3bdfd177eed5b" + }, + "truncated": 0, + "non-truncated": 804, + "padded": 804, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hashes": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "42054621e718dbee", + "hash_cont_tokens": "2568d0e8e36fa959" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-virology|5": { + "hashes": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "6c4f0aa4dc859c04", + "hash_cont_tokens": "926cf60b0891f374" + }, + "truncated": 0, + "non-truncated": 664, + "padded": 664, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-world_religions|5": { + "hashes": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "6c75d44e092ff24f", + "hash_cont_tokens": "c525a5de974c1ea3" + }, + "truncated": 0, + "non-truncated": 684, + "padded": 684, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|truthfulqa:mc|0": { + "hashes": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "2738d7ed7075faa7", + "hash_cont_tokens": "c014154380b74b9e" + }, + "truncated": 0, + "non-truncated": 9996, + "padded": 9996, + "non-padded": 0, + "effective_few_shots": 0.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "d84d18e9a963753d", + "hash_full_prompts": 
"12b540783521a8e6", + "hash_input_tokens": "5c73a7dce6ccf737", + "hash_cont_tokens": "fb1646e2bdd5fc38" + }, + "total_evaluation_time_secondes": "6239.993566274643", + "truncated": 0, + "non-truncated": 111019, + "padded": 110926, + "non-padded": 93, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/TheBloke/Nous-Hermes-13B-SuperHOT-8K-fp16/results_2023-10-22T21-24-49.496203.json b/eval-results/TheBloke/Nous-Hermes-13B-SuperHOT-8K-fp16/results_2023-10-22T21-24-49.496203.json new file mode 100644 index 0000000000000000000000000000000000000000..b826c93bd33270514f64888ea6e5afd13a846b9d --- /dev/null +++ b/eval-results/TheBloke/Nous-Hermes-13B-SuperHOT-8K-fp16/results_2023-10-22T21-24-49.496203.json @@ -0,0 +1,107 @@ +{ + "config_general": { + "model_name": "TheBloke/Nous-Hermes-13B-SuperHOT-8K-fp16", + "model_sha": "b407c1ece029ad5693d38e6e0931e9482962ed15", + "model_size": "24.4 GB", + "model_dtype": "torch.float16", + "lighteval_sha": "0f318ecf002208468154899217b3ba7c6ae09374", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|drop|3": { + "em": 0.24779781879194632, + "em_stderr": 0.004421358038007316, + "f1": 0.3203208892617463, + "f1_stderr": 0.004418252169927022 + }, + "harness|gsm8k|5": { + "acc": 0.012130401819560273, + "acc_stderr": 0.003015294242890953 + }, + "harness|winogrande|5": { + "acc": 0.7529597474348856, + "acc_stderr": 0.012121402942855573 + }, + "all": { + "em": 0.24779781879194632, + "em_stderr": 0.004421358038007316, + "f1": 0.3203208892617463, + "f1_stderr": 0.004418252169927022, + "acc": 0.3825450746272229, + "acc_stderr": 0.007568348592873263 + } + }, + "versions": { + "harness|drop|3": 1, + "harness|gsm8k|5": 0, + "harness|winogrande|5": 0, + "all": 0 + }, + "config_tasks": { + "harness|drop": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|drop|3": { + "hashes": { + "hash_examples": "1d27416e8324e9a3", + "hash_full_prompts": "a5513ff9a741b385", + "hash_input_tokens": "fbe785c62d941897", + "hash_cont_tokens": "48659981bb2d425c" + }, + "truncated": 0, + "non-truncated": 9536, + "padded": 0, + "non-padded": 9536, + "effective_few_shots": 3.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "bda342e47b5099b2", + "hash_cont_tokens": "7bf4e2a4affe9c18" + }, + "truncated": 0, + "non-truncated": 1319, + "padded": 0, + "non-padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "c0bedf98cb040854", + "hash_cont_tokens": "f08975ad6f2d5864" + }, + "truncated": 0, + "non-truncated": 2534, + "padded": 2432, + "non-padded": 102, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "9b4d8993161e637d", + "hash_full_prompts": "08215e527b7e60a5", + "hash_input_tokens": "9f72a6bc6743d18f", + "hash_cont_tokens": "8248f7549388fc0b" + }, + "total_evaluation_time_secondes": "10317.167259693146", + "truncated": 0, + "non-truncated": 13389, + "padded": 2432, + "non-padded": 10957, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git 
a/eval-results/TheBloke/OpenAssistant-SFT-7-Llama-30B-HF/results_2023-07-19T22-44-19.720986.json b/eval-results/TheBloke/OpenAssistant-SFT-7-Llama-30B-HF/results_2023-07-19T22-44-19.720986.json new file mode 100644 index 0000000000000000000000000000000000000000..c12f9f5cc212a78fc61bea00e57fbcb810221123 --- /dev/null +++ b/eval-results/TheBloke/OpenAssistant-SFT-7-Llama-30B-HF/results_2023-07-19T22-44-19.720986.json @@ -0,0 +1,871 @@ +{ + "results": { + "harness|arc:challenge|25": { + "acc": 0.5767918088737202, + "acc_stderr": 0.014438036220848029, + "acc_norm": 0.60580204778157, + "acc_norm_stderr": 0.014280522667467321 + }, + "harness|hellaswag|10": { + "acc": 0.6190001991635132, + "acc_stderr": 0.00484640032558525, + "acc_norm": 0.8217486556462856, + "acc_norm_stderr": 0.003819420058554165 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.31, + "acc_stderr": 0.046482319871173156, + "acc_norm": 0.31, + "acc_norm_stderr": 0.046482319871173156 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.45925925925925926, + "acc_stderr": 0.04304979692464243, + "acc_norm": 0.45925925925925926, + "acc_norm_stderr": 0.04304979692464243 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.5855263157894737, + "acc_stderr": 0.04008973785779206, + "acc_norm": 0.5855263157894737, + "acc_norm_stderr": 0.04008973785779206 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.54, + "acc_stderr": 0.05009082659620332, + "acc_norm": 0.54, + "acc_norm_stderr": 0.05009082659620332 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.6037735849056604, + "acc_stderr": 0.030102793781791197, + "acc_norm": 0.6037735849056604, + "acc_norm_stderr": 0.030102793781791197 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.6319444444444444, + "acc_stderr": 0.04032999053960719, + "acc_norm": 0.6319444444444444, + "acc_norm_stderr": 0.04032999053960719 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.42, + "acc_stderr": 0.04960449637488584, + "acc_norm": 0.42, + "acc_norm_stderr": 0.04960449637488584 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.45, + "acc_stderr": 0.05, + "acc_norm": 0.45, + "acc_norm_stderr": 0.05 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.34, + "acc_stderr": 0.04760952285695235, + "acc_norm": 0.34, + "acc_norm_stderr": 0.04760952285695235 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.5144508670520231, + "acc_stderr": 0.03810871630454764, + "acc_norm": 0.5144508670520231, + "acc_norm_stderr": 0.03810871630454764 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.35294117647058826, + "acc_stderr": 0.047551296160629475, + "acc_norm": 0.35294117647058826, + "acc_norm_stderr": 0.047551296160629475 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.63, + "acc_stderr": 0.048523658709391, + "acc_norm": 0.63, + "acc_norm_stderr": 0.048523658709391 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.5148936170212766, + "acc_stderr": 0.032671518489247764, + "acc_norm": 0.5148936170212766, + "acc_norm_stderr": 0.032671518489247764 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.3684210526315789, + "acc_stderr": 0.04537815354939392, + "acc_norm": 0.3684210526315789, + "acc_norm_stderr": 0.04537815354939392 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.4689655172413793, + "acc_stderr": 0.04158632762097828, + "acc_norm": 0.4689655172413793, + "acc_norm_stderr": 0.04158632762097828 + }, + 
"harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.3386243386243386, + "acc_stderr": 0.024373197867983042, + "acc_norm": 0.3386243386243386, + "acc_norm_stderr": 0.024373197867983042 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.3333333333333333, + "acc_stderr": 0.04216370213557835, + "acc_norm": 0.3333333333333333, + "acc_norm_stderr": 0.04216370213557835 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.42, + "acc_stderr": 0.049604496374885836, + "acc_norm": 0.42, + "acc_norm_stderr": 0.049604496374885836 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.7161290322580646, + "acc_stderr": 0.02564938106302926, + "acc_norm": 0.7161290322580646, + "acc_norm_stderr": 0.02564938106302926 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.4187192118226601, + "acc_stderr": 0.034711928605184676, + "acc_norm": 0.4187192118226601, + "acc_norm_stderr": 0.034711928605184676 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.55, + "acc_stderr": 0.049999999999999996, + "acc_norm": 0.55, + "acc_norm_stderr": 0.049999999999999996 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.7090909090909091, + "acc_stderr": 0.03546563019624337, + "acc_norm": 0.7090909090909091, + "acc_norm_stderr": 0.03546563019624337 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.7525252525252525, + "acc_stderr": 0.030746300742124495, + "acc_norm": 0.7525252525252525, + "acc_norm_stderr": 0.030746300742124495 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.8290155440414507, + "acc_stderr": 0.027171213683164542, + "acc_norm": 0.8290155440414507, + "acc_norm_stderr": 0.027171213683164542 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.5923076923076923, + "acc_stderr": 0.024915243985987847, + "acc_norm": 0.5923076923076923, + "acc_norm_stderr": 0.024915243985987847 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.25925925925925924, + "acc_stderr": 0.026719240783712166, + "acc_norm": 0.25925925925925924, + "acc_norm_stderr": 0.026719240783712166 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.6302521008403361, + "acc_stderr": 0.031357095996135904, + "acc_norm": 0.6302521008403361, + "acc_norm_stderr": 0.031357095996135904 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.31788079470198677, + "acc_stderr": 0.03802039760107903, + "acc_norm": 0.31788079470198677, + "acc_norm_stderr": 0.03802039760107903 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.7908256880733945, + "acc_stderr": 0.017437937173343233, + "acc_norm": 0.7908256880733945, + "acc_norm_stderr": 0.017437937173343233 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.49537037037037035, + "acc_stderr": 0.03409825519163572, + "acc_norm": 0.49537037037037035, + "acc_norm_stderr": 0.03409825519163572 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.7990196078431373, + "acc_stderr": 0.02812597226565437, + "acc_norm": 0.7990196078431373, + "acc_norm_stderr": 0.02812597226565437 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.7805907172995781, + "acc_stderr": 0.026939106581553945, + "acc_norm": 0.7805907172995781, + "acc_norm_stderr": 0.026939106581553945 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.6771300448430493, + "acc_stderr": 0.03138147637575499, + "acc_norm": 0.6771300448430493, + "acc_norm_stderr": 
0.03138147637575499 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.6717557251908397, + "acc_stderr": 0.04118438565806298, + "acc_norm": 0.6717557251908397, + "acc_norm_stderr": 0.04118438565806298 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.7933884297520661, + "acc_stderr": 0.03695980128098825, + "acc_norm": 0.7933884297520661, + "acc_norm_stderr": 0.03695980128098825 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.6574074074074074, + "acc_stderr": 0.045879047413018105, + "acc_norm": 0.6574074074074074, + "acc_norm_stderr": 0.045879047413018105 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.6871165644171779, + "acc_stderr": 0.036429145782924055, + "acc_norm": 0.6871165644171779, + "acc_norm_stderr": 0.036429145782924055 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.39285714285714285, + "acc_stderr": 0.04635550135609976, + "acc_norm": 0.39285714285714285, + "acc_norm_stderr": 0.04635550135609976 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.7669902912621359, + "acc_stderr": 0.041858325989283136, + "acc_norm": 0.7669902912621359, + "acc_norm_stderr": 0.041858325989283136 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.8589743589743589, + "acc_stderr": 0.022801382534597552, + "acc_norm": 0.8589743589743589, + "acc_norm_stderr": 0.022801382534597552 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.64, + "acc_stderr": 0.04824181513244218, + "acc_norm": 0.64, + "acc_norm_stderr": 0.04824181513244218 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.7624521072796935, + "acc_stderr": 0.015218733046150193, + "acc_norm": 0.7624521072796935, + "acc_norm_stderr": 0.015218733046150193 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.6560693641618497, + "acc_stderr": 0.02557412378654667, + "acc_norm": 0.6560693641618497, + "acc_norm_stderr": 0.02557412378654667 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.4212290502793296, + "acc_stderr": 0.0165136760311796, + "acc_norm": 0.4212290502793296, + "acc_norm_stderr": 0.0165136760311796 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.6209150326797386, + "acc_stderr": 0.02778014120702334, + "acc_norm": 0.6209150326797386, + "acc_norm_stderr": 0.02778014120702334 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.6270096463022508, + "acc_stderr": 0.027466610213140112, + "acc_norm": 0.6270096463022508, + "acc_norm_stderr": 0.027466610213140112 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.6635802469135802, + "acc_stderr": 0.026289734945952926, + "acc_norm": 0.6635802469135802, + "acc_norm_stderr": 0.026289734945952926 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.4148936170212766, + "acc_stderr": 0.029392236584612503, + "acc_norm": 0.4148936170212766, + "acc_norm_stderr": 0.029392236584612503 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.4348109517601043, + "acc_stderr": 0.012661233805616292, + "acc_norm": 0.4348109517601043, + "acc_norm_stderr": 0.012661233805616292 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.6066176470588235, + "acc_stderr": 0.029674288281311155, + "acc_norm": 0.6066176470588235, + "acc_norm_stderr": 0.029674288281311155 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.6160130718954249, + "acc_stderr": 0.019675808135281518, + "acc_norm": 0.6160130718954249, + "acc_norm_stderr": 0.019675808135281518 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 
0.6272727272727273, + "acc_stderr": 0.04631381319425465, + "acc_norm": 0.6272727272727273, + "acc_norm_stderr": 0.04631381319425465 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.6408163265306123, + "acc_stderr": 0.03071356045510849, + "acc_norm": 0.6408163265306123, + "acc_norm_stderr": 0.03071356045510849 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.7761194029850746, + "acc_stderr": 0.029475250236017193, + "acc_norm": 0.7761194029850746, + "acc_norm_stderr": 0.029475250236017193 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.82, + "acc_stderr": 0.038612291966536934, + "acc_norm": 0.82, + "acc_norm_stderr": 0.038612291966536934 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.4819277108433735, + "acc_stderr": 0.038899512528272166, + "acc_norm": 0.4819277108433735, + "acc_norm_stderr": 0.038899512528272166 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.783625730994152, + "acc_stderr": 0.031581495393387324, + "acc_norm": 0.783625730994152, + "acc_norm_stderr": 0.031581495393387324 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.30966952264381886, + "mc1_stderr": 0.016185744355144912, + "mc2": 0.4693711489528687, + "mc2_stderr": 0.014679729171255115 + }, + "all": { + "acc": 0.5799641405750319, + "acc_stderr": 0.0341507810807547, + "acc_norm": 0.5838922540562288, + "acc_norm_stderr": 0.034130704914307045, + "mc1": 0.30966952264381886, + "mc1_stderr": 0.016185744355144912, + "mc2": 0.4693711489528687, + "mc2_stderr": 0.014679729171255115 + } + }, + "versions": { + "harness|arc:challenge|25": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + 
"harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "all": 0 + }, + "config": { + "model_name": "TheBloke/OpenAssistant-SFT-7-Llama-30B-HF", + "model_sha": "a7a2306b9a63de2c545f35b24735f4540baf5903", + "model_dtype": "torch.float16", + "lighteval_sha": "43cff840721bd0214adb4e29236a5e2ca1813937", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null + }, + "task_config": { + "harness|arc:challenge": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM 
Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task" + }, + "hashes": { + "harness|arc:challenge|25": { + "hash_examples": "fb8c51b1872daeda", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "4d41ef08f7f15a87", + "hash_cont_tokens": "8210decc6ff6f7df" + }, + "harness|hellaswag|10": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "e5cdbeaabd59fe25", + "hash_cont_tokens": "b3b9e9017afa63af" + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "f761c98a583630b0", + "hash_cont_tokens": "50421e30bef398f9" + }, + "harness|hendrycksTest-anatomy|5": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "80e0c48ccffa00ed", + "hash_cont_tokens": "f11971a765cb609f" + }, + "harness|hendrycksTest-astronomy|5": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "2b329c6ca67607dc", + "hash_cont_tokens": "bf30e5d3f48250cb" + }, + "harness|hendrycksTest-business_ethics|5": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "bde478206654aa12", + "hash_cont_tokens": "bc1dd9b2d995eb61" + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "de5e1db4f0637b77", + "hash_cont_tokens": "890a119624b3b935" + }, + "harness|hendrycksTest-college_biology|5": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "0150fa239f4db10c", + "hash_cont_tokens": "875cde3af7a0ee14" + }, + "harness|hendrycksTest-college_chemistry|5": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "0bee7f47bee63c79", + "hash_cont_tokens": "50421e30bef398f9" + }, + 
"harness|hendrycksTest-college_computer_science|5": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "e718ffe7615f023a", + "hash_cont_tokens": "ffc0fe414cdc4a83" + }, + "harness|hendrycksTest-college_mathematics|5": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "786c85630e928fad", + "hash_cont_tokens": "50421e30bef398f9" + }, + "harness|hendrycksTest-college_medicine|5": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "3c2f62f5c1fe6a2e", + "hash_cont_tokens": "1f88b00d41957d82" + }, + "harness|hendrycksTest-college_physics|5": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "f4deb8123d2bfd4a", + "hash_cont_tokens": "f7b8097afc16a47c" + }, + "harness|hendrycksTest-computer_security|5": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "c0e1e1b475ae50f7", + "hash_cont_tokens": "50421e30bef398f9" + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "42a3769ad3670df3", + "hash_cont_tokens": "aa0e8bc655f2f641" + }, + "harness|hendrycksTest-econometrics|5": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "907d28b129d51d56", + "hash_cont_tokens": "bfb7e3c3c88313f1" + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "e456dc04a081add9", + "hash_cont_tokens": "2425a3f084a591ef" + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "c1ff30907d03d949", + "hash_cont_tokens": "f52691aef15a407b" + }, + "harness|hendrycksTest-formal_logic|5": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "b83e70701038ee89", + "hash_cont_tokens": "f515d598d9c21263" + }, + "harness|hendrycksTest-global_facts|5": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "690d1342b56ec0c0", + "hash_cont_tokens": "50421e30bef398f9" + }, + "harness|hendrycksTest-high_school_biology|5": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "fd7dcca51bc36ed3", + "hash_cont_tokens": "bd85a4156a3613ee" + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "73042d776e504db9", + "hash_cont_tokens": "a95c97af1c14e068" + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "ae788fb4bea00ecf", + "hash_cont_tokens": "8abfedef914e33c9" + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "f1f73dd687da18d7", + "hash_cont_tokens": "674fc454bdc5ac93" + }, + "harness|hendrycksTest-high_school_geography|5": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "58b516ecb02f2e40", + "hash_cont_tokens": "03a5012b916274ea" + }, + 
"harness|hendrycksTest-high_school_government_and_politics|5": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "0e66256865617a7e", + "hash_cont_tokens": "a83effb8f76b7d7c" + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "ef84b9228c4a8a14", + "hash_cont_tokens": "c583432ad27fcfe0" + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "21e09eeedb3bd8d1", + "hash_cont_tokens": "24f5dc613660300b" + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "c5c67d84b5de728e", + "hash_cont_tokens": "f47f041de50333b9" + }, + "harness|hendrycksTest-high_school_physics|5": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "a8adfaec234b6241", + "hash_cont_tokens": "ba2efcd283e938cc" + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "20cb425a37d2f8a3", + "hash_cont_tokens": "942069cd363844d9" + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "083b9ab4a12ec287", + "hash_cont_tokens": "955ed42b6f7fa019" + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "50c9ff438c85a69e", + "hash_cont_tokens": "cdd0b3dc06d933e5" + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "ec8bbb3a1fd686ab", + "hash_cont_tokens": "9a864184946033ac" + }, + "harness|hendrycksTest-human_aging|5": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "3913b529e3c7d97c", + "hash_cont_tokens": "142a4a8a1138a214" + }, + "harness|hendrycksTest-human_sexuality|5": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "8dd5a70368b6dbaf", + "hash_cont_tokens": "bc54813e809b796d" + }, + "harness|hendrycksTest-international_law|5": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "a3507093fa9ca1ac", + "hash_cont_tokens": "dc45b45fcda18e5d" + }, + "harness|hendrycksTest-jurisprudence|5": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "808f16991ac2375a", + "hash_cont_tokens": "e3a8cd951b6e3469" + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "8f047748092d60d7", + "hash_cont_tokens": "1e80dbd30f6453d5" + }, + "harness|hendrycksTest-machine_learning|5": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "93059d5f2bb285ab", + "hash_cont_tokens": "9b37da7777378ca9" + }, + "harness|hendrycksTest-management|5": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "f252fe7542a09428", + "hash_cont_tokens": 
"a01d6d39a83c4597" + }, + "harness|hendrycksTest-marketing|5": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "af226d6094825afe", + "hash_cont_tokens": "6aeaed4d823c98aa" + }, + "harness|hendrycksTest-medical_genetics|5": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "fe5ba86eaf1086de", + "hash_cont_tokens": "50421e30bef398f9" + }, + "harness|hendrycksTest-miscellaneous|5": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "475603bced2f3d7f", + "hash_cont_tokens": "9b0ab02a64603081" + }, + "harness|hendrycksTest-moral_disputes|5": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "418f52434a830ea3", + "hash_cont_tokens": "8badf768f7b0467a" + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "7faa6a56dccc3884", + "hash_cont_tokens": "32ae620376b2bbba" + }, + "harness|hendrycksTest-nutrition|5": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "3ed3790cd29fec29", + "hash_cont_tokens": "3071def75bacc404" + }, + "harness|hendrycksTest-philosophy|5": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "fc91e4b12993bad0", + "hash_cont_tokens": "9f6ff69d23a48783" + }, + "harness|hendrycksTest-prehistory|5": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "39719c3315549563", + "hash_cont_tokens": "de469d2b981e32a3" + }, + "harness|hendrycksTest-professional_accounting|5": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "be1b34046d49026f", + "hash_cont_tokens": "c46f74d2dfc7b13b" + }, + "harness|hendrycksTest-professional_law|5": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "df4a6015393b6489", + "hash_cont_tokens": "2e590029ef41fbcd" + }, + "harness|hendrycksTest-professional_medicine|5": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "31b608e60b8f9a26", + "hash_cont_tokens": "fe35cfa9c6ca802e" + }, + "harness|hendrycksTest-professional_psychology|5": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "f04dcc40aa61dff1", + "hash_cont_tokens": "f020fbddf72c8652" + }, + "harness|hendrycksTest-public_relations|5": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "924078ed7bcb5027", + "hash_cont_tokens": "568f585a259965c1" + }, + "harness|hendrycksTest-security_studies|5": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "34ef07b913df3c85", + "hash_cont_tokens": "cc6fd7cccd64cd5d" + }, + "harness|hendrycksTest-sociology|5": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "7910870d0c39ccf4", + "hash_cont_tokens": "c3a3bdfd177eed5b" + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "3acc80abbdef09b7", + "hash_cont_tokens": "2568d0e8e36fa959" + }, + "harness|hendrycksTest-virology|5": { + 
"hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "7a0a1a457d1b44b3", + "hash_cont_tokens": "926cf60b0891f374" + }, + "harness|hendrycksTest-world_religions|5": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "fdd0a4eda46435e3", + "hash_cont_tokens": "c525a5de974c1ea3" + }, + "harness|truthfulqa:mc|0": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "e8d34404f8d50781", + "hash_cont_tokens": "c014154380b74b9e" + } + } +} \ No newline at end of file diff --git a/eval-results/TheBloke/OpenAssistant-SFT-7-Llama-30B-HF/results_2023-10-18T12-34-46.585647.json b/eval-results/TheBloke/OpenAssistant-SFT-7-Llama-30B-HF/results_2023-10-18T12-34-46.585647.json new file mode 100644 index 0000000000000000000000000000000000000000..8c23166ae672d1aac09523c00ca29c4778dec5cb --- /dev/null +++ b/eval-results/TheBloke/OpenAssistant-SFT-7-Llama-30B-HF/results_2023-10-18T12-34-46.585647.json @@ -0,0 +1,107 @@ +{ + "config_general": { + "model_name": "TheBloke/OpenAssistant-SFT-7-Llama-30B-HF", + "model_sha": "a7a2306b9a63de2c545f35b24735f4540baf5903", + "model_size": "60.65 GB", + "model_dtype": "torch.float16", + "lighteval_sha": "0f318ecf002208468154899217b3ba7c6ae09374", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|drop|3": { + "em": 0.30463506711409394, + "em_stderr": 0.004713418382367835, + "f1": 0.3681375838926183, + "f1_stderr": 0.0046109589189275765 + }, + "harness|gsm8k|5": { + "acc": 0.2979529946929492, + "acc_stderr": 0.012597932232914508 + }, + "harness|winogrande|5": { + "acc": 0.7861089187056038, + "acc_stderr": 0.011524466954090248 + }, + "all": { + "em": 0.30463506711409394, + "em_stderr": 0.004713418382367835, + "f1": 0.3681375838926183, + "f1_stderr": 0.0046109589189275765, + "acc": 0.5420309566992765, + "acc_stderr": 0.012061199593502377 + } + }, + "versions": { + "harness|drop|3": 1, + "harness|gsm8k|5": 0, + "harness|winogrande|5": 0, + "all": 0 + }, + "config_tasks": { + "harness|drop": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|drop|3": { + "hashes": { + "hash_examples": "1d27416e8324e9a3", + "hash_full_prompts": "a5513ff9a741b385", + "hash_input_tokens": "f70227603c1b1bfe", + "hash_cont_tokens": "d7bcbd8f96cf1234" + }, + "truncated": 1263, + "non-truncated": 8273, + "padded": 0, + "non-padded": 9536, + "effective_few_shots": 3.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "e3d5b3003c52b880", + "hash_cont_tokens": "6e4fecb081c44c14" + }, + "truncated": 0, + "non-truncated": 1319, + "padded": 0, + "non-padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "5be2b0947cee07a9", + "hash_cont_tokens": "f08975ad6f2d5864" + }, + "truncated": 0, + "non-truncated": 2534, + "padded": 2432, + "non-padded": 102, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "9b4d8993161e637d", + "hash_full_prompts": "08215e527b7e60a5", + "hash_input_tokens": 
"ce9af2df9f2847fa", + "hash_cont_tokens": "d2a83d5cccd9683f" + }, + "total_evaluation_time_secondes": "17719.584295272827", + "truncated": 1263, + "non-truncated": 12126, + "padded": 2432, + "non-padded": 10957, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/TheBloke/OpenOrca-Platypus2-13B-GPTQ/results_2023-08-31T16-41-28.579874.json b/eval-results/TheBloke/OpenOrca-Platypus2-13B-GPTQ/results_2023-08-31T16-41-28.579874.json new file mode 100644 index 0000000000000000000000000000000000000000..7f5338c0233f26562b29bb56101d6eda00d8779f --- /dev/null +++ b/eval-results/TheBloke/OpenOrca-Platypus2-13B-GPTQ/results_2023-08-31T16-41-28.579874.json @@ -0,0 +1,1366 @@ +{ + "config_general": { + "model_name": "TheBloke/OpenOrca-Platypus2-13B-GPTQ", + "model_sha": "0fa9a56066656fbc94e3ec088bc900fd1d4d38e8", + "model_dtype": "None", + "lighteval_sha": "4b9a38c2102259daeb17d1d0294fc2b4ce7f4e63", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|arc:challenge|25": { + "acc": 0.575938566552901, + "acc_stderr": 0.014441889627464394, + "acc_norm": 0.6254266211604096, + "acc_norm_stderr": 0.01414419347189345 + }, + "harness|hellaswag|10": { + "acc": 0.6208922525393348, + "acc_stderr": 0.004841734453506665, + "acc_norm": 0.826727743477395, + "acc_norm_stderr": 0.003777089607095471 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.27, + "acc_stderr": 0.0446196043338474, + "acc_norm": 0.27, + "acc_norm_stderr": 0.0446196043338474 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.4962962962962963, + "acc_stderr": 0.04319223625811331, + "acc_norm": 0.4962962962962963, + "acc_norm_stderr": 0.04319223625811331 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.6052631578947368, + "acc_stderr": 0.039777499346220734, + "acc_norm": 0.6052631578947368, + "acc_norm_stderr": 0.039777499346220734 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.57, + "acc_stderr": 0.049756985195624284, + "acc_norm": 0.57, + "acc_norm_stderr": 0.049756985195624284 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.6415094339622641, + "acc_stderr": 0.029514703583981762, + "acc_norm": 0.6415094339622641, + "acc_norm_stderr": 0.029514703583981762 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.6666666666666666, + "acc_stderr": 0.039420826399272135, + "acc_norm": 0.6666666666666666, + "acc_norm_stderr": 0.039420826399272135 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.4, + "acc_stderr": 0.049236596391733084, + "acc_norm": 0.4, + "acc_norm_stderr": 0.049236596391733084 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.41, + "acc_stderr": 0.04943110704237102, + "acc_norm": 0.41, + "acc_norm_stderr": 0.04943110704237102 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.32, + "acc_stderr": 0.04688261722621504, + "acc_norm": 0.32, + "acc_norm_stderr": 0.04688261722621504 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.5606936416184971, + "acc_stderr": 0.03784271932887467, + "acc_norm": 0.5606936416184971, + "acc_norm_stderr": 0.03784271932887467 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.3333333333333333, + "acc_stderr": 0.04690650298201942, + "acc_norm": 0.3333333333333333, + "acc_norm_stderr": 0.04690650298201942 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.68, + "acc_stderr": 0.04688261722621505, + "acc_norm": 0.68, + 
"acc_norm_stderr": 0.04688261722621505 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.48936170212765956, + "acc_stderr": 0.03267862331014063, + "acc_norm": 0.48936170212765956, + "acc_norm_stderr": 0.03267862331014063 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.32456140350877194, + "acc_stderr": 0.04404556157374767, + "acc_norm": 0.32456140350877194, + "acc_norm_stderr": 0.04404556157374767 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.4827586206896552, + "acc_stderr": 0.04164188720169377, + "acc_norm": 0.4827586206896552, + "acc_norm_stderr": 0.04164188720169377 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.35978835978835977, + "acc_stderr": 0.024718075944129277, + "acc_norm": 0.35978835978835977, + "acc_norm_stderr": 0.024718075944129277 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.3492063492063492, + "acc_stderr": 0.04263906892795133, + "acc_norm": 0.3492063492063492, + "acc_norm_stderr": 0.04263906892795133 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.46, + "acc_stderr": 0.05009082659620332, + "acc_norm": 0.46, + "acc_norm_stderr": 0.05009082659620332 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.6774193548387096, + "acc_stderr": 0.026593084516572277, + "acc_norm": 0.6774193548387096, + "acc_norm_stderr": 0.026593084516572277 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.43349753694581283, + "acc_stderr": 0.03486731727419872, + "acc_norm": 0.43349753694581283, + "acc_norm_stderr": 0.03486731727419872 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.64, + "acc_stderr": 0.04824181513244218, + "acc_norm": 0.64, + "acc_norm_stderr": 0.04824181513244218 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.7575757575757576, + "acc_stderr": 0.03346409881055953, + "acc_norm": 0.7575757575757576, + "acc_norm_stderr": 0.03346409881055953 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.7575757575757576, + "acc_stderr": 0.030532892233932026, + "acc_norm": 0.7575757575757576, + "acc_norm_stderr": 0.030532892233932026 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.844559585492228, + "acc_stderr": 0.026148483469153314, + "acc_norm": 0.844559585492228, + "acc_norm_stderr": 0.026148483469153314 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.6205128205128205, + "acc_stderr": 0.024603626924097417, + "acc_norm": 0.6205128205128205, + "acc_norm_stderr": 0.024603626924097417 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.34814814814814815, + "acc_stderr": 0.029045600290616258, + "acc_norm": 0.34814814814814815, + "acc_norm_stderr": 0.029045600290616258 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.6050420168067226, + "acc_stderr": 0.03175367846096625, + "acc_norm": 0.6050420168067226, + "acc_norm_stderr": 0.03175367846096625 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.33774834437086093, + "acc_stderr": 0.03861557546255169, + "acc_norm": 0.33774834437086093, + "acc_norm_stderr": 0.03861557546255169 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.8110091743119267, + "acc_stderr": 0.016785481159203627, + "acc_norm": 0.8110091743119267, + "acc_norm_stderr": 0.016785481159203627 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.4398148148148148, + "acc_stderr": 0.03385177976044811, + "acc_norm": 
0.4398148148148148, + "acc_norm_stderr": 0.03385177976044811 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.8284313725490197, + "acc_stderr": 0.02646056956124065, + "acc_norm": 0.8284313725490197, + "acc_norm_stderr": 0.02646056956124065 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.7721518987341772, + "acc_stderr": 0.027303484599069432, + "acc_norm": 0.7721518987341772, + "acc_norm_stderr": 0.027303484599069432 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.6367713004484304, + "acc_stderr": 0.032277904428505, + "acc_norm": 0.6367713004484304, + "acc_norm_stderr": 0.032277904428505 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.6717557251908397, + "acc_stderr": 0.041184385658062976, + "acc_norm": 0.6717557251908397, + "acc_norm_stderr": 0.041184385658062976 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.6942148760330579, + "acc_stderr": 0.042059539338841226, + "acc_norm": 0.6942148760330579, + "acc_norm_stderr": 0.042059539338841226 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.75, + "acc_stderr": 0.04186091791394607, + "acc_norm": 0.75, + "acc_norm_stderr": 0.04186091791394607 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.7055214723926381, + "acc_stderr": 0.03581165790474082, + "acc_norm": 0.7055214723926381, + "acc_norm_stderr": 0.03581165790474082 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.4107142857142857, + "acc_stderr": 0.04669510663875191, + "acc_norm": 0.4107142857142857, + "acc_norm_stderr": 0.04669510663875191 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.7281553398058253, + "acc_stderr": 0.044052680241409216, + "acc_norm": 0.7281553398058253, + "acc_norm_stderr": 0.044052680241409216 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.8333333333333334, + "acc_stderr": 0.024414947304543674, + "acc_norm": 0.8333333333333334, + "acc_norm_stderr": 0.024414947304543674 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.59, + "acc_stderr": 0.049431107042371025, + "acc_norm": 0.59, + "acc_norm_stderr": 0.049431107042371025 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.7828863346104725, + "acc_stderr": 0.01474312539482329, + "acc_norm": 0.7828863346104725, + "acc_norm_stderr": 0.01474312539482329 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.6445086705202312, + "acc_stderr": 0.025770292082977254, + "acc_norm": 0.6445086705202312, + "acc_norm_stderr": 0.025770292082977254 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.4793296089385475, + "acc_stderr": 0.016708205559996137, + "acc_norm": 0.4793296089385475, + "acc_norm_stderr": 0.016708205559996137 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.6372549019607843, + "acc_stderr": 0.0275300784471103, + "acc_norm": 0.6372549019607843, + "acc_norm_stderr": 0.0275300784471103 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.6945337620578779, + "acc_stderr": 0.026160584450140446, + "acc_norm": 0.6945337620578779, + "acc_norm_stderr": 0.026160584450140446 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.7037037037037037, + "acc_stderr": 0.025407197798890162, + "acc_norm": 0.7037037037037037, + "acc_norm_stderr": 0.025407197798890162 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.45390070921985815, + "acc_stderr": 0.02970045324729148, + "acc_norm": 0.45390070921985815, + "acc_norm_stderr": 0.02970045324729148 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 
0.4491525423728814, + "acc_stderr": 0.012704030518851484, + "acc_norm": 0.4491525423728814, + "acc_norm_stderr": 0.012704030518851484 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.5514705882352942, + "acc_stderr": 0.030211479609121596, + "acc_norm": 0.5514705882352942, + "acc_norm_stderr": 0.030211479609121596 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.576797385620915, + "acc_stderr": 0.01998780976948206, + "acc_norm": 0.576797385620915, + "acc_norm_stderr": 0.01998780976948206 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.6363636363636364, + "acc_stderr": 0.046075820907199756, + "acc_norm": 0.6363636363636364, + "acc_norm_stderr": 0.046075820907199756 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.6408163265306123, + "acc_stderr": 0.03071356045510849, + "acc_norm": 0.6408163265306123, + "acc_norm_stderr": 0.03071356045510849 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.7213930348258707, + "acc_stderr": 0.031700561834973086, + "acc_norm": 0.7213930348258707, + "acc_norm_stderr": 0.031700561834973086 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.82, + "acc_stderr": 0.038612291966536934, + "acc_norm": 0.82, + "acc_norm_stderr": 0.038612291966536934 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.4939759036144578, + "acc_stderr": 0.03892212195333045, + "acc_norm": 0.4939759036144578, + "acc_norm_stderr": 0.03892212195333045 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.7777777777777778, + "acc_stderr": 0.03188578017686397, + "acc_norm": 0.7777777777777778, + "acc_norm_stderr": 0.03188578017686397 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.3635250917992656, + "mc1_stderr": 0.01683886288396583, + "mc2": 0.519328776659898, + "mc2_stderr": 0.015584387403745438 + }, + "all": { + "acc": 0.5860014845107272, + "acc_stderr": 0.03409238663132619, + "acc_norm": 0.5903290022318386, + "acc_norm_stderr": 0.03406929610586886, + "mc1": 0.3635250917992656, + "mc1_stderr": 0.01683886288396583, + "mc2": 0.519328776659898, + "mc2_stderr": 0.015584387403745438 + } + }, + "versions": { + "harness|arc:challenge|25": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + 
"harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "all": 0 + }, + "config_tasks": { + "harness|arc:challenge": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + 
"harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task" + }, + "summary_tasks": { + "harness|arc:challenge|25": { + "hashes": { + "hash_examples": "17b0cae357c0259e", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "3722289b79076c44", + "hash_cont_tokens": "3ced177a9740ab72" + }, + "truncated": 0, + "non-truncated": 4687, + "padded": 4687, + "non-padded": 0, + "effective_few_shots": 25.0, + "num_truncated_few_shots": 0 + }, + "harness|hellaswag|10": { + "hashes": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "ececd684171f1ef2", + "hash_cont_tokens": "736cbacfc627c9ce" + }, + "truncated": 0, + "non-truncated": 40168, + "padded": 40113, + "non-padded": 55, + "effective_few_shots": 10.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hashes": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "c54ff61ad0273dd7", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-anatomy|5": { + "hashes": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "be31a1e22aef5f90", + "hash_cont_tokens": "f11971a765cb609f" + }, + "truncated": 0, + "non-truncated": 540, + "padded": 540, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-astronomy|5": { + "hashes": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "277a7b1fad566940", + 
"hash_cont_tokens": "d2d9cf5534b74b0b" + }, + "truncated": 0, + "non-truncated": 608, + "padded": 608, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-business_ethics|5": { + "hashes": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "ba552605bc116de5", + "hash_cont_tokens": "9d8617775e7afb7e" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hashes": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "428c7563d0b98ab9", + "hash_cont_tokens": "8a729845cf844415" + }, + "truncated": 0, + "non-truncated": 1060, + "padded": 1060, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_biology|5": { + "hashes": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "da036601573942e2", + "hash_cont_tokens": "875cde3af7a0ee14" + }, + "truncated": 0, + "non-truncated": 576, + "padded": 576, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_chemistry|5": { + "hashes": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "94e0196d6aded13d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_computer_science|5": { + "hashes": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "6e4d0f4a8d36690b", + "hash_cont_tokens": "258d18b5a76e9d51" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_mathematics|5": { + "hashes": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "614054d17109a25d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_medicine|5": { + "hashes": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "081bb2b524defd1c", + "hash_cont_tokens": "41f6ee2445154160" + }, + "truncated": 0, + "non-truncated": 692, + "padded": 692, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_physics|5": { + "hashes": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "5421d9a1af86cbd4", + "hash_cont_tokens": "f7b8097afc16a47c" + }, + "truncated": 0, + "non-truncated": 408, + "padded": 408, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-computer_security|5": { + "hashes": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "5e6b70ecb333cf18", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 
5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hashes": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "c2ef11a87264ceed", + "hash_cont_tokens": "aa0e8bc655f2f641" + }, + "truncated": 0, + "non-truncated": 940, + "padded": 940, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-econometrics|5": { + "hashes": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "ecaccd912a4c3978", + "hash_cont_tokens": "69114fe474fd53fa" + }, + "truncated": 0, + "non-truncated": 456, + "padded": 456, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hashes": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "1590c84291399be8", + "hash_cont_tokens": "2425a3f084a591ef" + }, + "truncated": 0, + "non-truncated": 580, + "padded": 580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hashes": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "3269597f715b0da1", + "hash_cont_tokens": "76b2fa379520c907" + }, + "truncated": 0, + "non-truncated": 1512, + "padded": 1512, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-formal_logic|5": { + "hashes": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "a2800d20f3ab8d7c", + "hash_cont_tokens": "b515d408b1bdf6f5" + }, + "truncated": 0, + "non-truncated": 504, + "padded": 504, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-global_facts|5": { + "hashes": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "94ed44b3772505ad", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_biology|5": { + "hashes": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "24423acb928db768", + "hash_cont_tokens": "935dc99247031e33" + }, + "truncated": 0, + "non-truncated": 1240, + "padded": 1240, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hashes": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "831ff35c474e5cef", + "hash_cont_tokens": "85f7f7d7ac099657" + }, + "truncated": 0, + "non-truncated": 812, + "padded": 812, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hashes": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "a20a96b44dcc5b30", + "hash_cont_tokens": "d41d04de2e5e5d4b" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hashes": { + "hash_examples": 
"11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "5002f4ac8b1562ca", + "hash_cont_tokens": "674fc454bdc5ac93" + }, + "truncated": 0, + "non-truncated": 660, + "padded": 656, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_geography|5": { + "hashes": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "7c5547c7da5bc793", + "hash_cont_tokens": "03a5012b916274ea" + }, + "truncated": 0, + "non-truncated": 792, + "padded": 792, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hashes": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "f62991cb6a496b05", + "hash_cont_tokens": "587dad76855b6265" + }, + "truncated": 0, + "non-truncated": 772, + "padded": 772, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hashes": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "4cef2aff6e3d59ed", + "hash_cont_tokens": "c583432ad27fcfe0" + }, + "truncated": 0, + "non-truncated": 1560, + "padded": 1560, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hashes": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "6e2577ea4082ed2b", + "hash_cont_tokens": "84745da13334a4b5" + }, + "truncated": 0, + "non-truncated": 1080, + "padded": 1080, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hashes": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "c5fc9aeb1079c8e4", + "hash_cont_tokens": "f47f041de50333b9" + }, + "truncated": 0, + "non-truncated": 952, + "padded": 952, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_physics|5": { + "hashes": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "555fc385cffa84ca", + "hash_cont_tokens": "05f39a5a580500e1" + }, + "truncated": 0, + "non-truncated": 604, + "padded": 604, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hashes": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "febd23cbf9973b7f", + "hash_cont_tokens": "8181ae2e48363b69" + }, + "truncated": 0, + "non-truncated": 2180, + "padded": 2180, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hashes": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "400e55b56ee6fbd7", + "hash_cont_tokens": "6d11e1c9a9d46862" + }, + "truncated": 0, + "non-truncated": 864, + "padded": 864, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hashes": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + 
"hash_input_tokens": "c639cce12a46ebad", + "hash_cont_tokens": "cdd0b3dc06d933e5" + }, + "truncated": 0, + "non-truncated": 816, + "padded": 816, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hashes": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "b9762065cce6f3a6", + "hash_cont_tokens": "8e94e84c0b1d140d" + }, + "truncated": 0, + "non-truncated": 948, + "padded": 948, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_aging|5": { + "hashes": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "541a75f071dcf579", + "hash_cont_tokens": "142a4a8a1138a214" + }, + "truncated": 0, + "non-truncated": 892, + "padded": 892, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_sexuality|5": { + "hashes": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "04269e5c5a257dd9", + "hash_cont_tokens": "bc54813e809b796d" + }, + "truncated": 0, + "non-truncated": 524, + "padded": 524, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-international_law|5": { + "hashes": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "d93ba9d9d38e4397", + "hash_cont_tokens": "79e75724ab447f67" + }, + "truncated": 0, + "non-truncated": 484, + "padded": 484, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-jurisprudence|5": { + "hashes": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "9eeaccd2698b4f5a", + "hash_cont_tokens": "e3a8cd951b6e3469" + }, + "truncated": 0, + "non-truncated": 432, + "padded": 432, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hashes": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "b4f08f544f2b7576", + "hash_cont_tokens": "ec2a22eed7584a34" + }, + "truncated": 0, + "non-truncated": 652, + "padded": 648, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-machine_learning|5": { + "hashes": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "900c2a51f1174b9f", + "hash_cont_tokens": "2ed2183b9bdf6b00" + }, + "truncated": 0, + "non-truncated": 448, + "padded": 448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-management|5": { + "hashes": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "6b36efb4689c6eca", + "hash_cont_tokens": "a01d6d39a83c4597" + }, + "truncated": 0, + "non-truncated": 412, + "padded": 412, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-marketing|5": { + "hashes": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "2aaac78a0cfed47a", + "hash_cont_tokens": "6aeaed4d823c98aa" + }, + "truncated": 0, + "non-truncated": 936, + "padded": 936, + "non-padded": 0, + 
"effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-medical_genetics|5": { + "hashes": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "886ca823b41c094a", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-miscellaneous|5": { + "hashes": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "72fd71de7675e7d0", + "hash_cont_tokens": "9b0ab02a64603081" + }, + "truncated": 0, + "non-truncated": 3132, + "padded": 3132, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_disputes|5": { + "hashes": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "f3ca0dd8e7a1eb09", + "hash_cont_tokens": "91fb99cbc39ad638" + }, + "truncated": 0, + "non-truncated": 1384, + "padded": 1354, + "non-padded": 30, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hashes": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "3e793631e951f23c", + "hash_cont_tokens": "fdfb0c61160424af" + }, + "truncated": 0, + "non-truncated": 3580, + "padded": 3580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-nutrition|5": { + "hashes": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "59753c2144ea93af", + "hash_cont_tokens": "793bad98a4990ca2" + }, + "truncated": 0, + "non-truncated": 1224, + "padded": 1224, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-philosophy|5": { + "hashes": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "bd8d3dbed15a8c34", + "hash_cont_tokens": "9f6ff69d23a48783" + }, + "truncated": 0, + "non-truncated": 1244, + "padded": 1244, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-prehistory|5": { + "hashes": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "3573cd87facbb7c5", + "hash_cont_tokens": "af786994f8c0cec8" + }, + "truncated": 0, + "non-truncated": 1296, + "padded": 1296, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_accounting|5": { + "hashes": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "17e721bc1a7cbb47", + "hash_cont_tokens": "37734a01ffbfc9c8" + }, + "truncated": 0, + "non-truncated": 1128, + "padded": 1128, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_law|5": { + "hashes": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "c9f7583fff66d361", + "hash_cont_tokens": "2e590029ef41fbcd" + }, + "truncated": 0, + "non-truncated": 6136, + "padded": 6136, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_medicine|5": { + "hashes": { + "hash_examples": 
"b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "40a933f829116f8d", + "hash_cont_tokens": "faf445de2faeb578" + }, + "truncated": 0, + "non-truncated": 1088, + "padded": 1088, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_psychology|5": { + "hashes": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "0dfb73a8eb3f692c", + "hash_cont_tokens": "640c8dab253ca811" + }, + "truncated": 0, + "non-truncated": 2448, + "padded": 2448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-public_relations|5": { + "hashes": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "1710c6ba4c9f3cbd", + "hash_cont_tokens": "b51d8363b9d664e5" + }, + "truncated": 0, + "non-truncated": 440, + "padded": 440, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-security_studies|5": { + "hashes": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "32a03f1f22a6e103", + "hash_cont_tokens": "12f3db94ad7a571a" + }, + "truncated": 0, + "non-truncated": 980, + "padded": 980, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-sociology|5": { + "hashes": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "828999f7624cbe7e", + "hash_cont_tokens": "c3a3bdfd177eed5b" + }, + "truncated": 0, + "non-truncated": 804, + "padded": 804, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hashes": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "42054621e718dbee", + "hash_cont_tokens": "e93f00105a26e30c" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-virology|5": { + "hashes": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "6c4f0aa4dc859c04", + "hash_cont_tokens": "42d667fb2f670b76" + }, + "truncated": 0, + "non-truncated": 664, + "padded": 664, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-world_religions|5": { + "hashes": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "6c75d44e092ff24f", + "hash_cont_tokens": "fcea00b906601945" + }, + "truncated": 0, + "non-truncated": 684, + "padded": 684, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|truthfulqa:mc|0": { + "hashes": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "2738d7ed7075faa7", + "hash_cont_tokens": "d83e6d4f7eacf9cd" + }, + "truncated": 0, + "non-truncated": 9996, + "padded": 9996, + "non-padded": 0, + "effective_few_shots": 0.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "d84d18e9a963753d", + "hash_full_prompts": "12b540783521a8e6", + "hash_input_tokens": "5c73a7dce6ccf737", + "hash_cont_tokens": "ff40ec7eb62e0c4a" + }, + "total_evaluation_time_secondes": 
"9977.666625261307", + "truncated": 0, + "non-truncated": 111019, + "padded": 110926, + "non-padded": 93, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/TheBloke/OpenOrca-Platypus2-13B-GPTQ/results_2023-09-16T19-40-18.805309.json b/eval-results/TheBloke/OpenOrca-Platypus2-13B-GPTQ/results_2023-09-16T19-40-18.805309.json new file mode 100644 index 0000000000000000000000000000000000000000..459331b2eb8ce6d316aaededfe515f33eacdee56 --- /dev/null +++ b/eval-results/TheBloke/OpenOrca-Platypus2-13B-GPTQ/results_2023-09-16T19-40-18.805309.json @@ -0,0 +1,107 @@ +{ + "config_general": { + "model_name": "TheBloke/OpenOrca-Platypus2-13B-GPTQ", + "model_sha": "235539f9d7eba419d4525f571c5d4df371c2d6f0", + "model_size": "6.84 GB", + "model_dtype": "None", + "lighteval_sha": "0f318ecf002208468154899217b3ba7c6ae09374", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|drop|3": { + "em": 0.00776006711409396, + "em_stderr": 0.0008986296432392665, + "f1": 0.09614198825503374, + "f1_stderr": 0.001960302320596267 + }, + "harness|gsm8k|5": { + "acc": 0.09401061410159212, + "acc_stderr": 0.008038819818872464 + }, + "harness|winogrande|5": { + "acc": 0.7679558011049724, + "acc_stderr": 0.01186414969182794 + }, + "all": { + "em": 0.00776006711409396, + "em_stderr": 0.0008986296432392665, + "f1": 0.09614198825503374, + "f1_stderr": 0.001960302320596267, + "acc": 0.43098320760328224, + "acc_stderr": 0.009951484755350203 + } + }, + "versions": { + "harness|drop|3": 1, + "harness|gsm8k|5": 0, + "harness|winogrande|5": 0, + "all": 0 + }, + "config_tasks": { + "harness|drop": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|drop|3": { + "hashes": { + "hash_examples": "1d27416e8324e9a3", + "hash_full_prompts": "a5513ff9a741b385", + "hash_input_tokens": "42076f0efbb50aa6", + "hash_cont_tokens": "04450374ab199b29" + }, + "truncated": 3, + "non-truncated": 9533, + "padded": 0, + "non-padded": 9536, + "effective_few_shots": 3.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "bda342e47b5099b2", + "hash_cont_tokens": "88260c7c898951f8" + }, + "truncated": 0, + "non-truncated": 1319, + "padded": 0, + "non-padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "c0bedf98cb040854", + "hash_cont_tokens": "f08975ad6f2d5864" + }, + "truncated": 0, + "non-truncated": 2534, + "padded": 2432, + "non-padded": 102, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "9b4d8993161e637d", + "hash_full_prompts": "08215e527b7e60a5", + "hash_input_tokens": "a12f3e3c934bd78b", + "hash_cont_tokens": "e6af07bebd69769e" + }, + "total_evaluation_time_secondes": "6554.239681959152", + "truncated": 3, + "non-truncated": 13386, + "padded": 2432, + "non-padded": 10957, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/TheBloke/OpenOrcaxOpenChat-Preview2-13B-GPTQ/results_2023-08-30T22-06-48.097340.json b/eval-results/TheBloke/OpenOrcaxOpenChat-Preview2-13B-GPTQ/results_2023-08-30T22-06-48.097340.json new file 
mode 100644 index 0000000000000000000000000000000000000000..6141d8a633c506efeecbd68f0fdf782a720027b4 --- /dev/null +++ b/eval-results/TheBloke/OpenOrcaxOpenChat-Preview2-13B-GPTQ/results_2023-08-30T22-06-48.097340.json @@ -0,0 +1,1366 @@ +{ + "config_general": { + "model_name": "TheBloke/OpenOrcaxOpenChat-Preview2-13B-GPTQ", + "model_sha": "ec9eb4f471b5bb6a7e5e505369628586c0c72252", + "model_dtype": "None", + "lighteval_sha": "4b9a38c2102259daeb17d1d0294fc2b4ce7f4e63", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|arc:challenge|25": { + "acc": 0.591296928327645, + "acc_stderr": 0.014365750345427001, + "acc_norm": 0.6126279863481229, + "acc_norm_stderr": 0.01423587248790987 + }, + "harness|hellaswag|10": { + "acc": 0.6223859788886676, + "acc_stderr": 0.004837995637638539, + "acc_norm": 0.821449910376419, + "acc_norm_stderr": 0.0038219244335487754 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.29, + "acc_stderr": 0.04560480215720684, + "acc_norm": 0.29, + "acc_norm_stderr": 0.04560480215720684 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.48148148148148145, + "acc_stderr": 0.043163785995113245, + "acc_norm": 0.48148148148148145, + "acc_norm_stderr": 0.043163785995113245 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.5789473684210527, + "acc_stderr": 0.040179012759817494, + "acc_norm": 0.5789473684210527, + "acc_norm_stderr": 0.040179012759817494 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.61, + "acc_stderr": 0.04902071300001975, + "acc_norm": 0.61, + "acc_norm_stderr": 0.04902071300001975 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.6113207547169811, + "acc_stderr": 0.030000485448675986, + "acc_norm": 0.6113207547169811, + "acc_norm_stderr": 0.030000485448675986 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.625, + "acc_stderr": 0.04048439222695598, + "acc_norm": 0.625, + "acc_norm_stderr": 0.04048439222695598 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.35, + "acc_stderr": 0.0479372485441102, + "acc_norm": 0.35, + "acc_norm_stderr": 0.0479372485441102 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.47, + "acc_stderr": 0.050161355804659205, + "acc_norm": 0.47, + "acc_norm_stderr": 0.050161355804659205 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.32, + "acc_stderr": 0.04688261722621505, + "acc_norm": 0.32, + "acc_norm_stderr": 0.04688261722621505 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.5722543352601156, + "acc_stderr": 0.03772446857518026, + "acc_norm": 0.5722543352601156, + "acc_norm_stderr": 0.03772446857518026 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.3137254901960784, + "acc_stderr": 0.04617034827006717, + "acc_norm": 0.3137254901960784, + "acc_norm_stderr": 0.04617034827006717 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.71, + "acc_stderr": 0.04560480215720685, + "acc_norm": 0.71, + "acc_norm_stderr": 0.04560480215720685 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.4595744680851064, + "acc_stderr": 0.03257901482099835, + "acc_norm": 0.4595744680851064, + "acc_norm_stderr": 0.03257901482099835 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.3157894736842105, + "acc_stderr": 0.04372748290278007, + "acc_norm": 0.3157894736842105, + "acc_norm_stderr": 0.04372748290278007 + }, + "harness|hendrycksTest-electrical_engineering|5": { + 
"acc": 0.5103448275862069, + "acc_stderr": 0.04165774775728763, + "acc_norm": 0.5103448275862069, + "acc_norm_stderr": 0.04165774775728763 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.3333333333333333, + "acc_stderr": 0.024278568024307695, + "acc_norm": 0.3333333333333333, + "acc_norm_stderr": 0.024278568024307695 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.3253968253968254, + "acc_stderr": 0.041905964388711366, + "acc_norm": 0.3253968253968254, + "acc_norm_stderr": 0.041905964388711366 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.4, + "acc_stderr": 0.049236596391733084, + "acc_norm": 0.4, + "acc_norm_stderr": 0.049236596391733084 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.6967741935483871, + "acc_stderr": 0.026148685930671746, + "acc_norm": 0.6967741935483871, + "acc_norm_stderr": 0.026148685930671746 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.45320197044334976, + "acc_stderr": 0.035025446508458714, + "acc_norm": 0.45320197044334976, + "acc_norm_stderr": 0.035025446508458714 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.57, + "acc_stderr": 0.04975698519562427, + "acc_norm": 0.57, + "acc_norm_stderr": 0.04975698519562427 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.7151515151515152, + "acc_stderr": 0.03524390844511781, + "acc_norm": 0.7151515151515152, + "acc_norm_stderr": 0.03524390844511781 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.7575757575757576, + "acc_stderr": 0.030532892233932026, + "acc_norm": 0.7575757575757576, + "acc_norm_stderr": 0.030532892233932026 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.8082901554404145, + "acc_stderr": 0.028408953626245265, + "acc_norm": 0.8082901554404145, + "acc_norm_stderr": 0.028408953626245265 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.5641025641025641, + "acc_stderr": 0.025141801511177498, + "acc_norm": 0.5641025641025641, + "acc_norm_stderr": 0.025141801511177498 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.31851851851851853, + "acc_stderr": 0.028406533090608463, + "acc_norm": 0.31851851851851853, + "acc_norm_stderr": 0.028406533090608463 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.6176470588235294, + "acc_stderr": 0.03156663099215416, + "acc_norm": 0.6176470588235294, + "acc_norm_stderr": 0.03156663099215416 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.3443708609271523, + "acc_stderr": 0.038796870240733264, + "acc_norm": 0.3443708609271523, + "acc_norm_stderr": 0.038796870240733264 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.7559633027522936, + "acc_stderr": 0.018415286351416402, + "acc_norm": 0.7559633027522936, + "acc_norm_stderr": 0.018415286351416402 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.4351851851851852, + "acc_stderr": 0.03381200005643525, + "acc_norm": 0.4351851851851852, + "acc_norm_stderr": 0.03381200005643525 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.7696078431372549, + "acc_stderr": 0.029554292605695066, + "acc_norm": 0.7696078431372549, + "acc_norm_stderr": 0.029554292605695066 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.7763713080168776, + "acc_stderr": 0.027123298205229966, + "acc_norm": 0.7763713080168776, + "acc_norm_stderr": 0.027123298205229966 + }, + 
"harness|hendrycksTest-human_aging|5": { + "acc": 0.6636771300448431, + "acc_stderr": 0.031708824268455005, + "acc_norm": 0.6636771300448431, + "acc_norm_stderr": 0.031708824268455005 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.7022900763358778, + "acc_stderr": 0.04010358942462203, + "acc_norm": 0.7022900763358778, + "acc_norm_stderr": 0.04010358942462203 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.7355371900826446, + "acc_stderr": 0.04026187527591207, + "acc_norm": 0.7355371900826446, + "acc_norm_stderr": 0.04026187527591207 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.7962962962962963, + "acc_stderr": 0.03893542518824847, + "acc_norm": 0.7962962962962963, + "acc_norm_stderr": 0.03893542518824847 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.6809815950920245, + "acc_stderr": 0.03661997551073836, + "acc_norm": 0.6809815950920245, + "acc_norm_stderr": 0.03661997551073836 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.36607142857142855, + "acc_stderr": 0.0457237235873743, + "acc_norm": 0.36607142857142855, + "acc_norm_stderr": 0.0457237235873743 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.7475728155339806, + "acc_stderr": 0.04301250399690879, + "acc_norm": 0.7475728155339806, + "acc_norm_stderr": 0.04301250399690879 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.8418803418803419, + "acc_stderr": 0.02390232554956041, + "acc_norm": 0.8418803418803419, + "acc_norm_stderr": 0.02390232554956041 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.6, + "acc_stderr": 0.04923659639173309, + "acc_norm": 0.6, + "acc_norm_stderr": 0.04923659639173309 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.7637292464878672, + "acc_stderr": 0.015190473717037497, + "acc_norm": 0.7637292464878672, + "acc_norm_stderr": 0.015190473717037497 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.6473988439306358, + "acc_stderr": 0.0257228022008958, + "acc_norm": 0.6473988439306358, + "acc_norm_stderr": 0.0257228022008958 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.47374301675977654, + "acc_stderr": 0.01669942767278477, + "acc_norm": 0.47374301675977654, + "acc_norm_stderr": 0.01669942767278477 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.6470588235294118, + "acc_stderr": 0.02736359328468496, + "acc_norm": 0.6470588235294118, + "acc_norm_stderr": 0.02736359328468496 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.6495176848874598, + "acc_stderr": 0.027098652621301754, + "acc_norm": 0.6495176848874598, + "acc_norm_stderr": 0.027098652621301754 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.6759259259259259, + "acc_stderr": 0.02604176620271716, + "acc_norm": 0.6759259259259259, + "acc_norm_stderr": 0.02604176620271716 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.46808510638297873, + "acc_stderr": 0.029766675075873866, + "acc_norm": 0.46808510638297873, + "acc_norm_stderr": 0.029766675075873866 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.4348109517601043, + "acc_stderr": 0.012661233805616292, + "acc_norm": 0.4348109517601043, + "acc_norm_stderr": 0.012661233805616292 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.5257352941176471, + "acc_stderr": 0.03033257809455502, + "acc_norm": 0.5257352941176471, + "acc_norm_stderr": 0.03033257809455502 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.5931372549019608, + "acc_stderr": 
0.019873802005061173, + "acc_norm": 0.5931372549019608, + "acc_norm_stderr": 0.019873802005061173 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.6181818181818182, + "acc_stderr": 0.046534298079135075, + "acc_norm": 0.6181818181818182, + "acc_norm_stderr": 0.046534298079135075 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.6938775510204082, + "acc_stderr": 0.029504896454595957, + "acc_norm": 0.6938775510204082, + "acc_norm_stderr": 0.029504896454595957 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.7512437810945274, + "acc_stderr": 0.030567675938916714, + "acc_norm": 0.7512437810945274, + "acc_norm_stderr": 0.030567675938916714 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.79, + "acc_stderr": 0.040936018074033256, + "acc_norm": 0.79, + "acc_norm_stderr": 0.040936018074033256 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.45180722891566266, + "acc_stderr": 0.03874371556587953, + "acc_norm": 0.45180722891566266, + "acc_norm_stderr": 0.03874371556587953 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.7953216374269005, + "acc_stderr": 0.030944459778533214, + "acc_norm": 0.7953216374269005, + "acc_norm_stderr": 0.030944459778533214 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.3623011015911873, + "mc1_stderr": 0.016826646897262255, + "mc2": 0.5021851197251754, + "mc2_stderr": 0.015571255623835292 + }, + "all": { + "acc": 0.579449026070526, + "acc_stderr": 0.034083790596487896, + "acc_norm": 0.5831845343672756, + "acc_norm_stderr": 0.03406436773103694, + "mc1": 0.3623011015911873, + "mc1_stderr": 0.016826646897262255, + "mc2": 0.5021851197251754, + "mc2_stderr": 0.015571255623835292 + } + }, + "versions": { + "harness|arc:challenge|25": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + 
"harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "all": 0 + }, + "config_tasks": { + "harness|arc:challenge": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness 
task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task" + }, + "summary_tasks": { + "harness|arc:challenge|25": { + "hashes": { + "hash_examples": "17b0cae357c0259e", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "3722289b79076c44", + "hash_cont_tokens": "3ced177a9740ab72" + }, + "truncated": 0, + "non-truncated": 4687, + "padded": 4687, + "non-padded": 0, + "effective_few_shots": 25.0, + "num_truncated_few_shots": 0 + }, + "harness|hellaswag|10": { + "hashes": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "ececd684171f1ef2", + "hash_cont_tokens": "736cbacfc627c9ce" + }, + "truncated": 0, + "non-truncated": 40168, + "padded": 40113, + "non-padded": 55, + "effective_few_shots": 10.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hashes": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "c54ff61ad0273dd7", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-anatomy|5": { + "hashes": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "be31a1e22aef5f90", + "hash_cont_tokens": "f11971a765cb609f" + }, + "truncated": 0, + "non-truncated": 540, + "padded": 540, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-astronomy|5": { + "hashes": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "277a7b1fad566940", + "hash_cont_tokens": "d2d9cf5534b74b0b" + }, + "truncated": 0, + "non-truncated": 608, + "padded": 608, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-business_ethics|5": { + "hashes": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "ba552605bc116de5", + "hash_cont_tokens": "9d8617775e7afb7e" + }, + "truncated": 0, + "non-truncated": 400, + 
"padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hashes": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "428c7563d0b98ab9", + "hash_cont_tokens": "8a729845cf844415" + }, + "truncated": 0, + "non-truncated": 1060, + "padded": 1060, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_biology|5": { + "hashes": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "da036601573942e2", + "hash_cont_tokens": "875cde3af7a0ee14" + }, + "truncated": 0, + "non-truncated": 576, + "padded": 576, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_chemistry|5": { + "hashes": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "94e0196d6aded13d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_computer_science|5": { + "hashes": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "6e4d0f4a8d36690b", + "hash_cont_tokens": "258d18b5a76e9d51" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_mathematics|5": { + "hashes": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "614054d17109a25d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_medicine|5": { + "hashes": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "081bb2b524defd1c", + "hash_cont_tokens": "41f6ee2445154160" + }, + "truncated": 0, + "non-truncated": 692, + "padded": 692, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_physics|5": { + "hashes": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "5421d9a1af86cbd4", + "hash_cont_tokens": "f7b8097afc16a47c" + }, + "truncated": 0, + "non-truncated": 408, + "padded": 408, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-computer_security|5": { + "hashes": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "5e6b70ecb333cf18", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hashes": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "c2ef11a87264ceed", + "hash_cont_tokens": "aa0e8bc655f2f641" + }, + "truncated": 0, + "non-truncated": 940, + "padded": 940, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-econometrics|5": { 
+ "hashes": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "ecaccd912a4c3978", + "hash_cont_tokens": "69114fe474fd53fa" + }, + "truncated": 0, + "non-truncated": 456, + "padded": 456, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hashes": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "1590c84291399be8", + "hash_cont_tokens": "2425a3f084a591ef" + }, + "truncated": 0, + "non-truncated": 580, + "padded": 580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hashes": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "3269597f715b0da1", + "hash_cont_tokens": "76b2fa379520c907" + }, + "truncated": 0, + "non-truncated": 1512, + "padded": 1512, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-formal_logic|5": { + "hashes": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "a2800d20f3ab8d7c", + "hash_cont_tokens": "b515d408b1bdf6f5" + }, + "truncated": 0, + "non-truncated": 504, + "padded": 504, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-global_facts|5": { + "hashes": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "94ed44b3772505ad", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_biology|5": { + "hashes": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "24423acb928db768", + "hash_cont_tokens": "935dc99247031e33" + }, + "truncated": 0, + "non-truncated": 1240, + "padded": 1240, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hashes": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "831ff35c474e5cef", + "hash_cont_tokens": "85f7f7d7ac099657" + }, + "truncated": 0, + "non-truncated": 812, + "padded": 812, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hashes": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "a20a96b44dcc5b30", + "hash_cont_tokens": "d41d04de2e5e5d4b" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hashes": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "5002f4ac8b1562ca", + "hash_cont_tokens": "674fc454bdc5ac93" + }, + "truncated": 0, + "non-truncated": 660, + "padded": 656, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_geography|5": { + "hashes": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + 
"hash_input_tokens": "7c5547c7da5bc793", + "hash_cont_tokens": "03a5012b916274ea" + }, + "truncated": 0, + "non-truncated": 792, + "padded": 792, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hashes": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "f62991cb6a496b05", + "hash_cont_tokens": "587dad76855b6265" + }, + "truncated": 0, + "non-truncated": 772, + "padded": 772, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hashes": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "4cef2aff6e3d59ed", + "hash_cont_tokens": "c583432ad27fcfe0" + }, + "truncated": 0, + "non-truncated": 1560, + "padded": 1560, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hashes": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "6e2577ea4082ed2b", + "hash_cont_tokens": "84745da13334a4b5" + }, + "truncated": 0, + "non-truncated": 1080, + "padded": 1080, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hashes": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "c5fc9aeb1079c8e4", + "hash_cont_tokens": "f47f041de50333b9" + }, + "truncated": 0, + "non-truncated": 952, + "padded": 952, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_physics|5": { + "hashes": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "555fc385cffa84ca", + "hash_cont_tokens": "05f39a5a580500e1" + }, + "truncated": 0, + "non-truncated": 604, + "padded": 604, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hashes": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "febd23cbf9973b7f", + "hash_cont_tokens": "8181ae2e48363b69" + }, + "truncated": 0, + "non-truncated": 2180, + "padded": 2180, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hashes": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "400e55b56ee6fbd7", + "hash_cont_tokens": "6d11e1c9a9d46862" + }, + "truncated": 0, + "non-truncated": 864, + "padded": 864, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hashes": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "c639cce12a46ebad", + "hash_cont_tokens": "cdd0b3dc06d933e5" + }, + "truncated": 0, + "non-truncated": 816, + "padded": 816, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hashes": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "b9762065cce6f3a6", + "hash_cont_tokens": 
"8e94e84c0b1d140d" + }, + "truncated": 0, + "non-truncated": 948, + "padded": 948, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_aging|5": { + "hashes": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "541a75f071dcf579", + "hash_cont_tokens": "142a4a8a1138a214" + }, + "truncated": 0, + "non-truncated": 892, + "padded": 892, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_sexuality|5": { + "hashes": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "04269e5c5a257dd9", + "hash_cont_tokens": "bc54813e809b796d" + }, + "truncated": 0, + "non-truncated": 524, + "padded": 524, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-international_law|5": { + "hashes": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "d93ba9d9d38e4397", + "hash_cont_tokens": "79e75724ab447f67" + }, + "truncated": 0, + "non-truncated": 484, + "padded": 484, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-jurisprudence|5": { + "hashes": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "9eeaccd2698b4f5a", + "hash_cont_tokens": "e3a8cd951b6e3469" + }, + "truncated": 0, + "non-truncated": 432, + "padded": 432, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hashes": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "b4f08f544f2b7576", + "hash_cont_tokens": "ec2a22eed7584a34" + }, + "truncated": 0, + "non-truncated": 652, + "padded": 648, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-machine_learning|5": { + "hashes": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "900c2a51f1174b9f", + "hash_cont_tokens": "2ed2183b9bdf6b00" + }, + "truncated": 0, + "non-truncated": 448, + "padded": 448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-management|5": { + "hashes": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "6b36efb4689c6eca", + "hash_cont_tokens": "a01d6d39a83c4597" + }, + "truncated": 0, + "non-truncated": 412, + "padded": 412, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-marketing|5": { + "hashes": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "2aaac78a0cfed47a", + "hash_cont_tokens": "6aeaed4d823c98aa" + }, + "truncated": 0, + "non-truncated": 936, + "padded": 936, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-medical_genetics|5": { + "hashes": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "886ca823b41c094a", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + 
"harness|hendrycksTest-miscellaneous|5": { + "hashes": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "72fd71de7675e7d0", + "hash_cont_tokens": "9b0ab02a64603081" + }, + "truncated": 0, + "non-truncated": 3132, + "padded": 3132, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_disputes|5": { + "hashes": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "f3ca0dd8e7a1eb09", + "hash_cont_tokens": "91fb99cbc39ad638" + }, + "truncated": 0, + "non-truncated": 1384, + "padded": 1354, + "non-padded": 30, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hashes": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "3e793631e951f23c", + "hash_cont_tokens": "fdfb0c61160424af" + }, + "truncated": 0, + "non-truncated": 3580, + "padded": 3580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-nutrition|5": { + "hashes": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "59753c2144ea93af", + "hash_cont_tokens": "793bad98a4990ca2" + }, + "truncated": 0, + "non-truncated": 1224, + "padded": 1224, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-philosophy|5": { + "hashes": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "bd8d3dbed15a8c34", + "hash_cont_tokens": "9f6ff69d23a48783" + }, + "truncated": 0, + "non-truncated": 1244, + "padded": 1244, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-prehistory|5": { + "hashes": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "3573cd87facbb7c5", + "hash_cont_tokens": "af786994f8c0cec8" + }, + "truncated": 0, + "non-truncated": 1296, + "padded": 1296, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_accounting|5": { + "hashes": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "17e721bc1a7cbb47", + "hash_cont_tokens": "37734a01ffbfc9c8" + }, + "truncated": 0, + "non-truncated": 1128, + "padded": 1128, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_law|5": { + "hashes": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "c9f7583fff66d361", + "hash_cont_tokens": "2e590029ef41fbcd" + }, + "truncated": 0, + "non-truncated": 6136, + "padded": 6136, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_medicine|5": { + "hashes": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "40a933f829116f8d", + "hash_cont_tokens": "faf445de2faeb578" + }, + "truncated": 0, + "non-truncated": 1088, + "padded": 1088, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_psychology|5": { + "hashes": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + 
"hash_input_tokens": "0dfb73a8eb3f692c", + "hash_cont_tokens": "640c8dab253ca811" + }, + "truncated": 0, + "non-truncated": 2448, + "padded": 2448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-public_relations|5": { + "hashes": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "1710c6ba4c9f3cbd", + "hash_cont_tokens": "b51d8363b9d664e5" + }, + "truncated": 0, + "non-truncated": 440, + "padded": 440, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-security_studies|5": { + "hashes": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "32a03f1f22a6e103", + "hash_cont_tokens": "12f3db94ad7a571a" + }, + "truncated": 0, + "non-truncated": 980, + "padded": 980, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-sociology|5": { + "hashes": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "828999f7624cbe7e", + "hash_cont_tokens": "c3a3bdfd177eed5b" + }, + "truncated": 0, + "non-truncated": 804, + "padded": 804, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hashes": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "42054621e718dbee", + "hash_cont_tokens": "e93f00105a26e30c" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-virology|5": { + "hashes": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "6c4f0aa4dc859c04", + "hash_cont_tokens": "42d667fb2f670b76" + }, + "truncated": 0, + "non-truncated": 664, + "padded": 664, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-world_religions|5": { + "hashes": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "6c75d44e092ff24f", + "hash_cont_tokens": "fcea00b906601945" + }, + "truncated": 0, + "non-truncated": 684, + "padded": 684, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|truthfulqa:mc|0": { + "hashes": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "2738d7ed7075faa7", + "hash_cont_tokens": "d83e6d4f7eacf9cd" + }, + "truncated": 0, + "non-truncated": 9996, + "padded": 9996, + "non-padded": 0, + "effective_few_shots": 0.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "d84d18e9a963753d", + "hash_full_prompts": "12b540783521a8e6", + "hash_input_tokens": "5c73a7dce6ccf737", + "hash_cont_tokens": "ff40ec7eb62e0c4a" + }, + "total_evaluation_time_secondes": "9972.957553386688", + "truncated": 0, + "non-truncated": 111019, + "padded": 110926, + "non-padded": 93, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/TheBloke/OpenOrcaxOpenChat-Preview2-13B-GPTQ/results_2023-10-22T10-53-17.967443.json b/eval-results/TheBloke/OpenOrcaxOpenChat-Preview2-13B-GPTQ/results_2023-10-22T10-53-17.967443.json new file mode 100644 index 
0000000000000000000000000000000000000000..d11f4339ba497dfb2532b6ac639274e0052b0d72 --- /dev/null +++ b/eval-results/TheBloke/OpenOrcaxOpenChat-Preview2-13B-GPTQ/results_2023-10-22T10-53-17.967443.json @@ -0,0 +1,107 @@ +{ + "config_general": { + "model_name": "TheBloke/OpenOrcaxOpenChat-Preview2-13B-GPTQ", + "model_sha": "cd25093038f1c9d735c39e0b65b1ffd8dc6e9d80", + "model_size": "6.84 GB", + "model_dtype": "None", + "lighteval_sha": "0f318ecf002208468154899217b3ba7c6ae09374", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|drop|3": { + "em": 0.004823825503355705, + "em_stderr": 0.0007095539645563046, + "f1": 0.08351929530201369, + "f1_stderr": 0.0017605531187545353 + }, + "harness|gsm8k|5": { + "acc": 0.1243366186504928, + "acc_stderr": 0.009088880962028442 + }, + "harness|winogrande|5": { + "acc": 0.771112865035517, + "acc_stderr": 0.011807360224025395 + }, + "all": { + "em": 0.004823825503355705, + "em_stderr": 0.0007095539645563046, + "f1": 0.08351929530201369, + "f1_stderr": 0.0017605531187545353, + "acc": 0.4477247418430049, + "acc_stderr": 0.010448120593026917 + } + }, + "versions": { + "harness|drop|3": 1, + "harness|gsm8k|5": 0, + "harness|winogrande|5": 0, + "all": 0 + }, + "config_tasks": { + "harness|drop": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|drop|3": { + "hashes": { + "hash_examples": "1d27416e8324e9a3", + "hash_full_prompts": "a5513ff9a741b385", + "hash_input_tokens": "42076f0efbb50aa6", + "hash_cont_tokens": "b92f285c0e6fee66" + }, + "truncated": 3, + "non-truncated": 9533, + "padded": 0, + "non-padded": 9536, + "effective_few_shots": 3.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "bda342e47b5099b2", + "hash_cont_tokens": "828f276042907c9b" + }, + "truncated": 0, + "non-truncated": 1319, + "padded": 0, + "non-padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "c0bedf98cb040854", + "hash_cont_tokens": "f08975ad6f2d5864" + }, + "truncated": 0, + "non-truncated": 2534, + "padded": 2432, + "non-padded": 102, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "9b4d8993161e637d", + "hash_full_prompts": "08215e527b7e60a5", + "hash_input_tokens": "a12f3e3c934bd78b", + "hash_cont_tokens": "b6b7a8b4d5e69273" + }, + "total_evaluation_time_secondes": "6986.181921720505", + "truncated": 3, + "non-truncated": 13386, + "padded": 2432, + "non-padded": 10957, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/TheBloke/Orca-2-13B-GPTQ/results_2023-11-25T03-42-21.410226.json b/eval-results/TheBloke/Orca-2-13B-GPTQ/results_2023-11-25T03-42-21.410226.json new file mode 100644 index 0000000000000000000000000000000000000000..850fb7c4091de667e842a28d70719597bcb4bcac --- /dev/null +++ b/eval-results/TheBloke/Orca-2-13B-GPTQ/results_2023-11-25T03-42-21.410226.json @@ -0,0 +1,1435 @@ +{ + "config_general": { + "lighteval_sha": "9ffc410f6c40b8cfefe7167cb47aefe69ced61e1", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + 
"job_id": "", + "start_time": 739758.740612878, + "end_time": 762447.364627219, + "total_evaluation_time_secondes": "22688.62401434104", + "model_name": "TheBloke/Orca-2-13B-GPTQ", + "model_sha": "2fc627e11b197c7d563eeea9c4338c2adc8e2c93", + "model_dtype": "None", + "model_size": "6.84 GB" + }, + "results": { + "harness|arc:challenge|25": { + "acc": 0.5614334470989761, + "acc_stderr": 0.01450068261821286, + "acc_norm": 0.5981228668941979, + "acc_norm_stderr": 0.014327268614578274 + }, + "harness|hellaswag|10": { + "acc": 0.6037641904003187, + "acc_stderr": 0.004881148866874181, + "acc_norm": 0.7911770563632743, + "acc_norm_stderr": 0.004056369096954941 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.34, + "acc_stderr": 0.04760952285695236, + "acc_norm": 0.34, + "acc_norm_stderr": 0.04760952285695236 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.6074074074074074, + "acc_stderr": 0.04218506215368879, + "acc_norm": 0.6074074074074074, + "acc_norm_stderr": 0.04218506215368879 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.7302631578947368, + "acc_stderr": 0.03611780560284898, + "acc_norm": 0.7302631578947368, + "acc_norm_stderr": 0.03611780560284898 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.68, + "acc_stderr": 0.04688261722621505, + "acc_norm": 0.68, + "acc_norm_stderr": 0.04688261722621505 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.6150943396226415, + "acc_stderr": 0.029946498567699948, + "acc_norm": 0.6150943396226415, + "acc_norm_stderr": 0.029946498567699948 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.6388888888888888, + "acc_stderr": 0.040166600304512336, + "acc_norm": 0.6388888888888888, + "acc_norm_stderr": 0.040166600304512336 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.41, + "acc_stderr": 0.04943110704237102, + "acc_norm": 0.41, + "acc_norm_stderr": 0.04943110704237102 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.45, + "acc_stderr": 0.05, + "acc_norm": 0.45, + "acc_norm_stderr": 0.05 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.36, + "acc_stderr": 0.048241815132442176, + "acc_norm": 0.36, + "acc_norm_stderr": 0.048241815132442176 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.5375722543352601, + "acc_stderr": 0.03801685104524458, + "acc_norm": 0.5375722543352601, + "acc_norm_stderr": 0.03801685104524458 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.3627450980392157, + "acc_stderr": 0.04784060704105655, + "acc_norm": 0.3627450980392157, + "acc_norm_stderr": 0.04784060704105655 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.72, + "acc_stderr": 0.04512608598542127, + "acc_norm": 0.72, + "acc_norm_stderr": 0.04512608598542127 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.548936170212766, + "acc_stderr": 0.032529096196131965, + "acc_norm": 0.548936170212766, + "acc_norm_stderr": 0.032529096196131965 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.30701754385964913, + "acc_stderr": 0.0433913832257986, + "acc_norm": 0.30701754385964913, + "acc_norm_stderr": 0.0433913832257986 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.5379310344827586, + "acc_stderr": 0.04154659671707548, + "acc_norm": 0.5379310344827586, + "acc_norm_stderr": 0.04154659671707548 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.37037037037037035, + "acc_stderr": 0.0248708152510571, + "acc_norm": 0.37037037037037035, + 
"acc_norm_stderr": 0.0248708152510571 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.3253968253968254, + "acc_stderr": 0.041905964388711366, + "acc_norm": 0.3253968253968254, + "acc_norm_stderr": 0.041905964388711366 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.39, + "acc_stderr": 0.04902071300001975, + "acc_norm": 0.39, + "acc_norm_stderr": 0.04902071300001975 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.7161290322580646, + "acc_stderr": 0.02564938106302926, + "acc_norm": 0.7161290322580646, + "acc_norm_stderr": 0.02564938106302926 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.4482758620689655, + "acc_stderr": 0.034991131376767445, + "acc_norm": 0.4482758620689655, + "acc_norm_stderr": 0.034991131376767445 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.64, + "acc_stderr": 0.04824181513244218, + "acc_norm": 0.64, + "acc_norm_stderr": 0.04824181513244218 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.7575757575757576, + "acc_stderr": 0.03346409881055953, + "acc_norm": 0.7575757575757576, + "acc_norm_stderr": 0.03346409881055953 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.7171717171717171, + "acc_stderr": 0.032087795587867514, + "acc_norm": 0.7171717171717171, + "acc_norm_stderr": 0.032087795587867514 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.8393782383419689, + "acc_stderr": 0.026499057701397443, + "acc_norm": 0.8393782383419689, + "acc_norm_stderr": 0.026499057701397443 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.6025641025641025, + "acc_stderr": 0.024811920017903836, + "acc_norm": 0.6025641025641025, + "acc_norm_stderr": 0.024811920017903836 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.3148148148148148, + "acc_stderr": 0.02831753349606648, + "acc_norm": 0.3148148148148148, + "acc_norm_stderr": 0.02831753349606648 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.6050420168067226, + "acc_stderr": 0.031753678460966245, + "acc_norm": 0.6050420168067226, + "acc_norm_stderr": 0.031753678460966245 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.2980132450331126, + "acc_stderr": 0.037345356767871984, + "acc_norm": 0.2980132450331126, + "acc_norm_stderr": 0.037345356767871984 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.8036697247706422, + "acc_stderr": 0.017030719339154343, + "acc_norm": 0.8036697247706422, + "acc_norm_stderr": 0.017030719339154343 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.4675925925925926, + "acc_stderr": 0.03402801581358966, + "acc_norm": 0.4675925925925926, + "acc_norm_stderr": 0.03402801581358966 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.7843137254901961, + "acc_stderr": 0.028867431449849313, + "acc_norm": 0.7843137254901961, + "acc_norm_stderr": 0.028867431449849313 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.7974683544303798, + "acc_stderr": 0.02616056824660146, + "acc_norm": 0.7974683544303798, + "acc_norm_stderr": 0.02616056824660146 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.672645739910314, + "acc_stderr": 0.031493846709941306, + "acc_norm": 0.672645739910314, + "acc_norm_stderr": 0.031493846709941306 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.7175572519083969, + "acc_stderr": 0.03948406125768361, + "acc_norm": 0.7175572519083969, + 
"acc_norm_stderr": 0.03948406125768361 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.743801652892562, + "acc_stderr": 0.03984979653302872, + "acc_norm": 0.743801652892562, + "acc_norm_stderr": 0.03984979653302872 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.7870370370370371, + "acc_stderr": 0.0395783547198098, + "acc_norm": 0.7870370370370371, + "acc_norm_stderr": 0.0395783547198098 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.7300613496932515, + "acc_stderr": 0.03487825168497892, + "acc_norm": 0.7300613496932515, + "acc_norm_stderr": 0.03487825168497892 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.35714285714285715, + "acc_stderr": 0.04547960999764377, + "acc_norm": 0.35714285714285715, + "acc_norm_stderr": 0.04547960999764377 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.7572815533980582, + "acc_stderr": 0.04245022486384495, + "acc_norm": 0.7572815533980582, + "acc_norm_stderr": 0.04245022486384495 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.8589743589743589, + "acc_stderr": 0.02280138253459753, + "acc_norm": 0.8589743589743589, + "acc_norm_stderr": 0.02280138253459753 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.56, + "acc_stderr": 0.04988876515698589, + "acc_norm": 0.56, + "acc_norm_stderr": 0.04988876515698589 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.7777777777777778, + "acc_stderr": 0.014866821664709595, + "acc_norm": 0.7777777777777778, + "acc_norm_stderr": 0.014866821664709595 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.6791907514450867, + "acc_stderr": 0.025131000233647897, + "acc_norm": 0.6791907514450867, + "acc_norm_stderr": 0.025131000233647897 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.20782122905027933, + "acc_stderr": 0.013570248325081347, + "acc_norm": 0.20782122905027933, + "acc_norm_stderr": 0.013570248325081347 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.6339869281045751, + "acc_stderr": 0.02758281141515961, + "acc_norm": 0.6339869281045751, + "acc_norm_stderr": 0.02758281141515961 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.662379421221865, + "acc_stderr": 0.026858825879488544, + "acc_norm": 0.662379421221865, + "acc_norm_stderr": 0.026858825879488544 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.6697530864197531, + "acc_stderr": 0.026168298456732846, + "acc_norm": 0.6697530864197531, + "acc_norm_stderr": 0.026168298456732846 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.450354609929078, + "acc_stderr": 0.029680105565029036, + "acc_norm": 0.450354609929078, + "acc_norm_stderr": 0.029680105565029036 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.4348109517601043, + "acc_stderr": 0.012661233805616302, + "acc_norm": 0.4348109517601043, + "acc_norm_stderr": 0.012661233805616302 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.5625, + "acc_stderr": 0.030134614954403924, + "acc_norm": 0.5625, + "acc_norm_stderr": 0.030134614954403924 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.6045751633986928, + "acc_stderr": 0.019780465954777508, + "acc_norm": 0.6045751633986928, + "acc_norm_stderr": 0.019780465954777508 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.6454545454545455, + "acc_stderr": 0.045820048415054174, + "acc_norm": 0.6454545454545455, + "acc_norm_stderr": 0.045820048415054174 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 
0.7142857142857143, + "acc_stderr": 0.0289205832206756, + "acc_norm": 0.7142857142857143, + "acc_norm_stderr": 0.0289205832206756 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.7761194029850746, + "acc_stderr": 0.029475250236017204, + "acc_norm": 0.7761194029850746, + "acc_norm_stderr": 0.029475250236017204 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.83, + "acc_stderr": 0.03775251680686371, + "acc_norm": 0.83, + "acc_norm_stderr": 0.03775251680686371 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.4939759036144578, + "acc_stderr": 0.03892212195333045, + "acc_norm": 0.4939759036144578, + "acc_norm_stderr": 0.03892212195333045 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.7777777777777778, + "acc_stderr": 0.03188578017686398, + "acc_norm": 0.7777777777777778, + "acc_norm_stderr": 0.03188578017686398 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.38555691554467564, + "mc1_stderr": 0.01703883901059167, + "mc2": 0.5514098320774886, + "mc2_stderr": 0.0160327733300155 + }, + "harness|winogrande|5": { + "acc": 0.7663772691397001, + "acc_stderr": 0.011892194477183525 + }, + "harness|drop|3": { + "em": 0.42606963087248323, + "em_stderr": 0.0050641847856105855, + "f1": 0.5302139261744996, + "f1_stderr": 0.004659796001509701 + }, + "harness|gsm8k|5": { + "acc": 0.155420773313116, + "acc_stderr": 0.009979689409499152 + }, + "all": { + "acc": 0.5887851314518572, + "acc_stderr": 0.032958137391722146, + "acc_norm": 0.5969185976587905, + "acc_norm_stderr": 0.03368773395313244, + "mc1": 0.38555691554467564, + "mc1_stderr": 0.01703883901059167, + "mc2": 0.5514098320774886, + "mc2_stderr": 0.0160327733300155, + "em": 0.42606963087248323, + "em_stderr": 0.0050641847856105855, + "f1": 0.5302139261744996, + "f1_stderr": 0.004659796001509701 + } + }, + "versions": { + "all": 0, + "harness|arc:challenge|25": 0, + "harness|drop|3": 1, + "harness|gsm8k|5": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + 
"harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "harness|winogrande|5": 0 + }, + "config_tasks": { + "harness|arc:challenge": "LM Harness task", + "harness|drop": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + 
"harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|arc:challenge|25": { + "hashes": { + "hash_examples": "17b0cae357c0259e", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "ca48d52265c0051f", + "hash_cont_tokens": "e8abf848493b50f7" + }, + "truncated": 0, + "non_truncated": 1172, + "padded": 4687, + "non_padded": 0, + "effective_few_shots": 25.0, + "num_truncated_few_shots": 0 + }, + "harness|hellaswag|10": { + "hashes": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "4975ded0ed31f702", + "hash_cont_tokens": "9fe0a5c42e1532db" + }, + "truncated": 0, + "non_truncated": 10042, + "padded": 40019, + "non_padded": 149, + "effective_few_shots": 10.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hashes": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "8ff523ec326d5d55", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-anatomy|5": { + "hashes": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "742bd6a389a8ef40", + "hash_cont_tokens": "f11971a765cb609f" + }, + "truncated": 0, + "non_truncated": 135, + "padded": 540, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-astronomy|5": { + "hashes": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "aa9743839c83bd9f", + "hash_cont_tokens": "440a970fadecdc7b" + }, + "truncated": 0, + "non_truncated": 152, + "padded": 608, + 
"non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-business_ethics|5": { + "hashes": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "60f6ed52e2a2987a", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hashes": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "6080d9f3c5930be0", + "hash_cont_tokens": "7ecd60c25b9bfe5b" + }, + "truncated": 0, + "non_truncated": 265, + "padded": 1060, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_biology|5": { + "hashes": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "873319724ad65589", + "hash_cont_tokens": "875cde3af7a0ee14" + }, + "truncated": 0, + "non_truncated": 144, + "padded": 564, + "non_padded": 12, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_chemistry|5": { + "hashes": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "8366d04d12b154a7", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_computer_science|5": { + "hashes": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "1724a282fb269fd7", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_mathematics|5": { + "hashes": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "b7aa815781eae172", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_medicine|5": { + "hashes": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "0003d13e86bc8c1a", + "hash_cont_tokens": "702fb6d82ff0d6ac" + }, + "truncated": 0, + "non_truncated": 173, + "padded": 692, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_physics|5": { + "hashes": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "32b28762dd077c78", + "hash_cont_tokens": "f7b8097afc16a47c" + }, + "truncated": 0, + "non_truncated": 102, + "padded": 404, + "non_padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-computer_security|5": { + "hashes": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "19dd0e1895125d49", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hashes": { 
+ "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "761c7ce187b3338a", + "hash_cont_tokens": "aa0e8bc655f2f641" + }, + "truncated": 0, + "non_truncated": 235, + "padded": 940, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-econometrics|5": { + "hashes": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "dae74024ebc12b2b", + "hash_cont_tokens": "b1cc6e7e9fcd3827" + }, + "truncated": 0, + "non_truncated": 114, + "padded": 456, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hashes": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "5fa8050688a246ed", + "hash_cont_tokens": "2425a3f084a591ef" + }, + "truncated": 0, + "non_truncated": 145, + "padded": 580, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hashes": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "2da3f8d7d1515cc6", + "hash_cont_tokens": "bd87bf0c060fd925" + }, + "truncated": 0, + "non_truncated": 378, + "padded": 1512, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-formal_logic|5": { + "hashes": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "907de61bbe46dada", + "hash_cont_tokens": "eb8932890e0605db" + }, + "truncated": 0, + "non_truncated": 126, + "padded": 504, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-global_facts|5": { + "hashes": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "d7549fe9ac133643", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_biology|5": { + "hashes": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "b449ae8cd622fb96", + "hash_cont_tokens": "1ddcb86d28cde266" + }, + "truncated": 0, + "non_truncated": 310, + "padded": 1240, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hashes": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "a447bd1574b5e26c", + "hash_cont_tokens": "176c8dcff38c5f8f" + }, + "truncated": 0, + "non_truncated": 203, + "padded": 812, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hashes": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "56312a0c3d85ae90", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hashes": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "5002f4ac8b1562ca", + 
"hash_cont_tokens": "674fc454bdc5ac93" + }, + "truncated": 0, + "non_truncated": 165, + "padded": 656, + "non_padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_geography|5": { + "hashes": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "b4f9efd054b0149d", + "hash_cont_tokens": "03a5012b916274ea" + }, + "truncated": 0, + "non_truncated": 198, + "padded": 792, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hashes": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "6e010d01707b5a01", + "hash_cont_tokens": "873d2aab226ba1d8" + }, + "truncated": 0, + "non_truncated": 193, + "padded": 772, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hashes": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "fc1f6e824ba386d7", + "hash_cont_tokens": "c583432ad27fcfe0" + }, + "truncated": 0, + "non_truncated": 390, + "padded": 1560, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hashes": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "3a485a40c8432ece", + "hash_cont_tokens": "d7907b61bcb8c123" + }, + "truncated": 0, + "non_truncated": 270, + "padded": 1080, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hashes": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "a7dd9ca4bbda3752", + "hash_cont_tokens": "f47f041de50333b9" + }, + "truncated": 0, + "non_truncated": 238, + "padded": 952, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_physics|5": { + "hashes": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "d7ea631399a73865", + "hash_cont_tokens": "0d56317b3e5eedb5" + }, + "truncated": 0, + "non_truncated": 151, + "padded": 604, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hashes": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "d12816cf88146011", + "hash_cont_tokens": "09ba1243e7390c0f" + }, + "truncated": 0, + "non_truncated": 545, + "padded": 2180, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hashes": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "9763ecaef4814c21", + "hash_cont_tokens": "9cc29889c3d3f77d" + }, + "truncated": 0, + "non_truncated": 216, + "padded": 864, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hashes": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "c639cce12a46ebad", + "hash_cont_tokens": "cdd0b3dc06d933e5" + }, + "truncated": 0, + "non_truncated": 204, 
+ "padded": 816, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hashes": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "b9762065cce6f3a6", + "hash_cont_tokens": "e02816433ff28daf" + }, + "truncated": 0, + "non_truncated": 237, + "padded": 948, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_aging|5": { + "hashes": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "84157fee0b6d0f3c", + "hash_cont_tokens": "142a4a8a1138a214" + }, + "truncated": 0, + "non_truncated": 223, + "padded": 892, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_sexuality|5": { + "hashes": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "ade303e1ae3c016f", + "hash_cont_tokens": "bc54813e809b796d" + }, + "truncated": 0, + "non_truncated": 131, + "padded": 524, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-international_law|5": { + "hashes": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "e5482e1c23c23d35", + "hash_cont_tokens": "8ea8c5ff76a15bca" + }, + "truncated": 0, + "non_truncated": 121, + "padded": 484, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-jurisprudence|5": { + "hashes": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "4415eeb9bad0507b", + "hash_cont_tokens": "e3a8cd951b6e3469" + }, + "truncated": 0, + "non_truncated": 108, + "padded": 432, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hashes": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "e6b5271422ecbaa8", + "hash_cont_tokens": "3e9e0bdc248fd88a" + }, + "truncated": 0, + "non_truncated": 163, + "padded": 644, + "non_padded": 8, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-machine_learning|5": { + "hashes": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "e719cb83196977d8", + "hash_cont_tokens": "55b12fb138c6a064" + }, + "truncated": 0, + "non_truncated": 112, + "padded": 448, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-management|5": { + "hashes": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "155da0e62b39e804", + "hash_cont_tokens": "a01d6d39a83c4597" + }, + "truncated": 0, + "non_truncated": 103, + "padded": 412, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-marketing|5": { + "hashes": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "38466c242259e6d3", + "hash_cont_tokens": "6aeaed4d823c98aa" + }, + "truncated": 0, + "non_truncated": 234, + "padded": 932, + "non_padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-medical_genetics|5": { + "hashes": { + 
"hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "0dd129e92538a7f6", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-miscellaneous|5": { + "hashes": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "d108a883fc3e022f", + "hash_cont_tokens": "9b0ab02a64603081" + }, + "truncated": 0, + "non_truncated": 783, + "padded": 3132, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_disputes|5": { + "hashes": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "0e7b7df82884a2d5", + "hash_cont_tokens": "3b8bbe9108e55ce9" + }, + "truncated": 0, + "non_truncated": 346, + "padded": 1364, + "non_padded": 20, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hashes": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "7c220f5613cd8426", + "hash_cont_tokens": "3e9bfc0362e97330" + }, + "truncated": 0, + "non_truncated": 895, + "padded": 3580, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-nutrition|5": { + "hashes": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "35de1609a9a763a9", + "hash_cont_tokens": "23b2dc6ee2da4cfc" + }, + "truncated": 0, + "non_truncated": 306, + "padded": 1224, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-philosophy|5": { + "hashes": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "a1dcfa9c80490d06", + "hash_cont_tokens": "9f6ff69d23a48783" + }, + "truncated": 0, + "non_truncated": 311, + "padded": 1244, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-prehistory|5": { + "hashes": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "a091cf645d2415e0", + "hash_cont_tokens": "d6458d743d875837" + }, + "truncated": 0, + "non_truncated": 324, + "padded": 1296, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_accounting|5": { + "hashes": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "e9df32a33f85290c", + "hash_cont_tokens": "922a195f53a35662" + }, + "truncated": 0, + "non_truncated": 282, + "padded": 1128, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_law|5": { + "hashes": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "c9f7583fff66d361", + "hash_cont_tokens": "2e590029ef41fbcd" + }, + "truncated": 0, + "non_truncated": 1534, + "padded": 6136, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_medicine|5": { + "hashes": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "40a933f829116f8d", + "hash_cont_tokens": "7cfee54dbddd5a98" 
+ }, + "truncated": 0, + "non_truncated": 272, + "padded": 1088, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_psychology|5": { + "hashes": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "0f6a92c3a2062b48", + "hash_cont_tokens": "a86677b2a45c20e1" + }, + "truncated": 0, + "non_truncated": 612, + "padded": 2448, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-public_relations|5": { + "hashes": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "29a08e9bfbe9b2f0", + "hash_cont_tokens": "0d756ccaae031757" + }, + "truncated": 0, + "non_truncated": 110, + "padded": 440, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-security_studies|5": { + "hashes": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "32a03f1f22a6e103", + "hash_cont_tokens": "b2229bc2cfbf594b" + }, + "truncated": 0, + "non_truncated": 245, + "padded": 980, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-sociology|5": { + "hashes": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "1de5c52d2b2831d7", + "hash_cont_tokens": "c3a3bdfd177eed5b" + }, + "truncated": 0, + "non_truncated": 201, + "padded": 800, + "non_padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hashes": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "add924961f7f4146", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-virology|5": { + "hashes": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "e0653601c466b1bc", + "hash_cont_tokens": "af8b3658088cb37f" + }, + "truncated": 0, + "non_truncated": 166, + "padded": 664, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-world_religions|5": { + "hashes": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "ac600d612445156d", + "hash_cont_tokens": "060118bef6de4e0a" + }, + "truncated": 0, + "non_truncated": 171, + "padded": 684, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|truthfulqa:mc|0": { + "hashes": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "a03ce28b7fd06aa7", + "hash_cont_tokens": "f5da56a132aab151" + }, + "truncated": 0, + "non_truncated": 817, + "padded": 9996, + "non_padded": 0, + "effective_few_shots": 0.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "72067255e368e24e", + "hash_cont_tokens": "f08975ad6f2d5864" + }, + "truncated": 0, + "non_truncated": 1267, + "padded": 2534, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|drop|3": { + "hashes": { + 
"hash_examples": "1d27416e8324e9a3", + "hash_full_prompts": "a5513ff9a741b385", + "hash_input_tokens": "42076f0efbb50aa6", + "hash_cont_tokens": "28a9713c87d95c3e" + }, + "truncated": 3, + "non_truncated": 9533, + "padded": 0, + "non_padded": 9536, + "effective_few_shots": 3.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "bda342e47b5099b2", + "hash_cont_tokens": "3382d2b8923b38f6" + }, + "truncated": 0, + "non_truncated": 1319, + "padded": 0, + "non_padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "4eb459f19fc0f29d", + "hash_full_prompts": "21653ed56f202b4e", + "hash_input_tokens": "379266f3a5365f9d", + "hash_cont_tokens": "1d31b4423c663dc1" + }, + "truncated": 3, + "non_truncated": 38192, + "padded": 113348, + "non_padded": 11060, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/TheBloke/Planner-7B-fp16/results_2023-07-19T16-47-15.541190.json b/eval-results/TheBloke/Planner-7B-fp16/results_2023-07-19T16-47-15.541190.json new file mode 100644 index 0000000000000000000000000000000000000000..327a6e4fe656339e7fbab66009fcad2dabc3f973 --- /dev/null +++ b/eval-results/TheBloke/Planner-7B-fp16/results_2023-07-19T16-47-15.541190.json @@ -0,0 +1,871 @@ +{ + "results": { + "harness|arc:challenge|25": { + "acc": 0.4761092150170648, + "acc_stderr": 0.014594701798071654, + "acc_norm": 0.5102389078498294, + "acc_norm_stderr": 0.014608326906285012 + }, + "harness|hellaswag|10": { + "acc": 0.575682135032862, + "acc_stderr": 0.004932289405608947, + "acc_norm": 0.77823142800239, + "acc_norm_stderr": 0.004145872091615228 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.26, + "acc_stderr": 0.04408440022768081, + "acc_norm": 0.26, + "acc_norm_stderr": 0.04408440022768081 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.3851851851851852, + "acc_stderr": 0.042039210401562783, + "acc_norm": 0.3851851851851852, + "acc_norm_stderr": 0.042039210401562783 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.34868421052631576, + "acc_stderr": 0.038781398887976104, + "acc_norm": 0.34868421052631576, + "acc_norm_stderr": 0.038781398887976104 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.41, + "acc_stderr": 0.049431107042371025, + "acc_norm": 0.41, + "acc_norm_stderr": 0.049431107042371025 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.35471698113207545, + "acc_stderr": 0.02944517532819959, + "acc_norm": 0.35471698113207545, + "acc_norm_stderr": 0.02944517532819959 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.375, + "acc_stderr": 0.04048439222695598, + "acc_norm": 0.375, + "acc_norm_stderr": 0.04048439222695598 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.3, + "acc_stderr": 0.046056618647183814, + "acc_norm": 0.3, + "acc_norm_stderr": 0.046056618647183814 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.32, + "acc_stderr": 0.04688261722621504, + "acc_norm": 0.32, + "acc_norm_stderr": 0.04688261722621504 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.34, + "acc_stderr": 0.04760952285695236, + "acc_norm": 0.34, + "acc_norm_stderr": 0.04760952285695236 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.32947976878612717, + "acc_stderr": 0.0358390175473641, + "acc_norm": 0.32947976878612717, + 
"acc_norm_stderr": 0.0358390175473641 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.23529411764705882, + "acc_stderr": 0.04220773659171451, + "acc_norm": 0.23529411764705882, + "acc_norm_stderr": 0.04220773659171451 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.45, + "acc_stderr": 0.05, + "acc_norm": 0.45, + "acc_norm_stderr": 0.05 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.3702127659574468, + "acc_stderr": 0.03156564682236784, + "acc_norm": 0.3702127659574468, + "acc_norm_stderr": 0.03156564682236784 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.2631578947368421, + "acc_stderr": 0.04142439719489362, + "acc_norm": 0.2631578947368421, + "acc_norm_stderr": 0.04142439719489362 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.23448275862068965, + "acc_stderr": 0.035306258743465914, + "acc_norm": 0.23448275862068965, + "acc_norm_stderr": 0.035306258743465914 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.26455026455026454, + "acc_stderr": 0.02271746789770862, + "acc_norm": 0.26455026455026454, + "acc_norm_stderr": 0.02271746789770862 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.2619047619047619, + "acc_stderr": 0.03932537680392871, + "acc_norm": 0.2619047619047619, + "acc_norm_stderr": 0.03932537680392871 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.31, + "acc_stderr": 0.04648231987117316, + "acc_norm": 0.31, + "acc_norm_stderr": 0.04648231987117316 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.32903225806451614, + "acc_stderr": 0.026729499068349965, + "acc_norm": 0.32903225806451614, + "acc_norm_stderr": 0.026729499068349965 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.2857142857142857, + "acc_stderr": 0.03178529710642749, + "acc_norm": 0.2857142857142857, + "acc_norm_stderr": 0.03178529710642749 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.33, + "acc_stderr": 0.04725815626252605, + "acc_norm": 0.33, + "acc_norm_stderr": 0.04725815626252605 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.44242424242424244, + "acc_stderr": 0.03878372113711275, + "acc_norm": 0.44242424242424244, + "acc_norm_stderr": 0.03878372113711275 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.3333333333333333, + "acc_stderr": 0.03358618145732522, + "acc_norm": 0.3333333333333333, + "acc_norm_stderr": 0.03358618145732522 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.44559585492227977, + "acc_stderr": 0.0358701498607566, + "acc_norm": 0.44559585492227977, + "acc_norm_stderr": 0.0358701498607566 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.3384615384615385, + "acc_stderr": 0.023991500500313036, + "acc_norm": 0.3384615384615385, + "acc_norm_stderr": 0.023991500500313036 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.25925925925925924, + "acc_stderr": 0.026719240783712173, + "acc_norm": 0.25925925925925924, + "acc_norm_stderr": 0.026719240783712173 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.3319327731092437, + "acc_stderr": 0.030588697013783663, + "acc_norm": 0.3319327731092437, + "acc_norm_stderr": 0.030588697013783663 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.2582781456953642, + "acc_stderr": 0.035737053147634576, + "acc_norm": 0.2582781456953642, + "acc_norm_stderr": 0.035737053147634576 + }, + 
"harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.47889908256880737, + "acc_stderr": 0.02141822475426464, + "acc_norm": 0.47889908256880737, + "acc_norm_stderr": 0.02141822475426464 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.3101851851851852, + "acc_stderr": 0.031546962856566295, + "acc_norm": 0.3101851851851852, + "acc_norm_stderr": 0.031546962856566295 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.3627450980392157, + "acc_stderr": 0.03374499356319355, + "acc_norm": 0.3627450980392157, + "acc_norm_stderr": 0.03374499356319355 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.43037974683544306, + "acc_stderr": 0.03223017195937598, + "acc_norm": 0.43037974683544306, + "acc_norm_stderr": 0.03223017195937598 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.3991031390134529, + "acc_stderr": 0.032867453125679603, + "acc_norm": 0.3991031390134529, + "acc_norm_stderr": 0.032867453125679603 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.3511450381679389, + "acc_stderr": 0.0418644516301375, + "acc_norm": 0.3511450381679389, + "acc_norm_stderr": 0.0418644516301375 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.5206611570247934, + "acc_stderr": 0.04560456086387235, + "acc_norm": 0.5206611570247934, + "acc_norm_stderr": 0.04560456086387235 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.4166666666666667, + "acc_stderr": 0.04766075165356461, + "acc_norm": 0.4166666666666667, + "acc_norm_stderr": 0.04766075165356461 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.4294478527607362, + "acc_stderr": 0.038890666191127216, + "acc_norm": 0.4294478527607362, + "acc_norm_stderr": 0.038890666191127216 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.2767857142857143, + "acc_stderr": 0.042466243366976256, + "acc_norm": 0.2767857142857143, + "acc_norm_stderr": 0.042466243366976256 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.33980582524271846, + "acc_stderr": 0.04689765937278133, + "acc_norm": 0.33980582524271846, + "acc_norm_stderr": 0.04689765937278133 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.47863247863247865, + "acc_stderr": 0.03272616447634954, + "acc_norm": 0.47863247863247865, + "acc_norm_stderr": 0.03272616447634954 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.37, + "acc_stderr": 0.048523658709391, + "acc_norm": 0.37, + "acc_norm_stderr": 0.048523658709391 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.42528735632183906, + "acc_stderr": 0.017679225489431447, + "acc_norm": 0.42528735632183906, + "acc_norm_stderr": 0.017679225489431447 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.3901734104046243, + "acc_stderr": 0.02626167760780665, + "acc_norm": 0.3901734104046243, + "acc_norm_stderr": 0.02626167760780665 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.2424581005586592, + "acc_stderr": 0.014333522059217889, + "acc_norm": 0.2424581005586592, + "acc_norm_stderr": 0.014333522059217889 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.39215686274509803, + "acc_stderr": 0.027956046165424516, + "acc_norm": 0.39215686274509803, + "acc_norm_stderr": 0.027956046165424516 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.3987138263665595, + "acc_stderr": 0.0278093225857745, + "acc_norm": 0.3987138263665595, + "acc_norm_stderr": 0.0278093225857745 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.345679012345679, + "acc_stderr": 
0.026462487777001883, + "acc_norm": 0.345679012345679, + "acc_norm_stderr": 0.026462487777001883 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.2730496453900709, + "acc_stderr": 0.026577860943307857, + "acc_norm": 0.2730496453900709, + "acc_norm_stderr": 0.026577860943307857 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.3011734028683181, + "acc_stderr": 0.011717148751648426, + "acc_norm": 0.3011734028683181, + "acc_norm_stderr": 0.011717148751648426 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.4375, + "acc_stderr": 0.030134614954403924, + "acc_norm": 0.4375, + "acc_norm_stderr": 0.030134614954403924 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.35294117647058826, + "acc_stderr": 0.01933314202079706, + "acc_norm": 0.35294117647058826, + "acc_norm_stderr": 0.01933314202079706 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.4, + "acc_stderr": 0.0469237132203465, + "acc_norm": 0.4, + "acc_norm_stderr": 0.0469237132203465 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.33877551020408164, + "acc_stderr": 0.030299506562154185, + "acc_norm": 0.33877551020408164, + "acc_norm_stderr": 0.030299506562154185 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.46766169154228854, + "acc_stderr": 0.035281314729336065, + "acc_norm": 0.46766169154228854, + "acc_norm_stderr": 0.035281314729336065 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.43, + "acc_stderr": 0.049756985195624284, + "acc_norm": 0.43, + "acc_norm_stderr": 0.049756985195624284 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.3373493975903614, + "acc_stderr": 0.03680783690727581, + "acc_norm": 0.3373493975903614, + "acc_norm_stderr": 0.03680783690727581 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.49122807017543857, + "acc_stderr": 0.038342347441649924, + "acc_norm": 0.49122807017543857, + "acc_norm_stderr": 0.038342347441649924 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.22031823745410037, + "mc1_stderr": 0.014509045171487291, + "mc2": 0.3432950455512755, + "mc2_stderr": 0.013189887366499535 + }, + "all": { + "acc": 0.3628321389404663, + "acc_stderr": 0.03461657401345451, + "acc_norm": 0.36684364717440354, + "acc_norm_stderr": 0.034603475840475184, + "mc1": 0.22031823745410037, + "mc1_stderr": 0.014509045171487291, + "mc2": 0.3432950455512755, + "mc2_stderr": 0.013189887366499535 + } + }, + "versions": { + "harness|arc:challenge|25": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + 
"harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "all": 0 + }, + "config": { + "model_name": "TheBloke/Planner-7B-fp16", + "model_sha": "afb4604a06c8541960fb51240259777764c4ce7e", + "model_dtype": "torch.float16", + "lighteval_sha": "43cff840721bd0214adb4e29236a5e2ca1813937", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null + }, + "task_config": { + "harness|arc:challenge": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + 
"harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task" + }, + "hashes": { + "harness|arc:challenge|25": { + "hash_examples": "fb8c51b1872daeda", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "61571bf68d6d89aa", + "hash_cont_tokens": "8210decc6ff6f7df" + }, + "harness|hellaswag|10": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "29906669b1c7054a", + "hash_cont_tokens": "b3b9e9017afa63af" + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "c54ff61ad0273dd7", + "hash_cont_tokens": "50421e30bef398f9" + }, + "harness|hendrycksTest-anatomy|5": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "be31a1e22aef5f90", + "hash_cont_tokens": "f11971a765cb609f" + }, + "harness|hendrycksTest-astronomy|5": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "277a7b1fad566940", + "hash_cont_tokens": 
"bf30e5d3f48250cb" + }, + "harness|hendrycksTest-business_ethics|5": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "ba552605bc116de5", + "hash_cont_tokens": "bc1dd9b2d995eb61" + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "428c7563d0b98ab9", + "hash_cont_tokens": "890a119624b3b935" + }, + "harness|hendrycksTest-college_biology|5": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "da036601573942e2", + "hash_cont_tokens": "875cde3af7a0ee14" + }, + "harness|hendrycksTest-college_chemistry|5": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "94e0196d6aded13d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "harness|hendrycksTest-college_computer_science|5": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "6e4d0f4a8d36690b", + "hash_cont_tokens": "ffc0fe414cdc4a83" + }, + "harness|hendrycksTest-college_mathematics|5": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "614054d17109a25d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "harness|hendrycksTest-college_medicine|5": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "1d633b3cc0524ba8", + "hash_cont_tokens": "1f88b00d41957d82" + }, + "harness|hendrycksTest-college_physics|5": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "5421d9a1af86cbd4", + "hash_cont_tokens": "f7b8097afc16a47c" + }, + "harness|hendrycksTest-computer_security|5": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "5e6b70ecb333cf18", + "hash_cont_tokens": "50421e30bef398f9" + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "c2ef11a87264ceed", + "hash_cont_tokens": "aa0e8bc655f2f641" + }, + "harness|hendrycksTest-econometrics|5": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "ecaccd912a4c3978", + "hash_cont_tokens": "bfb7e3c3c88313f1" + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "1590c84291399be8", + "hash_cont_tokens": "2425a3f084a591ef" + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "3269597f715b0da1", + "hash_cont_tokens": "f52691aef15a407b" + }, + "harness|hendrycksTest-formal_logic|5": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "a2800d20f3ab8d7c", + "hash_cont_tokens": "f515d598d9c21263" + }, + "harness|hendrycksTest-global_facts|5": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "94ed44b3772505ad", + "hash_cont_tokens": "50421e30bef398f9" + }, + "harness|hendrycksTest-high_school_biology|5": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "24423acb928db768", + "hash_cont_tokens": "bd85a4156a3613ee" + }, + 
"harness|hendrycksTest-high_school_chemistry|5": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "831ff35c474e5cef", + "hash_cont_tokens": "a95c97af1c14e068" + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "8c34e0f2bda77358", + "hash_cont_tokens": "8abfedef914e33c9" + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "f1f73dd687da18d7", + "hash_cont_tokens": "674fc454bdc5ac93" + }, + "harness|hendrycksTest-high_school_geography|5": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "7c5547c7da5bc793", + "hash_cont_tokens": "03a5012b916274ea" + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "f62991cb6a496b05", + "hash_cont_tokens": "a83effb8f76b7d7c" + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "4cef2aff6e3d59ed", + "hash_cont_tokens": "c583432ad27fcfe0" + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "6e2577ea4082ed2b", + "hash_cont_tokens": "24f5dc613660300b" + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "c5fc9aeb1079c8e4", + "hash_cont_tokens": "f47f041de50333b9" + }, + "harness|hendrycksTest-high_school_physics|5": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "555fc385cffa84ca", + "hash_cont_tokens": "ba2efcd283e938cc" + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "febd23cbf9973b7f", + "hash_cont_tokens": "942069cd363844d9" + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "424b02981230ee83", + "hash_cont_tokens": "955ed42b6f7fa019" + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "50c9ff438c85a69e", + "hash_cont_tokens": "cdd0b3dc06d933e5" + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "054824cc474caef5", + "hash_cont_tokens": "9a864184946033ac" + }, + "harness|hendrycksTest-human_aging|5": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "541a75f071dcf579", + "hash_cont_tokens": "142a4a8a1138a214" + }, + "harness|hendrycksTest-human_sexuality|5": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "04269e5c5a257dd9", + "hash_cont_tokens": "bc54813e809b796d" + }, + "harness|hendrycksTest-international_law|5": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": 
"d93ba9d9d38e4397", + "hash_cont_tokens": "dc45b45fcda18e5d" + }, + "harness|hendrycksTest-jurisprudence|5": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "9eeaccd2698b4f5a", + "hash_cont_tokens": "e3a8cd951b6e3469" + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "b4f08f544f2b7576", + "hash_cont_tokens": "1e80dbd30f6453d5" + }, + "harness|hendrycksTest-machine_learning|5": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "900c2a51f1174b9f", + "hash_cont_tokens": "9b37da7777378ca9" + }, + "harness|hendrycksTest-management|5": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "6b36efb4689c6eca", + "hash_cont_tokens": "a01d6d39a83c4597" + }, + "harness|hendrycksTest-marketing|5": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "2aaac78a0cfed47a", + "hash_cont_tokens": "6aeaed4d823c98aa" + }, + "harness|hendrycksTest-medical_genetics|5": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "886ca823b41c094a", + "hash_cont_tokens": "50421e30bef398f9" + }, + "harness|hendrycksTest-miscellaneous|5": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "72fd71de7675e7d0", + "hash_cont_tokens": "9b0ab02a64603081" + }, + "harness|hendrycksTest-moral_disputes|5": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "f3ca0dd8e7a1eb09", + "hash_cont_tokens": "8badf768f7b0467a" + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "3e793631e951f23c", + "hash_cont_tokens": "32ae620376b2bbba" + }, + "harness|hendrycksTest-nutrition|5": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "59753c2144ea93af", + "hash_cont_tokens": "3071def75bacc404" + }, + "harness|hendrycksTest-philosophy|5": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "bd8d3dbed15a8c34", + "hash_cont_tokens": "9f6ff69d23a48783" + }, + "harness|hendrycksTest-prehistory|5": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "3573cd87facbb7c5", + "hash_cont_tokens": "de469d2b981e32a3" + }, + "harness|hendrycksTest-professional_accounting|5": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "17e721bc1a7cbb47", + "hash_cont_tokens": "c46f74d2dfc7b13b" + }, + "harness|hendrycksTest-professional_law|5": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "9178e10bd0763ec4", + "hash_cont_tokens": "2e590029ef41fbcd" + }, + "harness|hendrycksTest-professional_medicine|5": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "f5a22012a54f70ea", + "hash_cont_tokens": "fe35cfa9c6ca802e" + }, + "harness|hendrycksTest-professional_psychology|5": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "0dfb73a8eb3f692c", + "hash_cont_tokens": "f020fbddf72c8652" + }, + 
"harness|hendrycksTest-public_relations|5": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "1710c6ba4c9f3cbd", + "hash_cont_tokens": "568f585a259965c1" + }, + "harness|hendrycksTest-security_studies|5": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "d49711415961ced7", + "hash_cont_tokens": "cc6fd7cccd64cd5d" + }, + "harness|hendrycksTest-sociology|5": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "828999f7624cbe7e", + "hash_cont_tokens": "c3a3bdfd177eed5b" + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "42054621e718dbee", + "hash_cont_tokens": "2568d0e8e36fa959" + }, + "harness|hendrycksTest-virology|5": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "6c4f0aa4dc859c04", + "hash_cont_tokens": "926cf60b0891f374" + }, + "harness|hendrycksTest-world_religions|5": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "6c75d44e092ff24f", + "hash_cont_tokens": "c525a5de974c1ea3" + }, + "harness|truthfulqa:mc|0": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "2738d7ed7075faa7", + "hash_cont_tokens": "c014154380b74b9e" + } + } +} \ No newline at end of file diff --git a/eval-results/TheBloke/Planner-7B-fp16/results_2023-10-21T22-53-17.425716.json b/eval-results/TheBloke/Planner-7B-fp16/results_2023-10-21T22-53-17.425716.json new file mode 100644 index 0000000000000000000000000000000000000000..8fe4972bf3dae06fcefddf852c484cf957a917f6 --- /dev/null +++ b/eval-results/TheBloke/Planner-7B-fp16/results_2023-10-21T22-53-17.425716.json @@ -0,0 +1,107 @@ +{ + "config_general": { + "model_name": "TheBloke/Planner-7B-fp16", + "model_sha": "afb4604a06c8541960fb51240259777764c4ce7e", + "model_size": "12.58 GB", + "model_dtype": "torch.float16", + "lighteval_sha": "0f318ecf002208468154899217b3ba7c6ae09374", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|drop|3": { + "em": 0.0010486577181208054, + "em_stderr": 0.0003314581465219126, + "f1": 0.056186031879194784, + "f1_stderr": 0.0012858243614759428 + }, + "harness|gsm8k|5": { + "acc": 0.0356330553449583, + "acc_stderr": 0.00510610785374419 + }, + "harness|winogrande|5": { + "acc": 0.7142857142857143, + "acc_stderr": 0.012696531870038616 + }, + "all": { + "em": 0.0010486577181208054, + "em_stderr": 0.0003314581465219126, + "f1": 0.056186031879194784, + "f1_stderr": 0.0012858243614759428, + "acc": 0.3749593848153363, + "acc_stderr": 0.008901319861891403 + } + }, + "versions": { + "harness|drop|3": 1, + "harness|gsm8k|5": 0, + "harness|winogrande|5": 0, + "all": 0 + }, + "config_tasks": { + "harness|drop": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|drop|3": { + "hashes": { + "hash_examples": "1d27416e8324e9a3", + "hash_full_prompts": "a5513ff9a741b385", + "hash_input_tokens": "61b608e0b5ceed76", + "hash_cont_tokens": "0cf98d9c102e554c" + }, + "truncated": 1263, + "non-truncated": 8273, + "padded": 0, + "non-padded": 9536, + "effective_few_shots": 3.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + 
"hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "bda342e47b5099b2", + "hash_cont_tokens": "08fa7a519625d524" + }, + "truncated": 0, + "non-truncated": 1319, + "padded": 0, + "non-padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "c0bedf98cb040854", + "hash_cont_tokens": "f08975ad6f2d5864" + }, + "truncated": 0, + "non-truncated": 2534, + "padded": 2432, + "non-padded": 102, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "9b4d8993161e637d", + "hash_full_prompts": "08215e527b7e60a5", + "hash_input_tokens": "80afe720f936f8d2", + "hash_cont_tokens": "fa24476e3f52a08c" + }, + "total_evaluation_time_secondes": "11054.566107988358", + "truncated": 1263, + "non-truncated": 12126, + "padded": 2432, + "non-padded": 10957, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/TheBloke/Platypus-30B-SuperHOT-8K-fp16/results_2023-08-18T16-25-34.320244.json b/eval-results/TheBloke/Platypus-30B-SuperHOT-8K-fp16/results_2023-08-18T16-25-34.320244.json new file mode 100644 index 0000000000000000000000000000000000000000..d9a76da3cde2d295443db7b53078313578c5b9da --- /dev/null +++ b/eval-results/TheBloke/Platypus-30B-SuperHOT-8K-fp16/results_2023-08-18T16-25-34.320244.json @@ -0,0 +1,1365 @@ +{ + "results": { + "harness|arc:challenge|25": { + "acc": 0.21843003412969283, + "acc_stderr": 0.012074291605700987, + "acc_norm": 0.2568259385665529, + "acc_norm_stderr": 0.0127669237941168 + }, + "harness|hellaswag|10": { + "acc": 0.2731527584146584, + "acc_stderr": 0.004446680081493746, + "acc_norm": 0.3082055367456682, + "acc_norm_stderr": 0.004608082815535489 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.22, + "acc_stderr": 0.04163331998932268, + "acc_norm": 0.22, + "acc_norm_stderr": 0.04163331998932268 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.2074074074074074, + "acc_stderr": 0.035025531706783186, + "acc_norm": 0.2074074074074074, + "acc_norm_stderr": 0.035025531706783186 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.20394736842105263, + "acc_stderr": 0.032790004063100515, + "acc_norm": 0.20394736842105263, + "acc_norm_stderr": 0.032790004063100515 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.32, + "acc_stderr": 0.046882617226215034, + "acc_norm": 0.32, + "acc_norm_stderr": 0.046882617226215034 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.22641509433962265, + "acc_stderr": 0.025757559893106748, + "acc_norm": 0.22641509433962265, + "acc_norm_stderr": 0.025757559893106748 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.2361111111111111, + "acc_stderr": 0.03551446610810826, + "acc_norm": 0.2361111111111111, + "acc_norm_stderr": 0.03551446610810826 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.17, + "acc_stderr": 0.03775251680686371, + "acc_norm": 0.17, + "acc_norm_stderr": 0.03775251680686371 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.24, + "acc_stderr": 0.04292346959909283, + "acc_norm": 0.24, + "acc_norm_stderr": 0.04292346959909283 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.2, + "acc_stderr": 0.04020151261036846, + "acc_norm": 0.2, + "acc_norm_stderr": 0.04020151261036846 + }, + 
"harness|hendrycksTest-college_medicine|5": { + "acc": 0.20809248554913296, + "acc_stderr": 0.030952890217749874, + "acc_norm": 0.20809248554913296, + "acc_norm_stderr": 0.030952890217749874 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.23529411764705882, + "acc_stderr": 0.04220773659171453, + "acc_norm": 0.23529411764705882, + "acc_norm_stderr": 0.04220773659171453 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.28, + "acc_stderr": 0.045126085985421276, + "acc_norm": 0.28, + "acc_norm_stderr": 0.045126085985421276 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.25957446808510637, + "acc_stderr": 0.02865917937429232, + "acc_norm": 0.25957446808510637, + "acc_norm_stderr": 0.02865917937429232 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.2543859649122807, + "acc_stderr": 0.040969851398436695, + "acc_norm": 0.2543859649122807, + "acc_norm_stderr": 0.040969851398436695 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.2482758620689655, + "acc_stderr": 0.036001056927277716, + "acc_norm": 0.2482758620689655, + "acc_norm_stderr": 0.036001056927277716 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.20899470899470898, + "acc_stderr": 0.02094048156533486, + "acc_norm": 0.20899470899470898, + "acc_norm_stderr": 0.02094048156533486 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.2777777777777778, + "acc_stderr": 0.040061680838488746, + "acc_norm": 0.2777777777777778, + "acc_norm_stderr": 0.040061680838488746 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.16, + "acc_stderr": 0.03684529491774708, + "acc_norm": 0.16, + "acc_norm_stderr": 0.03684529491774708 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.25483870967741934, + "acc_stderr": 0.024790118459332208, + "acc_norm": 0.25483870967741934, + "acc_norm_stderr": 0.024790118459332208 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.18226600985221675, + "acc_stderr": 0.02716334085964515, + "acc_norm": 0.18226600985221675, + "acc_norm_stderr": 0.02716334085964515 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.28, + "acc_stderr": 0.04512608598542127, + "acc_norm": 0.28, + "acc_norm_stderr": 0.04512608598542127 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.21818181818181817, + "acc_stderr": 0.03225078108306289, + "acc_norm": 0.21818181818181817, + "acc_norm_stderr": 0.03225078108306289 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.18686868686868688, + "acc_stderr": 0.02777253333421898, + "acc_norm": 0.18686868686868688, + "acc_norm_stderr": 0.02777253333421898 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.21243523316062177, + "acc_stderr": 0.02951928261681723, + "acc_norm": 0.21243523316062177, + "acc_norm_stderr": 0.02951928261681723 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.23333333333333334, + "acc_stderr": 0.021444547301560476, + "acc_norm": 0.23333333333333334, + "acc_norm_stderr": 0.021444547301560476 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.2037037037037037, + "acc_stderr": 0.024556172219141265, + "acc_norm": 0.2037037037037037, + "acc_norm_stderr": 0.024556172219141265 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.21428571428571427, + "acc_stderr": 0.026653531596715494, + "acc_norm": 0.21428571428571427, + "acc_norm_stderr": 0.026653531596715494 + }, + 
"harness|hendrycksTest-high_school_physics|5": { + "acc": 0.17218543046357615, + "acc_stderr": 0.030826136961962396, + "acc_norm": 0.17218543046357615, + "acc_norm_stderr": 0.030826136961962396 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.1981651376146789, + "acc_stderr": 0.017090573804217885, + "acc_norm": 0.1981651376146789, + "acc_norm_stderr": 0.017090573804217885 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.2037037037037037, + "acc_stderr": 0.02746740180405799, + "acc_norm": 0.2037037037037037, + "acc_norm_stderr": 0.02746740180405799 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.25, + "acc_stderr": 0.03039153369274154, + "acc_norm": 0.25, + "acc_norm_stderr": 0.03039153369274154 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.28270042194092826, + "acc_stderr": 0.02931281415395592, + "acc_norm": 0.28270042194092826, + "acc_norm_stderr": 0.02931281415395592 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.3094170403587444, + "acc_stderr": 0.031024411740572203, + "acc_norm": 0.3094170403587444, + "acc_norm_stderr": 0.031024411740572203 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.29770992366412213, + "acc_stderr": 0.04010358942462203, + "acc_norm": 0.29770992366412213, + "acc_norm_stderr": 0.04010358942462203 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.24793388429752067, + "acc_stderr": 0.039418975265163025, + "acc_norm": 0.24793388429752067, + "acc_norm_stderr": 0.039418975265163025 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.25, + "acc_stderr": 0.04186091791394607, + "acc_norm": 0.25, + "acc_norm_stderr": 0.04186091791394607 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.24539877300613497, + "acc_stderr": 0.03380939813943354, + "acc_norm": 0.24539877300613497, + "acc_norm_stderr": 0.03380939813943354 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.30357142857142855, + "acc_stderr": 0.04364226155841044, + "acc_norm": 0.30357142857142855, + "acc_norm_stderr": 0.04364226155841044 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.17475728155339806, + "acc_stderr": 0.037601780060266224, + "acc_norm": 0.17475728155339806, + "acc_norm_stderr": 0.037601780060266224 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.2905982905982906, + "acc_stderr": 0.02974504857267404, + "acc_norm": 0.2905982905982906, + "acc_norm_stderr": 0.02974504857267404 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.3, + "acc_stderr": 0.046056618647183814, + "acc_norm": 0.3, + "acc_norm_stderr": 0.046056618647183814 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.2515964240102171, + "acc_stderr": 0.015517322365529619, + "acc_norm": 0.2515964240102171, + "acc_norm_stderr": 0.015517322365529619 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.24277456647398843, + "acc_stderr": 0.023083658586984204, + "acc_norm": 0.24277456647398843, + "acc_norm_stderr": 0.023083658586984204 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.23910614525139665, + "acc_stderr": 0.014265554192331144, + "acc_norm": 0.23910614525139665, + "acc_norm_stderr": 0.014265554192331144 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.22875816993464052, + "acc_stderr": 0.024051029739912255, + "acc_norm": 0.22875816993464052, + "acc_norm_stderr": 0.024051029739912255 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.2347266881028939, + "acc_stderr": 0.024071805887677048, + 
"acc_norm": 0.2347266881028939, + "acc_norm_stderr": 0.024071805887677048 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.21604938271604937, + "acc_stderr": 0.02289916291844581, + "acc_norm": 0.21604938271604937, + "acc_norm_stderr": 0.02289916291844581 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.23049645390070922, + "acc_stderr": 0.025123739226872405, + "acc_norm": 0.23049645390070922, + "acc_norm_stderr": 0.025123739226872405 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.24967405475880053, + "acc_stderr": 0.011054538377832318, + "acc_norm": 0.24967405475880053, + "acc_norm_stderr": 0.011054538377832318 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.1948529411764706, + "acc_stderr": 0.024060599423487428, + "acc_norm": 0.1948529411764706, + "acc_norm_stderr": 0.024060599423487428 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.2630718954248366, + "acc_stderr": 0.017812676542320657, + "acc_norm": 0.2630718954248366, + "acc_norm_stderr": 0.017812676542320657 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.21818181818181817, + "acc_stderr": 0.03955932861795833, + "acc_norm": 0.21818181818181817, + "acc_norm_stderr": 0.03955932861795833 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.17959183673469387, + "acc_stderr": 0.024573293589585637, + "acc_norm": 0.17959183673469387, + "acc_norm_stderr": 0.024573293589585637 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.23880597014925373, + "acc_stderr": 0.030147775935409217, + "acc_norm": 0.23880597014925373, + "acc_norm_stderr": 0.030147775935409217 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.27, + "acc_stderr": 0.0446196043338474, + "acc_norm": 0.27, + "acc_norm_stderr": 0.0446196043338474 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.27710843373493976, + "acc_stderr": 0.034843315926805875, + "acc_norm": 0.27710843373493976, + "acc_norm_stderr": 0.034843315926805875 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.2573099415204678, + "acc_stderr": 0.03352799844161865, + "acc_norm": 0.2573099415204678, + "acc_norm_stderr": 0.03352799844161865 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.2178702570379437, + "mc1_stderr": 0.014450846714123892, + "mc2": 0.471292004765754, + "mc2_stderr": 0.01664156844910162 + }, + "all": { + "acc": 0.23647488823331855, + "acc_stderr": 0.030908567573023033, + "acc_norm": 0.23771978116158754, + "acc_norm_stderr": 0.030923042741200276, + "mc1": 0.2178702570379437, + "mc1_stderr": 0.014450846714123892, + "mc2": 0.471292004765754, + "mc2_stderr": 0.01664156844910162 + } + }, + "versions": { + "harness|arc:challenge|25": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + 
"harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "all": 0 + }, + "config_general": { + "model_name": "TheBloke/Platypus-30B-SuperHOT-8K-fp16", + "model_sha": "e8ac508308911475125252dcf2677fe355dd3700", + "model_dtype": "torch.float16", + "lighteval_sha": "2b9e1cf249accf9b8168101189269701a82bfb9c", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null + }, + "config_tasks": { + "harness|arc:challenge": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + 
"harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task" + }, + "summary_tasks": { + "harness|arc:challenge|25": { + "hashes": { + "hash_examples": "17b0cae357c0259e", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "3722289b79076c44", + "hash_cont_tokens": "8210decc6ff6f7df" + }, + "truncated": 0, + "non-truncated": 4687, + "padded": 4687, + "non-padded": 0, + "effective_few_shots": 25.0, + "num_truncated_few_shots": 0 + }, + "harness|hellaswag|10": { + "hashes": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "ececd684171f1ef2", + "hash_cont_tokens": "b3b9e9017afa63af" + }, + "truncated": 0, + "non-truncated": 40168, + "padded": 40113, + "non-padded": 55, + "effective_few_shots": 10.0, + "num_truncated_few_shots": 0 + }, + 
"harness|hendrycksTest-abstract_algebra|5": { + "hashes": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "c54ff61ad0273dd7", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-anatomy|5": { + "hashes": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "be31a1e22aef5f90", + "hash_cont_tokens": "f11971a765cb609f" + }, + "truncated": 0, + "non-truncated": 540, + "padded": 540, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-astronomy|5": { + "hashes": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "277a7b1fad566940", + "hash_cont_tokens": "bf30e5d3f48250cb" + }, + "truncated": 0, + "non-truncated": 608, + "padded": 608, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-business_ethics|5": { + "hashes": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "ba552605bc116de5", + "hash_cont_tokens": "bc1dd9b2d995eb61" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hashes": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "428c7563d0b98ab9", + "hash_cont_tokens": "890a119624b3b935" + }, + "truncated": 0, + "non-truncated": 1060, + "padded": 1060, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_biology|5": { + "hashes": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "da036601573942e2", + "hash_cont_tokens": "875cde3af7a0ee14" + }, + "truncated": 0, + "non-truncated": 576, + "padded": 576, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_chemistry|5": { + "hashes": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "94e0196d6aded13d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_computer_science|5": { + "hashes": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "6e4d0f4a8d36690b", + "hash_cont_tokens": "ffc0fe414cdc4a83" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_mathematics|5": { + "hashes": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "614054d17109a25d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_medicine|5": { + "hashes": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + 
"hash_input_tokens": "081bb2b524defd1c", + "hash_cont_tokens": "1f88b00d41957d82" + }, + "truncated": 0, + "non-truncated": 692, + "padded": 692, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_physics|5": { + "hashes": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "5421d9a1af86cbd4", + "hash_cont_tokens": "f7b8097afc16a47c" + }, + "truncated": 0, + "non-truncated": 408, + "padded": 408, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-computer_security|5": { + "hashes": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "5e6b70ecb333cf18", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hashes": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "c2ef11a87264ceed", + "hash_cont_tokens": "aa0e8bc655f2f641" + }, + "truncated": 0, + "non-truncated": 940, + "padded": 940, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-econometrics|5": { + "hashes": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "ecaccd912a4c3978", + "hash_cont_tokens": "bfb7e3c3c88313f1" + }, + "truncated": 0, + "non-truncated": 456, + "padded": 456, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hashes": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "1590c84291399be8", + "hash_cont_tokens": "2425a3f084a591ef" + }, + "truncated": 0, + "non-truncated": 580, + "padded": 580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hashes": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "3269597f715b0da1", + "hash_cont_tokens": "f52691aef15a407b" + }, + "truncated": 0, + "non-truncated": 1512, + "padded": 1512, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-formal_logic|5": { + "hashes": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "a2800d20f3ab8d7c", + "hash_cont_tokens": "f515d598d9c21263" + }, + "truncated": 0, + "non-truncated": 504, + "padded": 504, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-global_facts|5": { + "hashes": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "94ed44b3772505ad", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_biology|5": { + "hashes": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "24423acb928db768", + "hash_cont_tokens": "bd85a4156a3613ee" + }, + "truncated": 0, + "non-truncated": 1240, + "padded": 1240, + 
"non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hashes": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "831ff35c474e5cef", + "hash_cont_tokens": "a95c97af1c14e068" + }, + "truncated": 0, + "non-truncated": 812, + "padded": 812, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hashes": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "a20a96b44dcc5b30", + "hash_cont_tokens": "8abfedef914e33c9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hashes": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "5002f4ac8b1562ca", + "hash_cont_tokens": "674fc454bdc5ac93" + }, + "truncated": 0, + "non-truncated": 660, + "padded": 656, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_geography|5": { + "hashes": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "7c5547c7da5bc793", + "hash_cont_tokens": "03a5012b916274ea" + }, + "truncated": 0, + "non-truncated": 792, + "padded": 792, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hashes": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "f62991cb6a496b05", + "hash_cont_tokens": "a83effb8f76b7d7c" + }, + "truncated": 0, + "non-truncated": 772, + "padded": 772, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hashes": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "4cef2aff6e3d59ed", + "hash_cont_tokens": "c583432ad27fcfe0" + }, + "truncated": 0, + "non-truncated": 1560, + "padded": 1560, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hashes": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "6e2577ea4082ed2b", + "hash_cont_tokens": "24f5dc613660300b" + }, + "truncated": 0, + "non-truncated": 1080, + "padded": 1080, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hashes": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "c5fc9aeb1079c8e4", + "hash_cont_tokens": "f47f041de50333b9" + }, + "truncated": 0, + "non-truncated": 952, + "padded": 952, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_physics|5": { + "hashes": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "555fc385cffa84ca", + "hash_cont_tokens": "ba2efcd283e938cc" + }, + "truncated": 0, + "non-truncated": 604, + "padded": 604, + "non-padded": 0, + "effective_few_shots": 5.0, + 
"num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hashes": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "febd23cbf9973b7f", + "hash_cont_tokens": "942069cd363844d9" + }, + "truncated": 0, + "non-truncated": 2180, + "padded": 2180, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hashes": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "400e55b56ee6fbd7", + "hash_cont_tokens": "955ed42b6f7fa019" + }, + "truncated": 0, + "non-truncated": 864, + "padded": 864, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hashes": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "c639cce12a46ebad", + "hash_cont_tokens": "cdd0b3dc06d933e5" + }, + "truncated": 0, + "non-truncated": 816, + "padded": 816, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hashes": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "b9762065cce6f3a6", + "hash_cont_tokens": "9a864184946033ac" + }, + "truncated": 0, + "non-truncated": 948, + "padded": 948, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_aging|5": { + "hashes": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "541a75f071dcf579", + "hash_cont_tokens": "142a4a8a1138a214" + }, + "truncated": 0, + "non-truncated": 892, + "padded": 892, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_sexuality|5": { + "hashes": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "04269e5c5a257dd9", + "hash_cont_tokens": "bc54813e809b796d" + }, + "truncated": 0, + "non-truncated": 524, + "padded": 524, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-international_law|5": { + "hashes": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "d93ba9d9d38e4397", + "hash_cont_tokens": "dc45b45fcda18e5d" + }, + "truncated": 0, + "non-truncated": 484, + "padded": 484, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-jurisprudence|5": { + "hashes": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "9eeaccd2698b4f5a", + "hash_cont_tokens": "e3a8cd951b6e3469" + }, + "truncated": 0, + "non-truncated": 432, + "padded": 432, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hashes": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "b4f08f544f2b7576", + "hash_cont_tokens": "1e80dbd30f6453d5" + }, + "truncated": 0, + "non-truncated": 652, + "padded": 648, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-machine_learning|5": { + "hashes": { + "hash_examples": "88f22a636029ae47", + 
"hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "900c2a51f1174b9f", + "hash_cont_tokens": "9b37da7777378ca9" + }, + "truncated": 0, + "non-truncated": 448, + "padded": 448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-management|5": { + "hashes": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "6b36efb4689c6eca", + "hash_cont_tokens": "a01d6d39a83c4597" + }, + "truncated": 0, + "non-truncated": 412, + "padded": 412, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-marketing|5": { + "hashes": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "2aaac78a0cfed47a", + "hash_cont_tokens": "6aeaed4d823c98aa" + }, + "truncated": 0, + "non-truncated": 936, + "padded": 936, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-medical_genetics|5": { + "hashes": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "886ca823b41c094a", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-miscellaneous|5": { + "hashes": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "72fd71de7675e7d0", + "hash_cont_tokens": "9b0ab02a64603081" + }, + "truncated": 0, + "non-truncated": 3132, + "padded": 3132, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_disputes|5": { + "hashes": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "f3ca0dd8e7a1eb09", + "hash_cont_tokens": "8badf768f7b0467a" + }, + "truncated": 0, + "non-truncated": 1384, + "padded": 1354, + "non-padded": 30, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hashes": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "3e793631e951f23c", + "hash_cont_tokens": "32ae620376b2bbba" + }, + "truncated": 0, + "non-truncated": 3580, + "padded": 3580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-nutrition|5": { + "hashes": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "59753c2144ea93af", + "hash_cont_tokens": "3071def75bacc404" + }, + "truncated": 0, + "non-truncated": 1224, + "padded": 1224, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-philosophy|5": { + "hashes": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "bd8d3dbed15a8c34", + "hash_cont_tokens": "9f6ff69d23a48783" + }, + "truncated": 0, + "non-truncated": 1244, + "padded": 1244, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-prehistory|5": { + "hashes": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "3573cd87facbb7c5", + "hash_cont_tokens": "de469d2b981e32a3" + }, + "truncated": 0, + "non-truncated": 1296, + "padded": 
1296, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_accounting|5": { + "hashes": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "17e721bc1a7cbb47", + "hash_cont_tokens": "c46f74d2dfc7b13b" + }, + "truncated": 0, + "non-truncated": 1128, + "padded": 1128, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_law|5": { + "hashes": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "c9f7583fff66d361", + "hash_cont_tokens": "2e590029ef41fbcd" + }, + "truncated": 0, + "non-truncated": 6136, + "padded": 6136, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_medicine|5": { + "hashes": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "40a933f829116f8d", + "hash_cont_tokens": "fe35cfa9c6ca802e" + }, + "truncated": 0, + "non-truncated": 1088, + "padded": 1088, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_psychology|5": { + "hashes": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "0dfb73a8eb3f692c", + "hash_cont_tokens": "f020fbddf72c8652" + }, + "truncated": 0, + "non-truncated": 2448, + "padded": 2448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-public_relations|5": { + "hashes": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "1710c6ba4c9f3cbd", + "hash_cont_tokens": "568f585a259965c1" + }, + "truncated": 0, + "non-truncated": 440, + "padded": 440, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-security_studies|5": { + "hashes": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "32a03f1f22a6e103", + "hash_cont_tokens": "cc6fd7cccd64cd5d" + }, + "truncated": 0, + "non-truncated": 980, + "padded": 980, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-sociology|5": { + "hashes": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "828999f7624cbe7e", + "hash_cont_tokens": "c3a3bdfd177eed5b" + }, + "truncated": 0, + "non-truncated": 804, + "padded": 804, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hashes": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "42054621e718dbee", + "hash_cont_tokens": "2568d0e8e36fa959" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-virology|5": { + "hashes": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "6c4f0aa4dc859c04", + "hash_cont_tokens": "926cf60b0891f374" + }, + "truncated": 0, + "non-truncated": 664, + "padded": 664, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-world_religions|5": { + 
"hashes": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "6c75d44e092ff24f", + "hash_cont_tokens": "c525a5de974c1ea3" + }, + "truncated": 0, + "non-truncated": 684, + "padded": 684, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|truthfulqa:mc|0": { + "hashes": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "2738d7ed7075faa7", + "hash_cont_tokens": "c014154380b74b9e" + }, + "truncated": 0, + "non-truncated": 9996, + "padded": 9996, + "non-padded": 0, + "effective_few_shots": 0.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "d84d18e9a963753d", + "hash_full_prompts": "12b540783521a8e6", + "hash_input_tokens": "5c73a7dce6ccf737", + "hash_cont_tokens": "fb1646e2bdd5fc38" + }, + "total_evaluation_time_secondes": "13967.2234582901", + "truncated": 0, + "non-truncated": 111019, + "padded": 110926, + "non-padded": 93, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/TheBloke/Platypus2-70B-Instruct-GPTQ/results_2023-09-01T08-39-03.285201.json b/eval-results/TheBloke/Platypus2-70B-Instruct-GPTQ/results_2023-09-01T08-39-03.285201.json new file mode 100644 index 0000000000000000000000000000000000000000..7dcfb45119e9334f705c0036dbda619c0a10532f --- /dev/null +++ b/eval-results/TheBloke/Platypus2-70B-Instruct-GPTQ/results_2023-09-01T08-39-03.285201.json @@ -0,0 +1,1366 @@ +{ + "config_general": { + "model_name": "TheBloke/Platypus2-70B-Instruct-GPTQ", + "model_sha": "4a44568aadd8a4babfa5549cf33e6e84cbae7ab8", + "model_dtype": "None", + "lighteval_sha": "4b9a38c2102259daeb17d1d0294fc2b4ce7f4e63", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|arc:challenge|25": { + "acc": 0.6919795221843004, + "acc_stderr": 0.013491429517292038, + "acc_norm": 0.712457337883959, + "acc_norm_stderr": 0.013226719056266129 + }, + "harness|hellaswag|10": { + "acc": 0.6863174666401115, + "acc_stderr": 0.004630407476835178, + "acc_norm": 0.8755228042222665, + "acc_norm_stderr": 0.003294504807555233 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.35, + "acc_stderr": 0.0479372485441102, + "acc_norm": 0.35, + "acc_norm_stderr": 0.0479372485441102 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.5925925925925926, + "acc_stderr": 0.042446332383532286, + "acc_norm": 0.5925925925925926, + "acc_norm_stderr": 0.042446332383532286 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.7894736842105263, + "acc_stderr": 0.03317672787533157, + "acc_norm": 0.7894736842105263, + "acc_norm_stderr": 0.03317672787533157 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.71, + "acc_stderr": 0.04560480215720683, + "acc_norm": 0.71, + "acc_norm_stderr": 0.04560480215720683 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.7471698113207547, + "acc_stderr": 0.026749899771241214, + "acc_norm": 0.7471698113207547, + "acc_norm_stderr": 0.026749899771241214 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.8333333333333334, + "acc_stderr": 0.031164899666948614, + "acc_norm": 0.8333333333333334, + "acc_norm_stderr": 0.031164899666948614 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.47, + "acc_stderr": 0.05016135580465919, + "acc_norm": 0.47, + "acc_norm_stderr": 0.05016135580465919 + }, + 
"harness|hendrycksTest-college_computer_science|5": { + "acc": 0.55, + "acc_stderr": 0.049999999999999996, + "acc_norm": 0.55, + "acc_norm_stderr": 0.049999999999999996 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.44, + "acc_stderr": 0.04988876515698589, + "acc_norm": 0.44, + "acc_norm_stderr": 0.04988876515698589 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.6705202312138728, + "acc_stderr": 0.03583901754736411, + "acc_norm": 0.6705202312138728, + "acc_norm_stderr": 0.03583901754736411 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.35294117647058826, + "acc_stderr": 0.04755129616062947, + "acc_norm": 0.35294117647058826, + "acc_norm_stderr": 0.04755129616062947 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.76, + "acc_stderr": 0.04292346959909281, + "acc_norm": 0.76, + "acc_norm_stderr": 0.04292346959909281 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.6680851063829787, + "acc_stderr": 0.03078373675774565, + "acc_norm": 0.6680851063829787, + "acc_norm_stderr": 0.03078373675774565 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.43859649122807015, + "acc_stderr": 0.04668000738510455, + "acc_norm": 0.43859649122807015, + "acc_norm_stderr": 0.04668000738510455 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.6137931034482759, + "acc_stderr": 0.04057324734419036, + "acc_norm": 0.6137931034482759, + "acc_norm_stderr": 0.04057324734419036 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.4656084656084656, + "acc_stderr": 0.02569032176249384, + "acc_norm": 0.4656084656084656, + "acc_norm_stderr": 0.02569032176249384 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.5396825396825397, + "acc_stderr": 0.04458029125470973, + "acc_norm": 0.5396825396825397, + "acc_norm_stderr": 0.04458029125470973 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.48, + "acc_stderr": 0.050211673156867795, + "acc_norm": 0.48, + "acc_norm_stderr": 0.050211673156867795 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.8064516129032258, + "acc_stderr": 0.022475258525536057, + "acc_norm": 0.8064516129032258, + "acc_norm_stderr": 0.022475258525536057 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.5467980295566502, + "acc_stderr": 0.03502544650845872, + "acc_norm": 0.5467980295566502, + "acc_norm_stderr": 0.03502544650845872 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.74, + "acc_stderr": 0.04408440022768079, + "acc_norm": 0.74, + "acc_norm_stderr": 0.04408440022768079 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.8787878787878788, + "acc_stderr": 0.025485498373343237, + "acc_norm": 0.8787878787878788, + "acc_norm_stderr": 0.025485498373343237 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.8585858585858586, + "acc_stderr": 0.02482590979334334, + "acc_norm": 0.8585858585858586, + "acc_norm_stderr": 0.02482590979334334 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.9481865284974094, + "acc_stderr": 0.01599622932024412, + "acc_norm": 0.9481865284974094, + "acc_norm_stderr": 0.01599622932024412 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.7025641025641025, + "acc_stderr": 0.023177408131465942, + "acc_norm": 0.7025641025641025, + "acc_norm_stderr": 0.023177408131465942 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.3037037037037037, + "acc_stderr": 
0.028037929969114982, + "acc_norm": 0.3037037037037037, + "acc_norm_stderr": 0.028037929969114982 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.7815126050420168, + "acc_stderr": 0.02684151432295894, + "acc_norm": 0.7815126050420168, + "acc_norm_stderr": 0.02684151432295894 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.47019867549668876, + "acc_stderr": 0.040752249922169775, + "acc_norm": 0.47019867549668876, + "acc_norm_stderr": 0.040752249922169775 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.908256880733945, + "acc_stderr": 0.012376323409137116, + "acc_norm": 0.908256880733945, + "acc_norm_stderr": 0.012376323409137116 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.5972222222222222, + "acc_stderr": 0.03344887382997866, + "acc_norm": 0.5972222222222222, + "acc_norm_stderr": 0.03344887382997866 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.9068627450980392, + "acc_stderr": 0.020397853969427, + "acc_norm": 0.9068627450980392, + "acc_norm_stderr": 0.020397853969427 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.8987341772151899, + "acc_stderr": 0.019637720526065494, + "acc_norm": 0.8987341772151899, + "acc_norm_stderr": 0.019637720526065494 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.7982062780269058, + "acc_stderr": 0.026936111912802277, + "acc_norm": 0.7982062780269058, + "acc_norm_stderr": 0.026936111912802277 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.8091603053435115, + "acc_stderr": 0.03446513350752596, + "acc_norm": 0.8091603053435115, + "acc_norm_stderr": 0.03446513350752596 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.8760330578512396, + "acc_stderr": 0.030083098716035216, + "acc_norm": 0.8760330578512396, + "acc_norm_stderr": 0.030083098716035216 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.8055555555555556, + "acc_stderr": 0.038260763248848646, + "acc_norm": 0.8055555555555556, + "acc_norm_stderr": 0.038260763248848646 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.8282208588957055, + "acc_stderr": 0.02963471727237103, + "acc_norm": 0.8282208588957055, + "acc_norm_stderr": 0.02963471727237103 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.5714285714285714, + "acc_stderr": 0.04697113923010213, + "acc_norm": 0.5714285714285714, + "acc_norm_stderr": 0.04697113923010213 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.8446601941747572, + "acc_stderr": 0.03586594738573974, + "acc_norm": 0.8446601941747572, + "acc_norm_stderr": 0.03586594738573974 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.9017094017094017, + "acc_stderr": 0.019503444900757567, + "acc_norm": 0.9017094017094017, + "acc_norm_stderr": 0.019503444900757567 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.71, + "acc_stderr": 0.045604802157206845, + "acc_norm": 0.71, + "acc_norm_stderr": 0.045604802157206845 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.8659003831417624, + "acc_stderr": 0.012185528166499978, + "acc_norm": 0.8659003831417624, + "acc_norm_stderr": 0.012185528166499978 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.7687861271676301, + "acc_stderr": 0.02269865716785571, + "acc_norm": 0.7687861271676301, + "acc_norm_stderr": 0.02269865716785571 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.646927374301676, + "acc_stderr": 0.01598420454526858, + "acc_norm": 0.646927374301676, + 
"acc_norm_stderr": 0.01598420454526858 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.7647058823529411, + "acc_stderr": 0.024288619466046105, + "acc_norm": 0.7647058823529411, + "acc_norm_stderr": 0.024288619466046105 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.77491961414791, + "acc_stderr": 0.023720088516179027, + "acc_norm": 0.77491961414791, + "acc_norm_stderr": 0.023720088516179027 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.8271604938271605, + "acc_stderr": 0.02103851777015737, + "acc_norm": 0.8271604938271605, + "acc_norm_stderr": 0.02103851777015737 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.5673758865248227, + "acc_stderr": 0.029555454236778852, + "acc_norm": 0.5673758865248227, + "acc_norm_stderr": 0.029555454236778852 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.5860495436766623, + "acc_stderr": 0.012579699631289262, + "acc_norm": 0.5860495436766623, + "acc_norm_stderr": 0.012579699631289262 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.7132352941176471, + "acc_stderr": 0.027472274473233818, + "acc_norm": 0.7132352941176471, + "acc_norm_stderr": 0.027472274473233818 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.7565359477124183, + "acc_stderr": 0.01736247376214661, + "acc_norm": 0.7565359477124183, + "acc_norm_stderr": 0.01736247376214661 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.7181818181818181, + "acc_stderr": 0.04309118709946458, + "acc_norm": 0.7181818181818181, + "acc_norm_stderr": 0.04309118709946458 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.7795918367346939, + "acc_stderr": 0.02653704531214529, + "acc_norm": 0.7795918367346939, + "acc_norm_stderr": 0.02653704531214529 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.8706467661691543, + "acc_stderr": 0.023729830881018526, + "acc_norm": 0.8706467661691543, + "acc_norm_stderr": 0.023729830881018526 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.84, + "acc_stderr": 0.03684529491774708, + "acc_norm": 0.84, + "acc_norm_stderr": 0.03684529491774708 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.5481927710843374, + "acc_stderr": 0.03874371556587953, + "acc_norm": 0.5481927710843374, + "acc_norm_stderr": 0.03874371556587953 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.8421052631578947, + "acc_stderr": 0.027966785859160875, + "acc_norm": 0.8421052631578947, + "acc_norm_stderr": 0.027966785859160875 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.4455324357405141, + "mc1_stderr": 0.017399335280140354, + "mc2": 0.6253657801165746, + "mc2_stderr": 0.01474854589221215 + }, + "all": { + "acc": 0.6985296232204664, + "acc_stderr": 0.03125037426870383, + "acc_norm": 0.7020835749710057, + "acc_norm_stderr": 0.031223245232596956, + "mc1": 0.4455324357405141, + "mc1_stderr": 0.017399335280140354, + "mc2": 0.6253657801165746, + "mc2_stderr": 0.01474854589221215 + } + }, + "versions": { + "harness|arc:challenge|25": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + 
"harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "all": 0 + }, + "config_tasks": { + "harness|arc:challenge": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + 
"harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task" + }, + "summary_tasks": { + "harness|arc:challenge|25": { + "hashes": { + "hash_examples": "17b0cae357c0259e", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "3722289b79076c44", + "hash_cont_tokens": "8210decc6ff6f7df" + }, + "truncated": 0, + "non-truncated": 4687, + "padded": 4687, + "non-padded": 0, + "effective_few_shots": 25.0, + "num_truncated_few_shots": 0 + }, + "harness|hellaswag|10": { + "hashes": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "ececd684171f1ef2", + "hash_cont_tokens": "b3b9e9017afa63af" + }, + "truncated": 0, + "non-truncated": 40168, + "padded": 40113, + "non-padded": 55, + "effective_few_shots": 10.0, + "num_truncated_few_shots": 0 + }, + 
"harness|hendrycksTest-abstract_algebra|5": { + "hashes": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "c54ff61ad0273dd7", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-anatomy|5": { + "hashes": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "be31a1e22aef5f90", + "hash_cont_tokens": "f11971a765cb609f" + }, + "truncated": 0, + "non-truncated": 540, + "padded": 540, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-astronomy|5": { + "hashes": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "277a7b1fad566940", + "hash_cont_tokens": "bf30e5d3f48250cb" + }, + "truncated": 0, + "non-truncated": 608, + "padded": 608, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-business_ethics|5": { + "hashes": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "ba552605bc116de5", + "hash_cont_tokens": "bc1dd9b2d995eb61" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hashes": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "428c7563d0b98ab9", + "hash_cont_tokens": "890a119624b3b935" + }, + "truncated": 0, + "non-truncated": 1060, + "padded": 1060, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_biology|5": { + "hashes": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "da036601573942e2", + "hash_cont_tokens": "875cde3af7a0ee14" + }, + "truncated": 0, + "non-truncated": 576, + "padded": 576, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_chemistry|5": { + "hashes": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "94e0196d6aded13d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_computer_science|5": { + "hashes": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "6e4d0f4a8d36690b", + "hash_cont_tokens": "ffc0fe414cdc4a83" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_mathematics|5": { + "hashes": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "614054d17109a25d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_medicine|5": { + "hashes": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + 
"hash_input_tokens": "081bb2b524defd1c", + "hash_cont_tokens": "1f88b00d41957d82" + }, + "truncated": 0, + "non-truncated": 692, + "padded": 692, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_physics|5": { + "hashes": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "5421d9a1af86cbd4", + "hash_cont_tokens": "f7b8097afc16a47c" + }, + "truncated": 0, + "non-truncated": 408, + "padded": 408, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-computer_security|5": { + "hashes": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "5e6b70ecb333cf18", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hashes": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "c2ef11a87264ceed", + "hash_cont_tokens": "aa0e8bc655f2f641" + }, + "truncated": 0, + "non-truncated": 940, + "padded": 940, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-econometrics|5": { + "hashes": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "ecaccd912a4c3978", + "hash_cont_tokens": "bfb7e3c3c88313f1" + }, + "truncated": 0, + "non-truncated": 456, + "padded": 456, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hashes": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "1590c84291399be8", + "hash_cont_tokens": "2425a3f084a591ef" + }, + "truncated": 0, + "non-truncated": 580, + "padded": 580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hashes": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "3269597f715b0da1", + "hash_cont_tokens": "f52691aef15a407b" + }, + "truncated": 0, + "non-truncated": 1512, + "padded": 1512, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-formal_logic|5": { + "hashes": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "a2800d20f3ab8d7c", + "hash_cont_tokens": "f515d598d9c21263" + }, + "truncated": 0, + "non-truncated": 504, + "padded": 504, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-global_facts|5": { + "hashes": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "94ed44b3772505ad", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_biology|5": { + "hashes": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "24423acb928db768", + "hash_cont_tokens": "bd85a4156a3613ee" + }, + "truncated": 0, + "non-truncated": 1240, + "padded": 1240, + 
"non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hashes": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "831ff35c474e5cef", + "hash_cont_tokens": "a95c97af1c14e068" + }, + "truncated": 0, + "non-truncated": 812, + "padded": 812, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hashes": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "a20a96b44dcc5b30", + "hash_cont_tokens": "8abfedef914e33c9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hashes": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "5002f4ac8b1562ca", + "hash_cont_tokens": "674fc454bdc5ac93" + }, + "truncated": 0, + "non-truncated": 660, + "padded": 656, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_geography|5": { + "hashes": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "7c5547c7da5bc793", + "hash_cont_tokens": "03a5012b916274ea" + }, + "truncated": 0, + "non-truncated": 792, + "padded": 792, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hashes": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "f62991cb6a496b05", + "hash_cont_tokens": "a83effb8f76b7d7c" + }, + "truncated": 0, + "non-truncated": 772, + "padded": 772, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hashes": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "4cef2aff6e3d59ed", + "hash_cont_tokens": "c583432ad27fcfe0" + }, + "truncated": 0, + "non-truncated": 1560, + "padded": 1560, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hashes": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "6e2577ea4082ed2b", + "hash_cont_tokens": "24f5dc613660300b" + }, + "truncated": 0, + "non-truncated": 1080, + "padded": 1080, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hashes": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "c5fc9aeb1079c8e4", + "hash_cont_tokens": "f47f041de50333b9" + }, + "truncated": 0, + "non-truncated": 952, + "padded": 952, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_physics|5": { + "hashes": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "555fc385cffa84ca", + "hash_cont_tokens": "ba2efcd283e938cc" + }, + "truncated": 0, + "non-truncated": 604, + "padded": 604, + "non-padded": 0, + "effective_few_shots": 5.0, + 
"num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hashes": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "febd23cbf9973b7f", + "hash_cont_tokens": "942069cd363844d9" + }, + "truncated": 0, + "non-truncated": 2180, + "padded": 2180, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hashes": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "400e55b56ee6fbd7", + "hash_cont_tokens": "955ed42b6f7fa019" + }, + "truncated": 0, + "non-truncated": 864, + "padded": 864, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hashes": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "c639cce12a46ebad", + "hash_cont_tokens": "cdd0b3dc06d933e5" + }, + "truncated": 0, + "non-truncated": 816, + "padded": 816, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hashes": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "b9762065cce6f3a6", + "hash_cont_tokens": "9a864184946033ac" + }, + "truncated": 0, + "non-truncated": 948, + "padded": 948, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_aging|5": { + "hashes": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "541a75f071dcf579", + "hash_cont_tokens": "142a4a8a1138a214" + }, + "truncated": 0, + "non-truncated": 892, + "padded": 892, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_sexuality|5": { + "hashes": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "04269e5c5a257dd9", + "hash_cont_tokens": "bc54813e809b796d" + }, + "truncated": 0, + "non-truncated": 524, + "padded": 524, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-international_law|5": { + "hashes": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "d93ba9d9d38e4397", + "hash_cont_tokens": "dc45b45fcda18e5d" + }, + "truncated": 0, + "non-truncated": 484, + "padded": 484, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-jurisprudence|5": { + "hashes": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "9eeaccd2698b4f5a", + "hash_cont_tokens": "e3a8cd951b6e3469" + }, + "truncated": 0, + "non-truncated": 432, + "padded": 432, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hashes": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "b4f08f544f2b7576", + "hash_cont_tokens": "1e80dbd30f6453d5" + }, + "truncated": 0, + "non-truncated": 652, + "padded": 648, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-machine_learning|5": { + "hashes": { + "hash_examples": "88f22a636029ae47", + 
"hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "900c2a51f1174b9f", + "hash_cont_tokens": "9b37da7777378ca9" + }, + "truncated": 0, + "non-truncated": 448, + "padded": 448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-management|5": { + "hashes": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "6b36efb4689c6eca", + "hash_cont_tokens": "a01d6d39a83c4597" + }, + "truncated": 0, + "non-truncated": 412, + "padded": 412, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-marketing|5": { + "hashes": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "2aaac78a0cfed47a", + "hash_cont_tokens": "6aeaed4d823c98aa" + }, + "truncated": 0, + "non-truncated": 936, + "padded": 936, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-medical_genetics|5": { + "hashes": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "886ca823b41c094a", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-miscellaneous|5": { + "hashes": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "72fd71de7675e7d0", + "hash_cont_tokens": "9b0ab02a64603081" + }, + "truncated": 0, + "non-truncated": 3132, + "padded": 3132, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_disputes|5": { + "hashes": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "f3ca0dd8e7a1eb09", + "hash_cont_tokens": "8badf768f7b0467a" + }, + "truncated": 0, + "non-truncated": 1384, + "padded": 1354, + "non-padded": 30, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hashes": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "3e793631e951f23c", + "hash_cont_tokens": "32ae620376b2bbba" + }, + "truncated": 0, + "non-truncated": 3580, + "padded": 3580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-nutrition|5": { + "hashes": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "59753c2144ea93af", + "hash_cont_tokens": "3071def75bacc404" + }, + "truncated": 0, + "non-truncated": 1224, + "padded": 1224, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-philosophy|5": { + "hashes": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "bd8d3dbed15a8c34", + "hash_cont_tokens": "9f6ff69d23a48783" + }, + "truncated": 0, + "non-truncated": 1244, + "padded": 1244, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-prehistory|5": { + "hashes": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "3573cd87facbb7c5", + "hash_cont_tokens": "de469d2b981e32a3" + }, + "truncated": 0, + "non-truncated": 1296, + "padded": 
1296, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_accounting|5": { + "hashes": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "17e721bc1a7cbb47", + "hash_cont_tokens": "c46f74d2dfc7b13b" + }, + "truncated": 0, + "non-truncated": 1128, + "padded": 1128, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_law|5": { + "hashes": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "c9f7583fff66d361", + "hash_cont_tokens": "2e590029ef41fbcd" + }, + "truncated": 0, + "non-truncated": 6136, + "padded": 6136, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_medicine|5": { + "hashes": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "40a933f829116f8d", + "hash_cont_tokens": "fe35cfa9c6ca802e" + }, + "truncated": 0, + "non-truncated": 1088, + "padded": 1088, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_psychology|5": { + "hashes": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "0dfb73a8eb3f692c", + "hash_cont_tokens": "f020fbddf72c8652" + }, + "truncated": 0, + "non-truncated": 2448, + "padded": 2448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-public_relations|5": { + "hashes": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "1710c6ba4c9f3cbd", + "hash_cont_tokens": "568f585a259965c1" + }, + "truncated": 0, + "non-truncated": 440, + "padded": 440, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-security_studies|5": { + "hashes": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "32a03f1f22a6e103", + "hash_cont_tokens": "cc6fd7cccd64cd5d" + }, + "truncated": 0, + "non-truncated": 980, + "padded": 980, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-sociology|5": { + "hashes": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "828999f7624cbe7e", + "hash_cont_tokens": "c3a3bdfd177eed5b" + }, + "truncated": 0, + "non-truncated": 804, + "padded": 804, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hashes": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "42054621e718dbee", + "hash_cont_tokens": "2568d0e8e36fa959" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-virology|5": { + "hashes": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "6c4f0aa4dc859c04", + "hash_cont_tokens": "926cf60b0891f374" + }, + "truncated": 0, + "non-truncated": 664, + "padded": 664, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-world_religions|5": { + 
"hashes": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "6c75d44e092ff24f", + "hash_cont_tokens": "c525a5de974c1ea3" + }, + "truncated": 0, + "non-truncated": 684, + "padded": 684, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|truthfulqa:mc|0": { + "hashes": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "2738d7ed7075faa7", + "hash_cont_tokens": "c014154380b74b9e" + }, + "truncated": 0, + "non-truncated": 9996, + "padded": 9996, + "non-padded": 0, + "effective_few_shots": 0.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "d84d18e9a963753d", + "hash_full_prompts": "12b540783521a8e6", + "hash_input_tokens": "5c73a7dce6ccf737", + "hash_cont_tokens": "fb1646e2bdd5fc38" + }, + "total_evaluation_time_secondes": "49788.65458726883", + "truncated": 0, + "non-truncated": 111019, + "padded": 110926, + "non-padded": 93, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/TheBloke/Poro-34B-GPTQ/results_2023-12-08T00-50-01.441893.json b/eval-results/TheBloke/Poro-34B-GPTQ/results_2023-12-08T00-50-01.441893.json new file mode 100644 index 0000000000000000000000000000000000000000..61f52f66996049e5579c2fc8cb0736bb8436b909 --- /dev/null +++ b/eval-results/TheBloke/Poro-34B-GPTQ/results_2023-12-08T00-50-01.441893.json @@ -0,0 +1,1409 @@ +{ + "config_general": { + "lighteval_sha": "0e4607eff593f6f842aeaa0e5fa6760f58b9d1e9", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "", + "start_time": 428424.883976715, + "end_time": 451470.211013169, + "total_evaluation_time_secondes": "23045.32703645405", + "model_name": "TheBloke/Poro-34B-GPTQ", + "model_sha": "f6e034384e36b411d6b831157fb6063060ec1169", + "model_dtype": "None", + "model_size": "17.24 GB" + }, + "results": { + "harness|arc:challenge|25": { + "acc": 0.4206484641638225, + "acc_stderr": 0.014426211252508406, + "acc_norm": 0.47013651877133106, + "acc_norm_stderr": 0.014585305840007102 + }, + "harness|hellaswag|10": { + "acc": 0.5477992431786497, + "acc_stderr": 0.004966928094797573, + "acc_norm": 0.7375024895439155, + "acc_norm_stderr": 0.004390923353200555 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.25, + "acc_stderr": 0.04351941398892446, + "acc_norm": 0.25, + "acc_norm_stderr": 0.04351941398892446 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.3333333333333333, + "acc_stderr": 0.04072314811876837, + "acc_norm": 0.3333333333333333, + "acc_norm_stderr": 0.04072314811876837 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.3223684210526316, + "acc_stderr": 0.03803510248351585, + "acc_norm": 0.3223684210526316, + "acc_norm_stderr": 0.03803510248351585 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.26, + "acc_stderr": 0.04408440022768078, + "acc_norm": 0.26, + "acc_norm_stderr": 0.04408440022768078 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.3471698113207547, + "acc_stderr": 0.029300101705549655, + "acc_norm": 0.3471698113207547, + "acc_norm_stderr": 0.029300101705549655 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.2916666666666667, + "acc_stderr": 0.038009680605548574, + "acc_norm": 0.2916666666666667, + "acc_norm_stderr": 0.038009680605548574 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.24, + "acc_stderr": 
0.04292346959909284, + "acc_norm": 0.24, + "acc_norm_stderr": 0.04292346959909284 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.39, + "acc_stderr": 0.04902071300001975, + "acc_norm": 0.39, + "acc_norm_stderr": 0.04902071300001975 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.3, + "acc_stderr": 0.046056618647183814, + "acc_norm": 0.3, + "acc_norm_stderr": 0.046056618647183814 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.3179190751445087, + "acc_stderr": 0.0355068398916558, + "acc_norm": 0.3179190751445087, + "acc_norm_stderr": 0.0355068398916558 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.17647058823529413, + "acc_stderr": 0.03793281185307809, + "acc_norm": 0.17647058823529413, + "acc_norm_stderr": 0.03793281185307809 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.39, + "acc_stderr": 0.04902071300001974, + "acc_norm": 0.39, + "acc_norm_stderr": 0.04902071300001974 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.3021276595744681, + "acc_stderr": 0.030017554471880557, + "acc_norm": 0.3021276595744681, + "acc_norm_stderr": 0.030017554471880557 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.34210526315789475, + "acc_stderr": 0.04462917535336937, + "acc_norm": 0.34210526315789475, + "acc_norm_stderr": 0.04462917535336937 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.35172413793103446, + "acc_stderr": 0.0397923663749741, + "acc_norm": 0.35172413793103446, + "acc_norm_stderr": 0.0397923663749741 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.26455026455026454, + "acc_stderr": 0.022717467897708617, + "acc_norm": 0.26455026455026454, + "acc_norm_stderr": 0.022717467897708617 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.19047619047619047, + "acc_stderr": 0.035122074123020534, + "acc_norm": 0.19047619047619047, + "acc_norm_stderr": 0.035122074123020534 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.34, + "acc_stderr": 0.04760952285695236, + "acc_norm": 0.34, + "acc_norm_stderr": 0.04760952285695236 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.27741935483870966, + "acc_stderr": 0.025470196835900055, + "acc_norm": 0.27741935483870966, + "acc_norm_stderr": 0.025470196835900055 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.27586206896551724, + "acc_stderr": 0.031447125816782426, + "acc_norm": 0.27586206896551724, + "acc_norm_stderr": 0.031447125816782426 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.29, + "acc_stderr": 0.045604802157206824, + "acc_norm": 0.29, + "acc_norm_stderr": 0.045604802157206824 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.37575757575757573, + "acc_stderr": 0.03781887353205982, + "acc_norm": 0.37575757575757573, + "acc_norm_stderr": 0.03781887353205982 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.2777777777777778, + "acc_stderr": 0.03191178226713548, + "acc_norm": 0.2777777777777778, + "acc_norm_stderr": 0.03191178226713548 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.36787564766839376, + "acc_stderr": 0.034801756684660366, + "acc_norm": 0.36787564766839376, + "acc_norm_stderr": 0.034801756684660366 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.27692307692307694, + "acc_stderr": 0.022688042352424994, + "acc_norm": 0.27692307692307694, + "acc_norm_stderr": 0.022688042352424994 + }, + 
"harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.22592592592592592, + "acc_stderr": 0.02549753263960955, + "acc_norm": 0.22592592592592592, + "acc_norm_stderr": 0.02549753263960955 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.29831932773109243, + "acc_stderr": 0.029719142876342853, + "acc_norm": 0.29831932773109243, + "acc_norm_stderr": 0.029719142876342853 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.2582781456953642, + "acc_stderr": 0.035737053147634576, + "acc_norm": 0.2582781456953642, + "acc_norm_stderr": 0.035737053147634576 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.3247706422018349, + "acc_stderr": 0.020077729109310327, + "acc_norm": 0.3247706422018349, + "acc_norm_stderr": 0.020077729109310327 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.25, + "acc_stderr": 0.029531221160930918, + "acc_norm": 0.25, + "acc_norm_stderr": 0.029531221160930918 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.3480392156862745, + "acc_stderr": 0.03343311240488419, + "acc_norm": 0.3480392156862745, + "acc_norm_stderr": 0.03343311240488419 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.38396624472573837, + "acc_stderr": 0.031658678064106674, + "acc_norm": 0.38396624472573837, + "acc_norm_stderr": 0.031658678064106674 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.4304932735426009, + "acc_stderr": 0.033231973029429394, + "acc_norm": 0.4304932735426009, + "acc_norm_stderr": 0.033231973029429394 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.29770992366412213, + "acc_stderr": 0.04010358942462202, + "acc_norm": 0.29770992366412213, + "acc_norm_stderr": 0.04010358942462202 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.4132231404958678, + "acc_stderr": 0.04495087843548408, + "acc_norm": 0.4132231404958678, + "acc_norm_stderr": 0.04495087843548408 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.4166666666666667, + "acc_stderr": 0.04766075165356461, + "acc_norm": 0.4166666666666667, + "acc_norm_stderr": 0.04766075165356461 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.31901840490797545, + "acc_stderr": 0.03661997551073836, + "acc_norm": 0.31901840490797545, + "acc_norm_stderr": 0.03661997551073836 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.3125, + "acc_stderr": 0.043994650575715215, + "acc_norm": 0.3125, + "acc_norm_stderr": 0.043994650575715215 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.33980582524271846, + "acc_stderr": 0.04689765937278132, + "acc_norm": 0.33980582524271846, + "acc_norm_stderr": 0.04689765937278132 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.42735042735042733, + "acc_stderr": 0.032408473935163266, + "acc_norm": 0.42735042735042733, + "acc_norm_stderr": 0.032408473935163266 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.35, + "acc_stderr": 0.047937248544110196, + "acc_norm": 0.35, + "acc_norm_stderr": 0.047937248544110196 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.4099616858237548, + "acc_stderr": 0.01758767231233604, + "acc_norm": 0.4099616858237548, + "acc_norm_stderr": 0.01758767231233604 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.32947976878612717, + "acc_stderr": 0.025305258131879706, + "acc_norm": 0.32947976878612717, + "acc_norm_stderr": 0.025305258131879706 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.23910614525139665, + 
"acc_stderr": 0.014265554192331144, + "acc_norm": 0.23910614525139665, + "acc_norm_stderr": 0.014265554192331144 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.3562091503267974, + "acc_stderr": 0.027420477662629242, + "acc_norm": 0.3562091503267974, + "acc_norm_stderr": 0.027420477662629242 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.3504823151125402, + "acc_stderr": 0.027098652621301744, + "acc_norm": 0.3504823151125402, + "acc_norm_stderr": 0.027098652621301744 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.37962962962962965, + "acc_stderr": 0.027002521034516475, + "acc_norm": 0.37962962962962965, + "acc_norm_stderr": 0.027002521034516475 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.2553191489361702, + "acc_stderr": 0.026011992930902, + "acc_norm": 0.2553191489361702, + "acc_norm_stderr": 0.026011992930902 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.2985658409387223, + "acc_stderr": 0.011688060141794217, + "acc_norm": 0.2985658409387223, + "acc_norm_stderr": 0.011688060141794217 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.39705882352941174, + "acc_stderr": 0.029722152099280058, + "acc_norm": 0.39705882352941174, + "acc_norm_stderr": 0.029722152099280058 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.3137254901960784, + "acc_stderr": 0.01877168389352818, + "acc_norm": 0.3137254901960784, + "acc_norm_stderr": 0.01877168389352818 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.4090909090909091, + "acc_stderr": 0.04709306978661896, + "acc_norm": 0.4090909090909091, + "acc_norm_stderr": 0.04709306978661896 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.2897959183673469, + "acc_stderr": 0.02904308868330435, + "acc_norm": 0.2897959183673469, + "acc_norm_stderr": 0.02904308868330435 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.35323383084577115, + "acc_stderr": 0.03379790611796777, + "acc_norm": 0.35323383084577115, + "acc_norm_stderr": 0.03379790611796777 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.42, + "acc_stderr": 0.049604496374885836, + "acc_norm": 0.42, + "acc_norm_stderr": 0.049604496374885836 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.3313253012048193, + "acc_stderr": 0.036643147772880864, + "acc_norm": 0.3313253012048193, + "acc_norm_stderr": 0.036643147772880864 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.4269005847953216, + "acc_stderr": 0.03793620616529917, + "acc_norm": 0.4269005847953216, + "acc_norm_stderr": 0.03793620616529917 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.23745410036719705, + "mc1_stderr": 0.014896277441041843, + "mc2": 0.383670325628089, + "mc2_stderr": 0.01409469586210006 + }, + "harness|winogrande|5": { + "acc": 0.7134964483030781, + "acc_stderr": 0.012707030139960381 + }, + "harness|gsm8k|5": { + "acc": 0.05079605761940864, + "acc_stderr": 0.006048352096878092 + }, + "all": { + "acc": 0.33180686611984367, + "acc_stderr": 0.033120719430084274, + "acc_norm": 0.334154536073911, + "acc_norm_stderr": 0.03391850157363058, + "mc1": 0.23745410036719705, + "mc1_stderr": 0.014896277441041843, + "mc2": 0.383670325628089, + "mc2_stderr": 0.01409469586210006 + } + }, + "versions": { + "all": 0, + "harness|arc:challenge|25": 0, + "harness|gsm8k|5": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + 
"harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "harness|winogrande|5": 0 + }, + "config_tasks": { + "harness|arc:challenge": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM 
Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|arc:challenge|25": { + "hashes": { + "hash_examples": "17b0cae357c0259e", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "429ef8dfa8579729", + "hash_cont_tokens": "1074d62e72860e45" + }, + "truncated": 0, + "non_truncated": 1172, + "padded": 4683, + "non_padded": 4, + "effective_few_shots": 25.0, + "num_truncated_few_shots": 0 + }, + "harness|hellaswag|10": 
{ + "hashes": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "d301b5238a91122e", + "hash_cont_tokens": "dcda825c3afa2e07" + }, + "truncated": 0, + "non_truncated": 10042, + "padded": 40118, + "non_padded": 50, + "effective_few_shots": 10.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hashes": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "a538140af68036aa", + "hash_cont_tokens": "60fdd8b6ef1180dd" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-anatomy|5": { + "hashes": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "52129687a4984cb1", + "hash_cont_tokens": "5d4bd7669f854f01" + }, + "truncated": 0, + "non_truncated": 135, + "padded": 540, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-astronomy|5": { + "hashes": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "6d081d2a1e41793d", + "hash_cont_tokens": "10170e50c50a8cab" + }, + "truncated": 0, + "non_truncated": 152, + "padded": 608, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-business_ethics|5": { + "hashes": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "f99db57da25246f8", + "hash_cont_tokens": "60fdd8b6ef1180dd" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hashes": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "4e92bc16303e263a", + "hash_cont_tokens": "27e4dfc2d27e82f6" + }, + "truncated": 0, + "non_truncated": 265, + "padded": 1056, + "non_padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_biology|5": { + "hashes": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "4f5cc6c8396e5edf", + "hash_cont_tokens": "772e0fc74f89ab99" + }, + "truncated": 0, + "non_truncated": 144, + "padded": 572, + "non_padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_chemistry|5": { + "hashes": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "a5e7117eaab4c9fa", + "hash_cont_tokens": "60fdd8b6ef1180dd" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_computer_science|5": { + "hashes": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "82c7307fecd27e90", + "hash_cont_tokens": "60fdd8b6ef1180dd" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_mathematics|5": { + "hashes": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "adf3fe551c7a5320", + 
"hash_cont_tokens": "60fdd8b6ef1180dd" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_medicine|5": { + "hashes": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "5fdf85d2bb82326f", + "hash_cont_tokens": "846888faecc49c0d" + }, + "truncated": 0, + "non_truncated": 173, + "padded": 692, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_physics|5": { + "hashes": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "1769b8b91e25fae7", + "hash_cont_tokens": "c96608b9883ca653" + }, + "truncated": 0, + "non_truncated": 102, + "padded": 408, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-computer_security|5": { + "hashes": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "3d09c412e00faf3d", + "hash_cont_tokens": "60fdd8b6ef1180dd" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hashes": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "012d170c7b6d0dd0", + "hash_cont_tokens": "3791a3369b3a81cf" + }, + "truncated": 0, + "non_truncated": 235, + "padded": 940, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-econometrics|5": { + "hashes": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "8aa8ac19e58fe50b", + "hash_cont_tokens": "299cb792ef682239" + }, + "truncated": 0, + "non_truncated": 114, + "padded": 456, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hashes": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "01984d4ad0f9b470", + "hash_cont_tokens": "abc45648564808fa" + }, + "truncated": 0, + "non_truncated": 145, + "padded": 580, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hashes": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "dafee6c1678d17ce", + "hash_cont_tokens": "1e6c70d5a353ada0" + }, + "truncated": 0, + "non_truncated": 378, + "padded": 1512, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-formal_logic|5": { + "hashes": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "e5ef405429727c74", + "hash_cont_tokens": "590065d18c153f1d" + }, + "truncated": 0, + "non_truncated": 126, + "padded": 504, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-global_facts|5": { + "hashes": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "faa766d6e484b719", + "hash_cont_tokens": "60fdd8b6ef1180dd" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + 
"num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_biology|5": { + "hashes": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "b7a49d360400c916", + "hash_cont_tokens": "d291b2bd0480123d" + }, + "truncated": 0, + "non_truncated": 310, + "padded": 1240, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hashes": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "e982ac3af1a6b4d4", + "hash_cont_tokens": "d39cb8fd17e53470" + }, + "truncated": 0, + "non_truncated": 203, + "padded": 812, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hashes": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "3b541844ae324e7f", + "hash_cont_tokens": "60fdd8b6ef1180dd" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hashes": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "b5f3372bab95fc07", + "hash_cont_tokens": "f00c0eccc5c5f1b8" + }, + "truncated": 0, + "non_truncated": 165, + "padded": 656, + "non_padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_geography|5": { + "hashes": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "1ddfe07e4c2f6840", + "hash_cont_tokens": "d92b733807776997" + }, + "truncated": 0, + "non_truncated": 198, + "padded": 792, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hashes": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "0632904086bde507", + "hash_cont_tokens": "99c9d163e41b77d1" + }, + "truncated": 0, + "non_truncated": 193, + "padded": 772, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hashes": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "71b3627e3c5853ab", + "hash_cont_tokens": "97473acf6ae33dc7" + }, + "truncated": 0, + "non_truncated": 390, + "padded": 1560, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hashes": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "a8369e9e4f1bc383", + "hash_cont_tokens": "a3428d1f541e527e" + }, + "truncated": 0, + "non_truncated": 270, + "padded": 1080, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hashes": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "8fac2be4d1778f91", + "hash_cont_tokens": "fb5163f64aed823d" + }, + "truncated": 0, + "non_truncated": 238, + "padded": 952, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + 
"harness|hendrycksTest-high_school_physics|5": { + "hashes": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "3388cfd273cf8b76", + "hash_cont_tokens": "dfa1a9ea6f6a1187" + }, + "truncated": 0, + "non_truncated": 151, + "padded": 604, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hashes": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "80393d27ec192638", + "hash_cont_tokens": "c4f9537eabf1d081" + }, + "truncated": 0, + "non_truncated": 545, + "padded": 2180, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hashes": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "cb85f312618e1936", + "hash_cont_tokens": "0c4cc31dd703799f" + }, + "truncated": 0, + "non_truncated": 216, + "padded": 864, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hashes": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "8c304cf67db38a85", + "hash_cont_tokens": "224f16172b227e5f" + }, + "truncated": 0, + "non_truncated": 204, + "padded": 816, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hashes": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "fb7e322555e09a91", + "hash_cont_tokens": "10b36c78f9e7138c" + }, + "truncated": 0, + "non_truncated": 237, + "padded": 948, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_aging|5": { + "hashes": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "7c58e2bde1c45ffc", + "hash_cont_tokens": "a50a6d9061875d31" + }, + "truncated": 0, + "non_truncated": 223, + "padded": 892, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_sexuality|5": { + "hashes": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "b5cdf8e4de90bb15", + "hash_cont_tokens": "8525a76dee6e1054" + }, + "truncated": 0, + "non_truncated": 131, + "padded": 524, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-international_law|5": { + "hashes": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "b1ced3f4b63d5d49", + "hash_cont_tokens": "7c5a28b9dfbc1215" + }, + "truncated": 0, + "non_truncated": 121, + "padded": 484, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-jurisprudence|5": { + "hashes": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "257dc98540348f6a", + "hash_cont_tokens": "df10e8c7d2367c6f" + }, + "truncated": 0, + "non_truncated": 108, + "padded": 432, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hashes": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": 
"98a04b1f8f841069", + "hash_input_tokens": "f28af1e4adbc6b31", + "hash_cont_tokens": "c61629be41acb422" + }, + "truncated": 0, + "non_truncated": 163, + "padded": 632, + "non_padded": 20, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-machine_learning|5": { + "hashes": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "b4e7faa1197eada1", + "hash_cont_tokens": "73719a815ff99347" + }, + "truncated": 0, + "non_truncated": 112, + "padded": 444, + "non_padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-management|5": { + "hashes": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "00828587df84dac1", + "hash_cont_tokens": "93dace482a111671" + }, + "truncated": 0, + "non_truncated": 103, + "padded": 412, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-marketing|5": { + "hashes": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "f79d5a2dce599507", + "hash_cont_tokens": "6fde6a144821cd61" + }, + "truncated": 0, + "non_truncated": 234, + "padded": 932, + "non_padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-medical_genetics|5": { + "hashes": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "897aca1412cf4e8e", + "hash_cont_tokens": "60fdd8b6ef1180dd" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-miscellaneous|5": { + "hashes": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "c083d6419ede31a7", + "hash_cont_tokens": "b204cec5a26870af" + }, + "truncated": 0, + "non_truncated": 783, + "padded": 3128, + "non_padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_disputes|5": { + "hashes": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "fde8ab858cbecfa1", + "hash_cont_tokens": "d578afaaaff9f1e3" + }, + "truncated": 0, + "non_truncated": 346, + "padded": 1384, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hashes": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "112ebf2111769dc7", + "hash_cont_tokens": "0d423b17773472d7" + }, + "truncated": 0, + "non_truncated": 895, + "padded": 3580, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-nutrition|5": { + "hashes": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "c26caf66c1233d42", + "hash_cont_tokens": "9b05a543256cedd1" + }, + "truncated": 0, + "non_truncated": 306, + "padded": 1224, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-philosophy|5": { + "hashes": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "d86498d23a9b6c3b", + "hash_cont_tokens": "064047f562467493" + }, + "truncated": 0, + "non_truncated": 311, + "padded": 1244, + "non_padded": 0, 
+ "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-prehistory|5": { + "hashes": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "b3c7e4a30abb101e", + "hash_cont_tokens": "85c3aa7c83abe003" + }, + "truncated": 0, + "non_truncated": 324, + "padded": 1296, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_accounting|5": { + "hashes": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "cdc42585571ca29f", + "hash_cont_tokens": "f0b02efed6dc6f48" + }, + "truncated": 0, + "non_truncated": 282, + "padded": 1128, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_law|5": { + "hashes": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "7e91a56fa8aaa2e7", + "hash_cont_tokens": "965ee690c7ecd058" + }, + "truncated": 0, + "non_truncated": 1534, + "padded": 6136, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_medicine|5": { + "hashes": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "976532e17c50fe4f", + "hash_cont_tokens": "059c7812efd52558" + }, + "truncated": 0, + "non_truncated": 272, + "padded": 1088, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_psychology|5": { + "hashes": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "dd2284dfb7abcd16", + "hash_cont_tokens": "6a93ec5ade6f0ecf" + }, + "truncated": 0, + "non_truncated": 612, + "padded": 2448, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-public_relations|5": { + "hashes": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "ba0e72e73d50a5db", + "hash_cont_tokens": "e79e87d4340293a4" + }, + "truncated": 0, + "non_truncated": 110, + "padded": 440, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-security_studies|5": { + "hashes": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "8cd955ba2c266779", + "hash_cont_tokens": "9e20996afce57471" + }, + "truncated": 0, + "non_truncated": 245, + "padded": 980, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-sociology|5": { + "hashes": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "ddb020cc6e247582", + "hash_cont_tokens": "c812018f366fa1b1" + }, + "truncated": 0, + "non_truncated": 201, + "padded": 780, + "non_padded": 24, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hashes": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "25f0cad364aaf16b", + "hash_cont_tokens": "60fdd8b6ef1180dd" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-virology|5": { + "hashes": { + "hash_examples": 
"451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "e32340c3313f9efb", + "hash_cont_tokens": "309ff18b8ac1b509" + }, + "truncated": 0, + "non_truncated": 166, + "padded": 664, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-world_religions|5": { + "hashes": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "3431e1212d547912", + "hash_cont_tokens": "d9f00b9b285ab492" + }, + "truncated": 0, + "non_truncated": 171, + "padded": 684, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|truthfulqa:mc|0": { + "hashes": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "3d390666c2a1780e", + "hash_cont_tokens": "f3fd2545aa802887" + }, + "truncated": 0, + "non_truncated": 817, + "padded": 9996, + "non_padded": 0, + "effective_few_shots": 0.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "db831bf49a646021", + "hash_cont_tokens": "e59a1b33428a1f3f" + }, + "truncated": 0, + "non_truncated": 1267, + "padded": 2534, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "87e526a0f1a63bc0", + "hash_cont_tokens": "84512b1a43588bc2" + }, + "truncated": 0, + "non_truncated": 1319, + "padded": 0, + "non_padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "3b7fa57a057f9415", + "hash_full_prompts": "63615fc50fc9417c", + "hash_input_tokens": "48cc469b6ead41cd", + "hash_cont_tokens": "de3a580c83208ba0" + }, + "truncated": 0, + "non_truncated": 28659, + "padded": 113431, + "non_padded": 1441, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/TheBloke/Project-Baize-v2-13B-GPTQ/results_2023-08-22T13-47-48.408564.json b/eval-results/TheBloke/Project-Baize-v2-13B-GPTQ/results_2023-08-22T13-47-48.408564.json new file mode 100644 index 0000000000000000000000000000000000000000..bc5aaeadf8cb8a93d0908ff7216a8224bc698c45 --- /dev/null +++ b/eval-results/TheBloke/Project-Baize-v2-13B-GPTQ/results_2023-08-22T13-47-48.408564.json @@ -0,0 +1,1365 @@ +{ + "results": { + "harness|arc:challenge|25": { + "acc": 0.24573378839590443, + "acc_stderr": 0.01258103345373011, + "acc_norm": 0.27559726962457337, + "acc_norm_stderr": 0.013057169655761838 + }, + "harness|hellaswag|10": { + "acc": 0.2552280422226648, + "acc_stderr": 0.004350982826580599, + "acc_norm": 0.26419040031866164, + "acc_norm_stderr": 0.004400000822742062 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.28, + "acc_stderr": 0.045126085985421276, + "acc_norm": 0.28, + "acc_norm_stderr": 0.045126085985421276 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.3333333333333333, + "acc_stderr": 0.04072314811876837, + "acc_norm": 0.3333333333333333, + "acc_norm_stderr": 0.04072314811876837 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.24342105263157895, + "acc_stderr": 0.034923496688842384, + "acc_norm": 0.24342105263157895, + "acc_norm_stderr": 0.034923496688842384 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.25, + "acc_stderr": 
0.04351941398892446, + "acc_norm": 0.25, + "acc_norm_stderr": 0.04351941398892446 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.20754716981132076, + "acc_stderr": 0.02495991802891127, + "acc_norm": 0.20754716981132076, + "acc_norm_stderr": 0.02495991802891127 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.2638888888888889, + "acc_stderr": 0.03685651095897532, + "acc_norm": 0.2638888888888889, + "acc_norm_stderr": 0.03685651095897532 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.28, + "acc_stderr": 0.045126085985421276, + "acc_norm": 0.28, + "acc_norm_stderr": 0.045126085985421276 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.32, + "acc_stderr": 0.046882617226215034, + "acc_norm": 0.32, + "acc_norm_stderr": 0.046882617226215034 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.24, + "acc_stderr": 0.04292346959909283, + "acc_norm": 0.24, + "acc_norm_stderr": 0.04292346959909283 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.2254335260115607, + "acc_stderr": 0.03186209851641143, + "acc_norm": 0.2254335260115607, + "acc_norm_stderr": 0.03186209851641143 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.2549019607843137, + "acc_stderr": 0.043364327079931785, + "acc_norm": 0.2549019607843137, + "acc_norm_stderr": 0.043364327079931785 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.3, + "acc_stderr": 0.04605661864718381, + "acc_norm": 0.3, + "acc_norm_stderr": 0.04605661864718381 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.2765957446808511, + "acc_stderr": 0.02924188386962881, + "acc_norm": 0.2765957446808511, + "acc_norm_stderr": 0.02924188386962881 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.2719298245614035, + "acc_stderr": 0.04185774424022056, + "acc_norm": 0.2719298245614035, + "acc_norm_stderr": 0.04185774424022056 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.2689655172413793, + "acc_stderr": 0.03695183311650232, + "acc_norm": 0.2689655172413793, + "acc_norm_stderr": 0.03695183311650232 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.24074074074074073, + "acc_stderr": 0.0220190800122179, + "acc_norm": 0.24074074074074073, + "acc_norm_stderr": 0.0220190800122179 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.23809523809523808, + "acc_stderr": 0.038095238095238106, + "acc_norm": 0.23809523809523808, + "acc_norm_stderr": 0.038095238095238106 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.34, + "acc_stderr": 0.047609522856952365, + "acc_norm": 0.34, + "acc_norm_stderr": 0.047609522856952365 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.23225806451612904, + "acc_stderr": 0.02402225613030824, + "acc_norm": 0.23225806451612904, + "acc_norm_stderr": 0.02402225613030824 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.2857142857142857, + "acc_stderr": 0.03178529710642751, + "acc_norm": 0.2857142857142857, + "acc_norm_stderr": 0.03178529710642751 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.27, + "acc_stderr": 0.04461960433384741, + "acc_norm": 0.27, + "acc_norm_stderr": 0.04461960433384741 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.26666666666666666, + "acc_stderr": 0.03453131801885415, + "acc_norm": 0.26666666666666666, + "acc_norm_stderr": 0.03453131801885415 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 
0.25757575757575757, + "acc_stderr": 0.03115626951964686, + "acc_norm": 0.25757575757575757, + "acc_norm_stderr": 0.03115626951964686 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.20207253886010362, + "acc_stderr": 0.02897908979429673, + "acc_norm": 0.20207253886010362, + "acc_norm_stderr": 0.02897908979429673 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.2128205128205128, + "acc_stderr": 0.02075242372212802, + "acc_norm": 0.2128205128205128, + "acc_norm_stderr": 0.02075242372212802 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.2851851851851852, + "acc_stderr": 0.027528599210340492, + "acc_norm": 0.2851851851851852, + "acc_norm_stderr": 0.027528599210340492 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.21428571428571427, + "acc_stderr": 0.026653531596715484, + "acc_norm": 0.21428571428571427, + "acc_norm_stderr": 0.026653531596715484 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.23841059602649006, + "acc_stderr": 0.0347918557259966, + "acc_norm": 0.23841059602649006, + "acc_norm_stderr": 0.0347918557259966 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.22568807339449543, + "acc_stderr": 0.017923087667803057, + "acc_norm": 0.22568807339449543, + "acc_norm_stderr": 0.017923087667803057 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.2916666666666667, + "acc_stderr": 0.03099866630456053, + "acc_norm": 0.2916666666666667, + "acc_norm_stderr": 0.03099866630456053 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.2647058823529412, + "acc_stderr": 0.030964517926923393, + "acc_norm": 0.2647058823529412, + "acc_norm_stderr": 0.030964517926923393 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.21940928270042195, + "acc_stderr": 0.026939106581553945, + "acc_norm": 0.21940928270042195, + "acc_norm_stderr": 0.026939106581553945 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.2600896860986547, + "acc_stderr": 0.029442495585857473, + "acc_norm": 0.2600896860986547, + "acc_norm_stderr": 0.029442495585857473 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.16030534351145037, + "acc_stderr": 0.0321782942074463, + "acc_norm": 0.16030534351145037, + "acc_norm_stderr": 0.0321782942074463 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.2727272727272727, + "acc_stderr": 0.04065578140908705, + "acc_norm": 0.2727272727272727, + "acc_norm_stderr": 0.04065578140908705 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.2777777777777778, + "acc_stderr": 0.04330043749650743, + "acc_norm": 0.2777777777777778, + "acc_norm_stderr": 0.04330043749650743 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.26380368098159507, + "acc_stderr": 0.034624199316156234, + "acc_norm": 0.26380368098159507, + "acc_norm_stderr": 0.034624199316156234 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.20535714285714285, + "acc_stderr": 0.038342410214190714, + "acc_norm": 0.20535714285714285, + "acc_norm_stderr": 0.038342410214190714 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.18446601941747573, + "acc_stderr": 0.03840423627288276, + "acc_norm": 0.18446601941747573, + "acc_norm_stderr": 0.03840423627288276 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.24786324786324787, + "acc_stderr": 0.0282863240755644, + "acc_norm": 0.24786324786324787, + "acc_norm_stderr": 0.0282863240755644 + }, + 
"harness|hendrycksTest-medical_genetics|5": { + "acc": 0.19, + "acc_stderr": 0.03942772444036623, + "acc_norm": 0.19, + "acc_norm_stderr": 0.03942772444036623 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.2720306513409962, + "acc_stderr": 0.015913367447500524, + "acc_norm": 0.2720306513409962, + "acc_norm_stderr": 0.015913367447500524 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.2630057803468208, + "acc_stderr": 0.023703099525258176, + "acc_norm": 0.2630057803468208, + "acc_norm_stderr": 0.023703099525258176 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.24804469273743016, + "acc_stderr": 0.014444157808261446, + "acc_norm": 0.24804469273743016, + "acc_norm_stderr": 0.014444157808261446 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.2777777777777778, + "acc_stderr": 0.02564686309713791, + "acc_norm": 0.2777777777777778, + "acc_norm_stderr": 0.02564686309713791 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.2861736334405145, + "acc_stderr": 0.025670259242188947, + "acc_norm": 0.2861736334405145, + "acc_norm_stderr": 0.025670259242188947 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.31790123456790126, + "acc_stderr": 0.02591006352824087, + "acc_norm": 0.31790123456790126, + "acc_norm_stderr": 0.02591006352824087 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.28368794326241137, + "acc_stderr": 0.026891709428343957, + "acc_norm": 0.28368794326241137, + "acc_norm_stderr": 0.026891709428343957 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.26140808344198174, + "acc_stderr": 0.01122252816977131, + "acc_norm": 0.26140808344198174, + "acc_norm_stderr": 0.01122252816977131 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.30514705882352944, + "acc_stderr": 0.0279715413701706, + "acc_norm": 0.30514705882352944, + "acc_norm_stderr": 0.0279715413701706 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.2647058823529412, + "acc_stderr": 0.017848089574913226, + "acc_norm": 0.2647058823529412, + "acc_norm_stderr": 0.017848089574913226 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.3090909090909091, + "acc_stderr": 0.044262946482000985, + "acc_norm": 0.3090909090909091, + "acc_norm_stderr": 0.044262946482000985 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.3224489795918367, + "acc_stderr": 0.029923100563683903, + "acc_norm": 0.3224489795918367, + "acc_norm_stderr": 0.029923100563683903 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.2835820895522388, + "acc_stderr": 0.031871875379197966, + "acc_norm": 0.2835820895522388, + "acc_norm_stderr": 0.031871875379197966 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.22, + "acc_stderr": 0.0416333199893227, + "acc_norm": 0.22, + "acc_norm_stderr": 0.0416333199893227 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.2710843373493976, + "acc_stderr": 0.034605799075530255, + "acc_norm": 0.2710843373493976, + "acc_norm_stderr": 0.034605799075530255 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.21637426900584794, + "acc_stderr": 0.03158149539338733, + "acc_norm": 0.21637426900584794, + "acc_norm_stderr": 0.03158149539338733 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.24112607099143207, + "mc1_stderr": 0.014974827279752339, + "mc2": 0.48217112656241606, + "mc2_stderr": 0.01706048623340291 + }, + "all": { + "acc": 0.2587648736963, + "acc_stderr": 0.03187184494961934, + "acc_norm": 0.2594229387695995, + "acc_norm_stderr": 
0.03188074586823278, + "mc1": 0.24112607099143207, + "mc1_stderr": 0.014974827279752339, + "mc2": 0.48217112656241606, + "mc2_stderr": 0.01706048623340291 + } + }, + "versions": { + "harness|arc:challenge|25": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "all": 0 + }, + "config_general": { + "model_name": "TheBloke/Project-Baize-v2-13B-GPTQ", + "model_sha": "8dee7c7129aaad1ded245fce712ff5dbb2845258", + "model_dtype": "torch.float16", + "lighteval_sha": "9a6ba3212080b87510982c30fdec55b87dcab0c7", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null + }, + "config_tasks": { + "harness|arc:challenge": "LM Harness 
task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + 
"harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task" + }, + "summary_tasks": { + "harness|arc:challenge|25": { + "hashes": { + "hash_examples": "17b0cae357c0259e", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "2b0e07d4cdd3b0fe", + "hash_cont_tokens": "52204555b6e39a6e" + }, + "truncated": 0, + "non-truncated": 4687, + "padded": 4687, + "non-padded": 0, + "effective_few_shots": 25.0, + "num_truncated_few_shots": 0 + }, + "harness|hellaswag|10": { + "hashes": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "578edd77107cb2c3", + "hash_cont_tokens": "25c49737526d9f80" + }, + "truncated": 0, + "non-truncated": 40168, + "padded": 40113, + "non-padded": 55, + "effective_few_shots": 10.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hashes": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "6a95a1511f8da075", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-anatomy|5": { + "hashes": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "24a78edc4d9a93aa", + "hash_cont_tokens": "f11971a765cb609f" + }, + "truncated": 0, + "non-truncated": 540, + "padded": 540, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-astronomy|5": { + "hashes": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "b11106668d6c0974", + "hash_cont_tokens": "ebed26cf74a85815" + }, + "truncated": 0, + "non-truncated": 608, + "padded": 608, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-business_ethics|5": { + "hashes": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "10180ba12a075cb0", + "hash_cont_tokens": "6898ac348a7ae442" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hashes": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "73351ef4968750a2", + "hash_cont_tokens": "34a058958a45af94" + }, + "truncated": 0, + "non-truncated": 1060, + "padded": 1060, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_biology|5": { + "hashes": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "a539150af234c668", + "hash_cont_tokens": "875cde3af7a0ee14" + }, + "truncated": 0, + "non-truncated": 576, + "padded": 576, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_chemistry|5": { + "hashes": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "52e12e5a43bcee35", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + 
"effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_computer_science|5": { + "hashes": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "d1f3721a5659f7ee", + "hash_cont_tokens": "da408cb12ab08288" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_mathematics|5": { + "hashes": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "f2d78f546b5595c2", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_medicine|5": { + "hashes": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "c9cc19179f63d1d6", + "hash_cont_tokens": "370a1a0ab68d15cd" + }, + "truncated": 0, + "non-truncated": 692, + "padded": 692, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_physics|5": { + "hashes": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "5046144e67e992e8", + "hash_cont_tokens": "f7b8097afc16a47c" + }, + "truncated": 0, + "non-truncated": 408, + "padded": 408, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-computer_security|5": { + "hashes": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "4b14581ba4fc06fc", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hashes": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "1ee52c413b5b4cc4", + "hash_cont_tokens": "aa0e8bc655f2f641" + }, + "truncated": 0, + "non-truncated": 940, + "padded": 940, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-econometrics|5": { + "hashes": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "2914077c4dd3090a", + "hash_cont_tokens": "80dea4d59245cf01" + }, + "truncated": 0, + "non-truncated": 456, + "padded": 456, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hashes": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "0f88a874342378de", + "hash_cont_tokens": "2425a3f084a591ef" + }, + "truncated": 0, + "non-truncated": 580, + "padded": 580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hashes": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "9889933f1dd02a23", + "hash_cont_tokens": "309bef1803097408" + }, + "truncated": 0, + "non-truncated": 1512, + "padded": 1512, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-formal_logic|5": { + "hashes": { + 
"hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "dc309a94c4bfdd2f", + "hash_cont_tokens": "5105a3bd1b39b785" + }, + "truncated": 0, + "non-truncated": 504, + "padded": 504, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-global_facts|5": { + "hashes": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "0801a0aebec3ba8c", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_biology|5": { + "hashes": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "5bc4aca8831d9c05", + "hash_cont_tokens": "205c5deee1581b02" + }, + "truncated": 0, + "non-truncated": 1240, + "padded": 1240, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hashes": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "b92bd6b06fc3464c", + "hash_cont_tokens": "272d28867e0ff046" + }, + "truncated": 0, + "non-truncated": 812, + "padded": 812, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hashes": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "a549346cde8165e9", + "hash_cont_tokens": "98b3bf311aa83f0d" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hashes": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "f1f73dd687da18d7", + "hash_cont_tokens": "674fc454bdc5ac93" + }, + "truncated": 660, + "non-truncated": 0, + "padded": 0, + "non-padded": 660, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_geography|5": { + "hashes": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "e7e9cf91f9d6a081", + "hash_cont_tokens": "03a5012b916274ea" + }, + "truncated": 0, + "non-truncated": 792, + "padded": 792, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hashes": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "a61a1670f854d9e1", + "hash_cont_tokens": "d9e66fc7c702b795" + }, + "truncated": 0, + "non-truncated": 772, + "padded": 772, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hashes": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "8a77cb7763f28110", + "hash_cont_tokens": "c583432ad27fcfe0" + }, + "truncated": 0, + "non-truncated": 1560, + "padded": 1560, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hashes": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + 
"hash_input_tokens": "fcfcfae391f8faa1", + "hash_cont_tokens": "d4b1936084c060e0" + }, + "truncated": 0, + "non-truncated": 1080, + "padded": 1080, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hashes": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "a29454cc1feb23ef", + "hash_cont_tokens": "f47f041de50333b9" + }, + "truncated": 0, + "non-truncated": 952, + "padded": 952, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_physics|5": { + "hashes": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "b6734a25556d75dc", + "hash_cont_tokens": "2bf9921a39e901d9" + }, + "truncated": 0, + "non-truncated": 604, + "padded": 604, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hashes": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "5720438e29473426", + "hash_cont_tokens": "cab8b16be9576360" + }, + "truncated": 0, + "non-truncated": 2180, + "padded": 2180, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hashes": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "486321d5858de240", + "hash_cont_tokens": "1c34fbe5a59f1ed1" + }, + "truncated": 0, + "non-truncated": 864, + "padded": 864, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hashes": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "50c9ff438c85a69e", + "hash_cont_tokens": "cdd0b3dc06d933e5" + }, + "truncated": 816, + "non-truncated": 0, + "padded": 0, + "non-padded": 816, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hashes": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "473919e64d1b8c80", + "hash_cont_tokens": "ebd714885a59ef55" + }, + "truncated": 8, + "non-truncated": 940, + "padded": 940, + "non-padded": 8, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_aging|5": { + "hashes": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "47a65c81fd7ed010", + "hash_cont_tokens": "142a4a8a1138a214" + }, + "truncated": 0, + "non-truncated": 892, + "padded": 892, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_sexuality|5": { + "hashes": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "aedfcd41cbd2fcc9", + "hash_cont_tokens": "bc54813e809b796d" + }, + "truncated": 0, + "non-truncated": 524, + "padded": 524, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-international_law|5": { + "hashes": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "ed5f2414144d7b72", + "hash_cont_tokens": "aac52fa6a519223b" + }, + "truncated": 0, + 
"non-truncated": 484, + "padded": 484, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-jurisprudence|5": { + "hashes": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "692eaacb5b747264", + "hash_cont_tokens": "e3a8cd951b6e3469" + }, + "truncated": 0, + "non-truncated": 432, + "padded": 432, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hashes": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "2cbce4edca937588", + "hash_cont_tokens": "697179a0dd47c5c0" + }, + "truncated": 0, + "non-truncated": 652, + "padded": 648, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-machine_learning|5": { + "hashes": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "c2f38b19bab1aa2c", + "hash_cont_tokens": "9b19898e3ecb527f" + }, + "truncated": 0, + "non-truncated": 448, + "padded": 448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-management|5": { + "hashes": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "fde277bc547bc3d8", + "hash_cont_tokens": "a01d6d39a83c4597" + }, + "truncated": 0, + "non-truncated": 412, + "padded": 412, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-marketing|5": { + "hashes": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "87b232bbebce39db", + "hash_cont_tokens": "6aeaed4d823c98aa" + }, + "truncated": 0, + "non-truncated": 936, + "padded": 936, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-medical_genetics|5": { + "hashes": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "58c21af9da3e126e", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-miscellaneous|5": { + "hashes": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "d1f5c770d368e9c6", + "hash_cont_tokens": "9b0ab02a64603081" + }, + "truncated": 0, + "non-truncated": 3132, + "padded": 3132, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_disputes|5": { + "hashes": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "98d6db15a50aaa8e", + "hash_cont_tokens": "1e30d7dedc7588c0" + }, + "truncated": 0, + "non-truncated": 1384, + "padded": 1354, + "non-padded": 30, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hashes": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "2aabd8c7337502f8", + "hash_cont_tokens": "ceee291786cbb123" + }, + "truncated": 0, + "non-truncated": 3580, + "padded": 3580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-nutrition|5": { + 
"hashes": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "17f8c8f2d4a0a9b1", + "hash_cont_tokens": "484df4c25a5460bd" + }, + "truncated": 0, + "non-truncated": 1224, + "padded": 1224, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-philosophy|5": { + "hashes": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "dfc6df491d991966", + "hash_cont_tokens": "9f6ff69d23a48783" + }, + "truncated": 0, + "non-truncated": 1244, + "padded": 1244, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-prehistory|5": { + "hashes": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "cffe8139e00da9dd", + "hash_cont_tokens": "85a9de6c685b7035" + }, + "truncated": 0, + "non-truncated": 1296, + "padded": 1296, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_accounting|5": { + "hashes": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "4a69ed6ee55918fb", + "hash_cont_tokens": "ad7b5a040535bdcf" + }, + "truncated": 0, + "non-truncated": 1128, + "padded": 1128, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_law|5": { + "hashes": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "6cc713f12b5890de", + "hash_cont_tokens": "2e590029ef41fbcd" + }, + "truncated": 604, + "non-truncated": 5532, + "padded": 5524, + "non-padded": 612, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_medicine|5": { + "hashes": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "b4044fc92756c377", + "hash_cont_tokens": "0b7b5aaef574dc78" + }, + "truncated": 0, + "non-truncated": 1088, + "padded": 1088, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_psychology|5": { + "hashes": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "b019784da8db089a", + "hash_cont_tokens": "63a651778e8d72d2" + }, + "truncated": 0, + "non-truncated": 2448, + "padded": 2448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-public_relations|5": { + "hashes": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "f47f37c7c9bfc601", + "hash_cont_tokens": "841583ab707b25d7" + }, + "truncated": 0, + "non-truncated": 440, + "padded": 440, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-security_studies|5": { + "hashes": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "4d282718d6142410", + "hash_cont_tokens": "9c2c01d3214f66b8" + }, + "truncated": 0, + "non-truncated": 980, + "padded": 980, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-sociology|5": { + "hashes": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "fbc6026e500537bc", + 
"hash_cont_tokens": "c3a3bdfd177eed5b" + }, + "truncated": 0, + "non-truncated": 804, + "padded": 804, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hashes": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "150dd1ff81ff642e", + "hash_cont_tokens": "96353c5969a9028a" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-virology|5": { + "hashes": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "fcbac3e735545969", + "hash_cont_tokens": "a1f8901800ac9b68" + }, + "truncated": 0, + "non-truncated": 664, + "padded": 664, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-world_religions|5": { + "hashes": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "ffc962a38441ef13", + "hash_cont_tokens": "08c0be345e5f1c12" + }, + "truncated": 0, + "non-truncated": 684, + "padded": 684, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|truthfulqa:mc|0": { + "hashes": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "9ffb65d225ae550f", + "hash_cont_tokens": "16c760a491c6f26e" + }, + "truncated": 0, + "non-truncated": 9996, + "padded": 9996, + "non-padded": 0, + "effective_few_shots": 0.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "d84d18e9a963753d", + "hash_full_prompts": "12b540783521a8e6", + "hash_input_tokens": "1c61d6705b299f5c", + "hash_cont_tokens": "868d6f1055fbd51d" + }, + "total_evaluation_time_secondes": "4699.229541778564", + "truncated": 2088, + "non-truncated": 108931, + "padded": 108834, + "non-padded": 2185, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/TheBloke/Project-Baize-v2-7B-GPTQ/results_2023-08-29T19-38-18.380876.json b/eval-results/TheBloke/Project-Baize-v2-7B-GPTQ/results_2023-08-29T19-38-18.380876.json new file mode 100644 index 0000000000000000000000000000000000000000..569d67e8547d0906d5850b6993380179507b96bc --- /dev/null +++ b/eval-results/TheBloke/Project-Baize-v2-7B-GPTQ/results_2023-08-29T19-38-18.380876.json @@ -0,0 +1,1366 @@ +{ + "config_general": { + "model_name": "TheBloke/Project-Baize-v2-7B-GPTQ", + "model_sha": "5dc039834e1ea42ac334458b2e3090fe3705cc59", + "model_dtype": "torch.float16", + "lighteval_sha": "4b9a38c2102259daeb17d1d0294fc2b4ce7f4e63", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|arc:challenge|25": { + "acc": 0.44112627986348124, + "acc_stderr": 0.014509747749064663, + "acc_norm": 0.4598976109215017, + "acc_norm_stderr": 0.014564318856924848 + }, + "harness|hellaswag|10": { + "acc": 0.5578570005974905, + "acc_stderr": 0.004956262919324401, + "acc_norm": 0.7344154550886277, + "acc_norm_stderr": 0.004407413723383404 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.25, + "acc_stderr": 0.04351941398892446, + "acc_norm": 0.25, + "acc_norm_stderr": 0.04351941398892446 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.4, + "acc_stderr": 0.04232073695151589, + "acc_norm": 0.4, + "acc_norm_stderr": 
0.04232073695151589 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.3355263157894737, + "acc_stderr": 0.038424985593952694, + "acc_norm": 0.3355263157894737, + "acc_norm_stderr": 0.038424985593952694 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.41, + "acc_stderr": 0.049431107042371025, + "acc_norm": 0.41, + "acc_norm_stderr": 0.049431107042371025 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.4, + "acc_stderr": 0.030151134457776296, + "acc_norm": 0.4, + "acc_norm_stderr": 0.030151134457776296 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.3472222222222222, + "acc_stderr": 0.039812405437178615, + "acc_norm": 0.3472222222222222, + "acc_norm_stderr": 0.039812405437178615 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.26, + "acc_stderr": 0.04408440022768077, + "acc_norm": 0.26, + "acc_norm_stderr": 0.04408440022768077 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.4, + "acc_stderr": 0.04923659639173309, + "acc_norm": 0.4, + "acc_norm_stderr": 0.04923659639173309 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.31, + "acc_stderr": 0.04648231987117316, + "acc_norm": 0.31, + "acc_norm_stderr": 0.04648231987117316 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.28901734104046245, + "acc_stderr": 0.034564257450869995, + "acc_norm": 0.28901734104046245, + "acc_norm_stderr": 0.034564257450869995 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.28431372549019607, + "acc_stderr": 0.04488482852329017, + "acc_norm": 0.28431372549019607, + "acc_norm_stderr": 0.04488482852329017 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.34, + "acc_stderr": 0.04760952285695236, + "acc_norm": 0.34, + "acc_norm_stderr": 0.04760952285695236 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.2936170212765957, + "acc_stderr": 0.029771642712491227, + "acc_norm": 0.2936170212765957, + "acc_norm_stderr": 0.029771642712491227 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.2543859649122807, + "acc_stderr": 0.04096985139843671, + "acc_norm": 0.2543859649122807, + "acc_norm_stderr": 0.04096985139843671 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.36551724137931035, + "acc_stderr": 0.04013124195424385, + "acc_norm": 0.36551724137931035, + "acc_norm_stderr": 0.04013124195424385 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.2328042328042328, + "acc_stderr": 0.02176596167215453, + "acc_norm": 0.2328042328042328, + "acc_norm_stderr": 0.02176596167215453 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.30952380952380953, + "acc_stderr": 0.041349130183033156, + "acc_norm": 0.30952380952380953, + "acc_norm_stderr": 0.041349130183033156 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.22, + "acc_stderr": 0.04163331998932269, + "acc_norm": 0.22, + "acc_norm_stderr": 0.04163331998932269 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.36451612903225805, + "acc_stderr": 0.02737987122994325, + "acc_norm": 0.36451612903225805, + "acc_norm_stderr": 0.02737987122994325 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.2512315270935961, + "acc_stderr": 0.030516530732694436, + "acc_norm": 0.2512315270935961, + "acc_norm_stderr": 0.030516530732694436 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.34, + "acc_stderr": 0.047609522856952344, + "acc_norm": 0.34, + "acc_norm_stderr": 0.047609522856952344 + }, + 
"harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.4484848484848485, + "acc_stderr": 0.038835659779569286, + "acc_norm": 0.4484848484848485, + "acc_norm_stderr": 0.038835659779569286 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.4292929292929293, + "acc_stderr": 0.03526552724601199, + "acc_norm": 0.4292929292929293, + "acc_norm_stderr": 0.03526552724601199 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.44559585492227977, + "acc_stderr": 0.035870149860756595, + "acc_norm": 0.44559585492227977, + "acc_norm_stderr": 0.035870149860756595 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.34102564102564104, + "acc_stderr": 0.024035489676335065, + "acc_norm": 0.34102564102564104, + "acc_norm_stderr": 0.024035489676335065 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.2740740740740741, + "acc_stderr": 0.027195934804085626, + "acc_norm": 0.2740740740740741, + "acc_norm_stderr": 0.027195934804085626 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.33613445378151263, + "acc_stderr": 0.03068473711513536, + "acc_norm": 0.33613445378151263, + "acc_norm_stderr": 0.03068473711513536 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.2980132450331126, + "acc_stderr": 0.03734535676787198, + "acc_norm": 0.2980132450331126, + "acc_norm_stderr": 0.03734535676787198 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.4036697247706422, + "acc_stderr": 0.02103570485657497, + "acc_norm": 0.4036697247706422, + "acc_norm_stderr": 0.02103570485657497 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.30092592592592593, + "acc_stderr": 0.031280390843298825, + "acc_norm": 0.30092592592592593, + "acc_norm_stderr": 0.031280390843298825 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.4362745098039216, + "acc_stderr": 0.03480693138457039, + "acc_norm": 0.4362745098039216, + "acc_norm_stderr": 0.03480693138457039 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.4936708860759494, + "acc_stderr": 0.03254462010767859, + "acc_norm": 0.4936708860759494, + "acc_norm_stderr": 0.03254462010767859 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.3721973094170404, + "acc_stderr": 0.032443052830087304, + "acc_norm": 0.3721973094170404, + "acc_norm_stderr": 0.032443052830087304 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.4198473282442748, + "acc_stderr": 0.04328577215262972, + "acc_norm": 0.4198473282442748, + "acc_norm_stderr": 0.04328577215262972 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.45454545454545453, + "acc_stderr": 0.045454545454545456, + "acc_norm": 0.45454545454545453, + "acc_norm_stderr": 0.045454545454545456 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.3148148148148148, + "acc_stderr": 0.04489931073591312, + "acc_norm": 0.3148148148148148, + "acc_norm_stderr": 0.04489931073591312 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.3374233128834356, + "acc_stderr": 0.037149084099355745, + "acc_norm": 0.3374233128834356, + "acc_norm_stderr": 0.037149084099355745 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.23214285714285715, + "acc_stderr": 0.04007341809755805, + "acc_norm": 0.23214285714285715, + "acc_norm_stderr": 0.04007341809755805 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.4077669902912621, + "acc_stderr": 0.04865777570410769, + "acc_norm": 0.4077669902912621, + 
"acc_norm_stderr": 0.04865777570410769 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.4444444444444444, + "acc_stderr": 0.03255326307272485, + "acc_norm": 0.4444444444444444, + "acc_norm_stderr": 0.03255326307272485 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.45, + "acc_stderr": 0.05, + "acc_norm": 0.45, + "acc_norm_stderr": 0.05 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.46998722860791825, + "acc_stderr": 0.0178477230866491, + "acc_norm": 0.46998722860791825, + "acc_norm_stderr": 0.0178477230866491 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.3670520231213873, + "acc_stderr": 0.025950054337654082, + "acc_norm": 0.3670520231213873, + "acc_norm_stderr": 0.025950054337654082 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.25251396648044694, + "acc_stderr": 0.014530330201468645, + "acc_norm": 0.25251396648044694, + "acc_norm_stderr": 0.014530330201468645 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.33986928104575165, + "acc_stderr": 0.027121956071388845, + "acc_norm": 0.33986928104575165, + "acc_norm_stderr": 0.027121956071388845 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.3762057877813505, + "acc_stderr": 0.02751392568354943, + "acc_norm": 0.3762057877813505, + "acc_norm_stderr": 0.02751392568354943 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.37962962962962965, + "acc_stderr": 0.027002521034516478, + "acc_norm": 0.37962962962962965, + "acc_norm_stderr": 0.027002521034516478 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.2695035460992908, + "acc_stderr": 0.02646903681859062, + "acc_norm": 0.2695035460992908, + "acc_norm_stderr": 0.02646903681859062 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.303129074315515, + "acc_stderr": 0.01173866995125429, + "acc_norm": 0.303129074315515, + "acc_norm_stderr": 0.01173866995125429 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.4522058823529412, + "acc_stderr": 0.03023375855159645, + "acc_norm": 0.4522058823529412, + "acc_norm_stderr": 0.03023375855159645 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.30718954248366015, + "acc_stderr": 0.01866335967146367, + "acc_norm": 0.30718954248366015, + "acc_norm_stderr": 0.01866335967146367 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.34545454545454546, + "acc_stderr": 0.04554619617541054, + "acc_norm": 0.34545454545454546, + "acc_norm_stderr": 0.04554619617541054 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.3224489795918367, + "acc_stderr": 0.029923100563683906, + "acc_norm": 0.3224489795918367, + "acc_norm_stderr": 0.029923100563683906 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.40298507462686567, + "acc_stderr": 0.03468343295111126, + "acc_norm": 0.40298507462686567, + "acc_norm_stderr": 0.03468343295111126 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.5, + "acc_stderr": 0.050251890762960605, + "acc_norm": 0.5, + "acc_norm_stderr": 0.050251890762960605 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.3373493975903614, + "acc_stderr": 0.0368078369072758, + "acc_norm": 0.3373493975903614, + "acc_norm_stderr": 0.0368078369072758 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.49122807017543857, + "acc_stderr": 0.038342347441649924, + "acc_norm": 0.49122807017543857, + "acc_norm_stderr": 0.038342347441649924 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.25458996328029376, + "mc1_stderr": 0.015250117079156494, + "mc2": 
0.3991984976250448, + "mc2_stderr": 0.015243111830696071 + }, + "all": { + "acc": 0.3595551770399452, + "acc_stderr": 0.034528095881188374, + "acc_norm": 0.3628658513712869, + "acc_norm_stderr": 0.03451971828647514, + "mc1": 0.25458996328029376, + "mc1_stderr": 0.015250117079156494, + "mc2": 0.3991984976250448, + "mc2_stderr": 0.015243111830696071 + } + }, + "versions": { + "harness|arc:challenge|25": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "all": 0 + }, + "config_tasks": { + "harness|arc:challenge": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": 
"LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM 
Harness task", + "harness|truthfulqa:mc": "LM Harness task" + }, + "summary_tasks": { + "harness|arc:challenge|25": { + "hashes": { + "hash_examples": "17b0cae357c0259e", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "61571bf68d6d89aa", + "hash_cont_tokens": "8210decc6ff6f7df" + }, + "truncated": 0, + "non-truncated": 4687, + "padded": 4687, + "non-padded": 0, + "effective_few_shots": 25.0, + "num_truncated_few_shots": 0 + }, + "harness|hellaswag|10": { + "hashes": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "29906669b1c7054a", + "hash_cont_tokens": "b3b9e9017afa63af" + }, + "truncated": 0, + "non-truncated": 40168, + "padded": 40113, + "non-padded": 55, + "effective_few_shots": 10.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hashes": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "c54ff61ad0273dd7", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-anatomy|5": { + "hashes": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "be31a1e22aef5f90", + "hash_cont_tokens": "f11971a765cb609f" + }, + "truncated": 0, + "non-truncated": 540, + "padded": 540, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-astronomy|5": { + "hashes": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "277a7b1fad566940", + "hash_cont_tokens": "bf30e5d3f48250cb" + }, + "truncated": 0, + "non-truncated": 608, + "padded": 608, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-business_ethics|5": { + "hashes": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "ba552605bc116de5", + "hash_cont_tokens": "bc1dd9b2d995eb61" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hashes": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "428c7563d0b98ab9", + "hash_cont_tokens": "890a119624b3b935" + }, + "truncated": 0, + "non-truncated": 1060, + "padded": 1060, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_biology|5": { + "hashes": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "da036601573942e2", + "hash_cont_tokens": "875cde3af7a0ee14" + }, + "truncated": 0, + "non-truncated": 576, + "padded": 576, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_chemistry|5": { + "hashes": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "94e0196d6aded13d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_computer_science|5": { + "hashes": { + "hash_examples": "30318289d717a5cf", + 
"hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "6e4d0f4a8d36690b", + "hash_cont_tokens": "ffc0fe414cdc4a83" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_mathematics|5": { + "hashes": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "614054d17109a25d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_medicine|5": { + "hashes": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "1d633b3cc0524ba8", + "hash_cont_tokens": "1f88b00d41957d82" + }, + "truncated": 0, + "non-truncated": 692, + "padded": 692, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_physics|5": { + "hashes": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "5421d9a1af86cbd4", + "hash_cont_tokens": "f7b8097afc16a47c" + }, + "truncated": 0, + "non-truncated": 408, + "padded": 408, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-computer_security|5": { + "hashes": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "5e6b70ecb333cf18", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hashes": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "c2ef11a87264ceed", + "hash_cont_tokens": "aa0e8bc655f2f641" + }, + "truncated": 0, + "non-truncated": 940, + "padded": 940, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-econometrics|5": { + "hashes": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "ecaccd912a4c3978", + "hash_cont_tokens": "bfb7e3c3c88313f1" + }, + "truncated": 0, + "non-truncated": 456, + "padded": 456, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hashes": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "1590c84291399be8", + "hash_cont_tokens": "2425a3f084a591ef" + }, + "truncated": 0, + "non-truncated": 580, + "padded": 580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hashes": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "3269597f715b0da1", + "hash_cont_tokens": "f52691aef15a407b" + }, + "truncated": 0, + "non-truncated": 1512, + "padded": 1512, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-formal_logic|5": { + "hashes": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "a2800d20f3ab8d7c", + "hash_cont_tokens": "f515d598d9c21263" + }, + "truncated": 0, + 
"non-truncated": 504, + "padded": 504, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-global_facts|5": { + "hashes": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "94ed44b3772505ad", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_biology|5": { + "hashes": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "24423acb928db768", + "hash_cont_tokens": "bd85a4156a3613ee" + }, + "truncated": 0, + "non-truncated": 1240, + "padded": 1240, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hashes": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "831ff35c474e5cef", + "hash_cont_tokens": "a95c97af1c14e068" + }, + "truncated": 0, + "non-truncated": 812, + "padded": 812, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hashes": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "8c34e0f2bda77358", + "hash_cont_tokens": "8abfedef914e33c9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hashes": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "f1f73dd687da18d7", + "hash_cont_tokens": "674fc454bdc5ac93" + }, + "truncated": 660, + "non-truncated": 0, + "padded": 0, + "non-padded": 660, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_geography|5": { + "hashes": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "7c5547c7da5bc793", + "hash_cont_tokens": "03a5012b916274ea" + }, + "truncated": 0, + "non-truncated": 792, + "padded": 792, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hashes": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "f62991cb6a496b05", + "hash_cont_tokens": "a83effb8f76b7d7c" + }, + "truncated": 0, + "non-truncated": 772, + "padded": 772, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hashes": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "4cef2aff6e3d59ed", + "hash_cont_tokens": "c583432ad27fcfe0" + }, + "truncated": 0, + "non-truncated": 1560, + "padded": 1560, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hashes": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "6e2577ea4082ed2b", + "hash_cont_tokens": "24f5dc613660300b" + }, + "truncated": 0, + "non-truncated": 1080, + "padded": 1080, + "non-padded": 0, + "effective_few_shots": 
5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hashes": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "c5fc9aeb1079c8e4", + "hash_cont_tokens": "f47f041de50333b9" + }, + "truncated": 0, + "non-truncated": 952, + "padded": 952, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_physics|5": { + "hashes": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "555fc385cffa84ca", + "hash_cont_tokens": "ba2efcd283e938cc" + }, + "truncated": 0, + "non-truncated": 604, + "padded": 604, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hashes": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "febd23cbf9973b7f", + "hash_cont_tokens": "942069cd363844d9" + }, + "truncated": 0, + "non-truncated": 2180, + "padded": 2180, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hashes": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "424b02981230ee83", + "hash_cont_tokens": "955ed42b6f7fa019" + }, + "truncated": 0, + "non-truncated": 864, + "padded": 864, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hashes": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "50c9ff438c85a69e", + "hash_cont_tokens": "cdd0b3dc06d933e5" + }, + "truncated": 816, + "non-truncated": 0, + "padded": 0, + "non-padded": 816, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hashes": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "054824cc474caef5", + "hash_cont_tokens": "9a864184946033ac" + }, + "truncated": 8, + "non-truncated": 940, + "padded": 940, + "non-padded": 8, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_aging|5": { + "hashes": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "541a75f071dcf579", + "hash_cont_tokens": "142a4a8a1138a214" + }, + "truncated": 0, + "non-truncated": 892, + "padded": 892, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_sexuality|5": { + "hashes": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "04269e5c5a257dd9", + "hash_cont_tokens": "bc54813e809b796d" + }, + "truncated": 0, + "non-truncated": 524, + "padded": 524, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-international_law|5": { + "hashes": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "d93ba9d9d38e4397", + "hash_cont_tokens": "dc45b45fcda18e5d" + }, + "truncated": 0, + "non-truncated": 484, + "padded": 484, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-jurisprudence|5": { + "hashes": { + "hash_examples": 
"083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "9eeaccd2698b4f5a", + "hash_cont_tokens": "e3a8cd951b6e3469" + }, + "truncated": 0, + "non-truncated": 432, + "padded": 432, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hashes": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "b4f08f544f2b7576", + "hash_cont_tokens": "1e80dbd30f6453d5" + }, + "truncated": 0, + "non-truncated": 652, + "padded": 648, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-machine_learning|5": { + "hashes": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "900c2a51f1174b9f", + "hash_cont_tokens": "9b37da7777378ca9" + }, + "truncated": 0, + "non-truncated": 448, + "padded": 448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-management|5": { + "hashes": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "6b36efb4689c6eca", + "hash_cont_tokens": "a01d6d39a83c4597" + }, + "truncated": 0, + "non-truncated": 412, + "padded": 412, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-marketing|5": { + "hashes": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "2aaac78a0cfed47a", + "hash_cont_tokens": "6aeaed4d823c98aa" + }, + "truncated": 0, + "non-truncated": 936, + "padded": 936, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-medical_genetics|5": { + "hashes": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "886ca823b41c094a", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-miscellaneous|5": { + "hashes": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "72fd71de7675e7d0", + "hash_cont_tokens": "9b0ab02a64603081" + }, + "truncated": 0, + "non-truncated": 3132, + "padded": 3132, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_disputes|5": { + "hashes": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "f3ca0dd8e7a1eb09", + "hash_cont_tokens": "8badf768f7b0467a" + }, + "truncated": 0, + "non-truncated": 1384, + "padded": 1354, + "non-padded": 30, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hashes": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "3e793631e951f23c", + "hash_cont_tokens": "32ae620376b2bbba" + }, + "truncated": 0, + "non-truncated": 3580, + "padded": 3580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-nutrition|5": { + "hashes": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "59753c2144ea93af", + "hash_cont_tokens": "3071def75bacc404" + }, + "truncated": 0, + 
"non-truncated": 1224, + "padded": 1224, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-philosophy|5": { + "hashes": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "bd8d3dbed15a8c34", + "hash_cont_tokens": "9f6ff69d23a48783" + }, + "truncated": 0, + "non-truncated": 1244, + "padded": 1244, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-prehistory|5": { + "hashes": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "3573cd87facbb7c5", + "hash_cont_tokens": "de469d2b981e32a3" + }, + "truncated": 0, + "non-truncated": 1296, + "padded": 1296, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_accounting|5": { + "hashes": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "17e721bc1a7cbb47", + "hash_cont_tokens": "c46f74d2dfc7b13b" + }, + "truncated": 0, + "non-truncated": 1128, + "padded": 1128, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_law|5": { + "hashes": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "9178e10bd0763ec4", + "hash_cont_tokens": "2e590029ef41fbcd" + }, + "truncated": 604, + "non-truncated": 5532, + "padded": 5524, + "non-padded": 612, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_medicine|5": { + "hashes": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "f5a22012a54f70ea", + "hash_cont_tokens": "fe35cfa9c6ca802e" + }, + "truncated": 0, + "non-truncated": 1088, + "padded": 1088, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_psychology|5": { + "hashes": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "0dfb73a8eb3f692c", + "hash_cont_tokens": "f020fbddf72c8652" + }, + "truncated": 0, + "non-truncated": 2448, + "padded": 2448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-public_relations|5": { + "hashes": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "1710c6ba4c9f3cbd", + "hash_cont_tokens": "568f585a259965c1" + }, + "truncated": 0, + "non-truncated": 440, + "padded": 440, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-security_studies|5": { + "hashes": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "d49711415961ced7", + "hash_cont_tokens": "cc6fd7cccd64cd5d" + }, + "truncated": 0, + "non-truncated": 980, + "padded": 980, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-sociology|5": { + "hashes": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "828999f7624cbe7e", + "hash_cont_tokens": "c3a3bdfd177eed5b" + }, + "truncated": 0, + "non-truncated": 804, + "padded": 804, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + 
"harness|hendrycksTest-us_foreign_policy|5": { + "hashes": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "42054621e718dbee", + "hash_cont_tokens": "2568d0e8e36fa959" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-virology|5": { + "hashes": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "6c4f0aa4dc859c04", + "hash_cont_tokens": "926cf60b0891f374" + }, + "truncated": 0, + "non-truncated": 664, + "padded": 664, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-world_religions|5": { + "hashes": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "6c75d44e092ff24f", + "hash_cont_tokens": "c525a5de974c1ea3" + }, + "truncated": 0, + "non-truncated": 684, + "padded": 684, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|truthfulqa:mc|0": { + "hashes": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "2738d7ed7075faa7", + "hash_cont_tokens": "c014154380b74b9e" + }, + "truncated": 0, + "non-truncated": 9996, + "padded": 9996, + "non-padded": 0, + "effective_few_shots": 0.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "d84d18e9a963753d", + "hash_full_prompts": "12b540783521a8e6", + "hash_input_tokens": "6fecf578c508db6a", + "hash_cont_tokens": "fb1646e2bdd5fc38" + }, + "total_evaluation_time_secondes": "4540.518447875977", + "truncated": 2088, + "non-truncated": 108931, + "padded": 108834, + "non-padded": 2185, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/TheBloke/Project-Baize-v2-7B-GPTQ/results_2023-10-22T21-24-57.179060.json b/eval-results/TheBloke/Project-Baize-v2-7B-GPTQ/results_2023-10-22T21-24-57.179060.json new file mode 100644 index 0000000000000000000000000000000000000000..72048581e436ddeed5db7a20b3cf1fa27813e367 --- /dev/null +++ b/eval-results/TheBloke/Project-Baize-v2-7B-GPTQ/results_2023-10-22T21-24-57.179060.json @@ -0,0 +1,107 @@ +{ + "config_general": { + "model_name": "TheBloke/Project-Baize-v2-7B-GPTQ", + "model_sha": "5dc039834e1ea42ac334458b2e3090fe3705cc59", + "model_size": "3.66 GB", + "model_dtype": "torch.float16", + "lighteval_sha": "0f318ecf002208468154899217b3ba7c6ae09374", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|drop|3": { + "em": 0.001363255033557047, + "em_stderr": 0.0003778609196460633, + "f1": 0.05739828020134247, + "f1_stderr": 0.001324280220685328 + }, + "harness|gsm8k|5": { + "acc": 0.025018953752843062, + "acc_stderr": 0.0043020450465643045 + }, + "harness|winogrande|5": { + "acc": 0.696921862667719, + "acc_stderr": 0.012916727462634475 + }, + "all": { + "em": 0.001363255033557047, + "em_stderr": 0.0003778609196460633, + "f1": 0.05739828020134247, + "f1_stderr": 0.001324280220685328, + "acc": 0.36097040821028104, + "acc_stderr": 0.00860938625459939 + } + }, + "versions": { + "harness|drop|3": 1, + "harness|gsm8k|5": 0, + "harness|winogrande|5": 0, + "all": 0 + }, + "config_tasks": { + "harness|drop": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|winogrande": "LM Harness task" 
+ }, + "summary_tasks": { + "harness|drop|3": { + "hashes": { + "hash_examples": "1d27416e8324e9a3", + "hash_full_prompts": "a5513ff9a741b385", + "hash_input_tokens": "61b608e0b5ceed76", + "hash_cont_tokens": "5145f9427a099ecb" + }, + "truncated": 1263, + "non-truncated": 8273, + "padded": 0, + "non-padded": 9536, + "effective_few_shots": 3.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "bda342e47b5099b2", + "hash_cont_tokens": "71cc5ad20087ef13" + }, + "truncated": 0, + "non-truncated": 1319, + "padded": 0, + "non-padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "c0bedf98cb040854", + "hash_cont_tokens": "f08975ad6f2d5864" + }, + "truncated": 0, + "non-truncated": 2534, + "padded": 2432, + "non-padded": 102, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "9b4d8993161e637d", + "hash_full_prompts": "08215e527b7e60a5", + "hash_input_tokens": "80afe720f936f8d2", + "hash_cont_tokens": "b63202e28fc7b1f7" + }, + "total_evaluation_time_secondes": "6006.673577070236", + "truncated": 1263, + "non-truncated": 12126, + "padded": 2432, + "non-padded": 10957, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/TheBloke/UltraLM-13B-fp16/results_2023-07-19T19-33-28.322265.json b/eval-results/TheBloke/UltraLM-13B-fp16/results_2023-07-19T19-33-28.322265.json new file mode 100644 index 0000000000000000000000000000000000000000..134d0270859e1dc3c66696e265a57aae1f58a567 --- /dev/null +++ b/eval-results/TheBloke/UltraLM-13B-fp16/results_2023-07-19T19-33-28.322265.json @@ -0,0 +1,871 @@ +{ + "results": { + "harness|arc:challenge|25": { + "acc": 0.5631399317406144, + "acc_stderr": 0.014494421584256525, + "acc_norm": 0.575938566552901, + "acc_norm_stderr": 0.0144418896274644 + }, + "harness|hellaswag|10": { + "acc": 0.5995817566221868, + "acc_stderr": 0.004889817489739686, + "acc_norm": 0.8020314678350926, + "acc_norm_stderr": 0.003976539512078581 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.3, + "acc_stderr": 0.04605661864718381, + "acc_norm": 0.3, + "acc_norm_stderr": 0.04605661864718381 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.4740740740740741, + "acc_stderr": 0.04313531696750574, + "acc_norm": 0.4740740740740741, + "acc_norm_stderr": 0.04313531696750574 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.5131578947368421, + "acc_stderr": 0.04067533136309174, + "acc_norm": 0.5131578947368421, + "acc_norm_stderr": 0.04067533136309174 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.52, + "acc_stderr": 0.050211673156867795, + "acc_norm": 0.52, + "acc_norm_stderr": 0.050211673156867795 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.5283018867924528, + "acc_stderr": 0.0307235352490061, + "acc_norm": 0.5283018867924528, + "acc_norm_stderr": 0.0307235352490061 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.5069444444444444, + "acc_stderr": 0.04180806750294938, + "acc_norm": 0.5069444444444444, + "acc_norm_stderr": 0.04180806750294938 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.37, + "acc_stderr": 0.04852365870939099, + "acc_norm": 0.37, + "acc_norm_stderr": 0.04852365870939099 + }, + 
"harness|hendrycksTest-college_computer_science|5": { + "acc": 0.41, + "acc_stderr": 0.049431107042371025, + "acc_norm": 0.41, + "acc_norm_stderr": 0.049431107042371025 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.31, + "acc_stderr": 0.04648231987117316, + "acc_norm": 0.31, + "acc_norm_stderr": 0.04648231987117316 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.4393063583815029, + "acc_stderr": 0.03784271932887467, + "acc_norm": 0.4393063583815029, + "acc_norm_stderr": 0.03784271932887467 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.29411764705882354, + "acc_stderr": 0.04533838195929775, + "acc_norm": 0.29411764705882354, + "acc_norm_stderr": 0.04533838195929775 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.68, + "acc_stderr": 0.04688261722621504, + "acc_norm": 0.68, + "acc_norm_stderr": 0.04688261722621504 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.4, + "acc_stderr": 0.03202563076101737, + "acc_norm": 0.4, + "acc_norm_stderr": 0.03202563076101737 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.32456140350877194, + "acc_stderr": 0.04404556157374767, + "acc_norm": 0.32456140350877194, + "acc_norm_stderr": 0.04404556157374767 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.4896551724137931, + "acc_stderr": 0.04165774775728763, + "acc_norm": 0.4896551724137931, + "acc_norm_stderr": 0.04165774775728763 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.30423280423280424, + "acc_stderr": 0.023695415009463087, + "acc_norm": 0.30423280423280424, + "acc_norm_stderr": 0.023695415009463087 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.3888888888888889, + "acc_stderr": 0.04360314860077459, + "acc_norm": 0.3888888888888889, + "acc_norm_stderr": 0.04360314860077459 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.32, + "acc_stderr": 0.04688261722621504, + "acc_norm": 0.32, + "acc_norm_stderr": 0.04688261722621504 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.5967741935483871, + "acc_stderr": 0.02790615082604114, + "acc_norm": 0.5967741935483871, + "acc_norm_stderr": 0.02790615082604114 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.3842364532019704, + "acc_stderr": 0.0342239856565755, + "acc_norm": 0.3842364532019704, + "acc_norm_stderr": 0.0342239856565755 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.54, + "acc_stderr": 0.05009082659620332, + "acc_norm": 0.54, + "acc_norm_stderr": 0.05009082659620332 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.6545454545454545, + "acc_stderr": 0.03713158067481913, + "acc_norm": 0.6545454545454545, + "acc_norm_stderr": 0.03713158067481913 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.6565656565656566, + "acc_stderr": 0.03383201223244441, + "acc_norm": 0.6565656565656566, + "acc_norm_stderr": 0.03383201223244441 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.7098445595854922, + "acc_stderr": 0.032752644677915166, + "acc_norm": 0.7098445595854922, + "acc_norm_stderr": 0.032752644677915166 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.4666666666666667, + "acc_stderr": 0.02529460802398648, + "acc_norm": 0.4666666666666667, + "acc_norm_stderr": 0.02529460802398648 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.24814814814814815, + "acc_stderr": 0.0263357394040558, + "acc_norm": 
0.24814814814814815, + "acc_norm_stderr": 0.0263357394040558 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.4957983193277311, + "acc_stderr": 0.03247734334448111, + "acc_norm": 0.4957983193277311, + "acc_norm_stderr": 0.03247734334448111 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.33774834437086093, + "acc_stderr": 0.038615575462551684, + "acc_norm": 0.33774834437086093, + "acc_norm_stderr": 0.038615575462551684 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.6697247706422018, + "acc_stderr": 0.020164466336342973, + "acc_norm": 0.6697247706422018, + "acc_norm_stderr": 0.020164466336342973 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.3287037037037037, + "acc_stderr": 0.03203614084670058, + "acc_norm": 0.3287037037037037, + "acc_norm_stderr": 0.03203614084670058 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.6813725490196079, + "acc_stderr": 0.0327028718148208, + "acc_norm": 0.6813725490196079, + "acc_norm_stderr": 0.0327028718148208 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.7046413502109705, + "acc_stderr": 0.02969633871342288, + "acc_norm": 0.7046413502109705, + "acc_norm_stderr": 0.02969633871342288 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.5739910313901345, + "acc_stderr": 0.03318833286217281, + "acc_norm": 0.5739910313901345, + "acc_norm_stderr": 0.03318833286217281 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.6412213740458015, + "acc_stderr": 0.04206739313864908, + "acc_norm": 0.6412213740458015, + "acc_norm_stderr": 0.04206739313864908 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.7107438016528925, + "acc_stderr": 0.04139112727635464, + "acc_norm": 0.7107438016528925, + "acc_norm_stderr": 0.04139112727635464 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.6111111111111112, + "acc_stderr": 0.0471282125742677, + "acc_norm": 0.6111111111111112, + "acc_norm_stderr": 0.0471282125742677 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.6257668711656442, + "acc_stderr": 0.03802068102899615, + "acc_norm": 0.6257668711656442, + "acc_norm_stderr": 0.03802068102899615 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.32142857142857145, + "acc_stderr": 0.04432804055291519, + "acc_norm": 0.32142857142857145, + "acc_norm_stderr": 0.04432804055291519 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.6699029126213593, + "acc_stderr": 0.0465614711001235, + "acc_norm": 0.6699029126213593, + "acc_norm_stderr": 0.0465614711001235 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.7692307692307693, + "acc_stderr": 0.027601921381417597, + "acc_norm": 0.7692307692307693, + "acc_norm_stderr": 0.027601921381417597 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.49, + "acc_stderr": 0.05024183937956911, + "acc_norm": 0.49, + "acc_norm_stderr": 0.05024183937956911 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.7049808429118773, + "acc_stderr": 0.016308363772932724, + "acc_norm": 0.7049808429118773, + "acc_norm_stderr": 0.016308363772932724 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.5751445086705202, + "acc_stderr": 0.026613350840261743, + "acc_norm": 0.5751445086705202, + "acc_norm_stderr": 0.026613350840261743 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.32737430167597764, + "acc_stderr": 0.015694238967737386, + "acc_norm": 0.32737430167597764, + "acc_norm_stderr": 0.015694238967737386 + 
}, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.6143790849673203, + "acc_stderr": 0.02787074527829027, + "acc_norm": 0.6143790849673203, + "acc_norm_stderr": 0.02787074527829027 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.5755627009646302, + "acc_stderr": 0.028071928247946205, + "acc_norm": 0.5755627009646302, + "acc_norm_stderr": 0.028071928247946205 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.595679012345679, + "acc_stderr": 0.02730662529732769, + "acc_norm": 0.595679012345679, + "acc_norm_stderr": 0.02730662529732769 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.40425531914893614, + "acc_stderr": 0.029275532159704725, + "acc_norm": 0.40425531914893614, + "acc_norm_stderr": 0.029275532159704725 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.4132985658409387, + "acc_stderr": 0.012576779494860083, + "acc_norm": 0.4132985658409387, + "acc_norm_stderr": 0.012576779494860083 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.5110294117647058, + "acc_stderr": 0.030365446477275668, + "acc_norm": 0.5110294117647058, + "acc_norm_stderr": 0.030365446477275668 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.5506535947712419, + "acc_stderr": 0.020123766528027266, + "acc_norm": 0.5506535947712419, + "acc_norm_stderr": 0.020123766528027266 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.5727272727272728, + "acc_stderr": 0.04738198703545483, + "acc_norm": 0.5727272727272728, + "acc_norm_stderr": 0.04738198703545483 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.563265306122449, + "acc_stderr": 0.031751952375833226, + "acc_norm": 0.563265306122449, + "acc_norm_stderr": 0.031751952375833226 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.7313432835820896, + "acc_stderr": 0.031343283582089536, + "acc_norm": 0.7313432835820896, + "acc_norm_stderr": 0.031343283582089536 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.79, + "acc_stderr": 0.040936018074033256, + "acc_norm": 0.79, + "acc_norm_stderr": 0.040936018074033256 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.43373493975903615, + "acc_stderr": 0.03858158940685517, + "acc_norm": 0.43373493975903615, + "acc_norm_stderr": 0.03858158940685517 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.7309941520467836, + "acc_stderr": 0.03401052620104089, + "acc_norm": 0.7309941520467836, + "acc_norm_stderr": 0.03401052620104089 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.35862913096695226, + "mc1_stderr": 0.016789289499502025, + "mc2": 0.5156080724492841, + "mc2_stderr": 0.01519672450707773 + }, + "all": { + "acc": 0.5206534097013187, + "acc_stderr": 0.034922154994422465, + "acc_norm": 0.524301686752593, + "acc_norm_stderr": 0.03490578533451648, + "mc1": 0.35862913096695226, + "mc1_stderr": 0.016789289499502025, + "mc2": 0.5156080724492841, + "mc2_stderr": 0.01519672450707773 + } + }, + "versions": { + "harness|arc:challenge|25": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + 
"harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "all": 0 + }, + "config": { + "model_name": "TheBloke/UltraLM-13B-fp16", + "model_sha": "734f5641f6c548474517d1536c46024517f120e0", + "model_dtype": "torch.float16", + "lighteval_sha": "43cff840721bd0214adb4e29236a5e2ca1813937", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null + }, + "task_config": { + "harness|arc:challenge": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + 
"harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task" + }, + "hashes": { + "harness|arc:challenge|25": { + "hash_examples": "fb8c51b1872daeda", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "61571bf68d6d89aa", + "hash_cont_tokens": "8210decc6ff6f7df" + }, + "harness|hellaswag|10": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "29906669b1c7054a", + "hash_cont_tokens": "b3b9e9017afa63af" + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": 
"2f776a367d23aea2", + "hash_input_tokens": "c54ff61ad0273dd7", + "hash_cont_tokens": "50421e30bef398f9" + }, + "harness|hendrycksTest-anatomy|5": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "be31a1e22aef5f90", + "hash_cont_tokens": "f11971a765cb609f" + }, + "harness|hendrycksTest-astronomy|5": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "277a7b1fad566940", + "hash_cont_tokens": "bf30e5d3f48250cb" + }, + "harness|hendrycksTest-business_ethics|5": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "ba552605bc116de5", + "hash_cont_tokens": "bc1dd9b2d995eb61" + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "428c7563d0b98ab9", + "hash_cont_tokens": "890a119624b3b935" + }, + "harness|hendrycksTest-college_biology|5": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "da036601573942e2", + "hash_cont_tokens": "875cde3af7a0ee14" + }, + "harness|hendrycksTest-college_chemistry|5": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "94e0196d6aded13d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "harness|hendrycksTest-college_computer_science|5": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "6e4d0f4a8d36690b", + "hash_cont_tokens": "ffc0fe414cdc4a83" + }, + "harness|hendrycksTest-college_mathematics|5": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "614054d17109a25d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "harness|hendrycksTest-college_medicine|5": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "1d633b3cc0524ba8", + "hash_cont_tokens": "1f88b00d41957d82" + }, + "harness|hendrycksTest-college_physics|5": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "5421d9a1af86cbd4", + "hash_cont_tokens": "f7b8097afc16a47c" + }, + "harness|hendrycksTest-computer_security|5": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "5e6b70ecb333cf18", + "hash_cont_tokens": "50421e30bef398f9" + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "c2ef11a87264ceed", + "hash_cont_tokens": "aa0e8bc655f2f641" + }, + "harness|hendrycksTest-econometrics|5": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "ecaccd912a4c3978", + "hash_cont_tokens": "bfb7e3c3c88313f1" + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "1590c84291399be8", + "hash_cont_tokens": "2425a3f084a591ef" + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "3269597f715b0da1", + "hash_cont_tokens": "f52691aef15a407b" + }, + "harness|hendrycksTest-formal_logic|5": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": 
"a2800d20f3ab8d7c", + "hash_cont_tokens": "f515d598d9c21263" + }, + "harness|hendrycksTest-global_facts|5": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "94ed44b3772505ad", + "hash_cont_tokens": "50421e30bef398f9" + }, + "harness|hendrycksTest-high_school_biology|5": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "24423acb928db768", + "hash_cont_tokens": "bd85a4156a3613ee" + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "831ff35c474e5cef", + "hash_cont_tokens": "a95c97af1c14e068" + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "8c34e0f2bda77358", + "hash_cont_tokens": "8abfedef914e33c9" + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "f1f73dd687da18d7", + "hash_cont_tokens": "674fc454bdc5ac93" + }, + "harness|hendrycksTest-high_school_geography|5": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "7c5547c7da5bc793", + "hash_cont_tokens": "03a5012b916274ea" + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "f62991cb6a496b05", + "hash_cont_tokens": "a83effb8f76b7d7c" + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "4cef2aff6e3d59ed", + "hash_cont_tokens": "c583432ad27fcfe0" + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "6e2577ea4082ed2b", + "hash_cont_tokens": "24f5dc613660300b" + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "c5fc9aeb1079c8e4", + "hash_cont_tokens": "f47f041de50333b9" + }, + "harness|hendrycksTest-high_school_physics|5": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "555fc385cffa84ca", + "hash_cont_tokens": "ba2efcd283e938cc" + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "febd23cbf9973b7f", + "hash_cont_tokens": "942069cd363844d9" + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "424b02981230ee83", + "hash_cont_tokens": "955ed42b6f7fa019" + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "50c9ff438c85a69e", + "hash_cont_tokens": "cdd0b3dc06d933e5" + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "054824cc474caef5", + "hash_cont_tokens": "9a864184946033ac" + }, + "harness|hendrycksTest-human_aging|5": { + "hash_examples": "1a7199dc733e779b", + 
"hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "541a75f071dcf579", + "hash_cont_tokens": "142a4a8a1138a214" + }, + "harness|hendrycksTest-human_sexuality|5": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "04269e5c5a257dd9", + "hash_cont_tokens": "bc54813e809b796d" + }, + "harness|hendrycksTest-international_law|5": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "d93ba9d9d38e4397", + "hash_cont_tokens": "dc45b45fcda18e5d" + }, + "harness|hendrycksTest-jurisprudence|5": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "9eeaccd2698b4f5a", + "hash_cont_tokens": "e3a8cd951b6e3469" + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "b4f08f544f2b7576", + "hash_cont_tokens": "1e80dbd30f6453d5" + }, + "harness|hendrycksTest-machine_learning|5": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "900c2a51f1174b9f", + "hash_cont_tokens": "9b37da7777378ca9" + }, + "harness|hendrycksTest-management|5": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "6b36efb4689c6eca", + "hash_cont_tokens": "a01d6d39a83c4597" + }, + "harness|hendrycksTest-marketing|5": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "2aaac78a0cfed47a", + "hash_cont_tokens": "6aeaed4d823c98aa" + }, + "harness|hendrycksTest-medical_genetics|5": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "886ca823b41c094a", + "hash_cont_tokens": "50421e30bef398f9" + }, + "harness|hendrycksTest-miscellaneous|5": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "72fd71de7675e7d0", + "hash_cont_tokens": "9b0ab02a64603081" + }, + "harness|hendrycksTest-moral_disputes|5": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "f3ca0dd8e7a1eb09", + "hash_cont_tokens": "8badf768f7b0467a" + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "3e793631e951f23c", + "hash_cont_tokens": "32ae620376b2bbba" + }, + "harness|hendrycksTest-nutrition|5": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "59753c2144ea93af", + "hash_cont_tokens": "3071def75bacc404" + }, + "harness|hendrycksTest-philosophy|5": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "bd8d3dbed15a8c34", + "hash_cont_tokens": "9f6ff69d23a48783" + }, + "harness|hendrycksTest-prehistory|5": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "3573cd87facbb7c5", + "hash_cont_tokens": "de469d2b981e32a3" + }, + "harness|hendrycksTest-professional_accounting|5": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "17e721bc1a7cbb47", + "hash_cont_tokens": "c46f74d2dfc7b13b" + }, + "harness|hendrycksTest-professional_law|5": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "9178e10bd0763ec4", 
+ "hash_cont_tokens": "2e590029ef41fbcd" + }, + "harness|hendrycksTest-professional_medicine|5": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "f5a22012a54f70ea", + "hash_cont_tokens": "fe35cfa9c6ca802e" + }, + "harness|hendrycksTest-professional_psychology|5": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "0dfb73a8eb3f692c", + "hash_cont_tokens": "f020fbddf72c8652" + }, + "harness|hendrycksTest-public_relations|5": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "1710c6ba4c9f3cbd", + "hash_cont_tokens": "568f585a259965c1" + }, + "harness|hendrycksTest-security_studies|5": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "d49711415961ced7", + "hash_cont_tokens": "cc6fd7cccd64cd5d" + }, + "harness|hendrycksTest-sociology|5": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "828999f7624cbe7e", + "hash_cont_tokens": "c3a3bdfd177eed5b" + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "42054621e718dbee", + "hash_cont_tokens": "2568d0e8e36fa959" + }, + "harness|hendrycksTest-virology|5": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "6c4f0aa4dc859c04", + "hash_cont_tokens": "926cf60b0891f374" + }, + "harness|hendrycksTest-world_religions|5": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "6c75d44e092ff24f", + "hash_cont_tokens": "c525a5de974c1ea3" + }, + "harness|truthfulqa:mc|0": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "2738d7ed7075faa7", + "hash_cont_tokens": "c014154380b74b9e" + } + } +} \ No newline at end of file diff --git a/eval-results/TheBloke/UltraLM-13B-fp16/results_2023-10-22T20-20-20.923100.json b/eval-results/TheBloke/UltraLM-13B-fp16/results_2023-10-22T20-20-20.923100.json new file mode 100644 index 0000000000000000000000000000000000000000..ce9d712533e295e4750d638ee90fa9b71359d9f1 --- /dev/null +++ b/eval-results/TheBloke/UltraLM-13B-fp16/results_2023-10-22T20-20-20.923100.json @@ -0,0 +1,107 @@ +{ + "config_general": { + "model_name": "TheBloke/UltraLM-13B-fp16", + "model_sha": "734f5641f6c548474517d1536c46024517f120e0", + "model_size": "24.28 GB", + "model_dtype": "torch.float16", + "lighteval_sha": "0f318ecf002208468154899217b3ba7c6ae09374", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|drop|3": { + "em": 0.01363255033557047, + "em_stderr": 0.0011875381552413013, + "f1": 0.08585046140939587, + "f1_stderr": 0.0018748006407108256 + }, + "harness|gsm8k|5": { + "acc": 0.1068991660348749, + "acc_stderr": 0.008510982565520497 + }, + "harness|winogrande|5": { + "acc": 0.7584846093133386, + "acc_stderr": 0.012028983782011875 + }, + "all": { + "em": 0.01363255033557047, + "em_stderr": 0.0011875381552413013, + "f1": 0.08585046140939587, + "f1_stderr": 0.0018748006407108256, + "acc": 0.43269188767410677, + "acc_stderr": 0.010269983173766185 + } + }, + "versions": { + "harness|drop|3": 1, + "harness|gsm8k|5": 0, + "harness|winogrande|5": 0, + "all": 0 + }, + "config_tasks": { + "harness|drop": "LM 
Harness task", + "harness|gsm8k": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|drop|3": { + "hashes": { + "hash_examples": "1d27416e8324e9a3", + "hash_full_prompts": "a5513ff9a741b385", + "hash_input_tokens": "61b608e0b5ceed76", + "hash_cont_tokens": "acc53ff88a5383c3" + }, + "truncated": 1263, + "non-truncated": 8273, + "padded": 0, + "non-padded": 9536, + "effective_few_shots": 3.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "bda342e47b5099b2", + "hash_cont_tokens": "c5555e98e9585ba5" + }, + "truncated": 0, + "non-truncated": 1319, + "padded": 0, + "non-padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "c0bedf98cb040854", + "hash_cont_tokens": "f08975ad6f2d5864" + }, + "truncated": 0, + "non-truncated": 2534, + "padded": 2432, + "non-padded": 102, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "9b4d8993161e637d", + "hash_full_prompts": "08215e527b7e60a5", + "hash_input_tokens": "80afe720f936f8d2", + "hash_cont_tokens": "0f4b7de981b6482e" + }, + "total_evaluation_time_secondes": "13364.841675043106", + "truncated": 1263, + "non-truncated": 12126, + "padded": 2432, + "non-padded": 10957, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/TheBloke/VicUnlocked-30B-LoRA-HF/results_2023-08-15T22-10-44.824822.json b/eval-results/TheBloke/VicUnlocked-30B-LoRA-HF/results_2023-08-15T22-10-44.824822.json new file mode 100644 index 0000000000000000000000000000000000000000..d4c015ffd1a6133102e22ee7579c6b21aa806d33 --- /dev/null +++ b/eval-results/TheBloke/VicUnlocked-30B-LoRA-HF/results_2023-08-15T22-10-44.824822.json @@ -0,0 +1,1365 @@ +{ + "results": { + "harness|arc:challenge|25": { + "acc": 0.5742320819112628, + "acc_stderr": 0.01444946427886881, + "acc_norm": 0.5972696245733788, + "acc_norm_stderr": 0.014332236306790145 + }, + "harness|hellaswag|10": { + "acc": 0.6362278430591516, + "acc_stderr": 0.004801009657690439, + "acc_norm": 0.8401712806213901, + "acc_norm_stderr": 0.003656982165386173 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.34, + "acc_stderr": 0.04760952285695236, + "acc_norm": 0.34, + "acc_norm_stderr": 0.04760952285695236 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.5111111111111111, + "acc_stderr": 0.04318275491977976, + "acc_norm": 0.5111111111111111, + "acc_norm_stderr": 0.04318275491977976 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.6118421052631579, + "acc_stderr": 0.03965842097512744, + "acc_norm": 0.6118421052631579, + "acc_norm_stderr": 0.03965842097512744 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.58, + "acc_stderr": 0.049604496374885836, + "acc_norm": 0.58, + "acc_norm_stderr": 0.049604496374885836 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.5584905660377358, + "acc_stderr": 0.030561590426731833, + "acc_norm": 0.5584905660377358, + "acc_norm_stderr": 0.030561590426731833 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.6111111111111112, + "acc_stderr": 0.04076663253918567, + "acc_norm": 0.6111111111111112, + "acc_norm_stderr": 0.04076663253918567 + }, + "harness|hendrycksTest-college_chemistry|5": 
{ + "acc": 0.37, + "acc_stderr": 0.04852365870939099, + "acc_norm": 0.37, + "acc_norm_stderr": 0.04852365870939099 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.51, + "acc_stderr": 0.05024183937956912, + "acc_norm": 0.51, + "acc_norm_stderr": 0.05024183937956912 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.35, + "acc_stderr": 0.0479372485441102, + "acc_norm": 0.35, + "acc_norm_stderr": 0.0479372485441102 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.5202312138728323, + "acc_stderr": 0.03809342081273957, + "acc_norm": 0.5202312138728323, + "acc_norm_stderr": 0.03809342081273957 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.2647058823529412, + "acc_stderr": 0.043898699568087785, + "acc_norm": 0.2647058823529412, + "acc_norm_stderr": 0.043898699568087785 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.7, + "acc_stderr": 0.046056618647183814, + "acc_norm": 0.7, + "acc_norm_stderr": 0.046056618647183814 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.4808510638297872, + "acc_stderr": 0.03266204299064678, + "acc_norm": 0.4808510638297872, + "acc_norm_stderr": 0.03266204299064678 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.35964912280701755, + "acc_stderr": 0.045144961328736334, + "acc_norm": 0.35964912280701755, + "acc_norm_stderr": 0.045144961328736334 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.4827586206896552, + "acc_stderr": 0.04164188720169377, + "acc_norm": 0.4827586206896552, + "acc_norm_stderr": 0.04164188720169377 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.3439153439153439, + "acc_stderr": 0.024464426625596437, + "acc_norm": 0.3439153439153439, + "acc_norm_stderr": 0.024464426625596437 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.3492063492063492, + "acc_stderr": 0.04263906892795132, + "acc_norm": 0.3492063492063492, + "acc_norm_stderr": 0.04263906892795132 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.34, + "acc_stderr": 0.04760952285695235, + "acc_norm": 0.34, + "acc_norm_stderr": 0.04760952285695235 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.6741935483870968, + "acc_stderr": 0.0266620105785671, + "acc_norm": 0.6741935483870968, + "acc_norm_stderr": 0.0266620105785671 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.3842364532019704, + "acc_stderr": 0.03422398565657551, + "acc_norm": 0.3842364532019704, + "acc_norm_stderr": 0.03422398565657551 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.64, + "acc_stderr": 0.048241815132442176, + "acc_norm": 0.64, + "acc_norm_stderr": 0.048241815132442176 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.7636363636363637, + "acc_stderr": 0.033175059300091826, + "acc_norm": 0.7636363636363637, + "acc_norm_stderr": 0.033175059300091826 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.696969696969697, + "acc_stderr": 0.032742879140268674, + "acc_norm": 0.696969696969697, + "acc_norm_stderr": 0.032742879140268674 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.8290155440414507, + "acc_stderr": 0.027171213683164535, + "acc_norm": 0.8290155440414507, + "acc_norm_stderr": 0.027171213683164535 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.5461538461538461, + "acc_stderr": 0.025242770987126184, + "acc_norm": 0.5461538461538461, + "acc_norm_stderr": 
0.025242770987126184 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.2777777777777778, + "acc_stderr": 0.02730914058823018, + "acc_norm": 0.2777777777777778, + "acc_norm_stderr": 0.02730914058823018 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.5840336134453782, + "acc_stderr": 0.03201650100739611, + "acc_norm": 0.5840336134453782, + "acc_norm_stderr": 0.03201650100739611 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.3576158940397351, + "acc_stderr": 0.03913453431177258, + "acc_norm": 0.3576158940397351, + "acc_norm_stderr": 0.03913453431177258 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.7614678899082569, + "acc_stderr": 0.018272575810231874, + "acc_norm": 0.7614678899082569, + "acc_norm_stderr": 0.018272575810231874 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.4398148148148148, + "acc_stderr": 0.03385177976044812, + "acc_norm": 0.4398148148148148, + "acc_norm_stderr": 0.03385177976044812 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.7794117647058824, + "acc_stderr": 0.029102254389674082, + "acc_norm": 0.7794117647058824, + "acc_norm_stderr": 0.029102254389674082 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.8059071729957806, + "acc_stderr": 0.025744902532290916, + "acc_norm": 0.8059071729957806, + "acc_norm_stderr": 0.025744902532290916 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.6591928251121076, + "acc_stderr": 0.03181149747055361, + "acc_norm": 0.6591928251121076, + "acc_norm_stderr": 0.03181149747055361 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.6335877862595419, + "acc_stderr": 0.04225875451969638, + "acc_norm": 0.6335877862595419, + "acc_norm_stderr": 0.04225875451969638 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.7520661157024794, + "acc_stderr": 0.03941897526516304, + "acc_norm": 0.7520661157024794, + "acc_norm_stderr": 0.03941897526516304 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.7037037037037037, + "acc_stderr": 0.04414343666854932, + "acc_norm": 0.7037037037037037, + "acc_norm_stderr": 0.04414343666854932 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.6809815950920245, + "acc_stderr": 0.03661997551073836, + "acc_norm": 0.6809815950920245, + "acc_norm_stderr": 0.03661997551073836 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.39285714285714285, + "acc_stderr": 0.04635550135609976, + "acc_norm": 0.39285714285714285, + "acc_norm_stderr": 0.04635550135609976 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.7184466019417476, + "acc_stderr": 0.04453254836326466, + "acc_norm": 0.7184466019417476, + "acc_norm_stderr": 0.04453254836326466 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.8632478632478633, + "acc_stderr": 0.022509033937077795, + "acc_norm": 0.8632478632478633, + "acc_norm_stderr": 0.022509033937077795 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.65, + "acc_stderr": 0.0479372485441102, + "acc_norm": 0.65, + "acc_norm_stderr": 0.0479372485441102 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.7739463601532567, + "acc_stderr": 0.014957458504335833, + "acc_norm": 0.7739463601532567, + "acc_norm_stderr": 0.014957458504335833 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.6589595375722543, + "acc_stderr": 0.02552247463212161, + "acc_norm": 0.6589595375722543, + "acc_norm_stderr": 0.02552247463212161 + }, + 
"harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.3743016759776536, + "acc_stderr": 0.01618544417945717, + "acc_norm": 0.3743016759776536, + "acc_norm_stderr": 0.01618544417945717 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.6143790849673203, + "acc_stderr": 0.02787074527829027, + "acc_norm": 0.6143790849673203, + "acc_norm_stderr": 0.02787074527829027 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.6913183279742765, + "acc_stderr": 0.02623696588115327, + "acc_norm": 0.6913183279742765, + "acc_norm_stderr": 0.02623696588115327 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.6635802469135802, + "acc_stderr": 0.026289734945952926, + "acc_norm": 0.6635802469135802, + "acc_norm_stderr": 0.026289734945952926 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.43617021276595747, + "acc_stderr": 0.02958345203628407, + "acc_norm": 0.43617021276595747, + "acc_norm_stderr": 0.02958345203628407 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.46284224250325945, + "acc_stderr": 0.012734923579532069, + "acc_norm": 0.46284224250325945, + "acc_norm_stderr": 0.012734923579532069 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.5367647058823529, + "acc_stderr": 0.03029061918048569, + "acc_norm": 0.5367647058823529, + "acc_norm_stderr": 0.03029061918048569 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.6127450980392157, + "acc_stderr": 0.019706875804085627, + "acc_norm": 0.6127450980392157, + "acc_norm_stderr": 0.019706875804085627 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.6363636363636364, + "acc_stderr": 0.046075820907199756, + "acc_norm": 0.6363636363636364, + "acc_norm_stderr": 0.046075820907199756 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.6530612244897959, + "acc_stderr": 0.0304725260267265, + "acc_norm": 0.6530612244897959, + "acc_norm_stderr": 0.0304725260267265 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.7960199004975125, + "acc_stderr": 0.02849317624532607, + "acc_norm": 0.7960199004975125, + "acc_norm_stderr": 0.02849317624532607 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.85, + "acc_stderr": 0.0358870281282637, + "acc_norm": 0.85, + "acc_norm_stderr": 0.0358870281282637 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.5120481927710844, + "acc_stderr": 0.03891364495835817, + "acc_norm": 0.5120481927710844, + "acc_norm_stderr": 0.03891364495835817 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.8011695906432749, + "acc_stderr": 0.03061111655743253, + "acc_norm": 0.8011695906432749, + "acc_norm_stderr": 0.03061111655743253 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.31334149326805383, + "mc1_stderr": 0.0162380650690596, + "mc2": 0.4854286644286285, + "mc2_stderr": 0.01461142776118866 + }, + "all": { + "acc": 0.5790223807911279, + "acc_stderr": 0.0340314862539054, + "acc_norm": 0.5828695160491679, + "acc_norm_stderr": 0.034010109042644676, + "mc1": 0.31334149326805383, + "mc1_stderr": 0.0162380650690596, + "mc2": 0.4854286644286285, + "mc2_stderr": 0.01461142776118866 + } + }, + "versions": { + "harness|arc:challenge|25": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + 
"harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "all": 0 + }, + "config_general": { + "model_name": "TheBloke/VicUnlocked-30B-LoRA-HF", + "model_sha": "3259cb3c2a10cfb429fb51c4a76fffa049f4c44d", + "model_dtype": "torch.float16", + "lighteval_sha": "efe93333f9f25e7d48cc67a6bf362e6d576f727b", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null + }, + "config_tasks": { + "harness|arc:challenge": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + 
"harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task" + }, + "summary_tasks": { + "harness|arc:challenge|25": { + "hashes": { + "hash_examples": "17b0cae357c0259e", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "61571bf68d6d89aa", + "hash_cont_tokens": "8210decc6ff6f7df" + }, + "truncated": 0, + "non-truncated": 4687, + "padded": 4687, + "non-padded": 0, + "effective_few_shots": 
25.0, + "num_truncated_few_shots": 0 + }, + "harness|hellaswag|10": { + "hashes": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "29906669b1c7054a", + "hash_cont_tokens": "b3b9e9017afa63af" + }, + "truncated": 0, + "non-truncated": 40168, + "padded": 40113, + "non-padded": 55, + "effective_few_shots": 10.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hashes": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "c54ff61ad0273dd7", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-anatomy|5": { + "hashes": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "be31a1e22aef5f90", + "hash_cont_tokens": "f11971a765cb609f" + }, + "truncated": 0, + "non-truncated": 540, + "padded": 540, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-astronomy|5": { + "hashes": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "277a7b1fad566940", + "hash_cont_tokens": "bf30e5d3f48250cb" + }, + "truncated": 0, + "non-truncated": 608, + "padded": 608, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-business_ethics|5": { + "hashes": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "ba552605bc116de5", + "hash_cont_tokens": "bc1dd9b2d995eb61" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hashes": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "428c7563d0b98ab9", + "hash_cont_tokens": "890a119624b3b935" + }, + "truncated": 0, + "non-truncated": 1060, + "padded": 1060, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_biology|5": { + "hashes": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "da036601573942e2", + "hash_cont_tokens": "875cde3af7a0ee14" + }, + "truncated": 0, + "non-truncated": 576, + "padded": 576, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_chemistry|5": { + "hashes": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "94e0196d6aded13d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_computer_science|5": { + "hashes": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "6e4d0f4a8d36690b", + "hash_cont_tokens": "ffc0fe414cdc4a83" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_mathematics|5": { + "hashes": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": 
"770bc4281c973190", + "hash_input_tokens": "614054d17109a25d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_medicine|5": { + "hashes": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "1d633b3cc0524ba8", + "hash_cont_tokens": "1f88b00d41957d82" + }, + "truncated": 0, + "non-truncated": 692, + "padded": 692, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_physics|5": { + "hashes": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "5421d9a1af86cbd4", + "hash_cont_tokens": "f7b8097afc16a47c" + }, + "truncated": 0, + "non-truncated": 408, + "padded": 408, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-computer_security|5": { + "hashes": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "5e6b70ecb333cf18", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hashes": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "c2ef11a87264ceed", + "hash_cont_tokens": "aa0e8bc655f2f641" + }, + "truncated": 0, + "non-truncated": 940, + "padded": 940, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-econometrics|5": { + "hashes": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "ecaccd912a4c3978", + "hash_cont_tokens": "bfb7e3c3c88313f1" + }, + "truncated": 0, + "non-truncated": 456, + "padded": 456, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hashes": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "1590c84291399be8", + "hash_cont_tokens": "2425a3f084a591ef" + }, + "truncated": 0, + "non-truncated": 580, + "padded": 580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hashes": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "3269597f715b0da1", + "hash_cont_tokens": "f52691aef15a407b" + }, + "truncated": 0, + "non-truncated": 1512, + "padded": 1512, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-formal_logic|5": { + "hashes": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "a2800d20f3ab8d7c", + "hash_cont_tokens": "f515d598d9c21263" + }, + "truncated": 0, + "non-truncated": 504, + "padded": 504, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-global_facts|5": { + "hashes": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "94ed44b3772505ad", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + 
"padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_biology|5": { + "hashes": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "24423acb928db768", + "hash_cont_tokens": "bd85a4156a3613ee" + }, + "truncated": 0, + "non-truncated": 1240, + "padded": 1240, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hashes": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "831ff35c474e5cef", + "hash_cont_tokens": "a95c97af1c14e068" + }, + "truncated": 0, + "non-truncated": 812, + "padded": 812, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hashes": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "8c34e0f2bda77358", + "hash_cont_tokens": "8abfedef914e33c9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hashes": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "f1f73dd687da18d7", + "hash_cont_tokens": "674fc454bdc5ac93" + }, + "truncated": 660, + "non-truncated": 0, + "padded": 0, + "non-padded": 660, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_geography|5": { + "hashes": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "7c5547c7da5bc793", + "hash_cont_tokens": "03a5012b916274ea" + }, + "truncated": 0, + "non-truncated": 792, + "padded": 792, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hashes": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "f62991cb6a496b05", + "hash_cont_tokens": "a83effb8f76b7d7c" + }, + "truncated": 0, + "non-truncated": 772, + "padded": 772, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hashes": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "4cef2aff6e3d59ed", + "hash_cont_tokens": "c583432ad27fcfe0" + }, + "truncated": 0, + "non-truncated": 1560, + "padded": 1560, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hashes": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "6e2577ea4082ed2b", + "hash_cont_tokens": "24f5dc613660300b" + }, + "truncated": 0, + "non-truncated": 1080, + "padded": 1080, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hashes": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "c5fc9aeb1079c8e4", + "hash_cont_tokens": "f47f041de50333b9" + }, + "truncated": 0, + "non-truncated": 952, + "padded": 952, + "non-padded": 0, + "effective_few_shots": 5.0, + 
"num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_physics|5": { + "hashes": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "555fc385cffa84ca", + "hash_cont_tokens": "ba2efcd283e938cc" + }, + "truncated": 0, + "non-truncated": 604, + "padded": 604, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hashes": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "febd23cbf9973b7f", + "hash_cont_tokens": "942069cd363844d9" + }, + "truncated": 0, + "non-truncated": 2180, + "padded": 2180, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hashes": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "424b02981230ee83", + "hash_cont_tokens": "955ed42b6f7fa019" + }, + "truncated": 0, + "non-truncated": 864, + "padded": 864, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hashes": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "50c9ff438c85a69e", + "hash_cont_tokens": "cdd0b3dc06d933e5" + }, + "truncated": 816, + "non-truncated": 0, + "padded": 0, + "non-padded": 816, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hashes": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "054824cc474caef5", + "hash_cont_tokens": "9a864184946033ac" + }, + "truncated": 8, + "non-truncated": 940, + "padded": 940, + "non-padded": 8, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_aging|5": { + "hashes": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "541a75f071dcf579", + "hash_cont_tokens": "142a4a8a1138a214" + }, + "truncated": 0, + "non-truncated": 892, + "padded": 892, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_sexuality|5": { + "hashes": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "04269e5c5a257dd9", + "hash_cont_tokens": "bc54813e809b796d" + }, + "truncated": 0, + "non-truncated": 524, + "padded": 524, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-international_law|5": { + "hashes": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "d93ba9d9d38e4397", + "hash_cont_tokens": "dc45b45fcda18e5d" + }, + "truncated": 0, + "non-truncated": 484, + "padded": 484, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-jurisprudence|5": { + "hashes": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "9eeaccd2698b4f5a", + "hash_cont_tokens": "e3a8cd951b6e3469" + }, + "truncated": 0, + "non-truncated": 432, + "padded": 432, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hashes": { + "hash_examples": "709128f9926a634c", + 
"hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "b4f08f544f2b7576", + "hash_cont_tokens": "1e80dbd30f6453d5" + }, + "truncated": 0, + "non-truncated": 652, + "padded": 648, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-machine_learning|5": { + "hashes": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "900c2a51f1174b9f", + "hash_cont_tokens": "9b37da7777378ca9" + }, + "truncated": 0, + "non-truncated": 448, + "padded": 448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-management|5": { + "hashes": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "6b36efb4689c6eca", + "hash_cont_tokens": "a01d6d39a83c4597" + }, + "truncated": 0, + "non-truncated": 412, + "padded": 412, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-marketing|5": { + "hashes": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "2aaac78a0cfed47a", + "hash_cont_tokens": "6aeaed4d823c98aa" + }, + "truncated": 0, + "non-truncated": 936, + "padded": 936, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-medical_genetics|5": { + "hashes": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "886ca823b41c094a", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-miscellaneous|5": { + "hashes": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "72fd71de7675e7d0", + "hash_cont_tokens": "9b0ab02a64603081" + }, + "truncated": 0, + "non-truncated": 3132, + "padded": 3132, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_disputes|5": { + "hashes": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "f3ca0dd8e7a1eb09", + "hash_cont_tokens": "8badf768f7b0467a" + }, + "truncated": 0, + "non-truncated": 1384, + "padded": 1354, + "non-padded": 30, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hashes": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "3e793631e951f23c", + "hash_cont_tokens": "32ae620376b2bbba" + }, + "truncated": 0, + "non-truncated": 3580, + "padded": 3580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-nutrition|5": { + "hashes": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "59753c2144ea93af", + "hash_cont_tokens": "3071def75bacc404" + }, + "truncated": 0, + "non-truncated": 1224, + "padded": 1224, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-philosophy|5": { + "hashes": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "bd8d3dbed15a8c34", + "hash_cont_tokens": "9f6ff69d23a48783" + }, + "truncated": 0, + "non-truncated": 1244, + "padded": 
1244, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-prehistory|5": { + "hashes": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "3573cd87facbb7c5", + "hash_cont_tokens": "de469d2b981e32a3" + }, + "truncated": 0, + "non-truncated": 1296, + "padded": 1296, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_accounting|5": { + "hashes": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "17e721bc1a7cbb47", + "hash_cont_tokens": "c46f74d2dfc7b13b" + }, + "truncated": 0, + "non-truncated": 1128, + "padded": 1128, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_law|5": { + "hashes": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "9178e10bd0763ec4", + "hash_cont_tokens": "2e590029ef41fbcd" + }, + "truncated": 604, + "non-truncated": 5532, + "padded": 5524, + "non-padded": 612, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_medicine|5": { + "hashes": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "f5a22012a54f70ea", + "hash_cont_tokens": "fe35cfa9c6ca802e" + }, + "truncated": 0, + "non-truncated": 1088, + "padded": 1088, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_psychology|5": { + "hashes": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "0dfb73a8eb3f692c", + "hash_cont_tokens": "f020fbddf72c8652" + }, + "truncated": 0, + "non-truncated": 2448, + "padded": 2448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-public_relations|5": { + "hashes": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "1710c6ba4c9f3cbd", + "hash_cont_tokens": "568f585a259965c1" + }, + "truncated": 0, + "non-truncated": 440, + "padded": 440, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-security_studies|5": { + "hashes": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "d49711415961ced7", + "hash_cont_tokens": "cc6fd7cccd64cd5d" + }, + "truncated": 0, + "non-truncated": 980, + "padded": 980, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-sociology|5": { + "hashes": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "828999f7624cbe7e", + "hash_cont_tokens": "c3a3bdfd177eed5b" + }, + "truncated": 0, + "non-truncated": 804, + "padded": 804, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hashes": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "42054621e718dbee", + "hash_cont_tokens": "2568d0e8e36fa959" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-virology|5": { + 
"hashes": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "6c4f0aa4dc859c04", + "hash_cont_tokens": "926cf60b0891f374" + }, + "truncated": 0, + "non-truncated": 664, + "padded": 664, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-world_religions|5": { + "hashes": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "6c75d44e092ff24f", + "hash_cont_tokens": "c525a5de974c1ea3" + }, + "truncated": 0, + "non-truncated": 684, + "padded": 684, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|truthfulqa:mc|0": { + "hashes": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "2738d7ed7075faa7", + "hash_cont_tokens": "c014154380b74b9e" + }, + "truncated": 0, + "non-truncated": 9996, + "padded": 9996, + "non-padded": 0, + "effective_few_shots": 0.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "d84d18e9a963753d", + "hash_full_prompts": "12b540783521a8e6", + "hash_input_tokens": "6fecf578c508db6a", + "hash_cont_tokens": "fb1646e2bdd5fc38" + }, + "total_evaluation_time_secondes": "9149.398374557495", + "truncated": 2088, + "non-truncated": 108931, + "padded": 108834, + "non-padded": 2185, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/TheBloke/VicUnlocked-30B-LoRA-HF/results_2023-10-22T21-45-52.426808.json b/eval-results/TheBloke/VicUnlocked-30B-LoRA-HF/results_2023-10-22T21-45-52.426808.json new file mode 100644 index 0000000000000000000000000000000000000000..6995fead85ecf80cc2c032f52c213d852370bd91 --- /dev/null +++ b/eval-results/TheBloke/VicUnlocked-30B-LoRA-HF/results_2023-10-22T21-45-52.426808.json @@ -0,0 +1,107 @@ +{ + "config_general": { + "model_name": "TheBloke/VicUnlocked-30B-LoRA-HF", + "model_sha": "3259cb3c2a10cfb429fb51c4a76fffa049f4c44d", + "model_size": "60.65 GB", + "model_dtype": "torch.float16", + "lighteval_sha": "0f318ecf002208468154899217b3ba7c6ae09374", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|drop|3": { + "em": 0.001363255033557047, + "em_stderr": 0.0003778609196460696, + "f1": 0.0645071308724832, + "f1_stderr": 0.0013899526153663272 + }, + "harness|gsm8k|5": { + "acc": 0.14404852160727824, + "acc_stderr": 0.009672110973065282 + }, + "harness|winogrande|5": { + "acc": 0.7947908445146015, + "acc_stderr": 0.011350315707462056 + }, + "all": { + "em": 0.001363255033557047, + "em_stderr": 0.0003778609196460696, + "f1": 0.0645071308724832, + "f1_stderr": 0.0013899526153663272, + "acc": 0.46941968306093984, + "acc_stderr": 0.01051121334026367 + } + }, + "versions": { + "harness|drop|3": 1, + "harness|gsm8k|5": 0, + "harness|winogrande|5": 0, + "all": 0 + }, + "config_tasks": { + "harness|drop": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|drop|3": { + "hashes": { + "hash_examples": "1d27416e8324e9a3", + "hash_full_prompts": "a5513ff9a741b385", + "hash_input_tokens": "61b608e0b5ceed76", + "hash_cont_tokens": "9a3f97c5a2d50215" + }, + "truncated": 1263, + "non-truncated": 8273, + "padded": 0, + "non-padded": 9536, + "effective_few_shots": 3.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + 
"hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "bda342e47b5099b2", + "hash_cont_tokens": "dbce820b75d0f162" + }, + "truncated": 0, + "non-truncated": 1319, + "padded": 0, + "non-padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "c0bedf98cb040854", + "hash_cont_tokens": "f08975ad6f2d5864" + }, + "truncated": 0, + "non-truncated": 2534, + "padded": 2432, + "non-padded": 102, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "9b4d8993161e637d", + "hash_full_prompts": "08215e527b7e60a5", + "hash_input_tokens": "80afe720f936f8d2", + "hash_cont_tokens": "faf0ac43367a28be" + }, + "total_evaluation_time_secondes": "21724.582058906555", + "truncated": 1263, + "non-truncated": 12126, + "padded": 2432, + "non-padded": 10957, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/TheBloke/VicUnlocked-30B-LoRA-HF/results_2023-10-23T04-52-45.302158.json b/eval-results/TheBloke/VicUnlocked-30B-LoRA-HF/results_2023-10-23T04-52-45.302158.json new file mode 100644 index 0000000000000000000000000000000000000000..e79439704211171864dceb4046285357e1cb887a --- /dev/null +++ b/eval-results/TheBloke/VicUnlocked-30B-LoRA-HF/results_2023-10-23T04-52-45.302158.json @@ -0,0 +1,107 @@ +{ + "config_general": { + "model_name": "TheBloke/VicUnlocked-30B-LoRA-HF", + "model_sha": "3259cb3c2a10cfb429fb51c4a76fffa049f4c44d", + "model_size": "60.65 GB", + "model_dtype": "torch.float16", + "lighteval_sha": "0f318ecf002208468154899217b3ba7c6ae09374", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|drop|3": { + "em": 0.001363255033557047, + "em_stderr": 0.0003778609196460696, + "f1": 0.0645071308724832, + "f1_stderr": 0.0013899526153663272 + }, + "harness|gsm8k|5": { + "acc": 0.14404852160727824, + "acc_stderr": 0.009672110973065282 + }, + "harness|winogrande|5": { + "acc": 0.7947908445146015, + "acc_stderr": 0.011350315707462056 + }, + "all": { + "em": 0.001363255033557047, + "em_stderr": 0.0003778609196460696, + "f1": 0.0645071308724832, + "f1_stderr": 0.0013899526153663272, + "acc": 0.46941968306093984, + "acc_stderr": 0.01051121334026367 + } + }, + "versions": { + "harness|drop|3": 1, + "harness|gsm8k|5": 0, + "harness|winogrande|5": 0, + "all": 0 + }, + "config_tasks": { + "harness|drop": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|drop|3": { + "hashes": { + "hash_examples": "1d27416e8324e9a3", + "hash_full_prompts": "a5513ff9a741b385", + "hash_input_tokens": "61b608e0b5ceed76", + "hash_cont_tokens": "9a3f97c5a2d50215" + }, + "truncated": 1263, + "non-truncated": 8273, + "padded": 0, + "non-padded": 9536, + "effective_few_shots": 3.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "bda342e47b5099b2", + "hash_cont_tokens": "dbce820b75d0f162" + }, + "truncated": 0, + "non-truncated": 1319, + "padded": 0, + "non-padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": 
"aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "c0bedf98cb040854", + "hash_cont_tokens": "f08975ad6f2d5864" + }, + "truncated": 0, + "non-truncated": 2534, + "padded": 2432, + "non-padded": 102, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "9b4d8993161e637d", + "hash_full_prompts": "08215e527b7e60a5", + "hash_input_tokens": "80afe720f936f8d2", + "hash_cont_tokens": "faf0ac43367a28be" + }, + "total_evaluation_time_secondes": "21536.49699831009", + "truncated": 1263, + "non-truncated": 12126, + "padded": 2432, + "non-padded": 10957, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/TheBloke/VicUnlocked-alpaca-65B-QLoRA-fp16/results_2023-07-25T19-42-29.328886.json b/eval-results/TheBloke/VicUnlocked-alpaca-65B-QLoRA-fp16/results_2023-07-25T19-42-29.328886.json new file mode 100644 index 0000000000000000000000000000000000000000..1d72024f188b7a7ce750ab81cfab48a934e53472 --- /dev/null +++ b/eval-results/TheBloke/VicUnlocked-alpaca-65B-QLoRA-fp16/results_2023-07-25T19-42-29.328886.json @@ -0,0 +1,1365 @@ +{ + "results": { + "harness|arc:challenge|25": { + "acc": 0.6194539249146758, + "acc_stderr": 0.014188277712349812, + "acc_norm": 0.6561433447098977, + "acc_norm_stderr": 0.013880644570156215 + }, + "harness|hellaswag|10": { + "acc": 0.6573391754630552, + "acc_stderr": 0.004736292355716399, + "acc_norm": 0.8515236008763195, + "acc_norm_stderr": 0.0035484490542860114 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.31, + "acc_stderr": 0.04648231987117316, + "acc_norm": 0.31, + "acc_norm_stderr": 0.04648231987117316 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.5555555555555556, + "acc_stderr": 0.04292596718256981, + "acc_norm": 0.5555555555555556, + "acc_norm_stderr": 0.04292596718256981 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.7302631578947368, + "acc_stderr": 0.03611780560284898, + "acc_norm": 0.7302631578947368, + "acc_norm_stderr": 0.03611780560284898 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.62, + "acc_stderr": 0.048783173121456316, + "acc_norm": 0.62, + "acc_norm_stderr": 0.048783173121456316 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.660377358490566, + "acc_stderr": 0.029146904747798325, + "acc_norm": 0.660377358490566, + "acc_norm_stderr": 0.029146904747798325 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.7013888888888888, + "acc_stderr": 0.03827052357950756, + "acc_norm": 0.7013888888888888, + "acc_norm_stderr": 0.03827052357950756 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.48, + "acc_stderr": 0.050211673156867795, + "acc_norm": 0.48, + "acc_norm_stderr": 0.050211673156867795 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.52, + "acc_stderr": 0.050211673156867795, + "acc_norm": 0.52, + "acc_norm_stderr": 0.050211673156867795 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.34, + "acc_stderr": 0.04760952285695235, + "acc_norm": 0.34, + "acc_norm_stderr": 0.04760952285695235 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.5549132947976878, + "acc_stderr": 0.03789401760283647, + "acc_norm": 0.5549132947976878, + "acc_norm_stderr": 0.03789401760283647 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.39215686274509803, + "acc_stderr": 0.048580835742663454, + "acc_norm": 0.39215686274509803, + "acc_norm_stderr": 0.048580835742663454 + }, + 
"harness|hendrycksTest-computer_security|5": { + "acc": 0.78, + "acc_stderr": 0.04163331998932263, + "acc_norm": 0.78, + "acc_norm_stderr": 0.04163331998932263 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.5914893617021276, + "acc_stderr": 0.032134180267015755, + "acc_norm": 0.5914893617021276, + "acc_norm_stderr": 0.032134180267015755 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.39473684210526316, + "acc_stderr": 0.045981880578165414, + "acc_norm": 0.39473684210526316, + "acc_norm_stderr": 0.045981880578165414 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.5379310344827586, + "acc_stderr": 0.04154659671707548, + "acc_norm": 0.5379310344827586, + "acc_norm_stderr": 0.04154659671707548 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.37566137566137564, + "acc_stderr": 0.02494236893115978, + "acc_norm": 0.37566137566137564, + "acc_norm_stderr": 0.02494236893115978 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.4365079365079365, + "acc_stderr": 0.04435932892851466, + "acc_norm": 0.4365079365079365, + "acc_norm_stderr": 0.04435932892851466 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.38, + "acc_stderr": 0.048783173121456316, + "acc_norm": 0.38, + "acc_norm_stderr": 0.048783173121456316 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.7483870967741936, + "acc_stderr": 0.024685979286239963, + "acc_norm": 0.7483870967741936, + "acc_norm_stderr": 0.024685979286239963 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.4827586206896552, + "acc_stderr": 0.035158955511656986, + "acc_norm": 0.4827586206896552, + "acc_norm_stderr": 0.035158955511656986 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.65, + "acc_stderr": 0.04793724854411019, + "acc_norm": 0.65, + "acc_norm_stderr": 0.04793724854411019 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.7757575757575758, + "acc_stderr": 0.03256866661681102, + "acc_norm": 0.7757575757575758, + "acc_norm_stderr": 0.03256866661681102 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.8131313131313131, + "acc_stderr": 0.027772533334218957, + "acc_norm": 0.8131313131313131, + "acc_norm_stderr": 0.027772533334218957 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.8652849740932642, + "acc_stderr": 0.024639789097709447, + "acc_norm": 0.8652849740932642, + "acc_norm_stderr": 0.024639789097709447 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.6641025641025641, + "acc_stderr": 0.02394672474156397, + "acc_norm": 0.6641025641025641, + "acc_norm_stderr": 0.02394672474156397 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.3333333333333333, + "acc_stderr": 0.0287420409039485, + "acc_norm": 0.3333333333333333, + "acc_norm_stderr": 0.0287420409039485 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.6932773109243697, + "acc_stderr": 0.029953823891887048, + "acc_norm": 0.6932773109243697, + "acc_norm_stderr": 0.029953823891887048 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.3708609271523179, + "acc_stderr": 0.03943966699183629, + "acc_norm": 0.3708609271523179, + "acc_norm_stderr": 0.03943966699183629 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.8201834862385321, + "acc_stderr": 0.016465345467391534, + "acc_norm": 0.8201834862385321, + "acc_norm_stderr": 0.016465345467391534 + }, + 
"harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.5601851851851852, + "acc_stderr": 0.0338517797604481, + "acc_norm": 0.5601851851851852, + "acc_norm_stderr": 0.0338517797604481 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.8284313725490197, + "acc_stderr": 0.026460569561240634, + "acc_norm": 0.8284313725490197, + "acc_norm_stderr": 0.026460569561240634 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.8143459915611815, + "acc_stderr": 0.025310495376944856, + "acc_norm": 0.8143459915611815, + "acc_norm_stderr": 0.025310495376944856 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.6547085201793722, + "acc_stderr": 0.03191100192835795, + "acc_norm": 0.6547085201793722, + "acc_norm_stderr": 0.03191100192835795 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.7251908396946565, + "acc_stderr": 0.039153454088478354, + "acc_norm": 0.7251908396946565, + "acc_norm_stderr": 0.039153454088478354 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.7768595041322314, + "acc_stderr": 0.03800754475228733, + "acc_norm": 0.7768595041322314, + "acc_norm_stderr": 0.03800754475228733 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.7314814814814815, + "acc_stderr": 0.042844679680521934, + "acc_norm": 0.7314814814814815, + "acc_norm_stderr": 0.042844679680521934 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.7300613496932515, + "acc_stderr": 0.034878251684978906, + "acc_norm": 0.7300613496932515, + "acc_norm_stderr": 0.034878251684978906 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.4642857142857143, + "acc_stderr": 0.04733667890053756, + "acc_norm": 0.4642857142857143, + "acc_norm_stderr": 0.04733667890053756 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.8252427184466019, + "acc_stderr": 0.03760178006026621, + "acc_norm": 0.8252427184466019, + "acc_norm_stderr": 0.03760178006026621 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.8333333333333334, + "acc_stderr": 0.024414947304543674, + "acc_norm": 0.8333333333333334, + "acc_norm_stderr": 0.024414947304543674 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.65, + "acc_stderr": 0.04793724854411019, + "acc_norm": 0.65, + "acc_norm_stderr": 0.04793724854411019 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.8135376756066411, + "acc_stderr": 0.013927751372001505, + "acc_norm": 0.8135376756066411, + "acc_norm_stderr": 0.013927751372001505 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.7283236994219653, + "acc_stderr": 0.02394851290546835, + "acc_norm": 0.7283236994219653, + "acc_norm_stderr": 0.02394851290546835 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.4312849162011173, + "acc_stderr": 0.016563829399047703, + "acc_norm": 0.4312849162011173, + "acc_norm_stderr": 0.016563829399047703 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.7026143790849673, + "acc_stderr": 0.02617390850671858, + "acc_norm": 0.7026143790849673, + "acc_norm_stderr": 0.02617390850671858 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.707395498392283, + "acc_stderr": 0.025839898334877983, + "acc_norm": 0.707395498392283, + "acc_norm_stderr": 0.025839898334877983 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.7407407407407407, + "acc_stderr": 0.02438366553103546, + "acc_norm": 0.7407407407407407, + "acc_norm_stderr": 0.02438366553103546 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.5106382978723404, + "acc_stderr": 
0.02982074719142244, + "acc_norm": 0.5106382978723404, + "acc_norm_stderr": 0.02982074719142244 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.4745762711864407, + "acc_stderr": 0.012753716929101006, + "acc_norm": 0.4745762711864407, + "acc_norm_stderr": 0.012753716929101006 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.6323529411764706, + "acc_stderr": 0.02928941340940319, + "acc_norm": 0.6323529411764706, + "acc_norm_stderr": 0.02928941340940319 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.6388888888888888, + "acc_stderr": 0.01943177567703731, + "acc_norm": 0.6388888888888888, + "acc_norm_stderr": 0.01943177567703731 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.7272727272727273, + "acc_stderr": 0.04265792110940588, + "acc_norm": 0.7272727272727273, + "acc_norm_stderr": 0.04265792110940588 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.7387755102040816, + "acc_stderr": 0.028123429335142783, + "acc_norm": 0.7387755102040816, + "acc_norm_stderr": 0.028123429335142783 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.8258706467661692, + "acc_stderr": 0.026814951200421603, + "acc_norm": 0.8258706467661692, + "acc_norm_stderr": 0.026814951200421603 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.84, + "acc_stderr": 0.03684529491774709, + "acc_norm": 0.84, + "acc_norm_stderr": 0.03684529491774709 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.4879518072289157, + "acc_stderr": 0.0389136449583582, + "acc_norm": 0.4879518072289157, + "acc_norm_stderr": 0.0389136449583582 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.8128654970760234, + "acc_stderr": 0.02991312723236804, + "acc_norm": 0.8128654970760234, + "acc_norm_stderr": 0.02991312723236804 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.3684210526315789, + "mc1_stderr": 0.016886551261046042, + "mc2": 0.524653346988439, + "mc2_stderr": 0.014781307678554883 + }, + "all": { + "acc": 0.6315604657469263, + "acc_stderr": 0.03323662083494856, + "acc_norm": 0.6354735817674091, + "acc_norm_stderr": 0.033211273776582055, + "mc1": 0.3684210526315789, + "mc1_stderr": 0.016886551261046042, + "mc2": 0.524653346988439, + "mc2_stderr": 0.014781307678554883 + } + }, + "versions": { + "harness|arc:challenge|25": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + 
"harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "all": 0 + }, + "config_general": { + "model_name": "TheBloke/VicUnlocked-alpaca-65B-QLoRA-fp16", + "model_sha": "6cdacfda96970aa144e316b108ab9bc17c99a573", + "model_dtype": "torch.float16", + "lighteval_sha": "03c2fad20ff7f5334c33cfee459024b8d7e4a109", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null + }, + "config_tasks": { + "harness|arc:challenge": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + 
"harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task" + }, + "summary_tasks": { + "harness|arc:challenge|25": { + "hashes": { + "hash_examples": "17b0cae357c0259e", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "61571bf68d6d89aa", + "hash_cont_tokens": "ede2b335438f08e9" + }, + "truncated": 0, + "non-truncated": 4687, + "padded": 4687, + "non-padded": 0, + "effective_few_shots": 25.0, + "num_truncated_few_shots": 0 + }, + "harness|hellaswag|10": { + "hashes": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "29906669b1c7054a", + "hash_cont_tokens": "b41cf1ad182d68d5" + }, + "truncated": 0, + "non-truncated": 40168, + "padded": 40113, + "non-padded": 55, + "effective_few_shots": 10.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hashes": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "c54ff61ad0273dd7", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-anatomy|5": { + "hashes": { + 
"hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "be31a1e22aef5f90", + "hash_cont_tokens": "f11971a765cb609f" + }, + "truncated": 0, + "non-truncated": 540, + "padded": 540, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-astronomy|5": { + "hashes": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "277a7b1fad566940", + "hash_cont_tokens": "238bd86950544b29" + }, + "truncated": 0, + "non-truncated": 608, + "padded": 608, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-business_ethics|5": { + "hashes": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "ba552605bc116de5", + "hash_cont_tokens": "f9d6d2a7d7e9a041" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hashes": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "428c7563d0b98ab9", + "hash_cont_tokens": "6af58623d0d5fbcd" + }, + "truncated": 0, + "non-truncated": 1060, + "padded": 1060, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_biology|5": { + "hashes": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "da036601573942e2", + "hash_cont_tokens": "875cde3af7a0ee14" + }, + "truncated": 0, + "non-truncated": 576, + "padded": 576, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_chemistry|5": { + "hashes": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "94e0196d6aded13d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_computer_science|5": { + "hashes": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "6e4d0f4a8d36690b", + "hash_cont_tokens": "1ba0c71186b1505e" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_mathematics|5": { + "hashes": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "614054d17109a25d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_medicine|5": { + "hashes": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "1d633b3cc0524ba8", + "hash_cont_tokens": "702fb6d82ff0d6ac" + }, + "truncated": 0, + "non-truncated": 692, + "padded": 692, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_physics|5": { + "hashes": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "5421d9a1af86cbd4", + "hash_cont_tokens": 
"f7b8097afc16a47c" + }, + "truncated": 0, + "non-truncated": 408, + "padded": 408, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-computer_security|5": { + "hashes": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "5e6b70ecb333cf18", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hashes": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "c2ef11a87264ceed", + "hash_cont_tokens": "aa0e8bc655f2f641" + }, + "truncated": 0, + "non-truncated": 940, + "padded": 940, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-econometrics|5": { + "hashes": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "ecaccd912a4c3978", + "hash_cont_tokens": "a9b1f761089f6acc" + }, + "truncated": 0, + "non-truncated": 456, + "padded": 456, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hashes": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "1590c84291399be8", + "hash_cont_tokens": "2425a3f084a591ef" + }, + "truncated": 0, + "non-truncated": 580, + "padded": 580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hashes": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "3269597f715b0da1", + "hash_cont_tokens": "eb2d5002052b5bc5" + }, + "truncated": 0, + "non-truncated": 1512, + "padded": 1512, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-formal_logic|5": { + "hashes": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "a2800d20f3ab8d7c", + "hash_cont_tokens": "9b30dc19c9b62f60" + }, + "truncated": 0, + "non-truncated": 504, + "padded": 504, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-global_facts|5": { + "hashes": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "94ed44b3772505ad", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_biology|5": { + "hashes": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "24423acb928db768", + "hash_cont_tokens": "74217a4e2868536f" + }, + "truncated": 0, + "non-truncated": 1240, + "padded": 1240, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hashes": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "831ff35c474e5cef", + "hash_cont_tokens": "bf39544be0ebf000" + }, + "truncated": 0, + "non-truncated": 812, + "padded": 812, + "non-padded": 0, + "effective_few_shots": 5.0, + 
"num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hashes": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "8c34e0f2bda77358", + "hash_cont_tokens": "43570b3948564b64" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hashes": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "f1f73dd687da18d7", + "hash_cont_tokens": "674fc454bdc5ac93" + }, + "truncated": 660, + "non-truncated": 0, + "padded": 0, + "non-padded": 660, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_geography|5": { + "hashes": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "7c5547c7da5bc793", + "hash_cont_tokens": "03a5012b916274ea" + }, + "truncated": 0, + "non-truncated": 792, + "padded": 792, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hashes": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "f62991cb6a496b05", + "hash_cont_tokens": "50ab225c2f535210" + }, + "truncated": 0, + "non-truncated": 772, + "padded": 772, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hashes": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "4cef2aff6e3d59ed", + "hash_cont_tokens": "c583432ad27fcfe0" + }, + "truncated": 0, + "non-truncated": 1560, + "padded": 1560, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hashes": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "6e2577ea4082ed2b", + "hash_cont_tokens": "1194078d4e38c984" + }, + "truncated": 0, + "non-truncated": 1080, + "padded": 1080, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hashes": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "c5fc9aeb1079c8e4", + "hash_cont_tokens": "f47f041de50333b9" + }, + "truncated": 0, + "non-truncated": 952, + "padded": 952, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_physics|5": { + "hashes": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "555fc385cffa84ca", + "hash_cont_tokens": "6296151cf7fee15c" + }, + "truncated": 0, + "non-truncated": 604, + "padded": 604, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hashes": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "febd23cbf9973b7f", + "hash_cont_tokens": "a490d3db0ea5935a" + }, + "truncated": 0, + "non-truncated": 2180, + "padded": 2180, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + 
"harness|hendrycksTest-high_school_statistics|5": { + "hashes": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "424b02981230ee83", + "hash_cont_tokens": "6830ef7d0325d7ef" + }, + "truncated": 0, + "non-truncated": 864, + "padded": 864, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hashes": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "50c9ff438c85a69e", + "hash_cont_tokens": "cdd0b3dc06d933e5" + }, + "truncated": 816, + "non-truncated": 0, + "padded": 0, + "non-padded": 816, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hashes": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "054824cc474caef5", + "hash_cont_tokens": "e0203e3fc1bb0500" + }, + "truncated": 8, + "non-truncated": 940, + "padded": 940, + "non-padded": 8, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_aging|5": { + "hashes": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "541a75f071dcf579", + "hash_cont_tokens": "142a4a8a1138a214" + }, + "truncated": 0, + "non-truncated": 892, + "padded": 892, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_sexuality|5": { + "hashes": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "04269e5c5a257dd9", + "hash_cont_tokens": "bc54813e809b796d" + }, + "truncated": 0, + "non-truncated": 524, + "padded": 524, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-international_law|5": { + "hashes": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "d93ba9d9d38e4397", + "hash_cont_tokens": "63435df622d5437b" + }, + "truncated": 0, + "non-truncated": 484, + "padded": 484, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-jurisprudence|5": { + "hashes": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "9eeaccd2698b4f5a", + "hash_cont_tokens": "e3a8cd951b6e3469" + }, + "truncated": 0, + "non-truncated": 432, + "padded": 432, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hashes": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "b4f08f544f2b7576", + "hash_cont_tokens": "5e6ee2ff0404f23c" + }, + "truncated": 0, + "non-truncated": 652, + "padded": 648, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-machine_learning|5": { + "hashes": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "900c2a51f1174b9f", + "hash_cont_tokens": "c81919424db3b267" + }, + "truncated": 0, + "non-truncated": 448, + "padded": 448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-management|5": { + "hashes": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + 
"hash_input_tokens": "6b36efb4689c6eca", + "hash_cont_tokens": "a01d6d39a83c4597" + }, + "truncated": 0, + "non-truncated": 412, + "padded": 412, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-marketing|5": { + "hashes": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "2aaac78a0cfed47a", + "hash_cont_tokens": "6aeaed4d823c98aa" + }, + "truncated": 0, + "non-truncated": 936, + "padded": 936, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-medical_genetics|5": { + "hashes": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "886ca823b41c094a", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-miscellaneous|5": { + "hashes": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "72fd71de7675e7d0", + "hash_cont_tokens": "9b0ab02a64603081" + }, + "truncated": 0, + "non-truncated": 3132, + "padded": 3132, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_disputes|5": { + "hashes": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "f3ca0dd8e7a1eb09", + "hash_cont_tokens": "3b8bbe9108e55ce9" + }, + "truncated": 0, + "non-truncated": 1384, + "padded": 1354, + "non-padded": 30, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hashes": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "3e793631e951f23c", + "hash_cont_tokens": "2eae753a177d5460" + }, + "truncated": 0, + "non-truncated": 3580, + "padded": 3580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-nutrition|5": { + "hashes": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "59753c2144ea93af", + "hash_cont_tokens": "29771089bd3c65c6" + }, + "truncated": 0, + "non-truncated": 1224, + "padded": 1224, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-philosophy|5": { + "hashes": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "bd8d3dbed15a8c34", + "hash_cont_tokens": "9f6ff69d23a48783" + }, + "truncated": 0, + "non-truncated": 1244, + "padded": 1244, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-prehistory|5": { + "hashes": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "3573cd87facbb7c5", + "hash_cont_tokens": "a789a13af22308bf" + }, + "truncated": 0, + "non-truncated": 1296, + "padded": 1296, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_accounting|5": { + "hashes": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "17e721bc1a7cbb47", + "hash_cont_tokens": "5129a9cfb30c5239" + }, + "truncated": 0, + "non-truncated": 1128, + "padded": 1128, + "non-padded": 0, + 
"effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_law|5": { + "hashes": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "9178e10bd0763ec4", + "hash_cont_tokens": "2e590029ef41fbcd" + }, + "truncated": 604, + "non-truncated": 5532, + "padded": 5524, + "non-padded": 612, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_medicine|5": { + "hashes": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "f5a22012a54f70ea", + "hash_cont_tokens": "cd82e108370cece8" + }, + "truncated": 0, + "non-truncated": 1088, + "padded": 1088, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_psychology|5": { + "hashes": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "0dfb73a8eb3f692c", + "hash_cont_tokens": "61ef0c8a87f9c92d" + }, + "truncated": 0, + "non-truncated": 2448, + "padded": 2448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-public_relations|5": { + "hashes": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "1710c6ba4c9f3cbd", + "hash_cont_tokens": "568f585a259965c1" + }, + "truncated": 0, + "non-truncated": 440, + "padded": 440, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-security_studies|5": { + "hashes": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "d49711415961ced7", + "hash_cont_tokens": "d70cfe096d4fb7bd" + }, + "truncated": 0, + "non-truncated": 980, + "padded": 980, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-sociology|5": { + "hashes": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "828999f7624cbe7e", + "hash_cont_tokens": "c3a3bdfd177eed5b" + }, + "truncated": 0, + "non-truncated": 804, + "padded": 804, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hashes": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "42054621e718dbee", + "hash_cont_tokens": "2568d0e8e36fa959" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-virology|5": { + "hashes": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "6c4f0aa4dc859c04", + "hash_cont_tokens": "c178cccd753d9bc5" + }, + "truncated": 0, + "non-truncated": 664, + "padded": 664, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-world_religions|5": { + "hashes": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "6c75d44e092ff24f", + "hash_cont_tokens": "0a3a3ea5ef49d19c" + }, + "truncated": 0, + "non-truncated": 684, + "padded": 684, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|truthfulqa:mc|0": { + "hashes": { + "hash_examples": "23176c0531c7b867", + 
"hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "2738d7ed7075faa7", + "hash_cont_tokens": "6d1691881e252df0" + }, + "truncated": 0, + "non-truncated": 9996, + "padded": 9996, + "non-padded": 0, + "effective_few_shots": 0.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "d84d18e9a963753d", + "hash_full_prompts": "12b540783521a8e6", + "hash_input_tokens": "6fecf578c508db6a", + "hash_cont_tokens": "f4b7b7f3a2788768" + }, + "total_evaluation_time_secondes": "25534.5802090168", + "truncated": 2088, + "non-truncated": 108931, + "padded": 108834, + "non-padded": 2185, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/TheBloke/VicUnlocked-alpaca-65B-QLoRA-fp16/results_2023-10-23T05-26-06.926177.json b/eval-results/TheBloke/VicUnlocked-alpaca-65B-QLoRA-fp16/results_2023-10-23T05-26-06.926177.json new file mode 100644 index 0000000000000000000000000000000000000000..eb3b83a5cefb37018f51488882e47c921389114a --- /dev/null +++ b/eval-results/TheBloke/VicUnlocked-alpaca-65B-QLoRA-fp16/results_2023-10-23T05-26-06.926177.json @@ -0,0 +1,107 @@ +{ + "config_general": { + "model_name": "TheBloke/VicUnlocked-alpaca-65B-QLoRA-fp16", + "model_sha": "6cdacfda96970aa144e316b108ab9bc17c99a573", + "model_size": "121.68 GB", + "model_dtype": "torch.float16", + "lighteval_sha": "0f318ecf002208468154899217b3ba7c6ae09374", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|drop|3": { + "em": 0.002202181208053691, + "em_stderr": 0.0004800510816619414, + "f1": 0.07708053691275155, + "f1_stderr": 0.0014808362697713243 + }, + "harness|gsm8k|5": { + "acc": 0.27824109173616374, + "acc_stderr": 0.012343803671422678 + }, + "harness|winogrande|5": { + "acc": 0.8129439621152328, + "acc_stderr": 0.01095971643524291 + }, + "all": { + "em": 0.002202181208053691, + "em_stderr": 0.0004800510816619414, + "f1": 0.07708053691275155, + "f1_stderr": 0.0014808362697713243, + "acc": 0.5455925269256983, + "acc_stderr": 0.011651760053332794 + } + }, + "versions": { + "harness|drop|3": 1, + "harness|gsm8k|5": 0, + "harness|winogrande|5": 0, + "all": 0 + }, + "config_tasks": { + "harness|drop": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|drop|3": { + "hashes": { + "hash_examples": "1d27416e8324e9a3", + "hash_full_prompts": "a5513ff9a741b385", + "hash_input_tokens": "61b608e0b5ceed76", + "hash_cont_tokens": "79d8201972435e86" + }, + "truncated": 1263, + "non-truncated": 8273, + "padded": 0, + "non-padded": 9536, + "effective_few_shots": 3.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "bda342e47b5099b2", + "hash_cont_tokens": "5ed9485610b11480" + }, + "truncated": 0, + "non-truncated": 1319, + "padded": 0, + "non-padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "c0bedf98cb040854", + "hash_cont_tokens": "f08975ad6f2d5864" + }, + "truncated": 0, + "non-truncated": 2534, + "padded": 2432, + "non-padded": 102, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": 
"9b4d8993161e637d", + "hash_full_prompts": "08215e527b7e60a5", + "hash_input_tokens": "80afe720f936f8d2", + "hash_cont_tokens": "c789703d3f4e6049" + }, + "total_evaluation_time_secondes": "47180.139203071594", + "truncated": 1263, + "non-truncated": 12126, + "padded": 2432, + "non-padded": 10957, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/TheBloke/Vicuna-13B-CoT-fp16/results_2023-07-31T15-25-40.141748.json b/eval-results/TheBloke/Vicuna-13B-CoT-fp16/results_2023-07-31T15-25-40.141748.json new file mode 100644 index 0000000000000000000000000000000000000000..15baf7271fef9f0b1610ae54392f16683105d717 --- /dev/null +++ b/eval-results/TheBloke/Vicuna-13B-CoT-fp16/results_2023-07-31T15-25-40.141748.json @@ -0,0 +1,1365 @@ +{ + "results": { + "harness|arc:challenge|25": { + "acc": 0.5196245733788396, + "acc_stderr": 0.014600132075947094, + "acc_norm": 0.5273037542662116, + "acc_norm_stderr": 0.014589589101985996 + }, + "harness|hellaswag|10": { + "acc": 0.6011750647281418, + "acc_stderr": 0.004886559008754983, + "acc_norm": 0.8014339772953595, + "acc_norm_stderr": 0.003981052091169829 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.28, + "acc_stderr": 0.045126085985421296, + "acc_norm": 0.28, + "acc_norm_stderr": 0.045126085985421296 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.4888888888888889, + "acc_stderr": 0.04318275491977976, + "acc_norm": 0.4888888888888889, + "acc_norm_stderr": 0.04318275491977976 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.506578947368421, + "acc_stderr": 0.040685900502249704, + "acc_norm": 0.506578947368421, + "acc_norm_stderr": 0.040685900502249704 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.54, + "acc_stderr": 0.05009082659620332, + "acc_norm": 0.54, + "acc_norm_stderr": 0.05009082659620332 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.4981132075471698, + "acc_stderr": 0.030772653642075664, + "acc_norm": 0.4981132075471698, + "acc_norm_stderr": 0.030772653642075664 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.5972222222222222, + "acc_stderr": 0.04101405519842426, + "acc_norm": 0.5972222222222222, + "acc_norm_stderr": 0.04101405519842426 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.38, + "acc_stderr": 0.04878317312145633, + "acc_norm": 0.38, + "acc_norm_stderr": 0.04878317312145633 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.46, + "acc_stderr": 0.05009082659620332, + "acc_norm": 0.46, + "acc_norm_stderr": 0.05009082659620332 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.31, + "acc_stderr": 0.04648231987117316, + "acc_norm": 0.31, + "acc_norm_stderr": 0.04648231987117316 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.4161849710982659, + "acc_stderr": 0.03758517775404948, + "acc_norm": 0.4161849710982659, + "acc_norm_stderr": 0.03758517775404948 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.27450980392156865, + "acc_stderr": 0.044405219061793254, + "acc_norm": 0.27450980392156865, + "acc_norm_stderr": 0.044405219061793254 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.64, + "acc_stderr": 0.04824181513244218, + "acc_norm": 0.64, + "acc_norm_stderr": 0.04824181513244218 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.39148936170212767, + "acc_stderr": 0.03190701242326812, + "acc_norm": 0.39148936170212767, + "acc_norm_stderr": 0.03190701242326812 + }, + "harness|hendrycksTest-econometrics|5": { 
+ "acc": 0.30701754385964913, + "acc_stderr": 0.04339138322579861, + "acc_norm": 0.30701754385964913, + "acc_norm_stderr": 0.04339138322579861 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.4689655172413793, + "acc_stderr": 0.04158632762097828, + "acc_norm": 0.4689655172413793, + "acc_norm_stderr": 0.04158632762097828 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.3386243386243386, + "acc_stderr": 0.02437319786798306, + "acc_norm": 0.3386243386243386, + "acc_norm_stderr": 0.02437319786798306 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.40476190476190477, + "acc_stderr": 0.04390259265377562, + "acc_norm": 0.40476190476190477, + "acc_norm_stderr": 0.04390259265377562 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.26, + "acc_stderr": 0.044084400227680794, + "acc_norm": 0.26, + "acc_norm_stderr": 0.044084400227680794 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.567741935483871, + "acc_stderr": 0.028181739720019416, + "acc_norm": 0.567741935483871, + "acc_norm_stderr": 0.028181739720019416 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.4039408866995074, + "acc_stderr": 0.03452453903822039, + "acc_norm": 0.4039408866995074, + "acc_norm_stderr": 0.03452453903822039 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.55, + "acc_stderr": 0.04999999999999999, + "acc_norm": 0.55, + "acc_norm_stderr": 0.04999999999999999 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.6666666666666666, + "acc_stderr": 0.0368105086916155, + "acc_norm": 0.6666666666666666, + "acc_norm_stderr": 0.0368105086916155 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.6515151515151515, + "acc_stderr": 0.03394853965156402, + "acc_norm": 0.6515151515151515, + "acc_norm_stderr": 0.03394853965156402 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.694300518134715, + "acc_stderr": 0.033248379397581594, + "acc_norm": 0.694300518134715, + "acc_norm_stderr": 0.033248379397581594 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.4717948717948718, + "acc_stderr": 0.0253106392549339, + "acc_norm": 0.4717948717948718, + "acc_norm_stderr": 0.0253106392549339 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.3, + "acc_stderr": 0.027940457136228416, + "acc_norm": 0.3, + "acc_norm_stderr": 0.027940457136228416 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.44537815126050423, + "acc_stderr": 0.0322841062671639, + "acc_norm": 0.44537815126050423, + "acc_norm_stderr": 0.0322841062671639 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.31125827814569534, + "acc_stderr": 0.03780445850526733, + "acc_norm": 0.31125827814569534, + "acc_norm_stderr": 0.03780445850526733 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.6844036697247706, + "acc_stderr": 0.019926117513869666, + "acc_norm": 0.6844036697247706, + "acc_norm_stderr": 0.019926117513869666 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.36574074074074076, + "acc_stderr": 0.03284738857647207, + "acc_norm": 0.36574074074074076, + "acc_norm_stderr": 0.03284738857647207 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.6862745098039216, + "acc_stderr": 0.03256685484460388, + "acc_norm": 0.6862745098039216, + "acc_norm_stderr": 0.03256685484460388 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 
0.7130801687763713, + "acc_stderr": 0.02944377302259469, + "acc_norm": 0.7130801687763713, + "acc_norm_stderr": 0.02944377302259469 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.5829596412556054, + "acc_stderr": 0.03309266936071721, + "acc_norm": 0.5829596412556054, + "acc_norm_stderr": 0.03309266936071721 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.6717557251908397, + "acc_stderr": 0.04118438565806298, + "acc_norm": 0.6717557251908397, + "acc_norm_stderr": 0.04118438565806298 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.6859504132231405, + "acc_stderr": 0.042369647530410184, + "acc_norm": 0.6859504132231405, + "acc_norm_stderr": 0.042369647530410184 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.6203703703703703, + "acc_stderr": 0.04691521224077742, + "acc_norm": 0.6203703703703703, + "acc_norm_stderr": 0.04691521224077742 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.6441717791411042, + "acc_stderr": 0.03761521380046734, + "acc_norm": 0.6441717791411042, + "acc_norm_stderr": 0.03761521380046734 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.39285714285714285, + "acc_stderr": 0.04635550135609976, + "acc_norm": 0.39285714285714285, + "acc_norm_stderr": 0.04635550135609976 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.6990291262135923, + "acc_stderr": 0.04541609446503947, + "acc_norm": 0.6990291262135923, + "acc_norm_stderr": 0.04541609446503947 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.7521367521367521, + "acc_stderr": 0.028286324075564386, + "acc_norm": 0.7521367521367521, + "acc_norm_stderr": 0.028286324075564386 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.59, + "acc_stderr": 0.049431107042371025, + "acc_norm": 0.59, + "acc_norm_stderr": 0.049431107042371025 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.6922094508301405, + "acc_stderr": 0.016506045045155637, + "acc_norm": 0.6922094508301405, + "acc_norm_stderr": 0.016506045045155637 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.546242774566474, + "acc_stderr": 0.026803720583206177, + "acc_norm": 0.546242774566474, + "acc_norm_stderr": 0.026803720583206177 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.3318435754189944, + "acc_stderr": 0.015748421208187303, + "acc_norm": 0.3318435754189944, + "acc_norm_stderr": 0.015748421208187303 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.5522875816993464, + "acc_stderr": 0.02847293847803353, + "acc_norm": 0.5522875816993464, + "acc_norm_stderr": 0.02847293847803353 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.5241157556270096, + "acc_stderr": 0.028365041542564577, + "acc_norm": 0.5241157556270096, + "acc_norm_stderr": 0.028365041542564577 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.5493827160493827, + "acc_stderr": 0.0276847214156562, + "acc_norm": 0.5493827160493827, + "acc_norm_stderr": 0.0276847214156562 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.3900709219858156, + "acc_stderr": 0.02909767559946393, + "acc_norm": 0.3900709219858156, + "acc_norm_stderr": 0.02909767559946393 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.41851368970013036, + "acc_stderr": 0.012599505608336461, + "acc_norm": 0.41851368970013036, + "acc_norm_stderr": 0.012599505608336461 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.49264705882352944, + "acc_stderr": 0.030369552523902173, + "acc_norm": 0.49264705882352944, + 
"acc_norm_stderr": 0.030369552523902173 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.5179738562091504, + "acc_stderr": 0.020214761037872404, + "acc_norm": 0.5179738562091504, + "acc_norm_stderr": 0.020214761037872404 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.5454545454545454, + "acc_stderr": 0.04769300568972745, + "acc_norm": 0.5454545454545454, + "acc_norm_stderr": 0.04769300568972745 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.6244897959183674, + "acc_stderr": 0.03100120903989484, + "acc_norm": 0.6244897959183674, + "acc_norm_stderr": 0.03100120903989484 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.7711442786069652, + "acc_stderr": 0.029705284056772436, + "acc_norm": 0.7711442786069652, + "acc_norm_stderr": 0.029705284056772436 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.76, + "acc_stderr": 0.04292346959909281, + "acc_norm": 0.76, + "acc_norm_stderr": 0.04292346959909281 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.4397590361445783, + "acc_stderr": 0.03864139923699121, + "acc_norm": 0.4397590361445783, + "acc_norm_stderr": 0.03864139923699121 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.7134502923976608, + "acc_stderr": 0.03467826685703826, + "acc_norm": 0.7134502923976608, + "acc_norm_stderr": 0.03467826685703826 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.3623011015911873, + "mc1_stderr": 0.016826646897262255, + "mc2": 0.520836680647418, + "mc2_stderr": 0.015808065640973055 + }, + "all": { + "acc": 0.5204079351510228, + "acc_stderr": 0.03493561166367794, + "acc_norm": 0.5239323096163548, + "acc_norm_stderr": 0.03492008539432969, + "mc1": 0.3623011015911873, + "mc1_stderr": 0.016826646897262255, + "mc2": 0.520836680647418, + "mc2_stderr": 0.015808065640973055 + } + }, + "versions": { + "harness|arc:challenge|25": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + 
"harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "all": 0 + }, + "config_general": { + "model_name": "TheBloke/Vicuna-13B-CoT-fp16", + "model_sha": "fe74a0ece9089828b301bd0f067ae5f257516179", + "model_dtype": "torch.float16", + "lighteval_sha": "03c2fad20ff7f5334c33cfee459024b8d7e4a109", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null + }, + "config_tasks": { + "harness|arc:challenge": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + 
"harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task" + }, + "summary_tasks": { + "harness|arc:challenge|25": { + "hashes": { + "hash_examples": "17b0cae357c0259e", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "61571bf68d6d89aa", + "hash_cont_tokens": "8210decc6ff6f7df" + }, + "truncated": 0, + "non-truncated": 4687, + "padded": 4687, + "non-padded": 0, + "effective_few_shots": 25.0, + "num_truncated_few_shots": 0 + }, + "harness|hellaswag|10": { + "hashes": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "29906669b1c7054a", + "hash_cont_tokens": "b3b9e9017afa63af" + }, + "truncated": 0, + "non-truncated": 40168, + "padded": 40113, + "non-padded": 55, + "effective_few_shots": 10.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hashes": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "c54ff61ad0273dd7", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-anatomy|5": { + "hashes": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "be31a1e22aef5f90", + "hash_cont_tokens": "f11971a765cb609f" + }, + "truncated": 0, + "non-truncated": 540, + "padded": 540, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-astronomy|5": { + "hashes": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": 
"faf4e80f65de93ca", + "hash_input_tokens": "277a7b1fad566940", + "hash_cont_tokens": "bf30e5d3f48250cb" + }, + "truncated": 0, + "non-truncated": 608, + "padded": 608, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-business_ethics|5": { + "hashes": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "ba552605bc116de5", + "hash_cont_tokens": "bc1dd9b2d995eb61" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hashes": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "428c7563d0b98ab9", + "hash_cont_tokens": "890a119624b3b935" + }, + "truncated": 0, + "non-truncated": 1060, + "padded": 1060, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_biology|5": { + "hashes": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "da036601573942e2", + "hash_cont_tokens": "875cde3af7a0ee14" + }, + "truncated": 0, + "non-truncated": 576, + "padded": 576, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_chemistry|5": { + "hashes": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "94e0196d6aded13d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_computer_science|5": { + "hashes": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "6e4d0f4a8d36690b", + "hash_cont_tokens": "ffc0fe414cdc4a83" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_mathematics|5": { + "hashes": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "614054d17109a25d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_medicine|5": { + "hashes": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "1d633b3cc0524ba8", + "hash_cont_tokens": "1f88b00d41957d82" + }, + "truncated": 0, + "non-truncated": 692, + "padded": 692, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_physics|5": { + "hashes": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "5421d9a1af86cbd4", + "hash_cont_tokens": "f7b8097afc16a47c" + }, + "truncated": 0, + "non-truncated": 408, + "padded": 408, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-computer_security|5": { + "hashes": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "5e6b70ecb333cf18", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 
400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hashes": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "c2ef11a87264ceed", + "hash_cont_tokens": "aa0e8bc655f2f641" + }, + "truncated": 0, + "non-truncated": 940, + "padded": 940, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-econometrics|5": { + "hashes": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "ecaccd912a4c3978", + "hash_cont_tokens": "bfb7e3c3c88313f1" + }, + "truncated": 0, + "non-truncated": 456, + "padded": 456, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hashes": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "1590c84291399be8", + "hash_cont_tokens": "2425a3f084a591ef" + }, + "truncated": 0, + "non-truncated": 580, + "padded": 580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hashes": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "3269597f715b0da1", + "hash_cont_tokens": "f52691aef15a407b" + }, + "truncated": 0, + "non-truncated": 1512, + "padded": 1512, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-formal_logic|5": { + "hashes": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "a2800d20f3ab8d7c", + "hash_cont_tokens": "f515d598d9c21263" + }, + "truncated": 0, + "non-truncated": 504, + "padded": 504, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-global_facts|5": { + "hashes": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "94ed44b3772505ad", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_biology|5": { + "hashes": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "24423acb928db768", + "hash_cont_tokens": "bd85a4156a3613ee" + }, + "truncated": 0, + "non-truncated": 1240, + "padded": 1240, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hashes": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "831ff35c474e5cef", + "hash_cont_tokens": "a95c97af1c14e068" + }, + "truncated": 0, + "non-truncated": 812, + "padded": 812, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hashes": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "8c34e0f2bda77358", + "hash_cont_tokens": "8abfedef914e33c9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + 
"harness|hendrycksTest-high_school_european_history|5": { + "hashes": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "f1f73dd687da18d7", + "hash_cont_tokens": "674fc454bdc5ac93" + }, + "truncated": 660, + "non-truncated": 0, + "padded": 0, + "non-padded": 660, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_geography|5": { + "hashes": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "7c5547c7da5bc793", + "hash_cont_tokens": "03a5012b916274ea" + }, + "truncated": 0, + "non-truncated": 792, + "padded": 792, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hashes": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "f62991cb6a496b05", + "hash_cont_tokens": "a83effb8f76b7d7c" + }, + "truncated": 0, + "non-truncated": 772, + "padded": 772, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hashes": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "4cef2aff6e3d59ed", + "hash_cont_tokens": "c583432ad27fcfe0" + }, + "truncated": 0, + "non-truncated": 1560, + "padded": 1560, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hashes": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "6e2577ea4082ed2b", + "hash_cont_tokens": "24f5dc613660300b" + }, + "truncated": 0, + "non-truncated": 1080, + "padded": 1080, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hashes": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "c5fc9aeb1079c8e4", + "hash_cont_tokens": "f47f041de50333b9" + }, + "truncated": 0, + "non-truncated": 952, + "padded": 952, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_physics|5": { + "hashes": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "555fc385cffa84ca", + "hash_cont_tokens": "ba2efcd283e938cc" + }, + "truncated": 0, + "non-truncated": 604, + "padded": 604, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hashes": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "febd23cbf9973b7f", + "hash_cont_tokens": "942069cd363844d9" + }, + "truncated": 0, + "non-truncated": 2180, + "padded": 2180, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hashes": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "424b02981230ee83", + "hash_cont_tokens": "955ed42b6f7fa019" + }, + "truncated": 0, + "non-truncated": 864, + "padded": 864, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hashes": { + 
"hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "50c9ff438c85a69e", + "hash_cont_tokens": "cdd0b3dc06d933e5" + }, + "truncated": 816, + "non-truncated": 0, + "padded": 0, + "non-padded": 816, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hashes": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "054824cc474caef5", + "hash_cont_tokens": "9a864184946033ac" + }, + "truncated": 8, + "non-truncated": 940, + "padded": 940, + "non-padded": 8, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_aging|5": { + "hashes": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "541a75f071dcf579", + "hash_cont_tokens": "142a4a8a1138a214" + }, + "truncated": 0, + "non-truncated": 892, + "padded": 892, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_sexuality|5": { + "hashes": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "04269e5c5a257dd9", + "hash_cont_tokens": "bc54813e809b796d" + }, + "truncated": 0, + "non-truncated": 524, + "padded": 524, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-international_law|5": { + "hashes": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "d93ba9d9d38e4397", + "hash_cont_tokens": "dc45b45fcda18e5d" + }, + "truncated": 0, + "non-truncated": 484, + "padded": 484, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-jurisprudence|5": { + "hashes": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "9eeaccd2698b4f5a", + "hash_cont_tokens": "e3a8cd951b6e3469" + }, + "truncated": 0, + "non-truncated": 432, + "padded": 432, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hashes": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "b4f08f544f2b7576", + "hash_cont_tokens": "1e80dbd30f6453d5" + }, + "truncated": 0, + "non-truncated": 652, + "padded": 648, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-machine_learning|5": { + "hashes": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "900c2a51f1174b9f", + "hash_cont_tokens": "9b37da7777378ca9" + }, + "truncated": 0, + "non-truncated": 448, + "padded": 448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-management|5": { + "hashes": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "6b36efb4689c6eca", + "hash_cont_tokens": "a01d6d39a83c4597" + }, + "truncated": 0, + "non-truncated": 412, + "padded": 412, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-marketing|5": { + "hashes": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "2aaac78a0cfed47a", + "hash_cont_tokens": "6aeaed4d823c98aa" + }, + 
"truncated": 0, + "non-truncated": 936, + "padded": 936, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-medical_genetics|5": { + "hashes": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "886ca823b41c094a", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-miscellaneous|5": { + "hashes": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "72fd71de7675e7d0", + "hash_cont_tokens": "9b0ab02a64603081" + }, + "truncated": 0, + "non-truncated": 3132, + "padded": 3132, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_disputes|5": { + "hashes": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "f3ca0dd8e7a1eb09", + "hash_cont_tokens": "8badf768f7b0467a" + }, + "truncated": 0, + "non-truncated": 1384, + "padded": 1354, + "non-padded": 30, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hashes": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "3e793631e951f23c", + "hash_cont_tokens": "32ae620376b2bbba" + }, + "truncated": 0, + "non-truncated": 3580, + "padded": 3580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-nutrition|5": { + "hashes": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "59753c2144ea93af", + "hash_cont_tokens": "3071def75bacc404" + }, + "truncated": 0, + "non-truncated": 1224, + "padded": 1224, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-philosophy|5": { + "hashes": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "bd8d3dbed15a8c34", + "hash_cont_tokens": "9f6ff69d23a48783" + }, + "truncated": 0, + "non-truncated": 1244, + "padded": 1244, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-prehistory|5": { + "hashes": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "3573cd87facbb7c5", + "hash_cont_tokens": "de469d2b981e32a3" + }, + "truncated": 0, + "non-truncated": 1296, + "padded": 1296, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_accounting|5": { + "hashes": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "17e721bc1a7cbb47", + "hash_cont_tokens": "c46f74d2dfc7b13b" + }, + "truncated": 0, + "non-truncated": 1128, + "padded": 1128, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_law|5": { + "hashes": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "9178e10bd0763ec4", + "hash_cont_tokens": "2e590029ef41fbcd" + }, + "truncated": 604, + "non-truncated": 5532, + "padded": 5524, + "non-padded": 612, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + 
"harness|hendrycksTest-professional_medicine|5": { + "hashes": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "f5a22012a54f70ea", + "hash_cont_tokens": "fe35cfa9c6ca802e" + }, + "truncated": 0, + "non-truncated": 1088, + "padded": 1088, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_psychology|5": { + "hashes": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "0dfb73a8eb3f692c", + "hash_cont_tokens": "f020fbddf72c8652" + }, + "truncated": 0, + "non-truncated": 2448, + "padded": 2448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-public_relations|5": { + "hashes": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "1710c6ba4c9f3cbd", + "hash_cont_tokens": "568f585a259965c1" + }, + "truncated": 0, + "non-truncated": 440, + "padded": 440, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-security_studies|5": { + "hashes": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "d49711415961ced7", + "hash_cont_tokens": "cc6fd7cccd64cd5d" + }, + "truncated": 0, + "non-truncated": 980, + "padded": 980, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-sociology|5": { + "hashes": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "828999f7624cbe7e", + "hash_cont_tokens": "c3a3bdfd177eed5b" + }, + "truncated": 0, + "non-truncated": 804, + "padded": 804, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hashes": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "42054621e718dbee", + "hash_cont_tokens": "2568d0e8e36fa959" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-virology|5": { + "hashes": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "6c4f0aa4dc859c04", + "hash_cont_tokens": "926cf60b0891f374" + }, + "truncated": 0, + "non-truncated": 664, + "padded": 664, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-world_religions|5": { + "hashes": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "6c75d44e092ff24f", + "hash_cont_tokens": "c525a5de974c1ea3" + }, + "truncated": 0, + "non-truncated": 684, + "padded": 684, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|truthfulqa:mc|0": { + "hashes": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "2738d7ed7075faa7", + "hash_cont_tokens": "c014154380b74b9e" + }, + "truncated": 0, + "non-truncated": 9996, + "padded": 9996, + "non-padded": 0, + "effective_few_shots": 0.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "d84d18e9a963753d", + "hash_full_prompts": "12b540783521a8e6", + "hash_input_tokens": "6fecf578c508db6a", + 
"hash_cont_tokens": "fb1646e2bdd5fc38" + }, + "total_evaluation_time_secondes": "3752.086950778961", + "truncated": 2088, + "non-truncated": 108931, + "padded": 108834, + "non-padded": 2185, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/TheBloke/Vicuna-13B-CoT-fp16/results_2023-10-22T14-12-38.922029.json b/eval-results/TheBloke/Vicuna-13B-CoT-fp16/results_2023-10-22T14-12-38.922029.json new file mode 100644 index 0000000000000000000000000000000000000000..29df6cf41733dcc73f92bfe5fb2010d41702b0c4 --- /dev/null +++ b/eval-results/TheBloke/Vicuna-13B-CoT-fp16/results_2023-10-22T14-12-38.922029.json @@ -0,0 +1,107 @@ +{ + "config_general": { + "model_name": "TheBloke/Vicuna-13B-CoT-fp16", + "model_sha": "eed0c8b1f2f38310b88d3182a50b43d7635c3a4a", + "model_size": "24.28 GB", + "model_dtype": "torch.float16", + "lighteval_sha": "0f318ecf002208468154899217b3ba7c6ae09374", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|drop|3": { + "em": 0.029677013422818792, + "em_stderr": 0.0017378324714143493, + "f1": 0.09310612416107406, + "f1_stderr": 0.002167792401176146 + }, + "harness|gsm8k|5": { + "acc": 0.08642911296436695, + "acc_stderr": 0.00774004433710381 + }, + "harness|winogrande|5": { + "acc": 0.7419100236779794, + "acc_stderr": 0.012298278833972384 + }, + "all": { + "em": 0.029677013422818792, + "em_stderr": 0.0017378324714143493, + "f1": 0.09310612416107406, + "f1_stderr": 0.002167792401176146, + "acc": 0.4141695683211732, + "acc_stderr": 0.010019161585538096 + } + }, + "versions": { + "harness|drop|3": 1, + "harness|gsm8k|5": 0, + "harness|winogrande|5": 0, + "all": 0 + }, + "config_tasks": { + "harness|drop": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|drop|3": { + "hashes": { + "hash_examples": "1d27416e8324e9a3", + "hash_full_prompts": "a5513ff9a741b385", + "hash_input_tokens": "61b608e0b5ceed76", + "hash_cont_tokens": "ac752e2682fcf21e" + }, + "truncated": 1263, + "non-truncated": 8273, + "padded": 0, + "non-padded": 9536, + "effective_few_shots": 3.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "bda342e47b5099b2", + "hash_cont_tokens": "6a30e0a9abfde216" + }, + "truncated": 0, + "non-truncated": 1319, + "padded": 0, + "non-padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "c0bedf98cb040854", + "hash_cont_tokens": "f08975ad6f2d5864" + }, + "truncated": 0, + "non-truncated": 2534, + "padded": 2432, + "non-padded": 102, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "9b4d8993161e637d", + "hash_full_prompts": "08215e527b7e60a5", + "hash_input_tokens": "80afe720f936f8d2", + "hash_cont_tokens": "3120c9f83854444f" + }, + "total_evaluation_time_secondes": "12506.671245336533", + "truncated": 1263, + "non-truncated": 12126, + "padded": 2432, + "non-padded": 10957, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/TheBloke/Vicuna-33B-1-3-SuperHOT-8K-fp16/results_2023-07-31T19-04-33.192118.json 
b/eval-results/TheBloke/Vicuna-33B-1-3-SuperHOT-8K-fp16/results_2023-07-31T19-04-33.192118.json new file mode 100644 index 0000000000000000000000000000000000000000..e282b2e5804e771d98c82161fed75dd082651463 --- /dev/null +++ b/eval-results/TheBloke/Vicuna-33B-1-3-SuperHOT-8K-fp16/results_2023-07-31T19-04-33.192118.json @@ -0,0 +1,1365 @@ +{ + "results": { + "harness|arc:challenge|25": { + "acc": 0.21331058020477817, + "acc_stderr": 0.011970971742326334, + "acc_norm": 0.25426621160409557, + "acc_norm_stderr": 0.012724999945157744 + }, + "harness|hellaswag|10": { + "acc": 0.28828918542123083, + "acc_stderr": 0.00452040633108404, + "acc_norm": 0.3461461860187214, + "acc_norm_stderr": 0.004747682003491466 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.22, + "acc_stderr": 0.04163331998932268, + "acc_norm": 0.22, + "acc_norm_stderr": 0.04163331998932268 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.24444444444444444, + "acc_stderr": 0.03712537833614865, + "acc_norm": 0.24444444444444444, + "acc_norm_stderr": 0.03712537833614865 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.17763157894736842, + "acc_stderr": 0.031103182383123398, + "acc_norm": 0.17763157894736842, + "acc_norm_stderr": 0.031103182383123398 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.34, + "acc_stderr": 0.04760952285695236, + "acc_norm": 0.34, + "acc_norm_stderr": 0.04760952285695236 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.21509433962264152, + "acc_stderr": 0.025288394502891373, + "acc_norm": 0.21509433962264152, + "acc_norm_stderr": 0.025288394502891373 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.25, + "acc_stderr": 0.03621034121889507, + "acc_norm": 0.25, + "acc_norm_stderr": 0.03621034121889507 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.2, + "acc_stderr": 0.04020151261036845, + "acc_norm": 0.2, + "acc_norm_stderr": 0.04020151261036845 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.26, + "acc_stderr": 0.0440844002276808, + "acc_norm": 0.26, + "acc_norm_stderr": 0.0440844002276808 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.24, + "acc_stderr": 0.042923469599092816, + "acc_norm": 0.24, + "acc_norm_stderr": 0.042923469599092816 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.20809248554913296, + "acc_stderr": 0.030952890217749874, + "acc_norm": 0.20809248554913296, + "acc_norm_stderr": 0.030952890217749874 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.22549019607843138, + "acc_stderr": 0.041583075330832865, + "acc_norm": 0.22549019607843138, + "acc_norm_stderr": 0.041583075330832865 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.3, + "acc_stderr": 0.046056618647183814, + "acc_norm": 0.3, + "acc_norm_stderr": 0.046056618647183814 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.26382978723404255, + "acc_stderr": 0.028809989854102973, + "acc_norm": 0.26382978723404255, + "acc_norm_stderr": 0.028809989854102973 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.2543859649122807, + "acc_stderr": 0.04096985139843671, + "acc_norm": 0.2543859649122807, + "acc_norm_stderr": 0.04096985139843671 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.2413793103448276, + "acc_stderr": 0.03565998174135302, + "acc_norm": 0.2413793103448276, + "acc_norm_stderr": 0.03565998174135302 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.2275132275132275, + "acc_stderr": 
0.02159126940782378, + "acc_norm": 0.2275132275132275, + "acc_norm_stderr": 0.02159126940782378 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.20634920634920634, + "acc_stderr": 0.0361960452412425, + "acc_norm": 0.20634920634920634, + "acc_norm_stderr": 0.0361960452412425 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.18, + "acc_stderr": 0.038612291966536934, + "acc_norm": 0.18, + "acc_norm_stderr": 0.038612291966536934 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.2838709677419355, + "acc_stderr": 0.025649381063029254, + "acc_norm": 0.2838709677419355, + "acc_norm_stderr": 0.025649381063029254 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.24630541871921183, + "acc_stderr": 0.030315099285617722, + "acc_norm": 0.24630541871921183, + "acc_norm_stderr": 0.030315099285617722 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.25, + "acc_stderr": 0.04351941398892446, + "acc_norm": 0.25, + "acc_norm_stderr": 0.04351941398892446 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.2727272727272727, + "acc_stderr": 0.0347769116216366, + "acc_norm": 0.2727272727272727, + "acc_norm_stderr": 0.0347769116216366 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.18181818181818182, + "acc_stderr": 0.027479603010538797, + "acc_norm": 0.18181818181818182, + "acc_norm_stderr": 0.027479603010538797 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.19689119170984457, + "acc_stderr": 0.028697873971860702, + "acc_norm": 0.19689119170984457, + "acc_norm_stderr": 0.028697873971860702 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.20512820512820512, + "acc_stderr": 0.020473233173551982, + "acc_norm": 0.20512820512820512, + "acc_norm_stderr": 0.020473233173551982 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.23703703703703705, + "acc_stderr": 0.02592887613276612, + "acc_norm": 0.23703703703703705, + "acc_norm_stderr": 0.02592887613276612 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.23529411764705882, + "acc_stderr": 0.027553614467863818, + "acc_norm": 0.23529411764705882, + "acc_norm_stderr": 0.027553614467863818 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.19205298013245034, + "acc_stderr": 0.032162984205936135, + "acc_norm": 0.19205298013245034, + "acc_norm_stderr": 0.032162984205936135 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.21467889908256882, + "acc_stderr": 0.01760430414925649, + "acc_norm": 0.21467889908256882, + "acc_norm_stderr": 0.01760430414925649 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.2638888888888889, + "acc_stderr": 0.03005820270430985, + "acc_norm": 0.2638888888888889, + "acc_norm_stderr": 0.03005820270430985 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.25, + "acc_stderr": 0.03039153369274154, + "acc_norm": 0.25, + "acc_norm_stderr": 0.03039153369274154 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.270042194092827, + "acc_stderr": 0.028900721906293426, + "acc_norm": 0.270042194092827, + "acc_norm_stderr": 0.028900721906293426 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.242152466367713, + "acc_stderr": 0.028751392398694755, + "acc_norm": 0.242152466367713, + "acc_norm_stderr": 0.028751392398694755 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.2595419847328244, + "acc_stderr": 
0.03844876139785271, + "acc_norm": 0.2595419847328244, + "acc_norm_stderr": 0.03844876139785271 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.2396694214876033, + "acc_stderr": 0.03896878985070416, + "acc_norm": 0.2396694214876033, + "acc_norm_stderr": 0.03896878985070416 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.26851851851851855, + "acc_stderr": 0.04284467968052192, + "acc_norm": 0.26851851851851855, + "acc_norm_stderr": 0.04284467968052192 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.22699386503067484, + "acc_stderr": 0.03291099578615767, + "acc_norm": 0.22699386503067484, + "acc_norm_stderr": 0.03291099578615767 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.30357142857142855, + "acc_stderr": 0.04364226155841043, + "acc_norm": 0.30357142857142855, + "acc_norm_stderr": 0.04364226155841043 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.17475728155339806, + "acc_stderr": 0.037601780060266224, + "acc_norm": 0.17475728155339806, + "acc_norm_stderr": 0.037601780060266224 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.23931623931623933, + "acc_stderr": 0.027951826808924333, + "acc_norm": 0.23931623931623933, + "acc_norm_stderr": 0.027951826808924333 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.3, + "acc_stderr": 0.046056618647183814, + "acc_norm": 0.3, + "acc_norm_stderr": 0.046056618647183814 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.2554278416347382, + "acc_stderr": 0.015594955384455772, + "acc_norm": 0.2554278416347382, + "acc_norm_stderr": 0.015594955384455772 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.20520231213872833, + "acc_stderr": 0.021742519835276287, + "acc_norm": 0.20520231213872833, + "acc_norm_stderr": 0.021742519835276287 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.23910614525139665, + "acc_stderr": 0.014265554192331144, + "acc_norm": 0.23910614525139665, + "acc_norm_stderr": 0.014265554192331144 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.21568627450980393, + "acc_stderr": 0.02355083135199509, + "acc_norm": 0.21568627450980393, + "acc_norm_stderr": 0.02355083135199509 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.1864951768488746, + "acc_stderr": 0.02212243977248077, + "acc_norm": 0.1864951768488746, + "acc_norm_stderr": 0.02212243977248077 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.2191358024691358, + "acc_stderr": 0.023016705640262203, + "acc_norm": 0.2191358024691358, + "acc_norm_stderr": 0.023016705640262203 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.24822695035460993, + "acc_stderr": 0.025770015644290392, + "acc_norm": 0.24822695035460993, + "acc_norm_stderr": 0.025770015644290392 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.24771838331160365, + "acc_stderr": 0.011025499291443738, + "acc_norm": 0.24771838331160365, + "acc_norm_stderr": 0.011025499291443738 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.21323529411764705, + "acc_stderr": 0.024880971512294275, + "acc_norm": 0.21323529411764705, + "acc_norm_stderr": 0.024880971512294275 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.2549019607843137, + "acc_stderr": 0.017630827375148383, + "acc_norm": 0.2549019607843137, + "acc_norm_stderr": 0.017630827375148383 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.2, + "acc_stderr": 0.03831305140884601, + "acc_norm": 0.2, + "acc_norm_stderr": 0.03831305140884601 + 
}, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.19183673469387755, + "acc_stderr": 0.025206963154225378, + "acc_norm": 0.19183673469387755, + "acc_norm_stderr": 0.025206963154225378 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.24378109452736318, + "acc_stderr": 0.03036049015401465, + "acc_norm": 0.24378109452736318, + "acc_norm_stderr": 0.03036049015401465 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.23, + "acc_stderr": 0.04229525846816508, + "acc_norm": 0.23, + "acc_norm_stderr": 0.04229525846816508 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.28313253012048195, + "acc_stderr": 0.03507295431370518, + "acc_norm": 0.28313253012048195, + "acc_norm_stderr": 0.03507295431370518 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.2222222222222222, + "acc_stderr": 0.03188578017686399, + "acc_norm": 0.2222222222222222, + "acc_norm_stderr": 0.03188578017686399 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.23378212974296206, + "mc1_stderr": 0.01481619599193159, + "mc2": 0.4693099566156165, + "mc2_stderr": 0.01667201792733067 + }, + "all": { + "acc": 0.2367148405069541, + "acc_stderr": 0.030958077810881182, + "acc_norm": 0.23838963087978138, + "acc_norm_stderr": 0.030974710079953026, + "mc1": 0.23378212974296206, + "mc1_stderr": 0.01481619599193159, + "mc2": 0.4693099566156165, + "mc2_stderr": 0.01667201792733067 + } + }, + "versions": { + "harness|arc:challenge|25": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + 
"harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "all": 0 + }, + "config_general": { + "model_name": "TheBloke/Vicuna-33B-1-3-SuperHOT-8K-fp16", + "model_sha": "0b6484697d5cca5baa534b882dcad8101add8cda", + "model_dtype": "torch.float16", + "lighteval_sha": "03c2fad20ff7f5334c33cfee459024b8d7e4a109", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null + }, + "config_tasks": { + "harness|arc:challenge": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness 
task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task" + }, + "summary_tasks": { + "harness|arc:challenge|25": { + "hashes": { + "hash_examples": "17b0cae357c0259e", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "3722289b79076c44", + "hash_cont_tokens": "8210decc6ff6f7df" + }, + "truncated": 0, + "non-truncated": 4687, + "padded": 4687, + "non-padded": 0, + "effective_few_shots": 25.0, + "num_truncated_few_shots": 0 + }, + "harness|hellaswag|10": { + "hashes": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "ececd684171f1ef2", + "hash_cont_tokens": "b3b9e9017afa63af" + }, + "truncated": 0, + "non-truncated": 40168, + "padded": 40113, + "non-padded": 55, + "effective_few_shots": 10.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hashes": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "c54ff61ad0273dd7", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-anatomy|5": { + "hashes": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "be31a1e22aef5f90", + "hash_cont_tokens": "f11971a765cb609f" + }, + "truncated": 0, + "non-truncated": 540, + "padded": 540, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-astronomy|5": { + "hashes": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "277a7b1fad566940", + "hash_cont_tokens": "bf30e5d3f48250cb" + }, + "truncated": 0, + "non-truncated": 608, + "padded": 608, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-business_ethics|5": { + "hashes": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "ba552605bc116de5", + "hash_cont_tokens": 
"bc1dd9b2d995eb61" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hashes": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "428c7563d0b98ab9", + "hash_cont_tokens": "890a119624b3b935" + }, + "truncated": 0, + "non-truncated": 1060, + "padded": 1060, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_biology|5": { + "hashes": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "da036601573942e2", + "hash_cont_tokens": "875cde3af7a0ee14" + }, + "truncated": 0, + "non-truncated": 576, + "padded": 576, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_chemistry|5": { + "hashes": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "94e0196d6aded13d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_computer_science|5": { + "hashes": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "6e4d0f4a8d36690b", + "hash_cont_tokens": "ffc0fe414cdc4a83" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_mathematics|5": { + "hashes": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "614054d17109a25d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_medicine|5": { + "hashes": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "081bb2b524defd1c", + "hash_cont_tokens": "1f88b00d41957d82" + }, + "truncated": 0, + "non-truncated": 692, + "padded": 692, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_physics|5": { + "hashes": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "5421d9a1af86cbd4", + "hash_cont_tokens": "f7b8097afc16a47c" + }, + "truncated": 0, + "non-truncated": 408, + "padded": 408, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-computer_security|5": { + "hashes": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "5e6b70ecb333cf18", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hashes": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "c2ef11a87264ceed", + "hash_cont_tokens": "aa0e8bc655f2f641" + }, + "truncated": 0, + "non-truncated": 940, + "padded": 940, + "non-padded": 0, + "effective_few_shots": 5.0, + 
"num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-econometrics|5": { + "hashes": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "ecaccd912a4c3978", + "hash_cont_tokens": "bfb7e3c3c88313f1" + }, + "truncated": 0, + "non-truncated": 456, + "padded": 456, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hashes": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "1590c84291399be8", + "hash_cont_tokens": "2425a3f084a591ef" + }, + "truncated": 0, + "non-truncated": 580, + "padded": 580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hashes": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "3269597f715b0da1", + "hash_cont_tokens": "f52691aef15a407b" + }, + "truncated": 0, + "non-truncated": 1512, + "padded": 1512, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-formal_logic|5": { + "hashes": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "a2800d20f3ab8d7c", + "hash_cont_tokens": "f515d598d9c21263" + }, + "truncated": 0, + "non-truncated": 504, + "padded": 504, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-global_facts|5": { + "hashes": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "94ed44b3772505ad", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_biology|5": { + "hashes": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "24423acb928db768", + "hash_cont_tokens": "bd85a4156a3613ee" + }, + "truncated": 0, + "non-truncated": 1240, + "padded": 1240, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hashes": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "831ff35c474e5cef", + "hash_cont_tokens": "a95c97af1c14e068" + }, + "truncated": 0, + "non-truncated": 812, + "padded": 812, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hashes": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "a20a96b44dcc5b30", + "hash_cont_tokens": "8abfedef914e33c9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hashes": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "5002f4ac8b1562ca", + "hash_cont_tokens": "674fc454bdc5ac93" + }, + "truncated": 0, + "non-truncated": 660, + "padded": 656, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_geography|5": { + "hashes": { + "hash_examples": 
"b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "7c5547c7da5bc793", + "hash_cont_tokens": "03a5012b916274ea" + }, + "truncated": 0, + "non-truncated": 792, + "padded": 792, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hashes": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "f62991cb6a496b05", + "hash_cont_tokens": "a83effb8f76b7d7c" + }, + "truncated": 0, + "non-truncated": 772, + "padded": 772, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hashes": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "4cef2aff6e3d59ed", + "hash_cont_tokens": "c583432ad27fcfe0" + }, + "truncated": 0, + "non-truncated": 1560, + "padded": 1560, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hashes": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "6e2577ea4082ed2b", + "hash_cont_tokens": "24f5dc613660300b" + }, + "truncated": 0, + "non-truncated": 1080, + "padded": 1080, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hashes": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "c5fc9aeb1079c8e4", + "hash_cont_tokens": "f47f041de50333b9" + }, + "truncated": 0, + "non-truncated": 952, + "padded": 952, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_physics|5": { + "hashes": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "555fc385cffa84ca", + "hash_cont_tokens": "ba2efcd283e938cc" + }, + "truncated": 0, + "non-truncated": 604, + "padded": 604, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hashes": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "febd23cbf9973b7f", + "hash_cont_tokens": "942069cd363844d9" + }, + "truncated": 0, + "non-truncated": 2180, + "padded": 2180, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hashes": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "400e55b56ee6fbd7", + "hash_cont_tokens": "955ed42b6f7fa019" + }, + "truncated": 0, + "non-truncated": 864, + "padded": 864, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hashes": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "c639cce12a46ebad", + "hash_cont_tokens": "cdd0b3dc06d933e5" + }, + "truncated": 0, + "non-truncated": 816, + "padded": 816, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hashes": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + 
"hash_input_tokens": "b9762065cce6f3a6", + "hash_cont_tokens": "9a864184946033ac" + }, + "truncated": 0, + "non-truncated": 948, + "padded": 948, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_aging|5": { + "hashes": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "541a75f071dcf579", + "hash_cont_tokens": "142a4a8a1138a214" + }, + "truncated": 0, + "non-truncated": 892, + "padded": 892, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_sexuality|5": { + "hashes": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "04269e5c5a257dd9", + "hash_cont_tokens": "bc54813e809b796d" + }, + "truncated": 0, + "non-truncated": 524, + "padded": 524, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-international_law|5": { + "hashes": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "d93ba9d9d38e4397", + "hash_cont_tokens": "dc45b45fcda18e5d" + }, + "truncated": 0, + "non-truncated": 484, + "padded": 484, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-jurisprudence|5": { + "hashes": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "9eeaccd2698b4f5a", + "hash_cont_tokens": "e3a8cd951b6e3469" + }, + "truncated": 0, + "non-truncated": 432, + "padded": 432, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hashes": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "b4f08f544f2b7576", + "hash_cont_tokens": "1e80dbd30f6453d5" + }, + "truncated": 0, + "non-truncated": 652, + "padded": 648, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-machine_learning|5": { + "hashes": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "900c2a51f1174b9f", + "hash_cont_tokens": "9b37da7777378ca9" + }, + "truncated": 0, + "non-truncated": 448, + "padded": 448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-management|5": { + "hashes": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "6b36efb4689c6eca", + "hash_cont_tokens": "a01d6d39a83c4597" + }, + "truncated": 0, + "non-truncated": 412, + "padded": 412, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-marketing|5": { + "hashes": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "2aaac78a0cfed47a", + "hash_cont_tokens": "6aeaed4d823c98aa" + }, + "truncated": 0, + "non-truncated": 936, + "padded": 936, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-medical_genetics|5": { + "hashes": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "886ca823b41c094a", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + 
"effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-miscellaneous|5": { + "hashes": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "72fd71de7675e7d0", + "hash_cont_tokens": "9b0ab02a64603081" + }, + "truncated": 0, + "non-truncated": 3132, + "padded": 3132, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_disputes|5": { + "hashes": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "f3ca0dd8e7a1eb09", + "hash_cont_tokens": "8badf768f7b0467a" + }, + "truncated": 0, + "non-truncated": 1384, + "padded": 1354, + "non-padded": 30, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hashes": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "3e793631e951f23c", + "hash_cont_tokens": "32ae620376b2bbba" + }, + "truncated": 0, + "non-truncated": 3580, + "padded": 3580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-nutrition|5": { + "hashes": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "59753c2144ea93af", + "hash_cont_tokens": "3071def75bacc404" + }, + "truncated": 0, + "non-truncated": 1224, + "padded": 1224, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-philosophy|5": { + "hashes": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "bd8d3dbed15a8c34", + "hash_cont_tokens": "9f6ff69d23a48783" + }, + "truncated": 0, + "non-truncated": 1244, + "padded": 1244, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-prehistory|5": { + "hashes": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "3573cd87facbb7c5", + "hash_cont_tokens": "de469d2b981e32a3" + }, + "truncated": 0, + "non-truncated": 1296, + "padded": 1296, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_accounting|5": { + "hashes": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "17e721bc1a7cbb47", + "hash_cont_tokens": "c46f74d2dfc7b13b" + }, + "truncated": 0, + "non-truncated": 1128, + "padded": 1128, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_law|5": { + "hashes": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "c9f7583fff66d361", + "hash_cont_tokens": "2e590029ef41fbcd" + }, + "truncated": 0, + "non-truncated": 6136, + "padded": 6136, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_medicine|5": { + "hashes": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "40a933f829116f8d", + "hash_cont_tokens": "fe35cfa9c6ca802e" + }, + "truncated": 0, + "non-truncated": 1088, + "padded": 1088, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_psychology|5": { + "hashes": { + "hash_examples": 
"d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "0dfb73a8eb3f692c", + "hash_cont_tokens": "f020fbddf72c8652" + }, + "truncated": 0, + "non-truncated": 2448, + "padded": 2448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-public_relations|5": { + "hashes": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "1710c6ba4c9f3cbd", + "hash_cont_tokens": "568f585a259965c1" + }, + "truncated": 0, + "non-truncated": 440, + "padded": 440, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-security_studies|5": { + "hashes": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "32a03f1f22a6e103", + "hash_cont_tokens": "cc6fd7cccd64cd5d" + }, + "truncated": 0, + "non-truncated": 980, + "padded": 980, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-sociology|5": { + "hashes": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "828999f7624cbe7e", + "hash_cont_tokens": "c3a3bdfd177eed5b" + }, + "truncated": 0, + "non-truncated": 804, + "padded": 804, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hashes": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "42054621e718dbee", + "hash_cont_tokens": "2568d0e8e36fa959" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-virology|5": { + "hashes": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "6c4f0aa4dc859c04", + "hash_cont_tokens": "926cf60b0891f374" + }, + "truncated": 0, + "non-truncated": 664, + "padded": 664, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-world_religions|5": { + "hashes": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "6c75d44e092ff24f", + "hash_cont_tokens": "c525a5de974c1ea3" + }, + "truncated": 0, + "non-truncated": 684, + "padded": 684, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|truthfulqa:mc|0": { + "hashes": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "2738d7ed7075faa7", + "hash_cont_tokens": "c014154380b74b9e" + }, + "truncated": 0, + "non-truncated": 9996, + "padded": 9996, + "non-padded": 0, + "effective_few_shots": 0.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "d84d18e9a963753d", + "hash_full_prompts": "12b540783521a8e6", + "hash_input_tokens": "5c73a7dce6ccf737", + "hash_cont_tokens": "fb1646e2bdd5fc38" + }, + "total_evaluation_time_secondes": "13024.7011013031", + "truncated": 0, + "non-truncated": 111019, + "padded": 110926, + "non-padded": 93, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/TheBloke/Wizard-Vicuna-13B-Uncensored-GPTQ/results_2023-08-21T20-32-48.968327.json b/eval-results/TheBloke/Wizard-Vicuna-13B-Uncensored-GPTQ/results_2023-08-21T20-32-48.968327.json new file mode 100644 
index 0000000000000000000000000000000000000000..882ed1aa095ce3c0ed9e17b3f821e5b728146c1d --- /dev/null +++ b/eval-results/TheBloke/Wizard-Vicuna-13B-Uncensored-GPTQ/results_2023-08-21T20-32-48.968327.json @@ -0,0 +1,1365 @@ +{ + "results": { + "harness|arc:challenge|25": { + "acc": 0.23976109215017063, + "acc_stderr": 0.012476304127453944, + "acc_norm": 0.2960750853242321, + "acc_norm_stderr": 0.013340916085246261 + }, + "harness|hellaswag|10": { + "acc": 0.25721967735510853, + "acc_stderr": 0.004362081806560237, + "acc_norm": 0.25473013343955386, + "acc_norm_stderr": 0.004348189459336531 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.25, + "acc_stderr": 0.04351941398892446, + "acc_norm": 0.25, + "acc_norm_stderr": 0.04351941398892446 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.2222222222222222, + "acc_stderr": 0.035914440841969694, + "acc_norm": 0.2222222222222222, + "acc_norm_stderr": 0.035914440841969694 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.2236842105263158, + "acc_stderr": 0.033911609343436025, + "acc_norm": 0.2236842105263158, + "acc_norm_stderr": 0.033911609343436025 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.24, + "acc_stderr": 0.04292346959909284, + "acc_norm": 0.24, + "acc_norm_stderr": 0.04292346959909284 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.3283018867924528, + "acc_stderr": 0.028901593612411784, + "acc_norm": 0.3283018867924528, + "acc_norm_stderr": 0.028901593612411784 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.24305555555555555, + "acc_stderr": 0.03586879280080341, + "acc_norm": 0.24305555555555555, + "acc_norm_stderr": 0.03586879280080341 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.29, + "acc_stderr": 0.045604802157206824, + "acc_norm": 0.29, + "acc_norm_stderr": 0.045604802157206824 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.28, + "acc_stderr": 0.045126085985421276, + "acc_norm": 0.28, + "acc_norm_stderr": 0.045126085985421276 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.16, + "acc_stderr": 0.03684529491774708, + "acc_norm": 0.16, + "acc_norm_stderr": 0.03684529491774708 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.30057803468208094, + "acc_stderr": 0.0349610148119118, + "acc_norm": 0.30057803468208094, + "acc_norm_stderr": 0.0349610148119118 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.2647058823529412, + "acc_stderr": 0.04389869956808778, + "acc_norm": 0.2647058823529412, + "acc_norm_stderr": 0.04389869956808778 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.21, + "acc_stderr": 0.040936018074033256, + "acc_norm": 0.21, + "acc_norm_stderr": 0.040936018074033256 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.2297872340425532, + "acc_stderr": 0.02750175294441242, + "acc_norm": 0.2297872340425532, + "acc_norm_stderr": 0.02750175294441242 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.21052631578947367, + "acc_stderr": 0.038351539543994215, + "acc_norm": 0.21052631578947367, + "acc_norm_stderr": 0.038351539543994215 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.1793103448275862, + "acc_stderr": 0.03196766433373186, + "acc_norm": 0.1793103448275862, + "acc_norm_stderr": 0.03196766433373186 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.28835978835978837, + "acc_stderr": 0.023330654054535882, + "acc_norm": 0.28835978835978837, + "acc_norm_stderr": 
0.023330654054535882 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.3412698412698413, + "acc_stderr": 0.042407993275749234, + "acc_norm": 0.3412698412698413, + "acc_norm_stderr": 0.042407993275749234 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.19, + "acc_stderr": 0.03942772444036623, + "acc_norm": 0.19, + "acc_norm_stderr": 0.03942772444036623 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.2838709677419355, + "acc_stderr": 0.025649381063029247, + "acc_norm": 0.2838709677419355, + "acc_norm_stderr": 0.025649381063029247 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.2315270935960591, + "acc_stderr": 0.02967833314144444, + "acc_norm": 0.2315270935960591, + "acc_norm_stderr": 0.02967833314144444 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.27, + "acc_stderr": 0.04461960433384741, + "acc_norm": 0.27, + "acc_norm_stderr": 0.04461960433384741 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.28484848484848485, + "acc_stderr": 0.03524390844511784, + "acc_norm": 0.28484848484848485, + "acc_norm_stderr": 0.03524390844511784 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.2676767676767677, + "acc_stderr": 0.03154449888270286, + "acc_norm": 0.2676767676767677, + "acc_norm_stderr": 0.03154449888270286 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.24352331606217617, + "acc_stderr": 0.030975436386845436, + "acc_norm": 0.24352331606217617, + "acc_norm_stderr": 0.030975436386845436 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.3153846153846154, + "acc_stderr": 0.02355964698318994, + "acc_norm": 0.3153846153846154, + "acc_norm_stderr": 0.02355964698318994 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.22592592592592592, + "acc_stderr": 0.02549753263960955, + "acc_norm": 0.22592592592592592, + "acc_norm_stderr": 0.02549753263960955 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.3025210084033613, + "acc_stderr": 0.02983796238829193, + "acc_norm": 0.3025210084033613, + "acc_norm_stderr": 0.02983796238829193 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.2251655629139073, + "acc_stderr": 0.03410435282008937, + "acc_norm": 0.2251655629139073, + "acc_norm_stderr": 0.03410435282008937 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.24587155963302754, + "acc_stderr": 0.018461940968708457, + "acc_norm": 0.24587155963302754, + "acc_norm_stderr": 0.018461940968708457 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.3194444444444444, + "acc_stderr": 0.031798763421768524, + "acc_norm": 0.3194444444444444, + "acc_norm_stderr": 0.031798763421768524 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.2549019607843137, + "acc_stderr": 0.030587591351604246, + "acc_norm": 0.2549019607843137, + "acc_norm_stderr": 0.030587591351604246 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.2742616033755274, + "acc_stderr": 0.029041333510598025, + "acc_norm": 0.2742616033755274, + "acc_norm_stderr": 0.029041333510598025 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.2825112107623318, + "acc_stderr": 0.030216831011508766, + "acc_norm": 0.2825112107623318, + "acc_norm_stderr": 0.030216831011508766 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.3053435114503817, + "acc_stderr": 0.04039314978724561, + "acc_norm": 0.3053435114503817, + 
"acc_norm_stderr": 0.04039314978724561 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.2396694214876033, + "acc_stderr": 0.03896878985070416, + "acc_norm": 0.2396694214876033, + "acc_norm_stderr": 0.03896878985070416 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.2777777777777778, + "acc_stderr": 0.0433004374965074, + "acc_norm": 0.2777777777777778, + "acc_norm_stderr": 0.0433004374965074 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.20245398773006135, + "acc_stderr": 0.03157065078911902, + "acc_norm": 0.20245398773006135, + "acc_norm_stderr": 0.03157065078911902 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.2767857142857143, + "acc_stderr": 0.042466243366976256, + "acc_norm": 0.2767857142857143, + "acc_norm_stderr": 0.042466243366976256 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.34951456310679613, + "acc_stderr": 0.047211885060971716, + "acc_norm": 0.34951456310679613, + "acc_norm_stderr": 0.047211885060971716 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.2222222222222222, + "acc_stderr": 0.027236013946196666, + "acc_norm": 0.2222222222222222, + "acc_norm_stderr": 0.027236013946196666 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.24, + "acc_stderr": 0.042923469599092816, + "acc_norm": 0.24, + "acc_norm_stderr": 0.042923469599092816 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.25287356321839083, + "acc_stderr": 0.01554337731371968, + "acc_norm": 0.25287356321839083, + "acc_norm_stderr": 0.01554337731371968 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.22832369942196531, + "acc_stderr": 0.02259870380432164, + "acc_norm": 0.22832369942196531, + "acc_norm_stderr": 0.02259870380432164 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.2581005586592179, + "acc_stderr": 0.014635185616527827, + "acc_norm": 0.2581005586592179, + "acc_norm_stderr": 0.014635185616527827 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.20261437908496732, + "acc_stderr": 0.023015446877985662, + "acc_norm": 0.20261437908496732, + "acc_norm_stderr": 0.023015446877985662 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.2604501607717042, + "acc_stderr": 0.02492672322484555, + "acc_norm": 0.2604501607717042, + "acc_norm_stderr": 0.02492672322484555 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.20987654320987653, + "acc_stderr": 0.02265834408598137, + "acc_norm": 0.20987654320987653, + "acc_norm_stderr": 0.02265834408598137 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.28368794326241137, + "acc_stderr": 0.02689170942834396, + "acc_norm": 0.28368794326241137, + "acc_norm_stderr": 0.02689170942834396 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.2633637548891786, + "acc_stderr": 0.011249506403605282, + "acc_norm": 0.2633637548891786, + "acc_norm_stderr": 0.011249506403605282 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.22794117647058823, + "acc_stderr": 0.025483081468029804, + "acc_norm": 0.22794117647058823, + "acc_norm_stderr": 0.025483081468029804 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.2565359477124183, + "acc_stderr": 0.017667841612379, + "acc_norm": 0.2565359477124183, + "acc_norm_stderr": 0.017667841612379 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.2545454545454545, + "acc_stderr": 0.04172343038705383, + "acc_norm": 0.2545454545454545, + "acc_norm_stderr": 0.04172343038705383 + }, + 
"harness|hendrycksTest-security_studies|5": { + "acc": 0.2612244897959184, + "acc_stderr": 0.028123429335142787, + "acc_norm": 0.2612244897959184, + "acc_norm_stderr": 0.028123429335142787 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.22885572139303484, + "acc_stderr": 0.029705284056772436, + "acc_norm": 0.22885572139303484, + "acc_norm_stderr": 0.029705284056772436 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.27, + "acc_stderr": 0.044619604333847394, + "acc_norm": 0.27, + "acc_norm_stderr": 0.044619604333847394 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.22289156626506024, + "acc_stderr": 0.03240004825594687, + "acc_norm": 0.22289156626506024, + "acc_norm_stderr": 0.03240004825594687 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.1695906432748538, + "acc_stderr": 0.028782108105401712, + "acc_norm": 0.1695906432748538, + "acc_norm_stderr": 0.028782108105401712 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.2350061199510404, + "mc1_stderr": 0.014843061507731606, + "mc2": 0.5024971327286255, + "mc2_stderr": 0.01690273314078156 + }, + "all": { + "acc": 0.2532349052121112, + "acc_stderr": 0.03157878867428685, + "acc_norm": 0.2541471840131029, + "acc_norm_stderr": 0.03159320765073717, + "mc1": 0.2350061199510404, + "mc1_stderr": 0.014843061507731606, + "mc2": 0.5024971327286255, + "mc2_stderr": 0.01690273314078156 + } + }, + "versions": { + "harness|arc:challenge|25": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + 
"harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "all": 0 + }, + "config_general": { + "model_name": "TheBloke/Wizard-Vicuna-13B-Uncensored-GPTQ", + "model_sha": "d9b00ec47ae3546398432f0693fe2d5d92bf143b", + "model_dtype": "torch.float16", + "lighteval_sha": "9a6ba3212080b87510982c30fdec55b87dcab0c7", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null + }, + "config_tasks": { + "harness|arc:challenge": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM 
Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task" + }, + "summary_tasks": { + "harness|arc:challenge|25": { + "hashes": { + "hash_examples": "17b0cae357c0259e", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "61571bf68d6d89aa", + "hash_cont_tokens": "8210decc6ff6f7df" + }, + "truncated": 0, + "non-truncated": 4687, + "padded": 4687, + "non-padded": 0, + "effective_few_shots": 25.0, + "num_truncated_few_shots": 0 + }, + "harness|hellaswag|10": { + "hashes": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "29906669b1c7054a", + "hash_cont_tokens": "b3b9e9017afa63af" + }, + "truncated": 0, + "non-truncated": 40168, + "padded": 40113, + "non-padded": 55, + "effective_few_shots": 10.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hashes": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "c54ff61ad0273dd7", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-anatomy|5": { + "hashes": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "be31a1e22aef5f90", + "hash_cont_tokens": "f11971a765cb609f" + }, + "truncated": 0, + "non-truncated": 540, + "padded": 540, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-astronomy|5": { + "hashes": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "277a7b1fad566940", + "hash_cont_tokens": "bf30e5d3f48250cb" + }, + "truncated": 0, + "non-truncated": 608, + "padded": 608, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-business_ethics|5": { + "hashes": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "ba552605bc116de5", + "hash_cont_tokens": 
"bc1dd9b2d995eb61" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hashes": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "428c7563d0b98ab9", + "hash_cont_tokens": "890a119624b3b935" + }, + "truncated": 0, + "non-truncated": 1060, + "padded": 1060, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_biology|5": { + "hashes": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "da036601573942e2", + "hash_cont_tokens": "875cde3af7a0ee14" + }, + "truncated": 0, + "non-truncated": 576, + "padded": 576, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_chemistry|5": { + "hashes": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "94e0196d6aded13d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_computer_science|5": { + "hashes": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "6e4d0f4a8d36690b", + "hash_cont_tokens": "ffc0fe414cdc4a83" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_mathematics|5": { + "hashes": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "614054d17109a25d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_medicine|5": { + "hashes": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "1d633b3cc0524ba8", + "hash_cont_tokens": "1f88b00d41957d82" + }, + "truncated": 0, + "non-truncated": 692, + "padded": 692, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_physics|5": { + "hashes": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "5421d9a1af86cbd4", + "hash_cont_tokens": "f7b8097afc16a47c" + }, + "truncated": 0, + "non-truncated": 408, + "padded": 408, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-computer_security|5": { + "hashes": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "5e6b70ecb333cf18", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hashes": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "c2ef11a87264ceed", + "hash_cont_tokens": "aa0e8bc655f2f641" + }, + "truncated": 0, + "non-truncated": 940, + "padded": 940, + "non-padded": 0, + "effective_few_shots": 5.0, + 
"num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-econometrics|5": { + "hashes": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "ecaccd912a4c3978", + "hash_cont_tokens": "bfb7e3c3c88313f1" + }, + "truncated": 0, + "non-truncated": 456, + "padded": 456, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hashes": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "1590c84291399be8", + "hash_cont_tokens": "2425a3f084a591ef" + }, + "truncated": 0, + "non-truncated": 580, + "padded": 580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hashes": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "3269597f715b0da1", + "hash_cont_tokens": "f52691aef15a407b" + }, + "truncated": 0, + "non-truncated": 1512, + "padded": 1512, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-formal_logic|5": { + "hashes": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "a2800d20f3ab8d7c", + "hash_cont_tokens": "f515d598d9c21263" + }, + "truncated": 0, + "non-truncated": 504, + "padded": 504, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-global_facts|5": { + "hashes": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "94ed44b3772505ad", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_biology|5": { + "hashes": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "24423acb928db768", + "hash_cont_tokens": "bd85a4156a3613ee" + }, + "truncated": 0, + "non-truncated": 1240, + "padded": 1240, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hashes": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "831ff35c474e5cef", + "hash_cont_tokens": "a95c97af1c14e068" + }, + "truncated": 0, + "non-truncated": 812, + "padded": 812, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hashes": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "8c34e0f2bda77358", + "hash_cont_tokens": "8abfedef914e33c9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hashes": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "f1f73dd687da18d7", + "hash_cont_tokens": "674fc454bdc5ac93" + }, + "truncated": 660, + "non-truncated": 0, + "padded": 0, + "non-padded": 660, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_geography|5": { + "hashes": { + "hash_examples": 
"b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "7c5547c7da5bc793", + "hash_cont_tokens": "03a5012b916274ea" + }, + "truncated": 0, + "non-truncated": 792, + "padded": 792, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hashes": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "f62991cb6a496b05", + "hash_cont_tokens": "a83effb8f76b7d7c" + }, + "truncated": 0, + "non-truncated": 772, + "padded": 772, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hashes": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "4cef2aff6e3d59ed", + "hash_cont_tokens": "c583432ad27fcfe0" + }, + "truncated": 0, + "non-truncated": 1560, + "padded": 1560, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hashes": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "6e2577ea4082ed2b", + "hash_cont_tokens": "24f5dc613660300b" + }, + "truncated": 0, + "non-truncated": 1080, + "padded": 1080, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hashes": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "c5fc9aeb1079c8e4", + "hash_cont_tokens": "f47f041de50333b9" + }, + "truncated": 0, + "non-truncated": 952, + "padded": 952, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_physics|5": { + "hashes": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "555fc385cffa84ca", + "hash_cont_tokens": "ba2efcd283e938cc" + }, + "truncated": 0, + "non-truncated": 604, + "padded": 604, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hashes": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "febd23cbf9973b7f", + "hash_cont_tokens": "942069cd363844d9" + }, + "truncated": 0, + "non-truncated": 2180, + "padded": 2180, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hashes": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "424b02981230ee83", + "hash_cont_tokens": "955ed42b6f7fa019" + }, + "truncated": 0, + "non-truncated": 864, + "padded": 864, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hashes": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "50c9ff438c85a69e", + "hash_cont_tokens": "cdd0b3dc06d933e5" + }, + "truncated": 816, + "non-truncated": 0, + "padded": 0, + "non-padded": 816, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hashes": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + 
"hash_input_tokens": "054824cc474caef5", + "hash_cont_tokens": "9a864184946033ac" + }, + "truncated": 8, + "non-truncated": 940, + "padded": 940, + "non-padded": 8, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_aging|5": { + "hashes": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "541a75f071dcf579", + "hash_cont_tokens": "142a4a8a1138a214" + }, + "truncated": 0, + "non-truncated": 892, + "padded": 892, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_sexuality|5": { + "hashes": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "04269e5c5a257dd9", + "hash_cont_tokens": "bc54813e809b796d" + }, + "truncated": 0, + "non-truncated": 524, + "padded": 524, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-international_law|5": { + "hashes": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "d93ba9d9d38e4397", + "hash_cont_tokens": "dc45b45fcda18e5d" + }, + "truncated": 0, + "non-truncated": 484, + "padded": 484, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-jurisprudence|5": { + "hashes": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "9eeaccd2698b4f5a", + "hash_cont_tokens": "e3a8cd951b6e3469" + }, + "truncated": 0, + "non-truncated": 432, + "padded": 432, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hashes": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "b4f08f544f2b7576", + "hash_cont_tokens": "1e80dbd30f6453d5" + }, + "truncated": 0, + "non-truncated": 652, + "padded": 648, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-machine_learning|5": { + "hashes": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "900c2a51f1174b9f", + "hash_cont_tokens": "9b37da7777378ca9" + }, + "truncated": 0, + "non-truncated": 448, + "padded": 448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-management|5": { + "hashes": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "6b36efb4689c6eca", + "hash_cont_tokens": "a01d6d39a83c4597" + }, + "truncated": 0, + "non-truncated": 412, + "padded": 412, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-marketing|5": { + "hashes": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "2aaac78a0cfed47a", + "hash_cont_tokens": "6aeaed4d823c98aa" + }, + "truncated": 0, + "non-truncated": 936, + "padded": 936, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-medical_genetics|5": { + "hashes": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "886ca823b41c094a", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + 
"effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-miscellaneous|5": { + "hashes": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "72fd71de7675e7d0", + "hash_cont_tokens": "9b0ab02a64603081" + }, + "truncated": 0, + "non-truncated": 3132, + "padded": 3132, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_disputes|5": { + "hashes": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "f3ca0dd8e7a1eb09", + "hash_cont_tokens": "8badf768f7b0467a" + }, + "truncated": 0, + "non-truncated": 1384, + "padded": 1354, + "non-padded": 30, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hashes": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "3e793631e951f23c", + "hash_cont_tokens": "32ae620376b2bbba" + }, + "truncated": 0, + "non-truncated": 3580, + "padded": 3580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-nutrition|5": { + "hashes": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "59753c2144ea93af", + "hash_cont_tokens": "3071def75bacc404" + }, + "truncated": 0, + "non-truncated": 1224, + "padded": 1224, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-philosophy|5": { + "hashes": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "bd8d3dbed15a8c34", + "hash_cont_tokens": "9f6ff69d23a48783" + }, + "truncated": 0, + "non-truncated": 1244, + "padded": 1244, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-prehistory|5": { + "hashes": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "3573cd87facbb7c5", + "hash_cont_tokens": "de469d2b981e32a3" + }, + "truncated": 0, + "non-truncated": 1296, + "padded": 1296, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_accounting|5": { + "hashes": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "17e721bc1a7cbb47", + "hash_cont_tokens": "c46f74d2dfc7b13b" + }, + "truncated": 0, + "non-truncated": 1128, + "padded": 1128, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_law|5": { + "hashes": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "9178e10bd0763ec4", + "hash_cont_tokens": "2e590029ef41fbcd" + }, + "truncated": 604, + "non-truncated": 5532, + "padded": 5524, + "non-padded": 612, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_medicine|5": { + "hashes": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "f5a22012a54f70ea", + "hash_cont_tokens": "fe35cfa9c6ca802e" + }, + "truncated": 0, + "non-truncated": 1088, + "padded": 1088, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_psychology|5": { + "hashes": { + "hash_examples": 
"d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "0dfb73a8eb3f692c", + "hash_cont_tokens": "f020fbddf72c8652" + }, + "truncated": 0, + "non-truncated": 2448, + "padded": 2448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-public_relations|5": { + "hashes": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "1710c6ba4c9f3cbd", + "hash_cont_tokens": "568f585a259965c1" + }, + "truncated": 0, + "non-truncated": 440, + "padded": 440, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-security_studies|5": { + "hashes": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "d49711415961ced7", + "hash_cont_tokens": "cc6fd7cccd64cd5d" + }, + "truncated": 0, + "non-truncated": 980, + "padded": 980, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-sociology|5": { + "hashes": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "828999f7624cbe7e", + "hash_cont_tokens": "c3a3bdfd177eed5b" + }, + "truncated": 0, + "non-truncated": 804, + "padded": 804, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hashes": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "42054621e718dbee", + "hash_cont_tokens": "2568d0e8e36fa959" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-virology|5": { + "hashes": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "6c4f0aa4dc859c04", + "hash_cont_tokens": "926cf60b0891f374" + }, + "truncated": 0, + "non-truncated": 664, + "padded": 664, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-world_religions|5": { + "hashes": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "6c75d44e092ff24f", + "hash_cont_tokens": "c525a5de974c1ea3" + }, + "truncated": 0, + "non-truncated": 684, + "padded": 684, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|truthfulqa:mc|0": { + "hashes": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "2738d7ed7075faa7", + "hash_cont_tokens": "c014154380b74b9e" + }, + "truncated": 0, + "non-truncated": 9996, + "padded": 9996, + "non-padded": 0, + "effective_few_shots": 0.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "d84d18e9a963753d", + "hash_full_prompts": "12b540783521a8e6", + "hash_input_tokens": "6fecf578c508db6a", + "hash_cont_tokens": "fb1646e2bdd5fc38" + }, + "total_evaluation_time_secondes": "4683.2806622982025", + "truncated": 2088, + "non-truncated": 108931, + "padded": 108834, + "non-padded": 2185, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/TheBloke/Wizard-Vicuna-13B-Uncensored-GPTQ/results_2023-11-07T13-33-21.987098.json b/eval-results/TheBloke/Wizard-Vicuna-13B-Uncensored-GPTQ/results_2023-11-07T13-33-21.987098.json new file mode 
100644 index 0000000000000000000000000000000000000000..8d00db6c4dc9f07248bf7af576fd8ff43e49ed5c --- /dev/null +++ b/eval-results/TheBloke/Wizard-Vicuna-13B-Uncensored-GPTQ/results_2023-11-07T13-33-21.987098.json @@ -0,0 +1,107 @@ +{ + "config_general": { + "lighteval_sha": "167773f1d5d1647c60dadc31c9e731ab7dbcbbad", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "", + "model_name": "TheBloke/Wizard-Vicuna-13B-Uncensored-GPTQ", + "model_sha": "c322bec72dabdbf1fe4a8196b0234694b316a1a7", + "model_dtype": "torch.float16", + "model_size": "6.8 GB" + }, + "results": { + "harness|drop|3": { + "em": 0.07581795302013423, + "em_stderr": 0.0027108434788949637, + "f1": 0.14551698825503265, + "f1_stderr": 0.0029781067344765754 + }, + "harness|gsm8k|5": { + "acc": 0.09931766489764973, + "acc_stderr": 0.008238371412683973 + }, + "harness|winogrande|5": { + "acc": 0.7576953433307024, + "acc_stderr": 0.012042352526174787 + }, + "all": { + "em": 0.07581795302013423, + "em_stderr": 0.0027108434788949637, + "f1": 0.14551698825503265, + "f1_stderr": 0.0029781067344765754, + "acc": 0.42850650411417607, + "acc_stderr": 0.010140361969429381 + } + }, + "versions": { + "all": 0, + "harness|drop|3": 1, + "harness|gsm8k|5": 0, + "harness|winogrande|5": 0 + }, + "config_tasks": { + "harness|drop": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|drop|3": { + "hashes": { + "hash_examples": "1d27416e8324e9a3", + "hash_full_prompts": "a5513ff9a741b385", + "hash_input_tokens": "61b608e0b5ceed76", + "hash_cont_tokens": "0ee6629fc6a91da8" + }, + "truncated": 1263, + "non_truncated": 8273, + "padded": 0, + "non_padded": 9536, + "effective_few_shots": 3.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "bda342e47b5099b2", + "hash_cont_tokens": "dc55f4a29f149379" + }, + "truncated": 0, + "non_truncated": 1319, + "padded": 0, + "non_padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "c0bedf98cb040854", + "hash_cont_tokens": "f08975ad6f2d5864" + }, + "truncated": 0, + "non_truncated": 1267, + "padded": 2432, + "non_padded": 102, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "9b4d8993161e637d", + "hash_full_prompts": "08215e527b7e60a5", + "hash_input_tokens": "80afe720f936f8d2", + "hash_cont_tokens": "e3ddc2074581a43b" + }, + "truncated": 1263, + "non_truncated": 10859, + "padded": 2432, + "non_padded": 10957, + "num_truncated_few_shots": 0, + "total_evaluation_time_secondes": 0 + } +} \ No newline at end of file diff --git a/eval-results/TheBloke/Wizard-Vicuna-13B-Uncensored-HF/results_2023-07-18T16-17-31.150663.json b/eval-results/TheBloke/Wizard-Vicuna-13B-Uncensored-HF/results_2023-07-18T16-17-31.150663.json new file mode 100644 index 0000000000000000000000000000000000000000..02eae6133d44dfd0c786d5dd223e16bf47e4ffd6 --- /dev/null +++ b/eval-results/TheBloke/Wizard-Vicuna-13B-Uncensored-HF/results_2023-07-18T16-17-31.150663.json @@ -0,0 +1,871 @@ +{ + "results": { + "harness|arc:challenge|25": { + "acc": 0.5656996587030717, + "acc_stderr": 0.01448470304885736, + "acc_norm": 
0.5895904436860068, + "acc_norm_stderr": 0.014374922192642666 + }, + "harness|hellaswag|10": { + "acc": 0.6218880701055567, + "acc_stderr": 0.0048392473326060404, + "acc_norm": 0.8194582752439753, + "acc_norm_stderr": 0.0038385193358868794 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.3, + "acc_stderr": 0.046056618647183814, + "acc_norm": 0.3, + "acc_norm_stderr": 0.046056618647183814 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.4666666666666667, + "acc_stderr": 0.043097329010363554, + "acc_norm": 0.4666666666666667, + "acc_norm_stderr": 0.043097329010363554 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.47368421052631576, + "acc_stderr": 0.04063302731486671, + "acc_norm": 0.47368421052631576, + "acc_norm_stderr": 0.04063302731486671 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.49, + "acc_stderr": 0.05024183937956912, + "acc_norm": 0.49, + "acc_norm_stderr": 0.05024183937956912 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.4716981132075472, + "acc_stderr": 0.030723535249006107, + "acc_norm": 0.4716981132075472, + "acc_norm_stderr": 0.030723535249006107 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.4861111111111111, + "acc_stderr": 0.041795966175810016, + "acc_norm": 0.4861111111111111, + "acc_norm_stderr": 0.041795966175810016 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.3, + "acc_stderr": 0.046056618647183814, + "acc_norm": 0.3, + "acc_norm_stderr": 0.046056618647183814 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.41, + "acc_stderr": 0.049431107042371025, + "acc_norm": 0.41, + "acc_norm_stderr": 0.049431107042371025 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.32, + "acc_stderr": 0.04688261722621505, + "acc_norm": 0.32, + "acc_norm_stderr": 0.04688261722621505 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.4046242774566474, + "acc_stderr": 0.03742461193887248, + "acc_norm": 0.4046242774566474, + "acc_norm_stderr": 0.03742461193887248 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.23529411764705882, + "acc_stderr": 0.042207736591714534, + "acc_norm": 0.23529411764705882, + "acc_norm_stderr": 0.042207736591714534 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.59, + "acc_stderr": 0.049431107042371025, + "acc_norm": 0.59, + "acc_norm_stderr": 0.049431107042371025 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.41702127659574467, + "acc_stderr": 0.032232762667117124, + "acc_norm": 0.41702127659574467, + "acc_norm_stderr": 0.032232762667117124 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.2982456140350877, + "acc_stderr": 0.04303684033537315, + "acc_norm": 0.2982456140350877, + "acc_norm_stderr": 0.04303684033537315 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.4, + "acc_stderr": 0.04082482904638628, + "acc_norm": 0.4, + "acc_norm_stderr": 0.04082482904638628 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.25396825396825395, + "acc_stderr": 0.022418042891113946, + "acc_norm": 0.25396825396825395, + "acc_norm_stderr": 0.022418042891113946 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.3492063492063492, + "acc_stderr": 0.042639068927951315, + "acc_norm": 0.3492063492063492, + "acc_norm_stderr": 0.042639068927951315 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.31, + "acc_stderr": 0.04648231987117316, + "acc_norm": 0.31, + "acc_norm_stderr": 0.04648231987117316 + }, + 
"harness|hendrycksTest-high_school_biology|5": { + "acc": 0.5193548387096775, + "acc_stderr": 0.028422687404312107, + "acc_norm": 0.5193548387096775, + "acc_norm_stderr": 0.028422687404312107 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.32019704433497537, + "acc_stderr": 0.032826493853041504, + "acc_norm": 0.32019704433497537, + "acc_norm_stderr": 0.032826493853041504 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.52, + "acc_stderr": 0.050211673156867795, + "acc_norm": 0.52, + "acc_norm_stderr": 0.050211673156867795 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.6242424242424243, + "acc_stderr": 0.037818873532059816, + "acc_norm": 0.6242424242424243, + "acc_norm_stderr": 0.037818873532059816 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.5555555555555556, + "acc_stderr": 0.035402943770953675, + "acc_norm": 0.5555555555555556, + "acc_norm_stderr": 0.035402943770953675 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.6632124352331606, + "acc_stderr": 0.03410780251836183, + "acc_norm": 0.6632124352331606, + "acc_norm_stderr": 0.03410780251836183 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.4230769230769231, + "acc_stderr": 0.02504919787604234, + "acc_norm": 0.4230769230769231, + "acc_norm_stderr": 0.02504919787604234 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.2111111111111111, + "acc_stderr": 0.02488211685765509, + "acc_norm": 0.2111111111111111, + "acc_norm_stderr": 0.02488211685765509 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.4411764705882353, + "acc_stderr": 0.0322529423239964, + "acc_norm": 0.4411764705882353, + "acc_norm_stderr": 0.0322529423239964 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.2251655629139073, + "acc_stderr": 0.03410435282008937, + "acc_norm": 0.2251655629139073, + "acc_norm_stderr": 0.03410435282008937 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.6238532110091743, + "acc_stderr": 0.02076923196820508, + "acc_norm": 0.6238532110091743, + "acc_norm_stderr": 0.02076923196820508 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.23148148148148148, + "acc_stderr": 0.028765111718046955, + "acc_norm": 0.23148148148148148, + "acc_norm_stderr": 0.028765111718046955 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.6519607843137255, + "acc_stderr": 0.03343311240488418, + "acc_norm": 0.6519607843137255, + "acc_norm_stderr": 0.03343311240488418 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.6919831223628692, + "acc_stderr": 0.030052389335605702, + "acc_norm": 0.6919831223628692, + "acc_norm_stderr": 0.030052389335605702 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.5695067264573991, + "acc_stderr": 0.033231973029429394, + "acc_norm": 0.5695067264573991, + "acc_norm_stderr": 0.033231973029429394 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.5954198473282443, + "acc_stderr": 0.043046937953806645, + "acc_norm": 0.5954198473282443, + "acc_norm_stderr": 0.043046937953806645 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.6694214876033058, + "acc_stderr": 0.04294340845212094, + "acc_norm": 0.6694214876033058, + "acc_norm_stderr": 0.04294340845212094 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.5462962962962963, + "acc_stderr": 0.04812917324536823, + "acc_norm": 0.5462962962962963, + 
"acc_norm_stderr": 0.04812917324536823 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.5460122699386503, + "acc_stderr": 0.0391170190467718, + "acc_norm": 0.5460122699386503, + "acc_norm_stderr": 0.0391170190467718 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.375, + "acc_stderr": 0.04595091388086298, + "acc_norm": 0.375, + "acc_norm_stderr": 0.04595091388086298 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.6019417475728155, + "acc_stderr": 0.048467482539772386, + "acc_norm": 0.6019417475728155, + "acc_norm_stderr": 0.048467482539772386 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.7649572649572649, + "acc_stderr": 0.027778835904935444, + "acc_norm": 0.7649572649572649, + "acc_norm_stderr": 0.027778835904935444 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.57, + "acc_stderr": 0.049756985195624284, + "acc_norm": 0.57, + "acc_norm_stderr": 0.049756985195624284 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.6768837803320562, + "acc_stderr": 0.016723726512343048, + "acc_norm": 0.6768837803320562, + "acc_norm_stderr": 0.016723726512343048 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.5086705202312138, + "acc_stderr": 0.026915047355369804, + "acc_norm": 0.5086705202312138, + "acc_norm_stderr": 0.026915047355369804 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.24804469273743016, + "acc_stderr": 0.01444415780826144, + "acc_norm": 0.24804469273743016, + "acc_norm_stderr": 0.01444415780826144 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.5, + "acc_stderr": 0.028629916715693413, + "acc_norm": 0.5, + "acc_norm_stderr": 0.028629916715693413 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.5530546623794212, + "acc_stderr": 0.028237769422085335, + "acc_norm": 0.5530546623794212, + "acc_norm_stderr": 0.028237769422085335 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.5370370370370371, + "acc_stderr": 0.027744313443376536, + "acc_norm": 0.5370370370370371, + "acc_norm_stderr": 0.027744313443376536 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.3617021276595745, + "acc_stderr": 0.028663820147199495, + "acc_norm": 0.3617021276595745, + "acc_norm_stderr": 0.028663820147199495 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.40547588005215124, + "acc_stderr": 0.012539960672377207, + "acc_norm": 0.40547588005215124, + "acc_norm_stderr": 0.012539960672377207 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.49264705882352944, + "acc_stderr": 0.030369552523902173, + "acc_norm": 0.49264705882352944, + "acc_norm_stderr": 0.030369552523902173 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.5081699346405228, + "acc_stderr": 0.020225134343057272, + "acc_norm": 0.5081699346405228, + "acc_norm_stderr": 0.020225134343057272 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.5272727272727272, + "acc_stderr": 0.04782001791380061, + "acc_norm": 0.5272727272727272, + "acc_norm_stderr": 0.04782001791380061 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.5306122448979592, + "acc_stderr": 0.031949171367580624, + "acc_norm": 0.5306122448979592, + "acc_norm_stderr": 0.031949171367580624 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.6368159203980099, + "acc_stderr": 0.034005985055990146, + "acc_norm": 0.6368159203980099, + "acc_norm_stderr": 0.034005985055990146 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.77, + "acc_stderr": 
0.04229525846816505, + "acc_norm": 0.77, + "acc_norm_stderr": 0.04229525846816505 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.45180722891566266, + "acc_stderr": 0.03874371556587953, + "acc_norm": 0.45180722891566266, + "acc_norm_stderr": 0.03874371556587953 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.6900584795321637, + "acc_stderr": 0.035469769593931624, + "acc_norm": 0.6900584795321637, + "acc_norm_stderr": 0.035469769593931624 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.3635250917992656, + "mc1_stderr": 0.01683886288396583, + "mc2": 0.5168692050626681, + "mc2_stderr": 0.015551742387661518 + }, + "all": { + "acc": 0.48310655286823834, + "acc_stderr": 0.03495320176413445, + "acc_norm": 0.4868601289719901, + "acc_norm_stderr": 0.03493437958018643, + "mc1": 0.3635250917992656, + "mc1_stderr": 0.01683886288396583, + "mc2": 0.5168692050626681, + "mc2_stderr": 0.015551742387661518 + } + }, + "versions": { + "harness|arc:challenge|25": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + 
"harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "all": 0 + }, + "config": { + "model_name": "TheBloke/Wizard-Vicuna-13B-Uncensored-HF", + "model_sha": "fff9ac7f0e2e7b340f2301f5f089d989fc03be67", + "model_dtype": "torch.float16", + "lighteval_sha": "43cff840721bd0214adb4e29236a5e2ca1813937", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null + }, + "task_config": { + "harness|arc:challenge": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness 
task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task" + }, + "hashes": { + "harness|arc:challenge|25": { + "hash_examples": "fb8c51b1872daeda", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "61571bf68d6d89aa", + "hash_cont_tokens": "8210decc6ff6f7df" + }, + "harness|hellaswag|10": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "29906669b1c7054a", + "hash_cont_tokens": "b3b9e9017afa63af" + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "c54ff61ad0273dd7", + "hash_cont_tokens": "50421e30bef398f9" + }, + "harness|hendrycksTest-anatomy|5": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "be31a1e22aef5f90", + "hash_cont_tokens": "f11971a765cb609f" + }, + "harness|hendrycksTest-astronomy|5": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "277a7b1fad566940", + "hash_cont_tokens": "bf30e5d3f48250cb" + }, + "harness|hendrycksTest-business_ethics|5": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "ba552605bc116de5", + "hash_cont_tokens": "bc1dd9b2d995eb61" + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "428c7563d0b98ab9", + "hash_cont_tokens": "890a119624b3b935" + }, + "harness|hendrycksTest-college_biology|5": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "da036601573942e2", + "hash_cont_tokens": "875cde3af7a0ee14" + }, + "harness|hendrycksTest-college_chemistry|5": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "94e0196d6aded13d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "harness|hendrycksTest-college_computer_science|5": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "6e4d0f4a8d36690b", + "hash_cont_tokens": "ffc0fe414cdc4a83" + }, + "harness|hendrycksTest-college_mathematics|5": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "614054d17109a25d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "harness|hendrycksTest-college_medicine|5": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "1d633b3cc0524ba8", + 
"hash_cont_tokens": "1f88b00d41957d82" + }, + "harness|hendrycksTest-college_physics|5": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "5421d9a1af86cbd4", + "hash_cont_tokens": "f7b8097afc16a47c" + }, + "harness|hendrycksTest-computer_security|5": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "5e6b70ecb333cf18", + "hash_cont_tokens": "50421e30bef398f9" + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "c2ef11a87264ceed", + "hash_cont_tokens": "aa0e8bc655f2f641" + }, + "harness|hendrycksTest-econometrics|5": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "ecaccd912a4c3978", + "hash_cont_tokens": "bfb7e3c3c88313f1" + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "1590c84291399be8", + "hash_cont_tokens": "2425a3f084a591ef" + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "3269597f715b0da1", + "hash_cont_tokens": "f52691aef15a407b" + }, + "harness|hendrycksTest-formal_logic|5": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "a2800d20f3ab8d7c", + "hash_cont_tokens": "f515d598d9c21263" + }, + "harness|hendrycksTest-global_facts|5": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "94ed44b3772505ad", + "hash_cont_tokens": "50421e30bef398f9" + }, + "harness|hendrycksTest-high_school_biology|5": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "24423acb928db768", + "hash_cont_tokens": "bd85a4156a3613ee" + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "831ff35c474e5cef", + "hash_cont_tokens": "a95c97af1c14e068" + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "8c34e0f2bda77358", + "hash_cont_tokens": "8abfedef914e33c9" + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "f1f73dd687da18d7", + "hash_cont_tokens": "674fc454bdc5ac93" + }, + "harness|hendrycksTest-high_school_geography|5": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "7c5547c7da5bc793", + "hash_cont_tokens": "03a5012b916274ea" + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "f62991cb6a496b05", + "hash_cont_tokens": "a83effb8f76b7d7c" + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "4cef2aff6e3d59ed", + "hash_cont_tokens": "c583432ad27fcfe0" + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + 
"hash_input_tokens": "6e2577ea4082ed2b", + "hash_cont_tokens": "24f5dc613660300b" + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "c5fc9aeb1079c8e4", + "hash_cont_tokens": "f47f041de50333b9" + }, + "harness|hendrycksTest-high_school_physics|5": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "555fc385cffa84ca", + "hash_cont_tokens": "ba2efcd283e938cc" + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "febd23cbf9973b7f", + "hash_cont_tokens": "942069cd363844d9" + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "424b02981230ee83", + "hash_cont_tokens": "955ed42b6f7fa019" + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "50c9ff438c85a69e", + "hash_cont_tokens": "cdd0b3dc06d933e5" + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "054824cc474caef5", + "hash_cont_tokens": "9a864184946033ac" + }, + "harness|hendrycksTest-human_aging|5": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "541a75f071dcf579", + "hash_cont_tokens": "142a4a8a1138a214" + }, + "harness|hendrycksTest-human_sexuality|5": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "04269e5c5a257dd9", + "hash_cont_tokens": "bc54813e809b796d" + }, + "harness|hendrycksTest-international_law|5": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "d93ba9d9d38e4397", + "hash_cont_tokens": "dc45b45fcda18e5d" + }, + "harness|hendrycksTest-jurisprudence|5": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "9eeaccd2698b4f5a", + "hash_cont_tokens": "e3a8cd951b6e3469" + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "b4f08f544f2b7576", + "hash_cont_tokens": "1e80dbd30f6453d5" + }, + "harness|hendrycksTest-machine_learning|5": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "900c2a51f1174b9f", + "hash_cont_tokens": "9b37da7777378ca9" + }, + "harness|hendrycksTest-management|5": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "6b36efb4689c6eca", + "hash_cont_tokens": "a01d6d39a83c4597" + }, + "harness|hendrycksTest-marketing|5": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "2aaac78a0cfed47a", + "hash_cont_tokens": "6aeaed4d823c98aa" + }, + "harness|hendrycksTest-medical_genetics|5": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "886ca823b41c094a", + "hash_cont_tokens": "50421e30bef398f9" + }, + "harness|hendrycksTest-miscellaneous|5": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": 
"72fd71de7675e7d0", + "hash_cont_tokens": "9b0ab02a64603081" + }, + "harness|hendrycksTest-moral_disputes|5": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "f3ca0dd8e7a1eb09", + "hash_cont_tokens": "8badf768f7b0467a" + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "3e793631e951f23c", + "hash_cont_tokens": "32ae620376b2bbba" + }, + "harness|hendrycksTest-nutrition|5": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "59753c2144ea93af", + "hash_cont_tokens": "3071def75bacc404" + }, + "harness|hendrycksTest-philosophy|5": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "bd8d3dbed15a8c34", + "hash_cont_tokens": "9f6ff69d23a48783" + }, + "harness|hendrycksTest-prehistory|5": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "3573cd87facbb7c5", + "hash_cont_tokens": "de469d2b981e32a3" + }, + "harness|hendrycksTest-professional_accounting|5": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "17e721bc1a7cbb47", + "hash_cont_tokens": "c46f74d2dfc7b13b" + }, + "harness|hendrycksTest-professional_law|5": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "9178e10bd0763ec4", + "hash_cont_tokens": "2e590029ef41fbcd" + }, + "harness|hendrycksTest-professional_medicine|5": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "f5a22012a54f70ea", + "hash_cont_tokens": "fe35cfa9c6ca802e" + }, + "harness|hendrycksTest-professional_psychology|5": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "0dfb73a8eb3f692c", + "hash_cont_tokens": "f020fbddf72c8652" + }, + "harness|hendrycksTest-public_relations|5": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "1710c6ba4c9f3cbd", + "hash_cont_tokens": "568f585a259965c1" + }, + "harness|hendrycksTest-security_studies|5": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "d49711415961ced7", + "hash_cont_tokens": "cc6fd7cccd64cd5d" + }, + "harness|hendrycksTest-sociology|5": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "828999f7624cbe7e", + "hash_cont_tokens": "c3a3bdfd177eed5b" + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "42054621e718dbee", + "hash_cont_tokens": "2568d0e8e36fa959" + }, + "harness|hendrycksTest-virology|5": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "6c4f0aa4dc859c04", + "hash_cont_tokens": "926cf60b0891f374" + }, + "harness|hendrycksTest-world_religions|5": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "6c75d44e092ff24f", + "hash_cont_tokens": "c525a5de974c1ea3" + }, + "harness|truthfulqa:mc|0": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "2738d7ed7075faa7", + "hash_cont_tokens": "c014154380b74b9e" + } + } +} \ No newline 
at end of file diff --git a/eval-results/TheBloke/Wizard-Vicuna-13B-Uncensored-HF/results_2023-10-23T01-03-04.641003.json b/eval-results/TheBloke/Wizard-Vicuna-13B-Uncensored-HF/results_2023-10-23T01-03-04.641003.json new file mode 100644 index 0000000000000000000000000000000000000000..4666657eaa89f306a42f7972989a4e6cc420c074 --- /dev/null +++ b/eval-results/TheBloke/Wizard-Vicuna-13B-Uncensored-HF/results_2023-10-23T01-03-04.641003.json @@ -0,0 +1,107 @@ +{ + "config_general": { + "model_name": "TheBloke/Wizard-Vicuna-13B-Uncensored-HF", + "model_sha": "fff9ac7f0e2e7b340f2301f5f089d989fc03be67", + "model_size": "24.28 GB", + "model_dtype": "torch.float16", + "lighteval_sha": "0f318ecf002208468154899217b3ba7c6ae09374", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|drop|3": { + "em": 0.14314177852348994, + "em_stderr": 0.0035865537174832513, + "f1": 0.2178586409395965, + "f1_stderr": 0.003730334446277459 + }, + "harness|gsm8k|5": { + "acc": 0.08642911296436695, + "acc_stderr": 0.0077400443371038056 + }, + "harness|winogrande|5": { + "acc": 0.7569060773480663, + "acc_stderr": 0.012055665630431032 + }, + "all": { + "em": 0.14314177852348994, + "em_stderr": 0.0035865537174832513, + "f1": 0.2178586409395965, + "f1_stderr": 0.003730334446277459, + "acc": 0.4216675951562166, + "acc_stderr": 0.00989785498376742 + } + }, + "versions": { + "harness|drop|3": 1, + "harness|gsm8k|5": 0, + "harness|winogrande|5": 0, + "all": 0 + }, + "config_tasks": { + "harness|drop": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|drop|3": { + "hashes": { + "hash_examples": "1d27416e8324e9a3", + "hash_full_prompts": "a5513ff9a741b385", + "hash_input_tokens": "61b608e0b5ceed76", + "hash_cont_tokens": "fecd03f4264f7a65" + }, + "truncated": 1263, + "non-truncated": 8273, + "padded": 0, + "non-padded": 9536, + "effective_few_shots": 3.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "bda342e47b5099b2", + "hash_cont_tokens": "2f4123fee78f8150" + }, + "truncated": 0, + "non-truncated": 1319, + "padded": 0, + "non-padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "c0bedf98cb040854", + "hash_cont_tokens": "f08975ad6f2d5864" + }, + "truncated": 0, + "non-truncated": 2534, + "padded": 2432, + "non-padded": 102, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "9b4d8993161e637d", + "hash_full_prompts": "08215e527b7e60a5", + "hash_input_tokens": "80afe720f936f8d2", + "hash_cont_tokens": "249e760c4fd684c7" + }, + "total_evaluation_time_secondes": "12613.579063892365", + "truncated": 1263, + "non-truncated": 12126, + "padded": 2432, + "non-padded": 10957, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/TheBloke/Wizard-Vicuna-30B-Superhot-8K-fp16/results_2023-07-31T18-46-06.024423.json b/eval-results/TheBloke/Wizard-Vicuna-30B-Superhot-8K-fp16/results_2023-07-31T18-46-06.024423.json new file mode 100644 index 0000000000000000000000000000000000000000..643343aa85dcdd942ca27608d86dd661392ef4ee --- /dev/null +++ 
b/eval-results/TheBloke/Wizard-Vicuna-30B-Superhot-8K-fp16/results_2023-07-31T18-46-06.024423.json @@ -0,0 +1,1365 @@ +{ + "results": { + "harness|arc:challenge|25": { + "acc": 0.22525597269624573, + "acc_stderr": 0.012207839995407312, + "acc_norm": 0.2619453924914676, + "acc_norm_stderr": 0.012849054826858115 + }, + "harness|hellaswag|10": { + "acc": 0.2804222266480781, + "acc_stderr": 0.004482874732237348, + "acc_norm": 0.3296156144194384, + "acc_norm_stderr": 0.004691128722535483 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.22, + "acc_stderr": 0.04163331998932268, + "acc_norm": 0.22, + "acc_norm_stderr": 0.04163331998932268 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.1925925925925926, + "acc_stderr": 0.03406542058502653, + "acc_norm": 0.1925925925925926, + "acc_norm_stderr": 0.03406542058502653 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.17763157894736842, + "acc_stderr": 0.031103182383123398, + "acc_norm": 0.17763157894736842, + "acc_norm_stderr": 0.031103182383123398 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.3, + "acc_stderr": 0.046056618647183814, + "acc_norm": 0.3, + "acc_norm_stderr": 0.046056618647183814 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.19245283018867926, + "acc_stderr": 0.024262979839372277, + "acc_norm": 0.19245283018867926, + "acc_norm_stderr": 0.024262979839372277 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.2569444444444444, + "acc_stderr": 0.03653946969442099, + "acc_norm": 0.2569444444444444, + "acc_norm_stderr": 0.03653946969442099 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.2, + "acc_stderr": 0.04020151261036845, + "acc_norm": 0.2, + "acc_norm_stderr": 0.04020151261036845 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.26, + "acc_stderr": 0.0440844002276808, + "acc_norm": 0.26, + "acc_norm_stderr": 0.0440844002276808 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.21, + "acc_stderr": 0.040936018074033256, + "acc_norm": 0.21, + "acc_norm_stderr": 0.040936018074033256 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.20809248554913296, + "acc_stderr": 0.030952890217749874, + "acc_norm": 0.20809248554913296, + "acc_norm_stderr": 0.030952890217749874 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.21568627450980393, + "acc_stderr": 0.04092563958237654, + "acc_norm": 0.21568627450980393, + "acc_norm_stderr": 0.04092563958237654 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.28, + "acc_stderr": 0.045126085985421276, + "acc_norm": 0.28, + "acc_norm_stderr": 0.045126085985421276 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.26382978723404255, + "acc_stderr": 0.028809989854102973, + "acc_norm": 0.26382978723404255, + "acc_norm_stderr": 0.028809989854102973 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.23684210526315788, + "acc_stderr": 0.039994238792813365, + "acc_norm": 0.23684210526315788, + "acc_norm_stderr": 0.039994238792813365 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.2413793103448276, + "acc_stderr": 0.03565998174135303, + "acc_norm": 0.2413793103448276, + "acc_norm_stderr": 0.03565998174135303 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.24867724867724866, + "acc_stderr": 0.022261817692400168, + "acc_norm": 0.24867724867724866, + "acc_norm_stderr": 0.022261817692400168 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.2857142857142857, + "acc_stderr": 
0.04040610178208841, + "acc_norm": 0.2857142857142857, + "acc_norm_stderr": 0.04040610178208841 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.18, + "acc_stderr": 0.038612291966536934, + "acc_norm": 0.18, + "acc_norm_stderr": 0.038612291966536934 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.18064516129032257, + "acc_stderr": 0.02188617856717255, + "acc_norm": 0.18064516129032257, + "acc_norm_stderr": 0.02188617856717255 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.15270935960591134, + "acc_stderr": 0.02530890453938063, + "acc_norm": 0.15270935960591134, + "acc_norm_stderr": 0.02530890453938063 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.25, + "acc_stderr": 0.04351941398892446, + "acc_norm": 0.25, + "acc_norm_stderr": 0.04351941398892446 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.2606060606060606, + "acc_stderr": 0.03427743175816524, + "acc_norm": 0.2606060606060606, + "acc_norm_stderr": 0.03427743175816524 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.17676767676767677, + "acc_stderr": 0.027178752639044915, + "acc_norm": 0.17676767676767677, + "acc_norm_stderr": 0.027178752639044915 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.18652849740932642, + "acc_stderr": 0.028112091210117447, + "acc_norm": 0.18652849740932642, + "acc_norm_stderr": 0.028112091210117447 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.2128205128205128, + "acc_stderr": 0.020752423722127995, + "acc_norm": 0.2128205128205128, + "acc_norm_stderr": 0.020752423722127995 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.24814814814814815, + "acc_stderr": 0.0263357394040558, + "acc_norm": 0.24814814814814815, + "acc_norm_stderr": 0.0263357394040558 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.23949579831932774, + "acc_stderr": 0.027722065493361255, + "acc_norm": 0.23949579831932774, + "acc_norm_stderr": 0.027722065493361255 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.1986754966887417, + "acc_stderr": 0.03257847384436776, + "acc_norm": 0.1986754966887417, + "acc_norm_stderr": 0.03257847384436776 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.1926605504587156, + "acc_stderr": 0.016909276884936094, + "acc_norm": 0.1926605504587156, + "acc_norm_stderr": 0.016909276884936094 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.2361111111111111, + "acc_stderr": 0.02896370257079103, + "acc_norm": 0.2361111111111111, + "acc_norm_stderr": 0.02896370257079103 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.25, + "acc_stderr": 0.03039153369274154, + "acc_norm": 0.25, + "acc_norm_stderr": 0.03039153369274154 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.2742616033755274, + "acc_stderr": 0.029041333510598035, + "acc_norm": 0.2742616033755274, + "acc_norm_stderr": 0.029041333510598035 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.31390134529147984, + "acc_stderr": 0.031146796482972465, + "acc_norm": 0.31390134529147984, + "acc_norm_stderr": 0.031146796482972465 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.2595419847328244, + "acc_stderr": 0.03844876139785271, + "acc_norm": 0.2595419847328244, + "acc_norm_stderr": 0.03844876139785271 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.2396694214876033, + "acc_stderr": 
0.03896878985070417, + "acc_norm": 0.2396694214876033, + "acc_norm_stderr": 0.03896878985070417 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.25925925925925924, + "acc_stderr": 0.042365112580946336, + "acc_norm": 0.25925925925925924, + "acc_norm_stderr": 0.042365112580946336 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.22085889570552147, + "acc_stderr": 0.032591773927421776, + "acc_norm": 0.22085889570552147, + "acc_norm_stderr": 0.032591773927421776 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.3125, + "acc_stderr": 0.043994650575715215, + "acc_norm": 0.3125, + "acc_norm_stderr": 0.043994650575715215 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.17475728155339806, + "acc_stderr": 0.037601780060266224, + "acc_norm": 0.17475728155339806, + "acc_norm_stderr": 0.037601780060266224 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.27350427350427353, + "acc_stderr": 0.029202540153431163, + "acc_norm": 0.27350427350427353, + "acc_norm_stderr": 0.029202540153431163 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.3, + "acc_stderr": 0.046056618647183814, + "acc_norm": 0.3, + "acc_norm_stderr": 0.046056618647183814 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.23754789272030652, + "acc_stderr": 0.015218733046150193, + "acc_norm": 0.23754789272030652, + "acc_norm_stderr": 0.015218733046150193 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.24277456647398843, + "acc_stderr": 0.023083658586984204, + "acc_norm": 0.24277456647398843, + "acc_norm_stderr": 0.023083658586984204 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.24581005586592178, + "acc_stderr": 0.01440029642922562, + "acc_norm": 0.24581005586592178, + "acc_norm_stderr": 0.01440029642922562 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.22549019607843138, + "acc_stderr": 0.023929155517351284, + "acc_norm": 0.22549019607843138, + "acc_norm_stderr": 0.023929155517351284 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.1832797427652733, + "acc_stderr": 0.021974198848265805, + "acc_norm": 0.1832797427652733, + "acc_norm_stderr": 0.021974198848265805 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.21604938271604937, + "acc_stderr": 0.022899162918445806, + "acc_norm": 0.21604938271604937, + "acc_norm_stderr": 0.022899162918445806 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.23404255319148937, + "acc_stderr": 0.025257861359432417, + "acc_norm": 0.23404255319148937, + "acc_norm_stderr": 0.025257861359432417 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.2457627118644068, + "acc_stderr": 0.010996156635142692, + "acc_norm": 0.2457627118644068, + "acc_norm_stderr": 0.010996156635142692 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.18382352941176472, + "acc_stderr": 0.023529242185193106, + "acc_norm": 0.18382352941176472, + "acc_norm_stderr": 0.023529242185193106 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.25, + "acc_stderr": 0.01751781884501444, + "acc_norm": 0.25, + "acc_norm_stderr": 0.01751781884501444 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.20909090909090908, + "acc_stderr": 0.03895091015724136, + "acc_norm": 0.20909090909090908, + "acc_norm_stderr": 0.03895091015724136 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.22040816326530613, + "acc_stderr": 0.02653704531214529, + "acc_norm": 0.22040816326530613, + "acc_norm_stderr": 0.02653704531214529 + }, + 
"harness|hendrycksTest-sociology|5": { + "acc": 0.24378109452736318, + "acc_stderr": 0.03036049015401465, + "acc_norm": 0.24378109452736318, + "acc_norm_stderr": 0.03036049015401465 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.28, + "acc_stderr": 0.04512608598542128, + "acc_norm": 0.28, + "acc_norm_stderr": 0.04512608598542128 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.28313253012048195, + "acc_stderr": 0.03507295431370518, + "acc_norm": 0.28313253012048195, + "acc_norm_stderr": 0.03507295431370518 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.28654970760233917, + "acc_stderr": 0.034678266857038266, + "acc_norm": 0.28654970760233917, + "acc_norm_stderr": 0.034678266857038266 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.22766217870257038, + "mc1_stderr": 0.01467925503211107, + "mc2": 0.4747511496520905, + "mc2_stderr": 0.016743067237896876 + }, + "all": { + "acc": 0.23519468841762173, + "acc_stderr": 0.030867946729594396, + "acc_norm": 0.23665032922383497, + "acc_norm_stderr": 0.03088234450623421, + "mc1": 0.22766217870257038, + "mc1_stderr": 0.01467925503211107, + "mc2": 0.4747511496520905, + "mc2_stderr": 0.016743067237896876 + } + }, + "versions": { + "harness|arc:challenge|25": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + 
"harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "all": 0 + }, + "config_general": { + "model_name": "TheBloke/Wizard-Vicuna-30B-Superhot-8K-fp16", + "model_sha": "062fe5409861d7386279fb534b435be39c88ceaf", + "model_dtype": "torch.float16", + "lighteval_sha": "03c2fad20ff7f5334c33cfee459024b8d7e4a109", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null + }, + "config_tasks": { + "harness|arc:challenge": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness 
task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task" + }, + "summary_tasks": { + "harness|arc:challenge|25": { + "hashes": { + "hash_examples": "17b0cae357c0259e", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "3722289b79076c44", + "hash_cont_tokens": "8210decc6ff6f7df" + }, + "truncated": 0, + "non-truncated": 4687, + "padded": 4687, + "non-padded": 0, + "effective_few_shots": 25.0, + "num_truncated_few_shots": 0 + }, + "harness|hellaswag|10": { + "hashes": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "ececd684171f1ef2", + "hash_cont_tokens": "b3b9e9017afa63af" + }, + "truncated": 0, + "non-truncated": 40168, + "padded": 40113, + "non-padded": 55, + "effective_few_shots": 10.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hashes": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "c54ff61ad0273dd7", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-anatomy|5": { + "hashes": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "be31a1e22aef5f90", + "hash_cont_tokens": "f11971a765cb609f" + }, + "truncated": 0, + "non-truncated": 540, + "padded": 540, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-astronomy|5": { + "hashes": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "277a7b1fad566940", + "hash_cont_tokens": "bf30e5d3f48250cb" + }, + "truncated": 0, + "non-truncated": 608, + "padded": 608, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-business_ethics|5": { + "hashes": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "ba552605bc116de5", + "hash_cont_tokens": "bc1dd9b2d995eb61" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + 
"harness|hendrycksTest-clinical_knowledge|5": { + "hashes": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "428c7563d0b98ab9", + "hash_cont_tokens": "890a119624b3b935" + }, + "truncated": 0, + "non-truncated": 1060, + "padded": 1060, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_biology|5": { + "hashes": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "da036601573942e2", + "hash_cont_tokens": "875cde3af7a0ee14" + }, + "truncated": 0, + "non-truncated": 576, + "padded": 576, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_chemistry|5": { + "hashes": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "94e0196d6aded13d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_computer_science|5": { + "hashes": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "6e4d0f4a8d36690b", + "hash_cont_tokens": "ffc0fe414cdc4a83" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_mathematics|5": { + "hashes": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "614054d17109a25d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_medicine|5": { + "hashes": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "081bb2b524defd1c", + "hash_cont_tokens": "1f88b00d41957d82" + }, + "truncated": 0, + "non-truncated": 692, + "padded": 692, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_physics|5": { + "hashes": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "5421d9a1af86cbd4", + "hash_cont_tokens": "f7b8097afc16a47c" + }, + "truncated": 0, + "non-truncated": 408, + "padded": 408, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-computer_security|5": { + "hashes": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "5e6b70ecb333cf18", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hashes": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "c2ef11a87264ceed", + "hash_cont_tokens": "aa0e8bc655f2f641" + }, + "truncated": 0, + "non-truncated": 940, + "padded": 940, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-econometrics|5": { + "hashes": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + 
"hash_input_tokens": "ecaccd912a4c3978", + "hash_cont_tokens": "bfb7e3c3c88313f1" + }, + "truncated": 0, + "non-truncated": 456, + "padded": 456, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hashes": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "1590c84291399be8", + "hash_cont_tokens": "2425a3f084a591ef" + }, + "truncated": 0, + "non-truncated": 580, + "padded": 580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hashes": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "3269597f715b0da1", + "hash_cont_tokens": "f52691aef15a407b" + }, + "truncated": 0, + "non-truncated": 1512, + "padded": 1512, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-formal_logic|5": { + "hashes": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "a2800d20f3ab8d7c", + "hash_cont_tokens": "f515d598d9c21263" + }, + "truncated": 0, + "non-truncated": 504, + "padded": 504, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-global_facts|5": { + "hashes": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "94ed44b3772505ad", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_biology|5": { + "hashes": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "24423acb928db768", + "hash_cont_tokens": "bd85a4156a3613ee" + }, + "truncated": 0, + "non-truncated": 1240, + "padded": 1240, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hashes": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "831ff35c474e5cef", + "hash_cont_tokens": "a95c97af1c14e068" + }, + "truncated": 0, + "non-truncated": 812, + "padded": 812, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hashes": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "a20a96b44dcc5b30", + "hash_cont_tokens": "8abfedef914e33c9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hashes": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "5002f4ac8b1562ca", + "hash_cont_tokens": "674fc454bdc5ac93" + }, + "truncated": 0, + "non-truncated": 660, + "padded": 656, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_geography|5": { + "hashes": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "7c5547c7da5bc793", + "hash_cont_tokens": "03a5012b916274ea" + }, + "truncated": 0, + 
"non-truncated": 792, + "padded": 792, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hashes": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "f62991cb6a496b05", + "hash_cont_tokens": "a83effb8f76b7d7c" + }, + "truncated": 0, + "non-truncated": 772, + "padded": 772, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hashes": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "4cef2aff6e3d59ed", + "hash_cont_tokens": "c583432ad27fcfe0" + }, + "truncated": 0, + "non-truncated": 1560, + "padded": 1560, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hashes": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "6e2577ea4082ed2b", + "hash_cont_tokens": "24f5dc613660300b" + }, + "truncated": 0, + "non-truncated": 1080, + "padded": 1080, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hashes": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "c5fc9aeb1079c8e4", + "hash_cont_tokens": "f47f041de50333b9" + }, + "truncated": 0, + "non-truncated": 952, + "padded": 952, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_physics|5": { + "hashes": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "555fc385cffa84ca", + "hash_cont_tokens": "ba2efcd283e938cc" + }, + "truncated": 0, + "non-truncated": 604, + "padded": 604, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hashes": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "febd23cbf9973b7f", + "hash_cont_tokens": "942069cd363844d9" + }, + "truncated": 0, + "non-truncated": 2180, + "padded": 2180, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hashes": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "400e55b56ee6fbd7", + "hash_cont_tokens": "955ed42b6f7fa019" + }, + "truncated": 0, + "non-truncated": 864, + "padded": 864, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hashes": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "c639cce12a46ebad", + "hash_cont_tokens": "cdd0b3dc06d933e5" + }, + "truncated": 0, + "non-truncated": 816, + "padded": 816, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hashes": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "b9762065cce6f3a6", + "hash_cont_tokens": "9a864184946033ac" + }, + "truncated": 0, + "non-truncated": 948, + "padded": 948, + "non-padded": 0, + 
"effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_aging|5": { + "hashes": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "541a75f071dcf579", + "hash_cont_tokens": "142a4a8a1138a214" + }, + "truncated": 0, + "non-truncated": 892, + "padded": 892, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_sexuality|5": { + "hashes": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "04269e5c5a257dd9", + "hash_cont_tokens": "bc54813e809b796d" + }, + "truncated": 0, + "non-truncated": 524, + "padded": 524, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-international_law|5": { + "hashes": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "d93ba9d9d38e4397", + "hash_cont_tokens": "dc45b45fcda18e5d" + }, + "truncated": 0, + "non-truncated": 484, + "padded": 484, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-jurisprudence|5": { + "hashes": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "9eeaccd2698b4f5a", + "hash_cont_tokens": "e3a8cd951b6e3469" + }, + "truncated": 0, + "non-truncated": 432, + "padded": 432, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hashes": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "b4f08f544f2b7576", + "hash_cont_tokens": "1e80dbd30f6453d5" + }, + "truncated": 0, + "non-truncated": 652, + "padded": 648, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-machine_learning|5": { + "hashes": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "900c2a51f1174b9f", + "hash_cont_tokens": "9b37da7777378ca9" + }, + "truncated": 0, + "non-truncated": 448, + "padded": 448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-management|5": { + "hashes": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "6b36efb4689c6eca", + "hash_cont_tokens": "a01d6d39a83c4597" + }, + "truncated": 0, + "non-truncated": 412, + "padded": 412, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-marketing|5": { + "hashes": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "2aaac78a0cfed47a", + "hash_cont_tokens": "6aeaed4d823c98aa" + }, + "truncated": 0, + "non-truncated": 936, + "padded": 936, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-medical_genetics|5": { + "hashes": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "886ca823b41c094a", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-miscellaneous|5": { + "hashes": { + "hash_examples": "41adb694024809c2", + 
"hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "72fd71de7675e7d0", + "hash_cont_tokens": "9b0ab02a64603081" + }, + "truncated": 0, + "non-truncated": 3132, + "padded": 3132, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_disputes|5": { + "hashes": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "f3ca0dd8e7a1eb09", + "hash_cont_tokens": "8badf768f7b0467a" + }, + "truncated": 0, + "non-truncated": 1384, + "padded": 1354, + "non-padded": 30, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hashes": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "3e793631e951f23c", + "hash_cont_tokens": "32ae620376b2bbba" + }, + "truncated": 0, + "non-truncated": 3580, + "padded": 3580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-nutrition|5": { + "hashes": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "59753c2144ea93af", + "hash_cont_tokens": "3071def75bacc404" + }, + "truncated": 0, + "non-truncated": 1224, + "padded": 1224, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-philosophy|5": { + "hashes": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "bd8d3dbed15a8c34", + "hash_cont_tokens": "9f6ff69d23a48783" + }, + "truncated": 0, + "non-truncated": 1244, + "padded": 1244, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-prehistory|5": { + "hashes": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "3573cd87facbb7c5", + "hash_cont_tokens": "de469d2b981e32a3" + }, + "truncated": 0, + "non-truncated": 1296, + "padded": 1296, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_accounting|5": { + "hashes": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "17e721bc1a7cbb47", + "hash_cont_tokens": "c46f74d2dfc7b13b" + }, + "truncated": 0, + "non-truncated": 1128, + "padded": 1128, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_law|5": { + "hashes": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "c9f7583fff66d361", + "hash_cont_tokens": "2e590029ef41fbcd" + }, + "truncated": 0, + "non-truncated": 6136, + "padded": 6136, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_medicine|5": { + "hashes": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "40a933f829116f8d", + "hash_cont_tokens": "fe35cfa9c6ca802e" + }, + "truncated": 0, + "non-truncated": 1088, + "padded": 1088, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_psychology|5": { + "hashes": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "0dfb73a8eb3f692c", + "hash_cont_tokens": "f020fbddf72c8652" + }, + "truncated": 
0, + "non-truncated": 2448, + "padded": 2448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-public_relations|5": { + "hashes": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "1710c6ba4c9f3cbd", + "hash_cont_tokens": "568f585a259965c1" + }, + "truncated": 0, + "non-truncated": 440, + "padded": 440, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-security_studies|5": { + "hashes": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "32a03f1f22a6e103", + "hash_cont_tokens": "cc6fd7cccd64cd5d" + }, + "truncated": 0, + "non-truncated": 980, + "padded": 980, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-sociology|5": { + "hashes": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "828999f7624cbe7e", + "hash_cont_tokens": "c3a3bdfd177eed5b" + }, + "truncated": 0, + "non-truncated": 804, + "padded": 804, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hashes": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "42054621e718dbee", + "hash_cont_tokens": "2568d0e8e36fa959" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-virology|5": { + "hashes": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "6c4f0aa4dc859c04", + "hash_cont_tokens": "926cf60b0891f374" + }, + "truncated": 0, + "non-truncated": 664, + "padded": 664, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-world_religions|5": { + "hashes": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "6c75d44e092ff24f", + "hash_cont_tokens": "c525a5de974c1ea3" + }, + "truncated": 0, + "non-truncated": 684, + "padded": 684, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|truthfulqa:mc|0": { + "hashes": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "2738d7ed7075faa7", + "hash_cont_tokens": "c014154380b74b9e" + }, + "truncated": 0, + "non-truncated": 9996, + "padded": 9996, + "non-padded": 0, + "effective_few_shots": 0.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "d84d18e9a963753d", + "hash_full_prompts": "12b540783521a8e6", + "hash_input_tokens": "5c73a7dce6ccf737", + "hash_cont_tokens": "fb1646e2bdd5fc38" + }, + "total_evaluation_time_secondes": "13013.691723823547", + "truncated": 0, + "non-truncated": 111019, + "padded": 110926, + "non-padded": 93, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/TheBloke/Wizard-Vicuna-30B-Uncensored-GPTQ/results_2023-08-29T22-50-11.405669.json b/eval-results/TheBloke/Wizard-Vicuna-30B-Uncensored-GPTQ/results_2023-08-29T22-50-11.405669.json new file mode 100644 index 0000000000000000000000000000000000000000..9e024f782d024ce31536d0ecab9a5cd6257b5004 --- /dev/null +++ 
b/eval-results/TheBloke/Wizard-Vicuna-30B-Uncensored-GPTQ/results_2023-08-29T22-50-11.405669.json @@ -0,0 +1,1366 @@ +{ + "config_general": { + "model_name": "TheBloke/Wizard-Vicuna-30B-Uncensored-GPTQ", + "model_sha": "56a82ece7a9309189561a590e8f4d2fe0d4be92b", + "model_dtype": "torch.float16", + "lighteval_sha": "4b9a38c2102259daeb17d1d0294fc2b4ce7f4e63", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|arc:challenge|25": { + "acc": 0.5836177474402731, + "acc_stderr": 0.014405618279436176, + "acc_norm": 0.6109215017064846, + "acc_norm_stderr": 0.014247309976045607 + }, + "harness|hellaswag|10": { + "acc": 0.6248755228042223, + "acc_stderr": 0.004831655648489739, + "acc_norm": 0.8240390360485959, + "acc_norm_stderr": 0.003800087313595186 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.35, + "acc_stderr": 0.04793724854411021, + "acc_norm": 0.35, + "acc_norm_stderr": 0.04793724854411021 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.45925925925925926, + "acc_stderr": 0.04304979692464243, + "acc_norm": 0.45925925925925926, + "acc_norm_stderr": 0.04304979692464243 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.5328947368421053, + "acc_stderr": 0.040601270352363966, + "acc_norm": 0.5328947368421053, + "acc_norm_stderr": 0.040601270352363966 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.6, + "acc_stderr": 0.049236596391733084, + "acc_norm": 0.6, + "acc_norm_stderr": 0.049236596391733084 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.5849056603773585, + "acc_stderr": 0.03032594578928611, + "acc_norm": 0.5849056603773585, + "acc_norm_stderr": 0.03032594578928611 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.5833333333333334, + "acc_stderr": 0.04122728707651282, + "acc_norm": 0.5833333333333334, + "acc_norm_stderr": 0.04122728707651282 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.36, + "acc_stderr": 0.04824181513244218, + "acc_norm": 0.36, + "acc_norm_stderr": 0.04824181513244218 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.44, + "acc_stderr": 0.049888765156985884, + "acc_norm": 0.44, + "acc_norm_stderr": 0.049888765156985884 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.37, + "acc_stderr": 0.048523658709391, + "acc_norm": 0.37, + "acc_norm_stderr": 0.048523658709391 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.5144508670520231, + "acc_stderr": 0.03810871630454764, + "acc_norm": 0.5144508670520231, + "acc_norm_stderr": 0.03810871630454764 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.24509803921568626, + "acc_stderr": 0.04280105837364397, + "acc_norm": 0.24509803921568626, + "acc_norm_stderr": 0.04280105837364397 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.66, + "acc_stderr": 0.04760952285695237, + "acc_norm": 0.66, + "acc_norm_stderr": 0.04760952285695237 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.4425531914893617, + "acc_stderr": 0.032469569197899575, + "acc_norm": 0.4425531914893617, + "acc_norm_stderr": 0.032469569197899575 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.2982456140350877, + "acc_stderr": 0.04303684033537315, + "acc_norm": 0.2982456140350877, + "acc_norm_stderr": 0.04303684033537315 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.503448275862069, + "acc_stderr": 0.04166567577101579, + "acc_norm": 
0.503448275862069, + "acc_norm_stderr": 0.04166567577101579 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.3253968253968254, + "acc_stderr": 0.024130158299762613, + "acc_norm": 0.3253968253968254, + "acc_norm_stderr": 0.024130158299762613 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.30952380952380953, + "acc_stderr": 0.04134913018303316, + "acc_norm": 0.30952380952380953, + "acc_norm_stderr": 0.04134913018303316 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.47, + "acc_stderr": 0.05016135580465919, + "acc_norm": 0.47, + "acc_norm_stderr": 0.05016135580465919 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.667741935483871, + "acc_stderr": 0.0267955608481228, + "acc_norm": 0.667741935483871, + "acc_norm_stderr": 0.0267955608481228 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.4088669950738916, + "acc_stderr": 0.034590588158832314, + "acc_norm": 0.4088669950738916, + "acc_norm_stderr": 0.034590588158832314 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.57, + "acc_stderr": 0.04975698519562428, + "acc_norm": 0.57, + "acc_norm_stderr": 0.04975698519562428 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.6909090909090909, + "acc_stderr": 0.036085410115739666, + "acc_norm": 0.6909090909090909, + "acc_norm_stderr": 0.036085410115739666 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.7222222222222222, + "acc_stderr": 0.03191178226713547, + "acc_norm": 0.7222222222222222, + "acc_norm_stderr": 0.03191178226713547 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.7823834196891192, + "acc_stderr": 0.029778663037752954, + "acc_norm": 0.7823834196891192, + "acc_norm_stderr": 0.029778663037752954 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.5666666666666667, + "acc_stderr": 0.025124653525885124, + "acc_norm": 0.5666666666666667, + "acc_norm_stderr": 0.025124653525885124 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.2518518518518518, + "acc_stderr": 0.026466117538959916, + "acc_norm": 0.2518518518518518, + "acc_norm_stderr": 0.026466117538959916 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.5798319327731093, + "acc_stderr": 0.03206183783236152, + "acc_norm": 0.5798319327731093, + "acc_norm_stderr": 0.03206183783236152 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.3708609271523179, + "acc_stderr": 0.03943966699183629, + "acc_norm": 0.3708609271523179, + "acc_norm_stderr": 0.03943966699183629 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.7376146788990826, + "acc_stderr": 0.018861885021534734, + "acc_norm": 0.7376146788990826, + "acc_norm_stderr": 0.018861885021534734 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.4212962962962963, + "acc_stderr": 0.03367462138896078, + "acc_norm": 0.4212962962962963, + "acc_norm_stderr": 0.03367462138896078 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.7549019607843137, + "acc_stderr": 0.030190282453501943, + "acc_norm": 0.7549019607843137, + "acc_norm_stderr": 0.030190282453501943 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.7637130801687764, + "acc_stderr": 0.027652153144159267, + "acc_norm": 0.7637130801687764, + "acc_norm_stderr": 0.027652153144159267 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.6457399103139013, + "acc_stderr": 0.03210062154134987, + 
"acc_norm": 0.6457399103139013, + "acc_norm_stderr": 0.03210062154134987 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.6412213740458015, + "acc_stderr": 0.042067393138649066, + "acc_norm": 0.6412213740458015, + "acc_norm_stderr": 0.042067393138649066 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.7272727272727273, + "acc_stderr": 0.04065578140908705, + "acc_norm": 0.7272727272727273, + "acc_norm_stderr": 0.04065578140908705 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.6851851851851852, + "acc_stderr": 0.04489931073591312, + "acc_norm": 0.6851851851851852, + "acc_norm_stderr": 0.04489931073591312 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.6932515337423313, + "acc_stderr": 0.036230899157241474, + "acc_norm": 0.6932515337423313, + "acc_norm_stderr": 0.036230899157241474 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.375, + "acc_stderr": 0.04595091388086298, + "acc_norm": 0.375, + "acc_norm_stderr": 0.04595091388086298 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.7669902912621359, + "acc_stderr": 0.041858325989283136, + "acc_norm": 0.7669902912621359, + "acc_norm_stderr": 0.041858325989283136 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.8547008547008547, + "acc_stderr": 0.023086635086841407, + "acc_norm": 0.8547008547008547, + "acc_norm_stderr": 0.023086635086841407 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.55, + "acc_stderr": 0.049999999999999996, + "acc_norm": 0.55, + "acc_norm_stderr": 0.049999999999999996 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.7471264367816092, + "acc_stderr": 0.015543377313719681, + "acc_norm": 0.7471264367816092, + "acc_norm_stderr": 0.015543377313719681 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.630057803468208, + "acc_stderr": 0.025992472029306386, + "acc_norm": 0.630057803468208, + "acc_norm_stderr": 0.025992472029306386 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.423463687150838, + "acc_stderr": 0.0165254258987735, + "acc_norm": 0.423463687150838, + "acc_norm_stderr": 0.0165254258987735 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.5915032679738562, + "acc_stderr": 0.028146405993096358, + "acc_norm": 0.5915032679738562, + "acc_norm_stderr": 0.028146405993096358 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.7041800643086816, + "acc_stderr": 0.02592237178881876, + "acc_norm": 0.7041800643086816, + "acc_norm_stderr": 0.02592237178881876 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.6265432098765432, + "acc_stderr": 0.026915003011380167, + "acc_norm": 0.6265432098765432, + "acc_norm_stderr": 0.026915003011380167 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.43617021276595747, + "acc_stderr": 0.029583452036284066, + "acc_norm": 0.43617021276595747, + "acc_norm_stderr": 0.029583452036284066 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.4361147327249022, + "acc_stderr": 0.012665568135455333, + "acc_norm": 0.4361147327249022, + "acc_norm_stderr": 0.012665568135455333 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.5551470588235294, + "acc_stderr": 0.030187532060329387, + "acc_norm": 0.5551470588235294, + "acc_norm_stderr": 0.030187532060329387 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.5784313725490197, + "acc_stderr": 0.019977422600227474, + "acc_norm": 0.5784313725490197, + "acc_norm_stderr": 0.019977422600227474 + }, + 
"harness|hendrycksTest-public_relations|5": { + "acc": 0.6636363636363637, + "acc_stderr": 0.04525393596302506, + "acc_norm": 0.6636363636363637, + "acc_norm_stderr": 0.04525393596302506 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.6122448979591837, + "acc_stderr": 0.031192230726795656, + "acc_norm": 0.6122448979591837, + "acc_norm_stderr": 0.031192230726795656 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.7761194029850746, + "acc_stderr": 0.029475250236017204, + "acc_norm": 0.7761194029850746, + "acc_norm_stderr": 0.029475250236017204 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.84, + "acc_stderr": 0.03684529491774709, + "acc_norm": 0.84, + "acc_norm_stderr": 0.03684529491774709 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.4879518072289157, + "acc_stderr": 0.0389136449583582, + "acc_norm": 0.4879518072289157, + "acc_norm_stderr": 0.0389136449583582 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.7953216374269005, + "acc_stderr": 0.030944459778533207, + "acc_norm": 0.7953216374269005, + "acc_norm_stderr": 0.030944459778533207 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.3463892288861689, + "mc1_stderr": 0.016656997109125136, + "mc2": 0.4989885484393867, + "mc2_stderr": 0.015462798550011949 + }, + "all": { + "acc": 0.5659294537760093, + "acc_stderr": 0.03428687493287769, + "acc_norm": 0.5697678820389006, + "acc_norm_stderr": 0.03426670753222878, + "mc1": 0.3463892288861689, + "mc1_stderr": 0.016656997109125136, + "mc2": 0.4989885484393867, + "mc2_stderr": 0.015462798550011949 + } + }, + "versions": { + "harness|arc:challenge|25": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + 
"harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "all": 0 + }, + "config_tasks": { + "harness|arc:challenge": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + 
"harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task" + }, + "summary_tasks": { + "harness|arc:challenge|25": { + "hashes": { + "hash_examples": "17b0cae357c0259e", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "61571bf68d6d89aa", + "hash_cont_tokens": "8210decc6ff6f7df" + }, + "truncated": 0, + "non-truncated": 4687, + "padded": 4687, + "non-padded": 0, + "effective_few_shots": 25.0, + "num_truncated_few_shots": 0 + }, + "harness|hellaswag|10": { + "hashes": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "29906669b1c7054a", + "hash_cont_tokens": "b3b9e9017afa63af" + }, + "truncated": 0, + "non-truncated": 40168, + "padded": 40113, + "non-padded": 55, + "effective_few_shots": 10.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hashes": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "c54ff61ad0273dd7", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-anatomy|5": { + "hashes": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "be31a1e22aef5f90", + "hash_cont_tokens": "f11971a765cb609f" + }, + "truncated": 0, + "non-truncated": 540, + "padded": 540, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-astronomy|5": { + "hashes": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "277a7b1fad566940", + "hash_cont_tokens": "bf30e5d3f48250cb" + }, + "truncated": 0, + "non-truncated": 608, + "padded": 608, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-business_ethics|5": { + "hashes": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "ba552605bc116de5", + "hash_cont_tokens": "bc1dd9b2d995eb61" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + 
"num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hashes": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "428c7563d0b98ab9", + "hash_cont_tokens": "890a119624b3b935" + }, + "truncated": 0, + "non-truncated": 1060, + "padded": 1060, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_biology|5": { + "hashes": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "da036601573942e2", + "hash_cont_tokens": "875cde3af7a0ee14" + }, + "truncated": 0, + "non-truncated": 576, + "padded": 576, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_chemistry|5": { + "hashes": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "94e0196d6aded13d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_computer_science|5": { + "hashes": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "6e4d0f4a8d36690b", + "hash_cont_tokens": "ffc0fe414cdc4a83" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_mathematics|5": { + "hashes": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "614054d17109a25d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_medicine|5": { + "hashes": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "1d633b3cc0524ba8", + "hash_cont_tokens": "1f88b00d41957d82" + }, + "truncated": 0, + "non-truncated": 692, + "padded": 692, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_physics|5": { + "hashes": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "5421d9a1af86cbd4", + "hash_cont_tokens": "f7b8097afc16a47c" + }, + "truncated": 0, + "non-truncated": 408, + "padded": 408, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-computer_security|5": { + "hashes": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "5e6b70ecb333cf18", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hashes": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "c2ef11a87264ceed", + "hash_cont_tokens": "aa0e8bc655f2f641" + }, + "truncated": 0, + "non-truncated": 940, + "padded": 940, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-econometrics|5": { + "hashes": { + "hash_examples": "64d3623b0bfaa43f", + 
"hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "ecaccd912a4c3978", + "hash_cont_tokens": "bfb7e3c3c88313f1" + }, + "truncated": 0, + "non-truncated": 456, + "padded": 456, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hashes": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "1590c84291399be8", + "hash_cont_tokens": "2425a3f084a591ef" + }, + "truncated": 0, + "non-truncated": 580, + "padded": 580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hashes": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "3269597f715b0da1", + "hash_cont_tokens": "f52691aef15a407b" + }, + "truncated": 0, + "non-truncated": 1512, + "padded": 1512, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-formal_logic|5": { + "hashes": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "a2800d20f3ab8d7c", + "hash_cont_tokens": "f515d598d9c21263" + }, + "truncated": 0, + "non-truncated": 504, + "padded": 504, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-global_facts|5": { + "hashes": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "94ed44b3772505ad", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_biology|5": { + "hashes": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "24423acb928db768", + "hash_cont_tokens": "bd85a4156a3613ee" + }, + "truncated": 0, + "non-truncated": 1240, + "padded": 1240, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hashes": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "831ff35c474e5cef", + "hash_cont_tokens": "a95c97af1c14e068" + }, + "truncated": 0, + "non-truncated": 812, + "padded": 812, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hashes": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "8c34e0f2bda77358", + "hash_cont_tokens": "8abfedef914e33c9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hashes": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "f1f73dd687da18d7", + "hash_cont_tokens": "674fc454bdc5ac93" + }, + "truncated": 660, + "non-truncated": 0, + "padded": 0, + "non-padded": 660, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_geography|5": { + "hashes": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "7c5547c7da5bc793", + "hash_cont_tokens": 
"03a5012b916274ea" + }, + "truncated": 0, + "non-truncated": 792, + "padded": 792, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hashes": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "f62991cb6a496b05", + "hash_cont_tokens": "a83effb8f76b7d7c" + }, + "truncated": 0, + "non-truncated": 772, + "padded": 772, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hashes": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "4cef2aff6e3d59ed", + "hash_cont_tokens": "c583432ad27fcfe0" + }, + "truncated": 0, + "non-truncated": 1560, + "padded": 1560, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hashes": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "6e2577ea4082ed2b", + "hash_cont_tokens": "24f5dc613660300b" + }, + "truncated": 0, + "non-truncated": 1080, + "padded": 1080, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hashes": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "c5fc9aeb1079c8e4", + "hash_cont_tokens": "f47f041de50333b9" + }, + "truncated": 0, + "non-truncated": 952, + "padded": 952, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_physics|5": { + "hashes": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "555fc385cffa84ca", + "hash_cont_tokens": "ba2efcd283e938cc" + }, + "truncated": 0, + "non-truncated": 604, + "padded": 604, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hashes": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "febd23cbf9973b7f", + "hash_cont_tokens": "942069cd363844d9" + }, + "truncated": 0, + "non-truncated": 2180, + "padded": 2180, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hashes": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "424b02981230ee83", + "hash_cont_tokens": "955ed42b6f7fa019" + }, + "truncated": 0, + "non-truncated": 864, + "padded": 864, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hashes": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "50c9ff438c85a69e", + "hash_cont_tokens": "cdd0b3dc06d933e5" + }, + "truncated": 816, + "non-truncated": 0, + "padded": 0, + "non-padded": 816, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hashes": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "054824cc474caef5", + "hash_cont_tokens": "9a864184946033ac" + }, + "truncated": 8, + "non-truncated": 940, + "padded": 
940, + "non-padded": 8, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_aging|5": { + "hashes": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "541a75f071dcf579", + "hash_cont_tokens": "142a4a8a1138a214" + }, + "truncated": 0, + "non-truncated": 892, + "padded": 892, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_sexuality|5": { + "hashes": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "04269e5c5a257dd9", + "hash_cont_tokens": "bc54813e809b796d" + }, + "truncated": 0, + "non-truncated": 524, + "padded": 524, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-international_law|5": { + "hashes": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "d93ba9d9d38e4397", + "hash_cont_tokens": "dc45b45fcda18e5d" + }, + "truncated": 0, + "non-truncated": 484, + "padded": 484, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-jurisprudence|5": { + "hashes": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "9eeaccd2698b4f5a", + "hash_cont_tokens": "e3a8cd951b6e3469" + }, + "truncated": 0, + "non-truncated": 432, + "padded": 432, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hashes": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "b4f08f544f2b7576", + "hash_cont_tokens": "1e80dbd30f6453d5" + }, + "truncated": 0, + "non-truncated": 652, + "padded": 648, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-machine_learning|5": { + "hashes": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "900c2a51f1174b9f", + "hash_cont_tokens": "9b37da7777378ca9" + }, + "truncated": 0, + "non-truncated": 448, + "padded": 448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-management|5": { + "hashes": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "6b36efb4689c6eca", + "hash_cont_tokens": "a01d6d39a83c4597" + }, + "truncated": 0, + "non-truncated": 412, + "padded": 412, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-marketing|5": { + "hashes": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "2aaac78a0cfed47a", + "hash_cont_tokens": "6aeaed4d823c98aa" + }, + "truncated": 0, + "non-truncated": 936, + "padded": 936, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-medical_genetics|5": { + "hashes": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "886ca823b41c094a", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-miscellaneous|5": { + "hashes": { + "hash_examples": 
"41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "72fd71de7675e7d0", + "hash_cont_tokens": "9b0ab02a64603081" + }, + "truncated": 0, + "non-truncated": 3132, + "padded": 3132, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_disputes|5": { + "hashes": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "f3ca0dd8e7a1eb09", + "hash_cont_tokens": "8badf768f7b0467a" + }, + "truncated": 0, + "non-truncated": 1384, + "padded": 1354, + "non-padded": 30, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hashes": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "3e793631e951f23c", + "hash_cont_tokens": "32ae620376b2bbba" + }, + "truncated": 0, + "non-truncated": 3580, + "padded": 3580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-nutrition|5": { + "hashes": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "59753c2144ea93af", + "hash_cont_tokens": "3071def75bacc404" + }, + "truncated": 0, + "non-truncated": 1224, + "padded": 1224, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-philosophy|5": { + "hashes": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "bd8d3dbed15a8c34", + "hash_cont_tokens": "9f6ff69d23a48783" + }, + "truncated": 0, + "non-truncated": 1244, + "padded": 1244, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-prehistory|5": { + "hashes": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "3573cd87facbb7c5", + "hash_cont_tokens": "de469d2b981e32a3" + }, + "truncated": 0, + "non-truncated": 1296, + "padded": 1296, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_accounting|5": { + "hashes": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "17e721bc1a7cbb47", + "hash_cont_tokens": "c46f74d2dfc7b13b" + }, + "truncated": 0, + "non-truncated": 1128, + "padded": 1128, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_law|5": { + "hashes": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "9178e10bd0763ec4", + "hash_cont_tokens": "2e590029ef41fbcd" + }, + "truncated": 604, + "non-truncated": 5532, + "padded": 5524, + "non-padded": 612, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_medicine|5": { + "hashes": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "f5a22012a54f70ea", + "hash_cont_tokens": "fe35cfa9c6ca802e" + }, + "truncated": 0, + "non-truncated": 1088, + "padded": 1088, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_psychology|5": { + "hashes": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "0dfb73a8eb3f692c", + "hash_cont_tokens": 
"f020fbddf72c8652" + }, + "truncated": 0, + "non-truncated": 2448, + "padded": 2448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-public_relations|5": { + "hashes": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "1710c6ba4c9f3cbd", + "hash_cont_tokens": "568f585a259965c1" + }, + "truncated": 0, + "non-truncated": 440, + "padded": 440, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-security_studies|5": { + "hashes": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "d49711415961ced7", + "hash_cont_tokens": "cc6fd7cccd64cd5d" + }, + "truncated": 0, + "non-truncated": 980, + "padded": 980, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-sociology|5": { + "hashes": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "828999f7624cbe7e", + "hash_cont_tokens": "c3a3bdfd177eed5b" + }, + "truncated": 0, + "non-truncated": 804, + "padded": 804, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hashes": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "42054621e718dbee", + "hash_cont_tokens": "2568d0e8e36fa959" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-virology|5": { + "hashes": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "6c4f0aa4dc859c04", + "hash_cont_tokens": "926cf60b0891f374" + }, + "truncated": 0, + "non-truncated": 664, + "padded": 664, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-world_religions|5": { + "hashes": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "6c75d44e092ff24f", + "hash_cont_tokens": "c525a5de974c1ea3" + }, + "truncated": 0, + "non-truncated": 684, + "padded": 684, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|truthfulqa:mc|0": { + "hashes": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "2738d7ed7075faa7", + "hash_cont_tokens": "c014154380b74b9e" + }, + "truncated": 0, + "non-truncated": 9996, + "padded": 9996, + "non-padded": 0, + "effective_few_shots": 0.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "d84d18e9a963753d", + "hash_full_prompts": "12b540783521a8e6", + "hash_input_tokens": "6fecf578c508db6a", + "hash_cont_tokens": "fb1646e2bdd5fc38" + }, + "total_evaluation_time_secondes": "16214.730946063995", + "truncated": 2088, + "non-truncated": 108931, + "padded": 108834, + "non-padded": 2185, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/TheBloke/Wizard-Vicuna-30B-Uncensored-GPTQ/results_2023-11-07T20-55-33.884727.json b/eval-results/TheBloke/Wizard-Vicuna-30B-Uncensored-GPTQ/results_2023-11-07T20-55-33.884727.json new file mode 100644 index 0000000000000000000000000000000000000000..5fc5973e48fd67cb2861d2ee404bd477b86c7b13 --- /dev/null +++ 
b/eval-results/TheBloke/Wizard-Vicuna-30B-Uncensored-GPTQ/results_2023-11-07T20-55-33.884727.json @@ -0,0 +1,107 @@ +{ + "config_general": { + "lighteval_sha": "167773f1d5d1647c60dadc31c9e731ab7dbcbbad", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "", + "model_name": "TheBloke/Wizard-Vicuna-30B-Uncensored-GPTQ", + "model_sha": "3af62c796031ef5a6ece16c163a8444609d9c376", + "model_dtype": "torch.float16", + "model_size": "15.83 GB" + }, + "results": { + "harness|drop|3": { + "em": 0.22074244966442952, + "em_stderr": 0.004247399285462808, + "f1": 0.29961619127516814, + "f1_stderr": 0.004236911466589284 + }, + "harness|gsm8k|5": { + "acc": 0.23275208491281274, + "acc_stderr": 0.011640106217202953 + }, + "harness|winogrande|5": { + "acc": 0.77663772691397, + "acc_stderr": 0.011705697565205191 + }, + "all": { + "em": 0.22074244966442952, + "em_stderr": 0.004247399285462808, + "f1": 0.29961619127516814, + "f1_stderr": 0.004236911466589284, + "acc": 0.5046949059133914, + "acc_stderr": 0.011672901891204072 + } + }, + "versions": { + "all": 0, + "harness|drop|3": 1, + "harness|gsm8k|5": 0, + "harness|winogrande|5": 0 + }, + "config_tasks": { + "harness|drop": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|drop|3": { + "hashes": { + "hash_examples": "1d27416e8324e9a3", + "hash_full_prompts": "a5513ff9a741b385", + "hash_input_tokens": "61b608e0b5ceed76", + "hash_cont_tokens": "f52ad7e4d2d48d72" + }, + "truncated": 1263, + "non_truncated": 8273, + "padded": 0, + "non_padded": 9536, + "effective_few_shots": 3.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "bda342e47b5099b2", + "hash_cont_tokens": "f059774aa155f188" + }, + "truncated": 0, + "non_truncated": 1319, + "padded": 0, + "non_padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "c0bedf98cb040854", + "hash_cont_tokens": "f08975ad6f2d5864" + }, + "truncated": 0, + "non_truncated": 1267, + "padded": 2432, + "non_padded": 102, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "9b4d8993161e637d", + "hash_full_prompts": "08215e527b7e60a5", + "hash_input_tokens": "80afe720f936f8d2", + "hash_cont_tokens": "1818cd8e71ac89d3" + }, + "truncated": 1263, + "non_truncated": 10859, + "padded": 2432, + "non_padded": 10957, + "num_truncated_few_shots": 0, + "total_evaluation_time_secondes": 0 + } +} \ No newline at end of file diff --git a/eval-results/TheBloke/Wizard-Vicuna-30B-Uncensored-fp16/results_2023-07-19T22-48-26.116631.json b/eval-results/TheBloke/Wizard-Vicuna-30B-Uncensored-fp16/results_2023-07-19T22-48-26.116631.json new file mode 100644 index 0000000000000000000000000000000000000000..705c692fa4a3e8c7fe6e8f933c5028c2a8a90be9 --- /dev/null +++ b/eval-results/TheBloke/Wizard-Vicuna-30B-Uncensored-fp16/results_2023-07-19T22-48-26.116631.json @@ -0,0 +1,871 @@ +{ + "results": { + "harness|arc:challenge|25": { + "acc": 0.5989761092150171, + "acc_stderr": 0.014322255790719867, + "acc_norm": 0.621160409556314, + "acc_norm_stderr": 0.014175915490000322 + }, + "harness|hellaswag|10": { + "acc": 0.6386178052180841, + 
"acc_stderr": 0.004794191785967947, + "acc_norm": 0.8344951204939255, + "acc_norm_stderr": 0.003708760752685524 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.33, + "acc_stderr": 0.04725815626252605, + "acc_norm": 0.33, + "acc_norm_stderr": 0.04725815626252605 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.5333333333333333, + "acc_stderr": 0.043097329010363554, + "acc_norm": 0.5333333333333333, + "acc_norm_stderr": 0.043097329010363554 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.5986842105263158, + "acc_stderr": 0.039889037033362836, + "acc_norm": 0.5986842105263158, + "acc_norm_stderr": 0.039889037033362836 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.6, + "acc_stderr": 0.049236596391733084, + "acc_norm": 0.6, + "acc_norm_stderr": 0.049236596391733084 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.6150943396226415, + "acc_stderr": 0.02994649856769995, + "acc_norm": 0.6150943396226415, + "acc_norm_stderr": 0.02994649856769995 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.6041666666666666, + "acc_stderr": 0.04089465449325582, + "acc_norm": 0.6041666666666666, + "acc_norm_stderr": 0.04089465449325582 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.4, + "acc_stderr": 0.049236596391733084, + "acc_norm": 0.4, + "acc_norm_stderr": 0.049236596391733084 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.46, + "acc_stderr": 0.05009082659620332, + "acc_norm": 0.46, + "acc_norm_stderr": 0.05009082659620332 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.39, + "acc_stderr": 0.04902071300001975, + "acc_norm": 0.39, + "acc_norm_stderr": 0.04902071300001975 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.5317919075144508, + "acc_stderr": 0.03804749744364764, + "acc_norm": 0.5317919075144508, + "acc_norm_stderr": 0.03804749744364764 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.29411764705882354, + "acc_stderr": 0.04533838195929776, + "acc_norm": 0.29411764705882354, + "acc_norm_stderr": 0.04533838195929776 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.65, + "acc_stderr": 0.0479372485441102, + "acc_norm": 0.65, + "acc_norm_stderr": 0.0479372485441102 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.502127659574468, + "acc_stderr": 0.03268572658667492, + "acc_norm": 0.502127659574468, + "acc_norm_stderr": 0.03268572658667492 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.3157894736842105, + "acc_stderr": 0.04372748290278007, + "acc_norm": 0.3157894736842105, + "acc_norm_stderr": 0.04372748290278007 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.5103448275862069, + "acc_stderr": 0.04165774775728763, + "acc_norm": 0.5103448275862069, + "acc_norm_stderr": 0.04165774775728763 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.3439153439153439, + "acc_stderr": 0.024464426625596437, + "acc_norm": 0.3439153439153439, + "acc_norm_stderr": 0.024464426625596437 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.31746031746031744, + "acc_stderr": 0.0416345303130286, + "acc_norm": 0.31746031746031744, + "acc_norm_stderr": 0.0416345303130286 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.4, + "acc_stderr": 0.04923659639173309, + "acc_norm": 0.4, + "acc_norm_stderr": 0.04923659639173309 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.6516129032258065, + "acc_stderr": 0.027104826328100944, + "acc_norm": 
0.6516129032258065, + "acc_norm_stderr": 0.027104826328100944 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.39901477832512317, + "acc_stderr": 0.03445487686264715, + "acc_norm": 0.39901477832512317, + "acc_norm_stderr": 0.03445487686264715 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.61, + "acc_stderr": 0.04902071300001974, + "acc_norm": 0.61, + "acc_norm_stderr": 0.04902071300001974 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.7212121212121212, + "acc_stderr": 0.03501438706296781, + "acc_norm": 0.7212121212121212, + "acc_norm_stderr": 0.03501438706296781 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.7626262626262627, + "acc_stderr": 0.030313710538198906, + "acc_norm": 0.7626262626262627, + "acc_norm_stderr": 0.030313710538198906 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.8082901554404145, + "acc_stderr": 0.028408953626245282, + "acc_norm": 0.8082901554404145, + "acc_norm_stderr": 0.028408953626245282 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.5717948717948718, + "acc_stderr": 0.025088301454694834, + "acc_norm": 0.5717948717948718, + "acc_norm_stderr": 0.025088301454694834 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.26666666666666666, + "acc_stderr": 0.026962424325073835, + "acc_norm": 0.26666666666666666, + "acc_norm_stderr": 0.026962424325073835 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.6134453781512605, + "acc_stderr": 0.03163145807552379, + "acc_norm": 0.6134453781512605, + "acc_norm_stderr": 0.03163145807552379 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.32450331125827814, + "acc_stderr": 0.03822746937658753, + "acc_norm": 0.32450331125827814, + "acc_norm_stderr": 0.03822746937658753 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.781651376146789, + "acc_stderr": 0.017712600528722717, + "acc_norm": 0.781651376146789, + "acc_norm_stderr": 0.017712600528722717 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.4444444444444444, + "acc_stderr": 0.03388857118502326, + "acc_norm": 0.4444444444444444, + "acc_norm_stderr": 0.03388857118502326 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.7598039215686274, + "acc_stderr": 0.02998373305591362, + "acc_norm": 0.7598039215686274, + "acc_norm_stderr": 0.02998373305591362 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.7890295358649789, + "acc_stderr": 0.02655837250266192, + "acc_norm": 0.7890295358649789, + "acc_norm_stderr": 0.02655837250266192 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.6412556053811659, + "acc_stderr": 0.032190792004199956, + "acc_norm": 0.6412556053811659, + "acc_norm_stderr": 0.032190792004199956 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.6793893129770993, + "acc_stderr": 0.04093329229834278, + "acc_norm": 0.6793893129770993, + "acc_norm_stderr": 0.04093329229834278 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.7520661157024794, + "acc_stderr": 0.03941897526516304, + "acc_norm": 0.7520661157024794, + "acc_norm_stderr": 0.03941897526516304 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.6944444444444444, + "acc_stderr": 0.044531975073749834, + "acc_norm": 0.6944444444444444, + "acc_norm_stderr": 0.044531975073749834 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.6993865030674846, + "acc_stderr": 
0.03602511318806771, + "acc_norm": 0.6993865030674846, + "acc_norm_stderr": 0.03602511318806771 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.48214285714285715, + "acc_stderr": 0.047427623612430116, + "acc_norm": 0.48214285714285715, + "acc_norm_stderr": 0.047427623612430116 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.7864077669902912, + "acc_stderr": 0.040580420156460344, + "acc_norm": 0.7864077669902912, + "acc_norm_stderr": 0.040580420156460344 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.8504273504273504, + "acc_stderr": 0.023365051491753715, + "acc_norm": 0.8504273504273504, + "acc_norm_stderr": 0.023365051491753715 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.64, + "acc_stderr": 0.048241815132442176, + "acc_norm": 0.64, + "acc_norm_stderr": 0.048241815132442176 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.7637292464878672, + "acc_stderr": 0.015190473717037497, + "acc_norm": 0.7637292464878672, + "acc_norm_stderr": 0.015190473717037497 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.6589595375722543, + "acc_stderr": 0.02552247463212161, + "acc_norm": 0.6589595375722543, + "acc_norm_stderr": 0.02552247463212161 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.4145251396648045, + "acc_stderr": 0.016476342210254, + "acc_norm": 0.4145251396648045, + "acc_norm_stderr": 0.016476342210254 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.630718954248366, + "acc_stderr": 0.027634176689602663, + "acc_norm": 0.630718954248366, + "acc_norm_stderr": 0.027634176689602663 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.6881028938906752, + "acc_stderr": 0.02631185807185416, + "acc_norm": 0.6881028938906752, + "acc_norm_stderr": 0.02631185807185416 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.6697530864197531, + "acc_stderr": 0.026168298456732846, + "acc_norm": 0.6697530864197531, + "acc_norm_stderr": 0.026168298456732846 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.4574468085106383, + "acc_stderr": 0.02971928127223684, + "acc_norm": 0.4574468085106383, + "acc_norm_stderr": 0.02971928127223684 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.4576271186440678, + "acc_stderr": 0.012724296550980188, + "acc_norm": 0.4576271186440678, + "acc_norm_stderr": 0.012724296550980188 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.5698529411764706, + "acc_stderr": 0.030074971917302875, + "acc_norm": 0.5698529411764706, + "acc_norm_stderr": 0.030074971917302875 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.6045751633986928, + "acc_stderr": 0.01978046595477753, + "acc_norm": 0.6045751633986928, + "acc_norm_stderr": 0.01978046595477753 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.6454545454545455, + "acc_stderr": 0.045820048415054174, + "acc_norm": 0.6454545454545455, + "acc_norm_stderr": 0.045820048415054174 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.6, + "acc_stderr": 0.03136250240935893, + "acc_norm": 0.6, + "acc_norm_stderr": 0.03136250240935893 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.7860696517412935, + "acc_stderr": 0.028996909693328906, + "acc_norm": 0.7860696517412935, + "acc_norm_stderr": 0.028996909693328906 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.82, + "acc_stderr": 0.03861229196653694, + "acc_norm": 0.82, + "acc_norm_stderr": 0.03861229196653694 + }, + "harness|hendrycksTest-virology|5": { + "acc": 
0.5180722891566265, + "acc_stderr": 0.03889951252827216, + "acc_norm": 0.5180722891566265, + "acc_norm_stderr": 0.03889951252827216 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.783625730994152, + "acc_stderr": 0.03158149539338734, + "acc_norm": 0.783625730994152, + "acc_norm_stderr": 0.03158149539338734 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.3525091799265606, + "mc1_stderr": 0.016724646380756547, + "mc2": 0.5080755505916852, + "mc2_stderr": 0.015466614437128033 + }, + "all": { + "acc": 0.5836025157897442, + "acc_stderr": 0.034143683896128325, + "acc_norm": 0.5872984753764754, + "acc_norm_stderr": 0.034122806415891, + "mc1": 0.3525091799265606, + "mc1_stderr": 0.016724646380756547, + "mc2": 0.5080755505916852, + "mc2_stderr": 0.015466614437128033 + } + }, + "versions": { + "harness|arc:challenge|25": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + 
"harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "all": 0 + }, + "config": { + "model_name": "TheBloke/Wizard-Vicuna-30B-Uncensored-fp16", + "model_sha": "c7b7cecb5a314fc66deebabcb67c230a3fbe84f7", + "model_dtype": "torch.float16", + "lighteval_sha": "43cff840721bd0214adb4e29236a5e2ca1813937", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null + }, + "task_config": { + "harness|arc:challenge": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + 
"harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task" + }, + "hashes": { + "harness|arc:challenge|25": { + "hash_examples": "fb8c51b1872daeda", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "61571bf68d6d89aa", + "hash_cont_tokens": "8210decc6ff6f7df" + }, + "harness|hellaswag|10": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "29906669b1c7054a", + "hash_cont_tokens": "b3b9e9017afa63af" + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "c54ff61ad0273dd7", + "hash_cont_tokens": "50421e30bef398f9" + }, + "harness|hendrycksTest-anatomy|5": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "be31a1e22aef5f90", + "hash_cont_tokens": "f11971a765cb609f" + }, + "harness|hendrycksTest-astronomy|5": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "277a7b1fad566940", + "hash_cont_tokens": "bf30e5d3f48250cb" + }, + "harness|hendrycksTest-business_ethics|5": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "ba552605bc116de5", + "hash_cont_tokens": "bc1dd9b2d995eb61" + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "428c7563d0b98ab9", + "hash_cont_tokens": "890a119624b3b935" + }, + "harness|hendrycksTest-college_biology|5": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "da036601573942e2", + "hash_cont_tokens": "875cde3af7a0ee14" + }, + "harness|hendrycksTest-college_chemistry|5": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "94e0196d6aded13d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "harness|hendrycksTest-college_computer_science|5": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "6e4d0f4a8d36690b", + "hash_cont_tokens": "ffc0fe414cdc4a83" + }, + "harness|hendrycksTest-college_mathematics|5": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "614054d17109a25d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "harness|hendrycksTest-college_medicine|5": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "1d633b3cc0524ba8", + "hash_cont_tokens": "1f88b00d41957d82" + }, + "harness|hendrycksTest-college_physics|5": { + "hash_examples": "875dd26d22655b0d", + 
"hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "5421d9a1af86cbd4", + "hash_cont_tokens": "f7b8097afc16a47c" + }, + "harness|hendrycksTest-computer_security|5": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "5e6b70ecb333cf18", + "hash_cont_tokens": "50421e30bef398f9" + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "c2ef11a87264ceed", + "hash_cont_tokens": "aa0e8bc655f2f641" + }, + "harness|hendrycksTest-econometrics|5": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "ecaccd912a4c3978", + "hash_cont_tokens": "bfb7e3c3c88313f1" + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "1590c84291399be8", + "hash_cont_tokens": "2425a3f084a591ef" + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "3269597f715b0da1", + "hash_cont_tokens": "f52691aef15a407b" + }, + "harness|hendrycksTest-formal_logic|5": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "a2800d20f3ab8d7c", + "hash_cont_tokens": "f515d598d9c21263" + }, + "harness|hendrycksTest-global_facts|5": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "94ed44b3772505ad", + "hash_cont_tokens": "50421e30bef398f9" + }, + "harness|hendrycksTest-high_school_biology|5": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "24423acb928db768", + "hash_cont_tokens": "bd85a4156a3613ee" + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "831ff35c474e5cef", + "hash_cont_tokens": "a95c97af1c14e068" + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "8c34e0f2bda77358", + "hash_cont_tokens": "8abfedef914e33c9" + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "f1f73dd687da18d7", + "hash_cont_tokens": "674fc454bdc5ac93" + }, + "harness|hendrycksTest-high_school_geography|5": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "7c5547c7da5bc793", + "hash_cont_tokens": "03a5012b916274ea" + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "f62991cb6a496b05", + "hash_cont_tokens": "a83effb8f76b7d7c" + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "4cef2aff6e3d59ed", + "hash_cont_tokens": "c583432ad27fcfe0" + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "6e2577ea4082ed2b", + "hash_cont_tokens": "24f5dc613660300b" + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + 
"hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "c5fc9aeb1079c8e4", + "hash_cont_tokens": "f47f041de50333b9" + }, + "harness|hendrycksTest-high_school_physics|5": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "555fc385cffa84ca", + "hash_cont_tokens": "ba2efcd283e938cc" + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "febd23cbf9973b7f", + "hash_cont_tokens": "942069cd363844d9" + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "424b02981230ee83", + "hash_cont_tokens": "955ed42b6f7fa019" + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "50c9ff438c85a69e", + "hash_cont_tokens": "cdd0b3dc06d933e5" + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "054824cc474caef5", + "hash_cont_tokens": "9a864184946033ac" + }, + "harness|hendrycksTest-human_aging|5": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "541a75f071dcf579", + "hash_cont_tokens": "142a4a8a1138a214" + }, + "harness|hendrycksTest-human_sexuality|5": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "04269e5c5a257dd9", + "hash_cont_tokens": "bc54813e809b796d" + }, + "harness|hendrycksTest-international_law|5": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "d93ba9d9d38e4397", + "hash_cont_tokens": "dc45b45fcda18e5d" + }, + "harness|hendrycksTest-jurisprudence|5": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "9eeaccd2698b4f5a", + "hash_cont_tokens": "e3a8cd951b6e3469" + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "b4f08f544f2b7576", + "hash_cont_tokens": "1e80dbd30f6453d5" + }, + "harness|hendrycksTest-machine_learning|5": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "900c2a51f1174b9f", + "hash_cont_tokens": "9b37da7777378ca9" + }, + "harness|hendrycksTest-management|5": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "6b36efb4689c6eca", + "hash_cont_tokens": "a01d6d39a83c4597" + }, + "harness|hendrycksTest-marketing|5": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "2aaac78a0cfed47a", + "hash_cont_tokens": "6aeaed4d823c98aa" + }, + "harness|hendrycksTest-medical_genetics|5": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "886ca823b41c094a", + "hash_cont_tokens": "50421e30bef398f9" + }, + "harness|hendrycksTest-miscellaneous|5": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "72fd71de7675e7d0", + "hash_cont_tokens": "9b0ab02a64603081" + }, + "harness|hendrycksTest-moral_disputes|5": { + "hash_examples": "3171c13ba3c594c4", + 
"hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "f3ca0dd8e7a1eb09", + "hash_cont_tokens": "8badf768f7b0467a" + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "3e793631e951f23c", + "hash_cont_tokens": "32ae620376b2bbba" + }, + "harness|hendrycksTest-nutrition|5": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "59753c2144ea93af", + "hash_cont_tokens": "3071def75bacc404" + }, + "harness|hendrycksTest-philosophy|5": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "bd8d3dbed15a8c34", + "hash_cont_tokens": "9f6ff69d23a48783" + }, + "harness|hendrycksTest-prehistory|5": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "3573cd87facbb7c5", + "hash_cont_tokens": "de469d2b981e32a3" + }, + "harness|hendrycksTest-professional_accounting|5": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "17e721bc1a7cbb47", + "hash_cont_tokens": "c46f74d2dfc7b13b" + }, + "harness|hendrycksTest-professional_law|5": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "9178e10bd0763ec4", + "hash_cont_tokens": "2e590029ef41fbcd" + }, + "harness|hendrycksTest-professional_medicine|5": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "f5a22012a54f70ea", + "hash_cont_tokens": "fe35cfa9c6ca802e" + }, + "harness|hendrycksTest-professional_psychology|5": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "0dfb73a8eb3f692c", + "hash_cont_tokens": "f020fbddf72c8652" + }, + "harness|hendrycksTest-public_relations|5": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "1710c6ba4c9f3cbd", + "hash_cont_tokens": "568f585a259965c1" + }, + "harness|hendrycksTest-security_studies|5": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "d49711415961ced7", + "hash_cont_tokens": "cc6fd7cccd64cd5d" + }, + "harness|hendrycksTest-sociology|5": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "828999f7624cbe7e", + "hash_cont_tokens": "c3a3bdfd177eed5b" + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "42054621e718dbee", + "hash_cont_tokens": "2568d0e8e36fa959" + }, + "harness|hendrycksTest-virology|5": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "6c4f0aa4dc859c04", + "hash_cont_tokens": "926cf60b0891f374" + }, + "harness|hendrycksTest-world_religions|5": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "6c75d44e092ff24f", + "hash_cont_tokens": "c525a5de974c1ea3" + }, + "harness|truthfulqa:mc|0": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "2738d7ed7075faa7", + "hash_cont_tokens": "c014154380b74b9e" + } + } +} \ No newline at end of file diff --git a/eval-results/TheBloke/Wizard-Vicuna-30B-Uncensored-fp16/results_2023-10-19T13-45-18.299512.json 
b/eval-results/TheBloke/Wizard-Vicuna-30B-Uncensored-fp16/results_2023-10-19T13-45-18.299512.json new file mode 100644 index 0000000000000000000000000000000000000000..40d3bcdf86116fb2dc3b6475b876eb2d5de57ab5 --- /dev/null +++ b/eval-results/TheBloke/Wizard-Vicuna-30B-Uncensored-fp16/results_2023-10-19T13-45-18.299512.json @@ -0,0 +1,107 @@ +{ + "config_general": { + "model_name": "TheBloke/Wizard-Vicuna-30B-Uncensored-fp16", + "model_sha": "c7b7cecb5a314fc66deebabcb67c230a3fbe84f7", + "model_size": "60.65 GB", + "model_dtype": "torch.float16", + "lighteval_sha": "0f318ecf002208468154899217b3ba7c6ae09374", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|drop|3": { + "em": 0.18162751677852348, + "em_stderr": 0.0039482621737543045, + "f1": 0.2674087667785243, + "f1_stderr": 0.004012090110572664 + }, + "harness|gsm8k|5": { + "acc": 0.1425322213798332, + "acc_stderr": 0.009629588445673819 + }, + "harness|winogrande|5": { + "acc": 0.7845303867403315, + "acc_stderr": 0.011555295286059279 + }, + "all": { + "em": 0.18162751677852348, + "em_stderr": 0.0039482621737543045, + "f1": 0.2674087667785243, + "f1_stderr": 0.004012090110572664, + "acc": 0.46353130406008236, + "acc_stderr": 0.01059244186586655 + } + }, + "versions": { + "harness|drop|3": 1, + "harness|gsm8k|5": 0, + "harness|winogrande|5": 0, + "all": 0 + }, + "config_tasks": { + "harness|drop": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|drop|3": { + "hashes": { + "hash_examples": "1d27416e8324e9a3", + "hash_full_prompts": "a5513ff9a741b385", + "hash_input_tokens": "61b608e0b5ceed76", + "hash_cont_tokens": "a1fb5e26adce6681" + }, + "truncated": 1263, + "non-truncated": 8273, + "padded": 0, + "non-padded": 9536, + "effective_few_shots": 3.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "bda342e47b5099b2", + "hash_cont_tokens": "fd8c9b522b0e1a06" + }, + "truncated": 0, + "non-truncated": 1319, + "padded": 0, + "non-padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "c0bedf98cb040854", + "hash_cont_tokens": "f08975ad6f2d5864" + }, + "truncated": 0, + "non-truncated": 2534, + "padded": 2432, + "non-padded": 102, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "9b4d8993161e637d", + "hash_full_prompts": "08215e527b7e60a5", + "hash_input_tokens": "80afe720f936f8d2", + "hash_cont_tokens": "dbb1cfb72f833e45" + }, + "total_evaluation_time_secondes": "19523.55388736725", + "truncated": 1263, + "non-truncated": 12126, + "padded": 2432, + "non-padded": 10957, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/TheBloke/Wizard-Vicuna-7B-Uncensored-HF/results_2023-07-19T17-11-01.220046.json b/eval-results/TheBloke/Wizard-Vicuna-7B-Uncensored-HF/results_2023-07-19T17-11-01.220046.json new file mode 100644 index 0000000000000000000000000000000000000000..232e69f781354c3a1542a2ea64208b181df6005a --- /dev/null +++ b/eval-results/TheBloke/Wizard-Vicuna-7B-Uncensored-HF/results_2023-07-19T17-11-01.220046.json @@ -0,0 +1,871 @@ +{ + "results": { + 
"harness|arc:challenge|25": { + "acc": 0.5008532423208191, + "acc_stderr": 0.014611369529813276, + "acc_norm": 0.5341296928327645, + "acc_norm_stderr": 0.014577311315231104 + }, + "harness|hellaswag|10": { + "acc": 0.6055566620195181, + "acc_stderr": 0.0048773196836390705, + "acc_norm": 0.7884883489344752, + "acc_norm_stderr": 0.004075456897370669 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.28, + "acc_stderr": 0.04512608598542129, + "acc_norm": 0.28, + "acc_norm_stderr": 0.04512608598542129 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.37777777777777777, + "acc_stderr": 0.04188307537595853, + "acc_norm": 0.37777777777777777, + "acc_norm_stderr": 0.04188307537595853 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.35526315789473684, + "acc_stderr": 0.038947344870133176, + "acc_norm": 0.35526315789473684, + "acc_norm_stderr": 0.038947344870133176 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.42, + "acc_stderr": 0.049604496374885836, + "acc_norm": 0.42, + "acc_norm_stderr": 0.049604496374885836 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.4037735849056604, + "acc_stderr": 0.03019761160019795, + "acc_norm": 0.4037735849056604, + "acc_norm_stderr": 0.03019761160019795 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.3611111111111111, + "acc_stderr": 0.04016660030451233, + "acc_norm": 0.3611111111111111, + "acc_norm_stderr": 0.04016660030451233 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.28, + "acc_stderr": 0.04512608598542126, + "acc_norm": 0.28, + "acc_norm_stderr": 0.04512608598542126 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.28, + "acc_stderr": 0.04512608598542127, + "acc_norm": 0.28, + "acc_norm_stderr": 0.04512608598542127 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.27, + "acc_stderr": 0.044619604333847394, + "acc_norm": 0.27, + "acc_norm_stderr": 0.044619604333847394 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.27167630057803466, + "acc_stderr": 0.0339175032232166, + "acc_norm": 0.27167630057803466, + "acc_norm_stderr": 0.0339175032232166 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.23529411764705882, + "acc_stderr": 0.04220773659171452, + "acc_norm": 0.23529411764705882, + "acc_norm_stderr": 0.04220773659171452 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.5, + "acc_stderr": 0.050251890762960605, + "acc_norm": 0.5, + "acc_norm_stderr": 0.050251890762960605 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.40425531914893614, + "acc_stderr": 0.03208115750788684, + "acc_norm": 0.40425531914893614, + "acc_norm_stderr": 0.03208115750788684 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.2631578947368421, + "acc_stderr": 0.041424397194893624, + "acc_norm": 0.2631578947368421, + "acc_norm_stderr": 0.041424397194893624 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.31724137931034485, + "acc_stderr": 0.03878352372138623, + "acc_norm": 0.31724137931034485, + "acc_norm_stderr": 0.03878352372138623 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.23809523809523808, + "acc_stderr": 0.021935878081184763, + "acc_norm": 0.23809523809523808, + "acc_norm_stderr": 0.021935878081184763 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.2619047619047619, + "acc_stderr": 0.0393253768039287, + "acc_norm": 0.2619047619047619, + "acc_norm_stderr": 0.0393253768039287 + }, + "harness|hendrycksTest-global_facts|5": { + 
"acc": 0.32, + "acc_stderr": 0.046882617226215034, + "acc_norm": 0.32, + "acc_norm_stderr": 0.046882617226215034 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.3774193548387097, + "acc_stderr": 0.027575960723278236, + "acc_norm": 0.3774193548387097, + "acc_norm_stderr": 0.027575960723278236 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.23645320197044334, + "acc_stderr": 0.029896114291733545, + "acc_norm": 0.23645320197044334, + "acc_norm_stderr": 0.029896114291733545 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.38, + "acc_stderr": 0.04878317312145633, + "acc_norm": 0.38, + "acc_norm_stderr": 0.04878317312145633 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.48484848484848486, + "acc_stderr": 0.03902551007374449, + "acc_norm": 0.48484848484848486, + "acc_norm_stderr": 0.03902551007374449 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.42424242424242425, + "acc_stderr": 0.03521224908841583, + "acc_norm": 0.42424242424242425, + "acc_norm_stderr": 0.03521224908841583 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.49222797927461137, + "acc_stderr": 0.03608003225569653, + "acc_norm": 0.49222797927461137, + "acc_norm_stderr": 0.03608003225569653 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.3333333333333333, + "acc_stderr": 0.023901157979402534, + "acc_norm": 0.3333333333333333, + "acc_norm_stderr": 0.023901157979402534 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.24074074074074073, + "acc_stderr": 0.026067159222275794, + "acc_norm": 0.24074074074074073, + "acc_norm_stderr": 0.026067159222275794 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.31932773109243695, + "acc_stderr": 0.0302839955258844, + "acc_norm": 0.31932773109243695, + "acc_norm_stderr": 0.0302839955258844 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.23841059602649006, + "acc_stderr": 0.0347918557259966, + "acc_norm": 0.23841059602649006, + "acc_norm_stderr": 0.0347918557259966 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.47155963302752296, + "acc_stderr": 0.02140261569734804, + "acc_norm": 0.47155963302752296, + "acc_norm_stderr": 0.02140261569734804 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.19444444444444445, + "acc_stderr": 0.026991454502036733, + "acc_norm": 0.19444444444444445, + "acc_norm_stderr": 0.026991454502036733 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.46568627450980393, + "acc_stderr": 0.03501038327635896, + "acc_norm": 0.46568627450980393, + "acc_norm_stderr": 0.03501038327635896 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.4810126582278481, + "acc_stderr": 0.03252375148090448, + "acc_norm": 0.4810126582278481, + "acc_norm_stderr": 0.03252375148090448 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.4663677130044843, + "acc_stderr": 0.033481800170603065, + "acc_norm": 0.4663677130044843, + "acc_norm_stderr": 0.033481800170603065 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.3816793893129771, + "acc_stderr": 0.042607351576445594, + "acc_norm": 0.3816793893129771, + "acc_norm_stderr": 0.042607351576445594 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.5950413223140496, + "acc_stderr": 0.04481137755942469, + "acc_norm": 0.5950413223140496, + "acc_norm_stderr": 0.04481137755942469 + }, + 
"harness|hendrycksTest-jurisprudence|5": { + "acc": 0.42592592592592593, + "acc_stderr": 0.047803436269367894, + "acc_norm": 0.42592592592592593, + "acc_norm_stderr": 0.047803436269367894 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.36809815950920244, + "acc_stderr": 0.03789213935838396, + "acc_norm": 0.36809815950920244, + "acc_norm_stderr": 0.03789213935838396 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.25, + "acc_stderr": 0.04109974682633932, + "acc_norm": 0.25, + "acc_norm_stderr": 0.04109974682633932 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.34951456310679613, + "acc_stderr": 0.047211885060971716, + "acc_norm": 0.34951456310679613, + "acc_norm_stderr": 0.047211885060971716 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.5427350427350427, + "acc_stderr": 0.03263622596380688, + "acc_norm": 0.5427350427350427, + "acc_norm_stderr": 0.03263622596380688 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.48, + "acc_stderr": 0.050211673156867795, + "acc_norm": 0.48, + "acc_norm_stderr": 0.050211673156867795 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.5312899106002554, + "acc_stderr": 0.017844918090468558, + "acc_norm": 0.5312899106002554, + "acc_norm_stderr": 0.017844918090468558 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.41329479768786126, + "acc_stderr": 0.02651126136940924, + "acc_norm": 0.41329479768786126, + "acc_norm_stderr": 0.02651126136940924 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.2424581005586592, + "acc_stderr": 0.014333522059217889, + "acc_norm": 0.2424581005586592, + "acc_norm_stderr": 0.014333522059217889 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.38235294117647056, + "acc_stderr": 0.02782610930728369, + "acc_norm": 0.38235294117647056, + "acc_norm_stderr": 0.02782610930728369 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.38263665594855306, + "acc_stderr": 0.027604689028581982, + "acc_norm": 0.38263665594855306, + "acc_norm_stderr": 0.027604689028581982 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.41358024691358025, + "acc_stderr": 0.027402042040269966, + "acc_norm": 0.41358024691358025, + "acc_norm_stderr": 0.027402042040269966 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.3333333333333333, + "acc_stderr": 0.02812163604063989, + "acc_norm": 0.3333333333333333, + "acc_norm_stderr": 0.02812163604063989 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.30378096479791394, + "acc_stderr": 0.011745787720472467, + "acc_norm": 0.30378096479791394, + "acc_norm_stderr": 0.011745787720472467 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.4632352941176471, + "acc_stderr": 0.03029061918048569, + "acc_norm": 0.4632352941176471, + "acc_norm_stderr": 0.03029061918048569 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.3872549019607843, + "acc_stderr": 0.019706875804085627, + "acc_norm": 0.3872549019607843, + "acc_norm_stderr": 0.019706875804085627 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.41818181818181815, + "acc_stderr": 0.04724577405731571, + "acc_norm": 0.41818181818181815, + "acc_norm_stderr": 0.04724577405731571 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.3020408163265306, + "acc_stderr": 0.029393609319879815, + "acc_norm": 0.3020408163265306, + "acc_norm_stderr": 0.029393609319879815 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.39303482587064675, + "acc_stderr": 
0.0345368246603156, + "acc_norm": 0.39303482587064675, + "acc_norm_stderr": 0.0345368246603156 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.49, + "acc_stderr": 0.05024183937956912, + "acc_norm": 0.49, + "acc_norm_stderr": 0.05024183937956912 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.3192771084337349, + "acc_stderr": 0.0362933532994786, + "acc_norm": 0.3192771084337349, + "acc_norm_stderr": 0.0362933532994786 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.5263157894736842, + "acc_stderr": 0.03829509868994727, + "acc_norm": 0.5263157894736842, + "acc_norm_stderr": 0.03829509868994727 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.2974296205630355, + "mc1_stderr": 0.016002651487361005, + "mc2": 0.4347779023075343, + "mc2_stderr": 0.015366618013678184 + }, + "all": { + "acc": 0.3770694411070861, + "acc_stderr": 0.03457272491689496, + "acc_norm": 0.38073398580923695, + "acc_norm_stderr": 0.03455855676433817, + "mc1": 0.2974296205630355, + "mc1_stderr": 0.016002651487361005, + "mc2": 0.4347779023075343, + "mc2_stderr": 0.015366618013678184 + } + }, + "versions": { + "harness|arc:challenge|25": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 
1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "all": 0 + }, + "config": { + "model_name": "TheBloke/Wizard-Vicuna-7B-Uncensored-HF", + "model_sha": "b802f1b4401d0b2242137160c20cc11b9ffd3a4c", + "model_dtype": "torch.float16", + "lighteval_sha": "43cff840721bd0214adb4e29236a5e2ca1813937", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null + }, + "task_config": { + "harness|arc:challenge": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + 
"harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task" + }, + "hashes": { + "harness|arc:challenge|25": { + "hash_examples": "fb8c51b1872daeda", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "61571bf68d6d89aa", + "hash_cont_tokens": "8210decc6ff6f7df" + }, + "harness|hellaswag|10": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "29906669b1c7054a", + "hash_cont_tokens": "b3b9e9017afa63af" + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "c54ff61ad0273dd7", + "hash_cont_tokens": "50421e30bef398f9" + }, + "harness|hendrycksTest-anatomy|5": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "be31a1e22aef5f90", + "hash_cont_tokens": "f11971a765cb609f" + }, + "harness|hendrycksTest-astronomy|5": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "277a7b1fad566940", + "hash_cont_tokens": "bf30e5d3f48250cb" + }, + "harness|hendrycksTest-business_ethics|5": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "ba552605bc116de5", + "hash_cont_tokens": "bc1dd9b2d995eb61" + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "428c7563d0b98ab9", + "hash_cont_tokens": "890a119624b3b935" + }, + "harness|hendrycksTest-college_biology|5": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "da036601573942e2", + "hash_cont_tokens": "875cde3af7a0ee14" + }, + "harness|hendrycksTest-college_chemistry|5": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "94e0196d6aded13d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "harness|hendrycksTest-college_computer_science|5": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "6e4d0f4a8d36690b", + "hash_cont_tokens": "ffc0fe414cdc4a83" + }, + "harness|hendrycksTest-college_mathematics|5": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "614054d17109a25d", + "hash_cont_tokens": "50421e30bef398f9" + }, + 
"harness|hendrycksTest-college_medicine|5": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "1d633b3cc0524ba8", + "hash_cont_tokens": "1f88b00d41957d82" + }, + "harness|hendrycksTest-college_physics|5": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "5421d9a1af86cbd4", + "hash_cont_tokens": "f7b8097afc16a47c" + }, + "harness|hendrycksTest-computer_security|5": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "5e6b70ecb333cf18", + "hash_cont_tokens": "50421e30bef398f9" + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "c2ef11a87264ceed", + "hash_cont_tokens": "aa0e8bc655f2f641" + }, + "harness|hendrycksTest-econometrics|5": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "ecaccd912a4c3978", + "hash_cont_tokens": "bfb7e3c3c88313f1" + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "1590c84291399be8", + "hash_cont_tokens": "2425a3f084a591ef" + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "3269597f715b0da1", + "hash_cont_tokens": "f52691aef15a407b" + }, + "harness|hendrycksTest-formal_logic|5": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "a2800d20f3ab8d7c", + "hash_cont_tokens": "f515d598d9c21263" + }, + "harness|hendrycksTest-global_facts|5": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "94ed44b3772505ad", + "hash_cont_tokens": "50421e30bef398f9" + }, + "harness|hendrycksTest-high_school_biology|5": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "24423acb928db768", + "hash_cont_tokens": "bd85a4156a3613ee" + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "831ff35c474e5cef", + "hash_cont_tokens": "a95c97af1c14e068" + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "8c34e0f2bda77358", + "hash_cont_tokens": "8abfedef914e33c9" + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "f1f73dd687da18d7", + "hash_cont_tokens": "674fc454bdc5ac93" + }, + "harness|hendrycksTest-high_school_geography|5": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "7c5547c7da5bc793", + "hash_cont_tokens": "03a5012b916274ea" + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "f62991cb6a496b05", + "hash_cont_tokens": "a83effb8f76b7d7c" + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "4cef2aff6e3d59ed", + "hash_cont_tokens": 
"c583432ad27fcfe0" + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "6e2577ea4082ed2b", + "hash_cont_tokens": "24f5dc613660300b" + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "c5fc9aeb1079c8e4", + "hash_cont_tokens": "f47f041de50333b9" + }, + "harness|hendrycksTest-high_school_physics|5": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "555fc385cffa84ca", + "hash_cont_tokens": "ba2efcd283e938cc" + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "febd23cbf9973b7f", + "hash_cont_tokens": "942069cd363844d9" + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "424b02981230ee83", + "hash_cont_tokens": "955ed42b6f7fa019" + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "50c9ff438c85a69e", + "hash_cont_tokens": "cdd0b3dc06d933e5" + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "054824cc474caef5", + "hash_cont_tokens": "9a864184946033ac" + }, + "harness|hendrycksTest-human_aging|5": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "541a75f071dcf579", + "hash_cont_tokens": "142a4a8a1138a214" + }, + "harness|hendrycksTest-human_sexuality|5": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "04269e5c5a257dd9", + "hash_cont_tokens": "bc54813e809b796d" + }, + "harness|hendrycksTest-international_law|5": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "d93ba9d9d38e4397", + "hash_cont_tokens": "dc45b45fcda18e5d" + }, + "harness|hendrycksTest-jurisprudence|5": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "9eeaccd2698b4f5a", + "hash_cont_tokens": "e3a8cd951b6e3469" + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "b4f08f544f2b7576", + "hash_cont_tokens": "1e80dbd30f6453d5" + }, + "harness|hendrycksTest-machine_learning|5": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "900c2a51f1174b9f", + "hash_cont_tokens": "9b37da7777378ca9" + }, + "harness|hendrycksTest-management|5": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "6b36efb4689c6eca", + "hash_cont_tokens": "a01d6d39a83c4597" + }, + "harness|hendrycksTest-marketing|5": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "2aaac78a0cfed47a", + "hash_cont_tokens": "6aeaed4d823c98aa" + }, + "harness|hendrycksTest-medical_genetics|5": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "886ca823b41c094a", + "hash_cont_tokens": "50421e30bef398f9" + }, 
+ "harness|hendrycksTest-miscellaneous|5": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "72fd71de7675e7d0", + "hash_cont_tokens": "9b0ab02a64603081" + }, + "harness|hendrycksTest-moral_disputes|5": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "f3ca0dd8e7a1eb09", + "hash_cont_tokens": "8badf768f7b0467a" + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "3e793631e951f23c", + "hash_cont_tokens": "32ae620376b2bbba" + }, + "harness|hendrycksTest-nutrition|5": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "59753c2144ea93af", + "hash_cont_tokens": "3071def75bacc404" + }, + "harness|hendrycksTest-philosophy|5": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "bd8d3dbed15a8c34", + "hash_cont_tokens": "9f6ff69d23a48783" + }, + "harness|hendrycksTest-prehistory|5": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "3573cd87facbb7c5", + "hash_cont_tokens": "de469d2b981e32a3" + }, + "harness|hendrycksTest-professional_accounting|5": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "17e721bc1a7cbb47", + "hash_cont_tokens": "c46f74d2dfc7b13b" + }, + "harness|hendrycksTest-professional_law|5": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "9178e10bd0763ec4", + "hash_cont_tokens": "2e590029ef41fbcd" + }, + "harness|hendrycksTest-professional_medicine|5": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "f5a22012a54f70ea", + "hash_cont_tokens": "fe35cfa9c6ca802e" + }, + "harness|hendrycksTest-professional_psychology|5": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "0dfb73a8eb3f692c", + "hash_cont_tokens": "f020fbddf72c8652" + }, + "harness|hendrycksTest-public_relations|5": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "1710c6ba4c9f3cbd", + "hash_cont_tokens": "568f585a259965c1" + }, + "harness|hendrycksTest-security_studies|5": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "d49711415961ced7", + "hash_cont_tokens": "cc6fd7cccd64cd5d" + }, + "harness|hendrycksTest-sociology|5": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "828999f7624cbe7e", + "hash_cont_tokens": "c3a3bdfd177eed5b" + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "42054621e718dbee", + "hash_cont_tokens": "2568d0e8e36fa959" + }, + "harness|hendrycksTest-virology|5": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "6c4f0aa4dc859c04", + "hash_cont_tokens": "926cf60b0891f374" + }, + "harness|hendrycksTest-world_religions|5": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "6c75d44e092ff24f", + "hash_cont_tokens": "c525a5de974c1ea3" + }, + "harness|truthfulqa:mc|0": { + "hash_examples": "23176c0531c7b867", + 
"hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "2738d7ed7075faa7", + "hash_cont_tokens": "c014154380b74b9e" + } + } +} \ No newline at end of file diff --git a/eval-results/TheBloke/Wizard-Vicuna-7B-Uncensored-HF/results_2023-10-22T23-25-47.452800.json b/eval-results/TheBloke/Wizard-Vicuna-7B-Uncensored-HF/results_2023-10-22T23-25-47.452800.json new file mode 100644 index 0000000000000000000000000000000000000000..0381bfdee20839746685284da4e3bcc7c067a8e9 --- /dev/null +++ b/eval-results/TheBloke/Wizard-Vicuna-7B-Uncensored-HF/results_2023-10-22T23-25-47.452800.json @@ -0,0 +1,107 @@ +{ + "config_general": { + "model_name": "TheBloke/Wizard-Vicuna-7B-Uncensored-HF", + "model_sha": "b802f1b4401d0b2242137160c20cc11b9ffd3a4c", + "model_size": "12.58 GB", + "model_dtype": "torch.float16", + "lighteval_sha": "0f318ecf002208468154899217b3ba7c6ae09374", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|drop|3": { + "em": 0.18036912751677853, + "em_stderr": 0.003937584689736024, + "f1": 0.23801803691275183, + "f1_stderr": 0.003988701736112215 + }, + "harness|gsm8k|5": { + "acc": 0.045489006823351025, + "acc_stderr": 0.005739657656722215 + }, + "harness|winogrande|5": { + "acc": 0.7221783741120757, + "acc_stderr": 0.012588918183871601 + }, + "all": { + "em": 0.18036912751677853, + "em_stderr": 0.003937584689736024, + "f1": 0.23801803691275183, + "f1_stderr": 0.003988701736112215, + "acc": 0.3838336904677134, + "acc_stderr": 0.009164287920296908 + } + }, + "versions": { + "harness|drop|3": 1, + "harness|gsm8k|5": 0, + "harness|winogrande|5": 0, + "all": 0 + }, + "config_tasks": { + "harness|drop": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|drop|3": { + "hashes": { + "hash_examples": "1d27416e8324e9a3", + "hash_full_prompts": "a5513ff9a741b385", + "hash_input_tokens": "61b608e0b5ceed76", + "hash_cont_tokens": "ba7dce9d18ede6db" + }, + "truncated": 1263, + "non-truncated": 8273, + "padded": 0, + "non-padded": 9536, + "effective_few_shots": 3.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "bda342e47b5099b2", + "hash_cont_tokens": "5777dc6a78bf46f4" + }, + "truncated": 0, + "non-truncated": 1319, + "padded": 0, + "non-padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "c0bedf98cb040854", + "hash_cont_tokens": "f08975ad6f2d5864" + }, + "truncated": 0, + "non-truncated": 2534, + "padded": 2432, + "non-padded": 102, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "9b4d8993161e637d", + "hash_full_prompts": "08215e527b7e60a5", + "hash_input_tokens": "80afe720f936f8d2", + "hash_cont_tokens": "fb3ec5a5e7b2120d" + }, + "total_evaluation_time_secondes": "9480.276952266693", + "truncated": 1263, + "non-truncated": 12126, + "padded": 2432, + "non-padded": 10957, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/TheBloke/WizardLM-13B-V1-1-SuperHOT-8K-GPTQ/results_2023-09-11T17-32-08.880546.json 
b/eval-results/TheBloke/WizardLM-13B-V1-1-SuperHOT-8K-GPTQ/results_2023-09-11T17-32-08.880546.json new file mode 100644 index 0000000000000000000000000000000000000000..4c514babb261a06667b6e9e2e22a7eaed25da634 --- /dev/null +++ b/eval-results/TheBloke/WizardLM-13B-V1-1-SuperHOT-8K-GPTQ/results_2023-09-11T17-32-08.880546.json @@ -0,0 +1,1367 @@ +{ + "config_general": { + "model_name": "TheBloke/WizardLM-13B-V1-1-SuperHOT-8K-GPTQ", + "model_sha": "085eb5cd394f30d72bf5efcf83a580e87264b3e8", + "model_size": "6.92 GB", + "model_dtype": "None", + "lighteval_sha": "ff467795ccc45b291b69333c263d5f16abd1fcd9", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|arc:challenge|25": { + "acc": 0.5486348122866894, + "acc_stderr": 0.014542104569955265, + "acc_norm": 0.5699658703071673, + "acc_norm_stderr": 0.014467631559137993 + }, + "harness|hellaswag|10": { + "acc": 0.6078470424218283, + "acc_stderr": 0.004872326888655519, + "acc_norm": 0.8032264489145589, + "acc_norm_stderr": 0.003967472072468517 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.3, + "acc_stderr": 0.046056618647183814, + "acc_norm": 0.3, + "acc_norm_stderr": 0.046056618647183814 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.45925925925925926, + "acc_stderr": 0.04304979692464243, + "acc_norm": 0.45925925925925926, + "acc_norm_stderr": 0.04304979692464243 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.48026315789473684, + "acc_stderr": 0.040657710025626036, + "acc_norm": 0.48026315789473684, + "acc_norm_stderr": 0.040657710025626036 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.44, + "acc_stderr": 0.04988876515698589, + "acc_norm": 0.44, + "acc_norm_stderr": 0.04988876515698589 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.5622641509433962, + "acc_stderr": 0.030533338430467516, + "acc_norm": 0.5622641509433962, + "acc_norm_stderr": 0.030533338430467516 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.4444444444444444, + "acc_stderr": 0.04155319955593146, + "acc_norm": 0.4444444444444444, + "acc_norm_stderr": 0.04155319955593146 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.38, + "acc_stderr": 0.04878317312145632, + "acc_norm": 0.38, + "acc_norm_stderr": 0.04878317312145632 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.4, + "acc_stderr": 0.04923659639173309, + "acc_norm": 0.4, + "acc_norm_stderr": 0.04923659639173309 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.3, + "acc_stderr": 0.046056618647183814, + "acc_norm": 0.3, + "acc_norm_stderr": 0.046056618647183814 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.4508670520231214, + "acc_stderr": 0.037940126746970296, + "acc_norm": 0.4508670520231214, + "acc_norm_stderr": 0.037940126746970296 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.2647058823529412, + "acc_stderr": 0.04389869956808778, + "acc_norm": 0.2647058823529412, + "acc_norm_stderr": 0.04389869956808778 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.64, + "acc_stderr": 0.048241815132442176, + "acc_norm": 0.64, + "acc_norm_stderr": 0.048241815132442176 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.41702127659574467, + "acc_stderr": 0.032232762667117124, + "acc_norm": 0.41702127659574467, + "acc_norm_stderr": 0.032232762667117124 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.2543859649122807, + 
"acc_stderr": 0.04096985139843672, + "acc_norm": 0.2543859649122807, + "acc_norm_stderr": 0.04096985139843672 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.3931034482758621, + "acc_stderr": 0.0407032901370707, + "acc_norm": 0.3931034482758621, + "acc_norm_stderr": 0.0407032901370707 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.2962962962962963, + "acc_stderr": 0.02351729433596329, + "acc_norm": 0.2962962962962963, + "acc_norm_stderr": 0.02351729433596329 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.3492063492063492, + "acc_stderr": 0.04263906892795133, + "acc_norm": 0.3492063492063492, + "acc_norm_stderr": 0.04263906892795133 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.33, + "acc_stderr": 0.047258156262526045, + "acc_norm": 0.33, + "acc_norm_stderr": 0.047258156262526045 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.532258064516129, + "acc_stderr": 0.028384747788813332, + "acc_norm": 0.532258064516129, + "acc_norm_stderr": 0.028384747788813332 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.3448275862068966, + "acc_stderr": 0.03344283744280458, + "acc_norm": 0.3448275862068966, + "acc_norm_stderr": 0.03344283744280458 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.44, + "acc_stderr": 0.049888765156985884, + "acc_norm": 0.44, + "acc_norm_stderr": 0.049888765156985884 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.20606060606060606, + "acc_stderr": 0.0315841532404771, + "acc_norm": 0.20606060606060606, + "acc_norm_stderr": 0.0315841532404771 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.5808080808080808, + "acc_stderr": 0.03515520728670417, + "acc_norm": 0.5808080808080808, + "acc_norm_stderr": 0.03515520728670417 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.6683937823834197, + "acc_stderr": 0.03397636541089118, + "acc_norm": 0.6683937823834197, + "acc_norm_stderr": 0.03397636541089118 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.47435897435897434, + "acc_stderr": 0.025317649726448656, + "acc_norm": 0.47435897435897434, + "acc_norm_stderr": 0.025317649726448656 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.2222222222222222, + "acc_stderr": 0.025348097468097852, + "acc_norm": 0.2222222222222222, + "acc_norm_stderr": 0.025348097468097852 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.5252100840336135, + "acc_stderr": 0.03243718055137411, + "acc_norm": 0.5252100840336135, + "acc_norm_stderr": 0.03243718055137411 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.32450331125827814, + "acc_stderr": 0.038227469376587525, + "acc_norm": 0.32450331125827814, + "acc_norm_stderr": 0.038227469376587525 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.6697247706422018, + "acc_stderr": 0.020164466336342977, + "acc_norm": 0.6697247706422018, + "acc_norm_stderr": 0.020164466336342977 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.37962962962962965, + "acc_stderr": 0.03309682581119035, + "acc_norm": 0.37962962962962965, + "acc_norm_stderr": 0.03309682581119035 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.24509803921568626, + "acc_stderr": 0.030190282453501947, + "acc_norm": 0.24509803921568626, + "acc_norm_stderr": 0.030190282453501947 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 
0.6160337552742616, + "acc_stderr": 0.03165867806410668, + "acc_norm": 0.6160337552742616, + "acc_norm_stderr": 0.03165867806410668 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.5112107623318386, + "acc_stderr": 0.033549366530984746, + "acc_norm": 0.5112107623318386, + "acc_norm_stderr": 0.033549366530984746 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.5572519083969466, + "acc_stderr": 0.043564472026650695, + "acc_norm": 0.5572519083969466, + "acc_norm_stderr": 0.043564472026650695 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.6446280991735537, + "acc_stderr": 0.04369236326573981, + "acc_norm": 0.6446280991735537, + "acc_norm_stderr": 0.04369236326573981 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.5092592592592593, + "acc_stderr": 0.04832853553437055, + "acc_norm": 0.5092592592592593, + "acc_norm_stderr": 0.04832853553437055 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.5460122699386503, + "acc_stderr": 0.0391170190467718, + "acc_norm": 0.5460122699386503, + "acc_norm_stderr": 0.0391170190467718 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.30357142857142855, + "acc_stderr": 0.04364226155841044, + "acc_norm": 0.30357142857142855, + "acc_norm_stderr": 0.04364226155841044 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.6699029126213593, + "acc_stderr": 0.0465614711001235, + "acc_norm": 0.6699029126213593, + "acc_norm_stderr": 0.0465614711001235 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.7008547008547008, + "acc_stderr": 0.02999695185834948, + "acc_norm": 0.7008547008547008, + "acc_norm_stderr": 0.02999695185834948 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.54, + "acc_stderr": 0.05009082659620333, + "acc_norm": 0.54, + "acc_norm_stderr": 0.05009082659620333 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.6666666666666666, + "acc_stderr": 0.01685739124747255, + "acc_norm": 0.6666666666666666, + "acc_norm_stderr": 0.01685739124747255 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.5606936416184971, + "acc_stderr": 0.026720034380514995, + "acc_norm": 0.5606936416184971, + "acc_norm_stderr": 0.026720034380514995 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.2759776536312849, + "acc_stderr": 0.014950103002475349, + "acc_norm": 0.2759776536312849, + "acc_norm_stderr": 0.014950103002475349 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.5751633986928104, + "acc_stderr": 0.02830457667314112, + "acc_norm": 0.5751633986928104, + "acc_norm_stderr": 0.02830457667314112 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.5530546623794212, + "acc_stderr": 0.028237769422085335, + "acc_norm": 0.5530546623794212, + "acc_norm_stderr": 0.028237769422085335 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.5493827160493827, + "acc_stderr": 0.027684721415656203, + "acc_norm": 0.5493827160493827, + "acc_norm_stderr": 0.027684721415656203 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.3546099290780142, + "acc_stderr": 0.028538650028878638, + "acc_norm": 0.3546099290780142, + "acc_norm_stderr": 0.028538650028878638 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.39308996088657105, + "acc_stderr": 0.012474899613873961, + "acc_norm": 0.39308996088657105, + "acc_norm_stderr": 0.012474899613873961 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.5220588235294118, + "acc_stderr": 0.030343264224213528, + "acc_norm": 0.5220588235294118, + 
"acc_norm_stderr": 0.030343264224213528 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.4803921568627451, + "acc_stderr": 0.020212274976302957, + "acc_norm": 0.4803921568627451, + "acc_norm_stderr": 0.020212274976302957 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.4818181818181818, + "acc_stderr": 0.04785964010794916, + "acc_norm": 0.4818181818181818, + "acc_norm_stderr": 0.04785964010794916 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.5918367346938775, + "acc_stderr": 0.03146465712827424, + "acc_norm": 0.5918367346938775, + "acc_norm_stderr": 0.03146465712827424 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.6865671641791045, + "acc_stderr": 0.032801882053486414, + "acc_norm": 0.6865671641791045, + "acc_norm_stderr": 0.032801882053486414 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.65, + "acc_stderr": 0.047937248544110196, + "acc_norm": 0.65, + "acc_norm_stderr": 0.047937248544110196 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.41566265060240964, + "acc_stderr": 0.038367221765980515, + "acc_norm": 0.41566265060240964, + "acc_norm_stderr": 0.038367221765980515 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.7076023391812866, + "acc_stderr": 0.03488647713457922, + "acc_norm": 0.7076023391812866, + "acc_norm_stderr": 0.03488647713457922 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.3659730722154223, + "mc1_stderr": 0.016862941684088376, + "mc2": 0.5345534336987072, + "mc2_stderr": 0.01574114618973484 + }, + "all": { + "acc": 0.47448637397526006, + "acc_stderr": 0.035045561337073074, + "acc_norm": 0.47815943269582295, + "acc_norm_stderr": 0.03502896256034419, + "mc1": 0.3659730722154223, + "mc1_stderr": 0.016862941684088376, + "mc2": 0.5345534336987072, + "mc2_stderr": 0.01574114618973484 + } + }, + "versions": { + "harness|arc:challenge|25": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + 
"harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "all": 0 + }, + "config_tasks": { + "harness|arc:challenge": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + 
"harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task" + }, + "summary_tasks": { + "harness|arc:challenge|25": { + "hashes": { + "hash_examples": "17b0cae357c0259e", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "3722289b79076c44", + "hash_cont_tokens": "e8abf848493b50f7" + }, + "truncated": 0, + "non-truncated": 4687, + "padded": 4687, + "non-padded": 0, + "effective_few_shots": 25.0, + "num_truncated_few_shots": 0 + }, + "harness|hellaswag|10": { + "hashes": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "ececd684171f1ef2", + "hash_cont_tokens": "9fe0a5c42e1532db" + }, + "truncated": 0, + "non-truncated": 40168, + "padded": 40113, + "non-padded": 55, + "effective_few_shots": 10.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hashes": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "c54ff61ad0273dd7", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-anatomy|5": { + "hashes": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "be31a1e22aef5f90", + "hash_cont_tokens": "f11971a765cb609f" + }, + "truncated": 0, + "non-truncated": 540, + "padded": 540, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-astronomy|5": { + "hashes": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "277a7b1fad566940", + "hash_cont_tokens": "440a970fadecdc7b" + }, + "truncated": 0, + "non-truncated": 608, + "padded": 608, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-business_ethics|5": { + "hashes": { + "hash_examples": "33e51740670de686", + 
"hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "ba552605bc116de5", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hashes": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "428c7563d0b98ab9", + "hash_cont_tokens": "7ecd60c25b9bfe5b" + }, + "truncated": 0, + "non-truncated": 1060, + "padded": 1060, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_biology|5": { + "hashes": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "da036601573942e2", + "hash_cont_tokens": "875cde3af7a0ee14" + }, + "truncated": 0, + "non-truncated": 576, + "padded": 576, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_chemistry|5": { + "hashes": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "94e0196d6aded13d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_computer_science|5": { + "hashes": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "6e4d0f4a8d36690b", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_mathematics|5": { + "hashes": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "614054d17109a25d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_medicine|5": { + "hashes": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "081bb2b524defd1c", + "hash_cont_tokens": "702fb6d82ff0d6ac" + }, + "truncated": 0, + "non-truncated": 692, + "padded": 692, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_physics|5": { + "hashes": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "5421d9a1af86cbd4", + "hash_cont_tokens": "f7b8097afc16a47c" + }, + "truncated": 0, + "non-truncated": 408, + "padded": 408, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-computer_security|5": { + "hashes": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "5e6b70ecb333cf18", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hashes": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "c2ef11a87264ceed", + "hash_cont_tokens": "aa0e8bc655f2f641" + }, + "truncated": 
0, + "non-truncated": 940, + "padded": 940, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-econometrics|5": { + "hashes": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "ecaccd912a4c3978", + "hash_cont_tokens": "b1cc6e7e9fcd3827" + }, + "truncated": 0, + "non-truncated": 456, + "padded": 456, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hashes": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "1590c84291399be8", + "hash_cont_tokens": "2425a3f084a591ef" + }, + "truncated": 0, + "non-truncated": 580, + "padded": 580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hashes": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "3269597f715b0da1", + "hash_cont_tokens": "bd87bf0c060fd925" + }, + "truncated": 0, + "non-truncated": 1512, + "padded": 1512, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-formal_logic|5": { + "hashes": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "a2800d20f3ab8d7c", + "hash_cont_tokens": "eb8932890e0605db" + }, + "truncated": 0, + "non-truncated": 504, + "padded": 504, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-global_facts|5": { + "hashes": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "94ed44b3772505ad", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_biology|5": { + "hashes": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "24423acb928db768", + "hash_cont_tokens": "1ddcb86d28cde266" + }, + "truncated": 0, + "non-truncated": 1240, + "padded": 1240, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hashes": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "831ff35c474e5cef", + "hash_cont_tokens": "176c8dcff38c5f8f" + }, + "truncated": 0, + "non-truncated": 812, + "padded": 812, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hashes": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "a20a96b44dcc5b30", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hashes": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "5002f4ac8b1562ca", + "hash_cont_tokens": "674fc454bdc5ac93" + }, + "truncated": 0, + "non-truncated": 660, + "padded": 656, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + 
}, + "harness|hendrycksTest-high_school_geography|5": { + "hashes": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "7c5547c7da5bc793", + "hash_cont_tokens": "03a5012b916274ea" + }, + "truncated": 0, + "non-truncated": 792, + "padded": 792, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hashes": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "f62991cb6a496b05", + "hash_cont_tokens": "873d2aab226ba1d8" + }, + "truncated": 0, + "non-truncated": 772, + "padded": 772, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hashes": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "4cef2aff6e3d59ed", + "hash_cont_tokens": "c583432ad27fcfe0" + }, + "truncated": 0, + "non-truncated": 1560, + "padded": 1560, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hashes": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "6e2577ea4082ed2b", + "hash_cont_tokens": "d7907b61bcb8c123" + }, + "truncated": 0, + "non-truncated": 1080, + "padded": 1080, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hashes": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "c5fc9aeb1079c8e4", + "hash_cont_tokens": "f47f041de50333b9" + }, + "truncated": 0, + "non-truncated": 952, + "padded": 952, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_physics|5": { + "hashes": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "555fc385cffa84ca", + "hash_cont_tokens": "0d56317b3e5eedb5" + }, + "truncated": 0, + "non-truncated": 604, + "padded": 604, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hashes": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "febd23cbf9973b7f", + "hash_cont_tokens": "09ba1243e7390c0f" + }, + "truncated": 0, + "non-truncated": 2180, + "padded": 2180, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hashes": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "400e55b56ee6fbd7", + "hash_cont_tokens": "9cc29889c3d3f77d" + }, + "truncated": 0, + "non-truncated": 864, + "padded": 864, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hashes": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "c639cce12a46ebad", + "hash_cont_tokens": "cdd0b3dc06d933e5" + }, + "truncated": 0, + "non-truncated": 816, + "padded": 816, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hashes": { + 
"hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "b9762065cce6f3a6", + "hash_cont_tokens": "e02816433ff28daf" + }, + "truncated": 0, + "non-truncated": 948, + "padded": 948, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_aging|5": { + "hashes": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "541a75f071dcf579", + "hash_cont_tokens": "142a4a8a1138a214" + }, + "truncated": 0, + "non-truncated": 892, + "padded": 892, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_sexuality|5": { + "hashes": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "04269e5c5a257dd9", + "hash_cont_tokens": "bc54813e809b796d" + }, + "truncated": 0, + "non-truncated": 524, + "padded": 524, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-international_law|5": { + "hashes": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "d93ba9d9d38e4397", + "hash_cont_tokens": "8ea8c5ff76a15bca" + }, + "truncated": 0, + "non-truncated": 484, + "padded": 484, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-jurisprudence|5": { + "hashes": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "9eeaccd2698b4f5a", + "hash_cont_tokens": "e3a8cd951b6e3469" + }, + "truncated": 0, + "non-truncated": 432, + "padded": 432, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hashes": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "b4f08f544f2b7576", + "hash_cont_tokens": "3e9e0bdc248fd88a" + }, + "truncated": 0, + "non-truncated": 652, + "padded": 648, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-machine_learning|5": { + "hashes": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "900c2a51f1174b9f", + "hash_cont_tokens": "55b12fb138c6a064" + }, + "truncated": 0, + "non-truncated": 448, + "padded": 448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-management|5": { + "hashes": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "6b36efb4689c6eca", + "hash_cont_tokens": "a01d6d39a83c4597" + }, + "truncated": 0, + "non-truncated": 412, + "padded": 412, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-marketing|5": { + "hashes": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "2aaac78a0cfed47a", + "hash_cont_tokens": "6aeaed4d823c98aa" + }, + "truncated": 0, + "non-truncated": 936, + "padded": 936, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-medical_genetics|5": { + "hashes": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "886ca823b41c094a", + "hash_cont_tokens": "50421e30bef398f9" + }, + 
"truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-miscellaneous|5": { + "hashes": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "72fd71de7675e7d0", + "hash_cont_tokens": "9b0ab02a64603081" + }, + "truncated": 0, + "non-truncated": 3132, + "padded": 3132, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_disputes|5": { + "hashes": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "f3ca0dd8e7a1eb09", + "hash_cont_tokens": "3b8bbe9108e55ce9" + }, + "truncated": 0, + "non-truncated": 1384, + "padded": 1354, + "non-padded": 30, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hashes": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "3e793631e951f23c", + "hash_cont_tokens": "3e9bfc0362e97330" + }, + "truncated": 0, + "non-truncated": 3580, + "padded": 3580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-nutrition|5": { + "hashes": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "59753c2144ea93af", + "hash_cont_tokens": "23b2dc6ee2da4cfc" + }, + "truncated": 0, + "non-truncated": 1224, + "padded": 1224, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-philosophy|5": { + "hashes": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "bd8d3dbed15a8c34", + "hash_cont_tokens": "9f6ff69d23a48783" + }, + "truncated": 0, + "non-truncated": 1244, + "padded": 1244, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-prehistory|5": { + "hashes": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "3573cd87facbb7c5", + "hash_cont_tokens": "d6458d743d875837" + }, + "truncated": 0, + "non-truncated": 1296, + "padded": 1296, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_accounting|5": { + "hashes": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "17e721bc1a7cbb47", + "hash_cont_tokens": "922a195f53a35662" + }, + "truncated": 0, + "non-truncated": 1128, + "padded": 1128, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_law|5": { + "hashes": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "c9f7583fff66d361", + "hash_cont_tokens": "2e590029ef41fbcd" + }, + "truncated": 0, + "non-truncated": 6136, + "padded": 6136, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_medicine|5": { + "hashes": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "40a933f829116f8d", + "hash_cont_tokens": "7cfee54dbddd5a98" + }, + "truncated": 0, + "non-truncated": 1088, + "padded": 1088, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + 
"harness|hendrycksTest-professional_psychology|5": { + "hashes": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "0dfb73a8eb3f692c", + "hash_cont_tokens": "a86677b2a45c20e1" + }, + "truncated": 0, + "non-truncated": 2448, + "padded": 2448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-public_relations|5": { + "hashes": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "1710c6ba4c9f3cbd", + "hash_cont_tokens": "0d756ccaae031757" + }, + "truncated": 0, + "non-truncated": 440, + "padded": 440, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-security_studies|5": { + "hashes": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "32a03f1f22a6e103", + "hash_cont_tokens": "b2229bc2cfbf594b" + }, + "truncated": 0, + "non-truncated": 980, + "padded": 980, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-sociology|5": { + "hashes": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "828999f7624cbe7e", + "hash_cont_tokens": "c3a3bdfd177eed5b" + }, + "truncated": 0, + "non-truncated": 804, + "padded": 804, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hashes": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "42054621e718dbee", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-virology|5": { + "hashes": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "6c4f0aa4dc859c04", + "hash_cont_tokens": "af8b3658088cb37f" + }, + "truncated": 0, + "non-truncated": 664, + "padded": 664, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-world_religions|5": { + "hashes": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "6c75d44e092ff24f", + "hash_cont_tokens": "060118bef6de4e0a" + }, + "truncated": 0, + "non-truncated": 684, + "padded": 684, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|truthfulqa:mc|0": { + "hashes": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "2738d7ed7075faa7", + "hash_cont_tokens": "f5da56a132aab151" + }, + "truncated": 0, + "non-truncated": 9996, + "padded": 9996, + "non-padded": 0, + "effective_few_shots": 0.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "d84d18e9a963753d", + "hash_full_prompts": "12b540783521a8e6", + "hash_input_tokens": "5c73a7dce6ccf737", + "hash_cont_tokens": "71d56183130fecbd" + }, + "total_evaluation_time_secondes": "6775.387052297592", + "truncated": 0, + "non-truncated": 111019, + "padded": 110926, + "non-padded": 93, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/TheBloke/WizardLM-13B-V1-1-SuperHOT-8K-GPTQ/results_2023-10-28T21-00-02.304492.json 
b/eval-results/TheBloke/WizardLM-13B-V1-1-SuperHOT-8K-GPTQ/results_2023-10-28T21-00-02.304492.json new file mode 100644 index 0000000000000000000000000000000000000000..2fddbcd1ffd5b774ee8e156df5039558e28d3bef --- /dev/null +++ b/eval-results/TheBloke/WizardLM-13B-V1-1-SuperHOT-8K-GPTQ/results_2023-10-28T21-00-02.304492.json @@ -0,0 +1,107 @@ +{ + "config_general": { + "model_name": "TheBloke/WizardLM-13B-V1-1-SuperHOT-8K-GPTQ", + "model_sha": "085eb5cd394f30d72bf5efcf83a580e87264b3e8", + "model_size": "6.92 GB", + "model_dtype": "None", + "lighteval_sha": "0f318ecf002208468154899217b3ba7c6ae09374", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|drop|3": { + "em": 0.22158137583892618, + "em_stderr": 0.004253171428083824, + "f1": 0.28616296140939684, + "f1_stderr": 0.004276937020149761 + }, + "harness|gsm8k|5": { + "acc": 0.006823351023502654, + "acc_stderr": 0.0022675371022544783 + }, + "harness|winogrande|5": { + "acc": 0.7434885556432518, + "acc_stderr": 0.012273648008759979 + }, + "all": { + "em": 0.22158137583892618, + "em_stderr": 0.004253171428083824, + "f1": 0.28616296140939684, + "f1_stderr": 0.004276937020149761, + "acc": 0.3751559533333772, + "acc_stderr": 0.007270592555507228 + } + }, + "versions": { + "harness|drop|3": 1, + "harness|gsm8k|5": 0, + "harness|winogrande|5": 0, + "all": 0 + }, + "config_tasks": { + "harness|drop": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|drop|3": { + "hashes": { + "hash_examples": "1d27416e8324e9a3", + "hash_full_prompts": "a5513ff9a741b385", + "hash_input_tokens": "fbe785c62d941897", + "hash_cont_tokens": "765bf0d9fe7f36b6" + }, + "truncated": 0, + "non-truncated": 9536, + "padded": 0, + "non-padded": 9536, + "effective_few_shots": 3.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "bda342e47b5099b2", + "hash_cont_tokens": "c3e21d310721c449" + }, + "truncated": 0, + "non-truncated": 1319, + "padded": 0, + "non-padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "c0bedf98cb040854", + "hash_cont_tokens": "f08975ad6f2d5864" + }, + "truncated": 0, + "non-truncated": 2534, + "padded": 2432, + "non-padded": 102, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "9b4d8993161e637d", + "hash_full_prompts": "08215e527b7e60a5", + "hash_input_tokens": "9f72a6bc6743d18f", + "hash_cont_tokens": "1eb50782b30a77e9" + }, + "total_evaluation_time_secondes": "6928.959567308426", + "truncated": 0, + "non-truncated": 13389, + "padded": 2432, + "non-padded": 10957, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/TheBloke/WizardLM-13B-V1-1-SuperHOT-8K-fp16/results_2023-08-01T13-56-27.012351.json b/eval-results/TheBloke/WizardLM-13B-V1-1-SuperHOT-8K-fp16/results_2023-08-01T13-56-27.012351.json new file mode 100644 index 0000000000000000000000000000000000000000..40ea2459543d3b9c71a4ac3be82bbfcb0de08fb6 --- /dev/null +++ b/eval-results/TheBloke/WizardLM-13B-V1-1-SuperHOT-8K-fp16/results_2023-08-01T13-56-27.012351.json @@ -0,0 +1,1365 @@ +{ + "results": { + 
"harness|arc:challenge|25": { + "acc": 0.5546075085324232, + "acc_stderr": 0.014523987638344083, + "acc_norm": 0.5861774744027304, + "acc_norm_stderr": 0.014392730009221007 + }, + "harness|hellaswag|10": { + "acc": 0.6121290579565823, + "acc_stderr": 0.0048626905948157065, + "acc_norm": 0.8106950806612229, + "acc_norm_stderr": 0.003909500159884898 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.34, + "acc_stderr": 0.04760952285695235, + "acc_norm": 0.34, + "acc_norm_stderr": 0.04760952285695235 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.45925925925925926, + "acc_stderr": 0.04304979692464243, + "acc_norm": 0.45925925925925926, + "acc_norm_stderr": 0.04304979692464243 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.5460526315789473, + "acc_stderr": 0.04051646342874142, + "acc_norm": 0.5460526315789473, + "acc_norm_stderr": 0.04051646342874142 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.47, + "acc_stderr": 0.050161355804659205, + "acc_norm": 0.47, + "acc_norm_stderr": 0.050161355804659205 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.5622641509433962, + "acc_stderr": 0.030533338430467516, + "acc_norm": 0.5622641509433962, + "acc_norm_stderr": 0.030533338430467516 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.5, + "acc_stderr": 0.04181210050035455, + "acc_norm": 0.5, + "acc_norm_stderr": 0.04181210050035455 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.4, + "acc_stderr": 0.049236596391733084, + "acc_norm": 0.4, + "acc_norm_stderr": 0.049236596391733084 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.42, + "acc_stderr": 0.049604496374885836, + "acc_norm": 0.42, + "acc_norm_stderr": 0.049604496374885836 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.26, + "acc_stderr": 0.04408440022768079, + "acc_norm": 0.26, + "acc_norm_stderr": 0.04408440022768079 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.4277456647398844, + "acc_stderr": 0.03772446857518027, + "acc_norm": 0.4277456647398844, + "acc_norm_stderr": 0.03772446857518027 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.24509803921568626, + "acc_stderr": 0.042801058373643966, + "acc_norm": 0.24509803921568626, + "acc_norm_stderr": 0.042801058373643966 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.6, + "acc_stderr": 0.049236596391733084, + "acc_norm": 0.6, + "acc_norm_stderr": 0.049236596391733084 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.40425531914893614, + "acc_stderr": 0.03208115750788685, + "acc_norm": 0.40425531914893614, + "acc_norm_stderr": 0.03208115750788685 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.2894736842105263, + "acc_stderr": 0.04266339443159394, + "acc_norm": 0.2894736842105263, + "acc_norm_stderr": 0.04266339443159394 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.46206896551724136, + "acc_stderr": 0.041546596717075474, + "acc_norm": 0.46206896551724136, + "acc_norm_stderr": 0.041546596717075474 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.28835978835978837, + "acc_stderr": 0.023330654054535882, + "acc_norm": 0.28835978835978837, + "acc_norm_stderr": 0.023330654054535882 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.3333333333333333, + "acc_stderr": 0.04216370213557836, + "acc_norm": 0.3333333333333333, + "acc_norm_stderr": 0.04216370213557836 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.35, + "acc_stderr": 
0.047937248544110196, + "acc_norm": 0.35, + "acc_norm_stderr": 0.047937248544110196 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.5580645161290323, + "acc_stderr": 0.028251557906849734, + "acc_norm": 0.5580645161290323, + "acc_norm_stderr": 0.028251557906849734 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.37438423645320196, + "acc_stderr": 0.03405155380561952, + "acc_norm": 0.37438423645320196, + "acc_norm_stderr": 0.03405155380561952 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.46, + "acc_stderr": 0.05009082659620333, + "acc_norm": 0.46, + "acc_norm_stderr": 0.05009082659620333 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.24242424242424243, + "acc_stderr": 0.03346409881055953, + "acc_norm": 0.24242424242424243, + "acc_norm_stderr": 0.03346409881055953 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.6313131313131313, + "acc_stderr": 0.034373055019806184, + "acc_norm": 0.6313131313131313, + "acc_norm_stderr": 0.034373055019806184 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.6839378238341969, + "acc_stderr": 0.033553973696861736, + "acc_norm": 0.6839378238341969, + "acc_norm_stderr": 0.033553973696861736 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.4564102564102564, + "acc_stderr": 0.0252544854247996, + "acc_norm": 0.4564102564102564, + "acc_norm_stderr": 0.0252544854247996 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.22962962962962963, + "acc_stderr": 0.025644108639267617, + "acc_norm": 0.22962962962962963, + "acc_norm_stderr": 0.025644108639267617 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.46638655462184875, + "acc_stderr": 0.03240501447690071, + "acc_norm": 0.46638655462184875, + "acc_norm_stderr": 0.03240501447690071 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.33112582781456956, + "acc_stderr": 0.038425817186598696, + "acc_norm": 0.33112582781456956, + "acc_norm_stderr": 0.038425817186598696 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.671559633027523, + "acc_stderr": 0.020135902797298412, + "acc_norm": 0.671559633027523, + "acc_norm_stderr": 0.020135902797298412 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.39351851851851855, + "acc_stderr": 0.03331747876370312, + "acc_norm": 0.39351851851851855, + "acc_norm_stderr": 0.03331747876370312 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.24509803921568626, + "acc_stderr": 0.03019028245350195, + "acc_norm": 0.24509803921568626, + "acc_norm_stderr": 0.03019028245350195 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.6118143459915611, + "acc_stderr": 0.03172295004332329, + "acc_norm": 0.6118143459915611, + "acc_norm_stderr": 0.03172295004332329 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.547085201793722, + "acc_stderr": 0.033408675019233246, + "acc_norm": 0.547085201793722, + "acc_norm_stderr": 0.033408675019233246 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.5877862595419847, + "acc_stderr": 0.04317171194870254, + "acc_norm": 0.5877862595419847, + "acc_norm_stderr": 0.04317171194870254 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.5867768595041323, + "acc_stderr": 0.04495087843548408, + "acc_norm": 0.5867768595041323, + "acc_norm_stderr": 0.04495087843548408 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 
0.5648148148148148, + "acc_stderr": 0.04792898170907061, + "acc_norm": 0.5648148148148148, + "acc_norm_stderr": 0.04792898170907061 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.5398773006134969, + "acc_stderr": 0.0391585729143697, + "acc_norm": 0.5398773006134969, + "acc_norm_stderr": 0.0391585729143697 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.30357142857142855, + "acc_stderr": 0.043642261558410445, + "acc_norm": 0.30357142857142855, + "acc_norm_stderr": 0.043642261558410445 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.6990291262135923, + "acc_stderr": 0.045416094465039476, + "acc_norm": 0.6990291262135923, + "acc_norm_stderr": 0.045416094465039476 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.7435897435897436, + "acc_stderr": 0.02860595370200426, + "acc_norm": 0.7435897435897436, + "acc_norm_stderr": 0.02860595370200426 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.53, + "acc_stderr": 0.05016135580465919, + "acc_norm": 0.53, + "acc_norm_stderr": 0.05016135580465919 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.6781609195402298, + "acc_stderr": 0.016706381415057904, + "acc_norm": 0.6781609195402298, + "acc_norm_stderr": 0.016706381415057904 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.5664739884393064, + "acc_stderr": 0.026680134761679217, + "acc_norm": 0.5664739884393064, + "acc_norm_stderr": 0.026680134761679217 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.2558659217877095, + "acc_stderr": 0.014593620923210742, + "acc_norm": 0.2558659217877095, + "acc_norm_stderr": 0.014593620923210742 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.5882352941176471, + "acc_stderr": 0.02818059632825929, + "acc_norm": 0.5882352941176471, + "acc_norm_stderr": 0.02818059632825929 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.5627009646302251, + "acc_stderr": 0.0281739177617629, + "acc_norm": 0.5627009646302251, + "acc_norm_stderr": 0.0281739177617629 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.5493827160493827, + "acc_stderr": 0.027684721415656203, + "acc_norm": 0.5493827160493827, + "acc_norm_stderr": 0.027684721415656203 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.3900709219858156, + "acc_stderr": 0.02909767559946393, + "acc_norm": 0.3900709219858156, + "acc_norm_stderr": 0.02909767559946393 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.4015645371577575, + "acc_stderr": 0.012520315120147108, + "acc_norm": 0.4015645371577575, + "acc_norm_stderr": 0.012520315120147108 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.5036764705882353, + "acc_stderr": 0.0303720158854282, + "acc_norm": 0.5036764705882353, + "acc_norm_stderr": 0.0303720158854282 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.5081699346405228, + "acc_stderr": 0.02022513434305727, + "acc_norm": 0.5081699346405228, + "acc_norm_stderr": 0.02022513434305727 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.5545454545454546, + "acc_stderr": 0.047605488214603246, + "acc_norm": 0.5545454545454546, + "acc_norm_stderr": 0.047605488214603246 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.5836734693877551, + "acc_stderr": 0.03155782816556164, + "acc_norm": 0.5836734693877551, + "acc_norm_stderr": 0.03155782816556164 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.7064676616915423, + "acc_stderr": 0.032200241045342054, + "acc_norm": 0.7064676616915423, + 
"acc_norm_stderr": 0.032200241045342054 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.75, + "acc_stderr": 0.04351941398892446, + "acc_norm": 0.75, + "acc_norm_stderr": 0.04351941398892446 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.41566265060240964, + "acc_stderr": 0.038367221765980515, + "acc_norm": 0.41566265060240964, + "acc_norm_stderr": 0.038367221765980515 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.7134502923976608, + "acc_stderr": 0.03467826685703826, + "acc_norm": 0.7134502923976608, + "acc_norm_stderr": 0.03467826685703826 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.38310893512851896, + "mc1_stderr": 0.01701846167938986, + "mc2": 0.541865699474601, + "mc2_stderr": 0.015731584534280967 + }, + "all": { + "acc": 0.48662620492176223, + "acc_stderr": 0.034999460858882295, + "acc_norm": 0.49052681489760885, + "acc_norm_stderr": 0.03498108038322037, + "mc1": 0.38310893512851896, + "mc1_stderr": 0.01701846167938986, + "mc2": 0.541865699474601, + "mc2_stderr": 0.015731584534280967 + } + }, + "versions": { + "harness|arc:challenge|25": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + 
"harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "all": 0 + }, + "config_general": { + "model_name": "TheBloke/WizardLM-13B-V1-1-SuperHOT-8K-fp16", + "model_sha": "83905656ca3e63877b8d9f3a74118da0c9bc6939", + "model_dtype": "torch.float16", + "lighteval_sha": "03c2fad20ff7f5334c33cfee459024b8d7e4a109", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null + }, + "config_tasks": { + "harness|arc:challenge": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", 
+ "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task" + }, + "summary_tasks": { + "harness|arc:challenge|25": { + "hashes": { + "hash_examples": "17b0cae357c0259e", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "3722289b79076c44", + "hash_cont_tokens": "8210decc6ff6f7df" + }, + "truncated": 0, + "non-truncated": 4687, + "padded": 4687, + "non-padded": 0, + "effective_few_shots": 25.0, + "num_truncated_few_shots": 0 + }, + "harness|hellaswag|10": { + "hashes": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "ececd684171f1ef2", + "hash_cont_tokens": "b3b9e9017afa63af" + }, + "truncated": 0, + "non-truncated": 40168, + "padded": 40113, + "non-padded": 55, + "effective_few_shots": 10.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hashes": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "c54ff61ad0273dd7", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-anatomy|5": { + "hashes": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "be31a1e22aef5f90", + "hash_cont_tokens": "f11971a765cb609f" + }, + "truncated": 0, + "non-truncated": 540, + "padded": 540, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-astronomy|5": { + "hashes": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "277a7b1fad566940", + "hash_cont_tokens": "bf30e5d3f48250cb" + }, + "truncated": 0, + "non-truncated": 608, + "padded": 608, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-business_ethics|5": { + "hashes": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "ba552605bc116de5", + "hash_cont_tokens": "bc1dd9b2d995eb61" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hashes": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + 
"hash_input_tokens": "428c7563d0b98ab9", + "hash_cont_tokens": "890a119624b3b935" + }, + "truncated": 0, + "non-truncated": 1060, + "padded": 1060, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_biology|5": { + "hashes": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "da036601573942e2", + "hash_cont_tokens": "875cde3af7a0ee14" + }, + "truncated": 0, + "non-truncated": 576, + "padded": 576, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_chemistry|5": { + "hashes": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "94e0196d6aded13d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_computer_science|5": { + "hashes": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "6e4d0f4a8d36690b", + "hash_cont_tokens": "ffc0fe414cdc4a83" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_mathematics|5": { + "hashes": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "614054d17109a25d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_medicine|5": { + "hashes": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "081bb2b524defd1c", + "hash_cont_tokens": "1f88b00d41957d82" + }, + "truncated": 0, + "non-truncated": 692, + "padded": 692, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_physics|5": { + "hashes": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "5421d9a1af86cbd4", + "hash_cont_tokens": "f7b8097afc16a47c" + }, + "truncated": 0, + "non-truncated": 408, + "padded": 408, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-computer_security|5": { + "hashes": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "5e6b70ecb333cf18", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hashes": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "c2ef11a87264ceed", + "hash_cont_tokens": "aa0e8bc655f2f641" + }, + "truncated": 0, + "non-truncated": 940, + "padded": 940, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-econometrics|5": { + "hashes": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "ecaccd912a4c3978", + "hash_cont_tokens": "bfb7e3c3c88313f1" + }, + "truncated": 0, + "non-truncated": 456, + "padded": 456, + 
"non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hashes": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "1590c84291399be8", + "hash_cont_tokens": "2425a3f084a591ef" + }, + "truncated": 0, + "non-truncated": 580, + "padded": 580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hashes": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "3269597f715b0da1", + "hash_cont_tokens": "f52691aef15a407b" + }, + "truncated": 0, + "non-truncated": 1512, + "padded": 1512, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-formal_logic|5": { + "hashes": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "a2800d20f3ab8d7c", + "hash_cont_tokens": "f515d598d9c21263" + }, + "truncated": 0, + "non-truncated": 504, + "padded": 504, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-global_facts|5": { + "hashes": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "94ed44b3772505ad", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_biology|5": { + "hashes": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "24423acb928db768", + "hash_cont_tokens": "bd85a4156a3613ee" + }, + "truncated": 0, + "non-truncated": 1240, + "padded": 1240, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hashes": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "831ff35c474e5cef", + "hash_cont_tokens": "a95c97af1c14e068" + }, + "truncated": 0, + "non-truncated": 812, + "padded": 812, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hashes": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "a20a96b44dcc5b30", + "hash_cont_tokens": "8abfedef914e33c9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hashes": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "5002f4ac8b1562ca", + "hash_cont_tokens": "674fc454bdc5ac93" + }, + "truncated": 0, + "non-truncated": 660, + "padded": 656, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_geography|5": { + "hashes": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "7c5547c7da5bc793", + "hash_cont_tokens": "03a5012b916274ea" + }, + "truncated": 0, + "non-truncated": 792, + "padded": 792, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + 
"harness|hendrycksTest-high_school_government_and_politics|5": { + "hashes": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "f62991cb6a496b05", + "hash_cont_tokens": "a83effb8f76b7d7c" + }, + "truncated": 0, + "non-truncated": 772, + "padded": 772, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hashes": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "4cef2aff6e3d59ed", + "hash_cont_tokens": "c583432ad27fcfe0" + }, + "truncated": 0, + "non-truncated": 1560, + "padded": 1560, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hashes": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "6e2577ea4082ed2b", + "hash_cont_tokens": "24f5dc613660300b" + }, + "truncated": 0, + "non-truncated": 1080, + "padded": 1080, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hashes": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "c5fc9aeb1079c8e4", + "hash_cont_tokens": "f47f041de50333b9" + }, + "truncated": 0, + "non-truncated": 952, + "padded": 952, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_physics|5": { + "hashes": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "555fc385cffa84ca", + "hash_cont_tokens": "ba2efcd283e938cc" + }, + "truncated": 0, + "non-truncated": 604, + "padded": 604, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hashes": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "febd23cbf9973b7f", + "hash_cont_tokens": "942069cd363844d9" + }, + "truncated": 0, + "non-truncated": 2180, + "padded": 2180, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hashes": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "400e55b56ee6fbd7", + "hash_cont_tokens": "955ed42b6f7fa019" + }, + "truncated": 0, + "non-truncated": 864, + "padded": 864, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hashes": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "c639cce12a46ebad", + "hash_cont_tokens": "cdd0b3dc06d933e5" + }, + "truncated": 0, + "non-truncated": 816, + "padded": 816, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hashes": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "b9762065cce6f3a6", + "hash_cont_tokens": "9a864184946033ac" + }, + "truncated": 0, + "non-truncated": 948, + "padded": 948, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_aging|5": { + "hashes": { + "hash_examples": 
"1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "541a75f071dcf579", + "hash_cont_tokens": "142a4a8a1138a214" + }, + "truncated": 0, + "non-truncated": 892, + "padded": 892, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_sexuality|5": { + "hashes": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "04269e5c5a257dd9", + "hash_cont_tokens": "bc54813e809b796d" + }, + "truncated": 0, + "non-truncated": 524, + "padded": 524, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-international_law|5": { + "hashes": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "d93ba9d9d38e4397", + "hash_cont_tokens": "dc45b45fcda18e5d" + }, + "truncated": 0, + "non-truncated": 484, + "padded": 484, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-jurisprudence|5": { + "hashes": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "9eeaccd2698b4f5a", + "hash_cont_tokens": "e3a8cd951b6e3469" + }, + "truncated": 0, + "non-truncated": 432, + "padded": 432, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hashes": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "b4f08f544f2b7576", + "hash_cont_tokens": "1e80dbd30f6453d5" + }, + "truncated": 0, + "non-truncated": 652, + "padded": 648, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-machine_learning|5": { + "hashes": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "900c2a51f1174b9f", + "hash_cont_tokens": "9b37da7777378ca9" + }, + "truncated": 0, + "non-truncated": 448, + "padded": 448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-management|5": { + "hashes": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "6b36efb4689c6eca", + "hash_cont_tokens": "a01d6d39a83c4597" + }, + "truncated": 0, + "non-truncated": 412, + "padded": 412, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-marketing|5": { + "hashes": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "2aaac78a0cfed47a", + "hash_cont_tokens": "6aeaed4d823c98aa" + }, + "truncated": 0, + "non-truncated": 936, + "padded": 936, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-medical_genetics|5": { + "hashes": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "886ca823b41c094a", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-miscellaneous|5": { + "hashes": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "72fd71de7675e7d0", + "hash_cont_tokens": "9b0ab02a64603081" + }, + "truncated": 0, + 
"non-truncated": 3132, + "padded": 3132, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_disputes|5": { + "hashes": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "f3ca0dd8e7a1eb09", + "hash_cont_tokens": "8badf768f7b0467a" + }, + "truncated": 0, + "non-truncated": 1384, + "padded": 1354, + "non-padded": 30, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hashes": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "3e793631e951f23c", + "hash_cont_tokens": "32ae620376b2bbba" + }, + "truncated": 0, + "non-truncated": 3580, + "padded": 3580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-nutrition|5": { + "hashes": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "59753c2144ea93af", + "hash_cont_tokens": "3071def75bacc404" + }, + "truncated": 0, + "non-truncated": 1224, + "padded": 1224, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-philosophy|5": { + "hashes": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "bd8d3dbed15a8c34", + "hash_cont_tokens": "9f6ff69d23a48783" + }, + "truncated": 0, + "non-truncated": 1244, + "padded": 1244, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-prehistory|5": { + "hashes": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "3573cd87facbb7c5", + "hash_cont_tokens": "de469d2b981e32a3" + }, + "truncated": 0, + "non-truncated": 1296, + "padded": 1296, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_accounting|5": { + "hashes": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "17e721bc1a7cbb47", + "hash_cont_tokens": "c46f74d2dfc7b13b" + }, + "truncated": 0, + "non-truncated": 1128, + "padded": 1128, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_law|5": { + "hashes": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "c9f7583fff66d361", + "hash_cont_tokens": "2e590029ef41fbcd" + }, + "truncated": 0, + "non-truncated": 6136, + "padded": 6136, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_medicine|5": { + "hashes": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "40a933f829116f8d", + "hash_cont_tokens": "fe35cfa9c6ca802e" + }, + "truncated": 0, + "non-truncated": 1088, + "padded": 1088, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_psychology|5": { + "hashes": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "0dfb73a8eb3f692c", + "hash_cont_tokens": "f020fbddf72c8652" + }, + "truncated": 0, + "non-truncated": 2448, + "padded": 2448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + 
"harness|hendrycksTest-public_relations|5": { + "hashes": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "1710c6ba4c9f3cbd", + "hash_cont_tokens": "568f585a259965c1" + }, + "truncated": 0, + "non-truncated": 440, + "padded": 440, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-security_studies|5": { + "hashes": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "32a03f1f22a6e103", + "hash_cont_tokens": "cc6fd7cccd64cd5d" + }, + "truncated": 0, + "non-truncated": 980, + "padded": 980, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-sociology|5": { + "hashes": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "828999f7624cbe7e", + "hash_cont_tokens": "c3a3bdfd177eed5b" + }, + "truncated": 0, + "non-truncated": 804, + "padded": 804, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hashes": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "42054621e718dbee", + "hash_cont_tokens": "2568d0e8e36fa959" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-virology|5": { + "hashes": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "6c4f0aa4dc859c04", + "hash_cont_tokens": "926cf60b0891f374" + }, + "truncated": 0, + "non-truncated": 664, + "padded": 664, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-world_religions|5": { + "hashes": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "6c75d44e092ff24f", + "hash_cont_tokens": "c525a5de974c1ea3" + }, + "truncated": 0, + "non-truncated": 684, + "padded": 684, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|truthfulqa:mc|0": { + "hashes": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "2738d7ed7075faa7", + "hash_cont_tokens": "c014154380b74b9e" + }, + "truncated": 0, + "non-truncated": 9996, + "padded": 9996, + "non-padded": 0, + "effective_few_shots": 0.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "d84d18e9a963753d", + "hash_full_prompts": "12b540783521a8e6", + "hash_input_tokens": "5c73a7dce6ccf737", + "hash_cont_tokens": "fb1646e2bdd5fc38" + }, + "total_evaluation_time_secondes": "6275.88010430336", + "truncated": 0, + "non-truncated": 111019, + "padded": 110926, + "non-padded": 93, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/TheBloke/WizardLM-13B-V1-1-SuperHOT-8K-fp16/results_2023-10-22T05-46-44.212362.json b/eval-results/TheBloke/WizardLM-13B-V1-1-SuperHOT-8K-fp16/results_2023-10-22T05-46-44.212362.json new file mode 100644 index 0000000000000000000000000000000000000000..f99cb5e5e01e49b3f7742b96bb2fdf015435c2f9 --- /dev/null +++ b/eval-results/TheBloke/WizardLM-13B-V1-1-SuperHOT-8K-fp16/results_2023-10-22T05-46-44.212362.json @@ -0,0 +1,107 @@ +{ + "config_general": { + "model_name": 
"TheBloke/WizardLM-13B-V1-1-SuperHOT-8K-fp16", + "model_sha": "83905656ca3e63877b8d9f3a74118da0c9bc6939", + "model_size": "24.4 GB", + "model_dtype": "torch.float16", + "lighteval_sha": "0f318ecf002208468154899217b3ba7c6ae09374", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|drop|3": { + "em": 0.2419253355704698, + "em_stderr": 0.004385673721154169, + "f1": 0.30457843959731623, + "f1_stderr": 0.00439090225052454 + }, + "harness|gsm8k|5": { + "acc": 0.0075815011372251705, + "acc_stderr": 0.0023892815120772092 + }, + "harness|winogrande|5": { + "acc": 0.7600631412786109, + "acc_stderr": 0.012002078629485742 + }, + "all": { + "em": 0.2419253355704698, + "em_stderr": 0.004385673721154169, + "f1": 0.30457843959731623, + "f1_stderr": 0.00439090225052454, + "acc": 0.38382232120791804, + "acc_stderr": 0.007195680070781476 + } + }, + "versions": { + "harness|drop|3": 1, + "harness|gsm8k|5": 0, + "harness|winogrande|5": 0, + "all": 0 + }, + "config_tasks": { + "harness|drop": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|drop|3": { + "hashes": { + "hash_examples": "1d27416e8324e9a3", + "hash_full_prompts": "a5513ff9a741b385", + "hash_input_tokens": "fbe785c62d941897", + "hash_cont_tokens": "50ef156aeac6a66f" + }, + "truncated": 0, + "non-truncated": 9536, + "padded": 0, + "non-padded": 9536, + "effective_few_shots": 3.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "bda342e47b5099b2", + "hash_cont_tokens": "945323006f6a445d" + }, + "truncated": 0, + "non-truncated": 1319, + "padded": 0, + "non-padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "c0bedf98cb040854", + "hash_cont_tokens": "f08975ad6f2d5864" + }, + "truncated": 0, + "non-truncated": 2534, + "padded": 2432, + "non-padded": 102, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "9b4d8993161e637d", + "hash_full_prompts": "08215e527b7e60a5", + "hash_input_tokens": "9f72a6bc6743d18f", + "hash_cont_tokens": "919e23490a64c567" + }, + "total_evaluation_time_secondes": "11284.448909521103", + "truncated": 0, + "non-truncated": 13389, + "padded": 2432, + "non-padded": 10957, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/TheBloke/WizardLM-13B-V1.1-GPTQ/results_2023-08-29T17-24-13.256665.json b/eval-results/TheBloke/WizardLM-13B-V1.1-GPTQ/results_2023-08-29T17-24-13.256665.json new file mode 100644 index 0000000000000000000000000000000000000000..47da47c269995c0c6fad01be440bc94b676f6e0c --- /dev/null +++ b/eval-results/TheBloke/WizardLM-13B-V1.1-GPTQ/results_2023-08-29T17-24-13.256665.json @@ -0,0 +1,1366 @@ +{ + "config_general": { + "model_name": "TheBloke/WizardLM-13B-V1.1-GPTQ", + "model_sha": "9df807ac64034bc6e7387326689d6e39656ce5e0", + "model_dtype": "torch.float16", + "lighteval_sha": "4b9a38c2102259daeb17d1d0294fc2b4ce7f4e63", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|arc:challenge|25": { + "acc": 0.5648464163822525, 
+ "acc_stderr": 0.014487986197186045, + "acc_norm": 0.5853242320819113, + "acc_norm_stderr": 0.014397070564409174 + }, + "harness|hellaswag|10": { + "acc": 0.6042620991834295, + "acc_stderr": 0.004880092083408045, + "acc_norm": 0.8066122286397132, + "acc_norm_stderr": 0.003941471781664183 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.33, + "acc_stderr": 0.04725815626252605, + "acc_norm": 0.33, + "acc_norm_stderr": 0.04725815626252605 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.4666666666666667, + "acc_stderr": 0.043097329010363554, + "acc_norm": 0.4666666666666667, + "acc_norm_stderr": 0.043097329010363554 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.4934210526315789, + "acc_stderr": 0.040685900502249704, + "acc_norm": 0.4934210526315789, + "acc_norm_stderr": 0.040685900502249704 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.47, + "acc_stderr": 0.050161355804659205, + "acc_norm": 0.47, + "acc_norm_stderr": 0.050161355804659205 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.5358490566037736, + "acc_stderr": 0.030693675018458003, + "acc_norm": 0.5358490566037736, + "acc_norm_stderr": 0.030693675018458003 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.4236111111111111, + "acc_stderr": 0.041321250197233685, + "acc_norm": 0.4236111111111111, + "acc_norm_stderr": 0.041321250197233685 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.45, + "acc_stderr": 0.049999999999999996, + "acc_norm": 0.45, + "acc_norm_stderr": 0.049999999999999996 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.42, + "acc_stderr": 0.049604496374885836, + "acc_norm": 0.42, + "acc_norm_stderr": 0.049604496374885836 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.31, + "acc_stderr": 0.04648231987117316, + "acc_norm": 0.31, + "acc_norm_stderr": 0.04648231987117316 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.44508670520231214, + "acc_stderr": 0.03789401760283647, + "acc_norm": 0.44508670520231214, + "acc_norm_stderr": 0.03789401760283647 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.23529411764705882, + "acc_stderr": 0.04220773659171452, + "acc_norm": 0.23529411764705882, + "acc_norm_stderr": 0.04220773659171452 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.66, + "acc_stderr": 0.04760952285695237, + "acc_norm": 0.66, + "acc_norm_stderr": 0.04760952285695237 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.3829787234042553, + "acc_stderr": 0.031778212502369216, + "acc_norm": 0.3829787234042553, + "acc_norm_stderr": 0.031778212502369216 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.2719298245614035, + "acc_stderr": 0.04185774424022056, + "acc_norm": 0.2719298245614035, + "acc_norm_stderr": 0.04185774424022056 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.43448275862068964, + "acc_stderr": 0.04130740879555497, + "acc_norm": 0.43448275862068964, + "acc_norm_stderr": 0.04130740879555497 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.29894179894179895, + "acc_stderr": 0.023577604791655802, + "acc_norm": 0.29894179894179895, + "acc_norm_stderr": 0.023577604791655802 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.40476190476190477, + "acc_stderr": 0.04390259265377562, + "acc_norm": 0.40476190476190477, + "acc_norm_stderr": 0.04390259265377562 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.36, + "acc_stderr": 0.048241815132442176, + 
"acc_norm": 0.36, + "acc_norm_stderr": 0.048241815132442176 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.5709677419354838, + "acc_stderr": 0.028156036538233193, + "acc_norm": 0.5709677419354838, + "acc_norm_stderr": 0.028156036538233193 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.3448275862068966, + "acc_stderr": 0.03344283744280458, + "acc_norm": 0.3448275862068966, + "acc_norm_stderr": 0.03344283744280458 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.52, + "acc_stderr": 0.050211673156867795, + "acc_norm": 0.52, + "acc_norm_stderr": 0.050211673156867795 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.6242424242424243, + "acc_stderr": 0.03781887353205982, + "acc_norm": 0.6242424242424243, + "acc_norm_stderr": 0.03781887353205982 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.5909090909090909, + "acc_stderr": 0.03502975799413007, + "acc_norm": 0.5909090909090909, + "acc_norm_stderr": 0.03502975799413007 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.6787564766839378, + "acc_stderr": 0.033699508685490674, + "acc_norm": 0.6787564766839378, + "acc_norm_stderr": 0.033699508685490674 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.4512820512820513, + "acc_stderr": 0.02523038123893484, + "acc_norm": 0.4512820512820513, + "acc_norm_stderr": 0.02523038123893484 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.25555555555555554, + "acc_stderr": 0.02659393910184407, + "acc_norm": 0.25555555555555554, + "acc_norm_stderr": 0.02659393910184407 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.5084033613445378, + "acc_stderr": 0.03247390276569669, + "acc_norm": 0.5084033613445378, + "acc_norm_stderr": 0.03247390276569669 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.33112582781456956, + "acc_stderr": 0.038425817186598696, + "acc_norm": 0.33112582781456956, + "acc_norm_stderr": 0.038425817186598696 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.6623853211009174, + "acc_stderr": 0.02027526598663892, + "acc_norm": 0.6623853211009174, + "acc_norm_stderr": 0.02027526598663892 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.4305555555555556, + "acc_stderr": 0.03376922151252336, + "acc_norm": 0.4305555555555556, + "acc_norm_stderr": 0.03376922151252336 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.6421568627450981, + "acc_stderr": 0.033644872860883, + "acc_norm": 0.6421568627450981, + "acc_norm_stderr": 0.033644872860883 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.6582278481012658, + "acc_stderr": 0.03087453753755362, + "acc_norm": 0.6582278481012658, + "acc_norm_stderr": 0.03087453753755362 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.4798206278026906, + "acc_stderr": 0.033530461674123, + "acc_norm": 0.4798206278026906, + "acc_norm_stderr": 0.033530461674123 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.5725190839694656, + "acc_stderr": 0.043389203057924, + "acc_norm": 0.5725190839694656, + "acc_norm_stderr": 0.043389203057924 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.6115702479338843, + "acc_stderr": 0.04449270350068382, + "acc_norm": 0.6115702479338843, + "acc_norm_stderr": 0.04449270350068382 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.5370370370370371, + "acc_stderr": 0.04820403072760627, 
+ "acc_norm": 0.5370370370370371, + "acc_norm_stderr": 0.04820403072760627 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.5030674846625767, + "acc_stderr": 0.03928297078179662, + "acc_norm": 0.5030674846625767, + "acc_norm_stderr": 0.03928297078179662 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.33035714285714285, + "acc_stderr": 0.04464285714285714, + "acc_norm": 0.33035714285714285, + "acc_norm_stderr": 0.04464285714285714 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.6504854368932039, + "acc_stderr": 0.047211885060971716, + "acc_norm": 0.6504854368932039, + "acc_norm_stderr": 0.047211885060971716 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.717948717948718, + "acc_stderr": 0.029480360549541194, + "acc_norm": 0.717948717948718, + "acc_norm_stderr": 0.029480360549541194 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.55, + "acc_stderr": 0.049999999999999996, + "acc_norm": 0.55, + "acc_norm_stderr": 0.049999999999999996 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.6615581098339719, + "acc_stderr": 0.01692086958621067, + "acc_norm": 0.6615581098339719, + "acc_norm_stderr": 0.01692086958621067 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.5606936416184971, + "acc_stderr": 0.02672003438051499, + "acc_norm": 0.5606936416184971, + "acc_norm_stderr": 0.02672003438051499 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.29497206703910617, + "acc_stderr": 0.015251931579208185, + "acc_norm": 0.29497206703910617, + "acc_norm_stderr": 0.015251931579208185 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.6045751633986928, + "acc_stderr": 0.027996723180631445, + "acc_norm": 0.6045751633986928, + "acc_norm_stderr": 0.027996723180631445 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.5369774919614148, + "acc_stderr": 0.02832032583010591, + "acc_norm": 0.5369774919614148, + "acc_norm_stderr": 0.02832032583010591 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.5277777777777778, + "acc_stderr": 0.027777777777777797, + "acc_norm": 0.5277777777777778, + "acc_norm_stderr": 0.027777777777777797 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.39361702127659576, + "acc_stderr": 0.029144544781596147, + "acc_norm": 0.39361702127659576, + "acc_norm_stderr": 0.029144544781596147 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.40352020860495436, + "acc_stderr": 0.012530241301193195, + "acc_norm": 0.40352020860495436, + "acc_norm_stderr": 0.012530241301193195 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.4889705882352941, + "acc_stderr": 0.03036544647727568, + "acc_norm": 0.4889705882352941, + "acc_norm_stderr": 0.03036544647727568 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.47875816993464054, + "acc_stderr": 0.020209572388600255, + "acc_norm": 0.47875816993464054, + "acc_norm_stderr": 0.020209572388600255 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.5636363636363636, + "acc_stderr": 0.04750185058907296, + "acc_norm": 0.5636363636363636, + "acc_norm_stderr": 0.04750185058907296 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.5877551020408164, + "acc_stderr": 0.031512360446742674, + "acc_norm": 0.5877551020408164, + "acc_norm_stderr": 0.031512360446742674 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.6567164179104478, + "acc_stderr": 0.03357379665433431, + "acc_norm": 0.6567164179104478, + "acc_norm_stderr": 0.03357379665433431 + }, + 
"harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.74, + "acc_stderr": 0.0440844002276808, + "acc_norm": 0.74, + "acc_norm_stderr": 0.0440844002276808 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.42771084337349397, + "acc_stderr": 0.038515976837185335, + "acc_norm": 0.42771084337349397, + "acc_norm_stderr": 0.038515976837185335 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.7251461988304093, + "acc_stderr": 0.034240429246915824, + "acc_norm": 0.7251461988304093, + "acc_norm_stderr": 0.034240429246915824 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.37821297429620565, + "mc1_stderr": 0.016976335907546866, + "mc2": 0.5435298001783399, + "mc2_stderr": 0.015594790355285697 + }, + "all": { + "acc": 0.4989274387080137, + "acc_stderr": 0.03528129823407584, + "acc_norm": 0.5027041835411654, + "acc_norm_stderr": 0.0352638484724738, + "mc1": 0.37821297429620565, + "mc1_stderr": 0.016976335907546866, + "mc2": 0.5435298001783399, + "mc2_stderr": 0.015594790355285697 + } + }, + "versions": { + "harness|arc:challenge|25": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + 
"harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "all": 0 + }, + "config_tasks": { + "harness|arc:challenge": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness 
task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task" + }, + "summary_tasks": { + "harness|arc:challenge|25": { + "hashes": { + "hash_examples": "17b0cae357c0259e", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "61571bf68d6d89aa", + "hash_cont_tokens": "8210decc6ff6f7df" + }, + "truncated": 0, + "non-truncated": 4687, + "padded": 4687, + "non-padded": 0, + "effective_few_shots": 25.0, + "num_truncated_few_shots": 0 + }, + "harness|hellaswag|10": { + "hashes": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "29906669b1c7054a", + "hash_cont_tokens": "b3b9e9017afa63af" + }, + "truncated": 0, + "non-truncated": 40168, + "padded": 40113, + "non-padded": 55, + "effective_few_shots": 10.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hashes": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "c54ff61ad0273dd7", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-anatomy|5": { + "hashes": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "be31a1e22aef5f90", + "hash_cont_tokens": "f11971a765cb609f" + }, + "truncated": 0, + "non-truncated": 540, + "padded": 540, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-astronomy|5": { + "hashes": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "277a7b1fad566940", + "hash_cont_tokens": "bf30e5d3f48250cb" + }, + "truncated": 0, + "non-truncated": 608, + "padded": 608, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-business_ethics|5": { + "hashes": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "ba552605bc116de5", + "hash_cont_tokens": "bc1dd9b2d995eb61" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hashes": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "428c7563d0b98ab9", + "hash_cont_tokens": "890a119624b3b935" + }, + "truncated": 0, + "non-truncated": 1060, + "padded": 1060, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_biology|5": { + "hashes": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "da036601573942e2", 
+ "hash_cont_tokens": "875cde3af7a0ee14" + }, + "truncated": 0, + "non-truncated": 576, + "padded": 576, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_chemistry|5": { + "hashes": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "94e0196d6aded13d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_computer_science|5": { + "hashes": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "6e4d0f4a8d36690b", + "hash_cont_tokens": "ffc0fe414cdc4a83" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_mathematics|5": { + "hashes": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "614054d17109a25d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_medicine|5": { + "hashes": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "1d633b3cc0524ba8", + "hash_cont_tokens": "1f88b00d41957d82" + }, + "truncated": 0, + "non-truncated": 692, + "padded": 692, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_physics|5": { + "hashes": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "5421d9a1af86cbd4", + "hash_cont_tokens": "f7b8097afc16a47c" + }, + "truncated": 0, + "non-truncated": 408, + "padded": 408, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-computer_security|5": { + "hashes": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "5e6b70ecb333cf18", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hashes": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "c2ef11a87264ceed", + "hash_cont_tokens": "aa0e8bc655f2f641" + }, + "truncated": 0, + "non-truncated": 940, + "padded": 940, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-econometrics|5": { + "hashes": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "ecaccd912a4c3978", + "hash_cont_tokens": "bfb7e3c3c88313f1" + }, + "truncated": 0, + "non-truncated": 456, + "padded": 456, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hashes": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "1590c84291399be8", + "hash_cont_tokens": "2425a3f084a591ef" + }, + "truncated": 0, + "non-truncated": 580, + "padded": 580, + "non-padded": 0, + 
"effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hashes": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "3269597f715b0da1", + "hash_cont_tokens": "f52691aef15a407b" + }, + "truncated": 0, + "non-truncated": 1512, + "padded": 1512, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-formal_logic|5": { + "hashes": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "a2800d20f3ab8d7c", + "hash_cont_tokens": "f515d598d9c21263" + }, + "truncated": 0, + "non-truncated": 504, + "padded": 504, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-global_facts|5": { + "hashes": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "94ed44b3772505ad", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_biology|5": { + "hashes": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "24423acb928db768", + "hash_cont_tokens": "bd85a4156a3613ee" + }, + "truncated": 0, + "non-truncated": 1240, + "padded": 1240, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hashes": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "831ff35c474e5cef", + "hash_cont_tokens": "a95c97af1c14e068" + }, + "truncated": 0, + "non-truncated": 812, + "padded": 812, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hashes": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "8c34e0f2bda77358", + "hash_cont_tokens": "8abfedef914e33c9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hashes": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "f1f73dd687da18d7", + "hash_cont_tokens": "674fc454bdc5ac93" + }, + "truncated": 660, + "non-truncated": 0, + "padded": 0, + "non-padded": 660, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_geography|5": { + "hashes": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "7c5547c7da5bc793", + "hash_cont_tokens": "03a5012b916274ea" + }, + "truncated": 0, + "non-truncated": 792, + "padded": 792, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hashes": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "f62991cb6a496b05", + "hash_cont_tokens": "a83effb8f76b7d7c" + }, + "truncated": 0, + "non-truncated": 772, + "padded": 772, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + 
"harness|hendrycksTest-high_school_macroeconomics|5": { + "hashes": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "4cef2aff6e3d59ed", + "hash_cont_tokens": "c583432ad27fcfe0" + }, + "truncated": 0, + "non-truncated": 1560, + "padded": 1560, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hashes": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "6e2577ea4082ed2b", + "hash_cont_tokens": "24f5dc613660300b" + }, + "truncated": 0, + "non-truncated": 1080, + "padded": 1080, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hashes": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "c5fc9aeb1079c8e4", + "hash_cont_tokens": "f47f041de50333b9" + }, + "truncated": 0, + "non-truncated": 952, + "padded": 952, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_physics|5": { + "hashes": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "555fc385cffa84ca", + "hash_cont_tokens": "ba2efcd283e938cc" + }, + "truncated": 0, + "non-truncated": 604, + "padded": 604, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hashes": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "febd23cbf9973b7f", + "hash_cont_tokens": "942069cd363844d9" + }, + "truncated": 0, + "non-truncated": 2180, + "padded": 2180, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hashes": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "424b02981230ee83", + "hash_cont_tokens": "955ed42b6f7fa019" + }, + "truncated": 0, + "non-truncated": 864, + "padded": 864, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hashes": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "50c9ff438c85a69e", + "hash_cont_tokens": "cdd0b3dc06d933e5" + }, + "truncated": 816, + "non-truncated": 0, + "padded": 0, + "non-padded": 816, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hashes": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "054824cc474caef5", + "hash_cont_tokens": "9a864184946033ac" + }, + "truncated": 8, + "non-truncated": 940, + "padded": 940, + "non-padded": 8, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_aging|5": { + "hashes": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "541a75f071dcf579", + "hash_cont_tokens": "142a4a8a1138a214" + }, + "truncated": 0, + "non-truncated": 892, + "padded": 892, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_sexuality|5": { + "hashes": { + "hash_examples": "7acb8fdad97f88a6", + 
"hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "04269e5c5a257dd9", + "hash_cont_tokens": "bc54813e809b796d" + }, + "truncated": 0, + "non-truncated": 524, + "padded": 524, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-international_law|5": { + "hashes": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "d93ba9d9d38e4397", + "hash_cont_tokens": "dc45b45fcda18e5d" + }, + "truncated": 0, + "non-truncated": 484, + "padded": 484, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-jurisprudence|5": { + "hashes": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "9eeaccd2698b4f5a", + "hash_cont_tokens": "e3a8cd951b6e3469" + }, + "truncated": 0, + "non-truncated": 432, + "padded": 432, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hashes": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "b4f08f544f2b7576", + "hash_cont_tokens": "1e80dbd30f6453d5" + }, + "truncated": 0, + "non-truncated": 652, + "padded": 648, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-machine_learning|5": { + "hashes": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "900c2a51f1174b9f", + "hash_cont_tokens": "9b37da7777378ca9" + }, + "truncated": 0, + "non-truncated": 448, + "padded": 448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-management|5": { + "hashes": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "6b36efb4689c6eca", + "hash_cont_tokens": "a01d6d39a83c4597" + }, + "truncated": 0, + "non-truncated": 412, + "padded": 412, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-marketing|5": { + "hashes": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "2aaac78a0cfed47a", + "hash_cont_tokens": "6aeaed4d823c98aa" + }, + "truncated": 0, + "non-truncated": 936, + "padded": 936, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-medical_genetics|5": { + "hashes": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "886ca823b41c094a", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-miscellaneous|5": { + "hashes": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "72fd71de7675e7d0", + "hash_cont_tokens": "9b0ab02a64603081" + }, + "truncated": 0, + "non-truncated": 3132, + "padded": 3132, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_disputes|5": { + "hashes": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "f3ca0dd8e7a1eb09", + "hash_cont_tokens": "8badf768f7b0467a" + }, + "truncated": 0, + "non-truncated": 1384, + 
"padded": 1354, + "non-padded": 30, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hashes": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "3e793631e951f23c", + "hash_cont_tokens": "32ae620376b2bbba" + }, + "truncated": 0, + "non-truncated": 3580, + "padded": 3580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-nutrition|5": { + "hashes": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "59753c2144ea93af", + "hash_cont_tokens": "3071def75bacc404" + }, + "truncated": 0, + "non-truncated": 1224, + "padded": 1224, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-philosophy|5": { + "hashes": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "bd8d3dbed15a8c34", + "hash_cont_tokens": "9f6ff69d23a48783" + }, + "truncated": 0, + "non-truncated": 1244, + "padded": 1244, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-prehistory|5": { + "hashes": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "3573cd87facbb7c5", + "hash_cont_tokens": "de469d2b981e32a3" + }, + "truncated": 0, + "non-truncated": 1296, + "padded": 1296, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_accounting|5": { + "hashes": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "17e721bc1a7cbb47", + "hash_cont_tokens": "c46f74d2dfc7b13b" + }, + "truncated": 0, + "non-truncated": 1128, + "padded": 1128, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_law|5": { + "hashes": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "9178e10bd0763ec4", + "hash_cont_tokens": "2e590029ef41fbcd" + }, + "truncated": 604, + "non-truncated": 5532, + "padded": 5524, + "non-padded": 612, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_medicine|5": { + "hashes": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "f5a22012a54f70ea", + "hash_cont_tokens": "fe35cfa9c6ca802e" + }, + "truncated": 0, + "non-truncated": 1088, + "padded": 1088, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_psychology|5": { + "hashes": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "0dfb73a8eb3f692c", + "hash_cont_tokens": "f020fbddf72c8652" + }, + "truncated": 0, + "non-truncated": 2448, + "padded": 2448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-public_relations|5": { + "hashes": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "1710c6ba4c9f3cbd", + "hash_cont_tokens": "568f585a259965c1" + }, + "truncated": 0, + "non-truncated": 440, + "padded": 440, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + 
"harness|hendrycksTest-security_studies|5": { + "hashes": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "d49711415961ced7", + "hash_cont_tokens": "cc6fd7cccd64cd5d" + }, + "truncated": 0, + "non-truncated": 980, + "padded": 980, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-sociology|5": { + "hashes": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "828999f7624cbe7e", + "hash_cont_tokens": "c3a3bdfd177eed5b" + }, + "truncated": 0, + "non-truncated": 804, + "padded": 804, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hashes": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "42054621e718dbee", + "hash_cont_tokens": "2568d0e8e36fa959" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-virology|5": { + "hashes": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "6c4f0aa4dc859c04", + "hash_cont_tokens": "926cf60b0891f374" + }, + "truncated": 0, + "non-truncated": 664, + "padded": 664, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-world_religions|5": { + "hashes": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "6c75d44e092ff24f", + "hash_cont_tokens": "c525a5de974c1ea3" + }, + "truncated": 0, + "non-truncated": 684, + "padded": 684, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|truthfulqa:mc|0": { + "hashes": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "2738d7ed7075faa7", + "hash_cont_tokens": "c014154380b74b9e" + }, + "truncated": 0, + "non-truncated": 9996, + "padded": 9996, + "non-padded": 0, + "effective_few_shots": 0.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "d84d18e9a963753d", + "hash_full_prompts": "12b540783521a8e6", + "hash_input_tokens": "6fecf578c508db6a", + "hash_cont_tokens": "fb1646e2bdd5fc38" + }, + "total_evaluation_time_secondes": "7567.830826759338", + "truncated": 2088, + "non-truncated": 108931, + "padded": 108834, + "non-padded": 2185, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/TheBloke/WizardLM-13B-V1.1-GPTQ/results_2023-11-05T12-40-32.771713.json b/eval-results/TheBloke/WizardLM-13B-V1.1-GPTQ/results_2023-11-05T12-40-32.771713.json new file mode 100644 index 0000000000000000000000000000000000000000..4a416f6d5553abafa0cdbfac72dca7a458a3b0ce --- /dev/null +++ b/eval-results/TheBloke/WizardLM-13B-V1.1-GPTQ/results_2023-11-05T12-40-32.771713.json @@ -0,0 +1,107 @@ +{ + "config_general": { + "lighteval_sha": "167773f1d5d1647c60dadc31c9e731ab7dbcbbad", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "", + "model_name": "TheBloke/WizardLM-13B-V1.1-GPTQ", + "model_sha": "41b98a74940744d1a81518683f8e9fdddc152ff8", + "model_dtype": "torch.float16", + "model_size": "6.8 GB" + }, + "results": { + "harness|drop|3": { + "em": 0.16841442953020133, + "em_stderr": 
0.0038325053787783623, + "f1": 0.229352978187919, + "f1_stderr": 0.0038742375772474525 + }, + "harness|gsm8k|5": { + "acc": 0.07808946171341925, + "acc_stderr": 0.007390654481108247 + }, + "harness|winogrande|5": { + "acc": 0.744277821625888, + "acc_stderr": 0.012261253845440473 + }, + "all": { + "em": 0.16841442953020133, + "em_stderr": 0.0038325053787783623, + "f1": 0.229352978187919, + "f1_stderr": 0.0038742375772474525, + "acc": 0.4111836416696536, + "acc_stderr": 0.00982595416327436 + } + }, + "versions": { + "all": 0, + "harness|drop|3": 1, + "harness|gsm8k|5": 0, + "harness|winogrande|5": 0 + }, + "config_tasks": { + "harness|drop": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|drop|3": { + "hashes": { + "hash_examples": "1d27416e8324e9a3", + "hash_full_prompts": "a5513ff9a741b385", + "hash_input_tokens": "61b608e0b5ceed76", + "hash_cont_tokens": "bc75a190b47625de" + }, + "truncated": 1263, + "non_truncated": 8273, + "padded": 0, + "non_padded": 9536, + "effective_few_shots": 3.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "bda342e47b5099b2", + "hash_cont_tokens": "6eb5644c681cd638" + }, + "truncated": 0, + "non_truncated": 1319, + "padded": 0, + "non_padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "c0bedf98cb040854", + "hash_cont_tokens": "f08975ad6f2d5864" + }, + "truncated": 0, + "non_truncated": 1267, + "padded": 2432, + "non_padded": 102, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "9b4d8993161e637d", + "hash_full_prompts": "08215e527b7e60a5", + "hash_input_tokens": "80afe720f936f8d2", + "hash_cont_tokens": "b4ea0363ee86fbe4" + }, + "truncated": 1263, + "non_truncated": 10859, + "padded": 2432, + "non_padded": 10957, + "num_truncated_few_shots": 0, + "total_evaluation_time_secondes": 0 + } +} \ No newline at end of file diff --git a/eval-results/TheBloke/WizardLM-13B-V1.1-GPTQ/results_2023-11-07T10-04-12.671111.json b/eval-results/TheBloke/WizardLM-13B-V1.1-GPTQ/results_2023-11-07T10-04-12.671111.json new file mode 100644 index 0000000000000000000000000000000000000000..b59016317ec7f3724ed47c23b5951d09c4a3dfeb --- /dev/null +++ b/eval-results/TheBloke/WizardLM-13B-V1.1-GPTQ/results_2023-11-07T10-04-12.671111.json @@ -0,0 +1,107 @@ +{ + "config_general": { + "lighteval_sha": "167773f1d5d1647c60dadc31c9e731ab7dbcbbad", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "", + "model_name": "TheBloke/WizardLM-13B-V1.1-GPTQ", + "model_sha": "41b98a74940744d1a81518683f8e9fdddc152ff8", + "model_dtype": "torch.float16", + "model_size": "6.8 GB" + }, + "results": { + "harness|drop|3": { + "em": 0.16851929530201343, + "em_stderr": 0.0038334566477606904, + "f1": 0.22963611577181164, + "f1_stderr": 0.0038748826707742656 + }, + "harness|gsm8k|5": { + "acc": 0.08112206216830932, + "acc_stderr": 0.007520395797922653 + }, + "harness|winogrande|5": { + "acc": 0.744277821625888, + "acc_stderr": 0.012261253845440473 + }, + "all": { + "em": 0.16851929530201343, + "em_stderr": 0.0038334566477606904, + "f1": 0.22963611577181164, + "f1_stderr": 
0.0038748826707742656, + "acc": 0.41269994189709863, + "acc_stderr": 0.009890824821681563 + } + }, + "versions": { + "all": 0, + "harness|drop|3": 1, + "harness|gsm8k|5": 0, + "harness|winogrande|5": 0 + }, + "config_tasks": { + "harness|drop": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|drop|3": { + "hashes": { + "hash_examples": "1d27416e8324e9a3", + "hash_full_prompts": "a5513ff9a741b385", + "hash_input_tokens": "61b608e0b5ceed76", + "hash_cont_tokens": "519271be42a50e7e" + }, + "truncated": 1263, + "non_truncated": 8273, + "padded": 0, + "non_padded": 9536, + "effective_few_shots": 3.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "bda342e47b5099b2", + "hash_cont_tokens": "b7ccd399f8cfb284" + }, + "truncated": 0, + "non_truncated": 1319, + "padded": 0, + "non_padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "c0bedf98cb040854", + "hash_cont_tokens": "f08975ad6f2d5864" + }, + "truncated": 0, + "non_truncated": 1267, + "padded": 2432, + "non_padded": 102, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "9b4d8993161e637d", + "hash_full_prompts": "08215e527b7e60a5", + "hash_input_tokens": "80afe720f936f8d2", + "hash_cont_tokens": "14537522013b7c74" + }, + "truncated": 1263, + "non_truncated": 10859, + "padded": 2432, + "non_padded": 10957, + "num_truncated_few_shots": 0, + "total_evaluation_time_secondes": 0 + } +} \ No newline at end of file diff --git a/eval-results/TheBloke/WizardLM-30B-GPTQ/results_2023-08-22T13-58-45.500746.json b/eval-results/TheBloke/WizardLM-30B-GPTQ/results_2023-08-22T13-58-45.500746.json new file mode 100644 index 0000000000000000000000000000000000000000..ee47e5dddeae4d472fa5ad1b418b104609227d12 --- /dev/null +++ b/eval-results/TheBloke/WizardLM-30B-GPTQ/results_2023-08-22T13-58-45.500746.json @@ -0,0 +1,1365 @@ +{ + "results": { + "harness|arc:challenge|25": { + "acc": 0.22525597269624573, + "acc_stderr": 0.012207839995407326, + "acc_norm": 0.2883959044368601, + "acc_norm_stderr": 0.013238394422428173 + }, + "harness|hellaswag|10": { + "acc": 0.2559251145190201, + "acc_stderr": 0.0043548810057897295, + "acc_norm": 0.2608046205935073, + "acc_norm_stderr": 0.004381761941552688 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.31, + "acc_stderr": 0.04648231987117316, + "acc_norm": 0.31, + "acc_norm_stderr": 0.04648231987117316 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.2518518518518518, + "acc_stderr": 0.03749850709174022, + "acc_norm": 0.2518518518518518, + "acc_norm_stderr": 0.03749850709174022 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.27631578947368424, + "acc_stderr": 0.03639057569952924, + "acc_norm": 0.27631578947368424, + "acc_norm_stderr": 0.03639057569952924 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.24, + "acc_stderr": 0.042923469599092816, + "acc_norm": 0.24, + "acc_norm_stderr": 0.042923469599092816 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.20754716981132076, + "acc_stderr": 0.02495991802891127, + "acc_norm": 0.20754716981132076, + "acc_norm_stderr": 0.02495991802891127 + }, + 
"harness|hendrycksTest-college_biology|5": { + "acc": 0.2847222222222222, + "acc_stderr": 0.03773809990686934, + "acc_norm": 0.2847222222222222, + "acc_norm_stderr": 0.03773809990686934 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.24, + "acc_stderr": 0.04292346959909284, + "acc_norm": 0.24, + "acc_norm_stderr": 0.04292346959909284 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.25, + "acc_stderr": 0.04351941398892446, + "acc_norm": 0.25, + "acc_norm_stderr": 0.04351941398892446 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.31, + "acc_stderr": 0.04648231987117316, + "acc_norm": 0.31, + "acc_norm_stderr": 0.04648231987117316 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.2658959537572254, + "acc_stderr": 0.033687629322594295, + "acc_norm": 0.2658959537572254, + "acc_norm_stderr": 0.033687629322594295 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.21568627450980393, + "acc_stderr": 0.04092563958237654, + "acc_norm": 0.21568627450980393, + "acc_norm_stderr": 0.04092563958237654 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.31, + "acc_stderr": 0.04648231987117316, + "acc_norm": 0.31, + "acc_norm_stderr": 0.04648231987117316 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.25957446808510637, + "acc_stderr": 0.02865917937429232, + "acc_norm": 0.25957446808510637, + "acc_norm_stderr": 0.02865917937429232 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.2631578947368421, + "acc_stderr": 0.04142439719489364, + "acc_norm": 0.2631578947368421, + "acc_norm_stderr": 0.04142439719489364 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.31724137931034485, + "acc_stderr": 0.038783523721386215, + "acc_norm": 0.31724137931034485, + "acc_norm_stderr": 0.038783523721386215 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.2671957671957672, + "acc_stderr": 0.022789673145776564, + "acc_norm": 0.2671957671957672, + "acc_norm_stderr": 0.022789673145776564 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.18253968253968253, + "acc_stderr": 0.03455071019102147, + "acc_norm": 0.18253968253968253, + "acc_norm_stderr": 0.03455071019102147 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.25, + "acc_stderr": 0.04351941398892446, + "acc_norm": 0.25, + "acc_norm_stderr": 0.04351941398892446 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.22258064516129034, + "acc_stderr": 0.023664216671642518, + "acc_norm": 0.22258064516129034, + "acc_norm_stderr": 0.023664216671642518 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.17733990147783252, + "acc_stderr": 0.026874337276808356, + "acc_norm": 0.17733990147783252, + "acc_norm_stderr": 0.026874337276808356 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.26, + "acc_stderr": 0.0440844002276808, + "acc_norm": 0.26, + "acc_norm_stderr": 0.0440844002276808 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.22424242424242424, + "acc_stderr": 0.03256866661681102, + "acc_norm": 0.22424242424242424, + "acc_norm_stderr": 0.03256866661681102 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.25252525252525254, + "acc_stderr": 0.030954055470365914, + "acc_norm": 0.25252525252525254, + "acc_norm_stderr": 0.030954055470365914 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.15544041450777202, + "acc_stderr": 0.026148483469153303, + "acc_norm": 
0.15544041450777202, + "acc_norm_stderr": 0.026148483469153303 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.20512820512820512, + "acc_stderr": 0.020473233173552003, + "acc_norm": 0.20512820512820512, + "acc_norm_stderr": 0.020473233173552003 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.22962962962962963, + "acc_stderr": 0.025644108639267634, + "acc_norm": 0.22962962962962963, + "acc_norm_stderr": 0.025644108639267634 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.20168067226890757, + "acc_stderr": 0.02606431340630452, + "acc_norm": 0.20168067226890757, + "acc_norm_stderr": 0.02606431340630452 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.2052980132450331, + "acc_stderr": 0.03297986648473838, + "acc_norm": 0.2052980132450331, + "acc_norm_stderr": 0.03297986648473838 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.22935779816513763, + "acc_stderr": 0.018025349724618684, + "acc_norm": 0.22935779816513763, + "acc_norm_stderr": 0.018025349724618684 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.2638888888888889, + "acc_stderr": 0.03005820270430985, + "acc_norm": 0.2638888888888889, + "acc_norm_stderr": 0.03005820270430985 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.22549019607843138, + "acc_stderr": 0.02933116229425174, + "acc_norm": 0.22549019607843138, + "acc_norm_stderr": 0.02933116229425174 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.26582278481012656, + "acc_stderr": 0.028756799629658342, + "acc_norm": 0.26582278481012656, + "acc_norm_stderr": 0.028756799629658342 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.28699551569506726, + "acc_stderr": 0.030360379710291954, + "acc_norm": 0.28699551569506726, + "acc_norm_stderr": 0.030360379710291954 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.21374045801526717, + "acc_stderr": 0.0359546161177469, + "acc_norm": 0.21374045801526717, + "acc_norm_stderr": 0.0359546161177469 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.2644628099173554, + "acc_stderr": 0.04026187527591206, + "acc_norm": 0.2644628099173554, + "acc_norm_stderr": 0.04026187527591206 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.26851851851851855, + "acc_stderr": 0.04284467968052192, + "acc_norm": 0.26851851851851855, + "acc_norm_stderr": 0.04284467968052192 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.25153374233128833, + "acc_stderr": 0.03408997886857529, + "acc_norm": 0.25153374233128833, + "acc_norm_stderr": 0.03408997886857529 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.25, + "acc_stderr": 0.04109974682633932, + "acc_norm": 0.25, + "acc_norm_stderr": 0.04109974682633932 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.21359223300970873, + "acc_stderr": 0.040580420156460344, + "acc_norm": 0.21359223300970873, + "acc_norm_stderr": 0.040580420156460344 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.2692307692307692, + "acc_stderr": 0.029058588303748842, + "acc_norm": 0.2692307692307692, + "acc_norm_stderr": 0.029058588303748842 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.22, + "acc_stderr": 0.041633319989322695, + "acc_norm": 0.22, + "acc_norm_stderr": 0.041633319989322695 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.26181353767560667, + "acc_stderr": 0.01572083867844526, + "acc_norm": 0.26181353767560667, + "acc_norm_stderr": 
0.01572083867844526 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.2658959537572254, + "acc_stderr": 0.023786203255508297, + "acc_norm": 0.2658959537572254, + "acc_norm_stderr": 0.023786203255508297 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.2324022346368715, + "acc_stderr": 0.014125968754673398, + "acc_norm": 0.2324022346368715, + "acc_norm_stderr": 0.014125968754673398 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.23202614379084968, + "acc_stderr": 0.024170840879341016, + "acc_norm": 0.23202614379084968, + "acc_norm_stderr": 0.024170840879341016 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.24437299035369775, + "acc_stderr": 0.024406162094668903, + "acc_norm": 0.24437299035369775, + "acc_norm_stderr": 0.024406162094668903 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.22839506172839505, + "acc_stderr": 0.023358211840626267, + "acc_norm": 0.22839506172839505, + "acc_norm_stderr": 0.023358211840626267 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.26595744680851063, + "acc_stderr": 0.026358065698880585, + "acc_norm": 0.26595744680851063, + "acc_norm_stderr": 0.026358065698880585 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.2666232073011734, + "acc_stderr": 0.011293836031612142, + "acc_norm": 0.2666232073011734, + "acc_norm_stderr": 0.011293836031612142 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.2426470588235294, + "acc_stderr": 0.026040662474201264, + "acc_norm": 0.2426470588235294, + "acc_norm_stderr": 0.026040662474201264 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.24183006535947713, + "acc_stderr": 0.017322789207784326, + "acc_norm": 0.24183006535947713, + "acc_norm_stderr": 0.017322789207784326 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.24545454545454545, + "acc_stderr": 0.04122066502878285, + "acc_norm": 0.24545454545454545, + "acc_norm_stderr": 0.04122066502878285 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.19183673469387755, + "acc_stderr": 0.02520696315422538, + "acc_norm": 0.19183673469387755, + "acc_norm_stderr": 0.02520696315422538 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.3034825870646766, + "acc_stderr": 0.03251006816458618, + "acc_norm": 0.3034825870646766, + "acc_norm_stderr": 0.03251006816458618 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.26, + "acc_stderr": 0.04408440022768079, + "acc_norm": 0.26, + "acc_norm_stderr": 0.04408440022768079 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.20481927710843373, + "acc_stderr": 0.03141784291663926, + "acc_norm": 0.20481927710843373, + "acc_norm_stderr": 0.03141784291663926 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.28654970760233917, + "acc_stderr": 0.034678266857038266, + "acc_norm": 0.28654970760233917, + "acc_norm_stderr": 0.034678266857038266 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.24357405140758873, + "mc1_stderr": 0.015026354824910782, + "mc2": 0.4913727694395835, + "mc2_stderr": 0.016939600973226592 + }, + "all": { + "acc": 0.24599177907991926, + "acc_stderr": 0.031415523509032525, + "acc_norm": 0.2471446509072939, + "acc_norm_stderr": 0.03143344614230005, + "mc1": 0.24357405140758873, + "mc1_stderr": 0.015026354824910782, + "mc2": 0.4913727694395835, + "mc2_stderr": 0.016939600973226592 + } + }, + "versions": { + "harness|arc:challenge|25": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + 
"harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "all": 0 + }, + "config_general": { + "model_name": "TheBloke/WizardLM-30B-GPTQ", + "model_sha": "e2e97475a9775d2fe7afba098aee37e694b9220f", + "model_dtype": "torch.float16", + "lighteval_sha": "9a6ba3212080b87510982c30fdec55b87dcab0c7", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null + }, + "config_tasks": { + "harness|arc:challenge": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + 
"harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task" + }, + "summary_tasks": { + "harness|arc:challenge|25": { + "hashes": { + 
"hash_examples": "17b0cae357c0259e", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "61571bf68d6d89aa", + "hash_cont_tokens": "8210decc6ff6f7df" + }, + "truncated": 0, + "non-truncated": 4687, + "padded": 4687, + "non-padded": 0, + "effective_few_shots": 25.0, + "num_truncated_few_shots": 0 + }, + "harness|hellaswag|10": { + "hashes": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "29906669b1c7054a", + "hash_cont_tokens": "b3b9e9017afa63af" + }, + "truncated": 0, + "non-truncated": 40168, + "padded": 40113, + "non-padded": 55, + "effective_few_shots": 10.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hashes": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "c54ff61ad0273dd7", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-anatomy|5": { + "hashes": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "be31a1e22aef5f90", + "hash_cont_tokens": "f11971a765cb609f" + }, + "truncated": 0, + "non-truncated": 540, + "padded": 540, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-astronomy|5": { + "hashes": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "277a7b1fad566940", + "hash_cont_tokens": "bf30e5d3f48250cb" + }, + "truncated": 0, + "non-truncated": 608, + "padded": 608, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-business_ethics|5": { + "hashes": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "ba552605bc116de5", + "hash_cont_tokens": "bc1dd9b2d995eb61" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hashes": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "428c7563d0b98ab9", + "hash_cont_tokens": "890a119624b3b935" + }, + "truncated": 0, + "non-truncated": 1060, + "padded": 1060, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_biology|5": { + "hashes": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "da036601573942e2", + "hash_cont_tokens": "875cde3af7a0ee14" + }, + "truncated": 0, + "non-truncated": 576, + "padded": 576, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_chemistry|5": { + "hashes": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "94e0196d6aded13d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_computer_science|5": { + "hashes": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "6e4d0f4a8d36690b", + "hash_cont_tokens": "ffc0fe414cdc4a83" + }, + 
"truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_mathematics|5": { + "hashes": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "614054d17109a25d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_medicine|5": { + "hashes": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "1d633b3cc0524ba8", + "hash_cont_tokens": "1f88b00d41957d82" + }, + "truncated": 0, + "non-truncated": 692, + "padded": 692, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_physics|5": { + "hashes": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "5421d9a1af86cbd4", + "hash_cont_tokens": "f7b8097afc16a47c" + }, + "truncated": 0, + "non-truncated": 408, + "padded": 408, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-computer_security|5": { + "hashes": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "5e6b70ecb333cf18", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hashes": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "c2ef11a87264ceed", + "hash_cont_tokens": "aa0e8bc655f2f641" + }, + "truncated": 0, + "non-truncated": 940, + "padded": 940, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-econometrics|5": { + "hashes": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "ecaccd912a4c3978", + "hash_cont_tokens": "bfb7e3c3c88313f1" + }, + "truncated": 0, + "non-truncated": 456, + "padded": 456, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hashes": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "1590c84291399be8", + "hash_cont_tokens": "2425a3f084a591ef" + }, + "truncated": 0, + "non-truncated": 580, + "padded": 580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hashes": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "3269597f715b0da1", + "hash_cont_tokens": "f52691aef15a407b" + }, + "truncated": 0, + "non-truncated": 1512, + "padded": 1512, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-formal_logic|5": { + "hashes": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "a2800d20f3ab8d7c", + "hash_cont_tokens": "f515d598d9c21263" + }, + "truncated": 0, + "non-truncated": 504, + "padded": 504, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + 
"harness|hendrycksTest-global_facts|5": { + "hashes": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "94ed44b3772505ad", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_biology|5": { + "hashes": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "24423acb928db768", + "hash_cont_tokens": "bd85a4156a3613ee" + }, + "truncated": 0, + "non-truncated": 1240, + "padded": 1240, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hashes": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "831ff35c474e5cef", + "hash_cont_tokens": "a95c97af1c14e068" + }, + "truncated": 0, + "non-truncated": 812, + "padded": 812, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hashes": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "8c34e0f2bda77358", + "hash_cont_tokens": "8abfedef914e33c9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hashes": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "f1f73dd687da18d7", + "hash_cont_tokens": "674fc454bdc5ac93" + }, + "truncated": 660, + "non-truncated": 0, + "padded": 0, + "non-padded": 660, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_geography|5": { + "hashes": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "7c5547c7da5bc793", + "hash_cont_tokens": "03a5012b916274ea" + }, + "truncated": 0, + "non-truncated": 792, + "padded": 792, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hashes": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "f62991cb6a496b05", + "hash_cont_tokens": "a83effb8f76b7d7c" + }, + "truncated": 0, + "non-truncated": 772, + "padded": 772, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hashes": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "4cef2aff6e3d59ed", + "hash_cont_tokens": "c583432ad27fcfe0" + }, + "truncated": 0, + "non-truncated": 1560, + "padded": 1560, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hashes": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "6e2577ea4082ed2b", + "hash_cont_tokens": "24f5dc613660300b" + }, + "truncated": 0, + "non-truncated": 1080, + "padded": 1080, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hashes": { + 
"hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "c5fc9aeb1079c8e4", + "hash_cont_tokens": "f47f041de50333b9" + }, + "truncated": 0, + "non-truncated": 952, + "padded": 952, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_physics|5": { + "hashes": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "555fc385cffa84ca", + "hash_cont_tokens": "ba2efcd283e938cc" + }, + "truncated": 0, + "non-truncated": 604, + "padded": 604, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hashes": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "febd23cbf9973b7f", + "hash_cont_tokens": "942069cd363844d9" + }, + "truncated": 0, + "non-truncated": 2180, + "padded": 2180, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hashes": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "424b02981230ee83", + "hash_cont_tokens": "955ed42b6f7fa019" + }, + "truncated": 0, + "non-truncated": 864, + "padded": 864, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hashes": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "50c9ff438c85a69e", + "hash_cont_tokens": "cdd0b3dc06d933e5" + }, + "truncated": 816, + "non-truncated": 0, + "padded": 0, + "non-padded": 816, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hashes": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "054824cc474caef5", + "hash_cont_tokens": "9a864184946033ac" + }, + "truncated": 8, + "non-truncated": 940, + "padded": 940, + "non-padded": 8, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_aging|5": { + "hashes": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "541a75f071dcf579", + "hash_cont_tokens": "142a4a8a1138a214" + }, + "truncated": 0, + "non-truncated": 892, + "padded": 892, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_sexuality|5": { + "hashes": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "04269e5c5a257dd9", + "hash_cont_tokens": "bc54813e809b796d" + }, + "truncated": 0, + "non-truncated": 524, + "padded": 524, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-international_law|5": { + "hashes": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "d93ba9d9d38e4397", + "hash_cont_tokens": "dc45b45fcda18e5d" + }, + "truncated": 0, + "non-truncated": 484, + "padded": 484, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-jurisprudence|5": { + "hashes": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "9eeaccd2698b4f5a", + 
"hash_cont_tokens": "e3a8cd951b6e3469" + }, + "truncated": 0, + "non-truncated": 432, + "padded": 432, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hashes": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "b4f08f544f2b7576", + "hash_cont_tokens": "1e80dbd30f6453d5" + }, + "truncated": 0, + "non-truncated": 652, + "padded": 648, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-machine_learning|5": { + "hashes": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "900c2a51f1174b9f", + "hash_cont_tokens": "9b37da7777378ca9" + }, + "truncated": 0, + "non-truncated": 448, + "padded": 448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-management|5": { + "hashes": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "6b36efb4689c6eca", + "hash_cont_tokens": "a01d6d39a83c4597" + }, + "truncated": 0, + "non-truncated": 412, + "padded": 412, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-marketing|5": { + "hashes": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "2aaac78a0cfed47a", + "hash_cont_tokens": "6aeaed4d823c98aa" + }, + "truncated": 0, + "non-truncated": 936, + "padded": 936, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-medical_genetics|5": { + "hashes": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "886ca823b41c094a", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-miscellaneous|5": { + "hashes": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "72fd71de7675e7d0", + "hash_cont_tokens": "9b0ab02a64603081" + }, + "truncated": 0, + "non-truncated": 3132, + "padded": 3132, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_disputes|5": { + "hashes": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "f3ca0dd8e7a1eb09", + "hash_cont_tokens": "8badf768f7b0467a" + }, + "truncated": 0, + "non-truncated": 1384, + "padded": 1354, + "non-padded": 30, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hashes": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "3e793631e951f23c", + "hash_cont_tokens": "32ae620376b2bbba" + }, + "truncated": 0, + "non-truncated": 3580, + "padded": 3580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-nutrition|5": { + "hashes": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "59753c2144ea93af", + "hash_cont_tokens": "3071def75bacc404" + }, + "truncated": 0, + "non-truncated": 1224, + "padded": 1224, + "non-padded": 0, + "effective_few_shots": 5.0, + 
"num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-philosophy|5": { + "hashes": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "bd8d3dbed15a8c34", + "hash_cont_tokens": "9f6ff69d23a48783" + }, + "truncated": 0, + "non-truncated": 1244, + "padded": 1244, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-prehistory|5": { + "hashes": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "3573cd87facbb7c5", + "hash_cont_tokens": "de469d2b981e32a3" + }, + "truncated": 0, + "non-truncated": 1296, + "padded": 1296, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_accounting|5": { + "hashes": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "17e721bc1a7cbb47", + "hash_cont_tokens": "c46f74d2dfc7b13b" + }, + "truncated": 0, + "non-truncated": 1128, + "padded": 1128, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_law|5": { + "hashes": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "9178e10bd0763ec4", + "hash_cont_tokens": "2e590029ef41fbcd" + }, + "truncated": 604, + "non-truncated": 5532, + "padded": 5524, + "non-padded": 612, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_medicine|5": { + "hashes": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "f5a22012a54f70ea", + "hash_cont_tokens": "fe35cfa9c6ca802e" + }, + "truncated": 0, + "non-truncated": 1088, + "padded": 1088, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_psychology|5": { + "hashes": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "0dfb73a8eb3f692c", + "hash_cont_tokens": "f020fbddf72c8652" + }, + "truncated": 0, + "non-truncated": 2448, + "padded": 2448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-public_relations|5": { + "hashes": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "1710c6ba4c9f3cbd", + "hash_cont_tokens": "568f585a259965c1" + }, + "truncated": 0, + "non-truncated": 440, + "padded": 440, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-security_studies|5": { + "hashes": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "d49711415961ced7", + "hash_cont_tokens": "cc6fd7cccd64cd5d" + }, + "truncated": 0, + "non-truncated": 980, + "padded": 980, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-sociology|5": { + "hashes": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "828999f7624cbe7e", + "hash_cont_tokens": "c3a3bdfd177eed5b" + }, + "truncated": 0, + "non-truncated": 804, + "padded": 804, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hashes": { + "hash_examples": "4a56a01ddca44dca", + 
"hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "42054621e718dbee", + "hash_cont_tokens": "2568d0e8e36fa959" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-virology|5": { + "hashes": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "6c4f0aa4dc859c04", + "hash_cont_tokens": "926cf60b0891f374" + }, + "truncated": 0, + "non-truncated": 664, + "padded": 664, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-world_religions|5": { + "hashes": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "6c75d44e092ff24f", + "hash_cont_tokens": "c525a5de974c1ea3" + }, + "truncated": 0, + "non-truncated": 684, + "padded": 684, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|truthfulqa:mc|0": { + "hashes": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "2738d7ed7075faa7", + "hash_cont_tokens": "c014154380b74b9e" + }, + "truncated": 0, + "non-truncated": 9996, + "padded": 9996, + "non-padded": 0, + "effective_few_shots": 0.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "d84d18e9a963753d", + "hash_full_prompts": "12b540783521a8e6", + "hash_input_tokens": "6fecf578c508db6a", + "hash_cont_tokens": "fb1646e2bdd5fc38" + }, + "total_evaluation_time_secondes": "9878.357825040817", + "truncated": 2088, + "non-truncated": 108931, + "padded": 108834, + "non-padded": 2185, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/TheBloke/WizardLM-30B-GPTQ/results_2023-11-07T18-05-07.591558.json b/eval-results/TheBloke/WizardLM-30B-GPTQ/results_2023-11-07T18-05-07.591558.json new file mode 100644 index 0000000000000000000000000000000000000000..42c4d59ca623aacdb318ab450e61a52a40793fe3 --- /dev/null +++ b/eval-results/TheBloke/WizardLM-30B-GPTQ/results_2023-11-07T18-05-07.591558.json @@ -0,0 +1,107 @@ +{ + "config_general": { + "lighteval_sha": "167773f1d5d1647c60dadc31c9e731ab7dbcbbad", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "", + "model_name": "TheBloke/WizardLM-30B-GPTQ", + "model_sha": "e2e97475a9775d2fe7afba098aee37e694b9220f", + "model_dtype": "torch.float16", + "model_size": "15.83 GB" + }, + "results": { + "harness|drop|3": { + "em": 0.21245805369127516, + "em_stderr": 0.004189026405353694, + "f1": 0.2829110738255039, + "f1_stderr": 0.004179836263087045 + }, + "harness|gsm8k|5": { + "acc": 0.34420015163002277, + "acc_stderr": 0.013086800426693784 + }, + "harness|winogrande|5": { + "acc": 0.7632202052091555, + "acc_stderr": 0.011947592365207392 + }, + "all": { + "em": 0.21245805369127516, + "em_stderr": 0.004189026405353694, + "f1": 0.2829110738255039, + "f1_stderr": 0.004179836263087045, + "acc": 0.5537101784195891, + "acc_stderr": 0.012517196395950588 + } + }, + "versions": { + "all": 0, + "harness|drop|3": 1, + "harness|gsm8k|5": 0, + "harness|winogrande|5": 0 + }, + "config_tasks": { + "harness|drop": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|drop|3": { + "hashes": { + "hash_examples": "1d27416e8324e9a3", + "hash_full_prompts": 
"a5513ff9a741b385", + "hash_input_tokens": "61b608e0b5ceed76", + "hash_cont_tokens": "72a7f7831c66865a" + }, + "truncated": 1263, + "non_truncated": 8273, + "padded": 0, + "non_padded": 9536, + "effective_few_shots": 3.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "bda342e47b5099b2", + "hash_cont_tokens": "e9aed27c5d836be3" + }, + "truncated": 0, + "non_truncated": 1319, + "padded": 0, + "non_padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "c0bedf98cb040854", + "hash_cont_tokens": "f08975ad6f2d5864" + }, + "truncated": 0, + "non_truncated": 1267, + "padded": 2432, + "non_padded": 102, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "9b4d8993161e637d", + "hash_full_prompts": "08215e527b7e60a5", + "hash_input_tokens": "80afe720f936f8d2", + "hash_cont_tokens": "965ae29466a2014a" + }, + "truncated": 1263, + "non_truncated": 10859, + "padded": 2432, + "non_padded": 10957, + "num_truncated_few_shots": 0, + "total_evaluation_time_secondes": 0 + } +} \ No newline at end of file diff --git a/eval-results/TheBloke/WizardLM-30B-Uncensored-GPTQ/results_2023-08-21T22-11-46.962918.json b/eval-results/TheBloke/WizardLM-30B-Uncensored-GPTQ/results_2023-08-21T22-11-46.962918.json new file mode 100644 index 0000000000000000000000000000000000000000..7d1bc298df757220685a6d7bcfd6a8ce3c39a31f --- /dev/null +++ b/eval-results/TheBloke/WizardLM-30B-Uncensored-GPTQ/results_2023-08-21T22-11-46.962918.json @@ -0,0 +1,1365 @@ +{ + "results": { + "harness|arc:challenge|25": { + "acc": 0.22696245733788395, + "acc_stderr": 0.012240491536132868, + "acc_norm": 0.29436860068259385, + "acc_norm_stderr": 0.013318528460539426 + }, + "harness|hellaswag|10": { + "acc": 0.25791674965146383, + "acc_stderr": 0.0043659384072096095, + "acc_norm": 0.26468830910177255, + "acc_norm_stderr": 0.0044026547672696295 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.27, + "acc_stderr": 0.044619604333847394, + "acc_norm": 0.27, + "acc_norm_stderr": 0.044619604333847394 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.3037037037037037, + "acc_stderr": 0.039725528847851375, + "acc_norm": 0.3037037037037037, + "acc_norm_stderr": 0.039725528847851375 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.21710526315789475, + "acc_stderr": 0.03355045304882924, + "acc_norm": 0.21710526315789475, + "acc_norm_stderr": 0.03355045304882924 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.26, + "acc_stderr": 0.0440844002276808, + "acc_norm": 0.26, + "acc_norm_stderr": 0.0440844002276808 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.2641509433962264, + "acc_stderr": 0.027134291628741695, + "acc_norm": 0.2641509433962264, + "acc_norm_stderr": 0.027134291628741695 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.2638888888888889, + "acc_stderr": 0.03685651095897532, + "acc_norm": 0.2638888888888889, + "acc_norm_stderr": 0.03685651095897532 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.23, + "acc_stderr": 0.04229525846816505, + "acc_norm": 0.23, + "acc_norm_stderr": 0.04229525846816505 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.26, + "acc_stderr": 
0.04408440022768078, + "acc_norm": 0.26, + "acc_norm_stderr": 0.04408440022768078 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.25, + "acc_stderr": 0.04351941398892446, + "acc_norm": 0.25, + "acc_norm_stderr": 0.04351941398892446 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.23699421965317918, + "acc_stderr": 0.03242414757483099, + "acc_norm": 0.23699421965317918, + "acc_norm_stderr": 0.03242414757483099 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.30392156862745096, + "acc_stderr": 0.04576665403207763, + "acc_norm": 0.30392156862745096, + "acc_norm_stderr": 0.04576665403207763 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.25, + "acc_stderr": 0.04351941398892446, + "acc_norm": 0.25, + "acc_norm_stderr": 0.04351941398892446 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.1829787234042553, + "acc_stderr": 0.025276041000449972, + "acc_norm": 0.1829787234042553, + "acc_norm_stderr": 0.025276041000449972 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.2543859649122807, + "acc_stderr": 0.040969851398436716, + "acc_norm": 0.2543859649122807, + "acc_norm_stderr": 0.040969851398436716 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.23448275862068965, + "acc_stderr": 0.035306258743465914, + "acc_norm": 0.23448275862068965, + "acc_norm_stderr": 0.035306258743465914 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.24074074074074073, + "acc_stderr": 0.022019080012217886, + "acc_norm": 0.24074074074074073, + "acc_norm_stderr": 0.022019080012217886 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.25396825396825395, + "acc_stderr": 0.038932596106046734, + "acc_norm": 0.25396825396825395, + "acc_norm_stderr": 0.038932596106046734 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.24, + "acc_stderr": 0.04292346959909284, + "acc_norm": 0.24, + "acc_norm_stderr": 0.04292346959909284 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.23548387096774193, + "acc_stderr": 0.024137632429337714, + "acc_norm": 0.23548387096774193, + "acc_norm_stderr": 0.024137632429337714 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.18719211822660098, + "acc_stderr": 0.027444924966882618, + "acc_norm": 0.18719211822660098, + "acc_norm_stderr": 0.027444924966882618 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.31, + "acc_stderr": 0.04648231987117316, + "acc_norm": 0.31, + "acc_norm_stderr": 0.04648231987117316 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.21212121212121213, + "acc_stderr": 0.03192271569548299, + "acc_norm": 0.21212121212121213, + "acc_norm_stderr": 0.03192271569548299 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.30808080808080807, + "acc_stderr": 0.03289477330098615, + "acc_norm": 0.30808080808080807, + "acc_norm_stderr": 0.03289477330098615 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.16580310880829016, + "acc_stderr": 0.026839845022314415, + "acc_norm": 0.16580310880829016, + "acc_norm_stderr": 0.026839845022314415 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.30256410256410254, + "acc_stderr": 0.023290888053772718, + "acc_norm": 0.30256410256410254, + "acc_norm_stderr": 0.023290888053772718 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.23333333333333334, + "acc_stderr": 0.025787874220959316, + "acc_norm": 0.23333333333333334, + 
"acc_norm_stderr": 0.025787874220959316 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.28991596638655465, + "acc_stderr": 0.02947248583313607, + "acc_norm": 0.28991596638655465, + "acc_norm_stderr": 0.02947248583313607 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.2251655629139073, + "acc_stderr": 0.03410435282008937, + "acc_norm": 0.2251655629139073, + "acc_norm_stderr": 0.03410435282008937 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.23853211009174313, + "acc_stderr": 0.018272575810231867, + "acc_norm": 0.23853211009174313, + "acc_norm_stderr": 0.018272575810231867 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.2037037037037037, + "acc_stderr": 0.02746740180405799, + "acc_norm": 0.2037037037037037, + "acc_norm_stderr": 0.02746740180405799 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.22549019607843138, + "acc_stderr": 0.02933116229425172, + "acc_norm": 0.22549019607843138, + "acc_norm_stderr": 0.02933116229425172 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.2320675105485232, + "acc_stderr": 0.02747974455080852, + "acc_norm": 0.2320675105485232, + "acc_norm_stderr": 0.02747974455080852 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.16591928251121077, + "acc_stderr": 0.024967553196547133, + "acc_norm": 0.16591928251121077, + "acc_norm_stderr": 0.024967553196547133 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.2366412213740458, + "acc_stderr": 0.03727673575596919, + "acc_norm": 0.2366412213740458, + "acc_norm_stderr": 0.03727673575596919 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.2809917355371901, + "acc_stderr": 0.04103203830514511, + "acc_norm": 0.2809917355371901, + "acc_norm_stderr": 0.04103203830514511 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.24074074074074073, + "acc_stderr": 0.04133119440243839, + "acc_norm": 0.24074074074074073, + "acc_norm_stderr": 0.04133119440243839 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.25766871165644173, + "acc_stderr": 0.03436150827846917, + "acc_norm": 0.25766871165644173, + "acc_norm_stderr": 0.03436150827846917 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.3125, + "acc_stderr": 0.043994650575715215, + "acc_norm": 0.3125, + "acc_norm_stderr": 0.043994650575715215 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.2815533980582524, + "acc_stderr": 0.044532548363264673, + "acc_norm": 0.2815533980582524, + "acc_norm_stderr": 0.044532548363264673 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.21367521367521367, + "acc_stderr": 0.02685345037700913, + "acc_norm": 0.21367521367521367, + "acc_norm_stderr": 0.02685345037700913 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.17, + "acc_stderr": 0.0377525168068637, + "acc_norm": 0.17, + "acc_norm_stderr": 0.0377525168068637 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.23754789272030652, + "acc_stderr": 0.015218733046150195, + "acc_norm": 0.23754789272030652, + "acc_norm_stderr": 0.015218733046150195 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.2745664739884393, + "acc_stderr": 0.024027745155265023, + "acc_norm": 0.2745664739884393, + "acc_norm_stderr": 0.024027745155265023 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.264804469273743, + "acc_stderr": 0.014756906483260659, + "acc_norm": 0.264804469273743, + "acc_norm_stderr": 0.014756906483260659 + }, + 
"harness|hendrycksTest-nutrition|5": { + "acc": 0.24836601307189543, + "acc_stderr": 0.02473998135511359, + "acc_norm": 0.24836601307189543, + "acc_norm_stderr": 0.02473998135511359 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.26688102893890675, + "acc_stderr": 0.025122637608816636, + "acc_norm": 0.26688102893890675, + "acc_norm_stderr": 0.025122637608816636 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.2191358024691358, + "acc_stderr": 0.02301670564026219, + "acc_norm": 0.2191358024691358, + "acc_norm_stderr": 0.02301670564026219 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.24113475177304963, + "acc_stderr": 0.025518731049537776, + "acc_norm": 0.24113475177304963, + "acc_norm_stderr": 0.025518731049537776 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.24315514993481094, + "acc_stderr": 0.010956556654417346, + "acc_norm": 0.24315514993481094, + "acc_norm_stderr": 0.010956556654417346 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.19852941176470587, + "acc_stderr": 0.024231013370541093, + "acc_norm": 0.19852941176470587, + "acc_norm_stderr": 0.024231013370541093 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.2679738562091503, + "acc_stderr": 0.017917974069594726, + "acc_norm": 0.2679738562091503, + "acc_norm_stderr": 0.017917974069594726 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.2545454545454545, + "acc_stderr": 0.04172343038705383, + "acc_norm": 0.2545454545454545, + "acc_norm_stderr": 0.04172343038705383 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.18775510204081633, + "acc_stderr": 0.025000256039546205, + "acc_norm": 0.18775510204081633, + "acc_norm_stderr": 0.025000256039546205 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.3034825870646766, + "acc_stderr": 0.032510068164586174, + "acc_norm": 0.3034825870646766, + "acc_norm_stderr": 0.032510068164586174 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.27, + "acc_stderr": 0.0446196043338474, + "acc_norm": 0.27, + "acc_norm_stderr": 0.0446196043338474 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.18072289156626506, + "acc_stderr": 0.029955737855810138, + "acc_norm": 0.18072289156626506, + "acc_norm_stderr": 0.029955737855810138 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.17543859649122806, + "acc_stderr": 0.029170885500727668, + "acc_norm": 0.17543859649122806, + "acc_norm_stderr": 0.029170885500727668 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.23745410036719705, + "mc1_stderr": 0.014896277441041855, + "mc2": 0.4914690919219968, + "mc2_stderr": 0.01691658252969465 + }, + "all": { + "acc": 0.24346412924226338, + "acc_stderr": 0.03127341802676303, + "acc_norm": 0.24472137844217898, + "acc_norm_stderr": 0.03129231215022856, + "mc1": 0.23745410036719705, + "mc1_stderr": 0.014896277441041855, + "mc2": 0.4914690919219968, + "mc2_stderr": 0.01691658252969465 + } + }, + "versions": { + "harness|arc:challenge|25": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + 
"harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "all": 0 + }, + "config_general": { + "model_name": "TheBloke/WizardLM-30B-Uncensored-GPTQ", + "model_sha": "43c701ddbe0bceac26c860307e06763cc5203500", + "model_dtype": "torch.float16", + "lighteval_sha": "9a6ba3212080b87510982c30fdec55b87dcab0c7", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null + }, + "config_tasks": { + "harness|arc:challenge": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM 
Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task" + }, + "summary_tasks": { + "harness|arc:challenge|25": { + "hashes": { + "hash_examples": "17b0cae357c0259e", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "61571bf68d6d89aa", + "hash_cont_tokens": "8210decc6ff6f7df" + }, + "truncated": 0, + "non-truncated": 4687, + "padded": 4687, + "non-padded": 0, + "effective_few_shots": 25.0, + "num_truncated_few_shots": 0 + }, + "harness|hellaswag|10": { + "hashes": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + 
"hash_input_tokens": "29906669b1c7054a", + "hash_cont_tokens": "b3b9e9017afa63af" + }, + "truncated": 0, + "non-truncated": 40168, + "padded": 40113, + "non-padded": 55, + "effective_few_shots": 10.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hashes": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "c54ff61ad0273dd7", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-anatomy|5": { + "hashes": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "be31a1e22aef5f90", + "hash_cont_tokens": "f11971a765cb609f" + }, + "truncated": 0, + "non-truncated": 540, + "padded": 540, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-astronomy|5": { + "hashes": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "277a7b1fad566940", + "hash_cont_tokens": "bf30e5d3f48250cb" + }, + "truncated": 0, + "non-truncated": 608, + "padded": 608, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-business_ethics|5": { + "hashes": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "ba552605bc116de5", + "hash_cont_tokens": "bc1dd9b2d995eb61" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hashes": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "428c7563d0b98ab9", + "hash_cont_tokens": "890a119624b3b935" + }, + "truncated": 0, + "non-truncated": 1060, + "padded": 1060, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_biology|5": { + "hashes": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "da036601573942e2", + "hash_cont_tokens": "875cde3af7a0ee14" + }, + "truncated": 0, + "non-truncated": 576, + "padded": 576, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_chemistry|5": { + "hashes": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "94e0196d6aded13d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_computer_science|5": { + "hashes": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "6e4d0f4a8d36690b", + "hash_cont_tokens": "ffc0fe414cdc4a83" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_mathematics|5": { + "hashes": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "614054d17109a25d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + 
"non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_medicine|5": { + "hashes": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "1d633b3cc0524ba8", + "hash_cont_tokens": "1f88b00d41957d82" + }, + "truncated": 0, + "non-truncated": 692, + "padded": 692, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_physics|5": { + "hashes": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "5421d9a1af86cbd4", + "hash_cont_tokens": "f7b8097afc16a47c" + }, + "truncated": 0, + "non-truncated": 408, + "padded": 408, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-computer_security|5": { + "hashes": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "5e6b70ecb333cf18", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hashes": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "c2ef11a87264ceed", + "hash_cont_tokens": "aa0e8bc655f2f641" + }, + "truncated": 0, + "non-truncated": 940, + "padded": 940, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-econometrics|5": { + "hashes": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "ecaccd912a4c3978", + "hash_cont_tokens": "bfb7e3c3c88313f1" + }, + "truncated": 0, + "non-truncated": 456, + "padded": 456, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hashes": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "1590c84291399be8", + "hash_cont_tokens": "2425a3f084a591ef" + }, + "truncated": 0, + "non-truncated": 580, + "padded": 580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hashes": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "3269597f715b0da1", + "hash_cont_tokens": "f52691aef15a407b" + }, + "truncated": 0, + "non-truncated": 1512, + "padded": 1512, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-formal_logic|5": { + "hashes": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "a2800d20f3ab8d7c", + "hash_cont_tokens": "f515d598d9c21263" + }, + "truncated": 0, + "non-truncated": 504, + "padded": 504, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-global_facts|5": { + "hashes": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "94ed44b3772505ad", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_biology|5": { + "hashes": { + 
"hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "24423acb928db768", + "hash_cont_tokens": "bd85a4156a3613ee" + }, + "truncated": 0, + "non-truncated": 1240, + "padded": 1240, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hashes": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "831ff35c474e5cef", + "hash_cont_tokens": "a95c97af1c14e068" + }, + "truncated": 0, + "non-truncated": 812, + "padded": 812, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hashes": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "8c34e0f2bda77358", + "hash_cont_tokens": "8abfedef914e33c9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hashes": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "f1f73dd687da18d7", + "hash_cont_tokens": "674fc454bdc5ac93" + }, + "truncated": 660, + "non-truncated": 0, + "padded": 0, + "non-padded": 660, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_geography|5": { + "hashes": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "7c5547c7da5bc793", + "hash_cont_tokens": "03a5012b916274ea" + }, + "truncated": 0, + "non-truncated": 792, + "padded": 792, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hashes": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "f62991cb6a496b05", + "hash_cont_tokens": "a83effb8f76b7d7c" + }, + "truncated": 0, + "non-truncated": 772, + "padded": 772, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hashes": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "4cef2aff6e3d59ed", + "hash_cont_tokens": "c583432ad27fcfe0" + }, + "truncated": 0, + "non-truncated": 1560, + "padded": 1560, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hashes": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "6e2577ea4082ed2b", + "hash_cont_tokens": "24f5dc613660300b" + }, + "truncated": 0, + "non-truncated": 1080, + "padded": 1080, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hashes": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "c5fc9aeb1079c8e4", + "hash_cont_tokens": "f47f041de50333b9" + }, + "truncated": 0, + "non-truncated": 952, + "padded": 952, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_physics|5": { + "hashes": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": 
"fc78b4997e436734", + "hash_input_tokens": "555fc385cffa84ca", + "hash_cont_tokens": "ba2efcd283e938cc" + }, + "truncated": 0, + "non-truncated": 604, + "padded": 604, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hashes": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "febd23cbf9973b7f", + "hash_cont_tokens": "942069cd363844d9" + }, + "truncated": 0, + "non-truncated": 2180, + "padded": 2180, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hashes": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "424b02981230ee83", + "hash_cont_tokens": "955ed42b6f7fa019" + }, + "truncated": 0, + "non-truncated": 864, + "padded": 864, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hashes": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "50c9ff438c85a69e", + "hash_cont_tokens": "cdd0b3dc06d933e5" + }, + "truncated": 816, + "non-truncated": 0, + "padded": 0, + "non-padded": 816, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hashes": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "054824cc474caef5", + "hash_cont_tokens": "9a864184946033ac" + }, + "truncated": 8, + "non-truncated": 940, + "padded": 940, + "non-padded": 8, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_aging|5": { + "hashes": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "541a75f071dcf579", + "hash_cont_tokens": "142a4a8a1138a214" + }, + "truncated": 0, + "non-truncated": 892, + "padded": 892, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_sexuality|5": { + "hashes": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "04269e5c5a257dd9", + "hash_cont_tokens": "bc54813e809b796d" + }, + "truncated": 0, + "non-truncated": 524, + "padded": 524, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-international_law|5": { + "hashes": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "d93ba9d9d38e4397", + "hash_cont_tokens": "dc45b45fcda18e5d" + }, + "truncated": 0, + "non-truncated": 484, + "padded": 484, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-jurisprudence|5": { + "hashes": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "9eeaccd2698b4f5a", + "hash_cont_tokens": "e3a8cd951b6e3469" + }, + "truncated": 0, + "non-truncated": 432, + "padded": 432, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hashes": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "b4f08f544f2b7576", + "hash_cont_tokens": "1e80dbd30f6453d5" + }, + "truncated": 0, + 
"non-truncated": 652, + "padded": 648, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-machine_learning|5": { + "hashes": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "900c2a51f1174b9f", + "hash_cont_tokens": "9b37da7777378ca9" + }, + "truncated": 0, + "non-truncated": 448, + "padded": 448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-management|5": { + "hashes": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "6b36efb4689c6eca", + "hash_cont_tokens": "a01d6d39a83c4597" + }, + "truncated": 0, + "non-truncated": 412, + "padded": 412, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-marketing|5": { + "hashes": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "2aaac78a0cfed47a", + "hash_cont_tokens": "6aeaed4d823c98aa" + }, + "truncated": 0, + "non-truncated": 936, + "padded": 936, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-medical_genetics|5": { + "hashes": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "886ca823b41c094a", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-miscellaneous|5": { + "hashes": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "72fd71de7675e7d0", + "hash_cont_tokens": "9b0ab02a64603081" + }, + "truncated": 0, + "non-truncated": 3132, + "padded": 3132, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_disputes|5": { + "hashes": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "f3ca0dd8e7a1eb09", + "hash_cont_tokens": "8badf768f7b0467a" + }, + "truncated": 0, + "non-truncated": 1384, + "padded": 1354, + "non-padded": 30, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hashes": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "3e793631e951f23c", + "hash_cont_tokens": "32ae620376b2bbba" + }, + "truncated": 0, + "non-truncated": 3580, + "padded": 3580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-nutrition|5": { + "hashes": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "59753c2144ea93af", + "hash_cont_tokens": "3071def75bacc404" + }, + "truncated": 0, + "non-truncated": 1224, + "padded": 1224, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-philosophy|5": { + "hashes": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "bd8d3dbed15a8c34", + "hash_cont_tokens": "9f6ff69d23a48783" + }, + "truncated": 0, + "non-truncated": 1244, + "padded": 1244, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-prehistory|5": { + "hashes": { + 
"hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "3573cd87facbb7c5", + "hash_cont_tokens": "de469d2b981e32a3" + }, + "truncated": 0, + "non-truncated": 1296, + "padded": 1296, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_accounting|5": { + "hashes": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "17e721bc1a7cbb47", + "hash_cont_tokens": "c46f74d2dfc7b13b" + }, + "truncated": 0, + "non-truncated": 1128, + "padded": 1128, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_law|5": { + "hashes": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "9178e10bd0763ec4", + "hash_cont_tokens": "2e590029ef41fbcd" + }, + "truncated": 604, + "non-truncated": 5532, + "padded": 5524, + "non-padded": 612, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_medicine|5": { + "hashes": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "f5a22012a54f70ea", + "hash_cont_tokens": "fe35cfa9c6ca802e" + }, + "truncated": 0, + "non-truncated": 1088, + "padded": 1088, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_psychology|5": { + "hashes": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "0dfb73a8eb3f692c", + "hash_cont_tokens": "f020fbddf72c8652" + }, + "truncated": 0, + "non-truncated": 2448, + "padded": 2448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-public_relations|5": { + "hashes": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "1710c6ba4c9f3cbd", + "hash_cont_tokens": "568f585a259965c1" + }, + "truncated": 0, + "non-truncated": 440, + "padded": 440, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-security_studies|5": { + "hashes": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "d49711415961ced7", + "hash_cont_tokens": "cc6fd7cccd64cd5d" + }, + "truncated": 0, + "non-truncated": 980, + "padded": 980, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-sociology|5": { + "hashes": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "828999f7624cbe7e", + "hash_cont_tokens": "c3a3bdfd177eed5b" + }, + "truncated": 0, + "non-truncated": 804, + "padded": 804, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hashes": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "42054621e718dbee", + "hash_cont_tokens": "2568d0e8e36fa959" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-virology|5": { + "hashes": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "6c4f0aa4dc859c04", + 
"hash_cont_tokens": "926cf60b0891f374" + }, + "truncated": 0, + "non-truncated": 664, + "padded": 664, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-world_religions|5": { + "hashes": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "6c75d44e092ff24f", + "hash_cont_tokens": "c525a5de974c1ea3" + }, + "truncated": 0, + "non-truncated": 684, + "padded": 684, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|truthfulqa:mc|0": { + "hashes": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "2738d7ed7075faa7", + "hash_cont_tokens": "c014154380b74b9e" + }, + "truncated": 0, + "non-truncated": 9996, + "padded": 9996, + "non-padded": 0, + "effective_few_shots": 0.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "d84d18e9a963753d", + "hash_full_prompts": "12b540783521a8e6", + "hash_input_tokens": "6fecf578c508db6a", + "hash_cont_tokens": "fb1646e2bdd5fc38" + }, + "total_evaluation_time_secondes": "9853.765189886093", + "truncated": 2088, + "non-truncated": 108931, + "padded": 108834, + "non-padded": 2185, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/TheBloke/WizardLM-30B-Uncensored-GPTQ/results_2023-11-07T17-24-26.800307.json b/eval-results/TheBloke/WizardLM-30B-Uncensored-GPTQ/results_2023-11-07T17-24-26.800307.json new file mode 100644 index 0000000000000000000000000000000000000000..3472b0c975ea2656a2eddff9d69dfc80acf560e6 --- /dev/null +++ b/eval-results/TheBloke/WizardLM-30B-Uncensored-GPTQ/results_2023-11-07T17-24-26.800307.json @@ -0,0 +1,107 @@ +{ + "config_general": { + "lighteval_sha": "167773f1d5d1647c60dadc31c9e731ab7dbcbbad", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "", + "model_name": "TheBloke/WizardLM-30B-Uncensored-GPTQ", + "model_sha": "98c19ab784ab5ff5e086b376ad6e81a30cae8457", + "model_dtype": "torch.float16", + "model_size": "15.83 GB" + }, + "results": { + "harness|drop|3": { + "em": 0.11220637583892618, + "em_stderr": 0.003232246172292982, + "f1": 0.19735633389261756, + "f1_stderr": 0.0034729011607307052 + }, + "harness|gsm8k|5": { + "acc": 0.21076573161485973, + "acc_stderr": 0.011234280469030465 + }, + "harness|winogrande|5": { + "acc": 0.7316495659037096, + "acc_stderr": 0.012453340359561195 + }, + "all": { + "em": 0.11220637583892618, + "em_stderr": 0.003232246172292982, + "f1": 0.19735633389261756, + "f1_stderr": 0.0034729011607307052, + "acc": 0.47120764875928467, + "acc_stderr": 0.01184381041429583 + } + }, + "versions": { + "all": 0, + "harness|drop|3": 1, + "harness|gsm8k|5": 0, + "harness|winogrande|5": 0 + }, + "config_tasks": { + "harness|drop": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|drop|3": { + "hashes": { + "hash_examples": "1d27416e8324e9a3", + "hash_full_prompts": "a5513ff9a741b385", + "hash_input_tokens": "afa3f956b5946008", + "hash_cont_tokens": "3683e666f013c75d" + }, + "truncated": 1263, + "non_truncated": 8273, + "padded": 0, + "non_padded": 9536, + "effective_few_shots": 3.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "6f81fd8346219949", 
+ "hash_cont_tokens": "9d53433ccaa79e42" + }, + "truncated": 0, + "non_truncated": 1319, + "padded": 0, + "non_padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "b84988137a00c1f4", + "hash_cont_tokens": "f08975ad6f2d5864" + }, + "truncated": 0, + "non_truncated": 1267, + "padded": 2432, + "non_padded": 102, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "9b4d8993161e637d", + "hash_full_prompts": "08215e527b7e60a5", + "hash_input_tokens": "fb75bd68fb756923", + "hash_cont_tokens": "ec5ae869b4e55f5b" + }, + "truncated": 1263, + "non_truncated": 10859, + "padded": 2432, + "non_padded": 10957, + "num_truncated_few_shots": 0, + "total_evaluation_time_secondes": 0 + } +} \ No newline at end of file diff --git a/eval-results/TheBloke/WizardLM-30B-fp16/results_2023-07-31T12-57-51.572522.json b/eval-results/TheBloke/WizardLM-30B-fp16/results_2023-07-31T12-57-51.572522.json new file mode 100644 index 0000000000000000000000000000000000000000..8be2908c28dee0c40e66e548471326c8de0aa75c --- /dev/null +++ b/eval-results/TheBloke/WizardLM-30B-fp16/results_2023-07-31T12-57-51.572522.json @@ -0,0 +1,1365 @@ +{ + "results": { + "harness|arc:challenge|25": { + "acc": 0.6023890784982935, + "acc_stderr": 0.014301752223279545, + "acc_norm": 0.6254266211604096, + "acc_norm_stderr": 0.014144193471893456 + }, + "harness|hellaswag|10": { + "acc": 0.6337382991435969, + "acc_stderr": 0.004807975515446488, + "acc_norm": 0.8328022306313483, + "acc_norm_stderr": 0.00372389730564549 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.25, + "acc_stderr": 0.04351941398892446, + "acc_norm": 0.25, + "acc_norm_stderr": 0.04351941398892446 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.5481481481481482, + "acc_stderr": 0.04299268905480864, + "acc_norm": 0.5481481481481482, + "acc_norm_stderr": 0.04299268905480864 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.6447368421052632, + "acc_stderr": 0.038947344870133176, + "acc_norm": 0.6447368421052632, + "acc_norm_stderr": 0.038947344870133176 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.63, + "acc_stderr": 0.04852365870939098, + "acc_norm": 0.63, + "acc_norm_stderr": 0.04852365870939098 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.6, + "acc_stderr": 0.030151134457776285, + "acc_norm": 0.6, + "acc_norm_stderr": 0.030151134457776285 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.6666666666666666, + "acc_stderr": 0.039420826399272135, + "acc_norm": 0.6666666666666666, + "acc_norm_stderr": 0.039420826399272135 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.39, + "acc_stderr": 0.04902071300001975, + "acc_norm": 0.39, + "acc_norm_stderr": 0.04902071300001975 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.41, + "acc_stderr": 0.049431107042371025, + "acc_norm": 0.41, + "acc_norm_stderr": 0.049431107042371025 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.32, + "acc_stderr": 0.046882617226215034, + "acc_norm": 0.32, + "acc_norm_stderr": 0.046882617226215034 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.5433526011560693, + "acc_stderr": 0.03798106566014498, + "acc_norm": 0.5433526011560693, + "acc_norm_stderr": 0.03798106566014498 + }, + 
"harness|hendrycksTest-college_physics|5": { + "acc": 0.3333333333333333, + "acc_stderr": 0.04690650298201943, + "acc_norm": 0.3333333333333333, + "acc_norm_stderr": 0.04690650298201943 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.69, + "acc_stderr": 0.04648231987117316, + "acc_norm": 0.69, + "acc_norm_stderr": 0.04648231987117316 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.49361702127659574, + "acc_stderr": 0.032683358999363366, + "acc_norm": 0.49361702127659574, + "acc_norm_stderr": 0.032683358999363366 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.39473684210526316, + "acc_stderr": 0.04598188057816541, + "acc_norm": 0.39473684210526316, + "acc_norm_stderr": 0.04598188057816541 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.46206896551724136, + "acc_stderr": 0.041546596717075474, + "acc_norm": 0.46206896551724136, + "acc_norm_stderr": 0.041546596717075474 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.3783068783068783, + "acc_stderr": 0.02497695405315525, + "acc_norm": 0.3783068783068783, + "acc_norm_stderr": 0.02497695405315525 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.373015873015873, + "acc_stderr": 0.04325506042017086, + "acc_norm": 0.373015873015873, + "acc_norm_stderr": 0.04325506042017086 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.36, + "acc_stderr": 0.048241815132442176, + "acc_norm": 0.36, + "acc_norm_stderr": 0.048241815132442176 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.6935483870967742, + "acc_stderr": 0.026226485652553883, + "acc_norm": 0.6935483870967742, + "acc_norm_stderr": 0.026226485652553883 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.42857142857142855, + "acc_stderr": 0.034819048444388045, + "acc_norm": 0.42857142857142855, + "acc_norm_stderr": 0.034819048444388045 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.62, + "acc_stderr": 0.04878317312145632, + "acc_norm": 0.62, + "acc_norm_stderr": 0.04878317312145632 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.7272727272727273, + "acc_stderr": 0.0347769116216366, + "acc_norm": 0.7272727272727273, + "acc_norm_stderr": 0.0347769116216366 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.7727272727272727, + "acc_stderr": 0.029857515673386414, + "acc_norm": 0.7727272727272727, + "acc_norm_stderr": 0.029857515673386414 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.844559585492228, + "acc_stderr": 0.02614848346915332, + "acc_norm": 0.844559585492228, + "acc_norm_stderr": 0.02614848346915332 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.5897435897435898, + "acc_stderr": 0.02493931390694079, + "acc_norm": 0.5897435897435898, + "acc_norm_stderr": 0.02493931390694079 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.3074074074074074, + "acc_stderr": 0.028133252578815642, + "acc_norm": 0.3074074074074074, + "acc_norm_stderr": 0.028133252578815642 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.5714285714285714, + "acc_stderr": 0.032145368597886394, + "acc_norm": 0.5714285714285714, + "acc_norm_stderr": 0.032145368597886394 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.33112582781456956, + "acc_stderr": 0.038425817186598696, + "acc_norm": 0.33112582781456956, + "acc_norm_stderr": 0.038425817186598696 + }, + 
"harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.7926605504587156, + "acc_stderr": 0.01738141556360868, + "acc_norm": 0.7926605504587156, + "acc_norm_stderr": 0.01738141556360868 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.4722222222222222, + "acc_stderr": 0.0340470532865388, + "acc_norm": 0.4722222222222222, + "acc_norm_stderr": 0.0340470532865388 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.7941176470588235, + "acc_stderr": 0.028379449451588663, + "acc_norm": 0.7941176470588235, + "acc_norm_stderr": 0.028379449451588663 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.8059071729957806, + "acc_stderr": 0.02574490253229093, + "acc_norm": 0.8059071729957806, + "acc_norm_stderr": 0.02574490253229093 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.6591928251121076, + "acc_stderr": 0.031811497470553604, + "acc_norm": 0.6591928251121076, + "acc_norm_stderr": 0.031811497470553604 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.6870229007633588, + "acc_stderr": 0.04066962905677698, + "acc_norm": 0.6870229007633588, + "acc_norm_stderr": 0.04066962905677698 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.7603305785123967, + "acc_stderr": 0.03896878985070415, + "acc_norm": 0.7603305785123967, + "acc_norm_stderr": 0.03896878985070415 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.7037037037037037, + "acc_stderr": 0.04414343666854934, + "acc_norm": 0.7037037037037037, + "acc_norm_stderr": 0.04414343666854934 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.7361963190184049, + "acc_stderr": 0.03462419931615623, + "acc_norm": 0.7361963190184049, + "acc_norm_stderr": 0.03462419931615623 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.39285714285714285, + "acc_stderr": 0.04635550135609976, + "acc_norm": 0.39285714285714285, + "acc_norm_stderr": 0.04635550135609976 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.7475728155339806, + "acc_stderr": 0.04301250399690878, + "acc_norm": 0.7475728155339806, + "acc_norm_stderr": 0.04301250399690878 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.8632478632478633, + "acc_stderr": 0.022509033937077785, + "acc_norm": 0.8632478632478633, + "acc_norm_stderr": 0.022509033937077785 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.65, + "acc_stderr": 0.047937248544110196, + "acc_norm": 0.65, + "acc_norm_stderr": 0.047937248544110196 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.7701149425287356, + "acc_stderr": 0.01504630184669182, + "acc_norm": 0.7701149425287356, + "acc_norm_stderr": 0.01504630184669182 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.6589595375722543, + "acc_stderr": 0.025522474632121612, + "acc_norm": 0.6589595375722543, + "acc_norm_stderr": 0.025522474632121612 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.3776536312849162, + "acc_stderr": 0.01621414875213663, + "acc_norm": 0.3776536312849162, + "acc_norm_stderr": 0.01621414875213663 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.6568627450980392, + "acc_stderr": 0.02718449890994161, + "acc_norm": 0.6568627450980392, + "acc_norm_stderr": 0.02718449890994161 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.6816720257234726, + "acc_stderr": 0.026457225067811025, + "acc_norm": 0.6816720257234726, + "acc_norm_stderr": 0.026457225067811025 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.6944444444444444, + "acc_stderr": 
0.025630824975621344, + "acc_norm": 0.6944444444444444, + "acc_norm_stderr": 0.025630824975621344 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.4716312056737589, + "acc_stderr": 0.029779450957303062, + "acc_norm": 0.4716312056737589, + "acc_norm_stderr": 0.029779450957303062 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.4452411994784876, + "acc_stderr": 0.012693421303973294, + "acc_norm": 0.4452411994784876, + "acc_norm_stderr": 0.012693421303973294 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.6029411764705882, + "acc_stderr": 0.029722152099280065, + "acc_norm": 0.6029411764705882, + "acc_norm_stderr": 0.029722152099280065 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.6437908496732027, + "acc_stderr": 0.019373332420724493, + "acc_norm": 0.6437908496732027, + "acc_norm_stderr": 0.019373332420724493 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.6545454545454545, + "acc_stderr": 0.04554619617541054, + "acc_norm": 0.6545454545454545, + "acc_norm_stderr": 0.04554619617541054 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.6693877551020408, + "acc_stderr": 0.0301164262965406, + "acc_norm": 0.6693877551020408, + "acc_norm_stderr": 0.0301164262965406 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.7960199004975125, + "acc_stderr": 0.02849317624532607, + "acc_norm": 0.7960199004975125, + "acc_norm_stderr": 0.02849317624532607 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.83, + "acc_stderr": 0.0377525168068637, + "acc_norm": 0.83, + "acc_norm_stderr": 0.0377525168068637 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.4759036144578313, + "acc_stderr": 0.038879718495972646, + "acc_norm": 0.4759036144578313, + "acc_norm_stderr": 0.038879718495972646 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.7777777777777778, + "acc_stderr": 0.03188578017686397, + "acc_norm": 0.7777777777777778, + "acc_norm_stderr": 0.03188578017686397 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.36107711138310894, + "mc1_stderr": 0.016814312844836882, + "mc2": 0.5248618748874638, + "mc2_stderr": 0.01580642715291066 + }, + "all": { + "acc": 0.5912283618413695, + "acc_stderr": 0.03383377056521543, + "acc_norm": 0.5949927936066215, + "acc_norm_stderr": 0.033812725870958016, + "mc1": 0.36107711138310894, + "mc1_stderr": 0.016814312844836882, + "mc2": 0.5248618748874638, + "mc2_stderr": 0.01580642715291066 + } + }, + "versions": { + "harness|arc:challenge|25": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, 
+ "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "all": 0 + }, + "config_general": { + "model_name": "TheBloke/WizardLM-30B-fp16", + "model_sha": "465f87a243969963f25ae6cf8f8d2de6c0898bbe", + "model_dtype": "torch.float16", + "lighteval_sha": "03c2fad20ff7f5334c33cfee459024b8d7e4a109", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null + }, + "config_tasks": { + "harness|arc:challenge": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM 
Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task" + }, + "summary_tasks": { + "harness|arc:challenge|25": { + "hashes": { + "hash_examples": "17b0cae357c0259e", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "61571bf68d6d89aa", + "hash_cont_tokens": "8210decc6ff6f7df" + }, + "truncated": 0, + "non-truncated": 4687, + "padded": 4687, + "non-padded": 0, + "effective_few_shots": 25.0, + "num_truncated_few_shots": 0 + }, + "harness|hellaswag|10": { + "hashes": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "29906669b1c7054a", + "hash_cont_tokens": "b3b9e9017afa63af" + }, + "truncated": 0, + "non-truncated": 40168, + "padded": 40113, + "non-padded": 55, + "effective_few_shots": 10.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hashes": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "c54ff61ad0273dd7", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + 
"padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-anatomy|5": { + "hashes": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "be31a1e22aef5f90", + "hash_cont_tokens": "f11971a765cb609f" + }, + "truncated": 0, + "non-truncated": 540, + "padded": 540, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-astronomy|5": { + "hashes": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "277a7b1fad566940", + "hash_cont_tokens": "bf30e5d3f48250cb" + }, + "truncated": 0, + "non-truncated": 608, + "padded": 608, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-business_ethics|5": { + "hashes": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "ba552605bc116de5", + "hash_cont_tokens": "bc1dd9b2d995eb61" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hashes": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "428c7563d0b98ab9", + "hash_cont_tokens": "890a119624b3b935" + }, + "truncated": 0, + "non-truncated": 1060, + "padded": 1060, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_biology|5": { + "hashes": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "da036601573942e2", + "hash_cont_tokens": "875cde3af7a0ee14" + }, + "truncated": 0, + "non-truncated": 576, + "padded": 576, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_chemistry|5": { + "hashes": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "94e0196d6aded13d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_computer_science|5": { + "hashes": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "6e4d0f4a8d36690b", + "hash_cont_tokens": "ffc0fe414cdc4a83" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_mathematics|5": { + "hashes": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "614054d17109a25d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_medicine|5": { + "hashes": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "1d633b3cc0524ba8", + "hash_cont_tokens": "1f88b00d41957d82" + }, + "truncated": 0, + "non-truncated": 692, + "padded": 692, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_physics|5": { + "hashes": { + 
"hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "5421d9a1af86cbd4", + "hash_cont_tokens": "f7b8097afc16a47c" + }, + "truncated": 0, + "non-truncated": 408, + "padded": 408, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-computer_security|5": { + "hashes": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "5e6b70ecb333cf18", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hashes": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "c2ef11a87264ceed", + "hash_cont_tokens": "aa0e8bc655f2f641" + }, + "truncated": 0, + "non-truncated": 940, + "padded": 940, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-econometrics|5": { + "hashes": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "ecaccd912a4c3978", + "hash_cont_tokens": "bfb7e3c3c88313f1" + }, + "truncated": 0, + "non-truncated": 456, + "padded": 456, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hashes": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "1590c84291399be8", + "hash_cont_tokens": "2425a3f084a591ef" + }, + "truncated": 0, + "non-truncated": 580, + "padded": 580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hashes": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "3269597f715b0da1", + "hash_cont_tokens": "f52691aef15a407b" + }, + "truncated": 0, + "non-truncated": 1512, + "padded": 1512, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-formal_logic|5": { + "hashes": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "a2800d20f3ab8d7c", + "hash_cont_tokens": "f515d598d9c21263" + }, + "truncated": 0, + "non-truncated": 504, + "padded": 504, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-global_facts|5": { + "hashes": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "94ed44b3772505ad", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_biology|5": { + "hashes": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "24423acb928db768", + "hash_cont_tokens": "bd85a4156a3613ee" + }, + "truncated": 0, + "non-truncated": 1240, + "padded": 1240, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hashes": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "831ff35c474e5cef", + "hash_cont_tokens": 
"a95c97af1c14e068" + }, + "truncated": 0, + "non-truncated": 812, + "padded": 812, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hashes": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "8c34e0f2bda77358", + "hash_cont_tokens": "8abfedef914e33c9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hashes": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "f1f73dd687da18d7", + "hash_cont_tokens": "674fc454bdc5ac93" + }, + "truncated": 660, + "non-truncated": 0, + "padded": 0, + "non-padded": 660, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_geography|5": { + "hashes": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "7c5547c7da5bc793", + "hash_cont_tokens": "03a5012b916274ea" + }, + "truncated": 0, + "non-truncated": 792, + "padded": 792, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hashes": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "f62991cb6a496b05", + "hash_cont_tokens": "a83effb8f76b7d7c" + }, + "truncated": 0, + "non-truncated": 772, + "padded": 772, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hashes": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "4cef2aff6e3d59ed", + "hash_cont_tokens": "c583432ad27fcfe0" + }, + "truncated": 0, + "non-truncated": 1560, + "padded": 1560, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hashes": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "6e2577ea4082ed2b", + "hash_cont_tokens": "24f5dc613660300b" + }, + "truncated": 0, + "non-truncated": 1080, + "padded": 1080, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hashes": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "c5fc9aeb1079c8e4", + "hash_cont_tokens": "f47f041de50333b9" + }, + "truncated": 0, + "non-truncated": 952, + "padded": 952, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_physics|5": { + "hashes": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "555fc385cffa84ca", + "hash_cont_tokens": "ba2efcd283e938cc" + }, + "truncated": 0, + "non-truncated": 604, + "padded": 604, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hashes": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "febd23cbf9973b7f", + "hash_cont_tokens": "942069cd363844d9" + }, + "truncated": 0, + "non-truncated": 2180, + 
"padded": 2180, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hashes": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "424b02981230ee83", + "hash_cont_tokens": "955ed42b6f7fa019" + }, + "truncated": 0, + "non-truncated": 864, + "padded": 864, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hashes": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "50c9ff438c85a69e", + "hash_cont_tokens": "cdd0b3dc06d933e5" + }, + "truncated": 816, + "non-truncated": 0, + "padded": 0, + "non-padded": 816, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hashes": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "054824cc474caef5", + "hash_cont_tokens": "9a864184946033ac" + }, + "truncated": 8, + "non-truncated": 940, + "padded": 940, + "non-padded": 8, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_aging|5": { + "hashes": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "541a75f071dcf579", + "hash_cont_tokens": "142a4a8a1138a214" + }, + "truncated": 0, + "non-truncated": 892, + "padded": 892, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_sexuality|5": { + "hashes": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "04269e5c5a257dd9", + "hash_cont_tokens": "bc54813e809b796d" + }, + "truncated": 0, + "non-truncated": 524, + "padded": 524, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-international_law|5": { + "hashes": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "d93ba9d9d38e4397", + "hash_cont_tokens": "dc45b45fcda18e5d" + }, + "truncated": 0, + "non-truncated": 484, + "padded": 484, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-jurisprudence|5": { + "hashes": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "9eeaccd2698b4f5a", + "hash_cont_tokens": "e3a8cd951b6e3469" + }, + "truncated": 0, + "non-truncated": 432, + "padded": 432, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hashes": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "b4f08f544f2b7576", + "hash_cont_tokens": "1e80dbd30f6453d5" + }, + "truncated": 0, + "non-truncated": 652, + "padded": 648, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-machine_learning|5": { + "hashes": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "900c2a51f1174b9f", + "hash_cont_tokens": "9b37da7777378ca9" + }, + "truncated": 0, + "non-truncated": 448, + "padded": 448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-management|5": { + 
"hashes": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "6b36efb4689c6eca", + "hash_cont_tokens": "a01d6d39a83c4597" + }, + "truncated": 0, + "non-truncated": 412, + "padded": 412, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-marketing|5": { + "hashes": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "2aaac78a0cfed47a", + "hash_cont_tokens": "6aeaed4d823c98aa" + }, + "truncated": 0, + "non-truncated": 936, + "padded": 936, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-medical_genetics|5": { + "hashes": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "886ca823b41c094a", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-miscellaneous|5": { + "hashes": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "72fd71de7675e7d0", + "hash_cont_tokens": "9b0ab02a64603081" + }, + "truncated": 0, + "non-truncated": 3132, + "padded": 3132, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_disputes|5": { + "hashes": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "f3ca0dd8e7a1eb09", + "hash_cont_tokens": "8badf768f7b0467a" + }, + "truncated": 0, + "non-truncated": 1384, + "padded": 1354, + "non-padded": 30, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hashes": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "3e793631e951f23c", + "hash_cont_tokens": "32ae620376b2bbba" + }, + "truncated": 0, + "non-truncated": 3580, + "padded": 3580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-nutrition|5": { + "hashes": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "59753c2144ea93af", + "hash_cont_tokens": "3071def75bacc404" + }, + "truncated": 0, + "non-truncated": 1224, + "padded": 1224, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-philosophy|5": { + "hashes": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "bd8d3dbed15a8c34", + "hash_cont_tokens": "9f6ff69d23a48783" + }, + "truncated": 0, + "non-truncated": 1244, + "padded": 1244, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-prehistory|5": { + "hashes": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "3573cd87facbb7c5", + "hash_cont_tokens": "de469d2b981e32a3" + }, + "truncated": 0, + "non-truncated": 1296, + "padded": 1296, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_accounting|5": { + "hashes": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "17e721bc1a7cbb47", + "hash_cont_tokens": 
"c46f74d2dfc7b13b" + }, + "truncated": 0, + "non-truncated": 1128, + "padded": 1128, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_law|5": { + "hashes": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "9178e10bd0763ec4", + "hash_cont_tokens": "2e590029ef41fbcd" + }, + "truncated": 604, + "non-truncated": 5532, + "padded": 5524, + "non-padded": 612, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_medicine|5": { + "hashes": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "f5a22012a54f70ea", + "hash_cont_tokens": "fe35cfa9c6ca802e" + }, + "truncated": 0, + "non-truncated": 1088, + "padded": 1088, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_psychology|5": { + "hashes": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "0dfb73a8eb3f692c", + "hash_cont_tokens": "f020fbddf72c8652" + }, + "truncated": 0, + "non-truncated": 2448, + "padded": 2448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-public_relations|5": { + "hashes": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "1710c6ba4c9f3cbd", + "hash_cont_tokens": "568f585a259965c1" + }, + "truncated": 0, + "non-truncated": 440, + "padded": 440, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-security_studies|5": { + "hashes": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "d49711415961ced7", + "hash_cont_tokens": "cc6fd7cccd64cd5d" + }, + "truncated": 0, + "non-truncated": 980, + "padded": 980, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-sociology|5": { + "hashes": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "828999f7624cbe7e", + "hash_cont_tokens": "c3a3bdfd177eed5b" + }, + "truncated": 0, + "non-truncated": 804, + "padded": 804, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hashes": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "42054621e718dbee", + "hash_cont_tokens": "2568d0e8e36fa959" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-virology|5": { + "hashes": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "6c4f0aa4dc859c04", + "hash_cont_tokens": "926cf60b0891f374" + }, + "truncated": 0, + "non-truncated": 664, + "padded": 664, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-world_religions|5": { + "hashes": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "6c75d44e092ff24f", + "hash_cont_tokens": "c525a5de974c1ea3" + }, + "truncated": 0, + "non-truncated": 684, + "padded": 684, + "non-padded": 0, + "effective_few_shots": 5.0, + 
"num_truncated_few_shots": 0 + }, + "harness|truthfulqa:mc|0": { + "hashes": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "2738d7ed7075faa7", + "hash_cont_tokens": "c014154380b74b9e" + }, + "truncated": 0, + "non-truncated": 9996, + "padded": 9996, + "non-padded": 0, + "effective_few_shots": 0.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "d84d18e9a963753d", + "hash_full_prompts": "12b540783521a8e6", + "hash_input_tokens": "6fecf578c508db6a", + "hash_cont_tokens": "fb1646e2bdd5fc38" + }, + "total_evaluation_time_secondes": "8461.860760211945", + "truncated": 2088, + "non-truncated": 108931, + "padded": 108834, + "non-padded": 2185, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/TheBloke/WizardLM-30B-fp16/results_2023-10-22T15-13-10.027241.json b/eval-results/TheBloke/WizardLM-30B-fp16/results_2023-10-22T15-13-10.027241.json new file mode 100644 index 0000000000000000000000000000000000000000..0f21bff00eabd05aa7211588f13e9b8c60b7c182 --- /dev/null +++ b/eval-results/TheBloke/WizardLM-30B-fp16/results_2023-10-22T15-13-10.027241.json @@ -0,0 +1,107 @@ +{ + "config_general": { + "model_name": "TheBloke/WizardLM-30B-fp16", + "model_sha": "465f87a243969963f25ae6cf8f8d2de6c0898bbe", + "model_size": "60.65 GB", + "model_dtype": "torch.float16", + "lighteval_sha": "0f318ecf002208468154899217b3ba7c6ae09374", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|drop|3": { + "em": 0.2868078859060403, + "em_stderr": 0.004631679094136414, + "f1": 0.36250838926174567, + "f1_stderr": 0.004522951158382507 + }, + "harness|gsm8k|5": { + "acc": 0.2221379833206975, + "acc_stderr": 0.011449986902435323 + }, + "harness|winogrande|5": { + "acc": 0.7750591949486977, + "acc_stderr": 0.011735043564126742 + }, + "all": { + "em": 0.2868078859060403, + "em_stderr": 0.004631679094136414, + "f1": 0.36250838926174567, + "f1_stderr": 0.004522951158382507, + "acc": 0.49859858913469757, + "acc_stderr": 0.011592515233281033 + } + }, + "versions": { + "harness|drop|3": 1, + "harness|gsm8k|5": 0, + "harness|winogrande|5": 0, + "all": 0 + }, + "config_tasks": { + "harness|drop": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|drop|3": { + "hashes": { + "hash_examples": "1d27416e8324e9a3", + "hash_full_prompts": "a5513ff9a741b385", + "hash_input_tokens": "61b608e0b5ceed76", + "hash_cont_tokens": "c361e453acf43a25" + }, + "truncated": 1263, + "non-truncated": 8273, + "padded": 0, + "non-padded": 9536, + "effective_few_shots": 3.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "bda342e47b5099b2", + "hash_cont_tokens": "863110018653a1bf" + }, + "truncated": 0, + "non-truncated": 1319, + "padded": 0, + "non-padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "c0bedf98cb040854", + "hash_cont_tokens": "f08975ad6f2d5864" + }, + "truncated": 0, + "non-truncated": 2534, + "padded": 2432, + "non-padded": 102, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": 
{ + "hashes": { + "hash_examples": "9b4d8993161e637d", + "hash_full_prompts": "08215e527b7e60a5", + "hash_input_tokens": "80afe720f936f8d2", + "hash_cont_tokens": "1cdde3c8e47af5cc" + }, + "total_evaluation_time_secondes": "18795.53040599823", + "truncated": 1263, + "non-truncated": 12126, + "padded": 2432, + "non-padded": 10957, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/TheBloke/WizardLM-30B-fp16/results_2023-10-23T00-26-26.066701.json b/eval-results/TheBloke/WizardLM-30B-fp16/results_2023-10-23T00-26-26.066701.json new file mode 100644 index 0000000000000000000000000000000000000000..208a3621abd7f6b9478832e42193655140ad4d2b --- /dev/null +++ b/eval-results/TheBloke/WizardLM-30B-fp16/results_2023-10-23T00-26-26.066701.json @@ -0,0 +1,107 @@ +{ + "config_general": { + "model_name": "TheBloke/WizardLM-30B-fp16", + "model_sha": "465f87a243969963f25ae6cf8f8d2de6c0898bbe", + "model_size": "60.65 GB", + "model_dtype": "torch.float16", + "lighteval_sha": "0f318ecf002208468154899217b3ba7c6ae09374", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|drop|3": { + "em": 0.2868078859060403, + "em_stderr": 0.004631679094136414, + "f1": 0.36250838926174567, + "f1_stderr": 0.004522951158382507 + }, + "harness|gsm8k|5": { + "acc": 0.2221379833206975, + "acc_stderr": 0.011449986902435323 + }, + "harness|winogrande|5": { + "acc": 0.7750591949486977, + "acc_stderr": 0.011735043564126742 + }, + "all": { + "em": 0.2868078859060403, + "em_stderr": 0.004631679094136414, + "f1": 0.36250838926174567, + "f1_stderr": 0.004522951158382507, + "acc": 0.49859858913469757, + "acc_stderr": 0.011592515233281033 + } + }, + "versions": { + "harness|drop|3": 1, + "harness|gsm8k|5": 0, + "harness|winogrande|5": 0, + "all": 0 + }, + "config_tasks": { + "harness|drop": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|drop|3": { + "hashes": { + "hash_examples": "1d27416e8324e9a3", + "hash_full_prompts": "a5513ff9a741b385", + "hash_input_tokens": "61b608e0b5ceed76", + "hash_cont_tokens": "c361e453acf43a25" + }, + "truncated": 1263, + "non-truncated": 8273, + "padded": 0, + "non-padded": 9536, + "effective_few_shots": 3.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "bda342e47b5099b2", + "hash_cont_tokens": "863110018653a1bf" + }, + "truncated": 0, + "non-truncated": 1319, + "padded": 0, + "non-padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "c0bedf98cb040854", + "hash_cont_tokens": "f08975ad6f2d5864" + }, + "truncated": 0, + "non-truncated": 2534, + "padded": 2432, + "non-padded": 102, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "9b4d8993161e637d", + "hash_full_prompts": "08215e527b7e60a5", + "hash_input_tokens": "80afe720f936f8d2", + "hash_cont_tokens": "1cdde3c8e47af5cc" + }, + "total_evaluation_time_secondes": "18409.288813591003", + "truncated": 1263, + "non-truncated": 12126, + "padded": 2432, + "non-padded": 10957, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git 
a/eval-results/TheBloke/WizardLM-33B-V1.0-Uncensored-GPTQ/results_2023-08-21T18-26-54.221283.json b/eval-results/TheBloke/WizardLM-33B-V1.0-Uncensored-GPTQ/results_2023-08-21T18-26-54.221283.json new file mode 100644 index 0000000000000000000000000000000000000000..b1ba7bb48545db44ad963e35c3c2482367440c6e --- /dev/null +++ b/eval-results/TheBloke/WizardLM-33B-V1.0-Uncensored-GPTQ/results_2023-08-21T18-26-54.221283.json @@ -0,0 +1,1365 @@ +{ + "results": { + "harness|arc:challenge|25": { + "acc": 0.2363481228668942, + "acc_stderr": 0.012414960524301842, + "acc_norm": 0.2738907849829352, + "acc_norm_stderr": 0.013032004972989501 + }, + "harness|hellaswag|10": { + "acc": 0.2528380800637323, + "acc_stderr": 0.004337506344899926, + "acc_norm": 0.26030671181039633, + "acc_norm_stderr": 0.004379051357024143 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.21, + "acc_stderr": 0.040936018074033256, + "acc_norm": 0.21, + "acc_norm_stderr": 0.040936018074033256 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.22962962962962963, + "acc_stderr": 0.03633384414073461, + "acc_norm": 0.22962962962962963, + "acc_norm_stderr": 0.03633384414073461 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.29605263157894735, + "acc_stderr": 0.037150621549989056, + "acc_norm": 0.29605263157894735, + "acc_norm_stderr": 0.037150621549989056 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.26, + "acc_stderr": 0.04408440022768078, + "acc_norm": 0.26, + "acc_norm_stderr": 0.04408440022768078 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.2981132075471698, + "acc_stderr": 0.028152837942493857, + "acc_norm": 0.2981132075471698, + "acc_norm_stderr": 0.028152837942493857 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.2569444444444444, + "acc_stderr": 0.03653946969442099, + "acc_norm": 0.2569444444444444, + "acc_norm_stderr": 0.03653946969442099 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.32, + "acc_stderr": 0.046882617226215034, + "acc_norm": 0.32, + "acc_norm_stderr": 0.046882617226215034 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.25, + "acc_stderr": 0.04351941398892446, + "acc_norm": 0.25, + "acc_norm_stderr": 0.04351941398892446 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.28, + "acc_stderr": 0.045126085985421276, + "acc_norm": 0.28, + "acc_norm_stderr": 0.045126085985421276 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.3352601156069364, + "acc_stderr": 0.03599586301247078, + "acc_norm": 0.3352601156069364, + "acc_norm_stderr": 0.03599586301247078 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.35294117647058826, + "acc_stderr": 0.04755129616062948, + "acc_norm": 0.35294117647058826, + "acc_norm_stderr": 0.04755129616062948 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.18, + "acc_stderr": 0.038612291966536955, + "acc_norm": 0.18, + "acc_norm_stderr": 0.038612291966536955 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.20851063829787234, + "acc_stderr": 0.026556982117838728, + "acc_norm": 0.20851063829787234, + "acc_norm_stderr": 0.026556982117838728 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.2543859649122807, + "acc_stderr": 0.040969851398436716, + "acc_norm": 0.2543859649122807, + "acc_norm_stderr": 0.040969851398436716 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.2413793103448276, + "acc_stderr": 0.03565998174135302, + "acc_norm": 0.2413793103448276, + 
"acc_norm_stderr": 0.03565998174135302 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.29365079365079366, + "acc_stderr": 0.023456037383982026, + "acc_norm": 0.29365079365079366, + "acc_norm_stderr": 0.023456037383982026 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.29365079365079366, + "acc_stderr": 0.04073524322147125, + "acc_norm": 0.29365079365079366, + "acc_norm_stderr": 0.04073524322147125 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.18, + "acc_stderr": 0.03861229196653695, + "acc_norm": 0.18, + "acc_norm_stderr": 0.03861229196653695 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.29354838709677417, + "acc_stderr": 0.025906087021319288, + "acc_norm": 0.29354838709677417, + "acc_norm_stderr": 0.025906087021319288 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.2512315270935961, + "acc_stderr": 0.030516530732694436, + "acc_norm": 0.2512315270935961, + "acc_norm_stderr": 0.030516530732694436 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.23, + "acc_stderr": 0.04229525846816506, + "acc_norm": 0.23, + "acc_norm_stderr": 0.04229525846816506 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.2545454545454545, + "acc_stderr": 0.03401506715249039, + "acc_norm": 0.2545454545454545, + "acc_norm_stderr": 0.03401506715249039 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.35353535353535354, + "acc_stderr": 0.03406086723547153, + "acc_norm": 0.35353535353535354, + "acc_norm_stderr": 0.03406086723547153 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.35233160621761656, + "acc_stderr": 0.03447478286414359, + "acc_norm": 0.35233160621761656, + "acc_norm_stderr": 0.03447478286414359 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.36153846153846153, + "acc_stderr": 0.024359581465396987, + "acc_norm": 0.36153846153846153, + "acc_norm_stderr": 0.024359581465396987 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.21851851851851853, + "acc_stderr": 0.025195752251823796, + "acc_norm": 0.21851851851851853, + "acc_norm_stderr": 0.025195752251823796 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.3487394957983193, + "acc_stderr": 0.03095663632856655, + "acc_norm": 0.3487394957983193, + "acc_norm_stderr": 0.03095663632856655 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.37748344370860926, + "acc_stderr": 0.039580272311215706, + "acc_norm": 0.37748344370860926, + "acc_norm_stderr": 0.039580272311215706 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.3137614678899083, + "acc_stderr": 0.019894723341469148, + "acc_norm": 0.3137614678899083, + "acc_norm_stderr": 0.019894723341469148 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.3101851851851852, + "acc_stderr": 0.031546962856566274, + "acc_norm": 0.3101851851851852, + "acc_norm_stderr": 0.031546962856566274 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.25980392156862747, + "acc_stderr": 0.030778554678693257, + "acc_norm": 0.25980392156862747, + "acc_norm_stderr": 0.030778554678693257 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.25738396624472576, + "acc_stderr": 0.028458820991460305, + "acc_norm": 0.25738396624472576, + "acc_norm_stderr": 0.028458820991460305 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.10762331838565023, + "acc_stderr": 0.020799400082879997, + 
"acc_norm": 0.10762331838565023, + "acc_norm_stderr": 0.020799400082879997 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.2824427480916031, + "acc_stderr": 0.03948406125768361, + "acc_norm": 0.2824427480916031, + "acc_norm_stderr": 0.03948406125768361 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.1652892561983471, + "acc_stderr": 0.03390780612972776, + "acc_norm": 0.1652892561983471, + "acc_norm_stderr": 0.03390780612972776 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.21296296296296297, + "acc_stderr": 0.0395783547198098, + "acc_norm": 0.21296296296296297, + "acc_norm_stderr": 0.0395783547198098 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.2331288343558282, + "acc_stderr": 0.033220157957767414, + "acc_norm": 0.2331288343558282, + "acc_norm_stderr": 0.033220157957767414 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.25892857142857145, + "acc_stderr": 0.041577515398656284, + "acc_norm": 0.25892857142857145, + "acc_norm_stderr": 0.041577515398656284 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.3786407766990291, + "acc_stderr": 0.04802694698258972, + "acc_norm": 0.3786407766990291, + "acc_norm_stderr": 0.04802694698258972 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.19658119658119658, + "acc_stderr": 0.02603538609895129, + "acc_norm": 0.19658119658119658, + "acc_norm_stderr": 0.02603538609895129 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.24, + "acc_stderr": 0.04292346959909281, + "acc_norm": 0.24, + "acc_norm_stderr": 0.04292346959909281 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.20561941251596424, + "acc_stderr": 0.014452500456785825, + "acc_norm": 0.20561941251596424, + "acc_norm_stderr": 0.014452500456785825 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.24566473988439305, + "acc_stderr": 0.02317629820399201, + "acc_norm": 0.24566473988439305, + "acc_norm_stderr": 0.02317629820399201 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.2424581005586592, + "acc_stderr": 0.014333522059217889, + "acc_norm": 0.2424581005586592, + "acc_norm_stderr": 0.014333522059217889 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.26143790849673204, + "acc_stderr": 0.025160998214292456, + "acc_norm": 0.26143790849673204, + "acc_norm_stderr": 0.025160998214292456 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.24115755627009647, + "acc_stderr": 0.024296594034763426, + "acc_norm": 0.24115755627009647, + "acc_norm_stderr": 0.024296594034763426 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.21604938271604937, + "acc_stderr": 0.022899162918445803, + "acc_norm": 0.21604938271604937, + "acc_norm_stderr": 0.022899162918445803 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.20567375886524822, + "acc_stderr": 0.024112138950471873, + "acc_norm": 0.20567375886524822, + "acc_norm_stderr": 0.024112138950471873 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.23989569752281617, + "acc_stderr": 0.010906282617981633, + "acc_norm": 0.23989569752281617, + "acc_norm_stderr": 0.010906282617981633 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.30514705882352944, + "acc_stderr": 0.027971541370170598, + "acc_norm": 0.30514705882352944, + "acc_norm_stderr": 0.027971541370170598 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.20588235294117646, + "acc_stderr": 0.01635804429747851, + "acc_norm": 0.20588235294117646, + "acc_norm_stderr": 
0.01635804429747851 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.22727272727272727, + "acc_stderr": 0.04013964554072774, + "acc_norm": 0.22727272727272727, + "acc_norm_stderr": 0.04013964554072774 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.2, + "acc_stderr": 0.02560737598657916, + "acc_norm": 0.2, + "acc_norm_stderr": 0.02560737598657916 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.2885572139303483, + "acc_stderr": 0.03203841040213322, + "acc_norm": 0.2885572139303483, + "acc_norm_stderr": 0.03203841040213322 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.26, + "acc_stderr": 0.04408440022768078, + "acc_norm": 0.26, + "acc_norm_stderr": 0.04408440022768078 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.1927710843373494, + "acc_stderr": 0.030709824050565274, + "acc_norm": 0.1927710843373494, + "acc_norm_stderr": 0.030709824050565274 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.17543859649122806, + "acc_stderr": 0.029170885500727654, + "acc_norm": 0.17543859649122806, + "acc_norm_stderr": 0.029170885500727654 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.24112607099143207, + "mc1_stderr": 0.014974827279752344, + "mc2": 0.4890395458965077, + "mc2_stderr": 0.016984271164262054 + }, + "all": { + "acc": 0.2576429653107723, + "acc_stderr": 0.031637936193678294, + "acc_norm": 0.2584058685965809, + "acc_norm_stderr": 0.03164909872691239, + "mc1": 0.24112607099143207, + "mc1_stderr": 0.014974827279752344, + "mc2": 0.4890395458965077, + "mc2_stderr": 0.016984271164262054 + } + }, + "versions": { + "harness|arc:challenge|25": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + 
"harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "all": 0 + }, + "config_general": { + "model_name": "TheBloke/WizardLM-33B-V1.0-Uncensored-GPTQ", + "model_sha": "1c65902c620fcdf6b9c8e36ce17f21360e186a1e", + "model_dtype": "torch.float16", + "lighteval_sha": "9a6ba3212080b87510982c30fdec55b87dcab0c7", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null + }, + "config_tasks": { + "harness|arc:challenge": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", 
+ "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task" + }, + "summary_tasks": { + "harness|arc:challenge|25": { + "hashes": { + "hash_examples": "17b0cae357c0259e", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "61571bf68d6d89aa", + "hash_cont_tokens": "8210decc6ff6f7df" + }, + "truncated": 0, + "non-truncated": 4687, + "padded": 4687, + "non-padded": 0, + "effective_few_shots": 25.0, + "num_truncated_few_shots": 0 + }, + "harness|hellaswag|10": { + "hashes": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "29906669b1c7054a", + "hash_cont_tokens": "b3b9e9017afa63af" + }, + "truncated": 0, + "non-truncated": 40168, + "padded": 40113, + "non-padded": 55, + "effective_few_shots": 10.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hashes": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "c54ff61ad0273dd7", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-anatomy|5": { + "hashes": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "be31a1e22aef5f90", + "hash_cont_tokens": "f11971a765cb609f" + }, + "truncated": 0, + "non-truncated": 540, + "padded": 540, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-astronomy|5": { + "hashes": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "277a7b1fad566940", + "hash_cont_tokens": "bf30e5d3f48250cb" + }, + "truncated": 0, + "non-truncated": 608, + "padded": 608, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + 
"harness|hendrycksTest-business_ethics|5": { + "hashes": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "ba552605bc116de5", + "hash_cont_tokens": "bc1dd9b2d995eb61" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hashes": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "428c7563d0b98ab9", + "hash_cont_tokens": "890a119624b3b935" + }, + "truncated": 0, + "non-truncated": 1060, + "padded": 1060, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_biology|5": { + "hashes": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "da036601573942e2", + "hash_cont_tokens": "875cde3af7a0ee14" + }, + "truncated": 0, + "non-truncated": 576, + "padded": 576, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_chemistry|5": { + "hashes": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "94e0196d6aded13d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_computer_science|5": { + "hashes": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "6e4d0f4a8d36690b", + "hash_cont_tokens": "ffc0fe414cdc4a83" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_mathematics|5": { + "hashes": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "614054d17109a25d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_medicine|5": { + "hashes": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "1d633b3cc0524ba8", + "hash_cont_tokens": "1f88b00d41957d82" + }, + "truncated": 0, + "non-truncated": 692, + "padded": 692, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_physics|5": { + "hashes": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "5421d9a1af86cbd4", + "hash_cont_tokens": "f7b8097afc16a47c" + }, + "truncated": 0, + "non-truncated": 408, + "padded": 408, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-computer_security|5": { + "hashes": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "5e6b70ecb333cf18", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hashes": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + 
"hash_input_tokens": "c2ef11a87264ceed", + "hash_cont_tokens": "aa0e8bc655f2f641" + }, + "truncated": 0, + "non-truncated": 940, + "padded": 940, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-econometrics|5": { + "hashes": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "ecaccd912a4c3978", + "hash_cont_tokens": "bfb7e3c3c88313f1" + }, + "truncated": 0, + "non-truncated": 456, + "padded": 456, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hashes": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "1590c84291399be8", + "hash_cont_tokens": "2425a3f084a591ef" + }, + "truncated": 0, + "non-truncated": 580, + "padded": 580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hashes": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "3269597f715b0da1", + "hash_cont_tokens": "f52691aef15a407b" + }, + "truncated": 0, + "non-truncated": 1512, + "padded": 1512, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-formal_logic|5": { + "hashes": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "a2800d20f3ab8d7c", + "hash_cont_tokens": "f515d598d9c21263" + }, + "truncated": 0, + "non-truncated": 504, + "padded": 504, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-global_facts|5": { + "hashes": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "94ed44b3772505ad", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_biology|5": { + "hashes": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "24423acb928db768", + "hash_cont_tokens": "bd85a4156a3613ee" + }, + "truncated": 0, + "non-truncated": 1240, + "padded": 1240, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hashes": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "831ff35c474e5cef", + "hash_cont_tokens": "a95c97af1c14e068" + }, + "truncated": 0, + "non-truncated": 812, + "padded": 812, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hashes": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "8c34e0f2bda77358", + "hash_cont_tokens": "8abfedef914e33c9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hashes": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "f1f73dd687da18d7", + "hash_cont_tokens": "674fc454bdc5ac93" + }, + "truncated": 660, + "non-truncated": 
0, + "padded": 0, + "non-padded": 660, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_geography|5": { + "hashes": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "7c5547c7da5bc793", + "hash_cont_tokens": "03a5012b916274ea" + }, + "truncated": 0, + "non-truncated": 792, + "padded": 792, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hashes": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "f62991cb6a496b05", + "hash_cont_tokens": "a83effb8f76b7d7c" + }, + "truncated": 0, + "non-truncated": 772, + "padded": 772, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hashes": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "4cef2aff6e3d59ed", + "hash_cont_tokens": "c583432ad27fcfe0" + }, + "truncated": 0, + "non-truncated": 1560, + "padded": 1560, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hashes": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "6e2577ea4082ed2b", + "hash_cont_tokens": "24f5dc613660300b" + }, + "truncated": 0, + "non-truncated": 1080, + "padded": 1080, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hashes": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "c5fc9aeb1079c8e4", + "hash_cont_tokens": "f47f041de50333b9" + }, + "truncated": 0, + "non-truncated": 952, + "padded": 952, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_physics|5": { + "hashes": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "555fc385cffa84ca", + "hash_cont_tokens": "ba2efcd283e938cc" + }, + "truncated": 0, + "non-truncated": 604, + "padded": 604, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hashes": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "febd23cbf9973b7f", + "hash_cont_tokens": "942069cd363844d9" + }, + "truncated": 0, + "non-truncated": 2180, + "padded": 2180, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hashes": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "424b02981230ee83", + "hash_cont_tokens": "955ed42b6f7fa019" + }, + "truncated": 0, + "non-truncated": 864, + "padded": 864, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hashes": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "50c9ff438c85a69e", + "hash_cont_tokens": "cdd0b3dc06d933e5" + }, + "truncated": 816, + "non-truncated": 0, + "padded": 0, + "non-padded": 816, + "effective_few_shots": 5.0, + 
"num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hashes": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "054824cc474caef5", + "hash_cont_tokens": "9a864184946033ac" + }, + "truncated": 8, + "non-truncated": 940, + "padded": 940, + "non-padded": 8, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_aging|5": { + "hashes": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "541a75f071dcf579", + "hash_cont_tokens": "142a4a8a1138a214" + }, + "truncated": 0, + "non-truncated": 892, + "padded": 892, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_sexuality|5": { + "hashes": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "04269e5c5a257dd9", + "hash_cont_tokens": "bc54813e809b796d" + }, + "truncated": 0, + "non-truncated": 524, + "padded": 524, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-international_law|5": { + "hashes": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "d93ba9d9d38e4397", + "hash_cont_tokens": "dc45b45fcda18e5d" + }, + "truncated": 0, + "non-truncated": 484, + "padded": 484, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-jurisprudence|5": { + "hashes": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "9eeaccd2698b4f5a", + "hash_cont_tokens": "e3a8cd951b6e3469" + }, + "truncated": 0, + "non-truncated": 432, + "padded": 432, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hashes": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "b4f08f544f2b7576", + "hash_cont_tokens": "1e80dbd30f6453d5" + }, + "truncated": 0, + "non-truncated": 652, + "padded": 648, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-machine_learning|5": { + "hashes": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "900c2a51f1174b9f", + "hash_cont_tokens": "9b37da7777378ca9" + }, + "truncated": 0, + "non-truncated": 448, + "padded": 448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-management|5": { + "hashes": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "6b36efb4689c6eca", + "hash_cont_tokens": "a01d6d39a83c4597" + }, + "truncated": 0, + "non-truncated": 412, + "padded": 412, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-marketing|5": { + "hashes": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "2aaac78a0cfed47a", + "hash_cont_tokens": "6aeaed4d823c98aa" + }, + "truncated": 0, + "non-truncated": 936, + "padded": 936, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-medical_genetics|5": { + "hashes": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": 
"202139046daa118f", + "hash_input_tokens": "886ca823b41c094a", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-miscellaneous|5": { + "hashes": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "72fd71de7675e7d0", + "hash_cont_tokens": "9b0ab02a64603081" + }, + "truncated": 0, + "non-truncated": 3132, + "padded": 3132, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_disputes|5": { + "hashes": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "f3ca0dd8e7a1eb09", + "hash_cont_tokens": "8badf768f7b0467a" + }, + "truncated": 0, + "non-truncated": 1384, + "padded": 1354, + "non-padded": 30, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hashes": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "3e793631e951f23c", + "hash_cont_tokens": "32ae620376b2bbba" + }, + "truncated": 0, + "non-truncated": 3580, + "padded": 3580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-nutrition|5": { + "hashes": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "59753c2144ea93af", + "hash_cont_tokens": "3071def75bacc404" + }, + "truncated": 0, + "non-truncated": 1224, + "padded": 1224, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-philosophy|5": { + "hashes": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "bd8d3dbed15a8c34", + "hash_cont_tokens": "9f6ff69d23a48783" + }, + "truncated": 0, + "non-truncated": 1244, + "padded": 1244, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-prehistory|5": { + "hashes": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "3573cd87facbb7c5", + "hash_cont_tokens": "de469d2b981e32a3" + }, + "truncated": 0, + "non-truncated": 1296, + "padded": 1296, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_accounting|5": { + "hashes": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "17e721bc1a7cbb47", + "hash_cont_tokens": "c46f74d2dfc7b13b" + }, + "truncated": 0, + "non-truncated": 1128, + "padded": 1128, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_law|5": { + "hashes": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "9178e10bd0763ec4", + "hash_cont_tokens": "2e590029ef41fbcd" + }, + "truncated": 604, + "non-truncated": 5532, + "padded": 5524, + "non-padded": 612, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_medicine|5": { + "hashes": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "f5a22012a54f70ea", + "hash_cont_tokens": "fe35cfa9c6ca802e" + }, + "truncated": 0, + "non-truncated": 1088, + 
"padded": 1088, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_psychology|5": { + "hashes": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "0dfb73a8eb3f692c", + "hash_cont_tokens": "f020fbddf72c8652" + }, + "truncated": 0, + "non-truncated": 2448, + "padded": 2448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-public_relations|5": { + "hashes": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "1710c6ba4c9f3cbd", + "hash_cont_tokens": "568f585a259965c1" + }, + "truncated": 0, + "non-truncated": 440, + "padded": 440, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-security_studies|5": { + "hashes": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "d49711415961ced7", + "hash_cont_tokens": "cc6fd7cccd64cd5d" + }, + "truncated": 0, + "non-truncated": 980, + "padded": 980, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-sociology|5": { + "hashes": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "828999f7624cbe7e", + "hash_cont_tokens": "c3a3bdfd177eed5b" + }, + "truncated": 0, + "non-truncated": 804, + "padded": 804, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hashes": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "42054621e718dbee", + "hash_cont_tokens": "2568d0e8e36fa959" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-virology|5": { + "hashes": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "6c4f0aa4dc859c04", + "hash_cont_tokens": "926cf60b0891f374" + }, + "truncated": 0, + "non-truncated": 664, + "padded": 664, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-world_religions|5": { + "hashes": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "6c75d44e092ff24f", + "hash_cont_tokens": "c525a5de974c1ea3" + }, + "truncated": 0, + "non-truncated": 684, + "padded": 684, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|truthfulqa:mc|0": { + "hashes": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "2738d7ed7075faa7", + "hash_cont_tokens": "c014154380b74b9e" + }, + "truncated": 0, + "non-truncated": 9996, + "padded": 9996, + "non-padded": 0, + "effective_few_shots": 0.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "d84d18e9a963753d", + "hash_full_prompts": "12b540783521a8e6", + "hash_input_tokens": "6fecf578c508db6a", + "hash_cont_tokens": "fb1646e2bdd5fc38" + }, + "total_evaluation_time_secondes": "9802.42277598381", + "truncated": 2088, + "non-truncated": 108931, + "padded": 108834, + "non-padded": 2185, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git 
a/eval-results/TheBloke/WizardLM-33B-V1.0-Uncensored-GPTQ/results_2023-10-22T20-59-08.755164.json b/eval-results/TheBloke/WizardLM-33B-V1.0-Uncensored-GPTQ/results_2023-10-22T20-59-08.755164.json new file mode 100644 index 0000000000000000000000000000000000000000..98f1b99c553a9fc31c5691634186911bb8ab22d5 --- /dev/null +++ b/eval-results/TheBloke/WizardLM-33B-V1.0-Uncensored-GPTQ/results_2023-10-22T20-59-08.755164.json @@ -0,0 +1,107 @@ +{ + "config_general": { + "model_name": "TheBloke/WizardLM-33B-V1.0-Uncensored-GPTQ", + "model_sha": "d952d5a374a7e2952297fdb107dafff895f07630", + "model_size": "16.99 GB", + "model_dtype": "torch.float16", + "lighteval_sha": "0f318ecf002208468154899217b3ba7c6ae09374", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|drop|3": { + "em": 0.08850671140939598, + "em_stderr": 0.0029087372393749897, + "f1": 0.1645427852348987, + "f1_stderr": 0.0031594666528343297 + }, + "harness|gsm8k|5": { + "acc": 0.24564063684609552, + "acc_stderr": 0.011857183603902227 + }, + "harness|winogrande|5": { + "acc": 0.7790055248618785, + "acc_stderr": 0.011661223637643407 + }, + "all": { + "em": 0.08850671140939598, + "em_stderr": 0.0029087372393749897, + "f1": 0.1645427852348987, + "f1_stderr": 0.0031594666528343297, + "acc": 0.512323080853987, + "acc_stderr": 0.011759203620772818 + } + }, + "versions": { + "harness|drop|3": 1, + "harness|gsm8k|5": 0, + "harness|winogrande|5": 0, + "all": 0 + }, + "config_tasks": { + "harness|drop": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|drop|3": { + "hashes": { + "hash_examples": "1d27416e8324e9a3", + "hash_full_prompts": "a5513ff9a741b385", + "hash_input_tokens": "61b608e0b5ceed76", + "hash_cont_tokens": "35f7d2521ec4f57a" + }, + "truncated": 1263, + "non-truncated": 8273, + "padded": 0, + "non-padded": 9536, + "effective_few_shots": 3.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "bda342e47b5099b2", + "hash_cont_tokens": "79e10d9e3d6b6cb0" + }, + "truncated": 0, + "non-truncated": 1319, + "padded": 0, + "non-padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "c0bedf98cb040854", + "hash_cont_tokens": "f08975ad6f2d5864" + }, + "truncated": 0, + "non-truncated": 2534, + "padded": 2432, + "non-padded": 102, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "9b4d8993161e637d", + "hash_full_prompts": "08215e527b7e60a5", + "hash_input_tokens": "80afe720f936f8d2", + "hash_cont_tokens": "204a94816d58bb3c" + }, + "total_evaluation_time_secondes": "19997.343728780746", + "truncated": 1263, + "non-truncated": 12126, + "padded": 2432, + "non-padded": 10957, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/TheBloke/WizardLM-70B-V1.0-GPTQ/results_2023-08-30T04-09-44.501834.json b/eval-results/TheBloke/WizardLM-70B-V1.0-GPTQ/results_2023-08-30T04-09-44.501834.json new file mode 100644 index 0000000000000000000000000000000000000000..2336484cf310455448e84519899bc505e2c4770e --- /dev/null +++ 
b/eval-results/TheBloke/WizardLM-70B-V1.0-GPTQ/results_2023-08-30T04-09-44.501834.json @@ -0,0 +1,1366 @@ +{ + "config_general": { + "model_name": "TheBloke/WizardLM-70B-V1.0-GPTQ", + "model_sha": "c234d7c9c0fd26efb55757fdbfb604d549539fe0", + "model_dtype": "torch.bfloat16", + "lighteval_sha": "4b9a38c2102259daeb17d1d0294fc2b4ce7f4e63", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|arc:challenge|25": { + "acc": 0.6075085324232082, + "acc_stderr": 0.014269634635670722, + "acc_norm": 0.6382252559726962, + "acc_norm_stderr": 0.014041957945038076 + }, + "harness|hellaswag|10": { + "acc": 0.6486755626369249, + "acc_stderr": 0.004764084597176899, + "acc_norm": 0.838478390758813, + "acc_norm_stderr": 0.00367259272936363 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.32, + "acc_stderr": 0.046882617226215034, + "acc_norm": 0.32, + "acc_norm_stderr": 0.046882617226215034 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.5185185185185185, + "acc_stderr": 0.043163785995113245, + "acc_norm": 0.5185185185185185, + "acc_norm_stderr": 0.043163785995113245 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.7171052631578947, + "acc_stderr": 0.03665349695640767, + "acc_norm": 0.7171052631578947, + "acc_norm_stderr": 0.03665349695640767 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.7, + "acc_stderr": 0.046056618647183814, + "acc_norm": 0.7, + "acc_norm_stderr": 0.046056618647183814 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.660377358490566, + "acc_stderr": 0.02914690474779833, + "acc_norm": 0.660377358490566, + "acc_norm_stderr": 0.02914690474779833 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.7430555555555556, + "acc_stderr": 0.03653946969442099, + "acc_norm": 0.7430555555555556, + "acc_norm_stderr": 0.03653946969442099 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.42, + "acc_stderr": 0.049604496374885836, + "acc_norm": 0.42, + "acc_norm_stderr": 0.049604496374885836 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.53, + "acc_stderr": 0.05016135580465919, + "acc_norm": 0.53, + "acc_norm_stderr": 0.05016135580465919 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.32, + "acc_stderr": 0.04688261722621505, + "acc_norm": 0.32, + "acc_norm_stderr": 0.04688261722621505 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.6184971098265896, + "acc_stderr": 0.03703851193099521, + "acc_norm": 0.6184971098265896, + "acc_norm_stderr": 0.03703851193099521 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.3235294117647059, + "acc_stderr": 0.046550104113196177, + "acc_norm": 0.3235294117647059, + "acc_norm_stderr": 0.046550104113196177 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.69, + "acc_stderr": 0.04648231987117316, + "acc_norm": 0.69, + "acc_norm_stderr": 0.04648231987117316 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.5617021276595745, + "acc_stderr": 0.03243618636108101, + "acc_norm": 0.5617021276595745, + "acc_norm_stderr": 0.03243618636108101 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.3684210526315789, + "acc_stderr": 0.04537815354939392, + "acc_norm": 0.3684210526315789, + "acc_norm_stderr": 0.04537815354939392 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.5655172413793104, + "acc_stderr": 0.04130740879555497, + "acc_norm": 0.5655172413793104, + 
"acc_norm_stderr": 0.04130740879555497 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.41005291005291006, + "acc_stderr": 0.025331202438944413, + "acc_norm": 0.41005291005291006, + "acc_norm_stderr": 0.025331202438944413 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.4365079365079365, + "acc_stderr": 0.04435932892851466, + "acc_norm": 0.4365079365079365, + "acc_norm_stderr": 0.04435932892851466 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.4, + "acc_stderr": 0.04923659639173309, + "acc_norm": 0.4, + "acc_norm_stderr": 0.04923659639173309 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.7516129032258064, + "acc_stderr": 0.02458002892148101, + "acc_norm": 0.7516129032258064, + "acc_norm_stderr": 0.02458002892148101 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.5024630541871922, + "acc_stderr": 0.03517945038691063, + "acc_norm": 0.5024630541871922, + "acc_norm_stderr": 0.03517945038691063 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.71, + "acc_stderr": 0.045604802157206845, + "acc_norm": 0.71, + "acc_norm_stderr": 0.045604802157206845 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.8181818181818182, + "acc_stderr": 0.03011768892950357, + "acc_norm": 0.8181818181818182, + "acc_norm_stderr": 0.03011768892950357 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.7878787878787878, + "acc_stderr": 0.029126522834586808, + "acc_norm": 0.7878787878787878, + "acc_norm_stderr": 0.029126522834586808 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.8963730569948186, + "acc_stderr": 0.021995311963644234, + "acc_norm": 0.8963730569948186, + "acc_norm_stderr": 0.021995311963644234 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.6128205128205129, + "acc_stderr": 0.024697216930878937, + "acc_norm": 0.6128205128205129, + "acc_norm_stderr": 0.024697216930878937 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.3148148148148148, + "acc_stderr": 0.02831753349606647, + "acc_norm": 0.3148148148148148, + "acc_norm_stderr": 0.02831753349606647 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.6848739495798319, + "acc_stderr": 0.03017680828897434, + "acc_norm": 0.6848739495798319, + "acc_norm_stderr": 0.03017680828897434 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.4370860927152318, + "acc_stderr": 0.04050035722230636, + "acc_norm": 0.4370860927152318, + "acc_norm_stderr": 0.04050035722230636 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.8422018348623853, + "acc_stderr": 0.015630022970092448, + "acc_norm": 0.8422018348623853, + "acc_norm_stderr": 0.015630022970092448 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.48148148148148145, + "acc_stderr": 0.03407632093854052, + "acc_norm": 0.48148148148148145, + "acc_norm_stderr": 0.03407632093854052 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.8382352941176471, + "acc_stderr": 0.025845017986926917, + "acc_norm": 0.8382352941176471, + "acc_norm_stderr": 0.025845017986926917 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.8438818565400844, + "acc_stderr": 0.023627159460318667, + "acc_norm": 0.8438818565400844, + "acc_norm_stderr": 0.023627159460318667 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.7174887892376681, + "acc_stderr": 0.030216831011508773, + "acc_norm": 
0.7174887892376681, + "acc_norm_stderr": 0.030216831011508773 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.7862595419847328, + "acc_stderr": 0.0359546161177469, + "acc_norm": 0.7862595419847328, + "acc_norm_stderr": 0.0359546161177469 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.8181818181818182, + "acc_stderr": 0.03520893951097655, + "acc_norm": 0.8181818181818182, + "acc_norm_stderr": 0.03520893951097655 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.75, + "acc_stderr": 0.04186091791394607, + "acc_norm": 0.75, + "acc_norm_stderr": 0.04186091791394607 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.7484662576687117, + "acc_stderr": 0.03408997886857529, + "acc_norm": 0.7484662576687117, + "acc_norm_stderr": 0.03408997886857529 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.5089285714285714, + "acc_stderr": 0.04745033255489123, + "acc_norm": 0.5089285714285714, + "acc_norm_stderr": 0.04745033255489123 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.7961165048543689, + "acc_stderr": 0.039891398595317706, + "acc_norm": 0.7961165048543689, + "acc_norm_stderr": 0.039891398595317706 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.8846153846153846, + "acc_stderr": 0.020930193185179333, + "acc_norm": 0.8846153846153846, + "acc_norm_stderr": 0.020930193185179333 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.68, + "acc_stderr": 0.046882617226215034, + "acc_norm": 0.68, + "acc_norm_stderr": 0.046882617226215034 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.8186462324393359, + "acc_stderr": 0.013778693778464085, + "acc_norm": 0.8186462324393359, + "acc_norm_stderr": 0.013778693778464085 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.6878612716763006, + "acc_stderr": 0.024946792225272314, + "acc_norm": 0.6878612716763006, + "acc_norm_stderr": 0.024946792225272314 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.35195530726256985, + "acc_stderr": 0.015972668523689074, + "acc_norm": 0.35195530726256985, + "acc_norm_stderr": 0.015972668523689074 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.6764705882352942, + "acc_stderr": 0.026787453111906504, + "acc_norm": 0.6764705882352942, + "acc_norm_stderr": 0.026787453111906504 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.6881028938906752, + "acc_stderr": 0.02631185807185416, + "acc_norm": 0.6881028938906752, + "acc_norm_stderr": 0.02631185807185416 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.7067901234567902, + "acc_stderr": 0.02532988817190092, + "acc_norm": 0.7067901234567902, + "acc_norm_stderr": 0.02532988817190092 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.48936170212765956, + "acc_stderr": 0.02982074719142248, + "acc_norm": 0.48936170212765956, + "acc_norm_stderr": 0.02982074719142248 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.5097783572359843, + "acc_stderr": 0.012767793787729338, + "acc_norm": 0.5097783572359843, + "acc_norm_stderr": 0.012767793787729338 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.5808823529411765, + "acc_stderr": 0.02997280717046462, + "acc_norm": 0.5808823529411765, + "acc_norm_stderr": 0.02997280717046462 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.6764705882352942, + "acc_stderr": 0.018926082916083376, + "acc_norm": 0.6764705882352942, + "acc_norm_stderr": 0.018926082916083376 + }, + "harness|hendrycksTest-public_relations|5": { + 
"acc": 0.7363636363636363, + "acc_stderr": 0.04220224692971987, + "acc_norm": 0.7363636363636363, + "acc_norm_stderr": 0.04220224692971987 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.763265306122449, + "acc_stderr": 0.02721283588407315, + "acc_norm": 0.763265306122449, + "acc_norm_stderr": 0.02721283588407315 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.8656716417910447, + "acc_stderr": 0.02411267824090081, + "acc_norm": 0.8656716417910447, + "acc_norm_stderr": 0.02411267824090081 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.85, + "acc_stderr": 0.0358870281282637, + "acc_norm": 0.85, + "acc_norm_stderr": 0.0358870281282637 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.5421686746987951, + "acc_stderr": 0.038786267710023595, + "acc_norm": 0.5421686746987951, + "acc_norm_stderr": 0.038786267710023595 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.8362573099415205, + "acc_stderr": 0.028380919596145866, + "acc_norm": 0.8362573099415205, + "acc_norm_stderr": 0.028380919596145866 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.3818849449204406, + "mc1_stderr": 0.017008101939163495, + "mc2": 0.5454276049890074, + "mc2_stderr": 0.015570490235725166 + }, + "all": { + "acc": 0.636465710040601, + "acc_stderr": 0.03280341903722105, + "acc_norm": 0.6402033295604549, + "acc_norm_stderr": 0.03278106024809485, + "mc1": 0.3818849449204406, + "mc1_stderr": 0.017008101939163495, + "mc2": 0.5454276049890074, + "mc2_stderr": 0.015570490235725166 + } + }, + "versions": { + "harness|arc:challenge|25": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + 
"harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "all": 0 + }, + "config_tasks": { + "harness|arc:challenge": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + 
"harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task" + }, + "summary_tasks": { + "harness|arc:challenge|25": { + "hashes": { + "hash_examples": "17b0cae357c0259e", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "3722289b79076c44", + "hash_cont_tokens": "8210decc6ff6f7df" + }, + "truncated": 0, + "non-truncated": 4687, + "padded": 4687, + "non-padded": 0, + "effective_few_shots": 25.0, + "num_truncated_few_shots": 0 + }, + "harness|hellaswag|10": { + "hashes": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "ececd684171f1ef2", + "hash_cont_tokens": "b3b9e9017afa63af" + }, + "truncated": 0, + "non-truncated": 40168, + "padded": 40113, + "non-padded": 55, + "effective_few_shots": 10.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hashes": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "c54ff61ad0273dd7", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-anatomy|5": { + "hashes": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "be31a1e22aef5f90", + "hash_cont_tokens": "f11971a765cb609f" + }, + "truncated": 0, + "non-truncated": 540, + "padded": 540, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-astronomy|5": { + "hashes": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "277a7b1fad566940", + "hash_cont_tokens": "bf30e5d3f48250cb" + }, + "truncated": 0, + "non-truncated": 608, + "padded": 608, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-business_ethics|5": { + "hashes": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "ba552605bc116de5", + "hash_cont_tokens": "bc1dd9b2d995eb61" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + 
"harness|hendrycksTest-clinical_knowledge|5": { + "hashes": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "428c7563d0b98ab9", + "hash_cont_tokens": "890a119624b3b935" + }, + "truncated": 0, + "non-truncated": 1060, + "padded": 1060, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_biology|5": { + "hashes": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "da036601573942e2", + "hash_cont_tokens": "875cde3af7a0ee14" + }, + "truncated": 0, + "non-truncated": 576, + "padded": 576, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_chemistry|5": { + "hashes": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "94e0196d6aded13d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_computer_science|5": { + "hashes": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "6e4d0f4a8d36690b", + "hash_cont_tokens": "ffc0fe414cdc4a83" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_mathematics|5": { + "hashes": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "614054d17109a25d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_medicine|5": { + "hashes": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "081bb2b524defd1c", + "hash_cont_tokens": "1f88b00d41957d82" + }, + "truncated": 0, + "non-truncated": 692, + "padded": 692, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_physics|5": { + "hashes": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "5421d9a1af86cbd4", + "hash_cont_tokens": "f7b8097afc16a47c" + }, + "truncated": 0, + "non-truncated": 408, + "padded": 408, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-computer_security|5": { + "hashes": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "5e6b70ecb333cf18", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hashes": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "c2ef11a87264ceed", + "hash_cont_tokens": "aa0e8bc655f2f641" + }, + "truncated": 0, + "non-truncated": 940, + "padded": 940, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-econometrics|5": { + "hashes": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + 
"hash_input_tokens": "ecaccd912a4c3978", + "hash_cont_tokens": "bfb7e3c3c88313f1" + }, + "truncated": 0, + "non-truncated": 456, + "padded": 456, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hashes": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "1590c84291399be8", + "hash_cont_tokens": "2425a3f084a591ef" + }, + "truncated": 0, + "non-truncated": 580, + "padded": 580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hashes": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "3269597f715b0da1", + "hash_cont_tokens": "f52691aef15a407b" + }, + "truncated": 0, + "non-truncated": 1512, + "padded": 1512, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-formal_logic|5": { + "hashes": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "a2800d20f3ab8d7c", + "hash_cont_tokens": "f515d598d9c21263" + }, + "truncated": 0, + "non-truncated": 504, + "padded": 504, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-global_facts|5": { + "hashes": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "94ed44b3772505ad", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_biology|5": { + "hashes": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "24423acb928db768", + "hash_cont_tokens": "bd85a4156a3613ee" + }, + "truncated": 0, + "non-truncated": 1240, + "padded": 1240, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hashes": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "831ff35c474e5cef", + "hash_cont_tokens": "a95c97af1c14e068" + }, + "truncated": 0, + "non-truncated": 812, + "padded": 812, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hashes": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "a20a96b44dcc5b30", + "hash_cont_tokens": "8abfedef914e33c9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hashes": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "5002f4ac8b1562ca", + "hash_cont_tokens": "674fc454bdc5ac93" + }, + "truncated": 0, + "non-truncated": 660, + "padded": 656, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_geography|5": { + "hashes": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "7c5547c7da5bc793", + "hash_cont_tokens": "03a5012b916274ea" + }, + "truncated": 0, + 
"non-truncated": 792, + "padded": 792, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hashes": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "f62991cb6a496b05", + "hash_cont_tokens": "a83effb8f76b7d7c" + }, + "truncated": 0, + "non-truncated": 772, + "padded": 772, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hashes": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "4cef2aff6e3d59ed", + "hash_cont_tokens": "c583432ad27fcfe0" + }, + "truncated": 0, + "non-truncated": 1560, + "padded": 1560, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hashes": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "6e2577ea4082ed2b", + "hash_cont_tokens": "24f5dc613660300b" + }, + "truncated": 0, + "non-truncated": 1080, + "padded": 1080, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hashes": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "c5fc9aeb1079c8e4", + "hash_cont_tokens": "f47f041de50333b9" + }, + "truncated": 0, + "non-truncated": 952, + "padded": 952, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_physics|5": { + "hashes": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "555fc385cffa84ca", + "hash_cont_tokens": "ba2efcd283e938cc" + }, + "truncated": 0, + "non-truncated": 604, + "padded": 604, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hashes": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "febd23cbf9973b7f", + "hash_cont_tokens": "942069cd363844d9" + }, + "truncated": 0, + "non-truncated": 2180, + "padded": 2180, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hashes": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "400e55b56ee6fbd7", + "hash_cont_tokens": "955ed42b6f7fa019" + }, + "truncated": 0, + "non-truncated": 864, + "padded": 864, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hashes": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "c639cce12a46ebad", + "hash_cont_tokens": "cdd0b3dc06d933e5" + }, + "truncated": 0, + "non-truncated": 816, + "padded": 816, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hashes": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "b9762065cce6f3a6", + "hash_cont_tokens": "9a864184946033ac" + }, + "truncated": 0, + "non-truncated": 948, + "padded": 948, + "non-padded": 0, + 
"effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_aging|5": { + "hashes": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "541a75f071dcf579", + "hash_cont_tokens": "142a4a8a1138a214" + }, + "truncated": 0, + "non-truncated": 892, + "padded": 892, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_sexuality|5": { + "hashes": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "04269e5c5a257dd9", + "hash_cont_tokens": "bc54813e809b796d" + }, + "truncated": 0, + "non-truncated": 524, + "padded": 524, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-international_law|5": { + "hashes": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "d93ba9d9d38e4397", + "hash_cont_tokens": "dc45b45fcda18e5d" + }, + "truncated": 0, + "non-truncated": 484, + "padded": 484, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-jurisprudence|5": { + "hashes": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "9eeaccd2698b4f5a", + "hash_cont_tokens": "e3a8cd951b6e3469" + }, + "truncated": 0, + "non-truncated": 432, + "padded": 432, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hashes": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "b4f08f544f2b7576", + "hash_cont_tokens": "1e80dbd30f6453d5" + }, + "truncated": 0, + "non-truncated": 652, + "padded": 648, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-machine_learning|5": { + "hashes": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "900c2a51f1174b9f", + "hash_cont_tokens": "9b37da7777378ca9" + }, + "truncated": 0, + "non-truncated": 448, + "padded": 448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-management|5": { + "hashes": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "6b36efb4689c6eca", + "hash_cont_tokens": "a01d6d39a83c4597" + }, + "truncated": 0, + "non-truncated": 412, + "padded": 412, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-marketing|5": { + "hashes": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "2aaac78a0cfed47a", + "hash_cont_tokens": "6aeaed4d823c98aa" + }, + "truncated": 0, + "non-truncated": 936, + "padded": 936, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-medical_genetics|5": { + "hashes": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "886ca823b41c094a", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-miscellaneous|5": { + "hashes": { + "hash_examples": "41adb694024809c2", + 
"hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "72fd71de7675e7d0", + "hash_cont_tokens": "9b0ab02a64603081" + }, + "truncated": 0, + "non-truncated": 3132, + "padded": 3132, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_disputes|5": { + "hashes": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "f3ca0dd8e7a1eb09", + "hash_cont_tokens": "8badf768f7b0467a" + }, + "truncated": 0, + "non-truncated": 1384, + "padded": 1354, + "non-padded": 30, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hashes": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "3e793631e951f23c", + "hash_cont_tokens": "32ae620376b2bbba" + }, + "truncated": 0, + "non-truncated": 3580, + "padded": 3580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-nutrition|5": { + "hashes": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "59753c2144ea93af", + "hash_cont_tokens": "3071def75bacc404" + }, + "truncated": 0, + "non-truncated": 1224, + "padded": 1224, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-philosophy|5": { + "hashes": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "bd8d3dbed15a8c34", + "hash_cont_tokens": "9f6ff69d23a48783" + }, + "truncated": 0, + "non-truncated": 1244, + "padded": 1244, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-prehistory|5": { + "hashes": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "3573cd87facbb7c5", + "hash_cont_tokens": "de469d2b981e32a3" + }, + "truncated": 0, + "non-truncated": 1296, + "padded": 1296, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_accounting|5": { + "hashes": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "17e721bc1a7cbb47", + "hash_cont_tokens": "c46f74d2dfc7b13b" + }, + "truncated": 0, + "non-truncated": 1128, + "padded": 1128, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_law|5": { + "hashes": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "c9f7583fff66d361", + "hash_cont_tokens": "2e590029ef41fbcd" + }, + "truncated": 0, + "non-truncated": 6136, + "padded": 6136, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_medicine|5": { + "hashes": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "40a933f829116f8d", + "hash_cont_tokens": "fe35cfa9c6ca802e" + }, + "truncated": 0, + "non-truncated": 1088, + "padded": 1088, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_psychology|5": { + "hashes": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "0dfb73a8eb3f692c", + "hash_cont_tokens": "f020fbddf72c8652" + }, + "truncated": 
0, + "non-truncated": 2448, + "padded": 2448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-public_relations|5": { + "hashes": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "1710c6ba4c9f3cbd", + "hash_cont_tokens": "568f585a259965c1" + }, + "truncated": 0, + "non-truncated": 440, + "padded": 440, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-security_studies|5": { + "hashes": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "32a03f1f22a6e103", + "hash_cont_tokens": "cc6fd7cccd64cd5d" + }, + "truncated": 0, + "non-truncated": 980, + "padded": 980, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-sociology|5": { + "hashes": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "828999f7624cbe7e", + "hash_cont_tokens": "c3a3bdfd177eed5b" + }, + "truncated": 0, + "non-truncated": 804, + "padded": 804, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hashes": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "42054621e718dbee", + "hash_cont_tokens": "2568d0e8e36fa959" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-virology|5": { + "hashes": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "6c4f0aa4dc859c04", + "hash_cont_tokens": "926cf60b0891f374" + }, + "truncated": 0, + "non-truncated": 664, + "padded": 664, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-world_religions|5": { + "hashes": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "6c75d44e092ff24f", + "hash_cont_tokens": "c525a5de974c1ea3" + }, + "truncated": 0, + "non-truncated": 684, + "padded": 684, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|truthfulqa:mc|0": { + "hashes": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "2738d7ed7075faa7", + "hash_cont_tokens": "c014154380b74b9e" + }, + "truncated": 0, + "non-truncated": 9996, + "padded": 9996, + "non-padded": 0, + "effective_few_shots": 0.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "d84d18e9a963753d", + "hash_full_prompts": "12b540783521a8e6", + "hash_input_tokens": "5c73a7dce6ccf737", + "hash_cont_tokens": "fb1646e2bdd5fc38" + }, + "total_evaluation_time_secondes": "40635.91253089905", + "truncated": 0, + "non-truncated": 111019, + "padded": 110926, + "non-padded": 93, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/TheBloke/WizardLM-70B-V1.0-GPTQ/results_2023-08-31T06-45-23.824442.json b/eval-results/TheBloke/WizardLM-70B-V1.0-GPTQ/results_2023-08-31T06-45-23.824442.json new file mode 100644 index 0000000000000000000000000000000000000000..2e28e6754fd9087185a24a7b3789947f31901b42 --- /dev/null +++ 
b/eval-results/TheBloke/WizardLM-70B-V1.0-GPTQ/results_2023-08-31T06-45-23.824442.json @@ -0,0 +1,1366 @@ +{ + "config_general": { + "model_name": "TheBloke/WizardLM-70B-V1.0-GPTQ", + "model_sha": "c234d7c9c0fd26efb55757fdbfb604d549539fe0", + "model_dtype": "None", + "lighteval_sha": "4b9a38c2102259daeb17d1d0294fc2b4ce7f4e63", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|arc:challenge|25": { + "acc": 0.6075085324232082, + "acc_stderr": 0.014269634635670722, + "acc_norm": 0.6382252559726962, + "acc_norm_stderr": 0.014041957945038076 + }, + "harness|hellaswag|10": { + "acc": 0.6486755626369249, + "acc_stderr": 0.004764084597176899, + "acc_norm": 0.838478390758813, + "acc_norm_stderr": 0.00367259272936363 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.32, + "acc_stderr": 0.046882617226215034, + "acc_norm": 0.32, + "acc_norm_stderr": 0.046882617226215034 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.5185185185185185, + "acc_stderr": 0.043163785995113245, + "acc_norm": 0.5185185185185185, + "acc_norm_stderr": 0.043163785995113245 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.7171052631578947, + "acc_stderr": 0.03665349695640767, + "acc_norm": 0.7171052631578947, + "acc_norm_stderr": 0.03665349695640767 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.7, + "acc_stderr": 0.046056618647183814, + "acc_norm": 0.7, + "acc_norm_stderr": 0.046056618647183814 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.660377358490566, + "acc_stderr": 0.02914690474779833, + "acc_norm": 0.660377358490566, + "acc_norm_stderr": 0.02914690474779833 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.7430555555555556, + "acc_stderr": 0.03653946969442099, + "acc_norm": 0.7430555555555556, + "acc_norm_stderr": 0.03653946969442099 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.42, + "acc_stderr": 0.049604496374885836, + "acc_norm": 0.42, + "acc_norm_stderr": 0.049604496374885836 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.53, + "acc_stderr": 0.05016135580465919, + "acc_norm": 0.53, + "acc_norm_stderr": 0.05016135580465919 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.32, + "acc_stderr": 0.04688261722621505, + "acc_norm": 0.32, + "acc_norm_stderr": 0.04688261722621505 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.6184971098265896, + "acc_stderr": 0.03703851193099521, + "acc_norm": 0.6184971098265896, + "acc_norm_stderr": 0.03703851193099521 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.3235294117647059, + "acc_stderr": 0.046550104113196177, + "acc_norm": 0.3235294117647059, + "acc_norm_stderr": 0.046550104113196177 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.69, + "acc_stderr": 0.04648231987117316, + "acc_norm": 0.69, + "acc_norm_stderr": 0.04648231987117316 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.5617021276595745, + "acc_stderr": 0.03243618636108101, + "acc_norm": 0.5617021276595745, + "acc_norm_stderr": 0.03243618636108101 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.3684210526315789, + "acc_stderr": 0.04537815354939392, + "acc_norm": 0.3684210526315789, + "acc_norm_stderr": 0.04537815354939392 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.5655172413793104, + "acc_stderr": 0.04130740879555497, + "acc_norm": 0.5655172413793104, + "acc_norm_stderr": 
0.04130740879555497 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.41005291005291006, + "acc_stderr": 0.025331202438944413, + "acc_norm": 0.41005291005291006, + "acc_norm_stderr": 0.025331202438944413 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.4365079365079365, + "acc_stderr": 0.04435932892851466, + "acc_norm": 0.4365079365079365, + "acc_norm_stderr": 0.04435932892851466 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.4, + "acc_stderr": 0.04923659639173309, + "acc_norm": 0.4, + "acc_norm_stderr": 0.04923659639173309 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.7516129032258064, + "acc_stderr": 0.02458002892148101, + "acc_norm": 0.7516129032258064, + "acc_norm_stderr": 0.02458002892148101 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.5024630541871922, + "acc_stderr": 0.03517945038691063, + "acc_norm": 0.5024630541871922, + "acc_norm_stderr": 0.03517945038691063 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.71, + "acc_stderr": 0.045604802157206845, + "acc_norm": 0.71, + "acc_norm_stderr": 0.045604802157206845 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.8181818181818182, + "acc_stderr": 0.03011768892950357, + "acc_norm": 0.8181818181818182, + "acc_norm_stderr": 0.03011768892950357 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.7878787878787878, + "acc_stderr": 0.029126522834586808, + "acc_norm": 0.7878787878787878, + "acc_norm_stderr": 0.029126522834586808 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.8963730569948186, + "acc_stderr": 0.021995311963644234, + "acc_norm": 0.8963730569948186, + "acc_norm_stderr": 0.021995311963644234 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.6128205128205129, + "acc_stderr": 0.024697216930878937, + "acc_norm": 0.6128205128205129, + "acc_norm_stderr": 0.024697216930878937 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.3148148148148148, + "acc_stderr": 0.02831753349606647, + "acc_norm": 0.3148148148148148, + "acc_norm_stderr": 0.02831753349606647 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.6848739495798319, + "acc_stderr": 0.03017680828897434, + "acc_norm": 0.6848739495798319, + "acc_norm_stderr": 0.03017680828897434 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.4370860927152318, + "acc_stderr": 0.04050035722230636, + "acc_norm": 0.4370860927152318, + "acc_norm_stderr": 0.04050035722230636 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.8422018348623853, + "acc_stderr": 0.015630022970092448, + "acc_norm": 0.8422018348623853, + "acc_norm_stderr": 0.015630022970092448 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.48148148148148145, + "acc_stderr": 0.03407632093854052, + "acc_norm": 0.48148148148148145, + "acc_norm_stderr": 0.03407632093854052 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.8382352941176471, + "acc_stderr": 0.025845017986926917, + "acc_norm": 0.8382352941176471, + "acc_norm_stderr": 0.025845017986926917 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.8438818565400844, + "acc_stderr": 0.023627159460318667, + "acc_norm": 0.8438818565400844, + "acc_norm_stderr": 0.023627159460318667 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.7174887892376681, + "acc_stderr": 0.030216831011508773, + "acc_norm": 0.7174887892376681, + 
"acc_norm_stderr": 0.030216831011508773 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.7862595419847328, + "acc_stderr": 0.0359546161177469, + "acc_norm": 0.7862595419847328, + "acc_norm_stderr": 0.0359546161177469 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.8181818181818182, + "acc_stderr": 0.03520893951097655, + "acc_norm": 0.8181818181818182, + "acc_norm_stderr": 0.03520893951097655 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.75, + "acc_stderr": 0.04186091791394607, + "acc_norm": 0.75, + "acc_norm_stderr": 0.04186091791394607 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.7484662576687117, + "acc_stderr": 0.03408997886857529, + "acc_norm": 0.7484662576687117, + "acc_norm_stderr": 0.03408997886857529 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.5089285714285714, + "acc_stderr": 0.04745033255489123, + "acc_norm": 0.5089285714285714, + "acc_norm_stderr": 0.04745033255489123 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.7961165048543689, + "acc_stderr": 0.039891398595317706, + "acc_norm": 0.7961165048543689, + "acc_norm_stderr": 0.039891398595317706 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.8846153846153846, + "acc_stderr": 0.020930193185179333, + "acc_norm": 0.8846153846153846, + "acc_norm_stderr": 0.020930193185179333 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.68, + "acc_stderr": 0.046882617226215034, + "acc_norm": 0.68, + "acc_norm_stderr": 0.046882617226215034 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.8186462324393359, + "acc_stderr": 0.013778693778464085, + "acc_norm": 0.8186462324393359, + "acc_norm_stderr": 0.013778693778464085 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.6878612716763006, + "acc_stderr": 0.024946792225272314, + "acc_norm": 0.6878612716763006, + "acc_norm_stderr": 0.024946792225272314 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.35195530726256985, + "acc_stderr": 0.015972668523689074, + "acc_norm": 0.35195530726256985, + "acc_norm_stderr": 0.015972668523689074 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.6764705882352942, + "acc_stderr": 0.026787453111906504, + "acc_norm": 0.6764705882352942, + "acc_norm_stderr": 0.026787453111906504 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.6881028938906752, + "acc_stderr": 0.02631185807185416, + "acc_norm": 0.6881028938906752, + "acc_norm_stderr": 0.02631185807185416 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.7067901234567902, + "acc_stderr": 0.02532988817190092, + "acc_norm": 0.7067901234567902, + "acc_norm_stderr": 0.02532988817190092 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.48936170212765956, + "acc_stderr": 0.02982074719142248, + "acc_norm": 0.48936170212765956, + "acc_norm_stderr": 0.02982074719142248 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.5097783572359843, + "acc_stderr": 0.012767793787729338, + "acc_norm": 0.5097783572359843, + "acc_norm_stderr": 0.012767793787729338 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.5808823529411765, + "acc_stderr": 0.02997280717046462, + "acc_norm": 0.5808823529411765, + "acc_norm_stderr": 0.02997280717046462 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.6764705882352942, + "acc_stderr": 0.018926082916083376, + "acc_norm": 0.6764705882352942, + "acc_norm_stderr": 0.018926082916083376 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 
0.7363636363636363, + "acc_stderr": 0.04220224692971987, + "acc_norm": 0.7363636363636363, + "acc_norm_stderr": 0.04220224692971987 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.763265306122449, + "acc_stderr": 0.02721283588407315, + "acc_norm": 0.763265306122449, + "acc_norm_stderr": 0.02721283588407315 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.8656716417910447, + "acc_stderr": 0.02411267824090081, + "acc_norm": 0.8656716417910447, + "acc_norm_stderr": 0.02411267824090081 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.85, + "acc_stderr": 0.0358870281282637, + "acc_norm": 0.85, + "acc_norm_stderr": 0.0358870281282637 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.5421686746987951, + "acc_stderr": 0.038786267710023595, + "acc_norm": 0.5421686746987951, + "acc_norm_stderr": 0.038786267710023595 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.8362573099415205, + "acc_stderr": 0.028380919596145866, + "acc_norm": 0.8362573099415205, + "acc_norm_stderr": 0.028380919596145866 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.3818849449204406, + "mc1_stderr": 0.017008101939163495, + "mc2": 0.5454276049890074, + "mc2_stderr": 0.015570490235725166 + }, + "all": { + "acc": 0.636465710040601, + "acc_stderr": 0.03280341903722105, + "acc_norm": 0.6402033295604549, + "acc_norm_stderr": 0.03278106024809485, + "mc1": 0.3818849449204406, + "mc1_stderr": 0.017008101939163495, + "mc2": 0.5454276049890074, + "mc2_stderr": 0.015570490235725166 + } + }, + "versions": { + "harness|arc:challenge|25": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + 
"harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "all": 0 + }, + "config_tasks": { + "harness|arc:challenge": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + 
"harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task" + }, + "summary_tasks": { + "harness|arc:challenge|25": { + "hashes": { + "hash_examples": "17b0cae357c0259e", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "3722289b79076c44", + "hash_cont_tokens": "8210decc6ff6f7df" + }, + "truncated": 0, + "non-truncated": 4687, + "padded": 4687, + "non-padded": 0, + "effective_few_shots": 25.0, + "num_truncated_few_shots": 0 + }, + "harness|hellaswag|10": { + "hashes": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "ececd684171f1ef2", + "hash_cont_tokens": "b3b9e9017afa63af" + }, + "truncated": 0, + "non-truncated": 40168, + "padded": 40113, + "non-padded": 55, + "effective_few_shots": 10.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hashes": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "c54ff61ad0273dd7", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-anatomy|5": { + "hashes": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "be31a1e22aef5f90", + "hash_cont_tokens": "f11971a765cb609f" + }, + "truncated": 0, + "non-truncated": 540, + "padded": 540, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-astronomy|5": { + "hashes": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "277a7b1fad566940", + "hash_cont_tokens": "bf30e5d3f48250cb" + }, + "truncated": 0, + "non-truncated": 608, + "padded": 608, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-business_ethics|5": { + "hashes": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "ba552605bc116de5", + "hash_cont_tokens": "bc1dd9b2d995eb61" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + 
"harness|hendrycksTest-clinical_knowledge|5": { + "hashes": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "428c7563d0b98ab9", + "hash_cont_tokens": "890a119624b3b935" + }, + "truncated": 0, + "non-truncated": 1060, + "padded": 1060, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_biology|5": { + "hashes": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "da036601573942e2", + "hash_cont_tokens": "875cde3af7a0ee14" + }, + "truncated": 0, + "non-truncated": 576, + "padded": 576, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_chemistry|5": { + "hashes": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "94e0196d6aded13d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_computer_science|5": { + "hashes": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "6e4d0f4a8d36690b", + "hash_cont_tokens": "ffc0fe414cdc4a83" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_mathematics|5": { + "hashes": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "614054d17109a25d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_medicine|5": { + "hashes": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "081bb2b524defd1c", + "hash_cont_tokens": "1f88b00d41957d82" + }, + "truncated": 0, + "non-truncated": 692, + "padded": 692, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_physics|5": { + "hashes": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "5421d9a1af86cbd4", + "hash_cont_tokens": "f7b8097afc16a47c" + }, + "truncated": 0, + "non-truncated": 408, + "padded": 408, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-computer_security|5": { + "hashes": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "5e6b70ecb333cf18", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hashes": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "c2ef11a87264ceed", + "hash_cont_tokens": "aa0e8bc655f2f641" + }, + "truncated": 0, + "non-truncated": 940, + "padded": 940, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-econometrics|5": { + "hashes": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + 
"hash_input_tokens": "ecaccd912a4c3978", + "hash_cont_tokens": "bfb7e3c3c88313f1" + }, + "truncated": 0, + "non-truncated": 456, + "padded": 456, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hashes": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "1590c84291399be8", + "hash_cont_tokens": "2425a3f084a591ef" + }, + "truncated": 0, + "non-truncated": 580, + "padded": 580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hashes": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "3269597f715b0da1", + "hash_cont_tokens": "f52691aef15a407b" + }, + "truncated": 0, + "non-truncated": 1512, + "padded": 1512, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-formal_logic|5": { + "hashes": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "a2800d20f3ab8d7c", + "hash_cont_tokens": "f515d598d9c21263" + }, + "truncated": 0, + "non-truncated": 504, + "padded": 504, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-global_facts|5": { + "hashes": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "94ed44b3772505ad", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_biology|5": { + "hashes": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "24423acb928db768", + "hash_cont_tokens": "bd85a4156a3613ee" + }, + "truncated": 0, + "non-truncated": 1240, + "padded": 1240, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hashes": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "831ff35c474e5cef", + "hash_cont_tokens": "a95c97af1c14e068" + }, + "truncated": 0, + "non-truncated": 812, + "padded": 812, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hashes": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "a20a96b44dcc5b30", + "hash_cont_tokens": "8abfedef914e33c9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hashes": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "5002f4ac8b1562ca", + "hash_cont_tokens": "674fc454bdc5ac93" + }, + "truncated": 0, + "non-truncated": 660, + "padded": 656, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_geography|5": { + "hashes": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "7c5547c7da5bc793", + "hash_cont_tokens": "03a5012b916274ea" + }, + "truncated": 0, + 
"non-truncated": 792, + "padded": 792, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hashes": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "f62991cb6a496b05", + "hash_cont_tokens": "a83effb8f76b7d7c" + }, + "truncated": 0, + "non-truncated": 772, + "padded": 772, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hashes": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "4cef2aff6e3d59ed", + "hash_cont_tokens": "c583432ad27fcfe0" + }, + "truncated": 0, + "non-truncated": 1560, + "padded": 1560, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hashes": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "6e2577ea4082ed2b", + "hash_cont_tokens": "24f5dc613660300b" + }, + "truncated": 0, + "non-truncated": 1080, + "padded": 1080, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hashes": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "c5fc9aeb1079c8e4", + "hash_cont_tokens": "f47f041de50333b9" + }, + "truncated": 0, + "non-truncated": 952, + "padded": 952, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_physics|5": { + "hashes": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "555fc385cffa84ca", + "hash_cont_tokens": "ba2efcd283e938cc" + }, + "truncated": 0, + "non-truncated": 604, + "padded": 604, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hashes": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "febd23cbf9973b7f", + "hash_cont_tokens": "942069cd363844d9" + }, + "truncated": 0, + "non-truncated": 2180, + "padded": 2180, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hashes": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "400e55b56ee6fbd7", + "hash_cont_tokens": "955ed42b6f7fa019" + }, + "truncated": 0, + "non-truncated": 864, + "padded": 864, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hashes": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "c639cce12a46ebad", + "hash_cont_tokens": "cdd0b3dc06d933e5" + }, + "truncated": 0, + "non-truncated": 816, + "padded": 816, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hashes": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "b9762065cce6f3a6", + "hash_cont_tokens": "9a864184946033ac" + }, + "truncated": 0, + "non-truncated": 948, + "padded": 948, + "non-padded": 0, + 
"effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_aging|5": { + "hashes": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "541a75f071dcf579", + "hash_cont_tokens": "142a4a8a1138a214" + }, + "truncated": 0, + "non-truncated": 892, + "padded": 892, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_sexuality|5": { + "hashes": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "04269e5c5a257dd9", + "hash_cont_tokens": "bc54813e809b796d" + }, + "truncated": 0, + "non-truncated": 524, + "padded": 524, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-international_law|5": { + "hashes": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "d93ba9d9d38e4397", + "hash_cont_tokens": "dc45b45fcda18e5d" + }, + "truncated": 0, + "non-truncated": 484, + "padded": 484, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-jurisprudence|5": { + "hashes": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "9eeaccd2698b4f5a", + "hash_cont_tokens": "e3a8cd951b6e3469" + }, + "truncated": 0, + "non-truncated": 432, + "padded": 432, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hashes": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "b4f08f544f2b7576", + "hash_cont_tokens": "1e80dbd30f6453d5" + }, + "truncated": 0, + "non-truncated": 652, + "padded": 648, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-machine_learning|5": { + "hashes": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "900c2a51f1174b9f", + "hash_cont_tokens": "9b37da7777378ca9" + }, + "truncated": 0, + "non-truncated": 448, + "padded": 448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-management|5": { + "hashes": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "6b36efb4689c6eca", + "hash_cont_tokens": "a01d6d39a83c4597" + }, + "truncated": 0, + "non-truncated": 412, + "padded": 412, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-marketing|5": { + "hashes": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "2aaac78a0cfed47a", + "hash_cont_tokens": "6aeaed4d823c98aa" + }, + "truncated": 0, + "non-truncated": 936, + "padded": 936, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-medical_genetics|5": { + "hashes": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "886ca823b41c094a", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-miscellaneous|5": { + "hashes": { + "hash_examples": "41adb694024809c2", + 
"hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "72fd71de7675e7d0", + "hash_cont_tokens": "9b0ab02a64603081" + }, + "truncated": 0, + "non-truncated": 3132, + "padded": 3132, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_disputes|5": { + "hashes": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "f3ca0dd8e7a1eb09", + "hash_cont_tokens": "8badf768f7b0467a" + }, + "truncated": 0, + "non-truncated": 1384, + "padded": 1354, + "non-padded": 30, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hashes": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "3e793631e951f23c", + "hash_cont_tokens": "32ae620376b2bbba" + }, + "truncated": 0, + "non-truncated": 3580, + "padded": 3580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-nutrition|5": { + "hashes": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "59753c2144ea93af", + "hash_cont_tokens": "3071def75bacc404" + }, + "truncated": 0, + "non-truncated": 1224, + "padded": 1224, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-philosophy|5": { + "hashes": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "bd8d3dbed15a8c34", + "hash_cont_tokens": "9f6ff69d23a48783" + }, + "truncated": 0, + "non-truncated": 1244, + "padded": 1244, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-prehistory|5": { + "hashes": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "3573cd87facbb7c5", + "hash_cont_tokens": "de469d2b981e32a3" + }, + "truncated": 0, + "non-truncated": 1296, + "padded": 1296, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_accounting|5": { + "hashes": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "17e721bc1a7cbb47", + "hash_cont_tokens": "c46f74d2dfc7b13b" + }, + "truncated": 0, + "non-truncated": 1128, + "padded": 1128, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_law|5": { + "hashes": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "c9f7583fff66d361", + "hash_cont_tokens": "2e590029ef41fbcd" + }, + "truncated": 0, + "non-truncated": 6136, + "padded": 6136, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_medicine|5": { + "hashes": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "40a933f829116f8d", + "hash_cont_tokens": "fe35cfa9c6ca802e" + }, + "truncated": 0, + "non-truncated": 1088, + "padded": 1088, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_psychology|5": { + "hashes": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "0dfb73a8eb3f692c", + "hash_cont_tokens": "f020fbddf72c8652" + }, + "truncated": 
0, + "non-truncated": 2448, + "padded": 2448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-public_relations|5": { + "hashes": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "1710c6ba4c9f3cbd", + "hash_cont_tokens": "568f585a259965c1" + }, + "truncated": 0, + "non-truncated": 440, + "padded": 440, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-security_studies|5": { + "hashes": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "32a03f1f22a6e103", + "hash_cont_tokens": "cc6fd7cccd64cd5d" + }, + "truncated": 0, + "non-truncated": 980, + "padded": 980, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-sociology|5": { + "hashes": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "828999f7624cbe7e", + "hash_cont_tokens": "c3a3bdfd177eed5b" + }, + "truncated": 0, + "non-truncated": 804, + "padded": 804, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hashes": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "42054621e718dbee", + "hash_cont_tokens": "2568d0e8e36fa959" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-virology|5": { + "hashes": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "6c4f0aa4dc859c04", + "hash_cont_tokens": "926cf60b0891f374" + }, + "truncated": 0, + "non-truncated": 664, + "padded": 664, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-world_religions|5": { + "hashes": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "6c75d44e092ff24f", + "hash_cont_tokens": "c525a5de974c1ea3" + }, + "truncated": 0, + "non-truncated": 684, + "padded": 684, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|truthfulqa:mc|0": { + "hashes": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "2738d7ed7075faa7", + "hash_cont_tokens": "c014154380b74b9e" + }, + "truncated": 0, + "non-truncated": 9996, + "padded": 9996, + "non-padded": 0, + "effective_few_shots": 0.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "d84d18e9a963753d", + "hash_full_prompts": "12b540783521a8e6", + "hash_input_tokens": "5c73a7dce6ccf737", + "hash_cont_tokens": "fb1646e2bdd5fc38" + }, + "total_evaluation_time_secondes": "40584.96996879578", + "truncated": 0, + "non-truncated": 111019, + "padded": 110926, + "non-padded": 93, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/TheBloke/WizardLM-70B-V1.0-GPTQ/results_2023-11-07T19-43-56.739522.json b/eval-results/TheBloke/WizardLM-70B-V1.0-GPTQ/results_2023-11-07T19-43-56.739522.json new file mode 100644 index 0000000000000000000000000000000000000000..aa4b14474bda104277f2fa9f786f6e5dc2f6a603 --- /dev/null +++ 
b/eval-results/TheBloke/WizardLM-70B-V1.0-GPTQ/results_2023-11-07T19-43-56.739522.json @@ -0,0 +1,107 @@ +{ + "config_general": { + "lighteval_sha": "167773f1d5d1647c60dadc31c9e731ab7dbcbbad", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "", + "model_name": "TheBloke/WizardLM-70B-V1.0-GPTQ", + "model_sha": "af6b8708e198e90632bcff24f9a47a24217e8945", + "model_dtype": "None", + "model_size": "33.06 GB" + }, + "results": { + "harness|drop|3": { + "em": 0.17470637583892618, + "em_stderr": 0.0038886447854560428, + "f1": 0.23969064597315412, + "f1_stderr": 0.003917893809852688 + }, + "harness|gsm8k|5": { + "acc": 0.18498862774829417, + "acc_stderr": 0.010695390472237899 + }, + "harness|winogrande|5": { + "acc": 0.7861089187056038, + "acc_stderr": 0.011524466954090259 + }, + "all": { + "em": 0.17470637583892618, + "em_stderr": 0.0038886447854560428, + "f1": 0.23969064597315412, + "f1_stderr": 0.003917893809852688, + "acc": 0.485548773226949, + "acc_stderr": 0.011109928713164078 + } + }, + "versions": { + "all": 0, + "harness|drop|3": 1, + "harness|gsm8k|5": 0, + "harness|winogrande|5": 0 + }, + "config_tasks": { + "harness|drop": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|drop|3": { + "hashes": { + "hash_examples": "1d27416e8324e9a3", + "hash_full_prompts": "a5513ff9a741b385", + "hash_input_tokens": "42076f0efbb50aa6", + "hash_cont_tokens": "0149b0196625867e" + }, + "truncated": 3, + "non_truncated": 9533, + "padded": 0, + "non_padded": 9536, + "effective_few_shots": 3.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "bda342e47b5099b2", + "hash_cont_tokens": "4123c947516cb446" + }, + "truncated": 0, + "non_truncated": 1319, + "padded": 0, + "non_padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "c0bedf98cb040854", + "hash_cont_tokens": "f08975ad6f2d5864" + }, + "truncated": 0, + "non_truncated": 1267, + "padded": 2432, + "non_padded": 102, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "9b4d8993161e637d", + "hash_full_prompts": "08215e527b7e60a5", + "hash_input_tokens": "a12f3e3c934bd78b", + "hash_cont_tokens": "9e0bf920034b2003" + }, + "truncated": 3, + "non_truncated": 12119, + "padded": 2432, + "non_padded": 10957, + "num_truncated_few_shots": 0, + "total_evaluation_time_secondes": 0 + } +} \ No newline at end of file diff --git a/eval-results/TheBloke/WizardLM-7B-uncensored-GPTQ/results_2023-08-21T14-48-40.659244.json b/eval-results/TheBloke/WizardLM-7B-uncensored-GPTQ/results_2023-08-21T14-48-40.659244.json new file mode 100644 index 0000000000000000000000000000000000000000..b5c96f83fffcd98254025740639cd7017eaef87b --- /dev/null +++ b/eval-results/TheBloke/WizardLM-7B-uncensored-GPTQ/results_2023-08-21T14-48-40.659244.json @@ -0,0 +1,1365 @@ +{ + "results": { + "harness|arc:challenge|25": { + "acc": 0.23122866894197952, + "acc_stderr": 0.01232085883477228, + "acc_norm": 0.28498293515358364, + "acc_norm_stderr": 0.013191348179838793 + }, + "harness|hellaswag|10": { + "acc": 0.2552280422226648, + "acc_stderr": 0.004350982826580602, + "acc_norm": 
0.253734315873332, + "acc_norm_stderr": 0.004342580277662738 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.29, + "acc_stderr": 0.04560480215720684, + "acc_norm": 0.29, + "acc_norm_stderr": 0.04560480215720684 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.17037037037037037, + "acc_stderr": 0.032477811859955935, + "acc_norm": 0.17037037037037037, + "acc_norm_stderr": 0.032477811859955935 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.23684210526315788, + "acc_stderr": 0.034597776068105365, + "acc_norm": 0.23684210526315788, + "acc_norm_stderr": 0.034597776068105365 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.24, + "acc_stderr": 0.04292346959909283, + "acc_norm": 0.24, + "acc_norm_stderr": 0.04292346959909283 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.27547169811320754, + "acc_stderr": 0.027495663683724067, + "acc_norm": 0.27547169811320754, + "acc_norm_stderr": 0.027495663683724067 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.24305555555555555, + "acc_stderr": 0.03586879280080341, + "acc_norm": 0.24305555555555555, + "acc_norm_stderr": 0.03586879280080341 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.27, + "acc_stderr": 0.0446196043338474, + "acc_norm": 0.27, + "acc_norm_stderr": 0.0446196043338474 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.23, + "acc_stderr": 0.04229525846816506, + "acc_norm": 0.23, + "acc_norm_stderr": 0.04229525846816506 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.27, + "acc_stderr": 0.0446196043338474, + "acc_norm": 0.27, + "acc_norm_stderr": 0.0446196043338474 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.24277456647398843, + "acc_stderr": 0.0326926380614177, + "acc_norm": 0.24277456647398843, + "acc_norm_stderr": 0.0326926380614177 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.21568627450980393, + "acc_stderr": 0.04092563958237654, + "acc_norm": 0.21568627450980393, + "acc_norm_stderr": 0.04092563958237654 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.26, + "acc_stderr": 0.04408440022768078, + "acc_norm": 0.26, + "acc_norm_stderr": 0.04408440022768078 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.30638297872340425, + "acc_stderr": 0.030135906478517563, + "acc_norm": 0.30638297872340425, + "acc_norm_stderr": 0.030135906478517563 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.2543859649122807, + "acc_stderr": 0.040969851398436716, + "acc_norm": 0.2543859649122807, + "acc_norm_stderr": 0.040969851398436716 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.2620689655172414, + "acc_stderr": 0.03664666337225257, + "acc_norm": 0.2620689655172414, + "acc_norm_stderr": 0.03664666337225257 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.25132275132275134, + "acc_stderr": 0.022340482339643898, + "acc_norm": 0.25132275132275134, + "acc_norm_stderr": 0.022340482339643898 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.30158730158730157, + "acc_stderr": 0.04104947269903394, + "acc_norm": 0.30158730158730157, + "acc_norm_stderr": 0.04104947269903394 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.21, + "acc_stderr": 0.040936018074033256, + "acc_norm": 0.21, + "acc_norm_stderr": 0.040936018074033256 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.24516129032258063, + "acc_stderr": 0.024472243840895518, + "acc_norm": 0.24516129032258063, + 
"acc_norm_stderr": 0.024472243840895518 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.2019704433497537, + "acc_stderr": 0.02824735012218027, + "acc_norm": 0.2019704433497537, + "acc_norm_stderr": 0.02824735012218027 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.23, + "acc_stderr": 0.04229525846816506, + "acc_norm": 0.23, + "acc_norm_stderr": 0.04229525846816506 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.2909090909090909, + "acc_stderr": 0.03546563019624336, + "acc_norm": 0.2909090909090909, + "acc_norm_stderr": 0.03546563019624336 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.21212121212121213, + "acc_stderr": 0.02912652283458682, + "acc_norm": 0.21212121212121213, + "acc_norm_stderr": 0.02912652283458682 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.18652849740932642, + "acc_stderr": 0.028112091210117453, + "acc_norm": 0.18652849740932642, + "acc_norm_stderr": 0.028112091210117453 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.2153846153846154, + "acc_stderr": 0.020843034557462874, + "acc_norm": 0.2153846153846154, + "acc_norm_stderr": 0.020843034557462874 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.24814814814814815, + "acc_stderr": 0.0263357394040558, + "acc_norm": 0.24814814814814815, + "acc_norm_stderr": 0.0263357394040558 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.2184873949579832, + "acc_stderr": 0.026841514322958955, + "acc_norm": 0.2184873949579832, + "acc_norm_stderr": 0.026841514322958955 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.16556291390728478, + "acc_stderr": 0.03034818341030361, + "acc_norm": 0.16556291390728478, + "acc_norm_stderr": 0.03034818341030361 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.22018348623853212, + "acc_stderr": 0.01776597865232757, + "acc_norm": 0.22018348623853212, + "acc_norm_stderr": 0.01776597865232757 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.2962962962962963, + "acc_stderr": 0.031141447823536023, + "acc_norm": 0.2962962962962963, + "acc_norm_stderr": 0.031141447823536023 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.2647058823529412, + "acc_stderr": 0.03096451792692341, + "acc_norm": 0.2647058823529412, + "acc_norm_stderr": 0.03096451792692341 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.21940928270042195, + "acc_stderr": 0.026939106581553945, + "acc_norm": 0.21940928270042195, + "acc_norm_stderr": 0.026939106581553945 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.40358744394618834, + "acc_stderr": 0.032928028193303135, + "acc_norm": 0.40358744394618834, + "acc_norm_stderr": 0.032928028193303135 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.2824427480916031, + "acc_stderr": 0.03948406125768361, + "acc_norm": 0.2824427480916031, + "acc_norm_stderr": 0.03948406125768361 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.23140495867768596, + "acc_stderr": 0.0384985609879409, + "acc_norm": 0.23140495867768596, + "acc_norm_stderr": 0.0384985609879409 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.2962962962962963, + "acc_stderr": 0.04414343666854933, + "acc_norm": 0.2962962962962963, + "acc_norm_stderr": 0.04414343666854933 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.2085889570552147, + "acc_stderr": 
0.031921934489347235, + "acc_norm": 0.2085889570552147, + "acc_norm_stderr": 0.031921934489347235 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.25, + "acc_stderr": 0.04109974682633932, + "acc_norm": 0.25, + "acc_norm_stderr": 0.04109974682633932 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.2815533980582524, + "acc_stderr": 0.04453254836326469, + "acc_norm": 0.2815533980582524, + "acc_norm_stderr": 0.04453254836326469 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.2777777777777778, + "acc_stderr": 0.029343114798094476, + "acc_norm": 0.2777777777777778, + "acc_norm_stderr": 0.029343114798094476 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.29, + "acc_stderr": 0.04560480215720684, + "acc_norm": 0.29, + "acc_norm_stderr": 0.04560480215720684 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.2771392081736909, + "acc_stderr": 0.016005636294122435, + "acc_norm": 0.2771392081736909, + "acc_norm_stderr": 0.016005636294122435 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.22832369942196531, + "acc_stderr": 0.02259870380432163, + "acc_norm": 0.22832369942196531, + "acc_norm_stderr": 0.02259870380432163 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.2670391061452514, + "acc_stderr": 0.014796502622562551, + "acc_norm": 0.2670391061452514, + "acc_norm_stderr": 0.014796502622562551 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.19281045751633988, + "acc_stderr": 0.02258931888817676, + "acc_norm": 0.19281045751633988, + "acc_norm_stderr": 0.02258931888817676 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.2347266881028939, + "acc_stderr": 0.024071805887677045, + "acc_norm": 0.2347266881028939, + "acc_norm_stderr": 0.024071805887677045 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.21296296296296297, + "acc_stderr": 0.022779719088733393, + "acc_norm": 0.21296296296296297, + "acc_norm_stderr": 0.022779719088733393 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.24468085106382978, + "acc_stderr": 0.025645553622266736, + "acc_norm": 0.24468085106382978, + "acc_norm_stderr": 0.025645553622266736 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.2777053455019557, + "acc_stderr": 0.011438741422769584, + "acc_norm": 0.2777053455019557, + "acc_norm_stderr": 0.011438741422769584 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.23161764705882354, + "acc_stderr": 0.025626533803777562, + "acc_norm": 0.23161764705882354, + "acc_norm_stderr": 0.025626533803777562 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.23366013071895425, + "acc_stderr": 0.017119158496044503, + "acc_norm": 0.23366013071895425, + "acc_norm_stderr": 0.017119158496044503 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.2545454545454545, + "acc_stderr": 0.041723430387053825, + "acc_norm": 0.2545454545454545, + "acc_norm_stderr": 0.041723430387053825 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.2163265306122449, + "acc_stderr": 0.02635891633490402, + "acc_norm": 0.2163265306122449, + "acc_norm_stderr": 0.02635891633490402 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.23383084577114427, + "acc_stderr": 0.02992941540834839, + "acc_norm": 0.23383084577114427, + "acc_norm_stderr": 0.02992941540834839 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.26, + "acc_stderr": 0.04408440022768079, + "acc_norm": 0.26, + "acc_norm_stderr": 0.04408440022768079 + }, + 
"harness|hendrycksTest-virology|5": { + "acc": 0.25301204819277107, + "acc_stderr": 0.03384429155233134, + "acc_norm": 0.25301204819277107, + "acc_norm_stderr": 0.03384429155233134 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.2807017543859649, + "acc_stderr": 0.03446296217088426, + "acc_norm": 0.2807017543859649, + "acc_norm_stderr": 0.03446296217088426 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.24479804161566707, + "mc1_stderr": 0.015051869486714997, + "mc2": 0.5086427712457394, + "mc2_stderr": 0.01701962999202989 + }, + "all": { + "acc": 0.24833912046930778, + "acc_stderr": 0.03146585831024049, + "acc_norm": 0.24922489233138018, + "acc_norm_stderr": 0.031480469950853185, + "mc1": 0.24479804161566707, + "mc1_stderr": 0.015051869486714997, + "mc2": 0.5086427712457394, + "mc2_stderr": 0.01701962999202989 + } + }, + "versions": { + "harness|arc:challenge|25": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 
1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "all": 0 + }, + "config_general": { + "model_name": "TheBloke/WizardLM-7B-uncensored-GPTQ", + "model_sha": "cc30c031fd795ee3d3a50312ab4549415bfbdb46", + "model_dtype": "torch.float16", + "lighteval_sha": "9a6ba3212080b87510982c30fdec55b87dcab0c7", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null + }, + "config_tasks": { + "harness|arc:challenge": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + 
"harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task" + }, + "summary_tasks": { + "harness|arc:challenge|25": { + "hashes": { + "hash_examples": "17b0cae357c0259e", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "61571bf68d6d89aa", + "hash_cont_tokens": "8210decc6ff6f7df" + }, + "truncated": 0, + "non-truncated": 4687, + "padded": 4687, + "non-padded": 0, + "effective_few_shots": 25.0, + "num_truncated_few_shots": 0 + }, + "harness|hellaswag|10": { + "hashes": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "29906669b1c7054a", + "hash_cont_tokens": "b3b9e9017afa63af" + }, + "truncated": 0, + "non-truncated": 40168, + "padded": 40113, + "non-padded": 55, + "effective_few_shots": 10.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hashes": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "c54ff61ad0273dd7", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-anatomy|5": { + "hashes": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "be31a1e22aef5f90", + "hash_cont_tokens": "f11971a765cb609f" + }, + "truncated": 0, + "non-truncated": 540, + "padded": 540, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-astronomy|5": { + "hashes": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "277a7b1fad566940", + "hash_cont_tokens": "bf30e5d3f48250cb" + }, + "truncated": 0, + "non-truncated": 608, + "padded": 608, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-business_ethics|5": { + "hashes": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "ba552605bc116de5", + "hash_cont_tokens": "bc1dd9b2d995eb61" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hashes": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "428c7563d0b98ab9", + "hash_cont_tokens": "890a119624b3b935" + }, + "truncated": 0, + "non-truncated": 1060, + "padded": 1060, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + 
"harness|hendrycksTest-college_biology|5": { + "hashes": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "da036601573942e2", + "hash_cont_tokens": "875cde3af7a0ee14" + }, + "truncated": 0, + "non-truncated": 576, + "padded": 576, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_chemistry|5": { + "hashes": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "94e0196d6aded13d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_computer_science|5": { + "hashes": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "6e4d0f4a8d36690b", + "hash_cont_tokens": "ffc0fe414cdc4a83" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_mathematics|5": { + "hashes": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "614054d17109a25d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_medicine|5": { + "hashes": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "1d633b3cc0524ba8", + "hash_cont_tokens": "1f88b00d41957d82" + }, + "truncated": 0, + "non-truncated": 692, + "padded": 692, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_physics|5": { + "hashes": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "5421d9a1af86cbd4", + "hash_cont_tokens": "f7b8097afc16a47c" + }, + "truncated": 0, + "non-truncated": 408, + "padded": 408, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-computer_security|5": { + "hashes": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "5e6b70ecb333cf18", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hashes": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "c2ef11a87264ceed", + "hash_cont_tokens": "aa0e8bc655f2f641" + }, + "truncated": 0, + "non-truncated": 940, + "padded": 940, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-econometrics|5": { + "hashes": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "ecaccd912a4c3978", + "hash_cont_tokens": "bfb7e3c3c88313f1" + }, + "truncated": 0, + "non-truncated": 456, + "padded": 456, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hashes": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + 
"hash_input_tokens": "1590c84291399be8", + "hash_cont_tokens": "2425a3f084a591ef" + }, + "truncated": 0, + "non-truncated": 580, + "padded": 580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hashes": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "3269597f715b0da1", + "hash_cont_tokens": "f52691aef15a407b" + }, + "truncated": 0, + "non-truncated": 1512, + "padded": 1512, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-formal_logic|5": { + "hashes": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "a2800d20f3ab8d7c", + "hash_cont_tokens": "f515d598d9c21263" + }, + "truncated": 0, + "non-truncated": 504, + "padded": 504, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-global_facts|5": { + "hashes": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "94ed44b3772505ad", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_biology|5": { + "hashes": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "24423acb928db768", + "hash_cont_tokens": "bd85a4156a3613ee" + }, + "truncated": 0, + "non-truncated": 1240, + "padded": 1240, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hashes": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "831ff35c474e5cef", + "hash_cont_tokens": "a95c97af1c14e068" + }, + "truncated": 0, + "non-truncated": 812, + "padded": 812, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hashes": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "8c34e0f2bda77358", + "hash_cont_tokens": "8abfedef914e33c9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hashes": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "f1f73dd687da18d7", + "hash_cont_tokens": "674fc454bdc5ac93" + }, + "truncated": 660, + "non-truncated": 0, + "padded": 0, + "non-padded": 660, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_geography|5": { + "hashes": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "7c5547c7da5bc793", + "hash_cont_tokens": "03a5012b916274ea" + }, + "truncated": 0, + "non-truncated": 792, + "padded": 792, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hashes": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "f62991cb6a496b05", + "hash_cont_tokens": "a83effb8f76b7d7c" + }, + "truncated": 
0, + "non-truncated": 772, + "padded": 772, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hashes": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "4cef2aff6e3d59ed", + "hash_cont_tokens": "c583432ad27fcfe0" + }, + "truncated": 0, + "non-truncated": 1560, + "padded": 1560, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hashes": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "6e2577ea4082ed2b", + "hash_cont_tokens": "24f5dc613660300b" + }, + "truncated": 0, + "non-truncated": 1080, + "padded": 1080, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hashes": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "c5fc9aeb1079c8e4", + "hash_cont_tokens": "f47f041de50333b9" + }, + "truncated": 0, + "non-truncated": 952, + "padded": 952, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_physics|5": { + "hashes": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "555fc385cffa84ca", + "hash_cont_tokens": "ba2efcd283e938cc" + }, + "truncated": 0, + "non-truncated": 604, + "padded": 604, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hashes": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "febd23cbf9973b7f", + "hash_cont_tokens": "942069cd363844d9" + }, + "truncated": 0, + "non-truncated": 2180, + "padded": 2180, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hashes": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "424b02981230ee83", + "hash_cont_tokens": "955ed42b6f7fa019" + }, + "truncated": 0, + "non-truncated": 864, + "padded": 864, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hashes": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "50c9ff438c85a69e", + "hash_cont_tokens": "cdd0b3dc06d933e5" + }, + "truncated": 816, + "non-truncated": 0, + "padded": 0, + "non-padded": 816, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hashes": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "054824cc474caef5", + "hash_cont_tokens": "9a864184946033ac" + }, + "truncated": 8, + "non-truncated": 940, + "padded": 940, + "non-padded": 8, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_aging|5": { + "hashes": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "541a75f071dcf579", + "hash_cont_tokens": "142a4a8a1138a214" + }, + "truncated": 0, + "non-truncated": 892, + "padded": 892, + "non-padded": 0, + "effective_few_shots": 5.0, + 
"num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_sexuality|5": { + "hashes": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "04269e5c5a257dd9", + "hash_cont_tokens": "bc54813e809b796d" + }, + "truncated": 0, + "non-truncated": 524, + "padded": 524, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-international_law|5": { + "hashes": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "d93ba9d9d38e4397", + "hash_cont_tokens": "dc45b45fcda18e5d" + }, + "truncated": 0, + "non-truncated": 484, + "padded": 484, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-jurisprudence|5": { + "hashes": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "9eeaccd2698b4f5a", + "hash_cont_tokens": "e3a8cd951b6e3469" + }, + "truncated": 0, + "non-truncated": 432, + "padded": 432, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hashes": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "b4f08f544f2b7576", + "hash_cont_tokens": "1e80dbd30f6453d5" + }, + "truncated": 0, + "non-truncated": 652, + "padded": 648, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-machine_learning|5": { + "hashes": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "900c2a51f1174b9f", + "hash_cont_tokens": "9b37da7777378ca9" + }, + "truncated": 0, + "non-truncated": 448, + "padded": 448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-management|5": { + "hashes": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "6b36efb4689c6eca", + "hash_cont_tokens": "a01d6d39a83c4597" + }, + "truncated": 0, + "non-truncated": 412, + "padded": 412, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-marketing|5": { + "hashes": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "2aaac78a0cfed47a", + "hash_cont_tokens": "6aeaed4d823c98aa" + }, + "truncated": 0, + "non-truncated": 936, + "padded": 936, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-medical_genetics|5": { + "hashes": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "886ca823b41c094a", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-miscellaneous|5": { + "hashes": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "72fd71de7675e7d0", + "hash_cont_tokens": "9b0ab02a64603081" + }, + "truncated": 0, + "non-truncated": 3132, + "padded": 3132, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_disputes|5": { + "hashes": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + 
"hash_input_tokens": "f3ca0dd8e7a1eb09", + "hash_cont_tokens": "8badf768f7b0467a" + }, + "truncated": 0, + "non-truncated": 1384, + "padded": 1354, + "non-padded": 30, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hashes": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "3e793631e951f23c", + "hash_cont_tokens": "32ae620376b2bbba" + }, + "truncated": 0, + "non-truncated": 3580, + "padded": 3580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-nutrition|5": { + "hashes": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "59753c2144ea93af", + "hash_cont_tokens": "3071def75bacc404" + }, + "truncated": 0, + "non-truncated": 1224, + "padded": 1224, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-philosophy|5": { + "hashes": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "bd8d3dbed15a8c34", + "hash_cont_tokens": "9f6ff69d23a48783" + }, + "truncated": 0, + "non-truncated": 1244, + "padded": 1244, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-prehistory|5": { + "hashes": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "3573cd87facbb7c5", + "hash_cont_tokens": "de469d2b981e32a3" + }, + "truncated": 0, + "non-truncated": 1296, + "padded": 1296, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_accounting|5": { + "hashes": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "17e721bc1a7cbb47", + "hash_cont_tokens": "c46f74d2dfc7b13b" + }, + "truncated": 0, + "non-truncated": 1128, + "padded": 1128, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_law|5": { + "hashes": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "9178e10bd0763ec4", + "hash_cont_tokens": "2e590029ef41fbcd" + }, + "truncated": 604, + "non-truncated": 5532, + "padded": 5524, + "non-padded": 612, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_medicine|5": { + "hashes": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "f5a22012a54f70ea", + "hash_cont_tokens": "fe35cfa9c6ca802e" + }, + "truncated": 0, + "non-truncated": 1088, + "padded": 1088, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_psychology|5": { + "hashes": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "0dfb73a8eb3f692c", + "hash_cont_tokens": "f020fbddf72c8652" + }, + "truncated": 0, + "non-truncated": 2448, + "padded": 2448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-public_relations|5": { + "hashes": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "1710c6ba4c9f3cbd", + "hash_cont_tokens": "568f585a259965c1" + }, + "truncated": 0, + "non-truncated": 440, + 
"padded": 440, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-security_studies|5": { + "hashes": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "d49711415961ced7", + "hash_cont_tokens": "cc6fd7cccd64cd5d" + }, + "truncated": 0, + "non-truncated": 980, + "padded": 980, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-sociology|5": { + "hashes": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "828999f7624cbe7e", + "hash_cont_tokens": "c3a3bdfd177eed5b" + }, + "truncated": 0, + "non-truncated": 804, + "padded": 804, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hashes": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "42054621e718dbee", + "hash_cont_tokens": "2568d0e8e36fa959" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-virology|5": { + "hashes": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "6c4f0aa4dc859c04", + "hash_cont_tokens": "926cf60b0891f374" + }, + "truncated": 0, + "non-truncated": 664, + "padded": 664, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-world_religions|5": { + "hashes": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "6c75d44e092ff24f", + "hash_cont_tokens": "c525a5de974c1ea3" + }, + "truncated": 0, + "non-truncated": 684, + "padded": 684, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|truthfulqa:mc|0": { + "hashes": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "2738d7ed7075faa7", + "hash_cont_tokens": "c014154380b74b9e" + }, + "truncated": 0, + "non-truncated": 9996, + "padded": 9996, + "non-padded": 0, + "effective_few_shots": 0.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "d84d18e9a963753d", + "hash_full_prompts": "12b540783521a8e6", + "hash_input_tokens": "6fecf578c508db6a", + "hash_cont_tokens": "fb1646e2bdd5fc38" + }, + "total_evaluation_time_secondes": "3047.4784092903137", + "truncated": 2088, + "non-truncated": 108931, + "padded": 108834, + "non-padded": 2185, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/TheBloke/WizardLM-7B-uncensored-GPTQ/results_2023-10-21T21-04-26.590858.json b/eval-results/TheBloke/WizardLM-7B-uncensored-GPTQ/results_2023-10-21T21-04-26.590858.json new file mode 100644 index 0000000000000000000000000000000000000000..ae7709119dc843f1829f1f11903445a730cb896b --- /dev/null +++ b/eval-results/TheBloke/WizardLM-7B-uncensored-GPTQ/results_2023-10-21T21-04-26.590858.json @@ -0,0 +1,107 @@ +{ + "config_general": { + "model_name": "TheBloke/WizardLM-7B-uncensored-GPTQ", + "model_sha": "0bf85d297a2a660024536e0b77d242466dc20f9a", + "model_size": "3.66 GB", + "model_dtype": "torch.float16", + "lighteval_sha": "0f318ecf002208468154899217b3ba7c6ae09374", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": 
null, + "job_id": "" + }, + "results": { + "harness|drop|3": { + "em": 0.0, + "em_stderr": 0.0, + "f1": 0.0, + "f1_stderr": 0.0 + }, + "harness|gsm8k|5": { + "acc": 0.0, + "acc_stderr": 0.0 + }, + "harness|winogrande|5": { + "acc": 0.4956590370955012, + "acc_stderr": 0.014051956064076911 + }, + "all": { + "em": 0.0, + "em_stderr": 0.0, + "f1": 0.0, + "f1_stderr": 0.0, + "acc": 0.2478295185477506, + "acc_stderr": 0.007025978032038456 + } + }, + "versions": { + "harness|drop|3": 1, + "harness|gsm8k|5": 0, + "harness|winogrande|5": 0, + "all": 0 + }, + "config_tasks": { + "harness|drop": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|drop|3": { + "hashes": { + "hash_examples": "1d27416e8324e9a3", + "hash_full_prompts": "a5513ff9a741b385", + "hash_input_tokens": "afa3f956b5946008", + "hash_cont_tokens": "d62a3b26770557a9" + }, + "truncated": 1263, + "non-truncated": 8273, + "padded": 0, + "non-padded": 9536, + "effective_few_shots": 3.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "6f81fd8346219949", + "hash_cont_tokens": "8401e6188d643544" + }, + "truncated": 0, + "non-truncated": 1319, + "padded": 0, + "non-padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "b84988137a00c1f4", + "hash_cont_tokens": "f08975ad6f2d5864" + }, + "truncated": 0, + "non-truncated": 2534, + "padded": 2432, + "non-padded": 102, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "9b4d8993161e637d", + "hash_full_prompts": "08215e527b7e60a5", + "hash_input_tokens": "fb75bd68fb756923", + "hash_cont_tokens": "f150732b0323f26d" + }, + "total_evaluation_time_secondes": "13500.205662488937", + "truncated": 1263, + "non-truncated": 12126, + "padded": 2432, + "non-padded": 10957, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/TheBloke/WizardLM-7B-uncensored-GPTQ/results_2023-12-02T12-59-15.195874.json b/eval-results/TheBloke/WizardLM-7B-uncensored-GPTQ/results_2023-12-02T12-59-15.195874.json new file mode 100644 index 0000000000000000000000000000000000000000..57782dc3f3b5687dcecb06b3871220a90480a6bf --- /dev/null +++ b/eval-results/TheBloke/WizardLM-7B-uncensored-GPTQ/results_2023-12-02T12-59-15.195874.json @@ -0,0 +1,63 @@ +{ + "config_general": { + "lighteval_sha": "b35d4d84573be82d91c07ea46260f262f72cf69d", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "", + "start_time": 1297113.6439748, + "end_time": 1299856.18762588, + "total_evaluation_time_secondes": "2742.543651079992", + "model_name": "TheBloke/WizardLM-7B-uncensored-GPTQ", + "model_sha": "4a524bec59b89e995583018b718c3c7394cade8a", + "model_dtype": "torch.float16", + "model_size": "3.66 GB" + }, + "results": { + "harness|gsm8k|5": { + "acc": 0.0, + "acc_stderr": 0.0 + }, + "all": { + "acc": 0.0, + "acc_stderr": 0.0 + } + }, + "versions": { + "all": 0, + "harness|gsm8k|5": 0 + }, + "config_tasks": { + "harness|gsm8k": "LM Harness task" + }, + "summary_tasks": { + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + 
"hash_input_tokens": "bda342e47b5099b2", + "hash_cont_tokens": "8401e6188d643544" + }, + "truncated": 0, + "non_truncated": 1319, + "padded": 0, + "non_padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "18b756b7813d1bdf", + "hash_full_prompts": "deb3b1dff10b95aa", + "hash_input_tokens": "42036645de5ac59d", + "hash_cont_tokens": "95e452ffb745c2ae" + }, + "truncated": 0, + "non_truncated": 1319, + "padded": 0, + "non_padded": 1319, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/TheBloke/WizardLM-Uncensored-SuperCOT-StoryTelling-30B-GPTQ/results_2023-08-21T17-09-00.185998.json b/eval-results/TheBloke/WizardLM-Uncensored-SuperCOT-StoryTelling-30B-GPTQ/results_2023-08-21T17-09-00.185998.json new file mode 100644 index 0000000000000000000000000000000000000000..f7b0644b6f0f192633adcdbcb291482ea258eb6e --- /dev/null +++ b/eval-results/TheBloke/WizardLM-Uncensored-SuperCOT-StoryTelling-30B-GPTQ/results_2023-08-21T17-09-00.185998.json @@ -0,0 +1,1365 @@ +{ + "results": { + "harness|arc:challenge|25": { + "acc": 0.22781569965870307, + "acc_stderr": 0.012256708602326931, + "acc_norm": 0.2841296928327645, + "acc_norm_stderr": 0.013179442447653887 + }, + "harness|hellaswag|10": { + "acc": 0.2582154949213304, + "acc_stderr": 0.0043675868017766595, + "acc_norm": 0.2605058753236407, + "acc_norm_stderr": 0.004380136468543943 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.22, + "acc_stderr": 0.0416333199893227, + "acc_norm": 0.22, + "acc_norm_stderr": 0.0416333199893227 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.26666666666666666, + "acc_stderr": 0.038201699145179055, + "acc_norm": 0.26666666666666666, + "acc_norm_stderr": 0.038201699145179055 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.3026315789473684, + "acc_stderr": 0.03738520676119667, + "acc_norm": 0.3026315789473684, + "acc_norm_stderr": 0.03738520676119667 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.2, + "acc_stderr": 0.04020151261036845, + "acc_norm": 0.2, + "acc_norm_stderr": 0.04020151261036845 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.2188679245283019, + "acc_stderr": 0.025447863825108625, + "acc_norm": 0.2188679245283019, + "acc_norm_stderr": 0.025447863825108625 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.25, + "acc_stderr": 0.03621034121889507, + "acc_norm": 0.25, + "acc_norm_stderr": 0.03621034121889507 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.29, + "acc_stderr": 0.045604802157206845, + "acc_norm": 0.29, + "acc_norm_stderr": 0.045604802157206845 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.27, + "acc_stderr": 0.0446196043338474, + "acc_norm": 0.27, + "acc_norm_stderr": 0.0446196043338474 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.26, + "acc_stderr": 0.044084400227680794, + "acc_norm": 0.26, + "acc_norm_stderr": 0.044084400227680794 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.2658959537572254, + "acc_stderr": 0.03368762932259431, + "acc_norm": 0.2658959537572254, + "acc_norm_stderr": 0.03368762932259431 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.17647058823529413, + "acc_stderr": 0.03793281185307811, + "acc_norm": 0.17647058823529413, + "acc_norm_stderr": 0.03793281185307811 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.27, + "acc_stderr": 0.04461960433384741, + "acc_norm": 
0.27, + "acc_norm_stderr": 0.04461960433384741 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.20851063829787234, + "acc_stderr": 0.026556982117838746, + "acc_norm": 0.20851063829787234, + "acc_norm_stderr": 0.026556982117838746 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.19298245614035087, + "acc_stderr": 0.037124548537213684, + "acc_norm": 0.19298245614035087, + "acc_norm_stderr": 0.037124548537213684 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.22758620689655173, + "acc_stderr": 0.03493950380131184, + "acc_norm": 0.22758620689655173, + "acc_norm_stderr": 0.03493950380131184 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.23015873015873015, + "acc_stderr": 0.02167921966369314, + "acc_norm": 0.23015873015873015, + "acc_norm_stderr": 0.02167921966369314 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.2777777777777778, + "acc_stderr": 0.04006168083848876, + "acc_norm": 0.2777777777777778, + "acc_norm_stderr": 0.04006168083848876 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.26, + "acc_stderr": 0.0440844002276808, + "acc_norm": 0.26, + "acc_norm_stderr": 0.0440844002276808 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.2645161290322581, + "acc_stderr": 0.02509189237885928, + "acc_norm": 0.2645161290322581, + "acc_norm_stderr": 0.02509189237885928 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.270935960591133, + "acc_stderr": 0.031270907132976984, + "acc_norm": 0.270935960591133, + "acc_norm_stderr": 0.031270907132976984 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.22, + "acc_stderr": 0.0416333199893227, + "acc_norm": 0.22, + "acc_norm_stderr": 0.0416333199893227 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.2727272727272727, + "acc_stderr": 0.0347769116216366, + "acc_norm": 0.2727272727272727, + "acc_norm_stderr": 0.0347769116216366 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.18181818181818182, + "acc_stderr": 0.0274796030105388, + "acc_norm": 0.18181818181818182, + "acc_norm_stderr": 0.0274796030105388 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.3316062176165803, + "acc_stderr": 0.03397636541089117, + "acc_norm": 0.3316062176165803, + "acc_norm_stderr": 0.03397636541089117 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.2076923076923077, + "acc_stderr": 0.020567539567246797, + "acc_norm": 0.2076923076923077, + "acc_norm_stderr": 0.020567539567246797 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.24814814814814815, + "acc_stderr": 0.0263357394040558, + "acc_norm": 0.24814814814814815, + "acc_norm_stderr": 0.0263357394040558 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.21428571428571427, + "acc_stderr": 0.026653531596715498, + "acc_norm": 0.21428571428571427, + "acc_norm_stderr": 0.026653531596715498 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.17880794701986755, + "acc_stderr": 0.03128744850600725, + "acc_norm": 0.17880794701986755, + "acc_norm_stderr": 0.03128744850600725 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.21100917431192662, + "acc_stderr": 0.017493922404112648, + "acc_norm": 0.21100917431192662, + "acc_norm_stderr": 0.017493922404112648 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.25, + "acc_stderr": 0.029531221160930918, + "acc_norm": 0.25, + "acc_norm_stderr": 
0.029531221160930918 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.22058823529411764, + "acc_stderr": 0.029102254389674082, + "acc_norm": 0.22058823529411764, + "acc_norm_stderr": 0.029102254389674082 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.21940928270042195, + "acc_stderr": 0.026939106581553945, + "acc_norm": 0.21940928270042195, + "acc_norm_stderr": 0.026939106581553945 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.22869955156950672, + "acc_stderr": 0.0281882400469292, + "acc_norm": 0.22869955156950672, + "acc_norm_stderr": 0.0281882400469292 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.2748091603053435, + "acc_stderr": 0.03915345408847837, + "acc_norm": 0.2748091603053435, + "acc_norm_stderr": 0.03915345408847837 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.2892561983471074, + "acc_stderr": 0.04139112727635463, + "acc_norm": 0.2892561983471074, + "acc_norm_stderr": 0.04139112727635463 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.23148148148148148, + "acc_stderr": 0.04077494709252626, + "acc_norm": 0.23148148148148148, + "acc_norm_stderr": 0.04077494709252626 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.294478527607362, + "acc_stderr": 0.03581165790474082, + "acc_norm": 0.294478527607362, + "acc_norm_stderr": 0.03581165790474082 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.25, + "acc_stderr": 0.04109974682633932, + "acc_norm": 0.25, + "acc_norm_stderr": 0.04109974682633932 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.1553398058252427, + "acc_stderr": 0.03586594738573974, + "acc_norm": 0.1553398058252427, + "acc_norm_stderr": 0.03586594738573974 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.2777777777777778, + "acc_stderr": 0.029343114798094472, + "acc_norm": 0.2777777777777778, + "acc_norm_stderr": 0.029343114798094472 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.28, + "acc_stderr": 0.045126085985421276, + "acc_norm": 0.28, + "acc_norm_stderr": 0.045126085985421276 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.24521072796934865, + "acc_stderr": 0.015384352284543946, + "acc_norm": 0.24521072796934865, + "acc_norm_stderr": 0.015384352284543946 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.23699421965317918, + "acc_stderr": 0.02289408248992599, + "acc_norm": 0.23699421965317918, + "acc_norm_stderr": 0.02289408248992599 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.2659217877094972, + "acc_stderr": 0.014776765066438888, + "acc_norm": 0.2659217877094972, + "acc_norm_stderr": 0.014776765066438888 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.2549019607843137, + "acc_stderr": 0.024954184324879912, + "acc_norm": 0.2549019607843137, + "acc_norm_stderr": 0.024954184324879912 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.24437299035369775, + "acc_stderr": 0.024406162094668914, + "acc_norm": 0.24437299035369775, + "acc_norm_stderr": 0.024406162094668914 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.26851851851851855, + "acc_stderr": 0.02465968518596728, + "acc_norm": 0.26851851851851855, + "acc_norm_stderr": 0.02465968518596728 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.2801418439716312, + "acc_stderr": 0.02678917235114023, + "acc_norm": 0.2801418439716312, + "acc_norm_stderr": 0.02678917235114023 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.2620599739243807, + 
"acc_stderr": 0.011231552795890392, + "acc_norm": 0.2620599739243807, + "acc_norm_stderr": 0.011231552795890392 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.2610294117647059, + "acc_stderr": 0.026679252270103117, + "acc_norm": 0.2610294117647059, + "acc_norm_stderr": 0.026679252270103117 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.2565359477124183, + "acc_stderr": 0.017667841612378988, + "acc_norm": 0.2565359477124183, + "acc_norm_stderr": 0.017667841612378988 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.2, + "acc_stderr": 0.03831305140884601, + "acc_norm": 0.2, + "acc_norm_stderr": 0.03831305140884601 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.27755102040816326, + "acc_stderr": 0.028666857790274648, + "acc_norm": 0.27755102040816326, + "acc_norm_stderr": 0.028666857790274648 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.19900497512437812, + "acc_stderr": 0.02823136509275841, + "acc_norm": 0.19900497512437812, + "acc_norm_stderr": 0.02823136509275841 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.26, + "acc_stderr": 0.0440844002276808, + "acc_norm": 0.26, + "acc_norm_stderr": 0.0440844002276808 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.28313253012048195, + "acc_stderr": 0.03507295431370519, + "acc_norm": 0.28313253012048195, + "acc_norm_stderr": 0.03507295431370519 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.32748538011695905, + "acc_stderr": 0.03599335771456027, + "acc_norm": 0.32748538011695905, + "acc_norm_stderr": 0.03599335771456027 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.2521419828641371, + "mc1_stderr": 0.015201522246299946, + "mc2": 0.49535667677532336, + "mc2_stderr": 0.01702854856477157 + }, + "all": { + "acc": 0.24694623861473894, + "acc_stderr": 0.03134573810087399, + "acc_norm": 0.2479395330821351, + "acc_norm_stderr": 0.03136159036379084, + "mc1": 0.2521419828641371, + "mc1_stderr": 0.015201522246299946, + "mc2": 0.49535667677532336, + "mc2_stderr": 0.01702854856477157 + } + }, + "versions": { + "harness|arc:challenge|25": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + 
"harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "all": 0 + }, + "config_general": { + "model_name": "TheBloke/WizardLM-Uncensored-SuperCOT-StoryTelling-30B-GPTQ", + "model_sha": "cd07cc7c55b46524f61214012653c25226d24c0d", + "model_dtype": "torch.float16", + "lighteval_sha": "9a6ba3212080b87510982c30fdec55b87dcab0c7", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null + }, + "config_tasks": { + "harness|arc:challenge": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + 
"harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task" + }, + "summary_tasks": { + "harness|arc:challenge|25": { + "hashes": { + "hash_examples": "17b0cae357c0259e", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "61571bf68d6d89aa", + "hash_cont_tokens": "8210decc6ff6f7df" + }, + "truncated": 0, + "non-truncated": 4687, + "padded": 4687, + "non-padded": 0, + "effective_few_shots": 25.0, + "num_truncated_few_shots": 0 + }, + "harness|hellaswag|10": { + "hashes": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "29906669b1c7054a", + "hash_cont_tokens": "b3b9e9017afa63af" + }, + "truncated": 0, + "non-truncated": 40168, + "padded": 40113, + "non-padded": 55, + "effective_few_shots": 10.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hashes": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "c54ff61ad0273dd7", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-anatomy|5": { + "hashes": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "be31a1e22aef5f90", + "hash_cont_tokens": 
"f11971a765cb609f" + }, + "truncated": 0, + "non-truncated": 540, + "padded": 540, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-astronomy|5": { + "hashes": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "277a7b1fad566940", + "hash_cont_tokens": "bf30e5d3f48250cb" + }, + "truncated": 0, + "non-truncated": 608, + "padded": 608, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-business_ethics|5": { + "hashes": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "ba552605bc116de5", + "hash_cont_tokens": "bc1dd9b2d995eb61" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hashes": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "428c7563d0b98ab9", + "hash_cont_tokens": "890a119624b3b935" + }, + "truncated": 0, + "non-truncated": 1060, + "padded": 1060, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_biology|5": { + "hashes": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "da036601573942e2", + "hash_cont_tokens": "875cde3af7a0ee14" + }, + "truncated": 0, + "non-truncated": 576, + "padded": 576, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_chemistry|5": { + "hashes": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "94e0196d6aded13d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_computer_science|5": { + "hashes": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "6e4d0f4a8d36690b", + "hash_cont_tokens": "ffc0fe414cdc4a83" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_mathematics|5": { + "hashes": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "614054d17109a25d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_medicine|5": { + "hashes": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "1d633b3cc0524ba8", + "hash_cont_tokens": "1f88b00d41957d82" + }, + "truncated": 0, + "non-truncated": 692, + "padded": 692, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_physics|5": { + "hashes": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "5421d9a1af86cbd4", + "hash_cont_tokens": "f7b8097afc16a47c" + }, + "truncated": 0, + "non-truncated": 408, + "padded": 408, + "non-padded": 0, + "effective_few_shots": 5.0, + 
"num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-computer_security|5": { + "hashes": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "5e6b70ecb333cf18", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hashes": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "c2ef11a87264ceed", + "hash_cont_tokens": "aa0e8bc655f2f641" + }, + "truncated": 0, + "non-truncated": 940, + "padded": 940, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-econometrics|5": { + "hashes": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "ecaccd912a4c3978", + "hash_cont_tokens": "bfb7e3c3c88313f1" + }, + "truncated": 0, + "non-truncated": 456, + "padded": 456, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hashes": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "1590c84291399be8", + "hash_cont_tokens": "2425a3f084a591ef" + }, + "truncated": 0, + "non-truncated": 580, + "padded": 580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hashes": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "3269597f715b0da1", + "hash_cont_tokens": "f52691aef15a407b" + }, + "truncated": 0, + "non-truncated": 1512, + "padded": 1512, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-formal_logic|5": { + "hashes": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "a2800d20f3ab8d7c", + "hash_cont_tokens": "f515d598d9c21263" + }, + "truncated": 0, + "non-truncated": 504, + "padded": 504, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-global_facts|5": { + "hashes": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "94ed44b3772505ad", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_biology|5": { + "hashes": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "24423acb928db768", + "hash_cont_tokens": "bd85a4156a3613ee" + }, + "truncated": 0, + "non-truncated": 1240, + "padded": 1240, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hashes": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "831ff35c474e5cef", + "hash_cont_tokens": "a95c97af1c14e068" + }, + "truncated": 0, + "non-truncated": 812, + "padded": 812, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hashes": { + "hash_examples": "8b8cdb1084f24169", 
+ "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "8c34e0f2bda77358", + "hash_cont_tokens": "8abfedef914e33c9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hashes": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "f1f73dd687da18d7", + "hash_cont_tokens": "674fc454bdc5ac93" + }, + "truncated": 660, + "non-truncated": 0, + "padded": 0, + "non-padded": 660, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_geography|5": { + "hashes": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "7c5547c7da5bc793", + "hash_cont_tokens": "03a5012b916274ea" + }, + "truncated": 0, + "non-truncated": 792, + "padded": 792, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hashes": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "f62991cb6a496b05", + "hash_cont_tokens": "a83effb8f76b7d7c" + }, + "truncated": 0, + "non-truncated": 772, + "padded": 772, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hashes": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "4cef2aff6e3d59ed", + "hash_cont_tokens": "c583432ad27fcfe0" + }, + "truncated": 0, + "non-truncated": 1560, + "padded": 1560, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hashes": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "6e2577ea4082ed2b", + "hash_cont_tokens": "24f5dc613660300b" + }, + "truncated": 0, + "non-truncated": 1080, + "padded": 1080, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hashes": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "c5fc9aeb1079c8e4", + "hash_cont_tokens": "f47f041de50333b9" + }, + "truncated": 0, + "non-truncated": 952, + "padded": 952, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_physics|5": { + "hashes": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "555fc385cffa84ca", + "hash_cont_tokens": "ba2efcd283e938cc" + }, + "truncated": 0, + "non-truncated": 604, + "padded": 604, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hashes": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "febd23cbf9973b7f", + "hash_cont_tokens": "942069cd363844d9" + }, + "truncated": 0, + "non-truncated": 2180, + "padded": 2180, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hashes": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": 
"424b02981230ee83", + "hash_cont_tokens": "955ed42b6f7fa019" + }, + "truncated": 0, + "non-truncated": 864, + "padded": 864, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hashes": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "50c9ff438c85a69e", + "hash_cont_tokens": "cdd0b3dc06d933e5" + }, + "truncated": 816, + "non-truncated": 0, + "padded": 0, + "non-padded": 816, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hashes": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "054824cc474caef5", + "hash_cont_tokens": "9a864184946033ac" + }, + "truncated": 8, + "non-truncated": 940, + "padded": 940, + "non-padded": 8, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_aging|5": { + "hashes": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "541a75f071dcf579", + "hash_cont_tokens": "142a4a8a1138a214" + }, + "truncated": 0, + "non-truncated": 892, + "padded": 892, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_sexuality|5": { + "hashes": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "04269e5c5a257dd9", + "hash_cont_tokens": "bc54813e809b796d" + }, + "truncated": 0, + "non-truncated": 524, + "padded": 524, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-international_law|5": { + "hashes": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "d93ba9d9d38e4397", + "hash_cont_tokens": "dc45b45fcda18e5d" + }, + "truncated": 0, + "non-truncated": 484, + "padded": 484, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-jurisprudence|5": { + "hashes": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "9eeaccd2698b4f5a", + "hash_cont_tokens": "e3a8cd951b6e3469" + }, + "truncated": 0, + "non-truncated": 432, + "padded": 432, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hashes": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "b4f08f544f2b7576", + "hash_cont_tokens": "1e80dbd30f6453d5" + }, + "truncated": 0, + "non-truncated": 652, + "padded": 648, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-machine_learning|5": { + "hashes": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "900c2a51f1174b9f", + "hash_cont_tokens": "9b37da7777378ca9" + }, + "truncated": 0, + "non-truncated": 448, + "padded": 448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-management|5": { + "hashes": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "6b36efb4689c6eca", + "hash_cont_tokens": "a01d6d39a83c4597" + }, + "truncated": 0, + "non-truncated": 412, + "padded": 412, + "non-padded": 0, + 
"effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-marketing|5": { + "hashes": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "2aaac78a0cfed47a", + "hash_cont_tokens": "6aeaed4d823c98aa" + }, + "truncated": 0, + "non-truncated": 936, + "padded": 936, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-medical_genetics|5": { + "hashes": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "886ca823b41c094a", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-miscellaneous|5": { + "hashes": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "72fd71de7675e7d0", + "hash_cont_tokens": "9b0ab02a64603081" + }, + "truncated": 0, + "non-truncated": 3132, + "padded": 3132, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_disputes|5": { + "hashes": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "f3ca0dd8e7a1eb09", + "hash_cont_tokens": "8badf768f7b0467a" + }, + "truncated": 0, + "non-truncated": 1384, + "padded": 1354, + "non-padded": 30, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hashes": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "3e793631e951f23c", + "hash_cont_tokens": "32ae620376b2bbba" + }, + "truncated": 0, + "non-truncated": 3580, + "padded": 3580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-nutrition|5": { + "hashes": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "59753c2144ea93af", + "hash_cont_tokens": "3071def75bacc404" + }, + "truncated": 0, + "non-truncated": 1224, + "padded": 1224, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-philosophy|5": { + "hashes": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "bd8d3dbed15a8c34", + "hash_cont_tokens": "9f6ff69d23a48783" + }, + "truncated": 0, + "non-truncated": 1244, + "padded": 1244, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-prehistory|5": { + "hashes": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "3573cd87facbb7c5", + "hash_cont_tokens": "de469d2b981e32a3" + }, + "truncated": 0, + "non-truncated": 1296, + "padded": 1296, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_accounting|5": { + "hashes": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "17e721bc1a7cbb47", + "hash_cont_tokens": "c46f74d2dfc7b13b" + }, + "truncated": 0, + "non-truncated": 1128, + "padded": 1128, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_law|5": { + "hashes": { + "hash_examples": "cd9dbc52b3c932d6", + 
"hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "9178e10bd0763ec4", + "hash_cont_tokens": "2e590029ef41fbcd" + }, + "truncated": 604, + "non-truncated": 5532, + "padded": 5524, + "non-padded": 612, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_medicine|5": { + "hashes": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "f5a22012a54f70ea", + "hash_cont_tokens": "fe35cfa9c6ca802e" + }, + "truncated": 0, + "non-truncated": 1088, + "padded": 1088, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_psychology|5": { + "hashes": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "0dfb73a8eb3f692c", + "hash_cont_tokens": "f020fbddf72c8652" + }, + "truncated": 0, + "non-truncated": 2448, + "padded": 2448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-public_relations|5": { + "hashes": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "1710c6ba4c9f3cbd", + "hash_cont_tokens": "568f585a259965c1" + }, + "truncated": 0, + "non-truncated": 440, + "padded": 440, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-security_studies|5": { + "hashes": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "d49711415961ced7", + "hash_cont_tokens": "cc6fd7cccd64cd5d" + }, + "truncated": 0, + "non-truncated": 980, + "padded": 980, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-sociology|5": { + "hashes": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "828999f7624cbe7e", + "hash_cont_tokens": "c3a3bdfd177eed5b" + }, + "truncated": 0, + "non-truncated": 804, + "padded": 804, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hashes": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "42054621e718dbee", + "hash_cont_tokens": "2568d0e8e36fa959" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-virology|5": { + "hashes": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "6c4f0aa4dc859c04", + "hash_cont_tokens": "926cf60b0891f374" + }, + "truncated": 0, + "non-truncated": 664, + "padded": 664, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-world_religions|5": { + "hashes": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "6c75d44e092ff24f", + "hash_cont_tokens": "c525a5de974c1ea3" + }, + "truncated": 0, + "non-truncated": 684, + "padded": 684, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|truthfulqa:mc|0": { + "hashes": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "2738d7ed7075faa7", + "hash_cont_tokens": "c014154380b74b9e" + }, + "truncated": 0, + "non-truncated": 
9996, + "padded": 9996, + "non-padded": 0, + "effective_few_shots": 0.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "d84d18e9a963753d", + "hash_full_prompts": "12b540783521a8e6", + "hash_input_tokens": "6fecf578c508db6a", + "hash_cont_tokens": "fb1646e2bdd5fc38" + }, + "total_evaluation_time_secondes": "9915.394757032394", + "truncated": 2088, + "non-truncated": 108931, + "padded": 108834, + "non-padded": 2185, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/TheBloke/WizardLM-Uncensored-SuperCOT-StoryTelling-30B-GPTQ/results_2023-11-05T11-28-36.402381.json b/eval-results/TheBloke/WizardLM-Uncensored-SuperCOT-StoryTelling-30B-GPTQ/results_2023-11-05T11-28-36.402381.json new file mode 100644 index 0000000000000000000000000000000000000000..7f58ef49f8ea799f6e8cf347524b55d86ae10234 --- /dev/null +++ b/eval-results/TheBloke/WizardLM-Uncensored-SuperCOT-StoryTelling-30B-GPTQ/results_2023-11-05T11-28-36.402381.json @@ -0,0 +1,107 @@ +{ + "config_general": { + "lighteval_sha": "167773f1d5d1647c60dadc31c9e731ab7dbcbbad", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "", + "model_name": "TheBloke/WizardLM-Uncensored-SuperCOT-StoryTelling-30B-GPTQ", + "model_sha": "99c5cc3fe5dc91d1a1871e28c5959534beb9902c", + "model_dtype": "torch.float16", + "model_size": "15.83 GB" + }, + "results": { + "harness|drop|3": { + "em": 0.15971057046979867, + "em_stderr": 0.003751638050209854, + "f1": 0.25696203859060474, + "f1_stderr": 0.0038655056594378468 + }, + "harness|gsm8k|5": { + "acc": 0.0401819560272934, + "acc_stderr": 0.005409439736970484 + }, + "harness|winogrande|5": { + "acc": 0.6866614048934491, + "acc_stderr": 0.013036512096747983 + }, + "all": { + "em": 0.15971057046979867, + "em_stderr": 0.003751638050209854, + "f1": 0.25696203859060474, + "f1_stderr": 0.0038655056594378468, + "acc": 0.3634216804603712, + "acc_stderr": 0.009222975916859234 + } + }, + "versions": { + "all": 0, + "harness|drop|3": 1, + "harness|gsm8k|5": 0, + "harness|winogrande|5": 0 + }, + "config_tasks": { + "harness|drop": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|drop|3": { + "hashes": { + "hash_examples": "1d27416e8324e9a3", + "hash_full_prompts": "a5513ff9a741b385", + "hash_input_tokens": "afa3f956b5946008", + "hash_cont_tokens": "1ac2d5ec05649bbe" + }, + "truncated": 1263, + "non_truncated": 8273, + "padded": 0, + "non_padded": 9536, + "effective_few_shots": 3.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "6f81fd8346219949", + "hash_cont_tokens": "577be7ddd21c95b5" + }, + "truncated": 0, + "non_truncated": 1319, + "padded": 0, + "non_padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "b84988137a00c1f4", + "hash_cont_tokens": "f08975ad6f2d5864" + }, + "truncated": 0, + "non_truncated": 1267, + "padded": 2432, + "non_padded": 102, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "9b4d8993161e637d", + "hash_full_prompts": "08215e527b7e60a5", + "hash_input_tokens": "fb75bd68fb756923", + 
"hash_cont_tokens": "8fc0864f249a6e7b" + }, + "truncated": 1263, + "non_truncated": 10859, + "padded": 2432, + "non_padded": 10957, + "num_truncated_few_shots": 0, + "total_evaluation_time_secondes": 0 + } +} \ No newline at end of file diff --git a/eval-results/TheBloke/WizardLM-Uncensored-SuperCOT-StoryTelling-30B-GPTQ/results_2023-11-08T02-57-56.626250.json b/eval-results/TheBloke/WizardLM-Uncensored-SuperCOT-StoryTelling-30B-GPTQ/results_2023-11-08T02-57-56.626250.json new file mode 100644 index 0000000000000000000000000000000000000000..c84d716eb407f89ac271414699d20ce692d1a728 --- /dev/null +++ b/eval-results/TheBloke/WizardLM-Uncensored-SuperCOT-StoryTelling-30B-GPTQ/results_2023-11-08T02-57-56.626250.json @@ -0,0 +1,107 @@ +{ + "config_general": { + "lighteval_sha": "167773f1d5d1647c60dadc31c9e731ab7dbcbbad", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "", + "model_name": "TheBloke/WizardLM-Uncensored-SuperCOT-StoryTelling-30B-GPTQ", + "model_sha": "99c5cc3fe5dc91d1a1871e28c5959534beb9902c", + "model_dtype": "torch.float16", + "model_size": "15.83 GB" + }, + "results": { + "harness|drop|3": { + "em": 0.15992030201342283, + "em_stderr": 0.0037536320326496562, + "f1": 0.2571140939597322, + "f1_stderr": 0.0038666311684885475 + }, + "harness|gsm8k|5": { + "acc": 0.05307050796057619, + "acc_stderr": 0.006174868858638364 + }, + "harness|winogrande|5": { + "acc": 0.6866614048934491, + "acc_stderr": 0.013036512096747983 + }, + "all": { + "em": 0.15992030201342283, + "em_stderr": 0.0037536320326496562, + "f1": 0.2571140939597322, + "f1_stderr": 0.0038666311684885475, + "acc": 0.36986595642701264, + "acc_stderr": 0.009605690477693173 + } + }, + "versions": { + "all": 0, + "harness|drop|3": 1, + "harness|gsm8k|5": 0, + "harness|winogrande|5": 0 + }, + "config_tasks": { + "harness|drop": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|drop|3": { + "hashes": { + "hash_examples": "1d27416e8324e9a3", + "hash_full_prompts": "a5513ff9a741b385", + "hash_input_tokens": "afa3f956b5946008", + "hash_cont_tokens": "804519d7abfcd6d7" + }, + "truncated": 1263, + "non_truncated": 8273, + "padded": 0, + "non_padded": 9536, + "effective_few_shots": 3.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "6f81fd8346219949", + "hash_cont_tokens": "d9bf8a1074419ce7" + }, + "truncated": 0, + "non_truncated": 1319, + "padded": 0, + "non_padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "b84988137a00c1f4", + "hash_cont_tokens": "f08975ad6f2d5864" + }, + "truncated": 0, + "non_truncated": 1267, + "padded": 2432, + "non_padded": 102, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "9b4d8993161e637d", + "hash_full_prompts": "08215e527b7e60a5", + "hash_input_tokens": "fb75bd68fb756923", + "hash_cont_tokens": "9871e92411f6f05b" + }, + "truncated": 1263, + "non_truncated": 10859, + "padded": 2432, + "non_padded": 10957, + "num_truncated_few_shots": 0, + "total_evaluation_time_secondes": 0 + } +} \ No newline at end of file diff --git 
a/eval-results/TheBloke/airoboros-13B-HF/results_2023-07-19T19-05-45.973556.json b/eval-results/TheBloke/airoboros-13B-HF/results_2023-07-19T19-05-45.973556.json new file mode 100644 index 0000000000000000000000000000000000000000..8af8ff0961e1073e9fdadf25feb0a6f04792ab99 --- /dev/null +++ b/eval-results/TheBloke/airoboros-13B-HF/results_2023-07-19T19-05-45.973556.json @@ -0,0 +1,871 @@ +{ + "results": { + "harness|arc:challenge|25": { + "acc": 0.5554607508532423, + "acc_stderr": 0.01452122640562708, + "acc_norm": 0.5827645051194539, + "acc_norm_stderr": 0.014409825518403084 + }, + "harness|hellaswag|10": { + "acc": 0.6165106552479586, + "acc_stderr": 0.0048524208566314755, + "acc_norm": 0.8104959171479785, + "acc_norm_stderr": 0.003911075662883274 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.3, + "acc_stderr": 0.046056618647183814, + "acc_norm": 0.3, + "acc_norm_stderr": 0.046056618647183814 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.4888888888888889, + "acc_stderr": 0.04318275491977976, + "acc_norm": 0.4888888888888889, + "acc_norm_stderr": 0.04318275491977976 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.5, + "acc_stderr": 0.04068942293855797, + "acc_norm": 0.5, + "acc_norm_stderr": 0.04068942293855797 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.47, + "acc_stderr": 0.050161355804659205, + "acc_norm": 0.47, + "acc_norm_stderr": 0.050161355804659205 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.5509433962264151, + "acc_stderr": 0.030612730713641095, + "acc_norm": 0.5509433962264151, + "acc_norm_stderr": 0.030612730713641095 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.4652777777777778, + "acc_stderr": 0.04171115858181618, + "acc_norm": 0.4652777777777778, + "acc_norm_stderr": 0.04171115858181618 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.41, + "acc_stderr": 0.049431107042371025, + "acc_norm": 0.41, + "acc_norm_stderr": 0.049431107042371025 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.4, + "acc_stderr": 0.049236596391733084, + "acc_norm": 0.4, + "acc_norm_stderr": 0.049236596391733084 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.37, + "acc_stderr": 0.048523658709391, + "acc_norm": 0.37, + "acc_norm_stderr": 0.048523658709391 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.4508670520231214, + "acc_stderr": 0.03794012674697028, + "acc_norm": 0.4508670520231214, + "acc_norm_stderr": 0.03794012674697028 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.24509803921568626, + "acc_stderr": 0.042801058373643966, + "acc_norm": 0.24509803921568626, + "acc_norm_stderr": 0.042801058373643966 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.64, + "acc_stderr": 0.04824181513244218, + "acc_norm": 0.64, + "acc_norm_stderr": 0.04824181513244218 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.40425531914893614, + "acc_stderr": 0.03208115750788684, + "acc_norm": 0.40425531914893614, + "acc_norm_stderr": 0.03208115750788684 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.3157894736842105, + "acc_stderr": 0.043727482902780064, + "acc_norm": 0.3157894736842105, + "acc_norm_stderr": 0.043727482902780064 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.42758620689655175, + "acc_stderr": 0.04122737111370331, + "acc_norm": 0.42758620689655175, + "acc_norm_stderr": 0.04122737111370331 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 
0.2962962962962963, + "acc_stderr": 0.02351729433596329, + "acc_norm": 0.2962962962962963, + "acc_norm_stderr": 0.02351729433596329 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.3888888888888889, + "acc_stderr": 0.04360314860077459, + "acc_norm": 0.3888888888888889, + "acc_norm_stderr": 0.04360314860077459 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.36, + "acc_stderr": 0.048241815132442176, + "acc_norm": 0.36, + "acc_norm_stderr": 0.048241815132442176 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.5580645161290323, + "acc_stderr": 0.02825155790684974, + "acc_norm": 0.5580645161290323, + "acc_norm_stderr": 0.02825155790684974 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.3399014778325123, + "acc_stderr": 0.033327690684107895, + "acc_norm": 0.3399014778325123, + "acc_norm_stderr": 0.033327690684107895 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.54, + "acc_stderr": 0.05009082659620332, + "acc_norm": 0.54, + "acc_norm_stderr": 0.05009082659620332 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.6424242424242425, + "acc_stderr": 0.037425970438065864, + "acc_norm": 0.6424242424242425, + "acc_norm_stderr": 0.037425970438065864 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.5909090909090909, + "acc_stderr": 0.03502975799413007, + "acc_norm": 0.5909090909090909, + "acc_norm_stderr": 0.03502975799413007 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.7046632124352331, + "acc_stderr": 0.03292296639155141, + "acc_norm": 0.7046632124352331, + "acc_norm_stderr": 0.03292296639155141 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.48717948717948717, + "acc_stderr": 0.025342671293807257, + "acc_norm": 0.48717948717948717, + "acc_norm_stderr": 0.025342671293807257 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.3, + "acc_stderr": 0.02794045713622841, + "acc_norm": 0.3, + "acc_norm_stderr": 0.02794045713622841 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.5084033613445378, + "acc_stderr": 0.03247390276569669, + "acc_norm": 0.5084033613445378, + "acc_norm_stderr": 0.03247390276569669 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.271523178807947, + "acc_stderr": 0.03631329803969655, + "acc_norm": 0.271523178807947, + "acc_norm_stderr": 0.03631329803969655 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.6880733944954128, + "acc_stderr": 0.019862967976707245, + "acc_norm": 0.6880733944954128, + "acc_norm_stderr": 0.019862967976707245 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.3425925925925926, + "acc_stderr": 0.032365852526021574, + "acc_norm": 0.3425925925925926, + "acc_norm_stderr": 0.032365852526021574 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.6568627450980392, + "acc_stderr": 0.03332139944668086, + "acc_norm": 0.6568627450980392, + "acc_norm_stderr": 0.03332139944668086 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.6962025316455697, + "acc_stderr": 0.029936696387138615, + "acc_norm": 0.6962025316455697, + "acc_norm_stderr": 0.029936696387138615 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.5112107623318386, + "acc_stderr": 0.033549366530984746, + "acc_norm": 0.5112107623318386, + "acc_norm_stderr": 0.033549366530984746 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.5801526717557252, + 
"acc_stderr": 0.043285772152629715, + "acc_norm": 0.5801526717557252, + "acc_norm_stderr": 0.043285772152629715 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.5867768595041323, + "acc_stderr": 0.04495087843548408, + "acc_norm": 0.5867768595041323, + "acc_norm_stderr": 0.04495087843548408 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.5277777777777778, + "acc_stderr": 0.048262172941398944, + "acc_norm": 0.5277777777777778, + "acc_norm_stderr": 0.048262172941398944 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.5398773006134969, + "acc_stderr": 0.03915857291436971, + "acc_norm": 0.5398773006134969, + "acc_norm_stderr": 0.03915857291436971 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.33035714285714285, + "acc_stderr": 0.04464285714285714, + "acc_norm": 0.33035714285714285, + "acc_norm_stderr": 0.04464285714285714 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.6796116504854369, + "acc_stderr": 0.04620284082280041, + "acc_norm": 0.6796116504854369, + "acc_norm_stderr": 0.04620284082280041 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.7606837606837606, + "acc_stderr": 0.027951826808924333, + "acc_norm": 0.7606837606837606, + "acc_norm_stderr": 0.027951826808924333 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.54, + "acc_stderr": 0.05009082659620332, + "acc_norm": 0.54, + "acc_norm_stderr": 0.05009082659620332 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.6743295019157088, + "acc_stderr": 0.016757989458549675, + "acc_norm": 0.6743295019157088, + "acc_norm_stderr": 0.016757989458549675 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.5375722543352601, + "acc_stderr": 0.026842985519615375, + "acc_norm": 0.5375722543352601, + "acc_norm_stderr": 0.026842985519615375 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.33743016759776534, + "acc_stderr": 0.015813901283913055, + "acc_norm": 0.33743016759776534, + "acc_norm_stderr": 0.015813901283913055 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.5522875816993464, + "acc_stderr": 0.028472938478033533, + "acc_norm": 0.5522875816993464, + "acc_norm_stderr": 0.028472938478033533 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.5627009646302251, + "acc_stderr": 0.028173917761762892, + "acc_norm": 0.5627009646302251, + "acc_norm_stderr": 0.028173917761762892 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.5462962962962963, + "acc_stderr": 0.0277012284685426, + "acc_norm": 0.5462962962962963, + "acc_norm_stderr": 0.0277012284685426 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.36524822695035464, + "acc_stderr": 0.02872386385328128, + "acc_norm": 0.36524822695035464, + "acc_norm_stderr": 0.02872386385328128 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.3891786179921773, + "acc_stderr": 0.012452613934287008, + "acc_norm": 0.3891786179921773, + "acc_norm_stderr": 0.012452613934287008 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.5220588235294118, + "acc_stderr": 0.03034326422421352, + "acc_norm": 0.5220588235294118, + "acc_norm_stderr": 0.03034326422421352 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.4918300653594771, + "acc_stderr": 0.020225134343057265, + "acc_norm": 0.4918300653594771, + "acc_norm_stderr": 0.020225134343057265 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.6, + "acc_stderr": 0.0469237132203465, + "acc_norm": 0.6, + "acc_norm_stderr": 0.0469237132203465 + }, + 
"harness|hendrycksTest-security_studies|5": { + "acc": 0.5346938775510204, + "acc_stderr": 0.03193207024425314, + "acc_norm": 0.5346938775510204, + "acc_norm_stderr": 0.03193207024425314 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.6318407960199005, + "acc_stderr": 0.03410410565495301, + "acc_norm": 0.6318407960199005, + "acc_norm_stderr": 0.03410410565495301 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.77, + "acc_stderr": 0.042295258468165065, + "acc_norm": 0.77, + "acc_norm_stderr": 0.042295258468165065 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.41566265060240964, + "acc_stderr": 0.038367221765980515, + "acc_norm": 0.41566265060240964, + "acc_norm_stderr": 0.038367221765980515 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.7192982456140351, + "acc_stderr": 0.034462962170884265, + "acc_norm": 0.7192982456140351, + "acc_norm_stderr": 0.034462962170884265 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.3537331701346389, + "mc1_stderr": 0.01673781435884615, + "mc2": 0.5157191510287134, + "mc2_stderr": 0.015439781440728974 + }, + "all": { + "acc": 0.5032125181719386, + "acc_stderr": 0.03519753592604025, + "acc_norm": 0.5069631794628916, + "acc_norm_stderr": 0.03517969277212547, + "mc1": 0.3537331701346389, + "mc1_stderr": 0.01673781435884615, + "mc2": 0.5157191510287134, + "mc2_stderr": 0.015439781440728974 + } + }, + "versions": { + "harness|arc:challenge|25": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + 
"harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "all": 0 + }, + "config": { + "model_name": "TheBloke/airoboros-13B-HF", + "model_sha": "9219b61a0e8bc880e4cd0f8bebc48a97ee0950c7", + "model_dtype": "torch.float16", + "lighteval_sha": "43cff840721bd0214adb4e29236a5e2ca1813937", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null + }, + "task_config": { + "harness|arc:challenge": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + 
"harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task" + }, + "hashes": { + "harness|arc:challenge|25": { + "hash_examples": "fb8c51b1872daeda", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "61571bf68d6d89aa", + "hash_cont_tokens": "8210decc6ff6f7df" + }, + "harness|hellaswag|10": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "29906669b1c7054a", + "hash_cont_tokens": "b3b9e9017afa63af" + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "c54ff61ad0273dd7", + "hash_cont_tokens": "50421e30bef398f9" + }, + "harness|hendrycksTest-anatomy|5": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "be31a1e22aef5f90", + "hash_cont_tokens": "f11971a765cb609f" + }, + "harness|hendrycksTest-astronomy|5": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "277a7b1fad566940", + "hash_cont_tokens": "bf30e5d3f48250cb" + }, + "harness|hendrycksTest-business_ethics|5": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "ba552605bc116de5", + "hash_cont_tokens": "bc1dd9b2d995eb61" + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "428c7563d0b98ab9", + "hash_cont_tokens": "890a119624b3b935" + }, + "harness|hendrycksTest-college_biology|5": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "da036601573942e2", + "hash_cont_tokens": "875cde3af7a0ee14" + }, + "harness|hendrycksTest-college_chemistry|5": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "94e0196d6aded13d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "harness|hendrycksTest-college_computer_science|5": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + 
"hash_input_tokens": "6e4d0f4a8d36690b", + "hash_cont_tokens": "ffc0fe414cdc4a83" + }, + "harness|hendrycksTest-college_mathematics|5": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "614054d17109a25d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "harness|hendrycksTest-college_medicine|5": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "1d633b3cc0524ba8", + "hash_cont_tokens": "1f88b00d41957d82" + }, + "harness|hendrycksTest-college_physics|5": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "5421d9a1af86cbd4", + "hash_cont_tokens": "f7b8097afc16a47c" + }, + "harness|hendrycksTest-computer_security|5": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "5e6b70ecb333cf18", + "hash_cont_tokens": "50421e30bef398f9" + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "c2ef11a87264ceed", + "hash_cont_tokens": "aa0e8bc655f2f641" + }, + "harness|hendrycksTest-econometrics|5": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "ecaccd912a4c3978", + "hash_cont_tokens": "bfb7e3c3c88313f1" + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "1590c84291399be8", + "hash_cont_tokens": "2425a3f084a591ef" + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "3269597f715b0da1", + "hash_cont_tokens": "f52691aef15a407b" + }, + "harness|hendrycksTest-formal_logic|5": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "a2800d20f3ab8d7c", + "hash_cont_tokens": "f515d598d9c21263" + }, + "harness|hendrycksTest-global_facts|5": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "94ed44b3772505ad", + "hash_cont_tokens": "50421e30bef398f9" + }, + "harness|hendrycksTest-high_school_biology|5": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "24423acb928db768", + "hash_cont_tokens": "bd85a4156a3613ee" + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "831ff35c474e5cef", + "hash_cont_tokens": "a95c97af1c14e068" + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "8c34e0f2bda77358", + "hash_cont_tokens": "8abfedef914e33c9" + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "f1f73dd687da18d7", + "hash_cont_tokens": "674fc454bdc5ac93" + }, + "harness|hendrycksTest-high_school_geography|5": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "7c5547c7da5bc793", + "hash_cont_tokens": "03a5012b916274ea" + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": 
"ac42d888e1ce1155", + "hash_input_tokens": "f62991cb6a496b05", + "hash_cont_tokens": "a83effb8f76b7d7c" + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "4cef2aff6e3d59ed", + "hash_cont_tokens": "c583432ad27fcfe0" + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "6e2577ea4082ed2b", + "hash_cont_tokens": "24f5dc613660300b" + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "c5fc9aeb1079c8e4", + "hash_cont_tokens": "f47f041de50333b9" + }, + "harness|hendrycksTest-high_school_physics|5": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "555fc385cffa84ca", + "hash_cont_tokens": "ba2efcd283e938cc" + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "febd23cbf9973b7f", + "hash_cont_tokens": "942069cd363844d9" + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "424b02981230ee83", + "hash_cont_tokens": "955ed42b6f7fa019" + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "50c9ff438c85a69e", + "hash_cont_tokens": "cdd0b3dc06d933e5" + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "054824cc474caef5", + "hash_cont_tokens": "9a864184946033ac" + }, + "harness|hendrycksTest-human_aging|5": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "541a75f071dcf579", + "hash_cont_tokens": "142a4a8a1138a214" + }, + "harness|hendrycksTest-human_sexuality|5": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "04269e5c5a257dd9", + "hash_cont_tokens": "bc54813e809b796d" + }, + "harness|hendrycksTest-international_law|5": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "d93ba9d9d38e4397", + "hash_cont_tokens": "dc45b45fcda18e5d" + }, + "harness|hendrycksTest-jurisprudence|5": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "9eeaccd2698b4f5a", + "hash_cont_tokens": "e3a8cd951b6e3469" + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "b4f08f544f2b7576", + "hash_cont_tokens": "1e80dbd30f6453d5" + }, + "harness|hendrycksTest-machine_learning|5": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "900c2a51f1174b9f", + "hash_cont_tokens": "9b37da7777378ca9" + }, + "harness|hendrycksTest-management|5": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "6b36efb4689c6eca", + "hash_cont_tokens": "a01d6d39a83c4597" + }, + "harness|hendrycksTest-marketing|5": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": 
"77562bef997c7650", + "hash_input_tokens": "2aaac78a0cfed47a", + "hash_cont_tokens": "6aeaed4d823c98aa" + }, + "harness|hendrycksTest-medical_genetics|5": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "886ca823b41c094a", + "hash_cont_tokens": "50421e30bef398f9" + }, + "harness|hendrycksTest-miscellaneous|5": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "72fd71de7675e7d0", + "hash_cont_tokens": "9b0ab02a64603081" + }, + "harness|hendrycksTest-moral_disputes|5": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "f3ca0dd8e7a1eb09", + "hash_cont_tokens": "8badf768f7b0467a" + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "3e793631e951f23c", + "hash_cont_tokens": "32ae620376b2bbba" + }, + "harness|hendrycksTest-nutrition|5": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "59753c2144ea93af", + "hash_cont_tokens": "3071def75bacc404" + }, + "harness|hendrycksTest-philosophy|5": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "bd8d3dbed15a8c34", + "hash_cont_tokens": "9f6ff69d23a48783" + }, + "harness|hendrycksTest-prehistory|5": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "3573cd87facbb7c5", + "hash_cont_tokens": "de469d2b981e32a3" + }, + "harness|hendrycksTest-professional_accounting|5": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "17e721bc1a7cbb47", + "hash_cont_tokens": "c46f74d2dfc7b13b" + }, + "harness|hendrycksTest-professional_law|5": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "9178e10bd0763ec4", + "hash_cont_tokens": "2e590029ef41fbcd" + }, + "harness|hendrycksTest-professional_medicine|5": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "f5a22012a54f70ea", + "hash_cont_tokens": "fe35cfa9c6ca802e" + }, + "harness|hendrycksTest-professional_psychology|5": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "0dfb73a8eb3f692c", + "hash_cont_tokens": "f020fbddf72c8652" + }, + "harness|hendrycksTest-public_relations|5": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "1710c6ba4c9f3cbd", + "hash_cont_tokens": "568f585a259965c1" + }, + "harness|hendrycksTest-security_studies|5": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "d49711415961ced7", + "hash_cont_tokens": "cc6fd7cccd64cd5d" + }, + "harness|hendrycksTest-sociology|5": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "828999f7624cbe7e", + "hash_cont_tokens": "c3a3bdfd177eed5b" + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "42054621e718dbee", + "hash_cont_tokens": "2568d0e8e36fa959" + }, + "harness|hendrycksTest-virology|5": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "6c4f0aa4dc859c04", + 
"hash_cont_tokens": "926cf60b0891f374" + }, + "harness|hendrycksTest-world_religions|5": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "6c75d44e092ff24f", + "hash_cont_tokens": "c525a5de974c1ea3" + }, + "harness|truthfulqa:mc|0": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "2738d7ed7075faa7", + "hash_cont_tokens": "c014154380b74b9e" + } + } +} \ No newline at end of file diff --git a/eval-results/TheBloke/airoboros-13B-HF/results_2023-10-23T02-12-37.195873.json b/eval-results/TheBloke/airoboros-13B-HF/results_2023-10-23T02-12-37.195873.json new file mode 100644 index 0000000000000000000000000000000000000000..309736b86d129fc82815f229c4634334ced933fd --- /dev/null +++ b/eval-results/TheBloke/airoboros-13B-HF/results_2023-10-23T02-12-37.195873.json @@ -0,0 +1,107 @@ +{ + "config_general": { + "model_name": "TheBloke/airoboros-13B-HF", + "model_sha": "9219b61a0e8bc880e4cd0f8bebc48a97ee0950c7", + "model_size": "24.28 GB", + "model_dtype": "torch.float16", + "lighteval_sha": "0f318ecf002208468154899217b3ba7c6ae09374", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|drop|3": { + "em": 0.11115771812080537, + "em_stderr": 0.00321900621779522, + "f1": 0.18403838087248262, + "f1_stderr": 0.003410322751505753 + }, + "harness|gsm8k|5": { + "acc": 0.0712661106899166, + "acc_stderr": 0.007086462127954497 + }, + "harness|winogrande|5": { + "acc": 0.7624309392265194, + "acc_stderr": 0.011961298905803145 + }, + "all": { + "em": 0.11115771812080537, + "em_stderr": 0.00321900621779522, + "f1": 0.18403838087248262, + "f1_stderr": 0.003410322751505753, + "acc": 0.416848524958218, + "acc_stderr": 0.009523880516878821 + } + }, + "versions": { + "harness|drop|3": 1, + "harness|gsm8k|5": 0, + "harness|winogrande|5": 0, + "all": 0 + }, + "config_tasks": { + "harness|drop": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|drop|3": { + "hashes": { + "hash_examples": "1d27416e8324e9a3", + "hash_full_prompts": "a5513ff9a741b385", + "hash_input_tokens": "61b608e0b5ceed76", + "hash_cont_tokens": "dd2b3665ca43ee79" + }, + "truncated": 1263, + "non-truncated": 8273, + "padded": 0, + "non-padded": 9536, + "effective_few_shots": 3.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "bda342e47b5099b2", + "hash_cont_tokens": "eda1c818eedd5de7" + }, + "truncated": 0, + "non-truncated": 1319, + "padded": 0, + "non-padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "c0bedf98cb040854", + "hash_cont_tokens": "f08975ad6f2d5864" + }, + "truncated": 0, + "non-truncated": 2534, + "padded": 2432, + "non-padded": 102, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "9b4d8993161e637d", + "hash_full_prompts": "08215e527b7e60a5", + "hash_input_tokens": "80afe720f936f8d2", + "hash_cont_tokens": "df39183947f18139" + }, + "total_evaluation_time_secondes": "10874.687096357346", + "truncated": 1263, + "non-truncated": 12126, + "padded": 2432, + "non-padded": 
10957, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/TheBloke/airoboros-33B-gpt4-1-4-SuperHOT-8K-fp16/results_2023-08-22T21-17-33.530104.json b/eval-results/TheBloke/airoboros-33B-gpt4-1-4-SuperHOT-8K-fp16/results_2023-08-22T21-17-33.530104.json new file mode 100644 index 0000000000000000000000000000000000000000..fb2ead94581201f57664ff34d54654b76da3c1b0 --- /dev/null +++ b/eval-results/TheBloke/airoboros-33B-gpt4-1-4-SuperHOT-8K-fp16/results_2023-08-22T21-17-33.530104.json @@ -0,0 +1,1365 @@ +{ + "results": { + "harness|arc:challenge|25": { + "acc": 0.23037542662116042, + "acc_stderr": 0.01230492841874761, + "acc_norm": 0.26023890784982934, + "acc_norm_stderr": 0.012821930225112554 + }, + "harness|hellaswag|10": { + "acc": 0.2709619597689703, + "acc_stderr": 0.0044354815159093975, + "acc_norm": 0.30651264688309104, + "acc_norm_stderr": 0.004601029188459098 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.22, + "acc_stderr": 0.04163331998932268, + "acc_norm": 0.22, + "acc_norm_stderr": 0.04163331998932268 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.25925925925925924, + "acc_stderr": 0.03785714465066653, + "acc_norm": 0.25925925925925924, + "acc_norm_stderr": 0.03785714465066653 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.17763157894736842, + "acc_stderr": 0.031103182383123398, + "acc_norm": 0.17763157894736842, + "acc_norm_stderr": 0.031103182383123398 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.32, + "acc_stderr": 0.046882617226215034, + "acc_norm": 0.32, + "acc_norm_stderr": 0.046882617226215034 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.22264150943396227, + "acc_stderr": 0.025604233470899095, + "acc_norm": 0.22264150943396227, + "acc_norm_stderr": 0.025604233470899095 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.2569444444444444, + "acc_stderr": 0.03653946969442099, + "acc_norm": 0.2569444444444444, + "acc_norm_stderr": 0.03653946969442099 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.2, + "acc_stderr": 0.04020151261036844, + "acc_norm": 0.2, + "acc_norm_stderr": 0.04020151261036844 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.26, + "acc_stderr": 0.0440844002276808, + "acc_norm": 0.26, + "acc_norm_stderr": 0.0440844002276808 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.21, + "acc_stderr": 0.040936018074033256, + "acc_norm": 0.21, + "acc_norm_stderr": 0.040936018074033256 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.20809248554913296, + "acc_stderr": 0.030952890217749874, + "acc_norm": 0.20809248554913296, + "acc_norm_stderr": 0.030952890217749874 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.21568627450980393, + "acc_stderr": 0.04092563958237654, + "acc_norm": 0.21568627450980393, + "acc_norm_stderr": 0.04092563958237654 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.27, + "acc_stderr": 0.044619604333847394, + "acc_norm": 0.27, + "acc_norm_stderr": 0.044619604333847394 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.26382978723404255, + "acc_stderr": 0.028809989854102973, + "acc_norm": 0.26382978723404255, + "acc_norm_stderr": 0.028809989854102973 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.2631578947368421, + "acc_stderr": 0.0414243971948936, + "acc_norm": 0.2631578947368421, + "acc_norm_stderr": 0.0414243971948936 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 
0.2413793103448276, + "acc_stderr": 0.03565998174135302, + "acc_norm": 0.2413793103448276, + "acc_norm_stderr": 0.03565998174135302 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.20899470899470898, + "acc_stderr": 0.02094048156533486, + "acc_norm": 0.20899470899470898, + "acc_norm_stderr": 0.02094048156533486 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.2857142857142857, + "acc_stderr": 0.04040610178208841, + "acc_norm": 0.2857142857142857, + "acc_norm_stderr": 0.04040610178208841 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.18, + "acc_stderr": 0.038612291966536934, + "acc_norm": 0.18, + "acc_norm_stderr": 0.038612291966536934 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.267741935483871, + "acc_stderr": 0.025189006660212378, + "acc_norm": 0.267741935483871, + "acc_norm_stderr": 0.025189006660212378 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.15763546798029557, + "acc_stderr": 0.025639014131172404, + "acc_norm": 0.15763546798029557, + "acc_norm_stderr": 0.025639014131172404 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.25, + "acc_stderr": 0.04351941398892446, + "acc_norm": 0.25, + "acc_norm_stderr": 0.04351941398892446 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.22424242424242424, + "acc_stderr": 0.03256866661681102, + "acc_norm": 0.22424242424242424, + "acc_norm_stderr": 0.03256866661681102 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.18686868686868688, + "acc_stderr": 0.027772533334218977, + "acc_norm": 0.18686868686868688, + "acc_norm_stderr": 0.027772533334218977 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.18652849740932642, + "acc_stderr": 0.028112091210117485, + "acc_norm": 0.18652849740932642, + "acc_norm_stderr": 0.028112091210117485 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.2076923076923077, + "acc_stderr": 0.020567539567246797, + "acc_norm": 0.2076923076923077, + "acc_norm_stderr": 0.020567539567246797 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.24074074074074073, + "acc_stderr": 0.026067159222275794, + "acc_norm": 0.24074074074074073, + "acc_norm_stderr": 0.026067159222275794 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.25210084033613445, + "acc_stderr": 0.028205545033277723, + "acc_norm": 0.25210084033613445, + "acc_norm_stderr": 0.028205545033277723 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.2119205298013245, + "acc_stderr": 0.03336767086567976, + "acc_norm": 0.2119205298013245, + "acc_norm_stderr": 0.03336767086567976 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.1944954128440367, + "acc_stderr": 0.016970289090458047, + "acc_norm": 0.1944954128440367, + "acc_norm_stderr": 0.016970289090458047 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.1527777777777778, + "acc_stderr": 0.024536326026134224, + "acc_norm": 0.1527777777777778, + "acc_norm_stderr": 0.024536326026134224 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.24019607843137256, + "acc_stderr": 0.02998373305591362, + "acc_norm": 0.24019607843137256, + "acc_norm_stderr": 0.02998373305591362 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.2742616033755274, + "acc_stderr": 0.02904133351059804, + "acc_norm": 0.2742616033755274, + "acc_norm_stderr": 0.02904133351059804 + }, + 
"harness|hendrycksTest-human_aging|5": { + "acc": 0.31390134529147984, + "acc_stderr": 0.031146796482972465, + "acc_norm": 0.31390134529147984, + "acc_norm_stderr": 0.031146796482972465 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.2595419847328244, + "acc_stderr": 0.03844876139785271, + "acc_norm": 0.2595419847328244, + "acc_norm_stderr": 0.03844876139785271 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.2396694214876033, + "acc_stderr": 0.03896878985070417, + "acc_norm": 0.2396694214876033, + "acc_norm_stderr": 0.03896878985070417 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.2777777777777778, + "acc_stderr": 0.043300437496507437, + "acc_norm": 0.2777777777777778, + "acc_norm_stderr": 0.043300437496507437 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.22085889570552147, + "acc_stderr": 0.032591773927421776, + "acc_norm": 0.22085889570552147, + "acc_norm_stderr": 0.032591773927421776 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.3125, + "acc_stderr": 0.043994650575715215, + "acc_norm": 0.3125, + "acc_norm_stderr": 0.043994650575715215 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.17475728155339806, + "acc_stderr": 0.037601780060266224, + "acc_norm": 0.17475728155339806, + "acc_norm_stderr": 0.037601780060266224 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.29914529914529914, + "acc_stderr": 0.029996951858349497, + "acc_norm": 0.29914529914529914, + "acc_norm_stderr": 0.029996951858349497 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.3, + "acc_stderr": 0.046056618647183814, + "acc_norm": 0.3, + "acc_norm_stderr": 0.046056618647183814 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.23371647509578544, + "acc_stderr": 0.015133383278988837, + "acc_norm": 0.23371647509578544, + "acc_norm_stderr": 0.015133383278988837 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.24855491329479767, + "acc_stderr": 0.023267528432100174, + "acc_norm": 0.24855491329479767, + "acc_norm_stderr": 0.023267528432100174 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.24692737430167597, + "acc_stderr": 0.014422292204808835, + "acc_norm": 0.24692737430167597, + "acc_norm_stderr": 0.014422292204808835 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.1830065359477124, + "acc_stderr": 0.022140767512880976, + "acc_norm": 0.1830065359477124, + "acc_norm_stderr": 0.022140767512880976 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.1864951768488746, + "acc_stderr": 0.02212243977248077, + "acc_norm": 0.1864951768488746, + "acc_norm_stderr": 0.02212243977248077 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.2191358024691358, + "acc_stderr": 0.023016705640262203, + "acc_norm": 0.2191358024691358, + "acc_norm_stderr": 0.023016705640262203 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.22695035460992907, + "acc_stderr": 0.02498710636564297, + "acc_norm": 0.22695035460992907, + "acc_norm_stderr": 0.02498710636564297 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.2457627118644068, + "acc_stderr": 0.010996156635142692, + "acc_norm": 0.2457627118644068, + "acc_norm_stderr": 0.010996156635142692 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.18382352941176472, + "acc_stderr": 0.023529242185193106, + "acc_norm": 0.18382352941176472, + "acc_norm_stderr": 0.023529242185193106 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.25326797385620914, + "acc_stderr": 
0.01759348689536683, + "acc_norm": 0.25326797385620914, + "acc_norm_stderr": 0.01759348689536683 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.21818181818181817, + "acc_stderr": 0.03955932861795833, + "acc_norm": 0.21818181818181817, + "acc_norm_stderr": 0.03955932861795833 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.17959183673469387, + "acc_stderr": 0.024573293589585637, + "acc_norm": 0.17959183673469387, + "acc_norm_stderr": 0.024573293589585637 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.24378109452736318, + "acc_stderr": 0.03036049015401465, + "acc_norm": 0.24378109452736318, + "acc_norm_stderr": 0.03036049015401465 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.26, + "acc_stderr": 0.0440844002276808, + "acc_norm": 0.26, + "acc_norm_stderr": 0.0440844002276808 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.28313253012048195, + "acc_stderr": 0.03507295431370518, + "acc_norm": 0.28313253012048195, + "acc_norm_stderr": 0.03507295431370518 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.3157894736842105, + "acc_stderr": 0.035650796707083106, + "acc_norm": 0.3157894736842105, + "acc_norm_stderr": 0.035650796707083106 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.22031823745410037, + "mc1_stderr": 0.014509045171487283, + "mc2": 0.4792055778955594, + "mc2_stderr": 0.016809354273525978 + }, + "all": { + "acc": 0.23623967517263394, + "acc_stderr": 0.03091679562106068, + "acc_norm": 0.23734838989030832, + "acc_norm_stderr": 0.030928364256296522, + "mc1": 0.22031823745410037, + "mc1_stderr": 0.014509045171487283, + "mc2": 0.4792055778955594, + "mc2_stderr": 0.016809354273525978 + } + }, + "versions": { + "harness|arc:challenge|25": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + 
"harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "all": 0 + }, + "config_general": { + "model_name": "TheBloke/airoboros-33B-gpt4-1-4-SuperHOT-8K-fp16", + "model_sha": "53fdac1cdb8a37647e5dbe4199bc3fb70e617fce", + "model_dtype": "torch.float16", + "lighteval_sha": "2d7f9b0219a3536f201c55d7e8126251127b731c", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null + }, + "config_tasks": { + "harness|arc:challenge": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + 
"harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task" + }, + "summary_tasks": { + "harness|arc:challenge|25": { + "hashes": { + "hash_examples": "17b0cae357c0259e", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "3722289b79076c44", + "hash_cont_tokens": "8210decc6ff6f7df" + }, + "truncated": 0, + "non-truncated": 4687, + "padded": 4687, + "non-padded": 0, + "effective_few_shots": 25.0, + "num_truncated_few_shots": 0 + }, + "harness|hellaswag|10": { + "hashes": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "ececd684171f1ef2", + "hash_cont_tokens": "b3b9e9017afa63af" + }, + "truncated": 0, + "non-truncated": 40168, + "padded": 40113, + "non-padded": 55, + "effective_few_shots": 10.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hashes": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "c54ff61ad0273dd7", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-anatomy|5": { + "hashes": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "be31a1e22aef5f90", + "hash_cont_tokens": "f11971a765cb609f" + }, + "truncated": 0, + "non-truncated": 540, + "padded": 540, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-astronomy|5": { + "hashes": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "277a7b1fad566940", + "hash_cont_tokens": "bf30e5d3f48250cb" + }, + "truncated": 0, + "non-truncated": 608, + 
"padded": 608, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-business_ethics|5": { + "hashes": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "ba552605bc116de5", + "hash_cont_tokens": "bc1dd9b2d995eb61" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hashes": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "428c7563d0b98ab9", + "hash_cont_tokens": "890a119624b3b935" + }, + "truncated": 0, + "non-truncated": 1060, + "padded": 1060, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_biology|5": { + "hashes": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "da036601573942e2", + "hash_cont_tokens": "875cde3af7a0ee14" + }, + "truncated": 0, + "non-truncated": 576, + "padded": 576, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_chemistry|5": { + "hashes": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "94e0196d6aded13d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_computer_science|5": { + "hashes": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "6e4d0f4a8d36690b", + "hash_cont_tokens": "ffc0fe414cdc4a83" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_mathematics|5": { + "hashes": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "614054d17109a25d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_medicine|5": { + "hashes": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "081bb2b524defd1c", + "hash_cont_tokens": "1f88b00d41957d82" + }, + "truncated": 0, + "non-truncated": 692, + "padded": 692, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_physics|5": { + "hashes": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "5421d9a1af86cbd4", + "hash_cont_tokens": "f7b8097afc16a47c" + }, + "truncated": 0, + "non-truncated": 408, + "padded": 408, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-computer_security|5": { + "hashes": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "5e6b70ecb333cf18", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-conceptual_physics|5": 
{ + "hashes": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "c2ef11a87264ceed", + "hash_cont_tokens": "aa0e8bc655f2f641" + }, + "truncated": 0, + "non-truncated": 940, + "padded": 940, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-econometrics|5": { + "hashes": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "ecaccd912a4c3978", + "hash_cont_tokens": "bfb7e3c3c88313f1" + }, + "truncated": 0, + "non-truncated": 456, + "padded": 456, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hashes": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "1590c84291399be8", + "hash_cont_tokens": "2425a3f084a591ef" + }, + "truncated": 0, + "non-truncated": 580, + "padded": 580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hashes": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "3269597f715b0da1", + "hash_cont_tokens": "f52691aef15a407b" + }, + "truncated": 0, + "non-truncated": 1512, + "padded": 1512, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-formal_logic|5": { + "hashes": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "a2800d20f3ab8d7c", + "hash_cont_tokens": "f515d598d9c21263" + }, + "truncated": 0, + "non-truncated": 504, + "padded": 504, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-global_facts|5": { + "hashes": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "94ed44b3772505ad", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_biology|5": { + "hashes": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "24423acb928db768", + "hash_cont_tokens": "bd85a4156a3613ee" + }, + "truncated": 0, + "non-truncated": 1240, + "padded": 1240, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hashes": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "831ff35c474e5cef", + "hash_cont_tokens": "a95c97af1c14e068" + }, + "truncated": 0, + "non-truncated": 812, + "padded": 812, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hashes": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "a20a96b44dcc5b30", + "hash_cont_tokens": "8abfedef914e33c9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hashes": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": 
"5002f4ac8b1562ca", + "hash_cont_tokens": "674fc454bdc5ac93" + }, + "truncated": 0, + "non-truncated": 660, + "padded": 656, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_geography|5": { + "hashes": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "7c5547c7da5bc793", + "hash_cont_tokens": "03a5012b916274ea" + }, + "truncated": 0, + "non-truncated": 792, + "padded": 792, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hashes": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "f62991cb6a496b05", + "hash_cont_tokens": "a83effb8f76b7d7c" + }, + "truncated": 0, + "non-truncated": 772, + "padded": 772, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hashes": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "4cef2aff6e3d59ed", + "hash_cont_tokens": "c583432ad27fcfe0" + }, + "truncated": 0, + "non-truncated": 1560, + "padded": 1560, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hashes": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "6e2577ea4082ed2b", + "hash_cont_tokens": "24f5dc613660300b" + }, + "truncated": 0, + "non-truncated": 1080, + "padded": 1080, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hashes": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "c5fc9aeb1079c8e4", + "hash_cont_tokens": "f47f041de50333b9" + }, + "truncated": 0, + "non-truncated": 952, + "padded": 952, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_physics|5": { + "hashes": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "555fc385cffa84ca", + "hash_cont_tokens": "ba2efcd283e938cc" + }, + "truncated": 0, + "non-truncated": 604, + "padded": 604, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hashes": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "febd23cbf9973b7f", + "hash_cont_tokens": "942069cd363844d9" + }, + "truncated": 0, + "non-truncated": 2180, + "padded": 2180, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hashes": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "400e55b56ee6fbd7", + "hash_cont_tokens": "955ed42b6f7fa019" + }, + "truncated": 0, + "non-truncated": 864, + "padded": 864, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hashes": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "c639cce12a46ebad", + "hash_cont_tokens": "cdd0b3dc06d933e5" + }, + "truncated": 0, 
+ "non-truncated": 816, + "padded": 816, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hashes": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "b9762065cce6f3a6", + "hash_cont_tokens": "9a864184946033ac" + }, + "truncated": 0, + "non-truncated": 948, + "padded": 948, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_aging|5": { + "hashes": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "541a75f071dcf579", + "hash_cont_tokens": "142a4a8a1138a214" + }, + "truncated": 0, + "non-truncated": 892, + "padded": 892, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_sexuality|5": { + "hashes": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "04269e5c5a257dd9", + "hash_cont_tokens": "bc54813e809b796d" + }, + "truncated": 0, + "non-truncated": 524, + "padded": 524, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-international_law|5": { + "hashes": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "d93ba9d9d38e4397", + "hash_cont_tokens": "dc45b45fcda18e5d" + }, + "truncated": 0, + "non-truncated": 484, + "padded": 484, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-jurisprudence|5": { + "hashes": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "9eeaccd2698b4f5a", + "hash_cont_tokens": "e3a8cd951b6e3469" + }, + "truncated": 0, + "non-truncated": 432, + "padded": 432, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hashes": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "b4f08f544f2b7576", + "hash_cont_tokens": "1e80dbd30f6453d5" + }, + "truncated": 0, + "non-truncated": 652, + "padded": 648, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-machine_learning|5": { + "hashes": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "900c2a51f1174b9f", + "hash_cont_tokens": "9b37da7777378ca9" + }, + "truncated": 0, + "non-truncated": 448, + "padded": 448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-management|5": { + "hashes": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "6b36efb4689c6eca", + "hash_cont_tokens": "a01d6d39a83c4597" + }, + "truncated": 0, + "non-truncated": 412, + "padded": 412, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-marketing|5": { + "hashes": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "2aaac78a0cfed47a", + "hash_cont_tokens": "6aeaed4d823c98aa" + }, + "truncated": 0, + "non-truncated": 936, + "padded": 936, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-medical_genetics|5": 
{ + "hashes": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "886ca823b41c094a", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-miscellaneous|5": { + "hashes": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "72fd71de7675e7d0", + "hash_cont_tokens": "9b0ab02a64603081" + }, + "truncated": 0, + "non-truncated": 3132, + "padded": 3132, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_disputes|5": { + "hashes": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "f3ca0dd8e7a1eb09", + "hash_cont_tokens": "8badf768f7b0467a" + }, + "truncated": 0, + "non-truncated": 1384, + "padded": 1354, + "non-padded": 30, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hashes": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "3e793631e951f23c", + "hash_cont_tokens": "32ae620376b2bbba" + }, + "truncated": 0, + "non-truncated": 3580, + "padded": 3580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-nutrition|5": { + "hashes": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "59753c2144ea93af", + "hash_cont_tokens": "3071def75bacc404" + }, + "truncated": 0, + "non-truncated": 1224, + "padded": 1224, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-philosophy|5": { + "hashes": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "bd8d3dbed15a8c34", + "hash_cont_tokens": "9f6ff69d23a48783" + }, + "truncated": 0, + "non-truncated": 1244, + "padded": 1244, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-prehistory|5": { + "hashes": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "3573cd87facbb7c5", + "hash_cont_tokens": "de469d2b981e32a3" + }, + "truncated": 0, + "non-truncated": 1296, + "padded": 1296, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_accounting|5": { + "hashes": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "17e721bc1a7cbb47", + "hash_cont_tokens": "c46f74d2dfc7b13b" + }, + "truncated": 0, + "non-truncated": 1128, + "padded": 1128, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_law|5": { + "hashes": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "c9f7583fff66d361", + "hash_cont_tokens": "2e590029ef41fbcd" + }, + "truncated": 0, + "non-truncated": 6136, + "padded": 6136, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_medicine|5": { + "hashes": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "40a933f829116f8d", + 
"hash_cont_tokens": "fe35cfa9c6ca802e" + }, + "truncated": 0, + "non-truncated": 1088, + "padded": 1088, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_psychology|5": { + "hashes": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "0dfb73a8eb3f692c", + "hash_cont_tokens": "f020fbddf72c8652" + }, + "truncated": 0, + "non-truncated": 2448, + "padded": 2448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-public_relations|5": { + "hashes": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "1710c6ba4c9f3cbd", + "hash_cont_tokens": "568f585a259965c1" + }, + "truncated": 0, + "non-truncated": 440, + "padded": 440, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-security_studies|5": { + "hashes": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "32a03f1f22a6e103", + "hash_cont_tokens": "cc6fd7cccd64cd5d" + }, + "truncated": 0, + "non-truncated": 980, + "padded": 980, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-sociology|5": { + "hashes": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "828999f7624cbe7e", + "hash_cont_tokens": "c3a3bdfd177eed5b" + }, + "truncated": 0, + "non-truncated": 804, + "padded": 804, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hashes": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "42054621e718dbee", + "hash_cont_tokens": "2568d0e8e36fa959" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-virology|5": { + "hashes": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "6c4f0aa4dc859c04", + "hash_cont_tokens": "926cf60b0891f374" + }, + "truncated": 0, + "non-truncated": 664, + "padded": 664, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-world_religions|5": { + "hashes": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "6c75d44e092ff24f", + "hash_cont_tokens": "c525a5de974c1ea3" + }, + "truncated": 0, + "non-truncated": 684, + "padded": 684, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|truthfulqa:mc|0": { + "hashes": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "2738d7ed7075faa7", + "hash_cont_tokens": "c014154380b74b9e" + }, + "truncated": 0, + "non-truncated": 9996, + "padded": 9996, + "non-padded": 0, + "effective_few_shots": 0.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "d84d18e9a963753d", + "hash_full_prompts": "12b540783521a8e6", + "hash_input_tokens": "5c73a7dce6ccf737", + "hash_cont_tokens": "fb1646e2bdd5fc38" + }, + "total_evaluation_time_secondes": "13923.179090738297", + "truncated": 0, + "non-truncated": 111019, + "padded": 110926, + "non-padded": 93, + 
"num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/TheBloke/airoboros-7b-gpt4-fp16/results_2023-07-19T17-47-19.580481.json b/eval-results/TheBloke/airoboros-7b-gpt4-fp16/results_2023-07-19T17-47-19.580481.json new file mode 100644 index 0000000000000000000000000000000000000000..b7411c2e054671edfd0c1f4c22c76222be2240c0 --- /dev/null +++ b/eval-results/TheBloke/airoboros-7b-gpt4-fp16/results_2023-07-19T17-47-19.580481.json @@ -0,0 +1,871 @@ +{ + "results": { + "harness|arc:challenge|25": { + "acc": 0.49829351535836175, + "acc_stderr": 0.01461130570505699, + "acc_norm": 0.5307167235494881, + "acc_norm_stderr": 0.014583792546304038 + }, + "harness|hellaswag|10": { + "acc": 0.6005775741884087, + "acc_stderr": 0.004887787255353494, + "acc_norm": 0.7866958773152758, + "acc_norm_stderr": 0.004088034745195348 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.29, + "acc_stderr": 0.04560480215720683, + "acc_norm": 0.29, + "acc_norm_stderr": 0.04560480215720683 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.4222222222222222, + "acc_stderr": 0.04266763404099582, + "acc_norm": 0.4222222222222222, + "acc_norm_stderr": 0.04266763404099582 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.3881578947368421, + "acc_stderr": 0.03965842097512744, + "acc_norm": 0.3881578947368421, + "acc_norm_stderr": 0.03965842097512744 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.39, + "acc_stderr": 0.04902071300001974, + "acc_norm": 0.39, + "acc_norm_stderr": 0.04902071300001974 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.4490566037735849, + "acc_stderr": 0.030612730713641092, + "acc_norm": 0.4490566037735849, + "acc_norm_stderr": 0.030612730713641092 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.3819444444444444, + "acc_stderr": 0.040629907841466674, + "acc_norm": 0.3819444444444444, + "acc_norm_stderr": 0.040629907841466674 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.3, + "acc_stderr": 0.046056618647183814, + "acc_norm": 0.3, + "acc_norm_stderr": 0.046056618647183814 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.34, + "acc_stderr": 0.04760952285695235, + "acc_norm": 0.34, + "acc_norm_stderr": 0.04760952285695235 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.27, + "acc_stderr": 0.04461960433384741, + "acc_norm": 0.27, + "acc_norm_stderr": 0.04461960433384741 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.32947976878612717, + "acc_stderr": 0.03583901754736411, + "acc_norm": 0.32947976878612717, + "acc_norm_stderr": 0.03583901754736411 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.18627450980392157, + "acc_stderr": 0.038739587141493524, + "acc_norm": 0.18627450980392157, + "acc_norm_stderr": 0.038739587141493524 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.49, + "acc_stderr": 0.05024183937956911, + "acc_norm": 0.49, + "acc_norm_stderr": 0.05024183937956911 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.3574468085106383, + "acc_stderr": 0.03132941789476425, + "acc_norm": 0.3574468085106383, + "acc_norm_stderr": 0.03132941789476425 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.2719298245614035, + "acc_stderr": 0.04185774424022056, + "acc_norm": 0.2719298245614035, + "acc_norm_stderr": 0.04185774424022056 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.4068965517241379, + "acc_stderr": 0.04093793981266236, + "acc_norm": 
0.4068965517241379, + "acc_norm_stderr": 0.04093793981266236 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.2777777777777778, + "acc_stderr": 0.02306818884826111, + "acc_norm": 0.2777777777777778, + "acc_norm_stderr": 0.02306818884826111 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.2619047619047619, + "acc_stderr": 0.03932537680392871, + "acc_norm": 0.2619047619047619, + "acc_norm_stderr": 0.03932537680392871 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.4, + "acc_stderr": 0.049236596391733084, + "acc_norm": 0.4, + "acc_norm_stderr": 0.049236596391733084 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.4, + "acc_stderr": 0.027869320571664632, + "acc_norm": 0.4, + "acc_norm_stderr": 0.027869320571664632 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.3103448275862069, + "acc_stderr": 0.03255086769970103, + "acc_norm": 0.3103448275862069, + "acc_norm_stderr": 0.03255086769970103 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.37, + "acc_stderr": 0.048523658709391, + "acc_norm": 0.37, + "acc_norm_stderr": 0.048523658709391 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.23636363636363636, + "acc_stderr": 0.033175059300091805, + "acc_norm": 0.23636363636363636, + "acc_norm_stderr": 0.033175059300091805 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.494949494949495, + "acc_stderr": 0.035621707606254015, + "acc_norm": 0.494949494949495, + "acc_norm_stderr": 0.035621707606254015 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.5647668393782384, + "acc_stderr": 0.03578038165008586, + "acc_norm": 0.5647668393782384, + "acc_norm_stderr": 0.03578038165008586 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.3564102564102564, + "acc_stderr": 0.0242831405294673, + "acc_norm": 0.3564102564102564, + "acc_norm_stderr": 0.0242831405294673 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.22962962962962963, + "acc_stderr": 0.025644108639267624, + "acc_norm": 0.22962962962962963, + "acc_norm_stderr": 0.025644108639267624 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.3445378151260504, + "acc_stderr": 0.03086868260412163, + "acc_norm": 0.3445378151260504, + "acc_norm_stderr": 0.03086868260412163 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.2980132450331126, + "acc_stderr": 0.037345356767871984, + "acc_norm": 0.2980132450331126, + "acc_norm_stderr": 0.037345356767871984 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.5266055045871559, + "acc_stderr": 0.021406952688151574, + "acc_norm": 0.5266055045871559, + "acc_norm_stderr": 0.021406952688151574 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.3287037037037037, + "acc_stderr": 0.03203614084670058, + "acc_norm": 0.3287037037037037, + "acc_norm_stderr": 0.03203614084670058 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.25, + "acc_stderr": 0.03039153369274154, + "acc_norm": 0.25, + "acc_norm_stderr": 0.03039153369274154 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.4767932489451477, + "acc_stderr": 0.032512152011410174, + "acc_norm": 0.4767932489451477, + "acc_norm_stderr": 0.032512152011410174 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.484304932735426, + "acc_stderr": 0.0335412657542081, + "acc_norm": 0.484304932735426, + "acc_norm_stderr": 0.0335412657542081 + 
}, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.42748091603053434, + "acc_stderr": 0.04338920305792401, + "acc_norm": 0.42748091603053434, + "acc_norm_stderr": 0.04338920305792401 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.6198347107438017, + "acc_stderr": 0.04431324501968432, + "acc_norm": 0.6198347107438017, + "acc_norm_stderr": 0.04431324501968432 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.4074074074074074, + "acc_stderr": 0.047500773411999854, + "acc_norm": 0.4074074074074074, + "acc_norm_stderr": 0.047500773411999854 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.4539877300613497, + "acc_stderr": 0.0391170190467718, + "acc_norm": 0.4539877300613497, + "acc_norm_stderr": 0.0391170190467718 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.23214285714285715, + "acc_stderr": 0.040073418097558065, + "acc_norm": 0.23214285714285715, + "acc_norm_stderr": 0.040073418097558065 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.4174757281553398, + "acc_stderr": 0.04882840548212238, + "acc_norm": 0.4174757281553398, + "acc_norm_stderr": 0.04882840548212238 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.5726495726495726, + "acc_stderr": 0.032408473935163266, + "acc_norm": 0.5726495726495726, + "acc_norm_stderr": 0.032408473935163266 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.47, + "acc_stderr": 0.050161355804659205, + "acc_norm": 0.47, + "acc_norm_stderr": 0.050161355804659205 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.5632183908045977, + "acc_stderr": 0.017736470837800694, + "acc_norm": 0.5632183908045977, + "acc_norm_stderr": 0.017736470837800694 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.4277456647398844, + "acc_stderr": 0.026636539741116082, + "acc_norm": 0.4277456647398844, + "acc_norm_stderr": 0.026636539741116082 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.2424581005586592, + "acc_stderr": 0.014333522059217889, + "acc_norm": 0.2424581005586592, + "acc_norm_stderr": 0.014333522059217889 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.37254901960784315, + "acc_stderr": 0.0276841818833029, + "acc_norm": 0.37254901960784315, + "acc_norm_stderr": 0.0276841818833029 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.4340836012861736, + "acc_stderr": 0.028150232244535604, + "acc_norm": 0.4340836012861736, + "acc_norm_stderr": 0.028150232244535604 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.42592592592592593, + "acc_stderr": 0.027513747284379424, + "acc_norm": 0.42592592592592593, + "acc_norm_stderr": 0.027513747284379424 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.30851063829787234, + "acc_stderr": 0.027553366165101362, + "acc_norm": 0.30851063829787234, + "acc_norm_stderr": 0.027553366165101362 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.3344198174706649, + "acc_stderr": 0.01204966898321494, + "acc_norm": 0.3344198174706649, + "acc_norm_stderr": 0.01204966898321494 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.4522058823529412, + "acc_stderr": 0.030233758551596445, + "acc_norm": 0.4522058823529412, + "acc_norm_stderr": 0.030233758551596445 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.39052287581699346, + "acc_stderr": 0.019737008998094604, + "acc_norm": 0.39052287581699346, + "acc_norm_stderr": 0.019737008998094604 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.4636363636363636, 
+ "acc_stderr": 0.047764491623961985, + "acc_norm": 0.4636363636363636, + "acc_norm_stderr": 0.047764491623961985 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.3306122448979592, + "acc_stderr": 0.0301164262965406, + "acc_norm": 0.3306122448979592, + "acc_norm_stderr": 0.0301164262965406 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.5024875621890548, + "acc_stderr": 0.03535490150137288, + "acc_norm": 0.5024875621890548, + "acc_norm_stderr": 0.03535490150137288 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.55, + "acc_stderr": 0.05, + "acc_norm": 0.55, + "acc_norm_stderr": 0.05 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.3313253012048193, + "acc_stderr": 0.03664314777288085, + "acc_norm": 0.3313253012048193, + "acc_norm_stderr": 0.03664314777288085 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.543859649122807, + "acc_stderr": 0.03820042586602966, + "acc_norm": 0.543859649122807, + "acc_norm_stderr": 0.03820042586602966 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.27050183598531213, + "mc1_stderr": 0.015550778332842888, + "mc2": 0.40728190105019363, + "mc2_stderr": 0.014755907179912318 + }, + "all": { + "acc": 0.394202124616002, + "acc_stderr": 0.03480686217496621, + "acc_norm": 0.39790621802817144, + "acc_norm_stderr": 0.03479284072295077, + "mc1": 0.27050183598531213, + "mc1_stderr": 0.015550778332842888, + "mc2": 0.40728190105019363, + "mc2_stderr": 0.014755907179912318 + } + }, + "versions": { + "harness|arc:challenge|25": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + 
"harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "all": 0 + }, + "config": { + "model_name": "TheBloke/airoboros-7b-gpt4-fp16", + "model_sha": "14aa50fba9f6418c0d5e2d24087eb802931040ef", + "model_dtype": "torch.float16", + "lighteval_sha": "43cff840721bd0214adb4e29236a5e2ca1813937", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null + }, + "task_config": { + "harness|arc:challenge": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM 
Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task" + }, + "hashes": { + "harness|arc:challenge|25": { + "hash_examples": "fb8c51b1872daeda", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "3722289b79076c44", + "hash_cont_tokens": "8210decc6ff6f7df" + }, + "harness|hellaswag|10": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "ececd684171f1ef2", + "hash_cont_tokens": "b3b9e9017afa63af" + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "c54ff61ad0273dd7", + "hash_cont_tokens": "50421e30bef398f9" + }, + "harness|hendrycksTest-anatomy|5": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "be31a1e22aef5f90", + "hash_cont_tokens": "f11971a765cb609f" + }, + "harness|hendrycksTest-astronomy|5": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "277a7b1fad566940", + "hash_cont_tokens": "bf30e5d3f48250cb" + }, + "harness|hendrycksTest-business_ethics|5": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "ba552605bc116de5", + "hash_cont_tokens": "bc1dd9b2d995eb61" + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "428c7563d0b98ab9", + "hash_cont_tokens": "890a119624b3b935" + }, + "harness|hendrycksTest-college_biology|5": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "da036601573942e2", + "hash_cont_tokens": "875cde3af7a0ee14" + }, + "harness|hendrycksTest-college_chemistry|5": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "94e0196d6aded13d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "harness|hendrycksTest-college_computer_science|5": { + "hash_examples": 
"30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "6e4d0f4a8d36690b", + "hash_cont_tokens": "ffc0fe414cdc4a83" + }, + "harness|hendrycksTest-college_mathematics|5": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "614054d17109a25d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "harness|hendrycksTest-college_medicine|5": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "081bb2b524defd1c", + "hash_cont_tokens": "1f88b00d41957d82" + }, + "harness|hendrycksTest-college_physics|5": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "5421d9a1af86cbd4", + "hash_cont_tokens": "f7b8097afc16a47c" + }, + "harness|hendrycksTest-computer_security|5": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "5e6b70ecb333cf18", + "hash_cont_tokens": "50421e30bef398f9" + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "c2ef11a87264ceed", + "hash_cont_tokens": "aa0e8bc655f2f641" + }, + "harness|hendrycksTest-econometrics|5": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "ecaccd912a4c3978", + "hash_cont_tokens": "bfb7e3c3c88313f1" + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "1590c84291399be8", + "hash_cont_tokens": "2425a3f084a591ef" + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "3269597f715b0da1", + "hash_cont_tokens": "f52691aef15a407b" + }, + "harness|hendrycksTest-formal_logic|5": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "a2800d20f3ab8d7c", + "hash_cont_tokens": "f515d598d9c21263" + }, + "harness|hendrycksTest-global_facts|5": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "94ed44b3772505ad", + "hash_cont_tokens": "50421e30bef398f9" + }, + "harness|hendrycksTest-high_school_biology|5": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "24423acb928db768", + "hash_cont_tokens": "bd85a4156a3613ee" + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "831ff35c474e5cef", + "hash_cont_tokens": "a95c97af1c14e068" + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "a20a96b44dcc5b30", + "hash_cont_tokens": "8abfedef914e33c9" + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "5002f4ac8b1562ca", + "hash_cont_tokens": "674fc454bdc5ac93" + }, + "harness|hendrycksTest-high_school_geography|5": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "7c5547c7da5bc793", + "hash_cont_tokens": "03a5012b916274ea" + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + 
"hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "f62991cb6a496b05", + "hash_cont_tokens": "a83effb8f76b7d7c" + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "4cef2aff6e3d59ed", + "hash_cont_tokens": "c583432ad27fcfe0" + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "6e2577ea4082ed2b", + "hash_cont_tokens": "24f5dc613660300b" + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "c5fc9aeb1079c8e4", + "hash_cont_tokens": "f47f041de50333b9" + }, + "harness|hendrycksTest-high_school_physics|5": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "555fc385cffa84ca", + "hash_cont_tokens": "ba2efcd283e938cc" + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "febd23cbf9973b7f", + "hash_cont_tokens": "942069cd363844d9" + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "400e55b56ee6fbd7", + "hash_cont_tokens": "955ed42b6f7fa019" + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "c639cce12a46ebad", + "hash_cont_tokens": "cdd0b3dc06d933e5" + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "b9762065cce6f3a6", + "hash_cont_tokens": "9a864184946033ac" + }, + "harness|hendrycksTest-human_aging|5": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "541a75f071dcf579", + "hash_cont_tokens": "142a4a8a1138a214" + }, + "harness|hendrycksTest-human_sexuality|5": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "04269e5c5a257dd9", + "hash_cont_tokens": "bc54813e809b796d" + }, + "harness|hendrycksTest-international_law|5": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "d93ba9d9d38e4397", + "hash_cont_tokens": "dc45b45fcda18e5d" + }, + "harness|hendrycksTest-jurisprudence|5": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "9eeaccd2698b4f5a", + "hash_cont_tokens": "e3a8cd951b6e3469" + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "b4f08f544f2b7576", + "hash_cont_tokens": "1e80dbd30f6453d5" + }, + "harness|hendrycksTest-machine_learning|5": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "900c2a51f1174b9f", + "hash_cont_tokens": "9b37da7777378ca9" + }, + "harness|hendrycksTest-management|5": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "6b36efb4689c6eca", + "hash_cont_tokens": "a01d6d39a83c4597" + }, + "harness|hendrycksTest-marketing|5": { + 
"hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "2aaac78a0cfed47a", + "hash_cont_tokens": "6aeaed4d823c98aa" + }, + "harness|hendrycksTest-medical_genetics|5": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "886ca823b41c094a", + "hash_cont_tokens": "50421e30bef398f9" + }, + "harness|hendrycksTest-miscellaneous|5": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "72fd71de7675e7d0", + "hash_cont_tokens": "9b0ab02a64603081" + }, + "harness|hendrycksTest-moral_disputes|5": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "f3ca0dd8e7a1eb09", + "hash_cont_tokens": "8badf768f7b0467a" + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "3e793631e951f23c", + "hash_cont_tokens": "32ae620376b2bbba" + }, + "harness|hendrycksTest-nutrition|5": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "59753c2144ea93af", + "hash_cont_tokens": "3071def75bacc404" + }, + "harness|hendrycksTest-philosophy|5": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "bd8d3dbed15a8c34", + "hash_cont_tokens": "9f6ff69d23a48783" + }, + "harness|hendrycksTest-prehistory|5": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "3573cd87facbb7c5", + "hash_cont_tokens": "de469d2b981e32a3" + }, + "harness|hendrycksTest-professional_accounting|5": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "17e721bc1a7cbb47", + "hash_cont_tokens": "c46f74d2dfc7b13b" + }, + "harness|hendrycksTest-professional_law|5": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "c9f7583fff66d361", + "hash_cont_tokens": "2e590029ef41fbcd" + }, + "harness|hendrycksTest-professional_medicine|5": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "40a933f829116f8d", + "hash_cont_tokens": "fe35cfa9c6ca802e" + }, + "harness|hendrycksTest-professional_psychology|5": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "0dfb73a8eb3f692c", + "hash_cont_tokens": "f020fbddf72c8652" + }, + "harness|hendrycksTest-public_relations|5": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "1710c6ba4c9f3cbd", + "hash_cont_tokens": "568f585a259965c1" + }, + "harness|hendrycksTest-security_studies|5": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "32a03f1f22a6e103", + "hash_cont_tokens": "cc6fd7cccd64cd5d" + }, + "harness|hendrycksTest-sociology|5": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "828999f7624cbe7e", + "hash_cont_tokens": "c3a3bdfd177eed5b" + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "42054621e718dbee", + "hash_cont_tokens": "2568d0e8e36fa959" + }, + "harness|hendrycksTest-virology|5": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": 
"01e95325d8b738e4", + "hash_input_tokens": "6c4f0aa4dc859c04", + "hash_cont_tokens": "926cf60b0891f374" + }, + "harness|hendrycksTest-world_religions|5": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "6c75d44e092ff24f", + "hash_cont_tokens": "c525a5de974c1ea3" + }, + "harness|truthfulqa:mc|0": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "2738d7ed7075faa7", + "hash_cont_tokens": "c014154380b74b9e" + } + } +} \ No newline at end of file diff --git a/eval-results/TheBloke/airoboros-7b-gpt4-fp16/results_2023-10-22T11-48-44.859139.json b/eval-results/TheBloke/airoboros-7b-gpt4-fp16/results_2023-10-22T11-48-44.859139.json new file mode 100644 index 0000000000000000000000000000000000000000..37eaca8c1f3d97d2077b694fdcc7393e6bdcf76c --- /dev/null +++ b/eval-results/TheBloke/airoboros-7b-gpt4-fp16/results_2023-10-22T11-48-44.859139.json @@ -0,0 +1,107 @@ +{ + "config_general": { + "model_name": "TheBloke/airoboros-7b-gpt4-fp16", + "model_sha": "14aa50fba9f6418c0d5e2d24087eb802931040ef", + "model_size": "12.61 GB", + "model_dtype": "torch.float16", + "lighteval_sha": "0f318ecf002208468154899217b3ba7c6ae09374", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|drop|3": { + "em": 0.24276426174496643, + "em_stderr": 0.004390839668047224, + "f1": 0.3038569630872493, + "f1_stderr": 0.004387376487144696 + }, + "harness|gsm8k|5": { + "acc": 0.017437452615617893, + "acc_stderr": 0.0036054868679982572 + }, + "harness|winogrande|5": { + "acc": 0.7308602999210734, + "acc_stderr": 0.012464911951268738 + }, + "all": { + "em": 0.24276426174496643, + "em_stderr": 0.004390839668047224, + "f1": 0.3038569630872493, + "f1_stderr": 0.004387376487144696, + "acc": 0.37414887626834564, + "acc_stderr": 0.008035199409633497 + } + }, + "versions": { + "harness|drop|3": 1, + "harness|gsm8k|5": 0, + "harness|winogrande|5": 0, + "all": 0 + }, + "config_tasks": { + "harness|drop": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|drop|3": { + "hashes": { + "hash_examples": "1d27416e8324e9a3", + "hash_full_prompts": "a5513ff9a741b385", + "hash_input_tokens": "42076f0efbb50aa6", + "hash_cont_tokens": "ef26abc9e18cab11" + }, + "truncated": 3, + "non-truncated": 9533, + "padded": 0, + "non-padded": 9536, + "effective_few_shots": 3.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "bda342e47b5099b2", + "hash_cont_tokens": "4f67f77046f200af" + }, + "truncated": 0, + "non-truncated": 1319, + "padded": 0, + "non-padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "c0bedf98cb040854", + "hash_cont_tokens": "f08975ad6f2d5864" + }, + "truncated": 0, + "non-truncated": 2534, + "padded": 2432, + "non-padded": 102, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "9b4d8993161e637d", + "hash_full_prompts": "08215e527b7e60a5", + "hash_input_tokens": "a12f3e3c934bd78b", + "hash_cont_tokens": "ef7dad37484dbdb2" + }, + "total_evaluation_time_secondes": 
"20603.22762107849", + "truncated": 3, + "non-truncated": 13386, + "padded": 2432, + "non-padded": 10957, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/TheBloke/alpaca-lora-65B-HF/results_2023-07-25T19-46-53.347899.json b/eval-results/TheBloke/alpaca-lora-65B-HF/results_2023-07-25T19-46-53.347899.json new file mode 100644 index 0000000000000000000000000000000000000000..b5fd15eb03c4838a7285ffd83ea307e2880f63ea --- /dev/null +++ b/eval-results/TheBloke/alpaca-lora-65B-HF/results_2023-07-25T19-46-53.347899.json @@ -0,0 +1,1365 @@ +{ + "results": { + "harness|arc:challenge|25": { + "acc": 0.6237201365187713, + "acc_stderr": 0.014157022555407163, + "acc_norm": 0.6484641638225256, + "acc_norm_stderr": 0.013952413699600935 + }, + "harness|hellaswag|10": { + "acc": 0.6719776936865166, + "acc_stderr": 0.004685334844038669, + "acc_norm": 0.8559051981676957, + "acc_norm_stderr": 0.0035046810917039014 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.28, + "acc_stderr": 0.045126085985421276, + "acc_norm": 0.28, + "acc_norm_stderr": 0.045126085985421276 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.5333333333333333, + "acc_stderr": 0.043097329010363554, + "acc_norm": 0.5333333333333333, + "acc_norm_stderr": 0.043097329010363554 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.7171052631578947, + "acc_stderr": 0.03665349695640767, + "acc_norm": 0.7171052631578947, + "acc_norm_stderr": 0.03665349695640767 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.58, + "acc_stderr": 0.049604496374885836, + "acc_norm": 0.58, + "acc_norm_stderr": 0.049604496374885836 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.6641509433962264, + "acc_stderr": 0.02906722014664483, + "acc_norm": 0.6641509433962264, + "acc_norm_stderr": 0.02906722014664483 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.6597222222222222, + "acc_stderr": 0.039621355734862175, + "acc_norm": 0.6597222222222222, + "acc_norm_stderr": 0.039621355734862175 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.51, + "acc_stderr": 0.05024183937956912, + "acc_norm": 0.51, + "acc_norm_stderr": 0.05024183937956912 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.45, + "acc_stderr": 0.049999999999999996, + "acc_norm": 0.45, + "acc_norm_stderr": 0.049999999999999996 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.36, + "acc_stderr": 0.04824181513244218, + "acc_norm": 0.36, + "acc_norm_stderr": 0.04824181513244218 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.5549132947976878, + "acc_stderr": 0.03789401760283647, + "acc_norm": 0.5549132947976878, + "acc_norm_stderr": 0.03789401760283647 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.3431372549019608, + "acc_stderr": 0.047240073523838876, + "acc_norm": 0.3431372549019608, + "acc_norm_stderr": 0.047240073523838876 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.76, + "acc_stderr": 0.04292346959909281, + "acc_norm": 0.76, + "acc_norm_stderr": 0.04292346959909281 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.6, + "acc_stderr": 0.03202563076101736, + "acc_norm": 0.6, + "acc_norm_stderr": 0.03202563076101736 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.34210526315789475, + "acc_stderr": 0.04462917535336936, + "acc_norm": 0.34210526315789475, + "acc_norm_stderr": 0.04462917535336936 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 
0.5448275862068965, + "acc_stderr": 0.04149886942192118, + "acc_norm": 0.5448275862068965, + "acc_norm_stderr": 0.04149886942192118 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.3888888888888889, + "acc_stderr": 0.02510742548113728, + "acc_norm": 0.3888888888888889, + "acc_norm_stderr": 0.02510742548113728 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.4444444444444444, + "acc_stderr": 0.044444444444444495, + "acc_norm": 0.4444444444444444, + "acc_norm_stderr": 0.044444444444444495 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.38, + "acc_stderr": 0.048783173121456316, + "acc_norm": 0.38, + "acc_norm_stderr": 0.048783173121456316 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.7258064516129032, + "acc_stderr": 0.025378139970885196, + "acc_norm": 0.7258064516129032, + "acc_norm_stderr": 0.025378139970885196 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.41379310344827586, + "acc_stderr": 0.03465304488406796, + "acc_norm": 0.41379310344827586, + "acc_norm_stderr": 0.03465304488406796 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.67, + "acc_stderr": 0.047258156262526066, + "acc_norm": 0.67, + "acc_norm_stderr": 0.047258156262526066 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.7757575757575758, + "acc_stderr": 0.03256866661681102, + "acc_norm": 0.7757575757575758, + "acc_norm_stderr": 0.03256866661681102 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.803030303030303, + "acc_stderr": 0.028335609732463355, + "acc_norm": 0.803030303030303, + "acc_norm_stderr": 0.028335609732463355 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.8601036269430051, + "acc_stderr": 0.02503387058301518, + "acc_norm": 0.8601036269430051, + "acc_norm_stderr": 0.02503387058301518 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.6564102564102564, + "acc_stderr": 0.024078696580635467, + "acc_norm": 0.6564102564102564, + "acc_norm_stderr": 0.024078696580635467 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.31851851851851853, + "acc_stderr": 0.02840653309060846, + "acc_norm": 0.31851851851851853, + "acc_norm_stderr": 0.02840653309060846 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.6932773109243697, + "acc_stderr": 0.029953823891887055, + "acc_norm": 0.6932773109243697, + "acc_norm_stderr": 0.029953823891887055 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.3841059602649007, + "acc_stderr": 0.03971301814719198, + "acc_norm": 0.3841059602649007, + "acc_norm_stderr": 0.03971301814719198 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.8422018348623853, + "acc_stderr": 0.015630022970092444, + "acc_norm": 0.8422018348623853, + "acc_norm_stderr": 0.015630022970092444 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.5509259259259259, + "acc_stderr": 0.03392238405321617, + "acc_norm": 0.5509259259259259, + "acc_norm_stderr": 0.03392238405321617 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.8284313725490197, + "acc_stderr": 0.026460569561240634, + "acc_norm": 0.8284313725490197, + "acc_norm_stderr": 0.026460569561240634 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.810126582278481, + "acc_stderr": 0.02553010046023349, + "acc_norm": 0.810126582278481, + "acc_norm_stderr": 0.02553010046023349 + }, + "harness|hendrycksTest-human_aging|5": { + 
"acc": 0.6816143497757847, + "acc_stderr": 0.03126580522513713, + "acc_norm": 0.6816143497757847, + "acc_norm_stderr": 0.03126580522513713 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.7786259541984732, + "acc_stderr": 0.0364129708131373, + "acc_norm": 0.7786259541984732, + "acc_norm_stderr": 0.0364129708131373 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.8016528925619835, + "acc_stderr": 0.03640118271990946, + "acc_norm": 0.8016528925619835, + "acc_norm_stderr": 0.03640118271990946 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.7222222222222222, + "acc_stderr": 0.04330043749650742, + "acc_norm": 0.7222222222222222, + "acc_norm_stderr": 0.04330043749650742 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.7791411042944786, + "acc_stderr": 0.03259177392742179, + "acc_norm": 0.7791411042944786, + "acc_norm_stderr": 0.03259177392742179 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.5089285714285714, + "acc_stderr": 0.04745033255489122, + "acc_norm": 0.5089285714285714, + "acc_norm_stderr": 0.04745033255489122 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.8252427184466019, + "acc_stderr": 0.03760178006026621, + "acc_norm": 0.8252427184466019, + "acc_norm_stderr": 0.03760178006026621 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.8675213675213675, + "acc_stderr": 0.022209309073165616, + "acc_norm": 0.8675213675213675, + "acc_norm_stderr": 0.022209309073165616 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.71, + "acc_stderr": 0.045604802157206845, + "acc_norm": 0.71, + "acc_norm_stderr": 0.045604802157206845 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.8058748403575989, + "acc_stderr": 0.01414397027665757, + "acc_norm": 0.8058748403575989, + "acc_norm_stderr": 0.01414397027665757 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.7398843930635838, + "acc_stderr": 0.023618678310069363, + "acc_norm": 0.7398843930635838, + "acc_norm_stderr": 0.023618678310069363 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.4793296089385475, + "acc_stderr": 0.016708205559996137, + "acc_norm": 0.4793296089385475, + "acc_norm_stderr": 0.016708205559996137 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.6764705882352942, + "acc_stderr": 0.026787453111906497, + "acc_norm": 0.6764705882352942, + "acc_norm_stderr": 0.026787453111906497 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.729903536977492, + "acc_stderr": 0.02521804037341063, + "acc_norm": 0.729903536977492, + "acc_norm_stderr": 0.02521804037341063 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.7345679012345679, + "acc_stderr": 0.02456922360046085, + "acc_norm": 0.7345679012345679, + "acc_norm_stderr": 0.02456922360046085 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.46808510638297873, + "acc_stderr": 0.029766675075873862, + "acc_norm": 0.46808510638297873, + "acc_norm_stderr": 0.029766675075873862 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.48826597131681876, + "acc_stderr": 0.01276671901968672, + "acc_norm": 0.48826597131681876, + "acc_norm_stderr": 0.01276671901968672 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.6139705882352942, + "acc_stderr": 0.029573269134411124, + "acc_norm": 0.6139705882352942, + "acc_norm_stderr": 0.029573269134411124 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.6584967320261438, + "acc_stderr": 0.01918463932809249, + "acc_norm": 0.6584967320261438, 
+ "acc_norm_stderr": 0.01918463932809249 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.6909090909090909, + "acc_stderr": 0.044262946482000985, + "acc_norm": 0.6909090909090909, + "acc_norm_stderr": 0.044262946482000985 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.7346938775510204, + "acc_stderr": 0.028263889943784606, + "acc_norm": 0.7346938775510204, + "acc_norm_stderr": 0.028263889943784606 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.835820895522388, + "acc_stderr": 0.02619392354445412, + "acc_norm": 0.835820895522388, + "acc_norm_stderr": 0.02619392354445412 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.85, + "acc_stderr": 0.0358870281282637, + "acc_norm": 0.85, + "acc_norm_stderr": 0.0358870281282637 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.5240963855421686, + "acc_stderr": 0.03887971849597264, + "acc_norm": 0.5240963855421686, + "acc_norm_stderr": 0.03887971849597264 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.8245614035087719, + "acc_stderr": 0.02917088550072767, + "acc_norm": 0.8245614035087719, + "acc_norm_stderr": 0.02917088550072767 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.31211750305997554, + "mc1_stderr": 0.01622075676952093, + "mc2": 0.45147495688569755, + "mc2_stderr": 0.014204440495311351 + }, + "all": { + "acc": 0.6317066991099983, + "acc_stderr": 0.03304743342622785, + "acc_norm": 0.6352435047334719, + "acc_norm_stderr": 0.033023954398971225, + "mc1": 0.31211750305997554, + "mc1_stderr": 0.01622075676952093, + "mc2": 0.45147495688569755, + "mc2_stderr": 0.014204440495311351 + } + }, + "versions": { + "harness|arc:challenge|25": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + 
"harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "all": 0 + }, + "config_general": { + "model_name": "TheBloke/alpaca-lora-65B-HF", + "model_sha": "113b61b37a2862b950ada68620e57acafbcefe13", + "model_dtype": "torch.float16", + "lighteval_sha": "03c2fad20ff7f5334c33cfee459024b8d7e4a109", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null + }, + "config_tasks": { + "harness|arc:challenge": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + 
"harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task" + }, + "summary_tasks": { + "harness|arc:challenge|25": { + "hashes": { + "hash_examples": "17b0cae357c0259e", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "61571bf68d6d89aa", + "hash_cont_tokens": "ede2b335438f08e9" + }, + "truncated": 0, + "non-truncated": 4687, + "padded": 4687, + "non-padded": 0, + "effective_few_shots": 25.0, + "num_truncated_few_shots": 0 + }, + "harness|hellaswag|10": { + "hashes": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "29906669b1c7054a", + "hash_cont_tokens": "b41cf1ad182d68d5" + }, + "truncated": 0, + "non-truncated": 40168, + "padded": 40113, + "non-padded": 55, + "effective_few_shots": 10.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hashes": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "c54ff61ad0273dd7", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-anatomy|5": { + "hashes": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "be31a1e22aef5f90", + "hash_cont_tokens": "f11971a765cb609f" + }, + "truncated": 0, + "non-truncated": 540, + "padded": 540, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-astronomy|5": { + "hashes": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "277a7b1fad566940", + "hash_cont_tokens": "238bd86950544b29" + }, + "truncated": 0, + "non-truncated": 608, + "padded": 608, + "non-padded": 0, + "effective_few_shots": 5.0, + 
"num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-business_ethics|5": { + "hashes": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "ba552605bc116de5", + "hash_cont_tokens": "f9d6d2a7d7e9a041" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hashes": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "428c7563d0b98ab9", + "hash_cont_tokens": "6af58623d0d5fbcd" + }, + "truncated": 0, + "non-truncated": 1060, + "padded": 1060, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_biology|5": { + "hashes": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "da036601573942e2", + "hash_cont_tokens": "875cde3af7a0ee14" + }, + "truncated": 0, + "non-truncated": 576, + "padded": 576, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_chemistry|5": { + "hashes": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "94e0196d6aded13d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_computer_science|5": { + "hashes": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "6e4d0f4a8d36690b", + "hash_cont_tokens": "1ba0c71186b1505e" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_mathematics|5": { + "hashes": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "614054d17109a25d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_medicine|5": { + "hashes": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "1d633b3cc0524ba8", + "hash_cont_tokens": "702fb6d82ff0d6ac" + }, + "truncated": 0, + "non-truncated": 692, + "padded": 692, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_physics|5": { + "hashes": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "5421d9a1af86cbd4", + "hash_cont_tokens": "f7b8097afc16a47c" + }, + "truncated": 0, + "non-truncated": 408, + "padded": 408, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-computer_security|5": { + "hashes": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "5e6b70ecb333cf18", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hashes": { + "hash_examples": "8874ece872d2ca4c", + 
"hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "c2ef11a87264ceed", + "hash_cont_tokens": "aa0e8bc655f2f641" + }, + "truncated": 0, + "non-truncated": 940, + "padded": 940, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-econometrics|5": { + "hashes": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "ecaccd912a4c3978", + "hash_cont_tokens": "a9b1f761089f6acc" + }, + "truncated": 0, + "non-truncated": 456, + "padded": 456, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hashes": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "1590c84291399be8", + "hash_cont_tokens": "2425a3f084a591ef" + }, + "truncated": 0, + "non-truncated": 580, + "padded": 580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hashes": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "3269597f715b0da1", + "hash_cont_tokens": "eb2d5002052b5bc5" + }, + "truncated": 0, + "non-truncated": 1512, + "padded": 1512, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-formal_logic|5": { + "hashes": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "a2800d20f3ab8d7c", + "hash_cont_tokens": "9b30dc19c9b62f60" + }, + "truncated": 0, + "non-truncated": 504, + "padded": 504, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-global_facts|5": { + "hashes": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "94ed44b3772505ad", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_biology|5": { + "hashes": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "24423acb928db768", + "hash_cont_tokens": "74217a4e2868536f" + }, + "truncated": 0, + "non-truncated": 1240, + "padded": 1240, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hashes": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "831ff35c474e5cef", + "hash_cont_tokens": "bf39544be0ebf000" + }, + "truncated": 0, + "non-truncated": 812, + "padded": 812, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hashes": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "8c34e0f2bda77358", + "hash_cont_tokens": "43570b3948564b64" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hashes": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "f1f73dd687da18d7", + "hash_cont_tokens": "674fc454bdc5ac93" 
+ }, + "truncated": 660, + "non-truncated": 0, + "padded": 0, + "non-padded": 660, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_geography|5": { + "hashes": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "7c5547c7da5bc793", + "hash_cont_tokens": "03a5012b916274ea" + }, + "truncated": 0, + "non-truncated": 792, + "padded": 792, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hashes": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "f62991cb6a496b05", + "hash_cont_tokens": "50ab225c2f535210" + }, + "truncated": 0, + "non-truncated": 772, + "padded": 772, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hashes": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "4cef2aff6e3d59ed", + "hash_cont_tokens": "c583432ad27fcfe0" + }, + "truncated": 0, + "non-truncated": 1560, + "padded": 1560, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hashes": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "6e2577ea4082ed2b", + "hash_cont_tokens": "1194078d4e38c984" + }, + "truncated": 0, + "non-truncated": 1080, + "padded": 1080, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hashes": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "c5fc9aeb1079c8e4", + "hash_cont_tokens": "f47f041de50333b9" + }, + "truncated": 0, + "non-truncated": 952, + "padded": 952, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_physics|5": { + "hashes": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "555fc385cffa84ca", + "hash_cont_tokens": "6296151cf7fee15c" + }, + "truncated": 0, + "non-truncated": 604, + "padded": 604, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hashes": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "febd23cbf9973b7f", + "hash_cont_tokens": "a490d3db0ea5935a" + }, + "truncated": 0, + "non-truncated": 2180, + "padded": 2180, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hashes": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "424b02981230ee83", + "hash_cont_tokens": "6830ef7d0325d7ef" + }, + "truncated": 0, + "non-truncated": 864, + "padded": 864, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hashes": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "50c9ff438c85a69e", + "hash_cont_tokens": "cdd0b3dc06d933e5" + }, + "truncated": 816, + "non-truncated": 0, + "padded": 0, + "non-padded": 816, 
+ "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hashes": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "054824cc474caef5", + "hash_cont_tokens": "e0203e3fc1bb0500" + }, + "truncated": 8, + "non-truncated": 940, + "padded": 940, + "non-padded": 8, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_aging|5": { + "hashes": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "541a75f071dcf579", + "hash_cont_tokens": "142a4a8a1138a214" + }, + "truncated": 0, + "non-truncated": 892, + "padded": 892, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_sexuality|5": { + "hashes": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "04269e5c5a257dd9", + "hash_cont_tokens": "bc54813e809b796d" + }, + "truncated": 0, + "non-truncated": 524, + "padded": 524, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-international_law|5": { + "hashes": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "d93ba9d9d38e4397", + "hash_cont_tokens": "63435df622d5437b" + }, + "truncated": 0, + "non-truncated": 484, + "padded": 484, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-jurisprudence|5": { + "hashes": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "9eeaccd2698b4f5a", + "hash_cont_tokens": "e3a8cd951b6e3469" + }, + "truncated": 0, + "non-truncated": 432, + "padded": 432, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hashes": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "b4f08f544f2b7576", + "hash_cont_tokens": "5e6ee2ff0404f23c" + }, + "truncated": 0, + "non-truncated": 652, + "padded": 648, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-machine_learning|5": { + "hashes": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "900c2a51f1174b9f", + "hash_cont_tokens": "c81919424db3b267" + }, + "truncated": 0, + "non-truncated": 448, + "padded": 448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-management|5": { + "hashes": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "6b36efb4689c6eca", + "hash_cont_tokens": "a01d6d39a83c4597" + }, + "truncated": 0, + "non-truncated": 412, + "padded": 412, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-marketing|5": { + "hashes": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "2aaac78a0cfed47a", + "hash_cont_tokens": "6aeaed4d823c98aa" + }, + "truncated": 0, + "non-truncated": 936, + "padded": 936, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-medical_genetics|5": { + "hashes": { + "hash_examples": "9c2dda34a2ea4fd2", + 
"hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "886ca823b41c094a", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-miscellaneous|5": { + "hashes": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "72fd71de7675e7d0", + "hash_cont_tokens": "9b0ab02a64603081" + }, + "truncated": 0, + "non-truncated": 3132, + "padded": 3132, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_disputes|5": { + "hashes": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "f3ca0dd8e7a1eb09", + "hash_cont_tokens": "3b8bbe9108e55ce9" + }, + "truncated": 0, + "non-truncated": 1384, + "padded": 1354, + "non-padded": 30, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hashes": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "3e793631e951f23c", + "hash_cont_tokens": "2eae753a177d5460" + }, + "truncated": 0, + "non-truncated": 3580, + "padded": 3580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-nutrition|5": { + "hashes": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "59753c2144ea93af", + "hash_cont_tokens": "29771089bd3c65c6" + }, + "truncated": 0, + "non-truncated": 1224, + "padded": 1224, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-philosophy|5": { + "hashes": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "bd8d3dbed15a8c34", + "hash_cont_tokens": "9f6ff69d23a48783" + }, + "truncated": 0, + "non-truncated": 1244, + "padded": 1244, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-prehistory|5": { + "hashes": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "3573cd87facbb7c5", + "hash_cont_tokens": "a789a13af22308bf" + }, + "truncated": 0, + "non-truncated": 1296, + "padded": 1296, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_accounting|5": { + "hashes": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "17e721bc1a7cbb47", + "hash_cont_tokens": "5129a9cfb30c5239" + }, + "truncated": 0, + "non-truncated": 1128, + "padded": 1128, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_law|5": { + "hashes": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "9178e10bd0763ec4", + "hash_cont_tokens": "2e590029ef41fbcd" + }, + "truncated": 604, + "non-truncated": 5532, + "padded": 5524, + "non-padded": 612, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_medicine|5": { + "hashes": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "f5a22012a54f70ea", + "hash_cont_tokens": "cd82e108370cece8" + }, + "truncated": 0, + 
"non-truncated": 1088, + "padded": 1088, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_psychology|5": { + "hashes": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "0dfb73a8eb3f692c", + "hash_cont_tokens": "61ef0c8a87f9c92d" + }, + "truncated": 0, + "non-truncated": 2448, + "padded": 2448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-public_relations|5": { + "hashes": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "1710c6ba4c9f3cbd", + "hash_cont_tokens": "568f585a259965c1" + }, + "truncated": 0, + "non-truncated": 440, + "padded": 440, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-security_studies|5": { + "hashes": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "d49711415961ced7", + "hash_cont_tokens": "d70cfe096d4fb7bd" + }, + "truncated": 0, + "non-truncated": 980, + "padded": 980, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-sociology|5": { + "hashes": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "828999f7624cbe7e", + "hash_cont_tokens": "c3a3bdfd177eed5b" + }, + "truncated": 0, + "non-truncated": 804, + "padded": 804, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hashes": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "42054621e718dbee", + "hash_cont_tokens": "2568d0e8e36fa959" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-virology|5": { + "hashes": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "6c4f0aa4dc859c04", + "hash_cont_tokens": "c178cccd753d9bc5" + }, + "truncated": 0, + "non-truncated": 664, + "padded": 664, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-world_religions|5": { + "hashes": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "6c75d44e092ff24f", + "hash_cont_tokens": "0a3a3ea5ef49d19c" + }, + "truncated": 0, + "non-truncated": 684, + "padded": 684, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|truthfulqa:mc|0": { + "hashes": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "2738d7ed7075faa7", + "hash_cont_tokens": "6d1691881e252df0" + }, + "truncated": 0, + "non-truncated": 9996, + "padded": 9996, + "non-padded": 0, + "effective_few_shots": 0.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "d84d18e9a963753d", + "hash_full_prompts": "12b540783521a8e6", + "hash_input_tokens": "6fecf578c508db6a", + "hash_cont_tokens": "f4b7b7f3a2788768" + }, + "total_evaluation_time_secondes": "25450.259630680084", + "truncated": 2088, + "non-truncated": 108931, + "padded": 108834, + "non-padded": 2185, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff 
--git a/eval-results/TheBloke/alpaca-lora-65B-HF/results_2023-10-23T02-09-35.586177.json b/eval-results/TheBloke/alpaca-lora-65B-HF/results_2023-10-23T02-09-35.586177.json new file mode 100644 index 0000000000000000000000000000000000000000..2a0166a3f6c46476971d3dc9f8f9b6aa126ddf7a --- /dev/null +++ b/eval-results/TheBloke/alpaca-lora-65B-HF/results_2023-10-23T02-09-35.586177.json @@ -0,0 +1,107 @@ +{ + "config_general": { + "model_name": "TheBloke/alpaca-lora-65B-HF", + "model_sha": "113b61b37a2862b950ada68620e57acafbcefe13", + "model_size": "121.68 GB", + "model_dtype": "torch.float16", + "lighteval_sha": "0f318ecf002208468154899217b3ba7c6ae09374", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|drop|3": { + "em": 0.10255872483221476, + "em_stderr": 0.0031069121780170463, + "f1": 0.16075398489932788, + "f1_stderr": 0.0032128112295639008 + }, + "harness|gsm8k|5": { + "acc": 0.2805155420773313, + "acc_stderr": 0.012374608490929553 + }, + "harness|winogrande|5": { + "acc": 0.8121546961325967, + "acc_stderr": 0.010977481103435091 + }, + "all": { + "em": 0.10255872483221476, + "em_stderr": 0.0031069121780170463, + "f1": 0.16075398489932788, + "f1_stderr": 0.0032128112295639008, + "acc": 0.546335119104964, + "acc_stderr": 0.011676044797182322 + } + }, + "versions": { + "harness|drop|3": 1, + "harness|gsm8k|5": 0, + "harness|winogrande|5": 0, + "all": 0 + }, + "config_tasks": { + "harness|drop": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|drop|3": { + "hashes": { + "hash_examples": "1d27416e8324e9a3", + "hash_full_prompts": "a5513ff9a741b385", + "hash_input_tokens": "61b608e0b5ceed76", + "hash_cont_tokens": "c8a6e955e4847442" + }, + "truncated": 1263, + "non-truncated": 8273, + "padded": 0, + "non-padded": 9536, + "effective_few_shots": 3.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "bda342e47b5099b2", + "hash_cont_tokens": "b206b6b594cd6408" + }, + "truncated": 0, + "non-truncated": 1319, + "padded": 0, + "non-padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "c0bedf98cb040854", + "hash_cont_tokens": "f08975ad6f2d5864" + }, + "truncated": 0, + "non-truncated": 2534, + "padded": 2432, + "non-padded": 102, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "9b4d8993161e637d", + "hash_full_prompts": "08215e527b7e60a5", + "hash_input_tokens": "80afe720f936f8d2", + "hash_cont_tokens": "71f7dd7e98dfcd82" + }, + "total_evaluation_time_secondes": "42360.19733428955", + "truncated": 1263, + "non-truncated": 12126, + "padded": 2432, + "non-padded": 10957, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/TheBloke/chronos-wizardlm-uc-scot-st-13B-GPTQ/results_2023-08-21T16-42-58.579611.json b/eval-results/TheBloke/chronos-wizardlm-uc-scot-st-13B-GPTQ/results_2023-08-21T16-42-58.579611.json new file mode 100644 index 0000000000000000000000000000000000000000..e3a8dfb69838aa902bd63ae99a00c3304d2e0f0c --- /dev/null +++ 
b/eval-results/TheBloke/chronos-wizardlm-uc-scot-st-13B-GPTQ/results_2023-08-21T16-42-58.579611.json @@ -0,0 +1,1365 @@ +{ + "results": { + "harness|arc:challenge|25": { + "acc": 0.2235494880546075, + "acc_stderr": 0.012174896631202605, + "acc_norm": 0.27986348122866894, + "acc_norm_stderr": 0.013119040897725925 + }, + "harness|hellaswag|10": { + "acc": 0.25652260505875324, + "acc_stderr": 0.004358210689442269, + "acc_norm": 0.2610037841067516, + "acc_norm_stderr": 0.004382844128643426 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.22, + "acc_stderr": 0.041633319989322695, + "acc_norm": 0.22, + "acc_norm_stderr": 0.041633319989322695 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.25925925925925924, + "acc_stderr": 0.03785714465066654, + "acc_norm": 0.25925925925925924, + "acc_norm_stderr": 0.03785714465066654 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.21710526315789475, + "acc_stderr": 0.033550453048829226, + "acc_norm": 0.21710526315789475, + "acc_norm_stderr": 0.033550453048829226 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.23, + "acc_stderr": 0.04229525846816506, + "acc_norm": 0.23, + "acc_norm_stderr": 0.04229525846816506 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.24150943396226415, + "acc_stderr": 0.026341480371118355, + "acc_norm": 0.24150943396226415, + "acc_norm_stderr": 0.026341480371118355 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.25, + "acc_stderr": 0.03621034121889507, + "acc_norm": 0.25, + "acc_norm_stderr": 0.03621034121889507 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.26, + "acc_stderr": 0.044084400227680814, + "acc_norm": 0.26, + "acc_norm_stderr": 0.044084400227680814 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.26, + "acc_stderr": 0.04408440022768079, + "acc_norm": 0.26, + "acc_norm_stderr": 0.04408440022768079 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.3, + "acc_stderr": 0.046056618647183814, + "acc_norm": 0.3, + "acc_norm_stderr": 0.046056618647183814 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.2543352601156069, + "acc_stderr": 0.03320556443085569, + "acc_norm": 0.2543352601156069, + "acc_norm_stderr": 0.03320556443085569 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.29411764705882354, + "acc_stderr": 0.04533838195929775, + "acc_norm": 0.29411764705882354, + "acc_norm_stderr": 0.04533838195929775 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.22, + "acc_stderr": 0.04163331998932269, + "acc_norm": 0.22, + "acc_norm_stderr": 0.04163331998932269 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.3021276595744681, + "acc_stderr": 0.030017554471880557, + "acc_norm": 0.3021276595744681, + "acc_norm_stderr": 0.030017554471880557 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.2807017543859649, + "acc_stderr": 0.042270544512322004, + "acc_norm": 0.2807017543859649, + "acc_norm_stderr": 0.042270544512322004 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.27586206896551724, + "acc_stderr": 0.037245636197746325, + "acc_norm": 0.27586206896551724, + "acc_norm_stderr": 0.037245636197746325 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.25925925925925924, + "acc_stderr": 0.022569897074918417, + "acc_norm": 0.25925925925925924, + "acc_norm_stderr": 0.022569897074918417 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.23809523809523808, + "acc_stderr": 0.038095238095238126, + 
"acc_norm": 0.23809523809523808, + "acc_norm_stderr": 0.038095238095238126 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.29, + "acc_stderr": 0.045604802157206845, + "acc_norm": 0.29, + "acc_norm_stderr": 0.045604802157206845 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.27741935483870966, + "acc_stderr": 0.025470196835900055, + "acc_norm": 0.27741935483870966, + "acc_norm_stderr": 0.025470196835900055 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.29064039408866993, + "acc_stderr": 0.0319474007226554, + "acc_norm": 0.29064039408866993, + "acc_norm_stderr": 0.0319474007226554 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.16, + "acc_stderr": 0.03684529491774709, + "acc_norm": 0.16, + "acc_norm_stderr": 0.03684529491774709 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.26666666666666666, + "acc_stderr": 0.03453131801885415, + "acc_norm": 0.26666666666666666, + "acc_norm_stderr": 0.03453131801885415 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.30303030303030304, + "acc_stderr": 0.03274287914026868, + "acc_norm": 0.30303030303030304, + "acc_norm_stderr": 0.03274287914026868 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.32124352331606215, + "acc_stderr": 0.033699508685490674, + "acc_norm": 0.32124352331606215, + "acc_norm_stderr": 0.033699508685490674 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.2717948717948718, + "acc_stderr": 0.022556551010132354, + "acc_norm": 0.2717948717948718, + "acc_norm_stderr": 0.022556551010132354 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.24444444444444444, + "acc_stderr": 0.026202766534652144, + "acc_norm": 0.24444444444444444, + "acc_norm_stderr": 0.026202766534652144 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.2773109243697479, + "acc_stderr": 0.02907937453948001, + "acc_norm": 0.2773109243697479, + "acc_norm_stderr": 0.02907937453948001 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.33112582781456956, + "acc_stderr": 0.038425817186598696, + "acc_norm": 0.33112582781456956, + "acc_norm_stderr": 0.038425817186598696 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.29357798165137616, + "acc_stderr": 0.019525151122639667, + "acc_norm": 0.29357798165137616, + "acc_norm_stderr": 0.019525151122639667 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.37037037037037035, + "acc_stderr": 0.03293377139415191, + "acc_norm": 0.37037037037037035, + "acc_norm_stderr": 0.03293377139415191 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.21568627450980393, + "acc_stderr": 0.028867431449849313, + "acc_norm": 0.21568627450980393, + "acc_norm_stderr": 0.028867431449849313 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.270042194092827, + "acc_stderr": 0.028900721906293426, + "acc_norm": 0.270042194092827, + "acc_norm_stderr": 0.028900721906293426 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.20179372197309417, + "acc_stderr": 0.026936111912802273, + "acc_norm": 0.20179372197309417, + "acc_norm_stderr": 0.026936111912802273 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.20610687022900764, + "acc_stderr": 0.03547771004159464, + "acc_norm": 0.20610687022900764, + "acc_norm_stderr": 0.03547771004159464 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.2066115702479339, + 
"acc_stderr": 0.036959801280988254, + "acc_norm": 0.2066115702479339, + "acc_norm_stderr": 0.036959801280988254 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.24074074074074073, + "acc_stderr": 0.041331194402438376, + "acc_norm": 0.24074074074074073, + "acc_norm_stderr": 0.041331194402438376 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.24539877300613497, + "acc_stderr": 0.03380939813943354, + "acc_norm": 0.24539877300613497, + "acc_norm_stderr": 0.03380939813943354 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.1875, + "acc_stderr": 0.0370468111477387, + "acc_norm": 0.1875, + "acc_norm_stderr": 0.0370468111477387 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.2815533980582524, + "acc_stderr": 0.04453254836326467, + "acc_norm": 0.2815533980582524, + "acc_norm_stderr": 0.04453254836326467 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.23076923076923078, + "acc_stderr": 0.027601921381417593, + "acc_norm": 0.23076923076923078, + "acc_norm_stderr": 0.027601921381417593 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.26, + "acc_stderr": 0.044084400227680794, + "acc_norm": 0.26, + "acc_norm_stderr": 0.044084400227680794 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.2388250319284802, + "acc_stderr": 0.015246803197398691, + "acc_norm": 0.2388250319284802, + "acc_norm_stderr": 0.015246803197398691 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.20520231213872833, + "acc_stderr": 0.021742519835276294, + "acc_norm": 0.20520231213872833, + "acc_norm_stderr": 0.021742519835276294 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.24692737430167597, + "acc_stderr": 0.014422292204808864, + "acc_norm": 0.24692737430167597, + "acc_norm_stderr": 0.014422292204808864 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.2581699346405229, + "acc_stderr": 0.025058503316958154, + "acc_norm": 0.2581699346405229, + "acc_norm_stderr": 0.025058503316958154 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.2572347266881029, + "acc_stderr": 0.024826171289250888, + "acc_norm": 0.2572347266881029, + "acc_norm_stderr": 0.024826171289250888 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.2716049382716049, + "acc_stderr": 0.02474862449053737, + "acc_norm": 0.2716049382716049, + "acc_norm_stderr": 0.02474862449053737 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.2553191489361702, + "acc_stderr": 0.026011992930902027, + "acc_norm": 0.2553191489361702, + "acc_norm_stderr": 0.026011992930902027 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.2438070404172099, + "acc_stderr": 0.010966507972178479, + "acc_norm": 0.2438070404172099, + "acc_norm_stderr": 0.010966507972178479 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.3125, + "acc_stderr": 0.02815637344037142, + "acc_norm": 0.3125, + "acc_norm_stderr": 0.02815637344037142 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.22712418300653595, + "acc_stderr": 0.016949853279212376, + "acc_norm": 0.22712418300653595, + "acc_norm_stderr": 0.016949853279212376 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.3181818181818182, + "acc_stderr": 0.04461272175910508, + "acc_norm": 0.3181818181818182, + "acc_norm_stderr": 0.04461272175910508 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.23265306122448978, + "acc_stderr": 0.02704925791589618, + "acc_norm": 0.23265306122448978, + "acc_norm_stderr": 0.02704925791589618 + }, + 
"harness|hendrycksTest-sociology|5": { + "acc": 0.26865671641791045, + "acc_stderr": 0.03134328358208954, + "acc_norm": 0.26865671641791045, + "acc_norm_stderr": 0.03134328358208954 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.21, + "acc_stderr": 0.040936018074033256, + "acc_norm": 0.21, + "acc_norm_stderr": 0.040936018074033256 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.2891566265060241, + "acc_stderr": 0.03529486801511115, + "acc_norm": 0.2891566265060241, + "acc_norm_stderr": 0.03529486801511115 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.21637426900584794, + "acc_stderr": 0.031581495393387324, + "acc_norm": 0.21637426900584794, + "acc_norm_stderr": 0.031581495393387324 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.24724602203182375, + "mc1_stderr": 0.01510240479735965, + "mc2": 0.49679341606997424, + "mc2_stderr": 0.016971815555440684 + }, + "all": { + "acc": 0.2565764331292473, + "acc_stderr": 0.03174716661216211, + "acc_norm": 0.2576068597770789, + "acc_norm_stderr": 0.03176358657327609, + "mc1": 0.24724602203182375, + "mc1_stderr": 0.01510240479735965, + "mc2": 0.49679341606997424, + "mc2_stderr": 0.016971815555440684 + } + }, + "versions": { + "harness|arc:challenge|25": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + 
"harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "all": 0 + }, + "config_general": { + "model_name": "TheBloke/chronos-wizardlm-uc-scot-st-13B-GPTQ", + "model_sha": "c4246e4b8d3fc77b9fe4ebb1ead61cda4b83575b", + "model_dtype": "torch.float16", + "lighteval_sha": "9a6ba3212080b87510982c30fdec55b87dcab0c7", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null + }, + "config_tasks": { + "harness|arc:challenge": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM 
Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task" + }, + "summary_tasks": { + "harness|arc:challenge|25": { + "hashes": { + "hash_examples": "17b0cae357c0259e", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "61571bf68d6d89aa", + "hash_cont_tokens": "8210decc6ff6f7df" + }, + "truncated": 0, + "non-truncated": 4687, + "padded": 4687, + "non-padded": 0, + "effective_few_shots": 25.0, + "num_truncated_few_shots": 0 + }, + "harness|hellaswag|10": { + "hashes": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "29906669b1c7054a", + "hash_cont_tokens": "b3b9e9017afa63af" + }, + "truncated": 0, + "non-truncated": 40168, + "padded": 40113, + "non-padded": 55, + "effective_few_shots": 10.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hashes": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "c54ff61ad0273dd7", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-anatomy|5": { + "hashes": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "be31a1e22aef5f90", + "hash_cont_tokens": "f11971a765cb609f" + }, + "truncated": 0, + "non-truncated": 540, + "padded": 540, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-astronomy|5": { + "hashes": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "277a7b1fad566940", + "hash_cont_tokens": "bf30e5d3f48250cb" + }, + "truncated": 0, + "non-truncated": 608, + "padded": 608, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-business_ethics|5": { + "hashes": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "ba552605bc116de5", + "hash_cont_tokens": "bc1dd9b2d995eb61" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + 
"harness|hendrycksTest-clinical_knowledge|5": { + "hashes": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "428c7563d0b98ab9", + "hash_cont_tokens": "890a119624b3b935" + }, + "truncated": 0, + "non-truncated": 1060, + "padded": 1060, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_biology|5": { + "hashes": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "da036601573942e2", + "hash_cont_tokens": "875cde3af7a0ee14" + }, + "truncated": 0, + "non-truncated": 576, + "padded": 576, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_chemistry|5": { + "hashes": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "94e0196d6aded13d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_computer_science|5": { + "hashes": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "6e4d0f4a8d36690b", + "hash_cont_tokens": "ffc0fe414cdc4a83" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_mathematics|5": { + "hashes": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "614054d17109a25d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_medicine|5": { + "hashes": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "1d633b3cc0524ba8", + "hash_cont_tokens": "1f88b00d41957d82" + }, + "truncated": 0, + "non-truncated": 692, + "padded": 692, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_physics|5": { + "hashes": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "5421d9a1af86cbd4", + "hash_cont_tokens": "f7b8097afc16a47c" + }, + "truncated": 0, + "non-truncated": 408, + "padded": 408, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-computer_security|5": { + "hashes": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "5e6b70ecb333cf18", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hashes": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "c2ef11a87264ceed", + "hash_cont_tokens": "aa0e8bc655f2f641" + }, + "truncated": 0, + "non-truncated": 940, + "padded": 940, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-econometrics|5": { + "hashes": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + 
"hash_input_tokens": "ecaccd912a4c3978", + "hash_cont_tokens": "bfb7e3c3c88313f1" + }, + "truncated": 0, + "non-truncated": 456, + "padded": 456, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hashes": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "1590c84291399be8", + "hash_cont_tokens": "2425a3f084a591ef" + }, + "truncated": 0, + "non-truncated": 580, + "padded": 580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hashes": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "3269597f715b0da1", + "hash_cont_tokens": "f52691aef15a407b" + }, + "truncated": 0, + "non-truncated": 1512, + "padded": 1512, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-formal_logic|5": { + "hashes": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "a2800d20f3ab8d7c", + "hash_cont_tokens": "f515d598d9c21263" + }, + "truncated": 0, + "non-truncated": 504, + "padded": 504, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-global_facts|5": { + "hashes": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "94ed44b3772505ad", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_biology|5": { + "hashes": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "24423acb928db768", + "hash_cont_tokens": "bd85a4156a3613ee" + }, + "truncated": 0, + "non-truncated": 1240, + "padded": 1240, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hashes": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "831ff35c474e5cef", + "hash_cont_tokens": "a95c97af1c14e068" + }, + "truncated": 0, + "non-truncated": 812, + "padded": 812, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hashes": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "8c34e0f2bda77358", + "hash_cont_tokens": "8abfedef914e33c9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hashes": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "f1f73dd687da18d7", + "hash_cont_tokens": "674fc454bdc5ac93" + }, + "truncated": 660, + "non-truncated": 0, + "padded": 0, + "non-padded": 660, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_geography|5": { + "hashes": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "7c5547c7da5bc793", + "hash_cont_tokens": "03a5012b916274ea" + }, + "truncated": 0, + 
"non-truncated": 792, + "padded": 792, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hashes": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "f62991cb6a496b05", + "hash_cont_tokens": "a83effb8f76b7d7c" + }, + "truncated": 0, + "non-truncated": 772, + "padded": 772, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hashes": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "4cef2aff6e3d59ed", + "hash_cont_tokens": "c583432ad27fcfe0" + }, + "truncated": 0, + "non-truncated": 1560, + "padded": 1560, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hashes": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "6e2577ea4082ed2b", + "hash_cont_tokens": "24f5dc613660300b" + }, + "truncated": 0, + "non-truncated": 1080, + "padded": 1080, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hashes": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "c5fc9aeb1079c8e4", + "hash_cont_tokens": "f47f041de50333b9" + }, + "truncated": 0, + "non-truncated": 952, + "padded": 952, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_physics|5": { + "hashes": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "555fc385cffa84ca", + "hash_cont_tokens": "ba2efcd283e938cc" + }, + "truncated": 0, + "non-truncated": 604, + "padded": 604, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hashes": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "febd23cbf9973b7f", + "hash_cont_tokens": "942069cd363844d9" + }, + "truncated": 0, + "non-truncated": 2180, + "padded": 2180, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hashes": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "424b02981230ee83", + "hash_cont_tokens": "955ed42b6f7fa019" + }, + "truncated": 0, + "non-truncated": 864, + "padded": 864, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hashes": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "50c9ff438c85a69e", + "hash_cont_tokens": "cdd0b3dc06d933e5" + }, + "truncated": 816, + "non-truncated": 0, + "padded": 0, + "non-padded": 816, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hashes": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "054824cc474caef5", + "hash_cont_tokens": "9a864184946033ac" + }, + "truncated": 8, + "non-truncated": 940, + "padded": 940, + "non-padded": 8, + 
"effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_aging|5": { + "hashes": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "541a75f071dcf579", + "hash_cont_tokens": "142a4a8a1138a214" + }, + "truncated": 0, + "non-truncated": 892, + "padded": 892, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_sexuality|5": { + "hashes": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "04269e5c5a257dd9", + "hash_cont_tokens": "bc54813e809b796d" + }, + "truncated": 0, + "non-truncated": 524, + "padded": 524, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-international_law|5": { + "hashes": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "d93ba9d9d38e4397", + "hash_cont_tokens": "dc45b45fcda18e5d" + }, + "truncated": 0, + "non-truncated": 484, + "padded": 484, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-jurisprudence|5": { + "hashes": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "9eeaccd2698b4f5a", + "hash_cont_tokens": "e3a8cd951b6e3469" + }, + "truncated": 0, + "non-truncated": 432, + "padded": 432, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hashes": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "b4f08f544f2b7576", + "hash_cont_tokens": "1e80dbd30f6453d5" + }, + "truncated": 0, + "non-truncated": 652, + "padded": 648, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-machine_learning|5": { + "hashes": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "900c2a51f1174b9f", + "hash_cont_tokens": "9b37da7777378ca9" + }, + "truncated": 0, + "non-truncated": 448, + "padded": 448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-management|5": { + "hashes": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "6b36efb4689c6eca", + "hash_cont_tokens": "a01d6d39a83c4597" + }, + "truncated": 0, + "non-truncated": 412, + "padded": 412, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-marketing|5": { + "hashes": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "2aaac78a0cfed47a", + "hash_cont_tokens": "6aeaed4d823c98aa" + }, + "truncated": 0, + "non-truncated": 936, + "padded": 936, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-medical_genetics|5": { + "hashes": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "886ca823b41c094a", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-miscellaneous|5": { + "hashes": { + "hash_examples": "41adb694024809c2", + 
"hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "72fd71de7675e7d0", + "hash_cont_tokens": "9b0ab02a64603081" + }, + "truncated": 0, + "non-truncated": 3132, + "padded": 3132, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_disputes|5": { + "hashes": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "f3ca0dd8e7a1eb09", + "hash_cont_tokens": "8badf768f7b0467a" + }, + "truncated": 0, + "non-truncated": 1384, + "padded": 1354, + "non-padded": 30, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hashes": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "3e793631e951f23c", + "hash_cont_tokens": "32ae620376b2bbba" + }, + "truncated": 0, + "non-truncated": 3580, + "padded": 3580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-nutrition|5": { + "hashes": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "59753c2144ea93af", + "hash_cont_tokens": "3071def75bacc404" + }, + "truncated": 0, + "non-truncated": 1224, + "padded": 1224, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-philosophy|5": { + "hashes": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "bd8d3dbed15a8c34", + "hash_cont_tokens": "9f6ff69d23a48783" + }, + "truncated": 0, + "non-truncated": 1244, + "padded": 1244, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-prehistory|5": { + "hashes": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "3573cd87facbb7c5", + "hash_cont_tokens": "de469d2b981e32a3" + }, + "truncated": 0, + "non-truncated": 1296, + "padded": 1296, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_accounting|5": { + "hashes": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "17e721bc1a7cbb47", + "hash_cont_tokens": "c46f74d2dfc7b13b" + }, + "truncated": 0, + "non-truncated": 1128, + "padded": 1128, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_law|5": { + "hashes": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "9178e10bd0763ec4", + "hash_cont_tokens": "2e590029ef41fbcd" + }, + "truncated": 604, + "non-truncated": 5532, + "padded": 5524, + "non-padded": 612, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_medicine|5": { + "hashes": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "f5a22012a54f70ea", + "hash_cont_tokens": "fe35cfa9c6ca802e" + }, + "truncated": 0, + "non-truncated": 1088, + "padded": 1088, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_psychology|5": { + "hashes": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "0dfb73a8eb3f692c", + "hash_cont_tokens": "f020fbddf72c8652" + }, + 
"truncated": 0, + "non-truncated": 2448, + "padded": 2448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-public_relations|5": { + "hashes": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "1710c6ba4c9f3cbd", + "hash_cont_tokens": "568f585a259965c1" + }, + "truncated": 0, + "non-truncated": 440, + "padded": 440, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-security_studies|5": { + "hashes": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "d49711415961ced7", + "hash_cont_tokens": "cc6fd7cccd64cd5d" + }, + "truncated": 0, + "non-truncated": 980, + "padded": 980, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-sociology|5": { + "hashes": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "828999f7624cbe7e", + "hash_cont_tokens": "c3a3bdfd177eed5b" + }, + "truncated": 0, + "non-truncated": 804, + "padded": 804, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hashes": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "42054621e718dbee", + "hash_cont_tokens": "2568d0e8e36fa959" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-virology|5": { + "hashes": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "6c4f0aa4dc859c04", + "hash_cont_tokens": "926cf60b0891f374" + }, + "truncated": 0, + "non-truncated": 664, + "padded": 664, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-world_religions|5": { + "hashes": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "6c75d44e092ff24f", + "hash_cont_tokens": "c525a5de974c1ea3" + }, + "truncated": 0, + "non-truncated": 684, + "padded": 684, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|truthfulqa:mc|0": { + "hashes": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "2738d7ed7075faa7", + "hash_cont_tokens": "c014154380b74b9e" + }, + "truncated": 0, + "non-truncated": 9996, + "padded": 9996, + "non-padded": 0, + "effective_few_shots": 0.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "d84d18e9a963753d", + "hash_full_prompts": "12b540783521a8e6", + "hash_input_tokens": "6fecf578c508db6a", + "hash_cont_tokens": "fb1646e2bdd5fc38" + }, + "total_evaluation_time_secondes": "4738.128798723221", + "truncated": 2088, + "non-truncated": 108931, + "padded": 108834, + "non-padded": 2185, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/TheBloke/chronos-wizardlm-uc-scot-st-13B-GPTQ/results_2023-11-05T09-19-09.913548.json b/eval-results/TheBloke/chronos-wizardlm-uc-scot-st-13B-GPTQ/results_2023-11-05T09-19-09.913548.json new file mode 100644 index 0000000000000000000000000000000000000000..8b255812714c1cb32ac9efb33bbf6dcf899b648b --- /dev/null +++ 
b/eval-results/TheBloke/chronos-wizardlm-uc-scot-st-13B-GPTQ/results_2023-11-05T09-19-09.913548.json @@ -0,0 +1,107 @@ +{ + "config_general": { + "lighteval_sha": "167773f1d5d1647c60dadc31c9e731ab7dbcbbad", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "", + "model_name": "TheBloke/chronos-wizardlm-uc-scot-st-13B-GPTQ", + "model_sha": "a5fa7c0cfb26ffc6710e824d1e8c378c457676df", + "model_dtype": "torch.float16", + "model_size": "6.8 GB" + }, + "results": { + "harness|drop|3": { + "em": 0.008284395973154363, + "em_stderr": 0.0009282472025612514, + "f1": 0.08187185402684566, + "f1_stderr": 0.0018061849809381774 + }, + "harness|gsm8k|5": { + "acc": 0.06823351023502654, + "acc_stderr": 0.006945358944067431 + }, + "harness|winogrande|5": { + "acc": 0.745067087608524, + "acc_stderr": 0.012248806969376422 + }, + "all": { + "em": 0.008284395973154363, + "em_stderr": 0.0009282472025612514, + "f1": 0.08187185402684566, + "f1_stderr": 0.0018061849809381774, + "acc": 0.4066502989217753, + "acc_stderr": 0.009597082956721927 + } + }, + "versions": { + "all": 0, + "harness|drop|3": 1, + "harness|gsm8k|5": 0, + "harness|winogrande|5": 0 + }, + "config_tasks": { + "harness|drop": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|drop|3": { + "hashes": { + "hash_examples": "1d27416e8324e9a3", + "hash_full_prompts": "a5513ff9a741b385", + "hash_input_tokens": "61b608e0b5ceed76", + "hash_cont_tokens": "9a896d498bc005c0" + }, + "truncated": 1263, + "non_truncated": 8273, + "padded": 0, + "non_padded": 9536, + "effective_few_shots": 3.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "bda342e47b5099b2", + "hash_cont_tokens": "0c3eeb22a2d5ed3d" + }, + "truncated": 0, + "non_truncated": 1319, + "padded": 0, + "non_padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "c0bedf98cb040854", + "hash_cont_tokens": "f08975ad6f2d5864" + }, + "truncated": 0, + "non_truncated": 1267, + "padded": 2432, + "non_padded": 102, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "9b4d8993161e637d", + "hash_full_prompts": "08215e527b7e60a5", + "hash_input_tokens": "80afe720f936f8d2", + "hash_cont_tokens": "ec82ae2f87aff84c" + }, + "truncated": 1263, + "non_truncated": 10859, + "padded": 2432, + "non_padded": 10957, + "num_truncated_few_shots": 0, + "total_evaluation_time_secondes": 0 + } +} \ No newline at end of file diff --git a/eval-results/TheBloke/chronos-wizardlm-uc-scot-st-13B-GPTQ/results_2023-11-07T17-01-57.084059.json b/eval-results/TheBloke/chronos-wizardlm-uc-scot-st-13B-GPTQ/results_2023-11-07T17-01-57.084059.json new file mode 100644 index 0000000000000000000000000000000000000000..9ce6e23ad93a9af71e76a21dbad7659e0a2a85da --- /dev/null +++ b/eval-results/TheBloke/chronos-wizardlm-uc-scot-st-13B-GPTQ/results_2023-11-07T17-01-57.084059.json @@ -0,0 +1,107 @@ +{ + "config_general": { + "lighteval_sha": "167773f1d5d1647c60dadc31c9e731ab7dbcbbad", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "", + "model_name": 
"TheBloke/chronos-wizardlm-uc-scot-st-13B-GPTQ", + "model_sha": "a5fa7c0cfb26ffc6710e824d1e8c378c457676df", + "model_dtype": "torch.float16", + "model_size": "6.8 GB" + }, + "results": { + "harness|drop|3": { + "em": 0.008284395973154363, + "em_stderr": 0.0009282472025612514, + "f1": 0.0820406879194631, + "f1_stderr": 0.0018086518070639704 + }, + "harness|gsm8k|5": { + "acc": 0.06899166034874905, + "acc_stderr": 0.006980995834838566 + }, + "harness|winogrande|5": { + "acc": 0.745067087608524, + "acc_stderr": 0.012248806969376422 + }, + "all": { + "em": 0.008284395973154363, + "em_stderr": 0.0009282472025612514, + "f1": 0.0820406879194631, + "f1_stderr": 0.0018086518070639704, + "acc": 0.40702937397863653, + "acc_stderr": 0.009614901402107493 + } + }, + "versions": { + "all": 0, + "harness|drop|3": 1, + "harness|gsm8k|5": 0, + "harness|winogrande|5": 0 + }, + "config_tasks": { + "harness|drop": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|drop|3": { + "hashes": { + "hash_examples": "1d27416e8324e9a3", + "hash_full_prompts": "a5513ff9a741b385", + "hash_input_tokens": "61b608e0b5ceed76", + "hash_cont_tokens": "de43c409b4e15959" + }, + "truncated": 1263, + "non_truncated": 8273, + "padded": 0, + "non_padded": 9536, + "effective_few_shots": 3.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "bda342e47b5099b2", + "hash_cont_tokens": "be0c4a50842dd797" + }, + "truncated": 0, + "non_truncated": 1319, + "padded": 0, + "non_padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "c0bedf98cb040854", + "hash_cont_tokens": "f08975ad6f2d5864" + }, + "truncated": 0, + "non_truncated": 1267, + "padded": 2432, + "non_padded": 102, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "9b4d8993161e637d", + "hash_full_prompts": "08215e527b7e60a5", + "hash_input_tokens": "80afe720f936f8d2", + "hash_cont_tokens": "742aa20475fcfa9e" + }, + "truncated": 1263, + "non_truncated": 10859, + "padded": 2432, + "non_padded": 10957, + "num_truncated_few_shots": 0, + "total_evaluation_time_secondes": 0 + } +} \ No newline at end of file diff --git a/eval-results/TheBloke/dromedary-65b-lora-HF/results_2023-07-21T02-37-03.243913.json b/eval-results/TheBloke/dromedary-65b-lora-HF/results_2023-07-21T02-37-03.243913.json new file mode 100644 index 0000000000000000000000000000000000000000..4f5d419c6a1dbcde52b1c624bcd4fd93ae8ad2e6 --- /dev/null +++ b/eval-results/TheBloke/dromedary-65b-lora-HF/results_2023-07-21T02-37-03.243913.json @@ -0,0 +1,871 @@ +{ + "results": { + "harness|arc:challenge|25": { + "acc": 0.5853242320819113, + "acc_stderr": 0.014397070564409172, + "acc_norm": 0.6160409556313993, + "acc_norm_stderr": 0.01421244498065189 + }, + "harness|hellaswag|10": { + "acc": 0.6527584146584345, + "acc_stderr": 0.004751203378888059, + "acc_norm": 0.8253335988846843, + "acc_norm_stderr": 0.003789055487003176 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.28, + "acc_stderr": 0.04512608598542128, + "acc_norm": 0.28, + "acc_norm_stderr": 0.04512608598542128 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.5555555555555556, + "acc_stderr": 
0.04292596718256981, + "acc_norm": 0.5555555555555556, + "acc_norm_stderr": 0.04292596718256981 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.7368421052631579, + "acc_stderr": 0.03583496176361073, + "acc_norm": 0.7368421052631579, + "acc_norm_stderr": 0.03583496176361073 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.55, + "acc_stderr": 0.049999999999999996, + "acc_norm": 0.55, + "acc_norm_stderr": 0.049999999999999996 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.6641509433962264, + "acc_stderr": 0.02906722014664483, + "acc_norm": 0.6641509433962264, + "acc_norm_stderr": 0.02906722014664483 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.6597222222222222, + "acc_stderr": 0.039621355734862175, + "acc_norm": 0.6597222222222222, + "acc_norm_stderr": 0.039621355734862175 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.54, + "acc_stderr": 0.05009082659620333, + "acc_norm": 0.54, + "acc_norm_stderr": 0.05009082659620333 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.49, + "acc_stderr": 0.05024183937956912, + "acc_norm": 0.49, + "acc_norm_stderr": 0.05024183937956912 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.32, + "acc_stderr": 0.046882617226215034, + "acc_norm": 0.32, + "acc_norm_stderr": 0.046882617226215034 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.5375722543352601, + "acc_stderr": 0.03801685104524458, + "acc_norm": 0.5375722543352601, + "acc_norm_stderr": 0.03801685104524458 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.35294117647058826, + "acc_stderr": 0.04755129616062946, + "acc_norm": 0.35294117647058826, + "acc_norm_stderr": 0.04755129616062946 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.8, + "acc_stderr": 0.04020151261036846, + "acc_norm": 0.8, + "acc_norm_stderr": 0.04020151261036846 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.5659574468085107, + "acc_stderr": 0.03240038086792747, + "acc_norm": 0.5659574468085107, + "acc_norm_stderr": 0.03240038086792747 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.3508771929824561, + "acc_stderr": 0.044895393502707, + "acc_norm": 0.3508771929824561, + "acc_norm_stderr": 0.044895393502707 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.5517241379310345, + "acc_stderr": 0.04144311810878152, + "acc_norm": 0.5517241379310345, + "acc_norm_stderr": 0.04144311810878152 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.3835978835978836, + "acc_stderr": 0.0250437573185202, + "acc_norm": 0.3835978835978836, + "acc_norm_stderr": 0.0250437573185202 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.4603174603174603, + "acc_stderr": 0.04458029125470973, + "acc_norm": 0.4603174603174603, + "acc_norm_stderr": 0.04458029125470973 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.38, + "acc_stderr": 0.04878317312145632, + "acc_norm": 0.38, + "acc_norm_stderr": 0.04878317312145632 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.7322580645161291, + "acc_stderr": 0.02518900666021238, + "acc_norm": 0.7322580645161291, + "acc_norm_stderr": 0.02518900666021238 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.4433497536945813, + "acc_stderr": 0.03495334582162934, + "acc_norm": 0.4433497536945813, + "acc_norm_stderr": 0.03495334582162934 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.64, + "acc_stderr": 0.048241815132442176, + 
"acc_norm": 0.64, + "acc_norm_stderr": 0.048241815132442176 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.7696969696969697, + "acc_stderr": 0.032876667586034906, + "acc_norm": 0.7696969696969697, + "acc_norm_stderr": 0.032876667586034906 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.7878787878787878, + "acc_stderr": 0.029126522834586794, + "acc_norm": 0.7878787878787878, + "acc_norm_stderr": 0.029126522834586794 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.8756476683937824, + "acc_stderr": 0.02381447708659355, + "acc_norm": 0.8756476683937824, + "acc_norm_stderr": 0.02381447708659355 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.6461538461538462, + "acc_stderr": 0.02424378399406217, + "acc_norm": 0.6461538461538462, + "acc_norm_stderr": 0.02424378399406217 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.3333333333333333, + "acc_stderr": 0.028742040903948496, + "acc_norm": 0.3333333333333333, + "acc_norm_stderr": 0.028742040903948496 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.6890756302521008, + "acc_stderr": 0.03006676158297792, + "acc_norm": 0.6890756302521008, + "acc_norm_stderr": 0.03006676158297792 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.3973509933774834, + "acc_stderr": 0.0399552400768168, + "acc_norm": 0.3973509933774834, + "acc_norm_stderr": 0.0399552400768168 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.8275229357798165, + "acc_stderr": 0.016197807956848043, + "acc_norm": 0.8275229357798165, + "acc_norm_stderr": 0.016197807956848043 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.5370370370370371, + "acc_stderr": 0.03400603625538272, + "acc_norm": 0.5370370370370371, + "acc_norm_stderr": 0.03400603625538272 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.8333333333333334, + "acc_stderr": 0.026156867523931055, + "acc_norm": 0.8333333333333334, + "acc_norm_stderr": 0.026156867523931055 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.8396624472573839, + "acc_stderr": 0.02388438092596567, + "acc_norm": 0.8396624472573839, + "acc_norm_stderr": 0.02388438092596567 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.6816143497757847, + "acc_stderr": 0.03126580522513713, + "acc_norm": 0.6816143497757847, + "acc_norm_stderr": 0.03126580522513713 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.7633587786259542, + "acc_stderr": 0.03727673575596915, + "acc_norm": 0.7633587786259542, + "acc_norm_stderr": 0.03727673575596915 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.8099173553719008, + "acc_stderr": 0.035817969517092825, + "acc_norm": 0.8099173553719008, + "acc_norm_stderr": 0.035817969517092825 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.7407407407407407, + "acc_stderr": 0.04236511258094632, + "acc_norm": 0.7407407407407407, + "acc_norm_stderr": 0.04236511258094632 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.7914110429447853, + "acc_stderr": 0.03192193448934723, + "acc_norm": 0.7914110429447853, + "acc_norm_stderr": 0.03192193448934723 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.48214285714285715, + "acc_stderr": 0.047427623612430116, + "acc_norm": 0.48214285714285715, + "acc_norm_stderr": 0.047427623612430116 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.8252427184466019, + "acc_stderr": 
0.03760178006026621, + "acc_norm": 0.8252427184466019, + "acc_norm_stderr": 0.03760178006026621 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.8504273504273504, + "acc_stderr": 0.02336505149175372, + "acc_norm": 0.8504273504273504, + "acc_norm_stderr": 0.02336505149175372 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.65, + "acc_stderr": 0.047937248544110196, + "acc_norm": 0.65, + "acc_norm_stderr": 0.047937248544110196 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.7994891443167306, + "acc_stderr": 0.014317653708594204, + "acc_norm": 0.7994891443167306, + "acc_norm_stderr": 0.014317653708594204 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.7369942196531792, + "acc_stderr": 0.02370309952525818, + "acc_norm": 0.7369942196531792, + "acc_norm_stderr": 0.02370309952525818 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.45363128491620114, + "acc_stderr": 0.016650437588269073, + "acc_norm": 0.45363128491620114, + "acc_norm_stderr": 0.016650437588269073 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.673202614379085, + "acc_stderr": 0.02685729466328141, + "acc_norm": 0.673202614379085, + "acc_norm_stderr": 0.02685729466328141 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.7266881028938906, + "acc_stderr": 0.025311765975426122, + "acc_norm": 0.7266881028938906, + "acc_norm_stderr": 0.025311765975426122 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.7283950617283951, + "acc_stderr": 0.024748624490537382, + "acc_norm": 0.7283950617283951, + "acc_norm_stderr": 0.024748624490537382 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.5035460992907801, + "acc_stderr": 0.02982674915328092, + "acc_norm": 0.5035460992907801, + "acc_norm_stderr": 0.02982674915328092 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.4915254237288136, + "acc_stderr": 0.012768401697269057, + "acc_norm": 0.4915254237288136, + "acc_norm_stderr": 0.012768401697269057 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.6066176470588235, + "acc_stderr": 0.029674288281311155, + "acc_norm": 0.6066176470588235, + "acc_norm_stderr": 0.029674288281311155 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.6683006535947712, + "acc_stderr": 0.019047485239360378, + "acc_norm": 0.6683006535947712, + "acc_norm_stderr": 0.019047485239360378 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.6909090909090909, + "acc_stderr": 0.044262946482000985, + "acc_norm": 0.6909090909090909, + "acc_norm_stderr": 0.044262946482000985 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.7183673469387755, + "acc_stderr": 0.028795185574291275, + "acc_norm": 0.7183673469387755, + "acc_norm_stderr": 0.028795185574291275 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.8109452736318408, + "acc_stderr": 0.02768691358801301, + "acc_norm": 0.8109452736318408, + "acc_norm_stderr": 0.02768691358801301 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.87, + "acc_stderr": 0.033799766898963086, + "acc_norm": 0.87, + "acc_norm_stderr": 0.033799766898963086 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.5421686746987951, + "acc_stderr": 0.0387862677100236, + "acc_norm": 0.5421686746987951, + "acc_norm_stderr": 0.0387862677100236 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.8070175438596491, + "acc_stderr": 0.030267457554898458, + "acc_norm": 0.8070175438596491, + "acc_norm_stderr": 0.030267457554898458 + }, + 
"harness|truthfulqa:mc|0": { + "mc1": 0.2668298653610771, + "mc1_stderr": 0.01548369193923726, + "mc2": 0.3882253449999486, + "mc2_stderr": 0.015662564074089474 + }, + "all": { + "acc": 0.6303778847694929, + "acc_stderr": 0.033064292808440705, + "acc_norm": 0.6338235103250138, + "acc_norm_stderr": 0.0330448559698705, + "mc1": 0.2668298653610771, + "mc1_stderr": 0.01548369193923726, + "mc2": 0.3882253449999486, + "mc2_stderr": 0.015662564074089474 + } + }, + "versions": { + "harness|arc:challenge|25": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "all": 0 + }, + "config": { + "model_name": "TheBloke/dromedary-65b-lora-HF", + "model_sha": 
"3fa4546259d6bbd6b5d637484c325ab19181a73c", + "model_dtype": "torch.float16", + "lighteval_sha": "43cff840721bd0214adb4e29236a5e2ca1813937", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null + }, + "task_config": { + "harness|arc:challenge": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + 
"harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task" + }, + "hashes": { + "harness|arc:challenge|25": { + "hash_examples": "fb8c51b1872daeda", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "61571bf68d6d89aa", + "hash_cont_tokens": "e8abf848493b50f7" + }, + "harness|hellaswag|10": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "29906669b1c7054a", + "hash_cont_tokens": "9fe0a5c42e1532db" + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "c54ff61ad0273dd7", + "hash_cont_tokens": "50421e30bef398f9" + }, + "harness|hendrycksTest-anatomy|5": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "be31a1e22aef5f90", + "hash_cont_tokens": "f11971a765cb609f" + }, + "harness|hendrycksTest-astronomy|5": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "277a7b1fad566940", + "hash_cont_tokens": "440a970fadecdc7b" + }, + "harness|hendrycksTest-business_ethics|5": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "ba552605bc116de5", + "hash_cont_tokens": "50421e30bef398f9" + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "428c7563d0b98ab9", + "hash_cont_tokens": "7ecd60c25b9bfe5b" + }, + "harness|hendrycksTest-college_biology|5": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "da036601573942e2", + "hash_cont_tokens": "875cde3af7a0ee14" + }, + "harness|hendrycksTest-college_chemistry|5": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "94e0196d6aded13d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "harness|hendrycksTest-college_computer_science|5": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "6e4d0f4a8d36690b", + "hash_cont_tokens": "50421e30bef398f9" + }, + "harness|hendrycksTest-college_mathematics|5": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "614054d17109a25d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "harness|hendrycksTest-college_medicine|5": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "1d633b3cc0524ba8", + "hash_cont_tokens": "702fb6d82ff0d6ac" + }, + "harness|hendrycksTest-college_physics|5": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "5421d9a1af86cbd4", + "hash_cont_tokens": "f7b8097afc16a47c" + }, + "harness|hendrycksTest-computer_security|5": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "5e6b70ecb333cf18", + 
"hash_cont_tokens": "50421e30bef398f9" + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "c2ef11a87264ceed", + "hash_cont_tokens": "aa0e8bc655f2f641" + }, + "harness|hendrycksTest-econometrics|5": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "ecaccd912a4c3978", + "hash_cont_tokens": "b1cc6e7e9fcd3827" + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "1590c84291399be8", + "hash_cont_tokens": "2425a3f084a591ef" + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "3269597f715b0da1", + "hash_cont_tokens": "bd87bf0c060fd925" + }, + "harness|hendrycksTest-formal_logic|5": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "a2800d20f3ab8d7c", + "hash_cont_tokens": "eb8932890e0605db" + }, + "harness|hendrycksTest-global_facts|5": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "94ed44b3772505ad", + "hash_cont_tokens": "50421e30bef398f9" + }, + "harness|hendrycksTest-high_school_biology|5": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "24423acb928db768", + "hash_cont_tokens": "1ddcb86d28cde266" + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "831ff35c474e5cef", + "hash_cont_tokens": "176c8dcff38c5f8f" + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "8c34e0f2bda77358", + "hash_cont_tokens": "50421e30bef398f9" + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "f1f73dd687da18d7", + "hash_cont_tokens": "674fc454bdc5ac93" + }, + "harness|hendrycksTest-high_school_geography|5": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "7c5547c7da5bc793", + "hash_cont_tokens": "03a5012b916274ea" + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "f62991cb6a496b05", + "hash_cont_tokens": "873d2aab226ba1d8" + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "4cef2aff6e3d59ed", + "hash_cont_tokens": "c583432ad27fcfe0" + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "6e2577ea4082ed2b", + "hash_cont_tokens": "d7907b61bcb8c123" + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "c5fc9aeb1079c8e4", + "hash_cont_tokens": "f47f041de50333b9" + }, + "harness|hendrycksTest-high_school_physics|5": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + 
"hash_input_tokens": "555fc385cffa84ca", + "hash_cont_tokens": "0d56317b3e5eedb5" + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "febd23cbf9973b7f", + "hash_cont_tokens": "09ba1243e7390c0f" + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "424b02981230ee83", + "hash_cont_tokens": "9cc29889c3d3f77d" + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "50c9ff438c85a69e", + "hash_cont_tokens": "cdd0b3dc06d933e5" + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "054824cc474caef5", + "hash_cont_tokens": "e02816433ff28daf" + }, + "harness|hendrycksTest-human_aging|5": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "541a75f071dcf579", + "hash_cont_tokens": "142a4a8a1138a214" + }, + "harness|hendrycksTest-human_sexuality|5": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "04269e5c5a257dd9", + "hash_cont_tokens": "bc54813e809b796d" + }, + "harness|hendrycksTest-international_law|5": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "d93ba9d9d38e4397", + "hash_cont_tokens": "8ea8c5ff76a15bca" + }, + "harness|hendrycksTest-jurisprudence|5": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "9eeaccd2698b4f5a", + "hash_cont_tokens": "e3a8cd951b6e3469" + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "b4f08f544f2b7576", + "hash_cont_tokens": "3e9e0bdc248fd88a" + }, + "harness|hendrycksTest-machine_learning|5": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "900c2a51f1174b9f", + "hash_cont_tokens": "55b12fb138c6a064" + }, + "harness|hendrycksTest-management|5": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "6b36efb4689c6eca", + "hash_cont_tokens": "a01d6d39a83c4597" + }, + "harness|hendrycksTest-marketing|5": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "2aaac78a0cfed47a", + "hash_cont_tokens": "6aeaed4d823c98aa" + }, + "harness|hendrycksTest-medical_genetics|5": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "886ca823b41c094a", + "hash_cont_tokens": "50421e30bef398f9" + }, + "harness|hendrycksTest-miscellaneous|5": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "72fd71de7675e7d0", + "hash_cont_tokens": "9b0ab02a64603081" + }, + "harness|hendrycksTest-moral_disputes|5": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "f3ca0dd8e7a1eb09", + "hash_cont_tokens": "3b8bbe9108e55ce9" + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "3e793631e951f23c", + 
"hash_cont_tokens": "3e9bfc0362e97330" + }, + "harness|hendrycksTest-nutrition|5": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "59753c2144ea93af", + "hash_cont_tokens": "23b2dc6ee2da4cfc" + }, + "harness|hendrycksTest-philosophy|5": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "bd8d3dbed15a8c34", + "hash_cont_tokens": "9f6ff69d23a48783" + }, + "harness|hendrycksTest-prehistory|5": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "3573cd87facbb7c5", + "hash_cont_tokens": "d6458d743d875837" + }, + "harness|hendrycksTest-professional_accounting|5": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "17e721bc1a7cbb47", + "hash_cont_tokens": "922a195f53a35662" + }, + "harness|hendrycksTest-professional_law|5": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "9178e10bd0763ec4", + "hash_cont_tokens": "2e590029ef41fbcd" + }, + "harness|hendrycksTest-professional_medicine|5": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "f5a22012a54f70ea", + "hash_cont_tokens": "7cfee54dbddd5a98" + }, + "harness|hendrycksTest-professional_psychology|5": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "0dfb73a8eb3f692c", + "hash_cont_tokens": "a86677b2a45c20e1" + }, + "harness|hendrycksTest-public_relations|5": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "1710c6ba4c9f3cbd", + "hash_cont_tokens": "0d756ccaae031757" + }, + "harness|hendrycksTest-security_studies|5": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "d49711415961ced7", + "hash_cont_tokens": "b2229bc2cfbf594b" + }, + "harness|hendrycksTest-sociology|5": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "828999f7624cbe7e", + "hash_cont_tokens": "c3a3bdfd177eed5b" + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "42054621e718dbee", + "hash_cont_tokens": "50421e30bef398f9" + }, + "harness|hendrycksTest-virology|5": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "6c4f0aa4dc859c04", + "hash_cont_tokens": "af8b3658088cb37f" + }, + "harness|hendrycksTest-world_religions|5": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "6c75d44e092ff24f", + "hash_cont_tokens": "060118bef6de4e0a" + }, + "harness|truthfulqa:mc|0": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "2738d7ed7075faa7", + "hash_cont_tokens": "f5da56a132aab151" + } + } +} \ No newline at end of file diff --git a/eval-results/TheBloke/dromedary-65b-lora-HF/results_2023-10-15T03-08-41.091963.json b/eval-results/TheBloke/dromedary-65b-lora-HF/results_2023-10-15T03-08-41.091963.json new file mode 100644 index 0000000000000000000000000000000000000000..06a1ae0f4fead676fe74369e826e7afb1b01374f --- /dev/null +++ b/eval-results/TheBloke/dromedary-65b-lora-HF/results_2023-10-15T03-08-41.091963.json @@ -0,0 +1,107 @@ +{ + "config_general": { + 
"model_name": "TheBloke/dromedary-65b-lora-HF", + "model_sha": "3fa4546259d6bbd6b5d637484c325ab19181a73c", + "model_size": "121.68 GB", + "model_dtype": "torch.float16", + "lighteval_sha": "0f318ecf002208468154899217b3ba7c6ae09374", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|drop|3": { + "em": 0.001572986577181208, + "em_stderr": 0.0004058451132417735, + "f1": 0.058895763422818985, + "f1_stderr": 0.0012985937732460785 + }, + "harness|gsm8k|5": { + "acc": 0.2744503411675512, + "acc_stderr": 0.012291581170814905 + }, + "harness|winogrande|5": { + "acc": 0.7892659826361483, + "acc_stderr": 0.011462046419710674 + }, + "all": { + "em": 0.001572986577181208, + "em_stderr": 0.0004058451132417735, + "f1": 0.058895763422818985, + "f1_stderr": 0.0012985937732460785, + "acc": 0.5318581619018498, + "acc_stderr": 0.01187681379526279 + } + }, + "versions": { + "harness|drop|3": 1, + "harness|gsm8k|5": 0, + "harness|winogrande|5": 0, + "all": 0 + }, + "config_tasks": { + "harness|drop": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|drop|3": { + "hashes": { + "hash_examples": "1d27416e8324e9a3", + "hash_full_prompts": "a5513ff9a741b385", + "hash_input_tokens": "61b608e0b5ceed76", + "hash_cont_tokens": "83b6ca4bc4b8cd66" + }, + "truncated": 1263, + "non-truncated": 8273, + "padded": 0, + "non-padded": 9536, + "effective_few_shots": 3.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "bda342e47b5099b2", + "hash_cont_tokens": "ed1249059e863227" + }, + "truncated": 0, + "non-truncated": 1319, + "padded": 0, + "non-padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "c0bedf98cb040854", + "hash_cont_tokens": "f08975ad6f2d5864" + }, + "truncated": 0, + "non-truncated": 2534, + "padded": 2432, + "non-padded": 102, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "9b4d8993161e637d", + "hash_full_prompts": "08215e527b7e60a5", + "hash_input_tokens": "80afe720f936f8d2", + "hash_cont_tokens": "28d0eb91243e500c" + }, + "total_evaluation_time_secondes": "45447.040060043335", + "truncated": 1263, + "non-truncated": 12126, + "padded": 2432, + "non-padded": 10957, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/TheBloke/fiction.live-Kimiko-V2-70B-fp16/results_2023-08-31T20-41-25.940897.json b/eval-results/TheBloke/fiction.live-Kimiko-V2-70B-fp16/results_2023-08-31T20-41-25.940897.json new file mode 100644 index 0000000000000000000000000000000000000000..d4fb9498676213ff64c8dcb31d57ef84bb92dcda --- /dev/null +++ b/eval-results/TheBloke/fiction.live-Kimiko-V2-70B-fp16/results_2023-08-31T20-41-25.940897.json @@ -0,0 +1,1366 @@ +{ + "config_general": { + "model_name": "TheBloke/fiction.live-Kimiko-V2-70B-fp16", + "model_sha": "6b0c2cb654133cad2d4920e7da2e3f6cb1c4f7fd", + "model_dtype": "torch.float16", + "lighteval_sha": "4b9a38c2102259daeb17d1d0294fc2b4ce7f4e63", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + 
"harness|arc:challenge|25": { + "acc": 0.6399317406143344, + "acc_stderr": 0.014027516814585188, + "acc_norm": 0.6766211604095563, + "acc_norm_stderr": 0.013669421630012127 + }, + "harness|hellaswag|10": { + "acc": 0.6801433977295359, + "acc_stderr": 0.0046546756068415514, + "acc_norm": 0.8765186217884884, + "acc_norm_stderr": 0.003283165867631371 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.35, + "acc_stderr": 0.0479372485441102, + "acc_norm": 0.35, + "acc_norm_stderr": 0.0479372485441102 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.6296296296296297, + "acc_stderr": 0.04171654161354543, + "acc_norm": 0.6296296296296297, + "acc_norm_stderr": 0.04171654161354543 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.8223684210526315, + "acc_stderr": 0.03110318238312338, + "acc_norm": 0.8223684210526315, + "acc_norm_stderr": 0.03110318238312338 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.73, + "acc_stderr": 0.04461960433384741, + "acc_norm": 0.73, + "acc_norm_stderr": 0.04461960433384741 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.7320754716981132, + "acc_stderr": 0.027257260322494845, + "acc_norm": 0.7320754716981132, + "acc_norm_stderr": 0.027257260322494845 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.8333333333333334, + "acc_stderr": 0.031164899666948617, + "acc_norm": 0.8333333333333334, + "acc_norm_stderr": 0.031164899666948617 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.48, + "acc_stderr": 0.050211673156867795, + "acc_norm": 0.48, + "acc_norm_stderr": 0.050211673156867795 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.58, + "acc_stderr": 0.049604496374885836, + "acc_norm": 0.58, + "acc_norm_stderr": 0.049604496374885836 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.4, + "acc_stderr": 0.049236596391733084, + "acc_norm": 0.4, + "acc_norm_stderr": 0.049236596391733084 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.6473988439306358, + "acc_stderr": 0.036430371689585475, + "acc_norm": 0.6473988439306358, + "acc_norm_stderr": 0.036430371689585475 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.35294117647058826, + "acc_stderr": 0.04755129616062946, + "acc_norm": 0.35294117647058826, + "acc_norm_stderr": 0.04755129616062946 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.79, + "acc_stderr": 0.040936018074033256, + "acc_norm": 0.79, + "acc_norm_stderr": 0.040936018074033256 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.6723404255319149, + "acc_stderr": 0.030683020843231008, + "acc_norm": 0.6723404255319149, + "acc_norm_stderr": 0.030683020843231008 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.4298245614035088, + "acc_stderr": 0.04657047260594962, + "acc_norm": 0.4298245614035088, + "acc_norm_stderr": 0.04657047260594962 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.6206896551724138, + "acc_stderr": 0.040434618619167466, + "acc_norm": 0.6206896551724138, + "acc_norm_stderr": 0.040434618619167466 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.4365079365079365, + "acc_stderr": 0.025542846817400492, + "acc_norm": 0.4365079365079365, + "acc_norm_stderr": 0.025542846817400492 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.5, + "acc_stderr": 0.04472135954999579, + "acc_norm": 0.5, + "acc_norm_stderr": 0.04472135954999579 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.5, + "acc_stderr": 
0.050251890762960605, + "acc_norm": 0.5, + "acc_norm_stderr": 0.050251890762960605 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.8096774193548387, + "acc_stderr": 0.022331707611823074, + "acc_norm": 0.8096774193548387, + "acc_norm_stderr": 0.022331707611823074 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.5270935960591133, + "acc_stderr": 0.03512819077876106, + "acc_norm": 0.5270935960591133, + "acc_norm_stderr": 0.03512819077876106 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.76, + "acc_stderr": 0.042923469599092816, + "acc_norm": 0.76, + "acc_norm_stderr": 0.042923469599092816 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.8303030303030303, + "acc_stderr": 0.029311188674983134, + "acc_norm": 0.8303030303030303, + "acc_norm_stderr": 0.029311188674983134 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.8787878787878788, + "acc_stderr": 0.023253157951942084, + "acc_norm": 0.8787878787878788, + "acc_norm_stderr": 0.023253157951942084 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.9378238341968912, + "acc_stderr": 0.017426974154240528, + "acc_norm": 0.9378238341968912, + "acc_norm_stderr": 0.017426974154240528 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.7153846153846154, + "acc_stderr": 0.0228783227997063, + "acc_norm": 0.7153846153846154, + "acc_norm_stderr": 0.0228783227997063 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.35555555555555557, + "acc_stderr": 0.0291857149498574, + "acc_norm": 0.35555555555555557, + "acc_norm_stderr": 0.0291857149498574 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.7605042016806722, + "acc_stderr": 0.027722065493361255, + "acc_norm": 0.7605042016806722, + "acc_norm_stderr": 0.027722065493361255 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.4105960264900662, + "acc_stderr": 0.04016689594849928, + "acc_norm": 0.4105960264900662, + "acc_norm_stderr": 0.04016689594849928 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.8844036697247707, + "acc_stderr": 0.01370874953417264, + "acc_norm": 0.8844036697247707, + "acc_norm_stderr": 0.01370874953417264 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.6157407407407407, + "acc_stderr": 0.03317354514310742, + "acc_norm": 0.6157407407407407, + "acc_norm_stderr": 0.03317354514310742 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.9068627450980392, + "acc_stderr": 0.020397853969427, + "acc_norm": 0.9068627450980392, + "acc_norm_stderr": 0.020397853969427 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.8776371308016878, + "acc_stderr": 0.021331741829746786, + "acc_norm": 0.8776371308016878, + "acc_norm_stderr": 0.021331741829746786 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.8026905829596412, + "acc_stderr": 0.02670985334496796, + "acc_norm": 0.8026905829596412, + "acc_norm_stderr": 0.02670985334496796 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.8702290076335878, + "acc_stderr": 0.029473649496907065, + "acc_norm": 0.8702290076335878, + "acc_norm_stderr": 0.029473649496907065 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.8760330578512396, + "acc_stderr": 0.030083098716035202, + "acc_norm": 0.8760330578512396, + "acc_norm_stderr": 0.030083098716035202 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.8240740740740741, + 
"acc_stderr": 0.036809181416738807, + "acc_norm": 0.8240740740740741, + "acc_norm_stderr": 0.036809181416738807 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.8220858895705522, + "acc_stderr": 0.03004735765580663, + "acc_norm": 0.8220858895705522, + "acc_norm_stderr": 0.03004735765580663 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.5714285714285714, + "acc_stderr": 0.04697113923010213, + "acc_norm": 0.5714285714285714, + "acc_norm_stderr": 0.04697113923010213 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.8349514563106796, + "acc_stderr": 0.03675668832233188, + "acc_norm": 0.8349514563106796, + "acc_norm_stderr": 0.03675668832233188 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.8974358974358975, + "acc_stderr": 0.01987565502786746, + "acc_norm": 0.8974358974358975, + "acc_norm_stderr": 0.01987565502786746 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.74, + "acc_stderr": 0.04408440022768077, + "acc_norm": 0.74, + "acc_norm_stderr": 0.04408440022768077 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.8659003831417624, + "acc_stderr": 0.012185528166499976, + "acc_norm": 0.8659003831417624, + "acc_norm_stderr": 0.012185528166499976 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.7803468208092486, + "acc_stderr": 0.02228963885261789, + "acc_norm": 0.7803468208092486, + "acc_norm_stderr": 0.02228963885261789 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.43575418994413406, + "acc_stderr": 0.016583881958602394, + "acc_norm": 0.43575418994413406, + "acc_norm_stderr": 0.016583881958602394 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.7745098039215687, + "acc_stderr": 0.02392915551735128, + "acc_norm": 0.7745098039215687, + "acc_norm_stderr": 0.02392915551735128 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.7684887459807074, + "acc_stderr": 0.023956532766639133, + "acc_norm": 0.7684887459807074, + "acc_norm_stderr": 0.023956532766639133 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.8333333333333334, + "acc_stderr": 0.020736358408060002, + "acc_norm": 0.8333333333333334, + "acc_norm_stderr": 0.020736358408060002 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.5425531914893617, + "acc_stderr": 0.029719281272236834, + "acc_norm": 0.5425531914893617, + "acc_norm_stderr": 0.029719281272236834 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.5365058670143416, + "acc_stderr": 0.012736153390214966, + "acc_norm": 0.5365058670143416, + "acc_norm_stderr": 0.012736153390214966 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.7536764705882353, + "acc_stderr": 0.02617343857052, + "acc_norm": 0.7536764705882353, + "acc_norm_stderr": 0.02617343857052 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.7565359477124183, + "acc_stderr": 0.017362473762146623, + "acc_norm": 0.7565359477124183, + "acc_norm_stderr": 0.017362473762146623 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.7, + "acc_stderr": 0.04389311454644287, + "acc_norm": 0.7, + "acc_norm_stderr": 0.04389311454644287 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.7959183673469388, + "acc_stderr": 0.025801283475090492, + "acc_norm": 0.7959183673469388, + "acc_norm_stderr": 0.025801283475090492 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.8955223880597015, + "acc_stderr": 0.021628920516700637, + "acc_norm": 0.8955223880597015, + "acc_norm_stderr": 0.021628920516700637 + }, + 
"harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.92, + "acc_stderr": 0.0272659924344291, + "acc_norm": 0.92, + "acc_norm_stderr": 0.0272659924344291 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.5542168674698795, + "acc_stderr": 0.03869543323472101, + "acc_norm": 0.5542168674698795, + "acc_norm_stderr": 0.03869543323472101 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.8654970760233918, + "acc_stderr": 0.026168221344662297, + "acc_norm": 0.8654970760233918, + "acc_norm_stderr": 0.026168221344662297 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.34761321909424725, + "mc1_stderr": 0.016670769188897303, + "mc2": 0.4927605148296666, + "mc2_stderr": 0.014223853503163776 + }, + "all": { + "acc": 0.6968685258873177, + "acc_stderr": 0.030907315261458148, + "acc_norm": 0.700818774088236, + "acc_norm_stderr": 0.030877999923766906, + "mc1": 0.34761321909424725, + "mc1_stderr": 0.016670769188897303, + "mc2": 0.4927605148296666, + "mc2_stderr": 0.014223853503163776 + } + }, + "versions": { + "harness|arc:challenge|25": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + 
"harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "all": 0 + }, + "config_tasks": { + "harness|arc:challenge": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness 
task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task" + }, + "summary_tasks": { + "harness|arc:challenge|25": { + "hashes": { + "hash_examples": "17b0cae357c0259e", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "3722289b79076c44", + "hash_cont_tokens": "ede2b335438f08e9" + }, + "truncated": 0, + "non-truncated": 4687, + "padded": 4687, + "non-padded": 0, + "effective_few_shots": 25.0, + "num_truncated_few_shots": 0 + }, + "harness|hellaswag|10": { + "hashes": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "ececd684171f1ef2", + "hash_cont_tokens": "b41cf1ad182d68d5" + }, + "truncated": 0, + "non-truncated": 40168, + "padded": 40113, + "non-padded": 55, + "effective_few_shots": 10.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hashes": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "c54ff61ad0273dd7", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-anatomy|5": { + "hashes": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "be31a1e22aef5f90", + "hash_cont_tokens": "f11971a765cb609f" + }, + "truncated": 0, + "non-truncated": 540, + "padded": 540, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-astronomy|5": { + "hashes": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "277a7b1fad566940", + "hash_cont_tokens": "238bd86950544b29" + }, + "truncated": 0, + "non-truncated": 608, + "padded": 608, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-business_ethics|5": { + "hashes": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "ba552605bc116de5", + "hash_cont_tokens": "f9d6d2a7d7e9a041" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hashes": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "428c7563d0b98ab9", + "hash_cont_tokens": "6af58623d0d5fbcd" + }, + "truncated": 0, + "non-truncated": 1060, + "padded": 1060, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_biology|5": { + "hashes": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "da036601573942e2", 
+ "hash_cont_tokens": "875cde3af7a0ee14" + }, + "truncated": 0, + "non-truncated": 576, + "padded": 576, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_chemistry|5": { + "hashes": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "94e0196d6aded13d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_computer_science|5": { + "hashes": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "6e4d0f4a8d36690b", + "hash_cont_tokens": "1ba0c71186b1505e" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_mathematics|5": { + "hashes": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "614054d17109a25d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_medicine|5": { + "hashes": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "081bb2b524defd1c", + "hash_cont_tokens": "702fb6d82ff0d6ac" + }, + "truncated": 0, + "non-truncated": 692, + "padded": 692, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_physics|5": { + "hashes": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "5421d9a1af86cbd4", + "hash_cont_tokens": "f7b8097afc16a47c" + }, + "truncated": 0, + "non-truncated": 408, + "padded": 408, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-computer_security|5": { + "hashes": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "5e6b70ecb333cf18", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hashes": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "c2ef11a87264ceed", + "hash_cont_tokens": "aa0e8bc655f2f641" + }, + "truncated": 0, + "non-truncated": 940, + "padded": 940, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-econometrics|5": { + "hashes": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "ecaccd912a4c3978", + "hash_cont_tokens": "a9b1f761089f6acc" + }, + "truncated": 0, + "non-truncated": 456, + "padded": 456, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hashes": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "1590c84291399be8", + "hash_cont_tokens": "2425a3f084a591ef" + }, + "truncated": 0, + "non-truncated": 580, + "padded": 580, + "non-padded": 0, + 
"effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hashes": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "3269597f715b0da1", + "hash_cont_tokens": "eb2d5002052b5bc5" + }, + "truncated": 0, + "non-truncated": 1512, + "padded": 1512, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-formal_logic|5": { + "hashes": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "a2800d20f3ab8d7c", + "hash_cont_tokens": "9b30dc19c9b62f60" + }, + "truncated": 0, + "non-truncated": 504, + "padded": 504, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-global_facts|5": { + "hashes": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "94ed44b3772505ad", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_biology|5": { + "hashes": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "24423acb928db768", + "hash_cont_tokens": "74217a4e2868536f" + }, + "truncated": 0, + "non-truncated": 1240, + "padded": 1240, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hashes": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "831ff35c474e5cef", + "hash_cont_tokens": "bf39544be0ebf000" + }, + "truncated": 0, + "non-truncated": 812, + "padded": 812, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hashes": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "a20a96b44dcc5b30", + "hash_cont_tokens": "43570b3948564b64" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hashes": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "5002f4ac8b1562ca", + "hash_cont_tokens": "674fc454bdc5ac93" + }, + "truncated": 0, + "non-truncated": 660, + "padded": 656, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_geography|5": { + "hashes": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "7c5547c7da5bc793", + "hash_cont_tokens": "03a5012b916274ea" + }, + "truncated": 0, + "non-truncated": 792, + "padded": 792, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hashes": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "f62991cb6a496b05", + "hash_cont_tokens": "50ab225c2f535210" + }, + "truncated": 0, + "non-truncated": 772, + "padded": 772, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + 
"harness|hendrycksTest-high_school_macroeconomics|5": { + "hashes": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "4cef2aff6e3d59ed", + "hash_cont_tokens": "c583432ad27fcfe0" + }, + "truncated": 0, + "non-truncated": 1560, + "padded": 1560, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hashes": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "6e2577ea4082ed2b", + "hash_cont_tokens": "1194078d4e38c984" + }, + "truncated": 0, + "non-truncated": 1080, + "padded": 1080, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hashes": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "c5fc9aeb1079c8e4", + "hash_cont_tokens": "f47f041de50333b9" + }, + "truncated": 0, + "non-truncated": 952, + "padded": 952, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_physics|5": { + "hashes": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "555fc385cffa84ca", + "hash_cont_tokens": "6296151cf7fee15c" + }, + "truncated": 0, + "non-truncated": 604, + "padded": 604, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hashes": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "febd23cbf9973b7f", + "hash_cont_tokens": "a490d3db0ea5935a" + }, + "truncated": 0, + "non-truncated": 2180, + "padded": 2180, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hashes": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "400e55b56ee6fbd7", + "hash_cont_tokens": "6830ef7d0325d7ef" + }, + "truncated": 0, + "non-truncated": 864, + "padded": 864, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hashes": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "c639cce12a46ebad", + "hash_cont_tokens": "cdd0b3dc06d933e5" + }, + "truncated": 0, + "non-truncated": 816, + "padded": 816, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hashes": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "b9762065cce6f3a6", + "hash_cont_tokens": "e0203e3fc1bb0500" + }, + "truncated": 0, + "non-truncated": 948, + "padded": 948, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_aging|5": { + "hashes": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "541a75f071dcf579", + "hash_cont_tokens": "142a4a8a1138a214" + }, + "truncated": 0, + "non-truncated": 892, + "padded": 892, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_sexuality|5": { + "hashes": { + "hash_examples": "7acb8fdad97f88a6", + 
"hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "04269e5c5a257dd9", + "hash_cont_tokens": "bc54813e809b796d" + }, + "truncated": 0, + "non-truncated": 524, + "padded": 524, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-international_law|5": { + "hashes": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "d93ba9d9d38e4397", + "hash_cont_tokens": "63435df622d5437b" + }, + "truncated": 0, + "non-truncated": 484, + "padded": 484, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-jurisprudence|5": { + "hashes": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "9eeaccd2698b4f5a", + "hash_cont_tokens": "e3a8cd951b6e3469" + }, + "truncated": 0, + "non-truncated": 432, + "padded": 432, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hashes": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "b4f08f544f2b7576", + "hash_cont_tokens": "5e6ee2ff0404f23c" + }, + "truncated": 0, + "non-truncated": 652, + "padded": 648, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-machine_learning|5": { + "hashes": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "900c2a51f1174b9f", + "hash_cont_tokens": "c81919424db3b267" + }, + "truncated": 0, + "non-truncated": 448, + "padded": 448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-management|5": { + "hashes": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "6b36efb4689c6eca", + "hash_cont_tokens": "a01d6d39a83c4597" + }, + "truncated": 0, + "non-truncated": 412, + "padded": 412, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-marketing|5": { + "hashes": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "2aaac78a0cfed47a", + "hash_cont_tokens": "6aeaed4d823c98aa" + }, + "truncated": 0, + "non-truncated": 936, + "padded": 936, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-medical_genetics|5": { + "hashes": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "886ca823b41c094a", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-miscellaneous|5": { + "hashes": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "72fd71de7675e7d0", + "hash_cont_tokens": "9b0ab02a64603081" + }, + "truncated": 0, + "non-truncated": 3132, + "padded": 3132, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_disputes|5": { + "hashes": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "f3ca0dd8e7a1eb09", + "hash_cont_tokens": "3b8bbe9108e55ce9" + }, + "truncated": 0, + "non-truncated": 1384, + 
"padded": 1354, + "non-padded": 30, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hashes": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "3e793631e951f23c", + "hash_cont_tokens": "2eae753a177d5460" + }, + "truncated": 0, + "non-truncated": 3580, + "padded": 3580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-nutrition|5": { + "hashes": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "59753c2144ea93af", + "hash_cont_tokens": "29771089bd3c65c6" + }, + "truncated": 0, + "non-truncated": 1224, + "padded": 1224, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-philosophy|5": { + "hashes": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "bd8d3dbed15a8c34", + "hash_cont_tokens": "9f6ff69d23a48783" + }, + "truncated": 0, + "non-truncated": 1244, + "padded": 1244, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-prehistory|5": { + "hashes": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "3573cd87facbb7c5", + "hash_cont_tokens": "a789a13af22308bf" + }, + "truncated": 0, + "non-truncated": 1296, + "padded": 1296, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_accounting|5": { + "hashes": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "17e721bc1a7cbb47", + "hash_cont_tokens": "5129a9cfb30c5239" + }, + "truncated": 0, + "non-truncated": 1128, + "padded": 1128, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_law|5": { + "hashes": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "c9f7583fff66d361", + "hash_cont_tokens": "2e590029ef41fbcd" + }, + "truncated": 0, + "non-truncated": 6136, + "padded": 6136, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_medicine|5": { + "hashes": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "40a933f829116f8d", + "hash_cont_tokens": "cd82e108370cece8" + }, + "truncated": 0, + "non-truncated": 1088, + "padded": 1088, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_psychology|5": { + "hashes": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "0dfb73a8eb3f692c", + "hash_cont_tokens": "61ef0c8a87f9c92d" + }, + "truncated": 0, + "non-truncated": 2448, + "padded": 2448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-public_relations|5": { + "hashes": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "1710c6ba4c9f3cbd", + "hash_cont_tokens": "568f585a259965c1" + }, + "truncated": 0, + "non-truncated": 440, + "padded": 440, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + 
"harness|hendrycksTest-security_studies|5": { + "hashes": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "32a03f1f22a6e103", + "hash_cont_tokens": "d70cfe096d4fb7bd" + }, + "truncated": 0, + "non-truncated": 980, + "padded": 980, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-sociology|5": { + "hashes": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "828999f7624cbe7e", + "hash_cont_tokens": "c3a3bdfd177eed5b" + }, + "truncated": 0, + "non-truncated": 804, + "padded": 804, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hashes": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "42054621e718dbee", + "hash_cont_tokens": "2568d0e8e36fa959" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-virology|5": { + "hashes": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "6c4f0aa4dc859c04", + "hash_cont_tokens": "c178cccd753d9bc5" + }, + "truncated": 0, + "non-truncated": 664, + "padded": 664, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-world_religions|5": { + "hashes": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "6c75d44e092ff24f", + "hash_cont_tokens": "0a3a3ea5ef49d19c" + }, + "truncated": 0, + "non-truncated": 684, + "padded": 684, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|truthfulqa:mc|0": { + "hashes": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "2738d7ed7075faa7", + "hash_cont_tokens": "6d1691881e252df0" + }, + "truncated": 0, + "non-truncated": 9996, + "padded": 9996, + "non-padded": 0, + "effective_few_shots": 0.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "d84d18e9a963753d", + "hash_full_prompts": "12b540783521a8e6", + "hash_input_tokens": "5c73a7dce6ccf737", + "hash_cont_tokens": "f4b7b7f3a2788768" + }, + "total_evaluation_time_secondes": "43799.29346227646", + "truncated": 0, + "non-truncated": 111019, + "padded": 110926, + "non-padded": 93, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/TheBloke/fiction.live-Kimiko-V2-70B-fp16/results_2023-10-23T10-02-44.747886.json b/eval-results/TheBloke/fiction.live-Kimiko-V2-70B-fp16/results_2023-10-23T10-02-44.747886.json new file mode 100644 index 0000000000000000000000000000000000000000..a30280f7f0eae285d3bb649ede72ef86fe4add0f --- /dev/null +++ b/eval-results/TheBloke/fiction.live-Kimiko-V2-70B-fp16/results_2023-10-23T10-02-44.747886.json @@ -0,0 +1,107 @@ +{ + "config_general": { + "model_name": "TheBloke/fiction.live-Kimiko-V2-70B-fp16", + "model_sha": "0645ebb33a45a8511cdab29e995a535ee3fe7dca", + "model_size": "128.64 GB", + "model_dtype": "torch.float16", + "lighteval_sha": "0f318ecf002208468154899217b3ba7c6ae09374", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|drop|3": { + "em": 
0.001572986577181208, + "em_stderr": 0.00040584511324177344, + "f1": 0.06689072986577178, + "f1_stderr": 0.0013705945295387344 + }, + "harness|gsm8k|5": { + "acc": 0.3457164518574678, + "acc_stderr": 0.01310042299044158 + }, + "harness|winogrande|5": { + "acc": 0.8389897395422258, + "acc_stderr": 0.010329712832785717 + }, + "all": { + "em": 0.001572986577181208, + "em_stderr": 0.00040584511324177344, + "f1": 0.06689072986577178, + "f1_stderr": 0.0013705945295387344, + "acc": 0.5923530956998468, + "acc_stderr": 0.011715067911613648 + } + }, + "versions": { + "harness|drop|3": 1, + "harness|gsm8k|5": 0, + "harness|winogrande|5": 0, + "all": 0 + }, + "config_tasks": { + "harness|drop": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|drop|3": { + "hashes": { + "hash_examples": "1d27416e8324e9a3", + "hash_full_prompts": "a5513ff9a741b385", + "hash_input_tokens": "42076f0efbb50aa6", + "hash_cont_tokens": "ccd5a4848be2b928" + }, + "truncated": 3, + "non-truncated": 9533, + "padded": 0, + "non-padded": 9536, + "effective_few_shots": 3.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "bda342e47b5099b2", + "hash_cont_tokens": "d9203f306a771fbb" + }, + "truncated": 0, + "non-truncated": 1319, + "padded": 0, + "non-padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "c0bedf98cb040854", + "hash_cont_tokens": "f08975ad6f2d5864" + }, + "truncated": 0, + "non-truncated": 2534, + "padded": 2432, + "non-padded": 102, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "9b4d8993161e637d", + "hash_full_prompts": "08215e527b7e60a5", + "hash_input_tokens": "a12f3e3c934bd78b", + "hash_cont_tokens": "1b9062a75f765259" + }, + "total_evaluation_time_secondes": "45535.27230644226", + "truncated": 3, + "non-truncated": 13386, + "padded": 2432, + "non-padded": 10957, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/TheBloke/gpt4-alpaca-lora-13B-HF/results_2023-07-19T19-32-00.745427.json b/eval-results/TheBloke/gpt4-alpaca-lora-13B-HF/results_2023-07-19T19-32-00.745427.json new file mode 100644 index 0000000000000000000000000000000000000000..73ca70e9c898ba60c8e1050e5b744e47317fc615 --- /dev/null +++ b/eval-results/TheBloke/gpt4-alpaca-lora-13B-HF/results_2023-07-19T19-32-00.745427.json @@ -0,0 +1,871 @@ +{ + "results": { + "harness|arc:challenge|25": { + "acc": 0.5716723549488054, + "acc_stderr": 0.01446049636759902, + "acc_norm": 0.5955631399317406, + "acc_norm_stderr": 0.014342036483436175 + }, + "harness|hellaswag|10": { + "acc": 0.6233817964548894, + "acc_stderr": 0.004835475957610925, + "acc_norm": 0.8208524198366859, + "acc_norm_stderr": 0.003826921299075402 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.3, + "acc_stderr": 0.046056618647183814, + "acc_norm": 0.3, + "acc_norm_stderr": 0.046056618647183814 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.4888888888888889, + "acc_stderr": 0.04318275491977976, + "acc_norm": 0.4888888888888889, + "acc_norm_stderr": 0.04318275491977976 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.47368421052631576, + "acc_stderr": 
0.04063302731486671, + "acc_norm": 0.47368421052631576, + "acc_norm_stderr": 0.04063302731486671 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.46, + "acc_stderr": 0.05009082659620333, + "acc_norm": 0.46, + "acc_norm_stderr": 0.05009082659620333 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.4716981132075472, + "acc_stderr": 0.0307235352490061, + "acc_norm": 0.4716981132075472, + "acc_norm_stderr": 0.0307235352490061 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.4652777777777778, + "acc_stderr": 0.04171115858181618, + "acc_norm": 0.4652777777777778, + "acc_norm_stderr": 0.04171115858181618 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.31, + "acc_stderr": 0.04648231987117316, + "acc_norm": 0.31, + "acc_norm_stderr": 0.04648231987117316 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.45, + "acc_stderr": 0.04999999999999999, + "acc_norm": 0.45, + "acc_norm_stderr": 0.04999999999999999 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.32, + "acc_stderr": 0.04688261722621504, + "acc_norm": 0.32, + "acc_norm_stderr": 0.04688261722621504 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.4046242774566474, + "acc_stderr": 0.03742461193887248, + "acc_norm": 0.4046242774566474, + "acc_norm_stderr": 0.03742461193887248 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.24509803921568626, + "acc_stderr": 0.04280105837364395, + "acc_norm": 0.24509803921568626, + "acc_norm_stderr": 0.04280105837364395 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.61, + "acc_stderr": 0.04902071300001975, + "acc_norm": 0.61, + "acc_norm_stderr": 0.04902071300001975 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.4340425531914894, + "acc_stderr": 0.03240038086792747, + "acc_norm": 0.4340425531914894, + "acc_norm_stderr": 0.03240038086792747 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.2894736842105263, + "acc_stderr": 0.04266339443159394, + "acc_norm": 0.2894736842105263, + "acc_norm_stderr": 0.04266339443159394 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.38620689655172413, + "acc_stderr": 0.04057324734419034, + "acc_norm": 0.38620689655172413, + "acc_norm_stderr": 0.04057324734419034 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.24338624338624337, + "acc_stderr": 0.022101128787415433, + "acc_norm": 0.24338624338624337, + "acc_norm_stderr": 0.022101128787415433 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.3253968253968254, + "acc_stderr": 0.04190596438871136, + "acc_norm": 0.3253968253968254, + "acc_norm_stderr": 0.04190596438871136 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.34, + "acc_stderr": 0.04760952285695235, + "acc_norm": 0.34, + "acc_norm_stderr": 0.04760952285695235 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.5, + "acc_stderr": 0.028444006199428714, + "acc_norm": 0.5, + "acc_norm_stderr": 0.028444006199428714 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.31527093596059114, + "acc_stderr": 0.03269080871970186, + "acc_norm": 0.31527093596059114, + "acc_norm_stderr": 0.03269080871970186 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.48, + "acc_stderr": 0.050211673156867795, + "acc_norm": 0.48, + "acc_norm_stderr": 0.050211673156867795 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.6242424242424243, + "acc_stderr": 0.037818873532059816, + "acc_norm": 
0.6242424242424243, + "acc_norm_stderr": 0.037818873532059816 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.5505050505050505, + "acc_stderr": 0.035441324919479704, + "acc_norm": 0.5505050505050505, + "acc_norm_stderr": 0.035441324919479704 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.6373056994818653, + "acc_stderr": 0.034697137917043715, + "acc_norm": 0.6373056994818653, + "acc_norm_stderr": 0.034697137917043715 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.4282051282051282, + "acc_stderr": 0.025088301454694834, + "acc_norm": 0.4282051282051282, + "acc_norm_stderr": 0.025088301454694834 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.23333333333333334, + "acc_stderr": 0.02578787422095932, + "acc_norm": 0.23333333333333334, + "acc_norm_stderr": 0.02578787422095932 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.4411764705882353, + "acc_stderr": 0.0322529423239964, + "acc_norm": 0.4411764705882353, + "acc_norm_stderr": 0.0322529423239964 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.26490066225165565, + "acc_stderr": 0.03603038545360383, + "acc_norm": 0.26490066225165565, + "acc_norm_stderr": 0.03603038545360383 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.6201834862385321, + "acc_stderr": 0.020808825617866244, + "acc_norm": 0.6201834862385321, + "acc_norm_stderr": 0.020808825617866244 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.22685185185185186, + "acc_stderr": 0.028561650102422283, + "acc_norm": 0.22685185185185186, + "acc_norm_stderr": 0.028561650102422283 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.5784313725490197, + "acc_stderr": 0.034658681963807614, + "acc_norm": 0.5784313725490197, + "acc_norm_stderr": 0.034658681963807614 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.6877637130801688, + "acc_stderr": 0.030165137867847004, + "acc_norm": 0.6877637130801688, + "acc_norm_stderr": 0.030165137867847004 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.5426008968609866, + "acc_stderr": 0.033435777055830646, + "acc_norm": 0.5426008968609866, + "acc_norm_stderr": 0.033435777055830646 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.5725190839694656, + "acc_stderr": 0.043389203057924, + "acc_norm": 0.5725190839694656, + "acc_norm_stderr": 0.043389203057924 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.6611570247933884, + "acc_stderr": 0.04320767807536671, + "acc_norm": 0.6611570247933884, + "acc_norm_stderr": 0.04320767807536671 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.48148148148148145, + "acc_stderr": 0.04830366024635331, + "acc_norm": 0.48148148148148145, + "acc_norm_stderr": 0.04830366024635331 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.5398773006134969, + "acc_stderr": 0.03915857291436971, + "acc_norm": 0.5398773006134969, + "acc_norm_stderr": 0.03915857291436971 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.375, + "acc_stderr": 0.04595091388086298, + "acc_norm": 0.375, + "acc_norm_stderr": 0.04595091388086298 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.6407766990291263, + "acc_stderr": 0.04750458399041695, + "acc_norm": 0.6407766990291263, + "acc_norm_stderr": 0.04750458399041695 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.7435897435897436, + "acc_stderr": 0.028605953702004243, + "acc_norm": 
0.7435897435897436, + "acc_norm_stderr": 0.028605953702004243 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.52, + "acc_stderr": 0.05021167315686779, + "acc_norm": 0.52, + "acc_norm_stderr": 0.05021167315686779 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.6628352490421456, + "acc_stderr": 0.016905207420803547, + "acc_norm": 0.6628352490421456, + "acc_norm_stderr": 0.016905207420803547 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.5202312138728323, + "acc_stderr": 0.026897049996382868, + "acc_norm": 0.5202312138728323, + "acc_norm_stderr": 0.026897049996382868 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.23910614525139665, + "acc_stderr": 0.014265554192331144, + "acc_norm": 0.23910614525139665, + "acc_norm_stderr": 0.014265554192331144 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.5228758169934641, + "acc_stderr": 0.028599936776089782, + "acc_norm": 0.5228758169934641, + "acc_norm_stderr": 0.028599936776089782 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.5401929260450161, + "acc_stderr": 0.028306190403305693, + "acc_norm": 0.5401929260450161, + "acc_norm_stderr": 0.028306190403305693 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.5216049382716049, + "acc_stderr": 0.027794760105008736, + "acc_norm": 0.5216049382716049, + "acc_norm_stderr": 0.027794760105008736 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.36524822695035464, + "acc_stderr": 0.028723863853281278, + "acc_norm": 0.36524822695035464, + "acc_norm_stderr": 0.028723863853281278 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.38070404172099087, + "acc_stderr": 0.012401430654645875, + "acc_norm": 0.38070404172099087, + "acc_norm_stderr": 0.012401430654645875 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.5220588235294118, + "acc_stderr": 0.03034326422421352, + "acc_norm": 0.5220588235294118, + "acc_norm_stderr": 0.03034326422421352 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.49673202614379086, + "acc_stderr": 0.020227402794434867, + "acc_norm": 0.49673202614379086, + "acc_norm_stderr": 0.020227402794434867 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.5818181818181818, + "acc_stderr": 0.04724577405731571, + "acc_norm": 0.5818181818181818, + "acc_norm_stderr": 0.04724577405731571 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.5306122448979592, + "acc_stderr": 0.031949171367580624, + "acc_norm": 0.5306122448979592, + "acc_norm_stderr": 0.031949171367580624 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.6567164179104478, + "acc_stderr": 0.03357379665433431, + "acc_norm": 0.6567164179104478, + "acc_norm_stderr": 0.03357379665433431 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.76, + "acc_stderr": 0.042923469599092816, + "acc_norm": 0.76, + "acc_norm_stderr": 0.042923469599092816 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.42168674698795183, + "acc_stderr": 0.03844453181770917, + "acc_norm": 0.42168674698795183, + "acc_norm_stderr": 0.03844453181770917 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.6608187134502924, + "acc_stderr": 0.03631053496488904, + "acc_norm": 0.6608187134502924, + "acc_norm_stderr": 0.03631053496488904 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.33047735618115054, + "mc1_stderr": 0.016466769613698296, + "mc2": 0.48961829787726385, + "mc2_stderr": 0.014519534456149341 + }, + "all": { + "acc": 0.4790041649614366, + "acc_stderr": 
0.03506604120562671, + "acc_norm": 0.482756053238805, + "acc_norm_stderr": 0.035046939264225044, + "mc1": 0.33047735618115054, + "mc1_stderr": 0.016466769613698296, + "mc2": 0.48961829787726385, + "mc2_stderr": 0.014519534456149341 + } + }, + "versions": { + "harness|arc:challenge|25": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "all": 0 + }, + "config": { + "model_name": "TheBloke/gpt4-alpaca-lora-13B-HF", + "model_sha": "49678a2dd15fb4e1f1b99616ccc1ffd269912833", + "model_dtype": "torch.float16", + "lighteval_sha": "43cff840721bd0214adb4e29236a5e2ca1813937", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": 
null + }, + "task_config": { + "harness|arc:challenge": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + 
"harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task" + }, + "hashes": { + "harness|arc:challenge|25": { + "hash_examples": "fb8c51b1872daeda", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "61571bf68d6d89aa", + "hash_cont_tokens": "8210decc6ff6f7df" + }, + "harness|hellaswag|10": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "29906669b1c7054a", + "hash_cont_tokens": "b3b9e9017afa63af" + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "c54ff61ad0273dd7", + "hash_cont_tokens": "50421e30bef398f9" + }, + "harness|hendrycksTest-anatomy|5": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "be31a1e22aef5f90", + "hash_cont_tokens": "f11971a765cb609f" + }, + "harness|hendrycksTest-astronomy|5": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "277a7b1fad566940", + "hash_cont_tokens": "bf30e5d3f48250cb" + }, + "harness|hendrycksTest-business_ethics|5": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "ba552605bc116de5", + "hash_cont_tokens": "bc1dd9b2d995eb61" + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "428c7563d0b98ab9", + "hash_cont_tokens": "890a119624b3b935" + }, + "harness|hendrycksTest-college_biology|5": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "da036601573942e2", + "hash_cont_tokens": "875cde3af7a0ee14" + }, + "harness|hendrycksTest-college_chemistry|5": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "94e0196d6aded13d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "harness|hendrycksTest-college_computer_science|5": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "6e4d0f4a8d36690b", + "hash_cont_tokens": "ffc0fe414cdc4a83" + }, + "harness|hendrycksTest-college_mathematics|5": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "614054d17109a25d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "harness|hendrycksTest-college_medicine|5": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "1d633b3cc0524ba8", + "hash_cont_tokens": "1f88b00d41957d82" + }, + "harness|hendrycksTest-college_physics|5": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "5421d9a1af86cbd4", + "hash_cont_tokens": "f7b8097afc16a47c" + }, + "harness|hendrycksTest-computer_security|5": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "5e6b70ecb333cf18", + "hash_cont_tokens": "50421e30bef398f9" + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "c2ef11a87264ceed", + "hash_cont_tokens": "aa0e8bc655f2f641" + }, + 
"harness|hendrycksTest-econometrics|5": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "ecaccd912a4c3978", + "hash_cont_tokens": "bfb7e3c3c88313f1" + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "1590c84291399be8", + "hash_cont_tokens": "2425a3f084a591ef" + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "3269597f715b0da1", + "hash_cont_tokens": "f52691aef15a407b" + }, + "harness|hendrycksTest-formal_logic|5": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "a2800d20f3ab8d7c", + "hash_cont_tokens": "f515d598d9c21263" + }, + "harness|hendrycksTest-global_facts|5": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "94ed44b3772505ad", + "hash_cont_tokens": "50421e30bef398f9" + }, + "harness|hendrycksTest-high_school_biology|5": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "24423acb928db768", + "hash_cont_tokens": "bd85a4156a3613ee" + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "831ff35c474e5cef", + "hash_cont_tokens": "a95c97af1c14e068" + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "8c34e0f2bda77358", + "hash_cont_tokens": "8abfedef914e33c9" + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "f1f73dd687da18d7", + "hash_cont_tokens": "674fc454bdc5ac93" + }, + "harness|hendrycksTest-high_school_geography|5": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "7c5547c7da5bc793", + "hash_cont_tokens": "03a5012b916274ea" + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "f62991cb6a496b05", + "hash_cont_tokens": "a83effb8f76b7d7c" + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "4cef2aff6e3d59ed", + "hash_cont_tokens": "c583432ad27fcfe0" + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "6e2577ea4082ed2b", + "hash_cont_tokens": "24f5dc613660300b" + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "c5fc9aeb1079c8e4", + "hash_cont_tokens": "f47f041de50333b9" + }, + "harness|hendrycksTest-high_school_physics|5": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "555fc385cffa84ca", + "hash_cont_tokens": "ba2efcd283e938cc" + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "febd23cbf9973b7f", + 
"hash_cont_tokens": "942069cd363844d9" + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "424b02981230ee83", + "hash_cont_tokens": "955ed42b6f7fa019" + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "50c9ff438c85a69e", + "hash_cont_tokens": "cdd0b3dc06d933e5" + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "054824cc474caef5", + "hash_cont_tokens": "9a864184946033ac" + }, + "harness|hendrycksTest-human_aging|5": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "541a75f071dcf579", + "hash_cont_tokens": "142a4a8a1138a214" + }, + "harness|hendrycksTest-human_sexuality|5": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "04269e5c5a257dd9", + "hash_cont_tokens": "bc54813e809b796d" + }, + "harness|hendrycksTest-international_law|5": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "d93ba9d9d38e4397", + "hash_cont_tokens": "dc45b45fcda18e5d" + }, + "harness|hendrycksTest-jurisprudence|5": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "9eeaccd2698b4f5a", + "hash_cont_tokens": "e3a8cd951b6e3469" + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "b4f08f544f2b7576", + "hash_cont_tokens": "1e80dbd30f6453d5" + }, + "harness|hendrycksTest-machine_learning|5": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "900c2a51f1174b9f", + "hash_cont_tokens": "9b37da7777378ca9" + }, + "harness|hendrycksTest-management|5": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "6b36efb4689c6eca", + "hash_cont_tokens": "a01d6d39a83c4597" + }, + "harness|hendrycksTest-marketing|5": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "2aaac78a0cfed47a", + "hash_cont_tokens": "6aeaed4d823c98aa" + }, + "harness|hendrycksTest-medical_genetics|5": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "886ca823b41c094a", + "hash_cont_tokens": "50421e30bef398f9" + }, + "harness|hendrycksTest-miscellaneous|5": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "72fd71de7675e7d0", + "hash_cont_tokens": "9b0ab02a64603081" + }, + "harness|hendrycksTest-moral_disputes|5": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "f3ca0dd8e7a1eb09", + "hash_cont_tokens": "8badf768f7b0467a" + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "3e793631e951f23c", + "hash_cont_tokens": "32ae620376b2bbba" + }, + "harness|hendrycksTest-nutrition|5": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "59753c2144ea93af", + "hash_cont_tokens": "3071def75bacc404" + }, + 
"harness|hendrycksTest-philosophy|5": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "bd8d3dbed15a8c34", + "hash_cont_tokens": "9f6ff69d23a48783" + }, + "harness|hendrycksTest-prehistory|5": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "3573cd87facbb7c5", + "hash_cont_tokens": "de469d2b981e32a3" + }, + "harness|hendrycksTest-professional_accounting|5": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "17e721bc1a7cbb47", + "hash_cont_tokens": "c46f74d2dfc7b13b" + }, + "harness|hendrycksTest-professional_law|5": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "9178e10bd0763ec4", + "hash_cont_tokens": "2e590029ef41fbcd" + }, + "harness|hendrycksTest-professional_medicine|5": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "f5a22012a54f70ea", + "hash_cont_tokens": "fe35cfa9c6ca802e" + }, + "harness|hendrycksTest-professional_psychology|5": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "0dfb73a8eb3f692c", + "hash_cont_tokens": "f020fbddf72c8652" + }, + "harness|hendrycksTest-public_relations|5": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "1710c6ba4c9f3cbd", + "hash_cont_tokens": "568f585a259965c1" + }, + "harness|hendrycksTest-security_studies|5": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "d49711415961ced7", + "hash_cont_tokens": "cc6fd7cccd64cd5d" + }, + "harness|hendrycksTest-sociology|5": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "828999f7624cbe7e", + "hash_cont_tokens": "c3a3bdfd177eed5b" + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "42054621e718dbee", + "hash_cont_tokens": "2568d0e8e36fa959" + }, + "harness|hendrycksTest-virology|5": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "6c4f0aa4dc859c04", + "hash_cont_tokens": "926cf60b0891f374" + }, + "harness|hendrycksTest-world_religions|5": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "6c75d44e092ff24f", + "hash_cont_tokens": "c525a5de974c1ea3" + }, + "harness|truthfulqa:mc|0": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "2738d7ed7075faa7", + "hash_cont_tokens": "c014154380b74b9e" + } + } +} \ No newline at end of file diff --git a/eval-results/TheBloke/gpt4-alpaca-lora-13B-HF/results_2023-10-23T00-28-03.157336.json b/eval-results/TheBloke/gpt4-alpaca-lora-13B-HF/results_2023-10-23T00-28-03.157336.json new file mode 100644 index 0000000000000000000000000000000000000000..db29006b58aef011c30c9451df2c495849760850 --- /dev/null +++ b/eval-results/TheBloke/gpt4-alpaca-lora-13B-HF/results_2023-10-23T00-28-03.157336.json @@ -0,0 +1,107 @@ +{ + "config_general": { + "model_name": "TheBloke/gpt4-alpaca-lora-13B-HF", + "model_sha": "49678a2dd15fb4e1f1b99616ccc1ffd269912833", + "model_size": "24.28 GB", + "model_dtype": "torch.float16", + "lighteval_sha": "0f318ecf002208468154899217b3ba7c6ae09374", + 
"num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|drop|3": { + "em": 0.0041946308724832215, + "em_stderr": 0.0006618716168266549, + "f1": 0.06315121644295306, + "f1_stderr": 0.0014384546797583987 + }, + "harness|gsm8k|5": { + "acc": 0.09097801364670205, + "acc_stderr": 0.007921322844013642 + }, + "harness|winogrande|5": { + "acc": 0.7671665351223362, + "acc_stderr": 0.011878201073856544 + }, + "all": { + "em": 0.0041946308724832215, + "em_stderr": 0.0006618716168266549, + "f1": 0.06315121644295306, + "f1_stderr": 0.0014384546797583987, + "acc": 0.4290722743845191, + "acc_stderr": 0.009899761958935093 + } + }, + "versions": { + "harness|drop|3": 1, + "harness|gsm8k|5": 0, + "harness|winogrande|5": 0, + "all": 0 + }, + "config_tasks": { + "harness|drop": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|drop|3": { + "hashes": { + "hash_examples": "1d27416e8324e9a3", + "hash_full_prompts": "a5513ff9a741b385", + "hash_input_tokens": "61b608e0b5ceed76", + "hash_cont_tokens": "e8f11def0a28765b" + }, + "truncated": 1263, + "non-truncated": 8273, + "padded": 0, + "non-padded": 9536, + "effective_few_shots": 3.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "bda342e47b5099b2", + "hash_cont_tokens": "27ca820efd001832" + }, + "truncated": 0, + "non-truncated": 1319, + "padded": 0, + "non-padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "c0bedf98cb040854", + "hash_cont_tokens": "f08975ad6f2d5864" + }, + "truncated": 0, + "non-truncated": 2534, + "padded": 2432, + "non-padded": 102, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "9b4d8993161e637d", + "hash_full_prompts": "08215e527b7e60a5", + "hash_input_tokens": "80afe720f936f8d2", + "hash_cont_tokens": "71face6e7701f1b3" + }, + "total_evaluation_time_secondes": "12670.341174602509", + "truncated": 1263, + "non-truncated": 12126, + "padded": 2432, + "non-padded": 10957, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/TheBloke/gpt4-alpaca-lora-30b-HF/results_2023-08-12T11-42-51.272208.json b/eval-results/TheBloke/gpt4-alpaca-lora-30b-HF/results_2023-08-12T11-42-51.272208.json new file mode 100644 index 0000000000000000000000000000000000000000..0054d05d4cfa1317b3ddc02c945c5991e2f5dad0 --- /dev/null +++ b/eval-results/TheBloke/gpt4-alpaca-lora-30b-HF/results_2023-08-12T11-42-51.272208.json @@ -0,0 +1,1365 @@ +{ + "results": { + "harness|arc:challenge|25": { + "acc": 0.6245733788395904, + "acc_stderr": 0.014150631435111728, + "acc_norm": 0.6484641638225256, + "acc_norm_stderr": 0.013952413699600931 + }, + "harness|hellaswag|10": { + "acc": 0.6632144991037642, + "acc_stderr": 0.004716449792353796, + "acc_norm": 0.8571997610037841, + "acc_norm_stderr": 0.0034915398589272883 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.35, + "acc_stderr": 0.04793724854411021, + "acc_norm": 0.35, + "acc_norm_stderr": 0.04793724854411021 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.4888888888888889, + "acc_stderr": 0.04318275491977976, + 
"acc_norm": 0.4888888888888889, + "acc_norm_stderr": 0.04318275491977976 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.6118421052631579, + "acc_stderr": 0.03965842097512744, + "acc_norm": 0.6118421052631579, + "acc_norm_stderr": 0.03965842097512744 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.59, + "acc_stderr": 0.04943110704237102, + "acc_norm": 0.59, + "acc_norm_stderr": 0.04943110704237102 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.5962264150943396, + "acc_stderr": 0.03019761160019795, + "acc_norm": 0.5962264150943396, + "acc_norm_stderr": 0.03019761160019795 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.6041666666666666, + "acc_stderr": 0.04089465449325582, + "acc_norm": 0.6041666666666666, + "acc_norm_stderr": 0.04089465449325582 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.4, + "acc_stderr": 0.049236596391733084, + "acc_norm": 0.4, + "acc_norm_stderr": 0.049236596391733084 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.49, + "acc_stderr": 0.05024183937956912, + "acc_norm": 0.49, + "acc_norm_stderr": 0.05024183937956912 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.4, + "acc_stderr": 0.049236596391733084, + "acc_norm": 0.4, + "acc_norm_stderr": 0.049236596391733084 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.5202312138728323, + "acc_stderr": 0.03809342081273956, + "acc_norm": 0.5202312138728323, + "acc_norm_stderr": 0.03809342081273956 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.27450980392156865, + "acc_stderr": 0.044405219061793275, + "acc_norm": 0.27450980392156865, + "acc_norm_stderr": 0.044405219061793275 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.67, + "acc_stderr": 0.04725815626252607, + "acc_norm": 0.67, + "acc_norm_stderr": 0.04725815626252607 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.5191489361702127, + "acc_stderr": 0.03266204299064678, + "acc_norm": 0.5191489361702127, + "acc_norm_stderr": 0.03266204299064678 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.3508771929824561, + "acc_stderr": 0.04489539350270699, + "acc_norm": 0.3508771929824561, + "acc_norm_stderr": 0.04489539350270699 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.503448275862069, + "acc_stderr": 0.04166567577101579, + "acc_norm": 0.503448275862069, + "acc_norm_stderr": 0.04166567577101579 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.3544973544973545, + "acc_stderr": 0.024636830602841997, + "acc_norm": 0.3544973544973545, + "acc_norm_stderr": 0.024636830602841997 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.3333333333333333, + "acc_stderr": 0.04216370213557836, + "acc_norm": 0.3333333333333333, + "acc_norm_stderr": 0.04216370213557836 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.39, + "acc_stderr": 0.04902071300001974, + "acc_norm": 0.39, + "acc_norm_stderr": 0.04902071300001974 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.6806451612903226, + "acc_stderr": 0.02652270967466777, + "acc_norm": 0.6806451612903226, + "acc_norm_stderr": 0.02652270967466777 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.43349753694581283, + "acc_stderr": 0.03486731727419872, + "acc_norm": 0.43349753694581283, + "acc_norm_stderr": 0.03486731727419872 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.57, + "acc_stderr": 0.049756985195624284, + "acc_norm": 
0.57, + "acc_norm_stderr": 0.049756985195624284 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.7272727272727273, + "acc_stderr": 0.03477691162163659, + "acc_norm": 0.7272727272727273, + "acc_norm_stderr": 0.03477691162163659 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.7171717171717171, + "acc_stderr": 0.03208779558786752, + "acc_norm": 0.7171717171717171, + "acc_norm_stderr": 0.03208779558786752 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.8341968911917098, + "acc_stderr": 0.026839845022314415, + "acc_norm": 0.8341968911917098, + "acc_norm_stderr": 0.026839845022314415 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.5538461538461539, + "acc_stderr": 0.02520357177302833, + "acc_norm": 0.5538461538461539, + "acc_norm_stderr": 0.02520357177302833 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.23703703703703705, + "acc_stderr": 0.02592887613276611, + "acc_norm": 0.23703703703703705, + "acc_norm_stderr": 0.02592887613276611 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.5840336134453782, + "acc_stderr": 0.03201650100739611, + "acc_norm": 0.5840336134453782, + "acc_norm_stderr": 0.03201650100739611 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.3509933774834437, + "acc_stderr": 0.03896981964257375, + "acc_norm": 0.3509933774834437, + "acc_norm_stderr": 0.03896981964257375 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.7559633027522936, + "acc_stderr": 0.01841528635141641, + "acc_norm": 0.7559633027522936, + "acc_norm_stderr": 0.01841528635141641 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.4537037037037037, + "acc_stderr": 0.03395322726375797, + "acc_norm": 0.4537037037037037, + "acc_norm_stderr": 0.03395322726375797 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.7892156862745098, + "acc_stderr": 0.028626547912437388, + "acc_norm": 0.7892156862745098, + "acc_norm_stderr": 0.028626547912437388 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.8016877637130801, + "acc_stderr": 0.02595502084162112, + "acc_norm": 0.8016877637130801, + "acc_norm_stderr": 0.02595502084162112 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.6591928251121076, + "acc_stderr": 0.031811497470553604, + "acc_norm": 0.6591928251121076, + "acc_norm_stderr": 0.031811497470553604 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.6412213740458015, + "acc_stderr": 0.04206739313864908, + "acc_norm": 0.6412213740458015, + "acc_norm_stderr": 0.04206739313864908 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.743801652892562, + "acc_stderr": 0.03984979653302872, + "acc_norm": 0.743801652892562, + "acc_norm_stderr": 0.03984979653302872 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.6851851851851852, + "acc_stderr": 0.04489931073591312, + "acc_norm": 0.6851851851851852, + "acc_norm_stderr": 0.04489931073591312 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.7116564417177914, + "acc_stderr": 0.03559039531617342, + "acc_norm": 0.7116564417177914, + "acc_norm_stderr": 0.03559039531617342 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.4642857142857143, + "acc_stderr": 0.04733667890053756, + "acc_norm": 0.4642857142857143, + "acc_norm_stderr": 0.04733667890053756 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.7572815533980582, + "acc_stderr": 0.04245022486384495, + 
"acc_norm": 0.7572815533980582, + "acc_norm_stderr": 0.04245022486384495 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.8717948717948718, + "acc_stderr": 0.021901905115073325, + "acc_norm": 0.8717948717948718, + "acc_norm_stderr": 0.021901905115073325 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.65, + "acc_stderr": 0.0479372485441102, + "acc_norm": 0.65, + "acc_norm_stderr": 0.0479372485441102 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.7790549169859514, + "acc_stderr": 0.014836205167333567, + "acc_norm": 0.7790549169859514, + "acc_norm_stderr": 0.014836205167333567 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.6647398843930635, + "acc_stderr": 0.025416003773165555, + "acc_norm": 0.6647398843930635, + "acc_norm_stderr": 0.025416003773165555 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.44692737430167595, + "acc_stderr": 0.016628030039647614, + "acc_norm": 0.44692737430167595, + "acc_norm_stderr": 0.016628030039647614 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.6241830065359477, + "acc_stderr": 0.027732834353363947, + "acc_norm": 0.6241830065359477, + "acc_norm_stderr": 0.027732834353363947 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.6784565916398714, + "acc_stderr": 0.026527724079528872, + "acc_norm": 0.6784565916398714, + "acc_norm_stderr": 0.026527724079528872 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.6820987654320988, + "acc_stderr": 0.02591006352824088, + "acc_norm": 0.6820987654320988, + "acc_norm_stderr": 0.02591006352824088 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.44680851063829785, + "acc_stderr": 0.029658235097666904, + "acc_norm": 0.44680851063829785, + "acc_norm_stderr": 0.029658235097666904 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.4556714471968709, + "acc_stderr": 0.012719949543032199, + "acc_norm": 0.4556714471968709, + "acc_norm_stderr": 0.012719949543032199 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.5220588235294118, + "acc_stderr": 0.03034326422421352, + "acc_norm": 0.5220588235294118, + "acc_norm_stderr": 0.03034326422421352 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.6339869281045751, + "acc_stderr": 0.01948802574552966, + "acc_norm": 0.6339869281045751, + "acc_norm_stderr": 0.01948802574552966 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.6818181818181818, + "acc_stderr": 0.04461272175910509, + "acc_norm": 0.6818181818181818, + "acc_norm_stderr": 0.04461272175910509 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.6816326530612244, + "acc_stderr": 0.029822533793982066, + "acc_norm": 0.6816326530612244, + "acc_norm_stderr": 0.029822533793982066 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.8009950248756219, + "acc_stderr": 0.028231365092758406, + "acc_norm": 0.8009950248756219, + "acc_norm_stderr": 0.028231365092758406 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.85, + "acc_stderr": 0.0358870281282637, + "acc_norm": 0.85, + "acc_norm_stderr": 0.0358870281282637 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.4879518072289157, + "acc_stderr": 0.03891364495835821, + "acc_norm": 0.4879518072289157, + "acc_norm_stderr": 0.03891364495835821 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.7953216374269005, + "acc_stderr": 0.03094445977853321, + "acc_norm": 0.7953216374269005, + "acc_norm_stderr": 0.03094445977853321 + }, + "harness|truthfulqa:mc|0": { + "mc1": 
0.35862913096695226, + "mc1_stderr": 0.016789289499502022, + "mc2": 0.5224270387662996, + "mc2_stderr": 0.014647944028826718 + }, + "all": { + "acc": 0.5871245408173702, + "acc_stderr": 0.034052915594697045, + "acc_norm": 0.5908173551713186, + "acc_norm_stderr": 0.03402879478674896, + "mc1": 0.35862913096695226, + "mc1_stderr": 0.016789289499502022, + "mc2": 0.5224270387662996, + "mc2_stderr": 0.014647944028826718 + } + }, + "versions": { + "harness|arc:challenge|25": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "all": 0 + }, + "config_general": { + "model_name": "TheBloke/gpt4-alpaca-lora-30b-HF", + "model_sha": "3c8007467a081dc72ae09b9d358416b056b38920", + 
"model_dtype": "torch.float16", + "lighteval_sha": "efe93333f9f25e7d48cc67a6bf362e6d576f727b", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null + }, + "config_tasks": { + "harness|arc:challenge": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + 
"harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task" + }, + "summary_tasks": { + "harness|arc:challenge|25": { + "hashes": { + "hash_examples": "17b0cae357c0259e", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "61571bf68d6d89aa", + "hash_cont_tokens": "8210decc6ff6f7df" + }, + "truncated": 0, + "non-truncated": 4687, + "padded": 4687, + "non-padded": 0, + "effective_few_shots": 25.0, + "num_truncated_few_shots": 0 + }, + "harness|hellaswag|10": { + "hashes": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "29906669b1c7054a", + "hash_cont_tokens": "b3b9e9017afa63af" + }, + "truncated": 0, + "non-truncated": 40168, + "padded": 40113, + "non-padded": 55, + "effective_few_shots": 10.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hashes": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "c54ff61ad0273dd7", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-anatomy|5": { + "hashes": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "be31a1e22aef5f90", + "hash_cont_tokens": "f11971a765cb609f" + }, + "truncated": 0, + "non-truncated": 540, + "padded": 540, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-astronomy|5": { + "hashes": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "277a7b1fad566940", + "hash_cont_tokens": "bf30e5d3f48250cb" + }, + "truncated": 0, + "non-truncated": 608, + "padded": 608, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-business_ethics|5": { + "hashes": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "ba552605bc116de5", + "hash_cont_tokens": "bc1dd9b2d995eb61" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hashes": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "428c7563d0b98ab9", + "hash_cont_tokens": "890a119624b3b935" + }, + "truncated": 0, + "non-truncated": 1060, + "padded": 1060, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_biology|5": { + "hashes": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "da036601573942e2", + "hash_cont_tokens": "875cde3af7a0ee14" + }, + "truncated": 0, + "non-truncated": 576, + "padded": 576, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_chemistry|5": { + "hashes": { + 
"hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "94e0196d6aded13d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_computer_science|5": { + "hashes": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "6e4d0f4a8d36690b", + "hash_cont_tokens": "ffc0fe414cdc4a83" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_mathematics|5": { + "hashes": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "614054d17109a25d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_medicine|5": { + "hashes": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "1d633b3cc0524ba8", + "hash_cont_tokens": "1f88b00d41957d82" + }, + "truncated": 0, + "non-truncated": 692, + "padded": 692, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_physics|5": { + "hashes": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "5421d9a1af86cbd4", + "hash_cont_tokens": "f7b8097afc16a47c" + }, + "truncated": 0, + "non-truncated": 408, + "padded": 408, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-computer_security|5": { + "hashes": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "5e6b70ecb333cf18", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hashes": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "c2ef11a87264ceed", + "hash_cont_tokens": "aa0e8bc655f2f641" + }, + "truncated": 0, + "non-truncated": 940, + "padded": 940, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-econometrics|5": { + "hashes": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "ecaccd912a4c3978", + "hash_cont_tokens": "bfb7e3c3c88313f1" + }, + "truncated": 0, + "non-truncated": 456, + "padded": 456, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hashes": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "1590c84291399be8", + "hash_cont_tokens": "2425a3f084a591ef" + }, + "truncated": 0, + "non-truncated": 580, + "padded": 580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hashes": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "3269597f715b0da1", + 
"hash_cont_tokens": "f52691aef15a407b" + }, + "truncated": 0, + "non-truncated": 1512, + "padded": 1512, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-formal_logic|5": { + "hashes": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "a2800d20f3ab8d7c", + "hash_cont_tokens": "f515d598d9c21263" + }, + "truncated": 0, + "non-truncated": 504, + "padded": 504, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-global_facts|5": { + "hashes": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "94ed44b3772505ad", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_biology|5": { + "hashes": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "24423acb928db768", + "hash_cont_tokens": "bd85a4156a3613ee" + }, + "truncated": 0, + "non-truncated": 1240, + "padded": 1240, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hashes": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "831ff35c474e5cef", + "hash_cont_tokens": "a95c97af1c14e068" + }, + "truncated": 0, + "non-truncated": 812, + "padded": 812, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hashes": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "8c34e0f2bda77358", + "hash_cont_tokens": "8abfedef914e33c9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hashes": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "f1f73dd687da18d7", + "hash_cont_tokens": "674fc454bdc5ac93" + }, + "truncated": 660, + "non-truncated": 0, + "padded": 0, + "non-padded": 660, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_geography|5": { + "hashes": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "7c5547c7da5bc793", + "hash_cont_tokens": "03a5012b916274ea" + }, + "truncated": 0, + "non-truncated": 792, + "padded": 792, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hashes": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "f62991cb6a496b05", + "hash_cont_tokens": "a83effb8f76b7d7c" + }, + "truncated": 0, + "non-truncated": 772, + "padded": 772, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hashes": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "4cef2aff6e3d59ed", + "hash_cont_tokens": "c583432ad27fcfe0" + }, + "truncated": 0, + "non-truncated": 1560, + "padded": 
1560, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hashes": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "6e2577ea4082ed2b", + "hash_cont_tokens": "24f5dc613660300b" + }, + "truncated": 0, + "non-truncated": 1080, + "padded": 1080, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hashes": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "c5fc9aeb1079c8e4", + "hash_cont_tokens": "f47f041de50333b9" + }, + "truncated": 0, + "non-truncated": 952, + "padded": 952, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_physics|5": { + "hashes": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "555fc385cffa84ca", + "hash_cont_tokens": "ba2efcd283e938cc" + }, + "truncated": 0, + "non-truncated": 604, + "padded": 604, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hashes": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "febd23cbf9973b7f", + "hash_cont_tokens": "942069cd363844d9" + }, + "truncated": 0, + "non-truncated": 2180, + "padded": 2180, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hashes": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "424b02981230ee83", + "hash_cont_tokens": "955ed42b6f7fa019" + }, + "truncated": 0, + "non-truncated": 864, + "padded": 864, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hashes": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "50c9ff438c85a69e", + "hash_cont_tokens": "cdd0b3dc06d933e5" + }, + "truncated": 816, + "non-truncated": 0, + "padded": 0, + "non-padded": 816, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hashes": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "054824cc474caef5", + "hash_cont_tokens": "9a864184946033ac" + }, + "truncated": 8, + "non-truncated": 940, + "padded": 940, + "non-padded": 8, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_aging|5": { + "hashes": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "541a75f071dcf579", + "hash_cont_tokens": "142a4a8a1138a214" + }, + "truncated": 0, + "non-truncated": 892, + "padded": 892, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_sexuality|5": { + "hashes": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "04269e5c5a257dd9", + "hash_cont_tokens": "bc54813e809b796d" + }, + "truncated": 0, + "non-truncated": 524, + "padded": 524, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + 
"harness|hendrycksTest-international_law|5": { + "hashes": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "d93ba9d9d38e4397", + "hash_cont_tokens": "dc45b45fcda18e5d" + }, + "truncated": 0, + "non-truncated": 484, + "padded": 484, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-jurisprudence|5": { + "hashes": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "9eeaccd2698b4f5a", + "hash_cont_tokens": "e3a8cd951b6e3469" + }, + "truncated": 0, + "non-truncated": 432, + "padded": 432, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hashes": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "b4f08f544f2b7576", + "hash_cont_tokens": "1e80dbd30f6453d5" + }, + "truncated": 0, + "non-truncated": 652, + "padded": 648, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-machine_learning|5": { + "hashes": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "900c2a51f1174b9f", + "hash_cont_tokens": "9b37da7777378ca9" + }, + "truncated": 0, + "non-truncated": 448, + "padded": 448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-management|5": { + "hashes": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "6b36efb4689c6eca", + "hash_cont_tokens": "a01d6d39a83c4597" + }, + "truncated": 0, + "non-truncated": 412, + "padded": 412, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-marketing|5": { + "hashes": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "2aaac78a0cfed47a", + "hash_cont_tokens": "6aeaed4d823c98aa" + }, + "truncated": 0, + "non-truncated": 936, + "padded": 936, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-medical_genetics|5": { + "hashes": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "886ca823b41c094a", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-miscellaneous|5": { + "hashes": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "72fd71de7675e7d0", + "hash_cont_tokens": "9b0ab02a64603081" + }, + "truncated": 0, + "non-truncated": 3132, + "padded": 3132, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_disputes|5": { + "hashes": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "f3ca0dd8e7a1eb09", + "hash_cont_tokens": "8badf768f7b0467a" + }, + "truncated": 0, + "non-truncated": 1384, + "padded": 1354, + "non-padded": 30, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hashes": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": 
"3e793631e951f23c", + "hash_cont_tokens": "32ae620376b2bbba" + }, + "truncated": 0, + "non-truncated": 3580, + "padded": 3580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-nutrition|5": { + "hashes": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "59753c2144ea93af", + "hash_cont_tokens": "3071def75bacc404" + }, + "truncated": 0, + "non-truncated": 1224, + "padded": 1224, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-philosophy|5": { + "hashes": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "bd8d3dbed15a8c34", + "hash_cont_tokens": "9f6ff69d23a48783" + }, + "truncated": 0, + "non-truncated": 1244, + "padded": 1244, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-prehistory|5": { + "hashes": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "3573cd87facbb7c5", + "hash_cont_tokens": "de469d2b981e32a3" + }, + "truncated": 0, + "non-truncated": 1296, + "padded": 1296, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_accounting|5": { + "hashes": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "17e721bc1a7cbb47", + "hash_cont_tokens": "c46f74d2dfc7b13b" + }, + "truncated": 0, + "non-truncated": 1128, + "padded": 1128, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_law|5": { + "hashes": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "9178e10bd0763ec4", + "hash_cont_tokens": "2e590029ef41fbcd" + }, + "truncated": 604, + "non-truncated": 5532, + "padded": 5524, + "non-padded": 612, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_medicine|5": { + "hashes": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "f5a22012a54f70ea", + "hash_cont_tokens": "fe35cfa9c6ca802e" + }, + "truncated": 0, + "non-truncated": 1088, + "padded": 1088, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_psychology|5": { + "hashes": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "0dfb73a8eb3f692c", + "hash_cont_tokens": "f020fbddf72c8652" + }, + "truncated": 0, + "non-truncated": 2448, + "padded": 2448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-public_relations|5": { + "hashes": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "1710c6ba4c9f3cbd", + "hash_cont_tokens": "568f585a259965c1" + }, + "truncated": 0, + "non-truncated": 440, + "padded": 440, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-security_studies|5": { + "hashes": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "d49711415961ced7", + "hash_cont_tokens": "cc6fd7cccd64cd5d" + }, + "truncated": 0, + "non-truncated": 980, + "padded": 980, + "non-padded": 
0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-sociology|5": { + "hashes": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "828999f7624cbe7e", + "hash_cont_tokens": "c3a3bdfd177eed5b" + }, + "truncated": 0, + "non-truncated": 804, + "padded": 804, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hashes": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "42054621e718dbee", + "hash_cont_tokens": "2568d0e8e36fa959" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-virology|5": { + "hashes": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "6c4f0aa4dc859c04", + "hash_cont_tokens": "926cf60b0891f374" + }, + "truncated": 0, + "non-truncated": 664, + "padded": 664, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-world_religions|5": { + "hashes": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "6c75d44e092ff24f", + "hash_cont_tokens": "c525a5de974c1ea3" + }, + "truncated": 0, + "non-truncated": 684, + "padded": 684, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|truthfulqa:mc|0": { + "hashes": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "2738d7ed7075faa7", + "hash_cont_tokens": "c014154380b74b9e" + }, + "truncated": 0, + "non-truncated": 9996, + "padded": 9996, + "non-padded": 0, + "effective_few_shots": 0.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "d84d18e9a963753d", + "hash_full_prompts": "12b540783521a8e6", + "hash_input_tokens": "6fecf578c508db6a", + "hash_cont_tokens": "fb1646e2bdd5fc38" + }, + "total_evaluation_time_secondes": "9113.090842962265", + "truncated": 2088, + "non-truncated": 108931, + "padded": 108834, + "non-padded": 2185, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/TheBloke/gpt4-alpaca-lora-30b-HF/results_2023-09-18T00-20-21.073173.json b/eval-results/TheBloke/gpt4-alpaca-lora-30b-HF/results_2023-09-18T00-20-21.073173.json new file mode 100644 index 0000000000000000000000000000000000000000..f00a91d38bad64be3380b2207336f47ea60f8fad --- /dev/null +++ b/eval-results/TheBloke/gpt4-alpaca-lora-30b-HF/results_2023-09-18T00-20-21.073173.json @@ -0,0 +1,107 @@ +{ + "config_general": { + "model_name": "TheBloke/gpt4-alpaca-lora-30b-HF", + "model_sha": "3c8007467a081dc72ae09b9d358416b056b38920", + "model_size": "60.65 GB", + "model_dtype": "torch.float16", + "lighteval_sha": "0f318ecf002208468154899217b3ba7c6ae09374", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|drop|3": { + "em": 0.0016778523489932886, + "em_stderr": 0.00041913301788269584, + "f1": 0.06442533557047006, + "f1_stderr": 0.0013970563636897643 + }, + "harness|gsm8k|5": { + "acc": 0.155420773313116, + "acc_stderr": 0.009979689409499152 + }, + "harness|winogrande|5": { + "acc": 0.8018942383583267, + "acc_stderr": 0.01120186274448705 + }, + "all": { + "em": 
0.0016778523489932886, + "em_stderr": 0.00041913301788269584, + "f1": 0.06442533557047006, + "f1_stderr": 0.0013970563636897643, + "acc": 0.47865750583572136, + "acc_stderr": 0.0105907760769931 + } + }, + "versions": { + "harness|drop|3": 1, + "harness|gsm8k|5": 0, + "harness|winogrande|5": 0, + "all": 0 + }, + "config_tasks": { + "harness|drop": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|drop|3": { + "hashes": { + "hash_examples": "1d27416e8324e9a3", + "hash_full_prompts": "a5513ff9a741b385", + "hash_input_tokens": "61b608e0b5ceed76", + "hash_cont_tokens": "3a9bdcd529f30aa3" + }, + "truncated": 1263, + "non-truncated": 8273, + "padded": 0, + "non-padded": 9536, + "effective_few_shots": 3.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "bda342e47b5099b2", + "hash_cont_tokens": "cfd87e4fbe462b5a" + }, + "truncated": 0, + "non-truncated": 1319, + "padded": 0, + "non-padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "c0bedf98cb040854", + "hash_cont_tokens": "f08975ad6f2d5864" + }, + "truncated": 0, + "non-truncated": 2534, + "padded": 2432, + "non-padded": 102, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "9b4d8993161e637d", + "hash_full_prompts": "08215e527b7e60a5", + "hash_input_tokens": "80afe720f936f8d2", + "hash_cont_tokens": "a97d8d08091e5262" + }, + "total_evaluation_time_secondes": "20770.471425056458", + "truncated": 1263, + "non-truncated": 12126, + "padded": 2432, + "non-padded": 10957, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/TheBloke/gpt4-alpaca-lora_mlp-65B-HF/results_2023-07-25T19-53-38.948593.json b/eval-results/TheBloke/gpt4-alpaca-lora_mlp-65B-HF/results_2023-07-25T19-53-38.948593.json new file mode 100644 index 0000000000000000000000000000000000000000..bbe99c6d270c56dee2507119f0d9afe56a8e9b35 --- /dev/null +++ b/eval-results/TheBloke/gpt4-alpaca-lora_mlp-65B-HF/results_2023-07-25T19-53-38.948593.json @@ -0,0 +1,1365 @@ +{ + "results": { + "harness|arc:challenge|25": { + "acc": 0.6220136518771331, + "acc_stderr": 0.014169664520303098, + "acc_norm": 0.6501706484641638, + "acc_norm_stderr": 0.01393680921215829 + }, + "harness|hellaswag|10": { + "acc": 0.6664011153156741, + "acc_stderr": 0.004705347137699622, + "acc_norm": 0.8612826130252937, + "acc_norm_stderr": 0.003449449618650559 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.33, + "acc_stderr": 0.04725815626252605, + "acc_norm": 0.33, + "acc_norm_stderr": 0.04725815626252605 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.5481481481481482, + "acc_stderr": 0.04299268905480864, + "acc_norm": 0.5481481481481482, + "acc_norm_stderr": 0.04299268905480864 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.7105263157894737, + "acc_stderr": 0.036906779861372814, + "acc_norm": 0.7105263157894737, + "acc_norm_stderr": 0.036906779861372814 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.63, + "acc_stderr": 0.04852365870939099, + "acc_norm": 0.63, + "acc_norm_stderr": 0.04852365870939099 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 
0.6641509433962264, + "acc_stderr": 0.029067220146644826, + "acc_norm": 0.6641509433962264, + "acc_norm_stderr": 0.029067220146644826 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.6875, + "acc_stderr": 0.038760854559127644, + "acc_norm": 0.6875, + "acc_norm_stderr": 0.038760854559127644 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.46, + "acc_stderr": 0.05009082659620332, + "acc_norm": 0.46, + "acc_norm_stderr": 0.05009082659620332 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.47, + "acc_stderr": 0.050161355804659205, + "acc_norm": 0.47, + "acc_norm_stderr": 0.050161355804659205 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.38, + "acc_stderr": 0.048783173121456316, + "acc_norm": 0.38, + "acc_norm_stderr": 0.048783173121456316 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.5375722543352601, + "acc_stderr": 0.0380168510452446, + "acc_norm": 0.5375722543352601, + "acc_norm_stderr": 0.0380168510452446 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.38235294117647056, + "acc_stderr": 0.04835503696107223, + "acc_norm": 0.38235294117647056, + "acc_norm_stderr": 0.04835503696107223 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.77, + "acc_stderr": 0.04229525846816507, + "acc_norm": 0.77, + "acc_norm_stderr": 0.04229525846816507 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.5914893617021276, + "acc_stderr": 0.032134180267015755, + "acc_norm": 0.5914893617021276, + "acc_norm_stderr": 0.032134180267015755 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.39473684210526316, + "acc_stderr": 0.045981880578165414, + "acc_norm": 0.39473684210526316, + "acc_norm_stderr": 0.045981880578165414 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.5241379310344828, + "acc_stderr": 0.041618085035015295, + "acc_norm": 0.5241379310344828, + "acc_norm_stderr": 0.041618085035015295 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.37566137566137564, + "acc_stderr": 0.024942368931159788, + "acc_norm": 0.37566137566137564, + "acc_norm_stderr": 0.024942368931159788 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.47619047619047616, + "acc_stderr": 0.04467062628403273, + "acc_norm": 0.47619047619047616, + "acc_norm_stderr": 0.04467062628403273 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.4, + "acc_stderr": 0.049236596391733084, + "acc_norm": 0.4, + "acc_norm_stderr": 0.049236596391733084 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.7354838709677419, + "acc_stderr": 0.02509189237885928, + "acc_norm": 0.7354838709677419, + "acc_norm_stderr": 0.02509189237885928 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.3891625615763547, + "acc_stderr": 0.034304624161038716, + "acc_norm": 0.3891625615763547, + "acc_norm_stderr": 0.034304624161038716 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.66, + "acc_stderr": 0.04760952285695237, + "acc_norm": 0.66, + "acc_norm_stderr": 0.04760952285695237 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.7818181818181819, + "acc_stderr": 0.03225078108306289, + "acc_norm": 0.7818181818181819, + "acc_norm_stderr": 0.03225078108306289 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.7878787878787878, + "acc_stderr": 0.029126522834586808, + "acc_norm": 0.7878787878787878, + "acc_norm_stderr": 0.029126522834586808 + }, + 
"harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.8704663212435233, + "acc_stderr": 0.024233532297758723, + "acc_norm": 0.8704663212435233, + "acc_norm_stderr": 0.024233532297758723 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.6256410256410256, + "acc_stderr": 0.024537591572830506, + "acc_norm": 0.6256410256410256, + "acc_norm_stderr": 0.024537591572830506 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.31851851851851853, + "acc_stderr": 0.02840653309060846, + "acc_norm": 0.31851851851851853, + "acc_norm_stderr": 0.02840653309060846 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.6764705882352942, + "acc_stderr": 0.0303883535518868, + "acc_norm": 0.6764705882352942, + "acc_norm_stderr": 0.0303883535518868 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.40397350993377484, + "acc_stderr": 0.040064856853653415, + "acc_norm": 0.40397350993377484, + "acc_norm_stderr": 0.040064856853653415 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.8238532110091743, + "acc_stderr": 0.01633288239343137, + "acc_norm": 0.8238532110091743, + "acc_norm_stderr": 0.01633288239343137 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.5138888888888888, + "acc_stderr": 0.034086558679777494, + "acc_norm": 0.5138888888888888, + "acc_norm_stderr": 0.034086558679777494 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.8235294117647058, + "acc_stderr": 0.026756401538078966, + "acc_norm": 0.8235294117647058, + "acc_norm_stderr": 0.026756401538078966 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.8312236286919831, + "acc_stderr": 0.024381406832586237, + "acc_norm": 0.8312236286919831, + "acc_norm_stderr": 0.024381406832586237 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.6591928251121076, + "acc_stderr": 0.0318114974705536, + "acc_norm": 0.6591928251121076, + "acc_norm_stderr": 0.0318114974705536 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.6870229007633588, + "acc_stderr": 0.04066962905677698, + "acc_norm": 0.6870229007633588, + "acc_norm_stderr": 0.04066962905677698 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.8016528925619835, + "acc_stderr": 0.03640118271990946, + "acc_norm": 0.8016528925619835, + "acc_norm_stderr": 0.03640118271990946 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.7592592592592593, + "acc_stderr": 0.04133119440243838, + "acc_norm": 0.7592592592592593, + "acc_norm_stderr": 0.04133119440243838 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.7300613496932515, + "acc_stderr": 0.034878251684978906, + "acc_norm": 0.7300613496932515, + "acc_norm_stderr": 0.034878251684978906 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.45535714285714285, + "acc_stderr": 0.047268355537191, + "acc_norm": 0.45535714285714285, + "acc_norm_stderr": 0.047268355537191 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.8058252427184466, + "acc_stderr": 0.03916667762822584, + "acc_norm": 0.8058252427184466, + "acc_norm_stderr": 0.03916667762822584 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.8717948717948718, + "acc_stderr": 0.021901905115073325, + "acc_norm": 0.8717948717948718, + "acc_norm_stderr": 0.021901905115073325 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.68, + "acc_stderr": 0.046882617226215034, + "acc_norm": 0.68, + "acc_norm_stderr": 0.046882617226215034 + }, + 
"harness|hendrycksTest-miscellaneous|5": { + "acc": 0.7867177522349936, + "acc_stderr": 0.014648172749593525, + "acc_norm": 0.7867177522349936, + "acc_norm_stderr": 0.014648172749593525 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.7312138728323699, + "acc_stderr": 0.023868003262500107, + "acc_norm": 0.7312138728323699, + "acc_norm_stderr": 0.023868003262500107 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.4480446927374302, + "acc_stderr": 0.016631976628930595, + "acc_norm": 0.4480446927374302, + "acc_norm_stderr": 0.016631976628930595 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.6633986928104575, + "acc_stderr": 0.027057974624494382, + "acc_norm": 0.6633986928104575, + "acc_norm_stderr": 0.027057974624494382 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.7331189710610932, + "acc_stderr": 0.025122637608816657, + "acc_norm": 0.7331189710610932, + "acc_norm_stderr": 0.025122637608816657 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.7191358024691358, + "acc_stderr": 0.025006469755799215, + "acc_norm": 0.7191358024691358, + "acc_norm_stderr": 0.025006469755799215 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.4716312056737589, + "acc_stderr": 0.029779450957303062, + "acc_norm": 0.4716312056737589, + "acc_norm_stderr": 0.029779450957303062 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.47196870925684486, + "acc_stderr": 0.01275015180292244, + "acc_norm": 0.47196870925684486, + "acc_norm_stderr": 0.01275015180292244 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.6176470588235294, + "acc_stderr": 0.029520095697687758, + "acc_norm": 0.6176470588235294, + "acc_norm_stderr": 0.029520095697687758 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.6519607843137255, + "acc_stderr": 0.019270998708223974, + "acc_norm": 0.6519607843137255, + "acc_norm_stderr": 0.019270998708223974 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.6636363636363637, + "acc_stderr": 0.04525393596302506, + "acc_norm": 0.6636363636363637, + "acc_norm_stderr": 0.04525393596302506 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.7061224489795919, + "acc_stderr": 0.02916273841024977, + "acc_norm": 0.7061224489795919, + "acc_norm_stderr": 0.02916273841024977 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.845771144278607, + "acc_stderr": 0.025538433368578327, + "acc_norm": 0.845771144278607, + "acc_norm_stderr": 0.025538433368578327 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.84, + "acc_stderr": 0.03684529491774709, + "acc_norm": 0.84, + "acc_norm_stderr": 0.03684529491774709 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.5602409638554217, + "acc_stderr": 0.03864139923699121, + "acc_norm": 0.5602409638554217, + "acc_norm_stderr": 0.03864139923699121 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.8187134502923976, + "acc_stderr": 0.029547741687640038, + "acc_norm": 0.8187134502923976, + "acc_norm_stderr": 0.029547741687640038 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.43329253365973075, + "mc1_stderr": 0.017347024450107492, + "mc2": 0.591576692393521, + "mc2_stderr": 0.01516150681773446 + }, + "all": { + "acc": 0.6278352247612916, + "acc_stderr": 0.03330845518784364, + "acc_norm": 0.6316155382239468, + "acc_norm_stderr": 0.03328322208907764, + "mc1": 0.43329253365973075, + "mc1_stderr": 0.017347024450107492, + "mc2": 0.591576692393521, + "mc2_stderr": 0.01516150681773446 + } + }, + "versions": { + 
"harness|arc:challenge|25": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "all": 0 + }, + "config_general": { + "model_name": "TheBloke/gpt4-alpaca-lora_mlp-65B-HF", + "model_sha": "664ff8e3e1d446971a16a6c9018ab24de7664684", + "model_dtype": "torch.float16", + "lighteval_sha": "03c2fad20ff7f5334c33cfee459024b8d7e4a109", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null + }, + "config_tasks": { + "harness|arc:challenge": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + 
"harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + 
"harness|truthfulqa:mc": "LM Harness task" + }, + "summary_tasks": { + "harness|arc:challenge|25": { + "hashes": { + "hash_examples": "17b0cae357c0259e", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "61571bf68d6d89aa", + "hash_cont_tokens": "ede2b335438f08e9" + }, + "truncated": 0, + "non-truncated": 4687, + "padded": 4687, + "non-padded": 0, + "effective_few_shots": 25.0, + "num_truncated_few_shots": 0 + }, + "harness|hellaswag|10": { + "hashes": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "29906669b1c7054a", + "hash_cont_tokens": "b41cf1ad182d68d5" + }, + "truncated": 0, + "non-truncated": 40168, + "padded": 40113, + "non-padded": 55, + "effective_few_shots": 10.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hashes": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "c54ff61ad0273dd7", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-anatomy|5": { + "hashes": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "be31a1e22aef5f90", + "hash_cont_tokens": "f11971a765cb609f" + }, + "truncated": 0, + "non-truncated": 540, + "padded": 540, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-astronomy|5": { + "hashes": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "277a7b1fad566940", + "hash_cont_tokens": "238bd86950544b29" + }, + "truncated": 0, + "non-truncated": 608, + "padded": 608, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-business_ethics|5": { + "hashes": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "ba552605bc116de5", + "hash_cont_tokens": "f9d6d2a7d7e9a041" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hashes": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "428c7563d0b98ab9", + "hash_cont_tokens": "6af58623d0d5fbcd" + }, + "truncated": 0, + "non-truncated": 1060, + "padded": 1060, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_biology|5": { + "hashes": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "da036601573942e2", + "hash_cont_tokens": "875cde3af7a0ee14" + }, + "truncated": 0, + "non-truncated": 576, + "padded": 576, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_chemistry|5": { + "hashes": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "94e0196d6aded13d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_computer_science|5": { + "hashes": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": 
"ed2bdb4e87c4b371", + "hash_input_tokens": "6e4d0f4a8d36690b", + "hash_cont_tokens": "1ba0c71186b1505e" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_mathematics|5": { + "hashes": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "614054d17109a25d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_medicine|5": { + "hashes": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "1d633b3cc0524ba8", + "hash_cont_tokens": "702fb6d82ff0d6ac" + }, + "truncated": 0, + "non-truncated": 692, + "padded": 692, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_physics|5": { + "hashes": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "5421d9a1af86cbd4", + "hash_cont_tokens": "f7b8097afc16a47c" + }, + "truncated": 0, + "non-truncated": 408, + "padded": 408, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-computer_security|5": { + "hashes": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "5e6b70ecb333cf18", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hashes": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "c2ef11a87264ceed", + "hash_cont_tokens": "aa0e8bc655f2f641" + }, + "truncated": 0, + "non-truncated": 940, + "padded": 940, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-econometrics|5": { + "hashes": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "ecaccd912a4c3978", + "hash_cont_tokens": "a9b1f761089f6acc" + }, + "truncated": 0, + "non-truncated": 456, + "padded": 456, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hashes": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "1590c84291399be8", + "hash_cont_tokens": "2425a3f084a591ef" + }, + "truncated": 0, + "non-truncated": 580, + "padded": 580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hashes": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "3269597f715b0da1", + "hash_cont_tokens": "eb2d5002052b5bc5" + }, + "truncated": 0, + "non-truncated": 1512, + "padded": 1512, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-formal_logic|5": { + "hashes": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "a2800d20f3ab8d7c", + "hash_cont_tokens": "9b30dc19c9b62f60" + }, + "truncated": 0, + "non-truncated": 504, 
+ "padded": 504, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-global_facts|5": { + "hashes": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "94ed44b3772505ad", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_biology|5": { + "hashes": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "24423acb928db768", + "hash_cont_tokens": "74217a4e2868536f" + }, + "truncated": 0, + "non-truncated": 1240, + "padded": 1240, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hashes": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "831ff35c474e5cef", + "hash_cont_tokens": "bf39544be0ebf000" + }, + "truncated": 0, + "non-truncated": 812, + "padded": 812, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hashes": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "8c34e0f2bda77358", + "hash_cont_tokens": "43570b3948564b64" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hashes": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "f1f73dd687da18d7", + "hash_cont_tokens": "674fc454bdc5ac93" + }, + "truncated": 660, + "non-truncated": 0, + "padded": 0, + "non-padded": 660, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_geography|5": { + "hashes": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "7c5547c7da5bc793", + "hash_cont_tokens": "03a5012b916274ea" + }, + "truncated": 0, + "non-truncated": 792, + "padded": 792, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hashes": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "f62991cb6a496b05", + "hash_cont_tokens": "50ab225c2f535210" + }, + "truncated": 0, + "non-truncated": 772, + "padded": 772, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hashes": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "4cef2aff6e3d59ed", + "hash_cont_tokens": "c583432ad27fcfe0" + }, + "truncated": 0, + "non-truncated": 1560, + "padded": 1560, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hashes": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "6e2577ea4082ed2b", + "hash_cont_tokens": "1194078d4e38c984" + }, + "truncated": 0, + "non-truncated": 1080, + "padded": 1080, + "non-padded": 0, + "effective_few_shots": 5.0, + 
"num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hashes": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "c5fc9aeb1079c8e4", + "hash_cont_tokens": "f47f041de50333b9" + }, + "truncated": 0, + "non-truncated": 952, + "padded": 952, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_physics|5": { + "hashes": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "555fc385cffa84ca", + "hash_cont_tokens": "6296151cf7fee15c" + }, + "truncated": 0, + "non-truncated": 604, + "padded": 604, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hashes": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "febd23cbf9973b7f", + "hash_cont_tokens": "a490d3db0ea5935a" + }, + "truncated": 0, + "non-truncated": 2180, + "padded": 2180, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hashes": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "424b02981230ee83", + "hash_cont_tokens": "6830ef7d0325d7ef" + }, + "truncated": 0, + "non-truncated": 864, + "padded": 864, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hashes": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "50c9ff438c85a69e", + "hash_cont_tokens": "cdd0b3dc06d933e5" + }, + "truncated": 816, + "non-truncated": 0, + "padded": 0, + "non-padded": 816, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hashes": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "054824cc474caef5", + "hash_cont_tokens": "e0203e3fc1bb0500" + }, + "truncated": 8, + "non-truncated": 940, + "padded": 940, + "non-padded": 8, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_aging|5": { + "hashes": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "541a75f071dcf579", + "hash_cont_tokens": "142a4a8a1138a214" + }, + "truncated": 0, + "non-truncated": 892, + "padded": 892, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_sexuality|5": { + "hashes": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "04269e5c5a257dd9", + "hash_cont_tokens": "bc54813e809b796d" + }, + "truncated": 0, + "non-truncated": 524, + "padded": 524, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-international_law|5": { + "hashes": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "d93ba9d9d38e4397", + "hash_cont_tokens": "63435df622d5437b" + }, + "truncated": 0, + "non-truncated": 484, + "padded": 484, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-jurisprudence|5": { + "hashes": { + "hash_examples": 
"083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "9eeaccd2698b4f5a", + "hash_cont_tokens": "e3a8cd951b6e3469" + }, + "truncated": 0, + "non-truncated": 432, + "padded": 432, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hashes": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "b4f08f544f2b7576", + "hash_cont_tokens": "5e6ee2ff0404f23c" + }, + "truncated": 0, + "non-truncated": 652, + "padded": 648, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-machine_learning|5": { + "hashes": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "900c2a51f1174b9f", + "hash_cont_tokens": "c81919424db3b267" + }, + "truncated": 0, + "non-truncated": 448, + "padded": 448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-management|5": { + "hashes": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "6b36efb4689c6eca", + "hash_cont_tokens": "a01d6d39a83c4597" + }, + "truncated": 0, + "non-truncated": 412, + "padded": 412, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-marketing|5": { + "hashes": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "2aaac78a0cfed47a", + "hash_cont_tokens": "6aeaed4d823c98aa" + }, + "truncated": 0, + "non-truncated": 936, + "padded": 936, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-medical_genetics|5": { + "hashes": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "886ca823b41c094a", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-miscellaneous|5": { + "hashes": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "72fd71de7675e7d0", + "hash_cont_tokens": "9b0ab02a64603081" + }, + "truncated": 0, + "non-truncated": 3132, + "padded": 3132, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_disputes|5": { + "hashes": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "f3ca0dd8e7a1eb09", + "hash_cont_tokens": "3b8bbe9108e55ce9" + }, + "truncated": 0, + "non-truncated": 1384, + "padded": 1354, + "non-padded": 30, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hashes": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "3e793631e951f23c", + "hash_cont_tokens": "2eae753a177d5460" + }, + "truncated": 0, + "non-truncated": 3580, + "padded": 3580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-nutrition|5": { + "hashes": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "59753c2144ea93af", + "hash_cont_tokens": "29771089bd3c65c6" + }, + "truncated": 0, + 
"non-truncated": 1224, + "padded": 1224, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-philosophy|5": { + "hashes": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "bd8d3dbed15a8c34", + "hash_cont_tokens": "9f6ff69d23a48783" + }, + "truncated": 0, + "non-truncated": 1244, + "padded": 1244, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-prehistory|5": { + "hashes": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "3573cd87facbb7c5", + "hash_cont_tokens": "a789a13af22308bf" + }, + "truncated": 0, + "non-truncated": 1296, + "padded": 1296, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_accounting|5": { + "hashes": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "17e721bc1a7cbb47", + "hash_cont_tokens": "5129a9cfb30c5239" + }, + "truncated": 0, + "non-truncated": 1128, + "padded": 1128, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_law|5": { + "hashes": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "9178e10bd0763ec4", + "hash_cont_tokens": "2e590029ef41fbcd" + }, + "truncated": 604, + "non-truncated": 5532, + "padded": 5524, + "non-padded": 612, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_medicine|5": { + "hashes": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "f5a22012a54f70ea", + "hash_cont_tokens": "cd82e108370cece8" + }, + "truncated": 0, + "non-truncated": 1088, + "padded": 1088, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_psychology|5": { + "hashes": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "0dfb73a8eb3f692c", + "hash_cont_tokens": "61ef0c8a87f9c92d" + }, + "truncated": 0, + "non-truncated": 2448, + "padded": 2448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-public_relations|5": { + "hashes": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "1710c6ba4c9f3cbd", + "hash_cont_tokens": "568f585a259965c1" + }, + "truncated": 0, + "non-truncated": 440, + "padded": 440, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-security_studies|5": { + "hashes": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "d49711415961ced7", + "hash_cont_tokens": "d70cfe096d4fb7bd" + }, + "truncated": 0, + "non-truncated": 980, + "padded": 980, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-sociology|5": { + "hashes": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "828999f7624cbe7e", + "hash_cont_tokens": "c3a3bdfd177eed5b" + }, + "truncated": 0, + "non-truncated": 804, + "padded": 804, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + 
"harness|hendrycksTest-us_foreign_policy|5": { + "hashes": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "42054621e718dbee", + "hash_cont_tokens": "2568d0e8e36fa959" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-virology|5": { + "hashes": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "6c4f0aa4dc859c04", + "hash_cont_tokens": "c178cccd753d9bc5" + }, + "truncated": 0, + "non-truncated": 664, + "padded": 664, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-world_religions|5": { + "hashes": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "6c75d44e092ff24f", + "hash_cont_tokens": "0a3a3ea5ef49d19c" + }, + "truncated": 0, + "non-truncated": 684, + "padded": 684, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|truthfulqa:mc|0": { + "hashes": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "2738d7ed7075faa7", + "hash_cont_tokens": "6d1691881e252df0" + }, + "truncated": 0, + "non-truncated": 9996, + "padded": 9996, + "non-padded": 0, + "effective_few_shots": 0.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "d84d18e9a963753d", + "hash_full_prompts": "12b540783521a8e6", + "hash_input_tokens": "6fecf578c508db6a", + "hash_cont_tokens": "f4b7b7f3a2788768" + }, + "total_evaluation_time_secondes": "25765.75688767433", + "truncated": 2088, + "non-truncated": 108931, + "padded": 108834, + "non-padded": 2185, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/TheBloke/gpt4-alpaca-lora_mlp-65B-HF/results_2023-10-23T07-45-08.272902.json b/eval-results/TheBloke/gpt4-alpaca-lora_mlp-65B-HF/results_2023-10-23T07-45-08.272902.json new file mode 100644 index 0000000000000000000000000000000000000000..1f11206cdb625abbfe0eeed39801c80ff17a62c9 --- /dev/null +++ b/eval-results/TheBloke/gpt4-alpaca-lora_mlp-65B-HF/results_2023-10-23T07-45-08.272902.json @@ -0,0 +1,107 @@ +{ + "config_general": { + "model_name": "TheBloke/gpt4-alpaca-lora_mlp-65B-HF", + "model_sha": "664ff8e3e1d446971a16a6c9018ab24de7664684", + "model_size": "121.68 GB", + "model_dtype": "torch.float16", + "lighteval_sha": "0f318ecf002208468154899217b3ba7c6ae09374", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|drop|3": { + "em": 0.015625, + "em_stderr": 0.0012700767094662763, + "f1": 0.09636115771812082, + "f1_stderr": 0.0019819425315034905 + }, + "harness|gsm8k|5": { + "acc": 0.28278999241849884, + "acc_stderr": 0.01240502041787362 + }, + "harness|winogrande|5": { + "acc": 0.8066298342541437, + "acc_stderr": 0.011099796645920533 + }, + "all": { + "em": 0.015625, + "em_stderr": 0.0012700767094662763, + "f1": 0.09636115771812082, + "f1_stderr": 0.0019819425315034905, + "acc": 0.5447099133363212, + "acc_stderr": 0.011752408531897077 + } + }, + "versions": { + "harness|drop|3": 1, + "harness|gsm8k|5": 0, + "harness|winogrande|5": 0, + "all": 0 + }, + "config_tasks": { + "harness|drop": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + 
"summary_tasks": { + "harness|drop|3": { + "hashes": { + "hash_examples": "1d27416e8324e9a3", + "hash_full_prompts": "a5513ff9a741b385", + "hash_input_tokens": "61b608e0b5ceed76", + "hash_cont_tokens": "fadb1584424463e8" + }, + "truncated": 1263, + "non-truncated": 8273, + "padded": 0, + "non-padded": 9536, + "effective_few_shots": 3.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "bda342e47b5099b2", + "hash_cont_tokens": "fdfc37768446a780" + }, + "truncated": 0, + "non-truncated": 1319, + "padded": 0, + "non-padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "c0bedf98cb040854", + "hash_cont_tokens": "f08975ad6f2d5864" + }, + "truncated": 0, + "non-truncated": 2534, + "padded": 2432, + "non-padded": 102, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "9b4d8993161e637d", + "hash_full_prompts": "08215e527b7e60a5", + "hash_input_tokens": "80afe720f936f8d2", + "hash_cont_tokens": "f62ae5dafffa85f0" + }, + "total_evaluation_time_secondes": "45914.21450138092", + "truncated": 1263, + "non-truncated": 12126, + "padded": 2432, + "non-padded": 10957, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/TheBloke/gpt4-x-vicuna-13B-HF/results_2023-07-19T19-01-51.030763.json b/eval-results/TheBloke/gpt4-x-vicuna-13B-HF/results_2023-07-19T19-01-51.030763.json new file mode 100644 index 0000000000000000000000000000000000000000..b827baa0a633b0b0cc0788b2b3a29f3853c7c7a4 --- /dev/null +++ b/eval-results/TheBloke/gpt4-x-vicuna-13B-HF/results_2023-07-19T19-01-51.030763.json @@ -0,0 +1,871 @@ +{ + "results": { + "harness|arc:challenge|25": { + "acc": 0.5110921501706485, + "acc_stderr": 0.01460779491401305, + "acc_norm": 0.5341296928327645, + "acc_norm_stderr": 0.014577311315231104 + }, + "harness|hellaswag|10": { + "acc": 0.6038637721569409, + "acc_stderr": 0.004880937933163287, + "acc_norm": 0.8012348137821151, + "acc_norm_stderr": 0.003982553164086259 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.31, + "acc_stderr": 0.04648231987117316, + "acc_norm": 0.31, + "acc_norm_stderr": 0.04648231987117316 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.45925925925925926, + "acc_stderr": 0.04304979692464243, + "acc_norm": 0.45925925925925926, + "acc_norm_stderr": 0.04304979692464243 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.506578947368421, + "acc_stderr": 0.040685900502249704, + "acc_norm": 0.506578947368421, + "acc_norm_stderr": 0.040685900502249704 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.57, + "acc_stderr": 0.04975698519562428, + "acc_norm": 0.57, + "acc_norm_stderr": 0.04975698519562428 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.4867924528301887, + "acc_stderr": 0.030762134874500482, + "acc_norm": 0.4867924528301887, + "acc_norm_stderr": 0.030762134874500482 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.5486111111111112, + "acc_stderr": 0.04161402398403279, + "acc_norm": 0.5486111111111112, + "acc_norm_stderr": 0.04161402398403279 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.32, + "acc_stderr": 0.046882617226215034, + "acc_norm": 0.32, + "acc_norm_stderr": 
0.046882617226215034 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.44, + "acc_stderr": 0.04988876515698589, + "acc_norm": 0.44, + "acc_norm_stderr": 0.04988876515698589 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.35, + "acc_stderr": 0.047937248544110196, + "acc_norm": 0.35, + "acc_norm_stderr": 0.047937248544110196 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.4046242774566474, + "acc_stderr": 0.03742461193887249, + "acc_norm": 0.4046242774566474, + "acc_norm_stderr": 0.03742461193887249 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.20588235294117646, + "acc_stderr": 0.04023382273617747, + "acc_norm": 0.20588235294117646, + "acc_norm_stderr": 0.04023382273617747 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.57, + "acc_stderr": 0.049756985195624284, + "acc_norm": 0.57, + "acc_norm_stderr": 0.049756985195624284 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.3617021276595745, + "acc_stderr": 0.03141082197596241, + "acc_norm": 0.3617021276595745, + "acc_norm_stderr": 0.03141082197596241 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.2894736842105263, + "acc_stderr": 0.04266339443159394, + "acc_norm": 0.2894736842105263, + "acc_norm_stderr": 0.04266339443159394 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.47586206896551725, + "acc_stderr": 0.041618085035015295, + "acc_norm": 0.47586206896551725, + "acc_norm_stderr": 0.041618085035015295 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.28835978835978837, + "acc_stderr": 0.023330654054535896, + "acc_norm": 0.28835978835978837, + "acc_norm_stderr": 0.023330654054535896 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.42063492063492064, + "acc_stderr": 0.04415438226743744, + "acc_norm": 0.42063492063492064, + "acc_norm_stderr": 0.04415438226743744 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.31, + "acc_stderr": 0.04648231987117316, + "acc_norm": 0.31, + "acc_norm_stderr": 0.04648231987117316 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.5741935483870968, + "acc_stderr": 0.028129112709165894, + "acc_norm": 0.5741935483870968, + "acc_norm_stderr": 0.028129112709165894 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.3891625615763547, + "acc_stderr": 0.03430462416103873, + "acc_norm": 0.3891625615763547, + "acc_norm_stderr": 0.03430462416103873 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.53, + "acc_stderr": 0.05016135580465919, + "acc_norm": 0.53, + "acc_norm_stderr": 0.05016135580465919 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.6424242424242425, + "acc_stderr": 0.037425970438065864, + "acc_norm": 0.6424242424242425, + "acc_norm_stderr": 0.037425970438065864 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.6363636363636364, + "acc_stderr": 0.034273086529999344, + "acc_norm": 0.6363636363636364, + "acc_norm_stderr": 0.034273086529999344 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.7046632124352331, + "acc_stderr": 0.03292296639155141, + "acc_norm": 0.7046632124352331, + "acc_norm_stderr": 0.03292296639155141 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.43846153846153846, + "acc_stderr": 0.02515826601686857, + "acc_norm": 0.43846153846153846, + "acc_norm_stderr": 0.02515826601686857 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 
0.23703703703703705, + "acc_stderr": 0.025928876132766135, + "acc_norm": 0.23703703703703705, + "acc_norm_stderr": 0.025928876132766135 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.4411764705882353, + "acc_stderr": 0.0322529423239964, + "acc_norm": 0.4411764705882353, + "acc_norm_stderr": 0.0322529423239964 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.33112582781456956, + "acc_stderr": 0.038425817186598696, + "acc_norm": 0.33112582781456956, + "acc_norm_stderr": 0.038425817186598696 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.6770642201834862, + "acc_stderr": 0.02004811592341531, + "acc_norm": 0.6770642201834862, + "acc_norm_stderr": 0.02004811592341531 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.36574074074074076, + "acc_stderr": 0.03284738857647206, + "acc_norm": 0.36574074074074076, + "acc_norm_stderr": 0.03284738857647206 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.6715686274509803, + "acc_stderr": 0.03296245110172228, + "acc_norm": 0.6715686274509803, + "acc_norm_stderr": 0.03296245110172228 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.7046413502109705, + "acc_stderr": 0.02969633871342288, + "acc_norm": 0.7046413502109705, + "acc_norm_stderr": 0.02969633871342288 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.5739910313901345, + "acc_stderr": 0.03318833286217281, + "acc_norm": 0.5739910313901345, + "acc_norm_stderr": 0.03318833286217281 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.6717557251908397, + "acc_stderr": 0.04118438565806298, + "acc_norm": 0.6717557251908397, + "acc_norm_stderr": 0.04118438565806298 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.6776859504132231, + "acc_stderr": 0.042664163633521685, + "acc_norm": 0.6776859504132231, + "acc_norm_stderr": 0.042664163633521685 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.6574074074074074, + "acc_stderr": 0.045879047413018105, + "acc_norm": 0.6574074074074074, + "acc_norm_stderr": 0.045879047413018105 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.6625766871165644, + "acc_stderr": 0.03714908409935574, + "acc_norm": 0.6625766871165644, + "acc_norm_stderr": 0.03714908409935574 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.39285714285714285, + "acc_stderr": 0.04635550135609976, + "acc_norm": 0.39285714285714285, + "acc_norm_stderr": 0.04635550135609976 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.6796116504854369, + "acc_stderr": 0.04620284082280041, + "acc_norm": 0.6796116504854369, + "acc_norm_stderr": 0.04620284082280041 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.7735042735042735, + "acc_stderr": 0.027421007295392912, + "acc_norm": 0.7735042735042735, + "acc_norm_stderr": 0.027421007295392912 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.62, + "acc_stderr": 0.048783173121456316, + "acc_norm": 0.62, + "acc_norm_stderr": 0.048783173121456316 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.6871008939974457, + "acc_stderr": 0.01658093594030406, + "acc_norm": 0.6871008939974457, + "acc_norm_stderr": 0.01658093594030406 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.5375722543352601, + "acc_stderr": 0.026842985519615375, + "acc_norm": 0.5375722543352601, + "acc_norm_stderr": 0.026842985519615375 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.31731843575418994, + "acc_stderr": 
0.01556639263005703, + "acc_norm": 0.31731843575418994, + "acc_norm_stderr": 0.01556639263005703 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.5424836601307189, + "acc_stderr": 0.028526383452142638, + "acc_norm": 0.5424836601307189, + "acc_norm_stderr": 0.028526383452142638 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.5530546623794212, + "acc_stderr": 0.02823776942208535, + "acc_norm": 0.5530546623794212, + "acc_norm_stderr": 0.02823776942208535 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.5740740740740741, + "acc_stderr": 0.027513747284379424, + "acc_norm": 0.5740740740740741, + "acc_norm_stderr": 0.027513747284379424 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.38652482269503546, + "acc_stderr": 0.02904919034254346, + "acc_norm": 0.38652482269503546, + "acc_norm_stderr": 0.02904919034254346 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.41264667535853977, + "acc_stderr": 0.012573836633799015, + "acc_norm": 0.41264667535853977, + "acc_norm_stderr": 0.012573836633799015 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.44485294117647056, + "acc_stderr": 0.03018753206032939, + "acc_norm": 0.44485294117647056, + "acc_norm_stderr": 0.03018753206032939 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.5196078431372549, + "acc_stderr": 0.020212274976302957, + "acc_norm": 0.5196078431372549, + "acc_norm_stderr": 0.020212274976302957 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.5454545454545454, + "acc_stderr": 0.04769300568972743, + "acc_norm": 0.5454545454545454, + "acc_norm_stderr": 0.04769300568972743 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.5795918367346938, + "acc_stderr": 0.03160106993449601, + "acc_norm": 0.5795918367346938, + "acc_norm_stderr": 0.03160106993449601 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.7412935323383084, + "acc_stderr": 0.030965903123573033, + "acc_norm": 0.7412935323383084, + "acc_norm_stderr": 0.030965903123573033 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.79, + "acc_stderr": 0.040936018074033256, + "acc_norm": 0.79, + "acc_norm_stderr": 0.040936018074033256 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.45180722891566266, + "acc_stderr": 0.03874371556587953, + "acc_norm": 0.45180722891566266, + "acc_norm_stderr": 0.03874371556587953 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.7426900584795322, + "acc_stderr": 0.03352799844161865, + "acc_norm": 0.7426900584795322, + "acc_norm_stderr": 0.03352799844161865 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.3635250917992656, + "mc1_stderr": 0.016838862883965827, + "mc2": 0.5357942440986606, + "mc2_stderr": 0.015916184024373756 + }, + "all": { + "acc": 0.5137597162733054, + "acc_stderr": 0.03484317305077308, + "acc_norm": 0.5174954549900392, + "acc_norm_stderr": 0.03482742951911445, + "mc1": 0.3635250917992656, + "mc1_stderr": 0.016838862883965827, + "mc2": 0.5357942440986606, + "mc2_stderr": 0.015916184024373756 + } + }, + "versions": { + "harness|arc:challenge|25": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + 
"harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "all": 0 + }, + "config": { + "model_name": "TheBloke/gpt4-x-vicuna-13B-HF", + "model_sha": "a247577c882940e0c6b040fe8239d760c0d10d40", + "model_dtype": "torch.float16", + "lighteval_sha": "43cff840721bd0214adb4e29236a5e2ca1813937", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null + }, + "task_config": { + "harness|arc:challenge": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + 
"harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task" + }, + "hashes": { + "harness|arc:challenge|25": { + "hash_examples": "fb8c51b1872daeda", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "61571bf68d6d89aa", + "hash_cont_tokens": "8210decc6ff6f7df" + }, + "harness|hellaswag|10": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "29906669b1c7054a", + "hash_cont_tokens": 
"b3b9e9017afa63af" + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "c54ff61ad0273dd7", + "hash_cont_tokens": "50421e30bef398f9" + }, + "harness|hendrycksTest-anatomy|5": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "be31a1e22aef5f90", + "hash_cont_tokens": "f11971a765cb609f" + }, + "harness|hendrycksTest-astronomy|5": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "277a7b1fad566940", + "hash_cont_tokens": "bf30e5d3f48250cb" + }, + "harness|hendrycksTest-business_ethics|5": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "ba552605bc116de5", + "hash_cont_tokens": "bc1dd9b2d995eb61" + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "428c7563d0b98ab9", + "hash_cont_tokens": "890a119624b3b935" + }, + "harness|hendrycksTest-college_biology|5": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "da036601573942e2", + "hash_cont_tokens": "875cde3af7a0ee14" + }, + "harness|hendrycksTest-college_chemistry|5": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "94e0196d6aded13d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "harness|hendrycksTest-college_computer_science|5": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "6e4d0f4a8d36690b", + "hash_cont_tokens": "ffc0fe414cdc4a83" + }, + "harness|hendrycksTest-college_mathematics|5": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "614054d17109a25d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "harness|hendrycksTest-college_medicine|5": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "1d633b3cc0524ba8", + "hash_cont_tokens": "1f88b00d41957d82" + }, + "harness|hendrycksTest-college_physics|5": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "5421d9a1af86cbd4", + "hash_cont_tokens": "f7b8097afc16a47c" + }, + "harness|hendrycksTest-computer_security|5": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "5e6b70ecb333cf18", + "hash_cont_tokens": "50421e30bef398f9" + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "c2ef11a87264ceed", + "hash_cont_tokens": "aa0e8bc655f2f641" + }, + "harness|hendrycksTest-econometrics|5": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "ecaccd912a4c3978", + "hash_cont_tokens": "bfb7e3c3c88313f1" + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "1590c84291399be8", + "hash_cont_tokens": "2425a3f084a591ef" + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "3269597f715b0da1", + "hash_cont_tokens": "f52691aef15a407b" + }, + 
"harness|hendrycksTest-formal_logic|5": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "a2800d20f3ab8d7c", + "hash_cont_tokens": "f515d598d9c21263" + }, + "harness|hendrycksTest-global_facts|5": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "94ed44b3772505ad", + "hash_cont_tokens": "50421e30bef398f9" + }, + "harness|hendrycksTest-high_school_biology|5": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "24423acb928db768", + "hash_cont_tokens": "bd85a4156a3613ee" + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "831ff35c474e5cef", + "hash_cont_tokens": "a95c97af1c14e068" + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "8c34e0f2bda77358", + "hash_cont_tokens": "8abfedef914e33c9" + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "f1f73dd687da18d7", + "hash_cont_tokens": "674fc454bdc5ac93" + }, + "harness|hendrycksTest-high_school_geography|5": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "7c5547c7da5bc793", + "hash_cont_tokens": "03a5012b916274ea" + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "f62991cb6a496b05", + "hash_cont_tokens": "a83effb8f76b7d7c" + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "4cef2aff6e3d59ed", + "hash_cont_tokens": "c583432ad27fcfe0" + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "6e2577ea4082ed2b", + "hash_cont_tokens": "24f5dc613660300b" + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "c5fc9aeb1079c8e4", + "hash_cont_tokens": "f47f041de50333b9" + }, + "harness|hendrycksTest-high_school_physics|5": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "555fc385cffa84ca", + "hash_cont_tokens": "ba2efcd283e938cc" + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "febd23cbf9973b7f", + "hash_cont_tokens": "942069cd363844d9" + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "424b02981230ee83", + "hash_cont_tokens": "955ed42b6f7fa019" + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "50c9ff438c85a69e", + "hash_cont_tokens": "cdd0b3dc06d933e5" + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": 
"054824cc474caef5", + "hash_cont_tokens": "9a864184946033ac" + }, + "harness|hendrycksTest-human_aging|5": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "541a75f071dcf579", + "hash_cont_tokens": "142a4a8a1138a214" + }, + "harness|hendrycksTest-human_sexuality|5": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "04269e5c5a257dd9", + "hash_cont_tokens": "bc54813e809b796d" + }, + "harness|hendrycksTest-international_law|5": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "d93ba9d9d38e4397", + "hash_cont_tokens": "dc45b45fcda18e5d" + }, + "harness|hendrycksTest-jurisprudence|5": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "9eeaccd2698b4f5a", + "hash_cont_tokens": "e3a8cd951b6e3469" + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "b4f08f544f2b7576", + "hash_cont_tokens": "1e80dbd30f6453d5" + }, + "harness|hendrycksTest-machine_learning|5": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "900c2a51f1174b9f", + "hash_cont_tokens": "9b37da7777378ca9" + }, + "harness|hendrycksTest-management|5": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "6b36efb4689c6eca", + "hash_cont_tokens": "a01d6d39a83c4597" + }, + "harness|hendrycksTest-marketing|5": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "2aaac78a0cfed47a", + "hash_cont_tokens": "6aeaed4d823c98aa" + }, + "harness|hendrycksTest-medical_genetics|5": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "886ca823b41c094a", + "hash_cont_tokens": "50421e30bef398f9" + }, + "harness|hendrycksTest-miscellaneous|5": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "72fd71de7675e7d0", + "hash_cont_tokens": "9b0ab02a64603081" + }, + "harness|hendrycksTest-moral_disputes|5": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "f3ca0dd8e7a1eb09", + "hash_cont_tokens": "8badf768f7b0467a" + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "3e793631e951f23c", + "hash_cont_tokens": "32ae620376b2bbba" + }, + "harness|hendrycksTest-nutrition|5": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "59753c2144ea93af", + "hash_cont_tokens": "3071def75bacc404" + }, + "harness|hendrycksTest-philosophy|5": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "bd8d3dbed15a8c34", + "hash_cont_tokens": "9f6ff69d23a48783" + }, + "harness|hendrycksTest-prehistory|5": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "3573cd87facbb7c5", + "hash_cont_tokens": "de469d2b981e32a3" + }, + "harness|hendrycksTest-professional_accounting|5": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "17e721bc1a7cbb47", + "hash_cont_tokens": "c46f74d2dfc7b13b" + }, + 
"harness|hendrycksTest-professional_law|5": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "9178e10bd0763ec4", + "hash_cont_tokens": "2e590029ef41fbcd" + }, + "harness|hendrycksTest-professional_medicine|5": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "f5a22012a54f70ea", + "hash_cont_tokens": "fe35cfa9c6ca802e" + }, + "harness|hendrycksTest-professional_psychology|5": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "0dfb73a8eb3f692c", + "hash_cont_tokens": "f020fbddf72c8652" + }, + "harness|hendrycksTest-public_relations|5": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "1710c6ba4c9f3cbd", + "hash_cont_tokens": "568f585a259965c1" + }, + "harness|hendrycksTest-security_studies|5": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "d49711415961ced7", + "hash_cont_tokens": "cc6fd7cccd64cd5d" + }, + "harness|hendrycksTest-sociology|5": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "828999f7624cbe7e", + "hash_cont_tokens": "c3a3bdfd177eed5b" + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "42054621e718dbee", + "hash_cont_tokens": "2568d0e8e36fa959" + }, + "harness|hendrycksTest-virology|5": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "6c4f0aa4dc859c04", + "hash_cont_tokens": "926cf60b0891f374" + }, + "harness|hendrycksTest-world_religions|5": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "6c75d44e092ff24f", + "hash_cont_tokens": "c525a5de974c1ea3" + }, + "harness|truthfulqa:mc|0": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "2738d7ed7075faa7", + "hash_cont_tokens": "c014154380b74b9e" + } + } +} \ No newline at end of file diff --git a/eval-results/TheBloke/guanaco-13B-HF/results_2023-07-19T19-24-37.744515.json b/eval-results/TheBloke/guanaco-13B-HF/results_2023-07-19T19-24-37.744515.json new file mode 100644 index 0000000000000000000000000000000000000000..909e2b2ea40445686c645eae4712f394c63e2e23 --- /dev/null +++ b/eval-results/TheBloke/guanaco-13B-HF/results_2023-07-19T19-24-37.744515.json @@ -0,0 +1,871 @@ +{ + "results": { + "harness|arc:challenge|25": { + "acc": 0.5486348122866894, + "acc_stderr": 0.014542104569955264, + "acc_norm": 0.5784982935153583, + "acc_norm_stderr": 0.014430197069326023 + }, + "harness|hellaswag|10": { + "acc": 0.6353316072495518, + "acc_stderr": 0.004803533333364223, + "acc_norm": 0.8383788090021908, + "acc_norm_stderr": 0.0036735065123709503 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.26, + "acc_stderr": 0.0440844002276808, + "acc_norm": 0.26, + "acc_norm_stderr": 0.0440844002276808 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.4740740740740741, + "acc_stderr": 0.04313531696750574, + "acc_norm": 0.4740740740740741, + "acc_norm_stderr": 0.04313531696750574 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.4934210526315789, + "acc_stderr": 0.040685900502249704, + "acc_norm": 0.4934210526315789, + "acc_norm_stderr": 0.040685900502249704 + }, + "harness|hendrycksTest-business_ethics|5": 
{ + "acc": 0.48, + "acc_stderr": 0.050211673156867795, + "acc_norm": 0.48, + "acc_norm_stderr": 0.050211673156867795 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.47924528301886793, + "acc_stderr": 0.030746349975723463, + "acc_norm": 0.47924528301886793, + "acc_norm_stderr": 0.030746349975723463 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.4513888888888889, + "acc_stderr": 0.04161402398403279, + "acc_norm": 0.4513888888888889, + "acc_norm_stderr": 0.04161402398403279 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.31, + "acc_stderr": 0.04648231987117316, + "acc_norm": 0.31, + "acc_norm_stderr": 0.04648231987117316 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.44, + "acc_stderr": 0.04988876515698589, + "acc_norm": 0.44, + "acc_norm_stderr": 0.04988876515698589 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.35, + "acc_stderr": 0.0479372485441102, + "acc_norm": 0.35, + "acc_norm_stderr": 0.0479372485441102 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.3930635838150289, + "acc_stderr": 0.037242495958177295, + "acc_norm": 0.3930635838150289, + "acc_norm_stderr": 0.037242495958177295 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.24509803921568626, + "acc_stderr": 0.04280105837364395, + "acc_norm": 0.24509803921568626, + "acc_norm_stderr": 0.04280105837364395 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.62, + "acc_stderr": 0.048783173121456316, + "acc_norm": 0.62, + "acc_norm_stderr": 0.048783173121456316 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.40425531914893614, + "acc_stderr": 0.03208115750788684, + "acc_norm": 0.40425531914893614, + "acc_norm_stderr": 0.03208115750788684 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.2982456140350877, + "acc_stderr": 0.04303684033537315, + "acc_norm": 0.2982456140350877, + "acc_norm_stderr": 0.04303684033537315 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.4413793103448276, + "acc_stderr": 0.04137931034482758, + "acc_norm": 0.4413793103448276, + "acc_norm_stderr": 0.04137931034482758 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.2777777777777778, + "acc_stderr": 0.02306818884826112, + "acc_norm": 0.2777777777777778, + "acc_norm_stderr": 0.02306818884826112 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.35714285714285715, + "acc_stderr": 0.04285714285714281, + "acc_norm": 0.35714285714285715, + "acc_norm_stderr": 0.04285714285714281 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.36, + "acc_stderr": 0.048241815132442176, + "acc_norm": 0.36, + "acc_norm_stderr": 0.048241815132442176 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.5387096774193548, + "acc_stderr": 0.028358634859836935, + "acc_norm": 0.5387096774193548, + "acc_norm_stderr": 0.028358634859836935 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.35467980295566504, + "acc_stderr": 0.033661244890514495, + "acc_norm": 0.35467980295566504, + "acc_norm_stderr": 0.033661244890514495 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.48, + "acc_stderr": 0.050211673156867795, + "acc_norm": 0.48, + "acc_norm_stderr": 0.050211673156867795 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.5757575757575758, + "acc_stderr": 0.038592681420702636, + "acc_norm": 0.5757575757575758, + "acc_norm_stderr": 0.038592681420702636 + }, + 
"harness|hendrycksTest-high_school_geography|5": { + "acc": 0.5757575757575758, + "acc_stderr": 0.03521224908841586, + "acc_norm": 0.5757575757575758, + "acc_norm_stderr": 0.03521224908841586 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.6632124352331606, + "acc_stderr": 0.03410780251836183, + "acc_norm": 0.6632124352331606, + "acc_norm_stderr": 0.03410780251836183 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.44358974358974357, + "acc_stderr": 0.025189149894764198, + "acc_norm": 0.44358974358974357, + "acc_norm_stderr": 0.025189149894764198 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.29259259259259257, + "acc_stderr": 0.02773896963217609, + "acc_norm": 0.29259259259259257, + "acc_norm_stderr": 0.02773896963217609 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.4831932773109244, + "acc_stderr": 0.03246013680375308, + "acc_norm": 0.4831932773109244, + "acc_norm_stderr": 0.03246013680375308 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.2913907284768212, + "acc_stderr": 0.03710185726119996, + "acc_norm": 0.2913907284768212, + "acc_norm_stderr": 0.03710185726119996 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.6330275229357798, + "acc_stderr": 0.020664675659520525, + "acc_norm": 0.6330275229357798, + "acc_norm_stderr": 0.020664675659520525 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.3148148148148148, + "acc_stderr": 0.03167468706828977, + "acc_norm": 0.3148148148148148, + "acc_norm_stderr": 0.03167468706828977 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.5882352941176471, + "acc_stderr": 0.034542365853806094, + "acc_norm": 0.5882352941176471, + "acc_norm_stderr": 0.034542365853806094 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.6835443037974683, + "acc_stderr": 0.030274974880218984, + "acc_norm": 0.6835443037974683, + "acc_norm_stderr": 0.030274974880218984 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.5605381165919282, + "acc_stderr": 0.03331092511038179, + "acc_norm": 0.5605381165919282, + "acc_norm_stderr": 0.03331092511038179 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.5954198473282443, + "acc_stderr": 0.043046937953806645, + "acc_norm": 0.5954198473282443, + "acc_norm_stderr": 0.043046937953806645 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.6033057851239669, + "acc_stderr": 0.044658697805310094, + "acc_norm": 0.6033057851239669, + "acc_norm_stderr": 0.044658697805310094 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.5370370370370371, + "acc_stderr": 0.04820403072760627, + "acc_norm": 0.5370370370370371, + "acc_norm_stderr": 0.04820403072760627 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.4785276073619632, + "acc_stderr": 0.0392474687675113, + "acc_norm": 0.4785276073619632, + "acc_norm_stderr": 0.0392474687675113 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.3125, + "acc_stderr": 0.043994650575715215, + "acc_norm": 0.3125, + "acc_norm_stderr": 0.043994650575715215 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.6504854368932039, + "acc_stderr": 0.04721188506097173, + "acc_norm": 0.6504854368932039, + "acc_norm_stderr": 0.04721188506097173 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.7094017094017094, + "acc_stderr": 0.029745048572674057, + "acc_norm": 0.7094017094017094, + "acc_norm_stderr": 0.029745048572674057 + }, + 
"harness|hendrycksTest-medical_genetics|5": { + "acc": 0.49, + "acc_stderr": 0.05024183937956911, + "acc_norm": 0.49, + "acc_norm_stderr": 0.05024183937956911 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.6577266922094508, + "acc_stderr": 0.016967031766413627, + "acc_norm": 0.6577266922094508, + "acc_norm_stderr": 0.016967031766413627 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.5375722543352601, + "acc_stderr": 0.026842985519615375, + "acc_norm": 0.5375722543352601, + "acc_norm_stderr": 0.026842985519615375 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.2871508379888268, + "acc_stderr": 0.015131608849963752, + "acc_norm": 0.2871508379888268, + "acc_norm_stderr": 0.015131608849963752 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.5261437908496732, + "acc_stderr": 0.028590752958852387, + "acc_norm": 0.5261437908496732, + "acc_norm_stderr": 0.028590752958852387 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.5401929260450161, + "acc_stderr": 0.028306190403305696, + "acc_norm": 0.5401929260450161, + "acc_norm_stderr": 0.028306190403305696 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.558641975308642, + "acc_stderr": 0.027628737155668777, + "acc_norm": 0.558641975308642, + "acc_norm_stderr": 0.027628737155668777 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.38652482269503546, + "acc_stderr": 0.02904919034254346, + "acc_norm": 0.38652482269503546, + "acc_norm_stderr": 0.02904919034254346 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.39504563233376794, + "acc_stderr": 0.01248572781325156, + "acc_norm": 0.39504563233376794, + "acc_norm_stderr": 0.01248572781325156 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.5477941176470589, + "acc_stderr": 0.030233758551596445, + "acc_norm": 0.5477941176470589, + "acc_norm_stderr": 0.030233758551596445 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.47875816993464054, + "acc_stderr": 0.020209572388600244, + "acc_norm": 0.47875816993464054, + "acc_norm_stderr": 0.020209572388600244 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.5727272727272728, + "acc_stderr": 0.04738198703545483, + "acc_norm": 0.5727272727272728, + "acc_norm_stderr": 0.04738198703545483 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.5224489795918368, + "acc_stderr": 0.03197694118713672, + "acc_norm": 0.5224489795918368, + "acc_norm_stderr": 0.03197694118713672 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.6218905472636815, + "acc_stderr": 0.034288678487786564, + "acc_norm": 0.6218905472636815, + "acc_norm_stderr": 0.034288678487786564 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.79, + "acc_stderr": 0.040936018074033256, + "acc_norm": 0.79, + "acc_norm_stderr": 0.040936018074033256 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.4457831325301205, + "acc_stderr": 0.03869543323472101, + "acc_norm": 0.4457831325301205, + "acc_norm_stderr": 0.03869543323472101 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.6608187134502924, + "acc_stderr": 0.03631053496488904, + "acc_norm": 0.6608187134502924, + "acc_norm_stderr": 0.03631053496488904 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.3182374541003672, + "mc1_stderr": 0.016305988648920605, + "mc2": 0.4673352529553649, + "mc2_stderr": 0.015225510622226497 + }, + "all": { + "acc": 0.48650845710233237, + "acc_stderr": 0.035222212787197754, + "acc_norm": 0.49045609579693084, + "acc_norm_stderr": 
0.035201163052932964, + "mc1": 0.3182374541003672, + "mc1_stderr": 0.016305988648920605, + "mc2": 0.4673352529553649, + "mc2_stderr": 0.015225510622226497 + } + }, + "versions": { + "harness|arc:challenge|25": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "all": 0 + }, + "config": { + "model_name": "TheBloke/guanaco-13B-HF", + "model_sha": "bd59c700815124df616a17f5b49a0bc51590b231", + "model_dtype": "torch.float16", + "lighteval_sha": "43cff840721bd0214adb4e29236a5e2ca1813937", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null + }, + "task_config": { + "harness|arc:challenge": "LM Harness task", + 
"harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + 
"harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task" + }, + "hashes": { + "harness|arc:challenge|25": { + "hash_examples": "fb8c51b1872daeda", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "61571bf68d6d89aa", + "hash_cont_tokens": "8210decc6ff6f7df" + }, + "harness|hellaswag|10": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "29906669b1c7054a", + "hash_cont_tokens": "b3b9e9017afa63af" + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "c54ff61ad0273dd7", + "hash_cont_tokens": "50421e30bef398f9" + }, + "harness|hendrycksTest-anatomy|5": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "be31a1e22aef5f90", + "hash_cont_tokens": "f11971a765cb609f" + }, + "harness|hendrycksTest-astronomy|5": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "277a7b1fad566940", + "hash_cont_tokens": "bf30e5d3f48250cb" + }, + "harness|hendrycksTest-business_ethics|5": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "ba552605bc116de5", + "hash_cont_tokens": "bc1dd9b2d995eb61" + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "428c7563d0b98ab9", + "hash_cont_tokens": "890a119624b3b935" + }, + "harness|hendrycksTest-college_biology|5": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "da036601573942e2", + "hash_cont_tokens": "875cde3af7a0ee14" + }, + "harness|hendrycksTest-college_chemistry|5": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "94e0196d6aded13d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "harness|hendrycksTest-college_computer_science|5": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "6e4d0f4a8d36690b", + "hash_cont_tokens": "ffc0fe414cdc4a83" + }, + "harness|hendrycksTest-college_mathematics|5": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "614054d17109a25d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "harness|hendrycksTest-college_medicine|5": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "1d633b3cc0524ba8", + "hash_cont_tokens": "1f88b00d41957d82" + }, + "harness|hendrycksTest-college_physics|5": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "5421d9a1af86cbd4", + "hash_cont_tokens": "f7b8097afc16a47c" + }, + "harness|hendrycksTest-computer_security|5": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "5e6b70ecb333cf18", + "hash_cont_tokens": "50421e30bef398f9" + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "c2ef11a87264ceed", + "hash_cont_tokens": "aa0e8bc655f2f641" + }, + "harness|hendrycksTest-econometrics|5": { + "hash_examples": 
"64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "ecaccd912a4c3978", + "hash_cont_tokens": "bfb7e3c3c88313f1" + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "1590c84291399be8", + "hash_cont_tokens": "2425a3f084a591ef" + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "3269597f715b0da1", + "hash_cont_tokens": "f52691aef15a407b" + }, + "harness|hendrycksTest-formal_logic|5": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "a2800d20f3ab8d7c", + "hash_cont_tokens": "f515d598d9c21263" + }, + "harness|hendrycksTest-global_facts|5": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "94ed44b3772505ad", + "hash_cont_tokens": "50421e30bef398f9" + }, + "harness|hendrycksTest-high_school_biology|5": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "24423acb928db768", + "hash_cont_tokens": "bd85a4156a3613ee" + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "831ff35c474e5cef", + "hash_cont_tokens": "a95c97af1c14e068" + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "8c34e0f2bda77358", + "hash_cont_tokens": "8abfedef914e33c9" + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "f1f73dd687da18d7", + "hash_cont_tokens": "674fc454bdc5ac93" + }, + "harness|hendrycksTest-high_school_geography|5": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "7c5547c7da5bc793", + "hash_cont_tokens": "03a5012b916274ea" + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "f62991cb6a496b05", + "hash_cont_tokens": "a83effb8f76b7d7c" + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "4cef2aff6e3d59ed", + "hash_cont_tokens": "c583432ad27fcfe0" + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "6e2577ea4082ed2b", + "hash_cont_tokens": "24f5dc613660300b" + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "c5fc9aeb1079c8e4", + "hash_cont_tokens": "f47f041de50333b9" + }, + "harness|hendrycksTest-high_school_physics|5": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "555fc385cffa84ca", + "hash_cont_tokens": "ba2efcd283e938cc" + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "febd23cbf9973b7f", + "hash_cont_tokens": "942069cd363844d9" + }, + 
"harness|hendrycksTest-high_school_statistics|5": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "424b02981230ee83", + "hash_cont_tokens": "955ed42b6f7fa019" + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "50c9ff438c85a69e", + "hash_cont_tokens": "cdd0b3dc06d933e5" + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "054824cc474caef5", + "hash_cont_tokens": "9a864184946033ac" + }, + "harness|hendrycksTest-human_aging|5": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "541a75f071dcf579", + "hash_cont_tokens": "142a4a8a1138a214" + }, + "harness|hendrycksTest-human_sexuality|5": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "04269e5c5a257dd9", + "hash_cont_tokens": "bc54813e809b796d" + }, + "harness|hendrycksTest-international_law|5": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "d93ba9d9d38e4397", + "hash_cont_tokens": "dc45b45fcda18e5d" + }, + "harness|hendrycksTest-jurisprudence|5": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "9eeaccd2698b4f5a", + "hash_cont_tokens": "e3a8cd951b6e3469" + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "b4f08f544f2b7576", + "hash_cont_tokens": "1e80dbd30f6453d5" + }, + "harness|hendrycksTest-machine_learning|5": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "900c2a51f1174b9f", + "hash_cont_tokens": "9b37da7777378ca9" + }, + "harness|hendrycksTest-management|5": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "6b36efb4689c6eca", + "hash_cont_tokens": "a01d6d39a83c4597" + }, + "harness|hendrycksTest-marketing|5": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "2aaac78a0cfed47a", + "hash_cont_tokens": "6aeaed4d823c98aa" + }, + "harness|hendrycksTest-medical_genetics|5": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "886ca823b41c094a", + "hash_cont_tokens": "50421e30bef398f9" + }, + "harness|hendrycksTest-miscellaneous|5": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "72fd71de7675e7d0", + "hash_cont_tokens": "9b0ab02a64603081" + }, + "harness|hendrycksTest-moral_disputes|5": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "f3ca0dd8e7a1eb09", + "hash_cont_tokens": "8badf768f7b0467a" + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "3e793631e951f23c", + "hash_cont_tokens": "32ae620376b2bbba" + }, + "harness|hendrycksTest-nutrition|5": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "59753c2144ea93af", + "hash_cont_tokens": "3071def75bacc404" + }, + "harness|hendrycksTest-philosophy|5": { + "hash_examples": 
"9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "bd8d3dbed15a8c34", + "hash_cont_tokens": "9f6ff69d23a48783" + }, + "harness|hendrycksTest-prehistory|5": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "3573cd87facbb7c5", + "hash_cont_tokens": "de469d2b981e32a3" + }, + "harness|hendrycksTest-professional_accounting|5": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "17e721bc1a7cbb47", + "hash_cont_tokens": "c46f74d2dfc7b13b" + }, + "harness|hendrycksTest-professional_law|5": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "9178e10bd0763ec4", + "hash_cont_tokens": "2e590029ef41fbcd" + }, + "harness|hendrycksTest-professional_medicine|5": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "f5a22012a54f70ea", + "hash_cont_tokens": "fe35cfa9c6ca802e" + }, + "harness|hendrycksTest-professional_psychology|5": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "0dfb73a8eb3f692c", + "hash_cont_tokens": "f020fbddf72c8652" + }, + "harness|hendrycksTest-public_relations|5": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "1710c6ba4c9f3cbd", + "hash_cont_tokens": "568f585a259965c1" + }, + "harness|hendrycksTest-security_studies|5": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "d49711415961ced7", + "hash_cont_tokens": "cc6fd7cccd64cd5d" + }, + "harness|hendrycksTest-sociology|5": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "828999f7624cbe7e", + "hash_cont_tokens": "c3a3bdfd177eed5b" + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "42054621e718dbee", + "hash_cont_tokens": "2568d0e8e36fa959" + }, + "harness|hendrycksTest-virology|5": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "6c4f0aa4dc859c04", + "hash_cont_tokens": "926cf60b0891f374" + }, + "harness|hendrycksTest-world_religions|5": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "6c75d44e092ff24f", + "hash_cont_tokens": "c525a5de974c1ea3" + }, + "harness|truthfulqa:mc|0": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "2738d7ed7075faa7", + "hash_cont_tokens": "c014154380b74b9e" + } + } +} \ No newline at end of file diff --git a/eval-results/TheBloke/guanaco-13B-HF/results_2023-10-23T02-23-34.396726.json b/eval-results/TheBloke/guanaco-13B-HF/results_2023-10-23T02-23-34.396726.json new file mode 100644 index 0000000000000000000000000000000000000000..c14adefe99fca1236e3d934a2262c0a5184746ff --- /dev/null +++ b/eval-results/TheBloke/guanaco-13B-HF/results_2023-10-23T02-23-34.396726.json @@ -0,0 +1,107 @@ +{ + "config_general": { + "model_name": "TheBloke/guanaco-13B-HF", + "model_sha": "bd59c700815124df616a17f5b49a0bc51590b231", + "model_size": "24.28 GB", + "model_dtype": "torch.float16", + "lighteval_sha": "0f318ecf002208468154899217b3ba7c6ae09374", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + 
"job_id": "" + }, + "results": { + "harness|drop|3": { + "em": 0.003984899328859061, + "em_stderr": 0.0006451805848102414, + "f1": 0.06359479865771825, + "f1_stderr": 0.001462243147092022 + }, + "harness|gsm8k|5": { + "acc": 0.08718726307808947, + "acc_stderr": 0.007770691416783571 + }, + "harness|winogrande|5": { + "acc": 0.7584846093133386, + "acc_stderr": 0.012028983782011875 + }, + "all": { + "em": 0.003984899328859061, + "em_stderr": 0.0006451805848102414, + "f1": 0.06359479865771825, + "f1_stderr": 0.001462243147092022, + "acc": 0.422835936195714, + "acc_stderr": 0.009899837599397724 + } + }, + "versions": { + "harness|drop|3": 1, + "harness|gsm8k|5": 0, + "harness|winogrande|5": 0, + "all": 0 + }, + "config_tasks": { + "harness|drop": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|drop|3": { + "hashes": { + "hash_examples": "1d27416e8324e9a3", + "hash_full_prompts": "a5513ff9a741b385", + "hash_input_tokens": "61b608e0b5ceed76", + "hash_cont_tokens": "3999ec603e509a91" + }, + "truncated": 1263, + "non-truncated": 8273, + "padded": 0, + "non-padded": 9536, + "effective_few_shots": 3.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "bda342e47b5099b2", + "hash_cont_tokens": "df3d16ef5ca4fd99" + }, + "truncated": 0, + "non-truncated": 1319, + "padded": 0, + "non-padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "c0bedf98cb040854", + "hash_cont_tokens": "f08975ad6f2d5864" + }, + "truncated": 0, + "non-truncated": 2534, + "padded": 2432, + "non-padded": 102, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "9b4d8993161e637d", + "hash_full_prompts": "08215e527b7e60a5", + "hash_input_tokens": "80afe720f936f8d2", + "hash_cont_tokens": "7c1e66de7b284c9e" + }, + "total_evaluation_time_secondes": "13134.080974817276", + "truncated": 1263, + "non-truncated": 12126, + "padded": 2432, + "non-padded": 10957, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/TheBloke/guanaco-33B-GPTQ/results_2023-08-21T20-58-04.901059.json b/eval-results/TheBloke/guanaco-33B-GPTQ/results_2023-08-21T20-58-04.901059.json new file mode 100644 index 0000000000000000000000000000000000000000..1703890aeea40580ad05c1e79adf53efec2983cc --- /dev/null +++ b/eval-results/TheBloke/guanaco-33B-GPTQ/results_2023-08-21T20-58-04.901059.json @@ -0,0 +1,1365 @@ +{ + "results": { + "harness|arc:challenge|25": { + "acc": 0.23890784982935154, + "acc_stderr": 0.012461071376316617, + "acc_norm": 0.2815699658703072, + "acc_norm_stderr": 0.013143376735009019 + }, + "harness|hellaswag|10": { + "acc": 0.25423222465644296, + "acc_stderr": 0.00434538861452002, + "acc_norm": 0.2633937462656841, + "acc_norm_stderr": 0.00439573949568858 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.31, + "acc_stderr": 0.04648231987117316, + "acc_norm": 0.31, + "acc_norm_stderr": 0.04648231987117316 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.2518518518518518, + "acc_stderr": 0.03749850709174022, + "acc_norm": 0.2518518518518518, + "acc_norm_stderr": 0.03749850709174022 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 
0.23684210526315788, + "acc_stderr": 0.03459777606810536, + "acc_norm": 0.23684210526315788, + "acc_norm_stderr": 0.03459777606810536 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.21, + "acc_stderr": 0.04093601807403326, + "acc_norm": 0.21, + "acc_norm_stderr": 0.04093601807403326 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.21132075471698114, + "acc_stderr": 0.025125766484827845, + "acc_norm": 0.21132075471698114, + "acc_norm_stderr": 0.025125766484827845 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.22916666666666666, + "acc_stderr": 0.03514697467862388, + "acc_norm": 0.22916666666666666, + "acc_norm_stderr": 0.03514697467862388 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.23, + "acc_stderr": 0.04229525846816506, + "acc_norm": 0.23, + "acc_norm_stderr": 0.04229525846816506 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.28, + "acc_stderr": 0.04512608598542127, + "acc_norm": 0.28, + "acc_norm_stderr": 0.04512608598542127 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.27, + "acc_stderr": 0.044619604333847415, + "acc_norm": 0.27, + "acc_norm_stderr": 0.044619604333847415 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.2543352601156069, + "acc_stderr": 0.0332055644308557, + "acc_norm": 0.2543352601156069, + "acc_norm_stderr": 0.0332055644308557 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.28431372549019607, + "acc_stderr": 0.04488482852329017, + "acc_norm": 0.28431372549019607, + "acc_norm_stderr": 0.04488482852329017 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.31, + "acc_stderr": 0.04648231987117316, + "acc_norm": 0.31, + "acc_norm_stderr": 0.04648231987117316 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.25957446808510637, + "acc_stderr": 0.02865917937429232, + "acc_norm": 0.25957446808510637, + "acc_norm_stderr": 0.02865917937429232 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.2807017543859649, + "acc_stderr": 0.04227054451232199, + "acc_norm": 0.2807017543859649, + "acc_norm_stderr": 0.04227054451232199 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.3103448275862069, + "acc_stderr": 0.03855289616378948, + "acc_norm": 0.3103448275862069, + "acc_norm_stderr": 0.03855289616378948 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.22486772486772486, + "acc_stderr": 0.021502096078229147, + "acc_norm": 0.22486772486772486, + "acc_norm_stderr": 0.021502096078229147 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.23015873015873015, + "acc_stderr": 0.037649508797906066, + "acc_norm": 0.23015873015873015, + "acc_norm_stderr": 0.037649508797906066 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.25, + "acc_stderr": 0.04351941398892446, + "acc_norm": 0.25, + "acc_norm_stderr": 0.04351941398892446 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.20967741935483872, + "acc_stderr": 0.023157879349083522, + "acc_norm": 0.20967741935483872, + "acc_norm_stderr": 0.023157879349083522 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.21182266009852216, + "acc_stderr": 0.02874898368994108, + "acc_norm": 0.21182266009852216, + "acc_norm_stderr": 0.02874898368994108 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.31, + "acc_stderr": 0.04648231987117316, + "acc_norm": 0.31, + "acc_norm_stderr": 0.04648231987117316 + }, + "harness|hendrycksTest-high_school_european_history|5": { + 
"acc": 0.2727272727272727, + "acc_stderr": 0.03477691162163659, + "acc_norm": 0.2727272727272727, + "acc_norm_stderr": 0.03477691162163659 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.2474747474747475, + "acc_stderr": 0.03074630074212451, + "acc_norm": 0.2474747474747475, + "acc_norm_stderr": 0.03074630074212451 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.15544041450777202, + "acc_stderr": 0.02614848346915333, + "acc_norm": 0.15544041450777202, + "acc_norm_stderr": 0.02614848346915333 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.2076923076923077, + "acc_stderr": 0.020567539567246815, + "acc_norm": 0.2076923076923077, + "acc_norm_stderr": 0.020567539567246815 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.2777777777777778, + "acc_stderr": 0.027309140588230172, + "acc_norm": 0.2777777777777778, + "acc_norm_stderr": 0.027309140588230172 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.20168067226890757, + "acc_stderr": 0.02606431340630452, + "acc_norm": 0.20168067226890757, + "acc_norm_stderr": 0.02606431340630452 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.24503311258278146, + "acc_stderr": 0.03511807571804725, + "acc_norm": 0.24503311258278146, + "acc_norm_stderr": 0.03511807571804725 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.23119266055045873, + "acc_stderr": 0.018075750241633146, + "acc_norm": 0.23119266055045873, + "acc_norm_stderr": 0.018075750241633146 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.30092592592592593, + "acc_stderr": 0.031280390843298804, + "acc_norm": 0.30092592592592593, + "acc_norm_stderr": 0.031280390843298804 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.25980392156862747, + "acc_stderr": 0.030778554678693268, + "acc_norm": 0.25980392156862747, + "acc_norm_stderr": 0.030778554678693268 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.22362869198312235, + "acc_stderr": 0.027123298205229972, + "acc_norm": 0.22362869198312235, + "acc_norm_stderr": 0.027123298205229972 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.28699551569506726, + "acc_stderr": 0.030360379710291954, + "acc_norm": 0.28699551569506726, + "acc_norm_stderr": 0.030360379710291954 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.21374045801526717, + "acc_stderr": 0.0359546161177469, + "acc_norm": 0.21374045801526717, + "acc_norm_stderr": 0.0359546161177469 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.2644628099173554, + "acc_stderr": 0.04026187527591207, + "acc_norm": 0.2644628099173554, + "acc_norm_stderr": 0.04026187527591207 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.25925925925925924, + "acc_stderr": 0.04236511258094633, + "acc_norm": 0.25925925925925924, + "acc_norm_stderr": 0.04236511258094633 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.2392638036809816, + "acc_stderr": 0.033519538795212696, + "acc_norm": 0.2392638036809816, + "acc_norm_stderr": 0.033519538795212696 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.20535714285714285, + "acc_stderr": 0.03834241021419073, + "acc_norm": 0.20535714285714285, + "acc_norm_stderr": 0.03834241021419073 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.21359223300970873, + "acc_stderr": 0.040580420156460344, + "acc_norm": 0.21359223300970873, + "acc_norm_stderr": 0.040580420156460344 + }, + 
"harness|hendrycksTest-marketing|5": { + "acc": 0.2692307692307692, + "acc_stderr": 0.029058588303748842, + "acc_norm": 0.2692307692307692, + "acc_norm_stderr": 0.029058588303748842 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.22, + "acc_stderr": 0.041633319989322695, + "acc_norm": 0.22, + "acc_norm_stderr": 0.041633319989322695 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.26181353767560667, + "acc_stderr": 0.01572083867844526, + "acc_norm": 0.26181353767560667, + "acc_norm_stderr": 0.01572083867844526 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.24566473988439305, + "acc_stderr": 0.023176298203992005, + "acc_norm": 0.24566473988439305, + "acc_norm_stderr": 0.023176298203992005 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.28044692737430166, + "acc_stderr": 0.015024083883322884, + "acc_norm": 0.28044692737430166, + "acc_norm_stderr": 0.015024083883322884 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.27124183006535946, + "acc_stderr": 0.02545775669666789, + "acc_norm": 0.27124183006535946, + "acc_norm_stderr": 0.02545775669666789 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.24115755627009647, + "acc_stderr": 0.024296594034763426, + "acc_norm": 0.24115755627009647, + "acc_norm_stderr": 0.024296594034763426 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.2191358024691358, + "acc_stderr": 0.02301670564026219, + "acc_norm": 0.2191358024691358, + "acc_norm_stderr": 0.02301670564026219 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.24822695035460993, + "acc_stderr": 0.025770015644290385, + "acc_norm": 0.24822695035460993, + "acc_norm_stderr": 0.025770015644290385 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.24902216427640156, + "acc_stderr": 0.01104489226404077, + "acc_norm": 0.24902216427640156, + "acc_norm_stderr": 0.01104489226404077 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.19117647058823528, + "acc_stderr": 0.02388688192244035, + "acc_norm": 0.19117647058823528, + "acc_norm_stderr": 0.02388688192244035 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.2826797385620915, + "acc_stderr": 0.018217269552053435, + "acc_norm": 0.2826797385620915, + "acc_norm_stderr": 0.018217269552053435 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.22727272727272727, + "acc_stderr": 0.04013964554072773, + "acc_norm": 0.22727272727272727, + "acc_norm_stderr": 0.04013964554072773 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.2938775510204082, + "acc_stderr": 0.029162738410249765, + "acc_norm": 0.2938775510204082, + "acc_norm_stderr": 0.029162738410249765 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.2885572139303483, + "acc_stderr": 0.03203841040213321, + "acc_norm": 0.2885572139303483, + "acc_norm_stderr": 0.03203841040213321 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.26, + "acc_stderr": 0.04408440022768079, + "acc_norm": 0.26, + "acc_norm_stderr": 0.04408440022768079 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.21084337349397592, + "acc_stderr": 0.0317555478662992, + "acc_norm": 0.21084337349397592, + "acc_norm_stderr": 0.0317555478662992 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.28654970760233917, + "acc_stderr": 0.034678266857038266, + "acc_norm": 0.28654970760233917, + "acc_norm_stderr": 0.034678266857038266 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.2460220318237454, + "mc1_stderr": 0.015077219200662574, + "mc2": 
0.48980862480634235, + "mc2_stderr": 0.016960965764557922 + }, + "all": { + "acc": 0.2493400645880108, + "acc_stderr": 0.03154721477538292, + "acc_norm": 0.25021843132784466, + "acc_norm_stderr": 0.03155963267775345, + "mc1": 0.2460220318237454, + "mc1_stderr": 0.015077219200662574, + "mc2": 0.48980862480634235, + "mc2_stderr": 0.016960965764557922 + } + }, + "versions": { + "harness|arc:challenge|25": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "all": 0 + }, + "config_general": { + "model_name": "TheBloke/guanaco-33B-GPTQ", + "model_sha": "8e42e031bfc8be3bbf31dc546d7c51fb991ff6e0", + "model_dtype": "torch.float16", + "lighteval_sha": 
"9a6ba3212080b87510982c30fdec55b87dcab0c7", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null + }, + "config_tasks": { + "harness|arc:challenge": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness 
task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task" + }, + "summary_tasks": { + "harness|arc:challenge|25": { + "hashes": { + "hash_examples": "17b0cae357c0259e", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "61571bf68d6d89aa", + "hash_cont_tokens": "8210decc6ff6f7df" + }, + "truncated": 0, + "non-truncated": 4687, + "padded": 4687, + "non-padded": 0, + "effective_few_shots": 25.0, + "num_truncated_few_shots": 0 + }, + "harness|hellaswag|10": { + "hashes": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "29906669b1c7054a", + "hash_cont_tokens": "b3b9e9017afa63af" + }, + "truncated": 0, + "non-truncated": 40168, + "padded": 40113, + "non-padded": 55, + "effective_few_shots": 10.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hashes": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "c54ff61ad0273dd7", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-anatomy|5": { + "hashes": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "be31a1e22aef5f90", + "hash_cont_tokens": "f11971a765cb609f" + }, + "truncated": 0, + "non-truncated": 540, + "padded": 540, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-astronomy|5": { + "hashes": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "277a7b1fad566940", + "hash_cont_tokens": "bf30e5d3f48250cb" + }, + "truncated": 0, + "non-truncated": 608, + "padded": 608, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-business_ethics|5": { + "hashes": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "ba552605bc116de5", + "hash_cont_tokens": "bc1dd9b2d995eb61" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hashes": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "428c7563d0b98ab9", + "hash_cont_tokens": "890a119624b3b935" + }, + "truncated": 0, + "non-truncated": 1060, + "padded": 1060, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_biology|5": { + "hashes": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "da036601573942e2", + "hash_cont_tokens": "875cde3af7a0ee14" + }, + "truncated": 0, + "non-truncated": 576, + "padded": 576, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_chemistry|5": { + "hashes": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": 
"242c9be6da583e95", + "hash_input_tokens": "94e0196d6aded13d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_computer_science|5": { + "hashes": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "6e4d0f4a8d36690b", + "hash_cont_tokens": "ffc0fe414cdc4a83" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_mathematics|5": { + "hashes": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "614054d17109a25d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_medicine|5": { + "hashes": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "1d633b3cc0524ba8", + "hash_cont_tokens": "1f88b00d41957d82" + }, + "truncated": 0, + "non-truncated": 692, + "padded": 692, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_physics|5": { + "hashes": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "5421d9a1af86cbd4", + "hash_cont_tokens": "f7b8097afc16a47c" + }, + "truncated": 0, + "non-truncated": 408, + "padded": 408, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-computer_security|5": { + "hashes": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "5e6b70ecb333cf18", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hashes": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "c2ef11a87264ceed", + "hash_cont_tokens": "aa0e8bc655f2f641" + }, + "truncated": 0, + "non-truncated": 940, + "padded": 940, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-econometrics|5": { + "hashes": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "ecaccd912a4c3978", + "hash_cont_tokens": "bfb7e3c3c88313f1" + }, + "truncated": 0, + "non-truncated": 456, + "padded": 456, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hashes": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "1590c84291399be8", + "hash_cont_tokens": "2425a3f084a591ef" + }, + "truncated": 0, + "non-truncated": 580, + "padded": 580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hashes": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "3269597f715b0da1", + "hash_cont_tokens": "f52691aef15a407b" + }, + "truncated": 0, + 
"non-truncated": 1512, + "padded": 1512, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-formal_logic|5": { + "hashes": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "a2800d20f3ab8d7c", + "hash_cont_tokens": "f515d598d9c21263" + }, + "truncated": 0, + "non-truncated": 504, + "padded": 504, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-global_facts|5": { + "hashes": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "94ed44b3772505ad", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_biology|5": { + "hashes": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "24423acb928db768", + "hash_cont_tokens": "bd85a4156a3613ee" + }, + "truncated": 0, + "non-truncated": 1240, + "padded": 1240, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hashes": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "831ff35c474e5cef", + "hash_cont_tokens": "a95c97af1c14e068" + }, + "truncated": 0, + "non-truncated": 812, + "padded": 812, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hashes": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "8c34e0f2bda77358", + "hash_cont_tokens": "8abfedef914e33c9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hashes": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "f1f73dd687da18d7", + "hash_cont_tokens": "674fc454bdc5ac93" + }, + "truncated": 660, + "non-truncated": 0, + "padded": 0, + "non-padded": 660, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_geography|5": { + "hashes": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "7c5547c7da5bc793", + "hash_cont_tokens": "03a5012b916274ea" + }, + "truncated": 0, + "non-truncated": 792, + "padded": 792, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hashes": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "f62991cb6a496b05", + "hash_cont_tokens": "a83effb8f76b7d7c" + }, + "truncated": 0, + "non-truncated": 772, + "padded": 772, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hashes": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "4cef2aff6e3d59ed", + "hash_cont_tokens": "c583432ad27fcfe0" + }, + "truncated": 0, + "non-truncated": 1560, + "padded": 1560, + "non-padded": 0, + "effective_few_shots": 5.0, + 
"num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hashes": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "6e2577ea4082ed2b", + "hash_cont_tokens": "24f5dc613660300b" + }, + "truncated": 0, + "non-truncated": 1080, + "padded": 1080, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hashes": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "c5fc9aeb1079c8e4", + "hash_cont_tokens": "f47f041de50333b9" + }, + "truncated": 0, + "non-truncated": 952, + "padded": 952, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_physics|5": { + "hashes": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "555fc385cffa84ca", + "hash_cont_tokens": "ba2efcd283e938cc" + }, + "truncated": 0, + "non-truncated": 604, + "padded": 604, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hashes": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "febd23cbf9973b7f", + "hash_cont_tokens": "942069cd363844d9" + }, + "truncated": 0, + "non-truncated": 2180, + "padded": 2180, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hashes": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "424b02981230ee83", + "hash_cont_tokens": "955ed42b6f7fa019" + }, + "truncated": 0, + "non-truncated": 864, + "padded": 864, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hashes": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "50c9ff438c85a69e", + "hash_cont_tokens": "cdd0b3dc06d933e5" + }, + "truncated": 816, + "non-truncated": 0, + "padded": 0, + "non-padded": 816, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hashes": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "054824cc474caef5", + "hash_cont_tokens": "9a864184946033ac" + }, + "truncated": 8, + "non-truncated": 940, + "padded": 940, + "non-padded": 8, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_aging|5": { + "hashes": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "541a75f071dcf579", + "hash_cont_tokens": "142a4a8a1138a214" + }, + "truncated": 0, + "non-truncated": 892, + "padded": 892, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_sexuality|5": { + "hashes": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "04269e5c5a257dd9", + "hash_cont_tokens": "bc54813e809b796d" + }, + "truncated": 0, + "non-truncated": 524, + "padded": 524, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-international_law|5": { + "hashes": { + "hash_examples": 
"1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "d93ba9d9d38e4397", + "hash_cont_tokens": "dc45b45fcda18e5d" + }, + "truncated": 0, + "non-truncated": 484, + "padded": 484, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-jurisprudence|5": { + "hashes": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "9eeaccd2698b4f5a", + "hash_cont_tokens": "e3a8cd951b6e3469" + }, + "truncated": 0, + "non-truncated": 432, + "padded": 432, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hashes": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "b4f08f544f2b7576", + "hash_cont_tokens": "1e80dbd30f6453d5" + }, + "truncated": 0, + "non-truncated": 652, + "padded": 648, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-machine_learning|5": { + "hashes": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "900c2a51f1174b9f", + "hash_cont_tokens": "9b37da7777378ca9" + }, + "truncated": 0, + "non-truncated": 448, + "padded": 448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-management|5": { + "hashes": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "6b36efb4689c6eca", + "hash_cont_tokens": "a01d6d39a83c4597" + }, + "truncated": 0, + "non-truncated": 412, + "padded": 412, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-marketing|5": { + "hashes": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "2aaac78a0cfed47a", + "hash_cont_tokens": "6aeaed4d823c98aa" + }, + "truncated": 0, + "non-truncated": 936, + "padded": 936, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-medical_genetics|5": { + "hashes": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "886ca823b41c094a", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-miscellaneous|5": { + "hashes": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "72fd71de7675e7d0", + "hash_cont_tokens": "9b0ab02a64603081" + }, + "truncated": 0, + "non-truncated": 3132, + "padded": 3132, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_disputes|5": { + "hashes": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "f3ca0dd8e7a1eb09", + "hash_cont_tokens": "8badf768f7b0467a" + }, + "truncated": 0, + "non-truncated": 1384, + "padded": 1354, + "non-padded": 30, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hashes": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "3e793631e951f23c", + "hash_cont_tokens": "32ae620376b2bbba" + }, + "truncated": 0, + 
"non-truncated": 3580, + "padded": 3580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-nutrition|5": { + "hashes": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "59753c2144ea93af", + "hash_cont_tokens": "3071def75bacc404" + }, + "truncated": 0, + "non-truncated": 1224, + "padded": 1224, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-philosophy|5": { + "hashes": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "bd8d3dbed15a8c34", + "hash_cont_tokens": "9f6ff69d23a48783" + }, + "truncated": 0, + "non-truncated": 1244, + "padded": 1244, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-prehistory|5": { + "hashes": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "3573cd87facbb7c5", + "hash_cont_tokens": "de469d2b981e32a3" + }, + "truncated": 0, + "non-truncated": 1296, + "padded": 1296, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_accounting|5": { + "hashes": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "17e721bc1a7cbb47", + "hash_cont_tokens": "c46f74d2dfc7b13b" + }, + "truncated": 0, + "non-truncated": 1128, + "padded": 1128, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_law|5": { + "hashes": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "9178e10bd0763ec4", + "hash_cont_tokens": "2e590029ef41fbcd" + }, + "truncated": 604, + "non-truncated": 5532, + "padded": 5524, + "non-padded": 612, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_medicine|5": { + "hashes": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "f5a22012a54f70ea", + "hash_cont_tokens": "fe35cfa9c6ca802e" + }, + "truncated": 0, + "non-truncated": 1088, + "padded": 1088, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_psychology|5": { + "hashes": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "0dfb73a8eb3f692c", + "hash_cont_tokens": "f020fbddf72c8652" + }, + "truncated": 0, + "non-truncated": 2448, + "padded": 2448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-public_relations|5": { + "hashes": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "1710c6ba4c9f3cbd", + "hash_cont_tokens": "568f585a259965c1" + }, + "truncated": 0, + "non-truncated": 440, + "padded": 440, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-security_studies|5": { + "hashes": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "d49711415961ced7", + "hash_cont_tokens": "cc6fd7cccd64cd5d" + }, + "truncated": 0, + "non-truncated": 980, + "padded": 980, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + 
"harness|hendrycksTest-sociology|5": { + "hashes": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "828999f7624cbe7e", + "hash_cont_tokens": "c3a3bdfd177eed5b" + }, + "truncated": 0, + "non-truncated": 804, + "padded": 804, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hashes": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "42054621e718dbee", + "hash_cont_tokens": "2568d0e8e36fa959" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-virology|5": { + "hashes": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "6c4f0aa4dc859c04", + "hash_cont_tokens": "926cf60b0891f374" + }, + "truncated": 0, + "non-truncated": 664, + "padded": 664, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-world_religions|5": { + "hashes": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "6c75d44e092ff24f", + "hash_cont_tokens": "c525a5de974c1ea3" + }, + "truncated": 0, + "non-truncated": 684, + "padded": 684, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|truthfulqa:mc|0": { + "hashes": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "2738d7ed7075faa7", + "hash_cont_tokens": "c014154380b74b9e" + }, + "truncated": 0, + "non-truncated": 9996, + "padded": 9996, + "non-padded": 0, + "effective_few_shots": 0.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "d84d18e9a963753d", + "hash_full_prompts": "12b540783521a8e6", + "hash_input_tokens": "6fecf578c508db6a", + "hash_cont_tokens": "fb1646e2bdd5fc38" + }, + "total_evaluation_time_secondes": "9800.63639140129", + "truncated": 2088, + "non-truncated": 108931, + "padded": 108834, + "non-padded": 2185, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/TheBloke/guanaco-33B-GPTQ/results_2023-11-08T02-14-39.195892.json b/eval-results/TheBloke/guanaco-33B-GPTQ/results_2023-11-08T02-14-39.195892.json new file mode 100644 index 0000000000000000000000000000000000000000..c53937f45b4ea755fcd4a13b02a3182053508312 --- /dev/null +++ b/eval-results/TheBloke/guanaco-33B-GPTQ/results_2023-11-08T02-14-39.195892.json @@ -0,0 +1,107 @@ +{ + "config_general": { + "lighteval_sha": "167773f1d5d1647c60dadc31c9e731ab7dbcbbad", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "", + "model_name": "TheBloke/guanaco-33B-GPTQ", + "model_sha": "a8b393c45d939a6441ef82608f2bc109c3e471ad", + "model_dtype": "torch.float16", + "model_size": "15.83 GB" + }, + "results": { + "harness|drop|3": { + "em": 0.0035654362416107383, + "em_stderr": 0.0006104082299890425, + "f1": 0.05981019295302028, + "f1_stderr": 0.0013198104887474009 + }, + "harness|gsm8k|5": { + "acc": 0.23805913570887036, + "acc_stderr": 0.011731278748420901 + }, + "harness|winogrande|5": { + "acc": 0.7884767166535123, + "acc_stderr": 0.011477747684223183 + }, + "all": { + "em": 0.0035654362416107383, + "em_stderr": 0.0006104082299890425, + "f1": 0.05981019295302028, + "f1_stderr": 
0.0013198104887474009, + "acc": 0.5132679261811913, + "acc_stderr": 0.011604513216322042 + } + }, + "versions": { + "all": 0, + "harness|drop|3": 1, + "harness|gsm8k|5": 0, + "harness|winogrande|5": 0 + }, + "config_tasks": { + "harness|drop": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|drop|3": { + "hashes": { + "hash_examples": "1d27416e8324e9a3", + "hash_full_prompts": "a5513ff9a741b385", + "hash_input_tokens": "61b608e0b5ceed76", + "hash_cont_tokens": "51ddc696c89d253a" + }, + "truncated": 1263, + "non_truncated": 8273, + "padded": 0, + "non_padded": 9536, + "effective_few_shots": 3.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "bda342e47b5099b2", + "hash_cont_tokens": "a758bd9e3cfa5aba" + }, + "truncated": 0, + "non_truncated": 1319, + "padded": 0, + "non_padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "c0bedf98cb040854", + "hash_cont_tokens": "f08975ad6f2d5864" + }, + "truncated": 0, + "non_truncated": 1267, + "padded": 2432, + "non_padded": 102, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "9b4d8993161e637d", + "hash_full_prompts": "08215e527b7e60a5", + "hash_input_tokens": "80afe720f936f8d2", + "hash_cont_tokens": "604fb677a44a1f55" + }, + "truncated": 1263, + "non_truncated": 10859, + "padded": 2432, + "non_padded": 10957, + "num_truncated_few_shots": 0, + "total_evaluation_time_secondes": 0 + } +} \ No newline at end of file diff --git a/eval-results/TheBloke/guanaco-65B-HF/results_2023-07-25T19-41-45.375855.json b/eval-results/TheBloke/guanaco-65B-HF/results_2023-07-25T19-41-45.375855.json new file mode 100644 index 0000000000000000000000000000000000000000..82d59d67ab91a84031848cb95666d748fb0a22fd --- /dev/null +++ b/eval-results/TheBloke/guanaco-65B-HF/results_2023-07-25T19-41-45.375855.json @@ -0,0 +1,1365 @@ +{ + "results": { + "harness|arc:challenge|25": { + "acc": 0.6186006825938567, + "acc_stderr": 0.014194389086685253, + "acc_norm": 0.6544368600682594, + "acc_norm_stderr": 0.013896938461145678 + }, + "harness|hellaswag|10": { + "acc": 0.6684923322047401, + "acc_stderr": 0.0046979297746702975, + "acc_norm": 0.8646683927504482, + "acc_norm_stderr": 0.0034137831331580697 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.31, + "acc_stderr": 0.04648231987117316, + "acc_norm": 0.31, + "acc_norm_stderr": 0.04648231987117316 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.5555555555555556, + "acc_stderr": 0.04292596718256981, + "acc_norm": 0.5555555555555556, + "acc_norm_stderr": 0.04292596718256981 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.7631578947368421, + "acc_stderr": 0.03459777606810536, + "acc_norm": 0.7631578947368421, + "acc_norm_stderr": 0.03459777606810536 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.61, + "acc_stderr": 0.04902071300001974, + "acc_norm": 0.61, + "acc_norm_stderr": 0.04902071300001974 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.660377358490566, + "acc_stderr": 0.029146904747798325, + "acc_norm": 0.660377358490566, + "acc_norm_stderr": 0.029146904747798325 + }, + 
"harness|hendrycksTest-college_biology|5": { + "acc": 0.7013888888888888, + "acc_stderr": 0.03827052357950756, + "acc_norm": 0.7013888888888888, + "acc_norm_stderr": 0.03827052357950756 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.48, + "acc_stderr": 0.050211673156867795, + "acc_norm": 0.48, + "acc_norm_stderr": 0.050211673156867795 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.51, + "acc_stderr": 0.05024183937956912, + "acc_norm": 0.51, + "acc_norm_stderr": 0.05024183937956912 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.35, + "acc_stderr": 0.0479372485441102, + "acc_norm": 0.35, + "acc_norm_stderr": 0.0479372485441102 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.5491329479768786, + "acc_stderr": 0.0379401267469703, + "acc_norm": 0.5491329479768786, + "acc_norm_stderr": 0.0379401267469703 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.4215686274509804, + "acc_stderr": 0.049135952012744975, + "acc_norm": 0.4215686274509804, + "acc_norm_stderr": 0.049135952012744975 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.72, + "acc_stderr": 0.04512608598542128, + "acc_norm": 0.72, + "acc_norm_stderr": 0.04512608598542128 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.6170212765957447, + "acc_stderr": 0.03177821250236922, + "acc_norm": 0.6170212765957447, + "acc_norm_stderr": 0.03177821250236922 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.35964912280701755, + "acc_stderr": 0.045144961328736334, + "acc_norm": 0.35964912280701755, + "acc_norm_stderr": 0.045144961328736334 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.5310344827586206, + "acc_stderr": 0.04158632762097828, + "acc_norm": 0.5310344827586206, + "acc_norm_stderr": 0.04158632762097828 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.3835978835978836, + "acc_stderr": 0.025043757318520203, + "acc_norm": 0.3835978835978836, + "acc_norm_stderr": 0.025043757318520203 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.4126984126984127, + "acc_stderr": 0.04403438954768176, + "acc_norm": 0.4126984126984127, + "acc_norm_stderr": 0.04403438954768176 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.39, + "acc_stderr": 0.04902071300001974, + "acc_norm": 0.39, + "acc_norm_stderr": 0.04902071300001974 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.7387096774193549, + "acc_stderr": 0.024993053397764812, + "acc_norm": 0.7387096774193549, + "acc_norm_stderr": 0.024993053397764812 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.43842364532019706, + "acc_stderr": 0.03491207857486518, + "acc_norm": 0.43842364532019706, + "acc_norm_stderr": 0.03491207857486518 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.63, + "acc_stderr": 0.048523658709391, + "acc_norm": 0.63, + "acc_norm_stderr": 0.048523658709391 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.7696969696969697, + "acc_stderr": 0.032876667586034906, + "acc_norm": 0.7696969696969697, + "acc_norm_stderr": 0.032876667586034906 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.8282828282828283, + "acc_stderr": 0.026869716187429903, + "acc_norm": 0.8282828282828283, + "acc_norm_stderr": 0.026869716187429903 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.8704663212435233, + "acc_stderr": 0.024233532297758723, + "acc_norm": 0.8704663212435233, + 
"acc_norm_stderr": 0.024233532297758723 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.6615384615384615, + "acc_stderr": 0.023991500500313033, + "acc_norm": 0.6615384615384615, + "acc_norm_stderr": 0.023991500500313033 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.34814814814814815, + "acc_stderr": 0.029045600290616258, + "acc_norm": 0.34814814814814815, + "acc_norm_stderr": 0.029045600290616258 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.6932773109243697, + "acc_stderr": 0.029953823891887055, + "acc_norm": 0.6932773109243697, + "acc_norm_stderr": 0.029953823891887055 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.41721854304635764, + "acc_stderr": 0.040261414976346104, + "acc_norm": 0.41721854304635764, + "acc_norm_stderr": 0.040261414976346104 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.818348623853211, + "acc_stderr": 0.016530617409266857, + "acc_norm": 0.818348623853211, + "acc_norm_stderr": 0.016530617409266857 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.5925925925925926, + "acc_stderr": 0.03350991604696044, + "acc_norm": 0.5925925925925926, + "acc_norm_stderr": 0.03350991604696044 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.8382352941176471, + "acc_stderr": 0.02584501798692692, + "acc_norm": 0.8382352941176471, + "acc_norm_stderr": 0.02584501798692692 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.8227848101265823, + "acc_stderr": 0.024856364184503224, + "acc_norm": 0.8227848101265823, + "acc_norm_stderr": 0.024856364184503224 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.6681614349775785, + "acc_stderr": 0.031602951437766785, + "acc_norm": 0.6681614349775785, + "acc_norm_stderr": 0.031602951437766785 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.7251908396946565, + "acc_stderr": 0.03915345408847836, + "acc_norm": 0.7251908396946565, + "acc_norm_stderr": 0.03915345408847836 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.768595041322314, + "acc_stderr": 0.03849856098794088, + "acc_norm": 0.768595041322314, + "acc_norm_stderr": 0.03849856098794088 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.7222222222222222, + "acc_stderr": 0.04330043749650741, + "acc_norm": 0.7222222222222222, + "acc_norm_stderr": 0.04330043749650741 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.7607361963190185, + "acc_stderr": 0.0335195387952127, + "acc_norm": 0.7607361963190185, + "acc_norm_stderr": 0.0335195387952127 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.45535714285714285, + "acc_stderr": 0.047268355537191, + "acc_norm": 0.45535714285714285, + "acc_norm_stderr": 0.047268355537191 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.8058252427184466, + "acc_stderr": 0.03916667762822584, + "acc_norm": 0.8058252427184466, + "acc_norm_stderr": 0.03916667762822584 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.8589743589743589, + "acc_stderr": 0.02280138253459753, + "acc_norm": 0.8589743589743589, + "acc_norm_stderr": 0.02280138253459753 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.61, + "acc_stderr": 0.04902071300001975, + "acc_norm": 0.61, + "acc_norm_stderr": 0.04902071300001975 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.8045977011494253, + "acc_stderr": 0.014179171373424384, + "acc_norm": 0.8045977011494253, + "acc_norm_stderr": 0.014179171373424384 + }, 
+ "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.7341040462427746, + "acc_stderr": 0.023786203255508297, + "acc_norm": 0.7341040462427746, + "acc_norm_stderr": 0.023786203255508297 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.3743016759776536, + "acc_stderr": 0.016185444179457175, + "acc_norm": 0.3743016759776536, + "acc_norm_stderr": 0.016185444179457175 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.6568627450980392, + "acc_stderr": 0.027184498909941613, + "acc_norm": 0.6568627450980392, + "acc_norm_stderr": 0.027184498909941613 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.7266881028938906, + "acc_stderr": 0.025311765975426122, + "acc_norm": 0.7266881028938906, + "acc_norm_stderr": 0.025311765975426122 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.7345679012345679, + "acc_stderr": 0.02456922360046085, + "acc_norm": 0.7345679012345679, + "acc_norm_stderr": 0.02456922360046085 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.5141843971631206, + "acc_stderr": 0.02981549448368206, + "acc_norm": 0.5141843971631206, + "acc_norm_stderr": 0.02981549448368206 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.4771838331160365, + "acc_stderr": 0.012756933382823696, + "acc_norm": 0.4771838331160365, + "acc_norm_stderr": 0.012756933382823696 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.6360294117647058, + "acc_stderr": 0.02922719246003203, + "acc_norm": 0.6360294117647058, + "acc_norm_stderr": 0.02922719246003203 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.6421568627450981, + "acc_stderr": 0.019393058402355435, + "acc_norm": 0.6421568627450981, + "acc_norm_stderr": 0.019393058402355435 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.7181818181818181, + "acc_stderr": 0.043091187099464585, + "acc_norm": 0.7181818181818181, + "acc_norm_stderr": 0.043091187099464585 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.689795918367347, + "acc_stderr": 0.02961345987248438, + "acc_norm": 0.689795918367347, + "acc_norm_stderr": 0.02961345987248438 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.8059701492537313, + "acc_stderr": 0.027962677604768914, + "acc_norm": 0.8059701492537313, + "acc_norm_stderr": 0.027962677604768914 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.85, + "acc_stderr": 0.0358870281282637, + "acc_norm": 0.85, + "acc_norm_stderr": 0.0358870281282637 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.5180722891566265, + "acc_stderr": 0.03889951252827216, + "acc_norm": 0.5180722891566265, + "acc_norm_stderr": 0.03889951252827216 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.8128654970760234, + "acc_stderr": 0.02991312723236804, + "acc_norm": 0.8128654970760234, + "acc_norm_stderr": 0.02991312723236804 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.36474908200734396, + "mc1_stderr": 0.016850961061720116, + "mc2": 0.5281436462656385, + "mc2_stderr": 0.014720364426283132 + }, + "all": { + "acc": 0.6296715851046241, + "acc_stderr": 0.033305912245580625, + "acc_norm": 0.6336039959185242, + "acc_norm_stderr": 0.03327910551224076, + "mc1": 0.36474908200734396, + "mc1_stderr": 0.016850961061720116, + "mc2": 0.5281436462656385, + "mc2_stderr": 0.014720364426283132 + } + }, + "versions": { + "harness|arc:challenge|25": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + 
"harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "all": 0 + }, + "config_general": { + "model_name": "TheBloke/guanaco-65B-HF", + "model_sha": "7f83ae526f8b83705ca8434535da8fd8c692f9d0", + "model_dtype": "torch.float16", + "lighteval_sha": "03c2fad20ff7f5334c33cfee459024b8d7e4a109", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null + }, + "config_tasks": { + "harness|arc:challenge": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + 
"harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task" + }, + "summary_tasks": { + "harness|arc:challenge|25": { + "hashes": { + "hash_examples": "17b0cae357c0259e", + "hash_full_prompts": 
"045cbb916e5145c6", + "hash_input_tokens": "61571bf68d6d89aa", + "hash_cont_tokens": "ede2b335438f08e9" + }, + "truncated": 0, + "non-truncated": 4687, + "padded": 4687, + "non-padded": 0, + "effective_few_shots": 25.0, + "num_truncated_few_shots": 0 + }, + "harness|hellaswag|10": { + "hashes": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "29906669b1c7054a", + "hash_cont_tokens": "b41cf1ad182d68d5" + }, + "truncated": 0, + "non-truncated": 40168, + "padded": 40113, + "non-padded": 55, + "effective_few_shots": 10.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hashes": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "c54ff61ad0273dd7", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-anatomy|5": { + "hashes": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "be31a1e22aef5f90", + "hash_cont_tokens": "f11971a765cb609f" + }, + "truncated": 0, + "non-truncated": 540, + "padded": 540, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-astronomy|5": { + "hashes": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "277a7b1fad566940", + "hash_cont_tokens": "238bd86950544b29" + }, + "truncated": 0, + "non-truncated": 608, + "padded": 608, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-business_ethics|5": { + "hashes": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "ba552605bc116de5", + "hash_cont_tokens": "f9d6d2a7d7e9a041" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hashes": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "428c7563d0b98ab9", + "hash_cont_tokens": "6af58623d0d5fbcd" + }, + "truncated": 0, + "non-truncated": 1060, + "padded": 1060, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_biology|5": { + "hashes": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "da036601573942e2", + "hash_cont_tokens": "875cde3af7a0ee14" + }, + "truncated": 0, + "non-truncated": 576, + "padded": 576, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_chemistry|5": { + "hashes": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "94e0196d6aded13d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_computer_science|5": { + "hashes": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "6e4d0f4a8d36690b", + "hash_cont_tokens": "1ba0c71186b1505e" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + 
"non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_mathematics|5": { + "hashes": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "614054d17109a25d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_medicine|5": { + "hashes": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "1d633b3cc0524ba8", + "hash_cont_tokens": "702fb6d82ff0d6ac" + }, + "truncated": 0, + "non-truncated": 692, + "padded": 692, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_physics|5": { + "hashes": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "5421d9a1af86cbd4", + "hash_cont_tokens": "f7b8097afc16a47c" + }, + "truncated": 0, + "non-truncated": 408, + "padded": 408, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-computer_security|5": { + "hashes": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "5e6b70ecb333cf18", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hashes": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "c2ef11a87264ceed", + "hash_cont_tokens": "aa0e8bc655f2f641" + }, + "truncated": 0, + "non-truncated": 940, + "padded": 940, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-econometrics|5": { + "hashes": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "ecaccd912a4c3978", + "hash_cont_tokens": "a9b1f761089f6acc" + }, + "truncated": 0, + "non-truncated": 456, + "padded": 456, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hashes": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "1590c84291399be8", + "hash_cont_tokens": "2425a3f084a591ef" + }, + "truncated": 0, + "non-truncated": 580, + "padded": 580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hashes": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "3269597f715b0da1", + "hash_cont_tokens": "eb2d5002052b5bc5" + }, + "truncated": 0, + "non-truncated": 1512, + "padded": 1512, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-formal_logic|5": { + "hashes": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "a2800d20f3ab8d7c", + "hash_cont_tokens": "9b30dc19c9b62f60" + }, + "truncated": 0, + "non-truncated": 504, + "padded": 504, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-global_facts|5": { + "hashes": { + 
"hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "94ed44b3772505ad", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_biology|5": { + "hashes": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "24423acb928db768", + "hash_cont_tokens": "74217a4e2868536f" + }, + "truncated": 0, + "non-truncated": 1240, + "padded": 1240, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hashes": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "831ff35c474e5cef", + "hash_cont_tokens": "bf39544be0ebf000" + }, + "truncated": 0, + "non-truncated": 812, + "padded": 812, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hashes": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "8c34e0f2bda77358", + "hash_cont_tokens": "43570b3948564b64" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hashes": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "f1f73dd687da18d7", + "hash_cont_tokens": "674fc454bdc5ac93" + }, + "truncated": 660, + "non-truncated": 0, + "padded": 0, + "non-padded": 660, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_geography|5": { + "hashes": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "7c5547c7da5bc793", + "hash_cont_tokens": "03a5012b916274ea" + }, + "truncated": 0, + "non-truncated": 792, + "padded": 792, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hashes": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "f62991cb6a496b05", + "hash_cont_tokens": "50ab225c2f535210" + }, + "truncated": 0, + "non-truncated": 772, + "padded": 772, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hashes": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "4cef2aff6e3d59ed", + "hash_cont_tokens": "c583432ad27fcfe0" + }, + "truncated": 0, + "non-truncated": 1560, + "padded": 1560, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hashes": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "6e2577ea4082ed2b", + "hash_cont_tokens": "1194078d4e38c984" + }, + "truncated": 0, + "non-truncated": 1080, + "padded": 1080, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hashes": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": 
"bfc393381298609e", + "hash_input_tokens": "c5fc9aeb1079c8e4", + "hash_cont_tokens": "f47f041de50333b9" + }, + "truncated": 0, + "non-truncated": 952, + "padded": 952, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_physics|5": { + "hashes": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "555fc385cffa84ca", + "hash_cont_tokens": "6296151cf7fee15c" + }, + "truncated": 0, + "non-truncated": 604, + "padded": 604, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hashes": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "febd23cbf9973b7f", + "hash_cont_tokens": "a490d3db0ea5935a" + }, + "truncated": 0, + "non-truncated": 2180, + "padded": 2180, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hashes": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "424b02981230ee83", + "hash_cont_tokens": "6830ef7d0325d7ef" + }, + "truncated": 0, + "non-truncated": 864, + "padded": 864, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hashes": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "50c9ff438c85a69e", + "hash_cont_tokens": "cdd0b3dc06d933e5" + }, + "truncated": 816, + "non-truncated": 0, + "padded": 0, + "non-padded": 816, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hashes": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "054824cc474caef5", + "hash_cont_tokens": "e0203e3fc1bb0500" + }, + "truncated": 8, + "non-truncated": 940, + "padded": 940, + "non-padded": 8, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_aging|5": { + "hashes": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "541a75f071dcf579", + "hash_cont_tokens": "142a4a8a1138a214" + }, + "truncated": 0, + "non-truncated": 892, + "padded": 892, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_sexuality|5": { + "hashes": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "04269e5c5a257dd9", + "hash_cont_tokens": "bc54813e809b796d" + }, + "truncated": 0, + "non-truncated": 524, + "padded": 524, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-international_law|5": { + "hashes": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "d93ba9d9d38e4397", + "hash_cont_tokens": "63435df622d5437b" + }, + "truncated": 0, + "non-truncated": 484, + "padded": 484, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-jurisprudence|5": { + "hashes": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "9eeaccd2698b4f5a", + "hash_cont_tokens": "e3a8cd951b6e3469" + }, + "truncated": 0, + 
"non-truncated": 432, + "padded": 432, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hashes": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "b4f08f544f2b7576", + "hash_cont_tokens": "5e6ee2ff0404f23c" + }, + "truncated": 0, + "non-truncated": 652, + "padded": 648, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-machine_learning|5": { + "hashes": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "900c2a51f1174b9f", + "hash_cont_tokens": "c81919424db3b267" + }, + "truncated": 0, + "non-truncated": 448, + "padded": 448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-management|5": { + "hashes": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "6b36efb4689c6eca", + "hash_cont_tokens": "a01d6d39a83c4597" + }, + "truncated": 0, + "non-truncated": 412, + "padded": 412, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-marketing|5": { + "hashes": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "2aaac78a0cfed47a", + "hash_cont_tokens": "6aeaed4d823c98aa" + }, + "truncated": 0, + "non-truncated": 936, + "padded": 936, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-medical_genetics|5": { + "hashes": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "886ca823b41c094a", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-miscellaneous|5": { + "hashes": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "72fd71de7675e7d0", + "hash_cont_tokens": "9b0ab02a64603081" + }, + "truncated": 0, + "non-truncated": 3132, + "padded": 3132, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_disputes|5": { + "hashes": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "f3ca0dd8e7a1eb09", + "hash_cont_tokens": "3b8bbe9108e55ce9" + }, + "truncated": 0, + "non-truncated": 1384, + "padded": 1354, + "non-padded": 30, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hashes": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "3e793631e951f23c", + "hash_cont_tokens": "2eae753a177d5460" + }, + "truncated": 0, + "non-truncated": 3580, + "padded": 3580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-nutrition|5": { + "hashes": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "59753c2144ea93af", + "hash_cont_tokens": "29771089bd3c65c6" + }, + "truncated": 0, + "non-truncated": 1224, + "padded": 1224, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-philosophy|5": { + "hashes": 
{ + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "bd8d3dbed15a8c34", + "hash_cont_tokens": "9f6ff69d23a48783" + }, + "truncated": 0, + "non-truncated": 1244, + "padded": 1244, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-prehistory|5": { + "hashes": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "3573cd87facbb7c5", + "hash_cont_tokens": "a789a13af22308bf" + }, + "truncated": 0, + "non-truncated": 1296, + "padded": 1296, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_accounting|5": { + "hashes": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "17e721bc1a7cbb47", + "hash_cont_tokens": "5129a9cfb30c5239" + }, + "truncated": 0, + "non-truncated": 1128, + "padded": 1128, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_law|5": { + "hashes": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "9178e10bd0763ec4", + "hash_cont_tokens": "2e590029ef41fbcd" + }, + "truncated": 604, + "non-truncated": 5532, + "padded": 5524, + "non-padded": 612, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_medicine|5": { + "hashes": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "f5a22012a54f70ea", + "hash_cont_tokens": "cd82e108370cece8" + }, + "truncated": 0, + "non-truncated": 1088, + "padded": 1088, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_psychology|5": { + "hashes": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "0dfb73a8eb3f692c", + "hash_cont_tokens": "61ef0c8a87f9c92d" + }, + "truncated": 0, + "non-truncated": 2448, + "padded": 2448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-public_relations|5": { + "hashes": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "1710c6ba4c9f3cbd", + "hash_cont_tokens": "568f585a259965c1" + }, + "truncated": 0, + "non-truncated": 440, + "padded": 440, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-security_studies|5": { + "hashes": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "d49711415961ced7", + "hash_cont_tokens": "d70cfe096d4fb7bd" + }, + "truncated": 0, + "non-truncated": 980, + "padded": 980, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-sociology|5": { + "hashes": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "828999f7624cbe7e", + "hash_cont_tokens": "c3a3bdfd177eed5b" + }, + "truncated": 0, + "non-truncated": 804, + "padded": 804, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hashes": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "42054621e718dbee", + 
"hash_cont_tokens": "2568d0e8e36fa959" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-virology|5": { + "hashes": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "6c4f0aa4dc859c04", + "hash_cont_tokens": "c178cccd753d9bc5" + }, + "truncated": 0, + "non-truncated": 664, + "padded": 664, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-world_religions|5": { + "hashes": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "6c75d44e092ff24f", + "hash_cont_tokens": "0a3a3ea5ef49d19c" + }, + "truncated": 0, + "non-truncated": 684, + "padded": 684, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|truthfulqa:mc|0": { + "hashes": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "2738d7ed7075faa7", + "hash_cont_tokens": "6d1691881e252df0" + }, + "truncated": 0, + "non-truncated": 9996, + "padded": 9996, + "non-padded": 0, + "effective_few_shots": 0.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "d84d18e9a963753d", + "hash_full_prompts": "12b540783521a8e6", + "hash_input_tokens": "6fecf578c508db6a", + "hash_cont_tokens": "f4b7b7f3a2788768" + }, + "total_evaluation_time_secondes": "25564.56579065323", + "truncated": 2088, + "non-truncated": 108931, + "padded": 108834, + "non-padded": 2185, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/TheBloke/guanaco-65B-HF/results_2023-10-23T03-09-40.214751.json b/eval-results/TheBloke/guanaco-65B-HF/results_2023-10-23T03-09-40.214751.json new file mode 100644 index 0000000000000000000000000000000000000000..eb541c911f85957843579b6a578adbe201275706 --- /dev/null +++ b/eval-results/TheBloke/guanaco-65B-HF/results_2023-10-23T03-09-40.214751.json @@ -0,0 +1,107 @@ +{ + "config_general": { + "model_name": "TheBloke/guanaco-65B-HF", + "model_sha": "7f83ae526f8b83705ca8434535da8fd8c692f9d0", + "model_size": "121.68 GB", + "model_dtype": "torch.float16", + "lighteval_sha": "0f318ecf002208468154899217b3ba7c6ae09374", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|drop|3": { + "em": 0.0019924496644295304, + "em_stderr": 0.0004566676462666983, + "f1": 0.06694840604026871, + "f1_stderr": 0.0014210409267209844 + }, + "harness|gsm8k|5": { + "acc": 0.26004548900682334, + "acc_stderr": 0.012082852340334089 + }, + "harness|winogrande|5": { + "acc": 0.823993685872139, + "acc_stderr": 0.010703090882320705 + }, + "all": { + "em": 0.0019924496644295304, + "em_stderr": 0.0004566676462666983, + "f1": 0.06694840604026871, + "f1_stderr": 0.0014210409267209844, + "acc": 0.5420195874394811, + "acc_stderr": 0.011392971611327397 + } + }, + "versions": { + "harness|drop|3": 1, + "harness|gsm8k|5": 0, + "harness|winogrande|5": 0, + "all": 0 + }, + "config_tasks": { + "harness|drop": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|drop|3": { + "hashes": { + "hash_examples": "1d27416e8324e9a3", + "hash_full_prompts": "a5513ff9a741b385", + "hash_input_tokens": "61b608e0b5ceed76", + "hash_cont_tokens": 
"5c8e1776dd4c0050" + }, + "truncated": 1263, + "non-truncated": 8273, + "padded": 0, + "non-padded": 9536, + "effective_few_shots": 3.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "bda342e47b5099b2", + "hash_cont_tokens": "6b8efc645001f9a6" + }, + "truncated": 0, + "non-truncated": 1319, + "padded": 0, + "non-padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "c0bedf98cb040854", + "hash_cont_tokens": "f08975ad6f2d5864" + }, + "truncated": 0, + "non-truncated": 2534, + "padded": 2432, + "non-padded": 102, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "9b4d8993161e637d", + "hash_full_prompts": "08215e527b7e60a5", + "hash_input_tokens": "80afe720f936f8d2", + "hash_cont_tokens": "d7b61955a5a521b4" + }, + "total_evaluation_time_secondes": "44478.562024116516", + "truncated": 1263, + "non-truncated": 12126, + "padded": 2432, + "non-padded": 10957, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/TheBloke/guanaco-7B-HF/results_2023-07-19T16-53-22.829156.json b/eval-results/TheBloke/guanaco-7B-HF/results_2023-07-19T16-53-22.829156.json new file mode 100644 index 0000000000000000000000000000000000000000..a838f628202eaf91b37cf1556a74e5bea078436a --- /dev/null +++ b/eval-results/TheBloke/guanaco-7B-HF/results_2023-07-19T16-53-22.829156.json @@ -0,0 +1,871 @@ +{ + "results": { + "harness|arc:challenge|25": { + "acc": 0.492320819112628, + "acc_stderr": 0.01460966744089257, + "acc_norm": 0.5298634812286689, + "acc_norm_stderr": 0.014585305840007107 + }, + "harness|hellaswag|10": { + "acc": 0.6026687910774746, + "acc_stderr": 0.004883455188908965, + "acc_norm": 0.8005377414857598, + "acc_norm_stderr": 0.0039877946685300605 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.29, + "acc_stderr": 0.04560480215720683, + "acc_norm": 0.29, + "acc_norm_stderr": 0.04560480215720683 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.3925925925925926, + "acc_stderr": 0.04218506215368879, + "acc_norm": 0.3925925925925926, + "acc_norm_stderr": 0.04218506215368879 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.3157894736842105, + "acc_stderr": 0.037827289808654685, + "acc_norm": 0.3157894736842105, + "acc_norm_stderr": 0.037827289808654685 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.29, + "acc_stderr": 0.04560480215720684, + "acc_norm": 0.29, + "acc_norm_stderr": 0.04560480215720684 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.43018867924528303, + "acc_stderr": 0.030471445867183238, + "acc_norm": 0.43018867924528303, + "acc_norm_stderr": 0.030471445867183238 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.3680555555555556, + "acc_stderr": 0.04032999053960718, + "acc_norm": 0.3680555555555556, + "acc_norm_stderr": 0.04032999053960718 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.3, + "acc_stderr": 0.046056618647183814, + "acc_norm": 0.3, + "acc_norm_stderr": 0.046056618647183814 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.28, + "acc_stderr": 0.04512608598542127, + "acc_norm": 0.28, + "acc_norm_stderr": 0.04512608598542127 + }, + 
"harness|hendrycksTest-college_mathematics|5": { + "acc": 0.31, + "acc_stderr": 0.04648231987117316, + "acc_norm": 0.31, + "acc_norm_stderr": 0.04648231987117316 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.27167630057803466, + "acc_stderr": 0.03391750322321659, + "acc_norm": 0.27167630057803466, + "acc_norm_stderr": 0.03391750322321659 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.21568627450980393, + "acc_stderr": 0.040925639582376556, + "acc_norm": 0.21568627450980393, + "acc_norm_stderr": 0.040925639582376556 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.41, + "acc_stderr": 0.04943110704237101, + "acc_norm": 0.41, + "acc_norm_stderr": 0.04943110704237101 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.3659574468085106, + "acc_stderr": 0.0314895582974553, + "acc_norm": 0.3659574468085106, + "acc_norm_stderr": 0.0314895582974553 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.2719298245614035, + "acc_stderr": 0.04185774424022056, + "acc_norm": 0.2719298245614035, + "acc_norm_stderr": 0.04185774424022056 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.3103448275862069, + "acc_stderr": 0.03855289616378949, + "acc_norm": 0.3103448275862069, + "acc_norm_stderr": 0.03855289616378949 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.2698412698412698, + "acc_stderr": 0.02286083830923207, + "acc_norm": 0.2698412698412698, + "acc_norm_stderr": 0.02286083830923207 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.19047619047619047, + "acc_stderr": 0.035122074123020514, + "acc_norm": 0.19047619047619047, + "acc_norm_stderr": 0.035122074123020514 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.36, + "acc_stderr": 0.04824181513244218, + "acc_norm": 0.36, + "acc_norm_stderr": 0.04824181513244218 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.35161290322580646, + "acc_stderr": 0.027162537826948458, + "acc_norm": 0.35161290322580646, + "acc_norm_stderr": 0.027162537826948458 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.3399014778325123, + "acc_stderr": 0.033327690684107895, + "acc_norm": 0.3399014778325123, + "acc_norm_stderr": 0.033327690684107895 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.33, + "acc_stderr": 0.04725815626252605, + "acc_norm": 0.33, + "acc_norm_stderr": 0.04725815626252605 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.43636363636363634, + "acc_stderr": 0.03872592983524754, + "acc_norm": 0.43636363636363634, + "acc_norm_stderr": 0.03872592983524754 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.3838383838383838, + "acc_stderr": 0.03464881675016338, + "acc_norm": 0.3838383838383838, + "acc_norm_stderr": 0.03464881675016338 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.44041450777202074, + "acc_stderr": 0.03582724530036095, + "acc_norm": 0.44041450777202074, + "acc_norm_stderr": 0.03582724530036095 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.36666666666666664, + "acc_stderr": 0.024433016466052455, + "acc_norm": 0.36666666666666664, + "acc_norm_stderr": 0.024433016466052455 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.23703703703703705, + "acc_stderr": 0.02592887613276611, + "acc_norm": 0.23703703703703705, + "acc_norm_stderr": 0.02592887613276611 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 
0.3235294117647059, + "acc_stderr": 0.030388353551886838, + "acc_norm": 0.3235294117647059, + "acc_norm_stderr": 0.030388353551886838 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.2582781456953642, + "acc_stderr": 0.035737053147634576, + "acc_norm": 0.2582781456953642, + "acc_norm_stderr": 0.035737053147634576 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.46238532110091746, + "acc_stderr": 0.021376575274397576, + "acc_norm": 0.46238532110091746, + "acc_norm_stderr": 0.021376575274397576 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.23148148148148148, + "acc_stderr": 0.028765111718046955, + "acc_norm": 0.23148148148148148, + "acc_norm_stderr": 0.028765111718046955 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.36764705882352944, + "acc_stderr": 0.03384132045674118, + "acc_norm": 0.36764705882352944, + "acc_norm_stderr": 0.03384132045674118 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.37130801687763715, + "acc_stderr": 0.03145068600744859, + "acc_norm": 0.37130801687763715, + "acc_norm_stderr": 0.03145068600744859 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.4439461883408072, + "acc_stderr": 0.03334625674242728, + "acc_norm": 0.4439461883408072, + "acc_norm_stderr": 0.03334625674242728 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.3435114503816794, + "acc_stderr": 0.041649760719448786, + "acc_norm": 0.3435114503816794, + "acc_norm_stderr": 0.041649760719448786 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.5371900826446281, + "acc_stderr": 0.04551711196104218, + "acc_norm": 0.5371900826446281, + "acc_norm_stderr": 0.04551711196104218 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.39814814814814814, + "acc_stderr": 0.04732332615978815, + "acc_norm": 0.39814814814814814, + "acc_norm_stderr": 0.04732332615978815 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.3803680981595092, + "acc_stderr": 0.03814269893261837, + "acc_norm": 0.3803680981595092, + "acc_norm_stderr": 0.03814269893261837 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.23214285714285715, + "acc_stderr": 0.04007341809755803, + "acc_norm": 0.23214285714285715, + "acc_norm_stderr": 0.04007341809755803 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.32038834951456313, + "acc_stderr": 0.04620284082280039, + "acc_norm": 0.32038834951456313, + "acc_norm_stderr": 0.04620284082280039 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.49572649572649574, + "acc_stderr": 0.03275489264382132, + "acc_norm": 0.49572649572649574, + "acc_norm_stderr": 0.03275489264382132 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.37, + "acc_stderr": 0.048523658709391, + "acc_norm": 0.37, + "acc_norm_stderr": 0.048523658709391 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.44189016602809705, + "acc_stderr": 0.01775880053421441, + "acc_norm": 0.44189016602809705, + "acc_norm_stderr": 0.01775880053421441 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.37572254335260113, + "acc_stderr": 0.026074314851657083, + "acc_norm": 0.37572254335260113, + "acc_norm_stderr": 0.026074314851657083 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.2424581005586592, + "acc_stderr": 0.014333522059217889, + "acc_norm": 0.2424581005586592, + "acc_norm_stderr": 0.014333522059217889 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.38562091503267976, + "acc_stderr": 0.027870745278290324, + 
"acc_norm": 0.38562091503267976, + "acc_norm_stderr": 0.027870745278290324 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.3762057877813505, + "acc_stderr": 0.02751392568354943, + "acc_norm": 0.3762057877813505, + "acc_norm_stderr": 0.02751392568354943 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.38271604938271603, + "acc_stderr": 0.027044538138402612, + "acc_norm": 0.38271604938271603, + "acc_norm_stderr": 0.027044538138402612 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.2730496453900709, + "acc_stderr": 0.026577860943307857, + "acc_norm": 0.2730496453900709, + "acc_norm_stderr": 0.026577860943307857 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.29465449804432853, + "acc_stderr": 0.011643576764069546, + "acc_norm": 0.29465449804432853, + "acc_norm_stderr": 0.011643576764069546 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.40808823529411764, + "acc_stderr": 0.029855261393483927, + "acc_norm": 0.40808823529411764, + "acc_norm_stderr": 0.029855261393483927 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.3545751633986928, + "acc_stderr": 0.019353360547553704, + "acc_norm": 0.3545751633986928, + "acc_norm_stderr": 0.019353360547553704 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.45454545454545453, + "acc_stderr": 0.04769300568972743, + "acc_norm": 0.45454545454545453, + "acc_norm_stderr": 0.04769300568972743 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.3183673469387755, + "acc_stderr": 0.02982253379398207, + "acc_norm": 0.3183673469387755, + "acc_norm_stderr": 0.02982253379398207 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.4626865671641791, + "acc_stderr": 0.035256751674679745, + "acc_norm": 0.4626865671641791, + "acc_norm_stderr": 0.035256751674679745 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.43, + "acc_stderr": 0.049756985195624284, + "acc_norm": 0.43, + "acc_norm_stderr": 0.049756985195624284 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.3855421686746988, + "acc_stderr": 0.03789134424611548, + "acc_norm": 0.3855421686746988, + "acc_norm_stderr": 0.03789134424611548 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.47953216374269003, + "acc_stderr": 0.0383161053282193, + "acc_norm": 0.47953216374269003, + "acc_norm_stderr": 0.0383161053282193 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.2460220318237454, + "mc1_stderr": 0.015077219200662578, + "mc2": 0.39198062564912123, + "mc2_stderr": 0.014535520033672577 + }, + "all": { + "acc": 0.35974692490672305, + "acc_stderr": 0.03441946919078933, + "acc_norm": 0.3637369522376438, + "acc_norm_stderr": 0.03440387559551367, + "mc1": 0.2460220318237454, + "mc1_stderr": 0.015077219200662578, + "mc2": 0.39198062564912123, + "mc2_stderr": 0.014535520033672577 + } + }, + "versions": { + "harness|arc:challenge|25": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + 
"harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "all": 0 + }, + "config": { + "model_name": "TheBloke/guanaco-7B-HF", + "model_sha": "293c24105fa15afa127a2ec3905fdc2a0a3a6dac", + "model_dtype": "torch.float16", + "lighteval_sha": "43cff840721bd0214adb4e29236a5e2ca1813937", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null + }, + "task_config": { + "harness|arc:challenge": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": 
"LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task" + }, + "hashes": { + "harness|arc:challenge|25": { + "hash_examples": "fb8c51b1872daeda", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "61571bf68d6d89aa", + "hash_cont_tokens": "8210decc6ff6f7df" + }, + "harness|hellaswag|10": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "29906669b1c7054a", + "hash_cont_tokens": "b3b9e9017afa63af" + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "c54ff61ad0273dd7", + "hash_cont_tokens": "50421e30bef398f9" + }, + 
"harness|hendrycksTest-anatomy|5": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "be31a1e22aef5f90", + "hash_cont_tokens": "f11971a765cb609f" + }, + "harness|hendrycksTest-astronomy|5": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "277a7b1fad566940", + "hash_cont_tokens": "bf30e5d3f48250cb" + }, + "harness|hendrycksTest-business_ethics|5": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "ba552605bc116de5", + "hash_cont_tokens": "bc1dd9b2d995eb61" + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "428c7563d0b98ab9", + "hash_cont_tokens": "890a119624b3b935" + }, + "harness|hendrycksTest-college_biology|5": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "da036601573942e2", + "hash_cont_tokens": "875cde3af7a0ee14" + }, + "harness|hendrycksTest-college_chemistry|5": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "94e0196d6aded13d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "harness|hendrycksTest-college_computer_science|5": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "6e4d0f4a8d36690b", + "hash_cont_tokens": "ffc0fe414cdc4a83" + }, + "harness|hendrycksTest-college_mathematics|5": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "614054d17109a25d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "harness|hendrycksTest-college_medicine|5": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "1d633b3cc0524ba8", + "hash_cont_tokens": "1f88b00d41957d82" + }, + "harness|hendrycksTest-college_physics|5": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "5421d9a1af86cbd4", + "hash_cont_tokens": "f7b8097afc16a47c" + }, + "harness|hendrycksTest-computer_security|5": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "5e6b70ecb333cf18", + "hash_cont_tokens": "50421e30bef398f9" + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "c2ef11a87264ceed", + "hash_cont_tokens": "aa0e8bc655f2f641" + }, + "harness|hendrycksTest-econometrics|5": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "ecaccd912a4c3978", + "hash_cont_tokens": "bfb7e3c3c88313f1" + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "1590c84291399be8", + "hash_cont_tokens": "2425a3f084a591ef" + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "3269597f715b0da1", + "hash_cont_tokens": "f52691aef15a407b" + }, + "harness|hendrycksTest-formal_logic|5": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "a2800d20f3ab8d7c", + "hash_cont_tokens": "f515d598d9c21263" + }, + "harness|hendrycksTest-global_facts|5": { + 
"hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "94ed44b3772505ad", + "hash_cont_tokens": "50421e30bef398f9" + }, + "harness|hendrycksTest-high_school_biology|5": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "24423acb928db768", + "hash_cont_tokens": "bd85a4156a3613ee" + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "831ff35c474e5cef", + "hash_cont_tokens": "a95c97af1c14e068" + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "8c34e0f2bda77358", + "hash_cont_tokens": "8abfedef914e33c9" + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "f1f73dd687da18d7", + "hash_cont_tokens": "674fc454bdc5ac93" + }, + "harness|hendrycksTest-high_school_geography|5": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "7c5547c7da5bc793", + "hash_cont_tokens": "03a5012b916274ea" + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "f62991cb6a496b05", + "hash_cont_tokens": "a83effb8f76b7d7c" + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "4cef2aff6e3d59ed", + "hash_cont_tokens": "c583432ad27fcfe0" + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "6e2577ea4082ed2b", + "hash_cont_tokens": "24f5dc613660300b" + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "c5fc9aeb1079c8e4", + "hash_cont_tokens": "f47f041de50333b9" + }, + "harness|hendrycksTest-high_school_physics|5": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "555fc385cffa84ca", + "hash_cont_tokens": "ba2efcd283e938cc" + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "febd23cbf9973b7f", + "hash_cont_tokens": "942069cd363844d9" + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "424b02981230ee83", + "hash_cont_tokens": "955ed42b6f7fa019" + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "50c9ff438c85a69e", + "hash_cont_tokens": "cdd0b3dc06d933e5" + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "054824cc474caef5", + "hash_cont_tokens": "9a864184946033ac" + }, + "harness|hendrycksTest-human_aging|5": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "541a75f071dcf579", + "hash_cont_tokens": 
"142a4a8a1138a214" + }, + "harness|hendrycksTest-human_sexuality|5": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "04269e5c5a257dd9", + "hash_cont_tokens": "bc54813e809b796d" + }, + "harness|hendrycksTest-international_law|5": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "d93ba9d9d38e4397", + "hash_cont_tokens": "dc45b45fcda18e5d" + }, + "harness|hendrycksTest-jurisprudence|5": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "9eeaccd2698b4f5a", + "hash_cont_tokens": "e3a8cd951b6e3469" + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "b4f08f544f2b7576", + "hash_cont_tokens": "1e80dbd30f6453d5" + }, + "harness|hendrycksTest-machine_learning|5": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "900c2a51f1174b9f", + "hash_cont_tokens": "9b37da7777378ca9" + }, + "harness|hendrycksTest-management|5": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "6b36efb4689c6eca", + "hash_cont_tokens": "a01d6d39a83c4597" + }, + "harness|hendrycksTest-marketing|5": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "2aaac78a0cfed47a", + "hash_cont_tokens": "6aeaed4d823c98aa" + }, + "harness|hendrycksTest-medical_genetics|5": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "886ca823b41c094a", + "hash_cont_tokens": "50421e30bef398f9" + }, + "harness|hendrycksTest-miscellaneous|5": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "72fd71de7675e7d0", + "hash_cont_tokens": "9b0ab02a64603081" + }, + "harness|hendrycksTest-moral_disputes|5": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "f3ca0dd8e7a1eb09", + "hash_cont_tokens": "8badf768f7b0467a" + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "3e793631e951f23c", + "hash_cont_tokens": "32ae620376b2bbba" + }, + "harness|hendrycksTest-nutrition|5": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "59753c2144ea93af", + "hash_cont_tokens": "3071def75bacc404" + }, + "harness|hendrycksTest-philosophy|5": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "bd8d3dbed15a8c34", + "hash_cont_tokens": "9f6ff69d23a48783" + }, + "harness|hendrycksTest-prehistory|5": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "3573cd87facbb7c5", + "hash_cont_tokens": "de469d2b981e32a3" + }, + "harness|hendrycksTest-professional_accounting|5": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "17e721bc1a7cbb47", + "hash_cont_tokens": "c46f74d2dfc7b13b" + }, + "harness|hendrycksTest-professional_law|5": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "9178e10bd0763ec4", + "hash_cont_tokens": "2e590029ef41fbcd" + }, + "harness|hendrycksTest-professional_medicine|5": { + 
"hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "f5a22012a54f70ea", + "hash_cont_tokens": "fe35cfa9c6ca802e" + }, + "harness|hendrycksTest-professional_psychology|5": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "0dfb73a8eb3f692c", + "hash_cont_tokens": "f020fbddf72c8652" + }, + "harness|hendrycksTest-public_relations|5": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "1710c6ba4c9f3cbd", + "hash_cont_tokens": "568f585a259965c1" + }, + "harness|hendrycksTest-security_studies|5": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "d49711415961ced7", + "hash_cont_tokens": "cc6fd7cccd64cd5d" + }, + "harness|hendrycksTest-sociology|5": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "828999f7624cbe7e", + "hash_cont_tokens": "c3a3bdfd177eed5b" + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "42054621e718dbee", + "hash_cont_tokens": "2568d0e8e36fa959" + }, + "harness|hendrycksTest-virology|5": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "6c4f0aa4dc859c04", + "hash_cont_tokens": "926cf60b0891f374" + }, + "harness|hendrycksTest-world_religions|5": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "6c75d44e092ff24f", + "hash_cont_tokens": "c525a5de974c1ea3" + }, + "harness|truthfulqa:mc|0": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "2738d7ed7075faa7", + "hash_cont_tokens": "c014154380b74b9e" + } + } +} \ No newline at end of file diff --git a/eval-results/TheBloke/guanaco-7B-HF/results_2023-10-23T00-48-06.944333.json b/eval-results/TheBloke/guanaco-7B-HF/results_2023-10-23T00-48-06.944333.json new file mode 100644 index 0000000000000000000000000000000000000000..2c5f03acc28fe99e0b4c00c264713d3f10a5774d --- /dev/null +++ b/eval-results/TheBloke/guanaco-7B-HF/results_2023-10-23T00-48-06.944333.json @@ -0,0 +1,107 @@ +{ + "config_general": { + "model_name": "TheBloke/guanaco-7B-HF", + "model_sha": "293c24105fa15afa127a2ec3905fdc2a0a3a6dac", + "model_size": "12.58 GB", + "model_dtype": "torch.float16", + "lighteval_sha": "0f318ecf002208468154899217b3ba7c6ae09374", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|drop|3": { + "em": 0.0009437919463087249, + "em_stderr": 0.0003144653119413489, + "f1": 0.05533032718120824, + "f1_stderr": 0.001296240126534493 + }, + "harness|gsm8k|5": { + "acc": 0.05079605761940864, + "acc_stderr": 0.006048352096878091 + }, + "harness|winogrande|5": { + "acc": 0.7142857142857143, + "acc_stderr": 0.012696531870038616 + }, + "all": { + "em": 0.0009437919463087249, + "em_stderr": 0.0003144653119413489, + "f1": 0.05533032718120824, + "f1_stderr": 0.001296240126534493, + "acc": 0.38254088595256147, + "acc_stderr": 0.009372441983458353 + } + }, + "versions": { + "harness|drop|3": 1, + "harness|gsm8k|5": 0, + "harness|winogrande|5": 0, + "all": 0 + }, + "config_tasks": { + "harness|drop": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + 
"summary_tasks": { + "harness|drop|3": { + "hashes": { + "hash_examples": "1d27416e8324e9a3", + "hash_full_prompts": "a5513ff9a741b385", + "hash_input_tokens": "61b608e0b5ceed76", + "hash_cont_tokens": "6892571bac07912a" + }, + "truncated": 1263, + "non-truncated": 8273, + "padded": 0, + "non-padded": 9536, + "effective_few_shots": 3.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "bda342e47b5099b2", + "hash_cont_tokens": "de807b984c24b68f" + }, + "truncated": 0, + "non-truncated": 1319, + "padded": 0, + "non-padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "c0bedf98cb040854", + "hash_cont_tokens": "f08975ad6f2d5864" + }, + "truncated": 0, + "non-truncated": 2534, + "padded": 2432, + "non-padded": 102, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "9b4d8993161e637d", + "hash_full_prompts": "08215e527b7e60a5", + "hash_input_tokens": "80afe720f936f8d2", + "hash_cont_tokens": "99b59dc3936719e6" + }, + "total_evaluation_time_secondes": "9556.695770740509", + "truncated": 1263, + "non-truncated": 12126, + "padded": 2432, + "non-padded": 10957, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/TheBloke/h2ogpt-oasst1-512-30B-HF/results_2023-08-12T13-18-04.173532.json b/eval-results/TheBloke/h2ogpt-oasst1-512-30B-HF/results_2023-08-12T13-18-04.173532.json new file mode 100644 index 0000000000000000000000000000000000000000..c0c6c9ad6a3afa939230bed89f0af8e6ac679eee --- /dev/null +++ b/eval-results/TheBloke/h2ogpt-oasst1-512-30B-HF/results_2023-08-12T13-18-04.173532.json @@ -0,0 +1,1365 @@ +{ + "results": { + "harness|arc:challenge|25": { + "acc": 0.5273037542662116, + "acc_stderr": 0.014589589101985994, + "acc_norm": 0.5733788395904437, + "acc_norm_stderr": 0.014453185592920293 + }, + "harness|hellaswag|10": { + "acc": 0.611431985660227, + "acc_stderr": 0.004864286176731831, + "acc_norm": 0.8136825333598885, + "acc_norm_stderr": 0.003885668963126071 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.28, + "acc_stderr": 0.04512608598542127, + "acc_norm": 0.28, + "acc_norm_stderr": 0.04512608598542127 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.45185185185185184, + "acc_stderr": 0.04299268905480864, + "acc_norm": 0.45185185185185184, + "acc_norm_stderr": 0.04299268905480864 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.4934210526315789, + "acc_stderr": 0.040685900502249704, + "acc_norm": 0.4934210526315789, + "acc_norm_stderr": 0.040685900502249704 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.52, + "acc_stderr": 0.050211673156867795, + "acc_norm": 0.52, + "acc_norm_stderr": 0.050211673156867795 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.49433962264150944, + "acc_stderr": 0.030770900763851302, + "acc_norm": 0.49433962264150944, + "acc_norm_stderr": 0.030770900763851302 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.5069444444444444, + "acc_stderr": 0.04180806750294938, + "acc_norm": 0.5069444444444444, + "acc_norm_stderr": 0.04180806750294938 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.28, + "acc_stderr": 0.04512608598542128, + "acc_norm": 0.28, + "acc_norm_stderr": 
0.04512608598542128 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.37, + "acc_stderr": 0.04852365870939099, + "acc_norm": 0.37, + "acc_norm_stderr": 0.04852365870939099 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.28, + "acc_stderr": 0.04512608598542127, + "acc_norm": 0.28, + "acc_norm_stderr": 0.04512608598542127 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.44508670520231214, + "acc_stderr": 0.03789401760283647, + "acc_norm": 0.44508670520231214, + "acc_norm_stderr": 0.03789401760283647 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.21568627450980393, + "acc_stderr": 0.04092563958237654, + "acc_norm": 0.21568627450980393, + "acc_norm_stderr": 0.04092563958237654 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.57, + "acc_stderr": 0.049756985195624284, + "acc_norm": 0.57, + "acc_norm_stderr": 0.049756985195624284 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.4, + "acc_stderr": 0.03202563076101737, + "acc_norm": 0.4, + "acc_norm_stderr": 0.03202563076101737 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.34210526315789475, + "acc_stderr": 0.04462917535336936, + "acc_norm": 0.34210526315789475, + "acc_norm_stderr": 0.04462917535336936 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.4413793103448276, + "acc_stderr": 0.04137931034482758, + "acc_norm": 0.4413793103448276, + "acc_norm_stderr": 0.04137931034482758 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.3492063492063492, + "acc_stderr": 0.02455229220934266, + "acc_norm": 0.3492063492063492, + "acc_norm_stderr": 0.02455229220934266 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.21428571428571427, + "acc_stderr": 0.03670066451047181, + "acc_norm": 0.21428571428571427, + "acc_norm_stderr": 0.03670066451047181 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.37, + "acc_stderr": 0.048523658709391, + "acc_norm": 0.37, + "acc_norm_stderr": 0.048523658709391 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.5516129032258065, + "acc_stderr": 0.028292056830112735, + "acc_norm": 0.5516129032258065, + "acc_norm_stderr": 0.028292056830112735 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.39408866995073893, + "acc_stderr": 0.03438157967036543, + "acc_norm": 0.39408866995073893, + "acc_norm_stderr": 0.03438157967036543 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.46, + "acc_stderr": 0.05009082659620333, + "acc_norm": 0.46, + "acc_norm_stderr": 0.05009082659620333 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.5696969696969697, + "acc_stderr": 0.03866225962879077, + "acc_norm": 0.5696969696969697, + "acc_norm_stderr": 0.03866225962879077 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.6060606060606061, + "acc_stderr": 0.03481285338232962, + "acc_norm": 0.6060606060606061, + "acc_norm_stderr": 0.03481285338232962 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.7305699481865285, + "acc_stderr": 0.03201867122877793, + "acc_norm": 0.7305699481865285, + "acc_norm_stderr": 0.03201867122877793 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.43333333333333335, + "acc_stderr": 0.025124653525885117, + "acc_norm": 0.43333333333333335, + "acc_norm_stderr": 0.025124653525885117 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.2962962962962963, + "acc_stderr": 
0.027840811495871937, + "acc_norm": 0.2962962962962963, + "acc_norm_stderr": 0.027840811495871937 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.42436974789915966, + "acc_stderr": 0.032104790510157764, + "acc_norm": 0.42436974789915966, + "acc_norm_stderr": 0.032104790510157764 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.304635761589404, + "acc_stderr": 0.03757949922943342, + "acc_norm": 0.304635761589404, + "acc_norm_stderr": 0.03757949922943342 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.6311926605504588, + "acc_stderr": 0.02068622756072957, + "acc_norm": 0.6311926605504588, + "acc_norm_stderr": 0.02068622756072957 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.35185185185185186, + "acc_stderr": 0.03256850570293649, + "acc_norm": 0.35185185185185186, + "acc_norm_stderr": 0.03256850570293649 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.6715686274509803, + "acc_stderr": 0.03296245110172227, + "acc_norm": 0.6715686274509803, + "acc_norm_stderr": 0.03296245110172227 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.729957805907173, + "acc_stderr": 0.028900721906293426, + "acc_norm": 0.729957805907173, + "acc_norm_stderr": 0.028900721906293426 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.5829596412556054, + "acc_stderr": 0.03309266936071721, + "acc_norm": 0.5829596412556054, + "acc_norm_stderr": 0.03309266936071721 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.48854961832061067, + "acc_stderr": 0.043841400240780176, + "acc_norm": 0.48854961832061067, + "acc_norm_stderr": 0.043841400240780176 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.5950413223140496, + "acc_stderr": 0.044811377559424694, + "acc_norm": 0.5950413223140496, + "acc_norm_stderr": 0.044811377559424694 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.5462962962962963, + "acc_stderr": 0.04812917324536823, + "acc_norm": 0.5462962962962963, + "acc_norm_stderr": 0.04812917324536823 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.5214723926380368, + "acc_stderr": 0.03924746876751129, + "acc_norm": 0.5214723926380368, + "acc_norm_stderr": 0.03924746876751129 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.3125, + "acc_stderr": 0.043994650575715215, + "acc_norm": 0.3125, + "acc_norm_stderr": 0.043994650575715215 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.5533980582524272, + "acc_stderr": 0.04922424153458933, + "acc_norm": 0.5533980582524272, + "acc_norm_stderr": 0.04922424153458933 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.7692307692307693, + "acc_stderr": 0.027601921381417583, + "acc_norm": 0.7692307692307693, + "acc_norm_stderr": 0.027601921381417583 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.45, + "acc_stderr": 0.05000000000000001, + "acc_norm": 0.45, + "acc_norm_stderr": 0.05000000000000001 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.665389527458493, + "acc_stderr": 0.016873468641592157, + "acc_norm": 0.665389527458493, + "acc_norm_stderr": 0.016873468641592157 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.5809248554913294, + "acc_stderr": 0.026564178111422622, + "acc_norm": 0.5809248554913294, + "acc_norm_stderr": 0.026564178111422622 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.29720670391061454, + "acc_stderr": 0.015285313353641602, + "acc_norm": 0.29720670391061454, + "acc_norm_stderr": 
0.015285313353641602 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.48366013071895425, + "acc_stderr": 0.028614624752805407, + "acc_norm": 0.48366013071895425, + "acc_norm_stderr": 0.028614624752805407 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.5659163987138264, + "acc_stderr": 0.028150232244535597, + "acc_norm": 0.5659163987138264, + "acc_norm_stderr": 0.028150232244535597 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.5740740740740741, + "acc_stderr": 0.027513747284379424, + "acc_norm": 0.5740740740740741, + "acc_norm_stderr": 0.027513747284379424 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.3829787234042553, + "acc_stderr": 0.02899908090480617, + "acc_norm": 0.3829787234042553, + "acc_norm_stderr": 0.02899908090480617 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.3754889178617992, + "acc_stderr": 0.012367945396728206, + "acc_norm": 0.3754889178617992, + "acc_norm_stderr": 0.012367945396728206 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.49264705882352944, + "acc_stderr": 0.030369552523902173, + "acc_norm": 0.49264705882352944, + "acc_norm_stderr": 0.030369552523902173 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.4934640522875817, + "acc_stderr": 0.020226106567657807, + "acc_norm": 0.4934640522875817, + "acc_norm_stderr": 0.020226106567657807 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.5727272727272728, + "acc_stderr": 0.04738198703545483, + "acc_norm": 0.5727272727272728, + "acc_norm_stderr": 0.04738198703545483 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.4530612244897959, + "acc_stderr": 0.03186785930004129, + "acc_norm": 0.4530612244897959, + "acc_norm_stderr": 0.03186785930004129 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.5920398009950248, + "acc_stderr": 0.03475116365194092, + "acc_norm": 0.5920398009950248, + "acc_norm_stderr": 0.03475116365194092 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.75, + "acc_stderr": 0.04351941398892446, + "acc_norm": 0.75, + "acc_norm_stderr": 0.04351941398892446 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.40963855421686746, + "acc_stderr": 0.03828401115079022, + "acc_norm": 0.40963855421686746, + "acc_norm_stderr": 0.03828401115079022 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.7251461988304093, + "acc_stderr": 0.03424042924691584, + "acc_norm": 0.7251461988304093, + "acc_norm_stderr": 0.03424042924691584 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.3047735618115055, + "mc1_stderr": 0.01611412415688246, + "mc2": 0.4546446919607391, + "mc2_stderr": 0.015036086270568245 + }, + "all": { + "acc": 0.48393371360192644, + "acc_stderr": 0.035037124108220824, + "acc_norm": 0.48814262263622965, + "acc_norm_stderr": 0.03501822545190436, + "mc1": 0.3047735618115055, + "mc1_stderr": 0.01611412415688246, + "mc2": 0.4546446919607391, + "mc2_stderr": 0.015036086270568245 + } + }, + "versions": { + "harness|arc:challenge|25": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + 
"harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "all": 0 + }, + "config_general": { + "model_name": "TheBloke/h2ogpt-oasst1-512-30B-HF", + "model_sha": "3dc93836e4b08b7b2ee43e69c1e590a36fd24687", + "model_dtype": "torch.float16", + "lighteval_sha": "efe93333f9f25e7d48cc67a6bf362e6d576f727b", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null + }, + "config_tasks": { + "harness|arc:challenge": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness 
task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task" + }, + "summary_tasks": { + "harness|arc:challenge|25": { + "hashes": { + "hash_examples": "17b0cae357c0259e", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "2b0e07d4cdd3b0fe", + "hash_cont_tokens": "52204555b6e39a6e" + }, + "truncated": 0, + "non-truncated": 4687, + "padded": 4687, + "non-padded": 0, + "effective_few_shots": 25.0, + "num_truncated_few_shots": 0 + }, + "harness|hellaswag|10": { + "hashes": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + 
"hash_input_tokens": "578edd77107cb2c3", + "hash_cont_tokens": "25c49737526d9f80" + }, + "truncated": 0, + "non-truncated": 40168, + "padded": 40113, + "non-padded": 55, + "effective_few_shots": 10.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hashes": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "6a95a1511f8da075", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-anatomy|5": { + "hashes": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "24a78edc4d9a93aa", + "hash_cont_tokens": "f11971a765cb609f" + }, + "truncated": 0, + "non-truncated": 540, + "padded": 540, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-astronomy|5": { + "hashes": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "b11106668d6c0974", + "hash_cont_tokens": "ebed26cf74a85815" + }, + "truncated": 0, + "non-truncated": 608, + "padded": 608, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-business_ethics|5": { + "hashes": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "10180ba12a075cb0", + "hash_cont_tokens": "6898ac348a7ae442" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hashes": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "73351ef4968750a2", + "hash_cont_tokens": "34a058958a45af94" + }, + "truncated": 0, + "non-truncated": 1060, + "padded": 1060, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_biology|5": { + "hashes": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "a539150af234c668", + "hash_cont_tokens": "875cde3af7a0ee14" + }, + "truncated": 0, + "non-truncated": 576, + "padded": 576, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_chemistry|5": { + "hashes": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "52e12e5a43bcee35", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_computer_science|5": { + "hashes": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "d1f3721a5659f7ee", + "hash_cont_tokens": "da408cb12ab08288" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_mathematics|5": { + "hashes": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "f2d78f546b5595c2", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + 
"non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_medicine|5": { + "hashes": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "c9cc19179f63d1d6", + "hash_cont_tokens": "370a1a0ab68d15cd" + }, + "truncated": 0, + "non-truncated": 692, + "padded": 692, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_physics|5": { + "hashes": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "5046144e67e992e8", + "hash_cont_tokens": "f7b8097afc16a47c" + }, + "truncated": 0, + "non-truncated": 408, + "padded": 408, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-computer_security|5": { + "hashes": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "4b14581ba4fc06fc", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hashes": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "1ee52c413b5b4cc4", + "hash_cont_tokens": "aa0e8bc655f2f641" + }, + "truncated": 0, + "non-truncated": 940, + "padded": 940, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-econometrics|5": { + "hashes": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "2914077c4dd3090a", + "hash_cont_tokens": "80dea4d59245cf01" + }, + "truncated": 0, + "non-truncated": 456, + "padded": 456, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hashes": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "0f88a874342378de", + "hash_cont_tokens": "2425a3f084a591ef" + }, + "truncated": 0, + "non-truncated": 580, + "padded": 580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hashes": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "9889933f1dd02a23", + "hash_cont_tokens": "309bef1803097408" + }, + "truncated": 0, + "non-truncated": 1512, + "padded": 1512, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-formal_logic|5": { + "hashes": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "dc309a94c4bfdd2f", + "hash_cont_tokens": "5105a3bd1b39b785" + }, + "truncated": 0, + "non-truncated": 504, + "padded": 504, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-global_facts|5": { + "hashes": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "0801a0aebec3ba8c", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_biology|5": { + "hashes": { + 
"hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "5bc4aca8831d9c05", + "hash_cont_tokens": "205c5deee1581b02" + }, + "truncated": 0, + "non-truncated": 1240, + "padded": 1240, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hashes": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "b92bd6b06fc3464c", + "hash_cont_tokens": "272d28867e0ff046" + }, + "truncated": 0, + "non-truncated": 812, + "padded": 812, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hashes": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "a549346cde8165e9", + "hash_cont_tokens": "98b3bf311aa83f0d" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hashes": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "f1f73dd687da18d7", + "hash_cont_tokens": "674fc454bdc5ac93" + }, + "truncated": 660, + "non-truncated": 0, + "padded": 0, + "non-padded": 660, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_geography|5": { + "hashes": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "e7e9cf91f9d6a081", + "hash_cont_tokens": "03a5012b916274ea" + }, + "truncated": 0, + "non-truncated": 792, + "padded": 792, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hashes": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "a61a1670f854d9e1", + "hash_cont_tokens": "d9e66fc7c702b795" + }, + "truncated": 0, + "non-truncated": 772, + "padded": 772, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hashes": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "8a77cb7763f28110", + "hash_cont_tokens": "c583432ad27fcfe0" + }, + "truncated": 0, + "non-truncated": 1560, + "padded": 1560, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hashes": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "fcfcfae391f8faa1", + "hash_cont_tokens": "d4b1936084c060e0" + }, + "truncated": 0, + "non-truncated": 1080, + "padded": 1080, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hashes": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "a29454cc1feb23ef", + "hash_cont_tokens": "f47f041de50333b9" + }, + "truncated": 0, + "non-truncated": 952, + "padded": 952, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_physics|5": { + "hashes": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": 
"fc78b4997e436734", + "hash_input_tokens": "b6734a25556d75dc", + "hash_cont_tokens": "2bf9921a39e901d9" + }, + "truncated": 0, + "non-truncated": 604, + "padded": 604, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hashes": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "5720438e29473426", + "hash_cont_tokens": "cab8b16be9576360" + }, + "truncated": 0, + "non-truncated": 2180, + "padded": 2180, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hashes": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "486321d5858de240", + "hash_cont_tokens": "1c34fbe5a59f1ed1" + }, + "truncated": 0, + "non-truncated": 864, + "padded": 864, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hashes": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "50c9ff438c85a69e", + "hash_cont_tokens": "cdd0b3dc06d933e5" + }, + "truncated": 816, + "non-truncated": 0, + "padded": 0, + "non-padded": 816, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hashes": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "473919e64d1b8c80", + "hash_cont_tokens": "ebd714885a59ef55" + }, + "truncated": 8, + "non-truncated": 940, + "padded": 940, + "non-padded": 8, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_aging|5": { + "hashes": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "47a65c81fd7ed010", + "hash_cont_tokens": "142a4a8a1138a214" + }, + "truncated": 0, + "non-truncated": 892, + "padded": 892, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_sexuality|5": { + "hashes": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "aedfcd41cbd2fcc9", + "hash_cont_tokens": "bc54813e809b796d" + }, + "truncated": 0, + "non-truncated": 524, + "padded": 524, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-international_law|5": { + "hashes": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "ed5f2414144d7b72", + "hash_cont_tokens": "aac52fa6a519223b" + }, + "truncated": 0, + "non-truncated": 484, + "padded": 484, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-jurisprudence|5": { + "hashes": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "692eaacb5b747264", + "hash_cont_tokens": "e3a8cd951b6e3469" + }, + "truncated": 0, + "non-truncated": 432, + "padded": 432, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hashes": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "2cbce4edca937588", + "hash_cont_tokens": "697179a0dd47c5c0" + }, + "truncated": 0, + 
"non-truncated": 652, + "padded": 648, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-machine_learning|5": { + "hashes": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "c2f38b19bab1aa2c", + "hash_cont_tokens": "9b19898e3ecb527f" + }, + "truncated": 0, + "non-truncated": 448, + "padded": 448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-management|5": { + "hashes": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "fde277bc547bc3d8", + "hash_cont_tokens": "a01d6d39a83c4597" + }, + "truncated": 0, + "non-truncated": 412, + "padded": 412, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-marketing|5": { + "hashes": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "87b232bbebce39db", + "hash_cont_tokens": "6aeaed4d823c98aa" + }, + "truncated": 0, + "non-truncated": 936, + "padded": 936, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-medical_genetics|5": { + "hashes": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "58c21af9da3e126e", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-miscellaneous|5": { + "hashes": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "d1f5c770d368e9c6", + "hash_cont_tokens": "9b0ab02a64603081" + }, + "truncated": 0, + "non-truncated": 3132, + "padded": 3132, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_disputes|5": { + "hashes": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "98d6db15a50aaa8e", + "hash_cont_tokens": "1e30d7dedc7588c0" + }, + "truncated": 0, + "non-truncated": 1384, + "padded": 1354, + "non-padded": 30, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hashes": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "2aabd8c7337502f8", + "hash_cont_tokens": "ceee291786cbb123" + }, + "truncated": 0, + "non-truncated": 3580, + "padded": 3580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-nutrition|5": { + "hashes": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "17f8c8f2d4a0a9b1", + "hash_cont_tokens": "484df4c25a5460bd" + }, + "truncated": 0, + "non-truncated": 1224, + "padded": 1224, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-philosophy|5": { + "hashes": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "dfc6df491d991966", + "hash_cont_tokens": "9f6ff69d23a48783" + }, + "truncated": 0, + "non-truncated": 1244, + "padded": 1244, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-prehistory|5": { + "hashes": { + 
"hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "cffe8139e00da9dd", + "hash_cont_tokens": "85a9de6c685b7035" + }, + "truncated": 0, + "non-truncated": 1296, + "padded": 1296, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_accounting|5": { + "hashes": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "4a69ed6ee55918fb", + "hash_cont_tokens": "ad7b5a040535bdcf" + }, + "truncated": 0, + "non-truncated": 1128, + "padded": 1128, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_law|5": { + "hashes": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "6cc713f12b5890de", + "hash_cont_tokens": "2e590029ef41fbcd" + }, + "truncated": 604, + "non-truncated": 5532, + "padded": 5524, + "non-padded": 612, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_medicine|5": { + "hashes": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "b4044fc92756c377", + "hash_cont_tokens": "0b7b5aaef574dc78" + }, + "truncated": 0, + "non-truncated": 1088, + "padded": 1088, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_psychology|5": { + "hashes": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "b019784da8db089a", + "hash_cont_tokens": "63a651778e8d72d2" + }, + "truncated": 0, + "non-truncated": 2448, + "padded": 2448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-public_relations|5": { + "hashes": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "f47f37c7c9bfc601", + "hash_cont_tokens": "841583ab707b25d7" + }, + "truncated": 0, + "non-truncated": 440, + "padded": 440, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-security_studies|5": { + "hashes": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "4d282718d6142410", + "hash_cont_tokens": "9c2c01d3214f66b8" + }, + "truncated": 0, + "non-truncated": 980, + "padded": 980, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-sociology|5": { + "hashes": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "fbc6026e500537bc", + "hash_cont_tokens": "c3a3bdfd177eed5b" + }, + "truncated": 0, + "non-truncated": 804, + "padded": 804, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hashes": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "150dd1ff81ff642e", + "hash_cont_tokens": "96353c5969a9028a" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-virology|5": { + "hashes": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "fcbac3e735545969", + 
"hash_cont_tokens": "a1f8901800ac9b68" + }, + "truncated": 0, + "non-truncated": 664, + "padded": 664, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-world_religions|5": { + "hashes": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "ffc962a38441ef13", + "hash_cont_tokens": "08c0be345e5f1c12" + }, + "truncated": 0, + "non-truncated": 684, + "padded": 684, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|truthfulqa:mc|0": { + "hashes": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "9ffb65d225ae550f", + "hash_cont_tokens": "16c760a491c6f26e" + }, + "truncated": 0, + "non-truncated": 9996, + "padded": 9996, + "non-padded": 0, + "effective_few_shots": 0.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "d84d18e9a963753d", + "hash_full_prompts": "12b540783521a8e6", + "hash_input_tokens": "1c61d6705b299f5c", + "hash_cont_tokens": "868d6f1055fbd51d" + }, + "total_evaluation_time_secondes": "9170.025273323059", + "truncated": 2088, + "non-truncated": 108931, + "padded": 108834, + "non-padded": 2185, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/TheBloke/h2ogpt-oasst1-512-30B-HF/results_2023-08-14T19-29-40.569762.json b/eval-results/TheBloke/h2ogpt-oasst1-512-30B-HF/results_2023-08-14T19-29-40.569762.json new file mode 100644 index 0000000000000000000000000000000000000000..a6b219feb2988d65bb9aa4d415b91e7b79e62c9c --- /dev/null +++ b/eval-results/TheBloke/h2ogpt-oasst1-512-30B-HF/results_2023-08-14T19-29-40.569762.json @@ -0,0 +1,1365 @@ +{ + "results": { + "harness|arc:challenge|25": { + "acc": 0.5273037542662116, + "acc_stderr": 0.014589589101985994, + "acc_norm": 0.5733788395904437, + "acc_norm_stderr": 0.014453185592920293 + }, + "harness|hellaswag|10": { + "acc": 0.611431985660227, + "acc_stderr": 0.004864286176731831, + "acc_norm": 0.8136825333598885, + "acc_norm_stderr": 0.003885668963126071 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.28, + "acc_stderr": 0.04512608598542127, + "acc_norm": 0.28, + "acc_norm_stderr": 0.04512608598542127 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.45185185185185184, + "acc_stderr": 0.04299268905480864, + "acc_norm": 0.45185185185185184, + "acc_norm_stderr": 0.04299268905480864 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.4934210526315789, + "acc_stderr": 0.040685900502249704, + "acc_norm": 0.4934210526315789, + "acc_norm_stderr": 0.040685900502249704 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.52, + "acc_stderr": 0.050211673156867795, + "acc_norm": 0.52, + "acc_norm_stderr": 0.050211673156867795 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.49433962264150944, + "acc_stderr": 0.030770900763851302, + "acc_norm": 0.49433962264150944, + "acc_norm_stderr": 0.030770900763851302 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.5069444444444444, + "acc_stderr": 0.04180806750294938, + "acc_norm": 0.5069444444444444, + "acc_norm_stderr": 0.04180806750294938 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.28, + "acc_stderr": 0.04512608598542128, + "acc_norm": 0.28, + "acc_norm_stderr": 0.04512608598542128 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.37, + "acc_stderr": 0.04852365870939099, + "acc_norm": 0.37, + 
"acc_norm_stderr": 0.04852365870939099 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.28, + "acc_stderr": 0.04512608598542127, + "acc_norm": 0.28, + "acc_norm_stderr": 0.04512608598542127 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.44508670520231214, + "acc_stderr": 0.03789401760283647, + "acc_norm": 0.44508670520231214, + "acc_norm_stderr": 0.03789401760283647 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.21568627450980393, + "acc_stderr": 0.04092563958237654, + "acc_norm": 0.21568627450980393, + "acc_norm_stderr": 0.04092563958237654 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.57, + "acc_stderr": 0.049756985195624284, + "acc_norm": 0.57, + "acc_norm_stderr": 0.049756985195624284 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.4, + "acc_stderr": 0.03202563076101737, + "acc_norm": 0.4, + "acc_norm_stderr": 0.03202563076101737 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.34210526315789475, + "acc_stderr": 0.04462917535336936, + "acc_norm": 0.34210526315789475, + "acc_norm_stderr": 0.04462917535336936 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.4413793103448276, + "acc_stderr": 0.04137931034482758, + "acc_norm": 0.4413793103448276, + "acc_norm_stderr": 0.04137931034482758 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.3492063492063492, + "acc_stderr": 0.02455229220934266, + "acc_norm": 0.3492063492063492, + "acc_norm_stderr": 0.02455229220934266 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.21428571428571427, + "acc_stderr": 0.03670066451047181, + "acc_norm": 0.21428571428571427, + "acc_norm_stderr": 0.03670066451047181 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.37, + "acc_stderr": 0.048523658709391, + "acc_norm": 0.37, + "acc_norm_stderr": 0.048523658709391 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.5516129032258065, + "acc_stderr": 0.028292056830112735, + "acc_norm": 0.5516129032258065, + "acc_norm_stderr": 0.028292056830112735 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.39408866995073893, + "acc_stderr": 0.03438157967036543, + "acc_norm": 0.39408866995073893, + "acc_norm_stderr": 0.03438157967036543 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.46, + "acc_stderr": 0.05009082659620333, + "acc_norm": 0.46, + "acc_norm_stderr": 0.05009082659620333 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.5696969696969697, + "acc_stderr": 0.03866225962879077, + "acc_norm": 0.5696969696969697, + "acc_norm_stderr": 0.03866225962879077 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.6060606060606061, + "acc_stderr": 0.03481285338232962, + "acc_norm": 0.6060606060606061, + "acc_norm_stderr": 0.03481285338232962 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.7305699481865285, + "acc_stderr": 0.03201867122877793, + "acc_norm": 0.7305699481865285, + "acc_norm_stderr": 0.03201867122877793 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.43333333333333335, + "acc_stderr": 0.025124653525885117, + "acc_norm": 0.43333333333333335, + "acc_norm_stderr": 0.025124653525885117 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.2962962962962963, + "acc_stderr": 0.027840811495871937, + "acc_norm": 0.2962962962962963, + "acc_norm_stderr": 0.027840811495871937 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 
0.42436974789915966, + "acc_stderr": 0.032104790510157764, + "acc_norm": 0.42436974789915966, + "acc_norm_stderr": 0.032104790510157764 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.304635761589404, + "acc_stderr": 0.03757949922943342, + "acc_norm": 0.304635761589404, + "acc_norm_stderr": 0.03757949922943342 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.6311926605504588, + "acc_stderr": 0.02068622756072957, + "acc_norm": 0.6311926605504588, + "acc_norm_stderr": 0.02068622756072957 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.35185185185185186, + "acc_stderr": 0.03256850570293649, + "acc_norm": 0.35185185185185186, + "acc_norm_stderr": 0.03256850570293649 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.6715686274509803, + "acc_stderr": 0.03296245110172227, + "acc_norm": 0.6715686274509803, + "acc_norm_stderr": 0.03296245110172227 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.729957805907173, + "acc_stderr": 0.028900721906293426, + "acc_norm": 0.729957805907173, + "acc_norm_stderr": 0.028900721906293426 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.5829596412556054, + "acc_stderr": 0.03309266936071721, + "acc_norm": 0.5829596412556054, + "acc_norm_stderr": 0.03309266936071721 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.48854961832061067, + "acc_stderr": 0.043841400240780176, + "acc_norm": 0.48854961832061067, + "acc_norm_stderr": 0.043841400240780176 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.5950413223140496, + "acc_stderr": 0.044811377559424694, + "acc_norm": 0.5950413223140496, + "acc_norm_stderr": 0.044811377559424694 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.5462962962962963, + "acc_stderr": 0.04812917324536823, + "acc_norm": 0.5462962962962963, + "acc_norm_stderr": 0.04812917324536823 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.5214723926380368, + "acc_stderr": 0.03924746876751129, + "acc_norm": 0.5214723926380368, + "acc_norm_stderr": 0.03924746876751129 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.3125, + "acc_stderr": 0.043994650575715215, + "acc_norm": 0.3125, + "acc_norm_stderr": 0.043994650575715215 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.5533980582524272, + "acc_stderr": 0.04922424153458933, + "acc_norm": 0.5533980582524272, + "acc_norm_stderr": 0.04922424153458933 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.7692307692307693, + "acc_stderr": 0.027601921381417583, + "acc_norm": 0.7692307692307693, + "acc_norm_stderr": 0.027601921381417583 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.45, + "acc_stderr": 0.05000000000000001, + "acc_norm": 0.45, + "acc_norm_stderr": 0.05000000000000001 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.665389527458493, + "acc_stderr": 0.016873468641592157, + "acc_norm": 0.665389527458493, + "acc_norm_stderr": 0.016873468641592157 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.5809248554913294, + "acc_stderr": 0.026564178111422622, + "acc_norm": 0.5809248554913294, + "acc_norm_stderr": 0.026564178111422622 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.29720670391061454, + "acc_stderr": 0.015285313353641602, + "acc_norm": 0.29720670391061454, + "acc_norm_stderr": 0.015285313353641602 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.48366013071895425, + "acc_stderr": 0.028614624752805407, + "acc_norm": 0.48366013071895425, + 
"acc_norm_stderr": 0.028614624752805407 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.5659163987138264, + "acc_stderr": 0.028150232244535597, + "acc_norm": 0.5659163987138264, + "acc_norm_stderr": 0.028150232244535597 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.5740740740740741, + "acc_stderr": 0.027513747284379424, + "acc_norm": 0.5740740740740741, + "acc_norm_stderr": 0.027513747284379424 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.3829787234042553, + "acc_stderr": 0.02899908090480617, + "acc_norm": 0.3829787234042553, + "acc_norm_stderr": 0.02899908090480617 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.3754889178617992, + "acc_stderr": 0.012367945396728206, + "acc_norm": 0.3754889178617992, + "acc_norm_stderr": 0.012367945396728206 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.49264705882352944, + "acc_stderr": 0.030369552523902173, + "acc_norm": 0.49264705882352944, + "acc_norm_stderr": 0.030369552523902173 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.4934640522875817, + "acc_stderr": 0.020226106567657807, + "acc_norm": 0.4934640522875817, + "acc_norm_stderr": 0.020226106567657807 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.5727272727272728, + "acc_stderr": 0.04738198703545483, + "acc_norm": 0.5727272727272728, + "acc_norm_stderr": 0.04738198703545483 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.4530612244897959, + "acc_stderr": 0.03186785930004129, + "acc_norm": 0.4530612244897959, + "acc_norm_stderr": 0.03186785930004129 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.5920398009950248, + "acc_stderr": 0.03475116365194092, + "acc_norm": 0.5920398009950248, + "acc_norm_stderr": 0.03475116365194092 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.75, + "acc_stderr": 0.04351941398892446, + "acc_norm": 0.75, + "acc_norm_stderr": 0.04351941398892446 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.40963855421686746, + "acc_stderr": 0.03828401115079022, + "acc_norm": 0.40963855421686746, + "acc_norm_stderr": 0.03828401115079022 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.7251461988304093, + "acc_stderr": 0.03424042924691584, + "acc_norm": 0.7251461988304093, + "acc_norm_stderr": 0.03424042924691584 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.3047735618115055, + "mc1_stderr": 0.01611412415688246, + "mc2": 0.4546446919607391, + "mc2_stderr": 0.015036086270568245 + }, + "all": { + "acc": 0.48393371360192644, + "acc_stderr": 0.035037124108220824, + "acc_norm": 0.48814262263622965, + "acc_norm_stderr": 0.03501822545190436, + "mc1": 0.3047735618115055, + "mc1_stderr": 0.01611412415688246, + "mc2": 0.4546446919607391, + "mc2_stderr": 0.015036086270568245 + } + }, + "versions": { + "harness|arc:challenge|25": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + 
"harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "all": 0 + }, + "config_general": { + "model_name": "TheBloke/h2ogpt-oasst1-512-30B-HF", + "model_sha": "3dc93836e4b08b7b2ee43e69c1e590a36fd24687", + "model_dtype": "torch.float16", + "lighteval_sha": "efe93333f9f25e7d48cc67a6bf362e6d576f727b", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null + }, + "config_tasks": { + "harness|arc:challenge": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + 
"harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task" + }, + "summary_tasks": { + "harness|arc:challenge|25": { + "hashes": { + "hash_examples": "17b0cae357c0259e", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "2b0e07d4cdd3b0fe", + "hash_cont_tokens": "52204555b6e39a6e" + }, + "truncated": 0, + "non-truncated": 4687, + "padded": 4687, + "non-padded": 0, + "effective_few_shots": 25.0, + "num_truncated_few_shots": 0 + }, + "harness|hellaswag|10": { + "hashes": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "578edd77107cb2c3", + "hash_cont_tokens": "25c49737526d9f80" + }, + "truncated": 0, + "non-truncated": 40168, + "padded": 40113, + 
"non-padded": 55, + "effective_few_shots": 10.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hashes": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "6a95a1511f8da075", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-anatomy|5": { + "hashes": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "24a78edc4d9a93aa", + "hash_cont_tokens": "f11971a765cb609f" + }, + "truncated": 0, + "non-truncated": 540, + "padded": 540, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-astronomy|5": { + "hashes": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "b11106668d6c0974", + "hash_cont_tokens": "ebed26cf74a85815" + }, + "truncated": 0, + "non-truncated": 608, + "padded": 608, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-business_ethics|5": { + "hashes": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "10180ba12a075cb0", + "hash_cont_tokens": "6898ac348a7ae442" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hashes": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "73351ef4968750a2", + "hash_cont_tokens": "34a058958a45af94" + }, + "truncated": 0, + "non-truncated": 1060, + "padded": 1060, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_biology|5": { + "hashes": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "a539150af234c668", + "hash_cont_tokens": "875cde3af7a0ee14" + }, + "truncated": 0, + "non-truncated": 576, + "padded": 576, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_chemistry|5": { + "hashes": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "52e12e5a43bcee35", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_computer_science|5": { + "hashes": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "d1f3721a5659f7ee", + "hash_cont_tokens": "da408cb12ab08288" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_mathematics|5": { + "hashes": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "f2d78f546b5595c2", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_medicine|5": { + "hashes": { + 
"hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "c9cc19179f63d1d6", + "hash_cont_tokens": "370a1a0ab68d15cd" + }, + "truncated": 0, + "non-truncated": 692, + "padded": 692, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_physics|5": { + "hashes": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "5046144e67e992e8", + "hash_cont_tokens": "f7b8097afc16a47c" + }, + "truncated": 0, + "non-truncated": 408, + "padded": 408, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-computer_security|5": { + "hashes": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "4b14581ba4fc06fc", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hashes": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "1ee52c413b5b4cc4", + "hash_cont_tokens": "aa0e8bc655f2f641" + }, + "truncated": 0, + "non-truncated": 940, + "padded": 940, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-econometrics|5": { + "hashes": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "2914077c4dd3090a", + "hash_cont_tokens": "80dea4d59245cf01" + }, + "truncated": 0, + "non-truncated": 456, + "padded": 456, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hashes": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "0f88a874342378de", + "hash_cont_tokens": "2425a3f084a591ef" + }, + "truncated": 0, + "non-truncated": 580, + "padded": 580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hashes": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "9889933f1dd02a23", + "hash_cont_tokens": "309bef1803097408" + }, + "truncated": 0, + "non-truncated": 1512, + "padded": 1512, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-formal_logic|5": { + "hashes": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "dc309a94c4bfdd2f", + "hash_cont_tokens": "5105a3bd1b39b785" + }, + "truncated": 0, + "non-truncated": 504, + "padded": 504, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-global_facts|5": { + "hashes": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "0801a0aebec3ba8c", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_biology|5": { + "hashes": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "5bc4aca8831d9c05", + "hash_cont_tokens": 
"205c5deee1581b02" + }, + "truncated": 0, + "non-truncated": 1240, + "padded": 1240, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hashes": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "b92bd6b06fc3464c", + "hash_cont_tokens": "272d28867e0ff046" + }, + "truncated": 0, + "non-truncated": 812, + "padded": 812, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hashes": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "a549346cde8165e9", + "hash_cont_tokens": "98b3bf311aa83f0d" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hashes": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "f1f73dd687da18d7", + "hash_cont_tokens": "674fc454bdc5ac93" + }, + "truncated": 660, + "non-truncated": 0, + "padded": 0, + "non-padded": 660, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_geography|5": { + "hashes": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "e7e9cf91f9d6a081", + "hash_cont_tokens": "03a5012b916274ea" + }, + "truncated": 0, + "non-truncated": 792, + "padded": 792, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hashes": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "a61a1670f854d9e1", + "hash_cont_tokens": "d9e66fc7c702b795" + }, + "truncated": 0, + "non-truncated": 772, + "padded": 772, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hashes": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "8a77cb7763f28110", + "hash_cont_tokens": "c583432ad27fcfe0" + }, + "truncated": 0, + "non-truncated": 1560, + "padded": 1560, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hashes": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "fcfcfae391f8faa1", + "hash_cont_tokens": "d4b1936084c060e0" + }, + "truncated": 0, + "non-truncated": 1080, + "padded": 1080, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hashes": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "a29454cc1feb23ef", + "hash_cont_tokens": "f47f041de50333b9" + }, + "truncated": 0, + "non-truncated": 952, + "padded": 952, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_physics|5": { + "hashes": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "b6734a25556d75dc", + "hash_cont_tokens": "2bf9921a39e901d9" + }, + "truncated": 0, + "non-truncated": 604, + 
"padded": 604, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hashes": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "5720438e29473426", + "hash_cont_tokens": "cab8b16be9576360" + }, + "truncated": 0, + "non-truncated": 2180, + "padded": 2180, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hashes": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "486321d5858de240", + "hash_cont_tokens": "1c34fbe5a59f1ed1" + }, + "truncated": 0, + "non-truncated": 864, + "padded": 864, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hashes": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "50c9ff438c85a69e", + "hash_cont_tokens": "cdd0b3dc06d933e5" + }, + "truncated": 816, + "non-truncated": 0, + "padded": 0, + "non-padded": 816, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hashes": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "473919e64d1b8c80", + "hash_cont_tokens": "ebd714885a59ef55" + }, + "truncated": 8, + "non-truncated": 940, + "padded": 940, + "non-padded": 8, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_aging|5": { + "hashes": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "47a65c81fd7ed010", + "hash_cont_tokens": "142a4a8a1138a214" + }, + "truncated": 0, + "non-truncated": 892, + "padded": 892, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_sexuality|5": { + "hashes": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "aedfcd41cbd2fcc9", + "hash_cont_tokens": "bc54813e809b796d" + }, + "truncated": 0, + "non-truncated": 524, + "padded": 524, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-international_law|5": { + "hashes": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "ed5f2414144d7b72", + "hash_cont_tokens": "aac52fa6a519223b" + }, + "truncated": 0, + "non-truncated": 484, + "padded": 484, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-jurisprudence|5": { + "hashes": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "692eaacb5b747264", + "hash_cont_tokens": "e3a8cd951b6e3469" + }, + "truncated": 0, + "non-truncated": 432, + "padded": 432, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hashes": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "2cbce4edca937588", + "hash_cont_tokens": "697179a0dd47c5c0" + }, + "truncated": 0, + "non-truncated": 652, + "padded": 648, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + 
"harness|hendrycksTest-machine_learning|5": { + "hashes": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "c2f38b19bab1aa2c", + "hash_cont_tokens": "9b19898e3ecb527f" + }, + "truncated": 0, + "non-truncated": 448, + "padded": 448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-management|5": { + "hashes": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "fde277bc547bc3d8", + "hash_cont_tokens": "a01d6d39a83c4597" + }, + "truncated": 0, + "non-truncated": 412, + "padded": 412, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-marketing|5": { + "hashes": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "87b232bbebce39db", + "hash_cont_tokens": "6aeaed4d823c98aa" + }, + "truncated": 0, + "non-truncated": 936, + "padded": 936, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-medical_genetics|5": { + "hashes": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "58c21af9da3e126e", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-miscellaneous|5": { + "hashes": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "d1f5c770d368e9c6", + "hash_cont_tokens": "9b0ab02a64603081" + }, + "truncated": 0, + "non-truncated": 3132, + "padded": 3132, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_disputes|5": { + "hashes": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "98d6db15a50aaa8e", + "hash_cont_tokens": "1e30d7dedc7588c0" + }, + "truncated": 0, + "non-truncated": 1384, + "padded": 1354, + "non-padded": 30, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hashes": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "2aabd8c7337502f8", + "hash_cont_tokens": "ceee291786cbb123" + }, + "truncated": 0, + "non-truncated": 3580, + "padded": 3580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-nutrition|5": { + "hashes": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "17f8c8f2d4a0a9b1", + "hash_cont_tokens": "484df4c25a5460bd" + }, + "truncated": 0, + "non-truncated": 1224, + "padded": 1224, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-philosophy|5": { + "hashes": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "dfc6df491d991966", + "hash_cont_tokens": "9f6ff69d23a48783" + }, + "truncated": 0, + "non-truncated": 1244, + "padded": 1244, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-prehistory|5": { + "hashes": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "cffe8139e00da9dd", + 
"hash_cont_tokens": "85a9de6c685b7035" + }, + "truncated": 0, + "non-truncated": 1296, + "padded": 1296, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_accounting|5": { + "hashes": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "4a69ed6ee55918fb", + "hash_cont_tokens": "ad7b5a040535bdcf" + }, + "truncated": 0, + "non-truncated": 1128, + "padded": 1128, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_law|5": { + "hashes": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "6cc713f12b5890de", + "hash_cont_tokens": "2e590029ef41fbcd" + }, + "truncated": 604, + "non-truncated": 5532, + "padded": 5524, + "non-padded": 612, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_medicine|5": { + "hashes": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "b4044fc92756c377", + "hash_cont_tokens": "0b7b5aaef574dc78" + }, + "truncated": 0, + "non-truncated": 1088, + "padded": 1088, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_psychology|5": { + "hashes": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "b019784da8db089a", + "hash_cont_tokens": "63a651778e8d72d2" + }, + "truncated": 0, + "non-truncated": 2448, + "padded": 2448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-public_relations|5": { + "hashes": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "f47f37c7c9bfc601", + "hash_cont_tokens": "841583ab707b25d7" + }, + "truncated": 0, + "non-truncated": 440, + "padded": 440, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-security_studies|5": { + "hashes": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "4d282718d6142410", + "hash_cont_tokens": "9c2c01d3214f66b8" + }, + "truncated": 0, + "non-truncated": 980, + "padded": 980, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-sociology|5": { + "hashes": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "fbc6026e500537bc", + "hash_cont_tokens": "c3a3bdfd177eed5b" + }, + "truncated": 0, + "non-truncated": 804, + "padded": 804, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hashes": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "150dd1ff81ff642e", + "hash_cont_tokens": "96353c5969a9028a" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-virology|5": { + "hashes": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "fcbac3e735545969", + "hash_cont_tokens": "a1f8901800ac9b68" + }, + "truncated": 0, + "non-truncated": 664, + "padded": 664, + "non-padded": 0, + 
"effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-world_religions|5": { + "hashes": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "ffc962a38441ef13", + "hash_cont_tokens": "08c0be345e5f1c12" + }, + "truncated": 0, + "non-truncated": 684, + "padded": 684, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|truthfulqa:mc|0": { + "hashes": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "9ffb65d225ae550f", + "hash_cont_tokens": "16c760a491c6f26e" + }, + "truncated": 0, + "non-truncated": 9996, + "padded": 9996, + "non-padded": 0, + "effective_few_shots": 0.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "d84d18e9a963753d", + "hash_full_prompts": "12b540783521a8e6", + "hash_input_tokens": "1c61d6705b299f5c", + "hash_cont_tokens": "868d6f1055fbd51d" + }, + "total_evaluation_time_secondes": "9114.379303216934", + "truncated": 2088, + "non-truncated": 108931, + "padded": 108834, + "non-padded": 2185, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/TheBloke/koala-13B-HF/results_2023-07-19T18-49-04.838102.json b/eval-results/TheBloke/koala-13B-HF/results_2023-07-19T18-49-04.838102.json new file mode 100644 index 0000000000000000000000000000000000000000..4be322edf3a368c35a0b33ad41683cb7dae6c720 --- /dev/null +++ b/eval-results/TheBloke/koala-13B-HF/results_2023-07-19T18-49-04.838102.json @@ -0,0 +1,871 @@ +{ + "results": { + "harness|arc:challenge|25": { + "acc": 0.49573378839590443, + "acc_stderr": 0.014610858923956948, + "acc_norm": 0.5298634812286689, + "acc_norm_stderr": 0.014585305840007105 + }, + "harness|hellaswag|10": { + "acc": 0.5763792073292173, + "acc_stderr": 0.004931219148182241, + "acc_norm": 0.7759410476000796, + "acc_norm_stderr": 0.004161089244867778 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.28, + "acc_stderr": 0.045126085985421276, + "acc_norm": 0.28, + "acc_norm_stderr": 0.045126085985421276 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.45185185185185184, + "acc_stderr": 0.04299268905480864, + "acc_norm": 0.45185185185185184, + "acc_norm_stderr": 0.04299268905480864 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.40789473684210525, + "acc_stderr": 0.03999309712777471, + "acc_norm": 0.40789473684210525, + "acc_norm_stderr": 0.03999309712777471 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.43, + "acc_stderr": 0.049756985195624284, + "acc_norm": 0.43, + "acc_norm_stderr": 0.049756985195624284 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.4679245283018868, + "acc_stderr": 0.030709486992556545, + "acc_norm": 0.4679245283018868, + "acc_norm_stderr": 0.030709486992556545 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.4652777777777778, + "acc_stderr": 0.04171115858181618, + "acc_norm": 0.4652777777777778, + "acc_norm_stderr": 0.04171115858181618 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.26, + "acc_stderr": 0.04408440022768077, + "acc_norm": 0.26, + "acc_norm_stderr": 0.04408440022768077 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.34, + "acc_stderr": 0.04760952285695236, + "acc_norm": 0.34, + "acc_norm_stderr": 0.04760952285695236 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.3, + "acc_stderr": 0.046056618647183814, + "acc_norm": 0.3, 
+ "acc_norm_stderr": 0.046056618647183814 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.3930635838150289, + "acc_stderr": 0.03724249595817728, + "acc_norm": 0.3930635838150289, + "acc_norm_stderr": 0.03724249595817728 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.23529411764705882, + "acc_stderr": 0.04220773659171453, + "acc_norm": 0.23529411764705882, + "acc_norm_stderr": 0.04220773659171453 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.55, + "acc_stderr": 0.05, + "acc_norm": 0.55, + "acc_norm_stderr": 0.05 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.33617021276595743, + "acc_stderr": 0.030881618520676942, + "acc_norm": 0.33617021276595743, + "acc_norm_stderr": 0.030881618520676942 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.2543859649122807, + "acc_stderr": 0.040969851398436716, + "acc_norm": 0.2543859649122807, + "acc_norm_stderr": 0.040969851398436716 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.47586206896551725, + "acc_stderr": 0.041618085035015295, + "acc_norm": 0.47586206896551725, + "acc_norm_stderr": 0.041618085035015295 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.25396825396825395, + "acc_stderr": 0.02241804289111394, + "acc_norm": 0.25396825396825395, + "acc_norm_stderr": 0.02241804289111394 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.3333333333333333, + "acc_stderr": 0.042163702135578345, + "acc_norm": 0.3333333333333333, + "acc_norm_stderr": 0.042163702135578345 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.26, + "acc_stderr": 0.04408440022768079, + "acc_norm": 0.26, + "acc_norm_stderr": 0.04408440022768079 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.4645161290322581, + "acc_stderr": 0.028372287797962956, + "acc_norm": 0.4645161290322581, + "acc_norm_stderr": 0.028372287797962956 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.2857142857142857, + "acc_stderr": 0.031785297106427496, + "acc_norm": 0.2857142857142857, + "acc_norm_stderr": 0.031785297106427496 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.47, + "acc_stderr": 0.05016135580465919, + "acc_norm": 0.47, + "acc_norm_stderr": 0.05016135580465919 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.6242424242424243, + "acc_stderr": 0.037818873532059816, + "acc_norm": 0.6242424242424243, + "acc_norm_stderr": 0.037818873532059816 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.5808080808080808, + "acc_stderr": 0.03515520728670417, + "acc_norm": 0.5808080808080808, + "acc_norm_stderr": 0.03515520728670417 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.6373056994818653, + "acc_stderr": 0.03469713791704371, + "acc_norm": 0.6373056994818653, + "acc_norm_stderr": 0.03469713791704371 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.4, + "acc_stderr": 0.024838811988033165, + "acc_norm": 0.4, + "acc_norm_stderr": 0.024838811988033165 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.24074074074074073, + "acc_stderr": 0.026067159222275794, + "acc_norm": 0.24074074074074073, + "acc_norm_stderr": 0.026067159222275794 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.42016806722689076, + "acc_stderr": 0.03206183783236152, + "acc_norm": 0.42016806722689076, + "acc_norm_stderr": 0.03206183783236152 + }, + 
"harness|hendrycksTest-high_school_physics|5": { + "acc": 0.271523178807947, + "acc_stderr": 0.03631329803969653, + "acc_norm": 0.271523178807947, + "acc_norm_stderr": 0.03631329803969653 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.581651376146789, + "acc_stderr": 0.021149548596443888, + "acc_norm": 0.581651376146789, + "acc_norm_stderr": 0.021149548596443888 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.35185185185185186, + "acc_stderr": 0.03256850570293647, + "acc_norm": 0.35185185185185186, + "acc_norm_stderr": 0.03256850570293647 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.6274509803921569, + "acc_stderr": 0.033933885849584046, + "acc_norm": 0.6274509803921569, + "acc_norm_stderr": 0.033933885849584046 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.6497890295358649, + "acc_stderr": 0.031052391937584346, + "acc_norm": 0.6497890295358649, + "acc_norm_stderr": 0.031052391937584346 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.49327354260089684, + "acc_stderr": 0.03355476596234355, + "acc_norm": 0.49327354260089684, + "acc_norm_stderr": 0.03355476596234355 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.5648854961832062, + "acc_stderr": 0.04348208051644858, + "acc_norm": 0.5648854961832062, + "acc_norm_stderr": 0.04348208051644858 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.6528925619834711, + "acc_stderr": 0.043457245702925335, + "acc_norm": 0.6528925619834711, + "acc_norm_stderr": 0.043457245702925335 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.5277777777777778, + "acc_stderr": 0.048262172941398944, + "acc_norm": 0.5277777777777778, + "acc_norm_stderr": 0.048262172941398944 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.4785276073619632, + "acc_stderr": 0.0392474687675113, + "acc_norm": 0.4785276073619632, + "acc_norm_stderr": 0.0392474687675113 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.41964285714285715, + "acc_stderr": 0.04684099321077106, + "acc_norm": 0.41964285714285715, + "acc_norm_stderr": 0.04684099321077106 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.5242718446601942, + "acc_stderr": 0.049449010929737795, + "acc_norm": 0.5242718446601942, + "acc_norm_stderr": 0.049449010929737795 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.6709401709401709, + "acc_stderr": 0.03078232157768817, + "acc_norm": 0.6709401709401709, + "acc_norm_stderr": 0.03078232157768817 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.45, + "acc_stderr": 0.05, + "acc_norm": 0.45, + "acc_norm_stderr": 0.05 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.6219667943805874, + "acc_stderr": 0.01733984446210461, + "acc_norm": 0.6219667943805874, + "acc_norm_stderr": 0.01733984446210461 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.5202312138728323, + "acc_stderr": 0.026897049996382868, + "acc_norm": 0.5202312138728323, + "acc_norm_stderr": 0.026897049996382868 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.25139664804469275, + "acc_stderr": 0.014508979453553974, + "acc_norm": 0.25139664804469275, + "acc_norm_stderr": 0.014508979453553974 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.5359477124183006, + "acc_stderr": 0.028555827516528777, + "acc_norm": 0.5359477124183006, + "acc_norm_stderr": 0.028555827516528777 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.5080385852090032, + "acc_stderr": 
0.02839442137098453, + "acc_norm": 0.5080385852090032, + "acc_norm_stderr": 0.02839442137098453 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.5061728395061729, + "acc_stderr": 0.027818623962583295, + "acc_norm": 0.5061728395061729, + "acc_norm_stderr": 0.027818623962583295 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.3262411347517731, + "acc_stderr": 0.027968453043563168, + "acc_norm": 0.3262411347517731, + "acc_norm_stderr": 0.027968453043563168 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.37809647979139505, + "acc_stderr": 0.012384878406798097, + "acc_norm": 0.37809647979139505, + "acc_norm_stderr": 0.012384878406798097 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.3897058823529412, + "acc_stderr": 0.029624663581159703, + "acc_norm": 0.3897058823529412, + "acc_norm_stderr": 0.029624663581159703 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.4264705882352941, + "acc_stderr": 0.02000791273935935, + "acc_norm": 0.4264705882352941, + "acc_norm_stderr": 0.02000791273935935 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.4818181818181818, + "acc_stderr": 0.04785964010794916, + "acc_norm": 0.4818181818181818, + "acc_norm_stderr": 0.04785964010794916 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.5755102040816327, + "acc_stderr": 0.031642094879429414, + "acc_norm": 0.5755102040816327, + "acc_norm_stderr": 0.031642094879429414 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.6517412935323383, + "acc_stderr": 0.033687874661154596, + "acc_norm": 0.6517412935323383, + "acc_norm_stderr": 0.033687874661154596 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.73, + "acc_stderr": 0.044619604333847394, + "acc_norm": 0.73, + "acc_norm_stderr": 0.044619604333847394 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.41566265060240964, + "acc_stderr": 0.038367221765980515, + "acc_norm": 0.41566265060240964, + "acc_norm_stderr": 0.038367221765980515 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.631578947368421, + "acc_stderr": 0.03699658017656878, + "acc_norm": 0.631578947368421, + "acc_norm_stderr": 0.03699658017656878 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.34761321909424725, + "mc1_stderr": 0.016670769188897303, + "mc2": 0.5022661446867869, + "mc2_stderr": 0.01475580770414008 + }, + "all": { + "acc": 0.45602868278820086, + "acc_stderr": 0.03503209125730336, + "acc_norm": 0.4599895562306352, + "acc_norm_stderr": 0.03501860510497685, + "mc1": 0.34761321909424725, + "mc1_stderr": 0.016670769188897303, + "mc2": 0.5022661446867869, + "mc2_stderr": 0.01475580770414008 + } + }, + "versions": { + "harness|arc:challenge|25": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 
1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "all": 0 + }, + "config": { + "model_name": "TheBloke/koala-13B-HF", + "model_sha": "b20f96a0171ce4c0fa27d6048215ebe710521587", + "model_dtype": "torch.float16", + "lighteval_sha": "43cff840721bd0214adb4e29236a5e2ca1813937", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null + }, + "task_config": { + "harness|arc:challenge": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM 
Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task" + }, + "hashes": { + "harness|arc:challenge|25": { + "hash_examples": "fb8c51b1872daeda", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "61571bf68d6d89aa", + "hash_cont_tokens": "8210decc6ff6f7df" + }, + "harness|hellaswag|10": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "29906669b1c7054a", + "hash_cont_tokens": "b3b9e9017afa63af" + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "c54ff61ad0273dd7", + "hash_cont_tokens": "50421e30bef398f9" + }, + "harness|hendrycksTest-anatomy|5": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "be31a1e22aef5f90", + "hash_cont_tokens": "f11971a765cb609f" + 
}, + "harness|hendrycksTest-astronomy|5": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "277a7b1fad566940", + "hash_cont_tokens": "bf30e5d3f48250cb" + }, + "harness|hendrycksTest-business_ethics|5": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "ba552605bc116de5", + "hash_cont_tokens": "bc1dd9b2d995eb61" + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "428c7563d0b98ab9", + "hash_cont_tokens": "890a119624b3b935" + }, + "harness|hendrycksTest-college_biology|5": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "da036601573942e2", + "hash_cont_tokens": "875cde3af7a0ee14" + }, + "harness|hendrycksTest-college_chemistry|5": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "94e0196d6aded13d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "harness|hendrycksTest-college_computer_science|5": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "6e4d0f4a8d36690b", + "hash_cont_tokens": "ffc0fe414cdc4a83" + }, + "harness|hendrycksTest-college_mathematics|5": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "614054d17109a25d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "harness|hendrycksTest-college_medicine|5": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "1d633b3cc0524ba8", + "hash_cont_tokens": "1f88b00d41957d82" + }, + "harness|hendrycksTest-college_physics|5": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "5421d9a1af86cbd4", + "hash_cont_tokens": "f7b8097afc16a47c" + }, + "harness|hendrycksTest-computer_security|5": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "5e6b70ecb333cf18", + "hash_cont_tokens": "50421e30bef398f9" + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "c2ef11a87264ceed", + "hash_cont_tokens": "aa0e8bc655f2f641" + }, + "harness|hendrycksTest-econometrics|5": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "ecaccd912a4c3978", + "hash_cont_tokens": "bfb7e3c3c88313f1" + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "1590c84291399be8", + "hash_cont_tokens": "2425a3f084a591ef" + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "3269597f715b0da1", + "hash_cont_tokens": "f52691aef15a407b" + }, + "harness|hendrycksTest-formal_logic|5": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "a2800d20f3ab8d7c", + "hash_cont_tokens": "f515d598d9c21263" + }, + "harness|hendrycksTest-global_facts|5": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "94ed44b3772505ad", + "hash_cont_tokens": "50421e30bef398f9" + }, + 
"harness|hendrycksTest-high_school_biology|5": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "24423acb928db768", + "hash_cont_tokens": "bd85a4156a3613ee" + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "831ff35c474e5cef", + "hash_cont_tokens": "a95c97af1c14e068" + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "8c34e0f2bda77358", + "hash_cont_tokens": "8abfedef914e33c9" + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "f1f73dd687da18d7", + "hash_cont_tokens": "674fc454bdc5ac93" + }, + "harness|hendrycksTest-high_school_geography|5": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "7c5547c7da5bc793", + "hash_cont_tokens": "03a5012b916274ea" + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "f62991cb6a496b05", + "hash_cont_tokens": "a83effb8f76b7d7c" + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "4cef2aff6e3d59ed", + "hash_cont_tokens": "c583432ad27fcfe0" + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "6e2577ea4082ed2b", + "hash_cont_tokens": "24f5dc613660300b" + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "c5fc9aeb1079c8e4", + "hash_cont_tokens": "f47f041de50333b9" + }, + "harness|hendrycksTest-high_school_physics|5": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "555fc385cffa84ca", + "hash_cont_tokens": "ba2efcd283e938cc" + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "febd23cbf9973b7f", + "hash_cont_tokens": "942069cd363844d9" + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "424b02981230ee83", + "hash_cont_tokens": "955ed42b6f7fa019" + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "50c9ff438c85a69e", + "hash_cont_tokens": "cdd0b3dc06d933e5" + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "054824cc474caef5", + "hash_cont_tokens": "9a864184946033ac" + }, + "harness|hendrycksTest-human_aging|5": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "541a75f071dcf579", + "hash_cont_tokens": "142a4a8a1138a214" + }, + "harness|hendrycksTest-human_sexuality|5": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": 
"04269e5c5a257dd9", + "hash_cont_tokens": "bc54813e809b796d" + }, + "harness|hendrycksTest-international_law|5": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "d93ba9d9d38e4397", + "hash_cont_tokens": "dc45b45fcda18e5d" + }, + "harness|hendrycksTest-jurisprudence|5": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "9eeaccd2698b4f5a", + "hash_cont_tokens": "e3a8cd951b6e3469" + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "b4f08f544f2b7576", + "hash_cont_tokens": "1e80dbd30f6453d5" + }, + "harness|hendrycksTest-machine_learning|5": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "900c2a51f1174b9f", + "hash_cont_tokens": "9b37da7777378ca9" + }, + "harness|hendrycksTest-management|5": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "6b36efb4689c6eca", + "hash_cont_tokens": "a01d6d39a83c4597" + }, + "harness|hendrycksTest-marketing|5": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "2aaac78a0cfed47a", + "hash_cont_tokens": "6aeaed4d823c98aa" + }, + "harness|hendrycksTest-medical_genetics|5": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "886ca823b41c094a", + "hash_cont_tokens": "50421e30bef398f9" + }, + "harness|hendrycksTest-miscellaneous|5": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "72fd71de7675e7d0", + "hash_cont_tokens": "9b0ab02a64603081" + }, + "harness|hendrycksTest-moral_disputes|5": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "f3ca0dd8e7a1eb09", + "hash_cont_tokens": "8badf768f7b0467a" + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "3e793631e951f23c", + "hash_cont_tokens": "32ae620376b2bbba" + }, + "harness|hendrycksTest-nutrition|5": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "59753c2144ea93af", + "hash_cont_tokens": "3071def75bacc404" + }, + "harness|hendrycksTest-philosophy|5": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "bd8d3dbed15a8c34", + "hash_cont_tokens": "9f6ff69d23a48783" + }, + "harness|hendrycksTest-prehistory|5": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "3573cd87facbb7c5", + "hash_cont_tokens": "de469d2b981e32a3" + }, + "harness|hendrycksTest-professional_accounting|5": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "17e721bc1a7cbb47", + "hash_cont_tokens": "c46f74d2dfc7b13b" + }, + "harness|hendrycksTest-professional_law|5": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "9178e10bd0763ec4", + "hash_cont_tokens": "2e590029ef41fbcd" + }, + "harness|hendrycksTest-professional_medicine|5": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "f5a22012a54f70ea", + "hash_cont_tokens": "fe35cfa9c6ca802e" + }, + 
"harness|hendrycksTest-professional_psychology|5": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "0dfb73a8eb3f692c", + "hash_cont_tokens": "f020fbddf72c8652" + }, + "harness|hendrycksTest-public_relations|5": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "1710c6ba4c9f3cbd", + "hash_cont_tokens": "568f585a259965c1" + }, + "harness|hendrycksTest-security_studies|5": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "d49711415961ced7", + "hash_cont_tokens": "cc6fd7cccd64cd5d" + }, + "harness|hendrycksTest-sociology|5": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "828999f7624cbe7e", + "hash_cont_tokens": "c3a3bdfd177eed5b" + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "42054621e718dbee", + "hash_cont_tokens": "2568d0e8e36fa959" + }, + "harness|hendrycksTest-virology|5": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "6c4f0aa4dc859c04", + "hash_cont_tokens": "926cf60b0891f374" + }, + "harness|hendrycksTest-world_religions|5": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "6c75d44e092ff24f", + "hash_cont_tokens": "c525a5de974c1ea3" + }, + "harness|truthfulqa:mc|0": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "2738d7ed7075faa7", + "hash_cont_tokens": "c014154380b74b9e" + } + } +} \ No newline at end of file diff --git a/eval-results/TheBloke/koala-13B-HF/results_2023-10-22T08-43-38.346498.json b/eval-results/TheBloke/koala-13B-HF/results_2023-10-22T08-43-38.346498.json new file mode 100644 index 0000000000000000000000000000000000000000..79834e0e3b4af1c66c18a2a734e135ef3d3b38b9 --- /dev/null +++ b/eval-results/TheBloke/koala-13B-HF/results_2023-10-22T08-43-38.346498.json @@ -0,0 +1,107 @@ +{ + "config_general": { + "model_name": "TheBloke/koala-13B-HF", + "model_sha": "b20f96a0171ce4c0fa27d6048215ebe710521587", + "model_size": "24.28 GB", + "model_dtype": "torch.float16", + "lighteval_sha": "0f318ecf002208468154899217b3ba7c6ae09374", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|drop|3": { + "em": 0.021707214765100673, + "em_stderr": 0.0014923686874006184, + "f1": 0.09106753355704705, + "f1_stderr": 0.0020580604985252385 + }, + "harness|gsm8k|5": { + "acc": 0.06823351023502654, + "acc_stderr": 0.006945358944067431 + }, + "harness|winogrande|5": { + "acc": 0.7403314917127072, + "acc_stderr": 0.012322700705552673 + }, + "all": { + "em": 0.021707214765100673, + "em_stderr": 0.0014923686874006184, + "f1": 0.09106753355704705, + "f1_stderr": 0.0020580604985252385, + "acc": 0.40428250097386687, + "acc_stderr": 0.009634029824810052 + } + }, + "versions": { + "harness|drop|3": 1, + "harness|gsm8k|5": 0, + "harness|winogrande|5": 0, + "all": 0 + }, + "config_tasks": { + "harness|drop": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|drop|3": { + "hashes": { + "hash_examples": "1d27416e8324e9a3", + "hash_full_prompts": "a5513ff9a741b385", + "hash_input_tokens": "61b608e0b5ceed76", + 
"hash_cont_tokens": "909383e54eae05ad" + }, + "truncated": 1263, + "non-truncated": 8273, + "padded": 0, + "non-padded": 9536, + "effective_few_shots": 3.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "bda342e47b5099b2", + "hash_cont_tokens": "48e0f12bee35d1cd" + }, + "truncated": 0, + "non-truncated": 1319, + "padded": 0, + "non-padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "c0bedf98cb040854", + "hash_cont_tokens": "f08975ad6f2d5864" + }, + "truncated": 0, + "non-truncated": 2534, + "padded": 2432, + "non-padded": 102, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "9b4d8993161e637d", + "hash_full_prompts": "08215e527b7e60a5", + "hash_input_tokens": "80afe720f936f8d2", + "hash_cont_tokens": "a3d893154ebf5635" + }, + "total_evaluation_time_secondes": "13258.271580219269", + "truncated": 1263, + "non-truncated": 12126, + "padded": 2432, + "non-padded": 10957, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/TheBloke/koala-7B-HF/results_2023-07-19T17-17-07.046452.json b/eval-results/TheBloke/koala-7B-HF/results_2023-07-19T17-17-07.046452.json new file mode 100644 index 0000000000000000000000000000000000000000..de0be1a9188db46fd191d3427bf04ee34c1e640a --- /dev/null +++ b/eval-results/TheBloke/koala-7B-HF/results_2023-07-19T17-17-07.046452.json @@ -0,0 +1,871 @@ +{ + "results": { + "harness|arc:challenge|25": { + "acc": 0.4308873720136519, + "acc_stderr": 0.014471133392642471, + "acc_norm": 0.4709897610921502, + "acc_norm_stderr": 0.014586776355294316 + }, + "harness|hellaswag|10": { + "acc": 0.5472017526389166, + "acc_stderr": 0.004967497130451338, + "acc_norm": 0.7358095996813384, + "acc_norm_stderr": 0.004400000822742056 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.22, + "acc_stderr": 0.04163331998932268, + "acc_norm": 0.22, + "acc_norm_stderr": 0.04163331998932268 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.25925925925925924, + "acc_stderr": 0.03785714465066653, + "acc_norm": 0.25925925925925924, + "acc_norm_stderr": 0.03785714465066653 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.21710526315789475, + "acc_stderr": 0.03355045304882921, + "acc_norm": 0.21710526315789475, + "acc_norm_stderr": 0.03355045304882921 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.3, + "acc_stderr": 0.046056618647183814, + "acc_norm": 0.3, + "acc_norm_stderr": 0.046056618647183814 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.27169811320754716, + "acc_stderr": 0.027377706624670713, + "acc_norm": 0.27169811320754716, + "acc_norm_stderr": 0.027377706624670713 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.2777777777777778, + "acc_stderr": 0.037455547914624555, + "acc_norm": 0.2777777777777778, + "acc_norm_stderr": 0.037455547914624555 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.21, + "acc_stderr": 0.04093601807403326, + "acc_norm": 0.21, + "acc_norm_stderr": 0.04093601807403326 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.26, + "acc_stderr": 0.04408440022768079, + "acc_norm": 0.26, + "acc_norm_stderr": 0.04408440022768079 + }, + 
"harness|hendrycksTest-college_mathematics|5": { + "acc": 0.25, + "acc_stderr": 0.04351941398892446, + "acc_norm": 0.25, + "acc_norm_stderr": 0.04351941398892446 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.24277456647398843, + "acc_stderr": 0.0326926380614177, + "acc_norm": 0.24277456647398843, + "acc_norm_stderr": 0.0326926380614177 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.20588235294117646, + "acc_stderr": 0.04023382273617746, + "acc_norm": 0.20588235294117646, + "acc_norm_stderr": 0.04023382273617746 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.33, + "acc_stderr": 0.047258156262526045, + "acc_norm": 0.33, + "acc_norm_stderr": 0.047258156262526045 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.26382978723404255, + "acc_stderr": 0.02880998985410297, + "acc_norm": 0.26382978723404255, + "acc_norm_stderr": 0.02880998985410297 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.2982456140350877, + "acc_stderr": 0.04303684033537315, + "acc_norm": 0.2982456140350877, + "acc_norm_stderr": 0.04303684033537315 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.22758620689655173, + "acc_stderr": 0.03493950380131184, + "acc_norm": 0.22758620689655173, + "acc_norm_stderr": 0.03493950380131184 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.22486772486772486, + "acc_stderr": 0.02150209607822914, + "acc_norm": 0.22486772486772486, + "acc_norm_stderr": 0.02150209607822914 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.2698412698412698, + "acc_stderr": 0.03970158273235173, + "acc_norm": 0.2698412698412698, + "acc_norm_stderr": 0.03970158273235173 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.18, + "acc_stderr": 0.03861229196653694, + "acc_norm": 0.18, + "acc_norm_stderr": 0.03861229196653694 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.2032258064516129, + "acc_stderr": 0.022891687984554963, + "acc_norm": 0.2032258064516129, + "acc_norm_stderr": 0.022891687984554963 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.18226600985221675, + "acc_stderr": 0.02716334085964515, + "acc_norm": 0.18226600985221675, + "acc_norm_stderr": 0.02716334085964515 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.26, + "acc_stderr": 0.04408440022768078, + "acc_norm": 0.26, + "acc_norm_stderr": 0.04408440022768078 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.22424242424242424, + "acc_stderr": 0.032568666616811015, + "acc_norm": 0.22424242424242424, + "acc_norm_stderr": 0.032568666616811015 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.26262626262626265, + "acc_stderr": 0.031353050095330855, + "acc_norm": 0.26262626262626265, + "acc_norm_stderr": 0.031353050095330855 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.22797927461139897, + "acc_stderr": 0.030276909945178267, + "acc_norm": 0.22797927461139897, + "acc_norm_stderr": 0.030276909945178267 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.2205128205128205, + "acc_stderr": 0.021020672680827912, + "acc_norm": 0.2205128205128205, + "acc_norm_stderr": 0.021020672680827912 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.23703703703703705, + "acc_stderr": 0.02592887613276612, + "acc_norm": 0.23703703703703705, + "acc_norm_stderr": 0.02592887613276612 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 
0.21428571428571427, + "acc_stderr": 0.026653531596715484, + "acc_norm": 0.21428571428571427, + "acc_norm_stderr": 0.026653531596715484 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.2119205298013245, + "acc_stderr": 0.033367670865679766, + "acc_norm": 0.2119205298013245, + "acc_norm_stderr": 0.033367670865679766 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.22201834862385322, + "acc_stderr": 0.01781884956479662, + "acc_norm": 0.22201834862385322, + "acc_norm_stderr": 0.01781884956479662 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.18518518518518517, + "acc_stderr": 0.02649191472735516, + "acc_norm": 0.18518518518518517, + "acc_norm_stderr": 0.02649191472735516 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.2696078431372549, + "acc_stderr": 0.03114557065948678, + "acc_norm": 0.2696078431372549, + "acc_norm_stderr": 0.03114557065948678 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.27848101265822783, + "acc_stderr": 0.029178682304842555, + "acc_norm": 0.27848101265822783, + "acc_norm_stderr": 0.029178682304842555 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.3183856502242152, + "acc_stderr": 0.03126580522513714, + "acc_norm": 0.3183856502242152, + "acc_norm_stderr": 0.03126580522513714 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.26717557251908397, + "acc_stderr": 0.038808483010823944, + "acc_norm": 0.26717557251908397, + "acc_norm_stderr": 0.038808483010823944 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.371900826446281, + "acc_stderr": 0.04412015806624503, + "acc_norm": 0.371900826446281, + "acc_norm_stderr": 0.04412015806624503 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.28703703703703703, + "acc_stderr": 0.04373313040914761, + "acc_norm": 0.28703703703703703, + "acc_norm_stderr": 0.04373313040914761 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.24539877300613497, + "acc_stderr": 0.03380939813943354, + "acc_norm": 0.24539877300613497, + "acc_norm_stderr": 0.03380939813943354 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.23214285714285715, + "acc_stderr": 0.04007341809755805, + "acc_norm": 0.23214285714285715, + "acc_norm_stderr": 0.04007341809755805 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.1941747572815534, + "acc_stderr": 0.03916667762822585, + "acc_norm": 0.1941747572815534, + "acc_norm_stderr": 0.03916667762822585 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.33760683760683763, + "acc_stderr": 0.030980296992618558, + "acc_norm": 0.33760683760683763, + "acc_norm_stderr": 0.030980296992618558 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.31, + "acc_stderr": 0.04648231987117316, + "acc_norm": 0.31, + "acc_norm_stderr": 0.04648231987117316 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.29118773946360155, + "acc_stderr": 0.016246087069701393, + "acc_norm": 0.29118773946360155, + "acc_norm_stderr": 0.016246087069701393 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.2658959537572254, + "acc_stderr": 0.023786203255508283, + "acc_norm": 0.2658959537572254, + "acc_norm_stderr": 0.023786203255508283 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.25139664804469275, + "acc_stderr": 0.014508979453553976, + "acc_norm": 0.25139664804469275, + "acc_norm_stderr": 0.014508979453553976 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.23202614379084968, + "acc_stderr": 
0.024170840879341016, + "acc_norm": 0.23202614379084968, + "acc_norm_stderr": 0.024170840879341016 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.22186495176848875, + "acc_stderr": 0.023598858292863047, + "acc_norm": 0.22186495176848875, + "acc_norm_stderr": 0.023598858292863047 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.22839506172839505, + "acc_stderr": 0.023358211840626267, + "acc_norm": 0.22839506172839505, + "acc_norm_stderr": 0.023358211840626267 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.23404255319148937, + "acc_stderr": 0.025257861359432414, + "acc_norm": 0.23404255319148937, + "acc_norm_stderr": 0.025257861359432414 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.26010430247718386, + "acc_stderr": 0.011204382887823827, + "acc_norm": 0.26010430247718386, + "acc_norm_stderr": 0.011204382887823827 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.21323529411764705, + "acc_stderr": 0.024880971512294264, + "acc_norm": 0.21323529411764705, + "acc_norm_stderr": 0.024880971512294264 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.2777777777777778, + "acc_stderr": 0.018120224251484594, + "acc_norm": 0.2777777777777778, + "acc_norm_stderr": 0.018120224251484594 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.2636363636363636, + "acc_stderr": 0.04220224692971987, + "acc_norm": 0.2636363636363636, + "acc_norm_stderr": 0.04220224692971987 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.2530612244897959, + "acc_stderr": 0.02783302387139968, + "acc_norm": 0.2530612244897959, + "acc_norm_stderr": 0.02783302387139968 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.31840796019900497, + "acc_stderr": 0.03294118479054096, + "acc_norm": 0.31840796019900497, + "acc_norm_stderr": 0.03294118479054096 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.34, + "acc_stderr": 0.04760952285695235, + "acc_norm": 0.34, + "acc_norm_stderr": 0.04760952285695235 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.29518072289156627, + "acc_stderr": 0.035509201856896294, + "acc_norm": 0.29518072289156627, + "acc_norm_stderr": 0.035509201856896294 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.3333333333333333, + "acc_stderr": 0.036155076303109344, + "acc_norm": 0.3333333333333333, + "acc_norm_stderr": 0.036155076303109344 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.3084455324357405, + "mc1_stderr": 0.01616803938315687, + "mc2": 0.4596019229751083, + "mc2_stderr": 0.014686609641025555 + }, + "all": { + "acc": 0.2632154859542651, + "acc_stderr": 0.0317705519428871, + "acc_norm": 0.26709193063428066, + "acc_norm_stderr": 0.03176289341161494, + "mc1": 0.3084455324357405, + "mc1_stderr": 0.01616803938315687, + "mc2": 0.4596019229751083, + "mc2_stderr": 0.014686609641025555 + } + }, + "versions": { + "harness|arc:challenge|25": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + 
"harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "all": 0 + }, + "config": { + "model_name": "TheBloke/koala-7B-HF", + "model_sha": "d102fe3b68f1a5a50d547e4fd1c8b33b783c993b", + "model_dtype": "torch.float16", + "lighteval_sha": "43cff840721bd0214adb4e29236a5e2ca1813937", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null + }, + "task_config": { + "harness|arc:challenge": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM 
Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task" + }, + "hashes": { + "harness|arc:challenge|25": { + "hash_examples": "fb8c51b1872daeda", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "61571bf68d6d89aa", + "hash_cont_tokens": "8210decc6ff6f7df" + }, + "harness|hellaswag|10": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "29906669b1c7054a", + "hash_cont_tokens": "b3b9e9017afa63af" + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "c54ff61ad0273dd7", + "hash_cont_tokens": "50421e30bef398f9" + }, + 
"harness|hendrycksTest-anatomy|5": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "be31a1e22aef5f90", + "hash_cont_tokens": "f11971a765cb609f" + }, + "harness|hendrycksTest-astronomy|5": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "277a7b1fad566940", + "hash_cont_tokens": "bf30e5d3f48250cb" + }, + "harness|hendrycksTest-business_ethics|5": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "ba552605bc116de5", + "hash_cont_tokens": "bc1dd9b2d995eb61" + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "428c7563d0b98ab9", + "hash_cont_tokens": "890a119624b3b935" + }, + "harness|hendrycksTest-college_biology|5": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "da036601573942e2", + "hash_cont_tokens": "875cde3af7a0ee14" + }, + "harness|hendrycksTest-college_chemistry|5": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "94e0196d6aded13d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "harness|hendrycksTest-college_computer_science|5": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "6e4d0f4a8d36690b", + "hash_cont_tokens": "ffc0fe414cdc4a83" + }, + "harness|hendrycksTest-college_mathematics|5": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "614054d17109a25d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "harness|hendrycksTest-college_medicine|5": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "1d633b3cc0524ba8", + "hash_cont_tokens": "1f88b00d41957d82" + }, + "harness|hendrycksTest-college_physics|5": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "5421d9a1af86cbd4", + "hash_cont_tokens": "f7b8097afc16a47c" + }, + "harness|hendrycksTest-computer_security|5": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "5e6b70ecb333cf18", + "hash_cont_tokens": "50421e30bef398f9" + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "c2ef11a87264ceed", + "hash_cont_tokens": "aa0e8bc655f2f641" + }, + "harness|hendrycksTest-econometrics|5": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "ecaccd912a4c3978", + "hash_cont_tokens": "bfb7e3c3c88313f1" + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "1590c84291399be8", + "hash_cont_tokens": "2425a3f084a591ef" + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "3269597f715b0da1", + "hash_cont_tokens": "f52691aef15a407b" + }, + "harness|hendrycksTest-formal_logic|5": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "a2800d20f3ab8d7c", + "hash_cont_tokens": "f515d598d9c21263" + }, + "harness|hendrycksTest-global_facts|5": { + 
"hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "94ed44b3772505ad", + "hash_cont_tokens": "50421e30bef398f9" + }, + "harness|hendrycksTest-high_school_biology|5": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "24423acb928db768", + "hash_cont_tokens": "bd85a4156a3613ee" + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "831ff35c474e5cef", + "hash_cont_tokens": "a95c97af1c14e068" + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "8c34e0f2bda77358", + "hash_cont_tokens": "8abfedef914e33c9" + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "f1f73dd687da18d7", + "hash_cont_tokens": "674fc454bdc5ac93" + }, + "harness|hendrycksTest-high_school_geography|5": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "7c5547c7da5bc793", + "hash_cont_tokens": "03a5012b916274ea" + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "f62991cb6a496b05", + "hash_cont_tokens": "a83effb8f76b7d7c" + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "4cef2aff6e3d59ed", + "hash_cont_tokens": "c583432ad27fcfe0" + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "6e2577ea4082ed2b", + "hash_cont_tokens": "24f5dc613660300b" + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "c5fc9aeb1079c8e4", + "hash_cont_tokens": "f47f041de50333b9" + }, + "harness|hendrycksTest-high_school_physics|5": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "555fc385cffa84ca", + "hash_cont_tokens": "ba2efcd283e938cc" + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "febd23cbf9973b7f", + "hash_cont_tokens": "942069cd363844d9" + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "424b02981230ee83", + "hash_cont_tokens": "955ed42b6f7fa019" + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "50c9ff438c85a69e", + "hash_cont_tokens": "cdd0b3dc06d933e5" + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "054824cc474caef5", + "hash_cont_tokens": "9a864184946033ac" + }, + "harness|hendrycksTest-human_aging|5": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "541a75f071dcf579", + "hash_cont_tokens": 
"142a4a8a1138a214" + }, + "harness|hendrycksTest-human_sexuality|5": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "04269e5c5a257dd9", + "hash_cont_tokens": "bc54813e809b796d" + }, + "harness|hendrycksTest-international_law|5": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "d93ba9d9d38e4397", + "hash_cont_tokens": "dc45b45fcda18e5d" + }, + "harness|hendrycksTest-jurisprudence|5": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "9eeaccd2698b4f5a", + "hash_cont_tokens": "e3a8cd951b6e3469" + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "b4f08f544f2b7576", + "hash_cont_tokens": "1e80dbd30f6453d5" + }, + "harness|hendrycksTest-machine_learning|5": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "900c2a51f1174b9f", + "hash_cont_tokens": "9b37da7777378ca9" + }, + "harness|hendrycksTest-management|5": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "6b36efb4689c6eca", + "hash_cont_tokens": "a01d6d39a83c4597" + }, + "harness|hendrycksTest-marketing|5": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "2aaac78a0cfed47a", + "hash_cont_tokens": "6aeaed4d823c98aa" + }, + "harness|hendrycksTest-medical_genetics|5": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "886ca823b41c094a", + "hash_cont_tokens": "50421e30bef398f9" + }, + "harness|hendrycksTest-miscellaneous|5": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "72fd71de7675e7d0", + "hash_cont_tokens": "9b0ab02a64603081" + }, + "harness|hendrycksTest-moral_disputes|5": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "f3ca0dd8e7a1eb09", + "hash_cont_tokens": "8badf768f7b0467a" + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "3e793631e951f23c", + "hash_cont_tokens": "32ae620376b2bbba" + }, + "harness|hendrycksTest-nutrition|5": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "59753c2144ea93af", + "hash_cont_tokens": "3071def75bacc404" + }, + "harness|hendrycksTest-philosophy|5": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "bd8d3dbed15a8c34", + "hash_cont_tokens": "9f6ff69d23a48783" + }, + "harness|hendrycksTest-prehistory|5": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "3573cd87facbb7c5", + "hash_cont_tokens": "de469d2b981e32a3" + }, + "harness|hendrycksTest-professional_accounting|5": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "17e721bc1a7cbb47", + "hash_cont_tokens": "c46f74d2dfc7b13b" + }, + "harness|hendrycksTest-professional_law|5": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "9178e10bd0763ec4", + "hash_cont_tokens": "2e590029ef41fbcd" + }, + "harness|hendrycksTest-professional_medicine|5": { + 
"hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "f5a22012a54f70ea", + "hash_cont_tokens": "fe35cfa9c6ca802e" + }, + "harness|hendrycksTest-professional_psychology|5": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "0dfb73a8eb3f692c", + "hash_cont_tokens": "f020fbddf72c8652" + }, + "harness|hendrycksTest-public_relations|5": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "1710c6ba4c9f3cbd", + "hash_cont_tokens": "568f585a259965c1" + }, + "harness|hendrycksTest-security_studies|5": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "d49711415961ced7", + "hash_cont_tokens": "cc6fd7cccd64cd5d" + }, + "harness|hendrycksTest-sociology|5": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "828999f7624cbe7e", + "hash_cont_tokens": "c3a3bdfd177eed5b" + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "42054621e718dbee", + "hash_cont_tokens": "2568d0e8e36fa959" + }, + "harness|hendrycksTest-virology|5": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "6c4f0aa4dc859c04", + "hash_cont_tokens": "926cf60b0891f374" + }, + "harness|hendrycksTest-world_religions|5": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "6c75d44e092ff24f", + "hash_cont_tokens": "c525a5de974c1ea3" + }, + "harness|truthfulqa:mc|0": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "2738d7ed7075faa7", + "hash_cont_tokens": "c014154380b74b9e" + } + } +} \ No newline at end of file diff --git a/eval-results/TheBloke/koala-7B-HF/results_2023-10-22T01-40-19.739323.json b/eval-results/TheBloke/koala-7B-HF/results_2023-10-22T01-40-19.739323.json new file mode 100644 index 0000000000000000000000000000000000000000..96821c8c8e30e58541af5506fd5a8cc718ca605e --- /dev/null +++ b/eval-results/TheBloke/koala-7B-HF/results_2023-10-22T01-40-19.739323.json @@ -0,0 +1,107 @@ +{ + "config_general": { + "model_name": "TheBloke/koala-7B-HF", + "model_sha": "d102fe3b68f1a5a50d547e4fd1c8b33b783c993b", + "model_size": "12.58 GB", + "model_dtype": "torch.float16", + "lighteval_sha": "0f318ecf002208468154899217b3ba7c6ae09374", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|drop|3": { + "em": 0.15855704697986578, + "em_stderr": 0.003740630102537935, + "f1": 0.21851510067114052, + "f1_stderr": 0.0038089998736125477 + }, + "harness|gsm8k|5": { + "acc": 0.03639120545868082, + "acc_stderr": 0.005158113489231195 + }, + "harness|winogrande|5": { + "acc": 0.6992896606156275, + "acc_stderr": 0.012888010494704718 + }, + "all": { + "em": 0.15855704697986578, + "em_stderr": 0.003740630102537935, + "f1": 0.21851510067114052, + "f1_stderr": 0.0038089998736125477, + "acc": 0.36784043303715414, + "acc_stderr": 0.009023061991967956 + } + }, + "versions": { + "harness|drop|3": 1, + "harness|gsm8k|5": 0, + "harness|winogrande|5": 0, + "all": 0 + }, + "config_tasks": { + "harness|drop": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + 
"harness|drop|3": { + "hashes": { + "hash_examples": "1d27416e8324e9a3", + "hash_full_prompts": "a5513ff9a741b385", + "hash_input_tokens": "61b608e0b5ceed76", + "hash_cont_tokens": "54f7506bf28ed641" + }, + "truncated": 1263, + "non-truncated": 8273, + "padded": 0, + "non-padded": 9536, + "effective_few_shots": 3.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "bda342e47b5099b2", + "hash_cont_tokens": "304d120300c42a57" + }, + "truncated": 0, + "non-truncated": 1319, + "padded": 0, + "non-padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "c0bedf98cb040854", + "hash_cont_tokens": "f08975ad6f2d5864" + }, + "truncated": 0, + "non-truncated": 2534, + "padded": 2432, + "non-padded": 102, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "9b4d8993161e637d", + "hash_full_prompts": "08215e527b7e60a5", + "hash_input_tokens": "80afe720f936f8d2", + "hash_cont_tokens": "8768e709bffbf1bb" + }, + "total_evaluation_time_secondes": "9397.05928349495", + "truncated": 1263, + "non-truncated": 12126, + "padded": 2432, + "non-padded": 10957, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/TheBloke/landmark-attention-llama7b-fp16/results_2023-07-31T15-07-15.770295.json b/eval-results/TheBloke/landmark-attention-llama7b-fp16/results_2023-07-31T15-07-15.770295.json new file mode 100644 index 0000000000000000000000000000000000000000..b822d76b4e1d7975c053d89d5c53c2357e5a7474 --- /dev/null +++ b/eval-results/TheBloke/landmark-attention-llama7b-fp16/results_2023-07-31T15-07-15.770295.json @@ -0,0 +1,1365 @@ +{ + "results": { + "harness|arc:challenge|25": { + "acc": 0.4445392491467577, + "acc_stderr": 0.014521226405627075, + "acc_norm": 0.4735494880546075, + "acc_norm_stderr": 0.014590931358120163 + }, + "harness|hellaswag|10": { + "acc": 0.5030870344552878, + "acc_stderr": 0.00498968630748456, + "acc_norm": 0.6581358295160327, + "acc_norm_stderr": 0.00473364927481452 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.25, + "acc_stderr": 0.04351941398892446, + "acc_norm": 0.25, + "acc_norm_stderr": 0.04351941398892446 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.362962962962963, + "acc_stderr": 0.041539484047424, + "acc_norm": 0.362962962962963, + "acc_norm_stderr": 0.041539484047424 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.3026315789473684, + "acc_stderr": 0.037385206761196686, + "acc_norm": 0.3026315789473684, + "acc_norm_stderr": 0.037385206761196686 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.42, + "acc_stderr": 0.049604496374885836, + "acc_norm": 0.42, + "acc_norm_stderr": 0.049604496374885836 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.30566037735849055, + "acc_stderr": 0.028353298073322663, + "acc_norm": 0.30566037735849055, + "acc_norm_stderr": 0.028353298073322663 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.3402777777777778, + "acc_stderr": 0.03962135573486219, + "acc_norm": 0.3402777777777778, + "acc_norm_stderr": 0.03962135573486219 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.18, + "acc_stderr": 0.038612291966536955, + "acc_norm": 0.18, + "acc_norm_stderr": 
0.038612291966536955 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.27, + "acc_stderr": 0.044619604333847394, + "acc_norm": 0.27, + "acc_norm_stderr": 0.044619604333847394 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.31, + "acc_stderr": 0.04648231987117316, + "acc_norm": 0.31, + "acc_norm_stderr": 0.04648231987117316 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.24855491329479767, + "acc_stderr": 0.03295304696818318, + "acc_norm": 0.24855491329479767, + "acc_norm_stderr": 0.03295304696818318 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.22549019607843138, + "acc_stderr": 0.041583075330832865, + "acc_norm": 0.22549019607843138, + "acc_norm_stderr": 0.041583075330832865 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.4, + "acc_stderr": 0.049236596391733084, + "acc_norm": 0.4, + "acc_norm_stderr": 0.049236596391733084 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.37446808510638296, + "acc_stderr": 0.03163910665367291, + "acc_norm": 0.37446808510638296, + "acc_norm_stderr": 0.03163910665367291 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.2543859649122807, + "acc_stderr": 0.0409698513984367, + "acc_norm": 0.2543859649122807, + "acc_norm_stderr": 0.0409698513984367 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.27586206896551724, + "acc_stderr": 0.037245636197746325, + "acc_norm": 0.27586206896551724, + "acc_norm_stderr": 0.037245636197746325 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.26455026455026454, + "acc_stderr": 0.022717467897708624, + "acc_norm": 0.26455026455026454, + "acc_norm_stderr": 0.022717467897708624 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.25396825396825395, + "acc_stderr": 0.03893259610604673, + "acc_norm": 0.25396825396825395, + "acc_norm_stderr": 0.03893259610604673 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.31, + "acc_stderr": 0.04648231987117316, + "acc_norm": 0.31, + "acc_norm_stderr": 0.04648231987117316 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.3032258064516129, + "acc_stderr": 0.026148685930671746, + "acc_norm": 0.3032258064516129, + "acc_norm_stderr": 0.026148685930671746 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.22167487684729065, + "acc_stderr": 0.029225575892489617, + "acc_norm": 0.22167487684729065, + "acc_norm_stderr": 0.029225575892489617 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.29, + "acc_stderr": 0.04560480215720684, + "acc_norm": 0.29, + "acc_norm_stderr": 0.04560480215720684 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.3696969696969697, + "acc_stderr": 0.03769430314512568, + "acc_norm": 0.3696969696969697, + "acc_norm_stderr": 0.03769430314512568 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.2474747474747475, + "acc_stderr": 0.030746300742124505, + "acc_norm": 0.2474747474747475, + "acc_norm_stderr": 0.030746300742124505 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.3626943005181347, + "acc_stderr": 0.03469713791704372, + "acc_norm": 0.3626943005181347, + "acc_norm_stderr": 0.03469713791704372 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.26153846153846155, + "acc_stderr": 0.022282141204204426, + "acc_norm": 0.26153846153846155, + "acc_norm_stderr": 0.022282141204204426 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 
0.24814814814814815, + "acc_stderr": 0.0263357394040558, + "acc_norm": 0.24814814814814815, + "acc_norm_stderr": 0.0263357394040558 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.2773109243697479, + "acc_stderr": 0.02907937453948001, + "acc_norm": 0.2773109243697479, + "acc_norm_stderr": 0.02907937453948001 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.23178807947019867, + "acc_stderr": 0.03445406271987054, + "acc_norm": 0.23178807947019867, + "acc_norm_stderr": 0.03445406271987054 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.3137614678899083, + "acc_stderr": 0.019894723341469148, + "acc_norm": 0.3137614678899083, + "acc_norm_stderr": 0.019894723341469148 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.20833333333333334, + "acc_stderr": 0.027696910713093936, + "acc_norm": 0.20833333333333334, + "acc_norm_stderr": 0.027696910713093936 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.38235294117647056, + "acc_stderr": 0.034107853389047184, + "acc_norm": 0.38235294117647056, + "acc_norm_stderr": 0.034107853389047184 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.3459915611814346, + "acc_stderr": 0.03096481058878671, + "acc_norm": 0.3459915611814346, + "acc_norm_stderr": 0.03096481058878671 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.4349775784753363, + "acc_stderr": 0.033272833702713445, + "acc_norm": 0.4349775784753363, + "acc_norm_stderr": 0.033272833702713445 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.31297709923664124, + "acc_stderr": 0.04066962905677697, + "acc_norm": 0.31297709923664124, + "acc_norm_stderr": 0.04066962905677697 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.4462809917355372, + "acc_stderr": 0.0453793517794788, + "acc_norm": 0.4462809917355372, + "acc_norm_stderr": 0.0453793517794788 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.32407407407407407, + "acc_stderr": 0.045245960070300476, + "acc_norm": 0.32407407407407407, + "acc_norm_stderr": 0.045245960070300476 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.4049079754601227, + "acc_stderr": 0.038566721635489125, + "acc_norm": 0.4049079754601227, + "acc_norm_stderr": 0.038566721635489125 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.35714285714285715, + "acc_stderr": 0.04547960999764376, + "acc_norm": 0.35714285714285715, + "acc_norm_stderr": 0.04547960999764376 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.2815533980582524, + "acc_stderr": 0.04453254836326469, + "acc_norm": 0.2815533980582524, + "acc_norm_stderr": 0.04453254836326469 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.4017094017094017, + "acc_stderr": 0.03211693751051621, + "acc_norm": 0.4017094017094017, + "acc_norm_stderr": 0.03211693751051621 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.37, + "acc_stderr": 0.048523658709391, + "acc_norm": 0.37, + "acc_norm_stderr": 0.048523658709391 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.3767560664112388, + "acc_stderr": 0.017328292907303044, + "acc_norm": 0.3767560664112388, + "acc_norm_stderr": 0.017328292907303044 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.3208092485549133, + "acc_stderr": 0.025131000233647907, + "acc_norm": 0.3208092485549133, + "acc_norm_stderr": 0.025131000233647907 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.24692737430167597, + "acc_stderr": 
0.01442229220480884, + "acc_norm": 0.24692737430167597, + "acc_norm_stderr": 0.01442229220480884 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.33986928104575165, + "acc_stderr": 0.027121956071388845, + "acc_norm": 0.33986928104575165, + "acc_norm_stderr": 0.027121956071388845 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.3054662379421222, + "acc_stderr": 0.02616058445014048, + "acc_norm": 0.3054662379421222, + "acc_norm_stderr": 0.02616058445014048 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.3055555555555556, + "acc_stderr": 0.025630824975621344, + "acc_norm": 0.3055555555555556, + "acc_norm_stderr": 0.025630824975621344 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.25177304964539005, + "acc_stderr": 0.025892151156709405, + "acc_norm": 0.25177304964539005, + "acc_norm_stderr": 0.025892151156709405 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.27249022164276404, + "acc_stderr": 0.011371658294311523, + "acc_norm": 0.27249022164276404, + "acc_norm_stderr": 0.011371658294311523 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.2867647058823529, + "acc_stderr": 0.02747227447323382, + "acc_norm": 0.2867647058823529, + "acc_norm_stderr": 0.02747227447323382 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.3627450980392157, + "acc_stderr": 0.019450768432505514, + "acc_norm": 0.3627450980392157, + "acc_norm_stderr": 0.019450768432505514 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.37272727272727274, + "acc_stderr": 0.04631381319425464, + "acc_norm": 0.37272727272727274, + "acc_norm_stderr": 0.04631381319425464 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.22448979591836735, + "acc_stderr": 0.026711430555538405, + "acc_norm": 0.22448979591836735, + "acc_norm_stderr": 0.026711430555538405 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.34328358208955223, + "acc_stderr": 0.03357379665433431, + "acc_norm": 0.34328358208955223, + "acc_norm_stderr": 0.03357379665433431 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.47, + "acc_stderr": 0.05016135580465919, + "acc_norm": 0.47, + "acc_norm_stderr": 0.05016135580465919 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.3253012048192771, + "acc_stderr": 0.03647168523683228, + "acc_norm": 0.3253012048192771, + "acc_norm_stderr": 0.03647168523683228 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.45614035087719296, + "acc_stderr": 0.038200425866029675, + "acc_norm": 0.45614035087719296, + "acc_norm_stderr": 0.038200425866029675 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.2533659730722154, + "mc1_stderr": 0.01522589934082683, + "mc2": 0.4262711761930085, + "mc2_stderr": 0.014459968244566542 + }, + "all": { + "acc": 0.32127377571128635, + "acc_stderr": 0.03364647565597259, + "acc_norm": 0.3243934203548219, + "acc_norm_stderr": 0.03364331748512213, + "mc1": 0.2533659730722154, + "mc1_stderr": 0.01522589934082683, + "mc2": 0.4262711761930085, + "mc2_stderr": 0.014459968244566542 + } + }, + "versions": { + "harness|arc:challenge|25": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + 
"harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "all": 0 + }, + "config_general": { + "model_name": "TheBloke/landmark-attention-llama7b-fp16", + "model_sha": "bf8bdcb0c30cceb0ceda33cf5fde683807e39a58", + "model_dtype": "torch.float16", + "lighteval_sha": "03c2fad20ff7f5334c33cfee459024b8d7e4a109", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null + }, + "config_tasks": { + "harness|arc:challenge": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + 
"harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task" + }, + "summary_tasks": { + "harness|arc:challenge|25": { + "hashes": { + "hash_examples": "17b0cae357c0259e", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "61571bf68d6d89aa", + "hash_cont_tokens": "8210decc6ff6f7df" + }, + "truncated": 0, + "non-truncated": 4687, + "padded": 4687, + "non-padded": 0, + "effective_few_shots": 25.0, + "num_truncated_few_shots": 0 + }, + 
"harness|hellaswag|10": { + "hashes": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "29906669b1c7054a", + "hash_cont_tokens": "b3b9e9017afa63af" + }, + "truncated": 0, + "non-truncated": 40168, + "padded": 40113, + "non-padded": 55, + "effective_few_shots": 10.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hashes": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "c54ff61ad0273dd7", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-anatomy|5": { + "hashes": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "be31a1e22aef5f90", + "hash_cont_tokens": "f11971a765cb609f" + }, + "truncated": 0, + "non-truncated": 540, + "padded": 540, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-astronomy|5": { + "hashes": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "277a7b1fad566940", + "hash_cont_tokens": "bf30e5d3f48250cb" + }, + "truncated": 0, + "non-truncated": 608, + "padded": 608, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-business_ethics|5": { + "hashes": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "ba552605bc116de5", + "hash_cont_tokens": "bc1dd9b2d995eb61" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hashes": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "428c7563d0b98ab9", + "hash_cont_tokens": "890a119624b3b935" + }, + "truncated": 0, + "non-truncated": 1060, + "padded": 1060, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_biology|5": { + "hashes": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "da036601573942e2", + "hash_cont_tokens": "875cde3af7a0ee14" + }, + "truncated": 0, + "non-truncated": 576, + "padded": 576, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_chemistry|5": { + "hashes": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "94e0196d6aded13d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_computer_science|5": { + "hashes": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "6e4d0f4a8d36690b", + "hash_cont_tokens": "ffc0fe414cdc4a83" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_mathematics|5": { + "hashes": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": 
"614054d17109a25d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_medicine|5": { + "hashes": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "1d633b3cc0524ba8", + "hash_cont_tokens": "1f88b00d41957d82" + }, + "truncated": 0, + "non-truncated": 692, + "padded": 692, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_physics|5": { + "hashes": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "5421d9a1af86cbd4", + "hash_cont_tokens": "f7b8097afc16a47c" + }, + "truncated": 0, + "non-truncated": 408, + "padded": 408, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-computer_security|5": { + "hashes": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "5e6b70ecb333cf18", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hashes": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "c2ef11a87264ceed", + "hash_cont_tokens": "aa0e8bc655f2f641" + }, + "truncated": 0, + "non-truncated": 940, + "padded": 940, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-econometrics|5": { + "hashes": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "ecaccd912a4c3978", + "hash_cont_tokens": "bfb7e3c3c88313f1" + }, + "truncated": 0, + "non-truncated": 456, + "padded": 456, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hashes": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "1590c84291399be8", + "hash_cont_tokens": "2425a3f084a591ef" + }, + "truncated": 0, + "non-truncated": 580, + "padded": 580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hashes": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "3269597f715b0da1", + "hash_cont_tokens": "f52691aef15a407b" + }, + "truncated": 0, + "non-truncated": 1512, + "padded": 1512, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-formal_logic|5": { + "hashes": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "a2800d20f3ab8d7c", + "hash_cont_tokens": "f515d598d9c21263" + }, + "truncated": 0, + "non-truncated": 504, + "padded": 504, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-global_facts|5": { + "hashes": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "94ed44b3772505ad", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + 
"effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_biology|5": { + "hashes": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "24423acb928db768", + "hash_cont_tokens": "bd85a4156a3613ee" + }, + "truncated": 0, + "non-truncated": 1240, + "padded": 1240, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hashes": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "831ff35c474e5cef", + "hash_cont_tokens": "a95c97af1c14e068" + }, + "truncated": 0, + "non-truncated": 812, + "padded": 812, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hashes": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "8c34e0f2bda77358", + "hash_cont_tokens": "8abfedef914e33c9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hashes": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "f1f73dd687da18d7", + "hash_cont_tokens": "674fc454bdc5ac93" + }, + "truncated": 660, + "non-truncated": 0, + "padded": 0, + "non-padded": 660, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_geography|5": { + "hashes": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "7c5547c7da5bc793", + "hash_cont_tokens": "03a5012b916274ea" + }, + "truncated": 0, + "non-truncated": 792, + "padded": 792, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hashes": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "f62991cb6a496b05", + "hash_cont_tokens": "a83effb8f76b7d7c" + }, + "truncated": 0, + "non-truncated": 772, + "padded": 772, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hashes": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "4cef2aff6e3d59ed", + "hash_cont_tokens": "c583432ad27fcfe0" + }, + "truncated": 0, + "non-truncated": 1560, + "padded": 1560, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hashes": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "6e2577ea4082ed2b", + "hash_cont_tokens": "24f5dc613660300b" + }, + "truncated": 0, + "non-truncated": 1080, + "padded": 1080, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hashes": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "c5fc9aeb1079c8e4", + "hash_cont_tokens": "f47f041de50333b9" + }, + "truncated": 0, + "non-truncated": 952, + "padded": 952, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + 
"harness|hendrycksTest-high_school_physics|5": { + "hashes": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "555fc385cffa84ca", + "hash_cont_tokens": "ba2efcd283e938cc" + }, + "truncated": 0, + "non-truncated": 604, + "padded": 604, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hashes": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "febd23cbf9973b7f", + "hash_cont_tokens": "942069cd363844d9" + }, + "truncated": 0, + "non-truncated": 2180, + "padded": 2180, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hashes": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "424b02981230ee83", + "hash_cont_tokens": "955ed42b6f7fa019" + }, + "truncated": 0, + "non-truncated": 864, + "padded": 864, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hashes": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "50c9ff438c85a69e", + "hash_cont_tokens": "cdd0b3dc06d933e5" + }, + "truncated": 816, + "non-truncated": 0, + "padded": 0, + "non-padded": 816, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hashes": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "054824cc474caef5", + "hash_cont_tokens": "9a864184946033ac" + }, + "truncated": 8, + "non-truncated": 940, + "padded": 940, + "non-padded": 8, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_aging|5": { + "hashes": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "541a75f071dcf579", + "hash_cont_tokens": "142a4a8a1138a214" + }, + "truncated": 0, + "non-truncated": 892, + "padded": 892, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_sexuality|5": { + "hashes": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "04269e5c5a257dd9", + "hash_cont_tokens": "bc54813e809b796d" + }, + "truncated": 0, + "non-truncated": 524, + "padded": 524, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-international_law|5": { + "hashes": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "d93ba9d9d38e4397", + "hash_cont_tokens": "dc45b45fcda18e5d" + }, + "truncated": 0, + "non-truncated": 484, + "padded": 484, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-jurisprudence|5": { + "hashes": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "9eeaccd2698b4f5a", + "hash_cont_tokens": "e3a8cd951b6e3469" + }, + "truncated": 0, + "non-truncated": 432, + "padded": 432, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hashes": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": 
"98a04b1f8f841069", + "hash_input_tokens": "b4f08f544f2b7576", + "hash_cont_tokens": "1e80dbd30f6453d5" + }, + "truncated": 0, + "non-truncated": 652, + "padded": 648, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-machine_learning|5": { + "hashes": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "900c2a51f1174b9f", + "hash_cont_tokens": "9b37da7777378ca9" + }, + "truncated": 0, + "non-truncated": 448, + "padded": 448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-management|5": { + "hashes": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "6b36efb4689c6eca", + "hash_cont_tokens": "a01d6d39a83c4597" + }, + "truncated": 0, + "non-truncated": 412, + "padded": 412, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-marketing|5": { + "hashes": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "2aaac78a0cfed47a", + "hash_cont_tokens": "6aeaed4d823c98aa" + }, + "truncated": 0, + "non-truncated": 936, + "padded": 936, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-medical_genetics|5": { + "hashes": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "886ca823b41c094a", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-miscellaneous|5": { + "hashes": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "72fd71de7675e7d0", + "hash_cont_tokens": "9b0ab02a64603081" + }, + "truncated": 0, + "non-truncated": 3132, + "padded": 3132, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_disputes|5": { + "hashes": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "f3ca0dd8e7a1eb09", + "hash_cont_tokens": "8badf768f7b0467a" + }, + "truncated": 0, + "non-truncated": 1384, + "padded": 1354, + "non-padded": 30, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hashes": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "3e793631e951f23c", + "hash_cont_tokens": "32ae620376b2bbba" + }, + "truncated": 0, + "non-truncated": 3580, + "padded": 3580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-nutrition|5": { + "hashes": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "59753c2144ea93af", + "hash_cont_tokens": "3071def75bacc404" + }, + "truncated": 0, + "non-truncated": 1224, + "padded": 1224, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-philosophy|5": { + "hashes": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "bd8d3dbed15a8c34", + "hash_cont_tokens": "9f6ff69d23a48783" + }, + "truncated": 0, + "non-truncated": 1244, + "padded": 1244, + 
"non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-prehistory|5": { + "hashes": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "3573cd87facbb7c5", + "hash_cont_tokens": "de469d2b981e32a3" + }, + "truncated": 0, + "non-truncated": 1296, + "padded": 1296, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_accounting|5": { + "hashes": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "17e721bc1a7cbb47", + "hash_cont_tokens": "c46f74d2dfc7b13b" + }, + "truncated": 0, + "non-truncated": 1128, + "padded": 1128, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_law|5": { + "hashes": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "9178e10bd0763ec4", + "hash_cont_tokens": "2e590029ef41fbcd" + }, + "truncated": 604, + "non-truncated": 5532, + "padded": 5524, + "non-padded": 612, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_medicine|5": { + "hashes": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "f5a22012a54f70ea", + "hash_cont_tokens": "fe35cfa9c6ca802e" + }, + "truncated": 0, + "non-truncated": 1088, + "padded": 1088, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_psychology|5": { + "hashes": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "0dfb73a8eb3f692c", + "hash_cont_tokens": "f020fbddf72c8652" + }, + "truncated": 0, + "non-truncated": 2448, + "padded": 2448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-public_relations|5": { + "hashes": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "1710c6ba4c9f3cbd", + "hash_cont_tokens": "568f585a259965c1" + }, + "truncated": 0, + "non-truncated": 440, + "padded": 440, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-security_studies|5": { + "hashes": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "d49711415961ced7", + "hash_cont_tokens": "cc6fd7cccd64cd5d" + }, + "truncated": 0, + "non-truncated": 980, + "padded": 980, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-sociology|5": { + "hashes": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "828999f7624cbe7e", + "hash_cont_tokens": "c3a3bdfd177eed5b" + }, + "truncated": 0, + "non-truncated": 804, + "padded": 804, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hashes": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "42054621e718dbee", + "hash_cont_tokens": "2568d0e8e36fa959" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-virology|5": { + "hashes": { + 
"hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "6c4f0aa4dc859c04", + "hash_cont_tokens": "926cf60b0891f374" + }, + "truncated": 0, + "non-truncated": 664, + "padded": 664, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-world_religions|5": { + "hashes": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "6c75d44e092ff24f", + "hash_cont_tokens": "c525a5de974c1ea3" + }, + "truncated": 0, + "non-truncated": 684, + "padded": 684, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|truthfulqa:mc|0": { + "hashes": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "2738d7ed7075faa7", + "hash_cont_tokens": "c014154380b74b9e" + }, + "truncated": 0, + "non-truncated": 9996, + "padded": 9996, + "non-padded": 0, + "effective_few_shots": 0.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "d84d18e9a963753d", + "hash_full_prompts": "12b540783521a8e6", + "hash_input_tokens": "6fecf578c508db6a", + "hash_cont_tokens": "fb1646e2bdd5fc38" + }, + "total_evaluation_time_secondes": "2597.8905625343323", + "truncated": 2088, + "non-truncated": 108931, + "padded": 108834, + "non-padded": 2185, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/TheBloke/landmark-attention-llama7b-fp16/results_2023-10-22T21-06-08.838189.json b/eval-results/TheBloke/landmark-attention-llama7b-fp16/results_2023-10-22T21-06-08.838189.json new file mode 100644 index 0000000000000000000000000000000000000000..86a9847a009622b77c64f432ea88fa8c7c1c0f4d --- /dev/null +++ b/eval-results/TheBloke/landmark-attention-llama7b-fp16/results_2023-10-22T21-06-08.838189.json @@ -0,0 +1,107 @@ +{ + "config_general": { + "model_name": "TheBloke/landmark-attention-llama7b-fp16", + "model_sha": "bf8bdcb0c30cceb0ceda33cf5fde683807e39a58", + "model_size": "12.58 GB", + "model_dtype": "torch.float16", + "lighteval_sha": "0f318ecf002208468154899217b3ba7c6ae09374", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|drop|3": { + "em": 0.0014681208053691276, + "em_stderr": 0.0003921042190298539, + "f1": 0.04697252516778534, + "f1_stderr": 0.0013361369387872978 + }, + "harness|gsm8k|5": { + "acc": 0.01592115238817286, + "acc_stderr": 0.0034478192723890015 + }, + "harness|winogrande|5": { + "acc": 0.6803472770323599, + "acc_stderr": 0.01310652851766513 + }, + "all": { + "em": 0.0014681208053691276, + "em_stderr": 0.0003921042190298539, + "f1": 0.04697252516778534, + "f1_stderr": 0.0013361369387872978, + "acc": 0.34813421471026634, + "acc_stderr": 0.008277173895027065 + } + }, + "versions": { + "harness|drop|3": 1, + "harness|gsm8k|5": 0, + "harness|winogrande|5": 0, + "all": 0 + }, + "config_tasks": { + "harness|drop": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|drop|3": { + "hashes": { + "hash_examples": "1d27416e8324e9a3", + "hash_full_prompts": "a5513ff9a741b385", + "hash_input_tokens": "61b608e0b5ceed76", + "hash_cont_tokens": "3fca0237d0121d2e" + }, + "truncated": 1263, + "non-truncated": 8273, + "padded": 0, + "non-padded": 9536, + "effective_few_shots": 3.0, + "num_truncated_few_shots": 0 + }, + 
"harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "bda342e47b5099b2", + "hash_cont_tokens": "ec365546485facc6" + }, + "truncated": 0, + "non-truncated": 1319, + "padded": 0, + "non-padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "c0bedf98cb040854", + "hash_cont_tokens": "f08975ad6f2d5864" + }, + "truncated": 0, + "non-truncated": 2534, + "padded": 2432, + "non-padded": 102, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "9b4d8993161e637d", + "hash_full_prompts": "08215e527b7e60a5", + "hash_input_tokens": "80afe720f936f8d2", + "hash_cont_tokens": "9db754233716ce8c" + }, + "total_evaluation_time_secondes": "22530.621866464615", + "truncated": 1263, + "non-truncated": 12126, + "padded": 2432, + "non-padded": 10957, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/TheBloke/llama-2-70b-Guanaco-QLoRA-fp16/results_2023-07-25T19-54-57.592623.json b/eval-results/TheBloke/llama-2-70b-Guanaco-QLoRA-fp16/results_2023-07-25T19-54-57.592623.json new file mode 100644 index 0000000000000000000000000000000000000000..786539c6ce19697055d73b498c385c54c52632df --- /dev/null +++ b/eval-results/TheBloke/llama-2-70b-Guanaco-QLoRA-fp16/results_2023-07-25T19-54-57.592623.json @@ -0,0 +1,1365 @@ +{ + "results": { + "harness|arc:challenge|25": { + "acc": 0.6407849829351536, + "acc_stderr": 0.014020224155839162, + "acc_norm": 0.6825938566552902, + "acc_norm_stderr": 0.013602239088038167 + }, + "harness|hellaswag|10": { + "acc": 0.6952798247361084, + "acc_stderr": 0.00459348111167722, + "acc_norm": 0.8831905994821748, + "acc_norm_stderr": 0.0032053660514213653 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.32, + "acc_stderr": 0.04688261722621504, + "acc_norm": 0.32, + "acc_norm_stderr": 0.04688261722621504 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.6370370370370371, + "acc_stderr": 0.041539484047424, + "acc_norm": 0.6370370370370371, + "acc_norm_stderr": 0.041539484047424 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.8026315789473685, + "acc_stderr": 0.03238981601699397, + "acc_norm": 0.8026315789473685, + "acc_norm_stderr": 0.03238981601699397 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.75, + "acc_stderr": 0.04351941398892446, + "acc_norm": 0.75, + "acc_norm_stderr": 0.04351941398892446 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.6981132075471698, + "acc_stderr": 0.02825420034443866, + "acc_norm": 0.6981132075471698, + "acc_norm_stderr": 0.02825420034443866 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.8194444444444444, + "acc_stderr": 0.03216600808802267, + "acc_norm": 0.8194444444444444, + "acc_norm_stderr": 0.03216600808802267 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.5, + "acc_stderr": 0.050251890762960605, + "acc_norm": 0.5, + "acc_norm_stderr": 0.050251890762960605 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.59, + "acc_stderr": 0.04943110704237102, + "acc_norm": 0.59, + "acc_norm_stderr": 0.04943110704237102 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.4, + "acc_stderr": 0.04923659639173309, + "acc_norm": 0.4, + "acc_norm_stderr": 0.04923659639173309 + }, + 
"harness|hendrycksTest-college_medicine|5": { + "acc": 0.6878612716763006, + "acc_stderr": 0.03533133389323657, + "acc_norm": 0.6878612716763006, + "acc_norm_stderr": 0.03533133389323657 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.38235294117647056, + "acc_stderr": 0.04835503696107223, + "acc_norm": 0.38235294117647056, + "acc_norm_stderr": 0.04835503696107223 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.78, + "acc_stderr": 0.041633319989322626, + "acc_norm": 0.78, + "acc_norm_stderr": 0.041633319989322626 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.6595744680851063, + "acc_stderr": 0.030976692998534443, + "acc_norm": 0.6595744680851063, + "acc_norm_stderr": 0.030976692998534443 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.4473684210526316, + "acc_stderr": 0.04677473004491199, + "acc_norm": 0.4473684210526316, + "acc_norm_stderr": 0.04677473004491199 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.6758620689655173, + "acc_stderr": 0.03900432069185555, + "acc_norm": 0.6758620689655173, + "acc_norm_stderr": 0.03900432069185555 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.4523809523809524, + "acc_stderr": 0.025634258115554955, + "acc_norm": 0.4523809523809524, + "acc_norm_stderr": 0.025634258115554955 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.4444444444444444, + "acc_stderr": 0.04444444444444449, + "acc_norm": 0.4444444444444444, + "acc_norm_stderr": 0.04444444444444449 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.54, + "acc_stderr": 0.05009082659620333, + "acc_norm": 0.54, + "acc_norm_stderr": 0.05009082659620333 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.8193548387096774, + "acc_stderr": 0.021886178567172534, + "acc_norm": 0.8193548387096774, + "acc_norm_stderr": 0.021886178567172534 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.5270935960591133, + "acc_stderr": 0.03512819077876106, + "acc_norm": 0.5270935960591133, + "acc_norm_stderr": 0.03512819077876106 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.76, + "acc_stderr": 0.04292346959909282, + "acc_norm": 0.76, + "acc_norm_stderr": 0.04292346959909282 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.8363636363636363, + "acc_stderr": 0.02888787239548795, + "acc_norm": 0.8363636363636363, + "acc_norm_stderr": 0.02888787239548795 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.8686868686868687, + "acc_stderr": 0.024063156416822516, + "acc_norm": 0.8686868686868687, + "acc_norm_stderr": 0.024063156416822516 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.927461139896373, + "acc_stderr": 0.018718998520678175, + "acc_norm": 0.927461139896373, + "acc_norm_stderr": 0.018718998520678175 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.717948717948718, + "acc_stderr": 0.022815813098896597, + "acc_norm": 0.717948717948718, + "acc_norm_stderr": 0.022815813098896597 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.34074074074074073, + "acc_stderr": 0.028897748741131147, + "acc_norm": 0.34074074074074073, + "acc_norm_stderr": 0.028897748741131147 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.773109243697479, + "acc_stderr": 0.027205371538279472, + "acc_norm": 0.773109243697479, + "acc_norm_stderr": 0.027205371538279472 + }, + 
"harness|hendrycksTest-high_school_physics|5": { + "acc": 0.5099337748344371, + "acc_stderr": 0.04081677107248437, + "acc_norm": 0.5099337748344371, + "acc_norm_stderr": 0.04081677107248437 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.8825688073394495, + "acc_stderr": 0.013802780227377342, + "acc_norm": 0.8825688073394495, + "acc_norm_stderr": 0.013802780227377342 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.6296296296296297, + "acc_stderr": 0.03293377139415191, + "acc_norm": 0.6296296296296297, + "acc_norm_stderr": 0.03293377139415191 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.9117647058823529, + "acc_stderr": 0.019907399791316952, + "acc_norm": 0.9117647058823529, + "acc_norm_stderr": 0.019907399791316952 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.8734177215189873, + "acc_stderr": 0.02164419572795517, + "acc_norm": 0.8734177215189873, + "acc_norm_stderr": 0.02164419572795517 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.8071748878923767, + "acc_stderr": 0.026478240960489365, + "acc_norm": 0.8071748878923767, + "acc_norm_stderr": 0.026478240960489365 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.8549618320610687, + "acc_stderr": 0.030884661089515375, + "acc_norm": 0.8549618320610687, + "acc_norm_stderr": 0.030884661089515375 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.8760330578512396, + "acc_stderr": 0.030083098716035202, + "acc_norm": 0.8760330578512396, + "acc_norm_stderr": 0.030083098716035202 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.8240740740740741, + "acc_stderr": 0.036809181416738807, + "acc_norm": 0.8240740740740741, + "acc_norm_stderr": 0.036809181416738807 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.8220858895705522, + "acc_stderr": 0.03004735765580663, + "acc_norm": 0.8220858895705522, + "acc_norm_stderr": 0.03004735765580663 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.5357142857142857, + "acc_stderr": 0.04733667890053756, + "acc_norm": 0.5357142857142857, + "acc_norm_stderr": 0.04733667890053756 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.8058252427184466, + "acc_stderr": 0.039166677628225836, + "acc_norm": 0.8058252427184466, + "acc_norm_stderr": 0.039166677628225836 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.905982905982906, + "acc_stderr": 0.01911989279892498, + "acc_norm": 0.905982905982906, + "acc_norm_stderr": 0.01911989279892498 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.7, + "acc_stderr": 0.046056618647183814, + "acc_norm": 0.7, + "acc_norm_stderr": 0.046056618647183814 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.855683269476373, + "acc_stderr": 0.012566417503320939, + "acc_norm": 0.855683269476373, + "acc_norm_stderr": 0.012566417503320939 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.7976878612716763, + "acc_stderr": 0.021628077380196117, + "acc_norm": 0.7976878612716763, + "acc_norm_stderr": 0.021628077380196117 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.5206703910614525, + "acc_stderr": 0.016708205559996137, + "acc_norm": 0.5206703910614525, + "acc_norm_stderr": 0.016708205559996137 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.7843137254901961, + "acc_stderr": 0.02355083135199509, + "acc_norm": 0.7843137254901961, + "acc_norm_stderr": 0.02355083135199509 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.7813504823151125, + 
"acc_stderr": 0.02347558141786111, + "acc_norm": 0.7813504823151125, + "acc_norm_stderr": 0.02347558141786111 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.8518518518518519, + "acc_stderr": 0.01976645956359726, + "acc_norm": 0.8518518518518519, + "acc_norm_stderr": 0.01976645956359726 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.5709219858156028, + "acc_stderr": 0.029525914302558562, + "acc_norm": 0.5709219858156028, + "acc_norm_stderr": 0.029525914302558562 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.5384615384615384, + "acc_stderr": 0.012732398286190431, + "acc_norm": 0.5384615384615384, + "acc_norm_stderr": 0.012732398286190431 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.7389705882352942, + "acc_stderr": 0.02667925227010313, + "acc_norm": 0.7389705882352942, + "acc_norm_stderr": 0.02667925227010313 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.7598039215686274, + "acc_stderr": 0.017282760695167404, + "acc_norm": 0.7598039215686274, + "acc_norm_stderr": 0.017282760695167404 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.7, + "acc_stderr": 0.04389311454644287, + "acc_norm": 0.7, + "acc_norm_stderr": 0.04389311454644287 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.8040816326530612, + "acc_stderr": 0.025409301953225678, + "acc_norm": 0.8040816326530612, + "acc_norm_stderr": 0.025409301953225678 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.8805970149253731, + "acc_stderr": 0.02292879327721974, + "acc_norm": 0.8805970149253731, + "acc_norm_stderr": 0.02292879327721974 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.92, + "acc_stderr": 0.0272659924344291, + "acc_norm": 0.92, + "acc_norm_stderr": 0.0272659924344291 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.5542168674698795, + "acc_stderr": 0.03869543323472101, + "acc_norm": 0.5542168674698795, + "acc_norm_stderr": 0.03869543323472101 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.8771929824561403, + "acc_stderr": 0.02517298435015575, + "acc_norm": 0.8771929824561403, + "acc_norm_stderr": 0.02517298435015575 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.39657282741738065, + "mc1_stderr": 0.017124930942023518, + "mc2": 0.5569422346576033, + "mc2_stderr": 0.014723503655435666 + }, + "all": { + "acc": 0.701167209383954, + "acc_stderr": 0.030973312656304777, + "acc_norm": 0.7050607627477882, + "acc_norm_stderr": 0.030942700789727547, + "mc1": 0.39657282741738065, + "mc1_stderr": 0.017124930942023518, + "mc2": 0.5569422346576033, + "mc2_stderr": 0.014723503655435666 + } + }, + "versions": { + "harness|arc:challenge|25": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + 
"harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "all": 0 + }, + "config_general": { + "model_name": "TheBloke/llama-2-70b-Guanaco-QLoRA-fp16", + "model_sha": "54b0e39d5e9aee7b323f50b0a26db15295c3d5c9", + "model_dtype": "torch.float16", + "lighteval_sha": "03c2fad20ff7f5334c33cfee459024b8d7e4a109", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null + }, + "config_tasks": { + "harness|arc:challenge": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + 
"harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task" + }, + "summary_tasks": { + "harness|arc:challenge|25": { + "hashes": { + "hash_examples": "17b0cae357c0259e", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "61571bf68d6d89aa", + "hash_cont_tokens": "ede2b335438f08e9" + }, + "truncated": 0, + "non-truncated": 4687, + "padded": 4687, + "non-padded": 0, + "effective_few_shots": 25.0, + "num_truncated_few_shots": 0 + }, + "harness|hellaswag|10": { + "hashes": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "29906669b1c7054a", + "hash_cont_tokens": "b41cf1ad182d68d5" + }, + "truncated": 0, + "non-truncated": 40168, + "padded": 40113, + "non-padded": 55, + "effective_few_shots": 10.0, + "num_truncated_few_shots": 0 + }, + 
"harness|hendrycksTest-abstract_algebra|5": { + "hashes": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "c54ff61ad0273dd7", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-anatomy|5": { + "hashes": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "be31a1e22aef5f90", + "hash_cont_tokens": "f11971a765cb609f" + }, + "truncated": 0, + "non-truncated": 540, + "padded": 540, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-astronomy|5": { + "hashes": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "277a7b1fad566940", + "hash_cont_tokens": "238bd86950544b29" + }, + "truncated": 0, + "non-truncated": 608, + "padded": 608, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-business_ethics|5": { + "hashes": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "ba552605bc116de5", + "hash_cont_tokens": "f9d6d2a7d7e9a041" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hashes": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "428c7563d0b98ab9", + "hash_cont_tokens": "6af58623d0d5fbcd" + }, + "truncated": 0, + "non-truncated": 1060, + "padded": 1060, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_biology|5": { + "hashes": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "da036601573942e2", + "hash_cont_tokens": "875cde3af7a0ee14" + }, + "truncated": 0, + "non-truncated": 576, + "padded": 576, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_chemistry|5": { + "hashes": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "94e0196d6aded13d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_computer_science|5": { + "hashes": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "6e4d0f4a8d36690b", + "hash_cont_tokens": "1ba0c71186b1505e" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_mathematics|5": { + "hashes": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "614054d17109a25d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_medicine|5": { + "hashes": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + 
"hash_input_tokens": "1d633b3cc0524ba8", + "hash_cont_tokens": "702fb6d82ff0d6ac" + }, + "truncated": 0, + "non-truncated": 692, + "padded": 692, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_physics|5": { + "hashes": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "5421d9a1af86cbd4", + "hash_cont_tokens": "f7b8097afc16a47c" + }, + "truncated": 0, + "non-truncated": 408, + "padded": 408, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-computer_security|5": { + "hashes": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "5e6b70ecb333cf18", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hashes": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "c2ef11a87264ceed", + "hash_cont_tokens": "aa0e8bc655f2f641" + }, + "truncated": 0, + "non-truncated": 940, + "padded": 940, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-econometrics|5": { + "hashes": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "ecaccd912a4c3978", + "hash_cont_tokens": "a9b1f761089f6acc" + }, + "truncated": 0, + "non-truncated": 456, + "padded": 456, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hashes": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "1590c84291399be8", + "hash_cont_tokens": "2425a3f084a591ef" + }, + "truncated": 0, + "non-truncated": 580, + "padded": 580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hashes": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "3269597f715b0da1", + "hash_cont_tokens": "eb2d5002052b5bc5" + }, + "truncated": 0, + "non-truncated": 1512, + "padded": 1512, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-formal_logic|5": { + "hashes": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "a2800d20f3ab8d7c", + "hash_cont_tokens": "9b30dc19c9b62f60" + }, + "truncated": 0, + "non-truncated": 504, + "padded": 504, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-global_facts|5": { + "hashes": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "94ed44b3772505ad", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_biology|5": { + "hashes": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "24423acb928db768", + "hash_cont_tokens": "74217a4e2868536f" + }, + "truncated": 0, + "non-truncated": 1240, + "padded": 1240, + 
"non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hashes": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "831ff35c474e5cef", + "hash_cont_tokens": "bf39544be0ebf000" + }, + "truncated": 0, + "non-truncated": 812, + "padded": 812, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hashes": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "8c34e0f2bda77358", + "hash_cont_tokens": "43570b3948564b64" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hashes": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "f1f73dd687da18d7", + "hash_cont_tokens": "674fc454bdc5ac93" + }, + "truncated": 660, + "non-truncated": 0, + "padded": 0, + "non-padded": 660, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_geography|5": { + "hashes": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "7c5547c7da5bc793", + "hash_cont_tokens": "03a5012b916274ea" + }, + "truncated": 0, + "non-truncated": 792, + "padded": 792, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hashes": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "f62991cb6a496b05", + "hash_cont_tokens": "50ab225c2f535210" + }, + "truncated": 0, + "non-truncated": 772, + "padded": 772, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hashes": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "4cef2aff6e3d59ed", + "hash_cont_tokens": "c583432ad27fcfe0" + }, + "truncated": 0, + "non-truncated": 1560, + "padded": 1560, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hashes": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "6e2577ea4082ed2b", + "hash_cont_tokens": "1194078d4e38c984" + }, + "truncated": 0, + "non-truncated": 1080, + "padded": 1080, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hashes": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "c5fc9aeb1079c8e4", + "hash_cont_tokens": "f47f041de50333b9" + }, + "truncated": 0, + "non-truncated": 952, + "padded": 952, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_physics|5": { + "hashes": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "555fc385cffa84ca", + "hash_cont_tokens": "6296151cf7fee15c" + }, + "truncated": 0, + "non-truncated": 604, + "padded": 604, + "non-padded": 0, + "effective_few_shots": 5.0, + 
"num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hashes": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "febd23cbf9973b7f", + "hash_cont_tokens": "a490d3db0ea5935a" + }, + "truncated": 0, + "non-truncated": 2180, + "padded": 2180, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hashes": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "424b02981230ee83", + "hash_cont_tokens": "6830ef7d0325d7ef" + }, + "truncated": 0, + "non-truncated": 864, + "padded": 864, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hashes": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "50c9ff438c85a69e", + "hash_cont_tokens": "cdd0b3dc06d933e5" + }, + "truncated": 816, + "non-truncated": 0, + "padded": 0, + "non-padded": 816, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hashes": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "054824cc474caef5", + "hash_cont_tokens": "e0203e3fc1bb0500" + }, + "truncated": 8, + "non-truncated": 940, + "padded": 940, + "non-padded": 8, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_aging|5": { + "hashes": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "541a75f071dcf579", + "hash_cont_tokens": "142a4a8a1138a214" + }, + "truncated": 0, + "non-truncated": 892, + "padded": 892, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_sexuality|5": { + "hashes": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "04269e5c5a257dd9", + "hash_cont_tokens": "bc54813e809b796d" + }, + "truncated": 0, + "non-truncated": 524, + "padded": 524, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-international_law|5": { + "hashes": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "d93ba9d9d38e4397", + "hash_cont_tokens": "63435df622d5437b" + }, + "truncated": 0, + "non-truncated": 484, + "padded": 484, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-jurisprudence|5": { + "hashes": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "9eeaccd2698b4f5a", + "hash_cont_tokens": "e3a8cd951b6e3469" + }, + "truncated": 0, + "non-truncated": 432, + "padded": 432, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hashes": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "b4f08f544f2b7576", + "hash_cont_tokens": "5e6ee2ff0404f23c" + }, + "truncated": 0, + "non-truncated": 652, + "padded": 648, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-machine_learning|5": { + "hashes": { + "hash_examples": "88f22a636029ae47", + 
"hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "900c2a51f1174b9f", + "hash_cont_tokens": "c81919424db3b267" + }, + "truncated": 0, + "non-truncated": 448, + "padded": 448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-management|5": { + "hashes": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "6b36efb4689c6eca", + "hash_cont_tokens": "a01d6d39a83c4597" + }, + "truncated": 0, + "non-truncated": 412, + "padded": 412, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-marketing|5": { + "hashes": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "2aaac78a0cfed47a", + "hash_cont_tokens": "6aeaed4d823c98aa" + }, + "truncated": 0, + "non-truncated": 936, + "padded": 936, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-medical_genetics|5": { + "hashes": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "886ca823b41c094a", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-miscellaneous|5": { + "hashes": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "72fd71de7675e7d0", + "hash_cont_tokens": "9b0ab02a64603081" + }, + "truncated": 0, + "non-truncated": 3132, + "padded": 3132, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_disputes|5": { + "hashes": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "f3ca0dd8e7a1eb09", + "hash_cont_tokens": "3b8bbe9108e55ce9" + }, + "truncated": 0, + "non-truncated": 1384, + "padded": 1354, + "non-padded": 30, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hashes": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "3e793631e951f23c", + "hash_cont_tokens": "2eae753a177d5460" + }, + "truncated": 0, + "non-truncated": 3580, + "padded": 3580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-nutrition|5": { + "hashes": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "59753c2144ea93af", + "hash_cont_tokens": "29771089bd3c65c6" + }, + "truncated": 0, + "non-truncated": 1224, + "padded": 1224, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-philosophy|5": { + "hashes": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "bd8d3dbed15a8c34", + "hash_cont_tokens": "9f6ff69d23a48783" + }, + "truncated": 0, + "non-truncated": 1244, + "padded": 1244, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-prehistory|5": { + "hashes": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "3573cd87facbb7c5", + "hash_cont_tokens": "a789a13af22308bf" + }, + "truncated": 0, + "non-truncated": 1296, + "padded": 
1296, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_accounting|5": { + "hashes": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "17e721bc1a7cbb47", + "hash_cont_tokens": "5129a9cfb30c5239" + }, + "truncated": 0, + "non-truncated": 1128, + "padded": 1128, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_law|5": { + "hashes": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "9178e10bd0763ec4", + "hash_cont_tokens": "2e590029ef41fbcd" + }, + "truncated": 604, + "non-truncated": 5532, + "padded": 5524, + "non-padded": 612, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_medicine|5": { + "hashes": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "f5a22012a54f70ea", + "hash_cont_tokens": "cd82e108370cece8" + }, + "truncated": 0, + "non-truncated": 1088, + "padded": 1088, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_psychology|5": { + "hashes": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "0dfb73a8eb3f692c", + "hash_cont_tokens": "61ef0c8a87f9c92d" + }, + "truncated": 0, + "non-truncated": 2448, + "padded": 2448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-public_relations|5": { + "hashes": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "1710c6ba4c9f3cbd", + "hash_cont_tokens": "568f585a259965c1" + }, + "truncated": 0, + "non-truncated": 440, + "padded": 440, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-security_studies|5": { + "hashes": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "d49711415961ced7", + "hash_cont_tokens": "d70cfe096d4fb7bd" + }, + "truncated": 0, + "non-truncated": 980, + "padded": 980, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-sociology|5": { + "hashes": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "828999f7624cbe7e", + "hash_cont_tokens": "c3a3bdfd177eed5b" + }, + "truncated": 0, + "non-truncated": 804, + "padded": 804, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hashes": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "42054621e718dbee", + "hash_cont_tokens": "2568d0e8e36fa959" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-virology|5": { + "hashes": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "6c4f0aa4dc859c04", + "hash_cont_tokens": "c178cccd753d9bc5" + }, + "truncated": 0, + "non-truncated": 664, + "padded": 664, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-world_religions|5": { + 
"hashes": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "6c75d44e092ff24f", + "hash_cont_tokens": "0a3a3ea5ef49d19c" + }, + "truncated": 0, + "non-truncated": 684, + "padded": 684, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|truthfulqa:mc|0": { + "hashes": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "2738d7ed7075faa7", + "hash_cont_tokens": "6d1691881e252df0" + }, + "truncated": 0, + "non-truncated": 9996, + "padded": 9996, + "non-padded": 0, + "effective_few_shots": 0.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "d84d18e9a963753d", + "hash_full_prompts": "12b540783521a8e6", + "hash_input_tokens": "6fecf578c508db6a", + "hash_cont_tokens": "f4b7b7f3a2788768" + }, + "total_evaluation_time_secondes": "26432.39414000511", + "truncated": 2088, + "non-truncated": 108931, + "padded": 108834, + "non-padded": 2185, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/TheBloke/llama-2-70b-Guanaco-QLoRA-fp16/results_2023-10-22T03-53-16.698758.json b/eval-results/TheBloke/llama-2-70b-Guanaco-QLoRA-fp16/results_2023-10-22T03-53-16.698758.json new file mode 100644 index 0000000000000000000000000000000000000000..7913a4aed594407a929089d6a750cae384a017ca --- /dev/null +++ b/eval-results/TheBloke/llama-2-70b-Guanaco-QLoRA-fp16/results_2023-10-22T03-53-16.698758.json @@ -0,0 +1,107 @@ +{ + "config_general": { + "model_name": "TheBloke/llama-2-70b-Guanaco-QLoRA-fp16", + "model_sha": "8fbc71bdecdb55fceb0a7ea093d7c48730217abb", + "model_size": "128.56 GB", + "model_dtype": "torch.float16", + "lighteval_sha": "0f318ecf002208468154899217b3ba7c6ae09374", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|drop|3": { + "em": 0.05620805369127517, + "em_stderr": 0.0023587236332230886, + "f1": 0.11980180369127513, + "f1_stderr": 0.002592264922824749 + }, + "harness|gsm8k|5": { + "acc": 0.2979529946929492, + "acc_stderr": 0.012597932232914513 + }, + "harness|winogrande|5": { + "acc": 0.8397790055248618, + "acc_stderr": 0.010309209498187472 + }, + "all": { + "em": 0.05620805369127517, + "em_stderr": 0.0023587236332230886, + "f1": 0.11980180369127513, + "f1_stderr": 0.002592264922824749, + "acc": 0.5688660001089055, + "acc_stderr": 0.011453570865550992 + } + }, + "versions": { + "harness|drop|3": 1, + "harness|gsm8k|5": 0, + "harness|winogrande|5": 0, + "all": 0 + }, + "config_tasks": { + "harness|drop": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|drop|3": { + "hashes": { + "hash_examples": "1d27416e8324e9a3", + "hash_full_prompts": "a5513ff9a741b385", + "hash_input_tokens": "61b608e0b5ceed76", + "hash_cont_tokens": "68e509b434a844ca" + }, + "truncated": 1263, + "non-truncated": 8273, + "padded": 0, + "non-padded": 9536, + "effective_few_shots": 3.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "bda342e47b5099b2", + "hash_cont_tokens": "94535b697c2b9de9" + }, + "truncated": 0, + "non-truncated": 1319, + "padded": 0, + "non-padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + 
"hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "c0bedf98cb040854", + "hash_cont_tokens": "f08975ad6f2d5864" + }, + "truncated": 0, + "non-truncated": 2534, + "padded": 2432, + "non-padded": 102, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "9b4d8993161e637d", + "hash_full_prompts": "08215e527b7e60a5", + "hash_input_tokens": "80afe720f936f8d2", + "hash_cont_tokens": "9b4698ec7cff45c8" + }, + "total_evaluation_time_secondes": "44377.45410966873", + "truncated": 1263, + "non-truncated": 12126, + "padded": 2432, + "non-padded": 10957, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/TheBloke/llama-30b-supercot-SuperHOT-8K-fp16/results_2023-08-01T15-49-06.725548.json b/eval-results/TheBloke/llama-30b-supercot-SuperHOT-8K-fp16/results_2023-08-01T15-49-06.725548.json new file mode 100644 index 0000000000000000000000000000000000000000..d151b776e17359d1edad5d4637eaa84e2cc74033 --- /dev/null +++ b/eval-results/TheBloke/llama-30b-supercot-SuperHOT-8K-fp16/results_2023-08-01T15-49-06.725548.json @@ -0,0 +1,1365 @@ +{ + "results": { + "harness|arc:challenge|25": { + "acc": 0.22866894197952217, + "acc_stderr": 0.012272853582540806, + "acc_norm": 0.25853242320819114, + "acc_norm_stderr": 0.012794553754288686 + }, + "harness|hellaswag|10": { + "acc": 0.2740489942242581, + "acc_stderr": 0.004451222241494048, + "acc_norm": 0.3053176658036248, + "acc_norm_stderr": 0.004596006250433552 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.22, + "acc_stderr": 0.04163331998932268, + "acc_norm": 0.22, + "acc_norm_stderr": 0.04163331998932268 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.18518518518518517, + "acc_stderr": 0.03355677216313142, + "acc_norm": 0.18518518518518517, + "acc_norm_stderr": 0.03355677216313142 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.18421052631578946, + "acc_stderr": 0.0315469804508223, + "acc_norm": 0.18421052631578946, + "acc_norm_stderr": 0.0315469804508223 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.32, + "acc_stderr": 0.046882617226215034, + "acc_norm": 0.32, + "acc_norm_stderr": 0.046882617226215034 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.21509433962264152, + "acc_stderr": 0.02528839450289137, + "acc_norm": 0.21509433962264152, + "acc_norm_stderr": 0.02528839450289137 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.2569444444444444, + "acc_stderr": 0.03653946969442099, + "acc_norm": 0.2569444444444444, + "acc_norm_stderr": 0.03653946969442099 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.21, + "acc_stderr": 0.04093601807403326, + "acc_norm": 0.21, + "acc_norm_stderr": 0.04093601807403326 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.26, + "acc_stderr": 0.0440844002276808, + "acc_norm": 0.26, + "acc_norm_stderr": 0.0440844002276808 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.21, + "acc_stderr": 0.040936018074033256, + "acc_norm": 0.21, + "acc_norm_stderr": 0.040936018074033256 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.20809248554913296, + "acc_stderr": 0.030952890217749874, + "acc_norm": 0.20809248554913296, + "acc_norm_stderr": 0.030952890217749874 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.21568627450980393, + "acc_stderr": 0.04092563958237654, + "acc_norm": 0.21568627450980393, + 
"acc_norm_stderr": 0.04092563958237654 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.28, + "acc_stderr": 0.04512608598542127, + "acc_norm": 0.28, + "acc_norm_stderr": 0.04512608598542127 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.26382978723404255, + "acc_stderr": 0.028809989854102973, + "acc_norm": 0.26382978723404255, + "acc_norm_stderr": 0.028809989854102973 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.23684210526315788, + "acc_stderr": 0.039994238792813365, + "acc_norm": 0.23684210526315788, + "acc_norm_stderr": 0.039994238792813365 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.22758620689655173, + "acc_stderr": 0.03493950380131184, + "acc_norm": 0.22758620689655173, + "acc_norm_stderr": 0.03493950380131184 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.21428571428571427, + "acc_stderr": 0.021132859182754444, + "acc_norm": 0.21428571428571427, + "acc_norm_stderr": 0.021132859182754444 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.2857142857142857, + "acc_stderr": 0.04040610178208841, + "acc_norm": 0.2857142857142857, + "acc_norm_stderr": 0.04040610178208841 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.18, + "acc_stderr": 0.038612291966536934, + "acc_norm": 0.18, + "acc_norm_stderr": 0.038612291966536934 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.24838709677419354, + "acc_stderr": 0.02458002892148101, + "acc_norm": 0.24838709677419354, + "acc_norm_stderr": 0.02458002892148101 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.18226600985221675, + "acc_stderr": 0.02716334085964515, + "acc_norm": 0.18226600985221675, + "acc_norm_stderr": 0.02716334085964515 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.25, + "acc_stderr": 0.04351941398892446, + "acc_norm": 0.25, + "acc_norm_stderr": 0.04351941398892446 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.21818181818181817, + "acc_stderr": 0.03225078108306289, + "acc_norm": 0.21818181818181817, + "acc_norm_stderr": 0.03225078108306289 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.18181818181818182, + "acc_stderr": 0.027479603010538794, + "acc_norm": 0.18181818181818182, + "acc_norm_stderr": 0.027479603010538794 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.24352331606217617, + "acc_stderr": 0.030975436386845426, + "acc_norm": 0.24352331606217617, + "acc_norm_stderr": 0.030975436386845426 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.2205128205128205, + "acc_stderr": 0.021020672680827912, + "acc_norm": 0.2205128205128205, + "acc_norm_stderr": 0.021020672680827912 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.2222222222222222, + "acc_stderr": 0.02534809746809783, + "acc_norm": 0.2222222222222222, + "acc_norm_stderr": 0.02534809746809783 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.22268907563025211, + "acc_stderr": 0.02702543349888238, + "acc_norm": 0.22268907563025211, + "acc_norm_stderr": 0.02702543349888238 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.17880794701986755, + "acc_stderr": 0.031287448506007245, + "acc_norm": 0.17880794701986755, + "acc_norm_stderr": 0.031287448506007245 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.1761467889908257, + "acc_stderr": 0.01633288239343138, + "acc_norm": 0.1761467889908257, + 
"acc_norm_stderr": 0.01633288239343138 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.18981481481481483, + "acc_stderr": 0.026744714834691926, + "acc_norm": 0.18981481481481483, + "acc_norm_stderr": 0.026744714834691926 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.25, + "acc_stderr": 0.03039153369274154, + "acc_norm": 0.25, + "acc_norm_stderr": 0.03039153369274154 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.270042194092827, + "acc_stderr": 0.028900721906293426, + "acc_norm": 0.270042194092827, + "acc_norm_stderr": 0.028900721906293426 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.32286995515695066, + "acc_stderr": 0.03138147637575499, + "acc_norm": 0.32286995515695066, + "acc_norm_stderr": 0.03138147637575499 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.2595419847328244, + "acc_stderr": 0.03844876139785271, + "acc_norm": 0.2595419847328244, + "acc_norm_stderr": 0.03844876139785271 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.2396694214876033, + "acc_stderr": 0.03896878985070417, + "acc_norm": 0.2396694214876033, + "acc_norm_stderr": 0.03896878985070417 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.25925925925925924, + "acc_stderr": 0.042365112580946336, + "acc_norm": 0.25925925925925924, + "acc_norm_stderr": 0.042365112580946336 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.2085889570552147, + "acc_stderr": 0.03192193448934722, + "acc_norm": 0.2085889570552147, + "acc_norm_stderr": 0.03192193448934722 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.3125, + "acc_stderr": 0.043994650575715215, + "acc_norm": 0.3125, + "acc_norm_stderr": 0.043994650575715215 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.17475728155339806, + "acc_stderr": 0.037601780060266224, + "acc_norm": 0.17475728155339806, + "acc_norm_stderr": 0.037601780060266224 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.2905982905982906, + "acc_stderr": 0.02974504857267404, + "acc_norm": 0.2905982905982906, + "acc_norm_stderr": 0.02974504857267404 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.3, + "acc_stderr": 0.046056618647183814, + "acc_norm": 0.3, + "acc_norm_stderr": 0.046056618647183814 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.23116219667943805, + "acc_stderr": 0.015075523238101091, + "acc_norm": 0.23116219667943805, + "acc_norm_stderr": 0.015075523238101091 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.24855491329479767, + "acc_stderr": 0.023267528432100174, + "acc_norm": 0.24855491329479767, + "acc_norm_stderr": 0.023267528432100174 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.23798882681564246, + "acc_stderr": 0.014242630070574915, + "acc_norm": 0.23798882681564246, + "acc_norm_stderr": 0.014242630070574915 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.25163398692810457, + "acc_stderr": 0.0248480182638752, + "acc_norm": 0.25163398692810457, + "acc_norm_stderr": 0.0248480182638752 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.1864951768488746, + "acc_stderr": 0.02212243977248077, + "acc_norm": 0.1864951768488746, + "acc_norm_stderr": 0.02212243977248077 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.2191358024691358, + "acc_stderr": 0.0230167056402622, + "acc_norm": 0.2191358024691358, + "acc_norm_stderr": 0.0230167056402622 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.23049645390070922, + 
"acc_stderr": 0.025123739226872405, + "acc_norm": 0.23049645390070922, + "acc_norm_stderr": 0.025123739226872405 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.2457627118644068, + "acc_stderr": 0.010996156635142692, + "acc_norm": 0.2457627118644068, + "acc_norm_stderr": 0.010996156635142692 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.18382352941176472, + "acc_stderr": 0.023529242185193106, + "acc_norm": 0.18382352941176472, + "acc_norm_stderr": 0.023529242185193106 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.25, + "acc_stderr": 0.01751781884501444, + "acc_norm": 0.25, + "acc_norm_stderr": 0.01751781884501444 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.21818181818181817, + "acc_stderr": 0.03955932861795833, + "acc_norm": 0.21818181818181817, + "acc_norm_stderr": 0.03955932861795833 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.18775510204081633, + "acc_stderr": 0.02500025603954621, + "acc_norm": 0.18775510204081633, + "acc_norm_stderr": 0.02500025603954621 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.24378109452736318, + "acc_stderr": 0.03036049015401465, + "acc_norm": 0.24378109452736318, + "acc_norm_stderr": 0.03036049015401465 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.26, + "acc_stderr": 0.044084400227680794, + "acc_norm": 0.26, + "acc_norm_stderr": 0.044084400227680794 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.28313253012048195, + "acc_stderr": 0.03507295431370518, + "acc_norm": 0.28313253012048195, + "acc_norm_stderr": 0.03507295431370518 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.3216374269005848, + "acc_stderr": 0.03582529442573122, + "acc_norm": 0.3216374269005848, + "acc_norm_stderr": 0.03582529442573122 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.2350061199510404, + "mc1_stderr": 0.014843061507731608, + "mc2": 0.4704454489388094, + "mc2_stderr": 0.016777097412683316 + }, + "all": { + "acc": 0.23555810672922425, + "acc_stderr": 0.030884313140032385, + "acc_norm": 0.2365942449124113, + "acc_norm_stderr": 0.030895609482077938, + "mc1": 0.2350061199510404, + "mc1_stderr": 0.014843061507731608, + "mc2": 0.4704454489388094, + "mc2_stderr": 0.016777097412683316 + } + }, + "versions": { + "harness|arc:challenge|25": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + 
"harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "all": 0 + }, + "config_general": { + "model_name": "TheBloke/llama-30b-supercot-SuperHOT-8K-fp16", + "model_sha": "7efdff78a90132c1c66e1d27518ad7cbadffa139", + "model_dtype": "torch.float16", + "lighteval_sha": "03c2fad20ff7f5334c33cfee459024b8d7e4a109", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null + }, + "config_tasks": { + "harness|arc:challenge": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + 
"harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task" + }, + "summary_tasks": { + "harness|arc:challenge|25": { + "hashes": { + "hash_examples": "17b0cae357c0259e", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "3722289b79076c44", + "hash_cont_tokens": "8210decc6ff6f7df" + }, + "truncated": 0, + "non-truncated": 4687, + "padded": 4687, + "non-padded": 0, + "effective_few_shots": 25.0, + "num_truncated_few_shots": 0 + }, + "harness|hellaswag|10": { + "hashes": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "ececd684171f1ef2", + "hash_cont_tokens": "b3b9e9017afa63af" + }, + "truncated": 0, + "non-truncated": 40168, + "padded": 40113, + "non-padded": 55, + "effective_few_shots": 10.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hashes": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "c54ff61ad0273dd7", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-anatomy|5": { + "hashes": { + 
"hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "be31a1e22aef5f90", + "hash_cont_tokens": "f11971a765cb609f" + }, + "truncated": 0, + "non-truncated": 540, + "padded": 540, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-astronomy|5": { + "hashes": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "277a7b1fad566940", + "hash_cont_tokens": "bf30e5d3f48250cb" + }, + "truncated": 0, + "non-truncated": 608, + "padded": 608, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-business_ethics|5": { + "hashes": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "ba552605bc116de5", + "hash_cont_tokens": "bc1dd9b2d995eb61" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hashes": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "428c7563d0b98ab9", + "hash_cont_tokens": "890a119624b3b935" + }, + "truncated": 0, + "non-truncated": 1060, + "padded": 1060, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_biology|5": { + "hashes": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "da036601573942e2", + "hash_cont_tokens": "875cde3af7a0ee14" + }, + "truncated": 0, + "non-truncated": 576, + "padded": 576, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_chemistry|5": { + "hashes": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "94e0196d6aded13d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_computer_science|5": { + "hashes": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "6e4d0f4a8d36690b", + "hash_cont_tokens": "ffc0fe414cdc4a83" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_mathematics|5": { + "hashes": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "614054d17109a25d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_medicine|5": { + "hashes": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "081bb2b524defd1c", + "hash_cont_tokens": "1f88b00d41957d82" + }, + "truncated": 0, + "non-truncated": 692, + "padded": 692, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_physics|5": { + "hashes": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "5421d9a1af86cbd4", + "hash_cont_tokens": 
"f7b8097afc16a47c" + }, + "truncated": 0, + "non-truncated": 408, + "padded": 408, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-computer_security|5": { + "hashes": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "5e6b70ecb333cf18", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hashes": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "c2ef11a87264ceed", + "hash_cont_tokens": "aa0e8bc655f2f641" + }, + "truncated": 0, + "non-truncated": 940, + "padded": 940, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-econometrics|5": { + "hashes": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "ecaccd912a4c3978", + "hash_cont_tokens": "bfb7e3c3c88313f1" + }, + "truncated": 0, + "non-truncated": 456, + "padded": 456, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hashes": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "1590c84291399be8", + "hash_cont_tokens": "2425a3f084a591ef" + }, + "truncated": 0, + "non-truncated": 580, + "padded": 580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hashes": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "3269597f715b0da1", + "hash_cont_tokens": "f52691aef15a407b" + }, + "truncated": 0, + "non-truncated": 1512, + "padded": 1512, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-formal_logic|5": { + "hashes": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "a2800d20f3ab8d7c", + "hash_cont_tokens": "f515d598d9c21263" + }, + "truncated": 0, + "non-truncated": 504, + "padded": 504, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-global_facts|5": { + "hashes": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "94ed44b3772505ad", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_biology|5": { + "hashes": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "24423acb928db768", + "hash_cont_tokens": "bd85a4156a3613ee" + }, + "truncated": 0, + "non-truncated": 1240, + "padded": 1240, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hashes": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "831ff35c474e5cef", + "hash_cont_tokens": "a95c97af1c14e068" + }, + "truncated": 0, + "non-truncated": 812, + "padded": 812, + "non-padded": 0, + "effective_few_shots": 5.0, + 
"num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hashes": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "a20a96b44dcc5b30", + "hash_cont_tokens": "8abfedef914e33c9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hashes": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "5002f4ac8b1562ca", + "hash_cont_tokens": "674fc454bdc5ac93" + }, + "truncated": 0, + "non-truncated": 660, + "padded": 656, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_geography|5": { + "hashes": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "7c5547c7da5bc793", + "hash_cont_tokens": "03a5012b916274ea" + }, + "truncated": 0, + "non-truncated": 792, + "padded": 792, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hashes": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "f62991cb6a496b05", + "hash_cont_tokens": "a83effb8f76b7d7c" + }, + "truncated": 0, + "non-truncated": 772, + "padded": 772, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hashes": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "4cef2aff6e3d59ed", + "hash_cont_tokens": "c583432ad27fcfe0" + }, + "truncated": 0, + "non-truncated": 1560, + "padded": 1560, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hashes": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "6e2577ea4082ed2b", + "hash_cont_tokens": "24f5dc613660300b" + }, + "truncated": 0, + "non-truncated": 1080, + "padded": 1080, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hashes": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "c5fc9aeb1079c8e4", + "hash_cont_tokens": "f47f041de50333b9" + }, + "truncated": 0, + "non-truncated": 952, + "padded": 952, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_physics|5": { + "hashes": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "555fc385cffa84ca", + "hash_cont_tokens": "ba2efcd283e938cc" + }, + "truncated": 0, + "non-truncated": 604, + "padded": 604, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hashes": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "febd23cbf9973b7f", + "hash_cont_tokens": "942069cd363844d9" + }, + "truncated": 0, + "non-truncated": 2180, + "padded": 2180, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + 
"harness|hendrycksTest-high_school_statistics|5": { + "hashes": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "400e55b56ee6fbd7", + "hash_cont_tokens": "955ed42b6f7fa019" + }, + "truncated": 0, + "non-truncated": 864, + "padded": 864, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hashes": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "c639cce12a46ebad", + "hash_cont_tokens": "cdd0b3dc06d933e5" + }, + "truncated": 0, + "non-truncated": 816, + "padded": 816, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hashes": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "b9762065cce6f3a6", + "hash_cont_tokens": "9a864184946033ac" + }, + "truncated": 0, + "non-truncated": 948, + "padded": 948, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_aging|5": { + "hashes": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "541a75f071dcf579", + "hash_cont_tokens": "142a4a8a1138a214" + }, + "truncated": 0, + "non-truncated": 892, + "padded": 892, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_sexuality|5": { + "hashes": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "04269e5c5a257dd9", + "hash_cont_tokens": "bc54813e809b796d" + }, + "truncated": 0, + "non-truncated": 524, + "padded": 524, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-international_law|5": { + "hashes": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "d93ba9d9d38e4397", + "hash_cont_tokens": "dc45b45fcda18e5d" + }, + "truncated": 0, + "non-truncated": 484, + "padded": 484, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-jurisprudence|5": { + "hashes": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "9eeaccd2698b4f5a", + "hash_cont_tokens": "e3a8cd951b6e3469" + }, + "truncated": 0, + "non-truncated": 432, + "padded": 432, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hashes": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "b4f08f544f2b7576", + "hash_cont_tokens": "1e80dbd30f6453d5" + }, + "truncated": 0, + "non-truncated": 652, + "padded": 648, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-machine_learning|5": { + "hashes": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "900c2a51f1174b9f", + "hash_cont_tokens": "9b37da7777378ca9" + }, + "truncated": 0, + "non-truncated": 448, + "padded": 448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-management|5": { + "hashes": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + 
"hash_input_tokens": "6b36efb4689c6eca", + "hash_cont_tokens": "a01d6d39a83c4597" + }, + "truncated": 0, + "non-truncated": 412, + "padded": 412, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-marketing|5": { + "hashes": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "2aaac78a0cfed47a", + "hash_cont_tokens": "6aeaed4d823c98aa" + }, + "truncated": 0, + "non-truncated": 936, + "padded": 936, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-medical_genetics|5": { + "hashes": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "886ca823b41c094a", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-miscellaneous|5": { + "hashes": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "72fd71de7675e7d0", + "hash_cont_tokens": "9b0ab02a64603081" + }, + "truncated": 0, + "non-truncated": 3132, + "padded": 3132, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_disputes|5": { + "hashes": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "f3ca0dd8e7a1eb09", + "hash_cont_tokens": "8badf768f7b0467a" + }, + "truncated": 0, + "non-truncated": 1384, + "padded": 1354, + "non-padded": 30, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hashes": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "3e793631e951f23c", + "hash_cont_tokens": "32ae620376b2bbba" + }, + "truncated": 0, + "non-truncated": 3580, + "padded": 3580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-nutrition|5": { + "hashes": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "59753c2144ea93af", + "hash_cont_tokens": "3071def75bacc404" + }, + "truncated": 0, + "non-truncated": 1224, + "padded": 1224, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-philosophy|5": { + "hashes": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "bd8d3dbed15a8c34", + "hash_cont_tokens": "9f6ff69d23a48783" + }, + "truncated": 0, + "non-truncated": 1244, + "padded": 1244, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-prehistory|5": { + "hashes": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "3573cd87facbb7c5", + "hash_cont_tokens": "de469d2b981e32a3" + }, + "truncated": 0, + "non-truncated": 1296, + "padded": 1296, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_accounting|5": { + "hashes": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "17e721bc1a7cbb47", + "hash_cont_tokens": "c46f74d2dfc7b13b" + }, + "truncated": 0, + "non-truncated": 1128, + "padded": 1128, + "non-padded": 0, + 
"effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_law|5": { + "hashes": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "c9f7583fff66d361", + "hash_cont_tokens": "2e590029ef41fbcd" + }, + "truncated": 0, + "non-truncated": 6136, + "padded": 6136, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_medicine|5": { + "hashes": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "40a933f829116f8d", + "hash_cont_tokens": "fe35cfa9c6ca802e" + }, + "truncated": 0, + "non-truncated": 1088, + "padded": 1088, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_psychology|5": { + "hashes": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "0dfb73a8eb3f692c", + "hash_cont_tokens": "f020fbddf72c8652" + }, + "truncated": 0, + "non-truncated": 2448, + "padded": 2448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-public_relations|5": { + "hashes": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "1710c6ba4c9f3cbd", + "hash_cont_tokens": "568f585a259965c1" + }, + "truncated": 0, + "non-truncated": 440, + "padded": 440, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-security_studies|5": { + "hashes": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "32a03f1f22a6e103", + "hash_cont_tokens": "cc6fd7cccd64cd5d" + }, + "truncated": 0, + "non-truncated": 980, + "padded": 980, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-sociology|5": { + "hashes": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "828999f7624cbe7e", + "hash_cont_tokens": "c3a3bdfd177eed5b" + }, + "truncated": 0, + "non-truncated": 804, + "padded": 804, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hashes": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "42054621e718dbee", + "hash_cont_tokens": "2568d0e8e36fa959" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-virology|5": { + "hashes": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "6c4f0aa4dc859c04", + "hash_cont_tokens": "926cf60b0891f374" + }, + "truncated": 0, + "non-truncated": 664, + "padded": 664, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-world_religions|5": { + "hashes": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "6c75d44e092ff24f", + "hash_cont_tokens": "c525a5de974c1ea3" + }, + "truncated": 0, + "non-truncated": 684, + "padded": 684, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|truthfulqa:mc|0": { + "hashes": { + "hash_examples": "23176c0531c7b867", + 
"hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "2738d7ed7075faa7", + "hash_cont_tokens": "c014154380b74b9e" + }, + "truncated": 0, + "non-truncated": 9996, + "padded": 9996, + "non-padded": 0, + "effective_few_shots": 0.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "d84d18e9a963753d", + "hash_full_prompts": "12b540783521a8e6", + "hash_input_tokens": "5c73a7dce6ccf737", + "hash_cont_tokens": "fb1646e2bdd5fc38" + }, + "total_evaluation_time_secondes": "13028.805430173874", + "truncated": 0, + "non-truncated": 111019, + "padded": 110926, + "non-padded": 93, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/TheBloke/manticore-13b-chat-pyg-GPTQ/results_2023-09-13T00-35-05.075823.json b/eval-results/TheBloke/manticore-13b-chat-pyg-GPTQ/results_2023-09-13T00-35-05.075823.json new file mode 100644 index 0000000000000000000000000000000000000000..f0f1da90a7f0f9ad2f7f860746a86a053df2ac2b --- /dev/null +++ b/eval-results/TheBloke/manticore-13b-chat-pyg-GPTQ/results_2023-09-13T00-35-05.075823.json @@ -0,0 +1,1367 @@ +{ + "config_general": { + "model_name": "TheBloke/manticore-13b-chat-pyg-GPTQ", + "model_sha": "923f27245d13058c9c1b3ab0eab6c6c93ffc162e", + "model_size": "6.8 GB", + "model_dtype": "torch.float16", + "lighteval_sha": "ff467795ccc45b291b69333c263d5f16abd1fcd9", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|arc:challenge|25": { + "acc": 0.5511945392491467, + "acc_stderr": 0.014534599585097662, + "acc_norm": 0.5784982935153583, + "acc_norm_stderr": 0.014430197069326025 + }, + "harness|hellaswag|10": { + "acc": 0.6094403505277833, + "acc_stderr": 0.004868787333436583, + "acc_norm": 0.8106950806612229, + "acc_norm_stderr": 0.0039095001598848985 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.34, + "acc_stderr": 0.04760952285695236, + "acc_norm": 0.34, + "acc_norm_stderr": 0.04760952285695236 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.48148148148148145, + "acc_stderr": 0.043163785995113245, + "acc_norm": 0.48148148148148145, + "acc_norm_stderr": 0.043163785995113245 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.506578947368421, + "acc_stderr": 0.040685900502249704, + "acc_norm": 0.506578947368421, + "acc_norm_stderr": 0.040685900502249704 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.46, + "acc_stderr": 0.05009082659620333, + "acc_norm": 0.46, + "acc_norm_stderr": 0.05009082659620333 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.5169811320754717, + "acc_stderr": 0.030755120364119905, + "acc_norm": 0.5169811320754717, + "acc_norm_stderr": 0.030755120364119905 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.4791666666666667, + "acc_stderr": 0.041775789507399935, + "acc_norm": 0.4791666666666667, + "acc_norm_stderr": 0.041775789507399935 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.24, + "acc_stderr": 0.04292346959909284, + "acc_norm": 0.24, + "acc_norm_stderr": 0.04292346959909284 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.37, + "acc_stderr": 0.04852365870939099, + "acc_norm": 0.37, + "acc_norm_stderr": 0.04852365870939099 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.37, + "acc_stderr": 0.04852365870939099, + "acc_norm": 0.37, + "acc_norm_stderr": 0.04852365870939099 + }, + "harness|hendrycksTest-college_medicine|5": { + 
"acc": 0.42196531791907516, + "acc_stderr": 0.0376574669386515, + "acc_norm": 0.42196531791907516, + "acc_norm_stderr": 0.0376574669386515 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.29411764705882354, + "acc_stderr": 0.04533838195929774, + "acc_norm": 0.29411764705882354, + "acc_norm_stderr": 0.04533838195929774 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.62, + "acc_stderr": 0.048783173121456316, + "acc_norm": 0.62, + "acc_norm_stderr": 0.048783173121456316 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.3829787234042553, + "acc_stderr": 0.03177821250236922, + "acc_norm": 0.3829787234042553, + "acc_norm_stderr": 0.03177821250236922 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.2719298245614035, + "acc_stderr": 0.04185774424022056, + "acc_norm": 0.2719298245614035, + "acc_norm_stderr": 0.04185774424022056 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.41379310344827586, + "acc_stderr": 0.04104269211806232, + "acc_norm": 0.41379310344827586, + "acc_norm_stderr": 0.04104269211806232 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.2830687830687831, + "acc_stderr": 0.023201392938194978, + "acc_norm": 0.2830687830687831, + "acc_norm_stderr": 0.023201392938194978 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.30158730158730157, + "acc_stderr": 0.04104947269903394, + "acc_norm": 0.30158730158730157, + "acc_norm_stderr": 0.04104947269903394 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.33, + "acc_stderr": 0.047258156262526045, + "acc_norm": 0.33, + "acc_norm_stderr": 0.047258156262526045 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.49032258064516127, + "acc_stderr": 0.028438677998909565, + "acc_norm": 0.49032258064516127, + "acc_norm_stderr": 0.028438677998909565 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.2955665024630542, + "acc_stderr": 0.032104944337514575, + "acc_norm": 0.2955665024630542, + "acc_norm_stderr": 0.032104944337514575 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.54, + "acc_stderr": 0.05009082659620332, + "acc_norm": 0.54, + "acc_norm_stderr": 0.05009082659620332 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.6, + "acc_stderr": 0.03825460278380026, + "acc_norm": 0.6, + "acc_norm_stderr": 0.03825460278380026 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.5404040404040404, + "acc_stderr": 0.03550702465131343, + "acc_norm": 0.5404040404040404, + "acc_norm_stderr": 0.03550702465131343 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.6424870466321243, + "acc_stderr": 0.034588160421810114, + "acc_norm": 0.6424870466321243, + "acc_norm_stderr": 0.034588160421810114 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.43846153846153846, + "acc_stderr": 0.02515826601686857, + "acc_norm": 0.43846153846153846, + "acc_norm_stderr": 0.02515826601686857 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.2740740740740741, + "acc_stderr": 0.027195934804085622, + "acc_norm": 0.2740740740740741, + "acc_norm_stderr": 0.027195934804085622 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.453781512605042, + "acc_stderr": 0.032339434681820885, + "acc_norm": 0.453781512605042, + "acc_norm_stderr": 0.032339434681820885 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.2781456953642384, + "acc_stderr": 0.03658603262763743, 
+ "acc_norm": 0.2781456953642384, + "acc_norm_stderr": 0.03658603262763743 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.6018348623853211, + "acc_stderr": 0.02098798942265427, + "acc_norm": 0.6018348623853211, + "acc_norm_stderr": 0.02098798942265427 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.27314814814814814, + "acc_stderr": 0.030388051301678116, + "acc_norm": 0.27314814814814814, + "acc_norm_stderr": 0.030388051301678116 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.6421568627450981, + "acc_stderr": 0.03364487286088299, + "acc_norm": 0.6421568627450981, + "acc_norm_stderr": 0.03364487286088299 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.7088607594936709, + "acc_stderr": 0.02957160106575337, + "acc_norm": 0.7088607594936709, + "acc_norm_stderr": 0.02957160106575337 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.5426008968609866, + "acc_stderr": 0.03343577705583065, + "acc_norm": 0.5426008968609866, + "acc_norm_stderr": 0.03343577705583065 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.5343511450381679, + "acc_stderr": 0.043749285605997376, + "acc_norm": 0.5343511450381679, + "acc_norm_stderr": 0.043749285605997376 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.6446280991735537, + "acc_stderr": 0.04369236326573981, + "acc_norm": 0.6446280991735537, + "acc_norm_stderr": 0.04369236326573981 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.5370370370370371, + "acc_stderr": 0.04820403072760627, + "acc_norm": 0.5370370370370371, + "acc_norm_stderr": 0.04820403072760627 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.50920245398773, + "acc_stderr": 0.03927705600787443, + "acc_norm": 0.50920245398773, + "acc_norm_stderr": 0.03927705600787443 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.35714285714285715, + "acc_stderr": 0.04547960999764376, + "acc_norm": 0.35714285714285715, + "acc_norm_stderr": 0.04547960999764376 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.6116504854368932, + "acc_stderr": 0.0482572933735639, + "acc_norm": 0.6116504854368932, + "acc_norm_stderr": 0.0482572933735639 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.717948717948718, + "acc_stderr": 0.029480360549541187, + "acc_norm": 0.717948717948718, + "acc_norm_stderr": 0.029480360549541187 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.52, + "acc_stderr": 0.05021167315686779, + "acc_norm": 0.52, + "acc_norm_stderr": 0.05021167315686779 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.6590038314176245, + "acc_stderr": 0.016951781383223313, + "acc_norm": 0.6590038314176245, + "acc_norm_stderr": 0.016951781383223313 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.4797687861271676, + "acc_stderr": 0.026897049996382868, + "acc_norm": 0.4797687861271676, + "acc_norm_stderr": 0.026897049996382868 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.23575418994413408, + "acc_stderr": 0.014196375686290804, + "acc_norm": 0.23575418994413408, + "acc_norm_stderr": 0.014196375686290804 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.5032679738562091, + "acc_stderr": 0.028629305194003543, + "acc_norm": 0.5032679738562091, + "acc_norm_stderr": 0.028629305194003543 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.5434083601286174, + "acc_stderr": 0.028290869054197604, + "acc_norm": 0.5434083601286174, + "acc_norm_stderr": 0.028290869054197604 + }, + 
"harness|hendrycksTest-prehistory|5": { + "acc": 0.5277777777777778, + "acc_stderr": 0.027777777777777804, + "acc_norm": 0.5277777777777778, + "acc_norm_stderr": 0.027777777777777804 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.4078014184397163, + "acc_stderr": 0.029316011776343555, + "acc_norm": 0.4078014184397163, + "acc_norm_stderr": 0.029316011776343555 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.3970013037809648, + "acc_stderr": 0.012496346982909556, + "acc_norm": 0.3970013037809648, + "acc_norm_stderr": 0.012496346982909556 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.47058823529411764, + "acc_stderr": 0.030320243265004137, + "acc_norm": 0.47058823529411764, + "acc_norm_stderr": 0.030320243265004137 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.48856209150326796, + "acc_stderr": 0.020222541515610863, + "acc_norm": 0.48856209150326796, + "acc_norm_stderr": 0.020222541515610863 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.509090909090909, + "acc_stderr": 0.04788339768702861, + "acc_norm": 0.509090909090909, + "acc_norm_stderr": 0.04788339768702861 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.5183673469387755, + "acc_stderr": 0.03198761546763127, + "acc_norm": 0.5183673469387755, + "acc_norm_stderr": 0.03198761546763127 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.6467661691542289, + "acc_stderr": 0.03379790611796777, + "acc_norm": 0.6467661691542289, + "acc_norm_stderr": 0.03379790611796777 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.77, + "acc_stderr": 0.04229525846816506, + "acc_norm": 0.77, + "acc_norm_stderr": 0.04229525846816506 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.42168674698795183, + "acc_stderr": 0.03844453181770917, + "acc_norm": 0.42168674698795183, + "acc_norm_stderr": 0.03844453181770917 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.6900584795321637, + "acc_stderr": 0.035469769593931624, + "acc_norm": 0.6900584795321637, + "acc_norm_stderr": 0.035469769593931624 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.3268053855569155, + "mc1_stderr": 0.016419874731135032, + "mc2": 0.4776861669816965, + "mc2_stderr": 0.014996477492223563 + }, + "all": { + "acc": 0.47910157210911797, + "acc_stderr": 0.035136312942949756, + "acc_norm": 0.4829754447260613, + "acc_norm_stderr": 0.035118284304147665, + "mc1": 0.3268053855569155, + "mc1_stderr": 0.016419874731135032, + "mc2": 0.4776861669816965, + "mc2_stderr": 0.014996477492223563 + } + }, + "versions": { + "harness|arc:challenge|25": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + 
"harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "all": 0 + }, + "config_tasks": { + "harness|arc:challenge": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + 
"harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task" + }, + "summary_tasks": { + "harness|arc:challenge|25": { + "hashes": { + "hash_examples": "17b0cae357c0259e", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "61571bf68d6d89aa", + "hash_cont_tokens": "e8abf848493b50f7" + }, + "truncated": 0, + "non-truncated": 4687, + "padded": 4687, + "non-padded": 0, + "effective_few_shots": 25.0, + "num_truncated_few_shots": 0 + }, + "harness|hellaswag|10": { + "hashes": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "29906669b1c7054a", + "hash_cont_tokens": "9fe0a5c42e1532db" + }, + "truncated": 0, + "non-truncated": 40168, + "padded": 40113, + "non-padded": 55, + "effective_few_shots": 10.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hashes": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "c54ff61ad0273dd7", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-anatomy|5": { + "hashes": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + 
"hash_input_tokens": "be31a1e22aef5f90", + "hash_cont_tokens": "f11971a765cb609f" + }, + "truncated": 0, + "non-truncated": 540, + "padded": 540, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-astronomy|5": { + "hashes": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "277a7b1fad566940", + "hash_cont_tokens": "440a970fadecdc7b" + }, + "truncated": 0, + "non-truncated": 608, + "padded": 608, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-business_ethics|5": { + "hashes": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "ba552605bc116de5", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hashes": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "428c7563d0b98ab9", + "hash_cont_tokens": "7ecd60c25b9bfe5b" + }, + "truncated": 0, + "non-truncated": 1060, + "padded": 1060, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_biology|5": { + "hashes": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "da036601573942e2", + "hash_cont_tokens": "875cde3af7a0ee14" + }, + "truncated": 0, + "non-truncated": 576, + "padded": 576, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_chemistry|5": { + "hashes": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "94e0196d6aded13d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_computer_science|5": { + "hashes": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "6e4d0f4a8d36690b", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_mathematics|5": { + "hashes": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "614054d17109a25d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_medicine|5": { + "hashes": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "1d633b3cc0524ba8", + "hash_cont_tokens": "702fb6d82ff0d6ac" + }, + "truncated": 0, + "non-truncated": 692, + "padded": 692, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_physics|5": { + "hashes": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "5421d9a1af86cbd4", + "hash_cont_tokens": "f7b8097afc16a47c" + }, + "truncated": 0, + "non-truncated": 408, + "padded": 408, + 
"non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-computer_security|5": { + "hashes": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "5e6b70ecb333cf18", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hashes": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "c2ef11a87264ceed", + "hash_cont_tokens": "aa0e8bc655f2f641" + }, + "truncated": 0, + "non-truncated": 940, + "padded": 940, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-econometrics|5": { + "hashes": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "ecaccd912a4c3978", + "hash_cont_tokens": "b1cc6e7e9fcd3827" + }, + "truncated": 0, + "non-truncated": 456, + "padded": 456, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hashes": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "1590c84291399be8", + "hash_cont_tokens": "2425a3f084a591ef" + }, + "truncated": 0, + "non-truncated": 580, + "padded": 580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hashes": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "3269597f715b0da1", + "hash_cont_tokens": "bd87bf0c060fd925" + }, + "truncated": 0, + "non-truncated": 1512, + "padded": 1512, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-formal_logic|5": { + "hashes": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "a2800d20f3ab8d7c", + "hash_cont_tokens": "eb8932890e0605db" + }, + "truncated": 0, + "non-truncated": 504, + "padded": 504, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-global_facts|5": { + "hashes": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "94ed44b3772505ad", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_biology|5": { + "hashes": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "24423acb928db768", + "hash_cont_tokens": "1ddcb86d28cde266" + }, + "truncated": 0, + "non-truncated": 1240, + "padded": 1240, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hashes": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "831ff35c474e5cef", + "hash_cont_tokens": "176c8dcff38c5f8f" + }, + "truncated": 0, + "non-truncated": 812, + "padded": 812, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + 
"hashes": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "8c34e0f2bda77358", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hashes": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "f1f73dd687da18d7", + "hash_cont_tokens": "674fc454bdc5ac93" + }, + "truncated": 660, + "non-truncated": 0, + "padded": 0, + "non-padded": 660, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_geography|5": { + "hashes": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "7c5547c7da5bc793", + "hash_cont_tokens": "03a5012b916274ea" + }, + "truncated": 0, + "non-truncated": 792, + "padded": 792, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hashes": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "f62991cb6a496b05", + "hash_cont_tokens": "873d2aab226ba1d8" + }, + "truncated": 0, + "non-truncated": 772, + "padded": 772, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hashes": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "4cef2aff6e3d59ed", + "hash_cont_tokens": "c583432ad27fcfe0" + }, + "truncated": 0, + "non-truncated": 1560, + "padded": 1560, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hashes": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "6e2577ea4082ed2b", + "hash_cont_tokens": "d7907b61bcb8c123" + }, + "truncated": 0, + "non-truncated": 1080, + "padded": 1080, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hashes": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "c5fc9aeb1079c8e4", + "hash_cont_tokens": "f47f041de50333b9" + }, + "truncated": 0, + "non-truncated": 952, + "padded": 952, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_physics|5": { + "hashes": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "555fc385cffa84ca", + "hash_cont_tokens": "0d56317b3e5eedb5" + }, + "truncated": 0, + "non-truncated": 604, + "padded": 604, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hashes": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "febd23cbf9973b7f", + "hash_cont_tokens": "09ba1243e7390c0f" + }, + "truncated": 0, + "non-truncated": 2180, + "padded": 2180, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hashes": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": 
"4c5c8be5aafac432", + "hash_input_tokens": "424b02981230ee83", + "hash_cont_tokens": "9cc29889c3d3f77d" + }, + "truncated": 0, + "non-truncated": 864, + "padded": 864, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hashes": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "50c9ff438c85a69e", + "hash_cont_tokens": "cdd0b3dc06d933e5" + }, + "truncated": 816, + "non-truncated": 0, + "padded": 0, + "non-padded": 816, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hashes": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "054824cc474caef5", + "hash_cont_tokens": "e02816433ff28daf" + }, + "truncated": 8, + "non-truncated": 940, + "padded": 940, + "non-padded": 8, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_aging|5": { + "hashes": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "541a75f071dcf579", + "hash_cont_tokens": "142a4a8a1138a214" + }, + "truncated": 0, + "non-truncated": 892, + "padded": 892, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_sexuality|5": { + "hashes": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "04269e5c5a257dd9", + "hash_cont_tokens": "bc54813e809b796d" + }, + "truncated": 0, + "non-truncated": 524, + "padded": 524, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-international_law|5": { + "hashes": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "d93ba9d9d38e4397", + "hash_cont_tokens": "8ea8c5ff76a15bca" + }, + "truncated": 0, + "non-truncated": 484, + "padded": 484, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-jurisprudence|5": { + "hashes": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "9eeaccd2698b4f5a", + "hash_cont_tokens": "e3a8cd951b6e3469" + }, + "truncated": 0, + "non-truncated": 432, + "padded": 432, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hashes": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "b4f08f544f2b7576", + "hash_cont_tokens": "3e9e0bdc248fd88a" + }, + "truncated": 0, + "non-truncated": 652, + "padded": 648, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-machine_learning|5": { + "hashes": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "900c2a51f1174b9f", + "hash_cont_tokens": "55b12fb138c6a064" + }, + "truncated": 0, + "non-truncated": 448, + "padded": 448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-management|5": { + "hashes": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "6b36efb4689c6eca", + "hash_cont_tokens": "a01d6d39a83c4597" + }, + "truncated": 0, + "non-truncated": 412, + 
"padded": 412, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-marketing|5": { + "hashes": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "2aaac78a0cfed47a", + "hash_cont_tokens": "6aeaed4d823c98aa" + }, + "truncated": 0, + "non-truncated": 936, + "padded": 936, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-medical_genetics|5": { + "hashes": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "886ca823b41c094a", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-miscellaneous|5": { + "hashes": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "72fd71de7675e7d0", + "hash_cont_tokens": "9b0ab02a64603081" + }, + "truncated": 0, + "non-truncated": 3132, + "padded": 3132, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_disputes|5": { + "hashes": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "f3ca0dd8e7a1eb09", + "hash_cont_tokens": "3b8bbe9108e55ce9" + }, + "truncated": 0, + "non-truncated": 1384, + "padded": 1354, + "non-padded": 30, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hashes": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "3e793631e951f23c", + "hash_cont_tokens": "3e9bfc0362e97330" + }, + "truncated": 0, + "non-truncated": 3580, + "padded": 3580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-nutrition|5": { + "hashes": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "59753c2144ea93af", + "hash_cont_tokens": "23b2dc6ee2da4cfc" + }, + "truncated": 0, + "non-truncated": 1224, + "padded": 1224, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-philosophy|5": { + "hashes": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "bd8d3dbed15a8c34", + "hash_cont_tokens": "9f6ff69d23a48783" + }, + "truncated": 0, + "non-truncated": 1244, + "padded": 1244, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-prehistory|5": { + "hashes": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "3573cd87facbb7c5", + "hash_cont_tokens": "d6458d743d875837" + }, + "truncated": 0, + "non-truncated": 1296, + "padded": 1296, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_accounting|5": { + "hashes": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "17e721bc1a7cbb47", + "hash_cont_tokens": "922a195f53a35662" + }, + "truncated": 0, + "non-truncated": 1128, + "padded": 1128, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_law|5": { + "hashes": { + 
"hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "9178e10bd0763ec4", + "hash_cont_tokens": "2e590029ef41fbcd" + }, + "truncated": 604, + "non-truncated": 5532, + "padded": 5524, + "non-padded": 612, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_medicine|5": { + "hashes": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "f5a22012a54f70ea", + "hash_cont_tokens": "7cfee54dbddd5a98" + }, + "truncated": 0, + "non-truncated": 1088, + "padded": 1088, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_psychology|5": { + "hashes": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "0dfb73a8eb3f692c", + "hash_cont_tokens": "a86677b2a45c20e1" + }, + "truncated": 0, + "non-truncated": 2448, + "padded": 2448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-public_relations|5": { + "hashes": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "1710c6ba4c9f3cbd", + "hash_cont_tokens": "0d756ccaae031757" + }, + "truncated": 0, + "non-truncated": 440, + "padded": 440, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-security_studies|5": { + "hashes": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "d49711415961ced7", + "hash_cont_tokens": "b2229bc2cfbf594b" + }, + "truncated": 0, + "non-truncated": 980, + "padded": 980, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-sociology|5": { + "hashes": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "828999f7624cbe7e", + "hash_cont_tokens": "c3a3bdfd177eed5b" + }, + "truncated": 0, + "non-truncated": 804, + "padded": 804, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hashes": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "42054621e718dbee", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-virology|5": { + "hashes": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "6c4f0aa4dc859c04", + "hash_cont_tokens": "af8b3658088cb37f" + }, + "truncated": 0, + "non-truncated": 664, + "padded": 664, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-world_religions|5": { + "hashes": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "6c75d44e092ff24f", + "hash_cont_tokens": "060118bef6de4e0a" + }, + "truncated": 0, + "non-truncated": 684, + "padded": 684, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|truthfulqa:mc|0": { + "hashes": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "2738d7ed7075faa7", + "hash_cont_tokens": "f5da56a132aab151" + }, + 
"truncated": 0, + "non-truncated": 9996, + "padded": 9996, + "non-padded": 0, + "effective_few_shots": 0.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "d84d18e9a963753d", + "hash_full_prompts": "12b540783521a8e6", + "hash_input_tokens": "6fecf578c508db6a", + "hash_cont_tokens": "71d56183130fecbd" + }, + "total_evaluation_time_secondes": "4221.170780181885", + "truncated": 2088, + "non-truncated": 108931, + "padded": 108834, + "non-padded": 2185, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/TheBloke/manticore-13b-chat-pyg-GPTQ/results_2023-11-07T17-20-44.747146.json b/eval-results/TheBloke/manticore-13b-chat-pyg-GPTQ/results_2023-11-07T17-20-44.747146.json new file mode 100644 index 0000000000000000000000000000000000000000..bba1bcf8f1126c2c86f98d7f699a617fe52594c9 --- /dev/null +++ b/eval-results/TheBloke/manticore-13b-chat-pyg-GPTQ/results_2023-11-07T17-20-44.747146.json @@ -0,0 +1,107 @@ +{ + "config_general": { + "lighteval_sha": "167773f1d5d1647c60dadc31c9e731ab7dbcbbad", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "", + "model_name": "TheBloke/manticore-13b-chat-pyg-GPTQ", + "model_sha": "a371849d0728d3f911db94509a0b6f807bc0e32f", + "model_dtype": "torch.float16", + "model_size": "6.8 GB" + }, + "results": { + "harness|drop|3": { + "em": 0.006921140939597316, + "em_stderr": 0.0008490247804930618, + "f1": 0.06798238255033592, + "f1_stderr": 0.0015724347441108313 + }, + "harness|gsm8k|5": { + "acc": 0.08491281273692192, + "acc_stderr": 0.007678212824450799 + }, + "harness|winogrande|5": { + "acc": 0.7592738752959748, + "acc_stderr": 0.012015559212224183 + }, + "all": { + "em": 0.006921140939597316, + "em_stderr": 0.0008490247804930618, + "f1": 0.06798238255033592, + "f1_stderr": 0.0015724347441108313, + "acc": 0.4220933440164484, + "acc_stderr": 0.00984688601833749 + } + }, + "versions": { + "all": 0, + "harness|drop|3": 1, + "harness|gsm8k|5": 0, + "harness|winogrande|5": 0 + }, + "config_tasks": { + "harness|drop": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|drop|3": { + "hashes": { + "hash_examples": "1d27416e8324e9a3", + "hash_full_prompts": "a5513ff9a741b385", + "hash_input_tokens": "61b608e0b5ceed76", + "hash_cont_tokens": "0e7e691ddb0c3903" + }, + "truncated": 1263, + "non_truncated": 8273, + "padded": 0, + "non_padded": 9536, + "effective_few_shots": 3.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "bda342e47b5099b2", + "hash_cont_tokens": "1f75147df603f54a" + }, + "truncated": 0, + "non_truncated": 1319, + "padded": 0, + "non_padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "c0bedf98cb040854", + "hash_cont_tokens": "f08975ad6f2d5864" + }, + "truncated": 0, + "non_truncated": 1267, + "padded": 2432, + "non_padded": 102, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "9b4d8993161e637d", + "hash_full_prompts": "08215e527b7e60a5", + "hash_input_tokens": "80afe720f936f8d2", + "hash_cont_tokens": "32b7bf72994b949a" + }, + "truncated": 
1263, + "non_truncated": 10859, + "padded": 2432, + "non_padded": 10957, + "num_truncated_few_shots": 0, + "total_evaluation_time_secondes": 0 + } +} \ No newline at end of file diff --git a/eval-results/TheBloke/medalpaca-13B-GPTQ-4bit/results_2023-08-21T20-37-20.555998.json b/eval-results/TheBloke/medalpaca-13B-GPTQ-4bit/results_2023-08-21T20-37-20.555998.json new file mode 100644 index 0000000000000000000000000000000000000000..81e2a2d10164d901ba84cae62b2ce6011df0092e --- /dev/null +++ b/eval-results/TheBloke/medalpaca-13B-GPTQ-4bit/results_2023-08-21T20-37-20.555998.json @@ -0,0 +1,1365 @@ +{ + "results": { + "harness|arc:challenge|25": { + "acc": 0.23720136518771331, + "acc_stderr": 0.012430399829260832, + "acc_norm": 0.2935153583617747, + "acc_norm_stderr": 0.013307250444941127 + }, + "harness|hellaswag|10": { + "acc": 0.2560246962756423, + "acc_stderr": 0.004355436696716298, + "acc_norm": 0.26319458275243973, + "acc_norm_stderr": 0.004394671271021432 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.21, + "acc_stderr": 0.040936018074033256, + "acc_norm": 0.21, + "acc_norm_stderr": 0.040936018074033256 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.22962962962962963, + "acc_stderr": 0.03633384414073461, + "acc_norm": 0.22962962962962963, + "acc_norm_stderr": 0.03633384414073461 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.3157894736842105, + "acc_stderr": 0.03782728980865469, + "acc_norm": 0.3157894736842105, + "acc_norm_stderr": 0.03782728980865469 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.29, + "acc_stderr": 0.04560480215720684, + "acc_norm": 0.29, + "acc_norm_stderr": 0.04560480215720684 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.3018867924528302, + "acc_stderr": 0.028254200344438665, + "acc_norm": 0.3018867924528302, + "acc_norm_stderr": 0.028254200344438665 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.2638888888888889, + "acc_stderr": 0.03685651095897532, + "acc_norm": 0.2638888888888889, + "acc_norm_stderr": 0.03685651095897532 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.3, + "acc_stderr": 0.046056618647183814, + "acc_norm": 0.3, + "acc_norm_stderr": 0.046056618647183814 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.29, + "acc_stderr": 0.045604802157206845, + "acc_norm": 0.29, + "acc_norm_stderr": 0.045604802157206845 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.24, + "acc_stderr": 0.042923469599092816, + "acc_norm": 0.24, + "acc_norm_stderr": 0.042923469599092816 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.30057803468208094, + "acc_stderr": 0.0349610148119118, + "acc_norm": 0.30057803468208094, + "acc_norm_stderr": 0.0349610148119118 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.29411764705882354, + "acc_stderr": 0.04533838195929775, + "acc_norm": 0.29411764705882354, + "acc_norm_stderr": 0.04533838195929775 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.18, + "acc_stderr": 0.038612291966536955, + "acc_norm": 0.18, + "acc_norm_stderr": 0.038612291966536955 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.20851063829787234, + "acc_stderr": 0.026556982117838728, + "acc_norm": 0.20851063829787234, + "acc_norm_stderr": 0.026556982117838728 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.19298245614035087, + "acc_stderr": 0.03712454853721368, + "acc_norm": 0.19298245614035087, + "acc_norm_stderr": 0.03712454853721368 + }, + 
"harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.2413793103448276, + "acc_stderr": 0.03565998174135302, + "acc_norm": 0.2413793103448276, + "acc_norm_stderr": 0.03565998174135302 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.2619047619047619, + "acc_stderr": 0.022644212615525218, + "acc_norm": 0.2619047619047619, + "acc_norm_stderr": 0.022644212615525218 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.2698412698412698, + "acc_stderr": 0.03970158273235172, + "acc_norm": 0.2698412698412698, + "acc_norm_stderr": 0.03970158273235172 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.18, + "acc_stderr": 0.03861229196653695, + "acc_norm": 0.18, + "acc_norm_stderr": 0.03861229196653695 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.29354838709677417, + "acc_stderr": 0.025906087021319295, + "acc_norm": 0.29354838709677417, + "acc_norm_stderr": 0.025906087021319295 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.2413793103448276, + "acc_stderr": 0.03010833071801162, + "acc_norm": 0.2413793103448276, + "acc_norm_stderr": 0.03010833071801162 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.23, + "acc_stderr": 0.042295258468165065, + "acc_norm": 0.23, + "acc_norm_stderr": 0.042295258468165065 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.26666666666666666, + "acc_stderr": 0.03453131801885415, + "acc_norm": 0.26666666666666666, + "acc_norm_stderr": 0.03453131801885415 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.3484848484848485, + "acc_stderr": 0.033948539651564025, + "acc_norm": 0.3484848484848485, + "acc_norm_stderr": 0.033948539651564025 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.33678756476683935, + "acc_stderr": 0.034107802518361825, + "acc_norm": 0.33678756476683935, + "acc_norm_stderr": 0.034107802518361825 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.3641025641025641, + "acc_stderr": 0.02439667298509477, + "acc_norm": 0.3641025641025641, + "acc_norm_stderr": 0.02439667298509477 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.2111111111111111, + "acc_stderr": 0.024882116857655085, + "acc_norm": 0.2111111111111111, + "acc_norm_stderr": 0.024882116857655085 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.3487394957983193, + "acc_stderr": 0.03095663632856655, + "acc_norm": 0.3487394957983193, + "acc_norm_stderr": 0.03095663632856655 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.2781456953642384, + "acc_stderr": 0.03658603262763743, + "acc_norm": 0.2781456953642384, + "acc_norm_stderr": 0.03658603262763743 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.30458715596330277, + "acc_stderr": 0.019732299420354038, + "acc_norm": 0.30458715596330277, + "acc_norm_stderr": 0.019732299420354038 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.26851851851851855, + "acc_stderr": 0.030225226160012397, + "acc_norm": 0.26851851851851855, + "acc_norm_stderr": 0.030225226160012397 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.28921568627450983, + "acc_stderr": 0.031822318676475524, + "acc_norm": 0.28921568627450983, + "acc_norm_stderr": 0.031822318676475524 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.20675105485232068, + "acc_stderr": 0.026361651668389104, + "acc_norm": 0.20675105485232068, + 
"acc_norm_stderr": 0.026361651668389104 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.10762331838565023, + "acc_stderr": 0.020799400082879997, + "acc_norm": 0.10762331838565023, + "acc_norm_stderr": 0.020799400082879997 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.2824427480916031, + "acc_stderr": 0.03948406125768361, + "acc_norm": 0.2824427480916031, + "acc_norm_stderr": 0.03948406125768361 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.2231404958677686, + "acc_stderr": 0.03800754475228732, + "acc_norm": 0.2231404958677686, + "acc_norm_stderr": 0.03800754475228732 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.21296296296296297, + "acc_stderr": 0.0395783547198098, + "acc_norm": 0.21296296296296297, + "acc_norm_stderr": 0.0395783547198098 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.2392638036809816, + "acc_stderr": 0.033519538795212696, + "acc_norm": 0.2392638036809816, + "acc_norm_stderr": 0.033519538795212696 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.21428571428571427, + "acc_stderr": 0.038946411200447915, + "acc_norm": 0.21428571428571427, + "acc_norm_stderr": 0.038946411200447915 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.3786407766990291, + "acc_stderr": 0.04802694698258972, + "acc_norm": 0.3786407766990291, + "acc_norm_stderr": 0.04802694698258972 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.19658119658119658, + "acc_stderr": 0.02603538609895129, + "acc_norm": 0.19658119658119658, + "acc_norm_stderr": 0.02603538609895129 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.24, + "acc_stderr": 0.04292346959909281, + "acc_norm": 0.24, + "acc_norm_stderr": 0.04292346959909281 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.20434227330779056, + "acc_stderr": 0.0144191239809319, + "acc_norm": 0.20434227330779056, + "acc_norm_stderr": 0.0144191239809319 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.23699421965317918, + "acc_stderr": 0.02289408248992599, + "acc_norm": 0.23699421965317918, + "acc_norm_stderr": 0.02289408248992599 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.21899441340782122, + "acc_stderr": 0.013831676687303198, + "acc_norm": 0.21899441340782122, + "acc_norm_stderr": 0.013831676687303198 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.2549019607843137, + "acc_stderr": 0.024954184324879905, + "acc_norm": 0.2549019607843137, + "acc_norm_stderr": 0.024954184324879905 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.24115755627009647, + "acc_stderr": 0.024296594034763426, + "acc_norm": 0.24115755627009647, + "acc_norm_stderr": 0.024296594034763426 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.2191358024691358, + "acc_stderr": 0.023016705640262192, + "acc_norm": 0.2191358024691358, + "acc_norm_stderr": 0.023016705640262192 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.26595744680851063, + "acc_stderr": 0.026358065698880592, + "acc_norm": 0.26595744680851063, + "acc_norm_stderr": 0.026358065698880592 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.24771838331160365, + "acc_stderr": 0.011025499291443744, + "acc_norm": 0.24771838331160365, + "acc_norm_stderr": 0.011025499291443744 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.3235294117647059, + "acc_stderr": 0.02841820861940679, + "acc_norm": 0.3235294117647059, + "acc_norm_stderr": 0.02841820861940679 + }, + 
"harness|hendrycksTest-professional_psychology|5": { + "acc": 0.238562091503268, + "acc_stderr": 0.017242385828779593, + "acc_norm": 0.238562091503268, + "acc_norm_stderr": 0.017242385828779593 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.24545454545454545, + "acc_stderr": 0.041220665028782834, + "acc_norm": 0.24545454545454545, + "acc_norm_stderr": 0.041220665028782834 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.24489795918367346, + "acc_stderr": 0.027529637440174917, + "acc_norm": 0.24489795918367346, + "acc_norm_stderr": 0.027529637440174917 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.27860696517412936, + "acc_stderr": 0.031700561834973086, + "acc_norm": 0.27860696517412936, + "acc_norm_stderr": 0.031700561834973086 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.26, + "acc_stderr": 0.04408440022768078, + "acc_norm": 0.26, + "acc_norm_stderr": 0.04408440022768078 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.1927710843373494, + "acc_stderr": 0.030709824050565274, + "acc_norm": 0.1927710843373494, + "acc_norm_stderr": 0.030709824050565274 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.17543859649122806, + "acc_stderr": 0.029170885500727654, + "acc_norm": 0.17543859649122806, + "acc_norm_stderr": 0.029170885500727654 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.24112607099143207, + "mc1_stderr": 0.014974827279752339, + "mc2": 0.4951295157746196, + "mc2_stderr": 0.016901428175034105 + }, + "all": { + "acc": 0.25415553813915254, + "acc_stderr": 0.031533026883914704, + "acc_norm": 0.2552315360993366, + "acc_norm_stderr": 0.03154855375154158, + "mc1": 0.24112607099143207, + "mc1_stderr": 0.014974827279752339, + "mc2": 0.4951295157746196, + "mc2_stderr": 0.016901428175034105 + } + }, + "versions": { + "harness|arc:challenge|25": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + 
"harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "all": 0 + }, + "config_general": { + "model_name": "TheBloke/medalpaca-13B-GPTQ-4bit", + "model_sha": "12190f743a19e91dfe1f5c77abc0c1bf486073dd", + "model_dtype": "torch.float16", + "lighteval_sha": "9a6ba3212080b87510982c30fdec55b87dcab0c7", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null + }, + "config_tasks": { + "harness|arc:challenge": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + 
"harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task" + }, + "summary_tasks": { + "harness|arc:challenge|25": { + "hashes": { + "hash_examples": "17b0cae357c0259e", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "2b0e07d4cdd3b0fe", + "hash_cont_tokens": "8210decc6ff6f7df" + }, + "truncated": 0, + "non-truncated": 4687, + "padded": 4687, + "non-padded": 0, + "effective_few_shots": 25.0, + "num_truncated_few_shots": 0 + }, + "harness|hellaswag|10": { + "hashes": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "578edd77107cb2c3", + "hash_cont_tokens": "b3b9e9017afa63af" + }, + "truncated": 0, + "non-truncated": 40168, + "padded": 40113, + "non-padded": 55, + "effective_few_shots": 10.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hashes": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "6a95a1511f8da075", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-anatomy|5": { + "hashes": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "24a78edc4d9a93aa", + "hash_cont_tokens": "f11971a765cb609f" + }, + "truncated": 0, + "non-truncated": 540, + "padded": 540, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-astronomy|5": { + "hashes": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": 
"faf4e80f65de93ca", + "hash_input_tokens": "b11106668d6c0974", + "hash_cont_tokens": "bf30e5d3f48250cb" + }, + "truncated": 0, + "non-truncated": 608, + "padded": 608, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-business_ethics|5": { + "hashes": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "10180ba12a075cb0", + "hash_cont_tokens": "bc1dd9b2d995eb61" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hashes": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "73351ef4968750a2", + "hash_cont_tokens": "890a119624b3b935" + }, + "truncated": 0, + "non-truncated": 1060, + "padded": 1060, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_biology|5": { + "hashes": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "a539150af234c668", + "hash_cont_tokens": "875cde3af7a0ee14" + }, + "truncated": 0, + "non-truncated": 576, + "padded": 576, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_chemistry|5": { + "hashes": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "52e12e5a43bcee35", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_computer_science|5": { + "hashes": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "d1f3721a5659f7ee", + "hash_cont_tokens": "ffc0fe414cdc4a83" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_mathematics|5": { + "hashes": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "f2d78f546b5595c2", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_medicine|5": { + "hashes": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "c9cc19179f63d1d6", + "hash_cont_tokens": "1f88b00d41957d82" + }, + "truncated": 0, + "non-truncated": 692, + "padded": 692, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_physics|5": { + "hashes": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "5046144e67e992e8", + "hash_cont_tokens": "f7b8097afc16a47c" + }, + "truncated": 0, + "non-truncated": 408, + "padded": 408, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-computer_security|5": { + "hashes": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "4b14581ba4fc06fc", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 
400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hashes": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "1ee52c413b5b4cc4", + "hash_cont_tokens": "aa0e8bc655f2f641" + }, + "truncated": 0, + "non-truncated": 940, + "padded": 940, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-econometrics|5": { + "hashes": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "2914077c4dd3090a", + "hash_cont_tokens": "bfb7e3c3c88313f1" + }, + "truncated": 0, + "non-truncated": 456, + "padded": 456, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hashes": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "0f88a874342378de", + "hash_cont_tokens": "2425a3f084a591ef" + }, + "truncated": 0, + "non-truncated": 580, + "padded": 580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hashes": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "9889933f1dd02a23", + "hash_cont_tokens": "f52691aef15a407b" + }, + "truncated": 0, + "non-truncated": 1512, + "padded": 1512, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-formal_logic|5": { + "hashes": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "dc309a94c4bfdd2f", + "hash_cont_tokens": "f515d598d9c21263" + }, + "truncated": 0, + "non-truncated": 504, + "padded": 504, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-global_facts|5": { + "hashes": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "0801a0aebec3ba8c", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_biology|5": { + "hashes": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "5bc4aca8831d9c05", + "hash_cont_tokens": "bd85a4156a3613ee" + }, + "truncated": 0, + "non-truncated": 1240, + "padded": 1240, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hashes": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "b92bd6b06fc3464c", + "hash_cont_tokens": "a95c97af1c14e068" + }, + "truncated": 0, + "non-truncated": 812, + "padded": 812, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hashes": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "a549346cde8165e9", + "hash_cont_tokens": "8abfedef914e33c9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + 
"harness|hendrycksTest-high_school_european_history|5": { + "hashes": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "f1f73dd687da18d7", + "hash_cont_tokens": "674fc454bdc5ac93" + }, + "truncated": 660, + "non-truncated": 0, + "padded": 0, + "non-padded": 660, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_geography|5": { + "hashes": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "e7e9cf91f9d6a081", + "hash_cont_tokens": "03a5012b916274ea" + }, + "truncated": 0, + "non-truncated": 792, + "padded": 792, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hashes": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "a61a1670f854d9e1", + "hash_cont_tokens": "a83effb8f76b7d7c" + }, + "truncated": 0, + "non-truncated": 772, + "padded": 772, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hashes": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "8a77cb7763f28110", + "hash_cont_tokens": "c583432ad27fcfe0" + }, + "truncated": 0, + "non-truncated": 1560, + "padded": 1560, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hashes": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "fcfcfae391f8faa1", + "hash_cont_tokens": "24f5dc613660300b" + }, + "truncated": 0, + "non-truncated": 1080, + "padded": 1080, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hashes": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "a29454cc1feb23ef", + "hash_cont_tokens": "f47f041de50333b9" + }, + "truncated": 0, + "non-truncated": 952, + "padded": 952, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_physics|5": { + "hashes": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "b6734a25556d75dc", + "hash_cont_tokens": "ba2efcd283e938cc" + }, + "truncated": 0, + "non-truncated": 604, + "padded": 604, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hashes": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "5720438e29473426", + "hash_cont_tokens": "942069cd363844d9" + }, + "truncated": 0, + "non-truncated": 2180, + "padded": 2180, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hashes": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "486321d5858de240", + "hash_cont_tokens": "955ed42b6f7fa019" + }, + "truncated": 0, + "non-truncated": 864, + "padded": 864, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hashes": { + 
"hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "50c9ff438c85a69e", + "hash_cont_tokens": "cdd0b3dc06d933e5" + }, + "truncated": 816, + "non-truncated": 0, + "padded": 0, + "non-padded": 816, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hashes": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "473919e64d1b8c80", + "hash_cont_tokens": "9a864184946033ac" + }, + "truncated": 8, + "non-truncated": 940, + "padded": 940, + "non-padded": 8, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_aging|5": { + "hashes": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "47a65c81fd7ed010", + "hash_cont_tokens": "142a4a8a1138a214" + }, + "truncated": 0, + "non-truncated": 892, + "padded": 892, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_sexuality|5": { + "hashes": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "aedfcd41cbd2fcc9", + "hash_cont_tokens": "bc54813e809b796d" + }, + "truncated": 0, + "non-truncated": 524, + "padded": 524, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-international_law|5": { + "hashes": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "ed5f2414144d7b72", + "hash_cont_tokens": "dc45b45fcda18e5d" + }, + "truncated": 0, + "non-truncated": 484, + "padded": 484, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-jurisprudence|5": { + "hashes": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "692eaacb5b747264", + "hash_cont_tokens": "e3a8cd951b6e3469" + }, + "truncated": 0, + "non-truncated": 432, + "padded": 432, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hashes": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "2cbce4edca937588", + "hash_cont_tokens": "1e80dbd30f6453d5" + }, + "truncated": 0, + "non-truncated": 652, + "padded": 648, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-machine_learning|5": { + "hashes": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "c2f38b19bab1aa2c", + "hash_cont_tokens": "9b37da7777378ca9" + }, + "truncated": 0, + "non-truncated": 448, + "padded": 448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-management|5": { + "hashes": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "fde277bc547bc3d8", + "hash_cont_tokens": "a01d6d39a83c4597" + }, + "truncated": 0, + "non-truncated": 412, + "padded": 412, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-marketing|5": { + "hashes": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "87b232bbebce39db", + "hash_cont_tokens": "6aeaed4d823c98aa" + }, + 
"truncated": 0, + "non-truncated": 936, + "padded": 936, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-medical_genetics|5": { + "hashes": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "58c21af9da3e126e", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-miscellaneous|5": { + "hashes": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "d1f5c770d368e9c6", + "hash_cont_tokens": "9b0ab02a64603081" + }, + "truncated": 0, + "non-truncated": 3132, + "padded": 3132, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_disputes|5": { + "hashes": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "98d6db15a50aaa8e", + "hash_cont_tokens": "8badf768f7b0467a" + }, + "truncated": 0, + "non-truncated": 1384, + "padded": 1354, + "non-padded": 30, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hashes": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "2aabd8c7337502f8", + "hash_cont_tokens": "32ae620376b2bbba" + }, + "truncated": 0, + "non-truncated": 3580, + "padded": 3580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-nutrition|5": { + "hashes": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "17f8c8f2d4a0a9b1", + "hash_cont_tokens": "3071def75bacc404" + }, + "truncated": 0, + "non-truncated": 1224, + "padded": 1224, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-philosophy|5": { + "hashes": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "dfc6df491d991966", + "hash_cont_tokens": "9f6ff69d23a48783" + }, + "truncated": 0, + "non-truncated": 1244, + "padded": 1244, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-prehistory|5": { + "hashes": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "cffe8139e00da9dd", + "hash_cont_tokens": "de469d2b981e32a3" + }, + "truncated": 0, + "non-truncated": 1296, + "padded": 1296, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_accounting|5": { + "hashes": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "4a69ed6ee55918fb", + "hash_cont_tokens": "c46f74d2dfc7b13b" + }, + "truncated": 0, + "non-truncated": 1128, + "padded": 1128, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_law|5": { + "hashes": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "6cc713f12b5890de", + "hash_cont_tokens": "2e590029ef41fbcd" + }, + "truncated": 604, + "non-truncated": 5532, + "padded": 5524, + "non-padded": 612, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + 
"harness|hendrycksTest-professional_medicine|5": { + "hashes": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "b4044fc92756c377", + "hash_cont_tokens": "fe35cfa9c6ca802e" + }, + "truncated": 0, + "non-truncated": 1088, + "padded": 1088, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_psychology|5": { + "hashes": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "b019784da8db089a", + "hash_cont_tokens": "f020fbddf72c8652" + }, + "truncated": 0, + "non-truncated": 2448, + "padded": 2448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-public_relations|5": { + "hashes": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "f47f37c7c9bfc601", + "hash_cont_tokens": "568f585a259965c1" + }, + "truncated": 0, + "non-truncated": 440, + "padded": 440, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-security_studies|5": { + "hashes": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "4d282718d6142410", + "hash_cont_tokens": "cc6fd7cccd64cd5d" + }, + "truncated": 0, + "non-truncated": 980, + "padded": 980, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-sociology|5": { + "hashes": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "fbc6026e500537bc", + "hash_cont_tokens": "c3a3bdfd177eed5b" + }, + "truncated": 0, + "non-truncated": 804, + "padded": 804, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hashes": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "150dd1ff81ff642e", + "hash_cont_tokens": "2568d0e8e36fa959" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-virology|5": { + "hashes": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "fcbac3e735545969", + "hash_cont_tokens": "926cf60b0891f374" + }, + "truncated": 0, + "non-truncated": 664, + "padded": 664, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-world_religions|5": { + "hashes": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "ffc962a38441ef13", + "hash_cont_tokens": "c525a5de974c1ea3" + }, + "truncated": 0, + "non-truncated": 684, + "padded": 684, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|truthfulqa:mc|0": { + "hashes": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "9ffb65d225ae550f", + "hash_cont_tokens": "c014154380b74b9e" + }, + "truncated": 0, + "non-truncated": 9996, + "padded": 9996, + "non-padded": 0, + "effective_few_shots": 0.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "d84d18e9a963753d", + "hash_full_prompts": "12b540783521a8e6", + "hash_input_tokens": "1c61d6705b299f5c", + 
"hash_cont_tokens": "fb1646e2bdd5fc38" + }, + "total_evaluation_time_secondes": "4732.123983621597", + "truncated": 2088, + "non-truncated": 108931, + "padded": 108834, + "non-padded": 2185, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/TheBloke/medalpaca-13B-GPTQ-4bit/results_2023-11-05T14-02-24.762310.json b/eval-results/TheBloke/medalpaca-13B-GPTQ-4bit/results_2023-11-05T14-02-24.762310.json new file mode 100644 index 0000000000000000000000000000000000000000..a67691a6dd878b3dfa56e9eff645f78959f495b8 --- /dev/null +++ b/eval-results/TheBloke/medalpaca-13B-GPTQ-4bit/results_2023-11-05T14-02-24.762310.json @@ -0,0 +1,107 @@ +{ + "config_general": { + "lighteval_sha": "167773f1d5d1647c60dadc31c9e731ab7dbcbbad", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "", + "model_name": "TheBloke/medalpaca-13B-GPTQ-4bit", + "model_sha": "461a1b6082640b4f96e5290457d2c2263ffb8f6b", + "model_dtype": "torch.float16", + "model_size": "6.8 GB" + }, + "results": { + "harness|drop|3": { + "em": 0.06994546979865772, + "em_stderr": 0.0026120028912023246, + "f1": 0.12759542785234868, + "f1_stderr": 0.002878081471947882 + }, + "harness|gsm8k|5": { + "acc": 0.0, + "acc_stderr": 0.0 + }, + "harness|winogrande|5": { + "acc": 0.5311760063141279, + "acc_stderr": 0.01402514264063952 + }, + "all": { + "em": 0.06994546979865772, + "em_stderr": 0.0026120028912023246, + "f1": 0.12759542785234868, + "f1_stderr": 0.002878081471947882, + "acc": 0.26558800315706393, + "acc_stderr": 0.00701257132031976 + } + }, + "versions": { + "all": 0, + "harness|drop|3": 1, + "harness|gsm8k|5": 0, + "harness|winogrande|5": 0 + }, + "config_tasks": { + "harness|drop": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|drop|3": { + "hashes": { + "hash_examples": "1d27416e8324e9a3", + "hash_full_prompts": "a5513ff9a741b385", + "hash_input_tokens": "afa3f956b5946008", + "hash_cont_tokens": "b750ee345b880dd3" + }, + "truncated": 1263, + "non_truncated": 8273, + "padded": 0, + "non_padded": 9536, + "effective_few_shots": 3.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "6f81fd8346219949", + "hash_cont_tokens": "81fd31b171045d47" + }, + "truncated": 0, + "non_truncated": 1319, + "padded": 0, + "non_padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "b84988137a00c1f4", + "hash_cont_tokens": "f08975ad6f2d5864" + }, + "truncated": 0, + "non_truncated": 1267, + "padded": 2432, + "non_padded": 102, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "9b4d8993161e637d", + "hash_full_prompts": "08215e527b7e60a5", + "hash_input_tokens": "fb75bd68fb756923", + "hash_cont_tokens": "0304d06a96a298c7" + }, + "truncated": 1263, + "non_truncated": 10859, + "padded": 2432, + "non_padded": 10957, + "num_truncated_few_shots": 0, + "total_evaluation_time_secondes": 0 + } +} \ No newline at end of file diff --git a/eval-results/TheBloke/medalpaca-13B-GPTQ-4bit/results_2023-11-07T11-22-05.804023.json b/eval-results/TheBloke/medalpaca-13B-GPTQ-4bit/results_2023-11-07T11-22-05.804023.json new 
file mode 100644 index 0000000000000000000000000000000000000000..ad57a9adbedc75b8388e8c6dea8700e974992e6f --- /dev/null +++ b/eval-results/TheBloke/medalpaca-13B-GPTQ-4bit/results_2023-11-07T11-22-05.804023.json @@ -0,0 +1,107 @@ +{ + "config_general": { + "lighteval_sha": "167773f1d5d1647c60dadc31c9e731ab7dbcbbad", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "", + "model_name": "TheBloke/medalpaca-13B-GPTQ-4bit", + "model_sha": "461a1b6082640b4f96e5290457d2c2263ffb8f6b", + "model_dtype": "torch.float16", + "model_size": "6.8 GB" + }, + "results": { + "harness|drop|3": { + "em": 0.06973573825503356, + "em_stderr": 0.0026083779557512714, + "f1": 0.12751992449664398, + "f1_stderr": 0.0028759868015646797 + }, + "harness|gsm8k|5": { + "acc": 0.0, + "acc_stderr": 0.0 + }, + "harness|winogrande|5": { + "acc": 0.5311760063141279, + "acc_stderr": 0.01402514264063952 + }, + "all": { + "em": 0.06973573825503356, + "em_stderr": 0.0026083779557512714, + "f1": 0.12751992449664398, + "f1_stderr": 0.0028759868015646797, + "acc": 0.26558800315706393, + "acc_stderr": 0.00701257132031976 + } + }, + "versions": { + "all": 0, + "harness|drop|3": 1, + "harness|gsm8k|5": 0, + "harness|winogrande|5": 0 + }, + "config_tasks": { + "harness|drop": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|drop|3": { + "hashes": { + "hash_examples": "1d27416e8324e9a3", + "hash_full_prompts": "a5513ff9a741b385", + "hash_input_tokens": "afa3f956b5946008", + "hash_cont_tokens": "1721c9b5436ef713" + }, + "truncated": 1263, + "non_truncated": 8273, + "padded": 0, + "non_padded": 9536, + "effective_few_shots": 3.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "6f81fd8346219949", + "hash_cont_tokens": "8b6ab3ee5e584ba1" + }, + "truncated": 0, + "non_truncated": 1319, + "padded": 0, + "non_padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "b84988137a00c1f4", + "hash_cont_tokens": "f08975ad6f2d5864" + }, + "truncated": 0, + "non_truncated": 1267, + "padded": 2432, + "non_padded": 102, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "9b4d8993161e637d", + "hash_full_prompts": "08215e527b7e60a5", + "hash_input_tokens": "fb75bd68fb756923", + "hash_cont_tokens": "36d0052bd9eb0f34" + }, + "truncated": 1263, + "non_truncated": 10859, + "padded": 2432, + "non_padded": 10957, + "num_truncated_few_shots": 0, + "total_evaluation_time_secondes": 0 + } +} \ No newline at end of file diff --git a/eval-results/TheBloke/neural-chat-7B-v3-2-GPTQ/results_2023-12-11T00-12-21.907526.json b/eval-results/TheBloke/neural-chat-7B-v3-2-GPTQ/results_2023-12-11T00-12-21.907526.json new file mode 100644 index 0000000000000000000000000000000000000000..4fe7946d005cba433a06389cf3d99dc51196d1e3 --- /dev/null +++ b/eval-results/TheBloke/neural-chat-7B-v3-2-GPTQ/results_2023-12-11T00-12-21.907526.json @@ -0,0 +1,1409 @@ +{ + "config_general": { + "lighteval_sha": "0e4607eff593f6f842aeaa0e5fa6760f58b9d1e9", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "", + 
"start_time": 698617.606478256, + "end_time": 708410.862221901, + "total_evaluation_time_secondes": "9793.255743644899", + "model_name": "TheBloke/neural-chat-7B-v3-2-GPTQ", + "model_sha": "cfe57da77e55efcb0e1087dc3948aeaa6ca55c74", + "model_dtype": "None", + "model_size": "4.37 GB" + }, + "results": { + "harness|arc:challenge|25": { + "acc": 0.6296928327645052, + "acc_stderr": 0.01411129875167495, + "acc_norm": 0.659556313993174, + "acc_norm_stderr": 0.013847460518892978 + }, + "harness|hellaswag|10": { + "acc": 0.6360286795459071, + "acc_stderr": 0.004801572028920794, + "acc_norm": 0.8324039036048596, + "acc_norm_stderr": 0.003727438786513393 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.24, + "acc_stderr": 0.04292346959909283, + "acc_norm": 0.24, + "acc_norm_stderr": 0.04292346959909283 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.5407407407407407, + "acc_stderr": 0.04304979692464241, + "acc_norm": 0.5407407407407407, + "acc_norm_stderr": 0.04304979692464241 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.6710526315789473, + "acc_stderr": 0.03823428969926605, + "acc_norm": 0.6710526315789473, + "acc_norm_stderr": 0.03823428969926605 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.59, + "acc_stderr": 0.049431107042371025, + "acc_norm": 0.59, + "acc_norm_stderr": 0.049431107042371025 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.660377358490566, + "acc_stderr": 0.02914690474779833, + "acc_norm": 0.660377358490566, + "acc_norm_stderr": 0.02914690474779833 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.7152777777777778, + "acc_stderr": 0.037738099906869334, + "acc_norm": 0.7152777777777778, + "acc_norm_stderr": 0.037738099906869334 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.4, + "acc_stderr": 0.04923659639173309, + "acc_norm": 0.4, + "acc_norm_stderr": 0.04923659639173309 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.49, + "acc_stderr": 0.05024183937956912, + "acc_norm": 0.49, + "acc_norm_stderr": 0.05024183937956912 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.35, + "acc_stderr": 0.047937248544110196, + "acc_norm": 0.35, + "acc_norm_stderr": 0.047937248544110196 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.5722543352601156, + "acc_stderr": 0.03772446857518027, + "acc_norm": 0.5722543352601156, + "acc_norm_stderr": 0.03772446857518027 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.49019607843137253, + "acc_stderr": 0.04974229460422817, + "acc_norm": 0.49019607843137253, + "acc_norm_stderr": 0.04974229460422817 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.72, + "acc_stderr": 0.04512608598542127, + "acc_norm": 0.72, + "acc_norm_stderr": 0.04512608598542127 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.5148936170212766, + "acc_stderr": 0.03267151848924777, + "acc_norm": 0.5148936170212766, + "acc_norm_stderr": 0.03267151848924777 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.4298245614035088, + "acc_stderr": 0.04657047260594964, + "acc_norm": 0.4298245614035088, + "acc_norm_stderr": 0.04657047260594964 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.5310344827586206, + "acc_stderr": 0.04158632762097828, + "acc_norm": 0.5310344827586206, + "acc_norm_stderr": 0.04158632762097828 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.3544973544973545, + "acc_stderr": 0.024636830602842, + "acc_norm": 
0.3544973544973545, + "acc_norm_stderr": 0.024636830602842 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.3968253968253968, + "acc_stderr": 0.043758884927270605, + "acc_norm": 0.3968253968253968, + "acc_norm_stderr": 0.043758884927270605 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.39, + "acc_stderr": 0.04902071300001975, + "acc_norm": 0.39, + "acc_norm_stderr": 0.04902071300001975 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.7419354838709677, + "acc_stderr": 0.02489246917246283, + "acc_norm": 0.7419354838709677, + "acc_norm_stderr": 0.02489246917246283 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.47783251231527096, + "acc_stderr": 0.03514528562175007, + "acc_norm": 0.47783251231527096, + "acc_norm_stderr": 0.03514528562175007 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.68, + "acc_stderr": 0.04688261722621504, + "acc_norm": 0.68, + "acc_norm_stderr": 0.04688261722621504 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.7515151515151515, + "acc_stderr": 0.033744026441394036, + "acc_norm": 0.7515151515151515, + "acc_norm_stderr": 0.033744026441394036 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.7626262626262627, + "acc_stderr": 0.0303137105381989, + "acc_norm": 0.7626262626262627, + "acc_norm_stderr": 0.0303137105381989 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.8341968911917098, + "acc_stderr": 0.026839845022314415, + "acc_norm": 0.8341968911917098, + "acc_norm_stderr": 0.026839845022314415 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.5948717948717949, + "acc_stderr": 0.024890471769938145, + "acc_norm": 0.5948717948717949, + "acc_norm_stderr": 0.024890471769938145 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.2962962962962963, + "acc_stderr": 0.027840811495871923, + "acc_norm": 0.2962962962962963, + "acc_norm_stderr": 0.027840811495871923 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.6638655462184874, + "acc_stderr": 0.030684737115135356, + "acc_norm": 0.6638655462184874, + "acc_norm_stderr": 0.030684737115135356 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.36423841059602646, + "acc_stderr": 0.03929111781242742, + "acc_norm": 0.36423841059602646, + "acc_norm_stderr": 0.03929111781242742 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.7963302752293578, + "acc_stderr": 0.017266742087630783, + "acc_norm": 0.7963302752293578, + "acc_norm_stderr": 0.017266742087630783 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.4861111111111111, + "acc_stderr": 0.03408655867977749, + "acc_norm": 0.4861111111111111, + "acc_norm_stderr": 0.03408655867977749 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.75, + "acc_stderr": 0.03039153369274154, + "acc_norm": 0.75, + "acc_norm_stderr": 0.03039153369274154 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.7848101265822784, + "acc_stderr": 0.02675082699467617, + "acc_norm": 0.7848101265822784, + "acc_norm_stderr": 0.02675082699467617 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.6771300448430493, + "acc_stderr": 0.031381476375754995, + "acc_norm": 0.6771300448430493, + "acc_norm_stderr": 0.031381476375754995 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.6564885496183206, + "acc_stderr": 0.041649760719448786, + "acc_norm": 0.6564885496183206, + 
"acc_norm_stderr": 0.041649760719448786 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.7851239669421488, + "acc_stderr": 0.037494924487096966, + "acc_norm": 0.7851239669421488, + "acc_norm_stderr": 0.037494924487096966 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.7037037037037037, + "acc_stderr": 0.04414343666854934, + "acc_norm": 0.7037037037037037, + "acc_norm_stderr": 0.04414343666854934 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.6319018404907976, + "acc_stderr": 0.03789213935838396, + "acc_norm": 0.6319018404907976, + "acc_norm_stderr": 0.03789213935838396 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.48214285714285715, + "acc_stderr": 0.047427623612430116, + "acc_norm": 0.48214285714285715, + "acc_norm_stderr": 0.047427623612430116 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.7669902912621359, + "acc_stderr": 0.04185832598928315, + "acc_norm": 0.7669902912621359, + "acc_norm_stderr": 0.04185832598928315 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.8205128205128205, + "acc_stderr": 0.02514093595033544, + "acc_norm": 0.8205128205128205, + "acc_norm_stderr": 0.02514093595033544 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.68, + "acc_stderr": 0.046882617226215034, + "acc_norm": 0.68, + "acc_norm_stderr": 0.046882617226215034 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.7841634738186463, + "acc_stderr": 0.014711684386139953, + "acc_norm": 0.7841634738186463, + "acc_norm_stderr": 0.014711684386139953 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.6358381502890174, + "acc_stderr": 0.02590663263101613, + "acc_norm": 0.6358381502890174, + "acc_norm_stderr": 0.02590663263101613 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.39106145251396646, + "acc_stderr": 0.016320763763808383, + "acc_norm": 0.39106145251396646, + "acc_norm_stderr": 0.016320763763808383 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.6830065359477124, + "acc_stderr": 0.026643278474508755, + "acc_norm": 0.6830065359477124, + "acc_norm_stderr": 0.026643278474508755 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.6881028938906752, + "acc_stderr": 0.026311858071854155, + "acc_norm": 0.6881028938906752, + "acc_norm_stderr": 0.026311858071854155 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.6728395061728395, + "acc_stderr": 0.02610567386140983, + "acc_norm": 0.6728395061728395, + "acc_norm_stderr": 0.02610567386140983 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.42907801418439717, + "acc_stderr": 0.029525914302558555, + "acc_norm": 0.42907801418439717, + "acc_norm_stderr": 0.029525914302558555 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.4165580182529335, + "acc_stderr": 0.012591153245057392, + "acc_norm": 0.4165580182529335, + "acc_norm_stderr": 0.012591153245057392 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.5625, + "acc_stderr": 0.030134614954403924, + "acc_norm": 0.5625, + "acc_norm_stderr": 0.030134614954403924 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.6045751633986928, + "acc_stderr": 0.01978046595477751, + "acc_norm": 0.6045751633986928, + "acc_norm_stderr": 0.01978046595477751 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.6181818181818182, + "acc_stderr": 0.046534298079135075, + "acc_norm": 0.6181818181818182, + "acc_norm_stderr": 0.046534298079135075 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 
0.7061224489795919, + "acc_stderr": 0.029162738410249772, + "acc_norm": 0.7061224489795919, + "acc_norm_stderr": 0.029162738410249772 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.7860696517412935, + "acc_stderr": 0.02899690969332891, + "acc_norm": 0.7860696517412935, + "acc_norm_stderr": 0.02899690969332891 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.83, + "acc_stderr": 0.0377525168068637, + "acc_norm": 0.83, + "acc_norm_stderr": 0.0377525168068637 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.5421686746987951, + "acc_stderr": 0.0387862677100236, + "acc_norm": 0.5421686746987951, + "acc_norm_stderr": 0.0387862677100236 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.7719298245614035, + "acc_stderr": 0.03218093795602357, + "acc_norm": 0.7719298245614035, + "acc_norm_stderr": 0.03218093795602357 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.4541003671970624, + "mc1_stderr": 0.017429593091323522, + "mc2": 0.5979099902582387, + "mc2_stderr": 0.01509977856693472 + }, + "harness|winogrande|5": { + "acc": 0.7947908445146015, + "acc_stderr": 0.01135031570746206 + }, + "harness|gsm8k|5": { + "acc": 0.5284306292645944, + "acc_stderr": 0.013750202076584419 + }, + "all": { + "acc": 0.6058481456466821, + "acc_stderr": 0.03323160720607251, + "acc_norm": 0.6077924426433228, + "acc_norm_stderr": 0.033909992378155715, + "mc1": 0.4541003671970624, + "mc1_stderr": 0.017429593091323522, + "mc2": 0.5979099902582387, + "mc2_stderr": 0.01509977856693472 + } + }, + "versions": { + "all": 0, + "harness|arc:challenge|25": 0, + "harness|gsm8k|5": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, 
+ "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "harness|winogrande|5": 0 + }, + "config_tasks": { + "harness|arc:challenge": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + 
"harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|arc:challenge|25": { + "hashes": { + "hash_examples": "17b0cae357c0259e", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "9bcd0d1d37471713", + "hash_cont_tokens": "289aa98c400841d8" + }, + "truncated": 0, + "non_truncated": 1172, + "padded": 4670, + "non_padded": 17, + "effective_few_shots": 25.0, + "num_truncated_few_shots": 0 + }, + "harness|hellaswag|10": { + "hashes": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "80b8c6d79740318e", + "hash_cont_tokens": "ac460260c3e6efc9" + }, + "truncated": 0, + "non_truncated": 10042, + "padded": 40101, + "non_padded": 67, + "effective_few_shots": 10.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hashes": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "b813d36287c6556c", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-anatomy|5": { + "hashes": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "09dc2380497f7a47", + "hash_cont_tokens": "a52a4f60d98cbe5c" + }, + "truncated": 0, + "non_truncated": 135, + "padded": 540, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-astronomy|5": { + "hashes": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "68ca3220b0fdd1f3", + "hash_cont_tokens": "10f7d8eeba97841d" + }, + "truncated": 0, + "non_truncated": 152, + "padded": 608, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-business_ethics|5": { + "hashes": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "bd14ef1320de241e", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, 
+ "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hashes": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "d96186ab98017c43", + "hash_cont_tokens": "edef9975ba9165b5" + }, + "truncated": 0, + "non_truncated": 265, + "padded": 1060, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_biology|5": { + "hashes": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "424136b34e95b200", + "hash_cont_tokens": "0aa103ec6602280b" + }, + "truncated": 0, + "non_truncated": 144, + "padded": 576, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_chemistry|5": { + "hashes": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "8dd8b80e336bbe54", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_computer_science|5": { + "hashes": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "145d4cef8ca2261d", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_mathematics|5": { + "hashes": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "561995d32d2b25c4", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_medicine|5": { + "hashes": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "6a258a9d4418599c", + "hash_cont_tokens": "1979021dbc698754" + }, + "truncated": 0, + "non_truncated": 173, + "padded": 692, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_physics|5": { + "hashes": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "fa5e0d5b5f97b66a", + "hash_cont_tokens": "7cf7fe2bab00acbd" + }, + "truncated": 0, + "non_truncated": 102, + "padded": 408, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-computer_security|5": { + "hashes": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "07d27397edfae492", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hashes": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "da5e6c3c8eb17da6", + "hash_cont_tokens": "903f64eed2b0d217" + }, + "truncated": 0, + "non_truncated": 235, + "padded": 940, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-econometrics|5": { + "hashes": { + "hash_examples": 
"64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "f6ba8e358bdb523e", + "hash_cont_tokens": "721ae6c5302c4bf2" + }, + "truncated": 0, + "non_truncated": 114, + "padded": 456, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hashes": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "b2459da4c5ca8590", + "hash_cont_tokens": "15a738960ed3e587" + }, + "truncated": 0, + "non_truncated": 145, + "padded": 575, + "non_padded": 5, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hashes": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "0b969d9ad706a13a", + "hash_cont_tokens": "c96470462fc71683" + }, + "truncated": 0, + "non_truncated": 378, + "padded": 1512, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-formal_logic|5": { + "hashes": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "02bc3eb5f90da86e", + "hash_cont_tokens": "0e1ce025c9d6ee7e" + }, + "truncated": 0, + "non_truncated": 126, + "padded": 504, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-global_facts|5": { + "hashes": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "3d5106918bcbeb43", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_biology|5": { + "hashes": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "7b089392db2dabbd", + "hash_cont_tokens": "e34d57f7d3c4ca16" + }, + "truncated": 0, + "non_truncated": 310, + "padded": 1240, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hashes": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "ba90b2ffed1c067d", + "hash_cont_tokens": "e8482d44df4b3740" + }, + "truncated": 0, + "non_truncated": 203, + "padded": 812, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hashes": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "60eeec309ef0717f", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hashes": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "5e5e8bf3808e0ead", + "hash_cont_tokens": "d63e679a49418339" + }, + "truncated": 0, + "non_truncated": 165, + "padded": 656, + "non_padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_geography|5": { + "hashes": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "4da9b741d4e7ea78", + 
"hash_cont_tokens": "d78483e286d06f1a" + }, + "truncated": 0, + "non_truncated": 198, + "padded": 792, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hashes": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "acb4bc872ac86ed7", + "hash_cont_tokens": "691cdff71ff5fe57" + }, + "truncated": 0, + "non_truncated": 193, + "padded": 772, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hashes": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "840fc6403eb69ab0", + "hash_cont_tokens": "d5ad4c5bdca967ad" + }, + "truncated": 0, + "non_truncated": 390, + "padded": 1560, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hashes": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "3629a7f2cd17faeb", + "hash_cont_tokens": "8f631ca5687dd0d4" + }, + "truncated": 0, + "non_truncated": 270, + "padded": 1080, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hashes": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "6846f684260e3997", + "hash_cont_tokens": "7321048a28451473" + }, + "truncated": 0, + "non_truncated": 238, + "padded": 952, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_physics|5": { + "hashes": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "85aee25d6bdad94a", + "hash_cont_tokens": "bb137581f269861c" + }, + "truncated": 0, + "non_truncated": 151, + "padded": 604, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hashes": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "290b66d6d666a35f", + "hash_cont_tokens": "b455cab2675bd863" + }, + "truncated": 0, + "non_truncated": 545, + "padded": 2180, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hashes": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "a77a7668b437bc82", + "hash_cont_tokens": "1b3196fec7e58037" + }, + "truncated": 0, + "non_truncated": 216, + "padded": 864, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hashes": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "63548c7fa9ba7a78", + "hash_cont_tokens": "a331dedc2aa01b3e" + }, + "truncated": 0, + "non_truncated": 204, + "padded": 816, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hashes": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "83c5da18bfa50812", + "hash_cont_tokens": "d0fbe030b8c8c2bf" + }, + "truncated": 0, + "non_truncated": 
237, + "padded": 948, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_aging|5": { + "hashes": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "bebbd11f22006685", + "hash_cont_tokens": "1dd29c3755494850" + }, + "truncated": 0, + "non_truncated": 223, + "padded": 892, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_sexuality|5": { + "hashes": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "7b85ee9b8ee54f4f", + "hash_cont_tokens": "c85573f663c10691" + }, + "truncated": 0, + "non_truncated": 131, + "padded": 524, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-international_law|5": { + "hashes": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "7bfc55ab7065943e", + "hash_cont_tokens": "d263804ba918154f" + }, + "truncated": 0, + "non_truncated": 121, + "padded": 484, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-jurisprudence|5": { + "hashes": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "69573f1675e053c6", + "hash_cont_tokens": "581986691a84ece8" + }, + "truncated": 0, + "non_truncated": 108, + "padded": 432, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hashes": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "552324ef20094bdc", + "hash_cont_tokens": "55a858b28bbda458" + }, + "truncated": 0, + "non_truncated": 163, + "padded": 652, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-machine_learning|5": { + "hashes": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "96449357a7318905", + "hash_cont_tokens": "e99d3d3efd4ac7a3" + }, + "truncated": 0, + "non_truncated": 112, + "padded": 448, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-management|5": { + "hashes": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "3b849249168e3b88", + "hash_cont_tokens": "13d9dc56bca34726" + }, + "truncated": 0, + "non_truncated": 103, + "padded": 412, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-marketing|5": { + "hashes": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "af0e186f2756b70d", + "hash_cont_tokens": "2700ea26933916a2" + }, + "truncated": 0, + "non_truncated": 234, + "padded": 936, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-medical_genetics|5": { + "hashes": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "9f6a6de16509b6d9", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-miscellaneous|5": { + "hashes": { + 
"hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "9194406d589f7c10", + "hash_cont_tokens": "7bf4341c79587250" + }, + "truncated": 0, + "non_truncated": 783, + "padded": 3132, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_disputes|5": { + "hashes": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "769486efc74d9f8e", + "hash_cont_tokens": "38a48e9de6976f00" + }, + "truncated": 0, + "non_truncated": 346, + "padded": 1384, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hashes": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "a90fd4dd90959dad", + "hash_cont_tokens": "761c4dc187689d89" + }, + "truncated": 0, + "non_truncated": 895, + "padded": 3580, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-nutrition|5": { + "hashes": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "1a3b843e66efd29b", + "hash_cont_tokens": "65005bd7d6f6012a" + }, + "truncated": 0, + "non_truncated": 306, + "padded": 1224, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-philosophy|5": { + "hashes": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "09820001a3d00013", + "hash_cont_tokens": "0b47934fb6314dec" + }, + "truncated": 0, + "non_truncated": 311, + "padded": 1244, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-prehistory|5": { + "hashes": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "7c4ec364ce2768c7", + "hash_cont_tokens": "3f20acd855ee0a29" + }, + "truncated": 0, + "non_truncated": 324, + "padded": 1296, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_accounting|5": { + "hashes": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "ced0534574d0ae3f", + "hash_cont_tokens": "8f122ba881355d4b" + }, + "truncated": 0, + "non_truncated": 282, + "padded": 1128, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_law|5": { + "hashes": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "bcbdbbde22ec73e3", + "hash_cont_tokens": "90d5df417c4d3fd3" + }, + "truncated": 0, + "non_truncated": 1534, + "padded": 6136, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_medicine|5": { + "hashes": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "c54d753563114d45", + "hash_cont_tokens": "4a2d2988884f7f70" + }, + "truncated": 0, + "non_truncated": 272, + "padded": 1088, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_psychology|5": { + "hashes": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "b75dc55c0e32fa52", + "hash_cont_tokens": 
"e0a952cb8a9c81de" + }, + "truncated": 0, + "non_truncated": 612, + "padded": 2448, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-public_relations|5": { + "hashes": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "5ccdc8ec8db99622", + "hash_cont_tokens": "1fa77a8dff3922b8" + }, + "truncated": 0, + "non_truncated": 110, + "padded": 440, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-security_studies|5": { + "hashes": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "ca8497342e5b1d57", + "hash_cont_tokens": "81fc9cb3cbdd52db" + }, + "truncated": 0, + "non_truncated": 245, + "padded": 980, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-sociology|5": { + "hashes": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "069c76424fbd3dab", + "hash_cont_tokens": "2a0493252ed2cf43" + }, + "truncated": 0, + "non_truncated": 201, + "padded": 804, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hashes": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "a7e393a626169576", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-virology|5": { + "hashes": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "bf99dc973e3a650d", + "hash_cont_tokens": "5ab892d003b00c98" + }, + "truncated": 0, + "non_truncated": 166, + "padded": 664, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-world_religions|5": { + "hashes": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "1761cfaf21797065", + "hash_cont_tokens": "15a5e5dbdfbb8568" + }, + "truncated": 0, + "non_truncated": 171, + "padded": 684, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|truthfulqa:mc|0": { + "hashes": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "298b43914bbdf4ca", + "hash_cont_tokens": "5a8d4bb398b1c3c0" + }, + "truncated": 0, + "non_truncated": 817, + "padded": 9996, + "non_padded": 0, + "effective_few_shots": 0.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "31aa3477d959f771", + "hash_cont_tokens": "618558fb93c0f288" + }, + "truncated": 0, + "non_truncated": 1267, + "padded": 2534, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "6af0ae8cfe684f50", + "hash_cont_tokens": "82651fa89adaef98" + }, + "truncated": 0, + "non_truncated": 1319, + "padded": 0, + "non_padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": 
"3b7fa57a057f9415", + "hash_full_prompts": "63615fc50fc9417c", + "hash_input_tokens": "9c04e828ae29cacc", + "hash_cont_tokens": "7c1b22a1fe265a8c" + }, + "truncated": 0, + "non_truncated": 28659, + "padded": 113460, + "non_padded": 1412, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/TheBloke/openchat_v2_openorca_preview-GPTQ/results_2023-08-22T11-30-59.875390.json b/eval-results/TheBloke/openchat_v2_openorca_preview-GPTQ/results_2023-08-22T11-30-59.875390.json new file mode 100644 index 0000000000000000000000000000000000000000..35daf81f0dedebcf61150b30156ddafd23ec6762 --- /dev/null +++ b/eval-results/TheBloke/openchat_v2_openorca_preview-GPTQ/results_2023-08-22T11-30-59.875390.json @@ -0,0 +1,1365 @@ +{ + "results": { + "harness|arc:challenge|25": { + "acc": 0.23208191126279865, + "acc_stderr": 0.012336718284948854, + "acc_norm": 0.27986348122866894, + "acc_norm_stderr": 0.013119040897725922 + }, + "harness|hellaswag|10": { + "acc": 0.25851424019119695, + "acc_stderr": 0.004369232540125881, + "acc_norm": 0.2606054570802629, + "acc_norm_stderr": 0.004380678585341419 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.18, + "acc_stderr": 0.03861229196653695, + "acc_norm": 0.18, + "acc_norm_stderr": 0.03861229196653695 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.23703703703703705, + "acc_stderr": 0.03673731683969506, + "acc_norm": 0.23703703703703705, + "acc_norm_stderr": 0.03673731683969506 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.17763157894736842, + "acc_stderr": 0.031103182383123398, + "acc_norm": 0.17763157894736842, + "acc_norm_stderr": 0.031103182383123398 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.24, + "acc_stderr": 0.04292346959909284, + "acc_norm": 0.24, + "acc_norm_stderr": 0.04292346959909284 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.2679245283018868, + "acc_stderr": 0.02725726032249485, + "acc_norm": 0.2679245283018868, + "acc_norm_stderr": 0.02725726032249485 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.2569444444444444, + "acc_stderr": 0.03653946969442099, + "acc_norm": 0.2569444444444444, + "acc_norm_stderr": 0.03653946969442099 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.28, + "acc_stderr": 0.04512608598542127, + "acc_norm": 0.28, + "acc_norm_stderr": 0.04512608598542127 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.27, + "acc_stderr": 0.0446196043338474, + "acc_norm": 0.27, + "acc_norm_stderr": 0.0446196043338474 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.21, + "acc_stderr": 0.040936018074033256, + "acc_norm": 0.21, + "acc_norm_stderr": 0.040936018074033256 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.2254335260115607, + "acc_stderr": 0.03186209851641143, + "acc_norm": 0.2254335260115607, + "acc_norm_stderr": 0.03186209851641143 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.35294117647058826, + "acc_stderr": 0.04755129616062947, + "acc_norm": 0.35294117647058826, + "acc_norm_stderr": 0.04755129616062947 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.23, + "acc_stderr": 0.04229525846816506, + "acc_norm": 0.23, + "acc_norm_stderr": 0.04229525846816506 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.18723404255319148, + "acc_stderr": 0.025501588341883603, + "acc_norm": 0.18723404255319148, + "acc_norm_stderr": 0.025501588341883603 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 
0.2807017543859649, + "acc_stderr": 0.042270544512322, + "acc_norm": 0.2807017543859649, + "acc_norm_stderr": 0.042270544512322 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.16551724137931034, + "acc_stderr": 0.030970559966224085, + "acc_norm": 0.16551724137931034, + "acc_norm_stderr": 0.030970559966224085 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.25132275132275134, + "acc_stderr": 0.022340482339643895, + "acc_norm": 0.25132275132275134, + "acc_norm_stderr": 0.022340482339643895 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.2857142857142857, + "acc_stderr": 0.04040610178208841, + "acc_norm": 0.2857142857142857, + "acc_norm_stderr": 0.04040610178208841 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.17, + "acc_stderr": 0.0377525168068637, + "acc_norm": 0.17, + "acc_norm_stderr": 0.0377525168068637 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.23548387096774193, + "acc_stderr": 0.024137632429337707, + "acc_norm": 0.23548387096774193, + "acc_norm_stderr": 0.024137632429337707 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.18719211822660098, + "acc_stderr": 0.027444924966882618, + "acc_norm": 0.18719211822660098, + "acc_norm_stderr": 0.027444924966882618 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.29, + "acc_stderr": 0.04560480215720683, + "acc_norm": 0.29, + "acc_norm_stderr": 0.04560480215720683 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.2545454545454545, + "acc_stderr": 0.03401506715249039, + "acc_norm": 0.2545454545454545, + "acc_norm_stderr": 0.03401506715249039 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.23737373737373738, + "acc_stderr": 0.030313710538198892, + "acc_norm": 0.23737373737373738, + "acc_norm_stderr": 0.030313710538198892 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.24352331606217617, + "acc_stderr": 0.030975436386845436, + "acc_norm": 0.24352331606217617, + "acc_norm_stderr": 0.030975436386845436 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.30256410256410254, + "acc_stderr": 0.02329088805377272, + "acc_norm": 0.30256410256410254, + "acc_norm_stderr": 0.02329088805377272 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.2, + "acc_stderr": 0.02438843043398766, + "acc_norm": 0.2, + "acc_norm_stderr": 0.02438843043398766 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.29411764705882354, + "acc_stderr": 0.029597329730978082, + "acc_norm": 0.29411764705882354, + "acc_norm_stderr": 0.029597329730978082 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.271523178807947, + "acc_stderr": 0.036313298039696545, + "acc_norm": 0.271523178807947, + "acc_norm_stderr": 0.036313298039696545 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.26788990825688075, + "acc_stderr": 0.01898746225797865, + "acc_norm": 0.26788990825688075, + "acc_norm_stderr": 0.01898746225797865 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.23148148148148148, + "acc_stderr": 0.02876511171804696, + "acc_norm": 0.23148148148148148, + "acc_norm_stderr": 0.02876511171804696 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.23529411764705882, + "acc_stderr": 0.02977177522814563, + "acc_norm": 0.23529411764705882, + "acc_norm_stderr": 0.02977177522814563 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 
0.21518987341772153, + "acc_stderr": 0.026750826994676177, + "acc_norm": 0.21518987341772153, + "acc_norm_stderr": 0.026750826994676177 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.19282511210762332, + "acc_stderr": 0.02647824096048936, + "acc_norm": 0.19282511210762332, + "acc_norm_stderr": 0.02647824096048936 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.2824427480916031, + "acc_stderr": 0.03948406125768361, + "acc_norm": 0.2824427480916031, + "acc_norm_stderr": 0.03948406125768361 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.19008264462809918, + "acc_stderr": 0.03581796951709282, + "acc_norm": 0.19008264462809918, + "acc_norm_stderr": 0.03581796951709282 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.24074074074074073, + "acc_stderr": 0.041331194402438404, + "acc_norm": 0.24074074074074073, + "acc_norm_stderr": 0.041331194402438404 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.22085889570552147, + "acc_stderr": 0.03259177392742177, + "acc_norm": 0.22085889570552147, + "acc_norm_stderr": 0.03259177392742177 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.3125, + "acc_stderr": 0.043994650575715215, + "acc_norm": 0.3125, + "acc_norm_stderr": 0.043994650575715215 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.24271844660194175, + "acc_stderr": 0.042450224863844935, + "acc_norm": 0.24271844660194175, + "acc_norm_stderr": 0.042450224863844935 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.23504273504273504, + "acc_stderr": 0.027778835904935427, + "acc_norm": 0.23504273504273504, + "acc_norm_stderr": 0.027778835904935427 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.25, + "acc_stderr": 0.04351941398892446, + "acc_norm": 0.25, + "acc_norm_stderr": 0.04351941398892446 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.21328224776500637, + "acc_stderr": 0.014648172749593522, + "acc_norm": 0.21328224776500637, + "acc_norm_stderr": 0.014648172749593522 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.23410404624277456, + "acc_stderr": 0.022797110278071134, + "acc_norm": 0.23410404624277456, + "acc_norm_stderr": 0.022797110278071134 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.24804469273743016, + "acc_stderr": 0.014444157808261466, + "acc_norm": 0.24804469273743016, + "acc_norm_stderr": 0.014444157808261466 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.24509803921568626, + "acc_stderr": 0.02463004897982478, + "acc_norm": 0.24509803921568626, + "acc_norm_stderr": 0.02463004897982478 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.21221864951768488, + "acc_stderr": 0.023222756797435122, + "acc_norm": 0.21221864951768488, + "acc_norm_stderr": 0.023222756797435122 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.25308641975308643, + "acc_stderr": 0.024191808600713006, + "acc_norm": 0.25308641975308643, + "acc_norm_stderr": 0.024191808600713006 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.25886524822695034, + "acc_stderr": 0.026129572527180848, + "acc_norm": 0.25886524822695034, + "acc_norm_stderr": 0.026129572527180848 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.2529335071707953, + "acc_stderr": 0.01110226871383999, + "acc_norm": 0.2529335071707953, + "acc_norm_stderr": 0.01110226871383999 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.2610294117647059, + "acc_stderr": 0.02667925227010313, + "acc_norm": 0.2610294117647059, + 
"acc_norm_stderr": 0.02667925227010313 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.25326797385620914, + "acc_stderr": 0.017593486895366835, + "acc_norm": 0.25326797385620914, + "acc_norm_stderr": 0.017593486895366835 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.20909090909090908, + "acc_stderr": 0.03895091015724138, + "acc_norm": 0.20909090909090908, + "acc_norm_stderr": 0.03895091015724138 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.2897959183673469, + "acc_stderr": 0.029043088683304342, + "acc_norm": 0.2897959183673469, + "acc_norm_stderr": 0.029043088683304342 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.23880597014925373, + "acc_stderr": 0.030147775935409217, + "acc_norm": 0.23880597014925373, + "acc_norm_stderr": 0.030147775935409217 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.28, + "acc_stderr": 0.04512608598542129, + "acc_norm": 0.28, + "acc_norm_stderr": 0.04512608598542129 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.25301204819277107, + "acc_stderr": 0.03384429155233134, + "acc_norm": 0.25301204819277107, + "acc_norm_stderr": 0.03384429155233134 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.21052631578947367, + "acc_stderr": 0.031267817146631786, + "acc_norm": 0.21052631578947367, + "acc_norm_stderr": 0.031267817146631786 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.23378212974296206, + "mc1_stderr": 0.014816195991931598, + "mc2": 0.5007929816225261, + "mc2_stderr": 0.017079917935026806 + }, + "all": { + "acc": 0.24246694991848233, + "acc_stderr": 0.03117174175399139, + "acc_norm": 0.2433122513905999, + "acc_norm_stderr": 0.03118519546005906, + "mc1": 0.23378212974296206, + "mc1_stderr": 0.014816195991931598, + "mc2": 0.5007929816225261, + "mc2_stderr": 0.017079917935026806 + } + }, + "versions": { + "harness|arc:challenge|25": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + 
"harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "all": 0 + }, + "config_general": { + "model_name": "TheBloke/openchat_v2_openorca_preview-GPTQ", + "model_sha": "5a4c2ea612b71d7c00118f796db7189bc1a0c930", + "model_dtype": "torch.float16", + "lighteval_sha": "9a6ba3212080b87510982c30fdec55b87dcab0c7", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null + }, + "config_tasks": { + "harness|arc:challenge": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + 
"harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task" + }, + "summary_tasks": { + "harness|arc:challenge|25": { + "hashes": { + "hash_examples": "17b0cae357c0259e", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "61571bf68d6d89aa", + "hash_cont_tokens": "8210decc6ff6f7df" + }, + "truncated": 0, + "non-truncated": 4687, + "padded": 4687, + "non-padded": 0, + "effective_few_shots": 25.0, + "num_truncated_few_shots": 0 + }, + "harness|hellaswag|10": { + "hashes": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "29906669b1c7054a", + "hash_cont_tokens": "b3b9e9017afa63af" + }, + "truncated": 0, + "non-truncated": 40168, + "padded": 40113, + "non-padded": 55, + "effective_few_shots": 10.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hashes": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "c54ff61ad0273dd7", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-anatomy|5": { + "hashes": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "be31a1e22aef5f90", + "hash_cont_tokens": "f11971a765cb609f" + }, + "truncated": 0, + "non-truncated": 540, + "padded": 540, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-astronomy|5": { + "hashes": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": 
"faf4e80f65de93ca", + "hash_input_tokens": "277a7b1fad566940", + "hash_cont_tokens": "bf30e5d3f48250cb" + }, + "truncated": 0, + "non-truncated": 608, + "padded": 608, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-business_ethics|5": { + "hashes": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "ba552605bc116de5", + "hash_cont_tokens": "bc1dd9b2d995eb61" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hashes": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "428c7563d0b98ab9", + "hash_cont_tokens": "890a119624b3b935" + }, + "truncated": 0, + "non-truncated": 1060, + "padded": 1060, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_biology|5": { + "hashes": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "da036601573942e2", + "hash_cont_tokens": "875cde3af7a0ee14" + }, + "truncated": 0, + "non-truncated": 576, + "padded": 576, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_chemistry|5": { + "hashes": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "94e0196d6aded13d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_computer_science|5": { + "hashes": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "6e4d0f4a8d36690b", + "hash_cont_tokens": "ffc0fe414cdc4a83" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_mathematics|5": { + "hashes": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "614054d17109a25d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_medicine|5": { + "hashes": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "1d633b3cc0524ba8", + "hash_cont_tokens": "1f88b00d41957d82" + }, + "truncated": 0, + "non-truncated": 692, + "padded": 692, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_physics|5": { + "hashes": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "5421d9a1af86cbd4", + "hash_cont_tokens": "f7b8097afc16a47c" + }, + "truncated": 0, + "non-truncated": 408, + "padded": 408, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-computer_security|5": { + "hashes": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "5e6b70ecb333cf18", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 
400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hashes": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "c2ef11a87264ceed", + "hash_cont_tokens": "aa0e8bc655f2f641" + }, + "truncated": 0, + "non-truncated": 940, + "padded": 940, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-econometrics|5": { + "hashes": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "ecaccd912a4c3978", + "hash_cont_tokens": "bfb7e3c3c88313f1" + }, + "truncated": 0, + "non-truncated": 456, + "padded": 456, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hashes": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "1590c84291399be8", + "hash_cont_tokens": "2425a3f084a591ef" + }, + "truncated": 0, + "non-truncated": 580, + "padded": 580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hashes": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "3269597f715b0da1", + "hash_cont_tokens": "f52691aef15a407b" + }, + "truncated": 0, + "non-truncated": 1512, + "padded": 1512, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-formal_logic|5": { + "hashes": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "a2800d20f3ab8d7c", + "hash_cont_tokens": "f515d598d9c21263" + }, + "truncated": 0, + "non-truncated": 504, + "padded": 504, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-global_facts|5": { + "hashes": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "94ed44b3772505ad", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_biology|5": { + "hashes": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "24423acb928db768", + "hash_cont_tokens": "bd85a4156a3613ee" + }, + "truncated": 0, + "non-truncated": 1240, + "padded": 1240, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hashes": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "831ff35c474e5cef", + "hash_cont_tokens": "a95c97af1c14e068" + }, + "truncated": 0, + "non-truncated": 812, + "padded": 812, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hashes": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "8c34e0f2bda77358", + "hash_cont_tokens": "8abfedef914e33c9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + 
"harness|hendrycksTest-high_school_european_history|5": { + "hashes": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "f1f73dd687da18d7", + "hash_cont_tokens": "674fc454bdc5ac93" + }, + "truncated": 660, + "non-truncated": 0, + "padded": 0, + "non-padded": 660, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_geography|5": { + "hashes": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "7c5547c7da5bc793", + "hash_cont_tokens": "03a5012b916274ea" + }, + "truncated": 0, + "non-truncated": 792, + "padded": 792, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hashes": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "f62991cb6a496b05", + "hash_cont_tokens": "a83effb8f76b7d7c" + }, + "truncated": 0, + "non-truncated": 772, + "padded": 772, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hashes": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "4cef2aff6e3d59ed", + "hash_cont_tokens": "c583432ad27fcfe0" + }, + "truncated": 0, + "non-truncated": 1560, + "padded": 1560, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hashes": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "6e2577ea4082ed2b", + "hash_cont_tokens": "24f5dc613660300b" + }, + "truncated": 0, + "non-truncated": 1080, + "padded": 1080, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hashes": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "c5fc9aeb1079c8e4", + "hash_cont_tokens": "f47f041de50333b9" + }, + "truncated": 0, + "non-truncated": 952, + "padded": 952, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_physics|5": { + "hashes": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "555fc385cffa84ca", + "hash_cont_tokens": "ba2efcd283e938cc" + }, + "truncated": 0, + "non-truncated": 604, + "padded": 604, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hashes": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "febd23cbf9973b7f", + "hash_cont_tokens": "942069cd363844d9" + }, + "truncated": 0, + "non-truncated": 2180, + "padded": 2180, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hashes": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "424b02981230ee83", + "hash_cont_tokens": "955ed42b6f7fa019" + }, + "truncated": 0, + "non-truncated": 864, + "padded": 864, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hashes": { + 
"hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "50c9ff438c85a69e", + "hash_cont_tokens": "cdd0b3dc06d933e5" + }, + "truncated": 816, + "non-truncated": 0, + "padded": 0, + "non-padded": 816, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hashes": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "054824cc474caef5", + "hash_cont_tokens": "9a864184946033ac" + }, + "truncated": 8, + "non-truncated": 940, + "padded": 940, + "non-padded": 8, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_aging|5": { + "hashes": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "541a75f071dcf579", + "hash_cont_tokens": "142a4a8a1138a214" + }, + "truncated": 0, + "non-truncated": 892, + "padded": 892, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_sexuality|5": { + "hashes": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "04269e5c5a257dd9", + "hash_cont_tokens": "bc54813e809b796d" + }, + "truncated": 0, + "non-truncated": 524, + "padded": 524, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-international_law|5": { + "hashes": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "d93ba9d9d38e4397", + "hash_cont_tokens": "dc45b45fcda18e5d" + }, + "truncated": 0, + "non-truncated": 484, + "padded": 484, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-jurisprudence|5": { + "hashes": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "9eeaccd2698b4f5a", + "hash_cont_tokens": "e3a8cd951b6e3469" + }, + "truncated": 0, + "non-truncated": 432, + "padded": 432, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hashes": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "b4f08f544f2b7576", + "hash_cont_tokens": "1e80dbd30f6453d5" + }, + "truncated": 0, + "non-truncated": 652, + "padded": 648, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-machine_learning|5": { + "hashes": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "900c2a51f1174b9f", + "hash_cont_tokens": "9b37da7777378ca9" + }, + "truncated": 0, + "non-truncated": 448, + "padded": 448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-management|5": { + "hashes": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "6b36efb4689c6eca", + "hash_cont_tokens": "a01d6d39a83c4597" + }, + "truncated": 0, + "non-truncated": 412, + "padded": 412, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-marketing|5": { + "hashes": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "2aaac78a0cfed47a", + "hash_cont_tokens": "6aeaed4d823c98aa" + }, + 
"truncated": 0, + "non-truncated": 936, + "padded": 936, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-medical_genetics|5": { + "hashes": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "886ca823b41c094a", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-miscellaneous|5": { + "hashes": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "72fd71de7675e7d0", + "hash_cont_tokens": "9b0ab02a64603081" + }, + "truncated": 0, + "non-truncated": 3132, + "padded": 3132, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_disputes|5": { + "hashes": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "f3ca0dd8e7a1eb09", + "hash_cont_tokens": "8badf768f7b0467a" + }, + "truncated": 0, + "non-truncated": 1384, + "padded": 1354, + "non-padded": 30, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hashes": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "3e793631e951f23c", + "hash_cont_tokens": "32ae620376b2bbba" + }, + "truncated": 0, + "non-truncated": 3580, + "padded": 3580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-nutrition|5": { + "hashes": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "59753c2144ea93af", + "hash_cont_tokens": "3071def75bacc404" + }, + "truncated": 0, + "non-truncated": 1224, + "padded": 1224, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-philosophy|5": { + "hashes": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "bd8d3dbed15a8c34", + "hash_cont_tokens": "9f6ff69d23a48783" + }, + "truncated": 0, + "non-truncated": 1244, + "padded": 1244, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-prehistory|5": { + "hashes": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "3573cd87facbb7c5", + "hash_cont_tokens": "de469d2b981e32a3" + }, + "truncated": 0, + "non-truncated": 1296, + "padded": 1296, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_accounting|5": { + "hashes": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "17e721bc1a7cbb47", + "hash_cont_tokens": "c46f74d2dfc7b13b" + }, + "truncated": 0, + "non-truncated": 1128, + "padded": 1128, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_law|5": { + "hashes": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "9178e10bd0763ec4", + "hash_cont_tokens": "2e590029ef41fbcd" + }, + "truncated": 604, + "non-truncated": 5532, + "padded": 5524, + "non-padded": 612, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + 
"harness|hendrycksTest-professional_medicine|5": { + "hashes": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "f5a22012a54f70ea", + "hash_cont_tokens": "fe35cfa9c6ca802e" + }, + "truncated": 0, + "non-truncated": 1088, + "padded": 1088, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_psychology|5": { + "hashes": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "0dfb73a8eb3f692c", + "hash_cont_tokens": "f020fbddf72c8652" + }, + "truncated": 0, + "non-truncated": 2448, + "padded": 2448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-public_relations|5": { + "hashes": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "1710c6ba4c9f3cbd", + "hash_cont_tokens": "568f585a259965c1" + }, + "truncated": 0, + "non-truncated": 440, + "padded": 440, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-security_studies|5": { + "hashes": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "d49711415961ced7", + "hash_cont_tokens": "cc6fd7cccd64cd5d" + }, + "truncated": 0, + "non-truncated": 980, + "padded": 980, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-sociology|5": { + "hashes": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "828999f7624cbe7e", + "hash_cont_tokens": "c3a3bdfd177eed5b" + }, + "truncated": 0, + "non-truncated": 804, + "padded": 804, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hashes": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "42054621e718dbee", + "hash_cont_tokens": "2568d0e8e36fa959" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-virology|5": { + "hashes": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "6c4f0aa4dc859c04", + "hash_cont_tokens": "926cf60b0891f374" + }, + "truncated": 0, + "non-truncated": 664, + "padded": 664, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-world_religions|5": { + "hashes": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "6c75d44e092ff24f", + "hash_cont_tokens": "c525a5de974c1ea3" + }, + "truncated": 0, + "non-truncated": 684, + "padded": 684, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|truthfulqa:mc|0": { + "hashes": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "2738d7ed7075faa7", + "hash_cont_tokens": "c014154380b74b9e" + }, + "truncated": 0, + "non-truncated": 9996, + "padded": 9996, + "non-padded": 0, + "effective_few_shots": 0.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "d84d18e9a963753d", + "hash_full_prompts": "12b540783521a8e6", + "hash_input_tokens": "6fecf578c508db6a", + 
"hash_cont_tokens": "fb1646e2bdd5fc38" + }, + "total_evaluation_time_secondes": "4748.589247465134", + "truncated": 2088, + "non-truncated": 108931, + "padded": 108834, + "non-padded": 2185, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/TheBloke/openchat_v2_openorca_preview-GPTQ/results_2023-11-05T09-50-36.220410.json b/eval-results/TheBloke/openchat_v2_openorca_preview-GPTQ/results_2023-11-05T09-50-36.220410.json new file mode 100644 index 0000000000000000000000000000000000000000..c85a554c482ad2a65f6aacc95ac8006db1609217 --- /dev/null +++ b/eval-results/TheBloke/openchat_v2_openorca_preview-GPTQ/results_2023-11-05T09-50-36.220410.json @@ -0,0 +1,107 @@ +{ + "config_general": { + "lighteval_sha": "167773f1d5d1647c60dadc31c9e731ab7dbcbbad", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "", + "model_name": "TheBloke/openchat_v2_openorca_preview-GPTQ", + "model_sha": "5a4c2ea612b71d7c00118f796db7189bc1a0c930", + "model_dtype": "torch.float16", + "model_size": "6.8 GB" + }, + "results": { + "harness|drop|3": { + "em": 0.0028313758389261743, + "em_stderr": 0.0005441551135493808, + "f1": 0.07965813758389279, + "f1_stderr": 0.0016966560385267672 + }, + "harness|gsm8k|5": { + "acc": 0.1326762699014405, + "acc_stderr": 0.009343929131442216 + }, + "harness|winogrande|5": { + "acc": 0.7063930544593529, + "acc_stderr": 0.012799397296204164 + }, + "all": { + "em": 0.0028313758389261743, + "em_stderr": 0.0005441551135493808, + "f1": 0.07965813758389279, + "f1_stderr": 0.0016966560385267672, + "acc": 0.4195346621803967, + "acc_stderr": 0.01107166321382319 + } + }, + "versions": { + "all": 0, + "harness|drop|3": 1, + "harness|gsm8k|5": 0, + "harness|winogrande|5": 0 + }, + "config_tasks": { + "harness|drop": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|drop|3": { + "hashes": { + "hash_examples": "1d27416e8324e9a3", + "hash_full_prompts": "a5513ff9a741b385", + "hash_input_tokens": "61b608e0b5ceed76", + "hash_cont_tokens": "6b39700a568f2ddc" + }, + "truncated": 1263, + "non_truncated": 8273, + "padded": 0, + "non_padded": 9536, + "effective_few_shots": 3.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "bda342e47b5099b2", + "hash_cont_tokens": "14d7f3ce63ef064d" + }, + "truncated": 0, + "non_truncated": 1319, + "padded": 0, + "non_padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "c0bedf98cb040854", + "hash_cont_tokens": "f08975ad6f2d5864" + }, + "truncated": 0, + "non_truncated": 1267, + "padded": 2432, + "non_padded": 102, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "9b4d8993161e637d", + "hash_full_prompts": "08215e527b7e60a5", + "hash_input_tokens": "80afe720f936f8d2", + "hash_cont_tokens": "0561ad4fa658fd2d" + }, + "truncated": 1263, + "non_truncated": 10859, + "padded": 2432, + "non_padded": 10957, + "num_truncated_few_shots": 0, + "total_evaluation_time_secondes": 0 + } +} \ No newline at end of file diff --git a/eval-results/TheBloke/openchat_v2_openorca_preview-GPTQ/results_2023-11-07T20-49-23.758831.json 
b/eval-results/TheBloke/openchat_v2_openorca_preview-GPTQ/results_2023-11-07T20-49-23.758831.json new file mode 100644 index 0000000000000000000000000000000000000000..ee73f057abbf3bab320ec71b598c12797222d39f --- /dev/null +++ b/eval-results/TheBloke/openchat_v2_openorca_preview-GPTQ/results_2023-11-07T20-49-23.758831.json @@ -0,0 +1,107 @@ +{ + "config_general": { + "lighteval_sha": "167773f1d5d1647c60dadc31c9e731ab7dbcbbad", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "", + "model_name": "TheBloke/openchat_v2_openorca_preview-GPTQ", + "model_sha": "5a4c2ea612b71d7c00118f796db7189bc1a0c930", + "model_dtype": "torch.float16", + "model_size": "6.8 GB" + }, + "results": { + "harness|drop|3": { + "em": 0.0028313758389261743, + "em_stderr": 0.0005441551135493808, + "f1": 0.07958787751677875, + "f1_stderr": 0.00169607187925535 + }, + "harness|gsm8k|5": { + "acc": 0.1326762699014405, + "acc_stderr": 0.009343929131442216 + }, + "harness|winogrande|5": { + "acc": 0.7063930544593529, + "acc_stderr": 0.012799397296204164 + }, + "all": { + "em": 0.0028313758389261743, + "em_stderr": 0.0005441551135493808, + "f1": 0.07958787751677875, + "f1_stderr": 0.00169607187925535, + "acc": 0.4195346621803967, + "acc_stderr": 0.01107166321382319 + } + }, + "versions": { + "all": 0, + "harness|drop|3": 1, + "harness|gsm8k|5": 0, + "harness|winogrande|5": 0 + }, + "config_tasks": { + "harness|drop": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|drop|3": { + "hashes": { + "hash_examples": "1d27416e8324e9a3", + "hash_full_prompts": "a5513ff9a741b385", + "hash_input_tokens": "61b608e0b5ceed76", + "hash_cont_tokens": "f71865138b585238" + }, + "truncated": 1263, + "non_truncated": 8273, + "padded": 0, + "non_padded": 9536, + "effective_few_shots": 3.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "bda342e47b5099b2", + "hash_cont_tokens": "69589abd9e2a0f83" + }, + "truncated": 0, + "non_truncated": 1319, + "padded": 0, + "non_padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "c0bedf98cb040854", + "hash_cont_tokens": "f08975ad6f2d5864" + }, + "truncated": 0, + "non_truncated": 1267, + "padded": 2432, + "non_padded": 102, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "9b4d8993161e637d", + "hash_full_prompts": "08215e527b7e60a5", + "hash_input_tokens": "80afe720f936f8d2", + "hash_cont_tokens": "80f592aba9f1f2f5" + }, + "truncated": 1263, + "non_truncated": 10859, + "padded": 2432, + "non_padded": 10957, + "num_truncated_few_shots": 0, + "total_evaluation_time_secondes": 0 + } +} \ No newline at end of file diff --git a/eval-results/TheBloke/orca_mini_13B-GPTQ/results_2023-08-21T16-54-09.124965.json b/eval-results/TheBloke/orca_mini_13B-GPTQ/results_2023-08-21T16-54-09.124965.json new file mode 100644 index 0000000000000000000000000000000000000000..50ccd5813f0431d3fa3349ef629cfb29cb4b0869 --- /dev/null +++ b/eval-results/TheBloke/orca_mini_13B-GPTQ/results_2023-08-21T16-54-09.124965.json @@ -0,0 +1,1365 @@ +{ + "results": { + "harness|arc:challenge|25": { + "acc": 
0.23976109215017063, + "acc_stderr": 0.012476304127453954, + "acc_norm": 0.27303754266211605, + "acc_norm_stderr": 0.01301933276263573 + }, + "harness|hellaswag|10": { + "acc": 0.2561242780322645, + "acc_stderr": 0.004355992090030988, + "acc_norm": 0.25851424019119695, + "acc_norm_stderr": 0.004369232540125872 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.19, + "acc_stderr": 0.039427724440366234, + "acc_norm": 0.19, + "acc_norm_stderr": 0.039427724440366234 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.2, + "acc_stderr": 0.034554737023254366, + "acc_norm": 0.2, + "acc_norm_stderr": 0.034554737023254366 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.2236842105263158, + "acc_stderr": 0.033911609343436025, + "acc_norm": 0.2236842105263158, + "acc_norm_stderr": 0.033911609343436025 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.28, + "acc_stderr": 0.045126085985421276, + "acc_norm": 0.28, + "acc_norm_stderr": 0.045126085985421276 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.2981132075471698, + "acc_stderr": 0.028152837942493875, + "acc_norm": 0.2981132075471698, + "acc_norm_stderr": 0.028152837942493875 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.2986111111111111, + "acc_stderr": 0.03827052357950756, + "acc_norm": 0.2986111111111111, + "acc_norm_stderr": 0.03827052357950756 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.22, + "acc_stderr": 0.041633319989322695, + "acc_norm": 0.22, + "acc_norm_stderr": 0.041633319989322695 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.28, + "acc_stderr": 0.04512608598542127, + "acc_norm": 0.28, + "acc_norm_stderr": 0.04512608598542127 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.28, + "acc_stderr": 0.04512608598542127, + "acc_norm": 0.28, + "acc_norm_stderr": 0.04512608598542127 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.31213872832369943, + "acc_stderr": 0.035331333893236574, + "acc_norm": 0.31213872832369943, + "acc_norm_stderr": 0.035331333893236574 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.3627450980392157, + "acc_stderr": 0.04784060704105654, + "acc_norm": 0.3627450980392157, + "acc_norm_stderr": 0.04784060704105654 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.2, + "acc_stderr": 0.04020151261036845, + "acc_norm": 0.2, + "acc_norm_stderr": 0.04020151261036845 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.24680851063829787, + "acc_stderr": 0.028185441301234102, + "acc_norm": 0.24680851063829787, + "acc_norm_stderr": 0.028185441301234102 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.2719298245614035, + "acc_stderr": 0.04185774424022056, + "acc_norm": 0.2719298245614035, + "acc_norm_stderr": 0.04185774424022056 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.27586206896551724, + "acc_stderr": 0.037245636197746325, + "acc_norm": 0.27586206896551724, + "acc_norm_stderr": 0.037245636197746325 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.2328042328042328, + "acc_stderr": 0.02176596167215453, + "acc_norm": 0.2328042328042328, + "acc_norm_stderr": 0.02176596167215453 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.2698412698412698, + "acc_stderr": 0.03970158273235173, + "acc_norm": 0.2698412698412698, + "acc_norm_stderr": 0.03970158273235173 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.2, + "acc_stderr": 0.04020151261036847, + "acc_norm": 0.2, 
+ "acc_norm_stderr": 0.04020151261036847 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.26129032258064516, + "acc_stderr": 0.02499305339776482, + "acc_norm": 0.26129032258064516, + "acc_norm_stderr": 0.02499305339776482 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.27586206896551724, + "acc_stderr": 0.03144712581678242, + "acc_norm": 0.27586206896551724, + "acc_norm_stderr": 0.03144712581678242 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.21, + "acc_stderr": 0.040936018074033256, + "acc_norm": 0.21, + "acc_norm_stderr": 0.040936018074033256 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.296969696969697, + "acc_stderr": 0.0356796977226805, + "acc_norm": 0.296969696969697, + "acc_norm_stderr": 0.0356796977226805 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.26262626262626265, + "acc_stderr": 0.03135305009533086, + "acc_norm": 0.26262626262626265, + "acc_norm_stderr": 0.03135305009533086 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.26424870466321243, + "acc_stderr": 0.03182155050916648, + "acc_norm": 0.26424870466321243, + "acc_norm_stderr": 0.03182155050916648 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.24615384615384617, + "acc_stderr": 0.02184086699042308, + "acc_norm": 0.24615384615384617, + "acc_norm_stderr": 0.02184086699042308 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.23703703703703705, + "acc_stderr": 0.02592887613276611, + "acc_norm": 0.23703703703703705, + "acc_norm_stderr": 0.02592887613276611 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.3025210084033613, + "acc_stderr": 0.029837962388291936, + "acc_norm": 0.3025210084033613, + "acc_norm_stderr": 0.029837962388291936 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.17880794701986755, + "acc_stderr": 0.03128744850600724, + "acc_norm": 0.17880794701986755, + "acc_norm_stderr": 0.03128744850600724 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.24954128440366974, + "acc_stderr": 0.018553897629501617, + "acc_norm": 0.24954128440366974, + "acc_norm_stderr": 0.018553897629501617 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.3055555555555556, + "acc_stderr": 0.03141554629402544, + "acc_norm": 0.3055555555555556, + "acc_norm_stderr": 0.03141554629402544 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.22058823529411764, + "acc_stderr": 0.02910225438967408, + "acc_norm": 0.22058823529411764, + "acc_norm_stderr": 0.02910225438967408 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.22784810126582278, + "acc_stderr": 0.027303484599069422, + "acc_norm": 0.22784810126582278, + "acc_norm_stderr": 0.027303484599069422 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.22869955156950672, + "acc_stderr": 0.028188240046929196, + "acc_norm": 0.22869955156950672, + "acc_norm_stderr": 0.028188240046929196 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.29770992366412213, + "acc_stderr": 0.040103589424622034, + "acc_norm": 0.29770992366412213, + "acc_norm_stderr": 0.040103589424622034 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.19834710743801653, + "acc_stderr": 0.036401182719909456, + "acc_norm": 0.19834710743801653, + "acc_norm_stderr": 0.036401182719909456 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.24074074074074073, + "acc_stderr": 
0.04133119440243839, + "acc_norm": 0.24074074074074073, + "acc_norm_stderr": 0.04133119440243839 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.22699386503067484, + "acc_stderr": 0.032910995786157665, + "acc_norm": 0.22699386503067484, + "acc_norm_stderr": 0.032910995786157665 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.26785714285714285, + "acc_stderr": 0.04203277291467762, + "acc_norm": 0.26785714285714285, + "acc_norm_stderr": 0.04203277291467762 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.3106796116504854, + "acc_stderr": 0.045821241601615506, + "acc_norm": 0.3106796116504854, + "acc_norm_stderr": 0.045821241601615506 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.31196581196581197, + "acc_stderr": 0.030351527323344944, + "acc_norm": 0.31196581196581197, + "acc_norm_stderr": 0.030351527323344944 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.29, + "acc_stderr": 0.045604802157206845, + "acc_norm": 0.29, + "acc_norm_stderr": 0.045604802157206845 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.26436781609195403, + "acc_stderr": 0.015769984840690515, + "acc_norm": 0.26436781609195403, + "acc_norm_stderr": 0.015769984840690515 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.22832369942196531, + "acc_stderr": 0.022598703804321628, + "acc_norm": 0.22832369942196531, + "acc_norm_stderr": 0.022598703804321628 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.24134078212290502, + "acc_stderr": 0.014310999547961452, + "acc_norm": 0.24134078212290502, + "acc_norm_stderr": 0.014310999547961452 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.24836601307189543, + "acc_stderr": 0.024739981355113592, + "acc_norm": 0.24836601307189543, + "acc_norm_stderr": 0.024739981355113592 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.2090032154340836, + "acc_stderr": 0.02309314039837422, + "acc_norm": 0.2090032154340836, + "acc_norm_stderr": 0.02309314039837422 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.23148148148148148, + "acc_stderr": 0.023468429832451152, + "acc_norm": 0.23148148148148148, + "acc_norm_stderr": 0.023468429832451152 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.23049645390070922, + "acc_stderr": 0.025123739226872402, + "acc_norm": 0.23049645390070922, + "acc_norm_stderr": 0.025123739226872402 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.2542372881355932, + "acc_stderr": 0.011121129007840689, + "acc_norm": 0.2542372881355932, + "acc_norm_stderr": 0.011121129007840689 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.2977941176470588, + "acc_stderr": 0.02777829870154544, + "acc_norm": 0.2977941176470588, + "acc_norm_stderr": 0.02777829870154544 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.22712418300653595, + "acc_stderr": 0.016949853279212376, + "acc_norm": 0.22712418300653595, + "acc_norm_stderr": 0.016949853279212376 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.2, + "acc_stderr": 0.03831305140884603, + "acc_norm": 0.2, + "acc_norm_stderr": 0.03831305140884603 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.22040816326530613, + "acc_stderr": 0.026537045312145287, + "acc_norm": 0.22040816326530613, + "acc_norm_stderr": 0.026537045312145287 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.22885572139303484, + "acc_stderr": 0.029705284056772436, + "acc_norm": 0.22885572139303484, + "acc_norm_stderr": 
0.029705284056772436 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.3, + "acc_stderr": 0.046056618647183814, + "acc_norm": 0.3, + "acc_norm_stderr": 0.046056618647183814 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.2289156626506024, + "acc_stderr": 0.03270745277352477, + "acc_norm": 0.2289156626506024, + "acc_norm_stderr": 0.03270745277352477 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.2631578947368421, + "acc_stderr": 0.03377310252209195, + "acc_norm": 0.2631578947368421, + "acc_norm_stderr": 0.03377310252209195 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.2252141982864137, + "mc1_stderr": 0.014623240768023493, + "mc2": 0.48061175625899105, + "mc2_stderr": 0.01683098292520878 + }, + "all": { + "acc": 0.25295491441958895, + "acc_stderr": 0.031640437837766676, + "acc_norm": 0.2535594298885869, + "acc_norm_stderr": 0.03164986612734764, + "mc1": 0.2252141982864137, + "mc1_stderr": 0.014623240768023493, + "mc2": 0.48061175625899105, + "mc2_stderr": 0.01683098292520878 + } + }, + "versions": { + "harness|arc:challenge|25": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + 
"harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "all": 0 + }, + "config_general": { + "model_name": "TheBloke/orca_mini_13B-GPTQ", + "model_sha": "8ec18e5c597da86fa123c08b6e6bef7da6ec7440", + "model_dtype": "torch.float16", + "lighteval_sha": "9a6ba3212080b87510982c30fdec55b87dcab0c7", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null + }, + "config_tasks": { + "harness|arc:challenge": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + 
"harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task" + }, + "summary_tasks": { + "harness|arc:challenge|25": { + "hashes": { + "hash_examples": "17b0cae357c0259e", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "f67e97951c3d15a2", + "hash_cont_tokens": "2d6aa4faff4a6929" + }, + "truncated": 0, + "non-truncated": 4687, + "padded": 4679, + "non-padded": 8, + "effective_few_shots": 25.0, + "num_truncated_few_shots": 0 + }, + "harness|hellaswag|10": { + "hashes": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "6399312a03cd68b3", + "hash_cont_tokens": "7e249f66cf3b6917" + }, + "truncated": 0, + "non-truncated": 40168, + "padded": 40023, + "non-padded": 145, + "effective_few_shots": 10.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hashes": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "3349874343aa44c2", + "hash_cont_tokens": "adad8c87d9018d3a" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-anatomy|5": { + "hashes": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "5976a633406d18cc", + "hash_cont_tokens": "b408913f391dc598" + }, + "truncated": 0, + "non-truncated": 540, + "padded": 540, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-astronomy|5": { + "hashes": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "189f9fa34a3f30f5", + "hash_cont_tokens": "835883b48e70ba57" + }, + "truncated": 0, + "non-truncated": 608, + "padded": 608, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-business_ethics|5": { + "hashes": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "6c02d263b6cc2129", + "hash_cont_tokens": "5094a3a595eede08" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hashes": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "171b0dbd52999169", + "hash_cont_tokens": 
"96c880c9478a4037" + }, + "truncated": 0, + "non-truncated": 1060, + "padded": 1060, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_biology|5": { + "hashes": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "95b7cc8e9e1a4118", + "hash_cont_tokens": "b3a20826b3f8c5d4" + }, + "truncated": 0, + "non-truncated": 576, + "padded": 576, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_chemistry|5": { + "hashes": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "e4efed9eb2e784ae", + "hash_cont_tokens": "f057dbded53380e3" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_computer_science|5": { + "hashes": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "11adac28742fb23f", + "hash_cont_tokens": "d19f6b748cdbad5b" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_mathematics|5": { + "hashes": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "9f4f2d20cecda785", + "hash_cont_tokens": "e56bfd4bbfc1ef36" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_medicine|5": { + "hashes": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "c2a90547be159e8c", + "hash_cont_tokens": "a70c5f4d9a2c1827" + }, + "truncated": 0, + "non-truncated": 692, + "padded": 692, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_physics|5": { + "hashes": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "03e88c07e22149f4", + "hash_cont_tokens": "0ba3d5d2be74a648" + }, + "truncated": 0, + "non-truncated": 408, + "padded": 408, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-computer_security|5": { + "hashes": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "83d192794a8fe4bc", + "hash_cont_tokens": "adad8c87d9018d3a" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hashes": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "345a1856603df514", + "hash_cont_tokens": "4dc3a1c45702aea2" + }, + "truncated": 0, + "non-truncated": 940, + "padded": 940, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-econometrics|5": { + "hashes": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "a4a5f8f113baad49", + "hash_cont_tokens": "0f0b927db9d3942a" + }, + "truncated": 0, + "non-truncated": 456, + "padded": 456, + "non-padded": 0, + "effective_few_shots": 5.0, + 
"num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hashes": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "7d20980d5510ca34", + "hash_cont_tokens": "1dc6f9f294f4f994" + }, + "truncated": 0, + "non-truncated": 580, + "padded": 576, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hashes": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "778dca97005a88b2", + "hash_cont_tokens": "3dc5225712642c2f" + }, + "truncated": 0, + "non-truncated": 1512, + "padded": 1512, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-formal_logic|5": { + "hashes": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "000abf5d6f1490ad", + "hash_cont_tokens": "9f965ac6be1cdde5" + }, + "truncated": 0, + "non-truncated": 504, + "padded": 504, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-global_facts|5": { + "hashes": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "d324e1e6bd69855f", + "hash_cont_tokens": "adad8c87d9018d3a" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_biology|5": { + "hashes": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "cb39d48169201064", + "hash_cont_tokens": "4d4d502b30f05cea" + }, + "truncated": 0, + "non-truncated": 1240, + "padded": 1240, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hashes": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "74ee107afdcc44a4", + "hash_cont_tokens": "2e4918d0f54f7676" + }, + "truncated": 0, + "non-truncated": 812, + "padded": 812, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hashes": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "3d224c5cedd88b9e", + "hash_cont_tokens": "eb9f83faa5dda212" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hashes": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "8f329952733ee805", + "hash_cont_tokens": "ff5ae57ff23b53d1" + }, + "truncated": 660, + "non-truncated": 0, + "padded": 0, + "non-padded": 660, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_geography|5": { + "hashes": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "9808574566aa2b50", + "hash_cont_tokens": "db85309de1591035" + }, + "truncated": 0, + "non-truncated": 792, + "padded": 792, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hashes": { + 
"hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "d05461b8235146a3", + "hash_cont_tokens": "3e0d38987de1e280" + }, + "truncated": 0, + "non-truncated": 772, + "padded": 772, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hashes": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "5650dd590734c60c", + "hash_cont_tokens": "6132e48ff0edea66" + }, + "truncated": 0, + "non-truncated": 1560, + "padded": 1560, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hashes": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "f9e51c2f1a990818", + "hash_cont_tokens": "941e1571780b4f99" + }, + "truncated": 0, + "non-truncated": 1080, + "padded": 1080, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hashes": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "5f4a849a5f048639", + "hash_cont_tokens": "adaff269482fbdf1" + }, + "truncated": 0, + "non-truncated": 952, + "padded": 952, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_physics|5": { + "hashes": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "74d333ba94074881", + "hash_cont_tokens": "10b0d8b339ed816d" + }, + "truncated": 0, + "non-truncated": 604, + "padded": 604, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hashes": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "e78378867618b875", + "hash_cont_tokens": "78d01d57974daadb" + }, + "truncated": 0, + "non-truncated": 2180, + "padded": 2180, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hashes": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "15540fffd9eee834", + "hash_cont_tokens": "c7d55c803ccf3281" + }, + "truncated": 0, + "non-truncated": 864, + "padded": 864, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hashes": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "c5307fb57068ab45", + "hash_cont_tokens": "7d705edd113a3d4d" + }, + "truncated": 816, + "non-truncated": 0, + "padded": 0, + "non-padded": 816, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hashes": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "c9c6b94623d30863", + "hash_cont_tokens": "8b1babefc36685c4" + }, + "truncated": 0, + "non-truncated": 948, + "padded": 948, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_aging|5": { + "hashes": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": 
"e59842f6cfec0152", + "hash_cont_tokens": "b196c68db4825727" + }, + "truncated": 0, + "non-truncated": 892, + "padded": 892, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_sexuality|5": { + "hashes": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "8545dd29ac846f1a", + "hash_cont_tokens": "ffc3b70128684ad0" + }, + "truncated": 0, + "non-truncated": 524, + "padded": 524, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-international_law|5": { + "hashes": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "4bbb7ca5ffd567b5", + "hash_cont_tokens": "a4bc4e51b98c6bfb" + }, + "truncated": 0, + "non-truncated": 484, + "padded": 484, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-jurisprudence|5": { + "hashes": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "65000072a32406f9", + "hash_cont_tokens": "ea7ff206c4da6f57" + }, + "truncated": 0, + "non-truncated": 432, + "padded": 432, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hashes": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "7fd6460eeb00056e", + "hash_cont_tokens": "e457ae5e94e0bccc" + }, + "truncated": 0, + "non-truncated": 652, + "padded": 652, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-machine_learning|5": { + "hashes": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "d1aa501a0e064fb1", + "hash_cont_tokens": "ccb1bcc13368aac8" + }, + "truncated": 0, + "non-truncated": 448, + "padded": 444, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-management|5": { + "hashes": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "145ae760c6adcdf7", + "hash_cont_tokens": "c93d7596aa2246ea" + }, + "truncated": 0, + "non-truncated": 412, + "padded": 412, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-marketing|5": { + "hashes": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "5fef1de288784fcd", + "hash_cont_tokens": "af4b0ee8ee2bb07f" + }, + "truncated": 0, + "non-truncated": 936, + "padded": 936, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-medical_genetics|5": { + "hashes": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "07aa6ef91a779814", + "hash_cont_tokens": "adad8c87d9018d3a" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-miscellaneous|5": { + "hashes": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "4381bd08515b7ec3", + "hash_cont_tokens": "5b068e21debc566e" + }, + "truncated": 0, + "non-truncated": 3132, + "padded": 3132, + "non-padded": 0, + "effective_few_shots": 5.0, + 
"num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_disputes|5": { + "hashes": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "c84eefddf6c4e857", + "hash_cont_tokens": "88f4d84033888e35" + }, + "truncated": 0, + "non-truncated": 1384, + "padded": 1369, + "non-padded": 15, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hashes": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "e517ebc9537a4fda", + "hash_cont_tokens": "dc85635d6d9e8615" + }, + "truncated": 0, + "non-truncated": 3580, + "padded": 3580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-nutrition|5": { + "hashes": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "94b48a7e0f456d9a", + "hash_cont_tokens": "f4bee5edc9711a7c" + }, + "truncated": 0, + "non-truncated": 1224, + "padded": 1224, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-philosophy|5": { + "hashes": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "e6592c8c9a7eece5", + "hash_cont_tokens": "faaa18e05a96eb91" + }, + "truncated": 0, + "non-truncated": 1244, + "padded": 1244, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-prehistory|5": { + "hashes": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "fca6ee5e34632392", + "hash_cont_tokens": "258de2e25b517c62" + }, + "truncated": 0, + "non-truncated": 1296, + "padded": 1296, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_accounting|5": { + "hashes": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "73bcbb38b80a5e92", + "hash_cont_tokens": "ae7e03e070aecc31" + }, + "truncated": 0, + "non-truncated": 1128, + "padded": 1125, + "non-padded": 3, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_law|5": { + "hashes": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "907b05062632ecac", + "hash_cont_tokens": "5c9515fd601cb0d7" + }, + "truncated": 152, + "non-truncated": 5984, + "padded": 5984, + "non-padded": 152, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_medicine|5": { + "hashes": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "62615d792681bd06", + "hash_cont_tokens": "f0a7e2ab8764a525" + }, + "truncated": 0, + "non-truncated": 1088, + "padded": 1088, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_psychology|5": { + "hashes": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "8af2adac0f1b82b7", + "hash_cont_tokens": "c77557f3e3645c61" + }, + "truncated": 0, + "non-truncated": 2448, + "padded": 2448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-public_relations|5": { + "hashes": { + "hash_examples": "0d25072e1761652a", + 
"hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "c1004ab861d1fab6", + "hash_cont_tokens": "c54f38d507746b57" + }, + "truncated": 0, + "non-truncated": 440, + "padded": 440, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-security_studies|5": { + "hashes": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "5afa4fb3b299242b", + "hash_cont_tokens": "6165a23e658b6aab" + }, + "truncated": 0, + "non-truncated": 980, + "padded": 980, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-sociology|5": { + "hashes": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "c9c9c2d95d080a2d", + "hash_cont_tokens": "dce62751a5803c9d" + }, + "truncated": 0, + "non-truncated": 804, + "padded": 804, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hashes": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "7b39d57cf50a9e0c", + "hash_cont_tokens": "aa21f27d8c55d48c" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-virology|5": { + "hashes": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "9450d47158e3266c", + "hash_cont_tokens": "37efad130a2850f9" + }, + "truncated": 0, + "non-truncated": 664, + "padded": 664, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-world_religions|5": { + "hashes": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "d9dc5a21cea74601", + "hash_cont_tokens": "f8476c0c6f07dff2" + }, + "truncated": 0, + "non-truncated": 684, + "padded": 684, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|truthfulqa:mc|0": { + "hashes": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "11318d42aa148b00", + "hash_cont_tokens": "41a137d0d70d9dbb" + }, + "truncated": 0, + "non-truncated": 9996, + "padded": 9996, + "non-padded": 0, + "effective_few_shots": 0.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "d84d18e9a963753d", + "hash_full_prompts": "12b540783521a8e6", + "hash_input_tokens": "1e5c044975ae648c", + "hash_cont_tokens": "15f036dacc38826e" + }, + "total_evaluation_time_secondes": "4487.615822315216", + "truncated": 1628, + "non-truncated": 109391, + "padded": 109212, + "non-padded": 1807, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/TheBloke/orca_mini_13B-GPTQ/results_2023-11-05T13-43-32.201116.json b/eval-results/TheBloke/orca_mini_13B-GPTQ/results_2023-11-05T13-43-32.201116.json new file mode 100644 index 0000000000000000000000000000000000000000..db37e8fb27bf247d5e29281194b28e36949292da --- /dev/null +++ b/eval-results/TheBloke/orca_mini_13B-GPTQ/results_2023-11-05T13-43-32.201116.json @@ -0,0 +1,107 @@ +{ + "config_general": { + "lighteval_sha": "167773f1d5d1647c60dadc31c9e731ab7dbcbbad", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "", + 
"model_name": "TheBloke/orca_mini_13B-GPTQ", + "model_sha": "8ec18e5c597da86fa123c08b6e6bef7da6ec7440", + "model_dtype": "torch.float16", + "model_size": "6.8 GB" + }, + "results": { + "harness|drop|3": { + "em": 0.04047818791946309, + "em_stderr": 0.002018262301743542, + "f1": 0.11767512583892628, + "f1_stderr": 0.0025441207373296006 + }, + "harness|gsm8k|5": { + "acc": 0.000758150113722517, + "acc_stderr": 0.0007581501137225239 + }, + "harness|winogrande|5": { + "acc": 0.6377269139700079, + "acc_stderr": 0.01350885547625251 + }, + "all": { + "em": 0.04047818791946309, + "em_stderr": 0.002018262301743542, + "f1": 0.11767512583892628, + "f1_stderr": 0.0025441207373296006, + "acc": 0.3192425320418652, + "acc_stderr": 0.007133502794987517 + } + }, + "versions": { + "all": 0, + "harness|drop|3": 1, + "harness|gsm8k|5": 0, + "harness|winogrande|5": 0 + }, + "config_tasks": { + "harness|drop": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|drop|3": { + "hashes": { + "hash_examples": "1d27416e8324e9a3", + "hash_full_prompts": "a5513ff9a741b385", + "hash_input_tokens": "b378143524018ba9", + "hash_cont_tokens": "87301fa983ccd2fb" + }, + "truncated": 905, + "non_truncated": 8631, + "padded": 0, + "non_padded": 9536, + "effective_few_shots": 3.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "d9e7a50c5ecff9b6", + "hash_cont_tokens": "105906b8c99621c5" + }, + "truncated": 0, + "non_truncated": 1319, + "padded": 0, + "non_padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "1af498b9535af31d", + "hash_cont_tokens": "6e9d2660a6f59318" + }, + "truncated": 0, + "non_truncated": 1267, + "padded": 2413, + "non_padded": 121, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "9b4d8993161e637d", + "hash_full_prompts": "08215e527b7e60a5", + "hash_input_tokens": "a4313842ba3114db", + "hash_cont_tokens": "17620cc80e7c3a7f" + }, + "truncated": 905, + "non_truncated": 11217, + "padded": 2413, + "non_padded": 10976, + "num_truncated_few_shots": 0, + "total_evaluation_time_secondes": 0 + } +} \ No newline at end of file diff --git a/eval-results/TheBloke/orca_mini_13B-GPTQ/results_2023-11-07T10-33-18.298818.json b/eval-results/TheBloke/orca_mini_13B-GPTQ/results_2023-11-07T10-33-18.298818.json new file mode 100644 index 0000000000000000000000000000000000000000..3f1e1251a11a2ac1bee1dbb4087996b7b6ca759f --- /dev/null +++ b/eval-results/TheBloke/orca_mini_13B-GPTQ/results_2023-11-07T10-33-18.298818.json @@ -0,0 +1,107 @@ +{ + "config_general": { + "lighteval_sha": "167773f1d5d1647c60dadc31c9e731ab7dbcbbad", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "", + "model_name": "TheBloke/orca_mini_13B-GPTQ", + "model_sha": "8ec18e5c597da86fa123c08b6e6bef7da6ec7440", + "model_dtype": "torch.float16", + "model_size": "6.8 GB" + }, + "results": { + "harness|drop|3": { + "em": 0.04047818791946309, + "em_stderr": 0.002018262301743542, + "f1": 0.11770658557046992, + "f1_stderr": 0.002544480345951201 + }, + "harness|gsm8k|5": { + "acc": 0.000758150113722517, + "acc_stderr": 0.0007581501137225239 + 
}, + "harness|winogrande|5": { + "acc": 0.6377269139700079, + "acc_stderr": 0.01350885547625251 + }, + "all": { + "em": 0.04047818791946309, + "em_stderr": 0.002018262301743542, + "f1": 0.11770658557046992, + "f1_stderr": 0.002544480345951201, + "acc": 0.3192425320418652, + "acc_stderr": 0.007133502794987517 + } + }, + "versions": { + "all": 0, + "harness|drop|3": 1, + "harness|gsm8k|5": 0, + "harness|winogrande|5": 0 + }, + "config_tasks": { + "harness|drop": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|drop|3": { + "hashes": { + "hash_examples": "1d27416e8324e9a3", + "hash_full_prompts": "a5513ff9a741b385", + "hash_input_tokens": "b378143524018ba9", + "hash_cont_tokens": "227209b4dea08b33" + }, + "truncated": 905, + "non_truncated": 8631, + "padded": 0, + "non_padded": 9536, + "effective_few_shots": 3.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "d9e7a50c5ecff9b6", + "hash_cont_tokens": "33b9baf0e83b478d" + }, + "truncated": 0, + "non_truncated": 1319, + "padded": 0, + "non_padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "1af498b9535af31d", + "hash_cont_tokens": "6e9d2660a6f59318" + }, + "truncated": 0, + "non_truncated": 1267, + "padded": 2413, + "non_padded": 121, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "9b4d8993161e637d", + "hash_full_prompts": "08215e527b7e60a5", + "hash_input_tokens": "a4313842ba3114db", + "hash_cont_tokens": "5e222849c4e7095f" + }, + "truncated": 905, + "non_truncated": 11217, + "padded": 2413, + "non_padded": 10976, + "num_truncated_few_shots": 0, + "total_evaluation_time_secondes": 0 + } +} \ No newline at end of file diff --git a/eval-results/TheBloke/orca_mini_v3_13B-GPTQ/results_2023-12-04T12-38-59.699618.json b/eval-results/TheBloke/orca_mini_v3_13B-GPTQ/results_2023-12-04T12-38-59.699618.json new file mode 100644 index 0000000000000000000000000000000000000000..1b6697e83fe77a41a78f396e5d1d05c143b1468f --- /dev/null +++ b/eval-results/TheBloke/orca_mini_v3_13B-GPTQ/results_2023-12-04T12-38-59.699618.json @@ -0,0 +1,1409 @@ +{ + "config_general": { + "lighteval_sha": "0e4607eff593f6f842aeaa0e5fa6760f58b9d1e9", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "", + "start_time": 136021.844256558, + "end_time": 148410.741376248, + "total_evaluation_time_secondes": "12388.897119690024", + "model_name": "TheBloke/orca_mini_v3_13B-GPTQ", + "model_sha": "7b7a2dcd946f393e26215268c4c7e0699be2bbd8", + "model_dtype": "None", + "model_size": "6.84 GB" + }, + "results": { + "harness|arc:challenge|25": { + "acc": 0.5930034129692833, + "acc_stderr": 0.01435639941800912, + "acc_norm": 0.6194539249146758, + "acc_norm_stderr": 0.014188277712349814 + }, + "harness|hellaswag|10": { + "acc": 0.617805218084047, + "acc_stderr": 0.004849306998727771, + "acc_norm": 0.81557458673571, + "acc_norm_stderr": 0.003870381199967957 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.3, + "acc_stderr": 0.04605661864718381, + "acc_norm": 0.3, + "acc_norm_stderr": 0.04605661864718381 + }, + "harness|hendrycksTest-anatomy|5": { + 
"acc": 0.4740740740740741, + "acc_stderr": 0.04313531696750574, + "acc_norm": 0.4740740740740741, + "acc_norm_stderr": 0.04313531696750574 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.5, + "acc_stderr": 0.04068942293855797, + "acc_norm": 0.5, + "acc_norm_stderr": 0.04068942293855797 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.61, + "acc_stderr": 0.04902071300001975, + "acc_norm": 0.61, + "acc_norm_stderr": 0.04902071300001975 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.6188679245283019, + "acc_stderr": 0.029890609686286637, + "acc_norm": 0.6188679245283019, + "acc_norm_stderr": 0.029890609686286637 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.5902777777777778, + "acc_stderr": 0.041124909746707884, + "acc_norm": 0.5902777777777778, + "acc_norm_stderr": 0.041124909746707884 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.38, + "acc_stderr": 0.04878317312145633, + "acc_norm": 0.38, + "acc_norm_stderr": 0.04878317312145633 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.44, + "acc_stderr": 0.049888765156985884, + "acc_norm": 0.44, + "acc_norm_stderr": 0.049888765156985884 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.34, + "acc_stderr": 0.04760952285695235, + "acc_norm": 0.34, + "acc_norm_stderr": 0.04760952285695235 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.4913294797687861, + "acc_stderr": 0.03811890988940412, + "acc_norm": 0.4913294797687861, + "acc_norm_stderr": 0.03811890988940412 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.29411764705882354, + "acc_stderr": 0.04533838195929776, + "acc_norm": 0.29411764705882354, + "acc_norm_stderr": 0.04533838195929776 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.74, + "acc_stderr": 0.0440844002276808, + "acc_norm": 0.74, + "acc_norm_stderr": 0.0440844002276808 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.4808510638297872, + "acc_stderr": 0.03266204299064678, + "acc_norm": 0.4808510638297872, + "acc_norm_stderr": 0.03266204299064678 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.2894736842105263, + "acc_stderr": 0.04266339443159394, + "acc_norm": 0.2894736842105263, + "acc_norm_stderr": 0.04266339443159394 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.5103448275862069, + "acc_stderr": 0.04165774775728763, + "acc_norm": 0.5103448275862069, + "acc_norm_stderr": 0.04165774775728763 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.30952380952380953, + "acc_stderr": 0.023809523809523846, + "acc_norm": 0.30952380952380953, + "acc_norm_stderr": 0.023809523809523846 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.3333333333333333, + "acc_stderr": 0.04216370213557836, + "acc_norm": 0.3333333333333333, + "acc_norm_stderr": 0.04216370213557836 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.39, + "acc_stderr": 0.04902071300001974, + "acc_norm": 0.39, + "acc_norm_stderr": 0.04902071300001974 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.6387096774193548, + "acc_stderr": 0.02732754844795754, + "acc_norm": 0.6387096774193548, + "acc_norm_stderr": 0.02732754844795754 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.43842364532019706, + "acc_stderr": 0.03491207857486519, + "acc_norm": 0.43842364532019706, + "acc_norm_stderr": 0.03491207857486519 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.6, + "acc_stderr": 
0.049236596391733084, + "acc_norm": 0.6, + "acc_norm_stderr": 0.049236596391733084 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.6848484848484848, + "acc_stderr": 0.0362773057502241, + "acc_norm": 0.6848484848484848, + "acc_norm_stderr": 0.0362773057502241 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.7373737373737373, + "acc_stderr": 0.03135305009533086, + "acc_norm": 0.7373737373737373, + "acc_norm_stderr": 0.03135305009533086 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.8082901554404145, + "acc_stderr": 0.02840895362624527, + "acc_norm": 0.8082901554404145, + "acc_norm_stderr": 0.02840895362624527 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.541025641025641, + "acc_stderr": 0.025265525491284295, + "acc_norm": 0.541025641025641, + "acc_norm_stderr": 0.025265525491284295 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.3148148148148148, + "acc_stderr": 0.02831753349606647, + "acc_norm": 0.3148148148148148, + "acc_norm_stderr": 0.02831753349606647 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.5840336134453782, + "acc_stderr": 0.032016501007396114, + "acc_norm": 0.5840336134453782, + "acc_norm_stderr": 0.032016501007396114 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.2847682119205298, + "acc_stderr": 0.03684881521389024, + "acc_norm": 0.2847682119205298, + "acc_norm_stderr": 0.03684881521389024 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.7541284403669725, + "acc_stderr": 0.018461940968708436, + "acc_norm": 0.7541284403669725, + "acc_norm_stderr": 0.018461940968708436 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.4861111111111111, + "acc_stderr": 0.03408655867977749, + "acc_norm": 0.4861111111111111, + "acc_norm_stderr": 0.03408655867977749 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.7352941176470589, + "acc_stderr": 0.030964517926923403, + "acc_norm": 0.7352941176470589, + "acc_norm_stderr": 0.030964517926923403 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.7637130801687764, + "acc_stderr": 0.027652153144159256, + "acc_norm": 0.7637130801687764, + "acc_norm_stderr": 0.027652153144159256 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.6278026905829597, + "acc_stderr": 0.03244305283008731, + "acc_norm": 0.6278026905829597, + "acc_norm_stderr": 0.03244305283008731 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.6564885496183206, + "acc_stderr": 0.041649760719448786, + "acc_norm": 0.6564885496183206, + "acc_norm_stderr": 0.041649760719448786 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.743801652892562, + "acc_stderr": 0.03984979653302872, + "acc_norm": 0.743801652892562, + "acc_norm_stderr": 0.03984979653302872 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.75, + "acc_stderr": 0.04186091791394607, + "acc_norm": 0.75, + "acc_norm_stderr": 0.04186091791394607 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.6748466257668712, + "acc_stderr": 0.036803503712864616, + "acc_norm": 0.6748466257668712, + "acc_norm_stderr": 0.036803503712864616 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.3482142857142857, + "acc_stderr": 0.04521829902833585, + "acc_norm": 0.3482142857142857, + "acc_norm_stderr": 0.04521829902833585 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.7669902912621359, + "acc_stderr": 
0.04185832598928315, + "acc_norm": 0.7669902912621359, + "acc_norm_stderr": 0.04185832598928315 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.7777777777777778, + "acc_stderr": 0.0272360139461967, + "acc_norm": 0.7777777777777778, + "acc_norm_stderr": 0.0272360139461967 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.61, + "acc_stderr": 0.04902071300001975, + "acc_norm": 0.61, + "acc_norm_stderr": 0.04902071300001975 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.756066411238825, + "acc_stderr": 0.015357212665829461, + "acc_norm": 0.756066411238825, + "acc_norm_stderr": 0.015357212665829461 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.6473988439306358, + "acc_stderr": 0.025722802200895803, + "acc_norm": 0.6473988439306358, + "acc_norm_stderr": 0.025722802200895803 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.3787709497206704, + "acc_stderr": 0.016223533510365113, + "acc_norm": 0.3787709497206704, + "acc_norm_stderr": 0.016223533510365113 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.6078431372549019, + "acc_stderr": 0.027956046165424516, + "acc_norm": 0.6078431372549019, + "acc_norm_stderr": 0.027956046165424516 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.6655948553054662, + "acc_stderr": 0.026795422327893934, + "acc_norm": 0.6655948553054662, + "acc_norm_stderr": 0.026795422327893934 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.6327160493827161, + "acc_stderr": 0.0268228017595079, + "acc_norm": 0.6327160493827161, + "acc_norm_stderr": 0.0268228017595079 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.40425531914893614, + "acc_stderr": 0.029275532159704725, + "acc_norm": 0.40425531914893614, + "acc_norm_stderr": 0.029275532159704725 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.4198174706649283, + "acc_stderr": 0.012604960816087377, + "acc_norm": 0.4198174706649283, + "acc_norm_stderr": 0.012604960816087377 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.5147058823529411, + "acc_stderr": 0.03035969707904612, + "acc_norm": 0.5147058823529411, + "acc_norm_stderr": 0.03035969707904612 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.5669934640522876, + "acc_stderr": 0.020045442473324227, + "acc_norm": 0.5669934640522876, + "acc_norm_stderr": 0.020045442473324227 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.6, + "acc_stderr": 0.0469237132203465, + "acc_norm": 0.6, + "acc_norm_stderr": 0.0469237132203465 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.6530612244897959, + "acc_stderr": 0.030472526026726496, + "acc_norm": 0.6530612244897959, + "acc_norm_stderr": 0.030472526026726496 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.6616915422885572, + "acc_stderr": 0.033455630703391935, + "acc_norm": 0.6616915422885572, + "acc_norm_stderr": 0.033455630703391935 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.82, + "acc_stderr": 0.038612291966536934, + "acc_norm": 0.82, + "acc_norm_stderr": 0.038612291966536934 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.4578313253012048, + "acc_stderr": 0.038786267710023595, + "acc_norm": 0.4578313253012048, + "acc_norm_stderr": 0.038786267710023595 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.7719298245614035, + "acc_stderr": 0.03218093795602357, + "acc_norm": 0.7719298245614035, + "acc_norm_stderr": 0.03218093795602357 + }, + "harness|truthfulqa:mc|0": { + "mc1": 
0.3561811505507956, + "mc1_stderr": 0.016763790728446335, + "mc2": 0.4922092515317753, + "mc2_stderr": 0.015510989644544924 + }, + "harness|winogrande|5": { + "acc": 0.7576953433307024, + "acc_stderr": 0.012042352526174789 + }, + "harness|gsm8k|5": { + "acc": 0.29492039423805916, + "acc_stderr": 0.01256069801095475 + }, + "all": { + "acc": 0.5613401785987572, + "acc_stderr": 0.033576900106646816, + "acc_norm": 0.5663280514839403, + "acc_norm_stderr": 0.0342786577705747, + "mc1": 0.3561811505507956, + "mc1_stderr": 0.016763790728446335, + "mc2": 0.4922092515317753, + "mc2_stderr": 0.015510989644544924 + } + }, + "versions": { + "all": 0, + "harness|arc:challenge|25": 0, + "harness|gsm8k|5": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + 
"harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "harness|winogrande|5": 0 + }, + "config_tasks": { + "harness|arc:challenge": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", 
+ "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|arc:challenge|25": { + "hashes": { + "hash_examples": "17b0cae357c0259e", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "ca48d52265c0051f", + "hash_cont_tokens": "e8abf848493b50f7" + }, + "truncated": 0, + "non_truncated": 1172, + "padded": 4687, + "non_padded": 0, + "effective_few_shots": 25.0, + "num_truncated_few_shots": 0 + }, + "harness|hellaswag|10": { + "hashes": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "4975ded0ed31f702", + "hash_cont_tokens": "9fe0a5c42e1532db" + }, + "truncated": 0, + "non_truncated": 10042, + "padded": 40019, + "non_padded": 149, + "effective_few_shots": 10.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hashes": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "8ff523ec326d5d55", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-anatomy|5": { + "hashes": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "742bd6a389a8ef40", + "hash_cont_tokens": "f11971a765cb609f" + }, + "truncated": 0, + "non_truncated": 135, + "padded": 540, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-astronomy|5": { + "hashes": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "aa9743839c83bd9f", + "hash_cont_tokens": "440a970fadecdc7b" + }, + "truncated": 0, + "non_truncated": 152, + "padded": 608, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-business_ethics|5": { + "hashes": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "60f6ed52e2a2987a", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hashes": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "6080d9f3c5930be0", + "hash_cont_tokens": "7ecd60c25b9bfe5b" + }, + "truncated": 0, + "non_truncated": 265, + "padded": 1060, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_biology|5": { + "hashes": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "873319724ad65589", + "hash_cont_tokens": "875cde3af7a0ee14" + }, + "truncated": 0, + "non_truncated": 144, + "padded": 564, + "non_padded": 12, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_chemistry|5": { + "hashes": { + "hash_examples": "22ff85f1d34f42d1", 
+ "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "8366d04d12b154a7", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_computer_science|5": { + "hashes": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "1724a282fb269fd7", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_mathematics|5": { + "hashes": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "b7aa815781eae172", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_medicine|5": { + "hashes": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "0003d13e86bc8c1a", + "hash_cont_tokens": "702fb6d82ff0d6ac" + }, + "truncated": 0, + "non_truncated": 173, + "padded": 692, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_physics|5": { + "hashes": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "32b28762dd077c78", + "hash_cont_tokens": "f7b8097afc16a47c" + }, + "truncated": 0, + "non_truncated": 102, + "padded": 404, + "non_padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-computer_security|5": { + "hashes": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "19dd0e1895125d49", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hashes": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "761c7ce187b3338a", + "hash_cont_tokens": "aa0e8bc655f2f641" + }, + "truncated": 0, + "non_truncated": 235, + "padded": 940, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-econometrics|5": { + "hashes": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "dae74024ebc12b2b", + "hash_cont_tokens": "b1cc6e7e9fcd3827" + }, + "truncated": 0, + "non_truncated": 114, + "padded": 456, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hashes": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "5fa8050688a246ed", + "hash_cont_tokens": "2425a3f084a591ef" + }, + "truncated": 0, + "non_truncated": 145, + "padded": 580, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hashes": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "2da3f8d7d1515cc6", + "hash_cont_tokens": "bd87bf0c060fd925" + }, + 
"truncated": 0, + "non_truncated": 378, + "padded": 1512, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-formal_logic|5": { + "hashes": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "907de61bbe46dada", + "hash_cont_tokens": "eb8932890e0605db" + }, + "truncated": 0, + "non_truncated": 126, + "padded": 504, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-global_facts|5": { + "hashes": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "d7549fe9ac133643", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_biology|5": { + "hashes": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "b449ae8cd622fb96", + "hash_cont_tokens": "1ddcb86d28cde266" + }, + "truncated": 0, + "non_truncated": 310, + "padded": 1240, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hashes": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "a447bd1574b5e26c", + "hash_cont_tokens": "176c8dcff38c5f8f" + }, + "truncated": 0, + "non_truncated": 203, + "padded": 812, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hashes": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "56312a0c3d85ae90", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hashes": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "5002f4ac8b1562ca", + "hash_cont_tokens": "674fc454bdc5ac93" + }, + "truncated": 0, + "non_truncated": 165, + "padded": 656, + "non_padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_geography|5": { + "hashes": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "b4f9efd054b0149d", + "hash_cont_tokens": "03a5012b916274ea" + }, + "truncated": 0, + "non_truncated": 198, + "padded": 792, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hashes": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "6e010d01707b5a01", + "hash_cont_tokens": "873d2aab226ba1d8" + }, + "truncated": 0, + "non_truncated": 193, + "padded": 772, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hashes": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "fc1f6e824ba386d7", + "hash_cont_tokens": "c583432ad27fcfe0" + }, + "truncated": 0, + "non_truncated": 390, + "padded": 1560, + "non_padded": 0, + "effective_few_shots": 
5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hashes": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "3a485a40c8432ece", + "hash_cont_tokens": "d7907b61bcb8c123" + }, + "truncated": 0, + "non_truncated": 270, + "padded": 1080, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hashes": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "a7dd9ca4bbda3752", + "hash_cont_tokens": "f47f041de50333b9" + }, + "truncated": 0, + "non_truncated": 238, + "padded": 952, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_physics|5": { + "hashes": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "d7ea631399a73865", + "hash_cont_tokens": "0d56317b3e5eedb5" + }, + "truncated": 0, + "non_truncated": 151, + "padded": 604, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hashes": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "d12816cf88146011", + "hash_cont_tokens": "09ba1243e7390c0f" + }, + "truncated": 0, + "non_truncated": 545, + "padded": 2180, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hashes": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "9763ecaef4814c21", + "hash_cont_tokens": "9cc29889c3d3f77d" + }, + "truncated": 0, + "non_truncated": 216, + "padded": 864, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hashes": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "c639cce12a46ebad", + "hash_cont_tokens": "cdd0b3dc06d933e5" + }, + "truncated": 0, + "non_truncated": 204, + "padded": 816, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hashes": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "b9762065cce6f3a6", + "hash_cont_tokens": "e02816433ff28daf" + }, + "truncated": 0, + "non_truncated": 237, + "padded": 948, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_aging|5": { + "hashes": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "84157fee0b6d0f3c", + "hash_cont_tokens": "142a4a8a1138a214" + }, + "truncated": 0, + "non_truncated": 223, + "padded": 892, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_sexuality|5": { + "hashes": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "ade303e1ae3c016f", + "hash_cont_tokens": "bc54813e809b796d" + }, + "truncated": 0, + "non_truncated": 131, + "padded": 524, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-international_law|5": { + "hashes": { + 
"hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "e5482e1c23c23d35", + "hash_cont_tokens": "8ea8c5ff76a15bca" + }, + "truncated": 0, + "non_truncated": 121, + "padded": 484, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-jurisprudence|5": { + "hashes": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "4415eeb9bad0507b", + "hash_cont_tokens": "e3a8cd951b6e3469" + }, + "truncated": 0, + "non_truncated": 108, + "padded": 432, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hashes": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "e6b5271422ecbaa8", + "hash_cont_tokens": "3e9e0bdc248fd88a" + }, + "truncated": 0, + "non_truncated": 163, + "padded": 644, + "non_padded": 8, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-machine_learning|5": { + "hashes": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "e719cb83196977d8", + "hash_cont_tokens": "55b12fb138c6a064" + }, + "truncated": 0, + "non_truncated": 112, + "padded": 448, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-management|5": { + "hashes": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "155da0e62b39e804", + "hash_cont_tokens": "a01d6d39a83c4597" + }, + "truncated": 0, + "non_truncated": 103, + "padded": 412, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-marketing|5": { + "hashes": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "38466c242259e6d3", + "hash_cont_tokens": "6aeaed4d823c98aa" + }, + "truncated": 0, + "non_truncated": 234, + "padded": 932, + "non_padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-medical_genetics|5": { + "hashes": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "0dd129e92538a7f6", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-miscellaneous|5": { + "hashes": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "d108a883fc3e022f", + "hash_cont_tokens": "9b0ab02a64603081" + }, + "truncated": 0, + "non_truncated": 783, + "padded": 3132, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_disputes|5": { + "hashes": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "0e7b7df82884a2d5", + "hash_cont_tokens": "3b8bbe9108e55ce9" + }, + "truncated": 0, + "non_truncated": 346, + "padded": 1364, + "non_padded": 20, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hashes": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "7c220f5613cd8426", + "hash_cont_tokens": "3e9bfc0362e97330" + }, + 
"truncated": 0, + "non_truncated": 895, + "padded": 3580, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-nutrition|5": { + "hashes": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "35de1609a9a763a9", + "hash_cont_tokens": "23b2dc6ee2da4cfc" + }, + "truncated": 0, + "non_truncated": 306, + "padded": 1224, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-philosophy|5": { + "hashes": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "a1dcfa9c80490d06", + "hash_cont_tokens": "9f6ff69d23a48783" + }, + "truncated": 0, + "non_truncated": 311, + "padded": 1244, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-prehistory|5": { + "hashes": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "a091cf645d2415e0", + "hash_cont_tokens": "d6458d743d875837" + }, + "truncated": 0, + "non_truncated": 324, + "padded": 1296, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_accounting|5": { + "hashes": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "e9df32a33f85290c", + "hash_cont_tokens": "922a195f53a35662" + }, + "truncated": 0, + "non_truncated": 282, + "padded": 1128, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_law|5": { + "hashes": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "c9f7583fff66d361", + "hash_cont_tokens": "2e590029ef41fbcd" + }, + "truncated": 0, + "non_truncated": 1534, + "padded": 6136, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_medicine|5": { + "hashes": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "40a933f829116f8d", + "hash_cont_tokens": "7cfee54dbddd5a98" + }, + "truncated": 0, + "non_truncated": 272, + "padded": 1088, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_psychology|5": { + "hashes": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "0f6a92c3a2062b48", + "hash_cont_tokens": "a86677b2a45c20e1" + }, + "truncated": 0, + "non_truncated": 612, + "padded": 2448, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-public_relations|5": { + "hashes": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "29a08e9bfbe9b2f0", + "hash_cont_tokens": "0d756ccaae031757" + }, + "truncated": 0, + "non_truncated": 110, + "padded": 440, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-security_studies|5": { + "hashes": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "32a03f1f22a6e103", + "hash_cont_tokens": "b2229bc2cfbf594b" + }, + "truncated": 0, + "non_truncated": 245, + "padded": 980, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + 
"harness|hendrycksTest-sociology|5": { + "hashes": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "1de5c52d2b2831d7", + "hash_cont_tokens": "c3a3bdfd177eed5b" + }, + "truncated": 0, + "non_truncated": 201, + "padded": 800, + "non_padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hashes": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "add924961f7f4146", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-virology|5": { + "hashes": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "e0653601c466b1bc", + "hash_cont_tokens": "af8b3658088cb37f" + }, + "truncated": 0, + "non_truncated": 166, + "padded": 664, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-world_religions|5": { + "hashes": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "ac600d612445156d", + "hash_cont_tokens": "060118bef6de4e0a" + }, + "truncated": 0, + "non_truncated": 171, + "padded": 684, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|truthfulqa:mc|0": { + "hashes": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "a03ce28b7fd06aa7", + "hash_cont_tokens": "f5da56a132aab151" + }, + "truncated": 0, + "non_truncated": 817, + "padded": 9996, + "non_padded": 0, + "effective_few_shots": 0.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "72067255e368e24e", + "hash_cont_tokens": "f08975ad6f2d5864" + }, + "truncated": 0, + "non_truncated": 1267, + "padded": 2534, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "bda342e47b5099b2", + "hash_cont_tokens": "f29249301f17971c" + }, + "truncated": 0, + "non_truncated": 1319, + "padded": 0, + "non_padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "3b7fa57a057f9415", + "hash_full_prompts": "63615fc50fc9417c", + "hash_input_tokens": "a8fa53915153e1db", + "hash_cont_tokens": "f1ab731982591f9c" + }, + "truncated": 0, + "non_truncated": 28659, + "padded": 113348, + "non_padded": 1524, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/TheBloke/orca_mini_v3_7B-GPTQ/results_2023-08-22T13-46-10.418493.json b/eval-results/TheBloke/orca_mini_v3_7B-GPTQ/results_2023-08-22T13-46-10.418493.json new file mode 100644 index 0000000000000000000000000000000000000000..b64f84c11ab3a1661864cc4a342e997737521069 --- /dev/null +++ b/eval-results/TheBloke/orca_mini_v3_7B-GPTQ/results_2023-08-22T13-46-10.418493.json @@ -0,0 +1,1365 @@ +{ + "results": { + "harness|arc:challenge|25": { + "acc": 0.2354948805460751, + "acc_stderr": 0.012399451855004748, + "acc_norm": 0.30119453924914674, + "acc_norm_stderr": 0.013406741767847617 + }, + 
"harness|hellaswag|10": { + "acc": 0.25492929695279826, + "acc_stderr": 0.0043493077027351645, + "acc_norm": 0.2599083847839076, + "acc_norm_stderr": 0.0043768776192341175 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.26, + "acc_stderr": 0.04408440022768081, + "acc_norm": 0.26, + "acc_norm_stderr": 0.04408440022768081 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.26666666666666666, + "acc_stderr": 0.038201699145179055, + "acc_norm": 0.26666666666666666, + "acc_norm_stderr": 0.038201699145179055 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.17105263157894737, + "acc_stderr": 0.030643607071677088, + "acc_norm": 0.17105263157894737, + "acc_norm_stderr": 0.030643607071677088 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.27, + "acc_stderr": 0.0446196043338474, + "acc_norm": 0.27, + "acc_norm_stderr": 0.0446196043338474 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.2679245283018868, + "acc_stderr": 0.027257260322494845, + "acc_norm": 0.2679245283018868, + "acc_norm_stderr": 0.027257260322494845 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.24305555555555555, + "acc_stderr": 0.035868792800803406, + "acc_norm": 0.24305555555555555, + "acc_norm_stderr": 0.035868792800803406 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.17, + "acc_stderr": 0.0377525168068637, + "acc_norm": 0.17, + "acc_norm_stderr": 0.0377525168068637 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.24, + "acc_stderr": 0.042923469599092816, + "acc_norm": 0.24, + "acc_norm_stderr": 0.042923469599092816 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.21, + "acc_stderr": 0.040936018074033256, + "acc_norm": 0.21, + "acc_norm_stderr": 0.040936018074033256 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.2138728323699422, + "acc_stderr": 0.031265112061730424, + "acc_norm": 0.2138728323699422, + "acc_norm_stderr": 0.031265112061730424 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.20588235294117646, + "acc_stderr": 0.04023382273617749, + "acc_norm": 0.20588235294117646, + "acc_norm_stderr": 0.04023382273617749 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.25, + "acc_stderr": 0.04351941398892446, + "acc_norm": 0.25, + "acc_norm_stderr": 0.04351941398892446 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.28085106382978725, + "acc_stderr": 0.02937917046412482, + "acc_norm": 0.28085106382978725, + "acc_norm_stderr": 0.02937917046412482 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.2982456140350877, + "acc_stderr": 0.04303684033537317, + "acc_norm": 0.2982456140350877, + "acc_norm_stderr": 0.04303684033537317 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.2206896551724138, + "acc_stderr": 0.03455930201924811, + "acc_norm": 0.2206896551724138, + "acc_norm_stderr": 0.03455930201924811 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.2619047619047619, + "acc_stderr": 0.022644212615525218, + "acc_norm": 0.2619047619047619, + "acc_norm_stderr": 0.022644212615525218 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.1984126984126984, + "acc_stderr": 0.03567016675276863, + "acc_norm": 0.1984126984126984, + "acc_norm_stderr": 0.03567016675276863 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.31, + "acc_stderr": 0.04648231987117316, + "acc_norm": 0.31, + "acc_norm_stderr": 0.04648231987117316 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 
0.24838709677419354, + "acc_stderr": 0.024580028921481003, + "acc_norm": 0.24838709677419354, + "acc_norm_stderr": 0.024580028921481003 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.28078817733990147, + "acc_stderr": 0.0316185633535861, + "acc_norm": 0.28078817733990147, + "acc_norm_stderr": 0.0316185633535861 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.24, + "acc_stderr": 0.04292346959909281, + "acc_norm": 0.24, + "acc_norm_stderr": 0.04292346959909281 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.2545454545454545, + "acc_stderr": 0.03401506715249039, + "acc_norm": 0.2545454545454545, + "acc_norm_stderr": 0.03401506715249039 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.18686868686868688, + "acc_stderr": 0.02777253333421898, + "acc_norm": 0.18686868686868688, + "acc_norm_stderr": 0.02777253333421898 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.24352331606217617, + "acc_stderr": 0.030975436386845436, + "acc_norm": 0.24352331606217617, + "acc_norm_stderr": 0.030975436386845436 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.23333333333333334, + "acc_stderr": 0.021444547301560486, + "acc_norm": 0.23333333333333334, + "acc_norm_stderr": 0.021444547301560486 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.27037037037037037, + "acc_stderr": 0.02708037281514568, + "acc_norm": 0.27037037037037037, + "acc_norm_stderr": 0.02708037281514568 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.23109243697478993, + "acc_stderr": 0.027381406927868963, + "acc_norm": 0.23109243697478993, + "acc_norm_stderr": 0.027381406927868963 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.23841059602649006, + "acc_stderr": 0.0347918557259966, + "acc_norm": 0.23841059602649006, + "acc_norm_stderr": 0.0347918557259966 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.25137614678899084, + "acc_stderr": 0.018599206360287415, + "acc_norm": 0.25137614678899084, + "acc_norm_stderr": 0.018599206360287415 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.14351851851851852, + "acc_stderr": 0.02391077925264438, + "acc_norm": 0.14351851851851852, + "acc_norm_stderr": 0.02391077925264438 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.29411764705882354, + "acc_stderr": 0.0319800166011507, + "acc_norm": 0.29411764705882354, + "acc_norm_stderr": 0.0319800166011507 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.2320675105485232, + "acc_stderr": 0.02747974455080851, + "acc_norm": 0.2320675105485232, + "acc_norm_stderr": 0.02747974455080851 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.28699551569506726, + "acc_stderr": 0.030360379710291947, + "acc_norm": 0.28699551569506726, + "acc_norm_stderr": 0.030360379710291947 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.20610687022900764, + "acc_stderr": 0.03547771004159464, + "acc_norm": 0.20610687022900764, + "acc_norm_stderr": 0.03547771004159464 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.3305785123966942, + "acc_stderr": 0.04294340845212094, + "acc_norm": 0.3305785123966942, + "acc_norm_stderr": 0.04294340845212094 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.26851851851851855, + "acc_stderr": 0.04284467968052192, + "acc_norm": 0.26851851851851855, + "acc_norm_stderr": 0.04284467968052192 + }, + 
"harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.22699386503067484, + "acc_stderr": 0.03291099578615769, + "acc_norm": 0.22699386503067484, + "acc_norm_stderr": 0.03291099578615769 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.2767857142857143, + "acc_stderr": 0.042466243366976256, + "acc_norm": 0.2767857142857143, + "acc_norm_stderr": 0.042466243366976256 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.14563106796116504, + "acc_stderr": 0.034926064766237906, + "acc_norm": 0.14563106796116504, + "acc_norm_stderr": 0.034926064766237906 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.24358974358974358, + "acc_stderr": 0.028120966503914414, + "acc_norm": 0.24358974358974358, + "acc_norm_stderr": 0.028120966503914414 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.27, + "acc_stderr": 0.0446196043338474, + "acc_norm": 0.27, + "acc_norm_stderr": 0.0446196043338474 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.24776500638569604, + "acc_stderr": 0.01543808308056897, + "acc_norm": 0.24776500638569604, + "acc_norm_stderr": 0.01543808308056897 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.22254335260115607, + "acc_stderr": 0.02239421566194282, + "acc_norm": 0.22254335260115607, + "acc_norm_stderr": 0.02239421566194282 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.24134078212290502, + "acc_stderr": 0.014310999547961447, + "acc_norm": 0.24134078212290502, + "acc_norm_stderr": 0.014310999547961447 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.21895424836601307, + "acc_stderr": 0.02367908986180772, + "acc_norm": 0.21895424836601307, + "acc_norm_stderr": 0.02367908986180772 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.2379421221864952, + "acc_stderr": 0.024185150647818707, + "acc_norm": 0.2379421221864952, + "acc_norm_stderr": 0.024185150647818707 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.24691358024691357, + "acc_stderr": 0.02399350170904211, + "acc_norm": 0.24691358024691357, + "acc_norm_stderr": 0.02399350170904211 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.24468085106382978, + "acc_stderr": 0.025645553622266736, + "acc_norm": 0.24468085106382978, + "acc_norm_stderr": 0.025645553622266736 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.25358539765319427, + "acc_stderr": 0.011111715336101129, + "acc_norm": 0.25358539765319427, + "acc_norm_stderr": 0.011111715336101129 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.15808823529411764, + "acc_stderr": 0.02216146260806851, + "acc_norm": 0.15808823529411764, + "acc_norm_stderr": 0.02216146260806851 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.26143790849673204, + "acc_stderr": 0.01777694715752803, + "acc_norm": 0.26143790849673204, + "acc_norm_stderr": 0.01777694715752803 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.33636363636363636, + "acc_stderr": 0.04525393596302505, + "acc_norm": 0.33636363636363636, + "acc_norm_stderr": 0.04525393596302505 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.17142857142857143, + "acc_stderr": 0.024127463462650153, + "acc_norm": 0.17142857142857143, + "acc_norm_stderr": 0.024127463462650153 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.23383084577114427, + "acc_stderr": 0.029929415408348398, + "acc_norm": 0.23383084577114427, + "acc_norm_stderr": 0.029929415408348398 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.24, + 
"acc_stderr": 0.04292346959909282, + "acc_norm": 0.24, + "acc_norm_stderr": 0.04292346959909282 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.3493975903614458, + "acc_stderr": 0.03711725190740749, + "acc_norm": 0.3493975903614458, + "acc_norm_stderr": 0.03711725190740749 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.25146198830409355, + "acc_stderr": 0.033275044238468436, + "acc_norm": 0.25146198830409355, + "acc_norm_stderr": 0.033275044238468436 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.2423500611995104, + "mc1_stderr": 0.015000674373570345, + "mc2": 0.48438228589364474, + "mc2_stderr": 0.01700627816360895 + }, + "all": { + "acc": 0.24319179243780645, + "acc_stderr": 0.031158862195277914, + "acc_norm": 0.244389737294318, + "acc_norm_stderr": 0.031176402192385405, + "mc1": 0.2423500611995104, + "mc1_stderr": 0.015000674373570345, + "mc2": 0.48438228589364474, + "mc2_stderr": 0.01700627816360895 + } + }, + "versions": { + "harness|arc:challenge|25": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 
1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "all": 0 + }, + "config_general": { + "model_name": "TheBloke/orca_mini_v3_7B-GPTQ", + "model_sha": "06ddd48cd904907e3c73d2dfe47d28626053598b", + "model_dtype": "8bit", + "lighteval_sha": "9a6ba3212080b87510982c30fdec55b87dcab0c7", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null + }, + "config_tasks": { + "harness|arc:challenge": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness 
task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task" + }, + "summary_tasks": { + "harness|arc:challenge|25": { + "hashes": { + "hash_examples": "17b0cae357c0259e", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "3722289b79076c44", + "hash_cont_tokens": "8210decc6ff6f7df" + }, + "truncated": 0, + "non-truncated": 4687, + "padded": 4687, + "non-padded": 0, + "effective_few_shots": 25.0, + "num_truncated_few_shots": 0 + }, + "harness|hellaswag|10": { + "hashes": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "ececd684171f1ef2", + "hash_cont_tokens": "b3b9e9017afa63af" + }, + "truncated": 0, + "non-truncated": 40168, + "padded": 40113, + "non-padded": 55, + "effective_few_shots": 10.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hashes": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "c54ff61ad0273dd7", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-anatomy|5": { + "hashes": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "be31a1e22aef5f90", + "hash_cont_tokens": "f11971a765cb609f" + }, + "truncated": 0, + "non-truncated": 540, + "padded": 540, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-astronomy|5": { + "hashes": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "277a7b1fad566940", + "hash_cont_tokens": "bf30e5d3f48250cb" + }, + "truncated": 0, + "non-truncated": 608, + "padded": 608, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-business_ethics|5": { + "hashes": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "ba552605bc116de5", + "hash_cont_tokens": "bc1dd9b2d995eb61" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hashes": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "428c7563d0b98ab9", + "hash_cont_tokens": "890a119624b3b935" + }, + "truncated": 0, + "non-truncated": 1060, + "padded": 1060, + "non-padded": 0, + 
"effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_biology|5": { + "hashes": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "da036601573942e2", + "hash_cont_tokens": "875cde3af7a0ee14" + }, + "truncated": 0, + "non-truncated": 576, + "padded": 576, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_chemistry|5": { + "hashes": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "94e0196d6aded13d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_computer_science|5": { + "hashes": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "6e4d0f4a8d36690b", + "hash_cont_tokens": "ffc0fe414cdc4a83" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_mathematics|5": { + "hashes": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "614054d17109a25d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_medicine|5": { + "hashes": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "081bb2b524defd1c", + "hash_cont_tokens": "1f88b00d41957d82" + }, + "truncated": 0, + "non-truncated": 692, + "padded": 692, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_physics|5": { + "hashes": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "5421d9a1af86cbd4", + "hash_cont_tokens": "f7b8097afc16a47c" + }, + "truncated": 0, + "non-truncated": 408, + "padded": 408, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-computer_security|5": { + "hashes": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "5e6b70ecb333cf18", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hashes": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "c2ef11a87264ceed", + "hash_cont_tokens": "aa0e8bc655f2f641" + }, + "truncated": 0, + "non-truncated": 940, + "padded": 940, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-econometrics|5": { + "hashes": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "ecaccd912a4c3978", + "hash_cont_tokens": "bfb7e3c3c88313f1" + }, + "truncated": 0, + "non-truncated": 456, + "padded": 456, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hashes": { + "hash_examples": 
"e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "1590c84291399be8", + "hash_cont_tokens": "2425a3f084a591ef" + }, + "truncated": 0, + "non-truncated": 580, + "padded": 580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hashes": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "3269597f715b0da1", + "hash_cont_tokens": "f52691aef15a407b" + }, + "truncated": 0, + "non-truncated": 1512, + "padded": 1512, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-formal_logic|5": { + "hashes": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "a2800d20f3ab8d7c", + "hash_cont_tokens": "f515d598d9c21263" + }, + "truncated": 0, + "non-truncated": 504, + "padded": 504, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-global_facts|5": { + "hashes": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "94ed44b3772505ad", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_biology|5": { + "hashes": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "24423acb928db768", + "hash_cont_tokens": "bd85a4156a3613ee" + }, + "truncated": 0, + "non-truncated": 1240, + "padded": 1240, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hashes": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "831ff35c474e5cef", + "hash_cont_tokens": "a95c97af1c14e068" + }, + "truncated": 0, + "non-truncated": 812, + "padded": 812, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hashes": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "a20a96b44dcc5b30", + "hash_cont_tokens": "8abfedef914e33c9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hashes": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "5002f4ac8b1562ca", + "hash_cont_tokens": "674fc454bdc5ac93" + }, + "truncated": 0, + "non-truncated": 660, + "padded": 656, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_geography|5": { + "hashes": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "7c5547c7da5bc793", + "hash_cont_tokens": "03a5012b916274ea" + }, + "truncated": 0, + "non-truncated": 792, + "padded": 792, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hashes": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": 
"f62991cb6a496b05", + "hash_cont_tokens": "a83effb8f76b7d7c" + }, + "truncated": 0, + "non-truncated": 772, + "padded": 772, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hashes": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "4cef2aff6e3d59ed", + "hash_cont_tokens": "c583432ad27fcfe0" + }, + "truncated": 0, + "non-truncated": 1560, + "padded": 1560, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hashes": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "6e2577ea4082ed2b", + "hash_cont_tokens": "24f5dc613660300b" + }, + "truncated": 0, + "non-truncated": 1080, + "padded": 1080, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hashes": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "c5fc9aeb1079c8e4", + "hash_cont_tokens": "f47f041de50333b9" + }, + "truncated": 0, + "non-truncated": 952, + "padded": 952, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_physics|5": { + "hashes": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "555fc385cffa84ca", + "hash_cont_tokens": "ba2efcd283e938cc" + }, + "truncated": 0, + "non-truncated": 604, + "padded": 604, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hashes": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "febd23cbf9973b7f", + "hash_cont_tokens": "942069cd363844d9" + }, + "truncated": 0, + "non-truncated": 2180, + "padded": 2180, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hashes": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "400e55b56ee6fbd7", + "hash_cont_tokens": "955ed42b6f7fa019" + }, + "truncated": 0, + "non-truncated": 864, + "padded": 864, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hashes": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "c639cce12a46ebad", + "hash_cont_tokens": "cdd0b3dc06d933e5" + }, + "truncated": 0, + "non-truncated": 816, + "padded": 816, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hashes": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "b9762065cce6f3a6", + "hash_cont_tokens": "9a864184946033ac" + }, + "truncated": 0, + "non-truncated": 948, + "padded": 948, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_aging|5": { + "hashes": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "541a75f071dcf579", + "hash_cont_tokens": "142a4a8a1138a214" + }, + "truncated": 0, + "non-truncated": 
892, + "padded": 892, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_sexuality|5": { + "hashes": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "04269e5c5a257dd9", + "hash_cont_tokens": "bc54813e809b796d" + }, + "truncated": 0, + "non-truncated": 524, + "padded": 524, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-international_law|5": { + "hashes": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "d93ba9d9d38e4397", + "hash_cont_tokens": "dc45b45fcda18e5d" + }, + "truncated": 0, + "non-truncated": 484, + "padded": 484, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-jurisprudence|5": { + "hashes": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "9eeaccd2698b4f5a", + "hash_cont_tokens": "e3a8cd951b6e3469" + }, + "truncated": 0, + "non-truncated": 432, + "padded": 432, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hashes": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "b4f08f544f2b7576", + "hash_cont_tokens": "1e80dbd30f6453d5" + }, + "truncated": 0, + "non-truncated": 652, + "padded": 648, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-machine_learning|5": { + "hashes": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "900c2a51f1174b9f", + "hash_cont_tokens": "9b37da7777378ca9" + }, + "truncated": 0, + "non-truncated": 448, + "padded": 448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-management|5": { + "hashes": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "6b36efb4689c6eca", + "hash_cont_tokens": "a01d6d39a83c4597" + }, + "truncated": 0, + "non-truncated": 412, + "padded": 412, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-marketing|5": { + "hashes": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "2aaac78a0cfed47a", + "hash_cont_tokens": "6aeaed4d823c98aa" + }, + "truncated": 0, + "non-truncated": 936, + "padded": 936, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-medical_genetics|5": { + "hashes": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "886ca823b41c094a", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-miscellaneous|5": { + "hashes": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "72fd71de7675e7d0", + "hash_cont_tokens": "9b0ab02a64603081" + }, + "truncated": 0, + "non-truncated": 3132, + "padded": 3132, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_disputes|5": { + "hashes": { + 
"hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "f3ca0dd8e7a1eb09", + "hash_cont_tokens": "8badf768f7b0467a" + }, + "truncated": 0, + "non-truncated": 1384, + "padded": 1354, + "non-padded": 30, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hashes": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "3e793631e951f23c", + "hash_cont_tokens": "32ae620376b2bbba" + }, + "truncated": 0, + "non-truncated": 3580, + "padded": 3580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-nutrition|5": { + "hashes": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "59753c2144ea93af", + "hash_cont_tokens": "3071def75bacc404" + }, + "truncated": 0, + "non-truncated": 1224, + "padded": 1224, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-philosophy|5": { + "hashes": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "bd8d3dbed15a8c34", + "hash_cont_tokens": "9f6ff69d23a48783" + }, + "truncated": 0, + "non-truncated": 1244, + "padded": 1244, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-prehistory|5": { + "hashes": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "3573cd87facbb7c5", + "hash_cont_tokens": "de469d2b981e32a3" + }, + "truncated": 0, + "non-truncated": 1296, + "padded": 1296, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_accounting|5": { + "hashes": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "17e721bc1a7cbb47", + "hash_cont_tokens": "c46f74d2dfc7b13b" + }, + "truncated": 0, + "non-truncated": 1128, + "padded": 1128, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_law|5": { + "hashes": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "c9f7583fff66d361", + "hash_cont_tokens": "2e590029ef41fbcd" + }, + "truncated": 0, + "non-truncated": 6136, + "padded": 6136, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_medicine|5": { + "hashes": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "40a933f829116f8d", + "hash_cont_tokens": "fe35cfa9c6ca802e" + }, + "truncated": 0, + "non-truncated": 1088, + "padded": 1088, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_psychology|5": { + "hashes": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "0dfb73a8eb3f692c", + "hash_cont_tokens": "f020fbddf72c8652" + }, + "truncated": 0, + "non-truncated": 2448, + "padded": 2448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-public_relations|5": { + "hashes": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "1710c6ba4c9f3cbd", + 
"hash_cont_tokens": "568f585a259965c1" + }, + "truncated": 0, + "non-truncated": 440, + "padded": 440, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-security_studies|5": { + "hashes": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "32a03f1f22a6e103", + "hash_cont_tokens": "cc6fd7cccd64cd5d" + }, + "truncated": 0, + "non-truncated": 980, + "padded": 980, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-sociology|5": { + "hashes": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "828999f7624cbe7e", + "hash_cont_tokens": "c3a3bdfd177eed5b" + }, + "truncated": 0, + "non-truncated": 804, + "padded": 804, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hashes": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "42054621e718dbee", + "hash_cont_tokens": "2568d0e8e36fa959" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-virology|5": { + "hashes": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "6c4f0aa4dc859c04", + "hash_cont_tokens": "926cf60b0891f374" + }, + "truncated": 0, + "non-truncated": 664, + "padded": 664, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-world_religions|5": { + "hashes": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "6c75d44e092ff24f", + "hash_cont_tokens": "c525a5de974c1ea3" + }, + "truncated": 0, + "non-truncated": 684, + "padded": 684, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|truthfulqa:mc|0": { + "hashes": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "2738d7ed7075faa7", + "hash_cont_tokens": "c014154380b74b9e" + }, + "truncated": 0, + "non-truncated": 9996, + "padded": 9996, + "non-padded": 0, + "effective_few_shots": 0.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "d84d18e9a963753d", + "hash_full_prompts": "12b540783521a8e6", + "hash_input_tokens": "5c73a7dce6ccf737", + "hash_cont_tokens": "fb1646e2bdd5fc38" + }, + "total_evaluation_time_secondes": "5361.1907641887665", + "truncated": 0, + "non-truncated": 111019, + "padded": 110926, + "non-padded": 93, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/TheBloke/orca_mini_v3_7B-GPTQ/results_2023-12-04T11-27-39.056243.json b/eval-results/TheBloke/orca_mini_v3_7B-GPTQ/results_2023-12-04T11-27-39.056243.json new file mode 100644 index 0000000000000000000000000000000000000000..d0106d4ee40342e7681c5cfef9ddecb5690a442c --- /dev/null +++ b/eval-results/TheBloke/orca_mini_v3_7B-GPTQ/results_2023-12-04T11-27-39.056243.json @@ -0,0 +1,1409 @@ +{ + "config_general": { + "lighteval_sha": "0e4607eff593f6f842aeaa0e5fa6760f58b9d1e9", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "", + "start_time": 136129.656646575, + "end_time": 144122.582858982, + 
"total_evaluation_time_secondes": "7992.926212407008", + "model_name": "TheBloke/orca_mini_v3_7B-GPTQ", + "model_sha": "4f06a6151128861d5bb256275620f7eadcab3238", + "model_dtype": "None", + "model_size": "3.69 GB" + }, + "results": { + "harness|arc:challenge|25": { + "acc": 0.4974402730375427, + "acc_stderr": 0.014611199329843788, + "acc_norm": 0.5452218430034129, + "acc_norm_stderr": 0.014551507060836355 + }, + "harness|hellaswag|10": { + "acc": 0.5929097789285003, + "acc_stderr": 0.0049028788067330365, + "acc_norm": 0.7853017327225652, + "acc_norm_stderr": 0.004097736838432052 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.31, + "acc_stderr": 0.046482319871173156, + "acc_norm": 0.31, + "acc_norm_stderr": 0.046482319871173156 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.5185185185185185, + "acc_stderr": 0.043163785995113245, + "acc_norm": 0.5185185185185185, + "acc_norm_stderr": 0.043163785995113245 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.5, + "acc_stderr": 0.04068942293855797, + "acc_norm": 0.5, + "acc_norm_stderr": 0.04068942293855797 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.51, + "acc_stderr": 0.05024183937956912, + "acc_norm": 0.51, + "acc_norm_stderr": 0.05024183937956912 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.5924528301886792, + "acc_stderr": 0.030242233800854494, + "acc_norm": 0.5924528301886792, + "acc_norm_stderr": 0.030242233800854494 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.5555555555555556, + "acc_stderr": 0.041553199555931467, + "acc_norm": 0.5555555555555556, + "acc_norm_stderr": 0.041553199555931467 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.39, + "acc_stderr": 0.04902071300001975, + "acc_norm": 0.39, + "acc_norm_stderr": 0.04902071300001975 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.44, + "acc_stderr": 0.049888765156985884, + "acc_norm": 0.44, + "acc_norm_stderr": 0.049888765156985884 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.33, + "acc_stderr": 0.047258156262526045, + "acc_norm": 0.33, + "acc_norm_stderr": 0.047258156262526045 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.4797687861271676, + "acc_stderr": 0.03809342081273957, + "acc_norm": 0.4797687861271676, + "acc_norm_stderr": 0.03809342081273957 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.2549019607843137, + "acc_stderr": 0.043364327079931785, + "acc_norm": 0.2549019607843137, + "acc_norm_stderr": 0.043364327079931785 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.53, + "acc_stderr": 0.05016135580465919, + "acc_norm": 0.53, + "acc_norm_stderr": 0.05016135580465919 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.46808510638297873, + "acc_stderr": 0.03261936918467382, + "acc_norm": 0.46808510638297873, + "acc_norm_stderr": 0.03261936918467382 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.38596491228070173, + "acc_stderr": 0.045796394220704334, + "acc_norm": 0.38596491228070173, + "acc_norm_stderr": 0.045796394220704334 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.42758620689655175, + "acc_stderr": 0.04122737111370332, + "acc_norm": 0.42758620689655175, + "acc_norm_stderr": 0.04122737111370332 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.29365079365079366, + "acc_stderr": 0.02345603738398203, + "acc_norm": 0.29365079365079366, + "acc_norm_stderr": 0.02345603738398203 + }, + 
"harness|hendrycksTest-formal_logic|5": { + "acc": 0.3333333333333333, + "acc_stderr": 0.04216370213557835, + "acc_norm": 0.3333333333333333, + "acc_norm_stderr": 0.04216370213557835 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.37, + "acc_stderr": 0.04852365870939099, + "acc_norm": 0.37, + "acc_norm_stderr": 0.04852365870939099 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.5935483870967742, + "acc_stderr": 0.0279417273462563, + "acc_norm": 0.5935483870967742, + "acc_norm_stderr": 0.0279417273462563 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.3842364532019704, + "acc_stderr": 0.0342239856565755, + "acc_norm": 0.3842364532019704, + "acc_norm_stderr": 0.0342239856565755 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.47, + "acc_stderr": 0.050161355804659205, + "acc_norm": 0.47, + "acc_norm_stderr": 0.050161355804659205 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.6424242424242425, + "acc_stderr": 0.037425970438065864, + "acc_norm": 0.6424242424242425, + "acc_norm_stderr": 0.037425970438065864 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.6868686868686869, + "acc_stderr": 0.033042050878136525, + "acc_norm": 0.6868686868686869, + "acc_norm_stderr": 0.033042050878136525 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.7305699481865285, + "acc_stderr": 0.03201867122877794, + "acc_norm": 0.7305699481865285, + "acc_norm_stderr": 0.03201867122877794 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.48205128205128206, + "acc_stderr": 0.025334667080954935, + "acc_norm": 0.48205128205128206, + "acc_norm_stderr": 0.025334667080954935 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.2777777777777778, + "acc_stderr": 0.027309140588230182, + "acc_norm": 0.2777777777777778, + "acc_norm_stderr": 0.027309140588230182 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.5294117647058824, + "acc_stderr": 0.03242225027115007, + "acc_norm": 0.5294117647058824, + "acc_norm_stderr": 0.03242225027115007 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.3708609271523179, + "acc_stderr": 0.03943966699183629, + "acc_norm": 0.3708609271523179, + "acc_norm_stderr": 0.03943966699183629 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.7027522935779816, + "acc_stderr": 0.019595707224643526, + "acc_norm": 0.7027522935779816, + "acc_norm_stderr": 0.019595707224643526 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.42592592592592593, + "acc_stderr": 0.03372343271653063, + "acc_norm": 0.42592592592592593, + "acc_norm_stderr": 0.03372343271653063 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.7107843137254902, + "acc_stderr": 0.031822318676475544, + "acc_norm": 0.7107843137254902, + "acc_norm_stderr": 0.031822318676475544 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.7341772151898734, + "acc_stderr": 0.02875679962965834, + "acc_norm": 0.7341772151898734, + "acc_norm_stderr": 0.02875679962965834 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.5695067264573991, + "acc_stderr": 0.0332319730294294, + "acc_norm": 0.5695067264573991, + "acc_norm_stderr": 0.0332319730294294 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.5954198473282443, + "acc_stderr": 0.043046937953806645, + "acc_norm": 0.5954198473282443, + "acc_norm_stderr": 0.043046937953806645 + }, + 
"harness|hendrycksTest-international_law|5": { + "acc": 0.7024793388429752, + "acc_stderr": 0.04173349148083499, + "acc_norm": 0.7024793388429752, + "acc_norm_stderr": 0.04173349148083499 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.6111111111111112, + "acc_stderr": 0.0471282125742677, + "acc_norm": 0.6111111111111112, + "acc_norm_stderr": 0.0471282125742677 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.588957055214724, + "acc_stderr": 0.038656978537853624, + "acc_norm": 0.588957055214724, + "acc_norm_stderr": 0.038656978537853624 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.375, + "acc_stderr": 0.04595091388086298, + "acc_norm": 0.375, + "acc_norm_stderr": 0.04595091388086298 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.7281553398058253, + "acc_stderr": 0.044052680241409216, + "acc_norm": 0.7281553398058253, + "acc_norm_stderr": 0.044052680241409216 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.7564102564102564, + "acc_stderr": 0.028120966503914414, + "acc_norm": 0.7564102564102564, + "acc_norm_stderr": 0.028120966503914414 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.54, + "acc_stderr": 0.05009082659620332, + "acc_norm": 0.54, + "acc_norm_stderr": 0.05009082659620332 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.6871008939974457, + "acc_stderr": 0.016580935940304055, + "acc_norm": 0.6871008939974457, + "acc_norm_stderr": 0.016580935940304055 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.5404624277456648, + "acc_stderr": 0.02683080599895225, + "acc_norm": 0.5404624277456648, + "acc_norm_stderr": 0.02683080599895225 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.2905027932960894, + "acc_stderr": 0.015183844307206144, + "acc_norm": 0.2905027932960894, + "acc_norm_stderr": 0.015183844307206144 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.5588235294117647, + "acc_stderr": 0.028431095444176643, + "acc_norm": 0.5588235294117647, + "acc_norm_stderr": 0.028431095444176643 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.5916398713826366, + "acc_stderr": 0.027917050748484627, + "acc_norm": 0.5916398713826366, + "acc_norm_stderr": 0.027917050748484627 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.5432098765432098, + "acc_stderr": 0.027716661650194038, + "acc_norm": 0.5432098765432098, + "acc_norm_stderr": 0.027716661650194038 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.38652482269503546, + "acc_stderr": 0.029049190342543454, + "acc_norm": 0.38652482269503546, + "acc_norm_stderr": 0.029049190342543454 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.36962190352020863, + "acc_stderr": 0.012328445778575248, + "acc_norm": 0.36962190352020863, + "acc_norm_stderr": 0.012328445778575248 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.5257352941176471, + "acc_stderr": 0.030332578094555033, + "acc_norm": 0.5257352941176471, + "acc_norm_stderr": 0.030332578094555033 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.49673202614379086, + "acc_stderr": 0.020227402794434867, + "acc_norm": 0.49673202614379086, + "acc_norm_stderr": 0.020227402794434867 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.5727272727272728, + "acc_stderr": 0.047381987035454834, + "acc_norm": 0.5727272727272728, + "acc_norm_stderr": 0.047381987035454834 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.6489795918367347, + "acc_stderr": 
0.03055531675557364, + "acc_norm": 0.6489795918367347, + "acc_norm_stderr": 0.03055531675557364 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.7064676616915423, + "acc_stderr": 0.032200241045342054, + "acc_norm": 0.7064676616915423, + "acc_norm_stderr": 0.032200241045342054 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.73, + "acc_stderr": 0.044619604333847394, + "acc_norm": 0.73, + "acc_norm_stderr": 0.044619604333847394 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.3493975903614458, + "acc_stderr": 0.0371172519074075, + "acc_norm": 0.3493975903614458, + "acc_norm_stderr": 0.0371172519074075 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.6608187134502924, + "acc_stderr": 0.03631053496488904, + "acc_norm": 0.6608187134502924, + "acc_norm_stderr": 0.03631053496488904 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.35495716034271724, + "mc1_stderr": 0.016750862381375898, + "mc2": 0.5120018406031775, + "mc2_stderr": 0.015691567132244007 + }, + "harness|winogrande|5": { + "acc": 0.7466456195737964, + "acc_stderr": 0.012223754434233618 + }, + "harness|gsm8k|5": { + "acc": 0.15314632297194844, + "acc_stderr": 0.009919728152791475 + }, + "all": { + "acc": 0.5171661173804333, + "acc_stderr": 0.03432159517380606, + "acc_norm": 0.5235171990071291, + "acc_norm_stderr": 0.03509506760640401, + "mc1": 0.35495716034271724, + "mc1_stderr": 0.016750862381375898, + "mc2": 0.5120018406031775, + "mc2_stderr": 0.015691567132244007 + } + }, + "versions": { + "all": 0, + "harness|arc:challenge|25": 0, + "harness|gsm8k|5": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + 
"harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "harness|winogrande|5": 0 + }, + "config_tasks": { + "harness|arc:challenge": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + 
"harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|arc:challenge|25": { + "hashes": { + "hash_examples": "17b0cae357c0259e", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "ca48d52265c0051f", + "hash_cont_tokens": "e8abf848493b50f7" + }, + "truncated": 0, + "non_truncated": 1172, + "padded": 4687, + "non_padded": 0, + "effective_few_shots": 25.0, + "num_truncated_few_shots": 0 + }, + "harness|hellaswag|10": { + "hashes": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "4975ded0ed31f702", + "hash_cont_tokens": "9fe0a5c42e1532db" + }, + "truncated": 0, + "non_truncated": 10042, + "padded": 40019, + "non_padded": 149, + "effective_few_shots": 10.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hashes": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "8ff523ec326d5d55", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-anatomy|5": { + "hashes": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "742bd6a389a8ef40", + "hash_cont_tokens": "f11971a765cb609f" + }, + "truncated": 0, + "non_truncated": 135, + "padded": 540, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-astronomy|5": { + "hashes": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "aa9743839c83bd9f", + "hash_cont_tokens": "440a970fadecdc7b" + }, + "truncated": 0, + "non_truncated": 152, + "padded": 608, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-business_ethics|5": { + "hashes": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "60f6ed52e2a2987a", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, 
+ "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hashes": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "6080d9f3c5930be0", + "hash_cont_tokens": "7ecd60c25b9bfe5b" + }, + "truncated": 0, + "non_truncated": 265, + "padded": 1060, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_biology|5": { + "hashes": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "873319724ad65589", + "hash_cont_tokens": "875cde3af7a0ee14" + }, + "truncated": 0, + "non_truncated": 144, + "padded": 564, + "non_padded": 12, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_chemistry|5": { + "hashes": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "8366d04d12b154a7", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_computer_science|5": { + "hashes": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "1724a282fb269fd7", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_mathematics|5": { + "hashes": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "b7aa815781eae172", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_medicine|5": { + "hashes": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "0003d13e86bc8c1a", + "hash_cont_tokens": "702fb6d82ff0d6ac" + }, + "truncated": 0, + "non_truncated": 173, + "padded": 692, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_physics|5": { + "hashes": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "32b28762dd077c78", + "hash_cont_tokens": "f7b8097afc16a47c" + }, + "truncated": 0, + "non_truncated": 102, + "padded": 404, + "non_padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-computer_security|5": { + "hashes": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "19dd0e1895125d49", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hashes": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "761c7ce187b3338a", + "hash_cont_tokens": "aa0e8bc655f2f641" + }, + "truncated": 0, + "non_truncated": 235, + "padded": 940, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-econometrics|5": { + "hashes": { + "hash_examples": 
"64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "dae74024ebc12b2b", + "hash_cont_tokens": "b1cc6e7e9fcd3827" + }, + "truncated": 0, + "non_truncated": 114, + "padded": 456, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hashes": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "5fa8050688a246ed", + "hash_cont_tokens": "2425a3f084a591ef" + }, + "truncated": 0, + "non_truncated": 145, + "padded": 580, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hashes": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "2da3f8d7d1515cc6", + "hash_cont_tokens": "bd87bf0c060fd925" + }, + "truncated": 0, + "non_truncated": 378, + "padded": 1512, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-formal_logic|5": { + "hashes": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "907de61bbe46dada", + "hash_cont_tokens": "eb8932890e0605db" + }, + "truncated": 0, + "non_truncated": 126, + "padded": 504, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-global_facts|5": { + "hashes": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "d7549fe9ac133643", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_biology|5": { + "hashes": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "b449ae8cd622fb96", + "hash_cont_tokens": "1ddcb86d28cde266" + }, + "truncated": 0, + "non_truncated": 310, + "padded": 1240, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hashes": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "a447bd1574b5e26c", + "hash_cont_tokens": "176c8dcff38c5f8f" + }, + "truncated": 0, + "non_truncated": 203, + "padded": 812, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hashes": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "56312a0c3d85ae90", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hashes": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "5002f4ac8b1562ca", + "hash_cont_tokens": "674fc454bdc5ac93" + }, + "truncated": 0, + "non_truncated": 165, + "padded": 656, + "non_padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_geography|5": { + "hashes": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "b4f9efd054b0149d", + 
"hash_cont_tokens": "03a5012b916274ea" + }, + "truncated": 0, + "non_truncated": 198, + "padded": 792, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hashes": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "6e010d01707b5a01", + "hash_cont_tokens": "873d2aab226ba1d8" + }, + "truncated": 0, + "non_truncated": 193, + "padded": 772, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hashes": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "fc1f6e824ba386d7", + "hash_cont_tokens": "c583432ad27fcfe0" + }, + "truncated": 0, + "non_truncated": 390, + "padded": 1560, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hashes": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "3a485a40c8432ece", + "hash_cont_tokens": "d7907b61bcb8c123" + }, + "truncated": 0, + "non_truncated": 270, + "padded": 1080, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hashes": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "a7dd9ca4bbda3752", + "hash_cont_tokens": "f47f041de50333b9" + }, + "truncated": 0, + "non_truncated": 238, + "padded": 952, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_physics|5": { + "hashes": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "d7ea631399a73865", + "hash_cont_tokens": "0d56317b3e5eedb5" + }, + "truncated": 0, + "non_truncated": 151, + "padded": 604, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hashes": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "d12816cf88146011", + "hash_cont_tokens": "09ba1243e7390c0f" + }, + "truncated": 0, + "non_truncated": 545, + "padded": 2180, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hashes": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "9763ecaef4814c21", + "hash_cont_tokens": "9cc29889c3d3f77d" + }, + "truncated": 0, + "non_truncated": 216, + "padded": 864, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hashes": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "c639cce12a46ebad", + "hash_cont_tokens": "cdd0b3dc06d933e5" + }, + "truncated": 0, + "non_truncated": 204, + "padded": 816, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hashes": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "b9762065cce6f3a6", + "hash_cont_tokens": "e02816433ff28daf" + }, + "truncated": 0, + "non_truncated": 
237, + "padded": 948, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_aging|5": { + "hashes": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "84157fee0b6d0f3c", + "hash_cont_tokens": "142a4a8a1138a214" + }, + "truncated": 0, + "non_truncated": 223, + "padded": 892, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_sexuality|5": { + "hashes": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "ade303e1ae3c016f", + "hash_cont_tokens": "bc54813e809b796d" + }, + "truncated": 0, + "non_truncated": 131, + "padded": 524, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-international_law|5": { + "hashes": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "e5482e1c23c23d35", + "hash_cont_tokens": "8ea8c5ff76a15bca" + }, + "truncated": 0, + "non_truncated": 121, + "padded": 484, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-jurisprudence|5": { + "hashes": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "4415eeb9bad0507b", + "hash_cont_tokens": "e3a8cd951b6e3469" + }, + "truncated": 0, + "non_truncated": 108, + "padded": 432, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hashes": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "e6b5271422ecbaa8", + "hash_cont_tokens": "3e9e0bdc248fd88a" + }, + "truncated": 0, + "non_truncated": 163, + "padded": 644, + "non_padded": 8, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-machine_learning|5": { + "hashes": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "e719cb83196977d8", + "hash_cont_tokens": "55b12fb138c6a064" + }, + "truncated": 0, + "non_truncated": 112, + "padded": 448, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-management|5": { + "hashes": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "155da0e62b39e804", + "hash_cont_tokens": "a01d6d39a83c4597" + }, + "truncated": 0, + "non_truncated": 103, + "padded": 412, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-marketing|5": { + "hashes": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "38466c242259e6d3", + "hash_cont_tokens": "6aeaed4d823c98aa" + }, + "truncated": 0, + "non_truncated": 234, + "padded": 932, + "non_padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-medical_genetics|5": { + "hashes": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "0dd129e92538a7f6", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-miscellaneous|5": { + "hashes": { + 
"hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "d108a883fc3e022f", + "hash_cont_tokens": "9b0ab02a64603081" + }, + "truncated": 0, + "non_truncated": 783, + "padded": 3132, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_disputes|5": { + "hashes": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "0e7b7df82884a2d5", + "hash_cont_tokens": "3b8bbe9108e55ce9" + }, + "truncated": 0, + "non_truncated": 346, + "padded": 1364, + "non_padded": 20, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hashes": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "7c220f5613cd8426", + "hash_cont_tokens": "3e9bfc0362e97330" + }, + "truncated": 0, + "non_truncated": 895, + "padded": 3580, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-nutrition|5": { + "hashes": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "35de1609a9a763a9", + "hash_cont_tokens": "23b2dc6ee2da4cfc" + }, + "truncated": 0, + "non_truncated": 306, + "padded": 1224, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-philosophy|5": { + "hashes": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "a1dcfa9c80490d06", + "hash_cont_tokens": "9f6ff69d23a48783" + }, + "truncated": 0, + "non_truncated": 311, + "padded": 1244, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-prehistory|5": { + "hashes": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "a091cf645d2415e0", + "hash_cont_tokens": "d6458d743d875837" + }, + "truncated": 0, + "non_truncated": 324, + "padded": 1296, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_accounting|5": { + "hashes": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "e9df32a33f85290c", + "hash_cont_tokens": "922a195f53a35662" + }, + "truncated": 0, + "non_truncated": 282, + "padded": 1128, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_law|5": { + "hashes": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "c9f7583fff66d361", + "hash_cont_tokens": "2e590029ef41fbcd" + }, + "truncated": 0, + "non_truncated": 1534, + "padded": 6136, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_medicine|5": { + "hashes": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "40a933f829116f8d", + "hash_cont_tokens": "7cfee54dbddd5a98" + }, + "truncated": 0, + "non_truncated": 272, + "padded": 1088, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_psychology|5": { + "hashes": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "0f6a92c3a2062b48", + "hash_cont_tokens": 
"a86677b2a45c20e1" + }, + "truncated": 0, + "non_truncated": 612, + "padded": 2448, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-public_relations|5": { + "hashes": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "29a08e9bfbe9b2f0", + "hash_cont_tokens": "0d756ccaae031757" + }, + "truncated": 0, + "non_truncated": 110, + "padded": 440, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-security_studies|5": { + "hashes": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "32a03f1f22a6e103", + "hash_cont_tokens": "b2229bc2cfbf594b" + }, + "truncated": 0, + "non_truncated": 245, + "padded": 980, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-sociology|5": { + "hashes": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "1de5c52d2b2831d7", + "hash_cont_tokens": "c3a3bdfd177eed5b" + }, + "truncated": 0, + "non_truncated": 201, + "padded": 800, + "non_padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hashes": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "add924961f7f4146", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-virology|5": { + "hashes": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "e0653601c466b1bc", + "hash_cont_tokens": "af8b3658088cb37f" + }, + "truncated": 0, + "non_truncated": 166, + "padded": 664, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-world_religions|5": { + "hashes": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "ac600d612445156d", + "hash_cont_tokens": "060118bef6de4e0a" + }, + "truncated": 0, + "non_truncated": 171, + "padded": 684, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|truthfulqa:mc|0": { + "hashes": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "a03ce28b7fd06aa7", + "hash_cont_tokens": "f5da56a132aab151" + }, + "truncated": 0, + "non_truncated": 817, + "padded": 9996, + "non_padded": 0, + "effective_few_shots": 0.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "72067255e368e24e", + "hash_cont_tokens": "f08975ad6f2d5864" + }, + "truncated": 0, + "non_truncated": 1267, + "padded": 2534, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "bda342e47b5099b2", + "hash_cont_tokens": "89f938cb253e7ded" + }, + "truncated": 0, + "non_truncated": 1319, + "padded": 0, + "non_padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": 
"3b7fa57a057f9415", + "hash_full_prompts": "63615fc50fc9417c", + "hash_input_tokens": "a8fa53915153e1db", + "hash_cont_tokens": "8384db027c9676e0" + }, + "truncated": 0, + "non_truncated": 28659, + "padded": 113348, + "non_padded": 1524, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/TheBloke/robin-13B-v2-fp16/results_2023-07-31T15-48-06.598529.json b/eval-results/TheBloke/robin-13B-v2-fp16/results_2023-07-31T15-48-06.598529.json new file mode 100644 index 0000000000000000000000000000000000000000..c6d0819f5fc13970858f5f457cc16a51e8223314 --- /dev/null +++ b/eval-results/TheBloke/robin-13B-v2-fp16/results_2023-07-31T15-48-06.598529.json @@ -0,0 +1,1365 @@ +{ + "results": { + "harness|arc:challenge|25": { + "acc": 0.5401023890784983, + "acc_stderr": 0.01456431885692485, + "acc_norm": 0.5648464163822525, + "acc_norm_stderr": 0.014487986197186045 + }, + "harness|hellaswag|10": { + "acc": 0.5945030870344553, + "acc_stderr": 0.004899845087183104, + "acc_norm": 0.8037243576976698, + "acc_norm_stderr": 0.003963677261161229 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.33, + "acc_stderr": 0.04725815626252606, + "acc_norm": 0.33, + "acc_norm_stderr": 0.04725815626252606 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.4666666666666667, + "acc_stderr": 0.043097329010363554, + "acc_norm": 0.4666666666666667, + "acc_norm_stderr": 0.043097329010363554 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.4868421052631579, + "acc_stderr": 0.04067533136309173, + "acc_norm": 0.4868421052631579, + "acc_norm_stderr": 0.04067533136309173 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.45, + "acc_stderr": 0.05, + "acc_norm": 0.45, + "acc_norm_stderr": 0.05 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.4679245283018868, + "acc_stderr": 0.03070948699255655, + "acc_norm": 0.4679245283018868, + "acc_norm_stderr": 0.03070948699255655 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.4722222222222222, + "acc_stderr": 0.04174752578923185, + "acc_norm": 0.4722222222222222, + "acc_norm_stderr": 0.04174752578923185 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.24, + "acc_stderr": 0.04292346959909284, + "acc_norm": 0.24, + "acc_norm_stderr": 0.04292346959909284 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.38, + "acc_stderr": 0.04878317312145632, + "acc_norm": 0.38, + "acc_norm_stderr": 0.04878317312145632 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.31, + "acc_stderr": 0.04648231987117317, + "acc_norm": 0.31, + "acc_norm_stderr": 0.04648231987117317 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.44508670520231214, + "acc_stderr": 0.03789401760283646, + "acc_norm": 0.44508670520231214, + "acc_norm_stderr": 0.03789401760283646 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.17647058823529413, + "acc_stderr": 0.0379328118530781, + "acc_norm": 0.17647058823529413, + "acc_norm_stderr": 0.0379328118530781 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.62, + "acc_stderr": 0.048783173121456316, + "acc_norm": 0.62, + "acc_norm_stderr": 0.048783173121456316 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.4, + "acc_stderr": 0.03202563076101735, + "acc_norm": 0.4, + "acc_norm_stderr": 0.03202563076101735 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.30701754385964913, + "acc_stderr": 0.04339138322579861, + "acc_norm": 0.30701754385964913, + "acc_norm_stderr": 
0.04339138322579861 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.4068965517241379, + "acc_stderr": 0.04093793981266237, + "acc_norm": 0.4068965517241379, + "acc_norm_stderr": 0.04093793981266237 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.25925925925925924, + "acc_stderr": 0.02256989707491841, + "acc_norm": 0.25925925925925924, + "acc_norm_stderr": 0.02256989707491841 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.31746031746031744, + "acc_stderr": 0.04163453031302859, + "acc_norm": 0.31746031746031744, + "acc_norm_stderr": 0.04163453031302859 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.38, + "acc_stderr": 0.04878317312145632, + "acc_norm": 0.38, + "acc_norm_stderr": 0.04878317312145632 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.49032258064516127, + "acc_stderr": 0.028438677998909558, + "acc_norm": 0.49032258064516127, + "acc_norm_stderr": 0.028438677998909558 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.32019704433497537, + "acc_stderr": 0.032826493853041504, + "acc_norm": 0.32019704433497537, + "acc_norm_stderr": 0.032826493853041504 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.47, + "acc_stderr": 0.05016135580465919, + "acc_norm": 0.47, + "acc_norm_stderr": 0.05016135580465919 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.6303030303030303, + "acc_stderr": 0.037694303145125674, + "acc_norm": 0.6303030303030303, + "acc_norm_stderr": 0.037694303145125674 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.5606060606060606, + "acc_stderr": 0.03536085947529479, + "acc_norm": 0.5606060606060606, + "acc_norm_stderr": 0.03536085947529479 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.6683937823834197, + "acc_stderr": 0.03397636541089118, + "acc_norm": 0.6683937823834197, + "acc_norm_stderr": 0.03397636541089118 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.44871794871794873, + "acc_stderr": 0.025217315184846482, + "acc_norm": 0.44871794871794873, + "acc_norm_stderr": 0.025217315184846482 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.23333333333333334, + "acc_stderr": 0.02578787422095932, + "acc_norm": 0.23333333333333334, + "acc_norm_stderr": 0.02578787422095932 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.4411764705882353, + "acc_stderr": 0.0322529423239964, + "acc_norm": 0.4411764705882353, + "acc_norm_stderr": 0.0322529423239964 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.2781456953642384, + "acc_stderr": 0.03658603262763743, + "acc_norm": 0.2781456953642384, + "acc_norm_stderr": 0.03658603262763743 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.6605504587155964, + "acc_stderr": 0.02030210934266235, + "acc_norm": 0.6605504587155964, + "acc_norm_stderr": 0.02030210934266235 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.30092592592592593, + "acc_stderr": 0.03128039084329882, + "acc_norm": 0.30092592592592593, + "acc_norm_stderr": 0.03128039084329882 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.6274509803921569, + "acc_stderr": 0.03393388584958404, + "acc_norm": 0.6274509803921569, + "acc_norm_stderr": 0.03393388584958404 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.7215189873417721, + "acc_stderr": 0.029178682304842544, + "acc_norm": 0.7215189873417721, + 
"acc_norm_stderr": 0.029178682304842544 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.5695067264573991, + "acc_stderr": 0.033231973029429394, + "acc_norm": 0.5695067264573991, + "acc_norm_stderr": 0.033231973029429394 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.6106870229007634, + "acc_stderr": 0.04276486542814591, + "acc_norm": 0.6106870229007634, + "acc_norm_stderr": 0.04276486542814591 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.7024793388429752, + "acc_stderr": 0.04173349148083499, + "acc_norm": 0.7024793388429752, + "acc_norm_stderr": 0.04173349148083499 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.5740740740740741, + "acc_stderr": 0.0478034362693679, + "acc_norm": 0.5740740740740741, + "acc_norm_stderr": 0.0478034362693679 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.5828220858895705, + "acc_stderr": 0.03874102859818081, + "acc_norm": 0.5828220858895705, + "acc_norm_stderr": 0.03874102859818081 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.5089285714285714, + "acc_stderr": 0.04745033255489122, + "acc_norm": 0.5089285714285714, + "acc_norm_stderr": 0.04745033255489122 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.6407766990291263, + "acc_stderr": 0.047504583990416946, + "acc_norm": 0.6407766990291263, + "acc_norm_stderr": 0.047504583990416946 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.7521367521367521, + "acc_stderr": 0.0282863240755644, + "acc_norm": 0.7521367521367521, + "acc_norm_stderr": 0.0282863240755644 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.56, + "acc_stderr": 0.04988876515698589, + "acc_norm": 0.56, + "acc_norm_stderr": 0.04988876515698589 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.6883780332056194, + "acc_stderr": 0.016562433867284176, + "acc_norm": 0.6883780332056194, + "acc_norm_stderr": 0.016562433867284176 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.5, + "acc_stderr": 0.026919095102908273, + "acc_norm": 0.5, + "acc_norm_stderr": 0.026919095102908273 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.25027932960893856, + "acc_stderr": 0.01448750085285041, + "acc_norm": 0.25027932960893856, + "acc_norm_stderr": 0.01448750085285041 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.5065359477124183, + "acc_stderr": 0.028627470550556047, + "acc_norm": 0.5065359477124183, + "acc_norm_stderr": 0.028627470550556047 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.5337620578778135, + "acc_stderr": 0.028333277109562786, + "acc_norm": 0.5337620578778135, + "acc_norm_stderr": 0.028333277109562786 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.5524691358024691, + "acc_stderr": 0.02766713856942271, + "acc_norm": 0.5524691358024691, + "acc_norm_stderr": 0.02766713856942271 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.37943262411347517, + "acc_stderr": 0.028947338851614105, + "acc_norm": 0.37943262411347517, + "acc_norm_stderr": 0.028947338851614105 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.4211212516297262, + "acc_stderr": 0.012610325733489903, + "acc_norm": 0.4211212516297262, + "acc_norm_stderr": 0.012610325733489903 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.5147058823529411, + "acc_stderr": 0.03035969707904612, + "acc_norm": 0.5147058823529411, + "acc_norm_stderr": 0.03035969707904612 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.48366013071895425, + 
"acc_stderr": 0.020217030653186453, + "acc_norm": 0.48366013071895425, + "acc_norm_stderr": 0.020217030653186453 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.5636363636363636, + "acc_stderr": 0.04750185058907296, + "acc_norm": 0.5636363636363636, + "acc_norm_stderr": 0.04750185058907296 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.5551020408163265, + "acc_stderr": 0.031814251181977865, + "acc_norm": 0.5551020408163265, + "acc_norm_stderr": 0.031814251181977865 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.6567164179104478, + "acc_stderr": 0.03357379665433431, + "acc_norm": 0.6567164179104478, + "acc_norm_stderr": 0.03357379665433431 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.78, + "acc_stderr": 0.04163331998932264, + "acc_norm": 0.78, + "acc_norm_stderr": 0.04163331998932264 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.4578313253012048, + "acc_stderr": 0.038786267710023595, + "acc_norm": 0.4578313253012048, + "acc_norm_stderr": 0.038786267710023595 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.695906432748538, + "acc_stderr": 0.0352821125824523, + "acc_norm": 0.695906432748538, + "acc_norm_stderr": 0.0352821125824523 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.34149326805385555, + "mc1_stderr": 0.016600688619950826, + "mc2": 0.5063100731922137, + "mc2_stderr": 0.014760623429029368 + }, + "all": { + "acc": 0.49056004249413854, + "acc_stderr": 0.034895228964178376, + "acc_norm": 0.49452555601900244, + "acc_norm_stderr": 0.03487806793899599, + "mc1": 0.34149326805385555, + "mc1_stderr": 0.016600688619950826, + "mc2": 0.5063100731922137, + "mc2_stderr": 0.014760623429029368 + } + }, + "versions": { + "harness|arc:challenge|25": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 
1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "all": 0 + }, + "config_general": { + "model_name": "TheBloke/robin-13B-v2-fp16", + "model_sha": "f4dd8fc4440ed84fcf3ff1122f2b7f6024cca29d", + "model_dtype": "torch.float16", + "lighteval_sha": "03c2fad20ff7f5334c33cfee459024b8d7e4a109", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null + }, + "config_tasks": { + "harness|arc:challenge": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": 
"LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task" + }, + "summary_tasks": { + "harness|arc:challenge|25": { + "hashes": { + "hash_examples": "17b0cae357c0259e", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "2b0e07d4cdd3b0fe", + "hash_cont_tokens": "52204555b6e39a6e" + }, + "truncated": 0, + "non-truncated": 4687, + "padded": 4687, + "non-padded": 0, + "effective_few_shots": 25.0, + "num_truncated_few_shots": 0 + }, + "harness|hellaswag|10": { + "hashes": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "578edd77107cb2c3", + "hash_cont_tokens": "25c49737526d9f80" + }, + "truncated": 0, + "non-truncated": 40168, + "padded": 40113, + "non-padded": 55, + "effective_few_shots": 10.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hashes": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "6a95a1511f8da075", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-anatomy|5": { + "hashes": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "24a78edc4d9a93aa", + "hash_cont_tokens": "f11971a765cb609f" + }, + "truncated": 0, + "non-truncated": 540, + "padded": 540, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-astronomy|5": { + "hashes": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "b11106668d6c0974", + "hash_cont_tokens": "ebed26cf74a85815" + }, + "truncated": 0, + "non-truncated": 608, + "padded": 608, + "non-padded": 0, + 
"effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-business_ethics|5": { + "hashes": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "10180ba12a075cb0", + "hash_cont_tokens": "6898ac348a7ae442" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hashes": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "73351ef4968750a2", + "hash_cont_tokens": "34a058958a45af94" + }, + "truncated": 0, + "non-truncated": 1060, + "padded": 1060, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_biology|5": { + "hashes": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "a539150af234c668", + "hash_cont_tokens": "875cde3af7a0ee14" + }, + "truncated": 0, + "non-truncated": 576, + "padded": 576, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_chemistry|5": { + "hashes": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "52e12e5a43bcee35", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_computer_science|5": { + "hashes": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "d1f3721a5659f7ee", + "hash_cont_tokens": "da408cb12ab08288" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_mathematics|5": { + "hashes": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "f2d78f546b5595c2", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_medicine|5": { + "hashes": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "c9cc19179f63d1d6", + "hash_cont_tokens": "370a1a0ab68d15cd" + }, + "truncated": 0, + "non-truncated": 692, + "padded": 692, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_physics|5": { + "hashes": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "5046144e67e992e8", + "hash_cont_tokens": "f7b8097afc16a47c" + }, + "truncated": 0, + "non-truncated": 408, + "padded": 408, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-computer_security|5": { + "hashes": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "4b14581ba4fc06fc", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hashes": { + "hash_examples": 
"8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "1ee52c413b5b4cc4", + "hash_cont_tokens": "aa0e8bc655f2f641" + }, + "truncated": 0, + "non-truncated": 940, + "padded": 940, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-econometrics|5": { + "hashes": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "2914077c4dd3090a", + "hash_cont_tokens": "80dea4d59245cf01" + }, + "truncated": 0, + "non-truncated": 456, + "padded": 456, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hashes": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "0f88a874342378de", + "hash_cont_tokens": "2425a3f084a591ef" + }, + "truncated": 0, + "non-truncated": 580, + "padded": 580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hashes": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "9889933f1dd02a23", + "hash_cont_tokens": "309bef1803097408" + }, + "truncated": 0, + "non-truncated": 1512, + "padded": 1512, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-formal_logic|5": { + "hashes": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "dc309a94c4bfdd2f", + "hash_cont_tokens": "5105a3bd1b39b785" + }, + "truncated": 0, + "non-truncated": 504, + "padded": 504, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-global_facts|5": { + "hashes": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "0801a0aebec3ba8c", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_biology|5": { + "hashes": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "5bc4aca8831d9c05", + "hash_cont_tokens": "205c5deee1581b02" + }, + "truncated": 0, + "non-truncated": 1240, + "padded": 1240, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hashes": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "b92bd6b06fc3464c", + "hash_cont_tokens": "272d28867e0ff046" + }, + "truncated": 0, + "non-truncated": 812, + "padded": 812, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hashes": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "a549346cde8165e9", + "hash_cont_tokens": "98b3bf311aa83f0d" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hashes": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "f1f73dd687da18d7", + 
"hash_cont_tokens": "674fc454bdc5ac93" + }, + "truncated": 660, + "non-truncated": 0, + "padded": 0, + "non-padded": 660, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_geography|5": { + "hashes": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "e7e9cf91f9d6a081", + "hash_cont_tokens": "03a5012b916274ea" + }, + "truncated": 0, + "non-truncated": 792, + "padded": 792, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hashes": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "a61a1670f854d9e1", + "hash_cont_tokens": "d9e66fc7c702b795" + }, + "truncated": 0, + "non-truncated": 772, + "padded": 772, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hashes": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "8a77cb7763f28110", + "hash_cont_tokens": "c583432ad27fcfe0" + }, + "truncated": 0, + "non-truncated": 1560, + "padded": 1560, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hashes": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "fcfcfae391f8faa1", + "hash_cont_tokens": "d4b1936084c060e0" + }, + "truncated": 0, + "non-truncated": 1080, + "padded": 1080, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hashes": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "a29454cc1feb23ef", + "hash_cont_tokens": "f47f041de50333b9" + }, + "truncated": 0, + "non-truncated": 952, + "padded": 952, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_physics|5": { + "hashes": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "b6734a25556d75dc", + "hash_cont_tokens": "2bf9921a39e901d9" + }, + "truncated": 0, + "non-truncated": 604, + "padded": 604, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hashes": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "5720438e29473426", + "hash_cont_tokens": "cab8b16be9576360" + }, + "truncated": 0, + "non-truncated": 2180, + "padded": 2180, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hashes": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "486321d5858de240", + "hash_cont_tokens": "1c34fbe5a59f1ed1" + }, + "truncated": 0, + "non-truncated": 864, + "padded": 864, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hashes": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "50c9ff438c85a69e", + "hash_cont_tokens": "cdd0b3dc06d933e5" + }, + "truncated": 816, + "non-truncated": 
0, + "padded": 0, + "non-padded": 816, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hashes": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "473919e64d1b8c80", + "hash_cont_tokens": "ebd714885a59ef55" + }, + "truncated": 8, + "non-truncated": 940, + "padded": 940, + "non-padded": 8, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_aging|5": { + "hashes": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "47a65c81fd7ed010", + "hash_cont_tokens": "142a4a8a1138a214" + }, + "truncated": 0, + "non-truncated": 892, + "padded": 892, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_sexuality|5": { + "hashes": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "aedfcd41cbd2fcc9", + "hash_cont_tokens": "bc54813e809b796d" + }, + "truncated": 0, + "non-truncated": 524, + "padded": 524, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-international_law|5": { + "hashes": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "ed5f2414144d7b72", + "hash_cont_tokens": "aac52fa6a519223b" + }, + "truncated": 0, + "non-truncated": 484, + "padded": 484, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-jurisprudence|5": { + "hashes": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "692eaacb5b747264", + "hash_cont_tokens": "e3a8cd951b6e3469" + }, + "truncated": 0, + "non-truncated": 432, + "padded": 432, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hashes": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "2cbce4edca937588", + "hash_cont_tokens": "697179a0dd47c5c0" + }, + "truncated": 0, + "non-truncated": 652, + "padded": 648, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-machine_learning|5": { + "hashes": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "c2f38b19bab1aa2c", + "hash_cont_tokens": "9b19898e3ecb527f" + }, + "truncated": 0, + "non-truncated": 448, + "padded": 448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-management|5": { + "hashes": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "fde277bc547bc3d8", + "hash_cont_tokens": "a01d6d39a83c4597" + }, + "truncated": 0, + "non-truncated": 412, + "padded": 412, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-marketing|5": { + "hashes": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "87b232bbebce39db", + "hash_cont_tokens": "6aeaed4d823c98aa" + }, + "truncated": 0, + "non-truncated": 936, + "padded": 936, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-medical_genetics|5": { + "hashes": { + 
"hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "58c21af9da3e126e", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-miscellaneous|5": { + "hashes": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "d1f5c770d368e9c6", + "hash_cont_tokens": "9b0ab02a64603081" + }, + "truncated": 0, + "non-truncated": 3132, + "padded": 3132, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_disputes|5": { + "hashes": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "98d6db15a50aaa8e", + "hash_cont_tokens": "1e30d7dedc7588c0" + }, + "truncated": 0, + "non-truncated": 1384, + "padded": 1354, + "non-padded": 30, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hashes": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "2aabd8c7337502f8", + "hash_cont_tokens": "ceee291786cbb123" + }, + "truncated": 0, + "non-truncated": 3580, + "padded": 3580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-nutrition|5": { + "hashes": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "17f8c8f2d4a0a9b1", + "hash_cont_tokens": "484df4c25a5460bd" + }, + "truncated": 0, + "non-truncated": 1224, + "padded": 1224, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-philosophy|5": { + "hashes": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "dfc6df491d991966", + "hash_cont_tokens": "9f6ff69d23a48783" + }, + "truncated": 0, + "non-truncated": 1244, + "padded": 1244, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-prehistory|5": { + "hashes": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "cffe8139e00da9dd", + "hash_cont_tokens": "85a9de6c685b7035" + }, + "truncated": 0, + "non-truncated": 1296, + "padded": 1296, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_accounting|5": { + "hashes": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "4a69ed6ee55918fb", + "hash_cont_tokens": "ad7b5a040535bdcf" + }, + "truncated": 0, + "non-truncated": 1128, + "padded": 1128, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_law|5": { + "hashes": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "6cc713f12b5890de", + "hash_cont_tokens": "2e590029ef41fbcd" + }, + "truncated": 604, + "non-truncated": 5532, + "padded": 5524, + "non-padded": 612, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_medicine|5": { + "hashes": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "b4044fc92756c377", + "hash_cont_tokens": 
"0b7b5aaef574dc78" + }, + "truncated": 0, + "non-truncated": 1088, + "padded": 1088, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_psychology|5": { + "hashes": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "b019784da8db089a", + "hash_cont_tokens": "63a651778e8d72d2" + }, + "truncated": 0, + "non-truncated": 2448, + "padded": 2448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-public_relations|5": { + "hashes": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "f47f37c7c9bfc601", + "hash_cont_tokens": "841583ab707b25d7" + }, + "truncated": 0, + "non-truncated": 440, + "padded": 440, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-security_studies|5": { + "hashes": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "4d282718d6142410", + "hash_cont_tokens": "9c2c01d3214f66b8" + }, + "truncated": 0, + "non-truncated": 980, + "padded": 980, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-sociology|5": { + "hashes": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "fbc6026e500537bc", + "hash_cont_tokens": "c3a3bdfd177eed5b" + }, + "truncated": 0, + "non-truncated": 804, + "padded": 804, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hashes": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "150dd1ff81ff642e", + "hash_cont_tokens": "96353c5969a9028a" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-virology|5": { + "hashes": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "fcbac3e735545969", + "hash_cont_tokens": "a1f8901800ac9b68" + }, + "truncated": 0, + "non-truncated": 664, + "padded": 664, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-world_religions|5": { + "hashes": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "ffc962a38441ef13", + "hash_cont_tokens": "08c0be345e5f1c12" + }, + "truncated": 0, + "non-truncated": 684, + "padded": 684, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|truthfulqa:mc|0": { + "hashes": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "9ffb65d225ae550f", + "hash_cont_tokens": "16c760a491c6f26e" + }, + "truncated": 0, + "non-truncated": 9996, + "padded": 9996, + "non-padded": 0, + "effective_few_shots": 0.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "d84d18e9a963753d", + "hash_full_prompts": "12b540783521a8e6", + "hash_input_tokens": "1c61d6705b299f5c", + "hash_cont_tokens": "868d6f1055fbd51d" + }, + "total_evaluation_time_secondes": "3740.7513077259064", + "truncated": 2088, + "non-truncated": 108931, + "padded": 108834, + "non-padded": 2185, + "num_truncated_few_shots": 
0 + } +} \ No newline at end of file diff --git a/eval-results/TheBloke/robin-33B-v2-GPTQ/results_2023-08-22T13-23-21.800878.json b/eval-results/TheBloke/robin-33B-v2-GPTQ/results_2023-08-22T13-23-21.800878.json new file mode 100644 index 0000000000000000000000000000000000000000..8e30a30038cd72866aa88ec94f53dd55ae9e3b7d --- /dev/null +++ b/eval-results/TheBloke/robin-33B-v2-GPTQ/results_2023-08-22T13-23-21.800878.json @@ -0,0 +1,1365 @@ +{ + "results": { + "harness|arc:challenge|25": { + "acc": 0.23122866894197952, + "acc_stderr": 0.012320858834772278, + "acc_norm": 0.2773037542662116, + "acc_norm_stderr": 0.013082095839059374 + }, + "harness|hellaswag|10": { + "acc": 0.2546305516829317, + "acc_stderr": 0.0043476298890409385, + "acc_norm": 0.26289583748257317, + "acc_norm_stderr": 0.004393066760916822 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.18, + "acc_stderr": 0.03861229196653695, + "acc_norm": 0.18, + "acc_norm_stderr": 0.03861229196653695 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.23703703703703705, + "acc_stderr": 0.03673731683969506, + "acc_norm": 0.23703703703703705, + "acc_norm_stderr": 0.03673731683969506 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.23026315789473684, + "acc_stderr": 0.034260594244031654, + "acc_norm": 0.23026315789473684, + "acc_norm_stderr": 0.034260594244031654 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.28, + "acc_stderr": 0.04512608598542126, + "acc_norm": 0.28, + "acc_norm_stderr": 0.04512608598542126 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.2641509433962264, + "acc_stderr": 0.02713429162874172, + "acc_norm": 0.2641509433962264, + "acc_norm_stderr": 0.02713429162874172 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.25, + "acc_stderr": 0.03621034121889507, + "acc_norm": 0.25, + "acc_norm_stderr": 0.03621034121889507 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.23, + "acc_stderr": 0.04229525846816506, + "acc_norm": 0.23, + "acc_norm_stderr": 0.04229525846816506 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.24, + "acc_stderr": 0.04292346959909283, + "acc_norm": 0.24, + "acc_norm_stderr": 0.04292346959909283 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.21, + "acc_stderr": 0.040936018074033256, + "acc_norm": 0.21, + "acc_norm_stderr": 0.040936018074033256 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.2658959537572254, + "acc_stderr": 0.033687629322594295, + "acc_norm": 0.2658959537572254, + "acc_norm_stderr": 0.033687629322594295 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.3235294117647059, + "acc_stderr": 0.046550104113196177, + "acc_norm": 0.3235294117647059, + "acc_norm_stderr": 0.046550104113196177 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.24, + "acc_stderr": 0.04292346959909284, + "acc_norm": 0.24, + "acc_norm_stderr": 0.04292346959909284 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.18723404255319148, + "acc_stderr": 0.025501588341883603, + "acc_norm": 0.18723404255319148, + "acc_norm_stderr": 0.025501588341883603 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.2982456140350877, + "acc_stderr": 0.04303684033537315, + "acc_norm": 0.2982456140350877, + "acc_norm_stderr": 0.04303684033537315 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.1724137931034483, + "acc_stderr": 0.03147830790259574, + "acc_norm": 0.1724137931034483, + "acc_norm_stderr": 0.03147830790259574 + }, + 
"harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.23809523809523808, + "acc_stderr": 0.021935878081184756, + "acc_norm": 0.23809523809523808, + "acc_norm_stderr": 0.021935878081184756 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.23015873015873015, + "acc_stderr": 0.037649508797906045, + "acc_norm": 0.23015873015873015, + "acc_norm_stderr": 0.037649508797906045 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.17, + "acc_stderr": 0.0377525168068637, + "acc_norm": 0.17, + "acc_norm_stderr": 0.0377525168068637 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.22903225806451613, + "acc_stderr": 0.023904914311782658, + "acc_norm": 0.22903225806451613, + "acc_norm_stderr": 0.023904914311782658 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.1921182266009852, + "acc_stderr": 0.02771931570961477, + "acc_norm": 0.1921182266009852, + "acc_norm_stderr": 0.02771931570961477 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.22, + "acc_stderr": 0.041633319989322674, + "acc_norm": 0.22, + "acc_norm_stderr": 0.041633319989322674 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.21818181818181817, + "acc_stderr": 0.03225078108306289, + "acc_norm": 0.21818181818181817, + "acc_norm_stderr": 0.03225078108306289 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.23737373737373738, + "acc_stderr": 0.030313710538198892, + "acc_norm": 0.23737373737373738, + "acc_norm_stderr": 0.030313710538198892 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.2538860103626943, + "acc_stderr": 0.03141024780565319, + "acc_norm": 0.2538860103626943, + "acc_norm_stderr": 0.03141024780565319 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.29743589743589743, + "acc_stderr": 0.02317740813146595, + "acc_norm": 0.29743589743589743, + "acc_norm_stderr": 0.02317740813146595 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.2074074074074074, + "acc_stderr": 0.024720713193952158, + "acc_norm": 0.2074074074074074, + "acc_norm_stderr": 0.024720713193952158 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.29411764705882354, + "acc_stderr": 0.029597329730978082, + "acc_norm": 0.29411764705882354, + "acc_norm_stderr": 0.029597329730978082 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.24503311258278146, + "acc_stderr": 0.03511807571804725, + "acc_norm": 0.24503311258278146, + "acc_norm_stderr": 0.03511807571804725 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.27706422018348625, + "acc_stderr": 0.01918848259016954, + "acc_norm": 0.27706422018348625, + "acc_norm_stderr": 0.01918848259016954 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.17592592592592593, + "acc_stderr": 0.025967420958258533, + "acc_norm": 0.17592592592592593, + "acc_norm_stderr": 0.025967420958258533 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.23529411764705882, + "acc_stderr": 0.029771775228145638, + "acc_norm": 0.23529411764705882, + "acc_norm_stderr": 0.029771775228145638 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.21518987341772153, + "acc_stderr": 0.02675082699467617, + "acc_norm": 0.21518987341772153, + "acc_norm_stderr": 0.02675082699467617 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.19282511210762332, + "acc_stderr": 0.02647824096048936, + "acc_norm": 0.19282511210762332, + 
"acc_norm_stderr": 0.02647824096048936 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.2824427480916031, + "acc_stderr": 0.03948406125768361, + "acc_norm": 0.2824427480916031, + "acc_norm_stderr": 0.03948406125768361 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.2809917355371901, + "acc_stderr": 0.041032038305145124, + "acc_norm": 0.2809917355371901, + "acc_norm_stderr": 0.041032038305145124 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.24074074074074073, + "acc_stderr": 0.041331194402438404, + "acc_norm": 0.24074074074074073, + "acc_norm_stderr": 0.041331194402438404 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.22699386503067484, + "acc_stderr": 0.032910995786157686, + "acc_norm": 0.22699386503067484, + "acc_norm_stderr": 0.032910995786157686 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.26785714285714285, + "acc_stderr": 0.04203277291467764, + "acc_norm": 0.26785714285714285, + "acc_norm_stderr": 0.04203277291467764 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.24271844660194175, + "acc_stderr": 0.042450224863844935, + "acc_norm": 0.24271844660194175, + "acc_norm_stderr": 0.042450224863844935 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.23504273504273504, + "acc_stderr": 0.027778835904935427, + "acc_norm": 0.23504273504273504, + "acc_norm_stderr": 0.027778835904935427 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.25, + "acc_stderr": 0.04351941398892446, + "acc_norm": 0.25, + "acc_norm_stderr": 0.04351941398892446 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.21328224776500637, + "acc_stderr": 0.014648172749593522, + "acc_norm": 0.21328224776500637, + "acc_norm_stderr": 0.014648172749593522 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.2398843930635838, + "acc_stderr": 0.022989592543123563, + "acc_norm": 0.2398843930635838, + "acc_norm_stderr": 0.022989592543123563 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.2335195530726257, + "acc_stderr": 0.01414957534897625, + "acc_norm": 0.2335195530726257, + "acc_norm_stderr": 0.01414957534897625 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.22549019607843138, + "acc_stderr": 0.02392915551735129, + "acc_norm": 0.22549019607843138, + "acc_norm_stderr": 0.02392915551735129 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.21221864951768488, + "acc_stderr": 0.023222756797435122, + "acc_norm": 0.21221864951768488, + "acc_norm_stderr": 0.023222756797435122 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.19135802469135801, + "acc_stderr": 0.02188770461339615, + "acc_norm": 0.19135802469135801, + "acc_norm_stderr": 0.02188770461339615 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.23404255319148937, + "acc_stderr": 0.025257861359432403, + "acc_norm": 0.23404255319148937, + "acc_norm_stderr": 0.025257861359432403 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.25358539765319427, + "acc_stderr": 0.011111715336101132, + "acc_norm": 0.25358539765319427, + "acc_norm_stderr": 0.011111715336101132 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.21691176470588236, + "acc_stderr": 0.02503584522771127, + "acc_norm": 0.21691176470588236, + "acc_norm_stderr": 0.02503584522771127 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.24509803921568626, + "acc_stderr": 0.017401816711427657, + "acc_norm": 0.24509803921568626, + "acc_norm_stderr": 0.017401816711427657 + }, + 
"harness|hendrycksTest-public_relations|5": { + "acc": 0.21818181818181817, + "acc_stderr": 0.03955932861795833, + "acc_norm": 0.21818181818181817, + "acc_norm_stderr": 0.03955932861795833 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.17551020408163265, + "acc_stderr": 0.024352800722970018, + "acc_norm": 0.17551020408163265, + "acc_norm_stderr": 0.024352800722970018 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.24875621890547264, + "acc_stderr": 0.030567675938916707, + "acc_norm": 0.24875621890547264, + "acc_norm_stderr": 0.030567675938916707 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.28, + "acc_stderr": 0.04512608598542129, + "acc_norm": 0.28, + "acc_norm_stderr": 0.04512608598542129 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.2469879518072289, + "acc_stderr": 0.03357351982064537, + "acc_norm": 0.2469879518072289, + "acc_norm_stderr": 0.03357351982064537 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.21052631578947367, + "acc_stderr": 0.031267817146631786, + "acc_norm": 0.21052631578947367, + "acc_norm_stderr": 0.031267817146631786 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.2423500611995104, + "mc1_stderr": 0.015000674373570345, + "mc2": 0.49536982988840905, + "mc2_stderr": 0.016949260989828546 + }, + "all": { + "acc": 0.23552388556599335, + "acc_stderr": 0.030915991946675134, + "acc_norm": 0.23644490880538102, + "acc_norm_stderr": 0.030929664385254164, + "mc1": 0.2423500611995104, + "mc1_stderr": 0.015000674373570345, + "mc2": 0.49536982988840905, + "mc2_stderr": 0.016949260989828546 + } + }, + "versions": { + "harness|arc:challenge|25": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + 
"harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "all": 0 + }, + "config_general": { + "model_name": "TheBloke/robin-33B-v2-GPTQ", + "model_sha": "4c2588d65302e9ca634548ed81e8650fb2975686", + "model_dtype": "torch.float16", + "lighteval_sha": "9a6ba3212080b87510982c30fdec55b87dcab0c7", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null + }, + "config_tasks": { + "harness|arc:challenge": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + 
"harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task" + }, + "summary_tasks": { + "harness|arc:challenge|25": { + "hashes": { + "hash_examples": "17b0cae357c0259e", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "61571bf68d6d89aa", + "hash_cont_tokens": "8210decc6ff6f7df" + }, + "truncated": 0, + "non-truncated": 4687, + "padded": 4687, + "non-padded": 0, + "effective_few_shots": 25.0, + "num_truncated_few_shots": 0 + }, + "harness|hellaswag|10": { + "hashes": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "29906669b1c7054a", + "hash_cont_tokens": "b3b9e9017afa63af" + }, + "truncated": 0, + "non-truncated": 40168, + "padded": 40113, + "non-padded": 55, + "effective_few_shots": 10.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hashes": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "c54ff61ad0273dd7", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-anatomy|5": { + "hashes": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "be31a1e22aef5f90", + "hash_cont_tokens": "f11971a765cb609f" + }, + "truncated": 0, + "non-truncated": 540, + "padded": 540, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-astronomy|5": { + "hashes": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "277a7b1fad566940", + "hash_cont_tokens": "bf30e5d3f48250cb" + }, + "truncated": 0, + "non-truncated": 608, + "padded": 608, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + 
"harness|hendrycksTest-business_ethics|5": { + "hashes": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "ba552605bc116de5", + "hash_cont_tokens": "bc1dd9b2d995eb61" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hashes": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "428c7563d0b98ab9", + "hash_cont_tokens": "890a119624b3b935" + }, + "truncated": 0, + "non-truncated": 1060, + "padded": 1060, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_biology|5": { + "hashes": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "da036601573942e2", + "hash_cont_tokens": "875cde3af7a0ee14" + }, + "truncated": 0, + "non-truncated": 576, + "padded": 576, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_chemistry|5": { + "hashes": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "94e0196d6aded13d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_computer_science|5": { + "hashes": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "6e4d0f4a8d36690b", + "hash_cont_tokens": "ffc0fe414cdc4a83" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_mathematics|5": { + "hashes": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "614054d17109a25d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_medicine|5": { + "hashes": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "1d633b3cc0524ba8", + "hash_cont_tokens": "1f88b00d41957d82" + }, + "truncated": 0, + "non-truncated": 692, + "padded": 692, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_physics|5": { + "hashes": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "5421d9a1af86cbd4", + "hash_cont_tokens": "f7b8097afc16a47c" + }, + "truncated": 0, + "non-truncated": 408, + "padded": 408, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-computer_security|5": { + "hashes": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "5e6b70ecb333cf18", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hashes": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + 
"hash_input_tokens": "c2ef11a87264ceed", + "hash_cont_tokens": "aa0e8bc655f2f641" + }, + "truncated": 0, + "non-truncated": 940, + "padded": 940, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-econometrics|5": { + "hashes": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "ecaccd912a4c3978", + "hash_cont_tokens": "bfb7e3c3c88313f1" + }, + "truncated": 0, + "non-truncated": 456, + "padded": 456, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hashes": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "1590c84291399be8", + "hash_cont_tokens": "2425a3f084a591ef" + }, + "truncated": 0, + "non-truncated": 580, + "padded": 580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hashes": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "3269597f715b0da1", + "hash_cont_tokens": "f52691aef15a407b" + }, + "truncated": 0, + "non-truncated": 1512, + "padded": 1512, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-formal_logic|5": { + "hashes": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "a2800d20f3ab8d7c", + "hash_cont_tokens": "f515d598d9c21263" + }, + "truncated": 0, + "non-truncated": 504, + "padded": 504, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-global_facts|5": { + "hashes": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "94ed44b3772505ad", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_biology|5": { + "hashes": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "24423acb928db768", + "hash_cont_tokens": "bd85a4156a3613ee" + }, + "truncated": 0, + "non-truncated": 1240, + "padded": 1240, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hashes": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "831ff35c474e5cef", + "hash_cont_tokens": "a95c97af1c14e068" + }, + "truncated": 0, + "non-truncated": 812, + "padded": 812, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hashes": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "8c34e0f2bda77358", + "hash_cont_tokens": "8abfedef914e33c9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hashes": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "f1f73dd687da18d7", + "hash_cont_tokens": "674fc454bdc5ac93" + }, + "truncated": 660, + "non-truncated": 
0, + "padded": 0, + "non-padded": 660, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_geography|5": { + "hashes": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "7c5547c7da5bc793", + "hash_cont_tokens": "03a5012b916274ea" + }, + "truncated": 0, + "non-truncated": 792, + "padded": 792, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hashes": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "f62991cb6a496b05", + "hash_cont_tokens": "a83effb8f76b7d7c" + }, + "truncated": 0, + "non-truncated": 772, + "padded": 772, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hashes": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "4cef2aff6e3d59ed", + "hash_cont_tokens": "c583432ad27fcfe0" + }, + "truncated": 0, + "non-truncated": 1560, + "padded": 1560, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hashes": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "6e2577ea4082ed2b", + "hash_cont_tokens": "24f5dc613660300b" + }, + "truncated": 0, + "non-truncated": 1080, + "padded": 1080, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hashes": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "c5fc9aeb1079c8e4", + "hash_cont_tokens": "f47f041de50333b9" + }, + "truncated": 0, + "non-truncated": 952, + "padded": 952, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_physics|5": { + "hashes": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "555fc385cffa84ca", + "hash_cont_tokens": "ba2efcd283e938cc" + }, + "truncated": 0, + "non-truncated": 604, + "padded": 604, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hashes": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "febd23cbf9973b7f", + "hash_cont_tokens": "942069cd363844d9" + }, + "truncated": 0, + "non-truncated": 2180, + "padded": 2180, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hashes": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "424b02981230ee83", + "hash_cont_tokens": "955ed42b6f7fa019" + }, + "truncated": 0, + "non-truncated": 864, + "padded": 864, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hashes": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "50c9ff438c85a69e", + "hash_cont_tokens": "cdd0b3dc06d933e5" + }, + "truncated": 816, + "non-truncated": 0, + "padded": 0, + "non-padded": 816, + "effective_few_shots": 5.0, + 
"num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hashes": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "054824cc474caef5", + "hash_cont_tokens": "9a864184946033ac" + }, + "truncated": 8, + "non-truncated": 940, + "padded": 940, + "non-padded": 8, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_aging|5": { + "hashes": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "541a75f071dcf579", + "hash_cont_tokens": "142a4a8a1138a214" + }, + "truncated": 0, + "non-truncated": 892, + "padded": 892, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_sexuality|5": { + "hashes": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "04269e5c5a257dd9", + "hash_cont_tokens": "bc54813e809b796d" + }, + "truncated": 0, + "non-truncated": 524, + "padded": 524, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-international_law|5": { + "hashes": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "d93ba9d9d38e4397", + "hash_cont_tokens": "dc45b45fcda18e5d" + }, + "truncated": 0, + "non-truncated": 484, + "padded": 484, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-jurisprudence|5": { + "hashes": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "9eeaccd2698b4f5a", + "hash_cont_tokens": "e3a8cd951b6e3469" + }, + "truncated": 0, + "non-truncated": 432, + "padded": 432, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hashes": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "b4f08f544f2b7576", + "hash_cont_tokens": "1e80dbd30f6453d5" + }, + "truncated": 0, + "non-truncated": 652, + "padded": 648, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-machine_learning|5": { + "hashes": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "900c2a51f1174b9f", + "hash_cont_tokens": "9b37da7777378ca9" + }, + "truncated": 0, + "non-truncated": 448, + "padded": 448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-management|5": { + "hashes": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "6b36efb4689c6eca", + "hash_cont_tokens": "a01d6d39a83c4597" + }, + "truncated": 0, + "non-truncated": 412, + "padded": 412, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-marketing|5": { + "hashes": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "2aaac78a0cfed47a", + "hash_cont_tokens": "6aeaed4d823c98aa" + }, + "truncated": 0, + "non-truncated": 936, + "padded": 936, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-medical_genetics|5": { + "hashes": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": 
"202139046daa118f", + "hash_input_tokens": "886ca823b41c094a", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-miscellaneous|5": { + "hashes": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "72fd71de7675e7d0", + "hash_cont_tokens": "9b0ab02a64603081" + }, + "truncated": 0, + "non-truncated": 3132, + "padded": 3132, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_disputes|5": { + "hashes": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "f3ca0dd8e7a1eb09", + "hash_cont_tokens": "8badf768f7b0467a" + }, + "truncated": 0, + "non-truncated": 1384, + "padded": 1354, + "non-padded": 30, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hashes": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "3e793631e951f23c", + "hash_cont_tokens": "32ae620376b2bbba" + }, + "truncated": 0, + "non-truncated": 3580, + "padded": 3580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-nutrition|5": { + "hashes": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "59753c2144ea93af", + "hash_cont_tokens": "3071def75bacc404" + }, + "truncated": 0, + "non-truncated": 1224, + "padded": 1224, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-philosophy|5": { + "hashes": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "bd8d3dbed15a8c34", + "hash_cont_tokens": "9f6ff69d23a48783" + }, + "truncated": 0, + "non-truncated": 1244, + "padded": 1244, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-prehistory|5": { + "hashes": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "3573cd87facbb7c5", + "hash_cont_tokens": "de469d2b981e32a3" + }, + "truncated": 0, + "non-truncated": 1296, + "padded": 1296, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_accounting|5": { + "hashes": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "17e721bc1a7cbb47", + "hash_cont_tokens": "c46f74d2dfc7b13b" + }, + "truncated": 0, + "non-truncated": 1128, + "padded": 1128, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_law|5": { + "hashes": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "9178e10bd0763ec4", + "hash_cont_tokens": "2e590029ef41fbcd" + }, + "truncated": 604, + "non-truncated": 5532, + "padded": 5524, + "non-padded": 612, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_medicine|5": { + "hashes": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "f5a22012a54f70ea", + "hash_cont_tokens": "fe35cfa9c6ca802e" + }, + "truncated": 0, + "non-truncated": 1088, + 
"padded": 1088, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_psychology|5": { + "hashes": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "0dfb73a8eb3f692c", + "hash_cont_tokens": "f020fbddf72c8652" + }, + "truncated": 0, + "non-truncated": 2448, + "padded": 2448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-public_relations|5": { + "hashes": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "1710c6ba4c9f3cbd", + "hash_cont_tokens": "568f585a259965c1" + }, + "truncated": 0, + "non-truncated": 440, + "padded": 440, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-security_studies|5": { + "hashes": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "d49711415961ced7", + "hash_cont_tokens": "cc6fd7cccd64cd5d" + }, + "truncated": 0, + "non-truncated": 980, + "padded": 980, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-sociology|5": { + "hashes": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "828999f7624cbe7e", + "hash_cont_tokens": "c3a3bdfd177eed5b" + }, + "truncated": 0, + "non-truncated": 804, + "padded": 804, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hashes": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "42054621e718dbee", + "hash_cont_tokens": "2568d0e8e36fa959" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-virology|5": { + "hashes": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "6c4f0aa4dc859c04", + "hash_cont_tokens": "926cf60b0891f374" + }, + "truncated": 0, + "non-truncated": 664, + "padded": 664, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-world_religions|5": { + "hashes": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "6c75d44e092ff24f", + "hash_cont_tokens": "c525a5de974c1ea3" + }, + "truncated": 0, + "non-truncated": 684, + "padded": 684, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|truthfulqa:mc|0": { + "hashes": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "2738d7ed7075faa7", + "hash_cont_tokens": "c014154380b74b9e" + }, + "truncated": 0, + "non-truncated": 9996, + "padded": 9996, + "non-padded": 0, + "effective_few_shots": 0.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "d84d18e9a963753d", + "hash_full_prompts": "12b540783521a8e6", + "hash_input_tokens": "6fecf578c508db6a", + "hash_cont_tokens": "fb1646e2bdd5fc38" + }, + "total_evaluation_time_secondes": "9844.128355503082", + "truncated": 2088, + "non-truncated": 108931, + "padded": 108834, + "non-padded": 2185, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git 
a/eval-results/TheBloke/robin-33B-v2-GPTQ/results_2023-11-07T16-48-37.652137.json b/eval-results/TheBloke/robin-33B-v2-GPTQ/results_2023-11-07T16-48-37.652137.json new file mode 100644 index 0000000000000000000000000000000000000000..78e23c1c373eeae673e08da2f1a1a0698fb7ed42 --- /dev/null +++ b/eval-results/TheBloke/robin-33B-v2-GPTQ/results_2023-11-07T16-48-37.652137.json @@ -0,0 +1,107 @@ +{ + "config_general": { + "lighteval_sha": "167773f1d5d1647c60dadc31c9e731ab7dbcbbad", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "", + "model_name": "TheBloke/robin-33B-v2-GPTQ", + "model_sha": "4c2588d65302e9ca634548ed81e8650fb2975686", + "model_dtype": "torch.float16", + "model_size": "15.83 GB" + }, + "results": { + "harness|drop|3": { + "em": 0.0019924496644295304, + "em_stderr": 0.000456667646266702, + "f1": 0.06134647651006716, + "f1_stderr": 0.0012678217129970543 + }, + "harness|gsm8k|5": { + "acc": 0.27748294162244125, + "acc_stderr": 0.012333447581047537 + }, + "harness|winogrande|5": { + "acc": 0.797947908445146, + "acc_stderr": 0.011285013754047434 + }, + "all": { + "em": 0.0019924496644295304, + "em_stderr": 0.000456667646266702, + "f1": 0.06134647651006716, + "f1_stderr": 0.0012678217129970543, + "acc": 0.5377154250337937, + "acc_stderr": 0.011809230667547486 + } + }, + "versions": { + "all": 0, + "harness|drop|3": 1, + "harness|gsm8k|5": 0, + "harness|winogrande|5": 0 + }, + "config_tasks": { + "harness|drop": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|drop|3": { + "hashes": { + "hash_examples": "1d27416e8324e9a3", + "hash_full_prompts": "a5513ff9a741b385", + "hash_input_tokens": "61b608e0b5ceed76", + "hash_cont_tokens": "df0dcbbf2386766b" + }, + "truncated": 1263, + "non_truncated": 8273, + "padded": 0, + "non_padded": 9536, + "effective_few_shots": 3.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "bda342e47b5099b2", + "hash_cont_tokens": "cc34e53b0619c18f" + }, + "truncated": 0, + "non_truncated": 1319, + "padded": 0, + "non_padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "c0bedf98cb040854", + "hash_cont_tokens": "f08975ad6f2d5864" + }, + "truncated": 0, + "non_truncated": 1267, + "padded": 2432, + "non_padded": 102, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "9b4d8993161e637d", + "hash_full_prompts": "08215e527b7e60a5", + "hash_input_tokens": "80afe720f936f8d2", + "hash_cont_tokens": "f243cbe53705068b" + }, + "truncated": 1263, + "non_truncated": 10859, + "padded": 2432, + "non_padded": 10957, + "num_truncated_few_shots": 0, + "total_evaluation_time_secondes": 0 + } +} \ No newline at end of file diff --git a/eval-results/TheBloke/robin-33B-v2-fp16/results_2023-07-31T16-41-32.452325.json b/eval-results/TheBloke/robin-33B-v2-fp16/results_2023-07-31T16-41-32.452325.json new file mode 100644 index 0000000000000000000000000000000000000000..f55b0970def8320c01f837b7caca4b10b0519b89 --- /dev/null +++ b/eval-results/TheBloke/robin-33B-v2-fp16/results_2023-07-31T16-41-32.452325.json @@ -0,0 +1,1365 @@ +{ + "results": { + 
"harness|arc:challenge|25": { + "acc": 0.5947098976109215, + "acc_stderr": 0.014346869060229321, + "acc_norm": 0.6237201365187713, + "acc_norm_stderr": 0.014157022555407156 + }, + "harness|hellaswag|10": { + "acc": 0.6331408086038638, + "acc_stderr": 0.004809626723626824, + "acc_norm": 0.8362875921131249, + "acc_norm_stderr": 0.0036925819391622834 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.36, + "acc_stderr": 0.04824181513244218, + "acc_norm": 0.36, + "acc_norm_stderr": 0.04824181513244218 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.5037037037037037, + "acc_stderr": 0.04319223625811331, + "acc_norm": 0.5037037037037037, + "acc_norm_stderr": 0.04319223625811331 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.5657894736842105, + "acc_stderr": 0.04033565667848319, + "acc_norm": 0.5657894736842105, + "acc_norm_stderr": 0.04033565667848319 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.55, + "acc_stderr": 0.049999999999999996, + "acc_norm": 0.55, + "acc_norm_stderr": 0.049999999999999996 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.5207547169811321, + "acc_stderr": 0.030746349975723463, + "acc_norm": 0.5207547169811321, + "acc_norm_stderr": 0.030746349975723463 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.5555555555555556, + "acc_stderr": 0.041553199555931467, + "acc_norm": 0.5555555555555556, + "acc_norm_stderr": 0.041553199555931467 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.36, + "acc_stderr": 0.04824181513244218, + "acc_norm": 0.36, + "acc_norm_stderr": 0.04824181513244218 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.47, + "acc_stderr": 0.050161355804659205, + "acc_norm": 0.47, + "acc_norm_stderr": 0.050161355804659205 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.36, + "acc_stderr": 0.04824181513244218, + "acc_norm": 0.36, + "acc_norm_stderr": 0.04824181513244218 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.48554913294797686, + "acc_stderr": 0.03810871630454764, + "acc_norm": 0.48554913294797686, + "acc_norm_stderr": 0.03810871630454764 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.3137254901960784, + "acc_stderr": 0.04617034827006718, + "acc_norm": 0.3137254901960784, + "acc_norm_stderr": 0.04617034827006718 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.65, + "acc_stderr": 0.0479372485441102, + "acc_norm": 0.65, + "acc_norm_stderr": 0.0479372485441102 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.451063829787234, + "acc_stderr": 0.032529096196131965, + "acc_norm": 0.451063829787234, + "acc_norm_stderr": 0.032529096196131965 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.3684210526315789, + "acc_stderr": 0.04537815354939392, + "acc_norm": 0.3684210526315789, + "acc_norm_stderr": 0.04537815354939392 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.47586206896551725, + "acc_stderr": 0.0416180850350153, + "acc_norm": 0.47586206896551725, + "acc_norm_stderr": 0.0416180850350153 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.328042328042328, + "acc_stderr": 0.024180497164376896, + "acc_norm": 0.328042328042328, + "acc_norm_stderr": 0.024180497164376896 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.31746031746031744, + "acc_stderr": 0.04163453031302859, + "acc_norm": 0.31746031746031744, + "acc_norm_stderr": 0.04163453031302859 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 
0.36, + "acc_stderr": 0.04824181513244218, + "acc_norm": 0.36, + "acc_norm_stderr": 0.04824181513244218 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.6161290322580645, + "acc_stderr": 0.027666182075539638, + "acc_norm": 0.6161290322580645, + "acc_norm_stderr": 0.027666182075539638 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.37438423645320196, + "acc_stderr": 0.03405155380561953, + "acc_norm": 0.37438423645320196, + "acc_norm_stderr": 0.03405155380561953 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.6, + "acc_stderr": 0.049236596391733084, + "acc_norm": 0.6, + "acc_norm_stderr": 0.049236596391733084 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.7333333333333333, + "acc_stderr": 0.03453131801885416, + "acc_norm": 0.7333333333333333, + "acc_norm_stderr": 0.03453131801885416 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.7222222222222222, + "acc_stderr": 0.03191178226713547, + "acc_norm": 0.7222222222222222, + "acc_norm_stderr": 0.03191178226713547 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.772020725388601, + "acc_stderr": 0.030276909945178267, + "acc_norm": 0.772020725388601, + "acc_norm_stderr": 0.030276909945178267 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.5025641025641026, + "acc_stderr": 0.025350672979412202, + "acc_norm": 0.5025641025641026, + "acc_norm_stderr": 0.025350672979412202 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.26296296296296295, + "acc_stderr": 0.026842057873833706, + "acc_norm": 0.26296296296296295, + "acc_norm_stderr": 0.026842057873833706 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.5630252100840336, + "acc_stderr": 0.03221943636566196, + "acc_norm": 0.5630252100840336, + "acc_norm_stderr": 0.03221943636566196 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.31788079470198677, + "acc_stderr": 0.038020397601079024, + "acc_norm": 0.31788079470198677, + "acc_norm_stderr": 0.038020397601079024 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.7339449541284404, + "acc_stderr": 0.018946022322225607, + "acc_norm": 0.7339449541284404, + "acc_norm_stderr": 0.018946022322225607 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.4212962962962963, + "acc_stderr": 0.03367462138896078, + "acc_norm": 0.4212962962962963, + "acc_norm_stderr": 0.03367462138896078 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.7598039215686274, + "acc_stderr": 0.02998373305591361, + "acc_norm": 0.7598039215686274, + "acc_norm_stderr": 0.02998373305591361 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.7763713080168776, + "acc_stderr": 0.027123298205229966, + "acc_norm": 0.7763713080168776, + "acc_norm_stderr": 0.027123298205229966 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.6188340807174888, + "acc_stderr": 0.03259625118416827, + "acc_norm": 0.6188340807174888, + "acc_norm_stderr": 0.03259625118416827 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.6106870229007634, + "acc_stderr": 0.04276486542814591, + "acc_norm": 0.6106870229007634, + "acc_norm_stderr": 0.04276486542814591 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.7024793388429752, + "acc_stderr": 0.04173349148083499, + "acc_norm": 0.7024793388429752, + "acc_norm_stderr": 0.04173349148083499 + }, + "harness|hendrycksTest-jurisprudence|5": { + 
"acc": 0.6666666666666666, + "acc_stderr": 0.04557239513497751, + "acc_norm": 0.6666666666666666, + "acc_norm_stderr": 0.04557239513497751 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.6932515337423313, + "acc_stderr": 0.036230899157241474, + "acc_norm": 0.6932515337423313, + "acc_norm_stderr": 0.036230899157241474 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.3392857142857143, + "acc_stderr": 0.04493949068613539, + "acc_norm": 0.3392857142857143, + "acc_norm_stderr": 0.04493949068613539 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.6310679611650486, + "acc_stderr": 0.0477761518115674, + "acc_norm": 0.6310679611650486, + "acc_norm_stderr": 0.0477761518115674 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.8418803418803419, + "acc_stderr": 0.023902325549560396, + "acc_norm": 0.8418803418803419, + "acc_norm_stderr": 0.023902325549560396 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.59, + "acc_stderr": 0.049431107042371025, + "acc_norm": 0.59, + "acc_norm_stderr": 0.049431107042371025 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.7305236270753512, + "acc_stderr": 0.01586624307321506, + "acc_norm": 0.7305236270753512, + "acc_norm_stderr": 0.01586624307321506 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.5953757225433526, + "acc_stderr": 0.02642481659400985, + "acc_norm": 0.5953757225433526, + "acc_norm_stderr": 0.02642481659400985 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.26033519553072626, + "acc_stderr": 0.014676252009319476, + "acc_norm": 0.26033519553072626, + "acc_norm_stderr": 0.014676252009319476 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.5882352941176471, + "acc_stderr": 0.02818059632825929, + "acc_norm": 0.5882352941176471, + "acc_norm_stderr": 0.02818059632825929 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.6302250803858521, + "acc_stderr": 0.027417996705630998, + "acc_norm": 0.6302250803858521, + "acc_norm_stderr": 0.027417996705630998 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.6080246913580247, + "acc_stderr": 0.027163686038271146, + "acc_norm": 0.6080246913580247, + "acc_norm_stderr": 0.027163686038271146 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.43617021276595747, + "acc_stderr": 0.02958345203628407, + "acc_norm": 0.43617021276595747, + "acc_norm_stderr": 0.02958345203628407 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.41264667535853977, + "acc_stderr": 0.012573836633799015, + "acc_norm": 0.41264667535853977, + "acc_norm_stderr": 0.012573836633799015 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.5404411764705882, + "acc_stderr": 0.03027332507734575, + "acc_norm": 0.5404411764705882, + "acc_norm_stderr": 0.03027332507734575 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.5571895424836601, + "acc_stderr": 0.020095083154577347, + "acc_norm": 0.5571895424836601, + "acc_norm_stderr": 0.020095083154577347 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.6, + "acc_stderr": 0.0469237132203465, + "acc_norm": 0.6, + "acc_norm_stderr": 0.0469237132203465 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.5714285714285714, + "acc_stderr": 0.031680911612338825, + "acc_norm": 0.5714285714285714, + "acc_norm_stderr": 0.031680911612338825 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.7661691542288557, + "acc_stderr": 0.02992941540834839, + "acc_norm": 0.7661691542288557, + "acc_norm_stderr": 
0.02992941540834839 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.8, + "acc_stderr": 0.04020151261036846, + "acc_norm": 0.8, + "acc_norm_stderr": 0.04020151261036846 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.4819277108433735, + "acc_stderr": 0.03889951252827217, + "acc_norm": 0.4819277108433735, + "acc_norm_stderr": 0.03889951252827217 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.7602339181286549, + "acc_stderr": 0.032744852119469564, + "acc_norm": 0.7602339181286549, + "acc_norm_stderr": 0.032744852119469564 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.3574051407588739, + "mc1_stderr": 0.016776599676729398, + "mc2": 0.5388029530988832, + "mc2_stderr": 0.014742138833066059 + }, + "all": { + "acc": 0.5493694357469432, + "acc_stderr": 0.03462857618448208, + "acc_norm": 0.5533043005336739, + "acc_norm_stderr": 0.03460642548466365, + "mc1": 0.3574051407588739, + "mc1_stderr": 0.016776599676729398, + "mc2": 0.5388029530988832, + "mc2_stderr": 0.014742138833066059 + } + }, + "versions": { + "harness|arc:challenge|25": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + 
"harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "all": 0 + }, + "config_general": { + "model_name": "TheBloke/robin-33B-v2-fp16", + "model_sha": "c0ed7d40c3e52379780638dac3bd1f69597b8e18", + "model_dtype": "torch.float16", + "lighteval_sha": "03c2fad20ff7f5334c33cfee459024b8d7e4a109", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null + }, + "config_tasks": { + "harness|arc:challenge": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + 
"harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task" + }, + "summary_tasks": { + "harness|arc:challenge|25": { + "hashes": { + "hash_examples": "17b0cae357c0259e", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "2b0e07d4cdd3b0fe", + "hash_cont_tokens": "52204555b6e39a6e" + }, + "truncated": 0, + "non-truncated": 4687, + "padded": 4687, + "non-padded": 0, + "effective_few_shots": 25.0, + "num_truncated_few_shots": 0 + }, + "harness|hellaswag|10": { + "hashes": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "578edd77107cb2c3", + "hash_cont_tokens": "25c49737526d9f80" + }, + "truncated": 0, + "non-truncated": 40168, + "padded": 40113, + "non-padded": 55, + "effective_few_shots": 10.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hashes": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "6a95a1511f8da075", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-anatomy|5": { + "hashes": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "24a78edc4d9a93aa", + "hash_cont_tokens": "f11971a765cb609f" + }, + "truncated": 0, + "non-truncated": 540, + "padded": 540, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-astronomy|5": { + "hashes": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "b11106668d6c0974", + "hash_cont_tokens": "ebed26cf74a85815" + }, + "truncated": 0, + "non-truncated": 608, + "padded": 608, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-business_ethics|5": { + "hashes": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "10180ba12a075cb0", + "hash_cont_tokens": "6898ac348a7ae442" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hashes": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "73351ef4968750a2", + "hash_cont_tokens": 
"34a058958a45af94" + }, + "truncated": 0, + "non-truncated": 1060, + "padded": 1060, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_biology|5": { + "hashes": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "a539150af234c668", + "hash_cont_tokens": "875cde3af7a0ee14" + }, + "truncated": 0, + "non-truncated": 576, + "padded": 576, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_chemistry|5": { + "hashes": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "52e12e5a43bcee35", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_computer_science|5": { + "hashes": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "d1f3721a5659f7ee", + "hash_cont_tokens": "da408cb12ab08288" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_mathematics|5": { + "hashes": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "f2d78f546b5595c2", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_medicine|5": { + "hashes": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "c9cc19179f63d1d6", + "hash_cont_tokens": "370a1a0ab68d15cd" + }, + "truncated": 0, + "non-truncated": 692, + "padded": 692, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_physics|5": { + "hashes": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "5046144e67e992e8", + "hash_cont_tokens": "f7b8097afc16a47c" + }, + "truncated": 0, + "non-truncated": 408, + "padded": 408, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-computer_security|5": { + "hashes": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "4b14581ba4fc06fc", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hashes": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "1ee52c413b5b4cc4", + "hash_cont_tokens": "aa0e8bc655f2f641" + }, + "truncated": 0, + "non-truncated": 940, + "padded": 940, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-econometrics|5": { + "hashes": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "2914077c4dd3090a", + "hash_cont_tokens": "80dea4d59245cf01" + }, + "truncated": 0, + "non-truncated": 456, + "padded": 456, + "non-padded": 0, + "effective_few_shots": 5.0, + 
"num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hashes": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "0f88a874342378de", + "hash_cont_tokens": "2425a3f084a591ef" + }, + "truncated": 0, + "non-truncated": 580, + "padded": 580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hashes": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "9889933f1dd02a23", + "hash_cont_tokens": "309bef1803097408" + }, + "truncated": 0, + "non-truncated": 1512, + "padded": 1512, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-formal_logic|5": { + "hashes": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "dc309a94c4bfdd2f", + "hash_cont_tokens": "5105a3bd1b39b785" + }, + "truncated": 0, + "non-truncated": 504, + "padded": 504, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-global_facts|5": { + "hashes": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "0801a0aebec3ba8c", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_biology|5": { + "hashes": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "5bc4aca8831d9c05", + "hash_cont_tokens": "205c5deee1581b02" + }, + "truncated": 0, + "non-truncated": 1240, + "padded": 1240, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hashes": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "b92bd6b06fc3464c", + "hash_cont_tokens": "272d28867e0ff046" + }, + "truncated": 0, + "non-truncated": 812, + "padded": 812, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hashes": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "a549346cde8165e9", + "hash_cont_tokens": "98b3bf311aa83f0d" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hashes": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "f1f73dd687da18d7", + "hash_cont_tokens": "674fc454bdc5ac93" + }, + "truncated": 660, + "non-truncated": 0, + "padded": 0, + "non-padded": 660, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_geography|5": { + "hashes": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "e7e9cf91f9d6a081", + "hash_cont_tokens": "03a5012b916274ea" + }, + "truncated": 0, + "non-truncated": 792, + "padded": 792, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hashes": { + 
"hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "a61a1670f854d9e1", + "hash_cont_tokens": "d9e66fc7c702b795" + }, + "truncated": 0, + "non-truncated": 772, + "padded": 772, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hashes": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "8a77cb7763f28110", + "hash_cont_tokens": "c583432ad27fcfe0" + }, + "truncated": 0, + "non-truncated": 1560, + "padded": 1560, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hashes": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "fcfcfae391f8faa1", + "hash_cont_tokens": "d4b1936084c060e0" + }, + "truncated": 0, + "non-truncated": 1080, + "padded": 1080, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hashes": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "a29454cc1feb23ef", + "hash_cont_tokens": "f47f041de50333b9" + }, + "truncated": 0, + "non-truncated": 952, + "padded": 952, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_physics|5": { + "hashes": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "b6734a25556d75dc", + "hash_cont_tokens": "2bf9921a39e901d9" + }, + "truncated": 0, + "non-truncated": 604, + "padded": 604, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hashes": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "5720438e29473426", + "hash_cont_tokens": "cab8b16be9576360" + }, + "truncated": 0, + "non-truncated": 2180, + "padded": 2180, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hashes": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "486321d5858de240", + "hash_cont_tokens": "1c34fbe5a59f1ed1" + }, + "truncated": 0, + "non-truncated": 864, + "padded": 864, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hashes": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "50c9ff438c85a69e", + "hash_cont_tokens": "cdd0b3dc06d933e5" + }, + "truncated": 816, + "non-truncated": 0, + "padded": 0, + "non-padded": 816, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hashes": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "473919e64d1b8c80", + "hash_cont_tokens": "ebd714885a59ef55" + }, + "truncated": 8, + "non-truncated": 940, + "padded": 940, + "non-padded": 8, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_aging|5": { + "hashes": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": 
"47a65c81fd7ed010", + "hash_cont_tokens": "142a4a8a1138a214" + }, + "truncated": 0, + "non-truncated": 892, + "padded": 892, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_sexuality|5": { + "hashes": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "aedfcd41cbd2fcc9", + "hash_cont_tokens": "bc54813e809b796d" + }, + "truncated": 0, + "non-truncated": 524, + "padded": 524, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-international_law|5": { + "hashes": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "ed5f2414144d7b72", + "hash_cont_tokens": "aac52fa6a519223b" + }, + "truncated": 0, + "non-truncated": 484, + "padded": 484, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-jurisprudence|5": { + "hashes": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "692eaacb5b747264", + "hash_cont_tokens": "e3a8cd951b6e3469" + }, + "truncated": 0, + "non-truncated": 432, + "padded": 432, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hashes": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "2cbce4edca937588", + "hash_cont_tokens": "697179a0dd47c5c0" + }, + "truncated": 0, + "non-truncated": 652, + "padded": 648, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-machine_learning|5": { + "hashes": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "c2f38b19bab1aa2c", + "hash_cont_tokens": "9b19898e3ecb527f" + }, + "truncated": 0, + "non-truncated": 448, + "padded": 448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-management|5": { + "hashes": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "fde277bc547bc3d8", + "hash_cont_tokens": "a01d6d39a83c4597" + }, + "truncated": 0, + "non-truncated": 412, + "padded": 412, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-marketing|5": { + "hashes": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "87b232bbebce39db", + "hash_cont_tokens": "6aeaed4d823c98aa" + }, + "truncated": 0, + "non-truncated": 936, + "padded": 936, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-medical_genetics|5": { + "hashes": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "58c21af9da3e126e", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-miscellaneous|5": { + "hashes": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "d1f5c770d368e9c6", + "hash_cont_tokens": "9b0ab02a64603081" + }, + "truncated": 0, + "non-truncated": 3132, + "padded": 3132, + "non-padded": 0, + "effective_few_shots": 5.0, + 
"num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_disputes|5": { + "hashes": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "98d6db15a50aaa8e", + "hash_cont_tokens": "1e30d7dedc7588c0" + }, + "truncated": 0, + "non-truncated": 1384, + "padded": 1354, + "non-padded": 30, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hashes": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "2aabd8c7337502f8", + "hash_cont_tokens": "ceee291786cbb123" + }, + "truncated": 0, + "non-truncated": 3580, + "padded": 3580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-nutrition|5": { + "hashes": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "17f8c8f2d4a0a9b1", + "hash_cont_tokens": "484df4c25a5460bd" + }, + "truncated": 0, + "non-truncated": 1224, + "padded": 1224, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-philosophy|5": { + "hashes": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "dfc6df491d991966", + "hash_cont_tokens": "9f6ff69d23a48783" + }, + "truncated": 0, + "non-truncated": 1244, + "padded": 1244, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-prehistory|5": { + "hashes": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "cffe8139e00da9dd", + "hash_cont_tokens": "85a9de6c685b7035" + }, + "truncated": 0, + "non-truncated": 1296, + "padded": 1296, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_accounting|5": { + "hashes": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "4a69ed6ee55918fb", + "hash_cont_tokens": "ad7b5a040535bdcf" + }, + "truncated": 0, + "non-truncated": 1128, + "padded": 1128, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_law|5": { + "hashes": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "6cc713f12b5890de", + "hash_cont_tokens": "2e590029ef41fbcd" + }, + "truncated": 604, + "non-truncated": 5532, + "padded": 5524, + "non-padded": 612, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_medicine|5": { + "hashes": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "b4044fc92756c377", + "hash_cont_tokens": "0b7b5aaef574dc78" + }, + "truncated": 0, + "non-truncated": 1088, + "padded": 1088, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_psychology|5": { + "hashes": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "b019784da8db089a", + "hash_cont_tokens": "63a651778e8d72d2" + }, + "truncated": 0, + "non-truncated": 2448, + "padded": 2448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-public_relations|5": { + "hashes": { + "hash_examples": "0d25072e1761652a", + 
"hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "f47f37c7c9bfc601", + "hash_cont_tokens": "841583ab707b25d7" + }, + "truncated": 0, + "non-truncated": 440, + "padded": 440, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-security_studies|5": { + "hashes": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "4d282718d6142410", + "hash_cont_tokens": "9c2c01d3214f66b8" + }, + "truncated": 0, + "non-truncated": 980, + "padded": 980, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-sociology|5": { + "hashes": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "fbc6026e500537bc", + "hash_cont_tokens": "c3a3bdfd177eed5b" + }, + "truncated": 0, + "non-truncated": 804, + "padded": 804, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hashes": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "150dd1ff81ff642e", + "hash_cont_tokens": "96353c5969a9028a" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-virology|5": { + "hashes": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "fcbac3e735545969", + "hash_cont_tokens": "a1f8901800ac9b68" + }, + "truncated": 0, + "non-truncated": 664, + "padded": 664, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-world_religions|5": { + "hashes": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "ffc962a38441ef13", + "hash_cont_tokens": "08c0be345e5f1c12" + }, + "truncated": 0, + "non-truncated": 684, + "padded": 684, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|truthfulqa:mc|0": { + "hashes": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "9ffb65d225ae550f", + "hash_cont_tokens": "16c760a491c6f26e" + }, + "truncated": 0, + "non-truncated": 9996, + "padded": 9996, + "non-padded": 0, + "effective_few_shots": 0.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "d84d18e9a963753d", + "hash_full_prompts": "12b540783521a8e6", + "hash_input_tokens": "1c61d6705b299f5c", + "hash_cont_tokens": "868d6f1055fbd51d" + }, + "total_evaluation_time_secondes": "8250.808156967163", + "truncated": 2088, + "non-truncated": 108931, + "padded": 108834, + "non-padded": 2185, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/TheBloke/robin-33B-v2-fp16/results_2023-08-12T13-26-34.042597.json b/eval-results/TheBloke/robin-33B-v2-fp16/results_2023-08-12T13-26-34.042597.json new file mode 100644 index 0000000000000000000000000000000000000000..49eec7789f7ddf79ea4a3049cdcf96db644570ac --- /dev/null +++ b/eval-results/TheBloke/robin-33B-v2-fp16/results_2023-08-12T13-26-34.042597.json @@ -0,0 +1,1365 @@ +{ + "results": { + "harness|arc:challenge|25": { + "acc": 0.5947098976109215, + "acc_stderr": 0.014346869060229321, + "acc_norm": 0.6237201365187713, + "acc_norm_stderr": 0.014157022555407156 + }, + 
"harness|hellaswag|10": { + "acc": 0.6331408086038638, + "acc_stderr": 0.004809626723626824, + "acc_norm": 0.8362875921131249, + "acc_norm_stderr": 0.0036925819391622834 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.36, + "acc_stderr": 0.04824181513244218, + "acc_norm": 0.36, + "acc_norm_stderr": 0.04824181513244218 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.5037037037037037, + "acc_stderr": 0.04319223625811331, + "acc_norm": 0.5037037037037037, + "acc_norm_stderr": 0.04319223625811331 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.5657894736842105, + "acc_stderr": 0.04033565667848319, + "acc_norm": 0.5657894736842105, + "acc_norm_stderr": 0.04033565667848319 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.55, + "acc_stderr": 0.049999999999999996, + "acc_norm": 0.55, + "acc_norm_stderr": 0.049999999999999996 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.5207547169811321, + "acc_stderr": 0.030746349975723463, + "acc_norm": 0.5207547169811321, + "acc_norm_stderr": 0.030746349975723463 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.5555555555555556, + "acc_stderr": 0.041553199555931467, + "acc_norm": 0.5555555555555556, + "acc_norm_stderr": 0.041553199555931467 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.36, + "acc_stderr": 0.04824181513244218, + "acc_norm": 0.36, + "acc_norm_stderr": 0.04824181513244218 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.47, + "acc_stderr": 0.050161355804659205, + "acc_norm": 0.47, + "acc_norm_stderr": 0.050161355804659205 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.36, + "acc_stderr": 0.04824181513244218, + "acc_norm": 0.36, + "acc_norm_stderr": 0.04824181513244218 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.48554913294797686, + "acc_stderr": 0.03810871630454764, + "acc_norm": 0.48554913294797686, + "acc_norm_stderr": 0.03810871630454764 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.3137254901960784, + "acc_stderr": 0.04617034827006718, + "acc_norm": 0.3137254901960784, + "acc_norm_stderr": 0.04617034827006718 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.65, + "acc_stderr": 0.0479372485441102, + "acc_norm": 0.65, + "acc_norm_stderr": 0.0479372485441102 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.451063829787234, + "acc_stderr": 0.032529096196131965, + "acc_norm": 0.451063829787234, + "acc_norm_stderr": 0.032529096196131965 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.3684210526315789, + "acc_stderr": 0.04537815354939392, + "acc_norm": 0.3684210526315789, + "acc_norm_stderr": 0.04537815354939392 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.47586206896551725, + "acc_stderr": 0.0416180850350153, + "acc_norm": 0.47586206896551725, + "acc_norm_stderr": 0.0416180850350153 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.328042328042328, + "acc_stderr": 0.024180497164376896, + "acc_norm": 0.328042328042328, + "acc_norm_stderr": 0.024180497164376896 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.31746031746031744, + "acc_stderr": 0.04163453031302859, + "acc_norm": 0.31746031746031744, + "acc_norm_stderr": 0.04163453031302859 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.36, + "acc_stderr": 0.04824181513244218, + "acc_norm": 0.36, + "acc_norm_stderr": 0.04824181513244218 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 
0.6161290322580645, + "acc_stderr": 0.027666182075539638, + "acc_norm": 0.6161290322580645, + "acc_norm_stderr": 0.027666182075539638 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.37438423645320196, + "acc_stderr": 0.03405155380561953, + "acc_norm": 0.37438423645320196, + "acc_norm_stderr": 0.03405155380561953 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.6, + "acc_stderr": 0.049236596391733084, + "acc_norm": 0.6, + "acc_norm_stderr": 0.049236596391733084 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.7333333333333333, + "acc_stderr": 0.03453131801885416, + "acc_norm": 0.7333333333333333, + "acc_norm_stderr": 0.03453131801885416 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.7222222222222222, + "acc_stderr": 0.03191178226713547, + "acc_norm": 0.7222222222222222, + "acc_norm_stderr": 0.03191178226713547 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.772020725388601, + "acc_stderr": 0.030276909945178267, + "acc_norm": 0.772020725388601, + "acc_norm_stderr": 0.030276909945178267 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.5025641025641026, + "acc_stderr": 0.025350672979412202, + "acc_norm": 0.5025641025641026, + "acc_norm_stderr": 0.025350672979412202 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.26296296296296295, + "acc_stderr": 0.026842057873833706, + "acc_norm": 0.26296296296296295, + "acc_norm_stderr": 0.026842057873833706 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.5630252100840336, + "acc_stderr": 0.03221943636566196, + "acc_norm": 0.5630252100840336, + "acc_norm_stderr": 0.03221943636566196 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.31788079470198677, + "acc_stderr": 0.038020397601079024, + "acc_norm": 0.31788079470198677, + "acc_norm_stderr": 0.038020397601079024 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.7339449541284404, + "acc_stderr": 0.018946022322225607, + "acc_norm": 0.7339449541284404, + "acc_norm_stderr": 0.018946022322225607 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.4212962962962963, + "acc_stderr": 0.03367462138896078, + "acc_norm": 0.4212962962962963, + "acc_norm_stderr": 0.03367462138896078 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.7598039215686274, + "acc_stderr": 0.02998373305591361, + "acc_norm": 0.7598039215686274, + "acc_norm_stderr": 0.02998373305591361 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.7763713080168776, + "acc_stderr": 0.027123298205229966, + "acc_norm": 0.7763713080168776, + "acc_norm_stderr": 0.027123298205229966 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.6188340807174888, + "acc_stderr": 0.03259625118416827, + "acc_norm": 0.6188340807174888, + "acc_norm_stderr": 0.03259625118416827 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.6106870229007634, + "acc_stderr": 0.04276486542814591, + "acc_norm": 0.6106870229007634, + "acc_norm_stderr": 0.04276486542814591 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.7024793388429752, + "acc_stderr": 0.04173349148083499, + "acc_norm": 0.7024793388429752, + "acc_norm_stderr": 0.04173349148083499 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.6666666666666666, + "acc_stderr": 0.04557239513497751, + "acc_norm": 0.6666666666666666, + "acc_norm_stderr": 0.04557239513497751 + }, + 
"harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.6932515337423313, + "acc_stderr": 0.036230899157241474, + "acc_norm": 0.6932515337423313, + "acc_norm_stderr": 0.036230899157241474 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.3392857142857143, + "acc_stderr": 0.04493949068613539, + "acc_norm": 0.3392857142857143, + "acc_norm_stderr": 0.04493949068613539 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.6310679611650486, + "acc_stderr": 0.0477761518115674, + "acc_norm": 0.6310679611650486, + "acc_norm_stderr": 0.0477761518115674 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.8418803418803419, + "acc_stderr": 0.023902325549560396, + "acc_norm": 0.8418803418803419, + "acc_norm_stderr": 0.023902325549560396 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.59, + "acc_stderr": 0.049431107042371025, + "acc_norm": 0.59, + "acc_norm_stderr": 0.049431107042371025 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.7305236270753512, + "acc_stderr": 0.01586624307321506, + "acc_norm": 0.7305236270753512, + "acc_norm_stderr": 0.01586624307321506 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.5953757225433526, + "acc_stderr": 0.02642481659400985, + "acc_norm": 0.5953757225433526, + "acc_norm_stderr": 0.02642481659400985 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.26033519553072626, + "acc_stderr": 0.014676252009319476, + "acc_norm": 0.26033519553072626, + "acc_norm_stderr": 0.014676252009319476 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.5882352941176471, + "acc_stderr": 0.02818059632825929, + "acc_norm": 0.5882352941176471, + "acc_norm_stderr": 0.02818059632825929 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.6302250803858521, + "acc_stderr": 0.027417996705630998, + "acc_norm": 0.6302250803858521, + "acc_norm_stderr": 0.027417996705630998 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.6080246913580247, + "acc_stderr": 0.027163686038271146, + "acc_norm": 0.6080246913580247, + "acc_norm_stderr": 0.027163686038271146 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.43617021276595747, + "acc_stderr": 0.02958345203628407, + "acc_norm": 0.43617021276595747, + "acc_norm_stderr": 0.02958345203628407 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.41264667535853977, + "acc_stderr": 0.012573836633799015, + "acc_norm": 0.41264667535853977, + "acc_norm_stderr": 0.012573836633799015 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.5404411764705882, + "acc_stderr": 0.03027332507734575, + "acc_norm": 0.5404411764705882, + "acc_norm_stderr": 0.03027332507734575 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.5571895424836601, + "acc_stderr": 0.020095083154577347, + "acc_norm": 0.5571895424836601, + "acc_norm_stderr": 0.020095083154577347 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.6, + "acc_stderr": 0.0469237132203465, + "acc_norm": 0.6, + "acc_norm_stderr": 0.0469237132203465 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.5714285714285714, + "acc_stderr": 0.031680911612338825, + "acc_norm": 0.5714285714285714, + "acc_norm_stderr": 0.031680911612338825 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.7661691542288557, + "acc_stderr": 0.02992941540834839, + "acc_norm": 0.7661691542288557, + "acc_norm_stderr": 0.02992941540834839 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.8, + "acc_stderr": 0.04020151261036846, + "acc_norm": 0.8, + 
"acc_norm_stderr": 0.04020151261036846 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.4819277108433735, + "acc_stderr": 0.03889951252827217, + "acc_norm": 0.4819277108433735, + "acc_norm_stderr": 0.03889951252827217 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.7602339181286549, + "acc_stderr": 0.032744852119469564, + "acc_norm": 0.7602339181286549, + "acc_norm_stderr": 0.032744852119469564 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.3574051407588739, + "mc1_stderr": 0.016776599676729398, + "mc2": 0.5388029530988832, + "mc2_stderr": 0.014742138833066059 + }, + "all": { + "acc": 0.5493694357469432, + "acc_stderr": 0.03462857618448208, + "acc_norm": 0.5533043005336739, + "acc_norm_stderr": 0.03460642548466365, + "mc1": 0.3574051407588739, + "mc1_stderr": 0.016776599676729398, + "mc2": 0.5388029530988832, + "mc2_stderr": 0.014742138833066059 + } + }, + "versions": { + "harness|arc:challenge|25": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + 
"harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "all": 0 + }, + "config_general": { + "model_name": "TheBloke/robin-33B-v2-fp16", + "model_sha": "c0ed7d40c3e52379780638dac3bd1f69597b8e18", + "model_dtype": "torch.float16", + "lighteval_sha": "efe93333f9f25e7d48cc67a6bf362e6d576f727b", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null + }, + "config_tasks": { + "harness|arc:challenge": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness 
task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task" + }, + "summary_tasks": { + "harness|arc:challenge|25": { + "hashes": { + "hash_examples": "17b0cae357c0259e", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "2b0e07d4cdd3b0fe", + "hash_cont_tokens": "52204555b6e39a6e" + }, + "truncated": 0, + "non-truncated": 4687, + "padded": 4687, + "non-padded": 0, + "effective_few_shots": 25.0, + "num_truncated_few_shots": 0 + }, + "harness|hellaswag|10": { + "hashes": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "578edd77107cb2c3", + "hash_cont_tokens": "25c49737526d9f80" + }, + "truncated": 0, + "non-truncated": 40168, + "padded": 40113, + "non-padded": 55, + "effective_few_shots": 10.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hashes": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "6a95a1511f8da075", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-anatomy|5": { + "hashes": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "24a78edc4d9a93aa", + "hash_cont_tokens": "f11971a765cb609f" + }, + "truncated": 0, + "non-truncated": 540, + "padded": 540, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-astronomy|5": { + "hashes": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "b11106668d6c0974", + "hash_cont_tokens": "ebed26cf74a85815" + }, + "truncated": 0, + "non-truncated": 608, + "padded": 608, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-business_ethics|5": { + "hashes": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "10180ba12a075cb0", + "hash_cont_tokens": "6898ac348a7ae442" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hashes": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "73351ef4968750a2", + "hash_cont_tokens": "34a058958a45af94" + }, + "truncated": 0, + "non-truncated": 1060, + "padded": 1060, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + 
"harness|hendrycksTest-college_biology|5": { + "hashes": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "a539150af234c668", + "hash_cont_tokens": "875cde3af7a0ee14" + }, + "truncated": 0, + "non-truncated": 576, + "padded": 576, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_chemistry|5": { + "hashes": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "52e12e5a43bcee35", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_computer_science|5": { + "hashes": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "d1f3721a5659f7ee", + "hash_cont_tokens": "da408cb12ab08288" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_mathematics|5": { + "hashes": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "f2d78f546b5595c2", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_medicine|5": { + "hashes": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "c9cc19179f63d1d6", + "hash_cont_tokens": "370a1a0ab68d15cd" + }, + "truncated": 0, + "non-truncated": 692, + "padded": 692, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_physics|5": { + "hashes": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "5046144e67e992e8", + "hash_cont_tokens": "f7b8097afc16a47c" + }, + "truncated": 0, + "non-truncated": 408, + "padded": 408, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-computer_security|5": { + "hashes": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "4b14581ba4fc06fc", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hashes": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "1ee52c413b5b4cc4", + "hash_cont_tokens": "aa0e8bc655f2f641" + }, + "truncated": 0, + "non-truncated": 940, + "padded": 940, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-econometrics|5": { + "hashes": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "2914077c4dd3090a", + "hash_cont_tokens": "80dea4d59245cf01" + }, + "truncated": 0, + "non-truncated": 456, + "padded": 456, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hashes": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + 
"hash_input_tokens": "0f88a874342378de", + "hash_cont_tokens": "2425a3f084a591ef" + }, + "truncated": 0, + "non-truncated": 580, + "padded": 580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hashes": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "9889933f1dd02a23", + "hash_cont_tokens": "309bef1803097408" + }, + "truncated": 0, + "non-truncated": 1512, + "padded": 1512, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-formal_logic|5": { + "hashes": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "dc309a94c4bfdd2f", + "hash_cont_tokens": "5105a3bd1b39b785" + }, + "truncated": 0, + "non-truncated": 504, + "padded": 504, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-global_facts|5": { + "hashes": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "0801a0aebec3ba8c", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_biology|5": { + "hashes": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "5bc4aca8831d9c05", + "hash_cont_tokens": "205c5deee1581b02" + }, + "truncated": 0, + "non-truncated": 1240, + "padded": 1240, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hashes": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "b92bd6b06fc3464c", + "hash_cont_tokens": "272d28867e0ff046" + }, + "truncated": 0, + "non-truncated": 812, + "padded": 812, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hashes": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "a549346cde8165e9", + "hash_cont_tokens": "98b3bf311aa83f0d" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hashes": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "f1f73dd687da18d7", + "hash_cont_tokens": "674fc454bdc5ac93" + }, + "truncated": 660, + "non-truncated": 0, + "padded": 0, + "non-padded": 660, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_geography|5": { + "hashes": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "e7e9cf91f9d6a081", + "hash_cont_tokens": "03a5012b916274ea" + }, + "truncated": 0, + "non-truncated": 792, + "padded": 792, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hashes": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "a61a1670f854d9e1", + "hash_cont_tokens": "d9e66fc7c702b795" + }, + "truncated": 
0, + "non-truncated": 772, + "padded": 772, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hashes": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "8a77cb7763f28110", + "hash_cont_tokens": "c583432ad27fcfe0" + }, + "truncated": 0, + "non-truncated": 1560, + "padded": 1560, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hashes": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "fcfcfae391f8faa1", + "hash_cont_tokens": "d4b1936084c060e0" + }, + "truncated": 0, + "non-truncated": 1080, + "padded": 1080, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hashes": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "a29454cc1feb23ef", + "hash_cont_tokens": "f47f041de50333b9" + }, + "truncated": 0, + "non-truncated": 952, + "padded": 952, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_physics|5": { + "hashes": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "b6734a25556d75dc", + "hash_cont_tokens": "2bf9921a39e901d9" + }, + "truncated": 0, + "non-truncated": 604, + "padded": 604, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hashes": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "5720438e29473426", + "hash_cont_tokens": "cab8b16be9576360" + }, + "truncated": 0, + "non-truncated": 2180, + "padded": 2180, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hashes": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "486321d5858de240", + "hash_cont_tokens": "1c34fbe5a59f1ed1" + }, + "truncated": 0, + "non-truncated": 864, + "padded": 864, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hashes": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "50c9ff438c85a69e", + "hash_cont_tokens": "cdd0b3dc06d933e5" + }, + "truncated": 816, + "non-truncated": 0, + "padded": 0, + "non-padded": 816, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hashes": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "473919e64d1b8c80", + "hash_cont_tokens": "ebd714885a59ef55" + }, + "truncated": 8, + "non-truncated": 940, + "padded": 940, + "non-padded": 8, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_aging|5": { + "hashes": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "47a65c81fd7ed010", + "hash_cont_tokens": "142a4a8a1138a214" + }, + "truncated": 0, + "non-truncated": 892, + "padded": 892, + "non-padded": 0, + "effective_few_shots": 5.0, + 
"num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_sexuality|5": { + "hashes": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "aedfcd41cbd2fcc9", + "hash_cont_tokens": "bc54813e809b796d" + }, + "truncated": 0, + "non-truncated": 524, + "padded": 524, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-international_law|5": { + "hashes": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "ed5f2414144d7b72", + "hash_cont_tokens": "aac52fa6a519223b" + }, + "truncated": 0, + "non-truncated": 484, + "padded": 484, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-jurisprudence|5": { + "hashes": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "692eaacb5b747264", + "hash_cont_tokens": "e3a8cd951b6e3469" + }, + "truncated": 0, + "non-truncated": 432, + "padded": 432, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hashes": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "2cbce4edca937588", + "hash_cont_tokens": "697179a0dd47c5c0" + }, + "truncated": 0, + "non-truncated": 652, + "padded": 648, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-machine_learning|5": { + "hashes": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "c2f38b19bab1aa2c", + "hash_cont_tokens": "9b19898e3ecb527f" + }, + "truncated": 0, + "non-truncated": 448, + "padded": 448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-management|5": { + "hashes": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "fde277bc547bc3d8", + "hash_cont_tokens": "a01d6d39a83c4597" + }, + "truncated": 0, + "non-truncated": 412, + "padded": 412, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-marketing|5": { + "hashes": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "87b232bbebce39db", + "hash_cont_tokens": "6aeaed4d823c98aa" + }, + "truncated": 0, + "non-truncated": 936, + "padded": 936, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-medical_genetics|5": { + "hashes": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "58c21af9da3e126e", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-miscellaneous|5": { + "hashes": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "d1f5c770d368e9c6", + "hash_cont_tokens": "9b0ab02a64603081" + }, + "truncated": 0, + "non-truncated": 3132, + "padded": 3132, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_disputes|5": { + "hashes": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + 
"hash_input_tokens": "98d6db15a50aaa8e", + "hash_cont_tokens": "1e30d7dedc7588c0" + }, + "truncated": 0, + "non-truncated": 1384, + "padded": 1354, + "non-padded": 30, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hashes": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "2aabd8c7337502f8", + "hash_cont_tokens": "ceee291786cbb123" + }, + "truncated": 0, + "non-truncated": 3580, + "padded": 3580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-nutrition|5": { + "hashes": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "17f8c8f2d4a0a9b1", + "hash_cont_tokens": "484df4c25a5460bd" + }, + "truncated": 0, + "non-truncated": 1224, + "padded": 1224, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-philosophy|5": { + "hashes": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "dfc6df491d991966", + "hash_cont_tokens": "9f6ff69d23a48783" + }, + "truncated": 0, + "non-truncated": 1244, + "padded": 1244, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-prehistory|5": { + "hashes": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "cffe8139e00da9dd", + "hash_cont_tokens": "85a9de6c685b7035" + }, + "truncated": 0, + "non-truncated": 1296, + "padded": 1296, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_accounting|5": { + "hashes": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "4a69ed6ee55918fb", + "hash_cont_tokens": "ad7b5a040535bdcf" + }, + "truncated": 0, + "non-truncated": 1128, + "padded": 1128, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_law|5": { + "hashes": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "6cc713f12b5890de", + "hash_cont_tokens": "2e590029ef41fbcd" + }, + "truncated": 604, + "non-truncated": 5532, + "padded": 5524, + "non-padded": 612, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_medicine|5": { + "hashes": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "b4044fc92756c377", + "hash_cont_tokens": "0b7b5aaef574dc78" + }, + "truncated": 0, + "non-truncated": 1088, + "padded": 1088, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_psychology|5": { + "hashes": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "b019784da8db089a", + "hash_cont_tokens": "63a651778e8d72d2" + }, + "truncated": 0, + "non-truncated": 2448, + "padded": 2448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-public_relations|5": { + "hashes": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "f47f37c7c9bfc601", + "hash_cont_tokens": "841583ab707b25d7" + }, + "truncated": 0, + "non-truncated": 440, + 
"padded": 440, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-security_studies|5": { + "hashes": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "4d282718d6142410", + "hash_cont_tokens": "9c2c01d3214f66b8" + }, + "truncated": 0, + "non-truncated": 980, + "padded": 980, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-sociology|5": { + "hashes": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "fbc6026e500537bc", + "hash_cont_tokens": "c3a3bdfd177eed5b" + }, + "truncated": 0, + "non-truncated": 804, + "padded": 804, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hashes": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "150dd1ff81ff642e", + "hash_cont_tokens": "96353c5969a9028a" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-virology|5": { + "hashes": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "fcbac3e735545969", + "hash_cont_tokens": "a1f8901800ac9b68" + }, + "truncated": 0, + "non-truncated": 664, + "padded": 664, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-world_religions|5": { + "hashes": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "ffc962a38441ef13", + "hash_cont_tokens": "08c0be345e5f1c12" + }, + "truncated": 0, + "non-truncated": 684, + "padded": 684, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|truthfulqa:mc|0": { + "hashes": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "9ffb65d225ae550f", + "hash_cont_tokens": "16c760a491c6f26e" + }, + "truncated": 0, + "non-truncated": 9996, + "padded": 9996, + "non-padded": 0, + "effective_few_shots": 0.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "d84d18e9a963753d", + "hash_full_prompts": "12b540783521a8e6", + "hash_input_tokens": "1c61d6705b299f5c", + "hash_cont_tokens": "868d6f1055fbd51d" + }, + "total_evaluation_time_secondes": "9191.256449699402", + "truncated": 2088, + "non-truncated": 108931, + "padded": 108834, + "non-padded": 2185, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/TheBloke/robin-65b-v2-fp16/results_2023-08-17T22-09-59.169977.json b/eval-results/TheBloke/robin-65b-v2-fp16/results_2023-08-17T22-09-59.169977.json new file mode 100644 index 0000000000000000000000000000000000000000..473279983c6fd2ff00d5ceace9f290949caeadba --- /dev/null +++ b/eval-results/TheBloke/robin-65b-v2-fp16/results_2023-08-17T22-09-59.169977.json @@ -0,0 +1,1365 @@ +{ + "results": { + "harness|arc:challenge|25": { + "acc": 0.5870307167235495, + "acc_stderr": 0.014388344935398326, + "acc_norm": 0.6194539249146758, + "acc_norm_stderr": 0.014188277712349808 + }, + "harness|hellaswag|10": { + "acc": 0.6395140410276837, + "acc_stderr": 0.004791601975612765, + "acc_norm": 0.8460466042620992, + "acc_norm_stderr": 0.0036016648387189004 + }, + 
"harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.3, + "acc_stderr": 0.046056618647183814, + "acc_norm": 0.3, + "acc_norm_stderr": 0.046056618647183814 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.5333333333333333, + "acc_stderr": 0.043097329010363554, + "acc_norm": 0.5333333333333333, + "acc_norm_stderr": 0.043097329010363554 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.6973684210526315, + "acc_stderr": 0.037385206761196686, + "acc_norm": 0.6973684210526315, + "acc_norm_stderr": 0.037385206761196686 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.57, + "acc_stderr": 0.04975698519562428, + "acc_norm": 0.57, + "acc_norm_stderr": 0.04975698519562428 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.660377358490566, + "acc_stderr": 0.029146904747798325, + "acc_norm": 0.660377358490566, + "acc_norm_stderr": 0.029146904747798325 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.7152777777777778, + "acc_stderr": 0.03773809990686934, + "acc_norm": 0.7152777777777778, + "acc_norm_stderr": 0.03773809990686934 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.49, + "acc_stderr": 0.05024183937956912, + "acc_norm": 0.49, + "acc_norm_stderr": 0.05024183937956912 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.48, + "acc_stderr": 0.050211673156867795, + "acc_norm": 0.48, + "acc_norm_stderr": 0.050211673156867795 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.34, + "acc_stderr": 0.04760952285695235, + "acc_norm": 0.34, + "acc_norm_stderr": 0.04760952285695235 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.5606936416184971, + "acc_stderr": 0.037842719328874674, + "acc_norm": 0.5606936416184971, + "acc_norm_stderr": 0.037842719328874674 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.3235294117647059, + "acc_stderr": 0.046550104113196177, + "acc_norm": 0.3235294117647059, + "acc_norm_stderr": 0.046550104113196177 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.81, + "acc_stderr": 0.03942772444036624, + "acc_norm": 0.81, + "acc_norm_stderr": 0.03942772444036624 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.6085106382978723, + "acc_stderr": 0.031907012423268113, + "acc_norm": 0.6085106382978723, + "acc_norm_stderr": 0.031907012423268113 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.39473684210526316, + "acc_stderr": 0.045981880578165414, + "acc_norm": 0.39473684210526316, + "acc_norm_stderr": 0.045981880578165414 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.5241379310344828, + "acc_stderr": 0.041618085035015295, + "acc_norm": 0.5241379310344828, + "acc_norm_stderr": 0.041618085035015295 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.3994708994708995, + "acc_stderr": 0.025225450284067877, + "acc_norm": 0.3994708994708995, + "acc_norm_stderr": 0.025225450284067877 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.3968253968253968, + "acc_stderr": 0.04375888492727062, + "acc_norm": 0.3968253968253968, + "acc_norm_stderr": 0.04375888492727062 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.38, + "acc_stderr": 0.048783173121456316, + "acc_norm": 0.38, + "acc_norm_stderr": 0.048783173121456316 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.7516129032258064, + "acc_stderr": 0.024580028921481003, + "acc_norm": 0.7516129032258064, + "acc_norm_stderr": 0.024580028921481003 + }, + 
"harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.4433497536945813, + "acc_stderr": 0.03495334582162934, + "acc_norm": 0.4433497536945813, + "acc_norm_stderr": 0.03495334582162934 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.65, + "acc_stderr": 0.04793724854411019, + "acc_norm": 0.65, + "acc_norm_stderr": 0.04793724854411019 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.7696969696969697, + "acc_stderr": 0.032876667586034906, + "acc_norm": 0.7696969696969697, + "acc_norm_stderr": 0.032876667586034906 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.8080808080808081, + "acc_stderr": 0.02805779167298902, + "acc_norm": 0.8080808080808081, + "acc_norm_stderr": 0.02805779167298902 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.8808290155440415, + "acc_stderr": 0.023381935348121437, + "acc_norm": 0.8808290155440415, + "acc_norm_stderr": 0.023381935348121437 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.6487179487179487, + "acc_stderr": 0.024203665177902803, + "acc_norm": 0.6487179487179487, + "acc_norm_stderr": 0.024203665177902803 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.3148148148148148, + "acc_stderr": 0.02831753349606647, + "acc_norm": 0.3148148148148148, + "acc_norm_stderr": 0.02831753349606647 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.6638655462184874, + "acc_stderr": 0.030684737115135374, + "acc_norm": 0.6638655462184874, + "acc_norm_stderr": 0.030684737115135374 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.37748344370860926, + "acc_stderr": 0.0395802723112157, + "acc_norm": 0.37748344370860926, + "acc_norm_stderr": 0.0395802723112157 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.818348623853211, + "acc_stderr": 0.016530617409266857, + "acc_norm": 0.818348623853211, + "acc_norm_stderr": 0.016530617409266857 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.4861111111111111, + "acc_stderr": 0.03408655867977748, + "acc_norm": 0.4861111111111111, + "acc_norm_stderr": 0.03408655867977748 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.8137254901960784, + "acc_stderr": 0.027325470966716312, + "acc_norm": 0.8137254901960784, + "acc_norm_stderr": 0.027325470966716312 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.8354430379746836, + "acc_stderr": 0.024135736240566932, + "acc_norm": 0.8354430379746836, + "acc_norm_stderr": 0.024135736240566932 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.6995515695067265, + "acc_stderr": 0.030769352008229136, + "acc_norm": 0.6995515695067265, + "acc_norm_stderr": 0.030769352008229136 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.7175572519083969, + "acc_stderr": 0.03948406125768361, + "acc_norm": 0.7175572519083969, + "acc_norm_stderr": 0.03948406125768361 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.8016528925619835, + "acc_stderr": 0.03640118271990945, + "acc_norm": 0.8016528925619835, + "acc_norm_stderr": 0.03640118271990945 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.7222222222222222, + "acc_stderr": 0.04330043749650742, + "acc_norm": 0.7222222222222222, + "acc_norm_stderr": 0.04330043749650742 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.754601226993865, + "acc_stderr": 0.03380939813943354, + "acc_norm": 0.754601226993865, + "acc_norm_stderr": 
0.03380939813943354 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.41964285714285715, + "acc_stderr": 0.04684099321077106, + "acc_norm": 0.41964285714285715, + "acc_norm_stderr": 0.04684099321077106 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.7961165048543689, + "acc_stderr": 0.03989139859531771, + "acc_norm": 0.7961165048543689, + "acc_norm_stderr": 0.03989139859531771 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.8675213675213675, + "acc_stderr": 0.022209309073165616, + "acc_norm": 0.8675213675213675, + "acc_norm_stderr": 0.022209309073165616 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.67, + "acc_stderr": 0.04725815626252607, + "acc_norm": 0.67, + "acc_norm_stderr": 0.04725815626252607 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.8212005108556832, + "acc_stderr": 0.01370264371536898, + "acc_norm": 0.8212005108556832, + "acc_norm_stderr": 0.01370264371536898 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.7167630057803468, + "acc_stderr": 0.02425790170532337, + "acc_norm": 0.7167630057803468, + "acc_norm_stderr": 0.02425790170532337 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.34413407821229053, + "acc_stderr": 0.015889221313307094, + "acc_norm": 0.34413407821229053, + "acc_norm_stderr": 0.015889221313307094 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.6568627450980392, + "acc_stderr": 0.027184498909941613, + "acc_norm": 0.6568627450980392, + "acc_norm_stderr": 0.027184498909941613 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.7202572347266881, + "acc_stderr": 0.02549425935069491, + "acc_norm": 0.7202572347266881, + "acc_norm_stderr": 0.02549425935069491 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.7438271604938271, + "acc_stderr": 0.024288533637726095, + "acc_norm": 0.7438271604938271, + "acc_norm_stderr": 0.024288533637726095 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.5070921985815603, + "acc_stderr": 0.02982449855912901, + "acc_norm": 0.5070921985815603, + "acc_norm_stderr": 0.02982449855912901 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.46936114732724904, + "acc_stderr": 0.012746237711716634, + "acc_norm": 0.46936114732724904, + "acc_norm_stderr": 0.012746237711716634 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.6102941176470589, + "acc_stderr": 0.0296246635811597, + "acc_norm": 0.6102941176470589, + "acc_norm_stderr": 0.0296246635811597 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.6715686274509803, + "acc_stderr": 0.018999707383162673, + "acc_norm": 0.6715686274509803, + "acc_norm_stderr": 0.018999707383162673 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.6727272727272727, + "acc_stderr": 0.0449429086625209, + "acc_norm": 0.6727272727272727, + "acc_norm_stderr": 0.0449429086625209 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.7306122448979592, + "acc_stderr": 0.02840125202902294, + "acc_norm": 0.7306122448979592, + "acc_norm_stderr": 0.02840125202902294 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.8407960199004975, + "acc_stderr": 0.02587064676616914, + "acc_norm": 0.8407960199004975, + "acc_norm_stderr": 0.02587064676616914 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.87, + "acc_stderr": 0.033799766898963086, + "acc_norm": 0.87, + "acc_norm_stderr": 0.033799766898963086 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.5301204819277109, + "acc_stderr": 
0.03885425420866767, + "acc_norm": 0.5301204819277109, + "acc_norm_stderr": 0.03885425420866767 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.8304093567251462, + "acc_stderr": 0.02878210810540171, + "acc_norm": 0.8304093567251462, + "acc_norm_stderr": 0.02878210810540171 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.37209302325581395, + "mc1_stderr": 0.016921090118814038, + "mc2": 0.5230660045885717, + "mc2_stderr": 0.014819358026329301 + }, + "all": { + "acc": 0.6247081012105746, + "acc_stderr": 0.03306437561725338, + "acc_norm": 0.6287581990313466, + "acc_norm_stderr": 0.033040816221322156, + "mc1": 0.37209302325581395, + "mc1_stderr": 0.016921090118814038, + "mc2": 0.5230660045885717, + "mc2_stderr": 0.014819358026329301 + } + }, + "versions": { + "harness|arc:challenge|25": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + 
"harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "all": 0 + }, + "config_general": { + "model_name": "TheBloke/robin-65b-v2-fp16", + "model_sha": "40edb31ba93045d673735361bc98f56125bbc77b", + "model_dtype": "torch.float16", + "lighteval_sha": "8bab069fee0c6e75ffa4c1ef8a9591c28ee0e049", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null + }, + "config_tasks": { + "harness|arc:challenge": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + 
"harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task" + }, + "summary_tasks": { + "harness|arc:challenge|25": { + "hashes": { + "hash_examples": "17b0cae357c0259e", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "61571bf68d6d89aa", + "hash_cont_tokens": "ede2b335438f08e9" + }, + "truncated": 0, + "non-truncated": 4687, + "padded": 4687, + "non-padded": 0, + "effective_few_shots": 25.0, + "num_truncated_few_shots": 0 + }, + "harness|hellaswag|10": { + "hashes": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "29906669b1c7054a", + "hash_cont_tokens": "b41cf1ad182d68d5" + }, + "truncated": 0, + "non-truncated": 40168, + "padded": 40113, + "non-padded": 55, + "effective_few_shots": 10.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hashes": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "c54ff61ad0273dd7", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-anatomy|5": { + "hashes": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "be31a1e22aef5f90", + "hash_cont_tokens": "f11971a765cb609f" + }, + "truncated": 0, + "non-truncated": 540, + "padded": 540, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-astronomy|5": { + "hashes": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "277a7b1fad566940", + "hash_cont_tokens": "238bd86950544b29" + }, + "truncated": 0, + "non-truncated": 608, + "padded": 608, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-business_ethics|5": { + "hashes": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "ba552605bc116de5", + "hash_cont_tokens": "f9d6d2a7d7e9a041" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hashes": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "428c7563d0b98ab9", + "hash_cont_tokens": "6af58623d0d5fbcd" + }, + "truncated": 0, + "non-truncated": 1060, + "padded": 1060, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_biology|5": { + "hashes": { + "hash_examples": 
"ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "da036601573942e2", + "hash_cont_tokens": "875cde3af7a0ee14" + }, + "truncated": 0, + "non-truncated": 576, + "padded": 576, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_chemistry|5": { + "hashes": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "94e0196d6aded13d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_computer_science|5": { + "hashes": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "6e4d0f4a8d36690b", + "hash_cont_tokens": "1ba0c71186b1505e" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_mathematics|5": { + "hashes": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "614054d17109a25d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_medicine|5": { + "hashes": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "1d633b3cc0524ba8", + "hash_cont_tokens": "702fb6d82ff0d6ac" + }, + "truncated": 0, + "non-truncated": 692, + "padded": 692, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_physics|5": { + "hashes": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "5421d9a1af86cbd4", + "hash_cont_tokens": "f7b8097afc16a47c" + }, + "truncated": 0, + "non-truncated": 408, + "padded": 408, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-computer_security|5": { + "hashes": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "5e6b70ecb333cf18", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hashes": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "c2ef11a87264ceed", + "hash_cont_tokens": "aa0e8bc655f2f641" + }, + "truncated": 0, + "non-truncated": 940, + "padded": 940, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-econometrics|5": { + "hashes": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "ecaccd912a4c3978", + "hash_cont_tokens": "a9b1f761089f6acc" + }, + "truncated": 0, + "non-truncated": 456, + "padded": 456, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hashes": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "1590c84291399be8", + "hash_cont_tokens": 
"2425a3f084a591ef" + }, + "truncated": 0, + "non-truncated": 580, + "padded": 580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hashes": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "3269597f715b0da1", + "hash_cont_tokens": "eb2d5002052b5bc5" + }, + "truncated": 0, + "non-truncated": 1512, + "padded": 1512, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-formal_logic|5": { + "hashes": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "a2800d20f3ab8d7c", + "hash_cont_tokens": "9b30dc19c9b62f60" + }, + "truncated": 0, + "non-truncated": 504, + "padded": 504, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-global_facts|5": { + "hashes": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "94ed44b3772505ad", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_biology|5": { + "hashes": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "24423acb928db768", + "hash_cont_tokens": "74217a4e2868536f" + }, + "truncated": 0, + "non-truncated": 1240, + "padded": 1240, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hashes": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "831ff35c474e5cef", + "hash_cont_tokens": "bf39544be0ebf000" + }, + "truncated": 0, + "non-truncated": 812, + "padded": 812, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hashes": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "8c34e0f2bda77358", + "hash_cont_tokens": "43570b3948564b64" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hashes": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "f1f73dd687da18d7", + "hash_cont_tokens": "674fc454bdc5ac93" + }, + "truncated": 660, + "non-truncated": 0, + "padded": 0, + "non-padded": 660, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_geography|5": { + "hashes": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "7c5547c7da5bc793", + "hash_cont_tokens": "03a5012b916274ea" + }, + "truncated": 0, + "non-truncated": 792, + "padded": 792, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hashes": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "f62991cb6a496b05", + "hash_cont_tokens": "50ab225c2f535210" + }, + "truncated": 0, + "non-truncated": 772, + "padded": 772, + "non-padded": 0, + 
"effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hashes": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "4cef2aff6e3d59ed", + "hash_cont_tokens": "c583432ad27fcfe0" + }, + "truncated": 0, + "non-truncated": 1560, + "padded": 1560, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hashes": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "6e2577ea4082ed2b", + "hash_cont_tokens": "1194078d4e38c984" + }, + "truncated": 0, + "non-truncated": 1080, + "padded": 1080, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hashes": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "c5fc9aeb1079c8e4", + "hash_cont_tokens": "f47f041de50333b9" + }, + "truncated": 0, + "non-truncated": 952, + "padded": 952, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_physics|5": { + "hashes": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "555fc385cffa84ca", + "hash_cont_tokens": "6296151cf7fee15c" + }, + "truncated": 0, + "non-truncated": 604, + "padded": 604, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hashes": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "febd23cbf9973b7f", + "hash_cont_tokens": "a490d3db0ea5935a" + }, + "truncated": 0, + "non-truncated": 2180, + "padded": 2180, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hashes": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "424b02981230ee83", + "hash_cont_tokens": "6830ef7d0325d7ef" + }, + "truncated": 0, + "non-truncated": 864, + "padded": 864, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hashes": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "50c9ff438c85a69e", + "hash_cont_tokens": "cdd0b3dc06d933e5" + }, + "truncated": 816, + "non-truncated": 0, + "padded": 0, + "non-padded": 816, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hashes": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "054824cc474caef5", + "hash_cont_tokens": "e0203e3fc1bb0500" + }, + "truncated": 8, + "non-truncated": 940, + "padded": 940, + "non-padded": 8, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_aging|5": { + "hashes": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "541a75f071dcf579", + "hash_cont_tokens": "142a4a8a1138a214" + }, + "truncated": 0, + "non-truncated": 892, + "padded": 892, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + 
"harness|hendrycksTest-human_sexuality|5": { + "hashes": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "04269e5c5a257dd9", + "hash_cont_tokens": "bc54813e809b796d" + }, + "truncated": 0, + "non-truncated": 524, + "padded": 524, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-international_law|5": { + "hashes": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "d93ba9d9d38e4397", + "hash_cont_tokens": "63435df622d5437b" + }, + "truncated": 0, + "non-truncated": 484, + "padded": 484, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-jurisprudence|5": { + "hashes": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "9eeaccd2698b4f5a", + "hash_cont_tokens": "e3a8cd951b6e3469" + }, + "truncated": 0, + "non-truncated": 432, + "padded": 432, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hashes": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "b4f08f544f2b7576", + "hash_cont_tokens": "5e6ee2ff0404f23c" + }, + "truncated": 0, + "non-truncated": 652, + "padded": 648, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-machine_learning|5": { + "hashes": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "900c2a51f1174b9f", + "hash_cont_tokens": "c81919424db3b267" + }, + "truncated": 0, + "non-truncated": 448, + "padded": 448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-management|5": { + "hashes": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "6b36efb4689c6eca", + "hash_cont_tokens": "a01d6d39a83c4597" + }, + "truncated": 0, + "non-truncated": 412, + "padded": 412, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-marketing|5": { + "hashes": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "2aaac78a0cfed47a", + "hash_cont_tokens": "6aeaed4d823c98aa" + }, + "truncated": 0, + "non-truncated": 936, + "padded": 936, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-medical_genetics|5": { + "hashes": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "886ca823b41c094a", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-miscellaneous|5": { + "hashes": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "72fd71de7675e7d0", + "hash_cont_tokens": "9b0ab02a64603081" + }, + "truncated": 0, + "non-truncated": 3132, + "padded": 3132, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_disputes|5": { + "hashes": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": 
"f3ca0dd8e7a1eb09", + "hash_cont_tokens": "3b8bbe9108e55ce9" + }, + "truncated": 0, + "non-truncated": 1384, + "padded": 1354, + "non-padded": 30, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hashes": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "3e793631e951f23c", + "hash_cont_tokens": "2eae753a177d5460" + }, + "truncated": 0, + "non-truncated": 3580, + "padded": 3580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-nutrition|5": { + "hashes": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "59753c2144ea93af", + "hash_cont_tokens": "29771089bd3c65c6" + }, + "truncated": 0, + "non-truncated": 1224, + "padded": 1224, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-philosophy|5": { + "hashes": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "bd8d3dbed15a8c34", + "hash_cont_tokens": "9f6ff69d23a48783" + }, + "truncated": 0, + "non-truncated": 1244, + "padded": 1244, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-prehistory|5": { + "hashes": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "3573cd87facbb7c5", + "hash_cont_tokens": "a789a13af22308bf" + }, + "truncated": 0, + "non-truncated": 1296, + "padded": 1296, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_accounting|5": { + "hashes": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "17e721bc1a7cbb47", + "hash_cont_tokens": "5129a9cfb30c5239" + }, + "truncated": 0, + "non-truncated": 1128, + "padded": 1128, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_law|5": { + "hashes": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "9178e10bd0763ec4", + "hash_cont_tokens": "2e590029ef41fbcd" + }, + "truncated": 604, + "non-truncated": 5532, + "padded": 5524, + "non-padded": 612, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_medicine|5": { + "hashes": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "f5a22012a54f70ea", + "hash_cont_tokens": "cd82e108370cece8" + }, + "truncated": 0, + "non-truncated": 1088, + "padded": 1088, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_psychology|5": { + "hashes": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "0dfb73a8eb3f692c", + "hash_cont_tokens": "61ef0c8a87f9c92d" + }, + "truncated": 0, + "non-truncated": 2448, + "padded": 2448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-public_relations|5": { + "hashes": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "1710c6ba4c9f3cbd", + "hash_cont_tokens": "568f585a259965c1" + }, + "truncated": 0, + "non-truncated": 440, + "padded": 440, + 
"non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-security_studies|5": { + "hashes": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "d49711415961ced7", + "hash_cont_tokens": "d70cfe096d4fb7bd" + }, + "truncated": 0, + "non-truncated": 980, + "padded": 980, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-sociology|5": { + "hashes": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "828999f7624cbe7e", + "hash_cont_tokens": "c3a3bdfd177eed5b" + }, + "truncated": 0, + "non-truncated": 804, + "padded": 804, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hashes": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "42054621e718dbee", + "hash_cont_tokens": "2568d0e8e36fa959" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-virology|5": { + "hashes": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "6c4f0aa4dc859c04", + "hash_cont_tokens": "c178cccd753d9bc5" + }, + "truncated": 0, + "non-truncated": 664, + "padded": 664, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-world_religions|5": { + "hashes": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "6c75d44e092ff24f", + "hash_cont_tokens": "0a3a3ea5ef49d19c" + }, + "truncated": 0, + "non-truncated": 684, + "padded": 684, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|truthfulqa:mc|0": { + "hashes": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "2738d7ed7075faa7", + "hash_cont_tokens": "6d1691881e252df0" + }, + "truncated": 0, + "non-truncated": 9996, + "padded": 9996, + "non-padded": 0, + "effective_few_shots": 0.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "d84d18e9a963753d", + "hash_full_prompts": "12b540783521a8e6", + "hash_input_tokens": "6fecf578c508db6a", + "hash_cont_tokens": "f4b7b7f3a2788768" + }, + "total_evaluation_time_secondes": "25843.34134221077", + "truncated": 2088, + "non-truncated": 108931, + "padded": 108834, + "non-padded": 2185, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/TheBloke/robin-65b-v2-fp16/results_2023-10-23T10-30-00.008059.json b/eval-results/TheBloke/robin-65b-v2-fp16/results_2023-10-23T10-30-00.008059.json new file mode 100644 index 0000000000000000000000000000000000000000..74ebb678ec394f916b91902db4e3a361a24c277f --- /dev/null +++ b/eval-results/TheBloke/robin-65b-v2-fp16/results_2023-10-23T10-30-00.008059.json @@ -0,0 +1,107 @@ +{ + "config_general": { + "model_name": "TheBloke/robin-65b-v2-fp16", + "model_sha": "40edb31ba93045d673735361bc98f56125bbc77b", + "model_size": "121.68 GB", + "model_dtype": "torch.float16", + "lighteval_sha": "0f318ecf002208468154899217b3ba7c6ae09374", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + 
"harness|drop|3": { + "em": 0.002202181208053691, + "em_stderr": 0.00048005108166193297, + "f1": 0.064190436241611, + "f1_stderr": 0.001385342539630455 + }, + "harness|gsm8k|5": { + "acc": 0.2699014404852161, + "acc_stderr": 0.012227442856468897 + }, + "harness|winogrande|5": { + "acc": 0.8050513022888713, + "acc_stderr": 0.011134099415938275 + }, + "all": { + "em": 0.002202181208053691, + "em_stderr": 0.00048005108166193297, + "f1": 0.064190436241611, + "f1_stderr": 0.001385342539630455, + "acc": 0.5374763713870437, + "acc_stderr": 0.011680771136203586 + } + }, + "versions": { + "harness|drop|3": 1, + "harness|gsm8k|5": 0, + "harness|winogrande|5": 0, + "all": 0 + }, + "config_tasks": { + "harness|drop": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|drop|3": { + "hashes": { + "hash_examples": "1d27416e8324e9a3", + "hash_full_prompts": "a5513ff9a741b385", + "hash_input_tokens": "61b608e0b5ceed76", + "hash_cont_tokens": "c4a4764c46e536b9" + }, + "truncated": 1263, + "non-truncated": 8273, + "padded": 0, + "non-padded": 9536, + "effective_few_shots": 3.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "bda342e47b5099b2", + "hash_cont_tokens": "18bacd0c13ec2173" + }, + "truncated": 0, + "non-truncated": 1319, + "padded": 0, + "non-padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "c0bedf98cb040854", + "hash_cont_tokens": "f08975ad6f2d5864" + }, + "truncated": 0, + "non-truncated": 2534, + "padded": 2432, + "non-padded": 102, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "9b4d8993161e637d", + "hash_full_prompts": "08215e527b7e60a5", + "hash_input_tokens": "80afe720f936f8d2", + "hash_cont_tokens": "f99baf00ee6b1b52" + }, + "total_evaluation_time_secondes": "45524.50394201279", + "truncated": 1263, + "non-truncated": 12126, + "padded": 2432, + "non-padded": 10957, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/TheBloke/stable-vicuna-13B-HF/results_2023-08-22T17-12-46.134347.json b/eval-results/TheBloke/stable-vicuna-13B-HF/results_2023-08-22T17-12-46.134347.json new file mode 100644 index 0000000000000000000000000000000000000000..874f35f00b8626ea9862f81348b7f37b539012b0 --- /dev/null +++ b/eval-results/TheBloke/stable-vicuna-13B-HF/results_2023-08-22T17-12-46.134347.json @@ -0,0 +1,1365 @@ +{ + "results": { + "harness|arc:challenge|25": { + "acc": 0.5170648464163823, + "acc_stderr": 0.014602878388536597, + "acc_norm": 0.5332764505119454, + "acc_norm_stderr": 0.014578995859605806 + }, + "harness|hellaswag|10": { + "acc": 0.5864369647480582, + "acc_stderr": 0.004914655063329499, + "acc_norm": 0.7850029874526987, + "acc_norm_stderr": 0.004099806728607399 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.33, + "acc_stderr": 0.04725815626252606, + "acc_norm": 0.33, + "acc_norm_stderr": 0.04725815626252606 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.48148148148148145, + "acc_stderr": 0.043163785995113245, + "acc_norm": 0.48148148148148145, + "acc_norm_stderr": 0.043163785995113245 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.5592105263157895, + 
"acc_stderr": 0.04040311062490436, + "acc_norm": 0.5592105263157895, + "acc_norm_stderr": 0.04040311062490436 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.49, + "acc_stderr": 0.05024183937956912, + "acc_norm": 0.49, + "acc_norm_stderr": 0.05024183937956912 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.46037735849056605, + "acc_stderr": 0.030676096599389188, + "acc_norm": 0.46037735849056605, + "acc_norm_stderr": 0.030676096599389188 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.5277777777777778, + "acc_stderr": 0.04174752578923183, + "acc_norm": 0.5277777777777778, + "acc_norm_stderr": 0.04174752578923183 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.41, + "acc_stderr": 0.049431107042371025, + "acc_norm": 0.41, + "acc_norm_stderr": 0.049431107042371025 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.41, + "acc_stderr": 0.049431107042371025, + "acc_norm": 0.41, + "acc_norm_stderr": 0.049431107042371025 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.29, + "acc_stderr": 0.045604802157206845, + "acc_norm": 0.29, + "acc_norm_stderr": 0.045604802157206845 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.3815028901734104, + "acc_stderr": 0.037038511930995194, + "acc_norm": 0.3815028901734104, + "acc_norm_stderr": 0.037038511930995194 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.22549019607843138, + "acc_stderr": 0.041583075330832865, + "acc_norm": 0.22549019607843138, + "acc_norm_stderr": 0.041583075330832865 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.61, + "acc_stderr": 0.04902071300001975, + "acc_norm": 0.61, + "acc_norm_stderr": 0.04902071300001975 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.4127659574468085, + "acc_stderr": 0.03218471141400351, + "acc_norm": 0.4127659574468085, + "acc_norm_stderr": 0.03218471141400351 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.2894736842105263, + "acc_stderr": 0.04266339443159394, + "acc_norm": 0.2894736842105263, + "acc_norm_stderr": 0.04266339443159394 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.4413793103448276, + "acc_stderr": 0.04137931034482758, + "acc_norm": 0.4413793103448276, + "acc_norm_stderr": 0.04137931034482758 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.2962962962962963, + "acc_stderr": 0.023517294335963283, + "acc_norm": 0.2962962962962963, + "acc_norm_stderr": 0.023517294335963283 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.35714285714285715, + "acc_stderr": 0.04285714285714281, + "acc_norm": 0.35714285714285715, + "acc_norm_stderr": 0.04285714285714281 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.33, + "acc_stderr": 0.047258156262526045, + "acc_norm": 0.33, + "acc_norm_stderr": 0.047258156262526045 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.5580645161290323, + "acc_stderr": 0.028251557906849734, + "acc_norm": 0.5580645161290323, + "acc_norm_stderr": 0.028251557906849734 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.35467980295566504, + "acc_stderr": 0.03366124489051449, + "acc_norm": 0.35467980295566504, + "acc_norm_stderr": 0.03366124489051449 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.5, + "acc_stderr": 0.050251890762960605, + "acc_norm": 0.5, + "acc_norm_stderr": 0.050251890762960605 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 
0.6363636363636364, + "acc_stderr": 0.03756335775187896, + "acc_norm": 0.6363636363636364, + "acc_norm_stderr": 0.03756335775187896 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.6262626262626263, + "acc_stderr": 0.03446897738659333, + "acc_norm": 0.6262626262626263, + "acc_norm_stderr": 0.03446897738659333 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.694300518134715, + "acc_stderr": 0.033248379397581594, + "acc_norm": 0.694300518134715, + "acc_norm_stderr": 0.033248379397581594 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.4512820512820513, + "acc_stderr": 0.02523038123893484, + "acc_norm": 0.4512820512820513, + "acc_norm_stderr": 0.02523038123893484 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.3, + "acc_stderr": 0.027940457136228416, + "acc_norm": 0.3, + "acc_norm_stderr": 0.027940457136228416 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.5042016806722689, + "acc_stderr": 0.0324773433444811, + "acc_norm": 0.5042016806722689, + "acc_norm_stderr": 0.0324773433444811 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.304635761589404, + "acc_stderr": 0.03757949922943342, + "acc_norm": 0.304635761589404, + "acc_norm_stderr": 0.03757949922943342 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.6642201834862386, + "acc_stderr": 0.020248081396752923, + "acc_norm": 0.6642201834862386, + "acc_norm_stderr": 0.020248081396752923 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.38425925925925924, + "acc_stderr": 0.03317354514310742, + "acc_norm": 0.38425925925925924, + "acc_norm_stderr": 0.03317354514310742 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.6323529411764706, + "acc_stderr": 0.03384132045674119, + "acc_norm": 0.6323529411764706, + "acc_norm_stderr": 0.03384132045674119 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.6497890295358649, + "acc_stderr": 0.031052391937584346, + "acc_norm": 0.6497890295358649, + "acc_norm_stderr": 0.031052391937584346 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.5874439461883408, + "acc_stderr": 0.03304062175449297, + "acc_norm": 0.5874439461883408, + "acc_norm_stderr": 0.03304062175449297 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.6335877862595419, + "acc_stderr": 0.04225875451969638, + "acc_norm": 0.6335877862595419, + "acc_norm_stderr": 0.04225875451969638 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.6198347107438017, + "acc_stderr": 0.04431324501968432, + "acc_norm": 0.6198347107438017, + "acc_norm_stderr": 0.04431324501968432 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.6018518518518519, + "acc_stderr": 0.04732332615978814, + "acc_norm": 0.6018518518518519, + "acc_norm_stderr": 0.04732332615978814 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.6503067484662577, + "acc_stderr": 0.03746668325470021, + "acc_norm": 0.6503067484662577, + "acc_norm_stderr": 0.03746668325470021 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.38392857142857145, + "acc_stderr": 0.04616143075028547, + "acc_norm": 0.38392857142857145, + "acc_norm_stderr": 0.04616143075028547 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.6601941747572816, + "acc_stderr": 0.046897659372781335, + "acc_norm": 0.6601941747572816, + "acc_norm_stderr": 0.046897659372781335 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.7777777777777778, + 
"acc_stderr": 0.027236013946196687, + "acc_norm": 0.7777777777777778, + "acc_norm_stderr": 0.027236013946196687 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.55, + "acc_stderr": 0.04999999999999999, + "acc_norm": 0.55, + "acc_norm_stderr": 0.04999999999999999 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.6641123882503193, + "acc_stderr": 0.016889407235171686, + "acc_norm": 0.6641123882503193, + "acc_norm_stderr": 0.016889407235171686 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.5549132947976878, + "acc_stderr": 0.026756255129663762, + "acc_norm": 0.5549132947976878, + "acc_norm_stderr": 0.026756255129663762 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.2748603351955307, + "acc_stderr": 0.014931316703220504, + "acc_norm": 0.2748603351955307, + "acc_norm_stderr": 0.014931316703220504 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.5261437908496732, + "acc_stderr": 0.028590752958852387, + "acc_norm": 0.5261437908496732, + "acc_norm_stderr": 0.028590752958852387 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.5434083601286174, + "acc_stderr": 0.028290869054197604, + "acc_norm": 0.5434083601286174, + "acc_norm_stderr": 0.028290869054197604 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.5216049382716049, + "acc_stderr": 0.027794760105008736, + "acc_norm": 0.5216049382716049, + "acc_norm_stderr": 0.027794760105008736 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.3829787234042553, + "acc_stderr": 0.02899908090480619, + "acc_norm": 0.3829787234042553, + "acc_norm_stderr": 0.02899908090480619 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.4048239895697523, + "acc_stderr": 0.012536743830953987, + "acc_norm": 0.4048239895697523, + "acc_norm_stderr": 0.012536743830953987 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.5, + "acc_stderr": 0.030372836961539352, + "acc_norm": 0.5, + "acc_norm_stderr": 0.030372836961539352 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.5032679738562091, + "acc_stderr": 0.020227402794434867, + "acc_norm": 0.5032679738562091, + "acc_norm_stderr": 0.020227402794434867 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.5545454545454546, + "acc_stderr": 0.047605488214603246, + "acc_norm": 0.5545454545454546, + "acc_norm_stderr": 0.047605488214603246 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.6163265306122448, + "acc_stderr": 0.03113088039623593, + "acc_norm": 0.6163265306122448, + "acc_norm_stderr": 0.03113088039623593 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.6517412935323383, + "acc_stderr": 0.033687874661154596, + "acc_norm": 0.6517412935323383, + "acc_norm_stderr": 0.033687874661154596 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.74, + "acc_stderr": 0.04408440022768078, + "acc_norm": 0.74, + "acc_norm_stderr": 0.04408440022768078 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.4578313253012048, + "acc_stderr": 0.038786267710023595, + "acc_norm": 0.4578313253012048, + "acc_norm_stderr": 0.038786267710023595 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.7426900584795322, + "acc_stderr": 0.03352799844161865, + "acc_norm": 0.7426900584795322, + "acc_norm_stderr": 0.03352799844161865 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.3427172582619339, + "mc1_stderr": 0.01661494938534704, + "mc2": 0.4838353616511973, + "mc2_stderr": 0.015030079987453928 + }, + "all": { + "acc": 0.5045929860684458, + 
"acc_stderr": 0.03513348936167607, + "acc_norm": 0.5082332848277713, + "acc_norm_stderr": 0.03511927358432602, + "mc1": 0.3427172582619339, + "mc1_stderr": 0.01661494938534704, + "mc2": 0.4838353616511973, + "mc2_stderr": 0.015030079987453928 + } + }, + "versions": { + "harness|arc:challenge|25": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "all": 0 + }, + "config_general": { + "model_name": "TheBloke/stable-vicuna-13B-HF", + "model_sha": "2b099b2be0dafb2606ae9808c0f6183fe4bff7bc", + "model_dtype": "torch.float16", + "lighteval_sha": "2d7f9b0219a3536f201c55d7e8126251127b731c", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + 
"max_samples": null + }, + "config_tasks": { + "harness|arc:challenge": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness 
task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task" + }, + "summary_tasks": { + "harness|arc:challenge|25": { + "hashes": { + "hash_examples": "17b0cae357c0259e", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "61571bf68d6d89aa", + "hash_cont_tokens": "8210decc6ff6f7df" + }, + "truncated": 0, + "non-truncated": 4687, + "padded": 4687, + "non-padded": 0, + "effective_few_shots": 25.0, + "num_truncated_few_shots": 0 + }, + "harness|hellaswag|10": { + "hashes": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "29906669b1c7054a", + "hash_cont_tokens": "b3b9e9017afa63af" + }, + "truncated": 0, + "non-truncated": 40168, + "padded": 40113, + "non-padded": 55, + "effective_few_shots": 10.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hashes": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "c54ff61ad0273dd7", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-anatomy|5": { + "hashes": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "be31a1e22aef5f90", + "hash_cont_tokens": "f11971a765cb609f" + }, + "truncated": 0, + "non-truncated": 540, + "padded": 540, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-astronomy|5": { + "hashes": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "277a7b1fad566940", + "hash_cont_tokens": "bf30e5d3f48250cb" + }, + "truncated": 0, + "non-truncated": 608, + "padded": 608, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-business_ethics|5": { + "hashes": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "ba552605bc116de5", + "hash_cont_tokens": "bc1dd9b2d995eb61" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hashes": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "428c7563d0b98ab9", + "hash_cont_tokens": "890a119624b3b935" + }, + "truncated": 0, + "non-truncated": 1060, + "padded": 1060, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_biology|5": { + "hashes": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "da036601573942e2", + "hash_cont_tokens": "875cde3af7a0ee14" + }, + "truncated": 0, + "non-truncated": 576, + "padded": 576, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_chemistry|5": { + "hashes": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "94e0196d6aded13d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + 
"non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_computer_science|5": { + "hashes": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "6e4d0f4a8d36690b", + "hash_cont_tokens": "ffc0fe414cdc4a83" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_mathematics|5": { + "hashes": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "614054d17109a25d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_medicine|5": { + "hashes": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "1d633b3cc0524ba8", + "hash_cont_tokens": "1f88b00d41957d82" + }, + "truncated": 0, + "non-truncated": 692, + "padded": 692, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_physics|5": { + "hashes": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "5421d9a1af86cbd4", + "hash_cont_tokens": "f7b8097afc16a47c" + }, + "truncated": 0, + "non-truncated": 408, + "padded": 408, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-computer_security|5": { + "hashes": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "5e6b70ecb333cf18", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hashes": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "c2ef11a87264ceed", + "hash_cont_tokens": "aa0e8bc655f2f641" + }, + "truncated": 0, + "non-truncated": 940, + "padded": 940, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-econometrics|5": { + "hashes": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "ecaccd912a4c3978", + "hash_cont_tokens": "bfb7e3c3c88313f1" + }, + "truncated": 0, + "non-truncated": 456, + "padded": 456, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hashes": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "1590c84291399be8", + "hash_cont_tokens": "2425a3f084a591ef" + }, + "truncated": 0, + "non-truncated": 580, + "padded": 580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hashes": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "3269597f715b0da1", + "hash_cont_tokens": "f52691aef15a407b" + }, + "truncated": 0, + "non-truncated": 1512, + "padded": 1512, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + 
"harness|hendrycksTest-formal_logic|5": { + "hashes": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "a2800d20f3ab8d7c", + "hash_cont_tokens": "f515d598d9c21263" + }, + "truncated": 0, + "non-truncated": 504, + "padded": 504, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-global_facts|5": { + "hashes": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "94ed44b3772505ad", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_biology|5": { + "hashes": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "24423acb928db768", + "hash_cont_tokens": "bd85a4156a3613ee" + }, + "truncated": 0, + "non-truncated": 1240, + "padded": 1240, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hashes": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "831ff35c474e5cef", + "hash_cont_tokens": "a95c97af1c14e068" + }, + "truncated": 0, + "non-truncated": 812, + "padded": 812, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hashes": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "8c34e0f2bda77358", + "hash_cont_tokens": "8abfedef914e33c9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hashes": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "f1f73dd687da18d7", + "hash_cont_tokens": "674fc454bdc5ac93" + }, + "truncated": 660, + "non-truncated": 0, + "padded": 0, + "non-padded": 660, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_geography|5": { + "hashes": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "7c5547c7da5bc793", + "hash_cont_tokens": "03a5012b916274ea" + }, + "truncated": 0, + "non-truncated": 792, + "padded": 792, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hashes": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "f62991cb6a496b05", + "hash_cont_tokens": "a83effb8f76b7d7c" + }, + "truncated": 0, + "non-truncated": 772, + "padded": 772, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hashes": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "4cef2aff6e3d59ed", + "hash_cont_tokens": "c583432ad27fcfe0" + }, + "truncated": 0, + "non-truncated": 1560, + "padded": 1560, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hashes": { + "hash_examples": 
"1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "6e2577ea4082ed2b", + "hash_cont_tokens": "24f5dc613660300b" + }, + "truncated": 0, + "non-truncated": 1080, + "padded": 1080, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hashes": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "c5fc9aeb1079c8e4", + "hash_cont_tokens": "f47f041de50333b9" + }, + "truncated": 0, + "non-truncated": 952, + "padded": 952, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_physics|5": { + "hashes": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "555fc385cffa84ca", + "hash_cont_tokens": "ba2efcd283e938cc" + }, + "truncated": 0, + "non-truncated": 604, + "padded": 604, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hashes": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "febd23cbf9973b7f", + "hash_cont_tokens": "942069cd363844d9" + }, + "truncated": 0, + "non-truncated": 2180, + "padded": 2180, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hashes": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "424b02981230ee83", + "hash_cont_tokens": "955ed42b6f7fa019" + }, + "truncated": 0, + "non-truncated": 864, + "padded": 864, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hashes": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "50c9ff438c85a69e", + "hash_cont_tokens": "cdd0b3dc06d933e5" + }, + "truncated": 816, + "non-truncated": 0, + "padded": 0, + "non-padded": 816, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hashes": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "054824cc474caef5", + "hash_cont_tokens": "9a864184946033ac" + }, + "truncated": 8, + "non-truncated": 940, + "padded": 940, + "non-padded": 8, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_aging|5": { + "hashes": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "541a75f071dcf579", + "hash_cont_tokens": "142a4a8a1138a214" + }, + "truncated": 0, + "non-truncated": 892, + "padded": 892, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_sexuality|5": { + "hashes": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "04269e5c5a257dd9", + "hash_cont_tokens": "bc54813e809b796d" + }, + "truncated": 0, + "non-truncated": 524, + "padded": 524, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-international_law|5": { + "hashes": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "d93ba9d9d38e4397", + 
"hash_cont_tokens": "dc45b45fcda18e5d" + }, + "truncated": 0, + "non-truncated": 484, + "padded": 484, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-jurisprudence|5": { + "hashes": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "9eeaccd2698b4f5a", + "hash_cont_tokens": "e3a8cd951b6e3469" + }, + "truncated": 0, + "non-truncated": 432, + "padded": 432, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hashes": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "b4f08f544f2b7576", + "hash_cont_tokens": "1e80dbd30f6453d5" + }, + "truncated": 0, + "non-truncated": 652, + "padded": 648, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-machine_learning|5": { + "hashes": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "900c2a51f1174b9f", + "hash_cont_tokens": "9b37da7777378ca9" + }, + "truncated": 0, + "non-truncated": 448, + "padded": 448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-management|5": { + "hashes": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "6b36efb4689c6eca", + "hash_cont_tokens": "a01d6d39a83c4597" + }, + "truncated": 0, + "non-truncated": 412, + "padded": 412, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-marketing|5": { + "hashes": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "2aaac78a0cfed47a", + "hash_cont_tokens": "6aeaed4d823c98aa" + }, + "truncated": 0, + "non-truncated": 936, + "padded": 936, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-medical_genetics|5": { + "hashes": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "886ca823b41c094a", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-miscellaneous|5": { + "hashes": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "72fd71de7675e7d0", + "hash_cont_tokens": "9b0ab02a64603081" + }, + "truncated": 0, + "non-truncated": 3132, + "padded": 3132, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_disputes|5": { + "hashes": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "f3ca0dd8e7a1eb09", + "hash_cont_tokens": "8badf768f7b0467a" + }, + "truncated": 0, + "non-truncated": 1384, + "padded": 1354, + "non-padded": 30, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hashes": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "3e793631e951f23c", + "hash_cont_tokens": "32ae620376b2bbba" + }, + "truncated": 0, + "non-truncated": 3580, + "padded": 3580, + "non-padded": 0, + "effective_few_shots": 5.0, + 
"num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-nutrition|5": { + "hashes": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "59753c2144ea93af", + "hash_cont_tokens": "3071def75bacc404" + }, + "truncated": 0, + "non-truncated": 1224, + "padded": 1224, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-philosophy|5": { + "hashes": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "bd8d3dbed15a8c34", + "hash_cont_tokens": "9f6ff69d23a48783" + }, + "truncated": 0, + "non-truncated": 1244, + "padded": 1244, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-prehistory|5": { + "hashes": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "3573cd87facbb7c5", + "hash_cont_tokens": "de469d2b981e32a3" + }, + "truncated": 0, + "non-truncated": 1296, + "padded": 1296, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_accounting|5": { + "hashes": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "17e721bc1a7cbb47", + "hash_cont_tokens": "c46f74d2dfc7b13b" + }, + "truncated": 0, + "non-truncated": 1128, + "padded": 1128, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_law|5": { + "hashes": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "9178e10bd0763ec4", + "hash_cont_tokens": "2e590029ef41fbcd" + }, + "truncated": 604, + "non-truncated": 5532, + "padded": 5524, + "non-padded": 612, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_medicine|5": { + "hashes": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "f5a22012a54f70ea", + "hash_cont_tokens": "fe35cfa9c6ca802e" + }, + "truncated": 0, + "non-truncated": 1088, + "padded": 1088, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_psychology|5": { + "hashes": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "0dfb73a8eb3f692c", + "hash_cont_tokens": "f020fbddf72c8652" + }, + "truncated": 0, + "non-truncated": 2448, + "padded": 2448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-public_relations|5": { + "hashes": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "1710c6ba4c9f3cbd", + "hash_cont_tokens": "568f585a259965c1" + }, + "truncated": 0, + "non-truncated": 440, + "padded": 440, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-security_studies|5": { + "hashes": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "d49711415961ced7", + "hash_cont_tokens": "cc6fd7cccd64cd5d" + }, + "truncated": 0, + "non-truncated": 980, + "padded": 980, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-sociology|5": { + "hashes": { + "hash_examples": "e7959df87dea8672", + 
"hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "828999f7624cbe7e", + "hash_cont_tokens": "c3a3bdfd177eed5b" + }, + "truncated": 0, + "non-truncated": 804, + "padded": 804, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hashes": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "42054621e718dbee", + "hash_cont_tokens": "2568d0e8e36fa959" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-virology|5": { + "hashes": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "6c4f0aa4dc859c04", + "hash_cont_tokens": "926cf60b0891f374" + }, + "truncated": 0, + "non-truncated": 664, + "padded": 664, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-world_religions|5": { + "hashes": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "6c75d44e092ff24f", + "hash_cont_tokens": "c525a5de974c1ea3" + }, + "truncated": 0, + "non-truncated": 684, + "padded": 684, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|truthfulqa:mc|0": { + "hashes": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "2738d7ed7075faa7", + "hash_cont_tokens": "c014154380b74b9e" + }, + "truncated": 0, + "non-truncated": 9996, + "padded": 9996, + "non-padded": 0, + "effective_few_shots": 0.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "d84d18e9a963753d", + "hash_full_prompts": "12b540783521a8e6", + "hash_input_tokens": "6fecf578c508db6a", + "hash_cont_tokens": "fb1646e2bdd5fc38" + }, + "total_evaluation_time_secondes": "4122.606763839722", + "truncated": 2088, + "non-truncated": 108931, + "padded": 108834, + "non-padded": 2185, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/TheBloke/stable-vicuna-13B-HF/results_2023-10-22T23-53-33.704899.json b/eval-results/TheBloke/stable-vicuna-13B-HF/results_2023-10-22T23-53-33.704899.json new file mode 100644 index 0000000000000000000000000000000000000000..56c66e1b352f4196c4fd75abb5cc775e7ae4fedd --- /dev/null +++ b/eval-results/TheBloke/stable-vicuna-13B-HF/results_2023-10-22T23-53-33.704899.json @@ -0,0 +1,107 @@ +{ + "config_general": { + "model_name": "TheBloke/stable-vicuna-13B-HF", + "model_sha": "2b099b2be0dafb2606ae9808c0f6183fe4bff7bc", + "model_size": "24.28 GB", + "model_dtype": "torch.float16", + "lighteval_sha": "0f318ecf002208468154899217b3ba7c6ae09374", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|drop|3": { + "em": 0.005138422818791947, + "em_stderr": 0.0007322104102794217, + "f1": 0.07736682046979858, + "f1_stderr": 0.001663035439531372 + }, + "harness|gsm8k|5": { + "acc": 0.04094010614101592, + "acc_stderr": 0.005458076796294336 + }, + "harness|winogrande|5": { + "acc": 0.7521704814522494, + "acc_stderr": 0.012134386019865348 + }, + "all": { + "em": 0.005138422818791947, + "em_stderr": 0.0007322104102794217, + "f1": 0.07736682046979858, + "f1_stderr": 0.001663035439531372, + "acc": 0.3965552937966327, + "acc_stderr": 
0.008796231408079842 + } + }, + "versions": { + "harness|drop|3": 1, + "harness|gsm8k|5": 0, + "harness|winogrande|5": 0, + "all": 0 + }, + "config_tasks": { + "harness|drop": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|drop|3": { + "hashes": { + "hash_examples": "1d27416e8324e9a3", + "hash_full_prompts": "a5513ff9a741b385", + "hash_input_tokens": "61b608e0b5ceed76", + "hash_cont_tokens": "980e75bd3e00dc11" + }, + "truncated": 1263, + "non-truncated": 8273, + "padded": 0, + "non-padded": 9536, + "effective_few_shots": 3.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "bda342e47b5099b2", + "hash_cont_tokens": "7a03776ccec1ebac" + }, + "truncated": 0, + "non-truncated": 1319, + "padded": 0, + "non-padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "c0bedf98cb040854", + "hash_cont_tokens": "f08975ad6f2d5864" + }, + "truncated": 0, + "non-truncated": 2534, + "padded": 2432, + "non-padded": 102, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "9b4d8993161e637d", + "hash_full_prompts": "08215e527b7e60a5", + "hash_input_tokens": "80afe720f936f8d2", + "hash_cont_tokens": "fc35b8fef315f84c" + }, + "total_evaluation_time_secondes": "12246.05377650261", + "truncated": 1263, + "non-truncated": 12126, + "padded": 2432, + "non-padded": 10957, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/TheBloke/tulu-13B-fp16/results_2023-07-19T18-33-52.983892.json b/eval-results/TheBloke/tulu-13B-fp16/results_2023-07-19T18-33-52.983892.json new file mode 100644 index 0000000000000000000000000000000000000000..e3bd1ecd7ac0df0d2db4c1b3876dbb53df91ebd6 --- /dev/null +++ b/eval-results/TheBloke/tulu-13B-fp16/results_2023-07-19T18-33-52.983892.json @@ -0,0 +1,871 @@ +{ + "results": { + "harness|arc:challenge|25": { + "acc": 0.5110921501706485, + "acc_stderr": 0.014607794914013048, + "acc_norm": 0.5392491467576792, + "acc_norm_stderr": 0.014566303676636583 + }, + "harness|hellaswag|10": { + "acc": 0.606652061342362, + "acc_stderr": 0.004874945833947072, + "acc_norm": 0.8066122286397132, + "acc_norm_stderr": 0.003941471781664184 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.28, + "acc_stderr": 0.045126085985421296, + "acc_norm": 0.28, + "acc_norm_stderr": 0.045126085985421296 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.4666666666666667, + "acc_stderr": 0.043097329010363554, + "acc_norm": 0.4666666666666667, + "acc_norm_stderr": 0.043097329010363554 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.5263157894736842, + "acc_stderr": 0.04063302731486671, + "acc_norm": 0.5263157894736842, + "acc_norm_stderr": 0.04063302731486671 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.46, + "acc_stderr": 0.05009082659620333, + "acc_norm": 0.46, + "acc_norm_stderr": 0.05009082659620333 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.5547169811320755, + "acc_stderr": 0.030588052974270658, + "acc_norm": 0.5547169811320755, + "acc_norm_stderr": 0.030588052974270658 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.5972222222222222, + 
"acc_stderr": 0.04101405519842426, + "acc_norm": 0.5972222222222222, + "acc_norm_stderr": 0.04101405519842426 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.35, + "acc_stderr": 0.04793724854411019, + "acc_norm": 0.35, + "acc_norm_stderr": 0.04793724854411019 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.48, + "acc_stderr": 0.050211673156867795, + "acc_norm": 0.48, + "acc_norm_stderr": 0.050211673156867795 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.34, + "acc_stderr": 0.04760952285695236, + "acc_norm": 0.34, + "acc_norm_stderr": 0.04760952285695236 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.42196531791907516, + "acc_stderr": 0.0376574669386515, + "acc_norm": 0.42196531791907516, + "acc_norm_stderr": 0.0376574669386515 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.17647058823529413, + "acc_stderr": 0.03793281185307809, + "acc_norm": 0.17647058823529413, + "acc_norm_stderr": 0.03793281185307809 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.66, + "acc_stderr": 0.04760952285695237, + "acc_norm": 0.66, + "acc_norm_stderr": 0.04760952285695237 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.4553191489361702, + "acc_stderr": 0.03255525359340355, + "acc_norm": 0.4553191489361702, + "acc_norm_stderr": 0.03255525359340355 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.37719298245614036, + "acc_stderr": 0.04559522141958216, + "acc_norm": 0.37719298245614036, + "acc_norm_stderr": 0.04559522141958216 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.46206896551724136, + "acc_stderr": 0.04154659671707548, + "acc_norm": 0.46206896551724136, + "acc_norm_stderr": 0.04154659671707548 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.3386243386243386, + "acc_stderr": 0.024373197867983053, + "acc_norm": 0.3386243386243386, + "acc_norm_stderr": 0.024373197867983053 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.373015873015873, + "acc_stderr": 0.04325506042017086, + "acc_norm": 0.373015873015873, + "acc_norm_stderr": 0.04325506042017086 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.33, + "acc_stderr": 0.047258156262526045, + "acc_norm": 0.33, + "acc_norm_stderr": 0.047258156262526045 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.5967741935483871, + "acc_stderr": 0.02790615082604114, + "acc_norm": 0.5967741935483871, + "acc_norm_stderr": 0.02790615082604114 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.35960591133004927, + "acc_stderr": 0.03376458246509567, + "acc_norm": 0.35960591133004927, + "acc_norm_stderr": 0.03376458246509567 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.5, + "acc_stderr": 0.050251890762960605, + "acc_norm": 0.5, + "acc_norm_stderr": 0.050251890762960605 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.7212121212121212, + "acc_stderr": 0.03501438706296781, + "acc_norm": 0.7212121212121212, + "acc_norm_stderr": 0.03501438706296781 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.6818181818181818, + "acc_stderr": 0.0331847733384533, + "acc_norm": 0.6818181818181818, + "acc_norm_stderr": 0.0331847733384533 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.7512953367875648, + "acc_stderr": 0.031195840877700293, + "acc_norm": 0.7512953367875648, + "acc_norm_stderr": 0.031195840877700293 + }, + 
"harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.5205128205128206, + "acc_stderr": 0.02532966316348994, + "acc_norm": 0.5205128205128206, + "acc_norm_stderr": 0.02532966316348994 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.27037037037037037, + "acc_stderr": 0.02708037281514566, + "acc_norm": 0.27037037037037037, + "acc_norm_stderr": 0.02708037281514566 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.5756302521008403, + "acc_stderr": 0.032104790510157764, + "acc_norm": 0.5756302521008403, + "acc_norm_stderr": 0.032104790510157764 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.33112582781456956, + "acc_stderr": 0.038425817186598696, + "acc_norm": 0.33112582781456956, + "acc_norm_stderr": 0.038425817186598696 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.7211009174311926, + "acc_stderr": 0.0192274688764635, + "acc_norm": 0.7211009174311926, + "acc_norm_stderr": 0.0192274688764635 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.36574074074074076, + "acc_stderr": 0.03284738857647207, + "acc_norm": 0.36574074074074076, + "acc_norm_stderr": 0.03284738857647207 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.6911764705882353, + "acc_stderr": 0.03242661719827218, + "acc_norm": 0.6911764705882353, + "acc_norm_stderr": 0.03242661719827218 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.759493670886076, + "acc_stderr": 0.027820781981149685, + "acc_norm": 0.759493670886076, + "acc_norm_stderr": 0.027820781981149685 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.5739910313901345, + "acc_stderr": 0.033188332862172806, + "acc_norm": 0.5739910313901345, + "acc_norm_stderr": 0.033188332862172806 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.6183206106870229, + "acc_stderr": 0.04260735157644561, + "acc_norm": 0.6183206106870229, + "acc_norm_stderr": 0.04260735157644561 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.6859504132231405, + "acc_stderr": 0.042369647530410184, + "acc_norm": 0.6859504132231405, + "acc_norm_stderr": 0.042369647530410184 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.7037037037037037, + "acc_stderr": 0.044143436668549335, + "acc_norm": 0.7037037037037037, + "acc_norm_stderr": 0.044143436668549335 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.6380368098159509, + "acc_stderr": 0.037757007291414416, + "acc_norm": 0.6380368098159509, + "acc_norm_stderr": 0.037757007291414416 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.44642857142857145, + "acc_stderr": 0.047184714852195886, + "acc_norm": 0.44642857142857145, + "acc_norm_stderr": 0.047184714852195886 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.6990291262135923, + "acc_stderr": 0.04541609446503948, + "acc_norm": 0.6990291262135923, + "acc_norm_stderr": 0.04541609446503948 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.7692307692307693, + "acc_stderr": 0.027601921381417586, + "acc_norm": 0.7692307692307693, + "acc_norm_stderr": 0.027601921381417586 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.51, + "acc_stderr": 0.05024183937956913, + "acc_norm": 0.51, + "acc_norm_stderr": 0.05024183937956913 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.7139208173690932, + "acc_stderr": 0.016160871405127546, + "acc_norm": 0.7139208173690932, + "acc_norm_stderr": 0.016160871405127546 + }, + 
"harness|hendrycksTest-moral_disputes|5": { + "acc": 0.5953757225433526, + "acc_stderr": 0.026424816594009845, + "acc_norm": 0.5953757225433526, + "acc_norm_stderr": 0.026424816594009845 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.3307262569832402, + "acc_stderr": 0.01573502625896612, + "acc_norm": 0.3307262569832402, + "acc_norm_stderr": 0.01573502625896612 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.5686274509803921, + "acc_stderr": 0.028358956313423556, + "acc_norm": 0.5686274509803921, + "acc_norm_stderr": 0.028358956313423556 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.6205787781350482, + "acc_stderr": 0.02755994980234782, + "acc_norm": 0.6205787781350482, + "acc_norm_stderr": 0.02755994980234782 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.5740740740740741, + "acc_stderr": 0.027513747284379428, + "acc_norm": 0.5740740740740741, + "acc_norm_stderr": 0.027513747284379428 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.42907801418439717, + "acc_stderr": 0.029525914302558562, + "acc_norm": 0.42907801418439717, + "acc_norm_stderr": 0.029525914302558562 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.39960886571056065, + "acc_stderr": 0.012510181636960672, + "acc_norm": 0.39960886571056065, + "acc_norm_stderr": 0.012510181636960672 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.49264705882352944, + "acc_stderr": 0.030369552523902173, + "acc_norm": 0.49264705882352944, + "acc_norm_stderr": 0.030369552523902173 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.5212418300653595, + "acc_stderr": 0.020209572388600244, + "acc_norm": 0.5212418300653595, + "acc_norm_stderr": 0.020209572388600244 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.6363636363636364, + "acc_stderr": 0.046075820907199756, + "acc_norm": 0.6363636363636364, + "acc_norm_stderr": 0.046075820907199756 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.6081632653061224, + "acc_stderr": 0.03125127591089165, + "acc_norm": 0.6081632653061224, + "acc_norm_stderr": 0.03125127591089165 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.7562189054726368, + "acc_stderr": 0.030360490154014635, + "acc_norm": 0.7562189054726368, + "acc_norm_stderr": 0.030360490154014635 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.77, + "acc_stderr": 0.042295258468165065, + "acc_norm": 0.77, + "acc_norm_stderr": 0.042295258468165065 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.42168674698795183, + "acc_stderr": 0.03844453181770917, + "acc_norm": 0.42168674698795183, + "acc_norm_stderr": 0.03844453181770917 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.7426900584795322, + "acc_stderr": 0.03352799844161865, + "acc_norm": 0.7426900584795322, + "acc_norm_stderr": 0.03352799844161865 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.2962056303549572, + "mc1_stderr": 0.015983595101811392, + "mc2": 0.4383598440987917, + "mc2_stderr": 0.01533631770650821 + }, + "all": { + "acc": 0.5328622443733009, + "acc_stderr": 0.03469472349445629, + "acc_norm": 0.5367286369815107, + "acc_norm_stderr": 0.03467819865903834, + "mc1": 0.2962056303549572, + "mc1_stderr": 0.015983595101811392, + "mc2": 0.4383598440987917, + "mc2_stderr": 0.01533631770650821 + } + }, + "versions": { + "harness|arc:challenge|25": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + 
"harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "all": 0 + }, + "config": { + "model_name": "TheBloke/tulu-13B-fp16", + "model_sha": "532aeb363b0ceee155b3cf9479ef635b797cee7c", + "model_dtype": "torch.float16", + "lighteval_sha": "43cff840721bd0214adb4e29236a5e2ca1813937", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null + }, + "task_config": { + "harness|arc:challenge": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + 
"harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task" + }, + "hashes": { + "harness|arc:challenge|25": { + "hash_examples": "fb8c51b1872daeda", + "hash_full_prompts": "045cbb916e5145c6", + 
"hash_input_tokens": "2b0e07d4cdd3b0fe", + "hash_cont_tokens": "8210decc6ff6f7df" + }, + "harness|hellaswag|10": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "578edd77107cb2c3", + "hash_cont_tokens": "b3b9e9017afa63af" + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "6a95a1511f8da075", + "hash_cont_tokens": "50421e30bef398f9" + }, + "harness|hendrycksTest-anatomy|5": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "24a78edc4d9a93aa", + "hash_cont_tokens": "f11971a765cb609f" + }, + "harness|hendrycksTest-astronomy|5": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "b11106668d6c0974", + "hash_cont_tokens": "bf30e5d3f48250cb" + }, + "harness|hendrycksTest-business_ethics|5": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "10180ba12a075cb0", + "hash_cont_tokens": "bc1dd9b2d995eb61" + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "73351ef4968750a2", + "hash_cont_tokens": "890a119624b3b935" + }, + "harness|hendrycksTest-college_biology|5": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "a539150af234c668", + "hash_cont_tokens": "875cde3af7a0ee14" + }, + "harness|hendrycksTest-college_chemistry|5": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "52e12e5a43bcee35", + "hash_cont_tokens": "50421e30bef398f9" + }, + "harness|hendrycksTest-college_computer_science|5": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "d1f3721a5659f7ee", + "hash_cont_tokens": "ffc0fe414cdc4a83" + }, + "harness|hendrycksTest-college_mathematics|5": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "f2d78f546b5595c2", + "hash_cont_tokens": "50421e30bef398f9" + }, + "harness|hendrycksTest-college_medicine|5": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "c9cc19179f63d1d6", + "hash_cont_tokens": "1f88b00d41957d82" + }, + "harness|hendrycksTest-college_physics|5": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "5046144e67e992e8", + "hash_cont_tokens": "f7b8097afc16a47c" + }, + "harness|hendrycksTest-computer_security|5": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "4b14581ba4fc06fc", + "hash_cont_tokens": "50421e30bef398f9" + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "1ee52c413b5b4cc4", + "hash_cont_tokens": "aa0e8bc655f2f641" + }, + "harness|hendrycksTest-econometrics|5": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "2914077c4dd3090a", + "hash_cont_tokens": "bfb7e3c3c88313f1" + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "0f88a874342378de", + "hash_cont_tokens": 
"2425a3f084a591ef" + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "9889933f1dd02a23", + "hash_cont_tokens": "f52691aef15a407b" + }, + "harness|hendrycksTest-formal_logic|5": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "dc309a94c4bfdd2f", + "hash_cont_tokens": "f515d598d9c21263" + }, + "harness|hendrycksTest-global_facts|5": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "0801a0aebec3ba8c", + "hash_cont_tokens": "50421e30bef398f9" + }, + "harness|hendrycksTest-high_school_biology|5": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "5bc4aca8831d9c05", + "hash_cont_tokens": "bd85a4156a3613ee" + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "b92bd6b06fc3464c", + "hash_cont_tokens": "a95c97af1c14e068" + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "a549346cde8165e9", + "hash_cont_tokens": "8abfedef914e33c9" + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "f1f73dd687da18d7", + "hash_cont_tokens": "674fc454bdc5ac93" + }, + "harness|hendrycksTest-high_school_geography|5": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "e7e9cf91f9d6a081", + "hash_cont_tokens": "03a5012b916274ea" + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "a61a1670f854d9e1", + "hash_cont_tokens": "a83effb8f76b7d7c" + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "8a77cb7763f28110", + "hash_cont_tokens": "c583432ad27fcfe0" + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "fcfcfae391f8faa1", + "hash_cont_tokens": "24f5dc613660300b" + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "a29454cc1feb23ef", + "hash_cont_tokens": "f47f041de50333b9" + }, + "harness|hendrycksTest-high_school_physics|5": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "b6734a25556d75dc", + "hash_cont_tokens": "ba2efcd283e938cc" + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "5720438e29473426", + "hash_cont_tokens": "942069cd363844d9" + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "486321d5858de240", + "hash_cont_tokens": "955ed42b6f7fa019" + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + 
"hash_input_tokens": "50c9ff438c85a69e", + "hash_cont_tokens": "cdd0b3dc06d933e5" + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "473919e64d1b8c80", + "hash_cont_tokens": "9a864184946033ac" + }, + "harness|hendrycksTest-human_aging|5": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "47a65c81fd7ed010", + "hash_cont_tokens": "142a4a8a1138a214" + }, + "harness|hendrycksTest-human_sexuality|5": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "aedfcd41cbd2fcc9", + "hash_cont_tokens": "bc54813e809b796d" + }, + "harness|hendrycksTest-international_law|5": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "ed5f2414144d7b72", + "hash_cont_tokens": "dc45b45fcda18e5d" + }, + "harness|hendrycksTest-jurisprudence|5": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "692eaacb5b747264", + "hash_cont_tokens": "e3a8cd951b6e3469" + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "2cbce4edca937588", + "hash_cont_tokens": "1e80dbd30f6453d5" + }, + "harness|hendrycksTest-machine_learning|5": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "c2f38b19bab1aa2c", + "hash_cont_tokens": "9b37da7777378ca9" + }, + "harness|hendrycksTest-management|5": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "fde277bc547bc3d8", + "hash_cont_tokens": "a01d6d39a83c4597" + }, + "harness|hendrycksTest-marketing|5": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "87b232bbebce39db", + "hash_cont_tokens": "6aeaed4d823c98aa" + }, + "harness|hendrycksTest-medical_genetics|5": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "58c21af9da3e126e", + "hash_cont_tokens": "50421e30bef398f9" + }, + "harness|hendrycksTest-miscellaneous|5": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "d1f5c770d368e9c6", + "hash_cont_tokens": "9b0ab02a64603081" + }, + "harness|hendrycksTest-moral_disputes|5": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "98d6db15a50aaa8e", + "hash_cont_tokens": "8badf768f7b0467a" + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "2aabd8c7337502f8", + "hash_cont_tokens": "32ae620376b2bbba" + }, + "harness|hendrycksTest-nutrition|5": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "17f8c8f2d4a0a9b1", + "hash_cont_tokens": "3071def75bacc404" + }, + "harness|hendrycksTest-philosophy|5": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "dfc6df491d991966", + "hash_cont_tokens": "9f6ff69d23a48783" + }, + "harness|hendrycksTest-prehistory|5": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "cffe8139e00da9dd", + "hash_cont_tokens": "de469d2b981e32a3" + }, 
+ "harness|hendrycksTest-professional_accounting|5": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "4a69ed6ee55918fb", + "hash_cont_tokens": "c46f74d2dfc7b13b" + }, + "harness|hendrycksTest-professional_law|5": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "6cc713f12b5890de", + "hash_cont_tokens": "2e590029ef41fbcd" + }, + "harness|hendrycksTest-professional_medicine|5": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "b4044fc92756c377", + "hash_cont_tokens": "fe35cfa9c6ca802e" + }, + "harness|hendrycksTest-professional_psychology|5": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "b019784da8db089a", + "hash_cont_tokens": "f020fbddf72c8652" + }, + "harness|hendrycksTest-public_relations|5": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "f47f37c7c9bfc601", + "hash_cont_tokens": "568f585a259965c1" + }, + "harness|hendrycksTest-security_studies|5": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "4d282718d6142410", + "hash_cont_tokens": "cc6fd7cccd64cd5d" + }, + "harness|hendrycksTest-sociology|5": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "fbc6026e500537bc", + "hash_cont_tokens": "c3a3bdfd177eed5b" + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "150dd1ff81ff642e", + "hash_cont_tokens": "2568d0e8e36fa959" + }, + "harness|hendrycksTest-virology|5": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "fcbac3e735545969", + "hash_cont_tokens": "926cf60b0891f374" + }, + "harness|hendrycksTest-world_religions|5": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "ffc962a38441ef13", + "hash_cont_tokens": "c525a5de974c1ea3" + }, + "harness|truthfulqa:mc|0": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "9ffb65d225ae550f", + "hash_cont_tokens": "c014154380b74b9e" + } + } +} \ No newline at end of file diff --git a/eval-results/TheBloke/tulu-13B-fp16/results_2023-10-22T17-51-25.855725.json b/eval-results/TheBloke/tulu-13B-fp16/results_2023-10-22T17-51-25.855725.json new file mode 100644 index 0000000000000000000000000000000000000000..d35bdabffc84be83c22ce44f3347e717ba1dd63e --- /dev/null +++ b/eval-results/TheBloke/tulu-13B-fp16/results_2023-10-22T17-51-25.855725.json @@ -0,0 +1,107 @@ +{ + "config_general": { + "model_name": "TheBloke/tulu-13B-fp16", + "model_sha": "532aeb363b0ceee155b3cf9479ef635b797cee7c", + "model_size": "24.28 GB", + "model_dtype": "torch.float16", + "lighteval_sha": "0f318ecf002208468154899217b3ba7c6ae09374", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|drop|3": { + "em": 0.33001258389261745, + "em_stderr": 0.004815464931125239, + "f1": 0.367210570469799, + "f1_stderr": 0.004753724357053633 + }, + "harness|gsm8k|5": { + "acc": 0.1425322213798332, + "acc_stderr": 0.009629588445673824 + }, + "harness|winogrande|5": { + "acc": 0.7561168113654302, + "acc_stderr": 0.0120689232789082 
+ }, + "all": { + "em": 0.33001258389261745, + "em_stderr": 0.004815464931125239, + "f1": 0.367210570469799, + "f1_stderr": 0.004753724357053633, + "acc": 0.4493245163726317, + "acc_stderr": 0.010849255862291012 + } + }, + "versions": { + "harness|drop|3": 1, + "harness|gsm8k|5": 0, + "harness|winogrande|5": 0, + "all": 0 + }, + "config_tasks": { + "harness|drop": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|drop|3": { + "hashes": { + "hash_examples": "1d27416e8324e9a3", + "hash_full_prompts": "a5513ff9a741b385", + "hash_input_tokens": "61b608e0b5ceed76", + "hash_cont_tokens": "c7d2a4da187dba20" + }, + "truncated": 1263, + "non-truncated": 8273, + "padded": 0, + "non-padded": 9536, + "effective_few_shots": 3.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "bda342e47b5099b2", + "hash_cont_tokens": "09271fe3bbe6fcc3" + }, + "truncated": 0, + "non-truncated": 1319, + "padded": 0, + "non-padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "c0bedf98cb040854", + "hash_cont_tokens": "f08975ad6f2d5864" + }, + "truncated": 0, + "non-truncated": 2534, + "padded": 2432, + "non-padded": 102, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "9b4d8993161e637d", + "hash_full_prompts": "08215e527b7e60a5", + "hash_input_tokens": "80afe720f936f8d2", + "hash_cont_tokens": "d95e03c7537cf693" + }, + "total_evaluation_time_secondes": "7140.315663576126", + "truncated": 1263, + "non-truncated": 12126, + "padded": 2432, + "non-padded": 10957, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/TheBloke/tulu-30B-fp16/results_2023-08-21T16-27-25.217456.json b/eval-results/TheBloke/tulu-30B-fp16/results_2023-08-21T16-27-25.217456.json new file mode 100644 index 0000000000000000000000000000000000000000..d585f4714055922bd99ad5e1d60ae167fa026db9 --- /dev/null +++ b/eval-results/TheBloke/tulu-30B-fp16/results_2023-08-21T16-27-25.217456.json @@ -0,0 +1,1365 @@ +{ + "results": { + "harness|arc:challenge|25": { + "acc": 0.5563139931740614, + "acc_stderr": 0.014518421825670454, + "acc_norm": 0.5998293515358362, + "acc_norm_stderr": 0.014317197787809183 + }, + "harness|hellaswag|10": { + "acc": 0.6341366261700856, + "acc_stderr": 0.0048068702857472926, + "acc_norm": 0.8339972117108145, + "acc_norm_stderr": 0.003713227064225385 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.25, + "acc_stderr": 0.04351941398892446, + "acc_norm": 0.25, + "acc_norm_stderr": 0.04351941398892446 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.5185185185185185, + "acc_stderr": 0.043163785995113245, + "acc_norm": 0.5185185185185185, + "acc_norm_stderr": 0.043163785995113245 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.5921052631578947, + "acc_stderr": 0.039993097127774734, + "acc_norm": 0.5921052631578947, + "acc_norm_stderr": 0.039993097127774734 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.56, + "acc_stderr": 0.04988876515698589, + "acc_norm": 0.56, + "acc_norm_stderr": 0.04988876515698589 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.5773584905660377, + "acc_stderr": 
0.030402331445769544, + "acc_norm": 0.5773584905660377, + "acc_norm_stderr": 0.030402331445769544 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.6180555555555556, + "acc_stderr": 0.040629907841466674, + "acc_norm": 0.6180555555555556, + "acc_norm_stderr": 0.040629907841466674 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.36, + "acc_stderr": 0.04824181513244218, + "acc_norm": 0.36, + "acc_norm_stderr": 0.04824181513244218 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.37, + "acc_stderr": 0.048523658709391, + "acc_norm": 0.37, + "acc_norm_stderr": 0.048523658709391 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.32, + "acc_stderr": 0.04688261722621505, + "acc_norm": 0.32, + "acc_norm_stderr": 0.04688261722621505 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.5086705202312138, + "acc_stderr": 0.03811890988940412, + "acc_norm": 0.5086705202312138, + "acc_norm_stderr": 0.03811890988940412 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.27450980392156865, + "acc_stderr": 0.044405219061793275, + "acc_norm": 0.27450980392156865, + "acc_norm_stderr": 0.044405219061793275 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.7, + "acc_stderr": 0.046056618647183814, + "acc_norm": 0.7, + "acc_norm_stderr": 0.046056618647183814 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.4808510638297872, + "acc_stderr": 0.03266204299064678, + "acc_norm": 0.4808510638297872, + "acc_norm_stderr": 0.03266204299064678 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.2982456140350877, + "acc_stderr": 0.04303684033537314, + "acc_norm": 0.2982456140350877, + "acc_norm_stderr": 0.04303684033537314 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.4413793103448276, + "acc_stderr": 0.04137931034482757, + "acc_norm": 0.4413793103448276, + "acc_norm_stderr": 0.04137931034482757 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.3201058201058201, + "acc_stderr": 0.024026846392873502, + "acc_norm": 0.3201058201058201, + "acc_norm_stderr": 0.024026846392873502 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.3333333333333333, + "acc_stderr": 0.042163702135578345, + "acc_norm": 0.3333333333333333, + "acc_norm_stderr": 0.042163702135578345 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.36, + "acc_stderr": 0.04824181513244218, + "acc_norm": 0.36, + "acc_norm_stderr": 0.04824181513244218 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.6419354838709678, + "acc_stderr": 0.027273890594300645, + "acc_norm": 0.6419354838709678, + "acc_norm_stderr": 0.027273890594300645 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.4039408866995074, + "acc_stderr": 0.03452453903822039, + "acc_norm": 0.4039408866995074, + "acc_norm_stderr": 0.03452453903822039 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.6, + "acc_stderr": 0.049236596391733084, + "acc_norm": 0.6, + "acc_norm_stderr": 0.049236596391733084 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.7757575757575758, + "acc_stderr": 0.03256866661681102, + "acc_norm": 0.7757575757575758, + "acc_norm_stderr": 0.03256866661681102 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.702020202020202, + "acc_stderr": 0.03258630383836556, + "acc_norm": 0.702020202020202, + "acc_norm_stderr": 0.03258630383836556 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + 
"acc": 0.7668393782383419, + "acc_stderr": 0.03051611137147602, + "acc_norm": 0.7668393782383419, + "acc_norm_stderr": 0.03051611137147602 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.5307692307692308, + "acc_stderr": 0.025302958890850154, + "acc_norm": 0.5307692307692308, + "acc_norm_stderr": 0.025302958890850154 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.2851851851851852, + "acc_stderr": 0.027528599210340492, + "acc_norm": 0.2851851851851852, + "acc_norm_stderr": 0.027528599210340492 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.5798319327731093, + "acc_stderr": 0.03206183783236153, + "acc_norm": 0.5798319327731093, + "acc_norm_stderr": 0.03206183783236153 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.304635761589404, + "acc_stderr": 0.03757949922943343, + "acc_norm": 0.304635761589404, + "acc_norm_stderr": 0.03757949922943343 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.7889908256880734, + "acc_stderr": 0.01749392240411265, + "acc_norm": 0.7889908256880734, + "acc_norm_stderr": 0.01749392240411265 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.42592592592592593, + "acc_stderr": 0.03372343271653063, + "acc_norm": 0.42592592592592593, + "acc_norm_stderr": 0.03372343271653063 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.8235294117647058, + "acc_stderr": 0.026756401538078962, + "acc_norm": 0.8235294117647058, + "acc_norm_stderr": 0.026756401538078962 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.8185654008438819, + "acc_stderr": 0.025085961144579654, + "acc_norm": 0.8185654008438819, + "acc_norm_stderr": 0.025085961144579654 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.6322869955156951, + "acc_stderr": 0.03236198350928275, + "acc_norm": 0.6322869955156951, + "acc_norm_stderr": 0.03236198350928275 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.6641221374045801, + "acc_stderr": 0.041423137719966634, + "acc_norm": 0.6641221374045801, + "acc_norm_stderr": 0.041423137719966634 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.743801652892562, + "acc_stderr": 0.03984979653302872, + "acc_norm": 0.743801652892562, + "acc_norm_stderr": 0.03984979653302872 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.7407407407407407, + "acc_stderr": 0.04236511258094633, + "acc_norm": 0.7407407407407407, + "acc_norm_stderr": 0.04236511258094633 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.6993865030674846, + "acc_stderr": 0.03602511318806771, + "acc_norm": 0.6993865030674846, + "acc_norm_stderr": 0.03602511318806771 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.39285714285714285, + "acc_stderr": 0.046355501356099754, + "acc_norm": 0.39285714285714285, + "acc_norm_stderr": 0.046355501356099754 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.7087378640776699, + "acc_stderr": 0.044986763205729224, + "acc_norm": 0.7087378640776699, + "acc_norm_stderr": 0.044986763205729224 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.8504273504273504, + "acc_stderr": 0.023365051491753715, + "acc_norm": 0.8504273504273504, + "acc_norm_stderr": 0.023365051491753715 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.53, + "acc_stderr": 0.05016135580465919, + "acc_norm": 0.53, + "acc_norm_stderr": 0.05016135580465919 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.7624521072796935, + 
"acc_stderr": 0.015218733046150193, + "acc_norm": 0.7624521072796935, + "acc_norm_stderr": 0.015218733046150193 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.6069364161849711, + "acc_stderr": 0.026296227915613674, + "acc_norm": 0.6069364161849711, + "acc_norm_stderr": 0.026296227915613674 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.19441340782122904, + "acc_stderr": 0.013235808096742276, + "acc_norm": 0.19441340782122904, + "acc_norm_stderr": 0.013235808096742276 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.5980392156862745, + "acc_stderr": 0.02807415894760066, + "acc_norm": 0.5980392156862745, + "acc_norm_stderr": 0.02807415894760066 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.639871382636656, + "acc_stderr": 0.027264297599804015, + "acc_norm": 0.639871382636656, + "acc_norm_stderr": 0.027264297599804015 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.6481481481481481, + "acc_stderr": 0.026571483480719964, + "acc_norm": 0.6481481481481481, + "acc_norm_stderr": 0.026571483480719964 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.46099290780141844, + "acc_stderr": 0.029736592526424438, + "acc_norm": 0.46099290780141844, + "acc_norm_stderr": 0.029736592526424438 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.44132985658409385, + "acc_stderr": 0.01268201633564667, + "acc_norm": 0.44132985658409385, + "acc_norm_stderr": 0.01268201633564667 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.5808823529411765, + "acc_stderr": 0.029972807170464622, + "acc_norm": 0.5808823529411765, + "acc_norm_stderr": 0.029972807170464622 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.5833333333333334, + "acc_stderr": 0.01994491413687358, + "acc_norm": 0.5833333333333334, + "acc_norm_stderr": 0.01994491413687358 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.6636363636363637, + "acc_stderr": 0.04525393596302506, + "acc_norm": 0.6636363636363637, + "acc_norm_stderr": 0.04525393596302506 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.6612244897959184, + "acc_stderr": 0.030299506562154185, + "acc_norm": 0.6612244897959184, + "acc_norm_stderr": 0.030299506562154185 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.7512437810945274, + "acc_stderr": 0.030567675938916718, + "acc_norm": 0.7512437810945274, + "acc_norm_stderr": 0.030567675938916718 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.81, + "acc_stderr": 0.03942772444036625, + "acc_norm": 0.81, + "acc_norm_stderr": 0.03942772444036625 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.4879518072289157, + "acc_stderr": 0.0389136449583582, + "acc_norm": 0.4879518072289157, + "acc_norm_stderr": 0.0389136449583582 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.7953216374269005, + "acc_stderr": 0.030944459778533197, + "acc_norm": 0.7953216374269005, + "acc_norm_stderr": 0.030944459778533197 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.2974296205630355, + "mc1_stderr": 0.016002651487361005, + "mc2": 0.4514200961025914, + "mc2_stderr": 0.014896406115690652 + }, + "all": { + "acc": 0.562197435739877, + "acc_stderr": 0.03396946631813126, + "acc_norm": 0.5663224517382246, + "acc_norm_stderr": 0.03394751941542985, + "mc1": 0.2974296205630355, + "mc1_stderr": 0.016002651487361005, + "mc2": 0.4514200961025914, + "mc2_stderr": 0.014896406115690652 + } + }, + "versions": { + "harness|arc:challenge|25": 0, + "harness|hellaswag|10": 0, + 
"harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "all": 0 + }, + "config_general": { + "model_name": "TheBloke/tulu-30B-fp16", + "model_sha": "37c3655676c37662f60c68dacfce3f0e861be846", + "model_dtype": "torch.float16", + "lighteval_sha": "9a6ba3212080b87510982c30fdec55b87dcab0c7", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null + }, + "config_tasks": { + "harness|arc:challenge": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + 
"harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task" + }, + "summary_tasks": 
{ + "harness|arc:challenge|25": { + "hashes": { + "hash_examples": "17b0cae357c0259e", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "2b0e07d4cdd3b0fe", + "hash_cont_tokens": "8210decc6ff6f7df" + }, + "truncated": 0, + "non-truncated": 4687, + "padded": 4687, + "non-padded": 0, + "effective_few_shots": 25.0, + "num_truncated_few_shots": 0 + }, + "harness|hellaswag|10": { + "hashes": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "578edd77107cb2c3", + "hash_cont_tokens": "b3b9e9017afa63af" + }, + "truncated": 0, + "non-truncated": 40168, + "padded": 40113, + "non-padded": 55, + "effective_few_shots": 10.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hashes": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "6a95a1511f8da075", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-anatomy|5": { + "hashes": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "24a78edc4d9a93aa", + "hash_cont_tokens": "f11971a765cb609f" + }, + "truncated": 0, + "non-truncated": 540, + "padded": 540, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-astronomy|5": { + "hashes": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "b11106668d6c0974", + "hash_cont_tokens": "bf30e5d3f48250cb" + }, + "truncated": 0, + "non-truncated": 608, + "padded": 608, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-business_ethics|5": { + "hashes": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "10180ba12a075cb0", + "hash_cont_tokens": "bc1dd9b2d995eb61" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hashes": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "73351ef4968750a2", + "hash_cont_tokens": "890a119624b3b935" + }, + "truncated": 0, + "non-truncated": 1060, + "padded": 1060, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_biology|5": { + "hashes": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "a539150af234c668", + "hash_cont_tokens": "875cde3af7a0ee14" + }, + "truncated": 0, + "non-truncated": 576, + "padded": 576, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_chemistry|5": { + "hashes": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "52e12e5a43bcee35", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_computer_science|5": { + "hashes": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "d1f3721a5659f7ee", + 
"hash_cont_tokens": "ffc0fe414cdc4a83" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_mathematics|5": { + "hashes": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "f2d78f546b5595c2", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_medicine|5": { + "hashes": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "c9cc19179f63d1d6", + "hash_cont_tokens": "1f88b00d41957d82" + }, + "truncated": 0, + "non-truncated": 692, + "padded": 692, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_physics|5": { + "hashes": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "5046144e67e992e8", + "hash_cont_tokens": "f7b8097afc16a47c" + }, + "truncated": 0, + "non-truncated": 408, + "padded": 408, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-computer_security|5": { + "hashes": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "4b14581ba4fc06fc", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hashes": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "1ee52c413b5b4cc4", + "hash_cont_tokens": "aa0e8bc655f2f641" + }, + "truncated": 0, + "non-truncated": 940, + "padded": 940, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-econometrics|5": { + "hashes": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "2914077c4dd3090a", + "hash_cont_tokens": "bfb7e3c3c88313f1" + }, + "truncated": 0, + "non-truncated": 456, + "padded": 456, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hashes": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "0f88a874342378de", + "hash_cont_tokens": "2425a3f084a591ef" + }, + "truncated": 0, + "non-truncated": 580, + "padded": 580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hashes": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "9889933f1dd02a23", + "hash_cont_tokens": "f52691aef15a407b" + }, + "truncated": 0, + "non-truncated": 1512, + "padded": 1512, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-formal_logic|5": { + "hashes": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "dc309a94c4bfdd2f", + "hash_cont_tokens": "f515d598d9c21263" + }, + "truncated": 0, + "non-truncated": 504, + "padded": 504, + "non-padded": 0, + "effective_few_shots": 
5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-global_facts|5": { + "hashes": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "0801a0aebec3ba8c", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_biology|5": { + "hashes": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "5bc4aca8831d9c05", + "hash_cont_tokens": "bd85a4156a3613ee" + }, + "truncated": 0, + "non-truncated": 1240, + "padded": 1240, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hashes": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "b92bd6b06fc3464c", + "hash_cont_tokens": "a95c97af1c14e068" + }, + "truncated": 0, + "non-truncated": 812, + "padded": 812, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hashes": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "a549346cde8165e9", + "hash_cont_tokens": "8abfedef914e33c9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hashes": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "f1f73dd687da18d7", + "hash_cont_tokens": "674fc454bdc5ac93" + }, + "truncated": 660, + "non-truncated": 0, + "padded": 0, + "non-padded": 660, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_geography|5": { + "hashes": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "e7e9cf91f9d6a081", + "hash_cont_tokens": "03a5012b916274ea" + }, + "truncated": 0, + "non-truncated": 792, + "padded": 792, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hashes": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "a61a1670f854d9e1", + "hash_cont_tokens": "a83effb8f76b7d7c" + }, + "truncated": 0, + "non-truncated": 772, + "padded": 772, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hashes": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "8a77cb7763f28110", + "hash_cont_tokens": "c583432ad27fcfe0" + }, + "truncated": 0, + "non-truncated": 1560, + "padded": 1560, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hashes": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "fcfcfae391f8faa1", + "hash_cont_tokens": "24f5dc613660300b" + }, + "truncated": 0, + "non-truncated": 1080, + "padded": 1080, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + 
"harness|hendrycksTest-high_school_microeconomics|5": { + "hashes": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "a29454cc1feb23ef", + "hash_cont_tokens": "f47f041de50333b9" + }, + "truncated": 0, + "non-truncated": 952, + "padded": 952, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_physics|5": { + "hashes": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "b6734a25556d75dc", + "hash_cont_tokens": "ba2efcd283e938cc" + }, + "truncated": 0, + "non-truncated": 604, + "padded": 604, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hashes": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "5720438e29473426", + "hash_cont_tokens": "942069cd363844d9" + }, + "truncated": 0, + "non-truncated": 2180, + "padded": 2180, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hashes": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "486321d5858de240", + "hash_cont_tokens": "955ed42b6f7fa019" + }, + "truncated": 0, + "non-truncated": 864, + "padded": 864, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hashes": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "50c9ff438c85a69e", + "hash_cont_tokens": "cdd0b3dc06d933e5" + }, + "truncated": 816, + "non-truncated": 0, + "padded": 0, + "non-padded": 816, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hashes": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "473919e64d1b8c80", + "hash_cont_tokens": "9a864184946033ac" + }, + "truncated": 8, + "non-truncated": 940, + "padded": 940, + "non-padded": 8, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_aging|5": { + "hashes": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "47a65c81fd7ed010", + "hash_cont_tokens": "142a4a8a1138a214" + }, + "truncated": 0, + "non-truncated": 892, + "padded": 892, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_sexuality|5": { + "hashes": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "aedfcd41cbd2fcc9", + "hash_cont_tokens": "bc54813e809b796d" + }, + "truncated": 0, + "non-truncated": 524, + "padded": 524, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-international_law|5": { + "hashes": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "ed5f2414144d7b72", + "hash_cont_tokens": "dc45b45fcda18e5d" + }, + "truncated": 0, + "non-truncated": 484, + "padded": 484, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-jurisprudence|5": { + "hashes": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": 
"0f89ee3fe03d6a21", + "hash_input_tokens": "692eaacb5b747264", + "hash_cont_tokens": "e3a8cd951b6e3469" + }, + "truncated": 0, + "non-truncated": 432, + "padded": 432, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hashes": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "2cbce4edca937588", + "hash_cont_tokens": "1e80dbd30f6453d5" + }, + "truncated": 0, + "non-truncated": 652, + "padded": 648, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-machine_learning|5": { + "hashes": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "c2f38b19bab1aa2c", + "hash_cont_tokens": "9b37da7777378ca9" + }, + "truncated": 0, + "non-truncated": 448, + "padded": 448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-management|5": { + "hashes": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "fde277bc547bc3d8", + "hash_cont_tokens": "a01d6d39a83c4597" + }, + "truncated": 0, + "non-truncated": 412, + "padded": 412, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-marketing|5": { + "hashes": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "87b232bbebce39db", + "hash_cont_tokens": "6aeaed4d823c98aa" + }, + "truncated": 0, + "non-truncated": 936, + "padded": 936, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-medical_genetics|5": { + "hashes": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "58c21af9da3e126e", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-miscellaneous|5": { + "hashes": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "d1f5c770d368e9c6", + "hash_cont_tokens": "9b0ab02a64603081" + }, + "truncated": 0, + "non-truncated": 3132, + "padded": 3132, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_disputes|5": { + "hashes": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "98d6db15a50aaa8e", + "hash_cont_tokens": "8badf768f7b0467a" + }, + "truncated": 0, + "non-truncated": 1384, + "padded": 1354, + "non-padded": 30, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hashes": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "2aabd8c7337502f8", + "hash_cont_tokens": "32ae620376b2bbba" + }, + "truncated": 0, + "non-truncated": 3580, + "padded": 3580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-nutrition|5": { + "hashes": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "17f8c8f2d4a0a9b1", + "hash_cont_tokens": "3071def75bacc404" + }, + "truncated": 0, + "non-truncated": 1224, + "padded": 1224, + 
"non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-philosophy|5": { + "hashes": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "dfc6df491d991966", + "hash_cont_tokens": "9f6ff69d23a48783" + }, + "truncated": 0, + "non-truncated": 1244, + "padded": 1244, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-prehistory|5": { + "hashes": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "cffe8139e00da9dd", + "hash_cont_tokens": "de469d2b981e32a3" + }, + "truncated": 0, + "non-truncated": 1296, + "padded": 1296, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_accounting|5": { + "hashes": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "4a69ed6ee55918fb", + "hash_cont_tokens": "c46f74d2dfc7b13b" + }, + "truncated": 0, + "non-truncated": 1128, + "padded": 1128, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_law|5": { + "hashes": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "6cc713f12b5890de", + "hash_cont_tokens": "2e590029ef41fbcd" + }, + "truncated": 604, + "non-truncated": 5532, + "padded": 5524, + "non-padded": 612, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_medicine|5": { + "hashes": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "b4044fc92756c377", + "hash_cont_tokens": "fe35cfa9c6ca802e" + }, + "truncated": 0, + "non-truncated": 1088, + "padded": 1088, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_psychology|5": { + "hashes": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "b019784da8db089a", + "hash_cont_tokens": "f020fbddf72c8652" + }, + "truncated": 0, + "non-truncated": 2448, + "padded": 2448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-public_relations|5": { + "hashes": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "f47f37c7c9bfc601", + "hash_cont_tokens": "568f585a259965c1" + }, + "truncated": 0, + "non-truncated": 440, + "padded": 440, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-security_studies|5": { + "hashes": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "4d282718d6142410", + "hash_cont_tokens": "cc6fd7cccd64cd5d" + }, + "truncated": 0, + "non-truncated": 980, + "padded": 980, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-sociology|5": { + "hashes": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "fbc6026e500537bc", + "hash_cont_tokens": "c3a3bdfd177eed5b" + }, + "truncated": 0, + "non-truncated": 804, + "padded": 804, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hashes": { 
+ "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "150dd1ff81ff642e", + "hash_cont_tokens": "2568d0e8e36fa959" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-virology|5": { + "hashes": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "fcbac3e735545969", + "hash_cont_tokens": "926cf60b0891f374" + }, + "truncated": 0, + "non-truncated": 664, + "padded": 664, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-world_religions|5": { + "hashes": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "ffc962a38441ef13", + "hash_cont_tokens": "c525a5de974c1ea3" + }, + "truncated": 0, + "non-truncated": 684, + "padded": 684, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|truthfulqa:mc|0": { + "hashes": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "9ffb65d225ae550f", + "hash_cont_tokens": "c014154380b74b9e" + }, + "truncated": 0, + "non-truncated": 9996, + "padded": 9996, + "non-padded": 0, + "effective_few_shots": 0.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "d84d18e9a963753d", + "hash_full_prompts": "12b540783521a8e6", + "hash_input_tokens": "1c61d6705b299f5c", + "hash_cont_tokens": "fb1646e2bdd5fc38" + }, + "total_evaluation_time_secondes": "9287.17385149002", + "truncated": 2088, + "non-truncated": 108931, + "padded": 108834, + "non-padded": 2185, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/TheBloke/tulu-30B-fp16/results_2023-10-22T14-05-44.356727.json b/eval-results/TheBloke/tulu-30B-fp16/results_2023-10-22T14-05-44.356727.json new file mode 100644 index 0000000000000000000000000000000000000000..0659537568265c2eea9bce715bc3b2f6dafb76ab --- /dev/null +++ b/eval-results/TheBloke/tulu-30B-fp16/results_2023-10-22T14-05-44.356727.json @@ -0,0 +1,107 @@ +{ + "config_general": { + "model_name": "TheBloke/tulu-30B-fp16", + "model_sha": "37c3655676c37662f60c68dacfce3f0e861be846", + "model_size": "60.65 GB", + "model_dtype": "torch.float16", + "lighteval_sha": "0f318ecf002208468154899217b3ba7c6ae09374", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|drop|3": { + "em": 0.4158976510067114, + "em_stderr": 0.005047512015363023, + "f1": 0.4501331795302018, + "f1_stderr": 0.004938014903871411 + }, + "harness|gsm8k|5": { + "acc": 0.19711902956785443, + "acc_stderr": 0.01095802163030063 + }, + "harness|winogrande|5": { + "acc": 0.8082083662194159, + "acc_stderr": 0.011065209664659527 + }, + "all": { + "em": 0.4158976510067114, + "em_stderr": 0.005047512015363023, + "f1": 0.4501331795302018, + "f1_stderr": 0.004938014903871411, + "acc": 0.5026636978936352, + "acc_stderr": 0.011011615647480079 + } + }, + "versions": { + "harness|drop|3": 1, + "harness|gsm8k|5": 0, + "harness|winogrande|5": 0, + "all": 0 + }, + "config_tasks": { + "harness|drop": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|drop|3": { + "hashes": { + "hash_examples": "1d27416e8324e9a3", + 
"hash_full_prompts": "a5513ff9a741b385", + "hash_input_tokens": "61b608e0b5ceed76", + "hash_cont_tokens": "aefc7e1535a158a4" + }, + "truncated": 1263, + "non-truncated": 8273, + "padded": 0, + "non-padded": 9536, + "effective_few_shots": 3.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "bda342e47b5099b2", + "hash_cont_tokens": "fb748e8d8bfa91dc" + }, + "truncated": 0, + "non-truncated": 1319, + "padded": 0, + "non-padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "c0bedf98cb040854", + "hash_cont_tokens": "f08975ad6f2d5864" + }, + "truncated": 0, + "non-truncated": 2534, + "padded": 2432, + "non-padded": 102, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "9b4d8993161e637d", + "hash_full_prompts": "08215e527b7e60a5", + "hash_input_tokens": "80afe720f936f8d2", + "hash_cont_tokens": "129b33809f533750" + }, + "total_evaluation_time_secondes": "13137.950061321259", + "truncated": 1263, + "non-truncated": 12126, + "padded": 2432, + "non-padded": 10957, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/TheBloke/tulu-7B-fp16/results_2023-07-19T17-17-47.759549.json b/eval-results/TheBloke/tulu-7B-fp16/results_2023-07-19T17-17-47.759549.json new file mode 100644 index 0000000000000000000000000000000000000000..aa9065a46be57c3dfc2497dbc78569bd72004f36 --- /dev/null +++ b/eval-results/TheBloke/tulu-7B-fp16/results_2023-07-19T17-17-47.759549.json @@ -0,0 +1,871 @@ +{ + "results": { + "harness|arc:challenge|25": { + "acc": 0.46245733788395904, + "acc_stderr": 0.014570144495075578, + "acc_norm": 0.5017064846416383, + "acc_norm_stderr": 0.014611305705056983 + }, + "harness|hellaswag|10": { + "acc": 0.5748854809798845, + "acc_stderr": 0.004933500261683595, + "acc_norm": 0.7703644692292372, + "acc_norm_stderr": 0.004197388626940065 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.24, + "acc_stderr": 0.04292346959909284, + "acc_norm": 0.24, + "acc_norm_stderr": 0.04292346959909284 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.45185185185185184, + "acc_stderr": 0.04299268905480864, + "acc_norm": 0.45185185185185184, + "acc_norm_stderr": 0.04299268905480864 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.47368421052631576, + "acc_stderr": 0.04063302731486671, + "acc_norm": 0.47368421052631576, + "acc_norm_stderr": 0.04063302731486671 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.56, + "acc_stderr": 0.04988876515698589, + "acc_norm": 0.56, + "acc_norm_stderr": 0.04988876515698589 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.49056603773584906, + "acc_stderr": 0.0307673947078081, + "acc_norm": 0.49056603773584906, + "acc_norm_stderr": 0.0307673947078081 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.5, + "acc_stderr": 0.04181210050035455, + "acc_norm": 0.5, + "acc_norm_stderr": 0.04181210050035455 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.3, + "acc_stderr": 0.046056618647183814, + "acc_norm": 0.3, + "acc_norm_stderr": 0.046056618647183814 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.43, + "acc_stderr": 0.049756985195624284, + "acc_norm": 0.43, + 
"acc_norm_stderr": 0.049756985195624284 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.31, + "acc_stderr": 0.04648231987117316, + "acc_norm": 0.31, + "acc_norm_stderr": 0.04648231987117316 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.4277456647398844, + "acc_stderr": 0.03772446857518026, + "acc_norm": 0.4277456647398844, + "acc_norm_stderr": 0.03772446857518026 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.27450980392156865, + "acc_stderr": 0.04440521906179327, + "acc_norm": 0.27450980392156865, + "acc_norm_stderr": 0.04440521906179327 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.64, + "acc_stderr": 0.04824181513244218, + "acc_norm": 0.64, + "acc_norm_stderr": 0.04824181513244218 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.37446808510638296, + "acc_stderr": 0.03163910665367291, + "acc_norm": 0.37446808510638296, + "acc_norm_stderr": 0.03163910665367291 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.2894736842105263, + "acc_stderr": 0.04266339443159394, + "acc_norm": 0.2894736842105263, + "acc_norm_stderr": 0.04266339443159394 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.4068965517241379, + "acc_stderr": 0.040937939812662374, + "acc_norm": 0.4068965517241379, + "acc_norm_stderr": 0.040937939812662374 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.25132275132275134, + "acc_stderr": 0.022340482339643898, + "acc_norm": 0.25132275132275134, + "acc_norm_stderr": 0.022340482339643898 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.30952380952380953, + "acc_stderr": 0.04134913018303316, + "acc_norm": 0.30952380952380953, + "acc_norm_stderr": 0.04134913018303316 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.31, + "acc_stderr": 0.04648231987117316, + "acc_norm": 0.31, + "acc_norm_stderr": 0.04648231987117316 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.4806451612903226, + "acc_stderr": 0.0284226874043121, + "acc_norm": 0.4806451612903226, + "acc_norm_stderr": 0.0284226874043121 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.3103448275862069, + "acc_stderr": 0.03255086769970103, + "acc_norm": 0.3103448275862069, + "acc_norm_stderr": 0.03255086769970103 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.45, + "acc_stderr": 0.05, + "acc_norm": 0.45, + "acc_norm_stderr": 0.05 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.6606060606060606, + "acc_stderr": 0.03697442205031596, + "acc_norm": 0.6606060606060606, + "acc_norm_stderr": 0.03697442205031596 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.5909090909090909, + "acc_stderr": 0.03502975799413007, + "acc_norm": 0.5909090909090909, + "acc_norm_stderr": 0.03502975799413007 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.6735751295336787, + "acc_stderr": 0.033840286211432945, + "acc_norm": 0.6735751295336787, + "acc_norm_stderr": 0.033840286211432945 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.4358974358974359, + "acc_stderr": 0.025141801511177495, + "acc_norm": 0.4358974358974359, + "acc_norm_stderr": 0.025141801511177495 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.2962962962962963, + "acc_stderr": 0.02784081149587192, + "acc_norm": 0.2962962962962963, + "acc_norm_stderr": 0.02784081149587192 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 
0.3907563025210084, + "acc_stderr": 0.031693802357129965, + "acc_norm": 0.3907563025210084, + "acc_norm_stderr": 0.031693802357129965 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.271523178807947, + "acc_stderr": 0.03631329803969653, + "acc_norm": 0.271523178807947, + "acc_norm_stderr": 0.03631329803969653 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.671559633027523, + "acc_stderr": 0.020135902797298412, + "acc_norm": 0.671559633027523, + "acc_norm_stderr": 0.020135902797298412 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.38425925925925924, + "acc_stderr": 0.03317354514310742, + "acc_norm": 0.38425925925925924, + "acc_norm_stderr": 0.03317354514310742 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.6225490196078431, + "acc_stderr": 0.03402272044340703, + "acc_norm": 0.6225490196078431, + "acc_norm_stderr": 0.03402272044340703 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.6624472573839663, + "acc_stderr": 0.030781549102026223, + "acc_norm": 0.6624472573839663, + "acc_norm_stderr": 0.030781549102026223 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.5291479820627802, + "acc_stderr": 0.03350073248773404, + "acc_norm": 0.5291479820627802, + "acc_norm_stderr": 0.03350073248773404 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.5114503816793893, + "acc_stderr": 0.043841400240780176, + "acc_norm": 0.5114503816793893, + "acc_norm_stderr": 0.043841400240780176 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.6115702479338843, + "acc_stderr": 0.044492703500683836, + "acc_norm": 0.6115702479338843, + "acc_norm_stderr": 0.044492703500683836 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.5648148148148148, + "acc_stderr": 0.04792898170907061, + "acc_norm": 0.5648148148148148, + "acc_norm_stderr": 0.04792898170907061 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.5705521472392638, + "acc_stderr": 0.038890666191127236, + "acc_norm": 0.5705521472392638, + "acc_norm_stderr": 0.038890666191127236 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.3392857142857143, + "acc_stderr": 0.04493949068613539, + "acc_norm": 0.3392857142857143, + "acc_norm_stderr": 0.04493949068613539 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.6601941747572816, + "acc_stderr": 0.04689765937278134, + "acc_norm": 0.6601941747572816, + "acc_norm_stderr": 0.04689765937278134 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.7564102564102564, + "acc_stderr": 0.0281209665039144, + "acc_norm": 0.7564102564102564, + "acc_norm_stderr": 0.0281209665039144 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.61, + "acc_stderr": 0.04902071300001975, + "acc_norm": 0.61, + "acc_norm_stderr": 0.04902071300001975 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.6526181353767561, + "acc_stderr": 0.01702667174865573, + "acc_norm": 0.6526181353767561, + "acc_norm_stderr": 0.01702667174865573 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.5115606936416185, + "acc_stderr": 0.02691189868637793, + "acc_norm": 0.5115606936416185, + "acc_norm_stderr": 0.02691189868637793 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.29608938547486036, + "acc_stderr": 0.01526867731760228, + "acc_norm": 0.29608938547486036, + "acc_norm_stderr": 0.01526867731760228 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.5032679738562091, + "acc_stderr": 0.028629305194003543, + "acc_norm": 
0.5032679738562091, + "acc_norm_stderr": 0.028629305194003543 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.4919614147909968, + "acc_stderr": 0.028394421370984545, + "acc_norm": 0.4919614147909968, + "acc_norm_stderr": 0.028394421370984545 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.5370370370370371, + "acc_stderr": 0.027744313443376536, + "acc_norm": 0.5370370370370371, + "acc_norm_stderr": 0.027744313443376536 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.34397163120567376, + "acc_stderr": 0.02833801742861132, + "acc_norm": 0.34397163120567376, + "acc_norm_stderr": 0.02833801742861132 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.3546284224250326, + "acc_stderr": 0.012218576439090158, + "acc_norm": 0.3546284224250326, + "acc_norm_stderr": 0.012218576439090158 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.4375, + "acc_stderr": 0.030134614954403924, + "acc_norm": 0.4375, + "acc_norm_stderr": 0.030134614954403924 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.45751633986928103, + "acc_stderr": 0.020154685712590888, + "acc_norm": 0.45751633986928103, + "acc_norm_stderr": 0.020154685712590888 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.5636363636363636, + "acc_stderr": 0.04750185058907296, + "acc_norm": 0.5636363636363636, + "acc_norm_stderr": 0.04750185058907296 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.4897959183673469, + "acc_stderr": 0.03200255347893782, + "acc_norm": 0.4897959183673469, + "acc_norm_stderr": 0.03200255347893782 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.6616915422885572, + "acc_stderr": 0.033455630703391914, + "acc_norm": 0.6616915422885572, + "acc_norm_stderr": 0.033455630703391914 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.69, + "acc_stderr": 0.04648231987117316, + "acc_norm": 0.69, + "acc_norm_stderr": 0.04648231987117316 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.40963855421686746, + "acc_stderr": 0.03828401115079023, + "acc_norm": 0.40963855421686746, + "acc_norm_stderr": 0.03828401115079023 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.6549707602339181, + "acc_stderr": 0.036459813773888065, + "acc_norm": 0.6549707602339181, + "acc_norm_stderr": 0.036459813773888065 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.2802937576499388, + "mc1_stderr": 0.015723139524608767, + "mc2": 0.41606557052981424, + "mc2_stderr": 0.01502153595504234 + }, + "all": { + "acc": 0.4777637934552113, + "acc_stderr": 0.03507055963848573, + "acc_norm": 0.4817422364214322, + "acc_norm_stderr": 0.03505878081772705, + "mc1": 0.2802937576499388, + "mc1_stderr": 0.015723139524608767, + "mc2": 0.41606557052981424, + "mc2_stderr": 0.01502153595504234 + } + }, + "versions": { + "harness|arc:challenge|25": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + 
"harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "all": 0 + }, + "config": { + "model_name": "TheBloke/tulu-7B-fp16", + "model_sha": "8a026683f79119643f4007da4e9155c7849792cc", + "model_dtype": "torch.float16", + "lighteval_sha": "43cff840721bd0214adb4e29236a5e2ca1813937", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null + }, + "task_config": { + "harness|arc:challenge": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + 
"harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task" + }, + "hashes": { + "harness|arc:challenge|25": { + "hash_examples": "fb8c51b1872daeda", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "2b0e07d4cdd3b0fe", + "hash_cont_tokens": "8210decc6ff6f7df" + }, + "harness|hellaswag|10": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "578edd77107cb2c3", + "hash_cont_tokens": "b3b9e9017afa63af" + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "6a95a1511f8da075", + "hash_cont_tokens": "50421e30bef398f9" + }, + 
"harness|hendrycksTest-anatomy|5": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "24a78edc4d9a93aa", + "hash_cont_tokens": "f11971a765cb609f" + }, + "harness|hendrycksTest-astronomy|5": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "b11106668d6c0974", + "hash_cont_tokens": "bf30e5d3f48250cb" + }, + "harness|hendrycksTest-business_ethics|5": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "10180ba12a075cb0", + "hash_cont_tokens": "bc1dd9b2d995eb61" + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "73351ef4968750a2", + "hash_cont_tokens": "890a119624b3b935" + }, + "harness|hendrycksTest-college_biology|5": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "a539150af234c668", + "hash_cont_tokens": "875cde3af7a0ee14" + }, + "harness|hendrycksTest-college_chemistry|5": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "52e12e5a43bcee35", + "hash_cont_tokens": "50421e30bef398f9" + }, + "harness|hendrycksTest-college_computer_science|5": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "d1f3721a5659f7ee", + "hash_cont_tokens": "ffc0fe414cdc4a83" + }, + "harness|hendrycksTest-college_mathematics|5": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "f2d78f546b5595c2", + "hash_cont_tokens": "50421e30bef398f9" + }, + "harness|hendrycksTest-college_medicine|5": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "c9cc19179f63d1d6", + "hash_cont_tokens": "1f88b00d41957d82" + }, + "harness|hendrycksTest-college_physics|5": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "5046144e67e992e8", + "hash_cont_tokens": "f7b8097afc16a47c" + }, + "harness|hendrycksTest-computer_security|5": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "4b14581ba4fc06fc", + "hash_cont_tokens": "50421e30bef398f9" + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "1ee52c413b5b4cc4", + "hash_cont_tokens": "aa0e8bc655f2f641" + }, + "harness|hendrycksTest-econometrics|5": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "2914077c4dd3090a", + "hash_cont_tokens": "bfb7e3c3c88313f1" + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "0f88a874342378de", + "hash_cont_tokens": "2425a3f084a591ef" + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "9889933f1dd02a23", + "hash_cont_tokens": "f52691aef15a407b" + }, + "harness|hendrycksTest-formal_logic|5": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "dc309a94c4bfdd2f", + "hash_cont_tokens": "f515d598d9c21263" + }, + "harness|hendrycksTest-global_facts|5": { + 
"hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "0801a0aebec3ba8c", + "hash_cont_tokens": "50421e30bef398f9" + }, + "harness|hendrycksTest-high_school_biology|5": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "5bc4aca8831d9c05", + "hash_cont_tokens": "bd85a4156a3613ee" + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "b92bd6b06fc3464c", + "hash_cont_tokens": "a95c97af1c14e068" + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "a549346cde8165e9", + "hash_cont_tokens": "8abfedef914e33c9" + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "f1f73dd687da18d7", + "hash_cont_tokens": "674fc454bdc5ac93" + }, + "harness|hendrycksTest-high_school_geography|5": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "e7e9cf91f9d6a081", + "hash_cont_tokens": "03a5012b916274ea" + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "a61a1670f854d9e1", + "hash_cont_tokens": "a83effb8f76b7d7c" + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "8a77cb7763f28110", + "hash_cont_tokens": "c583432ad27fcfe0" + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "fcfcfae391f8faa1", + "hash_cont_tokens": "24f5dc613660300b" + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "a29454cc1feb23ef", + "hash_cont_tokens": "f47f041de50333b9" + }, + "harness|hendrycksTest-high_school_physics|5": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "b6734a25556d75dc", + "hash_cont_tokens": "ba2efcd283e938cc" + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "5720438e29473426", + "hash_cont_tokens": "942069cd363844d9" + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "486321d5858de240", + "hash_cont_tokens": "955ed42b6f7fa019" + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "50c9ff438c85a69e", + "hash_cont_tokens": "cdd0b3dc06d933e5" + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "473919e64d1b8c80", + "hash_cont_tokens": "9a864184946033ac" + }, + "harness|hendrycksTest-human_aging|5": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "47a65c81fd7ed010", + "hash_cont_tokens": 
"142a4a8a1138a214" + }, + "harness|hendrycksTest-human_sexuality|5": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "aedfcd41cbd2fcc9", + "hash_cont_tokens": "bc54813e809b796d" + }, + "harness|hendrycksTest-international_law|5": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "ed5f2414144d7b72", + "hash_cont_tokens": "dc45b45fcda18e5d" + }, + "harness|hendrycksTest-jurisprudence|5": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "692eaacb5b747264", + "hash_cont_tokens": "e3a8cd951b6e3469" + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "2cbce4edca937588", + "hash_cont_tokens": "1e80dbd30f6453d5" + }, + "harness|hendrycksTest-machine_learning|5": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "c2f38b19bab1aa2c", + "hash_cont_tokens": "9b37da7777378ca9" + }, + "harness|hendrycksTest-management|5": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "fde277bc547bc3d8", + "hash_cont_tokens": "a01d6d39a83c4597" + }, + "harness|hendrycksTest-marketing|5": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "87b232bbebce39db", + "hash_cont_tokens": "6aeaed4d823c98aa" + }, + "harness|hendrycksTest-medical_genetics|5": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "58c21af9da3e126e", + "hash_cont_tokens": "50421e30bef398f9" + }, + "harness|hendrycksTest-miscellaneous|5": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "d1f5c770d368e9c6", + "hash_cont_tokens": "9b0ab02a64603081" + }, + "harness|hendrycksTest-moral_disputes|5": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "98d6db15a50aaa8e", + "hash_cont_tokens": "8badf768f7b0467a" + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "2aabd8c7337502f8", + "hash_cont_tokens": "32ae620376b2bbba" + }, + "harness|hendrycksTest-nutrition|5": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "17f8c8f2d4a0a9b1", + "hash_cont_tokens": "3071def75bacc404" + }, + "harness|hendrycksTest-philosophy|5": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "dfc6df491d991966", + "hash_cont_tokens": "9f6ff69d23a48783" + }, + "harness|hendrycksTest-prehistory|5": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "cffe8139e00da9dd", + "hash_cont_tokens": "de469d2b981e32a3" + }, + "harness|hendrycksTest-professional_accounting|5": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "4a69ed6ee55918fb", + "hash_cont_tokens": "c46f74d2dfc7b13b" + }, + "harness|hendrycksTest-professional_law|5": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "6cc713f12b5890de", + "hash_cont_tokens": "2e590029ef41fbcd" + }, + "harness|hendrycksTest-professional_medicine|5": { + 
"hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "b4044fc92756c377", + "hash_cont_tokens": "fe35cfa9c6ca802e" + }, + "harness|hendrycksTest-professional_psychology|5": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "b019784da8db089a", + "hash_cont_tokens": "f020fbddf72c8652" + }, + "harness|hendrycksTest-public_relations|5": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "f47f37c7c9bfc601", + "hash_cont_tokens": "568f585a259965c1" + }, + "harness|hendrycksTest-security_studies|5": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "4d282718d6142410", + "hash_cont_tokens": "cc6fd7cccd64cd5d" + }, + "harness|hendrycksTest-sociology|5": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "fbc6026e500537bc", + "hash_cont_tokens": "c3a3bdfd177eed5b" + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "150dd1ff81ff642e", + "hash_cont_tokens": "2568d0e8e36fa959" + }, + "harness|hendrycksTest-virology|5": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "fcbac3e735545969", + "hash_cont_tokens": "926cf60b0891f374" + }, + "harness|hendrycksTest-world_religions|5": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "ffc962a38441ef13", + "hash_cont_tokens": "c525a5de974c1ea3" + }, + "harness|truthfulqa:mc|0": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "9ffb65d225ae550f", + "hash_cont_tokens": "c014154380b74b9e" + } + } +} \ No newline at end of file diff --git a/eval-results/TheBloke/tulu-7B-fp16/results_2023-10-22T23-41-54.207641.json b/eval-results/TheBloke/tulu-7B-fp16/results_2023-10-22T23-41-54.207641.json new file mode 100644 index 0000000000000000000000000000000000000000..755f22973109014e5c0be9eb8a7b9114ba390f96 --- /dev/null +++ b/eval-results/TheBloke/tulu-7B-fp16/results_2023-10-22T23-41-54.207641.json @@ -0,0 +1,107 @@ +{ + "config_general": { + "model_name": "TheBloke/tulu-7B-fp16", + "model_sha": "8a026683f79119643f4007da4e9155c7849792cc", + "model_size": "12.58 GB", + "model_dtype": "torch.float16", + "lighteval_sha": "0f318ecf002208468154899217b3ba7c6ae09374", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|drop|3": { + "em": 0.2993917785234899, + "em_stderr": 0.004690263056389047, + "f1": 0.33736996644295303, + "f1_stderr": 0.004651138439477223 + }, + "harness|gsm8k|5": { + "acc": 0.11220621683093253, + "acc_stderr": 0.008693743138242383 + }, + "harness|winogrande|5": { + "acc": 0.7379636937647988, + "acc_stderr": 0.012358944431637561 + }, + "all": { + "em": 0.2993917785234899, + "em_stderr": 0.004690263056389047, + "f1": 0.33736996644295303, + "f1_stderr": 0.004651138439477223, + "acc": 0.42508495529786566, + "acc_stderr": 0.010526343784939971 + } + }, + "versions": { + "harness|drop|3": 1, + "harness|gsm8k|5": 0, + "harness|winogrande|5": 0, + "all": 0 + }, + "config_tasks": { + "harness|drop": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + 
"harness|drop|3": { + "hashes": { + "hash_examples": "1d27416e8324e9a3", + "hash_full_prompts": "a5513ff9a741b385", + "hash_input_tokens": "61b608e0b5ceed76", + "hash_cont_tokens": "ba5286a4f20d9f0f" + }, + "truncated": 1263, + "non-truncated": 8273, + "padded": 0, + "non-padded": 9536, + "effective_few_shots": 3.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "bda342e47b5099b2", + "hash_cont_tokens": "3db0d25a3feea444" + }, + "truncated": 0, + "non-truncated": 1319, + "padded": 0, + "non-padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "c0bedf98cb040854", + "hash_cont_tokens": "f08975ad6f2d5864" + }, + "truncated": 0, + "non-truncated": 2534, + "padded": 2432, + "non-padded": 102, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "9b4d8993161e637d", + "hash_full_prompts": "08215e527b7e60a5", + "hash_input_tokens": "80afe720f936f8d2", + "hash_cont_tokens": "a38e6eb4c0e64a24" + }, + "total_evaluation_time_secondes": "6154.091874361038", + "truncated": 1263, + "non-truncated": 12126, + "padded": 2432, + "non-padded": 10957, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/TheBloke/vicuna-13B-1.1-HF/results_2023-07-18T13-57-49.812019.json b/eval-results/TheBloke/vicuna-13B-1.1-HF/results_2023-07-18T13-57-49.812019.json new file mode 100644 index 0000000000000000000000000000000000000000..ee6875f8471fa213f625d627b7f15161685719bd --- /dev/null +++ b/eval-results/TheBloke/vicuna-13B-1.1-HF/results_2023-07-18T13-57-49.812019.json @@ -0,0 +1,871 @@ +{ + "results": { + "harness|arc:challenge|25": { + "acc": 0.5196245733788396, + "acc_stderr": 0.014600132075947094, + "acc_norm": 0.5273037542662116, + "acc_norm_stderr": 0.014589589101985996 + }, + "harness|hellaswag|10": { + "acc": 0.6007767377016531, + "acc_stderr": 0.004887378682406532, + "acc_norm": 0.8013343955387373, + "acc_norm_stderr": 0.003981802822377587 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.28, + "acc_stderr": 0.045126085985421296, + "acc_norm": 0.28, + "acc_norm_stderr": 0.045126085985421296 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.4888888888888889, + "acc_stderr": 0.04318275491977976, + "acc_norm": 0.4888888888888889, + "acc_norm_stderr": 0.04318275491977976 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.506578947368421, + "acc_stderr": 0.040685900502249704, + "acc_norm": 0.506578947368421, + "acc_norm_stderr": 0.040685900502249704 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.54, + "acc_stderr": 0.05009082659620332, + "acc_norm": 0.54, + "acc_norm_stderr": 0.05009082659620332 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.4981132075471698, + "acc_stderr": 0.030772653642075664, + "acc_norm": 0.4981132075471698, + "acc_norm_stderr": 0.030772653642075664 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.5972222222222222, + "acc_stderr": 0.04101405519842426, + "acc_norm": 0.5972222222222222, + "acc_norm_stderr": 0.04101405519842426 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.38, + "acc_stderr": 0.04878317312145633, + "acc_norm": 0.38, + "acc_norm_stderr": 0.04878317312145633 + }, + 
"harness|hendrycksTest-college_computer_science|5": { + "acc": 0.46, + "acc_stderr": 0.05009082659620332, + "acc_norm": 0.46, + "acc_norm_stderr": 0.05009082659620332 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.31, + "acc_stderr": 0.04648231987117316, + "acc_norm": 0.31, + "acc_norm_stderr": 0.04648231987117316 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.4161849710982659, + "acc_stderr": 0.03758517775404948, + "acc_norm": 0.4161849710982659, + "acc_norm_stderr": 0.03758517775404948 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.27450980392156865, + "acc_stderr": 0.044405219061793254, + "acc_norm": 0.27450980392156865, + "acc_norm_stderr": 0.044405219061793254 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.64, + "acc_stderr": 0.04824181513244218, + "acc_norm": 0.64, + "acc_norm_stderr": 0.04824181513244218 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.39574468085106385, + "acc_stderr": 0.031967586978353627, + "acc_norm": 0.39574468085106385, + "acc_norm_stderr": 0.031967586978353627 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.30701754385964913, + "acc_stderr": 0.04339138322579861, + "acc_norm": 0.30701754385964913, + "acc_norm_stderr": 0.04339138322579861 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.47586206896551725, + "acc_stderr": 0.041618085035015295, + "acc_norm": 0.47586206896551725, + "acc_norm_stderr": 0.041618085035015295 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.3386243386243386, + "acc_stderr": 0.02437319786798306, + "acc_norm": 0.3386243386243386, + "acc_norm_stderr": 0.02437319786798306 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.40476190476190477, + "acc_stderr": 0.04390259265377562, + "acc_norm": 0.40476190476190477, + "acc_norm_stderr": 0.04390259265377562 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.26, + "acc_stderr": 0.044084400227680794, + "acc_norm": 0.26, + "acc_norm_stderr": 0.044084400227680794 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.567741935483871, + "acc_stderr": 0.028181739720019416, + "acc_norm": 0.567741935483871, + "acc_norm_stderr": 0.028181739720019416 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.4039408866995074, + "acc_stderr": 0.03452453903822039, + "acc_norm": 0.4039408866995074, + "acc_norm_stderr": 0.03452453903822039 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.54, + "acc_stderr": 0.05009082659620332, + "acc_norm": 0.54, + "acc_norm_stderr": 0.05009082659620332 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.6666666666666666, + "acc_stderr": 0.0368105086916155, + "acc_norm": 0.6666666666666666, + "acc_norm_stderr": 0.0368105086916155 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.6565656565656566, + "acc_stderr": 0.03383201223244441, + "acc_norm": 0.6565656565656566, + "acc_norm_stderr": 0.03383201223244441 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.694300518134715, + "acc_stderr": 0.033248379397581594, + "acc_norm": 0.694300518134715, + "acc_norm_stderr": 0.033248379397581594 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.4717948717948718, + "acc_stderr": 0.0253106392549339, + "acc_norm": 0.4717948717948718, + "acc_norm_stderr": 0.0253106392549339 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.3, + "acc_stderr": 0.027940457136228416, + 
"acc_norm": 0.3, + "acc_norm_stderr": 0.027940457136228416 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.4495798319327731, + "acc_stderr": 0.03231293497137707, + "acc_norm": 0.4495798319327731, + "acc_norm_stderr": 0.03231293497137707 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.31125827814569534, + "acc_stderr": 0.03780445850526733, + "acc_norm": 0.31125827814569534, + "acc_norm_stderr": 0.03780445850526733 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.6862385321100918, + "acc_stderr": 0.019894723341469116, + "acc_norm": 0.6862385321100918, + "acc_norm_stderr": 0.019894723341469116 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.37037037037037035, + "acc_stderr": 0.03293377139415191, + "acc_norm": 0.37037037037037035, + "acc_norm_stderr": 0.03293377139415191 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.6862745098039216, + "acc_stderr": 0.03256685484460388, + "acc_norm": 0.6862745098039216, + "acc_norm_stderr": 0.03256685484460388 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.7088607594936709, + "acc_stderr": 0.02957160106575337, + "acc_norm": 0.7088607594936709, + "acc_norm_stderr": 0.02957160106575337 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.5874439461883408, + "acc_stderr": 0.03304062175449297, + "acc_norm": 0.5874439461883408, + "acc_norm_stderr": 0.03304062175449297 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.6717557251908397, + "acc_stderr": 0.04118438565806298, + "acc_norm": 0.6717557251908397, + "acc_norm_stderr": 0.04118438565806298 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.6859504132231405, + "acc_stderr": 0.042369647530410184, + "acc_norm": 0.6859504132231405, + "acc_norm_stderr": 0.042369647530410184 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.6203703703703703, + "acc_stderr": 0.04691521224077742, + "acc_norm": 0.6203703703703703, + "acc_norm_stderr": 0.04691521224077742 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.6380368098159509, + "acc_stderr": 0.037757007291414416, + "acc_norm": 0.6380368098159509, + "acc_norm_stderr": 0.037757007291414416 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.39285714285714285, + "acc_stderr": 0.04635550135609976, + "acc_norm": 0.39285714285714285, + "acc_norm_stderr": 0.04635550135609976 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.6990291262135923, + "acc_stderr": 0.04541609446503947, + "acc_norm": 0.6990291262135923, + "acc_norm_stderr": 0.04541609446503947 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.7521367521367521, + "acc_stderr": 0.028286324075564386, + "acc_norm": 0.7521367521367521, + "acc_norm_stderr": 0.028286324075564386 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.59, + "acc_stderr": 0.049431107042371025, + "acc_norm": 0.59, + "acc_norm_stderr": 0.049431107042371025 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.6922094508301405, + "acc_stderr": 0.016506045045155637, + "acc_norm": 0.6922094508301405, + "acc_norm_stderr": 0.016506045045155637 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.546242774566474, + "acc_stderr": 0.026803720583206177, + "acc_norm": 0.546242774566474, + "acc_norm_stderr": 0.026803720583206177 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.3307262569832402, + "acc_stderr": 0.01573502625896612, + "acc_norm": 0.3307262569832402, + "acc_norm_stderr": 0.01573502625896612 
+ }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.5555555555555556, + "acc_stderr": 0.02845263998508801, + "acc_norm": 0.5555555555555556, + "acc_norm_stderr": 0.02845263998508801 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.5241157556270096, + "acc_stderr": 0.028365041542564577, + "acc_norm": 0.5241157556270096, + "acc_norm_stderr": 0.028365041542564577 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.5462962962962963, + "acc_stderr": 0.0277012284685426, + "acc_norm": 0.5462962962962963, + "acc_norm_stderr": 0.0277012284685426 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.39361702127659576, + "acc_stderr": 0.029144544781596154, + "acc_norm": 0.39361702127659576, + "acc_norm_stderr": 0.029144544781596154 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.4165580182529335, + "acc_stderr": 0.012591153245057383, + "acc_norm": 0.4165580182529335, + "acc_norm_stderr": 0.012591153245057383 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.4889705882352941, + "acc_stderr": 0.030365446477275675, + "acc_norm": 0.4889705882352941, + "acc_norm_stderr": 0.030365446477275675 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.5212418300653595, + "acc_stderr": 0.020209572388600248, + "acc_norm": 0.5212418300653595, + "acc_norm_stderr": 0.020209572388600248 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.5545454545454546, + "acc_stderr": 0.047605488214603246, + "acc_norm": 0.5545454545454546, + "acc_norm_stderr": 0.047605488214603246 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.6244897959183674, + "acc_stderr": 0.03100120903989484, + "acc_norm": 0.6244897959183674, + "acc_norm_stderr": 0.03100120903989484 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.7711442786069652, + "acc_stderr": 0.029705284056772436, + "acc_norm": 0.7711442786069652, + "acc_norm_stderr": 0.029705284056772436 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.76, + "acc_stderr": 0.04292346959909281, + "acc_norm": 0.76, + "acc_norm_stderr": 0.04292346959909281 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.4397590361445783, + "acc_stderr": 0.03864139923699121, + "acc_norm": 0.4397590361445783, + "acc_norm_stderr": 0.03864139923699121 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.7134502923976608, + "acc_stderr": 0.03467826685703826, + "acc_norm": 0.7134502923976608, + "acc_norm_stderr": 0.03467826685703826 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.3623011015911873, + "mc1_stderr": 0.016826646897262255, + "mc2": 0.5207836984948891, + "mc2_stderr": 0.01580678689190342 + }, + "all": { + "acc": 0.5207458541981249, + "acc_stderr": 0.03494058387309796, + "acc_norm": 0.5242752921426072, + "acc_norm_stderr": 0.03492505643523372, + "mc1": 0.3623011015911873, + "mc1_stderr": 0.016826646897262255, + "mc2": 0.5207836984948891, + "mc2_stderr": 0.01580678689190342 + } + }, + "versions": { + "harness|arc:challenge|25": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + 
"harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "all": 0 + }, + "config": { + "model_name": "TheBloke/vicuna-13B-1.1-HF", + "model_sha": "8c71dbe9221e83d2ec72e4dc08beccfc78b563c0", + "model_dtype": "torch.float16", + "lighteval_sha": "43cff840721bd0214adb4e29236a5e2ca1813937", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null + }, + "task_config": { + "harness|arc:challenge": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + 
"harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task" + }, + "hashes": { + "harness|arc:challenge|25": { + "hash_examples": "fb8c51b1872daeda", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "61571bf68d6d89aa", + "hash_cont_tokens": "8210decc6ff6f7df" + }, + "harness|hellaswag|10": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "29906669b1c7054a", + "hash_cont_tokens": "b3b9e9017afa63af" + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": 
"2f776a367d23aea2", + "hash_input_tokens": "c54ff61ad0273dd7", + "hash_cont_tokens": "50421e30bef398f9" + }, + "harness|hendrycksTest-anatomy|5": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "be31a1e22aef5f90", + "hash_cont_tokens": "f11971a765cb609f" + }, + "harness|hendrycksTest-astronomy|5": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "277a7b1fad566940", + "hash_cont_tokens": "bf30e5d3f48250cb" + }, + "harness|hendrycksTest-business_ethics|5": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "ba552605bc116de5", + "hash_cont_tokens": "bc1dd9b2d995eb61" + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "428c7563d0b98ab9", + "hash_cont_tokens": "890a119624b3b935" + }, + "harness|hendrycksTest-college_biology|5": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "da036601573942e2", + "hash_cont_tokens": "875cde3af7a0ee14" + }, + "harness|hendrycksTest-college_chemistry|5": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "94e0196d6aded13d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "harness|hendrycksTest-college_computer_science|5": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "6e4d0f4a8d36690b", + "hash_cont_tokens": "ffc0fe414cdc4a83" + }, + "harness|hendrycksTest-college_mathematics|5": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "614054d17109a25d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "harness|hendrycksTest-college_medicine|5": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "1d633b3cc0524ba8", + "hash_cont_tokens": "1f88b00d41957d82" + }, + "harness|hendrycksTest-college_physics|5": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "5421d9a1af86cbd4", + "hash_cont_tokens": "f7b8097afc16a47c" + }, + "harness|hendrycksTest-computer_security|5": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "5e6b70ecb333cf18", + "hash_cont_tokens": "50421e30bef398f9" + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "c2ef11a87264ceed", + "hash_cont_tokens": "aa0e8bc655f2f641" + }, + "harness|hendrycksTest-econometrics|5": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "ecaccd912a4c3978", + "hash_cont_tokens": "bfb7e3c3c88313f1" + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "1590c84291399be8", + "hash_cont_tokens": "2425a3f084a591ef" + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "3269597f715b0da1", + "hash_cont_tokens": "f52691aef15a407b" + }, + "harness|hendrycksTest-formal_logic|5": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": 
"a2800d20f3ab8d7c", + "hash_cont_tokens": "f515d598d9c21263" + }, + "harness|hendrycksTest-global_facts|5": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "94ed44b3772505ad", + "hash_cont_tokens": "50421e30bef398f9" + }, + "harness|hendrycksTest-high_school_biology|5": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "24423acb928db768", + "hash_cont_tokens": "bd85a4156a3613ee" + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "831ff35c474e5cef", + "hash_cont_tokens": "a95c97af1c14e068" + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "8c34e0f2bda77358", + "hash_cont_tokens": "8abfedef914e33c9" + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "f1f73dd687da18d7", + "hash_cont_tokens": "674fc454bdc5ac93" + }, + "harness|hendrycksTest-high_school_geography|5": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "7c5547c7da5bc793", + "hash_cont_tokens": "03a5012b916274ea" + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "f62991cb6a496b05", + "hash_cont_tokens": "a83effb8f76b7d7c" + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "4cef2aff6e3d59ed", + "hash_cont_tokens": "c583432ad27fcfe0" + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "6e2577ea4082ed2b", + "hash_cont_tokens": "24f5dc613660300b" + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "c5fc9aeb1079c8e4", + "hash_cont_tokens": "f47f041de50333b9" + }, + "harness|hendrycksTest-high_school_physics|5": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "555fc385cffa84ca", + "hash_cont_tokens": "ba2efcd283e938cc" + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "febd23cbf9973b7f", + "hash_cont_tokens": "942069cd363844d9" + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "424b02981230ee83", + "hash_cont_tokens": "955ed42b6f7fa019" + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "50c9ff438c85a69e", + "hash_cont_tokens": "cdd0b3dc06d933e5" + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "054824cc474caef5", + "hash_cont_tokens": "9a864184946033ac" + }, + "harness|hendrycksTest-human_aging|5": { + "hash_examples": "1a7199dc733e779b", + 
"hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "541a75f071dcf579", + "hash_cont_tokens": "142a4a8a1138a214" + }, + "harness|hendrycksTest-human_sexuality|5": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "04269e5c5a257dd9", + "hash_cont_tokens": "bc54813e809b796d" + }, + "harness|hendrycksTest-international_law|5": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "d93ba9d9d38e4397", + "hash_cont_tokens": "dc45b45fcda18e5d" + }, + "harness|hendrycksTest-jurisprudence|5": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "9eeaccd2698b4f5a", + "hash_cont_tokens": "e3a8cd951b6e3469" + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "b4f08f544f2b7576", + "hash_cont_tokens": "1e80dbd30f6453d5" + }, + "harness|hendrycksTest-machine_learning|5": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "900c2a51f1174b9f", + "hash_cont_tokens": "9b37da7777378ca9" + }, + "harness|hendrycksTest-management|5": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "6b36efb4689c6eca", + "hash_cont_tokens": "a01d6d39a83c4597" + }, + "harness|hendrycksTest-marketing|5": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "2aaac78a0cfed47a", + "hash_cont_tokens": "6aeaed4d823c98aa" + }, + "harness|hendrycksTest-medical_genetics|5": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "886ca823b41c094a", + "hash_cont_tokens": "50421e30bef398f9" + }, + "harness|hendrycksTest-miscellaneous|5": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "72fd71de7675e7d0", + "hash_cont_tokens": "9b0ab02a64603081" + }, + "harness|hendrycksTest-moral_disputes|5": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "f3ca0dd8e7a1eb09", + "hash_cont_tokens": "8badf768f7b0467a" + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "3e793631e951f23c", + "hash_cont_tokens": "32ae620376b2bbba" + }, + "harness|hendrycksTest-nutrition|5": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "59753c2144ea93af", + "hash_cont_tokens": "3071def75bacc404" + }, + "harness|hendrycksTest-philosophy|5": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "bd8d3dbed15a8c34", + "hash_cont_tokens": "9f6ff69d23a48783" + }, + "harness|hendrycksTest-prehistory|5": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "3573cd87facbb7c5", + "hash_cont_tokens": "de469d2b981e32a3" + }, + "harness|hendrycksTest-professional_accounting|5": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "17e721bc1a7cbb47", + "hash_cont_tokens": "c46f74d2dfc7b13b" + }, + "harness|hendrycksTest-professional_law|5": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "9178e10bd0763ec4", 
+ "hash_cont_tokens": "2e590029ef41fbcd" + }, + "harness|hendrycksTest-professional_medicine|5": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "f5a22012a54f70ea", + "hash_cont_tokens": "fe35cfa9c6ca802e" + }, + "harness|hendrycksTest-professional_psychology|5": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "0dfb73a8eb3f692c", + "hash_cont_tokens": "f020fbddf72c8652" + }, + "harness|hendrycksTest-public_relations|5": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "1710c6ba4c9f3cbd", + "hash_cont_tokens": "568f585a259965c1" + }, + "harness|hendrycksTest-security_studies|5": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "d49711415961ced7", + "hash_cont_tokens": "cc6fd7cccd64cd5d" + }, + "harness|hendrycksTest-sociology|5": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "828999f7624cbe7e", + "hash_cont_tokens": "c3a3bdfd177eed5b" + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "42054621e718dbee", + "hash_cont_tokens": "2568d0e8e36fa959" + }, + "harness|hendrycksTest-virology|5": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "6c4f0aa4dc859c04", + "hash_cont_tokens": "926cf60b0891f374" + }, + "harness|hendrycksTest-world_religions|5": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "6c75d44e092ff24f", + "hash_cont_tokens": "c525a5de974c1ea3" + }, + "harness|truthfulqa:mc|0": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "2738d7ed7075faa7", + "hash_cont_tokens": "c014154380b74b9e" + } + } +} \ No newline at end of file diff --git a/eval-results/TheBloke/vicuna-13B-1.1-HF/results_2023-10-23T02-01-12.621227.json b/eval-results/TheBloke/vicuna-13B-1.1-HF/results_2023-10-23T02-01-12.621227.json new file mode 100644 index 0000000000000000000000000000000000000000..fb9655bacf59f2ce68e8cbc275b4fe893b433342 --- /dev/null +++ b/eval-results/TheBloke/vicuna-13B-1.1-HF/results_2023-10-23T02-01-12.621227.json @@ -0,0 +1,107 @@ +{ + "config_general": { + "model_name": "TheBloke/vicuna-13B-1.1-HF", + "model_sha": "1acf26f93742fafe91562253ec0e5d94e40a8bea", + "model_size": "24.28 GB", + "model_dtype": "torch.float16", + "lighteval_sha": "0f318ecf002208468154899217b3ba7c6ae09374", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|drop|3": { + "em": 0.029677013422818792, + "em_stderr": 0.0017378324714143493, + "f1": 0.09310612416107406, + "f1_stderr": 0.002167792401176146 + }, + "harness|gsm8k|5": { + "acc": 0.08642911296436695, + "acc_stderr": 0.00774004433710381 + }, + "harness|winogrande|5": { + "acc": 0.7419100236779794, + "acc_stderr": 0.012298278833972384 + }, + "all": { + "em": 0.029677013422818792, + "em_stderr": 0.0017378324714143493, + "f1": 0.09310612416107406, + "f1_stderr": 0.002167792401176146, + "acc": 0.4141695683211732, + "acc_stderr": 0.010019161585538096 + } + }, + "versions": { + "harness|drop|3": 1, + "harness|gsm8k|5": 0, + "harness|winogrande|5": 0, + "all": 0 + }, + "config_tasks": { + "harness|drop": "LM 
Harness task", + "harness|gsm8k": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|drop|3": { + "hashes": { + "hash_examples": "1d27416e8324e9a3", + "hash_full_prompts": "a5513ff9a741b385", + "hash_input_tokens": "61b608e0b5ceed76", + "hash_cont_tokens": "ac752e2682fcf21e" + }, + "truncated": 1263, + "non-truncated": 8273, + "padded": 0, + "non-padded": 9536, + "effective_few_shots": 3.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "bda342e47b5099b2", + "hash_cont_tokens": "6a30e0a9abfde216" + }, + "truncated": 0, + "non-truncated": 1319, + "padded": 0, + "non-padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "c0bedf98cb040854", + "hash_cont_tokens": "f08975ad6f2d5864" + }, + "truncated": 0, + "non-truncated": 2534, + "padded": 2432, + "non-padded": 102, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "9b4d8993161e637d", + "hash_full_prompts": "08215e527b7e60a5", + "hash_input_tokens": "80afe720f936f8d2", + "hash_cont_tokens": "3120c9f83854444f" + }, + "total_evaluation_time_secondes": "12857.16336965561", + "truncated": 1263, + "non-truncated": 12126, + "padded": 2432, + "non-padded": 10957, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/TheBloke/vicuna-13b-v1.3.0-GPTQ/results_2023-08-29T17-36-46.584597.json b/eval-results/TheBloke/vicuna-13b-v1.3.0-GPTQ/results_2023-08-29T17-36-46.584597.json new file mode 100644 index 0000000000000000000000000000000000000000..bef6f7bd389f2433d8176b17d938be52792fc082 --- /dev/null +++ b/eval-results/TheBloke/vicuna-13b-v1.3.0-GPTQ/results_2023-08-29T17-36-46.584597.json @@ -0,0 +1,1366 @@ +{ + "config_general": { + "model_name": "TheBloke/vicuna-13b-v1.3.0-GPTQ", + "model_sha": "6ef1f8d8638ea2d6681a8e3da73be57c501d847b", + "model_dtype": "torch.float16", + "lighteval_sha": "4b9a38c2102259daeb17d1d0294fc2b4ce7f4e63", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|arc:challenge|25": { + "acc": 0.515358361774744, + "acc_stderr": 0.01460449612939491, + "acc_norm": 0.5435153583617748, + "acc_norm_stderr": 0.014555949760496442 + }, + "harness|hellaswag|10": { + "acc": 0.594901414060944, + "acc_stderr": 0.004899078300184252, + "acc_norm": 0.7946624178450508, + "acc_norm_stderr": 0.004031225342516806 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.27, + "acc_stderr": 0.044619604333847394, + "acc_norm": 0.27, + "acc_norm_stderr": 0.044619604333847394 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.4444444444444444, + "acc_stderr": 0.04292596718256981, + "acc_norm": 0.4444444444444444, + "acc_norm_stderr": 0.04292596718256981 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.4868421052631579, + "acc_stderr": 0.04067533136309173, + "acc_norm": 0.4868421052631579, + "acc_norm_stderr": 0.04067533136309173 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.55, + "acc_stderr": 0.049999999999999996, + "acc_norm": 0.55, + "acc_norm_stderr": 0.049999999999999996 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.5094339622641509, + 
"acc_stderr": 0.030767394707808093, + "acc_norm": 0.5094339622641509, + "acc_norm_stderr": 0.030767394707808093 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.5069444444444444, + "acc_stderr": 0.04180806750294938, + "acc_norm": 0.5069444444444444, + "acc_norm_stderr": 0.04180806750294938 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.41, + "acc_stderr": 0.04943110704237102, + "acc_norm": 0.41, + "acc_norm_stderr": 0.04943110704237102 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.47, + "acc_stderr": 0.05016135580465919, + "acc_norm": 0.47, + "acc_norm_stderr": 0.05016135580465919 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.33, + "acc_stderr": 0.047258156262526045, + "acc_norm": 0.33, + "acc_norm_stderr": 0.047258156262526045 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.4277456647398844, + "acc_stderr": 0.03772446857518026, + "acc_norm": 0.4277456647398844, + "acc_norm_stderr": 0.03772446857518026 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.27450980392156865, + "acc_stderr": 0.04440521906179327, + "acc_norm": 0.27450980392156865, + "acc_norm_stderr": 0.04440521906179327 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.68, + "acc_stderr": 0.046882617226215034, + "acc_norm": 0.68, + "acc_norm_stderr": 0.046882617226215034 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.4085106382978723, + "acc_stderr": 0.03213418026701576, + "acc_norm": 0.4085106382978723, + "acc_norm_stderr": 0.03213418026701576 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.30701754385964913, + "acc_stderr": 0.04339138322579861, + "acc_norm": 0.30701754385964913, + "acc_norm_stderr": 0.04339138322579861 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.45517241379310347, + "acc_stderr": 0.04149886942192117, + "acc_norm": 0.45517241379310347, + "acc_norm_stderr": 0.04149886942192117 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.2671957671957672, + "acc_stderr": 0.022789673145776564, + "acc_norm": 0.2671957671957672, + "acc_norm_stderr": 0.022789673145776564 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.3492063492063492, + "acc_stderr": 0.04263906892795132, + "acc_norm": 0.3492063492063492, + "acc_norm_stderr": 0.04263906892795132 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.33, + "acc_stderr": 0.047258156262526045, + "acc_norm": 0.33, + "acc_norm_stderr": 0.047258156262526045 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.5870967741935483, + "acc_stderr": 0.02800913812540039, + "acc_norm": 0.5870967741935483, + "acc_norm_stderr": 0.02800913812540039 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.4187192118226601, + "acc_stderr": 0.03471192860518468, + "acc_norm": 0.4187192118226601, + "acc_norm_stderr": 0.03471192860518468 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.49, + "acc_stderr": 0.05024183937956912, + "acc_norm": 0.49, + "acc_norm_stderr": 0.05024183937956912 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.6666666666666666, + "acc_stderr": 0.03681050869161549, + "acc_norm": 0.6666666666666666, + "acc_norm_stderr": 0.03681050869161549 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.6616161616161617, + "acc_stderr": 0.03371124142626302, + "acc_norm": 0.6616161616161617, + "acc_norm_stderr": 0.03371124142626302 + }, + 
"harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.7202072538860104, + "acc_stderr": 0.03239637046735704, + "acc_norm": 0.7202072538860104, + "acc_norm_stderr": 0.03239637046735704 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.441025641025641, + "acc_stderr": 0.02517404838400076, + "acc_norm": 0.441025641025641, + "acc_norm_stderr": 0.02517404838400076 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.2518518518518518, + "acc_stderr": 0.02646611753895992, + "acc_norm": 0.2518518518518518, + "acc_norm_stderr": 0.02646611753895992 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.4831932773109244, + "acc_stderr": 0.03246013680375308, + "acc_norm": 0.4831932773109244, + "acc_norm_stderr": 0.03246013680375308 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.2913907284768212, + "acc_stderr": 0.03710185726119995, + "acc_norm": 0.2913907284768212, + "acc_norm_stderr": 0.03710185726119995 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.7064220183486238, + "acc_stderr": 0.019525151122639667, + "acc_norm": 0.7064220183486238, + "acc_norm_stderr": 0.019525151122639667 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.4212962962962963, + "acc_stderr": 0.03367462138896078, + "acc_norm": 0.4212962962962963, + "acc_norm_stderr": 0.03367462138896078 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.6666666666666666, + "acc_stderr": 0.03308611113236434, + "acc_norm": 0.6666666666666666, + "acc_norm_stderr": 0.03308611113236434 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.6835443037974683, + "acc_stderr": 0.03027497488021898, + "acc_norm": 0.6835443037974683, + "acc_norm_stderr": 0.03027497488021898 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.5515695067264574, + "acc_stderr": 0.03337883736255098, + "acc_norm": 0.5515695067264574, + "acc_norm_stderr": 0.03337883736255098 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.6030534351145038, + "acc_stderr": 0.04291135671009224, + "acc_norm": 0.6030534351145038, + "acc_norm_stderr": 0.04291135671009224 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.6859504132231405, + "acc_stderr": 0.04236964753041019, + "acc_norm": 0.6859504132231405, + "acc_norm_stderr": 0.04236964753041019 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.6111111111111112, + "acc_stderr": 0.0471282125742677, + "acc_norm": 0.6111111111111112, + "acc_norm_stderr": 0.0471282125742677 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.656441717791411, + "acc_stderr": 0.03731133519673893, + "acc_norm": 0.656441717791411, + "acc_norm_stderr": 0.03731133519673893 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.4375, + "acc_stderr": 0.04708567521880525, + "acc_norm": 0.4375, + "acc_norm_stderr": 0.04708567521880525 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.6893203883495146, + "acc_stderr": 0.0458212416016155, + "acc_norm": 0.6893203883495146, + "acc_norm_stderr": 0.0458212416016155 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.811965811965812, + "acc_stderr": 0.025598193686652244, + "acc_norm": 0.811965811965812, + "acc_norm_stderr": 0.025598193686652244 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.56, + "acc_stderr": 0.04988876515698589, + "acc_norm": 0.56, + "acc_norm_stderr": 0.04988876515698589 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 
0.7062579821200511, + "acc_stderr": 0.016287759388491665, + "acc_norm": 0.7062579821200511, + "acc_norm_stderr": 0.016287759388491665 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.5722543352601156, + "acc_stderr": 0.02663653974111608, + "acc_norm": 0.5722543352601156, + "acc_norm_stderr": 0.02663653974111608 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.3016759776536313, + "acc_stderr": 0.015350767572220286, + "acc_norm": 0.3016759776536313, + "acc_norm_stderr": 0.015350767572220286 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.630718954248366, + "acc_stderr": 0.02763417668960266, + "acc_norm": 0.630718954248366, + "acc_norm_stderr": 0.02763417668960266 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.5755627009646302, + "acc_stderr": 0.028071928247946208, + "acc_norm": 0.5755627009646302, + "acc_norm_stderr": 0.028071928247946208 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.5802469135802469, + "acc_stderr": 0.027460099557005135, + "acc_norm": 0.5802469135802469, + "acc_norm_stderr": 0.027460099557005135 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.38652482269503546, + "acc_stderr": 0.02904919034254346, + "acc_norm": 0.38652482269503546, + "acc_norm_stderr": 0.02904919034254346 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.42503259452411996, + "acc_stderr": 0.012625879884891993, + "acc_norm": 0.42503259452411996, + "acc_norm_stderr": 0.012625879884891993 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.5183823529411765, + "acc_stderr": 0.030352303395351964, + "acc_norm": 0.5183823529411765, + "acc_norm_stderr": 0.030352303395351964 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.5196078431372549, + "acc_stderr": 0.020212274976302957, + "acc_norm": 0.5196078431372549, + "acc_norm_stderr": 0.020212274976302957 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.5454545454545454, + "acc_stderr": 0.04769300568972744, + "acc_norm": 0.5454545454545454, + "acc_norm_stderr": 0.04769300568972744 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.5836734693877551, + "acc_stderr": 0.031557828165561644, + "acc_norm": 0.5836734693877551, + "acc_norm_stderr": 0.031557828165561644 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.7263681592039801, + "acc_stderr": 0.03152439186555402, + "acc_norm": 0.7263681592039801, + "acc_norm_stderr": 0.03152439186555402 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.74, + "acc_stderr": 0.04408440022768079, + "acc_norm": 0.74, + "acc_norm_stderr": 0.04408440022768079 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.4578313253012048, + "acc_stderr": 0.038786267710023595, + "acc_norm": 0.4578313253012048, + "acc_norm_stderr": 0.038786267710023595 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.7777777777777778, + "acc_stderr": 0.03188578017686399, + "acc_norm": 0.7777777777777778, + "acc_norm_stderr": 0.03188578017686399 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.35862913096695226, + "mc1_stderr": 0.016789289499502025, + "mc2": 0.5088488034487862, + "mc2_stderr": 0.015405211397549821 + }, + "all": { + "acc": 0.5208688458263941, + "acc_stderr": 0.034919052518984244, + "acc_norm": 0.5247318627818371, + "acc_norm_stderr": 0.034903520327008546, + "mc1": 0.35862913096695226, + "mc1_stderr": 0.016789289499502025, + "mc2": 0.5088488034487862, + "mc2_stderr": 0.015405211397549821 + } + }, + "versions": { + "harness|arc:challenge|25": 0, + 
"harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "all": 0 + }, + "config_tasks": { + "harness|arc:challenge": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + 
"harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task" + }, + "summary_tasks": { + "harness|arc:challenge|25": { + "hashes": { + "hash_examples": "17b0cae357c0259e", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "61571bf68d6d89aa", + "hash_cont_tokens": "8210decc6ff6f7df" + }, + "truncated": 0, + "non-truncated": 4687, + "padded": 4687, + "non-padded": 0, + "effective_few_shots": 
25.0, + "num_truncated_few_shots": 0 + }, + "harness|hellaswag|10": { + "hashes": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "29906669b1c7054a", + "hash_cont_tokens": "b3b9e9017afa63af" + }, + "truncated": 0, + "non-truncated": 40168, + "padded": 40113, + "non-padded": 55, + "effective_few_shots": 10.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hashes": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "c54ff61ad0273dd7", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-anatomy|5": { + "hashes": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "be31a1e22aef5f90", + "hash_cont_tokens": "f11971a765cb609f" + }, + "truncated": 0, + "non-truncated": 540, + "padded": 540, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-astronomy|5": { + "hashes": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "277a7b1fad566940", + "hash_cont_tokens": "bf30e5d3f48250cb" + }, + "truncated": 0, + "non-truncated": 608, + "padded": 608, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-business_ethics|5": { + "hashes": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "ba552605bc116de5", + "hash_cont_tokens": "bc1dd9b2d995eb61" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hashes": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "428c7563d0b98ab9", + "hash_cont_tokens": "890a119624b3b935" + }, + "truncated": 0, + "non-truncated": 1060, + "padded": 1060, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_biology|5": { + "hashes": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "da036601573942e2", + "hash_cont_tokens": "875cde3af7a0ee14" + }, + "truncated": 0, + "non-truncated": 576, + "padded": 576, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_chemistry|5": { + "hashes": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "94e0196d6aded13d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_computer_science|5": { + "hashes": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "6e4d0f4a8d36690b", + "hash_cont_tokens": "ffc0fe414cdc4a83" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_mathematics|5": { + "hashes": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": 
"770bc4281c973190", + "hash_input_tokens": "614054d17109a25d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_medicine|5": { + "hashes": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "1d633b3cc0524ba8", + "hash_cont_tokens": "1f88b00d41957d82" + }, + "truncated": 0, + "non-truncated": 692, + "padded": 692, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_physics|5": { + "hashes": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "5421d9a1af86cbd4", + "hash_cont_tokens": "f7b8097afc16a47c" + }, + "truncated": 0, + "non-truncated": 408, + "padded": 408, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-computer_security|5": { + "hashes": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "5e6b70ecb333cf18", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hashes": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "c2ef11a87264ceed", + "hash_cont_tokens": "aa0e8bc655f2f641" + }, + "truncated": 0, + "non-truncated": 940, + "padded": 940, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-econometrics|5": { + "hashes": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "ecaccd912a4c3978", + "hash_cont_tokens": "bfb7e3c3c88313f1" + }, + "truncated": 0, + "non-truncated": 456, + "padded": 456, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hashes": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "1590c84291399be8", + "hash_cont_tokens": "2425a3f084a591ef" + }, + "truncated": 0, + "non-truncated": 580, + "padded": 580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hashes": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "3269597f715b0da1", + "hash_cont_tokens": "f52691aef15a407b" + }, + "truncated": 0, + "non-truncated": 1512, + "padded": 1512, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-formal_logic|5": { + "hashes": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "a2800d20f3ab8d7c", + "hash_cont_tokens": "f515d598d9c21263" + }, + "truncated": 0, + "non-truncated": 504, + "padded": 504, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-global_facts|5": { + "hashes": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "94ed44b3772505ad", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + 
"padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_biology|5": { + "hashes": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "24423acb928db768", + "hash_cont_tokens": "bd85a4156a3613ee" + }, + "truncated": 0, + "non-truncated": 1240, + "padded": 1240, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hashes": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "831ff35c474e5cef", + "hash_cont_tokens": "a95c97af1c14e068" + }, + "truncated": 0, + "non-truncated": 812, + "padded": 812, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hashes": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "8c34e0f2bda77358", + "hash_cont_tokens": "8abfedef914e33c9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hashes": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "f1f73dd687da18d7", + "hash_cont_tokens": "674fc454bdc5ac93" + }, + "truncated": 660, + "non-truncated": 0, + "padded": 0, + "non-padded": 660, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_geography|5": { + "hashes": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "7c5547c7da5bc793", + "hash_cont_tokens": "03a5012b916274ea" + }, + "truncated": 0, + "non-truncated": 792, + "padded": 792, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hashes": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "f62991cb6a496b05", + "hash_cont_tokens": "a83effb8f76b7d7c" + }, + "truncated": 0, + "non-truncated": 772, + "padded": 772, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hashes": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "4cef2aff6e3d59ed", + "hash_cont_tokens": "c583432ad27fcfe0" + }, + "truncated": 0, + "non-truncated": 1560, + "padded": 1560, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hashes": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "6e2577ea4082ed2b", + "hash_cont_tokens": "24f5dc613660300b" + }, + "truncated": 0, + "non-truncated": 1080, + "padded": 1080, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hashes": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "c5fc9aeb1079c8e4", + "hash_cont_tokens": "f47f041de50333b9" + }, + "truncated": 0, + "non-truncated": 952, + "padded": 952, + "non-padded": 0, + "effective_few_shots": 5.0, + 
"num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_physics|5": { + "hashes": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "555fc385cffa84ca", + "hash_cont_tokens": "ba2efcd283e938cc" + }, + "truncated": 0, + "non-truncated": 604, + "padded": 604, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hashes": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "febd23cbf9973b7f", + "hash_cont_tokens": "942069cd363844d9" + }, + "truncated": 0, + "non-truncated": 2180, + "padded": 2180, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hashes": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "424b02981230ee83", + "hash_cont_tokens": "955ed42b6f7fa019" + }, + "truncated": 0, + "non-truncated": 864, + "padded": 864, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hashes": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "50c9ff438c85a69e", + "hash_cont_tokens": "cdd0b3dc06d933e5" + }, + "truncated": 816, + "non-truncated": 0, + "padded": 0, + "non-padded": 816, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hashes": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "054824cc474caef5", + "hash_cont_tokens": "9a864184946033ac" + }, + "truncated": 8, + "non-truncated": 940, + "padded": 940, + "non-padded": 8, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_aging|5": { + "hashes": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "541a75f071dcf579", + "hash_cont_tokens": "142a4a8a1138a214" + }, + "truncated": 0, + "non-truncated": 892, + "padded": 892, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_sexuality|5": { + "hashes": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "04269e5c5a257dd9", + "hash_cont_tokens": "bc54813e809b796d" + }, + "truncated": 0, + "non-truncated": 524, + "padded": 524, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-international_law|5": { + "hashes": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "d93ba9d9d38e4397", + "hash_cont_tokens": "dc45b45fcda18e5d" + }, + "truncated": 0, + "non-truncated": 484, + "padded": 484, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-jurisprudence|5": { + "hashes": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "9eeaccd2698b4f5a", + "hash_cont_tokens": "e3a8cd951b6e3469" + }, + "truncated": 0, + "non-truncated": 432, + "padded": 432, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hashes": { + "hash_examples": "709128f9926a634c", + 
"hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "b4f08f544f2b7576", + "hash_cont_tokens": "1e80dbd30f6453d5" + }, + "truncated": 0, + "non-truncated": 652, + "padded": 648, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-machine_learning|5": { + "hashes": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "900c2a51f1174b9f", + "hash_cont_tokens": "9b37da7777378ca9" + }, + "truncated": 0, + "non-truncated": 448, + "padded": 448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-management|5": { + "hashes": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "6b36efb4689c6eca", + "hash_cont_tokens": "a01d6d39a83c4597" + }, + "truncated": 0, + "non-truncated": 412, + "padded": 412, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-marketing|5": { + "hashes": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "2aaac78a0cfed47a", + "hash_cont_tokens": "6aeaed4d823c98aa" + }, + "truncated": 0, + "non-truncated": 936, + "padded": 936, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-medical_genetics|5": { + "hashes": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "886ca823b41c094a", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-miscellaneous|5": { + "hashes": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "72fd71de7675e7d0", + "hash_cont_tokens": "9b0ab02a64603081" + }, + "truncated": 0, + "non-truncated": 3132, + "padded": 3132, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_disputes|5": { + "hashes": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "f3ca0dd8e7a1eb09", + "hash_cont_tokens": "8badf768f7b0467a" + }, + "truncated": 0, + "non-truncated": 1384, + "padded": 1354, + "non-padded": 30, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hashes": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "3e793631e951f23c", + "hash_cont_tokens": "32ae620376b2bbba" + }, + "truncated": 0, + "non-truncated": 3580, + "padded": 3580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-nutrition|5": { + "hashes": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "59753c2144ea93af", + "hash_cont_tokens": "3071def75bacc404" + }, + "truncated": 0, + "non-truncated": 1224, + "padded": 1224, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-philosophy|5": { + "hashes": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "bd8d3dbed15a8c34", + "hash_cont_tokens": "9f6ff69d23a48783" + }, + "truncated": 0, + "non-truncated": 1244, + "padded": 
1244, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-prehistory|5": { + "hashes": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "3573cd87facbb7c5", + "hash_cont_tokens": "de469d2b981e32a3" + }, + "truncated": 0, + "non-truncated": 1296, + "padded": 1296, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_accounting|5": { + "hashes": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "17e721bc1a7cbb47", + "hash_cont_tokens": "c46f74d2dfc7b13b" + }, + "truncated": 0, + "non-truncated": 1128, + "padded": 1128, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_law|5": { + "hashes": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "9178e10bd0763ec4", + "hash_cont_tokens": "2e590029ef41fbcd" + }, + "truncated": 604, + "non-truncated": 5532, + "padded": 5524, + "non-padded": 612, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_medicine|5": { + "hashes": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "f5a22012a54f70ea", + "hash_cont_tokens": "fe35cfa9c6ca802e" + }, + "truncated": 0, + "non-truncated": 1088, + "padded": 1088, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_psychology|5": { + "hashes": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "0dfb73a8eb3f692c", + "hash_cont_tokens": "f020fbddf72c8652" + }, + "truncated": 0, + "non-truncated": 2448, + "padded": 2448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-public_relations|5": { + "hashes": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "1710c6ba4c9f3cbd", + "hash_cont_tokens": "568f585a259965c1" + }, + "truncated": 0, + "non-truncated": 440, + "padded": 440, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-security_studies|5": { + "hashes": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "d49711415961ced7", + "hash_cont_tokens": "cc6fd7cccd64cd5d" + }, + "truncated": 0, + "non-truncated": 980, + "padded": 980, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-sociology|5": { + "hashes": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "828999f7624cbe7e", + "hash_cont_tokens": "c3a3bdfd177eed5b" + }, + "truncated": 0, + "non-truncated": 804, + "padded": 804, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hashes": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "42054621e718dbee", + "hash_cont_tokens": "2568d0e8e36fa959" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-virology|5": { + 
"hashes": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "6c4f0aa4dc859c04", + "hash_cont_tokens": "926cf60b0891f374" + }, + "truncated": 0, + "non-truncated": 664, + "padded": 664, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-world_religions|5": { + "hashes": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "6c75d44e092ff24f", + "hash_cont_tokens": "c525a5de974c1ea3" + }, + "truncated": 0, + "non-truncated": 684, + "padded": 684, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|truthfulqa:mc|0": { + "hashes": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "2738d7ed7075faa7", + "hash_cont_tokens": "c014154380b74b9e" + }, + "truncated": 0, + "non-truncated": 9996, + "padded": 9996, + "non-padded": 0, + "effective_few_shots": 0.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "d84d18e9a963753d", + "hash_full_prompts": "12b540783521a8e6", + "hash_input_tokens": "6fecf578c508db6a", + "hash_cont_tokens": "fb1646e2bdd5fc38" + }, + "total_evaluation_time_secondes": "7541.253061532974", + "truncated": 2088, + "non-truncated": 108931, + "padded": 108834, + "non-padded": 2185, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/TheBloke/vicuna-13b-v1.3.0-GPTQ/results_2023-11-05T09-23-24.198168.json b/eval-results/TheBloke/vicuna-13b-v1.3.0-GPTQ/results_2023-11-05T09-23-24.198168.json new file mode 100644 index 0000000000000000000000000000000000000000..8d975ce32a5e180989aa304ee085fd0587352212 --- /dev/null +++ b/eval-results/TheBloke/vicuna-13b-v1.3.0-GPTQ/results_2023-11-05T09-23-24.198168.json @@ -0,0 +1,107 @@ +{ + "config_general": { + "lighteval_sha": "167773f1d5d1647c60dadc31c9e731ab7dbcbbad", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "", + "model_name": "TheBloke/vicuna-13b-v1.3.0-GPTQ", + "model_sha": "6ef1f8d8638ea2d6681a8e3da73be57c501d847b", + "model_dtype": "torch.float16", + "model_size": "6.8 GB" + }, + "results": { + "harness|drop|3": { + "em": 0.009542785234899329, + "em_stderr": 0.0009956233793266876, + "f1": 0.07240981543624204, + "f1_stderr": 0.001677260569484712 + }, + "harness|gsm8k|5": { + "acc": 0.0841546626231994, + "acc_stderr": 0.0076470240466032045 + }, + "harness|winogrande|5": { + "acc": 0.7466456195737964, + "acc_stderr": 0.012223754434233618 + }, + "all": { + "em": 0.009542785234899329, + "em_stderr": 0.0009956233793266876, + "f1": 0.07240981543624204, + "f1_stderr": 0.001677260569484712, + "acc": 0.4154001410984979, + "acc_stderr": 0.00993538924041841 + } + }, + "versions": { + "all": 0, + "harness|drop|3": 1, + "harness|gsm8k|5": 0, + "harness|winogrande|5": 0 + }, + "config_tasks": { + "harness|drop": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|drop|3": { + "hashes": { + "hash_examples": "1d27416e8324e9a3", + "hash_full_prompts": "a5513ff9a741b385", + "hash_input_tokens": "61b608e0b5ceed76", + "hash_cont_tokens": "ab5e5e3ce36bd4e9" + }, + "truncated": 1263, + "non_truncated": 8273, + "padded": 0, + "non_padded": 9536, + "effective_few_shots": 3.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + 
"hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "bda342e47b5099b2", + "hash_cont_tokens": "9407c2609166cb10" + }, + "truncated": 0, + "non_truncated": 1319, + "padded": 0, + "non_padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "c0bedf98cb040854", + "hash_cont_tokens": "f08975ad6f2d5864" + }, + "truncated": 0, + "non_truncated": 1267, + "padded": 2432, + "non_padded": 102, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "9b4d8993161e637d", + "hash_full_prompts": "08215e527b7e60a5", + "hash_input_tokens": "80afe720f936f8d2", + "hash_cont_tokens": "c6f93c301311dae3" + }, + "truncated": 1263, + "non_truncated": 10859, + "padded": 2432, + "non_padded": 10957, + "num_truncated_few_shots": 0, + "total_evaluation_time_secondes": 0 + } +} \ No newline at end of file diff --git a/eval-results/TheBloke/vicuna-13b-v1.3.0-GPTQ/results_2023-11-07T20-06-54.484278.json b/eval-results/TheBloke/vicuna-13b-v1.3.0-GPTQ/results_2023-11-07T20-06-54.484278.json new file mode 100644 index 0000000000000000000000000000000000000000..4866090d9a31f92dee51adc15fc05619dac37722 --- /dev/null +++ b/eval-results/TheBloke/vicuna-13b-v1.3.0-GPTQ/results_2023-11-07T20-06-54.484278.json @@ -0,0 +1,107 @@ +{ + "config_general": { + "lighteval_sha": "167773f1d5d1647c60dadc31c9e731ab7dbcbbad", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "", + "model_name": "TheBloke/vicuna-13b-v1.3.0-GPTQ", + "model_sha": "6ef1f8d8638ea2d6681a8e3da73be57c501d847b", + "model_dtype": "torch.float16", + "model_size": "6.8 GB" + }, + "results": { + "harness|drop|3": { + "em": 0.00964765100671141, + "em_stderr": 0.0010010258941568287, + "f1": 0.0725954278523494, + "f1_stderr": 0.0016816004855467774 + }, + "harness|gsm8k|5": { + "acc": 0.0841546626231994, + "acc_stderr": 0.0076470240466032045 + }, + "harness|winogrande|5": { + "acc": 0.7466456195737964, + "acc_stderr": 0.012223754434233618 + }, + "all": { + "em": 0.00964765100671141, + "em_stderr": 0.0010010258941568287, + "f1": 0.0725954278523494, + "f1_stderr": 0.0016816004855467774, + "acc": 0.4154001410984979, + "acc_stderr": 0.00993538924041841 + } + }, + "versions": { + "all": 0, + "harness|drop|3": 1, + "harness|gsm8k|5": 0, + "harness|winogrande|5": 0 + }, + "config_tasks": { + "harness|drop": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|drop|3": { + "hashes": { + "hash_examples": "1d27416e8324e9a3", + "hash_full_prompts": "a5513ff9a741b385", + "hash_input_tokens": "61b608e0b5ceed76", + "hash_cont_tokens": "7389456663e3bfe8" + }, + "truncated": 1263, + "non_truncated": 8273, + "padded": 0, + "non_padded": 9536, + "effective_few_shots": 3.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "bda342e47b5099b2", + "hash_cont_tokens": "e37e5966a8f2134b" + }, + "truncated": 0, + "non_truncated": 1319, + "padded": 0, + "non_padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": 
"c8655cbd12de8409", + "hash_input_tokens": "c0bedf98cb040854", + "hash_cont_tokens": "f08975ad6f2d5864" + }, + "truncated": 0, + "non_truncated": 1267, + "padded": 2432, + "non_padded": 102, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "9b4d8993161e637d", + "hash_full_prompts": "08215e527b7e60a5", + "hash_input_tokens": "80afe720f936f8d2", + "hash_cont_tokens": "89cfd638633b5aed" + }, + "truncated": 1263, + "non_truncated": 10859, + "padded": 2432, + "non_padded": 10957, + "num_truncated_few_shots": 0, + "total_evaluation_time_secondes": 0 + } +} \ No newline at end of file diff --git a/eval-results/TheBloke/wizard-mega-13B-GPTQ/results_2023-08-22T10-09-24.633261.json b/eval-results/TheBloke/wizard-mega-13B-GPTQ/results_2023-08-22T10-09-24.633261.json new file mode 100644 index 0000000000000000000000000000000000000000..67ceb683904b6f38b86878f1256ebc18b8ff99af --- /dev/null +++ b/eval-results/TheBloke/wizard-mega-13B-GPTQ/results_2023-08-22T10-09-24.633261.json @@ -0,0 +1,1365 @@ +{ + "results": { + "harness|arc:challenge|25": { + "acc": 0.2158703071672355, + "acc_stderr": 0.012022975360030684, + "acc_norm": 0.2773037542662116, + "acc_norm_stderr": 0.013082095839059374 + }, + "harness|hellaswag|10": { + "acc": 0.2544313881696873, + "acc_stderr": 0.004346509850679535, + "acc_norm": 0.26010754829715194, + "acc_norm_stderr": 0.004377965074211625 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.25, + "acc_stderr": 0.04351941398892446, + "acc_norm": 0.25, + "acc_norm_stderr": 0.04351941398892446 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.17777777777777778, + "acc_stderr": 0.03302789859901717, + "acc_norm": 0.17777777777777778, + "acc_norm_stderr": 0.03302789859901717 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.23684210526315788, + "acc_stderr": 0.034597776068105345, + "acc_norm": 0.23684210526315788, + "acc_norm_stderr": 0.034597776068105345 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.19, + "acc_stderr": 0.03942772444036624, + "acc_norm": 0.19, + "acc_norm_stderr": 0.03942772444036624 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.2490566037735849, + "acc_stderr": 0.026616482980501704, + "acc_norm": 0.2490566037735849, + "acc_norm_stderr": 0.026616482980501704 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.2152777777777778, + "acc_stderr": 0.034370793441061344, + "acc_norm": 0.2152777777777778, + "acc_norm_stderr": 0.034370793441061344 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.26, + "acc_stderr": 0.044084400227680794, + "acc_norm": 0.26, + "acc_norm_stderr": 0.044084400227680794 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.28, + "acc_stderr": 0.045126085985421276, + "acc_norm": 0.28, + "acc_norm_stderr": 0.045126085985421276 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.21, + "acc_stderr": 0.040936018074033256, + "acc_norm": 0.21, + "acc_norm_stderr": 0.040936018074033256 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.21965317919075145, + "acc_stderr": 0.031568093627031744, + "acc_norm": 0.21965317919075145, + "acc_norm_stderr": 0.031568093627031744 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.22549019607843138, + "acc_stderr": 0.041583075330832865, + "acc_norm": 0.22549019607843138, + "acc_norm_stderr": 0.041583075330832865 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.24, + "acc_stderr": 
0.04292346959909281, + "acc_norm": 0.24, + "acc_norm_stderr": 0.04292346959909281 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.2851063829787234, + "acc_stderr": 0.029513196625539355, + "acc_norm": 0.2851063829787234, + "acc_norm_stderr": 0.029513196625539355 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.22807017543859648, + "acc_stderr": 0.03947152782669415, + "acc_norm": 0.22807017543859648, + "acc_norm_stderr": 0.03947152782669415 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.31724137931034485, + "acc_stderr": 0.03878352372138623, + "acc_norm": 0.31724137931034485, + "acc_norm_stderr": 0.03878352372138623 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.23544973544973544, + "acc_stderr": 0.021851509822031715, + "acc_norm": 0.23544973544973544, + "acc_norm_stderr": 0.021851509822031715 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.25396825396825395, + "acc_stderr": 0.03893259610604672, + "acc_norm": 0.25396825396825395, + "acc_norm_stderr": 0.03893259610604672 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.19, + "acc_stderr": 0.03942772444036623, + "acc_norm": 0.19, + "acc_norm_stderr": 0.03942772444036623 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.22903225806451613, + "acc_stderr": 0.02390491431178265, + "acc_norm": 0.22903225806451613, + "acc_norm_stderr": 0.02390491431178265 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.2660098522167488, + "acc_stderr": 0.03108982600293752, + "acc_norm": 0.2660098522167488, + "acc_norm_stderr": 0.03108982600293752 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.31, + "acc_stderr": 0.04648231987117316, + "acc_norm": 0.31, + "acc_norm_stderr": 0.04648231987117316 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.2727272727272727, + "acc_stderr": 0.0347769116216366, + "acc_norm": 0.2727272727272727, + "acc_norm_stderr": 0.0347769116216366 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.2878787878787879, + "acc_stderr": 0.03225883512300993, + "acc_norm": 0.2878787878787879, + "acc_norm_stderr": 0.03225883512300993 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.2694300518134715, + "acc_stderr": 0.03201867122877794, + "acc_norm": 0.2694300518134715, + "acc_norm_stderr": 0.03201867122877794 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.26666666666666666, + "acc_stderr": 0.022421273612923703, + "acc_norm": 0.26666666666666666, + "acc_norm_stderr": 0.022421273612923703 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.23703703703703705, + "acc_stderr": 0.025928876132766135, + "acc_norm": 0.23703703703703705, + "acc_norm_stderr": 0.025928876132766135 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.2689075630252101, + "acc_stderr": 0.028801392193631273, + "acc_norm": 0.2689075630252101, + "acc_norm_stderr": 0.028801392193631273 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.1986754966887417, + "acc_stderr": 0.03257847384436776, + "acc_norm": 0.1986754966887417, + "acc_norm_stderr": 0.03257847384436776 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.24220183486238533, + "acc_stderr": 0.01836817630659862, + "acc_norm": 0.24220183486238533, + "acc_norm_stderr": 0.01836817630659862 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.3148148148148148, + "acc_stderr": 
0.03167468706828979, + "acc_norm": 0.3148148148148148, + "acc_norm_stderr": 0.03167468706828979 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.22058823529411764, + "acc_stderr": 0.029102254389674082, + "acc_norm": 0.22058823529411764, + "acc_norm_stderr": 0.029102254389674082 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.21940928270042195, + "acc_stderr": 0.026939106581553945, + "acc_norm": 0.21940928270042195, + "acc_norm_stderr": 0.026939106581553945 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.22869955156950672, + "acc_stderr": 0.028188240046929196, + "acc_norm": 0.22869955156950672, + "acc_norm_stderr": 0.028188240046929196 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.2595419847328244, + "acc_stderr": 0.03844876139785271, + "acc_norm": 0.2595419847328244, + "acc_norm_stderr": 0.03844876139785271 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.21487603305785125, + "acc_stderr": 0.03749492448709698, + "acc_norm": 0.21487603305785125, + "acc_norm_stderr": 0.03749492448709698 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.24074074074074073, + "acc_stderr": 0.04133119440243838, + "acc_norm": 0.24074074074074073, + "acc_norm_stderr": 0.04133119440243838 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.2331288343558282, + "acc_stderr": 0.03322015795776741, + "acc_norm": 0.2331288343558282, + "acc_norm_stderr": 0.03322015795776741 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.22321428571428573, + "acc_stderr": 0.039523019677025116, + "acc_norm": 0.22321428571428573, + "acc_norm_stderr": 0.039523019677025116 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.3106796116504854, + "acc_stderr": 0.045821241601615506, + "acc_norm": 0.3106796116504854, + "acc_norm_stderr": 0.045821241601615506 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.25213675213675213, + "acc_stderr": 0.02844796547623101, + "acc_norm": 0.25213675213675213, + "acc_norm_stderr": 0.02844796547623101 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.29, + "acc_stderr": 0.045604802157206845, + "acc_norm": 0.29, + "acc_norm_stderr": 0.045604802157206845 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.22860791826309068, + "acc_stderr": 0.015016884698539894, + "acc_norm": 0.22860791826309068, + "acc_norm_stderr": 0.015016884698539894 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.24855491329479767, + "acc_stderr": 0.023267528432100174, + "acc_norm": 0.24855491329479767, + "acc_norm_stderr": 0.023267528432100174 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.26256983240223464, + "acc_stderr": 0.014716824273017744, + "acc_norm": 0.26256983240223464, + "acc_norm_stderr": 0.014716824273017744 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.28431372549019607, + "acc_stderr": 0.025829163272757482, + "acc_norm": 0.28431372549019607, + "acc_norm_stderr": 0.025829163272757482 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.21864951768488747, + "acc_stderr": 0.02347558141786111, + "acc_norm": 0.21864951768488747, + "acc_norm_stderr": 0.02347558141786111 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.28703703703703703, + "acc_stderr": 0.025171041915309684, + "acc_norm": 0.28703703703703703, + "acc_norm_stderr": 0.025171041915309684 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.2553191489361702, + "acc_stderr": 0.02601199293090202, + "acc_norm": 0.2553191489361702, + 
"acc_norm_stderr": 0.02601199293090202 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.26597131681877445, + "acc_stderr": 0.011285033165551265, + "acc_norm": 0.26597131681877445, + "acc_norm_stderr": 0.011285033165551265 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.23161764705882354, + "acc_stderr": 0.025626533803777562, + "acc_norm": 0.23161764705882354, + "acc_norm_stderr": 0.025626533803777562 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.26633986928104575, + "acc_stderr": 0.017883188134667206, + "acc_norm": 0.26633986928104575, + "acc_norm_stderr": 0.017883188134667206 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.22727272727272727, + "acc_stderr": 0.04013964554072775, + "acc_norm": 0.22727272727272727, + "acc_norm_stderr": 0.04013964554072775 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.2979591836734694, + "acc_stderr": 0.029279567411065664, + "acc_norm": 0.2979591836734694, + "acc_norm_stderr": 0.029279567411065664 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.2736318407960199, + "acc_stderr": 0.03152439186555404, + "acc_norm": 0.2736318407960199, + "acc_norm_stderr": 0.03152439186555404 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.25, + "acc_stderr": 0.04351941398892446, + "acc_norm": 0.25, + "acc_norm_stderr": 0.04351941398892446 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.2289156626506024, + "acc_stderr": 0.03270745277352477, + "acc_norm": 0.2289156626506024, + "acc_norm_stderr": 0.03270745277352477 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.28654970760233917, + "acc_stderr": 0.03467826685703826, + "acc_norm": 0.28654970760233917, + "acc_norm_stderr": 0.03467826685703826 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.2460220318237454, + "mc1_stderr": 0.015077219200662574, + "mc2": 0.4869109173912817, + "mc2_stderr": 0.01702324741696185 + }, + "all": { + "acc": 0.24921936031109732, + "acc_stderr": 0.031469310713380494, + "acc_norm": 0.25035681128103693, + "acc_norm_stderr": 0.03148779504732221, + "mc1": 0.2460220318237454, + "mc1_stderr": 0.015077219200662574, + "mc2": 0.4869109173912817, + "mc2_stderr": 0.01702324741696185 + } + }, + "versions": { + "harness|arc:challenge|25": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + 
"harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "all": 0 + }, + "config_general": { + "model_name": "TheBloke/wizard-mega-13B-GPTQ", + "model_sha": "848bf2514f804799dd28c188e5428d497dc983fb", + "model_dtype": "torch.float16", + "lighteval_sha": "9a6ba3212080b87510982c30fdec55b87dcab0c7", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null + }, + "config_tasks": { + "harness|arc:challenge": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + 
"harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task" + }, + "summary_tasks": { + "harness|arc:challenge|25": { + "hashes": { + "hash_examples": "17b0cae357c0259e", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "61571bf68d6d89aa", + "hash_cont_tokens": "8210decc6ff6f7df" + }, + "truncated": 0, + "non-truncated": 4687, + "padded": 4687, + "non-padded": 0, + "effective_few_shots": 25.0, + "num_truncated_few_shots": 0 + }, + "harness|hellaswag|10": { + "hashes": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "29906669b1c7054a", + "hash_cont_tokens": "b3b9e9017afa63af" + }, + "truncated": 0, + "non-truncated": 40168, + "padded": 40113, + "non-padded": 55, + "effective_few_shots": 10.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hashes": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "c54ff61ad0273dd7", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-anatomy|5": { + "hashes": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + 
"hash_input_tokens": "be31a1e22aef5f90", + "hash_cont_tokens": "f11971a765cb609f" + }, + "truncated": 0, + "non-truncated": 540, + "padded": 540, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-astronomy|5": { + "hashes": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "277a7b1fad566940", + "hash_cont_tokens": "bf30e5d3f48250cb" + }, + "truncated": 0, + "non-truncated": 608, + "padded": 608, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-business_ethics|5": { + "hashes": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "ba552605bc116de5", + "hash_cont_tokens": "bc1dd9b2d995eb61" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hashes": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "428c7563d0b98ab9", + "hash_cont_tokens": "890a119624b3b935" + }, + "truncated": 0, + "non-truncated": 1060, + "padded": 1060, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_biology|5": { + "hashes": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "da036601573942e2", + "hash_cont_tokens": "875cde3af7a0ee14" + }, + "truncated": 0, + "non-truncated": 576, + "padded": 576, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_chemistry|5": { + "hashes": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "94e0196d6aded13d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_computer_science|5": { + "hashes": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "6e4d0f4a8d36690b", + "hash_cont_tokens": "ffc0fe414cdc4a83" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_mathematics|5": { + "hashes": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "614054d17109a25d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_medicine|5": { + "hashes": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "1d633b3cc0524ba8", + "hash_cont_tokens": "1f88b00d41957d82" + }, + "truncated": 0, + "non-truncated": 692, + "padded": 692, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_physics|5": { + "hashes": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "5421d9a1af86cbd4", + "hash_cont_tokens": "f7b8097afc16a47c" + }, + "truncated": 0, + "non-truncated": 408, + "padded": 408, + 
"non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-computer_security|5": { + "hashes": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "5e6b70ecb333cf18", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hashes": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "c2ef11a87264ceed", + "hash_cont_tokens": "aa0e8bc655f2f641" + }, + "truncated": 0, + "non-truncated": 940, + "padded": 940, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-econometrics|5": { + "hashes": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "ecaccd912a4c3978", + "hash_cont_tokens": "bfb7e3c3c88313f1" + }, + "truncated": 0, + "non-truncated": 456, + "padded": 456, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hashes": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "1590c84291399be8", + "hash_cont_tokens": "2425a3f084a591ef" + }, + "truncated": 0, + "non-truncated": 580, + "padded": 580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hashes": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "3269597f715b0da1", + "hash_cont_tokens": "f52691aef15a407b" + }, + "truncated": 0, + "non-truncated": 1512, + "padded": 1512, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-formal_logic|5": { + "hashes": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "a2800d20f3ab8d7c", + "hash_cont_tokens": "f515d598d9c21263" + }, + "truncated": 0, + "non-truncated": 504, + "padded": 504, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-global_facts|5": { + "hashes": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "94ed44b3772505ad", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_biology|5": { + "hashes": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "24423acb928db768", + "hash_cont_tokens": "bd85a4156a3613ee" + }, + "truncated": 0, + "non-truncated": 1240, + "padded": 1240, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hashes": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "831ff35c474e5cef", + "hash_cont_tokens": "a95c97af1c14e068" + }, + "truncated": 0, + "non-truncated": 812, + "padded": 812, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + 
"hashes": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "8c34e0f2bda77358", + "hash_cont_tokens": "8abfedef914e33c9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hashes": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "f1f73dd687da18d7", + "hash_cont_tokens": "674fc454bdc5ac93" + }, + "truncated": 660, + "non-truncated": 0, + "padded": 0, + "non-padded": 660, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_geography|5": { + "hashes": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "7c5547c7da5bc793", + "hash_cont_tokens": "03a5012b916274ea" + }, + "truncated": 0, + "non-truncated": 792, + "padded": 792, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hashes": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "f62991cb6a496b05", + "hash_cont_tokens": "a83effb8f76b7d7c" + }, + "truncated": 0, + "non-truncated": 772, + "padded": 772, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hashes": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "4cef2aff6e3d59ed", + "hash_cont_tokens": "c583432ad27fcfe0" + }, + "truncated": 0, + "non-truncated": 1560, + "padded": 1560, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hashes": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "6e2577ea4082ed2b", + "hash_cont_tokens": "24f5dc613660300b" + }, + "truncated": 0, + "non-truncated": 1080, + "padded": 1080, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hashes": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "c5fc9aeb1079c8e4", + "hash_cont_tokens": "f47f041de50333b9" + }, + "truncated": 0, + "non-truncated": 952, + "padded": 952, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_physics|5": { + "hashes": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "555fc385cffa84ca", + "hash_cont_tokens": "ba2efcd283e938cc" + }, + "truncated": 0, + "non-truncated": 604, + "padded": 604, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hashes": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "febd23cbf9973b7f", + "hash_cont_tokens": "942069cd363844d9" + }, + "truncated": 0, + "non-truncated": 2180, + "padded": 2180, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hashes": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": 
"4c5c8be5aafac432", + "hash_input_tokens": "424b02981230ee83", + "hash_cont_tokens": "955ed42b6f7fa019" + }, + "truncated": 0, + "non-truncated": 864, + "padded": 864, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hashes": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "50c9ff438c85a69e", + "hash_cont_tokens": "cdd0b3dc06d933e5" + }, + "truncated": 816, + "non-truncated": 0, + "padded": 0, + "non-padded": 816, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hashes": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "054824cc474caef5", + "hash_cont_tokens": "9a864184946033ac" + }, + "truncated": 8, + "non-truncated": 940, + "padded": 940, + "non-padded": 8, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_aging|5": { + "hashes": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "541a75f071dcf579", + "hash_cont_tokens": "142a4a8a1138a214" + }, + "truncated": 0, + "non-truncated": 892, + "padded": 892, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_sexuality|5": { + "hashes": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "04269e5c5a257dd9", + "hash_cont_tokens": "bc54813e809b796d" + }, + "truncated": 0, + "non-truncated": 524, + "padded": 524, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-international_law|5": { + "hashes": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "d93ba9d9d38e4397", + "hash_cont_tokens": "dc45b45fcda18e5d" + }, + "truncated": 0, + "non-truncated": 484, + "padded": 484, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-jurisprudence|5": { + "hashes": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "9eeaccd2698b4f5a", + "hash_cont_tokens": "e3a8cd951b6e3469" + }, + "truncated": 0, + "non-truncated": 432, + "padded": 432, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hashes": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "b4f08f544f2b7576", + "hash_cont_tokens": "1e80dbd30f6453d5" + }, + "truncated": 0, + "non-truncated": 652, + "padded": 648, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-machine_learning|5": { + "hashes": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "900c2a51f1174b9f", + "hash_cont_tokens": "9b37da7777378ca9" + }, + "truncated": 0, + "non-truncated": 448, + "padded": 448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-management|5": { + "hashes": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "6b36efb4689c6eca", + "hash_cont_tokens": "a01d6d39a83c4597" + }, + "truncated": 0, + "non-truncated": 412, + 
"padded": 412, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-marketing|5": { + "hashes": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "2aaac78a0cfed47a", + "hash_cont_tokens": "6aeaed4d823c98aa" + }, + "truncated": 0, + "non-truncated": 936, + "padded": 936, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-medical_genetics|5": { + "hashes": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "886ca823b41c094a", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-miscellaneous|5": { + "hashes": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "72fd71de7675e7d0", + "hash_cont_tokens": "9b0ab02a64603081" + }, + "truncated": 0, + "non-truncated": 3132, + "padded": 3132, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_disputes|5": { + "hashes": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "f3ca0dd8e7a1eb09", + "hash_cont_tokens": "8badf768f7b0467a" + }, + "truncated": 0, + "non-truncated": 1384, + "padded": 1354, + "non-padded": 30, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hashes": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "3e793631e951f23c", + "hash_cont_tokens": "32ae620376b2bbba" + }, + "truncated": 0, + "non-truncated": 3580, + "padded": 3580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-nutrition|5": { + "hashes": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "59753c2144ea93af", + "hash_cont_tokens": "3071def75bacc404" + }, + "truncated": 0, + "non-truncated": 1224, + "padded": 1224, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-philosophy|5": { + "hashes": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "bd8d3dbed15a8c34", + "hash_cont_tokens": "9f6ff69d23a48783" + }, + "truncated": 0, + "non-truncated": 1244, + "padded": 1244, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-prehistory|5": { + "hashes": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "3573cd87facbb7c5", + "hash_cont_tokens": "de469d2b981e32a3" + }, + "truncated": 0, + "non-truncated": 1296, + "padded": 1296, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_accounting|5": { + "hashes": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "17e721bc1a7cbb47", + "hash_cont_tokens": "c46f74d2dfc7b13b" + }, + "truncated": 0, + "non-truncated": 1128, + "padded": 1128, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_law|5": { + "hashes": { + 
"hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "9178e10bd0763ec4", + "hash_cont_tokens": "2e590029ef41fbcd" + }, + "truncated": 604, + "non-truncated": 5532, + "padded": 5524, + "non-padded": 612, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_medicine|5": { + "hashes": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "f5a22012a54f70ea", + "hash_cont_tokens": "fe35cfa9c6ca802e" + }, + "truncated": 0, + "non-truncated": 1088, + "padded": 1088, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_psychology|5": { + "hashes": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "0dfb73a8eb3f692c", + "hash_cont_tokens": "f020fbddf72c8652" + }, + "truncated": 0, + "non-truncated": 2448, + "padded": 2448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-public_relations|5": { + "hashes": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "1710c6ba4c9f3cbd", + "hash_cont_tokens": "568f585a259965c1" + }, + "truncated": 0, + "non-truncated": 440, + "padded": 440, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-security_studies|5": { + "hashes": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "d49711415961ced7", + "hash_cont_tokens": "cc6fd7cccd64cd5d" + }, + "truncated": 0, + "non-truncated": 980, + "padded": 980, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-sociology|5": { + "hashes": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "828999f7624cbe7e", + "hash_cont_tokens": "c3a3bdfd177eed5b" + }, + "truncated": 0, + "non-truncated": 804, + "padded": 804, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hashes": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "42054621e718dbee", + "hash_cont_tokens": "2568d0e8e36fa959" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-virology|5": { + "hashes": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "6c4f0aa4dc859c04", + "hash_cont_tokens": "926cf60b0891f374" + }, + "truncated": 0, + "non-truncated": 664, + "padded": 664, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-world_religions|5": { + "hashes": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "6c75d44e092ff24f", + "hash_cont_tokens": "c525a5de974c1ea3" + }, + "truncated": 0, + "non-truncated": 684, + "padded": 684, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|truthfulqa:mc|0": { + "hashes": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "2738d7ed7075faa7", + "hash_cont_tokens": "c014154380b74b9e" + }, + 
"truncated": 0, + "non-truncated": 9996, + "padded": 9996, + "non-padded": 0, + "effective_few_shots": 0.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "d84d18e9a963753d", + "hash_full_prompts": "12b540783521a8e6", + "hash_input_tokens": "6fecf578c508db6a", + "hash_cont_tokens": "fb1646e2bdd5fc38" + }, + "total_evaluation_time_secondes": "4694.2846574783325", + "truncated": 2088, + "non-truncated": 108931, + "padded": 108834, + "non-padded": 2185, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/TheBloke/wizard-mega-13B-GPTQ/results_2023-11-05T00-29-27.161865.json b/eval-results/TheBloke/wizard-mega-13B-GPTQ/results_2023-11-05T00-29-27.161865.json new file mode 100644 index 0000000000000000000000000000000000000000..6e41c9bd227622283e85f3e14c8238431b069ad3 --- /dev/null +++ b/eval-results/TheBloke/wizard-mega-13B-GPTQ/results_2023-11-05T00-29-27.161865.json @@ -0,0 +1,107 @@ +{ + "config_general": { + "lighteval_sha": "167773f1d5d1647c60dadc31c9e731ab7dbcbbad", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "", + "model_name": "TheBloke/wizard-mega-13B-GPTQ", + "model_sha": "df7beb2d01f37d601784926ec949e092185255fb", + "model_dtype": "torch.float16", + "model_size": "6.8 GB" + }, + "results": { + "harness|drop|3": { + "em": 0.0024119127516778523, + "em_stderr": 0.0005023380498893266, + "f1": 0.06474517617449695, + "f1_stderr": 0.0014209030899497513 + }, + "harness|gsm8k|5": { + "acc": 0.0887035633055345, + "acc_stderr": 0.007831458737058719 + }, + "harness|winogrande|5": { + "acc": 0.7474348855564326, + "acc_stderr": 0.012211148449394105 + }, + "all": { + "em": 0.0024119127516778523, + "em_stderr": 0.0005023380498893266, + "f1": 0.06474517617449695, + "f1_stderr": 0.0014209030899497513, + "acc": 0.41806922443098354, + "acc_stderr": 0.010021303593226411 + } + }, + "versions": { + "all": 0, + "harness|drop|3": 1, + "harness|gsm8k|5": 0, + "harness|winogrande|5": 0 + }, + "config_tasks": { + "harness|drop": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|drop|3": { + "hashes": { + "hash_examples": "1d27416e8324e9a3", + "hash_full_prompts": "a5513ff9a741b385", + "hash_input_tokens": "61b608e0b5ceed76", + "hash_cont_tokens": "b42c683f49140162" + }, + "truncated": 1263, + "non_truncated": 8273, + "padded": 0, + "non_padded": 9536, + "effective_few_shots": 3.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "bda342e47b5099b2", + "hash_cont_tokens": "9fd30c52f99d0355" + }, + "truncated": 0, + "non_truncated": 1319, + "padded": 0, + "non_padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "c0bedf98cb040854", + "hash_cont_tokens": "f08975ad6f2d5864" + }, + "truncated": 0, + "non_truncated": 1267, + "padded": 2432, + "non_padded": 102, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "9b4d8993161e637d", + "hash_full_prompts": "08215e527b7e60a5", + "hash_input_tokens": "80afe720f936f8d2", + "hash_cont_tokens": "56cd9bb61f97939e" + }, + "truncated": 1263, + "non_truncated": 
10859, + "padded": 2432, + "non_padded": 10957, + "num_truncated_few_shots": 0, + "total_evaluation_time_secondes": 0 + } +} \ No newline at end of file diff --git a/eval-results/TheBloke/wizard-mega-13B-GPTQ/results_2023-11-07T07-11-46.594603.json b/eval-results/TheBloke/wizard-mega-13B-GPTQ/results_2023-11-07T07-11-46.594603.json new file mode 100644 index 0000000000000000000000000000000000000000..d733966f88cadbaae1180b10d983d0b248719e0e --- /dev/null +++ b/eval-results/TheBloke/wizard-mega-13B-GPTQ/results_2023-11-07T07-11-46.594603.json @@ -0,0 +1,107 @@ +{ + "config_general": { + "lighteval_sha": "167773f1d5d1647c60dadc31c9e731ab7dbcbbad", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "", + "model_name": "TheBloke/wizard-mega-13B-GPTQ", + "model_sha": "df7beb2d01f37d601784926ec949e092185255fb", + "model_dtype": "torch.float16", + "model_size": "6.8 GB" + }, + "results": { + "harness|drop|3": { + "em": 0.0024119127516778523, + "em_stderr": 0.0005023380498893266, + "f1": 0.06481438758389294, + "f1_stderr": 0.0014219270919505864 + }, + "harness|gsm8k|5": { + "acc": 0.08946171341925702, + "acc_stderr": 0.007861583049939712 + }, + "harness|winogrande|5": { + "acc": 0.7474348855564326, + "acc_stderr": 0.012211148449394105 + }, + "all": { + "em": 0.0024119127516778523, + "em_stderr": 0.0005023380498893266, + "f1": 0.06481438758389294, + "f1_stderr": 0.0014219270919505864, + "acc": 0.4184482994878448, + "acc_stderr": 0.010036365749666909 + } + }, + "versions": { + "all": 0, + "harness|drop|3": 1, + "harness|gsm8k|5": 0, + "harness|winogrande|5": 0 + }, + "config_tasks": { + "harness|drop": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|drop|3": { + "hashes": { + "hash_examples": "1d27416e8324e9a3", + "hash_full_prompts": "a5513ff9a741b385", + "hash_input_tokens": "61b608e0b5ceed76", + "hash_cont_tokens": "aa151a5442caaa4c" + }, + "truncated": 1263, + "non_truncated": 8273, + "padded": 0, + "non_padded": 9536, + "effective_few_shots": 3.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "bda342e47b5099b2", + "hash_cont_tokens": "d29d8996d6c5c880" + }, + "truncated": 0, + "non_truncated": 1319, + "padded": 0, + "non_padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "c0bedf98cb040854", + "hash_cont_tokens": "f08975ad6f2d5864" + }, + "truncated": 0, + "non_truncated": 1267, + "padded": 2432, + "non_padded": 102, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "9b4d8993161e637d", + "hash_full_prompts": "08215e527b7e60a5", + "hash_input_tokens": "80afe720f936f8d2", + "hash_cont_tokens": "d9555a5c55448724" + }, + "truncated": 1263, + "non_truncated": 10859, + "padded": 2432, + "non_padded": 10957, + "num_truncated_few_shots": 0, + "total_evaluation_time_secondes": 0 + } +} \ No newline at end of file diff --git a/eval-results/TheBloke/wizard-vicuna-13B-GPTQ/results_2023-08-21T19-47-38.407396.json b/eval-results/TheBloke/wizard-vicuna-13B-GPTQ/results_2023-08-21T19-47-38.407396.json new file mode 100644 index 
0000000000000000000000000000000000000000..71e3e5b5d283bc1450d3cfd07054de202243ef76 --- /dev/null +++ b/eval-results/TheBloke/wizard-vicuna-13B-GPTQ/results_2023-08-21T19-47-38.407396.json @@ -0,0 +1,1365 @@ +{ + "results": { + "harness|arc:challenge|25": { + "acc": 0.23464163822525597, + "acc_stderr": 0.012383873560768675, + "acc_norm": 0.28668941979522183, + "acc_norm_stderr": 0.01321498632927477 + }, + "harness|hellaswag|10": { + "acc": 0.2555267874925314, + "acc_stderr": 0.004352655263682337, + "acc_norm": 0.25941047600079664, + "acc_norm_stderr": 0.004374153847826758 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.21, + "acc_stderr": 0.040936018074033256, + "acc_norm": 0.21, + "acc_norm_stderr": 0.040936018074033256 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.22962962962962963, + "acc_stderr": 0.03633384414073461, + "acc_norm": 0.22962962962962963, + "acc_norm_stderr": 0.03633384414073461 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.26973684210526316, + "acc_stderr": 0.03611780560284898, + "acc_norm": 0.26973684210526316, + "acc_norm_stderr": 0.03611780560284898 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.18, + "acc_stderr": 0.038612291966536955, + "acc_norm": 0.18, + "acc_norm_stderr": 0.038612291966536955 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.2981132075471698, + "acc_stderr": 0.028152837942493857, + "acc_norm": 0.2981132075471698, + "acc_norm_stderr": 0.028152837942493857 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.2013888888888889, + "acc_stderr": 0.03353647469713839, + "acc_norm": 0.2013888888888889, + "acc_norm_stderr": 0.03353647469713839 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.34, + "acc_stderr": 0.047609522856952365, + "acc_norm": 0.34, + "acc_norm_stderr": 0.047609522856952365 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.3, + "acc_stderr": 0.046056618647183814, + "acc_norm": 0.3, + "acc_norm_stderr": 0.046056618647183814 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.21, + "acc_stderr": 0.040936018074033256, + "acc_norm": 0.21, + "acc_norm_stderr": 0.040936018074033256 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.31213872832369943, + "acc_stderr": 0.035331333893236574, + "acc_norm": 0.31213872832369943, + "acc_norm_stderr": 0.035331333893236574 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.3431372549019608, + "acc_stderr": 0.04724007352383888, + "acc_norm": 0.3431372549019608, + "acc_norm_stderr": 0.04724007352383888 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.19, + "acc_stderr": 0.039427724440366234, + "acc_norm": 0.19, + "acc_norm_stderr": 0.039427724440366234 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.20851063829787234, + "acc_stderr": 0.026556982117838728, + "acc_norm": 0.20851063829787234, + "acc_norm_stderr": 0.026556982117838728 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.24561403508771928, + "acc_stderr": 0.04049339297748142, + "acc_norm": 0.24561403508771928, + "acc_norm_stderr": 0.04049339297748142 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.2413793103448276, + "acc_stderr": 0.03565998174135302, + "acc_norm": 0.2413793103448276, + "acc_norm_stderr": 0.03565998174135302 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.25925925925925924, + "acc_stderr": 0.02256989707491843, + "acc_norm": 0.25925925925925924, + "acc_norm_stderr": 0.02256989707491843 + }, + 
"harness|hendrycksTest-formal_logic|5": { + "acc": 0.35714285714285715, + "acc_stderr": 0.04285714285714281, + "acc_norm": 0.35714285714285715, + "acc_norm_stderr": 0.04285714285714281 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.17, + "acc_stderr": 0.0377525168068637, + "acc_norm": 0.17, + "acc_norm_stderr": 0.0377525168068637 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.27419354838709675, + "acc_stderr": 0.025378139970885193, + "acc_norm": 0.27419354838709675, + "acc_norm_stderr": 0.025378139970885193 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.2857142857142857, + "acc_stderr": 0.0317852971064275, + "acc_norm": 0.2857142857142857, + "acc_norm_stderr": 0.0317852971064275 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.24, + "acc_stderr": 0.04292346959909283, + "acc_norm": 0.24, + "acc_norm_stderr": 0.04292346959909283 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.24242424242424243, + "acc_stderr": 0.03346409881055953, + "acc_norm": 0.24242424242424243, + "acc_norm_stderr": 0.03346409881055953 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.3484848484848485, + "acc_stderr": 0.033948539651564025, + "acc_norm": 0.3484848484848485, + "acc_norm_stderr": 0.033948539651564025 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.33678756476683935, + "acc_stderr": 0.03410780251836183, + "acc_norm": 0.33678756476683935, + "acc_norm_stderr": 0.03410780251836183 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.36153846153846153, + "acc_stderr": 0.024359581465396987, + "acc_norm": 0.36153846153846153, + "acc_norm_stderr": 0.024359581465396987 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.23703703703703705, + "acc_stderr": 0.02592887613276612, + "acc_norm": 0.23703703703703705, + "acc_norm_stderr": 0.02592887613276612 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.35294117647058826, + "acc_stderr": 0.031041941304059288, + "acc_norm": 0.35294117647058826, + "acc_norm_stderr": 0.031041941304059288 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.2251655629139073, + "acc_stderr": 0.03410435282008936, + "acc_norm": 0.2251655629139073, + "acc_norm_stderr": 0.03410435282008936 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.28623853211009176, + "acc_stderr": 0.019379436628919965, + "acc_norm": 0.28623853211009176, + "acc_norm_stderr": 0.019379436628919965 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.375, + "acc_stderr": 0.033016908987210894, + "acc_norm": 0.375, + "acc_norm_stderr": 0.033016908987210894 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.27941176470588236, + "acc_stderr": 0.03149328104507957, + "acc_norm": 0.27941176470588236, + "acc_norm_stderr": 0.03149328104507957 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.2489451476793249, + "acc_stderr": 0.028146970599422644, + "acc_norm": 0.2489451476793249, + "acc_norm_stderr": 0.028146970599422644 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.10762331838565023, + "acc_stderr": 0.020799400082879997, + "acc_norm": 0.10762331838565023, + "acc_norm_stderr": 0.020799400082879997 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.2824427480916031, + "acc_stderr": 0.03948406125768361, + "acc_norm": 0.2824427480916031, + "acc_norm_stderr": 0.03948406125768361 + }, + 
"harness|hendrycksTest-international_law|5": { + "acc": 0.2231404958677686, + "acc_stderr": 0.03800754475228733, + "acc_norm": 0.2231404958677686, + "acc_norm_stderr": 0.03800754475228733 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.2222222222222222, + "acc_stderr": 0.040191074725573483, + "acc_norm": 0.2222222222222222, + "acc_norm_stderr": 0.040191074725573483 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.2331288343558282, + "acc_stderr": 0.033220157957767414, + "acc_norm": 0.2331288343558282, + "acc_norm_stderr": 0.033220157957767414 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.25892857142857145, + "acc_stderr": 0.041577515398656284, + "acc_norm": 0.25892857142857145, + "acc_norm_stderr": 0.041577515398656284 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.3786407766990291, + "acc_stderr": 0.04802694698258972, + "acc_norm": 0.3786407766990291, + "acc_norm_stderr": 0.04802694698258972 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.19658119658119658, + "acc_stderr": 0.02603538609895129, + "acc_norm": 0.19658119658119658, + "acc_norm_stderr": 0.02603538609895129 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.24, + "acc_stderr": 0.04292346959909281, + "acc_norm": 0.24, + "acc_norm_stderr": 0.04292346959909281 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.20434227330779056, + "acc_stderr": 0.0144191239809319, + "acc_norm": 0.20434227330779056, + "acc_norm_stderr": 0.0144191239809319 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.2398843930635838, + "acc_stderr": 0.02298959254312356, + "acc_norm": 0.2398843930635838, + "acc_norm_stderr": 0.02298959254312356 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.29608938547486036, + "acc_stderr": 0.015268677317602265, + "acc_norm": 0.29608938547486036, + "acc_norm_stderr": 0.015268677317602265 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.2875816993464052, + "acc_stderr": 0.025917806117147158, + "acc_norm": 0.2875816993464052, + "acc_norm_stderr": 0.025917806117147158 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.24437299035369775, + "acc_stderr": 0.024406162094668882, + "acc_norm": 0.24437299035369775, + "acc_norm_stderr": 0.024406162094668882 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.22839506172839505, + "acc_stderr": 0.023358211840626267, + "acc_norm": 0.22839506172839505, + "acc_norm_stderr": 0.023358211840626267 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.2553191489361702, + "acc_stderr": 0.026011992930902002, + "acc_norm": 0.2553191489361702, + "acc_norm_stderr": 0.026011992930902002 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.2627118644067797, + "acc_stderr": 0.011240545514995676, + "acc_norm": 0.2627118644067797, + "acc_norm_stderr": 0.011240545514995676 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.2536764705882353, + "acc_stderr": 0.026431329870789555, + "acc_norm": 0.2536764705882353, + "acc_norm_stderr": 0.026431329870789555 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.24673202614379086, + "acc_stderr": 0.0174408203674025, + "acc_norm": 0.24673202614379086, + "acc_norm_stderr": 0.0174408203674025 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.24545454545454545, + "acc_stderr": 0.041220665028782834, + "acc_norm": 0.24545454545454545, + "acc_norm_stderr": 0.041220665028782834 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.2653061224489796, + 
"acc_stderr": 0.028263889943784603, + "acc_norm": 0.2653061224489796, + "acc_norm_stderr": 0.028263889943784603 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.263681592039801, + "acc_stderr": 0.03115715086935556, + "acc_norm": 0.263681592039801, + "acc_norm_stderr": 0.03115715086935556 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.26, + "acc_stderr": 0.04408440022768078, + "acc_norm": 0.26, + "acc_norm_stderr": 0.04408440022768078 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.19879518072289157, + "acc_stderr": 0.031069390260789437, + "acc_norm": 0.19879518072289157, + "acc_norm_stderr": 0.031069390260789437 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.17543859649122806, + "acc_stderr": 0.029170885500727654, + "acc_norm": 0.17543859649122806, + "acc_norm_stderr": 0.029170885500727654 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.23745410036719705, + "mc1_stderr": 0.014896277441041866, + "mc2": 0.48529808032699134, + "mc2_stderr": 0.016985401750509065 + }, + "all": { + "acc": 0.2579761146376028, + "acc_stderr": 0.031605286511255506, + "acc_norm": 0.2589241056558779, + "acc_norm_stderr": 0.03161973755113095, + "mc1": 0.23745410036719705, + "mc1_stderr": 0.014896277441041866, + "mc2": 0.48529808032699134, + "mc2_stderr": 0.016985401750509065 + } + }, + "versions": { + "harness|arc:challenge|25": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + 
"harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "all": 0 + }, + "config_general": { + "model_name": "TheBloke/wizard-vicuna-13B-GPTQ", + "model_sha": "936a51c0219744d7a9598d0c65a7d18e01660601", + "model_dtype": "torch.float16", + "lighteval_sha": "9a6ba3212080b87510982c30fdec55b87dcab0c7", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null + }, + "config_tasks": { + "harness|arc:challenge": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + 
"harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task" + }, + "summary_tasks": { + "harness|arc:challenge|25": { + "hashes": { + "hash_examples": "17b0cae357c0259e", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "61571bf68d6d89aa", + "hash_cont_tokens": "8210decc6ff6f7df" + }, + "truncated": 0, + "non-truncated": 4687, + "padded": 4687, + "non-padded": 0, + "effective_few_shots": 25.0, + "num_truncated_few_shots": 0 + }, + "harness|hellaswag|10": { + "hashes": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "29906669b1c7054a", + "hash_cont_tokens": "b3b9e9017afa63af" + }, + "truncated": 0, + "non-truncated": 40168, + "padded": 40113, + "non-padded": 55, + "effective_few_shots": 10.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hashes": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "c54ff61ad0273dd7", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-anatomy|5": { + "hashes": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "be31a1e22aef5f90", + "hash_cont_tokens": "f11971a765cb609f" + }, + "truncated": 0, + "non-truncated": 540, + "padded": 540, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-astronomy|5": { + "hashes": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "277a7b1fad566940", + "hash_cont_tokens": "bf30e5d3f48250cb" + }, + "truncated": 0, + "non-truncated": 608, + "padded": 608, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-business_ethics|5": { + "hashes": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "ba552605bc116de5", + "hash_cont_tokens": "bc1dd9b2d995eb61" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + 
"non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hashes": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "428c7563d0b98ab9", + "hash_cont_tokens": "890a119624b3b935" + }, + "truncated": 0, + "non-truncated": 1060, + "padded": 1060, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_biology|5": { + "hashes": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "da036601573942e2", + "hash_cont_tokens": "875cde3af7a0ee14" + }, + "truncated": 0, + "non-truncated": 576, + "padded": 576, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_chemistry|5": { + "hashes": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "94e0196d6aded13d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_computer_science|5": { + "hashes": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "6e4d0f4a8d36690b", + "hash_cont_tokens": "ffc0fe414cdc4a83" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_mathematics|5": { + "hashes": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "614054d17109a25d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_medicine|5": { + "hashes": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "1d633b3cc0524ba8", + "hash_cont_tokens": "1f88b00d41957d82" + }, + "truncated": 0, + "non-truncated": 692, + "padded": 692, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_physics|5": { + "hashes": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "5421d9a1af86cbd4", + "hash_cont_tokens": "f7b8097afc16a47c" + }, + "truncated": 0, + "non-truncated": 408, + "padded": 408, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-computer_security|5": { + "hashes": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "5e6b70ecb333cf18", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hashes": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "c2ef11a87264ceed", + "hash_cont_tokens": "aa0e8bc655f2f641" + }, + "truncated": 0, + "non-truncated": 940, + "padded": 940, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-econometrics|5": { + "hashes": { + 
"hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "ecaccd912a4c3978", + "hash_cont_tokens": "bfb7e3c3c88313f1" + }, + "truncated": 0, + "non-truncated": 456, + "padded": 456, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hashes": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "1590c84291399be8", + "hash_cont_tokens": "2425a3f084a591ef" + }, + "truncated": 0, + "non-truncated": 580, + "padded": 580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hashes": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "3269597f715b0da1", + "hash_cont_tokens": "f52691aef15a407b" + }, + "truncated": 0, + "non-truncated": 1512, + "padded": 1512, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-formal_logic|5": { + "hashes": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "a2800d20f3ab8d7c", + "hash_cont_tokens": "f515d598d9c21263" + }, + "truncated": 0, + "non-truncated": 504, + "padded": 504, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-global_facts|5": { + "hashes": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "94ed44b3772505ad", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_biology|5": { + "hashes": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "24423acb928db768", + "hash_cont_tokens": "bd85a4156a3613ee" + }, + "truncated": 0, + "non-truncated": 1240, + "padded": 1240, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hashes": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "831ff35c474e5cef", + "hash_cont_tokens": "a95c97af1c14e068" + }, + "truncated": 0, + "non-truncated": 812, + "padded": 812, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hashes": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "8c34e0f2bda77358", + "hash_cont_tokens": "8abfedef914e33c9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hashes": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "f1f73dd687da18d7", + "hash_cont_tokens": "674fc454bdc5ac93" + }, + "truncated": 660, + "non-truncated": 0, + "padded": 0, + "non-padded": 660, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_geography|5": { + "hashes": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": 
"7c5547c7da5bc793", + "hash_cont_tokens": "03a5012b916274ea" + }, + "truncated": 0, + "non-truncated": 792, + "padded": 792, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hashes": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "f62991cb6a496b05", + "hash_cont_tokens": "a83effb8f76b7d7c" + }, + "truncated": 0, + "non-truncated": 772, + "padded": 772, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hashes": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "4cef2aff6e3d59ed", + "hash_cont_tokens": "c583432ad27fcfe0" + }, + "truncated": 0, + "non-truncated": 1560, + "padded": 1560, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hashes": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "6e2577ea4082ed2b", + "hash_cont_tokens": "24f5dc613660300b" + }, + "truncated": 0, + "non-truncated": 1080, + "padded": 1080, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hashes": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "c5fc9aeb1079c8e4", + "hash_cont_tokens": "f47f041de50333b9" + }, + "truncated": 0, + "non-truncated": 952, + "padded": 952, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_physics|5": { + "hashes": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "555fc385cffa84ca", + "hash_cont_tokens": "ba2efcd283e938cc" + }, + "truncated": 0, + "non-truncated": 604, + "padded": 604, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hashes": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "febd23cbf9973b7f", + "hash_cont_tokens": "942069cd363844d9" + }, + "truncated": 0, + "non-truncated": 2180, + "padded": 2180, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hashes": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "424b02981230ee83", + "hash_cont_tokens": "955ed42b6f7fa019" + }, + "truncated": 0, + "non-truncated": 864, + "padded": 864, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hashes": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "50c9ff438c85a69e", + "hash_cont_tokens": "cdd0b3dc06d933e5" + }, + "truncated": 816, + "non-truncated": 0, + "padded": 0, + "non-padded": 816, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hashes": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "054824cc474caef5", + "hash_cont_tokens": "9a864184946033ac" + }, + 
"truncated": 8, + "non-truncated": 940, + "padded": 940, + "non-padded": 8, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_aging|5": { + "hashes": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "541a75f071dcf579", + "hash_cont_tokens": "142a4a8a1138a214" + }, + "truncated": 0, + "non-truncated": 892, + "padded": 892, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_sexuality|5": { + "hashes": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "04269e5c5a257dd9", + "hash_cont_tokens": "bc54813e809b796d" + }, + "truncated": 0, + "non-truncated": 524, + "padded": 524, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-international_law|5": { + "hashes": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "d93ba9d9d38e4397", + "hash_cont_tokens": "dc45b45fcda18e5d" + }, + "truncated": 0, + "non-truncated": 484, + "padded": 484, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-jurisprudence|5": { + "hashes": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "9eeaccd2698b4f5a", + "hash_cont_tokens": "e3a8cd951b6e3469" + }, + "truncated": 0, + "non-truncated": 432, + "padded": 432, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hashes": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "b4f08f544f2b7576", + "hash_cont_tokens": "1e80dbd30f6453d5" + }, + "truncated": 0, + "non-truncated": 652, + "padded": 648, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-machine_learning|5": { + "hashes": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "900c2a51f1174b9f", + "hash_cont_tokens": "9b37da7777378ca9" + }, + "truncated": 0, + "non-truncated": 448, + "padded": 448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-management|5": { + "hashes": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "6b36efb4689c6eca", + "hash_cont_tokens": "a01d6d39a83c4597" + }, + "truncated": 0, + "non-truncated": 412, + "padded": 412, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-marketing|5": { + "hashes": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "2aaac78a0cfed47a", + "hash_cont_tokens": "6aeaed4d823c98aa" + }, + "truncated": 0, + "non-truncated": 936, + "padded": 936, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-medical_genetics|5": { + "hashes": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "886ca823b41c094a", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + 
"harness|hendrycksTest-miscellaneous|5": { + "hashes": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "72fd71de7675e7d0", + "hash_cont_tokens": "9b0ab02a64603081" + }, + "truncated": 0, + "non-truncated": 3132, + "padded": 3132, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_disputes|5": { + "hashes": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "f3ca0dd8e7a1eb09", + "hash_cont_tokens": "8badf768f7b0467a" + }, + "truncated": 0, + "non-truncated": 1384, + "padded": 1354, + "non-padded": 30, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hashes": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "3e793631e951f23c", + "hash_cont_tokens": "32ae620376b2bbba" + }, + "truncated": 0, + "non-truncated": 3580, + "padded": 3580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-nutrition|5": { + "hashes": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "59753c2144ea93af", + "hash_cont_tokens": "3071def75bacc404" + }, + "truncated": 0, + "non-truncated": 1224, + "padded": 1224, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-philosophy|5": { + "hashes": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "bd8d3dbed15a8c34", + "hash_cont_tokens": "9f6ff69d23a48783" + }, + "truncated": 0, + "non-truncated": 1244, + "padded": 1244, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-prehistory|5": { + "hashes": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "3573cd87facbb7c5", + "hash_cont_tokens": "de469d2b981e32a3" + }, + "truncated": 0, + "non-truncated": 1296, + "padded": 1296, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_accounting|5": { + "hashes": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "17e721bc1a7cbb47", + "hash_cont_tokens": "c46f74d2dfc7b13b" + }, + "truncated": 0, + "non-truncated": 1128, + "padded": 1128, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_law|5": { + "hashes": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "9178e10bd0763ec4", + "hash_cont_tokens": "2e590029ef41fbcd" + }, + "truncated": 604, + "non-truncated": 5532, + "padded": 5524, + "non-padded": 612, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_medicine|5": { + "hashes": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "f5a22012a54f70ea", + "hash_cont_tokens": "fe35cfa9c6ca802e" + }, + "truncated": 0, + "non-truncated": 1088, + "padded": 1088, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_psychology|5": { + "hashes": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + 
"hash_input_tokens": "0dfb73a8eb3f692c", + "hash_cont_tokens": "f020fbddf72c8652" + }, + "truncated": 0, + "non-truncated": 2448, + "padded": 2448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-public_relations|5": { + "hashes": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "1710c6ba4c9f3cbd", + "hash_cont_tokens": "568f585a259965c1" + }, + "truncated": 0, + "non-truncated": 440, + "padded": 440, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-security_studies|5": { + "hashes": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "d49711415961ced7", + "hash_cont_tokens": "cc6fd7cccd64cd5d" + }, + "truncated": 0, + "non-truncated": 980, + "padded": 980, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-sociology|5": { + "hashes": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "828999f7624cbe7e", + "hash_cont_tokens": "c3a3bdfd177eed5b" + }, + "truncated": 0, + "non-truncated": 804, + "padded": 804, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hashes": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "42054621e718dbee", + "hash_cont_tokens": "2568d0e8e36fa959" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-virology|5": { + "hashes": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "6c4f0aa4dc859c04", + "hash_cont_tokens": "926cf60b0891f374" + }, + "truncated": 0, + "non-truncated": 664, + "padded": 664, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-world_religions|5": { + "hashes": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "6c75d44e092ff24f", + "hash_cont_tokens": "c525a5de974c1ea3" + }, + "truncated": 0, + "non-truncated": 684, + "padded": 684, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|truthfulqa:mc|0": { + "hashes": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "2738d7ed7075faa7", + "hash_cont_tokens": "c014154380b74b9e" + }, + "truncated": 0, + "non-truncated": 9996, + "padded": 9996, + "non-padded": 0, + "effective_few_shots": 0.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "d84d18e9a963753d", + "hash_full_prompts": "12b540783521a8e6", + "hash_input_tokens": "6fecf578c508db6a", + "hash_cont_tokens": "fb1646e2bdd5fc38" + }, + "total_evaluation_time_secondes": "4687.057464361191", + "truncated": 2088, + "non-truncated": 108931, + "padded": 108834, + "non-padded": 2185, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/TheBloke/wizard-vicuna-13B-GPTQ/results_2023-11-05T07-12-56.494554.json b/eval-results/TheBloke/wizard-vicuna-13B-GPTQ/results_2023-11-05T07-12-56.494554.json new file mode 100644 index 
0000000000000000000000000000000000000000..8f651de1fd7324bb3ea5a40871ee9914f4ca04bb --- /dev/null +++ b/eval-results/TheBloke/wizard-vicuna-13B-GPTQ/results_2023-11-05T07-12-56.494554.json @@ -0,0 +1,107 @@ +{ + "config_general": { + "lighteval_sha": "167773f1d5d1647c60dadc31c9e731ab7dbcbbad", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "", + "model_name": "TheBloke/wizard-vicuna-13B-GPTQ", + "model_sha": "07f9445364346147f7d27efef9de49739e87fdac", + "model_dtype": "torch.float16", + "model_size": "6.8 GB" + }, + "results": { + "harness|drop|3": { + "em": 0.045197147651006714, + "em_stderr": 0.0021274140301580503, + "f1": 0.11252307046979877, + "f1_stderr": 0.0025015697880602372 + }, + "harness|gsm8k|5": { + "acc": 0.09552691432903715, + "acc_stderr": 0.00809660577115573 + }, + "harness|winogrande|5": { + "acc": 0.7474348855564326, + "acc_stderr": 0.012211148449394105 + }, + "all": { + "em": 0.045197147651006714, + "em_stderr": 0.0021274140301580503, + "f1": 0.11252307046979877, + "f1_stderr": 0.0025015697880602372, + "acc": 0.42148089994273485, + "acc_stderr": 0.010153877110274916 + } + }, + "versions": { + "all": 0, + "harness|drop|3": 1, + "harness|gsm8k|5": 0, + "harness|winogrande|5": 0 + }, + "config_tasks": { + "harness|drop": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|drop|3": { + "hashes": { + "hash_examples": "1d27416e8324e9a3", + "hash_full_prompts": "a5513ff9a741b385", + "hash_input_tokens": "61b608e0b5ceed76", + "hash_cont_tokens": "55dd8405cbad457e" + }, + "truncated": 1263, + "non_truncated": 8273, + "padded": 0, + "non_padded": 9536, + "effective_few_shots": 3.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "bda342e47b5099b2", + "hash_cont_tokens": "ad111b9fa78f8922" + }, + "truncated": 0, + "non_truncated": 1319, + "padded": 0, + "non_padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "c0bedf98cb040854", + "hash_cont_tokens": "f08975ad6f2d5864" + }, + "truncated": 0, + "non_truncated": 1267, + "padded": 2432, + "non_padded": 102, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "9b4d8993161e637d", + "hash_full_prompts": "08215e527b7e60a5", + "hash_input_tokens": "80afe720f936f8d2", + "hash_cont_tokens": "ca7ffef20efb7719" + }, + "truncated": 1263, + "non_truncated": 10859, + "padded": 2432, + "non_padded": 10957, + "num_truncated_few_shots": 0, + "total_evaluation_time_secondes": 0 + } +} \ No newline at end of file diff --git a/eval-results/TheBloke/wizard-vicuna-13B-GPTQ/results_2023-11-07T21-40-44.837005.json b/eval-results/TheBloke/wizard-vicuna-13B-GPTQ/results_2023-11-07T21-40-44.837005.json new file mode 100644 index 0000000000000000000000000000000000000000..524efa5b50b45c46190d83326f19107686ff0428 --- /dev/null +++ b/eval-results/TheBloke/wizard-vicuna-13B-GPTQ/results_2023-11-07T21-40-44.837005.json @@ -0,0 +1,107 @@ +{ + "config_general": { + "lighteval_sha": "167773f1d5d1647c60dadc31c9e731ab7dbcbbad", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": 
"", + "model_name": "TheBloke/wizard-vicuna-13B-GPTQ", + "model_sha": "07f9445364346147f7d27efef9de49739e87fdac", + "model_dtype": "torch.float16", + "model_size": "6.8 GB" + }, + "results": { + "harness|drop|3": { + "em": 0.04488255033557047, + "em_stderr": 0.0021203463374070692, + "f1": 0.11209521812080547, + "f1_stderr": 0.002495000900110754 + }, + "harness|gsm8k|5": { + "acc": 0.09628506444275967, + "acc_stderr": 0.008125264128215884 + }, + "harness|winogrande|5": { + "acc": 0.7474348855564326, + "acc_stderr": 0.012211148449394105 + }, + "all": { + "em": 0.04488255033557047, + "em_stderr": 0.0021203463374070692, + "f1": 0.11209521812080547, + "f1_stderr": 0.002495000900110754, + "acc": 0.4218599749995961, + "acc_stderr": 0.010168206288804995 + } + }, + "versions": { + "all": 0, + "harness|drop|3": 1, + "harness|gsm8k|5": 0, + "harness|winogrande|5": 0 + }, + "config_tasks": { + "harness|drop": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|drop|3": { + "hashes": { + "hash_examples": "1d27416e8324e9a3", + "hash_full_prompts": "a5513ff9a741b385", + "hash_input_tokens": "61b608e0b5ceed76", + "hash_cont_tokens": "c0c760e5bc8fccd3" + }, + "truncated": 1263, + "non_truncated": 8273, + "padded": 0, + "non_padded": 9536, + "effective_few_shots": 3.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "bda342e47b5099b2", + "hash_cont_tokens": "f9403f92909e8197" + }, + "truncated": 0, + "non_truncated": 1319, + "padded": 0, + "non_padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "c0bedf98cb040854", + "hash_cont_tokens": "f08975ad6f2d5864" + }, + "truncated": 0, + "non_truncated": 1267, + "padded": 2432, + "non_padded": 102, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "9b4d8993161e637d", + "hash_full_prompts": "08215e527b7e60a5", + "hash_input_tokens": "80afe720f936f8d2", + "hash_cont_tokens": "5067396759fac786" + }, + "truncated": 1263, + "non_truncated": 10859, + "padded": 2432, + "non_padded": 10957, + "num_truncated_few_shots": 0, + "total_evaluation_time_secondes": 0 + } +} \ No newline at end of file diff --git a/eval-results/TheBloke/wizard-vicuna-13B-HF/results_2023-07-18T15-41-31.806863.json b/eval-results/TheBloke/wizard-vicuna-13B-HF/results_2023-07-18T15-41-31.806863.json new file mode 100644 index 0000000000000000000000000000000000000000..f1201c0eec95ef5f16bad3526f00cdc33b867776 --- /dev/null +++ b/eval-results/TheBloke/wizard-vicuna-13B-HF/results_2023-07-18T15-41-31.806863.json @@ -0,0 +1,871 @@ +{ + "results": { + "harness|arc:challenge|25": { + "acc": 0.5187713310580204, + "acc_stderr": 0.014601090150633964, + "acc_norm": 0.5469283276450512, + "acc_norm_stderr": 0.01454689205200563 + }, + "harness|hellaswag|10": { + "acc": 0.5871340370444135, + "acc_stderr": 0.0049134290105590705, + "acc_norm": 0.7917745469030074, + "acc_norm_stderr": 0.0040520910240415785 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.32, + "acc_stderr": 0.046882617226215034, + "acc_norm": 0.32, + "acc_norm_stderr": 0.046882617226215034 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.4074074074074074, + "acc_stderr": 
0.04244633238353229, + "acc_norm": 0.4074074074074074, + "acc_norm_stderr": 0.04244633238353229 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.47368421052631576, + "acc_stderr": 0.04063302731486671, + "acc_norm": 0.47368421052631576, + "acc_norm_stderr": 0.04063302731486671 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.5, + "acc_stderr": 0.050251890762960605, + "acc_norm": 0.5, + "acc_norm_stderr": 0.050251890762960605 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.4716981132075472, + "acc_stderr": 0.0307235352490061, + "acc_norm": 0.4716981132075472, + "acc_norm_stderr": 0.0307235352490061 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.4861111111111111, + "acc_stderr": 0.041795966175810016, + "acc_norm": 0.4861111111111111, + "acc_norm_stderr": 0.041795966175810016 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.33, + "acc_stderr": 0.04725815626252604, + "acc_norm": 0.33, + "acc_norm_stderr": 0.04725815626252604 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.46, + "acc_stderr": 0.05009082659620332, + "acc_norm": 0.46, + "acc_norm_stderr": 0.05009082659620332 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.31, + "acc_stderr": 0.04648231987117316, + "acc_norm": 0.31, + "acc_norm_stderr": 0.04648231987117316 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.3583815028901734, + "acc_stderr": 0.036563436533531585, + "acc_norm": 0.3583815028901734, + "acc_norm_stderr": 0.036563436533531585 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.17647058823529413, + "acc_stderr": 0.0379328118530781, + "acc_norm": 0.17647058823529413, + "acc_norm_stderr": 0.0379328118530781 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.61, + "acc_stderr": 0.04902071300001975, + "acc_norm": 0.61, + "acc_norm_stderr": 0.04902071300001975 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.3829787234042553, + "acc_stderr": 0.03177821250236922, + "acc_norm": 0.3829787234042553, + "acc_norm_stderr": 0.03177821250236922 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.2807017543859649, + "acc_stderr": 0.04227054451232199, + "acc_norm": 0.2807017543859649, + "acc_norm_stderr": 0.04227054451232199 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.4068965517241379, + "acc_stderr": 0.04093793981266237, + "acc_norm": 0.4068965517241379, + "acc_norm_stderr": 0.04093793981266237 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.2751322751322751, + "acc_stderr": 0.023000086859068642, + "acc_norm": 0.2751322751322751, + "acc_norm_stderr": 0.023000086859068642 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.38095238095238093, + "acc_stderr": 0.04343525428949098, + "acc_norm": 0.38095238095238093, + "acc_norm_stderr": 0.04343525428949098 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.3, + "acc_stderr": 0.046056618647183814, + "acc_norm": 0.3, + "acc_norm_stderr": 0.046056618647183814 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.5225806451612903, + "acc_stderr": 0.02841498501970786, + "acc_norm": 0.5225806451612903, + "acc_norm_stderr": 0.02841498501970786 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.35960591133004927, + "acc_stderr": 0.03376458246509568, + "acc_norm": 0.35960591133004927, + "acc_norm_stderr": 0.03376458246509568 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.52, + "acc_stderr": 
0.050211673156867795, + "acc_norm": 0.52, + "acc_norm_stderr": 0.050211673156867795 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.6545454545454545, + "acc_stderr": 0.037131580674819135, + "acc_norm": 0.6545454545454545, + "acc_norm_stderr": 0.037131580674819135 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.6161616161616161, + "acc_stderr": 0.03464881675016339, + "acc_norm": 0.6161616161616161, + "acc_norm_stderr": 0.03464881675016339 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.6839378238341969, + "acc_stderr": 0.03355397369686173, + "acc_norm": 0.6839378238341969, + "acc_norm_stderr": 0.03355397369686173 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.4153846153846154, + "acc_stderr": 0.024985354923102332, + "acc_norm": 0.4153846153846154, + "acc_norm_stderr": 0.024985354923102332 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.23703703703703705, + "acc_stderr": 0.025928876132766114, + "acc_norm": 0.23703703703703705, + "acc_norm_stderr": 0.025928876132766114 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.4327731092436975, + "acc_stderr": 0.03218358107742613, + "acc_norm": 0.4327731092436975, + "acc_norm_stderr": 0.03218358107742613 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.304635761589404, + "acc_stderr": 0.03757949922943342, + "acc_norm": 0.304635761589404, + "acc_norm_stderr": 0.03757949922943342 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.6495412844036698, + "acc_stderr": 0.020456077599824464, + "acc_norm": 0.6495412844036698, + "acc_norm_stderr": 0.020456077599824464 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.3101851851851852, + "acc_stderr": 0.031546962856566295, + "acc_norm": 0.3101851851851852, + "acc_norm_stderr": 0.031546962856566295 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.6764705882352942, + "acc_stderr": 0.0328347205610856, + "acc_norm": 0.6764705882352942, + "acc_norm_stderr": 0.0328347205610856 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.6708860759493671, + "acc_stderr": 0.030587326294702365, + "acc_norm": 0.6708860759493671, + "acc_norm_stderr": 0.030587326294702365 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.5964125560538116, + "acc_stderr": 0.032928028193303135, + "acc_norm": 0.5964125560538116, + "acc_norm_stderr": 0.032928028193303135 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.6259541984732825, + "acc_stderr": 0.04243869242230524, + "acc_norm": 0.6259541984732825, + "acc_norm_stderr": 0.04243869242230524 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.6363636363636364, + "acc_stderr": 0.043913262867240704, + "acc_norm": 0.6363636363636364, + "acc_norm_stderr": 0.043913262867240704 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.5925925925925926, + "acc_stderr": 0.04750077341199985, + "acc_norm": 0.5925925925925926, + "acc_norm_stderr": 0.04750077341199985 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.6012269938650306, + "acc_stderr": 0.03847021420456023, + "acc_norm": 0.6012269938650306, + "acc_norm_stderr": 0.03847021420456023 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.36607142857142855, + "acc_stderr": 0.045723723587374296, + "acc_norm": 0.36607142857142855, + "acc_norm_stderr": 0.045723723587374296 + }, + "harness|hendrycksTest-management|5": { + "acc": 
0.6601941747572816, + "acc_stderr": 0.04689765937278134, + "acc_norm": 0.6601941747572816, + "acc_norm_stderr": 0.04689765937278134 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.717948717948718, + "acc_stderr": 0.029480360549541194, + "acc_norm": 0.717948717948718, + "acc_norm_stderr": 0.029480360549541194 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.59, + "acc_stderr": 0.049431107042371025, + "acc_norm": 0.59, + "acc_norm_stderr": 0.049431107042371025 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.6756066411238825, + "acc_stderr": 0.016740929047162696, + "acc_norm": 0.6756066411238825, + "acc_norm_stderr": 0.016740929047162696 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.5317919075144508, + "acc_stderr": 0.02686462436675665, + "acc_norm": 0.5317919075144508, + "acc_norm_stderr": 0.02686462436675665 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.3452513966480447, + "acc_stderr": 0.015901432608930354, + "acc_norm": 0.3452513966480447, + "acc_norm_stderr": 0.015901432608930354 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.5555555555555556, + "acc_stderr": 0.02845263998508801, + "acc_norm": 0.5555555555555556, + "acc_norm_stderr": 0.02845263998508801 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.49517684887459806, + "acc_stderr": 0.02839677044411129, + "acc_norm": 0.49517684887459806, + "acc_norm_stderr": 0.02839677044411129 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.5246913580246914, + "acc_stderr": 0.02778680093142745, + "acc_norm": 0.5246913580246914, + "acc_norm_stderr": 0.02778680093142745 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.34397163120567376, + "acc_stderr": 0.028338017428611317, + "acc_norm": 0.34397163120567376, + "acc_norm_stderr": 0.028338017428611317 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.408735332464146, + "acc_stderr": 0.012555701346703373, + "acc_norm": 0.408735332464146, + "acc_norm_stderr": 0.012555701346703373 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.4338235294117647, + "acc_stderr": 0.030105636570016626, + "acc_norm": 0.4338235294117647, + "acc_norm_stderr": 0.030105636570016626 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.5, + "acc_stderr": 0.020227834851568375, + "acc_norm": 0.5, + "acc_norm_stderr": 0.020227834851568375 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.5181818181818182, + "acc_stderr": 0.04785964010794915, + "acc_norm": 0.5181818181818182, + "acc_norm_stderr": 0.04785964010794915 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.5673469387755102, + "acc_stderr": 0.031717528240626645, + "acc_norm": 0.5673469387755102, + "acc_norm_stderr": 0.031717528240626645 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.7263681592039801, + "acc_stderr": 0.03152439186555402, + "acc_norm": 0.7263681592039801, + "acc_norm_stderr": 0.03152439186555402 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.72, + "acc_stderr": 0.04512608598542128, + "acc_norm": 0.72, + "acc_norm_stderr": 0.04512608598542128 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.45180722891566266, + "acc_stderr": 0.03874371556587953, + "acc_norm": 0.45180722891566266, + "acc_norm_stderr": 0.03874371556587953 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.6900584795321637, + "acc_stderr": 0.035469769593931624, + "acc_norm": 0.6900584795321637, + "acc_norm_stderr": 0.035469769593931624 + }, + 
"harness|truthfulqa:mc|0": { + "mc1": 0.35006119951040393, + "mc1_stderr": 0.01669794942015103, + "mc2": 0.49619544657387094, + "mc2_stderr": 0.01573558851923109 + }, + "all": { + "acc": 0.49093552913894745, + "acc_stderr": 0.034974545085472035, + "acc_norm": 0.4948812495871784, + "acc_norm_stderr": 0.03495902752470686, + "mc1": 0.35006119951040393, + "mc1_stderr": 0.01669794942015103, + "mc2": 0.49619544657387094, + "mc2_stderr": 0.01573558851923109 + } + }, + "versions": { + "harness|arc:challenge|25": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "all": 0 + }, + "config": { + "model_name": "TheBloke/wizard-vicuna-13B-HF", + "model_sha": 
"12dc8aacb474522ae2a83c18cb0fdf0907987f8f", + "model_dtype": "torch.float16", + "lighteval_sha": "43cff840721bd0214adb4e29236a5e2ca1813937", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null + }, + "task_config": { + "harness|arc:challenge": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + 
"harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task" + }, + "hashes": { + "harness|arc:challenge|25": { + "hash_examples": "fb8c51b1872daeda", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "61571bf68d6d89aa", + "hash_cont_tokens": "8210decc6ff6f7df" + }, + "harness|hellaswag|10": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "29906669b1c7054a", + "hash_cont_tokens": "b3b9e9017afa63af" + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "c54ff61ad0273dd7", + "hash_cont_tokens": "50421e30bef398f9" + }, + "harness|hendrycksTest-anatomy|5": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "be31a1e22aef5f90", + "hash_cont_tokens": "f11971a765cb609f" + }, + "harness|hendrycksTest-astronomy|5": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "277a7b1fad566940", + "hash_cont_tokens": "bf30e5d3f48250cb" + }, + "harness|hendrycksTest-business_ethics|5": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "ba552605bc116de5", + "hash_cont_tokens": "bc1dd9b2d995eb61" + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "428c7563d0b98ab9", + "hash_cont_tokens": "890a119624b3b935" + }, + "harness|hendrycksTest-college_biology|5": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "da036601573942e2", + "hash_cont_tokens": "875cde3af7a0ee14" + }, + "harness|hendrycksTest-college_chemistry|5": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "94e0196d6aded13d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "harness|hendrycksTest-college_computer_science|5": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "6e4d0f4a8d36690b", + "hash_cont_tokens": "ffc0fe414cdc4a83" + }, + "harness|hendrycksTest-college_mathematics|5": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "614054d17109a25d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "harness|hendrycksTest-college_medicine|5": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "1d633b3cc0524ba8", + "hash_cont_tokens": "1f88b00d41957d82" + }, + "harness|hendrycksTest-college_physics|5": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "5421d9a1af86cbd4", + "hash_cont_tokens": "f7b8097afc16a47c" + }, + "harness|hendrycksTest-computer_security|5": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "5e6b70ecb333cf18", + 
"hash_cont_tokens": "50421e30bef398f9" + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "c2ef11a87264ceed", + "hash_cont_tokens": "aa0e8bc655f2f641" + }, + "harness|hendrycksTest-econometrics|5": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "ecaccd912a4c3978", + "hash_cont_tokens": "bfb7e3c3c88313f1" + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "1590c84291399be8", + "hash_cont_tokens": "2425a3f084a591ef" + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "3269597f715b0da1", + "hash_cont_tokens": "f52691aef15a407b" + }, + "harness|hendrycksTest-formal_logic|5": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "a2800d20f3ab8d7c", + "hash_cont_tokens": "f515d598d9c21263" + }, + "harness|hendrycksTest-global_facts|5": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "94ed44b3772505ad", + "hash_cont_tokens": "50421e30bef398f9" + }, + "harness|hendrycksTest-high_school_biology|5": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "24423acb928db768", + "hash_cont_tokens": "bd85a4156a3613ee" + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "831ff35c474e5cef", + "hash_cont_tokens": "a95c97af1c14e068" + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "8c34e0f2bda77358", + "hash_cont_tokens": "8abfedef914e33c9" + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "f1f73dd687da18d7", + "hash_cont_tokens": "674fc454bdc5ac93" + }, + "harness|hendrycksTest-high_school_geography|5": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "7c5547c7da5bc793", + "hash_cont_tokens": "03a5012b916274ea" + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "f62991cb6a496b05", + "hash_cont_tokens": "a83effb8f76b7d7c" + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "4cef2aff6e3d59ed", + "hash_cont_tokens": "c583432ad27fcfe0" + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "6e2577ea4082ed2b", + "hash_cont_tokens": "24f5dc613660300b" + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "c5fc9aeb1079c8e4", + "hash_cont_tokens": "f47f041de50333b9" + }, + "harness|hendrycksTest-high_school_physics|5": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + 
"hash_input_tokens": "555fc385cffa84ca", + "hash_cont_tokens": "ba2efcd283e938cc" + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "febd23cbf9973b7f", + "hash_cont_tokens": "942069cd363844d9" + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "424b02981230ee83", + "hash_cont_tokens": "955ed42b6f7fa019" + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "50c9ff438c85a69e", + "hash_cont_tokens": "cdd0b3dc06d933e5" + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "054824cc474caef5", + "hash_cont_tokens": "9a864184946033ac" + }, + "harness|hendrycksTest-human_aging|5": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "541a75f071dcf579", + "hash_cont_tokens": "142a4a8a1138a214" + }, + "harness|hendrycksTest-human_sexuality|5": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "04269e5c5a257dd9", + "hash_cont_tokens": "bc54813e809b796d" + }, + "harness|hendrycksTest-international_law|5": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "d93ba9d9d38e4397", + "hash_cont_tokens": "dc45b45fcda18e5d" + }, + "harness|hendrycksTest-jurisprudence|5": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "9eeaccd2698b4f5a", + "hash_cont_tokens": "e3a8cd951b6e3469" + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "b4f08f544f2b7576", + "hash_cont_tokens": "1e80dbd30f6453d5" + }, + "harness|hendrycksTest-machine_learning|5": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "900c2a51f1174b9f", + "hash_cont_tokens": "9b37da7777378ca9" + }, + "harness|hendrycksTest-management|5": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "6b36efb4689c6eca", + "hash_cont_tokens": "a01d6d39a83c4597" + }, + "harness|hendrycksTest-marketing|5": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "2aaac78a0cfed47a", + "hash_cont_tokens": "6aeaed4d823c98aa" + }, + "harness|hendrycksTest-medical_genetics|5": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "886ca823b41c094a", + "hash_cont_tokens": "50421e30bef398f9" + }, + "harness|hendrycksTest-miscellaneous|5": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "72fd71de7675e7d0", + "hash_cont_tokens": "9b0ab02a64603081" + }, + "harness|hendrycksTest-moral_disputes|5": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "f3ca0dd8e7a1eb09", + "hash_cont_tokens": "8badf768f7b0467a" + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "3e793631e951f23c", + 
"hash_cont_tokens": "32ae620376b2bbba" + }, + "harness|hendrycksTest-nutrition|5": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "59753c2144ea93af", + "hash_cont_tokens": "3071def75bacc404" + }, + "harness|hendrycksTest-philosophy|5": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "bd8d3dbed15a8c34", + "hash_cont_tokens": "9f6ff69d23a48783" + }, + "harness|hendrycksTest-prehistory|5": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "3573cd87facbb7c5", + "hash_cont_tokens": "de469d2b981e32a3" + }, + "harness|hendrycksTest-professional_accounting|5": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "17e721bc1a7cbb47", + "hash_cont_tokens": "c46f74d2dfc7b13b" + }, + "harness|hendrycksTest-professional_law|5": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "9178e10bd0763ec4", + "hash_cont_tokens": "2e590029ef41fbcd" + }, + "harness|hendrycksTest-professional_medicine|5": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "f5a22012a54f70ea", + "hash_cont_tokens": "fe35cfa9c6ca802e" + }, + "harness|hendrycksTest-professional_psychology|5": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "0dfb73a8eb3f692c", + "hash_cont_tokens": "f020fbddf72c8652" + }, + "harness|hendrycksTest-public_relations|5": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "1710c6ba4c9f3cbd", + "hash_cont_tokens": "568f585a259965c1" + }, + "harness|hendrycksTest-security_studies|5": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "d49711415961ced7", + "hash_cont_tokens": "cc6fd7cccd64cd5d" + }, + "harness|hendrycksTest-sociology|5": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "828999f7624cbe7e", + "hash_cont_tokens": "c3a3bdfd177eed5b" + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "42054621e718dbee", + "hash_cont_tokens": "2568d0e8e36fa959" + }, + "harness|hendrycksTest-virology|5": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "6c4f0aa4dc859c04", + "hash_cont_tokens": "926cf60b0891f374" + }, + "harness|hendrycksTest-world_religions|5": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "6c75d44e092ff24f", + "hash_cont_tokens": "c525a5de974c1ea3" + }, + "harness|truthfulqa:mc|0": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "2738d7ed7075faa7", + "hash_cont_tokens": "c014154380b74b9e" + } + } +} \ No newline at end of file diff --git a/eval-results/TheBloke/wizard-vicuna-13B-HF/results_2023-10-22T05-16-09.820423.json b/eval-results/TheBloke/wizard-vicuna-13B-HF/results_2023-10-22T05-16-09.820423.json new file mode 100644 index 0000000000000000000000000000000000000000..9fd93aa1653809ad50599903fc4efc83f07ef631 --- /dev/null +++ b/eval-results/TheBloke/wizard-vicuna-13B-HF/results_2023-10-22T05-16-09.820423.json @@ -0,0 +1,107 @@ +{ + "config_general": { + 
"model_name": "TheBloke/wizard-vicuna-13B-HF", + "model_sha": "12dc8aacb474522ae2a83c18cb0fdf0907987f8f", + "model_size": "24.28 GB", + "model_dtype": "torch.float16", + "lighteval_sha": "0f318ecf002208468154899217b3ba7c6ae09374", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|drop|3": { + "em": 0.0350251677852349, + "em_stderr": 0.0018827287598880416, + "f1": 0.10088821308724859, + "f1_stderr": 0.0023095858218995214 + }, + "harness|gsm8k|5": { + "acc": 0.0932524639878696, + "acc_stderr": 0.008009688838328585 + }, + "harness|winogrande|5": { + "acc": 0.7482241515390686, + "acc_stderr": 0.012198489100259785 + }, + "all": { + "em": 0.0350251677852349, + "em_stderr": 0.0018827287598880416, + "f1": 0.10088821308724859, + "f1_stderr": 0.0023095858218995214, + "acc": 0.4207383077634691, + "acc_stderr": 0.010104088969294184 + } + }, + "versions": { + "harness|drop|3": 1, + "harness|gsm8k|5": 0, + "harness|winogrande|5": 0, + "all": 0 + }, + "config_tasks": { + "harness|drop": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|drop|3": { + "hashes": { + "hash_examples": "1d27416e8324e9a3", + "hash_full_prompts": "a5513ff9a741b385", + "hash_input_tokens": "61b608e0b5ceed76", + "hash_cont_tokens": "255f1074d1931c24" + }, + "truncated": 1263, + "non-truncated": 8273, + "padded": 0, + "non-padded": 9536, + "effective_few_shots": 3.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "bda342e47b5099b2", + "hash_cont_tokens": "951df2bd14ff1a2d" + }, + "truncated": 0, + "non-truncated": 1319, + "padded": 0, + "non-padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "c0bedf98cb040854", + "hash_cont_tokens": "f08975ad6f2d5864" + }, + "truncated": 0, + "non-truncated": 2534, + "padded": 2432, + "non-padded": 102, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "9b4d8993161e637d", + "hash_full_prompts": "08215e527b7e60a5", + "hash_input_tokens": "80afe720f936f8d2", + "hash_cont_tokens": "3ed4c83ec13f00bb" + }, + "total_evaluation_time_secondes": "12678.325699567795", + "truncated": 1263, + "non-truncated": 12126, + "padded": 2432, + "non-padded": 10957, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/TheBloke/wizardLM-13B-1.0-fp16/results_2023-07-19T19-39-43.498686.json b/eval-results/TheBloke/wizardLM-13B-1.0-fp16/results_2023-07-19T19-39-43.498686.json new file mode 100644 index 0000000000000000000000000000000000000000..0707e4677f590ce47f3f80a0f70eb84af4b1cdaa --- /dev/null +++ b/eval-results/TheBloke/wizardLM-13B-1.0-fp16/results_2023-07-19T19-39-43.498686.json @@ -0,0 +1,871 @@ +{ + "results": { + "harness|arc:challenge|25": { + "acc": 0.5537542662116041, + "acc_stderr": 0.014526705548539982, + "acc_norm": 0.5725255972696246, + "acc_norm_stderr": 0.014456862944650652 + }, + "harness|hellaswag|10": { + "acc": 0.6105357498506274, + "acc_stderr": 0.00486632225833596, + "acc_norm": 0.8088030272854013, + "acc_norm_stderr": 0.0039244012588482875 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 
0.3, + "acc_stderr": 0.04605661864718381, + "acc_norm": 0.3, + "acc_norm_stderr": 0.04605661864718381 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.4962962962962963, + "acc_stderr": 0.043192236258113303, + "acc_norm": 0.4962962962962963, + "acc_norm_stderr": 0.043192236258113303 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.5394736842105263, + "acc_stderr": 0.04056242252249034, + "acc_norm": 0.5394736842105263, + "acc_norm_stderr": 0.04056242252249034 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.51, + "acc_stderr": 0.05024183937956912, + "acc_norm": 0.51, + "acc_norm_stderr": 0.05024183937956912 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.5358490566037736, + "acc_stderr": 0.030693675018458003, + "acc_norm": 0.5358490566037736, + "acc_norm_stderr": 0.030693675018458003 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.5347222222222222, + "acc_stderr": 0.04171115858181618, + "acc_norm": 0.5347222222222222, + "acc_norm_stderr": 0.04171115858181618 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.34, + "acc_stderr": 0.04760952285695236, + "acc_norm": 0.34, + "acc_norm_stderr": 0.04760952285695236 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.41, + "acc_stderr": 0.04943110704237102, + "acc_norm": 0.41, + "acc_norm_stderr": 0.04943110704237102 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.31, + "acc_stderr": 0.04648231987117316, + "acc_norm": 0.31, + "acc_norm_stderr": 0.04648231987117316 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.4682080924855491, + "acc_stderr": 0.03804749744364763, + "acc_norm": 0.4682080924855491, + "acc_norm_stderr": 0.03804749744364763 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.23529411764705882, + "acc_stderr": 0.04220773659171453, + "acc_norm": 0.23529411764705882, + "acc_norm_stderr": 0.04220773659171453 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.67, + "acc_stderr": 0.04725815626252607, + "acc_norm": 0.67, + "acc_norm_stderr": 0.04725815626252607 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.41702127659574467, + "acc_stderr": 0.032232762667117124, + "acc_norm": 0.41702127659574467, + "acc_norm_stderr": 0.032232762667117124 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.2982456140350877, + "acc_stderr": 0.04303684033537315, + "acc_norm": 0.2982456140350877, + "acc_norm_stderr": 0.04303684033537315 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.42758620689655175, + "acc_stderr": 0.041227371113703316, + "acc_norm": 0.42758620689655175, + "acc_norm_stderr": 0.041227371113703316 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.31746031746031744, + "acc_stderr": 0.023973861998992072, + "acc_norm": 0.31746031746031744, + "acc_norm_stderr": 0.023973861998992072 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.3968253968253968, + "acc_stderr": 0.04375888492727061, + "acc_norm": 0.3968253968253968, + "acc_norm_stderr": 0.04375888492727061 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.37, + "acc_stderr": 0.048523658709391, + "acc_norm": 0.37, + "acc_norm_stderr": 0.048523658709391 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.5967741935483871, + "acc_stderr": 0.02790615082604114, + "acc_norm": 0.5967741935483871, + "acc_norm_stderr": 0.02790615082604114 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.3793103448275862, + "acc_stderr": 
0.034139638059062345, + "acc_norm": 0.3793103448275862, + "acc_norm_stderr": 0.034139638059062345 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.56, + "acc_stderr": 0.04988876515698589, + "acc_norm": 0.56, + "acc_norm_stderr": 0.04988876515698589 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.6787878787878788, + "acc_stderr": 0.036462049632538115, + "acc_norm": 0.6787878787878788, + "acc_norm_stderr": 0.036462049632538115 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.702020202020202, + "acc_stderr": 0.03258630383836556, + "acc_norm": 0.702020202020202, + "acc_norm_stderr": 0.03258630383836556 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.7202072538860104, + "acc_stderr": 0.03239637046735704, + "acc_norm": 0.7202072538860104, + "acc_norm_stderr": 0.03239637046735704 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.47435897435897434, + "acc_stderr": 0.025317649726448663, + "acc_norm": 0.47435897435897434, + "acc_norm_stderr": 0.025317649726448663 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.2851851851851852, + "acc_stderr": 0.027528599210340492, + "acc_norm": 0.2851851851851852, + "acc_norm_stderr": 0.027528599210340492 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.5084033613445378, + "acc_stderr": 0.03247390276569669, + "acc_norm": 0.5084033613445378, + "acc_norm_stderr": 0.03247390276569669 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.3443708609271523, + "acc_stderr": 0.03879687024073327, + "acc_norm": 0.3443708609271523, + "acc_norm_stderr": 0.03879687024073327 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.6990825688073394, + "acc_stderr": 0.019664751366802114, + "acc_norm": 0.6990825688073394, + "acc_norm_stderr": 0.019664751366802114 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.3611111111111111, + "acc_stderr": 0.03275773486100999, + "acc_norm": 0.3611111111111111, + "acc_norm_stderr": 0.03275773486100999 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.6666666666666666, + "acc_stderr": 0.03308611113236436, + "acc_norm": 0.6666666666666666, + "acc_norm_stderr": 0.03308611113236436 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.7088607594936709, + "acc_stderr": 0.02957160106575337, + "acc_norm": 0.7088607594936709, + "acc_norm_stderr": 0.02957160106575337 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.5874439461883408, + "acc_stderr": 0.03304062175449297, + "acc_norm": 0.5874439461883408, + "acc_norm_stderr": 0.03304062175449297 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.6335877862595419, + "acc_stderr": 0.04225875451969637, + "acc_norm": 0.6335877862595419, + "acc_norm_stderr": 0.04225875451969637 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.6694214876033058, + "acc_stderr": 0.04294340845212093, + "acc_norm": 0.6694214876033058, + "acc_norm_stderr": 0.04294340845212093 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.6388888888888888, + "acc_stderr": 0.04643454608906276, + "acc_norm": 0.6388888888888888, + "acc_norm_stderr": 0.04643454608906276 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.6380368098159509, + "acc_stderr": 0.037757007291414416, + "acc_norm": 0.6380368098159509, + "acc_norm_stderr": 0.037757007291414416 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 
0.3482142857142857, + "acc_stderr": 0.045218299028335865, + "acc_norm": 0.3482142857142857, + "acc_norm_stderr": 0.045218299028335865 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.7572815533980582, + "acc_stderr": 0.04245022486384495, + "acc_norm": 0.7572815533980582, + "acc_norm_stderr": 0.04245022486384495 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.7777777777777778, + "acc_stderr": 0.02723601394619668, + "acc_norm": 0.7777777777777778, + "acc_norm_stderr": 0.02723601394619668 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.6, + "acc_stderr": 0.049236596391733084, + "acc_norm": 0.6, + "acc_norm_stderr": 0.049236596391733084 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.7241379310344828, + "acc_stderr": 0.015982814774695632, + "acc_norm": 0.7241379310344828, + "acc_norm_stderr": 0.015982814774695632 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.5809248554913294, + "acc_stderr": 0.02656417811142262, + "acc_norm": 0.5809248554913294, + "acc_norm_stderr": 0.02656417811142262 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.2670391061452514, + "acc_stderr": 0.014796502622562553, + "acc_norm": 0.2670391061452514, + "acc_norm_stderr": 0.014796502622562553 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.5915032679738562, + "acc_stderr": 0.028146405993096358, + "acc_norm": 0.5915032679738562, + "acc_norm_stderr": 0.028146405993096358 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.5852090032154341, + "acc_stderr": 0.02798268045975957, + "acc_norm": 0.5852090032154341, + "acc_norm_stderr": 0.02798268045975957 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.5679012345679012, + "acc_stderr": 0.02756301097160667, + "acc_norm": 0.5679012345679012, + "acc_norm_stderr": 0.02756301097160667 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.4148936170212766, + "acc_stderr": 0.029392236584612503, + "acc_norm": 0.4148936170212766, + "acc_norm_stderr": 0.029392236584612503 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.42242503259452413, + "acc_stderr": 0.012615600475734921, + "acc_norm": 0.42242503259452413, + "acc_norm_stderr": 0.012615600475734921 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.5551470588235294, + "acc_stderr": 0.030187532060329383, + "acc_norm": 0.5551470588235294, + "acc_norm_stderr": 0.030187532060329383 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.5392156862745098, + "acc_stderr": 0.02016552331390791, + "acc_norm": 0.5392156862745098, + "acc_norm_stderr": 0.02016552331390791 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.6, + "acc_stderr": 0.0469237132203465, + "acc_norm": 0.6, + "acc_norm_stderr": 0.0469237132203465 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.6408163265306123, + "acc_stderr": 0.030713560455108493, + "acc_norm": 0.6408163265306123, + "acc_norm_stderr": 0.030713560455108493 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.7412935323383084, + "acc_stderr": 0.03096590312357304, + "acc_norm": 0.7412935323383084, + "acc_norm_stderr": 0.03096590312357304 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.8, + "acc_stderr": 0.04020151261036847, + "acc_norm": 0.8, + "acc_norm_stderr": 0.04020151261036847 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.4578313253012048, + "acc_stderr": 0.038786267710023595, + "acc_norm": 0.4578313253012048, + "acc_norm_stderr": 0.038786267710023595 + }, + 
"harness|hendrycksTest-world_religions|5": { + "acc": 0.7543859649122807, + "acc_stderr": 0.0330140594698725, + "acc_norm": 0.7543859649122807, + "acc_norm_stderr": 0.0330140594698725 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.3561811505507956, + "mc1_stderr": 0.016763790728446335, + "mc2": 0.5054873045730662, + "mc2_stderr": 0.015581256865462065 + }, + "all": { + "acc": 0.5308438667148494, + "acc_stderr": 0.034725324926748416, + "acc_norm": 0.5345224871977782, + "acc_norm_stderr": 0.03470817639109796, + "mc1": 0.3561811505507956, + "mc1_stderr": 0.016763790728446335, + "mc2": 0.5054873045730662, + "mc2_stderr": 0.015581256865462065 + } + }, + "versions": { + "harness|arc:challenge|25": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + 
"harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "all": 0 + }, + "config": { + "model_name": "TheBloke/wizardLM-13B-1.0-fp16", + "model_sha": "b79733805e98e668ff9a459975c259881b1b8014", + "model_dtype": "torch.float16", + "lighteval_sha": "43cff840721bd0214adb4e29236a5e2ca1813937", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null + }, + "task_config": { + "harness|arc:challenge": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness 
task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task" + }, + "hashes": { + "harness|arc:challenge|25": { + "hash_examples": "fb8c51b1872daeda", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "61571bf68d6d89aa", + "hash_cont_tokens": "8210decc6ff6f7df" + }, + "harness|hellaswag|10": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "29906669b1c7054a", + "hash_cont_tokens": "b3b9e9017afa63af" + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "c54ff61ad0273dd7", + "hash_cont_tokens": "50421e30bef398f9" + }, + "harness|hendrycksTest-anatomy|5": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "be31a1e22aef5f90", + "hash_cont_tokens": "f11971a765cb609f" + }, + "harness|hendrycksTest-astronomy|5": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "277a7b1fad566940", + "hash_cont_tokens": "bf30e5d3f48250cb" + }, + "harness|hendrycksTest-business_ethics|5": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "ba552605bc116de5", + "hash_cont_tokens": "bc1dd9b2d995eb61" + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "428c7563d0b98ab9", + "hash_cont_tokens": "890a119624b3b935" + }, + "harness|hendrycksTest-college_biology|5": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "da036601573942e2", + "hash_cont_tokens": "875cde3af7a0ee14" + }, + "harness|hendrycksTest-college_chemistry|5": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "94e0196d6aded13d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "harness|hendrycksTest-college_computer_science|5": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "6e4d0f4a8d36690b", + "hash_cont_tokens": "ffc0fe414cdc4a83" + }, + "harness|hendrycksTest-college_mathematics|5": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "614054d17109a25d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "harness|hendrycksTest-college_medicine|5": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "1d633b3cc0524ba8", + "hash_cont_tokens": "1f88b00d41957d82" + }, + "harness|hendrycksTest-college_physics|5": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "5421d9a1af86cbd4", + "hash_cont_tokens": "f7b8097afc16a47c" + }, + 
"harness|hendrycksTest-computer_security|5": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "5e6b70ecb333cf18", + "hash_cont_tokens": "50421e30bef398f9" + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "c2ef11a87264ceed", + "hash_cont_tokens": "aa0e8bc655f2f641" + }, + "harness|hendrycksTest-econometrics|5": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "ecaccd912a4c3978", + "hash_cont_tokens": "bfb7e3c3c88313f1" + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "1590c84291399be8", + "hash_cont_tokens": "2425a3f084a591ef" + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "3269597f715b0da1", + "hash_cont_tokens": "f52691aef15a407b" + }, + "harness|hendrycksTest-formal_logic|5": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "a2800d20f3ab8d7c", + "hash_cont_tokens": "f515d598d9c21263" + }, + "harness|hendrycksTest-global_facts|5": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "94ed44b3772505ad", + "hash_cont_tokens": "50421e30bef398f9" + }, + "harness|hendrycksTest-high_school_biology|5": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "24423acb928db768", + "hash_cont_tokens": "bd85a4156a3613ee" + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "831ff35c474e5cef", + "hash_cont_tokens": "a95c97af1c14e068" + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "8c34e0f2bda77358", + "hash_cont_tokens": "8abfedef914e33c9" + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "f1f73dd687da18d7", + "hash_cont_tokens": "674fc454bdc5ac93" + }, + "harness|hendrycksTest-high_school_geography|5": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "7c5547c7da5bc793", + "hash_cont_tokens": "03a5012b916274ea" + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "f62991cb6a496b05", + "hash_cont_tokens": "a83effb8f76b7d7c" + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "4cef2aff6e3d59ed", + "hash_cont_tokens": "c583432ad27fcfe0" + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "6e2577ea4082ed2b", + "hash_cont_tokens": "24f5dc613660300b" + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "c5fc9aeb1079c8e4", + 
"hash_cont_tokens": "f47f041de50333b9" + }, + "harness|hendrycksTest-high_school_physics|5": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "555fc385cffa84ca", + "hash_cont_tokens": "ba2efcd283e938cc" + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "febd23cbf9973b7f", + "hash_cont_tokens": "942069cd363844d9" + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "424b02981230ee83", + "hash_cont_tokens": "955ed42b6f7fa019" + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "50c9ff438c85a69e", + "hash_cont_tokens": "cdd0b3dc06d933e5" + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "054824cc474caef5", + "hash_cont_tokens": "9a864184946033ac" + }, + "harness|hendrycksTest-human_aging|5": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "541a75f071dcf579", + "hash_cont_tokens": "142a4a8a1138a214" + }, + "harness|hendrycksTest-human_sexuality|5": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "04269e5c5a257dd9", + "hash_cont_tokens": "bc54813e809b796d" + }, + "harness|hendrycksTest-international_law|5": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "d93ba9d9d38e4397", + "hash_cont_tokens": "dc45b45fcda18e5d" + }, + "harness|hendrycksTest-jurisprudence|5": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "9eeaccd2698b4f5a", + "hash_cont_tokens": "e3a8cd951b6e3469" + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "b4f08f544f2b7576", + "hash_cont_tokens": "1e80dbd30f6453d5" + }, + "harness|hendrycksTest-machine_learning|5": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "900c2a51f1174b9f", + "hash_cont_tokens": "9b37da7777378ca9" + }, + "harness|hendrycksTest-management|5": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "6b36efb4689c6eca", + "hash_cont_tokens": "a01d6d39a83c4597" + }, + "harness|hendrycksTest-marketing|5": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "2aaac78a0cfed47a", + "hash_cont_tokens": "6aeaed4d823c98aa" + }, + "harness|hendrycksTest-medical_genetics|5": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "886ca823b41c094a", + "hash_cont_tokens": "50421e30bef398f9" + }, + "harness|hendrycksTest-miscellaneous|5": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "72fd71de7675e7d0", + "hash_cont_tokens": "9b0ab02a64603081" + }, + "harness|hendrycksTest-moral_disputes|5": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "f3ca0dd8e7a1eb09", + "hash_cont_tokens": "8badf768f7b0467a" + }, + 
"harness|hendrycksTest-moral_scenarios|5": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "3e793631e951f23c", + "hash_cont_tokens": "32ae620376b2bbba" + }, + "harness|hendrycksTest-nutrition|5": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "59753c2144ea93af", + "hash_cont_tokens": "3071def75bacc404" + }, + "harness|hendrycksTest-philosophy|5": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "bd8d3dbed15a8c34", + "hash_cont_tokens": "9f6ff69d23a48783" + }, + "harness|hendrycksTest-prehistory|5": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "3573cd87facbb7c5", + "hash_cont_tokens": "de469d2b981e32a3" + }, + "harness|hendrycksTest-professional_accounting|5": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "17e721bc1a7cbb47", + "hash_cont_tokens": "c46f74d2dfc7b13b" + }, + "harness|hendrycksTest-professional_law|5": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "9178e10bd0763ec4", + "hash_cont_tokens": "2e590029ef41fbcd" + }, + "harness|hendrycksTest-professional_medicine|5": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "f5a22012a54f70ea", + "hash_cont_tokens": "fe35cfa9c6ca802e" + }, + "harness|hendrycksTest-professional_psychology|5": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "0dfb73a8eb3f692c", + "hash_cont_tokens": "f020fbddf72c8652" + }, + "harness|hendrycksTest-public_relations|5": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "1710c6ba4c9f3cbd", + "hash_cont_tokens": "568f585a259965c1" + }, + "harness|hendrycksTest-security_studies|5": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "d49711415961ced7", + "hash_cont_tokens": "cc6fd7cccd64cd5d" + }, + "harness|hendrycksTest-sociology|5": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "828999f7624cbe7e", + "hash_cont_tokens": "c3a3bdfd177eed5b" + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "42054621e718dbee", + "hash_cont_tokens": "2568d0e8e36fa959" + }, + "harness|hendrycksTest-virology|5": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "6c4f0aa4dc859c04", + "hash_cont_tokens": "926cf60b0891f374" + }, + "harness|hendrycksTest-world_religions|5": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "6c75d44e092ff24f", + "hash_cont_tokens": "c525a5de974c1ea3" + }, + "harness|truthfulqa:mc|0": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "2738d7ed7075faa7", + "hash_cont_tokens": "c014154380b74b9e" + } + } +} \ No newline at end of file diff --git a/eval-results/TheBloke/wizardLM-13B-1.0-fp16/results_2023-10-22T22-13-20.355454.json b/eval-results/TheBloke/wizardLM-13B-1.0-fp16/results_2023-10-22T22-13-20.355454.json new file mode 100644 index 
0000000000000000000000000000000000000000..514636f4e99917f5d3a05c5ccd1b978324f74b21 --- /dev/null +++ b/eval-results/TheBloke/wizardLM-13B-1.0-fp16/results_2023-10-22T22-13-20.355454.json @@ -0,0 +1,107 @@ +{ + "config_general": { + "model_name": "TheBloke/wizardLM-13B-1.0-fp16", + "model_sha": "b79733805e98e668ff9a459975c259881b1b8014", + "model_size": "24.28 GB", + "model_dtype": "torch.float16", + "lighteval_sha": "0f318ecf002208468154899217b3ba7c6ae09374", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|drop|3": { + "em": 0.06638003355704698, + "em_stderr": 0.0025494321051837475, + "f1": 0.14066380033557005, + "f1_stderr": 0.00288504029268502 + }, + "harness|gsm8k|5": { + "acc": 0.13874147081122062, + "acc_stderr": 0.009521649920798148 + }, + "harness|winogrande|5": { + "acc": 0.7411207576953434, + "acc_stderr": 0.01231051581099338 + }, + "all": { + "em": 0.06638003355704698, + "em_stderr": 0.0025494321051837475, + "f1": 0.14066380033557005, + "f1_stderr": 0.00288504029268502, + "acc": 0.439931114253282, + "acc_stderr": 0.010916082865895764 + } + }, + "versions": { + "harness|drop|3": 1, + "harness|gsm8k|5": 0, + "harness|winogrande|5": 0, + "all": 0 + }, + "config_tasks": { + "harness|drop": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|drop|3": { + "hashes": { + "hash_examples": "1d27416e8324e9a3", + "hash_full_prompts": "a5513ff9a741b385", + "hash_input_tokens": "61b608e0b5ceed76", + "hash_cont_tokens": "0835c8fc5ecb12d2" + }, + "truncated": 1263, + "non-truncated": 8273, + "padded": 0, + "non-padded": 9536, + "effective_few_shots": 3.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "bda342e47b5099b2", + "hash_cont_tokens": "b83ab79814401262" + }, + "truncated": 0, + "non-truncated": 1319, + "padded": 0, + "non-padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "c0bedf98cb040854", + "hash_cont_tokens": "f08975ad6f2d5864" + }, + "truncated": 0, + "non-truncated": 2534, + "padded": 2432, + "non-padded": 102, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "9b4d8993161e637d", + "hash_full_prompts": "08215e527b7e60a5", + "hash_input_tokens": "80afe720f936f8d2", + "hash_cont_tokens": "666509af0772f58b" + }, + "total_evaluation_time_secondes": "12140.85950922966", + "truncated": 1263, + "non-truncated": 12126, + "padded": 2432, + "non-padded": 10957, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/TheBloke/wizardLM-7B-HF/results_2023-07-18T11-33-18.439367.json b/eval-results/TheBloke/wizardLM-7B-HF/results_2023-07-18T11-33-18.439367.json new file mode 100644 index 0000000000000000000000000000000000000000..345b0220ff449893099c566a1262d143f697bb6d --- /dev/null +++ b/eval-results/TheBloke/wizardLM-7B-HF/results_2023-07-18T11-33-18.439367.json @@ -0,0 +1,871 @@ +{ + "results": { + "harness|arc:challenge|25": { + "acc": 0.48464163822525597, + "acc_stderr": 0.014604496129394913, + "acc_norm": 0.5034129692832765, + "acc_norm_stderr": 0.014611050403244081 + }, + 
"harness|hellaswag|10": { + "acc": 0.5685122485560645, + "acc_stderr": 0.004942716091996078, + "acc_norm": 0.7527384983071101, + "acc_norm_stderr": 0.004305383398710189 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.35, + "acc_stderr": 0.0479372485441102, + "acc_norm": 0.35, + "acc_norm_stderr": 0.0479372485441102 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.43703703703703706, + "acc_stderr": 0.042849586397534, + "acc_norm": 0.43703703703703706, + "acc_norm_stderr": 0.042849586397534 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.40131578947368424, + "acc_stderr": 0.03988903703336284, + "acc_norm": 0.40131578947368424, + "acc_norm_stderr": 0.03988903703336284 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.48, + "acc_stderr": 0.050211673156867795, + "acc_norm": 0.48, + "acc_norm_stderr": 0.050211673156867795 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.4377358490566038, + "acc_stderr": 0.03053333843046751, + "acc_norm": 0.4377358490566038, + "acc_norm_stderr": 0.03053333843046751 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.3680555555555556, + "acc_stderr": 0.04032999053960719, + "acc_norm": 0.3680555555555556, + "acc_norm_stderr": 0.04032999053960719 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.24, + "acc_stderr": 0.04292346959909283, + "acc_norm": 0.24, + "acc_norm_stderr": 0.04292346959909283 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.29, + "acc_stderr": 0.045604802157206845, + "acc_norm": 0.29, + "acc_norm_stderr": 0.045604802157206845 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.23, + "acc_stderr": 0.04229525846816506, + "acc_norm": 0.23, + "acc_norm_stderr": 0.04229525846816506 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.35260115606936415, + "acc_stderr": 0.036430371689585475, + "acc_norm": 0.35260115606936415, + "acc_norm_stderr": 0.036430371689585475 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.22549019607843138, + "acc_stderr": 0.041583075330832865, + "acc_norm": 0.22549019607843138, + "acc_norm_stderr": 0.041583075330832865 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.49, + "acc_stderr": 0.05024183937956911, + "acc_norm": 0.49, + "acc_norm_stderr": 0.05024183937956911 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.4, + "acc_stderr": 0.03202563076101735, + "acc_norm": 0.4, + "acc_norm_stderr": 0.03202563076101735 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.2631578947368421, + "acc_stderr": 0.04142439719489362, + "acc_norm": 0.2631578947368421, + "acc_norm_stderr": 0.04142439719489362 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.32413793103448274, + "acc_stderr": 0.03900432069185555, + "acc_norm": 0.32413793103448274, + "acc_norm_stderr": 0.03900432069185555 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.30687830687830686, + "acc_stderr": 0.02375292871211214, + "acc_norm": 0.30687830687830686, + "acc_norm_stderr": 0.02375292871211214 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.25396825396825395, + "acc_stderr": 0.03893259610604675, + "acc_norm": 0.25396825396825395, + "acc_norm_stderr": 0.03893259610604675 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.32, + "acc_stderr": 0.046882617226215034, + "acc_norm": 0.32, + "acc_norm_stderr": 0.046882617226215034 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.36129032258064514, + "acc_stderr": 
0.02732754844795754, + "acc_norm": 0.36129032258064514, + "acc_norm_stderr": 0.02732754844795754 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.30049261083743845, + "acc_stderr": 0.03225799476233484, + "acc_norm": 0.30049261083743845, + "acc_norm_stderr": 0.03225799476233484 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.34, + "acc_stderr": 0.04760952285695235, + "acc_norm": 0.34, + "acc_norm_stderr": 0.04760952285695235 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.45454545454545453, + "acc_stderr": 0.03888176921674099, + "acc_norm": 0.45454545454545453, + "acc_norm_stderr": 0.03888176921674099 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.42424242424242425, + "acc_stderr": 0.03521224908841583, + "acc_norm": 0.42424242424242425, + "acc_norm_stderr": 0.03521224908841583 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.46632124352331605, + "acc_stderr": 0.03600244069867178, + "acc_norm": 0.46632124352331605, + "acc_norm_stderr": 0.03600244069867178 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.35384615384615387, + "acc_stderr": 0.024243783994062164, + "acc_norm": 0.35384615384615387, + "acc_norm_stderr": 0.024243783994062164 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.24444444444444444, + "acc_stderr": 0.026202766534652148, + "acc_norm": 0.24444444444444444, + "acc_norm_stderr": 0.026202766534652148 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.3277310924369748, + "acc_stderr": 0.030489911417673227, + "acc_norm": 0.3277310924369748, + "acc_norm_stderr": 0.030489911417673227 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.2781456953642384, + "acc_stderr": 0.03658603262763743, + "acc_norm": 0.2781456953642384, + "acc_norm_stderr": 0.03658603262763743 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.46605504587155966, + "acc_stderr": 0.021387863350353992, + "acc_norm": 0.46605504587155966, + "acc_norm_stderr": 0.021387863350353992 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.2175925925925926, + "acc_stderr": 0.028139689444859672, + "acc_norm": 0.2175925925925926, + "acc_norm_stderr": 0.028139689444859672 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.45588235294117646, + "acc_stderr": 0.03495624522015474, + "acc_norm": 0.45588235294117646, + "acc_norm_stderr": 0.03495624522015474 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.43037974683544306, + "acc_stderr": 0.03223017195937597, + "acc_norm": 0.43037974683544306, + "acc_norm_stderr": 0.03223017195937597 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.5112107623318386, + "acc_stderr": 0.033549366530984746, + "acc_norm": 0.5112107623318386, + "acc_norm_stderr": 0.033549366530984746 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.3893129770992366, + "acc_stderr": 0.04276486542814591, + "acc_norm": 0.3893129770992366, + "acc_norm_stderr": 0.04276486542814591 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.5785123966942148, + "acc_stderr": 0.04507732278775087, + "acc_norm": 0.5785123966942148, + "acc_norm_stderr": 0.04507732278775087 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.4537037037037037, + "acc_stderr": 0.04812917324536821, + "acc_norm": 0.4537037037037037, + "acc_norm_stderr": 0.04812917324536821 + }, + "harness|hendrycksTest-logical_fallacies|5": { + 
"acc": 0.3987730061349693, + "acc_stderr": 0.038470214204560246, + "acc_norm": 0.3987730061349693, + "acc_norm_stderr": 0.038470214204560246 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.3125, + "acc_stderr": 0.043994650575715215, + "acc_norm": 0.3125, + "acc_norm_stderr": 0.043994650575715215 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.3592233009708738, + "acc_stderr": 0.047504583990416946, + "acc_norm": 0.3592233009708738, + "acc_norm_stderr": 0.047504583990416946 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.5170940170940171, + "acc_stderr": 0.032736940493481824, + "acc_norm": 0.5170940170940171, + "acc_norm_stderr": 0.032736940493481824 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.42, + "acc_stderr": 0.04960449637488584, + "acc_norm": 0.42, + "acc_norm_stderr": 0.04960449637488584 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.545338441890166, + "acc_stderr": 0.017806304585052602, + "acc_norm": 0.545338441890166, + "acc_norm_stderr": 0.017806304585052602 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.38439306358381503, + "acc_stderr": 0.026189666966272035, + "acc_norm": 0.38439306358381503, + "acc_norm_stderr": 0.026189666966272035 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.23016759776536314, + "acc_stderr": 0.014078339253425819, + "acc_norm": 0.23016759776536314, + "acc_norm_stderr": 0.014078339253425819 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.4084967320261438, + "acc_stderr": 0.028146405993096358, + "acc_norm": 0.4084967320261438, + "acc_norm_stderr": 0.028146405993096358 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.3858520900321543, + "acc_stderr": 0.027648149599751457, + "acc_norm": 0.3858520900321543, + "acc_norm_stderr": 0.027648149599751457 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.39814814814814814, + "acc_stderr": 0.027237415094592477, + "acc_norm": 0.39814814814814814, + "acc_norm_stderr": 0.027237415094592477 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.3191489361702128, + "acc_stderr": 0.027807990141320193, + "acc_norm": 0.3191489361702128, + "acc_norm_stderr": 0.027807990141320193 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.3220338983050847, + "acc_stderr": 0.01193393607189109, + "acc_norm": 0.3220338983050847, + "acc_norm_stderr": 0.01193393607189109 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.3860294117647059, + "acc_stderr": 0.029573269134411124, + "acc_norm": 0.3860294117647059, + "acc_norm_stderr": 0.029573269134411124 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.40032679738562094, + "acc_stderr": 0.019821843688271765, + "acc_norm": 0.40032679738562094, + "acc_norm_stderr": 0.019821843688271765 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.41818181818181815, + "acc_stderr": 0.04724577405731571, + "acc_norm": 0.41818181818181815, + "acc_norm_stderr": 0.04724577405731571 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.3142857142857143, + "acc_stderr": 0.029719329422417482, + "acc_norm": 0.3142857142857143, + "acc_norm_stderr": 0.029719329422417482 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.47761194029850745, + "acc_stderr": 0.035319879302087305, + "acc_norm": 0.47761194029850745, + "acc_norm_stderr": 0.035319879302087305 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.55, + "acc_stderr": 0.05, + "acc_norm": 0.55, + "acc_norm_stderr": 0.05 + }, + 
"harness|hendrycksTest-virology|5": { + "acc": 0.39156626506024095, + "acc_stderr": 0.03799857454479637, + "acc_norm": 0.39156626506024095, + "acc_norm_stderr": 0.03799857454479637 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.5380116959064327, + "acc_stderr": 0.038237270928823064, + "acc_norm": 0.5380116959064327, + "acc_norm_stderr": 0.038237270928823064 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.31456548347613217, + "mc1_stderr": 0.01625524199317919, + "mc2": 0.45584096136441793, + "mc2_stderr": 0.016028055350830416 + }, + "all": { + "acc": 0.38566819917906325, + "acc_stderr": 0.03482242619787474, + "acc_norm": 0.3891088361419288, + "acc_norm_stderr": 0.03481173503822327, + "mc1": 0.31456548347613217, + "mc1_stderr": 0.01625524199317919, + "mc2": 0.45584096136441793, + "mc2_stderr": 0.016028055350830416 + } + }, + "versions": { + "harness|arc:challenge|25": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + 
"harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "all": 0 + }, + "config": { + "model_name": "TheBloke/wizardLM-7B-HF", + "model_sha": "a8e22531a48cece989e670f539eb18ebd2dbd0cf", + "model_dtype": "torch.float16", + "lighteval_sha": "43cff840721bd0214adb4e29236a5e2ca1813937", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null + }, + "task_config": { + "harness|arc:challenge": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + 
"harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task" + }, + "hashes": { + "harness|arc:challenge|25": { + "hash_examples": "fb8c51b1872daeda", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "61571bf68d6d89aa", + "hash_cont_tokens": "8210decc6ff6f7df" + }, + "harness|hellaswag|10": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "29906669b1c7054a", + "hash_cont_tokens": "b3b9e9017afa63af" + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "c54ff61ad0273dd7", + "hash_cont_tokens": "50421e30bef398f9" + }, + "harness|hendrycksTest-anatomy|5": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "be31a1e22aef5f90", + "hash_cont_tokens": "f11971a765cb609f" + }, + "harness|hendrycksTest-astronomy|5": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "277a7b1fad566940", + "hash_cont_tokens": "bf30e5d3f48250cb" + }, + "harness|hendrycksTest-business_ethics|5": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "ba552605bc116de5", + "hash_cont_tokens": "bc1dd9b2d995eb61" + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "428c7563d0b98ab9", + "hash_cont_tokens": "890a119624b3b935" + }, + "harness|hendrycksTest-college_biology|5": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "da036601573942e2", + "hash_cont_tokens": "875cde3af7a0ee14" + }, + "harness|hendrycksTest-college_chemistry|5": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "94e0196d6aded13d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "harness|hendrycksTest-college_computer_science|5": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "6e4d0f4a8d36690b", + "hash_cont_tokens": "ffc0fe414cdc4a83" + }, + "harness|hendrycksTest-college_mathematics|5": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "614054d17109a25d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "harness|hendrycksTest-college_medicine|5": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "1d633b3cc0524ba8", + "hash_cont_tokens": "1f88b00d41957d82" + }, + 
"harness|hendrycksTest-college_physics|5": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "5421d9a1af86cbd4", + "hash_cont_tokens": "f7b8097afc16a47c" + }, + "harness|hendrycksTest-computer_security|5": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "5e6b70ecb333cf18", + "hash_cont_tokens": "50421e30bef398f9" + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "c2ef11a87264ceed", + "hash_cont_tokens": "aa0e8bc655f2f641" + }, + "harness|hendrycksTest-econometrics|5": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "ecaccd912a4c3978", + "hash_cont_tokens": "bfb7e3c3c88313f1" + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "1590c84291399be8", + "hash_cont_tokens": "2425a3f084a591ef" + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "3269597f715b0da1", + "hash_cont_tokens": "f52691aef15a407b" + }, + "harness|hendrycksTest-formal_logic|5": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "a2800d20f3ab8d7c", + "hash_cont_tokens": "f515d598d9c21263" + }, + "harness|hendrycksTest-global_facts|5": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "94ed44b3772505ad", + "hash_cont_tokens": "50421e30bef398f9" + }, + "harness|hendrycksTest-high_school_biology|5": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "24423acb928db768", + "hash_cont_tokens": "bd85a4156a3613ee" + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "831ff35c474e5cef", + "hash_cont_tokens": "a95c97af1c14e068" + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "8c34e0f2bda77358", + "hash_cont_tokens": "8abfedef914e33c9" + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "f1f73dd687da18d7", + "hash_cont_tokens": "674fc454bdc5ac93" + }, + "harness|hendrycksTest-high_school_geography|5": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "7c5547c7da5bc793", + "hash_cont_tokens": "03a5012b916274ea" + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "f62991cb6a496b05", + "hash_cont_tokens": "a83effb8f76b7d7c" + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "4cef2aff6e3d59ed", + "hash_cont_tokens": "c583432ad27fcfe0" + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "6e2577ea4082ed2b", + "hash_cont_tokens": 
"24f5dc613660300b" + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "c5fc9aeb1079c8e4", + "hash_cont_tokens": "f47f041de50333b9" + }, + "harness|hendrycksTest-high_school_physics|5": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "555fc385cffa84ca", + "hash_cont_tokens": "ba2efcd283e938cc" + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "febd23cbf9973b7f", + "hash_cont_tokens": "942069cd363844d9" + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "424b02981230ee83", + "hash_cont_tokens": "955ed42b6f7fa019" + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "50c9ff438c85a69e", + "hash_cont_tokens": "cdd0b3dc06d933e5" + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "054824cc474caef5", + "hash_cont_tokens": "9a864184946033ac" + }, + "harness|hendrycksTest-human_aging|5": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "541a75f071dcf579", + "hash_cont_tokens": "142a4a8a1138a214" + }, + "harness|hendrycksTest-human_sexuality|5": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "04269e5c5a257dd9", + "hash_cont_tokens": "bc54813e809b796d" + }, + "harness|hendrycksTest-international_law|5": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "d93ba9d9d38e4397", + "hash_cont_tokens": "dc45b45fcda18e5d" + }, + "harness|hendrycksTest-jurisprudence|5": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "9eeaccd2698b4f5a", + "hash_cont_tokens": "e3a8cd951b6e3469" + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "b4f08f544f2b7576", + "hash_cont_tokens": "1e80dbd30f6453d5" + }, + "harness|hendrycksTest-machine_learning|5": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "900c2a51f1174b9f", + "hash_cont_tokens": "9b37da7777378ca9" + }, + "harness|hendrycksTest-management|5": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "6b36efb4689c6eca", + "hash_cont_tokens": "a01d6d39a83c4597" + }, + "harness|hendrycksTest-marketing|5": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "2aaac78a0cfed47a", + "hash_cont_tokens": "6aeaed4d823c98aa" + }, + "harness|hendrycksTest-medical_genetics|5": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "886ca823b41c094a", + "hash_cont_tokens": "50421e30bef398f9" + }, + "harness|hendrycksTest-miscellaneous|5": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "72fd71de7675e7d0", + "hash_cont_tokens": "9b0ab02a64603081" + }, + 
"harness|hendrycksTest-moral_disputes|5": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "f3ca0dd8e7a1eb09", + "hash_cont_tokens": "8badf768f7b0467a" + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "3e793631e951f23c", + "hash_cont_tokens": "32ae620376b2bbba" + }, + "harness|hendrycksTest-nutrition|5": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "59753c2144ea93af", + "hash_cont_tokens": "3071def75bacc404" + }, + "harness|hendrycksTest-philosophy|5": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "bd8d3dbed15a8c34", + "hash_cont_tokens": "9f6ff69d23a48783" + }, + "harness|hendrycksTest-prehistory|5": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "3573cd87facbb7c5", + "hash_cont_tokens": "de469d2b981e32a3" + }, + "harness|hendrycksTest-professional_accounting|5": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "17e721bc1a7cbb47", + "hash_cont_tokens": "c46f74d2dfc7b13b" + }, + "harness|hendrycksTest-professional_law|5": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "9178e10bd0763ec4", + "hash_cont_tokens": "2e590029ef41fbcd" + }, + "harness|hendrycksTest-professional_medicine|5": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "f5a22012a54f70ea", + "hash_cont_tokens": "fe35cfa9c6ca802e" + }, + "harness|hendrycksTest-professional_psychology|5": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "0dfb73a8eb3f692c", + "hash_cont_tokens": "f020fbddf72c8652" + }, + "harness|hendrycksTest-public_relations|5": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "1710c6ba4c9f3cbd", + "hash_cont_tokens": "568f585a259965c1" + }, + "harness|hendrycksTest-security_studies|5": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "d49711415961ced7", + "hash_cont_tokens": "cc6fd7cccd64cd5d" + }, + "harness|hendrycksTest-sociology|5": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "828999f7624cbe7e", + "hash_cont_tokens": "c3a3bdfd177eed5b" + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "42054621e718dbee", + "hash_cont_tokens": "2568d0e8e36fa959" + }, + "harness|hendrycksTest-virology|5": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "6c4f0aa4dc859c04", + "hash_cont_tokens": "926cf60b0891f374" + }, + "harness|hendrycksTest-world_religions|5": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "6c75d44e092ff24f", + "hash_cont_tokens": "c525a5de974c1ea3" + }, + "harness|truthfulqa:mc|0": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "2738d7ed7075faa7", + "hash_cont_tokens": "c014154380b74b9e" + } + } +} \ No newline at end of file diff --git 
a/eval-results/TinyPixel/elm-test/results_2023-09-22T05-13-08.764414.json b/eval-results/TinyPixel/elm-test/results_2023-09-22T05-13-08.764414.json new file mode 100644 index 0000000000000000000000000000000000000000..26b1d51facc4aea44db3f98c50b0408e1946fd9b --- /dev/null +++ b/eval-results/TinyPixel/elm-test/results_2023-09-22T05-13-08.764414.json @@ -0,0 +1,1367 @@ +{ + "config_general": { + "model_name": "TinyPixel/elm-test", + "model_sha": "aa8f81624d897aa493474bcd96dc3feae9f7a535", + "model_size": "12.58 GB", + "model_dtype": "torch.bfloat16", + "lighteval_sha": "0f318ecf002208468154899217b3ba7c6ae09374", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|arc:challenge|25": { + "acc": 0.5, + "acc_stderr": 0.014611390804670088, + "acc_norm": 0.5315699658703071, + "acc_norm_stderr": 0.014582236460866977 + }, + "harness|hellaswag|10": { + "acc": 0.5914160525791675, + "acc_stderr": 0.004905674408614026, + "acc_norm": 0.7897829117705636, + "acc_norm_stderr": 0.004066299761478493 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.3, + "acc_stderr": 0.046056618647183814, + "acc_norm": 0.3, + "acc_norm_stderr": 0.046056618647183814 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.4888888888888889, + "acc_stderr": 0.04318275491977976, + "acc_norm": 0.4888888888888889, + "acc_norm_stderr": 0.04318275491977976 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.4144736842105263, + "acc_stderr": 0.04008973785779206, + "acc_norm": 0.4144736842105263, + "acc_norm_stderr": 0.04008973785779206 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.5, + "acc_stderr": 0.050251890762960605, + "acc_norm": 0.5, + "acc_norm_stderr": 0.050251890762960605 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.46037735849056605, + "acc_stderr": 0.030676096599389184, + "acc_norm": 0.46037735849056605, + "acc_norm_stderr": 0.030676096599389184 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.4722222222222222, + "acc_stderr": 0.04174752578923185, + "acc_norm": 0.4722222222222222, + "acc_norm_stderr": 0.04174752578923185 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.35, + "acc_stderr": 0.047937248544110196, + "acc_norm": 0.35, + "acc_norm_stderr": 0.047937248544110196 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.35, + "acc_stderr": 0.047937248544110196, + "acc_norm": 0.35, + "acc_norm_stderr": 0.047937248544110196 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.31, + "acc_stderr": 0.04648231987117316, + "acc_norm": 0.31, + "acc_norm_stderr": 0.04648231987117316 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.43352601156069365, + "acc_stderr": 0.03778621079092055, + "acc_norm": 0.43352601156069365, + "acc_norm_stderr": 0.03778621079092055 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.24509803921568626, + "acc_stderr": 0.04280105837364395, + "acc_norm": 0.24509803921568626, + "acc_norm_stderr": 0.04280105837364395 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.62, + "acc_stderr": 0.048783173121456316, + "acc_norm": 0.62, + "acc_norm_stderr": 0.048783173121456316 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.4340425531914894, + "acc_stderr": 0.03240038086792747, + "acc_norm": 0.4340425531914894, + "acc_norm_stderr": 0.03240038086792747 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.30701754385964913, + "acc_stderr": 
0.04339138322579861, + "acc_norm": 0.30701754385964913, + "acc_norm_stderr": 0.04339138322579861 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.4827586206896552, + "acc_stderr": 0.04164188720169377, + "acc_norm": 0.4827586206896552, + "acc_norm_stderr": 0.04164188720169377 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.2830687830687831, + "acc_stderr": 0.023201392938194974, + "acc_norm": 0.2830687830687831, + "acc_norm_stderr": 0.023201392938194974 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.30158730158730157, + "acc_stderr": 0.04104947269903394, + "acc_norm": 0.30158730158730157, + "acc_norm_stderr": 0.04104947269903394 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.34, + "acc_stderr": 0.04760952285695236, + "acc_norm": 0.34, + "acc_norm_stderr": 0.04760952285695236 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.49032258064516127, + "acc_stderr": 0.028438677998909558, + "acc_norm": 0.49032258064516127, + "acc_norm_stderr": 0.028438677998909558 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.3694581280788177, + "acc_stderr": 0.033959703819985726, + "acc_norm": 0.3694581280788177, + "acc_norm_stderr": 0.033959703819985726 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.42, + "acc_stderr": 0.049604496374885836, + "acc_norm": 0.42, + "acc_norm_stderr": 0.049604496374885836 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.6242424242424243, + "acc_stderr": 0.03781887353205982, + "acc_norm": 0.6242424242424243, + "acc_norm_stderr": 0.03781887353205982 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.48484848484848486, + "acc_stderr": 0.0356071651653106, + "acc_norm": 0.48484848484848486, + "acc_norm_stderr": 0.0356071651653106 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.6994818652849741, + "acc_stderr": 0.0330881859441575, + "acc_norm": 0.6994818652849741, + "acc_norm_stderr": 0.0330881859441575 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.46153846153846156, + "acc_stderr": 0.025275892070240634, + "acc_norm": 0.46153846153846156, + "acc_norm_stderr": 0.025275892070240634 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.29259259259259257, + "acc_stderr": 0.02773896963217609, + "acc_norm": 0.29259259259259257, + "acc_norm_stderr": 0.02773896963217609 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.42436974789915966, + "acc_stderr": 0.032104790510157764, + "acc_norm": 0.42436974789915966, + "acc_norm_stderr": 0.032104790510157764 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.31788079470198677, + "acc_stderr": 0.03802039760107903, + "acc_norm": 0.31788079470198677, + "acc_norm_stderr": 0.03802039760107903 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.636697247706422, + "acc_stderr": 0.020620603919625804, + "acc_norm": 0.636697247706422, + "acc_norm_stderr": 0.020620603919625804 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.2777777777777778, + "acc_stderr": 0.03054674526495318, + "acc_norm": 0.2777777777777778, + "acc_norm_stderr": 0.03054674526495318 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.5441176470588235, + "acc_stderr": 0.03495624522015476, + "acc_norm": 0.5441176470588235, + "acc_norm_stderr": 0.03495624522015476 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 
0.6286919831223629, + "acc_stderr": 0.031450686007448596, + "acc_norm": 0.6286919831223629, + "acc_norm_stderr": 0.031450686007448596 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.5695067264573991, + "acc_stderr": 0.033231973029429394, + "acc_norm": 0.5695067264573991, + "acc_norm_stderr": 0.033231973029429394 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.5572519083969466, + "acc_stderr": 0.043564472026650695, + "acc_norm": 0.5572519083969466, + "acc_norm_stderr": 0.043564472026650695 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.6446280991735537, + "acc_stderr": 0.0436923632657398, + "acc_norm": 0.6446280991735537, + "acc_norm_stderr": 0.0436923632657398 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.5370370370370371, + "acc_stderr": 0.04820403072760628, + "acc_norm": 0.5370370370370371, + "acc_norm_stderr": 0.04820403072760628 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.50920245398773, + "acc_stderr": 0.03927705600787443, + "acc_norm": 0.50920245398773, + "acc_norm_stderr": 0.03927705600787443 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.38392857142857145, + "acc_stderr": 0.04616143075028547, + "acc_norm": 0.38392857142857145, + "acc_norm_stderr": 0.04616143075028547 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.5533980582524272, + "acc_stderr": 0.04922424153458933, + "acc_norm": 0.5533980582524272, + "acc_norm_stderr": 0.04922424153458933 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.6923076923076923, + "acc_stderr": 0.03023638994217309, + "acc_norm": 0.6923076923076923, + "acc_norm_stderr": 0.03023638994217309 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.53, + "acc_stderr": 0.05016135580465919, + "acc_norm": 0.53, + "acc_norm_stderr": 0.05016135580465919 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.648786717752235, + "acc_stderr": 0.01706998205149943, + "acc_norm": 0.648786717752235, + "acc_norm_stderr": 0.01706998205149943 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.5028901734104047, + "acc_stderr": 0.026918645383239004, + "acc_norm": 0.5028901734104047, + "acc_norm_stderr": 0.026918645383239004 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.23910614525139665, + "acc_stderr": 0.014265554192331144, + "acc_norm": 0.23910614525139665, + "acc_norm_stderr": 0.014265554192331144 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.49673202614379086, + "acc_stderr": 0.028629305194003543, + "acc_norm": 0.49673202614379086, + "acc_norm_stderr": 0.028629305194003543 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.5916398713826366, + "acc_stderr": 0.027917050748484627, + "acc_norm": 0.5916398713826366, + "acc_norm_stderr": 0.027917050748484627 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.5061728395061729, + "acc_stderr": 0.027818623962583295, + "acc_norm": 0.5061728395061729, + "acc_norm_stderr": 0.027818623962583295 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.35106382978723405, + "acc_stderr": 0.028473501272963764, + "acc_norm": 0.35106382978723405, + "acc_norm_stderr": 0.028473501272963764 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.35853976531942633, + "acc_stderr": 0.012248487319682734, + "acc_norm": 0.35853976531942633, + "acc_norm_stderr": 0.012248487319682734 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.5220588235294118, + "acc_stderr": 0.030343264224213528, + "acc_norm": 0.5220588235294118, + 
"acc_norm_stderr": 0.030343264224213528 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.44607843137254904, + "acc_stderr": 0.020109864547181354, + "acc_norm": 0.44607843137254904, + "acc_norm_stderr": 0.020109864547181354 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.5363636363636364, + "acc_stderr": 0.04776449162396197, + "acc_norm": 0.5363636363636364, + "acc_norm_stderr": 0.04776449162396197 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.4816326530612245, + "acc_stderr": 0.031987615467631264, + "acc_norm": 0.4816326530612245, + "acc_norm_stderr": 0.031987615467631264 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.6467661691542289, + "acc_stderr": 0.03379790611796777, + "acc_norm": 0.6467661691542289, + "acc_norm_stderr": 0.03379790611796777 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.61, + "acc_stderr": 0.04902071300001974, + "acc_norm": 0.61, + "acc_norm_stderr": 0.04902071300001974 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.42771084337349397, + "acc_stderr": 0.038515976837185335, + "acc_norm": 0.42771084337349397, + "acc_norm_stderr": 0.038515976837185335 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.7076023391812866, + "acc_stderr": 0.03488647713457922, + "acc_norm": 0.7076023391812866, + "acc_norm_stderr": 0.03488647713457922 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.2582619339045288, + "mc1_stderr": 0.015321821688476199, + "mc2": 0.39505922754623324, + "mc2_stderr": 0.01379379444493236 + }, + "all": { + "acc": 0.47299833231682314, + "acc_stderr": 0.03534398633986979, + "acc_norm": 0.4768955666399029, + "acc_norm_stderr": 0.0353292655095149, + "mc1": 0.2582619339045288, + "mc1_stderr": 0.015321821688476199, + "mc2": 0.39505922754623324, + "mc2_stderr": 0.01379379444493236 + } + }, + "versions": { + "harness|arc:challenge|25": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + 
"harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "all": 0 + }, + "config_tasks": { + "harness|arc:challenge": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + 
"harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task" + }, + "summary_tasks": { + "harness|arc:challenge|25": { + "hashes": { + "hash_examples": "17b0cae357c0259e", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "61571bf68d6d89aa", + "hash_cont_tokens": "e8abf848493b50f7" + }, + "truncated": 0, + "non-truncated": 4687, + "padded": 4687, + "non-padded": 0, + "effective_few_shots": 25.0, + "num_truncated_few_shots": 0 + }, + "harness|hellaswag|10": { + "hashes": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "29906669b1c7054a", + "hash_cont_tokens": "9fe0a5c42e1532db" + }, + "truncated": 0, + "non-truncated": 40168, + "padded": 40113, + "non-padded": 55, + "effective_few_shots": 10.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hashes": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "c54ff61ad0273dd7", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-anatomy|5": { + "hashes": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "be31a1e22aef5f90", + "hash_cont_tokens": "f11971a765cb609f" + }, + "truncated": 0, + "non-truncated": 540, + "padded": 540, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-astronomy|5": { + "hashes": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "277a7b1fad566940", + "hash_cont_tokens": "440a970fadecdc7b" + }, + "truncated": 0, + "non-truncated": 608, + "padded": 608, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-business_ethics|5": { + "hashes": { + "hash_examples": "33e51740670de686", + 
"hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "ba552605bc116de5", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hashes": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "428c7563d0b98ab9", + "hash_cont_tokens": "7ecd60c25b9bfe5b" + }, + "truncated": 0, + "non-truncated": 1060, + "padded": 1060, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_biology|5": { + "hashes": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "da036601573942e2", + "hash_cont_tokens": "875cde3af7a0ee14" + }, + "truncated": 0, + "non-truncated": 576, + "padded": 576, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_chemistry|5": { + "hashes": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "94e0196d6aded13d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_computer_science|5": { + "hashes": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "6e4d0f4a8d36690b", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_mathematics|5": { + "hashes": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "614054d17109a25d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_medicine|5": { + "hashes": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "1d633b3cc0524ba8", + "hash_cont_tokens": "702fb6d82ff0d6ac" + }, + "truncated": 0, + "non-truncated": 692, + "padded": 692, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_physics|5": { + "hashes": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "5421d9a1af86cbd4", + "hash_cont_tokens": "f7b8097afc16a47c" + }, + "truncated": 0, + "non-truncated": 408, + "padded": 408, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-computer_security|5": { + "hashes": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "5e6b70ecb333cf18", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hashes": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "c2ef11a87264ceed", + "hash_cont_tokens": "aa0e8bc655f2f641" + }, + "truncated": 
0, + "non-truncated": 940, + "padded": 940, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-econometrics|5": { + "hashes": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "ecaccd912a4c3978", + "hash_cont_tokens": "b1cc6e7e9fcd3827" + }, + "truncated": 0, + "non-truncated": 456, + "padded": 456, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hashes": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "1590c84291399be8", + "hash_cont_tokens": "2425a3f084a591ef" + }, + "truncated": 0, + "non-truncated": 580, + "padded": 580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hashes": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "3269597f715b0da1", + "hash_cont_tokens": "bd87bf0c060fd925" + }, + "truncated": 0, + "non-truncated": 1512, + "padded": 1512, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-formal_logic|5": { + "hashes": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "a2800d20f3ab8d7c", + "hash_cont_tokens": "eb8932890e0605db" + }, + "truncated": 0, + "non-truncated": 504, + "padded": 504, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-global_facts|5": { + "hashes": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "94ed44b3772505ad", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_biology|5": { + "hashes": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "24423acb928db768", + "hash_cont_tokens": "1ddcb86d28cde266" + }, + "truncated": 0, + "non-truncated": 1240, + "padded": 1240, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hashes": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "831ff35c474e5cef", + "hash_cont_tokens": "176c8dcff38c5f8f" + }, + "truncated": 0, + "non-truncated": 812, + "padded": 812, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hashes": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "8c34e0f2bda77358", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hashes": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "f1f73dd687da18d7", + "hash_cont_tokens": "674fc454bdc5ac93" + }, + "truncated": 660, + "non-truncated": 0, + "padded": 0, + "non-padded": 660, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + 
}, + "harness|hendrycksTest-high_school_geography|5": { + "hashes": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "7c5547c7da5bc793", + "hash_cont_tokens": "03a5012b916274ea" + }, + "truncated": 0, + "non-truncated": 792, + "padded": 792, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hashes": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "f62991cb6a496b05", + "hash_cont_tokens": "873d2aab226ba1d8" + }, + "truncated": 0, + "non-truncated": 772, + "padded": 772, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hashes": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "4cef2aff6e3d59ed", + "hash_cont_tokens": "c583432ad27fcfe0" + }, + "truncated": 0, + "non-truncated": 1560, + "padded": 1560, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hashes": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "6e2577ea4082ed2b", + "hash_cont_tokens": "d7907b61bcb8c123" + }, + "truncated": 0, + "non-truncated": 1080, + "padded": 1080, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hashes": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "c5fc9aeb1079c8e4", + "hash_cont_tokens": "f47f041de50333b9" + }, + "truncated": 0, + "non-truncated": 952, + "padded": 952, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_physics|5": { + "hashes": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "555fc385cffa84ca", + "hash_cont_tokens": "0d56317b3e5eedb5" + }, + "truncated": 0, + "non-truncated": 604, + "padded": 604, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hashes": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "febd23cbf9973b7f", + "hash_cont_tokens": "09ba1243e7390c0f" + }, + "truncated": 0, + "non-truncated": 2180, + "padded": 2180, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hashes": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "424b02981230ee83", + "hash_cont_tokens": "9cc29889c3d3f77d" + }, + "truncated": 0, + "non-truncated": 864, + "padded": 864, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hashes": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "50c9ff438c85a69e", + "hash_cont_tokens": "cdd0b3dc06d933e5" + }, + "truncated": 816, + "non-truncated": 0, + "padded": 0, + "non-padded": 816, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hashes": { + 
"hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "054824cc474caef5", + "hash_cont_tokens": "e02816433ff28daf" + }, + "truncated": 8, + "non-truncated": 940, + "padded": 940, + "non-padded": 8, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_aging|5": { + "hashes": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "541a75f071dcf579", + "hash_cont_tokens": "142a4a8a1138a214" + }, + "truncated": 0, + "non-truncated": 892, + "padded": 892, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_sexuality|5": { + "hashes": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "04269e5c5a257dd9", + "hash_cont_tokens": "bc54813e809b796d" + }, + "truncated": 0, + "non-truncated": 524, + "padded": 524, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-international_law|5": { + "hashes": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "d93ba9d9d38e4397", + "hash_cont_tokens": "8ea8c5ff76a15bca" + }, + "truncated": 0, + "non-truncated": 484, + "padded": 484, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-jurisprudence|5": { + "hashes": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "9eeaccd2698b4f5a", + "hash_cont_tokens": "e3a8cd951b6e3469" + }, + "truncated": 0, + "non-truncated": 432, + "padded": 432, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hashes": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "b4f08f544f2b7576", + "hash_cont_tokens": "3e9e0bdc248fd88a" + }, + "truncated": 0, + "non-truncated": 652, + "padded": 648, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-machine_learning|5": { + "hashes": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "900c2a51f1174b9f", + "hash_cont_tokens": "55b12fb138c6a064" + }, + "truncated": 0, + "non-truncated": 448, + "padded": 448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-management|5": { + "hashes": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "6b36efb4689c6eca", + "hash_cont_tokens": "a01d6d39a83c4597" + }, + "truncated": 0, + "non-truncated": 412, + "padded": 412, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-marketing|5": { + "hashes": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "2aaac78a0cfed47a", + "hash_cont_tokens": "6aeaed4d823c98aa" + }, + "truncated": 0, + "non-truncated": 936, + "padded": 936, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-medical_genetics|5": { + "hashes": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "886ca823b41c094a", + "hash_cont_tokens": "50421e30bef398f9" + }, + 
"truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-miscellaneous|5": { + "hashes": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "72fd71de7675e7d0", + "hash_cont_tokens": "9b0ab02a64603081" + }, + "truncated": 0, + "non-truncated": 3132, + "padded": 3132, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_disputes|5": { + "hashes": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "f3ca0dd8e7a1eb09", + "hash_cont_tokens": "3b8bbe9108e55ce9" + }, + "truncated": 0, + "non-truncated": 1384, + "padded": 1354, + "non-padded": 30, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hashes": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "3e793631e951f23c", + "hash_cont_tokens": "3e9bfc0362e97330" + }, + "truncated": 0, + "non-truncated": 3580, + "padded": 3580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-nutrition|5": { + "hashes": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "59753c2144ea93af", + "hash_cont_tokens": "23b2dc6ee2da4cfc" + }, + "truncated": 0, + "non-truncated": 1224, + "padded": 1224, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-philosophy|5": { + "hashes": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "bd8d3dbed15a8c34", + "hash_cont_tokens": "9f6ff69d23a48783" + }, + "truncated": 0, + "non-truncated": 1244, + "padded": 1244, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-prehistory|5": { + "hashes": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "3573cd87facbb7c5", + "hash_cont_tokens": "d6458d743d875837" + }, + "truncated": 0, + "non-truncated": 1296, + "padded": 1296, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_accounting|5": { + "hashes": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "17e721bc1a7cbb47", + "hash_cont_tokens": "922a195f53a35662" + }, + "truncated": 0, + "non-truncated": 1128, + "padded": 1128, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_law|5": { + "hashes": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "9178e10bd0763ec4", + "hash_cont_tokens": "2e590029ef41fbcd" + }, + "truncated": 604, + "non-truncated": 5532, + "padded": 5524, + "non-padded": 612, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_medicine|5": { + "hashes": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "f5a22012a54f70ea", + "hash_cont_tokens": "7cfee54dbddd5a98" + }, + "truncated": 0, + "non-truncated": 1088, + "padded": 1088, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + 
"harness|hendrycksTest-professional_psychology|5": { + "hashes": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "0dfb73a8eb3f692c", + "hash_cont_tokens": "a86677b2a45c20e1" + }, + "truncated": 0, + "non-truncated": 2448, + "padded": 2448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-public_relations|5": { + "hashes": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "1710c6ba4c9f3cbd", + "hash_cont_tokens": "0d756ccaae031757" + }, + "truncated": 0, + "non-truncated": 440, + "padded": 440, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-security_studies|5": { + "hashes": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "d49711415961ced7", + "hash_cont_tokens": "b2229bc2cfbf594b" + }, + "truncated": 0, + "non-truncated": 980, + "padded": 980, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-sociology|5": { + "hashes": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "828999f7624cbe7e", + "hash_cont_tokens": "c3a3bdfd177eed5b" + }, + "truncated": 0, + "non-truncated": 804, + "padded": 804, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hashes": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "42054621e718dbee", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-virology|5": { + "hashes": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "6c4f0aa4dc859c04", + "hash_cont_tokens": "af8b3658088cb37f" + }, + "truncated": 0, + "non-truncated": 664, + "padded": 664, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-world_religions|5": { + "hashes": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "6c75d44e092ff24f", + "hash_cont_tokens": "060118bef6de4e0a" + }, + "truncated": 0, + "non-truncated": 684, + "padded": 684, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|truthfulqa:mc|0": { + "hashes": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "2738d7ed7075faa7", + "hash_cont_tokens": "f5da56a132aab151" + }, + "truncated": 0, + "non-truncated": 9996, + "padded": 9996, + "non-padded": 0, + "effective_few_shots": 0.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "d84d18e9a963753d", + "hash_full_prompts": "12b540783521a8e6", + "hash_input_tokens": "6fecf578c508db6a", + "hash_cont_tokens": "71d56183130fecbd" + }, + "total_evaluation_time_secondes": "4959.418930053711", + "truncated": 2088, + "non-truncated": 108931, + "padded": 108834, + "non-padded": 2185, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/TinyPixel/elm-test/results_2023-10-28T16-54-03.304592.json 
b/eval-results/TinyPixel/elm-test/results_2023-10-28T16-54-03.304592.json new file mode 100644 index 0000000000000000000000000000000000000000..c06bdc1af8e77dc5be2fe7caafa3ec552b815b14 --- /dev/null +++ b/eval-results/TinyPixel/elm-test/results_2023-10-28T16-54-03.304592.json @@ -0,0 +1,107 @@ +{ + "config_general": { + "model_name": "TinyPixel/elm-test", + "model_sha": "aa8f81624d897aa493474bcd96dc3feae9f7a535", + "model_size": "12.58 GB", + "model_dtype": "torch.bfloat16", + "lighteval_sha": "0f318ecf002208468154899217b3ba7c6ae09374", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|drop|3": { + "em": 0.0012583892617449664, + "em_stderr": 0.0003630560893119392, + "f1": 0.05654886744966456, + "f1_stderr": 0.0013251750673152706 + }, + "harness|gsm8k|5": { + "acc": 0.07505686125852919, + "acc_stderr": 0.007257633145486643 + }, + "harness|winogrande|5": { + "acc": 0.7434885556432518, + "acc_stderr": 0.012273648008759996 + }, + "all": { + "em": 0.0012583892617449664, + "em_stderr": 0.0003630560893119392, + "f1": 0.05654886744966456, + "f1_stderr": 0.0013251750673152706, + "acc": 0.4092727084508905, + "acc_stderr": 0.00976564057712332 + } + }, + "versions": { + "harness|drop|3": 1, + "harness|gsm8k|5": 0, + "harness|winogrande|5": 0, + "all": 0 + }, + "config_tasks": { + "harness|drop": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|drop|3": { + "hashes": { + "hash_examples": "1d27416e8324e9a3", + "hash_full_prompts": "a5513ff9a741b385", + "hash_input_tokens": "61b608e0b5ceed76", + "hash_cont_tokens": "91ebc292723b7ace" + }, + "truncated": 1263, + "non-truncated": 8273, + "padded": 0, + "non-padded": 9536, + "effective_few_shots": 3.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "bda342e47b5099b2", + "hash_cont_tokens": "41853c6e7d29c69a" + }, + "truncated": 0, + "non-truncated": 1319, + "padded": 0, + "non-padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "c0bedf98cb040854", + "hash_cont_tokens": "f08975ad6f2d5864" + }, + "truncated": 0, + "non-truncated": 2534, + "padded": 2432, + "non-padded": 102, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "9b4d8993161e637d", + "hash_full_prompts": "08215e527b7e60a5", + "hash_input_tokens": "80afe720f936f8d2", + "hash_cont_tokens": "9686fe7f055183b6" + }, + "total_evaluation_time_secondes": "10362.112861156464", + "truncated": 1263, + "non-truncated": 12126, + "padded": 2432, + "non-padded": 10957, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/TinyPixel/lima-test/results_2023-08-28T09-10-45.645303.json b/eval-results/TinyPixel/lima-test/results_2023-08-28T09-10-45.645303.json new file mode 100644 index 0000000000000000000000000000000000000000..f8a29a6a49c3306d5fa345e4e6fc7cae9fce32a9 --- /dev/null +++ b/eval-results/TinyPixel/lima-test/results_2023-08-28T09-10-45.645303.json @@ -0,0 +1,1366 @@ +{ + "config_general": { + "model_name": "TinyPixel/lima-test", + "model_sha": "4d6a006c6341f29b11c02f19bf9535f51b4da1b5", + "model_dtype": 
"torch.bfloat16", + "lighteval_sha": "c8a907ca0dbabbcc3132b1b9d84d5c763d587820", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|arc:challenge|25": { + "acc": 0.49146757679180886, + "acc_stderr": 0.01460926316563219, + "acc_norm": 0.5307167235494881, + "acc_norm_stderr": 0.014583792546304037 + }, + "harness|hellaswag|10": { + "acc": 0.5900219079864569, + "acc_stderr": 0.004908241354310212, + "acc_norm": 0.7887870942043418, + "acc_norm_stderr": 0.004073349176133355 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.3, + "acc_stderr": 0.046056618647183814, + "acc_norm": 0.3, + "acc_norm_stderr": 0.046056618647183814 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.45925925925925926, + "acc_stderr": 0.04304979692464242, + "acc_norm": 0.45925925925925926, + "acc_norm_stderr": 0.04304979692464242 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.3881578947368421, + "acc_stderr": 0.03965842097512744, + "acc_norm": 0.3881578947368421, + "acc_norm_stderr": 0.03965842097512744 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.51, + "acc_stderr": 0.05024183937956912, + "acc_norm": 0.51, + "acc_norm_stderr": 0.05024183937956912 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.46037735849056605, + "acc_stderr": 0.030676096599389184, + "acc_norm": 0.46037735849056605, + "acc_norm_stderr": 0.030676096599389184 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.4513888888888889, + "acc_stderr": 0.04161402398403279, + "acc_norm": 0.4513888888888889, + "acc_norm_stderr": 0.04161402398403279 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.31, + "acc_stderr": 0.04648231987117316, + "acc_norm": 0.31, + "acc_norm_stderr": 0.04648231987117316 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.36, + "acc_stderr": 0.048241815132442176, + "acc_norm": 0.36, + "acc_norm_stderr": 0.048241815132442176 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.32, + "acc_stderr": 0.04688261722621505, + "acc_norm": 0.32, + "acc_norm_stderr": 0.04688261722621505 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.42196531791907516, + "acc_stderr": 0.037657466938651504, + "acc_norm": 0.42196531791907516, + "acc_norm_stderr": 0.037657466938651504 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.23529411764705882, + "acc_stderr": 0.042207736591714534, + "acc_norm": 0.23529411764705882, + "acc_norm_stderr": 0.042207736591714534 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.61, + "acc_stderr": 0.04902071300001975, + "acc_norm": 0.61, + "acc_norm_stderr": 0.04902071300001975 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.43829787234042555, + "acc_stderr": 0.032436186361081004, + "acc_norm": 0.43829787234042555, + "acc_norm_stderr": 0.032436186361081004 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.2982456140350877, + "acc_stderr": 0.04303684033537315, + "acc_norm": 0.2982456140350877, + "acc_norm_stderr": 0.04303684033537315 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.47586206896551725, + "acc_stderr": 0.041618085035015295, + "acc_norm": 0.47586206896551725, + "acc_norm_stderr": 0.041618085035015295 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.25925925925925924, + "acc_stderr": 0.022569897074918407, + "acc_norm": 0.25925925925925924, + "acc_norm_stderr": 0.022569897074918407 + }, + 
"harness|hendrycksTest-formal_logic|5": { + "acc": 0.30952380952380953, + "acc_stderr": 0.04134913018303316, + "acc_norm": 0.30952380952380953, + "acc_norm_stderr": 0.04134913018303316 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.31, + "acc_stderr": 0.04648231987117316, + "acc_norm": 0.31, + "acc_norm_stderr": 0.04648231987117316 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.4967741935483871, + "acc_stderr": 0.02844341422643833, + "acc_norm": 0.4967741935483871, + "acc_norm_stderr": 0.02844341422643833 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.3399014778325123, + "acc_stderr": 0.0333276906841079, + "acc_norm": 0.3399014778325123, + "acc_norm_stderr": 0.0333276906841079 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.39, + "acc_stderr": 0.04902071300001974, + "acc_norm": 0.39, + "acc_norm_stderr": 0.04902071300001974 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.6363636363636364, + "acc_stderr": 0.03756335775187898, + "acc_norm": 0.6363636363636364, + "acc_norm_stderr": 0.03756335775187898 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.47474747474747475, + "acc_stderr": 0.03557806245087314, + "acc_norm": 0.47474747474747475, + "acc_norm_stderr": 0.03557806245087314 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.6787564766839378, + "acc_stderr": 0.033699508685490674, + "acc_norm": 0.6787564766839378, + "acc_norm_stderr": 0.033699508685490674 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.44871794871794873, + "acc_stderr": 0.025217315184846482, + "acc_norm": 0.44871794871794873, + "acc_norm_stderr": 0.025217315184846482 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.28888888888888886, + "acc_stderr": 0.02763490726417854, + "acc_norm": 0.28888888888888886, + "acc_norm_stderr": 0.02763490726417854 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.42436974789915966, + "acc_stderr": 0.03210479051015776, + "acc_norm": 0.42436974789915966, + "acc_norm_stderr": 0.03210479051015776 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.2980132450331126, + "acc_stderr": 0.03734535676787198, + "acc_norm": 0.2980132450331126, + "acc_norm_stderr": 0.03734535676787198 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.6275229357798165, + "acc_stderr": 0.020728368457638497, + "acc_norm": 0.6275229357798165, + "acc_norm_stderr": 0.020728368457638497 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.2777777777777778, + "acc_stderr": 0.03054674526495318, + "acc_norm": 0.2777777777777778, + "acc_norm_stderr": 0.03054674526495318 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.5294117647058824, + "acc_stderr": 0.03503235296367992, + "acc_norm": 0.5294117647058824, + "acc_norm_stderr": 0.03503235296367992 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.5949367088607594, + "acc_stderr": 0.03195514741370671, + "acc_norm": 0.5949367088607594, + "acc_norm_stderr": 0.03195514741370671 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.5560538116591929, + "acc_stderr": 0.03334625674242728, + "acc_norm": 0.5560538116591929, + "acc_norm_stderr": 0.03334625674242728 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.5725190839694656, + "acc_stderr": 0.04338920305792401, + "acc_norm": 0.5725190839694656, + "acc_norm_stderr": 0.04338920305792401 + }, + 
"harness|hendrycksTest-international_law|5": { + "acc": 0.628099173553719, + "acc_stderr": 0.04412015806624504, + "acc_norm": 0.628099173553719, + "acc_norm_stderr": 0.04412015806624504 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.5370370370370371, + "acc_stderr": 0.04820403072760628, + "acc_norm": 0.5370370370370371, + "acc_norm_stderr": 0.04820403072760628 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.5153374233128835, + "acc_stderr": 0.039265223787088445, + "acc_norm": 0.5153374233128835, + "acc_norm_stderr": 0.039265223787088445 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.35714285714285715, + "acc_stderr": 0.04547960999764376, + "acc_norm": 0.35714285714285715, + "acc_norm_stderr": 0.04547960999764376 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.5631067961165048, + "acc_stderr": 0.049111471073657764, + "acc_norm": 0.5631067961165048, + "acc_norm_stderr": 0.049111471073657764 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.6965811965811965, + "acc_stderr": 0.030118210106942638, + "acc_norm": 0.6965811965811965, + "acc_norm_stderr": 0.030118210106942638 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.52, + "acc_stderr": 0.050211673156867795, + "acc_norm": 0.52, + "acc_norm_stderr": 0.050211673156867795 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.6462324393358876, + "acc_stderr": 0.017098184708161903, + "acc_norm": 0.6462324393358876, + "acc_norm_stderr": 0.017098184708161903 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.49421965317919075, + "acc_stderr": 0.026917296179149116, + "acc_norm": 0.49421965317919075, + "acc_norm_stderr": 0.026917296179149116 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.23798882681564246, + "acc_stderr": 0.014242630070574915, + "acc_norm": 0.23798882681564246, + "acc_norm_stderr": 0.014242630070574915 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.4869281045751634, + "acc_stderr": 0.028620130800700246, + "acc_norm": 0.4869281045751634, + "acc_norm_stderr": 0.028620130800700246 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.6012861736334405, + "acc_stderr": 0.0278093225857745, + "acc_norm": 0.6012861736334405, + "acc_norm_stderr": 0.0278093225857745 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.5092592592592593, + "acc_stderr": 0.027815973433878014, + "acc_norm": 0.5092592592592593, + "acc_norm_stderr": 0.027815973433878014 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.35815602836879434, + "acc_stderr": 0.028602085862759422, + "acc_norm": 0.35815602836879434, + "acc_norm_stderr": 0.028602085862759422 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.36766623207301175, + "acc_stderr": 0.012314845910071695, + "acc_norm": 0.36766623207301175, + "acc_norm_stderr": 0.012314845910071695 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.5220588235294118, + "acc_stderr": 0.030343264224213535, + "acc_norm": 0.5220588235294118, + "acc_norm_stderr": 0.030343264224213535 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.4542483660130719, + "acc_stderr": 0.02014297455379519, + "acc_norm": 0.4542483660130719, + "acc_norm_stderr": 0.02014297455379519 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.5636363636363636, + "acc_stderr": 0.04750185058907296, + "acc_norm": 0.5636363636363636, + "acc_norm_stderr": 0.04750185058907296 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.45714285714285713, + 
"acc_stderr": 0.031891418324213966, + "acc_norm": 0.45714285714285713, + "acc_norm_stderr": 0.031891418324213966 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.6318407960199005, + "acc_stderr": 0.03410410565495301, + "acc_norm": 0.6318407960199005, + "acc_norm_stderr": 0.03410410565495301 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.64, + "acc_stderr": 0.04824181513244218, + "acc_norm": 0.64, + "acc_norm_stderr": 0.04824181513244218 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.4036144578313253, + "acc_stderr": 0.038194861407583984, + "acc_norm": 0.4036144578313253, + "acc_norm_stderr": 0.038194861407583984 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.7134502923976608, + "acc_stderr": 0.03467826685703826, + "acc_norm": 0.7134502923976608, + "acc_norm_stderr": 0.03467826685703826 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.2533659730722154, + "mc1_stderr": 0.015225899340826842, + "mc2": 0.3939743360593484, + "mc2_stderr": 0.013599478672319854 + }, + "all": { + "acc": 0.4667611741672404, + "acc_stderr": 0.03523325461503941, + "acc_norm": 0.47079531540411435, + "acc_norm_stderr": 0.035218672194742714, + "mc1": 0.2533659730722154, + "mc1_stderr": 0.015225899340826842, + "mc2": 0.3939743360593484, + "mc2_stderr": 0.013599478672319854 + } + }, + "versions": { + "harness|arc:challenge|25": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + 
"harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "all": 0 + }, + "config_tasks": { + "harness|arc:challenge": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + 
"harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task" + }, + "summary_tasks": { + "harness|arc:challenge|25": { + "hashes": { + "hash_examples": "17b0cae357c0259e", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "61571bf68d6d89aa", + "hash_cont_tokens": "ede2b335438f08e9" + }, + "truncated": 0, + "non-truncated": 4687, + "padded": 4687, + "non-padded": 0, + "effective_few_shots": 25.0, + "num_truncated_few_shots": 0 + }, + "harness|hellaswag|10": { + "hashes": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "29906669b1c7054a", + "hash_cont_tokens": "b41cf1ad182d68d5" + }, + "truncated": 0, + "non-truncated": 40168, + "padded": 40113, + "non-padded": 55, + "effective_few_shots": 10.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hashes": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "c54ff61ad0273dd7", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-anatomy|5": { + "hashes": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "be31a1e22aef5f90", + "hash_cont_tokens": "f11971a765cb609f" + }, + "truncated": 0, + "non-truncated": 540, + "padded": 540, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-astronomy|5": { + "hashes": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "277a7b1fad566940", + "hash_cont_tokens": "238bd86950544b29" + }, + "truncated": 0, + "non-truncated": 608, + "padded": 608, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-business_ethics|5": { + "hashes": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "ba552605bc116de5", + "hash_cont_tokens": "f9d6d2a7d7e9a041" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hashes": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "428c7563d0b98ab9", + "hash_cont_tokens": "6af58623d0d5fbcd" + }, + "truncated": 0, + "non-truncated": 1060, + 
"padded": 1060, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_biology|5": { + "hashes": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "da036601573942e2", + "hash_cont_tokens": "875cde3af7a0ee14" + }, + "truncated": 0, + "non-truncated": 576, + "padded": 576, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_chemistry|5": { + "hashes": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "94e0196d6aded13d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_computer_science|5": { + "hashes": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "6e4d0f4a8d36690b", + "hash_cont_tokens": "1ba0c71186b1505e" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_mathematics|5": { + "hashes": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "614054d17109a25d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_medicine|5": { + "hashes": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "1d633b3cc0524ba8", + "hash_cont_tokens": "702fb6d82ff0d6ac" + }, + "truncated": 0, + "non-truncated": 692, + "padded": 692, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_physics|5": { + "hashes": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "5421d9a1af86cbd4", + "hash_cont_tokens": "f7b8097afc16a47c" + }, + "truncated": 0, + "non-truncated": 408, + "padded": 408, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-computer_security|5": { + "hashes": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "5e6b70ecb333cf18", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hashes": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "c2ef11a87264ceed", + "hash_cont_tokens": "aa0e8bc655f2f641" + }, + "truncated": 0, + "non-truncated": 940, + "padded": 940, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-econometrics|5": { + "hashes": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "ecaccd912a4c3978", + "hash_cont_tokens": "a9b1f761089f6acc" + }, + "truncated": 0, + "non-truncated": 456, + "padded": 456, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-electrical_engineering|5": 
{ + "hashes": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "1590c84291399be8", + "hash_cont_tokens": "2425a3f084a591ef" + }, + "truncated": 0, + "non-truncated": 580, + "padded": 580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hashes": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "3269597f715b0da1", + "hash_cont_tokens": "eb2d5002052b5bc5" + }, + "truncated": 0, + "non-truncated": 1512, + "padded": 1512, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-formal_logic|5": { + "hashes": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "a2800d20f3ab8d7c", + "hash_cont_tokens": "9b30dc19c9b62f60" + }, + "truncated": 0, + "non-truncated": 504, + "padded": 504, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-global_facts|5": { + "hashes": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "94ed44b3772505ad", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_biology|5": { + "hashes": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "24423acb928db768", + "hash_cont_tokens": "74217a4e2868536f" + }, + "truncated": 0, + "non-truncated": 1240, + "padded": 1240, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hashes": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "831ff35c474e5cef", + "hash_cont_tokens": "bf39544be0ebf000" + }, + "truncated": 0, + "non-truncated": 812, + "padded": 812, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hashes": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "8c34e0f2bda77358", + "hash_cont_tokens": "43570b3948564b64" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hashes": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "f1f73dd687da18d7", + "hash_cont_tokens": "674fc454bdc5ac93" + }, + "truncated": 660, + "non-truncated": 0, + "padded": 0, + "non-padded": 660, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_geography|5": { + "hashes": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "7c5547c7da5bc793", + "hash_cont_tokens": "03a5012b916274ea" + }, + "truncated": 0, + "non-truncated": 792, + "padded": 792, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hashes": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + 
"hash_input_tokens": "f62991cb6a496b05", + "hash_cont_tokens": "50ab225c2f535210" + }, + "truncated": 0, + "non-truncated": 772, + "padded": 772, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hashes": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "4cef2aff6e3d59ed", + "hash_cont_tokens": "c583432ad27fcfe0" + }, + "truncated": 0, + "non-truncated": 1560, + "padded": 1560, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hashes": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "6e2577ea4082ed2b", + "hash_cont_tokens": "1194078d4e38c984" + }, + "truncated": 0, + "non-truncated": 1080, + "padded": 1080, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hashes": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "c5fc9aeb1079c8e4", + "hash_cont_tokens": "f47f041de50333b9" + }, + "truncated": 0, + "non-truncated": 952, + "padded": 952, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_physics|5": { + "hashes": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "555fc385cffa84ca", + "hash_cont_tokens": "6296151cf7fee15c" + }, + "truncated": 0, + "non-truncated": 604, + "padded": 604, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hashes": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "febd23cbf9973b7f", + "hash_cont_tokens": "a490d3db0ea5935a" + }, + "truncated": 0, + "non-truncated": 2180, + "padded": 2180, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hashes": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "424b02981230ee83", + "hash_cont_tokens": "6830ef7d0325d7ef" + }, + "truncated": 0, + "non-truncated": 864, + "padded": 864, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hashes": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "50c9ff438c85a69e", + "hash_cont_tokens": "cdd0b3dc06d933e5" + }, + "truncated": 816, + "non-truncated": 0, + "padded": 0, + "non-padded": 816, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hashes": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "054824cc474caef5", + "hash_cont_tokens": "e0203e3fc1bb0500" + }, + "truncated": 8, + "non-truncated": 940, + "padded": 940, + "non-padded": 8, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_aging|5": { + "hashes": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "541a75f071dcf579", + "hash_cont_tokens": "142a4a8a1138a214" + }, + "truncated": 
0, + "non-truncated": 892, + "padded": 892, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_sexuality|5": { + "hashes": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "04269e5c5a257dd9", + "hash_cont_tokens": "bc54813e809b796d" + }, + "truncated": 0, + "non-truncated": 524, + "padded": 524, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-international_law|5": { + "hashes": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "d93ba9d9d38e4397", + "hash_cont_tokens": "63435df622d5437b" + }, + "truncated": 0, + "non-truncated": 484, + "padded": 484, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-jurisprudence|5": { + "hashes": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "9eeaccd2698b4f5a", + "hash_cont_tokens": "e3a8cd951b6e3469" + }, + "truncated": 0, + "non-truncated": 432, + "padded": 432, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hashes": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "b4f08f544f2b7576", + "hash_cont_tokens": "5e6ee2ff0404f23c" + }, + "truncated": 0, + "non-truncated": 652, + "padded": 648, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-machine_learning|5": { + "hashes": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "900c2a51f1174b9f", + "hash_cont_tokens": "c81919424db3b267" + }, + "truncated": 0, + "non-truncated": 448, + "padded": 448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-management|5": { + "hashes": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "6b36efb4689c6eca", + "hash_cont_tokens": "a01d6d39a83c4597" + }, + "truncated": 0, + "non-truncated": 412, + "padded": 412, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-marketing|5": { + "hashes": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "2aaac78a0cfed47a", + "hash_cont_tokens": "6aeaed4d823c98aa" + }, + "truncated": 0, + "non-truncated": 936, + "padded": 936, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-medical_genetics|5": { + "hashes": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "886ca823b41c094a", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-miscellaneous|5": { + "hashes": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "72fd71de7675e7d0", + "hash_cont_tokens": "9b0ab02a64603081" + }, + "truncated": 0, + "non-truncated": 3132, + "padded": 3132, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_disputes|5": { + 
"hashes": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "f3ca0dd8e7a1eb09", + "hash_cont_tokens": "3b8bbe9108e55ce9" + }, + "truncated": 0, + "non-truncated": 1384, + "padded": 1354, + "non-padded": 30, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hashes": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "3e793631e951f23c", + "hash_cont_tokens": "2eae753a177d5460" + }, + "truncated": 0, + "non-truncated": 3580, + "padded": 3580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-nutrition|5": { + "hashes": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "59753c2144ea93af", + "hash_cont_tokens": "29771089bd3c65c6" + }, + "truncated": 0, + "non-truncated": 1224, + "padded": 1224, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-philosophy|5": { + "hashes": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "bd8d3dbed15a8c34", + "hash_cont_tokens": "9f6ff69d23a48783" + }, + "truncated": 0, + "non-truncated": 1244, + "padded": 1244, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-prehistory|5": { + "hashes": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "3573cd87facbb7c5", + "hash_cont_tokens": "a789a13af22308bf" + }, + "truncated": 0, + "non-truncated": 1296, + "padded": 1296, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_accounting|5": { + "hashes": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "17e721bc1a7cbb47", + "hash_cont_tokens": "5129a9cfb30c5239" + }, + "truncated": 0, + "non-truncated": 1128, + "padded": 1128, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_law|5": { + "hashes": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "9178e10bd0763ec4", + "hash_cont_tokens": "2e590029ef41fbcd" + }, + "truncated": 604, + "non-truncated": 5532, + "padded": 5524, + "non-padded": 612, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_medicine|5": { + "hashes": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "f5a22012a54f70ea", + "hash_cont_tokens": "cd82e108370cece8" + }, + "truncated": 0, + "non-truncated": 1088, + "padded": 1088, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_psychology|5": { + "hashes": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "0dfb73a8eb3f692c", + "hash_cont_tokens": "61ef0c8a87f9c92d" + }, + "truncated": 0, + "non-truncated": 2448, + "padded": 2448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-public_relations|5": { + "hashes": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "1710c6ba4c9f3cbd", + 
"hash_cont_tokens": "568f585a259965c1" + }, + "truncated": 0, + "non-truncated": 440, + "padded": 440, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-security_studies|5": { + "hashes": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "d49711415961ced7", + "hash_cont_tokens": "d70cfe096d4fb7bd" + }, + "truncated": 0, + "non-truncated": 980, + "padded": 980, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-sociology|5": { + "hashes": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "828999f7624cbe7e", + "hash_cont_tokens": "c3a3bdfd177eed5b" + }, + "truncated": 0, + "non-truncated": 804, + "padded": 804, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hashes": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "42054621e718dbee", + "hash_cont_tokens": "2568d0e8e36fa959" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-virology|5": { + "hashes": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "6c4f0aa4dc859c04", + "hash_cont_tokens": "c178cccd753d9bc5" + }, + "truncated": 0, + "non-truncated": 664, + "padded": 664, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-world_religions|5": { + "hashes": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "6c75d44e092ff24f", + "hash_cont_tokens": "0a3a3ea5ef49d19c" + }, + "truncated": 0, + "non-truncated": 684, + "padded": 684, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|truthfulqa:mc|0": { + "hashes": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "2738d7ed7075faa7", + "hash_cont_tokens": "6d1691881e252df0" + }, + "truncated": 0, + "non-truncated": 9996, + "padded": 9996, + "non-padded": 0, + "effective_few_shots": 0.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "d84d18e9a963753d", + "hash_full_prompts": "12b540783521a8e6", + "hash_input_tokens": "6fecf578c508db6a", + "hash_cont_tokens": "f4b7b7f3a2788768" + }, + "total_evaluation_time_secondes": "4756.200440645218", + "truncated": 2088, + "non-truncated": 108931, + "padded": 108834, + "non-padded": 2185, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/TinyPixel/lima-test/results_2023-10-17T19-33-22.756804.json b/eval-results/TinyPixel/lima-test/results_2023-10-17T19-33-22.756804.json new file mode 100644 index 0000000000000000000000000000000000000000..c029a9089881374618a924c3cfb336859adc9b9c --- /dev/null +++ b/eval-results/TinyPixel/lima-test/results_2023-10-17T19-33-22.756804.json @@ -0,0 +1,107 @@ +{ + "config_general": { + "model_name": "TinyPixel/lima-test", + "model_sha": "7f92ca8f1445eeb7fa16c2509cc5a3b7d6a2e212", + "model_size": "12.58 GB", + "model_dtype": "torch.bfloat16", + "lighteval_sha": "0f318ecf002208468154899217b3ba7c6ae09374", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + 
"override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|drop|3": { + "em": 0.001363255033557047, + "em_stderr": 0.0003778609196461008, + "f1": 0.05645763422818797, + "f1_stderr": 0.0013211879752480866 + }, + "harness|gsm8k|5": { + "acc": 0.07960576194086429, + "acc_stderr": 0.007455924338676286 + }, + "harness|winogrande|5": { + "acc": 0.7403314917127072, + "acc_stderr": 0.012322700705552667 + }, + "all": { + "em": 0.001363255033557047, + "em_stderr": 0.0003778609196461008, + "f1": 0.05645763422818797, + "f1_stderr": 0.0013211879752480866, + "acc": 0.40996862682678575, + "acc_stderr": 0.009889312522114477 + } + }, + "versions": { + "harness|drop|3": 1, + "harness|gsm8k|5": 0, + "harness|winogrande|5": 0, + "all": 0 + }, + "config_tasks": { + "harness|drop": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|drop|3": { + "hashes": { + "hash_examples": "1d27416e8324e9a3", + "hash_full_prompts": "a5513ff9a741b385", + "hash_input_tokens": "61b608e0b5ceed76", + "hash_cont_tokens": "3091f1212ec81213" + }, + "truncated": 1263, + "non-truncated": 8273, + "padded": 0, + "non-padded": 9536, + "effective_few_shots": 3.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "bda342e47b5099b2", + "hash_cont_tokens": "00a9a7f4c8014322" + }, + "truncated": 0, + "non-truncated": 1319, + "padded": 0, + "non-padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "c0bedf98cb040854", + "hash_cont_tokens": "f08975ad6f2d5864" + }, + "truncated": 0, + "non-truncated": 2534, + "padded": 2432, + "non-padded": 102, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "9b4d8993161e637d", + "hash_full_prompts": "08215e527b7e60a5", + "hash_input_tokens": "80afe720f936f8d2", + "hash_cont_tokens": "584b10f9865d2908" + }, + "total_evaluation_time_secondes": "9751.945108652115", + "truncated": 1263, + "non-truncated": 12126, + "padded": 2432, + "non-padded": 10957, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/TinyPixel/llama2-7b-instruct/results_2023-08-17T12-12-37.965756.json b/eval-results/TinyPixel/llama2-7b-instruct/results_2023-08-17T12-12-37.965756.json new file mode 100644 index 0000000000000000000000000000000000000000..b607bfac5ab1eb1e917561065ad247d5cf64f145 --- /dev/null +++ b/eval-results/TinyPixel/llama2-7b-instruct/results_2023-08-17T12-12-37.965756.json @@ -0,0 +1,1365 @@ +{ + "results": { + "harness|arc:challenge|25": { + "acc": 0.49829351535836175, + "acc_stderr": 0.01461130570505699, + "acc_norm": 0.5358361774744027, + "acc_norm_stderr": 0.01457381366473572 + }, + "harness|hellaswag|10": { + "acc": 0.5910177255526787, + "acc_stderr": 0.004906411984476793, + "acc_norm": 0.7877912766381199, + "acc_norm_stderr": 0.00408036220825117 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.3, + "acc_stderr": 0.046056618647183814, + "acc_norm": 0.3, + "acc_norm_stderr": 0.046056618647183814 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.45925925925925926, + "acc_stderr": 0.04304979692464242, + "acc_norm": 0.45925925925925926, + "acc_norm_stderr": 
0.04304979692464242 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.39473684210526316, + "acc_stderr": 0.039777499346220734, + "acc_norm": 0.39473684210526316, + "acc_norm_stderr": 0.039777499346220734 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.5, + "acc_stderr": 0.050251890762960605, + "acc_norm": 0.5, + "acc_norm_stderr": 0.050251890762960605 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.4528301886792453, + "acc_stderr": 0.03063562795796182, + "acc_norm": 0.4528301886792453, + "acc_norm_stderr": 0.03063562795796182 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.4513888888888889, + "acc_stderr": 0.04161402398403279, + "acc_norm": 0.4513888888888889, + "acc_norm_stderr": 0.04161402398403279 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.33, + "acc_stderr": 0.047258156262526045, + "acc_norm": 0.33, + "acc_norm_stderr": 0.047258156262526045 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.39, + "acc_stderr": 0.04902071300001975, + "acc_norm": 0.39, + "acc_norm_stderr": 0.04902071300001975 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.32, + "acc_stderr": 0.04688261722621505, + "acc_norm": 0.32, + "acc_norm_stderr": 0.04688261722621505 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.43352601156069365, + "acc_stderr": 0.03778621079092055, + "acc_norm": 0.43352601156069365, + "acc_norm_stderr": 0.03778621079092055 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.23529411764705882, + "acc_stderr": 0.04220773659171453, + "acc_norm": 0.23529411764705882, + "acc_norm_stderr": 0.04220773659171453 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.61, + "acc_stderr": 0.04902071300001975, + "acc_norm": 0.61, + "acc_norm_stderr": 0.04902071300001975 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.4340425531914894, + "acc_stderr": 0.03240038086792747, + "acc_norm": 0.4340425531914894, + "acc_norm_stderr": 0.03240038086792747 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.2719298245614035, + "acc_stderr": 0.041857744240220554, + "acc_norm": 0.2719298245614035, + "acc_norm_stderr": 0.041857744240220554 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.46206896551724136, + "acc_stderr": 0.041546596717075474, + "acc_norm": 0.46206896551724136, + "acc_norm_stderr": 0.041546596717075474 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.2724867724867725, + "acc_stderr": 0.022930973071633366, + "acc_norm": 0.2724867724867725, + "acc_norm_stderr": 0.022930973071633366 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.29365079365079366, + "acc_stderr": 0.04073524322147126, + "acc_norm": 0.29365079365079366, + "acc_norm_stderr": 0.04073524322147126 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.31, + "acc_stderr": 0.04648231987117316, + "acc_norm": 0.31, + "acc_norm_stderr": 0.04648231987117316 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.4935483870967742, + "acc_stderr": 0.02844163823354051, + "acc_norm": 0.4935483870967742, + "acc_norm_stderr": 0.02844163823354051 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.32019704433497537, + "acc_stderr": 0.032826493853041504, + "acc_norm": 0.32019704433497537, + "acc_norm_stderr": 0.032826493853041504 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.36, + "acc_stderr": 0.04824181513244218, + "acc_norm": 0.36, + "acc_norm_stderr": 
0.04824181513244218 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.6303030303030303, + "acc_stderr": 0.03769430314512566, + "acc_norm": 0.6303030303030303, + "acc_norm_stderr": 0.03769430314512566 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.48484848484848486, + "acc_stderr": 0.03560716516531061, + "acc_norm": 0.48484848484848486, + "acc_norm_stderr": 0.03560716516531061 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.6839378238341969, + "acc_stderr": 0.033553973696861736, + "acc_norm": 0.6839378238341969, + "acc_norm_stderr": 0.033553973696861736 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.4282051282051282, + "acc_stderr": 0.025088301454694834, + "acc_norm": 0.4282051282051282, + "acc_norm_stderr": 0.025088301454694834 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.2740740740740741, + "acc_stderr": 0.027195934804085622, + "acc_norm": 0.2740740740740741, + "acc_norm_stderr": 0.027195934804085622 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.4411764705882353, + "acc_stderr": 0.0322529423239964, + "acc_norm": 0.4411764705882353, + "acc_norm_stderr": 0.0322529423239964 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.2980132450331126, + "acc_stderr": 0.03734535676787198, + "acc_norm": 0.2980132450331126, + "acc_norm_stderr": 0.03734535676787198 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.6220183486238532, + "acc_stderr": 0.02078918706672811, + "acc_norm": 0.6220183486238532, + "acc_norm_stderr": 0.02078918706672811 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.24537037037037038, + "acc_stderr": 0.029346665094372937, + "acc_norm": 0.24537037037037038, + "acc_norm_stderr": 0.029346665094372937 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.5245098039215687, + "acc_stderr": 0.03505093194348798, + "acc_norm": 0.5245098039215687, + "acc_norm_stderr": 0.03505093194348798 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.6033755274261603, + "acc_stderr": 0.03184399873811225, + "acc_norm": 0.6033755274261603, + "acc_norm_stderr": 0.03184399873811225 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.5515695067264574, + "acc_stderr": 0.033378837362550984, + "acc_norm": 0.5515695067264574, + "acc_norm_stderr": 0.033378837362550984 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.5267175572519084, + "acc_stderr": 0.04379024936553894, + "acc_norm": 0.5267175572519084, + "acc_norm_stderr": 0.04379024936553894 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.628099173553719, + "acc_stderr": 0.044120158066245044, + "acc_norm": 0.628099173553719, + "acc_norm_stderr": 0.044120158066245044 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.5370370370370371, + "acc_stderr": 0.04820403072760628, + "acc_norm": 0.5370370370370371, + "acc_norm_stderr": 0.04820403072760628 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.50920245398773, + "acc_stderr": 0.03927705600787443, + "acc_norm": 0.50920245398773, + "acc_norm_stderr": 0.03927705600787443 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.35714285714285715, + "acc_stderr": 0.04547960999764376, + "acc_norm": 0.35714285714285715, + "acc_norm_stderr": 0.04547960999764376 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.5728155339805825, + "acc_stderr": 0.048979577377811674, + "acc_norm": 
0.5728155339805825, + "acc_norm_stderr": 0.048979577377811674 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.688034188034188, + "acc_stderr": 0.030351527323344937, + "acc_norm": 0.688034188034188, + "acc_norm_stderr": 0.030351527323344937 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.54, + "acc_stderr": 0.05009082659620332, + "acc_norm": 0.54, + "acc_norm_stderr": 0.05009082659620332 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.6411238825031929, + "acc_stderr": 0.017152991797501342, + "acc_norm": 0.6411238825031929, + "acc_norm_stderr": 0.017152991797501342 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.49710982658959535, + "acc_stderr": 0.026918645383239015, + "acc_norm": 0.49710982658959535, + "acc_norm_stderr": 0.026918645383239015 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.23798882681564246, + "acc_stderr": 0.014242630070574915, + "acc_norm": 0.23798882681564246, + "acc_norm_stderr": 0.014242630070574915 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.49673202614379086, + "acc_stderr": 0.028629305194003543, + "acc_norm": 0.49673202614379086, + "acc_norm_stderr": 0.028629305194003543 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.6045016077170418, + "acc_stderr": 0.027770918531427838, + "acc_norm": 0.6045016077170418, + "acc_norm_stderr": 0.027770918531427838 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.5092592592592593, + "acc_stderr": 0.027815973433878014, + "acc_norm": 0.5092592592592593, + "acc_norm_stderr": 0.027815973433878014 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.36524822695035464, + "acc_stderr": 0.028723863853281278, + "acc_norm": 0.36524822695035464, + "acc_norm_stderr": 0.028723863853281278 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.36897001303780963, + "acc_stderr": 0.01232393665017486, + "acc_norm": 0.36897001303780963, + "acc_norm_stderr": 0.01232393665017486 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.5073529411764706, + "acc_stderr": 0.030369552523902173, + "acc_norm": 0.5073529411764706, + "acc_norm_stderr": 0.030369552523902173 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.44281045751633985, + "acc_stderr": 0.020095083154577344, + "acc_norm": 0.44281045751633985, + "acc_norm_stderr": 0.020095083154577344 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.5272727272727272, + "acc_stderr": 0.04782001791380061, + "acc_norm": 0.5272727272727272, + "acc_norm_stderr": 0.04782001791380061 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.46122448979591835, + "acc_stderr": 0.03191282052669277, + "acc_norm": 0.46122448979591835, + "acc_norm_stderr": 0.03191282052669277 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.6218905472636815, + "acc_stderr": 0.034288678487786564, + "acc_norm": 0.6218905472636815, + "acc_norm_stderr": 0.034288678487786564 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.65, + "acc_stderr": 0.047937248544110196, + "acc_norm": 0.65, + "acc_norm_stderr": 0.047937248544110196 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.39156626506024095, + "acc_stderr": 0.03799857454479637, + "acc_norm": 0.39156626506024095, + "acc_norm_stderr": 0.03799857454479637 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.7192982456140351, + "acc_stderr": 0.034462962170884265, + "acc_norm": 0.7192982456140351, + "acc_norm_stderr": 0.034462962170884265 + }, + "harness|truthfulqa:mc|0": { + "mc1": 
0.26438188494492043, + "mc1_stderr": 0.015438211119522512, + "mc2": 0.39481096196846566, + "mc2_stderr": 0.013796205321597201 + }, + "all": { + "acc": 0.4639503533482998, + "acc_stderr": 0.03519400615590806, + "acc_norm": 0.467921814589003, + "acc_norm_stderr": 0.03517936985393269, + "mc1": 0.26438188494492043, + "mc1_stderr": 0.015438211119522512, + "mc2": 0.39481096196846566, + "mc2_stderr": 0.013796205321597201 + } + }, + "versions": { + "harness|arc:challenge|25": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "all": 0 + }, + "config_general": { + "model_name": "TinyPixel/llama2-7b-instruct", + "model_sha": "4c0aa1032cbebeef1aad2becb5dcb613b8a1cc97", + 
"model_dtype": "torch.bfloat16", + "lighteval_sha": "efe93333f9f25e7d48cc67a6bf362e6d576f727b", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null + }, + "config_tasks": { + "harness|arc:challenge": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + 
"harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task" + }, + "summary_tasks": { + "harness|arc:challenge|25": { + "hashes": { + "hash_examples": "17b0cae357c0259e", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "61571bf68d6d89aa", + "hash_cont_tokens": "8210decc6ff6f7df" + }, + "truncated": 0, + "non-truncated": 4687, + "padded": 4687, + "non-padded": 0, + "effective_few_shots": 25.0, + "num_truncated_few_shots": 0 + }, + "harness|hellaswag|10": { + "hashes": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "29906669b1c7054a", + "hash_cont_tokens": "b3b9e9017afa63af" + }, + "truncated": 0, + "non-truncated": 40168, + "padded": 40113, + "non-padded": 55, + "effective_few_shots": 10.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hashes": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "c54ff61ad0273dd7", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-anatomy|5": { + "hashes": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "be31a1e22aef5f90", + "hash_cont_tokens": "f11971a765cb609f" + }, + "truncated": 0, + "non-truncated": 540, + "padded": 540, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-astronomy|5": { + "hashes": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "277a7b1fad566940", + "hash_cont_tokens": "bf30e5d3f48250cb" + }, + "truncated": 0, + "non-truncated": 608, + "padded": 608, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-business_ethics|5": { + "hashes": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "ba552605bc116de5", + "hash_cont_tokens": "bc1dd9b2d995eb61" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hashes": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "428c7563d0b98ab9", + "hash_cont_tokens": "890a119624b3b935" + }, + "truncated": 0, + "non-truncated": 1060, + "padded": 1060, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_biology|5": { + "hashes": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "da036601573942e2", + "hash_cont_tokens": "875cde3af7a0ee14" + }, + "truncated": 0, + "non-truncated": 576, + "padded": 576, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_chemistry|5": { + "hashes": { + 
"hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "94e0196d6aded13d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_computer_science|5": { + "hashes": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "6e4d0f4a8d36690b", + "hash_cont_tokens": "ffc0fe414cdc4a83" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_mathematics|5": { + "hashes": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "614054d17109a25d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_medicine|5": { + "hashes": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "1d633b3cc0524ba8", + "hash_cont_tokens": "1f88b00d41957d82" + }, + "truncated": 0, + "non-truncated": 692, + "padded": 692, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_physics|5": { + "hashes": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "5421d9a1af86cbd4", + "hash_cont_tokens": "f7b8097afc16a47c" + }, + "truncated": 0, + "non-truncated": 408, + "padded": 408, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-computer_security|5": { + "hashes": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "5e6b70ecb333cf18", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hashes": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "c2ef11a87264ceed", + "hash_cont_tokens": "aa0e8bc655f2f641" + }, + "truncated": 0, + "non-truncated": 940, + "padded": 940, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-econometrics|5": { + "hashes": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "ecaccd912a4c3978", + "hash_cont_tokens": "bfb7e3c3c88313f1" + }, + "truncated": 0, + "non-truncated": 456, + "padded": 456, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hashes": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "1590c84291399be8", + "hash_cont_tokens": "2425a3f084a591ef" + }, + "truncated": 0, + "non-truncated": 580, + "padded": 580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hashes": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "3269597f715b0da1", + 
"hash_cont_tokens": "f52691aef15a407b" + }, + "truncated": 0, + "non-truncated": 1512, + "padded": 1512, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-formal_logic|5": { + "hashes": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "a2800d20f3ab8d7c", + "hash_cont_tokens": "f515d598d9c21263" + }, + "truncated": 0, + "non-truncated": 504, + "padded": 504, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-global_facts|5": { + "hashes": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "94ed44b3772505ad", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_biology|5": { + "hashes": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "24423acb928db768", + "hash_cont_tokens": "bd85a4156a3613ee" + }, + "truncated": 0, + "non-truncated": 1240, + "padded": 1240, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hashes": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "831ff35c474e5cef", + "hash_cont_tokens": "a95c97af1c14e068" + }, + "truncated": 0, + "non-truncated": 812, + "padded": 812, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hashes": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "8c34e0f2bda77358", + "hash_cont_tokens": "8abfedef914e33c9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hashes": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "f1f73dd687da18d7", + "hash_cont_tokens": "674fc454bdc5ac93" + }, + "truncated": 660, + "non-truncated": 0, + "padded": 0, + "non-padded": 660, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_geography|5": { + "hashes": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "7c5547c7da5bc793", + "hash_cont_tokens": "03a5012b916274ea" + }, + "truncated": 0, + "non-truncated": 792, + "padded": 792, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hashes": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "f62991cb6a496b05", + "hash_cont_tokens": "a83effb8f76b7d7c" + }, + "truncated": 0, + "non-truncated": 772, + "padded": 772, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hashes": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "4cef2aff6e3d59ed", + "hash_cont_tokens": "c583432ad27fcfe0" + }, + "truncated": 0, + "non-truncated": 1560, + "padded": 
1560, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hashes": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "6e2577ea4082ed2b", + "hash_cont_tokens": "24f5dc613660300b" + }, + "truncated": 0, + "non-truncated": 1080, + "padded": 1080, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hashes": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "c5fc9aeb1079c8e4", + "hash_cont_tokens": "f47f041de50333b9" + }, + "truncated": 0, + "non-truncated": 952, + "padded": 952, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_physics|5": { + "hashes": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "555fc385cffa84ca", + "hash_cont_tokens": "ba2efcd283e938cc" + }, + "truncated": 0, + "non-truncated": 604, + "padded": 604, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hashes": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "febd23cbf9973b7f", + "hash_cont_tokens": "942069cd363844d9" + }, + "truncated": 0, + "non-truncated": 2180, + "padded": 2180, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hashes": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "424b02981230ee83", + "hash_cont_tokens": "955ed42b6f7fa019" + }, + "truncated": 0, + "non-truncated": 864, + "padded": 864, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hashes": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "50c9ff438c85a69e", + "hash_cont_tokens": "cdd0b3dc06d933e5" + }, + "truncated": 816, + "non-truncated": 0, + "padded": 0, + "non-padded": 816, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hashes": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "054824cc474caef5", + "hash_cont_tokens": "9a864184946033ac" + }, + "truncated": 8, + "non-truncated": 940, + "padded": 940, + "non-padded": 8, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_aging|5": { + "hashes": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "541a75f071dcf579", + "hash_cont_tokens": "142a4a8a1138a214" + }, + "truncated": 0, + "non-truncated": 892, + "padded": 892, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_sexuality|5": { + "hashes": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "04269e5c5a257dd9", + "hash_cont_tokens": "bc54813e809b796d" + }, + "truncated": 0, + "non-truncated": 524, + "padded": 524, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + 
"harness|hendrycksTest-international_law|5": { + "hashes": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "d93ba9d9d38e4397", + "hash_cont_tokens": "dc45b45fcda18e5d" + }, + "truncated": 0, + "non-truncated": 484, + "padded": 484, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-jurisprudence|5": { + "hashes": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "9eeaccd2698b4f5a", + "hash_cont_tokens": "e3a8cd951b6e3469" + }, + "truncated": 0, + "non-truncated": 432, + "padded": 432, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hashes": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "b4f08f544f2b7576", + "hash_cont_tokens": "1e80dbd30f6453d5" + }, + "truncated": 0, + "non-truncated": 652, + "padded": 648, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-machine_learning|5": { + "hashes": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "900c2a51f1174b9f", + "hash_cont_tokens": "9b37da7777378ca9" + }, + "truncated": 0, + "non-truncated": 448, + "padded": 448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-management|5": { + "hashes": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "6b36efb4689c6eca", + "hash_cont_tokens": "a01d6d39a83c4597" + }, + "truncated": 0, + "non-truncated": 412, + "padded": 412, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-marketing|5": { + "hashes": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "2aaac78a0cfed47a", + "hash_cont_tokens": "6aeaed4d823c98aa" + }, + "truncated": 0, + "non-truncated": 936, + "padded": 936, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-medical_genetics|5": { + "hashes": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "886ca823b41c094a", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-miscellaneous|5": { + "hashes": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "72fd71de7675e7d0", + "hash_cont_tokens": "9b0ab02a64603081" + }, + "truncated": 0, + "non-truncated": 3132, + "padded": 3132, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_disputes|5": { + "hashes": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "f3ca0dd8e7a1eb09", + "hash_cont_tokens": "8badf768f7b0467a" + }, + "truncated": 0, + "non-truncated": 1384, + "padded": 1354, + "non-padded": 30, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hashes": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": 
"3e793631e951f23c", + "hash_cont_tokens": "32ae620376b2bbba" + }, + "truncated": 0, + "non-truncated": 3580, + "padded": 3580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-nutrition|5": { + "hashes": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "59753c2144ea93af", + "hash_cont_tokens": "3071def75bacc404" + }, + "truncated": 0, + "non-truncated": 1224, + "padded": 1224, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-philosophy|5": { + "hashes": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "bd8d3dbed15a8c34", + "hash_cont_tokens": "9f6ff69d23a48783" + }, + "truncated": 0, + "non-truncated": 1244, + "padded": 1244, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-prehistory|5": { + "hashes": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "3573cd87facbb7c5", + "hash_cont_tokens": "de469d2b981e32a3" + }, + "truncated": 0, + "non-truncated": 1296, + "padded": 1296, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_accounting|5": { + "hashes": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "17e721bc1a7cbb47", + "hash_cont_tokens": "c46f74d2dfc7b13b" + }, + "truncated": 0, + "non-truncated": 1128, + "padded": 1128, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_law|5": { + "hashes": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "9178e10bd0763ec4", + "hash_cont_tokens": "2e590029ef41fbcd" + }, + "truncated": 604, + "non-truncated": 5532, + "padded": 5524, + "non-padded": 612, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_medicine|5": { + "hashes": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "f5a22012a54f70ea", + "hash_cont_tokens": "fe35cfa9c6ca802e" + }, + "truncated": 0, + "non-truncated": 1088, + "padded": 1088, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_psychology|5": { + "hashes": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "0dfb73a8eb3f692c", + "hash_cont_tokens": "f020fbddf72c8652" + }, + "truncated": 0, + "non-truncated": 2448, + "padded": 2448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-public_relations|5": { + "hashes": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "1710c6ba4c9f3cbd", + "hash_cont_tokens": "568f585a259965c1" + }, + "truncated": 0, + "non-truncated": 440, + "padded": 440, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-security_studies|5": { + "hashes": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "d49711415961ced7", + "hash_cont_tokens": "cc6fd7cccd64cd5d" + }, + "truncated": 0, + "non-truncated": 980, + "padded": 980, + "non-padded": 
0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-sociology|5": { + "hashes": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "828999f7624cbe7e", + "hash_cont_tokens": "c3a3bdfd177eed5b" + }, + "truncated": 0, + "non-truncated": 804, + "padded": 804, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hashes": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "42054621e718dbee", + "hash_cont_tokens": "2568d0e8e36fa959" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-virology|5": { + "hashes": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "6c4f0aa4dc859c04", + "hash_cont_tokens": "926cf60b0891f374" + }, + "truncated": 0, + "non-truncated": 664, + "padded": 664, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-world_religions|5": { + "hashes": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "6c75d44e092ff24f", + "hash_cont_tokens": "c525a5de974c1ea3" + }, + "truncated": 0, + "non-truncated": 684, + "padded": 684, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|truthfulqa:mc|0": { + "hashes": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "2738d7ed7075faa7", + "hash_cont_tokens": "c014154380b74b9e" + }, + "truncated": 0, + "non-truncated": 9996, + "padded": 9996, + "non-padded": 0, + "effective_few_shots": 0.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "d84d18e9a963753d", + "hash_full_prompts": "12b540783521a8e6", + "hash_input_tokens": "6fecf578c508db6a", + "hash_cont_tokens": "fb1646e2bdd5fc38" + }, + "total_evaluation_time_secondes": "2733.647334098816", + "truncated": 2088, + "non-truncated": 108931, + "padded": 108834, + "non-padded": 2185, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/TinyPixel/llama2-7b-oa/results_2023-08-15T14-31-12.640943.json b/eval-results/TinyPixel/llama2-7b-oa/results_2023-08-15T14-31-12.640943.json new file mode 100644 index 0000000000000000000000000000000000000000..161ca2b307de6aa1b019eac0e4ae6ac42d7d51ce --- /dev/null +++ b/eval-results/TinyPixel/llama2-7b-oa/results_2023-08-15T14-31-12.640943.json @@ -0,0 +1,1365 @@ +{ + "results": { + "harness|arc:challenge|25": { + "acc": 0.4974402730375427, + "acc_stderr": 0.014611199329843784, + "acc_norm": 0.5341296928327645, + "acc_norm_stderr": 0.014577311315231102 + }, + "harness|hellaswag|10": { + "acc": 0.5893248356901015, + "acc_stderr": 0.004909509538525167, + "acc_norm": 0.7871937860983867, + "acc_norm_stderr": 0.0040845526419036665 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.32, + "acc_stderr": 0.046882617226215034, + "acc_norm": 0.32, + "acc_norm_stderr": 0.046882617226215034 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.4666666666666667, + "acc_stderr": 0.043097329010363554, + "acc_norm": 0.4666666666666667, + "acc_norm_stderr": 0.043097329010363554 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.40131578947368424, + 
"acc_stderr": 0.03988903703336284, + "acc_norm": 0.40131578947368424, + "acc_norm_stderr": 0.03988903703336284 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.53, + "acc_stderr": 0.05016135580465919, + "acc_norm": 0.53, + "acc_norm_stderr": 0.05016135580465919 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.4641509433962264, + "acc_stderr": 0.030693675018458003, + "acc_norm": 0.4641509433962264, + "acc_norm_stderr": 0.030693675018458003 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.4513888888888889, + "acc_stderr": 0.04161402398403279, + "acc_norm": 0.4513888888888889, + "acc_norm_stderr": 0.04161402398403279 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.35, + "acc_stderr": 0.0479372485441102, + "acc_norm": 0.35, + "acc_norm_stderr": 0.0479372485441102 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.36, + "acc_stderr": 0.048241815132442176, + "acc_norm": 0.36, + "acc_norm_stderr": 0.048241815132442176 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.31, + "acc_stderr": 0.04648231987117316, + "acc_norm": 0.31, + "acc_norm_stderr": 0.04648231987117316 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.43352601156069365, + "acc_stderr": 0.03778621079092055, + "acc_norm": 0.43352601156069365, + "acc_norm_stderr": 0.03778621079092055 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.21568627450980393, + "acc_stderr": 0.040925639582376536, + "acc_norm": 0.21568627450980393, + "acc_norm_stderr": 0.040925639582376536 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.61, + "acc_stderr": 0.04902071300001975, + "acc_norm": 0.61, + "acc_norm_stderr": 0.04902071300001975 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.425531914893617, + "acc_stderr": 0.03232146916224469, + "acc_norm": 0.425531914893617, + "acc_norm_stderr": 0.03232146916224469 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.2807017543859649, + "acc_stderr": 0.042270544512322004, + "acc_norm": 0.2807017543859649, + "acc_norm_stderr": 0.042270544512322004 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.47586206896551725, + "acc_stderr": 0.041618085035015295, + "acc_norm": 0.47586206896551725, + "acc_norm_stderr": 0.041618085035015295 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.2698412698412698, + "acc_stderr": 0.02286083830923207, + "acc_norm": 0.2698412698412698, + "acc_norm_stderr": 0.02286083830923207 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.2777777777777778, + "acc_stderr": 0.04006168083848878, + "acc_norm": 0.2777777777777778, + "acc_norm_stderr": 0.04006168083848878 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.32, + "acc_stderr": 0.04688261722621503, + "acc_norm": 0.32, + "acc_norm_stderr": 0.04688261722621503 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.5, + "acc_stderr": 0.028444006199428714, + "acc_norm": 0.5, + "acc_norm_stderr": 0.028444006199428714 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.35960591133004927, + "acc_stderr": 0.033764582465095665, + "acc_norm": 0.35960591133004927, + "acc_norm_stderr": 0.033764582465095665 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.4, + "acc_stderr": 0.049236596391733084, + "acc_norm": 0.4, + "acc_norm_stderr": 0.049236596391733084 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.6242424242424243, + "acc_stderr": 
0.03781887353205982, + "acc_norm": 0.6242424242424243, + "acc_norm_stderr": 0.03781887353205982 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.494949494949495, + "acc_stderr": 0.035621707606254015, + "acc_norm": 0.494949494949495, + "acc_norm_stderr": 0.035621707606254015 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.694300518134715, + "acc_stderr": 0.03324837939758159, + "acc_norm": 0.694300518134715, + "acc_norm_stderr": 0.03324837939758159 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.441025641025641, + "acc_stderr": 0.025174048384000763, + "acc_norm": 0.441025641025641, + "acc_norm_stderr": 0.025174048384000763 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.2851851851851852, + "acc_stderr": 0.0275285992103405, + "acc_norm": 0.2851851851851852, + "acc_norm_stderr": 0.0275285992103405 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.4495798319327731, + "acc_stderr": 0.03231293497137707, + "acc_norm": 0.4495798319327731, + "acc_norm_stderr": 0.03231293497137707 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.304635761589404, + "acc_stderr": 0.03757949922943342, + "acc_norm": 0.304635761589404, + "acc_norm_stderr": 0.03757949922943342 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.6330275229357798, + "acc_stderr": 0.020664675659520525, + "acc_norm": 0.6330275229357798, + "acc_norm_stderr": 0.020664675659520525 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.27314814814814814, + "acc_stderr": 0.030388051301678116, + "acc_norm": 0.27314814814814814, + "acc_norm_stderr": 0.030388051301678116 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.5294117647058824, + "acc_stderr": 0.03503235296367992, + "acc_norm": 0.5294117647058824, + "acc_norm_stderr": 0.03503235296367992 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.5991561181434599, + "acc_stderr": 0.031900803894732356, + "acc_norm": 0.5991561181434599, + "acc_norm_stderr": 0.031900803894732356 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.5605381165919282, + "acc_stderr": 0.03331092511038179, + "acc_norm": 0.5605381165919282, + "acc_norm_stderr": 0.03331092511038179 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.5419847328244275, + "acc_stderr": 0.04369802690578756, + "acc_norm": 0.5419847328244275, + "acc_norm_stderr": 0.04369802690578756 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.6528925619834711, + "acc_stderr": 0.04345724570292534, + "acc_norm": 0.6528925619834711, + "acc_norm_stderr": 0.04345724570292534 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.5277777777777778, + "acc_stderr": 0.048262172941398944, + "acc_norm": 0.5277777777777778, + "acc_norm_stderr": 0.048262172941398944 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.5214723926380368, + "acc_stderr": 0.03924746876751129, + "acc_norm": 0.5214723926380368, + "acc_norm_stderr": 0.03924746876751129 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.375, + "acc_stderr": 0.04595091388086298, + "acc_norm": 0.375, + "acc_norm_stderr": 0.04595091388086298 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.5631067961165048, + "acc_stderr": 0.049111471073657764, + "acc_norm": 0.5631067961165048, + "acc_norm_stderr": 0.049111471073657764 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.7008547008547008, + "acc_stderr": 0.029996951858349472, 
+ "acc_norm": 0.7008547008547008, + "acc_norm_stderr": 0.029996951858349472 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.54, + "acc_stderr": 0.05009082659620332, + "acc_norm": 0.54, + "acc_norm_stderr": 0.05009082659620332 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.644955300127714, + "acc_stderr": 0.017112085772772994, + "acc_norm": 0.644955300127714, + "acc_norm_stderr": 0.017112085772772994 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.5057803468208093, + "acc_stderr": 0.026917296179149116, + "acc_norm": 0.5057803468208093, + "acc_norm_stderr": 0.026917296179149116 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.23798882681564246, + "acc_stderr": 0.014242630070574915, + "acc_norm": 0.23798882681564246, + "acc_norm_stderr": 0.014242630070574915 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.4934640522875817, + "acc_stderr": 0.028627470550556047, + "acc_norm": 0.4934640522875817, + "acc_norm_stderr": 0.028627470550556047 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.6045016077170418, + "acc_stderr": 0.027770918531427838, + "acc_norm": 0.6045016077170418, + "acc_norm_stderr": 0.027770918531427838 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.5, + "acc_stderr": 0.02782074420373286, + "acc_norm": 0.5, + "acc_norm_stderr": 0.02782074420373286 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.35815602836879434, + "acc_stderr": 0.028602085862759422, + "acc_norm": 0.35815602836879434, + "acc_norm_stderr": 0.028602085862759422 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.3670143415906128, + "acc_stderr": 0.012310264244842125, + "acc_norm": 0.3670143415906128, + "acc_norm_stderr": 0.012310264244842125 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.5147058823529411, + "acc_stderr": 0.03035969707904612, + "acc_norm": 0.5147058823529411, + "acc_norm_stderr": 0.03035969707904612 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.4477124183006536, + "acc_stderr": 0.02011692534742242, + "acc_norm": 0.4477124183006536, + "acc_norm_stderr": 0.02011692534742242 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.5363636363636364, + "acc_stderr": 0.04776449162396197, + "acc_norm": 0.5363636363636364, + "acc_norm_stderr": 0.04776449162396197 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.4857142857142857, + "acc_stderr": 0.03199615232806287, + "acc_norm": 0.4857142857142857, + "acc_norm_stderr": 0.03199615232806287 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.6467661691542289, + "acc_stderr": 0.03379790611796777, + "acc_norm": 0.6467661691542289, + "acc_norm_stderr": 0.03379790611796777 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.63, + "acc_stderr": 0.04852365870939099, + "acc_norm": 0.63, + "acc_norm_stderr": 0.04852365870939099 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.39759036144578314, + "acc_stderr": 0.038099730845402184, + "acc_norm": 0.39759036144578314, + "acc_norm_stderr": 0.038099730845402184 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.7017543859649122, + "acc_stderr": 0.03508771929824563, + "acc_norm": 0.7017543859649122, + "acc_norm_stderr": 0.03508771929824563 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.27539779681762544, + "mc1_stderr": 0.01563813566777552, + "mc2": 0.410552466411242, + "mc2_stderr": 0.013851446004390434 + }, + "all": { + "acc": 0.4693826697156303, + "acc_stderr": 0.03524167491117509, + "acc_norm": 
0.4733582353122999, + "acc_norm_stderr": 0.035227118217764336, + "mc1": 0.27539779681762544, + "mc1_stderr": 0.01563813566777552, + "mc2": 0.410552466411242, + "mc2_stderr": 0.013851446004390434 + } + }, + "versions": { + "harness|arc:challenge|25": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "all": 0 + }, + "config_general": { + "model_name": "TinyPixel/llama2-7b-oa", + "model_sha": "f346cbe795a2dadb6da0b40d70afd4976bcae90e", + "model_dtype": "torch.bfloat16", + "lighteval_sha": "efe93333f9f25e7d48cc67a6bf362e6d576f727b", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null + }, + "config_tasks": { + 
"harness|arc:challenge": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM 
Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task" + }, + "summary_tasks": { + "harness|arc:challenge|25": { + "hashes": { + "hash_examples": "17b0cae357c0259e", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "61571bf68d6d89aa", + "hash_cont_tokens": "8210decc6ff6f7df" + }, + "truncated": 0, + "non-truncated": 4687, + "padded": 4687, + "non-padded": 0, + "effective_few_shots": 25.0, + "num_truncated_few_shots": 0 + }, + "harness|hellaswag|10": { + "hashes": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "29906669b1c7054a", + "hash_cont_tokens": "b3b9e9017afa63af" + }, + "truncated": 0, + "non-truncated": 40168, + "padded": 40113, + "non-padded": 55, + "effective_few_shots": 10.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hashes": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "c54ff61ad0273dd7", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-anatomy|5": { + "hashes": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "be31a1e22aef5f90", + "hash_cont_tokens": "f11971a765cb609f" + }, + "truncated": 0, + "non-truncated": 540, + "padded": 540, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-astronomy|5": { + "hashes": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "277a7b1fad566940", + "hash_cont_tokens": "bf30e5d3f48250cb" + }, + "truncated": 0, + "non-truncated": 608, + "padded": 608, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-business_ethics|5": { + "hashes": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "ba552605bc116de5", + "hash_cont_tokens": "bc1dd9b2d995eb61" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hashes": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "428c7563d0b98ab9", + "hash_cont_tokens": "890a119624b3b935" + }, + "truncated": 0, + "non-truncated": 1060, + "padded": 1060, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_biology|5": { + "hashes": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "da036601573942e2", + "hash_cont_tokens": "875cde3af7a0ee14" + }, + "truncated": 0, + "non-truncated": 576, + "padded": 576, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_chemistry|5": { + "hashes": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "94e0196d6aded13d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, 
+ "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_computer_science|5": { + "hashes": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "6e4d0f4a8d36690b", + "hash_cont_tokens": "ffc0fe414cdc4a83" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_mathematics|5": { + "hashes": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "614054d17109a25d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_medicine|5": { + "hashes": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "1d633b3cc0524ba8", + "hash_cont_tokens": "1f88b00d41957d82" + }, + "truncated": 0, + "non-truncated": 692, + "padded": 692, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_physics|5": { + "hashes": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "5421d9a1af86cbd4", + "hash_cont_tokens": "f7b8097afc16a47c" + }, + "truncated": 0, + "non-truncated": 408, + "padded": 408, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-computer_security|5": { + "hashes": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "5e6b70ecb333cf18", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hashes": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "c2ef11a87264ceed", + "hash_cont_tokens": "aa0e8bc655f2f641" + }, + "truncated": 0, + "non-truncated": 940, + "padded": 940, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-econometrics|5": { + "hashes": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "ecaccd912a4c3978", + "hash_cont_tokens": "bfb7e3c3c88313f1" + }, + "truncated": 0, + "non-truncated": 456, + "padded": 456, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hashes": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "1590c84291399be8", + "hash_cont_tokens": "2425a3f084a591ef" + }, + "truncated": 0, + "non-truncated": 580, + "padded": 580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hashes": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "3269597f715b0da1", + "hash_cont_tokens": "f52691aef15a407b" + }, + "truncated": 0, + "non-truncated": 1512, + "padded": 1512, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-formal_logic|5": { + "hashes": { + 
"hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "a2800d20f3ab8d7c", + "hash_cont_tokens": "f515d598d9c21263" + }, + "truncated": 0, + "non-truncated": 504, + "padded": 504, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-global_facts|5": { + "hashes": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "94ed44b3772505ad", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_biology|5": { + "hashes": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "24423acb928db768", + "hash_cont_tokens": "bd85a4156a3613ee" + }, + "truncated": 0, + "non-truncated": 1240, + "padded": 1240, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hashes": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "831ff35c474e5cef", + "hash_cont_tokens": "a95c97af1c14e068" + }, + "truncated": 0, + "non-truncated": 812, + "padded": 812, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hashes": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "8c34e0f2bda77358", + "hash_cont_tokens": "8abfedef914e33c9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hashes": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "f1f73dd687da18d7", + "hash_cont_tokens": "674fc454bdc5ac93" + }, + "truncated": 660, + "non-truncated": 0, + "padded": 0, + "non-padded": 660, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_geography|5": { + "hashes": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "7c5547c7da5bc793", + "hash_cont_tokens": "03a5012b916274ea" + }, + "truncated": 0, + "non-truncated": 792, + "padded": 792, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hashes": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "f62991cb6a496b05", + "hash_cont_tokens": "a83effb8f76b7d7c" + }, + "truncated": 0, + "non-truncated": 772, + "padded": 772, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hashes": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "4cef2aff6e3d59ed", + "hash_cont_tokens": "c583432ad27fcfe0" + }, + "truncated": 0, + "non-truncated": 1560, + "padded": 1560, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hashes": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + 
"hash_input_tokens": "6e2577ea4082ed2b", + "hash_cont_tokens": "24f5dc613660300b" + }, + "truncated": 0, + "non-truncated": 1080, + "padded": 1080, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hashes": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "c5fc9aeb1079c8e4", + "hash_cont_tokens": "f47f041de50333b9" + }, + "truncated": 0, + "non-truncated": 952, + "padded": 952, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_physics|5": { + "hashes": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "555fc385cffa84ca", + "hash_cont_tokens": "ba2efcd283e938cc" + }, + "truncated": 0, + "non-truncated": 604, + "padded": 604, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hashes": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "febd23cbf9973b7f", + "hash_cont_tokens": "942069cd363844d9" + }, + "truncated": 0, + "non-truncated": 2180, + "padded": 2180, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hashes": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "424b02981230ee83", + "hash_cont_tokens": "955ed42b6f7fa019" + }, + "truncated": 0, + "non-truncated": 864, + "padded": 864, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hashes": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "50c9ff438c85a69e", + "hash_cont_tokens": "cdd0b3dc06d933e5" + }, + "truncated": 816, + "non-truncated": 0, + "padded": 0, + "non-padded": 816, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hashes": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "054824cc474caef5", + "hash_cont_tokens": "9a864184946033ac" + }, + "truncated": 8, + "non-truncated": 940, + "padded": 940, + "non-padded": 8, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_aging|5": { + "hashes": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "541a75f071dcf579", + "hash_cont_tokens": "142a4a8a1138a214" + }, + "truncated": 0, + "non-truncated": 892, + "padded": 892, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_sexuality|5": { + "hashes": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "04269e5c5a257dd9", + "hash_cont_tokens": "bc54813e809b796d" + }, + "truncated": 0, + "non-truncated": 524, + "padded": 524, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-international_law|5": { + "hashes": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "d93ba9d9d38e4397", + "hash_cont_tokens": "dc45b45fcda18e5d" + }, + "truncated": 0, + 
"non-truncated": 484, + "padded": 484, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-jurisprudence|5": { + "hashes": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "9eeaccd2698b4f5a", + "hash_cont_tokens": "e3a8cd951b6e3469" + }, + "truncated": 0, + "non-truncated": 432, + "padded": 432, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hashes": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "b4f08f544f2b7576", + "hash_cont_tokens": "1e80dbd30f6453d5" + }, + "truncated": 0, + "non-truncated": 652, + "padded": 648, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-machine_learning|5": { + "hashes": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "900c2a51f1174b9f", + "hash_cont_tokens": "9b37da7777378ca9" + }, + "truncated": 0, + "non-truncated": 448, + "padded": 448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-management|5": { + "hashes": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "6b36efb4689c6eca", + "hash_cont_tokens": "a01d6d39a83c4597" + }, + "truncated": 0, + "non-truncated": 412, + "padded": 412, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-marketing|5": { + "hashes": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "2aaac78a0cfed47a", + "hash_cont_tokens": "6aeaed4d823c98aa" + }, + "truncated": 0, + "non-truncated": 936, + "padded": 936, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-medical_genetics|5": { + "hashes": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "886ca823b41c094a", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-miscellaneous|5": { + "hashes": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "72fd71de7675e7d0", + "hash_cont_tokens": "9b0ab02a64603081" + }, + "truncated": 0, + "non-truncated": 3132, + "padded": 3132, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_disputes|5": { + "hashes": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "f3ca0dd8e7a1eb09", + "hash_cont_tokens": "8badf768f7b0467a" + }, + "truncated": 0, + "non-truncated": 1384, + "padded": 1354, + "non-padded": 30, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hashes": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "3e793631e951f23c", + "hash_cont_tokens": "32ae620376b2bbba" + }, + "truncated": 0, + "non-truncated": 3580, + "padded": 3580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-nutrition|5": { + 
"hashes": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "59753c2144ea93af", + "hash_cont_tokens": "3071def75bacc404" + }, + "truncated": 0, + "non-truncated": 1224, + "padded": 1224, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-philosophy|5": { + "hashes": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "bd8d3dbed15a8c34", + "hash_cont_tokens": "9f6ff69d23a48783" + }, + "truncated": 0, + "non-truncated": 1244, + "padded": 1244, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-prehistory|5": { + "hashes": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "3573cd87facbb7c5", + "hash_cont_tokens": "de469d2b981e32a3" + }, + "truncated": 0, + "non-truncated": 1296, + "padded": 1296, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_accounting|5": { + "hashes": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "17e721bc1a7cbb47", + "hash_cont_tokens": "c46f74d2dfc7b13b" + }, + "truncated": 0, + "non-truncated": 1128, + "padded": 1128, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_law|5": { + "hashes": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "9178e10bd0763ec4", + "hash_cont_tokens": "2e590029ef41fbcd" + }, + "truncated": 604, + "non-truncated": 5532, + "padded": 5524, + "non-padded": 612, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_medicine|5": { + "hashes": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "f5a22012a54f70ea", + "hash_cont_tokens": "fe35cfa9c6ca802e" + }, + "truncated": 0, + "non-truncated": 1088, + "padded": 1088, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_psychology|5": { + "hashes": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "0dfb73a8eb3f692c", + "hash_cont_tokens": "f020fbddf72c8652" + }, + "truncated": 0, + "non-truncated": 2448, + "padded": 2448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-public_relations|5": { + "hashes": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "1710c6ba4c9f3cbd", + "hash_cont_tokens": "568f585a259965c1" + }, + "truncated": 0, + "non-truncated": 440, + "padded": 440, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-security_studies|5": { + "hashes": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "d49711415961ced7", + "hash_cont_tokens": "cc6fd7cccd64cd5d" + }, + "truncated": 0, + "non-truncated": 980, + "padded": 980, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-sociology|5": { + "hashes": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "828999f7624cbe7e", + 
"hash_cont_tokens": "c3a3bdfd177eed5b" + }, + "truncated": 0, + "non-truncated": 804, + "padded": 804, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hashes": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "42054621e718dbee", + "hash_cont_tokens": "2568d0e8e36fa959" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-virology|5": { + "hashes": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "6c4f0aa4dc859c04", + "hash_cont_tokens": "926cf60b0891f374" + }, + "truncated": 0, + "non-truncated": 664, + "padded": 664, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-world_religions|5": { + "hashes": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "6c75d44e092ff24f", + "hash_cont_tokens": "c525a5de974c1ea3" + }, + "truncated": 0, + "non-truncated": 684, + "padded": 684, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|truthfulqa:mc|0": { + "hashes": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "2738d7ed7075faa7", + "hash_cont_tokens": "c014154380b74b9e" + }, + "truncated": 0, + "non-truncated": 9996, + "padded": 9996, + "non-padded": 0, + "effective_few_shots": 0.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "d84d18e9a963753d", + "hash_full_prompts": "12b540783521a8e6", + "hash_input_tokens": "6fecf578c508db6a", + "hash_cont_tokens": "fb1646e2bdd5fc38" + }, + "total_evaluation_time_secondes": "2692.328539609909", + "truncated": 2088, + "non-truncated": 108931, + "padded": 108834, + "non-padded": 2185, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/TinyPixel/testmodel-3/results_2023-10-01T13-50-05.522780.json b/eval-results/TinyPixel/testmodel-3/results_2023-10-01T13-50-05.522780.json new file mode 100644 index 0000000000000000000000000000000000000000..bb26802696f5bee495a3b82691a048aeb2a8582b --- /dev/null +++ b/eval-results/TinyPixel/testmodel-3/results_2023-10-01T13-50-05.522780.json @@ -0,0 +1,1367 @@ +{ + "config_general": { + "model_name": "TinyPixel/testmodel-3", + "model_sha": "a1fbc4d8a2c1a3d211325bdff9e7f0539fa7a2b1", + "model_size": "12.58 GB", + "model_dtype": "torch.bfloat16", + "lighteval_sha": "0f318ecf002208468154899217b3ba7c6ae09374", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|arc:challenge|25": { + "acc": 0.4931740614334471, + "acc_stderr": 0.014610029151379813, + "acc_norm": 0.5324232081911263, + "acc_norm_stderr": 0.014580637569995421 + }, + "harness|hellaswag|10": { + "acc": 0.589026090420235, + "acc_stderr": 0.004910049928688087, + "acc_norm": 0.7871937860983867, + "acc_norm_stderr": 0.004084552641903664 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.31, + "acc_stderr": 0.04648231987117316, + "acc_norm": 0.31, + "acc_norm_stderr": 0.04648231987117316 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.4666666666666667, + "acc_stderr": 0.043097329010363554, + "acc_norm": 0.4666666666666667, + 
"acc_norm_stderr": 0.043097329010363554 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.40789473684210525, + "acc_stderr": 0.03999309712777471, + "acc_norm": 0.40789473684210525, + "acc_norm_stderr": 0.03999309712777471 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.51, + "acc_stderr": 0.05024183937956912, + "acc_norm": 0.51, + "acc_norm_stderr": 0.05024183937956912 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.46037735849056605, + "acc_stderr": 0.030676096599389184, + "acc_norm": 0.46037735849056605, + "acc_norm_stderr": 0.030676096599389184 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.4652777777777778, + "acc_stderr": 0.04171115858181618, + "acc_norm": 0.4652777777777778, + "acc_norm_stderr": 0.04171115858181618 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.35, + "acc_stderr": 0.047937248544110196, + "acc_norm": 0.35, + "acc_norm_stderr": 0.047937248544110196 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.34, + "acc_stderr": 0.04760952285695235, + "acc_norm": 0.34, + "acc_norm_stderr": 0.04760952285695235 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.31, + "acc_stderr": 0.04648231987117316, + "acc_norm": 0.31, + "acc_norm_stderr": 0.04648231987117316 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.42196531791907516, + "acc_stderr": 0.037657466938651504, + "acc_norm": 0.42196531791907516, + "acc_norm_stderr": 0.037657466938651504 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.23529411764705882, + "acc_stderr": 0.04220773659171453, + "acc_norm": 0.23529411764705882, + "acc_norm_stderr": 0.04220773659171453 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.61, + "acc_stderr": 0.04902071300001975, + "acc_norm": 0.61, + "acc_norm_stderr": 0.04902071300001975 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.41702127659574467, + "acc_stderr": 0.032232762667117124, + "acc_norm": 0.41702127659574467, + "acc_norm_stderr": 0.032232762667117124 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.2807017543859649, + "acc_stderr": 0.042270544512322004, + "acc_norm": 0.2807017543859649, + "acc_norm_stderr": 0.042270544512322004 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.47586206896551725, + "acc_stderr": 0.041618085035015295, + "acc_norm": 0.47586206896551725, + "acc_norm_stderr": 0.041618085035015295 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.2671957671957672, + "acc_stderr": 0.022789673145776564, + "acc_norm": 0.2671957671957672, + "acc_norm_stderr": 0.022789673145776564 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.29365079365079366, + "acc_stderr": 0.040735243221471255, + "acc_norm": 0.29365079365079366, + "acc_norm_stderr": 0.040735243221471255 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.31, + "acc_stderr": 0.04648231987117316, + "acc_norm": 0.31, + "acc_norm_stderr": 0.04648231987117316 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.5032258064516129, + "acc_stderr": 0.028443414226438316, + "acc_norm": 0.5032258064516129, + "acc_norm_stderr": 0.028443414226438316 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.35467980295566504, + "acc_stderr": 0.03366124489051451, + "acc_norm": 0.35467980295566504, + "acc_norm_stderr": 0.03366124489051451 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.39, + "acc_stderr": 0.04902071300001974, + "acc_norm": 0.39, + 
"acc_norm_stderr": 0.04902071300001974 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.6424242424242425, + "acc_stderr": 0.03742597043806586, + "acc_norm": 0.6424242424242425, + "acc_norm_stderr": 0.03742597043806586 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.4797979797979798, + "acc_stderr": 0.0355944356556392, + "acc_norm": 0.4797979797979798, + "acc_norm_stderr": 0.0355944356556392 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.6994818652849741, + "acc_stderr": 0.0330881859441575, + "acc_norm": 0.6994818652849741, + "acc_norm_stderr": 0.0330881859441575 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.43846153846153846, + "acc_stderr": 0.025158266016868564, + "acc_norm": 0.43846153846153846, + "acc_norm_stderr": 0.025158266016868564 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.2962962962962963, + "acc_stderr": 0.027840811495871923, + "acc_norm": 0.2962962962962963, + "acc_norm_stderr": 0.027840811495871923 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.4327731092436975, + "acc_stderr": 0.03218358107742613, + "acc_norm": 0.4327731092436975, + "acc_norm_stderr": 0.03218358107742613 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.31125827814569534, + "acc_stderr": 0.03780445850526733, + "acc_norm": 0.31125827814569534, + "acc_norm_stderr": 0.03780445850526733 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.6330275229357798, + "acc_stderr": 0.020664675659520525, + "acc_norm": 0.6330275229357798, + "acc_norm_stderr": 0.020664675659520525 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.2777777777777778, + "acc_stderr": 0.03054674526495318, + "acc_norm": 0.2777777777777778, + "acc_norm_stderr": 0.03054674526495318 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.5294117647058824, + "acc_stderr": 0.03503235296367992, + "acc_norm": 0.5294117647058824, + "acc_norm_stderr": 0.03503235296367992 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.6075949367088608, + "acc_stderr": 0.03178471874564729, + "acc_norm": 0.6075949367088608, + "acc_norm_stderr": 0.03178471874564729 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.5560538116591929, + "acc_stderr": 0.03334625674242728, + "acc_norm": 0.5560538116591929, + "acc_norm_stderr": 0.03334625674242728 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.5648854961832062, + "acc_stderr": 0.04348208051644858, + "acc_norm": 0.5648854961832062, + "acc_norm_stderr": 0.04348208051644858 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.6446280991735537, + "acc_stderr": 0.0436923632657398, + "acc_norm": 0.6446280991735537, + "acc_norm_stderr": 0.0436923632657398 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.5277777777777778, + "acc_stderr": 0.048262172941398944, + "acc_norm": 0.5277777777777778, + "acc_norm_stderr": 0.048262172941398944 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.5153374233128835, + "acc_stderr": 0.03926522378708843, + "acc_norm": 0.5153374233128835, + "acc_norm_stderr": 0.03926522378708843 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.36607142857142855, + "acc_stderr": 0.0457237235873743, + "acc_norm": 0.36607142857142855, + "acc_norm_stderr": 0.0457237235873743 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.5533980582524272, + "acc_stderr": 0.04922424153458933, + 
"acc_norm": 0.5533980582524272, + "acc_norm_stderr": 0.04922424153458933 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.7008547008547008, + "acc_stderr": 0.029996951858349472, + "acc_norm": 0.7008547008547008, + "acc_norm_stderr": 0.029996951858349472 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.52, + "acc_stderr": 0.05021167315686779, + "acc_norm": 0.52, + "acc_norm_stderr": 0.05021167315686779 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.6462324393358876, + "acc_stderr": 0.017098184708161906, + "acc_norm": 0.6462324393358876, + "acc_norm_stderr": 0.017098184708161906 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.5057803468208093, + "acc_stderr": 0.026917296179149116, + "acc_norm": 0.5057803468208093, + "acc_norm_stderr": 0.026917296179149116 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.23798882681564246, + "acc_stderr": 0.014242630070574915, + "acc_norm": 0.23798882681564246, + "acc_norm_stderr": 0.014242630070574915 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.48366013071895425, + "acc_stderr": 0.028614624752805413, + "acc_norm": 0.48366013071895425, + "acc_norm_stderr": 0.028614624752805413 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.5980707395498392, + "acc_stderr": 0.027846476005930477, + "acc_norm": 0.5980707395498392, + "acc_norm_stderr": 0.027846476005930477 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.5, + "acc_stderr": 0.02782074420373286, + "acc_norm": 0.5, + "acc_norm_stderr": 0.02782074420373286 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.3617021276595745, + "acc_stderr": 0.028663820147199492, + "acc_norm": 0.3617021276595745, + "acc_norm_stderr": 0.028663820147199492 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.3617992177314211, + "acc_stderr": 0.012272736233262931, + "acc_norm": 0.3617992177314211, + "acc_norm_stderr": 0.012272736233262931 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.5036764705882353, + "acc_stderr": 0.030372015885428188, + "acc_norm": 0.5036764705882353, + "acc_norm_stderr": 0.030372015885428188 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.4493464052287582, + "acc_stderr": 0.020123766528027266, + "acc_norm": 0.4493464052287582, + "acc_norm_stderr": 0.020123766528027266 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.5272727272727272, + "acc_stderr": 0.04782001791380061, + "acc_norm": 0.5272727272727272, + "acc_norm_stderr": 0.04782001791380061 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.4775510204081633, + "acc_stderr": 0.031976941187136725, + "acc_norm": 0.4775510204081633, + "acc_norm_stderr": 0.031976941187136725 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.6417910447761194, + "acc_stderr": 0.03390393042268813, + "acc_norm": 0.6417910447761194, + "acc_norm_stderr": 0.03390393042268813 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.66, + "acc_stderr": 0.04760952285695237, + "acc_norm": 0.66, + "acc_norm_stderr": 0.04760952285695237 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.4036144578313253, + "acc_stderr": 0.038194861407583984, + "acc_norm": 0.4036144578313253, + "acc_norm_stderr": 0.038194861407583984 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.7076023391812866, + "acc_stderr": 0.03488647713457922, + "acc_norm": 0.7076023391812866, + "acc_norm_stderr": 0.03488647713457922 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.25091799265605874, + 
"mc1_stderr": 0.015176985027707687, + "mc2": 0.3875189010720103, + "mc2_stderr": 0.013537362497855546 + }, + "all": { + "acc": 0.46822735252346837, + "acc_stderr": 0.035230234963000304, + "acc_norm": 0.47225136680204177, + "acc_norm_stderr": 0.03521574532116693, + "mc1": 0.25091799265605874, + "mc1_stderr": 0.015176985027707687, + "mc2": 0.3875189010720103, + "mc2_stderr": 0.013537362497855546 + } + }, + "versions": { + "harness|arc:challenge|25": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "all": 0 + }, + "config_tasks": { + "harness|arc:challenge": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM 
Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + 
"harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task" + }, + "summary_tasks": { + "harness|arc:challenge|25": { + "hashes": { + "hash_examples": "17b0cae357c0259e", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "61571bf68d6d89aa", + "hash_cont_tokens": "e8abf848493b50f7" + }, + "truncated": 0, + "non-truncated": 4687, + "padded": 4687, + "non-padded": 0, + "effective_few_shots": 25.0, + "num_truncated_few_shots": 0 + }, + "harness|hellaswag|10": { + "hashes": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "29906669b1c7054a", + "hash_cont_tokens": "9fe0a5c42e1532db" + }, + "truncated": 0, + "non-truncated": 40168, + "padded": 40113, + "non-padded": 55, + "effective_few_shots": 10.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hashes": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "c54ff61ad0273dd7", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-anatomy|5": { + "hashes": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "be31a1e22aef5f90", + "hash_cont_tokens": "f11971a765cb609f" + }, + "truncated": 0, + "non-truncated": 540, + "padded": 540, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-astronomy|5": { + "hashes": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "277a7b1fad566940", + "hash_cont_tokens": "440a970fadecdc7b" + }, + "truncated": 0, + "non-truncated": 608, + "padded": 608, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-business_ethics|5": { + "hashes": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "ba552605bc116de5", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hashes": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "428c7563d0b98ab9", + "hash_cont_tokens": "7ecd60c25b9bfe5b" + }, + "truncated": 0, + "non-truncated": 1060, + "padded": 1060, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_biology|5": { + "hashes": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "da036601573942e2", + "hash_cont_tokens": "875cde3af7a0ee14" + }, + "truncated": 0, + "non-truncated": 576, + "padded": 576, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_chemistry|5": { + "hashes": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "94e0196d6aded13d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_computer_science|5": { + "hashes": { 
+ "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "6e4d0f4a8d36690b", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_mathematics|5": { + "hashes": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "614054d17109a25d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_medicine|5": { + "hashes": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "1d633b3cc0524ba8", + "hash_cont_tokens": "702fb6d82ff0d6ac" + }, + "truncated": 0, + "non-truncated": 692, + "padded": 692, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_physics|5": { + "hashes": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "5421d9a1af86cbd4", + "hash_cont_tokens": "f7b8097afc16a47c" + }, + "truncated": 0, + "non-truncated": 408, + "padded": 408, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-computer_security|5": { + "hashes": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "5e6b70ecb333cf18", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hashes": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "c2ef11a87264ceed", + "hash_cont_tokens": "aa0e8bc655f2f641" + }, + "truncated": 0, + "non-truncated": 940, + "padded": 940, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-econometrics|5": { + "hashes": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "ecaccd912a4c3978", + "hash_cont_tokens": "b1cc6e7e9fcd3827" + }, + "truncated": 0, + "non-truncated": 456, + "padded": 456, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hashes": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "1590c84291399be8", + "hash_cont_tokens": "2425a3f084a591ef" + }, + "truncated": 0, + "non-truncated": 580, + "padded": 580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hashes": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "3269597f715b0da1", + "hash_cont_tokens": "bd87bf0c060fd925" + }, + "truncated": 0, + "non-truncated": 1512, + "padded": 1512, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-formal_logic|5": { + "hashes": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "a2800d20f3ab8d7c", + "hash_cont_tokens": 
"eb8932890e0605db" + }, + "truncated": 0, + "non-truncated": 504, + "padded": 504, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-global_facts|5": { + "hashes": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "94ed44b3772505ad", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_biology|5": { + "hashes": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "24423acb928db768", + "hash_cont_tokens": "1ddcb86d28cde266" + }, + "truncated": 0, + "non-truncated": 1240, + "padded": 1240, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hashes": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "831ff35c474e5cef", + "hash_cont_tokens": "176c8dcff38c5f8f" + }, + "truncated": 0, + "non-truncated": 812, + "padded": 812, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hashes": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "8c34e0f2bda77358", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hashes": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "f1f73dd687da18d7", + "hash_cont_tokens": "674fc454bdc5ac93" + }, + "truncated": 660, + "non-truncated": 0, + "padded": 0, + "non-padded": 660, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_geography|5": { + "hashes": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "7c5547c7da5bc793", + "hash_cont_tokens": "03a5012b916274ea" + }, + "truncated": 0, + "non-truncated": 792, + "padded": 792, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hashes": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "f62991cb6a496b05", + "hash_cont_tokens": "873d2aab226ba1d8" + }, + "truncated": 0, + "non-truncated": 772, + "padded": 772, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hashes": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "4cef2aff6e3d59ed", + "hash_cont_tokens": "c583432ad27fcfe0" + }, + "truncated": 0, + "non-truncated": 1560, + "padded": 1560, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hashes": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "6e2577ea4082ed2b", + "hash_cont_tokens": "d7907b61bcb8c123" + }, + "truncated": 0, + "non-truncated": 1080, + "padded": 1080, + 
"non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hashes": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "c5fc9aeb1079c8e4", + "hash_cont_tokens": "f47f041de50333b9" + }, + "truncated": 0, + "non-truncated": 952, + "padded": 952, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_physics|5": { + "hashes": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "555fc385cffa84ca", + "hash_cont_tokens": "0d56317b3e5eedb5" + }, + "truncated": 0, + "non-truncated": 604, + "padded": 604, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hashes": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "febd23cbf9973b7f", + "hash_cont_tokens": "09ba1243e7390c0f" + }, + "truncated": 0, + "non-truncated": 2180, + "padded": 2180, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hashes": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "424b02981230ee83", + "hash_cont_tokens": "9cc29889c3d3f77d" + }, + "truncated": 0, + "non-truncated": 864, + "padded": 864, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hashes": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "50c9ff438c85a69e", + "hash_cont_tokens": "cdd0b3dc06d933e5" + }, + "truncated": 816, + "non-truncated": 0, + "padded": 0, + "non-padded": 816, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hashes": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "054824cc474caef5", + "hash_cont_tokens": "e02816433ff28daf" + }, + "truncated": 8, + "non-truncated": 940, + "padded": 940, + "non-padded": 8, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_aging|5": { + "hashes": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "541a75f071dcf579", + "hash_cont_tokens": "142a4a8a1138a214" + }, + "truncated": 0, + "non-truncated": 892, + "padded": 892, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_sexuality|5": { + "hashes": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "04269e5c5a257dd9", + "hash_cont_tokens": "bc54813e809b796d" + }, + "truncated": 0, + "non-truncated": 524, + "padded": 524, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-international_law|5": { + "hashes": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "d93ba9d9d38e4397", + "hash_cont_tokens": "8ea8c5ff76a15bca" + }, + "truncated": 0, + "non-truncated": 484, + "padded": 484, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + 
"harness|hendrycksTest-jurisprudence|5": { + "hashes": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "9eeaccd2698b4f5a", + "hash_cont_tokens": "e3a8cd951b6e3469" + }, + "truncated": 0, + "non-truncated": 432, + "padded": 432, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hashes": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "b4f08f544f2b7576", + "hash_cont_tokens": "3e9e0bdc248fd88a" + }, + "truncated": 0, + "non-truncated": 652, + "padded": 648, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-machine_learning|5": { + "hashes": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "900c2a51f1174b9f", + "hash_cont_tokens": "55b12fb138c6a064" + }, + "truncated": 0, + "non-truncated": 448, + "padded": 448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-management|5": { + "hashes": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "6b36efb4689c6eca", + "hash_cont_tokens": "a01d6d39a83c4597" + }, + "truncated": 0, + "non-truncated": 412, + "padded": 412, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-marketing|5": { + "hashes": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "2aaac78a0cfed47a", + "hash_cont_tokens": "6aeaed4d823c98aa" + }, + "truncated": 0, + "non-truncated": 936, + "padded": 936, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-medical_genetics|5": { + "hashes": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "886ca823b41c094a", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-miscellaneous|5": { + "hashes": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "72fd71de7675e7d0", + "hash_cont_tokens": "9b0ab02a64603081" + }, + "truncated": 0, + "non-truncated": 3132, + "padded": 3132, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_disputes|5": { + "hashes": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "f3ca0dd8e7a1eb09", + "hash_cont_tokens": "3b8bbe9108e55ce9" + }, + "truncated": 0, + "non-truncated": 1384, + "padded": 1354, + "non-padded": 30, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hashes": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "3e793631e951f23c", + "hash_cont_tokens": "3e9bfc0362e97330" + }, + "truncated": 0, + "non-truncated": 3580, + "padded": 3580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-nutrition|5": { + "hashes": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": 
"59753c2144ea93af", + "hash_cont_tokens": "23b2dc6ee2da4cfc" + }, + "truncated": 0, + "non-truncated": 1224, + "padded": 1224, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-philosophy|5": { + "hashes": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "bd8d3dbed15a8c34", + "hash_cont_tokens": "9f6ff69d23a48783" + }, + "truncated": 0, + "non-truncated": 1244, + "padded": 1244, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-prehistory|5": { + "hashes": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "3573cd87facbb7c5", + "hash_cont_tokens": "d6458d743d875837" + }, + "truncated": 0, + "non-truncated": 1296, + "padded": 1296, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_accounting|5": { + "hashes": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "17e721bc1a7cbb47", + "hash_cont_tokens": "922a195f53a35662" + }, + "truncated": 0, + "non-truncated": 1128, + "padded": 1128, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_law|5": { + "hashes": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "9178e10bd0763ec4", + "hash_cont_tokens": "2e590029ef41fbcd" + }, + "truncated": 604, + "non-truncated": 5532, + "padded": 5524, + "non-padded": 612, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_medicine|5": { + "hashes": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "f5a22012a54f70ea", + "hash_cont_tokens": "7cfee54dbddd5a98" + }, + "truncated": 0, + "non-truncated": 1088, + "padded": 1088, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_psychology|5": { + "hashes": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "0dfb73a8eb3f692c", + "hash_cont_tokens": "a86677b2a45c20e1" + }, + "truncated": 0, + "non-truncated": 2448, + "padded": 2448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-public_relations|5": { + "hashes": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "1710c6ba4c9f3cbd", + "hash_cont_tokens": "0d756ccaae031757" + }, + "truncated": 0, + "non-truncated": 440, + "padded": 440, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-security_studies|5": { + "hashes": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "d49711415961ced7", + "hash_cont_tokens": "b2229bc2cfbf594b" + }, + "truncated": 0, + "non-truncated": 980, + "padded": 980, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-sociology|5": { + "hashes": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "828999f7624cbe7e", + "hash_cont_tokens": "c3a3bdfd177eed5b" + }, + "truncated": 0, + "non-truncated": 804, + "padded": 804, + "non-padded": 0, 
+ "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hashes": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "42054621e718dbee", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-virology|5": { + "hashes": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "6c4f0aa4dc859c04", + "hash_cont_tokens": "af8b3658088cb37f" + }, + "truncated": 0, + "non-truncated": 664, + "padded": 664, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-world_religions|5": { + "hashes": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "6c75d44e092ff24f", + "hash_cont_tokens": "060118bef6de4e0a" + }, + "truncated": 0, + "non-truncated": 684, + "padded": 684, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|truthfulqa:mc|0": { + "hashes": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "2738d7ed7075faa7", + "hash_cont_tokens": "f5da56a132aab151" + }, + "truncated": 0, + "non-truncated": 9996, + "padded": 9996, + "non-padded": 0, + "effective_few_shots": 0.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "d84d18e9a963753d", + "hash_full_prompts": "12b540783521a8e6", + "hash_input_tokens": "6fecf578c508db6a", + "hash_cont_tokens": "71d56183130fecbd" + }, + "total_evaluation_time_secondes": "4857.104401350021", + "truncated": 2088, + "non-truncated": 108931, + "padded": 108834, + "non-padded": 2185, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/TinyPixel/testmodel-3/results_2023-10-28T10-06-15.149786.json b/eval-results/TinyPixel/testmodel-3/results_2023-10-28T10-06-15.149786.json new file mode 100644 index 0000000000000000000000000000000000000000..eec503d743cae90151be5ad26d2a5a042ffab115 --- /dev/null +++ b/eval-results/TinyPixel/testmodel-3/results_2023-10-28T10-06-15.149786.json @@ -0,0 +1,107 @@ +{ + "config_general": { + "model_name": "TinyPixel/testmodel-3", + "model_sha": "a1fbc4d8a2c1a3d211325bdff9e7f0539fa7a2b1", + "model_size": "12.58 GB", + "model_dtype": "torch.bfloat16", + "lighteval_sha": "0f318ecf002208468154899217b3ba7c6ae09374", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|drop|3": { + "em": 0.0012583892617449664, + "em_stderr": 0.00036305608931194434, + "f1": 0.05639261744966453, + "f1_stderr": 0.0013225481636967586 + }, + "harness|gsm8k|5": { + "acc": 0.0758150113722517, + "acc_stderr": 0.0072912057231625926 + }, + "harness|winogrande|5": { + "acc": 0.7387529597474349, + "acc_stderr": 0.012346914863415303 + }, + "all": { + "em": 0.0012583892617449664, + "em_stderr": 0.00036305608931194434, + "f1": 0.05639261744966453, + "f1_stderr": 0.0013225481636967586, + "acc": 0.4072839855598433, + "acc_stderr": 0.009819060293288948 + } + }, + "versions": { + "harness|drop|3": 1, + "harness|gsm8k|5": 0, + "harness|winogrande|5": 0, + "all": 0 + }, + "config_tasks": { + "harness|drop": "LM Harness task", + "harness|gsm8k": "LM Harness task", + 
"harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|drop|3": { + "hashes": { + "hash_examples": "1d27416e8324e9a3", + "hash_full_prompts": "a5513ff9a741b385", + "hash_input_tokens": "61b608e0b5ceed76", + "hash_cont_tokens": "01078f0f2e6d5e48" + }, + "truncated": 1263, + "non-truncated": 8273, + "padded": 0, + "non-padded": 9536, + "effective_few_shots": 3.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "bda342e47b5099b2", + "hash_cont_tokens": "1ceec26b97cd623d" + }, + "truncated": 0, + "non-truncated": 1319, + "padded": 0, + "non-padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "c0bedf98cb040854", + "hash_cont_tokens": "f08975ad6f2d5864" + }, + "truncated": 0, + "non-truncated": 2534, + "padded": 2432, + "non-padded": 102, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "9b4d8993161e637d", + "hash_full_prompts": "08215e527b7e60a5", + "hash_input_tokens": "80afe720f936f8d2", + "hash_cont_tokens": "c52079f6e3f054f7" + }, + "total_evaluation_time_secondes": "10112.63693022728", + "truncated": 1263, + "non-truncated": 12126, + "padded": 2432, + "non-padded": 10957, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/TinyPixel/testmodel2/results_2023-09-18T14-28-17.558290.json b/eval-results/TinyPixel/testmodel2/results_2023-09-18T14-28-17.558290.json new file mode 100644 index 0000000000000000000000000000000000000000..07e86339fa554143be6acc936a6752f1b4bb8a55 --- /dev/null +++ b/eval-results/TinyPixel/testmodel2/results_2023-09-18T14-28-17.558290.json @@ -0,0 +1,1367 @@ +{ + "config_general": { + "model_name": "TinyPixel/testmodel2", + "model_sha": "cb1111653997cee2818ffcf13a1c37237ea2934d", + "model_size": "12.58 GB", + "model_dtype": "torch.bfloat16", + "lighteval_sha": "0f318ecf002208468154899217b3ba7c6ae09374", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|arc:challenge|25": { + "acc": 0.49146757679180886, + "acc_stderr": 0.01460926316563219, + "acc_norm": 0.5324232081911263, + "acc_norm_stderr": 0.014580637569995421 + }, + "harness|hellaswag|10": { + "acc": 0.5907189802828122, + "acc_stderr": 0.004906962980328293, + "acc_norm": 0.7877912766381199, + "acc_norm_stderr": 0.0040803622082511695 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.3, + "acc_stderr": 0.046056618647183814, + "acc_norm": 0.3, + "acc_norm_stderr": 0.046056618647183814 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.4888888888888889, + "acc_stderr": 0.04318275491977976, + "acc_norm": 0.4888888888888889, + "acc_norm_stderr": 0.04318275491977976 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.3881578947368421, + "acc_stderr": 0.03965842097512744, + "acc_norm": 0.3881578947368421, + "acc_norm_stderr": 0.03965842097512744 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.52, + "acc_stderr": 0.050211673156867795, + "acc_norm": 0.52, + "acc_norm_stderr": 0.050211673156867795 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.44150943396226416, + "acc_stderr": 0.03056159042673184, + "acc_norm": 0.44150943396226416, + 
"acc_norm_stderr": 0.03056159042673184 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.4583333333333333, + "acc_stderr": 0.04166666666666665, + "acc_norm": 0.4583333333333333, + "acc_norm_stderr": 0.04166666666666665 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.34, + "acc_stderr": 0.047609522856952344, + "acc_norm": 0.34, + "acc_norm_stderr": 0.047609522856952344 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.35, + "acc_stderr": 0.047937248544110196, + "acc_norm": 0.35, + "acc_norm_stderr": 0.047937248544110196 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.31, + "acc_stderr": 0.04648231987117316, + "acc_norm": 0.31, + "acc_norm_stderr": 0.04648231987117316 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.43352601156069365, + "acc_stderr": 0.03778621079092055, + "acc_norm": 0.43352601156069365, + "acc_norm_stderr": 0.03778621079092055 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.24509803921568626, + "acc_stderr": 0.04280105837364395, + "acc_norm": 0.24509803921568626, + "acc_norm_stderr": 0.04280105837364395 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.6, + "acc_stderr": 0.04923659639173309, + "acc_norm": 0.6, + "acc_norm_stderr": 0.04923659639173309 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.42127659574468085, + "acc_stderr": 0.03227834510146267, + "acc_norm": 0.42127659574468085, + "acc_norm_stderr": 0.03227834510146267 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.2894736842105263, + "acc_stderr": 0.04266339443159393, + "acc_norm": 0.2894736842105263, + "acc_norm_stderr": 0.04266339443159393 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.47586206896551725, + "acc_stderr": 0.041618085035015295, + "acc_norm": 0.47586206896551725, + "acc_norm_stderr": 0.041618085035015295 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.2671957671957672, + "acc_stderr": 0.02278967314577656, + "acc_norm": 0.2671957671957672, + "acc_norm_stderr": 0.02278967314577656 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.2777777777777778, + "acc_stderr": 0.040061680838488774, + "acc_norm": 0.2777777777777778, + "acc_norm_stderr": 0.040061680838488774 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.32, + "acc_stderr": 0.046882617226215034, + "acc_norm": 0.32, + "acc_norm_stderr": 0.046882617226215034 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.5096774193548387, + "acc_stderr": 0.02843867799890955, + "acc_norm": 0.5096774193548387, + "acc_norm_stderr": 0.02843867799890955 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.3497536945812808, + "acc_stderr": 0.03355400904969566, + "acc_norm": 0.3497536945812808, + "acc_norm_stderr": 0.03355400904969566 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.4, + "acc_stderr": 0.049236596391733084, + "acc_norm": 0.4, + "acc_norm_stderr": 0.049236596391733084 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.6303030303030303, + "acc_stderr": 0.03769430314512567, + "acc_norm": 0.6303030303030303, + "acc_norm_stderr": 0.03769430314512567 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.48484848484848486, + "acc_stderr": 0.0356071651653106, + "acc_norm": 0.48484848484848486, + "acc_norm_stderr": 0.0356071651653106 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.7046632124352331, + "acc_stderr": 
0.0329229663915514, + "acc_norm": 0.7046632124352331, + "acc_norm_stderr": 0.0329229663915514 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.4512820512820513, + "acc_stderr": 0.025230381238934833, + "acc_norm": 0.4512820512820513, + "acc_norm_stderr": 0.025230381238934833 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.3037037037037037, + "acc_stderr": 0.02803792996911499, + "acc_norm": 0.3037037037037037, + "acc_norm_stderr": 0.02803792996911499 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.4369747899159664, + "acc_stderr": 0.03221943636566196, + "acc_norm": 0.4369747899159664, + "acc_norm_stderr": 0.03221943636566196 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.31788079470198677, + "acc_stderr": 0.03802039760107903, + "acc_norm": 0.31788079470198677, + "acc_norm_stderr": 0.03802039760107903 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.6293577981651376, + "acc_stderr": 0.02070745816435298, + "acc_norm": 0.6293577981651376, + "acc_norm_stderr": 0.02070745816435298 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.2777777777777778, + "acc_stderr": 0.03054674526495318, + "acc_norm": 0.2777777777777778, + "acc_norm_stderr": 0.03054674526495318 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.5294117647058824, + "acc_stderr": 0.03503235296367992, + "acc_norm": 0.5294117647058824, + "acc_norm_stderr": 0.03503235296367992 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.620253164556962, + "acc_stderr": 0.031591887529658504, + "acc_norm": 0.620253164556962, + "acc_norm_stderr": 0.031591887529658504 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.5695067264573991, + "acc_stderr": 0.033231973029429394, + "acc_norm": 0.5695067264573991, + "acc_norm_stderr": 0.033231973029429394 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.549618320610687, + "acc_stderr": 0.04363643698524779, + "acc_norm": 0.549618320610687, + "acc_norm_stderr": 0.04363643698524779 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.6611570247933884, + "acc_stderr": 0.043207678075366705, + "acc_norm": 0.6611570247933884, + "acc_norm_stderr": 0.043207678075366705 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.5185185185185185, + "acc_stderr": 0.04830366024635331, + "acc_norm": 0.5185185185185185, + "acc_norm_stderr": 0.04830366024635331 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.5153374233128835, + "acc_stderr": 0.03926522378708843, + "acc_norm": 0.5153374233128835, + "acc_norm_stderr": 0.03926522378708843 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.375, + "acc_stderr": 0.04595091388086298, + "acc_norm": 0.375, + "acc_norm_stderr": 0.04595091388086298 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.5631067961165048, + "acc_stderr": 0.049111471073657764, + "acc_norm": 0.5631067961165048, + "acc_norm_stderr": 0.049111471073657764 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.7051282051282052, + "acc_stderr": 0.02987257770889119, + "acc_norm": 0.7051282051282052, + "acc_norm_stderr": 0.02987257770889119 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.54, + "acc_stderr": 0.05009082659620332, + "acc_norm": 0.54, + "acc_norm_stderr": 0.05009082659620332 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.6411238825031929, + "acc_stderr": 0.017152991797501342, + "acc_norm": 0.6411238825031929, + "acc_norm_stderr": 
0.017152991797501342 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.48265895953757226, + "acc_stderr": 0.026902900458666647, + "acc_norm": 0.48265895953757226, + "acc_norm_stderr": 0.026902900458666647 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.23798882681564246, + "acc_stderr": 0.014242630070574915, + "acc_norm": 0.23798882681564246, + "acc_norm_stderr": 0.014242630070574915 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.4934640522875817, + "acc_stderr": 0.028627470550556047, + "acc_norm": 0.4934640522875817, + "acc_norm_stderr": 0.028627470550556047 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.594855305466238, + "acc_stderr": 0.027882383791325953, + "acc_norm": 0.594855305466238, + "acc_norm_stderr": 0.027882383791325953 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.49691358024691357, + "acc_stderr": 0.027820214158594384, + "acc_norm": 0.49691358024691357, + "acc_norm_stderr": 0.027820214158594384 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.36524822695035464, + "acc_stderr": 0.028723863853281278, + "acc_norm": 0.36524822695035464, + "acc_norm_stderr": 0.028723863853281278 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.36114732724902215, + "acc_stderr": 0.01226793547751903, + "acc_norm": 0.36114732724902215, + "acc_norm_stderr": 0.01226793547751903 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.5073529411764706, + "acc_stderr": 0.030369552523902173, + "acc_norm": 0.5073529411764706, + "acc_norm_stderr": 0.030369552523902173 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.44607843137254904, + "acc_stderr": 0.020109864547181357, + "acc_norm": 0.44607843137254904, + "acc_norm_stderr": 0.020109864547181357 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.509090909090909, + "acc_stderr": 0.0478833976870286, + "acc_norm": 0.509090909090909, + "acc_norm_stderr": 0.0478833976870286 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.4816326530612245, + "acc_stderr": 0.031987615467631264, + "acc_norm": 0.4816326530612245, + "acc_norm_stderr": 0.031987615467631264 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.6318407960199005, + "acc_stderr": 0.03410410565495301, + "acc_norm": 0.6318407960199005, + "acc_norm_stderr": 0.03410410565495301 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.63, + "acc_stderr": 0.04852365870939099, + "acc_norm": 0.63, + "acc_norm_stderr": 0.04852365870939099 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.40963855421686746, + "acc_stderr": 0.03828401115079022, + "acc_norm": 0.40963855421686746, + "acc_norm_stderr": 0.03828401115079022 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.7076023391812866, + "acc_stderr": 0.03488647713457922, + "acc_norm": 0.7076023391812866, + "acc_norm_stderr": 0.03488647713457922 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.2582619339045288, + "mc1_stderr": 0.0153218216884762, + "mc2": 0.3917355712197997, + "mc2_stderr": 0.013582107635745794 + }, + "all": { + "acc": 0.46863022947647875, + "acc_stderr": 0.03525899737630071, + "acc_norm": 0.4726646011333369, + "acc_norm_stderr": 0.03524450201413607, + "mc1": 0.2582619339045288, + "mc1_stderr": 0.0153218216884762, + "mc2": 0.3917355712197997, + "mc2_stderr": 0.013582107635745794 + } + }, + "versions": { + "harness|arc:challenge|25": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + 
"harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "all": 0 + }, + "config_tasks": { + "harness|arc:challenge": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", 
+ "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task" + }, + "summary_tasks": { + "harness|arc:challenge|25": { + "hashes": { + "hash_examples": "17b0cae357c0259e", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "61571bf68d6d89aa", + "hash_cont_tokens": "e8abf848493b50f7" + }, + "truncated": 0, + "non-truncated": 4687, + "padded": 4687, + "non-padded": 0, + "effective_few_shots": 25.0, + "num_truncated_few_shots": 0 + }, + "harness|hellaswag|10": { + "hashes": { + "hash_examples": "e1768ecb99d7ecf0", + 
"hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "29906669b1c7054a", + "hash_cont_tokens": "9fe0a5c42e1532db" + }, + "truncated": 0, + "non-truncated": 40168, + "padded": 40113, + "non-padded": 55, + "effective_few_shots": 10.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hashes": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "c54ff61ad0273dd7", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-anatomy|5": { + "hashes": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "be31a1e22aef5f90", + "hash_cont_tokens": "f11971a765cb609f" + }, + "truncated": 0, + "non-truncated": 540, + "padded": 540, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-astronomy|5": { + "hashes": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "277a7b1fad566940", + "hash_cont_tokens": "440a970fadecdc7b" + }, + "truncated": 0, + "non-truncated": 608, + "padded": 608, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-business_ethics|5": { + "hashes": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "ba552605bc116de5", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hashes": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "428c7563d0b98ab9", + "hash_cont_tokens": "7ecd60c25b9bfe5b" + }, + "truncated": 0, + "non-truncated": 1060, + "padded": 1060, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_biology|5": { + "hashes": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "da036601573942e2", + "hash_cont_tokens": "875cde3af7a0ee14" + }, + "truncated": 0, + "non-truncated": 576, + "padded": 576, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_chemistry|5": { + "hashes": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "94e0196d6aded13d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_computer_science|5": { + "hashes": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "6e4d0f4a8d36690b", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_mathematics|5": { + "hashes": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "614054d17109a25d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + 
"non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_medicine|5": { + "hashes": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "1d633b3cc0524ba8", + "hash_cont_tokens": "702fb6d82ff0d6ac" + }, + "truncated": 0, + "non-truncated": 692, + "padded": 692, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_physics|5": { + "hashes": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "5421d9a1af86cbd4", + "hash_cont_tokens": "f7b8097afc16a47c" + }, + "truncated": 0, + "non-truncated": 408, + "padded": 408, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-computer_security|5": { + "hashes": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "5e6b70ecb333cf18", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hashes": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "c2ef11a87264ceed", + "hash_cont_tokens": "aa0e8bc655f2f641" + }, + "truncated": 0, + "non-truncated": 940, + "padded": 940, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-econometrics|5": { + "hashes": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "ecaccd912a4c3978", + "hash_cont_tokens": "b1cc6e7e9fcd3827" + }, + "truncated": 0, + "non-truncated": 456, + "padded": 456, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hashes": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "1590c84291399be8", + "hash_cont_tokens": "2425a3f084a591ef" + }, + "truncated": 0, + "non-truncated": 580, + "padded": 580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hashes": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "3269597f715b0da1", + "hash_cont_tokens": "bd87bf0c060fd925" + }, + "truncated": 0, + "non-truncated": 1512, + "padded": 1512, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-formal_logic|5": { + "hashes": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "a2800d20f3ab8d7c", + "hash_cont_tokens": "eb8932890e0605db" + }, + "truncated": 0, + "non-truncated": 504, + "padded": 504, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-global_facts|5": { + "hashes": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "94ed44b3772505ad", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + 
"harness|hendrycksTest-high_school_biology|5": { + "hashes": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "24423acb928db768", + "hash_cont_tokens": "1ddcb86d28cde266" + }, + "truncated": 0, + "non-truncated": 1240, + "padded": 1240, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hashes": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "831ff35c474e5cef", + "hash_cont_tokens": "176c8dcff38c5f8f" + }, + "truncated": 0, + "non-truncated": 812, + "padded": 812, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hashes": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "8c34e0f2bda77358", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hashes": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "f1f73dd687da18d7", + "hash_cont_tokens": "674fc454bdc5ac93" + }, + "truncated": 660, + "non-truncated": 0, + "padded": 0, + "non-padded": 660, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_geography|5": { + "hashes": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "7c5547c7da5bc793", + "hash_cont_tokens": "03a5012b916274ea" + }, + "truncated": 0, + "non-truncated": 792, + "padded": 792, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hashes": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "f62991cb6a496b05", + "hash_cont_tokens": "873d2aab226ba1d8" + }, + "truncated": 0, + "non-truncated": 772, + "padded": 772, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hashes": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "4cef2aff6e3d59ed", + "hash_cont_tokens": "c583432ad27fcfe0" + }, + "truncated": 0, + "non-truncated": 1560, + "padded": 1560, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hashes": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "6e2577ea4082ed2b", + "hash_cont_tokens": "d7907b61bcb8c123" + }, + "truncated": 0, + "non-truncated": 1080, + "padded": 1080, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hashes": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "c5fc9aeb1079c8e4", + "hash_cont_tokens": "f47f041de50333b9" + }, + "truncated": 0, + "non-truncated": 952, + "padded": 952, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_physics|5": { + "hashes": { + 
"hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "555fc385cffa84ca", + "hash_cont_tokens": "0d56317b3e5eedb5" + }, + "truncated": 0, + "non-truncated": 604, + "padded": 604, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hashes": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "febd23cbf9973b7f", + "hash_cont_tokens": "09ba1243e7390c0f" + }, + "truncated": 0, + "non-truncated": 2180, + "padded": 2180, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hashes": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "424b02981230ee83", + "hash_cont_tokens": "9cc29889c3d3f77d" + }, + "truncated": 0, + "non-truncated": 864, + "padded": 864, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hashes": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "50c9ff438c85a69e", + "hash_cont_tokens": "cdd0b3dc06d933e5" + }, + "truncated": 816, + "non-truncated": 0, + "padded": 0, + "non-padded": 816, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hashes": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "054824cc474caef5", + "hash_cont_tokens": "e02816433ff28daf" + }, + "truncated": 8, + "non-truncated": 940, + "padded": 940, + "non-padded": 8, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_aging|5": { + "hashes": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "541a75f071dcf579", + "hash_cont_tokens": "142a4a8a1138a214" + }, + "truncated": 0, + "non-truncated": 892, + "padded": 892, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_sexuality|5": { + "hashes": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "04269e5c5a257dd9", + "hash_cont_tokens": "bc54813e809b796d" + }, + "truncated": 0, + "non-truncated": 524, + "padded": 524, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-international_law|5": { + "hashes": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "d93ba9d9d38e4397", + "hash_cont_tokens": "8ea8c5ff76a15bca" + }, + "truncated": 0, + "non-truncated": 484, + "padded": 484, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-jurisprudence|5": { + "hashes": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "9eeaccd2698b4f5a", + "hash_cont_tokens": "e3a8cd951b6e3469" + }, + "truncated": 0, + "non-truncated": 432, + "padded": 432, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hashes": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "b4f08f544f2b7576", + 
"hash_cont_tokens": "3e9e0bdc248fd88a" + }, + "truncated": 0, + "non-truncated": 652, + "padded": 648, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-machine_learning|5": { + "hashes": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "900c2a51f1174b9f", + "hash_cont_tokens": "55b12fb138c6a064" + }, + "truncated": 0, + "non-truncated": 448, + "padded": 448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-management|5": { + "hashes": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "6b36efb4689c6eca", + "hash_cont_tokens": "a01d6d39a83c4597" + }, + "truncated": 0, + "non-truncated": 412, + "padded": 412, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-marketing|5": { + "hashes": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "2aaac78a0cfed47a", + "hash_cont_tokens": "6aeaed4d823c98aa" + }, + "truncated": 0, + "non-truncated": 936, + "padded": 936, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-medical_genetics|5": { + "hashes": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "886ca823b41c094a", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-miscellaneous|5": { + "hashes": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "72fd71de7675e7d0", + "hash_cont_tokens": "9b0ab02a64603081" + }, + "truncated": 0, + "non-truncated": 3132, + "padded": 3132, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_disputes|5": { + "hashes": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "f3ca0dd8e7a1eb09", + "hash_cont_tokens": "3b8bbe9108e55ce9" + }, + "truncated": 0, + "non-truncated": 1384, + "padded": 1354, + "non-padded": 30, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hashes": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "3e793631e951f23c", + "hash_cont_tokens": "3e9bfc0362e97330" + }, + "truncated": 0, + "non-truncated": 3580, + "padded": 3580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-nutrition|5": { + "hashes": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "59753c2144ea93af", + "hash_cont_tokens": "23b2dc6ee2da4cfc" + }, + "truncated": 0, + "non-truncated": 1224, + "padded": 1224, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-philosophy|5": { + "hashes": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "bd8d3dbed15a8c34", + "hash_cont_tokens": "9f6ff69d23a48783" + }, + "truncated": 0, + "non-truncated": 1244, + "padded": 1244, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 
+ }, + "harness|hendrycksTest-prehistory|5": { + "hashes": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "3573cd87facbb7c5", + "hash_cont_tokens": "d6458d743d875837" + }, + "truncated": 0, + "non-truncated": 1296, + "padded": 1296, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_accounting|5": { + "hashes": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "17e721bc1a7cbb47", + "hash_cont_tokens": "922a195f53a35662" + }, + "truncated": 0, + "non-truncated": 1128, + "padded": 1128, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_law|5": { + "hashes": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "9178e10bd0763ec4", + "hash_cont_tokens": "2e590029ef41fbcd" + }, + "truncated": 604, + "non-truncated": 5532, + "padded": 5524, + "non-padded": 612, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_medicine|5": { + "hashes": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "f5a22012a54f70ea", + "hash_cont_tokens": "7cfee54dbddd5a98" + }, + "truncated": 0, + "non-truncated": 1088, + "padded": 1088, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_psychology|5": { + "hashes": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "0dfb73a8eb3f692c", + "hash_cont_tokens": "a86677b2a45c20e1" + }, + "truncated": 0, + "non-truncated": 2448, + "padded": 2448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-public_relations|5": { + "hashes": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "1710c6ba4c9f3cbd", + "hash_cont_tokens": "0d756ccaae031757" + }, + "truncated": 0, + "non-truncated": 440, + "padded": 440, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-security_studies|5": { + "hashes": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "d49711415961ced7", + "hash_cont_tokens": "b2229bc2cfbf594b" + }, + "truncated": 0, + "non-truncated": 980, + "padded": 980, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-sociology|5": { + "hashes": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "828999f7624cbe7e", + "hash_cont_tokens": "c3a3bdfd177eed5b" + }, + "truncated": 0, + "non-truncated": 804, + "padded": 804, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hashes": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "42054621e718dbee", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-virology|5": { + "hashes": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", 
+ "hash_input_tokens": "6c4f0aa4dc859c04", + "hash_cont_tokens": "af8b3658088cb37f" + }, + "truncated": 0, + "non-truncated": 664, + "padded": 664, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-world_religions|5": { + "hashes": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "6c75d44e092ff24f", + "hash_cont_tokens": "060118bef6de4e0a" + }, + "truncated": 0, + "non-truncated": 684, + "padded": 684, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|truthfulqa:mc|0": { + "hashes": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "2738d7ed7075faa7", + "hash_cont_tokens": "f5da56a132aab151" + }, + "truncated": 0, + "non-truncated": 9996, + "padded": 9996, + "non-padded": 0, + "effective_few_shots": 0.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "d84d18e9a963753d", + "hash_full_prompts": "12b540783521a8e6", + "hash_input_tokens": "6fecf578c508db6a", + "hash_cont_tokens": "71d56183130fecbd" + }, + "total_evaluation_time_secondes": "4938.020981550217", + "truncated": 2088, + "non-truncated": 108931, + "padded": 108834, + "non-padded": 2185, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/TinyPixel/testmodel2/results_2023-10-24T13-54-24.629963.json b/eval-results/TinyPixel/testmodel2/results_2023-10-24T13-54-24.629963.json new file mode 100644 index 0000000000000000000000000000000000000000..ae529fdad6441d9f2985665fad5d4d43f7bcfef1 --- /dev/null +++ b/eval-results/TinyPixel/testmodel2/results_2023-10-24T13-54-24.629963.json @@ -0,0 +1,107 @@ +{ + "config_general": { + "model_name": "TinyPixel/testmodel2", + "model_sha": "cb1111653997cee2818ffcf13a1c37237ea2934d", + "model_size": "12.58 GB", + "model_dtype": "torch.bfloat16", + "lighteval_sha": "0f318ecf002208468154899217b3ba7c6ae09374", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|drop|3": { + "em": 0.0012583892617449664, + "em_stderr": 0.00036305608931189794, + "f1": 0.05664848993288591, + "f1_stderr": 0.001329470291478584 + }, + "harness|gsm8k|5": { + "acc": 0.07657316148597422, + "acc_stderr": 0.007324564881451568 + }, + "harness|winogrande|5": { + "acc": 0.7379636937647988, + "acc_stderr": 0.012358944431637561 + }, + "all": { + "em": 0.0012583892617449664, + "em_stderr": 0.00036305608931189794, + "f1": 0.05664848993288591, + "f1_stderr": 0.001329470291478584, + "acc": 0.4072684276253865, + "acc_stderr": 0.009841754656544565 + } + }, + "versions": { + "harness|drop|3": 1, + "harness|gsm8k|5": 0, + "harness|winogrande|5": 0, + "all": 0 + }, + "config_tasks": { + "harness|drop": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|drop|3": { + "hashes": { + "hash_examples": "1d27416e8324e9a3", + "hash_full_prompts": "a5513ff9a741b385", + "hash_input_tokens": "61b608e0b5ceed76", + "hash_cont_tokens": "b5bfb875ffb2f329" + }, + "truncated": 1263, + "non-truncated": 8273, + "padded": 0, + "non-padded": 9536, + "effective_few_shots": 3.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "bda342e47b5099b2", + 
"hash_cont_tokens": "04cc92deb840ed99" + }, + "truncated": 0, + "non-truncated": 1319, + "padded": 0, + "non-padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "c0bedf98cb040854", + "hash_cont_tokens": "f08975ad6f2d5864" + }, + "truncated": 0, + "non-truncated": 2534, + "padded": 2432, + "non-padded": 102, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "9b4d8993161e637d", + "hash_full_prompts": "08215e527b7e60a5", + "hash_input_tokens": "80afe720f936f8d2", + "hash_cont_tokens": "4d8600f12b85fcfa" + }, + "total_evaluation_time_secondes": "10007.986499786377", + "truncated": 1263, + "non-truncated": 12126, + "padded": 2432, + "non-padded": 10957, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/Undi95/Amethyst-13B-Mistral/results_2023-10-04T09-03-58.552887.json b/eval-results/Undi95/Amethyst-13B-Mistral/results_2023-10-04T09-03-58.552887.json new file mode 100644 index 0000000000000000000000000000000000000000..9033f64b6a9e28266c9e4644d36866abd8a3d334 --- /dev/null +++ b/eval-results/Undi95/Amethyst-13B-Mistral/results_2023-10-04T09-03-58.552887.json @@ -0,0 +1,1367 @@ +{ + "config_general": { + "model_name": "Undi95/Amethyst-13B-Mistral", + "model_sha": "4328809e568f01e3f0a05764e3bb58e901310415", + "model_size": "24.32 GB", + "model_dtype": "torch.float16", + "lighteval_sha": "0f318ecf002208468154899217b3ba7c6ae09374", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|arc:challenge|25": { + "acc": 0.5921501706484642, + "acc_stderr": 0.014361097288449708, + "acc_norm": 0.6262798634812287, + "acc_norm_stderr": 0.01413770860175909 + }, + "harness|hellaswag|10": { + "acc": 0.6387173869747063, + "acc_stderr": 0.004793904922401889, + "acc_norm": 0.8317068313085043, + "acc_norm_stderr": 0.003733618111043529 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.32, + "acc_stderr": 0.04688261722621503, + "acc_norm": 0.32, + "acc_norm_stderr": 0.04688261722621503 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.5259259259259259, + "acc_stderr": 0.04313531696750575, + "acc_norm": 0.5259259259259259, + "acc_norm_stderr": 0.04313531696750575 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.5460526315789473, + "acc_stderr": 0.04051646342874142, + "acc_norm": 0.5460526315789473, + "acc_norm_stderr": 0.04051646342874142 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.55, + "acc_stderr": 0.049999999999999996, + "acc_norm": 0.55, + "acc_norm_stderr": 0.049999999999999996 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.569811320754717, + "acc_stderr": 0.030471445867183235, + "acc_norm": 0.569811320754717, + "acc_norm_stderr": 0.030471445867183235 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.5763888888888888, + "acc_stderr": 0.041321250197233685, + "acc_norm": 0.5763888888888888, + "acc_norm_stderr": 0.041321250197233685 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.35, + "acc_stderr": 0.047937248544110196, + "acc_norm": 0.35, + "acc_norm_stderr": 0.047937248544110196 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.43, + "acc_stderr": 0.049756985195624284, + "acc_norm": 0.43, + "acc_norm_stderr": 
0.049756985195624284 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.36, + "acc_stderr": 0.04824181513244218, + "acc_norm": 0.36, + "acc_norm_stderr": 0.04824181513244218 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.5433526011560693, + "acc_stderr": 0.03798106566014498, + "acc_norm": 0.5433526011560693, + "acc_norm_stderr": 0.03798106566014498 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.24509803921568626, + "acc_stderr": 0.04280105837364395, + "acc_norm": 0.24509803921568626, + "acc_norm_stderr": 0.04280105837364395 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.71, + "acc_stderr": 0.04560480215720685, + "acc_norm": 0.71, + "acc_norm_stderr": 0.04560480215720685 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.4765957446808511, + "acc_stderr": 0.032650194750335815, + "acc_norm": 0.4765957446808511, + "acc_norm_stderr": 0.032650194750335815 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.2631578947368421, + "acc_stderr": 0.04142439719489361, + "acc_norm": 0.2631578947368421, + "acc_norm_stderr": 0.04142439719489361 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.496551724137931, + "acc_stderr": 0.041665675771015785, + "acc_norm": 0.496551724137931, + "acc_norm_stderr": 0.041665675771015785 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.3253968253968254, + "acc_stderr": 0.024130158299762613, + "acc_norm": 0.3253968253968254, + "acc_norm_stderr": 0.024130158299762613 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.3888888888888889, + "acc_stderr": 0.04360314860077459, + "acc_norm": 0.3888888888888889, + "acc_norm_stderr": 0.04360314860077459 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.38, + "acc_stderr": 0.04878317312145632, + "acc_norm": 0.38, + "acc_norm_stderr": 0.04878317312145632 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.6548387096774193, + "acc_stderr": 0.02704574657353433, + "acc_norm": 0.6548387096774193, + "acc_norm_stderr": 0.02704574657353433 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.41379310344827586, + "acc_stderr": 0.03465304488406795, + "acc_norm": 0.41379310344827586, + "acc_norm_stderr": 0.03465304488406795 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.55, + "acc_stderr": 0.05, + "acc_norm": 0.55, + "acc_norm_stderr": 0.05 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.6727272727272727, + "acc_stderr": 0.036639749943912434, + "acc_norm": 0.6727272727272727, + "acc_norm_stderr": 0.036639749943912434 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.7070707070707071, + "acc_stderr": 0.032424979581788166, + "acc_norm": 0.7070707070707071, + "acc_norm_stderr": 0.032424979581788166 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.7979274611398963, + "acc_stderr": 0.028979089794296732, + "acc_norm": 0.7979274611398963, + "acc_norm_stderr": 0.028979089794296732 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.5256410256410257, + "acc_stderr": 0.025317649726448656, + "acc_norm": 0.5256410256410257, + "acc_norm_stderr": 0.025317649726448656 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.3037037037037037, + "acc_stderr": 0.028037929969114993, + "acc_norm": 0.3037037037037037, + "acc_norm_stderr": 0.028037929969114993 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 
0.592436974789916, + "acc_stderr": 0.031918633744784645, + "acc_norm": 0.592436974789916, + "acc_norm_stderr": 0.031918633744784645 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.33774834437086093, + "acc_stderr": 0.038615575462551684, + "acc_norm": 0.33774834437086093, + "acc_norm_stderr": 0.038615575462551684 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.7357798165137615, + "acc_stderr": 0.018904164171510193, + "acc_norm": 0.7357798165137615, + "acc_norm_stderr": 0.018904164171510193 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.4074074074074074, + "acc_stderr": 0.033509916046960415, + "acc_norm": 0.4074074074074074, + "acc_norm_stderr": 0.033509916046960415 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.75, + "acc_stderr": 0.03039153369274154, + "acc_norm": 0.75, + "acc_norm_stderr": 0.03039153369274154 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.7552742616033755, + "acc_stderr": 0.027985699387036423, + "acc_norm": 0.7552742616033755, + "acc_norm_stderr": 0.027985699387036423 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.6905829596412556, + "acc_stderr": 0.03102441174057222, + "acc_norm": 0.6905829596412556, + "acc_norm_stderr": 0.03102441174057222 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.648854961832061, + "acc_stderr": 0.04186445163013751, + "acc_norm": 0.648854961832061, + "acc_norm_stderr": 0.04186445163013751 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.7272727272727273, + "acc_stderr": 0.04065578140908706, + "acc_norm": 0.7272727272727273, + "acc_norm_stderr": 0.04065578140908706 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.7222222222222222, + "acc_stderr": 0.043300437496507416, + "acc_norm": 0.7222222222222222, + "acc_norm_stderr": 0.043300437496507416 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.6871165644171779, + "acc_stderr": 0.036429145782924055, + "acc_norm": 0.6871165644171779, + "acc_norm_stderr": 0.036429145782924055 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.3392857142857143, + "acc_stderr": 0.04493949068613539, + "acc_norm": 0.3392857142857143, + "acc_norm_stderr": 0.04493949068613539 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.6893203883495146, + "acc_stderr": 0.045821241601615506, + "acc_norm": 0.6893203883495146, + "acc_norm_stderr": 0.045821241601615506 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.7905982905982906, + "acc_stderr": 0.026655699653922744, + "acc_norm": 0.7905982905982906, + "acc_norm_stderr": 0.026655699653922744 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.54, + "acc_stderr": 0.05009082659620332, + "acc_norm": 0.54, + "acc_norm_stderr": 0.05009082659620332 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.7547892720306514, + "acc_stderr": 0.015384352284543941, + "acc_norm": 0.7547892720306514, + "acc_norm_stderr": 0.015384352284543941 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.6358381502890174, + "acc_stderr": 0.025906632631016127, + "acc_norm": 0.6358381502890174, + "acc_norm_stderr": 0.025906632631016127 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.4324022346368715, + "acc_stderr": 0.01656897123354861, + "acc_norm": 0.4324022346368715, + "acc_norm_stderr": 0.01656897123354861 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.630718954248366, + "acc_stderr": 0.02763417668960266, + "acc_norm": 0.630718954248366, + 
"acc_norm_stderr": 0.02763417668960266 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.639871382636656, + "acc_stderr": 0.02726429759980401, + "acc_norm": 0.639871382636656, + "acc_norm_stderr": 0.02726429759980401 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.6358024691358025, + "acc_stderr": 0.02677492989972233, + "acc_norm": 0.6358024691358025, + "acc_norm_stderr": 0.02677492989972233 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.4219858156028369, + "acc_stderr": 0.029462189233370593, + "acc_norm": 0.4219858156028369, + "acc_norm_stderr": 0.029462189233370593 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.42242503259452413, + "acc_stderr": 0.012615600475734921, + "acc_norm": 0.42242503259452413, + "acc_norm_stderr": 0.012615600475734921 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.5294117647058824, + "acc_stderr": 0.03032024326500413, + "acc_norm": 0.5294117647058824, + "acc_norm_stderr": 0.03032024326500413 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.5800653594771242, + "acc_stderr": 0.01996681117825648, + "acc_norm": 0.5800653594771242, + "acc_norm_stderr": 0.01996681117825648 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.6363636363636364, + "acc_stderr": 0.046075820907199756, + "acc_norm": 0.6363636363636364, + "acc_norm_stderr": 0.046075820907199756 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.6285714285714286, + "acc_stderr": 0.030932858792789855, + "acc_norm": 0.6285714285714286, + "acc_norm_stderr": 0.030932858792789855 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.7064676616915423, + "acc_stderr": 0.032200241045342054, + "acc_norm": 0.7064676616915423, + "acc_norm_stderr": 0.032200241045342054 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.84, + "acc_stderr": 0.03684529491774708, + "acc_norm": 0.84, + "acc_norm_stderr": 0.03684529491774708 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.4819277108433735, + "acc_stderr": 0.038899512528272166, + "acc_norm": 0.4819277108433735, + "acc_norm_stderr": 0.038899512528272166 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.7953216374269005, + "acc_stderr": 0.03094445977853321, + "acc_norm": 0.7953216374269005, + "acc_norm_stderr": 0.03094445977853321 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.3684210526315789, + "mc1_stderr": 0.016886551261046042, + "mc2": 0.5242719773292807, + "mc2_stderr": 0.015543122220738859 + }, + "all": { + "acc": 0.5610110706771517, + "acc_stderr": 0.0343909052667031, + "acc_norm": 0.5648605475782799, + "acc_norm_stderr": 0.03436914805487177, + "mc1": 0.3684210526315789, + "mc1_stderr": 0.016886551261046042, + "mc2": 0.5242719773292807, + "mc2_stderr": 0.015543122220738859 + } + }, + "versions": { + "harness|arc:challenge|25": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + 
"harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "all": 0 + }, + "config_tasks": { + "harness|arc:challenge": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + 
"harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task" + }, + "summary_tasks": { + "harness|arc:challenge|25": { + "hashes": { + "hash_examples": "17b0cae357c0259e", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "3722289b79076c44", + "hash_cont_tokens": "e8abf848493b50f7" + }, + "truncated": 0, + "non-truncated": 4687, + "padded": 4687, + "non-padded": 0, + "effective_few_shots": 25.0, + "num_truncated_few_shots": 0 + }, + "harness|hellaswag|10": { + "hashes": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "ececd684171f1ef2", + "hash_cont_tokens": "9fe0a5c42e1532db" + }, + "truncated": 0, + "non-truncated": 40168, + "padded": 40113, + "non-padded": 55, + "effective_few_shots": 10.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hashes": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "c54ff61ad0273dd7", + "hash_cont_tokens": "50421e30bef398f9" + 
}, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-anatomy|5": { + "hashes": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "be31a1e22aef5f90", + "hash_cont_tokens": "f11971a765cb609f" + }, + "truncated": 0, + "non-truncated": 540, + "padded": 540, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-astronomy|5": { + "hashes": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "277a7b1fad566940", + "hash_cont_tokens": "440a970fadecdc7b" + }, + "truncated": 0, + "non-truncated": 608, + "padded": 608, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-business_ethics|5": { + "hashes": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "ba552605bc116de5", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hashes": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "428c7563d0b98ab9", + "hash_cont_tokens": "7ecd60c25b9bfe5b" + }, + "truncated": 0, + "non-truncated": 1060, + "padded": 1060, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_biology|5": { + "hashes": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "da036601573942e2", + "hash_cont_tokens": "875cde3af7a0ee14" + }, + "truncated": 0, + "non-truncated": 576, + "padded": 576, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_chemistry|5": { + "hashes": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "94e0196d6aded13d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_computer_science|5": { + "hashes": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "6e4d0f4a8d36690b", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_mathematics|5": { + "hashes": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "614054d17109a25d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_medicine|5": { + "hashes": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "081bb2b524defd1c", + "hash_cont_tokens": "702fb6d82ff0d6ac" + }, + "truncated": 0, + "non-truncated": 692, + "padded": 692, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + 
"harness|hendrycksTest-college_physics|5": { + "hashes": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "5421d9a1af86cbd4", + "hash_cont_tokens": "f7b8097afc16a47c" + }, + "truncated": 0, + "non-truncated": 408, + "padded": 408, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-computer_security|5": { + "hashes": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "5e6b70ecb333cf18", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hashes": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "c2ef11a87264ceed", + "hash_cont_tokens": "aa0e8bc655f2f641" + }, + "truncated": 0, + "non-truncated": 940, + "padded": 940, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-econometrics|5": { + "hashes": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "ecaccd912a4c3978", + "hash_cont_tokens": "b1cc6e7e9fcd3827" + }, + "truncated": 0, + "non-truncated": 456, + "padded": 456, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hashes": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "1590c84291399be8", + "hash_cont_tokens": "2425a3f084a591ef" + }, + "truncated": 0, + "non-truncated": 580, + "padded": 580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hashes": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "3269597f715b0da1", + "hash_cont_tokens": "bd87bf0c060fd925" + }, + "truncated": 0, + "non-truncated": 1512, + "padded": 1512, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-formal_logic|5": { + "hashes": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "a2800d20f3ab8d7c", + "hash_cont_tokens": "eb8932890e0605db" + }, + "truncated": 0, + "non-truncated": 504, + "padded": 504, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-global_facts|5": { + "hashes": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "94ed44b3772505ad", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_biology|5": { + "hashes": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "24423acb928db768", + "hash_cont_tokens": "1ddcb86d28cde266" + }, + "truncated": 0, + "non-truncated": 1240, + "padded": 1240, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hashes": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + 
"hash_input_tokens": "831ff35c474e5cef", + "hash_cont_tokens": "176c8dcff38c5f8f" + }, + "truncated": 0, + "non-truncated": 812, + "padded": 812, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hashes": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "a20a96b44dcc5b30", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hashes": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "5002f4ac8b1562ca", + "hash_cont_tokens": "674fc454bdc5ac93" + }, + "truncated": 0, + "non-truncated": 660, + "padded": 656, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_geography|5": { + "hashes": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "7c5547c7da5bc793", + "hash_cont_tokens": "03a5012b916274ea" + }, + "truncated": 0, + "non-truncated": 792, + "padded": 792, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hashes": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "f62991cb6a496b05", + "hash_cont_tokens": "873d2aab226ba1d8" + }, + "truncated": 0, + "non-truncated": 772, + "padded": 772, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hashes": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "4cef2aff6e3d59ed", + "hash_cont_tokens": "c583432ad27fcfe0" + }, + "truncated": 0, + "non-truncated": 1560, + "padded": 1560, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hashes": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "6e2577ea4082ed2b", + "hash_cont_tokens": "d7907b61bcb8c123" + }, + "truncated": 0, + "non-truncated": 1080, + "padded": 1080, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hashes": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "c5fc9aeb1079c8e4", + "hash_cont_tokens": "f47f041de50333b9" + }, + "truncated": 0, + "non-truncated": 952, + "padded": 952, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_physics|5": { + "hashes": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "555fc385cffa84ca", + "hash_cont_tokens": "0d56317b3e5eedb5" + }, + "truncated": 0, + "non-truncated": 604, + "padded": 604, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hashes": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "febd23cbf9973b7f", + "hash_cont_tokens": 
"09ba1243e7390c0f" + }, + "truncated": 0, + "non-truncated": 2180, + "padded": 2180, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hashes": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "400e55b56ee6fbd7", + "hash_cont_tokens": "9cc29889c3d3f77d" + }, + "truncated": 0, + "non-truncated": 864, + "padded": 864, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hashes": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "c639cce12a46ebad", + "hash_cont_tokens": "cdd0b3dc06d933e5" + }, + "truncated": 0, + "non-truncated": 816, + "padded": 816, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hashes": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "b9762065cce6f3a6", + "hash_cont_tokens": "e02816433ff28daf" + }, + "truncated": 0, + "non-truncated": 948, + "padded": 948, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_aging|5": { + "hashes": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "541a75f071dcf579", + "hash_cont_tokens": "142a4a8a1138a214" + }, + "truncated": 0, + "non-truncated": 892, + "padded": 892, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_sexuality|5": { + "hashes": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "04269e5c5a257dd9", + "hash_cont_tokens": "bc54813e809b796d" + }, + "truncated": 0, + "non-truncated": 524, + "padded": 524, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-international_law|5": { + "hashes": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "d93ba9d9d38e4397", + "hash_cont_tokens": "8ea8c5ff76a15bca" + }, + "truncated": 0, + "non-truncated": 484, + "padded": 484, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-jurisprudence|5": { + "hashes": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "9eeaccd2698b4f5a", + "hash_cont_tokens": "e3a8cd951b6e3469" + }, + "truncated": 0, + "non-truncated": 432, + "padded": 432, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hashes": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "b4f08f544f2b7576", + "hash_cont_tokens": "3e9e0bdc248fd88a" + }, + "truncated": 0, + "non-truncated": 652, + "padded": 648, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-machine_learning|5": { + "hashes": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "900c2a51f1174b9f", + "hash_cont_tokens": "55b12fb138c6a064" + }, + "truncated": 0, + "non-truncated": 448, + "padded": 448, + "non-padded": 0, + "effective_few_shots": 5.0, + 
"num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-management|5": { + "hashes": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "6b36efb4689c6eca", + "hash_cont_tokens": "a01d6d39a83c4597" + }, + "truncated": 0, + "non-truncated": 412, + "padded": 412, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-marketing|5": { + "hashes": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "2aaac78a0cfed47a", + "hash_cont_tokens": "6aeaed4d823c98aa" + }, + "truncated": 0, + "non-truncated": 936, + "padded": 936, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-medical_genetics|5": { + "hashes": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "886ca823b41c094a", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-miscellaneous|5": { + "hashes": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "72fd71de7675e7d0", + "hash_cont_tokens": "9b0ab02a64603081" + }, + "truncated": 0, + "non-truncated": 3132, + "padded": 3132, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_disputes|5": { + "hashes": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "f3ca0dd8e7a1eb09", + "hash_cont_tokens": "3b8bbe9108e55ce9" + }, + "truncated": 0, + "non-truncated": 1384, + "padded": 1354, + "non-padded": 30, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hashes": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "3e793631e951f23c", + "hash_cont_tokens": "3e9bfc0362e97330" + }, + "truncated": 0, + "non-truncated": 3580, + "padded": 3580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-nutrition|5": { + "hashes": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "59753c2144ea93af", + "hash_cont_tokens": "23b2dc6ee2da4cfc" + }, + "truncated": 0, + "non-truncated": 1224, + "padded": 1224, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-philosophy|5": { + "hashes": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "bd8d3dbed15a8c34", + "hash_cont_tokens": "9f6ff69d23a48783" + }, + "truncated": 0, + "non-truncated": 1244, + "padded": 1244, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-prehistory|5": { + "hashes": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "3573cd87facbb7c5", + "hash_cont_tokens": "d6458d743d875837" + }, + "truncated": 0, + "non-truncated": 1296, + "padded": 1296, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_accounting|5": { + "hashes": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + 
"hash_input_tokens": "17e721bc1a7cbb47", + "hash_cont_tokens": "922a195f53a35662" + }, + "truncated": 0, + "non-truncated": 1128, + "padded": 1128, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_law|5": { + "hashes": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "c9f7583fff66d361", + "hash_cont_tokens": "2e590029ef41fbcd" + }, + "truncated": 0, + "non-truncated": 6136, + "padded": 6136, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_medicine|5": { + "hashes": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "40a933f829116f8d", + "hash_cont_tokens": "7cfee54dbddd5a98" + }, + "truncated": 0, + "non-truncated": 1088, + "padded": 1088, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_psychology|5": { + "hashes": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "0dfb73a8eb3f692c", + "hash_cont_tokens": "a86677b2a45c20e1" + }, + "truncated": 0, + "non-truncated": 2448, + "padded": 2448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-public_relations|5": { + "hashes": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "1710c6ba4c9f3cbd", + "hash_cont_tokens": "0d756ccaae031757" + }, + "truncated": 0, + "non-truncated": 440, + "padded": 440, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-security_studies|5": { + "hashes": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "32a03f1f22a6e103", + "hash_cont_tokens": "b2229bc2cfbf594b" + }, + "truncated": 0, + "non-truncated": 980, + "padded": 980, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-sociology|5": { + "hashes": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "828999f7624cbe7e", + "hash_cont_tokens": "c3a3bdfd177eed5b" + }, + "truncated": 0, + "non-truncated": 804, + "padded": 804, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hashes": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "42054621e718dbee", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-virology|5": { + "hashes": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "6c4f0aa4dc859c04", + "hash_cont_tokens": "af8b3658088cb37f" + }, + "truncated": 0, + "non-truncated": 664, + "padded": 664, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-world_religions|5": { + "hashes": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "6c75d44e092ff24f", + "hash_cont_tokens": "060118bef6de4e0a" + }, + "truncated": 0, + "non-truncated": 684, + "padded": 684, + 
"non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|truthfulqa:mc|0": { + "hashes": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "2738d7ed7075faa7", + "hash_cont_tokens": "f5da56a132aab151" + }, + "truncated": 0, + "non-truncated": 9996, + "padded": 9996, + "non-padded": 0, + "effective_few_shots": 0.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "d84d18e9a963753d", + "hash_full_prompts": "12b540783521a8e6", + "hash_input_tokens": "5c73a7dce6ccf737", + "hash_cont_tokens": "71d56183130fecbd" + }, + "total_evaluation_time_secondes": "6371.6917378902435", + "truncated": 0, + "non-truncated": 111019, + "padded": 110926, + "non-padded": 93, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/Undi95/Amethyst-13B-Mistral/results_2023-10-27T13-44-51.984627.json b/eval-results/Undi95/Amethyst-13B-Mistral/results_2023-10-27T13-44-51.984627.json new file mode 100644 index 0000000000000000000000000000000000000000..162b19456bb0098fc6d9bdb43d980da8abc5e031 --- /dev/null +++ b/eval-results/Undi95/Amethyst-13B-Mistral/results_2023-10-27T13-44-51.984627.json @@ -0,0 +1,107 @@ +{ + "config_general": { + "model_name": "Undi95/Amethyst-13B-Mistral", + "model_sha": "4328809e568f01e3f0a05764e3bb58e901310415", + "model_size": "24.32 GB", + "model_dtype": "torch.float16", + "lighteval_sha": "0f318ecf002208468154899217b3ba7c6ae09374", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|drop|3": { + "em": 0.11891778523489933, + "em_stderr": 0.003314906435546502, + "f1": 0.18699769295301977, + "f1_stderr": 0.0034428005809407332 + }, + "harness|gsm8k|5": { + "acc": 0.10841546626231995, + "acc_stderr": 0.008563852506627492 + }, + "harness|winogrande|5": { + "acc": 0.7474348855564326, + "acc_stderr": 0.012211148449394105 + }, + "all": { + "em": 0.11891778523489933, + "em_stderr": 0.003314906435546502, + "f1": 0.18699769295301977, + "f1_stderr": 0.0034428005809407332, + "acc": 0.42792517590937623, + "acc_stderr": 0.010387500478010799 + } + }, + "versions": { + "harness|drop|3": 1, + "harness|gsm8k|5": 0, + "harness|winogrande|5": 0, + "all": 0 + }, + "config_tasks": { + "harness|drop": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|drop|3": { + "hashes": { + "hash_examples": "1d27416e8324e9a3", + "hash_full_prompts": "a5513ff9a741b385", + "hash_input_tokens": "42076f0efbb50aa6", + "hash_cont_tokens": "6ccb6aea37c71b7e" + }, + "truncated": 3, + "non-truncated": 9533, + "padded": 0, + "non-padded": 9536, + "effective_few_shots": 3.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "bda342e47b5099b2", + "hash_cont_tokens": "e930c0c0d390f771" + }, + "truncated": 0, + "non-truncated": 1319, + "padded": 0, + "non-padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "c0bedf98cb040854", + "hash_cont_tokens": "f08975ad6f2d5864" + }, + "truncated": 0, + "non-truncated": 2534, + "padded": 2432, + "non-padded": 102, + "effective_few_shots": 5.0, + 
"num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "9b4d8993161e637d", + "hash_full_prompts": "08215e527b7e60a5", + "hash_input_tokens": "a12f3e3c934bd78b", + "hash_cont_tokens": "f98db30e54cc3c23" + }, + "total_evaluation_time_secondes": "12455.955059051514", + "truncated": 3, + "non-truncated": 13386, + "padded": 2432, + "non-padded": 10957, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/Undi95/Amethyst-13B/results_2023-10-03T17-37-36.187420.json b/eval-results/Undi95/Amethyst-13B/results_2023-10-03T17-37-36.187420.json new file mode 100644 index 0000000000000000000000000000000000000000..884196f780a9eed0b0200ce258d6fdb5141687a7 --- /dev/null +++ b/eval-results/Undi95/Amethyst-13B/results_2023-10-03T17-37-36.187420.json @@ -0,0 +1,1367 @@ +{ + "config_general": { + "model_name": "Undi95/Amethyst-13B", + "model_sha": "d4a85b1006f0b9439e64f0e7400533a7b867c24d", + "model_size": "24.32 GB", + "model_dtype": "torch.float16", + "lighteval_sha": "0f318ecf002208468154899217b3ba7c6ae09374", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|arc:challenge|25": { + "acc": 0.5921501706484642, + "acc_stderr": 0.014361097288449708, + "acc_norm": 0.6262798634812287, + "acc_norm_stderr": 0.01413770860175909 + }, + "harness|hellaswag|10": { + "acc": 0.6387173869747063, + "acc_stderr": 0.004793904922401889, + "acc_norm": 0.8317068313085043, + "acc_norm_stderr": 0.003733618111043529 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.32, + "acc_stderr": 0.04688261722621503, + "acc_norm": 0.32, + "acc_norm_stderr": 0.04688261722621503 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.5259259259259259, + "acc_stderr": 0.04313531696750575, + "acc_norm": 0.5259259259259259, + "acc_norm_stderr": 0.04313531696750575 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.5460526315789473, + "acc_stderr": 0.04051646342874142, + "acc_norm": 0.5460526315789473, + "acc_norm_stderr": 0.04051646342874142 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.55, + "acc_stderr": 0.049999999999999996, + "acc_norm": 0.55, + "acc_norm_stderr": 0.049999999999999996 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.569811320754717, + "acc_stderr": 0.030471445867183235, + "acc_norm": 0.569811320754717, + "acc_norm_stderr": 0.030471445867183235 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.5763888888888888, + "acc_stderr": 0.041321250197233685, + "acc_norm": 0.5763888888888888, + "acc_norm_stderr": 0.041321250197233685 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.35, + "acc_stderr": 0.047937248544110196, + "acc_norm": 0.35, + "acc_norm_stderr": 0.047937248544110196 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.43, + "acc_stderr": 0.049756985195624284, + "acc_norm": 0.43, + "acc_norm_stderr": 0.049756985195624284 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.36, + "acc_stderr": 0.04824181513244218, + "acc_norm": 0.36, + "acc_norm_stderr": 0.04824181513244218 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.5433526011560693, + "acc_stderr": 0.03798106566014498, + "acc_norm": 0.5433526011560693, + "acc_norm_stderr": 0.03798106566014498 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.24509803921568626, + "acc_stderr": 0.04280105837364395, + "acc_norm": 0.24509803921568626, + 
"acc_norm_stderr": 0.04280105837364395 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.71, + "acc_stderr": 0.04560480215720685, + "acc_norm": 0.71, + "acc_norm_stderr": 0.04560480215720685 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.4765957446808511, + "acc_stderr": 0.032650194750335815, + "acc_norm": 0.4765957446808511, + "acc_norm_stderr": 0.032650194750335815 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.2631578947368421, + "acc_stderr": 0.04142439719489361, + "acc_norm": 0.2631578947368421, + "acc_norm_stderr": 0.04142439719489361 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.496551724137931, + "acc_stderr": 0.041665675771015785, + "acc_norm": 0.496551724137931, + "acc_norm_stderr": 0.041665675771015785 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.3253968253968254, + "acc_stderr": 0.024130158299762613, + "acc_norm": 0.3253968253968254, + "acc_norm_stderr": 0.024130158299762613 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.3888888888888889, + "acc_stderr": 0.04360314860077459, + "acc_norm": 0.3888888888888889, + "acc_norm_stderr": 0.04360314860077459 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.38, + "acc_stderr": 0.04878317312145632, + "acc_norm": 0.38, + "acc_norm_stderr": 0.04878317312145632 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.6548387096774193, + "acc_stderr": 0.02704574657353433, + "acc_norm": 0.6548387096774193, + "acc_norm_stderr": 0.02704574657353433 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.41379310344827586, + "acc_stderr": 0.03465304488406795, + "acc_norm": 0.41379310344827586, + "acc_norm_stderr": 0.03465304488406795 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.55, + "acc_stderr": 0.05, + "acc_norm": 0.55, + "acc_norm_stderr": 0.05 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.6727272727272727, + "acc_stderr": 0.036639749943912434, + "acc_norm": 0.6727272727272727, + "acc_norm_stderr": 0.036639749943912434 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.7070707070707071, + "acc_stderr": 0.032424979581788166, + "acc_norm": 0.7070707070707071, + "acc_norm_stderr": 0.032424979581788166 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.7979274611398963, + "acc_stderr": 0.028979089794296732, + "acc_norm": 0.7979274611398963, + "acc_norm_stderr": 0.028979089794296732 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.5256410256410257, + "acc_stderr": 0.025317649726448656, + "acc_norm": 0.5256410256410257, + "acc_norm_stderr": 0.025317649726448656 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.3037037037037037, + "acc_stderr": 0.028037929969114993, + "acc_norm": 0.3037037037037037, + "acc_norm_stderr": 0.028037929969114993 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.592436974789916, + "acc_stderr": 0.031918633744784645, + "acc_norm": 0.592436974789916, + "acc_norm_stderr": 0.031918633744784645 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.33774834437086093, + "acc_stderr": 0.038615575462551684, + "acc_norm": 0.33774834437086093, + "acc_norm_stderr": 0.038615575462551684 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.7357798165137615, + "acc_stderr": 0.018904164171510193, + "acc_norm": 0.7357798165137615, + "acc_norm_stderr": 0.018904164171510193 + }, + 
"harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.4074074074074074, + "acc_stderr": 0.033509916046960415, + "acc_norm": 0.4074074074074074, + "acc_norm_stderr": 0.033509916046960415 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.75, + "acc_stderr": 0.03039153369274154, + "acc_norm": 0.75, + "acc_norm_stderr": 0.03039153369274154 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.7552742616033755, + "acc_stderr": 0.027985699387036423, + "acc_norm": 0.7552742616033755, + "acc_norm_stderr": 0.027985699387036423 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.6905829596412556, + "acc_stderr": 0.03102441174057222, + "acc_norm": 0.6905829596412556, + "acc_norm_stderr": 0.03102441174057222 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.648854961832061, + "acc_stderr": 0.04186445163013751, + "acc_norm": 0.648854961832061, + "acc_norm_stderr": 0.04186445163013751 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.7272727272727273, + "acc_stderr": 0.04065578140908706, + "acc_norm": 0.7272727272727273, + "acc_norm_stderr": 0.04065578140908706 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.7222222222222222, + "acc_stderr": 0.043300437496507416, + "acc_norm": 0.7222222222222222, + "acc_norm_stderr": 0.043300437496507416 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.6871165644171779, + "acc_stderr": 0.036429145782924055, + "acc_norm": 0.6871165644171779, + "acc_norm_stderr": 0.036429145782924055 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.3392857142857143, + "acc_stderr": 0.04493949068613539, + "acc_norm": 0.3392857142857143, + "acc_norm_stderr": 0.04493949068613539 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.6893203883495146, + "acc_stderr": 0.045821241601615506, + "acc_norm": 0.6893203883495146, + "acc_norm_stderr": 0.045821241601615506 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.7905982905982906, + "acc_stderr": 0.026655699653922744, + "acc_norm": 0.7905982905982906, + "acc_norm_stderr": 0.026655699653922744 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.54, + "acc_stderr": 0.05009082659620332, + "acc_norm": 0.54, + "acc_norm_stderr": 0.05009082659620332 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.7547892720306514, + "acc_stderr": 0.015384352284543941, + "acc_norm": 0.7547892720306514, + "acc_norm_stderr": 0.015384352284543941 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.6358381502890174, + "acc_stderr": 0.025906632631016127, + "acc_norm": 0.6358381502890174, + "acc_norm_stderr": 0.025906632631016127 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.4324022346368715, + "acc_stderr": 0.01656897123354861, + "acc_norm": 0.4324022346368715, + "acc_norm_stderr": 0.01656897123354861 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.630718954248366, + "acc_stderr": 0.02763417668960266, + "acc_norm": 0.630718954248366, + "acc_norm_stderr": 0.02763417668960266 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.639871382636656, + "acc_stderr": 0.02726429759980401, + "acc_norm": 0.639871382636656, + "acc_norm_stderr": 0.02726429759980401 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.6358024691358025, + "acc_stderr": 0.02677492989972233, + "acc_norm": 0.6358024691358025, + "acc_norm_stderr": 0.02677492989972233 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.4219858156028369, + "acc_stderr": 0.029462189233370593, + 
"acc_norm": 0.4219858156028369, + "acc_norm_stderr": 0.029462189233370593 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.42242503259452413, + "acc_stderr": 0.012615600475734921, + "acc_norm": 0.42242503259452413, + "acc_norm_stderr": 0.012615600475734921 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.5294117647058824, + "acc_stderr": 0.03032024326500413, + "acc_norm": 0.5294117647058824, + "acc_norm_stderr": 0.03032024326500413 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.5800653594771242, + "acc_stderr": 0.01996681117825648, + "acc_norm": 0.5800653594771242, + "acc_norm_stderr": 0.01996681117825648 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.6363636363636364, + "acc_stderr": 0.046075820907199756, + "acc_norm": 0.6363636363636364, + "acc_norm_stderr": 0.046075820907199756 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.6285714285714286, + "acc_stderr": 0.030932858792789855, + "acc_norm": 0.6285714285714286, + "acc_norm_stderr": 0.030932858792789855 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.7064676616915423, + "acc_stderr": 0.032200241045342054, + "acc_norm": 0.7064676616915423, + "acc_norm_stderr": 0.032200241045342054 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.84, + "acc_stderr": 0.03684529491774708, + "acc_norm": 0.84, + "acc_norm_stderr": 0.03684529491774708 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.4819277108433735, + "acc_stderr": 0.038899512528272166, + "acc_norm": 0.4819277108433735, + "acc_norm_stderr": 0.038899512528272166 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.7953216374269005, + "acc_stderr": 0.03094445977853321, + "acc_norm": 0.7953216374269005, + "acc_norm_stderr": 0.03094445977853321 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.3684210526315789, + "mc1_stderr": 0.016886551261046042, + "mc2": 0.5242719773292807, + "mc2_stderr": 0.015543122220738859 + }, + "all": { + "acc": 0.5610110706771517, + "acc_stderr": 0.0343909052667031, + "acc_norm": 0.5648605475782799, + "acc_norm_stderr": 0.03436914805487177, + "mc1": 0.3684210526315789, + "mc1_stderr": 0.016886551261046042, + "mc2": 0.5242719773292807, + "mc2_stderr": 0.015543122220738859 + } + }, + "versions": { + "harness|arc:challenge|25": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + 
"harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "all": 0 + }, + "config_tasks": { + "harness|arc:challenge": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM 
Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task" + }, + "summary_tasks": { + "harness|arc:challenge|25": { + "hashes": { + "hash_examples": "17b0cae357c0259e", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "3722289b79076c44", + "hash_cont_tokens": "e8abf848493b50f7" + }, + "truncated": 0, + "non-truncated": 4687, + "padded": 4687, + "non-padded": 0, + "effective_few_shots": 25.0, + "num_truncated_few_shots": 0 + }, + "harness|hellaswag|10": { + "hashes": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "ececd684171f1ef2", + "hash_cont_tokens": "9fe0a5c42e1532db" + }, + "truncated": 0, + "non-truncated": 40168, + "padded": 40113, + "non-padded": 55, + "effective_few_shots": 10.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hashes": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "c54ff61ad0273dd7", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-anatomy|5": { + "hashes": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "be31a1e22aef5f90", + "hash_cont_tokens": "f11971a765cb609f" + }, + "truncated": 0, + "non-truncated": 540, + "padded": 540, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-astronomy|5": { + 
"hashes": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "277a7b1fad566940", + "hash_cont_tokens": "440a970fadecdc7b" + }, + "truncated": 0, + "non-truncated": 608, + "padded": 608, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-business_ethics|5": { + "hashes": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "ba552605bc116de5", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hashes": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "428c7563d0b98ab9", + "hash_cont_tokens": "7ecd60c25b9bfe5b" + }, + "truncated": 0, + "non-truncated": 1060, + "padded": 1060, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_biology|5": { + "hashes": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "da036601573942e2", + "hash_cont_tokens": "875cde3af7a0ee14" + }, + "truncated": 0, + "non-truncated": 576, + "padded": 576, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_chemistry|5": { + "hashes": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "94e0196d6aded13d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_computer_science|5": { + "hashes": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "6e4d0f4a8d36690b", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_mathematics|5": { + "hashes": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "614054d17109a25d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_medicine|5": { + "hashes": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "081bb2b524defd1c", + "hash_cont_tokens": "702fb6d82ff0d6ac" + }, + "truncated": 0, + "non-truncated": 692, + "padded": 692, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_physics|5": { + "hashes": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "5421d9a1af86cbd4", + "hash_cont_tokens": "f7b8097afc16a47c" + }, + "truncated": 0, + "non-truncated": 408, + "padded": 408, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-computer_security|5": { + "hashes": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "5e6b70ecb333cf18", + 
"hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hashes": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "c2ef11a87264ceed", + "hash_cont_tokens": "aa0e8bc655f2f641" + }, + "truncated": 0, + "non-truncated": 940, + "padded": 940, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-econometrics|5": { + "hashes": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "ecaccd912a4c3978", + "hash_cont_tokens": "b1cc6e7e9fcd3827" + }, + "truncated": 0, + "non-truncated": 456, + "padded": 456, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hashes": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "1590c84291399be8", + "hash_cont_tokens": "2425a3f084a591ef" + }, + "truncated": 0, + "non-truncated": 580, + "padded": 580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hashes": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "3269597f715b0da1", + "hash_cont_tokens": "bd87bf0c060fd925" + }, + "truncated": 0, + "non-truncated": 1512, + "padded": 1512, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-formal_logic|5": { + "hashes": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "a2800d20f3ab8d7c", + "hash_cont_tokens": "eb8932890e0605db" + }, + "truncated": 0, + "non-truncated": 504, + "padded": 504, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-global_facts|5": { + "hashes": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "94ed44b3772505ad", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_biology|5": { + "hashes": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "24423acb928db768", + "hash_cont_tokens": "1ddcb86d28cde266" + }, + "truncated": 0, + "non-truncated": 1240, + "padded": 1240, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hashes": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "831ff35c474e5cef", + "hash_cont_tokens": "176c8dcff38c5f8f" + }, + "truncated": 0, + "non-truncated": 812, + "padded": 812, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hashes": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "a20a96b44dcc5b30", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + 
"effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hashes": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "5002f4ac8b1562ca", + "hash_cont_tokens": "674fc454bdc5ac93" + }, + "truncated": 0, + "non-truncated": 660, + "padded": 656, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_geography|5": { + "hashes": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "7c5547c7da5bc793", + "hash_cont_tokens": "03a5012b916274ea" + }, + "truncated": 0, + "non-truncated": 792, + "padded": 792, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hashes": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "f62991cb6a496b05", + "hash_cont_tokens": "873d2aab226ba1d8" + }, + "truncated": 0, + "non-truncated": 772, + "padded": 772, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hashes": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "4cef2aff6e3d59ed", + "hash_cont_tokens": "c583432ad27fcfe0" + }, + "truncated": 0, + "non-truncated": 1560, + "padded": 1560, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hashes": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "6e2577ea4082ed2b", + "hash_cont_tokens": "d7907b61bcb8c123" + }, + "truncated": 0, + "non-truncated": 1080, + "padded": 1080, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hashes": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "c5fc9aeb1079c8e4", + "hash_cont_tokens": "f47f041de50333b9" + }, + "truncated": 0, + "non-truncated": 952, + "padded": 952, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_physics|5": { + "hashes": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "555fc385cffa84ca", + "hash_cont_tokens": "0d56317b3e5eedb5" + }, + "truncated": 0, + "non-truncated": 604, + "padded": 604, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hashes": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "febd23cbf9973b7f", + "hash_cont_tokens": "09ba1243e7390c0f" + }, + "truncated": 0, + "non-truncated": 2180, + "padded": 2180, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hashes": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "400e55b56ee6fbd7", + "hash_cont_tokens": "9cc29889c3d3f77d" + }, + "truncated": 0, + "non-truncated": 864, + "padded": 864, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + 
"harness|hendrycksTest-high_school_us_history|5": { + "hashes": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "c639cce12a46ebad", + "hash_cont_tokens": "cdd0b3dc06d933e5" + }, + "truncated": 0, + "non-truncated": 816, + "padded": 816, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hashes": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "b9762065cce6f3a6", + "hash_cont_tokens": "e02816433ff28daf" + }, + "truncated": 0, + "non-truncated": 948, + "padded": 948, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_aging|5": { + "hashes": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "541a75f071dcf579", + "hash_cont_tokens": "142a4a8a1138a214" + }, + "truncated": 0, + "non-truncated": 892, + "padded": 892, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_sexuality|5": { + "hashes": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "04269e5c5a257dd9", + "hash_cont_tokens": "bc54813e809b796d" + }, + "truncated": 0, + "non-truncated": 524, + "padded": 524, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-international_law|5": { + "hashes": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "d93ba9d9d38e4397", + "hash_cont_tokens": "8ea8c5ff76a15bca" + }, + "truncated": 0, + "non-truncated": 484, + "padded": 484, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-jurisprudence|5": { + "hashes": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "9eeaccd2698b4f5a", + "hash_cont_tokens": "e3a8cd951b6e3469" + }, + "truncated": 0, + "non-truncated": 432, + "padded": 432, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hashes": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "b4f08f544f2b7576", + "hash_cont_tokens": "3e9e0bdc248fd88a" + }, + "truncated": 0, + "non-truncated": 652, + "padded": 648, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-machine_learning|5": { + "hashes": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "900c2a51f1174b9f", + "hash_cont_tokens": "55b12fb138c6a064" + }, + "truncated": 0, + "non-truncated": 448, + "padded": 448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-management|5": { + "hashes": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "6b36efb4689c6eca", + "hash_cont_tokens": "a01d6d39a83c4597" + }, + "truncated": 0, + "non-truncated": 412, + "padded": 412, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-marketing|5": { + "hashes": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": 
"2aaac78a0cfed47a", + "hash_cont_tokens": "6aeaed4d823c98aa" + }, + "truncated": 0, + "non-truncated": 936, + "padded": 936, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-medical_genetics|5": { + "hashes": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "886ca823b41c094a", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-miscellaneous|5": { + "hashes": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "72fd71de7675e7d0", + "hash_cont_tokens": "9b0ab02a64603081" + }, + "truncated": 0, + "non-truncated": 3132, + "padded": 3132, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_disputes|5": { + "hashes": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "f3ca0dd8e7a1eb09", + "hash_cont_tokens": "3b8bbe9108e55ce9" + }, + "truncated": 0, + "non-truncated": 1384, + "padded": 1354, + "non-padded": 30, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hashes": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "3e793631e951f23c", + "hash_cont_tokens": "3e9bfc0362e97330" + }, + "truncated": 0, + "non-truncated": 3580, + "padded": 3580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-nutrition|5": { + "hashes": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "59753c2144ea93af", + "hash_cont_tokens": "23b2dc6ee2da4cfc" + }, + "truncated": 0, + "non-truncated": 1224, + "padded": 1224, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-philosophy|5": { + "hashes": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "bd8d3dbed15a8c34", + "hash_cont_tokens": "9f6ff69d23a48783" + }, + "truncated": 0, + "non-truncated": 1244, + "padded": 1244, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-prehistory|5": { + "hashes": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "3573cd87facbb7c5", + "hash_cont_tokens": "d6458d743d875837" + }, + "truncated": 0, + "non-truncated": 1296, + "padded": 1296, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_accounting|5": { + "hashes": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "17e721bc1a7cbb47", + "hash_cont_tokens": "922a195f53a35662" + }, + "truncated": 0, + "non-truncated": 1128, + "padded": 1128, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_law|5": { + "hashes": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "c9f7583fff66d361", + "hash_cont_tokens": "2e590029ef41fbcd" + }, + "truncated": 0, + "non-truncated": 6136, + "padded": 6136, + "non-padded": 0, + 
"effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_medicine|5": { + "hashes": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "40a933f829116f8d", + "hash_cont_tokens": "7cfee54dbddd5a98" + }, + "truncated": 0, + "non-truncated": 1088, + "padded": 1088, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_psychology|5": { + "hashes": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "0dfb73a8eb3f692c", + "hash_cont_tokens": "a86677b2a45c20e1" + }, + "truncated": 0, + "non-truncated": 2448, + "padded": 2448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-public_relations|5": { + "hashes": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "1710c6ba4c9f3cbd", + "hash_cont_tokens": "0d756ccaae031757" + }, + "truncated": 0, + "non-truncated": 440, + "padded": 440, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-security_studies|5": { + "hashes": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "32a03f1f22a6e103", + "hash_cont_tokens": "b2229bc2cfbf594b" + }, + "truncated": 0, + "non-truncated": 980, + "padded": 980, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-sociology|5": { + "hashes": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "828999f7624cbe7e", + "hash_cont_tokens": "c3a3bdfd177eed5b" + }, + "truncated": 0, + "non-truncated": 804, + "padded": 804, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hashes": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "42054621e718dbee", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-virology|5": { + "hashes": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "6c4f0aa4dc859c04", + "hash_cont_tokens": "af8b3658088cb37f" + }, + "truncated": 0, + "non-truncated": 664, + "padded": 664, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-world_religions|5": { + "hashes": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "6c75d44e092ff24f", + "hash_cont_tokens": "060118bef6de4e0a" + }, + "truncated": 0, + "non-truncated": 684, + "padded": 684, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|truthfulqa:mc|0": { + "hashes": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "2738d7ed7075faa7", + "hash_cont_tokens": "f5da56a132aab151" + }, + "truncated": 0, + "non-truncated": 9996, + "padded": 9996, + "non-padded": 0, + "effective_few_shots": 0.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "d84d18e9a963753d", + "hash_full_prompts": 
"12b540783521a8e6", + "hash_input_tokens": "5c73a7dce6ccf737", + "hash_cont_tokens": "71d56183130fecbd" + }, + "total_evaluation_time_secondes": "6415.881764173508", + "truncated": 0, + "non-truncated": 111019, + "padded": 110926, + "non-padded": 93, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/Undi95/Amethyst-13B/results_2023-10-25T11-51-19.859333.json b/eval-results/Undi95/Amethyst-13B/results_2023-10-25T11-51-19.859333.json new file mode 100644 index 0000000000000000000000000000000000000000..3cbe961b421028549f69a23effafd4c3bcd5919e --- /dev/null +++ b/eval-results/Undi95/Amethyst-13B/results_2023-10-25T11-51-19.859333.json @@ -0,0 +1,107 @@ +{ + "config_general": { + "model_name": "Undi95/Amethyst-13B", + "model_sha": "d4a85b1006f0b9439e64f0e7400533a7b867c24d", + "model_size": "24.32 GB", + "model_dtype": "torch.float16", + "lighteval_sha": "0f318ecf002208468154899217b3ba7c6ae09374", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|drop|3": { + "em": 0.11891778523489933, + "em_stderr": 0.003314906435546502, + "f1": 0.18699769295301977, + "f1_stderr": 0.0034428005809407332 + }, + "harness|gsm8k|5": { + "acc": 0.10841546626231995, + "acc_stderr": 0.008563852506627492 + }, + "harness|winogrande|5": { + "acc": 0.7474348855564326, + "acc_stderr": 0.012211148449394105 + }, + "all": { + "em": 0.11891778523489933, + "em_stderr": 0.003314906435546502, + "f1": 0.18699769295301977, + "f1_stderr": 0.0034428005809407332, + "acc": 0.42792517590937623, + "acc_stderr": 0.010387500478010799 + } + }, + "versions": { + "harness|drop|3": 1, + "harness|gsm8k|5": 0, + "harness|winogrande|5": 0, + "all": 0 + }, + "config_tasks": { + "harness|drop": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|drop|3": { + "hashes": { + "hash_examples": "1d27416e8324e9a3", + "hash_full_prompts": "a5513ff9a741b385", + "hash_input_tokens": "42076f0efbb50aa6", + "hash_cont_tokens": "6ccb6aea37c71b7e" + }, + "truncated": 3, + "non-truncated": 9533, + "padded": 0, + "non-padded": 9536, + "effective_few_shots": 3.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "bda342e47b5099b2", + "hash_cont_tokens": "e930c0c0d390f771" + }, + "truncated": 0, + "non-truncated": 1319, + "padded": 0, + "non-padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "c0bedf98cb040854", + "hash_cont_tokens": "f08975ad6f2d5864" + }, + "truncated": 0, + "non-truncated": 2534, + "padded": 2432, + "non-padded": 102, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "9b4d8993161e637d", + "hash_full_prompts": "08215e527b7e60a5", + "hash_input_tokens": "a12f3e3c934bd78b", + "hash_cont_tokens": "f98db30e54cc3c23" + }, + "total_evaluation_time_secondes": "12372.180121898651", + "truncated": 3, + "non-truncated": 13386, + "padded": 2432, + "non-padded": 10957, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/Undi95/Clover3-17B/results_2023-12-12T06-10-19.622221.json 
b/eval-results/Undi95/Clover3-17B/results_2023-12-12T06-10-19.622221.json new file mode 100644 index 0000000000000000000000000000000000000000..e171aef17bdfeaf443a211349ab320249d7909e6 --- /dev/null +++ b/eval-results/Undi95/Clover3-17B/results_2023-12-12T06-10-19.622221.json @@ -0,0 +1,1409 @@ +{ + "config_general": { + "lighteval_sha": "0e4607eff593f6f842aeaa0e5fa6760f58b9d1e9", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "", + "start_time": 800812.024232554, + "end_time": 816288.558231362, + "total_evaluation_time_secondes": "15476.533998808009", + "model_name": "Undi95/Clover3-17B", + "model_sha": "428f6f58869426baae2c49442b207a15bc2da3cc", + "model_dtype": "torch.float16", + "model_size": "32.55 GB" + }, + "results": { + "harness|arc:challenge|25": { + "acc": 0.5656996587030717, + "acc_stderr": 0.01448470304885736, + "acc_norm": 0.5989761092150171, + "acc_norm_stderr": 0.014322255790719867 + }, + "harness|hellaswag|10": { + "acc": 0.6161123282214698, + "acc_stderr": 0.004853371646239244, + "acc_norm": 0.811790479984067, + "acc_norm_stderr": 0.003900805416736722 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.35, + "acc_stderr": 0.0479372485441102, + "acc_norm": 0.35, + "acc_norm_stderr": 0.0479372485441102 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.5777777777777777, + "acc_stderr": 0.04266763404099582, + "acc_norm": 0.5777777777777777, + "acc_norm_stderr": 0.04266763404099582 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.6513157894736842, + "acc_stderr": 0.0387813988879761, + "acc_norm": 0.6513157894736842, + "acc_norm_stderr": 0.0387813988879761 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.58, + "acc_stderr": 0.049604496374885836, + "acc_norm": 0.58, + "acc_norm_stderr": 0.049604496374885836 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.6226415094339622, + "acc_stderr": 0.029832808114796005, + "acc_norm": 0.6226415094339622, + "acc_norm_stderr": 0.029832808114796005 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.7361111111111112, + "acc_stderr": 0.03685651095897532, + "acc_norm": 0.7361111111111112, + "acc_norm_stderr": 0.03685651095897532 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.43, + "acc_stderr": 0.049756985195624284, + "acc_norm": 0.43, + "acc_norm_stderr": 0.049756985195624284 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.5, + "acc_stderr": 0.050251890762960605, + "acc_norm": 0.5, + "acc_norm_stderr": 0.050251890762960605 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.33, + "acc_stderr": 0.047258156262526045, + "acc_norm": 0.33, + "acc_norm_stderr": 0.047258156262526045 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.5953757225433526, + "acc_stderr": 0.03742461193887248, + "acc_norm": 0.5953757225433526, + "acc_norm_stderr": 0.03742461193887248 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.37254901960784315, + "acc_stderr": 0.04810840148082636, + "acc_norm": 0.37254901960784315, + "acc_norm_stderr": 0.04810840148082636 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.72, + "acc_stderr": 0.04512608598542128, + "acc_norm": 0.72, + "acc_norm_stderr": 0.04512608598542128 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.5531914893617021, + "acc_stderr": 0.0325005368436584, + "acc_norm": 0.5531914893617021, + "acc_norm_stderr": 0.0325005368436584 + }, + 
"harness|hendrycksTest-econometrics|5": { + "acc": 0.45614035087719296, + "acc_stderr": 0.046854730419077895, + "acc_norm": 0.45614035087719296, + "acc_norm_stderr": 0.046854730419077895 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.5241379310344828, + "acc_stderr": 0.0416180850350153, + "acc_norm": 0.5241379310344828, + "acc_norm_stderr": 0.0416180850350153 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.3888888888888889, + "acc_stderr": 0.02510742548113729, + "acc_norm": 0.3888888888888889, + "acc_norm_stderr": 0.02510742548113729 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.3968253968253968, + "acc_stderr": 0.043758884927270605, + "acc_norm": 0.3968253968253968, + "acc_norm_stderr": 0.043758884927270605 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.37, + "acc_stderr": 0.04852365870939099, + "acc_norm": 0.37, + "acc_norm_stderr": 0.04852365870939099 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.7387096774193549, + "acc_stderr": 0.024993053397764826, + "acc_norm": 0.7387096774193549, + "acc_norm_stderr": 0.024993053397764826 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.46798029556650245, + "acc_stderr": 0.035107665979592154, + "acc_norm": 0.46798029556650245, + "acc_norm_stderr": 0.035107665979592154 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.64, + "acc_stderr": 0.048241815132442176, + "acc_norm": 0.64, + "acc_norm_stderr": 0.048241815132442176 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.7333333333333333, + "acc_stderr": 0.03453131801885417, + "acc_norm": 0.7333333333333333, + "acc_norm_stderr": 0.03453131801885417 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.7424242424242424, + "acc_stderr": 0.031156269519646836, + "acc_norm": 0.7424242424242424, + "acc_norm_stderr": 0.031156269519646836 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.844559585492228, + "acc_stderr": 0.026148483469153327, + "acc_norm": 0.844559585492228, + "acc_norm_stderr": 0.026148483469153327 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.6410256410256411, + "acc_stderr": 0.024321738484602354, + "acc_norm": 0.6410256410256411, + "acc_norm_stderr": 0.024321738484602354 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.3296296296296296, + "acc_stderr": 0.028661201116524575, + "acc_norm": 0.3296296296296296, + "acc_norm_stderr": 0.028661201116524575 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.6848739495798319, + "acc_stderr": 0.030176808288974337, + "acc_norm": 0.6848739495798319, + "acc_norm_stderr": 0.030176808288974337 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.32450331125827814, + "acc_stderr": 0.03822746937658753, + "acc_norm": 0.32450331125827814, + "acc_norm_stderr": 0.03822746937658753 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.818348623853211, + "acc_stderr": 0.01653061740926687, + "acc_norm": 0.818348623853211, + "acc_norm_stderr": 0.01653061740926687 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.4675925925925926, + "acc_stderr": 0.034028015813589656, + "acc_norm": 0.4675925925925926, + "acc_norm_stderr": 0.034028015813589656 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.7549019607843137, + "acc_stderr": 0.030190282453501954, + "acc_norm": 0.7549019607843137, + "acc_norm_stderr": 
0.030190282453501954 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.7426160337552743, + "acc_stderr": 0.028458820991460305, + "acc_norm": 0.7426160337552743, + "acc_norm_stderr": 0.028458820991460305 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.6771300448430493, + "acc_stderr": 0.03138147637575499, + "acc_norm": 0.6771300448430493, + "acc_norm_stderr": 0.03138147637575499 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.7175572519083969, + "acc_stderr": 0.03948406125768361, + "acc_norm": 0.7175572519083969, + "acc_norm_stderr": 0.03948406125768361 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.7355371900826446, + "acc_stderr": 0.04026187527591205, + "acc_norm": 0.7355371900826446, + "acc_norm_stderr": 0.04026187527591205 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.7314814814814815, + "acc_stderr": 0.042844679680521934, + "acc_norm": 0.7314814814814815, + "acc_norm_stderr": 0.042844679680521934 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.6871165644171779, + "acc_stderr": 0.036429145782924055, + "acc_norm": 0.6871165644171779, + "acc_norm_stderr": 0.036429145782924055 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.45535714285714285, + "acc_stderr": 0.047268355537191, + "acc_norm": 0.45535714285714285, + "acc_norm_stderr": 0.047268355537191 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.7766990291262136, + "acc_stderr": 0.04123553189891431, + "acc_norm": 0.7766990291262136, + "acc_norm_stderr": 0.04123553189891431 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.8675213675213675, + "acc_stderr": 0.022209309073165612, + "acc_norm": 0.8675213675213675, + "acc_norm_stderr": 0.022209309073165612 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.71, + "acc_stderr": 0.045604802157206845, + "acc_norm": 0.71, + "acc_norm_stderr": 0.045604802157206845 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.80970625798212, + "acc_stderr": 0.014036945850381398, + "acc_norm": 0.80970625798212, + "acc_norm_stderr": 0.014036945850381398 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.630057803468208, + "acc_stderr": 0.02599247202930639, + "acc_norm": 0.630057803468208, + "acc_norm_stderr": 0.02599247202930639 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.20446927374301677, + "acc_stderr": 0.013488813404711919, + "acc_norm": 0.20446927374301677, + "acc_norm_stderr": 0.013488813404711919 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.6928104575163399, + "acc_stderr": 0.026415601914388992, + "acc_norm": 0.6928104575163399, + "acc_norm_stderr": 0.026415601914388992 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.6720257234726688, + "acc_stderr": 0.026664410886937624, + "acc_norm": 0.6720257234726688, + "acc_norm_stderr": 0.026664410886937624 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.6882716049382716, + "acc_stderr": 0.02577311116963046, + "acc_norm": 0.6882716049382716, + "acc_norm_stderr": 0.02577311116963046 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.4148936170212766, + "acc_stderr": 0.029392236584612506, + "acc_norm": 0.4148936170212766, + "acc_norm_stderr": 0.029392236584612506 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.42698826597131684, + "acc_stderr": 0.012633353557534425, + "acc_norm": 0.42698826597131684, + "acc_norm_stderr": 0.012633353557534425 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 
0.6433823529411765, + "acc_stderr": 0.02909720956841195, + "acc_norm": 0.6433823529411765, + "acc_norm_stderr": 0.02909720956841195 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.6176470588235294, + "acc_stderr": 0.01965992249362335, + "acc_norm": 0.6176470588235294, + "acc_norm_stderr": 0.01965992249362335 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.6, + "acc_stderr": 0.0469237132203465, + "acc_norm": 0.6, + "acc_norm_stderr": 0.0469237132203465 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.6448979591836734, + "acc_stderr": 0.030635655150387638, + "acc_norm": 0.6448979591836734, + "acc_norm_stderr": 0.030635655150387638 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.7960199004975125, + "acc_stderr": 0.02849317624532607, + "acc_norm": 0.7960199004975125, + "acc_norm_stderr": 0.02849317624532607 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.82, + "acc_stderr": 0.038612291966536955, + "acc_norm": 0.82, + "acc_norm_stderr": 0.038612291966536955 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.5481927710843374, + "acc_stderr": 0.03874371556587953, + "acc_norm": 0.5481927710843374, + "acc_norm_stderr": 0.03874371556587953 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.8128654970760234, + "acc_stderr": 0.02991312723236804, + "acc_norm": 0.8128654970760234, + "acc_norm_stderr": 0.02991312723236804 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.26805385556915545, + "mc1_stderr": 0.01550620472283456, + "mc2": 0.4072173688663445, + "mc2_stderr": 0.014502556892504742 + }, + "harness|winogrande|5": { + "acc": 0.7861089187056038, + "acc_stderr": 0.011524466954090259 + }, + "harness|gsm8k|5": { + "acc": 0.18802122820318423, + "acc_stderr": 0.010762621695354888 + }, + "all": { + "acc": 0.600361059236723, + "acc_stderr": 0.033074807649830854, + "acc_norm": 0.608082187606879, + "acc_norm_stderr": 0.03379934177123045, + "mc1": 0.26805385556915545, + "mc1_stderr": 0.01550620472283456, + "mc2": 0.4072173688663445, + "mc2_stderr": 0.014502556892504742 + } + }, + "versions": { + "all": 0, + "harness|arc:challenge|25": 0, + "harness|gsm8k|5": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + 
"harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "harness|winogrande|5": 0 + }, + "config_tasks": { + "harness|arc:challenge": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + 
"harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|arc:challenge|25": { + "hashes": { + "hash_examples": "17b0cae357c0259e", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "9bcd0d1d37471713", + "hash_cont_tokens": "289aa98c400841d8" + }, + "truncated": 0, + "non_truncated": 1172, + "padded": 4670, + "non_padded": 17, + "effective_few_shots": 25.0, + "num_truncated_few_shots": 0 + }, + "harness|hellaswag|10": { + "hashes": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "80b8c6d79740318e", + "hash_cont_tokens": "ac460260c3e6efc9" + }, + "truncated": 0, + "non_truncated": 10042, + "padded": 40101, + "non_padded": 67, + "effective_few_shots": 10.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hashes": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "b813d36287c6556c", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-anatomy|5": { + "hashes": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "09dc2380497f7a47", + "hash_cont_tokens": "a52a4f60d98cbe5c" + }, + "truncated": 0, + "non_truncated": 135, + "padded": 540, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-astronomy|5": { + "hashes": { + "hash_examples": 
"7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "68ca3220b0fdd1f3", + "hash_cont_tokens": "10f7d8eeba97841d" + }, + "truncated": 0, + "non_truncated": 152, + "padded": 608, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-business_ethics|5": { + "hashes": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "bd14ef1320de241e", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hashes": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "d96186ab98017c43", + "hash_cont_tokens": "edef9975ba9165b5" + }, + "truncated": 0, + "non_truncated": 265, + "padded": 1060, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_biology|5": { + "hashes": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "424136b34e95b200", + "hash_cont_tokens": "0aa103ec6602280b" + }, + "truncated": 0, + "non_truncated": 144, + "padded": 576, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_chemistry|5": { + "hashes": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "8dd8b80e336bbe54", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_computer_science|5": { + "hashes": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "145d4cef8ca2261d", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_mathematics|5": { + "hashes": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "561995d32d2b25c4", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_medicine|5": { + "hashes": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "6a258a9d4418599c", + "hash_cont_tokens": "1979021dbc698754" + }, + "truncated": 0, + "non_truncated": 173, + "padded": 692, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_physics|5": { + "hashes": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "fa5e0d5b5f97b66a", + "hash_cont_tokens": "7cf7fe2bab00acbd" + }, + "truncated": 0, + "non_truncated": 102, + "padded": 408, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-computer_security|5": { + "hashes": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "07d27397edfae492", + "hash_cont_tokens": "17b868b63507f9a3" + 
}, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hashes": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "da5e6c3c8eb17da6", + "hash_cont_tokens": "903f64eed2b0d217" + }, + "truncated": 0, + "non_truncated": 235, + "padded": 940, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-econometrics|5": { + "hashes": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "f6ba8e358bdb523e", + "hash_cont_tokens": "721ae6c5302c4bf2" + }, + "truncated": 0, + "non_truncated": 114, + "padded": 456, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hashes": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "b2459da4c5ca8590", + "hash_cont_tokens": "15a738960ed3e587" + }, + "truncated": 0, + "non_truncated": 145, + "padded": 575, + "non_padded": 5, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hashes": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "0b969d9ad706a13a", + "hash_cont_tokens": "c96470462fc71683" + }, + "truncated": 0, + "non_truncated": 378, + "padded": 1512, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-formal_logic|5": { + "hashes": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "02bc3eb5f90da86e", + "hash_cont_tokens": "0e1ce025c9d6ee7e" + }, + "truncated": 0, + "non_truncated": 126, + "padded": 504, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-global_facts|5": { + "hashes": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "3d5106918bcbeb43", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_biology|5": { + "hashes": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "7b089392db2dabbd", + "hash_cont_tokens": "e34d57f7d3c4ca16" + }, + "truncated": 0, + "non_truncated": 310, + "padded": 1240, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hashes": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "ba90b2ffed1c067d", + "hash_cont_tokens": "e8482d44df4b3740" + }, + "truncated": 0, + "non_truncated": 203, + "padded": 812, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hashes": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "60eeec309ef0717f", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + 
"num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hashes": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "5e5e8bf3808e0ead", + "hash_cont_tokens": "d63e679a49418339" + }, + "truncated": 0, + "non_truncated": 165, + "padded": 656, + "non_padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_geography|5": { + "hashes": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "4da9b741d4e7ea78", + "hash_cont_tokens": "d78483e286d06f1a" + }, + "truncated": 0, + "non_truncated": 198, + "padded": 792, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hashes": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "acb4bc872ac86ed7", + "hash_cont_tokens": "691cdff71ff5fe57" + }, + "truncated": 0, + "non_truncated": 193, + "padded": 772, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hashes": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "840fc6403eb69ab0", + "hash_cont_tokens": "d5ad4c5bdca967ad" + }, + "truncated": 0, + "non_truncated": 390, + "padded": 1560, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hashes": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "3629a7f2cd17faeb", + "hash_cont_tokens": "8f631ca5687dd0d4" + }, + "truncated": 0, + "non_truncated": 270, + "padded": 1080, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hashes": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "6846f684260e3997", + "hash_cont_tokens": "7321048a28451473" + }, + "truncated": 0, + "non_truncated": 238, + "padded": 952, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_physics|5": { + "hashes": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "85aee25d6bdad94a", + "hash_cont_tokens": "bb137581f269861c" + }, + "truncated": 0, + "non_truncated": 151, + "padded": 604, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hashes": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "290b66d6d666a35f", + "hash_cont_tokens": "b455cab2675bd863" + }, + "truncated": 0, + "non_truncated": 545, + "padded": 2180, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hashes": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "a77a7668b437bc82", + "hash_cont_tokens": "1b3196fec7e58037" + }, + "truncated": 0, + "non_truncated": 216, + "padded": 864, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + 
"harness|hendrycksTest-high_school_us_history|5": { + "hashes": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "63548c7fa9ba7a78", + "hash_cont_tokens": "a331dedc2aa01b3e" + }, + "truncated": 0, + "non_truncated": 204, + "padded": 816, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hashes": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "83c5da18bfa50812", + "hash_cont_tokens": "d0fbe030b8c8c2bf" + }, + "truncated": 0, + "non_truncated": 237, + "padded": 948, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_aging|5": { + "hashes": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "bebbd11f22006685", + "hash_cont_tokens": "1dd29c3755494850" + }, + "truncated": 0, + "non_truncated": 223, + "padded": 892, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_sexuality|5": { + "hashes": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "7b85ee9b8ee54f4f", + "hash_cont_tokens": "c85573f663c10691" + }, + "truncated": 0, + "non_truncated": 131, + "padded": 524, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-international_law|5": { + "hashes": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "7bfc55ab7065943e", + "hash_cont_tokens": "d263804ba918154f" + }, + "truncated": 0, + "non_truncated": 121, + "padded": 484, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-jurisprudence|5": { + "hashes": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "69573f1675e053c6", + "hash_cont_tokens": "581986691a84ece8" + }, + "truncated": 0, + "non_truncated": 108, + "padded": 432, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hashes": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "552324ef20094bdc", + "hash_cont_tokens": "55a858b28bbda458" + }, + "truncated": 0, + "non_truncated": 163, + "padded": 652, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-machine_learning|5": { + "hashes": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "96449357a7318905", + "hash_cont_tokens": "e99d3d3efd4ac7a3" + }, + "truncated": 0, + "non_truncated": 112, + "padded": 448, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-management|5": { + "hashes": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "3b849249168e3b88", + "hash_cont_tokens": "13d9dc56bca34726" + }, + "truncated": 0, + "non_truncated": 103, + "padded": 412, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-marketing|5": { + "hashes": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": 
"af0e186f2756b70d", + "hash_cont_tokens": "2700ea26933916a2" + }, + "truncated": 0, + "non_truncated": 234, + "padded": 936, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-medical_genetics|5": { + "hashes": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "9f6a6de16509b6d9", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-miscellaneous|5": { + "hashes": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "9194406d589f7c10", + "hash_cont_tokens": "7bf4341c79587250" + }, + "truncated": 0, + "non_truncated": 783, + "padded": 3132, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_disputes|5": { + "hashes": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "769486efc74d9f8e", + "hash_cont_tokens": "38a48e9de6976f00" + }, + "truncated": 0, + "non_truncated": 346, + "padded": 1384, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hashes": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "a90fd4dd90959dad", + "hash_cont_tokens": "761c4dc187689d89" + }, + "truncated": 0, + "non_truncated": 895, + "padded": 3580, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-nutrition|5": { + "hashes": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "1a3b843e66efd29b", + "hash_cont_tokens": "65005bd7d6f6012a" + }, + "truncated": 0, + "non_truncated": 306, + "padded": 1224, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-philosophy|5": { + "hashes": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "09820001a3d00013", + "hash_cont_tokens": "0b47934fb6314dec" + }, + "truncated": 0, + "non_truncated": 311, + "padded": 1244, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-prehistory|5": { + "hashes": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "7c4ec364ce2768c7", + "hash_cont_tokens": "3f20acd855ee0a29" + }, + "truncated": 0, + "non_truncated": 324, + "padded": 1296, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_accounting|5": { + "hashes": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "ced0534574d0ae3f", + "hash_cont_tokens": "8f122ba881355d4b" + }, + "truncated": 0, + "non_truncated": 282, + "padded": 1128, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_law|5": { + "hashes": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "bcbdbbde22ec73e3", + "hash_cont_tokens": "90d5df417c4d3fd3" + }, + "truncated": 0, + "non_truncated": 1534, + "padded": 6136, + "non_padded": 0, + "effective_few_shots": 
5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_medicine|5": { + "hashes": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "c54d753563114d45", + "hash_cont_tokens": "4a2d2988884f7f70" + }, + "truncated": 0, + "non_truncated": 272, + "padded": 1088, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_psychology|5": { + "hashes": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "b75dc55c0e32fa52", + "hash_cont_tokens": "e0a952cb8a9c81de" + }, + "truncated": 0, + "non_truncated": 612, + "padded": 2448, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-public_relations|5": { + "hashes": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "5ccdc8ec8db99622", + "hash_cont_tokens": "1fa77a8dff3922b8" + }, + "truncated": 0, + "non_truncated": 110, + "padded": 440, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-security_studies|5": { + "hashes": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "ca8497342e5b1d57", + "hash_cont_tokens": "81fc9cb3cbdd52db" + }, + "truncated": 0, + "non_truncated": 245, + "padded": 980, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-sociology|5": { + "hashes": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "069c76424fbd3dab", + "hash_cont_tokens": "2a0493252ed2cf43" + }, + "truncated": 0, + "non_truncated": 201, + "padded": 804, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hashes": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "a7e393a626169576", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-virology|5": { + "hashes": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "bf99dc973e3a650d", + "hash_cont_tokens": "5ab892d003b00c98" + }, + "truncated": 0, + "non_truncated": 166, + "padded": 664, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-world_religions|5": { + "hashes": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "1761cfaf21797065", + "hash_cont_tokens": "15a5e5dbdfbb8568" + }, + "truncated": 0, + "non_truncated": 171, + "padded": 684, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|truthfulqa:mc|0": { + "hashes": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "298b43914bbdf4ca", + "hash_cont_tokens": "5a8d4bb398b1c3c0" + }, + "truncated": 0, + "non_truncated": 817, + "padded": 9996, + "non_padded": 0, + "effective_few_shots": 0.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + 
"hash_input_tokens": "31aa3477d959f771", + "hash_cont_tokens": "618558fb93c0f288" + }, + "truncated": 0, + "non_truncated": 1267, + "padded": 2534, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "6af0ae8cfe684f50", + "hash_cont_tokens": "221c67ae699db6d3" + }, + "truncated": 0, + "non_truncated": 1319, + "padded": 0, + "non_padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "3b7fa57a057f9415", + "hash_full_prompts": "63615fc50fc9417c", + "hash_input_tokens": "9c04e828ae29cacc", + "hash_cont_tokens": "db3a34a698e89ebd" + }, + "truncated": 0, + "non_truncated": 28659, + "padded": 113460, + "non_padded": 1412, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/Undi95/CodeEngine/results_2023-09-12T11-51-31.235775.json b/eval-results/Undi95/CodeEngine/results_2023-09-12T11-51-31.235775.json new file mode 100644 index 0000000000000000000000000000000000000000..672d2ff92e5292334743c4f0566f728076b6c46c --- /dev/null +++ b/eval-results/Undi95/CodeEngine/results_2023-09-12T11-51-31.235775.json @@ -0,0 +1,1367 @@ +{ + "config_general": { + "model_name": "Undi95/CodeEngine", + "model_sha": "f57879831c39f2dcb656cb2c9e9ce5878e92bb44", + "model_size": "24.32 GB", + "model_dtype": "torch.float16", + "lighteval_sha": "ff467795ccc45b291b69333c263d5f16abd1fcd9", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|arc:challenge|25": { + "acc": 0.5494880546075085, + "acc_stderr": 0.014539646098471627, + "acc_norm": 0.5836177474402731, + "acc_norm_stderr": 0.014405618279436174 + }, + "harness|hellaswag|10": { + "acc": 0.6243776140211114, + "acc_stderr": 0.004832934529120793, + "acc_norm": 0.8227444732125074, + "acc_norm_stderr": 0.0038110434120246575 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.34, + "acc_stderr": 0.04760952285695235, + "acc_norm": 0.34, + "acc_norm_stderr": 0.04760952285695235 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.5037037037037037, + "acc_stderr": 0.04319223625811331, + "acc_norm": 0.5037037037037037, + "acc_norm_stderr": 0.04319223625811331 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.5131578947368421, + "acc_stderr": 0.04067533136309174, + "acc_norm": 0.5131578947368421, + "acc_norm_stderr": 0.04067533136309174 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.57, + "acc_stderr": 0.049756985195624284, + "acc_norm": 0.57, + "acc_norm_stderr": 0.049756985195624284 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.5849056603773585, + "acc_stderr": 0.03032594578928611, + "acc_norm": 0.5849056603773585, + "acc_norm_stderr": 0.03032594578928611 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.5625, + "acc_stderr": 0.04148415739394154, + "acc_norm": 0.5625, + "acc_norm_stderr": 0.04148415739394154 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.35, + "acc_stderr": 0.047937248544110196, + "acc_norm": 0.35, + "acc_norm_stderr": 0.047937248544110196 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.53, + "acc_stderr": 0.05016135580465919, + "acc_norm": 0.53, + "acc_norm_stderr": 0.05016135580465919 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.33, + 
"acc_stderr": 0.047258156262526045, + "acc_norm": 0.33, + "acc_norm_stderr": 0.047258156262526045 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.5028901734104047, + "acc_stderr": 0.038124005659748335, + "acc_norm": 0.5028901734104047, + "acc_norm_stderr": 0.038124005659748335 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.29411764705882354, + "acc_stderr": 0.04533838195929777, + "acc_norm": 0.29411764705882354, + "acc_norm_stderr": 0.04533838195929777 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.69, + "acc_stderr": 0.04648231987117316, + "acc_norm": 0.69, + "acc_norm_stderr": 0.04648231987117316 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.39574468085106385, + "acc_stderr": 0.031967586978353627, + "acc_norm": 0.39574468085106385, + "acc_norm_stderr": 0.031967586978353627 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.2982456140350877, + "acc_stderr": 0.04303684033537315, + "acc_norm": 0.2982456140350877, + "acc_norm_stderr": 0.04303684033537315 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.5310344827586206, + "acc_stderr": 0.04158632762097828, + "acc_norm": 0.5310344827586206, + "acc_norm_stderr": 0.04158632762097828 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.30158730158730157, + "acc_stderr": 0.023636975996101806, + "acc_norm": 0.30158730158730157, + "acc_norm_stderr": 0.023636975996101806 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.31746031746031744, + "acc_stderr": 0.04163453031302859, + "acc_norm": 0.31746031746031744, + "acc_norm_stderr": 0.04163453031302859 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.37, + "acc_stderr": 0.048523658709391, + "acc_norm": 0.37, + "acc_norm_stderr": 0.048523658709391 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.632258064516129, + "acc_stderr": 0.02743086657997347, + "acc_norm": 0.632258064516129, + "acc_norm_stderr": 0.02743086657997347 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.46798029556650245, + "acc_stderr": 0.03510766597959217, + "acc_norm": 0.46798029556650245, + "acc_norm_stderr": 0.03510766597959217 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.57, + "acc_stderr": 0.04975698519562428, + "acc_norm": 0.57, + "acc_norm_stderr": 0.04975698519562428 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.6727272727272727, + "acc_stderr": 0.03663974994391244, + "acc_norm": 0.6727272727272727, + "acc_norm_stderr": 0.03663974994391244 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.6717171717171717, + "acc_stderr": 0.03345678422756776, + "acc_norm": 0.6717171717171717, + "acc_norm_stderr": 0.03345678422756776 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.7564766839378239, + "acc_stderr": 0.030975436386845443, + "acc_norm": 0.7564766839378239, + "acc_norm_stderr": 0.030975436386845443 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.5, + "acc_stderr": 0.02535100632816969, + "acc_norm": 0.5, + "acc_norm_stderr": 0.02535100632816969 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.26666666666666666, + "acc_stderr": 0.026962424325073838, + "acc_norm": 0.26666666666666666, + "acc_norm_stderr": 0.026962424325073838 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.48739495798319327, + "acc_stderr": 0.03246816765752174, + "acc_norm": 0.48739495798319327, + 
"acc_norm_stderr": 0.03246816765752174 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.3443708609271523, + "acc_stderr": 0.038796870240733264, + "acc_norm": 0.3443708609271523, + "acc_norm_stderr": 0.038796870240733264 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.7137614678899082, + "acc_stderr": 0.019379436628919975, + "acc_norm": 0.7137614678899082, + "acc_norm_stderr": 0.019379436628919975 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.4212962962962963, + "acc_stderr": 0.03367462138896078, + "acc_norm": 0.4212962962962963, + "acc_norm_stderr": 0.03367462138896078 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.75, + "acc_stderr": 0.03039153369274154, + "acc_norm": 0.75, + "acc_norm_stderr": 0.03039153369274154 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.7130801687763713, + "acc_stderr": 0.02944377302259469, + "acc_norm": 0.7130801687763713, + "acc_norm_stderr": 0.02944377302259469 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.6322869955156951, + "acc_stderr": 0.03236198350928276, + "acc_norm": 0.6322869955156951, + "acc_norm_stderr": 0.03236198350928276 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.6183206106870229, + "acc_stderr": 0.04260735157644559, + "acc_norm": 0.6183206106870229, + "acc_norm_stderr": 0.04260735157644559 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.7355371900826446, + "acc_stderr": 0.04026187527591207, + "acc_norm": 0.7355371900826446, + "acc_norm_stderr": 0.04026187527591207 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.7037037037037037, + "acc_stderr": 0.044143436668549335, + "acc_norm": 0.7037037037037037, + "acc_norm_stderr": 0.044143436668549335 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.6809815950920245, + "acc_stderr": 0.03661997551073836, + "acc_norm": 0.6809815950920245, + "acc_norm_stderr": 0.03661997551073836 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.3482142857142857, + "acc_stderr": 0.045218299028335865, + "acc_norm": 0.3482142857142857, + "acc_norm_stderr": 0.045218299028335865 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.6601941747572816, + "acc_stderr": 0.04689765937278135, + "acc_norm": 0.6601941747572816, + "acc_norm_stderr": 0.04689765937278135 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.7393162393162394, + "acc_stderr": 0.028760348956523414, + "acc_norm": 0.7393162393162394, + "acc_norm_stderr": 0.028760348956523414 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.53, + "acc_stderr": 0.05016135580465919, + "acc_norm": 0.53, + "acc_norm_stderr": 0.05016135580465919 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.7318007662835249, + "acc_stderr": 0.015842430835269407, + "acc_norm": 0.7318007662835249, + "acc_norm_stderr": 0.015842430835269407 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.5982658959537572, + "acc_stderr": 0.026394104177643634, + "acc_norm": 0.5982658959537572, + "acc_norm_stderr": 0.026394104177643634 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.35195530726256985, + "acc_stderr": 0.01597266852368907, + "acc_norm": 0.35195530726256985, + "acc_norm_stderr": 0.01597266852368907 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.6013071895424836, + "acc_stderr": 0.028036092273891776, + "acc_norm": 0.6013071895424836, + "acc_norm_stderr": 0.028036092273891776 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 
0.6141479099678456, + "acc_stderr": 0.027648149599751464, + "acc_norm": 0.6141479099678456, + "acc_norm_stderr": 0.027648149599751464 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.6481481481481481, + "acc_stderr": 0.026571483480719964, + "acc_norm": 0.6481481481481481, + "acc_norm_stderr": 0.026571483480719964 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.38652482269503546, + "acc_stderr": 0.029049190342543458, + "acc_norm": 0.38652482269503546, + "acc_norm_stderr": 0.029049190342543458 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.38461538461538464, + "acc_stderr": 0.012425548416302943, + "acc_norm": 0.38461538461538464, + "acc_norm_stderr": 0.012425548416302943 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.49264705882352944, + "acc_stderr": 0.030369552523902173, + "acc_norm": 0.49264705882352944, + "acc_norm_stderr": 0.030369552523902173 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.5343137254901961, + "acc_stderr": 0.020180144843307296, + "acc_norm": 0.5343137254901961, + "acc_norm_stderr": 0.020180144843307296 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.6363636363636364, + "acc_stderr": 0.046075820907199756, + "acc_norm": 0.6363636363636364, + "acc_norm_stderr": 0.046075820907199756 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.6081632653061224, + "acc_stderr": 0.03125127591089165, + "acc_norm": 0.6081632653061224, + "acc_norm_stderr": 0.03125127591089165 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.7412935323383084, + "acc_stderr": 0.030965903123573026, + "acc_norm": 0.7412935323383084, + "acc_norm_stderr": 0.030965903123573026 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.78, + "acc_stderr": 0.04163331998932263, + "acc_norm": 0.78, + "acc_norm_stderr": 0.04163331998932263 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.4397590361445783, + "acc_stderr": 0.03864139923699121, + "acc_norm": 0.4397590361445783, + "acc_norm_stderr": 0.03864139923699121 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.7309941520467836, + "acc_stderr": 0.034010526201040885, + "acc_norm": 0.7309941520467836, + "acc_norm_stderr": 0.034010526201040885 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.32313341493268055, + "mc1_stderr": 0.016371836286454604, + "mc2": 0.45182279949368886, + "mc2_stderr": 0.01544933845997067 + }, + "all": { + "acc": 0.5433186047027702, + "acc_stderr": 0.03474642991909903, + "acc_norm": 0.5472592242286033, + "acc_norm_stderr": 0.03472683807272391, + "mc1": 0.32313341493268055, + "mc1_stderr": 0.016371836286454604, + "mc2": 0.45182279949368886, + "mc2_stderr": 0.01544933845997067 + } + }, + "versions": { + "harness|arc:challenge|25": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + 
"harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "all": 0 + }, + "config_tasks": { + "harness|arc:challenge": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + 
"harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task" + }, + "summary_tasks": { + "harness|arc:challenge|25": { + "hashes": { + "hash_examples": "17b0cae357c0259e", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "3722289b79076c44", + "hash_cont_tokens": "e8abf848493b50f7" + }, + "truncated": 0, + "non-truncated": 4687, + "padded": 4687, + "non-padded": 0, + "effective_few_shots": 25.0, + "num_truncated_few_shots": 0 + }, + "harness|hellaswag|10": { + "hashes": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "ececd684171f1ef2", + "hash_cont_tokens": "9fe0a5c42e1532db" + }, + "truncated": 0, + "non-truncated": 40168, + "padded": 40113, + "non-padded": 55, + "effective_few_shots": 10.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hashes": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "c54ff61ad0273dd7", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + 
"num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-anatomy|5": { + "hashes": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "be31a1e22aef5f90", + "hash_cont_tokens": "f11971a765cb609f" + }, + "truncated": 0, + "non-truncated": 540, + "padded": 540, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-astronomy|5": { + "hashes": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "277a7b1fad566940", + "hash_cont_tokens": "440a970fadecdc7b" + }, + "truncated": 0, + "non-truncated": 608, + "padded": 608, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-business_ethics|5": { + "hashes": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "ba552605bc116de5", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hashes": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "428c7563d0b98ab9", + "hash_cont_tokens": "7ecd60c25b9bfe5b" + }, + "truncated": 0, + "non-truncated": 1060, + "padded": 1060, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_biology|5": { + "hashes": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "da036601573942e2", + "hash_cont_tokens": "875cde3af7a0ee14" + }, + "truncated": 0, + "non-truncated": 576, + "padded": 576, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_chemistry|5": { + "hashes": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "94e0196d6aded13d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_computer_science|5": { + "hashes": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "6e4d0f4a8d36690b", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_mathematics|5": { + "hashes": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "614054d17109a25d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_medicine|5": { + "hashes": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "081bb2b524defd1c", + "hash_cont_tokens": "702fb6d82ff0d6ac" + }, + "truncated": 0, + "non-truncated": 692, + "padded": 692, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_physics|5": { + "hashes": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": 
"833a0d7b55aed500", + "hash_input_tokens": "5421d9a1af86cbd4", + "hash_cont_tokens": "f7b8097afc16a47c" + }, + "truncated": 0, + "non-truncated": 408, + "padded": 408, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-computer_security|5": { + "hashes": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "5e6b70ecb333cf18", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hashes": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "c2ef11a87264ceed", + "hash_cont_tokens": "aa0e8bc655f2f641" + }, + "truncated": 0, + "non-truncated": 940, + "padded": 940, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-econometrics|5": { + "hashes": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "ecaccd912a4c3978", + "hash_cont_tokens": "b1cc6e7e9fcd3827" + }, + "truncated": 0, + "non-truncated": 456, + "padded": 456, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hashes": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "1590c84291399be8", + "hash_cont_tokens": "2425a3f084a591ef" + }, + "truncated": 0, + "non-truncated": 580, + "padded": 580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hashes": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "3269597f715b0da1", + "hash_cont_tokens": "bd87bf0c060fd925" + }, + "truncated": 0, + "non-truncated": 1512, + "padded": 1512, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-formal_logic|5": { + "hashes": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "a2800d20f3ab8d7c", + "hash_cont_tokens": "eb8932890e0605db" + }, + "truncated": 0, + "non-truncated": 504, + "padded": 504, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-global_facts|5": { + "hashes": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "94ed44b3772505ad", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_biology|5": { + "hashes": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "24423acb928db768", + "hash_cont_tokens": "1ddcb86d28cde266" + }, + "truncated": 0, + "non-truncated": 1240, + "padded": 1240, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hashes": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "831ff35c474e5cef", + "hash_cont_tokens": "176c8dcff38c5f8f" + }, + "truncated": 0, + "non-truncated": 
812, + "padded": 812, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hashes": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "a20a96b44dcc5b30", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hashes": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "5002f4ac8b1562ca", + "hash_cont_tokens": "674fc454bdc5ac93" + }, + "truncated": 0, + "non-truncated": 660, + "padded": 656, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_geography|5": { + "hashes": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "7c5547c7da5bc793", + "hash_cont_tokens": "03a5012b916274ea" + }, + "truncated": 0, + "non-truncated": 792, + "padded": 792, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hashes": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "f62991cb6a496b05", + "hash_cont_tokens": "873d2aab226ba1d8" + }, + "truncated": 0, + "non-truncated": 772, + "padded": 772, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hashes": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "4cef2aff6e3d59ed", + "hash_cont_tokens": "c583432ad27fcfe0" + }, + "truncated": 0, + "non-truncated": 1560, + "padded": 1560, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hashes": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "6e2577ea4082ed2b", + "hash_cont_tokens": "d7907b61bcb8c123" + }, + "truncated": 0, + "non-truncated": 1080, + "padded": 1080, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hashes": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "c5fc9aeb1079c8e4", + "hash_cont_tokens": "f47f041de50333b9" + }, + "truncated": 0, + "non-truncated": 952, + "padded": 952, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_physics|5": { + "hashes": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "555fc385cffa84ca", + "hash_cont_tokens": "0d56317b3e5eedb5" + }, + "truncated": 0, + "non-truncated": 604, + "padded": 604, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hashes": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "febd23cbf9973b7f", + "hash_cont_tokens": "09ba1243e7390c0f" + }, + "truncated": 0, + "non-truncated": 2180, + "padded": 2180, + "non-padded": 0, + "effective_few_shots": 5.0, 
+ "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hashes": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "400e55b56ee6fbd7", + "hash_cont_tokens": "9cc29889c3d3f77d" + }, + "truncated": 0, + "non-truncated": 864, + "padded": 864, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hashes": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "c639cce12a46ebad", + "hash_cont_tokens": "cdd0b3dc06d933e5" + }, + "truncated": 0, + "non-truncated": 816, + "padded": 816, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hashes": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "b9762065cce6f3a6", + "hash_cont_tokens": "e02816433ff28daf" + }, + "truncated": 0, + "non-truncated": 948, + "padded": 948, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_aging|5": { + "hashes": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "541a75f071dcf579", + "hash_cont_tokens": "142a4a8a1138a214" + }, + "truncated": 0, + "non-truncated": 892, + "padded": 892, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_sexuality|5": { + "hashes": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "04269e5c5a257dd9", + "hash_cont_tokens": "bc54813e809b796d" + }, + "truncated": 0, + "non-truncated": 524, + "padded": 524, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-international_law|5": { + "hashes": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "d93ba9d9d38e4397", + "hash_cont_tokens": "8ea8c5ff76a15bca" + }, + "truncated": 0, + "non-truncated": 484, + "padded": 484, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-jurisprudence|5": { + "hashes": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "9eeaccd2698b4f5a", + "hash_cont_tokens": "e3a8cd951b6e3469" + }, + "truncated": 0, + "non-truncated": 432, + "padded": 432, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hashes": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "b4f08f544f2b7576", + "hash_cont_tokens": "3e9e0bdc248fd88a" + }, + "truncated": 0, + "non-truncated": 652, + "padded": 648, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-machine_learning|5": { + "hashes": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "900c2a51f1174b9f", + "hash_cont_tokens": "55b12fb138c6a064" + }, + "truncated": 0, + "non-truncated": 448, + "padded": 448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-management|5": { + "hashes": { + "hash_examples": "8c8a1e07a2151dca", + 
"hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "6b36efb4689c6eca", + "hash_cont_tokens": "a01d6d39a83c4597" + }, + "truncated": 0, + "non-truncated": 412, + "padded": 412, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-marketing|5": { + "hashes": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "2aaac78a0cfed47a", + "hash_cont_tokens": "6aeaed4d823c98aa" + }, + "truncated": 0, + "non-truncated": 936, + "padded": 936, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-medical_genetics|5": { + "hashes": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "886ca823b41c094a", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-miscellaneous|5": { + "hashes": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "72fd71de7675e7d0", + "hash_cont_tokens": "9b0ab02a64603081" + }, + "truncated": 0, + "non-truncated": 3132, + "padded": 3132, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_disputes|5": { + "hashes": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "f3ca0dd8e7a1eb09", + "hash_cont_tokens": "3b8bbe9108e55ce9" + }, + "truncated": 0, + "non-truncated": 1384, + "padded": 1354, + "non-padded": 30, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hashes": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "3e793631e951f23c", + "hash_cont_tokens": "3e9bfc0362e97330" + }, + "truncated": 0, + "non-truncated": 3580, + "padded": 3580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-nutrition|5": { + "hashes": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "59753c2144ea93af", + "hash_cont_tokens": "23b2dc6ee2da4cfc" + }, + "truncated": 0, + "non-truncated": 1224, + "padded": 1224, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-philosophy|5": { + "hashes": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "bd8d3dbed15a8c34", + "hash_cont_tokens": "9f6ff69d23a48783" + }, + "truncated": 0, + "non-truncated": 1244, + "padded": 1244, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-prehistory|5": { + "hashes": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "3573cd87facbb7c5", + "hash_cont_tokens": "d6458d743d875837" + }, + "truncated": 0, + "non-truncated": 1296, + "padded": 1296, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_accounting|5": { + "hashes": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "17e721bc1a7cbb47", + "hash_cont_tokens": "922a195f53a35662" + }, + "truncated": 0, + "non-truncated": 1128, 
+ "padded": 1128, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_law|5": { + "hashes": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "c9f7583fff66d361", + "hash_cont_tokens": "2e590029ef41fbcd" + }, + "truncated": 0, + "non-truncated": 6136, + "padded": 6136, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_medicine|5": { + "hashes": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "40a933f829116f8d", + "hash_cont_tokens": "7cfee54dbddd5a98" + }, + "truncated": 0, + "non-truncated": 1088, + "padded": 1088, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_psychology|5": { + "hashes": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "0dfb73a8eb3f692c", + "hash_cont_tokens": "a86677b2a45c20e1" + }, + "truncated": 0, + "non-truncated": 2448, + "padded": 2448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-public_relations|5": { + "hashes": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "1710c6ba4c9f3cbd", + "hash_cont_tokens": "0d756ccaae031757" + }, + "truncated": 0, + "non-truncated": 440, + "padded": 440, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-security_studies|5": { + "hashes": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "32a03f1f22a6e103", + "hash_cont_tokens": "b2229bc2cfbf594b" + }, + "truncated": 0, + "non-truncated": 980, + "padded": 980, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-sociology|5": { + "hashes": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "828999f7624cbe7e", + "hash_cont_tokens": "c3a3bdfd177eed5b" + }, + "truncated": 0, + "non-truncated": 804, + "padded": 804, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hashes": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "42054621e718dbee", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-virology|5": { + "hashes": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "6c4f0aa4dc859c04", + "hash_cont_tokens": "af8b3658088cb37f" + }, + "truncated": 0, + "non-truncated": 664, + "padded": 664, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-world_religions|5": { + "hashes": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "6c75d44e092ff24f", + "hash_cont_tokens": "060118bef6de4e0a" + }, + "truncated": 0, + "non-truncated": 684, + "padded": 684, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|truthfulqa:mc|0": { + "hashes": { + 
"hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "2738d7ed7075faa7", + "hash_cont_tokens": "f5da56a132aab151" + }, + "truncated": 0, + "non-truncated": 9996, + "padded": 9996, + "non-padded": 0, + "effective_few_shots": 0.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "d84d18e9a963753d", + "hash_full_prompts": "12b540783521a8e6", + "hash_input_tokens": "5c73a7dce6ccf737", + "hash_cont_tokens": "71d56183130fecbd" + }, + "total_evaluation_time_secondes": "11790.728584051132", + "truncated": 0, + "non-truncated": 111019, + "padded": 110926, + "non-padded": 93, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/Undi95/CodeEngine/results_2023-10-25T07-16-21.496689.json b/eval-results/Undi95/CodeEngine/results_2023-10-25T07-16-21.496689.json new file mode 100644 index 0000000000000000000000000000000000000000..999ab5e5d2215b0693e86e084b53f508e28232ee --- /dev/null +++ b/eval-results/Undi95/CodeEngine/results_2023-10-25T07-16-21.496689.json @@ -0,0 +1,107 @@ +{ + "config_general": { + "model_name": "Undi95/CodeEngine", + "model_sha": "f57879831c39f2dcb656cb2c9e9ce5878e92bb44", + "model_size": "24.32 GB", + "model_dtype": "torch.float16", + "lighteval_sha": "0f318ecf002208468154899217b3ba7c6ae09374", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|drop|3": { + "em": 0.31008808724832215, + "em_stderr": 0.004736734191590966, + "f1": 0.4059154781879224, + "f1_stderr": 0.004594505528583743 + }, + "harness|gsm8k|5": { + "acc": 0.015163002274450341, + "acc_stderr": 0.0033660229497263702 + }, + "harness|winogrande|5": { + "acc": 0.7458563535911602, + "acc_stderr": 0.012236307219708269 + }, + "all": { + "em": 0.31008808724832215, + "em_stderr": 0.004736734191590966, + "f1": 0.4059154781879224, + "f1_stderr": 0.004594505528583743, + "acc": 0.38050967793280527, + "acc_stderr": 0.00780116508471732 + } + }, + "versions": { + "harness|drop|3": 1, + "harness|gsm8k|5": 0, + "harness|winogrande|5": 0, + "all": 0 + }, + "config_tasks": { + "harness|drop": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|drop|3": { + "hashes": { + "hash_examples": "1d27416e8324e9a3", + "hash_full_prompts": "a5513ff9a741b385", + "hash_input_tokens": "42076f0efbb50aa6", + "hash_cont_tokens": "ac9ce9b167d1a903" + }, + "truncated": 3, + "non-truncated": 9533, + "padded": 0, + "non-padded": 9536, + "effective_few_shots": 3.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "bda342e47b5099b2", + "hash_cont_tokens": "76d8515486420727" + }, + "truncated": 0, + "non-truncated": 1319, + "padded": 0, + "non-padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "c0bedf98cb040854", + "hash_cont_tokens": "f08975ad6f2d5864" + }, + "truncated": 0, + "non-truncated": 2534, + "padded": 2432, + "non-padded": 102, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "9b4d8993161e637d", + "hash_full_prompts": "08215e527b7e60a5", + "hash_input_tokens": 
"a12f3e3c934bd78b", + "hash_cont_tokens": "686f8ff2032f06b7" + }, + "total_evaluation_time_secondes": "6712.175133943558", + "truncated": 3, + "non-truncated": 13386, + "padded": 2432, + "non-padded": 10957, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/Undi95/CreativityEngine/results_2023-09-11T17-22-32.752077.json b/eval-results/Undi95/CreativityEngine/results_2023-09-11T17-22-32.752077.json new file mode 100644 index 0000000000000000000000000000000000000000..66eeecac43520b152ecb2c302b314964c5388d3b --- /dev/null +++ b/eval-results/Undi95/CreativityEngine/results_2023-09-11T17-22-32.752077.json @@ -0,0 +1,1367 @@ +{ + "config_general": { + "model_name": "Undi95/CreativityEngine", + "model_sha": "7870cc50b82b5cbebfa9935b6d73a9d20170299a", + "model_size": "24.32 GB", + "model_dtype": "torch.float16", + "lighteval_sha": "ff467795ccc45b291b69333c263d5f16abd1fcd9", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|arc:challenge|25": { + "acc": 0.5563139931740614, + "acc_stderr": 0.014518421825670444, + "acc_norm": 0.5930034129692833, + "acc_norm_stderr": 0.014356399418009121 + }, + "harness|hellaswag|10": { + "acc": 0.6253734315873332, + "acc_stderr": 0.004830371317841056, + "acc_norm": 0.8242381995618403, + "acc_norm_stderr": 0.0037983950550215346 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.39, + "acc_stderr": 0.04902071300001975, + "acc_norm": 0.39, + "acc_norm_stderr": 0.04902071300001975 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.43703703703703706, + "acc_stderr": 0.04284958639753399, + "acc_norm": 0.43703703703703706, + "acc_norm_stderr": 0.04284958639753399 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.5, + "acc_stderr": 0.04068942293855797, + "acc_norm": 0.5, + "acc_norm_stderr": 0.04068942293855797 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.5, + "acc_stderr": 0.050251890762960605, + "acc_norm": 0.5, + "acc_norm_stderr": 0.050251890762960605 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.5547169811320755, + "acc_stderr": 0.030588052974270655, + "acc_norm": 0.5547169811320755, + "acc_norm_stderr": 0.030588052974270655 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.5416666666666666, + "acc_stderr": 0.04166666666666665, + "acc_norm": 0.5416666666666666, + "acc_norm_stderr": 0.04166666666666665 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.32, + "acc_stderr": 0.046882617226215034, + "acc_norm": 0.32, + "acc_norm_stderr": 0.046882617226215034 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.45, + "acc_stderr": 0.04999999999999999, + "acc_norm": 0.45, + "acc_norm_stderr": 0.04999999999999999 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.32, + "acc_stderr": 0.04688261722621505, + "acc_norm": 0.32, + "acc_norm_stderr": 0.04688261722621505 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.4508670520231214, + "acc_stderr": 0.037940126746970296, + "acc_norm": 0.4508670520231214, + "acc_norm_stderr": 0.037940126746970296 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.27450980392156865, + "acc_stderr": 0.04440521906179328, + "acc_norm": 0.27450980392156865, + "acc_norm_stderr": 0.04440521906179328 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.66, + "acc_stderr": 0.04760952285695237, + "acc_norm": 0.66, + "acc_norm_stderr": 0.04760952285695237 + }, 
+ "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.425531914893617, + "acc_stderr": 0.03232146916224468, + "acc_norm": 0.425531914893617, + "acc_norm_stderr": 0.03232146916224468 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.24561403508771928, + "acc_stderr": 0.04049339297748142, + "acc_norm": 0.24561403508771928, + "acc_norm_stderr": 0.04049339297748142 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.5172413793103449, + "acc_stderr": 0.04164188720169375, + "acc_norm": 0.5172413793103449, + "acc_norm_stderr": 0.04164188720169375 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.31746031746031744, + "acc_stderr": 0.02397386199899207, + "acc_norm": 0.31746031746031744, + "acc_norm_stderr": 0.02397386199899207 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.3253968253968254, + "acc_stderr": 0.041905964388711366, + "acc_norm": 0.3253968253968254, + "acc_norm_stderr": 0.041905964388711366 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.4, + "acc_stderr": 0.04923659639173309, + "acc_norm": 0.4, + "acc_norm_stderr": 0.04923659639173309 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.5935483870967742, + "acc_stderr": 0.027941727346256308, + "acc_norm": 0.5935483870967742, + "acc_norm_stderr": 0.027941727346256308 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.4039408866995074, + "acc_stderr": 0.03452453903822039, + "acc_norm": 0.4039408866995074, + "acc_norm_stderr": 0.03452453903822039 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.51, + "acc_stderr": 0.05024183937956912, + "acc_norm": 0.51, + "acc_norm_stderr": 0.05024183937956912 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.6484848484848484, + "acc_stderr": 0.037282069986826503, + "acc_norm": 0.6484848484848484, + "acc_norm_stderr": 0.037282069986826503 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.6767676767676768, + "acc_stderr": 0.03332299921070644, + "acc_norm": 0.6767676767676768, + "acc_norm_stderr": 0.03332299921070644 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.772020725388601, + "acc_stderr": 0.030276909945178274, + "acc_norm": 0.772020725388601, + "acc_norm_stderr": 0.030276909945178274 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.517948717948718, + "acc_stderr": 0.02533466708095492, + "acc_norm": 0.517948717948718, + "acc_norm_stderr": 0.02533466708095492 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.2851851851851852, + "acc_stderr": 0.027528599210340492, + "acc_norm": 0.2851851851851852, + "acc_norm_stderr": 0.027528599210340492 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.4831932773109244, + "acc_stderr": 0.03246013680375308, + "acc_norm": 0.4831932773109244, + "acc_norm_stderr": 0.03246013680375308 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.2781456953642384, + "acc_stderr": 0.03658603262763743, + "acc_norm": 0.2781456953642384, + "acc_norm_stderr": 0.03658603262763743 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.7027522935779816, + "acc_stderr": 0.019595707224643533, + "acc_norm": 0.7027522935779816, + "acc_norm_stderr": 0.019595707224643533 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.37037037037037035, + "acc_stderr": 0.03293377139415191, + "acc_norm": 0.37037037037037035, + "acc_norm_stderr": 0.03293377139415191 + }, + 
"harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.7254901960784313, + "acc_stderr": 0.031321798030832904, + "acc_norm": 0.7254901960784313, + "acc_norm_stderr": 0.031321798030832904 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.7257383966244726, + "acc_stderr": 0.029041333510598035, + "acc_norm": 0.7257383966244726, + "acc_norm_stderr": 0.029041333510598035 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.6681614349775785, + "acc_stderr": 0.031602951437766785, + "acc_norm": 0.6681614349775785, + "acc_norm_stderr": 0.031602951437766785 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.6030534351145038, + "acc_stderr": 0.04291135671009224, + "acc_norm": 0.6030534351145038, + "acc_norm_stderr": 0.04291135671009224 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.7355371900826446, + "acc_stderr": 0.040261875275912073, + "acc_norm": 0.7355371900826446, + "acc_norm_stderr": 0.040261875275912073 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.6574074074074074, + "acc_stderr": 0.045879047413018105, + "acc_norm": 0.6574074074074074, + "acc_norm_stderr": 0.045879047413018105 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.656441717791411, + "acc_stderr": 0.037311335196738925, + "acc_norm": 0.656441717791411, + "acc_norm_stderr": 0.037311335196738925 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.4017857142857143, + "acc_stderr": 0.04653333146973646, + "acc_norm": 0.4017857142857143, + "acc_norm_stderr": 0.04653333146973646 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.6407766990291263, + "acc_stderr": 0.047504583990416946, + "acc_norm": 0.6407766990291263, + "acc_norm_stderr": 0.047504583990416946 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.8076923076923077, + "acc_stderr": 0.025819233256483717, + "acc_norm": 0.8076923076923077, + "acc_norm_stderr": 0.025819233256483717 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.6, + "acc_stderr": 0.049236596391733084, + "acc_norm": 0.6, + "acc_norm_stderr": 0.049236596391733084 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.7432950191570882, + "acc_stderr": 0.015620480263064524, + "acc_norm": 0.7432950191570882, + "acc_norm_stderr": 0.015620480263064524 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.6069364161849711, + "acc_stderr": 0.026296227915613674, + "acc_norm": 0.6069364161849711, + "acc_norm_stderr": 0.026296227915613674 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.43575418994413406, + "acc_stderr": 0.01658388195860239, + "acc_norm": 0.43575418994413406, + "acc_norm_stderr": 0.01658388195860239 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.545751633986928, + "acc_stderr": 0.028509807802626592, + "acc_norm": 0.545751633986928, + "acc_norm_stderr": 0.028509807802626592 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.617363344051447, + "acc_stderr": 0.027604689028581986, + "acc_norm": 0.617363344051447, + "acc_norm_stderr": 0.027604689028581986 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.6018518518518519, + "acc_stderr": 0.027237415094592474, + "acc_norm": 0.6018518518518519, + "acc_norm_stderr": 0.027237415094592474 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.40070921985815605, + "acc_stderr": 0.029233465745573083, + "acc_norm": 0.40070921985815605, + "acc_norm_stderr": 0.029233465745573083 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.4198174706649283, + 
"acc_stderr": 0.012604960816087375, + "acc_norm": 0.4198174706649283, + "acc_norm_stderr": 0.012604960816087375 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.5, + "acc_stderr": 0.030372836961539352, + "acc_norm": 0.5, + "acc_norm_stderr": 0.030372836961539352 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.5408496732026143, + "acc_stderr": 0.020160213617222516, + "acc_norm": 0.5408496732026143, + "acc_norm_stderr": 0.020160213617222516 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.7, + "acc_stderr": 0.04389311454644287, + "acc_norm": 0.7, + "acc_norm_stderr": 0.04389311454644287 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.5795918367346938, + "acc_stderr": 0.03160106993449601, + "acc_norm": 0.5795918367346938, + "acc_norm_stderr": 0.03160106993449601 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.6865671641791045, + "acc_stderr": 0.03280188205348643, + "acc_norm": 0.6865671641791045, + "acc_norm_stderr": 0.03280188205348643 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.84, + "acc_stderr": 0.03684529491774708, + "acc_norm": 0.84, + "acc_norm_stderr": 0.03684529491774708 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.45180722891566266, + "acc_stderr": 0.03874371556587953, + "acc_norm": 0.45180722891566266, + "acc_norm_stderr": 0.03874371556587953 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.7602339181286549, + "acc_stderr": 0.03274485211946956, + "acc_norm": 0.7602339181286549, + "acc_norm_stderr": 0.03274485211946956 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.3769889840881273, + "mc1_stderr": 0.016965517578930354, + "mc2": 0.5246393270856821, + "mc2_stderr": 0.015798380259118193 + }, + "all": { + "acc": 0.5374017113191415, + "acc_stderr": 0.03464431782025622, + "acc_norm": 0.5413941551796453, + "acc_norm_stderr": 0.03462408055465485, + "mc1": 0.3769889840881273, + "mc1_stderr": 0.016965517578930354, + "mc2": 0.5246393270856821, + "mc2_stderr": 0.015798380259118193 + } + }, + "versions": { + "harness|arc:challenge|25": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + 
"harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "all": 0 + }, + "config_tasks": { + "harness|arc:challenge": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + 
"harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task" + }, + "summary_tasks": { + "harness|arc:challenge|25": { + "hashes": { + "hash_examples": "17b0cae357c0259e", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "3722289b79076c44", + "hash_cont_tokens": "e8abf848493b50f7" + }, + "truncated": 0, + "non-truncated": 4687, + "padded": 4687, + "non-padded": 0, + "effective_few_shots": 25.0, + "num_truncated_few_shots": 0 + }, + "harness|hellaswag|10": { + "hashes": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "ececd684171f1ef2", + "hash_cont_tokens": "9fe0a5c42e1532db" + }, + "truncated": 0, + "non-truncated": 40168, + "padded": 40113, + "non-padded": 55, + "effective_few_shots": 10.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hashes": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "c54ff61ad0273dd7", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-anatomy|5": { + "hashes": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "be31a1e22aef5f90", + "hash_cont_tokens": "f11971a765cb609f" + }, + "truncated": 0, + "non-truncated": 540, + "padded": 540, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-astronomy|5": { + "hashes": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "277a7b1fad566940", + "hash_cont_tokens": "440a970fadecdc7b" + }, + "truncated": 0, + "non-truncated": 608, + 
"padded": 608, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-business_ethics|5": { + "hashes": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "ba552605bc116de5", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hashes": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "428c7563d0b98ab9", + "hash_cont_tokens": "7ecd60c25b9bfe5b" + }, + "truncated": 0, + "non-truncated": 1060, + "padded": 1060, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_biology|5": { + "hashes": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "da036601573942e2", + "hash_cont_tokens": "875cde3af7a0ee14" + }, + "truncated": 0, + "non-truncated": 576, + "padded": 576, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_chemistry|5": { + "hashes": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "94e0196d6aded13d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_computer_science|5": { + "hashes": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "6e4d0f4a8d36690b", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_mathematics|5": { + "hashes": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "614054d17109a25d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_medicine|5": { + "hashes": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "081bb2b524defd1c", + "hash_cont_tokens": "702fb6d82ff0d6ac" + }, + "truncated": 0, + "non-truncated": 692, + "padded": 692, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_physics|5": { + "hashes": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "5421d9a1af86cbd4", + "hash_cont_tokens": "f7b8097afc16a47c" + }, + "truncated": 0, + "non-truncated": 408, + "padded": 408, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-computer_security|5": { + "hashes": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "5e6b70ecb333cf18", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-conceptual_physics|5": 
{ + "hashes": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "c2ef11a87264ceed", + "hash_cont_tokens": "aa0e8bc655f2f641" + }, + "truncated": 0, + "non-truncated": 940, + "padded": 940, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-econometrics|5": { + "hashes": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "ecaccd912a4c3978", + "hash_cont_tokens": "b1cc6e7e9fcd3827" + }, + "truncated": 0, + "non-truncated": 456, + "padded": 456, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hashes": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "1590c84291399be8", + "hash_cont_tokens": "2425a3f084a591ef" + }, + "truncated": 0, + "non-truncated": 580, + "padded": 580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hashes": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "3269597f715b0da1", + "hash_cont_tokens": "bd87bf0c060fd925" + }, + "truncated": 0, + "non-truncated": 1512, + "padded": 1512, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-formal_logic|5": { + "hashes": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "a2800d20f3ab8d7c", + "hash_cont_tokens": "eb8932890e0605db" + }, + "truncated": 0, + "non-truncated": 504, + "padded": 504, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-global_facts|5": { + "hashes": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "94ed44b3772505ad", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_biology|5": { + "hashes": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "24423acb928db768", + "hash_cont_tokens": "1ddcb86d28cde266" + }, + "truncated": 0, + "non-truncated": 1240, + "padded": 1240, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hashes": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "831ff35c474e5cef", + "hash_cont_tokens": "176c8dcff38c5f8f" + }, + "truncated": 0, + "non-truncated": 812, + "padded": 812, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hashes": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "a20a96b44dcc5b30", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hashes": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": 
"5002f4ac8b1562ca", + "hash_cont_tokens": "674fc454bdc5ac93" + }, + "truncated": 0, + "non-truncated": 660, + "padded": 656, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_geography|5": { + "hashes": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "7c5547c7da5bc793", + "hash_cont_tokens": "03a5012b916274ea" + }, + "truncated": 0, + "non-truncated": 792, + "padded": 792, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hashes": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "f62991cb6a496b05", + "hash_cont_tokens": "873d2aab226ba1d8" + }, + "truncated": 0, + "non-truncated": 772, + "padded": 772, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hashes": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "4cef2aff6e3d59ed", + "hash_cont_tokens": "c583432ad27fcfe0" + }, + "truncated": 0, + "non-truncated": 1560, + "padded": 1560, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hashes": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "6e2577ea4082ed2b", + "hash_cont_tokens": "d7907b61bcb8c123" + }, + "truncated": 0, + "non-truncated": 1080, + "padded": 1080, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hashes": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "c5fc9aeb1079c8e4", + "hash_cont_tokens": "f47f041de50333b9" + }, + "truncated": 0, + "non-truncated": 952, + "padded": 952, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_physics|5": { + "hashes": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "555fc385cffa84ca", + "hash_cont_tokens": "0d56317b3e5eedb5" + }, + "truncated": 0, + "non-truncated": 604, + "padded": 604, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hashes": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "febd23cbf9973b7f", + "hash_cont_tokens": "09ba1243e7390c0f" + }, + "truncated": 0, + "non-truncated": 2180, + "padded": 2180, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hashes": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "400e55b56ee6fbd7", + "hash_cont_tokens": "9cc29889c3d3f77d" + }, + "truncated": 0, + "non-truncated": 864, + "padded": 864, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hashes": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "c639cce12a46ebad", + "hash_cont_tokens": "cdd0b3dc06d933e5" + }, + "truncated": 0, 
+ "non-truncated": 816, + "padded": 816, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hashes": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "b9762065cce6f3a6", + "hash_cont_tokens": "e02816433ff28daf" + }, + "truncated": 0, + "non-truncated": 948, + "padded": 948, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_aging|5": { + "hashes": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "541a75f071dcf579", + "hash_cont_tokens": "142a4a8a1138a214" + }, + "truncated": 0, + "non-truncated": 892, + "padded": 892, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_sexuality|5": { + "hashes": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "04269e5c5a257dd9", + "hash_cont_tokens": "bc54813e809b796d" + }, + "truncated": 0, + "non-truncated": 524, + "padded": 524, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-international_law|5": { + "hashes": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "d93ba9d9d38e4397", + "hash_cont_tokens": "8ea8c5ff76a15bca" + }, + "truncated": 0, + "non-truncated": 484, + "padded": 484, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-jurisprudence|5": { + "hashes": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "9eeaccd2698b4f5a", + "hash_cont_tokens": "e3a8cd951b6e3469" + }, + "truncated": 0, + "non-truncated": 432, + "padded": 432, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hashes": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "b4f08f544f2b7576", + "hash_cont_tokens": "3e9e0bdc248fd88a" + }, + "truncated": 0, + "non-truncated": 652, + "padded": 648, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-machine_learning|5": { + "hashes": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "900c2a51f1174b9f", + "hash_cont_tokens": "55b12fb138c6a064" + }, + "truncated": 0, + "non-truncated": 448, + "padded": 448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-management|5": { + "hashes": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "6b36efb4689c6eca", + "hash_cont_tokens": "a01d6d39a83c4597" + }, + "truncated": 0, + "non-truncated": 412, + "padded": 412, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-marketing|5": { + "hashes": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "2aaac78a0cfed47a", + "hash_cont_tokens": "6aeaed4d823c98aa" + }, + "truncated": 0, + "non-truncated": 936, + "padded": 936, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-medical_genetics|5": 
{ + "hashes": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "886ca823b41c094a", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-miscellaneous|5": { + "hashes": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "72fd71de7675e7d0", + "hash_cont_tokens": "9b0ab02a64603081" + }, + "truncated": 0, + "non-truncated": 3132, + "padded": 3132, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_disputes|5": { + "hashes": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "f3ca0dd8e7a1eb09", + "hash_cont_tokens": "3b8bbe9108e55ce9" + }, + "truncated": 0, + "non-truncated": 1384, + "padded": 1354, + "non-padded": 30, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hashes": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "3e793631e951f23c", + "hash_cont_tokens": "3e9bfc0362e97330" + }, + "truncated": 0, + "non-truncated": 3580, + "padded": 3580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-nutrition|5": { + "hashes": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "59753c2144ea93af", + "hash_cont_tokens": "23b2dc6ee2da4cfc" + }, + "truncated": 0, + "non-truncated": 1224, + "padded": 1224, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-philosophy|5": { + "hashes": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "bd8d3dbed15a8c34", + "hash_cont_tokens": "9f6ff69d23a48783" + }, + "truncated": 0, + "non-truncated": 1244, + "padded": 1244, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-prehistory|5": { + "hashes": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "3573cd87facbb7c5", + "hash_cont_tokens": "d6458d743d875837" + }, + "truncated": 0, + "non-truncated": 1296, + "padded": 1296, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_accounting|5": { + "hashes": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "17e721bc1a7cbb47", + "hash_cont_tokens": "922a195f53a35662" + }, + "truncated": 0, + "non-truncated": 1128, + "padded": 1128, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_law|5": { + "hashes": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "c9f7583fff66d361", + "hash_cont_tokens": "2e590029ef41fbcd" + }, + "truncated": 0, + "non-truncated": 6136, + "padded": 6136, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_medicine|5": { + "hashes": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "40a933f829116f8d", + 
"hash_cont_tokens": "7cfee54dbddd5a98" + }, + "truncated": 0, + "non-truncated": 1088, + "padded": 1088, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_psychology|5": { + "hashes": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "0dfb73a8eb3f692c", + "hash_cont_tokens": "a86677b2a45c20e1" + }, + "truncated": 0, + "non-truncated": 2448, + "padded": 2448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-public_relations|5": { + "hashes": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "1710c6ba4c9f3cbd", + "hash_cont_tokens": "0d756ccaae031757" + }, + "truncated": 0, + "non-truncated": 440, + "padded": 440, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-security_studies|5": { + "hashes": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "32a03f1f22a6e103", + "hash_cont_tokens": "b2229bc2cfbf594b" + }, + "truncated": 0, + "non-truncated": 980, + "padded": 980, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-sociology|5": { + "hashes": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "828999f7624cbe7e", + "hash_cont_tokens": "c3a3bdfd177eed5b" + }, + "truncated": 0, + "non-truncated": 804, + "padded": 804, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hashes": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "42054621e718dbee", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-virology|5": { + "hashes": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "6c4f0aa4dc859c04", + "hash_cont_tokens": "af8b3658088cb37f" + }, + "truncated": 0, + "non-truncated": 664, + "padded": 664, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-world_religions|5": { + "hashes": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "6c75d44e092ff24f", + "hash_cont_tokens": "060118bef6de4e0a" + }, + "truncated": 0, + "non-truncated": 684, + "padded": 684, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|truthfulqa:mc|0": { + "hashes": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "2738d7ed7075faa7", + "hash_cont_tokens": "f5da56a132aab151" + }, + "truncated": 0, + "non-truncated": 9996, + "padded": 9996, + "non-padded": 0, + "effective_few_shots": 0.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "d84d18e9a963753d", + "hash_full_prompts": "12b540783521a8e6", + "hash_input_tokens": "5c73a7dce6ccf737", + "hash_cont_tokens": "71d56183130fecbd" + }, + "total_evaluation_time_secondes": "11822.822504997253", + "truncated": 0, + "non-truncated": 111019, + "padded": 110926, + "non-padded": 93, + 
"num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/Undi95/CreativityEngine/results_2023-10-28T03-34-54.369545.json b/eval-results/Undi95/CreativityEngine/results_2023-10-28T03-34-54.369545.json new file mode 100644 index 0000000000000000000000000000000000000000..72ff4e5fb3e711a2d18a4dbfbd72c0c39273aa33 --- /dev/null +++ b/eval-results/Undi95/CreativityEngine/results_2023-10-28T03-34-54.369545.json @@ -0,0 +1,107 @@ +{ + "config_general": { + "model_name": "Undi95/CreativityEngine", + "model_sha": "7870cc50b82b5cbebfa9935b6d73a9d20170299a", + "model_size": "24.32 GB", + "model_dtype": "torch.float16", + "lighteval_sha": "0f318ecf002208468154899217b3ba7c6ae09374", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|drop|3": { + "em": 0.24706375838926176, + "em_stderr": 0.00441695804511364, + "f1": 0.32981753355704885, + "f1_stderr": 0.004357223834591547 + }, + "harness|gsm8k|5": { + "acc": 0.09552691432903715, + "acc_stderr": 0.008096605771155733 + }, + "harness|winogrande|5": { + "acc": 0.7419100236779794, + "acc_stderr": 0.01229827883397239 + }, + "all": { + "em": 0.24706375838926176, + "em_stderr": 0.00441695804511364, + "f1": 0.32981753355704885, + "f1_stderr": 0.004357223834591547, + "acc": 0.4187184690035083, + "acc_stderr": 0.010197442302564062 + } + }, + "versions": { + "harness|drop|3": 1, + "harness|gsm8k|5": 0, + "harness|winogrande|5": 0, + "all": 0 + }, + "config_tasks": { + "harness|drop": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|drop|3": { + "hashes": { + "hash_examples": "1d27416e8324e9a3", + "hash_full_prompts": "a5513ff9a741b385", + "hash_input_tokens": "42076f0efbb50aa6", + "hash_cont_tokens": "b05de76a241221c2" + }, + "truncated": 3, + "non-truncated": 9533, + "padded": 0, + "non-padded": 9536, + "effective_few_shots": 3.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "bda342e47b5099b2", + "hash_cont_tokens": "e734cc9043924646" + }, + "truncated": 0, + "non-truncated": 1319, + "padded": 0, + "non-padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "c0bedf98cb040854", + "hash_cont_tokens": "f08975ad6f2d5864" + }, + "truncated": 0, + "non-truncated": 2534, + "padded": 2432, + "non-padded": 102, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "9b4d8993161e637d", + "hash_full_prompts": "08215e527b7e60a5", + "hash_input_tokens": "a12f3e3c934bd78b", + "hash_cont_tokens": "5fc37408cafefd38" + }, + "total_evaluation_time_secondes": "11184.66688299179", + "truncated": 3, + "non-truncated": 13386, + "padded": 2432, + "non-padded": 10957, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/Undi95/Emerald-13B/results_2023-10-03T17-31-23.265550.json b/eval-results/Undi95/Emerald-13B/results_2023-10-03T17-31-23.265550.json new file mode 100644 index 0000000000000000000000000000000000000000..71f3418a2bad602c357bf2f0881b0186412244aa --- /dev/null +++ b/eval-results/Undi95/Emerald-13B/results_2023-10-03T17-31-23.265550.json @@ -0,0 +1,1367 
@@ +{ + "config_general": { + "model_name": "Undi95/Emerald-13B", + "model_sha": "f7696299463d8ec402a4e1eb001f3a447f1c5552", + "model_size": "24.32 GB", + "model_dtype": "torch.float16", + "lighteval_sha": "0f318ecf002208468154899217b3ba7c6ae09374", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|arc:challenge|25": { + "acc": 0.5853242320819113, + "acc_stderr": 0.014397070564409172, + "acc_norm": 0.6228668941979523, + "acc_norm_stderr": 0.014163366896192601 + }, + "harness|hellaswag|10": { + "acc": 0.6385182234614618, + "acc_stderr": 0.0047944784263826085, + "acc_norm": 0.836885082652858, + "acc_norm_stderr": 0.0036871539405687963 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.34, + "acc_stderr": 0.047609522856952365, + "acc_norm": 0.34, + "acc_norm_stderr": 0.047609522856952365 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.5259259259259259, + "acc_stderr": 0.04313531696750575, + "acc_norm": 0.5259259259259259, + "acc_norm_stderr": 0.04313531696750575 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.5394736842105263, + "acc_stderr": 0.04056242252249034, + "acc_norm": 0.5394736842105263, + "acc_norm_stderr": 0.04056242252249034 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.56, + "acc_stderr": 0.04988876515698589, + "acc_norm": 0.56, + "acc_norm_stderr": 0.04988876515698589 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.5660377358490566, + "acc_stderr": 0.030503292013342592, + "acc_norm": 0.5660377358490566, + "acc_norm_stderr": 0.030503292013342592 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.5902777777777778, + "acc_stderr": 0.04112490974670788, + "acc_norm": 0.5902777777777778, + "acc_norm_stderr": 0.04112490974670788 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.32, + "acc_stderr": 0.046882617226215034, + "acc_norm": 0.32, + "acc_norm_stderr": 0.046882617226215034 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.45, + "acc_stderr": 0.049999999999999996, + "acc_norm": 0.45, + "acc_norm_stderr": 0.049999999999999996 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.39, + "acc_stderr": 0.04902071300001975, + "acc_norm": 0.39, + "acc_norm_stderr": 0.04902071300001975 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.5260115606936416, + "acc_stderr": 0.038073017265045125, + "acc_norm": 0.5260115606936416, + "acc_norm_stderr": 0.038073017265045125 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.23529411764705882, + "acc_stderr": 0.04220773659171452, + "acc_norm": 0.23529411764705882, + "acc_norm_stderr": 0.04220773659171452 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.71, + "acc_stderr": 0.04560480215720685, + "acc_norm": 0.71, + "acc_norm_stderr": 0.04560480215720685 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.46808510638297873, + "acc_stderr": 0.03261936918467381, + "acc_norm": 0.46808510638297873, + "acc_norm_stderr": 0.03261936918467381 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.3333333333333333, + "acc_stderr": 0.044346007015849245, + "acc_norm": 0.3333333333333333, + "acc_norm_stderr": 0.044346007015849245 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.503448275862069, + "acc_stderr": 0.041665675771015785, + "acc_norm": 0.503448275862069, + "acc_norm_stderr": 0.041665675771015785 + }, + 
"harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.3333333333333333, + "acc_stderr": 0.0242785680243077, + "acc_norm": 0.3333333333333333, + "acc_norm_stderr": 0.0242785680243077 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.38095238095238093, + "acc_stderr": 0.04343525428949098, + "acc_norm": 0.38095238095238093, + "acc_norm_stderr": 0.04343525428949098 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.37, + "acc_stderr": 0.04852365870939099, + "acc_norm": 0.37, + "acc_norm_stderr": 0.04852365870939099 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.632258064516129, + "acc_stderr": 0.027430866579973463, + "acc_norm": 0.632258064516129, + "acc_norm_stderr": 0.027430866579973463 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.43349753694581283, + "acc_stderr": 0.03486731727419872, + "acc_norm": 0.43349753694581283, + "acc_norm_stderr": 0.03486731727419872 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.52, + "acc_stderr": 0.050211673156867795, + "acc_norm": 0.52, + "acc_norm_stderr": 0.050211673156867795 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.6727272727272727, + "acc_stderr": 0.03663974994391244, + "acc_norm": 0.6727272727272727, + "acc_norm_stderr": 0.03663974994391244 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.7070707070707071, + "acc_stderr": 0.032424979581788166, + "acc_norm": 0.7070707070707071, + "acc_norm_stderr": 0.032424979581788166 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.772020725388601, + "acc_stderr": 0.030276909945178274, + "acc_norm": 0.772020725388601, + "acc_norm_stderr": 0.030276909945178274 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.5128205128205128, + "acc_stderr": 0.02534267129380725, + "acc_norm": 0.5128205128205128, + "acc_norm_stderr": 0.02534267129380725 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.31851851851851853, + "acc_stderr": 0.02840653309060846, + "acc_norm": 0.31851851851851853, + "acc_norm_stderr": 0.02840653309060846 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.5840336134453782, + "acc_stderr": 0.032016501007396114, + "acc_norm": 0.5840336134453782, + "acc_norm_stderr": 0.032016501007396114 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.304635761589404, + "acc_stderr": 0.03757949922943342, + "acc_norm": 0.304635761589404, + "acc_norm_stderr": 0.03757949922943342 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.7376146788990826, + "acc_stderr": 0.01886188502153473, + "acc_norm": 0.7376146788990826, + "acc_norm_stderr": 0.01886188502153473 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.37962962962962965, + "acc_stderr": 0.03309682581119035, + "acc_norm": 0.37962962962962965, + "acc_norm_stderr": 0.03309682581119035 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.7745098039215687, + "acc_stderr": 0.02933116229425174, + "acc_norm": 0.7745098039215687, + "acc_norm_stderr": 0.02933116229425174 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.7721518987341772, + "acc_stderr": 0.02730348459906943, + "acc_norm": 0.7721518987341772, + "acc_norm_stderr": 0.02730348459906943 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.6816143497757847, + "acc_stderr": 0.03126580522513713, + "acc_norm": 0.6816143497757847, + "acc_norm_stderr": 0.03126580522513713 + }, + 
"harness|hendrycksTest-human_sexuality|5": { + "acc": 0.648854961832061, + "acc_stderr": 0.04186445163013751, + "acc_norm": 0.648854961832061, + "acc_norm_stderr": 0.04186445163013751 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.7520661157024794, + "acc_stderr": 0.03941897526516304, + "acc_norm": 0.7520661157024794, + "acc_norm_stderr": 0.03941897526516304 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.7129629629629629, + "acc_stderr": 0.043733130409147614, + "acc_norm": 0.7129629629629629, + "acc_norm_stderr": 0.043733130409147614 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.6871165644171779, + "acc_stderr": 0.036429145782924055, + "acc_norm": 0.6871165644171779, + "acc_norm_stderr": 0.036429145782924055 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.33035714285714285, + "acc_stderr": 0.04464285714285714, + "acc_norm": 0.33035714285714285, + "acc_norm_stderr": 0.04464285714285714 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.6796116504854369, + "acc_stderr": 0.04620284082280042, + "acc_norm": 0.6796116504854369, + "acc_norm_stderr": 0.04620284082280042 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.782051282051282, + "acc_stderr": 0.027046857630716677, + "acc_norm": 0.782051282051282, + "acc_norm_stderr": 0.027046857630716677 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.55, + "acc_stderr": 0.04999999999999999, + "acc_norm": 0.55, + "acc_norm_stderr": 0.04999999999999999 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.7547892720306514, + "acc_stderr": 0.015384352284543944, + "acc_norm": 0.7547892720306514, + "acc_norm_stderr": 0.015384352284543944 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.6358381502890174, + "acc_stderr": 0.02590663263101613, + "acc_norm": 0.6358381502890174, + "acc_norm_stderr": 0.02590663263101613 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.4122905027932961, + "acc_stderr": 0.016463200238114525, + "acc_norm": 0.4122905027932961, + "acc_norm_stderr": 0.016463200238114525 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.6241830065359477, + "acc_stderr": 0.027732834353363933, + "acc_norm": 0.6241830065359477, + "acc_norm_stderr": 0.027732834353363933 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.639871382636656, + "acc_stderr": 0.02726429759980401, + "acc_norm": 0.639871382636656, + "acc_norm_stderr": 0.02726429759980401 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.6296296296296297, + "acc_stderr": 0.02686949074481525, + "acc_norm": 0.6296296296296297, + "acc_norm_stderr": 0.02686949074481525 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.41134751773049644, + "acc_stderr": 0.029354911159940985, + "acc_norm": 0.41134751773049644, + "acc_norm_stderr": 0.029354911159940985 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.42633637548891784, + "acc_stderr": 0.01263088477159969, + "acc_norm": 0.42633637548891784, + "acc_norm_stderr": 0.01263088477159969 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.5183823529411765, + "acc_stderr": 0.030352303395351964, + "acc_norm": 0.5183823529411765, + "acc_norm_stderr": 0.030352303395351964 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.5702614379084967, + "acc_stderr": 0.02002712278492855, + "acc_norm": 0.5702614379084967, + "acc_norm_stderr": 0.02002712278492855 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.6636363636363637, + "acc_stderr": 
0.04525393596302505, + "acc_norm": 0.6636363636363637, + "acc_norm_stderr": 0.04525393596302505 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.5918367346938775, + "acc_stderr": 0.03146465712827424, + "acc_norm": 0.5918367346938775, + "acc_norm_stderr": 0.03146465712827424 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.7064676616915423, + "acc_stderr": 0.032200241045342054, + "acc_norm": 0.7064676616915423, + "acc_norm_stderr": 0.032200241045342054 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.82, + "acc_stderr": 0.038612291966536934, + "acc_norm": 0.82, + "acc_norm_stderr": 0.038612291966536934 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.463855421686747, + "acc_stderr": 0.03882310850890593, + "acc_norm": 0.463855421686747, + "acc_norm_stderr": 0.03882310850890593 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.7894736842105263, + "acc_stderr": 0.031267817146631786, + "acc_norm": 0.7894736842105263, + "acc_norm_stderr": 0.031267817146631786 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.36107711138310894, + "mc1_stderr": 0.016814312844836886, + "mc2": 0.5094365067991387, + "mc2_stderr": 0.015354293715350336 + }, + "all": { + "acc": 0.5588193039950157, + "acc_stderr": 0.03445212545677957, + "acc_norm": 0.5628177704578535, + "acc_norm_stderr": 0.0344293961660333, + "mc1": 0.36107711138310894, + "mc1_stderr": 0.016814312844836886, + "mc2": 0.5094365067991387, + "mc2_stderr": 0.015354293715350336 + } + }, + "versions": { + "harness|arc:challenge|25": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + 
"harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "all": 0 + }, + "config_tasks": { + "harness|arc:challenge": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + 
"harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task" + }, + "summary_tasks": { + "harness|arc:challenge|25": { + "hashes": { + "hash_examples": "17b0cae357c0259e", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "3722289b79076c44", + "hash_cont_tokens": "e8abf848493b50f7" + }, + "truncated": 0, + "non-truncated": 4687, + "padded": 4687, + "non-padded": 0, + "effective_few_shots": 25.0, + "num_truncated_few_shots": 0 + }, + "harness|hellaswag|10": { + "hashes": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "ececd684171f1ef2", + "hash_cont_tokens": "9fe0a5c42e1532db" + }, + "truncated": 0, + "non-truncated": 40168, + "padded": 40113, + "non-padded": 55, + "effective_few_shots": 10.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hashes": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "c54ff61ad0273dd7", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-anatomy|5": { + "hashes": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "be31a1e22aef5f90", + "hash_cont_tokens": "f11971a765cb609f" + }, + "truncated": 0, + "non-truncated": 540, + "padded": 540, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-astronomy|5": { + "hashes": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "277a7b1fad566940", + "hash_cont_tokens": "440a970fadecdc7b" + }, + "truncated": 0, + "non-truncated": 608, + "padded": 608, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-business_ethics|5": { + "hashes": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "ba552605bc116de5", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hashes": { + "hash_examples": "f3366dbe7eefffa4", + 
"hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "428c7563d0b98ab9", + "hash_cont_tokens": "7ecd60c25b9bfe5b" + }, + "truncated": 0, + "non-truncated": 1060, + "padded": 1060, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_biology|5": { + "hashes": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "da036601573942e2", + "hash_cont_tokens": "875cde3af7a0ee14" + }, + "truncated": 0, + "non-truncated": 576, + "padded": 576, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_chemistry|5": { + "hashes": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "94e0196d6aded13d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_computer_science|5": { + "hashes": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "6e4d0f4a8d36690b", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_mathematics|5": { + "hashes": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "614054d17109a25d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_medicine|5": { + "hashes": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "081bb2b524defd1c", + "hash_cont_tokens": "702fb6d82ff0d6ac" + }, + "truncated": 0, + "non-truncated": 692, + "padded": 692, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_physics|5": { + "hashes": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "5421d9a1af86cbd4", + "hash_cont_tokens": "f7b8097afc16a47c" + }, + "truncated": 0, + "non-truncated": 408, + "padded": 408, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-computer_security|5": { + "hashes": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "5e6b70ecb333cf18", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hashes": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "c2ef11a87264ceed", + "hash_cont_tokens": "aa0e8bc655f2f641" + }, + "truncated": 0, + "non-truncated": 940, + "padded": 940, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-econometrics|5": { + "hashes": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "ecaccd912a4c3978", + "hash_cont_tokens": "b1cc6e7e9fcd3827" + }, + "truncated": 0, + 
"non-truncated": 456, + "padded": 456, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hashes": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "1590c84291399be8", + "hash_cont_tokens": "2425a3f084a591ef" + }, + "truncated": 0, + "non-truncated": 580, + "padded": 580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hashes": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "3269597f715b0da1", + "hash_cont_tokens": "bd87bf0c060fd925" + }, + "truncated": 0, + "non-truncated": 1512, + "padded": 1512, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-formal_logic|5": { + "hashes": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "a2800d20f3ab8d7c", + "hash_cont_tokens": "eb8932890e0605db" + }, + "truncated": 0, + "non-truncated": 504, + "padded": 504, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-global_facts|5": { + "hashes": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "94ed44b3772505ad", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_biology|5": { + "hashes": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "24423acb928db768", + "hash_cont_tokens": "1ddcb86d28cde266" + }, + "truncated": 0, + "non-truncated": 1240, + "padded": 1240, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hashes": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "831ff35c474e5cef", + "hash_cont_tokens": "176c8dcff38c5f8f" + }, + "truncated": 0, + "non-truncated": 812, + "padded": 812, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hashes": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "a20a96b44dcc5b30", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hashes": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "5002f4ac8b1562ca", + "hash_cont_tokens": "674fc454bdc5ac93" + }, + "truncated": 0, + "non-truncated": 660, + "padded": 656, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_geography|5": { + "hashes": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "7c5547c7da5bc793", + "hash_cont_tokens": "03a5012b916274ea" + }, + "truncated": 0, + "non-truncated": 792, + "padded": 792, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 
0 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hashes": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "f62991cb6a496b05", + "hash_cont_tokens": "873d2aab226ba1d8" + }, + "truncated": 0, + "non-truncated": 772, + "padded": 772, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hashes": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "4cef2aff6e3d59ed", + "hash_cont_tokens": "c583432ad27fcfe0" + }, + "truncated": 0, + "non-truncated": 1560, + "padded": 1560, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hashes": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "6e2577ea4082ed2b", + "hash_cont_tokens": "d7907b61bcb8c123" + }, + "truncated": 0, + "non-truncated": 1080, + "padded": 1080, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hashes": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "c5fc9aeb1079c8e4", + "hash_cont_tokens": "f47f041de50333b9" + }, + "truncated": 0, + "non-truncated": 952, + "padded": 952, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_physics|5": { + "hashes": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "555fc385cffa84ca", + "hash_cont_tokens": "0d56317b3e5eedb5" + }, + "truncated": 0, + "non-truncated": 604, + "padded": 604, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hashes": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "febd23cbf9973b7f", + "hash_cont_tokens": "09ba1243e7390c0f" + }, + "truncated": 0, + "non-truncated": 2180, + "padded": 2180, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hashes": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "400e55b56ee6fbd7", + "hash_cont_tokens": "9cc29889c3d3f77d" + }, + "truncated": 0, + "non-truncated": 864, + "padded": 864, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hashes": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "c639cce12a46ebad", + "hash_cont_tokens": "cdd0b3dc06d933e5" + }, + "truncated": 0, + "non-truncated": 816, + "padded": 816, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hashes": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "b9762065cce6f3a6", + "hash_cont_tokens": "e02816433ff28daf" + }, + "truncated": 0, + "non-truncated": 948, + "padded": 948, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_aging|5": { + "hashes": { + 
"hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "541a75f071dcf579", + "hash_cont_tokens": "142a4a8a1138a214" + }, + "truncated": 0, + "non-truncated": 892, + "padded": 892, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_sexuality|5": { + "hashes": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "04269e5c5a257dd9", + "hash_cont_tokens": "bc54813e809b796d" + }, + "truncated": 0, + "non-truncated": 524, + "padded": 524, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-international_law|5": { + "hashes": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "d93ba9d9d38e4397", + "hash_cont_tokens": "8ea8c5ff76a15bca" + }, + "truncated": 0, + "non-truncated": 484, + "padded": 484, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-jurisprudence|5": { + "hashes": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "9eeaccd2698b4f5a", + "hash_cont_tokens": "e3a8cd951b6e3469" + }, + "truncated": 0, + "non-truncated": 432, + "padded": 432, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hashes": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "b4f08f544f2b7576", + "hash_cont_tokens": "3e9e0bdc248fd88a" + }, + "truncated": 0, + "non-truncated": 652, + "padded": 648, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-machine_learning|5": { + "hashes": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "900c2a51f1174b9f", + "hash_cont_tokens": "55b12fb138c6a064" + }, + "truncated": 0, + "non-truncated": 448, + "padded": 448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-management|5": { + "hashes": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "6b36efb4689c6eca", + "hash_cont_tokens": "a01d6d39a83c4597" + }, + "truncated": 0, + "non-truncated": 412, + "padded": 412, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-marketing|5": { + "hashes": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "2aaac78a0cfed47a", + "hash_cont_tokens": "6aeaed4d823c98aa" + }, + "truncated": 0, + "non-truncated": 936, + "padded": 936, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-medical_genetics|5": { + "hashes": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "886ca823b41c094a", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-miscellaneous|5": { + "hashes": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "72fd71de7675e7d0", + "hash_cont_tokens": "9b0ab02a64603081" + }, + 
"truncated": 0, + "non-truncated": 3132, + "padded": 3132, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_disputes|5": { + "hashes": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "f3ca0dd8e7a1eb09", + "hash_cont_tokens": "3b8bbe9108e55ce9" + }, + "truncated": 0, + "non-truncated": 1384, + "padded": 1354, + "non-padded": 30, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hashes": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "3e793631e951f23c", + "hash_cont_tokens": "3e9bfc0362e97330" + }, + "truncated": 0, + "non-truncated": 3580, + "padded": 3580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-nutrition|5": { + "hashes": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "59753c2144ea93af", + "hash_cont_tokens": "23b2dc6ee2da4cfc" + }, + "truncated": 0, + "non-truncated": 1224, + "padded": 1224, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-philosophy|5": { + "hashes": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "bd8d3dbed15a8c34", + "hash_cont_tokens": "9f6ff69d23a48783" + }, + "truncated": 0, + "non-truncated": 1244, + "padded": 1244, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-prehistory|5": { + "hashes": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "3573cd87facbb7c5", + "hash_cont_tokens": "d6458d743d875837" + }, + "truncated": 0, + "non-truncated": 1296, + "padded": 1296, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_accounting|5": { + "hashes": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "17e721bc1a7cbb47", + "hash_cont_tokens": "922a195f53a35662" + }, + "truncated": 0, + "non-truncated": 1128, + "padded": 1128, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_law|5": { + "hashes": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "c9f7583fff66d361", + "hash_cont_tokens": "2e590029ef41fbcd" + }, + "truncated": 0, + "non-truncated": 6136, + "padded": 6136, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_medicine|5": { + "hashes": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "40a933f829116f8d", + "hash_cont_tokens": "7cfee54dbddd5a98" + }, + "truncated": 0, + "non-truncated": 1088, + "padded": 1088, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_psychology|5": { + "hashes": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "0dfb73a8eb3f692c", + "hash_cont_tokens": "a86677b2a45c20e1" + }, + "truncated": 0, + "non-truncated": 2448, + "padded": 2448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + 
"harness|hendrycksTest-public_relations|5": { + "hashes": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "1710c6ba4c9f3cbd", + "hash_cont_tokens": "0d756ccaae031757" + }, + "truncated": 0, + "non-truncated": 440, + "padded": 440, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-security_studies|5": { + "hashes": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "32a03f1f22a6e103", + "hash_cont_tokens": "b2229bc2cfbf594b" + }, + "truncated": 0, + "non-truncated": 980, + "padded": 980, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-sociology|5": { + "hashes": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "828999f7624cbe7e", + "hash_cont_tokens": "c3a3bdfd177eed5b" + }, + "truncated": 0, + "non-truncated": 804, + "padded": 804, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hashes": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "42054621e718dbee", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-virology|5": { + "hashes": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "6c4f0aa4dc859c04", + "hash_cont_tokens": "af8b3658088cb37f" + }, + "truncated": 0, + "non-truncated": 664, + "padded": 664, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-world_religions|5": { + "hashes": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "6c75d44e092ff24f", + "hash_cont_tokens": "060118bef6de4e0a" + }, + "truncated": 0, + "non-truncated": 684, + "padded": 684, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|truthfulqa:mc|0": { + "hashes": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "2738d7ed7075faa7", + "hash_cont_tokens": "f5da56a132aab151" + }, + "truncated": 0, + "non-truncated": 9996, + "padded": 9996, + "non-padded": 0, + "effective_few_shots": 0.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "d84d18e9a963753d", + "hash_full_prompts": "12b540783521a8e6", + "hash_input_tokens": "5c73a7dce6ccf737", + "hash_cont_tokens": "71d56183130fecbd" + }, + "total_evaluation_time_secondes": "6419.3316922187805", + "truncated": 0, + "non-truncated": 111019, + "padded": 110926, + "non-padded": 93, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/Undi95/Emerald-13B/results_2023-10-23T18-27-52.311274.json b/eval-results/Undi95/Emerald-13B/results_2023-10-23T18-27-52.311274.json new file mode 100644 index 0000000000000000000000000000000000000000..574c8b953f78e1b389b1b6dbb17b1a619cb536c0 --- /dev/null +++ b/eval-results/Undi95/Emerald-13B/results_2023-10-23T18-27-52.311274.json @@ -0,0 +1,107 @@ +{ + "config_general": { + "model_name": "Undi95/Emerald-13B", + "model_sha": "f7696299463d8ec402a4e1eb001f3a447f1c5552", + "model_size": "24.32 
GB", + "model_dtype": "torch.float16", + "lighteval_sha": "0f318ecf002208468154899217b3ba7c6ae09374", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|drop|3": { + "em": 0.11566694630872483, + "em_stderr": 0.0032753085227622833, + "f1": 0.18378460570469723, + "f1_stderr": 0.003376754461365903 + }, + "harness|gsm8k|5": { + "acc": 0.1281273692191054, + "acc_stderr": 0.009206398549980031 + }, + "harness|winogrande|5": { + "acc": 0.7592738752959748, + "acc_stderr": 0.012015559212224176 + }, + "all": { + "em": 0.11566694630872483, + "em_stderr": 0.0032753085227622833, + "f1": 0.18378460570469723, + "f1_stderr": 0.003376754461365903, + "acc": 0.4437006222575401, + "acc_stderr": 0.010610978881102105 + } + }, + "versions": { + "harness|drop|3": 1, + "harness|gsm8k|5": 0, + "harness|winogrande|5": 0, + "all": 0 + }, + "config_tasks": { + "harness|drop": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|drop|3": { + "hashes": { + "hash_examples": "1d27416e8324e9a3", + "hash_full_prompts": "a5513ff9a741b385", + "hash_input_tokens": "42076f0efbb50aa6", + "hash_cont_tokens": "9157f8c77727bcd0" + }, + "truncated": 3, + "non-truncated": 9533, + "padded": 0, + "non-padded": 9536, + "effective_few_shots": 3.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "bda342e47b5099b2", + "hash_cont_tokens": "2c00fcb9d3e01de4" + }, + "truncated": 0, + "non-truncated": 1319, + "padded": 0, + "non-padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "c0bedf98cb040854", + "hash_cont_tokens": "f08975ad6f2d5864" + }, + "truncated": 0, + "non-truncated": 2534, + "padded": 2432, + "non-padded": 102, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "9b4d8993161e637d", + "hash_full_prompts": "08215e527b7e60a5", + "hash_input_tokens": "a12f3e3c934bd78b", + "hash_cont_tokens": "ba33099f7c13e583" + }, + "total_evaluation_time_secondes": "12243.24050951004", + "truncated": 3, + "non-truncated": 13386, + "padded": 2432, + "non-padded": 10957, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/Undi95/Emerhyst-20B/results_2023-10-04T09-24-08.717468.json b/eval-results/Undi95/Emerhyst-20B/results_2023-10-04T09-24-08.717468.json new file mode 100644 index 0000000000000000000000000000000000000000..f5428db1486cb05caa87e95dfc2ccd0f6f58bc17 --- /dev/null +++ b/eval-results/Undi95/Emerhyst-20B/results_2023-10-04T09-24-08.717468.json @@ -0,0 +1,1367 @@ +{ + "config_general": { + "model_name": "Undi95/Emerhyst-20B", + "model_sha": "e4c23af4f5dd88cb27d245e2bfc3b81db652632c", + "model_size": "37.36 GB", + "model_dtype": "torch.float16", + "lighteval_sha": "0f318ecf002208468154899217b3ba7c6ae09374", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|arc:challenge|25": { + "acc": 0.5895904436860068, + "acc_stderr": 0.01437492219264266, + "acc_norm": 0.6168941979522184, + "acc_norm_stderr": 0.014206472661672877 + }, + "harness|hellaswag|10": { + 
"acc": 0.6613224457279426, + "acc_stderr": 0.00472292833283405, + "acc_norm": 0.8498307110137423, + "acc_norm_stderr": 0.0035650718701954478 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.38, + "acc_stderr": 0.048783173121456316, + "acc_norm": 0.38, + "acc_norm_stderr": 0.048783173121456316 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.5111111111111111, + "acc_stderr": 0.04318275491977976, + "acc_norm": 0.5111111111111111, + "acc_norm_stderr": 0.04318275491977976 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.5394736842105263, + "acc_stderr": 0.04056242252249034, + "acc_norm": 0.5394736842105263, + "acc_norm_stderr": 0.04056242252249034 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.54, + "acc_stderr": 0.05009082659620332, + "acc_norm": 0.54, + "acc_norm_stderr": 0.05009082659620332 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.6, + "acc_stderr": 0.030151134457776285, + "acc_norm": 0.6, + "acc_norm_stderr": 0.030151134457776285 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.6458333333333334, + "acc_stderr": 0.03999411135753543, + "acc_norm": 0.6458333333333334, + "acc_norm_stderr": 0.03999411135753543 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.4, + "acc_stderr": 0.049236596391733084, + "acc_norm": 0.4, + "acc_norm_stderr": 0.049236596391733084 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.42, + "acc_stderr": 0.049604496374885836, + "acc_norm": 0.42, + "acc_norm_stderr": 0.049604496374885836 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.31, + "acc_stderr": 0.04648231987117316, + "acc_norm": 0.31, + "acc_norm_stderr": 0.04648231987117316 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.5317919075144508, + "acc_stderr": 0.03804749744364764, + "acc_norm": 0.5317919075144508, + "acc_norm_stderr": 0.03804749744364764 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.3137254901960784, + "acc_stderr": 0.04617034827006716, + "acc_norm": 0.3137254901960784, + "acc_norm_stderr": 0.04617034827006716 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.69, + "acc_stderr": 0.04648231987117316, + "acc_norm": 0.69, + "acc_norm_stderr": 0.04648231987117316 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.4808510638297872, + "acc_stderr": 0.032662042990646775, + "acc_norm": 0.4808510638297872, + "acc_norm_stderr": 0.032662042990646775 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.30701754385964913, + "acc_stderr": 0.0433913832257986, + "acc_norm": 0.30701754385964913, + "acc_norm_stderr": 0.0433913832257986 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.5517241379310345, + "acc_stderr": 0.04144311810878152, + "acc_norm": 0.5517241379310345, + "acc_norm_stderr": 0.04144311810878152 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.3386243386243386, + "acc_stderr": 0.02437319786798305, + "acc_norm": 0.3386243386243386, + "acc_norm_stderr": 0.02437319786798305 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.36507936507936506, + "acc_stderr": 0.04306241259127153, + "acc_norm": 0.36507936507936506, + "acc_norm_stderr": 0.04306241259127153 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.39, + "acc_stderr": 0.04902071300001975, + "acc_norm": 0.39, + "acc_norm_stderr": 0.04902071300001975 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.6580645161290323, + "acc_stderr": 0.02698528957655274, + "acc_norm": 
0.6580645161290323, + "acc_norm_stderr": 0.02698528957655274 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.4482758620689655, + "acc_stderr": 0.03499113137676744, + "acc_norm": 0.4482758620689655, + "acc_norm_stderr": 0.03499113137676744 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.53, + "acc_stderr": 0.05016135580465919, + "acc_norm": 0.53, + "acc_norm_stderr": 0.05016135580465919 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.6909090909090909, + "acc_stderr": 0.036085410115739666, + "acc_norm": 0.6909090909090909, + "acc_norm_stderr": 0.036085410115739666 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.7222222222222222, + "acc_stderr": 0.031911782267135466, + "acc_norm": 0.7222222222222222, + "acc_norm_stderr": 0.031911782267135466 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.8134715025906736, + "acc_stderr": 0.02811209121011748, + "acc_norm": 0.8134715025906736, + "acc_norm_stderr": 0.02811209121011748 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.5717948717948718, + "acc_stderr": 0.025088301454694834, + "acc_norm": 0.5717948717948718, + "acc_norm_stderr": 0.025088301454694834 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.3333333333333333, + "acc_stderr": 0.028742040903948492, + "acc_norm": 0.3333333333333333, + "acc_norm_stderr": 0.028742040903948492 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.6176470588235294, + "acc_stderr": 0.03156663099215416, + "acc_norm": 0.6176470588235294, + "acc_norm_stderr": 0.03156663099215416 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.31125827814569534, + "acc_stderr": 0.03780445850526732, + "acc_norm": 0.31125827814569534, + "acc_norm_stderr": 0.03780445850526732 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.7339449541284404, + "acc_stderr": 0.0189460223222256, + "acc_norm": 0.7339449541284404, + "acc_norm_stderr": 0.0189460223222256 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.4583333333333333, + "acc_stderr": 0.03398110890294636, + "acc_norm": 0.4583333333333333, + "acc_norm_stderr": 0.03398110890294636 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.7745098039215687, + "acc_stderr": 0.02933116229425174, + "acc_norm": 0.7745098039215687, + "acc_norm_stderr": 0.02933116229425174 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.7721518987341772, + "acc_stderr": 0.027303484599069422, + "acc_norm": 0.7721518987341772, + "acc_norm_stderr": 0.027303484599069422 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.672645739910314, + "acc_stderr": 0.03149384670994131, + "acc_norm": 0.672645739910314, + "acc_norm_stderr": 0.03149384670994131 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.6412213740458015, + "acc_stderr": 0.04206739313864908, + "acc_norm": 0.6412213740458015, + "acc_norm_stderr": 0.04206739313864908 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.7603305785123967, + "acc_stderr": 0.03896878985070416, + "acc_norm": 0.7603305785123967, + "acc_norm_stderr": 0.03896878985070416 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.7129629629629629, + "acc_stderr": 0.04373313040914761, + "acc_norm": 0.7129629629629629, + "acc_norm_stderr": 0.04373313040914761 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.7055214723926381, + "acc_stderr": 
0.03581165790474082, + "acc_norm": 0.7055214723926381, + "acc_norm_stderr": 0.03581165790474082 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.3392857142857143, + "acc_stderr": 0.04493949068613538, + "acc_norm": 0.3392857142857143, + "acc_norm_stderr": 0.04493949068613538 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.6601941747572816, + "acc_stderr": 0.046897659372781335, + "acc_norm": 0.6601941747572816, + "acc_norm_stderr": 0.046897659372781335 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.8290598290598291, + "acc_stderr": 0.024662496845209814, + "acc_norm": 0.8290598290598291, + "acc_norm_stderr": 0.024662496845209814 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.62, + "acc_stderr": 0.048783173121456316, + "acc_norm": 0.62, + "acc_norm_stderr": 0.048783173121456316 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.7471264367816092, + "acc_stderr": 0.015543377313719683, + "acc_norm": 0.7471264367816092, + "acc_norm_stderr": 0.015543377313719683 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.6473988439306358, + "acc_stderr": 0.02572280220089581, + "acc_norm": 0.6473988439306358, + "acc_norm_stderr": 0.02572280220089581 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.40782122905027934, + "acc_stderr": 0.016435865260914746, + "acc_norm": 0.40782122905027934, + "acc_norm_stderr": 0.016435865260914746 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.6339869281045751, + "acc_stderr": 0.027582811415159607, + "acc_norm": 0.6339869281045751, + "acc_norm_stderr": 0.027582811415159607 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.6688102893890675, + "acc_stderr": 0.02673062072800491, + "acc_norm": 0.6688102893890675, + "acc_norm_stderr": 0.02673062072800491 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.6759259259259259, + "acc_stderr": 0.026041766202717163, + "acc_norm": 0.6759259259259259, + "acc_norm_stderr": 0.026041766202717163 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.43617021276595747, + "acc_stderr": 0.029583452036284066, + "acc_norm": 0.43617021276595747, + "acc_norm_stderr": 0.029583452036284066 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.4530638852672751, + "acc_stderr": 0.012713845972358978, + "acc_norm": 0.4530638852672751, + "acc_norm_stderr": 0.012713845972358978 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.5551470588235294, + "acc_stderr": 0.030187532060329387, + "acc_norm": 0.5551470588235294, + "acc_norm_stderr": 0.030187532060329387 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.6062091503267973, + "acc_stderr": 0.019766211991073063, + "acc_norm": 0.6062091503267973, + "acc_norm_stderr": 0.019766211991073063 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.5727272727272728, + "acc_stderr": 0.047381987035454834, + "acc_norm": 0.5727272727272728, + "acc_norm_stderr": 0.047381987035454834 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.6489795918367347, + "acc_stderr": 0.03055531675557364, + "acc_norm": 0.6489795918367347, + "acc_norm_stderr": 0.03055531675557364 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.736318407960199, + "acc_stderr": 0.03115715086935556, + "acc_norm": 0.736318407960199, + "acc_norm_stderr": 0.03115715086935556 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.85, + "acc_stderr": 0.03588702812826371, + "acc_norm": 0.85, + "acc_norm_stderr": 0.03588702812826371 + }, + 
"harness|hendrycksTest-virology|5": { + "acc": 0.39759036144578314, + "acc_stderr": 0.038099730845402184, + "acc_norm": 0.39759036144578314, + "acc_norm_stderr": 0.038099730845402184 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.7485380116959064, + "acc_stderr": 0.033275044238468436, + "acc_norm": 0.7485380116959064, + "acc_norm_stderr": 0.033275044238468436 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.37209302325581395, + "mc1_stderr": 0.016921090118814038, + "mc2": 0.5416298206037606, + "mc2_stderr": 0.01577120429386257 + }, + "all": { + "acc": 0.5716796109016449, + "acc_stderr": 0.03435374865976559, + "acc_norm": 0.5753374417415095, + "acc_norm_stderr": 0.03433126889716206, + "mc1": 0.37209302325581395, + "mc1_stderr": 0.016921090118814038, + "mc2": 0.5416298206037606, + "mc2_stderr": 0.01577120429386257 + } + }, + "versions": { + "harness|arc:challenge|25": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 
1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "all": 0 + }, + "config_tasks": { + "harness|arc:challenge": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + 
"harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task" + }, + "summary_tasks": { + "harness|arc:challenge|25": { + "hashes": { + "hash_examples": "17b0cae357c0259e", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "3722289b79076c44", + "hash_cont_tokens": "e8abf848493b50f7" + }, + "truncated": 0, + "non-truncated": 4687, + "padded": 4687, + "non-padded": 0, + "effective_few_shots": 25.0, + "num_truncated_few_shots": 0 + }, + "harness|hellaswag|10": { + "hashes": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "ececd684171f1ef2", + "hash_cont_tokens": "9fe0a5c42e1532db" + }, + "truncated": 0, + "non-truncated": 40168, + "padded": 40113, + "non-padded": 55, + "effective_few_shots": 10.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hashes": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "c54ff61ad0273dd7", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-anatomy|5": { + "hashes": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "be31a1e22aef5f90", + "hash_cont_tokens": "f11971a765cb609f" + }, + "truncated": 0, + "non-truncated": 540, + "padded": 540, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-astronomy|5": { + "hashes": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "277a7b1fad566940", + "hash_cont_tokens": "440a970fadecdc7b" + }, + "truncated": 0, + "non-truncated": 608, + "padded": 608, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-business_ethics|5": { + "hashes": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "ba552605bc116de5", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hashes": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "428c7563d0b98ab9", + "hash_cont_tokens": "7ecd60c25b9bfe5b" + }, + "truncated": 0, + "non-truncated": 1060, + "padded": 1060, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_biology|5": { + "hashes": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "da036601573942e2", + "hash_cont_tokens": "875cde3af7a0ee14" + }, + "truncated": 0, + "non-truncated": 576, + "padded": 576, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + 
"harness|hendrycksTest-college_chemistry|5": { + "hashes": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "94e0196d6aded13d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_computer_science|5": { + "hashes": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "6e4d0f4a8d36690b", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_mathematics|5": { + "hashes": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "614054d17109a25d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_medicine|5": { + "hashes": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "081bb2b524defd1c", + "hash_cont_tokens": "702fb6d82ff0d6ac" + }, + "truncated": 0, + "non-truncated": 692, + "padded": 692, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_physics|5": { + "hashes": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "5421d9a1af86cbd4", + "hash_cont_tokens": "f7b8097afc16a47c" + }, + "truncated": 0, + "non-truncated": 408, + "padded": 408, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-computer_security|5": { + "hashes": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "5e6b70ecb333cf18", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hashes": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "c2ef11a87264ceed", + "hash_cont_tokens": "aa0e8bc655f2f641" + }, + "truncated": 0, + "non-truncated": 940, + "padded": 940, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-econometrics|5": { + "hashes": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "ecaccd912a4c3978", + "hash_cont_tokens": "b1cc6e7e9fcd3827" + }, + "truncated": 0, + "non-truncated": 456, + "padded": 456, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hashes": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "1590c84291399be8", + "hash_cont_tokens": "2425a3f084a591ef" + }, + "truncated": 0, + "non-truncated": 580, + "padded": 580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hashes": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": 
"5ec274c6c82aca23", + "hash_input_tokens": "3269597f715b0da1", + "hash_cont_tokens": "bd87bf0c060fd925" + }, + "truncated": 0, + "non-truncated": 1512, + "padded": 1512, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-formal_logic|5": { + "hashes": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "a2800d20f3ab8d7c", + "hash_cont_tokens": "eb8932890e0605db" + }, + "truncated": 0, + "non-truncated": 504, + "padded": 504, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-global_facts|5": { + "hashes": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "94ed44b3772505ad", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_biology|5": { + "hashes": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "24423acb928db768", + "hash_cont_tokens": "1ddcb86d28cde266" + }, + "truncated": 0, + "non-truncated": 1240, + "padded": 1240, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hashes": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "831ff35c474e5cef", + "hash_cont_tokens": "176c8dcff38c5f8f" + }, + "truncated": 0, + "non-truncated": 812, + "padded": 812, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hashes": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "a20a96b44dcc5b30", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hashes": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "5002f4ac8b1562ca", + "hash_cont_tokens": "674fc454bdc5ac93" + }, + "truncated": 0, + "non-truncated": 660, + "padded": 656, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_geography|5": { + "hashes": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "7c5547c7da5bc793", + "hash_cont_tokens": "03a5012b916274ea" + }, + "truncated": 0, + "non-truncated": 792, + "padded": 792, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hashes": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "f62991cb6a496b05", + "hash_cont_tokens": "873d2aab226ba1d8" + }, + "truncated": 0, + "non-truncated": 772, + "padded": 772, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hashes": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "4cef2aff6e3d59ed", + "hash_cont_tokens": 
"c583432ad27fcfe0" + }, + "truncated": 0, + "non-truncated": 1560, + "padded": 1560, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hashes": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "6e2577ea4082ed2b", + "hash_cont_tokens": "d7907b61bcb8c123" + }, + "truncated": 0, + "non-truncated": 1080, + "padded": 1080, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hashes": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "c5fc9aeb1079c8e4", + "hash_cont_tokens": "f47f041de50333b9" + }, + "truncated": 0, + "non-truncated": 952, + "padded": 952, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_physics|5": { + "hashes": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "555fc385cffa84ca", + "hash_cont_tokens": "0d56317b3e5eedb5" + }, + "truncated": 0, + "non-truncated": 604, + "padded": 604, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hashes": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "febd23cbf9973b7f", + "hash_cont_tokens": "09ba1243e7390c0f" + }, + "truncated": 0, + "non-truncated": 2180, + "padded": 2180, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hashes": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "400e55b56ee6fbd7", + "hash_cont_tokens": "9cc29889c3d3f77d" + }, + "truncated": 0, + "non-truncated": 864, + "padded": 864, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hashes": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "c639cce12a46ebad", + "hash_cont_tokens": "cdd0b3dc06d933e5" + }, + "truncated": 0, + "non-truncated": 816, + "padded": 816, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hashes": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "b9762065cce6f3a6", + "hash_cont_tokens": "e02816433ff28daf" + }, + "truncated": 0, + "non-truncated": 948, + "padded": 948, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_aging|5": { + "hashes": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "541a75f071dcf579", + "hash_cont_tokens": "142a4a8a1138a214" + }, + "truncated": 0, + "non-truncated": 892, + "padded": 892, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_sexuality|5": { + "hashes": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "04269e5c5a257dd9", + "hash_cont_tokens": "bc54813e809b796d" + }, + "truncated": 0, + "non-truncated": 524, + "padded": 524, + "non-padded": 0, + 
"effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-international_law|5": { + "hashes": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "d93ba9d9d38e4397", + "hash_cont_tokens": "8ea8c5ff76a15bca" + }, + "truncated": 0, + "non-truncated": 484, + "padded": 484, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-jurisprudence|5": { + "hashes": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "9eeaccd2698b4f5a", + "hash_cont_tokens": "e3a8cd951b6e3469" + }, + "truncated": 0, + "non-truncated": 432, + "padded": 432, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hashes": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "b4f08f544f2b7576", + "hash_cont_tokens": "3e9e0bdc248fd88a" + }, + "truncated": 0, + "non-truncated": 652, + "padded": 648, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-machine_learning|5": { + "hashes": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "900c2a51f1174b9f", + "hash_cont_tokens": "55b12fb138c6a064" + }, + "truncated": 0, + "non-truncated": 448, + "padded": 448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-management|5": { + "hashes": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "6b36efb4689c6eca", + "hash_cont_tokens": "a01d6d39a83c4597" + }, + "truncated": 0, + "non-truncated": 412, + "padded": 412, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-marketing|5": { + "hashes": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "2aaac78a0cfed47a", + "hash_cont_tokens": "6aeaed4d823c98aa" + }, + "truncated": 0, + "non-truncated": 936, + "padded": 936, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-medical_genetics|5": { + "hashes": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "886ca823b41c094a", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-miscellaneous|5": { + "hashes": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "72fd71de7675e7d0", + "hash_cont_tokens": "9b0ab02a64603081" + }, + "truncated": 0, + "non-truncated": 3132, + "padded": 3132, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_disputes|5": { + "hashes": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "f3ca0dd8e7a1eb09", + "hash_cont_tokens": "3b8bbe9108e55ce9" + }, + "truncated": 0, + "non-truncated": 1384, + "padded": 1354, + "non-padded": 30, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hashes": { + "hash_examples": "9873e077e83e0546", + 
"hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "3e793631e951f23c", + "hash_cont_tokens": "3e9bfc0362e97330" + }, + "truncated": 0, + "non-truncated": 3580, + "padded": 3580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-nutrition|5": { + "hashes": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "59753c2144ea93af", + "hash_cont_tokens": "23b2dc6ee2da4cfc" + }, + "truncated": 0, + "non-truncated": 1224, + "padded": 1224, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-philosophy|5": { + "hashes": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "bd8d3dbed15a8c34", + "hash_cont_tokens": "9f6ff69d23a48783" + }, + "truncated": 0, + "non-truncated": 1244, + "padded": 1244, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-prehistory|5": { + "hashes": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "3573cd87facbb7c5", + "hash_cont_tokens": "d6458d743d875837" + }, + "truncated": 0, + "non-truncated": 1296, + "padded": 1296, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_accounting|5": { + "hashes": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "17e721bc1a7cbb47", + "hash_cont_tokens": "922a195f53a35662" + }, + "truncated": 0, + "non-truncated": 1128, + "padded": 1128, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_law|5": { + "hashes": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "c9f7583fff66d361", + "hash_cont_tokens": "2e590029ef41fbcd" + }, + "truncated": 0, + "non-truncated": 6136, + "padded": 6136, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_medicine|5": { + "hashes": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "40a933f829116f8d", + "hash_cont_tokens": "7cfee54dbddd5a98" + }, + "truncated": 0, + "non-truncated": 1088, + "padded": 1088, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_psychology|5": { + "hashes": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "0dfb73a8eb3f692c", + "hash_cont_tokens": "a86677b2a45c20e1" + }, + "truncated": 0, + "non-truncated": 2448, + "padded": 2448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-public_relations|5": { + "hashes": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "1710c6ba4c9f3cbd", + "hash_cont_tokens": "0d756ccaae031757" + }, + "truncated": 0, + "non-truncated": 440, + "padded": 440, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-security_studies|5": { + "hashes": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "32a03f1f22a6e103", + "hash_cont_tokens": "b2229bc2cfbf594b" + }, + "truncated": 
0, + "non-truncated": 980, + "padded": 980, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-sociology|5": { + "hashes": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "828999f7624cbe7e", + "hash_cont_tokens": "c3a3bdfd177eed5b" + }, + "truncated": 0, + "non-truncated": 804, + "padded": 804, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hashes": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "42054621e718dbee", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-virology|5": { + "hashes": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "6c4f0aa4dc859c04", + "hash_cont_tokens": "af8b3658088cb37f" + }, + "truncated": 0, + "non-truncated": 664, + "padded": 664, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-world_religions|5": { + "hashes": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "6c75d44e092ff24f", + "hash_cont_tokens": "060118bef6de4e0a" + }, + "truncated": 0, + "non-truncated": 684, + "padded": 684, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|truthfulqa:mc|0": { + "hashes": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "2738d7ed7075faa7", + "hash_cont_tokens": "f5da56a132aab151" + }, + "truncated": 0, + "non-truncated": 9996, + "padded": 9996, + "non-padded": 0, + "effective_few_shots": 0.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "d84d18e9a963753d", + "hash_full_prompts": "12b540783521a8e6", + "hash_input_tokens": "5c73a7dce6ccf737", + "hash_cont_tokens": "71d56183130fecbd" + }, + "total_evaluation_time_secondes": "9975.898248672485", + "truncated": 0, + "non-truncated": 111019, + "padded": 110926, + "non-padded": 93, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/Undi95/Emerhyst-20B/results_2023-10-26T23-55-45.308698.json b/eval-results/Undi95/Emerhyst-20B/results_2023-10-26T23-55-45.308698.json new file mode 100644 index 0000000000000000000000000000000000000000..5c83d1fd9d615798fcc363ef5a0b4ae0af806701 --- /dev/null +++ b/eval-results/Undi95/Emerhyst-20B/results_2023-10-26T23-55-45.308698.json @@ -0,0 +1,107 @@ +{ + "config_general": { + "model_name": "Undi95/Emerhyst-20B", + "model_sha": "e4c23af4f5dd88cb27d245e2bfc3b81db652632c", + "model_size": "37.36 GB", + "model_dtype": "torch.float16", + "lighteval_sha": "0f318ecf002208468154899217b3ba7c6ae09374", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|drop|3": { + "em": 0.13779362416107382, + "em_stderr": 0.003529879074740249, + "f1": 0.20561661073825346, + "f1_stderr": 0.0036264688196139742 + }, + "harness|gsm8k|5": { + "acc": 0.08491281273692192, + "acc_stderr": 0.007678212824450795 + }, + "harness|winogrande|5": { + "acc": 0.760852407261247, + "acc_stderr": 0.011988541844843914 + }, + "all": { + "em": 
0.13779362416107382, + "em_stderr": 0.003529879074740249, + "f1": 0.20561661073825346, + "f1_stderr": 0.0036264688196139742, + "acc": 0.42288260999908445, + "acc_stderr": 0.009833377334647354 + } + }, + "versions": { + "harness|drop|3": 1, + "harness|gsm8k|5": 0, + "harness|winogrande|5": 0, + "all": 0 + }, + "config_tasks": { + "harness|drop": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|drop|3": { + "hashes": { + "hash_examples": "1d27416e8324e9a3", + "hash_full_prompts": "a5513ff9a741b385", + "hash_input_tokens": "42076f0efbb50aa6", + "hash_cont_tokens": "4fdbc299c55df2bd" + }, + "truncated": 3, + "non-truncated": 9533, + "padded": 0, + "non-padded": 9536, + "effective_few_shots": 3.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "bda342e47b5099b2", + "hash_cont_tokens": "c926a7d6bb88f48b" + }, + "truncated": 0, + "non-truncated": 1319, + "padded": 0, + "non-padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "c0bedf98cb040854", + "hash_cont_tokens": "f08975ad6f2d5864" + }, + "truncated": 0, + "non-truncated": 2534, + "padded": 2432, + "non-padded": 102, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "9b4d8993161e637d", + "hash_full_prompts": "08215e527b7e60a5", + "hash_input_tokens": "a12f3e3c934bd78b", + "hash_cont_tokens": "aebe7a59d7567707" + }, + "total_evaluation_time_secondes": "22061.84569334984", + "truncated": 3, + "non-truncated": 13386, + "padded": 2432, + "non-padded": 10957, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/Undi95/LewdEngine/results_2023-09-05T02-56-23.442470.json b/eval-results/Undi95/LewdEngine/results_2023-09-05T02-56-23.442470.json new file mode 100644 index 0000000000000000000000000000000000000000..d536f6aa5c68a3c1ca65cf35145cf3cc3434a68b --- /dev/null +++ b/eval-results/Undi95/LewdEngine/results_2023-09-05T02-56-23.442470.json @@ -0,0 +1,1366 @@ +{ + "config_general": { + "model_name": "Undi95/LewdEngine", + "model_sha": "6e918ff9f563552af4ad66f4308f6d040e24af4b", + "model_dtype": "torch.float16", + "lighteval_sha": "9f7699e1a44b5b4d7bd4f326b57a34db83b67c3f", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|arc:challenge|25": { + "acc": 0.5571672354948806, + "acc_stderr": 0.014515573873348902, + "acc_norm": 0.6049488054607508, + "acc_norm_stderr": 0.014285898292938165 + }, + "harness|hellaswag|10": { + "acc": 0.6331408086038638, + "acc_stderr": 0.004809626723626823, + "acc_norm": 0.8308105954989046, + "acc_norm_stderr": 0.0037415289563158417 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.37, + "acc_stderr": 0.04852365870939099, + "acc_norm": 0.37, + "acc_norm_stderr": 0.04852365870939099 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.4740740740740741, + "acc_stderr": 0.04313531696750574, + "acc_norm": 0.4740740740740741, + "acc_norm_stderr": 0.04313531696750574 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.5197368421052632, + "acc_stderr": 0.040657710025626036, + "acc_norm": 0.5197368421052632, + 
"acc_norm_stderr": 0.040657710025626036 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.52, + "acc_stderr": 0.050211673156867795, + "acc_norm": 0.52, + "acc_norm_stderr": 0.050211673156867795 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.6, + "acc_stderr": 0.03015113445777629, + "acc_norm": 0.6, + "acc_norm_stderr": 0.03015113445777629 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.5972222222222222, + "acc_stderr": 0.04101405519842425, + "acc_norm": 0.5972222222222222, + "acc_norm_stderr": 0.04101405519842425 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.4, + "acc_stderr": 0.04923659639173309, + "acc_norm": 0.4, + "acc_norm_stderr": 0.04923659639173309 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.44, + "acc_stderr": 0.04988876515698589, + "acc_norm": 0.44, + "acc_norm_stderr": 0.04988876515698589 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.33, + "acc_stderr": 0.04725815626252605, + "acc_norm": 0.33, + "acc_norm_stderr": 0.04725815626252605 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.4682080924855491, + "acc_stderr": 0.03804749744364764, + "acc_norm": 0.4682080924855491, + "acc_norm_stderr": 0.03804749744364764 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.2647058823529412, + "acc_stderr": 0.043898699568087764, + "acc_norm": 0.2647058823529412, + "acc_norm_stderr": 0.043898699568087764 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.67, + "acc_stderr": 0.047258156262526094, + "acc_norm": 0.67, + "acc_norm_stderr": 0.047258156262526094 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.425531914893617, + "acc_stderr": 0.03232146916224468, + "acc_norm": 0.425531914893617, + "acc_norm_stderr": 0.03232146916224468 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.2807017543859649, + "acc_stderr": 0.042270544512322004, + "acc_norm": 0.2807017543859649, + "acc_norm_stderr": 0.042270544512322004 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.47586206896551725, + "acc_stderr": 0.041618085035015295, + "acc_norm": 0.47586206896551725, + "acc_norm_stderr": 0.041618085035015295 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.335978835978836, + "acc_stderr": 0.024326310529149138, + "acc_norm": 0.335978835978836, + "acc_norm_stderr": 0.024326310529149138 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.3253968253968254, + "acc_stderr": 0.041905964388711366, + "acc_norm": 0.3253968253968254, + "acc_norm_stderr": 0.041905964388711366 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.37, + "acc_stderr": 0.048523658709391, + "acc_norm": 0.37, + "acc_norm_stderr": 0.048523658709391 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.6483870967741936, + "acc_stderr": 0.02716253782694846, + "acc_norm": 0.6483870967741936, + "acc_norm_stderr": 0.02716253782694846 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.4630541871921182, + "acc_stderr": 0.035083705204426656, + "acc_norm": 0.4630541871921182, + "acc_norm_stderr": 0.035083705204426656 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.58, + "acc_stderr": 0.049604496374885836, + "acc_norm": 0.58, + "acc_norm_stderr": 0.049604496374885836 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.6848484848484848, + "acc_stderr": 0.0362773057502241, + "acc_norm": 0.6848484848484848, + "acc_norm_stderr": 0.0362773057502241 
+ }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.6868686868686869, + "acc_stderr": 0.033042050878136525, + "acc_norm": 0.6868686868686869, + "acc_norm_stderr": 0.033042050878136525 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.8082901554404145, + "acc_stderr": 0.02840895362624528, + "acc_norm": 0.8082901554404145, + "acc_norm_stderr": 0.02840895362624528 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.5076923076923077, + "acc_stderr": 0.025348006031534778, + "acc_norm": 0.5076923076923077, + "acc_norm_stderr": 0.025348006031534778 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.3111111111111111, + "acc_stderr": 0.02822644674968352, + "acc_norm": 0.3111111111111111, + "acc_norm_stderr": 0.02822644674968352 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.5756302521008403, + "acc_stderr": 0.032104790510157764, + "acc_norm": 0.5756302521008403, + "acc_norm_stderr": 0.032104790510157764 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.33774834437086093, + "acc_stderr": 0.038615575462551684, + "acc_norm": 0.33774834437086093, + "acc_norm_stderr": 0.038615575462551684 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.7155963302752294, + "acc_stderr": 0.01934203658770259, + "acc_norm": 0.7155963302752294, + "acc_norm_stderr": 0.01934203658770259 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.4166666666666667, + "acc_stderr": 0.033622774366080445, + "acc_norm": 0.4166666666666667, + "acc_norm_stderr": 0.033622774366080445 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.7549019607843137, + "acc_stderr": 0.030190282453501943, + "acc_norm": 0.7549019607843137, + "acc_norm_stderr": 0.030190282453501943 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.7215189873417721, + "acc_stderr": 0.029178682304842538, + "acc_norm": 0.7215189873417721, + "acc_norm_stderr": 0.029178682304842538 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.6816143497757847, + "acc_stderr": 0.03126580522513713, + "acc_norm": 0.6816143497757847, + "acc_norm_stderr": 0.03126580522513713 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.6564885496183206, + "acc_stderr": 0.041649760719448786, + "acc_norm": 0.6564885496183206, + "acc_norm_stderr": 0.041649760719448786 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.71900826446281, + "acc_stderr": 0.041032038305145124, + "acc_norm": 0.71900826446281, + "acc_norm_stderr": 0.041032038305145124 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.7129629629629629, + "acc_stderr": 0.04373313040914761, + "acc_norm": 0.7129629629629629, + "acc_norm_stderr": 0.04373313040914761 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.6809815950920245, + "acc_stderr": 0.03661997551073836, + "acc_norm": 0.6809815950920245, + "acc_norm_stderr": 0.03661997551073836 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.29464285714285715, + "acc_stderr": 0.04327040932578729, + "acc_norm": 0.29464285714285715, + "acc_norm_stderr": 0.04327040932578729 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.7669902912621359, + "acc_stderr": 0.04185832598928315, + "acc_norm": 0.7669902912621359, + "acc_norm_stderr": 0.04185832598928315 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.7905982905982906, + "acc_stderr": 0.026655699653922737, + "acc_norm": 0.7905982905982906, + "acc_norm_stderr": 
0.026655699653922737 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.52, + "acc_stderr": 0.05021167315686779, + "acc_norm": 0.52, + "acc_norm_stderr": 0.05021167315686779 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.7292464878671775, + "acc_stderr": 0.015889888362560483, + "acc_norm": 0.7292464878671775, + "acc_norm_stderr": 0.015889888362560483 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.638728323699422, + "acc_stderr": 0.025862201852277906, + "acc_norm": 0.638728323699422, + "acc_norm_stderr": 0.025862201852277906 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.30614525139664805, + "acc_stderr": 0.015414494487903219, + "acc_norm": 0.30614525139664805, + "acc_norm_stderr": 0.015414494487903219 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.630718954248366, + "acc_stderr": 0.027634176689602663, + "acc_norm": 0.630718954248366, + "acc_norm_stderr": 0.027634176689602663 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.6205787781350482, + "acc_stderr": 0.027559949802347817, + "acc_norm": 0.6205787781350482, + "acc_norm_stderr": 0.027559949802347817 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.6018518518518519, + "acc_stderr": 0.02723741509459248, + "acc_norm": 0.6018518518518519, + "acc_norm_stderr": 0.02723741509459248 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.42907801418439717, + "acc_stderr": 0.02952591430255856, + "acc_norm": 0.42907801418439717, + "acc_norm_stderr": 0.02952591430255856 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.4198174706649283, + "acc_stderr": 0.012604960816087378, + "acc_norm": 0.4198174706649283, + "acc_norm_stderr": 0.012604960816087378 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.5183823529411765, + "acc_stderr": 0.030352303395351964, + "acc_norm": 0.5183823529411765, + "acc_norm_stderr": 0.030352303395351964 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.5424836601307189, + "acc_stderr": 0.020154685712590884, + "acc_norm": 0.5424836601307189, + "acc_norm_stderr": 0.020154685712590884 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.6090909090909091, + "acc_stderr": 0.04673752333670238, + "acc_norm": 0.6090909090909091, + "acc_norm_stderr": 0.04673752333670238 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.6326530612244898, + "acc_stderr": 0.030862144921087558, + "acc_norm": 0.6326530612244898, + "acc_norm_stderr": 0.030862144921087558 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.7064676616915423, + "acc_stderr": 0.03220024104534205, + "acc_norm": 0.7064676616915423, + "acc_norm_stderr": 0.03220024104534205 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.79, + "acc_stderr": 0.040936018074033256, + "acc_norm": 0.79, + "acc_norm_stderr": 0.040936018074033256 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.42168674698795183, + "acc_stderr": 0.03844453181770917, + "acc_norm": 0.42168674698795183, + "acc_norm_stderr": 0.03844453181770917 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.7602339181286549, + "acc_stderr": 0.03274485211946956, + "acc_norm": 0.7602339181286549, + "acc_norm_stderr": 0.03274485211946956 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.3047735618115055, + "mc1_stderr": 0.016114124156882455, + "mc2": 0.43629332146485117, + "mc2_stderr": 0.014738333697751311 + }, + "all": { + "acc": 0.5500083344336444, + "acc_stderr": 0.0345125155387716, + "acc_norm": 0.5541685269228124, + 
"acc_norm_stderr": 0.034490519380335635, + "mc1": 0.3047735618115055, + "mc1_stderr": 0.016114124156882455, + "mc2": 0.43629332146485117, + "mc2_stderr": 0.014738333697751311 + } + }, + "versions": { + "harness|arc:challenge|25": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "all": 0 + }, + "config_tasks": { + "harness|arc:challenge": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + 
"harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task" + }, + "summary_tasks": { + "harness|arc:challenge|25": { + "hashes": { + 
"hash_examples": "17b0cae357c0259e", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "3722289b79076c44", + "hash_cont_tokens": "ede2b335438f08e9" + }, + "truncated": 0, + "non-truncated": 4687, + "padded": 4687, + "non-padded": 0, + "effective_few_shots": 25.0, + "num_truncated_few_shots": 0 + }, + "harness|hellaswag|10": { + "hashes": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "ececd684171f1ef2", + "hash_cont_tokens": "b41cf1ad182d68d5" + }, + "truncated": 0, + "non-truncated": 40168, + "padded": 40113, + "non-padded": 55, + "effective_few_shots": 10.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hashes": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "c54ff61ad0273dd7", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-anatomy|5": { + "hashes": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "be31a1e22aef5f90", + "hash_cont_tokens": "f11971a765cb609f" + }, + "truncated": 0, + "non-truncated": 540, + "padded": 540, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-astronomy|5": { + "hashes": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "277a7b1fad566940", + "hash_cont_tokens": "238bd86950544b29" + }, + "truncated": 0, + "non-truncated": 608, + "padded": 608, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-business_ethics|5": { + "hashes": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "ba552605bc116de5", + "hash_cont_tokens": "f9d6d2a7d7e9a041" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hashes": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "428c7563d0b98ab9", + "hash_cont_tokens": "6af58623d0d5fbcd" + }, + "truncated": 0, + "non-truncated": 1060, + "padded": 1060, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_biology|5": { + "hashes": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "da036601573942e2", + "hash_cont_tokens": "875cde3af7a0ee14" + }, + "truncated": 0, + "non-truncated": 576, + "padded": 576, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_chemistry|5": { + "hashes": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "94e0196d6aded13d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_computer_science|5": { + "hashes": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "6e4d0f4a8d36690b", + "hash_cont_tokens": "1ba0c71186b1505e" + }, + 
"truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_mathematics|5": { + "hashes": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "614054d17109a25d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_medicine|5": { + "hashes": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "081bb2b524defd1c", + "hash_cont_tokens": "702fb6d82ff0d6ac" + }, + "truncated": 0, + "non-truncated": 692, + "padded": 692, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_physics|5": { + "hashes": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "5421d9a1af86cbd4", + "hash_cont_tokens": "f7b8097afc16a47c" + }, + "truncated": 0, + "non-truncated": 408, + "padded": 408, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-computer_security|5": { + "hashes": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "5e6b70ecb333cf18", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hashes": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "c2ef11a87264ceed", + "hash_cont_tokens": "aa0e8bc655f2f641" + }, + "truncated": 0, + "non-truncated": 940, + "padded": 940, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-econometrics|5": { + "hashes": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "ecaccd912a4c3978", + "hash_cont_tokens": "a9b1f761089f6acc" + }, + "truncated": 0, + "non-truncated": 456, + "padded": 456, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hashes": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "1590c84291399be8", + "hash_cont_tokens": "2425a3f084a591ef" + }, + "truncated": 0, + "non-truncated": 580, + "padded": 580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hashes": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "3269597f715b0da1", + "hash_cont_tokens": "eb2d5002052b5bc5" + }, + "truncated": 0, + "non-truncated": 1512, + "padded": 1512, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-formal_logic|5": { + "hashes": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "a2800d20f3ab8d7c", + "hash_cont_tokens": "9b30dc19c9b62f60" + }, + "truncated": 0, + "non-truncated": 504, + "padded": 504, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + 
"harness|hendrycksTest-global_facts|5": { + "hashes": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "94ed44b3772505ad", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_biology|5": { + "hashes": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "24423acb928db768", + "hash_cont_tokens": "74217a4e2868536f" + }, + "truncated": 0, + "non-truncated": 1240, + "padded": 1240, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hashes": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "831ff35c474e5cef", + "hash_cont_tokens": "bf39544be0ebf000" + }, + "truncated": 0, + "non-truncated": 812, + "padded": 812, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hashes": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "a20a96b44dcc5b30", + "hash_cont_tokens": "43570b3948564b64" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hashes": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "5002f4ac8b1562ca", + "hash_cont_tokens": "674fc454bdc5ac93" + }, + "truncated": 0, + "non-truncated": 660, + "padded": 656, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_geography|5": { + "hashes": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "7c5547c7da5bc793", + "hash_cont_tokens": "03a5012b916274ea" + }, + "truncated": 0, + "non-truncated": 792, + "padded": 792, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hashes": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "f62991cb6a496b05", + "hash_cont_tokens": "50ab225c2f535210" + }, + "truncated": 0, + "non-truncated": 772, + "padded": 772, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hashes": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "4cef2aff6e3d59ed", + "hash_cont_tokens": "c583432ad27fcfe0" + }, + "truncated": 0, + "non-truncated": 1560, + "padded": 1560, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hashes": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "6e2577ea4082ed2b", + "hash_cont_tokens": "1194078d4e38c984" + }, + "truncated": 0, + "non-truncated": 1080, + "padded": 1080, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hashes": { + 
"hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "c5fc9aeb1079c8e4", + "hash_cont_tokens": "f47f041de50333b9" + }, + "truncated": 0, + "non-truncated": 952, + "padded": 952, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_physics|5": { + "hashes": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "555fc385cffa84ca", + "hash_cont_tokens": "6296151cf7fee15c" + }, + "truncated": 0, + "non-truncated": 604, + "padded": 604, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hashes": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "febd23cbf9973b7f", + "hash_cont_tokens": "a490d3db0ea5935a" + }, + "truncated": 0, + "non-truncated": 2180, + "padded": 2180, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hashes": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "400e55b56ee6fbd7", + "hash_cont_tokens": "6830ef7d0325d7ef" + }, + "truncated": 0, + "non-truncated": 864, + "padded": 864, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hashes": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "c639cce12a46ebad", + "hash_cont_tokens": "cdd0b3dc06d933e5" + }, + "truncated": 0, + "non-truncated": 816, + "padded": 816, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hashes": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "b9762065cce6f3a6", + "hash_cont_tokens": "e0203e3fc1bb0500" + }, + "truncated": 0, + "non-truncated": 948, + "padded": 948, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_aging|5": { + "hashes": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "541a75f071dcf579", + "hash_cont_tokens": "142a4a8a1138a214" + }, + "truncated": 0, + "non-truncated": 892, + "padded": 892, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_sexuality|5": { + "hashes": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "04269e5c5a257dd9", + "hash_cont_tokens": "bc54813e809b796d" + }, + "truncated": 0, + "non-truncated": 524, + "padded": 524, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-international_law|5": { + "hashes": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "d93ba9d9d38e4397", + "hash_cont_tokens": "63435df622d5437b" + }, + "truncated": 0, + "non-truncated": 484, + "padded": 484, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-jurisprudence|5": { + "hashes": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "9eeaccd2698b4f5a", + 
"hash_cont_tokens": "e3a8cd951b6e3469" + }, + "truncated": 0, + "non-truncated": 432, + "padded": 432, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hashes": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "b4f08f544f2b7576", + "hash_cont_tokens": "5e6ee2ff0404f23c" + }, + "truncated": 0, + "non-truncated": 652, + "padded": 648, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-machine_learning|5": { + "hashes": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "900c2a51f1174b9f", + "hash_cont_tokens": "c81919424db3b267" + }, + "truncated": 0, + "non-truncated": 448, + "padded": 448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-management|5": { + "hashes": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "6b36efb4689c6eca", + "hash_cont_tokens": "a01d6d39a83c4597" + }, + "truncated": 0, + "non-truncated": 412, + "padded": 412, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-marketing|5": { + "hashes": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "2aaac78a0cfed47a", + "hash_cont_tokens": "6aeaed4d823c98aa" + }, + "truncated": 0, + "non-truncated": 936, + "padded": 936, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-medical_genetics|5": { + "hashes": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "886ca823b41c094a", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-miscellaneous|5": { + "hashes": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "72fd71de7675e7d0", + "hash_cont_tokens": "9b0ab02a64603081" + }, + "truncated": 0, + "non-truncated": 3132, + "padded": 3132, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_disputes|5": { + "hashes": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "f3ca0dd8e7a1eb09", + "hash_cont_tokens": "3b8bbe9108e55ce9" + }, + "truncated": 0, + "non-truncated": 1384, + "padded": 1354, + "non-padded": 30, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hashes": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "3e793631e951f23c", + "hash_cont_tokens": "2eae753a177d5460" + }, + "truncated": 0, + "non-truncated": 3580, + "padded": 3580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-nutrition|5": { + "hashes": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "59753c2144ea93af", + "hash_cont_tokens": "29771089bd3c65c6" + }, + "truncated": 0, + "non-truncated": 1224, + "padded": 1224, + "non-padded": 0, + "effective_few_shots": 5.0, + 
"num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-philosophy|5": { + "hashes": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "bd8d3dbed15a8c34", + "hash_cont_tokens": "9f6ff69d23a48783" + }, + "truncated": 0, + "non-truncated": 1244, + "padded": 1244, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-prehistory|5": { + "hashes": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "3573cd87facbb7c5", + "hash_cont_tokens": "a789a13af22308bf" + }, + "truncated": 0, + "non-truncated": 1296, + "padded": 1296, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_accounting|5": { + "hashes": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "17e721bc1a7cbb47", + "hash_cont_tokens": "5129a9cfb30c5239" + }, + "truncated": 0, + "non-truncated": 1128, + "padded": 1128, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_law|5": { + "hashes": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "c9f7583fff66d361", + "hash_cont_tokens": "2e590029ef41fbcd" + }, + "truncated": 0, + "non-truncated": 6136, + "padded": 6136, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_medicine|5": { + "hashes": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "40a933f829116f8d", + "hash_cont_tokens": "cd82e108370cece8" + }, + "truncated": 0, + "non-truncated": 1088, + "padded": 1088, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_psychology|5": { + "hashes": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "0dfb73a8eb3f692c", + "hash_cont_tokens": "61ef0c8a87f9c92d" + }, + "truncated": 0, + "non-truncated": 2448, + "padded": 2448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-public_relations|5": { + "hashes": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "1710c6ba4c9f3cbd", + "hash_cont_tokens": "568f585a259965c1" + }, + "truncated": 0, + "non-truncated": 440, + "padded": 440, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-security_studies|5": { + "hashes": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "32a03f1f22a6e103", + "hash_cont_tokens": "d70cfe096d4fb7bd" + }, + "truncated": 0, + "non-truncated": 980, + "padded": 980, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-sociology|5": { + "hashes": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "828999f7624cbe7e", + "hash_cont_tokens": "c3a3bdfd177eed5b" + }, + "truncated": 0, + "non-truncated": 804, + "padded": 804, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hashes": { + "hash_examples": "4a56a01ddca44dca", + 
"hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "42054621e718dbee", + "hash_cont_tokens": "2568d0e8e36fa959" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-virology|5": { + "hashes": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "6c4f0aa4dc859c04", + "hash_cont_tokens": "c178cccd753d9bc5" + }, + "truncated": 0, + "non-truncated": 664, + "padded": 664, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-world_religions|5": { + "hashes": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "6c75d44e092ff24f", + "hash_cont_tokens": "0a3a3ea5ef49d19c" + }, + "truncated": 0, + "non-truncated": 684, + "padded": 684, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|truthfulqa:mc|0": { + "hashes": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "2738d7ed7075faa7", + "hash_cont_tokens": "6d1691881e252df0" + }, + "truncated": 0, + "non-truncated": 9996, + "padded": 9996, + "non-padded": 0, + "effective_few_shots": 0.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "d84d18e9a963753d", + "hash_full_prompts": "12b540783521a8e6", + "hash_input_tokens": "5c73a7dce6ccf737", + "hash_cont_tokens": "f4b7b7f3a2788768" + }, + "total_evaluation_time_secondes": "11770.016389131546", + "truncated": 0, + "non-truncated": 111019, + "padded": 110926, + "non-padded": 93, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/Undi95/LewdEngine/results_2023-10-18T07-14-30.015522.json b/eval-results/Undi95/LewdEngine/results_2023-10-18T07-14-30.015522.json new file mode 100644 index 0000000000000000000000000000000000000000..cc16c0004831a9062ab6f423c0ba901d32bc36a7 --- /dev/null +++ b/eval-results/Undi95/LewdEngine/results_2023-10-18T07-14-30.015522.json @@ -0,0 +1,107 @@ +{ + "config_general": { + "model_name": "Undi95/LewdEngine", + "model_sha": "ae8fb45025806c9475cb67eca08b8bf0dd65a0a8", + "model_size": "24.32 GB", + "model_dtype": "torch.float16", + "lighteval_sha": "0f318ecf002208468154899217b3ba7c6ae09374", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|drop|3": { + "em": 0.0019924496644295304, + "em_stderr": 0.0004566676462666989, + "f1": 0.06167575503355703, + "f1_stderr": 0.0013753579135200263 + }, + "harness|gsm8k|5": { + "acc": 0.12357846853677028, + "acc_stderr": 0.00906505030677692 + }, + "harness|winogrande|5": { + "acc": 0.7490134175217048, + "acc_stderr": 0.012185776220516151 + }, + "all": { + "em": 0.0019924496644295304, + "em_stderr": 0.0004566676462666989, + "f1": 0.06167575503355703, + "f1_stderr": 0.0013753579135200263, + "acc": 0.4362959430292375, + "acc_stderr": 0.010625413263646535 + } + }, + "versions": { + "harness|drop|3": 1, + "harness|gsm8k|5": 0, + "harness|winogrande|5": 0, + "all": 0 + }, + "config_tasks": { + "harness|drop": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|drop|3": { + "hashes": { + "hash_examples": "1d27416e8324e9a3", + "hash_full_prompts": "a5513ff9a741b385", + "hash_input_tokens": 
"42076f0efbb50aa6", + "hash_cont_tokens": "90261326828841f5" + }, + "truncated": 3, + "non-truncated": 9533, + "padded": 0, + "non-padded": 9536, + "effective_few_shots": 3.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "bda342e47b5099b2", + "hash_cont_tokens": "638786ac13d970c8" + }, + "truncated": 0, + "non-truncated": 1319, + "padded": 0, + "non-padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "c0bedf98cb040854", + "hash_cont_tokens": "f08975ad6f2d5864" + }, + "truncated": 0, + "non-truncated": 2534, + "padded": 2432, + "non-padded": 102, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "9b4d8993161e637d", + "hash_full_prompts": "08215e527b7e60a5", + "hash_input_tokens": "a12f3e3c934bd78b", + "hash_cont_tokens": "d4c0787a69cbcf21" + }, + "total_evaluation_time_secondes": "40229.785307884216", + "truncated": 3, + "non-truncated": 13386, + "padded": 2432, + "non-padded": 10957, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/Undi95/Llama2-13B-no_robots-alpaca-lora/results_2023-11-15T08-15-04.836039.json b/eval-results/Undi95/Llama2-13B-no_robots-alpaca-lora/results_2023-11-15T08-15-04.836039.json new file mode 100644 index 0000000000000000000000000000000000000000..743a416dc2f9539d43cde17c8abd0122a4476a50 --- /dev/null +++ b/eval-results/Undi95/Llama2-13B-no_robots-alpaca-lora/results_2023-11-15T08-15-04.836039.json @@ -0,0 +1,1435 @@ +{ + "config_general": { + "lighteval_sha": "9ffc410f6c40b8cfefe7167cb47aefe69ced61e1", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "", + "start_time": 2631547.52411086, + "end_time": 2652857.776027544, + "total_evaluation_time_secondes": "21310.251916683745", + "model_name": "Undi95/Llama2-13B-no_robots-alpaca-lora", + "model_sha": "581aba329e607533c299746bb9eb4154a7aab139", + "model_dtype": "torch.float16", + "model_size": "24.32 GB" + }, + "results": { + "harness|arc:challenge|25": { + "acc": 0.5418088737201365, + "acc_stderr": 0.014560220308714695, + "acc_norm": 0.5887372013651877, + "acc_norm_stderr": 0.014379441068522082 + }, + "harness|hellaswag|10": { + "acc": 0.6309500099581756, + "acc_stderr": 0.004815613144385403, + "acc_norm": 0.8243377813184625, + "acc_norm_stderr": 0.003797548252851636 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.34, + "acc_stderr": 0.047609522856952365, + "acc_norm": 0.34, + "acc_norm_stderr": 0.047609522856952365 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.4888888888888889, + "acc_stderr": 0.04318275491977976, + "acc_norm": 0.4888888888888889, + "acc_norm_stderr": 0.04318275491977976 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.5328947368421053, + "acc_stderr": 0.040601270352363966, + "acc_norm": 0.5328947368421053, + "acc_norm_stderr": 0.040601270352363966 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.53, + "acc_stderr": 0.05016135580465919, + "acc_norm": 0.53, + "acc_norm_stderr": 0.05016135580465919 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.5660377358490566, + "acc_stderr": 0.030503292013342592, + "acc_norm": 0.5660377358490566, + 
"acc_norm_stderr": 0.030503292013342592 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.5625, + "acc_stderr": 0.04148415739394154, + "acc_norm": 0.5625, + "acc_norm_stderr": 0.04148415739394154 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.39, + "acc_stderr": 0.04902071300001975, + "acc_norm": 0.39, + "acc_norm_stderr": 0.04902071300001975 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.44, + "acc_stderr": 0.04988876515698589, + "acc_norm": 0.44, + "acc_norm_stderr": 0.04988876515698589 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.28, + "acc_stderr": 0.04512608598542129, + "acc_norm": 0.28, + "acc_norm_stderr": 0.04512608598542129 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.49710982658959535, + "acc_stderr": 0.038124005659748335, + "acc_norm": 0.49710982658959535, + "acc_norm_stderr": 0.038124005659748335 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.23529411764705882, + "acc_stderr": 0.04220773659171452, + "acc_norm": 0.23529411764705882, + "acc_norm_stderr": 0.04220773659171452 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.66, + "acc_stderr": 0.04760952285695237, + "acc_norm": 0.66, + "acc_norm_stderr": 0.04760952285695237 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.4127659574468085, + "acc_stderr": 0.03218471141400351, + "acc_norm": 0.4127659574468085, + "acc_norm_stderr": 0.03218471141400351 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.32456140350877194, + "acc_stderr": 0.04404556157374768, + "acc_norm": 0.32456140350877194, + "acc_norm_stderr": 0.04404556157374768 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.5379310344827586, + "acc_stderr": 0.04154659671707548, + "acc_norm": 0.5379310344827586, + "acc_norm_stderr": 0.04154659671707548 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.3492063492063492, + "acc_stderr": 0.024552292209342654, + "acc_norm": 0.3492063492063492, + "acc_norm_stderr": 0.024552292209342654 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.2619047619047619, + "acc_stderr": 0.039325376803928704, + "acc_norm": 0.2619047619047619, + "acc_norm_stderr": 0.039325376803928704 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.36, + "acc_stderr": 0.04824181513244218, + "acc_norm": 0.36, + "acc_norm_stderr": 0.04824181513244218 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.6258064516129033, + "acc_stderr": 0.027528904299845704, + "acc_norm": 0.6258064516129033, + "acc_norm_stderr": 0.027528904299845704 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.4630541871921182, + "acc_stderr": 0.035083705204426656, + "acc_norm": 0.4630541871921182, + "acc_norm_stderr": 0.035083705204426656 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.5, + "acc_stderr": 0.050251890762960605, + "acc_norm": 0.5, + "acc_norm_stderr": 0.050251890762960605 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.6242424242424243, + "acc_stderr": 0.037818873532059816, + "acc_norm": 0.6242424242424243, + "acc_norm_stderr": 0.037818873532059816 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.6464646464646465, + "acc_stderr": 0.03406086723547155, + "acc_norm": 0.6464646464646465, + "acc_norm_stderr": 0.03406086723547155 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.7564766839378239, + "acc_stderr": 0.030975436386845454, + 
"acc_norm": 0.7564766839378239, + "acc_norm_stderr": 0.030975436386845454 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.5205128205128206, + "acc_stderr": 0.02532966316348994, + "acc_norm": 0.5205128205128206, + "acc_norm_stderr": 0.02532966316348994 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.24444444444444444, + "acc_stderr": 0.02620276653465215, + "acc_norm": 0.24444444444444444, + "acc_norm_stderr": 0.02620276653465215 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.5588235294117647, + "acc_stderr": 0.032252942323996406, + "acc_norm": 0.5588235294117647, + "acc_norm_stderr": 0.032252942323996406 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.31788079470198677, + "acc_stderr": 0.03802039760107903, + "acc_norm": 0.31788079470198677, + "acc_norm_stderr": 0.03802039760107903 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.6844036697247706, + "acc_stderr": 0.019926117513869666, + "acc_norm": 0.6844036697247706, + "acc_norm_stderr": 0.019926117513869666 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.4212962962962963, + "acc_stderr": 0.03367462138896079, + "acc_norm": 0.4212962962962963, + "acc_norm_stderr": 0.03367462138896079 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.7205882352941176, + "acc_stderr": 0.031493281045079556, + "acc_norm": 0.7205882352941176, + "acc_norm_stderr": 0.031493281045079556 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.7383966244725738, + "acc_stderr": 0.028609516716994934, + "acc_norm": 0.7383966244725738, + "acc_norm_stderr": 0.028609516716994934 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.6502242152466368, + "acc_stderr": 0.03200736719484503, + "acc_norm": 0.6502242152466368, + "acc_norm_stderr": 0.03200736719484503 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.6106870229007634, + "acc_stderr": 0.04276486542814591, + "acc_norm": 0.6106870229007634, + "acc_norm_stderr": 0.04276486542814591 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.71900826446281, + "acc_stderr": 0.041032038305145124, + "acc_norm": 0.71900826446281, + "acc_norm_stderr": 0.041032038305145124 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.6111111111111112, + "acc_stderr": 0.0471282125742677, + "acc_norm": 0.6111111111111112, + "acc_norm_stderr": 0.0471282125742677 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.6625766871165644, + "acc_stderr": 0.03714908409935574, + "acc_norm": 0.6625766871165644, + "acc_norm_stderr": 0.03714908409935574 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.2857142857142857, + "acc_stderr": 0.042878587513404544, + "acc_norm": 0.2857142857142857, + "acc_norm_stderr": 0.042878587513404544 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.7281553398058253, + "acc_stderr": 0.044052680241409216, + "acc_norm": 0.7281553398058253, + "acc_norm_stderr": 0.044052680241409216 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.7350427350427351, + "acc_stderr": 0.028911208802749486, + "acc_norm": 0.7350427350427351, + "acc_norm_stderr": 0.028911208802749486 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.62, + "acc_stderr": 0.048783173121456316, + "acc_norm": 0.62, + "acc_norm_stderr": 0.048783173121456316 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.7266922094508301, + "acc_stderr": 0.015936681062628556, + "acc_norm": 0.7266922094508301, + 
"acc_norm_stderr": 0.015936681062628556 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.615606936416185, + "acc_stderr": 0.026189666966272035, + "acc_norm": 0.615606936416185, + "acc_norm_stderr": 0.026189666966272035 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.26927374301675977, + "acc_stderr": 0.014835616582882611, + "acc_norm": 0.26927374301675977, + "acc_norm_stderr": 0.014835616582882611 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.5555555555555556, + "acc_stderr": 0.02845263998508801, + "acc_norm": 0.5555555555555556, + "acc_norm_stderr": 0.02845263998508801 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.6141479099678456, + "acc_stderr": 0.027648149599751468, + "acc_norm": 0.6141479099678456, + "acc_norm_stderr": 0.027648149599751468 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.6172839506172839, + "acc_stderr": 0.02704453813840261, + "acc_norm": 0.6172839506172839, + "acc_norm_stderr": 0.02704453813840261 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.41843971631205673, + "acc_stderr": 0.02942799403941999, + "acc_norm": 0.41843971631205673, + "acc_norm_stderr": 0.02942799403941999 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.41134289439374183, + "acc_stderr": 0.012567882673803685, + "acc_norm": 0.41134289439374183, + "acc_norm_stderr": 0.012567882673803685 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.4338235294117647, + "acc_stderr": 0.030105636570016633, + "acc_norm": 0.4338235294117647, + "acc_norm_stderr": 0.030105636570016633 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.5294117647058824, + "acc_stderr": 0.02019280827143379, + "acc_norm": 0.5294117647058824, + "acc_norm_stderr": 0.02019280827143379 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.5363636363636364, + "acc_stderr": 0.04776449162396197, + "acc_norm": 0.5363636363636364, + "acc_norm_stderr": 0.04776449162396197 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.6163265306122448, + "acc_stderr": 0.031130880396235933, + "acc_norm": 0.6163265306122448, + "acc_norm_stderr": 0.031130880396235933 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.7412935323383084, + "acc_stderr": 0.030965903123573033, + "acc_norm": 0.7412935323383084, + "acc_norm_stderr": 0.030965903123573033 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.74, + "acc_stderr": 0.0440844002276808, + "acc_norm": 0.74, + "acc_norm_stderr": 0.0440844002276808 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.4819277108433735, + "acc_stderr": 0.038899512528272166, + "acc_norm": 0.4819277108433735, + "acc_norm_stderr": 0.038899512528272166 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.7485380116959064, + "acc_stderr": 0.033275044238468436, + "acc_norm": 0.7485380116959064, + "acc_norm_stderr": 0.033275044238468436 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.28151774785801714, + "mc1_stderr": 0.01574402724825605, + "mc2": 0.4045559753787184, + "mc2_stderr": 0.01423646056016957 + }, + "harness|winogrande|5": { + "acc": 0.7529597474348856, + "acc_stderr": 0.012121402942855575 + }, + "harness|drop|3": { + "em": 0.031774328859060404, + "em_stderr": 0.0017962473521312278, + "f1": 0.09261220637583845, + "f1_stderr": 0.0021550523797604715 + }, + "harness|gsm8k|5": { + "acc": 0.06444275966641395, + "acc_stderr": 0.006763391728488274 + }, + "all": { + "acc": 0.5288556443369928, + "acc_stderr": 0.03390383953418472, + "acc_norm": 
0.5370018287535696, + "acc_norm_stderr": 0.034712721572579625, + "mc1": 0.28151774785801714, + "mc1_stderr": 0.01574402724825605, + "mc2": 0.4045559753787184, + "mc2_stderr": 0.01423646056016957, + "em": 0.031774328859060404, + "em_stderr": 0.0017962473521312278, + "f1": 0.09261220637583845, + "f1_stderr": 0.0021550523797604715 + } + }, + "versions": { + "all": 0, + "harness|arc:challenge|25": 0, + "harness|drop|3": 1, + "harness|gsm8k|5": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "harness|winogrande|5": 0 + }, + "config_tasks": { + "harness|arc:challenge": "LM Harness task", + "harness|drop": "LM Harness task", + "harness|gsm8k": "LM Harness task", + 
"harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + 
"harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|arc:challenge|25": { + "hashes": { + "hash_examples": "17b0cae357c0259e", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "ca48d52265c0051f", + "hash_cont_tokens": "e8abf848493b50f7" + }, + "truncated": 0, + "non_truncated": 1172, + "padded": 4687, + "non_padded": 0, + "effective_few_shots": 25.0, + "num_truncated_few_shots": 0 + }, + "harness|hellaswag|10": { + "hashes": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "4975ded0ed31f702", + "hash_cont_tokens": "9fe0a5c42e1532db" + }, + "truncated": 0, + "non_truncated": 10042, + "padded": 40019, + "non_padded": 149, + "effective_few_shots": 10.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hashes": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "8ff523ec326d5d55", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-anatomy|5": { + "hashes": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "742bd6a389a8ef40", + "hash_cont_tokens": "f11971a765cb609f" + }, + "truncated": 0, + "non_truncated": 135, + "padded": 540, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-astronomy|5": { + "hashes": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "aa9743839c83bd9f", + "hash_cont_tokens": "440a970fadecdc7b" + }, + "truncated": 0, + "non_truncated": 152, + "padded": 608, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-business_ethics|5": { + "hashes": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "60f6ed52e2a2987a", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hashes": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "6080d9f3c5930be0", + "hash_cont_tokens": "7ecd60c25b9bfe5b" + }, + "truncated": 0, + "non_truncated": 265, + "padded": 1060, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_biology|5": { + "hashes": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "873319724ad65589", + "hash_cont_tokens": "875cde3af7a0ee14" + }, + "truncated": 0, + "non_truncated": 144, + "padded": 564, + "non_padded": 12, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_chemistry|5": { + "hashes": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "8366d04d12b154a7", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non_truncated": 100, + 
"padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_computer_science|5": { + "hashes": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "1724a282fb269fd7", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_mathematics|5": { + "hashes": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "b7aa815781eae172", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_medicine|5": { + "hashes": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "0003d13e86bc8c1a", + "hash_cont_tokens": "702fb6d82ff0d6ac" + }, + "truncated": 0, + "non_truncated": 173, + "padded": 692, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_physics|5": { + "hashes": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "32b28762dd077c78", + "hash_cont_tokens": "f7b8097afc16a47c" + }, + "truncated": 0, + "non_truncated": 102, + "padded": 404, + "non_padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-computer_security|5": { + "hashes": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "19dd0e1895125d49", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hashes": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "761c7ce187b3338a", + "hash_cont_tokens": "aa0e8bc655f2f641" + }, + "truncated": 0, + "non_truncated": 235, + "padded": 940, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-econometrics|5": { + "hashes": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "dae74024ebc12b2b", + "hash_cont_tokens": "b1cc6e7e9fcd3827" + }, + "truncated": 0, + "non_truncated": 114, + "padded": 456, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hashes": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "5fa8050688a246ed", + "hash_cont_tokens": "2425a3f084a591ef" + }, + "truncated": 0, + "non_truncated": 145, + "padded": 580, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hashes": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "2da3f8d7d1515cc6", + "hash_cont_tokens": "bd87bf0c060fd925" + }, + "truncated": 0, + "non_truncated": 378, + "padded": 1512, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + 
"harness|hendrycksTest-formal_logic|5": { + "hashes": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "907de61bbe46dada", + "hash_cont_tokens": "eb8932890e0605db" + }, + "truncated": 0, + "non_truncated": 126, + "padded": 504, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-global_facts|5": { + "hashes": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "d7549fe9ac133643", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_biology|5": { + "hashes": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "b449ae8cd622fb96", + "hash_cont_tokens": "1ddcb86d28cde266" + }, + "truncated": 0, + "non_truncated": 310, + "padded": 1240, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hashes": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "a447bd1574b5e26c", + "hash_cont_tokens": "176c8dcff38c5f8f" + }, + "truncated": 0, + "non_truncated": 203, + "padded": 812, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hashes": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "56312a0c3d85ae90", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hashes": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "5002f4ac8b1562ca", + "hash_cont_tokens": "674fc454bdc5ac93" + }, + "truncated": 0, + "non_truncated": 165, + "padded": 656, + "non_padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_geography|5": { + "hashes": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "b4f9efd054b0149d", + "hash_cont_tokens": "03a5012b916274ea" + }, + "truncated": 0, + "non_truncated": 198, + "padded": 792, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hashes": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "6e010d01707b5a01", + "hash_cont_tokens": "873d2aab226ba1d8" + }, + "truncated": 0, + "non_truncated": 193, + "padded": 772, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hashes": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "fc1f6e824ba386d7", + "hash_cont_tokens": "c583432ad27fcfe0" + }, + "truncated": 0, + "non_truncated": 390, + "padded": 1560, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hashes": { + "hash_examples": 
"1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "3a485a40c8432ece", + "hash_cont_tokens": "d7907b61bcb8c123" + }, + "truncated": 0, + "non_truncated": 270, + "padded": 1080, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hashes": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "a7dd9ca4bbda3752", + "hash_cont_tokens": "f47f041de50333b9" + }, + "truncated": 0, + "non_truncated": 238, + "padded": 952, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_physics|5": { + "hashes": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "d7ea631399a73865", + "hash_cont_tokens": "0d56317b3e5eedb5" + }, + "truncated": 0, + "non_truncated": 151, + "padded": 604, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hashes": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "d12816cf88146011", + "hash_cont_tokens": "09ba1243e7390c0f" + }, + "truncated": 0, + "non_truncated": 545, + "padded": 2180, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hashes": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "9763ecaef4814c21", + "hash_cont_tokens": "9cc29889c3d3f77d" + }, + "truncated": 0, + "non_truncated": 216, + "padded": 864, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hashes": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "c639cce12a46ebad", + "hash_cont_tokens": "cdd0b3dc06d933e5" + }, + "truncated": 0, + "non_truncated": 204, + "padded": 816, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hashes": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "b9762065cce6f3a6", + "hash_cont_tokens": "e02816433ff28daf" + }, + "truncated": 0, + "non_truncated": 237, + "padded": 948, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_aging|5": { + "hashes": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "84157fee0b6d0f3c", + "hash_cont_tokens": "142a4a8a1138a214" + }, + "truncated": 0, + "non_truncated": 223, + "padded": 892, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_sexuality|5": { + "hashes": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "ade303e1ae3c016f", + "hash_cont_tokens": "bc54813e809b796d" + }, + "truncated": 0, + "non_truncated": 131, + "padded": 524, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-international_law|5": { + "hashes": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "e5482e1c23c23d35", + 
"hash_cont_tokens": "8ea8c5ff76a15bca" + }, + "truncated": 0, + "non_truncated": 121, + "padded": 484, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-jurisprudence|5": { + "hashes": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "4415eeb9bad0507b", + "hash_cont_tokens": "e3a8cd951b6e3469" + }, + "truncated": 0, + "non_truncated": 108, + "padded": 432, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hashes": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "e6b5271422ecbaa8", + "hash_cont_tokens": "3e9e0bdc248fd88a" + }, + "truncated": 0, + "non_truncated": 163, + "padded": 644, + "non_padded": 8, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-machine_learning|5": { + "hashes": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "e719cb83196977d8", + "hash_cont_tokens": "55b12fb138c6a064" + }, + "truncated": 0, + "non_truncated": 112, + "padded": 448, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-management|5": { + "hashes": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "155da0e62b39e804", + "hash_cont_tokens": "a01d6d39a83c4597" + }, + "truncated": 0, + "non_truncated": 103, + "padded": 412, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-marketing|5": { + "hashes": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "38466c242259e6d3", + "hash_cont_tokens": "6aeaed4d823c98aa" + }, + "truncated": 0, + "non_truncated": 234, + "padded": 932, + "non_padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-medical_genetics|5": { + "hashes": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "0dd129e92538a7f6", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-miscellaneous|5": { + "hashes": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "d108a883fc3e022f", + "hash_cont_tokens": "9b0ab02a64603081" + }, + "truncated": 0, + "non_truncated": 783, + "padded": 3132, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_disputes|5": { + "hashes": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "0e7b7df82884a2d5", + "hash_cont_tokens": "3b8bbe9108e55ce9" + }, + "truncated": 0, + "non_truncated": 346, + "padded": 1364, + "non_padded": 20, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hashes": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "7c220f5613cd8426", + "hash_cont_tokens": "3e9bfc0362e97330" + }, + "truncated": 0, + "non_truncated": 895, + "padded": 3580, + "non_padded": 0, + "effective_few_shots": 5.0, + 
"num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-nutrition|5": { + "hashes": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "35de1609a9a763a9", + "hash_cont_tokens": "23b2dc6ee2da4cfc" + }, + "truncated": 0, + "non_truncated": 306, + "padded": 1224, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-philosophy|5": { + "hashes": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "a1dcfa9c80490d06", + "hash_cont_tokens": "9f6ff69d23a48783" + }, + "truncated": 0, + "non_truncated": 311, + "padded": 1244, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-prehistory|5": { + "hashes": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "a091cf645d2415e0", + "hash_cont_tokens": "d6458d743d875837" + }, + "truncated": 0, + "non_truncated": 324, + "padded": 1296, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_accounting|5": { + "hashes": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "e9df32a33f85290c", + "hash_cont_tokens": "922a195f53a35662" + }, + "truncated": 0, + "non_truncated": 282, + "padded": 1128, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_law|5": { + "hashes": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "c9f7583fff66d361", + "hash_cont_tokens": "2e590029ef41fbcd" + }, + "truncated": 0, + "non_truncated": 1534, + "padded": 6136, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_medicine|5": { + "hashes": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "40a933f829116f8d", + "hash_cont_tokens": "7cfee54dbddd5a98" + }, + "truncated": 0, + "non_truncated": 272, + "padded": 1088, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_psychology|5": { + "hashes": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "0f6a92c3a2062b48", + "hash_cont_tokens": "a86677b2a45c20e1" + }, + "truncated": 0, + "non_truncated": 612, + "padded": 2448, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-public_relations|5": { + "hashes": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "29a08e9bfbe9b2f0", + "hash_cont_tokens": "0d756ccaae031757" + }, + "truncated": 0, + "non_truncated": 110, + "padded": 440, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-security_studies|5": { + "hashes": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "32a03f1f22a6e103", + "hash_cont_tokens": "b2229bc2cfbf594b" + }, + "truncated": 0, + "non_truncated": 245, + "padded": 980, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-sociology|5": { + "hashes": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": 
"1a1fc00e17b3a52a", + "hash_input_tokens": "1de5c52d2b2831d7", + "hash_cont_tokens": "c3a3bdfd177eed5b" + }, + "truncated": 0, + "non_truncated": 201, + "padded": 800, + "non_padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hashes": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "add924961f7f4146", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-virology|5": { + "hashes": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "e0653601c466b1bc", + "hash_cont_tokens": "af8b3658088cb37f" + }, + "truncated": 0, + "non_truncated": 166, + "padded": 664, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-world_religions|5": { + "hashes": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "ac600d612445156d", + "hash_cont_tokens": "060118bef6de4e0a" + }, + "truncated": 0, + "non_truncated": 171, + "padded": 684, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|truthfulqa:mc|0": { + "hashes": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "a03ce28b7fd06aa7", + "hash_cont_tokens": "f5da56a132aab151" + }, + "truncated": 0, + "non_truncated": 817, + "padded": 9996, + "non_padded": 0, + "effective_few_shots": 0.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "72067255e368e24e", + "hash_cont_tokens": "f08975ad6f2d5864" + }, + "truncated": 0, + "non_truncated": 1267, + "padded": 2534, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|drop|3": { + "hashes": { + "hash_examples": "1d27416e8324e9a3", + "hash_full_prompts": "a5513ff9a741b385", + "hash_input_tokens": "42076f0efbb50aa6", + "hash_cont_tokens": "203a641603acd51c" + }, + "truncated": 3, + "non_truncated": 9533, + "padded": 0, + "non_padded": 9536, + "effective_few_shots": 3.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "bda342e47b5099b2", + "hash_cont_tokens": "f098bb38e63fd1ab" + }, + "truncated": 0, + "non_truncated": 1319, + "padded": 0, + "non_padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "4eb459f19fc0f29d", + "hash_full_prompts": "21653ed56f202b4e", + "hash_input_tokens": "379266f3a5365f9d", + "hash_cont_tokens": "44d870a88c77b5c0" + }, + "truncated": 3, + "non_truncated": 38192, + "padded": 113348, + "non_padded": 11060, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/Undi95/Llamix2-Xwin-MoE-4x13B/results_2023-12-16T23-47-16.165655.json b/eval-results/Undi95/Llamix2-Xwin-MoE-4x13B/results_2023-12-16T23-47-16.165655.json new file mode 100644 index 0000000000000000000000000000000000000000..d75d2c52904cdb365aab71ad190b81926718a565 --- /dev/null +++ 
b/eval-results/Undi95/Llamix2-Xwin-MoE-4x13B/results_2023-12-16T23-47-16.165655.json @@ -0,0 +1,1409 @@ +{ + "config_general": { + "lighteval_sha": "0e4607eff593f6f842aeaa0e5fa6760f58b9d1e9", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "", + "start_time": 369528.412107451, + "end_time": 401399.232514856, + "total_evaluation_time_secondes": "31870.82040740497", + "model_name": "Undi95/Llamix2-Xwin-MoE-4x13B", + "model_sha": "220833f87c233684e8a4b0e03126ffcdffce5229", + "model_dtype": "torch.float16", + "model_size": "72.33 GB" + }, + "results": { + "harness|arc:challenge|25": { + "acc": 0.5699658703071673, + "acc_stderr": 0.014467631559137993, + "acc_norm": 0.6040955631399317, + "acc_norm_stderr": 0.01429122839353659 + }, + "harness|hellaswag|10": { + "acc": 0.6299541923919538, + "acc_stderr": 0.004818298991012551, + "acc_norm": 0.8296156144194383, + "acc_norm_stderr": 0.0037520176390837532 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.28, + "acc_stderr": 0.04512608598542129, + "acc_norm": 0.28, + "acc_norm_stderr": 0.04512608598542129 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.5259259259259259, + "acc_stderr": 0.04313531696750575, + "acc_norm": 0.5259259259259259, + "acc_norm_stderr": 0.04313531696750575 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.5526315789473685, + "acc_stderr": 0.0404633688397825, + "acc_norm": 0.5526315789473685, + "acc_norm_stderr": 0.0404633688397825 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.55, + "acc_stderr": 0.049999999999999996, + "acc_norm": 0.55, + "acc_norm_stderr": 0.049999999999999996 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.6150943396226415, + "acc_stderr": 0.02994649856769995, + "acc_norm": 0.6150943396226415, + "acc_norm_stderr": 0.02994649856769995 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.6388888888888888, + "acc_stderr": 0.040166600304512336, + "acc_norm": 0.6388888888888888, + "acc_norm_stderr": 0.040166600304512336 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.38, + "acc_stderr": 0.04878317312145633, + "acc_norm": 0.38, + "acc_norm_stderr": 0.04878317312145633 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.48, + "acc_stderr": 0.050211673156867795, + "acc_norm": 0.48, + "acc_norm_stderr": 0.050211673156867795 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.29, + "acc_stderr": 0.045604802157206845, + "acc_norm": 0.29, + "acc_norm_stderr": 0.045604802157206845 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.5375722543352601, + "acc_stderr": 0.0380168510452446, + "acc_norm": 0.5375722543352601, + "acc_norm_stderr": 0.0380168510452446 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.22549019607843138, + "acc_stderr": 0.041583075330832865, + "acc_norm": 0.22549019607843138, + "acc_norm_stderr": 0.041583075330832865 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.74, + "acc_stderr": 0.04408440022768079, + "acc_norm": 0.74, + "acc_norm_stderr": 0.04408440022768079 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.44680851063829785, + "acc_stderr": 0.0325005368436584, + "acc_norm": 0.44680851063829785, + "acc_norm_stderr": 0.0325005368436584 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.3157894736842105, + "acc_stderr": 0.04372748290278007, + "acc_norm": 0.3157894736842105, + "acc_norm_stderr": 0.04372748290278007 + }, + 
"harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.496551724137931, + "acc_stderr": 0.041665675771015785, + "acc_norm": 0.496551724137931, + "acc_norm_stderr": 0.041665675771015785 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.31216931216931215, + "acc_stderr": 0.0238652068369726, + "acc_norm": 0.31216931216931215, + "acc_norm_stderr": 0.0238652068369726 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.3412698412698413, + "acc_stderr": 0.04240799327574925, + "acc_norm": 0.3412698412698413, + "acc_norm_stderr": 0.04240799327574925 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.35, + "acc_stderr": 0.047937248544110196, + "acc_norm": 0.35, + "acc_norm_stderr": 0.047937248544110196 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.6741935483870968, + "acc_stderr": 0.026662010578567107, + "acc_norm": 0.6741935483870968, + "acc_norm_stderr": 0.026662010578567107 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.458128078817734, + "acc_stderr": 0.03505630140785741, + "acc_norm": 0.458128078817734, + "acc_norm_stderr": 0.03505630140785741 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.58, + "acc_stderr": 0.049604496374885836, + "acc_norm": 0.58, + "acc_norm_stderr": 0.049604496374885836 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.6727272727272727, + "acc_stderr": 0.03663974994391245, + "acc_norm": 0.6727272727272727, + "acc_norm_stderr": 0.03663974994391245 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.696969696969697, + "acc_stderr": 0.03274287914026868, + "acc_norm": 0.696969696969697, + "acc_norm_stderr": 0.03274287914026868 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.8238341968911918, + "acc_stderr": 0.027493504244548057, + "acc_norm": 0.8238341968911918, + "acc_norm_stderr": 0.027493504244548057 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.49743589743589745, + "acc_stderr": 0.025350672979412195, + "acc_norm": 0.49743589743589745, + "acc_norm_stderr": 0.025350672979412195 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.28888888888888886, + "acc_stderr": 0.027634907264178544, + "acc_norm": 0.28888888888888886, + "acc_norm_stderr": 0.027634907264178544 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.5714285714285714, + "acc_stderr": 0.032145368597886394, + "acc_norm": 0.5714285714285714, + "acc_norm_stderr": 0.032145368597886394 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.31788079470198677, + "acc_stderr": 0.03802039760107903, + "acc_norm": 0.31788079470198677, + "acc_norm_stderr": 0.03802039760107903 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.7688073394495413, + "acc_stderr": 0.018075750241633146, + "acc_norm": 0.7688073394495413, + "acc_norm_stderr": 0.018075750241633146 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.42592592592592593, + "acc_stderr": 0.033723432716530645, + "acc_norm": 0.42592592592592593, + "acc_norm_stderr": 0.033723432716530645 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.7598039215686274, + "acc_stderr": 0.02998373305591361, + "acc_norm": 0.7598039215686274, + "acc_norm_stderr": 0.02998373305591361 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.7679324894514767, + "acc_stderr": 0.027479744550808503, + "acc_norm": 0.7679324894514767, + "acc_norm_stderr": 
0.027479744550808503 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.6547085201793722, + "acc_stderr": 0.03191100192835794, + "acc_norm": 0.6547085201793722, + "acc_norm_stderr": 0.03191100192835794 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.6183206106870229, + "acc_stderr": 0.042607351576445594, + "acc_norm": 0.6183206106870229, + "acc_norm_stderr": 0.042607351576445594 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.743801652892562, + "acc_stderr": 0.039849796533028725, + "acc_norm": 0.743801652892562, + "acc_norm_stderr": 0.039849796533028725 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.7222222222222222, + "acc_stderr": 0.04330043749650743, + "acc_norm": 0.7222222222222222, + "acc_norm_stderr": 0.04330043749650743 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.6809815950920245, + "acc_stderr": 0.03661997551073836, + "acc_norm": 0.6809815950920245, + "acc_norm_stderr": 0.03661997551073836 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.33035714285714285, + "acc_stderr": 0.04464285714285714, + "acc_norm": 0.33035714285714285, + "acc_norm_stderr": 0.04464285714285714 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.7378640776699029, + "acc_stderr": 0.04354631077260595, + "acc_norm": 0.7378640776699029, + "acc_norm_stderr": 0.04354631077260595 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.811965811965812, + "acc_stderr": 0.02559819368665225, + "acc_norm": 0.811965811965812, + "acc_norm_stderr": 0.02559819368665225 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.55, + "acc_stderr": 0.04999999999999999, + "acc_norm": 0.55, + "acc_norm_stderr": 0.04999999999999999 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.7624521072796935, + "acc_stderr": 0.015218733046150195, + "acc_norm": 0.7624521072796935, + "acc_norm_stderr": 0.015218733046150195 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.653179190751445, + "acc_stderr": 0.025624723994030454, + "acc_norm": 0.653179190751445, + "acc_norm_stderr": 0.025624723994030454 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.3888268156424581, + "acc_stderr": 0.01630389953079613, + "acc_norm": 0.3888268156424581, + "acc_norm_stderr": 0.01630389953079613 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.6274509803921569, + "acc_stderr": 0.027684181883302898, + "acc_norm": 0.6274509803921569, + "acc_norm_stderr": 0.027684181883302898 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.662379421221865, + "acc_stderr": 0.026858825879488544, + "acc_norm": 0.662379421221865, + "acc_norm_stderr": 0.026858825879488544 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.6419753086419753, + "acc_stderr": 0.026675611926037106, + "acc_norm": 0.6419753086419753, + "acc_norm_stderr": 0.026675611926037106 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.40070921985815605, + "acc_stderr": 0.029233465745573083, + "acc_norm": 0.40070921985815605, + "acc_norm_stderr": 0.029233465745573083 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.4211212516297262, + "acc_stderr": 0.012610325733489905, + "acc_norm": 0.4211212516297262, + "acc_norm_stderr": 0.012610325733489905 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.5294117647058824, + "acc_stderr": 0.03032024326500413, + "acc_norm": 0.5294117647058824, + "acc_norm_stderr": 0.03032024326500413 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.5637254901960784, 
+ "acc_stderr": 0.02006287424353913, + "acc_norm": 0.5637254901960784, + "acc_norm_stderr": 0.02006287424353913 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.6272727272727273, + "acc_stderr": 0.04631381319425465, + "acc_norm": 0.6272727272727273, + "acc_norm_stderr": 0.04631381319425465 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.6489795918367347, + "acc_stderr": 0.03055531675557364, + "acc_norm": 0.6489795918367347, + "acc_norm_stderr": 0.03055531675557364 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.7711442786069652, + "acc_stderr": 0.029705284056772432, + "acc_norm": 0.7711442786069652, + "acc_norm_stderr": 0.029705284056772432 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.86, + "acc_stderr": 0.03487350880197769, + "acc_norm": 0.86, + "acc_norm_stderr": 0.03487350880197769 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.4939759036144578, + "acc_stderr": 0.03892212195333047, + "acc_norm": 0.4939759036144578, + "acc_norm_stderr": 0.03892212195333047 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.7719298245614035, + "acc_stderr": 0.032180937956023566, + "acc_norm": 0.7719298245614035, + "acc_norm_stderr": 0.032180937956023566 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.2729498164014688, + "mc1_stderr": 0.015594753632006525, + "mc2": 0.3963209435327923, + "mc2_stderr": 0.014481742388552897 + }, + "harness|winogrande|5": { + "acc": 0.7513812154696132, + "acc_stderr": 0.012147314713403108 + }, + "harness|gsm8k|5": { + "acc": 0.33206974981046244, + "acc_stderr": 0.012972465034361873 + }, + "all": { + "acc": 0.5629553312475245, + "acc_stderr": 0.03336826953256445, + "acc_norm": 0.5676375484013418, + "acc_norm_stderr": 0.03405257588510401, + "mc1": 0.2729498164014688, + "mc1_stderr": 0.015594753632006525, + "mc2": 0.3963209435327923, + "mc2_stderr": 0.014481742388552897 + } + }, + "versions": { + "all": 0, + "harness|arc:challenge|25": 0, + "harness|gsm8k|5": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + 
"harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "harness|winogrande|5": 0 + }, + "config_tasks": { + "harness|arc:challenge": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + 
"harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|arc:challenge|25": { + "hashes": { + "hash_examples": "17b0cae357c0259e", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "ca48d52265c0051f", + "hash_cont_tokens": "e8abf848493b50f7" + }, + "truncated": 0, + "non_truncated": 1172, + "padded": 4687, + "non_padded": 0, + "effective_few_shots": 25.0, + "num_truncated_few_shots": 0 + }, + "harness|hellaswag|10": { + "hashes": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "4975ded0ed31f702", + "hash_cont_tokens": "9fe0a5c42e1532db" + }, + "truncated": 0, + "non_truncated": 10042, + "padded": 40019, + "non_padded": 149, + "effective_few_shots": 10.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hashes": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "8ff523ec326d5d55", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-anatomy|5": { + "hashes": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "742bd6a389a8ef40", + "hash_cont_tokens": "f11971a765cb609f" + }, + "truncated": 0, + "non_truncated": 135, + "padded": 540, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-astronomy|5": { + "hashes": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "aa9743839c83bd9f", + "hash_cont_tokens": "440a970fadecdc7b" + }, + "truncated": 0, + "non_truncated": 152, + "padded": 608, + 
"non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-business_ethics|5": { + "hashes": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "60f6ed52e2a2987a", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hashes": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "6080d9f3c5930be0", + "hash_cont_tokens": "7ecd60c25b9bfe5b" + }, + "truncated": 0, + "non_truncated": 265, + "padded": 1060, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_biology|5": { + "hashes": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "873319724ad65589", + "hash_cont_tokens": "875cde3af7a0ee14" + }, + "truncated": 0, + "non_truncated": 144, + "padded": 564, + "non_padded": 12, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_chemistry|5": { + "hashes": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "8366d04d12b154a7", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_computer_science|5": { + "hashes": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "1724a282fb269fd7", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_mathematics|5": { + "hashes": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "b7aa815781eae172", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_medicine|5": { + "hashes": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "0003d13e86bc8c1a", + "hash_cont_tokens": "702fb6d82ff0d6ac" + }, + "truncated": 0, + "non_truncated": 173, + "padded": 692, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_physics|5": { + "hashes": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "32b28762dd077c78", + "hash_cont_tokens": "f7b8097afc16a47c" + }, + "truncated": 0, + "non_truncated": 102, + "padded": 404, + "non_padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-computer_security|5": { + "hashes": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "19dd0e1895125d49", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hashes": { 
+ "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "761c7ce187b3338a", + "hash_cont_tokens": "aa0e8bc655f2f641" + }, + "truncated": 0, + "non_truncated": 235, + "padded": 940, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-econometrics|5": { + "hashes": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "dae74024ebc12b2b", + "hash_cont_tokens": "b1cc6e7e9fcd3827" + }, + "truncated": 0, + "non_truncated": 114, + "padded": 456, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hashes": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "5fa8050688a246ed", + "hash_cont_tokens": "2425a3f084a591ef" + }, + "truncated": 0, + "non_truncated": 145, + "padded": 580, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hashes": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "2da3f8d7d1515cc6", + "hash_cont_tokens": "bd87bf0c060fd925" + }, + "truncated": 0, + "non_truncated": 378, + "padded": 1512, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-formal_logic|5": { + "hashes": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "907de61bbe46dada", + "hash_cont_tokens": "eb8932890e0605db" + }, + "truncated": 0, + "non_truncated": 126, + "padded": 504, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-global_facts|5": { + "hashes": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "d7549fe9ac133643", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_biology|5": { + "hashes": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "b449ae8cd622fb96", + "hash_cont_tokens": "1ddcb86d28cde266" + }, + "truncated": 0, + "non_truncated": 310, + "padded": 1240, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hashes": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "a447bd1574b5e26c", + "hash_cont_tokens": "176c8dcff38c5f8f" + }, + "truncated": 0, + "non_truncated": 203, + "padded": 812, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hashes": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "56312a0c3d85ae90", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hashes": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "5002f4ac8b1562ca", + 
"hash_cont_tokens": "674fc454bdc5ac93" + }, + "truncated": 0, + "non_truncated": 165, + "padded": 656, + "non_padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_geography|5": { + "hashes": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "b4f9efd054b0149d", + "hash_cont_tokens": "03a5012b916274ea" + }, + "truncated": 0, + "non_truncated": 198, + "padded": 792, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hashes": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "6e010d01707b5a01", + "hash_cont_tokens": "873d2aab226ba1d8" + }, + "truncated": 0, + "non_truncated": 193, + "padded": 772, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hashes": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "fc1f6e824ba386d7", + "hash_cont_tokens": "c583432ad27fcfe0" + }, + "truncated": 0, + "non_truncated": 390, + "padded": 1560, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hashes": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "3a485a40c8432ece", + "hash_cont_tokens": "d7907b61bcb8c123" + }, + "truncated": 0, + "non_truncated": 270, + "padded": 1080, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hashes": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "a7dd9ca4bbda3752", + "hash_cont_tokens": "f47f041de50333b9" + }, + "truncated": 0, + "non_truncated": 238, + "padded": 952, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_physics|5": { + "hashes": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "d7ea631399a73865", + "hash_cont_tokens": "0d56317b3e5eedb5" + }, + "truncated": 0, + "non_truncated": 151, + "padded": 604, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hashes": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "d12816cf88146011", + "hash_cont_tokens": "09ba1243e7390c0f" + }, + "truncated": 0, + "non_truncated": 545, + "padded": 2180, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hashes": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "9763ecaef4814c21", + "hash_cont_tokens": "9cc29889c3d3f77d" + }, + "truncated": 0, + "non_truncated": 216, + "padded": 864, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hashes": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "c639cce12a46ebad", + "hash_cont_tokens": "cdd0b3dc06d933e5" + }, + "truncated": 0, + "non_truncated": 204, 
+ "padded": 816, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hashes": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "b9762065cce6f3a6", + "hash_cont_tokens": "e02816433ff28daf" + }, + "truncated": 0, + "non_truncated": 237, + "padded": 948, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_aging|5": { + "hashes": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "84157fee0b6d0f3c", + "hash_cont_tokens": "142a4a8a1138a214" + }, + "truncated": 0, + "non_truncated": 223, + "padded": 892, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_sexuality|5": { + "hashes": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "ade303e1ae3c016f", + "hash_cont_tokens": "bc54813e809b796d" + }, + "truncated": 0, + "non_truncated": 131, + "padded": 524, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-international_law|5": { + "hashes": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "e5482e1c23c23d35", + "hash_cont_tokens": "8ea8c5ff76a15bca" + }, + "truncated": 0, + "non_truncated": 121, + "padded": 484, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-jurisprudence|5": { + "hashes": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "4415eeb9bad0507b", + "hash_cont_tokens": "e3a8cd951b6e3469" + }, + "truncated": 0, + "non_truncated": 108, + "padded": 432, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hashes": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "e6b5271422ecbaa8", + "hash_cont_tokens": "3e9e0bdc248fd88a" + }, + "truncated": 0, + "non_truncated": 163, + "padded": 644, + "non_padded": 8, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-machine_learning|5": { + "hashes": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "e719cb83196977d8", + "hash_cont_tokens": "55b12fb138c6a064" + }, + "truncated": 0, + "non_truncated": 112, + "padded": 448, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-management|5": { + "hashes": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "155da0e62b39e804", + "hash_cont_tokens": "a01d6d39a83c4597" + }, + "truncated": 0, + "non_truncated": 103, + "padded": 412, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-marketing|5": { + "hashes": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "38466c242259e6d3", + "hash_cont_tokens": "6aeaed4d823c98aa" + }, + "truncated": 0, + "non_truncated": 234, + "padded": 932, + "non_padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-medical_genetics|5": { + "hashes": { + 
"hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "0dd129e92538a7f6", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-miscellaneous|5": { + "hashes": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "d108a883fc3e022f", + "hash_cont_tokens": "9b0ab02a64603081" + }, + "truncated": 0, + "non_truncated": 783, + "padded": 3132, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_disputes|5": { + "hashes": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "0e7b7df82884a2d5", + "hash_cont_tokens": "3b8bbe9108e55ce9" + }, + "truncated": 0, + "non_truncated": 346, + "padded": 1364, + "non_padded": 20, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hashes": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "7c220f5613cd8426", + "hash_cont_tokens": "3e9bfc0362e97330" + }, + "truncated": 0, + "non_truncated": 895, + "padded": 3580, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-nutrition|5": { + "hashes": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "35de1609a9a763a9", + "hash_cont_tokens": "23b2dc6ee2da4cfc" + }, + "truncated": 0, + "non_truncated": 306, + "padded": 1224, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-philosophy|5": { + "hashes": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "a1dcfa9c80490d06", + "hash_cont_tokens": "9f6ff69d23a48783" + }, + "truncated": 0, + "non_truncated": 311, + "padded": 1244, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-prehistory|5": { + "hashes": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "a091cf645d2415e0", + "hash_cont_tokens": "d6458d743d875837" + }, + "truncated": 0, + "non_truncated": 324, + "padded": 1296, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_accounting|5": { + "hashes": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "e9df32a33f85290c", + "hash_cont_tokens": "922a195f53a35662" + }, + "truncated": 0, + "non_truncated": 282, + "padded": 1128, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_law|5": { + "hashes": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "c9f7583fff66d361", + "hash_cont_tokens": "2e590029ef41fbcd" + }, + "truncated": 0, + "non_truncated": 1534, + "padded": 6136, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_medicine|5": { + "hashes": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "40a933f829116f8d", + "hash_cont_tokens": "7cfee54dbddd5a98" 
+ }, + "truncated": 0, + "non_truncated": 272, + "padded": 1088, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_psychology|5": { + "hashes": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "0f6a92c3a2062b48", + "hash_cont_tokens": "a86677b2a45c20e1" + }, + "truncated": 0, + "non_truncated": 612, + "padded": 2448, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-public_relations|5": { + "hashes": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "29a08e9bfbe9b2f0", + "hash_cont_tokens": "0d756ccaae031757" + }, + "truncated": 0, + "non_truncated": 110, + "padded": 440, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-security_studies|5": { + "hashes": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "32a03f1f22a6e103", + "hash_cont_tokens": "b2229bc2cfbf594b" + }, + "truncated": 0, + "non_truncated": 245, + "padded": 980, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-sociology|5": { + "hashes": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "1de5c52d2b2831d7", + "hash_cont_tokens": "c3a3bdfd177eed5b" + }, + "truncated": 0, + "non_truncated": 201, + "padded": 800, + "non_padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hashes": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "add924961f7f4146", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-virology|5": { + "hashes": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "e0653601c466b1bc", + "hash_cont_tokens": "af8b3658088cb37f" + }, + "truncated": 0, + "non_truncated": 166, + "padded": 664, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-world_religions|5": { + "hashes": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "ac600d612445156d", + "hash_cont_tokens": "060118bef6de4e0a" + }, + "truncated": 0, + "non_truncated": 171, + "padded": 684, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|truthfulqa:mc|0": { + "hashes": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "a03ce28b7fd06aa7", + "hash_cont_tokens": "f5da56a132aab151" + }, + "truncated": 0, + "non_truncated": 817, + "padded": 9996, + "non_padded": 0, + "effective_few_shots": 0.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "72067255e368e24e", + "hash_cont_tokens": "f08975ad6f2d5864" + }, + "truncated": 0, + "non_truncated": 1267, + "padded": 2534, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + 
"hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "bda342e47b5099b2", + "hash_cont_tokens": "dc168d09d2e006a0" + }, + "truncated": 0, + "non_truncated": 1319, + "padded": 0, + "non_padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "3b7fa57a057f9415", + "hash_full_prompts": "63615fc50fc9417c", + "hash_input_tokens": "a8fa53915153e1db", + "hash_cont_tokens": "f6440f6e2f936b9f" + }, + "truncated": 0, + "non_truncated": 28659, + "padded": 113348, + "non_padded": 1524, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/Undi95/MLewd-Chat-v2-13B/results_2023-10-04T08-57-05.085680.json b/eval-results/Undi95/MLewd-Chat-v2-13B/results_2023-10-04T08-57-05.085680.json new file mode 100644 index 0000000000000000000000000000000000000000..91544f05c8f89373f5838a45a8fddc3c55641645 --- /dev/null +++ b/eval-results/Undi95/MLewd-Chat-v2-13B/results_2023-10-04T08-57-05.085680.json @@ -0,0 +1,1367 @@ +{ + "config_general": { + "model_name": "Undi95/MLewd-Chat-v2-13B", + "model_sha": "f6181961a6a2f9ca534e1a8907b4a4459be6b6bd", + "model_size": "24.32 GB", + "model_dtype": "torch.float16", + "lighteval_sha": "0f318ecf002208468154899217b3ba7c6ae09374", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|arc:challenge|25": { + "acc": 0.5921501706484642, + "acc_stderr": 0.014361097288449701, + "acc_norm": 0.6186006825938567, + "acc_norm_stderr": 0.014194389086685251 + }, + "harness|hellaswag|10": { + "acc": 0.6437960565624378, + "acc_stderr": 0.00477897803138964, + "acc_norm": 0.8380800637323242, + "acc_norm_stderr": 0.0036762448867232573 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.35, + "acc_stderr": 0.04793724854411022, + "acc_norm": 0.35, + "acc_norm_stderr": 0.04793724854411022 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.5185185185185185, + "acc_stderr": 0.043163785995113245, + "acc_norm": 0.5185185185185185, + "acc_norm_stderr": 0.043163785995113245 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.5460526315789473, + "acc_stderr": 0.04051646342874143, + "acc_norm": 0.5460526315789473, + "acc_norm_stderr": 0.04051646342874143 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.53, + "acc_stderr": 0.05016135580465919, + "acc_norm": 0.53, + "acc_norm_stderr": 0.05016135580465919 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.5773584905660377, + "acc_stderr": 0.03040233144576954, + "acc_norm": 0.5773584905660377, + "acc_norm_stderr": 0.03040233144576954 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.6180555555555556, + "acc_stderr": 0.040629907841466674, + "acc_norm": 0.6180555555555556, + "acc_norm_stderr": 0.040629907841466674 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.37, + "acc_stderr": 0.048523658709391, + "acc_norm": 0.37, + "acc_norm_stderr": 0.048523658709391 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.49, + "acc_stderr": 0.05024183937956911, + "acc_norm": 0.49, + "acc_norm_stderr": 0.05024183937956911 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.36, + "acc_stderr": 0.04824181513244218, + "acc_norm": 0.36, + "acc_norm_stderr": 0.04824181513244218 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.5260115606936416, + "acc_stderr": 0.03807301726504513, + 
"acc_norm": 0.5260115606936416, + "acc_norm_stderr": 0.03807301726504513 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.2549019607843137, + "acc_stderr": 0.0433643270799318, + "acc_norm": 0.2549019607843137, + "acc_norm_stderr": 0.0433643270799318 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.71, + "acc_stderr": 0.04560480215720685, + "acc_norm": 0.71, + "acc_norm_stderr": 0.04560480215720685 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.4978723404255319, + "acc_stderr": 0.03268572658667492, + "acc_norm": 0.4978723404255319, + "acc_norm_stderr": 0.03268572658667492 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.2982456140350877, + "acc_stderr": 0.04303684033537315, + "acc_norm": 0.2982456140350877, + "acc_norm_stderr": 0.04303684033537315 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.5172413793103449, + "acc_stderr": 0.04164188720169375, + "acc_norm": 0.5172413793103449, + "acc_norm_stderr": 0.04164188720169375 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.3201058201058201, + "acc_stderr": 0.0240268463928735, + "acc_norm": 0.3201058201058201, + "acc_norm_stderr": 0.0240268463928735 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.3968253968253968, + "acc_stderr": 0.04375888492727062, + "acc_norm": 0.3968253968253968, + "acc_norm_stderr": 0.04375888492727062 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.38, + "acc_stderr": 0.04878317312145633, + "acc_norm": 0.38, + "acc_norm_stderr": 0.04878317312145633 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.6516129032258065, + "acc_stderr": 0.02710482632810094, + "acc_norm": 0.6516129032258065, + "acc_norm_stderr": 0.02710482632810094 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.43349753694581283, + "acc_stderr": 0.03486731727419872, + "acc_norm": 0.43349753694581283, + "acc_norm_stderr": 0.03486731727419872 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.55, + "acc_stderr": 0.049999999999999996, + "acc_norm": 0.55, + "acc_norm_stderr": 0.049999999999999996 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.703030303030303, + "acc_stderr": 0.035679697722680495, + "acc_norm": 0.703030303030303, + "acc_norm_stderr": 0.035679697722680495 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.7272727272727273, + "acc_stderr": 0.03173071239071724, + "acc_norm": 0.7272727272727273, + "acc_norm_stderr": 0.03173071239071724 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.8290155440414507, + "acc_stderr": 0.027171213683164542, + "acc_norm": 0.8290155440414507, + "acc_norm_stderr": 0.027171213683164542 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.5487179487179488, + "acc_stderr": 0.025230381238934837, + "acc_norm": 0.5487179487179488, + "acc_norm_stderr": 0.025230381238934837 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.32222222222222224, + "acc_stderr": 0.028493465091028597, + "acc_norm": 0.32222222222222224, + "acc_norm_stderr": 0.028493465091028597 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.5882352941176471, + "acc_stderr": 0.03196876989195778, + "acc_norm": 0.5882352941176471, + "acc_norm_stderr": 0.03196876989195778 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.33112582781456956, + "acc_stderr": 0.038425817186598696, + "acc_norm": 0.33112582781456956, + 
"acc_norm_stderr": 0.038425817186598696 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.7577981651376147, + "acc_stderr": 0.01836817630659862, + "acc_norm": 0.7577981651376147, + "acc_norm_stderr": 0.01836817630659862 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.39351851851851855, + "acc_stderr": 0.03331747876370312, + "acc_norm": 0.39351851851851855, + "acc_norm_stderr": 0.03331747876370312 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.7843137254901961, + "acc_stderr": 0.028867431449849313, + "acc_norm": 0.7843137254901961, + "acc_norm_stderr": 0.028867431449849313 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.7721518987341772, + "acc_stderr": 0.027303484599069425, + "acc_norm": 0.7721518987341772, + "acc_norm_stderr": 0.027303484599069425 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.6905829596412556, + "acc_stderr": 0.03102441174057221, + "acc_norm": 0.6905829596412556, + "acc_norm_stderr": 0.03102441174057221 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.6870229007633588, + "acc_stderr": 0.04066962905677698, + "acc_norm": 0.6870229007633588, + "acc_norm_stderr": 0.04066962905677698 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.7107438016528925, + "acc_stderr": 0.041391127276354626, + "acc_norm": 0.7107438016528925, + "acc_norm_stderr": 0.041391127276354626 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.7407407407407407, + "acc_stderr": 0.042365112580946315, + "acc_norm": 0.7407407407407407, + "acc_norm_stderr": 0.042365112580946315 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.6748466257668712, + "acc_stderr": 0.036803503712864616, + "acc_norm": 0.6748466257668712, + "acc_norm_stderr": 0.036803503712864616 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.35714285714285715, + "acc_stderr": 0.04547960999764377, + "acc_norm": 0.35714285714285715, + "acc_norm_stderr": 0.04547960999764377 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.6893203883495146, + "acc_stderr": 0.0458212416016155, + "acc_norm": 0.6893203883495146, + "acc_norm_stderr": 0.0458212416016155 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.8205128205128205, + "acc_stderr": 0.025140935950335435, + "acc_norm": 0.8205128205128205, + "acc_norm_stderr": 0.025140935950335435 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.59, + "acc_stderr": 0.04943110704237102, + "acc_norm": 0.59, + "acc_norm_stderr": 0.04943110704237102 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.7675606641123882, + "acc_stderr": 0.015104550008905726, + "acc_norm": 0.7675606641123882, + "acc_norm_stderr": 0.015104550008905726 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.6184971098265896, + "acc_stderr": 0.0261521986197268, + "acc_norm": 0.6184971098265896, + "acc_norm_stderr": 0.0261521986197268 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.42681564245810055, + "acc_stderr": 0.016542401954631917, + "acc_norm": 0.42681564245810055, + "acc_norm_stderr": 0.016542401954631917 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.6437908496732027, + "acc_stderr": 0.02742047766262923, + "acc_norm": 0.6437908496732027, + "acc_norm_stderr": 0.02742047766262923 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.6463022508038585, + "acc_stderr": 0.02715520810320086, + "acc_norm": 0.6463022508038585, + "acc_norm_stderr": 0.02715520810320086 + }, + "harness|hendrycksTest-prehistory|5": 
{ + "acc": 0.6358024691358025, + "acc_stderr": 0.026774929899722327, + "acc_norm": 0.6358024691358025, + "acc_norm_stderr": 0.026774929899722327 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.4326241134751773, + "acc_stderr": 0.029555454236778852, + "acc_norm": 0.4326241134751773, + "acc_norm_stderr": 0.029555454236778852 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.4322033898305085, + "acc_stderr": 0.012652297777114968, + "acc_norm": 0.4322033898305085, + "acc_norm_stderr": 0.012652297777114968 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.5183823529411765, + "acc_stderr": 0.030352303395351964, + "acc_norm": 0.5183823529411765, + "acc_norm_stderr": 0.030352303395351964 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.5833333333333334, + "acc_stderr": 0.019944914136873583, + "acc_norm": 0.5833333333333334, + "acc_norm_stderr": 0.019944914136873583 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.6727272727272727, + "acc_stderr": 0.04494290866252091, + "acc_norm": 0.6727272727272727, + "acc_norm_stderr": 0.04494290866252091 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.6530612244897959, + "acc_stderr": 0.030472526026726496, + "acc_norm": 0.6530612244897959, + "acc_norm_stderr": 0.030472526026726496 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.7313432835820896, + "acc_stderr": 0.03134328358208954, + "acc_norm": 0.7313432835820896, + "acc_norm_stderr": 0.03134328358208954 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.86, + "acc_stderr": 0.03487350880197769, + "acc_norm": 0.86, + "acc_norm_stderr": 0.03487350880197769 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.4578313253012048, + "acc_stderr": 0.0387862677100236, + "acc_norm": 0.4578313253012048, + "acc_norm_stderr": 0.0387862677100236 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.7953216374269005, + "acc_stderr": 0.03094445977853321, + "acc_norm": 0.7953216374269005, + "acc_norm_stderr": 0.03094445977853321 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.3929008567931457, + "mc1_stderr": 0.017097248285233065, + "mc2": 0.5450807561335627, + "mc2_stderr": 0.01567086824854785 + }, + "all": { + "acc": 0.5716297987214715, + "acc_stderr": 0.03423062928663591, + "acc_norm": 0.5753710617573236, + "acc_norm_stderr": 0.034209113331611654, + "mc1": 0.3929008567931457, + "mc1_stderr": 0.017097248285233065, + "mc2": 0.5450807561335627, + "mc2_stderr": 0.01567086824854785 + } + }, + "versions": { + "harness|arc:challenge|25": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + 
"harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "all": 0 + }, + "config_tasks": { + "harness|arc:challenge": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM 
Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task" + }, + "summary_tasks": { + "harness|arc:challenge|25": { + "hashes": { + "hash_examples": "17b0cae357c0259e", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "3722289b79076c44", + "hash_cont_tokens": "e8abf848493b50f7" + }, + "truncated": 0, + "non-truncated": 4687, + "padded": 4687, + "non-padded": 0, + "effective_few_shots": 25.0, + "num_truncated_few_shots": 0 + }, + "harness|hellaswag|10": { + "hashes": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "ececd684171f1ef2", + "hash_cont_tokens": "9fe0a5c42e1532db" + }, + "truncated": 0, + "non-truncated": 40168, + "padded": 40113, + "non-padded": 55, + "effective_few_shots": 10.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hashes": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "c54ff61ad0273dd7", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-anatomy|5": { + "hashes": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "be31a1e22aef5f90", + 
"hash_cont_tokens": "f11971a765cb609f" + }, + "truncated": 0, + "non-truncated": 540, + "padded": 540, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-astronomy|5": { + "hashes": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "277a7b1fad566940", + "hash_cont_tokens": "440a970fadecdc7b" + }, + "truncated": 0, + "non-truncated": 608, + "padded": 608, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-business_ethics|5": { + "hashes": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "ba552605bc116de5", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hashes": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "428c7563d0b98ab9", + "hash_cont_tokens": "7ecd60c25b9bfe5b" + }, + "truncated": 0, + "non-truncated": 1060, + "padded": 1060, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_biology|5": { + "hashes": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "da036601573942e2", + "hash_cont_tokens": "875cde3af7a0ee14" + }, + "truncated": 0, + "non-truncated": 576, + "padded": 576, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_chemistry|5": { + "hashes": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "94e0196d6aded13d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_computer_science|5": { + "hashes": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "6e4d0f4a8d36690b", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_mathematics|5": { + "hashes": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "614054d17109a25d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_medicine|5": { + "hashes": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "081bb2b524defd1c", + "hash_cont_tokens": "702fb6d82ff0d6ac" + }, + "truncated": 0, + "non-truncated": 692, + "padded": 692, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_physics|5": { + "hashes": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "5421d9a1af86cbd4", + "hash_cont_tokens": "f7b8097afc16a47c" + }, + "truncated": 0, + "non-truncated": 408, + "padded": 408, + "non-padded": 0, + "effective_few_shots": 5.0, + 
"num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-computer_security|5": { + "hashes": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "5e6b70ecb333cf18", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hashes": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "c2ef11a87264ceed", + "hash_cont_tokens": "aa0e8bc655f2f641" + }, + "truncated": 0, + "non-truncated": 940, + "padded": 940, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-econometrics|5": { + "hashes": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "ecaccd912a4c3978", + "hash_cont_tokens": "b1cc6e7e9fcd3827" + }, + "truncated": 0, + "non-truncated": 456, + "padded": 456, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hashes": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "1590c84291399be8", + "hash_cont_tokens": "2425a3f084a591ef" + }, + "truncated": 0, + "non-truncated": 580, + "padded": 580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hashes": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "3269597f715b0da1", + "hash_cont_tokens": "bd87bf0c060fd925" + }, + "truncated": 0, + "non-truncated": 1512, + "padded": 1512, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-formal_logic|5": { + "hashes": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "a2800d20f3ab8d7c", + "hash_cont_tokens": "eb8932890e0605db" + }, + "truncated": 0, + "non-truncated": 504, + "padded": 504, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-global_facts|5": { + "hashes": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "94ed44b3772505ad", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_biology|5": { + "hashes": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "24423acb928db768", + "hash_cont_tokens": "1ddcb86d28cde266" + }, + "truncated": 0, + "non-truncated": 1240, + "padded": 1240, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hashes": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "831ff35c474e5cef", + "hash_cont_tokens": "176c8dcff38c5f8f" + }, + "truncated": 0, + "non-truncated": 812, + "padded": 812, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hashes": { + "hash_examples": "8b8cdb1084f24169", 
+ "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "a20a96b44dcc5b30", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hashes": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "5002f4ac8b1562ca", + "hash_cont_tokens": "674fc454bdc5ac93" + }, + "truncated": 0, + "non-truncated": 660, + "padded": 656, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_geography|5": { + "hashes": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "7c5547c7da5bc793", + "hash_cont_tokens": "03a5012b916274ea" + }, + "truncated": 0, + "non-truncated": 792, + "padded": 792, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hashes": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "f62991cb6a496b05", + "hash_cont_tokens": "873d2aab226ba1d8" + }, + "truncated": 0, + "non-truncated": 772, + "padded": 772, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hashes": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "4cef2aff6e3d59ed", + "hash_cont_tokens": "c583432ad27fcfe0" + }, + "truncated": 0, + "non-truncated": 1560, + "padded": 1560, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hashes": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "6e2577ea4082ed2b", + "hash_cont_tokens": "d7907b61bcb8c123" + }, + "truncated": 0, + "non-truncated": 1080, + "padded": 1080, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hashes": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "c5fc9aeb1079c8e4", + "hash_cont_tokens": "f47f041de50333b9" + }, + "truncated": 0, + "non-truncated": 952, + "padded": 952, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_physics|5": { + "hashes": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "555fc385cffa84ca", + "hash_cont_tokens": "0d56317b3e5eedb5" + }, + "truncated": 0, + "non-truncated": 604, + "padded": 604, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hashes": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "febd23cbf9973b7f", + "hash_cont_tokens": "09ba1243e7390c0f" + }, + "truncated": 0, + "non-truncated": 2180, + "padded": 2180, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hashes": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": 
"400e55b56ee6fbd7", + "hash_cont_tokens": "9cc29889c3d3f77d" + }, + "truncated": 0, + "non-truncated": 864, + "padded": 864, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hashes": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "c639cce12a46ebad", + "hash_cont_tokens": "cdd0b3dc06d933e5" + }, + "truncated": 0, + "non-truncated": 816, + "padded": 816, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hashes": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "b9762065cce6f3a6", + "hash_cont_tokens": "e02816433ff28daf" + }, + "truncated": 0, + "non-truncated": 948, + "padded": 948, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_aging|5": { + "hashes": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "541a75f071dcf579", + "hash_cont_tokens": "142a4a8a1138a214" + }, + "truncated": 0, + "non-truncated": 892, + "padded": 892, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_sexuality|5": { + "hashes": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "04269e5c5a257dd9", + "hash_cont_tokens": "bc54813e809b796d" + }, + "truncated": 0, + "non-truncated": 524, + "padded": 524, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-international_law|5": { + "hashes": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "d93ba9d9d38e4397", + "hash_cont_tokens": "8ea8c5ff76a15bca" + }, + "truncated": 0, + "non-truncated": 484, + "padded": 484, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-jurisprudence|5": { + "hashes": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "9eeaccd2698b4f5a", + "hash_cont_tokens": "e3a8cd951b6e3469" + }, + "truncated": 0, + "non-truncated": 432, + "padded": 432, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hashes": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "b4f08f544f2b7576", + "hash_cont_tokens": "3e9e0bdc248fd88a" + }, + "truncated": 0, + "non-truncated": 652, + "padded": 648, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-machine_learning|5": { + "hashes": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "900c2a51f1174b9f", + "hash_cont_tokens": "55b12fb138c6a064" + }, + "truncated": 0, + "non-truncated": 448, + "padded": 448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-management|5": { + "hashes": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "6b36efb4689c6eca", + "hash_cont_tokens": "a01d6d39a83c4597" + }, + "truncated": 0, + "non-truncated": 412, + "padded": 412, + "non-padded": 0, + 
"effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-marketing|5": { + "hashes": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "2aaac78a0cfed47a", + "hash_cont_tokens": "6aeaed4d823c98aa" + }, + "truncated": 0, + "non-truncated": 936, + "padded": 936, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-medical_genetics|5": { + "hashes": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "886ca823b41c094a", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-miscellaneous|5": { + "hashes": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "72fd71de7675e7d0", + "hash_cont_tokens": "9b0ab02a64603081" + }, + "truncated": 0, + "non-truncated": 3132, + "padded": 3132, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_disputes|5": { + "hashes": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "f3ca0dd8e7a1eb09", + "hash_cont_tokens": "3b8bbe9108e55ce9" + }, + "truncated": 0, + "non-truncated": 1384, + "padded": 1354, + "non-padded": 30, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hashes": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "3e793631e951f23c", + "hash_cont_tokens": "3e9bfc0362e97330" + }, + "truncated": 0, + "non-truncated": 3580, + "padded": 3580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-nutrition|5": { + "hashes": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "59753c2144ea93af", + "hash_cont_tokens": "23b2dc6ee2da4cfc" + }, + "truncated": 0, + "non-truncated": 1224, + "padded": 1224, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-philosophy|5": { + "hashes": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "bd8d3dbed15a8c34", + "hash_cont_tokens": "9f6ff69d23a48783" + }, + "truncated": 0, + "non-truncated": 1244, + "padded": 1244, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-prehistory|5": { + "hashes": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "3573cd87facbb7c5", + "hash_cont_tokens": "d6458d743d875837" + }, + "truncated": 0, + "non-truncated": 1296, + "padded": 1296, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_accounting|5": { + "hashes": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "17e721bc1a7cbb47", + "hash_cont_tokens": "922a195f53a35662" + }, + "truncated": 0, + "non-truncated": 1128, + "padded": 1128, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_law|5": { + "hashes": { + "hash_examples": "cd9dbc52b3c932d6", + 
"hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "c9f7583fff66d361", + "hash_cont_tokens": "2e590029ef41fbcd" + }, + "truncated": 0, + "non-truncated": 6136, + "padded": 6136, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_medicine|5": { + "hashes": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "40a933f829116f8d", + "hash_cont_tokens": "7cfee54dbddd5a98" + }, + "truncated": 0, + "non-truncated": 1088, + "padded": 1088, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_psychology|5": { + "hashes": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "0dfb73a8eb3f692c", + "hash_cont_tokens": "a86677b2a45c20e1" + }, + "truncated": 0, + "non-truncated": 2448, + "padded": 2448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-public_relations|5": { + "hashes": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "1710c6ba4c9f3cbd", + "hash_cont_tokens": "0d756ccaae031757" + }, + "truncated": 0, + "non-truncated": 440, + "padded": 440, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-security_studies|5": { + "hashes": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "32a03f1f22a6e103", + "hash_cont_tokens": "b2229bc2cfbf594b" + }, + "truncated": 0, + "non-truncated": 980, + "padded": 980, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-sociology|5": { + "hashes": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "828999f7624cbe7e", + "hash_cont_tokens": "c3a3bdfd177eed5b" + }, + "truncated": 0, + "non-truncated": 804, + "padded": 804, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hashes": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "42054621e718dbee", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-virology|5": { + "hashes": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "6c4f0aa4dc859c04", + "hash_cont_tokens": "af8b3658088cb37f" + }, + "truncated": 0, + "non-truncated": 664, + "padded": 664, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-world_religions|5": { + "hashes": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "6c75d44e092ff24f", + "hash_cont_tokens": "060118bef6de4e0a" + }, + "truncated": 0, + "non-truncated": 684, + "padded": 684, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|truthfulqa:mc|0": { + "hashes": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "2738d7ed7075faa7", + "hash_cont_tokens": "f5da56a132aab151" + }, + "truncated": 0, + "non-truncated": 9996, + 
"padded": 9996, + "non-padded": 0, + "effective_few_shots": 0.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "d84d18e9a963753d", + "hash_full_prompts": "12b540783521a8e6", + "hash_input_tokens": "5c73a7dce6ccf737", + "hash_cont_tokens": "71d56183130fecbd" + }, + "total_evaluation_time_secondes": "6299.00984120369", + "truncated": 0, + "non-truncated": 111019, + "padded": 110926, + "non-padded": 93, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/Undi95/MLewd-Chat-v2-13B/results_2023-10-25T04-58-47.743949.json b/eval-results/Undi95/MLewd-Chat-v2-13B/results_2023-10-25T04-58-47.743949.json new file mode 100644 index 0000000000000000000000000000000000000000..bbf3bf66a0f2f78418d649841aeff6a391a7f6b6 --- /dev/null +++ b/eval-results/Undi95/MLewd-Chat-v2-13B/results_2023-10-25T04-58-47.743949.json @@ -0,0 +1,107 @@ +{ + "config_general": { + "model_name": "Undi95/MLewd-Chat-v2-13B", + "model_sha": "f6181961a6a2f9ca534e1a8907b4a4459be6b6bd", + "model_size": "24.32 GB", + "model_dtype": "torch.float16", + "lighteval_sha": "0f318ecf002208468154899217b3ba7c6ae09374", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|drop|3": { + "em": 0.16935822147651006, + "em_stderr": 0.003841047509071323, + "f1": 0.25626572986577256, + "f1_stderr": 0.003896453812497321 + }, + "harness|gsm8k|5": { + "acc": 0.10462471569370735, + "acc_stderr": 0.00843066808202928 + }, + "harness|winogrande|5": { + "acc": 0.7576953433307024, + "acc_stderr": 0.012042352526174789 + }, + "all": { + "em": 0.16935822147651006, + "em_stderr": 0.003841047509071323, + "f1": 0.25626572986577256, + "f1_stderr": 0.003896453812497321, + "acc": 0.4311600295122049, + "acc_stderr": 0.010236510304102034 + } + }, + "versions": { + "harness|drop|3": 1, + "harness|gsm8k|5": 0, + "harness|winogrande|5": 0, + "all": 0 + }, + "config_tasks": { + "harness|drop": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|drop|3": { + "hashes": { + "hash_examples": "1d27416e8324e9a3", + "hash_full_prompts": "a5513ff9a741b385", + "hash_input_tokens": "42076f0efbb50aa6", + "hash_cont_tokens": "926065e0570843b6" + }, + "truncated": 3, + "non-truncated": 9533, + "padded": 0, + "non-padded": 9536, + "effective_few_shots": 3.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "bda342e47b5099b2", + "hash_cont_tokens": "798ea38162156587" + }, + "truncated": 0, + "non-truncated": 1319, + "padded": 0, + "non-padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "c0bedf98cb040854", + "hash_cont_tokens": "f08975ad6f2d5864" + }, + "truncated": 0, + "non-truncated": 2534, + "padded": 2432, + "non-padded": 102, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "9b4d8993161e637d", + "hash_full_prompts": "08215e527b7e60a5", + "hash_input_tokens": "a12f3e3c934bd78b", + "hash_cont_tokens": "1b1d85359fccd53e" + }, + "total_evaluation_time_secondes": "12254.943300008774", + "truncated": 3, + "non-truncated": 13386, + "padded": 2432, + 
"non-padded": 10957, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/Undi95/MLewd-L2-13B/results_2023-09-05T05-06-12.728207.json b/eval-results/Undi95/MLewd-L2-13B/results_2023-09-05T05-06-12.728207.json new file mode 100644 index 0000000000000000000000000000000000000000..f3c35addc37857bb956a5e7ab15f4fd171b2e7d2 --- /dev/null +++ b/eval-results/Undi95/MLewd-L2-13B/results_2023-09-05T05-06-12.728207.json @@ -0,0 +1,1366 @@ +{ + "config_general": { + "model_name": "Undi95/MLewd-L2-13B", + "model_sha": "feb1fa71e0b24261d3ca428b4aed881dd31f166e", + "model_dtype": "torch.float16", + "lighteval_sha": "9f7699e1a44b5b4d7bd4f326b57a34db83b67c3f", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|arc:challenge|25": { + "acc": 0.5435153583617748, + "acc_stderr": 0.01455594976049644, + "acc_norm": 0.5827645051194539, + "acc_norm_stderr": 0.014409825518403077 + }, + "harness|hellaswag|10": { + "acc": 0.6319458275243975, + "acc_stderr": 0.004812905279066437, + "acc_norm": 0.8232423819956184, + "acc_norm_stderr": 0.00380683844816174 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.36, + "acc_stderr": 0.04824181513244218, + "acc_norm": 0.36, + "acc_norm_stderr": 0.04824181513244218 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.4740740740740741, + "acc_stderr": 0.04313531696750574, + "acc_norm": 0.4740740740740741, + "acc_norm_stderr": 0.04313531696750574 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.5328947368421053, + "acc_stderr": 0.04060127035236395, + "acc_norm": 0.5328947368421053, + "acc_norm_stderr": 0.04060127035236395 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.53, + "acc_stderr": 0.05016135580465919, + "acc_norm": 0.53, + "acc_norm_stderr": 0.05016135580465919 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.5886792452830188, + "acc_stderr": 0.03028500925900979, + "acc_norm": 0.5886792452830188, + "acc_norm_stderr": 0.03028500925900979 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.5555555555555556, + "acc_stderr": 0.041553199555931467, + "acc_norm": 0.5555555555555556, + "acc_norm_stderr": 0.041553199555931467 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.43, + "acc_stderr": 0.049756985195624284, + "acc_norm": 0.43, + "acc_norm_stderr": 0.049756985195624284 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.43, + "acc_stderr": 0.049756985195624284, + "acc_norm": 0.43, + "acc_norm_stderr": 0.049756985195624284 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.35, + "acc_stderr": 0.047937248544110196, + "acc_norm": 0.35, + "acc_norm_stderr": 0.047937248544110196 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.4682080924855491, + "acc_stderr": 0.03804749744364764, + "acc_norm": 0.4682080924855491, + "acc_norm_stderr": 0.03804749744364764 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.3137254901960784, + "acc_stderr": 0.04617034827006716, + "acc_norm": 0.3137254901960784, + "acc_norm_stderr": 0.04617034827006716 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.68, + "acc_stderr": 0.04688261722621505, + "acc_norm": 0.68, + "acc_norm_stderr": 0.04688261722621505 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.43829787234042555, + "acc_stderr": 0.03243618636108101, + "acc_norm": 0.43829787234042555, + "acc_norm_stderr": 0.03243618636108101 + }, + 
"harness|hendrycksTest-econometrics|5": { + "acc": 0.2807017543859649, + "acc_stderr": 0.042270544512322, + "acc_norm": 0.2807017543859649, + "acc_norm_stderr": 0.042270544512322 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.4896551724137931, + "acc_stderr": 0.04165774775728763, + "acc_norm": 0.4896551724137931, + "acc_norm_stderr": 0.04165774775728763 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.3253968253968254, + "acc_stderr": 0.024130158299762613, + "acc_norm": 0.3253968253968254, + "acc_norm_stderr": 0.024130158299762613 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.2777777777777778, + "acc_stderr": 0.04006168083848878, + "acc_norm": 0.2777777777777778, + "acc_norm_stderr": 0.04006168083848878 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.34, + "acc_stderr": 0.04760952285695235, + "acc_norm": 0.34, + "acc_norm_stderr": 0.04760952285695235 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.632258064516129, + "acc_stderr": 0.02743086657997347, + "acc_norm": 0.632258064516129, + "acc_norm_stderr": 0.02743086657997347 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.43349753694581283, + "acc_stderr": 0.03486731727419872, + "acc_norm": 0.43349753694581283, + "acc_norm_stderr": 0.03486731727419872 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.57, + "acc_stderr": 0.04975698519562428, + "acc_norm": 0.57, + "acc_norm_stderr": 0.04975698519562428 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.6909090909090909, + "acc_stderr": 0.036085410115739666, + "acc_norm": 0.6909090909090909, + "acc_norm_stderr": 0.036085410115739666 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.6868686868686869, + "acc_stderr": 0.033042050878136525, + "acc_norm": 0.6868686868686869, + "acc_norm_stderr": 0.033042050878136525 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.8082901554404145, + "acc_stderr": 0.028408953626245258, + "acc_norm": 0.8082901554404145, + "acc_norm_stderr": 0.028408953626245258 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.5153846153846153, + "acc_stderr": 0.025339003010106515, + "acc_norm": 0.5153846153846153, + "acc_norm_stderr": 0.025339003010106515 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.29259259259259257, + "acc_stderr": 0.02773896963217609, + "acc_norm": 0.29259259259259257, + "acc_norm_stderr": 0.02773896963217609 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.5378151260504201, + "acc_stderr": 0.03238546948758979, + "acc_norm": 0.5378151260504201, + "acc_norm_stderr": 0.03238546948758979 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.31125827814569534, + "acc_stderr": 0.03780445850526733, + "acc_norm": 0.31125827814569534, + "acc_norm_stderr": 0.03780445850526733 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.726605504587156, + "acc_stderr": 0.019109299846098278, + "acc_norm": 0.726605504587156, + "acc_norm_stderr": 0.019109299846098278 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.3287037037037037, + "acc_stderr": 0.03203614084670058, + "acc_norm": 0.3287037037037037, + "acc_norm_stderr": 0.03203614084670058 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.7352941176470589, + "acc_stderr": 0.030964517926923403, + "acc_norm": 0.7352941176470589, + "acc_norm_stderr": 0.030964517926923403 + }, + 
"harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.70042194092827, + "acc_stderr": 0.029818024749753095, + "acc_norm": 0.70042194092827, + "acc_norm_stderr": 0.029818024749753095 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.6905829596412556, + "acc_stderr": 0.03102441174057221, + "acc_norm": 0.6905829596412556, + "acc_norm_stderr": 0.03102441174057221 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.6793893129770993, + "acc_stderr": 0.04093329229834278, + "acc_norm": 0.6793893129770993, + "acc_norm_stderr": 0.04093329229834278 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.7355371900826446, + "acc_stderr": 0.040261875275912073, + "acc_norm": 0.7355371900826446, + "acc_norm_stderr": 0.040261875275912073 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.7037037037037037, + "acc_stderr": 0.04414343666854934, + "acc_norm": 0.7037037037037037, + "acc_norm_stderr": 0.04414343666854934 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.6748466257668712, + "acc_stderr": 0.036803503712864616, + "acc_norm": 0.6748466257668712, + "acc_norm_stderr": 0.036803503712864616 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.3482142857142857, + "acc_stderr": 0.04521829902833586, + "acc_norm": 0.3482142857142857, + "acc_norm_stderr": 0.04521829902833586 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.6990291262135923, + "acc_stderr": 0.04541609446503949, + "acc_norm": 0.6990291262135923, + "acc_norm_stderr": 0.04541609446503949 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.811965811965812, + "acc_stderr": 0.02559819368665225, + "acc_norm": 0.811965811965812, + "acc_norm_stderr": 0.02559819368665225 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.57, + "acc_stderr": 0.049756985195624284, + "acc_norm": 0.57, + "acc_norm_stderr": 0.049756985195624284 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.7203065134099617, + "acc_stderr": 0.01605079214803652, + "acc_norm": 0.7203065134099617, + "acc_norm_stderr": 0.01605079214803652 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.6242774566473989, + "acc_stderr": 0.02607431485165708, + "acc_norm": 0.6242774566473989, + "acc_norm_stderr": 0.02607431485165708 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.3843575418994413, + "acc_stderr": 0.016269088663959406, + "acc_norm": 0.3843575418994413, + "acc_norm_stderr": 0.016269088663959406 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.6143790849673203, + "acc_stderr": 0.02787074527829027, + "acc_norm": 0.6143790849673203, + "acc_norm_stderr": 0.02787074527829027 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.6141479099678456, + "acc_stderr": 0.027648149599751468, + "acc_norm": 0.6141479099678456, + "acc_norm_stderr": 0.027648149599751468 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.5895061728395061, + "acc_stderr": 0.027371350925124764, + "acc_norm": 0.5895061728395061, + "acc_norm_stderr": 0.027371350925124764 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.42907801418439717, + "acc_stderr": 0.02952591430255855, + "acc_norm": 0.42907801418439717, + "acc_norm_stderr": 0.02952591430255855 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.4074315514993481, + "acc_stderr": 0.012549473714212226, + "acc_norm": 0.4074315514993481, + "acc_norm_stderr": 0.012549473714212226 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.5147058823529411, + "acc_stderr": 
0.03035969707904612, + "acc_norm": 0.5147058823529411, + "acc_norm_stderr": 0.03035969707904612 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.5441176470588235, + "acc_stderr": 0.020148939420415745, + "acc_norm": 0.5441176470588235, + "acc_norm_stderr": 0.020148939420415745 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.6272727272727273, + "acc_stderr": 0.04631381319425465, + "acc_norm": 0.6272727272727273, + "acc_norm_stderr": 0.04631381319425465 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.6530612244897959, + "acc_stderr": 0.0304725260267265, + "acc_norm": 0.6530612244897959, + "acc_norm_stderr": 0.0304725260267265 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.7064676616915423, + "acc_stderr": 0.03220024104534205, + "acc_norm": 0.7064676616915423, + "acc_norm_stderr": 0.03220024104534205 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.79, + "acc_stderr": 0.040936018074033256, + "acc_norm": 0.79, + "acc_norm_stderr": 0.040936018074033256 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.463855421686747, + "acc_stderr": 0.03882310850890593, + "acc_norm": 0.463855421686747, + "acc_norm_stderr": 0.03882310850890593 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.7309941520467836, + "acc_stderr": 0.03401052620104089, + "acc_norm": 0.7309941520467836, + "acc_norm_stderr": 0.03401052620104089 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.3390452876376989, + "mc1_stderr": 0.016571797910626605, + "mc2": 0.4866402159418837, + "mc2_stderr": 0.015878252541467283 + }, + "all": { + "acc": 0.5480941554989293, + "acc_stderr": 0.03458699327783796, + "acc_norm": 0.5520017097570462, + "acc_norm_stderr": 0.03456746461558375, + "mc1": 0.3390452876376989, + "mc1_stderr": 0.016571797910626605, + "mc2": 0.4866402159418837, + "mc2_stderr": 0.015878252541467283 + } + }, + "versions": { + "harness|arc:challenge|25": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + 
"harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "all": 0 + }, + "config_tasks": { + "harness|arc:challenge": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + 
"harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task" + }, + "summary_tasks": { + "harness|arc:challenge|25": { + "hashes": { + "hash_examples": "17b0cae357c0259e", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "3722289b79076c44", + "hash_cont_tokens": "8210decc6ff6f7df" + }, + "truncated": 0, + "non-truncated": 4687, + "padded": 4687, + "non-padded": 0, + "effective_few_shots": 25.0, + "num_truncated_few_shots": 0 + }, + "harness|hellaswag|10": { + "hashes": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "ececd684171f1ef2", + "hash_cont_tokens": "b3b9e9017afa63af" + }, + "truncated": 0, + "non-truncated": 40168, + "padded": 40113, + "non-padded": 55, + "effective_few_shots": 10.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hashes": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "c54ff61ad0273dd7", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-anatomy|5": { + "hashes": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "be31a1e22aef5f90", + "hash_cont_tokens": "f11971a765cb609f" + }, + "truncated": 0, + "non-truncated": 540, + "padded": 540, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-astronomy|5": { + "hashes": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "277a7b1fad566940", + "hash_cont_tokens": "bf30e5d3f48250cb" + }, + "truncated": 0, + "non-truncated": 608, + "padded": 608, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + 
"harness|hendrycksTest-business_ethics|5": { + "hashes": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "ba552605bc116de5", + "hash_cont_tokens": "bc1dd9b2d995eb61" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hashes": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "428c7563d0b98ab9", + "hash_cont_tokens": "890a119624b3b935" + }, + "truncated": 0, + "non-truncated": 1060, + "padded": 1060, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_biology|5": { + "hashes": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "da036601573942e2", + "hash_cont_tokens": "875cde3af7a0ee14" + }, + "truncated": 0, + "non-truncated": 576, + "padded": 576, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_chemistry|5": { + "hashes": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "94e0196d6aded13d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_computer_science|5": { + "hashes": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "6e4d0f4a8d36690b", + "hash_cont_tokens": "ffc0fe414cdc4a83" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_mathematics|5": { + "hashes": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "614054d17109a25d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_medicine|5": { + "hashes": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "081bb2b524defd1c", + "hash_cont_tokens": "1f88b00d41957d82" + }, + "truncated": 0, + "non-truncated": 692, + "padded": 692, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_physics|5": { + "hashes": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "5421d9a1af86cbd4", + "hash_cont_tokens": "f7b8097afc16a47c" + }, + "truncated": 0, + "non-truncated": 408, + "padded": 408, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-computer_security|5": { + "hashes": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "5e6b70ecb333cf18", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hashes": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + 
"hash_input_tokens": "c2ef11a87264ceed", + "hash_cont_tokens": "aa0e8bc655f2f641" + }, + "truncated": 0, + "non-truncated": 940, + "padded": 940, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-econometrics|5": { + "hashes": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "ecaccd912a4c3978", + "hash_cont_tokens": "bfb7e3c3c88313f1" + }, + "truncated": 0, + "non-truncated": 456, + "padded": 456, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hashes": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "1590c84291399be8", + "hash_cont_tokens": "2425a3f084a591ef" + }, + "truncated": 0, + "non-truncated": 580, + "padded": 580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hashes": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "3269597f715b0da1", + "hash_cont_tokens": "f52691aef15a407b" + }, + "truncated": 0, + "non-truncated": 1512, + "padded": 1512, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-formal_logic|5": { + "hashes": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "a2800d20f3ab8d7c", + "hash_cont_tokens": "f515d598d9c21263" + }, + "truncated": 0, + "non-truncated": 504, + "padded": 504, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-global_facts|5": { + "hashes": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "94ed44b3772505ad", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_biology|5": { + "hashes": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "24423acb928db768", + "hash_cont_tokens": "bd85a4156a3613ee" + }, + "truncated": 0, + "non-truncated": 1240, + "padded": 1240, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hashes": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "831ff35c474e5cef", + "hash_cont_tokens": "a95c97af1c14e068" + }, + "truncated": 0, + "non-truncated": 812, + "padded": 812, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hashes": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "a20a96b44dcc5b30", + "hash_cont_tokens": "8abfedef914e33c9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hashes": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "5002f4ac8b1562ca", + "hash_cont_tokens": "674fc454bdc5ac93" + }, + "truncated": 0, + "non-truncated": 
660, + "padded": 656, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_geography|5": { + "hashes": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "7c5547c7da5bc793", + "hash_cont_tokens": "03a5012b916274ea" + }, + "truncated": 0, + "non-truncated": 792, + "padded": 792, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hashes": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "f62991cb6a496b05", + "hash_cont_tokens": "a83effb8f76b7d7c" + }, + "truncated": 0, + "non-truncated": 772, + "padded": 772, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hashes": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "4cef2aff6e3d59ed", + "hash_cont_tokens": "c583432ad27fcfe0" + }, + "truncated": 0, + "non-truncated": 1560, + "padded": 1560, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hashes": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "6e2577ea4082ed2b", + "hash_cont_tokens": "24f5dc613660300b" + }, + "truncated": 0, + "non-truncated": 1080, + "padded": 1080, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hashes": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "c5fc9aeb1079c8e4", + "hash_cont_tokens": "f47f041de50333b9" + }, + "truncated": 0, + "non-truncated": 952, + "padded": 952, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_physics|5": { + "hashes": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "555fc385cffa84ca", + "hash_cont_tokens": "ba2efcd283e938cc" + }, + "truncated": 0, + "non-truncated": 604, + "padded": 604, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hashes": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "febd23cbf9973b7f", + "hash_cont_tokens": "942069cd363844d9" + }, + "truncated": 0, + "non-truncated": 2180, + "padded": 2180, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hashes": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "400e55b56ee6fbd7", + "hash_cont_tokens": "955ed42b6f7fa019" + }, + "truncated": 0, + "non-truncated": 864, + "padded": 864, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hashes": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "c639cce12a46ebad", + "hash_cont_tokens": "cdd0b3dc06d933e5" + }, + "truncated": 0, + "non-truncated": 816, + "padded": 816, + "non-padded": 0, + "effective_few_shots": 5.0, + 
"num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hashes": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "b9762065cce6f3a6", + "hash_cont_tokens": "9a864184946033ac" + }, + "truncated": 0, + "non-truncated": 948, + "padded": 948, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_aging|5": { + "hashes": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "541a75f071dcf579", + "hash_cont_tokens": "142a4a8a1138a214" + }, + "truncated": 0, + "non-truncated": 892, + "padded": 892, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_sexuality|5": { + "hashes": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "04269e5c5a257dd9", + "hash_cont_tokens": "bc54813e809b796d" + }, + "truncated": 0, + "non-truncated": 524, + "padded": 524, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-international_law|5": { + "hashes": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "d93ba9d9d38e4397", + "hash_cont_tokens": "dc45b45fcda18e5d" + }, + "truncated": 0, + "non-truncated": 484, + "padded": 484, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-jurisprudence|5": { + "hashes": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "9eeaccd2698b4f5a", + "hash_cont_tokens": "e3a8cd951b6e3469" + }, + "truncated": 0, + "non-truncated": 432, + "padded": 432, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hashes": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "b4f08f544f2b7576", + "hash_cont_tokens": "1e80dbd30f6453d5" + }, + "truncated": 0, + "non-truncated": 652, + "padded": 648, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-machine_learning|5": { + "hashes": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "900c2a51f1174b9f", + "hash_cont_tokens": "9b37da7777378ca9" + }, + "truncated": 0, + "non-truncated": 448, + "padded": 448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-management|5": { + "hashes": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "6b36efb4689c6eca", + "hash_cont_tokens": "a01d6d39a83c4597" + }, + "truncated": 0, + "non-truncated": 412, + "padded": 412, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-marketing|5": { + "hashes": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "2aaac78a0cfed47a", + "hash_cont_tokens": "6aeaed4d823c98aa" + }, + "truncated": 0, + "non-truncated": 936, + "padded": 936, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-medical_genetics|5": { + "hashes": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": 
"202139046daa118f", + "hash_input_tokens": "886ca823b41c094a", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-miscellaneous|5": { + "hashes": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "72fd71de7675e7d0", + "hash_cont_tokens": "9b0ab02a64603081" + }, + "truncated": 0, + "non-truncated": 3132, + "padded": 3132, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_disputes|5": { + "hashes": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "f3ca0dd8e7a1eb09", + "hash_cont_tokens": "8badf768f7b0467a" + }, + "truncated": 0, + "non-truncated": 1384, + "padded": 1354, + "non-padded": 30, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hashes": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "3e793631e951f23c", + "hash_cont_tokens": "32ae620376b2bbba" + }, + "truncated": 0, + "non-truncated": 3580, + "padded": 3580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-nutrition|5": { + "hashes": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "59753c2144ea93af", + "hash_cont_tokens": "3071def75bacc404" + }, + "truncated": 0, + "non-truncated": 1224, + "padded": 1224, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-philosophy|5": { + "hashes": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "bd8d3dbed15a8c34", + "hash_cont_tokens": "9f6ff69d23a48783" + }, + "truncated": 0, + "non-truncated": 1244, + "padded": 1244, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-prehistory|5": { + "hashes": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "3573cd87facbb7c5", + "hash_cont_tokens": "de469d2b981e32a3" + }, + "truncated": 0, + "non-truncated": 1296, + "padded": 1296, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_accounting|5": { + "hashes": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "17e721bc1a7cbb47", + "hash_cont_tokens": "c46f74d2dfc7b13b" + }, + "truncated": 0, + "non-truncated": 1128, + "padded": 1128, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_law|5": { + "hashes": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "c9f7583fff66d361", + "hash_cont_tokens": "2e590029ef41fbcd" + }, + "truncated": 0, + "non-truncated": 6136, + "padded": 6136, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_medicine|5": { + "hashes": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "40a933f829116f8d", + "hash_cont_tokens": "fe35cfa9c6ca802e" + }, + "truncated": 0, + "non-truncated": 1088, + 
"padded": 1088, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_psychology|5": { + "hashes": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "0dfb73a8eb3f692c", + "hash_cont_tokens": "f020fbddf72c8652" + }, + "truncated": 0, + "non-truncated": 2448, + "padded": 2448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-public_relations|5": { + "hashes": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "1710c6ba4c9f3cbd", + "hash_cont_tokens": "568f585a259965c1" + }, + "truncated": 0, + "non-truncated": 440, + "padded": 440, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-security_studies|5": { + "hashes": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "32a03f1f22a6e103", + "hash_cont_tokens": "cc6fd7cccd64cd5d" + }, + "truncated": 0, + "non-truncated": 980, + "padded": 980, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-sociology|5": { + "hashes": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "828999f7624cbe7e", + "hash_cont_tokens": "c3a3bdfd177eed5b" + }, + "truncated": 0, + "non-truncated": 804, + "padded": 804, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hashes": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "42054621e718dbee", + "hash_cont_tokens": "2568d0e8e36fa959" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-virology|5": { + "hashes": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "6c4f0aa4dc859c04", + "hash_cont_tokens": "926cf60b0891f374" + }, + "truncated": 0, + "non-truncated": 664, + "padded": 664, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-world_religions|5": { + "hashes": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "6c75d44e092ff24f", + "hash_cont_tokens": "c525a5de974c1ea3" + }, + "truncated": 0, + "non-truncated": 684, + "padded": 684, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|truthfulqa:mc|0": { + "hashes": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "2738d7ed7075faa7", + "hash_cont_tokens": "c014154380b74b9e" + }, + "truncated": 0, + "non-truncated": 9996, + "padded": 9996, + "non-padded": 0, + "effective_few_shots": 0.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "d84d18e9a963753d", + "hash_full_prompts": "12b540783521a8e6", + "hash_input_tokens": "5c73a7dce6ccf737", + "hash_cont_tokens": "fb1646e2bdd5fc38" + }, + "total_evaluation_time_secondes": "6352.614720582962", + "truncated": 0, + "non-truncated": 111019, + "padded": 110926, + "non-padded": 93, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git 
a/eval-results/Undi95/MLewd-L2-13B/results_2023-10-18T07-35-31.407630.json b/eval-results/Undi95/MLewd-L2-13B/results_2023-10-18T07-35-31.407630.json new file mode 100644 index 0000000000000000000000000000000000000000..3824e44446065248ff28e4cc775b6133ccb877cb --- /dev/null +++ b/eval-results/Undi95/MLewd-L2-13B/results_2023-10-18T07-35-31.407630.json @@ -0,0 +1,107 @@ +{ + "config_general": { + "model_name": "Undi95/MLewd-L2-13B", + "model_sha": "ec22f332e8d17375043d56990a471979b7548a9e", + "model_size": "24.32 GB", + "model_dtype": "torch.float16", + "lighteval_sha": "0f318ecf002208468154899217b3ba7c6ae09374", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|drop|3": { + "em": 0.012164429530201342, + "em_stderr": 0.0011226072817372202, + "f1": 0.09181417785234938, + "f1_stderr": 0.0019450870531667406 + }, + "harness|gsm8k|5": { + "acc": 0.01288855193328279, + "acc_stderr": 0.003106901266499655 + }, + "harness|winogrande|5": { + "acc": 0.7348066298342542, + "acc_stderr": 0.012406549466192861 + }, + "all": { + "em": 0.012164429530201342, + "em_stderr": 0.0011226072817372202, + "f1": 0.09181417785234938, + "f1_stderr": 0.0019450870531667406, + "acc": 0.37384759088376845, + "acc_stderr": 0.007756725366346258 + } + }, + "versions": { + "harness|drop|3": 1, + "harness|gsm8k|5": 0, + "harness|winogrande|5": 0, + "all": 0 + }, + "config_tasks": { + "harness|drop": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|drop|3": { + "hashes": { + "hash_examples": "1d27416e8324e9a3", + "hash_full_prompts": "a5513ff9a741b385", + "hash_input_tokens": "42076f0efbb50aa6", + "hash_cont_tokens": "e94d71b235569611" + }, + "truncated": 3, + "non-truncated": 9533, + "padded": 0, + "non-padded": 9536, + "effective_few_shots": 3.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "bda342e47b5099b2", + "hash_cont_tokens": "3d85d3d3a355672a" + }, + "truncated": 0, + "non-truncated": 1319, + "padded": 0, + "non-padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "c0bedf98cb040854", + "hash_cont_tokens": "f08975ad6f2d5864" + }, + "truncated": 0, + "non-truncated": 2534, + "padded": 2432, + "non-padded": 102, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "9b4d8993161e637d", + "hash_full_prompts": "08215e527b7e60a5", + "hash_input_tokens": "a12f3e3c934bd78b", + "hash_cont_tokens": "c3af4d53d4357ccb" + }, + "total_evaluation_time_secondes": "13514.441354513168", + "truncated": 3, + "non-truncated": 13386, + "padded": 2432, + "non-padded": 10957, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/Undi95/MLewd-L2-Chat-13B/results_2023-09-18T13-38-28.135797.json b/eval-results/Undi95/MLewd-L2-Chat-13B/results_2023-09-18T13-38-28.135797.json new file mode 100644 index 0000000000000000000000000000000000000000..b63a77c7c18d817be47db331819064a119ed05db --- /dev/null +++ b/eval-results/Undi95/MLewd-L2-Chat-13B/results_2023-09-18T13-38-28.135797.json @@ -0,0 +1,1367 @@ +{ + "config_general": { + "model_name": 
"Undi95/MLewd-L2-Chat-13B", + "model_sha": "6c66622a99c1bc73498aa6a15a59da825d875310", + "model_size": "24.32 GB", + "model_dtype": "torch.float16", + "lighteval_sha": "0f318ecf002208468154899217b3ba7c6ae09374", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|arc:challenge|25": { + "acc": 0.5998293515358362, + "acc_stderr": 0.014317197787809174, + "acc_norm": 0.6203071672354948, + "acc_norm_stderr": 0.014182119866974872 + }, + "harness|hellaswag|10": { + "acc": 0.6452897829117705, + "acc_stderr": 0.004774476498238617, + "acc_norm": 0.8418641704839673, + "acc_norm_stderr": 0.003641226294167795 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.41, + "acc_stderr": 0.04943110704237103, + "acc_norm": 0.41, + "acc_norm_stderr": 0.04943110704237103 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.4962962962962963, + "acc_stderr": 0.04319223625811331, + "acc_norm": 0.4962962962962963, + "acc_norm_stderr": 0.04319223625811331 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.5789473684210527, + "acc_stderr": 0.04017901275981749, + "acc_norm": 0.5789473684210527, + "acc_norm_stderr": 0.04017901275981749 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.6, + "acc_stderr": 0.049236596391733084, + "acc_norm": 0.6, + "acc_norm_stderr": 0.049236596391733084 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.6339622641509434, + "acc_stderr": 0.029647813539365245, + "acc_norm": 0.6339622641509434, + "acc_norm_stderr": 0.029647813539365245 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.6666666666666666, + "acc_stderr": 0.03942082639927213, + "acc_norm": 0.6666666666666666, + "acc_norm_stderr": 0.03942082639927213 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.41, + "acc_stderr": 0.04943110704237102, + "acc_norm": 0.41, + "acc_norm_stderr": 0.04943110704237102 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.46, + "acc_stderr": 0.05009082659620333, + "acc_norm": 0.46, + "acc_norm_stderr": 0.05009082659620333 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.33, + "acc_stderr": 0.047258156262526045, + "acc_norm": 0.33, + "acc_norm_stderr": 0.047258156262526045 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.5549132947976878, + "acc_stderr": 0.03789401760283647, + "acc_norm": 0.5549132947976878, + "acc_norm_stderr": 0.03789401760283647 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.3333333333333333, + "acc_stderr": 0.04690650298201942, + "acc_norm": 0.3333333333333333, + "acc_norm_stderr": 0.04690650298201942 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.72, + "acc_stderr": 0.04512608598542128, + "acc_norm": 0.72, + "acc_norm_stderr": 0.04512608598542128 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.49361702127659574, + "acc_stderr": 0.032683358999363366, + "acc_norm": 0.49361702127659574, + "acc_norm_stderr": 0.032683358999363366 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.2543859649122807, + "acc_stderr": 0.04096985139843671, + "acc_norm": 0.2543859649122807, + "acc_norm_stderr": 0.04096985139843671 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.5172413793103449, + "acc_stderr": 0.04164188720169375, + "acc_norm": 0.5172413793103449, + "acc_norm_stderr": 0.04164188720169375 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.3492063492063492, + 
"acc_stderr": 0.024552292209342668, + "acc_norm": 0.3492063492063492, + "acc_norm_stderr": 0.024552292209342668 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.4523809523809524, + "acc_stderr": 0.044518079590553275, + "acc_norm": 0.4523809523809524, + "acc_norm_stderr": 0.044518079590553275 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.35, + "acc_stderr": 0.047937248544110196, + "acc_norm": 0.35, + "acc_norm_stderr": 0.047937248544110196 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.6935483870967742, + "acc_stderr": 0.026226485652553887, + "acc_norm": 0.6935483870967742, + "acc_norm_stderr": 0.026226485652553887 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.458128078817734, + "acc_stderr": 0.03505630140785741, + "acc_norm": 0.458128078817734, + "acc_norm_stderr": 0.03505630140785741 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.6, + "acc_stderr": 0.049236596391733084, + "acc_norm": 0.6, + "acc_norm_stderr": 0.049236596391733084 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.7212121212121212, + "acc_stderr": 0.035014387062967806, + "acc_norm": 0.7212121212121212, + "acc_norm_stderr": 0.035014387062967806 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.7474747474747475, + "acc_stderr": 0.030954055470365907, + "acc_norm": 0.7474747474747475, + "acc_norm_stderr": 0.030954055470365907 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.8549222797927462, + "acc_stderr": 0.02541634309630644, + "acc_norm": 0.8549222797927462, + "acc_norm_stderr": 0.02541634309630644 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.5794871794871795, + "acc_stderr": 0.02502861027671086, + "acc_norm": 0.5794871794871795, + "acc_norm_stderr": 0.02502861027671086 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.34074074074074073, + "acc_stderr": 0.028897748741131137, + "acc_norm": 0.34074074074074073, + "acc_norm_stderr": 0.028897748741131137 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.6008403361344538, + "acc_stderr": 0.03181110032413926, + "acc_norm": 0.6008403361344538, + "acc_norm_stderr": 0.03181110032413926 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.31125827814569534, + "acc_stderr": 0.03780445850526733, + "acc_norm": 0.31125827814569534, + "acc_norm_stderr": 0.03780445850526733 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.7743119266055046, + "acc_stderr": 0.017923087667803064, + "acc_norm": 0.7743119266055046, + "acc_norm_stderr": 0.017923087667803064 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.4166666666666667, + "acc_stderr": 0.03362277436608044, + "acc_norm": 0.4166666666666667, + "acc_norm_stderr": 0.03362277436608044 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.7990196078431373, + "acc_stderr": 0.02812597226565437, + "acc_norm": 0.7990196078431373, + "acc_norm_stderr": 0.02812597226565437 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.7637130801687764, + "acc_stderr": 0.02765215314415927, + "acc_norm": 0.7637130801687764, + "acc_norm_stderr": 0.02765215314415927 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.6905829596412556, + "acc_stderr": 0.03102441174057221, + "acc_norm": 0.6905829596412556, + "acc_norm_stderr": 0.03102441174057221 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.6717557251908397, + 
"acc_stderr": 0.041184385658062976, + "acc_norm": 0.6717557251908397, + "acc_norm_stderr": 0.041184385658062976 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.7272727272727273, + "acc_stderr": 0.04065578140908705, + "acc_norm": 0.7272727272727273, + "acc_norm_stderr": 0.04065578140908705 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.7962962962962963, + "acc_stderr": 0.03893542518824847, + "acc_norm": 0.7962962962962963, + "acc_norm_stderr": 0.03893542518824847 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.6871165644171779, + "acc_stderr": 0.036429145782924055, + "acc_norm": 0.6871165644171779, + "acc_norm_stderr": 0.036429145782924055 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.32142857142857145, + "acc_stderr": 0.04432804055291519, + "acc_norm": 0.32142857142857145, + "acc_norm_stderr": 0.04432804055291519 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.7378640776699029, + "acc_stderr": 0.04354631077260594, + "acc_norm": 0.7378640776699029, + "acc_norm_stderr": 0.04354631077260594 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.8247863247863247, + "acc_stderr": 0.02490443909891824, + "acc_norm": 0.8247863247863247, + "acc_norm_stderr": 0.02490443909891824 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.67, + "acc_stderr": 0.04725815626252607, + "acc_norm": 0.67, + "acc_norm_stderr": 0.04725815626252607 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.7790549169859514, + "acc_stderr": 0.014836205167333555, + "acc_norm": 0.7790549169859514, + "acc_norm_stderr": 0.014836205167333555 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.6416184971098265, + "acc_stderr": 0.025816756791584197, + "acc_norm": 0.6416184971098265, + "acc_norm_stderr": 0.025816756791584197 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.5027932960893855, + "acc_stderr": 0.016722240595491714, + "acc_norm": 0.5027932960893855, + "acc_norm_stderr": 0.016722240595491714 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.6470588235294118, + "acc_stderr": 0.027363593284684965, + "acc_norm": 0.6470588235294118, + "acc_norm_stderr": 0.027363593284684965 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.6366559485530546, + "acc_stderr": 0.02731684767419271, + "acc_norm": 0.6366559485530546, + "acc_norm_stderr": 0.02731684767419271 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.6604938271604939, + "acc_stderr": 0.026348564412011628, + "acc_norm": 0.6604938271604939, + "acc_norm_stderr": 0.026348564412011628 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.450354609929078, + "acc_stderr": 0.029680105565029036, + "acc_norm": 0.450354609929078, + "acc_norm_stderr": 0.029680105565029036 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.43741851368970014, + "acc_stderr": 0.012669813464935726, + "acc_norm": 0.43741851368970014, + "acc_norm_stderr": 0.012669813464935726 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.5698529411764706, + "acc_stderr": 0.030074971917302875, + "acc_norm": 0.5698529411764706, + "acc_norm_stderr": 0.030074971917302875 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.5915032679738562, + "acc_stderr": 0.019886221037501862, + "acc_norm": 0.5915032679738562, + "acc_norm_stderr": 0.019886221037501862 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.6272727272727273, + "acc_stderr": 0.04631381319425465, + "acc_norm": 0.6272727272727273, + 
"acc_norm_stderr": 0.04631381319425465 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.6653061224489796, + "acc_stderr": 0.030209235226242307, + "acc_norm": 0.6653061224489796, + "acc_norm_stderr": 0.030209235226242307 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.7661691542288557, + "acc_stderr": 0.029929415408348384, + "acc_norm": 0.7661691542288557, + "acc_norm_stderr": 0.029929415408348384 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.86, + "acc_stderr": 0.03487350880197769, + "acc_norm": 0.86, + "acc_norm_stderr": 0.03487350880197769 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.4578313253012048, + "acc_stderr": 0.038786267710023595, + "acc_norm": 0.4578313253012048, + "acc_norm_stderr": 0.038786267710023595 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.7953216374269005, + "acc_stderr": 0.03094445977853321, + "acc_norm": 0.7953216374269005, + "acc_norm_stderr": 0.03094445977853321 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.3806609547123623, + "mc1_stderr": 0.016997627871907926, + "mc2": 0.5283543922925904, + "mc2_stderr": 0.015514015586882196 + }, + "all": { + "acc": 0.5886983442536506, + "acc_stderr": 0.03393248935524807, + "acc_norm": 0.5923771951565636, + "acc_norm_stderr": 0.033910992268385266, + "mc1": 0.3806609547123623, + "mc1_stderr": 0.016997627871907926, + "mc2": 0.5283543922925904, + "mc2_stderr": 0.015514015586882196 + } + }, + "versions": { + "harness|arc:challenge|25": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + 
"harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "all": 0 + }, + "config_tasks": { + "harness|arc:challenge": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM 
Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task" + }, + "summary_tasks": { + "harness|arc:challenge|25": { + "hashes": { + "hash_examples": "17b0cae357c0259e", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "3722289b79076c44", + "hash_cont_tokens": "e8abf848493b50f7" + }, + "truncated": 0, + "non-truncated": 4687, + "padded": 4687, + "non-padded": 0, + "effective_few_shots": 25.0, + "num_truncated_few_shots": 0 + }, + "harness|hellaswag|10": { + "hashes": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "ececd684171f1ef2", + "hash_cont_tokens": "9fe0a5c42e1532db" + }, + "truncated": 0, + "non-truncated": 40168, + "padded": 40113, + "non-padded": 55, + "effective_few_shots": 10.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hashes": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "c54ff61ad0273dd7", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-anatomy|5": { + "hashes": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "be31a1e22aef5f90", + "hash_cont_tokens": "f11971a765cb609f" + }, + "truncated": 0, + "non-truncated": 540, + "padded": 540, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-astronomy|5": { + "hashes": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "277a7b1fad566940", + "hash_cont_tokens": "440a970fadecdc7b" + }, + "truncated": 0, + "non-truncated": 608, + "padded": 608, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-business_ethics|5": { + "hashes": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "ba552605bc116de5", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hashes": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": 
"49654f71d94b65c3", + "hash_input_tokens": "428c7563d0b98ab9", + "hash_cont_tokens": "7ecd60c25b9bfe5b" + }, + "truncated": 0, + "non-truncated": 1060, + "padded": 1060, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_biology|5": { + "hashes": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "da036601573942e2", + "hash_cont_tokens": "875cde3af7a0ee14" + }, + "truncated": 0, + "non-truncated": 576, + "padded": 576, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_chemistry|5": { + "hashes": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "94e0196d6aded13d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_computer_science|5": { + "hashes": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "6e4d0f4a8d36690b", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_mathematics|5": { + "hashes": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "614054d17109a25d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_medicine|5": { + "hashes": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "081bb2b524defd1c", + "hash_cont_tokens": "702fb6d82ff0d6ac" + }, + "truncated": 0, + "non-truncated": 692, + "padded": 692, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_physics|5": { + "hashes": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "5421d9a1af86cbd4", + "hash_cont_tokens": "f7b8097afc16a47c" + }, + "truncated": 0, + "non-truncated": 408, + "padded": 408, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-computer_security|5": { + "hashes": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "5e6b70ecb333cf18", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hashes": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "c2ef11a87264ceed", + "hash_cont_tokens": "aa0e8bc655f2f641" + }, + "truncated": 0, + "non-truncated": 940, + "padded": 940, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-econometrics|5": { + "hashes": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "ecaccd912a4c3978", + "hash_cont_tokens": "b1cc6e7e9fcd3827" + }, + "truncated": 0, + "non-truncated": 456, 
+ "padded": 456, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hashes": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "1590c84291399be8", + "hash_cont_tokens": "2425a3f084a591ef" + }, + "truncated": 0, + "non-truncated": 580, + "padded": 580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hashes": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "3269597f715b0da1", + "hash_cont_tokens": "bd87bf0c060fd925" + }, + "truncated": 0, + "non-truncated": 1512, + "padded": 1512, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-formal_logic|5": { + "hashes": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "a2800d20f3ab8d7c", + "hash_cont_tokens": "eb8932890e0605db" + }, + "truncated": 0, + "non-truncated": 504, + "padded": 504, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-global_facts|5": { + "hashes": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "94ed44b3772505ad", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_biology|5": { + "hashes": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "24423acb928db768", + "hash_cont_tokens": "1ddcb86d28cde266" + }, + "truncated": 0, + "non-truncated": 1240, + "padded": 1240, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hashes": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "831ff35c474e5cef", + "hash_cont_tokens": "176c8dcff38c5f8f" + }, + "truncated": 0, + "non-truncated": 812, + "padded": 812, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hashes": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "a20a96b44dcc5b30", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hashes": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "5002f4ac8b1562ca", + "hash_cont_tokens": "674fc454bdc5ac93" + }, + "truncated": 0, + "non-truncated": 660, + "padded": 656, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_geography|5": { + "hashes": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "7c5547c7da5bc793", + "hash_cont_tokens": "03a5012b916274ea" + }, + "truncated": 0, + "non-truncated": 792, + "padded": 792, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + 
"harness|hendrycksTest-high_school_government_and_politics|5": { + "hashes": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "f62991cb6a496b05", + "hash_cont_tokens": "873d2aab226ba1d8" + }, + "truncated": 0, + "non-truncated": 772, + "padded": 772, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hashes": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "4cef2aff6e3d59ed", + "hash_cont_tokens": "c583432ad27fcfe0" + }, + "truncated": 0, + "non-truncated": 1560, + "padded": 1560, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hashes": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "6e2577ea4082ed2b", + "hash_cont_tokens": "d7907b61bcb8c123" + }, + "truncated": 0, + "non-truncated": 1080, + "padded": 1080, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hashes": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "c5fc9aeb1079c8e4", + "hash_cont_tokens": "f47f041de50333b9" + }, + "truncated": 0, + "non-truncated": 952, + "padded": 952, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_physics|5": { + "hashes": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "555fc385cffa84ca", + "hash_cont_tokens": "0d56317b3e5eedb5" + }, + "truncated": 0, + "non-truncated": 604, + "padded": 604, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hashes": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "febd23cbf9973b7f", + "hash_cont_tokens": "09ba1243e7390c0f" + }, + "truncated": 0, + "non-truncated": 2180, + "padded": 2180, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hashes": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "400e55b56ee6fbd7", + "hash_cont_tokens": "9cc29889c3d3f77d" + }, + "truncated": 0, + "non-truncated": 864, + "padded": 864, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hashes": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "c639cce12a46ebad", + "hash_cont_tokens": "cdd0b3dc06d933e5" + }, + "truncated": 0, + "non-truncated": 816, + "padded": 816, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hashes": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "b9762065cce6f3a6", + "hash_cont_tokens": "e02816433ff28daf" + }, + "truncated": 0, + "non-truncated": 948, + "padded": 948, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_aging|5": { + "hashes": { + "hash_examples": 
"1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "541a75f071dcf579", + "hash_cont_tokens": "142a4a8a1138a214" + }, + "truncated": 0, + "non-truncated": 892, + "padded": 892, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_sexuality|5": { + "hashes": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "04269e5c5a257dd9", + "hash_cont_tokens": "bc54813e809b796d" + }, + "truncated": 0, + "non-truncated": 524, + "padded": 524, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-international_law|5": { + "hashes": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "d93ba9d9d38e4397", + "hash_cont_tokens": "8ea8c5ff76a15bca" + }, + "truncated": 0, + "non-truncated": 484, + "padded": 484, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-jurisprudence|5": { + "hashes": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "9eeaccd2698b4f5a", + "hash_cont_tokens": "e3a8cd951b6e3469" + }, + "truncated": 0, + "non-truncated": 432, + "padded": 432, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hashes": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "b4f08f544f2b7576", + "hash_cont_tokens": "3e9e0bdc248fd88a" + }, + "truncated": 0, + "non-truncated": 652, + "padded": 648, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-machine_learning|5": { + "hashes": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "900c2a51f1174b9f", + "hash_cont_tokens": "55b12fb138c6a064" + }, + "truncated": 0, + "non-truncated": 448, + "padded": 448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-management|5": { + "hashes": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "6b36efb4689c6eca", + "hash_cont_tokens": "a01d6d39a83c4597" + }, + "truncated": 0, + "non-truncated": 412, + "padded": 412, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-marketing|5": { + "hashes": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "2aaac78a0cfed47a", + "hash_cont_tokens": "6aeaed4d823c98aa" + }, + "truncated": 0, + "non-truncated": 936, + "padded": 936, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-medical_genetics|5": { + "hashes": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "886ca823b41c094a", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-miscellaneous|5": { + "hashes": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "72fd71de7675e7d0", + "hash_cont_tokens": "9b0ab02a64603081" + }, + "truncated": 0, + 
"non-truncated": 3132, + "padded": 3132, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_disputes|5": { + "hashes": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "f3ca0dd8e7a1eb09", + "hash_cont_tokens": "3b8bbe9108e55ce9" + }, + "truncated": 0, + "non-truncated": 1384, + "padded": 1354, + "non-padded": 30, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hashes": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "3e793631e951f23c", + "hash_cont_tokens": "3e9bfc0362e97330" + }, + "truncated": 0, + "non-truncated": 3580, + "padded": 3580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-nutrition|5": { + "hashes": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "59753c2144ea93af", + "hash_cont_tokens": "23b2dc6ee2da4cfc" + }, + "truncated": 0, + "non-truncated": 1224, + "padded": 1224, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-philosophy|5": { + "hashes": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "bd8d3dbed15a8c34", + "hash_cont_tokens": "9f6ff69d23a48783" + }, + "truncated": 0, + "non-truncated": 1244, + "padded": 1244, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-prehistory|5": { + "hashes": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "3573cd87facbb7c5", + "hash_cont_tokens": "d6458d743d875837" + }, + "truncated": 0, + "non-truncated": 1296, + "padded": 1296, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_accounting|5": { + "hashes": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "17e721bc1a7cbb47", + "hash_cont_tokens": "922a195f53a35662" + }, + "truncated": 0, + "non-truncated": 1128, + "padded": 1128, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_law|5": { + "hashes": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "c9f7583fff66d361", + "hash_cont_tokens": "2e590029ef41fbcd" + }, + "truncated": 0, + "non-truncated": 6136, + "padded": 6136, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_medicine|5": { + "hashes": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "40a933f829116f8d", + "hash_cont_tokens": "7cfee54dbddd5a98" + }, + "truncated": 0, + "non-truncated": 1088, + "padded": 1088, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_psychology|5": { + "hashes": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "0dfb73a8eb3f692c", + "hash_cont_tokens": "a86677b2a45c20e1" + }, + "truncated": 0, + "non-truncated": 2448, + "padded": 2448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + 
"harness|hendrycksTest-public_relations|5": { + "hashes": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "1710c6ba4c9f3cbd", + "hash_cont_tokens": "0d756ccaae031757" + }, + "truncated": 0, + "non-truncated": 440, + "padded": 440, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-security_studies|5": { + "hashes": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "32a03f1f22a6e103", + "hash_cont_tokens": "b2229bc2cfbf594b" + }, + "truncated": 0, + "non-truncated": 980, + "padded": 980, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-sociology|5": { + "hashes": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "828999f7624cbe7e", + "hash_cont_tokens": "c3a3bdfd177eed5b" + }, + "truncated": 0, + "non-truncated": 804, + "padded": 804, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hashes": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "42054621e718dbee", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-virology|5": { + "hashes": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "6c4f0aa4dc859c04", + "hash_cont_tokens": "af8b3658088cb37f" + }, + "truncated": 0, + "non-truncated": 664, + "padded": 664, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-world_religions|5": { + "hashes": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "6c75d44e092ff24f", + "hash_cont_tokens": "060118bef6de4e0a" + }, + "truncated": 0, + "non-truncated": 684, + "padded": 684, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|truthfulqa:mc|0": { + "hashes": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "2738d7ed7075faa7", + "hash_cont_tokens": "f5da56a132aab151" + }, + "truncated": 0, + "non-truncated": 9996, + "padded": 9996, + "non-padded": 0, + "effective_few_shots": 0.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "d84d18e9a963753d", + "hash_full_prompts": "12b540783521a8e6", + "hash_input_tokens": "5c73a7dce6ccf737", + "hash_cont_tokens": "71d56183130fecbd" + }, + "total_evaluation_time_secondes": "6343.98304438591", + "truncated": 0, + "non-truncated": 111019, + "padded": 110926, + "non-padded": 93, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/Undi95/MLewd-L2-Chat-13B/results_2023-11-05T00-36-15.205012.json b/eval-results/Undi95/MLewd-L2-Chat-13B/results_2023-11-05T00-36-15.205012.json new file mode 100644 index 0000000000000000000000000000000000000000..7f28d5f62faac9ab7133716db4e7e0088af57ca9 --- /dev/null +++ b/eval-results/Undi95/MLewd-L2-Chat-13B/results_2023-11-05T00-36-15.205012.json @@ -0,0 +1,107 @@ +{ + "config_general": { + "lighteval_sha": "167773f1d5d1647c60dadc31c9e731ab7dbcbbad", + "num_few_shot_default": 0, + 
"num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "", + "model_name": "Undi95/MLewd-L2-Chat-13B", + "model_sha": "399d09d9c6bc5b85fd2d4a4c1e5663c49b577bcb", + "model_dtype": "torch.float16", + "model_size": "24.32 GB" + }, + "results": { + "harness|drop|3": { + "em": 0.039953859060402684, + "em_stderr": 0.0020056958276819816, + "f1": 0.12528313758389248, + "f1_stderr": 0.0025138994037981494 + }, + "harness|gsm8k|5": { + "acc": 0.11296436694465505, + "acc_stderr": 0.008719339028833055 + }, + "harness|winogrande|5": { + "acc": 0.7742699289660616, + "acc_stderr": 0.011749626260902545 + }, + "all": { + "em": 0.039953859060402684, + "em_stderr": 0.0020056958276819816, + "f1": 0.12528313758389248, + "f1_stderr": 0.0025138994037981494, + "acc": 0.44361714795535834, + "acc_stderr": 0.010234482644867801 + } + }, + "versions": { + "all": 0, + "harness|drop|3": 1, + "harness|gsm8k|5": 0, + "harness|winogrande|5": 0 + }, + "config_tasks": { + "harness|drop": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|drop|3": { + "hashes": { + "hash_examples": "1d27416e8324e9a3", + "hash_full_prompts": "a5513ff9a741b385", + "hash_input_tokens": "42076f0efbb50aa6", + "hash_cont_tokens": "ee72d97f2485d3f3" + }, + "truncated": 3, + "non_truncated": 9533, + "padded": 0, + "non_padded": 9536, + "effective_few_shots": 3.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "bda342e47b5099b2", + "hash_cont_tokens": "8cca4053ecf05cce" + }, + "truncated": 0, + "non_truncated": 1319, + "padded": 0, + "non_padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "c0bedf98cb040854", + "hash_cont_tokens": "f08975ad6f2d5864" + }, + "truncated": 0, + "non_truncated": 1267, + "padded": 2432, + "non_padded": 102, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "9b4d8993161e637d", + "hash_full_prompts": "08215e527b7e60a5", + "hash_input_tokens": "a12f3e3c934bd78b", + "hash_cont_tokens": "bcfcfba5dd1371b8" + }, + "truncated": 3, + "non_truncated": 12119, + "padded": 2432, + "non_padded": 10957, + "num_truncated_few_shots": 0, + "total_evaluation_time_secondes": 0 + } +} \ No newline at end of file diff --git a/eval-results/Undi95/MLewd-L2-Chat-13B/results_2023-11-07T04-02-20.497765.json b/eval-results/Undi95/MLewd-L2-Chat-13B/results_2023-11-07T04-02-20.497765.json new file mode 100644 index 0000000000000000000000000000000000000000..7f28d5f62faac9ab7133716db4e7e0088af57ca9 --- /dev/null +++ b/eval-results/Undi95/MLewd-L2-Chat-13B/results_2023-11-07T04-02-20.497765.json @@ -0,0 +1,107 @@ +{ + "config_general": { + "lighteval_sha": "167773f1d5d1647c60dadc31c9e731ab7dbcbbad", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "", + "model_name": "Undi95/MLewd-L2-Chat-13B", + "model_sha": "399d09d9c6bc5b85fd2d4a4c1e5663c49b577bcb", + "model_dtype": "torch.float16", + "model_size": "24.32 GB" + }, + "results": { + "harness|drop|3": { + "em": 0.039953859060402684, + "em_stderr": 0.0020056958276819816, + "f1": 0.12528313758389248, + "f1_stderr": 0.0025138994037981494 + }, + 
"harness|gsm8k|5": { + "acc": 0.11296436694465505, + "acc_stderr": 0.008719339028833055 + }, + "harness|winogrande|5": { + "acc": 0.7742699289660616, + "acc_stderr": 0.011749626260902545 + }, + "all": { + "em": 0.039953859060402684, + "em_stderr": 0.0020056958276819816, + "f1": 0.12528313758389248, + "f1_stderr": 0.0025138994037981494, + "acc": 0.44361714795535834, + "acc_stderr": 0.010234482644867801 + } + }, + "versions": { + "all": 0, + "harness|drop|3": 1, + "harness|gsm8k|5": 0, + "harness|winogrande|5": 0 + }, + "config_tasks": { + "harness|drop": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|drop|3": { + "hashes": { + "hash_examples": "1d27416e8324e9a3", + "hash_full_prompts": "a5513ff9a741b385", + "hash_input_tokens": "42076f0efbb50aa6", + "hash_cont_tokens": "ee72d97f2485d3f3" + }, + "truncated": 3, + "non_truncated": 9533, + "padded": 0, + "non_padded": 9536, + "effective_few_shots": 3.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "bda342e47b5099b2", + "hash_cont_tokens": "8cca4053ecf05cce" + }, + "truncated": 0, + "non_truncated": 1319, + "padded": 0, + "non_padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "c0bedf98cb040854", + "hash_cont_tokens": "f08975ad6f2d5864" + }, + "truncated": 0, + "non_truncated": 1267, + "padded": 2432, + "non_padded": 102, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "9b4d8993161e637d", + "hash_full_prompts": "08215e527b7e60a5", + "hash_input_tokens": "a12f3e3c934bd78b", + "hash_cont_tokens": "bcfcfba5dd1371b8" + }, + "truncated": 3, + "non_truncated": 12119, + "padded": 2432, + "non_padded": 10957, + "num_truncated_few_shots": 0, + "total_evaluation_time_secondes": 0 + } +} \ No newline at end of file diff --git a/eval-results/Undi95/MLewd-ReMM-L2-Chat-20B-Inverted/results_2023-10-08T21-13-04.392733.json b/eval-results/Undi95/MLewd-ReMM-L2-Chat-20B-Inverted/results_2023-10-08T21-13-04.392733.json new file mode 100644 index 0000000000000000000000000000000000000000..a12d191c3ce91682e79e89b5e6d5c571d36d47bc --- /dev/null +++ b/eval-results/Undi95/MLewd-ReMM-L2-Chat-20B-Inverted/results_2023-10-08T21-13-04.392733.json @@ -0,0 +1,1367 @@ +{ + "config_general": { + "model_name": "Undi95/MLewd-ReMM-L2-Chat-20B-Inverted", + "model_sha": "b5b501b4d23ec7ab24b827f79e48b2c67e548ddb", + "model_size": "37.36 GB", + "model_dtype": "torch.float16", + "lighteval_sha": "0f318ecf002208468154899217b3ba7c6ae09374", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|arc:challenge|25": { + "acc": 0.5921501706484642, + "acc_stderr": 0.014361097288449693, + "acc_norm": 0.6168941979522184, + "acc_norm_stderr": 0.014206472661672877 + }, + "harness|hellaswag|10": { + "acc": 0.6648078072097192, + "acc_stderr": 0.004710928569985762, + "acc_norm": 0.8532164907388966, + "acc_norm_stderr": 0.003531667185235823 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.39, + "acc_stderr": 0.04902071300001975, + "acc_norm": 0.39, + "acc_norm_stderr": 0.04902071300001975 + }, + 
"harness|hendrycksTest-anatomy|5": { + "acc": 0.4740740740740741, + "acc_stderr": 0.04313531696750574, + "acc_norm": 0.4740740740740741, + "acc_norm_stderr": 0.04313531696750574 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.5394736842105263, + "acc_stderr": 0.04056242252249033, + "acc_norm": 0.5394736842105263, + "acc_norm_stderr": 0.04056242252249033 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.58, + "acc_stderr": 0.049604496374885836, + "acc_norm": 0.58, + "acc_norm_stderr": 0.049604496374885836 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.6188679245283019, + "acc_stderr": 0.029890609686286644, + "acc_norm": 0.6188679245283019, + "acc_norm_stderr": 0.029890609686286644 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.6736111111111112, + "acc_stderr": 0.03921067198982266, + "acc_norm": 0.6736111111111112, + "acc_norm_stderr": 0.03921067198982266 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.39, + "acc_stderr": 0.04902071300001975, + "acc_norm": 0.39, + "acc_norm_stderr": 0.04902071300001975 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.47, + "acc_stderr": 0.05016135580465919, + "acc_norm": 0.47, + "acc_norm_stderr": 0.05016135580465919 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.36, + "acc_stderr": 0.04824181513244218, + "acc_norm": 0.36, + "acc_norm_stderr": 0.04824181513244218 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.5549132947976878, + "acc_stderr": 0.03789401760283647, + "acc_norm": 0.5549132947976878, + "acc_norm_stderr": 0.03789401760283647 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.3333333333333333, + "acc_stderr": 0.04690650298201942, + "acc_norm": 0.3333333333333333, + "acc_norm_stderr": 0.04690650298201942 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.67, + "acc_stderr": 0.04725815626252609, + "acc_norm": 0.67, + "acc_norm_stderr": 0.04725815626252609 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.49361702127659574, + "acc_stderr": 0.032683358999363366, + "acc_norm": 0.49361702127659574, + "acc_norm_stderr": 0.032683358999363366 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.24561403508771928, + "acc_stderr": 0.04049339297748141, + "acc_norm": 0.24561403508771928, + "acc_norm_stderr": 0.04049339297748141 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.5172413793103449, + "acc_stderr": 0.04164188720169375, + "acc_norm": 0.5172413793103449, + "acc_norm_stderr": 0.04164188720169375 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.335978835978836, + "acc_stderr": 0.02432631052914915, + "acc_norm": 0.335978835978836, + "acc_norm_stderr": 0.02432631052914915 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.4126984126984127, + "acc_stderr": 0.04403438954768177, + "acc_norm": 0.4126984126984127, + "acc_norm_stderr": 0.04403438954768177 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.36, + "acc_stderr": 0.048241815132442176, + "acc_norm": 0.36, + "acc_norm_stderr": 0.048241815132442176 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.6645161290322581, + "acc_stderr": 0.02686020644472435, + "acc_norm": 0.6645161290322581, + "acc_norm_stderr": 0.02686020644472435 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.4876847290640394, + "acc_stderr": 0.035169204442208966, + "acc_norm": 0.4876847290640394, + "acc_norm_stderr": 0.035169204442208966 + }, + 
"harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.58, + "acc_stderr": 0.049604496374885836, + "acc_norm": 0.58, + "acc_norm_stderr": 0.049604496374885836 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.7212121212121212, + "acc_stderr": 0.035014387062967806, + "acc_norm": 0.7212121212121212, + "acc_norm_stderr": 0.035014387062967806 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.7222222222222222, + "acc_stderr": 0.031911782267135466, + "acc_norm": 0.7222222222222222, + "acc_norm_stderr": 0.031911782267135466 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.8186528497409327, + "acc_stderr": 0.027807032360686088, + "acc_norm": 0.8186528497409327, + "acc_norm_stderr": 0.027807032360686088 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.558974358974359, + "acc_stderr": 0.025174048384000745, + "acc_norm": 0.558974358974359, + "acc_norm_stderr": 0.025174048384000745 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.34444444444444444, + "acc_stderr": 0.028972648884844267, + "acc_norm": 0.34444444444444444, + "acc_norm_stderr": 0.028972648884844267 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.6596638655462185, + "acc_stderr": 0.030778057422931673, + "acc_norm": 0.6596638655462185, + "acc_norm_stderr": 0.030778057422931673 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.37748344370860926, + "acc_stderr": 0.03958027231121569, + "acc_norm": 0.37748344370860926, + "acc_norm_stderr": 0.03958027231121569 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.7467889908256881, + "acc_stderr": 0.01864407304137504, + "acc_norm": 0.7467889908256881, + "acc_norm_stderr": 0.01864407304137504 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.4722222222222222, + "acc_stderr": 0.0340470532865388, + "acc_norm": 0.4722222222222222, + "acc_norm_stderr": 0.0340470532865388 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.7843137254901961, + "acc_stderr": 0.028867431449849313, + "acc_norm": 0.7843137254901961, + "acc_norm_stderr": 0.028867431449849313 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.7637130801687764, + "acc_stderr": 0.02765215314415927, + "acc_norm": 0.7637130801687764, + "acc_norm_stderr": 0.02765215314415927 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.7040358744394619, + "acc_stderr": 0.030636591348699813, + "acc_norm": 0.7040358744394619, + "acc_norm_stderr": 0.030636591348699813 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.6335877862595419, + "acc_stderr": 0.04225875451969637, + "acc_norm": 0.6335877862595419, + "acc_norm_stderr": 0.04225875451969637 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.743801652892562, + "acc_stderr": 0.03984979653302872, + "acc_norm": 0.743801652892562, + "acc_norm_stderr": 0.03984979653302872 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.7314814814814815, + "acc_stderr": 0.042844679680521934, + "acc_norm": 0.7314814814814815, + "acc_norm_stderr": 0.042844679680521934 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.6932515337423313, + "acc_stderr": 0.03623089915724146, + "acc_norm": 0.6932515337423313, + "acc_norm_stderr": 0.03623089915724146 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.2857142857142857, + "acc_stderr": 0.04287858751340455, + "acc_norm": 0.2857142857142857, + "acc_norm_stderr": 
0.04287858751340455 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.6990291262135923, + "acc_stderr": 0.04541609446503948, + "acc_norm": 0.6990291262135923, + "acc_norm_stderr": 0.04541609446503948 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.8333333333333334, + "acc_stderr": 0.024414947304543678, + "acc_norm": 0.8333333333333334, + "acc_norm_stderr": 0.024414947304543678 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.58, + "acc_stderr": 0.049604496374885836, + "acc_norm": 0.58, + "acc_norm_stderr": 0.049604496374885836 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.7726692209450831, + "acc_stderr": 0.014987270640946002, + "acc_norm": 0.7726692209450831, + "acc_norm_stderr": 0.014987270640946002 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.6705202312138728, + "acc_stderr": 0.025305258131879706, + "acc_norm": 0.6705202312138728, + "acc_norm_stderr": 0.025305258131879706 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.4312849162011173, + "acc_stderr": 0.016563829399047703, + "acc_norm": 0.4312849162011173, + "acc_norm_stderr": 0.016563829399047703 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.630718954248366, + "acc_stderr": 0.02763417668960266, + "acc_norm": 0.630718954248366, + "acc_norm_stderr": 0.02763417668960266 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.6752411575562701, + "acc_stderr": 0.026596782287697043, + "acc_norm": 0.6752411575562701, + "acc_norm_stderr": 0.026596782287697043 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.6851851851851852, + "acc_stderr": 0.025842248700902168, + "acc_norm": 0.6851851851851852, + "acc_norm_stderr": 0.025842248700902168 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.4574468085106383, + "acc_stderr": 0.02971928127223685, + "acc_norm": 0.4574468085106383, + "acc_norm_stderr": 0.02971928127223685 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.45697522816166886, + "acc_stderr": 0.012722869501611419, + "acc_norm": 0.45697522816166886, + "acc_norm_stderr": 0.012722869501611419 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.5661764705882353, + "acc_stderr": 0.030105636570016633, + "acc_norm": 0.5661764705882353, + "acc_norm_stderr": 0.030105636570016633 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.6241830065359477, + "acc_stderr": 0.01959402113657744, + "acc_norm": 0.6241830065359477, + "acc_norm_stderr": 0.01959402113657744 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.6090909090909091, + "acc_stderr": 0.04673752333670239, + "acc_norm": 0.6090909090909091, + "acc_norm_stderr": 0.04673752333670239 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.6489795918367347, + "acc_stderr": 0.03055531675557364, + "acc_norm": 0.6489795918367347, + "acc_norm_stderr": 0.03055531675557364 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.746268656716418, + "acc_stderr": 0.030769444967296018, + "acc_norm": 0.746268656716418, + "acc_norm_stderr": 0.030769444967296018 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.87, + "acc_stderr": 0.03379976689896309, + "acc_norm": 0.87, + "acc_norm_stderr": 0.03379976689896309 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.43373493975903615, + "acc_stderr": 0.03858158940685517, + "acc_norm": 0.43373493975903615, + "acc_norm_stderr": 0.03858158940685517 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.7543859649122807, + "acc_stderr": 
0.03301405946987249, + "acc_norm": 0.7543859649122807, + "acc_norm_stderr": 0.03301405946987249 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.3733170134638923, + "mc1_stderr": 0.01693237055757063, + "mc2": 0.5376513807870049, + "mc2_stderr": 0.015911441950703046 + }, + "all": { + "acc": 0.5816164336009417, + "acc_stderr": 0.03420587695783267, + "acc_norm": 0.5852291914116693, + "acc_norm_stderr": 0.034183268720349165, + "mc1": 0.3733170134638923, + "mc1_stderr": 0.01693237055757063, + "mc2": 0.5376513807870049, + "mc2_stderr": 0.015911441950703046 + } + }, + "versions": { + "harness|arc:challenge|25": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "all": 0 + }, + 
"config_tasks": { + "harness|arc:challenge": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + 
"harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task" + }, + "summary_tasks": { + "harness|arc:challenge|25": { + "hashes": { + "hash_examples": "17b0cae357c0259e", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "3722289b79076c44", + "hash_cont_tokens": "e8abf848493b50f7" + }, + "truncated": 0, + "non-truncated": 4687, + "padded": 4687, + "non-padded": 0, + "effective_few_shots": 25.0, + "num_truncated_few_shots": 0 + }, + "harness|hellaswag|10": { + "hashes": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "ececd684171f1ef2", + "hash_cont_tokens": "9fe0a5c42e1532db" + }, + "truncated": 0, + "non-truncated": 40168, + "padded": 40113, + "non-padded": 55, + "effective_few_shots": 10.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hashes": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "c54ff61ad0273dd7", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-anatomy|5": { + "hashes": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "be31a1e22aef5f90", + "hash_cont_tokens": "f11971a765cb609f" + }, + "truncated": 0, + "non-truncated": 540, + "padded": 540, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-astronomy|5": { + "hashes": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "277a7b1fad566940", + "hash_cont_tokens": "440a970fadecdc7b" + }, + "truncated": 0, + "non-truncated": 608, + "padded": 608, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-business_ethics|5": { + "hashes": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "ba552605bc116de5", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hashes": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "428c7563d0b98ab9", + "hash_cont_tokens": "7ecd60c25b9bfe5b" + }, + "truncated": 0, + "non-truncated": 1060, + "padded": 1060, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_biology|5": { + "hashes": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "da036601573942e2", + "hash_cont_tokens": "875cde3af7a0ee14" + }, + "truncated": 0, + "non-truncated": 576, + "padded": 576, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_chemistry|5": { + "hashes": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "94e0196d6aded13d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 
400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_computer_science|5": { + "hashes": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "6e4d0f4a8d36690b", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_mathematics|5": { + "hashes": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "614054d17109a25d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_medicine|5": { + "hashes": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "081bb2b524defd1c", + "hash_cont_tokens": "702fb6d82ff0d6ac" + }, + "truncated": 0, + "non-truncated": 692, + "padded": 692, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_physics|5": { + "hashes": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "5421d9a1af86cbd4", + "hash_cont_tokens": "f7b8097afc16a47c" + }, + "truncated": 0, + "non-truncated": 408, + "padded": 408, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-computer_security|5": { + "hashes": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "5e6b70ecb333cf18", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hashes": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "c2ef11a87264ceed", + "hash_cont_tokens": "aa0e8bc655f2f641" + }, + "truncated": 0, + "non-truncated": 940, + "padded": 940, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-econometrics|5": { + "hashes": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "ecaccd912a4c3978", + "hash_cont_tokens": "b1cc6e7e9fcd3827" + }, + "truncated": 0, + "non-truncated": 456, + "padded": 456, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hashes": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "1590c84291399be8", + "hash_cont_tokens": "2425a3f084a591ef" + }, + "truncated": 0, + "non-truncated": 580, + "padded": 580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hashes": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "3269597f715b0da1", + "hash_cont_tokens": "bd87bf0c060fd925" + }, + "truncated": 0, + "non-truncated": 1512, + "padded": 1512, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + 
"harness|hendrycksTest-formal_logic|5": { + "hashes": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "a2800d20f3ab8d7c", + "hash_cont_tokens": "eb8932890e0605db" + }, + "truncated": 0, + "non-truncated": 504, + "padded": 504, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-global_facts|5": { + "hashes": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "94ed44b3772505ad", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_biology|5": { + "hashes": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "24423acb928db768", + "hash_cont_tokens": "1ddcb86d28cde266" + }, + "truncated": 0, + "non-truncated": 1240, + "padded": 1240, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hashes": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "831ff35c474e5cef", + "hash_cont_tokens": "176c8dcff38c5f8f" + }, + "truncated": 0, + "non-truncated": 812, + "padded": 812, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hashes": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "a20a96b44dcc5b30", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hashes": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "5002f4ac8b1562ca", + "hash_cont_tokens": "674fc454bdc5ac93" + }, + "truncated": 0, + "non-truncated": 660, + "padded": 656, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_geography|5": { + "hashes": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "7c5547c7da5bc793", + "hash_cont_tokens": "03a5012b916274ea" + }, + "truncated": 0, + "non-truncated": 792, + "padded": 792, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hashes": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "f62991cb6a496b05", + "hash_cont_tokens": "873d2aab226ba1d8" + }, + "truncated": 0, + "non-truncated": 772, + "padded": 772, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hashes": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "4cef2aff6e3d59ed", + "hash_cont_tokens": "c583432ad27fcfe0" + }, + "truncated": 0, + "non-truncated": 1560, + "padded": 1560, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hashes": { + "hash_examples": 
"1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "6e2577ea4082ed2b", + "hash_cont_tokens": "d7907b61bcb8c123" + }, + "truncated": 0, + "non-truncated": 1080, + "padded": 1080, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hashes": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "c5fc9aeb1079c8e4", + "hash_cont_tokens": "f47f041de50333b9" + }, + "truncated": 0, + "non-truncated": 952, + "padded": 952, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_physics|5": { + "hashes": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "555fc385cffa84ca", + "hash_cont_tokens": "0d56317b3e5eedb5" + }, + "truncated": 0, + "non-truncated": 604, + "padded": 604, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hashes": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "febd23cbf9973b7f", + "hash_cont_tokens": "09ba1243e7390c0f" + }, + "truncated": 0, + "non-truncated": 2180, + "padded": 2180, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hashes": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "400e55b56ee6fbd7", + "hash_cont_tokens": "9cc29889c3d3f77d" + }, + "truncated": 0, + "non-truncated": 864, + "padded": 864, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hashes": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "c639cce12a46ebad", + "hash_cont_tokens": "cdd0b3dc06d933e5" + }, + "truncated": 0, + "non-truncated": 816, + "padded": 816, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hashes": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "b9762065cce6f3a6", + "hash_cont_tokens": "e02816433ff28daf" + }, + "truncated": 0, + "non-truncated": 948, + "padded": 948, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_aging|5": { + "hashes": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "541a75f071dcf579", + "hash_cont_tokens": "142a4a8a1138a214" + }, + "truncated": 0, + "non-truncated": 892, + "padded": 892, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_sexuality|5": { + "hashes": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "04269e5c5a257dd9", + "hash_cont_tokens": "bc54813e809b796d" + }, + "truncated": 0, + "non-truncated": 524, + "padded": 524, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-international_law|5": { + "hashes": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "d93ba9d9d38e4397", + 
"hash_cont_tokens": "8ea8c5ff76a15bca" + }, + "truncated": 0, + "non-truncated": 484, + "padded": 484, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-jurisprudence|5": { + "hashes": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "9eeaccd2698b4f5a", + "hash_cont_tokens": "e3a8cd951b6e3469" + }, + "truncated": 0, + "non-truncated": 432, + "padded": 432, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hashes": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "b4f08f544f2b7576", + "hash_cont_tokens": "3e9e0bdc248fd88a" + }, + "truncated": 0, + "non-truncated": 652, + "padded": 648, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-machine_learning|5": { + "hashes": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "900c2a51f1174b9f", + "hash_cont_tokens": "55b12fb138c6a064" + }, + "truncated": 0, + "non-truncated": 448, + "padded": 448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-management|5": { + "hashes": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "6b36efb4689c6eca", + "hash_cont_tokens": "a01d6d39a83c4597" + }, + "truncated": 0, + "non-truncated": 412, + "padded": 412, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-marketing|5": { + "hashes": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "2aaac78a0cfed47a", + "hash_cont_tokens": "6aeaed4d823c98aa" + }, + "truncated": 0, + "non-truncated": 936, + "padded": 936, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-medical_genetics|5": { + "hashes": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "886ca823b41c094a", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-miscellaneous|5": { + "hashes": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "72fd71de7675e7d0", + "hash_cont_tokens": "9b0ab02a64603081" + }, + "truncated": 0, + "non-truncated": 3132, + "padded": 3132, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_disputes|5": { + "hashes": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "f3ca0dd8e7a1eb09", + "hash_cont_tokens": "3b8bbe9108e55ce9" + }, + "truncated": 0, + "non-truncated": 1384, + "padded": 1354, + "non-padded": 30, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hashes": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "3e793631e951f23c", + "hash_cont_tokens": "3e9bfc0362e97330" + }, + "truncated": 0, + "non-truncated": 3580, + "padded": 3580, + "non-padded": 0, + "effective_few_shots": 5.0, + 
"num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-nutrition|5": { + "hashes": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "59753c2144ea93af", + "hash_cont_tokens": "23b2dc6ee2da4cfc" + }, + "truncated": 0, + "non-truncated": 1224, + "padded": 1224, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-philosophy|5": { + "hashes": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "bd8d3dbed15a8c34", + "hash_cont_tokens": "9f6ff69d23a48783" + }, + "truncated": 0, + "non-truncated": 1244, + "padded": 1244, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-prehistory|5": { + "hashes": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "3573cd87facbb7c5", + "hash_cont_tokens": "d6458d743d875837" + }, + "truncated": 0, + "non-truncated": 1296, + "padded": 1296, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_accounting|5": { + "hashes": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "17e721bc1a7cbb47", + "hash_cont_tokens": "922a195f53a35662" + }, + "truncated": 0, + "non-truncated": 1128, + "padded": 1128, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_law|5": { + "hashes": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "c9f7583fff66d361", + "hash_cont_tokens": "2e590029ef41fbcd" + }, + "truncated": 0, + "non-truncated": 6136, + "padded": 6136, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_medicine|5": { + "hashes": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "40a933f829116f8d", + "hash_cont_tokens": "7cfee54dbddd5a98" + }, + "truncated": 0, + "non-truncated": 1088, + "padded": 1088, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_psychology|5": { + "hashes": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "0dfb73a8eb3f692c", + "hash_cont_tokens": "a86677b2a45c20e1" + }, + "truncated": 0, + "non-truncated": 2448, + "padded": 2448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-public_relations|5": { + "hashes": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "1710c6ba4c9f3cbd", + "hash_cont_tokens": "0d756ccaae031757" + }, + "truncated": 0, + "non-truncated": 440, + "padded": 440, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-security_studies|5": { + "hashes": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "32a03f1f22a6e103", + "hash_cont_tokens": "b2229bc2cfbf594b" + }, + "truncated": 0, + "non-truncated": 980, + "padded": 980, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-sociology|5": { + "hashes": { + "hash_examples": "e7959df87dea8672", + 
"hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "828999f7624cbe7e", + "hash_cont_tokens": "c3a3bdfd177eed5b" + }, + "truncated": 0, + "non-truncated": 804, + "padded": 804, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hashes": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "42054621e718dbee", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-virology|5": { + "hashes": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "6c4f0aa4dc859c04", + "hash_cont_tokens": "af8b3658088cb37f" + }, + "truncated": 0, + "non-truncated": 664, + "padded": 664, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-world_religions|5": { + "hashes": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "6c75d44e092ff24f", + "hash_cont_tokens": "060118bef6de4e0a" + }, + "truncated": 0, + "non-truncated": 684, + "padded": 684, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|truthfulqa:mc|0": { + "hashes": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "2738d7ed7075faa7", + "hash_cont_tokens": "f5da56a132aab151" + }, + "truncated": 0, + "non-truncated": 9996, + "padded": 9996, + "non-padded": 0, + "effective_few_shots": 0.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "d84d18e9a963753d", + "hash_full_prompts": "12b540783521a8e6", + "hash_input_tokens": "5c73a7dce6ccf737", + "hash_cont_tokens": "71d56183130fecbd" + }, + "total_evaluation_time_secondes": "9968.122472763062", + "truncated": 0, + "non-truncated": 111019, + "padded": 110926, + "non-padded": 93, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/Undi95/MLewd-ReMM-L2-Chat-20B-Inverted/results_2023-10-29T11-23-30.940403.json b/eval-results/Undi95/MLewd-ReMM-L2-Chat-20B-Inverted/results_2023-10-29T11-23-30.940403.json new file mode 100644 index 0000000000000000000000000000000000000000..4080312479663235b68c8fd4788fb42fc36b6e23 --- /dev/null +++ b/eval-results/Undi95/MLewd-ReMM-L2-Chat-20B-Inverted/results_2023-10-29T11-23-30.940403.json @@ -0,0 +1,107 @@ +{ + "config_general": { + "model_name": "Undi95/MLewd-ReMM-L2-Chat-20B-Inverted", + "model_sha": "b5b501b4d23ec7ab24b827f79e48b2c67e548ddb", + "model_size": "37.36 GB", + "model_dtype": "torch.float16", + "lighteval_sha": "0f318ecf002208468154899217b3ba7c6ae09374", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|drop|3": { + "em": 0.04079278523489933, + "em_stderr": 0.0020257579367794474, + "f1": 0.12161703020134187, + "f1_stderr": 0.002493984929248759 + }, + "harness|gsm8k|5": { + "acc": 0.09097801364670205, + "acc_stderr": 0.007921322844013656 + }, + "harness|winogrande|5": { + "acc": 0.7561168113654302, + "acc_stderr": 0.01206892327890819 + }, + "all": { + "em": 0.04079278523489933, + "em_stderr": 0.0020257579367794474, + "f1": 0.12161703020134187, + "f1_stderr": 0.002493984929248759, + "acc": 0.4235474125060661, + 
"acc_stderr": 0.009995123061460923 + } + }, + "versions": { + "harness|drop|3": 1, + "harness|gsm8k|5": 0, + "harness|winogrande|5": 0, + "all": 0 + }, + "config_tasks": { + "harness|drop": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|drop|3": { + "hashes": { + "hash_examples": "1d27416e8324e9a3", + "hash_full_prompts": "a5513ff9a741b385", + "hash_input_tokens": "42076f0efbb50aa6", + "hash_cont_tokens": "00c9be5761c2b0fd" + }, + "truncated": 3, + "non-truncated": 9533, + "padded": 0, + "non-padded": 9536, + "effective_few_shots": 3.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "bda342e47b5099b2", + "hash_cont_tokens": "182159c4f34d2477" + }, + "truncated": 0, + "non-truncated": 1319, + "padded": 0, + "non-padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "c0bedf98cb040854", + "hash_cont_tokens": "f08975ad6f2d5864" + }, + "truncated": 0, + "non-truncated": 2534, + "padded": 2432, + "non-padded": 102, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "9b4d8993161e637d", + "hash_full_prompts": "08215e527b7e60a5", + "hash_input_tokens": "a12f3e3c934bd78b", + "hash_cont_tokens": "bff1e24645b3990c" + }, + "total_evaluation_time_secondes": "22116.086834669113", + "truncated": 3, + "non-truncated": 13386, + "padded": 2432, + "non-padded": 10957, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/Undi95/MLewd-ReMM-L2-Chat-20B/results_2023-10-03T13-01-09.823619.json b/eval-results/Undi95/MLewd-ReMM-L2-Chat-20B/results_2023-10-03T13-01-09.823619.json new file mode 100644 index 0000000000000000000000000000000000000000..7f1c651a4fc3a3648693bdc9ed49fb6e5f768600 --- /dev/null +++ b/eval-results/Undi95/MLewd-ReMM-L2-Chat-20B/results_2023-10-03T13-01-09.823619.json @@ -0,0 +1,1367 @@ +{ + "config_general": { + "model_name": "Undi95/MLewd-ReMM-L2-Chat-20B", + "model_sha": "cda06630a1d8173541431e5ce8bc17dcfaa37e5e", + "model_size": "37.36 GB", + "model_dtype": "torch.float16", + "lighteval_sha": "0f318ecf002208468154899217b3ba7c6ae09374", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|arc:challenge|25": { + "acc": 0.5964163822525598, + "acc_stderr": 0.014337158914268447, + "acc_norm": 0.6245733788395904, + "acc_norm_stderr": 0.014150631435111728 + }, + "harness|hellaswag|10": { + "acc": 0.6690898227444733, + "acc_stderr": 0.004695791340502876, + "acc_norm": 0.8562039434375622, + "acc_norm_stderr": 0.0035016571073867085 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.36, + "acc_stderr": 0.04824181513244219, + "acc_norm": 0.36, + "acc_norm_stderr": 0.04824181513244219 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.5037037037037037, + "acc_stderr": 0.04319223625811331, + "acc_norm": 0.5037037037037037, + "acc_norm_stderr": 0.04319223625811331 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.5723684210526315, + "acc_stderr": 0.04026097083296563, + "acc_norm": 0.5723684210526315, + "acc_norm_stderr": 0.04026097083296563 + }, + 
"harness|hendrycksTest-business_ethics|5": { + "acc": 0.63, + "acc_stderr": 0.048523658709391, + "acc_norm": 0.63, + "acc_norm_stderr": 0.048523658709391 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.6226415094339622, + "acc_stderr": 0.029832808114796005, + "acc_norm": 0.6226415094339622, + "acc_norm_stderr": 0.029832808114796005 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.6458333333333334, + "acc_stderr": 0.039994111357535424, + "acc_norm": 0.6458333333333334, + "acc_norm_stderr": 0.039994111357535424 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.43, + "acc_stderr": 0.04975698519562428, + "acc_norm": 0.43, + "acc_norm_stderr": 0.04975698519562428 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.45, + "acc_stderr": 0.049999999999999996, + "acc_norm": 0.45, + "acc_norm_stderr": 0.049999999999999996 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.3, + "acc_stderr": 0.046056618647183814, + "acc_norm": 0.3, + "acc_norm_stderr": 0.046056618647183814 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.5895953757225434, + "acc_stderr": 0.03750757044895537, + "acc_norm": 0.5895953757225434, + "acc_norm_stderr": 0.03750757044895537 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.3333333333333333, + "acc_stderr": 0.04690650298201942, + "acc_norm": 0.3333333333333333, + "acc_norm_stderr": 0.04690650298201942 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.69, + "acc_stderr": 0.04648231987117316, + "acc_norm": 0.69, + "acc_norm_stderr": 0.04648231987117316 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.48936170212765956, + "acc_stderr": 0.03267862331014063, + "acc_norm": 0.48936170212765956, + "acc_norm_stderr": 0.03267862331014063 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.2719298245614035, + "acc_stderr": 0.04185774424022056, + "acc_norm": 0.2719298245614035, + "acc_norm_stderr": 0.04185774424022056 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.5310344827586206, + "acc_stderr": 0.04158632762097828, + "acc_norm": 0.5310344827586206, + "acc_norm_stderr": 0.04158632762097828 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.34656084656084657, + "acc_stderr": 0.024508777521028424, + "acc_norm": 0.34656084656084657, + "acc_norm_stderr": 0.024508777521028424 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.3968253968253968, + "acc_stderr": 0.04375888492727061, + "acc_norm": 0.3968253968253968, + "acc_norm_stderr": 0.04375888492727061 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.36, + "acc_stderr": 0.04824181513244218, + "acc_norm": 0.36, + "acc_norm_stderr": 0.04824181513244218 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.7064516129032258, + "acc_stderr": 0.025906087021319295, + "acc_norm": 0.7064516129032258, + "acc_norm_stderr": 0.025906087021319295 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.49261083743842365, + "acc_stderr": 0.03517603540361008, + "acc_norm": 0.49261083743842365, + "acc_norm_stderr": 0.03517603540361008 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.61, + "acc_stderr": 0.04902071300001975, + "acc_norm": 0.61, + "acc_norm_stderr": 0.04902071300001975 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.7696969696969697, + "acc_stderr": 0.03287666758603489, + "acc_norm": 0.7696969696969697, + "acc_norm_stderr": 0.03287666758603489 + }, + 
"harness|hendrycksTest-high_school_geography|5": { + "acc": 0.7626262626262627, + "acc_stderr": 0.030313710538198906, + "acc_norm": 0.7626262626262627, + "acc_norm_stderr": 0.030313710538198906 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.8393782383419689, + "acc_stderr": 0.026499057701397436, + "acc_norm": 0.8393782383419689, + "acc_norm_stderr": 0.026499057701397436 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.5948717948717949, + "acc_stderr": 0.024890471769938145, + "acc_norm": 0.5948717948717949, + "acc_norm_stderr": 0.024890471769938145 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.337037037037037, + "acc_stderr": 0.028820884666253252, + "acc_norm": 0.337037037037037, + "acc_norm_stderr": 0.028820884666253252 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.6386554621848739, + "acc_stderr": 0.031204691225150016, + "acc_norm": 0.6386554621848739, + "acc_norm_stderr": 0.031204691225150016 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.3841059602649007, + "acc_stderr": 0.03971301814719197, + "acc_norm": 0.3841059602649007, + "acc_norm_stderr": 0.03971301814719197 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.7596330275229358, + "acc_stderr": 0.01832060732096407, + "acc_norm": 0.7596330275229358, + "acc_norm_stderr": 0.01832060732096407 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.46296296296296297, + "acc_stderr": 0.03400603625538271, + "acc_norm": 0.46296296296296297, + "acc_norm_stderr": 0.03400603625538271 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.803921568627451, + "acc_stderr": 0.027865942286639318, + "acc_norm": 0.803921568627451, + "acc_norm_stderr": 0.027865942286639318 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.7637130801687764, + "acc_stderr": 0.027652153144159267, + "acc_norm": 0.7637130801687764, + "acc_norm_stderr": 0.027652153144159267 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.6995515695067265, + "acc_stderr": 0.03076935200822915, + "acc_norm": 0.6995515695067265, + "acc_norm_stderr": 0.03076935200822915 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.7022900763358778, + "acc_stderr": 0.040103589424622034, + "acc_norm": 0.7022900763358778, + "acc_norm_stderr": 0.040103589424622034 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.7272727272727273, + "acc_stderr": 0.04065578140908705, + "acc_norm": 0.7272727272727273, + "acc_norm_stderr": 0.04065578140908705 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.7592592592592593, + "acc_stderr": 0.04133119440243839, + "acc_norm": 0.7592592592592593, + "acc_norm_stderr": 0.04133119440243839 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.7055214723926381, + "acc_stderr": 0.03581165790474082, + "acc_norm": 0.7055214723926381, + "acc_norm_stderr": 0.03581165790474082 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.3125, + "acc_stderr": 0.043994650575715215, + "acc_norm": 0.3125, + "acc_norm_stderr": 0.043994650575715215 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.7184466019417476, + "acc_stderr": 0.044532548363264673, + "acc_norm": 0.7184466019417476, + "acc_norm_stderr": 0.044532548363264673 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.8461538461538461, + "acc_stderr": 0.023636873317489294, + "acc_norm": 0.8461538461538461, + "acc_norm_stderr": 0.023636873317489294 + }, + 
"harness|hendrycksTest-medical_genetics|5": { + "acc": 0.59, + "acc_stderr": 0.04943110704237102, + "acc_norm": 0.59, + "acc_norm_stderr": 0.04943110704237102 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.7586206896551724, + "acc_stderr": 0.015302380123542115, + "acc_norm": 0.7586206896551724, + "acc_norm_stderr": 0.015302380123542115 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.6705202312138728, + "acc_stderr": 0.025305258131879706, + "acc_norm": 0.6705202312138728, + "acc_norm_stderr": 0.025305258131879706 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.5083798882681564, + "acc_stderr": 0.016720152794672486, + "acc_norm": 0.5083798882681564, + "acc_norm_stderr": 0.016720152794672486 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.6437908496732027, + "acc_stderr": 0.027420477662629242, + "acc_norm": 0.6437908496732027, + "acc_norm_stderr": 0.027420477662629242 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.6784565916398714, + "acc_stderr": 0.026527724079528872, + "acc_norm": 0.6784565916398714, + "acc_norm_stderr": 0.026527724079528872 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.6728395061728395, + "acc_stderr": 0.026105673861409818, + "acc_norm": 0.6728395061728395, + "acc_norm_stderr": 0.026105673861409818 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.46099290780141844, + "acc_stderr": 0.02973659252642443, + "acc_norm": 0.46099290780141844, + "acc_norm_stderr": 0.02973659252642443 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.46284224250325945, + "acc_stderr": 0.012734923579532074, + "acc_norm": 0.46284224250325945, + "acc_norm_stderr": 0.012734923579532074 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.5955882352941176, + "acc_stderr": 0.029812630701569743, + "acc_norm": 0.5955882352941176, + "acc_norm_stderr": 0.029812630701569743 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.6176470588235294, + "acc_stderr": 0.01965992249362335, + "acc_norm": 0.6176470588235294, + "acc_norm_stderr": 0.01965992249362335 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.6363636363636364, + "acc_stderr": 0.046075820907199756, + "acc_norm": 0.6363636363636364, + "acc_norm_stderr": 0.046075820907199756 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.6857142857142857, + "acc_stderr": 0.029719329422417475, + "acc_norm": 0.6857142857142857, + "acc_norm_stderr": 0.029719329422417475 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.746268656716418, + "acc_stderr": 0.030769444967296018, + "acc_norm": 0.746268656716418, + "acc_norm_stderr": 0.030769444967296018 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.84, + "acc_stderr": 0.03684529491774708, + "acc_norm": 0.84, + "acc_norm_stderr": 0.03684529491774708 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.463855421686747, + "acc_stderr": 0.03882310850890593, + "acc_norm": 0.463855421686747, + "acc_norm_stderr": 0.03882310850890593 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.7543859649122807, + "acc_stderr": 0.03301405946987249, + "acc_norm": 0.7543859649122807, + "acc_norm_stderr": 0.03301405946987249 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.3929008567931457, + "mc1_stderr": 0.017097248285233065, + "mc2": 0.5562951743828177, + "mc2_stderr": 0.015862974807699288 + }, + "all": { + "acc": 0.5927393333620284, + "acc_stderr": 0.03399197195287319, + "acc_norm": 0.5963879963667763, + "acc_norm_stderr": 
0.03396857090690247, + "mc1": 0.3929008567931457, + "mc1_stderr": 0.017097248285233065, + "mc2": 0.5562951743828177, + "mc2_stderr": 0.015862974807699288 + } + }, + "versions": { + "harness|arc:challenge|25": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "all": 0 + }, + "config_tasks": { + "harness|arc:challenge": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness 
task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task" + }, + "summary_tasks": { + "harness|arc:challenge|25": { + "hashes": { + "hash_examples": "17b0cae357c0259e", + "hash_full_prompts": 
"045cbb916e5145c6", + "hash_input_tokens": "3722289b79076c44", + "hash_cont_tokens": "e8abf848493b50f7" + }, + "truncated": 0, + "non-truncated": 4687, + "padded": 4687, + "non-padded": 0, + "effective_few_shots": 25.0, + "num_truncated_few_shots": 0 + }, + "harness|hellaswag|10": { + "hashes": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "ececd684171f1ef2", + "hash_cont_tokens": "9fe0a5c42e1532db" + }, + "truncated": 0, + "non-truncated": 40168, + "padded": 40113, + "non-padded": 55, + "effective_few_shots": 10.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hashes": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "c54ff61ad0273dd7", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-anatomy|5": { + "hashes": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "be31a1e22aef5f90", + "hash_cont_tokens": "f11971a765cb609f" + }, + "truncated": 0, + "non-truncated": 540, + "padded": 540, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-astronomy|5": { + "hashes": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "277a7b1fad566940", + "hash_cont_tokens": "440a970fadecdc7b" + }, + "truncated": 0, + "non-truncated": 608, + "padded": 608, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-business_ethics|5": { + "hashes": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "ba552605bc116de5", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hashes": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "428c7563d0b98ab9", + "hash_cont_tokens": "7ecd60c25b9bfe5b" + }, + "truncated": 0, + "non-truncated": 1060, + "padded": 1060, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_biology|5": { + "hashes": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "da036601573942e2", + "hash_cont_tokens": "875cde3af7a0ee14" + }, + "truncated": 0, + "non-truncated": 576, + "padded": 576, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_chemistry|5": { + "hashes": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "94e0196d6aded13d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_computer_science|5": { + "hashes": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "6e4d0f4a8d36690b", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + 
"non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_mathematics|5": { + "hashes": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "614054d17109a25d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_medicine|5": { + "hashes": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "081bb2b524defd1c", + "hash_cont_tokens": "702fb6d82ff0d6ac" + }, + "truncated": 0, + "non-truncated": 692, + "padded": 692, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_physics|5": { + "hashes": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "5421d9a1af86cbd4", + "hash_cont_tokens": "f7b8097afc16a47c" + }, + "truncated": 0, + "non-truncated": 408, + "padded": 408, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-computer_security|5": { + "hashes": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "5e6b70ecb333cf18", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hashes": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "c2ef11a87264ceed", + "hash_cont_tokens": "aa0e8bc655f2f641" + }, + "truncated": 0, + "non-truncated": 940, + "padded": 940, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-econometrics|5": { + "hashes": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "ecaccd912a4c3978", + "hash_cont_tokens": "b1cc6e7e9fcd3827" + }, + "truncated": 0, + "non-truncated": 456, + "padded": 456, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hashes": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "1590c84291399be8", + "hash_cont_tokens": "2425a3f084a591ef" + }, + "truncated": 0, + "non-truncated": 580, + "padded": 580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hashes": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "3269597f715b0da1", + "hash_cont_tokens": "bd87bf0c060fd925" + }, + "truncated": 0, + "non-truncated": 1512, + "padded": 1512, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-formal_logic|5": { + "hashes": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "a2800d20f3ab8d7c", + "hash_cont_tokens": "eb8932890e0605db" + }, + "truncated": 0, + "non-truncated": 504, + "padded": 504, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-global_facts|5": { + "hashes": { + 
"hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "94ed44b3772505ad", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_biology|5": { + "hashes": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "24423acb928db768", + "hash_cont_tokens": "1ddcb86d28cde266" + }, + "truncated": 0, + "non-truncated": 1240, + "padded": 1240, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hashes": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "831ff35c474e5cef", + "hash_cont_tokens": "176c8dcff38c5f8f" + }, + "truncated": 0, + "non-truncated": 812, + "padded": 812, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hashes": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "a20a96b44dcc5b30", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hashes": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "5002f4ac8b1562ca", + "hash_cont_tokens": "674fc454bdc5ac93" + }, + "truncated": 0, + "non-truncated": 660, + "padded": 656, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_geography|5": { + "hashes": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "7c5547c7da5bc793", + "hash_cont_tokens": "03a5012b916274ea" + }, + "truncated": 0, + "non-truncated": 792, + "padded": 792, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hashes": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "f62991cb6a496b05", + "hash_cont_tokens": "873d2aab226ba1d8" + }, + "truncated": 0, + "non-truncated": 772, + "padded": 772, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hashes": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "4cef2aff6e3d59ed", + "hash_cont_tokens": "c583432ad27fcfe0" + }, + "truncated": 0, + "non-truncated": 1560, + "padded": 1560, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hashes": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "6e2577ea4082ed2b", + "hash_cont_tokens": "d7907b61bcb8c123" + }, + "truncated": 0, + "non-truncated": 1080, + "padded": 1080, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hashes": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": 
"bfc393381298609e", + "hash_input_tokens": "c5fc9aeb1079c8e4", + "hash_cont_tokens": "f47f041de50333b9" + }, + "truncated": 0, + "non-truncated": 952, + "padded": 952, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_physics|5": { + "hashes": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "555fc385cffa84ca", + "hash_cont_tokens": "0d56317b3e5eedb5" + }, + "truncated": 0, + "non-truncated": 604, + "padded": 604, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hashes": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "febd23cbf9973b7f", + "hash_cont_tokens": "09ba1243e7390c0f" + }, + "truncated": 0, + "non-truncated": 2180, + "padded": 2180, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hashes": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "400e55b56ee6fbd7", + "hash_cont_tokens": "9cc29889c3d3f77d" + }, + "truncated": 0, + "non-truncated": 864, + "padded": 864, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hashes": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "c639cce12a46ebad", + "hash_cont_tokens": "cdd0b3dc06d933e5" + }, + "truncated": 0, + "non-truncated": 816, + "padded": 816, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hashes": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "b9762065cce6f3a6", + "hash_cont_tokens": "e02816433ff28daf" + }, + "truncated": 0, + "non-truncated": 948, + "padded": 948, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_aging|5": { + "hashes": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "541a75f071dcf579", + "hash_cont_tokens": "142a4a8a1138a214" + }, + "truncated": 0, + "non-truncated": 892, + "padded": 892, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_sexuality|5": { + "hashes": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "04269e5c5a257dd9", + "hash_cont_tokens": "bc54813e809b796d" + }, + "truncated": 0, + "non-truncated": 524, + "padded": 524, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-international_law|5": { + "hashes": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "d93ba9d9d38e4397", + "hash_cont_tokens": "8ea8c5ff76a15bca" + }, + "truncated": 0, + "non-truncated": 484, + "padded": 484, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-jurisprudence|5": { + "hashes": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "9eeaccd2698b4f5a", + "hash_cont_tokens": "e3a8cd951b6e3469" + }, + "truncated": 0, + 
"non-truncated": 432, + "padded": 432, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hashes": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "b4f08f544f2b7576", + "hash_cont_tokens": "3e9e0bdc248fd88a" + }, + "truncated": 0, + "non-truncated": 652, + "padded": 648, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-machine_learning|5": { + "hashes": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "900c2a51f1174b9f", + "hash_cont_tokens": "55b12fb138c6a064" + }, + "truncated": 0, + "non-truncated": 448, + "padded": 448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-management|5": { + "hashes": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "6b36efb4689c6eca", + "hash_cont_tokens": "a01d6d39a83c4597" + }, + "truncated": 0, + "non-truncated": 412, + "padded": 412, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-marketing|5": { + "hashes": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "2aaac78a0cfed47a", + "hash_cont_tokens": "6aeaed4d823c98aa" + }, + "truncated": 0, + "non-truncated": 936, + "padded": 936, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-medical_genetics|5": { + "hashes": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "886ca823b41c094a", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-miscellaneous|5": { + "hashes": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "72fd71de7675e7d0", + "hash_cont_tokens": "9b0ab02a64603081" + }, + "truncated": 0, + "non-truncated": 3132, + "padded": 3132, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_disputes|5": { + "hashes": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "f3ca0dd8e7a1eb09", + "hash_cont_tokens": "3b8bbe9108e55ce9" + }, + "truncated": 0, + "non-truncated": 1384, + "padded": 1354, + "non-padded": 30, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hashes": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "3e793631e951f23c", + "hash_cont_tokens": "3e9bfc0362e97330" + }, + "truncated": 0, + "non-truncated": 3580, + "padded": 3580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-nutrition|5": { + "hashes": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "59753c2144ea93af", + "hash_cont_tokens": "23b2dc6ee2da4cfc" + }, + "truncated": 0, + "non-truncated": 1224, + "padded": 1224, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-philosophy|5": { + "hashes": 
{ + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "bd8d3dbed15a8c34", + "hash_cont_tokens": "9f6ff69d23a48783" + }, + "truncated": 0, + "non-truncated": 1244, + "padded": 1244, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-prehistory|5": { + "hashes": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "3573cd87facbb7c5", + "hash_cont_tokens": "d6458d743d875837" + }, + "truncated": 0, + "non-truncated": 1296, + "padded": 1296, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_accounting|5": { + "hashes": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "17e721bc1a7cbb47", + "hash_cont_tokens": "922a195f53a35662" + }, + "truncated": 0, + "non-truncated": 1128, + "padded": 1128, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_law|5": { + "hashes": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "c9f7583fff66d361", + "hash_cont_tokens": "2e590029ef41fbcd" + }, + "truncated": 0, + "non-truncated": 6136, + "padded": 6136, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_medicine|5": { + "hashes": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "40a933f829116f8d", + "hash_cont_tokens": "7cfee54dbddd5a98" + }, + "truncated": 0, + "non-truncated": 1088, + "padded": 1088, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_psychology|5": { + "hashes": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "0dfb73a8eb3f692c", + "hash_cont_tokens": "a86677b2a45c20e1" + }, + "truncated": 0, + "non-truncated": 2448, + "padded": 2448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-public_relations|5": { + "hashes": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "1710c6ba4c9f3cbd", + "hash_cont_tokens": "0d756ccaae031757" + }, + "truncated": 0, + "non-truncated": 440, + "padded": 440, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-security_studies|5": { + "hashes": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "32a03f1f22a6e103", + "hash_cont_tokens": "b2229bc2cfbf594b" + }, + "truncated": 0, + "non-truncated": 980, + "padded": 980, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-sociology|5": { + "hashes": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "828999f7624cbe7e", + "hash_cont_tokens": "c3a3bdfd177eed5b" + }, + "truncated": 0, + "non-truncated": 804, + "padded": 804, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hashes": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "42054621e718dbee", + 
"hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-virology|5": { + "hashes": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "6c4f0aa4dc859c04", + "hash_cont_tokens": "af8b3658088cb37f" + }, + "truncated": 0, + "non-truncated": 664, + "padded": 664, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-world_religions|5": { + "hashes": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "6c75d44e092ff24f", + "hash_cont_tokens": "060118bef6de4e0a" + }, + "truncated": 0, + "non-truncated": 684, + "padded": 684, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|truthfulqa:mc|0": { + "hashes": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "2738d7ed7075faa7", + "hash_cont_tokens": "f5da56a132aab151" + }, + "truncated": 0, + "non-truncated": 9996, + "padded": 9996, + "non-padded": 0, + "effective_few_shots": 0.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "d84d18e9a963753d", + "hash_full_prompts": "12b540783521a8e6", + "hash_input_tokens": "5c73a7dce6ccf737", + "hash_cont_tokens": "71d56183130fecbd" + }, + "total_evaluation_time_secondes": "9973.10211277008", + "truncated": 0, + "non-truncated": 111019, + "padded": 110926, + "non-padded": 93, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/Undi95/MLewd-ReMM-L2-Chat-20B/results_2023-10-23T21-41-03.684290.json b/eval-results/Undi95/MLewd-ReMM-L2-Chat-20B/results_2023-10-23T21-41-03.684290.json new file mode 100644 index 0000000000000000000000000000000000000000..1bcd56a24f9d24962d85ed96b519950f56b6a017 --- /dev/null +++ b/eval-results/Undi95/MLewd-ReMM-L2-Chat-20B/results_2023-10-23T21-41-03.684290.json @@ -0,0 +1,107 @@ +{ + "config_general": { + "model_name": "Undi95/MLewd-ReMM-L2-Chat-20B", + "model_sha": "cda06630a1d8173541431e5ce8bc17dcfaa37e5e", + "model_size": "37.36 GB", + "model_dtype": "torch.float16", + "lighteval_sha": "0f318ecf002208468154899217b3ba7c6ae09374", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|drop|3": { + "em": 0.1294043624161074, + "em_stderr": 0.0034373389026090095, + "f1": 0.22332843959731472, + "f1_stderr": 0.003630049548732814 + }, + "harness|gsm8k|5": { + "acc": 0.10917361637604246, + "acc_stderr": 0.008590089300511142 + }, + "harness|winogrande|5": { + "acc": 0.7719021310181531, + "acc_stderr": 0.011793015817663595 + }, + "all": { + "em": 0.1294043624161074, + "em_stderr": 0.0034373389026090095, + "f1": 0.22332843959731472, + "f1_stderr": 0.003630049548732814, + "acc": 0.4405378736970978, + "acc_stderr": 0.01019155255908737 + } + }, + "versions": { + "harness|drop|3": 1, + "harness|gsm8k|5": 0, + "harness|winogrande|5": 0, + "all": 0 + }, + "config_tasks": { + "harness|drop": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|drop|3": { + "hashes": { + "hash_examples": "1d27416e8324e9a3", + "hash_full_prompts": "a5513ff9a741b385", + "hash_input_tokens": "42076f0efbb50aa6", + "hash_cont_tokens": 
"da2a3613e1dbb58b" + }, + "truncated": 3, + "non-truncated": 9533, + "padded": 0, + "non-padded": 9536, + "effective_few_shots": 3.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "bda342e47b5099b2", + "hash_cont_tokens": "d3fd7c2065d1bbd1" + }, + "truncated": 0, + "non-truncated": 1319, + "padded": 0, + "non-padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "c0bedf98cb040854", + "hash_cont_tokens": "f08975ad6f2d5864" + }, + "truncated": 0, + "non-truncated": 2534, + "padded": 2432, + "non-padded": 102, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "9b4d8993161e637d", + "hash_full_prompts": "08215e527b7e60a5", + "hash_input_tokens": "a12f3e3c934bd78b", + "hash_cont_tokens": "c63f935e35e8d4da" + }, + "total_evaluation_time_secondes": "20584.010545015335", + "truncated": 3, + "non-truncated": 13386, + "padded": 2432, + "non-padded": 10957, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/Undi95/MLewd-v2.4-13B/results_2023-10-04T08-57-01.780513.json b/eval-results/Undi95/MLewd-v2.4-13B/results_2023-10-04T08-57-01.780513.json new file mode 100644 index 0000000000000000000000000000000000000000..5516a8a39dfd06165db8c94eb998f5a5b071eaf5 --- /dev/null +++ b/eval-results/Undi95/MLewd-v2.4-13B/results_2023-10-04T08-57-01.780513.json @@ -0,0 +1,1367 @@ +{ + "config_general": { + "model_name": "Undi95/MLewd-v2.4-13B", + "model_sha": "6f6ec6024ee054020e49fd96f149919692848f0b", + "model_size": "24.32 GB", + "model_dtype": "torch.float16", + "lighteval_sha": "0f318ecf002208468154899217b3ba7c6ae09374", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|arc:challenge|25": { + "acc": 0.5870307167235495, + "acc_stderr": 0.014388344935398326, + "acc_norm": 0.6168941979522184, + "acc_norm_stderr": 0.014206472661672877 + }, + "harness|hellaswag|10": { + "acc": 0.6404102768372834, + "acc_stderr": 0.004788994060654276, + "acc_norm": 0.8382792272455686, + "acc_norm_stderr": 0.0036744197993536687 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.32, + "acc_stderr": 0.046882617226215034, + "acc_norm": 0.32, + "acc_norm_stderr": 0.046882617226215034 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.5333333333333333, + "acc_stderr": 0.043097329010363554, + "acc_norm": 0.5333333333333333, + "acc_norm_stderr": 0.043097329010363554 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.5263157894736842, + "acc_stderr": 0.04063302731486671, + "acc_norm": 0.5263157894736842, + "acc_norm_stderr": 0.04063302731486671 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.56, + "acc_stderr": 0.04988876515698589, + "acc_norm": 0.56, + "acc_norm_stderr": 0.04988876515698589 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.5584905660377358, + "acc_stderr": 0.030561590426731833, + "acc_norm": 0.5584905660377358, + "acc_norm_stderr": 0.030561590426731833 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.5763888888888888, + "acc_stderr": 0.041321250197233685, + "acc_norm": 0.5763888888888888, + "acc_norm_stderr": 0.041321250197233685 + }, + 
"harness|hendrycksTest-college_chemistry|5": { + "acc": 0.35, + "acc_stderr": 0.04793724854411019, + "acc_norm": 0.35, + "acc_norm_stderr": 0.04793724854411019 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.45, + "acc_stderr": 0.04999999999999999, + "acc_norm": 0.45, + "acc_norm_stderr": 0.04999999999999999 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.33, + "acc_stderr": 0.04725815626252604, + "acc_norm": 0.33, + "acc_norm_stderr": 0.04725815626252604 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.5202312138728323, + "acc_stderr": 0.03809342081273957, + "acc_norm": 0.5202312138728323, + "acc_norm_stderr": 0.03809342081273957 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.21568627450980393, + "acc_stderr": 0.040925639582376556, + "acc_norm": 0.21568627450980393, + "acc_norm_stderr": 0.040925639582376556 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.7, + "acc_stderr": 0.046056618647183814, + "acc_norm": 0.7, + "acc_norm_stderr": 0.046056618647183814 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.4808510638297872, + "acc_stderr": 0.03266204299064678, + "acc_norm": 0.4808510638297872, + "acc_norm_stderr": 0.03266204299064678 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.2631578947368421, + "acc_stderr": 0.04142439719489362, + "acc_norm": 0.2631578947368421, + "acc_norm_stderr": 0.04142439719489362 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.5172413793103449, + "acc_stderr": 0.04164188720169375, + "acc_norm": 0.5172413793103449, + "acc_norm_stderr": 0.04164188720169375 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.30423280423280424, + "acc_stderr": 0.023695415009463087, + "acc_norm": 0.30423280423280424, + "acc_norm_stderr": 0.023695415009463087 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.373015873015873, + "acc_stderr": 0.04325506042017086, + "acc_norm": 0.373015873015873, + "acc_norm_stderr": 0.04325506042017086 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.39, + "acc_stderr": 0.04902071300001974, + "acc_norm": 0.39, + "acc_norm_stderr": 0.04902071300001974 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.6290322580645161, + "acc_stderr": 0.027480541887953593, + "acc_norm": 0.6290322580645161, + "acc_norm_stderr": 0.027480541887953593 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.3891625615763547, + "acc_stderr": 0.03430462416103871, + "acc_norm": 0.3891625615763547, + "acc_norm_stderr": 0.03430462416103871 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.5, + "acc_stderr": 0.050251890762960605, + "acc_norm": 0.5, + "acc_norm_stderr": 0.050251890762960605 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.6848484848484848, + "acc_stderr": 0.0362773057502241, + "acc_norm": 0.6848484848484848, + "acc_norm_stderr": 0.0362773057502241 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.6868686868686869, + "acc_stderr": 0.033042050878136525, + "acc_norm": 0.6868686868686869, + "acc_norm_stderr": 0.033042050878136525 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.7823834196891192, + "acc_stderr": 0.02977866303775296, + "acc_norm": 0.7823834196891192, + "acc_norm_stderr": 0.02977866303775296 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.5256410256410257, + "acc_stderr": 0.025317649726448663, + "acc_norm": 
0.5256410256410257, + "acc_norm_stderr": 0.025317649726448663 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.31851851851851853, + "acc_stderr": 0.02840653309060846, + "acc_norm": 0.31851851851851853, + "acc_norm_stderr": 0.02840653309060846 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.5672268907563025, + "acc_stderr": 0.03218358107742613, + "acc_norm": 0.5672268907563025, + "acc_norm_stderr": 0.03218358107742613 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.31125827814569534, + "acc_stderr": 0.03780445850526733, + "acc_norm": 0.31125827814569534, + "acc_norm_stderr": 0.03780445850526733 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.7321100917431193, + "acc_stderr": 0.018987462257978652, + "acc_norm": 0.7321100917431193, + "acc_norm_stderr": 0.018987462257978652 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.3472222222222222, + "acc_stderr": 0.032468872436376486, + "acc_norm": 0.3472222222222222, + "acc_norm_stderr": 0.032468872436376486 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.7647058823529411, + "acc_stderr": 0.029771775228145635, + "acc_norm": 0.7647058823529411, + "acc_norm_stderr": 0.029771775228145635 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.7426160337552743, + "acc_stderr": 0.028458820991460302, + "acc_norm": 0.7426160337552743, + "acc_norm_stderr": 0.028458820991460302 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.6816143497757847, + "acc_stderr": 0.03126580522513713, + "acc_norm": 0.6816143497757847, + "acc_norm_stderr": 0.03126580522513713 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.648854961832061, + "acc_stderr": 0.04186445163013751, + "acc_norm": 0.648854961832061, + "acc_norm_stderr": 0.04186445163013751 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.743801652892562, + "acc_stderr": 0.039849796533028725, + "acc_norm": 0.743801652892562, + "acc_norm_stderr": 0.039849796533028725 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.7222222222222222, + "acc_stderr": 0.04330043749650742, + "acc_norm": 0.7222222222222222, + "acc_norm_stderr": 0.04330043749650742 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.6687116564417178, + "acc_stderr": 0.03697983910025588, + "acc_norm": 0.6687116564417178, + "acc_norm_stderr": 0.03697983910025588 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.3482142857142857, + "acc_stderr": 0.045218299028335865, + "acc_norm": 0.3482142857142857, + "acc_norm_stderr": 0.045218299028335865 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.6699029126213593, + "acc_stderr": 0.0465614711001235, + "acc_norm": 0.6699029126213593, + "acc_norm_stderr": 0.0465614711001235 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.7991452991452992, + "acc_stderr": 0.026246772946890484, + "acc_norm": 0.7991452991452992, + "acc_norm_stderr": 0.026246772946890484 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.58, + "acc_stderr": 0.049604496374885836, + "acc_norm": 0.58, + "acc_norm_stderr": 0.049604496374885836 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.7573435504469987, + "acc_stderr": 0.015329888940899847, + "acc_norm": 0.7573435504469987, + "acc_norm_stderr": 0.015329888940899847 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.6127167630057804, + "acc_stderr": 0.026226158605124658, + "acc_norm": 0.6127167630057804, + "acc_norm_stderr": 
0.026226158605124658 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.376536312849162, + "acc_stderr": 0.016204672385106603, + "acc_norm": 0.376536312849162, + "acc_norm_stderr": 0.016204672385106603 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.6045751633986928, + "acc_stderr": 0.02799672318063145, + "acc_norm": 0.6045751633986928, + "acc_norm_stderr": 0.02799672318063145 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.6334405144694534, + "acc_stderr": 0.027368078243971635, + "acc_norm": 0.6334405144694534, + "acc_norm_stderr": 0.027368078243971635 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.6419753086419753, + "acc_stderr": 0.026675611926037096, + "acc_norm": 0.6419753086419753, + "acc_norm_stderr": 0.026675611926037096 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.4326241134751773, + "acc_stderr": 0.02955545423677886, + "acc_norm": 0.4326241134751773, + "acc_norm_stderr": 0.02955545423677886 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.4165580182529335, + "acc_stderr": 0.012591153245057388, + "acc_norm": 0.4165580182529335, + "acc_norm_stderr": 0.012591153245057388 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.5073529411764706, + "acc_stderr": 0.030369552523902173, + "acc_norm": 0.5073529411764706, + "acc_norm_stderr": 0.030369552523902173 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.5686274509803921, + "acc_stderr": 0.020036393768352628, + "acc_norm": 0.5686274509803921, + "acc_norm_stderr": 0.020036393768352628 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.6727272727272727, + "acc_stderr": 0.0449429086625209, + "acc_norm": 0.6727272727272727, + "acc_norm_stderr": 0.0449429086625209 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.6163265306122448, + "acc_stderr": 0.031130880396235943, + "acc_norm": 0.6163265306122448, + "acc_norm_stderr": 0.031130880396235943 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.6965174129353234, + "acc_stderr": 0.032510068164586174, + "acc_norm": 0.6965174129353234, + "acc_norm_stderr": 0.032510068164586174 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.82, + "acc_stderr": 0.038612291966536934, + "acc_norm": 0.82, + "acc_norm_stderr": 0.038612291966536934 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.4578313253012048, + "acc_stderr": 0.0387862677100236, + "acc_norm": 0.4578313253012048, + "acc_norm_stderr": 0.0387862677100236 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.7777777777777778, + "acc_stderr": 0.031885780176863984, + "acc_norm": 0.7777777777777778, + "acc_norm_stderr": 0.031885780176863984 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.37454100367197063, + "mc1_stderr": 0.01694353512840532, + "mc2": 0.5333727097453667, + "mc2_stderr": 0.01556908807672099 + }, + "all": { + "acc": 0.5531081055200199, + "acc_stderr": 0.03439288137888107, + "acc_norm": 0.5569679772426801, + "acc_norm_stderr": 0.03437090770879589, + "mc1": 0.37454100367197063, + "mc1_stderr": 0.01694353512840532, + "mc2": 0.5333727097453667, + "mc2_stderr": 0.01556908807672099 + } + }, + "versions": { + "harness|arc:challenge|25": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + 
"harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "all": 0 + }, + "config_tasks": { + "harness|arc:challenge": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness 
task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task" + }, + "summary_tasks": { + "harness|arc:challenge|25": { + "hashes": { + "hash_examples": "17b0cae357c0259e", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "3722289b79076c44", + "hash_cont_tokens": "e8abf848493b50f7" + }, + "truncated": 0, + "non-truncated": 4687, + "padded": 4687, + "non-padded": 0, + "effective_few_shots": 25.0, + "num_truncated_few_shots": 0 + }, + "harness|hellaswag|10": { + "hashes": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "ececd684171f1ef2", + "hash_cont_tokens": "9fe0a5c42e1532db" + }, + "truncated": 0, + "non-truncated": 40168, + "padded": 
40113, + "non-padded": 55, + "effective_few_shots": 10.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hashes": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "c54ff61ad0273dd7", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-anatomy|5": { + "hashes": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "be31a1e22aef5f90", + "hash_cont_tokens": "f11971a765cb609f" + }, + "truncated": 0, + "non-truncated": 540, + "padded": 540, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-astronomy|5": { + "hashes": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "277a7b1fad566940", + "hash_cont_tokens": "440a970fadecdc7b" + }, + "truncated": 0, + "non-truncated": 608, + "padded": 608, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-business_ethics|5": { + "hashes": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "ba552605bc116de5", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hashes": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "428c7563d0b98ab9", + "hash_cont_tokens": "7ecd60c25b9bfe5b" + }, + "truncated": 0, + "non-truncated": 1060, + "padded": 1060, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_biology|5": { + "hashes": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "da036601573942e2", + "hash_cont_tokens": "875cde3af7a0ee14" + }, + "truncated": 0, + "non-truncated": 576, + "padded": 576, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_chemistry|5": { + "hashes": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "94e0196d6aded13d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_computer_science|5": { + "hashes": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "6e4d0f4a8d36690b", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_mathematics|5": { + "hashes": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "614054d17109a25d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_medicine|5": { + "hashes": { + 
"hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "081bb2b524defd1c", + "hash_cont_tokens": "702fb6d82ff0d6ac" + }, + "truncated": 0, + "non-truncated": 692, + "padded": 692, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_physics|5": { + "hashes": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "5421d9a1af86cbd4", + "hash_cont_tokens": "f7b8097afc16a47c" + }, + "truncated": 0, + "non-truncated": 408, + "padded": 408, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-computer_security|5": { + "hashes": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "5e6b70ecb333cf18", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hashes": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "c2ef11a87264ceed", + "hash_cont_tokens": "aa0e8bc655f2f641" + }, + "truncated": 0, + "non-truncated": 940, + "padded": 940, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-econometrics|5": { + "hashes": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "ecaccd912a4c3978", + "hash_cont_tokens": "b1cc6e7e9fcd3827" + }, + "truncated": 0, + "non-truncated": 456, + "padded": 456, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hashes": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "1590c84291399be8", + "hash_cont_tokens": "2425a3f084a591ef" + }, + "truncated": 0, + "non-truncated": 580, + "padded": 580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hashes": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "3269597f715b0da1", + "hash_cont_tokens": "bd87bf0c060fd925" + }, + "truncated": 0, + "non-truncated": 1512, + "padded": 1512, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-formal_logic|5": { + "hashes": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "a2800d20f3ab8d7c", + "hash_cont_tokens": "eb8932890e0605db" + }, + "truncated": 0, + "non-truncated": 504, + "padded": 504, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-global_facts|5": { + "hashes": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "94ed44b3772505ad", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_biology|5": { + "hashes": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "24423acb928db768", + "hash_cont_tokens": 
"1ddcb86d28cde266" + }, + "truncated": 0, + "non-truncated": 1240, + "padded": 1240, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hashes": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "831ff35c474e5cef", + "hash_cont_tokens": "176c8dcff38c5f8f" + }, + "truncated": 0, + "non-truncated": 812, + "padded": 812, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hashes": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "a20a96b44dcc5b30", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hashes": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "5002f4ac8b1562ca", + "hash_cont_tokens": "674fc454bdc5ac93" + }, + "truncated": 0, + "non-truncated": 660, + "padded": 656, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_geography|5": { + "hashes": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "7c5547c7da5bc793", + "hash_cont_tokens": "03a5012b916274ea" + }, + "truncated": 0, + "non-truncated": 792, + "padded": 792, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hashes": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "f62991cb6a496b05", + "hash_cont_tokens": "873d2aab226ba1d8" + }, + "truncated": 0, + "non-truncated": 772, + "padded": 772, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hashes": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "4cef2aff6e3d59ed", + "hash_cont_tokens": "c583432ad27fcfe0" + }, + "truncated": 0, + "non-truncated": 1560, + "padded": 1560, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hashes": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "6e2577ea4082ed2b", + "hash_cont_tokens": "d7907b61bcb8c123" + }, + "truncated": 0, + "non-truncated": 1080, + "padded": 1080, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hashes": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "c5fc9aeb1079c8e4", + "hash_cont_tokens": "f47f041de50333b9" + }, + "truncated": 0, + "non-truncated": 952, + "padded": 952, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_physics|5": { + "hashes": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "555fc385cffa84ca", + "hash_cont_tokens": "0d56317b3e5eedb5" + }, + "truncated": 0, + "non-truncated": 604, + 
"padded": 604, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hashes": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "febd23cbf9973b7f", + "hash_cont_tokens": "09ba1243e7390c0f" + }, + "truncated": 0, + "non-truncated": 2180, + "padded": 2180, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hashes": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "400e55b56ee6fbd7", + "hash_cont_tokens": "9cc29889c3d3f77d" + }, + "truncated": 0, + "non-truncated": 864, + "padded": 864, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hashes": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "c639cce12a46ebad", + "hash_cont_tokens": "cdd0b3dc06d933e5" + }, + "truncated": 0, + "non-truncated": 816, + "padded": 816, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hashes": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "b9762065cce6f3a6", + "hash_cont_tokens": "e02816433ff28daf" + }, + "truncated": 0, + "non-truncated": 948, + "padded": 948, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_aging|5": { + "hashes": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "541a75f071dcf579", + "hash_cont_tokens": "142a4a8a1138a214" + }, + "truncated": 0, + "non-truncated": 892, + "padded": 892, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_sexuality|5": { + "hashes": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "04269e5c5a257dd9", + "hash_cont_tokens": "bc54813e809b796d" + }, + "truncated": 0, + "non-truncated": 524, + "padded": 524, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-international_law|5": { + "hashes": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "d93ba9d9d38e4397", + "hash_cont_tokens": "8ea8c5ff76a15bca" + }, + "truncated": 0, + "non-truncated": 484, + "padded": 484, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-jurisprudence|5": { + "hashes": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "9eeaccd2698b4f5a", + "hash_cont_tokens": "e3a8cd951b6e3469" + }, + "truncated": 0, + "non-truncated": 432, + "padded": 432, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hashes": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "b4f08f544f2b7576", + "hash_cont_tokens": "3e9e0bdc248fd88a" + }, + "truncated": 0, + "non-truncated": 652, + "padded": 648, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + 
"harness|hendrycksTest-machine_learning|5": { + "hashes": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "900c2a51f1174b9f", + "hash_cont_tokens": "55b12fb138c6a064" + }, + "truncated": 0, + "non-truncated": 448, + "padded": 448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-management|5": { + "hashes": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "6b36efb4689c6eca", + "hash_cont_tokens": "a01d6d39a83c4597" + }, + "truncated": 0, + "non-truncated": 412, + "padded": 412, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-marketing|5": { + "hashes": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "2aaac78a0cfed47a", + "hash_cont_tokens": "6aeaed4d823c98aa" + }, + "truncated": 0, + "non-truncated": 936, + "padded": 936, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-medical_genetics|5": { + "hashes": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "886ca823b41c094a", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-miscellaneous|5": { + "hashes": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "72fd71de7675e7d0", + "hash_cont_tokens": "9b0ab02a64603081" + }, + "truncated": 0, + "non-truncated": 3132, + "padded": 3132, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_disputes|5": { + "hashes": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "f3ca0dd8e7a1eb09", + "hash_cont_tokens": "3b8bbe9108e55ce9" + }, + "truncated": 0, + "non-truncated": 1384, + "padded": 1354, + "non-padded": 30, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hashes": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "3e793631e951f23c", + "hash_cont_tokens": "3e9bfc0362e97330" + }, + "truncated": 0, + "non-truncated": 3580, + "padded": 3580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-nutrition|5": { + "hashes": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "59753c2144ea93af", + "hash_cont_tokens": "23b2dc6ee2da4cfc" + }, + "truncated": 0, + "non-truncated": 1224, + "padded": 1224, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-philosophy|5": { + "hashes": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "bd8d3dbed15a8c34", + "hash_cont_tokens": "9f6ff69d23a48783" + }, + "truncated": 0, + "non-truncated": 1244, + "padded": 1244, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-prehistory|5": { + "hashes": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "3573cd87facbb7c5", + 
"hash_cont_tokens": "d6458d743d875837" + }, + "truncated": 0, + "non-truncated": 1296, + "padded": 1296, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_accounting|5": { + "hashes": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "17e721bc1a7cbb47", + "hash_cont_tokens": "922a195f53a35662" + }, + "truncated": 0, + "non-truncated": 1128, + "padded": 1128, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_law|5": { + "hashes": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "c9f7583fff66d361", + "hash_cont_tokens": "2e590029ef41fbcd" + }, + "truncated": 0, + "non-truncated": 6136, + "padded": 6136, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_medicine|5": { + "hashes": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "40a933f829116f8d", + "hash_cont_tokens": "7cfee54dbddd5a98" + }, + "truncated": 0, + "non-truncated": 1088, + "padded": 1088, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_psychology|5": { + "hashes": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "0dfb73a8eb3f692c", + "hash_cont_tokens": "a86677b2a45c20e1" + }, + "truncated": 0, + "non-truncated": 2448, + "padded": 2448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-public_relations|5": { + "hashes": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "1710c6ba4c9f3cbd", + "hash_cont_tokens": "0d756ccaae031757" + }, + "truncated": 0, + "non-truncated": 440, + "padded": 440, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-security_studies|5": { + "hashes": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "32a03f1f22a6e103", + "hash_cont_tokens": "b2229bc2cfbf594b" + }, + "truncated": 0, + "non-truncated": 980, + "padded": 980, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-sociology|5": { + "hashes": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "828999f7624cbe7e", + "hash_cont_tokens": "c3a3bdfd177eed5b" + }, + "truncated": 0, + "non-truncated": 804, + "padded": 804, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hashes": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "42054621e718dbee", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-virology|5": { + "hashes": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "6c4f0aa4dc859c04", + "hash_cont_tokens": "af8b3658088cb37f" + }, + "truncated": 0, + "non-truncated": 664, + "padded": 664, + "non-padded": 0, + "effective_few_shots": 
5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-world_religions|5": { + "hashes": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "6c75d44e092ff24f", + "hash_cont_tokens": "060118bef6de4e0a" + }, + "truncated": 0, + "non-truncated": 684, + "padded": 684, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|truthfulqa:mc|0": { + "hashes": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "2738d7ed7075faa7", + "hash_cont_tokens": "f5da56a132aab151" + }, + "truncated": 0, + "non-truncated": 9996, + "padded": 9996, + "non-padded": 0, + "effective_few_shots": 0.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "d84d18e9a963753d", + "hash_full_prompts": "12b540783521a8e6", + "hash_input_tokens": "5c73a7dce6ccf737", + "hash_cont_tokens": "71d56183130fecbd" + }, + "total_evaluation_time_secondes": "6385.4990401268005", + "truncated": 0, + "non-truncated": 111019, + "padded": 110926, + "non-padded": 93, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/Undi95/MLewd-v2.4-13B/results_2023-11-05T06-43-46.123528.json b/eval-results/Undi95/MLewd-v2.4-13B/results_2023-11-05T06-43-46.123528.json new file mode 100644 index 0000000000000000000000000000000000000000..8bbedc1f9182944d8032d1006657ad96e0bcef62 --- /dev/null +++ b/eval-results/Undi95/MLewd-v2.4-13B/results_2023-11-05T06-43-46.123528.json @@ -0,0 +1,107 @@ +{ + "config_general": { + "lighteval_sha": "167773f1d5d1647c60dadc31c9e731ab7dbcbbad", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "", + "model_name": "Undi95/MLewd-v2.4-13B", + "model_sha": "6f6ec6024ee054020e49fd96f149919692848f0b", + "model_dtype": "torch.float16", + "model_size": "24.32 GB" + }, + "results": { + "harness|drop|3": { + "em": 0.37153942953020136, + "em_stderr": 0.004948586020359345, + "f1": 0.4432686661073842, + "f1_stderr": 0.0047496461477472855 + }, + "harness|gsm8k|5": { + "acc": 0.0978013646702047, + "acc_stderr": 0.008182119821849047 + }, + "harness|winogrande|5": { + "acc": 0.745067087608524, + "acc_stderr": 0.012248806969376422 + }, + "all": { + "em": 0.37153942953020136, + "em_stderr": 0.004948586020359345, + "f1": 0.4432686661073842, + "f1_stderr": 0.0047496461477472855, + "acc": 0.4214342261393644, + "acc_stderr": 0.010215463395612735 + } + }, + "versions": { + "all": 0, + "harness|drop|3": 1, + "harness|gsm8k|5": 0, + "harness|winogrande|5": 0 + }, + "config_tasks": { + "harness|drop": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|drop|3": { + "hashes": { + "hash_examples": "1d27416e8324e9a3", + "hash_full_prompts": "a5513ff9a741b385", + "hash_input_tokens": "42076f0efbb50aa6", + "hash_cont_tokens": "e846dfb10312ac80" + }, + "truncated": 3, + "non_truncated": 9533, + "padded": 0, + "non_padded": 9536, + "effective_few_shots": 3.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "bda342e47b5099b2", + "hash_cont_tokens": "6776ddf9c83dc698" + }, + "truncated": 0, + "non_truncated": 1319, + "padded": 0, + "non_padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + 
"hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "c0bedf98cb040854", + "hash_cont_tokens": "f08975ad6f2d5864" + }, + "truncated": 0, + "non_truncated": 1267, + "padded": 2432, + "non_padded": 102, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "9b4d8993161e637d", + "hash_full_prompts": "08215e527b7e60a5", + "hash_input_tokens": "a12f3e3c934bd78b", + "hash_cont_tokens": "73a67a95eb7541d0" + }, + "truncated": 3, + "non_truncated": 12119, + "padded": 2432, + "non_padded": 10957, + "num_truncated_few_shots": 0, + "total_evaluation_time_secondes": 0 + } +} \ No newline at end of file diff --git a/eval-results/Undi95/MLewd-v2.4-13B/results_2023-11-06T15-01-09.022171.json b/eval-results/Undi95/MLewd-v2.4-13B/results_2023-11-06T15-01-09.022171.json new file mode 100644 index 0000000000000000000000000000000000000000..8bbedc1f9182944d8032d1006657ad96e0bcef62 --- /dev/null +++ b/eval-results/Undi95/MLewd-v2.4-13B/results_2023-11-06T15-01-09.022171.json @@ -0,0 +1,107 @@ +{ + "config_general": { + "lighteval_sha": "167773f1d5d1647c60dadc31c9e731ab7dbcbbad", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "", + "model_name": "Undi95/MLewd-v2.4-13B", + "model_sha": "6f6ec6024ee054020e49fd96f149919692848f0b", + "model_dtype": "torch.float16", + "model_size": "24.32 GB" + }, + "results": { + "harness|drop|3": { + "em": 0.37153942953020136, + "em_stderr": 0.004948586020359345, + "f1": 0.4432686661073842, + "f1_stderr": 0.0047496461477472855 + }, + "harness|gsm8k|5": { + "acc": 0.0978013646702047, + "acc_stderr": 0.008182119821849047 + }, + "harness|winogrande|5": { + "acc": 0.745067087608524, + "acc_stderr": 0.012248806969376422 + }, + "all": { + "em": 0.37153942953020136, + "em_stderr": 0.004948586020359345, + "f1": 0.4432686661073842, + "f1_stderr": 0.0047496461477472855, + "acc": 0.4214342261393644, + "acc_stderr": 0.010215463395612735 + } + }, + "versions": { + "all": 0, + "harness|drop|3": 1, + "harness|gsm8k|5": 0, + "harness|winogrande|5": 0 + }, + "config_tasks": { + "harness|drop": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|drop|3": { + "hashes": { + "hash_examples": "1d27416e8324e9a3", + "hash_full_prompts": "a5513ff9a741b385", + "hash_input_tokens": "42076f0efbb50aa6", + "hash_cont_tokens": "e846dfb10312ac80" + }, + "truncated": 3, + "non_truncated": 9533, + "padded": 0, + "non_padded": 9536, + "effective_few_shots": 3.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "bda342e47b5099b2", + "hash_cont_tokens": "6776ddf9c83dc698" + }, + "truncated": 0, + "non_truncated": 1319, + "padded": 0, + "non_padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "c0bedf98cb040854", + "hash_cont_tokens": "f08975ad6f2d5864" + }, + "truncated": 0, + "non_truncated": 1267, + "padded": 2432, + "non_padded": 102, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "9b4d8993161e637d", + "hash_full_prompts": "08215e527b7e60a5", + 
"hash_input_tokens": "a12f3e3c934bd78b", + "hash_cont_tokens": "73a67a95eb7541d0" + }, + "truncated": 3, + "non_truncated": 12119, + "padded": 2432, + "non_padded": 10957, + "num_truncated_few_shots": 0, + "total_evaluation_time_secondes": 0 + } +} \ No newline at end of file diff --git a/eval-results/Undi95/MLewdBoros-L2-13B/results_2023-09-18T13-56-38.282478.json b/eval-results/Undi95/MLewdBoros-L2-13B/results_2023-09-18T13-56-38.282478.json new file mode 100644 index 0000000000000000000000000000000000000000..ffe09f37f77eb6baed218584e974625c621ec615 --- /dev/null +++ b/eval-results/Undi95/MLewdBoros-L2-13B/results_2023-09-18T13-56-38.282478.json @@ -0,0 +1,1367 @@ +{ + "config_general": { + "model_name": "Undi95/MLewdBoros-L2-13B", + "model_sha": "a3033ac5825662f1c66418d7543648dc76980185", + "model_size": "24.32 GB", + "model_dtype": "torch.float16", + "lighteval_sha": "0f318ecf002208468154899217b3ba7c6ae09374", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|arc:challenge|25": { + "acc": 0.5947098976109215, + "acc_stderr": 0.014346869060229327, + "acc_norm": 0.6254266211604096, + "acc_norm_stderr": 0.014144193471893454 + }, + "harness|hellaswag|10": { + "acc": 0.6385182234614618, + "acc_stderr": 0.004794478426382608, + "acc_norm": 0.8389762995419239, + "acc_norm_stderr": 0.003668016360975837 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.34, + "acc_stderr": 0.04760952285695235, + "acc_norm": 0.34, + "acc_norm_stderr": 0.04760952285695235 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.5037037037037037, + "acc_stderr": 0.04319223625811331, + "acc_norm": 0.5037037037037037, + "acc_norm_stderr": 0.04319223625811331 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.5460526315789473, + "acc_stderr": 0.04051646342874142, + "acc_norm": 0.5460526315789473, + "acc_norm_stderr": 0.04051646342874142 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.55, + "acc_stderr": 0.049999999999999996, + "acc_norm": 0.55, + "acc_norm_stderr": 0.049999999999999996 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.6264150943396226, + "acc_stderr": 0.029773082713319875, + "acc_norm": 0.6264150943396226, + "acc_norm_stderr": 0.029773082713319875 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.5902777777777778, + "acc_stderr": 0.04112490974670787, + "acc_norm": 0.5902777777777778, + "acc_norm_stderr": 0.04112490974670787 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.43, + "acc_stderr": 0.04975698519562428, + "acc_norm": 0.43, + "acc_norm_stderr": 0.04975698519562428 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.45, + "acc_stderr": 0.05, + "acc_norm": 0.45, + "acc_norm_stderr": 0.05 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.37, + "acc_stderr": 0.04852365870939099, + "acc_norm": 0.37, + "acc_norm_stderr": 0.04852365870939099 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.5202312138728323, + "acc_stderr": 0.03809342081273957, + "acc_norm": 0.5202312138728323, + "acc_norm_stderr": 0.03809342081273957 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.28431372549019607, + "acc_stderr": 0.04488482852329017, + "acc_norm": 0.28431372549019607, + "acc_norm_stderr": 0.04488482852329017 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.71, + "acc_stderr": 0.045604802157206845, + "acc_norm": 0.71, + "acc_norm_stderr": 0.045604802157206845 
+ }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.43829787234042555, + "acc_stderr": 0.03243618636108102, + "acc_norm": 0.43829787234042555, + "acc_norm_stderr": 0.03243618636108102 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.2894736842105263, + "acc_stderr": 0.04266339443159394, + "acc_norm": 0.2894736842105263, + "acc_norm_stderr": 0.04266339443159394 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.496551724137931, + "acc_stderr": 0.04166567577101579, + "acc_norm": 0.496551724137931, + "acc_norm_stderr": 0.04166567577101579 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.32275132275132273, + "acc_stderr": 0.024078943243597016, + "acc_norm": 0.32275132275132273, + "acc_norm_stderr": 0.024078943243597016 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.36507936507936506, + "acc_stderr": 0.04306241259127153, + "acc_norm": 0.36507936507936506, + "acc_norm_stderr": 0.04306241259127153 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.36, + "acc_stderr": 0.04824181513244218, + "acc_norm": 0.36, + "acc_norm_stderr": 0.04824181513244218 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.667741935483871, + "acc_stderr": 0.026795560848122804, + "acc_norm": 0.667741935483871, + "acc_norm_stderr": 0.026795560848122804 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.4482758620689655, + "acc_stderr": 0.034991131376767445, + "acc_norm": 0.4482758620689655, + "acc_norm_stderr": 0.034991131376767445 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.57, + "acc_stderr": 0.04975698519562428, + "acc_norm": 0.57, + "acc_norm_stderr": 0.04975698519562428 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.6787878787878788, + "acc_stderr": 0.0364620496325381, + "acc_norm": 0.6787878787878788, + "acc_norm_stderr": 0.0364620496325381 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.7222222222222222, + "acc_stderr": 0.03191178226713545, + "acc_norm": 0.7222222222222222, + "acc_norm_stderr": 0.03191178226713545 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.8134715025906736, + "acc_stderr": 0.028112091210117467, + "acc_norm": 0.8134715025906736, + "acc_norm_stderr": 0.028112091210117467 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.5153846153846153, + "acc_stderr": 0.025339003010106515, + "acc_norm": 0.5153846153846153, + "acc_norm_stderr": 0.025339003010106515 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.28888888888888886, + "acc_stderr": 0.027634907264178544, + "acc_norm": 0.28888888888888886, + "acc_norm_stderr": 0.027634907264178544 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.5882352941176471, + "acc_stderr": 0.03196876989195778, + "acc_norm": 0.5882352941176471, + "acc_norm_stderr": 0.03196876989195778 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.304635761589404, + "acc_stderr": 0.03757949922943342, + "acc_norm": 0.304635761589404, + "acc_norm_stderr": 0.03757949922943342 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.7376146788990826, + "acc_stderr": 0.018861885021534734, + "acc_norm": 0.7376146788990826, + "acc_norm_stderr": 0.018861885021534734 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.41203703703703703, + "acc_stderr": 0.03356787758160835, + "acc_norm": 0.41203703703703703, + "acc_norm_stderr": 0.03356787758160835 + 
}, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.7598039215686274, + "acc_stderr": 0.02998373305591361, + "acc_norm": 0.7598039215686274, + "acc_norm_stderr": 0.02998373305591361 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.7468354430379747, + "acc_stderr": 0.028304657943035303, + "acc_norm": 0.7468354430379747, + "acc_norm_stderr": 0.028304657943035303 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.6771300448430493, + "acc_stderr": 0.03138147637575499, + "acc_norm": 0.6771300448430493, + "acc_norm_stderr": 0.03138147637575499 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.648854961832061, + "acc_stderr": 0.04186445163013751, + "acc_norm": 0.648854961832061, + "acc_norm_stderr": 0.04186445163013751 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.743801652892562, + "acc_stderr": 0.03984979653302873, + "acc_norm": 0.743801652892562, + "acc_norm_stderr": 0.03984979653302873 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.7314814814814815, + "acc_stderr": 0.042844679680521934, + "acc_norm": 0.7314814814814815, + "acc_norm_stderr": 0.042844679680521934 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.6809815950920245, + "acc_stderr": 0.03661997551073836, + "acc_norm": 0.6809815950920245, + "acc_norm_stderr": 0.03661997551073836 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.32142857142857145, + "acc_stderr": 0.044328040552915185, + "acc_norm": 0.32142857142857145, + "acc_norm_stderr": 0.044328040552915185 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.7281553398058253, + "acc_stderr": 0.044052680241409216, + "acc_norm": 0.7281553398058253, + "acc_norm_stderr": 0.044052680241409216 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.811965811965812, + "acc_stderr": 0.025598193686652258, + "acc_norm": 0.811965811965812, + "acc_norm_stderr": 0.025598193686652258 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.6, + "acc_stderr": 0.04923659639173309, + "acc_norm": 0.6, + "acc_norm_stderr": 0.04923659639173309 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.7611749680715197, + "acc_stderr": 0.015246803197398675, + "acc_norm": 0.7611749680715197, + "acc_norm_stderr": 0.015246803197398675 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.638728323699422, + "acc_stderr": 0.025862201852277902, + "acc_norm": 0.638728323699422, + "acc_norm_stderr": 0.025862201852277902 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.43687150837988825, + "acc_stderr": 0.01658868086453062, + "acc_norm": 0.43687150837988825, + "acc_norm_stderr": 0.01658868086453062 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.6209150326797386, + "acc_stderr": 0.027780141207023344, + "acc_norm": 0.6209150326797386, + "acc_norm_stderr": 0.027780141207023344 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.6495176848874598, + "acc_stderr": 0.027098652621301754, + "acc_norm": 0.6495176848874598, + "acc_norm_stderr": 0.027098652621301754 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.6327160493827161, + "acc_stderr": 0.02682280175950789, + "acc_norm": 0.6327160493827161, + "acc_norm_stderr": 0.02682280175950789 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.42907801418439717, + "acc_stderr": 0.029525914302558555, + "acc_norm": 0.42907801418439717, + "acc_norm_stderr": 0.029525914302558555 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.4276401564537158, + "acc_stderr": 
0.012635799922765844, + "acc_norm": 0.4276401564537158, + "acc_norm_stderr": 0.012635799922765844 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.5551470588235294, + "acc_stderr": 0.03018753206032938, + "acc_norm": 0.5551470588235294, + "acc_norm_stderr": 0.03018753206032938 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.5751633986928104, + "acc_stderr": 0.01999797303545833, + "acc_norm": 0.5751633986928104, + "acc_norm_stderr": 0.01999797303545833 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.6363636363636364, + "acc_stderr": 0.04607582090719976, + "acc_norm": 0.6363636363636364, + "acc_norm_stderr": 0.04607582090719976 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.6408163265306123, + "acc_stderr": 0.03071356045510849, + "acc_norm": 0.6408163265306123, + "acc_norm_stderr": 0.03071356045510849 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.7512437810945274, + "acc_stderr": 0.030567675938916707, + "acc_norm": 0.7512437810945274, + "acc_norm_stderr": 0.030567675938916707 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.86, + "acc_stderr": 0.0348735088019777, + "acc_norm": 0.86, + "acc_norm_stderr": 0.0348735088019777 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.46987951807228917, + "acc_stderr": 0.03885425420866766, + "acc_norm": 0.46987951807228917, + "acc_norm_stderr": 0.03885425420866766 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.7660818713450293, + "acc_stderr": 0.03246721765117826, + "acc_norm": 0.7660818713450293, + "acc_norm_stderr": 0.03246721765117826 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.3378212974296206, + "mc1_stderr": 0.01655716732251688, + "mc2": 0.48136107027773045, + "mc2_stderr": 0.015082983111012829 + }, + "all": { + "acc": 0.5673804756955018, + "acc_stderr": 0.03428328895486213, + "acc_norm": 0.5712986926383822, + "acc_norm_stderr": 0.03426076119801903, + "mc1": 0.3378212974296206, + "mc1_stderr": 0.01655716732251688, + "mc2": 0.48136107027773045, + "mc2_stderr": 0.015082983111012829 + } + }, + "versions": { + "harness|arc:challenge|25": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + 
"harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "all": 0 + }, + "config_tasks": { + "harness|arc:challenge": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + 
"harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task" + }, + "summary_tasks": { + "harness|arc:challenge|25": { + "hashes": { + "hash_examples": "17b0cae357c0259e", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "3722289b79076c44", + "hash_cont_tokens": "e8abf848493b50f7" + }, + "truncated": 0, + "non-truncated": 4687, + "padded": 4687, + "non-padded": 0, + "effective_few_shots": 25.0, + "num_truncated_few_shots": 0 + }, + "harness|hellaswag|10": { + "hashes": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "ececd684171f1ef2", + "hash_cont_tokens": "9fe0a5c42e1532db" + }, + "truncated": 0, + "non-truncated": 40168, + "padded": 40113, + "non-padded": 55, + "effective_few_shots": 10.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hashes": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "c54ff61ad0273dd7", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-anatomy|5": { + "hashes": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "be31a1e22aef5f90", + "hash_cont_tokens": "f11971a765cb609f" + }, + "truncated": 0, + "non-truncated": 540, + "padded": 540, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-astronomy|5": { + "hashes": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "277a7b1fad566940", + 
"hash_cont_tokens": "440a970fadecdc7b" + }, + "truncated": 0, + "non-truncated": 608, + "padded": 608, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-business_ethics|5": { + "hashes": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "ba552605bc116de5", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hashes": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "428c7563d0b98ab9", + "hash_cont_tokens": "7ecd60c25b9bfe5b" + }, + "truncated": 0, + "non-truncated": 1060, + "padded": 1060, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_biology|5": { + "hashes": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "da036601573942e2", + "hash_cont_tokens": "875cde3af7a0ee14" + }, + "truncated": 0, + "non-truncated": 576, + "padded": 576, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_chemistry|5": { + "hashes": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "94e0196d6aded13d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_computer_science|5": { + "hashes": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "6e4d0f4a8d36690b", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_mathematics|5": { + "hashes": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "614054d17109a25d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_medicine|5": { + "hashes": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "081bb2b524defd1c", + "hash_cont_tokens": "702fb6d82ff0d6ac" + }, + "truncated": 0, + "non-truncated": 692, + "padded": 692, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_physics|5": { + "hashes": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "5421d9a1af86cbd4", + "hash_cont_tokens": "f7b8097afc16a47c" + }, + "truncated": 0, + "non-truncated": 408, + "padded": 408, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-computer_security|5": { + "hashes": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "5e6b70ecb333cf18", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 
5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hashes": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "c2ef11a87264ceed", + "hash_cont_tokens": "aa0e8bc655f2f641" + }, + "truncated": 0, + "non-truncated": 940, + "padded": 940, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-econometrics|5": { + "hashes": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "ecaccd912a4c3978", + "hash_cont_tokens": "b1cc6e7e9fcd3827" + }, + "truncated": 0, + "non-truncated": 456, + "padded": 456, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hashes": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "1590c84291399be8", + "hash_cont_tokens": "2425a3f084a591ef" + }, + "truncated": 0, + "non-truncated": 580, + "padded": 580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hashes": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "3269597f715b0da1", + "hash_cont_tokens": "bd87bf0c060fd925" + }, + "truncated": 0, + "non-truncated": 1512, + "padded": 1512, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-formal_logic|5": { + "hashes": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "a2800d20f3ab8d7c", + "hash_cont_tokens": "eb8932890e0605db" + }, + "truncated": 0, + "non-truncated": 504, + "padded": 504, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-global_facts|5": { + "hashes": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "94ed44b3772505ad", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_biology|5": { + "hashes": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "24423acb928db768", + "hash_cont_tokens": "1ddcb86d28cde266" + }, + "truncated": 0, + "non-truncated": 1240, + "padded": 1240, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hashes": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "831ff35c474e5cef", + "hash_cont_tokens": "176c8dcff38c5f8f" + }, + "truncated": 0, + "non-truncated": 812, + "padded": 812, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hashes": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "a20a96b44dcc5b30", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hashes": { + "hash_examples": 
"11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "5002f4ac8b1562ca", + "hash_cont_tokens": "674fc454bdc5ac93" + }, + "truncated": 0, + "non-truncated": 660, + "padded": 656, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_geography|5": { + "hashes": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "7c5547c7da5bc793", + "hash_cont_tokens": "03a5012b916274ea" + }, + "truncated": 0, + "non-truncated": 792, + "padded": 792, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hashes": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "f62991cb6a496b05", + "hash_cont_tokens": "873d2aab226ba1d8" + }, + "truncated": 0, + "non-truncated": 772, + "padded": 772, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hashes": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "4cef2aff6e3d59ed", + "hash_cont_tokens": "c583432ad27fcfe0" + }, + "truncated": 0, + "non-truncated": 1560, + "padded": 1560, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hashes": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "6e2577ea4082ed2b", + "hash_cont_tokens": "d7907b61bcb8c123" + }, + "truncated": 0, + "non-truncated": 1080, + "padded": 1080, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hashes": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "c5fc9aeb1079c8e4", + "hash_cont_tokens": "f47f041de50333b9" + }, + "truncated": 0, + "non-truncated": 952, + "padded": 952, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_physics|5": { + "hashes": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "555fc385cffa84ca", + "hash_cont_tokens": "0d56317b3e5eedb5" + }, + "truncated": 0, + "non-truncated": 604, + "padded": 604, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hashes": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "febd23cbf9973b7f", + "hash_cont_tokens": "09ba1243e7390c0f" + }, + "truncated": 0, + "non-truncated": 2180, + "padded": 2180, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hashes": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "400e55b56ee6fbd7", + "hash_cont_tokens": "9cc29889c3d3f77d" + }, + "truncated": 0, + "non-truncated": 864, + "padded": 864, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hashes": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + 
"hash_input_tokens": "c639cce12a46ebad", + "hash_cont_tokens": "cdd0b3dc06d933e5" + }, + "truncated": 0, + "non-truncated": 816, + "padded": 816, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hashes": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "b9762065cce6f3a6", + "hash_cont_tokens": "e02816433ff28daf" + }, + "truncated": 0, + "non-truncated": 948, + "padded": 948, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_aging|5": { + "hashes": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "541a75f071dcf579", + "hash_cont_tokens": "142a4a8a1138a214" + }, + "truncated": 0, + "non-truncated": 892, + "padded": 892, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_sexuality|5": { + "hashes": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "04269e5c5a257dd9", + "hash_cont_tokens": "bc54813e809b796d" + }, + "truncated": 0, + "non-truncated": 524, + "padded": 524, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-international_law|5": { + "hashes": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "d93ba9d9d38e4397", + "hash_cont_tokens": "8ea8c5ff76a15bca" + }, + "truncated": 0, + "non-truncated": 484, + "padded": 484, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-jurisprudence|5": { + "hashes": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "9eeaccd2698b4f5a", + "hash_cont_tokens": "e3a8cd951b6e3469" + }, + "truncated": 0, + "non-truncated": 432, + "padded": 432, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hashes": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "b4f08f544f2b7576", + "hash_cont_tokens": "3e9e0bdc248fd88a" + }, + "truncated": 0, + "non-truncated": 652, + "padded": 648, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-machine_learning|5": { + "hashes": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "900c2a51f1174b9f", + "hash_cont_tokens": "55b12fb138c6a064" + }, + "truncated": 0, + "non-truncated": 448, + "padded": 448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-management|5": { + "hashes": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "6b36efb4689c6eca", + "hash_cont_tokens": "a01d6d39a83c4597" + }, + "truncated": 0, + "non-truncated": 412, + "padded": 412, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-marketing|5": { + "hashes": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "2aaac78a0cfed47a", + "hash_cont_tokens": "6aeaed4d823c98aa" + }, + "truncated": 0, + "non-truncated": 936, + "padded": 936, + "non-padded": 0, + 
"effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-medical_genetics|5": { + "hashes": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "886ca823b41c094a", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-miscellaneous|5": { + "hashes": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "72fd71de7675e7d0", + "hash_cont_tokens": "9b0ab02a64603081" + }, + "truncated": 0, + "non-truncated": 3132, + "padded": 3132, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_disputes|5": { + "hashes": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "f3ca0dd8e7a1eb09", + "hash_cont_tokens": "3b8bbe9108e55ce9" + }, + "truncated": 0, + "non-truncated": 1384, + "padded": 1354, + "non-padded": 30, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hashes": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "3e793631e951f23c", + "hash_cont_tokens": "3e9bfc0362e97330" + }, + "truncated": 0, + "non-truncated": 3580, + "padded": 3580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-nutrition|5": { + "hashes": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "59753c2144ea93af", + "hash_cont_tokens": "23b2dc6ee2da4cfc" + }, + "truncated": 0, + "non-truncated": 1224, + "padded": 1224, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-philosophy|5": { + "hashes": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "bd8d3dbed15a8c34", + "hash_cont_tokens": "9f6ff69d23a48783" + }, + "truncated": 0, + "non-truncated": 1244, + "padded": 1244, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-prehistory|5": { + "hashes": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "3573cd87facbb7c5", + "hash_cont_tokens": "d6458d743d875837" + }, + "truncated": 0, + "non-truncated": 1296, + "padded": 1296, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_accounting|5": { + "hashes": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "17e721bc1a7cbb47", + "hash_cont_tokens": "922a195f53a35662" + }, + "truncated": 0, + "non-truncated": 1128, + "padded": 1128, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_law|5": { + "hashes": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "c9f7583fff66d361", + "hash_cont_tokens": "2e590029ef41fbcd" + }, + "truncated": 0, + "non-truncated": 6136, + "padded": 6136, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_medicine|5": { + "hashes": { + "hash_examples": 
"b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "40a933f829116f8d", + "hash_cont_tokens": "7cfee54dbddd5a98" + }, + "truncated": 0, + "non-truncated": 1088, + "padded": 1088, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_psychology|5": { + "hashes": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "0dfb73a8eb3f692c", + "hash_cont_tokens": "a86677b2a45c20e1" + }, + "truncated": 0, + "non-truncated": 2448, + "padded": 2448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-public_relations|5": { + "hashes": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "1710c6ba4c9f3cbd", + "hash_cont_tokens": "0d756ccaae031757" + }, + "truncated": 0, + "non-truncated": 440, + "padded": 440, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-security_studies|5": { + "hashes": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "32a03f1f22a6e103", + "hash_cont_tokens": "b2229bc2cfbf594b" + }, + "truncated": 0, + "non-truncated": 980, + "padded": 980, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-sociology|5": { + "hashes": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "828999f7624cbe7e", + "hash_cont_tokens": "c3a3bdfd177eed5b" + }, + "truncated": 0, + "non-truncated": 804, + "padded": 804, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hashes": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "42054621e718dbee", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-virology|5": { + "hashes": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "6c4f0aa4dc859c04", + "hash_cont_tokens": "af8b3658088cb37f" + }, + "truncated": 0, + "non-truncated": 664, + "padded": 664, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-world_religions|5": { + "hashes": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "6c75d44e092ff24f", + "hash_cont_tokens": "060118bef6de4e0a" + }, + "truncated": 0, + "non-truncated": 684, + "padded": 684, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|truthfulqa:mc|0": { + "hashes": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "2738d7ed7075faa7", + "hash_cont_tokens": "f5da56a132aab151" + }, + "truncated": 0, + "non-truncated": 9996, + "padded": 9996, + "non-padded": 0, + "effective_few_shots": 0.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "d84d18e9a963753d", + "hash_full_prompts": "12b540783521a8e6", + "hash_input_tokens": "5c73a7dce6ccf737", + "hash_cont_tokens": "71d56183130fecbd" + }, + "total_evaluation_time_secondes": 
"6454.189316987991", + "truncated": 0, + "non-truncated": 111019, + "padded": 110926, + "non-padded": 93, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/Undi95/MLewdBoros-L2-13B/results_2023-10-28T22-12-00.775103.json b/eval-results/Undi95/MLewdBoros-L2-13B/results_2023-10-28T22-12-00.775103.json new file mode 100644 index 0000000000000000000000000000000000000000..490672ea9caf3845e03e1e283ba0188248497501 --- /dev/null +++ b/eval-results/Undi95/MLewdBoros-L2-13B/results_2023-10-28T22-12-00.775103.json @@ -0,0 +1,107 @@ +{ + "config_general": { + "model_name": "Undi95/MLewdBoros-L2-13B", + "model_sha": "a3033ac5825662f1c66418d7543648dc76980185", + "model_size": "24.32 GB", + "model_dtype": "torch.float16", + "lighteval_sha": "0f318ecf002208468154899217b3ba7c6ae09374", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|drop|3": { + "em": 0.41820469798657717, + "em_stderr": 0.005051486654118123, + "f1": 0.4659270134228202, + "f1_stderr": 0.0048870842597281815 + }, + "harness|gsm8k|5": { + "acc": 0.10993176648976498, + "acc_stderr": 0.008616195587865394 + }, + "harness|winogrande|5": { + "acc": 0.7695343330702447, + "acc_stderr": 0.01183587216483668 + }, + "all": { + "em": 0.41820469798657717, + "em_stderr": 0.005051486654118123, + "f1": 0.4659270134228202, + "f1_stderr": 0.0048870842597281815, + "acc": 0.4397330497800048, + "acc_stderr": 0.010226033876351036 + } + }, + "versions": { + "harness|drop|3": 1, + "harness|gsm8k|5": 0, + "harness|winogrande|5": 0, + "all": 0 + }, + "config_tasks": { + "harness|drop": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|drop|3": { + "hashes": { + "hash_examples": "1d27416e8324e9a3", + "hash_full_prompts": "a5513ff9a741b385", + "hash_input_tokens": "42076f0efbb50aa6", + "hash_cont_tokens": "410a854246142b30" + }, + "truncated": 3, + "non-truncated": 9533, + "padded": 0, + "non-padded": 9536, + "effective_few_shots": 3.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "bda342e47b5099b2", + "hash_cont_tokens": "ca11d769852acfeb" + }, + "truncated": 0, + "non-truncated": 1319, + "padded": 0, + "non-padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "c0bedf98cb040854", + "hash_cont_tokens": "f08975ad6f2d5864" + }, + "truncated": 0, + "non-truncated": 2534, + "padded": 2432, + "non-padded": 102, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "9b4d8993161e637d", + "hash_full_prompts": "08215e527b7e60a5", + "hash_input_tokens": "a12f3e3c934bd78b", + "hash_cont_tokens": "970c4e26856e7f53" + }, + "total_evaluation_time_secondes": "26787.64310026169", + "truncated": 3, + "non-truncated": 13386, + "padded": 2432, + "non-padded": 10957, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/Undi95/MM-ReMM-L2-20B/results_2023-10-03T17-39-23.702108.json b/eval-results/Undi95/MM-ReMM-L2-20B/results_2023-10-03T17-39-23.702108.json new file mode 100644 index 
0000000000000000000000000000000000000000..d1ab2af2e3b5dd5dfb582ca3ee4c5fac463224d5 --- /dev/null +++ b/eval-results/Undi95/MM-ReMM-L2-20B/results_2023-10-03T17-39-23.702108.json @@ -0,0 +1,1367 @@ +{ + "config_general": { + "model_name": "Undi95/MM-ReMM-L2-20B", + "model_sha": "37869800c15fb37d017ea83bb50fec6d6141f6ba", + "model_size": "37.36 GB", + "model_dtype": "torch.float16", + "lighteval_sha": "0f318ecf002208468154899217b3ba7c6ae09374", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|arc:challenge|25": { + "acc": 0.5878839590443686, + "acc_stderr": 0.014383915302225403, + "acc_norm": 0.6083617747440273, + "acc_norm_stderr": 0.014264122124938215 + }, + "harness|hellaswag|10": { + "acc": 0.6595299741087433, + "acc_stderr": 0.004728988167338544, + "acc_norm": 0.851822346146186, + "acc_norm_stderr": 0.0035454991695580518 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.35, + "acc_stderr": 0.04793724854411022, + "acc_norm": 0.35, + "acc_norm_stderr": 0.04793724854411022 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.4888888888888889, + "acc_stderr": 0.04318275491977976, + "acc_norm": 0.4888888888888889, + "acc_norm_stderr": 0.04318275491977976 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.5460526315789473, + "acc_stderr": 0.04051646342874142, + "acc_norm": 0.5460526315789473, + "acc_norm_stderr": 0.04051646342874142 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.56, + "acc_stderr": 0.04988876515698589, + "acc_norm": 0.56, + "acc_norm_stderr": 0.04988876515698589 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.5886792452830188, + "acc_stderr": 0.03028500925900979, + "acc_norm": 0.5886792452830188, + "acc_norm_stderr": 0.03028500925900979 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.6458333333333334, + "acc_stderr": 0.039994111357535424, + "acc_norm": 0.6458333333333334, + "acc_norm_stderr": 0.039994111357535424 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.39, + "acc_stderr": 0.04902071300001975, + "acc_norm": 0.39, + "acc_norm_stderr": 0.04902071300001975 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.45, + "acc_stderr": 0.04999999999999999, + "acc_norm": 0.45, + "acc_norm_stderr": 0.04999999999999999 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.33, + "acc_stderr": 0.047258156262526045, + "acc_norm": 0.33, + "acc_norm_stderr": 0.047258156262526045 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.5144508670520231, + "acc_stderr": 0.03810871630454764, + "acc_norm": 0.5144508670520231, + "acc_norm_stderr": 0.03810871630454764 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.3333333333333333, + "acc_stderr": 0.04690650298201942, + "acc_norm": 0.3333333333333333, + "acc_norm_stderr": 0.04690650298201942 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.67, + "acc_stderr": 0.047258156262526094, + "acc_norm": 0.67, + "acc_norm_stderr": 0.047258156262526094 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.4723404255319149, + "acc_stderr": 0.03263597118409769, + "acc_norm": 0.4723404255319149, + "acc_norm_stderr": 0.03263597118409769 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.22807017543859648, + "acc_stderr": 0.03947152782669415, + "acc_norm": 0.22807017543859648, + "acc_norm_stderr": 0.03947152782669415 + }, + "harness|hendrycksTest-electrical_engineering|5": { + 
"acc": 0.5310344827586206, + "acc_stderr": 0.04158632762097828, + "acc_norm": 0.5310344827586206, + "acc_norm_stderr": 0.04158632762097828 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.31746031746031744, + "acc_stderr": 0.02397386199899208, + "acc_norm": 0.31746031746031744, + "acc_norm_stderr": 0.02397386199899208 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.3333333333333333, + "acc_stderr": 0.042163702135578345, + "acc_norm": 0.3333333333333333, + "acc_norm_stderr": 0.042163702135578345 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.36, + "acc_stderr": 0.04824181513244218, + "acc_norm": 0.36, + "acc_norm_stderr": 0.04824181513244218 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.6580645161290323, + "acc_stderr": 0.02698528957655274, + "acc_norm": 0.6580645161290323, + "acc_norm_stderr": 0.02698528957655274 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.42857142857142855, + "acc_stderr": 0.03481904844438804, + "acc_norm": 0.42857142857142855, + "acc_norm_stderr": 0.03481904844438804 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.56, + "acc_stderr": 0.04988876515698589, + "acc_norm": 0.56, + "acc_norm_stderr": 0.04988876515698589 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.6909090909090909, + "acc_stderr": 0.036085410115739666, + "acc_norm": 0.6909090909090909, + "acc_norm_stderr": 0.036085410115739666 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.7272727272727273, + "acc_stderr": 0.03173071239071724, + "acc_norm": 0.7272727272727273, + "acc_norm_stderr": 0.03173071239071724 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.8134715025906736, + "acc_stderr": 0.028112091210117478, + "acc_norm": 0.8134715025906736, + "acc_norm_stderr": 0.028112091210117478 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.558974358974359, + "acc_stderr": 0.025174048384000745, + "acc_norm": 0.558974358974359, + "acc_norm_stderr": 0.025174048384000745 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.32222222222222224, + "acc_stderr": 0.028493465091028604, + "acc_norm": 0.32222222222222224, + "acc_norm_stderr": 0.028493465091028604 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.6218487394957983, + "acc_stderr": 0.031499305777849054, + "acc_norm": 0.6218487394957983, + "acc_norm_stderr": 0.031499305777849054 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.3509933774834437, + "acc_stderr": 0.03896981964257375, + "acc_norm": 0.3509933774834437, + "acc_norm_stderr": 0.03896981964257375 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.7302752293577982, + "acc_stderr": 0.01902848671111544, + "acc_norm": 0.7302752293577982, + "acc_norm_stderr": 0.01902848671111544 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.46296296296296297, + "acc_stderr": 0.03400603625538271, + "acc_norm": 0.46296296296296297, + "acc_norm_stderr": 0.03400603625538271 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.7843137254901961, + "acc_stderr": 0.028867431449849313, + "acc_norm": 0.7843137254901961, + "acc_norm_stderr": 0.028867431449849313 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.7468354430379747, + "acc_stderr": 0.0283046579430353, + "acc_norm": 0.7468354430379747, + "acc_norm_stderr": 0.0283046579430353 + }, + 
"harness|hendrycksTest-human_aging|5": { + "acc": 0.6636771300448431, + "acc_stderr": 0.031708824268455, + "acc_norm": 0.6636771300448431, + "acc_norm_stderr": 0.031708824268455 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.648854961832061, + "acc_stderr": 0.04186445163013751, + "acc_norm": 0.648854961832061, + "acc_norm_stderr": 0.04186445163013751 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.7355371900826446, + "acc_stderr": 0.040261875275912073, + "acc_norm": 0.7355371900826446, + "acc_norm_stderr": 0.040261875275912073 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.6944444444444444, + "acc_stderr": 0.04453197507374983, + "acc_norm": 0.6944444444444444, + "acc_norm_stderr": 0.04453197507374983 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.6748466257668712, + "acc_stderr": 0.036803503712864616, + "acc_norm": 0.6748466257668712, + "acc_norm_stderr": 0.036803503712864616 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.30357142857142855, + "acc_stderr": 0.04364226155841044, + "acc_norm": 0.30357142857142855, + "acc_norm_stderr": 0.04364226155841044 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.6601941747572816, + "acc_stderr": 0.046897659372781335, + "acc_norm": 0.6601941747572816, + "acc_norm_stderr": 0.046897659372781335 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.8290598290598291, + "acc_stderr": 0.024662496845209814, + "acc_norm": 0.8290598290598291, + "acc_norm_stderr": 0.024662496845209814 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.58, + "acc_stderr": 0.049604496374885836, + "acc_norm": 0.58, + "acc_norm_stderr": 0.049604496374885836 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.7535121328224776, + "acc_stderr": 0.015411308769686933, + "acc_norm": 0.7535121328224776, + "acc_norm_stderr": 0.015411308769686933 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.6560693641618497, + "acc_stderr": 0.02557412378654667, + "acc_norm": 0.6560693641618497, + "acc_norm_stderr": 0.02557412378654667 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.3776536312849162, + "acc_stderr": 0.016214148752136632, + "acc_norm": 0.3776536312849162, + "acc_norm_stderr": 0.016214148752136632 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.6470588235294118, + "acc_stderr": 0.02736359328468496, + "acc_norm": 0.6470588235294118, + "acc_norm_stderr": 0.02736359328468496 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.6688102893890675, + "acc_stderr": 0.02673062072800491, + "acc_norm": 0.6688102893890675, + "acc_norm_stderr": 0.02673062072800491 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.6790123456790124, + "acc_stderr": 0.02597656601086274, + "acc_norm": 0.6790123456790124, + "acc_norm_stderr": 0.02597656601086274 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.4397163120567376, + "acc_stderr": 0.029609912075594106, + "acc_norm": 0.4397163120567376, + "acc_norm_stderr": 0.029609912075594106 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.45697522816166886, + "acc_stderr": 0.012722869501611419, + "acc_norm": 0.45697522816166886, + "acc_norm_stderr": 0.012722869501611419 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.5551470588235294, + "acc_stderr": 0.030187532060329383, + "acc_norm": 0.5551470588235294, + "acc_norm_stderr": 0.030187532060329383 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.5882352941176471, + "acc_stderr": 
0.019910377463105932, + "acc_norm": 0.5882352941176471, + "acc_norm_stderr": 0.019910377463105932 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.6181818181818182, + "acc_stderr": 0.046534298079135075, + "acc_norm": 0.6181818181818182, + "acc_norm_stderr": 0.046534298079135075 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.6408163265306123, + "acc_stderr": 0.030713560455108493, + "acc_norm": 0.6408163265306123, + "acc_norm_stderr": 0.030713560455108493 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.7213930348258707, + "acc_stderr": 0.031700561834973086, + "acc_norm": 0.7213930348258707, + "acc_norm_stderr": 0.031700561834973086 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.85, + "acc_stderr": 0.0358870281282637, + "acc_norm": 0.85, + "acc_norm_stderr": 0.0358870281282637 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.40963855421686746, + "acc_stderr": 0.03828401115079023, + "acc_norm": 0.40963855421686746, + "acc_norm_stderr": 0.03828401115079023 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.7602339181286549, + "acc_stderr": 0.03274485211946956, + "acc_norm": 0.7602339181286549, + "acc_norm_stderr": 0.03274485211946956 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.3708690330477356, + "mc1_stderr": 0.016909693580248825, + "mc2": 0.5333434257017081, + "mc2_stderr": 0.015907207649223338 + }, + "all": { + "acc": 0.5665465459002151, + "acc_stderr": 0.034322884462850756, + "acc_norm": 0.5701528202686406, + "acc_norm_stderr": 0.034300794934459776, + "mc1": 0.3708690330477356, + "mc1_stderr": 0.016909693580248825, + "mc2": 0.5333434257017081, + "mc2_stderr": 0.015907207649223338 + } + }, + "versions": { + "harness|arc:challenge|25": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + 
"harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "all": 0 + }, + "config_tasks": { + "harness|arc:challenge": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness 
task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task" + }, + "summary_tasks": { + "harness|arc:challenge|25": { + "hashes": { + "hash_examples": "17b0cae357c0259e", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "3722289b79076c44", + "hash_cont_tokens": "e8abf848493b50f7" + }, + "truncated": 0, + "non-truncated": 4687, + "padded": 4687, + "non-padded": 0, + "effective_few_shots": 25.0, + "num_truncated_few_shots": 0 + }, + "harness|hellaswag|10": { + "hashes": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "ececd684171f1ef2", + "hash_cont_tokens": "9fe0a5c42e1532db" + }, + "truncated": 0, + "non-truncated": 40168, + "padded": 40113, + "non-padded": 55, + "effective_few_shots": 10.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hashes": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "c54ff61ad0273dd7", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-anatomy|5": { + "hashes": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "be31a1e22aef5f90", + "hash_cont_tokens": "f11971a765cb609f" + }, + "truncated": 0, + "non-truncated": 540, + "padded": 540, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-astronomy|5": { + "hashes": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "277a7b1fad566940", + "hash_cont_tokens": "440a970fadecdc7b" + }, + "truncated": 0, + "non-truncated": 608, + "padded": 608, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-business_ethics|5": { + "hashes": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "ba552605bc116de5", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + 
"padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hashes": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "428c7563d0b98ab9", + "hash_cont_tokens": "7ecd60c25b9bfe5b" + }, + "truncated": 0, + "non-truncated": 1060, + "padded": 1060, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_biology|5": { + "hashes": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "da036601573942e2", + "hash_cont_tokens": "875cde3af7a0ee14" + }, + "truncated": 0, + "non-truncated": 576, + "padded": 576, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_chemistry|5": { + "hashes": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "94e0196d6aded13d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_computer_science|5": { + "hashes": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "6e4d0f4a8d36690b", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_mathematics|5": { + "hashes": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "614054d17109a25d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_medicine|5": { + "hashes": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "081bb2b524defd1c", + "hash_cont_tokens": "702fb6d82ff0d6ac" + }, + "truncated": 0, + "non-truncated": 692, + "padded": 692, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_physics|5": { + "hashes": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "5421d9a1af86cbd4", + "hash_cont_tokens": "f7b8097afc16a47c" + }, + "truncated": 0, + "non-truncated": 408, + "padded": 408, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-computer_security|5": { + "hashes": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "5e6b70ecb333cf18", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hashes": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "c2ef11a87264ceed", + "hash_cont_tokens": "aa0e8bc655f2f641" + }, + "truncated": 0, + "non-truncated": 940, + "padded": 940, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-econometrics|5": { 
+ "hashes": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "ecaccd912a4c3978", + "hash_cont_tokens": "b1cc6e7e9fcd3827" + }, + "truncated": 0, + "non-truncated": 456, + "padded": 456, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hashes": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "1590c84291399be8", + "hash_cont_tokens": "2425a3f084a591ef" + }, + "truncated": 0, + "non-truncated": 580, + "padded": 580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hashes": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "3269597f715b0da1", + "hash_cont_tokens": "bd87bf0c060fd925" + }, + "truncated": 0, + "non-truncated": 1512, + "padded": 1512, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-formal_logic|5": { + "hashes": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "a2800d20f3ab8d7c", + "hash_cont_tokens": "eb8932890e0605db" + }, + "truncated": 0, + "non-truncated": 504, + "padded": 504, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-global_facts|5": { + "hashes": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "94ed44b3772505ad", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_biology|5": { + "hashes": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "24423acb928db768", + "hash_cont_tokens": "1ddcb86d28cde266" + }, + "truncated": 0, + "non-truncated": 1240, + "padded": 1240, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hashes": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "831ff35c474e5cef", + "hash_cont_tokens": "176c8dcff38c5f8f" + }, + "truncated": 0, + "non-truncated": 812, + "padded": 812, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hashes": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "a20a96b44dcc5b30", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hashes": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "5002f4ac8b1562ca", + "hash_cont_tokens": "674fc454bdc5ac93" + }, + "truncated": 0, + "non-truncated": 660, + "padded": 656, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_geography|5": { + "hashes": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + 
"hash_input_tokens": "7c5547c7da5bc793", + "hash_cont_tokens": "03a5012b916274ea" + }, + "truncated": 0, + "non-truncated": 792, + "padded": 792, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hashes": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "f62991cb6a496b05", + "hash_cont_tokens": "873d2aab226ba1d8" + }, + "truncated": 0, + "non-truncated": 772, + "padded": 772, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hashes": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "4cef2aff6e3d59ed", + "hash_cont_tokens": "c583432ad27fcfe0" + }, + "truncated": 0, + "non-truncated": 1560, + "padded": 1560, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hashes": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "6e2577ea4082ed2b", + "hash_cont_tokens": "d7907b61bcb8c123" + }, + "truncated": 0, + "non-truncated": 1080, + "padded": 1080, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hashes": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "c5fc9aeb1079c8e4", + "hash_cont_tokens": "f47f041de50333b9" + }, + "truncated": 0, + "non-truncated": 952, + "padded": 952, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_physics|5": { + "hashes": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "555fc385cffa84ca", + "hash_cont_tokens": "0d56317b3e5eedb5" + }, + "truncated": 0, + "non-truncated": 604, + "padded": 604, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hashes": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "febd23cbf9973b7f", + "hash_cont_tokens": "09ba1243e7390c0f" + }, + "truncated": 0, + "non-truncated": 2180, + "padded": 2180, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hashes": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "400e55b56ee6fbd7", + "hash_cont_tokens": "9cc29889c3d3f77d" + }, + "truncated": 0, + "non-truncated": 864, + "padded": 864, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hashes": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "c639cce12a46ebad", + "hash_cont_tokens": "cdd0b3dc06d933e5" + }, + "truncated": 0, + "non-truncated": 816, + "padded": 816, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hashes": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "b9762065cce6f3a6", + "hash_cont_tokens": 
"e02816433ff28daf" + }, + "truncated": 0, + "non-truncated": 948, + "padded": 948, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_aging|5": { + "hashes": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "541a75f071dcf579", + "hash_cont_tokens": "142a4a8a1138a214" + }, + "truncated": 0, + "non-truncated": 892, + "padded": 892, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_sexuality|5": { + "hashes": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "04269e5c5a257dd9", + "hash_cont_tokens": "bc54813e809b796d" + }, + "truncated": 0, + "non-truncated": 524, + "padded": 524, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-international_law|5": { + "hashes": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "d93ba9d9d38e4397", + "hash_cont_tokens": "8ea8c5ff76a15bca" + }, + "truncated": 0, + "non-truncated": 484, + "padded": 484, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-jurisprudence|5": { + "hashes": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "9eeaccd2698b4f5a", + "hash_cont_tokens": "e3a8cd951b6e3469" + }, + "truncated": 0, + "non-truncated": 432, + "padded": 432, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hashes": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "b4f08f544f2b7576", + "hash_cont_tokens": "3e9e0bdc248fd88a" + }, + "truncated": 0, + "non-truncated": 652, + "padded": 648, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-machine_learning|5": { + "hashes": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "900c2a51f1174b9f", + "hash_cont_tokens": "55b12fb138c6a064" + }, + "truncated": 0, + "non-truncated": 448, + "padded": 448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-management|5": { + "hashes": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "6b36efb4689c6eca", + "hash_cont_tokens": "a01d6d39a83c4597" + }, + "truncated": 0, + "non-truncated": 412, + "padded": 412, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-marketing|5": { + "hashes": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "2aaac78a0cfed47a", + "hash_cont_tokens": "6aeaed4d823c98aa" + }, + "truncated": 0, + "non-truncated": 936, + "padded": 936, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-medical_genetics|5": { + "hashes": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "886ca823b41c094a", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + 
"harness|hendrycksTest-miscellaneous|5": { + "hashes": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "72fd71de7675e7d0", + "hash_cont_tokens": "9b0ab02a64603081" + }, + "truncated": 0, + "non-truncated": 3132, + "padded": 3132, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_disputes|5": { + "hashes": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "f3ca0dd8e7a1eb09", + "hash_cont_tokens": "3b8bbe9108e55ce9" + }, + "truncated": 0, + "non-truncated": 1384, + "padded": 1354, + "non-padded": 30, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hashes": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "3e793631e951f23c", + "hash_cont_tokens": "3e9bfc0362e97330" + }, + "truncated": 0, + "non-truncated": 3580, + "padded": 3580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-nutrition|5": { + "hashes": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "59753c2144ea93af", + "hash_cont_tokens": "23b2dc6ee2da4cfc" + }, + "truncated": 0, + "non-truncated": 1224, + "padded": 1224, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-philosophy|5": { + "hashes": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "bd8d3dbed15a8c34", + "hash_cont_tokens": "9f6ff69d23a48783" + }, + "truncated": 0, + "non-truncated": 1244, + "padded": 1244, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-prehistory|5": { + "hashes": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "3573cd87facbb7c5", + "hash_cont_tokens": "d6458d743d875837" + }, + "truncated": 0, + "non-truncated": 1296, + "padded": 1296, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_accounting|5": { + "hashes": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "17e721bc1a7cbb47", + "hash_cont_tokens": "922a195f53a35662" + }, + "truncated": 0, + "non-truncated": 1128, + "padded": 1128, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_law|5": { + "hashes": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "c9f7583fff66d361", + "hash_cont_tokens": "2e590029ef41fbcd" + }, + "truncated": 0, + "non-truncated": 6136, + "padded": 6136, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_medicine|5": { + "hashes": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "40a933f829116f8d", + "hash_cont_tokens": "7cfee54dbddd5a98" + }, + "truncated": 0, + "non-truncated": 1088, + "padded": 1088, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_psychology|5": { + "hashes": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + 
"hash_input_tokens": "0dfb73a8eb3f692c", + "hash_cont_tokens": "a86677b2a45c20e1" + }, + "truncated": 0, + "non-truncated": 2448, + "padded": 2448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-public_relations|5": { + "hashes": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "1710c6ba4c9f3cbd", + "hash_cont_tokens": "0d756ccaae031757" + }, + "truncated": 0, + "non-truncated": 440, + "padded": 440, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-security_studies|5": { + "hashes": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "32a03f1f22a6e103", + "hash_cont_tokens": "b2229bc2cfbf594b" + }, + "truncated": 0, + "non-truncated": 980, + "padded": 980, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-sociology|5": { + "hashes": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "828999f7624cbe7e", + "hash_cont_tokens": "c3a3bdfd177eed5b" + }, + "truncated": 0, + "non-truncated": 804, + "padded": 804, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hashes": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "42054621e718dbee", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-virology|5": { + "hashes": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "6c4f0aa4dc859c04", + "hash_cont_tokens": "af8b3658088cb37f" + }, + "truncated": 0, + "non-truncated": 664, + "padded": 664, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-world_religions|5": { + "hashes": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "6c75d44e092ff24f", + "hash_cont_tokens": "060118bef6de4e0a" + }, + "truncated": 0, + "non-truncated": 684, + "padded": 684, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|truthfulqa:mc|0": { + "hashes": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "2738d7ed7075faa7", + "hash_cont_tokens": "f5da56a132aab151" + }, + "truncated": 0, + "non-truncated": 9996, + "padded": 9996, + "non-padded": 0, + "effective_few_shots": 0.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "d84d18e9a963753d", + "hash_full_prompts": "12b540783521a8e6", + "hash_input_tokens": "5c73a7dce6ccf737", + "hash_cont_tokens": "71d56183130fecbd" + }, + "total_evaluation_time_secondes": "10021.075126886368", + "truncated": 0, + "non-truncated": 111019, + "padded": 110926, + "non-padded": 93, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/Undi95/MM-ReMM-L2-20B/results_2023-11-07T18-30-54.641369.json b/eval-results/Undi95/MM-ReMM-L2-20B/results_2023-11-07T18-30-54.641369.json new file mode 100644 index 0000000000000000000000000000000000000000..2d003832752e3bdadea35265aa434334f9da61ff --- /dev/null 
+++ b/eval-results/Undi95/MM-ReMM-L2-20B/results_2023-11-07T18-30-54.641369.json @@ -0,0 +1,107 @@ +{ + "config_general": { + "lighteval_sha": "167773f1d5d1647c60dadc31c9e731ab7dbcbbad", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "", + "model_name": "Undi95/MM-ReMM-L2-20B", + "model_sha": "37869800c15fb37d017ea83bb50fec6d6141f6ba", + "model_dtype": "torch.float16", + "model_size": "37.36 GB" + }, + "results": { + "harness|drop|3": { + "em": 0.10843120805369127, + "em_stderr": 0.0031841552068159317, + "f1": 0.18655725671140816, + "f1_stderr": 0.003391655697712374 + }, + "harness|gsm8k|5": { + "acc": 0.07733131159969674, + "acc_stderr": 0.00735771352322235 + }, + "harness|winogrande|5": { + "acc": 0.7576953433307024, + "acc_stderr": 0.012042352526174785 + }, + "all": { + "em": 0.10843120805369127, + "em_stderr": 0.0031841552068159317, + "f1": 0.18655725671140816, + "f1_stderr": 0.003391655697712374, + "acc": 0.4175133274651996, + "acc_stderr": 0.009700033024698568 + } + }, + "versions": { + "all": 0, + "harness|drop|3": 1, + "harness|gsm8k|5": 0, + "harness|winogrande|5": 0 + }, + "config_tasks": { + "harness|drop": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|drop|3": { + "hashes": { + "hash_examples": "1d27416e8324e9a3", + "hash_full_prompts": "a5513ff9a741b385", + "hash_input_tokens": "42076f0efbb50aa6", + "hash_cont_tokens": "d772779ca662f4c1" + }, + "truncated": 3, + "non_truncated": 9533, + "padded": 0, + "non_padded": 9536, + "effective_few_shots": 3.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "bda342e47b5099b2", + "hash_cont_tokens": "5ad68bb8d2bab381" + }, + "truncated": 0, + "non_truncated": 1319, + "padded": 0, + "non_padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "c0bedf98cb040854", + "hash_cont_tokens": "f08975ad6f2d5864" + }, + "truncated": 0, + "non_truncated": 1267, + "padded": 2432, + "non_padded": 102, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "9b4d8993161e637d", + "hash_full_prompts": "08215e527b7e60a5", + "hash_input_tokens": "a12f3e3c934bd78b", + "hash_cont_tokens": "8d80264197b98513" + }, + "truncated": 3, + "non_truncated": 12119, + "padded": 2432, + "non_padded": 10957, + "num_truncated_few_shots": 0, + "total_evaluation_time_secondes": 0 + } +} \ No newline at end of file diff --git a/eval-results/Undi95/MXLewd-L2-20B/results_2023-10-03T17-32-13.142085.json b/eval-results/Undi95/MXLewd-L2-20B/results_2023-10-03T17-32-13.142085.json new file mode 100644 index 0000000000000000000000000000000000000000..d777b52112ef36ae84885d0fe4d1e151c81ae127 --- /dev/null +++ b/eval-results/Undi95/MXLewd-L2-20B/results_2023-10-03T17-32-13.142085.json @@ -0,0 +1,1367 @@ +{ + "config_general": { + "model_name": "Undi95/MXLewd-L2-20B", + "model_sha": "ac279478abd9ddb8d1f5adcc548be0287b963adf", + "model_size": "37.36 GB", + "model_dtype": "torch.float16", + "lighteval_sha": "0f318ecf002208468154899217b3ba7c6ae09374", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": 
"" + }, + "results": { + "harness|arc:challenge|25": { + "acc": 0.5938566552901023, + "acc_stderr": 0.014351656690097863, + "acc_norm": 0.6322525597269625, + "acc_norm_stderr": 0.014090995618168477 + }, + "harness|hellaswag|10": { + "acc": 0.6700856403106951, + "acc_stderr": 0.004692208279690597, + "acc_norm": 0.8533160724955188, + "acc_norm_stderr": 0.0035306750148923053 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.36, + "acc_stderr": 0.04824181513244218, + "acc_norm": 0.36, + "acc_norm_stderr": 0.04824181513244218 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.4962962962962963, + "acc_stderr": 0.04319223625811331, + "acc_norm": 0.4962962962962963, + "acc_norm_stderr": 0.04319223625811331 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.5460526315789473, + "acc_stderr": 0.04051646342874142, + "acc_norm": 0.5460526315789473, + "acc_norm_stderr": 0.04051646342874142 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.56, + "acc_stderr": 0.04988876515698589, + "acc_norm": 0.56, + "acc_norm_stderr": 0.04988876515698589 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.6339622641509434, + "acc_stderr": 0.029647813539365245, + "acc_norm": 0.6339622641509434, + "acc_norm_stderr": 0.029647813539365245 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.6736111111111112, + "acc_stderr": 0.03921067198982266, + "acc_norm": 0.6736111111111112, + "acc_norm_stderr": 0.03921067198982266 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.39, + "acc_stderr": 0.04902071300001975, + "acc_norm": 0.39, + "acc_norm_stderr": 0.04902071300001975 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.4, + "acc_stderr": 0.049236596391733084, + "acc_norm": 0.4, + "acc_norm_stderr": 0.049236596391733084 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.32, + "acc_stderr": 0.04688261722621505, + "acc_norm": 0.32, + "acc_norm_stderr": 0.04688261722621505 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.5317919075144508, + "acc_stderr": 0.038047497443647646, + "acc_norm": 0.5317919075144508, + "acc_norm_stderr": 0.038047497443647646 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.3137254901960784, + "acc_stderr": 0.04617034827006716, + "acc_norm": 0.3137254901960784, + "acc_norm_stderr": 0.04617034827006716 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.7, + "acc_stderr": 0.046056618647183814, + "acc_norm": 0.7, + "acc_norm_stderr": 0.046056618647183814 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.4765957446808511, + "acc_stderr": 0.03265019475033582, + "acc_norm": 0.4765957446808511, + "acc_norm_stderr": 0.03265019475033582 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.2894736842105263, + "acc_stderr": 0.04266339443159394, + "acc_norm": 0.2894736842105263, + "acc_norm_stderr": 0.04266339443159394 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.5241379310344828, + "acc_stderr": 0.0416180850350153, + "acc_norm": 0.5241379310344828, + "acc_norm_stderr": 0.0416180850350153 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.3492063492063492, + "acc_stderr": 0.024552292209342647, + "acc_norm": 0.3492063492063492, + "acc_norm_stderr": 0.024552292209342647 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.36507936507936506, + "acc_stderr": 0.04306241259127153, + "acc_norm": 0.36507936507936506, + "acc_norm_stderr": 0.04306241259127153 + }, + 
"harness|hendrycksTest-global_facts|5": { + "acc": 0.35, + "acc_stderr": 0.047937248544110196, + "acc_norm": 0.35, + "acc_norm_stderr": 0.047937248544110196 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.667741935483871, + "acc_stderr": 0.0267955608481228, + "acc_norm": 0.667741935483871, + "acc_norm_stderr": 0.0267955608481228 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.4827586206896552, + "acc_stderr": 0.035158955511656986, + "acc_norm": 0.4827586206896552, + "acc_norm_stderr": 0.035158955511656986 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.55, + "acc_stderr": 0.049999999999999996, + "acc_norm": 0.55, + "acc_norm_stderr": 0.049999999999999996 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.6848484848484848, + "acc_stderr": 0.0362773057502241, + "acc_norm": 0.6848484848484848, + "acc_norm_stderr": 0.0362773057502241 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.7222222222222222, + "acc_stderr": 0.03191178226713546, + "acc_norm": 0.7222222222222222, + "acc_norm_stderr": 0.03191178226713546 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.7979274611398963, + "acc_stderr": 0.028979089794296732, + "acc_norm": 0.7979274611398963, + "acc_norm_stderr": 0.028979089794296732 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.5641025641025641, + "acc_stderr": 0.025141801511177495, + "acc_norm": 0.5641025641025641, + "acc_norm_stderr": 0.025141801511177495 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.34074074074074073, + "acc_stderr": 0.028897748741131147, + "acc_norm": 0.34074074074074073, + "acc_norm_stderr": 0.028897748741131147 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.6134453781512605, + "acc_stderr": 0.03163145807552379, + "acc_norm": 0.6134453781512605, + "acc_norm_stderr": 0.03163145807552379 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.33112582781456956, + "acc_stderr": 0.038425817186598696, + "acc_norm": 0.33112582781456956, + "acc_norm_stderr": 0.038425817186598696 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.744954128440367, + "acc_stderr": 0.01868850085653584, + "acc_norm": 0.744954128440367, + "acc_norm_stderr": 0.01868850085653584 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.5, + "acc_stderr": 0.034099716973523674, + "acc_norm": 0.5, + "acc_norm_stderr": 0.034099716973523674 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.7647058823529411, + "acc_stderr": 0.029771775228145617, + "acc_norm": 0.7647058823529411, + "acc_norm_stderr": 0.029771775228145617 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.7637130801687764, + "acc_stderr": 0.02765215314415927, + "acc_norm": 0.7637130801687764, + "acc_norm_stderr": 0.02765215314415927 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.6860986547085202, + "acc_stderr": 0.03114679648297246, + "acc_norm": 0.6860986547085202, + "acc_norm_stderr": 0.03114679648297246 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.6259541984732825, + "acc_stderr": 0.042438692422305246, + "acc_norm": 0.6259541984732825, + "acc_norm_stderr": 0.042438692422305246 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.7355371900826446, + "acc_stderr": 0.04026187527591207, + "acc_norm": 0.7355371900826446, + "acc_norm_stderr": 0.04026187527591207 + }, + 
"harness|hendrycksTest-jurisprudence|5": { + "acc": 0.7222222222222222, + "acc_stderr": 0.04330043749650742, + "acc_norm": 0.7222222222222222, + "acc_norm_stderr": 0.04330043749650742 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.6809815950920245, + "acc_stderr": 0.03661997551073836, + "acc_norm": 0.6809815950920245, + "acc_norm_stderr": 0.03661997551073836 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.32142857142857145, + "acc_stderr": 0.044328040552915185, + "acc_norm": 0.32142857142857145, + "acc_norm_stderr": 0.044328040552915185 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.7087378640776699, + "acc_stderr": 0.044986763205729224, + "acc_norm": 0.7087378640776699, + "acc_norm_stderr": 0.044986763205729224 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.8418803418803419, + "acc_stderr": 0.02390232554956041, + "acc_norm": 0.8418803418803419, + "acc_norm_stderr": 0.02390232554956041 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.63, + "acc_stderr": 0.04852365870939099, + "acc_norm": 0.63, + "acc_norm_stderr": 0.04852365870939099 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.7726692209450831, + "acc_stderr": 0.014987270640946007, + "acc_norm": 0.7726692209450831, + "acc_norm_stderr": 0.014987270640946007 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.6647398843930635, + "acc_stderr": 0.02541600377316554, + "acc_norm": 0.6647398843930635, + "acc_norm_stderr": 0.02541600377316554 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.4100558659217877, + "acc_stderr": 0.016449708209026078, + "acc_norm": 0.4100558659217877, + "acc_norm_stderr": 0.016449708209026078 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.630718954248366, + "acc_stderr": 0.02763417668960266, + "acc_norm": 0.630718954248366, + "acc_norm_stderr": 0.02763417668960266 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.6720257234726688, + "acc_stderr": 0.026664410886937617, + "acc_norm": 0.6720257234726688, + "acc_norm_stderr": 0.026664410886937617 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.6944444444444444, + "acc_stderr": 0.025630824975621355, + "acc_norm": 0.6944444444444444, + "acc_norm_stderr": 0.025630824975621355 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.44680851063829785, + "acc_stderr": 0.0296582350976669, + "acc_norm": 0.44680851063829785, + "acc_norm_stderr": 0.0296582350976669 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.43089960886571055, + "acc_stderr": 0.012647695889547235, + "acc_norm": 0.43089960886571055, + "acc_norm_stderr": 0.012647695889547235 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.5992647058823529, + "acc_stderr": 0.02976826352893311, + "acc_norm": 0.5992647058823529, + "acc_norm_stderr": 0.02976826352893311 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.5996732026143791, + "acc_stderr": 0.019821843688271765, + "acc_norm": 0.5996732026143791, + "acc_norm_stderr": 0.019821843688271765 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.5454545454545454, + "acc_stderr": 0.04769300568972744, + "acc_norm": 0.5454545454545454, + "acc_norm_stderr": 0.04769300568972744 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.636734693877551, + "acc_stderr": 0.030789051139030806, + "acc_norm": 0.636734693877551, + "acc_norm_stderr": 0.030789051139030806 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.7661691542288557, + "acc_stderr": 
0.02992941540834839, + "acc_norm": 0.7661691542288557, + "acc_norm_stderr": 0.02992941540834839 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.86, + "acc_stderr": 0.034873508801977704, + "acc_norm": 0.86, + "acc_norm_stderr": 0.034873508801977704 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.4578313253012048, + "acc_stderr": 0.038786267710023595, + "acc_norm": 0.4578313253012048, + "acc_norm_stderr": 0.038786267710023595 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.7485380116959064, + "acc_stderr": 0.033275044238468436, + "acc_norm": 0.7485380116959064, + "acc_norm_stderr": 0.033275044238468436 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.35128518971848227, + "mc1_stderr": 0.016711358163544403, + "mc2": 0.5164709446147603, + "mc2_stderr": 0.015892065045890465 + }, + "all": { + "acc": 0.5756106760468652, + "acc_stderr": 0.034247535703005975, + "acc_norm": 0.5793670546336733, + "acc_norm_stderr": 0.03422343071424788, + "mc1": 0.35128518971848227, + "mc1_stderr": 0.016711358163544403, + "mc2": 0.5164709446147603, + "mc2_stderr": 0.015892065045890465 + } + }, + "versions": { + "harness|arc:challenge|25": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + 
"harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "all": 0 + }, + "config_tasks": { + "harness|arc:challenge": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + 
"harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task" + }, + "summary_tasks": { + "harness|arc:challenge|25": { + "hashes": { + "hash_examples": "17b0cae357c0259e", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "3722289b79076c44", + "hash_cont_tokens": "e8abf848493b50f7" + }, + "truncated": 0, + "non-truncated": 4687, + "padded": 4687, + "non-padded": 0, + "effective_few_shots": 25.0, + "num_truncated_few_shots": 0 + }, + "harness|hellaswag|10": { + "hashes": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "ececd684171f1ef2", + "hash_cont_tokens": "9fe0a5c42e1532db" + }, + "truncated": 0, + "non-truncated": 40168, + "padded": 40113, + "non-padded": 55, + "effective_few_shots": 10.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hashes": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "c54ff61ad0273dd7", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-anatomy|5": { + "hashes": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "be31a1e22aef5f90", + "hash_cont_tokens": "f11971a765cb609f" + }, + "truncated": 0, + "non-truncated": 540, + "padded": 540, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-astronomy|5": { + "hashes": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "277a7b1fad566940", + "hash_cont_tokens": "440a970fadecdc7b" + }, + "truncated": 0, + "non-truncated": 608, + "padded": 608, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-business_ethics|5": { + "hashes": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "ba552605bc116de5", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hashes": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "428c7563d0b98ab9", + "hash_cont_tokens": "7ecd60c25b9bfe5b" + }, + "truncated": 0, + "non-truncated": 1060, + "padded": 1060, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_biology|5": { + "hashes": { + "hash_examples": 
"ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "da036601573942e2", + "hash_cont_tokens": "875cde3af7a0ee14" + }, + "truncated": 0, + "non-truncated": 576, + "padded": 576, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_chemistry|5": { + "hashes": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "94e0196d6aded13d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_computer_science|5": { + "hashes": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "6e4d0f4a8d36690b", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_mathematics|5": { + "hashes": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "614054d17109a25d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_medicine|5": { + "hashes": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "081bb2b524defd1c", + "hash_cont_tokens": "702fb6d82ff0d6ac" + }, + "truncated": 0, + "non-truncated": 692, + "padded": 692, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_physics|5": { + "hashes": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "5421d9a1af86cbd4", + "hash_cont_tokens": "f7b8097afc16a47c" + }, + "truncated": 0, + "non-truncated": 408, + "padded": 408, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-computer_security|5": { + "hashes": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "5e6b70ecb333cf18", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hashes": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "c2ef11a87264ceed", + "hash_cont_tokens": "aa0e8bc655f2f641" + }, + "truncated": 0, + "non-truncated": 940, + "padded": 940, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-econometrics|5": { + "hashes": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "ecaccd912a4c3978", + "hash_cont_tokens": "b1cc6e7e9fcd3827" + }, + "truncated": 0, + "non-truncated": 456, + "padded": 456, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hashes": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "1590c84291399be8", + "hash_cont_tokens": 
"2425a3f084a591ef" + }, + "truncated": 0, + "non-truncated": 580, + "padded": 580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hashes": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "3269597f715b0da1", + "hash_cont_tokens": "bd87bf0c060fd925" + }, + "truncated": 0, + "non-truncated": 1512, + "padded": 1512, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-formal_logic|5": { + "hashes": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "a2800d20f3ab8d7c", + "hash_cont_tokens": "eb8932890e0605db" + }, + "truncated": 0, + "non-truncated": 504, + "padded": 504, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-global_facts|5": { + "hashes": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "94ed44b3772505ad", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_biology|5": { + "hashes": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "24423acb928db768", + "hash_cont_tokens": "1ddcb86d28cde266" + }, + "truncated": 0, + "non-truncated": 1240, + "padded": 1240, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hashes": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "831ff35c474e5cef", + "hash_cont_tokens": "176c8dcff38c5f8f" + }, + "truncated": 0, + "non-truncated": 812, + "padded": 812, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hashes": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "a20a96b44dcc5b30", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hashes": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "5002f4ac8b1562ca", + "hash_cont_tokens": "674fc454bdc5ac93" + }, + "truncated": 0, + "non-truncated": 660, + "padded": 656, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_geography|5": { + "hashes": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "7c5547c7da5bc793", + "hash_cont_tokens": "03a5012b916274ea" + }, + "truncated": 0, + "non-truncated": 792, + "padded": 792, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hashes": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "f62991cb6a496b05", + "hash_cont_tokens": "873d2aab226ba1d8" + }, + "truncated": 0, + "non-truncated": 772, + "padded": 772, + "non-padded": 0, + 
"effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hashes": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "4cef2aff6e3d59ed", + "hash_cont_tokens": "c583432ad27fcfe0" + }, + "truncated": 0, + "non-truncated": 1560, + "padded": 1560, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hashes": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "6e2577ea4082ed2b", + "hash_cont_tokens": "d7907b61bcb8c123" + }, + "truncated": 0, + "non-truncated": 1080, + "padded": 1080, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hashes": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "c5fc9aeb1079c8e4", + "hash_cont_tokens": "f47f041de50333b9" + }, + "truncated": 0, + "non-truncated": 952, + "padded": 952, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_physics|5": { + "hashes": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "555fc385cffa84ca", + "hash_cont_tokens": "0d56317b3e5eedb5" + }, + "truncated": 0, + "non-truncated": 604, + "padded": 604, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hashes": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "febd23cbf9973b7f", + "hash_cont_tokens": "09ba1243e7390c0f" + }, + "truncated": 0, + "non-truncated": 2180, + "padded": 2180, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hashes": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "400e55b56ee6fbd7", + "hash_cont_tokens": "9cc29889c3d3f77d" + }, + "truncated": 0, + "non-truncated": 864, + "padded": 864, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hashes": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "c639cce12a46ebad", + "hash_cont_tokens": "cdd0b3dc06d933e5" + }, + "truncated": 0, + "non-truncated": 816, + "padded": 816, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hashes": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "b9762065cce6f3a6", + "hash_cont_tokens": "e02816433ff28daf" + }, + "truncated": 0, + "non-truncated": 948, + "padded": 948, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_aging|5": { + "hashes": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "541a75f071dcf579", + "hash_cont_tokens": "142a4a8a1138a214" + }, + "truncated": 0, + "non-truncated": 892, + "padded": 892, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + 
"harness|hendrycksTest-human_sexuality|5": { + "hashes": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "04269e5c5a257dd9", + "hash_cont_tokens": "bc54813e809b796d" + }, + "truncated": 0, + "non-truncated": 524, + "padded": 524, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-international_law|5": { + "hashes": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "d93ba9d9d38e4397", + "hash_cont_tokens": "8ea8c5ff76a15bca" + }, + "truncated": 0, + "non-truncated": 484, + "padded": 484, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-jurisprudence|5": { + "hashes": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "9eeaccd2698b4f5a", + "hash_cont_tokens": "e3a8cd951b6e3469" + }, + "truncated": 0, + "non-truncated": 432, + "padded": 432, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hashes": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "b4f08f544f2b7576", + "hash_cont_tokens": "3e9e0bdc248fd88a" + }, + "truncated": 0, + "non-truncated": 652, + "padded": 648, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-machine_learning|5": { + "hashes": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "900c2a51f1174b9f", + "hash_cont_tokens": "55b12fb138c6a064" + }, + "truncated": 0, + "non-truncated": 448, + "padded": 448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-management|5": { + "hashes": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "6b36efb4689c6eca", + "hash_cont_tokens": "a01d6d39a83c4597" + }, + "truncated": 0, + "non-truncated": 412, + "padded": 412, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-marketing|5": { + "hashes": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "2aaac78a0cfed47a", + "hash_cont_tokens": "6aeaed4d823c98aa" + }, + "truncated": 0, + "non-truncated": 936, + "padded": 936, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-medical_genetics|5": { + "hashes": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "886ca823b41c094a", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-miscellaneous|5": { + "hashes": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "72fd71de7675e7d0", + "hash_cont_tokens": "9b0ab02a64603081" + }, + "truncated": 0, + "non-truncated": 3132, + "padded": 3132, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_disputes|5": { + "hashes": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": 
"f3ca0dd8e7a1eb09", + "hash_cont_tokens": "3b8bbe9108e55ce9" + }, + "truncated": 0, + "non-truncated": 1384, + "padded": 1354, + "non-padded": 30, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hashes": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "3e793631e951f23c", + "hash_cont_tokens": "3e9bfc0362e97330" + }, + "truncated": 0, + "non-truncated": 3580, + "padded": 3580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-nutrition|5": { + "hashes": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "59753c2144ea93af", + "hash_cont_tokens": "23b2dc6ee2da4cfc" + }, + "truncated": 0, + "non-truncated": 1224, + "padded": 1224, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-philosophy|5": { + "hashes": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "bd8d3dbed15a8c34", + "hash_cont_tokens": "9f6ff69d23a48783" + }, + "truncated": 0, + "non-truncated": 1244, + "padded": 1244, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-prehistory|5": { + "hashes": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "3573cd87facbb7c5", + "hash_cont_tokens": "d6458d743d875837" + }, + "truncated": 0, + "non-truncated": 1296, + "padded": 1296, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_accounting|5": { + "hashes": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "17e721bc1a7cbb47", + "hash_cont_tokens": "922a195f53a35662" + }, + "truncated": 0, + "non-truncated": 1128, + "padded": 1128, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_law|5": { + "hashes": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "c9f7583fff66d361", + "hash_cont_tokens": "2e590029ef41fbcd" + }, + "truncated": 0, + "non-truncated": 6136, + "padded": 6136, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_medicine|5": { + "hashes": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "40a933f829116f8d", + "hash_cont_tokens": "7cfee54dbddd5a98" + }, + "truncated": 0, + "non-truncated": 1088, + "padded": 1088, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_psychology|5": { + "hashes": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "0dfb73a8eb3f692c", + "hash_cont_tokens": "a86677b2a45c20e1" + }, + "truncated": 0, + "non-truncated": 2448, + "padded": 2448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-public_relations|5": { + "hashes": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "1710c6ba4c9f3cbd", + "hash_cont_tokens": "0d756ccaae031757" + }, + "truncated": 0, + "non-truncated": 440, + "padded": 440, + "non-padded": 0, 
+ "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-security_studies|5": { + "hashes": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "32a03f1f22a6e103", + "hash_cont_tokens": "b2229bc2cfbf594b" + }, + "truncated": 0, + "non-truncated": 980, + "padded": 980, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-sociology|5": { + "hashes": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "828999f7624cbe7e", + "hash_cont_tokens": "c3a3bdfd177eed5b" + }, + "truncated": 0, + "non-truncated": 804, + "padded": 804, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hashes": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "42054621e718dbee", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-virology|5": { + "hashes": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "6c4f0aa4dc859c04", + "hash_cont_tokens": "af8b3658088cb37f" + }, + "truncated": 0, + "non-truncated": 664, + "padded": 664, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-world_religions|5": { + "hashes": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "6c75d44e092ff24f", + "hash_cont_tokens": "060118bef6de4e0a" + }, + "truncated": 0, + "non-truncated": 684, + "padded": 684, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|truthfulqa:mc|0": { + "hashes": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "2738d7ed7075faa7", + "hash_cont_tokens": "f5da56a132aab151" + }, + "truncated": 0, + "non-truncated": 9996, + "padded": 9996, + "non-padded": 0, + "effective_few_shots": 0.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "d84d18e9a963753d", + "hash_full_prompts": "12b540783521a8e6", + "hash_input_tokens": "5c73a7dce6ccf737", + "hash_cont_tokens": "71d56183130fecbd" + }, + "total_evaluation_time_secondes": "10006.589550971985", + "truncated": 0, + "non-truncated": 111019, + "padded": 110926, + "non-padded": 93, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/Undi95/MXLewd-L2-20B/results_2023-10-26T15-01-29.901026.json b/eval-results/Undi95/MXLewd-L2-20B/results_2023-10-26T15-01-29.901026.json new file mode 100644 index 0000000000000000000000000000000000000000..781fa9f1f5ee449a068573105f3a807a13246d27 --- /dev/null +++ b/eval-results/Undi95/MXLewd-L2-20B/results_2023-10-26T15-01-29.901026.json @@ -0,0 +1,107 @@ +{ + "config_general": { + "model_name": "Undi95/MXLewd-L2-20B", + "model_sha": "ac279478abd9ddb8d1f5adcc548be0287b963adf", + "model_size": "37.36 GB", + "model_dtype": "torch.float16", + "lighteval_sha": "0f318ecf002208468154899217b3ba7c6ae09374", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|drop|3": { + "em": 0.0759228187919463, + 
"em_stderr": 0.002712563641278116, + "f1": 0.1446036073825498, + "f1_stderr": 0.0029538450645220115 + }, + "harness|gsm8k|5": { + "acc": 0.10917361637604246, + "acc_stderr": 0.008590089300511132 + }, + "harness|winogrande|5": { + "acc": 0.760852407261247, + "acc_stderr": 0.01198854184484391 + }, + "all": { + "em": 0.0759228187919463, + "em_stderr": 0.002712563641278116, + "f1": 0.1446036073825498, + "f1_stderr": 0.0029538450645220115, + "acc": 0.43501301181864477, + "acc_stderr": 0.01028931557267752 + } + }, + "versions": { + "harness|drop|3": 1, + "harness|gsm8k|5": 0, + "harness|winogrande|5": 0, + "all": 0 + }, + "config_tasks": { + "harness|drop": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|drop|3": { + "hashes": { + "hash_examples": "1d27416e8324e9a3", + "hash_full_prompts": "a5513ff9a741b385", + "hash_input_tokens": "42076f0efbb50aa6", + "hash_cont_tokens": "75ddad8d87b475bf" + }, + "truncated": 3, + "non-truncated": 9533, + "padded": 0, + "non-padded": 9536, + "effective_few_shots": 3.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "bda342e47b5099b2", + "hash_cont_tokens": "eb3dd246d8489442" + }, + "truncated": 0, + "non-truncated": 1319, + "padded": 0, + "non-padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "c0bedf98cb040854", + "hash_cont_tokens": "f08975ad6f2d5864" + }, + "truncated": 0, + "non-truncated": 2534, + "padded": 2432, + "non-padded": 102, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "9b4d8993161e637d", + "hash_full_prompts": "08215e527b7e60a5", + "hash_input_tokens": "a12f3e3c934bd78b", + "hash_cont_tokens": "1d0313eee68cf8bb" + }, + "total_evaluation_time_secondes": "21971.495183467865", + "truncated": 3, + "non-truncated": 13386, + "padded": 2432, + "non-padded": 10957, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/Undi95/Mistral-11B-TestBench10/results_2023-10-11T20-32-37.017457.json b/eval-results/Undi95/Mistral-11B-TestBench10/results_2023-10-11T20-32-37.017457.json new file mode 100644 index 0000000000000000000000000000000000000000..9274a0cf1c1b51cad56cb2e9753f73897d83d38d --- /dev/null +++ b/eval-results/Undi95/Mistral-11B-TestBench10/results_2023-10-11T20-32-37.017457.json @@ -0,0 +1,1367 @@ +{ + "config_general": { + "model_name": "Undi95/Mistral-11B-TestBench10", + "model_sha": "730429d6132c7702885840098885081c2df878df", + "model_size": "20.74 GB", + "model_dtype": "torch.bfloat16", + "lighteval_sha": "0f318ecf002208468154899217b3ba7c6ae09374", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|arc:challenge|25": { + "acc": 0.6220136518771331, + "acc_stderr": 0.014169664520303098, + "acc_norm": 0.6424914675767918, + "acc_norm_stderr": 0.014005494275916576 + }, + "harness|hellaswag|10": { + "acc": 0.6533559051981677, + "acc_stderr": 0.004749286071559565, + "acc_norm": 0.8423620792670783, + "acc_norm_stderr": 0.003636564286352674 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.36, + "acc_stderr": 0.04824181513244218, + 
"acc_norm": 0.36, + "acc_norm_stderr": 0.04824181513244218 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.6222222222222222, + "acc_stderr": 0.04188307537595852, + "acc_norm": 0.6222222222222222, + "acc_norm_stderr": 0.04188307537595852 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.6710526315789473, + "acc_stderr": 0.038234289699266046, + "acc_norm": 0.6710526315789473, + "acc_norm_stderr": 0.038234289699266046 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.58, + "acc_stderr": 0.049604496374885836, + "acc_norm": 0.58, + "acc_norm_stderr": 0.049604496374885836 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.7056603773584905, + "acc_stderr": 0.02804918631569525, + "acc_norm": 0.7056603773584905, + "acc_norm_stderr": 0.02804918631569525 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.7222222222222222, + "acc_stderr": 0.037455547914624555, + "acc_norm": 0.7222222222222222, + "acc_norm_stderr": 0.037455547914624555 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.47, + "acc_stderr": 0.05016135580465919, + "acc_norm": 0.47, + "acc_norm_stderr": 0.05016135580465919 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.53, + "acc_stderr": 0.05016135580465919, + "acc_norm": 0.53, + "acc_norm_stderr": 0.05016135580465919 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.33, + "acc_stderr": 0.04725815626252604, + "acc_norm": 0.33, + "acc_norm_stderr": 0.04725815626252604 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.6705202312138728, + "acc_stderr": 0.03583901754736412, + "acc_norm": 0.6705202312138728, + "acc_norm_stderr": 0.03583901754736412 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.4117647058823529, + "acc_stderr": 0.048971049527263666, + "acc_norm": 0.4117647058823529, + "acc_norm_stderr": 0.048971049527263666 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.77, + "acc_stderr": 0.04229525846816506, + "acc_norm": 0.77, + "acc_norm_stderr": 0.04229525846816506 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.5446808510638298, + "acc_stderr": 0.03255525359340354, + "acc_norm": 0.5446808510638298, + "acc_norm_stderr": 0.03255525359340354 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.47368421052631576, + "acc_stderr": 0.046970851366478626, + "acc_norm": 0.47368421052631576, + "acc_norm_stderr": 0.046970851366478626 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.5517241379310345, + "acc_stderr": 0.04144311810878152, + "acc_norm": 0.5517241379310345, + "acc_norm_stderr": 0.04144311810878152 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.3941798941798942, + "acc_stderr": 0.02516798233389414, + "acc_norm": 0.3941798941798942, + "acc_norm_stderr": 0.02516798233389414 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.42857142857142855, + "acc_stderr": 0.04426266681379909, + "acc_norm": 0.42857142857142855, + "acc_norm_stderr": 0.04426266681379909 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.38, + "acc_stderr": 0.04878317312145632, + "acc_norm": 0.38, + "acc_norm_stderr": 0.04878317312145632 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.7451612903225806, + "acc_stderr": 0.024790118459332208, + "acc_norm": 0.7451612903225806, + "acc_norm_stderr": 0.024790118459332208 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.49261083743842365, + "acc_stderr": 0.035176035403610084, + "acc_norm": 
0.49261083743842365, + "acc_norm_stderr": 0.035176035403610084 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.69, + "acc_stderr": 0.04648231987117316, + "acc_norm": 0.69, + "acc_norm_stderr": 0.04648231987117316 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.7636363636363637, + "acc_stderr": 0.03317505930009182, + "acc_norm": 0.7636363636363637, + "acc_norm_stderr": 0.03317505930009182 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.797979797979798, + "acc_stderr": 0.028606204289229872, + "acc_norm": 0.797979797979798, + "acc_norm_stderr": 0.028606204289229872 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.8860103626943006, + "acc_stderr": 0.022935144053919443, + "acc_norm": 0.8860103626943006, + "acc_norm_stderr": 0.022935144053919443 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.6820512820512821, + "acc_stderr": 0.02361088430892786, + "acc_norm": 0.6820512820512821, + "acc_norm_stderr": 0.02361088430892786 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.36666666666666664, + "acc_stderr": 0.029381620726465073, + "acc_norm": 0.36666666666666664, + "acc_norm_stderr": 0.029381620726465073 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.6512605042016807, + "acc_stderr": 0.030956636328566548, + "acc_norm": 0.6512605042016807, + "acc_norm_stderr": 0.030956636328566548 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.31788079470198677, + "acc_stderr": 0.038020397601079024, + "acc_norm": 0.31788079470198677, + "acc_norm_stderr": 0.038020397601079024 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.8348623853211009, + "acc_stderr": 0.015919557829976044, + "acc_norm": 0.8348623853211009, + "acc_norm_stderr": 0.015919557829976044 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.5416666666666666, + "acc_stderr": 0.03398110890294636, + "acc_norm": 0.5416666666666666, + "acc_norm_stderr": 0.03398110890294636 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.8137254901960784, + "acc_stderr": 0.02732547096671631, + "acc_norm": 0.8137254901960784, + "acc_norm_stderr": 0.02732547096671631 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.7890295358649789, + "acc_stderr": 0.02655837250266192, + "acc_norm": 0.7890295358649789, + "acc_norm_stderr": 0.02655837250266192 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.6681614349775785, + "acc_stderr": 0.03160295143776679, + "acc_norm": 0.6681614349775785, + "acc_norm_stderr": 0.03160295143776679 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.7862595419847328, + "acc_stderr": 0.0359546161177469, + "acc_norm": 0.7862595419847328, + "acc_norm_stderr": 0.0359546161177469 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.768595041322314, + "acc_stderr": 0.03849856098794088, + "acc_norm": 0.768595041322314, + "acc_norm_stderr": 0.03849856098794088 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.7777777777777778, + "acc_stderr": 0.040191074725573483, + "acc_norm": 0.7777777777777778, + "acc_norm_stderr": 0.040191074725573483 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.7852760736196319, + "acc_stderr": 0.032262193772867744, + "acc_norm": 0.7852760736196319, + "acc_norm_stderr": 0.032262193772867744 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.5, + "acc_stderr": 0.04745789978762494, + 
"acc_norm": 0.5, + "acc_norm_stderr": 0.04745789978762494 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.7961165048543689, + "acc_stderr": 0.039891398595317706, + "acc_norm": 0.7961165048543689, + "acc_norm_stderr": 0.039891398595317706 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.8717948717948718, + "acc_stderr": 0.021901905115073325, + "acc_norm": 0.8717948717948718, + "acc_norm_stderr": 0.021901905115073325 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.71, + "acc_stderr": 0.045604802157206845, + "acc_norm": 0.71, + "acc_norm_stderr": 0.045604802157206845 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.8122605363984674, + "acc_stderr": 0.013964393769899133, + "acc_norm": 0.8122605363984674, + "acc_norm_stderr": 0.013964393769899133 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.708092485549133, + "acc_stderr": 0.024476994076247337, + "acc_norm": 0.708092485549133, + "acc_norm_stderr": 0.024476994076247337 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.40782122905027934, + "acc_stderr": 0.016435865260914742, + "acc_norm": 0.40782122905027934, + "acc_norm_stderr": 0.016435865260914742 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.7124183006535948, + "acc_stderr": 0.02591780611714716, + "acc_norm": 0.7124183006535948, + "acc_norm_stderr": 0.02591780611714716 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.6977491961414791, + "acc_stderr": 0.02608270069539966, + "acc_norm": 0.6977491961414791, + "acc_norm_stderr": 0.02608270069539966 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.7098765432098766, + "acc_stderr": 0.025251173936495026, + "acc_norm": 0.7098765432098766, + "acc_norm_stderr": 0.025251173936495026 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.46099290780141844, + "acc_stderr": 0.029736592526424438, + "acc_norm": 0.46099290780141844, + "acc_norm_stderr": 0.029736592526424438 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.4498044328552803, + "acc_stderr": 0.01270572149856511, + "acc_norm": 0.4498044328552803, + "acc_norm_stderr": 0.01270572149856511 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.6838235294117647, + "acc_stderr": 0.028245687391462923, + "acc_norm": 0.6838235294117647, + "acc_norm_stderr": 0.028245687391462923 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.6633986928104575, + "acc_stderr": 0.019117213911495144, + "acc_norm": 0.6633986928104575, + "acc_norm_stderr": 0.019117213911495144 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.6636363636363637, + "acc_stderr": 0.04525393596302506, + "acc_norm": 0.6636363636363637, + "acc_norm_stderr": 0.04525393596302506 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.7346938775510204, + "acc_stderr": 0.028263889943784603, + "acc_norm": 0.7346938775510204, + "acc_norm_stderr": 0.028263889943784603 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.8557213930348259, + "acc_stderr": 0.024845753212306053, + "acc_norm": 0.8557213930348259, + "acc_norm_stderr": 0.024845753212306053 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.83, + "acc_stderr": 0.0377525168068637, + "acc_norm": 0.83, + "acc_norm_stderr": 0.0377525168068637 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.5421686746987951, + "acc_stderr": 0.0387862677100236, + "acc_norm": 0.5421686746987951, + "acc_norm_stderr": 0.0387862677100236 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 
0.8362573099415205, + "acc_stderr": 0.028380919596145866, + "acc_norm": 0.8362573099415205, + "acc_norm_stderr": 0.028380919596145866 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.39412484700122397, + "mc1_stderr": 0.017106588140700322, + "mc2": 0.5556543619352063, + "mc2_stderr": 0.015507002997196854 + }, + "all": { + "acc": 0.6389303587566675, + "acc_stderr": 0.033080650268054235, + "acc_norm": 0.6424809348544399, + "acc_norm_stderr": 0.033059008030264514, + "mc1": 0.39412484700122397, + "mc1_stderr": 0.017106588140700322, + "mc2": 0.5556543619352063, + "mc2_stderr": 0.015507002997196854 + } + }, + "versions": { + "harness|arc:challenge|25": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + 
"harness|truthfulqa:mc|0": 1, + "all": 0 + }, + "config_tasks": { + "harness|arc:challenge": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + 
"harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task" + }, + "summary_tasks": { + "harness|arc:challenge|25": { + "hashes": { + "hash_examples": "17b0cae357c0259e", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "e43adcaa871b1364", + "hash_cont_tokens": "289aa98c400841d8" + }, + "truncated": 0, + "non-truncated": 4687, + "padded": 4684, + "non-padded": 3, + "effective_few_shots": 25.0, + "num_truncated_few_shots": 0 + }, + "harness|hellaswag|10": { + "hashes": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "08da6b3d0798f3e5", + "hash_cont_tokens": "ac460260c3e6efc9" + }, + "truncated": 0, + "non-truncated": 40168, + "padded": 40039, + "non-padded": 129, + "effective_few_shots": 10.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hashes": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "5e2b26eb9b4d08bf", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-anatomy|5": { + "hashes": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "d33cda9df28030eb", + "hash_cont_tokens": "a52a4f60d98cbe5c" + }, + "truncated": 0, + "non-truncated": 540, + "padded": 540, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-astronomy|5": { + "hashes": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "0dd50c500d64c57d", + "hash_cont_tokens": "10f7d8eeba97841d" + }, + "truncated": 0, + "non-truncated": 608, + "padded": 608, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-business_ethics|5": { + "hashes": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "40b524d0df3defc2", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hashes": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "1f87d12d677e0dfd", + "hash_cont_tokens": "edef9975ba9165b5" + }, + "truncated": 0, + "non-truncated": 1060, + "padded": 1056, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_biology|5": { + "hashes": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "dd6d69d8b13afbeb", + "hash_cont_tokens": "0aa103ec6602280b" + }, + "truncated": 0, + "non-truncated": 576, + "padded": 572, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_chemistry|5": { + "hashes": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "d45f3c401a00e97e", + "hash_cont_tokens": 
"17b868b63507f9a3" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_computer_science|5": { + "hashes": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "c04f21d954ae67b2", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_mathematics|5": { + "hashes": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "e7de03b4e1a407d8", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_medicine|5": { + "hashes": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "9ce9516475f0b09c", + "hash_cont_tokens": "1979021dbc698754" + }, + "truncated": 0, + "non-truncated": 692, + "padded": 684, + "non-padded": 8, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_physics|5": { + "hashes": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "f749592a0d6c967d", + "hash_cont_tokens": "7cf7fe2bab00acbd" + }, + "truncated": 0, + "non-truncated": 408, + "padded": 404, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-computer_security|5": { + "hashes": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "1a6dccf2066f3598", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hashes": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "6ce98c8aec8e7514", + "hash_cont_tokens": "903f64eed2b0d217" + }, + "truncated": 0, + "non-truncated": 940, + "padded": 940, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-econometrics|5": { + "hashes": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "7794b03bf6b9bb11", + "hash_cont_tokens": "721ae6c5302c4bf2" + }, + "truncated": 0, + "non-truncated": 456, + "padded": 456, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hashes": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "e47ff85e05850517", + "hash_cont_tokens": "15a738960ed3e587" + }, + "truncated": 0, + "non-truncated": 580, + "padded": 580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hashes": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "2ce6901704311790", + "hash_cont_tokens": "c96470462fc71683" + }, + "truncated": 0, + "non-truncated": 1512, + "padded": 1512, + "non-padded": 0, + "effective_few_shots": 5.0, + 
"num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-formal_logic|5": { + "hashes": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "fa49c3faa72a3955", + "hash_cont_tokens": "0e1ce025c9d6ee7e" + }, + "truncated": 0, + "non-truncated": 504, + "padded": 504, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-global_facts|5": { + "hashes": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "38992a391c7040d5", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 396, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_biology|5": { + "hashes": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "4944fad6e0578120", + "hash_cont_tokens": "e34d57f7d3c4ca16" + }, + "truncated": 0, + "non-truncated": 1240, + "padded": 1240, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hashes": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "bec955dfccee0331", + "hash_cont_tokens": "e8482d44df4b3740" + }, + "truncated": 0, + "non-truncated": 812, + "padded": 796, + "non-padded": 16, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hashes": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "2ccfe020e0a8e824", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hashes": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "5e5e8bf3808e0ead", + "hash_cont_tokens": "d63e679a49418339" + }, + "truncated": 0, + "non-truncated": 660, + "padded": 656, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_geography|5": { + "hashes": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "6a624d76e1b40f9d", + "hash_cont_tokens": "d78483e286d06f1a" + }, + "truncated": 0, + "non-truncated": 792, + "padded": 792, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hashes": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "8340aed0285230f4", + "hash_cont_tokens": "691cdff71ff5fe57" + }, + "truncated": 0, + "non-truncated": 772, + "padded": 772, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hashes": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "ca47137b1f3a769c", + "hash_cont_tokens": "d5ad4c5bdca967ad" + }, + "truncated": 0, + "non-truncated": 1560, + "padded": 1560, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + 
"hashes": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "c9d341ab62890f30", + "hash_cont_tokens": "8f631ca5687dd0d4" + }, + "truncated": 0, + "non-truncated": 1080, + "padded": 1080, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hashes": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "62573d06618ae7df", + "hash_cont_tokens": "7321048a28451473" + }, + "truncated": 0, + "non-truncated": 952, + "padded": 952, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_physics|5": { + "hashes": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "ddddcaae96263221", + "hash_cont_tokens": "bb137581f269861c" + }, + "truncated": 0, + "non-truncated": 604, + "padded": 604, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hashes": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "ef9c1ae343139fdd", + "hash_cont_tokens": "b455cab2675bd863" + }, + "truncated": 0, + "non-truncated": 2180, + "padded": 2161, + "non-padded": 19, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hashes": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "eb4abd87b0e863cc", + "hash_cont_tokens": "1b3196fec7e58037" + }, + "truncated": 0, + "non-truncated": 864, + "padded": 864, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hashes": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "63548c7fa9ba7a78", + "hash_cont_tokens": "a331dedc2aa01b3e" + }, + "truncated": 0, + "non-truncated": 816, + "padded": 816, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hashes": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "83c5da18bfa50812", + "hash_cont_tokens": "d0fbe030b8c8c2bf" + }, + "truncated": 0, + "non-truncated": 948, + "padded": 948, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_aging|5": { + "hashes": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "c93c778cb8c58a32", + "hash_cont_tokens": "1dd29c3755494850" + }, + "truncated": 0, + "non-truncated": 892, + "padded": 892, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_sexuality|5": { + "hashes": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "1daed91f54b42f7d", + "hash_cont_tokens": "c85573f663c10691" + }, + "truncated": 0, + "non-truncated": 524, + "padded": 524, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-international_law|5": { + "hashes": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": 
"cfdae69f75ee8670", + "hash_cont_tokens": "d263804ba918154f" + }, + "truncated": 0, + "non-truncated": 484, + "padded": 484, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-jurisprudence|5": { + "hashes": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "173979adbb5ab44e", + "hash_cont_tokens": "581986691a84ece8" + }, + "truncated": 0, + "non-truncated": 432, + "padded": 432, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hashes": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "7b7d06271aff55ff", + "hash_cont_tokens": "55a858b28bbda458" + }, + "truncated": 0, + "non-truncated": 652, + "padded": 652, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-machine_learning|5": { + "hashes": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "ca062cfd7c7fddcb", + "hash_cont_tokens": "e99d3d3efd4ac7a3" + }, + "truncated": 0, + "non-truncated": 448, + "padded": 445, + "non-padded": 3, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-management|5": { + "hashes": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "fc47171ffb714da3", + "hash_cont_tokens": "13d9dc56bca34726" + }, + "truncated": 0, + "non-truncated": 412, + "padded": 412, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-marketing|5": { + "hashes": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "aa29e9d883670c8f", + "hash_cont_tokens": "2700ea26933916a2" + }, + "truncated": 0, + "non-truncated": 936, + "padded": 936, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-medical_genetics|5": { + "hashes": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "88ad044b653ecaa5", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-miscellaneous|5": { + "hashes": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "f9e7e01573277484", + "hash_cont_tokens": "7bf4341c79587250" + }, + "truncated": 0, + "non-truncated": 3132, + "padded": 3132, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_disputes|5": { + "hashes": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "03728b9e48594c28", + "hash_cont_tokens": "38a48e9de6976f00" + }, + "truncated": 0, + "non-truncated": 1384, + "padded": 1360, + "non-padded": 24, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hashes": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "04a903966514d177", + "hash_cont_tokens": "761c4dc187689d89" + }, + "truncated": 0, + "non-truncated": 3580, + "padded": 3580, + "non-padded": 0, + "effective_few_shots": 5.0, + 
"num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-nutrition|5": { + "hashes": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "a2176d3ac6f01cf0", + "hash_cont_tokens": "65005bd7d6f6012a" + }, + "truncated": 0, + "non-truncated": 1224, + "padded": 1224, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-philosophy|5": { + "hashes": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "a96dc872948245a8", + "hash_cont_tokens": "0b47934fb6314dec" + }, + "truncated": 0, + "non-truncated": 1244, + "padded": 1244, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-prehistory|5": { + "hashes": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "e0b03637947e9efa", + "hash_cont_tokens": "3f20acd855ee0a29" + }, + "truncated": 0, + "non-truncated": 1296, + "padded": 1296, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_accounting|5": { + "hashes": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "0b4c6d0e49c47ab4", + "hash_cont_tokens": "8f122ba881355d4b" + }, + "truncated": 0, + "non-truncated": 1128, + "padded": 1128, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_law|5": { + "hashes": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "bcbdbbde22ec73e3", + "hash_cont_tokens": "90d5df417c4d3fd3" + }, + "truncated": 0, + "non-truncated": 6136, + "padded": 6136, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_medicine|5": { + "hashes": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "c54d753563114d45", + "hash_cont_tokens": "4a2d2988884f7f70" + }, + "truncated": 0, + "non-truncated": 1088, + "padded": 1088, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_psychology|5": { + "hashes": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "9e6e34f48034edc0", + "hash_cont_tokens": "e0a952cb8a9c81de" + }, + "truncated": 0, + "non-truncated": 2448, + "padded": 2448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-public_relations|5": { + "hashes": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "634feb3f97d1064d", + "hash_cont_tokens": "1fa77a8dff3922b8" + }, + "truncated": 0, + "non-truncated": 440, + "padded": 440, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-security_studies|5": { + "hashes": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "ca8497342e5b1d57", + "hash_cont_tokens": "81fc9cb3cbdd52db" + }, + "truncated": 0, + "non-truncated": 980, + "padded": 980, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-sociology|5": { + "hashes": { + "hash_examples": "e7959df87dea8672", + 
"hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "ae361375c940a0fb", + "hash_cont_tokens": "2a0493252ed2cf43" + }, + "truncated": 0, + "non-truncated": 804, + "padded": 800, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hashes": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "e8bdf33cf82d89f5", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-virology|5": { + "hashes": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "32ce831e0ba2d2e2", + "hash_cont_tokens": "5ab892d003b00c98" + }, + "truncated": 0, + "non-truncated": 664, + "padded": 664, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-world_religions|5": { + "hashes": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "4ed9b68c5694211b", + "hash_cont_tokens": "15a5e5dbdfbb8568" + }, + "truncated": 0, + "non-truncated": 684, + "padded": 684, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|truthfulqa:mc|0": { + "hashes": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "a30fbd9af05d717a", + "hash_cont_tokens": "5a8d4bb398b1c3c0" + }, + "truncated": 0, + "non-truncated": 9996, + "padded": 9996, + "non-padded": 0, + "effective_few_shots": 0.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "d84d18e9a963753d", + "hash_full_prompts": "12b540783521a8e6", + "hash_input_tokens": "3d86ffeb7677bd9d", + "hash_cont_tokens": "35527140510ee91a" + }, + "total_evaluation_time_secondes": "6157.816865444183", + "truncated": 0, + "non-truncated": 111019, + "padded": 110793, + "non-padded": 226, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/Undi95/Mistral-11B-TestBench11/results_2023-10-11T20-08-34.702863.json b/eval-results/Undi95/Mistral-11B-TestBench11/results_2023-10-11T20-08-34.702863.json new file mode 100644 index 0000000000000000000000000000000000000000..3f5d4a60e0d502c86382c8f0ec1fd8605eea0f1a --- /dev/null +++ b/eval-results/Undi95/Mistral-11B-TestBench11/results_2023-10-11T20-08-34.702863.json @@ -0,0 +1,1367 @@ +{ + "config_general": { + "model_name": "Undi95/Mistral-11B-TestBench11", + "model_sha": "9aae2b156b24557bb98e515f3a90c7865529d2e9", + "model_size": "20.74 GB", + "model_dtype": "torch.bfloat16", + "lighteval_sha": "0f318ecf002208468154899217b3ba7c6ae09374", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|arc:challenge|25": { + "acc": 0.6160409556313993, + "acc_stderr": 0.01421244498065189, + "acc_norm": 0.64419795221843, + "acc_norm_stderr": 0.01399057113791876 + }, + "harness|hellaswag|10": { + "acc": 0.6507667795259908, + "acc_stderr": 0.004757534850522272, + "acc_norm": 0.8392750448117905, + "acc_norm_stderr": 0.0036652645638577596 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.35, + "acc_stderr": 0.0479372485441102, + "acc_norm": 0.35, + "acc_norm_stderr": 0.0479372485441102 + }, + "harness|hendrycksTest-anatomy|5": { + 
"acc": 0.6370370370370371, + "acc_stderr": 0.04153948404742398, + "acc_norm": 0.6370370370370371, + "acc_norm_stderr": 0.04153948404742398 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.6644736842105263, + "acc_stderr": 0.038424985593952694, + "acc_norm": 0.6644736842105263, + "acc_norm_stderr": 0.038424985593952694 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.55, + "acc_stderr": 0.05, + "acc_norm": 0.55, + "acc_norm_stderr": 0.05 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.6830188679245283, + "acc_stderr": 0.028637235639800886, + "acc_norm": 0.6830188679245283, + "acc_norm_stderr": 0.028637235639800886 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.7361111111111112, + "acc_stderr": 0.03685651095897532, + "acc_norm": 0.7361111111111112, + "acc_norm_stderr": 0.03685651095897532 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.51, + "acc_stderr": 0.05024183937956912, + "acc_norm": 0.51, + "acc_norm_stderr": 0.05024183937956912 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.51, + "acc_stderr": 0.05024183937956911, + "acc_norm": 0.51, + "acc_norm_stderr": 0.05024183937956911 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.38, + "acc_stderr": 0.04878317312145633, + "acc_norm": 0.38, + "acc_norm_stderr": 0.04878317312145633 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.653179190751445, + "acc_stderr": 0.036291466701596636, + "acc_norm": 0.653179190751445, + "acc_norm_stderr": 0.036291466701596636 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.47058823529411764, + "acc_stderr": 0.04966570903978529, + "acc_norm": 0.47058823529411764, + "acc_norm_stderr": 0.04966570903978529 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.77, + "acc_stderr": 0.04229525846816506, + "acc_norm": 0.77, + "acc_norm_stderr": 0.04229525846816506 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.5276595744680851, + "acc_stderr": 0.03263597118409769, + "acc_norm": 0.5276595744680851, + "acc_norm_stderr": 0.03263597118409769 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.4649122807017544, + "acc_stderr": 0.04692008381368909, + "acc_norm": 0.4649122807017544, + "acc_norm_stderr": 0.04692008381368909 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.5517241379310345, + "acc_stderr": 0.04144311810878151, + "acc_norm": 0.5517241379310345, + "acc_norm_stderr": 0.04144311810878151 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.4126984126984127, + "acc_stderr": 0.025355741263055256, + "acc_norm": 0.4126984126984127, + "acc_norm_stderr": 0.025355741263055256 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.4523809523809524, + "acc_stderr": 0.044518079590553275, + "acc_norm": 0.4523809523809524, + "acc_norm_stderr": 0.044518079590553275 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.38, + "acc_stderr": 0.04878317312145633, + "acc_norm": 0.38, + "acc_norm_stderr": 0.04878317312145633 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.7741935483870968, + "acc_stderr": 0.023785577884181012, + "acc_norm": 0.7741935483870968, + "acc_norm_stderr": 0.023785577884181012 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.4827586206896552, + "acc_stderr": 0.035158955511656986, + "acc_norm": 0.4827586206896552, + "acc_norm_stderr": 0.035158955511656986 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.69, + 
"acc_stderr": 0.04648231987117316, + "acc_norm": 0.69, + "acc_norm_stderr": 0.04648231987117316 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.7515151515151515, + "acc_stderr": 0.033744026441394036, + "acc_norm": 0.7515151515151515, + "acc_norm_stderr": 0.033744026441394036 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.8080808080808081, + "acc_stderr": 0.028057791672989017, + "acc_norm": 0.8080808080808081, + "acc_norm_stderr": 0.028057791672989017 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.8911917098445595, + "acc_stderr": 0.022473253332768766, + "acc_norm": 0.8911917098445595, + "acc_norm_stderr": 0.022473253332768766 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.6871794871794872, + "acc_stderr": 0.023507579020645358, + "acc_norm": 0.6871794871794872, + "acc_norm_stderr": 0.023507579020645358 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.3333333333333333, + "acc_stderr": 0.028742040903948492, + "acc_norm": 0.3333333333333333, + "acc_norm_stderr": 0.028742040903948492 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.6848739495798319, + "acc_stderr": 0.030176808288974337, + "acc_norm": 0.6848739495798319, + "acc_norm_stderr": 0.030176808288974337 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.3443708609271523, + "acc_stderr": 0.038796870240733264, + "acc_norm": 0.3443708609271523, + "acc_norm_stderr": 0.038796870240733264 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.8293577981651377, + "acc_stderr": 0.016129271025099878, + "acc_norm": 0.8293577981651377, + "acc_norm_stderr": 0.016129271025099878 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.5787037037037037, + "acc_stderr": 0.03367462138896078, + "acc_norm": 0.5787037037037037, + "acc_norm_stderr": 0.03367462138896078 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.7941176470588235, + "acc_stderr": 0.028379449451588667, + "acc_norm": 0.7941176470588235, + "acc_norm_stderr": 0.028379449451588667 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.7890295358649789, + "acc_stderr": 0.02655837250266192, + "acc_norm": 0.7890295358649789, + "acc_norm_stderr": 0.02655837250266192 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.672645739910314, + "acc_stderr": 0.03149384670994131, + "acc_norm": 0.672645739910314, + "acc_norm_stderr": 0.03149384670994131 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.732824427480916, + "acc_stderr": 0.038808483010823944, + "acc_norm": 0.732824427480916, + "acc_norm_stderr": 0.038808483010823944 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.7851239669421488, + "acc_stderr": 0.037494924487096966, + "acc_norm": 0.7851239669421488, + "acc_norm_stderr": 0.037494924487096966 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.7870370370370371, + "acc_stderr": 0.0395783547198098, + "acc_norm": 0.7870370370370371, + "acc_norm_stderr": 0.0395783547198098 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.7484662576687117, + "acc_stderr": 0.03408997886857529, + "acc_norm": 0.7484662576687117, + "acc_norm_stderr": 0.03408997886857529 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.48214285714285715, + "acc_stderr": 0.047427623612430116, + "acc_norm": 0.48214285714285715, + "acc_norm_stderr": 0.047427623612430116 + }, + "harness|hendrycksTest-management|5": { + 
"acc": 0.7864077669902912, + "acc_stderr": 0.040580420156460344, + "acc_norm": 0.7864077669902912, + "acc_norm_stderr": 0.040580420156460344 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.8547008547008547, + "acc_stderr": 0.0230866350868414, + "acc_norm": 0.8547008547008547, + "acc_norm_stderr": 0.0230866350868414 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.71, + "acc_stderr": 0.045604802157206845, + "acc_norm": 0.71, + "acc_norm_stderr": 0.045604802157206845 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.80970625798212, + "acc_stderr": 0.014036945850381398, + "acc_norm": 0.80970625798212, + "acc_norm_stderr": 0.014036945850381398 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.6994219653179191, + "acc_stderr": 0.0246853168672578, + "acc_norm": 0.6994219653179191, + "acc_norm_stderr": 0.0246853168672578 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.40670391061452515, + "acc_stderr": 0.016428811915898865, + "acc_norm": 0.40670391061452515, + "acc_norm_stderr": 0.016428811915898865 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.7026143790849673, + "acc_stderr": 0.02617390850671858, + "acc_norm": 0.7026143790849673, + "acc_norm_stderr": 0.02617390850671858 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.6816720257234726, + "acc_stderr": 0.026457225067811025, + "acc_norm": 0.6816720257234726, + "acc_norm_stderr": 0.026457225067811025 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.6975308641975309, + "acc_stderr": 0.02555765398186806, + "acc_norm": 0.6975308641975309, + "acc_norm_stderr": 0.02555765398186806 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.46808510638297873, + "acc_stderr": 0.029766675075873866, + "acc_norm": 0.46808510638297873, + "acc_norm_stderr": 0.029766675075873866 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.44654498044328556, + "acc_stderr": 0.012697046024399684, + "acc_norm": 0.44654498044328556, + "acc_norm_stderr": 0.012697046024399684 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.6764705882352942, + "acc_stderr": 0.028418208619406755, + "acc_norm": 0.6764705882352942, + "acc_norm_stderr": 0.028418208619406755 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.6633986928104575, + "acc_stderr": 0.019117213911495144, + "acc_norm": 0.6633986928104575, + "acc_norm_stderr": 0.019117213911495144 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.6818181818181818, + "acc_stderr": 0.04461272175910508, + "acc_norm": 0.6818181818181818, + "acc_norm_stderr": 0.04461272175910508 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.726530612244898, + "acc_stderr": 0.028535560337128438, + "acc_norm": 0.726530612244898, + "acc_norm_stderr": 0.028535560337128438 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.8407960199004975, + "acc_stderr": 0.02587064676616913, + "acc_norm": 0.8407960199004975, + "acc_norm_stderr": 0.02587064676616913 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.82, + "acc_stderr": 0.03861229196653694, + "acc_norm": 0.82, + "acc_norm_stderr": 0.03861229196653694 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.5301204819277109, + "acc_stderr": 0.03885425420866767, + "acc_norm": 0.5301204819277109, + "acc_norm_stderr": 0.03885425420866767 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.8245614035087719, + "acc_stderr": 0.02917088550072767, + "acc_norm": 0.8245614035087719, + "acc_norm_stderr": 
0.02917088550072767 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.3990208078335373, + "mc1_stderr": 0.017142825728496767, + "mc2": 0.5667907484592799, + "mc2_stderr": 0.01555047138686305 + }, + "all": { + "acc": 0.6380444364756247, + "acc_stderr": 0.033260479160891865, + "acc_norm": 0.6417167290497405, + "acc_norm_stderr": 0.03323820553158004, + "mc1": 0.3990208078335373, + "mc1_stderr": 0.017142825728496767, + "mc2": 0.5667907484592799, + "mc2_stderr": 0.01555047138686305 + } + }, + "versions": { + "harness|arc:challenge|25": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "all": 0 + }, + "config_tasks": { + "harness|arc:challenge": "LM Harness task", + 
"harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + 
"harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task" + }, + "summary_tasks": { + "harness|arc:challenge|25": { + "hashes": { + "hash_examples": "17b0cae357c0259e", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "e43adcaa871b1364", + "hash_cont_tokens": "289aa98c400841d8" + }, + "truncated": 0, + "non-truncated": 4687, + "padded": 4684, + "non-padded": 3, + "effective_few_shots": 25.0, + "num_truncated_few_shots": 0 + }, + "harness|hellaswag|10": { + "hashes": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "08da6b3d0798f3e5", + "hash_cont_tokens": "ac460260c3e6efc9" + }, + "truncated": 0, + "non-truncated": 40168, + "padded": 40039, + "non-padded": 129, + "effective_few_shots": 10.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hashes": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "5e2b26eb9b4d08bf", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-anatomy|5": { + "hashes": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "d33cda9df28030eb", + "hash_cont_tokens": "a52a4f60d98cbe5c" + }, + "truncated": 0, + "non-truncated": 540, + "padded": 540, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-astronomy|5": { + "hashes": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "0dd50c500d64c57d", + "hash_cont_tokens": "10f7d8eeba97841d" + }, + "truncated": 0, + "non-truncated": 608, + "padded": 608, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-business_ethics|5": { + "hashes": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "40b524d0df3defc2", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hashes": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "1f87d12d677e0dfd", + "hash_cont_tokens": "edef9975ba9165b5" + }, + "truncated": 0, + "non-truncated": 1060, + "padded": 1056, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_biology|5": { + "hashes": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "dd6d69d8b13afbeb", + "hash_cont_tokens": "0aa103ec6602280b" + }, + "truncated": 0, + "non-truncated": 576, + "padded": 572, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_chemistry|5": { + "hashes": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "d45f3c401a00e97e", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + 
"effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_computer_science|5": { + "hashes": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "c04f21d954ae67b2", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_mathematics|5": { + "hashes": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "e7de03b4e1a407d8", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_medicine|5": { + "hashes": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "9ce9516475f0b09c", + "hash_cont_tokens": "1979021dbc698754" + }, + "truncated": 0, + "non-truncated": 692, + "padded": 684, + "non-padded": 8, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_physics|5": { + "hashes": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "f749592a0d6c967d", + "hash_cont_tokens": "7cf7fe2bab00acbd" + }, + "truncated": 0, + "non-truncated": 408, + "padded": 404, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-computer_security|5": { + "hashes": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "1a6dccf2066f3598", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hashes": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "6ce98c8aec8e7514", + "hash_cont_tokens": "903f64eed2b0d217" + }, + "truncated": 0, + "non-truncated": 940, + "padded": 940, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-econometrics|5": { + "hashes": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "7794b03bf6b9bb11", + "hash_cont_tokens": "721ae6c5302c4bf2" + }, + "truncated": 0, + "non-truncated": 456, + "padded": 456, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hashes": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "e47ff85e05850517", + "hash_cont_tokens": "15a738960ed3e587" + }, + "truncated": 0, + "non-truncated": 580, + "padded": 580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hashes": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "2ce6901704311790", + "hash_cont_tokens": "c96470462fc71683" + }, + "truncated": 0, + "non-truncated": 1512, + "padded": 1512, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-formal_logic|5": { + "hashes": { + 
"hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "fa49c3faa72a3955", + "hash_cont_tokens": "0e1ce025c9d6ee7e" + }, + "truncated": 0, + "non-truncated": 504, + "padded": 504, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-global_facts|5": { + "hashes": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "38992a391c7040d5", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 396, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_biology|5": { + "hashes": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "4944fad6e0578120", + "hash_cont_tokens": "e34d57f7d3c4ca16" + }, + "truncated": 0, + "non-truncated": 1240, + "padded": 1240, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hashes": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "bec955dfccee0331", + "hash_cont_tokens": "e8482d44df4b3740" + }, + "truncated": 0, + "non-truncated": 812, + "padded": 796, + "non-padded": 16, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hashes": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "2ccfe020e0a8e824", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hashes": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "5e5e8bf3808e0ead", + "hash_cont_tokens": "d63e679a49418339" + }, + "truncated": 0, + "non-truncated": 660, + "padded": 656, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_geography|5": { + "hashes": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "6a624d76e1b40f9d", + "hash_cont_tokens": "d78483e286d06f1a" + }, + "truncated": 0, + "non-truncated": 792, + "padded": 792, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hashes": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "8340aed0285230f4", + "hash_cont_tokens": "691cdff71ff5fe57" + }, + "truncated": 0, + "non-truncated": 772, + "padded": 772, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hashes": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "ca47137b1f3a769c", + "hash_cont_tokens": "d5ad4c5bdca967ad" + }, + "truncated": 0, + "non-truncated": 1560, + "padded": 1560, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hashes": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + 
"hash_input_tokens": "c9d341ab62890f30", + "hash_cont_tokens": "8f631ca5687dd0d4" + }, + "truncated": 0, + "non-truncated": 1080, + "padded": 1080, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hashes": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "62573d06618ae7df", + "hash_cont_tokens": "7321048a28451473" + }, + "truncated": 0, + "non-truncated": 952, + "padded": 952, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_physics|5": { + "hashes": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "ddddcaae96263221", + "hash_cont_tokens": "bb137581f269861c" + }, + "truncated": 0, + "non-truncated": 604, + "padded": 604, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hashes": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "ef9c1ae343139fdd", + "hash_cont_tokens": "b455cab2675bd863" + }, + "truncated": 0, + "non-truncated": 2180, + "padded": 2161, + "non-padded": 19, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hashes": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "eb4abd87b0e863cc", + "hash_cont_tokens": "1b3196fec7e58037" + }, + "truncated": 0, + "non-truncated": 864, + "padded": 864, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hashes": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "63548c7fa9ba7a78", + "hash_cont_tokens": "a331dedc2aa01b3e" + }, + "truncated": 0, + "non-truncated": 816, + "padded": 816, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hashes": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "83c5da18bfa50812", + "hash_cont_tokens": "d0fbe030b8c8c2bf" + }, + "truncated": 0, + "non-truncated": 948, + "padded": 948, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_aging|5": { + "hashes": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "c93c778cb8c58a32", + "hash_cont_tokens": "1dd29c3755494850" + }, + "truncated": 0, + "non-truncated": 892, + "padded": 892, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_sexuality|5": { + "hashes": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "1daed91f54b42f7d", + "hash_cont_tokens": "c85573f663c10691" + }, + "truncated": 0, + "non-truncated": 524, + "padded": 524, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-international_law|5": { + "hashes": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "cfdae69f75ee8670", + "hash_cont_tokens": "d263804ba918154f" + }, + "truncated": 0, + 
"non-truncated": 484, + "padded": 484, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-jurisprudence|5": { + "hashes": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "173979adbb5ab44e", + "hash_cont_tokens": "581986691a84ece8" + }, + "truncated": 0, + "non-truncated": 432, + "padded": 432, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hashes": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "7b7d06271aff55ff", + "hash_cont_tokens": "55a858b28bbda458" + }, + "truncated": 0, + "non-truncated": 652, + "padded": 652, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-machine_learning|5": { + "hashes": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "ca062cfd7c7fddcb", + "hash_cont_tokens": "e99d3d3efd4ac7a3" + }, + "truncated": 0, + "non-truncated": 448, + "padded": 445, + "non-padded": 3, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-management|5": { + "hashes": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "fc47171ffb714da3", + "hash_cont_tokens": "13d9dc56bca34726" + }, + "truncated": 0, + "non-truncated": 412, + "padded": 412, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-marketing|5": { + "hashes": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "aa29e9d883670c8f", + "hash_cont_tokens": "2700ea26933916a2" + }, + "truncated": 0, + "non-truncated": 936, + "padded": 936, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-medical_genetics|5": { + "hashes": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "88ad044b653ecaa5", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-miscellaneous|5": { + "hashes": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "f9e7e01573277484", + "hash_cont_tokens": "7bf4341c79587250" + }, + "truncated": 0, + "non-truncated": 3132, + "padded": 3132, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_disputes|5": { + "hashes": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "03728b9e48594c28", + "hash_cont_tokens": "38a48e9de6976f00" + }, + "truncated": 0, + "non-truncated": 1384, + "padded": 1360, + "non-padded": 24, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hashes": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "04a903966514d177", + "hash_cont_tokens": "761c4dc187689d89" + }, + "truncated": 0, + "non-truncated": 3580, + "padded": 3580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-nutrition|5": { + 
"hashes": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "a2176d3ac6f01cf0", + "hash_cont_tokens": "65005bd7d6f6012a" + }, + "truncated": 0, + "non-truncated": 1224, + "padded": 1224, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-philosophy|5": { + "hashes": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "a96dc872948245a8", + "hash_cont_tokens": "0b47934fb6314dec" + }, + "truncated": 0, + "non-truncated": 1244, + "padded": 1244, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-prehistory|5": { + "hashes": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "e0b03637947e9efa", + "hash_cont_tokens": "3f20acd855ee0a29" + }, + "truncated": 0, + "non-truncated": 1296, + "padded": 1296, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_accounting|5": { + "hashes": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "0b4c6d0e49c47ab4", + "hash_cont_tokens": "8f122ba881355d4b" + }, + "truncated": 0, + "non-truncated": 1128, + "padded": 1128, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_law|5": { + "hashes": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "bcbdbbde22ec73e3", + "hash_cont_tokens": "90d5df417c4d3fd3" + }, + "truncated": 0, + "non-truncated": 6136, + "padded": 6136, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_medicine|5": { + "hashes": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "c54d753563114d45", + "hash_cont_tokens": "4a2d2988884f7f70" + }, + "truncated": 0, + "non-truncated": 1088, + "padded": 1088, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_psychology|5": { + "hashes": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "9e6e34f48034edc0", + "hash_cont_tokens": "e0a952cb8a9c81de" + }, + "truncated": 0, + "non-truncated": 2448, + "padded": 2448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-public_relations|5": { + "hashes": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "634feb3f97d1064d", + "hash_cont_tokens": "1fa77a8dff3922b8" + }, + "truncated": 0, + "non-truncated": 440, + "padded": 440, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-security_studies|5": { + "hashes": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "ca8497342e5b1d57", + "hash_cont_tokens": "81fc9cb3cbdd52db" + }, + "truncated": 0, + "non-truncated": 980, + "padded": 980, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-sociology|5": { + "hashes": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "ae361375c940a0fb", + 
"hash_cont_tokens": "2a0493252ed2cf43" + }, + "truncated": 0, + "non-truncated": 804, + "padded": 800, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hashes": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "e8bdf33cf82d89f5", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-virology|5": { + "hashes": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "32ce831e0ba2d2e2", + "hash_cont_tokens": "5ab892d003b00c98" + }, + "truncated": 0, + "non-truncated": 664, + "padded": 664, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-world_religions|5": { + "hashes": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "4ed9b68c5694211b", + "hash_cont_tokens": "15a5e5dbdfbb8568" + }, + "truncated": 0, + "non-truncated": 684, + "padded": 684, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|truthfulqa:mc|0": { + "hashes": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "a30fbd9af05d717a", + "hash_cont_tokens": "5a8d4bb398b1c3c0" + }, + "truncated": 0, + "non-truncated": 9996, + "padded": 9996, + "non-padded": 0, + "effective_few_shots": 0.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "d84d18e9a963753d", + "hash_full_prompts": "12b540783521a8e6", + "hash_input_tokens": "3d86ffeb7677bd9d", + "hash_cont_tokens": "35527140510ee91a" + }, + "total_evaluation_time_secondes": "6149.170400619507", + "truncated": 0, + "non-truncated": 111019, + "padded": 110793, + "non-padded": 226, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/Undi95/Mistral-11B-TestBench11/results_2023-10-28T01-59-23.177639.json b/eval-results/Undi95/Mistral-11B-TestBench11/results_2023-10-28T01-59-23.177639.json new file mode 100644 index 0000000000000000000000000000000000000000..c786a2f3382d4fac87c8e29ad211ee3ae424b8e0 --- /dev/null +++ b/eval-results/Undi95/Mistral-11B-TestBench11/results_2023-10-28T01-59-23.177639.json @@ -0,0 +1,107 @@ +{ + "config_general": { + "model_name": "Undi95/Mistral-11B-TestBench11", + "model_sha": "2c95dc683b5256749b0cad984f59d0af041822a6", + "model_size": "20.74 GB", + "model_dtype": "torch.bfloat16", + "lighteval_sha": "0f318ecf002208468154899217b3ba7c6ae09374", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|drop|3": { + "em": 0.02904781879194631, + "em_stderr": 0.0017198688690203193, + "f1": 0.09573615771812093, + "f1_stderr": 0.0021674728464020697 + }, + "harness|gsm8k|5": { + "acc": 0.14935557240333586, + "acc_stderr": 0.00981809072372729 + }, + "harness|winogrande|5": { + "acc": 0.7774269928966061, + "acc_stderr": 0.011690933809712667 + }, + "all": { + "em": 0.02904781879194631, + "em_stderr": 0.0017198688690203193, + "f1": 0.09573615771812093, + "f1_stderr": 0.0021674728464020697, + "acc": 0.463391282649971, + "acc_stderr": 0.010754512266719978 + } + }, + "versions": { + "harness|drop|3": 1, + "harness|gsm8k|5": 0, + 
"harness|winogrande|5": 0, + "all": 0 + }, + "config_tasks": { + "harness|drop": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|drop|3": { + "hashes": { + "hash_examples": "1d27416e8324e9a3", + "hash_full_prompts": "a5513ff9a741b385", + "hash_input_tokens": "a4fb946366902edf", + "hash_cont_tokens": "d7ba2a20d0ecc3d2" + }, + "truncated": 0, + "non-truncated": 9536, + "padded": 0, + "non-padded": 9536, + "effective_few_shots": 3.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "6af0ae8cfe684f50", + "hash_cont_tokens": "359572aca9338c1f" + }, + "truncated": 0, + "non-truncated": 1319, + "padded": 0, + "non-padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "6bf335f26fed6442", + "hash_cont_tokens": "618558fb93c0f288" + }, + "truncated": 0, + "non-truncated": 2534, + "padded": 2397, + "non-padded": 137, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "9b4d8993161e637d", + "hash_full_prompts": "08215e527b7e60a5", + "hash_input_tokens": "7c6dc027ca91c1a8", + "hash_cont_tokens": "a046a5e185f8240b" + }, + "total_evaluation_time_secondes": "14685.71157336235", + "truncated": 0, + "non-truncated": 13389, + "padded": 2397, + "non-padded": 10992, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/Undi95/Mistral-11B-TestBench3/results_2023-10-11T03-17-36.482892.json b/eval-results/Undi95/Mistral-11B-TestBench3/results_2023-10-11T03-17-36.482892.json new file mode 100644 index 0000000000000000000000000000000000000000..7a19bfdb30428d527ed729e2c8863fc3cd6c2fd9 --- /dev/null +++ b/eval-results/Undi95/Mistral-11B-TestBench3/results_2023-10-11T03-17-36.482892.json @@ -0,0 +1,1367 @@ +{ + "config_general": { + "model_name": "Undi95/Mistral-11B-TestBench3", + "model_sha": "7eb397ad2ec67400e31dc010f9b364a72d64d965", + "model_size": "20.74 GB", + "model_dtype": "torch.float16", + "lighteval_sha": "0f318ecf002208468154899217b3ba7c6ae09374", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|arc:challenge|25": { + "acc": 0.5784982935153583, + "acc_stderr": 0.014430197069326021, + "acc_norm": 0.6203071672354948, + "acc_norm_stderr": 0.014182119866974872 + }, + "harness|hellaswag|10": { + "acc": 0.6485759808803028, + "acc_stderr": 0.004764393985111036, + "acc_norm": 0.8391754630551683, + "acc_norm_stderr": 0.0036661823284423424 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.32, + "acc_stderr": 0.046882617226215034, + "acc_norm": 0.32, + "acc_norm_stderr": 0.046882617226215034 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.6148148148148148, + "acc_stderr": 0.04203921040156279, + "acc_norm": 0.6148148148148148, + "acc_norm_stderr": 0.04203921040156279 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.6513157894736842, + "acc_stderr": 0.038781398887976104, + "acc_norm": 0.6513157894736842, + "acc_norm_stderr": 0.038781398887976104 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.58, + "acc_stderr": 0.049604496374885836, + "acc_norm": 0.58, + "acc_norm_stderr": 
0.049604496374885836 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.6377358490566037, + "acc_stderr": 0.0295822451283843, + "acc_norm": 0.6377358490566037, + "acc_norm_stderr": 0.0295822451283843 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.7222222222222222, + "acc_stderr": 0.03745554791462456, + "acc_norm": 0.7222222222222222, + "acc_norm_stderr": 0.03745554791462456 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.49, + "acc_stderr": 0.05024183937956912, + "acc_norm": 0.49, + "acc_norm_stderr": 0.05024183937956912 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.54, + "acc_stderr": 0.05009082659620332, + "acc_norm": 0.54, + "acc_norm_stderr": 0.05009082659620332 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.42, + "acc_stderr": 0.04960449637488584, + "acc_norm": 0.42, + "acc_norm_stderr": 0.04960449637488584 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.6184971098265896, + "acc_stderr": 0.03703851193099521, + "acc_norm": 0.6184971098265896, + "acc_norm_stderr": 0.03703851193099521 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.38235294117647056, + "acc_stderr": 0.04835503696107223, + "acc_norm": 0.38235294117647056, + "acc_norm_stderr": 0.04835503696107223 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.75, + "acc_stderr": 0.04351941398892446, + "acc_norm": 0.75, + "acc_norm_stderr": 0.04351941398892446 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.5702127659574469, + "acc_stderr": 0.03236214467715564, + "acc_norm": 0.5702127659574469, + "acc_norm_stderr": 0.03236214467715564 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.5, + "acc_stderr": 0.047036043419179864, + "acc_norm": 0.5, + "acc_norm_stderr": 0.047036043419179864 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.5655172413793104, + "acc_stderr": 0.04130740879555497, + "acc_norm": 0.5655172413793104, + "acc_norm_stderr": 0.04130740879555497 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.373015873015873, + "acc_stderr": 0.02490699045899257, + "acc_norm": 0.373015873015873, + "acc_norm_stderr": 0.02490699045899257 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.42063492063492064, + "acc_stderr": 0.04415438226743744, + "acc_norm": 0.42063492063492064, + "acc_norm_stderr": 0.04415438226743744 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.38, + "acc_stderr": 0.04878317312145633, + "acc_norm": 0.38, + "acc_norm_stderr": 0.04878317312145633 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.7483870967741936, + "acc_stderr": 0.024685979286239963, + "acc_norm": 0.7483870967741936, + "acc_norm_stderr": 0.024685979286239963 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.49261083743842365, + "acc_stderr": 0.035176035403610084, + "acc_norm": 0.49261083743842365, + "acc_norm_stderr": 0.035176035403610084 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.67, + "acc_stderr": 0.047258156262526066, + "acc_norm": 0.67, + "acc_norm_stderr": 0.047258156262526066 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.7878787878787878, + "acc_stderr": 0.03192271569548301, + "acc_norm": 0.7878787878787878, + "acc_norm_stderr": 0.03192271569548301 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.7929292929292929, + "acc_stderr": 0.02886977846026705, + "acc_norm": 0.7929292929292929, + "acc_norm_stderr": 
0.02886977846026705 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.8601036269430051, + "acc_stderr": 0.025033870583015184, + "acc_norm": 0.8601036269430051, + "acc_norm_stderr": 0.025033870583015184 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.6743589743589744, + "acc_stderr": 0.02375966576741229, + "acc_norm": 0.6743589743589744, + "acc_norm_stderr": 0.02375966576741229 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.32222222222222224, + "acc_stderr": 0.028493465091028593, + "acc_norm": 0.32222222222222224, + "acc_norm_stderr": 0.028493465091028593 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.6638655462184874, + "acc_stderr": 0.030684737115135356, + "acc_norm": 0.6638655462184874, + "acc_norm_stderr": 0.030684737115135356 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.31125827814569534, + "acc_stderr": 0.03780445850526732, + "acc_norm": 0.31125827814569534, + "acc_norm_stderr": 0.03780445850526732 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.8275229357798165, + "acc_stderr": 0.01619780795684805, + "acc_norm": 0.8275229357798165, + "acc_norm_stderr": 0.01619780795684805 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.5, + "acc_stderr": 0.034099716973523674, + "acc_norm": 0.5, + "acc_norm_stderr": 0.034099716973523674 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.7843137254901961, + "acc_stderr": 0.028867431449849313, + "acc_norm": 0.7843137254901961, + "acc_norm_stderr": 0.028867431449849313 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.7426160337552743, + "acc_stderr": 0.028458820991460305, + "acc_norm": 0.7426160337552743, + "acc_norm_stderr": 0.028458820991460305 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.6771300448430493, + "acc_stderr": 0.031381476375754995, + "acc_norm": 0.6771300448430493, + "acc_norm_stderr": 0.031381476375754995 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.7786259541984732, + "acc_stderr": 0.0364129708131373, + "acc_norm": 0.7786259541984732, + "acc_norm_stderr": 0.0364129708131373 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.7933884297520661, + "acc_stderr": 0.03695980128098823, + "acc_norm": 0.7933884297520661, + "acc_norm_stderr": 0.03695980128098823 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.7407407407407407, + "acc_stderr": 0.04236511258094633, + "acc_norm": 0.7407407407407407, + "acc_norm_stderr": 0.04236511258094633 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.7730061349693251, + "acc_stderr": 0.03291099578615769, + "acc_norm": 0.7730061349693251, + "acc_norm_stderr": 0.03291099578615769 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.5, + "acc_stderr": 0.04745789978762494, + "acc_norm": 0.5, + "acc_norm_stderr": 0.04745789978762494 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.7864077669902912, + "acc_stderr": 0.040580420156460344, + "acc_norm": 0.7864077669902912, + "acc_norm_stderr": 0.040580420156460344 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.8675213675213675, + "acc_stderr": 0.02220930907316562, + "acc_norm": 0.8675213675213675, + "acc_norm_stderr": 0.02220930907316562 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.7, + "acc_stderr": 0.046056618647183814, + "acc_norm": 0.7, + "acc_norm_stderr": 0.046056618647183814 + }, + "harness|hendrycksTest-miscellaneous|5": { + 
"acc": 0.8007662835249042, + "acc_stderr": 0.014283378044296418, + "acc_norm": 0.8007662835249042, + "acc_norm_stderr": 0.014283378044296418 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.6734104046242775, + "acc_stderr": 0.02524826477424284, + "acc_norm": 0.6734104046242775, + "acc_norm_stderr": 0.02524826477424284 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.33519553072625696, + "acc_stderr": 0.015788007190185884, + "acc_norm": 0.33519553072625696, + "acc_norm_stderr": 0.015788007190185884 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.7418300653594772, + "acc_stderr": 0.02505850331695815, + "acc_norm": 0.7418300653594772, + "acc_norm_stderr": 0.02505850331695815 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.7041800643086816, + "acc_stderr": 0.02592237178881877, + "acc_norm": 0.7041800643086816, + "acc_norm_stderr": 0.02592237178881877 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.7160493827160493, + "acc_stderr": 0.025089478523765134, + "acc_norm": 0.7160493827160493, + "acc_norm_stderr": 0.025089478523765134 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.46808510638297873, + "acc_stderr": 0.029766675075873866, + "acc_norm": 0.46808510638297873, + "acc_norm_stderr": 0.029766675075873866 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.44654498044328556, + "acc_stderr": 0.012697046024399673, + "acc_norm": 0.44654498044328556, + "acc_norm_stderr": 0.012697046024399673 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.6654411764705882, + "acc_stderr": 0.028661996202335307, + "acc_norm": 0.6654411764705882, + "acc_norm_stderr": 0.028661996202335307 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.6454248366013072, + "acc_stderr": 0.019353360547553704, + "acc_norm": 0.6454248366013072, + "acc_norm_stderr": 0.019353360547553704 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.6454545454545455, + "acc_stderr": 0.045820048415054174, + "acc_norm": 0.6454545454545455, + "acc_norm_stderr": 0.045820048415054174 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.7428571428571429, + "acc_stderr": 0.02797982353874455, + "acc_norm": 0.7428571428571429, + "acc_norm_stderr": 0.02797982353874455 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.8407960199004975, + "acc_stderr": 0.02587064676616913, + "acc_norm": 0.8407960199004975, + "acc_norm_stderr": 0.02587064676616913 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.85, + "acc_stderr": 0.03588702812826371, + "acc_norm": 0.85, + "acc_norm_stderr": 0.03588702812826371 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.5301204819277109, + "acc_stderr": 0.03885425420866767, + "acc_norm": 0.5301204819277109, + "acc_norm_stderr": 0.03885425420866767 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.8362573099415205, + "acc_stderr": 0.028380919596145866, + "acc_norm": 0.8362573099415205, + "acc_norm_stderr": 0.028380919596145866 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.37209302325581395, + "mc1_stderr": 0.016921090118814035, + "mc2": 0.5365517381581612, + "mc2_stderr": 0.01561816357163061 + }, + "all": { + "acc": 0.6305202530532628, + "acc_stderr": 0.03327541391769616, + "acc_norm": 0.6344593777294493, + "acc_norm_stderr": 0.03325259546245853, + "mc1": 0.37209302325581395, + "mc1_stderr": 0.016921090118814035, + "mc2": 0.5365517381581612, + "mc2_stderr": 0.01561816357163061 + } + }, + "versions": { + "harness|arc:challenge|25": 0, + 
"harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "all": 0 + }, + "config_tasks": { + "harness|arc:challenge": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + 
"harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task" + }, + "summary_tasks": { + "harness|arc:challenge|25": { + "hashes": { + "hash_examples": "17b0cae357c0259e", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "e43adcaa871b1364", + "hash_cont_tokens": "289aa98c400841d8" + }, + "truncated": 0, + "non-truncated": 4687, + "padded": 4684, + "non-padded": 3, + "effective_few_shots": 
25.0, + "num_truncated_few_shots": 0 + }, + "harness|hellaswag|10": { + "hashes": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "08da6b3d0798f3e5", + "hash_cont_tokens": "ac460260c3e6efc9" + }, + "truncated": 0, + "non-truncated": 40168, + "padded": 40039, + "non-padded": 129, + "effective_few_shots": 10.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hashes": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "5e2b26eb9b4d08bf", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-anatomy|5": { + "hashes": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "d33cda9df28030eb", + "hash_cont_tokens": "a52a4f60d98cbe5c" + }, + "truncated": 0, + "non-truncated": 540, + "padded": 540, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-astronomy|5": { + "hashes": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "0dd50c500d64c57d", + "hash_cont_tokens": "10f7d8eeba97841d" + }, + "truncated": 0, + "non-truncated": 608, + "padded": 608, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-business_ethics|5": { + "hashes": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "40b524d0df3defc2", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hashes": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "1f87d12d677e0dfd", + "hash_cont_tokens": "edef9975ba9165b5" + }, + "truncated": 0, + "non-truncated": 1060, + "padded": 1056, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_biology|5": { + "hashes": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "dd6d69d8b13afbeb", + "hash_cont_tokens": "0aa103ec6602280b" + }, + "truncated": 0, + "non-truncated": 576, + "padded": 572, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_chemistry|5": { + "hashes": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "d45f3c401a00e97e", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_computer_science|5": { + "hashes": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "c04f21d954ae67b2", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_mathematics|5": { + "hashes": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": 
"770bc4281c973190", + "hash_input_tokens": "e7de03b4e1a407d8", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_medicine|5": { + "hashes": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "9ce9516475f0b09c", + "hash_cont_tokens": "1979021dbc698754" + }, + "truncated": 0, + "non-truncated": 692, + "padded": 684, + "non-padded": 8, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_physics|5": { + "hashes": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "f749592a0d6c967d", + "hash_cont_tokens": "7cf7fe2bab00acbd" + }, + "truncated": 0, + "non-truncated": 408, + "padded": 404, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-computer_security|5": { + "hashes": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "1a6dccf2066f3598", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hashes": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "6ce98c8aec8e7514", + "hash_cont_tokens": "903f64eed2b0d217" + }, + "truncated": 0, + "non-truncated": 940, + "padded": 940, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-econometrics|5": { + "hashes": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "7794b03bf6b9bb11", + "hash_cont_tokens": "721ae6c5302c4bf2" + }, + "truncated": 0, + "non-truncated": 456, + "padded": 456, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hashes": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "e47ff85e05850517", + "hash_cont_tokens": "15a738960ed3e587" + }, + "truncated": 0, + "non-truncated": 580, + "padded": 580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hashes": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "2ce6901704311790", + "hash_cont_tokens": "c96470462fc71683" + }, + "truncated": 0, + "non-truncated": 1512, + "padded": 1512, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-formal_logic|5": { + "hashes": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "fa49c3faa72a3955", + "hash_cont_tokens": "0e1ce025c9d6ee7e" + }, + "truncated": 0, + "non-truncated": 504, + "padded": 504, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-global_facts|5": { + "hashes": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "38992a391c7040d5", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non-truncated": 400, + 
"padded": 396, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_biology|5": { + "hashes": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "4944fad6e0578120", + "hash_cont_tokens": "e34d57f7d3c4ca16" + }, + "truncated": 0, + "non-truncated": 1240, + "padded": 1240, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hashes": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "bec955dfccee0331", + "hash_cont_tokens": "e8482d44df4b3740" + }, + "truncated": 0, + "non-truncated": 812, + "padded": 796, + "non-padded": 16, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hashes": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "2ccfe020e0a8e824", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hashes": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "5e5e8bf3808e0ead", + "hash_cont_tokens": "d63e679a49418339" + }, + "truncated": 0, + "non-truncated": 660, + "padded": 656, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_geography|5": { + "hashes": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "6a624d76e1b40f9d", + "hash_cont_tokens": "d78483e286d06f1a" + }, + "truncated": 0, + "non-truncated": 792, + "padded": 792, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hashes": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "8340aed0285230f4", + "hash_cont_tokens": "691cdff71ff5fe57" + }, + "truncated": 0, + "non-truncated": 772, + "padded": 772, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hashes": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "ca47137b1f3a769c", + "hash_cont_tokens": "d5ad4c5bdca967ad" + }, + "truncated": 0, + "non-truncated": 1560, + "padded": 1560, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hashes": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "c9d341ab62890f30", + "hash_cont_tokens": "8f631ca5687dd0d4" + }, + "truncated": 0, + "non-truncated": 1080, + "padded": 1080, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hashes": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "62573d06618ae7df", + "hash_cont_tokens": "7321048a28451473" + }, + "truncated": 0, + "non-truncated": 952, + "padded": 952, + "non-padded": 0, + "effective_few_shots": 5.0, + 
"num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_physics|5": { + "hashes": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "ddddcaae96263221", + "hash_cont_tokens": "bb137581f269861c" + }, + "truncated": 0, + "non-truncated": 604, + "padded": 604, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hashes": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "ef9c1ae343139fdd", + "hash_cont_tokens": "b455cab2675bd863" + }, + "truncated": 0, + "non-truncated": 2180, + "padded": 2161, + "non-padded": 19, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hashes": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "eb4abd87b0e863cc", + "hash_cont_tokens": "1b3196fec7e58037" + }, + "truncated": 0, + "non-truncated": 864, + "padded": 864, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hashes": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "63548c7fa9ba7a78", + "hash_cont_tokens": "a331dedc2aa01b3e" + }, + "truncated": 0, + "non-truncated": 816, + "padded": 816, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hashes": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "83c5da18bfa50812", + "hash_cont_tokens": "d0fbe030b8c8c2bf" + }, + "truncated": 0, + "non-truncated": 948, + "padded": 948, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_aging|5": { + "hashes": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "c93c778cb8c58a32", + "hash_cont_tokens": "1dd29c3755494850" + }, + "truncated": 0, + "non-truncated": 892, + "padded": 892, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_sexuality|5": { + "hashes": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "1daed91f54b42f7d", + "hash_cont_tokens": "c85573f663c10691" + }, + "truncated": 0, + "non-truncated": 524, + "padded": 524, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-international_law|5": { + "hashes": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "cfdae69f75ee8670", + "hash_cont_tokens": "d263804ba918154f" + }, + "truncated": 0, + "non-truncated": 484, + "padded": 484, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-jurisprudence|5": { + "hashes": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "173979adbb5ab44e", + "hash_cont_tokens": "581986691a84ece8" + }, + "truncated": 0, + "non-truncated": 432, + "padded": 432, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hashes": { + "hash_examples": "709128f9926a634c", 
+ "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "7b7d06271aff55ff", + "hash_cont_tokens": "55a858b28bbda458" + }, + "truncated": 0, + "non-truncated": 652, + "padded": 652, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-machine_learning|5": { + "hashes": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "ca062cfd7c7fddcb", + "hash_cont_tokens": "e99d3d3efd4ac7a3" + }, + "truncated": 0, + "non-truncated": 448, + "padded": 445, + "non-padded": 3, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-management|5": { + "hashes": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "fc47171ffb714da3", + "hash_cont_tokens": "13d9dc56bca34726" + }, + "truncated": 0, + "non-truncated": 412, + "padded": 412, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-marketing|5": { + "hashes": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "aa29e9d883670c8f", + "hash_cont_tokens": "2700ea26933916a2" + }, + "truncated": 0, + "non-truncated": 936, + "padded": 936, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-medical_genetics|5": { + "hashes": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "88ad044b653ecaa5", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-miscellaneous|5": { + "hashes": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "f9e7e01573277484", + "hash_cont_tokens": "7bf4341c79587250" + }, + "truncated": 0, + "non-truncated": 3132, + "padded": 3132, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_disputes|5": { + "hashes": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "03728b9e48594c28", + "hash_cont_tokens": "38a48e9de6976f00" + }, + "truncated": 0, + "non-truncated": 1384, + "padded": 1360, + "non-padded": 24, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hashes": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "04a903966514d177", + "hash_cont_tokens": "761c4dc187689d89" + }, + "truncated": 0, + "non-truncated": 3580, + "padded": 3580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-nutrition|5": { + "hashes": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "a2176d3ac6f01cf0", + "hash_cont_tokens": "65005bd7d6f6012a" + }, + "truncated": 0, + "non-truncated": 1224, + "padded": 1224, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-philosophy|5": { + "hashes": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "a96dc872948245a8", + "hash_cont_tokens": "0b47934fb6314dec" + }, + "truncated": 0, + "non-truncated": 1244, + 
"padded": 1244, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-prehistory|5": { + "hashes": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "e0b03637947e9efa", + "hash_cont_tokens": "3f20acd855ee0a29" + }, + "truncated": 0, + "non-truncated": 1296, + "padded": 1296, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_accounting|5": { + "hashes": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "0b4c6d0e49c47ab4", + "hash_cont_tokens": "8f122ba881355d4b" + }, + "truncated": 0, + "non-truncated": 1128, + "padded": 1128, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_law|5": { + "hashes": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "bcbdbbde22ec73e3", + "hash_cont_tokens": "90d5df417c4d3fd3" + }, + "truncated": 0, + "non-truncated": 6136, + "padded": 6136, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_medicine|5": { + "hashes": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "c54d753563114d45", + "hash_cont_tokens": "4a2d2988884f7f70" + }, + "truncated": 0, + "non-truncated": 1088, + "padded": 1088, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_psychology|5": { + "hashes": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "9e6e34f48034edc0", + "hash_cont_tokens": "e0a952cb8a9c81de" + }, + "truncated": 0, + "non-truncated": 2448, + "padded": 2448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-public_relations|5": { + "hashes": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "634feb3f97d1064d", + "hash_cont_tokens": "1fa77a8dff3922b8" + }, + "truncated": 0, + "non-truncated": 440, + "padded": 440, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-security_studies|5": { + "hashes": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "ca8497342e5b1d57", + "hash_cont_tokens": "81fc9cb3cbdd52db" + }, + "truncated": 0, + "non-truncated": 980, + "padded": 980, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-sociology|5": { + "hashes": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "ae361375c940a0fb", + "hash_cont_tokens": "2a0493252ed2cf43" + }, + "truncated": 0, + "non-truncated": 804, + "padded": 800, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hashes": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "e8bdf33cf82d89f5", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-virology|5": { + 
"hashes": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "32ce831e0ba2d2e2", + "hash_cont_tokens": "5ab892d003b00c98" + }, + "truncated": 0, + "non-truncated": 664, + "padded": 664, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-world_religions|5": { + "hashes": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "4ed9b68c5694211b", + "hash_cont_tokens": "15a5e5dbdfbb8568" + }, + "truncated": 0, + "non-truncated": 684, + "padded": 684, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|truthfulqa:mc|0": { + "hashes": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "a30fbd9af05d717a", + "hash_cont_tokens": "5a8d4bb398b1c3c0" + }, + "truncated": 0, + "non-truncated": 9996, + "padded": 9996, + "non-padded": 0, + "effective_few_shots": 0.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "d84d18e9a963753d", + "hash_full_prompts": "12b540783521a8e6", + "hash_input_tokens": "3d86ffeb7677bd9d", + "hash_cont_tokens": "35527140510ee91a" + }, + "total_evaluation_time_secondes": "5738.2057547569275", + "truncated": 0, + "non-truncated": 111019, + "padded": 110793, + "non-padded": 226, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/Undi95/Mistral-11B-TestBench7/results_2023-10-11T16-09-31.642289.json b/eval-results/Undi95/Mistral-11B-TestBench7/results_2023-10-11T16-09-31.642289.json new file mode 100644 index 0000000000000000000000000000000000000000..09d895a74e0f650bf40f4f04949f9dbfc7f76f74 --- /dev/null +++ b/eval-results/Undi95/Mistral-11B-TestBench7/results_2023-10-11T16-09-31.642289.json @@ -0,0 +1,1367 @@ +{ + "config_general": { + "model_name": "Undi95/Mistral-11B-TestBench7", + "model_sha": "3d4d99f90ec582e0d532e11f6da419d6b962c536", + "model_size": "20.74 GB", + "model_dtype": "torch.bfloat16", + "lighteval_sha": "0f318ecf002208468154899217b3ba7c6ae09374", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|arc:challenge|25": { + "acc": 0.590443686006826, + "acc_stderr": 0.014370358632472432, + "acc_norm": 0.6331058020477816, + "acc_norm_stderr": 0.014084133118104298 + }, + "harness|hellaswag|10": { + "acc": 0.63433578968333, + "acc_stderr": 0.004806316342709402, + "acc_norm": 0.8286197968532165, + "acc_norm_stderr": 0.0037607069750393053 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.33, + "acc_stderr": 0.04725815626252605, + "acc_norm": 0.33, + "acc_norm_stderr": 0.04725815626252605 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.6444444444444445, + "acc_stderr": 0.04135176749720385, + "acc_norm": 0.6444444444444445, + "acc_norm_stderr": 0.04135176749720385 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.6710526315789473, + "acc_stderr": 0.038234289699266046, + "acc_norm": 0.6710526315789473, + "acc_norm_stderr": 0.038234289699266046 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.58, + "acc_stderr": 0.049604496374885836, + "acc_norm": 0.58, + "acc_norm_stderr": 0.049604496374885836 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.6943396226415094, + "acc_stderr": 0.028353298073322663, + "acc_norm": 0.6943396226415094, + "acc_norm_stderr": 0.028353298073322663 
+ }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.7083333333333334, + "acc_stderr": 0.038009680605548594, + "acc_norm": 0.7083333333333334, + "acc_norm_stderr": 0.038009680605548594 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.47, + "acc_stderr": 0.05016135580465919, + "acc_norm": 0.47, + "acc_norm_stderr": 0.05016135580465919 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.54, + "acc_stderr": 0.05009082659620333, + "acc_norm": 0.54, + "acc_norm_stderr": 0.05009082659620333 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.38, + "acc_stderr": 0.04878317312145633, + "acc_norm": 0.38, + "acc_norm_stderr": 0.04878317312145633 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.6647398843930635, + "acc_stderr": 0.03599586301247077, + "acc_norm": 0.6647398843930635, + "acc_norm_stderr": 0.03599586301247077 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.4411764705882353, + "acc_stderr": 0.049406356306056595, + "acc_norm": 0.4411764705882353, + "acc_norm_stderr": 0.049406356306056595 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.79, + "acc_stderr": 0.04093601807403326, + "acc_norm": 0.79, + "acc_norm_stderr": 0.04093601807403326 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.5361702127659574, + "acc_stderr": 0.032600385118357715, + "acc_norm": 0.5361702127659574, + "acc_norm_stderr": 0.032600385118357715 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.49122807017543857, + "acc_stderr": 0.04702880432049615, + "acc_norm": 0.49122807017543857, + "acc_norm_stderr": 0.04702880432049615 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.5379310344827586, + "acc_stderr": 0.04154659671707548, + "acc_norm": 0.5379310344827586, + "acc_norm_stderr": 0.04154659671707548 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.41534391534391535, + "acc_stderr": 0.025379524910778405, + "acc_norm": 0.41534391534391535, + "acc_norm_stderr": 0.025379524910778405 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.42063492063492064, + "acc_stderr": 0.04415438226743744, + "acc_norm": 0.42063492063492064, + "acc_norm_stderr": 0.04415438226743744 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.39, + "acc_stderr": 0.04902071300001975, + "acc_norm": 0.39, + "acc_norm_stderr": 0.04902071300001975 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.7806451612903226, + "acc_stderr": 0.023540799358723295, + "acc_norm": 0.7806451612903226, + "acc_norm_stderr": 0.023540799358723295 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.4975369458128079, + "acc_stderr": 0.03517945038691063, + "acc_norm": 0.4975369458128079, + "acc_norm_stderr": 0.03517945038691063 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.7, + "acc_stderr": 0.046056618647183814, + "acc_norm": 0.7, + "acc_norm_stderr": 0.046056618647183814 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.7696969696969697, + "acc_stderr": 0.0328766675860349, + "acc_norm": 0.7696969696969697, + "acc_norm_stderr": 0.0328766675860349 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.803030303030303, + "acc_stderr": 0.02833560973246336, + "acc_norm": 0.803030303030303, + "acc_norm_stderr": 0.02833560973246336 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.8808290155440415, + "acc_stderr": 0.023381935348121434, + "acc_norm": 
0.8808290155440415, + "acc_norm_stderr": 0.023381935348121434 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.6846153846153846, + "acc_stderr": 0.023559646983189946, + "acc_norm": 0.6846153846153846, + "acc_norm_stderr": 0.023559646983189946 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.3333333333333333, + "acc_stderr": 0.028742040903948496, + "acc_norm": 0.3333333333333333, + "acc_norm_stderr": 0.028742040903948496 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.6596638655462185, + "acc_stderr": 0.030778057422931673, + "acc_norm": 0.6596638655462185, + "acc_norm_stderr": 0.030778057422931673 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.3708609271523179, + "acc_stderr": 0.03943966699183629, + "acc_norm": 0.3708609271523179, + "acc_norm_stderr": 0.03943966699183629 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.8293577981651377, + "acc_stderr": 0.01612927102509986, + "acc_norm": 0.8293577981651377, + "acc_norm_stderr": 0.01612927102509986 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.6064814814814815, + "acc_stderr": 0.03331747876370312, + "acc_norm": 0.6064814814814815, + "acc_norm_stderr": 0.03331747876370312 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.803921568627451, + "acc_stderr": 0.027865942286639318, + "acc_norm": 0.803921568627451, + "acc_norm_stderr": 0.027865942286639318 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.7721518987341772, + "acc_stderr": 0.02730348459906943, + "acc_norm": 0.7721518987341772, + "acc_norm_stderr": 0.02730348459906943 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.6547085201793722, + "acc_stderr": 0.03191100192835794, + "acc_norm": 0.6547085201793722, + "acc_norm_stderr": 0.03191100192835794 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.7633587786259542, + "acc_stderr": 0.03727673575596914, + "acc_norm": 0.7633587786259542, + "acc_norm_stderr": 0.03727673575596914 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.768595041322314, + "acc_stderr": 0.03849856098794088, + "acc_norm": 0.768595041322314, + "acc_norm_stderr": 0.03849856098794088 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.7685185185185185, + "acc_stderr": 0.04077494709252627, + "acc_norm": 0.7685185185185185, + "acc_norm_stderr": 0.04077494709252627 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.7607361963190185, + "acc_stderr": 0.0335195387952127, + "acc_norm": 0.7607361963190185, + "acc_norm_stderr": 0.0335195387952127 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.4642857142857143, + "acc_stderr": 0.04733667890053756, + "acc_norm": 0.4642857142857143, + "acc_norm_stderr": 0.04733667890053756 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.7961165048543689, + "acc_stderr": 0.039891398595317706, + "acc_norm": 0.7961165048543689, + "acc_norm_stderr": 0.039891398595317706 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.8632478632478633, + "acc_stderr": 0.02250903393707781, + "acc_norm": 0.8632478632478633, + "acc_norm_stderr": 0.02250903393707781 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.71, + "acc_stderr": 0.045604802157206845, + "acc_norm": 0.71, + "acc_norm_stderr": 0.045604802157206845 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.8109833971902938, + "acc_stderr": 0.014000791294407006, + "acc_norm": 0.8109833971902938, + "acc_norm_stderr": 
0.014000791294407006 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.6965317919075145, + "acc_stderr": 0.024752411960917205, + "acc_norm": 0.6965317919075145, + "acc_norm_stderr": 0.024752411960917205 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.38100558659217876, + "acc_stderr": 0.01624202883405362, + "acc_norm": 0.38100558659217876, + "acc_norm_stderr": 0.01624202883405362 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.7320261437908496, + "acc_stderr": 0.025360603796242557, + "acc_norm": 0.7320261437908496, + "acc_norm_stderr": 0.025360603796242557 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.7041800643086816, + "acc_stderr": 0.025922371788818777, + "acc_norm": 0.7041800643086816, + "acc_norm_stderr": 0.025922371788818777 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.7191358024691358, + "acc_stderr": 0.025006469755799208, + "acc_norm": 0.7191358024691358, + "acc_norm_stderr": 0.025006469755799208 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.48226950354609927, + "acc_stderr": 0.02980873964223777, + "acc_norm": 0.48226950354609927, + "acc_norm_stderr": 0.02980873964223777 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.4367666232073012, + "acc_stderr": 0.012667701919603662, + "acc_norm": 0.4367666232073012, + "acc_norm_stderr": 0.012667701919603662 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.6911764705882353, + "acc_stderr": 0.02806499816704009, + "acc_norm": 0.6911764705882353, + "acc_norm_stderr": 0.02806499816704009 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.6601307189542484, + "acc_stderr": 0.019162418588623557, + "acc_norm": 0.6601307189542484, + "acc_norm_stderr": 0.019162418588623557 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.6636363636363637, + "acc_stderr": 0.04525393596302506, + "acc_norm": 0.6636363636363637, + "acc_norm_stderr": 0.04525393596302506 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.726530612244898, + "acc_stderr": 0.02853556033712844, + "acc_norm": 0.726530612244898, + "acc_norm_stderr": 0.02853556033712844 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.845771144278607, + "acc_stderr": 0.02553843336857833, + "acc_norm": 0.845771144278607, + "acc_norm_stderr": 0.02553843336857833 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.86, + "acc_stderr": 0.034873508801977704, + "acc_norm": 0.86, + "acc_norm_stderr": 0.034873508801977704 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.536144578313253, + "acc_stderr": 0.038823108508905954, + "acc_norm": 0.536144578313253, + "acc_norm_stderr": 0.038823108508905954 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.8362573099415205, + "acc_stderr": 0.028380919596145866, + "acc_norm": 0.8362573099415205, + "acc_norm_stderr": 0.028380919596145866 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.29498164014687883, + "mc1_stderr": 0.015964400965589657, + "mc2": 0.4691495265456508, + "mc2_stderr": 0.014857248788144817 + }, + "all": { + "acc": 0.6399052867360159, + "acc_stderr": 0.03310704632621164, + "acc_norm": 0.6439213227226402, + "acc_norm_stderr": 0.03308447285363473, + "mc1": 0.29498164014687883, + "mc1_stderr": 0.015964400965589657, + "mc2": 0.4691495265456508, + "mc2_stderr": 0.014857248788144817 + } + }, + "versions": { + "harness|arc:challenge|25": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + 
"harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "all": 0 + }, + "config_tasks": { + "harness|arc:challenge": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", 
+ "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task" + }, + "summary_tasks": { + "harness|arc:challenge|25": { + "hashes": { + "hash_examples": "17b0cae357c0259e", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "e43adcaa871b1364", + "hash_cont_tokens": "289aa98c400841d8" + }, + "truncated": 0, + "non-truncated": 4687, + "padded": 4684, + "non-padded": 3, + "effective_few_shots": 25.0, + "num_truncated_few_shots": 0 + }, + "harness|hellaswag|10": { + "hashes": { + "hash_examples": "e1768ecb99d7ecf0", + 
"hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "08da6b3d0798f3e5", + "hash_cont_tokens": "ac460260c3e6efc9" + }, + "truncated": 0, + "non-truncated": 40168, + "padded": 40039, + "non-padded": 129, + "effective_few_shots": 10.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hashes": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "5e2b26eb9b4d08bf", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-anatomy|5": { + "hashes": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "d33cda9df28030eb", + "hash_cont_tokens": "a52a4f60d98cbe5c" + }, + "truncated": 0, + "non-truncated": 540, + "padded": 540, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-astronomy|5": { + "hashes": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "0dd50c500d64c57d", + "hash_cont_tokens": "10f7d8eeba97841d" + }, + "truncated": 0, + "non-truncated": 608, + "padded": 608, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-business_ethics|5": { + "hashes": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "40b524d0df3defc2", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hashes": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "1f87d12d677e0dfd", + "hash_cont_tokens": "edef9975ba9165b5" + }, + "truncated": 0, + "non-truncated": 1060, + "padded": 1056, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_biology|5": { + "hashes": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "dd6d69d8b13afbeb", + "hash_cont_tokens": "0aa103ec6602280b" + }, + "truncated": 0, + "non-truncated": 576, + "padded": 572, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_chemistry|5": { + "hashes": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "d45f3c401a00e97e", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_computer_science|5": { + "hashes": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "c04f21d954ae67b2", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_mathematics|5": { + "hashes": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "e7de03b4e1a407d8", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + 
"non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_medicine|5": { + "hashes": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "9ce9516475f0b09c", + "hash_cont_tokens": "1979021dbc698754" + }, + "truncated": 0, + "non-truncated": 692, + "padded": 684, + "non-padded": 8, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_physics|5": { + "hashes": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "f749592a0d6c967d", + "hash_cont_tokens": "7cf7fe2bab00acbd" + }, + "truncated": 0, + "non-truncated": 408, + "padded": 404, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-computer_security|5": { + "hashes": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "1a6dccf2066f3598", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hashes": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "6ce98c8aec8e7514", + "hash_cont_tokens": "903f64eed2b0d217" + }, + "truncated": 0, + "non-truncated": 940, + "padded": 940, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-econometrics|5": { + "hashes": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "7794b03bf6b9bb11", + "hash_cont_tokens": "721ae6c5302c4bf2" + }, + "truncated": 0, + "non-truncated": 456, + "padded": 456, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hashes": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "e47ff85e05850517", + "hash_cont_tokens": "15a738960ed3e587" + }, + "truncated": 0, + "non-truncated": 580, + "padded": 580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hashes": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "2ce6901704311790", + "hash_cont_tokens": "c96470462fc71683" + }, + "truncated": 0, + "non-truncated": 1512, + "padded": 1512, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-formal_logic|5": { + "hashes": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "fa49c3faa72a3955", + "hash_cont_tokens": "0e1ce025c9d6ee7e" + }, + "truncated": 0, + "non-truncated": 504, + "padded": 504, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-global_facts|5": { + "hashes": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "38992a391c7040d5", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 396, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + 
"harness|hendrycksTest-high_school_biology|5": { + "hashes": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "4944fad6e0578120", + "hash_cont_tokens": "e34d57f7d3c4ca16" + }, + "truncated": 0, + "non-truncated": 1240, + "padded": 1240, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hashes": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "bec955dfccee0331", + "hash_cont_tokens": "e8482d44df4b3740" + }, + "truncated": 0, + "non-truncated": 812, + "padded": 796, + "non-padded": 16, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hashes": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "2ccfe020e0a8e824", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hashes": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "5e5e8bf3808e0ead", + "hash_cont_tokens": "d63e679a49418339" + }, + "truncated": 0, + "non-truncated": 660, + "padded": 656, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_geography|5": { + "hashes": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "6a624d76e1b40f9d", + "hash_cont_tokens": "d78483e286d06f1a" + }, + "truncated": 0, + "non-truncated": 792, + "padded": 792, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hashes": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "8340aed0285230f4", + "hash_cont_tokens": "691cdff71ff5fe57" + }, + "truncated": 0, + "non-truncated": 772, + "padded": 772, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hashes": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "ca47137b1f3a769c", + "hash_cont_tokens": "d5ad4c5bdca967ad" + }, + "truncated": 0, + "non-truncated": 1560, + "padded": 1560, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hashes": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "c9d341ab62890f30", + "hash_cont_tokens": "8f631ca5687dd0d4" + }, + "truncated": 0, + "non-truncated": 1080, + "padded": 1080, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hashes": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "62573d06618ae7df", + "hash_cont_tokens": "7321048a28451473" + }, + "truncated": 0, + "non-truncated": 952, + "padded": 952, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_physics|5": { + "hashes": { + 
"hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "ddddcaae96263221", + "hash_cont_tokens": "bb137581f269861c" + }, + "truncated": 0, + "non-truncated": 604, + "padded": 604, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hashes": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "ef9c1ae343139fdd", + "hash_cont_tokens": "b455cab2675bd863" + }, + "truncated": 0, + "non-truncated": 2180, + "padded": 2161, + "non-padded": 19, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hashes": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "eb4abd87b0e863cc", + "hash_cont_tokens": "1b3196fec7e58037" + }, + "truncated": 0, + "non-truncated": 864, + "padded": 864, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hashes": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "63548c7fa9ba7a78", + "hash_cont_tokens": "a331dedc2aa01b3e" + }, + "truncated": 0, + "non-truncated": 816, + "padded": 816, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hashes": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "83c5da18bfa50812", + "hash_cont_tokens": "d0fbe030b8c8c2bf" + }, + "truncated": 0, + "non-truncated": 948, + "padded": 948, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_aging|5": { + "hashes": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "c93c778cb8c58a32", + "hash_cont_tokens": "1dd29c3755494850" + }, + "truncated": 0, + "non-truncated": 892, + "padded": 892, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_sexuality|5": { + "hashes": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "1daed91f54b42f7d", + "hash_cont_tokens": "c85573f663c10691" + }, + "truncated": 0, + "non-truncated": 524, + "padded": 524, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-international_law|5": { + "hashes": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "cfdae69f75ee8670", + "hash_cont_tokens": "d263804ba918154f" + }, + "truncated": 0, + "non-truncated": 484, + "padded": 484, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-jurisprudence|5": { + "hashes": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "173979adbb5ab44e", + "hash_cont_tokens": "581986691a84ece8" + }, + "truncated": 0, + "non-truncated": 432, + "padded": 432, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hashes": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "7b7d06271aff55ff", + 
"hash_cont_tokens": "55a858b28bbda458" + }, + "truncated": 0, + "non-truncated": 652, + "padded": 652, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-machine_learning|5": { + "hashes": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "ca062cfd7c7fddcb", + "hash_cont_tokens": "e99d3d3efd4ac7a3" + }, + "truncated": 0, + "non-truncated": 448, + "padded": 445, + "non-padded": 3, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-management|5": { + "hashes": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "fc47171ffb714da3", + "hash_cont_tokens": "13d9dc56bca34726" + }, + "truncated": 0, + "non-truncated": 412, + "padded": 412, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-marketing|5": { + "hashes": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "aa29e9d883670c8f", + "hash_cont_tokens": "2700ea26933916a2" + }, + "truncated": 0, + "non-truncated": 936, + "padded": 936, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-medical_genetics|5": { + "hashes": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "88ad044b653ecaa5", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-miscellaneous|5": { + "hashes": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "f9e7e01573277484", + "hash_cont_tokens": "7bf4341c79587250" + }, + "truncated": 0, + "non-truncated": 3132, + "padded": 3132, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_disputes|5": { + "hashes": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "03728b9e48594c28", + "hash_cont_tokens": "38a48e9de6976f00" + }, + "truncated": 0, + "non-truncated": 1384, + "padded": 1360, + "non-padded": 24, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hashes": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "04a903966514d177", + "hash_cont_tokens": "761c4dc187689d89" + }, + "truncated": 0, + "non-truncated": 3580, + "padded": 3580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-nutrition|5": { + "hashes": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "a2176d3ac6f01cf0", + "hash_cont_tokens": "65005bd7d6f6012a" + }, + "truncated": 0, + "non-truncated": 1224, + "padded": 1224, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-philosophy|5": { + "hashes": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "a96dc872948245a8", + "hash_cont_tokens": "0b47934fb6314dec" + }, + "truncated": 0, + "non-truncated": 1244, + "padded": 1244, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 
+ }, + "harness|hendrycksTest-prehistory|5": { + "hashes": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "e0b03637947e9efa", + "hash_cont_tokens": "3f20acd855ee0a29" + }, + "truncated": 0, + "non-truncated": 1296, + "padded": 1296, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_accounting|5": { + "hashes": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "0b4c6d0e49c47ab4", + "hash_cont_tokens": "8f122ba881355d4b" + }, + "truncated": 0, + "non-truncated": 1128, + "padded": 1128, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_law|5": { + "hashes": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "bcbdbbde22ec73e3", + "hash_cont_tokens": "90d5df417c4d3fd3" + }, + "truncated": 0, + "non-truncated": 6136, + "padded": 6136, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_medicine|5": { + "hashes": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "c54d753563114d45", + "hash_cont_tokens": "4a2d2988884f7f70" + }, + "truncated": 0, + "non-truncated": 1088, + "padded": 1088, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_psychology|5": { + "hashes": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "9e6e34f48034edc0", + "hash_cont_tokens": "e0a952cb8a9c81de" + }, + "truncated": 0, + "non-truncated": 2448, + "padded": 2448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-public_relations|5": { + "hashes": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "634feb3f97d1064d", + "hash_cont_tokens": "1fa77a8dff3922b8" + }, + "truncated": 0, + "non-truncated": 440, + "padded": 440, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-security_studies|5": { + "hashes": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "ca8497342e5b1d57", + "hash_cont_tokens": "81fc9cb3cbdd52db" + }, + "truncated": 0, + "non-truncated": 980, + "padded": 980, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-sociology|5": { + "hashes": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "ae361375c940a0fb", + "hash_cont_tokens": "2a0493252ed2cf43" + }, + "truncated": 0, + "non-truncated": 804, + "padded": 800, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hashes": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "e8bdf33cf82d89f5", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-virology|5": { + "hashes": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + 
"hash_input_tokens": "32ce831e0ba2d2e2", + "hash_cont_tokens": "5ab892d003b00c98" + }, + "truncated": 0, + "non-truncated": 664, + "padded": 664, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-world_religions|5": { + "hashes": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "4ed9b68c5694211b", + "hash_cont_tokens": "15a5e5dbdfbb8568" + }, + "truncated": 0, + "non-truncated": 684, + "padded": 684, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|truthfulqa:mc|0": { + "hashes": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "a30fbd9af05d717a", + "hash_cont_tokens": "5a8d4bb398b1c3c0" + }, + "truncated": 0, + "non-truncated": 9996, + "padded": 9996, + "non-padded": 0, + "effective_few_shots": 0.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "d84d18e9a963753d", + "hash_full_prompts": "12b540783521a8e6", + "hash_input_tokens": "3d86ffeb7677bd9d", + "hash_cont_tokens": "35527140510ee91a" + }, + "total_evaluation_time_secondes": "6131.232973814011", + "truncated": 0, + "non-truncated": 111019, + "padded": 110793, + "non-padded": 226, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/Undi95/Mistral-11B-TestBench9/results_2023-10-11T17-38-21.379151.json b/eval-results/Undi95/Mistral-11B-TestBench9/results_2023-10-11T17-38-21.379151.json new file mode 100644 index 0000000000000000000000000000000000000000..823f551bfcc0355b186c61345f4fcd3d0fa62b60 --- /dev/null +++ b/eval-results/Undi95/Mistral-11B-TestBench9/results_2023-10-11T17-38-21.379151.json @@ -0,0 +1,1367 @@ +{ + "config_general": { + "model_name": "Undi95/Mistral-11B-TestBench9", + "model_sha": "4ff48527af8c3907129c06160c7f7b7b786a5a79", + "model_size": "20.74 GB", + "model_dtype": "torch.bfloat16", + "lighteval_sha": "0f318ecf002208468154899217b3ba7c6ae09374", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|arc:challenge|25": { + "acc": 0.6203071672354948, + "acc_stderr": 0.014182119866974872, + "acc_norm": 0.6407849829351536, + "acc_norm_stderr": 0.01402022415583916 + }, + "harness|hellaswag|10": { + "acc": 0.652459669388568, + "acc_stderr": 0.004752158936871871, + "acc_norm": 0.8423620792670783, + "acc_norm_stderr": 0.0036365642863526765 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.37, + "acc_stderr": 0.04852365870939099, + "acc_norm": 0.37, + "acc_norm_stderr": 0.04852365870939099 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.6444444444444445, + "acc_stderr": 0.04135176749720385, + "acc_norm": 0.6444444444444445, + "acc_norm_stderr": 0.04135176749720385 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.6578947368421053, + "acc_stderr": 0.03860731599316092, + "acc_norm": 0.6578947368421053, + "acc_norm_stderr": 0.03860731599316092 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.57, + "acc_stderr": 0.04975698519562428, + "acc_norm": 0.57, + "acc_norm_stderr": 0.04975698519562428 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.6981132075471698, + "acc_stderr": 0.028254200344438655, + "acc_norm": 0.6981132075471698, + "acc_norm_stderr": 0.028254200344438655 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.7361111111111112, + "acc_stderr": 
0.03685651095897532, + "acc_norm": 0.7361111111111112, + "acc_norm_stderr": 0.03685651095897532 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.51, + "acc_stderr": 0.05024183937956911, + "acc_norm": 0.51, + "acc_norm_stderr": 0.05024183937956911 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.53, + "acc_stderr": 0.05016135580465919, + "acc_norm": 0.53, + "acc_norm_stderr": 0.05016135580465919 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.34, + "acc_stderr": 0.04760952285695235, + "acc_norm": 0.34, + "acc_norm_stderr": 0.04760952285695235 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.653179190751445, + "acc_stderr": 0.036291466701596636, + "acc_norm": 0.653179190751445, + "acc_norm_stderr": 0.036291466701596636 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.45098039215686275, + "acc_stderr": 0.04951218252396262, + "acc_norm": 0.45098039215686275, + "acc_norm_stderr": 0.04951218252396262 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.77, + "acc_stderr": 0.04229525846816506, + "acc_norm": 0.77, + "acc_norm_stderr": 0.04229525846816506 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.5234042553191489, + "acc_stderr": 0.03265019475033582, + "acc_norm": 0.5234042553191489, + "acc_norm_stderr": 0.03265019475033582 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.47368421052631576, + "acc_stderr": 0.04697085136647863, + "acc_norm": 0.47368421052631576, + "acc_norm_stderr": 0.04697085136647863 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.5655172413793104, + "acc_stderr": 0.04130740879555498, + "acc_norm": 0.5655172413793104, + "acc_norm_stderr": 0.04130740879555498 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.40476190476190477, + "acc_stderr": 0.025279850397404904, + "acc_norm": 0.40476190476190477, + "acc_norm_stderr": 0.025279850397404904 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.4523809523809524, + "acc_stderr": 0.044518079590553275, + "acc_norm": 0.4523809523809524, + "acc_norm_stderr": 0.044518079590553275 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.38, + "acc_stderr": 0.04878317312145632, + "acc_norm": 0.38, + "acc_norm_stderr": 0.04878317312145632 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.7677419354838709, + "acc_stderr": 0.024022256130308235, + "acc_norm": 0.7677419354838709, + "acc_norm_stderr": 0.024022256130308235 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.5024630541871922, + "acc_stderr": 0.03517945038691063, + "acc_norm": 0.5024630541871922, + "acc_norm_stderr": 0.03517945038691063 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.68, + "acc_stderr": 0.04688261722621504, + "acc_norm": 0.68, + "acc_norm_stderr": 0.04688261722621504 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.7636363636363637, + "acc_stderr": 0.03317505930009181, + "acc_norm": 0.7636363636363637, + "acc_norm_stderr": 0.03317505930009181 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.803030303030303, + "acc_stderr": 0.028335609732463362, + "acc_norm": 0.803030303030303, + "acc_norm_stderr": 0.028335609732463362 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.8963730569948186, + "acc_stderr": 0.02199531196364424, + "acc_norm": 0.8963730569948186, + "acc_norm_stderr": 0.02199531196364424 + }, + 
"harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.6846153846153846, + "acc_stderr": 0.023559646983189946, + "acc_norm": 0.6846153846153846, + "acc_norm_stderr": 0.023559646983189946 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.3333333333333333, + "acc_stderr": 0.028742040903948496, + "acc_norm": 0.3333333333333333, + "acc_norm_stderr": 0.028742040903948496 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.6596638655462185, + "acc_stderr": 0.030778057422931673, + "acc_norm": 0.6596638655462185, + "acc_norm_stderr": 0.030778057422931673 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.33112582781456956, + "acc_stderr": 0.038425817186598696, + "acc_norm": 0.33112582781456956, + "acc_norm_stderr": 0.038425817186598696 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.8366972477064221, + "acc_stderr": 0.015848255806501534, + "acc_norm": 0.8366972477064221, + "acc_norm_stderr": 0.015848255806501534 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.5601851851851852, + "acc_stderr": 0.033851779760448106, + "acc_norm": 0.5601851851851852, + "acc_norm_stderr": 0.033851779760448106 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.8137254901960784, + "acc_stderr": 0.027325470966716312, + "acc_norm": 0.8137254901960784, + "acc_norm_stderr": 0.027325470966716312 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.7763713080168776, + "acc_stderr": 0.027123298205229966, + "acc_norm": 0.7763713080168776, + "acc_norm_stderr": 0.027123298205229966 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.6681614349775785, + "acc_stderr": 0.03160295143776679, + "acc_norm": 0.6681614349775785, + "acc_norm_stderr": 0.03160295143776679 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.7480916030534351, + "acc_stderr": 0.03807387116306086, + "acc_norm": 0.7480916030534351, + "acc_norm_stderr": 0.03807387116306086 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.768595041322314, + "acc_stderr": 0.03849856098794088, + "acc_norm": 0.768595041322314, + "acc_norm_stderr": 0.03849856098794088 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.7777777777777778, + "acc_stderr": 0.040191074725573483, + "acc_norm": 0.7777777777777778, + "acc_norm_stderr": 0.040191074725573483 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.7668711656441718, + "acc_stderr": 0.0332201579577674, + "acc_norm": 0.7668711656441718, + "acc_norm_stderr": 0.0332201579577674 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.5, + "acc_stderr": 0.04745789978762494, + "acc_norm": 0.5, + "acc_norm_stderr": 0.04745789978762494 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.7864077669902912, + "acc_stderr": 0.040580420156460344, + "acc_norm": 0.7864077669902912, + "acc_norm_stderr": 0.040580420156460344 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.8632478632478633, + "acc_stderr": 0.022509033937077812, + "acc_norm": 0.8632478632478633, + "acc_norm_stderr": 0.022509033937077812 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.7, + "acc_stderr": 0.046056618647183814, + "acc_norm": 0.7, + "acc_norm_stderr": 0.046056618647183814 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.8173690932311622, + "acc_stderr": 0.013816335389973138, + "acc_norm": 0.8173690932311622, + "acc_norm_stderr": 0.013816335389973138 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 
0.7023121387283237, + "acc_stderr": 0.024617055388677003, + "acc_norm": 0.7023121387283237, + "acc_norm_stderr": 0.024617055388677003 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.4100558659217877, + "acc_stderr": 0.016449708209026078, + "acc_norm": 0.4100558659217877, + "acc_norm_stderr": 0.016449708209026078 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.7091503267973857, + "acc_stderr": 0.02600480036395213, + "acc_norm": 0.7091503267973857, + "acc_norm_stderr": 0.02600480036395213 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.6816720257234726, + "acc_stderr": 0.026457225067811025, + "acc_norm": 0.6816720257234726, + "acc_norm_stderr": 0.026457225067811025 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.7067901234567902, + "acc_stderr": 0.025329888171900922, + "acc_norm": 0.7067901234567902, + "acc_norm_stderr": 0.025329888171900922 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.46099290780141844, + "acc_stderr": 0.02973659252642444, + "acc_norm": 0.46099290780141844, + "acc_norm_stderr": 0.02973659252642444 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.4530638852672751, + "acc_stderr": 0.012713845972358978, + "acc_norm": 0.4530638852672751, + "acc_norm_stderr": 0.012713845972358978 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.6764705882352942, + "acc_stderr": 0.028418208619406755, + "acc_norm": 0.6764705882352942, + "acc_norm_stderr": 0.028418208619406755 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.6683006535947712, + "acc_stderr": 0.01904748523936038, + "acc_norm": 0.6683006535947712, + "acc_norm_stderr": 0.01904748523936038 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.6727272727272727, + "acc_stderr": 0.04494290866252089, + "acc_norm": 0.6727272727272727, + "acc_norm_stderr": 0.04494290866252089 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.7306122448979592, + "acc_stderr": 0.02840125202902294, + "acc_norm": 0.7306122448979592, + "acc_norm_stderr": 0.02840125202902294 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.8606965174129353, + "acc_stderr": 0.024484487162913973, + "acc_norm": 0.8606965174129353, + "acc_norm_stderr": 0.024484487162913973 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.82, + "acc_stderr": 0.03861229196653694, + "acc_norm": 0.82, + "acc_norm_stderr": 0.03861229196653694 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.5481927710843374, + "acc_stderr": 0.03874371556587952, + "acc_norm": 0.5481927710843374, + "acc_norm_stderr": 0.03874371556587952 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.8421052631578947, + "acc_stderr": 0.02796678585916089, + "acc_norm": 0.8421052631578947, + "acc_norm_stderr": 0.02796678585916089 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.401468788249694, + "mc1_stderr": 0.017160273901693657, + "mc2": 0.5618804562751369, + "mc2_stderr": 0.015525700835296153 + }, + "all": { + "acc": 0.6398621215363215, + "acc_stderr": 0.033170910986947626, + "acc_norm": 0.6434278880715447, + "acc_norm_stderr": 0.03314925860793652, + "mc1": 0.401468788249694, + "mc1_stderr": 0.017160273901693657, + "mc2": 0.5618804562751369, + "mc2_stderr": 0.015525700835296153 + } + }, + "versions": { + "harness|arc:challenge|25": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + 
"harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "all": 0 + }, + "config_tasks": { + "harness|arc:challenge": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + 
"harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task" + }, + "summary_tasks": { + "harness|arc:challenge|25": { + "hashes": { + "hash_examples": "17b0cae357c0259e", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "e43adcaa871b1364", + "hash_cont_tokens": "289aa98c400841d8" + }, + "truncated": 0, + "non-truncated": 4687, + "padded": 4684, + "non-padded": 3, + "effective_few_shots": 25.0, + "num_truncated_few_shots": 0 + }, + "harness|hellaswag|10": { + "hashes": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": 
"08da6b3d0798f3e5", + "hash_cont_tokens": "ac460260c3e6efc9" + }, + "truncated": 0, + "non-truncated": 40168, + "padded": 40039, + "non-padded": 129, + "effective_few_shots": 10.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hashes": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "5e2b26eb9b4d08bf", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-anatomy|5": { + "hashes": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "d33cda9df28030eb", + "hash_cont_tokens": "a52a4f60d98cbe5c" + }, + "truncated": 0, + "non-truncated": 540, + "padded": 540, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-astronomy|5": { + "hashes": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "0dd50c500d64c57d", + "hash_cont_tokens": "10f7d8eeba97841d" + }, + "truncated": 0, + "non-truncated": 608, + "padded": 608, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-business_ethics|5": { + "hashes": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "40b524d0df3defc2", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hashes": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "1f87d12d677e0dfd", + "hash_cont_tokens": "edef9975ba9165b5" + }, + "truncated": 0, + "non-truncated": 1060, + "padded": 1056, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_biology|5": { + "hashes": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "dd6d69d8b13afbeb", + "hash_cont_tokens": "0aa103ec6602280b" + }, + "truncated": 0, + "non-truncated": 576, + "padded": 572, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_chemistry|5": { + "hashes": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "d45f3c401a00e97e", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_computer_science|5": { + "hashes": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "c04f21d954ae67b2", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_mathematics|5": { + "hashes": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "e7de03b4e1a407d8", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + 
"effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_medicine|5": { + "hashes": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "9ce9516475f0b09c", + "hash_cont_tokens": "1979021dbc698754" + }, + "truncated": 0, + "non-truncated": 692, + "padded": 684, + "non-padded": 8, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_physics|5": { + "hashes": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "f749592a0d6c967d", + "hash_cont_tokens": "7cf7fe2bab00acbd" + }, + "truncated": 0, + "non-truncated": 408, + "padded": 404, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-computer_security|5": { + "hashes": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "1a6dccf2066f3598", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hashes": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "6ce98c8aec8e7514", + "hash_cont_tokens": "903f64eed2b0d217" + }, + "truncated": 0, + "non-truncated": 940, + "padded": 940, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-econometrics|5": { + "hashes": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "7794b03bf6b9bb11", + "hash_cont_tokens": "721ae6c5302c4bf2" + }, + "truncated": 0, + "non-truncated": 456, + "padded": 456, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hashes": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "e47ff85e05850517", + "hash_cont_tokens": "15a738960ed3e587" + }, + "truncated": 0, + "non-truncated": 580, + "padded": 580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hashes": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "2ce6901704311790", + "hash_cont_tokens": "c96470462fc71683" + }, + "truncated": 0, + "non-truncated": 1512, + "padded": 1512, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-formal_logic|5": { + "hashes": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "fa49c3faa72a3955", + "hash_cont_tokens": "0e1ce025c9d6ee7e" + }, + "truncated": 0, + "non-truncated": 504, + "padded": 504, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-global_facts|5": { + "hashes": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "38992a391c7040d5", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 396, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_biology|5": { + "hashes": { + "hash_examples": 
"a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "4944fad6e0578120", + "hash_cont_tokens": "e34d57f7d3c4ca16" + }, + "truncated": 0, + "non-truncated": 1240, + "padded": 1240, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hashes": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "bec955dfccee0331", + "hash_cont_tokens": "e8482d44df4b3740" + }, + "truncated": 0, + "non-truncated": 812, + "padded": 796, + "non-padded": 16, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hashes": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "2ccfe020e0a8e824", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hashes": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "5e5e8bf3808e0ead", + "hash_cont_tokens": "d63e679a49418339" + }, + "truncated": 0, + "non-truncated": 660, + "padded": 656, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_geography|5": { + "hashes": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "6a624d76e1b40f9d", + "hash_cont_tokens": "d78483e286d06f1a" + }, + "truncated": 0, + "non-truncated": 792, + "padded": 792, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hashes": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "8340aed0285230f4", + "hash_cont_tokens": "691cdff71ff5fe57" + }, + "truncated": 0, + "non-truncated": 772, + "padded": 772, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hashes": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "ca47137b1f3a769c", + "hash_cont_tokens": "d5ad4c5bdca967ad" + }, + "truncated": 0, + "non-truncated": 1560, + "padded": 1560, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hashes": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "c9d341ab62890f30", + "hash_cont_tokens": "8f631ca5687dd0d4" + }, + "truncated": 0, + "non-truncated": 1080, + "padded": 1080, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hashes": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "62573d06618ae7df", + "hash_cont_tokens": "7321048a28451473" + }, + "truncated": 0, + "non-truncated": 952, + "padded": 952, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_physics|5": { + "hashes": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + 
"hash_input_tokens": "ddddcaae96263221", + "hash_cont_tokens": "bb137581f269861c" + }, + "truncated": 0, + "non-truncated": 604, + "padded": 604, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hashes": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "ef9c1ae343139fdd", + "hash_cont_tokens": "b455cab2675bd863" + }, + "truncated": 0, + "non-truncated": 2180, + "padded": 2161, + "non-padded": 19, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hashes": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "eb4abd87b0e863cc", + "hash_cont_tokens": "1b3196fec7e58037" + }, + "truncated": 0, + "non-truncated": 864, + "padded": 864, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hashes": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "63548c7fa9ba7a78", + "hash_cont_tokens": "a331dedc2aa01b3e" + }, + "truncated": 0, + "non-truncated": 816, + "padded": 816, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hashes": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "83c5da18bfa50812", + "hash_cont_tokens": "d0fbe030b8c8c2bf" + }, + "truncated": 0, + "non-truncated": 948, + "padded": 948, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_aging|5": { + "hashes": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "c93c778cb8c58a32", + "hash_cont_tokens": "1dd29c3755494850" + }, + "truncated": 0, + "non-truncated": 892, + "padded": 892, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_sexuality|5": { + "hashes": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "1daed91f54b42f7d", + "hash_cont_tokens": "c85573f663c10691" + }, + "truncated": 0, + "non-truncated": 524, + "padded": 524, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-international_law|5": { + "hashes": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "cfdae69f75ee8670", + "hash_cont_tokens": "d263804ba918154f" + }, + "truncated": 0, + "non-truncated": 484, + "padded": 484, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-jurisprudence|5": { + "hashes": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "173979adbb5ab44e", + "hash_cont_tokens": "581986691a84ece8" + }, + "truncated": 0, + "non-truncated": 432, + "padded": 432, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hashes": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "7b7d06271aff55ff", + "hash_cont_tokens": "55a858b28bbda458" + }, + "truncated": 0, + "non-truncated": 652, + 
"padded": 652, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-machine_learning|5": { + "hashes": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "ca062cfd7c7fddcb", + "hash_cont_tokens": "e99d3d3efd4ac7a3" + }, + "truncated": 0, + "non-truncated": 448, + "padded": 445, + "non-padded": 3, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-management|5": { + "hashes": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "fc47171ffb714da3", + "hash_cont_tokens": "13d9dc56bca34726" + }, + "truncated": 0, + "non-truncated": 412, + "padded": 412, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-marketing|5": { + "hashes": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "aa29e9d883670c8f", + "hash_cont_tokens": "2700ea26933916a2" + }, + "truncated": 0, + "non-truncated": 936, + "padded": 936, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-medical_genetics|5": { + "hashes": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "88ad044b653ecaa5", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-miscellaneous|5": { + "hashes": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "f9e7e01573277484", + "hash_cont_tokens": "7bf4341c79587250" + }, + "truncated": 0, + "non-truncated": 3132, + "padded": 3132, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_disputes|5": { + "hashes": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "03728b9e48594c28", + "hash_cont_tokens": "38a48e9de6976f00" + }, + "truncated": 0, + "non-truncated": 1384, + "padded": 1360, + "non-padded": 24, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hashes": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "04a903966514d177", + "hash_cont_tokens": "761c4dc187689d89" + }, + "truncated": 0, + "non-truncated": 3580, + "padded": 3580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-nutrition|5": { + "hashes": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "a2176d3ac6f01cf0", + "hash_cont_tokens": "65005bd7d6f6012a" + }, + "truncated": 0, + "non-truncated": 1224, + "padded": 1224, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-philosophy|5": { + "hashes": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "a96dc872948245a8", + "hash_cont_tokens": "0b47934fb6314dec" + }, + "truncated": 0, + "non-truncated": 1244, + "padded": 1244, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-prehistory|5": { + "hashes": { + "hash_examples": 
"8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "e0b03637947e9efa", + "hash_cont_tokens": "3f20acd855ee0a29" + }, + "truncated": 0, + "non-truncated": 1296, + "padded": 1296, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_accounting|5": { + "hashes": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "0b4c6d0e49c47ab4", + "hash_cont_tokens": "8f122ba881355d4b" + }, + "truncated": 0, + "non-truncated": 1128, + "padded": 1128, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_law|5": { + "hashes": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "bcbdbbde22ec73e3", + "hash_cont_tokens": "90d5df417c4d3fd3" + }, + "truncated": 0, + "non-truncated": 6136, + "padded": 6136, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_medicine|5": { + "hashes": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "c54d753563114d45", + "hash_cont_tokens": "4a2d2988884f7f70" + }, + "truncated": 0, + "non-truncated": 1088, + "padded": 1088, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_psychology|5": { + "hashes": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "9e6e34f48034edc0", + "hash_cont_tokens": "e0a952cb8a9c81de" + }, + "truncated": 0, + "non-truncated": 2448, + "padded": 2448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-public_relations|5": { + "hashes": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "634feb3f97d1064d", + "hash_cont_tokens": "1fa77a8dff3922b8" + }, + "truncated": 0, + "non-truncated": 440, + "padded": 440, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-security_studies|5": { + "hashes": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "ca8497342e5b1d57", + "hash_cont_tokens": "81fc9cb3cbdd52db" + }, + "truncated": 0, + "non-truncated": 980, + "padded": 980, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-sociology|5": { + "hashes": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "ae361375c940a0fb", + "hash_cont_tokens": "2a0493252ed2cf43" + }, + "truncated": 0, + "non-truncated": 804, + "padded": 800, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hashes": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "e8bdf33cf82d89f5", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-virology|5": { + "hashes": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "32ce831e0ba2d2e2", + "hash_cont_tokens": "5ab892d003b00c98" 
+ }, + "truncated": 0, + "non-truncated": 664, + "padded": 664, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-world_religions|5": { + "hashes": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "4ed9b68c5694211b", + "hash_cont_tokens": "15a5e5dbdfbb8568" + }, + "truncated": 0, + "non-truncated": 684, + "padded": 684, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|truthfulqa:mc|0": { + "hashes": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "a30fbd9af05d717a", + "hash_cont_tokens": "5a8d4bb398b1c3c0" + }, + "truncated": 0, + "non-truncated": 9996, + "padded": 9996, + "non-padded": 0, + "effective_few_shots": 0.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "d84d18e9a963753d", + "hash_full_prompts": "12b540783521a8e6", + "hash_input_tokens": "3d86ffeb7677bd9d", + "hash_cont_tokens": "35527140510ee91a" + }, + "total_evaluation_time_secondes": "6108.1306710243225", + "truncated": 0, + "non-truncated": 111019, + "padded": 110793, + "non-padded": 226, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/Undi95/Mistral-11B-TestBench9/results_2023-11-05T11-48-18.495920.json b/eval-results/Undi95/Mistral-11B-TestBench9/results_2023-11-05T11-48-18.495920.json new file mode 100644 index 0000000000000000000000000000000000000000..7a5a328910ca56e3556420a1376747cbca1e2adf --- /dev/null +++ b/eval-results/Undi95/Mistral-11B-TestBench9/results_2023-11-05T11-48-18.495920.json @@ -0,0 +1,107 @@ +{ + "config_general": { + "lighteval_sha": "167773f1d5d1647c60dadc31c9e731ab7dbcbbad", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "", + "model_name": "Undi95/Mistral-11B-TestBench9", + "model_sha": "5a9a5acc20cab4b312303291eb88e0eba94e4427", + "model_dtype": "torch.bfloat16", + "model_size": "20.74 GB" + }, + "results": { + "harness|drop|3": { + "em": 0.018351510067114093, + "em_stderr": 0.0013745278884539388, + "f1": 0.08351719798657717, + "f1_stderr": 0.0019210059131140958 + }, + "harness|gsm8k|5": { + "acc": 0.16148597422289612, + "acc_stderr": 0.01013595945213431 + }, + "harness|winogrande|5": { + "acc": 0.7845303867403315, + "acc_stderr": 0.011555295286059282 + }, + "all": { + "em": 0.018351510067114093, + "em_stderr": 0.0013745278884539388, + "f1": 0.08351719798657717, + "f1_stderr": 0.0019210059131140958, + "acc": 0.4730081804816138, + "acc_stderr": 0.010845627369096797 + } + }, + "versions": { + "all": 0, + "harness|drop|3": 1, + "harness|gsm8k|5": 0, + "harness|winogrande|5": 0 + }, + "config_tasks": { + "harness|drop": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|drop|3": { + "hashes": { + "hash_examples": "1d27416e8324e9a3", + "hash_full_prompts": "a5513ff9a741b385", + "hash_input_tokens": "a4fb946366902edf", + "hash_cont_tokens": "e2fa9b9f4656d431" + }, + "truncated": 0, + "non_truncated": 9536, + "padded": 0, + "non_padded": 9536, + "effective_few_shots": 3.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "6af0ae8cfe684f50", + "hash_cont_tokens": "a9f059a27edb85f7" + }, + "truncated": 0, + 
"non_truncated": 1319, + "padded": 0, + "non_padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "6bf335f26fed6442", + "hash_cont_tokens": "618558fb93c0f288" + }, + "truncated": 0, + "non_truncated": 1267, + "padded": 2397, + "non_padded": 137, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "9b4d8993161e637d", + "hash_full_prompts": "08215e527b7e60a5", + "hash_input_tokens": "7c6dc027ca91c1a8", + "hash_cont_tokens": "0fe098ae81636ab9" + }, + "truncated": 0, + "non_truncated": 12122, + "padded": 2397, + "non_padded": 10992, + "num_truncated_few_shots": 0, + "total_evaluation_time_secondes": 0 + } +} \ No newline at end of file diff --git a/eval-results/Undi95/Mistral-11B-TestBench9/results_2023-11-07T07-27-56.824577.json b/eval-results/Undi95/Mistral-11B-TestBench9/results_2023-11-07T07-27-56.824577.json new file mode 100644 index 0000000000000000000000000000000000000000..7a5a328910ca56e3556420a1376747cbca1e2adf --- /dev/null +++ b/eval-results/Undi95/Mistral-11B-TestBench9/results_2023-11-07T07-27-56.824577.json @@ -0,0 +1,107 @@ +{ + "config_general": { + "lighteval_sha": "167773f1d5d1647c60dadc31c9e731ab7dbcbbad", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "", + "model_name": "Undi95/Mistral-11B-TestBench9", + "model_sha": "5a9a5acc20cab4b312303291eb88e0eba94e4427", + "model_dtype": "torch.bfloat16", + "model_size": "20.74 GB" + }, + "results": { + "harness|drop|3": { + "em": 0.018351510067114093, + "em_stderr": 0.0013745278884539388, + "f1": 0.08351719798657717, + "f1_stderr": 0.0019210059131140958 + }, + "harness|gsm8k|5": { + "acc": 0.16148597422289612, + "acc_stderr": 0.01013595945213431 + }, + "harness|winogrande|5": { + "acc": 0.7845303867403315, + "acc_stderr": 0.011555295286059282 + }, + "all": { + "em": 0.018351510067114093, + "em_stderr": 0.0013745278884539388, + "f1": 0.08351719798657717, + "f1_stderr": 0.0019210059131140958, + "acc": 0.4730081804816138, + "acc_stderr": 0.010845627369096797 + } + }, + "versions": { + "all": 0, + "harness|drop|3": 1, + "harness|gsm8k|5": 0, + "harness|winogrande|5": 0 + }, + "config_tasks": { + "harness|drop": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|drop|3": { + "hashes": { + "hash_examples": "1d27416e8324e9a3", + "hash_full_prompts": "a5513ff9a741b385", + "hash_input_tokens": "a4fb946366902edf", + "hash_cont_tokens": "e2fa9b9f4656d431" + }, + "truncated": 0, + "non_truncated": 9536, + "padded": 0, + "non_padded": 9536, + "effective_few_shots": 3.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "6af0ae8cfe684f50", + "hash_cont_tokens": "a9f059a27edb85f7" + }, + "truncated": 0, + "non_truncated": 1319, + "padded": 0, + "non_padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "6bf335f26fed6442", + "hash_cont_tokens": "618558fb93c0f288" + }, + "truncated": 0, + "non_truncated": 1267, + "padded": 2397, + "non_padded": 137, + 
"effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "9b4d8993161e637d", + "hash_full_prompts": "08215e527b7e60a5", + "hash_input_tokens": "7c6dc027ca91c1a8", + "hash_cont_tokens": "0fe098ae81636ab9" + }, + "truncated": 0, + "non_truncated": 12122, + "padded": 2397, + "non_padded": 10992, + "num_truncated_few_shots": 0, + "total_evaluation_time_secondes": 0 + } +} \ No newline at end of file diff --git a/eval-results/Undi95/Mixtral-8x7B-MoE-RP-Story/results_2023-12-16T21-32-27.266201.json b/eval-results/Undi95/Mixtral-8x7B-MoE-RP-Story/results_2023-12-16T21-32-27.266201.json new file mode 100644 index 0000000000000000000000000000000000000000..c0996263f104b0f9af8b0e0d0112ada3bdd8a837 --- /dev/null +++ b/eval-results/Undi95/Mixtral-8x7B-MoE-RP-Story/results_2023-12-16T21-32-27.266201.json @@ -0,0 +1,1409 @@ +{ + "config_general": { + "lighteval_sha": "0e4607eff593f6f842aeaa0e5fa6760f58b9d1e9", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "", + "start_time": 369040.532453204, + "end_time": 393307.217228402, + "total_evaluation_time_secondes": "24266.684775197995", + "model_name": "Undi95/Mixtral-8x7B-MoE-RP-Story", + "model_sha": "ce4a4e4ffec063a3e338b6ebc328365270b6c5f0", + "model_dtype": "torch.bfloat16", + "model_size": "87.49 GB" + }, + "results": { + "harness|arc:challenge|25": { + "acc": 0.46501706484641636, + "acc_stderr": 0.014575583922019665, + "acc_norm": 0.515358361774744, + "acc_norm_stderr": 0.014604496129394904 + }, + "harness|hellaswag|10": { + "acc": 0.5017924716191994, + "acc_stderr": 0.004989749347461088, + "acc_norm": 0.6999601672973511, + "acc_norm_stderr": 0.004573383672159088 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.24, + "acc_stderr": 0.042923469599092816, + "acc_norm": 0.24, + "acc_norm_stderr": 0.042923469599092816 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.43703703703703706, + "acc_stderr": 0.04284958639753399, + "acc_norm": 0.43703703703703706, + "acc_norm_stderr": 0.04284958639753399 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.4934210526315789, + "acc_stderr": 0.040685900502249704, + "acc_norm": 0.4934210526315789, + "acc_norm_stderr": 0.040685900502249704 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.39, + "acc_stderr": 0.04902071300001974, + "acc_norm": 0.39, + "acc_norm_stderr": 0.04902071300001974 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.41509433962264153, + "acc_stderr": 0.030325945789286105, + "acc_norm": 0.41509433962264153, + "acc_norm_stderr": 0.030325945789286105 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.4513888888888889, + "acc_stderr": 0.041614023984032786, + "acc_norm": 0.4513888888888889, + "acc_norm_stderr": 0.041614023984032786 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.29, + "acc_stderr": 0.04560480215720684, + "acc_norm": 0.29, + "acc_norm_stderr": 0.04560480215720684 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.36, + "acc_stderr": 0.04824181513244218, + "acc_norm": 0.36, + "acc_norm_stderr": 0.04824181513244218 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.34, + "acc_stderr": 0.047609522856952344, + "acc_norm": 0.34, + "acc_norm_stderr": 0.047609522856952344 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.3988439306358382, + "acc_stderr": 0.037336266553835096, + "acc_norm": 0.3988439306358382, + 
"acc_norm_stderr": 0.037336266553835096 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.24509803921568626, + "acc_stderr": 0.042801058373643966, + "acc_norm": 0.24509803921568626, + "acc_norm_stderr": 0.042801058373643966 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.54, + "acc_stderr": 0.05009082659620333, + "acc_norm": 0.54, + "acc_norm_stderr": 0.05009082659620333 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.3574468085106383, + "acc_stderr": 0.03132941789476425, + "acc_norm": 0.3574468085106383, + "acc_norm_stderr": 0.03132941789476425 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.3508771929824561, + "acc_stderr": 0.044895393502707, + "acc_norm": 0.3508771929824561, + "acc_norm_stderr": 0.044895393502707 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.43448275862068964, + "acc_stderr": 0.04130740879555497, + "acc_norm": 0.43448275862068964, + "acc_norm_stderr": 0.04130740879555497 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.2777777777777778, + "acc_stderr": 0.023068188848261114, + "acc_norm": 0.2777777777777778, + "acc_norm_stderr": 0.023068188848261114 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.23809523809523808, + "acc_stderr": 0.03809523809523812, + "acc_norm": 0.23809523809523808, + "acc_norm_stderr": 0.03809523809523812 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.26, + "acc_stderr": 0.04408440022768078, + "acc_norm": 0.26, + "acc_norm_stderr": 0.04408440022768078 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.41935483870967744, + "acc_stderr": 0.028071588901091838, + "acc_norm": 0.41935483870967744, + "acc_norm_stderr": 0.028071588901091838 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.2660098522167488, + "acc_stderr": 0.03108982600293753, + "acc_norm": 0.2660098522167488, + "acc_norm_stderr": 0.03108982600293753 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.5, + "acc_stderr": 0.050251890762960605, + "acc_norm": 0.5, + "acc_norm_stderr": 0.050251890762960605 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.36363636363636365, + "acc_stderr": 0.03756335775187896, + "acc_norm": 0.36363636363636365, + "acc_norm_stderr": 0.03756335775187896 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.5151515151515151, + "acc_stderr": 0.03560716516531061, + "acc_norm": 0.5151515151515151, + "acc_norm_stderr": 0.03560716516531061 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.6424870466321243, + "acc_stderr": 0.034588160421810114, + "acc_norm": 0.6424870466321243, + "acc_norm_stderr": 0.034588160421810114 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.44358974358974357, + "acc_stderr": 0.025189149894764205, + "acc_norm": 0.44358974358974357, + "acc_norm_stderr": 0.025189149894764205 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.2740740740740741, + "acc_stderr": 0.027195934804085622, + "acc_norm": 0.2740740740740741, + "acc_norm_stderr": 0.027195934804085622 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.4327731092436975, + "acc_stderr": 0.03218358107742613, + "acc_norm": 0.4327731092436975, + "acc_norm_stderr": 0.03218358107742613 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.31788079470198677, + "acc_stderr": 0.03802039760107903, + "acc_norm": 0.31788079470198677, + "acc_norm_stderr": 
0.03802039760107903 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.5045871559633027, + "acc_stderr": 0.021436420955529414, + "acc_norm": 0.5045871559633027, + "acc_norm_stderr": 0.021436420955529414 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.3472222222222222, + "acc_stderr": 0.032468872436376486, + "acc_norm": 0.3472222222222222, + "acc_norm_stderr": 0.032468872436376486 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.4166666666666667, + "acc_stderr": 0.0346022832723917, + "acc_norm": 0.4166666666666667, + "acc_norm_stderr": 0.0346022832723917 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.46835443037974683, + "acc_stderr": 0.03248197400511075, + "acc_norm": 0.46835443037974683, + "acc_norm_stderr": 0.03248197400511075 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.45739910313901344, + "acc_stderr": 0.033435777055830646, + "acc_norm": 0.45739910313901344, + "acc_norm_stderr": 0.033435777055830646 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.5267175572519084, + "acc_stderr": 0.04379024936553893, + "acc_norm": 0.5267175572519084, + "acc_norm_stderr": 0.04379024936553893 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.6528925619834711, + "acc_stderr": 0.043457245702925335, + "acc_norm": 0.6528925619834711, + "acc_norm_stderr": 0.043457245702925335 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.42592592592592593, + "acc_stderr": 0.0478034362693679, + "acc_norm": 0.42592592592592593, + "acc_norm_stderr": 0.0478034362693679 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.4601226993865031, + "acc_stderr": 0.03915857291436972, + "acc_norm": 0.4601226993865031, + "acc_norm_stderr": 0.03915857291436972 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.4732142857142857, + "acc_stderr": 0.047389751192741546, + "acc_norm": 0.4732142857142857, + "acc_norm_stderr": 0.047389751192741546 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.5339805825242718, + "acc_stderr": 0.0493929144727348, + "acc_norm": 0.5339805825242718, + "acc_norm_stderr": 0.0493929144727348 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.5726495726495726, + "acc_stderr": 0.03240847393516327, + "acc_norm": 0.5726495726495726, + "acc_norm_stderr": 0.03240847393516327 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.43, + "acc_stderr": 0.04975698519562428, + "acc_norm": 0.43, + "acc_norm_stderr": 0.04975698519562428 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.5312899106002554, + "acc_stderr": 0.01784491809046855, + "acc_norm": 0.5312899106002554, + "acc_norm_stderr": 0.01784491809046855 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.43641618497109824, + "acc_stderr": 0.026700545424943687, + "acc_norm": 0.43641618497109824, + "acc_norm_stderr": 0.026700545424943687 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.28044692737430166, + "acc_stderr": 0.015024083883322893, + "acc_norm": 0.28044692737430166, + "acc_norm_stderr": 0.015024083883322893 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.45751633986928103, + "acc_stderr": 0.02852638345214263, + "acc_norm": 0.45751633986928103, + "acc_norm_stderr": 0.02852638345214263 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.4533762057877814, + "acc_stderr": 0.02827435985489424, + "acc_norm": 0.4533762057877814, + "acc_norm_stderr": 0.02827435985489424 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 
0.49691358024691357, + "acc_stderr": 0.027820214158594377, + "acc_norm": 0.49691358024691357, + "acc_norm_stderr": 0.027820214158594377 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.36879432624113473, + "acc_stderr": 0.028782227561347257, + "acc_norm": 0.36879432624113473, + "acc_norm_stderr": 0.028782227561347257 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.31290743155149936, + "acc_stderr": 0.011842529823062995, + "acc_norm": 0.31290743155149936, + "acc_norm_stderr": 0.011842529823062995 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.45588235294117646, + "acc_stderr": 0.030254372573976694, + "acc_norm": 0.45588235294117646, + "acc_norm_stderr": 0.030254372573976694 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.4117647058823529, + "acc_stderr": 0.01991037746310593, + "acc_norm": 0.4117647058823529, + "acc_norm_stderr": 0.01991037746310593 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.509090909090909, + "acc_stderr": 0.04788339768702861, + "acc_norm": 0.509090909090909, + "acc_norm_stderr": 0.04788339768702861 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.5918367346938775, + "acc_stderr": 0.03146465712827423, + "acc_norm": 0.5918367346938775, + "acc_norm_stderr": 0.03146465712827423 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.572139303482587, + "acc_stderr": 0.03498541988407795, + "acc_norm": 0.572139303482587, + "acc_norm_stderr": 0.03498541988407795 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.69, + "acc_stderr": 0.04648231987117316, + "acc_norm": 0.69, + "acc_norm_stderr": 0.04648231987117316 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.39156626506024095, + "acc_stderr": 0.03799857454479636, + "acc_norm": 0.39156626506024095, + "acc_norm_stderr": 0.03799857454479636 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.6081871345029239, + "acc_stderr": 0.03743979825926399, + "acc_norm": 0.6081871345029239, + "acc_norm_stderr": 0.03743979825926399 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.26438188494492043, + "mc1_stderr": 0.015438211119522514, + "mc2": 0.41531240642156975, + "mc2_stderr": 0.01492327563743382 + }, + "harness|winogrande|5": { + "acc": 0.6732438831886346, + "acc_stderr": 0.013181997302131366 + }, + "harness|gsm8k|5": { + "acc": 0.09931766489764973, + "acc_stderr": 0.008238371412683963 + }, + "all": { + "acc": 0.43068446823982826, + "acc_stderr": 0.03444996506735285, + "acc_norm": 0.43640169503643583, + "acc_norm_stderr": 0.03524813638857257, + "mc1": 0.26438188494492043, + "mc1_stderr": 0.015438211119522514, + "mc2": 0.41531240642156975, + "mc2_stderr": 0.01492327563743382 + } + }, + "versions": { + "all": 0, + "harness|arc:challenge|25": 0, + "harness|gsm8k|5": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + 
"harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "harness|winogrande|5": 0 + }, + "config_tasks": { + "harness|arc:challenge": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + 
"harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|arc:challenge|25": { + "hashes": { + "hash_examples": "17b0cae357c0259e", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "9bcd0d1d37471713", + "hash_cont_tokens": "289aa98c400841d8" + }, + "truncated": 0, + "non_truncated": 1172, + "padded": 4670, + "non_padded": 17, + "effective_few_shots": 25.0, + "num_truncated_few_shots": 0 + }, + "harness|hellaswag|10": { + "hashes": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "80b8c6d79740318e", + "hash_cont_tokens": "ac460260c3e6efc9" + }, + "truncated": 0, + "non_truncated": 10042, + "padded": 40101, + "non_padded": 67, + "effective_few_shots": 10.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hashes": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": 
"b813d36287c6556c", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-anatomy|5": { + "hashes": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "09dc2380497f7a47", + "hash_cont_tokens": "a52a4f60d98cbe5c" + }, + "truncated": 0, + "non_truncated": 135, + "padded": 540, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-astronomy|5": { + "hashes": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "68ca3220b0fdd1f3", + "hash_cont_tokens": "10f7d8eeba97841d" + }, + "truncated": 0, + "non_truncated": 152, + "padded": 608, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-business_ethics|5": { + "hashes": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "bd14ef1320de241e", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hashes": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "d96186ab98017c43", + "hash_cont_tokens": "edef9975ba9165b5" + }, + "truncated": 0, + "non_truncated": 265, + "padded": 1060, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_biology|5": { + "hashes": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "424136b34e95b200", + "hash_cont_tokens": "0aa103ec6602280b" + }, + "truncated": 0, + "non_truncated": 144, + "padded": 576, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_chemistry|5": { + "hashes": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "8dd8b80e336bbe54", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_computer_science|5": { + "hashes": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "145d4cef8ca2261d", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_mathematics|5": { + "hashes": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "561995d32d2b25c4", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_medicine|5": { + "hashes": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "6a258a9d4418599c", + "hash_cont_tokens": "1979021dbc698754" + }, + "truncated": 0, + "non_truncated": 173, + "padded": 692, + "non_padded": 0, + 
"effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_physics|5": { + "hashes": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "fa5e0d5b5f97b66a", + "hash_cont_tokens": "7cf7fe2bab00acbd" + }, + "truncated": 0, + "non_truncated": 102, + "padded": 408, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-computer_security|5": { + "hashes": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "07d27397edfae492", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hashes": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "da5e6c3c8eb17da6", + "hash_cont_tokens": "903f64eed2b0d217" + }, + "truncated": 0, + "non_truncated": 235, + "padded": 940, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-econometrics|5": { + "hashes": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "f6ba8e358bdb523e", + "hash_cont_tokens": "721ae6c5302c4bf2" + }, + "truncated": 0, + "non_truncated": 114, + "padded": 456, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hashes": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "b2459da4c5ca8590", + "hash_cont_tokens": "15a738960ed3e587" + }, + "truncated": 0, + "non_truncated": 145, + "padded": 575, + "non_padded": 5, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hashes": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "0b969d9ad706a13a", + "hash_cont_tokens": "c96470462fc71683" + }, + "truncated": 0, + "non_truncated": 378, + "padded": 1512, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-formal_logic|5": { + "hashes": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "02bc3eb5f90da86e", + "hash_cont_tokens": "0e1ce025c9d6ee7e" + }, + "truncated": 0, + "non_truncated": 126, + "padded": 504, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-global_facts|5": { + "hashes": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "3d5106918bcbeb43", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_biology|5": { + "hashes": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "7b089392db2dabbd", + "hash_cont_tokens": "e34d57f7d3c4ca16" + }, + "truncated": 0, + "non_truncated": 310, + "padded": 1240, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hashes": { + "hash_examples": 
"44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "ba90b2ffed1c067d", + "hash_cont_tokens": "e8482d44df4b3740" + }, + "truncated": 0, + "non_truncated": 203, + "padded": 812, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hashes": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "60eeec309ef0717f", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hashes": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "5e5e8bf3808e0ead", + "hash_cont_tokens": "d63e679a49418339" + }, + "truncated": 0, + "non_truncated": 165, + "padded": 656, + "non_padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_geography|5": { + "hashes": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "4da9b741d4e7ea78", + "hash_cont_tokens": "d78483e286d06f1a" + }, + "truncated": 0, + "non_truncated": 198, + "padded": 792, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hashes": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "acb4bc872ac86ed7", + "hash_cont_tokens": "691cdff71ff5fe57" + }, + "truncated": 0, + "non_truncated": 193, + "padded": 772, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hashes": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "840fc6403eb69ab0", + "hash_cont_tokens": "d5ad4c5bdca967ad" + }, + "truncated": 0, + "non_truncated": 390, + "padded": 1560, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hashes": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "3629a7f2cd17faeb", + "hash_cont_tokens": "8f631ca5687dd0d4" + }, + "truncated": 0, + "non_truncated": 270, + "padded": 1080, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hashes": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "6846f684260e3997", + "hash_cont_tokens": "7321048a28451473" + }, + "truncated": 0, + "non_truncated": 238, + "padded": 952, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_physics|5": { + "hashes": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "85aee25d6bdad94a", + "hash_cont_tokens": "bb137581f269861c" + }, + "truncated": 0, + "non_truncated": 151, + "padded": 604, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hashes": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + 
"hash_input_tokens": "290b66d6d666a35f", + "hash_cont_tokens": "b455cab2675bd863" + }, + "truncated": 0, + "non_truncated": 545, + "padded": 2180, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hashes": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "a77a7668b437bc82", + "hash_cont_tokens": "1b3196fec7e58037" + }, + "truncated": 0, + "non_truncated": 216, + "padded": 864, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hashes": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "63548c7fa9ba7a78", + "hash_cont_tokens": "a331dedc2aa01b3e" + }, + "truncated": 0, + "non_truncated": 204, + "padded": 816, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hashes": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "83c5da18bfa50812", + "hash_cont_tokens": "d0fbe030b8c8c2bf" + }, + "truncated": 0, + "non_truncated": 237, + "padded": 948, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_aging|5": { + "hashes": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "bebbd11f22006685", + "hash_cont_tokens": "1dd29c3755494850" + }, + "truncated": 0, + "non_truncated": 223, + "padded": 892, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_sexuality|5": { + "hashes": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "7b85ee9b8ee54f4f", + "hash_cont_tokens": "c85573f663c10691" + }, + "truncated": 0, + "non_truncated": 131, + "padded": 524, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-international_law|5": { + "hashes": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "7bfc55ab7065943e", + "hash_cont_tokens": "d263804ba918154f" + }, + "truncated": 0, + "non_truncated": 121, + "padded": 484, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-jurisprudence|5": { + "hashes": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "69573f1675e053c6", + "hash_cont_tokens": "581986691a84ece8" + }, + "truncated": 0, + "non_truncated": 108, + "padded": 432, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hashes": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "552324ef20094bdc", + "hash_cont_tokens": "55a858b28bbda458" + }, + "truncated": 0, + "non_truncated": 163, + "padded": 652, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-machine_learning|5": { + "hashes": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "96449357a7318905", + "hash_cont_tokens": "e99d3d3efd4ac7a3" + }, + "truncated": 0, + "non_truncated": 112, + "padded": 448, 
+ "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-management|5": { + "hashes": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "3b849249168e3b88", + "hash_cont_tokens": "13d9dc56bca34726" + }, + "truncated": 0, + "non_truncated": 103, + "padded": 412, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-marketing|5": { + "hashes": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "af0e186f2756b70d", + "hash_cont_tokens": "2700ea26933916a2" + }, + "truncated": 0, + "non_truncated": 234, + "padded": 936, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-medical_genetics|5": { + "hashes": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "9f6a6de16509b6d9", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-miscellaneous|5": { + "hashes": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "9194406d589f7c10", + "hash_cont_tokens": "7bf4341c79587250" + }, + "truncated": 0, + "non_truncated": 783, + "padded": 3132, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_disputes|5": { + "hashes": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "769486efc74d9f8e", + "hash_cont_tokens": "38a48e9de6976f00" + }, + "truncated": 0, + "non_truncated": 346, + "padded": 1384, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hashes": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "a90fd4dd90959dad", + "hash_cont_tokens": "761c4dc187689d89" + }, + "truncated": 0, + "non_truncated": 895, + "padded": 3580, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-nutrition|5": { + "hashes": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "1a3b843e66efd29b", + "hash_cont_tokens": "65005bd7d6f6012a" + }, + "truncated": 0, + "non_truncated": 306, + "padded": 1224, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-philosophy|5": { + "hashes": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "09820001a3d00013", + "hash_cont_tokens": "0b47934fb6314dec" + }, + "truncated": 0, + "non_truncated": 311, + "padded": 1244, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-prehistory|5": { + "hashes": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "7c4ec364ce2768c7", + "hash_cont_tokens": "3f20acd855ee0a29" + }, + "truncated": 0, + "non_truncated": 324, + "padded": 1296, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_accounting|5": { + "hashes": { + "hash_examples": "8d377597916cd07e", 
+ "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "ced0534574d0ae3f", + "hash_cont_tokens": "8f122ba881355d4b" + }, + "truncated": 0, + "non_truncated": 282, + "padded": 1128, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_law|5": { + "hashes": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "bcbdbbde22ec73e3", + "hash_cont_tokens": "90d5df417c4d3fd3" + }, + "truncated": 0, + "non_truncated": 1534, + "padded": 6136, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_medicine|5": { + "hashes": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "c54d753563114d45", + "hash_cont_tokens": "4a2d2988884f7f70" + }, + "truncated": 0, + "non_truncated": 272, + "padded": 1088, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_psychology|5": { + "hashes": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "b75dc55c0e32fa52", + "hash_cont_tokens": "e0a952cb8a9c81de" + }, + "truncated": 0, + "non_truncated": 612, + "padded": 2448, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-public_relations|5": { + "hashes": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "5ccdc8ec8db99622", + "hash_cont_tokens": "1fa77a8dff3922b8" + }, + "truncated": 0, + "non_truncated": 110, + "padded": 440, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-security_studies|5": { + "hashes": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "ca8497342e5b1d57", + "hash_cont_tokens": "81fc9cb3cbdd52db" + }, + "truncated": 0, + "non_truncated": 245, + "padded": 980, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-sociology|5": { + "hashes": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "069c76424fbd3dab", + "hash_cont_tokens": "2a0493252ed2cf43" + }, + "truncated": 0, + "non_truncated": 201, + "padded": 804, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hashes": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "a7e393a626169576", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-virology|5": { + "hashes": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "bf99dc973e3a650d", + "hash_cont_tokens": "5ab892d003b00c98" + }, + "truncated": 0, + "non_truncated": 166, + "padded": 664, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-world_religions|5": { + "hashes": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "1761cfaf21797065", + "hash_cont_tokens": "15a5e5dbdfbb8568" + }, + "truncated": 0, + 
"non_truncated": 171, + "padded": 684, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|truthfulqa:mc|0": { + "hashes": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "298b43914bbdf4ca", + "hash_cont_tokens": "5a8d4bb398b1c3c0" + }, + "truncated": 0, + "non_truncated": 817, + "padded": 9996, + "non_padded": 0, + "effective_few_shots": 0.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "31aa3477d959f771", + "hash_cont_tokens": "618558fb93c0f288" + }, + "truncated": 0, + "non_truncated": 1267, + "padded": 2534, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "6af0ae8cfe684f50", + "hash_cont_tokens": "2990d9f64ac40784" + }, + "truncated": 0, + "non_truncated": 1319, + "padded": 0, + "non_padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "3b7fa57a057f9415", + "hash_full_prompts": "63615fc50fc9417c", + "hash_input_tokens": "9c04e828ae29cacc", + "hash_cont_tokens": "23e173b318e92eba" + }, + "truncated": 0, + "non_truncated": 28659, + "padded": 113460, + "non_padded": 1412, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/Undi95/Nous-Hermes-13B-Code/results_2023-09-05T02-42-01.860222.json b/eval-results/Undi95/Nous-Hermes-13B-Code/results_2023-09-05T02-42-01.860222.json new file mode 100644 index 0000000000000000000000000000000000000000..0505b73b3f949129ab338322240ecbb4fb913b44 --- /dev/null +++ b/eval-results/Undi95/Nous-Hermes-13B-Code/results_2023-09-05T02-42-01.860222.json @@ -0,0 +1,1366 @@ +{ + "config_general": { + "model_name": "Undi95/Nous-Hermes-13B-Code", + "model_sha": "5a45cb2a6442581ce32cc19c561c49cec1db4ebb", + "model_dtype": "torch.float16", + "lighteval_sha": "9f7699e1a44b5b4d7bd4f326b57a34db83b67c3f", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|arc:challenge|25": { + "acc": 0.5793515358361775, + "acc_stderr": 0.014426211252508394, + "acc_norm": 0.6117747440273038, + "acc_norm_stderr": 0.014241614207414046 + }, + "harness|hellaswag|10": { + "acc": 0.6333399721171081, + "acc_stderr": 0.004809077205343495, + "acc_norm": 0.832105158334993, + "acc_norm_stderr": 0.0037300899105375787 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.32, + "acc_stderr": 0.046882617226215034, + "acc_norm": 0.32, + "acc_norm_stderr": 0.046882617226215034 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.5037037037037037, + "acc_stderr": 0.04319223625811331, + "acc_norm": 0.5037037037037037, + "acc_norm_stderr": 0.04319223625811331 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.5394736842105263, + "acc_stderr": 0.04056242252249033, + "acc_norm": 0.5394736842105263, + "acc_norm_stderr": 0.04056242252249033 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.59, + "acc_stderr": 0.049431107042371025, + "acc_norm": 0.59, + "acc_norm_stderr": 0.049431107042371025 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.5773584905660377, + "acc_stderr": 0.03040233144576954, + "acc_norm": 0.5773584905660377, + 
"acc_norm_stderr": 0.03040233144576954 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.6388888888888888, + "acc_stderr": 0.04016660030451233, + "acc_norm": 0.6388888888888888, + "acc_norm_stderr": 0.04016660030451233 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.46, + "acc_stderr": 0.05009082659620332, + "acc_norm": 0.46, + "acc_norm_stderr": 0.05009082659620332 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.47, + "acc_stderr": 0.05016135580465919, + "acc_norm": 0.47, + "acc_norm_stderr": 0.05016135580465919 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.31, + "acc_stderr": 0.04648231987117316, + "acc_norm": 0.31, + "acc_norm_stderr": 0.04648231987117316 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.5144508670520231, + "acc_stderr": 0.03810871630454764, + "acc_norm": 0.5144508670520231, + "acc_norm_stderr": 0.03810871630454764 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.27450980392156865, + "acc_stderr": 0.04440521906179328, + "acc_norm": 0.27450980392156865, + "acc_norm_stderr": 0.04440521906179328 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.65, + "acc_stderr": 0.0479372485441102, + "acc_norm": 0.65, + "acc_norm_stderr": 0.0479372485441102 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.46382978723404256, + "acc_stderr": 0.032600385118357715, + "acc_norm": 0.46382978723404256, + "acc_norm_stderr": 0.032600385118357715 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.3157894736842105, + "acc_stderr": 0.043727482902780064, + "acc_norm": 0.3157894736842105, + "acc_norm_stderr": 0.043727482902780064 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.47586206896551725, + "acc_stderr": 0.041618085035015295, + "acc_norm": 0.47586206896551725, + "acc_norm_stderr": 0.041618085035015295 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.335978835978836, + "acc_stderr": 0.024326310529149138, + "acc_norm": 0.335978835978836, + "acc_norm_stderr": 0.024326310529149138 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.38095238095238093, + "acc_stderr": 0.043435254289490965, + "acc_norm": 0.38095238095238093, + "acc_norm_stderr": 0.043435254289490965 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.36, + "acc_stderr": 0.048241815132442176, + "acc_norm": 0.36, + "acc_norm_stderr": 0.048241815132442176 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.6290322580645161, + "acc_stderr": 0.027480541887953586, + "acc_norm": 0.6290322580645161, + "acc_norm_stderr": 0.027480541887953586 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.43349753694581283, + "acc_stderr": 0.034867317274198714, + "acc_norm": 0.43349753694581283, + "acc_norm_stderr": 0.034867317274198714 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.58, + "acc_stderr": 0.049604496374885836, + "acc_norm": 0.58, + "acc_norm_stderr": 0.049604496374885836 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.6242424242424243, + "acc_stderr": 0.03781887353205982, + "acc_norm": 0.6242424242424243, + "acc_norm_stderr": 0.03781887353205982 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.6868686868686869, + "acc_stderr": 0.033042050878136525, + "acc_norm": 0.6868686868686869, + "acc_norm_stderr": 0.033042050878136525 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.7616580310880829, + 
"acc_stderr": 0.03074890536390989, + "acc_norm": 0.7616580310880829, + "acc_norm_stderr": 0.03074890536390989 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.4846153846153846, + "acc_stderr": 0.02533900301010651, + "acc_norm": 0.4846153846153846, + "acc_norm_stderr": 0.02533900301010651 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.32222222222222224, + "acc_stderr": 0.028493465091028604, + "acc_norm": 0.32222222222222224, + "acc_norm_stderr": 0.028493465091028604 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.5588235294117647, + "acc_stderr": 0.032252942323996406, + "acc_norm": 0.5588235294117647, + "acc_norm_stderr": 0.032252942323996406 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.4105960264900662, + "acc_stderr": 0.04016689594849928, + "acc_norm": 0.4105960264900662, + "acc_norm_stderr": 0.04016689594849928 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.7302752293577982, + "acc_stderr": 0.01902848671111544, + "acc_norm": 0.7302752293577982, + "acc_norm_stderr": 0.01902848671111544 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.47685185185185186, + "acc_stderr": 0.03406315360711507, + "acc_norm": 0.47685185185185186, + "acc_norm_stderr": 0.03406315360711507 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.7647058823529411, + "acc_stderr": 0.02977177522814563, + "acc_norm": 0.7647058823529411, + "acc_norm_stderr": 0.02977177522814563 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.7215189873417721, + "acc_stderr": 0.029178682304842538, + "acc_norm": 0.7215189873417721, + "acc_norm_stderr": 0.029178682304842538 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.6098654708520179, + "acc_stderr": 0.03273766725459157, + "acc_norm": 0.6098654708520179, + "acc_norm_stderr": 0.03273766725459157 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.5801526717557252, + "acc_stderr": 0.04328577215262971, + "acc_norm": 0.5801526717557252, + "acc_norm_stderr": 0.04328577215262971 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.7272727272727273, + "acc_stderr": 0.04065578140908706, + "acc_norm": 0.7272727272727273, + "acc_norm_stderr": 0.04065578140908706 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.7037037037037037, + "acc_stderr": 0.044143436668549335, + "acc_norm": 0.7037037037037037, + "acc_norm_stderr": 0.044143436668549335 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.6625766871165644, + "acc_stderr": 0.03714908409935574, + "acc_norm": 0.6625766871165644, + "acc_norm_stderr": 0.03714908409935574 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.3482142857142857, + "acc_stderr": 0.04521829902833586, + "acc_norm": 0.3482142857142857, + "acc_norm_stderr": 0.04521829902833586 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.7184466019417476, + "acc_stderr": 0.04453254836326468, + "acc_norm": 0.7184466019417476, + "acc_norm_stderr": 0.04453254836326468 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.7606837606837606, + "acc_stderr": 0.027951826808924333, + "acc_norm": 0.7606837606837606, + "acc_norm_stderr": 0.027951826808924333 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.57, + "acc_stderr": 0.049756985195624284, + "acc_norm": 0.57, + "acc_norm_stderr": 0.049756985195624284 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.7675606641123882, + "acc_stderr": 0.015104550008905718, + 
"acc_norm": 0.7675606641123882, + "acc_norm_stderr": 0.015104550008905718 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.5867052023121387, + "acc_stderr": 0.02651126136940924, + "acc_norm": 0.5867052023121387, + "acc_norm_stderr": 0.02651126136940924 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.376536312849162, + "acc_stderr": 0.0162046723851066, + "acc_norm": 0.376536312849162, + "acc_norm_stderr": 0.0162046723851066 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.5784313725490197, + "acc_stderr": 0.028275490156791455, + "acc_norm": 0.5784313725490197, + "acc_norm_stderr": 0.028275490156791455 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.617363344051447, + "acc_stderr": 0.02760468902858198, + "acc_norm": 0.617363344051447, + "acc_norm_stderr": 0.02760468902858198 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.6327160493827161, + "acc_stderr": 0.026822801759507894, + "acc_norm": 0.6327160493827161, + "acc_norm_stderr": 0.026822801759507894 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.40425531914893614, + "acc_stderr": 0.029275532159704725, + "acc_norm": 0.40425531914893614, + "acc_norm_stderr": 0.029275532159704725 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.4152542372881356, + "acc_stderr": 0.01258547179340066, + "acc_norm": 0.4152542372881356, + "acc_norm_stderr": 0.01258547179340066 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.5330882352941176, + "acc_stderr": 0.030306257722468317, + "acc_norm": 0.5330882352941176, + "acc_norm_stderr": 0.030306257722468317 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.5441176470588235, + "acc_stderr": 0.020148939420415745, + "acc_norm": 0.5441176470588235, + "acc_norm_stderr": 0.020148939420415745 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.5818181818181818, + "acc_stderr": 0.04724577405731572, + "acc_norm": 0.5818181818181818, + "acc_norm_stderr": 0.04724577405731572 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.5836734693877551, + "acc_stderr": 0.031557828165561644, + "acc_norm": 0.5836734693877551, + "acc_norm_stderr": 0.031557828165561644 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.7512437810945274, + "acc_stderr": 0.030567675938916718, + "acc_norm": 0.7512437810945274, + "acc_norm_stderr": 0.030567675938916718 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.8, + "acc_stderr": 0.04020151261036846, + "acc_norm": 0.8, + "acc_norm_stderr": 0.04020151261036846 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.43373493975903615, + "acc_stderr": 0.03858158940685516, + "acc_norm": 0.43373493975903615, + "acc_norm_stderr": 0.03858158940685516 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.7719298245614035, + "acc_stderr": 0.03218093795602357, + "acc_norm": 0.7719298245614035, + "acc_norm_stderr": 0.03218093795602357 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.35006119951040393, + "mc1_stderr": 0.016697949420151032, + "mc2": 0.505550629963065, + "mc2_stderr": 0.01590974900800537 + }, + "all": { + "acc": 0.5531820683673332, + "acc_stderr": 0.03477068808309721, + "acc_norm": 0.5571005157301979, + "acc_norm_stderr": 0.034749271399370084, + "mc1": 0.35006119951040393, + "mc1_stderr": 0.016697949420151032, + "mc2": 0.505550629963065, + "mc2_stderr": 0.01590974900800537 + } + }, + "versions": { + "harness|arc:challenge|25": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + 
"harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "all": 0 + }, + "config_tasks": { + "harness|arc:challenge": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + 
"harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task" + }, + "summary_tasks": { + "harness|arc:challenge|25": { + "hashes": { + "hash_examples": "17b0cae357c0259e", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "3722289b79076c44", + "hash_cont_tokens": "8210decc6ff6f7df" + }, + "truncated": 0, + "non-truncated": 4687, + "padded": 4687, + "non-padded": 0, + "effective_few_shots": 25.0, + "num_truncated_few_shots": 0 + }, + 
"harness|hellaswag|10": { + "hashes": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "ececd684171f1ef2", + "hash_cont_tokens": "b3b9e9017afa63af" + }, + "truncated": 0, + "non-truncated": 40168, + "padded": 40113, + "non-padded": 55, + "effective_few_shots": 10.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hashes": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "c54ff61ad0273dd7", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-anatomy|5": { + "hashes": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "be31a1e22aef5f90", + "hash_cont_tokens": "f11971a765cb609f" + }, + "truncated": 0, + "non-truncated": 540, + "padded": 540, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-astronomy|5": { + "hashes": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "277a7b1fad566940", + "hash_cont_tokens": "bf30e5d3f48250cb" + }, + "truncated": 0, + "non-truncated": 608, + "padded": 608, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-business_ethics|5": { + "hashes": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "ba552605bc116de5", + "hash_cont_tokens": "bc1dd9b2d995eb61" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hashes": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "428c7563d0b98ab9", + "hash_cont_tokens": "890a119624b3b935" + }, + "truncated": 0, + "non-truncated": 1060, + "padded": 1060, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_biology|5": { + "hashes": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "da036601573942e2", + "hash_cont_tokens": "875cde3af7a0ee14" + }, + "truncated": 0, + "non-truncated": 576, + "padded": 576, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_chemistry|5": { + "hashes": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "94e0196d6aded13d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_computer_science|5": { + "hashes": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "6e4d0f4a8d36690b", + "hash_cont_tokens": "ffc0fe414cdc4a83" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_mathematics|5": { + "hashes": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": 
"614054d17109a25d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_medicine|5": { + "hashes": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "081bb2b524defd1c", + "hash_cont_tokens": "1f88b00d41957d82" + }, + "truncated": 0, + "non-truncated": 692, + "padded": 692, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_physics|5": { + "hashes": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "5421d9a1af86cbd4", + "hash_cont_tokens": "f7b8097afc16a47c" + }, + "truncated": 0, + "non-truncated": 408, + "padded": 408, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-computer_security|5": { + "hashes": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "5e6b70ecb333cf18", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hashes": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "c2ef11a87264ceed", + "hash_cont_tokens": "aa0e8bc655f2f641" + }, + "truncated": 0, + "non-truncated": 940, + "padded": 940, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-econometrics|5": { + "hashes": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "ecaccd912a4c3978", + "hash_cont_tokens": "bfb7e3c3c88313f1" + }, + "truncated": 0, + "non-truncated": 456, + "padded": 456, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hashes": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "1590c84291399be8", + "hash_cont_tokens": "2425a3f084a591ef" + }, + "truncated": 0, + "non-truncated": 580, + "padded": 580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hashes": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "3269597f715b0da1", + "hash_cont_tokens": "f52691aef15a407b" + }, + "truncated": 0, + "non-truncated": 1512, + "padded": 1512, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-formal_logic|5": { + "hashes": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "a2800d20f3ab8d7c", + "hash_cont_tokens": "f515d598d9c21263" + }, + "truncated": 0, + "non-truncated": 504, + "padded": 504, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-global_facts|5": { + "hashes": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "94ed44b3772505ad", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + 
"effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_biology|5": { + "hashes": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "24423acb928db768", + "hash_cont_tokens": "bd85a4156a3613ee" + }, + "truncated": 0, + "non-truncated": 1240, + "padded": 1240, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hashes": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "831ff35c474e5cef", + "hash_cont_tokens": "a95c97af1c14e068" + }, + "truncated": 0, + "non-truncated": 812, + "padded": 812, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hashes": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "a20a96b44dcc5b30", + "hash_cont_tokens": "8abfedef914e33c9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hashes": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "5002f4ac8b1562ca", + "hash_cont_tokens": "674fc454bdc5ac93" + }, + "truncated": 0, + "non-truncated": 660, + "padded": 656, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_geography|5": { + "hashes": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "7c5547c7da5bc793", + "hash_cont_tokens": "03a5012b916274ea" + }, + "truncated": 0, + "non-truncated": 792, + "padded": 792, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hashes": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "f62991cb6a496b05", + "hash_cont_tokens": "a83effb8f76b7d7c" + }, + "truncated": 0, + "non-truncated": 772, + "padded": 772, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hashes": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "4cef2aff6e3d59ed", + "hash_cont_tokens": "c583432ad27fcfe0" + }, + "truncated": 0, + "non-truncated": 1560, + "padded": 1560, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hashes": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "6e2577ea4082ed2b", + "hash_cont_tokens": "24f5dc613660300b" + }, + "truncated": 0, + "non-truncated": 1080, + "padded": 1080, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hashes": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "c5fc9aeb1079c8e4", + "hash_cont_tokens": "f47f041de50333b9" + }, + "truncated": 0, + "non-truncated": 952, + "padded": 952, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + 
"harness|hendrycksTest-high_school_physics|5": { + "hashes": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "555fc385cffa84ca", + "hash_cont_tokens": "ba2efcd283e938cc" + }, + "truncated": 0, + "non-truncated": 604, + "padded": 604, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hashes": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "febd23cbf9973b7f", + "hash_cont_tokens": "942069cd363844d9" + }, + "truncated": 0, + "non-truncated": 2180, + "padded": 2180, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hashes": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "400e55b56ee6fbd7", + "hash_cont_tokens": "955ed42b6f7fa019" + }, + "truncated": 0, + "non-truncated": 864, + "padded": 864, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hashes": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "c639cce12a46ebad", + "hash_cont_tokens": "cdd0b3dc06d933e5" + }, + "truncated": 0, + "non-truncated": 816, + "padded": 816, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hashes": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "b9762065cce6f3a6", + "hash_cont_tokens": "9a864184946033ac" + }, + "truncated": 0, + "non-truncated": 948, + "padded": 948, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_aging|5": { + "hashes": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "541a75f071dcf579", + "hash_cont_tokens": "142a4a8a1138a214" + }, + "truncated": 0, + "non-truncated": 892, + "padded": 892, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_sexuality|5": { + "hashes": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "04269e5c5a257dd9", + "hash_cont_tokens": "bc54813e809b796d" + }, + "truncated": 0, + "non-truncated": 524, + "padded": 524, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-international_law|5": { + "hashes": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "d93ba9d9d38e4397", + "hash_cont_tokens": "dc45b45fcda18e5d" + }, + "truncated": 0, + "non-truncated": 484, + "padded": 484, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-jurisprudence|5": { + "hashes": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "9eeaccd2698b4f5a", + "hash_cont_tokens": "e3a8cd951b6e3469" + }, + "truncated": 0, + "non-truncated": 432, + "padded": 432, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hashes": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": 
"98a04b1f8f841069", + "hash_input_tokens": "b4f08f544f2b7576", + "hash_cont_tokens": "1e80dbd30f6453d5" + }, + "truncated": 0, + "non-truncated": 652, + "padded": 648, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-machine_learning|5": { + "hashes": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "900c2a51f1174b9f", + "hash_cont_tokens": "9b37da7777378ca9" + }, + "truncated": 0, + "non-truncated": 448, + "padded": 448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-management|5": { + "hashes": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "6b36efb4689c6eca", + "hash_cont_tokens": "a01d6d39a83c4597" + }, + "truncated": 0, + "non-truncated": 412, + "padded": 412, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-marketing|5": { + "hashes": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "2aaac78a0cfed47a", + "hash_cont_tokens": "6aeaed4d823c98aa" + }, + "truncated": 0, + "non-truncated": 936, + "padded": 936, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-medical_genetics|5": { + "hashes": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "886ca823b41c094a", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-miscellaneous|5": { + "hashes": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "72fd71de7675e7d0", + "hash_cont_tokens": "9b0ab02a64603081" + }, + "truncated": 0, + "non-truncated": 3132, + "padded": 3132, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_disputes|5": { + "hashes": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "f3ca0dd8e7a1eb09", + "hash_cont_tokens": "8badf768f7b0467a" + }, + "truncated": 0, + "non-truncated": 1384, + "padded": 1354, + "non-padded": 30, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hashes": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "3e793631e951f23c", + "hash_cont_tokens": "32ae620376b2bbba" + }, + "truncated": 0, + "non-truncated": 3580, + "padded": 3580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-nutrition|5": { + "hashes": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "59753c2144ea93af", + "hash_cont_tokens": "3071def75bacc404" + }, + "truncated": 0, + "non-truncated": 1224, + "padded": 1224, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-philosophy|5": { + "hashes": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "bd8d3dbed15a8c34", + "hash_cont_tokens": "9f6ff69d23a48783" + }, + "truncated": 0, + "non-truncated": 1244, + "padded": 1244, + 
"non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-prehistory|5": { + "hashes": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "3573cd87facbb7c5", + "hash_cont_tokens": "de469d2b981e32a3" + }, + "truncated": 0, + "non-truncated": 1296, + "padded": 1296, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_accounting|5": { + "hashes": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "17e721bc1a7cbb47", + "hash_cont_tokens": "c46f74d2dfc7b13b" + }, + "truncated": 0, + "non-truncated": 1128, + "padded": 1128, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_law|5": { + "hashes": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "c9f7583fff66d361", + "hash_cont_tokens": "2e590029ef41fbcd" + }, + "truncated": 0, + "non-truncated": 6136, + "padded": 6136, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_medicine|5": { + "hashes": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "40a933f829116f8d", + "hash_cont_tokens": "fe35cfa9c6ca802e" + }, + "truncated": 0, + "non-truncated": 1088, + "padded": 1088, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_psychology|5": { + "hashes": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "0dfb73a8eb3f692c", + "hash_cont_tokens": "f020fbddf72c8652" + }, + "truncated": 0, + "non-truncated": 2448, + "padded": 2448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-public_relations|5": { + "hashes": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "1710c6ba4c9f3cbd", + "hash_cont_tokens": "568f585a259965c1" + }, + "truncated": 0, + "non-truncated": 440, + "padded": 440, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-security_studies|5": { + "hashes": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "32a03f1f22a6e103", + "hash_cont_tokens": "cc6fd7cccd64cd5d" + }, + "truncated": 0, + "non-truncated": 980, + "padded": 980, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-sociology|5": { + "hashes": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "828999f7624cbe7e", + "hash_cont_tokens": "c3a3bdfd177eed5b" + }, + "truncated": 0, + "non-truncated": 804, + "padded": 804, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hashes": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "42054621e718dbee", + "hash_cont_tokens": "2568d0e8e36fa959" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-virology|5": { + "hashes": { + 
"hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "6c4f0aa4dc859c04", + "hash_cont_tokens": "926cf60b0891f374" + }, + "truncated": 0, + "non-truncated": 664, + "padded": 664, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-world_religions|5": { + "hashes": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "6c75d44e092ff24f", + "hash_cont_tokens": "c525a5de974c1ea3" + }, + "truncated": 0, + "non-truncated": 684, + "padded": 684, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|truthfulqa:mc|0": { + "hashes": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "2738d7ed7075faa7", + "hash_cont_tokens": "c014154380b74b9e" + }, + "truncated": 0, + "non-truncated": 9996, + "padded": 9996, + "non-padded": 0, + "effective_few_shots": 0.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "d84d18e9a963753d", + "hash_full_prompts": "12b540783521a8e6", + "hash_input_tokens": "5c73a7dce6ccf737", + "hash_cont_tokens": "fb1646e2bdd5fc38" + }, + "total_evaluation_time_secondes": "6337.8339104652405", + "truncated": 0, + "non-truncated": 111019, + "padded": 110926, + "non-padded": 93, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/Undi95/Nous-Hermes-13B-Code/results_2023-10-17T01-46-49.269980.json b/eval-results/Undi95/Nous-Hermes-13B-Code/results_2023-10-17T01-46-49.269980.json new file mode 100644 index 0000000000000000000000000000000000000000..286e05d614023748628841ef452eaa78fc76d7d2 --- /dev/null +++ b/eval-results/Undi95/Nous-Hermes-13B-Code/results_2023-10-17T01-46-49.269980.json @@ -0,0 +1,107 @@ +{ + "config_general": { + "model_name": "Undi95/Nous-Hermes-13B-Code", + "model_sha": "5952f55603553777996ca7fd30736c512f4f0e65", + "model_size": "24.32 GB", + "model_dtype": "torch.float16", + "lighteval_sha": "0f318ecf002208468154899217b3ba7c6ae09374", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|drop|3": { + "em": 0.19043624161073824, + "em_stderr": 0.004021054701391535, + "f1": 0.28277894295302086, + "f1_stderr": 0.004086388636430754 + }, + "harness|gsm8k|5": { + "acc": 0.10386656557998483, + "acc_stderr": 0.008403622228924035 + }, + "harness|winogrande|5": { + "acc": 0.7513812154696132, + "acc_stderr": 0.012147314713403108 + }, + "all": { + "em": 0.19043624161073824, + "em_stderr": 0.004021054701391535, + "f1": 0.28277894295302086, + "f1_stderr": 0.004086388636430754, + "acc": 0.42762389052479904, + "acc_stderr": 0.010275468471163573 + } + }, + "versions": { + "harness|drop|3": 1, + "harness|gsm8k|5": 0, + "harness|winogrande|5": 0, + "all": 0 + }, + "config_tasks": { + "harness|drop": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|drop|3": { + "hashes": { + "hash_examples": "1d27416e8324e9a3", + "hash_full_prompts": "a5513ff9a741b385", + "hash_input_tokens": "42076f0efbb50aa6", + "hash_cont_tokens": "cb6b750e72c8437a" + }, + "truncated": 3, + "non-truncated": 9533, + "padded": 0, + "non-padded": 9536, + "effective_few_shots": 3.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + 
"hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "bda342e47b5099b2", + "hash_cont_tokens": "953fd315056b093a" + }, + "truncated": 0, + "non-truncated": 1319, + "padded": 0, + "non-padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "c0bedf98cb040854", + "hash_cont_tokens": "f08975ad6f2d5864" + }, + "truncated": 0, + "non-truncated": 2534, + "padded": 2432, + "non-padded": 102, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "9b4d8993161e637d", + "hash_full_prompts": "08215e527b7e60a5", + "hash_input_tokens": "a12f3e3c934bd78b", + "hash_cont_tokens": "c911c25fcad181f3" + }, + "total_evaluation_time_secondes": "11097.49946975708", + "truncated": 3, + "non-truncated": 13386, + "padded": 2432, + "non-padded": 10957, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/Undi95/OpenRP-13B/results_2023-09-18T13-48-59.614981.json b/eval-results/Undi95/OpenRP-13B/results_2023-09-18T13-48-59.614981.json new file mode 100644 index 0000000000000000000000000000000000000000..3181efaf214df82a4bd36df026df68b5acec9564 --- /dev/null +++ b/eval-results/Undi95/OpenRP-13B/results_2023-09-18T13-48-59.614981.json @@ -0,0 +1,1367 @@ +{ + "config_general": { + "model_name": "Undi95/OpenRP-13B", + "model_sha": "d11815287c51ef51485fb003f8f72773cf6f19a4", + "model_size": "24.32 GB", + "model_dtype": "torch.float16", + "lighteval_sha": "0f318ecf002208468154899217b3ba7c6ae09374", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|arc:challenge|25": { + "acc": 0.5878839590443686, + "acc_stderr": 0.0143839153022254, + "acc_norm": 0.621160409556314, + "acc_norm_stderr": 0.014175915490000322 + }, + "harness|hellaswag|10": { + "acc": 0.6258713403704441, + "acc_stderr": 0.004829081532826502, + "acc_norm": 0.8260306711810397, + "acc_norm_stderr": 0.0037830836739860575 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.4, + "acc_stderr": 0.04923659639173309, + "acc_norm": 0.4, + "acc_norm_stderr": 0.04923659639173309 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.4740740740740741, + "acc_stderr": 0.04313531696750574, + "acc_norm": 0.4740740740740741, + "acc_norm_stderr": 0.04313531696750574 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.5723684210526315, + "acc_stderr": 0.04026097083296563, + "acc_norm": 0.5723684210526315, + "acc_norm_stderr": 0.04026097083296563 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.56, + "acc_stderr": 0.0498887651569859, + "acc_norm": 0.56, + "acc_norm_stderr": 0.0498887651569859 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.6264150943396226, + "acc_stderr": 0.029773082713319875, + "acc_norm": 0.6264150943396226, + "acc_norm_stderr": 0.029773082713319875 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.6041666666666666, + "acc_stderr": 0.04089465449325582, + "acc_norm": 0.6041666666666666, + "acc_norm_stderr": 0.04089465449325582 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.41, + "acc_stderr": 0.04943110704237101, + "acc_norm": 0.41, + "acc_norm_stderr": 0.04943110704237101 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.45, + "acc_stderr": 0.049999999999999996, + "acc_norm": 
0.45, + "acc_norm_stderr": 0.049999999999999996 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.38, + "acc_stderr": 0.048783173121456316, + "acc_norm": 0.38, + "acc_norm_stderr": 0.048783173121456316 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.5491329479768786, + "acc_stderr": 0.037940126746970296, + "acc_norm": 0.5491329479768786, + "acc_norm_stderr": 0.037940126746970296 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.29411764705882354, + "acc_stderr": 0.04533838195929777, + "acc_norm": 0.29411764705882354, + "acc_norm_stderr": 0.04533838195929777 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.7, + "acc_stderr": 0.046056618647183814, + "acc_norm": 0.7, + "acc_norm_stderr": 0.046056618647183814 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.46808510638297873, + "acc_stderr": 0.03261936918467381, + "acc_norm": 0.46808510638297873, + "acc_norm_stderr": 0.03261936918467381 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.2982456140350877, + "acc_stderr": 0.04303684033537315, + "acc_norm": 0.2982456140350877, + "acc_norm_stderr": 0.04303684033537315 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.5379310344827586, + "acc_stderr": 0.04154659671707548, + "acc_norm": 0.5379310344827586, + "acc_norm_stderr": 0.04154659671707548 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.3306878306878307, + "acc_stderr": 0.024229965298425082, + "acc_norm": 0.3306878306878307, + "acc_norm_stderr": 0.024229965298425082 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.35714285714285715, + "acc_stderr": 0.042857142857142816, + "acc_norm": 0.35714285714285715, + "acc_norm_stderr": 0.042857142857142816 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.35, + "acc_stderr": 0.0479372485441102, + "acc_norm": 0.35, + "acc_norm_stderr": 0.0479372485441102 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.6838709677419355, + "acc_stderr": 0.026450874489042774, + "acc_norm": 0.6838709677419355, + "acc_norm_stderr": 0.026450874489042774 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.47783251231527096, + "acc_stderr": 0.03514528562175008, + "acc_norm": 0.47783251231527096, + "acc_norm_stderr": 0.03514528562175008 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.58, + "acc_stderr": 0.049604496374885836, + "acc_norm": 0.58, + "acc_norm_stderr": 0.049604496374885836 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.6727272727272727, + "acc_stderr": 0.03663974994391244, + "acc_norm": 0.6727272727272727, + "acc_norm_stderr": 0.03663974994391244 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.7272727272727273, + "acc_stderr": 0.03173071239071724, + "acc_norm": 0.7272727272727273, + "acc_norm_stderr": 0.03173071239071724 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.8186528497409327, + "acc_stderr": 0.02780703236068609, + "acc_norm": 0.8186528497409327, + "acc_norm_stderr": 0.02780703236068609 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.5205128205128206, + "acc_stderr": 0.02532966316348994, + "acc_norm": 0.5205128205128206, + "acc_norm_stderr": 0.02532966316348994 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.3, + "acc_stderr": 0.027940457136228416, + "acc_norm": 0.3, + "acc_norm_stderr": 0.027940457136228416 + }, + 
"harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.6092436974789915, + "acc_stderr": 0.03169380235712996, + "acc_norm": 0.6092436974789915, + "acc_norm_stderr": 0.03169380235712996 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.3509933774834437, + "acc_stderr": 0.03896981964257375, + "acc_norm": 0.3509933774834437, + "acc_norm_stderr": 0.03896981964257375 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.7651376146788991, + "acc_stderr": 0.01817511051034356, + "acc_norm": 0.7651376146788991, + "acc_norm_stderr": 0.01817511051034356 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.4166666666666667, + "acc_stderr": 0.03362277436608044, + "acc_norm": 0.4166666666666667, + "acc_norm_stderr": 0.03362277436608044 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.7745098039215687, + "acc_stderr": 0.029331162294251735, + "acc_norm": 0.7745098039215687, + "acc_norm_stderr": 0.029331162294251735 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.7510548523206751, + "acc_stderr": 0.028146970599422644, + "acc_norm": 0.7510548523206751, + "acc_norm_stderr": 0.028146970599422644 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.6905829596412556, + "acc_stderr": 0.03102441174057221, + "acc_norm": 0.6905829596412556, + "acc_norm_stderr": 0.03102441174057221 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.6335877862595419, + "acc_stderr": 0.042258754519696365, + "acc_norm": 0.6335877862595419, + "acc_norm_stderr": 0.042258754519696365 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.743801652892562, + "acc_stderr": 0.039849796533028725, + "acc_norm": 0.743801652892562, + "acc_norm_stderr": 0.039849796533028725 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.7685185185185185, + "acc_stderr": 0.04077494709252626, + "acc_norm": 0.7685185185185185, + "acc_norm_stderr": 0.04077494709252626 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.6625766871165644, + "acc_stderr": 0.03714908409935574, + "acc_norm": 0.6625766871165644, + "acc_norm_stderr": 0.03714908409935574 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.32142857142857145, + "acc_stderr": 0.04432804055291518, + "acc_norm": 0.32142857142857145, + "acc_norm_stderr": 0.04432804055291518 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.7378640776699029, + "acc_stderr": 0.04354631077260594, + "acc_norm": 0.7378640776699029, + "acc_norm_stderr": 0.04354631077260594 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.782051282051282, + "acc_stderr": 0.02704685763071669, + "acc_norm": 0.782051282051282, + "acc_norm_stderr": 0.02704685763071669 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.62, + "acc_stderr": 0.04878317312145632, + "acc_norm": 0.62, + "acc_norm_stderr": 0.04878317312145632 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.7701149425287356, + "acc_stderr": 0.015046301846691805, + "acc_norm": 0.7701149425287356, + "acc_norm_stderr": 0.015046301846691805 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.6473988439306358, + "acc_stderr": 0.025722802200895803, + "acc_norm": 0.6473988439306358, + "acc_norm_stderr": 0.025722802200895803 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.47262569832402235, + "acc_stderr": 0.016697420650642752, + "acc_norm": 0.47262569832402235, + "acc_norm_stderr": 0.016697420650642752 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 
0.6372549019607843, + "acc_stderr": 0.027530078447110303, + "acc_norm": 0.6372549019607843, + "acc_norm_stderr": 0.027530078447110303 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.6527331189710611, + "acc_stderr": 0.027040745502307336, + "acc_norm": 0.6527331189710611, + "acc_norm_stderr": 0.027040745502307336 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.654320987654321, + "acc_stderr": 0.02646248777700187, + "acc_norm": 0.654320987654321, + "acc_norm_stderr": 0.02646248777700187 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.450354609929078, + "acc_stderr": 0.02968010556502904, + "acc_norm": 0.450354609929078, + "acc_norm_stderr": 0.02968010556502904 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.42894393741851367, + "acc_stderr": 0.012640625443067358, + "acc_norm": 0.42894393741851367, + "acc_norm_stderr": 0.012640625443067358 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.5441176470588235, + "acc_stderr": 0.030254372573976722, + "acc_norm": 0.5441176470588235, + "acc_norm_stderr": 0.030254372573976722 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.576797385620915, + "acc_stderr": 0.019987809769482064, + "acc_norm": 0.576797385620915, + "acc_norm_stderr": 0.019987809769482064 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.6727272727272727, + "acc_stderr": 0.0449429086625209, + "acc_norm": 0.6727272727272727, + "acc_norm_stderr": 0.0449429086625209 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.6530612244897959, + "acc_stderr": 0.030472526026726496, + "acc_norm": 0.6530612244897959, + "acc_norm_stderr": 0.030472526026726496 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.7761194029850746, + "acc_stderr": 0.029475250236017193, + "acc_norm": 0.7761194029850746, + "acc_norm_stderr": 0.029475250236017193 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.85, + "acc_stderr": 0.0358870281282637, + "acc_norm": 0.85, + "acc_norm_stderr": 0.0358870281282637 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.45180722891566266, + "acc_stderr": 0.03874371556587953, + "acc_norm": 0.45180722891566266, + "acc_norm_stderr": 0.03874371556587953 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.7660818713450293, + "acc_stderr": 0.03246721765117826, + "acc_norm": 0.7660818713450293, + "acc_norm_stderr": 0.03246721765117826 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.3378212974296206, + "mc1_stderr": 0.016557167322516875, + "mc2": 0.48286539138692774, + "mc2_stderr": 0.015189076635393605 + }, + "all": { + "acc": 0.5760926841629516, + "acc_stderr": 0.03424711249500131, + "acc_norm": 0.5800492228294355, + "acc_norm_stderr": 0.03422585829718664, + "mc1": 0.3378212974296206, + "mc1_stderr": 0.016557167322516875, + "mc2": 0.48286539138692774, + "mc2_stderr": 0.015189076635393605 + } + }, + "versions": { + "harness|arc:challenge|25": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + 
"harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "all": 0 + }, + "config_tasks": { + "harness|arc:challenge": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness 
task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task" + }, + "summary_tasks": { + "harness|arc:challenge|25": { + "hashes": { + "hash_examples": "17b0cae357c0259e", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "3722289b79076c44", + "hash_cont_tokens": "e8abf848493b50f7" + }, + "truncated": 0, + "non-truncated": 4687, + "padded": 4687, + "non-padded": 0, + "effective_few_shots": 25.0, + "num_truncated_few_shots": 0 + }, + "harness|hellaswag|10": { + "hashes": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "ececd684171f1ef2", + "hash_cont_tokens": "9fe0a5c42e1532db" + }, + "truncated": 0, + "non-truncated": 40168, + "padded": 40113, + "non-padded": 55, + "effective_few_shots": 10.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hashes": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + 
"hash_input_tokens": "c54ff61ad0273dd7", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-anatomy|5": { + "hashes": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "be31a1e22aef5f90", + "hash_cont_tokens": "f11971a765cb609f" + }, + "truncated": 0, + "non-truncated": 540, + "padded": 540, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-astronomy|5": { + "hashes": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "277a7b1fad566940", + "hash_cont_tokens": "440a970fadecdc7b" + }, + "truncated": 0, + "non-truncated": 608, + "padded": 608, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-business_ethics|5": { + "hashes": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "ba552605bc116de5", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hashes": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "428c7563d0b98ab9", + "hash_cont_tokens": "7ecd60c25b9bfe5b" + }, + "truncated": 0, + "non-truncated": 1060, + "padded": 1060, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_biology|5": { + "hashes": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "da036601573942e2", + "hash_cont_tokens": "875cde3af7a0ee14" + }, + "truncated": 0, + "non-truncated": 576, + "padded": 576, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_chemistry|5": { + "hashes": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "94e0196d6aded13d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_computer_science|5": { + "hashes": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "6e4d0f4a8d36690b", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_mathematics|5": { + "hashes": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "614054d17109a25d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_medicine|5": { + "hashes": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "081bb2b524defd1c", + "hash_cont_tokens": "702fb6d82ff0d6ac" + }, + "truncated": 0, + "non-truncated": 692, + "padded": 692, + "non-padded": 
0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_physics|5": { + "hashes": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "5421d9a1af86cbd4", + "hash_cont_tokens": "f7b8097afc16a47c" + }, + "truncated": 0, + "non-truncated": 408, + "padded": 408, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-computer_security|5": { + "hashes": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "5e6b70ecb333cf18", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hashes": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "c2ef11a87264ceed", + "hash_cont_tokens": "aa0e8bc655f2f641" + }, + "truncated": 0, + "non-truncated": 940, + "padded": 940, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-econometrics|5": { + "hashes": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "ecaccd912a4c3978", + "hash_cont_tokens": "b1cc6e7e9fcd3827" + }, + "truncated": 0, + "non-truncated": 456, + "padded": 456, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hashes": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "1590c84291399be8", + "hash_cont_tokens": "2425a3f084a591ef" + }, + "truncated": 0, + "non-truncated": 580, + "padded": 580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hashes": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "3269597f715b0da1", + "hash_cont_tokens": "bd87bf0c060fd925" + }, + "truncated": 0, + "non-truncated": 1512, + "padded": 1512, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-formal_logic|5": { + "hashes": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "a2800d20f3ab8d7c", + "hash_cont_tokens": "eb8932890e0605db" + }, + "truncated": 0, + "non-truncated": 504, + "padded": 504, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-global_facts|5": { + "hashes": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "94ed44b3772505ad", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_biology|5": { + "hashes": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "24423acb928db768", + "hash_cont_tokens": "1ddcb86d28cde266" + }, + "truncated": 0, + "non-truncated": 1240, + "padded": 1240, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hashes": { + 
"hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "831ff35c474e5cef", + "hash_cont_tokens": "176c8dcff38c5f8f" + }, + "truncated": 0, + "non-truncated": 812, + "padded": 812, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hashes": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "a20a96b44dcc5b30", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hashes": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "5002f4ac8b1562ca", + "hash_cont_tokens": "674fc454bdc5ac93" + }, + "truncated": 0, + "non-truncated": 660, + "padded": 656, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_geography|5": { + "hashes": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "7c5547c7da5bc793", + "hash_cont_tokens": "03a5012b916274ea" + }, + "truncated": 0, + "non-truncated": 792, + "padded": 792, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hashes": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "f62991cb6a496b05", + "hash_cont_tokens": "873d2aab226ba1d8" + }, + "truncated": 0, + "non-truncated": 772, + "padded": 772, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hashes": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "4cef2aff6e3d59ed", + "hash_cont_tokens": "c583432ad27fcfe0" + }, + "truncated": 0, + "non-truncated": 1560, + "padded": 1560, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hashes": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "6e2577ea4082ed2b", + "hash_cont_tokens": "d7907b61bcb8c123" + }, + "truncated": 0, + "non-truncated": 1080, + "padded": 1080, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hashes": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "c5fc9aeb1079c8e4", + "hash_cont_tokens": "f47f041de50333b9" + }, + "truncated": 0, + "non-truncated": 952, + "padded": 952, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_physics|5": { + "hashes": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "555fc385cffa84ca", + "hash_cont_tokens": "0d56317b3e5eedb5" + }, + "truncated": 0, + "non-truncated": 604, + "padded": 604, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hashes": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": 
"d5c76aa40b9dbc43", + "hash_input_tokens": "febd23cbf9973b7f", + "hash_cont_tokens": "09ba1243e7390c0f" + }, + "truncated": 0, + "non-truncated": 2180, + "padded": 2180, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hashes": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "400e55b56ee6fbd7", + "hash_cont_tokens": "9cc29889c3d3f77d" + }, + "truncated": 0, + "non-truncated": 864, + "padded": 864, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hashes": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "c639cce12a46ebad", + "hash_cont_tokens": "cdd0b3dc06d933e5" + }, + "truncated": 0, + "non-truncated": 816, + "padded": 816, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hashes": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "b9762065cce6f3a6", + "hash_cont_tokens": "e02816433ff28daf" + }, + "truncated": 0, + "non-truncated": 948, + "padded": 948, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_aging|5": { + "hashes": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "541a75f071dcf579", + "hash_cont_tokens": "142a4a8a1138a214" + }, + "truncated": 0, + "non-truncated": 892, + "padded": 892, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_sexuality|5": { + "hashes": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "04269e5c5a257dd9", + "hash_cont_tokens": "bc54813e809b796d" + }, + "truncated": 0, + "non-truncated": 524, + "padded": 524, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-international_law|5": { + "hashes": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "d93ba9d9d38e4397", + "hash_cont_tokens": "8ea8c5ff76a15bca" + }, + "truncated": 0, + "non-truncated": 484, + "padded": 484, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-jurisprudence|5": { + "hashes": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "9eeaccd2698b4f5a", + "hash_cont_tokens": "e3a8cd951b6e3469" + }, + "truncated": 0, + "non-truncated": 432, + "padded": 432, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hashes": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "b4f08f544f2b7576", + "hash_cont_tokens": "3e9e0bdc248fd88a" + }, + "truncated": 0, + "non-truncated": 652, + "padded": 648, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-machine_learning|5": { + "hashes": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "900c2a51f1174b9f", + "hash_cont_tokens": "55b12fb138c6a064" + }, + "truncated": 0, + "non-truncated": 
448, + "padded": 448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-management|5": { + "hashes": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "6b36efb4689c6eca", + "hash_cont_tokens": "a01d6d39a83c4597" + }, + "truncated": 0, + "non-truncated": 412, + "padded": 412, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-marketing|5": { + "hashes": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "2aaac78a0cfed47a", + "hash_cont_tokens": "6aeaed4d823c98aa" + }, + "truncated": 0, + "non-truncated": 936, + "padded": 936, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-medical_genetics|5": { + "hashes": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "886ca823b41c094a", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-miscellaneous|5": { + "hashes": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "72fd71de7675e7d0", + "hash_cont_tokens": "9b0ab02a64603081" + }, + "truncated": 0, + "non-truncated": 3132, + "padded": 3132, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_disputes|5": { + "hashes": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "f3ca0dd8e7a1eb09", + "hash_cont_tokens": "3b8bbe9108e55ce9" + }, + "truncated": 0, + "non-truncated": 1384, + "padded": 1354, + "non-padded": 30, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hashes": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "3e793631e951f23c", + "hash_cont_tokens": "3e9bfc0362e97330" + }, + "truncated": 0, + "non-truncated": 3580, + "padded": 3580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-nutrition|5": { + "hashes": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "59753c2144ea93af", + "hash_cont_tokens": "23b2dc6ee2da4cfc" + }, + "truncated": 0, + "non-truncated": 1224, + "padded": 1224, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-philosophy|5": { + "hashes": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "bd8d3dbed15a8c34", + "hash_cont_tokens": "9f6ff69d23a48783" + }, + "truncated": 0, + "non-truncated": 1244, + "padded": 1244, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-prehistory|5": { + "hashes": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "3573cd87facbb7c5", + "hash_cont_tokens": "d6458d743d875837" + }, + "truncated": 0, + "non-truncated": 1296, + "padded": 1296, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_accounting|5": { + "hashes": { + 
"hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "17e721bc1a7cbb47", + "hash_cont_tokens": "922a195f53a35662" + }, + "truncated": 0, + "non-truncated": 1128, + "padded": 1128, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_law|5": { + "hashes": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "c9f7583fff66d361", + "hash_cont_tokens": "2e590029ef41fbcd" + }, + "truncated": 0, + "non-truncated": 6136, + "padded": 6136, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_medicine|5": { + "hashes": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "40a933f829116f8d", + "hash_cont_tokens": "7cfee54dbddd5a98" + }, + "truncated": 0, + "non-truncated": 1088, + "padded": 1088, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_psychology|5": { + "hashes": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "0dfb73a8eb3f692c", + "hash_cont_tokens": "a86677b2a45c20e1" + }, + "truncated": 0, + "non-truncated": 2448, + "padded": 2448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-public_relations|5": { + "hashes": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "1710c6ba4c9f3cbd", + "hash_cont_tokens": "0d756ccaae031757" + }, + "truncated": 0, + "non-truncated": 440, + "padded": 440, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-security_studies|5": { + "hashes": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "32a03f1f22a6e103", + "hash_cont_tokens": "b2229bc2cfbf594b" + }, + "truncated": 0, + "non-truncated": 980, + "padded": 980, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-sociology|5": { + "hashes": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "828999f7624cbe7e", + "hash_cont_tokens": "c3a3bdfd177eed5b" + }, + "truncated": 0, + "non-truncated": 804, + "padded": 804, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hashes": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "42054621e718dbee", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-virology|5": { + "hashes": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "6c4f0aa4dc859c04", + "hash_cont_tokens": "af8b3658088cb37f" + }, + "truncated": 0, + "non-truncated": 664, + "padded": 664, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-world_religions|5": { + "hashes": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "6c75d44e092ff24f", + "hash_cont_tokens": 
"060118bef6de4e0a" + }, + "truncated": 0, + "non-truncated": 684, + "padded": 684, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|truthfulqa:mc|0": { + "hashes": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "2738d7ed7075faa7", + "hash_cont_tokens": "f5da56a132aab151" + }, + "truncated": 0, + "non-truncated": 9996, + "padded": 9996, + "non-padded": 0, + "effective_few_shots": 0.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "d84d18e9a963753d", + "hash_full_prompts": "12b540783521a8e6", + "hash_input_tokens": "5c73a7dce6ccf737", + "hash_cont_tokens": "71d56183130fecbd" + }, + "total_evaluation_time_secondes": "6396.943919181824", + "truncated": 0, + "non-truncated": 111019, + "padded": 110926, + "non-padded": 93, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/Undi95/OpenRP-13B/results_2023-10-29T00-54-50.325458.json b/eval-results/Undi95/OpenRP-13B/results_2023-10-29T00-54-50.325458.json new file mode 100644 index 0000000000000000000000000000000000000000..0ccd4acfb2f92d614cfc6f9cadb159681e4dfd26 --- /dev/null +++ b/eval-results/Undi95/OpenRP-13B/results_2023-10-29T00-54-50.325458.json @@ -0,0 +1,107 @@ +{ + "config_general": { + "model_name": "Undi95/OpenRP-13B", + "model_sha": "d11815287c51ef51485fb003f8f72773cf6f19a4", + "model_size": "24.32 GB", + "model_dtype": "torch.float16", + "lighteval_sha": "0f318ecf002208468154899217b3ba7c6ae09374", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|drop|3": { + "em": 0.27632130872483224, + "em_stderr": 0.0045795175994957325, + "f1": 0.3337751677852358, + "f1_stderr": 0.004476795348022121 + }, + "harness|gsm8k|5": { + "acc": 0.1288855193328279, + "acc_stderr": 0.009229580761400263 + }, + "harness|winogrande|5": { + "acc": 0.7600631412786109, + "acc_stderr": 0.012002078629485742 + }, + "all": { + "em": 0.27632130872483224, + "em_stderr": 0.0045795175994957325, + "f1": 0.3337751677852358, + "f1_stderr": 0.004476795348022121, + "acc": 0.44447433030571937, + "acc_stderr": 0.010615829695443002 + } + }, + "versions": { + "harness|drop|3": 1, + "harness|gsm8k|5": 0, + "harness|winogrande|5": 0, + "all": 0 + }, + "config_tasks": { + "harness|drop": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|drop|3": { + "hashes": { + "hash_examples": "1d27416e8324e9a3", + "hash_full_prompts": "a5513ff9a741b385", + "hash_input_tokens": "42076f0efbb50aa6", + "hash_cont_tokens": "c417b4d5876394a4" + }, + "truncated": 3, + "non-truncated": 9533, + "padded": 0, + "non-padded": 9536, + "effective_few_shots": 3.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "bda342e47b5099b2", + "hash_cont_tokens": "01e1a9ad0c90cfcf" + }, + "truncated": 0, + "non-truncated": 1319, + "padded": 0, + "non-padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "c0bedf98cb040854", + "hash_cont_tokens": "f08975ad6f2d5864" + }, + "truncated": 0, + "non-truncated": 2534, + "padded": 2432, + "non-padded": 
102, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "9b4d8993161e637d", + "hash_full_prompts": "08215e527b7e60a5", + "hash_input_tokens": "a12f3e3c934bd78b", + "hash_cont_tokens": "32cabb457ed6e972" + }, + "total_evaluation_time_secondes": "36037.68720960617", + "truncated": 3, + "non-truncated": 13386, + "padded": 2432, + "non-padded": 10957, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/Undi95/ReMM-L2-13B-PIPPA/results_2023-09-05T05-29-49.738166.json b/eval-results/Undi95/ReMM-L2-13B-PIPPA/results_2023-09-05T05-29-49.738166.json new file mode 100644 index 0000000000000000000000000000000000000000..ffb5617fc7a218b8dc5cd6ef125abd36d0a9bfeb --- /dev/null +++ b/eval-results/Undi95/ReMM-L2-13B-PIPPA/results_2023-09-05T05-29-49.738166.json @@ -0,0 +1,1366 @@ +{ + "config_general": { + "model_name": "Undi95/ReMM-L2-13B-PIPPA", + "model_sha": "79e711178c6881496ae1f5635b08bc193f370709", + "model_dtype": "torch.float16", + "lighteval_sha": "9f7699e1a44b5b4d7bd4f326b57a34db83b67c3f", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|arc:challenge|25": { + "acc": 0.5725255972696246, + "acc_stderr": 0.014456862944650654, + "acc_norm": 0.5972696245733788, + "acc_norm_stderr": 0.014332236306790149 + }, + "harness|hellaswag|10": { + "acc": 0.6374228241386178, + "acc_stderr": 0.0047976167543723105, + "acc_norm": 0.8312089225253934, + "acc_norm_stderr": 0.0037380177340378636 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.32, + "acc_stderr": 0.046882617226215034, + "acc_norm": 0.32, + "acc_norm_stderr": 0.046882617226215034 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.5111111111111111, + "acc_stderr": 0.04318275491977976, + "acc_norm": 0.5111111111111111, + "acc_norm_stderr": 0.04318275491977976 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.4934210526315789, + "acc_stderr": 0.040685900502249704, + "acc_norm": 0.4934210526315789, + "acc_norm_stderr": 0.040685900502249704 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.52, + "acc_stderr": 0.050211673156867795, + "acc_norm": 0.52, + "acc_norm_stderr": 0.050211673156867795 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.6075471698113207, + "acc_stderr": 0.03005258057955785, + "acc_norm": 0.6075471698113207, + "acc_norm_stderr": 0.03005258057955785 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.5972222222222222, + "acc_stderr": 0.04101405519842426, + "acc_norm": 0.5972222222222222, + "acc_norm_stderr": 0.04101405519842426 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.35, + "acc_stderr": 0.04793724854411019, + "acc_norm": 0.35, + "acc_norm_stderr": 0.04793724854411019 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.44, + "acc_stderr": 0.04988876515698589, + "acc_norm": 0.44, + "acc_norm_stderr": 0.04988876515698589 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.29, + "acc_stderr": 0.045604802157206845, + "acc_norm": 0.29, + "acc_norm_stderr": 0.045604802157206845 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.5028901734104047, + "acc_stderr": 0.038124005659748335, + "acc_norm": 0.5028901734104047, + "acc_norm_stderr": 0.038124005659748335 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.2647058823529412, + "acc_stderr": 0.043898699568087764, + "acc_norm": 
0.2647058823529412, + "acc_norm_stderr": 0.043898699568087764 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.69, + "acc_stderr": 0.04648231987117316, + "acc_norm": 0.69, + "acc_norm_stderr": 0.04648231987117316 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.42127659574468085, + "acc_stderr": 0.03227834510146268, + "acc_norm": 0.42127659574468085, + "acc_norm_stderr": 0.03227834510146268 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.2807017543859649, + "acc_stderr": 0.042270544512322, + "acc_norm": 0.2807017543859649, + "acc_norm_stderr": 0.042270544512322 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.5103448275862069, + "acc_stderr": 0.04165774775728763, + "acc_norm": 0.5103448275862069, + "acc_norm_stderr": 0.04165774775728763 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.328042328042328, + "acc_stderr": 0.024180497164376896, + "acc_norm": 0.328042328042328, + "acc_norm_stderr": 0.024180497164376896 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.31746031746031744, + "acc_stderr": 0.04163453031302859, + "acc_norm": 0.31746031746031744, + "acc_norm_stderr": 0.04163453031302859 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.37, + "acc_stderr": 0.048523658709391, + "acc_norm": 0.37, + "acc_norm_stderr": 0.048523658709391 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.6258064516129033, + "acc_stderr": 0.027528904299845704, + "acc_norm": 0.6258064516129033, + "acc_norm_stderr": 0.027528904299845704 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.4088669950738916, + "acc_stderr": 0.034590588158832314, + "acc_norm": 0.4088669950738916, + "acc_norm_stderr": 0.034590588158832314 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.51, + "acc_stderr": 0.05024183937956912, + "acc_norm": 0.51, + "acc_norm_stderr": 0.05024183937956912 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.6666666666666666, + "acc_stderr": 0.0368105086916155, + "acc_norm": 0.6666666666666666, + "acc_norm_stderr": 0.0368105086916155 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.6818181818181818, + "acc_stderr": 0.0331847733384533, + "acc_norm": 0.6818181818181818, + "acc_norm_stderr": 0.0331847733384533 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.7875647668393783, + "acc_stderr": 0.029519282616817247, + "acc_norm": 0.7875647668393783, + "acc_norm_stderr": 0.029519282616817247 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.517948717948718, + "acc_stderr": 0.025334667080954925, + "acc_norm": 0.517948717948718, + "acc_norm_stderr": 0.025334667080954925 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.2740740740740741, + "acc_stderr": 0.027195934804085622, + "acc_norm": 0.2740740740740741, + "acc_norm_stderr": 0.027195934804085622 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.5252100840336135, + "acc_stderr": 0.03243718055137411, + "acc_norm": 0.5252100840336135, + "acc_norm_stderr": 0.03243718055137411 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.2781456953642384, + "acc_stderr": 0.03658603262763743, + "acc_norm": 0.2781456953642384, + "acc_norm_stderr": 0.03658603262763743 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.710091743119266, + "acc_stderr": 0.019453066609201597, + "acc_norm": 0.710091743119266, + "acc_norm_stderr": 
0.019453066609201597 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.3888888888888889, + "acc_stderr": 0.033247089118091176, + "acc_norm": 0.3888888888888889, + "acc_norm_stderr": 0.033247089118091176 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.7696078431372549, + "acc_stderr": 0.029554292605695066, + "acc_norm": 0.7696078431372549, + "acc_norm_stderr": 0.029554292605695066 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.7130801687763713, + "acc_stderr": 0.02944377302259469, + "acc_norm": 0.7130801687763713, + "acc_norm_stderr": 0.02944377302259469 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.6636771300448431, + "acc_stderr": 0.031708824268455005, + "acc_norm": 0.6636771300448431, + "acc_norm_stderr": 0.031708824268455005 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.6564885496183206, + "acc_stderr": 0.041649760719448786, + "acc_norm": 0.6564885496183206, + "acc_norm_stderr": 0.041649760719448786 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.7272727272727273, + "acc_stderr": 0.04065578140908705, + "acc_norm": 0.7272727272727273, + "acc_norm_stderr": 0.04065578140908705 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.7314814814814815, + "acc_stderr": 0.042844679680521934, + "acc_norm": 0.7314814814814815, + "acc_norm_stderr": 0.042844679680521934 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.6625766871165644, + "acc_stderr": 0.03714908409935575, + "acc_norm": 0.6625766871165644, + "acc_norm_stderr": 0.03714908409935575 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.33035714285714285, + "acc_stderr": 0.04464285714285714, + "acc_norm": 0.33035714285714285, + "acc_norm_stderr": 0.04464285714285714 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.6504854368932039, + "acc_stderr": 0.04721188506097172, + "acc_norm": 0.6504854368932039, + "acc_norm_stderr": 0.04721188506097172 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.7692307692307693, + "acc_stderr": 0.02760192138141759, + "acc_norm": 0.7692307692307693, + "acc_norm_stderr": 0.02760192138141759 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.58, + "acc_stderr": 0.049604496374885836, + "acc_norm": 0.58, + "acc_norm_stderr": 0.049604496374885836 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.735632183908046, + "acc_stderr": 0.01576998484069052, + "acc_norm": 0.735632183908046, + "acc_norm_stderr": 0.01576998484069052 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.6329479768786127, + "acc_stderr": 0.025950054337654068, + "acc_norm": 0.6329479768786127, + "acc_norm_stderr": 0.025950054337654068 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.3039106145251397, + "acc_stderr": 0.015382845587584525, + "acc_norm": 0.3039106145251397, + "acc_norm_stderr": 0.015382845587584525 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.6143790849673203, + "acc_stderr": 0.02787074527829028, + "acc_norm": 0.6143790849673203, + "acc_norm_stderr": 0.02787074527829028 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.6366559485530546, + "acc_stderr": 0.027316847674192707, + "acc_norm": 0.6366559485530546, + "acc_norm_stderr": 0.027316847674192707 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.6141975308641975, + "acc_stderr": 0.027085401226132146, + "acc_norm": 0.6141975308641975, + "acc_norm_stderr": 0.027085401226132146 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 
0.40425531914893614, + "acc_stderr": 0.02927553215970473, + "acc_norm": 0.40425531914893614, + "acc_norm_stderr": 0.02927553215970473 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.4211212516297262, + "acc_stderr": 0.012610325733489906, + "acc_norm": 0.4211212516297262, + "acc_norm_stderr": 0.012610325733489906 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.5073529411764706, + "acc_stderr": 0.030369552523902173, + "acc_norm": 0.5073529411764706, + "acc_norm_stderr": 0.030369552523902173 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.553921568627451, + "acc_stderr": 0.02010986454718136, + "acc_norm": 0.553921568627451, + "acc_norm_stderr": 0.02010986454718136 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.6363636363636364, + "acc_stderr": 0.046075820907199756, + "acc_norm": 0.6363636363636364, + "acc_norm_stderr": 0.046075820907199756 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.6204081632653061, + "acc_stderr": 0.031067211262872468, + "acc_norm": 0.6204081632653061, + "acc_norm_stderr": 0.031067211262872468 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.7014925373134329, + "acc_stderr": 0.03235743789355042, + "acc_norm": 0.7014925373134329, + "acc_norm_stderr": 0.03235743789355042 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.84, + "acc_stderr": 0.03684529491774708, + "acc_norm": 0.84, + "acc_norm_stderr": 0.03684529491774708 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.4397590361445783, + "acc_stderr": 0.03864139923699122, + "acc_norm": 0.4397590361445783, + "acc_norm_stderr": 0.03864139923699122 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.7309941520467836, + "acc_stderr": 0.03401052620104089, + "acc_norm": 0.7309941520467836, + "acc_norm_stderr": 0.03401052620104089 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.35128518971848227, + "mc1_stderr": 0.016711358163544403, + "mc2": 0.49935182390993416, + "mc2_stderr": 0.01574809606103773 + }, + "all": { + "acc": 0.5431763390415001, + "acc_stderr": 0.034452564290971044, + "acc_norm": 0.5468802394769329, + "acc_norm_stderr": 0.03443249266964571, + "mc1": 0.35128518971848227, + "mc1_stderr": 0.016711358163544403, + "mc2": 0.49935182390993416, + "mc2_stderr": 0.01574809606103773 + } + }, + "versions": { + "harness|arc:challenge|25": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + 
"harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "all": 0 + }, + "config_tasks": { + "harness|arc:challenge": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + 
"harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task" + }, + "summary_tasks": { + "harness|arc:challenge|25": { + "hashes": { + "hash_examples": "17b0cae357c0259e", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "3722289b79076c44", + "hash_cont_tokens": "8210decc6ff6f7df" + }, + "truncated": 0, + "non-truncated": 4687, + "padded": 4687, + "non-padded": 0, + "effective_few_shots": 25.0, + "num_truncated_few_shots": 0 + }, + "harness|hellaswag|10": { + "hashes": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "ececd684171f1ef2", + "hash_cont_tokens": "b3b9e9017afa63af" + }, + "truncated": 0, + "non-truncated": 40168, + "padded": 40113, + "non-padded": 55, + "effective_few_shots": 10.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hashes": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "c54ff61ad0273dd7", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-anatomy|5": { + "hashes": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "be31a1e22aef5f90", + "hash_cont_tokens": "f11971a765cb609f" + }, + "truncated": 0, + "non-truncated": 540, + "padded": 540, + "non-padded": 0, + "effective_few_shots": 5.0, + 
"num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-astronomy|5": { + "hashes": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "277a7b1fad566940", + "hash_cont_tokens": "bf30e5d3f48250cb" + }, + "truncated": 0, + "non-truncated": 608, + "padded": 608, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-business_ethics|5": { + "hashes": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "ba552605bc116de5", + "hash_cont_tokens": "bc1dd9b2d995eb61" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hashes": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "428c7563d0b98ab9", + "hash_cont_tokens": "890a119624b3b935" + }, + "truncated": 0, + "non-truncated": 1060, + "padded": 1060, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_biology|5": { + "hashes": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "da036601573942e2", + "hash_cont_tokens": "875cde3af7a0ee14" + }, + "truncated": 0, + "non-truncated": 576, + "padded": 576, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_chemistry|5": { + "hashes": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "94e0196d6aded13d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_computer_science|5": { + "hashes": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "6e4d0f4a8d36690b", + "hash_cont_tokens": "ffc0fe414cdc4a83" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_mathematics|5": { + "hashes": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "614054d17109a25d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_medicine|5": { + "hashes": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "081bb2b524defd1c", + "hash_cont_tokens": "1f88b00d41957d82" + }, + "truncated": 0, + "non-truncated": 692, + "padded": 692, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_physics|5": { + "hashes": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "5421d9a1af86cbd4", + "hash_cont_tokens": "f7b8097afc16a47c" + }, + "truncated": 0, + "non-truncated": 408, + "padded": 408, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-computer_security|5": { + "hashes": { + "hash_examples": "006451eedc0ededb", + 
"hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "5e6b70ecb333cf18", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hashes": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "c2ef11a87264ceed", + "hash_cont_tokens": "aa0e8bc655f2f641" + }, + "truncated": 0, + "non-truncated": 940, + "padded": 940, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-econometrics|5": { + "hashes": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "ecaccd912a4c3978", + "hash_cont_tokens": "bfb7e3c3c88313f1" + }, + "truncated": 0, + "non-truncated": 456, + "padded": 456, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hashes": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "1590c84291399be8", + "hash_cont_tokens": "2425a3f084a591ef" + }, + "truncated": 0, + "non-truncated": 580, + "padded": 580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hashes": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "3269597f715b0da1", + "hash_cont_tokens": "f52691aef15a407b" + }, + "truncated": 0, + "non-truncated": 1512, + "padded": 1512, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-formal_logic|5": { + "hashes": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "a2800d20f3ab8d7c", + "hash_cont_tokens": "f515d598d9c21263" + }, + "truncated": 0, + "non-truncated": 504, + "padded": 504, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-global_facts|5": { + "hashes": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "94ed44b3772505ad", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_biology|5": { + "hashes": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "24423acb928db768", + "hash_cont_tokens": "bd85a4156a3613ee" + }, + "truncated": 0, + "non-truncated": 1240, + "padded": 1240, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hashes": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "831ff35c474e5cef", + "hash_cont_tokens": "a95c97af1c14e068" + }, + "truncated": 0, + "non-truncated": 812, + "padded": 812, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hashes": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "a20a96b44dcc5b30", + "hash_cont_tokens": "8abfedef914e33c9" + }, + 
"truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hashes": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "5002f4ac8b1562ca", + "hash_cont_tokens": "674fc454bdc5ac93" + }, + "truncated": 0, + "non-truncated": 660, + "padded": 656, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_geography|5": { + "hashes": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "7c5547c7da5bc793", + "hash_cont_tokens": "03a5012b916274ea" + }, + "truncated": 0, + "non-truncated": 792, + "padded": 792, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hashes": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "f62991cb6a496b05", + "hash_cont_tokens": "a83effb8f76b7d7c" + }, + "truncated": 0, + "non-truncated": 772, + "padded": 772, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hashes": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "4cef2aff6e3d59ed", + "hash_cont_tokens": "c583432ad27fcfe0" + }, + "truncated": 0, + "non-truncated": 1560, + "padded": 1560, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hashes": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "6e2577ea4082ed2b", + "hash_cont_tokens": "24f5dc613660300b" + }, + "truncated": 0, + "non-truncated": 1080, + "padded": 1080, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hashes": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "c5fc9aeb1079c8e4", + "hash_cont_tokens": "f47f041de50333b9" + }, + "truncated": 0, + "non-truncated": 952, + "padded": 952, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_physics|5": { + "hashes": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "555fc385cffa84ca", + "hash_cont_tokens": "ba2efcd283e938cc" + }, + "truncated": 0, + "non-truncated": 604, + "padded": 604, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hashes": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "febd23cbf9973b7f", + "hash_cont_tokens": "942069cd363844d9" + }, + "truncated": 0, + "non-truncated": 2180, + "padded": 2180, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hashes": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "400e55b56ee6fbd7", + "hash_cont_tokens": "955ed42b6f7fa019" + }, + "truncated": 0, + "non-truncated": 864, + "padded": 864, + "non-padded": 0, + 
"effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hashes": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "c639cce12a46ebad", + "hash_cont_tokens": "cdd0b3dc06d933e5" + }, + "truncated": 0, + "non-truncated": 816, + "padded": 816, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hashes": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "b9762065cce6f3a6", + "hash_cont_tokens": "9a864184946033ac" + }, + "truncated": 0, + "non-truncated": 948, + "padded": 948, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_aging|5": { + "hashes": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "541a75f071dcf579", + "hash_cont_tokens": "142a4a8a1138a214" + }, + "truncated": 0, + "non-truncated": 892, + "padded": 892, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_sexuality|5": { + "hashes": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "04269e5c5a257dd9", + "hash_cont_tokens": "bc54813e809b796d" + }, + "truncated": 0, + "non-truncated": 524, + "padded": 524, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-international_law|5": { + "hashes": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "d93ba9d9d38e4397", + "hash_cont_tokens": "dc45b45fcda18e5d" + }, + "truncated": 0, + "non-truncated": 484, + "padded": 484, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-jurisprudence|5": { + "hashes": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "9eeaccd2698b4f5a", + "hash_cont_tokens": "e3a8cd951b6e3469" + }, + "truncated": 0, + "non-truncated": 432, + "padded": 432, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hashes": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "b4f08f544f2b7576", + "hash_cont_tokens": "1e80dbd30f6453d5" + }, + "truncated": 0, + "non-truncated": 652, + "padded": 648, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-machine_learning|5": { + "hashes": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "900c2a51f1174b9f", + "hash_cont_tokens": "9b37da7777378ca9" + }, + "truncated": 0, + "non-truncated": 448, + "padded": 448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-management|5": { + "hashes": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "6b36efb4689c6eca", + "hash_cont_tokens": "a01d6d39a83c4597" + }, + "truncated": 0, + "non-truncated": 412, + "padded": 412, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-marketing|5": { + "hashes": { + "hash_examples": "2668953431f91e96", + 
"hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "2aaac78a0cfed47a", + "hash_cont_tokens": "6aeaed4d823c98aa" + }, + "truncated": 0, + "non-truncated": 936, + "padded": 936, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-medical_genetics|5": { + "hashes": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "886ca823b41c094a", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-miscellaneous|5": { + "hashes": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "72fd71de7675e7d0", + "hash_cont_tokens": "9b0ab02a64603081" + }, + "truncated": 0, + "non-truncated": 3132, + "padded": 3132, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_disputes|5": { + "hashes": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "f3ca0dd8e7a1eb09", + "hash_cont_tokens": "8badf768f7b0467a" + }, + "truncated": 0, + "non-truncated": 1384, + "padded": 1354, + "non-padded": 30, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hashes": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "3e793631e951f23c", + "hash_cont_tokens": "32ae620376b2bbba" + }, + "truncated": 0, + "non-truncated": 3580, + "padded": 3580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-nutrition|5": { + "hashes": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "59753c2144ea93af", + "hash_cont_tokens": "3071def75bacc404" + }, + "truncated": 0, + "non-truncated": 1224, + "padded": 1224, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-philosophy|5": { + "hashes": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "bd8d3dbed15a8c34", + "hash_cont_tokens": "9f6ff69d23a48783" + }, + "truncated": 0, + "non-truncated": 1244, + "padded": 1244, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-prehistory|5": { + "hashes": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "3573cd87facbb7c5", + "hash_cont_tokens": "de469d2b981e32a3" + }, + "truncated": 0, + "non-truncated": 1296, + "padded": 1296, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_accounting|5": { + "hashes": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "17e721bc1a7cbb47", + "hash_cont_tokens": "c46f74d2dfc7b13b" + }, + "truncated": 0, + "non-truncated": 1128, + "padded": 1128, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_law|5": { + "hashes": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "c9f7583fff66d361", + "hash_cont_tokens": "2e590029ef41fbcd" + }, + "truncated": 0, + 
"non-truncated": 6136, + "padded": 6136, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_medicine|5": { + "hashes": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "40a933f829116f8d", + "hash_cont_tokens": "fe35cfa9c6ca802e" + }, + "truncated": 0, + "non-truncated": 1088, + "padded": 1088, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_psychology|5": { + "hashes": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "0dfb73a8eb3f692c", + "hash_cont_tokens": "f020fbddf72c8652" + }, + "truncated": 0, + "non-truncated": 2448, + "padded": 2448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-public_relations|5": { + "hashes": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "1710c6ba4c9f3cbd", + "hash_cont_tokens": "568f585a259965c1" + }, + "truncated": 0, + "non-truncated": 440, + "padded": 440, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-security_studies|5": { + "hashes": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "32a03f1f22a6e103", + "hash_cont_tokens": "cc6fd7cccd64cd5d" + }, + "truncated": 0, + "non-truncated": 980, + "padded": 980, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-sociology|5": { + "hashes": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "828999f7624cbe7e", + "hash_cont_tokens": "c3a3bdfd177eed5b" + }, + "truncated": 0, + "non-truncated": 804, + "padded": 804, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hashes": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "42054621e718dbee", + "hash_cont_tokens": "2568d0e8e36fa959" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-virology|5": { + "hashes": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "6c4f0aa4dc859c04", + "hash_cont_tokens": "926cf60b0891f374" + }, + "truncated": 0, + "non-truncated": 664, + "padded": 664, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-world_religions|5": { + "hashes": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "6c75d44e092ff24f", + "hash_cont_tokens": "c525a5de974c1ea3" + }, + "truncated": 0, + "non-truncated": 684, + "padded": 684, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|truthfulqa:mc|0": { + "hashes": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "2738d7ed7075faa7", + "hash_cont_tokens": "c014154380b74b9e" + }, + "truncated": 0, + "non-truncated": 9996, + "padded": 9996, + "non-padded": 0, + "effective_few_shots": 0.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + 
"hash_examples": "d84d18e9a963753d", + "hash_full_prompts": "12b540783521a8e6", + "hash_input_tokens": "5c73a7dce6ccf737", + "hash_cont_tokens": "fb1646e2bdd5fc38" + }, + "total_evaluation_time_secondes": "6332.805841207504", + "truncated": 0, + "non-truncated": 111019, + "padded": 110926, + "non-padded": 93, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/Undi95/ReMM-L2-13B-PIPPA/results_2023-10-15T22-47-55.884527.json b/eval-results/Undi95/ReMM-L2-13B-PIPPA/results_2023-10-15T22-47-55.884527.json new file mode 100644 index 0000000000000000000000000000000000000000..13011220f2e5ff02e70f16bd0d1247cf37f412ac --- /dev/null +++ b/eval-results/Undi95/ReMM-L2-13B-PIPPA/results_2023-10-15T22-47-55.884527.json @@ -0,0 +1,107 @@ +{ + "config_general": { + "model_name": "Undi95/ReMM-L2-13B-PIPPA", + "model_sha": "c74d621ec58b012523f51d9155d821e4d849e93b", + "model_size": "24.32 GB", + "model_dtype": "torch.float16", + "lighteval_sha": "0f318ecf002208468154899217b3ba7c6ae09374", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|drop|3": { + "em": 0.3598993288590604, + "em_stderr": 0.004915348455608255, + "f1": 0.4368917785234919, + "f1_stderr": 0.004726186762311207 + }, + "harness|gsm8k|5": { + "acc": 0.029567854435178165, + "acc_stderr": 0.004665893134220799 + }, + "harness|winogrande|5": { + "acc": 0.745067087608524, + "acc_stderr": 0.012248806969376422 + }, + "all": { + "em": 0.3598993288590604, + "em_stderr": 0.004915348455608255, + "f1": 0.4368917785234919, + "f1_stderr": 0.004726186762311207, + "acc": 0.3873174710218511, + "acc_stderr": 0.008457350051798611 + } + }, + "versions": { + "harness|drop|3": 1, + "harness|gsm8k|5": 0, + "harness|winogrande|5": 0, + "all": 0 + }, + "config_tasks": { + "harness|drop": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|drop|3": { + "hashes": { + "hash_examples": "1d27416e8324e9a3", + "hash_full_prompts": "a5513ff9a741b385", + "hash_input_tokens": "42076f0efbb50aa6", + "hash_cont_tokens": "747c09bdda9de208" + }, + "truncated": 3, + "non-truncated": 9533, + "padded": 0, + "non-padded": 9536, + "effective_few_shots": 3.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "bda342e47b5099b2", + "hash_cont_tokens": "9a611b97409a6729" + }, + "truncated": 0, + "non-truncated": 1319, + "padded": 0, + "non-padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "c0bedf98cb040854", + "hash_cont_tokens": "f08975ad6f2d5864" + }, + "truncated": 0, + "non-truncated": 2534, + "padded": 2432, + "non-padded": 102, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "9b4d8993161e637d", + "hash_full_prompts": "08215e527b7e60a5", + "hash_input_tokens": "a12f3e3c934bd78b", + "hash_cont_tokens": "dc4509a2f6dfafc9" + }, + "total_evaluation_time_secondes": "6852.5477504730225", + "truncated": 3, + "non-truncated": 13386, + "padded": 2432, + "non-padded": 10957, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git 
a/eval-results/Undi95/ReMM-L2-13B/results_2023-09-03T14-15-27.893202.json b/eval-results/Undi95/ReMM-L2-13B/results_2023-09-03T14-15-27.893202.json new file mode 100644 index 0000000000000000000000000000000000000000..bcd1f3c4b8bfe6d29c865eaf9b497a362fd4b736 --- /dev/null +++ b/eval-results/Undi95/ReMM-L2-13B/results_2023-09-03T14-15-27.893202.json @@ -0,0 +1,1366 @@ +{ + "config_general": { + "model_name": "Undi95/ReMM-L2-13B", + "model_sha": "c4710577003a23ca8e9040d16dfb8f3e9bc5d636", + "model_dtype": "torch.float16", + "lighteval_sha": "4b9a38c2102259daeb17d1d0294fc2b4ce7f4e63", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|arc:challenge|25": { + "acc": 0.5716723549488054, + "acc_stderr": 0.014460496367599015, + "acc_norm": 0.5972696245733788, + "acc_norm_stderr": 0.014332236306790149 + }, + "harness|hellaswag|10": { + "acc": 0.63752240589524, + "acc_stderr": 0.004797332565990075, + "acc_norm": 0.831009759012149, + "acc_norm_stderr": 0.003739774285418524 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.32, + "acc_stderr": 0.046882617226215034, + "acc_norm": 0.32, + "acc_norm_stderr": 0.046882617226215034 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.5111111111111111, + "acc_stderr": 0.04318275491977976, + "acc_norm": 0.5111111111111111, + "acc_norm_stderr": 0.04318275491977976 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.5, + "acc_stderr": 0.04068942293855797, + "acc_norm": 0.5, + "acc_norm_stderr": 0.04068942293855797 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.52, + "acc_stderr": 0.050211673156867795, + "acc_norm": 0.52, + "acc_norm_stderr": 0.050211673156867795 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.6037735849056604, + "acc_stderr": 0.030102793781791197, + "acc_norm": 0.6037735849056604, + "acc_norm_stderr": 0.030102793781791197 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.5972222222222222, + "acc_stderr": 0.04101405519842426, + "acc_norm": 0.5972222222222222, + "acc_norm_stderr": 0.04101405519842426 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.36, + "acc_stderr": 0.048241815132442176, + "acc_norm": 0.36, + "acc_norm_stderr": 0.048241815132442176 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.44, + "acc_stderr": 0.04988876515698589, + "acc_norm": 0.44, + "acc_norm_stderr": 0.04988876515698589 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.29, + "acc_stderr": 0.045604802157206845, + "acc_norm": 0.29, + "acc_norm_stderr": 0.045604802157206845 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.5028901734104047, + "acc_stderr": 0.038124005659748335, + "acc_norm": 0.5028901734104047, + "acc_norm_stderr": 0.038124005659748335 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.2647058823529412, + "acc_stderr": 0.043898699568087764, + "acc_norm": 0.2647058823529412, + "acc_norm_stderr": 0.043898699568087764 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.69, + "acc_stderr": 0.04648231987117316, + "acc_norm": 0.69, + "acc_norm_stderr": 0.04648231987117316 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.42127659574468085, + "acc_stderr": 0.03227834510146268, + "acc_norm": 0.42127659574468085, + "acc_norm_stderr": 0.03227834510146268 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.2894736842105263, + "acc_stderr": 0.04266339443159394, + "acc_norm": 
0.2894736842105263, + "acc_norm_stderr": 0.04266339443159394 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.5103448275862069, + "acc_stderr": 0.04165774775728763, + "acc_norm": 0.5103448275862069, + "acc_norm_stderr": 0.04165774775728763 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.328042328042328, + "acc_stderr": 0.024180497164376896, + "acc_norm": 0.328042328042328, + "acc_norm_stderr": 0.024180497164376896 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.31746031746031744, + "acc_stderr": 0.04163453031302859, + "acc_norm": 0.31746031746031744, + "acc_norm_stderr": 0.04163453031302859 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.37, + "acc_stderr": 0.048523658709391, + "acc_norm": 0.37, + "acc_norm_stderr": 0.048523658709391 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.6258064516129033, + "acc_stderr": 0.027528904299845704, + "acc_norm": 0.6258064516129033, + "acc_norm_stderr": 0.027528904299845704 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.4088669950738916, + "acc_stderr": 0.034590588158832314, + "acc_norm": 0.4088669950738916, + "acc_norm_stderr": 0.034590588158832314 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.51, + "acc_stderr": 0.05024183937956912, + "acc_norm": 0.51, + "acc_norm_stderr": 0.05024183937956912 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.6666666666666666, + "acc_stderr": 0.0368105086916155, + "acc_norm": 0.6666666666666666, + "acc_norm_stderr": 0.0368105086916155 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.6767676767676768, + "acc_stderr": 0.033322999210706444, + "acc_norm": 0.6767676767676768, + "acc_norm_stderr": 0.033322999210706444 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.7875647668393783, + "acc_stderr": 0.029519282616817247, + "acc_norm": 0.7875647668393783, + "acc_norm_stderr": 0.029519282616817247 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.5153846153846153, + "acc_stderr": 0.025339003010106515, + "acc_norm": 0.5153846153846153, + "acc_norm_stderr": 0.025339003010106515 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.2777777777777778, + "acc_stderr": 0.02730914058823018, + "acc_norm": 0.2777777777777778, + "acc_norm_stderr": 0.02730914058823018 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.5252100840336135, + "acc_stderr": 0.03243718055137411, + "acc_norm": 0.5252100840336135, + "acc_norm_stderr": 0.03243718055137411 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.2781456953642384, + "acc_stderr": 0.03658603262763743, + "acc_norm": 0.2781456953642384, + "acc_norm_stderr": 0.03658603262763743 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.7119266055045872, + "acc_stderr": 0.01941644589263603, + "acc_norm": 0.7119266055045872, + "acc_norm_stderr": 0.01941644589263603 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.3888888888888889, + "acc_stderr": 0.033247089118091176, + "acc_norm": 0.3888888888888889, + "acc_norm_stderr": 0.033247089118091176 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.7745098039215687, + "acc_stderr": 0.029331162294251742, + "acc_norm": 0.7745098039215687, + "acc_norm_stderr": 0.029331162294251742 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.7130801687763713, + "acc_stderr": 0.02944377302259469, + 
"acc_norm": 0.7130801687763713, + "acc_norm_stderr": 0.02944377302259469 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.6636771300448431, + "acc_stderr": 0.031708824268455005, + "acc_norm": 0.6636771300448431, + "acc_norm_stderr": 0.031708824268455005 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.6564885496183206, + "acc_stderr": 0.041649760719448786, + "acc_norm": 0.6564885496183206, + "acc_norm_stderr": 0.041649760719448786 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.7272727272727273, + "acc_stderr": 0.04065578140908705, + "acc_norm": 0.7272727272727273, + "acc_norm_stderr": 0.04065578140908705 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.7314814814814815, + "acc_stderr": 0.042844679680521934, + "acc_norm": 0.7314814814814815, + "acc_norm_stderr": 0.042844679680521934 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.6625766871165644, + "acc_stderr": 0.03714908409935575, + "acc_norm": 0.6625766871165644, + "acc_norm_stderr": 0.03714908409935575 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.33035714285714285, + "acc_stderr": 0.04464285714285714, + "acc_norm": 0.33035714285714285, + "acc_norm_stderr": 0.04464285714285714 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.6504854368932039, + "acc_stderr": 0.04721188506097172, + "acc_norm": 0.6504854368932039, + "acc_norm_stderr": 0.04721188506097172 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.7649572649572649, + "acc_stderr": 0.02777883590493543, + "acc_norm": 0.7649572649572649, + "acc_norm_stderr": 0.02777883590493543 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.58, + "acc_stderr": 0.049604496374885836, + "acc_norm": 0.58, + "acc_norm_stderr": 0.049604496374885836 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.7343550446998723, + "acc_stderr": 0.015794302487888726, + "acc_norm": 0.7343550446998723, + "acc_norm_stderr": 0.015794302487888726 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.630057803468208, + "acc_stderr": 0.0259924720293064, + "acc_norm": 0.630057803468208, + "acc_norm_stderr": 0.0259924720293064 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.3016759776536313, + "acc_stderr": 0.015350767572220286, + "acc_norm": 0.3016759776536313, + "acc_norm_stderr": 0.015350767572220286 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.6143790849673203, + "acc_stderr": 0.02787074527829028, + "acc_norm": 0.6143790849673203, + "acc_norm_stderr": 0.02787074527829028 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.6366559485530546, + "acc_stderr": 0.027316847674192707, + "acc_norm": 0.6366559485530546, + "acc_norm_stderr": 0.027316847674192707 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.6141975308641975, + "acc_stderr": 0.027085401226132146, + "acc_norm": 0.6141975308641975, + "acc_norm_stderr": 0.027085401226132146 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.4078014184397163, + "acc_stderr": 0.029316011776343555, + "acc_norm": 0.4078014184397163, + "acc_norm_stderr": 0.029316011776343555 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.4211212516297262, + "acc_stderr": 0.012610325733489906, + "acc_norm": 0.4211212516297262, + "acc_norm_stderr": 0.012610325733489906 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.5036764705882353, + "acc_stderr": 0.030372015885428195, + "acc_norm": 0.5036764705882353, + "acc_norm_stderr": 0.030372015885428195 + }, + 
"harness|hendrycksTest-professional_psychology|5": { + "acc": 0.5522875816993464, + "acc_stderr": 0.020116925347422425, + "acc_norm": 0.5522875816993464, + "acc_norm_stderr": 0.020116925347422425 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.6363636363636364, + "acc_stderr": 0.046075820907199756, + "acc_norm": 0.6363636363636364, + "acc_norm_stderr": 0.046075820907199756 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.6163265306122448, + "acc_stderr": 0.031130880396235946, + "acc_norm": 0.6163265306122448, + "acc_norm_stderr": 0.031130880396235946 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.7014925373134329, + "acc_stderr": 0.03235743789355042, + "acc_norm": 0.7014925373134329, + "acc_norm_stderr": 0.03235743789355042 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.84, + "acc_stderr": 0.03684529491774708, + "acc_norm": 0.84, + "acc_norm_stderr": 0.03684529491774708 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.4397590361445783, + "acc_stderr": 0.03864139923699122, + "acc_norm": 0.4397590361445783, + "acc_norm_stderr": 0.03864139923699122 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.7309941520467836, + "acc_stderr": 0.03401052620104089, + "acc_norm": 0.7309941520467836, + "acc_norm_stderr": 0.03401052620104089 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.35128518971848227, + "mc1_stderr": 0.016711358163544403, + "mc2": 0.49938332230288074, + "mc2_stderr": 0.015748300557574715 + }, + "all": { + "acc": 0.5432971544031889, + "acc_stderr": 0.03447079755702233, + "acc_norm": 0.5470104530937225, + "acc_norm_stderr": 0.034450698941066726, + "mc1": 0.35128518971848227, + "mc1_stderr": 0.016711358163544403, + "mc2": 0.49938332230288074, + "mc2_stderr": 0.015748300557574715 + } + }, + "versions": { + "harness|arc:challenge|25": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + 
"harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "all": 0 + }, + "config_tasks": { + "harness|arc:challenge": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + 
"harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task" + }, + "summary_tasks": { + "harness|arc:challenge|25": { + "hashes": { + "hash_examples": "17b0cae357c0259e", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "3722289b79076c44", + "hash_cont_tokens": "8210decc6ff6f7df" + }, + "truncated": 0, + "non-truncated": 4687, + "padded": 4687, + "non-padded": 0, + "effective_few_shots": 25.0, + "num_truncated_few_shots": 0 + }, + "harness|hellaswag|10": { + "hashes": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "ececd684171f1ef2", + "hash_cont_tokens": "b3b9e9017afa63af" + }, + "truncated": 0, + "non-truncated": 40168, + "padded": 40113, + "non-padded": 55, + "effective_few_shots": 10.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hashes": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "c54ff61ad0273dd7", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-anatomy|5": { + "hashes": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "be31a1e22aef5f90", + "hash_cont_tokens": "f11971a765cb609f" + }, + "truncated": 0, + "non-truncated": 540, + "padded": 540, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-astronomy|5": { + "hashes": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "277a7b1fad566940", + "hash_cont_tokens": "bf30e5d3f48250cb" + }, + "truncated": 0, + "non-truncated": 608, + "padded": 608, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-business_ethics|5": { + "hashes": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": 
"ba552605bc116de5", + "hash_cont_tokens": "bc1dd9b2d995eb61" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hashes": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "428c7563d0b98ab9", + "hash_cont_tokens": "890a119624b3b935" + }, + "truncated": 0, + "non-truncated": 1060, + "padded": 1060, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_biology|5": { + "hashes": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "da036601573942e2", + "hash_cont_tokens": "875cde3af7a0ee14" + }, + "truncated": 0, + "non-truncated": 576, + "padded": 576, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_chemistry|5": { + "hashes": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "94e0196d6aded13d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_computer_science|5": { + "hashes": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "6e4d0f4a8d36690b", + "hash_cont_tokens": "ffc0fe414cdc4a83" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_mathematics|5": { + "hashes": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "614054d17109a25d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_medicine|5": { + "hashes": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "081bb2b524defd1c", + "hash_cont_tokens": "1f88b00d41957d82" + }, + "truncated": 0, + "non-truncated": 692, + "padded": 692, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_physics|5": { + "hashes": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "5421d9a1af86cbd4", + "hash_cont_tokens": "f7b8097afc16a47c" + }, + "truncated": 0, + "non-truncated": 408, + "padded": 408, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-computer_security|5": { + "hashes": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "5e6b70ecb333cf18", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hashes": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "c2ef11a87264ceed", + "hash_cont_tokens": "aa0e8bc655f2f641" + }, + "truncated": 0, + "non-truncated": 940, + "padded": 940, + "non-padded": 0, 
+ "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-econometrics|5": { + "hashes": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "ecaccd912a4c3978", + "hash_cont_tokens": "bfb7e3c3c88313f1" + }, + "truncated": 0, + "non-truncated": 456, + "padded": 456, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hashes": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "1590c84291399be8", + "hash_cont_tokens": "2425a3f084a591ef" + }, + "truncated": 0, + "non-truncated": 580, + "padded": 580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hashes": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "3269597f715b0da1", + "hash_cont_tokens": "f52691aef15a407b" + }, + "truncated": 0, + "non-truncated": 1512, + "padded": 1512, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-formal_logic|5": { + "hashes": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "a2800d20f3ab8d7c", + "hash_cont_tokens": "f515d598d9c21263" + }, + "truncated": 0, + "non-truncated": 504, + "padded": 504, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-global_facts|5": { + "hashes": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "94ed44b3772505ad", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_biology|5": { + "hashes": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "24423acb928db768", + "hash_cont_tokens": "bd85a4156a3613ee" + }, + "truncated": 0, + "non-truncated": 1240, + "padded": 1240, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hashes": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "831ff35c474e5cef", + "hash_cont_tokens": "a95c97af1c14e068" + }, + "truncated": 0, + "non-truncated": 812, + "padded": 812, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hashes": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "a20a96b44dcc5b30", + "hash_cont_tokens": "8abfedef914e33c9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hashes": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "5002f4ac8b1562ca", + "hash_cont_tokens": "674fc454bdc5ac93" + }, + "truncated": 0, + "non-truncated": 660, + "padded": 656, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_geography|5": { + 
"hashes": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "7c5547c7da5bc793", + "hash_cont_tokens": "03a5012b916274ea" + }, + "truncated": 0, + "non-truncated": 792, + "padded": 792, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hashes": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "f62991cb6a496b05", + "hash_cont_tokens": "a83effb8f76b7d7c" + }, + "truncated": 0, + "non-truncated": 772, + "padded": 772, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hashes": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "4cef2aff6e3d59ed", + "hash_cont_tokens": "c583432ad27fcfe0" + }, + "truncated": 0, + "non-truncated": 1560, + "padded": 1560, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hashes": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "6e2577ea4082ed2b", + "hash_cont_tokens": "24f5dc613660300b" + }, + "truncated": 0, + "non-truncated": 1080, + "padded": 1080, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hashes": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "c5fc9aeb1079c8e4", + "hash_cont_tokens": "f47f041de50333b9" + }, + "truncated": 0, + "non-truncated": 952, + "padded": 952, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_physics|5": { + "hashes": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "555fc385cffa84ca", + "hash_cont_tokens": "ba2efcd283e938cc" + }, + "truncated": 0, + "non-truncated": 604, + "padded": 604, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hashes": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "febd23cbf9973b7f", + "hash_cont_tokens": "942069cd363844d9" + }, + "truncated": 0, + "non-truncated": 2180, + "padded": 2180, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hashes": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "400e55b56ee6fbd7", + "hash_cont_tokens": "955ed42b6f7fa019" + }, + "truncated": 0, + "non-truncated": 864, + "padded": 864, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hashes": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "c639cce12a46ebad", + "hash_cont_tokens": "cdd0b3dc06d933e5" + }, + "truncated": 0, + "non-truncated": 816, + "padded": 816, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hashes": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": 
"11845057459afd72", + "hash_input_tokens": "b9762065cce6f3a6", + "hash_cont_tokens": "9a864184946033ac" + }, + "truncated": 0, + "non-truncated": 948, + "padded": 948, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_aging|5": { + "hashes": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "541a75f071dcf579", + "hash_cont_tokens": "142a4a8a1138a214" + }, + "truncated": 0, + "non-truncated": 892, + "padded": 892, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_sexuality|5": { + "hashes": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "04269e5c5a257dd9", + "hash_cont_tokens": "bc54813e809b796d" + }, + "truncated": 0, + "non-truncated": 524, + "padded": 524, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-international_law|5": { + "hashes": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "d93ba9d9d38e4397", + "hash_cont_tokens": "dc45b45fcda18e5d" + }, + "truncated": 0, + "non-truncated": 484, + "padded": 484, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-jurisprudence|5": { + "hashes": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "9eeaccd2698b4f5a", + "hash_cont_tokens": "e3a8cd951b6e3469" + }, + "truncated": 0, + "non-truncated": 432, + "padded": 432, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hashes": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "b4f08f544f2b7576", + "hash_cont_tokens": "1e80dbd30f6453d5" + }, + "truncated": 0, + "non-truncated": 652, + "padded": 648, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-machine_learning|5": { + "hashes": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "900c2a51f1174b9f", + "hash_cont_tokens": "9b37da7777378ca9" + }, + "truncated": 0, + "non-truncated": 448, + "padded": 448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-management|5": { + "hashes": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "6b36efb4689c6eca", + "hash_cont_tokens": "a01d6d39a83c4597" + }, + "truncated": 0, + "non-truncated": 412, + "padded": 412, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-marketing|5": { + "hashes": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "2aaac78a0cfed47a", + "hash_cont_tokens": "6aeaed4d823c98aa" + }, + "truncated": 0, + "non-truncated": 936, + "padded": 936, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-medical_genetics|5": { + "hashes": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "886ca823b41c094a", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + 
"non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-miscellaneous|5": { + "hashes": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "72fd71de7675e7d0", + "hash_cont_tokens": "9b0ab02a64603081" + }, + "truncated": 0, + "non-truncated": 3132, + "padded": 3132, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_disputes|5": { + "hashes": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "f3ca0dd8e7a1eb09", + "hash_cont_tokens": "8badf768f7b0467a" + }, + "truncated": 0, + "non-truncated": 1384, + "padded": 1354, + "non-padded": 30, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hashes": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "3e793631e951f23c", + "hash_cont_tokens": "32ae620376b2bbba" + }, + "truncated": 0, + "non-truncated": 3580, + "padded": 3580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-nutrition|5": { + "hashes": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "59753c2144ea93af", + "hash_cont_tokens": "3071def75bacc404" + }, + "truncated": 0, + "non-truncated": 1224, + "padded": 1224, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-philosophy|5": { + "hashes": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "bd8d3dbed15a8c34", + "hash_cont_tokens": "9f6ff69d23a48783" + }, + "truncated": 0, + "non-truncated": 1244, + "padded": 1244, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-prehistory|5": { + "hashes": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "3573cd87facbb7c5", + "hash_cont_tokens": "de469d2b981e32a3" + }, + "truncated": 0, + "non-truncated": 1296, + "padded": 1296, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_accounting|5": { + "hashes": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "17e721bc1a7cbb47", + "hash_cont_tokens": "c46f74d2dfc7b13b" + }, + "truncated": 0, + "non-truncated": 1128, + "padded": 1128, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_law|5": { + "hashes": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "c9f7583fff66d361", + "hash_cont_tokens": "2e590029ef41fbcd" + }, + "truncated": 0, + "non-truncated": 6136, + "padded": 6136, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_medicine|5": { + "hashes": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "40a933f829116f8d", + "hash_cont_tokens": "fe35cfa9c6ca802e" + }, + "truncated": 0, + "non-truncated": 1088, + "padded": 1088, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_psychology|5": { + "hashes": { + 
"hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "0dfb73a8eb3f692c", + "hash_cont_tokens": "f020fbddf72c8652" + }, + "truncated": 0, + "non-truncated": 2448, + "padded": 2448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-public_relations|5": { + "hashes": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "1710c6ba4c9f3cbd", + "hash_cont_tokens": "568f585a259965c1" + }, + "truncated": 0, + "non-truncated": 440, + "padded": 440, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-security_studies|5": { + "hashes": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "32a03f1f22a6e103", + "hash_cont_tokens": "cc6fd7cccd64cd5d" + }, + "truncated": 0, + "non-truncated": 980, + "padded": 980, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-sociology|5": { + "hashes": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "828999f7624cbe7e", + "hash_cont_tokens": "c3a3bdfd177eed5b" + }, + "truncated": 0, + "non-truncated": 804, + "padded": 804, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hashes": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "42054621e718dbee", + "hash_cont_tokens": "2568d0e8e36fa959" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-virology|5": { + "hashes": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "6c4f0aa4dc859c04", + "hash_cont_tokens": "926cf60b0891f374" + }, + "truncated": 0, + "non-truncated": 664, + "padded": 664, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-world_religions|5": { + "hashes": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "6c75d44e092ff24f", + "hash_cont_tokens": "c525a5de974c1ea3" + }, + "truncated": 0, + "non-truncated": 684, + "padded": 684, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|truthfulqa:mc|0": { + "hashes": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "2738d7ed7075faa7", + "hash_cont_tokens": "c014154380b74b9e" + }, + "truncated": 0, + "non-truncated": 9996, + "padded": 9996, + "non-padded": 0, + "effective_few_shots": 0.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "d84d18e9a963753d", + "hash_full_prompts": "12b540783521a8e6", + "hash_input_tokens": "5c73a7dce6ccf737", + "hash_cont_tokens": "fb1646e2bdd5fc38" + }, + "total_evaluation_time_secondes": "6365.294131994247", + "truncated": 0, + "non-truncated": 111019, + "padded": 110926, + "non-padded": 93, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/Undi95/ReMM-L2-13B/results_2023-10-16T18-10-03.763192.json b/eval-results/Undi95/ReMM-L2-13B/results_2023-10-16T18-10-03.763192.json new file mode 100644 index 
0000000000000000000000000000000000000000..da06cf5439fb61b8919d93bfc02299b7dfdf7936 --- /dev/null +++ b/eval-results/Undi95/ReMM-L2-13B/results_2023-10-16T18-10-03.763192.json @@ -0,0 +1,107 @@ +{ + "config_general": { + "model_name": "Undi95/ReMM-L2-13B", + "model_sha": "2b4e118fa8879df5bfff87cc50d0dba9afeaaa9f", + "model_size": "24.32 GB", + "model_dtype": "torch.float16", + "lighteval_sha": "0f318ecf002208468154899217b3ba7c6ae09374", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|drop|3": { + "em": 0.3603187919463087, + "em_stderr": 0.004916600307723715, + "f1": 0.4369505033557066, + "f1_stderr": 0.004727018193601331 + }, + "harness|gsm8k|5": { + "acc": 0.029567854435178165, + "acc_stderr": 0.004665893134220799 + }, + "harness|winogrande|5": { + "acc": 0.745067087608524, + "acc_stderr": 0.012248806969376422 + }, + "all": { + "em": 0.3603187919463087, + "em_stderr": 0.004916600307723715, + "f1": 0.4369505033557066, + "f1_stderr": 0.004727018193601331, + "acc": 0.3873174710218511, + "acc_stderr": 0.008457350051798611 + } + }, + "versions": { + "harness|drop|3": 1, + "harness|gsm8k|5": 0, + "harness|winogrande|5": 0, + "all": 0 + }, + "config_tasks": { + "harness|drop": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|drop|3": { + "hashes": { + "hash_examples": "1d27416e8324e9a3", + "hash_full_prompts": "a5513ff9a741b385", + "hash_input_tokens": "42076f0efbb50aa6", + "hash_cont_tokens": "937d819caf1e9e89" + }, + "truncated": 3, + "non-truncated": 9533, + "padded": 0, + "non-padded": 9536, + "effective_few_shots": 3.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "bda342e47b5099b2", + "hash_cont_tokens": "7eb07ec58b3c0d6e" + }, + "truncated": 0, + "non-truncated": 1319, + "padded": 0, + "non-padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "c0bedf98cb040854", + "hash_cont_tokens": "f08975ad6f2d5864" + }, + "truncated": 0, + "non-truncated": 2534, + "padded": 2432, + "non-padded": 102, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "9b4d8993161e637d", + "hash_full_prompts": "08215e527b7e60a5", + "hash_input_tokens": "a12f3e3c934bd78b", + "hash_cont_tokens": "a5fcb1c48580bb2b" + }, + "total_evaluation_time_secondes": "6870.640323400497", + "truncated": 3, + "non-truncated": 13386, + "padded": 2432, + "non-padded": 10957, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/Undi95/ReMM-Mistral-13B/results_2023-10-04T08-43-52.595565.json b/eval-results/Undi95/ReMM-Mistral-13B/results_2023-10-04T08-43-52.595565.json new file mode 100644 index 0000000000000000000000000000000000000000..27b3dc9c789b02655df0de11491b57fc6c92592f --- /dev/null +++ b/eval-results/Undi95/ReMM-Mistral-13B/results_2023-10-04T08-43-52.595565.json @@ -0,0 +1,1367 @@ +{ + "config_general": { + "model_name": "Undi95/ReMM-Mistral-13B", + "model_sha": "a5ef9385d9430a81778183d71b58eb2b869d6a7e", + "model_size": "24.32 GB", + "model_dtype": "torch.float16", + "lighteval_sha": "0f318ecf002208468154899217b3ba7c6ae09374", + 
"num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|arc:challenge|25": { + "acc": 0.5895904436860068, + "acc_stderr": 0.014374922192642664, + "acc_norm": 0.6220136518771331, + "acc_norm_stderr": 0.0141696645203031 + }, + "harness|hellaswag|10": { + "acc": 0.642899820752838, + "acc_stderr": 0.004781654610857137, + "acc_norm": 0.8381796454889464, + "acc_norm_stderr": 0.0036753325906810734 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.36, + "acc_stderr": 0.04824181513244218, + "acc_norm": 0.36, + "acc_norm_stderr": 0.04824181513244218 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.4962962962962963, + "acc_stderr": 0.04319223625811331, + "acc_norm": 0.4962962962962963, + "acc_norm_stderr": 0.04319223625811331 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.5328947368421053, + "acc_stderr": 0.040601270352363966, + "acc_norm": 0.5328947368421053, + "acc_norm_stderr": 0.040601270352363966 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.56, + "acc_stderr": 0.04988876515698589, + "acc_norm": 0.56, + "acc_norm_stderr": 0.04988876515698589 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.5660377358490566, + "acc_stderr": 0.030503292013342592, + "acc_norm": 0.5660377358490566, + "acc_norm_stderr": 0.030503292013342592 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.5972222222222222, + "acc_stderr": 0.04101405519842426, + "acc_norm": 0.5972222222222222, + "acc_norm_stderr": 0.04101405519842426 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.33, + "acc_stderr": 0.04725815626252604, + "acc_norm": 0.33, + "acc_norm_stderr": 0.04725815626252604 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.46, + "acc_stderr": 0.05009082659620332, + "acc_norm": 0.46, + "acc_norm_stderr": 0.05009082659620332 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.31, + "acc_stderr": 0.04648231987117316, + "acc_norm": 0.31, + "acc_norm_stderr": 0.04648231987117316 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.5202312138728323, + "acc_stderr": 0.03809342081273957, + "acc_norm": 0.5202312138728323, + "acc_norm_stderr": 0.03809342081273957 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.2549019607843137, + "acc_stderr": 0.043364327079931785, + "acc_norm": 0.2549019607843137, + "acc_norm_stderr": 0.043364327079931785 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.69, + "acc_stderr": 0.04648231987117316, + "acc_norm": 0.69, + "acc_norm_stderr": 0.04648231987117316 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.49361702127659574, + "acc_stderr": 0.032683358999363366, + "acc_norm": 0.49361702127659574, + "acc_norm_stderr": 0.032683358999363366 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.3157894736842105, + "acc_stderr": 0.04372748290278007, + "acc_norm": 0.3157894736842105, + "acc_norm_stderr": 0.04372748290278007 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.496551724137931, + "acc_stderr": 0.04166567577101579, + "acc_norm": 0.496551724137931, + "acc_norm_stderr": 0.04166567577101579 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.3439153439153439, + "acc_stderr": 0.024464426625596433, + "acc_norm": 0.3439153439153439, + "acc_norm_stderr": 0.024464426625596433 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.30158730158730157, + "acc_stderr": 
0.04104947269903394, + "acc_norm": 0.30158730158730157, + "acc_norm_stderr": 0.04104947269903394 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.39, + "acc_stderr": 0.04902071300001975, + "acc_norm": 0.39, + "acc_norm_stderr": 0.04902071300001975 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.6419354838709678, + "acc_stderr": 0.027273890594300645, + "acc_norm": 0.6419354838709678, + "acc_norm_stderr": 0.027273890594300645 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.41379310344827586, + "acc_stderr": 0.03465304488406795, + "acc_norm": 0.41379310344827586, + "acc_norm_stderr": 0.03465304488406795 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.55, + "acc_stderr": 0.04999999999999999, + "acc_norm": 0.55, + "acc_norm_stderr": 0.04999999999999999 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.6727272727272727, + "acc_stderr": 0.03663974994391244, + "acc_norm": 0.6727272727272727, + "acc_norm_stderr": 0.03663974994391244 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.6919191919191919, + "acc_stderr": 0.032894773300986155, + "acc_norm": 0.6919191919191919, + "acc_norm_stderr": 0.032894773300986155 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.7772020725388601, + "acc_stderr": 0.030031147977641538, + "acc_norm": 0.7772020725388601, + "acc_norm_stderr": 0.030031147977641538 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.5128205128205128, + "acc_stderr": 0.025342671293807257, + "acc_norm": 0.5128205128205128, + "acc_norm_stderr": 0.025342671293807257 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.3296296296296296, + "acc_stderr": 0.028661201116524575, + "acc_norm": 0.3296296296296296, + "acc_norm_stderr": 0.028661201116524575 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.592436974789916, + "acc_stderr": 0.03191863374478464, + "acc_norm": 0.592436974789916, + "acc_norm_stderr": 0.03191863374478464 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.31125827814569534, + "acc_stderr": 0.03780445850526733, + "acc_norm": 0.31125827814569534, + "acc_norm_stderr": 0.03780445850526733 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.7229357798165138, + "acc_stderr": 0.01918848259016953, + "acc_norm": 0.7229357798165138, + "acc_norm_stderr": 0.01918848259016953 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.37037037037037035, + "acc_stderr": 0.03293377139415191, + "acc_norm": 0.37037037037037035, + "acc_norm_stderr": 0.03293377139415191 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.7598039215686274, + "acc_stderr": 0.02998373305591362, + "acc_norm": 0.7598039215686274, + "acc_norm_stderr": 0.02998373305591362 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.7552742616033755, + "acc_stderr": 0.027985699387036423, + "acc_norm": 0.7552742616033755, + "acc_norm_stderr": 0.027985699387036423 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.6860986547085202, + "acc_stderr": 0.031146796482972465, + "acc_norm": 0.6860986547085202, + "acc_norm_stderr": 0.031146796482972465 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.648854961832061, + "acc_stderr": 0.04186445163013751, + "acc_norm": 0.648854961832061, + "acc_norm_stderr": 0.04186445163013751 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.7107438016528925, + 
"acc_stderr": 0.04139112727635463, + "acc_norm": 0.7107438016528925, + "acc_norm_stderr": 0.04139112727635463 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.7222222222222222, + "acc_stderr": 0.043300437496507416, + "acc_norm": 0.7222222222222222, + "acc_norm_stderr": 0.043300437496507416 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.6748466257668712, + "acc_stderr": 0.036803503712864616, + "acc_norm": 0.6748466257668712, + "acc_norm_stderr": 0.036803503712864616 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.39285714285714285, + "acc_stderr": 0.04635550135609976, + "acc_norm": 0.39285714285714285, + "acc_norm_stderr": 0.04635550135609976 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.6796116504854369, + "acc_stderr": 0.04620284082280042, + "acc_norm": 0.6796116504854369, + "acc_norm_stderr": 0.04620284082280042 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.7948717948717948, + "acc_stderr": 0.02645350805404033, + "acc_norm": 0.7948717948717948, + "acc_norm_stderr": 0.02645350805404033 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.57, + "acc_stderr": 0.049756985195624284, + "acc_norm": 0.57, + "acc_norm_stderr": 0.049756985195624284 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.7611749680715197, + "acc_stderr": 0.015246803197398679, + "acc_norm": 0.7611749680715197, + "acc_norm_stderr": 0.015246803197398679 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.6242774566473989, + "acc_stderr": 0.02607431485165708, + "acc_norm": 0.6242774566473989, + "acc_norm_stderr": 0.02607431485165708 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.39329608938547483, + "acc_stderr": 0.016337268694270105, + "acc_norm": 0.39329608938547483, + "acc_norm_stderr": 0.016337268694270105 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.6176470588235294, + "acc_stderr": 0.027826109307283697, + "acc_norm": 0.6176470588235294, + "acc_norm_stderr": 0.027826109307283697 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.6366559485530546, + "acc_stderr": 0.02731684767419271, + "acc_norm": 0.6366559485530546, + "acc_norm_stderr": 0.02731684767419271 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.6172839506172839, + "acc_stderr": 0.02704453813840261, + "acc_norm": 0.6172839506172839, + "acc_norm_stderr": 0.02704453813840261 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.4219858156028369, + "acc_stderr": 0.029462189233370593, + "acc_norm": 0.4219858156028369, + "acc_norm_stderr": 0.029462189233370593 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.42242503259452413, + "acc_stderr": 0.012615600475734921, + "acc_norm": 0.42242503259452413, + "acc_norm_stderr": 0.012615600475734921 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.5183823529411765, + "acc_stderr": 0.030352303395351964, + "acc_norm": 0.5183823529411765, + "acc_norm_stderr": 0.030352303395351964 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.5669934640522876, + "acc_stderr": 0.020045442473324224, + "acc_norm": 0.5669934640522876, + "acc_norm_stderr": 0.020045442473324224 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.6909090909090909, + "acc_stderr": 0.044262946482000985, + "acc_norm": 0.6909090909090909, + "acc_norm_stderr": 0.044262946482000985 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.6204081632653061, + "acc_stderr": 0.031067211262872464, + "acc_norm": 0.6204081632653061, + 
"acc_norm_stderr": 0.031067211262872464 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.6666666666666666, + "acc_stderr": 0.03333333333333335, + "acc_norm": 0.6666666666666666, + "acc_norm_stderr": 0.03333333333333335 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.83, + "acc_stderr": 0.03775251680686371, + "acc_norm": 0.83, + "acc_norm_stderr": 0.03775251680686371 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.4457831325301205, + "acc_stderr": 0.03869543323472101, + "acc_norm": 0.4457831325301205, + "acc_norm_stderr": 0.03869543323472101 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.7602339181286549, + "acc_stderr": 0.03274485211946956, + "acc_norm": 0.7602339181286549, + "acc_norm_stderr": 0.03274485211946956 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.37454100367197063, + "mc1_stderr": 0.016943535128405324, + "mc2": 0.5331836105073876, + "mc2_stderr": 0.015629704316856213 + }, + "all": { + "acc": 0.5564046120566463, + "acc_stderr": 0.03450414630343969, + "acc_norm": 0.560263985496091, + "acc_norm_stderr": 0.03448191613915977, + "mc1": 0.37454100367197063, + "mc1_stderr": 0.016943535128405324, + "mc2": 0.5331836105073876, + "mc2_stderr": 0.015629704316856213 + } + }, + "versions": { + "harness|arc:challenge|25": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + 
"harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "all": 0 + }, + "config_tasks": { + "harness|arc:challenge": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness 
task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task" + }, + "summary_tasks": { + "harness|arc:challenge|25": { + "hashes": { + "hash_examples": "17b0cae357c0259e", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "3722289b79076c44", + "hash_cont_tokens": "e8abf848493b50f7" + }, + "truncated": 0, + "non-truncated": 4687, + "padded": 4687, + "non-padded": 0, + "effective_few_shots": 25.0, + "num_truncated_few_shots": 0 + }, + "harness|hellaswag|10": { + "hashes": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "ececd684171f1ef2", + "hash_cont_tokens": "9fe0a5c42e1532db" + }, + "truncated": 0, + "non-truncated": 40168, + "padded": 40113, + "non-padded": 55, + "effective_few_shots": 10.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hashes": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "c54ff61ad0273dd7", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-anatomy|5": { + "hashes": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "be31a1e22aef5f90", + "hash_cont_tokens": "f11971a765cb609f" + }, + "truncated": 0, + "non-truncated": 540, + "padded": 540, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-astronomy|5": { + "hashes": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "277a7b1fad566940", + "hash_cont_tokens": "440a970fadecdc7b" + }, + "truncated": 0, + "non-truncated": 608, + "padded": 608, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-business_ethics|5": { + "hashes": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "ba552605bc116de5", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hashes": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "428c7563d0b98ab9", + "hash_cont_tokens": "7ecd60c25b9bfe5b" + }, + "truncated": 0, + "non-truncated": 1060, + "padded": 1060, + "non-padded": 0, + 
"effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_biology|5": { + "hashes": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "da036601573942e2", + "hash_cont_tokens": "875cde3af7a0ee14" + }, + "truncated": 0, + "non-truncated": 576, + "padded": 576, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_chemistry|5": { + "hashes": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "94e0196d6aded13d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_computer_science|5": { + "hashes": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "6e4d0f4a8d36690b", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_mathematics|5": { + "hashes": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "614054d17109a25d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_medicine|5": { + "hashes": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "081bb2b524defd1c", + "hash_cont_tokens": "702fb6d82ff0d6ac" + }, + "truncated": 0, + "non-truncated": 692, + "padded": 692, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_physics|5": { + "hashes": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "5421d9a1af86cbd4", + "hash_cont_tokens": "f7b8097afc16a47c" + }, + "truncated": 0, + "non-truncated": 408, + "padded": 408, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-computer_security|5": { + "hashes": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "5e6b70ecb333cf18", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hashes": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "c2ef11a87264ceed", + "hash_cont_tokens": "aa0e8bc655f2f641" + }, + "truncated": 0, + "non-truncated": 940, + "padded": 940, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-econometrics|5": { + "hashes": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "ecaccd912a4c3978", + "hash_cont_tokens": "b1cc6e7e9fcd3827" + }, + "truncated": 0, + "non-truncated": 456, + "padded": 456, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hashes": { + "hash_examples": 
"e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "1590c84291399be8", + "hash_cont_tokens": "2425a3f084a591ef" + }, + "truncated": 0, + "non-truncated": 580, + "padded": 580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hashes": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "3269597f715b0da1", + "hash_cont_tokens": "bd87bf0c060fd925" + }, + "truncated": 0, + "non-truncated": 1512, + "padded": 1512, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-formal_logic|5": { + "hashes": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "a2800d20f3ab8d7c", + "hash_cont_tokens": "eb8932890e0605db" + }, + "truncated": 0, + "non-truncated": 504, + "padded": 504, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-global_facts|5": { + "hashes": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "94ed44b3772505ad", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_biology|5": { + "hashes": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "24423acb928db768", + "hash_cont_tokens": "1ddcb86d28cde266" + }, + "truncated": 0, + "non-truncated": 1240, + "padded": 1240, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hashes": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "831ff35c474e5cef", + "hash_cont_tokens": "176c8dcff38c5f8f" + }, + "truncated": 0, + "non-truncated": 812, + "padded": 812, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hashes": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "a20a96b44dcc5b30", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hashes": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "5002f4ac8b1562ca", + "hash_cont_tokens": "674fc454bdc5ac93" + }, + "truncated": 0, + "non-truncated": 660, + "padded": 656, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_geography|5": { + "hashes": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "7c5547c7da5bc793", + "hash_cont_tokens": "03a5012b916274ea" + }, + "truncated": 0, + "non-truncated": 792, + "padded": 792, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hashes": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": 
"f62991cb6a496b05", + "hash_cont_tokens": "873d2aab226ba1d8" + }, + "truncated": 0, + "non-truncated": 772, + "padded": 772, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hashes": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "4cef2aff6e3d59ed", + "hash_cont_tokens": "c583432ad27fcfe0" + }, + "truncated": 0, + "non-truncated": 1560, + "padded": 1560, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hashes": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "6e2577ea4082ed2b", + "hash_cont_tokens": "d7907b61bcb8c123" + }, + "truncated": 0, + "non-truncated": 1080, + "padded": 1080, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hashes": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "c5fc9aeb1079c8e4", + "hash_cont_tokens": "f47f041de50333b9" + }, + "truncated": 0, + "non-truncated": 952, + "padded": 952, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_physics|5": { + "hashes": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "555fc385cffa84ca", + "hash_cont_tokens": "0d56317b3e5eedb5" + }, + "truncated": 0, + "non-truncated": 604, + "padded": 604, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hashes": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "febd23cbf9973b7f", + "hash_cont_tokens": "09ba1243e7390c0f" + }, + "truncated": 0, + "non-truncated": 2180, + "padded": 2180, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hashes": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "400e55b56ee6fbd7", + "hash_cont_tokens": "9cc29889c3d3f77d" + }, + "truncated": 0, + "non-truncated": 864, + "padded": 864, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hashes": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "c639cce12a46ebad", + "hash_cont_tokens": "cdd0b3dc06d933e5" + }, + "truncated": 0, + "non-truncated": 816, + "padded": 816, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hashes": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "b9762065cce6f3a6", + "hash_cont_tokens": "e02816433ff28daf" + }, + "truncated": 0, + "non-truncated": 948, + "padded": 948, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_aging|5": { + "hashes": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "541a75f071dcf579", + "hash_cont_tokens": "142a4a8a1138a214" + }, + "truncated": 0, + "non-truncated": 
892, + "padded": 892, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_sexuality|5": { + "hashes": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "04269e5c5a257dd9", + "hash_cont_tokens": "bc54813e809b796d" + }, + "truncated": 0, + "non-truncated": 524, + "padded": 524, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-international_law|5": { + "hashes": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "d93ba9d9d38e4397", + "hash_cont_tokens": "8ea8c5ff76a15bca" + }, + "truncated": 0, + "non-truncated": 484, + "padded": 484, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-jurisprudence|5": { + "hashes": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "9eeaccd2698b4f5a", + "hash_cont_tokens": "e3a8cd951b6e3469" + }, + "truncated": 0, + "non-truncated": 432, + "padded": 432, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hashes": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "b4f08f544f2b7576", + "hash_cont_tokens": "3e9e0bdc248fd88a" + }, + "truncated": 0, + "non-truncated": 652, + "padded": 648, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-machine_learning|5": { + "hashes": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "900c2a51f1174b9f", + "hash_cont_tokens": "55b12fb138c6a064" + }, + "truncated": 0, + "non-truncated": 448, + "padded": 448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-management|5": { + "hashes": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "6b36efb4689c6eca", + "hash_cont_tokens": "a01d6d39a83c4597" + }, + "truncated": 0, + "non-truncated": 412, + "padded": 412, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-marketing|5": { + "hashes": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "2aaac78a0cfed47a", + "hash_cont_tokens": "6aeaed4d823c98aa" + }, + "truncated": 0, + "non-truncated": 936, + "padded": 936, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-medical_genetics|5": { + "hashes": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "886ca823b41c094a", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-miscellaneous|5": { + "hashes": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "72fd71de7675e7d0", + "hash_cont_tokens": "9b0ab02a64603081" + }, + "truncated": 0, + "non-truncated": 3132, + "padded": 3132, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_disputes|5": { + "hashes": { + 
"hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "f3ca0dd8e7a1eb09", + "hash_cont_tokens": "3b8bbe9108e55ce9" + }, + "truncated": 0, + "non-truncated": 1384, + "padded": 1354, + "non-padded": 30, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hashes": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "3e793631e951f23c", + "hash_cont_tokens": "3e9bfc0362e97330" + }, + "truncated": 0, + "non-truncated": 3580, + "padded": 3580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-nutrition|5": { + "hashes": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "59753c2144ea93af", + "hash_cont_tokens": "23b2dc6ee2da4cfc" + }, + "truncated": 0, + "non-truncated": 1224, + "padded": 1224, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-philosophy|5": { + "hashes": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "bd8d3dbed15a8c34", + "hash_cont_tokens": "9f6ff69d23a48783" + }, + "truncated": 0, + "non-truncated": 1244, + "padded": 1244, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-prehistory|5": { + "hashes": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "3573cd87facbb7c5", + "hash_cont_tokens": "d6458d743d875837" + }, + "truncated": 0, + "non-truncated": 1296, + "padded": 1296, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_accounting|5": { + "hashes": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "17e721bc1a7cbb47", + "hash_cont_tokens": "922a195f53a35662" + }, + "truncated": 0, + "non-truncated": 1128, + "padded": 1128, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_law|5": { + "hashes": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "c9f7583fff66d361", + "hash_cont_tokens": "2e590029ef41fbcd" + }, + "truncated": 0, + "non-truncated": 6136, + "padded": 6136, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_medicine|5": { + "hashes": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "40a933f829116f8d", + "hash_cont_tokens": "7cfee54dbddd5a98" + }, + "truncated": 0, + "non-truncated": 1088, + "padded": 1088, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_psychology|5": { + "hashes": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "0dfb73a8eb3f692c", + "hash_cont_tokens": "a86677b2a45c20e1" + }, + "truncated": 0, + "non-truncated": 2448, + "padded": 2448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-public_relations|5": { + "hashes": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "1710c6ba4c9f3cbd", + 
"hash_cont_tokens": "0d756ccaae031757" + }, + "truncated": 0, + "non-truncated": 440, + "padded": 440, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-security_studies|5": { + "hashes": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "32a03f1f22a6e103", + "hash_cont_tokens": "b2229bc2cfbf594b" + }, + "truncated": 0, + "non-truncated": 980, + "padded": 980, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-sociology|5": { + "hashes": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "828999f7624cbe7e", + "hash_cont_tokens": "c3a3bdfd177eed5b" + }, + "truncated": 0, + "non-truncated": 804, + "padded": 804, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hashes": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "42054621e718dbee", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-virology|5": { + "hashes": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "6c4f0aa4dc859c04", + "hash_cont_tokens": "af8b3658088cb37f" + }, + "truncated": 0, + "non-truncated": 664, + "padded": 664, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-world_religions|5": { + "hashes": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "6c75d44e092ff24f", + "hash_cont_tokens": "060118bef6de4e0a" + }, + "truncated": 0, + "non-truncated": 684, + "padded": 684, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|truthfulqa:mc|0": { + "hashes": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "2738d7ed7075faa7", + "hash_cont_tokens": "f5da56a132aab151" + }, + "truncated": 0, + "non-truncated": 9996, + "padded": 9996, + "non-padded": 0, + "effective_few_shots": 0.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "d84d18e9a963753d", + "hash_full_prompts": "12b540783521a8e6", + "hash_input_tokens": "5c73a7dce6ccf737", + "hash_cont_tokens": "71d56183130fecbd" + }, + "total_evaluation_time_secondes": "6395.730857849121", + "truncated": 0, + "non-truncated": 111019, + "padded": 110926, + "non-padded": 93, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/Undi95/ReMM-Mistral-13B/results_2023-10-27T13-48-21.267659.json b/eval-results/Undi95/ReMM-Mistral-13B/results_2023-10-27T13-48-21.267659.json new file mode 100644 index 0000000000000000000000000000000000000000..febe784935c92454d2f4cfca2b465fb49529a5d1 --- /dev/null +++ b/eval-results/Undi95/ReMM-Mistral-13B/results_2023-10-27T13-48-21.267659.json @@ -0,0 +1,107 @@ +{ + "config_general": { + "model_name": "Undi95/ReMM-Mistral-13B", + "model_sha": "a5ef9385d9430a81778183d71b58eb2b869d6a7e", + "model_size": "24.32 GB", + "model_dtype": "torch.float16", + "lighteval_sha": "0f318ecf002208468154899217b3ba7c6ae09374", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + 
"override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|drop|3": { + "em": 0.20679530201342283, + "em_stderr": 0.004147654995169029, + "f1": 0.2796350671140937, + "f1_stderr": 0.004133652397455312 + }, + "harness|gsm8k|5": { + "acc": 0.12054586808188021, + "acc_stderr": 0.008968608285309076 + }, + "harness|winogrande|5": { + "acc": 0.745067087608524, + "acc_stderr": 0.012248806969376422 + }, + "all": { + "em": 0.20679530201342283, + "em_stderr": 0.004147654995169029, + "f1": 0.2796350671140937, + "f1_stderr": 0.004133652397455312, + "acc": 0.4328064778452021, + "acc_stderr": 0.01060870762734275 + } + }, + "versions": { + "harness|drop|3": 1, + "harness|gsm8k|5": 0, + "harness|winogrande|5": 0, + "all": 0 + }, + "config_tasks": { + "harness|drop": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|drop|3": { + "hashes": { + "hash_examples": "1d27416e8324e9a3", + "hash_full_prompts": "a5513ff9a741b385", + "hash_input_tokens": "42076f0efbb50aa6", + "hash_cont_tokens": "d7b665ed0680d44a" + }, + "truncated": 3, + "non-truncated": 9533, + "padded": 0, + "non-padded": 9536, + "effective_few_shots": 3.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "bda342e47b5099b2", + "hash_cont_tokens": "fd1b737328be69d9" + }, + "truncated": 0, + "non-truncated": 1319, + "padded": 0, + "non-padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "c0bedf98cb040854", + "hash_cont_tokens": "f08975ad6f2d5864" + }, + "truncated": 0, + "non-truncated": 2534, + "padded": 2432, + "non-padded": 102, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "9b4d8993161e637d", + "hash_full_prompts": "08215e527b7e60a5", + "hash_input_tokens": "a12f3e3c934bd78b", + "hash_cont_tokens": "99a5c50462405ca8" + }, + "total_evaluation_time_secondes": "12695.07842206955", + "truncated": 3, + "non-truncated": 13386, + "padded": 2432, + "non-padded": 10957, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/Undi95/ReMM-SLERP-L2-13B/results_2023-09-06T13-42-48.770616.json b/eval-results/Undi95/ReMM-SLERP-L2-13B/results_2023-09-06T13-42-48.770616.json new file mode 100644 index 0000000000000000000000000000000000000000..2d86f4947801e87e509b06a1d89e402c6f83cf0f --- /dev/null +++ b/eval-results/Undi95/ReMM-SLERP-L2-13B/results_2023-09-06T13-42-48.770616.json @@ -0,0 +1,1367 @@ +{ + "config_general": { + "model_name": "Undi95/ReMM-SLERP-L2-13B", + "model_sha": "27baccf242bc1dc34fc39661a40bbf867cbea8b5", + "model_size": "24.32 GB", + "model_dtype": "torch.float16", + "lighteval_sha": "eb6042f1c54cd5c7c50bab83f4b4bd56b534afb6", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|arc:challenge|25": { + "acc": 0.5827645051194539, + "acc_stderr": 0.01440982551840308, + "acc_norm": 0.6092150170648464, + "acc_norm_stderr": 0.01425856388051378 + }, + "harness|hellaswag|10": { + "acc": 0.642302330213105, + "acc_stderr": 0.004783428874273592, + "acc_norm": 0.8355905198167696, + "acc_norm_stderr": 0.0036988923883801003 + }, 
+ "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.37, + "acc_stderr": 0.04852365870939099, + "acc_norm": 0.37, + "acc_norm_stderr": 0.04852365870939099 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.5259259259259259, + "acc_stderr": 0.04313531696750575, + "acc_norm": 0.5259259259259259, + "acc_norm_stderr": 0.04313531696750575 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.5197368421052632, + "acc_stderr": 0.040657710025626036, + "acc_norm": 0.5197368421052632, + "acc_norm_stderr": 0.040657710025626036 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.51, + "acc_stderr": 0.05024183937956912, + "acc_norm": 0.51, + "acc_norm_stderr": 0.05024183937956912 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.5471698113207547, + "acc_stderr": 0.030635627957961823, + "acc_norm": 0.5471698113207547, + "acc_norm_stderr": 0.030635627957961823 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.5902777777777778, + "acc_stderr": 0.04112490974670788, + "acc_norm": 0.5902777777777778, + "acc_norm_stderr": 0.04112490974670788 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.33, + "acc_stderr": 0.04725815626252604, + "acc_norm": 0.33, + "acc_norm_stderr": 0.04725815626252604 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.43, + "acc_stderr": 0.049756985195624284, + "acc_norm": 0.43, + "acc_norm_stderr": 0.049756985195624284 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.33, + "acc_stderr": 0.047258156262526045, + "acc_norm": 0.33, + "acc_norm_stderr": 0.047258156262526045 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.5202312138728323, + "acc_stderr": 0.03809342081273957, + "acc_norm": 0.5202312138728323, + "acc_norm_stderr": 0.03809342081273957 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.21568627450980393, + "acc_stderr": 0.04092563958237656, + "acc_norm": 0.21568627450980393, + "acc_norm_stderr": 0.04092563958237656 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.69, + "acc_stderr": 0.04648231987117316, + "acc_norm": 0.69, + "acc_norm_stderr": 0.04648231987117316 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.4765957446808511, + "acc_stderr": 0.032650194750335815, + "acc_norm": 0.4765957446808511, + "acc_norm_stderr": 0.032650194750335815 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.2982456140350877, + "acc_stderr": 0.04303684033537315, + "acc_norm": 0.2982456140350877, + "acc_norm_stderr": 0.04303684033537315 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.503448275862069, + "acc_stderr": 0.041665675771015785, + "acc_norm": 0.503448275862069, + "acc_norm_stderr": 0.041665675771015785 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.30952380952380953, + "acc_stderr": 0.023809523809523864, + "acc_norm": 0.30952380952380953, + "acc_norm_stderr": 0.023809523809523864 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.3333333333333333, + "acc_stderr": 0.04216370213557835, + "acc_norm": 0.3333333333333333, + "acc_norm_stderr": 0.04216370213557835 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.38, + "acc_stderr": 0.04878317312145632, + "acc_norm": 0.38, + "acc_norm_stderr": 0.04878317312145632 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.635483870967742, + "acc_stderr": 0.027379871229943255, + "acc_norm": 0.635483870967742, + "acc_norm_stderr": 0.027379871229943255 + }, + 
"harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.4088669950738916, + "acc_stderr": 0.034590588158832314, + "acc_norm": 0.4088669950738916, + "acc_norm_stderr": 0.034590588158832314 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.56, + "acc_stderr": 0.049888765156985884, + "acc_norm": 0.56, + "acc_norm_stderr": 0.049888765156985884 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.6848484848484848, + "acc_stderr": 0.0362773057502241, + "acc_norm": 0.6848484848484848, + "acc_norm_stderr": 0.0362773057502241 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.6868686868686869, + "acc_stderr": 0.033042050878136525, + "acc_norm": 0.6868686868686869, + "acc_norm_stderr": 0.033042050878136525 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.772020725388601, + "acc_stderr": 0.030276909945178277, + "acc_norm": 0.772020725388601, + "acc_norm_stderr": 0.030276909945178277 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.5128205128205128, + "acc_stderr": 0.025342671293807257, + "acc_norm": 0.5128205128205128, + "acc_norm_stderr": 0.025342671293807257 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.32592592592592595, + "acc_stderr": 0.02857834836547308, + "acc_norm": 0.32592592592592595, + "acc_norm_stderr": 0.02857834836547308 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.5756302521008403, + "acc_stderr": 0.032104790510157764, + "acc_norm": 0.5756302521008403, + "acc_norm_stderr": 0.032104790510157764 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.32450331125827814, + "acc_stderr": 0.038227469376587525, + "acc_norm": 0.32450331125827814, + "acc_norm_stderr": 0.038227469376587525 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.7192660550458716, + "acc_stderr": 0.01926605504587161, + "acc_norm": 0.7192660550458716, + "acc_norm_stderr": 0.01926605504587161 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.38425925925925924, + "acc_stderr": 0.03317354514310742, + "acc_norm": 0.38425925925925924, + "acc_norm_stderr": 0.03317354514310742 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.7647058823529411, + "acc_stderr": 0.029771775228145635, + "acc_norm": 0.7647058823529411, + "acc_norm_stderr": 0.029771775228145635 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.759493670886076, + "acc_stderr": 0.02782078198114968, + "acc_norm": 0.759493670886076, + "acc_norm_stderr": 0.02782078198114968 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.6860986547085202, + "acc_stderr": 0.031146796482972465, + "acc_norm": 0.6860986547085202, + "acc_norm_stderr": 0.031146796482972465 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.6259541984732825, + "acc_stderr": 0.042438692422305246, + "acc_norm": 0.6259541984732825, + "acc_norm_stderr": 0.042438692422305246 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.7355371900826446, + "acc_stderr": 0.040261875275912073, + "acc_norm": 0.7355371900826446, + "acc_norm_stderr": 0.040261875275912073 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.7222222222222222, + "acc_stderr": 0.04330043749650741, + "acc_norm": 0.7222222222222222, + "acc_norm_stderr": 0.04330043749650741 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.6748466257668712, + "acc_stderr": 0.036803503712864616, + "acc_norm": 0.6748466257668712, + 
"acc_norm_stderr": 0.036803503712864616 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.375, + "acc_stderr": 0.04595091388086298, + "acc_norm": 0.375, + "acc_norm_stderr": 0.04595091388086298 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.6796116504854369, + "acc_stderr": 0.04620284082280042, + "acc_norm": 0.6796116504854369, + "acc_norm_stderr": 0.04620284082280042 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.7991452991452992, + "acc_stderr": 0.026246772946890488, + "acc_norm": 0.7991452991452992, + "acc_norm_stderr": 0.026246772946890488 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.6, + "acc_stderr": 0.049236596391733084, + "acc_norm": 0.6, + "acc_norm_stderr": 0.049236596391733084 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.768837803320562, + "acc_stderr": 0.015075523238101072, + "acc_norm": 0.768837803320562, + "acc_norm_stderr": 0.015075523238101072 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.6242774566473989, + "acc_stderr": 0.02607431485165708, + "acc_norm": 0.6242774566473989, + "acc_norm_stderr": 0.02607431485165708 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.43910614525139663, + "acc_stderr": 0.016598022120580418, + "acc_norm": 0.43910614525139663, + "acc_norm_stderr": 0.016598022120580418 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.5915032679738562, + "acc_stderr": 0.028146405993096358, + "acc_norm": 0.5915032679738562, + "acc_norm_stderr": 0.028146405993096358 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.6430868167202572, + "acc_stderr": 0.027210420375934023, + "acc_norm": 0.6430868167202572, + "acc_norm_stderr": 0.027210420375934023 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.6265432098765432, + "acc_stderr": 0.026915003011380154, + "acc_norm": 0.6265432098765432, + "acc_norm_stderr": 0.026915003011380154 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.40070921985815605, + "acc_stderr": 0.029233465745573083, + "acc_norm": 0.40070921985815605, + "acc_norm_stderr": 0.029233465745573083 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.42698826597131684, + "acc_stderr": 0.012633353557534423, + "acc_norm": 0.42698826597131684, + "acc_norm_stderr": 0.012633353557534423 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.5036764705882353, + "acc_stderr": 0.030372015885428195, + "acc_norm": 0.5036764705882353, + "acc_norm_stderr": 0.030372015885428195 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.5751633986928104, + "acc_stderr": 0.019997973035458333, + "acc_norm": 0.5751633986928104, + "acc_norm_stderr": 0.019997973035458333 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.6727272727272727, + "acc_stderr": 0.04494290866252091, + "acc_norm": 0.6727272727272727, + "acc_norm_stderr": 0.04494290866252091 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.6204081632653061, + "acc_stderr": 0.03106721126287247, + "acc_norm": 0.6204081632653061, + "acc_norm_stderr": 0.03106721126287247 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.6915422885572139, + "acc_stderr": 0.03265819588512699, + "acc_norm": 0.6915422885572139, + "acc_norm_stderr": 0.03265819588512699 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.82, + "acc_stderr": 0.038612291966536934, + "acc_norm": 0.82, + "acc_norm_stderr": 0.038612291966536934 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.463855421686747, + "acc_stderr": 
0.03882310850890594, + "acc_norm": 0.463855421686747, + "acc_norm_stderr": 0.03882310850890594 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.7777777777777778, + "acc_stderr": 0.031885780176863984, + "acc_norm": 0.7777777777777778, + "acc_norm_stderr": 0.031885780176863984 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.3659730722154223, + "mc1_stderr": 0.01686294168408838, + "mc2": 0.5197363921890529, + "mc2_stderr": 0.015737419947776412 + }, + "all": { + "acc": 0.5553314222182426, + "acc_stderr": 0.034457143336673, + "acc_norm": 0.5590558069902605, + "acc_norm_stderr": 0.03443619760576142, + "mc1": 0.3659730722154223, + "mc1_stderr": 0.01686294168408838, + "mc2": 0.5197363921890529, + "mc2_stderr": 0.015737419947776412 + } + }, + "versions": { + "harness|arc:challenge|25": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 
1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "all": 0 + }, + "config_tasks": { + "harness|arc:challenge": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + 
"harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task" + }, + "summary_tasks": { + "harness|arc:challenge|25": { + "hashes": { + "hash_examples": "17b0cae357c0259e", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "3722289b79076c44", + "hash_cont_tokens": "e8abf848493b50f7" + }, + "truncated": 0, + "non-truncated": 4687, + "padded": 4687, + "non-padded": 0, + "effective_few_shots": 25.0, + "num_truncated_few_shots": 0 + }, + "harness|hellaswag|10": { + "hashes": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "ececd684171f1ef2", + "hash_cont_tokens": "9fe0a5c42e1532db" + }, + "truncated": 0, + "non-truncated": 40168, + "padded": 40113, + "non-padded": 55, + "effective_few_shots": 10.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hashes": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "c54ff61ad0273dd7", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-anatomy|5": { + "hashes": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "be31a1e22aef5f90", + "hash_cont_tokens": "f11971a765cb609f" + }, + "truncated": 0, + "non-truncated": 540, + "padded": 540, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-astronomy|5": { + "hashes": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "277a7b1fad566940", + "hash_cont_tokens": "440a970fadecdc7b" + }, + "truncated": 0, + "non-truncated": 608, + "padded": 608, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-business_ethics|5": { + "hashes": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "ba552605bc116de5", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hashes": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "428c7563d0b98ab9", + "hash_cont_tokens": "7ecd60c25b9bfe5b" + }, + "truncated": 0, + "non-truncated": 1060, + "padded": 1060, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_biology|5": { + "hashes": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "da036601573942e2", + "hash_cont_tokens": "875cde3af7a0ee14" + }, + "truncated": 0, + "non-truncated": 576, + "padded": 576, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_chemistry|5": { + "hashes": { + 
"hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "94e0196d6aded13d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_computer_science|5": { + "hashes": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "6e4d0f4a8d36690b", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_mathematics|5": { + "hashes": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "614054d17109a25d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_medicine|5": { + "hashes": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "081bb2b524defd1c", + "hash_cont_tokens": "702fb6d82ff0d6ac" + }, + "truncated": 0, + "non-truncated": 692, + "padded": 692, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_physics|5": { + "hashes": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "5421d9a1af86cbd4", + "hash_cont_tokens": "f7b8097afc16a47c" + }, + "truncated": 0, + "non-truncated": 408, + "padded": 408, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-computer_security|5": { + "hashes": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "5e6b70ecb333cf18", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hashes": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "c2ef11a87264ceed", + "hash_cont_tokens": "aa0e8bc655f2f641" + }, + "truncated": 0, + "non-truncated": 940, + "padded": 940, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-econometrics|5": { + "hashes": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "ecaccd912a4c3978", + "hash_cont_tokens": "b1cc6e7e9fcd3827" + }, + "truncated": 0, + "non-truncated": 456, + "padded": 456, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hashes": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "1590c84291399be8", + "hash_cont_tokens": "2425a3f084a591ef" + }, + "truncated": 0, + "non-truncated": 580, + "padded": 580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hashes": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "3269597f715b0da1", + 
"hash_cont_tokens": "bd87bf0c060fd925" + }, + "truncated": 0, + "non-truncated": 1512, + "padded": 1512, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-formal_logic|5": { + "hashes": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "a2800d20f3ab8d7c", + "hash_cont_tokens": "eb8932890e0605db" + }, + "truncated": 0, + "non-truncated": 504, + "padded": 504, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-global_facts|5": { + "hashes": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "94ed44b3772505ad", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_biology|5": { + "hashes": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "24423acb928db768", + "hash_cont_tokens": "1ddcb86d28cde266" + }, + "truncated": 0, + "non-truncated": 1240, + "padded": 1240, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hashes": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "831ff35c474e5cef", + "hash_cont_tokens": "176c8dcff38c5f8f" + }, + "truncated": 0, + "non-truncated": 812, + "padded": 812, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hashes": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "a20a96b44dcc5b30", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hashes": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "5002f4ac8b1562ca", + "hash_cont_tokens": "674fc454bdc5ac93" + }, + "truncated": 0, + "non-truncated": 660, + "padded": 656, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_geography|5": { + "hashes": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "7c5547c7da5bc793", + "hash_cont_tokens": "03a5012b916274ea" + }, + "truncated": 0, + "non-truncated": 792, + "padded": 792, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hashes": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "f62991cb6a496b05", + "hash_cont_tokens": "873d2aab226ba1d8" + }, + "truncated": 0, + "non-truncated": 772, + "padded": 772, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hashes": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "4cef2aff6e3d59ed", + "hash_cont_tokens": "c583432ad27fcfe0" + }, + "truncated": 0, + "non-truncated": 1560, + "padded": 
1560, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hashes": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "6e2577ea4082ed2b", + "hash_cont_tokens": "d7907b61bcb8c123" + }, + "truncated": 0, + "non-truncated": 1080, + "padded": 1080, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hashes": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "c5fc9aeb1079c8e4", + "hash_cont_tokens": "f47f041de50333b9" + }, + "truncated": 0, + "non-truncated": 952, + "padded": 952, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_physics|5": { + "hashes": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "555fc385cffa84ca", + "hash_cont_tokens": "0d56317b3e5eedb5" + }, + "truncated": 0, + "non-truncated": 604, + "padded": 604, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hashes": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "febd23cbf9973b7f", + "hash_cont_tokens": "09ba1243e7390c0f" + }, + "truncated": 0, + "non-truncated": 2180, + "padded": 2180, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hashes": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "400e55b56ee6fbd7", + "hash_cont_tokens": "9cc29889c3d3f77d" + }, + "truncated": 0, + "non-truncated": 864, + "padded": 864, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hashes": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "c639cce12a46ebad", + "hash_cont_tokens": "cdd0b3dc06d933e5" + }, + "truncated": 0, + "non-truncated": 816, + "padded": 816, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hashes": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "b9762065cce6f3a6", + "hash_cont_tokens": "e02816433ff28daf" + }, + "truncated": 0, + "non-truncated": 948, + "padded": 948, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_aging|5": { + "hashes": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "541a75f071dcf579", + "hash_cont_tokens": "142a4a8a1138a214" + }, + "truncated": 0, + "non-truncated": 892, + "padded": 892, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_sexuality|5": { + "hashes": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "04269e5c5a257dd9", + "hash_cont_tokens": "bc54813e809b796d" + }, + "truncated": 0, + "non-truncated": 524, + "padded": 524, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + 
"harness|hendrycksTest-international_law|5": { + "hashes": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "d93ba9d9d38e4397", + "hash_cont_tokens": "8ea8c5ff76a15bca" + }, + "truncated": 0, + "non-truncated": 484, + "padded": 484, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-jurisprudence|5": { + "hashes": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "9eeaccd2698b4f5a", + "hash_cont_tokens": "e3a8cd951b6e3469" + }, + "truncated": 0, + "non-truncated": 432, + "padded": 432, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hashes": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "b4f08f544f2b7576", + "hash_cont_tokens": "3e9e0bdc248fd88a" + }, + "truncated": 0, + "non-truncated": 652, + "padded": 648, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-machine_learning|5": { + "hashes": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "900c2a51f1174b9f", + "hash_cont_tokens": "55b12fb138c6a064" + }, + "truncated": 0, + "non-truncated": 448, + "padded": 448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-management|5": { + "hashes": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "6b36efb4689c6eca", + "hash_cont_tokens": "a01d6d39a83c4597" + }, + "truncated": 0, + "non-truncated": 412, + "padded": 412, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-marketing|5": { + "hashes": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "2aaac78a0cfed47a", + "hash_cont_tokens": "6aeaed4d823c98aa" + }, + "truncated": 0, + "non-truncated": 936, + "padded": 936, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-medical_genetics|5": { + "hashes": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "886ca823b41c094a", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-miscellaneous|5": { + "hashes": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "72fd71de7675e7d0", + "hash_cont_tokens": "9b0ab02a64603081" + }, + "truncated": 0, + "non-truncated": 3132, + "padded": 3132, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_disputes|5": { + "hashes": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "f3ca0dd8e7a1eb09", + "hash_cont_tokens": "3b8bbe9108e55ce9" + }, + "truncated": 0, + "non-truncated": 1384, + "padded": 1354, + "non-padded": 30, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hashes": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": 
"3e793631e951f23c", + "hash_cont_tokens": "3e9bfc0362e97330" + }, + "truncated": 0, + "non-truncated": 3580, + "padded": 3580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-nutrition|5": { + "hashes": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "59753c2144ea93af", + "hash_cont_tokens": "23b2dc6ee2da4cfc" + }, + "truncated": 0, + "non-truncated": 1224, + "padded": 1224, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-philosophy|5": { + "hashes": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "bd8d3dbed15a8c34", + "hash_cont_tokens": "9f6ff69d23a48783" + }, + "truncated": 0, + "non-truncated": 1244, + "padded": 1244, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-prehistory|5": { + "hashes": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "3573cd87facbb7c5", + "hash_cont_tokens": "d6458d743d875837" + }, + "truncated": 0, + "non-truncated": 1296, + "padded": 1296, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_accounting|5": { + "hashes": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "17e721bc1a7cbb47", + "hash_cont_tokens": "922a195f53a35662" + }, + "truncated": 0, + "non-truncated": 1128, + "padded": 1128, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_law|5": { + "hashes": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "c9f7583fff66d361", + "hash_cont_tokens": "2e590029ef41fbcd" + }, + "truncated": 0, + "non-truncated": 6136, + "padded": 6136, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_medicine|5": { + "hashes": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "40a933f829116f8d", + "hash_cont_tokens": "7cfee54dbddd5a98" + }, + "truncated": 0, + "non-truncated": 1088, + "padded": 1088, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_psychology|5": { + "hashes": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "0dfb73a8eb3f692c", + "hash_cont_tokens": "a86677b2a45c20e1" + }, + "truncated": 0, + "non-truncated": 2448, + "padded": 2448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-public_relations|5": { + "hashes": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "1710c6ba4c9f3cbd", + "hash_cont_tokens": "0d756ccaae031757" + }, + "truncated": 0, + "non-truncated": 440, + "padded": 440, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-security_studies|5": { + "hashes": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "32a03f1f22a6e103", + "hash_cont_tokens": "b2229bc2cfbf594b" + }, + "truncated": 0, + "non-truncated": 980, + "padded": 980, + "non-padded": 0, + 
"effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-sociology|5": { + "hashes": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "828999f7624cbe7e", + "hash_cont_tokens": "c3a3bdfd177eed5b" + }, + "truncated": 0, + "non-truncated": 804, + "padded": 804, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hashes": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "42054621e718dbee", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-virology|5": { + "hashes": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "6c4f0aa4dc859c04", + "hash_cont_tokens": "af8b3658088cb37f" + }, + "truncated": 0, + "non-truncated": 664, + "padded": 664, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-world_religions|5": { + "hashes": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "6c75d44e092ff24f", + "hash_cont_tokens": "060118bef6de4e0a" + }, + "truncated": 0, + "non-truncated": 684, + "padded": 684, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|truthfulqa:mc|0": { + "hashes": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "2738d7ed7075faa7", + "hash_cont_tokens": "f5da56a132aab151" + }, + "truncated": 0, + "non-truncated": 9996, + "padded": 9996, + "non-padded": 0, + "effective_few_shots": 0.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "d84d18e9a963753d", + "hash_full_prompts": "12b540783521a8e6", + "hash_input_tokens": "5c73a7dce6ccf737", + "hash_cont_tokens": "71d56183130fecbd" + }, + "total_evaluation_time_secondes": "6459.55356836319", + "truncated": 0, + "non-truncated": 111019, + "padded": 110926, + "non-padded": 93, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/Undi95/ReMM-SLERP-L2-13B/results_2023-10-29T14-55-07.909290.json b/eval-results/Undi95/ReMM-SLERP-L2-13B/results_2023-10-29T14-55-07.909290.json new file mode 100644 index 0000000000000000000000000000000000000000..217d50aa4ce13ff9c744c7d067076463ce2d7e67 --- /dev/null +++ b/eval-results/Undi95/ReMM-SLERP-L2-13B/results_2023-10-29T14-55-07.909290.json @@ -0,0 +1,107 @@ +{ + "config_general": { + "model_name": "Undi95/ReMM-SLERP-L2-13B", + "model_sha": "a93f9e33323a448f4b910120abb335e3e6a68eab", + "model_size": "24.32 GB", + "model_dtype": "torch.float16", + "lighteval_sha": "0f318ecf002208468154899217b3ba7c6ae09374", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|drop|3": { + "em": 0.13464765100671142, + "em_stderr": 0.0034957110748356193, + "f1": 0.20755138422818709, + "f1_stderr": 0.0036341951060626636 + }, + "harness|gsm8k|5": { + "acc": 0.09173616376042457, + "acc_stderr": 0.00795094214833933 + }, + "harness|winogrande|5": { + "acc": 0.7521704814522494, + "acc_stderr": 0.01213438601986535 + }, + "all": { + "em": 0.13464765100671142, + "em_stderr": 
0.0034957110748356193, + "f1": 0.20755138422818709, + "f1_stderr": 0.0036341951060626636, + "acc": 0.421953322606337, + "acc_stderr": 0.01004266408410234 + } + }, + "versions": { + "harness|drop|3": 1, + "harness|gsm8k|5": 0, + "harness|winogrande|5": 0, + "all": 0 + }, + "config_tasks": { + "harness|drop": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|drop|3": { + "hashes": { + "hash_examples": "1d27416e8324e9a3", + "hash_full_prompts": "a5513ff9a741b385", + "hash_input_tokens": "42076f0efbb50aa6", + "hash_cont_tokens": "db9104513de743c6" + }, + "truncated": 3, + "non-truncated": 9533, + "padded": 0, + "non-padded": 9536, + "effective_few_shots": 3.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "bda342e47b5099b2", + "hash_cont_tokens": "81b3673004c40c57" + }, + "truncated": 0, + "non-truncated": 1319, + "padded": 0, + "non-padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "c0bedf98cb040854", + "hash_cont_tokens": "f08975ad6f2d5864" + }, + "truncated": 0, + "non-truncated": 2534, + "padded": 2432, + "non-padded": 102, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "9b4d8993161e637d", + "hash_full_prompts": "08215e527b7e60a5", + "hash_input_tokens": "a12f3e3c934bd78b", + "hash_cont_tokens": "2205a7514941e111" + }, + "total_evaluation_time_secondes": "40862.28617811203", + "truncated": 3, + "non-truncated": 13386, + "padded": 2432, + "non-padded": 10957, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/Undi95/ReMM-v2-L2-13B/results_2023-09-18T13-58-45.934639.json b/eval-results/Undi95/ReMM-v2-L2-13B/results_2023-09-18T13-58-45.934639.json new file mode 100644 index 0000000000000000000000000000000000000000..e534a0ced09000e9625fc868ff5fe6555792ffb3 --- /dev/null +++ b/eval-results/Undi95/ReMM-v2-L2-13B/results_2023-09-18T13-58-45.934639.json @@ -0,0 +1,1367 @@ +{ + "config_general": { + "model_name": "Undi95/ReMM-v2-L2-13B", + "model_sha": "bc42c77f88482c37c72c85c66135e99972bbca1b", + "model_size": "24.32 GB", + "model_dtype": "torch.float16", + "lighteval_sha": "0f318ecf002208468154899217b3ba7c6ae09374", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|arc:challenge|25": { + "acc": 0.5878839590443686, + "acc_stderr": 0.014383915302225402, + "acc_norm": 0.6194539249146758, + "acc_norm_stderr": 0.014188277712349812 + }, + "harness|hellaswag|10": { + "acc": 0.6480780720971918, + "acc_stderr": 0.0047659375151971875, + "acc_norm": 0.8399721171081458, + "acc_norm_stderr": 0.0036588262081016167 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.35, + "acc_stderr": 0.04793724854411022, + "acc_norm": 0.35, + "acc_norm_stderr": 0.04793724854411022 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.5185185185185185, + "acc_stderr": 0.043163785995113245, + "acc_norm": 0.5185185185185185, + "acc_norm_stderr": 0.043163785995113245 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.5197368421052632, + "acc_stderr": 0.040657710025626036, + "acc_norm": 0.5197368421052632, 
+ "acc_norm_stderr": 0.040657710025626036 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.56, + "acc_stderr": 0.04988876515698589, + "acc_norm": 0.56, + "acc_norm_stderr": 0.04988876515698589 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.6, + "acc_stderr": 0.030151134457776285, + "acc_norm": 0.6, + "acc_norm_stderr": 0.030151134457776285 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.5902777777777778, + "acc_stderr": 0.04112490974670788, + "acc_norm": 0.5902777777777778, + "acc_norm_stderr": 0.04112490974670788 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.35, + "acc_stderr": 0.04793724854411019, + "acc_norm": 0.35, + "acc_norm_stderr": 0.04793724854411019 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.45, + "acc_stderr": 0.049999999999999996, + "acc_norm": 0.45, + "acc_norm_stderr": 0.049999999999999996 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.36, + "acc_stderr": 0.04824181513244218, + "acc_norm": 0.36, + "acc_norm_stderr": 0.04824181513244218 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.5260115606936416, + "acc_stderr": 0.03807301726504511, + "acc_norm": 0.5260115606936416, + "acc_norm_stderr": 0.03807301726504511 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.2549019607843137, + "acc_stderr": 0.04336432707993179, + "acc_norm": 0.2549019607843137, + "acc_norm_stderr": 0.04336432707993179 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.7, + "acc_stderr": 0.046056618647183814, + "acc_norm": 0.7, + "acc_norm_stderr": 0.046056618647183814 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.4851063829787234, + "acc_stderr": 0.032671518489247764, + "acc_norm": 0.4851063829787234, + "acc_norm_stderr": 0.032671518489247764 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.2719298245614035, + "acc_stderr": 0.04185774424022056, + "acc_norm": 0.2719298245614035, + "acc_norm_stderr": 0.04185774424022056 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.4827586206896552, + "acc_stderr": 0.04164188720169377, + "acc_norm": 0.4827586206896552, + "acc_norm_stderr": 0.04164188720169377 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.3439153439153439, + "acc_stderr": 0.024464426625596433, + "acc_norm": 0.3439153439153439, + "acc_norm_stderr": 0.024464426625596433 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.3333333333333333, + "acc_stderr": 0.04216370213557835, + "acc_norm": 0.3333333333333333, + "acc_norm_stderr": 0.04216370213557835 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.38, + "acc_stderr": 0.04878317312145633, + "acc_norm": 0.38, + "acc_norm_stderr": 0.04878317312145633 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.6387096774193548, + "acc_stderr": 0.027327548447957532, + "acc_norm": 0.6387096774193548, + "acc_norm_stderr": 0.027327548447957532 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.42857142857142855, + "acc_stderr": 0.034819048444388045, + "acc_norm": 0.42857142857142855, + "acc_norm_stderr": 0.034819048444388045 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.58, + "acc_stderr": 0.049604496374885836, + "acc_norm": 0.58, + "acc_norm_stderr": 0.049604496374885836 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.6545454545454545, + "acc_stderr": 0.037131580674819135, + "acc_norm": 0.6545454545454545, + "acc_norm_stderr": 
0.037131580674819135 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.7070707070707071, + "acc_stderr": 0.032424979581788166, + "acc_norm": 0.7070707070707071, + "acc_norm_stderr": 0.032424979581788166 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.7875647668393783, + "acc_stderr": 0.02951928261681724, + "acc_norm": 0.7875647668393783, + "acc_norm_stderr": 0.02951928261681724 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.5205128205128206, + "acc_stderr": 0.02532966316348994, + "acc_norm": 0.5205128205128206, + "acc_norm_stderr": 0.02532966316348994 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.32222222222222224, + "acc_stderr": 0.0284934650910286, + "acc_norm": 0.32222222222222224, + "acc_norm_stderr": 0.0284934650910286 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.5882352941176471, + "acc_stderr": 0.03196876989195778, + "acc_norm": 0.5882352941176471, + "acc_norm_stderr": 0.03196876989195778 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.33112582781456956, + "acc_stderr": 0.038425817186598696, + "acc_norm": 0.33112582781456956, + "acc_norm_stderr": 0.038425817186598696 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.726605504587156, + "acc_stderr": 0.019109299846098285, + "acc_norm": 0.726605504587156, + "acc_norm_stderr": 0.019109299846098285 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.4166666666666667, + "acc_stderr": 0.033622774366080445, + "acc_norm": 0.4166666666666667, + "acc_norm_stderr": 0.033622774366080445 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.7598039215686274, + "acc_stderr": 0.02998373305591361, + "acc_norm": 0.7598039215686274, + "acc_norm_stderr": 0.02998373305591361 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.759493670886076, + "acc_stderr": 0.027820781981149685, + "acc_norm": 0.759493670886076, + "acc_norm_stderr": 0.027820781981149685 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.6816143497757847, + "acc_stderr": 0.03126580522513713, + "acc_norm": 0.6816143497757847, + "acc_norm_stderr": 0.03126580522513713 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.648854961832061, + "acc_stderr": 0.04186445163013751, + "acc_norm": 0.648854961832061, + "acc_norm_stderr": 0.04186445163013751 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.743801652892562, + "acc_stderr": 0.03984979653302873, + "acc_norm": 0.743801652892562, + "acc_norm_stderr": 0.03984979653302873 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.7314814814814815, + "acc_stderr": 0.042844679680521934, + "acc_norm": 0.7314814814814815, + "acc_norm_stderr": 0.042844679680521934 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.6871165644171779, + "acc_stderr": 0.036429145782924055, + "acc_norm": 0.6871165644171779, + "acc_norm_stderr": 0.036429145782924055 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.36607142857142855, + "acc_stderr": 0.045723723587374296, + "acc_norm": 0.36607142857142855, + "acc_norm_stderr": 0.045723723587374296 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.6990291262135923, + "acc_stderr": 0.04541609446503948, + "acc_norm": 0.6990291262135923, + "acc_norm_stderr": 0.04541609446503948 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.7991452991452992, + "acc_stderr": 0.026246772946890484, + "acc_norm": 0.7991452991452992, + 
"acc_norm_stderr": 0.026246772946890484 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.59, + "acc_stderr": 0.049431107042371025, + "acc_norm": 0.59, + "acc_norm_stderr": 0.049431107042371025 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.7662835249042146, + "acc_stderr": 0.01513338327898883, + "acc_norm": 0.7662835249042146, + "acc_norm_stderr": 0.01513338327898883 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.6329479768786127, + "acc_stderr": 0.02595005433765408, + "acc_norm": 0.6329479768786127, + "acc_norm_stderr": 0.02595005433765408 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.4491620111731844, + "acc_stderr": 0.016635838341631914, + "acc_norm": 0.4491620111731844, + "acc_norm_stderr": 0.016635838341631914 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.6176470588235294, + "acc_stderr": 0.027826109307283693, + "acc_norm": 0.6176470588235294, + "acc_norm_stderr": 0.027826109307283693 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.639871382636656, + "acc_stderr": 0.027264297599804012, + "acc_norm": 0.639871382636656, + "acc_norm_stderr": 0.027264297599804012 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.6203703703703703, + "acc_stderr": 0.027002521034516475, + "acc_norm": 0.6203703703703703, + "acc_norm_stderr": 0.027002521034516475 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.425531914893617, + "acc_stderr": 0.02949482760014437, + "acc_norm": 0.425531914893617, + "acc_norm_stderr": 0.02949482760014437 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.42698826597131684, + "acc_stderr": 0.012633353557534423, + "acc_norm": 0.42698826597131684, + "acc_norm_stderr": 0.012633353557534423 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.5367647058823529, + "acc_stderr": 0.03029061918048569, + "acc_norm": 0.5367647058823529, + "acc_norm_stderr": 0.03029061918048569 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.5833333333333334, + "acc_stderr": 0.019944914136873583, + "acc_norm": 0.5833333333333334, + "acc_norm_stderr": 0.019944914136873583 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.6363636363636364, + "acc_stderr": 0.046075820907199756, + "acc_norm": 0.6363636363636364, + "acc_norm_stderr": 0.046075820907199756 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.6204081632653061, + "acc_stderr": 0.03106721126287247, + "acc_norm": 0.6204081632653061, + "acc_norm_stderr": 0.03106721126287247 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.7064676616915423, + "acc_stderr": 0.032200241045342054, + "acc_norm": 0.7064676616915423, + "acc_norm_stderr": 0.032200241045342054 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.85, + "acc_stderr": 0.03588702812826369, + "acc_norm": 0.85, + "acc_norm_stderr": 0.03588702812826369 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.46987951807228917, + "acc_stderr": 0.03885425420866766, + "acc_norm": 0.46987951807228917, + "acc_norm_stderr": 0.03885425420866766 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.7719298245614035, + "acc_stderr": 0.032180937956023566, + "acc_norm": 0.7719298245614035, + "acc_norm_stderr": 0.032180937956023566 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.3659730722154223, + "mc1_stderr": 0.016862941684088376, + "mc2": 0.5081127343633631, + "mc2_stderr": 0.015610906083140244 + }, + "all": { + "acc": 0.5632990571605488, + "acc_stderr": 0.0344139510970497, + "acc_norm": 
0.5670865827687058, + "acc_norm_stderr": 0.03439187060727053, + "mc1": 0.3659730722154223, + "mc1_stderr": 0.016862941684088376, + "mc2": 0.5081127343633631, + "mc2_stderr": 0.015610906083140244 + } + }, + "versions": { + "harness|arc:challenge|25": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "all": 0 + }, + "config_tasks": { + "harness|arc:challenge": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + 
"harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task" + }, + "summary_tasks": { + "harness|arc:challenge|25": { + "hashes": { + 
"hash_examples": "17b0cae357c0259e", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "3722289b79076c44", + "hash_cont_tokens": "e8abf848493b50f7" + }, + "truncated": 0, + "non-truncated": 4687, + "padded": 4687, + "non-padded": 0, + "effective_few_shots": 25.0, + "num_truncated_few_shots": 0 + }, + "harness|hellaswag|10": { + "hashes": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "ececd684171f1ef2", + "hash_cont_tokens": "9fe0a5c42e1532db" + }, + "truncated": 0, + "non-truncated": 40168, + "padded": 40113, + "non-padded": 55, + "effective_few_shots": 10.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hashes": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "c54ff61ad0273dd7", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-anatomy|5": { + "hashes": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "be31a1e22aef5f90", + "hash_cont_tokens": "f11971a765cb609f" + }, + "truncated": 0, + "non-truncated": 540, + "padded": 540, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-astronomy|5": { + "hashes": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "277a7b1fad566940", + "hash_cont_tokens": "440a970fadecdc7b" + }, + "truncated": 0, + "non-truncated": 608, + "padded": 608, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-business_ethics|5": { + "hashes": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "ba552605bc116de5", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hashes": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "428c7563d0b98ab9", + "hash_cont_tokens": "7ecd60c25b9bfe5b" + }, + "truncated": 0, + "non-truncated": 1060, + "padded": 1060, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_biology|5": { + "hashes": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "da036601573942e2", + "hash_cont_tokens": "875cde3af7a0ee14" + }, + "truncated": 0, + "non-truncated": 576, + "padded": 576, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_chemistry|5": { + "hashes": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "94e0196d6aded13d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_computer_science|5": { + "hashes": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "6e4d0f4a8d36690b", + "hash_cont_tokens": "50421e30bef398f9" + }, + 
"truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_mathematics|5": { + "hashes": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "614054d17109a25d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_medicine|5": { + "hashes": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "081bb2b524defd1c", + "hash_cont_tokens": "702fb6d82ff0d6ac" + }, + "truncated": 0, + "non-truncated": 692, + "padded": 692, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_physics|5": { + "hashes": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "5421d9a1af86cbd4", + "hash_cont_tokens": "f7b8097afc16a47c" + }, + "truncated": 0, + "non-truncated": 408, + "padded": 408, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-computer_security|5": { + "hashes": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "5e6b70ecb333cf18", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hashes": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "c2ef11a87264ceed", + "hash_cont_tokens": "aa0e8bc655f2f641" + }, + "truncated": 0, + "non-truncated": 940, + "padded": 940, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-econometrics|5": { + "hashes": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "ecaccd912a4c3978", + "hash_cont_tokens": "b1cc6e7e9fcd3827" + }, + "truncated": 0, + "non-truncated": 456, + "padded": 456, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hashes": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "1590c84291399be8", + "hash_cont_tokens": "2425a3f084a591ef" + }, + "truncated": 0, + "non-truncated": 580, + "padded": 580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hashes": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "3269597f715b0da1", + "hash_cont_tokens": "bd87bf0c060fd925" + }, + "truncated": 0, + "non-truncated": 1512, + "padded": 1512, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-formal_logic|5": { + "hashes": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "a2800d20f3ab8d7c", + "hash_cont_tokens": "eb8932890e0605db" + }, + "truncated": 0, + "non-truncated": 504, + "padded": 504, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + 
"harness|hendrycksTest-global_facts|5": { + "hashes": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "94ed44b3772505ad", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_biology|5": { + "hashes": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "24423acb928db768", + "hash_cont_tokens": "1ddcb86d28cde266" + }, + "truncated": 0, + "non-truncated": 1240, + "padded": 1240, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hashes": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "831ff35c474e5cef", + "hash_cont_tokens": "176c8dcff38c5f8f" + }, + "truncated": 0, + "non-truncated": 812, + "padded": 812, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hashes": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "a20a96b44dcc5b30", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hashes": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "5002f4ac8b1562ca", + "hash_cont_tokens": "674fc454bdc5ac93" + }, + "truncated": 0, + "non-truncated": 660, + "padded": 656, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_geography|5": { + "hashes": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "7c5547c7da5bc793", + "hash_cont_tokens": "03a5012b916274ea" + }, + "truncated": 0, + "non-truncated": 792, + "padded": 792, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hashes": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "f62991cb6a496b05", + "hash_cont_tokens": "873d2aab226ba1d8" + }, + "truncated": 0, + "non-truncated": 772, + "padded": 772, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hashes": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "4cef2aff6e3d59ed", + "hash_cont_tokens": "c583432ad27fcfe0" + }, + "truncated": 0, + "non-truncated": 1560, + "padded": 1560, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hashes": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "6e2577ea4082ed2b", + "hash_cont_tokens": "d7907b61bcb8c123" + }, + "truncated": 0, + "non-truncated": 1080, + "padded": 1080, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hashes": { + 
"hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "c5fc9aeb1079c8e4", + "hash_cont_tokens": "f47f041de50333b9" + }, + "truncated": 0, + "non-truncated": 952, + "padded": 952, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_physics|5": { + "hashes": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "555fc385cffa84ca", + "hash_cont_tokens": "0d56317b3e5eedb5" + }, + "truncated": 0, + "non-truncated": 604, + "padded": 604, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hashes": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "febd23cbf9973b7f", + "hash_cont_tokens": "09ba1243e7390c0f" + }, + "truncated": 0, + "non-truncated": 2180, + "padded": 2180, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hashes": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "400e55b56ee6fbd7", + "hash_cont_tokens": "9cc29889c3d3f77d" + }, + "truncated": 0, + "non-truncated": 864, + "padded": 864, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hashes": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "c639cce12a46ebad", + "hash_cont_tokens": "cdd0b3dc06d933e5" + }, + "truncated": 0, + "non-truncated": 816, + "padded": 816, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hashes": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "b9762065cce6f3a6", + "hash_cont_tokens": "e02816433ff28daf" + }, + "truncated": 0, + "non-truncated": 948, + "padded": 948, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_aging|5": { + "hashes": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "541a75f071dcf579", + "hash_cont_tokens": "142a4a8a1138a214" + }, + "truncated": 0, + "non-truncated": 892, + "padded": 892, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_sexuality|5": { + "hashes": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "04269e5c5a257dd9", + "hash_cont_tokens": "bc54813e809b796d" + }, + "truncated": 0, + "non-truncated": 524, + "padded": 524, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-international_law|5": { + "hashes": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "d93ba9d9d38e4397", + "hash_cont_tokens": "8ea8c5ff76a15bca" + }, + "truncated": 0, + "non-truncated": 484, + "padded": 484, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-jurisprudence|5": { + "hashes": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "9eeaccd2698b4f5a", + 
"hash_cont_tokens": "e3a8cd951b6e3469" + }, + "truncated": 0, + "non-truncated": 432, + "padded": 432, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hashes": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "b4f08f544f2b7576", + "hash_cont_tokens": "3e9e0bdc248fd88a" + }, + "truncated": 0, + "non-truncated": 652, + "padded": 648, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-machine_learning|5": { + "hashes": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "900c2a51f1174b9f", + "hash_cont_tokens": "55b12fb138c6a064" + }, + "truncated": 0, + "non-truncated": 448, + "padded": 448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-management|5": { + "hashes": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "6b36efb4689c6eca", + "hash_cont_tokens": "a01d6d39a83c4597" + }, + "truncated": 0, + "non-truncated": 412, + "padded": 412, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-marketing|5": { + "hashes": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "2aaac78a0cfed47a", + "hash_cont_tokens": "6aeaed4d823c98aa" + }, + "truncated": 0, + "non-truncated": 936, + "padded": 936, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-medical_genetics|5": { + "hashes": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "886ca823b41c094a", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-miscellaneous|5": { + "hashes": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "72fd71de7675e7d0", + "hash_cont_tokens": "9b0ab02a64603081" + }, + "truncated": 0, + "non-truncated": 3132, + "padded": 3132, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_disputes|5": { + "hashes": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "f3ca0dd8e7a1eb09", + "hash_cont_tokens": "3b8bbe9108e55ce9" + }, + "truncated": 0, + "non-truncated": 1384, + "padded": 1354, + "non-padded": 30, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hashes": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "3e793631e951f23c", + "hash_cont_tokens": "3e9bfc0362e97330" + }, + "truncated": 0, + "non-truncated": 3580, + "padded": 3580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-nutrition|5": { + "hashes": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "59753c2144ea93af", + "hash_cont_tokens": "23b2dc6ee2da4cfc" + }, + "truncated": 0, + "non-truncated": 1224, + "padded": 1224, + "non-padded": 0, + "effective_few_shots": 5.0, + 
"num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-philosophy|5": { + "hashes": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "bd8d3dbed15a8c34", + "hash_cont_tokens": "9f6ff69d23a48783" + }, + "truncated": 0, + "non-truncated": 1244, + "padded": 1244, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-prehistory|5": { + "hashes": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "3573cd87facbb7c5", + "hash_cont_tokens": "d6458d743d875837" + }, + "truncated": 0, + "non-truncated": 1296, + "padded": 1296, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_accounting|5": { + "hashes": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "17e721bc1a7cbb47", + "hash_cont_tokens": "922a195f53a35662" + }, + "truncated": 0, + "non-truncated": 1128, + "padded": 1128, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_law|5": { + "hashes": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "c9f7583fff66d361", + "hash_cont_tokens": "2e590029ef41fbcd" + }, + "truncated": 0, + "non-truncated": 6136, + "padded": 6136, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_medicine|5": { + "hashes": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "40a933f829116f8d", + "hash_cont_tokens": "7cfee54dbddd5a98" + }, + "truncated": 0, + "non-truncated": 1088, + "padded": 1088, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_psychology|5": { + "hashes": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "0dfb73a8eb3f692c", + "hash_cont_tokens": "a86677b2a45c20e1" + }, + "truncated": 0, + "non-truncated": 2448, + "padded": 2448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-public_relations|5": { + "hashes": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "1710c6ba4c9f3cbd", + "hash_cont_tokens": "0d756ccaae031757" + }, + "truncated": 0, + "non-truncated": 440, + "padded": 440, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-security_studies|5": { + "hashes": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "32a03f1f22a6e103", + "hash_cont_tokens": "b2229bc2cfbf594b" + }, + "truncated": 0, + "non-truncated": 980, + "padded": 980, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-sociology|5": { + "hashes": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "828999f7624cbe7e", + "hash_cont_tokens": "c3a3bdfd177eed5b" + }, + "truncated": 0, + "non-truncated": 804, + "padded": 804, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hashes": { + "hash_examples": "4a56a01ddca44dca", + 
"hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "42054621e718dbee", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-virology|5": { + "hashes": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "6c4f0aa4dc859c04", + "hash_cont_tokens": "af8b3658088cb37f" + }, + "truncated": 0, + "non-truncated": 664, + "padded": 664, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-world_religions|5": { + "hashes": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "6c75d44e092ff24f", + "hash_cont_tokens": "060118bef6de4e0a" + }, + "truncated": 0, + "non-truncated": 684, + "padded": 684, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|truthfulqa:mc|0": { + "hashes": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "2738d7ed7075faa7", + "hash_cont_tokens": "f5da56a132aab151" + }, + "truncated": 0, + "non-truncated": 9996, + "padded": 9996, + "non-padded": 0, + "effective_few_shots": 0.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "d84d18e9a963753d", + "hash_full_prompts": "12b540783521a8e6", + "hash_input_tokens": "5c73a7dce6ccf737", + "hash_cont_tokens": "71d56183130fecbd" + }, + "total_evaluation_time_secondes": "6403.517452001572", + "truncated": 0, + "non-truncated": 111019, + "padded": 110926, + "non-padded": 93, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/Undi95/ReMM-v2-L2-13B/results_2023-10-24T07-00-18.944945.json b/eval-results/Undi95/ReMM-v2-L2-13B/results_2023-10-24T07-00-18.944945.json new file mode 100644 index 0000000000000000000000000000000000000000..c0384b05229afe32d79dbfdbf17ef16d9abe6fe9 --- /dev/null +++ b/eval-results/Undi95/ReMM-v2-L2-13B/results_2023-10-24T07-00-18.944945.json @@ -0,0 +1,107 @@ +{ + "config_general": { + "model_name": "Undi95/ReMM-v2-L2-13B", + "model_sha": "bc42c77f88482c37c72c85c66135e99972bbca1b", + "model_size": "24.32 GB", + "model_dtype": "torch.float16", + "lighteval_sha": "0f318ecf002208468154899217b3ba7c6ae09374", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|drop|3": { + "em": 0.056312919463087245, + "em_stderr": 0.0023607917437880183, + "f1": 0.12075503355704631, + "f1_stderr": 0.002645290783284543 + }, + "harness|gsm8k|5": { + "acc": 0.13191811978771797, + "acc_stderr": 0.009321265253857515 + }, + "harness|winogrande|5": { + "acc": 0.7584846093133386, + "acc_stderr": 0.012028983782011872 + }, + "all": { + "em": 0.056312919463087245, + "em_stderr": 0.0023607917437880183, + "f1": 0.12075503355704631, + "f1_stderr": 0.002645290783284543, + "acc": 0.4452013645505283, + "acc_stderr": 0.010675124517934693 + } + }, + "versions": { + "harness|drop|3": 1, + "harness|gsm8k|5": 0, + "harness|winogrande|5": 0, + "all": 0 + }, + "config_tasks": { + "harness|drop": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|drop|3": { + "hashes": { + "hash_examples": "1d27416e8324e9a3", + "hash_full_prompts": "a5513ff9a741b385", + 
"hash_input_tokens": "42076f0efbb50aa6", + "hash_cont_tokens": "56f7e09bc7743f5b" + }, + "truncated": 3, + "non-truncated": 9533, + "padded": 0, + "non-padded": 9536, + "effective_few_shots": 3.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "bda342e47b5099b2", + "hash_cont_tokens": "f8ccf49f74cbd99f" + }, + "truncated": 0, + "non-truncated": 1319, + "padded": 0, + "non-padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "c0bedf98cb040854", + "hash_cont_tokens": "f08975ad6f2d5864" + }, + "truncated": 0, + "non-truncated": 2534, + "padded": 2432, + "non-padded": 102, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "9b4d8993161e637d", + "hash_full_prompts": "08215e527b7e60a5", + "hash_input_tokens": "a12f3e3c934bd78b", + "hash_cont_tokens": "ca6e4c3ff8563083" + }, + "total_evaluation_time_secondes": "41015.328003406525", + "truncated": 3, + "non-truncated": 13386, + "padded": 2432, + "non-padded": 10957, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/Undi95/ReMM-v2.1-L2-13B/results_2023-09-18T13-43-56.304128.json b/eval-results/Undi95/ReMM-v2.1-L2-13B/results_2023-09-18T13-43-56.304128.json new file mode 100644 index 0000000000000000000000000000000000000000..1bef79679d274037b2da5f5a0c528eb6e0f5832c --- /dev/null +++ b/eval-results/Undi95/ReMM-v2.1-L2-13B/results_2023-09-18T13-43-56.304128.json @@ -0,0 +1,1367 @@ +{ + "config_general": { + "model_name": "Undi95/ReMM-v2.1-L2-13B", + "model_sha": "e6b5ac97f74355cb281a621261debe5720fb4da2", + "model_size": "24.32 GB", + "model_dtype": "torch.float16", + "lighteval_sha": "0f318ecf002208468154899217b3ba7c6ae09374", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|arc:challenge|25": { + "acc": 0.5844709897610921, + "acc_stderr": 0.01440136664121638, + "acc_norm": 0.6143344709897611, + "acc_norm_stderr": 0.01422425097325718 + }, + "harness|hellaswag|10": { + "acc": 0.6468830910177256, + "acc_stderr": 0.004769618829196511, + "acc_norm": 0.8391754630551683, + "acc_norm_stderr": 0.0036661823284423437 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.34, + "acc_stderr": 0.047609522856952365, + "acc_norm": 0.34, + "acc_norm_stderr": 0.047609522856952365 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.5185185185185185, + "acc_stderr": 0.043163785995113245, + "acc_norm": 0.5185185185185185, + "acc_norm_stderr": 0.043163785995113245 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.5263157894736842, + "acc_stderr": 0.04063302731486671, + "acc_norm": 0.5263157894736842, + "acc_norm_stderr": 0.04063302731486671 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.57, + "acc_stderr": 0.049756985195624284, + "acc_norm": 0.57, + "acc_norm_stderr": 0.049756985195624284 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.5886792452830188, + "acc_stderr": 0.030285009259009794, + "acc_norm": 0.5886792452830188, + "acc_norm_stderr": 0.030285009259009794 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.5972222222222222, + "acc_stderr": 0.04101405519842426, + "acc_norm": 
0.5972222222222222, + "acc_norm_stderr": 0.04101405519842426 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.34, + "acc_stderr": 0.04760952285695235, + "acc_norm": 0.34, + "acc_norm_stderr": 0.04760952285695235 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.44, + "acc_stderr": 0.04988876515698589, + "acc_norm": 0.44, + "acc_norm_stderr": 0.04988876515698589 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.35, + "acc_stderr": 0.047937248544110196, + "acc_norm": 0.35, + "acc_norm_stderr": 0.047937248544110196 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.5317919075144508, + "acc_stderr": 0.03804749744364764, + "acc_norm": 0.5317919075144508, + "acc_norm_stderr": 0.03804749744364764 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.24509803921568626, + "acc_stderr": 0.042801058373643966, + "acc_norm": 0.24509803921568626, + "acc_norm_stderr": 0.042801058373643966 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.7, + "acc_stderr": 0.046056618647183814, + "acc_norm": 0.7, + "acc_norm_stderr": 0.046056618647183814 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.4808510638297872, + "acc_stderr": 0.03266204299064678, + "acc_norm": 0.4808510638297872, + "acc_norm_stderr": 0.03266204299064678 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.2894736842105263, + "acc_stderr": 0.04266339443159394, + "acc_norm": 0.2894736842105263, + "acc_norm_stderr": 0.04266339443159394 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.503448275862069, + "acc_stderr": 0.041665675771015785, + "acc_norm": 0.503448275862069, + "acc_norm_stderr": 0.041665675771015785 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.3386243386243386, + "acc_stderr": 0.024373197867983063, + "acc_norm": 0.3386243386243386, + "acc_norm_stderr": 0.024373197867983063 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.3253968253968254, + "acc_stderr": 0.041905964388711366, + "acc_norm": 0.3253968253968254, + "acc_norm_stderr": 0.041905964388711366 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.39, + "acc_stderr": 0.04902071300001975, + "acc_norm": 0.39, + "acc_norm_stderr": 0.04902071300001975 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.635483870967742, + "acc_stderr": 0.027379871229943255, + "acc_norm": 0.635483870967742, + "acc_norm_stderr": 0.027379871229943255 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.4433497536945813, + "acc_stderr": 0.03495334582162934, + "acc_norm": 0.4433497536945813, + "acc_norm_stderr": 0.03495334582162934 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.56, + "acc_stderr": 0.04988876515698589, + "acc_norm": 0.56, + "acc_norm_stderr": 0.04988876515698589 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.6545454545454545, + "acc_stderr": 0.037131580674819135, + "acc_norm": 0.6545454545454545, + "acc_norm_stderr": 0.037131580674819135 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.7121212121212122, + "acc_stderr": 0.03225883512300992, + "acc_norm": 0.7121212121212122, + "acc_norm_stderr": 0.03225883512300992 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.7927461139896373, + "acc_stderr": 0.029252823291803638, + "acc_norm": 0.7927461139896373, + "acc_norm_stderr": 0.029252823291803638 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 
0.5205128205128206, + "acc_stderr": 0.02532966316348994, + "acc_norm": 0.5205128205128206, + "acc_norm_stderr": 0.02532966316348994 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.31851851851851853, + "acc_stderr": 0.02840653309060846, + "acc_norm": 0.31851851851851853, + "acc_norm_stderr": 0.02840653309060846 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.5840336134453782, + "acc_stderr": 0.032016501007396114, + "acc_norm": 0.5840336134453782, + "acc_norm_stderr": 0.032016501007396114 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.33112582781456956, + "acc_stderr": 0.038425817186598696, + "acc_norm": 0.33112582781456956, + "acc_norm_stderr": 0.038425817186598696 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.7174311926605504, + "acc_stderr": 0.01930424349770715, + "acc_norm": 0.7174311926605504, + "acc_norm_stderr": 0.01930424349770715 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.41203703703703703, + "acc_stderr": 0.03356787758160835, + "acc_norm": 0.41203703703703703, + "acc_norm_stderr": 0.03356787758160835 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.7598039215686274, + "acc_stderr": 0.02998373305591361, + "acc_norm": 0.7598039215686274, + "acc_norm_stderr": 0.02998373305591361 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.759493670886076, + "acc_stderr": 0.02782078198114968, + "acc_norm": 0.759493670886076, + "acc_norm_stderr": 0.02782078198114968 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.6771300448430493, + "acc_stderr": 0.03138147637575499, + "acc_norm": 0.6771300448430493, + "acc_norm_stderr": 0.03138147637575499 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.6183206106870229, + "acc_stderr": 0.04260735157644559, + "acc_norm": 0.6183206106870229, + "acc_norm_stderr": 0.04260735157644559 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.743801652892562, + "acc_stderr": 0.039849796533028725, + "acc_norm": 0.743801652892562, + "acc_norm_stderr": 0.039849796533028725 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.75, + "acc_stderr": 0.04186091791394607, + "acc_norm": 0.75, + "acc_norm_stderr": 0.04186091791394607 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.6809815950920245, + "acc_stderr": 0.03661997551073836, + "acc_norm": 0.6809815950920245, + "acc_norm_stderr": 0.03661997551073836 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.35714285714285715, + "acc_stderr": 0.04547960999764376, + "acc_norm": 0.35714285714285715, + "acc_norm_stderr": 0.04547960999764376 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.6990291262135923, + "acc_stderr": 0.04541609446503948, + "acc_norm": 0.6990291262135923, + "acc_norm_stderr": 0.04541609446503948 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.7948717948717948, + "acc_stderr": 0.026453508054040332, + "acc_norm": 0.7948717948717948, + "acc_norm_stderr": 0.026453508054040332 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.59, + "acc_stderr": 0.049431107042371025, + "acc_norm": 0.59, + "acc_norm_stderr": 0.049431107042371025 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.7662835249042146, + "acc_stderr": 0.01513338327898883, + "acc_norm": 0.7662835249042146, + "acc_norm_stderr": 0.01513338327898883 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.6329479768786127, + "acc_stderr": 0.02595005433765407, + "acc_norm": 
0.6329479768786127, + "acc_norm_stderr": 0.02595005433765407 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.45027932960893857, + "acc_stderr": 0.016639615236845803, + "acc_norm": 0.45027932960893857, + "acc_norm_stderr": 0.016639615236845803 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.6078431372549019, + "acc_stderr": 0.027956046165424523, + "acc_norm": 0.6078431372549019, + "acc_norm_stderr": 0.027956046165424523 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.6430868167202572, + "acc_stderr": 0.027210420375934023, + "acc_norm": 0.6430868167202572, + "acc_norm_stderr": 0.027210420375934023 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.6203703703703703, + "acc_stderr": 0.027002521034516478, + "acc_norm": 0.6203703703703703, + "acc_norm_stderr": 0.027002521034516478 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.4219858156028369, + "acc_stderr": 0.029462189233370593, + "acc_norm": 0.4219858156028369, + "acc_norm_stderr": 0.029462189233370593 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.4276401564537158, + "acc_stderr": 0.012635799922765848, + "acc_norm": 0.4276401564537158, + "acc_norm_stderr": 0.012635799922765848 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.5147058823529411, + "acc_stderr": 0.03035969707904612, + "acc_norm": 0.5147058823529411, + "acc_norm_stderr": 0.03035969707904612 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.5833333333333334, + "acc_stderr": 0.019944914136873586, + "acc_norm": 0.5833333333333334, + "acc_norm_stderr": 0.019944914136873586 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.6363636363636364, + "acc_stderr": 0.046075820907199756, + "acc_norm": 0.6363636363636364, + "acc_norm_stderr": 0.046075820907199756 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.6204081632653061, + "acc_stderr": 0.03106721126287247, + "acc_norm": 0.6204081632653061, + "acc_norm_stderr": 0.03106721126287247 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.681592039800995, + "acc_stderr": 0.03294118479054095, + "acc_norm": 0.681592039800995, + "acc_norm_stderr": 0.03294118479054095 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.84, + "acc_stderr": 0.03684529491774708, + "acc_norm": 0.84, + "acc_norm_stderr": 0.03684529491774708 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.4759036144578313, + "acc_stderr": 0.03887971849597264, + "acc_norm": 0.4759036144578313, + "acc_norm_stderr": 0.03887971849597264 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.783625730994152, + "acc_stderr": 0.03158149539338733, + "acc_norm": 0.783625730994152, + "acc_norm_stderr": 0.03158149539338733 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.3659730722154223, + "mc1_stderr": 0.016862941684088376, + "mc2": 0.502982198851368, + "mc2_stderr": 0.015602709737779776 + }, + "all": { + "acc": 0.5614512577762221, + "acc_stderr": 0.03441984146887643, + "acc_norm": 0.5652166112214104, + "acc_norm_stderr": 0.03439813719483044, + "mc1": 0.3659730722154223, + "mc1_stderr": 0.016862941684088376, + "mc2": 0.502982198851368, + "mc2_stderr": 0.015602709737779776 + } + }, + "versions": { + "harness|arc:challenge|25": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + 
"harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "all": 0 + }, + "config_tasks": { + "harness|arc:challenge": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + 
"harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task" + }, + "summary_tasks": { + "harness|arc:challenge|25": { + "hashes": { + "hash_examples": "17b0cae357c0259e", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "3722289b79076c44", + "hash_cont_tokens": "e8abf848493b50f7" + }, + "truncated": 0, + "non-truncated": 4687, + "padded": 4687, + "non-padded": 0, + "effective_few_shots": 25.0, + "num_truncated_few_shots": 0 + }, + "harness|hellaswag|10": { + "hashes": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "ececd684171f1ef2", + "hash_cont_tokens": "9fe0a5c42e1532db" + }, 
+ "truncated": 0, + "non-truncated": 40168, + "padded": 40113, + "non-padded": 55, + "effective_few_shots": 10.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hashes": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "c54ff61ad0273dd7", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-anatomy|5": { + "hashes": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "be31a1e22aef5f90", + "hash_cont_tokens": "f11971a765cb609f" + }, + "truncated": 0, + "non-truncated": 540, + "padded": 540, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-astronomy|5": { + "hashes": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "277a7b1fad566940", + "hash_cont_tokens": "440a970fadecdc7b" + }, + "truncated": 0, + "non-truncated": 608, + "padded": 608, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-business_ethics|5": { + "hashes": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "ba552605bc116de5", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hashes": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "428c7563d0b98ab9", + "hash_cont_tokens": "7ecd60c25b9bfe5b" + }, + "truncated": 0, + "non-truncated": 1060, + "padded": 1060, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_biology|5": { + "hashes": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "da036601573942e2", + "hash_cont_tokens": "875cde3af7a0ee14" + }, + "truncated": 0, + "non-truncated": 576, + "padded": 576, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_chemistry|5": { + "hashes": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "94e0196d6aded13d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_computer_science|5": { + "hashes": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "6e4d0f4a8d36690b", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_mathematics|5": { + "hashes": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "614054d17109a25d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + 
"harness|hendrycksTest-college_medicine|5": { + "hashes": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "081bb2b524defd1c", + "hash_cont_tokens": "702fb6d82ff0d6ac" + }, + "truncated": 0, + "non-truncated": 692, + "padded": 692, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_physics|5": { + "hashes": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "5421d9a1af86cbd4", + "hash_cont_tokens": "f7b8097afc16a47c" + }, + "truncated": 0, + "non-truncated": 408, + "padded": 408, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-computer_security|5": { + "hashes": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "5e6b70ecb333cf18", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hashes": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "c2ef11a87264ceed", + "hash_cont_tokens": "aa0e8bc655f2f641" + }, + "truncated": 0, + "non-truncated": 940, + "padded": 940, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-econometrics|5": { + "hashes": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "ecaccd912a4c3978", + "hash_cont_tokens": "b1cc6e7e9fcd3827" + }, + "truncated": 0, + "non-truncated": 456, + "padded": 456, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hashes": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "1590c84291399be8", + "hash_cont_tokens": "2425a3f084a591ef" + }, + "truncated": 0, + "non-truncated": 580, + "padded": 580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hashes": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "3269597f715b0da1", + "hash_cont_tokens": "bd87bf0c060fd925" + }, + "truncated": 0, + "non-truncated": 1512, + "padded": 1512, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-formal_logic|5": { + "hashes": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "a2800d20f3ab8d7c", + "hash_cont_tokens": "eb8932890e0605db" + }, + "truncated": 0, + "non-truncated": 504, + "padded": 504, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-global_facts|5": { + "hashes": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "94ed44b3772505ad", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_biology|5": { + "hashes": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + 
"hash_input_tokens": "24423acb928db768", + "hash_cont_tokens": "1ddcb86d28cde266" + }, + "truncated": 0, + "non-truncated": 1240, + "padded": 1240, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hashes": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "831ff35c474e5cef", + "hash_cont_tokens": "176c8dcff38c5f8f" + }, + "truncated": 0, + "non-truncated": 812, + "padded": 812, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hashes": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "a20a96b44dcc5b30", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hashes": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "5002f4ac8b1562ca", + "hash_cont_tokens": "674fc454bdc5ac93" + }, + "truncated": 0, + "non-truncated": 660, + "padded": 656, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_geography|5": { + "hashes": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "7c5547c7da5bc793", + "hash_cont_tokens": "03a5012b916274ea" + }, + "truncated": 0, + "non-truncated": 792, + "padded": 792, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hashes": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "f62991cb6a496b05", + "hash_cont_tokens": "873d2aab226ba1d8" + }, + "truncated": 0, + "non-truncated": 772, + "padded": 772, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hashes": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "4cef2aff6e3d59ed", + "hash_cont_tokens": "c583432ad27fcfe0" + }, + "truncated": 0, + "non-truncated": 1560, + "padded": 1560, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hashes": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "6e2577ea4082ed2b", + "hash_cont_tokens": "d7907b61bcb8c123" + }, + "truncated": 0, + "non-truncated": 1080, + "padded": 1080, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hashes": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "c5fc9aeb1079c8e4", + "hash_cont_tokens": "f47f041de50333b9" + }, + "truncated": 0, + "non-truncated": 952, + "padded": 952, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_physics|5": { + "hashes": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "555fc385cffa84ca", + "hash_cont_tokens": 
"0d56317b3e5eedb5" + }, + "truncated": 0, + "non-truncated": 604, + "padded": 604, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hashes": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "febd23cbf9973b7f", + "hash_cont_tokens": "09ba1243e7390c0f" + }, + "truncated": 0, + "non-truncated": 2180, + "padded": 2180, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hashes": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "400e55b56ee6fbd7", + "hash_cont_tokens": "9cc29889c3d3f77d" + }, + "truncated": 0, + "non-truncated": 864, + "padded": 864, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hashes": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "c639cce12a46ebad", + "hash_cont_tokens": "cdd0b3dc06d933e5" + }, + "truncated": 0, + "non-truncated": 816, + "padded": 816, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hashes": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "b9762065cce6f3a6", + "hash_cont_tokens": "e02816433ff28daf" + }, + "truncated": 0, + "non-truncated": 948, + "padded": 948, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_aging|5": { + "hashes": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "541a75f071dcf579", + "hash_cont_tokens": "142a4a8a1138a214" + }, + "truncated": 0, + "non-truncated": 892, + "padded": 892, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_sexuality|5": { + "hashes": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "04269e5c5a257dd9", + "hash_cont_tokens": "bc54813e809b796d" + }, + "truncated": 0, + "non-truncated": 524, + "padded": 524, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-international_law|5": { + "hashes": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "d93ba9d9d38e4397", + "hash_cont_tokens": "8ea8c5ff76a15bca" + }, + "truncated": 0, + "non-truncated": 484, + "padded": 484, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-jurisprudence|5": { + "hashes": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "9eeaccd2698b4f5a", + "hash_cont_tokens": "e3a8cd951b6e3469" + }, + "truncated": 0, + "non-truncated": 432, + "padded": 432, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hashes": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "b4f08f544f2b7576", + "hash_cont_tokens": "3e9e0bdc248fd88a" + }, + "truncated": 0, + "non-truncated": 652, + "padded": 648, + "non-padded": 4, + "effective_few_shots": 5.0, + 
"num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-machine_learning|5": { + "hashes": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "900c2a51f1174b9f", + "hash_cont_tokens": "55b12fb138c6a064" + }, + "truncated": 0, + "non-truncated": 448, + "padded": 448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-management|5": { + "hashes": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "6b36efb4689c6eca", + "hash_cont_tokens": "a01d6d39a83c4597" + }, + "truncated": 0, + "non-truncated": 412, + "padded": 412, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-marketing|5": { + "hashes": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "2aaac78a0cfed47a", + "hash_cont_tokens": "6aeaed4d823c98aa" + }, + "truncated": 0, + "non-truncated": 936, + "padded": 936, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-medical_genetics|5": { + "hashes": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "886ca823b41c094a", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-miscellaneous|5": { + "hashes": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "72fd71de7675e7d0", + "hash_cont_tokens": "9b0ab02a64603081" + }, + "truncated": 0, + "non-truncated": 3132, + "padded": 3132, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_disputes|5": { + "hashes": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "f3ca0dd8e7a1eb09", + "hash_cont_tokens": "3b8bbe9108e55ce9" + }, + "truncated": 0, + "non-truncated": 1384, + "padded": 1354, + "non-padded": 30, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hashes": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "3e793631e951f23c", + "hash_cont_tokens": "3e9bfc0362e97330" + }, + "truncated": 0, + "non-truncated": 3580, + "padded": 3580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-nutrition|5": { + "hashes": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "59753c2144ea93af", + "hash_cont_tokens": "23b2dc6ee2da4cfc" + }, + "truncated": 0, + "non-truncated": 1224, + "padded": 1224, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-philosophy|5": { + "hashes": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "bd8d3dbed15a8c34", + "hash_cont_tokens": "9f6ff69d23a48783" + }, + "truncated": 0, + "non-truncated": 1244, + "padded": 1244, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-prehistory|5": { + "hashes": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + 
"hash_input_tokens": "3573cd87facbb7c5", + "hash_cont_tokens": "d6458d743d875837" + }, + "truncated": 0, + "non-truncated": 1296, + "padded": 1296, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_accounting|5": { + "hashes": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "17e721bc1a7cbb47", + "hash_cont_tokens": "922a195f53a35662" + }, + "truncated": 0, + "non-truncated": 1128, + "padded": 1128, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_law|5": { + "hashes": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "c9f7583fff66d361", + "hash_cont_tokens": "2e590029ef41fbcd" + }, + "truncated": 0, + "non-truncated": 6136, + "padded": 6136, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_medicine|5": { + "hashes": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "40a933f829116f8d", + "hash_cont_tokens": "7cfee54dbddd5a98" + }, + "truncated": 0, + "non-truncated": 1088, + "padded": 1088, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_psychology|5": { + "hashes": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "0dfb73a8eb3f692c", + "hash_cont_tokens": "a86677b2a45c20e1" + }, + "truncated": 0, + "non-truncated": 2448, + "padded": 2448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-public_relations|5": { + "hashes": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "1710c6ba4c9f3cbd", + "hash_cont_tokens": "0d756ccaae031757" + }, + "truncated": 0, + "non-truncated": 440, + "padded": 440, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-security_studies|5": { + "hashes": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "32a03f1f22a6e103", + "hash_cont_tokens": "b2229bc2cfbf594b" + }, + "truncated": 0, + "non-truncated": 980, + "padded": 980, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-sociology|5": { + "hashes": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "828999f7624cbe7e", + "hash_cont_tokens": "c3a3bdfd177eed5b" + }, + "truncated": 0, + "non-truncated": 804, + "padded": 804, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hashes": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "42054621e718dbee", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-virology|5": { + "hashes": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "6c4f0aa4dc859c04", + "hash_cont_tokens": "af8b3658088cb37f" + }, + "truncated": 0, + "non-truncated": 664, + "padded": 664, 
+ "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-world_religions|5": { + "hashes": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "6c75d44e092ff24f", + "hash_cont_tokens": "060118bef6de4e0a" + }, + "truncated": 0, + "non-truncated": 684, + "padded": 684, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|truthfulqa:mc|0": { + "hashes": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "2738d7ed7075faa7", + "hash_cont_tokens": "f5da56a132aab151" + }, + "truncated": 0, + "non-truncated": 9996, + "padded": 9996, + "non-padded": 0, + "effective_few_shots": 0.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "d84d18e9a963753d", + "hash_full_prompts": "12b540783521a8e6", + "hash_input_tokens": "5c73a7dce6ccf737", + "hash_cont_tokens": "71d56183130fecbd" + }, + "total_evaluation_time_secondes": "6461.523278474808", + "truncated": 0, + "non-truncated": 111019, + "padded": 110926, + "non-padded": 93, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/Undi95/ReMM-v2.1-L2-13B/results_2023-10-28T01-20-40.320894.json b/eval-results/Undi95/ReMM-v2.1-L2-13B/results_2023-10-28T01-20-40.320894.json new file mode 100644 index 0000000000000000000000000000000000000000..8be025e82e2cbe49eef85ff56d034aa5ef81a28d --- /dev/null +++ b/eval-results/Undi95/ReMM-v2.1-L2-13B/results_2023-10-28T01-20-40.320894.json @@ -0,0 +1,107 @@ +{ + "config_general": { + "model_name": "Undi95/ReMM-v2.1-L2-13B", + "model_sha": "e6b5ac97f74355cb281a621261debe5720fb4da2", + "model_size": "24.32 GB", + "model_dtype": "torch.float16", + "lighteval_sha": "0f318ecf002208468154899217b3ba7c6ae09374", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|drop|3": { + "em": 0.061556208053691275, + "em_stderr": 0.0024613859292232257, + "f1": 0.12617135067114038, + "f1_stderr": 0.002725179835134867 + }, + "harness|gsm8k|5": { + "acc": 0.12736921910538287, + "acc_stderr": 0.009183110326737822 + }, + "harness|winogrande|5": { + "acc": 0.7592738752959748, + "acc_stderr": 0.012015559212224178 + }, + "all": { + "em": 0.061556208053691275, + "em_stderr": 0.0024613859292232257, + "f1": 0.12617135067114038, + "f1_stderr": 0.002725179835134867, + "acc": 0.44332154720067884, + "acc_stderr": 0.010599334769481 + } + }, + "versions": { + "harness|drop|3": 1, + "harness|gsm8k|5": 0, + "harness|winogrande|5": 0, + "all": 0 + }, + "config_tasks": { + "harness|drop": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|drop|3": { + "hashes": { + "hash_examples": "1d27416e8324e9a3", + "hash_full_prompts": "a5513ff9a741b385", + "hash_input_tokens": "42076f0efbb50aa6", + "hash_cont_tokens": "2d343c735e5be172" + }, + "truncated": 3, + "non-truncated": 9533, + "padded": 0, + "non-padded": 9536, + "effective_few_shots": 3.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "bda342e47b5099b2", + "hash_cont_tokens": "84d68d356ee939ab" + }, + "truncated": 0, + "non-truncated": 1319, + "padded": 0, + "non-padded": 1319, + "effective_few_shots": 5.0, + 
"num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "c0bedf98cb040854", + "hash_cont_tokens": "f08975ad6f2d5864" + }, + "truncated": 0, + "non-truncated": 2534, + "padded": 2432, + "non-padded": 102, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "9b4d8993161e637d", + "hash_full_prompts": "08215e527b7e60a5", + "hash_input_tokens": "a12f3e3c934bd78b", + "hash_cont_tokens": "863adca1b6395d66" + }, + "total_evaluation_time_secondes": "41044.25022959709", + "truncated": 3, + "non-truncated": 13386, + "padded": 2432, + "non-padded": 10957, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/Undi95/ReMM-v2.2-L2-13B/results_2023-10-03T16-45-21.105610.json b/eval-results/Undi95/ReMM-v2.2-L2-13B/results_2023-10-03T16-45-21.105610.json new file mode 100644 index 0000000000000000000000000000000000000000..13e54915f5d31949232f17a7b624f825db70c7f3 --- /dev/null +++ b/eval-results/Undi95/ReMM-v2.2-L2-13B/results_2023-10-03T16-45-21.105610.json @@ -0,0 +1,1367 @@ +{ + "config_general": { + "model_name": "Undi95/ReMM-v2.2-L2-13B", + "model_sha": "d55031fbcd41d749bc0c0ffbcd85636718d373b6", + "model_size": "24.32 GB", + "model_dtype": "torch.float16", + "lighteval_sha": "0f318ecf002208468154899217b3ba7c6ae09374", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|arc:challenge|25": { + "acc": 0.5836177474402731, + "acc_stderr": 0.01440561827943618, + "acc_norm": 0.6126279863481229, + "acc_norm_stderr": 0.014235872487909869 + }, + "harness|hellaswag|10": { + "acc": 0.6488747261501693, + "acc_stderr": 0.004763465139038559, + "acc_norm": 0.8415654252141008, + "acc_norm_stderr": 0.003644017383711596 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.37, + "acc_stderr": 0.04852365870939099, + "acc_norm": 0.37, + "acc_norm_stderr": 0.04852365870939099 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.5111111111111111, + "acc_stderr": 0.04318275491977976, + "acc_norm": 0.5111111111111111, + "acc_norm_stderr": 0.04318275491977976 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.5328947368421053, + "acc_stderr": 0.040601270352363966, + "acc_norm": 0.5328947368421053, + "acc_norm_stderr": 0.040601270352363966 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.56, + "acc_stderr": 0.04988876515698589, + "acc_norm": 0.56, + "acc_norm_stderr": 0.04988876515698589 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.5924528301886792, + "acc_stderr": 0.030242233800854494, + "acc_norm": 0.5924528301886792, + "acc_norm_stderr": 0.030242233800854494 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.6041666666666666, + "acc_stderr": 0.04089465449325582, + "acc_norm": 0.6041666666666666, + "acc_norm_stderr": 0.04089465449325582 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.34, + "acc_stderr": 0.04760952285695235, + "acc_norm": 0.34, + "acc_norm_stderr": 0.04760952285695235 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.44, + "acc_stderr": 0.04988876515698589, + "acc_norm": 0.44, + "acc_norm_stderr": 0.04988876515698589 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.35, + "acc_stderr": 0.047937248544110196, + "acc_norm": 0.35, + "acc_norm_stderr": 
0.047937248544110196 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.5317919075144508, + "acc_stderr": 0.03804749744364764, + "acc_norm": 0.5317919075144508, + "acc_norm_stderr": 0.03804749744364764 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.24509803921568626, + "acc_stderr": 0.04280105837364397, + "acc_norm": 0.24509803921568626, + "acc_norm_stderr": 0.04280105837364397 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.71, + "acc_stderr": 0.04560480215720685, + "acc_norm": 0.71, + "acc_norm_stderr": 0.04560480215720685 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.4723404255319149, + "acc_stderr": 0.03263597118409769, + "acc_norm": 0.4723404255319149, + "acc_norm_stderr": 0.03263597118409769 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.2894736842105263, + "acc_stderr": 0.04266339443159394, + "acc_norm": 0.2894736842105263, + "acc_norm_stderr": 0.04266339443159394 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.5172413793103449, + "acc_stderr": 0.04164188720169375, + "acc_norm": 0.5172413793103449, + "acc_norm_stderr": 0.04164188720169375 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.3439153439153439, + "acc_stderr": 0.024464426625596433, + "acc_norm": 0.3439153439153439, + "acc_norm_stderr": 0.024464426625596433 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.3492063492063492, + "acc_stderr": 0.04263906892795132, + "acc_norm": 0.3492063492063492, + "acc_norm_stderr": 0.04263906892795132 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.38, + "acc_stderr": 0.04878317312145633, + "acc_norm": 0.38, + "acc_norm_stderr": 0.04878317312145633 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.6483870967741936, + "acc_stderr": 0.02716253782694846, + "acc_norm": 0.6483870967741936, + "acc_norm_stderr": 0.02716253782694846 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.43842364532019706, + "acc_stderr": 0.03491207857486518, + "acc_norm": 0.43842364532019706, + "acc_norm_stderr": 0.03491207857486518 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.56, + "acc_stderr": 0.04988876515698589, + "acc_norm": 0.56, + "acc_norm_stderr": 0.04988876515698589 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.6666666666666666, + "acc_stderr": 0.0368105086916155, + "acc_norm": 0.6666666666666666, + "acc_norm_stderr": 0.0368105086916155 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.702020202020202, + "acc_stderr": 0.03258630383836556, + "acc_norm": 0.702020202020202, + "acc_norm_stderr": 0.03258630383836556 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.7979274611398963, + "acc_stderr": 0.028979089794296732, + "acc_norm": 0.7979274611398963, + "acc_norm_stderr": 0.028979089794296732 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.5256410256410257, + "acc_stderr": 0.025317649726448663, + "acc_norm": 0.5256410256410257, + "acc_norm_stderr": 0.025317649726448663 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.32592592592592595, + "acc_stderr": 0.028578348365473072, + "acc_norm": 0.32592592592592595, + "acc_norm_stderr": 0.028578348365473072 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.5966386554621849, + "acc_stderr": 0.031866081214088314, + "acc_norm": 0.5966386554621849, + "acc_norm_stderr": 0.031866081214088314 + }, + 
"harness|hendrycksTest-high_school_physics|5": { + "acc": 0.3509933774834437, + "acc_stderr": 0.03896981964257375, + "acc_norm": 0.3509933774834437, + "acc_norm_stderr": 0.03896981964257375 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.7302752293577982, + "acc_stderr": 0.01902848671111544, + "acc_norm": 0.7302752293577982, + "acc_norm_stderr": 0.01902848671111544 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.41203703703703703, + "acc_stderr": 0.03356787758160835, + "acc_norm": 0.41203703703703703, + "acc_norm_stderr": 0.03356787758160835 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.7647058823529411, + "acc_stderr": 0.029771775228145628, + "acc_norm": 0.7647058823529411, + "acc_norm_stderr": 0.029771775228145628 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.7637130801687764, + "acc_stderr": 0.02765215314415927, + "acc_norm": 0.7637130801687764, + "acc_norm_stderr": 0.02765215314415927 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.6771300448430493, + "acc_stderr": 0.03138147637575499, + "acc_norm": 0.6771300448430493, + "acc_norm_stderr": 0.03138147637575499 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.6183206106870229, + "acc_stderr": 0.04260735157644559, + "acc_norm": 0.6183206106870229, + "acc_norm_stderr": 0.04260735157644559 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.7272727272727273, + "acc_stderr": 0.04065578140908705, + "acc_norm": 0.7272727272727273, + "acc_norm_stderr": 0.04065578140908705 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.75, + "acc_stderr": 0.04186091791394607, + "acc_norm": 0.75, + "acc_norm_stderr": 0.04186091791394607 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.6993865030674846, + "acc_stderr": 0.03602511318806771, + "acc_norm": 0.6993865030674846, + "acc_norm_stderr": 0.03602511318806771 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.36607142857142855, + "acc_stderr": 0.0457237235873743, + "acc_norm": 0.36607142857142855, + "acc_norm_stderr": 0.0457237235873743 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.7087378640776699, + "acc_stderr": 0.044986763205729224, + "acc_norm": 0.7087378640776699, + "acc_norm_stderr": 0.044986763205729224 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.7948717948717948, + "acc_stderr": 0.026453508054040332, + "acc_norm": 0.7948717948717948, + "acc_norm_stderr": 0.026453508054040332 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.59, + "acc_stderr": 0.049431107042371025, + "acc_norm": 0.59, + "acc_norm_stderr": 0.049431107042371025 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.7611749680715197, + "acc_stderr": 0.015246803197398675, + "acc_norm": 0.7611749680715197, + "acc_norm_stderr": 0.015246803197398675 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.6358381502890174, + "acc_stderr": 0.025906632631016127, + "acc_norm": 0.6358381502890174, + "acc_norm_stderr": 0.025906632631016127 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.45363128491620114, + "acc_stderr": 0.016650437588269073, + "acc_norm": 0.45363128491620114, + "acc_norm_stderr": 0.016650437588269073 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.6143790849673203, + "acc_stderr": 0.02787074527829028, + "acc_norm": 0.6143790849673203, + "acc_norm_stderr": 0.02787074527829028 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.6430868167202572, + "acc_stderr": 
0.027210420375934023, + "acc_norm": 0.6430868167202572, + "acc_norm_stderr": 0.027210420375934023 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.6203703703703703, + "acc_stderr": 0.027002521034516478, + "acc_norm": 0.6203703703703703, + "acc_norm_stderr": 0.027002521034516478 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.425531914893617, + "acc_stderr": 0.029494827600144373, + "acc_norm": 0.425531914893617, + "acc_norm_stderr": 0.029494827600144373 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.42503259452411996, + "acc_stderr": 0.012625879884891998, + "acc_norm": 0.42503259452411996, + "acc_norm_stderr": 0.012625879884891998 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.5183823529411765, + "acc_stderr": 0.030352303395351964, + "acc_norm": 0.5183823529411765, + "acc_norm_stderr": 0.030352303395351964 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.5816993464052288, + "acc_stderr": 0.019955975145835546, + "acc_norm": 0.5816993464052288, + "acc_norm_stderr": 0.019955975145835546 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.6545454545454545, + "acc_stderr": 0.04554619617541054, + "acc_norm": 0.6545454545454545, + "acc_norm_stderr": 0.04554619617541054 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.6244897959183674, + "acc_stderr": 0.03100120903989484, + "acc_norm": 0.6244897959183674, + "acc_norm_stderr": 0.03100120903989484 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.6865671641791045, + "acc_stderr": 0.032801882053486435, + "acc_norm": 0.6865671641791045, + "acc_norm_stderr": 0.032801882053486435 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.82, + "acc_stderr": 0.038612291966536934, + "acc_norm": 0.82, + "acc_norm_stderr": 0.038612291966536934 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.463855421686747, + "acc_stderr": 0.03882310850890594, + "acc_norm": 0.463855421686747, + "acc_norm_stderr": 0.03882310850890594 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.7602339181286549, + "acc_stderr": 0.03274485211946956, + "acc_norm": 0.7602339181286549, + "acc_norm_stderr": 0.03274485211946956 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.36964504283965727, + "mc1_stderr": 0.016898180706973888, + "mc2": 0.5135116682163505, + "mc2_stderr": 0.015657648011440012 + }, + "all": { + "acc": 0.564002539739021, + "acc_stderr": 0.034462415944422044, + "acc_norm": 0.567760182755492, + "acc_norm_stderr": 0.034440565206339785, + "mc1": 0.36964504283965727, + "mc1_stderr": 0.016898180706973888, + "mc2": 0.5135116682163505, + "mc2_stderr": 0.015657648011440012 + } + }, + "versions": { + "harness|arc:challenge|25": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + 
"harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "all": 0 + }, + "config_tasks": { + "harness|arc:challenge": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM 
Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task" + }, + "summary_tasks": { + "harness|arc:challenge|25": { + "hashes": { + "hash_examples": "17b0cae357c0259e", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "3722289b79076c44", + "hash_cont_tokens": "e8abf848493b50f7" + }, + "truncated": 0, + "non-truncated": 4687, + "padded": 4687, + "non-padded": 0, + "effective_few_shots": 25.0, + "num_truncated_few_shots": 0 + }, + "harness|hellaswag|10": { + "hashes": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "ececd684171f1ef2", + "hash_cont_tokens": "9fe0a5c42e1532db" + }, + "truncated": 0, + "non-truncated": 40168, + "padded": 40113, + "non-padded": 55, + "effective_few_shots": 10.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hashes": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "c54ff61ad0273dd7", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-anatomy|5": { + 
"hashes": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "be31a1e22aef5f90", + "hash_cont_tokens": "f11971a765cb609f" + }, + "truncated": 0, + "non-truncated": 540, + "padded": 540, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-astronomy|5": { + "hashes": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "277a7b1fad566940", + "hash_cont_tokens": "440a970fadecdc7b" + }, + "truncated": 0, + "non-truncated": 608, + "padded": 608, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-business_ethics|5": { + "hashes": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "ba552605bc116de5", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hashes": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "428c7563d0b98ab9", + "hash_cont_tokens": "7ecd60c25b9bfe5b" + }, + "truncated": 0, + "non-truncated": 1060, + "padded": 1060, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_biology|5": { + "hashes": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "da036601573942e2", + "hash_cont_tokens": "875cde3af7a0ee14" + }, + "truncated": 0, + "non-truncated": 576, + "padded": 576, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_chemistry|5": { + "hashes": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "94e0196d6aded13d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_computer_science|5": { + "hashes": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "6e4d0f4a8d36690b", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_mathematics|5": { + "hashes": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "614054d17109a25d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_medicine|5": { + "hashes": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "081bb2b524defd1c", + "hash_cont_tokens": "702fb6d82ff0d6ac" + }, + "truncated": 0, + "non-truncated": 692, + "padded": 692, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_physics|5": { + "hashes": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "5421d9a1af86cbd4", + 
"hash_cont_tokens": "f7b8097afc16a47c" + }, + "truncated": 0, + "non-truncated": 408, + "padded": 408, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-computer_security|5": { + "hashes": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "5e6b70ecb333cf18", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hashes": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "c2ef11a87264ceed", + "hash_cont_tokens": "aa0e8bc655f2f641" + }, + "truncated": 0, + "non-truncated": 940, + "padded": 940, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-econometrics|5": { + "hashes": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "ecaccd912a4c3978", + "hash_cont_tokens": "b1cc6e7e9fcd3827" + }, + "truncated": 0, + "non-truncated": 456, + "padded": 456, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hashes": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "1590c84291399be8", + "hash_cont_tokens": "2425a3f084a591ef" + }, + "truncated": 0, + "non-truncated": 580, + "padded": 580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hashes": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "3269597f715b0da1", + "hash_cont_tokens": "bd87bf0c060fd925" + }, + "truncated": 0, + "non-truncated": 1512, + "padded": 1512, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-formal_logic|5": { + "hashes": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "a2800d20f3ab8d7c", + "hash_cont_tokens": "eb8932890e0605db" + }, + "truncated": 0, + "non-truncated": 504, + "padded": 504, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-global_facts|5": { + "hashes": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "94ed44b3772505ad", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_biology|5": { + "hashes": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "24423acb928db768", + "hash_cont_tokens": "1ddcb86d28cde266" + }, + "truncated": 0, + "non-truncated": 1240, + "padded": 1240, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hashes": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "831ff35c474e5cef", + "hash_cont_tokens": "176c8dcff38c5f8f" + }, + "truncated": 0, + "non-truncated": 812, + "padded": 812, + "non-padded": 0, + "effective_few_shots": 
5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hashes": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "a20a96b44dcc5b30", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hashes": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "5002f4ac8b1562ca", + "hash_cont_tokens": "674fc454bdc5ac93" + }, + "truncated": 0, + "non-truncated": 660, + "padded": 656, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_geography|5": { + "hashes": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "7c5547c7da5bc793", + "hash_cont_tokens": "03a5012b916274ea" + }, + "truncated": 0, + "non-truncated": 792, + "padded": 792, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hashes": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "f62991cb6a496b05", + "hash_cont_tokens": "873d2aab226ba1d8" + }, + "truncated": 0, + "non-truncated": 772, + "padded": 772, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hashes": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "4cef2aff6e3d59ed", + "hash_cont_tokens": "c583432ad27fcfe0" + }, + "truncated": 0, + "non-truncated": 1560, + "padded": 1560, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hashes": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "6e2577ea4082ed2b", + "hash_cont_tokens": "d7907b61bcb8c123" + }, + "truncated": 0, + "non-truncated": 1080, + "padded": 1080, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hashes": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "c5fc9aeb1079c8e4", + "hash_cont_tokens": "f47f041de50333b9" + }, + "truncated": 0, + "non-truncated": 952, + "padded": 952, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_physics|5": { + "hashes": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "555fc385cffa84ca", + "hash_cont_tokens": "0d56317b3e5eedb5" + }, + "truncated": 0, + "non-truncated": 604, + "padded": 604, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hashes": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "febd23cbf9973b7f", + "hash_cont_tokens": "09ba1243e7390c0f" + }, + "truncated": 0, + "non-truncated": 2180, + "padded": 2180, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + 
"harness|hendrycksTest-high_school_statistics|5": { + "hashes": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "400e55b56ee6fbd7", + "hash_cont_tokens": "9cc29889c3d3f77d" + }, + "truncated": 0, + "non-truncated": 864, + "padded": 864, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hashes": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "c639cce12a46ebad", + "hash_cont_tokens": "cdd0b3dc06d933e5" + }, + "truncated": 0, + "non-truncated": 816, + "padded": 816, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hashes": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "b9762065cce6f3a6", + "hash_cont_tokens": "e02816433ff28daf" + }, + "truncated": 0, + "non-truncated": 948, + "padded": 948, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_aging|5": { + "hashes": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "541a75f071dcf579", + "hash_cont_tokens": "142a4a8a1138a214" + }, + "truncated": 0, + "non-truncated": 892, + "padded": 892, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_sexuality|5": { + "hashes": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "04269e5c5a257dd9", + "hash_cont_tokens": "bc54813e809b796d" + }, + "truncated": 0, + "non-truncated": 524, + "padded": 524, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-international_law|5": { + "hashes": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "d93ba9d9d38e4397", + "hash_cont_tokens": "8ea8c5ff76a15bca" + }, + "truncated": 0, + "non-truncated": 484, + "padded": 484, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-jurisprudence|5": { + "hashes": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "9eeaccd2698b4f5a", + "hash_cont_tokens": "e3a8cd951b6e3469" + }, + "truncated": 0, + "non-truncated": 432, + "padded": 432, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hashes": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "b4f08f544f2b7576", + "hash_cont_tokens": "3e9e0bdc248fd88a" + }, + "truncated": 0, + "non-truncated": 652, + "padded": 648, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-machine_learning|5": { + "hashes": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "900c2a51f1174b9f", + "hash_cont_tokens": "55b12fb138c6a064" + }, + "truncated": 0, + "non-truncated": 448, + "padded": 448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-management|5": { + "hashes": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + 
"hash_input_tokens": "6b36efb4689c6eca", + "hash_cont_tokens": "a01d6d39a83c4597" + }, + "truncated": 0, + "non-truncated": 412, + "padded": 412, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-marketing|5": { + "hashes": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "2aaac78a0cfed47a", + "hash_cont_tokens": "6aeaed4d823c98aa" + }, + "truncated": 0, + "non-truncated": 936, + "padded": 936, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-medical_genetics|5": { + "hashes": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "886ca823b41c094a", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-miscellaneous|5": { + "hashes": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "72fd71de7675e7d0", + "hash_cont_tokens": "9b0ab02a64603081" + }, + "truncated": 0, + "non-truncated": 3132, + "padded": 3132, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_disputes|5": { + "hashes": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "f3ca0dd8e7a1eb09", + "hash_cont_tokens": "3b8bbe9108e55ce9" + }, + "truncated": 0, + "non-truncated": 1384, + "padded": 1354, + "non-padded": 30, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hashes": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "3e793631e951f23c", + "hash_cont_tokens": "3e9bfc0362e97330" + }, + "truncated": 0, + "non-truncated": 3580, + "padded": 3580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-nutrition|5": { + "hashes": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "59753c2144ea93af", + "hash_cont_tokens": "23b2dc6ee2da4cfc" + }, + "truncated": 0, + "non-truncated": 1224, + "padded": 1224, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-philosophy|5": { + "hashes": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "bd8d3dbed15a8c34", + "hash_cont_tokens": "9f6ff69d23a48783" + }, + "truncated": 0, + "non-truncated": 1244, + "padded": 1244, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-prehistory|5": { + "hashes": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "3573cd87facbb7c5", + "hash_cont_tokens": "d6458d743d875837" + }, + "truncated": 0, + "non-truncated": 1296, + "padded": 1296, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_accounting|5": { + "hashes": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "17e721bc1a7cbb47", + "hash_cont_tokens": "922a195f53a35662" + }, + "truncated": 0, + "non-truncated": 1128, + "padded": 1128, + "non-padded": 0, + 
"effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_law|5": { + "hashes": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "c9f7583fff66d361", + "hash_cont_tokens": "2e590029ef41fbcd" + }, + "truncated": 0, + "non-truncated": 6136, + "padded": 6136, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_medicine|5": { + "hashes": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "40a933f829116f8d", + "hash_cont_tokens": "7cfee54dbddd5a98" + }, + "truncated": 0, + "non-truncated": 1088, + "padded": 1088, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_psychology|5": { + "hashes": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "0dfb73a8eb3f692c", + "hash_cont_tokens": "a86677b2a45c20e1" + }, + "truncated": 0, + "non-truncated": 2448, + "padded": 2448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-public_relations|5": { + "hashes": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "1710c6ba4c9f3cbd", + "hash_cont_tokens": "0d756ccaae031757" + }, + "truncated": 0, + "non-truncated": 440, + "padded": 440, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-security_studies|5": { + "hashes": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "32a03f1f22a6e103", + "hash_cont_tokens": "b2229bc2cfbf594b" + }, + "truncated": 0, + "non-truncated": 980, + "padded": 980, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-sociology|5": { + "hashes": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "828999f7624cbe7e", + "hash_cont_tokens": "c3a3bdfd177eed5b" + }, + "truncated": 0, + "non-truncated": 804, + "padded": 804, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hashes": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "42054621e718dbee", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-virology|5": { + "hashes": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "6c4f0aa4dc859c04", + "hash_cont_tokens": "af8b3658088cb37f" + }, + "truncated": 0, + "non-truncated": 664, + "padded": 664, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-world_religions|5": { + "hashes": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "6c75d44e092ff24f", + "hash_cont_tokens": "060118bef6de4e0a" + }, + "truncated": 0, + "non-truncated": 684, + "padded": 684, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|truthfulqa:mc|0": { + "hashes": { + "hash_examples": "23176c0531c7b867", + 
"hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "2738d7ed7075faa7", + "hash_cont_tokens": "f5da56a132aab151" + }, + "truncated": 0, + "non-truncated": 9996, + "padded": 9996, + "non-padded": 0, + "effective_few_shots": 0.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "d84d18e9a963753d", + "hash_full_prompts": "12b540783521a8e6", + "hash_input_tokens": "5c73a7dce6ccf737", + "hash_cont_tokens": "71d56183130fecbd" + }, + "total_evaluation_time_secondes": "6402.331243515015", + "truncated": 0, + "non-truncated": 111019, + "padded": 110926, + "non-padded": 93, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/Undi95/ReMM-v2.2-L2-13B/results_2023-10-23T13-54-57.235808.json b/eval-results/Undi95/ReMM-v2.2-L2-13B/results_2023-10-23T13-54-57.235808.json new file mode 100644 index 0000000000000000000000000000000000000000..da695cfe2ce616f90e6907b50a9495233c4eb59e --- /dev/null +++ b/eval-results/Undi95/ReMM-v2.2-L2-13B/results_2023-10-23T13-54-57.235808.json @@ -0,0 +1,107 @@ +{ + "config_general": { + "model_name": "Undi95/ReMM-v2.2-L2-13B", + "model_sha": "d55031fbcd41d749bc0c0ffbcd85636718d373b6", + "model_size": "24.32 GB", + "model_dtype": "torch.float16", + "lighteval_sha": "0f318ecf002208468154899217b3ba7c6ae09374", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|drop|3": { + "em": 0.037751677852348994, + "em_stderr": 0.0019518721243716466, + "f1": 0.10559354026845587, + "f1_stderr": 0.00235422441511938 + }, + "harness|gsm8k|5": { + "acc": 0.14025777103866566, + "acc_stderr": 0.009565108281428673 + }, + "harness|winogrande|5": { + "acc": 0.7561168113654302, + "acc_stderr": 0.012068923278908192 + }, + "all": { + "em": 0.037751677852348994, + "em_stderr": 0.0019518721243716466, + "f1": 0.10559354026845587, + "f1_stderr": 0.00235422441511938, + "acc": 0.4481872912020479, + "acc_stderr": 0.010817015780168433 + } + }, + "versions": { + "harness|drop|3": 1, + "harness|gsm8k|5": 0, + "harness|winogrande|5": 0, + "all": 0 + }, + "config_tasks": { + "harness|drop": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|drop|3": { + "hashes": { + "hash_examples": "1d27416e8324e9a3", + "hash_full_prompts": "a5513ff9a741b385", + "hash_input_tokens": "42076f0efbb50aa6", + "hash_cont_tokens": "f257715986d77784" + }, + "truncated": 3, + "non-truncated": 9533, + "padded": 0, + "non-padded": 9536, + "effective_few_shots": 3.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "bda342e47b5099b2", + "hash_cont_tokens": "9c96d515aa3c05bf" + }, + "truncated": 0, + "non-truncated": 1319, + "padded": 0, + "non-padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "c0bedf98cb040854", + "hash_cont_tokens": "f08975ad6f2d5864" + }, + "truncated": 0, + "non-truncated": 2534, + "padded": 2432, + "non-padded": 102, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "9b4d8993161e637d", + "hash_full_prompts": "08215e527b7e60a5", + "hash_input_tokens": 
"a12f3e3c934bd78b", + "hash_cont_tokens": "cc59410c0197230e" + }, + "total_evaluation_time_secondes": "12539.735690116882", + "truncated": 3, + "non-truncated": 13386, + "padded": 2432, + "non-padded": 10957, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/Undi95/U-Amethyst-20B/results_2023-10-03T18-44-08.205769.json b/eval-results/Undi95/U-Amethyst-20B/results_2023-10-03T18-44-08.205769.json new file mode 100644 index 0000000000000000000000000000000000000000..18d0cbeec8b501afece387e4025321bd5eaf4fce --- /dev/null +++ b/eval-results/Undi95/U-Amethyst-20B/results_2023-10-03T18-44-08.205769.json @@ -0,0 +1,1367 @@ +{ + "config_general": { + "model_name": "Undi95/U-Amethyst-20B", + "model_sha": "c0cbe0b3c88041bb6beef27dbe85146af8dddec9", + "model_size": "37.36 GB", + "model_dtype": "torch.float16", + "lighteval_sha": "0f318ecf002208468154899217b3ba7c6ae09374", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|arc:challenge|25": { + "acc": 0.5878839590443686, + "acc_stderr": 0.014383915302225403, + "acc_norm": 0.6220136518771331, + "acc_norm_stderr": 0.014169664520303101 + }, + "harness|hellaswag|10": { + "acc": 0.6400119498107947, + "acc_stderr": 0.004790155370993448, + "acc_norm": 0.8311093407687712, + "acc_norm_stderr": 0.0037388962449538144 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.39, + "acc_stderr": 0.049020713000019756, + "acc_norm": 0.39, + "acc_norm_stderr": 0.049020713000019756 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.48148148148148145, + "acc_stderr": 0.043163785995113245, + "acc_norm": 0.48148148148148145, + "acc_norm_stderr": 0.043163785995113245 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.5526315789473685, + "acc_stderr": 0.04046336883978251, + "acc_norm": 0.5526315789473685, + "acc_norm_stderr": 0.04046336883978251 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.59, + "acc_stderr": 0.049431107042371025, + "acc_norm": 0.59, + "acc_norm_stderr": 0.049431107042371025 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.6037735849056604, + "acc_stderr": 0.030102793781791197, + "acc_norm": 0.6037735849056604, + "acc_norm_stderr": 0.030102793781791197 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.6180555555555556, + "acc_stderr": 0.040629907841466674, + "acc_norm": 0.6180555555555556, + "acc_norm_stderr": 0.040629907841466674 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.39, + "acc_stderr": 0.04902071300001975, + "acc_norm": 0.39, + "acc_norm_stderr": 0.04902071300001975 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.45, + "acc_stderr": 0.049999999999999996, + "acc_norm": 0.45, + "acc_norm_stderr": 0.049999999999999996 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.3, + "acc_stderr": 0.046056618647183814, + "acc_norm": 0.3, + "acc_norm_stderr": 0.046056618647183814 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.5202312138728323, + "acc_stderr": 0.03809342081273957, + "acc_norm": 0.5202312138728323, + "acc_norm_stderr": 0.03809342081273957 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.3235294117647059, + "acc_stderr": 0.046550104113196156, + "acc_norm": 0.3235294117647059, + "acc_norm_stderr": 0.046550104113196156 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.69, + "acc_stderr": 0.04648231987117316, + "acc_norm": 0.69, + 
"acc_norm_stderr": 0.04648231987117316 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.46808510638297873, + "acc_stderr": 0.03261936918467382, + "acc_norm": 0.46808510638297873, + "acc_norm_stderr": 0.03261936918467382 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.2719298245614035, + "acc_stderr": 0.04185774424022056, + "acc_norm": 0.2719298245614035, + "acc_norm_stderr": 0.04185774424022056 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.4896551724137931, + "acc_stderr": 0.04165774775728763, + "acc_norm": 0.4896551724137931, + "acc_norm_stderr": 0.04165774775728763 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.3201058201058201, + "acc_stderr": 0.024026846392873502, + "acc_norm": 0.3201058201058201, + "acc_norm_stderr": 0.024026846392873502 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.2857142857142857, + "acc_stderr": 0.0404061017820884, + "acc_norm": 0.2857142857142857, + "acc_norm_stderr": 0.0404061017820884 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.36, + "acc_stderr": 0.04824181513244218, + "acc_norm": 0.36, + "acc_norm_stderr": 0.04824181513244218 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.6548387096774193, + "acc_stderr": 0.02704574657353433, + "acc_norm": 0.6548387096774193, + "acc_norm_stderr": 0.02704574657353433 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.458128078817734, + "acc_stderr": 0.03505630140785741, + "acc_norm": 0.458128078817734, + "acc_norm_stderr": 0.03505630140785741 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.56, + "acc_stderr": 0.04988876515698589, + "acc_norm": 0.56, + "acc_norm_stderr": 0.04988876515698589 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.6727272727272727, + "acc_stderr": 0.03663974994391244, + "acc_norm": 0.6727272727272727, + "acc_norm_stderr": 0.03663974994391244 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.7222222222222222, + "acc_stderr": 0.03191178226713547, + "acc_norm": 0.7222222222222222, + "acc_norm_stderr": 0.03191178226713547 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.8134715025906736, + "acc_stderr": 0.02811209121011748, + "acc_norm": 0.8134715025906736, + "acc_norm_stderr": 0.02811209121011748 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.5666666666666667, + "acc_stderr": 0.025124653525885117, + "acc_norm": 0.5666666666666667, + "acc_norm_stderr": 0.025124653525885117 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.2740740740740741, + "acc_stderr": 0.027195934804085622, + "acc_norm": 0.2740740740740741, + "acc_norm_stderr": 0.027195934804085622 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.6176470588235294, + "acc_stderr": 0.03156663099215416, + "acc_norm": 0.6176470588235294, + "acc_norm_stderr": 0.03156663099215416 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.3443708609271523, + "acc_stderr": 0.03879687024073327, + "acc_norm": 0.3443708609271523, + "acc_norm_stderr": 0.03879687024073327 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.7137614678899082, + "acc_stderr": 0.01937943662891998, + "acc_norm": 0.7137614678899082, + "acc_norm_stderr": 0.01937943662891998 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.4444444444444444, + "acc_stderr": 0.03388857118502326, + "acc_norm": 0.4444444444444444, + 
"acc_norm_stderr": 0.03388857118502326 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.7696078431372549, + "acc_stderr": 0.029554292605695066, + "acc_norm": 0.7696078431372549, + "acc_norm_stderr": 0.029554292605695066 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.7341772151898734, + "acc_stderr": 0.028756799629658346, + "acc_norm": 0.7341772151898734, + "acc_norm_stderr": 0.028756799629658346 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.6771300448430493, + "acc_stderr": 0.03138147637575499, + "acc_norm": 0.6771300448430493, + "acc_norm_stderr": 0.03138147637575499 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.6335877862595419, + "acc_stderr": 0.042258754519696365, + "acc_norm": 0.6335877862595419, + "acc_norm_stderr": 0.042258754519696365 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.7520661157024794, + "acc_stderr": 0.03941897526516304, + "acc_norm": 0.7520661157024794, + "acc_norm_stderr": 0.03941897526516304 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.7037037037037037, + "acc_stderr": 0.044143436668549335, + "acc_norm": 0.7037037037037037, + "acc_norm_stderr": 0.044143436668549335 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.6687116564417178, + "acc_stderr": 0.03697983910025588, + "acc_norm": 0.6687116564417178, + "acc_norm_stderr": 0.03697983910025588 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.29464285714285715, + "acc_stderr": 0.043270409325787296, + "acc_norm": 0.29464285714285715, + "acc_norm_stderr": 0.043270409325787296 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.6601941747572816, + "acc_stderr": 0.046897659372781335, + "acc_norm": 0.6601941747572816, + "acc_norm_stderr": 0.046897659372781335 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.8076923076923077, + "acc_stderr": 0.025819233256483706, + "acc_norm": 0.8076923076923077, + "acc_norm_stderr": 0.025819233256483706 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.59, + "acc_stderr": 0.04943110704237102, + "acc_norm": 0.59, + "acc_norm_stderr": 0.04943110704237102 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.7266922094508301, + "acc_stderr": 0.015936681062628556, + "acc_norm": 0.7266922094508301, + "acc_norm_stderr": 0.015936681062628556 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.6502890173410405, + "acc_stderr": 0.02567428145653102, + "acc_norm": 0.6502890173410405, + "acc_norm_stderr": 0.02567428145653102 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.3206703910614525, + "acc_stderr": 0.0156099295593484, + "acc_norm": 0.3206703910614525, + "acc_norm_stderr": 0.0156099295593484 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.6568627450980392, + "acc_stderr": 0.02718449890994162, + "acc_norm": 0.6568627450980392, + "acc_norm_stderr": 0.02718449890994162 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.662379421221865, + "acc_stderr": 0.026858825879488544, + "acc_norm": 0.662379421221865, + "acc_norm_stderr": 0.026858825879488544 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.6728395061728395, + "acc_stderr": 0.026105673861409814, + "acc_norm": 0.6728395061728395, + "acc_norm_stderr": 0.026105673861409814 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.425531914893617, + "acc_stderr": 0.02949482760014437, + "acc_norm": 0.425531914893617, + "acc_norm_stderr": 0.02949482760014437 + }, + "harness|hendrycksTest-professional_law|5": { + 
"acc": 0.4445893089960887, + "acc_stderr": 0.012691575792657114, + "acc_norm": 0.4445893089960887, + "acc_norm_stderr": 0.012691575792657114 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.5477941176470589, + "acc_stderr": 0.03023375855159644, + "acc_norm": 0.5477941176470589, + "acc_norm_stderr": 0.03023375855159644 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.5784313725490197, + "acc_stderr": 0.019977422600227474, + "acc_norm": 0.5784313725490197, + "acc_norm_stderr": 0.019977422600227474 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.5818181818181818, + "acc_stderr": 0.04724577405731572, + "acc_norm": 0.5818181818181818, + "acc_norm_stderr": 0.04724577405731572 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.6204081632653061, + "acc_stderr": 0.03106721126287247, + "acc_norm": 0.6204081632653061, + "acc_norm_stderr": 0.03106721126287247 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.7263681592039801, + "acc_stderr": 0.03152439186555402, + "acc_norm": 0.7263681592039801, + "acc_norm_stderr": 0.03152439186555402 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.82, + "acc_stderr": 0.03861229196653694, + "acc_norm": 0.82, + "acc_norm_stderr": 0.03861229196653694 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.42771084337349397, + "acc_stderr": 0.038515976837185335, + "acc_norm": 0.42771084337349397, + "acc_norm_stderr": 0.038515976837185335 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.7485380116959064, + "acc_stderr": 0.033275044238468436, + "acc_norm": 0.7485380116959064, + "acc_norm_stderr": 0.033275044238468436 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.3598531211750306, + "mc1_stderr": 0.016801860466677143, + "mc2": 0.5320122237340842, + "mc2_stderr": 0.015624089171491088 + }, + "all": { + "acc": 0.5606420063064564, + "acc_stderr": 0.034398732723866246, + "acc_norm": 0.5644594145062996, + "acc_norm_stderr": 0.034377283403392314, + "mc1": 0.3598531211750306, + "mc1_stderr": 0.016801860466677143, + "mc2": 0.5320122237340842, + "mc2_stderr": 0.015624089171491088 + } + }, + "versions": { + "harness|arc:challenge|25": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + 
"harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "all": 0 + }, + "config_tasks": { + "harness|arc:challenge": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + 
"harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task" + }, + "summary_tasks": { + "harness|arc:challenge|25": { + "hashes": { + "hash_examples": "17b0cae357c0259e", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "3722289b79076c44", + "hash_cont_tokens": "e8abf848493b50f7" + }, + "truncated": 0, + "non-truncated": 4687, + "padded": 4687, + "non-padded": 0, + "effective_few_shots": 25.0, + "num_truncated_few_shots": 0 + }, + "harness|hellaswag|10": { + "hashes": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "ececd684171f1ef2", + "hash_cont_tokens": "9fe0a5c42e1532db" + }, + "truncated": 0, + "non-truncated": 40168, + "padded": 40113, + "non-padded": 55, + "effective_few_shots": 10.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hashes": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "c54ff61ad0273dd7", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-anatomy|5": { + "hashes": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "be31a1e22aef5f90", + "hash_cont_tokens": "f11971a765cb609f" + }, + "truncated": 0, + "non-truncated": 540, + "padded": 540, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-astronomy|5": { + "hashes": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": 
"faf4e80f65de93ca", + "hash_input_tokens": "277a7b1fad566940", + "hash_cont_tokens": "440a970fadecdc7b" + }, + "truncated": 0, + "non-truncated": 608, + "padded": 608, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-business_ethics|5": { + "hashes": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "ba552605bc116de5", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hashes": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "428c7563d0b98ab9", + "hash_cont_tokens": "7ecd60c25b9bfe5b" + }, + "truncated": 0, + "non-truncated": 1060, + "padded": 1060, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_biology|5": { + "hashes": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "da036601573942e2", + "hash_cont_tokens": "875cde3af7a0ee14" + }, + "truncated": 0, + "non-truncated": 576, + "padded": 576, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_chemistry|5": { + "hashes": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "94e0196d6aded13d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_computer_science|5": { + "hashes": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "6e4d0f4a8d36690b", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_mathematics|5": { + "hashes": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "614054d17109a25d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_medicine|5": { + "hashes": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "081bb2b524defd1c", + "hash_cont_tokens": "702fb6d82ff0d6ac" + }, + "truncated": 0, + "non-truncated": 692, + "padded": 692, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_physics|5": { + "hashes": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "5421d9a1af86cbd4", + "hash_cont_tokens": "f7b8097afc16a47c" + }, + "truncated": 0, + "non-truncated": 408, + "padded": 408, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-computer_security|5": { + "hashes": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "5e6b70ecb333cf18", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 
400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hashes": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "c2ef11a87264ceed", + "hash_cont_tokens": "aa0e8bc655f2f641" + }, + "truncated": 0, + "non-truncated": 940, + "padded": 940, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-econometrics|5": { + "hashes": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "ecaccd912a4c3978", + "hash_cont_tokens": "b1cc6e7e9fcd3827" + }, + "truncated": 0, + "non-truncated": 456, + "padded": 456, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hashes": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "1590c84291399be8", + "hash_cont_tokens": "2425a3f084a591ef" + }, + "truncated": 0, + "non-truncated": 580, + "padded": 580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hashes": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "3269597f715b0da1", + "hash_cont_tokens": "bd87bf0c060fd925" + }, + "truncated": 0, + "non-truncated": 1512, + "padded": 1512, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-formal_logic|5": { + "hashes": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "a2800d20f3ab8d7c", + "hash_cont_tokens": "eb8932890e0605db" + }, + "truncated": 0, + "non-truncated": 504, + "padded": 504, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-global_facts|5": { + "hashes": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "94ed44b3772505ad", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_biology|5": { + "hashes": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "24423acb928db768", + "hash_cont_tokens": "1ddcb86d28cde266" + }, + "truncated": 0, + "non-truncated": 1240, + "padded": 1240, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hashes": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "831ff35c474e5cef", + "hash_cont_tokens": "176c8dcff38c5f8f" + }, + "truncated": 0, + "non-truncated": 812, + "padded": 812, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hashes": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "a20a96b44dcc5b30", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + 
"harness|hendrycksTest-high_school_european_history|5": { + "hashes": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "5002f4ac8b1562ca", + "hash_cont_tokens": "674fc454bdc5ac93" + }, + "truncated": 0, + "non-truncated": 660, + "padded": 656, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_geography|5": { + "hashes": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "7c5547c7da5bc793", + "hash_cont_tokens": "03a5012b916274ea" + }, + "truncated": 0, + "non-truncated": 792, + "padded": 792, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hashes": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "f62991cb6a496b05", + "hash_cont_tokens": "873d2aab226ba1d8" + }, + "truncated": 0, + "non-truncated": 772, + "padded": 772, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hashes": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "4cef2aff6e3d59ed", + "hash_cont_tokens": "c583432ad27fcfe0" + }, + "truncated": 0, + "non-truncated": 1560, + "padded": 1560, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hashes": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "6e2577ea4082ed2b", + "hash_cont_tokens": "d7907b61bcb8c123" + }, + "truncated": 0, + "non-truncated": 1080, + "padded": 1080, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hashes": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "c5fc9aeb1079c8e4", + "hash_cont_tokens": "f47f041de50333b9" + }, + "truncated": 0, + "non-truncated": 952, + "padded": 952, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_physics|5": { + "hashes": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "555fc385cffa84ca", + "hash_cont_tokens": "0d56317b3e5eedb5" + }, + "truncated": 0, + "non-truncated": 604, + "padded": 604, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hashes": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "febd23cbf9973b7f", + "hash_cont_tokens": "09ba1243e7390c0f" + }, + "truncated": 0, + "non-truncated": 2180, + "padded": 2180, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hashes": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "400e55b56ee6fbd7", + "hash_cont_tokens": "9cc29889c3d3f77d" + }, + "truncated": 0, + "non-truncated": 864, + "padded": 864, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hashes": { + 
"hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "c639cce12a46ebad", + "hash_cont_tokens": "cdd0b3dc06d933e5" + }, + "truncated": 0, + "non-truncated": 816, + "padded": 816, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hashes": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "b9762065cce6f3a6", + "hash_cont_tokens": "e02816433ff28daf" + }, + "truncated": 0, + "non-truncated": 948, + "padded": 948, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_aging|5": { + "hashes": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "541a75f071dcf579", + "hash_cont_tokens": "142a4a8a1138a214" + }, + "truncated": 0, + "non-truncated": 892, + "padded": 892, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_sexuality|5": { + "hashes": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "04269e5c5a257dd9", + "hash_cont_tokens": "bc54813e809b796d" + }, + "truncated": 0, + "non-truncated": 524, + "padded": 524, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-international_law|5": { + "hashes": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "d93ba9d9d38e4397", + "hash_cont_tokens": "8ea8c5ff76a15bca" + }, + "truncated": 0, + "non-truncated": 484, + "padded": 484, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-jurisprudence|5": { + "hashes": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "9eeaccd2698b4f5a", + "hash_cont_tokens": "e3a8cd951b6e3469" + }, + "truncated": 0, + "non-truncated": 432, + "padded": 432, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hashes": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "b4f08f544f2b7576", + "hash_cont_tokens": "3e9e0bdc248fd88a" + }, + "truncated": 0, + "non-truncated": 652, + "padded": 648, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-machine_learning|5": { + "hashes": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "900c2a51f1174b9f", + "hash_cont_tokens": "55b12fb138c6a064" + }, + "truncated": 0, + "non-truncated": 448, + "padded": 448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-management|5": { + "hashes": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "6b36efb4689c6eca", + "hash_cont_tokens": "a01d6d39a83c4597" + }, + "truncated": 0, + "non-truncated": 412, + "padded": 412, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-marketing|5": { + "hashes": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "2aaac78a0cfed47a", + "hash_cont_tokens": "6aeaed4d823c98aa" + }, + 
"truncated": 0, + "non-truncated": 936, + "padded": 936, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-medical_genetics|5": { + "hashes": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "886ca823b41c094a", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-miscellaneous|5": { + "hashes": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "72fd71de7675e7d0", + "hash_cont_tokens": "9b0ab02a64603081" + }, + "truncated": 0, + "non-truncated": 3132, + "padded": 3132, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_disputes|5": { + "hashes": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "f3ca0dd8e7a1eb09", + "hash_cont_tokens": "3b8bbe9108e55ce9" + }, + "truncated": 0, + "non-truncated": 1384, + "padded": 1354, + "non-padded": 30, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hashes": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "3e793631e951f23c", + "hash_cont_tokens": "3e9bfc0362e97330" + }, + "truncated": 0, + "non-truncated": 3580, + "padded": 3580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-nutrition|5": { + "hashes": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "59753c2144ea93af", + "hash_cont_tokens": "23b2dc6ee2da4cfc" + }, + "truncated": 0, + "non-truncated": 1224, + "padded": 1224, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-philosophy|5": { + "hashes": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "bd8d3dbed15a8c34", + "hash_cont_tokens": "9f6ff69d23a48783" + }, + "truncated": 0, + "non-truncated": 1244, + "padded": 1244, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-prehistory|5": { + "hashes": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "3573cd87facbb7c5", + "hash_cont_tokens": "d6458d743d875837" + }, + "truncated": 0, + "non-truncated": 1296, + "padded": 1296, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_accounting|5": { + "hashes": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "17e721bc1a7cbb47", + "hash_cont_tokens": "922a195f53a35662" + }, + "truncated": 0, + "non-truncated": 1128, + "padded": 1128, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_law|5": { + "hashes": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "c9f7583fff66d361", + "hash_cont_tokens": "2e590029ef41fbcd" + }, + "truncated": 0, + "non-truncated": 6136, + "padded": 6136, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + 
"harness|hendrycksTest-professional_medicine|5": { + "hashes": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "40a933f829116f8d", + "hash_cont_tokens": "7cfee54dbddd5a98" + }, + "truncated": 0, + "non-truncated": 1088, + "padded": 1088, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_psychology|5": { + "hashes": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "0dfb73a8eb3f692c", + "hash_cont_tokens": "a86677b2a45c20e1" + }, + "truncated": 0, + "non-truncated": 2448, + "padded": 2448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-public_relations|5": { + "hashes": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "1710c6ba4c9f3cbd", + "hash_cont_tokens": "0d756ccaae031757" + }, + "truncated": 0, + "non-truncated": 440, + "padded": 440, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-security_studies|5": { + "hashes": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "32a03f1f22a6e103", + "hash_cont_tokens": "b2229bc2cfbf594b" + }, + "truncated": 0, + "non-truncated": 980, + "padded": 980, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-sociology|5": { + "hashes": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "828999f7624cbe7e", + "hash_cont_tokens": "c3a3bdfd177eed5b" + }, + "truncated": 0, + "non-truncated": 804, + "padded": 804, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hashes": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "42054621e718dbee", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-virology|5": { + "hashes": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "6c4f0aa4dc859c04", + "hash_cont_tokens": "af8b3658088cb37f" + }, + "truncated": 0, + "non-truncated": 664, + "padded": 664, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-world_religions|5": { + "hashes": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "6c75d44e092ff24f", + "hash_cont_tokens": "060118bef6de4e0a" + }, + "truncated": 0, + "non-truncated": 684, + "padded": 684, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|truthfulqa:mc|0": { + "hashes": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "2738d7ed7075faa7", + "hash_cont_tokens": "f5da56a132aab151" + }, + "truncated": 0, + "non-truncated": 9996, + "padded": 9996, + "non-padded": 0, + "effective_few_shots": 0.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "d84d18e9a963753d", + "hash_full_prompts": "12b540783521a8e6", + "hash_input_tokens": "5c73a7dce6ccf737", + 
"hash_cont_tokens": "71d56183130fecbd" + }, + "total_evaluation_time_secondes": "10031.033591985703", + "truncated": 0, + "non-truncated": 111019, + "padded": 110926, + "non-padded": 93, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/Undi95/U-Amethyst-20B/results_2023-11-07T19-04-15.043213.json b/eval-results/Undi95/U-Amethyst-20B/results_2023-11-07T19-04-15.043213.json new file mode 100644 index 0000000000000000000000000000000000000000..197e96411e1fafdd754dd9d4a8cbc405edbab971 --- /dev/null +++ b/eval-results/Undi95/U-Amethyst-20B/results_2023-11-07T19-04-15.043213.json @@ -0,0 +1,107 @@ +{ + "config_general": { + "lighteval_sha": "167773f1d5d1647c60dadc31c9e731ab7dbcbbad", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "", + "model_name": "Undi95/U-Amethyst-20B", + "model_sha": "c0cbe0b3c88041bb6beef27dbe85146af8dddec9", + "model_dtype": "torch.float16", + "model_size": "37.36 GB" + }, + "results": { + "harness|drop|3": { + "em": 0.10622902684563758, + "em_stderr": 0.003155544985138621, + "f1": 0.18749056208053588, + "f1_stderr": 0.003364638518499387 + }, + "harness|gsm8k|5": { + "acc": 0.05307050796057619, + "acc_stderr": 0.0061748688586383774 + }, + "harness|winogrande|5": { + "acc": 0.7419100236779794, + "acc_stderr": 0.012298278833972387 + }, + "all": { + "em": 0.10622902684563758, + "em_stderr": 0.003155544985138621, + "f1": 0.18749056208053588, + "f1_stderr": 0.003364638518499387, + "acc": 0.3974902658192778, + "acc_stderr": 0.009236573846305381 + } + }, + "versions": { + "all": 0, + "harness|drop|3": 1, + "harness|gsm8k|5": 0, + "harness|winogrande|5": 0 + }, + "config_tasks": { + "harness|drop": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|drop|3": { + "hashes": { + "hash_examples": "1d27416e8324e9a3", + "hash_full_prompts": "a5513ff9a741b385", + "hash_input_tokens": "42076f0efbb50aa6", + "hash_cont_tokens": "3b9797d01407a1b5" + }, + "truncated": 3, + "non_truncated": 9533, + "padded": 0, + "non_padded": 9536, + "effective_few_shots": 3.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "bda342e47b5099b2", + "hash_cont_tokens": "7501dde384bd88aa" + }, + "truncated": 0, + "non_truncated": 1319, + "padded": 0, + "non_padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "c0bedf98cb040854", + "hash_cont_tokens": "f08975ad6f2d5864" + }, + "truncated": 0, + "non_truncated": 1267, + "padded": 2432, + "non_padded": 102, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "9b4d8993161e637d", + "hash_full_prompts": "08215e527b7e60a5", + "hash_input_tokens": "a12f3e3c934bd78b", + "hash_cont_tokens": "9918cf1d5b6a13c0" + }, + "truncated": 3, + "non_truncated": 12119, + "padded": 2432, + "non_padded": 10957, + "num_truncated_few_shots": 0, + "total_evaluation_time_secondes": 0 + } +} \ No newline at end of file diff --git a/eval-results/Undi95/UndiMix-v1-13b/results_2023-09-01T00-10-45.842963.json b/eval-results/Undi95/UndiMix-v1-13b/results_2023-09-01T00-10-45.842963.json new file mode 100644 index 
0000000000000000000000000000000000000000..fd8b9ff7497b45cc99774ca3cbe457476e574e43 --- /dev/null +++ b/eval-results/Undi95/UndiMix-v1-13b/results_2023-09-01T00-10-45.842963.json @@ -0,0 +1,1366 @@ +{ + "config_general": { + "model_name": "Undi95/UndiMix-v1-13b", + "model_sha": "fd311f52648825d6988d2f945918468ceb32289f", + "model_dtype": "torch.float16", + "lighteval_sha": "4b9a38c2102259daeb17d1d0294fc2b4ce7f4e63", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|arc:challenge|25": { + "acc": 0.5554607508532423, + "acc_stderr": 0.01452122640562708, + "acc_norm": 0.5947098976109215, + "acc_norm_stderr": 0.014346869060229321 + }, + "harness|hellaswag|10": { + "acc": 0.6336387173869747, + "acc_stderr": 0.004808251269682437, + "acc_norm": 0.8245369448317068, + "acc_norm_stderr": 0.0037958533012440137 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.34, + "acc_stderr": 0.047609522856952365, + "acc_norm": 0.34, + "acc_norm_stderr": 0.047609522856952365 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.45925925925925926, + "acc_stderr": 0.04304979692464243, + "acc_norm": 0.45925925925925926, + "acc_norm_stderr": 0.04304979692464243 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.5197368421052632, + "acc_stderr": 0.04065771002562603, + "acc_norm": 0.5197368421052632, + "acc_norm_stderr": 0.04065771002562603 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.54, + "acc_stderr": 0.05009082659620332, + "acc_norm": 0.54, + "acc_norm_stderr": 0.05009082659620332 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.5849056603773585, + "acc_stderr": 0.03032594578928611, + "acc_norm": 0.5849056603773585, + "acc_norm_stderr": 0.03032594578928611 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.5972222222222222, + "acc_stderr": 0.04101405519842426, + "acc_norm": 0.5972222222222222, + "acc_norm_stderr": 0.04101405519842426 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.36, + "acc_stderr": 0.04824181513244218, + "acc_norm": 0.36, + "acc_norm_stderr": 0.04824181513244218 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.49, + "acc_stderr": 0.05024183937956912, + "acc_norm": 0.49, + "acc_norm_stderr": 0.05024183937956912 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.35, + "acc_stderr": 0.047937248544110196, + "acc_norm": 0.35, + "acc_norm_stderr": 0.047937248544110196 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.4797687861271676, + "acc_stderr": 0.03809342081273957, + "acc_norm": 0.4797687861271676, + "acc_norm_stderr": 0.03809342081273957 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.3235294117647059, + "acc_stderr": 0.046550104113196177, + "acc_norm": 0.3235294117647059, + "acc_norm_stderr": 0.046550104113196177 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.71, + "acc_stderr": 0.04560480215720685, + "acc_norm": 0.71, + "acc_norm_stderr": 0.04560480215720685 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.4595744680851064, + "acc_stderr": 0.032579014820998356, + "acc_norm": 0.4595744680851064, + "acc_norm_stderr": 0.032579014820998356 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.3157894736842105, + "acc_stderr": 0.04372748290278007, + "acc_norm": 0.3157894736842105, + "acc_norm_stderr": 0.04372748290278007 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.5172413793103449, + 
"acc_stderr": 0.04164188720169375, + "acc_norm": 0.5172413793103449, + "acc_norm_stderr": 0.04164188720169375 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.3201058201058201, + "acc_stderr": 0.0240268463928735, + "acc_norm": 0.3201058201058201, + "acc_norm_stderr": 0.0240268463928735 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.35714285714285715, + "acc_stderr": 0.04285714285714281, + "acc_norm": 0.35714285714285715, + "acc_norm_stderr": 0.04285714285714281 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.34, + "acc_stderr": 0.04760952285695235, + "acc_norm": 0.34, + "acc_norm_stderr": 0.04760952285695235 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.6838709677419355, + "acc_stderr": 0.02645087448904277, + "acc_norm": 0.6838709677419355, + "acc_norm_stderr": 0.02645087448904277 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.43349753694581283, + "acc_stderr": 0.03486731727419872, + "acc_norm": 0.43349753694581283, + "acc_norm_stderr": 0.03486731727419872 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.57, + "acc_stderr": 0.04975698519562428, + "acc_norm": 0.57, + "acc_norm_stderr": 0.04975698519562428 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.6909090909090909, + "acc_stderr": 0.036085410115739666, + "acc_norm": 0.6909090909090909, + "acc_norm_stderr": 0.036085410115739666 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.702020202020202, + "acc_stderr": 0.03258630383836556, + "acc_norm": 0.702020202020202, + "acc_norm_stderr": 0.03258630383836556 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.7979274611398963, + "acc_stderr": 0.028979089794296732, + "acc_norm": 0.7979274611398963, + "acc_norm_stderr": 0.028979089794296732 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.5205128205128206, + "acc_stderr": 0.02532966316348994, + "acc_norm": 0.5205128205128206, + "acc_norm_stderr": 0.02532966316348994 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.2777777777777778, + "acc_stderr": 0.027309140588230196, + "acc_norm": 0.2777777777777778, + "acc_norm_stderr": 0.027309140588230196 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.5756302521008403, + "acc_stderr": 0.032104790510157764, + "acc_norm": 0.5756302521008403, + "acc_norm_stderr": 0.032104790510157764 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.33112582781456956, + "acc_stderr": 0.038425817186598696, + "acc_norm": 0.33112582781456956, + "acc_norm_stderr": 0.038425817186598696 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.7376146788990826, + "acc_stderr": 0.01886188502153473, + "acc_norm": 0.7376146788990826, + "acc_norm_stderr": 0.01886188502153473 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.39351851851851855, + "acc_stderr": 0.03331747876370312, + "acc_norm": 0.39351851851851855, + "acc_norm_stderr": 0.03331747876370312 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.7352941176470589, + "acc_stderr": 0.030964517926923403, + "acc_norm": 0.7352941176470589, + "acc_norm_stderr": 0.030964517926923403 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.7215189873417721, + "acc_stderr": 0.029178682304842534, + "acc_norm": 0.7215189873417721, + "acc_norm_stderr": 0.029178682304842534 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 
0.6816143497757847, + "acc_stderr": 0.03126580522513713, + "acc_norm": 0.6816143497757847, + "acc_norm_stderr": 0.03126580522513713 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.6564885496183206, + "acc_stderr": 0.041649760719448786, + "acc_norm": 0.6564885496183206, + "acc_norm_stderr": 0.041649760719448786 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.7520661157024794, + "acc_stderr": 0.039418975265163025, + "acc_norm": 0.7520661157024794, + "acc_norm_stderr": 0.039418975265163025 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.7314814814814815, + "acc_stderr": 0.042844679680521934, + "acc_norm": 0.7314814814814815, + "acc_norm_stderr": 0.042844679680521934 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.6748466257668712, + "acc_stderr": 0.036803503712864616, + "acc_norm": 0.6748466257668712, + "acc_norm_stderr": 0.036803503712864616 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.36607142857142855, + "acc_stderr": 0.045723723587374296, + "acc_norm": 0.36607142857142855, + "acc_norm_stderr": 0.045723723587374296 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.6796116504854369, + "acc_stderr": 0.04620284082280041, + "acc_norm": 0.6796116504854369, + "acc_norm_stderr": 0.04620284082280041 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.8076923076923077, + "acc_stderr": 0.025819233256483717, + "acc_norm": 0.8076923076923077, + "acc_norm_stderr": 0.025819233256483717 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.59, + "acc_stderr": 0.049431107042371025, + "acc_norm": 0.59, + "acc_norm_stderr": 0.049431107042371025 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.7675606641123882, + "acc_stderr": 0.01510455000890572, + "acc_norm": 0.7675606641123882, + "acc_norm_stderr": 0.01510455000890572 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.6184971098265896, + "acc_stderr": 0.0261521986197268, + "acc_norm": 0.6184971098265896, + "acc_norm_stderr": 0.0261521986197268 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.3787709497206704, + "acc_stderr": 0.016223533510365113, + "acc_norm": 0.3787709497206704, + "acc_norm_stderr": 0.016223533510365113 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.6143790849673203, + "acc_stderr": 0.02787074527829027, + "acc_norm": 0.6143790849673203, + "acc_norm_stderr": 0.02787074527829027 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.6366559485530546, + "acc_stderr": 0.027316847674192714, + "acc_norm": 0.6366559485530546, + "acc_norm_stderr": 0.027316847674192714 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.6265432098765432, + "acc_stderr": 0.026915003011380154, + "acc_norm": 0.6265432098765432, + "acc_norm_stderr": 0.026915003011380154 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.40425531914893614, + "acc_stderr": 0.029275532159704732, + "acc_norm": 0.40425531914893614, + "acc_norm_stderr": 0.029275532159704732 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.4165580182529335, + "acc_stderr": 0.012591153245057388, + "acc_norm": 0.4165580182529335, + "acc_norm_stderr": 0.012591153245057388 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.5183823529411765, + "acc_stderr": 0.030352303395351964, + "acc_norm": 0.5183823529411765, + "acc_norm_stderr": 0.030352303395351964 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.5588235294117647, + "acc_stderr": 0.02008736207670286, + "acc_norm": 
0.5588235294117647, + "acc_norm_stderr": 0.02008736207670286 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.6636363636363637, + "acc_stderr": 0.04525393596302505, + "acc_norm": 0.6636363636363637, + "acc_norm_stderr": 0.04525393596302505 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.673469387755102, + "acc_stderr": 0.03002105623844031, + "acc_norm": 0.673469387755102, + "acc_norm_stderr": 0.03002105623844031 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.7512437810945274, + "acc_stderr": 0.030567675938916714, + "acc_norm": 0.7512437810945274, + "acc_norm_stderr": 0.030567675938916714 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.81, + "acc_stderr": 0.03942772444036624, + "acc_norm": 0.81, + "acc_norm_stderr": 0.03942772444036624 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.46987951807228917, + "acc_stderr": 0.03885425420866766, + "acc_norm": 0.46987951807228917, + "acc_norm_stderr": 0.03885425420866766 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.7368421052631579, + "acc_stderr": 0.03377310252209204, + "acc_norm": 0.7368421052631579, + "acc_norm_stderr": 0.03377310252209204 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.3268053855569155, + "mc1_stderr": 0.01641987473113503, + "mc2": 0.49776482696964136, + "mc2_stderr": 0.01587298728062371 + }, + "all": { + "acc": 0.5594909108393362, + "acc_stderr": 0.03448055976295334, + "acc_norm": 0.5633917137919195, + "acc_norm_stderr": 0.03446044526610866, + "mc1": 0.3268053855569155, + "mc1_stderr": 0.01641987473113503, + "mc2": 0.49776482696964136, + "mc2_stderr": 0.01587298728062371 + } + }, + "versions": { + "harness|arc:challenge|25": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + 
"harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "all": 0 + }, + "config_tasks": { + "harness|arc:challenge": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM 
Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task" + }, + "summary_tasks": { + "harness|arc:challenge|25": { + "hashes": { + "hash_examples": "17b0cae357c0259e", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "3722289b79076c44", + "hash_cont_tokens": "8210decc6ff6f7df" + }, + "truncated": 0, + "non-truncated": 4687, + "padded": 4687, + "non-padded": 0, + "effective_few_shots": 25.0, + "num_truncated_few_shots": 0 + }, + "harness|hellaswag|10": { + "hashes": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "ececd684171f1ef2", + "hash_cont_tokens": "b3b9e9017afa63af" + }, + "truncated": 0, + "non-truncated": 40168, + "padded": 40113, + "non-padded": 55, + "effective_few_shots": 10.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hashes": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "c54ff61ad0273dd7", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-anatomy|5": { + "hashes": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "be31a1e22aef5f90", + "hash_cont_tokens": "f11971a765cb609f" + }, + "truncated": 0, + "non-truncated": 540, + "padded": 540, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-astronomy|5": { + "hashes": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "277a7b1fad566940", + "hash_cont_tokens": "bf30e5d3f48250cb" + }, + "truncated": 0, + "non-truncated": 608, + "padded": 608, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-business_ethics|5": { + "hashes": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "ba552605bc116de5", + "hash_cont_tokens": "bc1dd9b2d995eb61" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 
5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hashes": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "428c7563d0b98ab9", + "hash_cont_tokens": "890a119624b3b935" + }, + "truncated": 0, + "non-truncated": 1060, + "padded": 1060, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_biology|5": { + "hashes": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "da036601573942e2", + "hash_cont_tokens": "875cde3af7a0ee14" + }, + "truncated": 0, + "non-truncated": 576, + "padded": 576, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_chemistry|5": { + "hashes": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "94e0196d6aded13d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_computer_science|5": { + "hashes": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "6e4d0f4a8d36690b", + "hash_cont_tokens": "ffc0fe414cdc4a83" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_mathematics|5": { + "hashes": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "614054d17109a25d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_medicine|5": { + "hashes": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "081bb2b524defd1c", + "hash_cont_tokens": "1f88b00d41957d82" + }, + "truncated": 0, + "non-truncated": 692, + "padded": 692, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_physics|5": { + "hashes": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "5421d9a1af86cbd4", + "hash_cont_tokens": "f7b8097afc16a47c" + }, + "truncated": 0, + "non-truncated": 408, + "padded": 408, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-computer_security|5": { + "hashes": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "5e6b70ecb333cf18", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hashes": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "c2ef11a87264ceed", + "hash_cont_tokens": "aa0e8bc655f2f641" + }, + "truncated": 0, + "non-truncated": 940, + "padded": 940, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-econometrics|5": { + "hashes": { + "hash_examples": "64d3623b0bfaa43f", + 
"hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "ecaccd912a4c3978", + "hash_cont_tokens": "bfb7e3c3c88313f1" + }, + "truncated": 0, + "non-truncated": 456, + "padded": 456, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hashes": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "1590c84291399be8", + "hash_cont_tokens": "2425a3f084a591ef" + }, + "truncated": 0, + "non-truncated": 580, + "padded": 580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hashes": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "3269597f715b0da1", + "hash_cont_tokens": "f52691aef15a407b" + }, + "truncated": 0, + "non-truncated": 1512, + "padded": 1512, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-formal_logic|5": { + "hashes": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "a2800d20f3ab8d7c", + "hash_cont_tokens": "f515d598d9c21263" + }, + "truncated": 0, + "non-truncated": 504, + "padded": 504, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-global_facts|5": { + "hashes": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "94ed44b3772505ad", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_biology|5": { + "hashes": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "24423acb928db768", + "hash_cont_tokens": "bd85a4156a3613ee" + }, + "truncated": 0, + "non-truncated": 1240, + "padded": 1240, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hashes": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "831ff35c474e5cef", + "hash_cont_tokens": "a95c97af1c14e068" + }, + "truncated": 0, + "non-truncated": 812, + "padded": 812, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hashes": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "a20a96b44dcc5b30", + "hash_cont_tokens": "8abfedef914e33c9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hashes": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "5002f4ac8b1562ca", + "hash_cont_tokens": "674fc454bdc5ac93" + }, + "truncated": 0, + "non-truncated": 660, + "padded": 656, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_geography|5": { + "hashes": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "7c5547c7da5bc793", + "hash_cont_tokens": 
"03a5012b916274ea" + }, + "truncated": 0, + "non-truncated": 792, + "padded": 792, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hashes": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "f62991cb6a496b05", + "hash_cont_tokens": "a83effb8f76b7d7c" + }, + "truncated": 0, + "non-truncated": 772, + "padded": 772, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hashes": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "4cef2aff6e3d59ed", + "hash_cont_tokens": "c583432ad27fcfe0" + }, + "truncated": 0, + "non-truncated": 1560, + "padded": 1560, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hashes": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "6e2577ea4082ed2b", + "hash_cont_tokens": "24f5dc613660300b" + }, + "truncated": 0, + "non-truncated": 1080, + "padded": 1080, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hashes": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "c5fc9aeb1079c8e4", + "hash_cont_tokens": "f47f041de50333b9" + }, + "truncated": 0, + "non-truncated": 952, + "padded": 952, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_physics|5": { + "hashes": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "555fc385cffa84ca", + "hash_cont_tokens": "ba2efcd283e938cc" + }, + "truncated": 0, + "non-truncated": 604, + "padded": 604, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hashes": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "febd23cbf9973b7f", + "hash_cont_tokens": "942069cd363844d9" + }, + "truncated": 0, + "non-truncated": 2180, + "padded": 2180, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hashes": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "400e55b56ee6fbd7", + "hash_cont_tokens": "955ed42b6f7fa019" + }, + "truncated": 0, + "non-truncated": 864, + "padded": 864, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hashes": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "c639cce12a46ebad", + "hash_cont_tokens": "cdd0b3dc06d933e5" + }, + "truncated": 0, + "non-truncated": 816, + "padded": 816, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hashes": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "b9762065cce6f3a6", + "hash_cont_tokens": "9a864184946033ac" + }, + "truncated": 0, + "non-truncated": 948, + "padded": 
948, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_aging|5": { + "hashes": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "541a75f071dcf579", + "hash_cont_tokens": "142a4a8a1138a214" + }, + "truncated": 0, + "non-truncated": 892, + "padded": 892, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_sexuality|5": { + "hashes": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "04269e5c5a257dd9", + "hash_cont_tokens": "bc54813e809b796d" + }, + "truncated": 0, + "non-truncated": 524, + "padded": 524, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-international_law|5": { + "hashes": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "d93ba9d9d38e4397", + "hash_cont_tokens": "dc45b45fcda18e5d" + }, + "truncated": 0, + "non-truncated": 484, + "padded": 484, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-jurisprudence|5": { + "hashes": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "9eeaccd2698b4f5a", + "hash_cont_tokens": "e3a8cd951b6e3469" + }, + "truncated": 0, + "non-truncated": 432, + "padded": 432, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hashes": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "b4f08f544f2b7576", + "hash_cont_tokens": "1e80dbd30f6453d5" + }, + "truncated": 0, + "non-truncated": 652, + "padded": 648, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-machine_learning|5": { + "hashes": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "900c2a51f1174b9f", + "hash_cont_tokens": "9b37da7777378ca9" + }, + "truncated": 0, + "non-truncated": 448, + "padded": 448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-management|5": { + "hashes": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "6b36efb4689c6eca", + "hash_cont_tokens": "a01d6d39a83c4597" + }, + "truncated": 0, + "non-truncated": 412, + "padded": 412, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-marketing|5": { + "hashes": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "2aaac78a0cfed47a", + "hash_cont_tokens": "6aeaed4d823c98aa" + }, + "truncated": 0, + "non-truncated": 936, + "padded": 936, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-medical_genetics|5": { + "hashes": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "886ca823b41c094a", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-miscellaneous|5": { + "hashes": { + "hash_examples": 
"41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "72fd71de7675e7d0", + "hash_cont_tokens": "9b0ab02a64603081" + }, + "truncated": 0, + "non-truncated": 3132, + "padded": 3132, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_disputes|5": { + "hashes": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "f3ca0dd8e7a1eb09", + "hash_cont_tokens": "8badf768f7b0467a" + }, + "truncated": 0, + "non-truncated": 1384, + "padded": 1354, + "non-padded": 30, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hashes": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "3e793631e951f23c", + "hash_cont_tokens": "32ae620376b2bbba" + }, + "truncated": 0, + "non-truncated": 3580, + "padded": 3580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-nutrition|5": { + "hashes": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "59753c2144ea93af", + "hash_cont_tokens": "3071def75bacc404" + }, + "truncated": 0, + "non-truncated": 1224, + "padded": 1224, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-philosophy|5": { + "hashes": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "bd8d3dbed15a8c34", + "hash_cont_tokens": "9f6ff69d23a48783" + }, + "truncated": 0, + "non-truncated": 1244, + "padded": 1244, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-prehistory|5": { + "hashes": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "3573cd87facbb7c5", + "hash_cont_tokens": "de469d2b981e32a3" + }, + "truncated": 0, + "non-truncated": 1296, + "padded": 1296, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_accounting|5": { + "hashes": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "17e721bc1a7cbb47", + "hash_cont_tokens": "c46f74d2dfc7b13b" + }, + "truncated": 0, + "non-truncated": 1128, + "padded": 1128, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_law|5": { + "hashes": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "c9f7583fff66d361", + "hash_cont_tokens": "2e590029ef41fbcd" + }, + "truncated": 0, + "non-truncated": 6136, + "padded": 6136, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_medicine|5": { + "hashes": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "40a933f829116f8d", + "hash_cont_tokens": "fe35cfa9c6ca802e" + }, + "truncated": 0, + "non-truncated": 1088, + "padded": 1088, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_psychology|5": { + "hashes": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "0dfb73a8eb3f692c", + "hash_cont_tokens": 
"f020fbddf72c8652" + }, + "truncated": 0, + "non-truncated": 2448, + "padded": 2448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-public_relations|5": { + "hashes": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "1710c6ba4c9f3cbd", + "hash_cont_tokens": "568f585a259965c1" + }, + "truncated": 0, + "non-truncated": 440, + "padded": 440, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-security_studies|5": { + "hashes": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "32a03f1f22a6e103", + "hash_cont_tokens": "cc6fd7cccd64cd5d" + }, + "truncated": 0, + "non-truncated": 980, + "padded": 980, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-sociology|5": { + "hashes": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "828999f7624cbe7e", + "hash_cont_tokens": "c3a3bdfd177eed5b" + }, + "truncated": 0, + "non-truncated": 804, + "padded": 804, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hashes": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "42054621e718dbee", + "hash_cont_tokens": "2568d0e8e36fa959" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-virology|5": { + "hashes": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "6c4f0aa4dc859c04", + "hash_cont_tokens": "926cf60b0891f374" + }, + "truncated": 0, + "non-truncated": 664, + "padded": 664, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-world_religions|5": { + "hashes": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "6c75d44e092ff24f", + "hash_cont_tokens": "c525a5de974c1ea3" + }, + "truncated": 0, + "non-truncated": 684, + "padded": 684, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|truthfulqa:mc|0": { + "hashes": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "2738d7ed7075faa7", + "hash_cont_tokens": "c014154380b74b9e" + }, + "truncated": 0, + "non-truncated": 9996, + "padded": 9996, + "non-padded": 0, + "effective_few_shots": 0.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "d84d18e9a963753d", + "hash_full_prompts": "12b540783521a8e6", + "hash_input_tokens": "5c73a7dce6ccf737", + "hash_cont_tokens": "fb1646e2bdd5fc38" + }, + "total_evaluation_time_secondes": "6377.304112434387", + "truncated": 0, + "non-truncated": 111019, + "padded": 110926, + "non-padded": 93, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/Undi95/UndiMix-v1-13b/results_2023-10-16T16-31-03.720074.json b/eval-results/Undi95/UndiMix-v1-13b/results_2023-10-16T16-31-03.720074.json new file mode 100644 index 0000000000000000000000000000000000000000..869bf69fde45724afa73f922fc881468f5964699 --- /dev/null +++ 
b/eval-results/Undi95/UndiMix-v1-13b/results_2023-10-16T16-31-03.720074.json @@ -0,0 +1,107 @@ +{ + "config_general": { + "model_name": "Undi95/UndiMix-v1-13b", + "model_sha": "0822a26bc00373dc5f9d6a19b479860f6aaeeac8", + "model_size": "24.32 GB", + "model_dtype": "torch.float16", + "lighteval_sha": "0f318ecf002208468154899217b3ba7c6ae09374", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|drop|3": { + "em": 0.2600671140939597, + "em_stderr": 0.004492401208347132, + "f1": 0.34945260067114264, + "f1_stderr": 0.004422869896423944 + }, + "harness|gsm8k|5": { + "acc": 0.10007581501137225, + "acc_stderr": 0.008266274528685646 + }, + "harness|winogrande|5": { + "acc": 0.7545382794001578, + "acc_stderr": 0.012095272937183644 + }, + "all": { + "em": 0.2600671140939597, + "em_stderr": 0.004492401208347132, + "f1": 0.34945260067114264, + "f1_stderr": 0.004422869896423944, + "acc": 0.42730704720576507, + "acc_stderr": 0.010180773732934644 + } + }, + "versions": { + "harness|drop|3": 1, + "harness|gsm8k|5": 0, + "harness|winogrande|5": 0, + "all": 0 + }, + "config_tasks": { + "harness|drop": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|drop|3": { + "hashes": { + "hash_examples": "1d27416e8324e9a3", + "hash_full_prompts": "a5513ff9a741b385", + "hash_input_tokens": "42076f0efbb50aa6", + "hash_cont_tokens": "beb56688ed058bed" + }, + "truncated": 3, + "non-truncated": 9533, + "padded": 0, + "non-padded": 9536, + "effective_few_shots": 3.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "bda342e47b5099b2", + "hash_cont_tokens": "1094bab54630dbfd" + }, + "truncated": 0, + "non-truncated": 1319, + "padded": 0, + "non-padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "c0bedf98cb040854", + "hash_cont_tokens": "f08975ad6f2d5864" + }, + "truncated": 0, + "non-truncated": 2534, + "padded": 2432, + "non-padded": 102, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "9b4d8993161e637d", + "hash_full_prompts": "08215e527b7e60a5", + "hash_input_tokens": "a12f3e3c934bd78b", + "hash_cont_tokens": "2a75f50b8de396df" + }, + "total_evaluation_time_secondes": "10301.73979473114", + "truncated": 3, + "non-truncated": 13386, + "padded": 2432, + "non-padded": 10957, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/Undi95/UndiMix-v4-13B/results_2023-09-18T13-45-54.862257.json b/eval-results/Undi95/UndiMix-v4-13B/results_2023-09-18T13-45-54.862257.json new file mode 100644 index 0000000000000000000000000000000000000000..5f1f57a730c06499968047e239d11adb7e654d38 --- /dev/null +++ b/eval-results/Undi95/UndiMix-v4-13B/results_2023-09-18T13-45-54.862257.json @@ -0,0 +1,1367 @@ +{ + "config_general": { + "model_name": "Undi95/UndiMix-v4-13B", + "model_sha": "6dd97c74cfe1d22432d5c993814e230f333ba401", + "model_size": "24.32 GB", + "model_dtype": "torch.float16", + "lighteval_sha": "0f318ecf002208468154899217b3ba7c6ae09374", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": 
null, + "job_id": "" + }, + "results": { + "harness|arc:challenge|25": { + "acc": 0.575938566552901, + "acc_stderr": 0.014441889627464396, + "acc_norm": 0.6194539249146758, + "acc_norm_stderr": 0.014188277712349814 + }, + "harness|hellaswag|10": { + "acc": 0.6444931288587931, + "acc_stderr": 0.0047768836327226165, + "acc_norm": 0.8387771360286795, + "acc_norm_stderr": 0.0036698484004877773 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.36, + "acc_stderr": 0.04824181513244218, + "acc_norm": 0.36, + "acc_norm_stderr": 0.04824181513244218 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.48148148148148145, + "acc_stderr": 0.043163785995113245, + "acc_norm": 0.48148148148148145, + "acc_norm_stderr": 0.043163785995113245 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.5592105263157895, + "acc_stderr": 0.04040311062490436, + "acc_norm": 0.5592105263157895, + "acc_norm_stderr": 0.04040311062490436 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.56, + "acc_stderr": 0.04988876515698589, + "acc_norm": 0.56, + "acc_norm_stderr": 0.04988876515698589 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.6377358490566037, + "acc_stderr": 0.0295822451283843, + "acc_norm": 0.6377358490566037, + "acc_norm_stderr": 0.0295822451283843 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.5972222222222222, + "acc_stderr": 0.04101405519842426, + "acc_norm": 0.5972222222222222, + "acc_norm_stderr": 0.04101405519842426 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.35, + "acc_stderr": 0.047937248544110196, + "acc_norm": 0.35, + "acc_norm_stderr": 0.047937248544110196 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.49, + "acc_stderr": 0.05024183937956912, + "acc_norm": 0.49, + "acc_norm_stderr": 0.05024183937956912 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.33, + "acc_stderr": 0.047258156262526045, + "acc_norm": 0.33, + "acc_norm_stderr": 0.047258156262526045 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.5491329479768786, + "acc_stderr": 0.0379401267469703, + "acc_norm": 0.5491329479768786, + "acc_norm_stderr": 0.0379401267469703 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.24509803921568626, + "acc_stderr": 0.04280105837364396, + "acc_norm": 0.24509803921568626, + "acc_norm_stderr": 0.04280105837364396 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.73, + "acc_stderr": 0.04461960433384739, + "acc_norm": 0.73, + "acc_norm_stderr": 0.04461960433384739 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.44680851063829785, + "acc_stderr": 0.0325005368436584, + "acc_norm": 0.44680851063829785, + "acc_norm_stderr": 0.0325005368436584 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.30701754385964913, + "acc_stderr": 0.0433913832257986, + "acc_norm": 0.30701754385964913, + "acc_norm_stderr": 0.0433913832257986 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.4827586206896552, + "acc_stderr": 0.04164188720169377, + "acc_norm": 0.4827586206896552, + "acc_norm_stderr": 0.04164188720169377 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.3386243386243386, + "acc_stderr": 0.02437319786798306, + "acc_norm": 0.3386243386243386, + "acc_norm_stderr": 0.02437319786798306 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.3412698412698413, + "acc_stderr": 0.04240799327574924, + "acc_norm": 0.3412698412698413, + "acc_norm_stderr": 0.04240799327574924 + }, + 
"harness|hendrycksTest-global_facts|5": { + "acc": 0.34, + "acc_stderr": 0.04760952285695235, + "acc_norm": 0.34, + "acc_norm_stderr": 0.04760952285695235 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.6645161290322581, + "acc_stderr": 0.026860206444724345, + "acc_norm": 0.6645161290322581, + "acc_norm_stderr": 0.026860206444724345 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.45320197044334976, + "acc_stderr": 0.035025446508458714, + "acc_norm": 0.45320197044334976, + "acc_norm_stderr": 0.035025446508458714 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.59, + "acc_stderr": 0.04943110704237102, + "acc_norm": 0.59, + "acc_norm_stderr": 0.04943110704237102 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.6787878787878788, + "acc_stderr": 0.036462049632538115, + "acc_norm": 0.6787878787878788, + "acc_norm_stderr": 0.036462049632538115 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.7373737373737373, + "acc_stderr": 0.03135305009533086, + "acc_norm": 0.7373737373737373, + "acc_norm_stderr": 0.03135305009533086 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.8134715025906736, + "acc_stderr": 0.028112091210117478, + "acc_norm": 0.8134715025906736, + "acc_norm_stderr": 0.028112091210117478 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.5307692307692308, + "acc_stderr": 0.025302958890850154, + "acc_norm": 0.5307692307692308, + "acc_norm_stderr": 0.025302958890850154 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.3296296296296296, + "acc_stderr": 0.02866120111652458, + "acc_norm": 0.3296296296296296, + "acc_norm_stderr": 0.02866120111652458 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.6008403361344538, + "acc_stderr": 0.03181110032413926, + "acc_norm": 0.6008403361344538, + "acc_norm_stderr": 0.03181110032413926 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.3443708609271523, + "acc_stderr": 0.038796870240733264, + "acc_norm": 0.3443708609271523, + "acc_norm_stderr": 0.038796870240733264 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.744954128440367, + "acc_stderr": 0.01868850085653584, + "acc_norm": 0.744954128440367, + "acc_norm_stderr": 0.01868850085653584 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.375, + "acc_stderr": 0.033016908987210894, + "acc_norm": 0.375, + "acc_norm_stderr": 0.033016908987210894 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.7647058823529411, + "acc_stderr": 0.02977177522814563, + "acc_norm": 0.7647058823529411, + "acc_norm_stderr": 0.02977177522814563 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.7341772151898734, + "acc_stderr": 0.02875679962965834, + "acc_norm": 0.7341772151898734, + "acc_norm_stderr": 0.02875679962965834 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.7085201793721974, + "acc_stderr": 0.030500283176545857, + "acc_norm": 0.7085201793721974, + "acc_norm_stderr": 0.030500283176545857 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.648854961832061, + "acc_stderr": 0.04186445163013751, + "acc_norm": 0.648854961832061, + "acc_norm_stderr": 0.04186445163013751 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.7520661157024794, + "acc_stderr": 0.03941897526516303, + "acc_norm": 0.7520661157024794, + "acc_norm_stderr": 0.03941897526516303 + }, + 
"harness|hendrycksTest-jurisprudence|5": { + "acc": 0.7592592592592593, + "acc_stderr": 0.04133119440243838, + "acc_norm": 0.7592592592592593, + "acc_norm_stderr": 0.04133119440243838 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.7116564417177914, + "acc_stderr": 0.03559039531617342, + "acc_norm": 0.7116564417177914, + "acc_norm_stderr": 0.03559039531617342 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.38392857142857145, + "acc_stderr": 0.04616143075028547, + "acc_norm": 0.38392857142857145, + "acc_norm_stderr": 0.04616143075028547 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.7184466019417476, + "acc_stderr": 0.044532548363264673, + "acc_norm": 0.7184466019417476, + "acc_norm_stderr": 0.044532548363264673 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.8076923076923077, + "acc_stderr": 0.02581923325648372, + "acc_norm": 0.8076923076923077, + "acc_norm_stderr": 0.02581923325648372 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.63, + "acc_stderr": 0.04852365870939099, + "acc_norm": 0.63, + "acc_norm_stderr": 0.04852365870939099 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.768837803320562, + "acc_stderr": 0.015075523238101074, + "acc_norm": 0.768837803320562, + "acc_norm_stderr": 0.015075523238101074 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.6329479768786127, + "acc_stderr": 0.025950054337654075, + "acc_norm": 0.6329479768786127, + "acc_norm_stderr": 0.025950054337654075 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.441340782122905, + "acc_stderr": 0.016607021781050873, + "acc_norm": 0.441340782122905, + "acc_norm_stderr": 0.016607021781050873 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.6209150326797386, + "acc_stderr": 0.027780141207023344, + "acc_norm": 0.6209150326797386, + "acc_norm_stderr": 0.027780141207023344 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.6463022508038585, + "acc_stderr": 0.02715520810320086, + "acc_norm": 0.6463022508038585, + "acc_norm_stderr": 0.02715520810320086 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.6327160493827161, + "acc_stderr": 0.026822801759507894, + "acc_norm": 0.6327160493827161, + "acc_norm_stderr": 0.026822801759507894 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.4219858156028369, + "acc_stderr": 0.029462189233370593, + "acc_norm": 0.4219858156028369, + "acc_norm_stderr": 0.029462189233370593 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.4315514993481095, + "acc_stderr": 0.012650007999463878, + "acc_norm": 0.4315514993481095, + "acc_norm_stderr": 0.012650007999463878 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.5330882352941176, + "acc_stderr": 0.030306257722468307, + "acc_norm": 0.5330882352941176, + "acc_norm_stderr": 0.030306257722468307 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.5882352941176471, + "acc_stderr": 0.01991037746310594, + "acc_norm": 0.5882352941176471, + "acc_norm_stderr": 0.01991037746310594 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.6181818181818182, + "acc_stderr": 0.04653429807913507, + "acc_norm": 0.6181818181818182, + "acc_norm_stderr": 0.04653429807913507 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.6653061224489796, + "acc_stderr": 0.030209235226242307, + "acc_norm": 0.6653061224489796, + "acc_norm_stderr": 0.030209235226242307 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.7164179104477612, + "acc_stderr": 
0.03187187537919798, + "acc_norm": 0.7164179104477612, + "acc_norm_stderr": 0.03187187537919798 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.85, + "acc_stderr": 0.0358870281282637, + "acc_norm": 0.85, + "acc_norm_stderr": 0.0358870281282637 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.4819277108433735, + "acc_stderr": 0.038899512528272166, + "acc_norm": 0.4819277108433735, + "acc_norm_stderr": 0.038899512528272166 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.7543859649122807, + "acc_stderr": 0.03301405946987249, + "acc_norm": 0.7543859649122807, + "acc_norm_stderr": 0.03301405946987249 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.33659730722154224, + "mc1_stderr": 0.01654241280949489, + "mc2": 0.48955195668610224, + "mc2_stderr": 0.015400278901450503 + }, + "all": { + "acc": 0.5703924546400917, + "acc_stderr": 0.03420615142613721, + "acc_norm": 0.5744229523609673, + "acc_norm_stderr": 0.03418308961008044, + "mc1": 0.33659730722154224, + "mc1_stderr": 0.01654241280949489, + "mc2": 0.48955195668610224, + "mc2_stderr": 0.015400278901450503 + } + }, + "versions": { + "harness|arc:challenge|25": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + 
"harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "all": 0 + }, + "config_tasks": { + "harness|arc:challenge": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + 
"harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task" + }, + "summary_tasks": { + "harness|arc:challenge|25": { + "hashes": { + "hash_examples": "17b0cae357c0259e", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "3722289b79076c44", + "hash_cont_tokens": "e8abf848493b50f7" + }, + "truncated": 0, + "non-truncated": 4687, + "padded": 4687, + "non-padded": 0, + "effective_few_shots": 25.0, + "num_truncated_few_shots": 0 + }, + "harness|hellaswag|10": { + "hashes": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "ececd684171f1ef2", + "hash_cont_tokens": "9fe0a5c42e1532db" + }, + "truncated": 0, + "non-truncated": 40168, + "padded": 40113, + "non-padded": 55, + "effective_few_shots": 10.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hashes": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "c54ff61ad0273dd7", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-anatomy|5": { + "hashes": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "be31a1e22aef5f90", + "hash_cont_tokens": "f11971a765cb609f" + }, + "truncated": 0, + "non-truncated": 540, + "padded": 540, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-astronomy|5": { + "hashes": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "277a7b1fad566940", + "hash_cont_tokens": "440a970fadecdc7b" + }, + "truncated": 0, + "non-truncated": 608, + "padded": 608, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-business_ethics|5": { + "hashes": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "ba552605bc116de5", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hashes": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "428c7563d0b98ab9", + "hash_cont_tokens": "7ecd60c25b9bfe5b" + }, + "truncated": 0, + "non-truncated": 1060, + "padded": 1060, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_biology|5": { + "hashes": { + "hash_examples": 
"ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "da036601573942e2", + "hash_cont_tokens": "875cde3af7a0ee14" + }, + "truncated": 0, + "non-truncated": 576, + "padded": 576, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_chemistry|5": { + "hashes": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "94e0196d6aded13d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_computer_science|5": { + "hashes": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "6e4d0f4a8d36690b", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_mathematics|5": { + "hashes": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "614054d17109a25d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_medicine|5": { + "hashes": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "081bb2b524defd1c", + "hash_cont_tokens": "702fb6d82ff0d6ac" + }, + "truncated": 0, + "non-truncated": 692, + "padded": 692, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_physics|5": { + "hashes": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "5421d9a1af86cbd4", + "hash_cont_tokens": "f7b8097afc16a47c" + }, + "truncated": 0, + "non-truncated": 408, + "padded": 408, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-computer_security|5": { + "hashes": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "5e6b70ecb333cf18", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hashes": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "c2ef11a87264ceed", + "hash_cont_tokens": "aa0e8bc655f2f641" + }, + "truncated": 0, + "non-truncated": 940, + "padded": 940, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-econometrics|5": { + "hashes": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "ecaccd912a4c3978", + "hash_cont_tokens": "b1cc6e7e9fcd3827" + }, + "truncated": 0, + "non-truncated": 456, + "padded": 456, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hashes": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "1590c84291399be8", + "hash_cont_tokens": 
"2425a3f084a591ef" + }, + "truncated": 0, + "non-truncated": 580, + "padded": 580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hashes": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "3269597f715b0da1", + "hash_cont_tokens": "bd87bf0c060fd925" + }, + "truncated": 0, + "non-truncated": 1512, + "padded": 1512, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-formal_logic|5": { + "hashes": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "a2800d20f3ab8d7c", + "hash_cont_tokens": "eb8932890e0605db" + }, + "truncated": 0, + "non-truncated": 504, + "padded": 504, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-global_facts|5": { + "hashes": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "94ed44b3772505ad", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_biology|5": { + "hashes": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "24423acb928db768", + "hash_cont_tokens": "1ddcb86d28cde266" + }, + "truncated": 0, + "non-truncated": 1240, + "padded": 1240, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hashes": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "831ff35c474e5cef", + "hash_cont_tokens": "176c8dcff38c5f8f" + }, + "truncated": 0, + "non-truncated": 812, + "padded": 812, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hashes": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "a20a96b44dcc5b30", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hashes": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "5002f4ac8b1562ca", + "hash_cont_tokens": "674fc454bdc5ac93" + }, + "truncated": 0, + "non-truncated": 660, + "padded": 656, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_geography|5": { + "hashes": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "7c5547c7da5bc793", + "hash_cont_tokens": "03a5012b916274ea" + }, + "truncated": 0, + "non-truncated": 792, + "padded": 792, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hashes": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "f62991cb6a496b05", + "hash_cont_tokens": "873d2aab226ba1d8" + }, + "truncated": 0, + "non-truncated": 772, + "padded": 772, + "non-padded": 0, + 
"effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hashes": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "4cef2aff6e3d59ed", + "hash_cont_tokens": "c583432ad27fcfe0" + }, + "truncated": 0, + "non-truncated": 1560, + "padded": 1560, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hashes": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "6e2577ea4082ed2b", + "hash_cont_tokens": "d7907b61bcb8c123" + }, + "truncated": 0, + "non-truncated": 1080, + "padded": 1080, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hashes": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "c5fc9aeb1079c8e4", + "hash_cont_tokens": "f47f041de50333b9" + }, + "truncated": 0, + "non-truncated": 952, + "padded": 952, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_physics|5": { + "hashes": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "555fc385cffa84ca", + "hash_cont_tokens": "0d56317b3e5eedb5" + }, + "truncated": 0, + "non-truncated": 604, + "padded": 604, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hashes": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "febd23cbf9973b7f", + "hash_cont_tokens": "09ba1243e7390c0f" + }, + "truncated": 0, + "non-truncated": 2180, + "padded": 2180, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hashes": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "400e55b56ee6fbd7", + "hash_cont_tokens": "9cc29889c3d3f77d" + }, + "truncated": 0, + "non-truncated": 864, + "padded": 864, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hashes": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "c639cce12a46ebad", + "hash_cont_tokens": "cdd0b3dc06d933e5" + }, + "truncated": 0, + "non-truncated": 816, + "padded": 816, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hashes": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "b9762065cce6f3a6", + "hash_cont_tokens": "e02816433ff28daf" + }, + "truncated": 0, + "non-truncated": 948, + "padded": 948, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_aging|5": { + "hashes": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "541a75f071dcf579", + "hash_cont_tokens": "142a4a8a1138a214" + }, + "truncated": 0, + "non-truncated": 892, + "padded": 892, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + 
"harness|hendrycksTest-human_sexuality|5": { + "hashes": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "04269e5c5a257dd9", + "hash_cont_tokens": "bc54813e809b796d" + }, + "truncated": 0, + "non-truncated": 524, + "padded": 524, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-international_law|5": { + "hashes": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "d93ba9d9d38e4397", + "hash_cont_tokens": "8ea8c5ff76a15bca" + }, + "truncated": 0, + "non-truncated": 484, + "padded": 484, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-jurisprudence|5": { + "hashes": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "9eeaccd2698b4f5a", + "hash_cont_tokens": "e3a8cd951b6e3469" + }, + "truncated": 0, + "non-truncated": 432, + "padded": 432, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hashes": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "b4f08f544f2b7576", + "hash_cont_tokens": "3e9e0bdc248fd88a" + }, + "truncated": 0, + "non-truncated": 652, + "padded": 648, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-machine_learning|5": { + "hashes": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "900c2a51f1174b9f", + "hash_cont_tokens": "55b12fb138c6a064" + }, + "truncated": 0, + "non-truncated": 448, + "padded": 448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-management|5": { + "hashes": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "6b36efb4689c6eca", + "hash_cont_tokens": "a01d6d39a83c4597" + }, + "truncated": 0, + "non-truncated": 412, + "padded": 412, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-marketing|5": { + "hashes": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "2aaac78a0cfed47a", + "hash_cont_tokens": "6aeaed4d823c98aa" + }, + "truncated": 0, + "non-truncated": 936, + "padded": 936, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-medical_genetics|5": { + "hashes": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "886ca823b41c094a", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-miscellaneous|5": { + "hashes": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "72fd71de7675e7d0", + "hash_cont_tokens": "9b0ab02a64603081" + }, + "truncated": 0, + "non-truncated": 3132, + "padded": 3132, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_disputes|5": { + "hashes": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": 
"f3ca0dd8e7a1eb09", + "hash_cont_tokens": "3b8bbe9108e55ce9" + }, + "truncated": 0, + "non-truncated": 1384, + "padded": 1354, + "non-padded": 30, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hashes": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "3e793631e951f23c", + "hash_cont_tokens": "3e9bfc0362e97330" + }, + "truncated": 0, + "non-truncated": 3580, + "padded": 3580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-nutrition|5": { + "hashes": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "59753c2144ea93af", + "hash_cont_tokens": "23b2dc6ee2da4cfc" + }, + "truncated": 0, + "non-truncated": 1224, + "padded": 1224, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-philosophy|5": { + "hashes": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "bd8d3dbed15a8c34", + "hash_cont_tokens": "9f6ff69d23a48783" + }, + "truncated": 0, + "non-truncated": 1244, + "padded": 1244, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-prehistory|5": { + "hashes": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "3573cd87facbb7c5", + "hash_cont_tokens": "d6458d743d875837" + }, + "truncated": 0, + "non-truncated": 1296, + "padded": 1296, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_accounting|5": { + "hashes": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "17e721bc1a7cbb47", + "hash_cont_tokens": "922a195f53a35662" + }, + "truncated": 0, + "non-truncated": 1128, + "padded": 1128, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_law|5": { + "hashes": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "c9f7583fff66d361", + "hash_cont_tokens": "2e590029ef41fbcd" + }, + "truncated": 0, + "non-truncated": 6136, + "padded": 6136, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_medicine|5": { + "hashes": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "40a933f829116f8d", + "hash_cont_tokens": "7cfee54dbddd5a98" + }, + "truncated": 0, + "non-truncated": 1088, + "padded": 1088, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_psychology|5": { + "hashes": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "0dfb73a8eb3f692c", + "hash_cont_tokens": "a86677b2a45c20e1" + }, + "truncated": 0, + "non-truncated": 2448, + "padded": 2448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-public_relations|5": { + "hashes": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "1710c6ba4c9f3cbd", + "hash_cont_tokens": "0d756ccaae031757" + }, + "truncated": 0, + "non-truncated": 440, + "padded": 440, + "non-padded": 0, 
+ "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-security_studies|5": { + "hashes": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "32a03f1f22a6e103", + "hash_cont_tokens": "b2229bc2cfbf594b" + }, + "truncated": 0, + "non-truncated": 980, + "padded": 980, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-sociology|5": { + "hashes": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "828999f7624cbe7e", + "hash_cont_tokens": "c3a3bdfd177eed5b" + }, + "truncated": 0, + "non-truncated": 804, + "padded": 804, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hashes": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "42054621e718dbee", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-virology|5": { + "hashes": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "6c4f0aa4dc859c04", + "hash_cont_tokens": "af8b3658088cb37f" + }, + "truncated": 0, + "non-truncated": 664, + "padded": 664, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-world_religions|5": { + "hashes": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "6c75d44e092ff24f", + "hash_cont_tokens": "060118bef6de4e0a" + }, + "truncated": 0, + "non-truncated": 684, + "padded": 684, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|truthfulqa:mc|0": { + "hashes": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "2738d7ed7075faa7", + "hash_cont_tokens": "f5da56a132aab151" + }, + "truncated": 0, + "non-truncated": 9996, + "padded": 9996, + "non-padded": 0, + "effective_few_shots": 0.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "d84d18e9a963753d", + "hash_full_prompts": "12b540783521a8e6", + "hash_input_tokens": "5c73a7dce6ccf737", + "hash_cont_tokens": "71d56183130fecbd" + }, + "total_evaluation_time_secondes": "6399.578526735306", + "truncated": 0, + "non-truncated": 111019, + "padded": 110926, + "non-padded": 93, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/Undi95/UndiMix-v4-13B/results_2023-10-27T04-12-01.560692.json b/eval-results/Undi95/UndiMix-v4-13B/results_2023-10-27T04-12-01.560692.json new file mode 100644 index 0000000000000000000000000000000000000000..605530141d8a54b5b0cb0a49381efcc4bea92f51 --- /dev/null +++ b/eval-results/Undi95/UndiMix-v4-13B/results_2023-10-27T04-12-01.560692.json @@ -0,0 +1,107 @@ +{ + "config_general": { + "model_name": "Undi95/UndiMix-v4-13B", + "model_sha": "6dd97c74cfe1d22432d5c993814e230f333ba401", + "model_size": "24.32 GB", + "model_dtype": "torch.float16", + "lighteval_sha": "0f318ecf002208468154899217b3ba7c6ae09374", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|drop|3": { + "em": 0.14146392617449666, + 
"em_stderr": 0.003568960808825645, + "f1": 0.20818477348993217, + "f1_stderr": 0.0036692979641845653 + }, + "harness|gsm8k|5": { + "acc": 0.1372251705837756, + "acc_stderr": 0.009477808244600401 + }, + "harness|winogrande|5": { + "acc": 0.7616416732438832, + "acc_stderr": 0.011974948667702308 + }, + "all": { + "em": 0.14146392617449666, + "em_stderr": 0.003568960808825645, + "f1": 0.20818477348993217, + "f1_stderr": 0.0036692979641845653, + "acc": 0.4494334219138294, + "acc_stderr": 0.010726378456151354 + } + }, + "versions": { + "harness|drop|3": 1, + "harness|gsm8k|5": 0, + "harness|winogrande|5": 0, + "all": 0 + }, + "config_tasks": { + "harness|drop": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|drop|3": { + "hashes": { + "hash_examples": "1d27416e8324e9a3", + "hash_full_prompts": "a5513ff9a741b385", + "hash_input_tokens": "42076f0efbb50aa6", + "hash_cont_tokens": "f2c60078607d0c7b" + }, + "truncated": 3, + "non-truncated": 9533, + "padded": 0, + "non-padded": 9536, + "effective_few_shots": 3.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "bda342e47b5099b2", + "hash_cont_tokens": "0a5cb55df1d5f997" + }, + "truncated": 0, + "non-truncated": 1319, + "padded": 0, + "non-padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "c0bedf98cb040854", + "hash_cont_tokens": "f08975ad6f2d5864" + }, + "truncated": 0, + "non-truncated": 2534, + "padded": 2432, + "non-padded": 102, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "9b4d8993161e637d", + "hash_full_prompts": "08215e527b7e60a5", + "hash_input_tokens": "a12f3e3c934bd78b", + "hash_cont_tokens": "505083f44602e742" + }, + "total_evaluation_time_secondes": "37433.45782971382", + "truncated": 3, + "non-truncated": 13386, + "padded": 2432, + "non-padded": 10957, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/Undi95/Unholy-v1-12L-13B/results_2023-09-18T13-52-19.375562.json b/eval-results/Undi95/Unholy-v1-12L-13B/results_2023-09-18T13-52-19.375562.json new file mode 100644 index 0000000000000000000000000000000000000000..27ea98ded7d2e47a09303aceadb64723d224576c --- /dev/null +++ b/eval-results/Undi95/Unholy-v1-12L-13B/results_2023-09-18T13-52-19.375562.json @@ -0,0 +1,1367 @@ +{ + "config_general": { + "model_name": "Undi95/Unholy-v1-12L-13B", + "model_sha": "ee25c078f08b0812d82597afa3f5e877c19a5c83", + "model_size": "24.32 GB", + "model_dtype": "torch.float16", + "lighteval_sha": "0f318ecf002208468154899217b3ba7c6ae09374", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|arc:challenge|25": { + "acc": 0.6143344709897611, + "acc_stderr": 0.01422425097325718, + "acc_norm": 0.6356655290102389, + "acc_norm_stderr": 0.014063260279882417 + }, + "harness|hellaswag|10": { + "acc": 0.6395140410276837, + "acc_stderr": 0.004791601975612764, + "acc_norm": 0.8374825731925911, + "acc_norm_stderr": 0.003681708282581456 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.38, + "acc_stderr": 0.048783173121456316, + "acc_norm": 0.38, + 
"acc_norm_stderr": 0.048783173121456316 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.48148148148148145, + "acc_stderr": 0.043163785995113245, + "acc_norm": 0.48148148148148145, + "acc_norm_stderr": 0.043163785995113245 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.5855263157894737, + "acc_stderr": 0.04008973785779206, + "acc_norm": 0.5855263157894737, + "acc_norm_stderr": 0.04008973785779206 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.54, + "acc_stderr": 0.05009082659620332, + "acc_norm": 0.54, + "acc_norm_stderr": 0.05009082659620332 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.6264150943396226, + "acc_stderr": 0.029773082713319878, + "acc_norm": 0.6264150943396226, + "acc_norm_stderr": 0.029773082713319878 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.6597222222222222, + "acc_stderr": 0.039621355734862175, + "acc_norm": 0.6597222222222222, + "acc_norm_stderr": 0.039621355734862175 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.39, + "acc_stderr": 0.04902071300001975, + "acc_norm": 0.39, + "acc_norm_stderr": 0.04902071300001975 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.5, + "acc_stderr": 0.050251890762960605, + "acc_norm": 0.5, + "acc_norm_stderr": 0.050251890762960605 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.38, + "acc_stderr": 0.048783173121456316, + "acc_norm": 0.38, + "acc_norm_stderr": 0.048783173121456316 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.5491329479768786, + "acc_stderr": 0.0379401267469703, + "acc_norm": 0.5491329479768786, + "acc_norm_stderr": 0.0379401267469703 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.27450980392156865, + "acc_stderr": 0.04440521906179328, + "acc_norm": 0.27450980392156865, + "acc_norm_stderr": 0.04440521906179328 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.7, + "acc_stderr": 0.046056618647183814, + "acc_norm": 0.7, + "acc_norm_stderr": 0.046056618647183814 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.48936170212765956, + "acc_stderr": 0.03267862331014063, + "acc_norm": 0.48936170212765956, + "acc_norm_stderr": 0.03267862331014063 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.2807017543859649, + "acc_stderr": 0.042270544512322, + "acc_norm": 0.2807017543859649, + "acc_norm_stderr": 0.042270544512322 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.5448275862068965, + "acc_stderr": 0.04149886942192117, + "acc_norm": 0.5448275862068965, + "acc_norm_stderr": 0.04149886942192117 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.31746031746031744, + "acc_stderr": 0.023973861998992072, + "acc_norm": 0.31746031746031744, + "acc_norm_stderr": 0.023973861998992072 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.35714285714285715, + "acc_stderr": 0.042857142857142816, + "acc_norm": 0.35714285714285715, + "acc_norm_stderr": 0.042857142857142816 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.35, + "acc_stderr": 0.0479372485441102, + "acc_norm": 0.35, + "acc_norm_stderr": 0.0479372485441102 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.6709677419354839, + "acc_stderr": 0.02672949906834996, + "acc_norm": 0.6709677419354839, + "acc_norm_stderr": 0.02672949906834996 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.458128078817734, + "acc_stderr": 0.03505630140785741, + "acc_norm": 0.458128078817734, + 
"acc_norm_stderr": 0.03505630140785741 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.61, + "acc_stderr": 0.04902071300001975, + "acc_norm": 0.61, + "acc_norm_stderr": 0.04902071300001975 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.703030303030303, + "acc_stderr": 0.035679697722680495, + "acc_norm": 0.703030303030303, + "acc_norm_stderr": 0.035679697722680495 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.7222222222222222, + "acc_stderr": 0.031911782267135466, + "acc_norm": 0.7222222222222222, + "acc_norm_stderr": 0.031911782267135466 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.8341968911917098, + "acc_stderr": 0.026839845022314415, + "acc_norm": 0.8341968911917098, + "acc_norm_stderr": 0.026839845022314415 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.5564102564102564, + "acc_stderr": 0.0251891498947642, + "acc_norm": 0.5564102564102564, + "acc_norm_stderr": 0.0251891498947642 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.337037037037037, + "acc_stderr": 0.028820884666253252, + "acc_norm": 0.337037037037037, + "acc_norm_stderr": 0.028820884666253252 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.5966386554621849, + "acc_stderr": 0.031866081214088314, + "acc_norm": 0.5966386554621849, + "acc_norm_stderr": 0.031866081214088314 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.31788079470198677, + "acc_stderr": 0.03802039760107903, + "acc_norm": 0.31788079470198677, + "acc_norm_stderr": 0.03802039760107903 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.7559633027522936, + "acc_stderr": 0.018415286351416406, + "acc_norm": 0.7559633027522936, + "acc_norm_stderr": 0.018415286351416406 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.39814814814814814, + "acc_stderr": 0.033384734032074016, + "acc_norm": 0.39814814814814814, + "acc_norm_stderr": 0.033384734032074016 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.7990196078431373, + "acc_stderr": 0.028125972265654373, + "acc_norm": 0.7990196078431373, + "acc_norm_stderr": 0.028125972265654373 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.7721518987341772, + "acc_stderr": 0.02730348459906942, + "acc_norm": 0.7721518987341772, + "acc_norm_stderr": 0.02730348459906942 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.695067264573991, + "acc_stderr": 0.030898610882477515, + "acc_norm": 0.695067264573991, + "acc_norm_stderr": 0.030898610882477515 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.6259541984732825, + "acc_stderr": 0.042438692422305246, + "acc_norm": 0.6259541984732825, + "acc_norm_stderr": 0.042438692422305246 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.7603305785123967, + "acc_stderr": 0.03896878985070416, + "acc_norm": 0.7603305785123967, + "acc_norm_stderr": 0.03896878985070416 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.7870370370370371, + "acc_stderr": 0.0395783547198098, + "acc_norm": 0.7870370370370371, + "acc_norm_stderr": 0.0395783547198098 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.6871165644171779, + "acc_stderr": 0.03642914578292406, + "acc_norm": 0.6871165644171779, + "acc_norm_stderr": 0.03642914578292406 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.3392857142857143, + "acc_stderr": 0.04493949068613539, + "acc_norm": 
0.3392857142857143, + "acc_norm_stderr": 0.04493949068613539 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.7184466019417476, + "acc_stderr": 0.04453254836326468, + "acc_norm": 0.7184466019417476, + "acc_norm_stderr": 0.04453254836326468 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.8076923076923077, + "acc_stderr": 0.025819233256483724, + "acc_norm": 0.8076923076923077, + "acc_norm_stderr": 0.025819233256483724 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.65, + "acc_stderr": 0.047937248544110196, + "acc_norm": 0.65, + "acc_norm_stderr": 0.047937248544110196 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.7701149425287356, + "acc_stderr": 0.015046301846691805, + "acc_norm": 0.7701149425287356, + "acc_norm_stderr": 0.015046301846691805 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.6473988439306358, + "acc_stderr": 0.025722802200895803, + "acc_norm": 0.6473988439306358, + "acc_norm_stderr": 0.025722802200895803 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.48156424581005586, + "acc_stderr": 0.01671113049778282, + "acc_norm": 0.48156424581005586, + "acc_norm_stderr": 0.01671113049778282 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.6339869281045751, + "acc_stderr": 0.027582811415159614, + "acc_norm": 0.6339869281045751, + "acc_norm_stderr": 0.027582811415159614 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.639871382636656, + "acc_stderr": 0.02726429759980401, + "acc_norm": 0.639871382636656, + "acc_norm_stderr": 0.02726429759980401 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.6759259259259259, + "acc_stderr": 0.026041766202717163, + "acc_norm": 0.6759259259259259, + "acc_norm_stderr": 0.026041766202717163 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.4397163120567376, + "acc_stderr": 0.029609912075594106, + "acc_norm": 0.4397163120567376, + "acc_norm_stderr": 0.029609912075594106 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.44654498044328556, + "acc_stderr": 0.012697046024399685, + "acc_norm": 0.44654498044328556, + "acc_norm_stderr": 0.012697046024399685 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.5588235294117647, + "acc_stderr": 0.03016191193076711, + "acc_norm": 0.5588235294117647, + "acc_norm_stderr": 0.03016191193076711 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.5800653594771242, + "acc_stderr": 0.019966811178256477, + "acc_norm": 0.5800653594771242, + "acc_norm_stderr": 0.019966811178256477 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.6727272727272727, + "acc_stderr": 0.0449429086625209, + "acc_norm": 0.6727272727272727, + "acc_norm_stderr": 0.0449429086625209 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.6775510204081633, + "acc_stderr": 0.029923100563683913, + "acc_norm": 0.6775510204081633, + "acc_norm_stderr": 0.029923100563683913 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.7810945273631841, + "acc_stderr": 0.029239174636647, + "acc_norm": 0.7810945273631841, + "acc_norm_stderr": 0.029239174636647 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.85, + "acc_stderr": 0.0358870281282637, + "acc_norm": 0.85, + "acc_norm_stderr": 0.0358870281282637 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.4759036144578313, + "acc_stderr": 0.038879718495972646, + "acc_norm": 0.4759036144578313, + "acc_norm_stderr": 0.038879718495972646 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 
0.7660818713450293, + "acc_stderr": 0.03246721765117826, + "acc_norm": 0.7660818713450293, + "acc_norm_stderr": 0.03246721765117826 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.36964504283965727, + "mc1_stderr": 0.01689818070697389, + "mc2": 0.5109377575595978, + "mc2_stderr": 0.015388241246569968 + }, + "all": { + "acc": 0.5823767213037238, + "acc_stderr": 0.03403833440142264, + "acc_norm": 0.5860936635102556, + "acc_norm_stderr": 0.034016793988093735, + "mc1": 0.36964504283965727, + "mc1_stderr": 0.01689818070697389, + "mc2": 0.5109377575595978, + "mc2_stderr": 0.015388241246569968 + } + }, + "versions": { + "harness|arc:challenge|25": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + 
"harness|truthfulqa:mc|0": 1, + "all": 0 + }, + "config_tasks": { + "harness|arc:challenge": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + 
"harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task" + }, + "summary_tasks": { + "harness|arc:challenge|25": { + "hashes": { + "hash_examples": "17b0cae357c0259e", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "3722289b79076c44", + "hash_cont_tokens": "e8abf848493b50f7" + }, + "truncated": 0, + "non-truncated": 4687, + "padded": 4687, + "non-padded": 0, + "effective_few_shots": 25.0, + "num_truncated_few_shots": 0 + }, + "harness|hellaswag|10": { + "hashes": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "ececd684171f1ef2", + "hash_cont_tokens": "9fe0a5c42e1532db" + }, + "truncated": 0, + "non-truncated": 40168, + "padded": 40113, + "non-padded": 55, + "effective_few_shots": 10.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hashes": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "c54ff61ad0273dd7", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-anatomy|5": { + "hashes": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "be31a1e22aef5f90", + "hash_cont_tokens": "f11971a765cb609f" + }, + "truncated": 0, + "non-truncated": 540, + "padded": 540, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-astronomy|5": { + "hashes": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "277a7b1fad566940", + "hash_cont_tokens": "440a970fadecdc7b" + }, + "truncated": 0, + "non-truncated": 608, + "padded": 608, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-business_ethics|5": { + "hashes": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "ba552605bc116de5", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hashes": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "428c7563d0b98ab9", + "hash_cont_tokens": "7ecd60c25b9bfe5b" + }, + "truncated": 0, + "non-truncated": 1060, + "padded": 1060, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_biology|5": { + "hashes": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "da036601573942e2", + "hash_cont_tokens": "875cde3af7a0ee14" + }, + "truncated": 0, + "non-truncated": 576, + "padded": 576, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_chemistry|5": { + "hashes": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "94e0196d6aded13d", + "hash_cont_tokens": 
"50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_computer_science|5": { + "hashes": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "6e4d0f4a8d36690b", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_mathematics|5": { + "hashes": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "614054d17109a25d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_medicine|5": { + "hashes": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "081bb2b524defd1c", + "hash_cont_tokens": "702fb6d82ff0d6ac" + }, + "truncated": 0, + "non-truncated": 692, + "padded": 692, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_physics|5": { + "hashes": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "5421d9a1af86cbd4", + "hash_cont_tokens": "f7b8097afc16a47c" + }, + "truncated": 0, + "non-truncated": 408, + "padded": 408, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-computer_security|5": { + "hashes": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "5e6b70ecb333cf18", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hashes": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "c2ef11a87264ceed", + "hash_cont_tokens": "aa0e8bc655f2f641" + }, + "truncated": 0, + "non-truncated": 940, + "padded": 940, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-econometrics|5": { + "hashes": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "ecaccd912a4c3978", + "hash_cont_tokens": "b1cc6e7e9fcd3827" + }, + "truncated": 0, + "non-truncated": 456, + "padded": 456, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hashes": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "1590c84291399be8", + "hash_cont_tokens": "2425a3f084a591ef" + }, + "truncated": 0, + "non-truncated": 580, + "padded": 580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hashes": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "3269597f715b0da1", + "hash_cont_tokens": "bd87bf0c060fd925" + }, + "truncated": 0, + "non-truncated": 1512, + "padded": 1512, + "non-padded": 0, + "effective_few_shots": 5.0, + 
"num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-formal_logic|5": { + "hashes": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "a2800d20f3ab8d7c", + "hash_cont_tokens": "eb8932890e0605db" + }, + "truncated": 0, + "non-truncated": 504, + "padded": 504, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-global_facts|5": { + "hashes": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "94ed44b3772505ad", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_biology|5": { + "hashes": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "24423acb928db768", + "hash_cont_tokens": "1ddcb86d28cde266" + }, + "truncated": 0, + "non-truncated": 1240, + "padded": 1240, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hashes": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "831ff35c474e5cef", + "hash_cont_tokens": "176c8dcff38c5f8f" + }, + "truncated": 0, + "non-truncated": 812, + "padded": 812, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hashes": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "a20a96b44dcc5b30", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hashes": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "5002f4ac8b1562ca", + "hash_cont_tokens": "674fc454bdc5ac93" + }, + "truncated": 0, + "non-truncated": 660, + "padded": 656, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_geography|5": { + "hashes": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "7c5547c7da5bc793", + "hash_cont_tokens": "03a5012b916274ea" + }, + "truncated": 0, + "non-truncated": 792, + "padded": 792, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hashes": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "f62991cb6a496b05", + "hash_cont_tokens": "873d2aab226ba1d8" + }, + "truncated": 0, + "non-truncated": 772, + "padded": 772, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hashes": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "4cef2aff6e3d59ed", + "hash_cont_tokens": "c583432ad27fcfe0" + }, + "truncated": 0, + "non-truncated": 1560, + "padded": 1560, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + 
"hashes": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "6e2577ea4082ed2b", + "hash_cont_tokens": "d7907b61bcb8c123" + }, + "truncated": 0, + "non-truncated": 1080, + "padded": 1080, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hashes": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "c5fc9aeb1079c8e4", + "hash_cont_tokens": "f47f041de50333b9" + }, + "truncated": 0, + "non-truncated": 952, + "padded": 952, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_physics|5": { + "hashes": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "555fc385cffa84ca", + "hash_cont_tokens": "0d56317b3e5eedb5" + }, + "truncated": 0, + "non-truncated": 604, + "padded": 604, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hashes": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "febd23cbf9973b7f", + "hash_cont_tokens": "09ba1243e7390c0f" + }, + "truncated": 0, + "non-truncated": 2180, + "padded": 2180, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hashes": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "400e55b56ee6fbd7", + "hash_cont_tokens": "9cc29889c3d3f77d" + }, + "truncated": 0, + "non-truncated": 864, + "padded": 864, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hashes": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "c639cce12a46ebad", + "hash_cont_tokens": "cdd0b3dc06d933e5" + }, + "truncated": 0, + "non-truncated": 816, + "padded": 816, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hashes": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "b9762065cce6f3a6", + "hash_cont_tokens": "e02816433ff28daf" + }, + "truncated": 0, + "non-truncated": 948, + "padded": 948, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_aging|5": { + "hashes": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "541a75f071dcf579", + "hash_cont_tokens": "142a4a8a1138a214" + }, + "truncated": 0, + "non-truncated": 892, + "padded": 892, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_sexuality|5": { + "hashes": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "04269e5c5a257dd9", + "hash_cont_tokens": "bc54813e809b796d" + }, + "truncated": 0, + "non-truncated": 524, + "padded": 524, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-international_law|5": { + "hashes": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": 
"d93ba9d9d38e4397", + "hash_cont_tokens": "8ea8c5ff76a15bca" + }, + "truncated": 0, + "non-truncated": 484, + "padded": 484, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-jurisprudence|5": { + "hashes": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "9eeaccd2698b4f5a", + "hash_cont_tokens": "e3a8cd951b6e3469" + }, + "truncated": 0, + "non-truncated": 432, + "padded": 432, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hashes": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "b4f08f544f2b7576", + "hash_cont_tokens": "3e9e0bdc248fd88a" + }, + "truncated": 0, + "non-truncated": 652, + "padded": 648, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-machine_learning|5": { + "hashes": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "900c2a51f1174b9f", + "hash_cont_tokens": "55b12fb138c6a064" + }, + "truncated": 0, + "non-truncated": 448, + "padded": 448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-management|5": { + "hashes": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "6b36efb4689c6eca", + "hash_cont_tokens": "a01d6d39a83c4597" + }, + "truncated": 0, + "non-truncated": 412, + "padded": 412, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-marketing|5": { + "hashes": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "2aaac78a0cfed47a", + "hash_cont_tokens": "6aeaed4d823c98aa" + }, + "truncated": 0, + "non-truncated": 936, + "padded": 936, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-medical_genetics|5": { + "hashes": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "886ca823b41c094a", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-miscellaneous|5": { + "hashes": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "72fd71de7675e7d0", + "hash_cont_tokens": "9b0ab02a64603081" + }, + "truncated": 0, + "non-truncated": 3132, + "padded": 3132, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_disputes|5": { + "hashes": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "f3ca0dd8e7a1eb09", + "hash_cont_tokens": "3b8bbe9108e55ce9" + }, + "truncated": 0, + "non-truncated": 1384, + "padded": 1354, + "non-padded": 30, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hashes": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "3e793631e951f23c", + "hash_cont_tokens": "3e9bfc0362e97330" + }, + "truncated": 0, + "non-truncated": 3580, + "padded": 3580, + "non-padded": 0, + "effective_few_shots": 5.0, + 
"num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-nutrition|5": { + "hashes": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "59753c2144ea93af", + "hash_cont_tokens": "23b2dc6ee2da4cfc" + }, + "truncated": 0, + "non-truncated": 1224, + "padded": 1224, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-philosophy|5": { + "hashes": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "bd8d3dbed15a8c34", + "hash_cont_tokens": "9f6ff69d23a48783" + }, + "truncated": 0, + "non-truncated": 1244, + "padded": 1244, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-prehistory|5": { + "hashes": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "3573cd87facbb7c5", + "hash_cont_tokens": "d6458d743d875837" + }, + "truncated": 0, + "non-truncated": 1296, + "padded": 1296, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_accounting|5": { + "hashes": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "17e721bc1a7cbb47", + "hash_cont_tokens": "922a195f53a35662" + }, + "truncated": 0, + "non-truncated": 1128, + "padded": 1128, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_law|5": { + "hashes": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "c9f7583fff66d361", + "hash_cont_tokens": "2e590029ef41fbcd" + }, + "truncated": 0, + "non-truncated": 6136, + "padded": 6136, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_medicine|5": { + "hashes": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "40a933f829116f8d", + "hash_cont_tokens": "7cfee54dbddd5a98" + }, + "truncated": 0, + "non-truncated": 1088, + "padded": 1088, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_psychology|5": { + "hashes": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "0dfb73a8eb3f692c", + "hash_cont_tokens": "a86677b2a45c20e1" + }, + "truncated": 0, + "non-truncated": 2448, + "padded": 2448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-public_relations|5": { + "hashes": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "1710c6ba4c9f3cbd", + "hash_cont_tokens": "0d756ccaae031757" + }, + "truncated": 0, + "non-truncated": 440, + "padded": 440, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-security_studies|5": { + "hashes": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "32a03f1f22a6e103", + "hash_cont_tokens": "b2229bc2cfbf594b" + }, + "truncated": 0, + "non-truncated": 980, + "padded": 980, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-sociology|5": { + "hashes": { + "hash_examples": "e7959df87dea8672", + 
"hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "828999f7624cbe7e", + "hash_cont_tokens": "c3a3bdfd177eed5b" + }, + "truncated": 0, + "non-truncated": 804, + "padded": 804, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hashes": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "42054621e718dbee", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-virology|5": { + "hashes": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "6c4f0aa4dc859c04", + "hash_cont_tokens": "af8b3658088cb37f" + }, + "truncated": 0, + "non-truncated": 664, + "padded": 664, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-world_religions|5": { + "hashes": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "6c75d44e092ff24f", + "hash_cont_tokens": "060118bef6de4e0a" + }, + "truncated": 0, + "non-truncated": 684, + "padded": 684, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|truthfulqa:mc|0": { + "hashes": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "2738d7ed7075faa7", + "hash_cont_tokens": "f5da56a132aab151" + }, + "truncated": 0, + "non-truncated": 9996, + "padded": 9996, + "non-padded": 0, + "effective_few_shots": 0.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "d84d18e9a963753d", + "hash_full_prompts": "12b540783521a8e6", + "hash_input_tokens": "5c73a7dce6ccf737", + "hash_cont_tokens": "71d56183130fecbd" + }, + "total_evaluation_time_secondes": "6379.087675333023", + "truncated": 0, + "non-truncated": 111019, + "padded": 110926, + "non-padded": 93, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/Undi95/Unholy-v1-12L-13B/results_2023-10-29T08-07-07.360378.json b/eval-results/Undi95/Unholy-v1-12L-13B/results_2023-10-29T08-07-07.360378.json new file mode 100644 index 0000000000000000000000000000000000000000..7be410302eeb293d5928f6b419269b6694aa4296 --- /dev/null +++ b/eval-results/Undi95/Unholy-v1-12L-13B/results_2023-10-29T08-07-07.360378.json @@ -0,0 +1,107 @@ +{ + "config_general": { + "model_name": "Undi95/Unholy-v1-12L-13B", + "model_sha": "ee25c078f08b0812d82597afa3f5e877c19a5c83", + "model_size": "24.32 GB", + "model_dtype": "torch.float16", + "lighteval_sha": "0f318ecf002208468154899217b3ba7c6ae09374", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|drop|3": { + "em": 0.022651006711409395, + "em_stderr": 0.0015237307803438198, + "f1": 0.09728712248322129, + "f1_stderr": 0.00210132435826052 + }, + "harness|gsm8k|5": { + "acc": 0.1106899166034875, + "acc_stderr": 0.008642172551392465 + }, + "harness|winogrande|5": { + "acc": 0.7726913970007893, + "acc_stderr": 0.011778612167091087 + }, + "all": { + "em": 0.022651006711409395, + "em_stderr": 0.0015237307803438198, + "f1": 0.09728712248322129, + "f1_stderr": 0.00210132435826052, + "acc": 0.44169065680213837, + "acc_stderr": 0.010210392359241776 + } + }, + 
"versions": { + "harness|drop|3": 1, + "harness|gsm8k|5": 0, + "harness|winogrande|5": 0, + "all": 0 + }, + "config_tasks": { + "harness|drop": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|drop|3": { + "hashes": { + "hash_examples": "1d27416e8324e9a3", + "hash_full_prompts": "a5513ff9a741b385", + "hash_input_tokens": "42076f0efbb50aa6", + "hash_cont_tokens": "532030fa22d616ee" + }, + "truncated": 3, + "non-truncated": 9533, + "padded": 0, + "non-padded": 9536, + "effective_few_shots": 3.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "bda342e47b5099b2", + "hash_cont_tokens": "3f90a2b1039f7a20" + }, + "truncated": 0, + "non-truncated": 1319, + "padded": 0, + "non-padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "c0bedf98cb040854", + "hash_cont_tokens": "f08975ad6f2d5864" + }, + "truncated": 0, + "non-truncated": 2534, + "padded": 2432, + "non-padded": 102, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "9b4d8993161e637d", + "hash_full_prompts": "08215e527b7e60a5", + "hash_input_tokens": "a12f3e3c934bd78b", + "hash_cont_tokens": "f03455aa46771edd" + }, + "total_evaluation_time_secondes": "12449.205129146576", + "truncated": 3, + "non-truncated": 13386, + "padded": 2432, + "non-padded": 10957, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/Undi95/X-MythoChronos-13B/results_2023-12-09T15-55-58.756519.json b/eval-results/Undi95/X-MythoChronos-13B/results_2023-12-09T15-55-58.756519.json new file mode 100644 index 0000000000000000000000000000000000000000..de1d48ed277da9d3f7e180e25714df5ebece8e07 --- /dev/null +++ b/eval-results/Undi95/X-MythoChronos-13B/results_2023-12-09T15-55-58.756519.json @@ -0,0 +1,1409 @@ +{ + "config_general": { + "lighteval_sha": "0e4607eff593f6f842aeaa0e5fa6760f58b9d1e9", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "", + "start_time": 582339.792244841, + "end_time": 592229.896971659, + "total_evaluation_time_secondes": "9890.104726818041", + "model_name": "Undi95/X-MythoChronos-13B", + "model_sha": "8d302741466512f0621a594fce6bf5b8125c8d4c", + "model_dtype": "torch.float16", + "model_size": "24.32 GB" + }, + "results": { + "harness|arc:challenge|25": { + "acc": 0.5844709897610921, + "acc_stderr": 0.014401366641216383, + "acc_norm": 0.5972696245733788, + "acc_norm_stderr": 0.01433223630679015 + }, + "harness|hellaswag|10": { + "acc": 0.6448914558852819, + "acc_stderr": 0.004775681871529864, + "acc_norm": 0.8338976299541924, + "acc_norm_stderr": 0.0037141188843173825 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.32, + "acc_stderr": 0.046882617226215034, + "acc_norm": 0.32, + "acc_norm_stderr": 0.046882617226215034 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.5037037037037037, + "acc_stderr": 0.04319223625811331, + "acc_norm": 0.5037037037037037, + "acc_norm_stderr": 0.04319223625811331 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.5657894736842105, + "acc_stderr": 0.040335656678483205, + "acc_norm": 0.5657894736842105, + "acc_norm_stderr": 
0.040335656678483205 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.57, + "acc_stderr": 0.049756985195624284, + "acc_norm": 0.57, + "acc_norm_stderr": 0.049756985195624284 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.5886792452830188, + "acc_stderr": 0.030285009259009798, + "acc_norm": 0.5886792452830188, + "acc_norm_stderr": 0.030285009259009798 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.6111111111111112, + "acc_stderr": 0.04076663253918567, + "acc_norm": 0.6111111111111112, + "acc_norm_stderr": 0.04076663253918567 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.38, + "acc_stderr": 0.04878317312145633, + "acc_norm": 0.38, + "acc_norm_stderr": 0.04878317312145633 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.45, + "acc_stderr": 0.04999999999999999, + "acc_norm": 0.45, + "acc_norm_stderr": 0.04999999999999999 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.34, + "acc_stderr": 0.04760952285695235, + "acc_norm": 0.34, + "acc_norm_stderr": 0.04760952285695235 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.5202312138728323, + "acc_stderr": 0.03809342081273957, + "acc_norm": 0.5202312138728323, + "acc_norm_stderr": 0.03809342081273957 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.23529411764705882, + "acc_stderr": 0.04220773659171452, + "acc_norm": 0.23529411764705882, + "acc_norm_stderr": 0.04220773659171452 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.68, + "acc_stderr": 0.04688261722621505, + "acc_norm": 0.68, + "acc_norm_stderr": 0.04688261722621505 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.4723404255319149, + "acc_stderr": 0.03263597118409769, + "acc_norm": 0.4723404255319149, + "acc_norm_stderr": 0.03263597118409769 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.3333333333333333, + "acc_stderr": 0.044346007015849245, + "acc_norm": 0.3333333333333333, + "acc_norm_stderr": 0.044346007015849245 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.5379310344827586, + "acc_stderr": 0.041546596717075474, + "acc_norm": 0.5379310344827586, + "acc_norm_stderr": 0.041546596717075474 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.29894179894179895, + "acc_stderr": 0.023577604791655802, + "acc_norm": 0.29894179894179895, + "acc_norm_stderr": 0.023577604791655802 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.3333333333333333, + "acc_stderr": 0.04216370213557835, + "acc_norm": 0.3333333333333333, + "acc_norm_stderr": 0.04216370213557835 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.37, + "acc_stderr": 0.04852365870939099, + "acc_norm": 0.37, + "acc_norm_stderr": 0.04852365870939099 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.6645161290322581, + "acc_stderr": 0.02686020644472434, + "acc_norm": 0.6645161290322581, + "acc_norm_stderr": 0.02686020644472434 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.4187192118226601, + "acc_stderr": 0.03471192860518468, + "acc_norm": 0.4187192118226601, + "acc_norm_stderr": 0.03471192860518468 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.56, + "acc_stderr": 0.04988876515698589, + "acc_norm": 0.56, + "acc_norm_stderr": 0.04988876515698589 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.6727272727272727, + "acc_stderr": 0.03663974994391244, + "acc_norm": 0.6727272727272727, + "acc_norm_stderr": 
0.03663974994391244 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.7222222222222222, + "acc_stderr": 0.031911782267135466, + "acc_norm": 0.7222222222222222, + "acc_norm_stderr": 0.031911782267135466 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.8082901554404145, + "acc_stderr": 0.02840895362624526, + "acc_norm": 0.8082901554404145, + "acc_norm_stderr": 0.02840895362624526 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.5256410256410257, + "acc_stderr": 0.025317649726448663, + "acc_norm": 0.5256410256410257, + "acc_norm_stderr": 0.025317649726448663 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.32222222222222224, + "acc_stderr": 0.028493465091028597, + "acc_norm": 0.32222222222222224, + "acc_norm_stderr": 0.028493465091028597 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.5756302521008403, + "acc_stderr": 0.032104790510157764, + "acc_norm": 0.5756302521008403, + "acc_norm_stderr": 0.032104790510157764 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.3443708609271523, + "acc_stderr": 0.03879687024073327, + "acc_norm": 0.3443708609271523, + "acc_norm_stderr": 0.03879687024073327 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.7467889908256881, + "acc_stderr": 0.01864407304137504, + "acc_norm": 0.7467889908256881, + "acc_norm_stderr": 0.01864407304137504 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.4027777777777778, + "acc_stderr": 0.033448873829978666, + "acc_norm": 0.4027777777777778, + "acc_norm_stderr": 0.033448873829978666 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.7450980392156863, + "acc_stderr": 0.030587591351604243, + "acc_norm": 0.7450980392156863, + "acc_norm_stderr": 0.030587591351604243 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.7679324894514767, + "acc_stderr": 0.027479744550808503, + "acc_norm": 0.7679324894514767, + "acc_norm_stderr": 0.027479744550808503 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.6816143497757847, + "acc_stderr": 0.03126580522513713, + "acc_norm": 0.6816143497757847, + "acc_norm_stderr": 0.03126580522513713 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.6412213740458015, + "acc_stderr": 0.04206739313864908, + "acc_norm": 0.6412213740458015, + "acc_norm_stderr": 0.04206739313864908 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.7520661157024794, + "acc_stderr": 0.03941897526516303, + "acc_norm": 0.7520661157024794, + "acc_norm_stderr": 0.03941897526516303 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.7222222222222222, + "acc_stderr": 0.043300437496507416, + "acc_norm": 0.7222222222222222, + "acc_norm_stderr": 0.043300437496507416 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.6871165644171779, + "acc_stderr": 0.036429145782924055, + "acc_norm": 0.6871165644171779, + "acc_norm_stderr": 0.036429145782924055 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.375, + "acc_stderr": 0.04595091388086298, + "acc_norm": 0.375, + "acc_norm_stderr": 0.04595091388086298 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.6990291262135923, + "acc_stderr": 0.045416094465039476, + "acc_norm": 0.6990291262135923, + "acc_norm_stderr": 0.045416094465039476 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.8333333333333334, + "acc_stderr": 0.024414947304543678, + "acc_norm": 0.8333333333333334, + "acc_norm_stderr": 
0.024414947304543678 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.55, + "acc_stderr": 0.04999999999999999, + "acc_norm": 0.55, + "acc_norm_stderr": 0.04999999999999999 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.7611749680715197, + "acc_stderr": 0.015246803197398675, + "acc_norm": 0.7611749680715197, + "acc_norm_stderr": 0.015246803197398675 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.6416184971098265, + "acc_stderr": 0.025816756791584194, + "acc_norm": 0.6416184971098265, + "acc_norm_stderr": 0.025816756791584194 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.488268156424581, + "acc_stderr": 0.016717897676932162, + "acc_norm": 0.488268156424581, + "acc_norm_stderr": 0.016717897676932162 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.6111111111111112, + "acc_stderr": 0.027914055510467998, + "acc_norm": 0.6111111111111112, + "acc_norm_stderr": 0.027914055510467998 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.6463022508038585, + "acc_stderr": 0.027155208103200865, + "acc_norm": 0.6463022508038585, + "acc_norm_stderr": 0.027155208103200865 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.6419753086419753, + "acc_stderr": 0.026675611926037106, + "acc_norm": 0.6419753086419753, + "acc_norm_stderr": 0.026675611926037106 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.41843971631205673, + "acc_stderr": 0.02942799403941999, + "acc_norm": 0.41843971631205673, + "acc_norm_stderr": 0.02942799403941999 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.44002607561929596, + "acc_stderr": 0.012678037478574513, + "acc_norm": 0.44002607561929596, + "acc_norm_stderr": 0.012678037478574513 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.5294117647058824, + "acc_stderr": 0.03032024326500413, + "acc_norm": 0.5294117647058824, + "acc_norm_stderr": 0.03032024326500413 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.5751633986928104, + "acc_stderr": 0.019997973035458333, + "acc_norm": 0.5751633986928104, + "acc_norm_stderr": 0.019997973035458333 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.6363636363636364, + "acc_stderr": 0.046075820907199756, + "acc_norm": 0.6363636363636364, + "acc_norm_stderr": 0.046075820907199756 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.6326530612244898, + "acc_stderr": 0.03086214492108756, + "acc_norm": 0.6326530612244898, + "acc_norm_stderr": 0.03086214492108756 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.7661691542288557, + "acc_stderr": 0.029929415408348384, + "acc_norm": 0.7661691542288557, + "acc_norm_stderr": 0.029929415408348384 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.83, + "acc_stderr": 0.03775251680686371, + "acc_norm": 0.83, + "acc_norm_stderr": 0.03775251680686371 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.46987951807228917, + "acc_stderr": 0.03885425420866766, + "acc_norm": 0.46987951807228917, + "acc_norm_stderr": 0.03885425420866766 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.7894736842105263, + "acc_stderr": 0.031267817146631786, + "acc_norm": 0.7894736842105263, + "acc_norm_stderr": 0.031267817146631786 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.37821297429620565, + "mc1_stderr": 0.01697633590754687, + "mc2": 0.535496493693775, + "mc2_stderr": 0.015937525418247476 + }, + "harness|winogrande|5": { + "acc": 0.744277821625888, + "acc_stderr": 0.012261253845440474 + }, + 
"harness|gsm8k|5": { + "acc": 0.22971948445792267, + "acc_stderr": 0.011586857544997501 + }, + "all": { + "acc": 0.5641085013010667, + "acc_stderr": 0.0335879510752552, + "acc_norm": 0.570142814951906, + "acc_norm_stderr": 0.03430315611658459, + "mc1": 0.37821297429620565, + "mc1_stderr": 0.01697633590754687, + "mc2": 0.535496493693775, + "mc2_stderr": 0.015937525418247476 + } + }, + "versions": { + "all": 0, + "harness|arc:challenge|25": 0, + "harness|gsm8k|5": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "harness|winogrande|5": 0 + }, + "config_tasks": { + "harness|arc:challenge": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|hellaswag": 
"LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM 
Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|arc:challenge|25": { + "hashes": { + "hash_examples": "17b0cae357c0259e", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "ca48d52265c0051f", + "hash_cont_tokens": "e8abf848493b50f7" + }, + "truncated": 0, + "non_truncated": 1172, + "padded": 4687, + "non_padded": 0, + "effective_few_shots": 25.0, + "num_truncated_few_shots": 0 + }, + "harness|hellaswag|10": { + "hashes": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "4975ded0ed31f702", + "hash_cont_tokens": "9fe0a5c42e1532db" + }, + "truncated": 0, + "non_truncated": 10042, + "padded": 40019, + "non_padded": 149, + "effective_few_shots": 10.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hashes": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "8ff523ec326d5d55", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-anatomy|5": { + "hashes": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "742bd6a389a8ef40", + "hash_cont_tokens": "f11971a765cb609f" + }, + "truncated": 0, + "non_truncated": 135, + "padded": 540, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-astronomy|5": { + "hashes": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "aa9743839c83bd9f", + "hash_cont_tokens": "440a970fadecdc7b" + }, + "truncated": 0, + "non_truncated": 152, + "padded": 608, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-business_ethics|5": { + "hashes": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "60f6ed52e2a2987a", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hashes": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "6080d9f3c5930be0", + "hash_cont_tokens": "7ecd60c25b9bfe5b" + }, + "truncated": 0, + "non_truncated": 265, + "padded": 1060, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_biology|5": { + "hashes": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "873319724ad65589", + "hash_cont_tokens": "875cde3af7a0ee14" + }, + "truncated": 0, + "non_truncated": 144, + "padded": 564, + "non_padded": 12, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_chemistry|5": { + "hashes": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "8366d04d12b154a7", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + 
"effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_computer_science|5": { + "hashes": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "1724a282fb269fd7", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_mathematics|5": { + "hashes": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "b7aa815781eae172", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_medicine|5": { + "hashes": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "0003d13e86bc8c1a", + "hash_cont_tokens": "702fb6d82ff0d6ac" + }, + "truncated": 0, + "non_truncated": 173, + "padded": 692, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_physics|5": { + "hashes": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "32b28762dd077c78", + "hash_cont_tokens": "f7b8097afc16a47c" + }, + "truncated": 0, + "non_truncated": 102, + "padded": 404, + "non_padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-computer_security|5": { + "hashes": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "19dd0e1895125d49", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hashes": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "761c7ce187b3338a", + "hash_cont_tokens": "aa0e8bc655f2f641" + }, + "truncated": 0, + "non_truncated": 235, + "padded": 940, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-econometrics|5": { + "hashes": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "dae74024ebc12b2b", + "hash_cont_tokens": "b1cc6e7e9fcd3827" + }, + "truncated": 0, + "non_truncated": 114, + "padded": 456, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hashes": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "5fa8050688a246ed", + "hash_cont_tokens": "2425a3f084a591ef" + }, + "truncated": 0, + "non_truncated": 145, + "padded": 580, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hashes": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "2da3f8d7d1515cc6", + "hash_cont_tokens": "bd87bf0c060fd925" + }, + "truncated": 0, + "non_truncated": 378, + "padded": 1512, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-formal_logic|5": { + "hashes": { + 
"hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "907de61bbe46dada", + "hash_cont_tokens": "eb8932890e0605db" + }, + "truncated": 0, + "non_truncated": 126, + "padded": 504, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-global_facts|5": { + "hashes": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "d7549fe9ac133643", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_biology|5": { + "hashes": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "b449ae8cd622fb96", + "hash_cont_tokens": "1ddcb86d28cde266" + }, + "truncated": 0, + "non_truncated": 310, + "padded": 1240, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hashes": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "a447bd1574b5e26c", + "hash_cont_tokens": "176c8dcff38c5f8f" + }, + "truncated": 0, + "non_truncated": 203, + "padded": 812, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hashes": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "56312a0c3d85ae90", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hashes": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "5002f4ac8b1562ca", + "hash_cont_tokens": "674fc454bdc5ac93" + }, + "truncated": 0, + "non_truncated": 165, + "padded": 656, + "non_padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_geography|5": { + "hashes": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "b4f9efd054b0149d", + "hash_cont_tokens": "03a5012b916274ea" + }, + "truncated": 0, + "non_truncated": 198, + "padded": 792, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hashes": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "6e010d01707b5a01", + "hash_cont_tokens": "873d2aab226ba1d8" + }, + "truncated": 0, + "non_truncated": 193, + "padded": 772, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hashes": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "fc1f6e824ba386d7", + "hash_cont_tokens": "c583432ad27fcfe0" + }, + "truncated": 0, + "non_truncated": 390, + "padded": 1560, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hashes": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + 
"hash_input_tokens": "3a485a40c8432ece", + "hash_cont_tokens": "d7907b61bcb8c123" + }, + "truncated": 0, + "non_truncated": 270, + "padded": 1080, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hashes": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "a7dd9ca4bbda3752", + "hash_cont_tokens": "f47f041de50333b9" + }, + "truncated": 0, + "non_truncated": 238, + "padded": 952, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_physics|5": { + "hashes": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "d7ea631399a73865", + "hash_cont_tokens": "0d56317b3e5eedb5" + }, + "truncated": 0, + "non_truncated": 151, + "padded": 604, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hashes": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "d12816cf88146011", + "hash_cont_tokens": "09ba1243e7390c0f" + }, + "truncated": 0, + "non_truncated": 545, + "padded": 2180, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hashes": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "9763ecaef4814c21", + "hash_cont_tokens": "9cc29889c3d3f77d" + }, + "truncated": 0, + "non_truncated": 216, + "padded": 864, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hashes": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "c639cce12a46ebad", + "hash_cont_tokens": "cdd0b3dc06d933e5" + }, + "truncated": 0, + "non_truncated": 204, + "padded": 816, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hashes": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "b9762065cce6f3a6", + "hash_cont_tokens": "e02816433ff28daf" + }, + "truncated": 0, + "non_truncated": 237, + "padded": 948, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_aging|5": { + "hashes": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "84157fee0b6d0f3c", + "hash_cont_tokens": "142a4a8a1138a214" + }, + "truncated": 0, + "non_truncated": 223, + "padded": 892, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_sexuality|5": { + "hashes": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "ade303e1ae3c016f", + "hash_cont_tokens": "bc54813e809b796d" + }, + "truncated": 0, + "non_truncated": 131, + "padded": 524, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-international_law|5": { + "hashes": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "e5482e1c23c23d35", + "hash_cont_tokens": "8ea8c5ff76a15bca" + }, + "truncated": 0, + "non_truncated": 
121, + "padded": 484, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-jurisprudence|5": { + "hashes": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "4415eeb9bad0507b", + "hash_cont_tokens": "e3a8cd951b6e3469" + }, + "truncated": 0, + "non_truncated": 108, + "padded": 432, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hashes": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "e6b5271422ecbaa8", + "hash_cont_tokens": "3e9e0bdc248fd88a" + }, + "truncated": 0, + "non_truncated": 163, + "padded": 644, + "non_padded": 8, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-machine_learning|5": { + "hashes": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "e719cb83196977d8", + "hash_cont_tokens": "55b12fb138c6a064" + }, + "truncated": 0, + "non_truncated": 112, + "padded": 448, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-management|5": { + "hashes": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "155da0e62b39e804", + "hash_cont_tokens": "a01d6d39a83c4597" + }, + "truncated": 0, + "non_truncated": 103, + "padded": 412, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-marketing|5": { + "hashes": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "38466c242259e6d3", + "hash_cont_tokens": "6aeaed4d823c98aa" + }, + "truncated": 0, + "non_truncated": 234, + "padded": 932, + "non_padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-medical_genetics|5": { + "hashes": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "0dd129e92538a7f6", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-miscellaneous|5": { + "hashes": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "d108a883fc3e022f", + "hash_cont_tokens": "9b0ab02a64603081" + }, + "truncated": 0, + "non_truncated": 783, + "padded": 3132, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_disputes|5": { + "hashes": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "0e7b7df82884a2d5", + "hash_cont_tokens": "3b8bbe9108e55ce9" + }, + "truncated": 0, + "non_truncated": 346, + "padded": 1364, + "non_padded": 20, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hashes": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "7c220f5613cd8426", + "hash_cont_tokens": "3e9bfc0362e97330" + }, + "truncated": 0, + "non_truncated": 895, + "padded": 3580, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-nutrition|5": { + "hashes": { + 
"hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "35de1609a9a763a9", + "hash_cont_tokens": "23b2dc6ee2da4cfc" + }, + "truncated": 0, + "non_truncated": 306, + "padded": 1224, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-philosophy|5": { + "hashes": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "a1dcfa9c80490d06", + "hash_cont_tokens": "9f6ff69d23a48783" + }, + "truncated": 0, + "non_truncated": 311, + "padded": 1244, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-prehistory|5": { + "hashes": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "a091cf645d2415e0", + "hash_cont_tokens": "d6458d743d875837" + }, + "truncated": 0, + "non_truncated": 324, + "padded": 1296, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_accounting|5": { + "hashes": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "e9df32a33f85290c", + "hash_cont_tokens": "922a195f53a35662" + }, + "truncated": 0, + "non_truncated": 282, + "padded": 1128, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_law|5": { + "hashes": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "c9f7583fff66d361", + "hash_cont_tokens": "2e590029ef41fbcd" + }, + "truncated": 0, + "non_truncated": 1534, + "padded": 6136, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_medicine|5": { + "hashes": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "40a933f829116f8d", + "hash_cont_tokens": "7cfee54dbddd5a98" + }, + "truncated": 0, + "non_truncated": 272, + "padded": 1088, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_psychology|5": { + "hashes": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "0f6a92c3a2062b48", + "hash_cont_tokens": "a86677b2a45c20e1" + }, + "truncated": 0, + "non_truncated": 612, + "padded": 2448, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-public_relations|5": { + "hashes": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "29a08e9bfbe9b2f0", + "hash_cont_tokens": "0d756ccaae031757" + }, + "truncated": 0, + "non_truncated": 110, + "padded": 440, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-security_studies|5": { + "hashes": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "32a03f1f22a6e103", + "hash_cont_tokens": "b2229bc2cfbf594b" + }, + "truncated": 0, + "non_truncated": 245, + "padded": 980, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-sociology|5": { + "hashes": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "1de5c52d2b2831d7", + "hash_cont_tokens": 
"c3a3bdfd177eed5b" + }, + "truncated": 0, + "non_truncated": 201, + "padded": 800, + "non_padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hashes": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "add924961f7f4146", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-virology|5": { + "hashes": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "e0653601c466b1bc", + "hash_cont_tokens": "af8b3658088cb37f" + }, + "truncated": 0, + "non_truncated": 166, + "padded": 664, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-world_religions|5": { + "hashes": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "ac600d612445156d", + "hash_cont_tokens": "060118bef6de4e0a" + }, + "truncated": 0, + "non_truncated": 171, + "padded": 684, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|truthfulqa:mc|0": { + "hashes": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "a03ce28b7fd06aa7", + "hash_cont_tokens": "f5da56a132aab151" + }, + "truncated": 0, + "non_truncated": 817, + "padded": 9996, + "non_padded": 0, + "effective_few_shots": 0.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "72067255e368e24e", + "hash_cont_tokens": "f08975ad6f2d5864" + }, + "truncated": 0, + "non_truncated": 1267, + "padded": 2534, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "bda342e47b5099b2", + "hash_cont_tokens": "1bb12cc1fc18106a" + }, + "truncated": 0, + "non_truncated": 1319, + "padded": 0, + "non_padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "3b7fa57a057f9415", + "hash_full_prompts": "63615fc50fc9417c", + "hash_input_tokens": "a8fa53915153e1db", + "hash_cont_tokens": "ccd798e74e305913" + }, + "truncated": 0, + "non_truncated": 28659, + "padded": 113348, + "non_padded": 1524, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/Undi95/llama2-to-mistral-diff/results_2023-10-10T12-55-48.397880.json b/eval-results/Undi95/llama2-to-mistral-diff/results_2023-10-10T12-55-48.397880.json new file mode 100644 index 0000000000000000000000000000000000000000..2e9c283ec35e85656384adc4b58477ab2cb97e7a --- /dev/null +++ b/eval-results/Undi95/llama2-to-mistral-diff/results_2023-10-10T12-55-48.397880.json @@ -0,0 +1,1367 @@ +{ + "config_general": { + "model_name": "undi95/llama2-to-mistral-diff", + "model_sha": "16c279c5e7d12b8a6ff7771881808ef253a406b9", + "model_size": "12.61 GB", + "model_dtype": "torch.bfloat16", + "lighteval_sha": "0f318ecf002208468154899217b3ba7c6ae09374", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + 
"harness|arc:challenge|25": { + "acc": 0.49402730375426623, + "acc_stderr": 0.014610348300255793, + "acc_norm": 0.5341296928327645, + "acc_norm_stderr": 0.014577311315231102 + }, + "harness|hellaswag|10": { + "acc": 0.5888269269069907, + "acc_stderr": 0.004910409150135491, + "acc_norm": 0.7856004779924318, + "acc_norm_stderr": 0.004095663731959219 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.32, + "acc_stderr": 0.046882617226215034, + "acc_norm": 0.32, + "acc_norm_stderr": 0.046882617226215034 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.45185185185185184, + "acc_stderr": 0.04299268905480864, + "acc_norm": 0.45185185185185184, + "acc_norm_stderr": 0.04299268905480864 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.40131578947368424, + "acc_stderr": 0.03988903703336284, + "acc_norm": 0.40131578947368424, + "acc_norm_stderr": 0.03988903703336284 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.51, + "acc_stderr": 0.05024183937956912, + "acc_norm": 0.51, + "acc_norm_stderr": 0.05024183937956912 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.4641509433962264, + "acc_stderr": 0.030693675018458003, + "acc_norm": 0.4641509433962264, + "acc_norm_stderr": 0.030693675018458003 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.4513888888888889, + "acc_stderr": 0.04161402398403279, + "acc_norm": 0.4513888888888889, + "acc_norm_stderr": 0.04161402398403279 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.35, + "acc_stderr": 0.0479372485441102, + "acc_norm": 0.35, + "acc_norm_stderr": 0.0479372485441102 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.34, + "acc_stderr": 0.04760952285695235, + "acc_norm": 0.34, + "acc_norm_stderr": 0.04760952285695235 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.29, + "acc_stderr": 0.045604802157206845, + "acc_norm": 0.29, + "acc_norm_stderr": 0.045604802157206845 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.4161849710982659, + "acc_stderr": 0.03758517775404948, + "acc_norm": 0.4161849710982659, + "acc_norm_stderr": 0.03758517775404948 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.23529411764705882, + "acc_stderr": 0.04220773659171453, + "acc_norm": 0.23529411764705882, + "acc_norm_stderr": 0.04220773659171453 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.6, + "acc_stderr": 0.04923659639173309, + "acc_norm": 0.6, + "acc_norm_stderr": 0.04923659639173309 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.4340425531914894, + "acc_stderr": 0.03240038086792747, + "acc_norm": 0.4340425531914894, + "acc_norm_stderr": 0.03240038086792747 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.2894736842105263, + "acc_stderr": 0.04266339443159393, + "acc_norm": 0.2894736842105263, + "acc_norm_stderr": 0.04266339443159393 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.47586206896551725, + "acc_stderr": 0.041618085035015295, + "acc_norm": 0.47586206896551725, + "acc_norm_stderr": 0.041618085035015295 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.26455026455026454, + "acc_stderr": 0.022717467897708624, + "acc_norm": 0.26455026455026454, + "acc_norm_stderr": 0.022717467897708624 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.30158730158730157, + "acc_stderr": 0.04104947269903394, + "acc_norm": 0.30158730158730157, + "acc_norm_stderr": 0.04104947269903394 + }, + "harness|hendrycksTest-global_facts|5": { + 
"acc": 0.31, + "acc_stderr": 0.04648231987117316, + "acc_norm": 0.31, + "acc_norm_stderr": 0.04648231987117316 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.49032258064516127, + "acc_stderr": 0.02843867799890955, + "acc_norm": 0.49032258064516127, + "acc_norm_stderr": 0.02843867799890955 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.35960591133004927, + "acc_stderr": 0.03376458246509566, + "acc_norm": 0.35960591133004927, + "acc_norm_stderr": 0.03376458246509566 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.39, + "acc_stderr": 0.04902071300001974, + "acc_norm": 0.39, + "acc_norm_stderr": 0.04902071300001974 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.6, + "acc_stderr": 0.038254602783800246, + "acc_norm": 0.6, + "acc_norm_stderr": 0.038254602783800246 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.4797979797979798, + "acc_stderr": 0.0355944356556392, + "acc_norm": 0.4797979797979798, + "acc_norm_stderr": 0.0355944356556392 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.6994818652849741, + "acc_stderr": 0.03308818594415751, + "acc_norm": 0.6994818652849741, + "acc_norm_stderr": 0.03308818594415751 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.44871794871794873, + "acc_stderr": 0.025217315184846482, + "acc_norm": 0.44871794871794873, + "acc_norm_stderr": 0.025217315184846482 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.28888888888888886, + "acc_stderr": 0.027634907264178544, + "acc_norm": 0.28888888888888886, + "acc_norm_stderr": 0.027634907264178544 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.42857142857142855, + "acc_stderr": 0.032145368597886394, + "acc_norm": 0.42857142857142855, + "acc_norm_stderr": 0.032145368597886394 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.2980132450331126, + "acc_stderr": 0.03734535676787198, + "acc_norm": 0.2980132450331126, + "acc_norm_stderr": 0.03734535676787198 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.6293577981651376, + "acc_stderr": 0.02070745816435298, + "acc_norm": 0.6293577981651376, + "acc_norm_stderr": 0.02070745816435298 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.26851851851851855, + "acc_stderr": 0.030225226160012393, + "acc_norm": 0.26851851851851855, + "acc_norm_stderr": 0.030225226160012393 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.5490196078431373, + "acc_stderr": 0.03492406104163613, + "acc_norm": 0.5490196078431373, + "acc_norm_stderr": 0.03492406104163613 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.6033755274261603, + "acc_stderr": 0.03184399873811225, + "acc_norm": 0.6033755274261603, + "acc_norm_stderr": 0.03184399873811225 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.5560538116591929, + "acc_stderr": 0.03334625674242728, + "acc_norm": 0.5560538116591929, + "acc_norm_stderr": 0.03334625674242728 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.5419847328244275, + "acc_stderr": 0.04369802690578756, + "acc_norm": 0.5419847328244275, + "acc_norm_stderr": 0.04369802690578756 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.6528925619834711, + "acc_stderr": 0.04345724570292534, + "acc_norm": 0.6528925619834711, + "acc_norm_stderr": 0.04345724570292534 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 
0.5185185185185185, + "acc_stderr": 0.04830366024635331, + "acc_norm": 0.5185185185185185, + "acc_norm_stderr": 0.04830366024635331 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.5276073619631901, + "acc_stderr": 0.0392237829061099, + "acc_norm": 0.5276073619631901, + "acc_norm_stderr": 0.0392237829061099 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.375, + "acc_stderr": 0.04595091388086298, + "acc_norm": 0.375, + "acc_norm_stderr": 0.04595091388086298 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.5533980582524272, + "acc_stderr": 0.04922424153458933, + "acc_norm": 0.5533980582524272, + "acc_norm_stderr": 0.04922424153458933 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.6923076923076923, + "acc_stderr": 0.03023638994217309, + "acc_norm": 0.6923076923076923, + "acc_norm_stderr": 0.03023638994217309 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.54, + "acc_stderr": 0.05009082659620332, + "acc_norm": 0.54, + "acc_norm_stderr": 0.05009082659620332 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.6411238825031929, + "acc_stderr": 0.017152991797501342, + "acc_norm": 0.6411238825031929, + "acc_norm_stderr": 0.017152991797501342 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.49421965317919075, + "acc_stderr": 0.026917296179149116, + "acc_norm": 0.49421965317919075, + "acc_norm_stderr": 0.026917296179149116 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.23798882681564246, + "acc_stderr": 0.014242630070574915, + "acc_norm": 0.23798882681564246, + "acc_norm_stderr": 0.014242630070574915 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.49673202614379086, + "acc_stderr": 0.02862930519400354, + "acc_norm": 0.49673202614379086, + "acc_norm_stderr": 0.02862930519400354 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.6012861736334405, + "acc_stderr": 0.0278093225857745, + "acc_norm": 0.6012861736334405, + "acc_norm_stderr": 0.0278093225857745 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.5, + "acc_stderr": 0.02782074420373286, + "acc_norm": 0.5, + "acc_norm_stderr": 0.02782074420373286 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.3546099290780142, + "acc_stderr": 0.028538650028878638, + "acc_norm": 0.3546099290780142, + "acc_norm_stderr": 0.028538650028878638 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.37222946544980445, + "acc_stderr": 0.012346241297204368, + "acc_norm": 0.37222946544980445, + "acc_norm_stderr": 0.012346241297204368 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.5073529411764706, + "acc_stderr": 0.030369552523902173, + "acc_norm": 0.5073529411764706, + "acc_norm_stderr": 0.030369552523902173 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.4444444444444444, + "acc_stderr": 0.020102583895887184, + "acc_norm": 0.4444444444444444, + "acc_norm_stderr": 0.020102583895887184 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.5363636363636364, + "acc_stderr": 0.04776449162396197, + "acc_norm": 0.5363636363636364, + "acc_norm_stderr": 0.04776449162396197 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.49387755102040815, + "acc_stderr": 0.03200682020163908, + "acc_norm": 0.49387755102040815, + "acc_norm_stderr": 0.03200682020163908 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.6318407960199005, + "acc_stderr": 0.03410410565495301, + "acc_norm": 0.6318407960199005, + "acc_norm_stderr": 0.03410410565495301 + }, + 
"harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.63, + "acc_stderr": 0.04852365870939099, + "acc_norm": 0.63, + "acc_norm_stderr": 0.04852365870939099 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.42771084337349397, + "acc_stderr": 0.038515976837185335, + "acc_norm": 0.42771084337349397, + "acc_norm_stderr": 0.038515976837185335 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.695906432748538, + "acc_stderr": 0.0352821125824523, + "acc_norm": 0.695906432748538, + "acc_norm_stderr": 0.0352821125824523 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.2484700122399021, + "mc1_stderr": 0.01512742709652068, + "mc2": 0.38714596689664715, + "mc2_stderr": 0.013504367947573348 + }, + "all": { + "acc": 0.4668758865288765, + "acc_stderr": 0.03526795867551185, + "acc_norm": 0.47089073297233314, + "acc_norm_stderr": 0.03525358948223725, + "mc1": 0.2484700122399021, + "mc1_stderr": 0.01512742709652068, + "mc2": 0.38714596689664715, + "mc2_stderr": 0.013504367947573348 + } + }, + "versions": { + "harness|arc:challenge|25": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + 
"harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "all": 0 + }, + "config_tasks": { + "harness|arc:challenge": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness 
task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task" + }, + "summary_tasks": { + "harness|arc:challenge|25": { + "hashes": { + "hash_examples": "17b0cae357c0259e", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "3722289b79076c44", + "hash_cont_tokens": "e8abf848493b50f7" + }, + "truncated": 0, + "non-truncated": 4687, + "padded": 4687, + "non-padded": 0, + "effective_few_shots": 25.0, + "num_truncated_few_shots": 0 + }, + "harness|hellaswag|10": { + "hashes": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "ececd684171f1ef2", + "hash_cont_tokens": "9fe0a5c42e1532db" + }, + "truncated": 0, + "non-truncated": 40168, + "padded": 40113, + "non-padded": 55, + "effective_few_shots": 10.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hashes": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "c54ff61ad0273dd7", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-anatomy|5": { + "hashes": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "be31a1e22aef5f90", + "hash_cont_tokens": "f11971a765cb609f" + }, + "truncated": 0, + "non-truncated": 540, + "padded": 540, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-astronomy|5": { + "hashes": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "277a7b1fad566940", + "hash_cont_tokens": "440a970fadecdc7b" + }, + "truncated": 0, + "non-truncated": 608, + "padded": 608, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-business_ethics|5": { + "hashes": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "ba552605bc116de5", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hashes": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "428c7563d0b98ab9", + "hash_cont_tokens": "7ecd60c25b9bfe5b" + }, + "truncated": 0, + "non-truncated": 1060, + "padded": 1060, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_biology|5": { + "hashes": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "da036601573942e2", 
+ "hash_cont_tokens": "875cde3af7a0ee14" + }, + "truncated": 0, + "non-truncated": 576, + "padded": 576, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_chemistry|5": { + "hashes": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "94e0196d6aded13d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_computer_science|5": { + "hashes": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "6e4d0f4a8d36690b", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_mathematics|5": { + "hashes": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "614054d17109a25d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_medicine|5": { + "hashes": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "081bb2b524defd1c", + "hash_cont_tokens": "702fb6d82ff0d6ac" + }, + "truncated": 0, + "non-truncated": 692, + "padded": 692, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_physics|5": { + "hashes": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "5421d9a1af86cbd4", + "hash_cont_tokens": "f7b8097afc16a47c" + }, + "truncated": 0, + "non-truncated": 408, + "padded": 408, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-computer_security|5": { + "hashes": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "5e6b70ecb333cf18", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hashes": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "c2ef11a87264ceed", + "hash_cont_tokens": "aa0e8bc655f2f641" + }, + "truncated": 0, + "non-truncated": 940, + "padded": 940, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-econometrics|5": { + "hashes": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "ecaccd912a4c3978", + "hash_cont_tokens": "b1cc6e7e9fcd3827" + }, + "truncated": 0, + "non-truncated": 456, + "padded": 456, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hashes": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "1590c84291399be8", + "hash_cont_tokens": "2425a3f084a591ef" + }, + "truncated": 0, + "non-truncated": 580, + "padded": 580, + "non-padded": 0, + 
"effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hashes": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "3269597f715b0da1", + "hash_cont_tokens": "bd87bf0c060fd925" + }, + "truncated": 0, + "non-truncated": 1512, + "padded": 1512, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-formal_logic|5": { + "hashes": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "a2800d20f3ab8d7c", + "hash_cont_tokens": "eb8932890e0605db" + }, + "truncated": 0, + "non-truncated": 504, + "padded": 504, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-global_facts|5": { + "hashes": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "94ed44b3772505ad", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_biology|5": { + "hashes": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "24423acb928db768", + "hash_cont_tokens": "1ddcb86d28cde266" + }, + "truncated": 0, + "non-truncated": 1240, + "padded": 1240, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hashes": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "831ff35c474e5cef", + "hash_cont_tokens": "176c8dcff38c5f8f" + }, + "truncated": 0, + "non-truncated": 812, + "padded": 812, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hashes": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "a20a96b44dcc5b30", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hashes": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "5002f4ac8b1562ca", + "hash_cont_tokens": "674fc454bdc5ac93" + }, + "truncated": 0, + "non-truncated": 660, + "padded": 656, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_geography|5": { + "hashes": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "7c5547c7da5bc793", + "hash_cont_tokens": "03a5012b916274ea" + }, + "truncated": 0, + "non-truncated": 792, + "padded": 792, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hashes": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "f62991cb6a496b05", + "hash_cont_tokens": "873d2aab226ba1d8" + }, + "truncated": 0, + "non-truncated": 772, + "padded": 772, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + 
"harness|hendrycksTest-high_school_macroeconomics|5": { + "hashes": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "4cef2aff6e3d59ed", + "hash_cont_tokens": "c583432ad27fcfe0" + }, + "truncated": 0, + "non-truncated": 1560, + "padded": 1560, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hashes": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "6e2577ea4082ed2b", + "hash_cont_tokens": "d7907b61bcb8c123" + }, + "truncated": 0, + "non-truncated": 1080, + "padded": 1080, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hashes": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "c5fc9aeb1079c8e4", + "hash_cont_tokens": "f47f041de50333b9" + }, + "truncated": 0, + "non-truncated": 952, + "padded": 952, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_physics|5": { + "hashes": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "555fc385cffa84ca", + "hash_cont_tokens": "0d56317b3e5eedb5" + }, + "truncated": 0, + "non-truncated": 604, + "padded": 604, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hashes": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "febd23cbf9973b7f", + "hash_cont_tokens": "09ba1243e7390c0f" + }, + "truncated": 0, + "non-truncated": 2180, + "padded": 2180, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hashes": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "400e55b56ee6fbd7", + "hash_cont_tokens": "9cc29889c3d3f77d" + }, + "truncated": 0, + "non-truncated": 864, + "padded": 864, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hashes": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "c639cce12a46ebad", + "hash_cont_tokens": "cdd0b3dc06d933e5" + }, + "truncated": 0, + "non-truncated": 816, + "padded": 816, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hashes": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "b9762065cce6f3a6", + "hash_cont_tokens": "e02816433ff28daf" + }, + "truncated": 0, + "non-truncated": 948, + "padded": 948, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_aging|5": { + "hashes": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "541a75f071dcf579", + "hash_cont_tokens": "142a4a8a1138a214" + }, + "truncated": 0, + "non-truncated": 892, + "padded": 892, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_sexuality|5": { + "hashes": { + "hash_examples": "7acb8fdad97f88a6", + 
"hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "04269e5c5a257dd9", + "hash_cont_tokens": "bc54813e809b796d" + }, + "truncated": 0, + "non-truncated": 524, + "padded": 524, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-international_law|5": { + "hashes": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "d93ba9d9d38e4397", + "hash_cont_tokens": "8ea8c5ff76a15bca" + }, + "truncated": 0, + "non-truncated": 484, + "padded": 484, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-jurisprudence|5": { + "hashes": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "9eeaccd2698b4f5a", + "hash_cont_tokens": "e3a8cd951b6e3469" + }, + "truncated": 0, + "non-truncated": 432, + "padded": 432, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hashes": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "b4f08f544f2b7576", + "hash_cont_tokens": "3e9e0bdc248fd88a" + }, + "truncated": 0, + "non-truncated": 652, + "padded": 648, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-machine_learning|5": { + "hashes": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "900c2a51f1174b9f", + "hash_cont_tokens": "55b12fb138c6a064" + }, + "truncated": 0, + "non-truncated": 448, + "padded": 448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-management|5": { + "hashes": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "6b36efb4689c6eca", + "hash_cont_tokens": "a01d6d39a83c4597" + }, + "truncated": 0, + "non-truncated": 412, + "padded": 412, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-marketing|5": { + "hashes": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "2aaac78a0cfed47a", + "hash_cont_tokens": "6aeaed4d823c98aa" + }, + "truncated": 0, + "non-truncated": 936, + "padded": 936, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-medical_genetics|5": { + "hashes": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "886ca823b41c094a", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-miscellaneous|5": { + "hashes": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "72fd71de7675e7d0", + "hash_cont_tokens": "9b0ab02a64603081" + }, + "truncated": 0, + "non-truncated": 3132, + "padded": 3132, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_disputes|5": { + "hashes": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "f3ca0dd8e7a1eb09", + "hash_cont_tokens": "3b8bbe9108e55ce9" + }, + "truncated": 0, + "non-truncated": 1384, + 
"padded": 1354, + "non-padded": 30, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hashes": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "3e793631e951f23c", + "hash_cont_tokens": "3e9bfc0362e97330" + }, + "truncated": 0, + "non-truncated": 3580, + "padded": 3580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-nutrition|5": { + "hashes": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "59753c2144ea93af", + "hash_cont_tokens": "23b2dc6ee2da4cfc" + }, + "truncated": 0, + "non-truncated": 1224, + "padded": 1224, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-philosophy|5": { + "hashes": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "bd8d3dbed15a8c34", + "hash_cont_tokens": "9f6ff69d23a48783" + }, + "truncated": 0, + "non-truncated": 1244, + "padded": 1244, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-prehistory|5": { + "hashes": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "3573cd87facbb7c5", + "hash_cont_tokens": "d6458d743d875837" + }, + "truncated": 0, + "non-truncated": 1296, + "padded": 1296, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_accounting|5": { + "hashes": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "17e721bc1a7cbb47", + "hash_cont_tokens": "922a195f53a35662" + }, + "truncated": 0, + "non-truncated": 1128, + "padded": 1128, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_law|5": { + "hashes": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "c9f7583fff66d361", + "hash_cont_tokens": "2e590029ef41fbcd" + }, + "truncated": 0, + "non-truncated": 6136, + "padded": 6136, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_medicine|5": { + "hashes": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "40a933f829116f8d", + "hash_cont_tokens": "7cfee54dbddd5a98" + }, + "truncated": 0, + "non-truncated": 1088, + "padded": 1088, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_psychology|5": { + "hashes": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "0dfb73a8eb3f692c", + "hash_cont_tokens": "a86677b2a45c20e1" + }, + "truncated": 0, + "non-truncated": 2448, + "padded": 2448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-public_relations|5": { + "hashes": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "1710c6ba4c9f3cbd", + "hash_cont_tokens": "0d756ccaae031757" + }, + "truncated": 0, + "non-truncated": 440, + "padded": 440, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + 
"harness|hendrycksTest-security_studies|5": { + "hashes": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "32a03f1f22a6e103", + "hash_cont_tokens": "b2229bc2cfbf594b" + }, + "truncated": 0, + "non-truncated": 980, + "padded": 980, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-sociology|5": { + "hashes": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "828999f7624cbe7e", + "hash_cont_tokens": "c3a3bdfd177eed5b" + }, + "truncated": 0, + "non-truncated": 804, + "padded": 804, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hashes": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "42054621e718dbee", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-virology|5": { + "hashes": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "6c4f0aa4dc859c04", + "hash_cont_tokens": "af8b3658088cb37f" + }, + "truncated": 0, + "non-truncated": 664, + "padded": 664, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-world_religions|5": { + "hashes": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "6c75d44e092ff24f", + "hash_cont_tokens": "060118bef6de4e0a" + }, + "truncated": 0, + "non-truncated": 684, + "padded": 684, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|truthfulqa:mc|0": { + "hashes": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "2738d7ed7075faa7", + "hash_cont_tokens": "f5da56a132aab151" + }, + "truncated": 0, + "non-truncated": 9996, + "padded": 9996, + "non-padded": 0, + "effective_few_shots": 0.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "d84d18e9a963753d", + "hash_full_prompts": "12b540783521a8e6", + "hash_input_tokens": "5c73a7dce6ccf737", + "hash_cont_tokens": "71d56183130fecbd" + }, + "total_evaluation_time_secondes": "8276.640435218811", + "truncated": 0, + "non-truncated": 111019, + "padded": 110926, + "non-padded": 93, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/Undi95/llama2-to-mistral-diff/results_2023-10-24T07-59-15.869817.json b/eval-results/Undi95/llama2-to-mistral-diff/results_2023-10-24T07-59-15.869817.json new file mode 100644 index 0000000000000000000000000000000000000000..1597937eb58bd0ec3f495f1ef8e92f5c0104f954 --- /dev/null +++ b/eval-results/Undi95/llama2-to-mistral-diff/results_2023-10-24T07-59-15.869817.json @@ -0,0 +1,107 @@ +{ + "config_general": { + "model_name": "undi95/llama2-to-mistral-diff", + "model_sha": "16c279c5e7d12b8a6ff7771881808ef253a406b9", + "model_size": "12.61 GB", + "model_dtype": "torch.bfloat16", + "lighteval_sha": "0f318ecf002208468154899217b3ba7c6ae09374", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|drop|3": { + "em": 0.001153523489932886, + "em_stderr": 0.00034761798968571027, + 
"f1": 0.05605494966442959, + "f1_stderr": 0.0013169501309663063 + }, + "harness|gsm8k|5": { + "acc": 0.07505686125852919, + "acc_stderr": 0.007257633145486643 + }, + "harness|winogrande|5": { + "acc": 0.7403314917127072, + "acc_stderr": 0.012322700705552667 + }, + "all": { + "em": 0.001153523489932886, + "em_stderr": 0.00034761798968571027, + "f1": 0.05605494966442959, + "f1_stderr": 0.0013169501309663063, + "acc": 0.4076941764856182, + "acc_stderr": 0.009790166925519655 + } + }, + "versions": { + "harness|drop|3": 1, + "harness|gsm8k|5": 0, + "harness|winogrande|5": 0, + "all": 0 + }, + "config_tasks": { + "harness|drop": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|drop|3": { + "hashes": { + "hash_examples": "1d27416e8324e9a3", + "hash_full_prompts": "a5513ff9a741b385", + "hash_input_tokens": "42076f0efbb50aa6", + "hash_cont_tokens": "0ed35b9d228b63b4" + }, + "truncated": 3, + "non-truncated": 9533, + "padded": 0, + "non-padded": 9536, + "effective_few_shots": 3.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "bda342e47b5099b2", + "hash_cont_tokens": "dc5f6bcad3229164" + }, + "truncated": 0, + "non-truncated": 1319, + "padded": 0, + "non-padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "c0bedf98cb040854", + "hash_cont_tokens": "f08975ad6f2d5864" + }, + "truncated": 0, + "non-truncated": 2534, + "padded": 2432, + "non-padded": 102, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "9b4d8993161e637d", + "hash_full_prompts": "08215e527b7e60a5", + "hash_input_tokens": "a12f3e3c934bd78b", + "hash_cont_tokens": "777f4953bd865b2f" + }, + "total_evaluation_time_secondes": "10273.64843583107", + "truncated": 3, + "non-truncated": 13386, + "padded": 2432, + "non-padded": 10957, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/Undi95/llama2-to-mistral-diff/results_2023-10-25T09-37-53.083823.json b/eval-results/Undi95/llama2-to-mistral-diff/results_2023-10-25T09-37-53.083823.json new file mode 100644 index 0000000000000000000000000000000000000000..b66f5dd06b257b6f907af1888bac3208607bf5a5 --- /dev/null +++ b/eval-results/Undi95/llama2-to-mistral-diff/results_2023-10-25T09-37-53.083823.json @@ -0,0 +1,107 @@ +{ + "config_general": { + "model_name": "undi95/llama2-to-mistral-diff", + "model_sha": "16c279c5e7d12b8a6ff7771881808ef253a406b9", + "model_size": "12.61 GB", + "model_dtype": "torch.bfloat16", + "lighteval_sha": "0f318ecf002208468154899217b3ba7c6ae09374", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|drop|3": { + "em": 0.001153523489932886, + "em_stderr": 0.00034761798968571027, + "f1": 0.05605494966442959, + "f1_stderr": 0.0013169501309663063 + }, + "harness|gsm8k|5": { + "acc": 0.07505686125852919, + "acc_stderr": 0.007257633145486643 + }, + "harness|winogrande|5": { + "acc": 0.7403314917127072, + "acc_stderr": 0.012322700705552667 + }, + "all": { + "em": 0.001153523489932886, + "em_stderr": 0.00034761798968571027, + "f1": 0.05605494966442959, + "f1_stderr": 
0.0013169501309663063, + "acc": 0.4076941764856182, + "acc_stderr": 0.009790166925519655 + } + }, + "versions": { + "harness|drop|3": 1, + "harness|gsm8k|5": 0, + "harness|winogrande|5": 0, + "all": 0 + }, + "config_tasks": { + "harness|drop": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|drop|3": { + "hashes": { + "hash_examples": "1d27416e8324e9a3", + "hash_full_prompts": "a5513ff9a741b385", + "hash_input_tokens": "42076f0efbb50aa6", + "hash_cont_tokens": "0ed35b9d228b63b4" + }, + "truncated": 3, + "non-truncated": 9533, + "padded": 0, + "non-padded": 9536, + "effective_few_shots": 3.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "bda342e47b5099b2", + "hash_cont_tokens": "dc5f6bcad3229164" + }, + "truncated": 0, + "non-truncated": 1319, + "padded": 0, + "non-padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "c0bedf98cb040854", + "hash_cont_tokens": "f08975ad6f2d5864" + }, + "truncated": 0, + "non-truncated": 2534, + "padded": 2432, + "non-padded": 102, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "9b4d8993161e637d", + "hash_full_prompts": "08215e527b7e60a5", + "hash_input_tokens": "a12f3e3c934bd78b", + "hash_cont_tokens": "777f4953bd865b2f" + }, + "total_evaluation_time_secondes": "10305.683577537537", + "truncated": 3, + "non-truncated": 13386, + "padded": 2432, + "non-padded": 10957, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/WhoTookMyAmogusNickname/NewHope_HF_not_official/results_2023-08-22T14-04-45.383046.json b/eval-results/WhoTookMyAmogusNickname/NewHope_HF_not_official/results_2023-08-22T14-04-45.383046.json new file mode 100644 index 0000000000000000000000000000000000000000..885c8057035c48df4a91f2afd8eba8729d4b032b --- /dev/null +++ b/eval-results/WhoTookMyAmogusNickname/NewHope_HF_not_official/results_2023-08-22T14-04-45.383046.json @@ -0,0 +1,1365 @@ +{ + "results": { + "harness|arc:challenge|25": { + "acc": 0.575938566552901, + "acc_stderr": 0.014441889627464394, + "acc_norm": 0.6109215017064846, + "acc_norm_stderr": 0.014247309976045607 + }, + "harness|hellaswag|10": { + "acc": 0.6371240788687512, + "acc_stderr": 0.004798467983635773, + "acc_norm": 0.8402708623780123, + "acc_norm_stderr": 0.0036560593900501147 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.36, + "acc_stderr": 0.04824181513244218, + "acc_norm": 0.36, + "acc_norm_stderr": 0.04824181513244218 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.43703703703703706, + "acc_stderr": 0.04284958639753399, + "acc_norm": 0.43703703703703706, + "acc_norm_stderr": 0.04284958639753399 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.5526315789473685, + "acc_stderr": 0.04046336883978251, + "acc_norm": 0.5526315789473685, + "acc_norm_stderr": 0.04046336883978251 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.54, + "acc_stderr": 0.05009082659620332, + "acc_norm": 0.54, + "acc_norm_stderr": 0.05009082659620332 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.5924528301886792, + "acc_stderr": 0.030242233800854494, + "acc_norm": 0.5924528301886792, 
+ "acc_norm_stderr": 0.030242233800854494 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.5625, + "acc_stderr": 0.04148415739394154, + "acc_norm": 0.5625, + "acc_norm_stderr": 0.04148415739394154 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.42, + "acc_stderr": 0.049604496374885836, + "acc_norm": 0.42, + "acc_norm_stderr": 0.049604496374885836 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.46, + "acc_stderr": 0.05009082659620332, + "acc_norm": 0.46, + "acc_norm_stderr": 0.05009082659620332 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.36, + "acc_stderr": 0.04824181513244218, + "acc_norm": 0.36, + "acc_norm_stderr": 0.04824181513244218 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.5260115606936416, + "acc_stderr": 0.038073017265045125, + "acc_norm": 0.5260115606936416, + "acc_norm_stderr": 0.038073017265045125 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.2647058823529412, + "acc_stderr": 0.043898699568087764, + "acc_norm": 0.2647058823529412, + "acc_norm_stderr": 0.043898699568087764 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.67, + "acc_stderr": 0.047258156262526094, + "acc_norm": 0.67, + "acc_norm_stderr": 0.047258156262526094 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.46382978723404256, + "acc_stderr": 0.032600385118357715, + "acc_norm": 0.46382978723404256, + "acc_norm_stderr": 0.032600385118357715 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.24561403508771928, + "acc_stderr": 0.04049339297748141, + "acc_norm": 0.24561403508771928, + "acc_norm_stderr": 0.04049339297748141 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.46206896551724136, + "acc_stderr": 0.04154659671707546, + "acc_norm": 0.46206896551724136, + "acc_norm_stderr": 0.04154659671707546 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.32275132275132273, + "acc_stderr": 0.024078943243597016, + "acc_norm": 0.32275132275132273, + "acc_norm_stderr": 0.024078943243597016 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.373015873015873, + "acc_stderr": 0.04325506042017086, + "acc_norm": 0.373015873015873, + "acc_norm_stderr": 0.04325506042017086 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.37, + "acc_stderr": 0.048523658709391, + "acc_norm": 0.37, + "acc_norm_stderr": 0.048523658709391 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.7064516129032258, + "acc_stderr": 0.0259060870213193, + "acc_norm": 0.7064516129032258, + "acc_norm_stderr": 0.0259060870213193 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.4088669950738916, + "acc_stderr": 0.034590588158832314, + "acc_norm": 0.4088669950738916, + "acc_norm_stderr": 0.034590588158832314 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.54, + "acc_stderr": 0.05009082659620332, + "acc_norm": 0.54, + "acc_norm_stderr": 0.05009082659620332 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.6606060606060606, + "acc_stderr": 0.03697442205031595, + "acc_norm": 0.6606060606060606, + "acc_norm_stderr": 0.03697442205031595 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.702020202020202, + "acc_stderr": 0.03258630383836556, + "acc_norm": 0.702020202020202, + "acc_norm_stderr": 0.03258630383836556 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.8341968911917098, + "acc_stderr": 0.026839845022314415, + 
"acc_norm": 0.8341968911917098, + "acc_norm_stderr": 0.026839845022314415 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.5307692307692308, + "acc_stderr": 0.025302958890850154, + "acc_norm": 0.5307692307692308, + "acc_norm_stderr": 0.025302958890850154 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.3148148148148148, + "acc_stderr": 0.028317533496066468, + "acc_norm": 0.3148148148148148, + "acc_norm_stderr": 0.028317533496066468 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.6176470588235294, + "acc_stderr": 0.03156663099215416, + "acc_norm": 0.6176470588235294, + "acc_norm_stderr": 0.03156663099215416 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.304635761589404, + "acc_stderr": 0.037579499229433426, + "acc_norm": 0.304635761589404, + "acc_norm_stderr": 0.037579499229433426 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.7247706422018348, + "acc_stderr": 0.019149093743155196, + "acc_norm": 0.7247706422018348, + "acc_norm_stderr": 0.019149093743155196 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.46296296296296297, + "acc_stderr": 0.03400603625538271, + "acc_norm": 0.46296296296296297, + "acc_norm_stderr": 0.03400603625538271 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.7696078431372549, + "acc_stderr": 0.02955429260569506, + "acc_norm": 0.7696078431372549, + "acc_norm_stderr": 0.02955429260569506 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.7341772151898734, + "acc_stderr": 0.028756799629658342, + "acc_norm": 0.7341772151898734, + "acc_norm_stderr": 0.028756799629658342 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.6591928251121076, + "acc_stderr": 0.0318114974705536, + "acc_norm": 0.6591928251121076, + "acc_norm_stderr": 0.0318114974705536 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.6183206106870229, + "acc_stderr": 0.042607351576445594, + "acc_norm": 0.6183206106870229, + "acc_norm_stderr": 0.042607351576445594 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.7107438016528925, + "acc_stderr": 0.04139112727635463, + "acc_norm": 0.7107438016528925, + "acc_norm_stderr": 0.04139112727635463 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.6944444444444444, + "acc_stderr": 0.04453197507374983, + "acc_norm": 0.6944444444444444, + "acc_norm_stderr": 0.04453197507374983 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.6441717791411042, + "acc_stderr": 0.03761521380046734, + "acc_norm": 0.6441717791411042, + "acc_norm_stderr": 0.03761521380046734 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.2767857142857143, + "acc_stderr": 0.04246624336697625, + "acc_norm": 0.2767857142857143, + "acc_norm_stderr": 0.04246624336697625 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.7766990291262136, + "acc_stderr": 0.04123553189891431, + "acc_norm": 0.7766990291262136, + "acc_norm_stderr": 0.04123553189891431 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.7991452991452992, + "acc_stderr": 0.026246772946890474, + "acc_norm": 0.7991452991452992, + "acc_norm_stderr": 0.026246772946890474 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.52, + "acc_stderr": 0.050211673156867795, + "acc_norm": 0.52, + "acc_norm_stderr": 0.050211673156867795 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.7522349936143039, + "acc_stderr": 0.015438083080568972, + "acc_norm": 0.7522349936143039, + 
"acc_norm_stderr": 0.015438083080568972 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.6358381502890174, + "acc_stderr": 0.025906632631016127, + "acc_norm": 0.6358381502890174, + "acc_norm_stderr": 0.025906632631016127 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.3776536312849162, + "acc_stderr": 0.01621414875213663, + "acc_norm": 0.3776536312849162, + "acc_norm_stderr": 0.01621414875213663 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.6405228758169934, + "acc_stderr": 0.027475969910660952, + "acc_norm": 0.6405228758169934, + "acc_norm_stderr": 0.027475969910660952 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.6430868167202572, + "acc_stderr": 0.027210420375934023, + "acc_norm": 0.6430868167202572, + "acc_norm_stderr": 0.027210420375934023 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.6327160493827161, + "acc_stderr": 0.026822801759507894, + "acc_norm": 0.6327160493827161, + "acc_norm_stderr": 0.026822801759507894 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.42907801418439717, + "acc_stderr": 0.02952591430255855, + "acc_norm": 0.42907801418439717, + "acc_norm_stderr": 0.02952591430255855 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.4380704041720991, + "acc_stderr": 0.012671902782567659, + "acc_norm": 0.4380704041720991, + "acc_norm_stderr": 0.012671902782567659 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.5625, + "acc_stderr": 0.030134614954403924, + "acc_norm": 0.5625, + "acc_norm_stderr": 0.030134614954403924 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.5441176470588235, + "acc_stderr": 0.020148939420415745, + "acc_norm": 0.5441176470588235, + "acc_norm_stderr": 0.020148939420415745 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.6545454545454545, + "acc_stderr": 0.04554619617541054, + "acc_norm": 0.6545454545454545, + "acc_norm_stderr": 0.04554619617541054 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.6326530612244898, + "acc_stderr": 0.030862144921087558, + "acc_norm": 0.6326530612244898, + "acc_norm_stderr": 0.030862144921087558 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.7412935323383084, + "acc_stderr": 0.03096590312357302, + "acc_norm": 0.7412935323383084, + "acc_norm_stderr": 0.03096590312357302 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.82, + "acc_stderr": 0.038612291966536934, + "acc_norm": 0.82, + "acc_norm_stderr": 0.038612291966536934 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.4819277108433735, + "acc_stderr": 0.038899512528272166, + "acc_norm": 0.4819277108433735, + "acc_norm_stderr": 0.038899512528272166 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.783625730994152, + "acc_stderr": 0.03158149539338734, + "acc_norm": 0.783625730994152, + "acc_norm_stderr": 0.03158149539338734 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.3219094247246022, + "mc1_stderr": 0.0163555676119604, + "mc2": 0.44963822928546104, + "mc2_stderr": 0.015152242331554478 + }, + "all": { + "acc": 0.5589250234846714, + "acc_stderr": 0.034366655429358066, + "acc_norm": 0.5629611204111602, + "acc_norm_stderr": 0.03434399461164612, + "mc1": 0.3219094247246022, + "mc1_stderr": 0.0163555676119604, + "mc2": 0.44963822928546104, + "mc2_stderr": 0.015152242331554478 + } + }, + "versions": { + "harness|arc:challenge|25": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + 
"harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "all": 0 + }, + "config_general": { + "model_name": "WhoTookMyAmogusNickname/NewHope_HF_not_official", + "model_sha": "f587f4a31de6818f4200d9cdc7f116ca8ba1cdc2", + "model_dtype": "torch.float16", + "lighteval_sha": "9a6ba3212080b87510982c30fdec55b87dcab0c7", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null + }, + "config_tasks": { + "harness|arc:challenge": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + 
"harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task" + }, + "summary_tasks": { + "harness|arc:challenge|25": { + "hashes": { + 
"hash_examples": "17b0cae357c0259e", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "61571bf68d6d89aa", + "hash_cont_tokens": "ede2b335438f08e9" + }, + "truncated": 0, + "non-truncated": 4687, + "padded": 4687, + "non-padded": 0, + "effective_few_shots": 25.0, + "num_truncated_few_shots": 0 + }, + "harness|hellaswag|10": { + "hashes": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "29906669b1c7054a", + "hash_cont_tokens": "b41cf1ad182d68d5" + }, + "truncated": 0, + "non-truncated": 40168, + "padded": 40113, + "non-padded": 55, + "effective_few_shots": 10.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hashes": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "c54ff61ad0273dd7", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-anatomy|5": { + "hashes": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "be31a1e22aef5f90", + "hash_cont_tokens": "f11971a765cb609f" + }, + "truncated": 0, + "non-truncated": 540, + "padded": 540, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-astronomy|5": { + "hashes": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "277a7b1fad566940", + "hash_cont_tokens": "238bd86950544b29" + }, + "truncated": 0, + "non-truncated": 608, + "padded": 608, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-business_ethics|5": { + "hashes": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "ba552605bc116de5", + "hash_cont_tokens": "f9d6d2a7d7e9a041" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hashes": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "428c7563d0b98ab9", + "hash_cont_tokens": "6af58623d0d5fbcd" + }, + "truncated": 0, + "non-truncated": 1060, + "padded": 1060, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_biology|5": { + "hashes": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "da036601573942e2", + "hash_cont_tokens": "875cde3af7a0ee14" + }, + "truncated": 0, + "non-truncated": 576, + "padded": 576, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_chemistry|5": { + "hashes": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "94e0196d6aded13d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_computer_science|5": { + "hashes": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "6e4d0f4a8d36690b", + "hash_cont_tokens": "1ba0c71186b1505e" + }, + 
"truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_mathematics|5": { + "hashes": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "614054d17109a25d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_medicine|5": { + "hashes": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "1d633b3cc0524ba8", + "hash_cont_tokens": "702fb6d82ff0d6ac" + }, + "truncated": 0, + "non-truncated": 692, + "padded": 692, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_physics|5": { + "hashes": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "5421d9a1af86cbd4", + "hash_cont_tokens": "f7b8097afc16a47c" + }, + "truncated": 0, + "non-truncated": 408, + "padded": 408, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-computer_security|5": { + "hashes": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "5e6b70ecb333cf18", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hashes": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "c2ef11a87264ceed", + "hash_cont_tokens": "aa0e8bc655f2f641" + }, + "truncated": 0, + "non-truncated": 940, + "padded": 940, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-econometrics|5": { + "hashes": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "ecaccd912a4c3978", + "hash_cont_tokens": "a9b1f761089f6acc" + }, + "truncated": 0, + "non-truncated": 456, + "padded": 456, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hashes": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "1590c84291399be8", + "hash_cont_tokens": "2425a3f084a591ef" + }, + "truncated": 0, + "non-truncated": 580, + "padded": 580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hashes": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "3269597f715b0da1", + "hash_cont_tokens": "eb2d5002052b5bc5" + }, + "truncated": 0, + "non-truncated": 1512, + "padded": 1512, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-formal_logic|5": { + "hashes": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "a2800d20f3ab8d7c", + "hash_cont_tokens": "9b30dc19c9b62f60" + }, + "truncated": 0, + "non-truncated": 504, + "padded": 504, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + 
"harness|hendrycksTest-global_facts|5": { + "hashes": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "94ed44b3772505ad", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_biology|5": { + "hashes": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "24423acb928db768", + "hash_cont_tokens": "74217a4e2868536f" + }, + "truncated": 0, + "non-truncated": 1240, + "padded": 1240, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hashes": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "831ff35c474e5cef", + "hash_cont_tokens": "bf39544be0ebf000" + }, + "truncated": 0, + "non-truncated": 812, + "padded": 812, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hashes": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "8c34e0f2bda77358", + "hash_cont_tokens": "43570b3948564b64" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hashes": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "f1f73dd687da18d7", + "hash_cont_tokens": "674fc454bdc5ac93" + }, + "truncated": 660, + "non-truncated": 0, + "padded": 0, + "non-padded": 660, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_geography|5": { + "hashes": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "7c5547c7da5bc793", + "hash_cont_tokens": "03a5012b916274ea" + }, + "truncated": 0, + "non-truncated": 792, + "padded": 792, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hashes": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "f62991cb6a496b05", + "hash_cont_tokens": "50ab225c2f535210" + }, + "truncated": 0, + "non-truncated": 772, + "padded": 772, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hashes": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "4cef2aff6e3d59ed", + "hash_cont_tokens": "c583432ad27fcfe0" + }, + "truncated": 0, + "non-truncated": 1560, + "padded": 1560, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hashes": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "6e2577ea4082ed2b", + "hash_cont_tokens": "1194078d4e38c984" + }, + "truncated": 0, + "non-truncated": 1080, + "padded": 1080, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hashes": { + 
"hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "c5fc9aeb1079c8e4", + "hash_cont_tokens": "f47f041de50333b9" + }, + "truncated": 0, + "non-truncated": 952, + "padded": 952, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_physics|5": { + "hashes": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "555fc385cffa84ca", + "hash_cont_tokens": "6296151cf7fee15c" + }, + "truncated": 0, + "non-truncated": 604, + "padded": 604, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hashes": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "febd23cbf9973b7f", + "hash_cont_tokens": "a490d3db0ea5935a" + }, + "truncated": 0, + "non-truncated": 2180, + "padded": 2180, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hashes": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "424b02981230ee83", + "hash_cont_tokens": "6830ef7d0325d7ef" + }, + "truncated": 0, + "non-truncated": 864, + "padded": 864, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hashes": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "50c9ff438c85a69e", + "hash_cont_tokens": "cdd0b3dc06d933e5" + }, + "truncated": 816, + "non-truncated": 0, + "padded": 0, + "non-padded": 816, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hashes": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "054824cc474caef5", + "hash_cont_tokens": "e0203e3fc1bb0500" + }, + "truncated": 8, + "non-truncated": 940, + "padded": 940, + "non-padded": 8, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_aging|5": { + "hashes": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "541a75f071dcf579", + "hash_cont_tokens": "142a4a8a1138a214" + }, + "truncated": 0, + "non-truncated": 892, + "padded": 892, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_sexuality|5": { + "hashes": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "04269e5c5a257dd9", + "hash_cont_tokens": "bc54813e809b796d" + }, + "truncated": 0, + "non-truncated": 524, + "padded": 524, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-international_law|5": { + "hashes": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "d93ba9d9d38e4397", + "hash_cont_tokens": "63435df622d5437b" + }, + "truncated": 0, + "non-truncated": 484, + "padded": 484, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-jurisprudence|5": { + "hashes": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "9eeaccd2698b4f5a", + 
"hash_cont_tokens": "e3a8cd951b6e3469" + }, + "truncated": 0, + "non-truncated": 432, + "padded": 432, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hashes": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "b4f08f544f2b7576", + "hash_cont_tokens": "5e6ee2ff0404f23c" + }, + "truncated": 0, + "non-truncated": 652, + "padded": 648, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-machine_learning|5": { + "hashes": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "900c2a51f1174b9f", + "hash_cont_tokens": "c81919424db3b267" + }, + "truncated": 0, + "non-truncated": 448, + "padded": 448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-management|5": { + "hashes": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "6b36efb4689c6eca", + "hash_cont_tokens": "a01d6d39a83c4597" + }, + "truncated": 0, + "non-truncated": 412, + "padded": 412, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-marketing|5": { + "hashes": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "2aaac78a0cfed47a", + "hash_cont_tokens": "6aeaed4d823c98aa" + }, + "truncated": 0, + "non-truncated": 936, + "padded": 936, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-medical_genetics|5": { + "hashes": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "886ca823b41c094a", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-miscellaneous|5": { + "hashes": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "72fd71de7675e7d0", + "hash_cont_tokens": "9b0ab02a64603081" + }, + "truncated": 0, + "non-truncated": 3132, + "padded": 3132, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_disputes|5": { + "hashes": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "f3ca0dd8e7a1eb09", + "hash_cont_tokens": "3b8bbe9108e55ce9" + }, + "truncated": 0, + "non-truncated": 1384, + "padded": 1354, + "non-padded": 30, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hashes": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "3e793631e951f23c", + "hash_cont_tokens": "2eae753a177d5460" + }, + "truncated": 0, + "non-truncated": 3580, + "padded": 3580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-nutrition|5": { + "hashes": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "59753c2144ea93af", + "hash_cont_tokens": "29771089bd3c65c6" + }, + "truncated": 0, + "non-truncated": 1224, + "padded": 1224, + "non-padded": 0, + "effective_few_shots": 5.0, + 
"num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-philosophy|5": { + "hashes": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "bd8d3dbed15a8c34", + "hash_cont_tokens": "9f6ff69d23a48783" + }, + "truncated": 0, + "non-truncated": 1244, + "padded": 1244, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-prehistory|5": { + "hashes": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "3573cd87facbb7c5", + "hash_cont_tokens": "a789a13af22308bf" + }, + "truncated": 0, + "non-truncated": 1296, + "padded": 1296, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_accounting|5": { + "hashes": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "17e721bc1a7cbb47", + "hash_cont_tokens": "5129a9cfb30c5239" + }, + "truncated": 0, + "non-truncated": 1128, + "padded": 1128, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_law|5": { + "hashes": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "9178e10bd0763ec4", + "hash_cont_tokens": "2e590029ef41fbcd" + }, + "truncated": 604, + "non-truncated": 5532, + "padded": 5524, + "non-padded": 612, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_medicine|5": { + "hashes": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "f5a22012a54f70ea", + "hash_cont_tokens": "cd82e108370cece8" + }, + "truncated": 0, + "non-truncated": 1088, + "padded": 1088, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_psychology|5": { + "hashes": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "0dfb73a8eb3f692c", + "hash_cont_tokens": "61ef0c8a87f9c92d" + }, + "truncated": 0, + "non-truncated": 2448, + "padded": 2448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-public_relations|5": { + "hashes": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "1710c6ba4c9f3cbd", + "hash_cont_tokens": "568f585a259965c1" + }, + "truncated": 0, + "non-truncated": 440, + "padded": 440, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-security_studies|5": { + "hashes": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "d49711415961ced7", + "hash_cont_tokens": "d70cfe096d4fb7bd" + }, + "truncated": 0, + "non-truncated": 980, + "padded": 980, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-sociology|5": { + "hashes": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "828999f7624cbe7e", + "hash_cont_tokens": "c3a3bdfd177eed5b" + }, + "truncated": 0, + "non-truncated": 804, + "padded": 804, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hashes": { + "hash_examples": "4a56a01ddca44dca", + 
"hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "42054621e718dbee", + "hash_cont_tokens": "2568d0e8e36fa959" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-virology|5": { + "hashes": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "6c4f0aa4dc859c04", + "hash_cont_tokens": "c178cccd753d9bc5" + }, + "truncated": 0, + "non-truncated": 664, + "padded": 664, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-world_religions|5": { + "hashes": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "6c75d44e092ff24f", + "hash_cont_tokens": "0a3a3ea5ef49d19c" + }, + "truncated": 0, + "non-truncated": 684, + "padded": 684, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|truthfulqa:mc|0": { + "hashes": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "2738d7ed7075faa7", + "hash_cont_tokens": "6d1691881e252df0" + }, + "truncated": 0, + "non-truncated": 9996, + "padded": 9996, + "non-padded": 0, + "effective_few_shots": 0.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "d84d18e9a963753d", + "hash_full_prompts": "12b540783521a8e6", + "hash_input_tokens": "6fecf578c508db6a", + "hash_cont_tokens": "f4b7b7f3a2788768" + }, + "total_evaluation_time_secondes": "6950.735807180405", + "truncated": 2088, + "non-truncated": 108931, + "padded": 108834, + "non-padded": 2185, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/WhoTookMyAmogusNickname/NewHope_HF_not_official/results_2023-09-17T06-38-00.301208.json b/eval-results/WhoTookMyAmogusNickname/NewHope_HF_not_official/results_2023-09-17T06-38-00.301208.json new file mode 100644 index 0000000000000000000000000000000000000000..1ea13cfdb1a9e69fcea35ec8c9ce53db1d38a3fc --- /dev/null +++ b/eval-results/WhoTookMyAmogusNickname/NewHope_HF_not_official/results_2023-09-17T06-38-00.301208.json @@ -0,0 +1,107 @@ +{ + "config_general": { + "model_name": "WhoTookMyAmogusNickname/NewHope_HF_not_official", + "model_sha": "f587f4a31de6818f4200d9cdc7f116ca8ba1cdc2", + "model_size": "24.28 GB", + "model_dtype": "torch.float16", + "lighteval_sha": "0f318ecf002208468154899217b3ba7c6ae09374", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|drop|3": { + "em": 0.19693791946308725, + "em_stderr": 0.004072666833657848, + "f1": 0.2666285654362424, + "f1_stderr": 0.004068431318455121 + }, + "harness|gsm8k|5": { + "acc": 0.15845337376800606, + "acc_stderr": 0.010058474790238971 + }, + "harness|winogrande|5": { + "acc": 0.749802683504341, + "acc_stderr": 0.012173009642449151 + }, + "all": { + "em": 0.19693791946308725, + "em_stderr": 0.004072666833657848, + "f1": 0.2666285654362424, + "f1_stderr": 0.004068431318455121, + "acc": 0.4541280286361735, + "acc_stderr": 0.011115742216344062 + } + }, + "versions": { + "harness|drop|3": 1, + "harness|gsm8k|5": 0, + "harness|winogrande|5": 0, + "all": 0 + }, + "config_tasks": { + "harness|drop": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|drop|3": { + 
"hashes": { + "hash_examples": "1d27416e8324e9a3", + "hash_full_prompts": "a5513ff9a741b385", + "hash_input_tokens": "61b608e0b5ceed76", + "hash_cont_tokens": "5bcc6a0d3257f718" + }, + "truncated": 1263, + "non-truncated": 8273, + "padded": 0, + "non-padded": 9536, + "effective_few_shots": 3.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "bda342e47b5099b2", + "hash_cont_tokens": "5bbe8ff839da2365" + }, + "truncated": 0, + "non-truncated": 1319, + "padded": 0, + "non-padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "c0bedf98cb040854", + "hash_cont_tokens": "f08975ad6f2d5864" + }, + "truncated": 0, + "non-truncated": 2534, + "padded": 2432, + "non-padded": 102, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "9b4d8993161e637d", + "hash_full_prompts": "08215e527b7e60a5", + "hash_input_tokens": "80afe720f936f8d2", + "hash_cont_tokens": "978d0727f4e66702" + }, + "total_evaluation_time_secondes": "53089.931958675385", + "truncated": 1263, + "non-truncated": 12126, + "padded": 2432, + "non-padded": 10957, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/Yehoon/yehoon_llama2/results_2023-09-12T12-52-12.986563.json b/eval-results/Yehoon/yehoon_llama2/results_2023-09-12T12-52-12.986563.json new file mode 100644 index 0000000000000000000000000000000000000000..3182b7983707c5e498a69c7766f1b5521166fc7b --- /dev/null +++ b/eval-results/Yehoon/yehoon_llama2/results_2023-09-12T12-52-12.986563.json @@ -0,0 +1,1367 @@ +{ + "config_general": { + "model_name": "Yehoon/yehoon_llama2", + "model_sha": "443cb81ce988ea6c0b1e20132c170463d559367e", + "model_size": "12.61 GB", + "model_dtype": "torch.float16", + "lighteval_sha": "ff467795ccc45b291b69333c263d5f16abd1fcd9", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|arc:challenge|25": { + "acc": 0.5221843003412969, + "acc_stderr": 0.014597001927076133, + "acc_norm": 0.5477815699658704, + "acc_norm_stderr": 0.014544519880633827 + }, + "harness|hellaswag|10": { + "acc": 0.59699263095001, + "acc_stderr": 0.004894997736719051, + "acc_norm": 0.7897829117705636, + "acc_norm_stderr": 0.004066299761478503 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.36, + "acc_stderr": 0.04824181513244218, + "acc_norm": 0.36, + "acc_norm_stderr": 0.04824181513244218 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.4666666666666667, + "acc_stderr": 0.043097329010363554, + "acc_norm": 0.4666666666666667, + "acc_norm_stderr": 0.043097329010363554 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.42105263157894735, + "acc_stderr": 0.04017901275981748, + "acc_norm": 0.42105263157894735, + "acc_norm_stderr": 0.04017901275981748 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.54, + "acc_stderr": 0.05009082659620333, + "acc_norm": 0.54, + "acc_norm_stderr": 0.05009082659620333 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.5584905660377358, + "acc_stderr": 0.030561590426731837, + "acc_norm": 0.5584905660377358, + "acc_norm_stderr": 0.030561590426731837 + }, + "harness|hendrycksTest-college_biology|5": { + 
"acc": 0.5277777777777778, + "acc_stderr": 0.04174752578923185, + "acc_norm": 0.5277777777777778, + "acc_norm_stderr": 0.04174752578923185 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.39, + "acc_stderr": 0.04902071300001975, + "acc_norm": 0.39, + "acc_norm_stderr": 0.04902071300001975 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.43, + "acc_stderr": 0.049756985195624284, + "acc_norm": 0.43, + "acc_norm_stderr": 0.049756985195624284 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.31, + "acc_stderr": 0.04648231987117316, + "acc_norm": 0.31, + "acc_norm_stderr": 0.04648231987117316 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.45664739884393063, + "acc_stderr": 0.03798106566014498, + "acc_norm": 0.45664739884393063, + "acc_norm_stderr": 0.03798106566014498 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.27450980392156865, + "acc_stderr": 0.044405219061793275, + "acc_norm": 0.27450980392156865, + "acc_norm_stderr": 0.044405219061793275 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.61, + "acc_stderr": 0.04902071300001974, + "acc_norm": 0.61, + "acc_norm_stderr": 0.04902071300001974 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.451063829787234, + "acc_stderr": 0.032529096196131965, + "acc_norm": 0.451063829787234, + "acc_norm_stderr": 0.032529096196131965 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.32456140350877194, + "acc_stderr": 0.044045561573747664, + "acc_norm": 0.32456140350877194, + "acc_norm_stderr": 0.044045561573747664 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.45517241379310347, + "acc_stderr": 0.04149886942192117, + "acc_norm": 0.45517241379310347, + "acc_norm_stderr": 0.04149886942192117 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.29894179894179895, + "acc_stderr": 0.0235776047916558, + "acc_norm": 0.29894179894179895, + "acc_norm_stderr": 0.0235776047916558 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.3412698412698413, + "acc_stderr": 0.042407993275749255, + "acc_norm": 0.3412698412698413, + "acc_norm_stderr": 0.042407993275749255 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.29, + "acc_stderr": 0.045604802157206845, + "acc_norm": 0.29, + "acc_norm_stderr": 0.045604802157206845 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.5516129032258065, + "acc_stderr": 0.02829205683011273, + "acc_norm": 0.5516129032258065, + "acc_norm_stderr": 0.02829205683011273 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.3645320197044335, + "acc_stderr": 0.0338640574606209, + "acc_norm": 0.3645320197044335, + "acc_norm_stderr": 0.0338640574606209 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.44, + "acc_stderr": 0.04988876515698589, + "acc_norm": 0.44, + "acc_norm_stderr": 0.04988876515698589 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.6848484848484848, + "acc_stderr": 0.0362773057502241, + "acc_norm": 0.6848484848484848, + "acc_norm_stderr": 0.0362773057502241 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.6212121212121212, + "acc_stderr": 0.03456088731993747, + "acc_norm": 0.6212121212121212, + "acc_norm_stderr": 0.03456088731993747 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.7461139896373057, + "acc_stderr": 0.0314102478056532, + "acc_norm": 0.7461139896373057, + "acc_norm_stderr": 0.0314102478056532 + }, + 
"harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.48205128205128206, + "acc_stderr": 0.02533466708095495, + "acc_norm": 0.48205128205128206, + "acc_norm_stderr": 0.02533466708095495 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.24444444444444444, + "acc_stderr": 0.02620276653465215, + "acc_norm": 0.24444444444444444, + "acc_norm_stderr": 0.02620276653465215 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.4831932773109244, + "acc_stderr": 0.03246013680375308, + "acc_norm": 0.4831932773109244, + "acc_norm_stderr": 0.03246013680375308 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.3576158940397351, + "acc_stderr": 0.03913453431177258, + "acc_norm": 0.3576158940397351, + "acc_norm_stderr": 0.03913453431177258 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.7045871559633028, + "acc_stderr": 0.019560619182976, + "acc_norm": 0.7045871559633028, + "acc_norm_stderr": 0.019560619182976 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.4583333333333333, + "acc_stderr": 0.033981108902946366, + "acc_norm": 0.4583333333333333, + "acc_norm_stderr": 0.033981108902946366 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.696078431372549, + "acc_stderr": 0.032282103870378935, + "acc_norm": 0.696078431372549, + "acc_norm_stderr": 0.032282103870378935 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.7172995780590717, + "acc_stderr": 0.029312814153955917, + "acc_norm": 0.7172995780590717, + "acc_norm_stderr": 0.029312814153955917 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.5829596412556054, + "acc_stderr": 0.03309266936071721, + "acc_norm": 0.5829596412556054, + "acc_norm_stderr": 0.03309266936071721 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.5877862595419847, + "acc_stderr": 0.04317171194870254, + "acc_norm": 0.5877862595419847, + "acc_norm_stderr": 0.04317171194870254 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.6363636363636364, + "acc_stderr": 0.043913262867240704, + "acc_norm": 0.6363636363636364, + "acc_norm_stderr": 0.043913262867240704 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.6203703703703703, + "acc_stderr": 0.04691521224077742, + "acc_norm": 0.6203703703703703, + "acc_norm_stderr": 0.04691521224077742 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.5705521472392638, + "acc_stderr": 0.038890666191127236, + "acc_norm": 0.5705521472392638, + "acc_norm_stderr": 0.038890666191127236 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.375, + "acc_stderr": 0.04595091388086298, + "acc_norm": 0.375, + "acc_norm_stderr": 0.04595091388086298 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.7572815533980582, + "acc_stderr": 0.04245022486384495, + "acc_norm": 0.7572815533980582, + "acc_norm_stderr": 0.04245022486384495 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.7777777777777778, + "acc_stderr": 0.027236013946196708, + "acc_norm": 0.7777777777777778, + "acc_norm_stderr": 0.027236013946196708 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.59, + "acc_stderr": 0.04943110704237102, + "acc_norm": 0.59, + "acc_norm_stderr": 0.04943110704237102 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.7088122605363985, + "acc_stderr": 0.016246087069701407, + "acc_norm": 0.7088122605363985, + "acc_norm_stderr": 0.016246087069701407 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 
0.5664739884393064, + "acc_stderr": 0.026680134761679214, + "acc_norm": 0.5664739884393064, + "acc_norm_stderr": 0.026680134761679214 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.23910614525139665, + "acc_stderr": 0.014265554192331144, + "acc_norm": 0.23910614525139665, + "acc_norm_stderr": 0.014265554192331144 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.5228758169934641, + "acc_stderr": 0.028599936776089782, + "acc_norm": 0.5228758169934641, + "acc_norm_stderr": 0.028599936776089782 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.5916398713826366, + "acc_stderr": 0.027917050748484627, + "acc_norm": 0.5916398713826366, + "acc_norm_stderr": 0.027917050748484627 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.558641975308642, + "acc_stderr": 0.027628737155668777, + "acc_norm": 0.558641975308642, + "acc_norm_stderr": 0.027628737155668777 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.375886524822695, + "acc_stderr": 0.028893955412115882, + "acc_norm": 0.375886524822695, + "acc_norm_stderr": 0.028893955412115882 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.38396349413298564, + "acc_stderr": 0.01242158783313423, + "acc_norm": 0.38396349413298564, + "acc_norm_stderr": 0.01242158783313423 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.4632352941176471, + "acc_stderr": 0.030290619180485694, + "acc_norm": 0.4632352941176471, + "acc_norm_stderr": 0.030290619180485694 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.5032679738562091, + "acc_stderr": 0.020227402794434864, + "acc_norm": 0.5032679738562091, + "acc_norm_stderr": 0.020227402794434864 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.5727272727272728, + "acc_stderr": 0.047381987035454834, + "acc_norm": 0.5727272727272728, + "acc_norm_stderr": 0.047381987035454834 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.5755102040816327, + "acc_stderr": 0.031642094879429414, + "acc_norm": 0.5755102040816327, + "acc_norm_stderr": 0.031642094879429414 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.7064676616915423, + "acc_stderr": 0.032200241045342054, + "acc_norm": 0.7064676616915423, + "acc_norm_stderr": 0.032200241045342054 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.76, + "acc_stderr": 0.042923469599092816, + "acc_norm": 0.76, + "acc_norm_stderr": 0.042923469599092816 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.41566265060240964, + "acc_stderr": 0.038367221765980515, + "acc_norm": 0.41566265060240964, + "acc_norm_stderr": 0.038367221765980515 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.7368421052631579, + "acc_stderr": 0.03377310252209205, + "acc_norm": 0.7368421052631579, + "acc_norm_stderr": 0.03377310252209205 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.3317013463892289, + "mc1_stderr": 0.01648214881024147, + "mc2": 0.491698763027883, + "mc2_stderr": 0.015357177241665524 + }, + "all": { + "acc": 0.51451624752162, + "acc_stderr": 0.03496295048918739, + "acc_norm": 0.5182177314274695, + "acc_norm_stderr": 0.0349480152345826, + "mc1": 0.3317013463892289, + "mc1_stderr": 0.01648214881024147, + "mc2": 0.491698763027883, + "mc2_stderr": 0.015357177241665524 + } + }, + "versions": { + "harness|arc:challenge|25": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + 
"harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "all": 0 + }, + "config_tasks": { + "harness|arc:challenge": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + 
"harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task" + }, + "summary_tasks": { + "harness|arc:challenge|25": { + "hashes": { + "hash_examples": "17b0cae357c0259e", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "3722289b79076c44", + "hash_cont_tokens": "e8abf848493b50f7" + }, + "truncated": 0, + "non-truncated": 4687, + "padded": 4687, + "non-padded": 0, + "effective_few_shots": 25.0, + "num_truncated_few_shots": 0 + }, + "harness|hellaswag|10": { + "hashes": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": 
"ececd684171f1ef2", + "hash_cont_tokens": "9fe0a5c42e1532db" + }, + "truncated": 0, + "non-truncated": 40168, + "padded": 40113, + "non-padded": 55, + "effective_few_shots": 10.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hashes": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "c54ff61ad0273dd7", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-anatomy|5": { + "hashes": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "be31a1e22aef5f90", + "hash_cont_tokens": "f11971a765cb609f" + }, + "truncated": 0, + "non-truncated": 540, + "padded": 540, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-astronomy|5": { + "hashes": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "277a7b1fad566940", + "hash_cont_tokens": "440a970fadecdc7b" + }, + "truncated": 0, + "non-truncated": 608, + "padded": 608, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-business_ethics|5": { + "hashes": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "ba552605bc116de5", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hashes": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "428c7563d0b98ab9", + "hash_cont_tokens": "7ecd60c25b9bfe5b" + }, + "truncated": 0, + "non-truncated": 1060, + "padded": 1060, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_biology|5": { + "hashes": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "da036601573942e2", + "hash_cont_tokens": "875cde3af7a0ee14" + }, + "truncated": 0, + "non-truncated": 576, + "padded": 576, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_chemistry|5": { + "hashes": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "94e0196d6aded13d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_computer_science|5": { + "hashes": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "6e4d0f4a8d36690b", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_mathematics|5": { + "hashes": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "614054d17109a25d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + 
"effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_medicine|5": { + "hashes": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "081bb2b524defd1c", + "hash_cont_tokens": "702fb6d82ff0d6ac" + }, + "truncated": 0, + "non-truncated": 692, + "padded": 692, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_physics|5": { + "hashes": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "5421d9a1af86cbd4", + "hash_cont_tokens": "f7b8097afc16a47c" + }, + "truncated": 0, + "non-truncated": 408, + "padded": 408, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-computer_security|5": { + "hashes": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "5e6b70ecb333cf18", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hashes": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "c2ef11a87264ceed", + "hash_cont_tokens": "aa0e8bc655f2f641" + }, + "truncated": 0, + "non-truncated": 940, + "padded": 940, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-econometrics|5": { + "hashes": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "ecaccd912a4c3978", + "hash_cont_tokens": "b1cc6e7e9fcd3827" + }, + "truncated": 0, + "non-truncated": 456, + "padded": 456, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hashes": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "1590c84291399be8", + "hash_cont_tokens": "2425a3f084a591ef" + }, + "truncated": 0, + "non-truncated": 580, + "padded": 580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hashes": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "3269597f715b0da1", + "hash_cont_tokens": "bd87bf0c060fd925" + }, + "truncated": 0, + "non-truncated": 1512, + "padded": 1512, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-formal_logic|5": { + "hashes": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "a2800d20f3ab8d7c", + "hash_cont_tokens": "eb8932890e0605db" + }, + "truncated": 0, + "non-truncated": 504, + "padded": 504, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-global_facts|5": { + "hashes": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "94ed44b3772505ad", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_biology|5": { + "hashes": { + "hash_examples": 
"a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "24423acb928db768", + "hash_cont_tokens": "1ddcb86d28cde266" + }, + "truncated": 0, + "non-truncated": 1240, + "padded": 1240, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hashes": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "831ff35c474e5cef", + "hash_cont_tokens": "176c8dcff38c5f8f" + }, + "truncated": 0, + "non-truncated": 812, + "padded": 812, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hashes": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "a20a96b44dcc5b30", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hashes": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "5002f4ac8b1562ca", + "hash_cont_tokens": "674fc454bdc5ac93" + }, + "truncated": 0, + "non-truncated": 660, + "padded": 656, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_geography|5": { + "hashes": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "7c5547c7da5bc793", + "hash_cont_tokens": "03a5012b916274ea" + }, + "truncated": 0, + "non-truncated": 792, + "padded": 792, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hashes": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "f62991cb6a496b05", + "hash_cont_tokens": "873d2aab226ba1d8" + }, + "truncated": 0, + "non-truncated": 772, + "padded": 772, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hashes": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "4cef2aff6e3d59ed", + "hash_cont_tokens": "c583432ad27fcfe0" + }, + "truncated": 0, + "non-truncated": 1560, + "padded": 1560, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hashes": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "6e2577ea4082ed2b", + "hash_cont_tokens": "d7907b61bcb8c123" + }, + "truncated": 0, + "non-truncated": 1080, + "padded": 1080, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hashes": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "c5fc9aeb1079c8e4", + "hash_cont_tokens": "f47f041de50333b9" + }, + "truncated": 0, + "non-truncated": 952, + "padded": 952, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_physics|5": { + "hashes": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + 
"hash_input_tokens": "555fc385cffa84ca", + "hash_cont_tokens": "0d56317b3e5eedb5" + }, + "truncated": 0, + "non-truncated": 604, + "padded": 604, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hashes": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "febd23cbf9973b7f", + "hash_cont_tokens": "09ba1243e7390c0f" + }, + "truncated": 0, + "non-truncated": 2180, + "padded": 2180, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hashes": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "400e55b56ee6fbd7", + "hash_cont_tokens": "9cc29889c3d3f77d" + }, + "truncated": 0, + "non-truncated": 864, + "padded": 864, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hashes": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "c639cce12a46ebad", + "hash_cont_tokens": "cdd0b3dc06d933e5" + }, + "truncated": 0, + "non-truncated": 816, + "padded": 816, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hashes": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "b9762065cce6f3a6", + "hash_cont_tokens": "e02816433ff28daf" + }, + "truncated": 0, + "non-truncated": 948, + "padded": 948, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_aging|5": { + "hashes": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "541a75f071dcf579", + "hash_cont_tokens": "142a4a8a1138a214" + }, + "truncated": 0, + "non-truncated": 892, + "padded": 892, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_sexuality|5": { + "hashes": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "04269e5c5a257dd9", + "hash_cont_tokens": "bc54813e809b796d" + }, + "truncated": 0, + "non-truncated": 524, + "padded": 524, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-international_law|5": { + "hashes": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "d93ba9d9d38e4397", + "hash_cont_tokens": "8ea8c5ff76a15bca" + }, + "truncated": 0, + "non-truncated": 484, + "padded": 484, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-jurisprudence|5": { + "hashes": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "9eeaccd2698b4f5a", + "hash_cont_tokens": "e3a8cd951b6e3469" + }, + "truncated": 0, + "non-truncated": 432, + "padded": 432, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hashes": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "b4f08f544f2b7576", + "hash_cont_tokens": "3e9e0bdc248fd88a" + }, + "truncated": 0, + "non-truncated": 652, + 
"padded": 648, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-machine_learning|5": { + "hashes": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "900c2a51f1174b9f", + "hash_cont_tokens": "55b12fb138c6a064" + }, + "truncated": 0, + "non-truncated": 448, + "padded": 448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-management|5": { + "hashes": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "6b36efb4689c6eca", + "hash_cont_tokens": "a01d6d39a83c4597" + }, + "truncated": 0, + "non-truncated": 412, + "padded": 412, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-marketing|5": { + "hashes": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "2aaac78a0cfed47a", + "hash_cont_tokens": "6aeaed4d823c98aa" + }, + "truncated": 0, + "non-truncated": 936, + "padded": 936, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-medical_genetics|5": { + "hashes": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "886ca823b41c094a", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-miscellaneous|5": { + "hashes": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "72fd71de7675e7d0", + "hash_cont_tokens": "9b0ab02a64603081" + }, + "truncated": 0, + "non-truncated": 3132, + "padded": 3132, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_disputes|5": { + "hashes": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "f3ca0dd8e7a1eb09", + "hash_cont_tokens": "3b8bbe9108e55ce9" + }, + "truncated": 0, + "non-truncated": 1384, + "padded": 1354, + "non-padded": 30, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hashes": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "3e793631e951f23c", + "hash_cont_tokens": "3e9bfc0362e97330" + }, + "truncated": 0, + "non-truncated": 3580, + "padded": 3580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-nutrition|5": { + "hashes": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "59753c2144ea93af", + "hash_cont_tokens": "23b2dc6ee2da4cfc" + }, + "truncated": 0, + "non-truncated": 1224, + "padded": 1224, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-philosophy|5": { + "hashes": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "bd8d3dbed15a8c34", + "hash_cont_tokens": "9f6ff69d23a48783" + }, + "truncated": 0, + "non-truncated": 1244, + "padded": 1244, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-prehistory|5": { + "hashes": { + "hash_examples": 
"8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "3573cd87facbb7c5", + "hash_cont_tokens": "d6458d743d875837" + }, + "truncated": 0, + "non-truncated": 1296, + "padded": 1296, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_accounting|5": { + "hashes": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "17e721bc1a7cbb47", + "hash_cont_tokens": "922a195f53a35662" + }, + "truncated": 0, + "non-truncated": 1128, + "padded": 1128, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_law|5": { + "hashes": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "c9f7583fff66d361", + "hash_cont_tokens": "2e590029ef41fbcd" + }, + "truncated": 0, + "non-truncated": 6136, + "padded": 6136, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_medicine|5": { + "hashes": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "40a933f829116f8d", + "hash_cont_tokens": "7cfee54dbddd5a98" + }, + "truncated": 0, + "non-truncated": 1088, + "padded": 1088, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_psychology|5": { + "hashes": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "0dfb73a8eb3f692c", + "hash_cont_tokens": "a86677b2a45c20e1" + }, + "truncated": 0, + "non-truncated": 2448, + "padded": 2448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-public_relations|5": { + "hashes": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "1710c6ba4c9f3cbd", + "hash_cont_tokens": "0d756ccaae031757" + }, + "truncated": 0, + "non-truncated": 440, + "padded": 440, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-security_studies|5": { + "hashes": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "32a03f1f22a6e103", + "hash_cont_tokens": "b2229bc2cfbf594b" + }, + "truncated": 0, + "non-truncated": 980, + "padded": 980, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-sociology|5": { + "hashes": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "828999f7624cbe7e", + "hash_cont_tokens": "c3a3bdfd177eed5b" + }, + "truncated": 0, + "non-truncated": 804, + "padded": 804, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hashes": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "42054621e718dbee", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-virology|5": { + "hashes": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "6c4f0aa4dc859c04", + "hash_cont_tokens": "af8b3658088cb37f" 
+ }, + "truncated": 0, + "non-truncated": 664, + "padded": 664, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-world_religions|5": { + "hashes": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "6c75d44e092ff24f", + "hash_cont_tokens": "060118bef6de4e0a" + }, + "truncated": 0, + "non-truncated": 684, + "padded": 684, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|truthfulqa:mc|0": { + "hashes": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "2738d7ed7075faa7", + "hash_cont_tokens": "f5da56a132aab151" + }, + "truncated": 0, + "non-truncated": 9996, + "padded": 9996, + "non-padded": 0, + "effective_few_shots": 0.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "d84d18e9a963753d", + "hash_full_prompts": "12b540783521a8e6", + "hash_input_tokens": "5c73a7dce6ccf737", + "hash_cont_tokens": "71d56183130fecbd" + }, + "total_evaluation_time_secondes": "7810.171063899994", + "truncated": 0, + "non-truncated": 111019, + "padded": 110926, + "non-padded": 93, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/Yehoon/yehoon_llama2/results_2023-10-24T20-19-53.869610.json b/eval-results/Yehoon/yehoon_llama2/results_2023-10-24T20-19-53.869610.json new file mode 100644 index 0000000000000000000000000000000000000000..456895d349816699cb5f7894540af338dee331a9 --- /dev/null +++ b/eval-results/Yehoon/yehoon_llama2/results_2023-10-24T20-19-53.869610.json @@ -0,0 +1,107 @@ +{ + "config_general": { + "model_name": "Yehoon/yehoon_llama2", + "model_sha": "443cb81ce988ea6c0b1e20132c170463d559367e", + "model_size": "12.61 GB", + "model_dtype": "torch.float16", + "lighteval_sha": "0f318ecf002208468154899217b3ba7c6ae09374", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|drop|3": { + "em": 0.008598993288590604, + "em_stderr": 0.0009455579144542034, + "f1": 0.0916033976510068, + "f1_stderr": 0.0018917747787763773 + }, + "harness|gsm8k|5": { + "acc": 0.07278241091736164, + "acc_stderr": 0.007155604761167479 + }, + "harness|winogrande|5": { + "acc": 0.7474348855564326, + "acc_stderr": 0.012211148449394105 + }, + "all": { + "em": 0.008598993288590604, + "em_stderr": 0.0009455579144542034, + "f1": 0.0916033976510068, + "f1_stderr": 0.0018917747787763773, + "acc": 0.4101086482368971, + "acc_stderr": 0.009683376605280791 + } + }, + "versions": { + "harness|drop|3": 1, + "harness|gsm8k|5": 0, + "harness|winogrande|5": 0, + "all": 0 + }, + "config_tasks": { + "harness|drop": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|drop|3": { + "hashes": { + "hash_examples": "1d27416e8324e9a3", + "hash_full_prompts": "a5513ff9a741b385", + "hash_input_tokens": "42076f0efbb50aa6", + "hash_cont_tokens": "00fbd63093b42f91" + }, + "truncated": 3, + "non-truncated": 9533, + "padded": 0, + "non-padded": 9536, + "effective_few_shots": 3.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "bda342e47b5099b2", + "hash_cont_tokens": "c7a30b10087e160e" + }, + "truncated": 0, + "non-truncated": 1319, + "padded": 0, + 
"non-padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "c0bedf98cb040854", + "hash_cont_tokens": "f08975ad6f2d5864" + }, + "truncated": 0, + "non-truncated": 2534, + "padded": 2432, + "non-padded": 102, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "9b4d8993161e637d", + "hash_full_prompts": "08215e527b7e60a5", + "hash_input_tokens": "a12f3e3c934bd78b", + "hash_cont_tokens": "e152b8b2d8b41fd3" + }, + "total_evaluation_time_secondes": "10038.23200583458", + "truncated": 3, + "non-truncated": 13386, + "padded": 2432, + "non-padded": 10957, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/ahxt/llama2_xs_460M_experimental/results_2023-09-11T16-45-07.137608.json b/eval-results/ahxt/llama2_xs_460M_experimental/results_2023-09-11T16-45-07.137608.json new file mode 100644 index 0000000000000000000000000000000000000000..353599c7342a8a972250ca18f1bf9dbaab29b289 --- /dev/null +++ b/eval-results/ahxt/llama2_xs_460M_experimental/results_2023-09-11T16-45-07.137608.json @@ -0,0 +1,1367 @@ +{ + "config_general": { + "model_name": "ahxt/llama2_xs_460M_experimental", + "model_sha": "c8db281477559f5c969a9be794ce236f8a99e1a0", + "model_size": "886.6 MB", + "model_dtype": "torch.float16", + "lighteval_sha": "ff467795ccc45b291b69333c263d5f16abd1fcd9", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|arc:challenge|25": { + "acc": 0.2158703071672355, + "acc_stderr": 0.012022975360030672, + "acc_norm": 0.24914675767918087, + "acc_norm_stderr": 0.012639407111926435 + }, + "harness|hellaswag|10": { + "acc": 0.3269269069906393, + "acc_stderr": 0.004681316064444433, + "acc_norm": 0.3846843258315077, + "acc_norm_stderr": 0.004855262903270804 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.19, + "acc_stderr": 0.039427724440366234, + "acc_norm": 0.19, + "acc_norm_stderr": 0.039427724440366234 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.26666666666666666, + "acc_stderr": 0.03820169914517904, + "acc_norm": 0.26666666666666666, + "acc_norm_stderr": 0.03820169914517904 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.19736842105263158, + "acc_stderr": 0.03238981601699397, + "acc_norm": 0.19736842105263158, + "acc_norm_stderr": 0.03238981601699397 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.21, + "acc_stderr": 0.040936018074033256, + "acc_norm": 0.21, + "acc_norm_stderr": 0.040936018074033256 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.24528301886792453, + "acc_stderr": 0.026480357179895688, + "acc_norm": 0.24528301886792453, + "acc_norm_stderr": 0.026480357179895688 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.2708333333333333, + "acc_stderr": 0.037161774375660164, + "acc_norm": 0.2708333333333333, + "acc_norm_stderr": 0.037161774375660164 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.2, + "acc_stderr": 0.04020151261036846, + "acc_norm": 0.2, + "acc_norm_stderr": 0.04020151261036846 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.35, + "acc_stderr": 0.047937248544110196, + "acc_norm": 0.35, + "acc_norm_stderr": 0.047937248544110196 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 
0.32, + "acc_stderr": 0.046882617226215034, + "acc_norm": 0.32, + "acc_norm_stderr": 0.046882617226215034 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.23121387283236994, + "acc_stderr": 0.0321473730202947, + "acc_norm": 0.23121387283236994, + "acc_norm_stderr": 0.0321473730202947 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.30392156862745096, + "acc_stderr": 0.04576665403207763, + "acc_norm": 0.30392156862745096, + "acc_norm_stderr": 0.04576665403207763 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.21, + "acc_stderr": 0.04093601807403326, + "acc_norm": 0.21, + "acc_norm_stderr": 0.04093601807403326 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.19574468085106383, + "acc_stderr": 0.025937853139977148, + "acc_norm": 0.19574468085106383, + "acc_norm_stderr": 0.025937853139977148 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.23684210526315788, + "acc_stderr": 0.039994238792813344, + "acc_norm": 0.23684210526315788, + "acc_norm_stderr": 0.039994238792813344 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.2413793103448276, + "acc_stderr": 0.03565998174135303, + "acc_norm": 0.2413793103448276, + "acc_norm_stderr": 0.03565998174135303 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.2619047619047619, + "acc_stderr": 0.022644212615525214, + "acc_norm": 0.2619047619047619, + "acc_norm_stderr": 0.022644212615525214 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.23809523809523808, + "acc_stderr": 0.03809523809523811, + "acc_norm": 0.23809523809523808, + "acc_norm_stderr": 0.03809523809523811 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.21, + "acc_stderr": 0.040936018074033256, + "acc_norm": 0.21, + "acc_norm_stderr": 0.040936018074033256 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.2870967741935484, + "acc_stderr": 0.025736542745594525, + "acc_norm": 0.2870967741935484, + "acc_norm_stderr": 0.025736542745594525 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.2660098522167488, + "acc_stderr": 0.031089826002937523, + "acc_norm": 0.2660098522167488, + "acc_norm_stderr": 0.031089826002937523 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.28, + "acc_stderr": 0.045126085985421276, + "acc_norm": 0.28, + "acc_norm_stderr": 0.045126085985421276 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.23636363636363636, + "acc_stderr": 0.03317505930009179, + "acc_norm": 0.23636363636363636, + "acc_norm_stderr": 0.03317505930009179 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.3383838383838384, + "acc_stderr": 0.03371124142626302, + "acc_norm": 0.3383838383838384, + "acc_norm_stderr": 0.03371124142626302 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.33678756476683935, + "acc_stderr": 0.03410780251836183, + "acc_norm": 0.33678756476683935, + "acc_norm_stderr": 0.03410780251836183 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.32564102564102565, + "acc_stderr": 0.02375966576741229, + "acc_norm": 0.32564102564102565, + "acc_norm_stderr": 0.02375966576741229 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.27037037037037037, + "acc_stderr": 0.02708037281514566, + "acc_norm": 0.27037037037037037, + "acc_norm_stderr": 0.02708037281514566 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.22268907563025211, + "acc_stderr": 
0.027025433498882357, + "acc_norm": 0.22268907563025211, + "acc_norm_stderr": 0.027025433498882357 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.32450331125827814, + "acc_stderr": 0.038227469376587525, + "acc_norm": 0.32450331125827814, + "acc_norm_stderr": 0.038227469376587525 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.29541284403669726, + "acc_stderr": 0.019560619182976, + "acc_norm": 0.29541284403669726, + "acc_norm_stderr": 0.019560619182976 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.4722222222222222, + "acc_stderr": 0.0340470532865388, + "acc_norm": 0.4722222222222222, + "acc_norm_stderr": 0.0340470532865388 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.23039215686274508, + "acc_stderr": 0.02955429260569506, + "acc_norm": 0.23039215686274508, + "acc_norm_stderr": 0.02955429260569506 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.22784810126582278, + "acc_stderr": 0.02730348459906942, + "acc_norm": 0.22784810126582278, + "acc_norm_stderr": 0.02730348459906942 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.15695067264573992, + "acc_stderr": 0.024413587174907426, + "acc_norm": 0.15695067264573992, + "acc_norm_stderr": 0.024413587174907426 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.22137404580152673, + "acc_stderr": 0.036412970813137296, + "acc_norm": 0.22137404580152673, + "acc_norm_stderr": 0.036412970813137296 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.34710743801652894, + "acc_stderr": 0.04345724570292534, + "acc_norm": 0.34710743801652894, + "acc_norm_stderr": 0.04345724570292534 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.28703703703703703, + "acc_stderr": 0.043733130409147614, + "acc_norm": 0.28703703703703703, + "acc_norm_stderr": 0.043733130409147614 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.26993865030674846, + "acc_stderr": 0.034878251684978906, + "acc_norm": 0.26993865030674846, + "acc_norm_stderr": 0.034878251684978906 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.1875, + "acc_stderr": 0.0370468111477387, + "acc_norm": 0.1875, + "acc_norm_stderr": 0.0370468111477387 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.18446601941747573, + "acc_stderr": 0.03840423627288276, + "acc_norm": 0.18446601941747573, + "acc_norm_stderr": 0.03840423627288276 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.21794871794871795, + "acc_stderr": 0.02704685763071666, + "acc_norm": 0.21794871794871795, + "acc_norm_stderr": 0.02704685763071666 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.23, + "acc_stderr": 0.04229525846816506, + "acc_norm": 0.23, + "acc_norm_stderr": 0.04229525846816506 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.2656449553001277, + "acc_stderr": 0.015794302487888726, + "acc_norm": 0.2656449553001277, + "acc_norm_stderr": 0.015794302487888726 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.23699421965317918, + "acc_stderr": 0.022894082489925992, + "acc_norm": 0.23699421965317918, + "acc_norm_stderr": 0.022894082489925992 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.24692737430167597, + "acc_stderr": 0.014422292204808835, + "acc_norm": 0.24692737430167597, + "acc_norm_stderr": 0.014422292204808835 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.28104575163398693, + "acc_stderr": 0.025738854797818723, + "acc_norm": 0.28104575163398693, + "acc_norm_stderr": 
0.025738854797818723 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.19292604501607716, + "acc_stderr": 0.022411516780911363, + "acc_norm": 0.19292604501607716, + "acc_norm_stderr": 0.022411516780911363 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.22530864197530864, + "acc_stderr": 0.023246202647819746, + "acc_norm": 0.22530864197530864, + "acc_norm_stderr": 0.023246202647819746 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.23049645390070922, + "acc_stderr": 0.025123739226872405, + "acc_norm": 0.23049645390070922, + "acc_norm_stderr": 0.025123739226872405 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.2685788787483703, + "acc_stderr": 0.011320056629121734, + "acc_norm": 0.2685788787483703, + "acc_norm_stderr": 0.011320056629121734 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.4485294117647059, + "acc_stderr": 0.030211479609121593, + "acc_norm": 0.4485294117647059, + "acc_norm_stderr": 0.030211479609121593 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.23366013071895425, + "acc_stderr": 0.017119158496044503, + "acc_norm": 0.23366013071895425, + "acc_norm_stderr": 0.017119158496044503 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.23636363636363636, + "acc_stderr": 0.04069306319721377, + "acc_norm": 0.23636363636363636, + "acc_norm_stderr": 0.04069306319721377 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.3020408163265306, + "acc_stderr": 0.029393609319879815, + "acc_norm": 0.3020408163265306, + "acc_norm_stderr": 0.029393609319879815 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.2835820895522388, + "acc_stderr": 0.031871875379197986, + "acc_norm": 0.2835820895522388, + "acc_norm_stderr": 0.031871875379197986 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.3, + "acc_stderr": 0.046056618647183814, + "acc_norm": 0.3, + "acc_norm_stderr": 0.046056618647183814 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.28313253012048195, + "acc_stderr": 0.03507295431370519, + "acc_norm": 0.28313253012048195, + "acc_norm_stderr": 0.03507295431370519 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.28654970760233917, + "acc_stderr": 0.03467826685703826, + "acc_norm": 0.28654970760233917, + "acc_norm_stderr": 0.03467826685703826 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.23990208078335373, + "mc1_stderr": 0.014948812679062133, + "mc2": 0.41591461733747837, + "mc2_stderr": 0.01491393118316991 + }, + "all": { + "acc": 0.26203176592138006, + "acc_stderr": 0.031723424035979185, + "acc_norm": 0.2635747128595633, + "acc_norm_stderr": 0.031736820283279565, + "mc1": 0.23990208078335373, + "mc1_stderr": 0.014948812679062133, + "mc2": 0.41591461733747837, + "mc2_stderr": 0.01491393118316991 + } + }, + "versions": { + "harness|arc:challenge|25": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + 
"harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "all": 0 + }, + "config_tasks": { + "harness|arc:challenge": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + 
"harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task" + }, + "summary_tasks": { + "harness|arc:challenge|25": { + "hashes": { + "hash_examples": "17b0cae357c0259e", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "1c1a55bd0cf37f8a", + "hash_cont_tokens": "ed17e576dbafa5da" + }, + "truncated": 1579, + "non-truncated": 3108, + "padded": 3074, + "non-padded": 1613, + "effective_few_shots": 25.0, + "num_truncated_few_shots": 0 + }, + "harness|hellaswag|10": { + "hashes": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "acb705837b1ee904", + "hash_cont_tokens": "05628cd18ea5df72" + }, + "truncated": 2200, + "non-truncated": 37968, + "padded": 37825, + "non-padded": 2343, + "effective_few_shots": 10.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hashes": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "44d0f10c01732a59", + "hash_cont_tokens": 
"844bd0bf669e8136" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-anatomy|5": { + "hashes": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "841f3c40fb9a3ad6", + "hash_cont_tokens": "aa3ffb1a6e4356f5" + }, + "truncated": 0, + "non-truncated": 540, + "padded": 540, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-astronomy|5": { + "hashes": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "e642a964ffadbbdb", + "hash_cont_tokens": "18cfffb76bc8f0d1" + }, + "truncated": 0, + "non-truncated": 608, + "padded": 608, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-business_ethics|5": { + "hashes": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "8a4d4091d68bce58", + "hash_cont_tokens": "844bd0bf669e8136" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hashes": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "8739dcdb7bfc8596", + "hash_cont_tokens": "cd61f7de0830a75a" + }, + "truncated": 0, + "non-truncated": 1060, + "padded": 1060, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_biology|5": { + "hashes": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "db892005f731d424", + "hash_cont_tokens": "16b3626c8a5e3797" + }, + "truncated": 0, + "non-truncated": 576, + "padded": 572, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_chemistry|5": { + "hashes": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "2d2d3377e0197844", + "hash_cont_tokens": "844bd0bf669e8136" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_computer_science|5": { + "hashes": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "e2a4e62aae58eb4d", + "hash_cont_tokens": "844bd0bf669e8136" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_mathematics|5": { + "hashes": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "06432181f0ba2f54", + "hash_cont_tokens": "844bd0bf669e8136" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_medicine|5": { + "hashes": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "55c3f43a3d2070f5", + "hash_cont_tokens": "62bb469d2a319d91" + }, + "truncated": 20, + "non-truncated": 672, + "padded": 668, + "non-padded": 24, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 
0 + }, + "harness|hendrycksTest-college_physics|5": { + "hashes": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "6a9e39f77d0269f6", + "hash_cont_tokens": "bf103c9a1f61ec12" + }, + "truncated": 0, + "non-truncated": 408, + "padded": 408, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-computer_security|5": { + "hashes": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "2906257342a70a67", + "hash_cont_tokens": "844bd0bf669e8136" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hashes": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "fe9ccdf4167c086c", + "hash_cont_tokens": "ff5ca3d84bb47a0b" + }, + "truncated": 0, + "non-truncated": 940, + "padded": 940, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-econometrics|5": { + "hashes": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "de65d3ac5c000cd6", + "hash_cont_tokens": "21f0989f5760198a" + }, + "truncated": 0, + "non-truncated": 456, + "padded": 456, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hashes": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "5824935b7fc06577", + "hash_cont_tokens": "35bf6c0c1a7ee403" + }, + "truncated": 0, + "non-truncated": 580, + "padded": 580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hashes": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "ba54cb3b4bfc0de4", + "hash_cont_tokens": "f7d801bfd913884d" + }, + "truncated": 0, + "non-truncated": 1512, + "padded": 1488, + "non-padded": 24, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-formal_logic|5": { + "hashes": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "fc752ab9e74113e1", + "hash_cont_tokens": "23f9089575432d5a" + }, + "truncated": 0, + "non-truncated": 504, + "padded": 504, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-global_facts|5": { + "hashes": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "2f8aa9972e399a06", + "hash_cont_tokens": "844bd0bf669e8136" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_biology|5": { + "hashes": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "375a497a3eb4c553", + "hash_cont_tokens": "04b8293f2ab7fbbf" + }, + "truncated": 0, + "non-truncated": 1240, + "padded": 1240, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hashes": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": 
"0e3e5f5d9246482a", + "hash_input_tokens": "114c41a7a8b07aee", + "hash_cont_tokens": "c3deabee1deab3a3" + }, + "truncated": 0, + "non-truncated": 812, + "padded": 812, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hashes": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "381d70d6716ca04b", + "hash_cont_tokens": "844bd0bf669e8136" + }, + "truncated": 16, + "non-truncated": 384, + "padded": 384, + "non-padded": 16, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hashes": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "27a316b2ddd0d5bf", + "hash_cont_tokens": "c4f2565ca36881d5" + }, + "truncated": 660, + "non-truncated": 0, + "padded": 0, + "non-padded": 660, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_geography|5": { + "hashes": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "37e4e714cb26d3c5", + "hash_cont_tokens": "780e569058de22be" + }, + "truncated": 0, + "non-truncated": 792, + "padded": 792, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hashes": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "433ced0d0e657d74", + "hash_cont_tokens": "7994d94bfa36d003" + }, + "truncated": 0, + "non-truncated": 772, + "padded": 772, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hashes": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "549f7e8adc5c21f4", + "hash_cont_tokens": "8f5c8baf02161f10" + }, + "truncated": 0, + "non-truncated": 1560, + "padded": 1560, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hashes": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "54a15ac61c3aa5a3", + "hash_cont_tokens": "a2c91752be5b1798" + }, + "truncated": 0, + "non-truncated": 1080, + "padded": 1080, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hashes": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "fc205d3accb58afd", + "hash_cont_tokens": "985403b262df21a4" + }, + "truncated": 0, + "non-truncated": 952, + "padded": 952, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_physics|5": { + "hashes": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "2b05ab961750bd9c", + "hash_cont_tokens": "db71da66ed82b921" + }, + "truncated": 0, + "non-truncated": 604, + "padded": 604, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hashes": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "128d162675f0eabe", + 
"hash_cont_tokens": "e81cf9738ad7e157" + }, + "truncated": 0, + "non-truncated": 2180, + "padded": 2180, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hashes": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "a18194d52366718c", + "hash_cont_tokens": "4a2d5f00cb00d9b7" + }, + "truncated": 4, + "non-truncated": 860, + "padded": 860, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hashes": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "f9cf04dd13d52ef0", + "hash_cont_tokens": "eab825cf8fbdd085" + }, + "truncated": 816, + "non-truncated": 0, + "padded": 0, + "non-padded": 816, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hashes": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "c178574cd0b659f3", + "hash_cont_tokens": "e9bcfaa6beefb456" + }, + "truncated": 948, + "non-truncated": 0, + "padded": 0, + "non-padded": 948, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_aging|5": { + "hashes": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "193364e509706d28", + "hash_cont_tokens": "38eafdb22e9fca11" + }, + "truncated": 0, + "non-truncated": 892, + "padded": 892, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_sexuality|5": { + "hashes": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "79ff63491e7055d7", + "hash_cont_tokens": "11de075f88fc7cd2" + }, + "truncated": 0, + "non-truncated": 524, + "padded": 524, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-international_law|5": { + "hashes": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "50da24d68bc01dcc", + "hash_cont_tokens": "6f8215a3de7eebd1" + }, + "truncated": 0, + "non-truncated": 484, + "padded": 484, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-jurisprudence|5": { + "hashes": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "9c87ef9ca710653a", + "hash_cont_tokens": "5c77c6f472688075" + }, + "truncated": 0, + "non-truncated": 432, + "padded": 432, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hashes": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "23477d89c414b562", + "hash_cont_tokens": "25a46284b3589e0d" + }, + "truncated": 0, + "non-truncated": 652, + "padded": 652, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-machine_learning|5": { + "hashes": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "25f39962f1d0d842", + "hash_cont_tokens": "aacac708cd4c5a61" + }, + "truncated": 0, + "non-truncated": 448, + "padded": 448, + "non-padded": 0, + 
"effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-management|5": { + "hashes": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "67152a3fd19f246f", + "hash_cont_tokens": "d37808f586a9e9b5" + }, + "truncated": 0, + "non-truncated": 412, + "padded": 412, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-marketing|5": { + "hashes": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "4a16cc6e7b111100", + "hash_cont_tokens": "95faf210efa02f90" + }, + "truncated": 0, + "non-truncated": 936, + "padded": 936, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-medical_genetics|5": { + "hashes": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "c33732a2243c9340", + "hash_cont_tokens": "844bd0bf669e8136" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-miscellaneous|5": { + "hashes": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "3d69d9503e063f99", + "hash_cont_tokens": "ef1ae838a09a7521" + }, + "truncated": 0, + "non-truncated": 3132, + "padded": 3132, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_disputes|5": { + "hashes": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "8216be86eeb0d710", + "hash_cont_tokens": "16b6c6e390eb7cea" + }, + "truncated": 0, + "non-truncated": 1384, + "padded": 1360, + "non-padded": 24, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hashes": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "14e3d63f795dc9a8", + "hash_cont_tokens": "4130880a19c4edb0" + }, + "truncated": 0, + "non-truncated": 3580, + "padded": 3580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-nutrition|5": { + "hashes": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "63d322de5cea7460", + "hash_cont_tokens": "96b81f570a84328b" + }, + "truncated": 0, + "non-truncated": 1224, + "padded": 1224, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-philosophy|5": { + "hashes": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "827bb390a272aa32", + "hash_cont_tokens": "dddff9925c9b675a" + }, + "truncated": 0, + "non-truncated": 1244, + "padded": 1244, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-prehistory|5": { + "hashes": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "ef8098661a2ccd4a", + "hash_cont_tokens": "e3a7592f84b44888" + }, + "truncated": 0, + "non-truncated": 1296, + "padded": 1296, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_accounting|5": { + "hashes": { + "hash_examples": "8d377597916cd07e", + 
"hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "80fc60332b662a25", + "hash_cont_tokens": "f9edf462e8201551" + }, + "truncated": 0, + "non-truncated": 1128, + "padded": 1128, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_law|5": { + "hashes": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "fa0a2c88615f3a18", + "hash_cont_tokens": "a2de48df0afbaff7" + }, + "truncated": 6136, + "non-truncated": 0, + "padded": 0, + "non-padded": 6136, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_medicine|5": { + "hashes": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "80768b5133707128", + "hash_cont_tokens": "ecf7754754c2bb76" + }, + "truncated": 1032, + "non-truncated": 56, + "padded": 48, + "non-padded": 1040, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_psychology|5": { + "hashes": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "3313ef98c0cf80f1", + "hash_cont_tokens": "30b07e31cf9b5c6f" + }, + "truncated": 0, + "non-truncated": 2448, + "padded": 2448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-public_relations|5": { + "hashes": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "6a8e48def6985c2c", + "hash_cont_tokens": "cf3600a50782c6c5" + }, + "truncated": 0, + "non-truncated": 440, + "padded": 440, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-security_studies|5": { + "hashes": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "90f8fbb6078c667b", + "hash_cont_tokens": "4d1dc7c4ad251829" + }, + "truncated": 980, + "non-truncated": 0, + "padded": 0, + "non-padded": 980, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-sociology|5": { + "hashes": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "fee87f2392545d9f", + "hash_cont_tokens": "d36b9d9f0f4424fe" + }, + "truncated": 0, + "non-truncated": 804, + "padded": 800, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hashes": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "6955799c308be8c2", + "hash_cont_tokens": "844bd0bf669e8136" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-virology|5": { + "hashes": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "81676e8ad48602dc", + "hash_cont_tokens": "30d4fa4828c5468f" + }, + "truncated": 0, + "non-truncated": 664, + "padded": 664, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-world_religions|5": { + "hashes": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "81646a7b3f910286", + "hash_cont_tokens": "a0a7af55ac7ae037" + }, + "truncated": 0, + 
"non-truncated": 684, + "padded": 684, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|truthfulqa:mc|0": { + "hashes": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "36e7c87cf66eb953", + "hash_cont_tokens": "725e836dedcc8d0f" + }, + "truncated": 0, + "non-truncated": 9996, + "padded": 9996, + "non-padded": 0, + "effective_few_shots": 0.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "d84d18e9a963753d", + "hash_full_prompts": "12b540783521a8e6", + "hash_input_tokens": "21c4bee89c13b64d", + "hash_cont_tokens": "2b2ca94437702477" + }, + "total_evaluation_time_secondes": "1546.2097175121307", + "truncated": 14391, + "non-truncated": 96628, + "padded": 96383, + "non-padded": 14636, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/ahxt/llama2_xs_460M_experimental/results_2023-10-24T21-46-15.240855.json b/eval-results/ahxt/llama2_xs_460M_experimental/results_2023-10-24T21-46-15.240855.json new file mode 100644 index 0000000000000000000000000000000000000000..a3d5ff452159149b2398f86870ffbf393010fa21 --- /dev/null +++ b/eval-results/ahxt/llama2_xs_460M_experimental/results_2023-10-24T21-46-15.240855.json @@ -0,0 +1,107 @@ +{ + "config_general": { + "model_name": "ahxt/llama2_xs_460M_experimental", + "model_sha": "c8db281477559f5c969a9be794ce236f8a99e1a0", + "model_size": "886.6 MB", + "model_dtype": "torch.float16", + "lighteval_sha": "0f318ecf002208468154899217b3ba7c6ae09374", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|drop|3": { + "em": 0.002936241610738255, + "em_stderr": 0.0005541113054709602, + "f1": 0.055131082214765176, + "f1_stderr": 0.0014074468297557536 + }, + "harness|gsm8k|5": { + "acc": 0.0, + "acc_stderr": 0.0 + }, + "harness|winogrande|5": { + "acc": 0.4988161010260458, + "acc_stderr": 0.014052446290529012 + }, + "all": { + "em": 0.002936241610738255, + "em_stderr": 0.0005541113054709602, + "f1": 0.055131082214765176, + "f1_stderr": 0.0014074468297557536, + "acc": 0.2494080505130229, + "acc_stderr": 0.007026223145264506 + } + }, + "versions": { + "harness|drop|3": 1, + "harness|gsm8k|5": 0, + "harness|winogrande|5": 0, + "all": 0 + }, + "config_tasks": { + "harness|drop": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|drop|3": { + "hashes": { + "hash_examples": "1d27416e8324e9a3", + "hash_full_prompts": "a5513ff9a741b385", + "hash_input_tokens": "eabc74d7251767a0", + "hash_cont_tokens": "097af914ec6dde1c" + }, + "truncated": 9290, + "non-truncated": 246, + "padded": 0, + "non-padded": 9536, + "effective_few_shots": 3.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "591c78b7a5bedc6e", + "hash_cont_tokens": "bc49e74c54576b14" + }, + "truncated": 988, + "non-truncated": 331, + "padded": 0, + "non-padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "f78705e6c36f831b", + "hash_cont_tokens": "64ca3ed9b5dacc6e" + }, + "truncated": 0, + "non-truncated": 2534, + "padded": 2426, + "non-padded": 
108, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "9b4d8993161e637d", + "hash_full_prompts": "08215e527b7e60a5", + "hash_input_tokens": "ec50143b19973705", + "hash_cont_tokens": "c0c0af1fe9a4853e" + }, + "total_evaluation_time_secondes": "7710.404353380203", + "truncated": 10278, + "non-truncated": 3111, + "padded": 2426, + "non-padded": 10963, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/aisquared/chopt-1_3b/results_2023-07-19T14-44-06.685040.json b/eval-results/aisquared/chopt-1_3b/results_2023-07-19T14-44-06.685040.json new file mode 100644 index 0000000000000000000000000000000000000000..92d04e62914f9aa6901ab1e3b7604de0409bff7f --- /dev/null +++ b/eval-results/aisquared/chopt-1_3b/results_2023-07-19T14-44-06.685040.json @@ -0,0 +1,871 @@ +{ + "results": { + "harness|arc:challenge|25": { + "acc": 0.2909556313993174, + "acc_stderr": 0.013273077865907585, + "acc_norm": 0.3148464163822526, + "acc_norm_stderr": 0.01357265770308495 + }, + "harness|hellaswag|10": { + "acc": 0.4298944433379805, + "acc_stderr": 0.004940490508240654, + "acc_norm": 0.5663214499103765, + "acc_norm_stderr": 0.004945691164810065 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.23, + "acc_stderr": 0.04229525846816506, + "acc_norm": 0.23, + "acc_norm_stderr": 0.04229525846816506 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.3111111111111111, + "acc_stderr": 0.039992628766177214, + "acc_norm": 0.3111111111111111, + "acc_norm_stderr": 0.039992628766177214 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.21052631578947367, + "acc_stderr": 0.03317672787533157, + "acc_norm": 0.21052631578947367, + "acc_norm_stderr": 0.03317672787533157 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.25, + "acc_stderr": 0.04351941398892446, + "acc_norm": 0.25, + "acc_norm_stderr": 0.04351941398892446 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.2490566037735849, + "acc_stderr": 0.026616482980501708, + "acc_norm": 0.2490566037735849, + "acc_norm_stderr": 0.026616482980501708 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.2638888888888889, + "acc_stderr": 0.03685651095897532, + "acc_norm": 0.2638888888888889, + "acc_norm_stderr": 0.03685651095897532 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.19, + "acc_stderr": 0.03942772444036623, + "acc_norm": 0.19, + "acc_norm_stderr": 0.03942772444036623 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.36, + "acc_stderr": 0.04824181513244218, + "acc_norm": 0.36, + "acc_norm_stderr": 0.04824181513244218 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.21, + "acc_stderr": 0.040936018074033256, + "acc_norm": 0.21, + "acc_norm_stderr": 0.040936018074033256 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.23121387283236994, + "acc_stderr": 0.0321473730202947, + "acc_norm": 0.23121387283236994, + "acc_norm_stderr": 0.0321473730202947 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.20588235294117646, + "acc_stderr": 0.04023382273617749, + "acc_norm": 0.20588235294117646, + "acc_norm_stderr": 0.04023382273617749 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.25, + "acc_stderr": 0.04351941398892446, + "acc_norm": 0.25, + "acc_norm_stderr": 0.04351941398892446 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.2, + "acc_stderr": 0.0261488180184245, + "acc_norm": 0.2, + 
"acc_norm_stderr": 0.0261488180184245 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.2894736842105263, + "acc_stderr": 0.04266339443159394, + "acc_norm": 0.2894736842105263, + "acc_norm_stderr": 0.04266339443159394 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.2620689655172414, + "acc_stderr": 0.036646663372252565, + "acc_norm": 0.2620689655172414, + "acc_norm_stderr": 0.036646663372252565 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.2698412698412698, + "acc_stderr": 0.02286083830923207, + "acc_norm": 0.2698412698412698, + "acc_norm_stderr": 0.02286083830923207 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.23809523809523808, + "acc_stderr": 0.03809523809523811, + "acc_norm": 0.23809523809523808, + "acc_norm_stderr": 0.03809523809523811 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.31, + "acc_stderr": 0.04648231987117316, + "acc_norm": 0.31, + "acc_norm_stderr": 0.04648231987117316 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.24516129032258063, + "acc_stderr": 0.024472243840895518, + "acc_norm": 0.24516129032258063, + "acc_norm_stderr": 0.024472243840895518 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.2315270935960591, + "acc_stderr": 0.02967833314144444, + "acc_norm": 0.2315270935960591, + "acc_norm_stderr": 0.02967833314144444 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.29, + "acc_stderr": 0.045604802157206845, + "acc_norm": 0.29, + "acc_norm_stderr": 0.045604802157206845 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.2909090909090909, + "acc_stderr": 0.03546563019624337, + "acc_norm": 0.2909090909090909, + "acc_norm_stderr": 0.03546563019624337 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.2222222222222222, + "acc_stderr": 0.02962022787479049, + "acc_norm": 0.2222222222222222, + "acc_norm_stderr": 0.02962022787479049 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.22797927461139897, + "acc_stderr": 0.03027690994517825, + "acc_norm": 0.22797927461139897, + "acc_norm_stderr": 0.03027690994517825 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.258974358974359, + "acc_stderr": 0.02221110681006166, + "acc_norm": 0.258974358974359, + "acc_norm_stderr": 0.02221110681006166 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.21481481481481482, + "acc_stderr": 0.025040443877000683, + "acc_norm": 0.21481481481481482, + "acc_norm_stderr": 0.025040443877000683 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.18907563025210083, + "acc_stderr": 0.02543511943810537, + "acc_norm": 0.18907563025210083, + "acc_norm_stderr": 0.02543511943810537 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.271523178807947, + "acc_stderr": 0.036313298039696525, + "acc_norm": 0.271523178807947, + "acc_norm_stderr": 0.036313298039696525 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.3211009174311927, + "acc_stderr": 0.020018149772733744, + "acc_norm": 0.3211009174311927, + "acc_norm_stderr": 0.020018149772733744 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.2777777777777778, + "acc_stderr": 0.03054674526495319, + "acc_norm": 0.2777777777777778, + "acc_norm_stderr": 0.03054674526495319 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.22058823529411764, + "acc_stderr": 0.02910225438967408, + "acc_norm": 
0.22058823529411764, + "acc_norm_stderr": 0.02910225438967408 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.24050632911392406, + "acc_stderr": 0.02782078198114968, + "acc_norm": 0.24050632911392406, + "acc_norm_stderr": 0.02782078198114968 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.23766816143497757, + "acc_stderr": 0.028568079464714263, + "acc_norm": 0.23766816143497757, + "acc_norm_stderr": 0.028568079464714263 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.2748091603053435, + "acc_stderr": 0.039153454088478354, + "acc_norm": 0.2748091603053435, + "acc_norm_stderr": 0.039153454088478354 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.3140495867768595, + "acc_stderr": 0.04236964753041017, + "acc_norm": 0.3140495867768595, + "acc_norm_stderr": 0.04236964753041017 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.2037037037037037, + "acc_stderr": 0.03893542518824847, + "acc_norm": 0.2037037037037037, + "acc_norm_stderr": 0.03893542518824847 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.22085889570552147, + "acc_stderr": 0.03259177392742177, + "acc_norm": 0.22085889570552147, + "acc_norm_stderr": 0.03259177392742177 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.19642857142857142, + "acc_stderr": 0.037709700493470194, + "acc_norm": 0.19642857142857142, + "acc_norm_stderr": 0.037709700493470194 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.3786407766990291, + "acc_stderr": 0.04802694698258972, + "acc_norm": 0.3786407766990291, + "acc_norm_stderr": 0.04802694698258972 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.24786324786324787, + "acc_stderr": 0.028286324075564383, + "acc_norm": 0.24786324786324787, + "acc_norm_stderr": 0.028286324075564383 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.27, + "acc_stderr": 0.0446196043338474, + "acc_norm": 0.27, + "acc_norm_stderr": 0.0446196043338474 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.25287356321839083, + "acc_stderr": 0.01554337731371968, + "acc_norm": 0.25287356321839083, + "acc_norm_stderr": 0.01554337731371968 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.27167630057803466, + "acc_stderr": 0.023948512905468365, + "acc_norm": 0.27167630057803466, + "acc_norm_stderr": 0.023948512905468365 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.2681564245810056, + "acc_stderr": 0.014816119635317003, + "acc_norm": 0.2681564245810056, + "acc_norm_stderr": 0.014816119635317003 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.23202614379084968, + "acc_stderr": 0.02417084087934102, + "acc_norm": 0.23202614379084968, + "acc_norm_stderr": 0.02417084087934102 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.2347266881028939, + "acc_stderr": 0.024071805887677045, + "acc_norm": 0.2347266881028939, + "acc_norm_stderr": 0.024071805887677045 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.25925925925925924, + "acc_stderr": 0.024383665531035447, + "acc_norm": 0.25925925925925924, + "acc_norm_stderr": 0.024383665531035447 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.2695035460992908, + "acc_stderr": 0.026469036818590627, + "acc_norm": 0.2695035460992908, + "acc_norm_stderr": 0.026469036818590627 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.2757496740547588, + "acc_stderr": 0.01141381360916099, + "acc_norm": 0.2757496740547588, + "acc_norm_stderr": 0.01141381360916099 + }, + 
"harness|hendrycksTest-professional_medicine|5": { + "acc": 0.18382352941176472, + "acc_stderr": 0.023529242185193113, + "acc_norm": 0.18382352941176472, + "acc_norm_stderr": 0.023529242185193113 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.2679738562091503, + "acc_stderr": 0.017917974069594722, + "acc_norm": 0.2679738562091503, + "acc_norm_stderr": 0.017917974069594722 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.3090909090909091, + "acc_stderr": 0.044262946482000985, + "acc_norm": 0.3090909090909091, + "acc_norm_stderr": 0.044262946482000985 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.17959183673469387, + "acc_stderr": 0.024573293589585637, + "acc_norm": 0.17959183673469387, + "acc_norm_stderr": 0.024573293589585637 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.24875621890547264, + "acc_stderr": 0.030567675938916707, + "acc_norm": 0.24875621890547264, + "acc_norm_stderr": 0.030567675938916707 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.35, + "acc_stderr": 0.04793724854411018, + "acc_norm": 0.35, + "acc_norm_stderr": 0.04793724854411018 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.2469879518072289, + "acc_stderr": 0.03357351982064536, + "acc_norm": 0.2469879518072289, + "acc_norm_stderr": 0.03357351982064536 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.2222222222222222, + "acc_stderr": 0.03188578017686398, + "acc_norm": 0.2222222222222222, + "acc_norm_stderr": 0.03188578017686398 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.23990208078335373, + "mc1_stderr": 0.01494881267906214, + "mc2": 0.401922831966366, + "mc2_stderr": 0.015447511030604369 + }, + "all": { + "acc": 0.25712896304828714, + "acc_stderr": 0.03167806588310976, + "acc_norm": 0.2598462137526148, + "acc_norm_stderr": 0.031683231654190215, + "mc1": 0.23990208078335373, + "mc1_stderr": 0.01494881267906214, + "mc2": 0.401922831966366, + "mc2_stderr": 0.015447511030604369 + } + }, + "versions": { + "harness|arc:challenge|25": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + 
"harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "all": 0 + }, + "config": { + "model_name": "aisquared/chopt-1_3b", + "model_sha": "fdd3691978f557baf9d1c20d4ede900c47f7e135", + "model_dtype": "torch.float16", + "lighteval_sha": "43cff840721bd0214adb4e29236a5e2ca1813937", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null + }, + "task_config": { + "harness|arc:challenge": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + 
"harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task" + }, + "hashes": { + "harness|arc:challenge|25": { + "hash_examples": "fb8c51b1872daeda", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "2e52476df896898b", + "hash_cont_tokens": "28e2701291693338" + }, + "harness|hellaswag|10": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "a5079f2e8402bdc3", + "hash_cont_tokens": "30e348bce778fa10" + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "094c3a171105c12e", + "hash_cont_tokens": "65115fc130126941" + }, + "harness|hendrycksTest-anatomy|5": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "fe68bfcf91b9075e", + "hash_cont_tokens": "705516ff46ec26dc" + }, + "harness|hendrycksTest-astronomy|5": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "4d77ecaf04a26dfe", + "hash_cont_tokens": "881af7bd65854d45" + }, + "harness|hendrycksTest-business_ethics|5": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "7353edcfcf72d221", + "hash_cont_tokens": "e760cc7be5ddbe71" + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "162bb9f7b3cd706e", + "hash_cont_tokens": 
"37477257cf9eeb0a" + }, + "harness|hendrycksTest-college_biology|5": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "63d442b13b5d85b6", + "hash_cont_tokens": "3f04694ac6f92548" + }, + "harness|hendrycksTest-college_chemistry|5": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "99db48cd6b077b68", + "hash_cont_tokens": "65115fc130126941" + }, + "harness|hendrycksTest-college_computer_science|5": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "4bc7d55623070a07", + "hash_cont_tokens": "15b2112308ef7b2b" + }, + "harness|hendrycksTest-college_mathematics|5": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "e83395ed75fa03d5", + "hash_cont_tokens": "a67ba9facbae0268" + }, + "harness|hendrycksTest-college_medicine|5": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "7f508f7828fe5ba6", + "hash_cont_tokens": "40630b2e3e33ca08" + }, + "harness|hendrycksTest-college_physics|5": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "0fb01b8731db8d81", + "hash_cont_tokens": "4085a0ba4a98cf79" + }, + "harness|hendrycksTest-computer_security|5": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "8c8460fe570b556e", + "hash_cont_tokens": "65115fc130126941" + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "16e0aa20b920aa11", + "hash_cont_tokens": "f15de85dda56bf9a" + }, + "harness|hendrycksTest-econometrics|5": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "bc236ab739e1c15b", + "hash_cont_tokens": "35b673589f562c55" + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "eec634c59e67082e", + "hash_cont_tokens": "1fec337497bf988f" + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "551d76303aaf3f4e", + "hash_cont_tokens": "85d6a2e58f1aa799" + }, + "harness|hendrycksTest-formal_logic|5": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "532728846623b114", + "hash_cont_tokens": "6a362d8f09b66319" + }, + "harness|hendrycksTest-global_facts|5": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "8aaecba1a0475c64", + "hash_cont_tokens": "65115fc130126941" + }, + "harness|hendrycksTest-high_school_biology|5": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "2afe2320ca29933a", + "hash_cont_tokens": "7186426999d40201" + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "2ba3b67fb2446a06", + "hash_cont_tokens": "97e729fbed631d26" + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "10e55771dbb42b2c", + "hash_cont_tokens": 
"2d5af91609bd4d0d" + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "6d8596e5edbe236d", + "hash_cont_tokens": "2553c38072fe59e9" + }, + "harness|hendrycksTest-high_school_geography|5": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "3fb9fd43f1792a28", + "hash_cont_tokens": "967f1a6377c5dada" + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "51f21e325fe493bc", + "hash_cont_tokens": "5cbe4530fc364ed8" + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "78a8e9b40bc5418c", + "hash_cont_tokens": "3c15870aa9a751c8" + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "44525d3009ded4a4", + "hash_cont_tokens": "75f6aa84e7959e70" + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "76e98460e3320e1c", + "hash_cont_tokens": "7bfc49a85b0e6b0f" + }, + "harness|hendrycksTest-high_school_physics|5": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "f47dbaece0632444", + "hash_cont_tokens": "5ced294bf867b6fa" + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "d685add8792a69d2", + "hash_cont_tokens": "9ffbe637167399d6" + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "10fa751069aea803", + "hash_cont_tokens": "25c58237091f9ea7" + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "2b245a8312dd0ee8", + "hash_cont_tokens": "19500e048c94127a" + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "fa3b5b3bf631cd40", + "hash_cont_tokens": "0135bf601685a8b0" + }, + "harness|hendrycksTest-human_aging|5": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "a7cc14eb97a963c1", + "hash_cont_tokens": "350bc807db8602e4" + }, + "harness|hendrycksTest-human_sexuality|5": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "5a27a3a18e11300c", + "hash_cont_tokens": "944bf06e08c9e841" + }, + "harness|hendrycksTest-international_law|5": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "5355beafda861ea0", + "hash_cont_tokens": "a9ec061d9a865f49" + }, + "harness|hendrycksTest-jurisprudence|5": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "85bf654d3221129b", + "hash_cont_tokens": "3813b356ad4675eb" + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": 
"5f8c6e6a21145296", + "hash_cont_tokens": "4250ef4e0ecec581" + }, + "harness|hendrycksTest-machine_learning|5": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "1cf278ba4dac7b93", + "hash_cont_tokens": "c4fb7cc44b48985a" + }, + "harness|hendrycksTest-management|5": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "67df50e49cb50049", + "hash_cont_tokens": "f6301f26d3421bfe" + }, + "harness|hendrycksTest-marketing|5": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "e254e479a1dd95e6", + "hash_cont_tokens": "4bea1308c2dedd32" + }, + "harness|hendrycksTest-medical_genetics|5": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "836b977dd80307df", + "hash_cont_tokens": "65115fc130126941" + }, + "harness|hendrycksTest-miscellaneous|5": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "3d9d2c0b97a586f9", + "hash_cont_tokens": "d87f2c7e8fda82f9" + }, + "harness|hendrycksTest-moral_disputes|5": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "b354e905172e9a92", + "hash_cont_tokens": "098675117a7f6f77" + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "e0f5580d6e0bd639", + "hash_cont_tokens": "bd59c34597b05651" + }, + "harness|hendrycksTest-nutrition|5": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "e66c2273b0b50f8a", + "hash_cont_tokens": "03bcb0a0f9d4f331" + }, + "harness|hendrycksTest-philosophy|5": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "72c74dca625bae21", + "hash_cont_tokens": "4b9e620ce1055d4a" + }, + "harness|hendrycksTest-prehistory|5": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "139ea332c437abef", + "hash_cont_tokens": "3f04832c8adc4e0a" + }, + "harness|hendrycksTest-professional_accounting|5": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "9e4929005482ae10", + "hash_cont_tokens": "767ed1231cb8e258" + }, + "harness|hendrycksTest-professional_law|5": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "7105767805e28747", + "hash_cont_tokens": "f0b059007537e041" + }, + "harness|hendrycksTest-professional_medicine|5": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "f04f0a03ea895b5b", + "hash_cont_tokens": "3bc5fb58666e5e8b" + }, + "harness|hendrycksTest-professional_psychology|5": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "46fbbd942e3b6db5", + "hash_cont_tokens": "190e8f92d03650fe" + }, + "harness|hendrycksTest-public_relations|5": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "4b9217ec408da4d4", + "hash_cont_tokens": "1bda889eaab363c0" + }, + "harness|hendrycksTest-security_studies|5": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "9eadb993a592c2bf", + "hash_cont_tokens": "859ddf07f8d0ab66" + }, + 
"harness|hendrycksTest-sociology|5": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "18f0e119974d9136", + "hash_cont_tokens": "7fdcb74bc758e7bd" + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "9a26a58deec29cba", + "hash_cont_tokens": "65115fc130126941" + }, + "harness|hendrycksTest-virology|5": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "4b0d85cf3b0bf65b", + "hash_cont_tokens": "456a90466d8efd2a" + }, + "harness|hendrycksTest-world_religions|5": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "b0e8f149dfd2fa76", + "hash_cont_tokens": "6d21235f853c8d4b" + }, + "harness|truthfulqa:mc|0": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "6e0e57e58e2d03ff", + "hash_cont_tokens": "a67a79a7e9449644" + } + } +} \ No newline at end of file diff --git a/eval-results/aisquared/chopt-1_3b/results_2023-10-25T02-11-14.117719.json b/eval-results/aisquared/chopt-1_3b/results_2023-10-25T02-11-14.117719.json new file mode 100644 index 0000000000000000000000000000000000000000..20fa10c86d9323430bf8c5e9e4340347701b16ce --- /dev/null +++ b/eval-results/aisquared/chopt-1_3b/results_2023-10-25T02-11-14.117719.json @@ -0,0 +1,107 @@ +{ + "config_general": { + "model_name": "aisquared/chopt-1_3b", + "model_sha": "fdd3691978f557baf9d1c20d4ede900c47f7e135", + "model_size": "2.45 GB", + "model_dtype": "torch.float16", + "lighteval_sha": "0f318ecf002208468154899217b3ba7c6ae09374", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|drop|3": { + "em": 0.002936241610738255, + "em_stderr": 0.0005541113054710093, + "f1": 0.046667365771812144, + "f1_stderr": 0.0012971244615236355 + }, + "harness|gsm8k|5": { + "acc": 0.0, + "acc_stderr": 0.0 + }, + "harness|winogrande|5": { + "acc": 0.5824782951854776, + "acc_stderr": 0.013859978264440248 + }, + "all": { + "em": 0.002936241610738255, + "em_stderr": 0.0005541113054710093, + "f1": 0.046667365771812144, + "f1_stderr": 0.0012971244615236355, + "acc": 0.2912391475927388, + "acc_stderr": 0.006929989132220124 + } + }, + "versions": { + "harness|drop|3": 1, + "harness|gsm8k|5": 0, + "harness|winogrande|5": 0, + "all": 0 + }, + "config_tasks": { + "harness|drop": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|drop|3": { + "hashes": { + "hash_examples": "1d27416e8324e9a3", + "hash_full_prompts": "a5513ff9a741b385", + "hash_input_tokens": "e74b23fd6ab24722", + "hash_cont_tokens": "19e4f6e5fc75dd7c" + }, + "truncated": 384, + "non-truncated": 9152, + "padded": 0, + "non-padded": 9536, + "effective_few_shots": 3.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "a2243014cab6a7a0", + "hash_cont_tokens": "9c1847969a6412e0" + }, + "truncated": 0, + "non-truncated": 1319, + "padded": 0, + "non-padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": 
"0a8020a0b9bd626c", + "hash_cont_tokens": "d75b4039559457e2" + }, + "truncated": 0, + "non-truncated": 2534, + "padded": 2426, + "non-padded": 108, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "9b4d8993161e637d", + "hash_full_prompts": "08215e527b7e60a5", + "hash_input_tokens": "409bf3c4619f5fc0", + "hash_cont_tokens": "aedda0b5f4ce488b" + }, + "total_evaluation_time_secondes": "7992.957913398743", + "truncated": 384, + "non-truncated": 13005, + "padded": 2426, + "non-padded": 10963, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/aisquared/chopt-2_7b/results_2023-07-19T16-07-47.560826.json b/eval-results/aisquared/chopt-2_7b/results_2023-07-19T16-07-47.560826.json new file mode 100644 index 0000000000000000000000000000000000000000..f5f48fc2a9dd3b95e11224caf899ae80c25e9c3a --- /dev/null +++ b/eval-results/aisquared/chopt-2_7b/results_2023-07-19T16-07-47.560826.json @@ -0,0 +1,871 @@ +{ + "results": { + "harness|arc:challenge|25": { + "acc": 0.3464163822525597, + "acc_stderr": 0.013905011180063247, + "acc_norm": 0.36006825938566556, + "acc_norm_stderr": 0.014027516814585188 + }, + "harness|hellaswag|10": { + "acc": 0.47699661422027484, + "acc_stderr": 0.004984497871025248, + "acc_norm": 0.6338378809002191, + "acc_norm_stderr": 0.0048076995399734266 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.27, + "acc_stderr": 0.04461960433384739, + "acc_norm": 0.27, + "acc_norm_stderr": 0.04461960433384739 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.28888888888888886, + "acc_stderr": 0.0391545063041425, + "acc_norm": 0.28888888888888886, + "acc_norm_stderr": 0.0391545063041425 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.18421052631578946, + "acc_stderr": 0.0315469804508223, + "acc_norm": 0.18421052631578946, + "acc_norm_stderr": 0.0315469804508223 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.28, + "acc_stderr": 0.04512608598542127, + "acc_norm": 0.28, + "acc_norm_stderr": 0.04512608598542127 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.29056603773584905, + "acc_stderr": 0.027943219989337128, + "acc_norm": 0.29056603773584905, + "acc_norm_stderr": 0.027943219989337128 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.24305555555555555, + "acc_stderr": 0.03586879280080341, + "acc_norm": 0.24305555555555555, + "acc_norm_stderr": 0.03586879280080341 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.22, + "acc_stderr": 0.041633319989322695, + "acc_norm": 0.22, + "acc_norm_stderr": 0.041633319989322695 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.3, + "acc_stderr": 0.04605661864718381, + "acc_norm": 0.3, + "acc_norm_stderr": 0.04605661864718381 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.17, + "acc_stderr": 0.03775251680686371, + "acc_norm": 0.17, + "acc_norm_stderr": 0.03775251680686371 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.2254335260115607, + "acc_stderr": 0.03186209851641144, + "acc_norm": 0.2254335260115607, + "acc_norm_stderr": 0.03186209851641144 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.20588235294117646, + "acc_stderr": 0.04023382273617749, + "acc_norm": 0.20588235294117646, + "acc_norm_stderr": 0.04023382273617749 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.23, + "acc_stderr": 0.04229525846816505, + "acc_norm": 0.23, + "acc_norm_stderr": 
0.04229525846816505 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.24680851063829787, + "acc_stderr": 0.028185441301234078, + "acc_norm": 0.24680851063829787, + "acc_norm_stderr": 0.028185441301234078 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.23684210526315788, + "acc_stderr": 0.03999423879281336, + "acc_norm": 0.23684210526315788, + "acc_norm_stderr": 0.03999423879281336 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.21379310344827587, + "acc_stderr": 0.0341652044774755, + "acc_norm": 0.21379310344827587, + "acc_norm_stderr": 0.0341652044774755 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.25132275132275134, + "acc_stderr": 0.022340482339643895, + "acc_norm": 0.25132275132275134, + "acc_norm_stderr": 0.022340482339643895 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.31746031746031744, + "acc_stderr": 0.041634530313028585, + "acc_norm": 0.31746031746031744, + "acc_norm_stderr": 0.041634530313028585 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.32, + "acc_stderr": 0.046882617226215034, + "acc_norm": 0.32, + "acc_norm_stderr": 0.046882617226215034 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.24193548387096775, + "acc_stderr": 0.024362599693031093, + "acc_norm": 0.24193548387096775, + "acc_norm_stderr": 0.024362599693031093 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.27586206896551724, + "acc_stderr": 0.03144712581678242, + "acc_norm": 0.27586206896551724, + "acc_norm_stderr": 0.03144712581678242 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.3, + "acc_stderr": 0.046056618647183814, + "acc_norm": 0.3, + "acc_norm_stderr": 0.046056618647183814 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.23636363636363636, + "acc_stderr": 0.033175059300091805, + "acc_norm": 0.23636363636363636, + "acc_norm_stderr": 0.033175059300091805 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.14646464646464646, + "acc_stderr": 0.025190921114603925, + "acc_norm": 0.14646464646464646, + "acc_norm_stderr": 0.025190921114603925 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.20207253886010362, + "acc_stderr": 0.02897908979429673, + "acc_norm": 0.20207253886010362, + "acc_norm_stderr": 0.02897908979429673 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.258974358974359, + "acc_stderr": 0.022211106810061658, + "acc_norm": 0.258974358974359, + "acc_norm_stderr": 0.022211106810061658 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.24074074074074073, + "acc_stderr": 0.02606715922227579, + "acc_norm": 0.24074074074074073, + "acc_norm_stderr": 0.02606715922227579 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.22268907563025211, + "acc_stderr": 0.027025433498882378, + "acc_norm": 0.22268907563025211, + "acc_norm_stderr": 0.027025433498882378 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.23841059602649006, + "acc_stderr": 0.0347918557259966, + "acc_norm": 0.23841059602649006, + "acc_norm_stderr": 0.0347918557259966 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.26605504587155965, + "acc_stderr": 0.01894602232222559, + "acc_norm": 0.26605504587155965, + "acc_norm_stderr": 0.01894602232222559 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.23148148148148148, + "acc_stderr": 0.02876511171804696, + "acc_norm": 
0.23148148148148148, + "acc_norm_stderr": 0.02876511171804696 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.22058823529411764, + "acc_stderr": 0.029102254389674082, + "acc_norm": 0.22058823529411764, + "acc_norm_stderr": 0.029102254389674082 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.2489451476793249, + "acc_stderr": 0.028146970599422644, + "acc_norm": 0.2489451476793249, + "acc_norm_stderr": 0.028146970599422644 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.33183856502242154, + "acc_stderr": 0.031602951437766785, + "acc_norm": 0.33183856502242154, + "acc_norm_stderr": 0.031602951437766785 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.25190839694656486, + "acc_stderr": 0.03807387116306086, + "acc_norm": 0.25190839694656486, + "acc_norm_stderr": 0.03807387116306086 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.23140495867768596, + "acc_stderr": 0.03849856098794088, + "acc_norm": 0.23140495867768596, + "acc_norm_stderr": 0.03849856098794088 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.25, + "acc_stderr": 0.04186091791394607, + "acc_norm": 0.25, + "acc_norm_stderr": 0.04186091791394607 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.25766871165644173, + "acc_stderr": 0.03436150827846917, + "acc_norm": 0.25766871165644173, + "acc_norm_stderr": 0.03436150827846917 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.375, + "acc_stderr": 0.04595091388086298, + "acc_norm": 0.375, + "acc_norm_stderr": 0.04595091388086298 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.30097087378640774, + "acc_stderr": 0.045416094465039476, + "acc_norm": 0.30097087378640774, + "acc_norm_stderr": 0.045416094465039476 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.24358974358974358, + "acc_stderr": 0.028120966503914407, + "acc_norm": 0.24358974358974358, + "acc_norm_stderr": 0.028120966503914407 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.26, + "acc_stderr": 0.0440844002276808, + "acc_norm": 0.26, + "acc_norm_stderr": 0.0440844002276808 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.30779054916985954, + "acc_stderr": 0.016506045045155626, + "acc_norm": 0.30779054916985954, + "acc_norm_stderr": 0.016506045045155626 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.24566473988439305, + "acc_stderr": 0.02317629820399201, + "acc_norm": 0.24566473988439305, + "acc_norm_stderr": 0.02317629820399201 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.21452513966480447, + "acc_stderr": 0.013728923407828815, + "acc_norm": 0.21452513966480447, + "acc_norm_stderr": 0.013728923407828815 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.24509803921568626, + "acc_stderr": 0.02463004897982475, + "acc_norm": 0.24509803921568626, + "acc_norm_stderr": 0.02463004897982475 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.2765273311897106, + "acc_stderr": 0.025403832978179604, + "acc_norm": 0.2765273311897106, + "acc_norm_stderr": 0.025403832978179604 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.24691358024691357, + "acc_stderr": 0.023993501709042103, + "acc_norm": 0.24691358024691357, + "acc_norm_stderr": 0.023993501709042103 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.25177304964539005, + "acc_stderr": 0.0258921511567094, + "acc_norm": 0.25177304964539005, + "acc_norm_stderr": 0.0258921511567094 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 
0.23402868318122555, + "acc_stderr": 0.010813585552659677, + "acc_norm": 0.23402868318122555, + "acc_norm_stderr": 0.010813585552659677 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.25, + "acc_stderr": 0.026303648393696036, + "acc_norm": 0.25, + "acc_norm_stderr": 0.026303648393696036 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.25980392156862747, + "acc_stderr": 0.017740899509177795, + "acc_norm": 0.25980392156862747, + "acc_norm_stderr": 0.017740899509177795 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.3090909090909091, + "acc_stderr": 0.044262946482000985, + "acc_norm": 0.3090909090909091, + "acc_norm_stderr": 0.044262946482000985 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.21224489795918366, + "acc_stderr": 0.026176967197866767, + "acc_norm": 0.21224489795918366, + "acc_norm_stderr": 0.026176967197866767 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.24875621890547264, + "acc_stderr": 0.030567675938916718, + "acc_norm": 0.24875621890547264, + "acc_norm_stderr": 0.030567675938916718 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.29, + "acc_stderr": 0.045604802157206845, + "acc_norm": 0.29, + "acc_norm_stderr": 0.045604802157206845 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.3313253012048193, + "acc_stderr": 0.036643147772880864, + "acc_norm": 0.3313253012048193, + "acc_norm_stderr": 0.036643147772880864 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.25146198830409355, + "acc_stderr": 0.033275044238468436, + "acc_norm": 0.25146198830409355, + "acc_norm_stderr": 0.033275044238468436 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.2350061199510404, + "mc1_stderr": 0.014843061507731613, + "mc2": 0.37706019247158395, + "mc2_stderr": 0.015375049922408362 + }, + "all": { + "acc": 0.2597567004608136, + "acc_stderr": 0.03174816747059085, + "acc_norm": 0.2626464147627297, + "acc_norm_stderr": 0.03174724725539543, + "mc1": 0.2350061199510404, + "mc1_stderr": 0.014843061507731613, + "mc2": 0.37706019247158395, + "mc2_stderr": 0.015375049922408362 + } + }, + "versions": { + "harness|arc:challenge|25": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + 
"harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "all": 0 + }, + "config": { + "model_name": "aisquared/chopt-2_7b", + "model_sha": "45f57352c10a1fb1ec13c4bf387a15552ca1fe65", + "model_dtype": "torch.float16", + "lighteval_sha": "43cff840721bd0214adb4e29236a5e2ca1813937", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null + }, + "task_config": { + "harness|arc:challenge": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness 
task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task" + }, + "hashes": { + "harness|arc:challenge|25": { + "hash_examples": "fb8c51b1872daeda", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "2e52476df896898b", + "hash_cont_tokens": "28e2701291693338" + }, + "harness|hellaswag|10": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "a5079f2e8402bdc3", + "hash_cont_tokens": "30e348bce778fa10" + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "094c3a171105c12e", + "hash_cont_tokens": "65115fc130126941" + }, + "harness|hendrycksTest-anatomy|5": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "fe68bfcf91b9075e", + "hash_cont_tokens": "705516ff46ec26dc" + }, + "harness|hendrycksTest-astronomy|5": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "4d77ecaf04a26dfe", + "hash_cont_tokens": "881af7bd65854d45" + }, + "harness|hendrycksTest-business_ethics|5": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "7353edcfcf72d221", + "hash_cont_tokens": "e760cc7be5ddbe71" + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": 
"49654f71d94b65c3", + "hash_input_tokens": "162bb9f7b3cd706e", + "hash_cont_tokens": "37477257cf9eeb0a" + }, + "harness|hendrycksTest-college_biology|5": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "63d442b13b5d85b6", + "hash_cont_tokens": "3f04694ac6f92548" + }, + "harness|hendrycksTest-college_chemistry|5": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "99db48cd6b077b68", + "hash_cont_tokens": "65115fc130126941" + }, + "harness|hendrycksTest-college_computer_science|5": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "4bc7d55623070a07", + "hash_cont_tokens": "15b2112308ef7b2b" + }, + "harness|hendrycksTest-college_mathematics|5": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "e83395ed75fa03d5", + "hash_cont_tokens": "a67ba9facbae0268" + }, + "harness|hendrycksTest-college_medicine|5": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "7f508f7828fe5ba6", + "hash_cont_tokens": "40630b2e3e33ca08" + }, + "harness|hendrycksTest-college_physics|5": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "0fb01b8731db8d81", + "hash_cont_tokens": "4085a0ba4a98cf79" + }, + "harness|hendrycksTest-computer_security|5": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "8c8460fe570b556e", + "hash_cont_tokens": "65115fc130126941" + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "16e0aa20b920aa11", + "hash_cont_tokens": "f15de85dda56bf9a" + }, + "harness|hendrycksTest-econometrics|5": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "bc236ab739e1c15b", + "hash_cont_tokens": "35b673589f562c55" + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "eec634c59e67082e", + "hash_cont_tokens": "1fec337497bf988f" + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "551d76303aaf3f4e", + "hash_cont_tokens": "85d6a2e58f1aa799" + }, + "harness|hendrycksTest-formal_logic|5": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "532728846623b114", + "hash_cont_tokens": "6a362d8f09b66319" + }, + "harness|hendrycksTest-global_facts|5": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "8aaecba1a0475c64", + "hash_cont_tokens": "65115fc130126941" + }, + "harness|hendrycksTest-high_school_biology|5": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "2afe2320ca29933a", + "hash_cont_tokens": "7186426999d40201" + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "2ba3b67fb2446a06", + "hash_cont_tokens": "97e729fbed631d26" + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": 
"c00487e67c1813cc", + "hash_input_tokens": "10e55771dbb42b2c", + "hash_cont_tokens": "2d5af91609bd4d0d" + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "6d8596e5edbe236d", + "hash_cont_tokens": "2553c38072fe59e9" + }, + "harness|hendrycksTest-high_school_geography|5": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "3fb9fd43f1792a28", + "hash_cont_tokens": "967f1a6377c5dada" + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "51f21e325fe493bc", + "hash_cont_tokens": "5cbe4530fc364ed8" + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "78a8e9b40bc5418c", + "hash_cont_tokens": "3c15870aa9a751c8" + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "44525d3009ded4a4", + "hash_cont_tokens": "75f6aa84e7959e70" + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "76e98460e3320e1c", + "hash_cont_tokens": "7bfc49a85b0e6b0f" + }, + "harness|hendrycksTest-high_school_physics|5": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "f47dbaece0632444", + "hash_cont_tokens": "5ced294bf867b6fa" + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "d685add8792a69d2", + "hash_cont_tokens": "9ffbe637167399d6" + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "10fa751069aea803", + "hash_cont_tokens": "25c58237091f9ea7" + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "2b245a8312dd0ee8", + "hash_cont_tokens": "19500e048c94127a" + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "fa3b5b3bf631cd40", + "hash_cont_tokens": "0135bf601685a8b0" + }, + "harness|hendrycksTest-human_aging|5": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "a7cc14eb97a963c1", + "hash_cont_tokens": "350bc807db8602e4" + }, + "harness|hendrycksTest-human_sexuality|5": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "5a27a3a18e11300c", + "hash_cont_tokens": "944bf06e08c9e841" + }, + "harness|hendrycksTest-international_law|5": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "5355beafda861ea0", + "hash_cont_tokens": "a9ec061d9a865f49" + }, + "harness|hendrycksTest-jurisprudence|5": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "85bf654d3221129b", + "hash_cont_tokens": "3813b356ad4675eb" + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hash_examples": 
"709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "5f8c6e6a21145296", + "hash_cont_tokens": "4250ef4e0ecec581" + }, + "harness|hendrycksTest-machine_learning|5": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "1cf278ba4dac7b93", + "hash_cont_tokens": "c4fb7cc44b48985a" + }, + "harness|hendrycksTest-management|5": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "67df50e49cb50049", + "hash_cont_tokens": "f6301f26d3421bfe" + }, + "harness|hendrycksTest-marketing|5": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "e254e479a1dd95e6", + "hash_cont_tokens": "4bea1308c2dedd32" + }, + "harness|hendrycksTest-medical_genetics|5": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "836b977dd80307df", + "hash_cont_tokens": "65115fc130126941" + }, + "harness|hendrycksTest-miscellaneous|5": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "3d9d2c0b97a586f9", + "hash_cont_tokens": "d87f2c7e8fda82f9" + }, + "harness|hendrycksTest-moral_disputes|5": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "b354e905172e9a92", + "hash_cont_tokens": "098675117a7f6f77" + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "e0f5580d6e0bd639", + "hash_cont_tokens": "bd59c34597b05651" + }, + "harness|hendrycksTest-nutrition|5": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "e66c2273b0b50f8a", + "hash_cont_tokens": "03bcb0a0f9d4f331" + }, + "harness|hendrycksTest-philosophy|5": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "72c74dca625bae21", + "hash_cont_tokens": "4b9e620ce1055d4a" + }, + "harness|hendrycksTest-prehistory|5": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "139ea332c437abef", + "hash_cont_tokens": "3f04832c8adc4e0a" + }, + "harness|hendrycksTest-professional_accounting|5": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "9e4929005482ae10", + "hash_cont_tokens": "767ed1231cb8e258" + }, + "harness|hendrycksTest-professional_law|5": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "7105767805e28747", + "hash_cont_tokens": "f0b059007537e041" + }, + "harness|hendrycksTest-professional_medicine|5": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "f04f0a03ea895b5b", + "hash_cont_tokens": "3bc5fb58666e5e8b" + }, + "harness|hendrycksTest-professional_psychology|5": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "46fbbd942e3b6db5", + "hash_cont_tokens": "190e8f92d03650fe" + }, + "harness|hendrycksTest-public_relations|5": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "4b9217ec408da4d4", + "hash_cont_tokens": "1bda889eaab363c0" + }, + "harness|hendrycksTest-security_studies|5": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + 
"hash_input_tokens": "9eadb993a592c2bf", + "hash_cont_tokens": "859ddf07f8d0ab66" + }, + "harness|hendrycksTest-sociology|5": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "18f0e119974d9136", + "hash_cont_tokens": "7fdcb74bc758e7bd" + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "9a26a58deec29cba", + "hash_cont_tokens": "65115fc130126941" + }, + "harness|hendrycksTest-virology|5": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "4b0d85cf3b0bf65b", + "hash_cont_tokens": "456a90466d8efd2a" + }, + "harness|hendrycksTest-world_religions|5": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "b0e8f149dfd2fa76", + "hash_cont_tokens": "6d21235f853c8d4b" + }, + "harness|truthfulqa:mc|0": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "6e0e57e58e2d03ff", + "hash_cont_tokens": "a67a79a7e9449644" + } + } +} \ No newline at end of file diff --git a/eval-results/aisquared/chopt-2_7b/results_2023-10-13T00-33-45.271884.json b/eval-results/aisquared/chopt-2_7b/results_2023-10-13T00-33-45.271884.json new file mode 100644 index 0000000000000000000000000000000000000000..2bf2c898fb6051c699e7a6cccfe871422b3e84a5 --- /dev/null +++ b/eval-results/aisquared/chopt-2_7b/results_2023-10-13T00-33-45.271884.json @@ -0,0 +1,107 @@ +{ + "config_general": { + "model_name": "aisquared/chopt-2_7b", + "model_sha": "45f57352c10a1fb1ec13c4bf387a15552ca1fe65", + "model_size": "4.94 GB", + "model_dtype": "torch.float16", + "lighteval_sha": "0f318ecf002208468154899217b3ba7c6ae09374", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|drop|3": { + "em": 0.001363255033557047, + "em_stderr": 0.0003778609196461222, + "f1": 0.04857906879194641, + "f1_stderr": 0.0012385170365466402 + }, + "harness|gsm8k|5": { + "acc": 0.0, + "acc_stderr": 0.0 + }, + "harness|winogrande|5": { + "acc": 0.5777426992896606, + "acc_stderr": 0.013881582030658552 + }, + "all": { + "em": 0.001363255033557047, + "em_stderr": 0.0003778609196461222, + "f1": 0.04857906879194641, + "f1_stderr": 0.0012385170365466402, + "acc": 0.2888713496448303, + "acc_stderr": 0.006940791015329276 + } + }, + "versions": { + "harness|drop|3": 1, + "harness|gsm8k|5": 0, + "harness|winogrande|5": 0, + "all": 0 + }, + "config_tasks": { + "harness|drop": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|drop|3": { + "hashes": { + "hash_examples": "1d27416e8324e9a3", + "hash_full_prompts": "a5513ff9a741b385", + "hash_input_tokens": "e74b23fd6ab24722", + "hash_cont_tokens": "e2ef52ab80bbd34e" + }, + "truncated": 384, + "non-truncated": 9152, + "padded": 0, + "non-padded": 9536, + "effective_few_shots": 3.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "a2243014cab6a7a0", + "hash_cont_tokens": "478684cd77217eee" + }, + "truncated": 0, + "non-truncated": 1319, + "padded": 0, + "non-padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": 
"aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "0a8020a0b9bd626c", + "hash_cont_tokens": "d75b4039559457e2" + }, + "truncated": 0, + "non-truncated": 2534, + "padded": 2426, + "non-padded": 108, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "9b4d8993161e637d", + "hash_full_prompts": "08215e527b7e60a5", + "hash_input_tokens": "409bf3c4619f5fc0", + "hash_cont_tokens": "f626c26e74547d37" + }, + "total_evaluation_time_secondes": "15127.967083454132", + "truncated": 384, + "non-truncated": 13005, + "padded": 2426, + "non-padded": 10963, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/aisquared/dlite-v1-124m/results_2023-07-19T13-54-09.752185.json b/eval-results/aisquared/dlite-v1-124m/results_2023-07-19T13-54-09.752185.json new file mode 100644 index 0000000000000000000000000000000000000000..0f8f97f9cee9b65b543765059e4d546486113461 --- /dev/null +++ b/eval-results/aisquared/dlite-v1-124m/results_2023-07-19T13-54-09.752185.json @@ -0,0 +1,871 @@ +{ + "results": { + "harness|arc:challenge|25": { + "acc": 0.2167235494880546, + "acc_stderr": 0.012040156713481189, + "acc_norm": 0.2431740614334471, + "acc_norm_stderr": 0.012536554144587089 + }, + "harness|hellaswag|10": { + "acc": 0.2925712009559849, + "acc_stderr": 0.004540134005060321, + "acc_norm": 0.31159131647082255, + "acc_norm_stderr": 0.0046219725241529765 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.25, + "acc_stderr": 0.04351941398892446, + "acc_norm": 0.25, + "acc_norm_stderr": 0.04351941398892446 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.26666666666666666, + "acc_stderr": 0.038201699145179055, + "acc_norm": 0.26666666666666666, + "acc_norm_stderr": 0.038201699145179055 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.18421052631578946, + "acc_stderr": 0.0315469804508223, + "acc_norm": 0.18421052631578946, + "acc_norm_stderr": 0.0315469804508223 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.23, + "acc_stderr": 0.04229525846816506, + "acc_norm": 0.23, + "acc_norm_stderr": 0.04229525846816506 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.2037735849056604, + "acc_stderr": 0.02479078450177541, + "acc_norm": 0.2037735849056604, + "acc_norm_stderr": 0.02479078450177541 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.2847222222222222, + "acc_stderr": 0.03773809990686934, + "acc_norm": 0.2847222222222222, + "acc_norm_stderr": 0.03773809990686934 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.21, + "acc_stderr": 0.040936018074033256, + "acc_norm": 0.21, + "acc_norm_stderr": 0.040936018074033256 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.35, + "acc_stderr": 0.047937248544110196, + "acc_norm": 0.35, + "acc_norm_stderr": 0.047937248544110196 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.23, + "acc_stderr": 0.042295258468165065, + "acc_norm": 0.23, + "acc_norm_stderr": 0.042295258468165065 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.2543352601156069, + "acc_stderr": 0.0332055644308557, + "acc_norm": 0.2543352601156069, + "acc_norm_stderr": 0.0332055644308557 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.22549019607843138, + "acc_stderr": 0.041583075330832865, + "acc_norm": 0.22549019607843138, + "acc_norm_stderr": 0.041583075330832865 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 
0.22, + "acc_stderr": 0.041633319989322695, + "acc_norm": 0.22, + "acc_norm_stderr": 0.041633319989322695 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.22127659574468084, + "acc_stderr": 0.02713634960242406, + "acc_norm": 0.22127659574468084, + "acc_norm_stderr": 0.02713634960242406 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.21052631578947367, + "acc_stderr": 0.0383515395439942, + "acc_norm": 0.21052631578947367, + "acc_norm_stderr": 0.0383515395439942 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.21379310344827587, + "acc_stderr": 0.034165204477475494, + "acc_norm": 0.21379310344827587, + "acc_norm_stderr": 0.034165204477475494 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.2328042328042328, + "acc_stderr": 0.02176596167215453, + "acc_norm": 0.2328042328042328, + "acc_norm_stderr": 0.02176596167215453 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.18253968253968253, + "acc_stderr": 0.03455071019102146, + "acc_norm": 0.18253968253968253, + "acc_norm_stderr": 0.03455071019102146 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.18, + "acc_stderr": 0.03861229196653695, + "acc_norm": 0.18, + "acc_norm_stderr": 0.03861229196653695 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.22258064516129034, + "acc_stderr": 0.023664216671642518, + "acc_norm": 0.22258064516129034, + "acc_norm_stderr": 0.023664216671642518 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.2857142857142857, + "acc_stderr": 0.031785297106427524, + "acc_norm": 0.2857142857142857, + "acc_norm_stderr": 0.031785297106427524 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.32, + "acc_stderr": 0.04688261722621504, + "acc_norm": 0.32, + "acc_norm_stderr": 0.04688261722621504 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.21818181818181817, + "acc_stderr": 0.03225078108306289, + "acc_norm": 0.21818181818181817, + "acc_norm_stderr": 0.03225078108306289 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.29797979797979796, + "acc_stderr": 0.032586303838365555, + "acc_norm": 0.29797979797979796, + "acc_norm_stderr": 0.032586303838365555 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.31088082901554404, + "acc_stderr": 0.03340361906276588, + "acc_norm": 0.31088082901554404, + "acc_norm_stderr": 0.03340361906276588 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.28974358974358977, + "acc_stderr": 0.023000628243687964, + "acc_norm": 0.28974358974358977, + "acc_norm_stderr": 0.023000628243687964 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.24814814814814815, + "acc_stderr": 0.0263357394040558, + "acc_norm": 0.24814814814814815, + "acc_norm_stderr": 0.0263357394040558 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.22268907563025211, + "acc_stderr": 0.02702543349888236, + "acc_norm": 0.22268907563025211, + "acc_norm_stderr": 0.02702543349888236 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.25165562913907286, + "acc_stderr": 0.03543304234389985, + "acc_norm": 0.25165562913907286, + "acc_norm_stderr": 0.03543304234389985 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.3100917431192661, + "acc_stderr": 0.01983084968443975, + "acc_norm": 0.3100917431192661, + "acc_norm_stderr": 0.01983084968443975 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 
0.3425925925925926, + "acc_stderr": 0.032365852526021574, + "acc_norm": 0.3425925925925926, + "acc_norm_stderr": 0.032365852526021574 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.24019607843137256, + "acc_stderr": 0.02998373305591361, + "acc_norm": 0.24019607843137256, + "acc_norm_stderr": 0.02998373305591361 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.28270042194092826, + "acc_stderr": 0.029312814153955934, + "acc_norm": 0.28270042194092826, + "acc_norm_stderr": 0.029312814153955934 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.15695067264573992, + "acc_stderr": 0.02441358717490743, + "acc_norm": 0.15695067264573992, + "acc_norm_stderr": 0.02441358717490743 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.24427480916030533, + "acc_stderr": 0.037683359597287434, + "acc_norm": 0.24427480916030533, + "acc_norm_stderr": 0.037683359597287434 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.30578512396694213, + "acc_stderr": 0.042059539338841226, + "acc_norm": 0.30578512396694213, + "acc_norm_stderr": 0.042059539338841226 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.2037037037037037, + "acc_stderr": 0.03893542518824846, + "acc_norm": 0.2037037037037037, + "acc_norm_stderr": 0.03893542518824846 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.2822085889570552, + "acc_stderr": 0.03536117886664742, + "acc_norm": 0.2822085889570552, + "acc_norm_stderr": 0.03536117886664742 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.25892857142857145, + "acc_stderr": 0.04157751539865629, + "acc_norm": 0.25892857142857145, + "acc_norm_stderr": 0.04157751539865629 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.21359223300970873, + "acc_stderr": 0.040580420156460364, + "acc_norm": 0.21359223300970873, + "acc_norm_stderr": 0.040580420156460364 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.2777777777777778, + "acc_stderr": 0.029343114798094486, + "acc_norm": 0.2777777777777778, + "acc_norm_stderr": 0.029343114798094486 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.2, + "acc_stderr": 0.04020151261036846, + "acc_norm": 0.2, + "acc_norm_stderr": 0.04020151261036846 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.26309067688378035, + "acc_stderr": 0.015745497169049057, + "acc_norm": 0.26309067688378035, + "acc_norm_stderr": 0.015745497169049057 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.2398843930635838, + "acc_stderr": 0.022989592543123567, + "acc_norm": 0.2398843930635838, + "acc_norm_stderr": 0.022989592543123567 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.2424581005586592, + "acc_stderr": 0.014333522059217889, + "acc_norm": 0.2424581005586592, + "acc_norm_stderr": 0.014333522059217889 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.2581699346405229, + "acc_stderr": 0.025058503316958143, + "acc_norm": 0.2581699346405229, + "acc_norm_stderr": 0.025058503316958143 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.2861736334405145, + "acc_stderr": 0.025670259242188957, + "acc_norm": 0.2861736334405145, + "acc_norm_stderr": 0.025670259242188957 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.25617283950617287, + "acc_stderr": 0.0242885336377261, + "acc_norm": 0.25617283950617287, + "acc_norm_stderr": 0.0242885336377261 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.25177304964539005, + "acc_stderr": 0.025892151156709405, + "acc_norm": 
0.25177304964539005, + "acc_norm_stderr": 0.025892151156709405 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.2392438070404172, + "acc_stderr": 0.010896123652676667, + "acc_norm": 0.2392438070404172, + "acc_norm_stderr": 0.010896123652676667 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.3786764705882353, + "acc_stderr": 0.029465133639776125, + "acc_norm": 0.3786764705882353, + "acc_norm_stderr": 0.029465133639776125 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.2565359477124183, + "acc_stderr": 0.017667841612378984, + "acc_norm": 0.2565359477124183, + "acc_norm_stderr": 0.017667841612378984 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.2636363636363636, + "acc_stderr": 0.04220224692971987, + "acc_norm": 0.2636363636363636, + "acc_norm_stderr": 0.04220224692971987 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.23265306122448978, + "acc_stderr": 0.02704925791589618, + "acc_norm": 0.23265306122448978, + "acc_norm_stderr": 0.02704925791589618 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.24378109452736318, + "acc_stderr": 0.030360490154014652, + "acc_norm": 0.24378109452736318, + "acc_norm_stderr": 0.030360490154014652 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.25, + "acc_stderr": 0.04351941398892446, + "acc_norm": 0.25, + "acc_norm_stderr": 0.04351941398892446 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.1927710843373494, + "acc_stderr": 0.030709824050565274, + "acc_norm": 0.1927710843373494, + "acc_norm_stderr": 0.030709824050565274 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.30409356725146197, + "acc_stderr": 0.03528211258245232, + "acc_norm": 0.30409356725146197, + "acc_norm_stderr": 0.03528211258245232 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.21542227662178703, + "mc1_stderr": 0.014391902652427683, + "mc2": 0.36379079677436726, + "mc2_stderr": 0.014716021813912177 + }, + "all": { + "acc": 0.2509309033318517, + "acc_stderr": 0.03138969752799032, + "acc_norm": 0.25170159193287256, + "acc_norm_stderr": 0.031399498137315726, + "mc1": 0.21542227662178703, + "mc1_stderr": 0.014391902652427683, + "mc2": 0.36379079677436726, + "mc2_stderr": 0.014716021813912177 + } + }, + "versions": { + "harness|arc:challenge|25": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + 
"harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "all": 0 + }, + "config": { + "model_name": "aisquared/dlite-v1-124m", + "model_sha": "f6fd5f3960f31881e6cee23f5a872ecc80b40283", + "model_dtype": "torch.float16", + "lighteval_sha": "43cff840721bd0214adb4e29236a5e2ca1813937", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null + }, + "task_config": { + "harness|arc:challenge": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + 
"harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task" + }, + "hashes": { + "harness|arc:challenge|25": { + "hash_examples": "fb8c51b1872daeda", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "e641be907f06d33d", + "hash_cont_tokens": "c6e2e25e2b25a621" + }, + "harness|hellaswag|10": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "faab28c8a52792fc", + "hash_cont_tokens": "8ad5f1a3e4068f36" + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "38f6980885e34dfd", + "hash_cont_tokens": "844bd0bf669e8136" + }, + "harness|hendrycksTest-anatomy|5": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "3ed9431cd09b2a53", + "hash_cont_tokens": "aa3ffb1a6e4356f5" + }, + "harness|hendrycksTest-astronomy|5": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "a79fd75ecff4dacc", + "hash_cont_tokens": "ca7527d5bdfd389a" + }, + "harness|hendrycksTest-business_ethics|5": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", 
+ "hash_input_tokens": "178d5666661bf5e1", + "hash_cont_tokens": "08a1fa6c8dde9a82" + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "c926698f7ff06973", + "hash_cont_tokens": "cd61f7de0830a75a" + }, + "harness|hendrycksTest-college_biology|5": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "242f772c5e78312a", + "hash_cont_tokens": "b0c14ed86adbcb56" + }, + "harness|hendrycksTest-college_chemistry|5": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "8502d8627d2d7aad", + "hash_cont_tokens": "844bd0bf669e8136" + }, + "harness|hendrycksTest-college_computer_science|5": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "a0d705ea2c235707", + "hash_cont_tokens": "3cf1924b14cbf906" + }, + "harness|hendrycksTest-college_mathematics|5": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "ff09ef7f164943cd", + "hash_cont_tokens": "d09bf08193410dfa" + }, + "harness|hendrycksTest-college_medicine|5": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "aca3949388066394", + "hash_cont_tokens": "62bb469d2a319d91" + }, + "harness|hendrycksTest-college_physics|5": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "c4240f372187f487", + "hash_cont_tokens": "bf103c9a1f61ec12" + }, + "harness|hendrycksTest-computer_security|5": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "70a866a1c6ae11ae", + "hash_cont_tokens": "844bd0bf669e8136" + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "29b68a5b3f3afa5f", + "hash_cont_tokens": "ff5ca3d84bb47a0b" + }, + "harness|hendrycksTest-econometrics|5": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "a4a0fc579875cdf9", + "hash_cont_tokens": "f3ed369e135c0e74" + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "e1c0ec634eb17ebd", + "hash_cont_tokens": "35bf6c0c1a7ee403" + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "542453ad0f99dacf", + "hash_cont_tokens": "e69647d0f0359a4e" + }, + "harness|hendrycksTest-formal_logic|5": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "dacff0458f665ef2", + "hash_cont_tokens": "2ef491ecaa0b411b" + }, + "harness|hendrycksTest-global_facts|5": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "61dec75d557c2e93", + "hash_cont_tokens": "844bd0bf669e8136" + }, + "harness|hendrycksTest-high_school_biology|5": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "d0afdf91820cacc8", + "hash_cont_tokens": "2f65e8345a68d860" + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": 
"75cd47b5490da17b", + "hash_cont_tokens": "c3deabee1deab3a3" + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "e369e98a1d0a7424", + "hash_cont_tokens": "ec161287ac6222f4" + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "502376958174bf81", + "hash_cont_tokens": "c4f2565ca36881d5" + }, + "harness|hendrycksTest-high_school_geography|5": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "a4866b51f8a7a60e", + "hash_cont_tokens": "780e569058de22be" + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "90f755f89d9fdf5e", + "hash_cont_tokens": "9da45062757ae791" + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "fb590ff6d9d11883", + "hash_cont_tokens": "8f5c8baf02161f10" + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "551dbc75535ad2b8", + "hash_cont_tokens": "fdea101837ab4409" + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "d86fdf5706ec717c", + "hash_cont_tokens": "985403b262df21a4" + }, + "harness|hendrycksTest-high_school_physics|5": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "a81bca26abd92c41", + "hash_cont_tokens": "56be0c12b78c81a3" + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "9c10077b5cda495b", + "hash_cont_tokens": "f524cf6fe64b2a7f" + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "da0c215d66d16d3e", + "hash_cont_tokens": "421b3dc903711e3d" + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "4885a382517deebf", + "hash_cont_tokens": "eab825cf8fbdd085" + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "c1d80e899c4c8872", + "hash_cont_tokens": "e1610a0b694e7b3a" + }, + "harness|hendrycksTest-human_aging|5": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "39da19ee58ce07e6", + "hash_cont_tokens": "38eafdb22e9fca11" + }, + "harness|hendrycksTest-human_sexuality|5": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "f7e0441ab1c223e0", + "hash_cont_tokens": "11de075f88fc7cd2" + }, + "harness|hendrycksTest-international_law|5": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "119859c5b8103d0b", + "hash_cont_tokens": "0229c63f045574c2" + }, + "harness|hendrycksTest-jurisprudence|5": { + "hash_examples": "083b1e4904c48dc2", + 
"hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "6ec4910e741606cb", + "hash_cont_tokens": "5c77c6f472688075" + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "96d8b2554f777e3a", + "hash_cont_tokens": "25a46284b3589e0d" + }, + "harness|hendrycksTest-machine_learning|5": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "249811a7d891a411", + "hash_cont_tokens": "d11f2c877fe691dc" + }, + "harness|hendrycksTest-management|5": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "e54df495ffeb4f92", + "hash_cont_tokens": "d37808f586a9e9b5" + }, + "harness|hendrycksTest-marketing|5": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "e9110fe64f420eb5", + "hash_cont_tokens": "95faf210efa02f90" + }, + "harness|hendrycksTest-medical_genetics|5": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "743df5701590c1c5", + "hash_cont_tokens": "844bd0bf669e8136" + }, + "harness|hendrycksTest-miscellaneous|5": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "4a20a40ea36bad2d", + "hash_cont_tokens": "ef1ae838a09a7521" + }, + "harness|hendrycksTest-moral_disputes|5": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "10886977e5516586", + "hash_cont_tokens": "05c35d0e7dd2c7d4" + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "66f56ab7c3b9d662", + "hash_cont_tokens": "f1e9e326e9540108" + }, + "harness|hendrycksTest-nutrition|5": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "c05c54560499ea35", + "hash_cont_tokens": "027ac34198453c9e" + }, + "harness|hendrycksTest-philosophy|5": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "9639c3d92ff98a28", + "hash_cont_tokens": "dddff9925c9b675a" + }, + "harness|hendrycksTest-prehistory|5": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "91e98834c3a8d8d9", + "hash_cont_tokens": "030e5bb46551865c" + }, + "harness|hendrycksTest-professional_accounting|5": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "569fa47691c73088", + "hash_cont_tokens": "42b23299e8bae480" + }, + "harness|hendrycksTest-professional_law|5": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "d93d397bd5db1db6", + "hash_cont_tokens": "a2de48df0afbaff7" + }, + "harness|hendrycksTest-professional_medicine|5": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "7f8acbbde12cfb6b", + "hash_cont_tokens": "33dc7eccd5de31ae" + }, + "harness|hendrycksTest-professional_psychology|5": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "3aa766c029099569", + "hash_cont_tokens": "2a666dc39f1f52ac" + }, + "harness|hendrycksTest-public_relations|5": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": 
"87b924f88832986f", + "hash_cont_tokens": "cf3600a50782c6c5" + }, + "harness|hendrycksTest-security_studies|5": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "1aaa84da588878a6", + "hash_cont_tokens": "2e9916279a4cae95" + }, + "harness|hendrycksTest-sociology|5": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "fb555df6139eb2c8", + "hash_cont_tokens": "555f7a55738bbf37" + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "56cf1eebb25eccb1", + "hash_cont_tokens": "844bd0bf669e8136" + }, + "harness|hendrycksTest-virology|5": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "c6affac16ec860be", + "hash_cont_tokens": "30d4fa4828c5468f" + }, + "harness|hendrycksTest-world_religions|5": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "d2c5da5a69a6312e", + "hash_cont_tokens": "984061eb58124367" + }, + "harness|truthfulqa:mc|0": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "21ee2f46c9c3649e", + "hash_cont_tokens": "f41d0880e9a23f4e" + } + } +} \ No newline at end of file diff --git a/eval-results/aisquared/dlite-v1-124m/results_2023-10-17T05-52-16.762412.json b/eval-results/aisquared/dlite-v1-124m/results_2023-10-17T05-52-16.762412.json new file mode 100644 index 0000000000000000000000000000000000000000..ebeccabf16d4433bb20785f45452e37a46a7fee1 --- /dev/null +++ b/eval-results/aisquared/dlite-v1-124m/results_2023-10-17T05-52-16.762412.json @@ -0,0 +1,107 @@ +{ + "config_general": { + "model_name": "aisquared/dlite-v1-124m", + "model_sha": "f6fd5f3960f31881e6cee23f5a872ecc80b40283", + "model_size": "238.85 MB", + "model_dtype": "torch.float16", + "lighteval_sha": "0f318ecf002208468154899217b3ba7c6ae09374", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|drop|3": { + "em": 0.011954697986577181, + "em_stderr": 0.0011130056898859015, + "f1": 0.0519830117449665, + "f1_stderr": 0.0015990891614949285 + }, + "harness|gsm8k|5": { + "acc": 0.0, + "acc_stderr": 0.0 + }, + "harness|winogrande|5": { + "acc": 0.5019731649565904, + "acc_stderr": 0.014052376259225629 + }, + "all": { + "em": 0.011954697986577181, + "em_stderr": 0.0011130056898859015, + "f1": 0.0519830117449665, + "f1_stderr": 0.0015990891614949285, + "acc": 0.2509865824782952, + "acc_stderr": 0.0070261881296128145 + } + }, + "versions": { + "harness|drop|3": 1, + "harness|gsm8k|5": 0, + "harness|winogrande|5": 0, + "all": 0 + }, + "config_tasks": { + "harness|drop": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|drop|3": { + "hashes": { + "hash_examples": "1d27416e8324e9a3", + "hash_full_prompts": "a5513ff9a741b385", + "hash_input_tokens": "fa7a2e45b0104bc4", + "hash_cont_tokens": "79d02d8a4c41a2f1" + }, + "truncated": 9290, + "non-truncated": 246, + "padded": 0, + "non-padded": 9536, + "effective_few_shots": 3.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "52733972d41ebb11", + "hash_cont_tokens": "ac2731397e78da5f" + }, + 
"truncated": 917, + "non-truncated": 402, + "padded": 0, + "non-padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "84cacac1590bb0a5", + "hash_cont_tokens": "64ca3ed9b5dacc6e" + }, + "truncated": 0, + "non-truncated": 2534, + "padded": 2426, + "non-padded": 108, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "9b4d8993161e637d", + "hash_full_prompts": "08215e527b7e60a5", + "hash_input_tokens": "6b3de69e2f87348c", + "hash_cont_tokens": "6c0b791c0a4dbecc" + }, + "total_evaluation_time_secondes": "3851.4590072631836", + "truncated": 10207, + "non-truncated": 3182, + "padded": 2426, + "non-padded": 10963, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/aisquared/dlite-v1-1_5b/results_2023-07-19T15-22-45.415057.json b/eval-results/aisquared/dlite-v1-1_5b/results_2023-07-19T15-22-45.415057.json new file mode 100644 index 0000000000000000000000000000000000000000..2a7e0699f824c139afd9e4ec67b853cf7545a3f5 --- /dev/null +++ b/eval-results/aisquared/dlite-v1-1_5b/results_2023-07-19T15-22-45.415057.json @@ -0,0 +1,871 @@ +{ + "results": { + "harness|arc:challenge|25": { + "acc": 0.28498293515358364, + "acc_stderr": 0.013191348179838793, + "acc_norm": 0.3165529010238908, + "acc_norm_stderr": 0.013592431519068079 + }, + "harness|hellaswag|10": { + "acc": 0.39494124676359293, + "acc_stderr": 0.004878390226591719, + "acc_norm": 0.4969129655447122, + "acc_norm_stderr": 0.004989686307484563 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.25, + "acc_stderr": 0.04351941398892446, + "acc_norm": 0.25, + "acc_norm_stderr": 0.04351941398892446 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.2518518518518518, + "acc_stderr": 0.037498507091740234, + "acc_norm": 0.2518518518518518, + "acc_norm_stderr": 0.037498507091740234 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.20394736842105263, + "acc_stderr": 0.032790004063100515, + "acc_norm": 0.20394736842105263, + "acc_norm_stderr": 0.032790004063100515 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.23, + "acc_stderr": 0.042295258468165065, + "acc_norm": 0.23, + "acc_norm_stderr": 0.042295258468165065 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.2830188679245283, + "acc_stderr": 0.027724236492700904, + "acc_norm": 0.2830188679245283, + "acc_norm_stderr": 0.027724236492700904 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.2986111111111111, + "acc_stderr": 0.03827052357950756, + "acc_norm": 0.2986111111111111, + "acc_norm_stderr": 0.03827052357950756 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.22, + "acc_stderr": 0.041633319989322695, + "acc_norm": 0.22, + "acc_norm_stderr": 0.041633319989322695 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.31, + "acc_stderr": 0.04648231987117316, + "acc_norm": 0.31, + "acc_norm_stderr": 0.04648231987117316 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.28, + "acc_stderr": 0.04512608598542127, + "acc_norm": 0.28, + "acc_norm_stderr": 0.04512608598542127 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.35260115606936415, + "acc_stderr": 0.036430371689585475, + "acc_norm": 0.35260115606936415, + "acc_norm_stderr": 0.036430371689585475 + }, + 
"harness|hendrycksTest-college_physics|5": { + "acc": 0.17647058823529413, + "acc_stderr": 0.03793281185307809, + "acc_norm": 0.17647058823529413, + "acc_norm_stderr": 0.03793281185307809 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.26, + "acc_stderr": 0.044084400227680794, + "acc_norm": 0.26, + "acc_norm_stderr": 0.044084400227680794 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.251063829787234, + "acc_stderr": 0.028346963777162466, + "acc_norm": 0.251063829787234, + "acc_norm_stderr": 0.028346963777162466 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.21052631578947367, + "acc_stderr": 0.038351539543994194, + "acc_norm": 0.21052631578947367, + "acc_norm_stderr": 0.038351539543994194 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.2413793103448276, + "acc_stderr": 0.03565998174135302, + "acc_norm": 0.2413793103448276, + "acc_norm_stderr": 0.03565998174135302 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.25396825396825395, + "acc_stderr": 0.02241804289111394, + "acc_norm": 0.25396825396825395, + "acc_norm_stderr": 0.02241804289111394 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.35714285714285715, + "acc_stderr": 0.04285714285714281, + "acc_norm": 0.35714285714285715, + "acc_norm_stderr": 0.04285714285714281 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.15, + "acc_stderr": 0.0358870281282637, + "acc_norm": 0.15, + "acc_norm_stderr": 0.0358870281282637 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.25483870967741934, + "acc_stderr": 0.024790118459332208, + "acc_norm": 0.25483870967741934, + "acc_norm_stderr": 0.024790118459332208 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.18719211822660098, + "acc_stderr": 0.027444924966882618, + "acc_norm": 0.18719211822660098, + "acc_norm_stderr": 0.027444924966882618 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.24, + "acc_stderr": 0.042923469599092816, + "acc_norm": 0.24, + "acc_norm_stderr": 0.042923469599092816 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.28484848484848485, + "acc_stderr": 0.03524390844511782, + "acc_norm": 0.28484848484848485, + "acc_norm_stderr": 0.03524390844511782 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.36363636363636365, + "acc_stderr": 0.034273086529999344, + "acc_norm": 0.36363636363636365, + "acc_norm_stderr": 0.034273086529999344 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.22279792746113988, + "acc_stderr": 0.03003114797764154, + "acc_norm": 0.22279792746113988, + "acc_norm_stderr": 0.03003114797764154 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.34102564102564104, + "acc_stderr": 0.02403548967633506, + "acc_norm": 0.34102564102564104, + "acc_norm_stderr": 0.02403548967633506 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.24444444444444444, + "acc_stderr": 0.02620276653465215, + "acc_norm": 0.24444444444444444, + "acc_norm_stderr": 0.02620276653465215 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.23529411764705882, + "acc_stderr": 0.02755361446786381, + "acc_norm": 0.23529411764705882, + "acc_norm_stderr": 0.02755361446786381 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.2251655629139073, + "acc_stderr": 0.03410435282008936, + "acc_norm": 0.2251655629139073, + "acc_norm_stderr": 0.03410435282008936 + }, + 
"harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.3321100917431193, + "acc_stderr": 0.020192682985423344, + "acc_norm": 0.3321100917431193, + "acc_norm_stderr": 0.020192682985423344 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.35185185185185186, + "acc_stderr": 0.03256850570293648, + "acc_norm": 0.35185185185185186, + "acc_norm_stderr": 0.03256850570293648 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.24509803921568626, + "acc_stderr": 0.03019028245350194, + "acc_norm": 0.24509803921568626, + "acc_norm_stderr": 0.03019028245350194 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.25316455696202533, + "acc_stderr": 0.0283046579430353, + "acc_norm": 0.25316455696202533, + "acc_norm_stderr": 0.0283046579430353 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.17040358744394618, + "acc_stderr": 0.025234593447136175, + "acc_norm": 0.17040358744394618, + "acc_norm_stderr": 0.025234593447136175 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.26717557251908397, + "acc_stderr": 0.038808483010823944, + "acc_norm": 0.26717557251908397, + "acc_norm_stderr": 0.038808483010823944 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.2396694214876033, + "acc_stderr": 0.03896878985070417, + "acc_norm": 0.2396694214876033, + "acc_norm_stderr": 0.03896878985070417 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.25, + "acc_stderr": 0.04186091791394607, + "acc_norm": 0.25, + "acc_norm_stderr": 0.04186091791394607 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.25153374233128833, + "acc_stderr": 0.03408997886857529, + "acc_norm": 0.25153374233128833, + "acc_norm_stderr": 0.03408997886857529 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.25, + "acc_stderr": 0.04109974682633932, + "acc_norm": 0.25, + "acc_norm_stderr": 0.04109974682633932 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.27184466019417475, + "acc_stderr": 0.044052680241409216, + "acc_norm": 0.27184466019417475, + "acc_norm_stderr": 0.044052680241409216 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.2692307692307692, + "acc_stderr": 0.02905858830374884, + "acc_norm": 0.2692307692307692, + "acc_norm_stderr": 0.02905858830374884 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.24, + "acc_stderr": 0.042923469599092816, + "acc_norm": 0.24, + "acc_norm_stderr": 0.042923469599092816 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.2720306513409962, + "acc_stderr": 0.015913367447500524, + "acc_norm": 0.2720306513409962, + "acc_norm_stderr": 0.015913367447500524 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.24566473988439305, + "acc_stderr": 0.02317629820399201, + "acc_norm": 0.24566473988439305, + "acc_norm_stderr": 0.02317629820399201 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.2424581005586592, + "acc_stderr": 0.014333522059217889, + "acc_norm": 0.2424581005586592, + "acc_norm_stderr": 0.014333522059217889 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.22549019607843138, + "acc_stderr": 0.0239291555173513, + "acc_norm": 0.22549019607843138, + "acc_norm_stderr": 0.0239291555173513 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.18971061093247588, + "acc_stderr": 0.02226819625878322, + "acc_norm": 0.18971061093247588, + "acc_norm_stderr": 0.02226819625878322 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.25617283950617287, + "acc_stderr": 0.0242885336377261, + "acc_norm": 
0.25617283950617287, + "acc_norm_stderr": 0.0242885336377261 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.2375886524822695, + "acc_stderr": 0.02538951255272991, + "acc_norm": 0.2375886524822695, + "acc_norm_stderr": 0.02538951255272991 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.24119947848761408, + "acc_stderr": 0.010926496102034956, + "acc_norm": 0.24119947848761408, + "acc_norm_stderr": 0.010926496102034956 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.2536764705882353, + "acc_stderr": 0.026431329870789534, + "acc_norm": 0.2536764705882353, + "acc_norm_stderr": 0.026431329870789534 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.24509803921568626, + "acc_stderr": 0.017401816711427657, + "acc_norm": 0.24509803921568626, + "acc_norm_stderr": 0.017401816711427657 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.33636363636363636, + "acc_stderr": 0.04525393596302505, + "acc_norm": 0.33636363636363636, + "acc_norm_stderr": 0.04525393596302505 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.18775510204081633, + "acc_stderr": 0.025000256039546212, + "acc_norm": 0.18775510204081633, + "acc_norm_stderr": 0.025000256039546212 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.2537313432835821, + "acc_stderr": 0.030769444967296028, + "acc_norm": 0.2537313432835821, + "acc_norm_stderr": 0.030769444967296028 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.19, + "acc_stderr": 0.03942772444036623, + "acc_norm": 0.19, + "acc_norm_stderr": 0.03942772444036623 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.3313253012048193, + "acc_stderr": 0.036643147772880864, + "acc_norm": 0.3313253012048193, + "acc_norm_stderr": 0.036643147772880864 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.3216374269005848, + "acc_stderr": 0.03582529442573122, + "acc_norm": 0.3216374269005848, + "acc_norm_stderr": 0.03582529442573122 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.22643818849449204, + "mc1_stderr": 0.014651337324602574, + "mc2": 0.3708479337614819, + "mc2_stderr": 0.01403741058760603 + }, + "all": { + "acc": 0.2590124397337627, + "acc_stderr": 0.031632406427795565, + "acc_norm": 0.2612758581176852, + "acc_norm_stderr": 0.03164109082474679, + "mc1": 0.22643818849449204, + "mc1_stderr": 0.014651337324602574, + "mc2": 0.3708479337614819, + "mc2_stderr": 0.01403741058760603 + } + }, + "versions": { + "harness|arc:challenge|25": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + 
"harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "all": 0 + }, + "config": { + "model_name": "aisquared/dlite-v1-1_5b", + "model_sha": "4ac21faec255e3544e96aeb3591c27bdee5ebf45", + "model_dtype": "torch.float16", + "lighteval_sha": "43cff840721bd0214adb4e29236a5e2ca1813937", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null + }, + "task_config": { + "harness|arc:challenge": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + 
"harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task" + }, + "hashes": { + "harness|arc:challenge|25": { + "hash_examples": "fb8c51b1872daeda", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "e641be907f06d33d", + "hash_cont_tokens": "c6e2e25e2b25a621" + }, + "harness|hellaswag|10": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "faab28c8a52792fc", + "hash_cont_tokens": "8ad5f1a3e4068f36" + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "38f6980885e34dfd", + "hash_cont_tokens": "844bd0bf669e8136" + }, + "harness|hendrycksTest-anatomy|5": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "3ed9431cd09b2a53", + "hash_cont_tokens": "aa3ffb1a6e4356f5" + }, + "harness|hendrycksTest-astronomy|5": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "a79fd75ecff4dacc", + "hash_cont_tokens": 
"ca7527d5bdfd389a" + }, + "harness|hendrycksTest-business_ethics|5": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "178d5666661bf5e1", + "hash_cont_tokens": "08a1fa6c8dde9a82" + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "c926698f7ff06973", + "hash_cont_tokens": "cd61f7de0830a75a" + }, + "harness|hendrycksTest-college_biology|5": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "242f772c5e78312a", + "hash_cont_tokens": "b0c14ed86adbcb56" + }, + "harness|hendrycksTest-college_chemistry|5": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "8502d8627d2d7aad", + "hash_cont_tokens": "844bd0bf669e8136" + }, + "harness|hendrycksTest-college_computer_science|5": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "a0d705ea2c235707", + "hash_cont_tokens": "3cf1924b14cbf906" + }, + "harness|hendrycksTest-college_mathematics|5": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "ff09ef7f164943cd", + "hash_cont_tokens": "d09bf08193410dfa" + }, + "harness|hendrycksTest-college_medicine|5": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "aca3949388066394", + "hash_cont_tokens": "62bb469d2a319d91" + }, + "harness|hendrycksTest-college_physics|5": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "c4240f372187f487", + "hash_cont_tokens": "bf103c9a1f61ec12" + }, + "harness|hendrycksTest-computer_security|5": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "70a866a1c6ae11ae", + "hash_cont_tokens": "844bd0bf669e8136" + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "29b68a5b3f3afa5f", + "hash_cont_tokens": "ff5ca3d84bb47a0b" + }, + "harness|hendrycksTest-econometrics|5": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "a4a0fc579875cdf9", + "hash_cont_tokens": "f3ed369e135c0e74" + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "e1c0ec634eb17ebd", + "hash_cont_tokens": "35bf6c0c1a7ee403" + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "542453ad0f99dacf", + "hash_cont_tokens": "e69647d0f0359a4e" + }, + "harness|hendrycksTest-formal_logic|5": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "dacff0458f665ef2", + "hash_cont_tokens": "2ef491ecaa0b411b" + }, + "harness|hendrycksTest-global_facts|5": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "61dec75d557c2e93", + "hash_cont_tokens": "844bd0bf669e8136" + }, + "harness|hendrycksTest-high_school_biology|5": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "d0afdf91820cacc8", + "hash_cont_tokens": "2f65e8345a68d860" + }, + 
"harness|hendrycksTest-high_school_chemistry|5": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "75cd47b5490da17b", + "hash_cont_tokens": "c3deabee1deab3a3" + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "e369e98a1d0a7424", + "hash_cont_tokens": "ec161287ac6222f4" + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "502376958174bf81", + "hash_cont_tokens": "c4f2565ca36881d5" + }, + "harness|hendrycksTest-high_school_geography|5": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "a4866b51f8a7a60e", + "hash_cont_tokens": "780e569058de22be" + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "90f755f89d9fdf5e", + "hash_cont_tokens": "9da45062757ae791" + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "fb590ff6d9d11883", + "hash_cont_tokens": "8f5c8baf02161f10" + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "551dbc75535ad2b8", + "hash_cont_tokens": "fdea101837ab4409" + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "d86fdf5706ec717c", + "hash_cont_tokens": "985403b262df21a4" + }, + "harness|hendrycksTest-high_school_physics|5": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "a81bca26abd92c41", + "hash_cont_tokens": "56be0c12b78c81a3" + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "9c10077b5cda495b", + "hash_cont_tokens": "f524cf6fe64b2a7f" + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "da0c215d66d16d3e", + "hash_cont_tokens": "421b3dc903711e3d" + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "4885a382517deebf", + "hash_cont_tokens": "eab825cf8fbdd085" + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "c1d80e899c4c8872", + "hash_cont_tokens": "e1610a0b694e7b3a" + }, + "harness|hendrycksTest-human_aging|5": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "39da19ee58ce07e6", + "hash_cont_tokens": "38eafdb22e9fca11" + }, + "harness|hendrycksTest-human_sexuality|5": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "f7e0441ab1c223e0", + "hash_cont_tokens": "11de075f88fc7cd2" + }, + "harness|hendrycksTest-international_law|5": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": 
"119859c5b8103d0b", + "hash_cont_tokens": "0229c63f045574c2" + }, + "harness|hendrycksTest-jurisprudence|5": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "6ec4910e741606cb", + "hash_cont_tokens": "5c77c6f472688075" + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "96d8b2554f777e3a", + "hash_cont_tokens": "25a46284b3589e0d" + }, + "harness|hendrycksTest-machine_learning|5": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "249811a7d891a411", + "hash_cont_tokens": "d11f2c877fe691dc" + }, + "harness|hendrycksTest-management|5": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "e54df495ffeb4f92", + "hash_cont_tokens": "d37808f586a9e9b5" + }, + "harness|hendrycksTest-marketing|5": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "e9110fe64f420eb5", + "hash_cont_tokens": "95faf210efa02f90" + }, + "harness|hendrycksTest-medical_genetics|5": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "743df5701590c1c5", + "hash_cont_tokens": "844bd0bf669e8136" + }, + "harness|hendrycksTest-miscellaneous|5": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "4a20a40ea36bad2d", + "hash_cont_tokens": "ef1ae838a09a7521" + }, + "harness|hendrycksTest-moral_disputes|5": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "10886977e5516586", + "hash_cont_tokens": "05c35d0e7dd2c7d4" + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "66f56ab7c3b9d662", + "hash_cont_tokens": "f1e9e326e9540108" + }, + "harness|hendrycksTest-nutrition|5": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "c05c54560499ea35", + "hash_cont_tokens": "027ac34198453c9e" + }, + "harness|hendrycksTest-philosophy|5": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "9639c3d92ff98a28", + "hash_cont_tokens": "dddff9925c9b675a" + }, + "harness|hendrycksTest-prehistory|5": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "91e98834c3a8d8d9", + "hash_cont_tokens": "030e5bb46551865c" + }, + "harness|hendrycksTest-professional_accounting|5": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "569fa47691c73088", + "hash_cont_tokens": "42b23299e8bae480" + }, + "harness|hendrycksTest-professional_law|5": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "d93d397bd5db1db6", + "hash_cont_tokens": "a2de48df0afbaff7" + }, + "harness|hendrycksTest-professional_medicine|5": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "7f8acbbde12cfb6b", + "hash_cont_tokens": "33dc7eccd5de31ae" + }, + "harness|hendrycksTest-professional_psychology|5": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "3aa766c029099569", + "hash_cont_tokens": "2a666dc39f1f52ac" + }, + 
"harness|hendrycksTest-public_relations|5": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "87b924f88832986f", + "hash_cont_tokens": "cf3600a50782c6c5" + }, + "harness|hendrycksTest-security_studies|5": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "1aaa84da588878a6", + "hash_cont_tokens": "2e9916279a4cae95" + }, + "harness|hendrycksTest-sociology|5": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "fb555df6139eb2c8", + "hash_cont_tokens": "555f7a55738bbf37" + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "56cf1eebb25eccb1", + "hash_cont_tokens": "844bd0bf669e8136" + }, + "harness|hendrycksTest-virology|5": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "c6affac16ec860be", + "hash_cont_tokens": "30d4fa4828c5468f" + }, + "harness|hendrycksTest-world_religions|5": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "d2c5da5a69a6312e", + "hash_cont_tokens": "984061eb58124367" + }, + "harness|truthfulqa:mc|0": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "21ee2f46c9c3649e", + "hash_cont_tokens": "f41d0880e9a23f4e" + } + } +} \ No newline at end of file diff --git a/eval-results/aisquared/dlite-v1-1_5b/results_2023-09-23T17-48-40.273494.json b/eval-results/aisquared/dlite-v1-1_5b/results_2023-09-23T17-48-40.273494.json new file mode 100644 index 0000000000000000000000000000000000000000..57dad81ad451e4eae81e10e498c3ffea7c7a56b9 --- /dev/null +++ b/eval-results/aisquared/dlite-v1-1_5b/results_2023-09-23T17-48-40.273494.json @@ -0,0 +1,107 @@ +{ + "config_general": { + "model_name": "aisquared/dlite-v1-1_5b", + "model_sha": "4ac21faec255e3544e96aeb3591c27bdee5ebf45", + "model_size": "2.91 GB", + "model_dtype": "torch.float16", + "lighteval_sha": "0f318ecf002208468154899217b3ba7c6ae09374", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|drop|3": { + "em": 0.005977348993288591, + "em_stderr": 0.0007893908687131983, + "f1": 0.06289953859060417, + "f1_stderr": 0.0015069024652225058 + }, + "harness|gsm8k|5": { + "acc": 0.000758150113722517, + "acc_stderr": 0.0007581501137225347 + }, + "harness|winogrande|5": { + "acc": 0.5595895816890292, + "acc_stderr": 0.013952330311915607 + }, + "all": { + "em": 0.005977348993288591, + "em_stderr": 0.0007893908687131983, + "f1": 0.06289953859060417, + "f1_stderr": 0.0015069024652225058, + "acc": 0.28017386590137583, + "acc_stderr": 0.00735524021281907 + } + }, + "versions": { + "harness|drop|3": 1, + "harness|gsm8k|5": 0, + "harness|winogrande|5": 0, + "all": 0 + }, + "config_tasks": { + "harness|drop": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|drop|3": { + "hashes": { + "hash_examples": "1d27416e8324e9a3", + "hash_full_prompts": "a5513ff9a741b385", + "hash_input_tokens": "fa7a2e45b0104bc4", + "hash_cont_tokens": "c31cbc9e63001153" + }, + "truncated": 9290, + "non-truncated": 246, + "padded": 0, + "non-padded": 9536, + "effective_few_shots": 3.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + 
"hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "52733972d41ebb11", + "hash_cont_tokens": "08778d39fee18671" + }, + "truncated": 917, + "non-truncated": 402, + "padded": 0, + "non-padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "84cacac1590bb0a5", + "hash_cont_tokens": "64ca3ed9b5dacc6e" + }, + "truncated": 0, + "non-truncated": 2534, + "padded": 2426, + "non-padded": 108, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "9b4d8993161e637d", + "hash_full_prompts": "08215e527b7e60a5", + "hash_input_tokens": "6b3de69e2f87348c", + "hash_cont_tokens": "a9873fc58c55127e" + }, + "total_evaluation_time_secondes": "20439.546547412872", + "truncated": 10207, + "non-truncated": 3182, + "padded": 2426, + "non-padded": 10963, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/aisquared/dlite-v1-355m/results_2023-07-19T14-15-29.432225.json b/eval-results/aisquared/dlite-v1-355m/results_2023-07-19T14-15-29.432225.json new file mode 100644 index 0000000000000000000000000000000000000000..6d3f28d92556332dca587764cc23608eb16c0381 --- /dev/null +++ b/eval-results/aisquared/dlite-v1-355m/results_2023-07-19T14-15-29.432225.json @@ -0,0 +1,871 @@ +{ + "results": { + "harness|arc:challenge|25": { + "acc": 0.23720136518771331, + "acc_stderr": 0.01243039982926084, + "acc_norm": 0.2713310580204778, + "acc_norm_stderr": 0.012993807727545794 + }, + "harness|hellaswag|10": { + "acc": 0.3353913563035252, + "acc_stderr": 0.004711622011148468, + "acc_norm": 0.3906592312288389, + "acc_norm_stderr": 0.004869010152280754 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.24, + "acc_stderr": 0.04292346959909283, + "acc_norm": 0.24, + "acc_norm_stderr": 0.04292346959909283 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.32592592592592595, + "acc_stderr": 0.040491220417025055, + "acc_norm": 0.32592592592592595, + "acc_norm_stderr": 0.040491220417025055 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.3223684210526316, + "acc_stderr": 0.03803510248351585, + "acc_norm": 0.3223684210526316, + "acc_norm_stderr": 0.03803510248351585 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.28, + "acc_stderr": 0.04512608598542128, + "acc_norm": 0.28, + "acc_norm_stderr": 0.04512608598542128 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.3018867924528302, + "acc_stderr": 0.02825420034443867, + "acc_norm": 0.3018867924528302, + "acc_norm_stderr": 0.02825420034443867 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.2847222222222222, + "acc_stderr": 0.037738099906869355, + "acc_norm": 0.2847222222222222, + "acc_norm_stderr": 0.037738099906869355 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.25, + "acc_stderr": 0.04351941398892446, + "acc_norm": 0.25, + "acc_norm_stderr": 0.04351941398892446 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.36, + "acc_stderr": 0.048241815132442176, + "acc_norm": 0.36, + "acc_norm_stderr": 0.048241815132442176 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.22, + "acc_stderr": 0.041633319989322695, + "acc_norm": 0.22, + "acc_norm_stderr": 0.041633319989322695 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 
0.23699421965317918, + "acc_stderr": 0.03242414757483099, + "acc_norm": 0.23699421965317918, + "acc_norm_stderr": 0.03242414757483099 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.2549019607843137, + "acc_stderr": 0.043364327079931785, + "acc_norm": 0.2549019607843137, + "acc_norm_stderr": 0.043364327079931785 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.29, + "acc_stderr": 0.045604802157206824, + "acc_norm": 0.29, + "acc_norm_stderr": 0.045604802157206824 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.2170212765957447, + "acc_stderr": 0.026947483121496238, + "acc_norm": 0.2170212765957447, + "acc_norm_stderr": 0.026947483121496238 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.24561403508771928, + "acc_stderr": 0.04049339297748142, + "acc_norm": 0.24561403508771928, + "acc_norm_stderr": 0.04049339297748142 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.2620689655172414, + "acc_stderr": 0.036646663372252565, + "acc_norm": 0.2620689655172414, + "acc_norm_stderr": 0.036646663372252565 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.2698412698412698, + "acc_stderr": 0.02286083830923207, + "acc_norm": 0.2698412698412698, + "acc_norm_stderr": 0.02286083830923207 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.29365079365079366, + "acc_stderr": 0.04073524322147124, + "acc_norm": 0.29365079365079366, + "acc_norm_stderr": 0.04073524322147124 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.33, + "acc_stderr": 0.04725815626252604, + "acc_norm": 0.33, + "acc_norm_stderr": 0.04725815626252604 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.2645161290322581, + "acc_stderr": 0.02509189237885928, + "acc_norm": 0.2645161290322581, + "acc_norm_stderr": 0.02509189237885928 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.31527093596059114, + "acc_stderr": 0.03269080871970186, + "acc_norm": 0.31527093596059114, + "acc_norm_stderr": 0.03269080871970186 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.25, + "acc_stderr": 0.04351941398892446, + "acc_norm": 0.25, + "acc_norm_stderr": 0.04351941398892446 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.24242424242424243, + "acc_stderr": 0.03346409881055953, + "acc_norm": 0.24242424242424243, + "acc_norm_stderr": 0.03346409881055953 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.3484848484848485, + "acc_stderr": 0.033948539651564025, + "acc_norm": 0.3484848484848485, + "acc_norm_stderr": 0.033948539651564025 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.35751295336787564, + "acc_stderr": 0.034588160421810045, + "acc_norm": 0.35751295336787564, + "acc_norm_stderr": 0.034588160421810045 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.35384615384615387, + "acc_stderr": 0.024243783994062164, + "acc_norm": 0.35384615384615387, + "acc_norm_stderr": 0.024243783994062164 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.26666666666666666, + "acc_stderr": 0.026962424325073835, + "acc_norm": 0.26666666666666666, + "acc_norm_stderr": 0.026962424325073835 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.24369747899159663, + "acc_stderr": 0.027886828078380572, + "acc_norm": 0.24369747899159663, + "acc_norm_stderr": 0.027886828078380572 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.2980132450331126, + 
"acc_stderr": 0.037345356767871984, + "acc_norm": 0.2980132450331126, + "acc_norm_stderr": 0.037345356767871984 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.3486238532110092, + "acc_stderr": 0.02043125409071433, + "acc_norm": 0.3486238532110092, + "acc_norm_stderr": 0.02043125409071433 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.46296296296296297, + "acc_stderr": 0.03400603625538272, + "acc_norm": 0.46296296296296297, + "acc_norm_stderr": 0.03400603625538272 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.2549019607843137, + "acc_stderr": 0.030587591351604257, + "acc_norm": 0.2549019607843137, + "acc_norm_stderr": 0.030587591351604257 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.21518987341772153, + "acc_stderr": 0.02675082699467616, + "acc_norm": 0.21518987341772153, + "acc_norm_stderr": 0.02675082699467616 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.19282511210762332, + "acc_stderr": 0.02647824096048936, + "acc_norm": 0.19282511210762332, + "acc_norm_stderr": 0.02647824096048936 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.2824427480916031, + "acc_stderr": 0.03948406125768361, + "acc_norm": 0.2824427480916031, + "acc_norm_stderr": 0.03948406125768361 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.12396694214876033, + "acc_stderr": 0.030083098716035227, + "acc_norm": 0.12396694214876033, + "acc_norm_stderr": 0.030083098716035227 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.18518518518518517, + "acc_stderr": 0.03755265865037183, + "acc_norm": 0.18518518518518517, + "acc_norm_stderr": 0.03755265865037183 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.294478527607362, + "acc_stderr": 0.03581165790474082, + "acc_norm": 0.294478527607362, + "acc_norm_stderr": 0.03581165790474082 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.15178571428571427, + "acc_stderr": 0.034057028381856945, + "acc_norm": 0.15178571428571427, + "acc_norm_stderr": 0.034057028381856945 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.34951456310679613, + "acc_stderr": 0.04721188506097173, + "acc_norm": 0.34951456310679613, + "acc_norm_stderr": 0.04721188506097173 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.17094017094017094, + "acc_stderr": 0.0246624968452098, + "acc_norm": 0.17094017094017094, + "acc_norm_stderr": 0.0246624968452098 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.17, + "acc_stderr": 0.0377525168068637, + "acc_norm": 0.17, + "acc_norm_stderr": 0.0377525168068637 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.23371647509578544, + "acc_stderr": 0.01513338327898884, + "acc_norm": 0.23371647509578544, + "acc_norm_stderr": 0.01513338327898884 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.23699421965317918, + "acc_stderr": 0.02289408248992599, + "acc_norm": 0.23699421965317918, + "acc_norm_stderr": 0.02289408248992599 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.24134078212290502, + "acc_stderr": 0.014310999547961459, + "acc_norm": 0.24134078212290502, + "acc_norm_stderr": 0.014310999547961459 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.28431372549019607, + "acc_stderr": 0.02582916327275748, + "acc_norm": 0.28431372549019607, + "acc_norm_stderr": 0.02582916327275748 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.2958199356913183, + "acc_stderr": 0.025922371788818795, + "acc_norm": 0.2958199356913183, + 
"acc_norm_stderr": 0.025922371788818795 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.24382716049382716, + "acc_stderr": 0.0238918795419596, + "acc_norm": 0.24382716049382716, + "acc_norm_stderr": 0.0238918795419596 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.2624113475177305, + "acc_stderr": 0.02624492034984302, + "acc_norm": 0.2624113475177305, + "acc_norm_stderr": 0.02624492034984302 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.25684485006518903, + "acc_stderr": 0.011158455853098867, + "acc_norm": 0.25684485006518903, + "acc_norm_stderr": 0.011158455853098867 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.4411764705882353, + "acc_stderr": 0.0301619119307671, + "acc_norm": 0.4411764705882353, + "acc_norm_stderr": 0.0301619119307671 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.21241830065359477, + "acc_stderr": 0.016547148636203147, + "acc_norm": 0.21241830065359477, + "acc_norm_stderr": 0.016547148636203147 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.2545454545454545, + "acc_stderr": 0.041723430387053825, + "acc_norm": 0.2545454545454545, + "acc_norm_stderr": 0.041723430387053825 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.3673469387755102, + "acc_stderr": 0.03086214492108756, + "acc_norm": 0.3673469387755102, + "acc_norm_stderr": 0.03086214492108756 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.24875621890547264, + "acc_stderr": 0.030567675938916707, + "acc_norm": 0.24875621890547264, + "acc_norm_stderr": 0.030567675938916707 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.26, + "acc_stderr": 0.04408440022768078, + "acc_norm": 0.26, + "acc_norm_stderr": 0.04408440022768078 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.21084337349397592, + "acc_stderr": 0.031755547866299194, + "acc_norm": 0.21084337349397592, + "acc_norm_stderr": 0.031755547866299194 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.2807017543859649, + "acc_stderr": 0.034462962170884265, + "acc_norm": 0.2807017543859649, + "acc_norm_stderr": 0.034462962170884265 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.23133414932680538, + "mc1_stderr": 0.014761945174862671, + "mc2": 0.3712543349925085, + "mc2_stderr": 0.014372378251071535 + }, + "all": { + "acc": 0.2717270488035596, + "acc_stderr": 0.03202713461660246, + "acc_norm": 0.27324226181640837, + "acc_norm_stderr": 0.03203935149862648, + "mc1": 0.23133414932680538, + "mc1_stderr": 0.014761945174862671, + "mc2": 0.3712543349925085, + "mc2_stderr": 0.014372378251071535 + } + }, + "versions": { + "harness|arc:challenge|25": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + 
"harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "all": 0 + }, + "config": { + "model_name": "aisquared/dlite-v1-355m", + "model_sha": "c5f4b5a61e6a66a5c7613164d99a70db5bf7e9a2", + "model_dtype": "torch.float16", + "lighteval_sha": "43cff840721bd0214adb4e29236a5e2ca1813937", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null + }, + "task_config": { + "harness|arc:challenge": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + 
"harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task" + }, + "hashes": { + "harness|arc:challenge|25": { + "hash_examples": "fb8c51b1872daeda", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "e641be907f06d33d", + "hash_cont_tokens": "c6e2e25e2b25a621" + }, + "harness|hellaswag|10": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "faab28c8a52792fc", + "hash_cont_tokens": "8ad5f1a3e4068f36" + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "38f6980885e34dfd", + "hash_cont_tokens": "844bd0bf669e8136" + }, + "harness|hendrycksTest-anatomy|5": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "3ed9431cd09b2a53", + "hash_cont_tokens": "aa3ffb1a6e4356f5" + }, + 
"harness|hendrycksTest-astronomy|5": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "a79fd75ecff4dacc", + "hash_cont_tokens": "ca7527d5bdfd389a" + }, + "harness|hendrycksTest-business_ethics|5": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "178d5666661bf5e1", + "hash_cont_tokens": "08a1fa6c8dde9a82" + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "c926698f7ff06973", + "hash_cont_tokens": "cd61f7de0830a75a" + }, + "harness|hendrycksTest-college_biology|5": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "242f772c5e78312a", + "hash_cont_tokens": "b0c14ed86adbcb56" + }, + "harness|hendrycksTest-college_chemistry|5": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "8502d8627d2d7aad", + "hash_cont_tokens": "844bd0bf669e8136" + }, + "harness|hendrycksTest-college_computer_science|5": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "a0d705ea2c235707", + "hash_cont_tokens": "3cf1924b14cbf906" + }, + "harness|hendrycksTest-college_mathematics|5": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "ff09ef7f164943cd", + "hash_cont_tokens": "d09bf08193410dfa" + }, + "harness|hendrycksTest-college_medicine|5": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "aca3949388066394", + "hash_cont_tokens": "62bb469d2a319d91" + }, + "harness|hendrycksTest-college_physics|5": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "c4240f372187f487", + "hash_cont_tokens": "bf103c9a1f61ec12" + }, + "harness|hendrycksTest-computer_security|5": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "70a866a1c6ae11ae", + "hash_cont_tokens": "844bd0bf669e8136" + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "29b68a5b3f3afa5f", + "hash_cont_tokens": "ff5ca3d84bb47a0b" + }, + "harness|hendrycksTest-econometrics|5": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "a4a0fc579875cdf9", + "hash_cont_tokens": "f3ed369e135c0e74" + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "e1c0ec634eb17ebd", + "hash_cont_tokens": "35bf6c0c1a7ee403" + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "542453ad0f99dacf", + "hash_cont_tokens": "e69647d0f0359a4e" + }, + "harness|hendrycksTest-formal_logic|5": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "dacff0458f665ef2", + "hash_cont_tokens": "2ef491ecaa0b411b" + }, + "harness|hendrycksTest-global_facts|5": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "61dec75d557c2e93", + "hash_cont_tokens": "844bd0bf669e8136" + }, + 
"harness|hendrycksTest-high_school_biology|5": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "d0afdf91820cacc8", + "hash_cont_tokens": "2f65e8345a68d860" + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "75cd47b5490da17b", + "hash_cont_tokens": "c3deabee1deab3a3" + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "e369e98a1d0a7424", + "hash_cont_tokens": "ec161287ac6222f4" + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "502376958174bf81", + "hash_cont_tokens": "c4f2565ca36881d5" + }, + "harness|hendrycksTest-high_school_geography|5": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "a4866b51f8a7a60e", + "hash_cont_tokens": "780e569058de22be" + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "90f755f89d9fdf5e", + "hash_cont_tokens": "9da45062757ae791" + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "fb590ff6d9d11883", + "hash_cont_tokens": "8f5c8baf02161f10" + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "551dbc75535ad2b8", + "hash_cont_tokens": "fdea101837ab4409" + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "d86fdf5706ec717c", + "hash_cont_tokens": "985403b262df21a4" + }, + "harness|hendrycksTest-high_school_physics|5": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "a81bca26abd92c41", + "hash_cont_tokens": "56be0c12b78c81a3" + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "9c10077b5cda495b", + "hash_cont_tokens": "f524cf6fe64b2a7f" + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "da0c215d66d16d3e", + "hash_cont_tokens": "421b3dc903711e3d" + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "4885a382517deebf", + "hash_cont_tokens": "eab825cf8fbdd085" + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "c1d80e899c4c8872", + "hash_cont_tokens": "e1610a0b694e7b3a" + }, + "harness|hendrycksTest-human_aging|5": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "39da19ee58ce07e6", + "hash_cont_tokens": "38eafdb22e9fca11" + }, + "harness|hendrycksTest-human_sexuality|5": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": 
"f7e0441ab1c223e0", + "hash_cont_tokens": "11de075f88fc7cd2" + }, + "harness|hendrycksTest-international_law|5": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "119859c5b8103d0b", + "hash_cont_tokens": "0229c63f045574c2" + }, + "harness|hendrycksTest-jurisprudence|5": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "6ec4910e741606cb", + "hash_cont_tokens": "5c77c6f472688075" + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "96d8b2554f777e3a", + "hash_cont_tokens": "25a46284b3589e0d" + }, + "harness|hendrycksTest-machine_learning|5": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "249811a7d891a411", + "hash_cont_tokens": "d11f2c877fe691dc" + }, + "harness|hendrycksTest-management|5": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "e54df495ffeb4f92", + "hash_cont_tokens": "d37808f586a9e9b5" + }, + "harness|hendrycksTest-marketing|5": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "e9110fe64f420eb5", + "hash_cont_tokens": "95faf210efa02f90" + }, + "harness|hendrycksTest-medical_genetics|5": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "743df5701590c1c5", + "hash_cont_tokens": "844bd0bf669e8136" + }, + "harness|hendrycksTest-miscellaneous|5": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "4a20a40ea36bad2d", + "hash_cont_tokens": "ef1ae838a09a7521" + }, + "harness|hendrycksTest-moral_disputes|5": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "10886977e5516586", + "hash_cont_tokens": "05c35d0e7dd2c7d4" + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "66f56ab7c3b9d662", + "hash_cont_tokens": "f1e9e326e9540108" + }, + "harness|hendrycksTest-nutrition|5": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "c05c54560499ea35", + "hash_cont_tokens": "027ac34198453c9e" + }, + "harness|hendrycksTest-philosophy|5": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "9639c3d92ff98a28", + "hash_cont_tokens": "dddff9925c9b675a" + }, + "harness|hendrycksTest-prehistory|5": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "91e98834c3a8d8d9", + "hash_cont_tokens": "030e5bb46551865c" + }, + "harness|hendrycksTest-professional_accounting|5": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "569fa47691c73088", + "hash_cont_tokens": "42b23299e8bae480" + }, + "harness|hendrycksTest-professional_law|5": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "d93d397bd5db1db6", + "hash_cont_tokens": "a2de48df0afbaff7" + }, + "harness|hendrycksTest-professional_medicine|5": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "7f8acbbde12cfb6b", + "hash_cont_tokens": "33dc7eccd5de31ae" + }, + 
"harness|hendrycksTest-professional_psychology|5": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "3aa766c029099569", + "hash_cont_tokens": "2a666dc39f1f52ac" + }, + "harness|hendrycksTest-public_relations|5": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "87b924f88832986f", + "hash_cont_tokens": "cf3600a50782c6c5" + }, + "harness|hendrycksTest-security_studies|5": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "1aaa84da588878a6", + "hash_cont_tokens": "2e9916279a4cae95" + }, + "harness|hendrycksTest-sociology|5": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "fb555df6139eb2c8", + "hash_cont_tokens": "555f7a55738bbf37" + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "56cf1eebb25eccb1", + "hash_cont_tokens": "844bd0bf669e8136" + }, + "harness|hendrycksTest-virology|5": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "c6affac16ec860be", + "hash_cont_tokens": "30d4fa4828c5468f" + }, + "harness|hendrycksTest-world_religions|5": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "d2c5da5a69a6312e", + "hash_cont_tokens": "984061eb58124367" + }, + "harness|truthfulqa:mc|0": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "21ee2f46c9c3649e", + "hash_cont_tokens": "f41d0880e9a23f4e" + } + } +} \ No newline at end of file diff --git a/eval-results/aisquared/dlite-v1-355m/results_2023-10-27T20-11-22.634896.json b/eval-results/aisquared/dlite-v1-355m/results_2023-10-27T20-11-22.634896.json new file mode 100644 index 0000000000000000000000000000000000000000..125234433e09f85361fad6cf4ee54cf11fdc1666 --- /dev/null +++ b/eval-results/aisquared/dlite-v1-355m/results_2023-10-27T20-11-22.634896.json @@ -0,0 +1,107 @@ +{ + "config_general": { + "model_name": "aisquared/dlite-v1-355m", + "model_sha": "c5f4b5a61e6a66a5c7613164d99a70db5bf7e9a2", + "model_size": "679.78 MB", + "model_dtype": "torch.float16", + "lighteval_sha": "0f318ecf002208468154899217b3ba7c6ae09374", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|drop|3": { + "em": 0.009123322147651007, + "em_stderr": 0.0009737017705541621, + "f1": 0.05341862416107383, + "f1_stderr": 0.0014844140427647057 + }, + "harness|gsm8k|5": { + "acc": 0.0, + "acc_stderr": 0.0 + }, + "harness|winogrande|5": { + "acc": 0.5280189423835833, + "acc_stderr": 0.014030404213405791 + }, + "all": { + "em": 0.009123322147651007, + "em_stderr": 0.0009737017705541621, + "f1": 0.05341862416107383, + "f1_stderr": 0.0014844140427647057, + "acc": 0.26400947119179163, + "acc_stderr": 0.0070152021067028955 + } + }, + "versions": { + "harness|drop|3": 1, + "harness|gsm8k|5": 0, + "harness|winogrande|5": 0, + "all": 0 + }, + "config_tasks": { + "harness|drop": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|drop|3": { + "hashes": { + "hash_examples": "1d27416e8324e9a3", + "hash_full_prompts": "a5513ff9a741b385", + "hash_input_tokens": "fa7a2e45b0104bc4", + "hash_cont_tokens": 
"95494b700d8b76f1" + }, + "truncated": 9290, + "non-truncated": 246, + "padded": 0, + "non-padded": 9536, + "effective_few_shots": 3.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "52733972d41ebb11", + "hash_cont_tokens": "9bce636f9b830f64" + }, + "truncated": 917, + "non-truncated": 402, + "padded": 0, + "non-padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "84cacac1590bb0a5", + "hash_cont_tokens": "64ca3ed9b5dacc6e" + }, + "truncated": 0, + "non-truncated": 2534, + "padded": 2426, + "non-padded": 108, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "9b4d8993161e637d", + "hash_full_prompts": "08215e527b7e60a5", + "hash_input_tokens": "6b3de69e2f87348c", + "hash_cont_tokens": "906e0d66d6098de3" + }, + "total_evaluation_time_secondes": "11910.574273347855", + "truncated": 10207, + "non-truncated": 3182, + "padded": 2426, + "non-padded": 10963, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/aisquared/dlite-v1-774m/results_2023-07-19T14-26-45.959233.json b/eval-results/aisquared/dlite-v1-774m/results_2023-07-19T14-26-45.959233.json new file mode 100644 index 0000000000000000000000000000000000000000..d3f2c92920b5a014bc0cdea3543b7de4c4918aad --- /dev/null +++ b/eval-results/aisquared/dlite-v1-774m/results_2023-07-19T14-26-45.959233.json @@ -0,0 +1,871 @@ +{ + "results": { + "harness|arc:challenge|25": { + "acc": 0.25341296928327645, + "acc_stderr": 0.012710896778378606, + "acc_norm": 0.28071672354948807, + "acc_norm_stderr": 0.013131238126975586 + }, + "harness|hellaswag|10": { + "acc": 0.36536546504680345, + "acc_stderr": 0.004805483767055344, + "acc_norm": 0.4435371439952201, + "acc_norm_stderr": 0.004957863944093126 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.18, + "acc_stderr": 0.03861229196653697, + "acc_norm": 0.18, + "acc_norm_stderr": 0.03861229196653697 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.2222222222222222, + "acc_stderr": 0.035914440841969694, + "acc_norm": 0.2222222222222222, + "acc_norm_stderr": 0.035914440841969694 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.21710526315789475, + "acc_stderr": 0.03355045304882923, + "acc_norm": 0.21710526315789475, + "acc_norm_stderr": 0.03355045304882923 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.24, + "acc_stderr": 0.04292346959909283, + "acc_norm": 0.24, + "acc_norm_stderr": 0.04292346959909283 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.32075471698113206, + "acc_stderr": 0.028727502957880263, + "acc_norm": 0.32075471698113206, + "acc_norm_stderr": 0.028727502957880263 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.2569444444444444, + "acc_stderr": 0.03653946969442099, + "acc_norm": 0.2569444444444444, + "acc_norm_stderr": 0.03653946969442099 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.18, + "acc_stderr": 0.03861229196653694, + "acc_norm": 0.18, + "acc_norm_stderr": 0.03861229196653694 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.31, + "acc_stderr": 0.04648231987117316, + "acc_norm": 0.31, + "acc_norm_stderr": 0.04648231987117316 + }, + 
"harness|hendrycksTest-college_mathematics|5": { + "acc": 0.3, + "acc_stderr": 0.046056618647183814, + "acc_norm": 0.3, + "acc_norm_stderr": 0.046056618647183814 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.24277456647398843, + "acc_stderr": 0.0326926380614177, + "acc_norm": 0.24277456647398843, + "acc_norm_stderr": 0.0326926380614177 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.21568627450980393, + "acc_stderr": 0.04092563958237655, + "acc_norm": 0.21568627450980393, + "acc_norm_stderr": 0.04092563958237655 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.17, + "acc_stderr": 0.0377525168068637, + "acc_norm": 0.17, + "acc_norm_stderr": 0.0377525168068637 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.20851063829787234, + "acc_stderr": 0.02655698211783872, + "acc_norm": 0.20851063829787234, + "acc_norm_stderr": 0.02655698211783872 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.22807017543859648, + "acc_stderr": 0.03947152782669415, + "acc_norm": 0.22807017543859648, + "acc_norm_stderr": 0.03947152782669415 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.31724137931034485, + "acc_stderr": 0.03878352372138622, + "acc_norm": 0.31724137931034485, + "acc_norm_stderr": 0.03878352372138622 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.2671957671957672, + "acc_stderr": 0.02278967314577656, + "acc_norm": 0.2671957671957672, + "acc_norm_stderr": 0.02278967314577656 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.15873015873015872, + "acc_stderr": 0.03268454013011743, + "acc_norm": 0.15873015873015872, + "acc_norm_stderr": 0.03268454013011743 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.3, + "acc_stderr": 0.046056618647183814, + "acc_norm": 0.3, + "acc_norm_stderr": 0.046056618647183814 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.25483870967741934, + "acc_stderr": 0.02479011845933221, + "acc_norm": 0.25483870967741934, + "acc_norm_stderr": 0.02479011845933221 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.30049261083743845, + "acc_stderr": 0.03225799476233484, + "acc_norm": 0.30049261083743845, + "acc_norm_stderr": 0.03225799476233484 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.3, + "acc_stderr": 0.046056618647183814, + "acc_norm": 0.3, + "acc_norm_stderr": 0.046056618647183814 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.28484848484848485, + "acc_stderr": 0.035243908445117836, + "acc_norm": 0.28484848484848485, + "acc_norm_stderr": 0.035243908445117836 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.36363636363636365, + "acc_stderr": 0.034273086529999344, + "acc_norm": 0.36363636363636365, + "acc_norm_stderr": 0.034273086529999344 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.32124352331606215, + "acc_stderr": 0.033699508685490674, + "acc_norm": 0.32124352331606215, + "acc_norm_stderr": 0.033699508685490674 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.35128205128205126, + "acc_stderr": 0.024203665177902803, + "acc_norm": 0.35128205128205126, + "acc_norm_stderr": 0.024203665177902803 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.2740740740740741, + "acc_stderr": 0.027195934804085622, + "acc_norm": 0.2740740740740741, + "acc_norm_stderr": 0.027195934804085622 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 
0.2184873949579832, + "acc_stderr": 0.026841514322958955, + "acc_norm": 0.2184873949579832, + "acc_norm_stderr": 0.026841514322958955 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.271523178807947, + "acc_stderr": 0.03631329803969653, + "acc_norm": 0.271523178807947, + "acc_norm_stderr": 0.03631329803969653 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.3174311926605505, + "acc_stderr": 0.0199571521984605, + "acc_norm": 0.3174311926605505, + "acc_norm_stderr": 0.0199571521984605 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.32407407407407407, + "acc_stderr": 0.03191923445686186, + "acc_norm": 0.32407407407407407, + "acc_norm_stderr": 0.03191923445686186 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.2549019607843137, + "acc_stderr": 0.030587591351604246, + "acc_norm": 0.2549019607843137, + "acc_norm_stderr": 0.030587591351604246 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.26582278481012656, + "acc_stderr": 0.028756799629658335, + "acc_norm": 0.26582278481012656, + "acc_norm_stderr": 0.028756799629658335 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.10762331838565023, + "acc_stderr": 0.020799400082879994, + "acc_norm": 0.10762331838565023, + "acc_norm_stderr": 0.020799400082879994 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.29770992366412213, + "acc_stderr": 0.04010358942462203, + "acc_norm": 0.29770992366412213, + "acc_norm_stderr": 0.04010358942462203 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.38016528925619836, + "acc_stderr": 0.04431324501968432, + "acc_norm": 0.38016528925619836, + "acc_norm_stderr": 0.04431324501968432 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.16666666666666666, + "acc_stderr": 0.03602814176392645, + "acc_norm": 0.16666666666666666, + "acc_norm_stderr": 0.03602814176392645 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.31901840490797545, + "acc_stderr": 0.03661997551073836, + "acc_norm": 0.31901840490797545, + "acc_norm_stderr": 0.03661997551073836 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.20535714285714285, + "acc_stderr": 0.03834241021419074, + "acc_norm": 0.20535714285714285, + "acc_norm_stderr": 0.03834241021419074 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.36893203883495146, + "acc_stderr": 0.047776151811567386, + "acc_norm": 0.36893203883495146, + "acc_norm_stderr": 0.047776151811567386 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.21367521367521367, + "acc_stderr": 0.026853450377009144, + "acc_norm": 0.21367521367521367, + "acc_norm_stderr": 0.026853450377009144 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.17, + "acc_stderr": 0.03775251680686371, + "acc_norm": 0.17, + "acc_norm_stderr": 0.03775251680686371 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.1979565772669221, + "acc_stderr": 0.014248873549217587, + "acc_norm": 0.1979565772669221, + "acc_norm_stderr": 0.014248873549217587 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.2630057803468208, + "acc_stderr": 0.023703099525258165, + "acc_norm": 0.2630057803468208, + "acc_norm_stderr": 0.023703099525258165 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.24692737430167597, + "acc_stderr": 0.014422292204808836, + "acc_norm": 0.24692737430167597, + "acc_norm_stderr": 0.014422292204808836 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.24836601307189543, + "acc_stderr": 0.02473998135511359, 
+ "acc_norm": 0.24836601307189543, + "acc_norm_stderr": 0.02473998135511359 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.24437299035369775, + "acc_stderr": 0.024406162094668882, + "acc_norm": 0.24437299035369775, + "acc_norm_stderr": 0.024406162094668882 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.25617283950617287, + "acc_stderr": 0.0242885336377261, + "acc_norm": 0.25617283950617287, + "acc_norm_stderr": 0.0242885336377261 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.26595744680851063, + "acc_stderr": 0.02635806569888059, + "acc_norm": 0.26595744680851063, + "acc_norm_stderr": 0.02635806569888059 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.23272490221642764, + "acc_stderr": 0.010792595553888467, + "acc_norm": 0.23272490221642764, + "acc_norm_stderr": 0.010792595553888467 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.24632352941176472, + "acc_stderr": 0.02617343857052, + "acc_norm": 0.24632352941176472, + "acc_norm_stderr": 0.02617343857052 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.22875816993464052, + "acc_stderr": 0.016992723465466222, + "acc_norm": 0.22875816993464052, + "acc_norm_stderr": 0.016992723465466222 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.19090909090909092, + "acc_stderr": 0.03764425585984924, + "acc_norm": 0.19090909090909092, + "acc_norm_stderr": 0.03764425585984924 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.3673469387755102, + "acc_stderr": 0.030862144921087555, + "acc_norm": 0.3673469387755102, + "acc_norm_stderr": 0.030862144921087555 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.2935323383084577, + "acc_stderr": 0.03220024104534205, + "acc_norm": 0.2935323383084577, + "acc_norm_stderr": 0.03220024104534205 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.26, + "acc_stderr": 0.04408440022768079, + "acc_norm": 0.26, + "acc_norm_stderr": 0.04408440022768079 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.26506024096385544, + "acc_stderr": 0.03436024037944967, + "acc_norm": 0.26506024096385544, + "acc_norm_stderr": 0.03436024037944967 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.29239766081871343, + "acc_stderr": 0.034886477134579215, + "acc_norm": 0.29239766081871343, + "acc_norm_stderr": 0.034886477134579215 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.20807833537331702, + "mc1_stderr": 0.014210503473576625, + "mc2": 0.36111937171754704, + "mc2_stderr": 0.014196087145720468 + }, + "all": { + "acc": 0.26077339552714013, + "acc_stderr": 0.03158470730380373, + "acc_norm": 0.2625611147341677, + "acc_norm_stderr": 0.031594414448306005, + "mc1": 0.20807833537331702, + "mc1_stderr": 0.014210503473576625, + "mc2": 0.36111937171754704, + "mc2_stderr": 0.014196087145720468 + } + }, + "versions": { + "harness|arc:challenge|25": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + 
"harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "all": 0 + }, + "config": { + "model_name": "aisquared/dlite-v1-774m", + "model_sha": "d3f5401d07965fb13c2cb8b458ffaed9a5a79c2d", + "model_dtype": "torch.float16", + "lighteval_sha": "43cff840721bd0214adb4e29236a5e2ca1813937", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null + }, + "task_config": { + "harness|arc:challenge": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": 
"LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task" + }, + "hashes": { + "harness|arc:challenge|25": { + "hash_examples": "fb8c51b1872daeda", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "e641be907f06d33d", + "hash_cont_tokens": "c6e2e25e2b25a621" + }, + "harness|hellaswag|10": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "faab28c8a52792fc", + "hash_cont_tokens": "8ad5f1a3e4068f36" + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "38f6980885e34dfd", + "hash_cont_tokens": "844bd0bf669e8136" + }, + 
"harness|hendrycksTest-anatomy|5": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "3ed9431cd09b2a53", + "hash_cont_tokens": "aa3ffb1a6e4356f5" + }, + "harness|hendrycksTest-astronomy|5": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "a79fd75ecff4dacc", + "hash_cont_tokens": "ca7527d5bdfd389a" + }, + "harness|hendrycksTest-business_ethics|5": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "178d5666661bf5e1", + "hash_cont_tokens": "08a1fa6c8dde9a82" + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "c926698f7ff06973", + "hash_cont_tokens": "cd61f7de0830a75a" + }, + "harness|hendrycksTest-college_biology|5": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "242f772c5e78312a", + "hash_cont_tokens": "b0c14ed86adbcb56" + }, + "harness|hendrycksTest-college_chemistry|5": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "8502d8627d2d7aad", + "hash_cont_tokens": "844bd0bf669e8136" + }, + "harness|hendrycksTest-college_computer_science|5": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "a0d705ea2c235707", + "hash_cont_tokens": "3cf1924b14cbf906" + }, + "harness|hendrycksTest-college_mathematics|5": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "ff09ef7f164943cd", + "hash_cont_tokens": "d09bf08193410dfa" + }, + "harness|hendrycksTest-college_medicine|5": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "aca3949388066394", + "hash_cont_tokens": "62bb469d2a319d91" + }, + "harness|hendrycksTest-college_physics|5": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "c4240f372187f487", + "hash_cont_tokens": "bf103c9a1f61ec12" + }, + "harness|hendrycksTest-computer_security|5": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "70a866a1c6ae11ae", + "hash_cont_tokens": "844bd0bf669e8136" + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "29b68a5b3f3afa5f", + "hash_cont_tokens": "ff5ca3d84bb47a0b" + }, + "harness|hendrycksTest-econometrics|5": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "a4a0fc579875cdf9", + "hash_cont_tokens": "f3ed369e135c0e74" + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "e1c0ec634eb17ebd", + "hash_cont_tokens": "35bf6c0c1a7ee403" + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "542453ad0f99dacf", + "hash_cont_tokens": "e69647d0f0359a4e" + }, + "harness|hendrycksTest-formal_logic|5": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "dacff0458f665ef2", + "hash_cont_tokens": "2ef491ecaa0b411b" + }, + "harness|hendrycksTest-global_facts|5": { + 
"hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "61dec75d557c2e93", + "hash_cont_tokens": "844bd0bf669e8136" + }, + "harness|hendrycksTest-high_school_biology|5": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "d0afdf91820cacc8", + "hash_cont_tokens": "2f65e8345a68d860" + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "75cd47b5490da17b", + "hash_cont_tokens": "c3deabee1deab3a3" + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "e369e98a1d0a7424", + "hash_cont_tokens": "ec161287ac6222f4" + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "502376958174bf81", + "hash_cont_tokens": "c4f2565ca36881d5" + }, + "harness|hendrycksTest-high_school_geography|5": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "a4866b51f8a7a60e", + "hash_cont_tokens": "780e569058de22be" + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "90f755f89d9fdf5e", + "hash_cont_tokens": "9da45062757ae791" + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "fb590ff6d9d11883", + "hash_cont_tokens": "8f5c8baf02161f10" + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "551dbc75535ad2b8", + "hash_cont_tokens": "fdea101837ab4409" + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "d86fdf5706ec717c", + "hash_cont_tokens": "985403b262df21a4" + }, + "harness|hendrycksTest-high_school_physics|5": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "a81bca26abd92c41", + "hash_cont_tokens": "56be0c12b78c81a3" + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "9c10077b5cda495b", + "hash_cont_tokens": "f524cf6fe64b2a7f" + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "da0c215d66d16d3e", + "hash_cont_tokens": "421b3dc903711e3d" + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "4885a382517deebf", + "hash_cont_tokens": "eab825cf8fbdd085" + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "c1d80e899c4c8872", + "hash_cont_tokens": "e1610a0b694e7b3a" + }, + "harness|hendrycksTest-human_aging|5": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "39da19ee58ce07e6", + "hash_cont_tokens": 
"38eafdb22e9fca11" + }, + "harness|hendrycksTest-human_sexuality|5": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "f7e0441ab1c223e0", + "hash_cont_tokens": "11de075f88fc7cd2" + }, + "harness|hendrycksTest-international_law|5": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "119859c5b8103d0b", + "hash_cont_tokens": "0229c63f045574c2" + }, + "harness|hendrycksTest-jurisprudence|5": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "6ec4910e741606cb", + "hash_cont_tokens": "5c77c6f472688075" + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "96d8b2554f777e3a", + "hash_cont_tokens": "25a46284b3589e0d" + }, + "harness|hendrycksTest-machine_learning|5": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "249811a7d891a411", + "hash_cont_tokens": "d11f2c877fe691dc" + }, + "harness|hendrycksTest-management|5": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "e54df495ffeb4f92", + "hash_cont_tokens": "d37808f586a9e9b5" + }, + "harness|hendrycksTest-marketing|5": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "e9110fe64f420eb5", + "hash_cont_tokens": "95faf210efa02f90" + }, + "harness|hendrycksTest-medical_genetics|5": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "743df5701590c1c5", + "hash_cont_tokens": "844bd0bf669e8136" + }, + "harness|hendrycksTest-miscellaneous|5": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "4a20a40ea36bad2d", + "hash_cont_tokens": "ef1ae838a09a7521" + }, + "harness|hendrycksTest-moral_disputes|5": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "10886977e5516586", + "hash_cont_tokens": "05c35d0e7dd2c7d4" + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "66f56ab7c3b9d662", + "hash_cont_tokens": "f1e9e326e9540108" + }, + "harness|hendrycksTest-nutrition|5": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "c05c54560499ea35", + "hash_cont_tokens": "027ac34198453c9e" + }, + "harness|hendrycksTest-philosophy|5": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "9639c3d92ff98a28", + "hash_cont_tokens": "dddff9925c9b675a" + }, + "harness|hendrycksTest-prehistory|5": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "91e98834c3a8d8d9", + "hash_cont_tokens": "030e5bb46551865c" + }, + "harness|hendrycksTest-professional_accounting|5": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "569fa47691c73088", + "hash_cont_tokens": "42b23299e8bae480" + }, + "harness|hendrycksTest-professional_law|5": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "d93d397bd5db1db6", + "hash_cont_tokens": "a2de48df0afbaff7" + }, + "harness|hendrycksTest-professional_medicine|5": { + 
"hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "7f8acbbde12cfb6b", + "hash_cont_tokens": "33dc7eccd5de31ae" + }, + "harness|hendrycksTest-professional_psychology|5": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "3aa766c029099569", + "hash_cont_tokens": "2a666dc39f1f52ac" + }, + "harness|hendrycksTest-public_relations|5": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "87b924f88832986f", + "hash_cont_tokens": "cf3600a50782c6c5" + }, + "harness|hendrycksTest-security_studies|5": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "1aaa84da588878a6", + "hash_cont_tokens": "2e9916279a4cae95" + }, + "harness|hendrycksTest-sociology|5": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "fb555df6139eb2c8", + "hash_cont_tokens": "555f7a55738bbf37" + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "56cf1eebb25eccb1", + "hash_cont_tokens": "844bd0bf669e8136" + }, + "harness|hendrycksTest-virology|5": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "c6affac16ec860be", + "hash_cont_tokens": "30d4fa4828c5468f" + }, + "harness|hendrycksTest-world_religions|5": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "d2c5da5a69a6312e", + "hash_cont_tokens": "984061eb58124367" + }, + "harness|truthfulqa:mc|0": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "21ee2f46c9c3649e", + "hash_cont_tokens": "f41d0880e9a23f4e" + } + } +} \ No newline at end of file diff --git a/eval-results/aisquared/dlite-v1-774m/results_2023-10-18T09-49-41.867604.json b/eval-results/aisquared/dlite-v1-774m/results_2023-10-18T09-49-41.867604.json new file mode 100644 index 0000000000000000000000000000000000000000..37f9d1946fbfd0aa1832de18b0139cf5533eda45 --- /dev/null +++ b/eval-results/aisquared/dlite-v1-774m/results_2023-10-18T09-49-41.867604.json @@ -0,0 +1,107 @@ +{ + "config_general": { + "model_name": "aisquared/dlite-v1-774m", + "model_sha": "d3f5401d07965fb13c2cb8b458ffaed9a5a79c2d", + "model_size": "1.45 GB", + "model_dtype": "torch.float16", + "lighteval_sha": "0f318ecf002208468154899217b3ba7c6ae09374", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|drop|3": { + "em": 0.011220637583892617, + "em_stderr": 0.0010786936337733937, + "f1": 0.06615142617449662, + "f1_stderr": 0.001688981547339462 + }, + "harness|gsm8k|5": { + "acc": 0.0, + "acc_stderr": 0.0 + }, + "harness|winogrande|5": { + "acc": 0.5461720599842147, + "acc_stderr": 0.013992441563707063 + }, + "all": { + "em": 0.011220637583892617, + "em_stderr": 0.0010786936337733937, + "f1": 0.06615142617449662, + "f1_stderr": 0.001688981547339462, + "acc": 0.27308602999210735, + "acc_stderr": 0.006996220781853532 + } + }, + "versions": { + "harness|drop|3": 1, + "harness|gsm8k|5": 0, + "harness|winogrande|5": 0, + "all": 0 + }, + "config_tasks": { + "harness|drop": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|drop|3": { + 
"hashes": { + "hash_examples": "1d27416e8324e9a3", + "hash_full_prompts": "a5513ff9a741b385", + "hash_input_tokens": "fa7a2e45b0104bc4", + "hash_cont_tokens": "02cb7da79305c730" + }, + "truncated": 9290, + "non-truncated": 246, + "padded": 0, + "non-padded": 9536, + "effective_few_shots": 3.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "52733972d41ebb11", + "hash_cont_tokens": "ba0062051a65b543" + }, + "truncated": 917, + "non-truncated": 402, + "padded": 0, + "non-padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "84cacac1590bb0a5", + "hash_cont_tokens": "64ca3ed9b5dacc6e" + }, + "truncated": 0, + "non-truncated": 2534, + "padded": 2426, + "non-padded": 108, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "9b4d8993161e637d", + "hash_full_prompts": "08215e527b7e60a5", + "hash_input_tokens": "6b3de69e2f87348c", + "hash_cont_tokens": "f0f2d7e46fd897ce" + }, + "total_evaluation_time_secondes": "17863.835258245468", + "truncated": 10207, + "non-truncated": 3182, + "padded": 2426, + "non-padded": 10963, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/aisquared/dlite-v2-124m/results_2023-07-19T13-53-19.147655.json b/eval-results/aisquared/dlite-v2-124m/results_2023-07-19T13-53-19.147655.json new file mode 100644 index 0000000000000000000000000000000000000000..a0c683f9f93f9af7f00bb12cf77bfcb8243722d4 --- /dev/null +++ b/eval-results/aisquared/dlite-v2-124m/results_2023-07-19T13-53-19.147655.json @@ -0,0 +1,871 @@ +{ + "results": { + "harness|arc:challenge|25": { + "acc": 0.2022184300341297, + "acc_stderr": 0.011737454431872104, + "acc_norm": 0.23976109215017063, + "acc_norm_stderr": 0.012476304127453949 + }, + "harness|hellaswag|10": { + "acc": 0.2916749651463852, + "acc_stderr": 0.004536045368404717, + "acc_norm": 0.31099382593108943, + "acc_norm_stderr": 0.004619542392006394 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.2, + "acc_stderr": 0.04020151261036846, + "acc_norm": 0.2, + "acc_norm_stderr": 0.04020151261036846 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.21481481481481482, + "acc_stderr": 0.03547854198560827, + "acc_norm": 0.21481481481481482, + "acc_norm_stderr": 0.03547854198560827 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.15789473684210525, + "acc_stderr": 0.029674167520101442, + "acc_norm": 0.15789473684210525, + "acc_norm_stderr": 0.029674167520101442 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.15, + "acc_stderr": 0.03588702812826369, + "acc_norm": 0.15, + "acc_norm_stderr": 0.03588702812826369 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.22641509433962265, + "acc_stderr": 0.025757559893106734, + "acc_norm": 0.22641509433962265, + "acc_norm_stderr": 0.025757559893106734 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.2222222222222222, + "acc_stderr": 0.03476590104304134, + "acc_norm": 0.2222222222222222, + "acc_norm_stderr": 0.03476590104304134 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.21, + "acc_stderr": 0.04093601807403326, + "acc_norm": 0.21, + "acc_norm_stderr": 0.04093601807403326 + }, + 
"harness|hendrycksTest-college_computer_science|5": { + "acc": 0.34, + "acc_stderr": 0.04760952285695235, + "acc_norm": 0.34, + "acc_norm_stderr": 0.04760952285695235 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.26, + "acc_stderr": 0.04408440022768079, + "acc_norm": 0.26, + "acc_norm_stderr": 0.04408440022768079 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.2658959537572254, + "acc_stderr": 0.03368762932259431, + "acc_norm": 0.2658959537572254, + "acc_norm_stderr": 0.03368762932259431 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.21568627450980393, + "acc_stderr": 0.040925639582376536, + "acc_norm": 0.21568627450980393, + "acc_norm_stderr": 0.040925639582376536 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.17, + "acc_stderr": 0.03775251680686371, + "acc_norm": 0.17, + "acc_norm_stderr": 0.03775251680686371 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.25957446808510637, + "acc_stderr": 0.028659179374292323, + "acc_norm": 0.25957446808510637, + "acc_norm_stderr": 0.028659179374292323 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.2719298245614035, + "acc_stderr": 0.04185774424022056, + "acc_norm": 0.2719298245614035, + "acc_norm_stderr": 0.04185774424022056 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.2413793103448276, + "acc_stderr": 0.03565998174135302, + "acc_norm": 0.2413793103448276, + "acc_norm_stderr": 0.03565998174135302 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.25396825396825395, + "acc_stderr": 0.022418042891113946, + "acc_norm": 0.25396825396825395, + "acc_norm_stderr": 0.022418042891113946 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.2222222222222222, + "acc_stderr": 0.037184890068181146, + "acc_norm": 0.2222222222222222, + "acc_norm_stderr": 0.037184890068181146 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.21, + "acc_stderr": 0.040936018074033256, + "acc_norm": 0.21, + "acc_norm_stderr": 0.040936018074033256 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.2967741935483871, + "acc_stderr": 0.025988500792411898, + "acc_norm": 0.2967741935483871, + "acc_norm_stderr": 0.025988500792411898 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.29064039408866993, + "acc_stderr": 0.031947400722655395, + "acc_norm": 0.29064039408866993, + "acc_norm_stderr": 0.031947400722655395 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.17, + "acc_stderr": 0.0377525168068637, + "acc_norm": 0.17, + "acc_norm_stderr": 0.0377525168068637 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.17575757575757575, + "acc_stderr": 0.02972094300622445, + "acc_norm": 0.17575757575757575, + "acc_norm_stderr": 0.02972094300622445 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.3383838383838384, + "acc_stderr": 0.033711241426263035, + "acc_norm": 0.3383838383838384, + "acc_norm_stderr": 0.033711241426263035 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.29533678756476683, + "acc_stderr": 0.0329229663915514, + "acc_norm": 0.29533678756476683, + "acc_norm_stderr": 0.0329229663915514 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.3487179487179487, + "acc_stderr": 0.02416278028401772, + "acc_norm": 0.3487179487179487, + "acc_norm_stderr": 0.02416278028401772 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.24074074074074073, + "acc_stderr": 
0.02606715922227581, + "acc_norm": 0.24074074074074073, + "acc_norm_stderr": 0.02606715922227581 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.3487394957983193, + "acc_stderr": 0.03095663632856655, + "acc_norm": 0.3487394957983193, + "acc_norm_stderr": 0.03095663632856655 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.2847682119205298, + "acc_stderr": 0.03684881521389023, + "acc_norm": 0.2847682119205298, + "acc_norm_stderr": 0.03684881521389023 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.3155963302752294, + "acc_stderr": 0.019926117513869662, + "acc_norm": 0.3155963302752294, + "acc_norm_stderr": 0.019926117513869662 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.44907407407407407, + "acc_stderr": 0.03392238405321617, + "acc_norm": 0.44907407407407407, + "acc_norm_stderr": 0.03392238405321617 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.3088235294117647, + "acc_stderr": 0.03242661719827218, + "acc_norm": 0.3088235294117647, + "acc_norm_stderr": 0.03242661719827218 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.24472573839662448, + "acc_stderr": 0.02798569938703643, + "acc_norm": 0.24472573839662448, + "acc_norm_stderr": 0.02798569938703643 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.242152466367713, + "acc_stderr": 0.028751392398694755, + "acc_norm": 0.242152466367713, + "acc_norm_stderr": 0.028751392398694755 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.25190839694656486, + "acc_stderr": 0.03807387116306086, + "acc_norm": 0.25190839694656486, + "acc_norm_stderr": 0.03807387116306086 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.23140495867768596, + "acc_stderr": 0.03849856098794089, + "acc_norm": 0.23140495867768596, + "acc_norm_stderr": 0.03849856098794089 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.18518518518518517, + "acc_stderr": 0.03755265865037181, + "acc_norm": 0.18518518518518517, + "acc_norm_stderr": 0.03755265865037181 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.2392638036809816, + "acc_stderr": 0.033519538795212696, + "acc_norm": 0.2392638036809816, + "acc_norm_stderr": 0.033519538795212696 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.29464285714285715, + "acc_stderr": 0.04327040932578728, + "acc_norm": 0.29464285714285715, + "acc_norm_stderr": 0.04327040932578728 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.22330097087378642, + "acc_stderr": 0.04123553189891431, + "acc_norm": 0.22330097087378642, + "acc_norm_stderr": 0.04123553189891431 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.24786324786324787, + "acc_stderr": 0.028286324075564393, + "acc_norm": 0.24786324786324787, + "acc_norm_stderr": 0.028286324075564393 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.25, + "acc_stderr": 0.04351941398892446, + "acc_norm": 0.25, + "acc_norm_stderr": 0.04351941398892446 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.24393358876117496, + "acc_stderr": 0.015357212665829479, + "acc_norm": 0.24393358876117496, + "acc_norm_stderr": 0.015357212665829479 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.1936416184971098, + "acc_stderr": 0.02127423031751555, + "acc_norm": 0.1936416184971098, + "acc_norm_stderr": 0.02127423031751555 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.24581005586592178, + "acc_stderr": 0.014400296429225627, + "acc_norm": 
0.24581005586592178, + "acc_norm_stderr": 0.014400296429225627 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.24509803921568626, + "acc_stderr": 0.02463004897982478, + "acc_norm": 0.24509803921568626, + "acc_norm_stderr": 0.02463004897982478 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.20257234726688103, + "acc_stderr": 0.022827317491059686, + "acc_norm": 0.20257234726688103, + "acc_norm_stderr": 0.022827317491059686 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.22839506172839505, + "acc_stderr": 0.023358211840626263, + "acc_norm": 0.22839506172839505, + "acc_norm_stderr": 0.023358211840626263 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.2765957446808511, + "acc_stderr": 0.026684564340461004, + "acc_norm": 0.2765957446808511, + "acc_norm_stderr": 0.026684564340461004 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.2503259452411995, + "acc_stderr": 0.01106415102716544, + "acc_norm": 0.2503259452411995, + "acc_norm_stderr": 0.01106415102716544 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.4411764705882353, + "acc_stderr": 0.0301619119307671, + "acc_norm": 0.4411764705882353, + "acc_norm_stderr": 0.0301619119307671 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.26633986928104575, + "acc_stderr": 0.017883188134667192, + "acc_norm": 0.26633986928104575, + "acc_norm_stderr": 0.017883188134667192 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.2, + "acc_stderr": 0.038313051408846034, + "acc_norm": 0.2, + "acc_norm_stderr": 0.038313051408846034 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.2530612244897959, + "acc_stderr": 0.027833023871399697, + "acc_norm": 0.2530612244897959, + "acc_norm_stderr": 0.027833023871399697 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.23880597014925373, + "acc_stderr": 0.030147775935409224, + "acc_norm": 0.23880597014925373, + "acc_norm_stderr": 0.030147775935409224 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.33, + "acc_stderr": 0.04725815626252605, + "acc_norm": 0.33, + "acc_norm_stderr": 0.04725815626252605 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.2289156626506024, + "acc_stderr": 0.03270745277352477, + "acc_norm": 0.2289156626506024, + "acc_norm_stderr": 0.03270745277352477 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.2046783625730994, + "acc_stderr": 0.030944459778533214, + "acc_norm": 0.2046783625730994, + "acc_norm_stderr": 0.030944459778533214 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.23133414932680538, + "mc1_stderr": 0.01476194517486268, + "mc2": 0.3898403177629661, + "mc2_stderr": 0.01480538503346897 + }, + "all": { + "acc": 0.2526957165636249, + "acc_stderr": 0.031214276729820828, + "acc_norm": 0.25365947118906124, + "acc_norm_stderr": 0.031228214809806993, + "mc1": 0.23133414932680538, + "mc1_stderr": 0.01476194517486268, + "mc2": 0.3898403177629661, + "mc2_stderr": 0.01480538503346897 + } + }, + "versions": { + "harness|arc:challenge|25": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + 
"harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "all": 0 + }, + "config": { + "model_name": "aisquared/dlite-v2-124m", + "model_sha": "bc719f990748ea72be4b6c270df34fc3d37291dc", + "model_dtype": "torch.float16", + "lighteval_sha": "43cff840721bd0214adb4e29236a5e2ca1813937", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null + }, + "task_config": { + "harness|arc:challenge": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + 
"harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task" + }, + "hashes": { + "harness|arc:challenge|25": { + "hash_examples": "fb8c51b1872daeda", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "e641be907f06d33d", + "hash_cont_tokens": "c6e2e25e2b25a621" + }, + "harness|hellaswag|10": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "faab28c8a52792fc", + "hash_cont_tokens": "8ad5f1a3e4068f36" + }, + "harness|hendrycksTest-abstract_algebra|5": { + 
"hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "38f6980885e34dfd", + "hash_cont_tokens": "844bd0bf669e8136" + }, + "harness|hendrycksTest-anatomy|5": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "3ed9431cd09b2a53", + "hash_cont_tokens": "aa3ffb1a6e4356f5" + }, + "harness|hendrycksTest-astronomy|5": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "a79fd75ecff4dacc", + "hash_cont_tokens": "ca7527d5bdfd389a" + }, + "harness|hendrycksTest-business_ethics|5": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "178d5666661bf5e1", + "hash_cont_tokens": "08a1fa6c8dde9a82" + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "c926698f7ff06973", + "hash_cont_tokens": "cd61f7de0830a75a" + }, + "harness|hendrycksTest-college_biology|5": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "242f772c5e78312a", + "hash_cont_tokens": "b0c14ed86adbcb56" + }, + "harness|hendrycksTest-college_chemistry|5": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "8502d8627d2d7aad", + "hash_cont_tokens": "844bd0bf669e8136" + }, + "harness|hendrycksTest-college_computer_science|5": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "a0d705ea2c235707", + "hash_cont_tokens": "3cf1924b14cbf906" + }, + "harness|hendrycksTest-college_mathematics|5": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "ff09ef7f164943cd", + "hash_cont_tokens": "d09bf08193410dfa" + }, + "harness|hendrycksTest-college_medicine|5": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "aca3949388066394", + "hash_cont_tokens": "62bb469d2a319d91" + }, + "harness|hendrycksTest-college_physics|5": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "c4240f372187f487", + "hash_cont_tokens": "bf103c9a1f61ec12" + }, + "harness|hendrycksTest-computer_security|5": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "70a866a1c6ae11ae", + "hash_cont_tokens": "844bd0bf669e8136" + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "29b68a5b3f3afa5f", + "hash_cont_tokens": "ff5ca3d84bb47a0b" + }, + "harness|hendrycksTest-econometrics|5": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "a4a0fc579875cdf9", + "hash_cont_tokens": "f3ed369e135c0e74" + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "e1c0ec634eb17ebd", + "hash_cont_tokens": "35bf6c0c1a7ee403" + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "542453ad0f99dacf", + "hash_cont_tokens": "e69647d0f0359a4e" + }, + "harness|hendrycksTest-formal_logic|5": { + "hash_examples": "5a6525665f63ea72", + 
"hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "dacff0458f665ef2", + "hash_cont_tokens": "2ef491ecaa0b411b" + }, + "harness|hendrycksTest-global_facts|5": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "61dec75d557c2e93", + "hash_cont_tokens": "844bd0bf669e8136" + }, + "harness|hendrycksTest-high_school_biology|5": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "d0afdf91820cacc8", + "hash_cont_tokens": "2f65e8345a68d860" + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "75cd47b5490da17b", + "hash_cont_tokens": "c3deabee1deab3a3" + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "e369e98a1d0a7424", + "hash_cont_tokens": "ec161287ac6222f4" + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "502376958174bf81", + "hash_cont_tokens": "c4f2565ca36881d5" + }, + "harness|hendrycksTest-high_school_geography|5": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "a4866b51f8a7a60e", + "hash_cont_tokens": "780e569058de22be" + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "90f755f89d9fdf5e", + "hash_cont_tokens": "9da45062757ae791" + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "fb590ff6d9d11883", + "hash_cont_tokens": "8f5c8baf02161f10" + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "551dbc75535ad2b8", + "hash_cont_tokens": "fdea101837ab4409" + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "d86fdf5706ec717c", + "hash_cont_tokens": "985403b262df21a4" + }, + "harness|hendrycksTest-high_school_physics|5": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "a81bca26abd92c41", + "hash_cont_tokens": "56be0c12b78c81a3" + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "9c10077b5cda495b", + "hash_cont_tokens": "f524cf6fe64b2a7f" + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "da0c215d66d16d3e", + "hash_cont_tokens": "421b3dc903711e3d" + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "4885a382517deebf", + "hash_cont_tokens": "eab825cf8fbdd085" + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "c1d80e899c4c8872", + "hash_cont_tokens": "e1610a0b694e7b3a" + }, + 
"harness|hendrycksTest-human_aging|5": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "39da19ee58ce07e6", + "hash_cont_tokens": "38eafdb22e9fca11" + }, + "harness|hendrycksTest-human_sexuality|5": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "f7e0441ab1c223e0", + "hash_cont_tokens": "11de075f88fc7cd2" + }, + "harness|hendrycksTest-international_law|5": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "119859c5b8103d0b", + "hash_cont_tokens": "0229c63f045574c2" + }, + "harness|hendrycksTest-jurisprudence|5": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "6ec4910e741606cb", + "hash_cont_tokens": "5c77c6f472688075" + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "96d8b2554f777e3a", + "hash_cont_tokens": "25a46284b3589e0d" + }, + "harness|hendrycksTest-machine_learning|5": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "249811a7d891a411", + "hash_cont_tokens": "d11f2c877fe691dc" + }, + "harness|hendrycksTest-management|5": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "e54df495ffeb4f92", + "hash_cont_tokens": "d37808f586a9e9b5" + }, + "harness|hendrycksTest-marketing|5": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "e9110fe64f420eb5", + "hash_cont_tokens": "95faf210efa02f90" + }, + "harness|hendrycksTest-medical_genetics|5": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "743df5701590c1c5", + "hash_cont_tokens": "844bd0bf669e8136" + }, + "harness|hendrycksTest-miscellaneous|5": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "4a20a40ea36bad2d", + "hash_cont_tokens": "ef1ae838a09a7521" + }, + "harness|hendrycksTest-moral_disputes|5": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "10886977e5516586", + "hash_cont_tokens": "05c35d0e7dd2c7d4" + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "66f56ab7c3b9d662", + "hash_cont_tokens": "f1e9e326e9540108" + }, + "harness|hendrycksTest-nutrition|5": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "c05c54560499ea35", + "hash_cont_tokens": "027ac34198453c9e" + }, + "harness|hendrycksTest-philosophy|5": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "9639c3d92ff98a28", + "hash_cont_tokens": "dddff9925c9b675a" + }, + "harness|hendrycksTest-prehistory|5": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "91e98834c3a8d8d9", + "hash_cont_tokens": "030e5bb46551865c" + }, + "harness|hendrycksTest-professional_accounting|5": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "569fa47691c73088", + "hash_cont_tokens": "42b23299e8bae480" + }, + "harness|hendrycksTest-professional_law|5": { + "hash_examples": "cd9dbc52b3c932d6", + 
"hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "d93d397bd5db1db6", + "hash_cont_tokens": "a2de48df0afbaff7" + }, + "harness|hendrycksTest-professional_medicine|5": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "7f8acbbde12cfb6b", + "hash_cont_tokens": "33dc7eccd5de31ae" + }, + "harness|hendrycksTest-professional_psychology|5": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "3aa766c029099569", + "hash_cont_tokens": "2a666dc39f1f52ac" + }, + "harness|hendrycksTest-public_relations|5": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "87b924f88832986f", + "hash_cont_tokens": "cf3600a50782c6c5" + }, + "harness|hendrycksTest-security_studies|5": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "1aaa84da588878a6", + "hash_cont_tokens": "2e9916279a4cae95" + }, + "harness|hendrycksTest-sociology|5": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "fb555df6139eb2c8", + "hash_cont_tokens": "555f7a55738bbf37" + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "56cf1eebb25eccb1", + "hash_cont_tokens": "844bd0bf669e8136" + }, + "harness|hendrycksTest-virology|5": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "c6affac16ec860be", + "hash_cont_tokens": "30d4fa4828c5468f" + }, + "harness|hendrycksTest-world_religions|5": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "d2c5da5a69a6312e", + "hash_cont_tokens": "984061eb58124367" + }, + "harness|truthfulqa:mc|0": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "21ee2f46c9c3649e", + "hash_cont_tokens": "f41d0880e9a23f4e" + } + } +} \ No newline at end of file diff --git a/eval-results/aisquared/dlite-v2-124m/results_2023-10-27T09-27-20.533537.json b/eval-results/aisquared/dlite-v2-124m/results_2023-10-27T09-27-20.533537.json new file mode 100644 index 0000000000000000000000000000000000000000..cbf2d54b7962148460d5c739b146ef6035961c2d --- /dev/null +++ b/eval-results/aisquared/dlite-v2-124m/results_2023-10-27T09-27-20.533537.json @@ -0,0 +1,107 @@ +{ + "config_general": { + "model_name": "aisquared/dlite-v2-124m", + "model_sha": "bc719f990748ea72be4b6c270df34fc3d37291dc", + "model_size": "238.85 MB", + "model_dtype": "torch.float16", + "lighteval_sha": "0f318ecf002208468154899217b3ba7c6ae09374", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|drop|3": { + "em": 0.0050335570469798654, + "em_stderr": 0.0007247385547751906, + "f1": 0.05289324664429539, + "f1_stderr": 0.001460860471625635 + }, + "harness|gsm8k|5": { + "acc": 0.0, + "acc_stderr": 0.0 + }, + "harness|winogrande|5": { + "acc": 0.5043409629044988, + "acc_stderr": 0.014051956064076892 + }, + "all": { + "em": 0.0050335570469798654, + "em_stderr": 0.0007247385547751906, + "f1": 0.05289324664429539, + "f1_stderr": 0.001460860471625635, + "acc": 0.2521704814522494, + "acc_stderr": 0.007025978032038446 + } + }, + "versions": { + "harness|drop|3": 1, + "harness|gsm8k|5": 0, + "harness|winogrande|5": 0, + "all": 0 + }, 
+ "config_tasks": { + "harness|drop": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|drop|3": { + "hashes": { + "hash_examples": "1d27416e8324e9a3", + "hash_full_prompts": "a5513ff9a741b385", + "hash_input_tokens": "fa7a2e45b0104bc4", + "hash_cont_tokens": "bf61a7c21d9ed7af" + }, + "truncated": 9290, + "non-truncated": 246, + "padded": 0, + "non-padded": 9536, + "effective_few_shots": 3.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "52733972d41ebb11", + "hash_cont_tokens": "683fee6b08788061" + }, + "truncated": 917, + "non-truncated": 402, + "padded": 0, + "non-padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "84cacac1590bb0a5", + "hash_cont_tokens": "64ca3ed9b5dacc6e" + }, + "truncated": 0, + "non-truncated": 2534, + "padded": 2426, + "non-padded": 108, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "9b4d8993161e637d", + "hash_full_prompts": "08215e527b7e60a5", + "hash_input_tokens": "6b3de69e2f87348c", + "hash_cont_tokens": "8882ca9d833687d6" + }, + "total_evaluation_time_secondes": "4724.538892507553", + "truncated": 10207, + "non-truncated": 3182, + "padded": 2426, + "non-padded": 10963, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/aisquared/dlite-v2-1_5b/results_2023-07-18T11-15-41.059925.json b/eval-results/aisquared/dlite-v2-1_5b/results_2023-07-18T11-15-41.059925.json new file mode 100644 index 0000000000000000000000000000000000000000..1f3865792a50a1d0826c56d0b57da3ad38904657 --- /dev/null +++ b/eval-results/aisquared/dlite-v2-1_5b/results_2023-07-18T11-15-41.059925.json @@ -0,0 +1,871 @@ +{ + "results": { + "harness|arc:challenge|25": { + "acc": 0.2986348122866894, + "acc_stderr": 0.013374078615068738, + "acc_norm": 0.32593856655290104, + "acc_norm_stderr": 0.01369743246669324 + }, + "harness|hellaswag|10": { + "acc": 0.4215295757817168, + "acc_stderr": 0.0049279480614860685, + "acc_norm": 0.5398327026488747, + "acc_norm_stderr": 0.004973922192982227 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.24, + "acc_stderr": 0.04292346959909284, + "acc_norm": 0.24, + "acc_norm_stderr": 0.04292346959909284 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.23703703703703705, + "acc_stderr": 0.03673731683969506, + "acc_norm": 0.23703703703703705, + "acc_norm_stderr": 0.03673731683969506 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.17763157894736842, + "acc_stderr": 0.031103182383123387, + "acc_norm": 0.17763157894736842, + "acc_norm_stderr": 0.031103182383123387 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.28, + "acc_stderr": 0.04512608598542127, + "acc_norm": 0.28, + "acc_norm_stderr": 0.04512608598542127 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.27547169811320754, + "acc_stderr": 0.027495663683724077, + "acc_norm": 0.27547169811320754, + "acc_norm_stderr": 0.027495663683724077 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.2847222222222222, + "acc_stderr": 0.037738099906869334, + "acc_norm": 0.2847222222222222, + "acc_norm_stderr": 0.037738099906869334 + }, + 
"harness|hendrycksTest-college_chemistry|5": { + "acc": 0.24, + "acc_stderr": 0.04292346959909283, + "acc_norm": 0.24, + "acc_norm_stderr": 0.04292346959909283 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.2, + "acc_stderr": 0.04020151261036844, + "acc_norm": 0.2, + "acc_norm_stderr": 0.04020151261036844 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.3, + "acc_stderr": 0.046056618647183814, + "acc_norm": 0.3, + "acc_norm_stderr": 0.046056618647183814 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.3468208092485549, + "acc_stderr": 0.036291466701596636, + "acc_norm": 0.3468208092485549, + "acc_norm_stderr": 0.036291466701596636 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.20588235294117646, + "acc_stderr": 0.04023382273617749, + "acc_norm": 0.20588235294117646, + "acc_norm_stderr": 0.04023382273617749 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.3, + "acc_stderr": 0.046056618647183814, + "acc_norm": 0.3, + "acc_norm_stderr": 0.046056618647183814 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.31063829787234043, + "acc_stderr": 0.03025123757921317, + "acc_norm": 0.31063829787234043, + "acc_norm_stderr": 0.03025123757921317 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.18421052631578946, + "acc_stderr": 0.03646758875075566, + "acc_norm": 0.18421052631578946, + "acc_norm_stderr": 0.03646758875075566 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.2, + "acc_stderr": 0.0333333333333333, + "acc_norm": 0.2, + "acc_norm_stderr": 0.0333333333333333 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.2275132275132275, + "acc_stderr": 0.021591269407823778, + "acc_norm": 0.2275132275132275, + "acc_norm_stderr": 0.021591269407823778 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.3333333333333333, + "acc_stderr": 0.04216370213557835, + "acc_norm": 0.3333333333333333, + "acc_norm_stderr": 0.04216370213557835 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.3, + "acc_stderr": 0.046056618647183814, + "acc_norm": 0.3, + "acc_norm_stderr": 0.046056618647183814 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.22903225806451613, + "acc_stderr": 0.023904914311782648, + "acc_norm": 0.22903225806451613, + "acc_norm_stderr": 0.023904914311782648 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.21182266009852216, + "acc_stderr": 0.028748983689941054, + "acc_norm": 0.21182266009852216, + "acc_norm_stderr": 0.028748983689941054 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.2, + "acc_stderr": 0.04020151261036844, + "acc_norm": 0.2, + "acc_norm_stderr": 0.04020151261036844 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.24242424242424243, + "acc_stderr": 0.03346409881055953, + "acc_norm": 0.24242424242424243, + "acc_norm_stderr": 0.03346409881055953 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.20707070707070707, + "acc_stderr": 0.028869778460267063, + "acc_norm": 0.20707070707070707, + "acc_norm_stderr": 0.028869778460267063 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.21243523316062177, + "acc_stderr": 0.029519282616817247, + "acc_norm": 0.21243523316062177, + "acc_norm_stderr": 0.029519282616817247 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.26153846153846155, + "acc_stderr": 0.022282141204204412, + "acc_norm": 0.26153846153846155, + 
"acc_norm_stderr": 0.022282141204204412 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.24444444444444444, + "acc_stderr": 0.026202766534652155, + "acc_norm": 0.24444444444444444, + "acc_norm_stderr": 0.026202766534652155 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.19747899159663865, + "acc_stderr": 0.025859164122051463, + "acc_norm": 0.19747899159663865, + "acc_norm_stderr": 0.025859164122051463 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.2052980132450331, + "acc_stderr": 0.03297986648473836, + "acc_norm": 0.2052980132450331, + "acc_norm_stderr": 0.03297986648473836 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.28807339449541286, + "acc_stderr": 0.019416445892636018, + "acc_norm": 0.28807339449541286, + "acc_norm_stderr": 0.019416445892636018 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.2638888888888889, + "acc_stderr": 0.030058202704309846, + "acc_norm": 0.2638888888888889, + "acc_norm_stderr": 0.030058202704309846 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.24509803921568626, + "acc_stderr": 0.03019028245350195, + "acc_norm": 0.24509803921568626, + "acc_norm_stderr": 0.03019028245350195 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.25316455696202533, + "acc_stderr": 0.028304657943035307, + "acc_norm": 0.25316455696202533, + "acc_norm_stderr": 0.028304657943035307 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.22869955156950672, + "acc_stderr": 0.028188240046929193, + "acc_norm": 0.22869955156950672, + "acc_norm_stderr": 0.028188240046929193 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.20610687022900764, + "acc_stderr": 0.03547771004159465, + "acc_norm": 0.20610687022900764, + "acc_norm_stderr": 0.03547771004159465 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.2066115702479339, + "acc_stderr": 0.03695980128098824, + "acc_norm": 0.2066115702479339, + "acc_norm_stderr": 0.03695980128098824 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.2777777777777778, + "acc_stderr": 0.04330043749650742, + "acc_norm": 0.2777777777777778, + "acc_norm_stderr": 0.04330043749650742 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.24539877300613497, + "acc_stderr": 0.03380939813943354, + "acc_norm": 0.24539877300613497, + "acc_norm_stderr": 0.03380939813943354 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.26785714285714285, + "acc_stderr": 0.04203277291467764, + "acc_norm": 0.26785714285714285, + "acc_norm_stderr": 0.04203277291467764 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.3106796116504854, + "acc_stderr": 0.0458212416016155, + "acc_norm": 0.3106796116504854, + "acc_norm_stderr": 0.0458212416016155 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.29914529914529914, + "acc_stderr": 0.029996951858349483, + "acc_norm": 0.29914529914529914, + "acc_norm_stderr": 0.029996951858349483 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.24, + "acc_stderr": 0.04292346959909283, + "acc_norm": 0.24, + "acc_norm_stderr": 0.04292346959909283 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.2656449553001277, + "acc_stderr": 0.015794302487888715, + "acc_norm": 0.2656449553001277, + "acc_norm_stderr": 0.015794302487888715 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.23410404624277456, + "acc_stderr": 0.022797110278071145, + "acc_norm": 0.23410404624277456, + "acc_norm_stderr": 
0.022797110278071145 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.2424581005586592, + "acc_stderr": 0.014333522059217889, + "acc_norm": 0.2424581005586592, + "acc_norm_stderr": 0.014333522059217889 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.20915032679738563, + "acc_stderr": 0.023287685312334813, + "acc_norm": 0.20915032679738563, + "acc_norm_stderr": 0.023287685312334813 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.18971061093247588, + "acc_stderr": 0.022268196258783225, + "acc_norm": 0.18971061093247588, + "acc_norm_stderr": 0.022268196258783225 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.24691358024691357, + "acc_stderr": 0.023993501709042103, + "acc_norm": 0.24691358024691357, + "acc_norm_stderr": 0.023993501709042103 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.24468085106382978, + "acc_stderr": 0.025645553622266722, + "acc_norm": 0.24468085106382978, + "acc_norm_stderr": 0.025645553622266722 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.242503259452412, + "acc_stderr": 0.01094657096634877, + "acc_norm": 0.242503259452412, + "acc_norm_stderr": 0.01094657096634877 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.2426470588235294, + "acc_stderr": 0.026040662474201268, + "acc_norm": 0.2426470588235294, + "acc_norm_stderr": 0.026040662474201268 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.24836601307189543, + "acc_stderr": 0.017479487001364764, + "acc_norm": 0.24836601307189543, + "acc_norm_stderr": 0.017479487001364764 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.36363636363636365, + "acc_stderr": 0.04607582090719976, + "acc_norm": 0.36363636363636365, + "acc_norm_stderr": 0.04607582090719976 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.17142857142857143, + "acc_stderr": 0.02412746346265015, + "acc_norm": 0.17142857142857143, + "acc_norm_stderr": 0.02412746346265015 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.23880597014925373, + "acc_stderr": 0.030147775935409217, + "acc_norm": 0.23880597014925373, + "acc_norm_stderr": 0.030147775935409217 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.2, + "acc_stderr": 0.04020151261036845, + "acc_norm": 0.2, + "acc_norm_stderr": 0.04020151261036845 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.3614457831325301, + "acc_stderr": 0.0374005938202932, + "acc_norm": 0.3614457831325301, + "acc_norm_stderr": 0.0374005938202932 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.3216374269005848, + "acc_stderr": 0.03582529442573122, + "acc_norm": 0.3216374269005848, + "acc_norm_stderr": 0.03582529442573122 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.2252141982864137, + "mc1_stderr": 0.014623240768023493, + "mc2": 0.38769255348817355, + "mc2_stderr": 0.015048478254328103 + }, + "all": { + "acc": 0.25306109961082135, + "acc_stderr": 0.03145644539015587, + "acc_norm": 0.25552901285037, + "acc_norm_stderr": 0.03146270518647995, + "mc1": 0.2252141982864137, + "mc1_stderr": 0.014623240768023493, + "mc2": 0.38769255348817355, + "mc2_stderr": 0.015048478254328103 + } + }, + "versions": { + "harness|arc:challenge|25": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + 
"harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "all": 0 + }, + "config": { + "model_name": "aisquared/dlite-v2-1_5b", + "model_sha": "97440ff1b6ef749423758e3495cdce1b5e68ee92", + "model_dtype": "torch.float16", + "lighteval_sha": "43cff840721bd0214adb4e29236a5e2ca1813937", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null + }, + "task_config": { + "harness|arc:challenge": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness 
task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task" + }, + "hashes": { + "harness|arc:challenge|25": { + "hash_examples": "fb8c51b1872daeda", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "e641be907f06d33d", + "hash_cont_tokens": "c6e2e25e2b25a621" + }, + "harness|hellaswag|10": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + 
"hash_input_tokens": "faab28c8a52792fc", + "hash_cont_tokens": "8ad5f1a3e4068f36" + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "38f6980885e34dfd", + "hash_cont_tokens": "844bd0bf669e8136" + }, + "harness|hendrycksTest-anatomy|5": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "3ed9431cd09b2a53", + "hash_cont_tokens": "aa3ffb1a6e4356f5" + }, + "harness|hendrycksTest-astronomy|5": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "a79fd75ecff4dacc", + "hash_cont_tokens": "ca7527d5bdfd389a" + }, + "harness|hendrycksTest-business_ethics|5": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "178d5666661bf5e1", + "hash_cont_tokens": "08a1fa6c8dde9a82" + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "c926698f7ff06973", + "hash_cont_tokens": "cd61f7de0830a75a" + }, + "harness|hendrycksTest-college_biology|5": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "242f772c5e78312a", + "hash_cont_tokens": "b0c14ed86adbcb56" + }, + "harness|hendrycksTest-college_chemistry|5": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "8502d8627d2d7aad", + "hash_cont_tokens": "844bd0bf669e8136" + }, + "harness|hendrycksTest-college_computer_science|5": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "a0d705ea2c235707", + "hash_cont_tokens": "3cf1924b14cbf906" + }, + "harness|hendrycksTest-college_mathematics|5": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "ff09ef7f164943cd", + "hash_cont_tokens": "d09bf08193410dfa" + }, + "harness|hendrycksTest-college_medicine|5": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "aca3949388066394", + "hash_cont_tokens": "62bb469d2a319d91" + }, + "harness|hendrycksTest-college_physics|5": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "c4240f372187f487", + "hash_cont_tokens": "bf103c9a1f61ec12" + }, + "harness|hendrycksTest-computer_security|5": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "70a866a1c6ae11ae", + "hash_cont_tokens": "844bd0bf669e8136" + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "29b68a5b3f3afa5f", + "hash_cont_tokens": "ff5ca3d84bb47a0b" + }, + "harness|hendrycksTest-econometrics|5": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "a4a0fc579875cdf9", + "hash_cont_tokens": "f3ed369e135c0e74" + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "e1c0ec634eb17ebd", + "hash_cont_tokens": "35bf6c0c1a7ee403" + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "542453ad0f99dacf", + 
"hash_cont_tokens": "e69647d0f0359a4e" + }, + "harness|hendrycksTest-formal_logic|5": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "dacff0458f665ef2", + "hash_cont_tokens": "2ef491ecaa0b411b" + }, + "harness|hendrycksTest-global_facts|5": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "61dec75d557c2e93", + "hash_cont_tokens": "844bd0bf669e8136" + }, + "harness|hendrycksTest-high_school_biology|5": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "d0afdf91820cacc8", + "hash_cont_tokens": "2f65e8345a68d860" + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "75cd47b5490da17b", + "hash_cont_tokens": "c3deabee1deab3a3" + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "e369e98a1d0a7424", + "hash_cont_tokens": "ec161287ac6222f4" + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "502376958174bf81", + "hash_cont_tokens": "c4f2565ca36881d5" + }, + "harness|hendrycksTest-high_school_geography|5": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "a4866b51f8a7a60e", + "hash_cont_tokens": "780e569058de22be" + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "90f755f89d9fdf5e", + "hash_cont_tokens": "9da45062757ae791" + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "fb590ff6d9d11883", + "hash_cont_tokens": "8f5c8baf02161f10" + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "551dbc75535ad2b8", + "hash_cont_tokens": "fdea101837ab4409" + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "d86fdf5706ec717c", + "hash_cont_tokens": "985403b262df21a4" + }, + "harness|hendrycksTest-high_school_physics|5": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "a81bca26abd92c41", + "hash_cont_tokens": "56be0c12b78c81a3" + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "9c10077b5cda495b", + "hash_cont_tokens": "f524cf6fe64b2a7f" + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "da0c215d66d16d3e", + "hash_cont_tokens": "421b3dc903711e3d" + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "4885a382517deebf", + "hash_cont_tokens": "eab825cf8fbdd085" + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": 
"11845057459afd72", + "hash_input_tokens": "c1d80e899c4c8872", + "hash_cont_tokens": "e1610a0b694e7b3a" + }, + "harness|hendrycksTest-human_aging|5": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "39da19ee58ce07e6", + "hash_cont_tokens": "38eafdb22e9fca11" + }, + "harness|hendrycksTest-human_sexuality|5": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "f7e0441ab1c223e0", + "hash_cont_tokens": "11de075f88fc7cd2" + }, + "harness|hendrycksTest-international_law|5": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "119859c5b8103d0b", + "hash_cont_tokens": "0229c63f045574c2" + }, + "harness|hendrycksTest-jurisprudence|5": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "6ec4910e741606cb", + "hash_cont_tokens": "5c77c6f472688075" + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "96d8b2554f777e3a", + "hash_cont_tokens": "25a46284b3589e0d" + }, + "harness|hendrycksTest-machine_learning|5": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "249811a7d891a411", + "hash_cont_tokens": "d11f2c877fe691dc" + }, + "harness|hendrycksTest-management|5": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "e54df495ffeb4f92", + "hash_cont_tokens": "d37808f586a9e9b5" + }, + "harness|hendrycksTest-marketing|5": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "e9110fe64f420eb5", + "hash_cont_tokens": "95faf210efa02f90" + }, + "harness|hendrycksTest-medical_genetics|5": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "743df5701590c1c5", + "hash_cont_tokens": "844bd0bf669e8136" + }, + "harness|hendrycksTest-miscellaneous|5": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "4a20a40ea36bad2d", + "hash_cont_tokens": "ef1ae838a09a7521" + }, + "harness|hendrycksTest-moral_disputes|5": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "10886977e5516586", + "hash_cont_tokens": "05c35d0e7dd2c7d4" + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "66f56ab7c3b9d662", + "hash_cont_tokens": "f1e9e326e9540108" + }, + "harness|hendrycksTest-nutrition|5": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "c05c54560499ea35", + "hash_cont_tokens": "027ac34198453c9e" + }, + "harness|hendrycksTest-philosophy|5": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "9639c3d92ff98a28", + "hash_cont_tokens": "dddff9925c9b675a" + }, + "harness|hendrycksTest-prehistory|5": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "91e98834c3a8d8d9", + "hash_cont_tokens": "030e5bb46551865c" + }, + "harness|hendrycksTest-professional_accounting|5": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "569fa47691c73088", + "hash_cont_tokens": 
"42b23299e8bae480" + }, + "harness|hendrycksTest-professional_law|5": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "d93d397bd5db1db6", + "hash_cont_tokens": "a2de48df0afbaff7" + }, + "harness|hendrycksTest-professional_medicine|5": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "7f8acbbde12cfb6b", + "hash_cont_tokens": "33dc7eccd5de31ae" + }, + "harness|hendrycksTest-professional_psychology|5": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "3aa766c029099569", + "hash_cont_tokens": "2a666dc39f1f52ac" + }, + "harness|hendrycksTest-public_relations|5": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "87b924f88832986f", + "hash_cont_tokens": "cf3600a50782c6c5" + }, + "harness|hendrycksTest-security_studies|5": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "1aaa84da588878a6", + "hash_cont_tokens": "2e9916279a4cae95" + }, + "harness|hendrycksTest-sociology|5": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "fb555df6139eb2c8", + "hash_cont_tokens": "555f7a55738bbf37" + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "56cf1eebb25eccb1", + "hash_cont_tokens": "844bd0bf669e8136" + }, + "harness|hendrycksTest-virology|5": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "c6affac16ec860be", + "hash_cont_tokens": "30d4fa4828c5468f" + }, + "harness|hendrycksTest-world_religions|5": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "d2c5da5a69a6312e", + "hash_cont_tokens": "984061eb58124367" + }, + "harness|truthfulqa:mc|0": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "21ee2f46c9c3649e", + "hash_cont_tokens": "f41d0880e9a23f4e" + } + } +} \ No newline at end of file diff --git a/eval-results/aisquared/dlite-v2-1_5b/results_2023-10-17T07-28-24.104795.json b/eval-results/aisquared/dlite-v2-1_5b/results_2023-10-17T07-28-24.104795.json new file mode 100644 index 0000000000000000000000000000000000000000..ea106cb2e820de2c3c32290c11bc75b3bdbd2dcf --- /dev/null +++ b/eval-results/aisquared/dlite-v2-1_5b/results_2023-10-17T07-28-24.104795.json @@ -0,0 +1,107 @@ +{ + "config_general": { + "model_name": "aisquared/dlite-v2-1_5b", + "model_sha": "97440ff1b6ef749423758e3495cdce1b5e68ee92", + "model_size": "2.91 GB", + "model_dtype": "torch.float16", + "lighteval_sha": "0f318ecf002208468154899217b3ba7c6ae09374", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|drop|3": { + "em": 0.0019924496644295304, + "em_stderr": 0.0004566676462667015, + "f1": 0.0503942953020135, + "f1_stderr": 0.0012335220693783073 + }, + "harness|gsm8k|5": { + "acc": 0.002274450341167551, + "acc_stderr": 0.0013121578148674103 + }, + "harness|winogrande|5": { + "acc": 0.5469613259668509, + "acc_stderr": 0.013990366632148104 + }, + "all": { + "em": 0.0019924496644295304, + "em_stderr": 0.0004566676462667015, + "f1": 0.0503942953020135, + "f1_stderr": 0.0012335220693783073, + "acc": 0.2746178881540092, + 
"acc_stderr": 0.007651262223507757 + } + }, + "versions": { + "harness|drop|3": 1, + "harness|gsm8k|5": 0, + "harness|winogrande|5": 0, + "all": 0 + }, + "config_tasks": { + "harness|drop": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|drop|3": { + "hashes": { + "hash_examples": "1d27416e8324e9a3", + "hash_full_prompts": "a5513ff9a741b385", + "hash_input_tokens": "fa7a2e45b0104bc4", + "hash_cont_tokens": "e86c72b8aa73e0f0" + }, + "truncated": 9290, + "non-truncated": 246, + "padded": 0, + "non-padded": 9536, + "effective_few_shots": 3.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "52733972d41ebb11", + "hash_cont_tokens": "a2ac757ae11e55cb" + }, + "truncated": 917, + "non-truncated": 402, + "padded": 0, + "non-padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "84cacac1590bb0a5", + "hash_cont_tokens": "64ca3ed9b5dacc6e" + }, + "truncated": 0, + "non-truncated": 2534, + "padded": 2426, + "non-padded": 108, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "9b4d8993161e637d", + "hash_full_prompts": "08215e527b7e60a5", + "hash_input_tokens": "6b3de69e2f87348c", + "hash_cont_tokens": "6da433d7110e9dcc" + }, + "total_evaluation_time_secondes": "16652.98338842392", + "truncated": 10207, + "non-truncated": 3182, + "padded": 2426, + "non-padded": 10963, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/aisquared/dlite-v2-355m/results_2023-07-19T14-14-13.332045.json b/eval-results/aisquared/dlite-v2-355m/results_2023-07-19T14-14-13.332045.json new file mode 100644 index 0000000000000000000000000000000000000000..a6150158b1a46da10b93c8fcf4a600ec8bd66357 --- /dev/null +++ b/eval-results/aisquared/dlite-v2-355m/results_2023-07-19T14-14-13.332045.json @@ -0,0 +1,871 @@ +{ + "results": { + "harness|arc:challenge|25": { + "acc": 0.24914675767918087, + "acc_stderr": 0.012639407111926442, + "acc_norm": 0.2832764505119454, + "acc_norm_stderr": 0.013167478735134575 + }, + "harness|hellaswag|10": { + "acc": 0.3370842461661024, + "acc_stderr": 0.004717478335689617, + "acc_norm": 0.4053973312089225, + "acc_norm_stderr": 0.004899653704032843 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.26, + "acc_stderr": 0.04408440022768081, + "acc_norm": 0.26, + "acc_norm_stderr": 0.04408440022768081 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.2740740740740741, + "acc_stderr": 0.038532548365520024, + "acc_norm": 0.2740740740740741, + "acc_norm_stderr": 0.038532548365520024 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.3223684210526316, + "acc_stderr": 0.038035102483515854, + "acc_norm": 0.3223684210526316, + "acc_norm_stderr": 0.038035102483515854 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.19, + "acc_stderr": 0.03942772444036624, + "acc_norm": 0.19, + "acc_norm_stderr": 0.03942772444036624 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.3018867924528302, + "acc_stderr": 0.02825420034443867, + "acc_norm": 0.3018867924528302, + "acc_norm_stderr": 0.02825420034443867 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 
0.2777777777777778, + "acc_stderr": 0.03745554791462458, + "acc_norm": 0.2777777777777778, + "acc_norm_stderr": 0.03745554791462458 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.23, + "acc_stderr": 0.04229525846816505, + "acc_norm": 0.23, + "acc_norm_stderr": 0.04229525846816505 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.35, + "acc_stderr": 0.047937248544110196, + "acc_norm": 0.35, + "acc_norm_stderr": 0.047937248544110196 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.2, + "acc_stderr": 0.04020151261036845, + "acc_norm": 0.2, + "acc_norm_stderr": 0.04020151261036845 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.2658959537572254, + "acc_stderr": 0.03368762932259431, + "acc_norm": 0.2658959537572254, + "acc_norm_stderr": 0.03368762932259431 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.21568627450980393, + "acc_stderr": 0.04092563958237654, + "acc_norm": 0.21568627450980393, + "acc_norm_stderr": 0.04092563958237654 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.27, + "acc_stderr": 0.044619604333847394, + "acc_norm": 0.27, + "acc_norm_stderr": 0.044619604333847394 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.22127659574468084, + "acc_stderr": 0.02713634960242406, + "acc_norm": 0.22127659574468084, + "acc_norm_stderr": 0.02713634960242406 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.2807017543859649, + "acc_stderr": 0.042270544512322004, + "acc_norm": 0.2807017543859649, + "acc_norm_stderr": 0.042270544512322004 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.25517241379310346, + "acc_stderr": 0.03632984052707842, + "acc_norm": 0.25517241379310346, + "acc_norm_stderr": 0.03632984052707842 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.2830687830687831, + "acc_stderr": 0.023201392938194978, + "acc_norm": 0.2830687830687831, + "acc_norm_stderr": 0.023201392938194978 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.23015873015873015, + "acc_stderr": 0.037649508797906045, + "acc_norm": 0.23015873015873015, + "acc_norm_stderr": 0.037649508797906045 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.33, + "acc_stderr": 0.047258156262526045, + "acc_norm": 0.33, + "acc_norm_stderr": 0.047258156262526045 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.25806451612903225, + "acc_stderr": 0.02489246917246284, + "acc_norm": 0.25806451612903225, + "acc_norm_stderr": 0.02489246917246284 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.2955665024630542, + "acc_stderr": 0.032104944337514575, + "acc_norm": 0.2955665024630542, + "acc_norm_stderr": 0.032104944337514575 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.19, + "acc_stderr": 0.03942772444036623, + "acc_norm": 0.19, + "acc_norm_stderr": 0.03942772444036623 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.24848484848484848, + "acc_stderr": 0.03374402644139405, + "acc_norm": 0.24848484848484848, + "acc_norm_stderr": 0.03374402644139405 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.3383838383838384, + "acc_stderr": 0.03371124142626303, + "acc_norm": 0.3383838383838384, + "acc_norm_stderr": 0.03371124142626303 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.34196891191709844, + "acc_stderr": 0.03423465100104281, + "acc_norm": 0.34196891191709844, + "acc_norm_stderr": 0.03423465100104281 + 
}, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.37435897435897436, + "acc_stderr": 0.024537591572830517, + "acc_norm": 0.37435897435897436, + "acc_norm_stderr": 0.024537591572830517 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.2777777777777778, + "acc_stderr": 0.027309140588230193, + "acc_norm": 0.2777777777777778, + "acc_norm_stderr": 0.027309140588230193 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.3319327731092437, + "acc_stderr": 0.030588697013783663, + "acc_norm": 0.3319327731092437, + "acc_norm_stderr": 0.030588697013783663 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.2913907284768212, + "acc_stderr": 0.03710185726119995, + "acc_norm": 0.2913907284768212, + "acc_norm_stderr": 0.03710185726119995 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.3431192660550459, + "acc_stderr": 0.02035477773608604, + "acc_norm": 0.3431192660550459, + "acc_norm_stderr": 0.02035477773608604 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.4398148148148148, + "acc_stderr": 0.033851779760448106, + "acc_norm": 0.4398148148148148, + "acc_norm_stderr": 0.033851779760448106 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.25, + "acc_stderr": 0.03039153369274154, + "acc_norm": 0.25, + "acc_norm_stderr": 0.03039153369274154 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.23628691983122363, + "acc_stderr": 0.02765215314415928, + "acc_norm": 0.23628691983122363, + "acc_norm_stderr": 0.02765215314415928 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.16591928251121077, + "acc_stderr": 0.02496755319654713, + "acc_norm": 0.16591928251121077, + "acc_norm_stderr": 0.02496755319654713 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.29770992366412213, + "acc_stderr": 0.040103589424622034, + "acc_norm": 0.29770992366412213, + "acc_norm_stderr": 0.040103589424622034 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.1322314049586777, + "acc_stderr": 0.030922788320445826, + "acc_norm": 0.1322314049586777, + "acc_norm_stderr": 0.030922788320445826 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.25, + "acc_stderr": 0.04186091791394607, + "acc_norm": 0.25, + "acc_norm_stderr": 0.04186091791394607 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.2822085889570552, + "acc_stderr": 0.03536117886664742, + "acc_norm": 0.2822085889570552, + "acc_norm_stderr": 0.03536117886664742 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.16964285714285715, + "acc_stderr": 0.0356236785009539, + "acc_norm": 0.16964285714285715, + "acc_norm_stderr": 0.0356236785009539 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.3300970873786408, + "acc_stderr": 0.0465614711001235, + "acc_norm": 0.3300970873786408, + "acc_norm_stderr": 0.0465614711001235 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.2222222222222222, + "acc_stderr": 0.027236013946196687, + "acc_norm": 0.2222222222222222, + "acc_norm_stderr": 0.027236013946196687 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.18, + "acc_stderr": 0.03861229196653695, + "acc_norm": 0.18, + "acc_norm_stderr": 0.03861229196653695 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.2554278416347382, + "acc_stderr": 0.015594955384455758, + "acc_norm": 0.2554278416347382, + "acc_norm_stderr": 0.015594955384455758 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.24855491329479767, + 
"acc_stderr": 0.023267528432100174, + "acc_norm": 0.24855491329479767, + "acc_norm_stderr": 0.023267528432100174 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.23910614525139665, + "acc_stderr": 0.01426555419233116, + "acc_norm": 0.23910614525139665, + "acc_norm_stderr": 0.01426555419233116 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.29411764705882354, + "acc_stderr": 0.026090162504279063, + "acc_norm": 0.29411764705882354, + "acc_norm_stderr": 0.026090162504279063 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.2540192926045016, + "acc_stderr": 0.024723861504771696, + "acc_norm": 0.2540192926045016, + "acc_norm_stderr": 0.024723861504771696 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.23765432098765432, + "acc_stderr": 0.023683591837008543, + "acc_norm": 0.23765432098765432, + "acc_norm_stderr": 0.023683591837008543 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.25177304964539005, + "acc_stderr": 0.0258921511567094, + "acc_norm": 0.25177304964539005, + "acc_norm_stderr": 0.0258921511567094 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.23598435462842243, + "acc_stderr": 0.010844802669662666, + "acc_norm": 0.23598435462842243, + "acc_norm_stderr": 0.010844802669662666 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.45955882352941174, + "acc_stderr": 0.030273325077345748, + "acc_norm": 0.45955882352941174, + "acc_norm_stderr": 0.030273325077345748 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.238562091503268, + "acc_stderr": 0.017242385828779613, + "acc_norm": 0.238562091503268, + "acc_norm_stderr": 0.017242385828779613 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.16363636363636364, + "acc_stderr": 0.035434330542986774, + "acc_norm": 0.16363636363636364, + "acc_norm_stderr": 0.035434330542986774 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.3836734693877551, + "acc_stderr": 0.031130880396235936, + "acc_norm": 0.3836734693877551, + "acc_norm_stderr": 0.031130880396235936 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.208955223880597, + "acc_stderr": 0.028748298931728665, + "acc_norm": 0.208955223880597, + "acc_norm_stderr": 0.028748298931728665 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.24, + "acc_stderr": 0.042923469599092816, + "acc_norm": 0.24, + "acc_norm_stderr": 0.042923469599092816 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.21686746987951808, + "acc_stderr": 0.03208284450356365, + "acc_norm": 0.21686746987951808, + "acc_norm_stderr": 0.03208284450356365 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.32748538011695905, + "acc_stderr": 0.035993357714560276, + "acc_norm": 0.32748538011695905, + "acc_norm_stderr": 0.035993357714560276 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.23133414932680538, + "mc1_stderr": 0.014761945174862678, + "mc2": 0.3876043911419601, + "mc2_stderr": 0.014383244996439622 + }, + "all": { + "acc": 0.2685907830558822, + "acc_stderr": 0.03179665535298017, + "acc_norm": 0.2703271013249599, + "acc_norm_stderr": 0.03180869343758274, + "mc1": 0.23133414932680538, + "mc1_stderr": 0.014761945174862678, + "mc2": 0.3876043911419601, + "mc2_stderr": 0.014383244996439622 + } + }, + "versions": { + "harness|arc:challenge|25": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + 
"harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "all": 0 + }, + "config": { + "model_name": "aisquared/dlite-v2-355m", + "model_sha": "f51d310aebc16a9fe0d999d2a437b5faff635716", + "model_dtype": "torch.float16", + "lighteval_sha": "43cff840721bd0214adb4e29236a5e2ca1813937", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null + }, + "task_config": { + "harness|arc:challenge": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + 
"harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task" + }, + "hashes": { + "harness|arc:challenge|25": { + "hash_examples": "fb8c51b1872daeda", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "e641be907f06d33d", + "hash_cont_tokens": 
"c6e2e25e2b25a621" + }, + "harness|hellaswag|10": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "faab28c8a52792fc", + "hash_cont_tokens": "8ad5f1a3e4068f36" + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "38f6980885e34dfd", + "hash_cont_tokens": "844bd0bf669e8136" + }, + "harness|hendrycksTest-anatomy|5": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "3ed9431cd09b2a53", + "hash_cont_tokens": "aa3ffb1a6e4356f5" + }, + "harness|hendrycksTest-astronomy|5": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "a79fd75ecff4dacc", + "hash_cont_tokens": "ca7527d5bdfd389a" + }, + "harness|hendrycksTest-business_ethics|5": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "178d5666661bf5e1", + "hash_cont_tokens": "08a1fa6c8dde9a82" + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "c926698f7ff06973", + "hash_cont_tokens": "cd61f7de0830a75a" + }, + "harness|hendrycksTest-college_biology|5": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "242f772c5e78312a", + "hash_cont_tokens": "b0c14ed86adbcb56" + }, + "harness|hendrycksTest-college_chemistry|5": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "8502d8627d2d7aad", + "hash_cont_tokens": "844bd0bf669e8136" + }, + "harness|hendrycksTest-college_computer_science|5": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "a0d705ea2c235707", + "hash_cont_tokens": "3cf1924b14cbf906" + }, + "harness|hendrycksTest-college_mathematics|5": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "ff09ef7f164943cd", + "hash_cont_tokens": "d09bf08193410dfa" + }, + "harness|hendrycksTest-college_medicine|5": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "aca3949388066394", + "hash_cont_tokens": "62bb469d2a319d91" + }, + "harness|hendrycksTest-college_physics|5": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "c4240f372187f487", + "hash_cont_tokens": "bf103c9a1f61ec12" + }, + "harness|hendrycksTest-computer_security|5": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "70a866a1c6ae11ae", + "hash_cont_tokens": "844bd0bf669e8136" + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "29b68a5b3f3afa5f", + "hash_cont_tokens": "ff5ca3d84bb47a0b" + }, + "harness|hendrycksTest-econometrics|5": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "a4a0fc579875cdf9", + "hash_cont_tokens": "f3ed369e135c0e74" + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "e1c0ec634eb17ebd", + "hash_cont_tokens": "35bf6c0c1a7ee403" + }, + 
"harness|hendrycksTest-elementary_mathematics|5": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "542453ad0f99dacf", + "hash_cont_tokens": "e69647d0f0359a4e" + }, + "harness|hendrycksTest-formal_logic|5": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "dacff0458f665ef2", + "hash_cont_tokens": "2ef491ecaa0b411b" + }, + "harness|hendrycksTest-global_facts|5": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "61dec75d557c2e93", + "hash_cont_tokens": "844bd0bf669e8136" + }, + "harness|hendrycksTest-high_school_biology|5": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "d0afdf91820cacc8", + "hash_cont_tokens": "2f65e8345a68d860" + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "75cd47b5490da17b", + "hash_cont_tokens": "c3deabee1deab3a3" + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "e369e98a1d0a7424", + "hash_cont_tokens": "ec161287ac6222f4" + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "502376958174bf81", + "hash_cont_tokens": "c4f2565ca36881d5" + }, + "harness|hendrycksTest-high_school_geography|5": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "a4866b51f8a7a60e", + "hash_cont_tokens": "780e569058de22be" + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "90f755f89d9fdf5e", + "hash_cont_tokens": "9da45062757ae791" + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "fb590ff6d9d11883", + "hash_cont_tokens": "8f5c8baf02161f10" + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "551dbc75535ad2b8", + "hash_cont_tokens": "fdea101837ab4409" + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "d86fdf5706ec717c", + "hash_cont_tokens": "985403b262df21a4" + }, + "harness|hendrycksTest-high_school_physics|5": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "a81bca26abd92c41", + "hash_cont_tokens": "56be0c12b78c81a3" + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "9c10077b5cda495b", + "hash_cont_tokens": "f524cf6fe64b2a7f" + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "da0c215d66d16d3e", + "hash_cont_tokens": "421b3dc903711e3d" + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": 
"4885a382517deebf", + "hash_cont_tokens": "eab825cf8fbdd085" + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "c1d80e899c4c8872", + "hash_cont_tokens": "e1610a0b694e7b3a" + }, + "harness|hendrycksTest-human_aging|5": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "39da19ee58ce07e6", + "hash_cont_tokens": "38eafdb22e9fca11" + }, + "harness|hendrycksTest-human_sexuality|5": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "f7e0441ab1c223e0", + "hash_cont_tokens": "11de075f88fc7cd2" + }, + "harness|hendrycksTest-international_law|5": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "119859c5b8103d0b", + "hash_cont_tokens": "0229c63f045574c2" + }, + "harness|hendrycksTest-jurisprudence|5": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "6ec4910e741606cb", + "hash_cont_tokens": "5c77c6f472688075" + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "96d8b2554f777e3a", + "hash_cont_tokens": "25a46284b3589e0d" + }, + "harness|hendrycksTest-machine_learning|5": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "249811a7d891a411", + "hash_cont_tokens": "d11f2c877fe691dc" + }, + "harness|hendrycksTest-management|5": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "e54df495ffeb4f92", + "hash_cont_tokens": "d37808f586a9e9b5" + }, + "harness|hendrycksTest-marketing|5": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "e9110fe64f420eb5", + "hash_cont_tokens": "95faf210efa02f90" + }, + "harness|hendrycksTest-medical_genetics|5": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "743df5701590c1c5", + "hash_cont_tokens": "844bd0bf669e8136" + }, + "harness|hendrycksTest-miscellaneous|5": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "4a20a40ea36bad2d", + "hash_cont_tokens": "ef1ae838a09a7521" + }, + "harness|hendrycksTest-moral_disputes|5": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "10886977e5516586", + "hash_cont_tokens": "05c35d0e7dd2c7d4" + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "66f56ab7c3b9d662", + "hash_cont_tokens": "f1e9e326e9540108" + }, + "harness|hendrycksTest-nutrition|5": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "c05c54560499ea35", + "hash_cont_tokens": "027ac34198453c9e" + }, + "harness|hendrycksTest-philosophy|5": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "9639c3d92ff98a28", + "hash_cont_tokens": "dddff9925c9b675a" + }, + "harness|hendrycksTest-prehistory|5": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "91e98834c3a8d8d9", + "hash_cont_tokens": "030e5bb46551865c" + }, + 
"harness|hendrycksTest-professional_accounting|5": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "569fa47691c73088", + "hash_cont_tokens": "42b23299e8bae480" + }, + "harness|hendrycksTest-professional_law|5": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "d93d397bd5db1db6", + "hash_cont_tokens": "a2de48df0afbaff7" + }, + "harness|hendrycksTest-professional_medicine|5": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "7f8acbbde12cfb6b", + "hash_cont_tokens": "33dc7eccd5de31ae" + }, + "harness|hendrycksTest-professional_psychology|5": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "3aa766c029099569", + "hash_cont_tokens": "2a666dc39f1f52ac" + }, + "harness|hendrycksTest-public_relations|5": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "87b924f88832986f", + "hash_cont_tokens": "cf3600a50782c6c5" + }, + "harness|hendrycksTest-security_studies|5": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "1aaa84da588878a6", + "hash_cont_tokens": "2e9916279a4cae95" + }, + "harness|hendrycksTest-sociology|5": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "fb555df6139eb2c8", + "hash_cont_tokens": "555f7a55738bbf37" + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "56cf1eebb25eccb1", + "hash_cont_tokens": "844bd0bf669e8136" + }, + "harness|hendrycksTest-virology|5": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "c6affac16ec860be", + "hash_cont_tokens": "30d4fa4828c5468f" + }, + "harness|hendrycksTest-world_religions|5": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "d2c5da5a69a6312e", + "hash_cont_tokens": "984061eb58124367" + }, + "harness|truthfulqa:mc|0": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "21ee2f46c9c3649e", + "hash_cont_tokens": "f41d0880e9a23f4e" + } + } +} \ No newline at end of file diff --git a/eval-results/aisquared/dlite-v2-355m/results_2023-10-15T23-07-25.491864.json b/eval-results/aisquared/dlite-v2-355m/results_2023-10-15T23-07-25.491864.json new file mode 100644 index 0000000000000000000000000000000000000000..2701f6c32c3325d6ae988888b8ff5732d622abe6 --- /dev/null +++ b/eval-results/aisquared/dlite-v2-355m/results_2023-10-15T23-07-25.491864.json @@ -0,0 +1,107 @@ +{ + "config_general": { + "model_name": "aisquared/dlite-v2-355m", + "model_sha": "f51d310aebc16a9fe0d999d2a437b5faff635716", + "model_size": "679.78 MB", + "model_dtype": "torch.float16", + "lighteval_sha": "0f318ecf002208468154899217b3ba7c6ae09374", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|drop|3": { + "em": 0.001572986577181208, + "em_stderr": 0.000405845113241774, + "f1": 0.055305159395973226, + "f1_stderr": 0.001369522078512369 + }, + "harness|gsm8k|5": { + "acc": 0.0, + "acc_stderr": 0.0 + }, + "harness|winogrande|5": { + "acc": 0.5280189423835833, + "acc_stderr": 0.014030404213405784 + }, + "all": { + 
"em": 0.001572986577181208, + "em_stderr": 0.000405845113241774, + "f1": 0.055305159395973226, + "f1_stderr": 0.001369522078512369, + "acc": 0.26400947119179163, + "acc_stderr": 0.007015202106702892 + } + }, + "versions": { + "harness|drop|3": 1, + "harness|gsm8k|5": 0, + "harness|winogrande|5": 0, + "all": 0 + }, + "config_tasks": { + "harness|drop": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|drop|3": { + "hashes": { + "hash_examples": "1d27416e8324e9a3", + "hash_full_prompts": "a5513ff9a741b385", + "hash_input_tokens": "fa7a2e45b0104bc4", + "hash_cont_tokens": "8191d8c97b46596d" + }, + "truncated": 9290, + "non-truncated": 246, + "padded": 0, + "non-padded": 9536, + "effective_few_shots": 3.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "52733972d41ebb11", + "hash_cont_tokens": "f38a7f704b3d53cc" + }, + "truncated": 917, + "non-truncated": 402, + "padded": 0, + "non-padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "84cacac1590bb0a5", + "hash_cont_tokens": "64ca3ed9b5dacc6e" + }, + "truncated": 0, + "non-truncated": 2534, + "padded": 2426, + "non-padded": 108, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "9b4d8993161e637d", + "hash_full_prompts": "08215e527b7e60a5", + "hash_input_tokens": "6b3de69e2f87348c", + "hash_cont_tokens": "ec78292a39aa13d9" + }, + "total_evaluation_time_secondes": "9593.669703006744", + "truncated": 10207, + "non-truncated": 3182, + "padded": 2426, + "non-padded": 10963, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/aisquared/dlite-v2-774m/results_2023-07-19T14-27-10.189986.json b/eval-results/aisquared/dlite-v2-774m/results_2023-07-19T14-27-10.189986.json new file mode 100644 index 0000000000000000000000000000000000000000..631babebf136bc012ec51762b13a25979b3a05f5 --- /dev/null +++ b/eval-results/aisquared/dlite-v2-774m/results_2023-07-19T14-27-10.189986.json @@ -0,0 +1,871 @@ +{ + "results": { + "harness|arc:challenge|25": { + "acc": 0.27986348122866894, + "acc_stderr": 0.013119040897725922, + "acc_norm": 0.30119453924914674, + "acc_norm_stderr": 0.013406741767847626 + }, + "harness|hellaswag|10": { + "acc": 0.3754232224656443, + "acc_stderr": 0.004832423630593185, + "acc_norm": 0.47679745070703045, + "acc_norm_stderr": 0.004984405935541093 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.26, + "acc_stderr": 0.04408440022768081, + "acc_norm": 0.26, + "acc_norm_stderr": 0.04408440022768081 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.3333333333333333, + "acc_stderr": 0.04072314811876837, + "acc_norm": 0.3333333333333333, + "acc_norm_stderr": 0.04072314811876837 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.29605263157894735, + "acc_stderr": 0.037150621549989056, + "acc_norm": 0.29605263157894735, + "acc_norm_stderr": 0.037150621549989056 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.23, + "acc_stderr": 0.04229525846816506, + "acc_norm": 0.23, + "acc_norm_stderr": 0.04229525846816506 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.2641509433962264, + "acc_stderr": 
0.02713429162874172, + "acc_norm": 0.2641509433962264, + "acc_norm_stderr": 0.02713429162874172 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.2569444444444444, + "acc_stderr": 0.03653946969442099, + "acc_norm": 0.2569444444444444, + "acc_norm_stderr": 0.03653946969442099 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.18, + "acc_stderr": 0.03861229196653694, + "acc_norm": 0.18, + "acc_norm_stderr": 0.03861229196653694 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.26, + "acc_stderr": 0.0440844002276808, + "acc_norm": 0.26, + "acc_norm_stderr": 0.0440844002276808 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.25, + "acc_stderr": 0.04351941398892446, + "acc_norm": 0.25, + "acc_norm_stderr": 0.04351941398892446 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.24277456647398843, + "acc_stderr": 0.0326926380614177, + "acc_norm": 0.24277456647398843, + "acc_norm_stderr": 0.0326926380614177 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.21568627450980393, + "acc_stderr": 0.040925639582376556, + "acc_norm": 0.21568627450980393, + "acc_norm_stderr": 0.040925639582376556 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.29, + "acc_stderr": 0.045604802157206845, + "acc_norm": 0.29, + "acc_norm_stderr": 0.045604802157206845 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.2851063829787234, + "acc_stderr": 0.029513196625539355, + "acc_norm": 0.2851063829787234, + "acc_norm_stderr": 0.029513196625539355 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.24561403508771928, + "acc_stderr": 0.0404933929774814, + "acc_norm": 0.24561403508771928, + "acc_norm_stderr": 0.0404933929774814 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.27586206896551724, + "acc_stderr": 0.037245636197746325, + "acc_norm": 0.27586206896551724, + "acc_norm_stderr": 0.037245636197746325 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.2724867724867725, + "acc_stderr": 0.02293097307163335, + "acc_norm": 0.2724867724867725, + "acc_norm_stderr": 0.02293097307163335 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.15079365079365079, + "acc_stderr": 0.03200686497287392, + "acc_norm": 0.15079365079365079, + "acc_norm_stderr": 0.03200686497287392 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.34, + "acc_stderr": 0.047609522856952365, + "acc_norm": 0.34, + "acc_norm_stderr": 0.047609522856952365 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.24193548387096775, + "acc_stderr": 0.0243625996930311, + "acc_norm": 0.24193548387096775, + "acc_norm_stderr": 0.0243625996930311 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.3103448275862069, + "acc_stderr": 0.032550867699701024, + "acc_norm": 0.3103448275862069, + "acc_norm_stderr": 0.032550867699701024 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.33, + "acc_stderr": 0.047258156262526045, + "acc_norm": 0.33, + "acc_norm_stderr": 0.047258156262526045 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.28484848484848485, + "acc_stderr": 0.035243908445117836, + "acc_norm": 0.28484848484848485, + "acc_norm_stderr": 0.035243908445117836 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.2222222222222222, + "acc_stderr": 0.02962022787479049, + "acc_norm": 0.2222222222222222, + "acc_norm_stderr": 0.02962022787479049 + }, + 
"harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.22797927461139897, + "acc_stderr": 0.030276909945178256, + "acc_norm": 0.22797927461139897, + "acc_norm_stderr": 0.030276909945178256 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.21025641025641026, + "acc_stderr": 0.020660597485026924, + "acc_norm": 0.21025641025641026, + "acc_norm_stderr": 0.020660597485026924 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.26296296296296295, + "acc_stderr": 0.02684205787383371, + "acc_norm": 0.26296296296296295, + "acc_norm_stderr": 0.02684205787383371 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.21428571428571427, + "acc_stderr": 0.026653531596715477, + "acc_norm": 0.21428571428571427, + "acc_norm_stderr": 0.026653531596715477 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.271523178807947, + "acc_stderr": 0.03631329803969653, + "acc_norm": 0.271523178807947, + "acc_norm_stderr": 0.03631329803969653 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.22018348623853212, + "acc_stderr": 0.017765978652327576, + "acc_norm": 0.22018348623853212, + "acc_norm_stderr": 0.017765978652327576 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.2222222222222222, + "acc_stderr": 0.028353212866863445, + "acc_norm": 0.2222222222222222, + "acc_norm_stderr": 0.028353212866863445 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.25980392156862747, + "acc_stderr": 0.030778554678693264, + "acc_norm": 0.25980392156862747, + "acc_norm_stderr": 0.030778554678693264 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.2616033755274262, + "acc_stderr": 0.028609516716994934, + "acc_norm": 0.2616033755274262, + "acc_norm_stderr": 0.028609516716994934 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.20179372197309417, + "acc_stderr": 0.026936111912802273, + "acc_norm": 0.20179372197309417, + "acc_norm_stderr": 0.026936111912802273 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.2366412213740458, + "acc_stderr": 0.03727673575596919, + "acc_norm": 0.2366412213740458, + "acc_norm_stderr": 0.03727673575596919 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.2809917355371901, + "acc_stderr": 0.04103203830514512, + "acc_norm": 0.2809917355371901, + "acc_norm_stderr": 0.04103203830514512 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.23148148148148148, + "acc_stderr": 0.04077494709252626, + "acc_norm": 0.23148148148148148, + "acc_norm_stderr": 0.04077494709252626 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.3006134969325153, + "acc_stderr": 0.03602511318806771, + "acc_norm": 0.3006134969325153, + "acc_norm_stderr": 0.03602511318806771 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.2857142857142857, + "acc_stderr": 0.042878587513404544, + "acc_norm": 0.2857142857142857, + "acc_norm_stderr": 0.042878587513404544 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.23300970873786409, + "acc_stderr": 0.04185832598928315, + "acc_norm": 0.23300970873786409, + "acc_norm_stderr": 0.04185832598928315 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.2564102564102564, + "acc_stderr": 0.02860595370200425, + "acc_norm": 0.2564102564102564, + "acc_norm_stderr": 0.02860595370200425 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.21, + "acc_stderr": 0.040936018074033256, + "acc_norm": 0.21, + "acc_norm_stderr": 0.040936018074033256 + }, 
+ "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.2707535121328225, + "acc_stderr": 0.01588988836256049, + "acc_norm": 0.2707535121328225, + "acc_norm_stderr": 0.01588988836256049 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.2947976878612717, + "acc_stderr": 0.02454761779480383, + "acc_norm": 0.2947976878612717, + "acc_norm_stderr": 0.02454761779480383 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.24692737430167597, + "acc_stderr": 0.014422292204808835, + "acc_norm": 0.24692737430167597, + "acc_norm_stderr": 0.014422292204808835 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.25163398692810457, + "acc_stderr": 0.024848018263875195, + "acc_norm": 0.25163398692810457, + "acc_norm_stderr": 0.024848018263875195 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.3054662379421222, + "acc_stderr": 0.026160584450140488, + "acc_norm": 0.3054662379421222, + "acc_norm_stderr": 0.026160584450140488 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.2716049382716049, + "acc_stderr": 0.02474862449053737, + "acc_norm": 0.2716049382716049, + "acc_norm_stderr": 0.02474862449053737 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.2695035460992908, + "acc_stderr": 0.026469036818590638, + "acc_norm": 0.2695035460992908, + "acc_norm_stderr": 0.026469036818590638 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.2627118644067797, + "acc_stderr": 0.011240545514995676, + "acc_norm": 0.2627118644067797, + "acc_norm_stderr": 0.011240545514995676 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.16911764705882354, + "acc_stderr": 0.02277086801011303, + "acc_norm": 0.16911764705882354, + "acc_norm_stderr": 0.02277086801011303 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.2777777777777778, + "acc_stderr": 0.018120224251484587, + "acc_norm": 0.2777777777777778, + "acc_norm_stderr": 0.018120224251484587 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.20909090909090908, + "acc_stderr": 0.038950910157241364, + "acc_norm": 0.20909090909090908, + "acc_norm_stderr": 0.038950910157241364 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.21224489795918366, + "acc_stderr": 0.026176967197866767, + "acc_norm": 0.21224489795918366, + "acc_norm_stderr": 0.026176967197866767 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.24378109452736318, + "acc_stderr": 0.03036049015401468, + "acc_norm": 0.24378109452736318, + "acc_norm_stderr": 0.03036049015401468 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.24, + "acc_stderr": 0.04292346959909284, + "acc_norm": 0.24, + "acc_norm_stderr": 0.04292346959909284 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.22289156626506024, + "acc_stderr": 0.032400048255946876, + "acc_norm": 0.22289156626506024, + "acc_norm_stderr": 0.032400048255946876 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.28654970760233917, + "acc_stderr": 0.034678266857038266, + "acc_norm": 0.28654970760233917, + "acc_norm_stderr": 0.034678266857038266 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.2178702570379437, + "mc1_stderr": 0.014450846714123892, + "mc2": 0.399983906781129, + "mc2_stderr": 0.015084392931329125 + }, + "all": { + "acc": 0.2561995259176005, + "acc_stderr": 0.031651500485581324, + "acc_norm": 0.25827927653220834, + "acc_norm_stderr": 0.03165895274278589, + "mc1": 0.2178702570379437, + "mc1_stderr": 0.014450846714123892, + "mc2": 0.399983906781129, + "mc2_stderr": 0.015084392931329125 + } + }, 
+ "versions": { + "harness|arc:challenge|25": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "all": 0 + }, + "config": { + "model_name": "aisquared/dlite-v2-774m", + "model_sha": "0ea894a33e491912cd1a65dde47b4af03f03c4f2", + "model_dtype": "torch.float16", + "lighteval_sha": "43cff840721bd0214adb4e29236a5e2ca1813937", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null + }, + "task_config": { + "harness|arc:challenge": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + 
"harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + 
"harness|truthfulqa:mc": "LM Harness task" + }, + "hashes": { + "harness|arc:challenge|25": { + "hash_examples": "fb8c51b1872daeda", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "e641be907f06d33d", + "hash_cont_tokens": "c6e2e25e2b25a621" + }, + "harness|hellaswag|10": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "faab28c8a52792fc", + "hash_cont_tokens": "8ad5f1a3e4068f36" + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "38f6980885e34dfd", + "hash_cont_tokens": "844bd0bf669e8136" + }, + "harness|hendrycksTest-anatomy|5": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "3ed9431cd09b2a53", + "hash_cont_tokens": "aa3ffb1a6e4356f5" + }, + "harness|hendrycksTest-astronomy|5": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "a79fd75ecff4dacc", + "hash_cont_tokens": "ca7527d5bdfd389a" + }, + "harness|hendrycksTest-business_ethics|5": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "178d5666661bf5e1", + "hash_cont_tokens": "08a1fa6c8dde9a82" + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "c926698f7ff06973", + "hash_cont_tokens": "cd61f7de0830a75a" + }, + "harness|hendrycksTest-college_biology|5": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "242f772c5e78312a", + "hash_cont_tokens": "b0c14ed86adbcb56" + }, + "harness|hendrycksTest-college_chemistry|5": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "8502d8627d2d7aad", + "hash_cont_tokens": "844bd0bf669e8136" + }, + "harness|hendrycksTest-college_computer_science|5": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "a0d705ea2c235707", + "hash_cont_tokens": "3cf1924b14cbf906" + }, + "harness|hendrycksTest-college_mathematics|5": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "ff09ef7f164943cd", + "hash_cont_tokens": "d09bf08193410dfa" + }, + "harness|hendrycksTest-college_medicine|5": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "aca3949388066394", + "hash_cont_tokens": "62bb469d2a319d91" + }, + "harness|hendrycksTest-college_physics|5": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "c4240f372187f487", + "hash_cont_tokens": "bf103c9a1f61ec12" + }, + "harness|hendrycksTest-computer_security|5": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "70a866a1c6ae11ae", + "hash_cont_tokens": "844bd0bf669e8136" + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "29b68a5b3f3afa5f", + "hash_cont_tokens": "ff5ca3d84bb47a0b" + }, + "harness|hendrycksTest-econometrics|5": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "a4a0fc579875cdf9", + "hash_cont_tokens": "f3ed369e135c0e74" + }, + 
"harness|hendrycksTest-electrical_engineering|5": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "e1c0ec634eb17ebd", + "hash_cont_tokens": "35bf6c0c1a7ee403" + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "542453ad0f99dacf", + "hash_cont_tokens": "e69647d0f0359a4e" + }, + "harness|hendrycksTest-formal_logic|5": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "dacff0458f665ef2", + "hash_cont_tokens": "2ef491ecaa0b411b" + }, + "harness|hendrycksTest-global_facts|5": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "61dec75d557c2e93", + "hash_cont_tokens": "844bd0bf669e8136" + }, + "harness|hendrycksTest-high_school_biology|5": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "d0afdf91820cacc8", + "hash_cont_tokens": "2f65e8345a68d860" + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "75cd47b5490da17b", + "hash_cont_tokens": "c3deabee1deab3a3" + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "e369e98a1d0a7424", + "hash_cont_tokens": "ec161287ac6222f4" + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "502376958174bf81", + "hash_cont_tokens": "c4f2565ca36881d5" + }, + "harness|hendrycksTest-high_school_geography|5": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "a4866b51f8a7a60e", + "hash_cont_tokens": "780e569058de22be" + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "90f755f89d9fdf5e", + "hash_cont_tokens": "9da45062757ae791" + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "fb590ff6d9d11883", + "hash_cont_tokens": "8f5c8baf02161f10" + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "551dbc75535ad2b8", + "hash_cont_tokens": "fdea101837ab4409" + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "d86fdf5706ec717c", + "hash_cont_tokens": "985403b262df21a4" + }, + "harness|hendrycksTest-high_school_physics|5": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "a81bca26abd92c41", + "hash_cont_tokens": "56be0c12b78c81a3" + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "9c10077b5cda495b", + "hash_cont_tokens": "f524cf6fe64b2a7f" + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": 
"da0c215d66d16d3e", + "hash_cont_tokens": "421b3dc903711e3d" + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "4885a382517deebf", + "hash_cont_tokens": "eab825cf8fbdd085" + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "c1d80e899c4c8872", + "hash_cont_tokens": "e1610a0b694e7b3a" + }, + "harness|hendrycksTest-human_aging|5": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "39da19ee58ce07e6", + "hash_cont_tokens": "38eafdb22e9fca11" + }, + "harness|hendrycksTest-human_sexuality|5": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "f7e0441ab1c223e0", + "hash_cont_tokens": "11de075f88fc7cd2" + }, + "harness|hendrycksTest-international_law|5": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "119859c5b8103d0b", + "hash_cont_tokens": "0229c63f045574c2" + }, + "harness|hendrycksTest-jurisprudence|5": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "6ec4910e741606cb", + "hash_cont_tokens": "5c77c6f472688075" + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "96d8b2554f777e3a", + "hash_cont_tokens": "25a46284b3589e0d" + }, + "harness|hendrycksTest-machine_learning|5": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "249811a7d891a411", + "hash_cont_tokens": "d11f2c877fe691dc" + }, + "harness|hendrycksTest-management|5": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "e54df495ffeb4f92", + "hash_cont_tokens": "d37808f586a9e9b5" + }, + "harness|hendrycksTest-marketing|5": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "e9110fe64f420eb5", + "hash_cont_tokens": "95faf210efa02f90" + }, + "harness|hendrycksTest-medical_genetics|5": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "743df5701590c1c5", + "hash_cont_tokens": "844bd0bf669e8136" + }, + "harness|hendrycksTest-miscellaneous|5": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "4a20a40ea36bad2d", + "hash_cont_tokens": "ef1ae838a09a7521" + }, + "harness|hendrycksTest-moral_disputes|5": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "10886977e5516586", + "hash_cont_tokens": "05c35d0e7dd2c7d4" + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "66f56ab7c3b9d662", + "hash_cont_tokens": "f1e9e326e9540108" + }, + "harness|hendrycksTest-nutrition|5": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "c05c54560499ea35", + "hash_cont_tokens": "027ac34198453c9e" + }, + "harness|hendrycksTest-philosophy|5": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "9639c3d92ff98a28", + "hash_cont_tokens": "dddff9925c9b675a" + }, + 
"harness|hendrycksTest-prehistory|5": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "91e98834c3a8d8d9", + "hash_cont_tokens": "030e5bb46551865c" + }, + "harness|hendrycksTest-professional_accounting|5": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "569fa47691c73088", + "hash_cont_tokens": "42b23299e8bae480" + }, + "harness|hendrycksTest-professional_law|5": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "d93d397bd5db1db6", + "hash_cont_tokens": "a2de48df0afbaff7" + }, + "harness|hendrycksTest-professional_medicine|5": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "7f8acbbde12cfb6b", + "hash_cont_tokens": "33dc7eccd5de31ae" + }, + "harness|hendrycksTest-professional_psychology|5": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "3aa766c029099569", + "hash_cont_tokens": "2a666dc39f1f52ac" + }, + "harness|hendrycksTest-public_relations|5": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "87b924f88832986f", + "hash_cont_tokens": "cf3600a50782c6c5" + }, + "harness|hendrycksTest-security_studies|5": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "1aaa84da588878a6", + "hash_cont_tokens": "2e9916279a4cae95" + }, + "harness|hendrycksTest-sociology|5": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "fb555df6139eb2c8", + "hash_cont_tokens": "555f7a55738bbf37" + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "56cf1eebb25eccb1", + "hash_cont_tokens": "844bd0bf669e8136" + }, + "harness|hendrycksTest-virology|5": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "c6affac16ec860be", + "hash_cont_tokens": "30d4fa4828c5468f" + }, + "harness|hendrycksTest-world_religions|5": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "d2c5da5a69a6312e", + "hash_cont_tokens": "984061eb58124367" + }, + "harness|truthfulqa:mc|0": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "21ee2f46c9c3649e", + "hash_cont_tokens": "f41d0880e9a23f4e" + } + } +} \ No newline at end of file diff --git a/eval-results/aisquared/dlite-v2-774m/results_2023-10-13T06-47-53.119042.json b/eval-results/aisquared/dlite-v2-774m/results_2023-10-13T06-47-53.119042.json new file mode 100644 index 0000000000000000000000000000000000000000..7eb4accacb955514562fcaeaa4716bcb04f779d4 --- /dev/null +++ b/eval-results/aisquared/dlite-v2-774m/results_2023-10-13T06-47-53.119042.json @@ -0,0 +1,107 @@ +{ + "config_general": { + "model_name": "aisquared/dlite-v2-774m", + "model_sha": "0ea894a33e491912cd1a65dde47b4af03f03c4f2", + "model_size": "1.45 GB", + "model_dtype": "torch.float16", + "lighteval_sha": "0f318ecf002208468154899217b3ba7c6ae09374", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|drop|3": { + "em": 0.009437919463087249, + "em_stderr": 0.0009901902239103345, + "f1": 0.059256501677852416, + 
"f1_stderr": 0.0015878342558663697 + }, + "harness|gsm8k|5": { + "acc": 0.0, + "acc_stderr": 0.0 + }, + "harness|winogrande|5": { + "acc": 0.5398579321231255, + "acc_stderr": 0.014007765428365166 + }, + "all": { + "em": 0.009437919463087249, + "em_stderr": 0.0009901902239103345, + "f1": 0.059256501677852416, + "f1_stderr": 0.0015878342558663697, + "acc": 0.26992896606156275, + "acc_stderr": 0.007003882714182583 + } + }, + "versions": { + "harness|drop|3": 1, + "harness|gsm8k|5": 0, + "harness|winogrande|5": 0, + "all": 0 + }, + "config_tasks": { + "harness|drop": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|drop|3": { + "hashes": { + "hash_examples": "1d27416e8324e9a3", + "hash_full_prompts": "a5513ff9a741b385", + "hash_input_tokens": "fa7a2e45b0104bc4", + "hash_cont_tokens": "3fb10095a790c0d1" + }, + "truncated": 9290, + "non-truncated": 246, + "padded": 0, + "non-padded": 9536, + "effective_few_shots": 3.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "52733972d41ebb11", + "hash_cont_tokens": "c0abc2f418859d6e" + }, + "truncated": 917, + "non-truncated": 402, + "padded": 0, + "non-padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "84cacac1590bb0a5", + "hash_cont_tokens": "64ca3ed9b5dacc6e" + }, + "truncated": 0, + "non-truncated": 2534, + "padded": 2426, + "non-padded": 108, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "9b4d8993161e637d", + "hash_full_prompts": "08215e527b7e60a5", + "hash_input_tokens": "6b3de69e2f87348c", + "hash_cont_tokens": "060ecaf39fc76c1a" + }, + "total_evaluation_time_secondes": "14674.07238817215", + "truncated": 10207, + "non-truncated": 3182, + "padded": 2426, + "non-padded": 10963, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/akjindal53244/Mistral-7B-v0.1-Open-Platypus/results_2023-10-09T12-52-41.880840.json b/eval-results/akjindal53244/Mistral-7B-v0.1-Open-Platypus/results_2023-10-09T12-52-41.880840.json new file mode 100644 index 0000000000000000000000000000000000000000..1549efe6194a34711e9d75d6807108aab3b63ea0 --- /dev/null +++ b/eval-results/akjindal53244/Mistral-7B-v0.1-Open-Platypus/results_2023-10-09T12-52-41.880840.json @@ -0,0 +1,1367 @@ +{ + "config_general": { + "model_name": "akjindal53244/Mistral-7B-v0.1-Open-Platypus", + "model_sha": "aa2c84e89c4c8a10e0569e45021b59e6d1c08bda", + "model_size": "13.99 GB", + "model_dtype": "torch.bfloat16", + "lighteval_sha": "0f318ecf002208468154899217b3ba7c6ae09374", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|arc:challenge|25": { + "acc": 0.5836177474402731, + "acc_stderr": 0.01440561827943617, + "acc_norm": 0.6237201365187713, + "acc_norm_stderr": 0.014157022555407158 + }, + "harness|hellaswag|10": { + "acc": 0.6590320653256323, + "acc_stderr": 0.004730658073041557, + "acc_norm": 0.8508265285799641, + "acc_norm_stderr": 0.003555312878052388 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.32, + "acc_stderr": 0.04688261722621502, + "acc_norm": 0.32, + "acc_norm_stderr": 
0.04688261722621502 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.562962962962963, + "acc_stderr": 0.04284958639753401, + "acc_norm": 0.562962962962963, + "acc_norm_stderr": 0.04284958639753401 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.6973684210526315, + "acc_stderr": 0.03738520676119669, + "acc_norm": 0.6973684210526315, + "acc_norm_stderr": 0.03738520676119669 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.58, + "acc_stderr": 0.049604496374885836, + "acc_norm": 0.58, + "acc_norm_stderr": 0.049604496374885836 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.6754716981132075, + "acc_stderr": 0.028815615713432108, + "acc_norm": 0.6754716981132075, + "acc_norm_stderr": 0.028815615713432108 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.7013888888888888, + "acc_stderr": 0.03827052357950756, + "acc_norm": 0.7013888888888888, + "acc_norm_stderr": 0.03827052357950756 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.42, + "acc_stderr": 0.049604496374885836, + "acc_norm": 0.42, + "acc_norm_stderr": 0.049604496374885836 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.54, + "acc_stderr": 0.05009082659620333, + "acc_norm": 0.54, + "acc_norm_stderr": 0.05009082659620333 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.35, + "acc_stderr": 0.04793724854411019, + "acc_norm": 0.35, + "acc_norm_stderr": 0.04793724854411019 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.6763005780346821, + "acc_stderr": 0.0356760379963917, + "acc_norm": 0.6763005780346821, + "acc_norm_stderr": 0.0356760379963917 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.4019607843137255, + "acc_stderr": 0.048786087144669955, + "acc_norm": 0.4019607843137255, + "acc_norm_stderr": 0.048786087144669955 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.75, + "acc_stderr": 0.04351941398892446, + "acc_norm": 0.75, + "acc_norm_stderr": 0.04351941398892446 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.5574468085106383, + "acc_stderr": 0.03246956919789958, + "acc_norm": 0.5574468085106383, + "acc_norm_stderr": 0.03246956919789958 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.4473684210526316, + "acc_stderr": 0.04677473004491199, + "acc_norm": 0.4473684210526316, + "acc_norm_stderr": 0.04677473004491199 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.5448275862068965, + "acc_stderr": 0.04149886942192117, + "acc_norm": 0.5448275862068965, + "acc_norm_stderr": 0.04149886942192117 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.40476190476190477, + "acc_stderr": 0.025279850397404904, + "acc_norm": 0.40476190476190477, + "acc_norm_stderr": 0.025279850397404904 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.49206349206349204, + "acc_stderr": 0.044715725362943486, + "acc_norm": 0.49206349206349204, + "acc_norm_stderr": 0.044715725362943486 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.35, + "acc_stderr": 0.047937248544110196, + "acc_norm": 0.35, + "acc_norm_stderr": 0.047937248544110196 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.7225806451612903, + "acc_stderr": 0.025470196835900055, + "acc_norm": 0.7225806451612903, + "acc_norm_stderr": 0.025470196835900055 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.5024630541871922, + "acc_stderr": 0.03517945038691063, + "acc_norm": 0.5024630541871922, + "acc_norm_stderr": 
0.03517945038691063 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.69, + "acc_stderr": 0.04648231987117316, + "acc_norm": 0.69, + "acc_norm_stderr": 0.04648231987117316 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.7575757575757576, + "acc_stderr": 0.03346409881055953, + "acc_norm": 0.7575757575757576, + "acc_norm_stderr": 0.03346409881055953 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.7525252525252525, + "acc_stderr": 0.030746300742124498, + "acc_norm": 0.7525252525252525, + "acc_norm_stderr": 0.030746300742124498 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.8963730569948186, + "acc_stderr": 0.02199531196364424, + "acc_norm": 0.8963730569948186, + "acc_norm_stderr": 0.02199531196364424 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.617948717948718, + "acc_stderr": 0.024635549163908234, + "acc_norm": 0.617948717948718, + "acc_norm_stderr": 0.024635549163908234 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.36666666666666664, + "acc_stderr": 0.02938162072646507, + "acc_norm": 0.36666666666666664, + "acc_norm_stderr": 0.02938162072646507 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.6596638655462185, + "acc_stderr": 0.030778057422931673, + "acc_norm": 0.6596638655462185, + "acc_norm_stderr": 0.030778057422931673 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.3443708609271523, + "acc_stderr": 0.038796870240733264, + "acc_norm": 0.3443708609271523, + "acc_norm_stderr": 0.038796870240733264 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.8146788990825689, + "acc_stderr": 0.016659279700295838, + "acc_norm": 0.8146788990825689, + "acc_norm_stderr": 0.016659279700295838 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.4861111111111111, + "acc_stderr": 0.03408655867977748, + "acc_norm": 0.4861111111111111, + "acc_norm_stderr": 0.03408655867977748 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.7892156862745098, + "acc_stderr": 0.02862654791243741, + "acc_norm": 0.7892156862745098, + "acc_norm_stderr": 0.02862654791243741 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.7890295358649789, + "acc_stderr": 0.02655837250266192, + "acc_norm": 0.7890295358649789, + "acc_norm_stderr": 0.02655837250266192 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.726457399103139, + "acc_stderr": 0.029918586707798827, + "acc_norm": 0.726457399103139, + "acc_norm_stderr": 0.029918586707798827 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.7709923664122137, + "acc_stderr": 0.036853466317118506, + "acc_norm": 0.7709923664122137, + "acc_norm_stderr": 0.036853466317118506 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.8016528925619835, + "acc_stderr": 0.036401182719909456, + "acc_norm": 0.8016528925619835, + "acc_norm_stderr": 0.036401182719909456 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.8055555555555556, + "acc_stderr": 0.03826076324884863, + "acc_norm": 0.8055555555555556, + "acc_norm_stderr": 0.03826076324884863 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.7852760736196319, + "acc_stderr": 0.032262193772867744, + "acc_norm": 0.7852760736196319, + "acc_norm_stderr": 0.032262193772867744 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.5, + "acc_stderr": 0.04745789978762494, + "acc_norm": 0.5, + "acc_norm_stderr": 
0.04745789978762494 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.8252427184466019, + "acc_stderr": 0.037601780060266196, + "acc_norm": 0.8252427184466019, + "acc_norm_stderr": 0.037601780060266196 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.8888888888888888, + "acc_stderr": 0.020588491316092368, + "acc_norm": 0.8888888888888888, + "acc_norm_stderr": 0.020588491316092368 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.7, + "acc_stderr": 0.046056618647183814, + "acc_norm": 0.7, + "acc_norm_stderr": 0.046056618647183814 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.8071519795657727, + "acc_stderr": 0.014108533515757431, + "acc_norm": 0.8071519795657727, + "acc_norm_stderr": 0.014108533515757431 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.7167630057803468, + "acc_stderr": 0.024257901705323378, + "acc_norm": 0.7167630057803468, + "acc_norm_stderr": 0.024257901705323378 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.35307262569832404, + "acc_stderr": 0.015984204545268565, + "acc_norm": 0.35307262569832404, + "acc_norm_stderr": 0.015984204545268565 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.7450980392156863, + "acc_stderr": 0.02495418432487991, + "acc_norm": 0.7450980392156863, + "acc_norm_stderr": 0.02495418432487991 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.7138263665594855, + "acc_stderr": 0.025670259242188943, + "acc_norm": 0.7138263665594855, + "acc_norm_stderr": 0.025670259242188943 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.7561728395061729, + "acc_stderr": 0.023891879541959607, + "acc_norm": 0.7561728395061729, + "acc_norm_stderr": 0.023891879541959607 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.4858156028368794, + "acc_stderr": 0.02981549448368206, + "acc_norm": 0.4858156028368794, + "acc_norm_stderr": 0.02981549448368206 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.49869621903520206, + "acc_stderr": 0.012770192691057116, + "acc_norm": 0.49869621903520206, + "acc_norm_stderr": 0.012770192691057116 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.6838235294117647, + "acc_stderr": 0.028245687391462923, + "acc_norm": 0.6838235294117647, + "acc_norm_stderr": 0.028245687391462923 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.6862745098039216, + "acc_stderr": 0.018771683893528176, + "acc_norm": 0.6862745098039216, + "acc_norm_stderr": 0.018771683893528176 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.6818181818181818, + "acc_stderr": 0.04461272175910509, + "acc_norm": 0.6818181818181818, + "acc_norm_stderr": 0.04461272175910509 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.7061224489795919, + "acc_stderr": 0.02916273841024977, + "acc_norm": 0.7061224489795919, + "acc_norm_stderr": 0.02916273841024977 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.835820895522388, + "acc_stderr": 0.026193923544454115, + "acc_norm": 0.835820895522388, + "acc_norm_stderr": 0.026193923544454115 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.87, + "acc_stderr": 0.033799766898963086, + "acc_norm": 0.87, + "acc_norm_stderr": 0.033799766898963086 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.5481927710843374, + "acc_stderr": 0.03874371556587953, + "acc_norm": 0.5481927710843374, + "acc_norm_stderr": 0.03874371556587953 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.8070175438596491, + "acc_stderr": 
0.030267457554898458, + "acc_norm": 0.8070175438596491, + "acc_norm_stderr": 0.030267457554898458 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.32558139534883723, + "mc1_stderr": 0.01640398946990783, + "mc2": 0.4732846266548936, + "mc2_stderr": 0.015063591761555577 + }, + "all": { + "acc": 0.6373475749467716, + "acc_stderr": 0.03299569980721215, + "acc_norm": 0.6412780300710569, + "acc_norm_stderr": 0.03297156521536439, + "mc1": 0.32558139534883723, + "mc1_stderr": 0.01640398946990783, + "mc2": 0.4732846266548936, + "mc2_stderr": 0.015063591761555577 + } + }, + "versions": { + "harness|arc:challenge|25": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "all": 0 + 
}, + "config_tasks": { + "harness|arc:challenge": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + 
"harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task" + }, + "summary_tasks": { + "harness|arc:challenge|25": { + "hashes": { + "hash_examples": "17b0cae357c0259e", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "e43adcaa871b1364", + "hash_cont_tokens": "289aa98c400841d8" + }, + "truncated": 0, + "non-truncated": 4687, + "padded": 4684, + "non-padded": 3, + "effective_few_shots": 25.0, + "num_truncated_few_shots": 0 + }, + "harness|hellaswag|10": { + "hashes": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "08da6b3d0798f3e5", + "hash_cont_tokens": "ac460260c3e6efc9" + }, + "truncated": 0, + "non-truncated": 40168, + "padded": 40039, + "non-padded": 129, + "effective_few_shots": 10.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hashes": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "5e2b26eb9b4d08bf", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-anatomy|5": { + "hashes": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "d33cda9df28030eb", + "hash_cont_tokens": "a52a4f60d98cbe5c" + }, + "truncated": 0, + "non-truncated": 540, + "padded": 540, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-astronomy|5": { + "hashes": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "0dd50c500d64c57d", + "hash_cont_tokens": "10f7d8eeba97841d" + }, + "truncated": 0, + "non-truncated": 608, + "padded": 608, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-business_ethics|5": { + "hashes": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "40b524d0df3defc2", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hashes": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "1f87d12d677e0dfd", + "hash_cont_tokens": "edef9975ba9165b5" + }, + "truncated": 0, + "non-truncated": 1060, + "padded": 1056, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_biology|5": { + "hashes": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "dd6d69d8b13afbeb", + "hash_cont_tokens": "0aa103ec6602280b" + }, + "truncated": 0, + "non-truncated": 576, + "padded": 572, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_chemistry|5": { + "hashes": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "d45f3c401a00e97e", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non-truncated": 
400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_computer_science|5": { + "hashes": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "c04f21d954ae67b2", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_mathematics|5": { + "hashes": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "e7de03b4e1a407d8", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_medicine|5": { + "hashes": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "9ce9516475f0b09c", + "hash_cont_tokens": "1979021dbc698754" + }, + "truncated": 0, + "non-truncated": 692, + "padded": 684, + "non-padded": 8, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_physics|5": { + "hashes": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "f749592a0d6c967d", + "hash_cont_tokens": "7cf7fe2bab00acbd" + }, + "truncated": 0, + "non-truncated": 408, + "padded": 404, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-computer_security|5": { + "hashes": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "1a6dccf2066f3598", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hashes": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "6ce98c8aec8e7514", + "hash_cont_tokens": "903f64eed2b0d217" + }, + "truncated": 0, + "non-truncated": 940, + "padded": 940, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-econometrics|5": { + "hashes": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "7794b03bf6b9bb11", + "hash_cont_tokens": "721ae6c5302c4bf2" + }, + "truncated": 0, + "non-truncated": 456, + "padded": 456, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hashes": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "e47ff85e05850517", + "hash_cont_tokens": "15a738960ed3e587" + }, + "truncated": 0, + "non-truncated": 580, + "padded": 580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hashes": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "2ce6901704311790", + "hash_cont_tokens": "c96470462fc71683" + }, + "truncated": 0, + "non-truncated": 1512, + "padded": 1512, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + 
"harness|hendrycksTest-formal_logic|5": { + "hashes": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "fa49c3faa72a3955", + "hash_cont_tokens": "0e1ce025c9d6ee7e" + }, + "truncated": 0, + "non-truncated": 504, + "padded": 504, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-global_facts|5": { + "hashes": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "38992a391c7040d5", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 396, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_biology|5": { + "hashes": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "4944fad6e0578120", + "hash_cont_tokens": "e34d57f7d3c4ca16" + }, + "truncated": 0, + "non-truncated": 1240, + "padded": 1240, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hashes": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "bec955dfccee0331", + "hash_cont_tokens": "e8482d44df4b3740" + }, + "truncated": 0, + "non-truncated": 812, + "padded": 796, + "non-padded": 16, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hashes": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "2ccfe020e0a8e824", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hashes": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "5e5e8bf3808e0ead", + "hash_cont_tokens": "d63e679a49418339" + }, + "truncated": 0, + "non-truncated": 660, + "padded": 656, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_geography|5": { + "hashes": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "6a624d76e1b40f9d", + "hash_cont_tokens": "d78483e286d06f1a" + }, + "truncated": 0, + "non-truncated": 792, + "padded": 792, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hashes": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "8340aed0285230f4", + "hash_cont_tokens": "691cdff71ff5fe57" + }, + "truncated": 0, + "non-truncated": 772, + "padded": 772, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hashes": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "ca47137b1f3a769c", + "hash_cont_tokens": "d5ad4c5bdca967ad" + }, + "truncated": 0, + "non-truncated": 1560, + "padded": 1560, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hashes": { + "hash_examples": 
"1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "c9d341ab62890f30", + "hash_cont_tokens": "8f631ca5687dd0d4" + }, + "truncated": 0, + "non-truncated": 1080, + "padded": 1080, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hashes": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "62573d06618ae7df", + "hash_cont_tokens": "7321048a28451473" + }, + "truncated": 0, + "non-truncated": 952, + "padded": 952, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_physics|5": { + "hashes": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "ddddcaae96263221", + "hash_cont_tokens": "bb137581f269861c" + }, + "truncated": 0, + "non-truncated": 604, + "padded": 604, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hashes": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "ef9c1ae343139fdd", + "hash_cont_tokens": "b455cab2675bd863" + }, + "truncated": 0, + "non-truncated": 2180, + "padded": 2161, + "non-padded": 19, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hashes": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "eb4abd87b0e863cc", + "hash_cont_tokens": "1b3196fec7e58037" + }, + "truncated": 0, + "non-truncated": 864, + "padded": 864, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hashes": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "63548c7fa9ba7a78", + "hash_cont_tokens": "a331dedc2aa01b3e" + }, + "truncated": 0, + "non-truncated": 816, + "padded": 816, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hashes": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "83c5da18bfa50812", + "hash_cont_tokens": "d0fbe030b8c8c2bf" + }, + "truncated": 0, + "non-truncated": 948, + "padded": 948, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_aging|5": { + "hashes": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "c93c778cb8c58a32", + "hash_cont_tokens": "1dd29c3755494850" + }, + "truncated": 0, + "non-truncated": 892, + "padded": 892, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_sexuality|5": { + "hashes": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "1daed91f54b42f7d", + "hash_cont_tokens": "c85573f663c10691" + }, + "truncated": 0, + "non-truncated": 524, + "padded": 524, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-international_law|5": { + "hashes": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "cfdae69f75ee8670", + 
"hash_cont_tokens": "d263804ba918154f" + }, + "truncated": 0, + "non-truncated": 484, + "padded": 484, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-jurisprudence|5": { + "hashes": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "173979adbb5ab44e", + "hash_cont_tokens": "581986691a84ece8" + }, + "truncated": 0, + "non-truncated": 432, + "padded": 432, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hashes": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "7b7d06271aff55ff", + "hash_cont_tokens": "55a858b28bbda458" + }, + "truncated": 0, + "non-truncated": 652, + "padded": 652, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-machine_learning|5": { + "hashes": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "ca062cfd7c7fddcb", + "hash_cont_tokens": "e99d3d3efd4ac7a3" + }, + "truncated": 0, + "non-truncated": 448, + "padded": 445, + "non-padded": 3, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-management|5": { + "hashes": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "fc47171ffb714da3", + "hash_cont_tokens": "13d9dc56bca34726" + }, + "truncated": 0, + "non-truncated": 412, + "padded": 412, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-marketing|5": { + "hashes": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "aa29e9d883670c8f", + "hash_cont_tokens": "2700ea26933916a2" + }, + "truncated": 0, + "non-truncated": 936, + "padded": 936, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-medical_genetics|5": { + "hashes": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "88ad044b653ecaa5", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-miscellaneous|5": { + "hashes": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "f9e7e01573277484", + "hash_cont_tokens": "7bf4341c79587250" + }, + "truncated": 0, + "non-truncated": 3132, + "padded": 3132, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_disputes|5": { + "hashes": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "03728b9e48594c28", + "hash_cont_tokens": "38a48e9de6976f00" + }, + "truncated": 0, + "non-truncated": 1384, + "padded": 1360, + "non-padded": 24, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hashes": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "04a903966514d177", + "hash_cont_tokens": "761c4dc187689d89" + }, + "truncated": 0, + "non-truncated": 3580, + "padded": 3580, + "non-padded": 0, + "effective_few_shots": 5.0, + 
"num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-nutrition|5": { + "hashes": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "a2176d3ac6f01cf0", + "hash_cont_tokens": "65005bd7d6f6012a" + }, + "truncated": 0, + "non-truncated": 1224, + "padded": 1224, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-philosophy|5": { + "hashes": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "a96dc872948245a8", + "hash_cont_tokens": "0b47934fb6314dec" + }, + "truncated": 0, + "non-truncated": 1244, + "padded": 1244, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-prehistory|5": { + "hashes": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "e0b03637947e9efa", + "hash_cont_tokens": "3f20acd855ee0a29" + }, + "truncated": 0, + "non-truncated": 1296, + "padded": 1296, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_accounting|5": { + "hashes": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "0b4c6d0e49c47ab4", + "hash_cont_tokens": "8f122ba881355d4b" + }, + "truncated": 0, + "non-truncated": 1128, + "padded": 1128, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_law|5": { + "hashes": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "bcbdbbde22ec73e3", + "hash_cont_tokens": "90d5df417c4d3fd3" + }, + "truncated": 0, + "non-truncated": 6136, + "padded": 6136, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_medicine|5": { + "hashes": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "c54d753563114d45", + "hash_cont_tokens": "4a2d2988884f7f70" + }, + "truncated": 0, + "non-truncated": 1088, + "padded": 1088, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_psychology|5": { + "hashes": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "9e6e34f48034edc0", + "hash_cont_tokens": "e0a952cb8a9c81de" + }, + "truncated": 0, + "non-truncated": 2448, + "padded": 2448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-public_relations|5": { + "hashes": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "634feb3f97d1064d", + "hash_cont_tokens": "1fa77a8dff3922b8" + }, + "truncated": 0, + "non-truncated": 440, + "padded": 440, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-security_studies|5": { + "hashes": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "ca8497342e5b1d57", + "hash_cont_tokens": "81fc9cb3cbdd52db" + }, + "truncated": 0, + "non-truncated": 980, + "padded": 980, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-sociology|5": { + "hashes": { + "hash_examples": "e7959df87dea8672", + 
"hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "ae361375c940a0fb", + "hash_cont_tokens": "2a0493252ed2cf43" + }, + "truncated": 0, + "non-truncated": 804, + "padded": 800, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hashes": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "e8bdf33cf82d89f5", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-virology|5": { + "hashes": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "32ce831e0ba2d2e2", + "hash_cont_tokens": "5ab892d003b00c98" + }, + "truncated": 0, + "non-truncated": 664, + "padded": 664, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-world_religions|5": { + "hashes": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "4ed9b68c5694211b", + "hash_cont_tokens": "15a5e5dbdfbb8568" + }, + "truncated": 0, + "non-truncated": 684, + "padded": 684, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|truthfulqa:mc|0": { + "hashes": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "a30fbd9af05d717a", + "hash_cont_tokens": "5a8d4bb398b1c3c0" + }, + "truncated": 0, + "non-truncated": 9996, + "padded": 9996, + "non-padded": 0, + "effective_few_shots": 0.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "d84d18e9a963753d", + "hash_full_prompts": "12b540783521a8e6", + "hash_input_tokens": "3d86ffeb7677bd9d", + "hash_cont_tokens": "35527140510ee91a" + }, + "total_evaluation_time_secondes": "4354.185311555862", + "truncated": 0, + "non-truncated": 111019, + "padded": 110793, + "non-padded": 226, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/akjindal53244/Mistral-7B-v0.1-Open-Platypus/results_2023-10-25T03-30-37.870273.json b/eval-results/akjindal53244/Mistral-7B-v0.1-Open-Platypus/results_2023-10-25T03-30-37.870273.json new file mode 100644 index 0000000000000000000000000000000000000000..134be7424cee95e0909eefc58983d27354286d0a --- /dev/null +++ b/eval-results/akjindal53244/Mistral-7B-v0.1-Open-Platypus/results_2023-10-25T03-30-37.870273.json @@ -0,0 +1,107 @@ +{ + "config_general": { + "model_name": "akjindal53244/Mistral-7B-v0.1-Open-Platypus", + "model_sha": "9d6b19937537e279cf53be0504daa0311d2c7938", + "model_size": "13.99 GB", + "model_dtype": "torch.bfloat16", + "lighteval_sha": "0f318ecf002208468154899217b3ba7c6ae09374", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|drop|3": { + "em": 0.16128355704697986, + "em_stderr": 0.0037665373341562473, + "f1": 0.21934249161073788, + "f1_stderr": 0.003766121643482467 + }, + "harness|gsm8k|5": { + "acc": 0.1728582259287339, + "acc_stderr": 0.010415432246200586 + }, + "harness|winogrande|5": { + "acc": 0.77663772691397, + "acc_stderr": 0.011705697565205201 + }, + "all": { + "em": 0.16128355704697986, + "em_stderr": 0.0037665373341562473, + "f1": 0.21934249161073788, + "f1_stderr": 0.003766121643482467, + "acc": 
0.47474797642135197, + "acc_stderr": 0.011060564905702893 + } + }, + "versions": { + "harness|drop|3": 1, + "harness|gsm8k|5": 0, + "harness|winogrande|5": 0, + "all": 0 + }, + "config_tasks": { + "harness|drop": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|drop|3": { + "hashes": { + "hash_examples": "1d27416e8324e9a3", + "hash_full_prompts": "a5513ff9a741b385", + "hash_input_tokens": "a4fb946366902edf", + "hash_cont_tokens": "d584766268609113" + }, + "truncated": 0, + "non-truncated": 9536, + "padded": 0, + "non-padded": 9536, + "effective_few_shots": 3.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "6af0ae8cfe684f50", + "hash_cont_tokens": "cae4e21c0491d6ff" + }, + "truncated": 0, + "non-truncated": 1319, + "padded": 0, + "non-padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "6bf335f26fed6442", + "hash_cont_tokens": "618558fb93c0f288" + }, + "truncated": 0, + "non-truncated": 2534, + "padded": 2397, + "non-padded": 137, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "9b4d8993161e637d", + "hash_full_prompts": "08215e527b7e60a5", + "hash_input_tokens": "7c6dc027ca91c1a8", + "hash_cont_tokens": "56889019cc5e7782" + }, + "total_evaluation_time_secondes": "9850.997980594635", + "truncated": 0, + "non-truncated": 13389, + "padded": 2397, + "non-padded": 10992, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/aloobun/open-llama-3b-v2-elmv3/results_2023-12-09T17-18-30.999840.json b/eval-results/aloobun/open-llama-3b-v2-elmv3/results_2023-12-09T17-18-30.999840.json new file mode 100644 index 0000000000000000000000000000000000000000..97bd93dc96d1427a9e138da218531b79a3e7cd03 --- /dev/null +++ b/eval-results/aloobun/open-llama-3b-v2-elmv3/results_2023-12-09T17-18-30.999840.json @@ -0,0 +1,1409 @@ +{ + "config_general": { + "lighteval_sha": "0e4607eff593f6f842aeaa0e5fa6760f58b9d1e9", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "", + "start_time": 592582.330866085, + "end_time": 597179.097031246, + "total_evaluation_time_secondes": "4596.76616516104", + "model_name": "aloobun/open-llama-3b-v2-elmv3", + "model_sha": "7e43b199ff51dc0e63934ba49758a8a31ff855de", + "model_dtype": "torch.float16", + "model_size": "6.4 GB" + }, + "results": { + "harness|arc:challenge|25": { + "acc": 0.40102389078498296, + "acc_stderr": 0.014322255790719867, + "acc_norm": 0.4206484641638225, + "acc_norm_stderr": 0.0144262112525084 + }, + "harness|hellaswag|10": { + "acc": 0.5510854411471818, + "acc_stderr": 0.004963669199433381, + "acc_norm": 0.7328221469826728, + "acc_norm_stderr": 0.0044158166963030685 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.24, + "acc_stderr": 0.04292346959909283, + "acc_norm": 0.24, + "acc_norm_stderr": 0.04292346959909283 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.2962962962962963, + "acc_stderr": 0.03944624162501116, + "acc_norm": 0.2962962962962963, + "acc_norm_stderr": 0.03944624162501116 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.3092105263157895, + "acc_stderr": 
0.037610708698674805, + "acc_norm": 0.3092105263157895, + "acc_norm_stderr": 0.037610708698674805 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.26, + "acc_stderr": 0.04408440022768079, + "acc_norm": 0.26, + "acc_norm_stderr": 0.04408440022768079 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.2943396226415094, + "acc_stderr": 0.028049186315695248, + "acc_norm": 0.2943396226415094, + "acc_norm_stderr": 0.028049186315695248 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.2569444444444444, + "acc_stderr": 0.03653946969442099, + "acc_norm": 0.2569444444444444, + "acc_norm_stderr": 0.03653946969442099 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.26, + "acc_stderr": 0.0440844002276808, + "acc_norm": 0.26, + "acc_norm_stderr": 0.0440844002276808 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.29, + "acc_stderr": 0.045604802157206845, + "acc_norm": 0.29, + "acc_norm_stderr": 0.045604802157206845 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.32, + "acc_stderr": 0.04688261722621504, + "acc_norm": 0.32, + "acc_norm_stderr": 0.04688261722621504 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.2543352601156069, + "acc_stderr": 0.0332055644308557, + "acc_norm": 0.2543352601156069, + "acc_norm_stderr": 0.0332055644308557 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.18627450980392157, + "acc_stderr": 0.03873958714149351, + "acc_norm": 0.18627450980392157, + "acc_norm_stderr": 0.03873958714149351 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.24, + "acc_stderr": 0.042923469599092816, + "acc_norm": 0.24, + "acc_norm_stderr": 0.042923469599092816 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.32340425531914896, + "acc_stderr": 0.030579442773610334, + "acc_norm": 0.32340425531914896, + "acc_norm_stderr": 0.030579442773610334 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.2543859649122807, + "acc_stderr": 0.04096985139843673, + "acc_norm": 0.2543859649122807, + "acc_norm_stderr": 0.04096985139843673 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.2206896551724138, + "acc_stderr": 0.03455930201924812, + "acc_norm": 0.2206896551724138, + "acc_norm_stderr": 0.03455930201924812 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.2777777777777778, + "acc_stderr": 0.02306818884826111, + "acc_norm": 0.2777777777777778, + "acc_norm_stderr": 0.02306818884826111 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.20634920634920634, + "acc_stderr": 0.03619604524124252, + "acc_norm": 0.20634920634920634, + "acc_norm_stderr": 0.03619604524124252 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.31, + "acc_stderr": 0.04648231987117316, + "acc_norm": 0.31, + "acc_norm_stderr": 0.04648231987117316 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.2645161290322581, + "acc_stderr": 0.02509189237885928, + "acc_norm": 0.2645161290322581, + "acc_norm_stderr": 0.02509189237885928 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.2857142857142857, + "acc_stderr": 0.03178529710642749, + "acc_norm": 0.2857142857142857, + "acc_norm_stderr": 0.03178529710642749 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.28, + "acc_stderr": 0.04512608598542129, + "acc_norm": 0.28, + "acc_norm_stderr": 0.04512608598542129 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.2909090909090909, + "acc_stderr": 
0.03546563019624337, + "acc_norm": 0.2909090909090909, + "acc_norm_stderr": 0.03546563019624337 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.3282828282828283, + "acc_stderr": 0.03345678422756777, + "acc_norm": 0.3282828282828283, + "acc_norm_stderr": 0.03345678422756777 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.23316062176165803, + "acc_stderr": 0.03051611137147601, + "acc_norm": 0.23316062176165803, + "acc_norm_stderr": 0.03051611137147601 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.28205128205128205, + "acc_stderr": 0.0228158130988966, + "acc_norm": 0.28205128205128205, + "acc_norm_stderr": 0.0228158130988966 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.25925925925925924, + "acc_stderr": 0.026719240783712163, + "acc_norm": 0.25925925925925924, + "acc_norm_stderr": 0.026719240783712163 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.2773109243697479, + "acc_stderr": 0.02907937453948001, + "acc_norm": 0.2773109243697479, + "acc_norm_stderr": 0.02907937453948001 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.31788079470198677, + "acc_stderr": 0.03802039760107903, + "acc_norm": 0.31788079470198677, + "acc_norm_stderr": 0.03802039760107903 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.25688073394495414, + "acc_stderr": 0.018732492928342462, + "acc_norm": 0.25688073394495414, + "acc_norm_stderr": 0.018732492928342462 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.18055555555555555, + "acc_stderr": 0.026232878971491652, + "acc_norm": 0.18055555555555555, + "acc_norm_stderr": 0.026232878971491652 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.23039215686274508, + "acc_stderr": 0.029554292605695066, + "acc_norm": 0.23039215686274508, + "acc_norm_stderr": 0.029554292605695066 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.25316455696202533, + "acc_stderr": 0.0283046579430353, + "acc_norm": 0.25316455696202533, + "acc_norm_stderr": 0.0283046579430353 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.37668161434977576, + "acc_stderr": 0.03252113489929188, + "acc_norm": 0.37668161434977576, + "acc_norm_stderr": 0.03252113489929188 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.24427480916030533, + "acc_stderr": 0.03768335959728745, + "acc_norm": 0.24427480916030533, + "acc_norm_stderr": 0.03768335959728745 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.371900826446281, + "acc_stderr": 0.044120158066245044, + "acc_norm": 0.371900826446281, + "acc_norm_stderr": 0.044120158066245044 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.3055555555555556, + "acc_stderr": 0.044531975073749834, + "acc_norm": 0.3055555555555556, + "acc_norm_stderr": 0.044531975073749834 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.26380368098159507, + "acc_stderr": 0.034624199316156234, + "acc_norm": 0.26380368098159507, + "acc_norm_stderr": 0.034624199316156234 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.25892857142857145, + "acc_stderr": 0.041577515398656284, + "acc_norm": 0.25892857142857145, + "acc_norm_stderr": 0.041577515398656284 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.3106796116504854, + "acc_stderr": 0.04582124160161552, + "acc_norm": 0.3106796116504854, + "acc_norm_stderr": 0.04582124160161552 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 
0.2606837606837607, + "acc_stderr": 0.028760348956523414, + "acc_norm": 0.2606837606837607, + "acc_norm_stderr": 0.028760348956523414 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.26, + "acc_stderr": 0.04408440022768078, + "acc_norm": 0.26, + "acc_norm_stderr": 0.04408440022768078 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.29118773946360155, + "acc_stderr": 0.016246087069701393, + "acc_norm": 0.29118773946360155, + "acc_norm_stderr": 0.016246087069701393 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.2543352601156069, + "acc_stderr": 0.023445826276545536, + "acc_norm": 0.2543352601156069, + "acc_norm_stderr": 0.023445826276545536 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.2424581005586592, + "acc_stderr": 0.014333522059217889, + "acc_norm": 0.2424581005586592, + "acc_norm_stderr": 0.014333522059217889 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.28431372549019607, + "acc_stderr": 0.025829163272757468, + "acc_norm": 0.28431372549019607, + "acc_norm_stderr": 0.025829163272757468 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.27009646302250806, + "acc_stderr": 0.025218040373410626, + "acc_norm": 0.27009646302250806, + "acc_norm_stderr": 0.025218040373410626 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.28703703703703703, + "acc_stderr": 0.025171041915309684, + "acc_norm": 0.28703703703703703, + "acc_norm_stderr": 0.025171041915309684 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.2624113475177305, + "acc_stderr": 0.026244920349843017, + "acc_norm": 0.2624113475177305, + "acc_norm_stderr": 0.026244920349843017 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.24445893089960888, + "acc_stderr": 0.010976425013113897, + "acc_norm": 0.24445893089960888, + "acc_norm_stderr": 0.010976425013113897 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.2536764705882353, + "acc_stderr": 0.026431329870789538, + "acc_norm": 0.2536764705882353, + "acc_norm_stderr": 0.026431329870789538 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.25980392156862747, + "acc_stderr": 0.017740899509177795, + "acc_norm": 0.25980392156862747, + "acc_norm_stderr": 0.017740899509177795 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.32727272727272727, + "acc_stderr": 0.04494290866252089, + "acc_norm": 0.32727272727272727, + "acc_norm_stderr": 0.04494290866252089 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.3469387755102041, + "acc_stderr": 0.030472526026726496, + "acc_norm": 0.3469387755102041, + "acc_norm_stderr": 0.030472526026726496 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.23383084577114427, + "acc_stderr": 0.029929415408348398, + "acc_norm": 0.23383084577114427, + "acc_norm_stderr": 0.029929415408348398 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.3, + "acc_stderr": 0.046056618647183814, + "acc_norm": 0.3, + "acc_norm_stderr": 0.046056618647183814 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.3253012048192771, + "acc_stderr": 0.03647168523683227, + "acc_norm": 0.3253012048192771, + "acc_norm_stderr": 0.03647168523683227 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.36257309941520466, + "acc_stderr": 0.036871306155620606, + "acc_norm": 0.36257309941520466, + "acc_norm_stderr": 0.036871306155620606 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.23011015911872704, + "mc1_stderr": 0.014734557959807765, + "mc2": 0.3553887503394241, + "mc2_stderr": 
0.013650877161228007 + }, + "harness|winogrande|5": { + "acc": 0.6495659037095501, + "acc_stderr": 0.013409047676670184 + }, + "harness|gsm8k|5": { + "acc": 0.03411675511751327, + "acc_stderr": 0.0050002126007732675 + }, + "all": { + "acc": 0.2848210132560984, + "acc_stderr": 0.03181395493581512, + "acc_norm": 0.286301024220496, + "acc_norm_stderr": 0.03257284904688029, + "mc1": 0.23011015911872704, + "mc1_stderr": 0.014734557959807765, + "mc2": 0.3553887503394241, + "mc2_stderr": 0.013650877161228007 + } + }, + "versions": { + "all": 0, + "harness|arc:challenge|25": 0, + "harness|gsm8k|5": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "harness|winogrande|5": 0 + 
}, + "config_tasks": { + "harness|arc:challenge": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": 
"LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|arc:challenge|25": { + "hashes": { + "hash_examples": "17b0cae357c0259e", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "59c328d432da064f", + "hash_cont_tokens": "2e8835aa03b9c2cf" + }, + "truncated": 0, + "non_truncated": 1172, + "padded": 4676, + "non_padded": 11, + "effective_few_shots": 25.0, + "num_truncated_few_shots": 0 + }, + "harness|hellaswag|10": { + "hashes": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "9eaa83dae54ba52a", + "hash_cont_tokens": "18a48de3edcef462" + }, + "truncated": 0, + "non_truncated": 10042, + "padded": 39987, + "non_padded": 181, + "effective_few_shots": 10.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hashes": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "4129e579fbf0ebc2", + "hash_cont_tokens": "ce26aac83e938006" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-anatomy|5": { + "hashes": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "85c455354ae2ebd0", + "hash_cont_tokens": "1d81fa80e3039a08" + }, + "truncated": 0, + "non_truncated": 135, + "padded": 540, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-astronomy|5": { + "hashes": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "221506ab8405000a", + "hash_cont_tokens": "247dc44c6b578728" + }, + "truncated": 0, + "non_truncated": 152, + "padded": 608, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-business_ethics|5": { + "hashes": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "16c21dd1ddd4ee38", + "hash_cont_tokens": "ce26aac83e938006" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hashes": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "24b21e9d78658e4d", + "hash_cont_tokens": "26e3b69d5fb27bb2" + }, + "truncated": 0, + "non_truncated": 265, + "padded": 1060, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_biology|5": { + "hashes": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "770d74c6a8c9c0b7", + "hash_cont_tokens": "bbda31842f3930d5" + }, + "truncated": 0, + "non_truncated": 144, + "padded": 568, + "non_padded": 8, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_chemistry|5": { + "hashes": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "7dea1631558d65ac", + 
"hash_cont_tokens": "ce26aac83e938006" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_computer_science|5": { + "hashes": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "22600976f0f9ffc6", + "hash_cont_tokens": "ce26aac83e938006" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_mathematics|5": { + "hashes": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "564ae334c5a56510", + "hash_cont_tokens": "ce26aac83e938006" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_medicine|5": { + "hashes": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "bce86eecdc3bb76a", + "hash_cont_tokens": "894854ed7bec57f7" + }, + "truncated": 0, + "non_truncated": 173, + "padded": 688, + "non_padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_physics|5": { + "hashes": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "1188d9d525ab28e7", + "hash_cont_tokens": "13130ec6de384bbb" + }, + "truncated": 0, + "non_truncated": 102, + "padded": 408, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-computer_security|5": { + "hashes": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "692856445804bec5", + "hash_cont_tokens": "ce26aac83e938006" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hashes": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "5ade2ffc8b9f5d4a", + "hash_cont_tokens": "29089b8b7020611e" + }, + "truncated": 0, + "non_truncated": 235, + "padded": 940, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-econometrics|5": { + "hashes": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "9b766b5e103ce426", + "hash_cont_tokens": "efc596dfa1a1f073" + }, + "truncated": 0, + "non_truncated": 114, + "padded": 456, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hashes": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "dd9935cf301e82f9", + "hash_cont_tokens": "70817a7ac9f44af2" + }, + "truncated": 0, + "non_truncated": 145, + "padded": 560, + "non_padded": 20, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hashes": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "78c8ba2ecf6e0dc2", + "hash_cont_tokens": "937cd53d06cc6e16" + }, + "truncated": 0, + "non_truncated": 378, + "padded": 1512, + "non_padded": 0, + 
"effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-formal_logic|5": { + "hashes": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "661893e4f7f37eba", + "hash_cont_tokens": "eec972abe0fc0f5a" + }, + "truncated": 0, + "non_truncated": 126, + "padded": 504, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-global_facts|5": { + "hashes": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "4a8d10395fdc21f0", + "hash_cont_tokens": "ce26aac83e938006" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_biology|5": { + "hashes": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "816c7d936dbe01da", + "hash_cont_tokens": "94971ccfe8e59c25" + }, + "truncated": 0, + "non_truncated": 310, + "padded": 1240, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hashes": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "769ab5386fedf26e", + "hash_cont_tokens": "a78e38b59778a04c" + }, + "truncated": 0, + "non_truncated": 203, + "padded": 812, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hashes": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "5b6bcda94f3ca2df", + "hash_cont_tokens": "ce26aac83e938006" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hashes": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "142b719c7d7d4fe0", + "hash_cont_tokens": "91dc522e4e4e91c3" + }, + "truncated": 660, + "non_truncated": -495, + "padded": 0, + "non_padded": 660, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_geography|5": { + "hashes": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "281dcc445ad0af4a", + "hash_cont_tokens": "f275c901b3d285f9" + }, + "truncated": 0, + "non_truncated": 198, + "padded": 792, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hashes": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "bb8f5852975ec963", + "hash_cont_tokens": "85eb58f423437cce" + }, + "truncated": 0, + "non_truncated": 193, + "padded": 770, + "non_padded": 2, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hashes": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "e769357a349b7644", + "hash_cont_tokens": "39a93706184f896b" + }, + "truncated": 0, + "non_truncated": 390, + "padded": 1560, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + 
"harness|hendrycksTest-high_school_mathematics|5": { + "hashes": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "4ab345e3c0507320", + "hash_cont_tokens": "d41065d20b689af3" + }, + "truncated": 0, + "non_truncated": 270, + "padded": 1080, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hashes": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "52ec665069da063e", + "hash_cont_tokens": "28c1f7c11bf85409" + }, + "truncated": 0, + "non_truncated": 238, + "padded": 952, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_physics|5": { + "hashes": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "f23b89453c7c6050", + "hash_cont_tokens": "78c510e6c5d316ac" + }, + "truncated": 0, + "non_truncated": 151, + "padded": 604, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hashes": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "bb0f46fa5669c46e", + "hash_cont_tokens": "0ba4ecffc67603c5" + }, + "truncated": 0, + "non_truncated": 545, + "padded": 2180, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hashes": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "db3276d6935c41ac", + "hash_cont_tokens": "4a0339e9ad3efa6d" + }, + "truncated": 0, + "non_truncated": 216, + "padded": 864, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hashes": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "a54112084a848a44", + "hash_cont_tokens": "2529d55ec490f81f" + }, + "truncated": 816, + "non_truncated": -612, + "padded": 0, + "non_padded": 816, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hashes": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "89cf33fb840f27be", + "hash_cont_tokens": "21808b54f5df97b2" + }, + "truncated": 0, + "non_truncated": 237, + "padded": 948, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_aging|5": { + "hashes": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "ecf9f32ac289d1be", + "hash_cont_tokens": "92acdd467ed943e1" + }, + "truncated": 0, + "non_truncated": 223, + "padded": 892, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_sexuality|5": { + "hashes": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "ebf05f3ed8d69562", + "hash_cont_tokens": "a6034ed95a124315" + }, + "truncated": 0, + "non_truncated": 131, + "padded": 524, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-international_law|5": { + "hashes": { + "hash_examples": "1300bfd0dfc59114", + 
"hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "b0d9e6f90b58599e", + "hash_cont_tokens": "223fbf3fd106c04b" + }, + "truncated": 0, + "non_truncated": 121, + "padded": 484, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-jurisprudence|5": { + "hashes": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "ddb8c4eaa3d71594", + "hash_cont_tokens": "7c8e30f486ff156a" + }, + "truncated": 0, + "non_truncated": 108, + "padded": 428, + "non_padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hashes": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "a04883884a711ebf", + "hash_cont_tokens": "b4cc4a8d31bbaa03" + }, + "truncated": 0, + "non_truncated": 163, + "padded": 636, + "non_padded": 16, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-machine_learning|5": { + "hashes": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "d5511967956880ea", + "hash_cont_tokens": "7f0e1289ec188e82" + }, + "truncated": 0, + "non_truncated": 112, + "padded": 448, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-management|5": { + "hashes": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "8c35c18f5a96b3b3", + "hash_cont_tokens": "66b726b356a02feb" + }, + "truncated": 0, + "non_truncated": 103, + "padded": 412, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-marketing|5": { + "hashes": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "a80e346390d1f88c", + "hash_cont_tokens": "f08457005b652d25" + }, + "truncated": 0, + "non_truncated": 234, + "padded": 932, + "non_padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-medical_genetics|5": { + "hashes": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "5caf5eb895cd3ccd", + "hash_cont_tokens": "ce26aac83e938006" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-miscellaneous|5": { + "hashes": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "795c466e9f87e4c1", + "hash_cont_tokens": "647bcbd68f292558" + }, + "truncated": 0, + "non_truncated": 783, + "padded": 3132, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_disputes|5": { + "hashes": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "505a224f2325b0ec", + "hash_cont_tokens": "6849b7fe56c50dda" + }, + "truncated": 0, + "non_truncated": 346, + "padded": 1368, + "non_padded": 16, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hashes": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "3f767d07e9ec8662", + "hash_cont_tokens": "81585ec455b1e3e5" + }, + "truncated": 0, + "non_truncated": 895, + 
"padded": 3580, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-nutrition|5": { + "hashes": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "0bc8cefb3f763640", + "hash_cont_tokens": "471b68eb20e5d34b" + }, + "truncated": 0, + "non_truncated": 306, + "padded": 1224, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-philosophy|5": { + "hashes": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "36e85ac3fd3f3c64", + "hash_cont_tokens": "6e39384b9c0a8cc2" + }, + "truncated": 0, + "non_truncated": 311, + "padded": 1244, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-prehistory|5": { + "hashes": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "1b04a90b19ce0623", + "hash_cont_tokens": "bfe513578190093f" + }, + "truncated": 0, + "non_truncated": 324, + "padded": 1296, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_accounting|5": { + "hashes": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "8db39e7efe9edb93", + "hash_cont_tokens": "9ce431b67350b312" + }, + "truncated": 0, + "non_truncated": 282, + "padded": 1128, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_law|5": { + "hashes": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "f638aace411a0bd9", + "hash_cont_tokens": "0ff990d9cc38024d" + }, + "truncated": 168, + "non_truncated": 1366, + "padded": 5968, + "non_padded": 168, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_medicine|5": { + "hashes": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "c0f160879d378d4d", + "hash_cont_tokens": "bc3c70e15bc7dce0" + }, + "truncated": 0, + "non_truncated": 272, + "padded": 1088, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_psychology|5": { + "hashes": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "a66dcd2d6795f6ec", + "hash_cont_tokens": "58464ea26d81f908" + }, + "truncated": 0, + "non_truncated": 612, + "padded": 2448, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-public_relations|5": { + "hashes": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "5263b25641f9702c", + "hash_cont_tokens": "eaf6a5d3ddd39a12" + }, + "truncated": 0, + "non_truncated": 110, + "padded": 440, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-security_studies|5": { + "hashes": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "0350ab02a3d50c5f", + "hash_cont_tokens": "618fd4f954253134" + }, + "truncated": 0, + "non_truncated": 245, + "padded": 980, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-sociology|5": { + 
"hashes": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "2c8688ec4c1a1673", + "hash_cont_tokens": "b4962d9e583b12c0" + }, + "truncated": 0, + "non_truncated": 201, + "padded": 800, + "non_padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hashes": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "c24ed5c990a2b92c", + "hash_cont_tokens": "ce26aac83e938006" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-virology|5": { + "hashes": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "59ca81fd3abf68b3", + "hash_cont_tokens": "397a75462a9735e3" + }, + "truncated": 0, + "non_truncated": 166, + "padded": 664, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-world_religions|5": { + "hashes": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "4cebe9a8da92320d", + "hash_cont_tokens": "de629d1414e01de8" + }, + "truncated": 0, + "non_truncated": 171, + "padded": 684, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|truthfulqa:mc|0": { + "hashes": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "3e6036a8ea87ff4f", + "hash_cont_tokens": "df48bc66e06781f2" + }, + "truncated": 0, + "non_truncated": 817, + "padded": 9996, + "non_padded": 0, + "effective_few_shots": 0.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "0591af93c06ece74", + "hash_cont_tokens": "828897df1f4f08a1" + }, + "truncated": 0, + "non_truncated": 1267, + "padded": 2534, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "bf7d8c6b5e4f7948", + "hash_cont_tokens": "6432c2e217e0ed46" + }, + "truncated": 0, + "non_truncated": 1319, + "padded": 0, + "non_padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "3b7fa57a057f9415", + "hash_full_prompts": "63615fc50fc9417c", + "hash_input_tokens": "8864448e1d4b68e8", + "hash_cont_tokens": "899bbfc962d3f702" + }, + "truncated": 1644, + "non_truncated": 27015, + "padded": 111639, + "non_padded": 3233, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/aloobun/open-llama-3b-v2-elmv3/results_2023-12-09T18-25-59.224844.json b/eval-results/aloobun/open-llama-3b-v2-elmv3/results_2023-12-09T18-25-59.224844.json new file mode 100644 index 0000000000000000000000000000000000000000..5bb27bff4113248a2a65b446ee7e1d85ae80cdb4 --- /dev/null +++ b/eval-results/aloobun/open-llama-3b-v2-elmv3/results_2023-12-09T18-25-59.224844.json @@ -0,0 +1,1409 @@ +{ + "config_general": { + "lighteval_sha": "0e4607eff593f6f842aeaa0e5fa6760f58b9d1e9", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "", + "start_time": 592359.534663049, + 
"end_time": 601229.397720785, + "total_evaluation_time_secondes": "8869.863057735958", + "model_name": "aloobun/open-llama-3b-v2-elmv3", + "model_sha": "7e43b199ff51dc0e63934ba49758a8a31ff855de", + "model_dtype": "8bit", + "model_size": "3.4 GB" + }, + "results": { + "harness|arc:challenge|25": { + "acc": 0.3873720136518771, + "acc_stderr": 0.014235872487909874, + "acc_norm": 0.42150170648464164, + "acc_norm_stderr": 0.014430197069326023 + }, + "harness|hellaswag|10": { + "acc": 0.551185022903804, + "acc_stderr": 0.004963567029129055, + "acc_norm": 0.7326229834694284, + "acc_norm_stderr": 0.004416861919100999 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.22, + "acc_stderr": 0.0416333199893227, + "acc_norm": 0.22, + "acc_norm_stderr": 0.0416333199893227 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.3111111111111111, + "acc_stderr": 0.03999262876617721, + "acc_norm": 0.3111111111111111, + "acc_norm_stderr": 0.03999262876617721 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.2894736842105263, + "acc_stderr": 0.03690677986137282, + "acc_norm": 0.2894736842105263, + "acc_norm_stderr": 0.03690677986137282 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.26, + "acc_stderr": 0.04408440022768079, + "acc_norm": 0.26, + "acc_norm_stderr": 0.04408440022768079 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.2943396226415094, + "acc_stderr": 0.028049186315695248, + "acc_norm": 0.2943396226415094, + "acc_norm_stderr": 0.028049186315695248 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.24305555555555555, + "acc_stderr": 0.0358687928008034, + "acc_norm": 0.24305555555555555, + "acc_norm_stderr": 0.0358687928008034 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.24, + "acc_stderr": 0.04292346959909284, + "acc_norm": 0.24, + "acc_norm_stderr": 0.04292346959909284 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.25, + "acc_stderr": 0.04351941398892446, + "acc_norm": 0.25, + "acc_norm_stderr": 0.04351941398892446 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.28, + "acc_stderr": 0.04512608598542127, + "acc_norm": 0.28, + "acc_norm_stderr": 0.04512608598542127 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.26011560693641617, + "acc_stderr": 0.033450369167889925, + "acc_norm": 0.26011560693641617, + "acc_norm_stderr": 0.033450369167889925 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.17647058823529413, + "acc_stderr": 0.03793281185307811, + "acc_norm": 0.17647058823529413, + "acc_norm_stderr": 0.03793281185307811 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.25, + "acc_stderr": 0.04351941398892446, + "acc_norm": 0.25, + "acc_norm_stderr": 0.04351941398892446 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.32340425531914896, + "acc_stderr": 0.030579442773610334, + "acc_norm": 0.32340425531914896, + "acc_norm_stderr": 0.030579442773610334 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.2543859649122807, + "acc_stderr": 0.04096985139843673, + "acc_norm": 0.2543859649122807, + "acc_norm_stderr": 0.04096985139843673 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.21379310344827587, + "acc_stderr": 0.03416520447747549, + "acc_norm": 0.21379310344827587, + "acc_norm_stderr": 0.03416520447747549 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.2804232804232804, + "acc_stderr": 0.023135287974325628, + "acc_norm": 0.2804232804232804, + "acc_norm_stderr": 
0.023135287974325628 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.16666666666666666, + "acc_stderr": 0.03333333333333339, + "acc_norm": 0.16666666666666666, + "acc_norm_stderr": 0.03333333333333339 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.31, + "acc_stderr": 0.04648231987117316, + "acc_norm": 0.31, + "acc_norm_stderr": 0.04648231987117316 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.25483870967741934, + "acc_stderr": 0.024790118459332208, + "acc_norm": 0.25483870967741934, + "acc_norm_stderr": 0.024790118459332208 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.28078817733990147, + "acc_stderr": 0.03161856335358611, + "acc_norm": 0.28078817733990147, + "acc_norm_stderr": 0.03161856335358611 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.29, + "acc_stderr": 0.045604802157206845, + "acc_norm": 0.29, + "acc_norm_stderr": 0.045604802157206845 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.296969696969697, + "acc_stderr": 0.03567969772268049, + "acc_norm": 0.296969696969697, + "acc_norm_stderr": 0.03567969772268049 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.31313131313131315, + "acc_stderr": 0.033042050878136525, + "acc_norm": 0.31313131313131315, + "acc_norm_stderr": 0.033042050878136525 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.24352331606217617, + "acc_stderr": 0.030975436386845426, + "acc_norm": 0.24352331606217617, + "acc_norm_stderr": 0.030975436386845426 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.28974358974358977, + "acc_stderr": 0.02300062824368796, + "acc_norm": 0.28974358974358977, + "acc_norm_stderr": 0.02300062824368796 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.22592592592592592, + "acc_stderr": 0.02549753263960955, + "acc_norm": 0.22592592592592592, + "acc_norm_stderr": 0.02549753263960955 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.2773109243697479, + "acc_stderr": 0.029079374539480007, + "acc_norm": 0.2773109243697479, + "acc_norm_stderr": 0.029079374539480007 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.2980132450331126, + "acc_stderr": 0.037345356767871984, + "acc_norm": 0.2980132450331126, + "acc_norm_stderr": 0.037345356767871984 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.25137614678899084, + "acc_stderr": 0.018599206360287415, + "acc_norm": 0.25137614678899084, + "acc_norm_stderr": 0.018599206360287415 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.17592592592592593, + "acc_stderr": 0.025967420958258533, + "acc_norm": 0.17592592592592593, + "acc_norm_stderr": 0.025967420958258533 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.22058823529411764, + "acc_stderr": 0.029102254389674082, + "acc_norm": 0.22058823529411764, + "acc_norm_stderr": 0.029102254389674082 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.2616033755274262, + "acc_stderr": 0.028609516716994934, + "acc_norm": 0.2616033755274262, + "acc_norm_stderr": 0.028609516716994934 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.37668161434977576, + "acc_stderr": 0.03252113489929188, + "acc_norm": 0.37668161434977576, + "acc_norm_stderr": 0.03252113489929188 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.25190839694656486, + "acc_stderr": 0.03807387116306086, + "acc_norm": 
0.25190839694656486, + "acc_norm_stderr": 0.03807387116306086 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.35537190082644626, + "acc_stderr": 0.04369236326573981, + "acc_norm": 0.35537190082644626, + "acc_norm_stderr": 0.04369236326573981 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.2962962962962963, + "acc_stderr": 0.04414343666854933, + "acc_norm": 0.2962962962962963, + "acc_norm_stderr": 0.04414343666854933 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.2331288343558282, + "acc_stderr": 0.0332201579577674, + "acc_norm": 0.2331288343558282, + "acc_norm_stderr": 0.0332201579577674 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.2767857142857143, + "acc_stderr": 0.042466243366976256, + "acc_norm": 0.2767857142857143, + "acc_norm_stderr": 0.042466243366976256 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.32038834951456313, + "acc_stderr": 0.04620284082280039, + "acc_norm": 0.32038834951456313, + "acc_norm_stderr": 0.04620284082280039 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.2777777777777778, + "acc_stderr": 0.029343114798094476, + "acc_norm": 0.2777777777777778, + "acc_norm_stderr": 0.029343114798094476 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.25, + "acc_stderr": 0.04351941398892446, + "acc_norm": 0.25, + "acc_norm_stderr": 0.04351941398892446 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.28607918263090676, + "acc_stderr": 0.016160871405127532, + "acc_norm": 0.28607918263090676, + "acc_norm_stderr": 0.016160871405127532 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.25722543352601157, + "acc_stderr": 0.023532925431044287, + "acc_norm": 0.25722543352601157, + "acc_norm_stderr": 0.023532925431044287 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.2424581005586592, + "acc_stderr": 0.014333522059217889, + "acc_norm": 0.2424581005586592, + "acc_norm_stderr": 0.014333522059217889 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.3006535947712418, + "acc_stderr": 0.02625605383571896, + "acc_norm": 0.3006535947712418, + "acc_norm_stderr": 0.02625605383571896 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.27009646302250806, + "acc_stderr": 0.025218040373410622, + "acc_norm": 0.27009646302250806, + "acc_norm_stderr": 0.025218040373410622 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.29012345679012347, + "acc_stderr": 0.025251173936495022, + "acc_norm": 0.29012345679012347, + "acc_norm_stderr": 0.025251173936495022 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.2624113475177305, + "acc_stderr": 0.026244920349843017, + "acc_norm": 0.2624113475177305, + "acc_norm_stderr": 0.026244920349843017 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.24185136897001303, + "acc_stderr": 0.010936550813827065, + "acc_norm": 0.24185136897001303, + "acc_norm_stderr": 0.010936550813827065 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.22794117647058823, + "acc_stderr": 0.025483081468029804, + "acc_norm": 0.22794117647058823, + "acc_norm_stderr": 0.025483081468029804 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.2630718954248366, + "acc_stderr": 0.017812676542320657, + "acc_norm": 0.2630718954248366, + "acc_norm_stderr": 0.017812676542320657 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.33636363636363636, + "acc_stderr": 0.04525393596302505, + "acc_norm": 0.33636363636363636, + "acc_norm_stderr": 0.04525393596302505 + }, + 
"harness|hendrycksTest-security_studies|5": { + "acc": 0.33877551020408164, + "acc_stderr": 0.030299506562154185, + "acc_norm": 0.33877551020408164, + "acc_norm_stderr": 0.030299506562154185 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.23880597014925373, + "acc_stderr": 0.030147775935409224, + "acc_norm": 0.23880597014925373, + "acc_norm_stderr": 0.030147775935409224 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.31, + "acc_stderr": 0.04648231987117316, + "acc_norm": 0.31, + "acc_norm_stderr": 0.04648231987117316 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.3253012048192771, + "acc_stderr": 0.03647168523683227, + "acc_norm": 0.3253012048192771, + "acc_norm_stderr": 0.03647168523683227 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.3508771929824561, + "acc_stderr": 0.03660298834049163, + "acc_norm": 0.3508771929824561, + "acc_norm_stderr": 0.03660298834049163 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.22888616891064872, + "mc1_stderr": 0.014706994909055027, + "mc2": 0.3550624387136162, + "mc2_stderr": 0.01364292328900912 + }, + "harness|winogrande|5": { + "acc": 0.6495659037095501, + "acc_stderr": 0.013409047676670184 + }, + "harness|gsm8k|5": { + "acc": 0.037149355572403335, + "acc_stderr": 0.0052095162830737675 + }, + "all": { + "acc": 0.2804692579613333, + "acc_stderr": 0.03160774886030324, + "acc_norm": 0.28199113779250456, + "acc_norm_stderr": 0.0323576565422058, + "mc1": 0.22888616891064872, + "mc1_stderr": 0.014706994909055027, + "mc2": 0.3550624387136162, + "mc2_stderr": 0.01364292328900912 + } + }, + "versions": { + "all": 0, + "harness|arc:challenge|25": 0, + "harness|gsm8k|5": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + 
"harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "harness|winogrande|5": 0 + }, + "config_tasks": { + "harness|arc:challenge": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness 
task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|arc:challenge|25": { + "hashes": { + "hash_examples": "17b0cae357c0259e", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "59c328d432da064f", + "hash_cont_tokens": "2e8835aa03b9c2cf" + }, + "truncated": 0, + "non_truncated": 1172, + "padded": 4676, + "non_padded": 11, + "effective_few_shots": 25.0, + "num_truncated_few_shots": 0 + }, + "harness|hellaswag|10": { + "hashes": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "9eaa83dae54ba52a", + "hash_cont_tokens": "18a48de3edcef462" + }, + "truncated": 0, + "non_truncated": 10042, + "padded": 39987, + "non_padded": 181, + "effective_few_shots": 10.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hashes": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "4129e579fbf0ebc2", + "hash_cont_tokens": "ce26aac83e938006" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-anatomy|5": { + "hashes": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "85c455354ae2ebd0", + "hash_cont_tokens": "1d81fa80e3039a08" + }, + "truncated": 0, + "non_truncated": 135, + "padded": 540, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-astronomy|5": { + "hashes": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "221506ab8405000a", + "hash_cont_tokens": "247dc44c6b578728" + }, + "truncated": 0, + "non_truncated": 152, + "padded": 608, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-business_ethics|5": { + "hashes": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "16c21dd1ddd4ee38", + "hash_cont_tokens": "ce26aac83e938006" + }, + 
"truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hashes": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "24b21e9d78658e4d", + "hash_cont_tokens": "26e3b69d5fb27bb2" + }, + "truncated": 0, + "non_truncated": 265, + "padded": 1060, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_biology|5": { + "hashes": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "770d74c6a8c9c0b7", + "hash_cont_tokens": "bbda31842f3930d5" + }, + "truncated": 0, + "non_truncated": 144, + "padded": 568, + "non_padded": 8, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_chemistry|5": { + "hashes": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "7dea1631558d65ac", + "hash_cont_tokens": "ce26aac83e938006" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_computer_science|5": { + "hashes": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "22600976f0f9ffc6", + "hash_cont_tokens": "ce26aac83e938006" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_mathematics|5": { + "hashes": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "564ae334c5a56510", + "hash_cont_tokens": "ce26aac83e938006" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_medicine|5": { + "hashes": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "bce86eecdc3bb76a", + "hash_cont_tokens": "894854ed7bec57f7" + }, + "truncated": 0, + "non_truncated": 173, + "padded": 688, + "non_padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_physics|5": { + "hashes": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "1188d9d525ab28e7", + "hash_cont_tokens": "13130ec6de384bbb" + }, + "truncated": 0, + "non_truncated": 102, + "padded": 408, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-computer_security|5": { + "hashes": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "692856445804bec5", + "hash_cont_tokens": "ce26aac83e938006" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hashes": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "5ade2ffc8b9f5d4a", + "hash_cont_tokens": "29089b8b7020611e" + }, + "truncated": 0, + "non_truncated": 235, + "padded": 940, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + 
"harness|hendrycksTest-econometrics|5": { + "hashes": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "9b766b5e103ce426", + "hash_cont_tokens": "efc596dfa1a1f073" + }, + "truncated": 0, + "non_truncated": 114, + "padded": 456, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hashes": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "dd9935cf301e82f9", + "hash_cont_tokens": "70817a7ac9f44af2" + }, + "truncated": 0, + "non_truncated": 145, + "padded": 560, + "non_padded": 20, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hashes": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "78c8ba2ecf6e0dc2", + "hash_cont_tokens": "937cd53d06cc6e16" + }, + "truncated": 0, + "non_truncated": 378, + "padded": 1512, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-formal_logic|5": { + "hashes": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "661893e4f7f37eba", + "hash_cont_tokens": "eec972abe0fc0f5a" + }, + "truncated": 0, + "non_truncated": 126, + "padded": 504, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-global_facts|5": { + "hashes": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "4a8d10395fdc21f0", + "hash_cont_tokens": "ce26aac83e938006" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_biology|5": { + "hashes": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "816c7d936dbe01da", + "hash_cont_tokens": "94971ccfe8e59c25" + }, + "truncated": 0, + "non_truncated": 310, + "padded": 1240, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hashes": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "769ab5386fedf26e", + "hash_cont_tokens": "a78e38b59778a04c" + }, + "truncated": 0, + "non_truncated": 203, + "padded": 812, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hashes": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "5b6bcda94f3ca2df", + "hash_cont_tokens": "ce26aac83e938006" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hashes": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "142b719c7d7d4fe0", + "hash_cont_tokens": "91dc522e4e4e91c3" + }, + "truncated": 660, + "non_truncated": -495, + "padded": 0, + "non_padded": 660, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_geography|5": { + "hashes": { + "hash_examples": "b60019b9e80b642f", + 
"hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "281dcc445ad0af4a", + "hash_cont_tokens": "f275c901b3d285f9" + }, + "truncated": 0, + "non_truncated": 198, + "padded": 792, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hashes": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "bb8f5852975ec963", + "hash_cont_tokens": "85eb58f423437cce" + }, + "truncated": 0, + "non_truncated": 193, + "padded": 770, + "non_padded": 2, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hashes": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "e769357a349b7644", + "hash_cont_tokens": "39a93706184f896b" + }, + "truncated": 0, + "non_truncated": 390, + "padded": 1560, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hashes": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "4ab345e3c0507320", + "hash_cont_tokens": "d41065d20b689af3" + }, + "truncated": 0, + "non_truncated": 270, + "padded": 1080, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hashes": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "52ec665069da063e", + "hash_cont_tokens": "28c1f7c11bf85409" + }, + "truncated": 0, + "non_truncated": 238, + "padded": 952, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_physics|5": { + "hashes": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "f23b89453c7c6050", + "hash_cont_tokens": "78c510e6c5d316ac" + }, + "truncated": 0, + "non_truncated": 151, + "padded": 604, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hashes": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "bb0f46fa5669c46e", + "hash_cont_tokens": "0ba4ecffc67603c5" + }, + "truncated": 0, + "non_truncated": 545, + "padded": 2180, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hashes": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "db3276d6935c41ac", + "hash_cont_tokens": "4a0339e9ad3efa6d" + }, + "truncated": 0, + "non_truncated": 216, + "padded": 864, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hashes": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "a54112084a848a44", + "hash_cont_tokens": "2529d55ec490f81f" + }, + "truncated": 816, + "non_truncated": -612, + "padded": 0, + "non_padded": 816, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hashes": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": 
"89cf33fb840f27be", + "hash_cont_tokens": "21808b54f5df97b2" + }, + "truncated": 0, + "non_truncated": 237, + "padded": 948, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_aging|5": { + "hashes": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "ecf9f32ac289d1be", + "hash_cont_tokens": "92acdd467ed943e1" + }, + "truncated": 0, + "non_truncated": 223, + "padded": 892, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_sexuality|5": { + "hashes": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "ebf05f3ed8d69562", + "hash_cont_tokens": "a6034ed95a124315" + }, + "truncated": 0, + "non_truncated": 131, + "padded": 524, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-international_law|5": { + "hashes": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "b0d9e6f90b58599e", + "hash_cont_tokens": "223fbf3fd106c04b" + }, + "truncated": 0, + "non_truncated": 121, + "padded": 484, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-jurisprudence|5": { + "hashes": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "ddb8c4eaa3d71594", + "hash_cont_tokens": "7c8e30f486ff156a" + }, + "truncated": 0, + "non_truncated": 108, + "padded": 428, + "non_padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hashes": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "a04883884a711ebf", + "hash_cont_tokens": "b4cc4a8d31bbaa03" + }, + "truncated": 0, + "non_truncated": 163, + "padded": 636, + "non_padded": 16, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-machine_learning|5": { + "hashes": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "d5511967956880ea", + "hash_cont_tokens": "7f0e1289ec188e82" + }, + "truncated": 0, + "non_truncated": 112, + "padded": 448, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-management|5": { + "hashes": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "8c35c18f5a96b3b3", + "hash_cont_tokens": "66b726b356a02feb" + }, + "truncated": 0, + "non_truncated": 103, + "padded": 412, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-marketing|5": { + "hashes": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "a80e346390d1f88c", + "hash_cont_tokens": "f08457005b652d25" + }, + "truncated": 0, + "non_truncated": 234, + "padded": 932, + "non_padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-medical_genetics|5": { + "hashes": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "5caf5eb895cd3ccd", + "hash_cont_tokens": "ce26aac83e938006" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + 
"num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-miscellaneous|5": { + "hashes": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "795c466e9f87e4c1", + "hash_cont_tokens": "647bcbd68f292558" + }, + "truncated": 0, + "non_truncated": 783, + "padded": 3132, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_disputes|5": { + "hashes": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "505a224f2325b0ec", + "hash_cont_tokens": "6849b7fe56c50dda" + }, + "truncated": 0, + "non_truncated": 346, + "padded": 1368, + "non_padded": 16, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hashes": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "3f767d07e9ec8662", + "hash_cont_tokens": "81585ec455b1e3e5" + }, + "truncated": 0, + "non_truncated": 895, + "padded": 3580, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-nutrition|5": { + "hashes": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "0bc8cefb3f763640", + "hash_cont_tokens": "471b68eb20e5d34b" + }, + "truncated": 0, + "non_truncated": 306, + "padded": 1224, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-philosophy|5": { + "hashes": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "36e85ac3fd3f3c64", + "hash_cont_tokens": "6e39384b9c0a8cc2" + }, + "truncated": 0, + "non_truncated": 311, + "padded": 1244, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-prehistory|5": { + "hashes": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "1b04a90b19ce0623", + "hash_cont_tokens": "bfe513578190093f" + }, + "truncated": 0, + "non_truncated": 324, + "padded": 1296, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_accounting|5": { + "hashes": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "8db39e7efe9edb93", + "hash_cont_tokens": "9ce431b67350b312" + }, + "truncated": 0, + "non_truncated": 282, + "padded": 1128, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_law|5": { + "hashes": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "f638aace411a0bd9", + "hash_cont_tokens": "0ff990d9cc38024d" + }, + "truncated": 168, + "non_truncated": 1366, + "padded": 5968, + "non_padded": 168, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_medicine|5": { + "hashes": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "c0f160879d378d4d", + "hash_cont_tokens": "bc3c70e15bc7dce0" + }, + "truncated": 0, + "non_truncated": 272, + "padded": 1088, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_psychology|5": { + "hashes": { + "hash_examples": "d45b73b22f9cc039", + 
"hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "a66dcd2d6795f6ec", + "hash_cont_tokens": "58464ea26d81f908" + }, + "truncated": 0, + "non_truncated": 612, + "padded": 2448, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-public_relations|5": { + "hashes": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "5263b25641f9702c", + "hash_cont_tokens": "eaf6a5d3ddd39a12" + }, + "truncated": 0, + "non_truncated": 110, + "padded": 440, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-security_studies|5": { + "hashes": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "0350ab02a3d50c5f", + "hash_cont_tokens": "618fd4f954253134" + }, + "truncated": 0, + "non_truncated": 245, + "padded": 980, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-sociology|5": { + "hashes": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "2c8688ec4c1a1673", + "hash_cont_tokens": "b4962d9e583b12c0" + }, + "truncated": 0, + "non_truncated": 201, + "padded": 800, + "non_padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hashes": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "c24ed5c990a2b92c", + "hash_cont_tokens": "ce26aac83e938006" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-virology|5": { + "hashes": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "59ca81fd3abf68b3", + "hash_cont_tokens": "397a75462a9735e3" + }, + "truncated": 0, + "non_truncated": 166, + "padded": 664, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-world_religions|5": { + "hashes": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "4cebe9a8da92320d", + "hash_cont_tokens": "de629d1414e01de8" + }, + "truncated": 0, + "non_truncated": 171, + "padded": 684, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|truthfulqa:mc|0": { + "hashes": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "3e6036a8ea87ff4f", + "hash_cont_tokens": "df48bc66e06781f2" + }, + "truncated": 0, + "non_truncated": 817, + "padded": 9996, + "non_padded": 0, + "effective_few_shots": 0.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "0591af93c06ece74", + "hash_cont_tokens": "828897df1f4f08a1" + }, + "truncated": 0, + "non_truncated": 1267, + "padded": 2534, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "bf7d8c6b5e4f7948", + "hash_cont_tokens": "63c1a8b55712d12c" + }, + "truncated": 0, + "non_truncated": 1319, + "padded": 0, + "non_padded": 1319, + "effective_few_shots": 
5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "3b7fa57a057f9415", + "hash_full_prompts": "63615fc50fc9417c", + "hash_input_tokens": "8864448e1d4b68e8", + "hash_cont_tokens": "6b4a3ad2e97a1de1" + }, + "truncated": 1644, + "non_truncated": 27015, + "padded": 111639, + "non_padded": 3233, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/amazingvince/where-llambo-7b/results_2023-12-09T18-44-39.604520.json b/eval-results/amazingvince/where-llambo-7b/results_2023-12-09T18-44-39.604520.json new file mode 100644 index 0000000000000000000000000000000000000000..ffe5242de9e4872d18eaf5174008b95cbf9d83b6 --- /dev/null +++ b/eval-results/amazingvince/where-llambo-7b/results_2023-12-09T18-44-39.604520.json @@ -0,0 +1,1409 @@ +{ + "config_general": { + "lighteval_sha": "0e4607eff593f6f842aeaa0e5fa6760f58b9d1e9", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "", + "start_time": 593133.962769939, + "end_time": 602343.428104723, + "total_evaluation_time_secondes": "9209.465334784007", + "model_name": "amazingvince/where-llambo-7b", + "model_sha": "554d9c7bab7ea6deabef0266aef17aa98f758543", + "model_dtype": "torch.bfloat16", + "model_size": "13.99 GB" + }, + "results": { + "harness|arc:challenge|25": { + "acc": 0.5452218430034129, + "acc_stderr": 0.014551507060836357, + "acc_norm": 0.5844709897610921, + "acc_norm_stderr": 0.014401366641216386 + }, + "harness|hellaswag|10": { + "acc": 0.612427803226449, + "acc_stderr": 0.004862003566798543, + "acc_norm": 0.8205536745668194, + "acc_norm_stderr": 0.00382941380511398 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.32, + "acc_stderr": 0.046882617226215034, + "acc_norm": 0.32, + "acc_norm_stderr": 0.046882617226215034 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.6296296296296297, + "acc_stderr": 0.04171654161354543, + "acc_norm": 0.6296296296296297, + "acc_norm_stderr": 0.04171654161354543 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.6710526315789473, + "acc_stderr": 0.03823428969926604, + "acc_norm": 0.6710526315789473, + "acc_norm_stderr": 0.03823428969926604 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.66, + "acc_stderr": 0.04760952285695238, + "acc_norm": 0.66, + "acc_norm_stderr": 0.04760952285695238 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.6981132075471698, + "acc_stderr": 0.02825420034443866, + "acc_norm": 0.6981132075471698, + "acc_norm_stderr": 0.02825420034443866 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.7361111111111112, + "acc_stderr": 0.03685651095897532, + "acc_norm": 0.7361111111111112, + "acc_norm_stderr": 0.03685651095897532 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.46, + "acc_stderr": 0.05009082659620333, + "acc_norm": 0.46, + "acc_norm_stderr": 0.05009082659620333 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.47, + "acc_stderr": 0.050161355804659205, + "acc_norm": 0.47, + "acc_norm_stderr": 0.050161355804659205 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.34, + "acc_stderr": 0.04760952285695235, + "acc_norm": 0.34, + "acc_norm_stderr": 0.04760952285695235 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.6242774566473989, + "acc_stderr": 0.036928207672648664, + "acc_norm": 0.6242774566473989, + "acc_norm_stderr": 0.036928207672648664 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 
0.4019607843137255, + "acc_stderr": 0.048786087144669955, + "acc_norm": 0.4019607843137255, + "acc_norm_stderr": 0.048786087144669955 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.79, + "acc_stderr": 0.04093601807403326, + "acc_norm": 0.79, + "acc_norm_stderr": 0.04093601807403326 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.5319148936170213, + "acc_stderr": 0.03261936918467382, + "acc_norm": 0.5319148936170213, + "acc_norm_stderr": 0.03261936918467382 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.47368421052631576, + "acc_stderr": 0.046970851366478626, + "acc_norm": 0.47368421052631576, + "acc_norm_stderr": 0.046970851366478626 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.5310344827586206, + "acc_stderr": 0.04158632762097828, + "acc_norm": 0.5310344827586206, + "acc_norm_stderr": 0.04158632762097828 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.42328042328042326, + "acc_stderr": 0.02544636563440678, + "acc_norm": 0.42328042328042326, + "acc_norm_stderr": 0.02544636563440678 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.42857142857142855, + "acc_stderr": 0.0442626668137991, + "acc_norm": 0.42857142857142855, + "acc_norm_stderr": 0.0442626668137991 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.28, + "acc_stderr": 0.04512608598542128, + "acc_norm": 0.28, + "acc_norm_stderr": 0.04512608598542128 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.7677419354838709, + "acc_stderr": 0.024022256130308235, + "acc_norm": 0.7677419354838709, + "acc_norm_stderr": 0.024022256130308235 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.5123152709359606, + "acc_stderr": 0.035169204442208966, + "acc_norm": 0.5123152709359606, + "acc_norm_stderr": 0.035169204442208966 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.66, + "acc_stderr": 0.04760952285695237, + "acc_norm": 0.66, + "acc_norm_stderr": 0.04760952285695237 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.7757575757575758, + "acc_stderr": 0.03256866661681102, + "acc_norm": 0.7757575757575758, + "acc_norm_stderr": 0.03256866661681102 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.797979797979798, + "acc_stderr": 0.028606204289229862, + "acc_norm": 0.797979797979798, + "acc_norm_stderr": 0.028606204289229862 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.8549222797927462, + "acc_stderr": 0.025416343096306433, + "acc_norm": 0.8549222797927462, + "acc_norm_stderr": 0.025416343096306433 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.6205128205128205, + "acc_stderr": 0.02460362692409742, + "acc_norm": 0.6205128205128205, + "acc_norm_stderr": 0.02460362692409742 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.3296296296296296, + "acc_stderr": 0.028661201116524575, + "acc_norm": 0.3296296296296296, + "acc_norm_stderr": 0.028661201116524575 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.6218487394957983, + "acc_stderr": 0.031499305777849054, + "acc_norm": 0.6218487394957983, + "acc_norm_stderr": 0.031499305777849054 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.304635761589404, + "acc_stderr": 0.03757949922943343, + "acc_norm": 0.304635761589404, + "acc_norm_stderr": 0.03757949922943343 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.8348623853211009, + "acc_stderr": 
0.01591955782997604, + "acc_norm": 0.8348623853211009, + "acc_norm_stderr": 0.01591955782997604 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.5, + "acc_stderr": 0.034099716973523674, + "acc_norm": 0.5, + "acc_norm_stderr": 0.034099716973523674 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.7745098039215687, + "acc_stderr": 0.02933116229425174, + "acc_norm": 0.7745098039215687, + "acc_norm_stderr": 0.02933116229425174 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.810126582278481, + "acc_stderr": 0.02553010046023349, + "acc_norm": 0.810126582278481, + "acc_norm_stderr": 0.02553010046023349 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.695067264573991, + "acc_stderr": 0.030898610882477515, + "acc_norm": 0.695067264573991, + "acc_norm_stderr": 0.030898610882477515 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.7633587786259542, + "acc_stderr": 0.03727673575596913, + "acc_norm": 0.7633587786259542, + "acc_norm_stderr": 0.03727673575596913 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.743801652892562, + "acc_stderr": 0.03984979653302872, + "acc_norm": 0.743801652892562, + "acc_norm_stderr": 0.03984979653302872 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.8055555555555556, + "acc_stderr": 0.038260763248848646, + "acc_norm": 0.8055555555555556, + "acc_norm_stderr": 0.038260763248848646 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.7423312883435583, + "acc_stderr": 0.03436150827846917, + "acc_norm": 0.7423312883435583, + "acc_norm_stderr": 0.03436150827846917 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.4642857142857143, + "acc_stderr": 0.04733667890053756, + "acc_norm": 0.4642857142857143, + "acc_norm_stderr": 0.04733667890053756 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.7766990291262136, + "acc_stderr": 0.04123553189891431, + "acc_norm": 0.7766990291262136, + "acc_norm_stderr": 0.04123553189891431 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.8717948717948718, + "acc_stderr": 0.021901905115073325, + "acc_norm": 0.8717948717948718, + "acc_norm_stderr": 0.021901905115073325 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.71, + "acc_stderr": 0.045604802157206845, + "acc_norm": 0.71, + "acc_norm_stderr": 0.045604802157206845 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.8148148148148148, + "acc_stderr": 0.013890862162876173, + "acc_norm": 0.8148148148148148, + "acc_norm_stderr": 0.013890862162876173 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.7312138728323699, + "acc_stderr": 0.023868003262500097, + "acc_norm": 0.7312138728323699, + "acc_norm_stderr": 0.023868003262500097 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.27039106145251396, + "acc_stderr": 0.014854993938010076, + "acc_norm": 0.27039106145251396, + "acc_norm_stderr": 0.014854993938010076 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.6928104575163399, + "acc_stderr": 0.026415601914388992, + "acc_norm": 0.6928104575163399, + "acc_norm_stderr": 0.026415601914388992 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.7106109324758842, + "acc_stderr": 0.025755865922632945, + "acc_norm": 0.7106109324758842, + "acc_norm_stderr": 0.025755865922632945 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.7407407407407407, + "acc_stderr": 0.024383665531035454, + "acc_norm": 0.7407407407407407, + "acc_norm_stderr": 0.024383665531035454 + }, + 
"harness|hendrycksTest-professional_accounting|5": { + "acc": 0.4574468085106383, + "acc_stderr": 0.029719281272236844, + "acc_norm": 0.4574468085106383, + "acc_norm_stderr": 0.029719281272236844 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.4511082138200782, + "acc_stderr": 0.012709037347346233, + "acc_norm": 0.4511082138200782, + "acc_norm_stderr": 0.012709037347346233 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.6139705882352942, + "acc_stderr": 0.02957326913441112, + "acc_norm": 0.6139705882352942, + "acc_norm_stderr": 0.02957326913441112 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.6486928104575164, + "acc_stderr": 0.019312676065786565, + "acc_norm": 0.6486928104575164, + "acc_norm_stderr": 0.019312676065786565 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.6181818181818182, + "acc_stderr": 0.046534298079135075, + "acc_norm": 0.6181818181818182, + "acc_norm_stderr": 0.046534298079135075 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.7510204081632653, + "acc_stderr": 0.027682979522960238, + "acc_norm": 0.7510204081632653, + "acc_norm_stderr": 0.027682979522960238 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.8059701492537313, + "acc_stderr": 0.027962677604768914, + "acc_norm": 0.8059701492537313, + "acc_norm_stderr": 0.027962677604768914 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.82, + "acc_stderr": 0.038612291966536955, + "acc_norm": 0.82, + "acc_norm_stderr": 0.038612291966536955 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.5180722891566265, + "acc_stderr": 0.03889951252827216, + "acc_norm": 0.5180722891566265, + "acc_norm_stderr": 0.03889951252827216 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.8362573099415205, + "acc_stderr": 0.028380919596145866, + "acc_norm": 0.8362573099415205, + "acc_norm_stderr": 0.028380919596145866 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.34394124847001223, + "mc1_stderr": 0.01662908751427678, + "mc2": 0.4961220088630948, + "mc2_stderr": 0.014820546287012869 + }, + "harness|winogrande|5": { + "acc": 0.7853196527229677, + "acc_stderr": 0.011539912734345402 + }, + "harness|gsm8k|5": { + "acc": 0.6520090978013646, + "acc_stderr": 0.013120581030382134 + }, + "all": { + "acc": 0.6276007814719067, + "acc_stderr": 0.03245983620498288, + "acc_norm": 0.6287066769044074, + "acc_norm_stderr": 0.03312214889081226, + "mc1": 0.34394124847001223, + "mc1_stderr": 0.01662908751427678, + "mc2": 0.4961220088630948, + "mc2_stderr": 0.014820546287012869 + } + }, + "versions": { + "all": 0, + "harness|arc:challenge|25": 0, + "harness|gsm8k|5": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + 
"harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "harness|winogrande|5": 0 + }, + "config_tasks": { + "harness|arc:challenge": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + 
"harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|arc:challenge|25": { + "hashes": { + "hash_examples": "17b0cae357c0259e", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "9bcd0d1d37471713", + "hash_cont_tokens": "289aa98c400841d8" + }, + "truncated": 0, + "non_truncated": 1172, + "padded": 4670, + "non_padded": 17, + "effective_few_shots": 25.0, + "num_truncated_few_shots": 0 + }, + "harness|hellaswag|10": { + "hashes": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "80b8c6d79740318e", + "hash_cont_tokens": "ac460260c3e6efc9" + }, + "truncated": 0, + "non_truncated": 10042, + "padded": 40101, + "non_padded": 67, + "effective_few_shots": 10.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hashes": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "b813d36287c6556c", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + 
"effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-anatomy|5": { + "hashes": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "09dc2380497f7a47", + "hash_cont_tokens": "a52a4f60d98cbe5c" + }, + "truncated": 0, + "non_truncated": 135, + "padded": 540, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-astronomy|5": { + "hashes": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "68ca3220b0fdd1f3", + "hash_cont_tokens": "10f7d8eeba97841d" + }, + "truncated": 0, + "non_truncated": 152, + "padded": 608, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-business_ethics|5": { + "hashes": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "bd14ef1320de241e", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hashes": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "d96186ab98017c43", + "hash_cont_tokens": "edef9975ba9165b5" + }, + "truncated": 0, + "non_truncated": 265, + "padded": 1060, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_biology|5": { + "hashes": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "424136b34e95b200", + "hash_cont_tokens": "0aa103ec6602280b" + }, + "truncated": 0, + "non_truncated": 144, + "padded": 576, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_chemistry|5": { + "hashes": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "8dd8b80e336bbe54", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_computer_science|5": { + "hashes": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "145d4cef8ca2261d", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_mathematics|5": { + "hashes": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "561995d32d2b25c4", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_medicine|5": { + "hashes": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "6a258a9d4418599c", + "hash_cont_tokens": "1979021dbc698754" + }, + "truncated": 0, + "non_truncated": 173, + "padded": 692, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_physics|5": { + "hashes": { + "hash_examples": "875dd26d22655b0d", 
+ "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "fa5e0d5b5f97b66a", + "hash_cont_tokens": "7cf7fe2bab00acbd" + }, + "truncated": 0, + "non_truncated": 102, + "padded": 408, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-computer_security|5": { + "hashes": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "07d27397edfae492", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hashes": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "da5e6c3c8eb17da6", + "hash_cont_tokens": "903f64eed2b0d217" + }, + "truncated": 0, + "non_truncated": 235, + "padded": 940, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-econometrics|5": { + "hashes": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "f6ba8e358bdb523e", + "hash_cont_tokens": "721ae6c5302c4bf2" + }, + "truncated": 0, + "non_truncated": 114, + "padded": 456, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hashes": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "b2459da4c5ca8590", + "hash_cont_tokens": "15a738960ed3e587" + }, + "truncated": 0, + "non_truncated": 145, + "padded": 575, + "non_padded": 5, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hashes": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "0b969d9ad706a13a", + "hash_cont_tokens": "c96470462fc71683" + }, + "truncated": 0, + "non_truncated": 378, + "padded": 1512, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-formal_logic|5": { + "hashes": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "02bc3eb5f90da86e", + "hash_cont_tokens": "0e1ce025c9d6ee7e" + }, + "truncated": 0, + "non_truncated": 126, + "padded": 504, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-global_facts|5": { + "hashes": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "3d5106918bcbeb43", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_biology|5": { + "hashes": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "7b089392db2dabbd", + "hash_cont_tokens": "e34d57f7d3c4ca16" + }, + "truncated": 0, + "non_truncated": 310, + "padded": 1240, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hashes": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "ba90b2ffed1c067d", + "hash_cont_tokens": "e8482d44df4b3740" + }, + "truncated": 
0, + "non_truncated": 203, + "padded": 812, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hashes": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "60eeec309ef0717f", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hashes": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "5e5e8bf3808e0ead", + "hash_cont_tokens": "d63e679a49418339" + }, + "truncated": 0, + "non_truncated": 165, + "padded": 656, + "non_padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_geography|5": { + "hashes": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "4da9b741d4e7ea78", + "hash_cont_tokens": "d78483e286d06f1a" + }, + "truncated": 0, + "non_truncated": 198, + "padded": 792, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hashes": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "acb4bc872ac86ed7", + "hash_cont_tokens": "691cdff71ff5fe57" + }, + "truncated": 0, + "non_truncated": 193, + "padded": 772, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hashes": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "840fc6403eb69ab0", + "hash_cont_tokens": "d5ad4c5bdca967ad" + }, + "truncated": 0, + "non_truncated": 390, + "padded": 1560, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hashes": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "3629a7f2cd17faeb", + "hash_cont_tokens": "8f631ca5687dd0d4" + }, + "truncated": 0, + "non_truncated": 270, + "padded": 1080, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hashes": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "6846f684260e3997", + "hash_cont_tokens": "7321048a28451473" + }, + "truncated": 0, + "non_truncated": 238, + "padded": 952, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_physics|5": { + "hashes": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "85aee25d6bdad94a", + "hash_cont_tokens": "bb137581f269861c" + }, + "truncated": 0, + "non_truncated": 151, + "padded": 604, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hashes": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "290b66d6d666a35f", + "hash_cont_tokens": "b455cab2675bd863" + }, + "truncated": 0, + "non_truncated": 545, + "padded": 2180, + "non_padded": 0, + 
"effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hashes": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "a77a7668b437bc82", + "hash_cont_tokens": "1b3196fec7e58037" + }, + "truncated": 0, + "non_truncated": 216, + "padded": 864, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hashes": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "63548c7fa9ba7a78", + "hash_cont_tokens": "a331dedc2aa01b3e" + }, + "truncated": 0, + "non_truncated": 204, + "padded": 816, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hashes": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "83c5da18bfa50812", + "hash_cont_tokens": "d0fbe030b8c8c2bf" + }, + "truncated": 0, + "non_truncated": 237, + "padded": 948, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_aging|5": { + "hashes": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "bebbd11f22006685", + "hash_cont_tokens": "1dd29c3755494850" + }, + "truncated": 0, + "non_truncated": 223, + "padded": 892, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_sexuality|5": { + "hashes": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "7b85ee9b8ee54f4f", + "hash_cont_tokens": "c85573f663c10691" + }, + "truncated": 0, + "non_truncated": 131, + "padded": 524, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-international_law|5": { + "hashes": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "7bfc55ab7065943e", + "hash_cont_tokens": "d263804ba918154f" + }, + "truncated": 0, + "non_truncated": 121, + "padded": 484, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-jurisprudence|5": { + "hashes": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "69573f1675e053c6", + "hash_cont_tokens": "581986691a84ece8" + }, + "truncated": 0, + "non_truncated": 108, + "padded": 432, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hashes": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "552324ef20094bdc", + "hash_cont_tokens": "55a858b28bbda458" + }, + "truncated": 0, + "non_truncated": 163, + "padded": 652, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-machine_learning|5": { + "hashes": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "96449357a7318905", + "hash_cont_tokens": "e99d3d3efd4ac7a3" + }, + "truncated": 0, + "non_truncated": 112, + "padded": 448, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-management|5": { + "hashes": { + "hash_examples": 
"8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "3b849249168e3b88", + "hash_cont_tokens": "13d9dc56bca34726" + }, + "truncated": 0, + "non_truncated": 103, + "padded": 412, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-marketing|5": { + "hashes": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "af0e186f2756b70d", + "hash_cont_tokens": "2700ea26933916a2" + }, + "truncated": 0, + "non_truncated": 234, + "padded": 936, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-medical_genetics|5": { + "hashes": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "9f6a6de16509b6d9", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-miscellaneous|5": { + "hashes": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "9194406d589f7c10", + "hash_cont_tokens": "7bf4341c79587250" + }, + "truncated": 0, + "non_truncated": 783, + "padded": 3132, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_disputes|5": { + "hashes": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "769486efc74d9f8e", + "hash_cont_tokens": "38a48e9de6976f00" + }, + "truncated": 0, + "non_truncated": 346, + "padded": 1384, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hashes": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "a90fd4dd90959dad", + "hash_cont_tokens": "761c4dc187689d89" + }, + "truncated": 0, + "non_truncated": 895, + "padded": 3580, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-nutrition|5": { + "hashes": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "1a3b843e66efd29b", + "hash_cont_tokens": "65005bd7d6f6012a" + }, + "truncated": 0, + "non_truncated": 306, + "padded": 1224, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-philosophy|5": { + "hashes": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "09820001a3d00013", + "hash_cont_tokens": "0b47934fb6314dec" + }, + "truncated": 0, + "non_truncated": 311, + "padded": 1244, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-prehistory|5": { + "hashes": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "7c4ec364ce2768c7", + "hash_cont_tokens": "3f20acd855ee0a29" + }, + "truncated": 0, + "non_truncated": 324, + "padded": 1296, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_accounting|5": { + "hashes": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "ced0534574d0ae3f", + "hash_cont_tokens": "8f122ba881355d4b" + }, + "truncated": 0, + 
"non_truncated": 282, + "padded": 1128, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_law|5": { + "hashes": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "bcbdbbde22ec73e3", + "hash_cont_tokens": "90d5df417c4d3fd3" + }, + "truncated": 0, + "non_truncated": 1534, + "padded": 6136, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_medicine|5": { + "hashes": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "c54d753563114d45", + "hash_cont_tokens": "4a2d2988884f7f70" + }, + "truncated": 0, + "non_truncated": 272, + "padded": 1088, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_psychology|5": { + "hashes": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "b75dc55c0e32fa52", + "hash_cont_tokens": "e0a952cb8a9c81de" + }, + "truncated": 0, + "non_truncated": 612, + "padded": 2448, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-public_relations|5": { + "hashes": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "5ccdc8ec8db99622", + "hash_cont_tokens": "1fa77a8dff3922b8" + }, + "truncated": 0, + "non_truncated": 110, + "padded": 440, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-security_studies|5": { + "hashes": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "ca8497342e5b1d57", + "hash_cont_tokens": "81fc9cb3cbdd52db" + }, + "truncated": 0, + "non_truncated": 245, + "padded": 980, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-sociology|5": { + "hashes": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "069c76424fbd3dab", + "hash_cont_tokens": "2a0493252ed2cf43" + }, + "truncated": 0, + "non_truncated": 201, + "padded": 804, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hashes": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "a7e393a626169576", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-virology|5": { + "hashes": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "bf99dc973e3a650d", + "hash_cont_tokens": "5ab892d003b00c98" + }, + "truncated": 0, + "non_truncated": 166, + "padded": 664, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-world_religions|5": { + "hashes": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "1761cfaf21797065", + "hash_cont_tokens": "15a5e5dbdfbb8568" + }, + "truncated": 0, + "non_truncated": 171, + "padded": 684, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|truthfulqa:mc|0": { + 
"hashes": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "298b43914bbdf4ca", + "hash_cont_tokens": "5a8d4bb398b1c3c0" + }, + "truncated": 0, + "non_truncated": 817, + "padded": 9996, + "non_padded": 0, + "effective_few_shots": 0.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "31aa3477d959f771", + "hash_cont_tokens": "618558fb93c0f288" + }, + "truncated": 0, + "non_truncated": 1267, + "padded": 2534, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "6af0ae8cfe684f50", + "hash_cont_tokens": "3407353968ce474e" + }, + "truncated": 0, + "non_truncated": 1319, + "padded": 0, + "non_padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "3b7fa57a057f9415", + "hash_full_prompts": "63615fc50fc9417c", + "hash_input_tokens": "9c04e828ae29cacc", + "hash_cont_tokens": "5a1ddef544ecbe3f" + }, + "truncated": 0, + "non_truncated": 28659, + "padded": 113460, + "non_padded": 1412, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/amazingvince/zephyr-smol_llama-100m-dpo-full/results_2023-11-21T00-05-44.603867.json b/eval-results/amazingvince/zephyr-smol_llama-100m-dpo-full/results_2023-11-21T00-05-44.603867.json new file mode 100644 index 0000000000000000000000000000000000000000..aa41699b0fb863ef92042f1712bba2273c61f9be --- /dev/null +++ b/eval-results/amazingvince/zephyr-smol_llama-100m-dpo-full/results_2023-11-21T00-05-44.603867.json @@ -0,0 +1,1435 @@ +{ + "config_general": { + "lighteval_sha": "9ffc410f6c40b8cfefe7167cb47aefe69ced61e1", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "", + "start_time": 361322.488350323, + "end_time": 366647.345498345, + "total_evaluation_time_secondes": "5324.857148021983", + "model_name": "amazingvince/zephyr-smol_llama-100m-dpo-full", + "model_sha": "be3400c89d66ed66f0aa96f1b8131604c118b67b", + "model_dtype": "torch.float16", + "model_size": "193.89 MB" + }, + "results": { + "harness|arc:challenge|25": { + "acc": 0.19795221843003413, + "acc_stderr": 0.011643990971573395, + "acc_norm": 0.25, + "acc_norm_stderr": 0.012653835621466646 + }, + "harness|hellaswag|10": { + "acc": 0.2779326827325234, + "acc_stderr": 0.004470644845242894, + "acc_norm": 0.28540131447918743, + "acc_norm_stderr": 0.0045068240943332985 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.23, + "acc_stderr": 0.04229525846816507, + "acc_norm": 0.23, + "acc_norm_stderr": 0.04229525846816507 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.34074074074074073, + "acc_stderr": 0.04094376269996793, + "acc_norm": 0.34074074074074073, + "acc_norm_stderr": 0.04094376269996793 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.17763157894736842, + "acc_stderr": 0.031103182383123398, + "acc_norm": 0.17763157894736842, + "acc_norm_stderr": 0.031103182383123398 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.24, + "acc_stderr": 0.04292346959909284, + "acc_norm": 0.24, + "acc_norm_stderr": 0.04292346959909284 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.22641509433962265, + "acc_stderr": 
0.025757559893106727, + "acc_norm": 0.22641509433962265, + "acc_norm_stderr": 0.025757559893106727 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.2569444444444444, + "acc_stderr": 0.03653946969442099, + "acc_norm": 0.2569444444444444, + "acc_norm_stderr": 0.03653946969442099 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.18, + "acc_stderr": 0.038612291966536955, + "acc_norm": 0.18, + "acc_norm_stderr": 0.038612291966536955 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.21, + "acc_stderr": 0.040936018074033256, + "acc_norm": 0.21, + "acc_norm_stderr": 0.040936018074033256 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.26, + "acc_stderr": 0.04408440022768078, + "acc_norm": 0.26, + "acc_norm_stderr": 0.04408440022768078 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.20809248554913296, + "acc_stderr": 0.030952890217749874, + "acc_norm": 0.20809248554913296, + "acc_norm_stderr": 0.030952890217749874 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.21568627450980393, + "acc_stderr": 0.04092563958237654, + "acc_norm": 0.21568627450980393, + "acc_norm_stderr": 0.04092563958237654 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.27, + "acc_stderr": 0.04461960433384741, + "acc_norm": 0.27, + "acc_norm_stderr": 0.04461960433384741 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.26382978723404255, + "acc_stderr": 0.028809989854102973, + "acc_norm": 0.26382978723404255, + "acc_norm_stderr": 0.028809989854102973 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.2543859649122807, + "acc_stderr": 0.040969851398436695, + "acc_norm": 0.2543859649122807, + "acc_norm_stderr": 0.040969851398436695 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.2413793103448276, + "acc_stderr": 0.03565998174135302, + "acc_norm": 0.2413793103448276, + "acc_norm_stderr": 0.03565998174135302 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.2328042328042328, + "acc_stderr": 0.02176596167215452, + "acc_norm": 0.2328042328042328, + "acc_norm_stderr": 0.02176596167215452 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.15079365079365079, + "acc_stderr": 0.03200686497287392, + "acc_norm": 0.15079365079365079, + "acc_norm_stderr": 0.03200686497287392 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.17, + "acc_stderr": 0.0377525168068637, + "acc_norm": 0.17, + "acc_norm_stderr": 0.0377525168068637 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.3032258064516129, + "acc_stderr": 0.02614868593067175, + "acc_norm": 0.3032258064516129, + "acc_norm_stderr": 0.02614868593067175 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.28078817733990147, + "acc_stderr": 0.03161856335358609, + "acc_norm": 0.28078817733990147, + "acc_norm_stderr": 0.03161856335358609 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.3, + "acc_stderr": 0.046056618647183814, + "acc_norm": 0.3, + "acc_norm_stderr": 0.046056618647183814 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.21818181818181817, + "acc_stderr": 0.03225078108306289, + "acc_norm": 0.21818181818181817, + "acc_norm_stderr": 0.03225078108306289 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.23737373737373738, + "acc_stderr": 0.0303137105381989, + "acc_norm": 0.23737373737373738, + "acc_norm_stderr": 0.0303137105381989 + }, + 
"harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.26424870466321243, + "acc_stderr": 0.03182155050916647, + "acc_norm": 0.26424870466321243, + "acc_norm_stderr": 0.03182155050916647 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.28974358974358977, + "acc_stderr": 0.023000628243687954, + "acc_norm": 0.28974358974358977, + "acc_norm_stderr": 0.023000628243687954 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.25925925925925924, + "acc_stderr": 0.026719240783712166, + "acc_norm": 0.25925925925925924, + "acc_norm_stderr": 0.026719240783712166 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.2773109243697479, + "acc_stderr": 0.029079374539480007, + "acc_norm": 0.2773109243697479, + "acc_norm_stderr": 0.029079374539480007 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.2781456953642384, + "acc_stderr": 0.03658603262763743, + "acc_norm": 0.2781456953642384, + "acc_norm_stderr": 0.03658603262763743 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.20733944954128442, + "acc_stderr": 0.017381415563608674, + "acc_norm": 0.20733944954128442, + "acc_norm_stderr": 0.017381415563608674 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.44907407407407407, + "acc_stderr": 0.03392238405321617, + "acc_norm": 0.44907407407407407, + "acc_norm_stderr": 0.03392238405321617 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.30392156862745096, + "acc_stderr": 0.032282103870378935, + "acc_norm": 0.30392156862745096, + "acc_norm_stderr": 0.032282103870378935 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.26582278481012656, + "acc_stderr": 0.028756799629658335, + "acc_norm": 0.26582278481012656, + "acc_norm_stderr": 0.028756799629658335 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.30493273542600896, + "acc_stderr": 0.030898610882477515, + "acc_norm": 0.30493273542600896, + "acc_norm_stderr": 0.030898610882477515 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.2595419847328244, + "acc_stderr": 0.03844876139785271, + "acc_norm": 0.2595419847328244, + "acc_norm_stderr": 0.03844876139785271 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.2231404958677686, + "acc_stderr": 0.03800754475228732, + "acc_norm": 0.2231404958677686, + "acc_norm_stderr": 0.03800754475228732 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.26851851851851855, + "acc_stderr": 0.04284467968052192, + "acc_norm": 0.26851851851851855, + "acc_norm_stderr": 0.04284467968052192 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.294478527607362, + "acc_stderr": 0.03581165790474082, + "acc_norm": 0.294478527607362, + "acc_norm_stderr": 0.03581165790474082 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.3482142857142857, + "acc_stderr": 0.04521829902833585, + "acc_norm": 0.3482142857142857, + "acc_norm_stderr": 0.04521829902833585 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.21359223300970873, + "acc_stderr": 0.040580420156460344, + "acc_norm": 0.21359223300970873, + "acc_norm_stderr": 0.040580420156460344 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.28205128205128205, + "acc_stderr": 0.029480360549541194, + "acc_norm": 0.28205128205128205, + "acc_norm_stderr": 0.029480360549541194 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.3, + "acc_stderr": 0.046056618647183814, + "acc_norm": 0.3, + "acc_norm_stderr": 0.046056618647183814 + 
}, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.24010217113665389, + "acc_stderr": 0.015274685213734195, + "acc_norm": 0.24010217113665389, + "acc_norm_stderr": 0.015274685213734195 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.24566473988439305, + "acc_stderr": 0.02317629820399201, + "acc_norm": 0.24566473988439305, + "acc_norm_stderr": 0.02317629820399201 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.24804469273743016, + "acc_stderr": 0.014444157808261427, + "acc_norm": 0.24804469273743016, + "acc_norm_stderr": 0.014444157808261427 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.17647058823529413, + "acc_stderr": 0.02182859605310841, + "acc_norm": 0.17647058823529413, + "acc_norm_stderr": 0.02182859605310841 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.20257234726688103, + "acc_stderr": 0.02282731749105968, + "acc_norm": 0.20257234726688103, + "acc_norm_stderr": 0.02282731749105968 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.20987654320987653, + "acc_stderr": 0.02265834408598136, + "acc_norm": 0.20987654320987653, + "acc_norm_stderr": 0.02265834408598136 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.22695035460992907, + "acc_stderr": 0.02498710636564297, + "acc_norm": 0.22695035460992907, + "acc_norm_stderr": 0.02498710636564297 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.24185136897001303, + "acc_stderr": 0.010936550813827061, + "acc_norm": 0.24185136897001303, + "acc_norm_stderr": 0.010936550813827061 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.31985294117647056, + "acc_stderr": 0.028332959514031225, + "acc_norm": 0.31985294117647056, + "acc_norm_stderr": 0.028332959514031225 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.2549019607843137, + "acc_stderr": 0.017630827375148383, + "acc_norm": 0.2549019607843137, + "acc_norm_stderr": 0.017630827375148383 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.18181818181818182, + "acc_stderr": 0.03694284335337798, + "acc_norm": 0.18181818181818182, + "acc_norm_stderr": 0.03694284335337798 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.24897959183673468, + "acc_stderr": 0.02768297952296023, + "acc_norm": 0.24897959183673468, + "acc_norm_stderr": 0.02768297952296023 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.25870646766169153, + "acc_stderr": 0.03096590312357304, + "acc_norm": 0.25870646766169153, + "acc_norm_stderr": 0.03096590312357304 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.27, + "acc_stderr": 0.044619604333847394, + "acc_norm": 0.27, + "acc_norm_stderr": 0.044619604333847394 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.25903614457831325, + "acc_stderr": 0.034106466140718564, + "acc_norm": 0.25903614457831325, + "acc_norm_stderr": 0.034106466140718564 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.21052631578947367, + "acc_stderr": 0.0312678171466318, + "acc_norm": 0.21052631578947367, + "acc_norm_stderr": 0.0312678171466318 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.2558139534883721, + "mc1_stderr": 0.015274176219283366, + "mc2": 0.457475004641809, + "mc2_stderr": 0.01524616416347799 + }, + "harness|winogrande|5": { + "acc": 0.510655090765588, + "acc_stderr": 0.0140492945362904 + }, + "harness|drop|3": { + "em": 0.0008389261744966443, + "em_stderr": 0.00029649629898012553, + "f1": 0.030236996644295366, + "f1_stderr": 0.0009878008881522781 + }, + "harness|gsm8k|5": { + 
"acc": 0.0037907505686125853, + "acc_stderr": 0.0016927007401501986 + }, + "all": { + "acc": 0.2515294158941045, + "acc_stderr": 0.030647256452421157, + "acc_norm": 0.25234516850038646, + "acc_norm_stderr": 0.03143706224102091, + "mc1": 0.2558139534883721, + "mc1_stderr": 0.015274176219283366, + "mc2": 0.457475004641809, + "mc2_stderr": 0.01524616416347799, + "em": 0.0008389261744966443, + "em_stderr": 0.00029649629898012553, + "f1": 0.030236996644295366, + "f1_stderr": 0.0009878008881522781 + } + }, + "versions": { + "all": 0, + "harness|arc:challenge|25": 0, + "harness|drop|3": 1, + "harness|gsm8k|5": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + 
"harness|winogrande|5": 0 + }, + "config_tasks": { + "harness|arc:challenge": "LM Harness task", + "harness|drop": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + 
"harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|arc:challenge|25": { + "hashes": { + "hash_examples": "17b0cae357c0259e", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "5d8cdf520a3f50ef", + "hash_cont_tokens": "e8abf848493b50f7" + }, + "truncated": 3615, + "non_truncated": -2443, + "padded": 1058, + "non_padded": 3629, + "effective_few_shots": 25.0, + "num_truncated_few_shots": 0 + }, + "harness|hellaswag|10": { + "hashes": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "e971728a7c740de4", + "hash_cont_tokens": "9fe0a5c42e1532db" + }, + "truncated": 13414, + "non_truncated": -3372, + "padded": 26570, + "non_padded": 13598, + "effective_few_shots": 10.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hashes": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "8ff523ec326d5d55", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-anatomy|5": { + "hashes": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "742bd6a389a8ef40", + "hash_cont_tokens": "f11971a765cb609f" + }, + "truncated": 0, + "non_truncated": 135, + "padded": 540, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-astronomy|5": { + "hashes": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "aa9743839c83bd9f", + "hash_cont_tokens": "440a970fadecdc7b" + }, + "truncated": 0, + "non_truncated": 152, + "padded": 608, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-business_ethics|5": { + "hashes": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "60f6ed52e2a2987a", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hashes": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "6080d9f3c5930be0", + "hash_cont_tokens": "7ecd60c25b9bfe5b" + }, + "truncated": 0, + "non_truncated": 265, + "padded": 1060, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_biology|5": { + "hashes": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "873319724ad65589", + "hash_cont_tokens": "875cde3af7a0ee14" + }, + "truncated": 0, + "non_truncated": 144, + "padded": 564, + "non_padded": 12, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_chemistry|5": { + "hashes": { + "hash_examples": 
"22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "8366d04d12b154a7", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_computer_science|5": { + "hashes": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "f475efca0e10a741", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_mathematics|5": { + "hashes": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "b7aa815781eae172", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_medicine|5": { + "hashes": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "127d227602b3518a", + "hash_cont_tokens": "702fb6d82ff0d6ac" + }, + "truncated": 20, + "non_truncated": 153, + "padded": 672, + "non_padded": 20, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_physics|5": { + "hashes": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "32b28762dd077c78", + "hash_cont_tokens": "f7b8097afc16a47c" + }, + "truncated": 0, + "non_truncated": 102, + "padded": 404, + "non_padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-computer_security|5": { + "hashes": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "19dd0e1895125d49", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hashes": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "761c7ce187b3338a", + "hash_cont_tokens": "aa0e8bc655f2f641" + }, + "truncated": 0, + "non_truncated": 235, + "padded": 940, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-econometrics|5": { + "hashes": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "dae74024ebc12b2b", + "hash_cont_tokens": "b1cc6e7e9fcd3827" + }, + "truncated": 0, + "non_truncated": 114, + "padded": 456, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hashes": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "5fa8050688a246ed", + "hash_cont_tokens": "2425a3f084a591ef" + }, + "truncated": 0, + "non_truncated": 145, + "padded": 580, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hashes": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "2da3f8d7d1515cc6", + "hash_cont_tokens": 
"bd87bf0c060fd925" + }, + "truncated": 0, + "non_truncated": 378, + "padded": 1512, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-formal_logic|5": { + "hashes": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "2ddcbd14cf557c7c", + "hash_cont_tokens": "eb8932890e0605db" + }, + "truncated": 0, + "non_truncated": 126, + "padded": 504, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-global_facts|5": { + "hashes": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "d7549fe9ac133643", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_biology|5": { + "hashes": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "b449ae8cd622fb96", + "hash_cont_tokens": "1ddcb86d28cde266" + }, + "truncated": 0, + "non_truncated": 310, + "padded": 1240, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hashes": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "a447bd1574b5e26c", + "hash_cont_tokens": "176c8dcff38c5f8f" + }, + "truncated": 0, + "non_truncated": 203, + "padded": 812, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hashes": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "75cd52f4257cb76f", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 84, + "non_truncated": 16, + "padded": 312, + "non_padded": 88, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hashes": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "ebe7fb04d1592fdd", + "hash_cont_tokens": "674fc454bdc5ac93" + }, + "truncated": 660, + "non_truncated": -495, + "padded": 0, + "non_padded": 660, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_geography|5": { + "hashes": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "b4f9efd054b0149d", + "hash_cont_tokens": "03a5012b916274ea" + }, + "truncated": 0, + "non_truncated": 198, + "padded": 792, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hashes": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "6e010d01707b5a01", + "hash_cont_tokens": "873d2aab226ba1d8" + }, + "truncated": 0, + "non_truncated": 193, + "padded": 772, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hashes": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "fc1f6e824ba386d7", + "hash_cont_tokens": "c583432ad27fcfe0" + }, + "truncated": 0, + "non_truncated": 390, + "padded": 1560, + 
"non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hashes": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "3a485a40c8432ece", + "hash_cont_tokens": "d7907b61bcb8c123" + }, + "truncated": 0, + "non_truncated": 270, + "padded": 1080, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hashes": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "a7dd9ca4bbda3752", + "hash_cont_tokens": "f47f041de50333b9" + }, + "truncated": 0, + "non_truncated": 238, + "padded": 952, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_physics|5": { + "hashes": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "d7ea631399a73865", + "hash_cont_tokens": "0d56317b3e5eedb5" + }, + "truncated": 0, + "non_truncated": 151, + "padded": 604, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hashes": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "d12816cf88146011", + "hash_cont_tokens": "09ba1243e7390c0f" + }, + "truncated": 0, + "non_truncated": 545, + "padded": 2180, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hashes": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "8c7914cfa8e96791", + "hash_cont_tokens": "9cc29889c3d3f77d" + }, + "truncated": 8, + "non_truncated": 208, + "padded": 852, + "non_padded": 12, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hashes": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "9bee5220ce5dc150", + "hash_cont_tokens": "cdd0b3dc06d933e5" + }, + "truncated": 816, + "non_truncated": -612, + "padded": 0, + "non_padded": 816, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hashes": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "86d46d622be1f624", + "hash_cont_tokens": "e02816433ff28daf" + }, + "truncated": 948, + "non_truncated": -711, + "padded": 0, + "non_padded": 948, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_aging|5": { + "hashes": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "84157fee0b6d0f3c", + "hash_cont_tokens": "142a4a8a1138a214" + }, + "truncated": 0, + "non_truncated": 223, + "padded": 892, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_sexuality|5": { + "hashes": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "ade303e1ae3c016f", + "hash_cont_tokens": "bc54813e809b796d" + }, + "truncated": 0, + "non_truncated": 131, + "padded": 524, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + 
"harness|hendrycksTest-international_law|5": { + "hashes": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "e5482e1c23c23d35", + "hash_cont_tokens": "8ea8c5ff76a15bca" + }, + "truncated": 0, + "non_truncated": 121, + "padded": 484, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-jurisprudence|5": { + "hashes": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "4415eeb9bad0507b", + "hash_cont_tokens": "e3a8cd951b6e3469" + }, + "truncated": 0, + "non_truncated": 108, + "padded": 432, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hashes": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "e6b5271422ecbaa8", + "hash_cont_tokens": "3e9e0bdc248fd88a" + }, + "truncated": 0, + "non_truncated": 163, + "padded": 644, + "non_padded": 8, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-machine_learning|5": { + "hashes": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "4280fed4470c2ae4", + "hash_cont_tokens": "55b12fb138c6a064" + }, + "truncated": 0, + "non_truncated": 112, + "padded": 448, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-management|5": { + "hashes": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "155da0e62b39e804", + "hash_cont_tokens": "a01d6d39a83c4597" + }, + "truncated": 0, + "non_truncated": 103, + "padded": 412, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-marketing|5": { + "hashes": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "38466c242259e6d3", + "hash_cont_tokens": "6aeaed4d823c98aa" + }, + "truncated": 0, + "non_truncated": 234, + "padded": 932, + "non_padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-medical_genetics|5": { + "hashes": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "0dd129e92538a7f6", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-miscellaneous|5": { + "hashes": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "d108a883fc3e022f", + "hash_cont_tokens": "9b0ab02a64603081" + }, + "truncated": 0, + "non_truncated": 783, + "padded": 3132, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_disputes|5": { + "hashes": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "0e7b7df82884a2d5", + "hash_cont_tokens": "3b8bbe9108e55ce9" + }, + "truncated": 0, + "non_truncated": 346, + "padded": 1364, + "non_padded": 20, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hashes": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": 
"7c220f5613cd8426", + "hash_cont_tokens": "3e9bfc0362e97330" + }, + "truncated": 0, + "non_truncated": 895, + "padded": 3580, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-nutrition|5": { + "hashes": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "35de1609a9a763a9", + "hash_cont_tokens": "23b2dc6ee2da4cfc" + }, + "truncated": 0, + "non_truncated": 306, + "padded": 1224, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-philosophy|5": { + "hashes": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "a1dcfa9c80490d06", + "hash_cont_tokens": "9f6ff69d23a48783" + }, + "truncated": 0, + "non_truncated": 311, + "padded": 1244, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-prehistory|5": { + "hashes": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "a091cf645d2415e0", + "hash_cont_tokens": "d6458d743d875837" + }, + "truncated": 0, + "non_truncated": 324, + "padded": 1296, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_accounting|5": { + "hashes": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "a18ebb82a5ae1dfa", + "hash_cont_tokens": "922a195f53a35662" + }, + "truncated": 0, + "non_truncated": 282, + "padded": 1128, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_law|5": { + "hashes": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "d242b8849eed0354", + "hash_cont_tokens": "2e590029ef41fbcd" + }, + "truncated": 6136, + "non_truncated": -4602, + "padded": 0, + "non_padded": 6136, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_medicine|5": { + "hashes": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "05edfda4110bfddf", + "hash_cont_tokens": "7cfee54dbddd5a98" + }, + "truncated": 1088, + "non_truncated": -816, + "padded": 0, + "non_padded": 1088, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_psychology|5": { + "hashes": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "133739b8f766d88e", + "hash_cont_tokens": "a86677b2a45c20e1" + }, + "truncated": 0, + "non_truncated": 612, + "padded": 2448, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-public_relations|5": { + "hashes": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "29a08e9bfbe9b2f0", + "hash_cont_tokens": "0d756ccaae031757" + }, + "truncated": 0, + "non_truncated": 110, + "padded": 440, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-security_studies|5": { + "hashes": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "f60dcd735e1ffd14", + "hash_cont_tokens": "b2229bc2cfbf594b" + }, + "truncated": 980, + "non_truncated": -735, + "padded": 0, + "non_padded": 
980, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-sociology|5": { + "hashes": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "1de5c52d2b2831d7", + "hash_cont_tokens": "c3a3bdfd177eed5b" + }, + "truncated": 0, + "non_truncated": 201, + "padded": 800, + "non_padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hashes": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "add924961f7f4146", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-virology|5": { + "hashes": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "e0653601c466b1bc", + "hash_cont_tokens": "af8b3658088cb37f" + }, + "truncated": 0, + "non_truncated": 166, + "padded": 664, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-world_religions|5": { + "hashes": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "ac600d612445156d", + "hash_cont_tokens": "060118bef6de4e0a" + }, + "truncated": 0, + "non_truncated": 171, + "padded": 684, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|truthfulqa:mc|0": { + "hashes": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "a03ce28b7fd06aa7", + "hash_cont_tokens": "f5da56a132aab151" + }, + "truncated": 0, + "non_truncated": 817, + "padded": 9996, + "non_padded": 0, + "effective_few_shots": 0.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "72067255e368e24e", + "hash_cont_tokens": "f08975ad6f2d5864" + }, + "truncated": 0, + "non_truncated": 1267, + "padded": 2534, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|drop|3": { + "hashes": { + "hash_examples": "1d27416e8324e9a3", + "hash_full_prompts": "a5513ff9a741b385", + "hash_input_tokens": "d1f1add3774138f4", + "hash_cont_tokens": "a77b941f1c2362f3" + }, + "truncated": 9531, + "non_truncated": 5, + "padded": 0, + "non_padded": 9536, + "effective_few_shots": 3.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "176bf7f0eaea05e8", + "hash_cont_tokens": "1229233ad792fea4" + }, + "truncated": 1304, + "non_truncated": 15, + "padded": 0, + "non_padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "4eb459f19fc0f29d", + "hash_full_prompts": "21653ed56f202b4e", + "hash_input_tokens": "a806e3dbebcd6ef4", + "hash_cont_tokens": "dcf207ee563350b7" + }, + "truncated": 38604, + "non_truncated": -409, + "padded": 85526, + "non_padded": 38882, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/amazingvince/zephyr-smol_llama-100m-dpo-full/results_2023-12-03T16-25-51.768387.json 
b/eval-results/amazingvince/zephyr-smol_llama-100m-dpo-full/results_2023-12-03T16-25-51.768387.json new file mode 100644 index 0000000000000000000000000000000000000000..7427c47ce1204d4dd9336fc9b6141a63aa2afeb5 --- /dev/null +++ b/eval-results/amazingvince/zephyr-smol_llama-100m-dpo-full/results_2023-12-03T16-25-51.768387.json @@ -0,0 +1,63 @@ +{ + "config_general": { + "lighteval_sha": "b35d4d84573be82d91c07ea46260f262f72cf69d", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "", + "start_time": 74973.48510467, + "end_time": 75624.02632603, + "total_evaluation_time_secondes": "650.541221359992", + "model_name": "amazingvince/zephyr-smol_llama-100m-dpo-full", + "model_sha": "be3400c89d66ed66f0aa96f1b8131604c118b67b", + "model_dtype": "torch.float16", + "model_size": "193.89 MB" + }, + "results": { + "harness|gsm8k|5": { + "acc": 0.006823351023502654, + "acc_stderr": 0.002267537102254512 + }, + "all": { + "acc": 0.006823351023502654, + "acc_stderr": 0.002267537102254512 + } + }, + "versions": { + "all": 0, + "harness|gsm8k|5": 0 + }, + "config_tasks": { + "harness|gsm8k": "LM Harness task" + }, + "summary_tasks": { + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "176bf7f0eaea05e8", + "hash_cont_tokens": "1229233ad792fea4" + }, + "truncated": 1304, + "non_truncated": 15, + "padded": 0, + "non_padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "18b756b7813d1bdf", + "hash_full_prompts": "deb3b1dff10b95aa", + "hash_input_tokens": "596c675757820865", + "hash_cont_tokens": "5789c07185cda508" + }, + "truncated": 1304, + "non_truncated": 15, + "padded": 0, + "non_padded": 1319, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/amazon/LightGPT/results_2023-08-23T11-09-14.917369.json b/eval-results/amazon/LightGPT/results_2023-08-23T11-09-14.917369.json new file mode 100644 index 0000000000000000000000000000000000000000..d7b701b1f6339da630e49989d0c12a89012912dc --- /dev/null +++ b/eval-results/amazon/LightGPT/results_2023-08-23T11-09-14.917369.json @@ -0,0 +1,1365 @@ +{ + "results": { + "harness|arc:challenge|25": { + "acc": 0.3720136518771331, + "acc_stderr": 0.014124597881844461, + "acc_norm": 0.3993174061433447, + "acc_norm_stderr": 0.014312094557946707 + }, + "harness|hellaswag|10": { + "acc": 0.47191794463254333, + "acc_stderr": 0.004981905293878142, + "acc_norm": 0.6382194781915953, + "acc_norm_stderr": 0.004795337009118189 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.29, + "acc_stderr": 0.04560480215720684, + "acc_norm": 0.29, + "acc_norm_stderr": 0.04560480215720684 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.2962962962962963, + "acc_stderr": 0.03944624162501116, + "acc_norm": 0.2962962962962963, + "acc_norm_stderr": 0.03944624162501116 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.29605263157894735, + "acc_stderr": 0.03715062154998905, + "acc_norm": 0.29605263157894735, + "acc_norm_stderr": 0.03715062154998905 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.2, + "acc_stderr": 0.04020151261036846, + "acc_norm": 0.2, + "acc_norm_stderr": 0.04020151261036846 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.30566037735849055, + "acc_stderr": 0.028353298073322666, + "acc_norm": 0.30566037735849055, + "acc_norm_stderr": 
0.028353298073322666 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.2638888888888889, + "acc_stderr": 0.03685651095897532, + "acc_norm": 0.2638888888888889, + "acc_norm_stderr": 0.03685651095897532 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.2, + "acc_stderr": 0.04020151261036846, + "acc_norm": 0.2, + "acc_norm_stderr": 0.04020151261036846 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.29, + "acc_stderr": 0.04560480215720683, + "acc_norm": 0.29, + "acc_norm_stderr": 0.04560480215720683 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.33, + "acc_stderr": 0.047258156262526045, + "acc_norm": 0.33, + "acc_norm_stderr": 0.047258156262526045 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.2947976878612717, + "acc_stderr": 0.0347659960751648, + "acc_norm": 0.2947976878612717, + "acc_norm_stderr": 0.0347659960751648 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.2647058823529412, + "acc_stderr": 0.043898699568087785, + "acc_norm": 0.2647058823529412, + "acc_norm_stderr": 0.043898699568087785 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.36, + "acc_stderr": 0.048241815132442176, + "acc_norm": 0.36, + "acc_norm_stderr": 0.048241815132442176 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.33617021276595743, + "acc_stderr": 0.030881618520676942, + "acc_norm": 0.33617021276595743, + "acc_norm_stderr": 0.030881618520676942 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.21929824561403508, + "acc_stderr": 0.03892431106518754, + "acc_norm": 0.21929824561403508, + "acc_norm_stderr": 0.03892431106518754 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.2896551724137931, + "acc_stderr": 0.03780019230438014, + "acc_norm": 0.2896551724137931, + "acc_norm_stderr": 0.03780019230438014 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.24603174603174602, + "acc_stderr": 0.022182037202948365, + "acc_norm": 0.24603174603174602, + "acc_norm_stderr": 0.022182037202948365 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.23809523809523808, + "acc_stderr": 0.03809523809523811, + "acc_norm": 0.23809523809523808, + "acc_norm_stderr": 0.03809523809523811 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.28, + "acc_stderr": 0.045126085985421276, + "acc_norm": 0.28, + "acc_norm_stderr": 0.045126085985421276 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.25806451612903225, + "acc_stderr": 0.024892469172462833, + "acc_norm": 0.25806451612903225, + "acc_norm_stderr": 0.024892469172462833 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.22660098522167488, + "acc_stderr": 0.02945486383529297, + "acc_norm": 0.22660098522167488, + "acc_norm_stderr": 0.02945486383529297 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.32, + "acc_stderr": 0.046882617226215034, + "acc_norm": 0.32, + "acc_norm_stderr": 0.046882617226215034 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.24242424242424243, + "acc_stderr": 0.03346409881055953, + "acc_norm": 0.24242424242424243, + "acc_norm_stderr": 0.03346409881055953 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.2878787878787879, + "acc_stderr": 0.03225883512300993, + "acc_norm": 0.2878787878787879, + "acc_norm_stderr": 0.03225883512300993 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.38341968911917096, + "acc_stderr": 
0.03508984236295342, + "acc_norm": 0.38341968911917096, + "acc_norm_stderr": 0.03508984236295342 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.31025641025641026, + "acc_stderr": 0.02345467488940429, + "acc_norm": 0.31025641025641026, + "acc_norm_stderr": 0.02345467488940429 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.24074074074074073, + "acc_stderr": 0.026067159222275794, + "acc_norm": 0.24074074074074073, + "acc_norm_stderr": 0.026067159222275794 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.3025210084033613, + "acc_stderr": 0.02983796238829194, + "acc_norm": 0.3025210084033613, + "acc_norm_stderr": 0.02983796238829194 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.23841059602649006, + "acc_stderr": 0.0347918557259966, + "acc_norm": 0.23841059602649006, + "acc_norm_stderr": 0.0347918557259966 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.25504587155963304, + "acc_stderr": 0.018688500856535846, + "acc_norm": 0.25504587155963304, + "acc_norm_stderr": 0.018688500856535846 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.2916666666666667, + "acc_stderr": 0.03099866630456053, + "acc_norm": 0.2916666666666667, + "acc_norm_stderr": 0.03099866630456053 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.28431372549019607, + "acc_stderr": 0.03166009679399812, + "acc_norm": 0.28431372549019607, + "acc_norm_stderr": 0.03166009679399812 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.28270042194092826, + "acc_stderr": 0.029312814153955924, + "acc_norm": 0.28270042194092826, + "acc_norm_stderr": 0.029312814153955924 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.38565022421524664, + "acc_stderr": 0.03266842214289202, + "acc_norm": 0.38565022421524664, + "acc_norm_stderr": 0.03266842214289202 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.19083969465648856, + "acc_stderr": 0.03446513350752597, + "acc_norm": 0.19083969465648856, + "acc_norm_stderr": 0.03446513350752597 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.34710743801652894, + "acc_stderr": 0.04345724570292534, + "acc_norm": 0.34710743801652894, + "acc_norm_stderr": 0.04345724570292534 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.2962962962962963, + "acc_stderr": 0.04414343666854933, + "acc_norm": 0.2962962962962963, + "acc_norm_stderr": 0.04414343666854933 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.25153374233128833, + "acc_stderr": 0.034089978868575295, + "acc_norm": 0.25153374233128833, + "acc_norm_stderr": 0.034089978868575295 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.26785714285714285, + "acc_stderr": 0.04203277291467762, + "acc_norm": 0.26785714285714285, + "acc_norm_stderr": 0.04203277291467762 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.22330097087378642, + "acc_stderr": 0.04123553189891431, + "acc_norm": 0.22330097087378642, + "acc_norm_stderr": 0.04123553189891431 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.24358974358974358, + "acc_stderr": 0.028120966503914404, + "acc_norm": 0.24358974358974358, + "acc_norm_stderr": 0.028120966503914404 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.3, + "acc_stderr": 0.046056618647183814, + "acc_norm": 0.3, + "acc_norm_stderr": 0.046056618647183814 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.28735632183908044, + "acc_stderr": 
0.016182410730682696, + "acc_norm": 0.28735632183908044, + "acc_norm_stderr": 0.016182410730682696 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.24277456647398843, + "acc_stderr": 0.0230836585869842, + "acc_norm": 0.24277456647398843, + "acc_norm_stderr": 0.0230836585869842 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.2446927374301676, + "acc_stderr": 0.014378169884098398, + "acc_norm": 0.2446927374301676, + "acc_norm_stderr": 0.014378169884098398 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.29411764705882354, + "acc_stderr": 0.026090162504279046, + "acc_norm": 0.29411764705882354, + "acc_norm_stderr": 0.026090162504279046 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.31189710610932475, + "acc_stderr": 0.02631185807185416, + "acc_norm": 0.31189710610932475, + "acc_norm_stderr": 0.02631185807185416 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.3487654320987654, + "acc_stderr": 0.026517597724465013, + "acc_norm": 0.3487654320987654, + "acc_norm_stderr": 0.026517597724465013 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.28368794326241137, + "acc_stderr": 0.026891709428343957, + "acc_norm": 0.28368794326241137, + "acc_norm_stderr": 0.026891709428343957 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.2953063885267275, + "acc_stderr": 0.011651061936208818, + "acc_norm": 0.2953063885267275, + "acc_norm_stderr": 0.011651061936208818 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.29411764705882354, + "acc_stderr": 0.0276784686421447, + "acc_norm": 0.29411764705882354, + "acc_norm_stderr": 0.0276784686421447 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.27941176470588236, + "acc_stderr": 0.018152871051538812, + "acc_norm": 0.27941176470588236, + "acc_norm_stderr": 0.018152871051538812 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.35454545454545455, + "acc_stderr": 0.045820048415054174, + "acc_norm": 0.35454545454545455, + "acc_norm_stderr": 0.045820048415054174 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.4122448979591837, + "acc_stderr": 0.03151236044674281, + "acc_norm": 0.4122448979591837, + "acc_norm_stderr": 0.03151236044674281 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.29850746268656714, + "acc_stderr": 0.03235743789355042, + "acc_norm": 0.29850746268656714, + "acc_norm_stderr": 0.03235743789355042 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.19, + "acc_stderr": 0.039427724440366234, + "acc_norm": 0.19, + "acc_norm_stderr": 0.039427724440366234 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.3192771084337349, + "acc_stderr": 0.03629335329947859, + "acc_norm": 0.3192771084337349, + "acc_norm_stderr": 0.03629335329947859 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.32748538011695905, + "acc_stderr": 0.035993357714560276, + "acc_norm": 0.32748538011695905, + "acc_norm_stderr": 0.035993357714560276 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.21664626682986537, + "mc1_stderr": 0.014421468452506983, + "mc2": 0.36692080564199897, + "mc2_stderr": 0.013804406871160373 + }, + "all": { + "acc": 0.28913543641978023, + "acc_stderr": 0.03273164016307234, + "acc_norm": 0.29241688197614063, + "acc_norm_stderr": 0.03273165589851882, + "mc1": 0.21664626682986537, + "mc1_stderr": 0.014421468452506983, + "mc2": 0.36692080564199897, + "mc2_stderr": 0.013804406871160373 + } + }, + "versions": { + "harness|arc:challenge|25": 0, + "harness|hellaswag|10": 0, + 
"harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "all": 0 + }, + "config_general": { + "model_name": "amazon/LightGPT", + "model_sha": "1f6ffd8f162030396a3bc1ca2e3504896dbe6434", + "model_dtype": "torch.float16", + "lighteval_sha": "2d7f9b0219a3536f201c55d7e8126251127b731c", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null + }, + "config_tasks": { + "harness|arc:challenge": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + 
"harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task" + }, + "summary_tasks": 
{ + "harness|arc:challenge|25": { + "hashes": { + "hash_examples": "17b0cae357c0259e", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "1b78325b154497a6", + "hash_cont_tokens": "d57e59a4130853e0" + }, + "truncated": 0, + "non-truncated": 4687, + "padded": 4685, + "non-padded": 2, + "effective_few_shots": 25.0, + "num_truncated_few_shots": 0 + }, + "harness|hellaswag|10": { + "hashes": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "97de5fb5652ec7fa", + "hash_cont_tokens": "d8973ec3a510d4bc" + }, + "truncated": 0, + "non-truncated": 40168, + "padded": 40045, + "non-padded": 123, + "effective_few_shots": 10.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hashes": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "38f6980885e34dfd", + "hash_cont_tokens": "844bd0bf669e8136" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-anatomy|5": { + "hashes": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "3ed9431cd09b2a53", + "hash_cont_tokens": "aa3ffb1a6e4356f5" + }, + "truncated": 0, + "non-truncated": 540, + "padded": 540, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-astronomy|5": { + "hashes": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "a79fd75ecff4dacc", + "hash_cont_tokens": "4a75531cbfd07f95" + }, + "truncated": 0, + "non-truncated": 608, + "padded": 608, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-business_ethics|5": { + "hashes": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "178d5666661bf5e1", + "hash_cont_tokens": "accb7cef363cf18e" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hashes": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "c926698f7ff06973", + "hash_cont_tokens": "cd61f7de0830a75a" + }, + "truncated": 0, + "non-truncated": 1060, + "padded": 1060, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_biology|5": { + "hashes": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "242f772c5e78312a", + "hash_cont_tokens": "16b3626c8a5e3797" + }, + "truncated": 0, + "non-truncated": 576, + "padded": 568, + "non-padded": 8, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_chemistry|5": { + "hashes": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "8502d8627d2d7aad", + "hash_cont_tokens": "844bd0bf669e8136" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_computer_science|5": { + "hashes": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "8bf46ce3a98e6e3f", + 
"hash_cont_tokens": "14362f67beb028ba" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_mathematics|5": { + "hashes": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "ff09ef7f164943cd", + "hash_cont_tokens": "69d91a3fd2e4511e" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_medicine|5": { + "hashes": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "af38d1bbc0517ac5", + "hash_cont_tokens": "62bb469d2a319d91" + }, + "truncated": 0, + "non-truncated": 692, + "padded": 680, + "non-padded": 12, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_physics|5": { + "hashes": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "c4240f372187f487", + "hash_cont_tokens": "bf103c9a1f61ec12" + }, + "truncated": 0, + "non-truncated": 408, + "padded": 404, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-computer_security|5": { + "hashes": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "70a866a1c6ae11ae", + "hash_cont_tokens": "844bd0bf669e8136" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hashes": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "29b68a5b3f3afa5f", + "hash_cont_tokens": "ff5ca3d84bb47a0b" + }, + "truncated": 0, + "non-truncated": 940, + "padded": 940, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-econometrics|5": { + "hashes": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "a4a0fc579875cdf9", + "hash_cont_tokens": "4468714c283b10f9" + }, + "truncated": 0, + "non-truncated": 456, + "padded": 456, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hashes": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "e1c0ec634eb17ebd", + "hash_cont_tokens": "35bf6c0c1a7ee403" + }, + "truncated": 0, + "non-truncated": 580, + "padded": 580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hashes": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "542453ad0f99dacf", + "hash_cont_tokens": "8d66c298f1a52c46" + }, + "truncated": 0, + "non-truncated": 1512, + "padded": 1488, + "non-padded": 24, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-formal_logic|5": { + "hashes": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "dacff0458f665ef2", + "hash_cont_tokens": "f23c2d0723d2f830" + }, + "truncated": 0, + "non-truncated": 504, + "padded": 504, + "non-padded": 0, + "effective_few_shots": 
5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-global_facts|5": { + "hashes": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "61dec75d557c2e93", + "hash_cont_tokens": "844bd0bf669e8136" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_biology|5": { + "hashes": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "d0afdf91820cacc8", + "hash_cont_tokens": "9cf4df701a8e97ca" + }, + "truncated": 0, + "non-truncated": 1240, + "padded": 1240, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hashes": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "75cd47b5490da17b", + "hash_cont_tokens": "c3deabee1deab3a3" + }, + "truncated": 0, + "non-truncated": 812, + "padded": 796, + "non-padded": 16, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hashes": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "6c6256000dbf914a", + "hash_cont_tokens": "120b77ffae8b0591" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hashes": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "3e24478a8854bd77", + "hash_cont_tokens": "c4f2565ca36881d5" + }, + "truncated": 660, + "non-truncated": 0, + "padded": 0, + "non-padded": 660, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_geography|5": { + "hashes": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "a4866b51f8a7a60e", + "hash_cont_tokens": "780e569058de22be" + }, + "truncated": 0, + "non-truncated": 792, + "padded": 792, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hashes": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "90f755f89d9fdf5e", + "hash_cont_tokens": "1ba11ec0fba0a4bb" + }, + "truncated": 0, + "non-truncated": 772, + "padded": 772, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hashes": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "fb590ff6d9d11883", + "hash_cont_tokens": "8f5c8baf02161f10" + }, + "truncated": 0, + "non-truncated": 1560, + "padded": 1560, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hashes": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "551dbc75535ad2b8", + "hash_cont_tokens": "822c5217a581c95f" + }, + "truncated": 0, + "non-truncated": 1080, + "padded": 1080, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + 
"harness|hendrycksTest-high_school_microeconomics|5": { + "hashes": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "d86fdf5706ec717c", + "hash_cont_tokens": "985403b262df21a4" + }, + "truncated": 0, + "non-truncated": 952, + "padded": 940, + "non-padded": 12, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_physics|5": { + "hashes": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "a81bca26abd92c41", + "hash_cont_tokens": "a745b56725d20832" + }, + "truncated": 0, + "non-truncated": 604, + "padded": 604, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hashes": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "9c10077b5cda495b", + "hash_cont_tokens": "969464bbd6828346" + }, + "truncated": 0, + "non-truncated": 2180, + "padded": 2180, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hashes": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "092923836e135996", + "hash_cont_tokens": "f00cfc03022d559a" + }, + "truncated": 0, + "non-truncated": 864, + "padded": 864, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hashes": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "4ab213491f557f31", + "hash_cont_tokens": "eab825cf8fbdd085" + }, + "truncated": 816, + "non-truncated": 0, + "padded": 0, + "non-padded": 816, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hashes": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "2a04fb615e6717ea", + "hash_cont_tokens": "f6dd7cf291429cd9" + }, + "truncated": 0, + "non-truncated": 948, + "padded": 948, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_aging|5": { + "hashes": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "39da19ee58ce07e6", + "hash_cont_tokens": "38eafdb22e9fca11" + }, + "truncated": 0, + "non-truncated": 892, + "padded": 892, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_sexuality|5": { + "hashes": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "f7e0441ab1c223e0", + "hash_cont_tokens": "11de075f88fc7cd2" + }, + "truncated": 0, + "non-truncated": 524, + "padded": 524, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-international_law|5": { + "hashes": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "119859c5b8103d0b", + "hash_cont_tokens": "ad79993e5e453770" + }, + "truncated": 0, + "non-truncated": 484, + "padded": 484, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-jurisprudence|5": { + "hashes": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": 
"0f89ee3fe03d6a21", + "hash_input_tokens": "6ec4910e741606cb", + "hash_cont_tokens": "5c77c6f472688075" + }, + "truncated": 0, + "non-truncated": 432, + "padded": 432, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hashes": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "96d8b2554f777e3a", + "hash_cont_tokens": "25a46284b3589e0d" + }, + "truncated": 0, + "non-truncated": 652, + "padded": 636, + "non-padded": 16, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-machine_learning|5": { + "hashes": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "249811a7d891a411", + "hash_cont_tokens": "5904fef477924132" + }, + "truncated": 0, + "non-truncated": 448, + "padded": 448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-management|5": { + "hashes": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "e54df495ffeb4f92", + "hash_cont_tokens": "d37808f586a9e9b5" + }, + "truncated": 0, + "non-truncated": 412, + "padded": 412, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-marketing|5": { + "hashes": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "e9110fe64f420eb5", + "hash_cont_tokens": "95faf210efa02f90" + }, + "truncated": 0, + "non-truncated": 936, + "padded": 936, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-medical_genetics|5": { + "hashes": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "743df5701590c1c5", + "hash_cont_tokens": "844bd0bf669e8136" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-miscellaneous|5": { + "hashes": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "4a20a40ea36bad2d", + "hash_cont_tokens": "ef1ae838a09a7521" + }, + "truncated": 0, + "non-truncated": 3132, + "padded": 3132, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_disputes|5": { + "hashes": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "10886977e5516586", + "hash_cont_tokens": "201895f1be790f02" + }, + "truncated": 0, + "non-truncated": 1384, + "padded": 1372, + "non-padded": 12, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hashes": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "66f56ab7c3b9d662", + "hash_cont_tokens": "38fadc6201499c0e" + }, + "truncated": 0, + "non-truncated": 3580, + "padded": 3580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-nutrition|5": { + "hashes": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "c05c54560499ea35", + "hash_cont_tokens": "dcdd301556b5df9e" + }, + "truncated": 0, + "non-truncated": 1224, + "padded": 1224, + 
"non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-philosophy|5": { + "hashes": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "9639c3d92ff98a28", + "hash_cont_tokens": "dddff9925c9b675a" + }, + "truncated": 0, + "non-truncated": 1244, + "padded": 1244, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-prehistory|5": { + "hashes": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "91e98834c3a8d8d9", + "hash_cont_tokens": "67c525ef797587ce" + }, + "truncated": 0, + "non-truncated": 1296, + "padded": 1296, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_accounting|5": { + "hashes": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "569fa47691c73088", + "hash_cont_tokens": "0d9fbe99f871c5c5" + }, + "truncated": 0, + "non-truncated": 1128, + "padded": 1124, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_law|5": { + "hashes": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "999e8c7cf55b590c", + "hash_cont_tokens": "a2de48df0afbaff7" + }, + "truncated": 16, + "non-truncated": 6120, + "padded": 6120, + "non-padded": 16, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_medicine|5": { + "hashes": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "cb68733b835e69f0", + "hash_cont_tokens": "01ddc79c7e1f2f6d" + }, + "truncated": 0, + "non-truncated": 1088, + "padded": 1088, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_psychology|5": { + "hashes": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "3aa766c029099569", + "hash_cont_tokens": "fa0fc10c4bdd757c" + }, + "truncated": 0, + "non-truncated": 2448, + "padded": 2448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-public_relations|5": { + "hashes": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "87b924f88832986f", + "hash_cont_tokens": "cf3600a50782c6c5" + }, + "truncated": 0, + "non-truncated": 440, + "padded": 440, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-security_studies|5": { + "hashes": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "c2b75c24a925a416", + "hash_cont_tokens": "6483ae9688e0a0d6" + }, + "truncated": 0, + "non-truncated": 980, + "padded": 980, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-sociology|5": { + "hashes": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "fb555df6139eb2c8", + "hash_cont_tokens": "9ec52ea7962c54f5" + }, + "truncated": 0, + "non-truncated": 804, + "padded": 800, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hashes": { + 
"hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "56cf1eebb25eccb1", + "hash_cont_tokens": "844bd0bf669e8136" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-virology|5": { + "hashes": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "c6affac16ec860be", + "hash_cont_tokens": "30d4fa4828c5468f" + }, + "truncated": 0, + "non-truncated": 664, + "padded": 664, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-world_religions|5": { + "hashes": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "d2c5da5a69a6312e", + "hash_cont_tokens": "bc42db2c568e27d6" + }, + "truncated": 0, + "non-truncated": 684, + "padded": 684, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|truthfulqa:mc|0": { + "hashes": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "21ee2f46c9c3649e", + "hash_cont_tokens": "c8f2395107c4b82b" + }, + "truncated": 0, + "non-truncated": 9996, + "padded": 9996, + "non-padded": 0, + "effective_few_shots": 0.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "d84d18e9a963753d", + "hash_full_prompts": "12b540783521a8e6", + "hash_input_tokens": "0893dfcb83435e7d", + "hash_cont_tokens": "f1f2fb65023f2668" + }, + "total_evaluation_time_secondes": "4351.106058835983", + "truncated": 1492, + "non-truncated": 109527, + "padded": 109290, + "non-padded": 1729, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/amazon/LightGPT/results_2023-09-18T05-09-39.039109.json b/eval-results/amazon/LightGPT/results_2023-09-18T05-09-39.039109.json new file mode 100644 index 0000000000000000000000000000000000000000..a515122fb315439d6abc5341e1c110b5ba18749a --- /dev/null +++ b/eval-results/amazon/LightGPT/results_2023-09-18T05-09-39.039109.json @@ -0,0 +1,107 @@ +{ + "config_general": { + "model_name": "amazon/LightGPT", + "model_sha": "1f6ffd8f162030396a3bc1ca2e3504896dbe6434", + "model_size": "11.28 GB", + "model_dtype": "torch.float16", + "lighteval_sha": "0f318ecf002208468154899217b3ba7c6ae09374", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|drop|3": { + "em": 0.053796140939597316, + "em_stderr": 0.0023105084978365595, + "f1": 0.11191694630872479, + "f1_stderr": 0.0026210067753728973 + }, + "harness|gsm8k|5": { + "acc": 0.03866565579984837, + "acc_stderr": 0.005310583162098055 + }, + "harness|winogrande|5": { + "acc": 0.6448303078137332, + "acc_stderr": 0.013450047479569257 + }, + "all": { + "em": 0.053796140939597316, + "em_stderr": 0.0023105084978365595, + "f1": 0.11191694630872479, + "f1_stderr": 0.0026210067753728973, + "acc": 0.3417479818067908, + "acc_stderr": 0.009380315320833657 + } + }, + "versions": { + "harness|drop|3": 1, + "harness|gsm8k|5": 0, + "harness|winogrande|5": 0, + "all": 0 + }, + "config_tasks": { + "harness|drop": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|drop|3": { + "hashes": { + "hash_examples": "1d27416e8324e9a3", + "hash_full_prompts": 
"a5513ff9a741b385", + "hash_input_tokens": "f21277d2c2d2e06c", + "hash_cont_tokens": "0ebcab152c154429" + }, + "truncated": 382, + "non-truncated": 9154, + "padded": 0, + "non-padded": 9536, + "effective_few_shots": 3.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "3ab9b4c5105492a3", + "hash_cont_tokens": "5da8d52e810e33f1" + }, + "truncated": 0, + "non-truncated": 1319, + "padded": 0, + "non-padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "84cacac1590bb0a5", + "hash_cont_tokens": "64ca3ed9b5dacc6e" + }, + "truncated": 0, + "non-truncated": 2534, + "padded": 2426, + "non-padded": 108, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "9b4d8993161e637d", + "hash_full_prompts": "08215e527b7e60a5", + "hash_input_tokens": "26cd3631535039d0", + "hash_cont_tokens": "15e51fd321f11a1c" + }, + "total_evaluation_time_secondes": "19713.976402044296", + "truncated": 382, + "non-truncated": 13007, + "padded": 2426, + "non-padded": 10963, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/amazon/MistralLite/results_2023-11-28T07-29-42.691462.json b/eval-results/amazon/MistralLite/results_2023-11-28T07-29-42.691462.json new file mode 100644 index 0000000000000000000000000000000000000000..0f80a2372c690f89a0bbadf89d48487f7c9cec16 --- /dev/null +++ b/eval-results/amazon/MistralLite/results_2023-11-28T07-29-42.691462.json @@ -0,0 +1,1435 @@ +{ + "config_general": { + "lighteval_sha": "9ffc410f6c40b8cfefe7167cb47aefe69ced61e1", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "", + "start_time": 925048.346521465, + "end_time": 934483.395381303, + "total_evaluation_time_secondes": "9435.048859838047", + "model_name": "amazon/MistralLite", + "model_sha": "23486089ab7ba741b34adc69ab7555885f8abe71", + "model_dtype": "torch.float16", + "model_size": "13.99 GB" + }, + "results": { + "harness|arc:challenge|25": { + "acc": 0.5622866894197952, + "acc_stderr": 0.014497573881108285, + "acc_norm": 0.5955631399317406, + "acc_norm_stderr": 0.014342036483436177 + }, + "harness|hellaswag|10": { + "acc": 0.627365066719777, + "acc_stderr": 0.004825179407757565, + "acc_norm": 0.8183628759211312, + "acc_norm_stderr": 0.003847572259636413 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.27, + "acc_stderr": 0.04461960433384741, + "acc_norm": 0.27, + "acc_norm_stderr": 0.04461960433384741 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.4148148148148148, + "acc_stderr": 0.042561937679014075, + "acc_norm": 0.4148148148148148, + "acc_norm_stderr": 0.042561937679014075 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.5263157894736842, + "acc_stderr": 0.04063302731486671, + "acc_norm": 0.5263157894736842, + "acc_norm_stderr": 0.04063302731486671 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.47, + "acc_stderr": 0.050161355804659205, + "acc_norm": 0.47, + "acc_norm_stderr": 0.050161355804659205 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.5660377358490566, + "acc_stderr": 0.030503292013342596, + "acc_norm": 0.5660377358490566, + "acc_norm_stderr": 0.030503292013342596 + }, + 
"harness|hendrycksTest-college_biology|5": { + "acc": 0.6319444444444444, + "acc_stderr": 0.04032999053960719, + "acc_norm": 0.6319444444444444, + "acc_norm_stderr": 0.04032999053960719 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.32, + "acc_stderr": 0.04688261722621504, + "acc_norm": 0.32, + "acc_norm_stderr": 0.04688261722621504 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.43, + "acc_stderr": 0.04975698519562428, + "acc_norm": 0.43, + "acc_norm_stderr": 0.04975698519562428 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.35, + "acc_stderr": 0.0479372485441102, + "acc_norm": 0.35, + "acc_norm_stderr": 0.0479372485441102 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.48554913294797686, + "acc_stderr": 0.03810871630454764, + "acc_norm": 0.48554913294797686, + "acc_norm_stderr": 0.03810871630454764 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.39215686274509803, + "acc_stderr": 0.04858083574266345, + "acc_norm": 0.39215686274509803, + "acc_norm_stderr": 0.04858083574266345 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.61, + "acc_stderr": 0.04902071300001975, + "acc_norm": 0.61, + "acc_norm_stderr": 0.04902071300001975 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.43829787234042555, + "acc_stderr": 0.03243618636108101, + "acc_norm": 0.43829787234042555, + "acc_norm_stderr": 0.03243618636108101 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.43859649122807015, + "acc_stderr": 0.04668000738510455, + "acc_norm": 0.43859649122807015, + "acc_norm_stderr": 0.04668000738510455 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.4482758620689655, + "acc_stderr": 0.041443118108781526, + "acc_norm": 0.4482758620689655, + "acc_norm_stderr": 0.041443118108781526 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.36507936507936506, + "acc_stderr": 0.02479606060269995, + "acc_norm": 0.36507936507936506, + "acc_norm_stderr": 0.02479606060269995 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.3253968253968254, + "acc_stderr": 0.041905964388711366, + "acc_norm": 0.3253968253968254, + "acc_norm_stderr": 0.041905964388711366 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.23, + "acc_stderr": 0.04229525846816505, + "acc_norm": 0.23, + "acc_norm_stderr": 0.04229525846816505 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.47096774193548385, + "acc_stderr": 0.028396016402761005, + "acc_norm": 0.47096774193548385, + "acc_norm_stderr": 0.028396016402761005 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.43842364532019706, + "acc_stderr": 0.03491207857486518, + "acc_norm": 0.43842364532019706, + "acc_norm_stderr": 0.03491207857486518 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.51, + "acc_stderr": 0.05024183937956913, + "acc_norm": 0.51, + "acc_norm_stderr": 0.05024183937956913 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.6848484848484848, + "acc_stderr": 0.0362773057502241, + "acc_norm": 0.6848484848484848, + "acc_norm_stderr": 0.0362773057502241 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.5909090909090909, + "acc_stderr": 0.03502975799413007, + "acc_norm": 0.5909090909090909, + "acc_norm_stderr": 0.03502975799413007 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.7979274611398963, + "acc_stderr": 0.02897908979429673, + "acc_norm": 0.7979274611398963, 
+ "acc_norm_stderr": 0.02897908979429673 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.5435897435897435, + "acc_stderr": 0.025254485424799602, + "acc_norm": 0.5435897435897435, + "acc_norm_stderr": 0.025254485424799602 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.24814814814814815, + "acc_stderr": 0.0263357394040558, + "acc_norm": 0.24814814814814815, + "acc_norm_stderr": 0.0263357394040558 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.5588235294117647, + "acc_stderr": 0.0322529423239964, + "acc_norm": 0.5588235294117647, + "acc_norm_stderr": 0.0322529423239964 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.304635761589404, + "acc_stderr": 0.037579499229433426, + "acc_norm": 0.304635761589404, + "acc_norm_stderr": 0.037579499229433426 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.5688073394495413, + "acc_stderr": 0.02123336503031956, + "acc_norm": 0.5688073394495413, + "acc_norm_stderr": 0.02123336503031956 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.39351851851851855, + "acc_stderr": 0.03331747876370312, + "acc_norm": 0.39351851851851855, + "acc_norm_stderr": 0.03331747876370312 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.6715686274509803, + "acc_stderr": 0.03296245110172229, + "acc_norm": 0.6715686274509803, + "acc_norm_stderr": 0.03296245110172229 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.7172995780590717, + "acc_stderr": 0.029312814153955934, + "acc_norm": 0.7172995780590717, + "acc_norm_stderr": 0.029312814153955934 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.600896860986547, + "acc_stderr": 0.03286745312567961, + "acc_norm": 0.600896860986547, + "acc_norm_stderr": 0.03286745312567961 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.5343511450381679, + "acc_stderr": 0.043749285605997376, + "acc_norm": 0.5343511450381679, + "acc_norm_stderr": 0.043749285605997376 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.71900826446281, + "acc_stderr": 0.04103203830514511, + "acc_norm": 0.71900826446281, + "acc_norm_stderr": 0.04103203830514511 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.5833333333333334, + "acc_stderr": 0.04766075165356461, + "acc_norm": 0.5833333333333334, + "acc_norm_stderr": 0.04766075165356461 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.5705521472392638, + "acc_stderr": 0.03889066619112722, + "acc_norm": 0.5705521472392638, + "acc_norm_stderr": 0.03889066619112722 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.49107142857142855, + "acc_stderr": 0.04745033255489122, + "acc_norm": 0.49107142857142855, + "acc_norm_stderr": 0.04745033255489122 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.6213592233009708, + "acc_stderr": 0.04802694698258973, + "acc_norm": 0.6213592233009708, + "acc_norm_stderr": 0.04802694698258973 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.6581196581196581, + "acc_stderr": 0.03107502852650775, + "acc_norm": 0.6581196581196581, + "acc_norm_stderr": 0.03107502852650775 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.57, + "acc_stderr": 0.049756985195624284, + "acc_norm": 0.57, + "acc_norm_stderr": 0.049756985195624284 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.644955300127714, + "acc_stderr": 0.017112085772772994, + "acc_norm": 0.644955300127714, + "acc_norm_stderr": 0.017112085772772994 + }, + 
"harness|hendrycksTest-moral_disputes|5": { + "acc": 0.5751445086705202, + "acc_stderr": 0.026613350840261733, + "acc_norm": 0.5751445086705202, + "acc_norm_stderr": 0.026613350840261733 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.27039106145251396, + "acc_stderr": 0.014854993938010083, + "acc_norm": 0.27039106145251396, + "acc_norm_stderr": 0.014854993938010083 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.5392156862745098, + "acc_stderr": 0.028541722692618874, + "acc_norm": 0.5392156862745098, + "acc_norm_stderr": 0.028541722692618874 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.6270096463022508, + "acc_stderr": 0.027466610213140123, + "acc_norm": 0.6270096463022508, + "acc_norm_stderr": 0.027466610213140123 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.5648148148148148, + "acc_stderr": 0.027586006221607697, + "acc_norm": 0.5648148148148148, + "acc_norm_stderr": 0.027586006221607697 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.3971631205673759, + "acc_stderr": 0.029189805673587085, + "acc_norm": 0.3971631205673759, + "acc_norm_stderr": 0.029189805673587085 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.4198174706649283, + "acc_stderr": 0.012604960816087368, + "acc_norm": 0.4198174706649283, + "acc_norm_stderr": 0.012604960816087368 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.5257352941176471, + "acc_stderr": 0.030332578094555033, + "acc_norm": 0.5257352941176471, + "acc_norm_stderr": 0.030332578094555033 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.5147058823529411, + "acc_stderr": 0.020219083895133924, + "acc_norm": 0.5147058823529411, + "acc_norm_stderr": 0.020219083895133924 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.5, + "acc_stderr": 0.04789131426105757, + "acc_norm": 0.5, + "acc_norm_stderr": 0.04789131426105757 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.6285714285714286, + "acc_stderr": 0.030932858792789848, + "acc_norm": 0.6285714285714286, + "acc_norm_stderr": 0.030932858792789848 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.43781094527363185, + "acc_stderr": 0.03508080112199839, + "acc_norm": 0.43781094527363185, + "acc_norm_stderr": 0.03508080112199839 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.76, + "acc_stderr": 0.04292346959909283, + "acc_norm": 0.76, + "acc_norm_stderr": 0.04292346959909283 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.45180722891566266, + "acc_stderr": 0.03874371556587953, + "acc_norm": 0.45180722891566266, + "acc_norm_stderr": 0.03874371556587953 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.6140350877192983, + "acc_stderr": 0.03733756969066164, + "acc_norm": 0.6140350877192983, + "acc_norm_stderr": 0.03733756969066164 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.2521419828641371, + "mc1_stderr": 0.015201522246299962, + "mc2": 0.37869582092697873, + "mc2_stderr": 0.01461237378950434 + }, + "harness|winogrande|5": { + "acc": 0.7742699289660616, + "acc_stderr": 0.01174962626090255 + }, + "harness|drop|3": { + "em": 0.33074664429530204, + "em_stderr": 0.004818175925966227, + "f1": 0.36544672818791973, + "f1_stderr": 0.0047576318375412065 + }, + "harness|gsm8k|5": { + "acc": 0.00530705079605762, + "acc_stderr": 0.002001305720948091 + }, + "all": { + "acc": 0.5082136252383307, + "acc_stderr": 0.03429101447445889, + "acc_norm": 0.5160292952455823, + "acc_norm_stderr": 0.03520114926126014, + "mc1": 
0.2521419828641371, + "mc1_stderr": 0.015201522246299962, + "mc2": 0.37869582092697873, + "mc2_stderr": 0.01461237378950434, + "em": 0.33074664429530204, + "em_stderr": 0.004818175925966227, + "f1": 0.36544672818791973, + "f1_stderr": 0.0047576318375412065 + } + }, + "versions": { + "all": 0, + "harness|arc:challenge|25": 0, + "harness|drop|3": 1, + "harness|gsm8k|5": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "harness|winogrande|5": 0 + }, + "config_tasks": { + "harness|arc:challenge": "LM Harness task", + "harness|drop": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|hellaswag": "LM Harness task", + 
"harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + 
"harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|arc:challenge|25": { + "hashes": { + "hash_examples": "17b0cae357c0259e", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "9bcd0d1d37471713", + "hash_cont_tokens": "289aa98c400841d8" + }, + "truncated": 0, + "non_truncated": 1172, + "padded": 4670, + "non_padded": 17, + "effective_few_shots": 25.0, + "num_truncated_few_shots": 0 + }, + "harness|hellaswag|10": { + "hashes": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "80b8c6d79740318e", + "hash_cont_tokens": "ac460260c3e6efc9" + }, + "truncated": 0, + "non_truncated": 10042, + "padded": 40101, + "non_padded": 67, + "effective_few_shots": 10.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hashes": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "b813d36287c6556c", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-anatomy|5": { + "hashes": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "09dc2380497f7a47", + "hash_cont_tokens": "a52a4f60d98cbe5c" + }, + "truncated": 0, + "non_truncated": 135, + "padded": 540, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-astronomy|5": { + "hashes": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "68ca3220b0fdd1f3", + "hash_cont_tokens": "10f7d8eeba97841d" + }, + "truncated": 0, + "non_truncated": 152, + "padded": 608, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-business_ethics|5": { + "hashes": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "bd14ef1320de241e", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hashes": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "d96186ab98017c43", + "hash_cont_tokens": "edef9975ba9165b5" + }, + "truncated": 0, + "non_truncated": 265, + "padded": 1060, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_biology|5": { + "hashes": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "424136b34e95b200", + "hash_cont_tokens": "0aa103ec6602280b" + }, + "truncated": 0, + "non_truncated": 144, + "padded": 576, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_chemistry|5": { + "hashes": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "8dd8b80e336bbe54", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + 
"num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_computer_science|5": { + "hashes": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "145d4cef8ca2261d", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_mathematics|5": { + "hashes": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "561995d32d2b25c4", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_medicine|5": { + "hashes": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "6a258a9d4418599c", + "hash_cont_tokens": "1979021dbc698754" + }, + "truncated": 0, + "non_truncated": 173, + "padded": 692, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_physics|5": { + "hashes": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "fa5e0d5b5f97b66a", + "hash_cont_tokens": "7cf7fe2bab00acbd" + }, + "truncated": 0, + "non_truncated": 102, + "padded": 408, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-computer_security|5": { + "hashes": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "07d27397edfae492", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hashes": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "da5e6c3c8eb17da6", + "hash_cont_tokens": "903f64eed2b0d217" + }, + "truncated": 0, + "non_truncated": 235, + "padded": 940, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-econometrics|5": { + "hashes": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "f6ba8e358bdb523e", + "hash_cont_tokens": "721ae6c5302c4bf2" + }, + "truncated": 0, + "non_truncated": 114, + "padded": 456, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hashes": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "b2459da4c5ca8590", + "hash_cont_tokens": "15a738960ed3e587" + }, + "truncated": 0, + "non_truncated": 145, + "padded": 575, + "non_padded": 5, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hashes": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "0b969d9ad706a13a", + "hash_cont_tokens": "c96470462fc71683" + }, + "truncated": 0, + "non_truncated": 378, + "padded": 1512, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-formal_logic|5": { + "hashes": { + "hash_examples": "5a6525665f63ea72", + 
"hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "02bc3eb5f90da86e", + "hash_cont_tokens": "0e1ce025c9d6ee7e" + }, + "truncated": 0, + "non_truncated": 126, + "padded": 504, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-global_facts|5": { + "hashes": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "3d5106918bcbeb43", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_biology|5": { + "hashes": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "7b089392db2dabbd", + "hash_cont_tokens": "e34d57f7d3c4ca16" + }, + "truncated": 0, + "non_truncated": 310, + "padded": 1240, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hashes": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "ba90b2ffed1c067d", + "hash_cont_tokens": "e8482d44df4b3740" + }, + "truncated": 0, + "non_truncated": 203, + "padded": 812, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hashes": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "60eeec309ef0717f", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hashes": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "5e5e8bf3808e0ead", + "hash_cont_tokens": "d63e679a49418339" + }, + "truncated": 0, + "non_truncated": 165, + "padded": 656, + "non_padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_geography|5": { + "hashes": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "4da9b741d4e7ea78", + "hash_cont_tokens": "d78483e286d06f1a" + }, + "truncated": 0, + "non_truncated": 198, + "padded": 792, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hashes": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "acb4bc872ac86ed7", + "hash_cont_tokens": "691cdff71ff5fe57" + }, + "truncated": 0, + "non_truncated": 193, + "padded": 772, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hashes": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "840fc6403eb69ab0", + "hash_cont_tokens": "d5ad4c5bdca967ad" + }, + "truncated": 0, + "non_truncated": 390, + "padded": 1560, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hashes": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "3629a7f2cd17faeb", + 
"hash_cont_tokens": "8f631ca5687dd0d4" + }, + "truncated": 0, + "non_truncated": 270, + "padded": 1080, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hashes": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "6846f684260e3997", + "hash_cont_tokens": "7321048a28451473" + }, + "truncated": 0, + "non_truncated": 238, + "padded": 952, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_physics|5": { + "hashes": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "85aee25d6bdad94a", + "hash_cont_tokens": "bb137581f269861c" + }, + "truncated": 0, + "non_truncated": 151, + "padded": 604, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hashes": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "290b66d6d666a35f", + "hash_cont_tokens": "b455cab2675bd863" + }, + "truncated": 0, + "non_truncated": 545, + "padded": 2180, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hashes": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "a77a7668b437bc82", + "hash_cont_tokens": "1b3196fec7e58037" + }, + "truncated": 0, + "non_truncated": 216, + "padded": 864, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hashes": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "63548c7fa9ba7a78", + "hash_cont_tokens": "a331dedc2aa01b3e" + }, + "truncated": 0, + "non_truncated": 204, + "padded": 816, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hashes": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "83c5da18bfa50812", + "hash_cont_tokens": "d0fbe030b8c8c2bf" + }, + "truncated": 0, + "non_truncated": 237, + "padded": 948, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_aging|5": { + "hashes": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "bebbd11f22006685", + "hash_cont_tokens": "1dd29c3755494850" + }, + "truncated": 0, + "non_truncated": 223, + "padded": 892, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_sexuality|5": { + "hashes": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "7b85ee9b8ee54f4f", + "hash_cont_tokens": "c85573f663c10691" + }, + "truncated": 0, + "non_truncated": 131, + "padded": 524, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-international_law|5": { + "hashes": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "7bfc55ab7065943e", + "hash_cont_tokens": "d263804ba918154f" + }, + "truncated": 0, + "non_truncated": 121, + "padded": 484, + "non_padded": 0, + 
"effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-jurisprudence|5": { + "hashes": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "69573f1675e053c6", + "hash_cont_tokens": "581986691a84ece8" + }, + "truncated": 0, + "non_truncated": 108, + "padded": 432, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hashes": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "552324ef20094bdc", + "hash_cont_tokens": "55a858b28bbda458" + }, + "truncated": 0, + "non_truncated": 163, + "padded": 652, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-machine_learning|5": { + "hashes": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "96449357a7318905", + "hash_cont_tokens": "e99d3d3efd4ac7a3" + }, + "truncated": 0, + "non_truncated": 112, + "padded": 448, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-management|5": { + "hashes": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "3b849249168e3b88", + "hash_cont_tokens": "13d9dc56bca34726" + }, + "truncated": 0, + "non_truncated": 103, + "padded": 412, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-marketing|5": { + "hashes": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "af0e186f2756b70d", + "hash_cont_tokens": "2700ea26933916a2" + }, + "truncated": 0, + "non_truncated": 234, + "padded": 936, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-medical_genetics|5": { + "hashes": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "9f6a6de16509b6d9", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-miscellaneous|5": { + "hashes": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "9194406d589f7c10", + "hash_cont_tokens": "7bf4341c79587250" + }, + "truncated": 0, + "non_truncated": 783, + "padded": 3132, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_disputes|5": { + "hashes": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "769486efc74d9f8e", + "hash_cont_tokens": "38a48e9de6976f00" + }, + "truncated": 0, + "non_truncated": 346, + "padded": 1384, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hashes": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "a90fd4dd90959dad", + "hash_cont_tokens": "761c4dc187689d89" + }, + "truncated": 0, + "non_truncated": 895, + "padded": 3580, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-nutrition|5": { + "hashes": { + "hash_examples": "7db1d8142ec14323", + 
"hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "1a3b843e66efd29b", + "hash_cont_tokens": "65005bd7d6f6012a" + }, + "truncated": 0, + "non_truncated": 306, + "padded": 1224, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-philosophy|5": { + "hashes": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "09820001a3d00013", + "hash_cont_tokens": "0b47934fb6314dec" + }, + "truncated": 0, + "non_truncated": 311, + "padded": 1244, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-prehistory|5": { + "hashes": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "7c4ec364ce2768c7", + "hash_cont_tokens": "3f20acd855ee0a29" + }, + "truncated": 0, + "non_truncated": 324, + "padded": 1296, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_accounting|5": { + "hashes": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "ced0534574d0ae3f", + "hash_cont_tokens": "8f122ba881355d4b" + }, + "truncated": 0, + "non_truncated": 282, + "padded": 1128, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_law|5": { + "hashes": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "bcbdbbde22ec73e3", + "hash_cont_tokens": "90d5df417c4d3fd3" + }, + "truncated": 0, + "non_truncated": 1534, + "padded": 6136, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_medicine|5": { + "hashes": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "c54d753563114d45", + "hash_cont_tokens": "4a2d2988884f7f70" + }, + "truncated": 0, + "non_truncated": 272, + "padded": 1088, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_psychology|5": { + "hashes": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "b75dc55c0e32fa52", + "hash_cont_tokens": "e0a952cb8a9c81de" + }, + "truncated": 0, + "non_truncated": 612, + "padded": 2448, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-public_relations|5": { + "hashes": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "5ccdc8ec8db99622", + "hash_cont_tokens": "1fa77a8dff3922b8" + }, + "truncated": 0, + "non_truncated": 110, + "padded": 440, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-security_studies|5": { + "hashes": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "ca8497342e5b1d57", + "hash_cont_tokens": "81fc9cb3cbdd52db" + }, + "truncated": 0, + "non_truncated": 245, + "padded": 980, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-sociology|5": { + "hashes": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "069c76424fbd3dab", + "hash_cont_tokens": "2a0493252ed2cf43" + }, + "truncated": 0, + 
"non_truncated": 201, + "padded": 804, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hashes": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "a7e393a626169576", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-virology|5": { + "hashes": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "bf99dc973e3a650d", + "hash_cont_tokens": "5ab892d003b00c98" + }, + "truncated": 0, + "non_truncated": 166, + "padded": 664, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-world_religions|5": { + "hashes": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "1761cfaf21797065", + "hash_cont_tokens": "15a5e5dbdfbb8568" + }, + "truncated": 0, + "non_truncated": 171, + "padded": 684, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|truthfulqa:mc|0": { + "hashes": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "298b43914bbdf4ca", + "hash_cont_tokens": "5a8d4bb398b1c3c0" + }, + "truncated": 0, + "non_truncated": 817, + "padded": 9996, + "non_padded": 0, + "effective_few_shots": 0.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "31aa3477d959f771", + "hash_cont_tokens": "618558fb93c0f288" + }, + "truncated": 0, + "non_truncated": 1267, + "padded": 2534, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|drop|3": { + "hashes": { + "hash_examples": "1d27416e8324e9a3", + "hash_full_prompts": "a5513ff9a741b385", + "hash_input_tokens": "a4fb946366902edf", + "hash_cont_tokens": "28b6a6d9924ab564" + }, + "truncated": 0, + "non_truncated": 9536, + "padded": 0, + "non_padded": 9536, + "effective_few_shots": 3.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "6af0ae8cfe684f50", + "hash_cont_tokens": "1ca60ef282f17b29" + }, + "truncated": 0, + "non_truncated": 1319, + "padded": 0, + "non_padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "4eb459f19fc0f29d", + "hash_full_prompts": "21653ed56f202b4e", + "hash_input_tokens": "0ce409b3d436569d", + "hash_cont_tokens": "47dd6e760fd9f2e2" + }, + "truncated": 0, + "non_truncated": 38195, + "padded": 113460, + "non_padded": 10948, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/amazon/MistralLite/results_2023-12-02T12-51-23.360186.json b/eval-results/amazon/MistralLite/results_2023-12-02T12-51-23.360186.json new file mode 100644 index 0000000000000000000000000000000000000000..4ed10239a3c7f819f5b0fd694f55a1518e0035e9 --- /dev/null +++ b/eval-results/amazon/MistralLite/results_2023-12-02T12-51-23.360186.json @@ -0,0 +1,63 @@ +{ + "config_general": { + "lighteval_sha": "b35d4d84573be82d91c07ea46260f262f72cf69d", + "num_few_shot_default": 0, + 
"num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "", + "start_time": 1399695.920313416, + "end_time": 1401984.810167699, + "total_evaluation_time_secondes": "2288.889854282839", + "model_name": "amazon/MistralLite", + "model_sha": "23486089ab7ba741b34adc69ab7555885f8abe71", + "model_dtype": "torch.float16", + "model_size": "13.99 GB" + }, + "results": { + "harness|gsm8k|5": { + "acc": 0.01061410159211524, + "acc_stderr": 0.002822713322387704 + }, + "all": { + "acc": 0.01061410159211524, + "acc_stderr": 0.002822713322387704 + } + }, + "versions": { + "all": 0, + "harness|gsm8k|5": 0 + }, + "config_tasks": { + "harness|gsm8k": "LM Harness task" + }, + "summary_tasks": { + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "6af0ae8cfe684f50", + "hash_cont_tokens": "840c4486fc0436fb" + }, + "truncated": 0, + "non_truncated": 1319, + "padded": 0, + "non_padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "18b756b7813d1bdf", + "hash_full_prompts": "deb3b1dff10b95aa", + "hash_input_tokens": "f17391d49d33b9c0", + "hash_cont_tokens": "4b12b1ba88f99196" + }, + "truncated": 0, + "non_truncated": 1319, + "padded": 0, + "non_padded": 1319, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/andreaskoepf/llama2-13b-megacode2_min100/results_2023-08-17T10-46-30.131407.json b/eval-results/andreaskoepf/llama2-13b-megacode2_min100/results_2023-08-17T10-46-30.131407.json new file mode 100644 index 0000000000000000000000000000000000000000..268b951b9fca30208532da0e6fcd7b9bd8c9ea25 --- /dev/null +++ b/eval-results/andreaskoepf/llama2-13b-megacode2_min100/results_2023-08-17T10-46-30.131407.json @@ -0,0 +1,1365 @@ +{ + "results": { + "harness|arc:challenge|25": { + "acc": 0.5588737201365188, + "acc_stderr": 0.014509747749064663, + "acc_norm": 0.60580204778157, + "acc_norm_stderr": 0.014280522667467327 + }, + "harness|hellaswag|10": { + "acc": 0.6141206930890261, + "acc_stderr": 0.004858074013443992, + "acc_norm": 0.8125871340370444, + "acc_norm_stderr": 0.0038944505016930363 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.41, + "acc_stderr": 0.049431107042371025, + "acc_norm": 0.41, + "acc_norm_stderr": 0.049431107042371025 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.4740740740740741, + "acc_stderr": 0.04313531696750574, + "acc_norm": 0.4740740740740741, + "acc_norm_stderr": 0.04313531696750574 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.5657894736842105, + "acc_stderr": 0.040335656678483205, + "acc_norm": 0.5657894736842105, + "acc_norm_stderr": 0.040335656678483205 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.64, + "acc_stderr": 0.04824181513244218, + "acc_norm": 0.64, + "acc_norm_stderr": 0.04824181513244218 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.6075471698113207, + "acc_stderr": 0.030052580579557845, + "acc_norm": 0.6075471698113207, + "acc_norm_stderr": 0.030052580579557845 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.5972222222222222, + "acc_stderr": 0.04101405519842426, + "acc_norm": 0.5972222222222222, + "acc_norm_stderr": 0.04101405519842426 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.46, + "acc_stderr": 0.05009082659620332, + "acc_norm": 0.46, + "acc_norm_stderr": 0.05009082659620332 + }, + 
"harness|hendrycksTest-college_computer_science|5": { + "acc": 0.5, + "acc_stderr": 0.050251890762960605, + "acc_norm": 0.5, + "acc_norm_stderr": 0.050251890762960605 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.36, + "acc_stderr": 0.04824181513244218, + "acc_norm": 0.36, + "acc_norm_stderr": 0.04824181513244218 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.5202312138728323, + "acc_stderr": 0.03809342081273957, + "acc_norm": 0.5202312138728323, + "acc_norm_stderr": 0.03809342081273957 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.3235294117647059, + "acc_stderr": 0.046550104113196177, + "acc_norm": 0.3235294117647059, + "acc_norm_stderr": 0.046550104113196177 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.7, + "acc_stderr": 0.046056618647183814, + "acc_norm": 0.7, + "acc_norm_stderr": 0.046056618647183814 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.4553191489361702, + "acc_stderr": 0.03255525359340354, + "acc_norm": 0.4553191489361702, + "acc_norm_stderr": 0.03255525359340354 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.2982456140350877, + "acc_stderr": 0.04303684033537315, + "acc_norm": 0.2982456140350877, + "acc_norm_stderr": 0.04303684033537315 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.5448275862068965, + "acc_stderr": 0.04149886942192117, + "acc_norm": 0.5448275862068965, + "acc_norm_stderr": 0.04149886942192117 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.3253968253968254, + "acc_stderr": 0.02413015829976261, + "acc_norm": 0.3253968253968254, + "acc_norm_stderr": 0.02413015829976261 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.4126984126984127, + "acc_stderr": 0.04403438954768177, + "acc_norm": 0.4126984126984127, + "acc_norm_stderr": 0.04403438954768177 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.34, + "acc_stderr": 0.04760952285695236, + "acc_norm": 0.34, + "acc_norm_stderr": 0.04760952285695236 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.7193548387096774, + "acc_stderr": 0.025560604721022895, + "acc_norm": 0.7193548387096774, + "acc_norm_stderr": 0.025560604721022895 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.4482758620689655, + "acc_stderr": 0.034991131376767445, + "acc_norm": 0.4482758620689655, + "acc_norm_stderr": 0.034991131376767445 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.52, + "acc_stderr": 0.050211673156867795, + "acc_norm": 0.52, + "acc_norm_stderr": 0.050211673156867795 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.7212121212121212, + "acc_stderr": 0.03501438706296781, + "acc_norm": 0.7212121212121212, + "acc_norm_stderr": 0.03501438706296781 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.7575757575757576, + "acc_stderr": 0.03053289223393203, + "acc_norm": 0.7575757575757576, + "acc_norm_stderr": 0.03053289223393203 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.8082901554404145, + "acc_stderr": 0.02840895362624526, + "acc_norm": 0.8082901554404145, + "acc_norm_stderr": 0.02840895362624526 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.5641025641025641, + "acc_stderr": 0.025141801511177498, + "acc_norm": 0.5641025641025641, + "acc_norm_stderr": 0.025141801511177498 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.3148148148148148, + "acc_stderr": 
0.02831753349606648, + "acc_norm": 0.3148148148148148, + "acc_norm_stderr": 0.02831753349606648 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.5462184873949579, + "acc_stderr": 0.03233943468182088, + "acc_norm": 0.5462184873949579, + "acc_norm_stderr": 0.03233943468182088 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.31125827814569534, + "acc_stderr": 0.03780445850526733, + "acc_norm": 0.31125827814569534, + "acc_norm_stderr": 0.03780445850526733 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.7577981651376147, + "acc_stderr": 0.01836817630659862, + "acc_norm": 0.7577981651376147, + "acc_norm_stderr": 0.01836817630659862 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.46296296296296297, + "acc_stderr": 0.03400603625538271, + "acc_norm": 0.46296296296296297, + "acc_norm_stderr": 0.03400603625538271 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.7598039215686274, + "acc_stderr": 0.02998373305591361, + "acc_norm": 0.7598039215686274, + "acc_norm_stderr": 0.02998373305591361 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.7721518987341772, + "acc_stderr": 0.027303484599069425, + "acc_norm": 0.7721518987341772, + "acc_norm_stderr": 0.027303484599069425 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.6502242152466368, + "acc_stderr": 0.03200736719484503, + "acc_norm": 0.6502242152466368, + "acc_norm_stderr": 0.03200736719484503 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.6564885496183206, + "acc_stderr": 0.041649760719448786, + "acc_norm": 0.6564885496183206, + "acc_norm_stderr": 0.041649760719448786 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.7851239669421488, + "acc_stderr": 0.03749492448709697, + "acc_norm": 0.7851239669421488, + "acc_norm_stderr": 0.03749492448709697 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.7777777777777778, + "acc_stderr": 0.0401910747255735, + "acc_norm": 0.7777777777777778, + "acc_norm_stderr": 0.0401910747255735 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.6503067484662577, + "acc_stderr": 0.037466683254700206, + "acc_norm": 0.6503067484662577, + "acc_norm_stderr": 0.037466683254700206 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.375, + "acc_stderr": 0.04595091388086298, + "acc_norm": 0.375, + "acc_norm_stderr": 0.04595091388086298 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.7766990291262136, + "acc_stderr": 0.04123553189891431, + "acc_norm": 0.7766990291262136, + "acc_norm_stderr": 0.04123553189891431 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.8162393162393162, + "acc_stderr": 0.025372139671722933, + "acc_norm": 0.8162393162393162, + "acc_norm_stderr": 0.025372139671722933 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.57, + "acc_stderr": 0.049756985195624284, + "acc_norm": 0.57, + "acc_norm_stderr": 0.049756985195624284 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.776500638569604, + "acc_stderr": 0.01489723522945071, + "acc_norm": 0.776500638569604, + "acc_norm_stderr": 0.01489723522945071 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.630057803468208, + "acc_stderr": 0.02599247202930639, + "acc_norm": 0.630057803468208, + "acc_norm_stderr": 0.02599247202930639 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.47374301675977654, + "acc_stderr": 0.016699427672784765, + "acc_norm": 0.47374301675977654, + "acc_norm_stderr": 
0.016699427672784765 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.6274509803921569, + "acc_stderr": 0.027684181883302895, + "acc_norm": 0.6274509803921569, + "acc_norm_stderr": 0.027684181883302895 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.6109324758842444, + "acc_stderr": 0.027690337536485372, + "acc_norm": 0.6109324758842444, + "acc_norm_stderr": 0.027690337536485372 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.6512345679012346, + "acc_stderr": 0.026517597724465013, + "acc_norm": 0.6512345679012346, + "acc_norm_stderr": 0.026517597724465013 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.43617021276595747, + "acc_stderr": 0.029583452036284062, + "acc_norm": 0.43617021276595747, + "acc_norm_stderr": 0.029583452036284062 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.4152542372881356, + "acc_stderr": 0.012585471793400664, + "acc_norm": 0.4152542372881356, + "acc_norm_stderr": 0.012585471793400664 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.5588235294117647, + "acc_stderr": 0.03016191193076711, + "acc_norm": 0.5588235294117647, + "acc_norm_stderr": 0.03016191193076711 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.5555555555555556, + "acc_stderr": 0.020102583895887188, + "acc_norm": 0.5555555555555556, + "acc_norm_stderr": 0.020102583895887188 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.6363636363636364, + "acc_stderr": 0.04607582090719976, + "acc_norm": 0.6363636363636364, + "acc_norm_stderr": 0.04607582090719976 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.6816326530612244, + "acc_stderr": 0.029822533793982062, + "acc_norm": 0.6816326530612244, + "acc_norm_stderr": 0.029822533793982062 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.7512437810945274, + "acc_stderr": 0.030567675938916714, + "acc_norm": 0.7512437810945274, + "acc_norm_stderr": 0.030567675938916714 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.85, + "acc_stderr": 0.035887028128263686, + "acc_norm": 0.85, + "acc_norm_stderr": 0.035887028128263686 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.4879518072289157, + "acc_stderr": 0.03891364495835821, + "acc_norm": 0.4879518072289157, + "acc_norm_stderr": 0.03891364495835821 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.8128654970760234, + "acc_stderr": 0.029913127232368036, + "acc_norm": 0.8128654970760234, + "acc_norm_stderr": 0.029913127232368036 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.3390452876376989, + "mc1_stderr": 0.016571797910626608, + "mc2": 0.48893759481045423, + "mc2_stderr": 0.015166999616571152 + }, + "all": { + "acc": 0.5794124642378694, + "acc_stderr": 0.03416834318288517, + "acc_norm": 0.5835716976038536, + "acc_norm_stderr": 0.03414812541011655, + "mc1": 0.3390452876376989, + "mc1_stderr": 0.016571797910626608, + "mc2": 0.48893759481045423, + "mc2_stderr": 0.015166999616571152 + } + }, + "versions": { + "harness|arc:challenge|25": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 
1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "all": 0 + }, + "config_general": { + "model_name": "andreaskoepf/llama2-13b-megacode2_min100", + "model_sha": "b38d1b53c358a0313c69bcceebe97628327ada82", + "model_dtype": "torch.float16", + "lighteval_sha": "efe93333f9f25e7d48cc67a6bf362e6d576f727b", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null + }, + "config_tasks": { + "harness|arc:challenge": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": 
"LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task" + }, + "summary_tasks": { + "harness|arc:challenge|25": { + "hashes": { + "hash_examples": "17b0cae357c0259e", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "61571bf68d6d89aa", + "hash_cont_tokens": "8210decc6ff6f7df" + }, + "truncated": 0, + "non-truncated": 4687, + "padded": 4687, + "non-padded": 0, + "effective_few_shots": 25.0, + "num_truncated_few_shots": 0 + }, + "harness|hellaswag|10": { + "hashes": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + 
"hash_input_tokens": "29906669b1c7054a", + "hash_cont_tokens": "b3b9e9017afa63af" + }, + "truncated": 0, + "non-truncated": 40168, + "padded": 40113, + "non-padded": 55, + "effective_few_shots": 10.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hashes": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "c54ff61ad0273dd7", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-anatomy|5": { + "hashes": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "be31a1e22aef5f90", + "hash_cont_tokens": "f11971a765cb609f" + }, + "truncated": 0, + "non-truncated": 540, + "padded": 540, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-astronomy|5": { + "hashes": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "277a7b1fad566940", + "hash_cont_tokens": "bf30e5d3f48250cb" + }, + "truncated": 0, + "non-truncated": 608, + "padded": 608, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-business_ethics|5": { + "hashes": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "ba552605bc116de5", + "hash_cont_tokens": "bc1dd9b2d995eb61" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hashes": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "428c7563d0b98ab9", + "hash_cont_tokens": "890a119624b3b935" + }, + "truncated": 0, + "non-truncated": 1060, + "padded": 1060, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_biology|5": { + "hashes": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "da036601573942e2", + "hash_cont_tokens": "875cde3af7a0ee14" + }, + "truncated": 0, + "non-truncated": 576, + "padded": 576, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_chemistry|5": { + "hashes": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "94e0196d6aded13d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_computer_science|5": { + "hashes": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "6e4d0f4a8d36690b", + "hash_cont_tokens": "ffc0fe414cdc4a83" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_mathematics|5": { + "hashes": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "614054d17109a25d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + 
"non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_medicine|5": { + "hashes": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "1d633b3cc0524ba8", + "hash_cont_tokens": "1f88b00d41957d82" + }, + "truncated": 0, + "non-truncated": 692, + "padded": 692, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_physics|5": { + "hashes": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "5421d9a1af86cbd4", + "hash_cont_tokens": "f7b8097afc16a47c" + }, + "truncated": 0, + "non-truncated": 408, + "padded": 408, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-computer_security|5": { + "hashes": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "5e6b70ecb333cf18", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hashes": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "c2ef11a87264ceed", + "hash_cont_tokens": "aa0e8bc655f2f641" + }, + "truncated": 0, + "non-truncated": 940, + "padded": 940, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-econometrics|5": { + "hashes": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "ecaccd912a4c3978", + "hash_cont_tokens": "bfb7e3c3c88313f1" + }, + "truncated": 0, + "non-truncated": 456, + "padded": 456, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hashes": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "1590c84291399be8", + "hash_cont_tokens": "2425a3f084a591ef" + }, + "truncated": 0, + "non-truncated": 580, + "padded": 580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hashes": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "3269597f715b0da1", + "hash_cont_tokens": "f52691aef15a407b" + }, + "truncated": 0, + "non-truncated": 1512, + "padded": 1512, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-formal_logic|5": { + "hashes": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "a2800d20f3ab8d7c", + "hash_cont_tokens": "f515d598d9c21263" + }, + "truncated": 0, + "non-truncated": 504, + "padded": 504, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-global_facts|5": { + "hashes": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "94ed44b3772505ad", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_biology|5": { + "hashes": { + 
"hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "24423acb928db768", + "hash_cont_tokens": "bd85a4156a3613ee" + }, + "truncated": 0, + "non-truncated": 1240, + "padded": 1240, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hashes": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "831ff35c474e5cef", + "hash_cont_tokens": "a95c97af1c14e068" + }, + "truncated": 0, + "non-truncated": 812, + "padded": 812, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hashes": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "8c34e0f2bda77358", + "hash_cont_tokens": "8abfedef914e33c9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hashes": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "f1f73dd687da18d7", + "hash_cont_tokens": "674fc454bdc5ac93" + }, + "truncated": 660, + "non-truncated": 0, + "padded": 0, + "non-padded": 660, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_geography|5": { + "hashes": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "7c5547c7da5bc793", + "hash_cont_tokens": "03a5012b916274ea" + }, + "truncated": 0, + "non-truncated": 792, + "padded": 792, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hashes": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "f62991cb6a496b05", + "hash_cont_tokens": "a83effb8f76b7d7c" + }, + "truncated": 0, + "non-truncated": 772, + "padded": 772, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hashes": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "4cef2aff6e3d59ed", + "hash_cont_tokens": "c583432ad27fcfe0" + }, + "truncated": 0, + "non-truncated": 1560, + "padded": 1560, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hashes": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "6e2577ea4082ed2b", + "hash_cont_tokens": "24f5dc613660300b" + }, + "truncated": 0, + "non-truncated": 1080, + "padded": 1080, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hashes": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "c5fc9aeb1079c8e4", + "hash_cont_tokens": "f47f041de50333b9" + }, + "truncated": 0, + "non-truncated": 952, + "padded": 952, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_physics|5": { + "hashes": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": 
"fc78b4997e436734", + "hash_input_tokens": "555fc385cffa84ca", + "hash_cont_tokens": "ba2efcd283e938cc" + }, + "truncated": 0, + "non-truncated": 604, + "padded": 604, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hashes": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "febd23cbf9973b7f", + "hash_cont_tokens": "942069cd363844d9" + }, + "truncated": 0, + "non-truncated": 2180, + "padded": 2180, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hashes": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "424b02981230ee83", + "hash_cont_tokens": "955ed42b6f7fa019" + }, + "truncated": 0, + "non-truncated": 864, + "padded": 864, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hashes": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "50c9ff438c85a69e", + "hash_cont_tokens": "cdd0b3dc06d933e5" + }, + "truncated": 816, + "non-truncated": 0, + "padded": 0, + "non-padded": 816, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hashes": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "054824cc474caef5", + "hash_cont_tokens": "9a864184946033ac" + }, + "truncated": 8, + "non-truncated": 940, + "padded": 940, + "non-padded": 8, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_aging|5": { + "hashes": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "541a75f071dcf579", + "hash_cont_tokens": "142a4a8a1138a214" + }, + "truncated": 0, + "non-truncated": 892, + "padded": 892, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_sexuality|5": { + "hashes": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "04269e5c5a257dd9", + "hash_cont_tokens": "bc54813e809b796d" + }, + "truncated": 0, + "non-truncated": 524, + "padded": 524, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-international_law|5": { + "hashes": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "d93ba9d9d38e4397", + "hash_cont_tokens": "dc45b45fcda18e5d" + }, + "truncated": 0, + "non-truncated": 484, + "padded": 484, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-jurisprudence|5": { + "hashes": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "9eeaccd2698b4f5a", + "hash_cont_tokens": "e3a8cd951b6e3469" + }, + "truncated": 0, + "non-truncated": 432, + "padded": 432, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hashes": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "b4f08f544f2b7576", + "hash_cont_tokens": "1e80dbd30f6453d5" + }, + "truncated": 0, + 
"non-truncated": 652, + "padded": 648, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-machine_learning|5": { + "hashes": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "900c2a51f1174b9f", + "hash_cont_tokens": "9b37da7777378ca9" + }, + "truncated": 0, + "non-truncated": 448, + "padded": 448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-management|5": { + "hashes": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "6b36efb4689c6eca", + "hash_cont_tokens": "a01d6d39a83c4597" + }, + "truncated": 0, + "non-truncated": 412, + "padded": 412, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-marketing|5": { + "hashes": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "2aaac78a0cfed47a", + "hash_cont_tokens": "6aeaed4d823c98aa" + }, + "truncated": 0, + "non-truncated": 936, + "padded": 936, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-medical_genetics|5": { + "hashes": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "886ca823b41c094a", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-miscellaneous|5": { + "hashes": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "72fd71de7675e7d0", + "hash_cont_tokens": "9b0ab02a64603081" + }, + "truncated": 0, + "non-truncated": 3132, + "padded": 3132, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_disputes|5": { + "hashes": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "f3ca0dd8e7a1eb09", + "hash_cont_tokens": "8badf768f7b0467a" + }, + "truncated": 0, + "non-truncated": 1384, + "padded": 1354, + "non-padded": 30, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hashes": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "3e793631e951f23c", + "hash_cont_tokens": "32ae620376b2bbba" + }, + "truncated": 0, + "non-truncated": 3580, + "padded": 3580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-nutrition|5": { + "hashes": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "59753c2144ea93af", + "hash_cont_tokens": "3071def75bacc404" + }, + "truncated": 0, + "non-truncated": 1224, + "padded": 1224, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-philosophy|5": { + "hashes": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "bd8d3dbed15a8c34", + "hash_cont_tokens": "9f6ff69d23a48783" + }, + "truncated": 0, + "non-truncated": 1244, + "padded": 1244, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-prehistory|5": { + "hashes": { + 
"hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "3573cd87facbb7c5", + "hash_cont_tokens": "de469d2b981e32a3" + }, + "truncated": 0, + "non-truncated": 1296, + "padded": 1296, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_accounting|5": { + "hashes": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "17e721bc1a7cbb47", + "hash_cont_tokens": "c46f74d2dfc7b13b" + }, + "truncated": 0, + "non-truncated": 1128, + "padded": 1128, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_law|5": { + "hashes": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "9178e10bd0763ec4", + "hash_cont_tokens": "2e590029ef41fbcd" + }, + "truncated": 604, + "non-truncated": 5532, + "padded": 5524, + "non-padded": 612, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_medicine|5": { + "hashes": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "f5a22012a54f70ea", + "hash_cont_tokens": "fe35cfa9c6ca802e" + }, + "truncated": 0, + "non-truncated": 1088, + "padded": 1088, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_psychology|5": { + "hashes": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "0dfb73a8eb3f692c", + "hash_cont_tokens": "f020fbddf72c8652" + }, + "truncated": 0, + "non-truncated": 2448, + "padded": 2448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-public_relations|5": { + "hashes": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "1710c6ba4c9f3cbd", + "hash_cont_tokens": "568f585a259965c1" + }, + "truncated": 0, + "non-truncated": 440, + "padded": 440, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-security_studies|5": { + "hashes": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "d49711415961ced7", + "hash_cont_tokens": "cc6fd7cccd64cd5d" + }, + "truncated": 0, + "non-truncated": 980, + "padded": 980, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-sociology|5": { + "hashes": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "828999f7624cbe7e", + "hash_cont_tokens": "c3a3bdfd177eed5b" + }, + "truncated": 0, + "non-truncated": 804, + "padded": 804, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hashes": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "42054621e718dbee", + "hash_cont_tokens": "2568d0e8e36fa959" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-virology|5": { + "hashes": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "6c4f0aa4dc859c04", + 
"hash_cont_tokens": "926cf60b0891f374" + }, + "truncated": 0, + "non-truncated": 664, + "padded": 664, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-world_religions|5": { + "hashes": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "6c75d44e092ff24f", + "hash_cont_tokens": "c525a5de974c1ea3" + }, + "truncated": 0, + "non-truncated": 684, + "padded": 684, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|truthfulqa:mc|0": { + "hashes": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "2738d7ed7075faa7", + "hash_cont_tokens": "c014154380b74b9e" + }, + "truncated": 0, + "non-truncated": 9996, + "padded": 9996, + "non-padded": 0, + "effective_few_shots": 0.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "d84d18e9a963753d", + "hash_full_prompts": "12b540783521a8e6", + "hash_input_tokens": "6fecf578c508db6a", + "hash_cont_tokens": "fb1646e2bdd5fc38" + }, + "total_evaluation_time_secondes": "3898.0683465003967", + "truncated": 2088, + "non-truncated": 108931, + "padded": 108834, + "non-padded": 2185, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/andreaskoepf/llama2-13b-megacode2_min100/results_2023-09-22T15-48-34.680007.json b/eval-results/andreaskoepf/llama2-13b-megacode2_min100/results_2023-09-22T15-48-34.680007.json new file mode 100644 index 0000000000000000000000000000000000000000..5916036fc1855740051546668d6c3631a2ca5508 --- /dev/null +++ b/eval-results/andreaskoepf/llama2-13b-megacode2_min100/results_2023-09-22T15-48-34.680007.json @@ -0,0 +1,107 @@ +{ + "config_general": { + "model_name": "andreaskoepf/llama2-13b-megacode2_min100", + "model_sha": "277905838b976bb59844ecd3b77c795423908c3a", + "model_size": "24.28 GB", + "model_dtype": "torch.float16", + "lighteval_sha": "0f318ecf002208468154899217b3ba7c6ae09374", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|drop|3": { + "em": 0.0030411073825503355, + "em_stderr": 0.0005638896908753115, + "f1": 0.07890205536912773, + "f1_stderr": 0.0016368809848969982 + }, + "harness|gsm8k|5": { + "acc": 0.15921152388172857, + "acc_stderr": 0.010077966717551878 + }, + "harness|winogrande|5": { + "acc": 0.7695343330702447, + "acc_stderr": 0.01183587216483668 + }, + "all": { + "em": 0.0030411073825503355, + "em_stderr": 0.0005638896908753115, + "f1": 0.07890205536912773, + "f1_stderr": 0.0016368809848969982, + "acc": 0.4643729284759866, + "acc_stderr": 0.010956919441194278 + } + }, + "versions": { + "harness|drop|3": 1, + "harness|gsm8k|5": 0, + "harness|winogrande|5": 0, + "all": 0 + }, + "config_tasks": { + "harness|drop": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|drop|3": { + "hashes": { + "hash_examples": "1d27416e8324e9a3", + "hash_full_prompts": "a5513ff9a741b385", + "hash_input_tokens": "61b608e0b5ceed76", + "hash_cont_tokens": "664acecb6dabb66e" + }, + "truncated": 1263, + "non-truncated": 8273, + "padded": 0, + "non-padded": 9536, + "effective_few_shots": 3.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": 
"bda342e47b5099b2", + "hash_cont_tokens": "8d62f23326531979" + }, + "truncated": 0, + "non-truncated": 1319, + "padded": 0, + "non-padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "c0bedf98cb040854", + "hash_cont_tokens": "f08975ad6f2d5864" + }, + "truncated": 0, + "non-truncated": 2534, + "padded": 2432, + "non-padded": 102, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "9b4d8993161e637d", + "hash_full_prompts": "08215e527b7e60a5", + "hash_input_tokens": "80afe720f936f8d2", + "hash_cont_tokens": "78365b6c8fbc4cd7" + }, + "total_evaluation_time_secondes": "12337.54636311531", + "truncated": 1263, + "non-truncated": 12126, + "padded": 2432, + "non-padded": 10957, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/anton-l/gpt-j-tiny-random/results_2023-07-18T16-12-24.842449.json b/eval-results/anton-l/gpt-j-tiny-random/results_2023-07-18T16-12-24.842449.json new file mode 100644 index 0000000000000000000000000000000000000000..cffad5c3bb56c2868299628b6f5d52daa4eb7158 --- /dev/null +++ b/eval-results/anton-l/gpt-j-tiny-random/results_2023-07-18T16-12-24.842449.json @@ -0,0 +1,871 @@ +{ + "results": { + "harness|arc:challenge|25": { + "acc": 0.23037542662116042, + "acc_stderr": 0.01230492841874761, + "acc_norm": 0.2636518771331058, + "acc_norm_stderr": 0.01287592915129707 + }, + "harness|hellaswag|10": { + "acc": 0.2559251145190201, + "acc_stderr": 0.004354881005789729, + "acc_norm": 0.25761800438159727, + "acc_norm_stderr": 0.004364287353415464 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.26, + "acc_stderr": 0.04408440022768081, + "acc_norm": 0.26, + "acc_norm_stderr": 0.04408440022768081 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.2518518518518518, + "acc_stderr": 0.037498507091740206, + "acc_norm": 0.2518518518518518, + "acc_norm_stderr": 0.037498507091740206 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.18421052631578946, + "acc_stderr": 0.0315469804508223, + "acc_norm": 0.18421052631578946, + "acc_norm_stderr": 0.0315469804508223 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.26, + "acc_stderr": 0.04408440022768079, + "acc_norm": 0.26, + "acc_norm_stderr": 0.04408440022768079 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.2679245283018868, + "acc_stderr": 0.027257260322494845, + "acc_norm": 0.2679245283018868, + "acc_norm_stderr": 0.027257260322494845 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.2222222222222222, + "acc_stderr": 0.03476590104304134, + "acc_norm": 0.2222222222222222, + "acc_norm_stderr": 0.03476590104304134 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.21, + "acc_stderr": 0.040936018074033256, + "acc_norm": 0.21, + "acc_norm_stderr": 0.040936018074033256 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.15, + "acc_stderr": 0.03588702812826372, + "acc_norm": 0.15, + "acc_norm_stderr": 0.03588702812826372 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.23, + "acc_stderr": 0.04229525846816506, + "acc_norm": 0.23, + "acc_norm_stderr": 0.04229525846816506 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.20809248554913296, + "acc_stderr": 0.0309528902177499, + "acc_norm": 0.20809248554913296, + "acc_norm_stderr": 
0.0309528902177499 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.19607843137254902, + "acc_stderr": 0.03950581861179961, + "acc_norm": 0.19607843137254902, + "acc_norm_stderr": 0.03950581861179961 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.24, + "acc_stderr": 0.042923469599092816, + "acc_norm": 0.24, + "acc_norm_stderr": 0.042923469599092816 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.32340425531914896, + "acc_stderr": 0.030579442773610334, + "acc_norm": 0.32340425531914896, + "acc_norm_stderr": 0.030579442773610334 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.2807017543859649, + "acc_stderr": 0.04227054451232199, + "acc_norm": 0.2807017543859649, + "acc_norm_stderr": 0.04227054451232199 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.2206896551724138, + "acc_stderr": 0.03455930201924811, + "acc_norm": 0.2206896551724138, + "acc_norm_stderr": 0.03455930201924811 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.2566137566137566, + "acc_stderr": 0.022494510767503154, + "acc_norm": 0.2566137566137566, + "acc_norm_stderr": 0.022494510767503154 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.1984126984126984, + "acc_stderr": 0.03567016675276864, + "acc_norm": 0.1984126984126984, + "acc_norm_stderr": 0.03567016675276864 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.31, + "acc_stderr": 0.04648231987117316, + "acc_norm": 0.31, + "acc_norm_stderr": 0.04648231987117316 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.25483870967741934, + "acc_stderr": 0.024790118459332208, + "acc_norm": 0.25483870967741934, + "acc_norm_stderr": 0.024790118459332208 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.270935960591133, + "acc_stderr": 0.031270907132976984, + "acc_norm": 0.270935960591133, + "acc_norm_stderr": 0.031270907132976984 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.23, + "acc_stderr": 0.04229525846816505, + "acc_norm": 0.23, + "acc_norm_stderr": 0.04229525846816505 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.24242424242424243, + "acc_stderr": 0.03346409881055953, + "acc_norm": 0.24242424242424243, + "acc_norm_stderr": 0.03346409881055953 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.21717171717171718, + "acc_stderr": 0.029376616484945637, + "acc_norm": 0.21717171717171718, + "acc_norm_stderr": 0.029376616484945637 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.20725388601036268, + "acc_stderr": 0.02925282329180362, + "acc_norm": 0.20725388601036268, + "acc_norm_stderr": 0.02925282329180362 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.2205128205128205, + "acc_stderr": 0.02102067268082791, + "acc_norm": 0.2205128205128205, + "acc_norm_stderr": 0.02102067268082791 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.26296296296296295, + "acc_stderr": 0.026842057873833706, + "acc_norm": 0.26296296296296295, + "acc_norm_stderr": 0.026842057873833706 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.23109243697478993, + "acc_stderr": 0.027381406927868966, + "acc_norm": 0.23109243697478993, + "acc_norm_stderr": 0.027381406927868966 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.1986754966887417, + "acc_stderr": 0.03257847384436775, + "acc_norm": 0.1986754966887417, + "acc_norm_stderr": 0.03257847384436775 + }, + 
"harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.23669724770642203, + "acc_stderr": 0.01822407811729908, + "acc_norm": 0.23669724770642203, + "acc_norm_stderr": 0.01822407811729908 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.16203703703703703, + "acc_stderr": 0.02513045365226846, + "acc_norm": 0.16203703703703703, + "acc_norm_stderr": 0.02513045365226846 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.23529411764705882, + "acc_stderr": 0.029771775228145628, + "acc_norm": 0.23529411764705882, + "acc_norm_stderr": 0.029771775228145628 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.2616033755274262, + "acc_stderr": 0.028609516716994934, + "acc_norm": 0.2616033755274262, + "acc_norm_stderr": 0.028609516716994934 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.37668161434977576, + "acc_stderr": 0.032521134899291884, + "acc_norm": 0.37668161434977576, + "acc_norm_stderr": 0.032521134899291884 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.22900763358778625, + "acc_stderr": 0.036853466317118506, + "acc_norm": 0.22900763358778625, + "acc_norm_stderr": 0.036853466317118506 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.24793388429752067, + "acc_stderr": 0.03941897526516303, + "acc_norm": 0.24793388429752067, + "acc_norm_stderr": 0.03941897526516303 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.2962962962962963, + "acc_stderr": 0.04414343666854933, + "acc_norm": 0.2962962962962963, + "acc_norm_stderr": 0.04414343666854933 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.24539877300613497, + "acc_stderr": 0.03380939813943354, + "acc_norm": 0.24539877300613497, + "acc_norm_stderr": 0.03380939813943354 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.2857142857142857, + "acc_stderr": 0.042878587513404544, + "acc_norm": 0.2857142857142857, + "acc_norm_stderr": 0.042878587513404544 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.2524271844660194, + "acc_stderr": 0.04301250399690877, + "acc_norm": 0.2524271844660194, + "acc_norm_stderr": 0.04301250399690877 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.2564102564102564, + "acc_stderr": 0.028605953702004253, + "acc_norm": 0.2564102564102564, + "acc_norm_stderr": 0.028605953702004253 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.26, + "acc_stderr": 0.044084400227680794, + "acc_norm": 0.26, + "acc_norm_stderr": 0.044084400227680794 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.28735632183908044, + "acc_stderr": 0.0161824107306827, + "acc_norm": 0.28735632183908044, + "acc_norm_stderr": 0.0161824107306827 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.24566473988439305, + "acc_stderr": 0.02317629820399201, + "acc_norm": 0.24566473988439305, + "acc_norm_stderr": 0.02317629820399201 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.2424581005586592, + "acc_stderr": 0.014333522059217889, + "acc_norm": 0.2424581005586592, + "acc_norm_stderr": 0.014333522059217889 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.22875816993464052, + "acc_stderr": 0.024051029739912258, + "acc_norm": 0.22875816993464052, + "acc_norm_stderr": 0.024051029739912258 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.2733118971061093, + "acc_stderr": 0.02531176597542612, + "acc_norm": 0.2733118971061093, + "acc_norm_stderr": 0.02531176597542612 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 
0.2654320987654321, + "acc_stderr": 0.024569223600460845, + "acc_norm": 0.2654320987654321, + "acc_norm_stderr": 0.024569223600460845 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.2553191489361702, + "acc_stderr": 0.02601199293090201, + "acc_norm": 0.2553191489361702, + "acc_norm_stderr": 0.02601199293090201 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.2392438070404172, + "acc_stderr": 0.010896123652676651, + "acc_norm": 0.2392438070404172, + "acc_norm_stderr": 0.010896123652676651 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.20220588235294118, + "acc_stderr": 0.02439819298665492, + "acc_norm": 0.20220588235294118, + "acc_norm_stderr": 0.02439819298665492 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.2565359477124183, + "acc_stderr": 0.01766784161237899, + "acc_norm": 0.2565359477124183, + "acc_norm_stderr": 0.01766784161237899 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.34545454545454546, + "acc_stderr": 0.04554619617541054, + "acc_norm": 0.34545454545454546, + "acc_norm_stderr": 0.04554619617541054 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.17142857142857143, + "acc_stderr": 0.02412746346265015, + "acc_norm": 0.17142857142857143, + "acc_norm_stderr": 0.02412746346265015 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.23880597014925373, + "acc_stderr": 0.030147775935409224, + "acc_norm": 0.23880597014925373, + "acc_norm_stderr": 0.030147775935409224 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.21, + "acc_stderr": 0.040936018074033256, + "acc_norm": 0.21, + "acc_norm_stderr": 0.040936018074033256 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.3192771084337349, + "acc_stderr": 0.0362933532994786, + "acc_norm": 0.3192771084337349, + "acc_norm_stderr": 0.0362933532994786 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.21052631578947367, + "acc_stderr": 0.0312678171466318, + "acc_norm": 0.21052631578947367, + "acc_norm_stderr": 0.0312678171466318 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.23133414932680538, + "mc1_stderr": 0.014761945174862661, + "mc2": 0.47437931463443955, + "mc2_stderr": 0.015952073432116347 + }, + "all": { + "acc": 0.24457036052718048, + "acc_stderr": 0.031198426586247278, + "acc_norm": 0.24516306121149442, + "acc_norm_stderr": 0.03120826399438584, + "mc1": 0.23133414932680538, + "mc1_stderr": 0.014761945174862661, + "mc2": 0.47437931463443955, + "mc2_stderr": 0.015952073432116347 + } + }, + "versions": { + "harness|arc:challenge|25": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + 
"harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "all": 0 + }, + "config": { + "model_name": "anton-l/gpt-j-tiny-random", + "model_sha": "feea91564dac0081f73aeb6744979c6cfe553fff", + "model_dtype": "torch.float16", + "lighteval_sha": "43cff840721bd0214adb4e29236a5e2ca1813937", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null + }, + "task_config": { + "harness|arc:challenge": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + 
"harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task" + }, + "hashes": { + "harness|arc:challenge|25": { + "hash_examples": "fb8c51b1872daeda", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "1b78325b154497a6", + "hash_cont_tokens": "ed17e576dbafa5da" + }, + "harness|hellaswag|10": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "97de5fb5652ec7fa", + "hash_cont_tokens": "0875c25c8fc0a94d" + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "38f6980885e34dfd", + "hash_cont_tokens": "844bd0bf669e8136" + }, + "harness|hendrycksTest-anatomy|5": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "3ed9431cd09b2a53", + "hash_cont_tokens": "aa3ffb1a6e4356f5" + }, + "harness|hendrycksTest-astronomy|5": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + 
"hash_input_tokens": "a79fd75ecff4dacc", + "hash_cont_tokens": "18cfffb76bc8f0d1" + }, + "harness|hendrycksTest-business_ethics|5": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "178d5666661bf5e1", + "hash_cont_tokens": "844bd0bf669e8136" + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "c926698f7ff06973", + "hash_cont_tokens": "cd61f7de0830a75a" + }, + "harness|hendrycksTest-college_biology|5": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "242f772c5e78312a", + "hash_cont_tokens": "16b3626c8a5e3797" + }, + "harness|hendrycksTest-college_chemistry|5": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "8502d8627d2d7aad", + "hash_cont_tokens": "844bd0bf669e8136" + }, + "harness|hendrycksTest-college_computer_science|5": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "8bf46ce3a98e6e3f", + "hash_cont_tokens": "844bd0bf669e8136" + }, + "harness|hendrycksTest-college_mathematics|5": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "ff09ef7f164943cd", + "hash_cont_tokens": "844bd0bf669e8136" + }, + "harness|hendrycksTest-college_medicine|5": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "af38d1bbc0517ac5", + "hash_cont_tokens": "62bb469d2a319d91" + }, + "harness|hendrycksTest-college_physics|5": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "c4240f372187f487", + "hash_cont_tokens": "bf103c9a1f61ec12" + }, + "harness|hendrycksTest-computer_security|5": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "70a866a1c6ae11ae", + "hash_cont_tokens": "844bd0bf669e8136" + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "29b68a5b3f3afa5f", + "hash_cont_tokens": "ff5ca3d84bb47a0b" + }, + "harness|hendrycksTest-econometrics|5": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "a4a0fc579875cdf9", + "hash_cont_tokens": "21f0989f5760198a" + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "e1c0ec634eb17ebd", + "hash_cont_tokens": "35bf6c0c1a7ee403" + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "542453ad0f99dacf", + "hash_cont_tokens": "f7d801bfd913884d" + }, + "harness|hendrycksTest-formal_logic|5": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "dacff0458f665ef2", + "hash_cont_tokens": "23f9089575432d5a" + }, + "harness|hendrycksTest-global_facts|5": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "61dec75d557c2e93", + "hash_cont_tokens": "844bd0bf669e8136" + }, + "harness|hendrycksTest-high_school_biology|5": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": 
"d0afdf91820cacc8", + "hash_cont_tokens": "04b8293f2ab7fbbf" + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "75cd47b5490da17b", + "hash_cont_tokens": "c3deabee1deab3a3" + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "6c6256000dbf914a", + "hash_cont_tokens": "844bd0bf669e8136" + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "3e24478a8854bd77", + "hash_cont_tokens": "c4f2565ca36881d5" + }, + "harness|hendrycksTest-high_school_geography|5": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "a4866b51f8a7a60e", + "hash_cont_tokens": "780e569058de22be" + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "90f755f89d9fdf5e", + "hash_cont_tokens": "7994d94bfa36d003" + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "fb590ff6d9d11883", + "hash_cont_tokens": "8f5c8baf02161f10" + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "551dbc75535ad2b8", + "hash_cont_tokens": "a2c91752be5b1798" + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "d86fdf5706ec717c", + "hash_cont_tokens": "985403b262df21a4" + }, + "harness|hendrycksTest-high_school_physics|5": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "a81bca26abd92c41", + "hash_cont_tokens": "db71da66ed82b921" + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "9c10077b5cda495b", + "hash_cont_tokens": "e81cf9738ad7e157" + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "092923836e135996", + "hash_cont_tokens": "4a2d5f00cb00d9b7" + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "4ab213491f557f31", + "hash_cont_tokens": "eab825cf8fbdd085" + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "2a04fb615e6717ea", + "hash_cont_tokens": "e9bcfaa6beefb456" + }, + "harness|hendrycksTest-human_aging|5": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "39da19ee58ce07e6", + "hash_cont_tokens": "38eafdb22e9fca11" + }, + "harness|hendrycksTest-human_sexuality|5": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "f7e0441ab1c223e0", + "hash_cont_tokens": "11de075f88fc7cd2" + }, + "harness|hendrycksTest-international_law|5": { + "hash_examples": "1300bfd0dfc59114", + 
"hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "119859c5b8103d0b", + "hash_cont_tokens": "6f8215a3de7eebd1" + }, + "harness|hendrycksTest-jurisprudence|5": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "6ec4910e741606cb", + "hash_cont_tokens": "5c77c6f472688075" + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "96d8b2554f777e3a", + "hash_cont_tokens": "25a46284b3589e0d" + }, + "harness|hendrycksTest-machine_learning|5": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "249811a7d891a411", + "hash_cont_tokens": "aacac708cd4c5a61" + }, + "harness|hendrycksTest-management|5": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "e54df495ffeb4f92", + "hash_cont_tokens": "d37808f586a9e9b5" + }, + "harness|hendrycksTest-marketing|5": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "e9110fe64f420eb5", + "hash_cont_tokens": "95faf210efa02f90" + }, + "harness|hendrycksTest-medical_genetics|5": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "743df5701590c1c5", + "hash_cont_tokens": "844bd0bf669e8136" + }, + "harness|hendrycksTest-miscellaneous|5": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "4a20a40ea36bad2d", + "hash_cont_tokens": "ef1ae838a09a7521" + }, + "harness|hendrycksTest-moral_disputes|5": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "10886977e5516586", + "hash_cont_tokens": "16b6c6e390eb7cea" + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "66f56ab7c3b9d662", + "hash_cont_tokens": "4130880a19c4edb0" + }, + "harness|hendrycksTest-nutrition|5": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "c05c54560499ea35", + "hash_cont_tokens": "96b81f570a84328b" + }, + "harness|hendrycksTest-philosophy|5": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "9639c3d92ff98a28", + "hash_cont_tokens": "dddff9925c9b675a" + }, + "harness|hendrycksTest-prehistory|5": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "91e98834c3a8d8d9", + "hash_cont_tokens": "e3a7592f84b44888" + }, + "harness|hendrycksTest-professional_accounting|5": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "569fa47691c73088", + "hash_cont_tokens": "f9edf462e8201551" + }, + "harness|hendrycksTest-professional_law|5": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "999e8c7cf55b590c", + "hash_cont_tokens": "a2de48df0afbaff7" + }, + "harness|hendrycksTest-professional_medicine|5": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "cb68733b835e69f0", + "hash_cont_tokens": "ecf7754754c2bb76" + }, + "harness|hendrycksTest-professional_psychology|5": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": 
"3aa766c029099569", + "hash_cont_tokens": "30b07e31cf9b5c6f" + }, + "harness|hendrycksTest-public_relations|5": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "87b924f88832986f", + "hash_cont_tokens": "cf3600a50782c6c5" + }, + "harness|hendrycksTest-security_studies|5": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "c2b75c24a925a416", + "hash_cont_tokens": "4d1dc7c4ad251829" + }, + "harness|hendrycksTest-sociology|5": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "fb555df6139eb2c8", + "hash_cont_tokens": "d36b9d9f0f4424fe" + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "56cf1eebb25eccb1", + "hash_cont_tokens": "844bd0bf669e8136" + }, + "harness|hendrycksTest-virology|5": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "c6affac16ec860be", + "hash_cont_tokens": "30d4fa4828c5468f" + }, + "harness|hendrycksTest-world_religions|5": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "d2c5da5a69a6312e", + "hash_cont_tokens": "a0a7af55ac7ae037" + }, + "harness|truthfulqa:mc|0": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "21ee2f46c9c3649e", + "hash_cont_tokens": "84fd36aa004c8578" + } + } +} \ No newline at end of file diff --git a/eval-results/anton-l/gpt-j-tiny-random/results_2023-10-28T06-54-36.859964.json b/eval-results/anton-l/gpt-j-tiny-random/results_2023-10-28T06-54-36.859964.json new file mode 100644 index 0000000000000000000000000000000000000000..a9b96255077dcffdc7ee2ca60effb4b68ec20244 --- /dev/null +++ b/eval-results/anton-l/gpt-j-tiny-random/results_2023-10-28T06-54-36.859964.json @@ -0,0 +1,107 @@ +{ + "config_general": { + "model_name": "anton-l/gpt-j-tiny-random", + "model_sha": "feea91564dac0081f73aeb6744979c6cfe553fff", + "model_size": "150.59 MB", + "model_dtype": "torch.float16", + "lighteval_sha": "0f318ecf002208468154899217b3ba7c6ae09374", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|drop|3": { + "em": 0.0, + "em_stderr": 0.0, + "f1": 9.857382550335573e-05, + "f1_stderr": 2.430375363900546e-05 + }, + "harness|gsm8k|5": { + "acc": 0.0, + "acc_stderr": 0.0 + }, + "harness|winogrande|5": { + "acc": 0.494869771112865, + "acc_stderr": 0.014051745961790516 + }, + "all": { + "em": 0.0, + "em_stderr": 0.0, + "f1": 9.857382550335573e-05, + "f1_stderr": 2.430375363900546e-05, + "acc": 0.2474348855564325, + "acc_stderr": 0.007025872980895258 + } + }, + "versions": { + "harness|drop|3": 1, + "harness|gsm8k|5": 0, + "harness|winogrande|5": 0, + "all": 0 + }, + "config_tasks": { + "harness|drop": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|drop|3": { + "hashes": { + "hash_examples": "1d27416e8324e9a3", + "hash_full_prompts": "a5513ff9a741b385", + "hash_input_tokens": "f21277d2c2d2e06c", + "hash_cont_tokens": "b0906acdaae9f0e4" + }, + "truncated": 382, + "non-truncated": 9154, + "padded": 0, + "non-padded": 9536, + "effective_few_shots": 3.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": 
"4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "3ab9b4c5105492a3", + "hash_cont_tokens": "d6b78d4f8dac1730" + }, + "truncated": 0, + "non-truncated": 1319, + "padded": 0, + "non-padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "84cacac1590bb0a5", + "hash_cont_tokens": "64ca3ed9b5dacc6e" + }, + "truncated": 0, + "non-truncated": 2534, + "padded": 2426, + "non-padded": 108, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "9b4d8993161e637d", + "hash_full_prompts": "08215e527b7e60a5", + "hash_input_tokens": "26cd3631535039d0", + "hash_cont_tokens": "9bcdadc1c7903e21" + }, + "total_evaluation_time_secondes": "8249.86591386795", + "truncated": 382, + "non-truncated": 13007, + "padded": 2426, + "non-padded": 10963, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/ashercn97/giraffe-7b/results_2023-08-02T15-44-19.746565.json b/eval-results/ashercn97/giraffe-7b/results_2023-08-02T15-44-19.746565.json new file mode 100644 index 0000000000000000000000000000000000000000..7155e6608d43c23027766999dfad1fda0b2f4cf9 --- /dev/null +++ b/eval-results/ashercn97/giraffe-7b/results_2023-08-02T15-44-19.746565.json @@ -0,0 +1,1365 @@ +{ + "results": { + "harness|arc:challenge|25": { + "acc": 0.4453924914675768, + "acc_stderr": 0.014523987638344085, + "acc_norm": 0.4718430034129693, + "acc_norm_stderr": 0.014588204105102202 + }, + "harness|hellaswag|10": { + "acc": 0.579964150567616, + "acc_stderr": 0.004925556104679418, + "acc_norm": 0.755327623979287, + "acc_norm_stderr": 0.004290142029921662 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.27, + "acc_stderr": 0.04461960433384741, + "acc_norm": 0.27, + "acc_norm_stderr": 0.04461960433384741 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.48148148148148145, + "acc_stderr": 0.043163785995113245, + "acc_norm": 0.48148148148148145, + "acc_norm_stderr": 0.043163785995113245 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.40131578947368424, + "acc_stderr": 0.039889037033362836, + "acc_norm": 0.40131578947368424, + "acc_norm_stderr": 0.039889037033362836 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.36, + "acc_stderr": 0.048241815132442176, + "acc_norm": 0.36, + "acc_norm_stderr": 0.048241815132442176 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.4490566037735849, + "acc_stderr": 0.030612730713641092, + "acc_norm": 0.4490566037735849, + "acc_norm_stderr": 0.030612730713641092 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.3402777777777778, + "acc_stderr": 0.03962135573486219, + "acc_norm": 0.3402777777777778, + "acc_norm_stderr": 0.03962135573486219 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.15, + "acc_stderr": 0.03588702812826369, + "acc_norm": 0.15, + "acc_norm_stderr": 0.03588702812826369 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.31, + "acc_stderr": 0.04648231987117316, + "acc_norm": 0.31, + "acc_norm_stderr": 0.04648231987117316 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.34, + "acc_stderr": 0.04760952285695236, + "acc_norm": 0.34, + "acc_norm_stderr": 0.04760952285695236 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.3699421965317919, + "acc_stderr": 
0.036812296333943194, + "acc_norm": 0.3699421965317919, + "acc_norm_stderr": 0.036812296333943194 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.19607843137254902, + "acc_stderr": 0.03950581861179962, + "acc_norm": 0.19607843137254902, + "acc_norm_stderr": 0.03950581861179962 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.57, + "acc_stderr": 0.04975698519562428, + "acc_norm": 0.57, + "acc_norm_stderr": 0.04975698519562428 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.33191489361702126, + "acc_stderr": 0.03078373675774566, + "acc_norm": 0.33191489361702126, + "acc_norm_stderr": 0.03078373675774566 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.2807017543859649, + "acc_stderr": 0.04227054451232199, + "acc_norm": 0.2807017543859649, + "acc_norm_stderr": 0.04227054451232199 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.4206896551724138, + "acc_stderr": 0.0411391498118926, + "acc_norm": 0.4206896551724138, + "acc_norm_stderr": 0.0411391498118926 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.26455026455026454, + "acc_stderr": 0.022717467897708624, + "acc_norm": 0.26455026455026454, + "acc_norm_stderr": 0.022717467897708624 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.2857142857142857, + "acc_stderr": 0.0404061017820884, + "acc_norm": 0.2857142857142857, + "acc_norm_stderr": 0.0404061017820884 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.34, + "acc_stderr": 0.04760952285695236, + "acc_norm": 0.34, + "acc_norm_stderr": 0.04760952285695236 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.36774193548387096, + "acc_stderr": 0.02743086657997346, + "acc_norm": 0.36774193548387096, + "acc_norm_stderr": 0.02743086657997346 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.2561576354679803, + "acc_stderr": 0.0307127300709826, + "acc_norm": 0.2561576354679803, + "acc_norm_stderr": 0.0307127300709826 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.36, + "acc_stderr": 0.048241815132442176, + "acc_norm": 0.36, + "acc_norm_stderr": 0.048241815132442176 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.4121212121212121, + "acc_stderr": 0.03843566993588717, + "acc_norm": 0.4121212121212121, + "acc_norm_stderr": 0.03843566993588717 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.4494949494949495, + "acc_stderr": 0.0354413249194797, + "acc_norm": 0.4494949494949495, + "acc_norm_stderr": 0.0354413249194797 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.5233160621761658, + "acc_stderr": 0.03604513672442202, + "acc_norm": 0.5233160621761658, + "acc_norm_stderr": 0.03604513672442202 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.3923076923076923, + "acc_stderr": 0.024756000382130945, + "acc_norm": 0.3923076923076923, + "acc_norm_stderr": 0.024756000382130945 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.2740740740740741, + "acc_stderr": 0.027195934804085622, + "acc_norm": 0.2740740740740741, + "acc_norm_stderr": 0.027195934804085622 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.35294117647058826, + "acc_stderr": 0.031041941304059288, + "acc_norm": 0.35294117647058826, + "acc_norm_stderr": 0.031041941304059288 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.2980132450331126, + "acc_stderr": 0.037345356767871984, + "acc_norm": 
0.2980132450331126, + "acc_norm_stderr": 0.037345356767871984 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.48807339449541287, + "acc_stderr": 0.021431223617362223, + "acc_norm": 0.48807339449541287, + "acc_norm_stderr": 0.021431223617362223 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.20833333333333334, + "acc_stderr": 0.027696910713093936, + "acc_norm": 0.20833333333333334, + "acc_norm_stderr": 0.027696910713093936 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.36764705882352944, + "acc_stderr": 0.03384132045674118, + "acc_norm": 0.36764705882352944, + "acc_norm_stderr": 0.03384132045674118 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.4810126582278481, + "acc_stderr": 0.032523751480904466, + "acc_norm": 0.4810126582278481, + "acc_norm_stderr": 0.032523751480904466 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.45739910313901344, + "acc_stderr": 0.033435777055830646, + "acc_norm": 0.45739910313901344, + "acc_norm_stderr": 0.033435777055830646 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.5038167938931297, + "acc_stderr": 0.043851623256015534, + "acc_norm": 0.5038167938931297, + "acc_norm_stderr": 0.043851623256015534 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.512396694214876, + "acc_stderr": 0.04562951548180765, + "acc_norm": 0.512396694214876, + "acc_norm_stderr": 0.04562951548180765 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.42592592592592593, + "acc_stderr": 0.0478034362693679, + "acc_norm": 0.42592592592592593, + "acc_norm_stderr": 0.0478034362693679 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.4171779141104294, + "acc_stderr": 0.038741028598180814, + "acc_norm": 0.4171779141104294, + "acc_norm_stderr": 0.038741028598180814 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.32142857142857145, + "acc_stderr": 0.044328040552915185, + "acc_norm": 0.32142857142857145, + "acc_norm_stderr": 0.044328040552915185 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.4563106796116505, + "acc_stderr": 0.049318019942204146, + "acc_norm": 0.4563106796116505, + "acc_norm_stderr": 0.049318019942204146 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.6025641025641025, + "acc_stderr": 0.03205953453789293, + "acc_norm": 0.6025641025641025, + "acc_norm_stderr": 0.03205953453789293 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.45, + "acc_stderr": 0.05, + "acc_norm": 0.45, + "acc_norm_stderr": 0.05 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.5172413793103449, + "acc_stderr": 0.017869330154003705, + "acc_norm": 0.5172413793103449, + "acc_norm_stderr": 0.017869330154003705 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.41329479768786126, + "acc_stderr": 0.026511261369409247, + "acc_norm": 0.41329479768786126, + "acc_norm_stderr": 0.026511261369409247 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.27039106145251396, + "acc_stderr": 0.014854993938010073, + "acc_norm": 0.27039106145251396, + "acc_norm_stderr": 0.014854993938010073 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.4215686274509804, + "acc_stderr": 0.02827549015679143, + "acc_norm": 0.4215686274509804, + "acc_norm_stderr": 0.02827549015679143 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.4405144694533762, + "acc_stderr": 0.028196400574197426, + "acc_norm": 0.4405144694533762, + "acc_norm_stderr": 0.028196400574197426 + }, + 
"harness|hendrycksTest-prehistory|5": { + "acc": 0.4228395061728395, + "acc_stderr": 0.027487472980871598, + "acc_norm": 0.4228395061728395, + "acc_norm_stderr": 0.027487472980871598 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.3404255319148936, + "acc_stderr": 0.028267657482650147, + "acc_norm": 0.3404255319148936, + "acc_norm_stderr": 0.028267657482650147 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.2900912646675359, + "acc_stderr": 0.011590375554733095, + "acc_norm": 0.2900912646675359, + "acc_norm_stderr": 0.011590375554733095 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.3382352941176471, + "acc_stderr": 0.02873932851398358, + "acc_norm": 0.3382352941176471, + "acc_norm_stderr": 0.02873932851398358 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.3758169934640523, + "acc_stderr": 0.019594021136577443, + "acc_norm": 0.3758169934640523, + "acc_norm_stderr": 0.019594021136577443 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.44545454545454544, + "acc_stderr": 0.047605488214603246, + "acc_norm": 0.44545454545454544, + "acc_norm_stderr": 0.047605488214603246 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.39183673469387753, + "acc_stderr": 0.031251275910891656, + "acc_norm": 0.39183673469387753, + "acc_norm_stderr": 0.031251275910891656 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.5074626865671642, + "acc_stderr": 0.03535140084276719, + "acc_norm": 0.5074626865671642, + "acc_norm_stderr": 0.03535140084276719 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.54, + "acc_stderr": 0.05009082659620333, + "acc_norm": 0.54, + "acc_norm_stderr": 0.05009082659620333 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.3795180722891566, + "acc_stderr": 0.03777798822748017, + "acc_norm": 0.3795180722891566, + "acc_norm_stderr": 0.03777798822748017 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.5321637426900585, + "acc_stderr": 0.03826882417660368, + "acc_norm": 0.5321637426900585, + "acc_norm_stderr": 0.03826882417660368 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.2558139534883721, + "mc1_stderr": 0.015274176219283361, + "mc2": 0.3847837297072023, + "mc2_stderr": 0.014407487285517882 + }, + "all": { + "acc": 0.3930879439944123, + "acc_stderr": 0.03484097392739301, + "acc_norm": 0.3965085200174134, + "acc_norm_stderr": 0.034831292612003184, + "mc1": 0.2558139534883721, + "mc1_stderr": 0.015274176219283361, + "mc2": 0.3847837297072023, + "mc2_stderr": 0.014407487285517882 + } + }, + "versions": { + "harness|arc:challenge|25": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + 
"harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "all": 0 + }, + "config_general": { + "model_name": "ashercn97/giraffe-7b", + "model_sha": "9af88449bed5be4709befcfbbba123ee75805479", + "model_dtype": "torch.float16", + "lighteval_sha": "03c2fad20ff7f5334c33cfee459024b8d7e4a109", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null + }, + "config_tasks": { + "harness|arc:challenge": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + 
"harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task" + }, + "summary_tasks": { + "harness|arc:challenge|25": { + "hashes": { + "hash_examples": "17b0cae357c0259e", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "7cefb32e2563a8e3", + "hash_cont_tokens": "69111ccf8c982ca3" + }, + "truncated": 0, + "non-truncated": 4687, + "padded": 4687, + "non-padded": 0, + "effective_few_shots": 25.0, + "num_truncated_few_shots": 0 + }, + "harness|hellaswag|10": { + "hashes": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "e4a72fc2bbea66ff", + "hash_cont_tokens": "95e9e7b994fc9459" + }, + "truncated": 0, + "non-truncated": 40168, + "padded": 40144, + "non-padded": 24, + "effective_few_shots": 10.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hashes": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": 
"1430bf2cb1d054e2", + "hash_cont_tokens": "ce26aac83e938006" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-anatomy|5": { + "hashes": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "c4f45f8ebf944893", + "hash_cont_tokens": "1d81fa80e3039a08" + }, + "truncated": 0, + "non-truncated": 540, + "padded": 540, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-astronomy|5": { + "hashes": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "7b6c0659a104d6af", + "hash_cont_tokens": "66af3c333e2e33b4" + }, + "truncated": 0, + "non-truncated": 608, + "padded": 608, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-business_ethics|5": { + "hashes": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "ca33ffee63980ac1", + "hash_cont_tokens": "aaaffbddbbdeecf6" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hashes": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "a6aba95384c46b37", + "hash_cont_tokens": "26e3b69d5fb27bb2" + }, + "truncated": 0, + "non-truncated": 1060, + "padded": 1060, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_biology|5": { + "hashes": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "95d92a1a2c158e2c", + "hash_cont_tokens": "439194ce25a22be1" + }, + "truncated": 0, + "non-truncated": 576, + "padded": 576, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_chemistry|5": { + "hashes": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "70284e3c06933186", + "hash_cont_tokens": "61d2a6a419b64891" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_computer_science|5": { + "hashes": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "028608b4301fcfd2", + "hash_cont_tokens": "c6e8af4875843f62" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_mathematics|5": { + "hashes": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "02619f96ae20cf1e", + "hash_cont_tokens": "16dc0a68339e577b" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_medicine|5": { + "hashes": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "0282a73e02cf4b34", + "hash_cont_tokens": "0002f8908e2c5604" + }, + "truncated": 0, + "non-truncated": 692, + "padded": 692, + "non-padded": 0, + 
"effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_physics|5": { + "hashes": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "5d0425cf2abddd51", + "hash_cont_tokens": "e76629783418737c" + }, + "truncated": 0, + "non-truncated": 408, + "padded": 408, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-computer_security|5": { + "hashes": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "560574f683641143", + "hash_cont_tokens": "ce26aac83e938006" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hashes": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "dc3987c35bc329e5", + "hash_cont_tokens": "29089b8b7020611e" + }, + "truncated": 0, + "non-truncated": 940, + "padded": 940, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-econometrics|5": { + "hashes": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "be83fdd674b48356", + "hash_cont_tokens": "1a48dc73e5858180" + }, + "truncated": 0, + "non-truncated": 456, + "padded": 456, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hashes": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "00155bf1a1a1ebc7", + "hash_cont_tokens": "70817a7ac9f44af2" + }, + "truncated": 0, + "non-truncated": 580, + "padded": 580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hashes": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "ce05b52b00498cf6", + "hash_cont_tokens": "5f0fe4a20633fc93" + }, + "truncated": 0, + "non-truncated": 1512, + "padded": 1512, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-formal_logic|5": { + "hashes": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "728bd41242158358", + "hash_cont_tokens": "f6e9cfb72237b427" + }, + "truncated": 0, + "non-truncated": 504, + "padded": 504, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-global_facts|5": { + "hashes": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "190511206bf21530", + "hash_cont_tokens": "ce26aac83e938006" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_biology|5": { + "hashes": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "2bc219567947ac68", + "hash_cont_tokens": "b433f62158dd2580" + }, + "truncated": 0, + "non-truncated": 1240, + "padded": 1240, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hashes": { + "hash_examples": 
"44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "8477b93b8643d23f", + "hash_cont_tokens": "684af197bf78c021" + }, + "truncated": 0, + "non-truncated": 812, + "padded": 812, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hashes": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "0e15ea7b43890b3c", + "hash_cont_tokens": "54a0f1c97373f6fc" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hashes": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "142b719c7d7d4fe0", + "hash_cont_tokens": "91dc522e4e4e91c3" + }, + "truncated": 660, + "non-truncated": 0, + "padded": 0, + "non-padded": 660, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_geography|5": { + "hashes": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "4bf76efe7796945e", + "hash_cont_tokens": "f275c901b3d285f9" + }, + "truncated": 0, + "non-truncated": 792, + "padded": 792, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hashes": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "e3a453e5fb044f52", + "hash_cont_tokens": "0bd598173199fc25" + }, + "truncated": 0, + "non-truncated": 772, + "padded": 772, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hashes": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "f47a1c2b0c018aff", + "hash_cont_tokens": "39a93706184f896b" + }, + "truncated": 0, + "non-truncated": 1560, + "padded": 1560, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hashes": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "35bc9ee85a563c15", + "hash_cont_tokens": "f0399631229c4bbe" + }, + "truncated": 0, + "non-truncated": 1080, + "padded": 1080, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hashes": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "62a083d4ceb83864", + "hash_cont_tokens": "28c1f7c11bf85409" + }, + "truncated": 0, + "non-truncated": 952, + "padded": 952, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_physics|5": { + "hashes": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "cd96d409604783e4", + "hash_cont_tokens": "8c47901880333cb3" + }, + "truncated": 0, + "non-truncated": 604, + "padded": 604, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hashes": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + 
"hash_input_tokens": "3c716ffc27f83e15", + "hash_cont_tokens": "f249c949ec94fca0" + }, + "truncated": 0, + "non-truncated": 2180, + "padded": 2180, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hashes": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "fd8217f7edf722f8", + "hash_cont_tokens": "ddd1c111a92fc7bb" + }, + "truncated": 0, + "non-truncated": 864, + "padded": 864, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hashes": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "a54112084a848a44", + "hash_cont_tokens": "2529d55ec490f81f" + }, + "truncated": 816, + "non-truncated": 0, + "padded": 0, + "non-padded": 816, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hashes": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "89cf33fb840f27be", + "hash_cont_tokens": "b34590804e071493" + }, + "truncated": 0, + "non-truncated": 948, + "padded": 948, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_aging|5": { + "hashes": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "0a2b6ab3ae0e3b7c", + "hash_cont_tokens": "92acdd467ed943e1" + }, + "truncated": 0, + "non-truncated": 892, + "padded": 892, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_sexuality|5": { + "hashes": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "f28777a6fdce1d2b", + "hash_cont_tokens": "a6034ed95a124315" + }, + "truncated": 0, + "non-truncated": 524, + "padded": 524, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-international_law|5": { + "hashes": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "8282921a7a07bd5a", + "hash_cont_tokens": "74ff4b135356f4df" + }, + "truncated": 0, + "non-truncated": 484, + "padded": 484, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-jurisprudence|5": { + "hashes": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "3aa62568b80ee7ca", + "hash_cont_tokens": "7c8e30f486ff156a" + }, + "truncated": 0, + "non-truncated": 432, + "padded": 432, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hashes": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "731b1d04f2da3d9a", + "hash_cont_tokens": "a457f0c06facf520" + }, + "truncated": 0, + "non-truncated": 652, + "padded": 652, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-machine_learning|5": { + "hashes": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "96e1af14c8358ac2", + "hash_cont_tokens": "64c3774d71dc7eb8" + }, + "truncated": 0, + "non-truncated": 448, + "padded": 448, 
+ "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-management|5": { + "hashes": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "bc2e4bf4e7cf5c39", + "hash_cont_tokens": "66b726b356a02feb" + }, + "truncated": 0, + "non-truncated": 412, + "padded": 412, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-marketing|5": { + "hashes": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "abed130d5c3867a4", + "hash_cont_tokens": "f08457005b652d25" + }, + "truncated": 0, + "non-truncated": 936, + "padded": 936, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-medical_genetics|5": { + "hashes": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "83d7d50bc2ebab43", + "hash_cont_tokens": "ce26aac83e938006" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-miscellaneous|5": { + "hashes": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "57004a232a08258a", + "hash_cont_tokens": "647bcbd68f292558" + }, + "truncated": 0, + "non-truncated": 3132, + "padded": 3132, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_disputes|5": { + "hashes": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "bb9518d436087f70", + "hash_cont_tokens": "5a7b498edf3beb7f" + }, + "truncated": 0, + "non-truncated": 1384, + "padded": 1365, + "non-padded": 19, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hashes": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "3edebd0b46a85682", + "hash_cont_tokens": "1999ef9e9c46608f" + }, + "truncated": 0, + "non-truncated": 3580, + "padded": 3580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-nutrition|5": { + "hashes": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "815607301732a13f", + "hash_cont_tokens": "6017425ca4648660" + }, + "truncated": 0, + "non-truncated": 1224, + "padded": 1224, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-philosophy|5": { + "hashes": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "952254859587db3e", + "hash_cont_tokens": "6e39384b9c0a8cc2" + }, + "truncated": 0, + "non-truncated": 1244, + "padded": 1244, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-prehistory|5": { + "hashes": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "1429d150f124f76e", + "hash_cont_tokens": "87b66d935a56bb5e" + }, + "truncated": 0, + "non-truncated": 1296, + "padded": 1296, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_accounting|5": { + "hashes": { + "hash_examples": 
"8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "9f8bfa3b87b58a38", + "hash_cont_tokens": "e7d0d323ac74ab59" + }, + "truncated": 0, + "non-truncated": 1128, + "padded": 1128, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_law|5": { + "hashes": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "f638aace411a0bd9", + "hash_cont_tokens": "0ff990d9cc38024d" + }, + "truncated": 168, + "non-truncated": 5968, + "padded": 5968, + "non-padded": 168, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_medicine|5": { + "hashes": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "c0f160879d378d4d", + "hash_cont_tokens": "a271b36d0db8278e" + }, + "truncated": 0, + "non-truncated": 1088, + "padded": 1088, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_psychology|5": { + "hashes": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "548450e483004f15", + "hash_cont_tokens": "defde1e859d464f7" + }, + "truncated": 0, + "non-truncated": 2448, + "padded": 2448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-public_relations|5": { + "hashes": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "47f43ebfaa773712", + "hash_cont_tokens": "14bc759bc8de7252" + }, + "truncated": 0, + "non-truncated": 440, + "padded": 440, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-security_studies|5": { + "hashes": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "0350ab02a3d50c5f", + "hash_cont_tokens": "b708a77b01f2529c" + }, + "truncated": 0, + "non-truncated": 980, + "padded": 980, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-sociology|5": { + "hashes": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "e010003b38f6d86a", + "hash_cont_tokens": "b4962d9e583b12c0" + }, + "truncated": 0, + "non-truncated": 804, + "padded": 804, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hashes": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "99959731e92e9eb1", + "hash_cont_tokens": "e19f8e17c9c18790" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-virology|5": { + "hashes": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "841a69043fcd7645", + "hash_cont_tokens": "397a75462a9735e3" + }, + "truncated": 0, + "non-truncated": 664, + "padded": 664, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-world_religions|5": { + "hashes": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "6faa0998b440e497", + "hash_cont_tokens": "6e5059a6697f3e71" + }, 
+ "truncated": 0, + "non-truncated": 684, + "padded": 684, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|truthfulqa:mc|0": { + "hashes": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "fe347abbeff2a4c1", + "hash_cont_tokens": "a48530ac09baa92c" + }, + "truncated": 0, + "non-truncated": 9996, + "padded": 9996, + "non-padded": 0, + "effective_few_shots": 0.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "d84d18e9a963753d", + "hash_full_prompts": "12b540783521a8e6", + "hash_input_tokens": "3f79e8edf26f0efd", + "hash_cont_tokens": "ce4faf0c896cc73e" + }, + "total_evaluation_time_secondes": "2515.82621717453", + "truncated": 1644, + "non-truncated": 109375, + "padded": 109332, + "non-padded": 1687, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/ashercn97/giraffe-7b/results_2023-09-22T20-53-47.065964.json b/eval-results/ashercn97/giraffe-7b/results_2023-09-22T20-53-47.065964.json new file mode 100644 index 0000000000000000000000000000000000000000..5ee5478c99a1399b7e31cbf82d8570234d3eb178 --- /dev/null +++ b/eval-results/ashercn97/giraffe-7b/results_2023-09-22T20-53-47.065964.json @@ -0,0 +1,107 @@ +{ + "config_general": { + "model_name": "ashercn97/giraffe-7b", + "model_sha": "9af88449bed5be4709befcfbbba123ee75805479", + "model_size": "12.58 GB", + "model_dtype": "torch.float16", + "lighteval_sha": "0f318ecf002208468154899217b3ba7c6ae09374", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|drop|3": { + "em": 0.00388003355704698, + "em_stderr": 0.0006366682825520032, + "f1": 0.06388317953020159, + "f1_stderr": 0.0014760537495948263 + }, + "harness|gsm8k|5": { + "acc": 0.026535253980288095, + "acc_stderr": 0.004427045987265172 + }, + "harness|winogrande|5": { + "acc": 0.6898184688239937, + "acc_stderr": 0.013000454144859902 + }, + "all": { + "em": 0.00388003355704698, + "em_stderr": 0.0006366682825520032, + "f1": 0.06388317953020159, + "f1_stderr": 0.0014760537495948263, + "acc": 0.3581768614021409, + "acc_stderr": 0.008713750066062537 + } + }, + "versions": { + "harness|drop|3": 1, + "harness|gsm8k|5": 0, + "harness|winogrande|5": 0, + "all": 0 + }, + "config_tasks": { + "harness|drop": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|drop|3": { + "hashes": { + "hash_examples": "1d27416e8324e9a3", + "hash_full_prompts": "a5513ff9a741b385", + "hash_input_tokens": "a65c9eacad86ea52", + "hash_cont_tokens": "622e1f141a84e228" + }, + "truncated": 980, + "non-truncated": 8556, + "padded": 0, + "non-padded": 9536, + "effective_few_shots": 3.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "bf7d8c6b5e4f7948", + "hash_cont_tokens": "704db9879078d09c" + }, + "truncated": 0, + "non-truncated": 1319, + "padded": 0, + "non-padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "647d8b2cafc100bc", + "hash_cont_tokens": "828897df1f4f08a1" + }, + "truncated": 0, + "non-truncated": 2534, + "padded": 2433, + "non-padded": 101, 
+ "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "9b4d8993161e637d", + "hash_full_prompts": "08215e527b7e60a5", + "hash_input_tokens": "a65e1c92b9137d17", + "hash_cont_tokens": "8e0ee06260df6f06" + }, + "total_evaluation_time_secondes": "10384.346270084381", + "truncated": 980, + "non-truncated": 12409, + "padded": 2433, + "non-padded": 10956, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/ashercn97/manatee-7b/results_2023-08-02T16-08-56.879142.json b/eval-results/ashercn97/manatee-7b/results_2023-08-02T16-08-56.879142.json new file mode 100644 index 0000000000000000000000000000000000000000..779247ed60e077c329252e871847e0260a8306d4 --- /dev/null +++ b/eval-results/ashercn97/manatee-7b/results_2023-08-02T16-08-56.879142.json @@ -0,0 +1,1365 @@ +{ + "results": { + "harness|arc:challenge|25": { + "acc": 0.514505119453925, + "acc_stderr": 0.014605241081370056, + "acc_norm": 0.5452218430034129, + "acc_norm_stderr": 0.014551507060836355 + }, + "harness|hellaswag|10": { + "acc": 0.5987851025692094, + "acc_stderr": 0.0048914265333906285, + "acc_norm": 0.7894841665006971, + "acc_norm_stderr": 0.00406841841727567 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.26, + "acc_stderr": 0.04408440022768081, + "acc_norm": 0.26, + "acc_norm_stderr": 0.04408440022768081 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.5185185185185185, + "acc_stderr": 0.04316378599511324, + "acc_norm": 0.5185185185185185, + "acc_norm_stderr": 0.04316378599511324 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.4144736842105263, + "acc_stderr": 0.04008973785779206, + "acc_norm": 0.4144736842105263, + "acc_norm_stderr": 0.04008973785779206 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.55, + "acc_stderr": 0.049999999999999996, + "acc_norm": 0.55, + "acc_norm_stderr": 0.049999999999999996 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.5132075471698113, + "acc_stderr": 0.030762134874500476, + "acc_norm": 0.5132075471698113, + "acc_norm_stderr": 0.030762134874500476 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.4861111111111111, + "acc_stderr": 0.041795966175810016, + "acc_norm": 0.4861111111111111, + "acc_norm_stderr": 0.041795966175810016 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.36, + "acc_stderr": 0.048241815132442176, + "acc_norm": 0.36, + "acc_norm_stderr": 0.048241815132442176 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.4, + "acc_stderr": 0.049236596391733084, + "acc_norm": 0.4, + "acc_norm_stderr": 0.049236596391733084 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.32, + "acc_stderr": 0.04688261722621505, + "acc_norm": 0.32, + "acc_norm_stderr": 0.04688261722621505 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.43352601156069365, + "acc_stderr": 0.03778621079092055, + "acc_norm": 0.43352601156069365, + "acc_norm_stderr": 0.03778621079092055 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.20588235294117646, + "acc_stderr": 0.04023382273617747, + "acc_norm": 0.20588235294117646, + "acc_norm_stderr": 0.04023382273617747 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.58, + "acc_stderr": 0.04960449637488583, + "acc_norm": 0.58, + "acc_norm_stderr": 0.04960449637488583 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.4297872340425532, + "acc_stderr": 0.03236214467715563, + "acc_norm": 
0.4297872340425532, + "acc_norm_stderr": 0.03236214467715563 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.2807017543859649, + "acc_stderr": 0.042270544512322004, + "acc_norm": 0.2807017543859649, + "acc_norm_stderr": 0.042270544512322004 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.496551724137931, + "acc_stderr": 0.04166567577101579, + "acc_norm": 0.496551724137931, + "acc_norm_stderr": 0.04166567577101579 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.29894179894179895, + "acc_stderr": 0.023577604791655805, + "acc_norm": 0.29894179894179895, + "acc_norm_stderr": 0.023577604791655805 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.3333333333333333, + "acc_stderr": 0.04216370213557835, + "acc_norm": 0.3333333333333333, + "acc_norm_stderr": 0.04216370213557835 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.4, + "acc_stderr": 0.04923659639173309, + "acc_norm": 0.4, + "acc_norm_stderr": 0.04923659639173309 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.5516129032258065, + "acc_stderr": 0.02829205683011273, + "acc_norm": 0.5516129032258065, + "acc_norm_stderr": 0.02829205683011273 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.3891625615763547, + "acc_stderr": 0.03430462416103872, + "acc_norm": 0.3891625615763547, + "acc_norm_stderr": 0.03430462416103872 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.44, + "acc_stderr": 0.04988876515698589, + "acc_norm": 0.44, + "acc_norm_stderr": 0.04988876515698589 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.6181818181818182, + "acc_stderr": 0.037937131711656344, + "acc_norm": 0.6181818181818182, + "acc_norm_stderr": 0.037937131711656344 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.6111111111111112, + "acc_stderr": 0.0347327959083696, + "acc_norm": 0.6111111111111112, + "acc_norm_stderr": 0.0347327959083696 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.7046632124352331, + "acc_stderr": 0.03292296639155141, + "acc_norm": 0.7046632124352331, + "acc_norm_stderr": 0.03292296639155141 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.46153846153846156, + "acc_stderr": 0.025275892070240634, + "acc_norm": 0.46153846153846156, + "acc_norm_stderr": 0.025275892070240634 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.2740740740740741, + "acc_stderr": 0.027195934804085622, + "acc_norm": 0.2740740740740741, + "acc_norm_stderr": 0.027195934804085622 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.4579831932773109, + "acc_stderr": 0.03236361111951941, + "acc_norm": 0.4579831932773109, + "acc_norm_stderr": 0.03236361111951941 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.304635761589404, + "acc_stderr": 0.03757949922943342, + "acc_norm": 0.304635761589404, + "acc_norm_stderr": 0.03757949922943342 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.6568807339449542, + "acc_stderr": 0.02035477773608604, + "acc_norm": 0.6568807339449542, + "acc_norm_stderr": 0.02035477773608604 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.37962962962962965, + "acc_stderr": 0.03309682581119035, + "acc_norm": 0.37962962962962965, + "acc_norm_stderr": 0.03309682581119035 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.6813725490196079, + "acc_stderr": 0.03270287181482081, + "acc_norm": 
0.6813725490196079, + "acc_norm_stderr": 0.03270287181482081 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.6666666666666666, + "acc_stderr": 0.030685820596610812, + "acc_norm": 0.6666666666666666, + "acc_norm_stderr": 0.030685820596610812 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.5919282511210763, + "acc_stderr": 0.03298574607842821, + "acc_norm": 0.5919282511210763, + "acc_norm_stderr": 0.03298574607842821 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.5419847328244275, + "acc_stderr": 0.04369802690578756, + "acc_norm": 0.5419847328244275, + "acc_norm_stderr": 0.04369802690578756 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.628099173553719, + "acc_stderr": 0.044120158066245044, + "acc_norm": 0.628099173553719, + "acc_norm_stderr": 0.044120158066245044 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.5277777777777778, + "acc_stderr": 0.048262172941398944, + "acc_norm": 0.5277777777777778, + "acc_norm_stderr": 0.048262172941398944 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.5214723926380368, + "acc_stderr": 0.03924746876751129, + "acc_norm": 0.5214723926380368, + "acc_norm_stderr": 0.03924746876751129 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.3392857142857143, + "acc_stderr": 0.0449394906861354, + "acc_norm": 0.3392857142857143, + "acc_norm_stderr": 0.0449394906861354 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.6796116504854369, + "acc_stderr": 0.04620284082280041, + "acc_norm": 0.6796116504854369, + "acc_norm_stderr": 0.04620284082280041 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.7435897435897436, + "acc_stderr": 0.028605953702004253, + "acc_norm": 0.7435897435897436, + "acc_norm_stderr": 0.028605953702004253 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.58, + "acc_stderr": 0.049604496374885836, + "acc_norm": 0.58, + "acc_norm_stderr": 0.049604496374885836 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.6781609195402298, + "acc_stderr": 0.0167063814150579, + "acc_norm": 0.6781609195402298, + "acc_norm_stderr": 0.0167063814150579 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.546242774566474, + "acc_stderr": 0.02680372058320618, + "acc_norm": 0.546242774566474, + "acc_norm_stderr": 0.02680372058320618 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.2782122905027933, + "acc_stderr": 0.014987325439963556, + "acc_norm": 0.2782122905027933, + "acc_norm_stderr": 0.014987325439963556 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.5359477124183006, + "acc_stderr": 0.02855582751652878, + "acc_norm": 0.5359477124183006, + "acc_norm_stderr": 0.02855582751652878 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.617363344051447, + "acc_stderr": 0.02760468902858199, + "acc_norm": 0.617363344051447, + "acc_norm_stderr": 0.02760468902858199 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.5154320987654321, + "acc_stderr": 0.027807490044276198, + "acc_norm": 0.5154320987654321, + "acc_norm_stderr": 0.027807490044276198 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.39361702127659576, + "acc_stderr": 0.029144544781596147, + "acc_norm": 0.39361702127659576, + "acc_norm_stderr": 0.029144544781596147 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.37809647979139505, + "acc_stderr": 0.012384878406798095, + "acc_norm": 0.37809647979139505, + "acc_norm_stderr": 0.012384878406798095 + }, + 
"harness|hendrycksTest-professional_medicine|5": { + "acc": 0.5257352941176471, + "acc_stderr": 0.03033257809455504, + "acc_norm": 0.5257352941176471, + "acc_norm_stderr": 0.03033257809455504 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.47549019607843135, + "acc_stderr": 0.020203517280261443, + "acc_norm": 0.47549019607843135, + "acc_norm_stderr": 0.020203517280261443 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.5909090909090909, + "acc_stderr": 0.04709306978661895, + "acc_norm": 0.5909090909090909, + "acc_norm_stderr": 0.04709306978661895 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.5142857142857142, + "acc_stderr": 0.03199615232806286, + "acc_norm": 0.5142857142857142, + "acc_norm_stderr": 0.03199615232806286 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.6666666666666666, + "acc_stderr": 0.03333333333333334, + "acc_norm": 0.6666666666666666, + "acc_norm_stderr": 0.03333333333333334 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.65, + "acc_stderr": 0.047937248544110196, + "acc_norm": 0.65, + "acc_norm_stderr": 0.047937248544110196 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.39759036144578314, + "acc_stderr": 0.038099730845402184, + "acc_norm": 0.39759036144578314, + "acc_norm_stderr": 0.038099730845402184 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.6842105263157895, + "acc_stderr": 0.03565079670708311, + "acc_norm": 0.6842105263157895, + "acc_norm_stderr": 0.03565079670708311 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.32068543451652387, + "mc1_stderr": 0.0163391703732809, + "mc2": 0.46772048615027084, + "mc2_stderr": 0.015035643780659039 + }, + "all": { + "acc": 0.4948058965365206, + "acc_stderr": 0.035191453076268246, + "acc_norm": 0.4985587064938253, + "acc_norm_stderr": 0.03517659304005386, + "mc1": 0.32068543451652387, + "mc1_stderr": 0.0163391703732809, + "mc2": 0.46772048615027084, + "mc2_stderr": 0.015035643780659039 + } + }, + "versions": { + "harness|arc:challenge|25": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + 
"harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "all": 0 + }, + "config_general": { + "model_name": "ashercn97/manatee-7b", + "model_sha": "e66094c43ffe6c5b3f4164cd4ba048d3bc422fd0", + "model_dtype": "torch.float16", + "lighteval_sha": "03c2fad20ff7f5334c33cfee459024b8d7e4a109", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null + }, + "config_tasks": { + "harness|arc:challenge": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + 
"harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task" + }, + "summary_tasks": { + "harness|arc:challenge|25": { + "hashes": { + "hash_examples": "17b0cae357c0259e", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "3722289b79076c44", + "hash_cont_tokens": "8210decc6ff6f7df" + }, + "truncated": 0, + "non-truncated": 4687, + "padded": 4687, + "non-padded": 0, + "effective_few_shots": 25.0, + "num_truncated_few_shots": 0 + }, + "harness|hellaswag|10": { + "hashes": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "ececd684171f1ef2", + "hash_cont_tokens": "b3b9e9017afa63af" + }, + "truncated": 0, + "non-truncated": 40168, + "padded": 40113, + "non-padded": 55, + "effective_few_shots": 10.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hashes": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "c54ff61ad0273dd7", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-anatomy|5": { + "hashes": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "be31a1e22aef5f90", + "hash_cont_tokens": "f11971a765cb609f" + }, + "truncated": 0, + "non-truncated": 540, + "padded": 540, + "non-padded": 0, + "effective_few_shots": 5.0, + 
"num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-astronomy|5": { + "hashes": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "277a7b1fad566940", + "hash_cont_tokens": "bf30e5d3f48250cb" + }, + "truncated": 0, + "non-truncated": 608, + "padded": 608, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-business_ethics|5": { + "hashes": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "ba552605bc116de5", + "hash_cont_tokens": "bc1dd9b2d995eb61" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hashes": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "428c7563d0b98ab9", + "hash_cont_tokens": "890a119624b3b935" + }, + "truncated": 0, + "non-truncated": 1060, + "padded": 1060, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_biology|5": { + "hashes": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "da036601573942e2", + "hash_cont_tokens": "875cde3af7a0ee14" + }, + "truncated": 0, + "non-truncated": 576, + "padded": 576, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_chemistry|5": { + "hashes": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "94e0196d6aded13d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_computer_science|5": { + "hashes": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "6e4d0f4a8d36690b", + "hash_cont_tokens": "ffc0fe414cdc4a83" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_mathematics|5": { + "hashes": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "614054d17109a25d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_medicine|5": { + "hashes": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "081bb2b524defd1c", + "hash_cont_tokens": "1f88b00d41957d82" + }, + "truncated": 0, + "non-truncated": 692, + "padded": 692, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_physics|5": { + "hashes": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "5421d9a1af86cbd4", + "hash_cont_tokens": "f7b8097afc16a47c" + }, + "truncated": 0, + "non-truncated": 408, + "padded": 408, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-computer_security|5": { + "hashes": { + "hash_examples": "006451eedc0ededb", + 
"hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "5e6b70ecb333cf18", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hashes": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "c2ef11a87264ceed", + "hash_cont_tokens": "aa0e8bc655f2f641" + }, + "truncated": 0, + "non-truncated": 940, + "padded": 940, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-econometrics|5": { + "hashes": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "ecaccd912a4c3978", + "hash_cont_tokens": "bfb7e3c3c88313f1" + }, + "truncated": 0, + "non-truncated": 456, + "padded": 456, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hashes": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "1590c84291399be8", + "hash_cont_tokens": "2425a3f084a591ef" + }, + "truncated": 0, + "non-truncated": 580, + "padded": 580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hashes": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "3269597f715b0da1", + "hash_cont_tokens": "f52691aef15a407b" + }, + "truncated": 0, + "non-truncated": 1512, + "padded": 1512, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-formal_logic|5": { + "hashes": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "a2800d20f3ab8d7c", + "hash_cont_tokens": "f515d598d9c21263" + }, + "truncated": 0, + "non-truncated": 504, + "padded": 504, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-global_facts|5": { + "hashes": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "94ed44b3772505ad", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_biology|5": { + "hashes": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "24423acb928db768", + "hash_cont_tokens": "bd85a4156a3613ee" + }, + "truncated": 0, + "non-truncated": 1240, + "padded": 1240, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hashes": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "831ff35c474e5cef", + "hash_cont_tokens": "a95c97af1c14e068" + }, + "truncated": 0, + "non-truncated": 812, + "padded": 812, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hashes": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "a20a96b44dcc5b30", + "hash_cont_tokens": "8abfedef914e33c9" + }, + 
"truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hashes": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "5002f4ac8b1562ca", + "hash_cont_tokens": "674fc454bdc5ac93" + }, + "truncated": 0, + "non-truncated": 660, + "padded": 656, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_geography|5": { + "hashes": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "7c5547c7da5bc793", + "hash_cont_tokens": "03a5012b916274ea" + }, + "truncated": 0, + "non-truncated": 792, + "padded": 792, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hashes": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "f62991cb6a496b05", + "hash_cont_tokens": "a83effb8f76b7d7c" + }, + "truncated": 0, + "non-truncated": 772, + "padded": 772, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hashes": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "4cef2aff6e3d59ed", + "hash_cont_tokens": "c583432ad27fcfe0" + }, + "truncated": 0, + "non-truncated": 1560, + "padded": 1560, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hashes": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "6e2577ea4082ed2b", + "hash_cont_tokens": "24f5dc613660300b" + }, + "truncated": 0, + "non-truncated": 1080, + "padded": 1080, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hashes": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "c5fc9aeb1079c8e4", + "hash_cont_tokens": "f47f041de50333b9" + }, + "truncated": 0, + "non-truncated": 952, + "padded": 952, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_physics|5": { + "hashes": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "555fc385cffa84ca", + "hash_cont_tokens": "ba2efcd283e938cc" + }, + "truncated": 0, + "non-truncated": 604, + "padded": 604, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hashes": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "febd23cbf9973b7f", + "hash_cont_tokens": "942069cd363844d9" + }, + "truncated": 0, + "non-truncated": 2180, + "padded": 2180, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hashes": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "400e55b56ee6fbd7", + "hash_cont_tokens": "955ed42b6f7fa019" + }, + "truncated": 0, + "non-truncated": 864, + "padded": 864, + "non-padded": 0, + 
"effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hashes": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "c639cce12a46ebad", + "hash_cont_tokens": "cdd0b3dc06d933e5" + }, + "truncated": 0, + "non-truncated": 816, + "padded": 816, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hashes": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "b9762065cce6f3a6", + "hash_cont_tokens": "9a864184946033ac" + }, + "truncated": 0, + "non-truncated": 948, + "padded": 948, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_aging|5": { + "hashes": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "541a75f071dcf579", + "hash_cont_tokens": "142a4a8a1138a214" + }, + "truncated": 0, + "non-truncated": 892, + "padded": 892, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_sexuality|5": { + "hashes": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "04269e5c5a257dd9", + "hash_cont_tokens": "bc54813e809b796d" + }, + "truncated": 0, + "non-truncated": 524, + "padded": 524, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-international_law|5": { + "hashes": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "d93ba9d9d38e4397", + "hash_cont_tokens": "dc45b45fcda18e5d" + }, + "truncated": 0, + "non-truncated": 484, + "padded": 484, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-jurisprudence|5": { + "hashes": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "9eeaccd2698b4f5a", + "hash_cont_tokens": "e3a8cd951b6e3469" + }, + "truncated": 0, + "non-truncated": 432, + "padded": 432, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hashes": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "b4f08f544f2b7576", + "hash_cont_tokens": "1e80dbd30f6453d5" + }, + "truncated": 0, + "non-truncated": 652, + "padded": 648, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-machine_learning|5": { + "hashes": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "900c2a51f1174b9f", + "hash_cont_tokens": "9b37da7777378ca9" + }, + "truncated": 0, + "non-truncated": 448, + "padded": 448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-management|5": { + "hashes": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "6b36efb4689c6eca", + "hash_cont_tokens": "a01d6d39a83c4597" + }, + "truncated": 0, + "non-truncated": 412, + "padded": 412, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-marketing|5": { + "hashes": { + "hash_examples": "2668953431f91e96", + 
"hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "2aaac78a0cfed47a", + "hash_cont_tokens": "6aeaed4d823c98aa" + }, + "truncated": 0, + "non-truncated": 936, + "padded": 936, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-medical_genetics|5": { + "hashes": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "886ca823b41c094a", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-miscellaneous|5": { + "hashes": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "72fd71de7675e7d0", + "hash_cont_tokens": "9b0ab02a64603081" + }, + "truncated": 0, + "non-truncated": 3132, + "padded": 3132, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_disputes|5": { + "hashes": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "f3ca0dd8e7a1eb09", + "hash_cont_tokens": "8badf768f7b0467a" + }, + "truncated": 0, + "non-truncated": 1384, + "padded": 1354, + "non-padded": 30, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hashes": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "3e793631e951f23c", + "hash_cont_tokens": "32ae620376b2bbba" + }, + "truncated": 0, + "non-truncated": 3580, + "padded": 3580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-nutrition|5": { + "hashes": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "59753c2144ea93af", + "hash_cont_tokens": "3071def75bacc404" + }, + "truncated": 0, + "non-truncated": 1224, + "padded": 1224, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-philosophy|5": { + "hashes": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "bd8d3dbed15a8c34", + "hash_cont_tokens": "9f6ff69d23a48783" + }, + "truncated": 0, + "non-truncated": 1244, + "padded": 1244, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-prehistory|5": { + "hashes": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "3573cd87facbb7c5", + "hash_cont_tokens": "de469d2b981e32a3" + }, + "truncated": 0, + "non-truncated": 1296, + "padded": 1296, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_accounting|5": { + "hashes": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "17e721bc1a7cbb47", + "hash_cont_tokens": "c46f74d2dfc7b13b" + }, + "truncated": 0, + "non-truncated": 1128, + "padded": 1128, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_law|5": { + "hashes": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "c9f7583fff66d361", + "hash_cont_tokens": "2e590029ef41fbcd" + }, + "truncated": 0, + 
"non-truncated": 6136, + "padded": 6136, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_medicine|5": { + "hashes": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "40a933f829116f8d", + "hash_cont_tokens": "fe35cfa9c6ca802e" + }, + "truncated": 0, + "non-truncated": 1088, + "padded": 1088, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_psychology|5": { + "hashes": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "0dfb73a8eb3f692c", + "hash_cont_tokens": "f020fbddf72c8652" + }, + "truncated": 0, + "non-truncated": 2448, + "padded": 2448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-public_relations|5": { + "hashes": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "1710c6ba4c9f3cbd", + "hash_cont_tokens": "568f585a259965c1" + }, + "truncated": 0, + "non-truncated": 440, + "padded": 440, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-security_studies|5": { + "hashes": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "32a03f1f22a6e103", + "hash_cont_tokens": "cc6fd7cccd64cd5d" + }, + "truncated": 0, + "non-truncated": 980, + "padded": 980, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-sociology|5": { + "hashes": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "828999f7624cbe7e", + "hash_cont_tokens": "c3a3bdfd177eed5b" + }, + "truncated": 0, + "non-truncated": 804, + "padded": 804, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hashes": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "42054621e718dbee", + "hash_cont_tokens": "2568d0e8e36fa959" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-virology|5": { + "hashes": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "6c4f0aa4dc859c04", + "hash_cont_tokens": "926cf60b0891f374" + }, + "truncated": 0, + "non-truncated": 664, + "padded": 664, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-world_religions|5": { + "hashes": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "6c75d44e092ff24f", + "hash_cont_tokens": "c525a5de974c1ea3" + }, + "truncated": 0, + "non-truncated": 684, + "padded": 684, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|truthfulqa:mc|0": { + "hashes": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "2738d7ed7075faa7", + "hash_cont_tokens": "c014154380b74b9e" + }, + "truncated": 0, + "non-truncated": 9996, + "padded": 9996, + "non-padded": 0, + "effective_few_shots": 0.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + 
"hash_examples": "d84d18e9a963753d", + "hash_full_prompts": "12b540783521a8e6", + "hash_input_tokens": "5c73a7dce6ccf737", + "hash_cont_tokens": "fb1646e2bdd5fc38" + }, + "total_evaluation_time_secondes": "4016.8486897945404", + "truncated": 0, + "non-truncated": 111019, + "padded": 110926, + "non-padded": 93, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/ashercn97/manatee-7b/results_2023-09-17T18-42-42.384089.json b/eval-results/ashercn97/manatee-7b/results_2023-09-17T18-42-42.384089.json new file mode 100644 index 0000000000000000000000000000000000000000..4ffb89d8e49e384e85ec5c6a375554611548d538 --- /dev/null +++ b/eval-results/ashercn97/manatee-7b/results_2023-09-17T18-42-42.384089.json @@ -0,0 +1,107 @@ +{ + "config_general": { + "model_name": "ashercn97/manatee-7b", + "model_sha": "21fb249c067b2290e8ef40c41074f55a06bbb1f7", + "model_size": "12.61 GB", + "model_dtype": "torch.float16", + "lighteval_sha": "0f318ecf002208468154899217b3ba7c6ae09374", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|drop|3": { + "em": 0.0030411073825503355, + "em_stderr": 0.0005638896908753201, + "f1": 0.059899328859060456, + "f1_stderr": 0.001397556369094792 + }, + "harness|gsm8k|5": { + "acc": 0.07050796057619409, + "acc_stderr": 0.0070515438139836135 + }, + "harness|winogrande|5": { + "acc": 0.745067087608524, + "acc_stderr": 0.012248806969376422 + }, + "all": { + "em": 0.0030411073825503355, + "em_stderr": 0.0005638896908753201, + "f1": 0.059899328859060456, + "f1_stderr": 0.001397556369094792, + "acc": 0.4077875240923591, + "acc_stderr": 0.009650175391680019 + } + }, + "versions": { + "harness|drop|3": 1, + "harness|gsm8k|5": 0, + "harness|winogrande|5": 0, + "all": 0 + }, + "config_tasks": { + "harness|drop": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|drop|3": { + "hashes": { + "hash_examples": "1d27416e8324e9a3", + "hash_full_prompts": "a5513ff9a741b385", + "hash_input_tokens": "42076f0efbb50aa6", + "hash_cont_tokens": "2f37256f700ede68" + }, + "truncated": 3, + "non-truncated": 9533, + "padded": 0, + "non-padded": 9536, + "effective_few_shots": 3.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "bda342e47b5099b2", + "hash_cont_tokens": "e7ddc1d84f8bc8ef" + }, + "truncated": 0, + "non-truncated": 1319, + "padded": 0, + "non-padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "c0bedf98cb040854", + "hash_cont_tokens": "f08975ad6f2d5864" + }, + "truncated": 0, + "non-truncated": 2534, + "padded": 2432, + "non-padded": 102, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "9b4d8993161e637d", + "hash_full_prompts": "08215e527b7e60a5", + "hash_input_tokens": "a12f3e3c934bd78b", + "hash_cont_tokens": "5518a0b991700763" + }, + "total_evaluation_time_secondes": "9941.0641913414", + "truncated": 3, + "non-truncated": 13386, + "padded": 2432, + "non-padded": 10957, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git 
a/eval-results/bertin-project/bertin-gpt-j-6B-alpaca/results_2023-08-17T15-41-33.782681.json b/eval-results/bertin-project/bertin-gpt-j-6B-alpaca/results_2023-08-17T15-41-33.782681.json new file mode 100644 index 0000000000000000000000000000000000000000..9109012be17dc69d08612d1806d1721d3a709236 --- /dev/null +++ b/eval-results/bertin-project/bertin-gpt-j-6B-alpaca/results_2023-08-17T15-41-33.782681.json @@ -0,0 +1,1365 @@ +{ + "results": { + "harness|arc:challenge|25": { + "acc": 0.33361774744027306, + "acc_stderr": 0.013778687054176546, + "acc_norm": 0.36006825938566556, + "acc_norm_stderr": 0.01402751681458519 + }, + "harness|hellaswag|10": { + "acc": 0.42013543118900615, + "acc_stderr": 0.004925717008099713, + "acc_norm": 0.5430193188607847, + "acc_norm_stderr": 0.004971278309204198 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.26, + "acc_stderr": 0.04408440022768079, + "acc_norm": 0.26, + "acc_norm_stderr": 0.04408440022768079 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.21481481481481482, + "acc_stderr": 0.035478541985608236, + "acc_norm": 0.21481481481481482, + "acc_norm_stderr": 0.035478541985608236 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.3157894736842105, + "acc_stderr": 0.0378272898086547, + "acc_norm": 0.3157894736842105, + "acc_norm_stderr": 0.0378272898086547 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.24, + "acc_stderr": 0.042923469599092816, + "acc_norm": 0.24, + "acc_norm_stderr": 0.042923469599092816 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.3283018867924528, + "acc_stderr": 0.02890159361241178, + "acc_norm": 0.3283018867924528, + "acc_norm_stderr": 0.02890159361241178 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.2708333333333333, + "acc_stderr": 0.037161774375660164, + "acc_norm": 0.2708333333333333, + "acc_norm_stderr": 0.037161774375660164 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.32, + "acc_stderr": 0.04688261722621504, + "acc_norm": 0.32, + "acc_norm_stderr": 0.04688261722621504 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.33, + "acc_stderr": 0.047258156262526045, + "acc_norm": 0.33, + "acc_norm_stderr": 0.047258156262526045 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.28, + "acc_stderr": 0.04512608598542127, + "acc_norm": 0.28, + "acc_norm_stderr": 0.04512608598542127 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.3352601156069364, + "acc_stderr": 0.03599586301247077, + "acc_norm": 0.3352601156069364, + "acc_norm_stderr": 0.03599586301247077 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.2647058823529412, + "acc_stderr": 0.04389869956808778, + "acc_norm": 0.2647058823529412, + "acc_norm_stderr": 0.04389869956808778 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.27, + "acc_stderr": 0.0446196043338474, + "acc_norm": 0.27, + "acc_norm_stderr": 0.0446196043338474 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.25957446808510637, + "acc_stderr": 0.028659179374292323, + "acc_norm": 0.25957446808510637, + "acc_norm_stderr": 0.028659179374292323 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.2631578947368421, + "acc_stderr": 0.04142439719489362, + "acc_norm": 0.2631578947368421, + "acc_norm_stderr": 0.04142439719489362 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.27586206896551724, + "acc_stderr": 0.03724563619774632, + "acc_norm": 0.27586206896551724, + "acc_norm_stderr": 0.03724563619774632 + 
}, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.24603174603174602, + "acc_stderr": 0.022182037202948365, + "acc_norm": 0.24603174603174602, + "acc_norm_stderr": 0.022182037202948365 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.24603174603174602, + "acc_stderr": 0.03852273364924316, + "acc_norm": 0.24603174603174602, + "acc_norm_stderr": 0.03852273364924316 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.3, + "acc_stderr": 0.046056618647183814, + "acc_norm": 0.3, + "acc_norm_stderr": 0.046056618647183814 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.24193548387096775, + "acc_stderr": 0.0243625996930311, + "acc_norm": 0.24193548387096775, + "acc_norm_stderr": 0.0243625996930311 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.26108374384236455, + "acc_stderr": 0.030903796952114468, + "acc_norm": 0.26108374384236455, + "acc_norm_stderr": 0.030903796952114468 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.25, + "acc_stderr": 0.04351941398892446, + "acc_norm": 0.25, + "acc_norm_stderr": 0.04351941398892446 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.24242424242424243, + "acc_stderr": 0.03346409881055953, + "acc_norm": 0.24242424242424243, + "acc_norm_stderr": 0.03346409881055953 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.3434343434343434, + "acc_stderr": 0.03383201223244444, + "acc_norm": 0.3434343434343434, + "acc_norm_stderr": 0.03383201223244444 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.3471502590673575, + "acc_stderr": 0.03435696168361355, + "acc_norm": 0.3471502590673575, + "acc_norm_stderr": 0.03435696168361355 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.31794871794871793, + "acc_stderr": 0.02361088430892786, + "acc_norm": 0.31794871794871793, + "acc_norm_stderr": 0.02361088430892786 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.24444444444444444, + "acc_stderr": 0.02620276653465215, + "acc_norm": 0.24444444444444444, + "acc_norm_stderr": 0.02620276653465215 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.2815126050420168, + "acc_stderr": 0.02921354941437216, + "acc_norm": 0.2815126050420168, + "acc_norm_stderr": 0.02921354941437216 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.271523178807947, + "acc_stderr": 0.03631329803969653, + "acc_norm": 0.271523178807947, + "acc_norm_stderr": 0.03631329803969653 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.3211009174311927, + "acc_stderr": 0.020018149772733744, + "acc_norm": 0.3211009174311927, + "acc_norm_stderr": 0.020018149772733744 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.39814814814814814, + "acc_stderr": 0.033384734032074016, + "acc_norm": 0.39814814814814814, + "acc_norm_stderr": 0.033384734032074016 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.28431372549019607, + "acc_stderr": 0.031660096793998116, + "acc_norm": 0.28431372549019607, + "acc_norm_stderr": 0.031660096793998116 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.23628691983122363, + "acc_stderr": 0.027652153144159274, + "acc_norm": 0.23628691983122363, + "acc_norm_stderr": 0.027652153144159274 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.242152466367713, + "acc_stderr": 0.028751392398694755, + "acc_norm": 0.242152466367713, + "acc_norm_stderr": 
0.028751392398694755 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.20610687022900764, + "acc_stderr": 0.035477710041594654, + "acc_norm": 0.20610687022900764, + "acc_norm_stderr": 0.035477710041594654 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.21487603305785125, + "acc_stderr": 0.03749492448709698, + "acc_norm": 0.21487603305785125, + "acc_norm_stderr": 0.03749492448709698 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.26851851851851855, + "acc_stderr": 0.04284467968052191, + "acc_norm": 0.26851851851851855, + "acc_norm_stderr": 0.04284467968052191 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.26380368098159507, + "acc_stderr": 0.03462419931615623, + "acc_norm": 0.26380368098159507, + "acc_norm_stderr": 0.03462419931615623 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.25, + "acc_stderr": 0.04109974682633932, + "acc_norm": 0.25, + "acc_norm_stderr": 0.04109974682633932 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.2815533980582524, + "acc_stderr": 0.044532548363264673, + "acc_norm": 0.2815533980582524, + "acc_norm_stderr": 0.044532548363264673 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.21367521367521367, + "acc_stderr": 0.026853450377009182, + "acc_norm": 0.21367521367521367, + "acc_norm_stderr": 0.026853450377009182 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.38, + "acc_stderr": 0.048783173121456316, + "acc_norm": 0.38, + "acc_norm_stderr": 0.048783173121456316 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.23754789272030652, + "acc_stderr": 0.015218733046150191, + "acc_norm": 0.23754789272030652, + "acc_norm_stderr": 0.015218733046150191 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.2543352601156069, + "acc_stderr": 0.02344582627654554, + "acc_norm": 0.2543352601156069, + "acc_norm_stderr": 0.02344582627654554 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.2424581005586592, + "acc_stderr": 0.014333522059217889, + "acc_norm": 0.2424581005586592, + "acc_norm_stderr": 0.014333522059217889 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.27124183006535946, + "acc_stderr": 0.025457756696667874, + "acc_norm": 0.27124183006535946, + "acc_norm_stderr": 0.025457756696667874 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.2540192926045016, + "acc_stderr": 0.024723861504771696, + "acc_norm": 0.2540192926045016, + "acc_norm_stderr": 0.024723861504771696 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.24382716049382716, + "acc_stderr": 0.023891879541959607, + "acc_norm": 0.24382716049382716, + "acc_norm_stderr": 0.023891879541959607 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.2730496453900709, + "acc_stderr": 0.02657786094330786, + "acc_norm": 0.2730496453900709, + "acc_norm_stderr": 0.02657786094330786 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.2522816166883963, + "acc_stderr": 0.01109278905687524, + "acc_norm": 0.2522816166883963, + "acc_norm_stderr": 0.01109278905687524 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.4007352941176471, + "acc_stderr": 0.0297682635289331, + "acc_norm": 0.4007352941176471, + "acc_norm_stderr": 0.0297682635289331 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.2434640522875817, + "acc_stderr": 0.01736247376214662, + "acc_norm": 0.2434640522875817, + "acc_norm_stderr": 0.01736247376214662 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.3, + "acc_stderr": 
0.04389311454644287, + "acc_norm": 0.3, + "acc_norm_stderr": 0.04389311454644287 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.39183673469387753, + "acc_stderr": 0.03125127591089165, + "acc_norm": 0.39183673469387753, + "acc_norm_stderr": 0.03125127591089165 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.24378109452736318, + "acc_stderr": 0.03036049015401466, + "acc_norm": 0.24378109452736318, + "acc_norm_stderr": 0.03036049015401466 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.27, + "acc_stderr": 0.04461960433384739, + "acc_norm": 0.27, + "acc_norm_stderr": 0.04461960433384739 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.27710843373493976, + "acc_stderr": 0.034843315926805875, + "acc_norm": 0.27710843373493976, + "acc_norm_stderr": 0.034843315926805875 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.22807017543859648, + "acc_stderr": 0.03218093795602357, + "acc_norm": 0.22807017543859648, + "acc_norm_stderr": 0.03218093795602357 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.27539779681762544, + "mc1_stderr": 0.015638135667775523, + "mc2": 0.4337930440514085, + "mc2_stderr": 0.015323951043221954 + }, + "all": { + "acc": 0.2799977305622539, + "acc_stderr": 0.03255815504380166, + "acc_norm": 0.282528822081189, + "acc_norm_stderr": 0.03256314472281036, + "mc1": 0.27539779681762544, + "mc1_stderr": 0.015638135667775523, + "mc2": 0.4337930440514085, + "mc2_stderr": 0.015323951043221954 + } + }, + "versions": { + "harness|arc:challenge|25": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + 
"harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "all": 0 + }, + "config_general": { + "model_name": "bertin-project/bertin-gpt-j-6B-alpaca", + "model_sha": "636b17d6044189343475d1889f076aba73036905", + "model_dtype": "torch.float16", + "lighteval_sha": "8bab069fee0c6e75ffa4c1ef8a9591c28ee0e049", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null + }, + "config_tasks": { + "harness|arc:challenge": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + 
"harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task" + }, + "summary_tasks": { + "harness|arc:challenge|25": { + "hashes": { + "hash_examples": "17b0cae357c0259e", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "1b78325b154497a6", + "hash_cont_tokens": "c6e2e25e2b25a621" + }, + "truncated": 0, + "non-truncated": 4687, + "padded": 4685, + "non-padded": 2, + "effective_few_shots": 25.0, + "num_truncated_few_shots": 0 + }, + "harness|hellaswag|10": { + "hashes": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "97de5fb5652ec7fa", + "hash_cont_tokens": "8ad5f1a3e4068f36" + }, + "truncated": 0, + "non-truncated": 40168, + "padded": 40045, + "non-padded": 123, + "effective_few_shots": 10.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hashes": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "38f6980885e34dfd", + "hash_cont_tokens": "844bd0bf669e8136" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-anatomy|5": { + "hashes": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "3ed9431cd09b2a53", + "hash_cont_tokens": "aa3ffb1a6e4356f5" + }, + "truncated": 0, + "non-truncated": 540, + "padded": 540, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-astronomy|5": { + "hashes": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "a79fd75ecff4dacc", + "hash_cont_tokens": "ca7527d5bdfd389a" + }, + "truncated": 0, + "non-truncated": 608, + "padded": 608, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-business_ethics|5": { + "hashes": { + "hash_examples": "33e51740670de686", + 
"hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "178d5666661bf5e1", + "hash_cont_tokens": "08a1fa6c8dde9a82" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hashes": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "c926698f7ff06973", + "hash_cont_tokens": "cd61f7de0830a75a" + }, + "truncated": 0, + "non-truncated": 1060, + "padded": 1060, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_biology|5": { + "hashes": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "242f772c5e78312a", + "hash_cont_tokens": "b0c14ed86adbcb56" + }, + "truncated": 0, + "non-truncated": 576, + "padded": 568, + "non-padded": 8, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_chemistry|5": { + "hashes": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "8502d8627d2d7aad", + "hash_cont_tokens": "844bd0bf669e8136" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_computer_science|5": { + "hashes": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "8bf46ce3a98e6e3f", + "hash_cont_tokens": "3cf1924b14cbf906" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_mathematics|5": { + "hashes": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "ff09ef7f164943cd", + "hash_cont_tokens": "d09bf08193410dfa" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_medicine|5": { + "hashes": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "af38d1bbc0517ac5", + "hash_cont_tokens": "62bb469d2a319d91" + }, + "truncated": 0, + "non-truncated": 692, + "padded": 680, + "non-padded": 12, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_physics|5": { + "hashes": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "c4240f372187f487", + "hash_cont_tokens": "bf103c9a1f61ec12" + }, + "truncated": 0, + "non-truncated": 408, + "padded": 404, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-computer_security|5": { + "hashes": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "70a866a1c6ae11ae", + "hash_cont_tokens": "844bd0bf669e8136" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hashes": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "29b68a5b3f3afa5f", + "hash_cont_tokens": "ff5ca3d84bb47a0b" + }, + 
"truncated": 0, + "non-truncated": 940, + "padded": 940, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-econometrics|5": { + "hashes": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "a4a0fc579875cdf9", + "hash_cont_tokens": "f3ed369e135c0e74" + }, + "truncated": 0, + "non-truncated": 456, + "padded": 456, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hashes": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "e1c0ec634eb17ebd", + "hash_cont_tokens": "35bf6c0c1a7ee403" + }, + "truncated": 0, + "non-truncated": 580, + "padded": 580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hashes": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "542453ad0f99dacf", + "hash_cont_tokens": "e69647d0f0359a4e" + }, + "truncated": 0, + "non-truncated": 1512, + "padded": 1488, + "non-padded": 24, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-formal_logic|5": { + "hashes": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "dacff0458f665ef2", + "hash_cont_tokens": "2ef491ecaa0b411b" + }, + "truncated": 0, + "non-truncated": 504, + "padded": 504, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-global_facts|5": { + "hashes": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "61dec75d557c2e93", + "hash_cont_tokens": "844bd0bf669e8136" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_biology|5": { + "hashes": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "d0afdf91820cacc8", + "hash_cont_tokens": "2f65e8345a68d860" + }, + "truncated": 0, + "non-truncated": 1240, + "padded": 1240, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hashes": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "75cd47b5490da17b", + "hash_cont_tokens": "c3deabee1deab3a3" + }, + "truncated": 0, + "non-truncated": 812, + "padded": 796, + "non-padded": 16, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hashes": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "6c6256000dbf914a", + "hash_cont_tokens": "ec161287ac6222f4" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hashes": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "3e24478a8854bd77", + "hash_cont_tokens": "c4f2565ca36881d5" + }, + "truncated": 660, + "non-truncated": 0, + "padded": 0, + "non-padded": 660, + "effective_few_shots": 5.0, + 
"num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_geography|5": { + "hashes": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "a4866b51f8a7a60e", + "hash_cont_tokens": "780e569058de22be" + }, + "truncated": 0, + "non-truncated": 792, + "padded": 792, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hashes": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "90f755f89d9fdf5e", + "hash_cont_tokens": "9da45062757ae791" + }, + "truncated": 0, + "non-truncated": 772, + "padded": 772, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hashes": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "fb590ff6d9d11883", + "hash_cont_tokens": "8f5c8baf02161f10" + }, + "truncated": 0, + "non-truncated": 1560, + "padded": 1560, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hashes": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "551dbc75535ad2b8", + "hash_cont_tokens": "fdea101837ab4409" + }, + "truncated": 0, + "non-truncated": 1080, + "padded": 1080, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hashes": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "d86fdf5706ec717c", + "hash_cont_tokens": "985403b262df21a4" + }, + "truncated": 0, + "non-truncated": 952, + "padded": 940, + "non-padded": 12, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_physics|5": { + "hashes": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "a81bca26abd92c41", + "hash_cont_tokens": "56be0c12b78c81a3" + }, + "truncated": 0, + "non-truncated": 604, + "padded": 604, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hashes": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "9c10077b5cda495b", + "hash_cont_tokens": "f524cf6fe64b2a7f" + }, + "truncated": 0, + "non-truncated": 2180, + "padded": 2180, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hashes": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "092923836e135996", + "hash_cont_tokens": "421b3dc903711e3d" + }, + "truncated": 0, + "non-truncated": 864, + "padded": 864, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hashes": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "4ab213491f557f31", + "hash_cont_tokens": "eab825cf8fbdd085" + }, + "truncated": 816, + "non-truncated": 0, + "padded": 0, + "non-padded": 816, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + 
"harness|hendrycksTest-high_school_world_history|5": { + "hashes": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "2a04fb615e6717ea", + "hash_cont_tokens": "e1610a0b694e7b3a" + }, + "truncated": 0, + "non-truncated": 948, + "padded": 948, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_aging|5": { + "hashes": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "39da19ee58ce07e6", + "hash_cont_tokens": "38eafdb22e9fca11" + }, + "truncated": 0, + "non-truncated": 892, + "padded": 892, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_sexuality|5": { + "hashes": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "f7e0441ab1c223e0", + "hash_cont_tokens": "11de075f88fc7cd2" + }, + "truncated": 0, + "non-truncated": 524, + "padded": 524, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-international_law|5": { + "hashes": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "119859c5b8103d0b", + "hash_cont_tokens": "0229c63f045574c2" + }, + "truncated": 0, + "non-truncated": 484, + "padded": 484, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-jurisprudence|5": { + "hashes": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "6ec4910e741606cb", + "hash_cont_tokens": "5c77c6f472688075" + }, + "truncated": 0, + "non-truncated": 432, + "padded": 432, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hashes": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "96d8b2554f777e3a", + "hash_cont_tokens": "25a46284b3589e0d" + }, + "truncated": 0, + "non-truncated": 652, + "padded": 636, + "non-padded": 16, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-machine_learning|5": { + "hashes": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "249811a7d891a411", + "hash_cont_tokens": "d11f2c877fe691dc" + }, + "truncated": 0, + "non-truncated": 448, + "padded": 448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-management|5": { + "hashes": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "e54df495ffeb4f92", + "hash_cont_tokens": "d37808f586a9e9b5" + }, + "truncated": 0, + "non-truncated": 412, + "padded": 412, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-marketing|5": { + "hashes": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "e9110fe64f420eb5", + "hash_cont_tokens": "95faf210efa02f90" + }, + "truncated": 0, + "non-truncated": 936, + "padded": 936, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-medical_genetics|5": { + "hashes": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": 
"743df5701590c1c5", + "hash_cont_tokens": "844bd0bf669e8136" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-miscellaneous|5": { + "hashes": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "4a20a40ea36bad2d", + "hash_cont_tokens": "ef1ae838a09a7521" + }, + "truncated": 0, + "non-truncated": 3132, + "padded": 3132, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_disputes|5": { + "hashes": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "10886977e5516586", + "hash_cont_tokens": "05c35d0e7dd2c7d4" + }, + "truncated": 0, + "non-truncated": 1384, + "padded": 1372, + "non-padded": 12, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hashes": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "66f56ab7c3b9d662", + "hash_cont_tokens": "f1e9e326e9540108" + }, + "truncated": 0, + "non-truncated": 3580, + "padded": 3580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-nutrition|5": { + "hashes": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "c05c54560499ea35", + "hash_cont_tokens": "027ac34198453c9e" + }, + "truncated": 0, + "non-truncated": 1224, + "padded": 1224, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-philosophy|5": { + "hashes": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "9639c3d92ff98a28", + "hash_cont_tokens": "dddff9925c9b675a" + }, + "truncated": 0, + "non-truncated": 1244, + "padded": 1244, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-prehistory|5": { + "hashes": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "91e98834c3a8d8d9", + "hash_cont_tokens": "030e5bb46551865c" + }, + "truncated": 0, + "non-truncated": 1296, + "padded": 1296, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_accounting|5": { + "hashes": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "569fa47691c73088", + "hash_cont_tokens": "42b23299e8bae480" + }, + "truncated": 0, + "non-truncated": 1128, + "padded": 1124, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_law|5": { + "hashes": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "999e8c7cf55b590c", + "hash_cont_tokens": "a2de48df0afbaff7" + }, + "truncated": 16, + "non-truncated": 6120, + "padded": 6120, + "non-padded": 16, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_medicine|5": { + "hashes": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "cb68733b835e69f0", + "hash_cont_tokens": "33dc7eccd5de31ae" + }, + "truncated": 0, + "non-truncated": 1088, + "padded": 1088, + "non-padded": 0, + 
"effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_psychology|5": { + "hashes": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "3aa766c029099569", + "hash_cont_tokens": "2a666dc39f1f52ac" + }, + "truncated": 0, + "non-truncated": 2448, + "padded": 2448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-public_relations|5": { + "hashes": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "87b924f88832986f", + "hash_cont_tokens": "cf3600a50782c6c5" + }, + "truncated": 0, + "non-truncated": 440, + "padded": 440, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-security_studies|5": { + "hashes": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "c2b75c24a925a416", + "hash_cont_tokens": "2e9916279a4cae95" + }, + "truncated": 0, + "non-truncated": 980, + "padded": 980, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-sociology|5": { + "hashes": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "fb555df6139eb2c8", + "hash_cont_tokens": "555f7a55738bbf37" + }, + "truncated": 0, + "non-truncated": 804, + "padded": 800, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hashes": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "56cf1eebb25eccb1", + "hash_cont_tokens": "844bd0bf669e8136" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-virology|5": { + "hashes": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "c6affac16ec860be", + "hash_cont_tokens": "30d4fa4828c5468f" + }, + "truncated": 0, + "non-truncated": 664, + "padded": 664, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-world_religions|5": { + "hashes": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "d2c5da5a69a6312e", + "hash_cont_tokens": "984061eb58124367" + }, + "truncated": 0, + "non-truncated": 684, + "padded": 684, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|truthfulqa:mc|0": { + "hashes": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "21ee2f46c9c3649e", + "hash_cont_tokens": "f41d0880e9a23f4e" + }, + "truncated": 0, + "non-truncated": 9996, + "padded": 9996, + "non-padded": 0, + "effective_few_shots": 0.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "d84d18e9a963753d", + "hash_full_prompts": "12b540783521a8e6", + "hash_input_tokens": "0893dfcb83435e7d", + "hash_cont_tokens": "6159bf1904a8c8fb" + }, + "total_evaluation_time_secondes": "2565.103686571121", + "truncated": 1492, + "non-truncated": 109527, + "padded": 109290, + "non-padded": 1729, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git 
a/eval-results/bertin-project/bertin-gpt-j-6B-alpaca/results_2023-09-22T17-02-02.199354.json b/eval-results/bertin-project/bertin-gpt-j-6B-alpaca/results_2023-09-22T17-02-02.199354.json new file mode 100644 index 0000000000000000000000000000000000000000..ea470ffeb079dc73dc37408d1131f90c4a9616da --- /dev/null +++ b/eval-results/bertin-project/bertin-gpt-j-6B-alpaca/results_2023-09-22T17-02-02.199354.json @@ -0,0 +1,107 @@ +{ + "config_general": { + "model_name": "bertin-project/bertin-gpt-j-6B-alpaca", + "model_sha": "636b17d6044189343475d1889f076aba73036905", + "model_size": "11.28 GB", + "model_dtype": "torch.float16", + "lighteval_sha": "0f318ecf002208468154899217b3ba7c6ae09374", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|drop|3": { + "em": 0.016568791946308725, + "em_stderr": 0.0013072452323527502, + "f1": 0.07589660234899354, + "f1_stderr": 0.0018842940437008274 + }, + "harness|gsm8k|5": { + "acc": 0.0, + "acc_stderr": 0.0 + }, + "harness|winogrande|5": { + "acc": 0.5580110497237569, + "acc_stderr": 0.013957584079108989 + }, + "all": { + "em": 0.016568791946308725, + "em_stderr": 0.0013072452323527502, + "f1": 0.07589660234899354, + "f1_stderr": 0.0018842940437008274, + "acc": 0.27900552486187846, + "acc_stderr": 0.006978792039554494 + } + }, + "versions": { + "harness|drop|3": 1, + "harness|gsm8k|5": 0, + "harness|winogrande|5": 0, + "all": 0 + }, + "config_tasks": { + "harness|drop": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|drop|3": { + "hashes": { + "hash_examples": "1d27416e8324e9a3", + "hash_full_prompts": "a5513ff9a741b385", + "hash_input_tokens": "f21277d2c2d2e06c", + "hash_cont_tokens": "56140c59aabbb5e0" + }, + "truncated": 382, + "non-truncated": 9154, + "padded": 0, + "non-padded": 9536, + "effective_few_shots": 3.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "3ab9b4c5105492a3", + "hash_cont_tokens": "c5f2f8ac19ffcc28" + }, + "truncated": 0, + "non-truncated": 1319, + "padded": 0, + "non-padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "84cacac1590bb0a5", + "hash_cont_tokens": "64ca3ed9b5dacc6e" + }, + "truncated": 0, + "non-truncated": 2534, + "padded": 2426, + "non-padded": 108, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "9b4d8993161e637d", + "hash_full_prompts": "08215e527b7e60a5", + "hash_input_tokens": "26cd3631535039d0", + "hash_cont_tokens": "fdeaf0251f2e7ad0" + }, + "total_evaluation_time_secondes": "7005.4176478385925", + "truncated": 382, + "non-truncated": 13007, + "padded": 2426, + "non-padded": 10963, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/boomerchan/magpie-13b/results_2023-10-03T11-48-49.581129.json b/eval-results/boomerchan/magpie-13b/results_2023-10-03T11-48-49.581129.json new file mode 100644 index 0000000000000000000000000000000000000000..20dc09289dad50d5e7197225cbce2c313d3ab0ba --- /dev/null +++ b/eval-results/boomerchan/magpie-13b/results_2023-10-03T11-48-49.581129.json @@ -0,0 +1,1367 @@ +{ + "config_general": { 
+ "model_name": "boomerchan/magpie-13b", + "model_sha": "a58124cdc9f39ccd59d4290a8bdfda93ff3690dc", + "model_size": "24.32 GB", + "model_dtype": "torch.float16", + "lighteval_sha": "0f318ecf002208468154899217b3ba7c6ae09374", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|arc:challenge|25": { + "acc": 0.5955631399317406, + "acc_stderr": 0.014342036483436175, + "acc_norm": 0.6331058020477816, + "acc_norm_stderr": 0.014084133118104298 + }, + "harness|hellaswag|10": { + "acc": 0.6403106950806612, + "acc_stderr": 0.004789284723955857, + "acc_norm": 0.8424616610237005, + "acc_norm_stderr": 0.0036356303524759065 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.33, + "acc_stderr": 0.04725815626252606, + "acc_norm": 0.33, + "acc_norm_stderr": 0.04725815626252606 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.48148148148148145, + "acc_stderr": 0.043163785995113245, + "acc_norm": 0.48148148148148145, + "acc_norm_stderr": 0.043163785995113245 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.5723684210526315, + "acc_stderr": 0.04026097083296564, + "acc_norm": 0.5723684210526315, + "acc_norm_stderr": 0.04026097083296564 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.57, + "acc_stderr": 0.049756985195624284, + "acc_norm": 0.57, + "acc_norm_stderr": 0.049756985195624284 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.6075471698113207, + "acc_stderr": 0.030052580579557845, + "acc_norm": 0.6075471698113207, + "acc_norm_stderr": 0.030052580579557845 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.6319444444444444, + "acc_stderr": 0.04032999053960718, + "acc_norm": 0.6319444444444444, + "acc_norm_stderr": 0.04032999053960718 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.44, + "acc_stderr": 0.04988876515698589, + "acc_norm": 0.44, + "acc_norm_stderr": 0.04988876515698589 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.48, + "acc_stderr": 0.050211673156867795, + "acc_norm": 0.48, + "acc_norm_stderr": 0.050211673156867795 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.38, + "acc_stderr": 0.048783173121456316, + "acc_norm": 0.38, + "acc_norm_stderr": 0.048783173121456316 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.5780346820809249, + "acc_stderr": 0.037657466938651504, + "acc_norm": 0.5780346820809249, + "acc_norm_stderr": 0.037657466938651504 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.2647058823529412, + "acc_stderr": 0.04389869956808777, + "acc_norm": 0.2647058823529412, + "acc_norm_stderr": 0.04389869956808777 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.71, + "acc_stderr": 0.045604802157206845, + "acc_norm": 0.71, + "acc_norm_stderr": 0.045604802157206845 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.46808510638297873, + "acc_stderr": 0.03261936918467382, + "acc_norm": 0.46808510638297873, + "acc_norm_stderr": 0.03261936918467382 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.3157894736842105, + "acc_stderr": 0.043727482902780064, + "acc_norm": 0.3157894736842105, + "acc_norm_stderr": 0.043727482902780064 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.496551724137931, + "acc_stderr": 0.041665675771015785, + "acc_norm": 0.496551724137931, + "acc_norm_stderr": 0.041665675771015785 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 
0.3306878306878307, + "acc_stderr": 0.024229965298425072, + "acc_norm": 0.3306878306878307, + "acc_norm_stderr": 0.024229965298425072 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.36507936507936506, + "acc_stderr": 0.043062412591271526, + "acc_norm": 0.36507936507936506, + "acc_norm_stderr": 0.043062412591271526 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.36, + "acc_stderr": 0.048241815132442176, + "acc_norm": 0.36, + "acc_norm_stderr": 0.048241815132442176 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.6580645161290323, + "acc_stderr": 0.026985289576552746, + "acc_norm": 0.6580645161290323, + "acc_norm_stderr": 0.026985289576552746 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.45320197044334976, + "acc_stderr": 0.035025446508458714, + "acc_norm": 0.45320197044334976, + "acc_norm_stderr": 0.035025446508458714 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.57, + "acc_stderr": 0.04975698519562428, + "acc_norm": 0.57, + "acc_norm_stderr": 0.04975698519562428 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.6848484848484848, + "acc_stderr": 0.0362773057502241, + "acc_norm": 0.6848484848484848, + "acc_norm_stderr": 0.0362773057502241 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.7474747474747475, + "acc_stderr": 0.030954055470365907, + "acc_norm": 0.7474747474747475, + "acc_norm_stderr": 0.030954055470365907 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.8290155440414507, + "acc_stderr": 0.02717121368316453, + "acc_norm": 0.8290155440414507, + "acc_norm_stderr": 0.02717121368316453 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.5358974358974359, + "acc_stderr": 0.025285585990017848, + "acc_norm": 0.5358974358974359, + "acc_norm_stderr": 0.025285585990017848 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.32592592592592595, + "acc_stderr": 0.028578348365473072, + "acc_norm": 0.32592592592592595, + "acc_norm_stderr": 0.028578348365473072 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.6134453781512605, + "acc_stderr": 0.0316314580755238, + "acc_norm": 0.6134453781512605, + "acc_norm_stderr": 0.0316314580755238 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.33774834437086093, + "acc_stderr": 0.038615575462551684, + "acc_norm": 0.33774834437086093, + "acc_norm_stderr": 0.038615575462551684 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.7743119266055046, + "acc_stderr": 0.017923087667803064, + "acc_norm": 0.7743119266055046, + "acc_norm_stderr": 0.017923087667803064 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.4537037037037037, + "acc_stderr": 0.03395322726375797, + "acc_norm": 0.4537037037037037, + "acc_norm_stderr": 0.03395322726375797 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.7843137254901961, + "acc_stderr": 0.028867431449849313, + "acc_norm": 0.7843137254901961, + "acc_norm_stderr": 0.028867431449849313 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.7552742616033755, + "acc_stderr": 0.02798569938703642, + "acc_norm": 0.7552742616033755, + "acc_norm_stderr": 0.02798569938703642 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.672645739910314, + "acc_stderr": 0.03149384670994131, + "acc_norm": 0.672645739910314, + "acc_norm_stderr": 0.03149384670994131 + }, + "harness|hendrycksTest-human_sexuality|5": { + 
"acc": 0.648854961832061, + "acc_stderr": 0.04186445163013751, + "acc_norm": 0.648854961832061, + "acc_norm_stderr": 0.04186445163013751 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.743801652892562, + "acc_stderr": 0.03984979653302873, + "acc_norm": 0.743801652892562, + "acc_norm_stderr": 0.03984979653302873 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.7962962962962963, + "acc_stderr": 0.03893542518824847, + "acc_norm": 0.7962962962962963, + "acc_norm_stderr": 0.03893542518824847 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.6993865030674846, + "acc_stderr": 0.03602511318806771, + "acc_norm": 0.6993865030674846, + "acc_norm_stderr": 0.03602511318806771 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.33035714285714285, + "acc_stderr": 0.04464285714285713, + "acc_norm": 0.33035714285714285, + "acc_norm_stderr": 0.04464285714285713 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.7669902912621359, + "acc_stderr": 0.04185832598928315, + "acc_norm": 0.7669902912621359, + "acc_norm_stderr": 0.04185832598928315 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.8034188034188035, + "acc_stderr": 0.02603538609895129, + "acc_norm": 0.8034188034188035, + "acc_norm_stderr": 0.02603538609895129 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.64, + "acc_stderr": 0.04824181513244218, + "acc_norm": 0.64, + "acc_norm_stderr": 0.04824181513244218 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.7752234993614304, + "acc_stderr": 0.014927447101937148, + "acc_norm": 0.7752234993614304, + "acc_norm_stderr": 0.014927447101937148 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.6560693641618497, + "acc_stderr": 0.02557412378654666, + "acc_norm": 0.6560693641618497, + "acc_norm_stderr": 0.02557412378654666 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.4670391061452514, + "acc_stderr": 0.016686126653013934, + "acc_norm": 0.4670391061452514, + "acc_norm_stderr": 0.016686126653013934 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.6437908496732027, + "acc_stderr": 0.027420477662629235, + "acc_norm": 0.6437908496732027, + "acc_norm_stderr": 0.027420477662629235 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.6495176848874598, + "acc_stderr": 0.027098652621301754, + "acc_norm": 0.6495176848874598, + "acc_norm_stderr": 0.027098652621301754 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.6604938271604939, + "acc_stderr": 0.026348564412011624, + "acc_norm": 0.6604938271604939, + "acc_norm_stderr": 0.026348564412011624 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.44680851063829785, + "acc_stderr": 0.029658235097666904, + "acc_norm": 0.44680851063829785, + "acc_norm_stderr": 0.029658235097666904 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.44132985658409385, + "acc_stderr": 0.01268201633564667, + "acc_norm": 0.44132985658409385, + "acc_norm_stderr": 0.01268201633564667 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.625, + "acc_stderr": 0.029408372932278746, + "acc_norm": 0.625, + "acc_norm_stderr": 0.029408372932278746 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.5915032679738562, + "acc_stderr": 0.019886221037501862, + "acc_norm": 0.5915032679738562, + "acc_norm_stderr": 0.019886221037501862 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.6181818181818182, + "acc_stderr": 0.046534298079135075, + "acc_norm": 0.6181818181818182, + 
"acc_norm_stderr": 0.046534298079135075 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.6489795918367347, + "acc_stderr": 0.030555316755573637, + "acc_norm": 0.6489795918367347, + "acc_norm_stderr": 0.030555316755573637 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.7810945273631841, + "acc_stderr": 0.029239174636647, + "acc_norm": 0.7810945273631841, + "acc_norm_stderr": 0.029239174636647 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.86, + "acc_stderr": 0.03487350880197769, + "acc_norm": 0.86, + "acc_norm_stderr": 0.03487350880197769 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.4879518072289157, + "acc_stderr": 0.03891364495835821, + "acc_norm": 0.4879518072289157, + "acc_norm_stderr": 0.03891364495835821 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.783625730994152, + "acc_stderr": 0.031581495393387324, + "acc_norm": 0.783625730994152, + "acc_norm_stderr": 0.031581495393387324 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.34394124847001223, + "mc1_stderr": 0.01662908751427678, + "mc2": 0.49146975171261703, + "mc2_stderr": 0.015182175866066504 + }, + "all": { + "acc": 0.5827073934681583, + "acc_stderr": 0.034048991446061445, + "acc_norm": 0.5867699973335664, + "acc_norm_stderr": 0.03402506673865785, + "mc1": 0.34394124847001223, + "mc1_stderr": 0.01662908751427678, + "mc2": 0.49146975171261703, + "mc2_stderr": 0.015182175866066504 + } + }, + "versions": { + "harness|arc:challenge|25": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + 
"harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "all": 0 + }, + "config_tasks": { + "harness|arc:challenge": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM 
Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task" + }, + "summary_tasks": { + "harness|arc:challenge|25": { + "hashes": { + "hash_examples": "17b0cae357c0259e", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "3722289b79076c44", + "hash_cont_tokens": "e8abf848493b50f7" + }, + "truncated": 0, + "non-truncated": 4687, + "padded": 4687, + "non-padded": 0, + "effective_few_shots": 25.0, + "num_truncated_few_shots": 0 + }, + "harness|hellaswag|10": { + "hashes": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "ececd684171f1ef2", + "hash_cont_tokens": "9fe0a5c42e1532db" + }, + "truncated": 0, + "non-truncated": 40168, + "padded": 40113, + "non-padded": 55, + "effective_few_shots": 10.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hashes": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "c54ff61ad0273dd7", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-anatomy|5": { + "hashes": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "be31a1e22aef5f90", + "hash_cont_tokens": "f11971a765cb609f" + }, + "truncated": 0, + "non-truncated": 540, + "padded": 540, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-astronomy|5": { + "hashes": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "277a7b1fad566940", + "hash_cont_tokens": "440a970fadecdc7b" + }, + "truncated": 0, + "non-truncated": 608, + "padded": 608, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-business_ethics|5": { + "hashes": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "ba552605bc116de5", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hashes": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": 
"49654f71d94b65c3", + "hash_input_tokens": "428c7563d0b98ab9", + "hash_cont_tokens": "7ecd60c25b9bfe5b" + }, + "truncated": 0, + "non-truncated": 1060, + "padded": 1060, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_biology|5": { + "hashes": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "da036601573942e2", + "hash_cont_tokens": "875cde3af7a0ee14" + }, + "truncated": 0, + "non-truncated": 576, + "padded": 576, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_chemistry|5": { + "hashes": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "94e0196d6aded13d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_computer_science|5": { + "hashes": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "6e4d0f4a8d36690b", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_mathematics|5": { + "hashes": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "614054d17109a25d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_medicine|5": { + "hashes": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "081bb2b524defd1c", + "hash_cont_tokens": "702fb6d82ff0d6ac" + }, + "truncated": 0, + "non-truncated": 692, + "padded": 692, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_physics|5": { + "hashes": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "5421d9a1af86cbd4", + "hash_cont_tokens": "f7b8097afc16a47c" + }, + "truncated": 0, + "non-truncated": 408, + "padded": 408, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-computer_security|5": { + "hashes": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "5e6b70ecb333cf18", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hashes": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "c2ef11a87264ceed", + "hash_cont_tokens": "aa0e8bc655f2f641" + }, + "truncated": 0, + "non-truncated": 940, + "padded": 940, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-econometrics|5": { + "hashes": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "ecaccd912a4c3978", + "hash_cont_tokens": "b1cc6e7e9fcd3827" + }, + "truncated": 0, + "non-truncated": 456, 
+ "padded": 456, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hashes": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "1590c84291399be8", + "hash_cont_tokens": "2425a3f084a591ef" + }, + "truncated": 0, + "non-truncated": 580, + "padded": 580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hashes": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "3269597f715b0da1", + "hash_cont_tokens": "bd87bf0c060fd925" + }, + "truncated": 0, + "non-truncated": 1512, + "padded": 1512, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-formal_logic|5": { + "hashes": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "a2800d20f3ab8d7c", + "hash_cont_tokens": "eb8932890e0605db" + }, + "truncated": 0, + "non-truncated": 504, + "padded": 504, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-global_facts|5": { + "hashes": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "94ed44b3772505ad", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_biology|5": { + "hashes": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "24423acb928db768", + "hash_cont_tokens": "1ddcb86d28cde266" + }, + "truncated": 0, + "non-truncated": 1240, + "padded": 1240, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hashes": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "831ff35c474e5cef", + "hash_cont_tokens": "176c8dcff38c5f8f" + }, + "truncated": 0, + "non-truncated": 812, + "padded": 812, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hashes": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "a20a96b44dcc5b30", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hashes": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "5002f4ac8b1562ca", + "hash_cont_tokens": "674fc454bdc5ac93" + }, + "truncated": 0, + "non-truncated": 660, + "padded": 656, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_geography|5": { + "hashes": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "7c5547c7da5bc793", + "hash_cont_tokens": "03a5012b916274ea" + }, + "truncated": 0, + "non-truncated": 792, + "padded": 792, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + 
"harness|hendrycksTest-high_school_government_and_politics|5": { + "hashes": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "f62991cb6a496b05", + "hash_cont_tokens": "873d2aab226ba1d8" + }, + "truncated": 0, + "non-truncated": 772, + "padded": 772, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hashes": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "4cef2aff6e3d59ed", + "hash_cont_tokens": "c583432ad27fcfe0" + }, + "truncated": 0, + "non-truncated": 1560, + "padded": 1560, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hashes": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "6e2577ea4082ed2b", + "hash_cont_tokens": "d7907b61bcb8c123" + }, + "truncated": 0, + "non-truncated": 1080, + "padded": 1080, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hashes": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "c5fc9aeb1079c8e4", + "hash_cont_tokens": "f47f041de50333b9" + }, + "truncated": 0, + "non-truncated": 952, + "padded": 952, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_physics|5": { + "hashes": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "555fc385cffa84ca", + "hash_cont_tokens": "0d56317b3e5eedb5" + }, + "truncated": 0, + "non-truncated": 604, + "padded": 604, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hashes": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "febd23cbf9973b7f", + "hash_cont_tokens": "09ba1243e7390c0f" + }, + "truncated": 0, + "non-truncated": 2180, + "padded": 2180, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hashes": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "400e55b56ee6fbd7", + "hash_cont_tokens": "9cc29889c3d3f77d" + }, + "truncated": 0, + "non-truncated": 864, + "padded": 864, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hashes": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "c639cce12a46ebad", + "hash_cont_tokens": "cdd0b3dc06d933e5" + }, + "truncated": 0, + "non-truncated": 816, + "padded": 816, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hashes": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "b9762065cce6f3a6", + "hash_cont_tokens": "e02816433ff28daf" + }, + "truncated": 0, + "non-truncated": 948, + "padded": 948, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_aging|5": { + "hashes": { + "hash_examples": 
"1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "541a75f071dcf579", + "hash_cont_tokens": "142a4a8a1138a214" + }, + "truncated": 0, + "non-truncated": 892, + "padded": 892, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_sexuality|5": { + "hashes": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "04269e5c5a257dd9", + "hash_cont_tokens": "bc54813e809b796d" + }, + "truncated": 0, + "non-truncated": 524, + "padded": 524, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-international_law|5": { + "hashes": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "d93ba9d9d38e4397", + "hash_cont_tokens": "8ea8c5ff76a15bca" + }, + "truncated": 0, + "non-truncated": 484, + "padded": 484, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-jurisprudence|5": { + "hashes": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "9eeaccd2698b4f5a", + "hash_cont_tokens": "e3a8cd951b6e3469" + }, + "truncated": 0, + "non-truncated": 432, + "padded": 432, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hashes": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "b4f08f544f2b7576", + "hash_cont_tokens": "3e9e0bdc248fd88a" + }, + "truncated": 0, + "non-truncated": 652, + "padded": 648, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-machine_learning|5": { + "hashes": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "900c2a51f1174b9f", + "hash_cont_tokens": "55b12fb138c6a064" + }, + "truncated": 0, + "non-truncated": 448, + "padded": 448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-management|5": { + "hashes": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "6b36efb4689c6eca", + "hash_cont_tokens": "a01d6d39a83c4597" + }, + "truncated": 0, + "non-truncated": 412, + "padded": 412, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-marketing|5": { + "hashes": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "2aaac78a0cfed47a", + "hash_cont_tokens": "6aeaed4d823c98aa" + }, + "truncated": 0, + "non-truncated": 936, + "padded": 936, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-medical_genetics|5": { + "hashes": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "886ca823b41c094a", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-miscellaneous|5": { + "hashes": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "72fd71de7675e7d0", + "hash_cont_tokens": "9b0ab02a64603081" + }, + "truncated": 0, + 
"non-truncated": 3132, + "padded": 3132, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_disputes|5": { + "hashes": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "f3ca0dd8e7a1eb09", + "hash_cont_tokens": "3b8bbe9108e55ce9" + }, + "truncated": 0, + "non-truncated": 1384, + "padded": 1354, + "non-padded": 30, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hashes": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "3e793631e951f23c", + "hash_cont_tokens": "3e9bfc0362e97330" + }, + "truncated": 0, + "non-truncated": 3580, + "padded": 3580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-nutrition|5": { + "hashes": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "59753c2144ea93af", + "hash_cont_tokens": "23b2dc6ee2da4cfc" + }, + "truncated": 0, + "non-truncated": 1224, + "padded": 1224, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-philosophy|5": { + "hashes": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "bd8d3dbed15a8c34", + "hash_cont_tokens": "9f6ff69d23a48783" + }, + "truncated": 0, + "non-truncated": 1244, + "padded": 1244, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-prehistory|5": { + "hashes": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "3573cd87facbb7c5", + "hash_cont_tokens": "d6458d743d875837" + }, + "truncated": 0, + "non-truncated": 1296, + "padded": 1296, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_accounting|5": { + "hashes": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "17e721bc1a7cbb47", + "hash_cont_tokens": "922a195f53a35662" + }, + "truncated": 0, + "non-truncated": 1128, + "padded": 1128, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_law|5": { + "hashes": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "c9f7583fff66d361", + "hash_cont_tokens": "2e590029ef41fbcd" + }, + "truncated": 0, + "non-truncated": 6136, + "padded": 6136, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_medicine|5": { + "hashes": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "40a933f829116f8d", + "hash_cont_tokens": "7cfee54dbddd5a98" + }, + "truncated": 0, + "non-truncated": 1088, + "padded": 1088, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_psychology|5": { + "hashes": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "0dfb73a8eb3f692c", + "hash_cont_tokens": "a86677b2a45c20e1" + }, + "truncated": 0, + "non-truncated": 2448, + "padded": 2448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + 
"harness|hendrycksTest-public_relations|5": { + "hashes": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "1710c6ba4c9f3cbd", + "hash_cont_tokens": "0d756ccaae031757" + }, + "truncated": 0, + "non-truncated": 440, + "padded": 440, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-security_studies|5": { + "hashes": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "32a03f1f22a6e103", + "hash_cont_tokens": "b2229bc2cfbf594b" + }, + "truncated": 0, + "non-truncated": 980, + "padded": 980, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-sociology|5": { + "hashes": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "828999f7624cbe7e", + "hash_cont_tokens": "c3a3bdfd177eed5b" + }, + "truncated": 0, + "non-truncated": 804, + "padded": 804, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hashes": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "42054621e718dbee", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-virology|5": { + "hashes": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "6c4f0aa4dc859c04", + "hash_cont_tokens": "af8b3658088cb37f" + }, + "truncated": 0, + "non-truncated": 664, + "padded": 664, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-world_religions|5": { + "hashes": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "6c75d44e092ff24f", + "hash_cont_tokens": "060118bef6de4e0a" + }, + "truncated": 0, + "non-truncated": 684, + "padded": 684, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|truthfulqa:mc|0": { + "hashes": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "2738d7ed7075faa7", + "hash_cont_tokens": "f5da56a132aab151" + }, + "truncated": 0, + "non-truncated": 9996, + "padded": 9996, + "non-padded": 0, + "effective_few_shots": 0.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "d84d18e9a963753d", + "hash_full_prompts": "12b540783521a8e6", + "hash_input_tokens": "5c73a7dce6ccf737", + "hash_cont_tokens": "71d56183130fecbd" + }, + "total_evaluation_time_secondes": "6397.179164171219", + "truncated": 0, + "non-truncated": 111019, + "padded": 110926, + "non-padded": 93, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/boomerchan/magpie-13b/results_2023-10-27T04-34-42.967550.json b/eval-results/boomerchan/magpie-13b/results_2023-10-27T04-34-42.967550.json new file mode 100644 index 0000000000000000000000000000000000000000..135004c542a20ab352cb0caa0c5e373276b97b2e --- /dev/null +++ b/eval-results/boomerchan/magpie-13b/results_2023-10-27T04-34-42.967550.json @@ -0,0 +1,107 @@ +{ + "config_general": { + "model_name": "boomerchan/magpie-13b", + "model_sha": "a58124cdc9f39ccd59d4290a8bdfda93ff3690dc", + 
"model_size": "24.32 GB", + "model_dtype": "torch.float16", + "lighteval_sha": "0f318ecf002208468154899217b3ba7c6ae09374", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|drop|3": { + "em": 0.14272231543624161, + "em_stderr": 0.003582171317651424, + "f1": 0.20778418624161069, + "f1_stderr": 0.0036307604368272656 + }, + "harness|gsm8k|5": { + "acc": 0.14480667172100076, + "acc_stderr": 0.009693234799052706 + }, + "harness|winogrande|5": { + "acc": 0.7647987371744278, + "acc_stderr": 0.011920008163650877 + }, + "all": { + "em": 0.14272231543624161, + "em_stderr": 0.003582171317651424, + "f1": 0.20778418624161069, + "f1_stderr": 0.0036307604368272656, + "acc": 0.4548027044477143, + "acc_stderr": 0.01080662148135179 + } + }, + "versions": { + "harness|drop|3": 1, + "harness|gsm8k|5": 0, + "harness|winogrande|5": 0, + "all": 0 + }, + "config_tasks": { + "harness|drop": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|drop|3": { + "hashes": { + "hash_examples": "1d27416e8324e9a3", + "hash_full_prompts": "a5513ff9a741b385", + "hash_input_tokens": "42076f0efbb50aa6", + "hash_cont_tokens": "bf4962afd80e0807" + }, + "truncated": 3, + "non-truncated": 9533, + "padded": 0, + "non-padded": 9536, + "effective_few_shots": 3.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "bda342e47b5099b2", + "hash_cont_tokens": "753d5337be27f809" + }, + "truncated": 0, + "non-truncated": 1319, + "padded": 0, + "non-padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "c0bedf98cb040854", + "hash_cont_tokens": "f08975ad6f2d5864" + }, + "truncated": 0, + "non-truncated": 2534, + "padded": 2432, + "non-padded": 102, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "9b4d8993161e637d", + "hash_full_prompts": "08215e527b7e60a5", + "hash_input_tokens": "a12f3e3c934bd78b", + "hash_cont_tokens": "f19ba687de4d22bf" + }, + "total_evaluation_time_secondes": "12137.073063850403", + "truncated": 3, + "non-truncated": 13386, + "padded": 2432, + "non-padded": 10957, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/camel-ai/CAMEL-13B-Combined-Data/results_2023-07-19T18-34-56.119658.json b/eval-results/camel-ai/CAMEL-13B-Combined-Data/results_2023-07-19T18-34-56.119658.json new file mode 100644 index 0000000000000000000000000000000000000000..7e5317f3b229f3ab57878a4cd48bb0d02754f875 --- /dev/null +++ b/eval-results/camel-ai/CAMEL-13B-Combined-Data/results_2023-07-19T18-34-56.119658.json @@ -0,0 +1,871 @@ +{ + "results": { + "harness|arc:challenge|25": { + "acc": 0.514505119453925, + "acc_stderr": 0.014605241081370053, + "acc_norm": 0.5563139931740614, + "acc_norm_stderr": 0.01451842182567044 + }, + "harness|hellaswag|10": { + "acc": 0.5948018323043218, + "acc_stderr": 0.004899270310557987, + "acc_norm": 0.7924716191993627, + "acc_norm_stderr": 0.00404708312009885 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.28, + "acc_stderr": 0.04512608598542129, + "acc_norm": 0.28, + "acc_norm_stderr": 0.04512608598542129 + }, + 
"harness|hendrycksTest-anatomy|5": { + "acc": 0.45185185185185184, + "acc_stderr": 0.04299268905480864, + "acc_norm": 0.45185185185185184, + "acc_norm_stderr": 0.04299268905480864 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.4934210526315789, + "acc_stderr": 0.040685900502249704, + "acc_norm": 0.4934210526315789, + "acc_norm_stderr": 0.040685900502249704 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.54, + "acc_stderr": 0.05009082659620332, + "acc_norm": 0.54, + "acc_norm_stderr": 0.05009082659620332 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.5132075471698113, + "acc_stderr": 0.030762134874500482, + "acc_norm": 0.5132075471698113, + "acc_norm_stderr": 0.030762134874500482 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.5138888888888888, + "acc_stderr": 0.041795966175810016, + "acc_norm": 0.5138888888888888, + "acc_norm_stderr": 0.041795966175810016 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.43, + "acc_stderr": 0.049756985195624284, + "acc_norm": 0.43, + "acc_norm_stderr": 0.049756985195624284 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.4, + "acc_stderr": 0.049236596391733084, + "acc_norm": 0.4, + "acc_norm_stderr": 0.049236596391733084 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.33, + "acc_stderr": 0.047258156262526045, + "acc_norm": 0.33, + "acc_norm_stderr": 0.047258156262526045 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.4797687861271676, + "acc_stderr": 0.03809342081273958, + "acc_norm": 0.4797687861271676, + "acc_norm_stderr": 0.03809342081273958 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.28431372549019607, + "acc_stderr": 0.04488482852329017, + "acc_norm": 0.28431372549019607, + "acc_norm_stderr": 0.04488482852329017 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.64, + "acc_stderr": 0.04824181513244218, + "acc_norm": 0.64, + "acc_norm_stderr": 0.04824181513244218 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.3574468085106383, + "acc_stderr": 0.03132941789476425, + "acc_norm": 0.3574468085106383, + "acc_norm_stderr": 0.03132941789476425 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.34210526315789475, + "acc_stderr": 0.04462917535336937, + "acc_norm": 0.34210526315789475, + "acc_norm_stderr": 0.04462917535336937 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.4413793103448276, + "acc_stderr": 0.04137931034482758, + "acc_norm": 0.4413793103448276, + "acc_norm_stderr": 0.04137931034482758 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.2724867724867725, + "acc_stderr": 0.022930973071633345, + "acc_norm": 0.2724867724867725, + "acc_norm_stderr": 0.022930973071633345 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.30158730158730157, + "acc_stderr": 0.041049472699033945, + "acc_norm": 0.30158730158730157, + "acc_norm_stderr": 0.041049472699033945 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.31, + "acc_stderr": 0.04648231987117316, + "acc_norm": 0.31, + "acc_norm_stderr": 0.04648231987117316 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.5548387096774193, + "acc_stderr": 0.028272410186214906, + "acc_norm": 0.5548387096774193, + "acc_norm_stderr": 0.028272410186214906 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.33497536945812806, + "acc_stderr": 0.033208527423483104, + "acc_norm": 0.33497536945812806, + "acc_norm_stderr": 0.033208527423483104 + }, + 
"harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.52, + "acc_stderr": 0.050211673156867795, + "acc_norm": 0.52, + "acc_norm_stderr": 0.050211673156867795 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.6121212121212121, + "acc_stderr": 0.038049136539710114, + "acc_norm": 0.6121212121212121, + "acc_norm_stderr": 0.038049136539710114 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.6565656565656566, + "acc_stderr": 0.033832012232444426, + "acc_norm": 0.6565656565656566, + "acc_norm_stderr": 0.033832012232444426 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.6632124352331606, + "acc_stderr": 0.03410780251836184, + "acc_norm": 0.6632124352331606, + "acc_norm_stderr": 0.03410780251836184 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.47692307692307695, + "acc_stderr": 0.025323990861736125, + "acc_norm": 0.47692307692307695, + "acc_norm_stderr": 0.025323990861736125 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.2740740740740741, + "acc_stderr": 0.027195934804085622, + "acc_norm": 0.2740740740740741, + "acc_norm_stderr": 0.027195934804085622 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.4957983193277311, + "acc_stderr": 0.03247734334448111, + "acc_norm": 0.4957983193277311, + "acc_norm_stderr": 0.03247734334448111 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.2781456953642384, + "acc_stderr": 0.03658603262763743, + "acc_norm": 0.2781456953642384, + "acc_norm_stderr": 0.03658603262763743 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.6825688073394496, + "acc_stderr": 0.019957152198460493, + "acc_norm": 0.6825688073394496, + "acc_norm_stderr": 0.019957152198460493 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.4212962962962963, + "acc_stderr": 0.03367462138896078, + "acc_norm": 0.4212962962962963, + "acc_norm_stderr": 0.03367462138896078 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.6323529411764706, + "acc_stderr": 0.03384132045674119, + "acc_norm": 0.6323529411764706, + "acc_norm_stderr": 0.03384132045674119 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.6708860759493671, + "acc_stderr": 0.030587326294702368, + "acc_norm": 0.6708860759493671, + "acc_norm_stderr": 0.030587326294702368 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.5067264573991032, + "acc_stderr": 0.033554765962343545, + "acc_norm": 0.5067264573991032, + "acc_norm_stderr": 0.033554765962343545 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.6335877862595419, + "acc_stderr": 0.042258754519696365, + "acc_norm": 0.6335877862595419, + "acc_norm_stderr": 0.042258754519696365 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.6776859504132231, + "acc_stderr": 0.04266416363352168, + "acc_norm": 0.6776859504132231, + "acc_norm_stderr": 0.04266416363352168 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.5648148148148148, + "acc_stderr": 0.04792898170907061, + "acc_norm": 0.5648148148148148, + "acc_norm_stderr": 0.04792898170907061 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.5644171779141104, + "acc_stderr": 0.03895632464138937, + "acc_norm": 0.5644171779141104, + "acc_norm_stderr": 0.03895632464138937 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.32142857142857145, + "acc_stderr": 0.0443280405529152, + "acc_norm": 0.32142857142857145, + 
"acc_norm_stderr": 0.0443280405529152 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.6699029126213593, + "acc_stderr": 0.0465614711001235, + "acc_norm": 0.6699029126213593, + "acc_norm_stderr": 0.0465614711001235 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.7606837606837606, + "acc_stderr": 0.027951826808924333, + "acc_norm": 0.7606837606837606, + "acc_norm_stderr": 0.027951826808924333 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.54, + "acc_stderr": 0.05009082659620332, + "acc_norm": 0.54, + "acc_norm_stderr": 0.05009082659620332 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.6615581098339719, + "acc_stderr": 0.01692086958621066, + "acc_norm": 0.6615581098339719, + "acc_norm_stderr": 0.01692086958621066 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.5173410404624278, + "acc_stderr": 0.02690290045866664, + "acc_norm": 0.5173410404624278, + "acc_norm_stderr": 0.02690290045866664 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.24692737430167597, + "acc_stderr": 0.01442229220480884, + "acc_norm": 0.24692737430167597, + "acc_norm_stderr": 0.01442229220480884 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.5718954248366013, + "acc_stderr": 0.028332397483664278, + "acc_norm": 0.5718954248366013, + "acc_norm_stderr": 0.028332397483664278 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.5498392282958199, + "acc_stderr": 0.028256660723360173, + "acc_norm": 0.5498392282958199, + "acc_norm_stderr": 0.028256660723360173 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.5185185185185185, + "acc_stderr": 0.02780165621232366, + "acc_norm": 0.5185185185185185, + "acc_norm_stderr": 0.02780165621232366 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.32978723404255317, + "acc_stderr": 0.0280459469420424, + "acc_norm": 0.32978723404255317, + "acc_norm_stderr": 0.0280459469420424 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.3956975228161669, + "acc_stderr": 0.01248929073544901, + "acc_norm": 0.3956975228161669, + "acc_norm_stderr": 0.01248929073544901 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.5257352941176471, + "acc_stderr": 0.030332578094555033, + "acc_norm": 0.5257352941176471, + "acc_norm_stderr": 0.030332578094555033 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.4526143790849673, + "acc_stderr": 0.020136790918492534, + "acc_norm": 0.4526143790849673, + "acc_norm_stderr": 0.020136790918492534 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.5272727272727272, + "acc_stderr": 0.04782001791380061, + "acc_norm": 0.5272727272727272, + "acc_norm_stderr": 0.04782001791380061 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.5714285714285714, + "acc_stderr": 0.03168091161233882, + "acc_norm": 0.5714285714285714, + "acc_norm_stderr": 0.03168091161233882 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.681592039800995, + "acc_stderr": 0.032941184790540944, + "acc_norm": 0.681592039800995, + "acc_norm_stderr": 0.032941184790540944 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.77, + "acc_stderr": 0.042295258468165065, + "acc_norm": 0.77, + "acc_norm_stderr": 0.042295258468165065 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.41566265060240964, + "acc_stderr": 0.038367221765980515, + "acc_norm": 0.41566265060240964, + "acc_norm_stderr": 0.038367221765980515 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.7192982456140351, + "acc_stderr": 
0.034462962170884265, + "acc_norm": 0.7192982456140351, + "acc_norm_stderr": 0.034462962170884265 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.33414932680538556, + "mc1_stderr": 0.016512530677150538, + "mc2": 0.47421249569474433, + "mc2_stderr": 0.015003774736918588 + }, + "all": { + "acc": 0.499304046136865, + "acc_stderr": 0.0350688129792108, + "acc_norm": 0.5033630064862747, + "acc_norm_stderr": 0.03505289761571659, + "mc1": 0.33414932680538556, + "mc1_stderr": 0.016512530677150538, + "mc2": 0.47421249569474433, + "mc2_stderr": 0.015003774736918588 + } + }, + "versions": { + "harness|arc:challenge|25": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "all": 0 + 
}, + "config": { + "model_name": "camel-ai/CAMEL-13B-Combined-Data", + "model_sha": "6d98f2801f13d89de7978ee9f348a52ea46a24ec", + "model_dtype": "torch.float16", + "lighteval_sha": "43cff840721bd0214adb4e29236a5e2ca1813937", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null + }, + "task_config": { + "harness|arc:challenge": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + 
"harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task" + }, + "hashes": { + "harness|arc:challenge|25": { + "hash_examples": "fb8c51b1872daeda", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "61571bf68d6d89aa", + "hash_cont_tokens": "8210decc6ff6f7df" + }, + "harness|hellaswag|10": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "29906669b1c7054a", + "hash_cont_tokens": "b3b9e9017afa63af" + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "c54ff61ad0273dd7", + "hash_cont_tokens": "50421e30bef398f9" + }, + "harness|hendrycksTest-anatomy|5": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "be31a1e22aef5f90", + "hash_cont_tokens": "f11971a765cb609f" + }, + "harness|hendrycksTest-astronomy|5": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "277a7b1fad566940", + "hash_cont_tokens": "bf30e5d3f48250cb" + }, + "harness|hendrycksTest-business_ethics|5": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "ba552605bc116de5", + "hash_cont_tokens": "bc1dd9b2d995eb61" + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "428c7563d0b98ab9", + "hash_cont_tokens": "890a119624b3b935" + }, + "harness|hendrycksTest-college_biology|5": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "da036601573942e2", + "hash_cont_tokens": "875cde3af7a0ee14" + }, + "harness|hendrycksTest-college_chemistry|5": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "94e0196d6aded13d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "harness|hendrycksTest-college_computer_science|5": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "6e4d0f4a8d36690b", + "hash_cont_tokens": "ffc0fe414cdc4a83" + }, + "harness|hendrycksTest-college_mathematics|5": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "614054d17109a25d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "harness|hendrycksTest-college_medicine|5": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "1d633b3cc0524ba8", + "hash_cont_tokens": "1f88b00d41957d82" + }, + "harness|hendrycksTest-college_physics|5": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "5421d9a1af86cbd4", + "hash_cont_tokens": "f7b8097afc16a47c" + }, + "harness|hendrycksTest-computer_security|5": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": 
"94034c97e85d8f46", + "hash_input_tokens": "5e6b70ecb333cf18", + "hash_cont_tokens": "50421e30bef398f9" + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "c2ef11a87264ceed", + "hash_cont_tokens": "aa0e8bc655f2f641" + }, + "harness|hendrycksTest-econometrics|5": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "ecaccd912a4c3978", + "hash_cont_tokens": "bfb7e3c3c88313f1" + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "1590c84291399be8", + "hash_cont_tokens": "2425a3f084a591ef" + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "3269597f715b0da1", + "hash_cont_tokens": "f52691aef15a407b" + }, + "harness|hendrycksTest-formal_logic|5": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "a2800d20f3ab8d7c", + "hash_cont_tokens": "f515d598d9c21263" + }, + "harness|hendrycksTest-global_facts|5": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "94ed44b3772505ad", + "hash_cont_tokens": "50421e30bef398f9" + }, + "harness|hendrycksTest-high_school_biology|5": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "24423acb928db768", + "hash_cont_tokens": "bd85a4156a3613ee" + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "831ff35c474e5cef", + "hash_cont_tokens": "a95c97af1c14e068" + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "8c34e0f2bda77358", + "hash_cont_tokens": "8abfedef914e33c9" + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "f1f73dd687da18d7", + "hash_cont_tokens": "674fc454bdc5ac93" + }, + "harness|hendrycksTest-high_school_geography|5": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "7c5547c7da5bc793", + "hash_cont_tokens": "03a5012b916274ea" + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "f62991cb6a496b05", + "hash_cont_tokens": "a83effb8f76b7d7c" + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "4cef2aff6e3d59ed", + "hash_cont_tokens": "c583432ad27fcfe0" + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "6e2577ea4082ed2b", + "hash_cont_tokens": "24f5dc613660300b" + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "c5fc9aeb1079c8e4", + "hash_cont_tokens": "f47f041de50333b9" + }, + "harness|hendrycksTest-high_school_physics|5": { + "hash_examples": 
"c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "555fc385cffa84ca", + "hash_cont_tokens": "ba2efcd283e938cc" + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "febd23cbf9973b7f", + "hash_cont_tokens": "942069cd363844d9" + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "424b02981230ee83", + "hash_cont_tokens": "955ed42b6f7fa019" + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "50c9ff438c85a69e", + "hash_cont_tokens": "cdd0b3dc06d933e5" + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "054824cc474caef5", + "hash_cont_tokens": "9a864184946033ac" + }, + "harness|hendrycksTest-human_aging|5": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "541a75f071dcf579", + "hash_cont_tokens": "142a4a8a1138a214" + }, + "harness|hendrycksTest-human_sexuality|5": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "04269e5c5a257dd9", + "hash_cont_tokens": "bc54813e809b796d" + }, + "harness|hendrycksTest-international_law|5": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "d93ba9d9d38e4397", + "hash_cont_tokens": "dc45b45fcda18e5d" + }, + "harness|hendrycksTest-jurisprudence|5": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "9eeaccd2698b4f5a", + "hash_cont_tokens": "e3a8cd951b6e3469" + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "b4f08f544f2b7576", + "hash_cont_tokens": "1e80dbd30f6453d5" + }, + "harness|hendrycksTest-machine_learning|5": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "900c2a51f1174b9f", + "hash_cont_tokens": "9b37da7777378ca9" + }, + "harness|hendrycksTest-management|5": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "6b36efb4689c6eca", + "hash_cont_tokens": "a01d6d39a83c4597" + }, + "harness|hendrycksTest-marketing|5": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "2aaac78a0cfed47a", + "hash_cont_tokens": "6aeaed4d823c98aa" + }, + "harness|hendrycksTest-medical_genetics|5": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "886ca823b41c094a", + "hash_cont_tokens": "50421e30bef398f9" + }, + "harness|hendrycksTest-miscellaneous|5": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "72fd71de7675e7d0", + "hash_cont_tokens": "9b0ab02a64603081" + }, + "harness|hendrycksTest-moral_disputes|5": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "f3ca0dd8e7a1eb09", + "hash_cont_tokens": "8badf768f7b0467a" + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": 
"08f4ceba3131a068", + "hash_input_tokens": "3e793631e951f23c", + "hash_cont_tokens": "32ae620376b2bbba" + }, + "harness|hendrycksTest-nutrition|5": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "59753c2144ea93af", + "hash_cont_tokens": "3071def75bacc404" + }, + "harness|hendrycksTest-philosophy|5": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "bd8d3dbed15a8c34", + "hash_cont_tokens": "9f6ff69d23a48783" + }, + "harness|hendrycksTest-prehistory|5": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "3573cd87facbb7c5", + "hash_cont_tokens": "de469d2b981e32a3" + }, + "harness|hendrycksTest-professional_accounting|5": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "17e721bc1a7cbb47", + "hash_cont_tokens": "c46f74d2dfc7b13b" + }, + "harness|hendrycksTest-professional_law|5": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "9178e10bd0763ec4", + "hash_cont_tokens": "2e590029ef41fbcd" + }, + "harness|hendrycksTest-professional_medicine|5": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "f5a22012a54f70ea", + "hash_cont_tokens": "fe35cfa9c6ca802e" + }, + "harness|hendrycksTest-professional_psychology|5": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "0dfb73a8eb3f692c", + "hash_cont_tokens": "f020fbddf72c8652" + }, + "harness|hendrycksTest-public_relations|5": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "1710c6ba4c9f3cbd", + "hash_cont_tokens": "568f585a259965c1" + }, + "harness|hendrycksTest-security_studies|5": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "d49711415961ced7", + "hash_cont_tokens": "cc6fd7cccd64cd5d" + }, + "harness|hendrycksTest-sociology|5": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "828999f7624cbe7e", + "hash_cont_tokens": "c3a3bdfd177eed5b" + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "42054621e718dbee", + "hash_cont_tokens": "2568d0e8e36fa959" + }, + "harness|hendrycksTest-virology|5": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "6c4f0aa4dc859c04", + "hash_cont_tokens": "926cf60b0891f374" + }, + "harness|hendrycksTest-world_religions|5": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "6c75d44e092ff24f", + "hash_cont_tokens": "c525a5de974c1ea3" + }, + "harness|truthfulqa:mc|0": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "2738d7ed7075faa7", + "hash_cont_tokens": "c014154380b74b9e" + } + } +} \ No newline at end of file diff --git a/eval-results/camel-ai/CAMEL-13B-Combined-Data/results_2023-09-23T12-27-31.812773.json b/eval-results/camel-ai/CAMEL-13B-Combined-Data/results_2023-09-23T12-27-31.812773.json new file mode 100644 index 0000000000000000000000000000000000000000..42dd890647e0cdc68a14cc0533cd2ab83b73ae33 --- /dev/null +++ 
b/eval-results/camel-ai/CAMEL-13B-Combined-Data/results_2023-09-23T12-27-31.812773.json @@ -0,0 +1,107 @@ +{ + "config_general": { + "model_name": "camel-ai/CAMEL-13B-Combined-Data", + "model_sha": "6d98f2801f13d89de7978ee9f348a52ea46a24ec", + "model_size": "24.28 GB", + "model_dtype": "torch.float16", + "lighteval_sha": "0f318ecf002208468154899217b3ba7c6ae09374", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|drop|3": { + "em": 0.01604446308724832, + "em_stderr": 0.0012867375725646064, + "f1": 0.07856963087248349, + "f1_stderr": 0.0018370090964164025 + }, + "harness|gsm8k|5": { + "acc": 0.0712661106899166, + "acc_stderr": 0.0070864621279544925 + }, + "harness|winogrande|5": { + "acc": 0.7545382794001578, + "acc_stderr": 0.012095272937183639 + }, + "all": { + "em": 0.01604446308724832, + "em_stderr": 0.0012867375725646064, + "f1": 0.07856963087248349, + "f1_stderr": 0.0018370090964164025, + "acc": 0.4129021950450372, + "acc_stderr": 0.009590867532569065 + } + }, + "versions": { + "harness|drop|3": 1, + "harness|gsm8k|5": 0, + "harness|winogrande|5": 0, + "all": 0 + }, + "config_tasks": { + "harness|drop": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|drop|3": { + "hashes": { + "hash_examples": "1d27416e8324e9a3", + "hash_full_prompts": "a5513ff9a741b385", + "hash_input_tokens": "61b608e0b5ceed76", + "hash_cont_tokens": "aa3616e1443a8647" + }, + "truncated": 1263, + "non-truncated": 8273, + "padded": 0, + "non-padded": 9536, + "effective_few_shots": 3.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "bda342e47b5099b2", + "hash_cont_tokens": "8b7fa789de023396" + }, + "truncated": 0, + "non-truncated": 1319, + "padded": 0, + "non-padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "c0bedf98cb040854", + "hash_cont_tokens": "f08975ad6f2d5864" + }, + "truncated": 0, + "non-truncated": 2534, + "padded": 2432, + "non-padded": 102, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "9b4d8993161e637d", + "hash_full_prompts": "08215e527b7e60a5", + "hash_input_tokens": "80afe720f936f8d2", + "hash_cont_tokens": "7ff9cfb353b949f3" + }, + "total_evaluation_time_secondes": "38934.43047094345", + "truncated": 1263, + "non-truncated": 12126, + "padded": 2432, + "non-padded": 10957, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/camel-ai/CAMEL-13B-Role-Playing-Data/results_2023-07-19T18-40-55.376784.json b/eval-results/camel-ai/CAMEL-13B-Role-Playing-Data/results_2023-07-19T18-40-55.376784.json new file mode 100644 index 0000000000000000000000000000000000000000..66033ddc82110f41285f016de318fbd2da43b2c4 --- /dev/null +++ b/eval-results/camel-ai/CAMEL-13B-Role-Playing-Data/results_2023-07-19T18-40-55.376784.json @@ -0,0 +1,871 @@ +{ + "results": { + "harness|arc:challenge|25": { + "acc": 0.5034129692832765, + "acc_stderr": 0.014611050403244077, + "acc_norm": 0.5494880546075085, + "acc_norm_stderr": 0.014539646098471627 + }, + "harness|hellaswag|10": { + "acc": 0.5967934674367655, + "acc_stderr": 
0.004895390341445624, + "acc_norm": 0.7924716191993627, + "acc_norm_stderr": 0.004047083120098848 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.3, + "acc_stderr": 0.046056618647183814, + "acc_norm": 0.3, + "acc_norm_stderr": 0.046056618647183814 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.43703703703703706, + "acc_stderr": 0.04284958639753399, + "acc_norm": 0.43703703703703706, + "acc_norm_stderr": 0.04284958639753399 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.4473684210526316, + "acc_stderr": 0.0404633688397825, + "acc_norm": 0.4473684210526316, + "acc_norm_stderr": 0.0404633688397825 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.43, + "acc_stderr": 0.049756985195624284, + "acc_norm": 0.43, + "acc_norm_stderr": 0.049756985195624284 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.4867924528301887, + "acc_stderr": 0.030762134874500476, + "acc_norm": 0.4867924528301887, + "acc_norm_stderr": 0.030762134874500476 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.4166666666666667, + "acc_stderr": 0.04122728707651282, + "acc_norm": 0.4166666666666667, + "acc_norm_stderr": 0.04122728707651282 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.32, + "acc_stderr": 0.04688261722621504, + "acc_norm": 0.32, + "acc_norm_stderr": 0.04688261722621504 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.32, + "acc_stderr": 0.04688261722621504, + "acc_norm": 0.32, + "acc_norm_stderr": 0.04688261722621504 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.32, + "acc_stderr": 0.04688261722621504, + "acc_norm": 0.32, + "acc_norm_stderr": 0.04688261722621504 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.37572254335260113, + "acc_stderr": 0.036928207672648664, + "acc_norm": 0.37572254335260113, + "acc_norm_stderr": 0.036928207672648664 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.21568627450980393, + "acc_stderr": 0.04092563958237655, + "acc_norm": 0.21568627450980393, + "acc_norm_stderr": 0.04092563958237655 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.58, + "acc_stderr": 0.049604496374885836, + "acc_norm": 0.58, + "acc_norm_stderr": 0.049604496374885836 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.37872340425531914, + "acc_stderr": 0.03170995606040655, + "acc_norm": 0.37872340425531914, + "acc_norm_stderr": 0.03170995606040655 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.24561403508771928, + "acc_stderr": 0.04049339297748141, + "acc_norm": 0.24561403508771928, + "acc_norm_stderr": 0.04049339297748141 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.4, + "acc_stderr": 0.04082482904638628, + "acc_norm": 0.4, + "acc_norm_stderr": 0.04082482904638628 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.25132275132275134, + "acc_stderr": 0.022340482339643895, + "acc_norm": 0.25132275132275134, + "acc_norm_stderr": 0.022340482339643895 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.29365079365079366, + "acc_stderr": 0.04073524322147125, + "acc_norm": 0.29365079365079366, + "acc_norm_stderr": 0.04073524322147125 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.37, + "acc_stderr": 0.048523658709391, + "acc_norm": 0.37, + "acc_norm_stderr": 0.048523658709391 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.4935483870967742, + "acc_stderr": 0.02844163823354051, + "acc_norm": 0.4935483870967742, + 
"acc_norm_stderr": 0.02844163823354051 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.29064039408866993, + "acc_stderr": 0.0319474007226554, + "acc_norm": 0.29064039408866993, + "acc_norm_stderr": 0.0319474007226554 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.5, + "acc_stderr": 0.050251890762960605, + "acc_norm": 0.5, + "acc_norm_stderr": 0.050251890762960605 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.6121212121212121, + "acc_stderr": 0.038049136539710114, + "acc_norm": 0.6121212121212121, + "acc_norm_stderr": 0.038049136539710114 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.5454545454545454, + "acc_stderr": 0.03547601494006937, + "acc_norm": 0.5454545454545454, + "acc_norm_stderr": 0.03547601494006937 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.6476683937823834, + "acc_stderr": 0.03447478286414357, + "acc_norm": 0.6476683937823834, + "acc_norm_stderr": 0.03447478286414357 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.41025641025641024, + "acc_stderr": 0.02493931390694079, + "acc_norm": 0.41025641025641024, + "acc_norm_stderr": 0.02493931390694079 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.2518518518518518, + "acc_stderr": 0.026466117538959916, + "acc_norm": 0.2518518518518518, + "acc_norm_stderr": 0.026466117538959916 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.42857142857142855, + "acc_stderr": 0.032145368597886394, + "acc_norm": 0.42857142857142855, + "acc_norm_stderr": 0.032145368597886394 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.2781456953642384, + "acc_stderr": 0.036586032627637426, + "acc_norm": 0.2781456953642384, + "acc_norm_stderr": 0.036586032627637426 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.6128440366972477, + "acc_stderr": 0.02088423199264345, + "acc_norm": 0.6128440366972477, + "acc_norm_stderr": 0.02088423199264345 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.27314814814814814, + "acc_stderr": 0.030388051301678116, + "acc_norm": 0.27314814814814814, + "acc_norm_stderr": 0.030388051301678116 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.6519607843137255, + "acc_stderr": 0.03343311240488418, + "acc_norm": 0.6519607843137255, + "acc_norm_stderr": 0.03343311240488418 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.679324894514768, + "acc_stderr": 0.030381931949990407, + "acc_norm": 0.679324894514768, + "acc_norm_stderr": 0.030381931949990407 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.5739910313901345, + "acc_stderr": 0.033188332862172806, + "acc_norm": 0.5739910313901345, + "acc_norm_stderr": 0.033188332862172806 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.5572519083969466, + "acc_stderr": 0.0435644720266507, + "acc_norm": 0.5572519083969466, + "acc_norm_stderr": 0.0435644720266507 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.6611570247933884, + "acc_stderr": 0.0432076780753667, + "acc_norm": 0.6611570247933884, + "acc_norm_stderr": 0.0432076780753667 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.49074074074074076, + "acc_stderr": 0.04832853553437055, + "acc_norm": 0.49074074074074076, + "acc_norm_stderr": 0.04832853553437055 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.49079754601226994, + "acc_stderr": 0.039277056007874414, + 
"acc_norm": 0.49079754601226994, + "acc_norm_stderr": 0.039277056007874414 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.45535714285714285, + "acc_stderr": 0.047268355537191, + "acc_norm": 0.45535714285714285, + "acc_norm_stderr": 0.047268355537191 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.6310679611650486, + "acc_stderr": 0.0477761518115674, + "acc_norm": 0.6310679611650486, + "acc_norm_stderr": 0.0477761518115674 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.7435897435897436, + "acc_stderr": 0.02860595370200425, + "acc_norm": 0.7435897435897436, + "acc_norm_stderr": 0.02860595370200425 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.54, + "acc_stderr": 0.05009082659620332, + "acc_norm": 0.54, + "acc_norm_stderr": 0.05009082659620332 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.6743295019157088, + "acc_stderr": 0.016757989458549675, + "acc_norm": 0.6743295019157088, + "acc_norm_stderr": 0.016757989458549675 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.49421965317919075, + "acc_stderr": 0.026917296179149116, + "acc_norm": 0.49421965317919075, + "acc_norm_stderr": 0.026917296179149116 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.23798882681564246, + "acc_stderr": 0.014242630070574884, + "acc_norm": 0.23798882681564246, + "acc_norm_stderr": 0.014242630070574884 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.5228758169934641, + "acc_stderr": 0.028599936776089782, + "acc_norm": 0.5228758169934641, + "acc_norm_stderr": 0.028599936776089782 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.5562700964630225, + "acc_stderr": 0.028217683556652315, + "acc_norm": 0.5562700964630225, + "acc_norm_stderr": 0.028217683556652315 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.5216049382716049, + "acc_stderr": 0.02779476010500874, + "acc_norm": 0.5216049382716049, + "acc_norm_stderr": 0.02779476010500874 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.32269503546099293, + "acc_stderr": 0.027889139300534785, + "acc_norm": 0.32269503546099293, + "acc_norm_stderr": 0.027889139300534785 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.3833116036505867, + "acc_stderr": 0.012417603662901188, + "acc_norm": 0.3833116036505867, + "acc_norm_stderr": 0.012417603662901188 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.4632352941176471, + "acc_stderr": 0.030290619180485697, + "acc_norm": 0.4632352941176471, + "acc_norm_stderr": 0.030290619180485697 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.46568627450980393, + "acc_stderr": 0.02018014484330729, + "acc_norm": 0.46568627450980393, + "acc_norm_stderr": 0.02018014484330729 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.5454545454545454, + "acc_stderr": 0.04769300568972744, + "acc_norm": 0.5454545454545454, + "acc_norm_stderr": 0.04769300568972744 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.5428571428571428, + "acc_stderr": 0.031891418324213966, + "acc_norm": 0.5428571428571428, + "acc_norm_stderr": 0.031891418324213966 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.6218905472636815, + "acc_stderr": 0.034288678487786564, + "acc_norm": 0.6218905472636815, + "acc_norm_stderr": 0.034288678487786564 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.73, + "acc_stderr": 0.044619604333847394, + "acc_norm": 0.73, + "acc_norm_stderr": 0.044619604333847394 + }, + "harness|hendrycksTest-virology|5": 
{ + "acc": 0.40963855421686746, + "acc_stderr": 0.038284011150790206, + "acc_norm": 0.40963855421686746, + "acc_norm_stderr": 0.038284011150790206 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.6666666666666666, + "acc_stderr": 0.036155076303109365, + "acc_norm": 0.6666666666666666, + "acc_norm_stderr": 0.036155076303109365 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.33047735618115054, + "mc1_stderr": 0.0164667696136983, + "mc2": 0.46348950664321525, + "mc2_stderr": 0.01509009372110883 + }, + "all": { + "acc": 0.4689322812801516, + "acc_stderr": 0.03499607056608464, + "acc_norm": 0.4730297937731487, + "acc_norm_stderr": 0.03498048223513347, + "mc1": 0.33047735618115054, + "mc1_stderr": 0.0164667696136983, + "mc2": 0.46348950664321525, + "mc2_stderr": 0.01509009372110883 + } + }, + "versions": { + "harness|arc:challenge|25": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + 
"harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "all": 0 + }, + "config": { + "model_name": "camel-ai/CAMEL-13B-Role-Playing-Data", + "model_sha": "762ecb0d85572c8f8bcbca06d27f7f64a4d74615", + "model_dtype": "torch.float16", + "lighteval_sha": "43cff840721bd0214adb4e29236a5e2ca1813937", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null + }, + "task_config": { + "harness|arc:challenge": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + 
"harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task" + }, + "hashes": { + "harness|arc:challenge|25": { + "hash_examples": "fb8c51b1872daeda", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "61571bf68d6d89aa", + "hash_cont_tokens": "8210decc6ff6f7df" + }, + "harness|hellaswag|10": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "29906669b1c7054a", + "hash_cont_tokens": "b3b9e9017afa63af" + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "c54ff61ad0273dd7", + "hash_cont_tokens": "50421e30bef398f9" + }, + "harness|hendrycksTest-anatomy|5": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "be31a1e22aef5f90", + "hash_cont_tokens": "f11971a765cb609f" + }, + "harness|hendrycksTest-astronomy|5": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "277a7b1fad566940", + "hash_cont_tokens": "bf30e5d3f48250cb" + }, + "harness|hendrycksTest-business_ethics|5": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "ba552605bc116de5", + "hash_cont_tokens": "bc1dd9b2d995eb61" + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "428c7563d0b98ab9", + "hash_cont_tokens": "890a119624b3b935" + }, + "harness|hendrycksTest-college_biology|5": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "da036601573942e2", + "hash_cont_tokens": "875cde3af7a0ee14" + }, + "harness|hendrycksTest-college_chemistry|5": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "94e0196d6aded13d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "harness|hendrycksTest-college_computer_science|5": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "6e4d0f4a8d36690b", + "hash_cont_tokens": "ffc0fe414cdc4a83" + }, + "harness|hendrycksTest-college_mathematics|5": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "614054d17109a25d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "harness|hendrycksTest-college_medicine|5": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "1d633b3cc0524ba8", + "hash_cont_tokens": "1f88b00d41957d82" + }, + 
"harness|hendrycksTest-college_physics|5": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "5421d9a1af86cbd4", + "hash_cont_tokens": "f7b8097afc16a47c" + }, + "harness|hendrycksTest-computer_security|5": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "5e6b70ecb333cf18", + "hash_cont_tokens": "50421e30bef398f9" + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "c2ef11a87264ceed", + "hash_cont_tokens": "aa0e8bc655f2f641" + }, + "harness|hendrycksTest-econometrics|5": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "ecaccd912a4c3978", + "hash_cont_tokens": "bfb7e3c3c88313f1" + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "1590c84291399be8", + "hash_cont_tokens": "2425a3f084a591ef" + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "3269597f715b0da1", + "hash_cont_tokens": "f52691aef15a407b" + }, + "harness|hendrycksTest-formal_logic|5": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "a2800d20f3ab8d7c", + "hash_cont_tokens": "f515d598d9c21263" + }, + "harness|hendrycksTest-global_facts|5": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "94ed44b3772505ad", + "hash_cont_tokens": "50421e30bef398f9" + }, + "harness|hendrycksTest-high_school_biology|5": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "24423acb928db768", + "hash_cont_tokens": "bd85a4156a3613ee" + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "831ff35c474e5cef", + "hash_cont_tokens": "a95c97af1c14e068" + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "8c34e0f2bda77358", + "hash_cont_tokens": "8abfedef914e33c9" + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "f1f73dd687da18d7", + "hash_cont_tokens": "674fc454bdc5ac93" + }, + "harness|hendrycksTest-high_school_geography|5": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "7c5547c7da5bc793", + "hash_cont_tokens": "03a5012b916274ea" + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "f62991cb6a496b05", + "hash_cont_tokens": "a83effb8f76b7d7c" + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "4cef2aff6e3d59ed", + "hash_cont_tokens": "c583432ad27fcfe0" + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "6e2577ea4082ed2b", + "hash_cont_tokens": 
"24f5dc613660300b" + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "c5fc9aeb1079c8e4", + "hash_cont_tokens": "f47f041de50333b9" + }, + "harness|hendrycksTest-high_school_physics|5": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "555fc385cffa84ca", + "hash_cont_tokens": "ba2efcd283e938cc" + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "febd23cbf9973b7f", + "hash_cont_tokens": "942069cd363844d9" + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "424b02981230ee83", + "hash_cont_tokens": "955ed42b6f7fa019" + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "50c9ff438c85a69e", + "hash_cont_tokens": "cdd0b3dc06d933e5" + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "054824cc474caef5", + "hash_cont_tokens": "9a864184946033ac" + }, + "harness|hendrycksTest-human_aging|5": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "541a75f071dcf579", + "hash_cont_tokens": "142a4a8a1138a214" + }, + "harness|hendrycksTest-human_sexuality|5": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "04269e5c5a257dd9", + "hash_cont_tokens": "bc54813e809b796d" + }, + "harness|hendrycksTest-international_law|5": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "d93ba9d9d38e4397", + "hash_cont_tokens": "dc45b45fcda18e5d" + }, + "harness|hendrycksTest-jurisprudence|5": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "9eeaccd2698b4f5a", + "hash_cont_tokens": "e3a8cd951b6e3469" + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "b4f08f544f2b7576", + "hash_cont_tokens": "1e80dbd30f6453d5" + }, + "harness|hendrycksTest-machine_learning|5": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "900c2a51f1174b9f", + "hash_cont_tokens": "9b37da7777378ca9" + }, + "harness|hendrycksTest-management|5": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "6b36efb4689c6eca", + "hash_cont_tokens": "a01d6d39a83c4597" + }, + "harness|hendrycksTest-marketing|5": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "2aaac78a0cfed47a", + "hash_cont_tokens": "6aeaed4d823c98aa" + }, + "harness|hendrycksTest-medical_genetics|5": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "886ca823b41c094a", + "hash_cont_tokens": "50421e30bef398f9" + }, + "harness|hendrycksTest-miscellaneous|5": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "72fd71de7675e7d0", + "hash_cont_tokens": "9b0ab02a64603081" + }, + 
"harness|hendrycksTest-moral_disputes|5": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "f3ca0dd8e7a1eb09", + "hash_cont_tokens": "8badf768f7b0467a" + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "3e793631e951f23c", + "hash_cont_tokens": "32ae620376b2bbba" + }, + "harness|hendrycksTest-nutrition|5": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "59753c2144ea93af", + "hash_cont_tokens": "3071def75bacc404" + }, + "harness|hendrycksTest-philosophy|5": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "bd8d3dbed15a8c34", + "hash_cont_tokens": "9f6ff69d23a48783" + }, + "harness|hendrycksTest-prehistory|5": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "3573cd87facbb7c5", + "hash_cont_tokens": "de469d2b981e32a3" + }, + "harness|hendrycksTest-professional_accounting|5": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "17e721bc1a7cbb47", + "hash_cont_tokens": "c46f74d2dfc7b13b" + }, + "harness|hendrycksTest-professional_law|5": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "9178e10bd0763ec4", + "hash_cont_tokens": "2e590029ef41fbcd" + }, + "harness|hendrycksTest-professional_medicine|5": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "f5a22012a54f70ea", + "hash_cont_tokens": "fe35cfa9c6ca802e" + }, + "harness|hendrycksTest-professional_psychology|5": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "0dfb73a8eb3f692c", + "hash_cont_tokens": "f020fbddf72c8652" + }, + "harness|hendrycksTest-public_relations|5": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "1710c6ba4c9f3cbd", + "hash_cont_tokens": "568f585a259965c1" + }, + "harness|hendrycksTest-security_studies|5": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "d49711415961ced7", + "hash_cont_tokens": "cc6fd7cccd64cd5d" + }, + "harness|hendrycksTest-sociology|5": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "828999f7624cbe7e", + "hash_cont_tokens": "c3a3bdfd177eed5b" + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "42054621e718dbee", + "hash_cont_tokens": "2568d0e8e36fa959" + }, + "harness|hendrycksTest-virology|5": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "6c4f0aa4dc859c04", + "hash_cont_tokens": "926cf60b0891f374" + }, + "harness|hendrycksTest-world_religions|5": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "6c75d44e092ff24f", + "hash_cont_tokens": "c525a5de974c1ea3" + }, + "harness|truthfulqa:mc|0": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "2738d7ed7075faa7", + "hash_cont_tokens": "c014154380b74b9e" + } + } +} \ No newline at end of file diff --git 
a/eval-results/camel-ai/CAMEL-13B-Role-Playing-Data/results_2023-10-25T02-33-54.730423.json b/eval-results/camel-ai/CAMEL-13B-Role-Playing-Data/results_2023-10-25T02-33-54.730423.json new file mode 100644 index 0000000000000000000000000000000000000000..5d5504dcb2c9a0e99f4df70459f7b37894cf747f --- /dev/null +++ b/eval-results/camel-ai/CAMEL-13B-Role-Playing-Data/results_2023-10-25T02-33-54.730423.json @@ -0,0 +1,107 @@ +{ + "config_general": { + "model_name": "camel-ai/CAMEL-13B-Role-Playing-Data", + "model_sha": "762ecb0d85572c8f8bcbca06d27f7f64a4d74615", + "model_size": "24.28 GB", + "model_dtype": "torch.float16", + "lighteval_sha": "0f318ecf002208468154899217b3ba7c6ae09374", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|drop|3": { + "em": 0.004404362416107382, + "em_stderr": 0.000678145162047963, + "f1": 0.06661703020134248, + "f1_stderr": 0.001491591221438747 + }, + "harness|gsm8k|5": { + "acc": 0.07354056103108415, + "acc_stderr": 0.007189835754365264 + }, + "harness|winogrande|5": { + "acc": 0.7403314917127072, + "acc_stderr": 0.012322700705552667 + }, + "all": { + "em": 0.004404362416107382, + "em_stderr": 0.000678145162047963, + "f1": 0.06661703020134248, + "f1_stderr": 0.001491591221438747, + "acc": 0.4069360263718957, + "acc_stderr": 0.009756268229958965 + } + }, + "versions": { + "harness|drop|3": 1, + "harness|gsm8k|5": 0, + "harness|winogrande|5": 0, + "all": 0 + }, + "config_tasks": { + "harness|drop": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|drop|3": { + "hashes": { + "hash_examples": "1d27416e8324e9a3", + "hash_full_prompts": "a5513ff9a741b385", + "hash_input_tokens": "61b608e0b5ceed76", + "hash_cont_tokens": "d1060c08b5b97756" + }, + "truncated": 1263, + "non-truncated": 8273, + "padded": 0, + "non-padded": 9536, + "effective_few_shots": 3.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "bda342e47b5099b2", + "hash_cont_tokens": "c266d08723dd85a3" + }, + "truncated": 0, + "non-truncated": 1319, + "padded": 0, + "non-padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "c0bedf98cb040854", + "hash_cont_tokens": "f08975ad6f2d5864" + }, + "truncated": 0, + "non-truncated": 2534, + "padded": 2432, + "non-padded": 102, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "9b4d8993161e637d", + "hash_full_prompts": "08215e527b7e60a5", + "hash_input_tokens": "80afe720f936f8d2", + "hash_cont_tokens": "6e724f0b4a2cd976" + }, + "total_evaluation_time_secondes": "37601.45416927338", + "truncated": 1263, + "non-truncated": 12126, + "padded": 2432, + "non-padded": 10957, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/camel-ai/CAMEL-33B-Combined-Data/results_2023-08-01T13-41-43.051311.json b/eval-results/camel-ai/CAMEL-33B-Combined-Data/results_2023-08-01T13-41-43.051311.json new file mode 100644 index 0000000000000000000000000000000000000000..8a2a2e0fb4b031db2fe8c19da0a52ec7977cd8cc --- /dev/null +++ 
b/eval-results/camel-ai/CAMEL-33B-Combined-Data/results_2023-08-01T13-41-43.051311.json @@ -0,0 +1,1365 @@ +{ + "results": { + "harness|arc:challenge|25": { + "acc": 0.5947098976109215, + "acc_stderr": 0.014346869060229315, + "acc_norm": 0.6296928327645052, + "acc_norm_stderr": 0.01411129875167495 + }, + "harness|hellaswag|10": { + "acc": 0.6410077673770165, + "acc_stderr": 0.004787245377967104, + "acc_norm": 0.8382792272455686, + "acc_norm_stderr": 0.00367441979935367 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.34, + "acc_stderr": 0.047609522856952365, + "acc_norm": 0.34, + "acc_norm_stderr": 0.047609522856952365 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.5185185185185185, + "acc_stderr": 0.043163785995113245, + "acc_norm": 0.5185185185185185, + "acc_norm_stderr": 0.043163785995113245 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.5855263157894737, + "acc_stderr": 0.04008973785779206, + "acc_norm": 0.5855263157894737, + "acc_norm_stderr": 0.04008973785779206 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.63, + "acc_stderr": 0.04852365870939099, + "acc_norm": 0.63, + "acc_norm_stderr": 0.04852365870939099 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.5886792452830188, + "acc_stderr": 0.03028500925900979, + "acc_norm": 0.5886792452830188, + "acc_norm_stderr": 0.03028500925900979 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.6041666666666666, + "acc_stderr": 0.04089465449325582, + "acc_norm": 0.6041666666666666, + "acc_norm_stderr": 0.04089465449325582 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.44, + "acc_stderr": 0.04988876515698589, + "acc_norm": 0.44, + "acc_norm_stderr": 0.04988876515698589 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.48, + "acc_stderr": 0.050211673156867795, + "acc_norm": 0.48, + "acc_norm_stderr": 0.050211673156867795 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.41, + "acc_stderr": 0.04943110704237102, + "acc_norm": 0.41, + "acc_norm_stderr": 0.04943110704237102 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.5202312138728323, + "acc_stderr": 0.03809342081273957, + "acc_norm": 0.5202312138728323, + "acc_norm_stderr": 0.03809342081273957 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.3235294117647059, + "acc_stderr": 0.046550104113196177, + "acc_norm": 0.3235294117647059, + "acc_norm_stderr": 0.046550104113196177 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.7, + "acc_stderr": 0.046056618647183814, + "acc_norm": 0.7, + "acc_norm_stderr": 0.046056618647183814 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.48936170212765956, + "acc_stderr": 0.03267862331014063, + "acc_norm": 0.48936170212765956, + "acc_norm_stderr": 0.03267862331014063 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.3508771929824561, + "acc_stderr": 0.044895393502707, + "acc_norm": 0.3508771929824561, + "acc_norm_stderr": 0.044895393502707 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.503448275862069, + "acc_stderr": 0.04166567577101579, + "acc_norm": 0.503448275862069, + "acc_norm_stderr": 0.04166567577101579 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.35978835978835977, + "acc_stderr": 0.024718075944129284, + "acc_norm": 0.35978835978835977, + "acc_norm_stderr": 0.024718075944129284 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.35714285714285715, + "acc_stderr": 0.04285714285714281, + "acc_norm": 
0.35714285714285715, + "acc_norm_stderr": 0.04285714285714281 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.37, + "acc_stderr": 0.04852365870939099, + "acc_norm": 0.37, + "acc_norm_stderr": 0.04852365870939099 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.6903225806451613, + "acc_stderr": 0.026302774983517418, + "acc_norm": 0.6903225806451613, + "acc_norm_stderr": 0.026302774983517418 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.45320197044334976, + "acc_stderr": 0.03502544650845872, + "acc_norm": 0.45320197044334976, + "acc_norm_stderr": 0.03502544650845872 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.6, + "acc_stderr": 0.049236596391733084, + "acc_norm": 0.6, + "acc_norm_stderr": 0.049236596391733084 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.7272727272727273, + "acc_stderr": 0.0347769116216366, + "acc_norm": 0.7272727272727273, + "acc_norm_stderr": 0.0347769116216366 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.7323232323232324, + "acc_stderr": 0.03154449888270285, + "acc_norm": 0.7323232323232324, + "acc_norm_stderr": 0.03154449888270285 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.8393782383419689, + "acc_stderr": 0.02649905770139746, + "acc_norm": 0.8393782383419689, + "acc_norm_stderr": 0.02649905770139746 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.5615384615384615, + "acc_stderr": 0.02515826601686858, + "acc_norm": 0.5615384615384615, + "acc_norm_stderr": 0.02515826601686858 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.28888888888888886, + "acc_stderr": 0.02763490726417854, + "acc_norm": 0.28888888888888886, + "acc_norm_stderr": 0.02763490726417854 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.592436974789916, + "acc_stderr": 0.031918633744784645, + "acc_norm": 0.592436974789916, + "acc_norm_stderr": 0.031918633744784645 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.31125827814569534, + "acc_stderr": 0.03780445850526732, + "acc_norm": 0.31125827814569534, + "acc_norm_stderr": 0.03780445850526732 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.7743119266055046, + "acc_stderr": 0.017923087667803064, + "acc_norm": 0.7743119266055046, + "acc_norm_stderr": 0.017923087667803064 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.4675925925925926, + "acc_stderr": 0.03402801581358966, + "acc_norm": 0.4675925925925926, + "acc_norm_stderr": 0.03402801581358966 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.8088235294117647, + "acc_stderr": 0.02759917430064077, + "acc_norm": 0.8088235294117647, + "acc_norm_stderr": 0.02759917430064077 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.810126582278481, + "acc_stderr": 0.02553010046023349, + "acc_norm": 0.810126582278481, + "acc_norm_stderr": 0.02553010046023349 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.6636771300448431, + "acc_stderr": 0.031708824268455005, + "acc_norm": 0.6636771300448431, + "acc_norm_stderr": 0.031708824268455005 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.6564885496183206, + "acc_stderr": 0.041649760719448786, + "acc_norm": 0.6564885496183206, + "acc_norm_stderr": 0.041649760719448786 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.743801652892562, + "acc_stderr": 0.039849796533028725, + "acc_norm": 
0.743801652892562, + "acc_norm_stderr": 0.039849796533028725 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.7314814814814815, + "acc_stderr": 0.042844679680521934, + "acc_norm": 0.7314814814814815, + "acc_norm_stderr": 0.042844679680521934 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.7055214723926381, + "acc_stderr": 0.03581165790474082, + "acc_norm": 0.7055214723926381, + "acc_norm_stderr": 0.03581165790474082 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.39285714285714285, + "acc_stderr": 0.04635550135609976, + "acc_norm": 0.39285714285714285, + "acc_norm_stderr": 0.04635550135609976 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.7864077669902912, + "acc_stderr": 0.040580420156460344, + "acc_norm": 0.7864077669902912, + "acc_norm_stderr": 0.040580420156460344 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.8632478632478633, + "acc_stderr": 0.022509033937077802, + "acc_norm": 0.8632478632478633, + "acc_norm_stderr": 0.022509033937077802 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.7, + "acc_stderr": 0.046056618647183814, + "acc_norm": 0.7, + "acc_norm_stderr": 0.046056618647183814 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.7713920817369093, + "acc_stderr": 0.015016884698539882, + "acc_norm": 0.7713920817369093, + "acc_norm_stderr": 0.015016884698539882 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.6445086705202312, + "acc_stderr": 0.02577029208297724, + "acc_norm": 0.6445086705202312, + "acc_norm_stderr": 0.02577029208297724 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.4122905027932961, + "acc_stderr": 0.016463200238114525, + "acc_norm": 0.4122905027932961, + "acc_norm_stderr": 0.016463200238114525 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.6176470588235294, + "acc_stderr": 0.02782610930728369, + "acc_norm": 0.6176470588235294, + "acc_norm_stderr": 0.02782610930728369 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.6752411575562701, + "acc_stderr": 0.02659678228769704, + "acc_norm": 0.6752411575562701, + "acc_norm_stderr": 0.02659678228769704 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.6790123456790124, + "acc_stderr": 0.02597656601086274, + "acc_norm": 0.6790123456790124, + "acc_norm_stderr": 0.02597656601086274 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.4432624113475177, + "acc_stderr": 0.029634838473766002, + "acc_norm": 0.4432624113475177, + "acc_norm_stderr": 0.029634838473766002 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.44654498044328556, + "acc_stderr": 0.012697046024399685, + "acc_norm": 0.44654498044328556, + "acc_norm_stderr": 0.012697046024399685 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.5698529411764706, + "acc_stderr": 0.030074971917302875, + "acc_norm": 0.5698529411764706, + "acc_norm_stderr": 0.030074971917302875 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.6143790849673203, + "acc_stderr": 0.01969145905235404, + "acc_norm": 0.6143790849673203, + "acc_norm_stderr": 0.01969145905235404 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.7, + "acc_stderr": 0.04389311454644287, + "acc_norm": 0.7, + "acc_norm_stderr": 0.04389311454644287 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.6612244897959184, + "acc_stderr": 0.030299506562154185, + "acc_norm": 0.6612244897959184, + "acc_norm_stderr": 0.030299506562154185 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 
0.7810945273631841, + "acc_stderr": 0.029239174636647, + "acc_norm": 0.7810945273631841, + "acc_norm_stderr": 0.029239174636647 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.84, + "acc_stderr": 0.03684529491774708, + "acc_norm": 0.84, + "acc_norm_stderr": 0.03684529491774708 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.4939759036144578, + "acc_stderr": 0.03892212195333045, + "acc_norm": 0.4939759036144578, + "acc_norm_stderr": 0.03892212195333045 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.7953216374269005, + "acc_stderr": 0.030944459778533193, + "acc_norm": 0.7953216374269005, + "acc_norm_stderr": 0.030944459778533193 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.3268053855569155, + "mc1_stderr": 0.01641987473113503, + "mc2": 0.502092078787114, + "mc2_stderr": 0.014933048490660556 + }, + "all": { + "acc": 0.5907405806920975, + "acc_stderr": 0.03395780996928073, + "acc_norm": 0.5946770958619642, + "acc_norm_stderr": 0.03393495580170161, + "mc1": 0.3268053855569155, + "mc1_stderr": 0.01641987473113503, + "mc2": 0.502092078787114, + "mc2_stderr": 0.014933048490660556 + } + }, + "versions": { + "harness|arc:challenge|25": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + 
"harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "all": 0 + }, + "config_general": { + "model_name": "camel-ai/CAMEL-33B-Combined-Data", + "model_sha": "62c74e7531625c1383bbbdc7c8346a996e9d1e21", + "model_dtype": "torch.float16", + "lighteval_sha": "03c2fad20ff7f5334c33cfee459024b8d7e4a109", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null + }, + "config_tasks": { + "harness|arc:challenge": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + 
"harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task" + }, + "summary_tasks": { + "harness|arc:challenge|25": { + "hashes": { + "hash_examples": "17b0cae357c0259e", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "61571bf68d6d89aa", + "hash_cont_tokens": "8210decc6ff6f7df" + }, + "truncated": 0, + "non-truncated": 4687, + "padded": 4687, + "non-padded": 0, + "effective_few_shots": 25.0, + "num_truncated_few_shots": 0 + }, + "harness|hellaswag|10": { + "hashes": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "29906669b1c7054a", + "hash_cont_tokens": "b3b9e9017afa63af" + }, + "truncated": 0, + "non-truncated": 40168, + "padded": 40113, + "non-padded": 55, + "effective_few_shots": 10.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hashes": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "c54ff61ad0273dd7", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-anatomy|5": { + "hashes": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "be31a1e22aef5f90", + "hash_cont_tokens": "f11971a765cb609f" + }, + "truncated": 0, + "non-truncated": 540, + "padded": 540, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-astronomy|5": { + "hashes": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "277a7b1fad566940", + "hash_cont_tokens": "bf30e5d3f48250cb" + }, + "truncated": 0, + "non-truncated": 608, + "padded": 608, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-business_ethics|5": { + "hashes": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "ba552605bc116de5", + "hash_cont_tokens": "bc1dd9b2d995eb61" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hashes": { + "hash_examples": "f3366dbe7eefffa4", + 
"hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "428c7563d0b98ab9", + "hash_cont_tokens": "890a119624b3b935" + }, + "truncated": 0, + "non-truncated": 1060, + "padded": 1060, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_biology|5": { + "hashes": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "da036601573942e2", + "hash_cont_tokens": "875cde3af7a0ee14" + }, + "truncated": 0, + "non-truncated": 576, + "padded": 576, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_chemistry|5": { + "hashes": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "94e0196d6aded13d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_computer_science|5": { + "hashes": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "6e4d0f4a8d36690b", + "hash_cont_tokens": "ffc0fe414cdc4a83" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_mathematics|5": { + "hashes": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "614054d17109a25d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_medicine|5": { + "hashes": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "1d633b3cc0524ba8", + "hash_cont_tokens": "1f88b00d41957d82" + }, + "truncated": 0, + "non-truncated": 692, + "padded": 692, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_physics|5": { + "hashes": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "5421d9a1af86cbd4", + "hash_cont_tokens": "f7b8097afc16a47c" + }, + "truncated": 0, + "non-truncated": 408, + "padded": 408, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-computer_security|5": { + "hashes": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "5e6b70ecb333cf18", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hashes": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "c2ef11a87264ceed", + "hash_cont_tokens": "aa0e8bc655f2f641" + }, + "truncated": 0, + "non-truncated": 940, + "padded": 940, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-econometrics|5": { + "hashes": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "ecaccd912a4c3978", + "hash_cont_tokens": "bfb7e3c3c88313f1" + }, + "truncated": 0, + 
"non-truncated": 456, + "padded": 456, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hashes": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "1590c84291399be8", + "hash_cont_tokens": "2425a3f084a591ef" + }, + "truncated": 0, + "non-truncated": 580, + "padded": 580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hashes": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "3269597f715b0da1", + "hash_cont_tokens": "f52691aef15a407b" + }, + "truncated": 0, + "non-truncated": 1512, + "padded": 1512, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-formal_logic|5": { + "hashes": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "a2800d20f3ab8d7c", + "hash_cont_tokens": "f515d598d9c21263" + }, + "truncated": 0, + "non-truncated": 504, + "padded": 504, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-global_facts|5": { + "hashes": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "94ed44b3772505ad", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_biology|5": { + "hashes": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "24423acb928db768", + "hash_cont_tokens": "bd85a4156a3613ee" + }, + "truncated": 0, + "non-truncated": 1240, + "padded": 1240, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hashes": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "831ff35c474e5cef", + "hash_cont_tokens": "a95c97af1c14e068" + }, + "truncated": 0, + "non-truncated": 812, + "padded": 812, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hashes": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "8c34e0f2bda77358", + "hash_cont_tokens": "8abfedef914e33c9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hashes": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "f1f73dd687da18d7", + "hash_cont_tokens": "674fc454bdc5ac93" + }, + "truncated": 660, + "non-truncated": 0, + "padded": 0, + "non-padded": 660, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_geography|5": { + "hashes": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "7c5547c7da5bc793", + "hash_cont_tokens": "03a5012b916274ea" + }, + "truncated": 0, + "non-truncated": 792, + "padded": 792, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 
0 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hashes": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "f62991cb6a496b05", + "hash_cont_tokens": "a83effb8f76b7d7c" + }, + "truncated": 0, + "non-truncated": 772, + "padded": 772, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hashes": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "4cef2aff6e3d59ed", + "hash_cont_tokens": "c583432ad27fcfe0" + }, + "truncated": 0, + "non-truncated": 1560, + "padded": 1560, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hashes": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "6e2577ea4082ed2b", + "hash_cont_tokens": "24f5dc613660300b" + }, + "truncated": 0, + "non-truncated": 1080, + "padded": 1080, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hashes": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "c5fc9aeb1079c8e4", + "hash_cont_tokens": "f47f041de50333b9" + }, + "truncated": 0, + "non-truncated": 952, + "padded": 952, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_physics|5": { + "hashes": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "555fc385cffa84ca", + "hash_cont_tokens": "ba2efcd283e938cc" + }, + "truncated": 0, + "non-truncated": 604, + "padded": 604, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hashes": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "febd23cbf9973b7f", + "hash_cont_tokens": "942069cd363844d9" + }, + "truncated": 0, + "non-truncated": 2180, + "padded": 2180, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hashes": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "424b02981230ee83", + "hash_cont_tokens": "955ed42b6f7fa019" + }, + "truncated": 0, + "non-truncated": 864, + "padded": 864, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hashes": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "50c9ff438c85a69e", + "hash_cont_tokens": "cdd0b3dc06d933e5" + }, + "truncated": 816, + "non-truncated": 0, + "padded": 0, + "non-padded": 816, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hashes": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "054824cc474caef5", + "hash_cont_tokens": "9a864184946033ac" + }, + "truncated": 8, + "non-truncated": 940, + "padded": 940, + "non-padded": 8, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_aging|5": { + "hashes": { + 
"hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "541a75f071dcf579", + "hash_cont_tokens": "142a4a8a1138a214" + }, + "truncated": 0, + "non-truncated": 892, + "padded": 892, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_sexuality|5": { + "hashes": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "04269e5c5a257dd9", + "hash_cont_tokens": "bc54813e809b796d" + }, + "truncated": 0, + "non-truncated": 524, + "padded": 524, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-international_law|5": { + "hashes": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "d93ba9d9d38e4397", + "hash_cont_tokens": "dc45b45fcda18e5d" + }, + "truncated": 0, + "non-truncated": 484, + "padded": 484, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-jurisprudence|5": { + "hashes": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "9eeaccd2698b4f5a", + "hash_cont_tokens": "e3a8cd951b6e3469" + }, + "truncated": 0, + "non-truncated": 432, + "padded": 432, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hashes": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "b4f08f544f2b7576", + "hash_cont_tokens": "1e80dbd30f6453d5" + }, + "truncated": 0, + "non-truncated": 652, + "padded": 648, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-machine_learning|5": { + "hashes": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "900c2a51f1174b9f", + "hash_cont_tokens": "9b37da7777378ca9" + }, + "truncated": 0, + "non-truncated": 448, + "padded": 448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-management|5": { + "hashes": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "6b36efb4689c6eca", + "hash_cont_tokens": "a01d6d39a83c4597" + }, + "truncated": 0, + "non-truncated": 412, + "padded": 412, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-marketing|5": { + "hashes": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "2aaac78a0cfed47a", + "hash_cont_tokens": "6aeaed4d823c98aa" + }, + "truncated": 0, + "non-truncated": 936, + "padded": 936, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-medical_genetics|5": { + "hashes": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "886ca823b41c094a", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-miscellaneous|5": { + "hashes": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "72fd71de7675e7d0", + "hash_cont_tokens": "9b0ab02a64603081" + }, + 
"truncated": 0, + "non-truncated": 3132, + "padded": 3132, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_disputes|5": { + "hashes": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "f3ca0dd8e7a1eb09", + "hash_cont_tokens": "8badf768f7b0467a" + }, + "truncated": 0, + "non-truncated": 1384, + "padded": 1354, + "non-padded": 30, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hashes": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "3e793631e951f23c", + "hash_cont_tokens": "32ae620376b2bbba" + }, + "truncated": 0, + "non-truncated": 3580, + "padded": 3580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-nutrition|5": { + "hashes": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "59753c2144ea93af", + "hash_cont_tokens": "3071def75bacc404" + }, + "truncated": 0, + "non-truncated": 1224, + "padded": 1224, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-philosophy|5": { + "hashes": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "bd8d3dbed15a8c34", + "hash_cont_tokens": "9f6ff69d23a48783" + }, + "truncated": 0, + "non-truncated": 1244, + "padded": 1244, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-prehistory|5": { + "hashes": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "3573cd87facbb7c5", + "hash_cont_tokens": "de469d2b981e32a3" + }, + "truncated": 0, + "non-truncated": 1296, + "padded": 1296, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_accounting|5": { + "hashes": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "17e721bc1a7cbb47", + "hash_cont_tokens": "c46f74d2dfc7b13b" + }, + "truncated": 0, + "non-truncated": 1128, + "padded": 1128, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_law|5": { + "hashes": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "9178e10bd0763ec4", + "hash_cont_tokens": "2e590029ef41fbcd" + }, + "truncated": 604, + "non-truncated": 5532, + "padded": 5524, + "non-padded": 612, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_medicine|5": { + "hashes": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "f5a22012a54f70ea", + "hash_cont_tokens": "fe35cfa9c6ca802e" + }, + "truncated": 0, + "non-truncated": 1088, + "padded": 1088, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_psychology|5": { + "hashes": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "0dfb73a8eb3f692c", + "hash_cont_tokens": "f020fbddf72c8652" + }, + "truncated": 0, + "non-truncated": 2448, + "padded": 2448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + 
}, + "harness|hendrycksTest-public_relations|5": { + "hashes": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "1710c6ba4c9f3cbd", + "hash_cont_tokens": "568f585a259965c1" + }, + "truncated": 0, + "non-truncated": 440, + "padded": 440, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-security_studies|5": { + "hashes": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "d49711415961ced7", + "hash_cont_tokens": "cc6fd7cccd64cd5d" + }, + "truncated": 0, + "non-truncated": 980, + "padded": 980, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-sociology|5": { + "hashes": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "828999f7624cbe7e", + "hash_cont_tokens": "c3a3bdfd177eed5b" + }, + "truncated": 0, + "non-truncated": 804, + "padded": 804, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hashes": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "42054621e718dbee", + "hash_cont_tokens": "2568d0e8e36fa959" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-virology|5": { + "hashes": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "6c4f0aa4dc859c04", + "hash_cont_tokens": "926cf60b0891f374" + }, + "truncated": 0, + "non-truncated": 664, + "padded": 664, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-world_religions|5": { + "hashes": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "6c75d44e092ff24f", + "hash_cont_tokens": "c525a5de974c1ea3" + }, + "truncated": 0, + "non-truncated": 684, + "padded": 684, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|truthfulqa:mc|0": { + "hashes": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "2738d7ed7075faa7", + "hash_cont_tokens": "c014154380b74b9e" + }, + "truncated": 0, + "non-truncated": 9996, + "padded": 9996, + "non-padded": 0, + "effective_few_shots": 0.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "d84d18e9a963753d", + "hash_full_prompts": "12b540783521a8e6", + "hash_input_tokens": "6fecf578c508db6a", + "hash_cont_tokens": "fb1646e2bdd5fc38" + }, + "total_evaluation_time_secondes": "8211.982079744339", + "truncated": 2088, + "non-truncated": 108931, + "padded": 108834, + "non-padded": 2185, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/camel-ai/CAMEL-33B-Combined-Data/results_2023-09-17T14-06-04.717229.json b/eval-results/camel-ai/CAMEL-33B-Combined-Data/results_2023-09-17T14-06-04.717229.json new file mode 100644 index 0000000000000000000000000000000000000000..57388273a57afbd5fb0f5661643da87a8d7d3751 --- /dev/null +++ b/eval-results/camel-ai/CAMEL-33B-Combined-Data/results_2023-09-17T14-06-04.717229.json @@ -0,0 +1,107 @@ +{ + "config_general": { + "model_name": "camel-ai/CAMEL-33B-Combined-Data", + "model_sha": 
"62c74e7531625c1383bbbdc7c8346a996e9d1e21", + "model_size": "60.65 GB", + "model_dtype": "torch.float16", + "lighteval_sha": "0f318ecf002208468154899217b3ba7c6ae09374", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|drop|3": { + "em": 0.004404362416107382, + "em_stderr": 0.0006781451620479537, + "f1": 0.07118393456375847, + "f1_stderr": 0.001525704115056517 + }, + "harness|gsm8k|5": { + "acc": 0.14101592115238817, + "acc_stderr": 0.009586695349244102 + }, + "harness|winogrande|5": { + "acc": 0.7829518547750592, + "acc_stderr": 0.01158587171020941 + }, + "all": { + "em": 0.004404362416107382, + "em_stderr": 0.0006781451620479537, + "f1": 0.07118393456375847, + "f1_stderr": 0.001525704115056517, + "acc": 0.4619838879637237, + "acc_stderr": 0.010586283529726756 + } + }, + "versions": { + "harness|drop|3": 1, + "harness|gsm8k|5": 0, + "harness|winogrande|5": 0, + "all": 0 + }, + "config_tasks": { + "harness|drop": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|drop|3": { + "hashes": { + "hash_examples": "1d27416e8324e9a3", + "hash_full_prompts": "a5513ff9a741b385", + "hash_input_tokens": "61b608e0b5ceed76", + "hash_cont_tokens": "723ec113f6ad031e" + }, + "truncated": 1263, + "non-truncated": 8273, + "padded": 0, + "non-padded": 9536, + "effective_few_shots": 3.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "bda342e47b5099b2", + "hash_cont_tokens": "0d4f013a1409c480" + }, + "truncated": 0, + "non-truncated": 1319, + "padded": 0, + "non-padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "c0bedf98cb040854", + "hash_cont_tokens": "f08975ad6f2d5864" + }, + "truncated": 0, + "non-truncated": 2534, + "padded": 2432, + "non-padded": 102, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "9b4d8993161e637d", + "hash_full_prompts": "08215e527b7e60a5", + "hash_input_tokens": "80afe720f936f8d2", + "hash_cont_tokens": "43dd755ba6b68fb6" + }, + "total_evaluation_time_secondes": "20885.882546663284", + "truncated": 1263, + "non-truncated": 12126, + "padded": 2432, + "non-padded": 10957, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/chaoyi-wu/MedLLaMA_13B/results_2023-07-24T13-04-01.266274.json b/eval-results/chaoyi-wu/MedLLaMA_13B/results_2023-07-24T13-04-01.266274.json new file mode 100644 index 0000000000000000000000000000000000000000..4f94a708885d16531b7beb8aaf9cc90060fea83d --- /dev/null +++ b/eval-results/chaoyi-wu/MedLLaMA_13B/results_2023-07-24T13-04-01.266274.json @@ -0,0 +1,1365 @@ +{ + "results": { + "harness|arc:challenge|25": { + "acc": 0.5102389078498294, + "acc_stderr": 0.014608326906285012, + "acc_norm": 0.5426621160409556, + "acc_norm_stderr": 0.014558106543924065 + }, + "harness|hellaswag|10": { + "acc": 0.5862378012348137, + "acc_stderr": 0.004915003499517829, + "acc_norm": 0.7853017327225652, + "acc_norm_stderr": 0.004097736838432052 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.31, + "acc_stderr": 0.04648231987117316, + "acc_norm": 0.31, + 
"acc_norm_stderr": 0.04648231987117316 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.5259259259259259, + "acc_stderr": 0.04313531696750575, + "acc_norm": 0.5259259259259259, + "acc_norm_stderr": 0.04313531696750575 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.48026315789473684, + "acc_stderr": 0.040657710025626036, + "acc_norm": 0.48026315789473684, + "acc_norm_stderr": 0.040657710025626036 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.44, + "acc_stderr": 0.04988876515698589, + "acc_norm": 0.44, + "acc_norm_stderr": 0.04988876515698589 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.49056603773584906, + "acc_stderr": 0.0307673947078081, + "acc_norm": 0.49056603773584906, + "acc_norm_stderr": 0.0307673947078081 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.4791666666666667, + "acc_stderr": 0.041775789507399935, + "acc_norm": 0.4791666666666667, + "acc_norm_stderr": 0.041775789507399935 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.35, + "acc_stderr": 0.047937248544110196, + "acc_norm": 0.35, + "acc_norm_stderr": 0.047937248544110196 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.38, + "acc_stderr": 0.048783173121456316, + "acc_norm": 0.38, + "acc_norm_stderr": 0.048783173121456316 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.35, + "acc_stderr": 0.0479372485441102, + "acc_norm": 0.35, + "acc_norm_stderr": 0.0479372485441102 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.42196531791907516, + "acc_stderr": 0.03765746693865151, + "acc_norm": 0.42196531791907516, + "acc_norm_stderr": 0.03765746693865151 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.21568627450980393, + "acc_stderr": 0.04092563958237657, + "acc_norm": 0.21568627450980393, + "acc_norm_stderr": 0.04092563958237657 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.58, + "acc_stderr": 0.049604496374885836, + "acc_norm": 0.58, + "acc_norm_stderr": 0.049604496374885836 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.4, + "acc_stderr": 0.03202563076101737, + "acc_norm": 0.4, + "acc_norm_stderr": 0.03202563076101737 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.22807017543859648, + "acc_stderr": 0.03947152782669415, + "acc_norm": 0.22807017543859648, + "acc_norm_stderr": 0.03947152782669415 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.3793103448275862, + "acc_stderr": 0.04043461861916747, + "acc_norm": 0.3793103448275862, + "acc_norm_stderr": 0.04043461861916747 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.23809523809523808, + "acc_stderr": 0.021935878081184766, + "acc_norm": 0.23809523809523808, + "acc_norm_stderr": 0.021935878081184766 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.3333333333333333, + "acc_stderr": 0.04216370213557835, + "acc_norm": 0.3333333333333333, + "acc_norm_stderr": 0.04216370213557835 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.35, + "acc_stderr": 0.047937248544110196, + "acc_norm": 0.35, + "acc_norm_stderr": 0.047937248544110196 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.5129032258064516, + "acc_stderr": 0.028434533152681855, + "acc_norm": 0.5129032258064516, + "acc_norm_stderr": 0.028434533152681855 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.28078817733990147, + "acc_stderr": 0.0316185633535861, + "acc_norm": 0.28078817733990147, + "acc_norm_stderr": 
0.0316185633535861 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.44, + "acc_stderr": 0.04988876515698589, + "acc_norm": 0.44, + "acc_norm_stderr": 0.04988876515698589 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.5757575757575758, + "acc_stderr": 0.038592681420702636, + "acc_norm": 0.5757575757575758, + "acc_norm_stderr": 0.038592681420702636 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.5151515151515151, + "acc_stderr": 0.03560716516531061, + "acc_norm": 0.5151515151515151, + "acc_norm_stderr": 0.03560716516531061 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.6580310880829016, + "acc_stderr": 0.03423465100104283, + "acc_norm": 0.6580310880829016, + "acc_norm_stderr": 0.03423465100104283 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.43846153846153846, + "acc_stderr": 0.025158266016868575, + "acc_norm": 0.43846153846153846, + "acc_norm_stderr": 0.025158266016868575 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.2962962962962963, + "acc_stderr": 0.027840811495871927, + "acc_norm": 0.2962962962962963, + "acc_norm_stderr": 0.027840811495871927 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.44537815126050423, + "acc_stderr": 0.0322841062671639, + "acc_norm": 0.44537815126050423, + "acc_norm_stderr": 0.0322841062671639 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.33774834437086093, + "acc_stderr": 0.038615575462551684, + "acc_norm": 0.33774834437086093, + "acc_norm_stderr": 0.038615575462551684 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.5871559633027523, + "acc_stderr": 0.021109128133413913, + "acc_norm": 0.5871559633027523, + "acc_norm_stderr": 0.021109128133413913 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.3472222222222222, + "acc_stderr": 0.032468872436376486, + "acc_norm": 0.3472222222222222, + "acc_norm_stderr": 0.032468872436376486 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.5294117647058824, + "acc_stderr": 0.03503235296367992, + "acc_norm": 0.5294117647058824, + "acc_norm_stderr": 0.03503235296367992 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.6244725738396625, + "acc_stderr": 0.03152256243091156, + "acc_norm": 0.6244725738396625, + "acc_norm_stderr": 0.03152256243091156 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.5291479820627802, + "acc_stderr": 0.03350073248773404, + "acc_norm": 0.5291479820627802, + "acc_norm_stderr": 0.03350073248773404 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.5343511450381679, + "acc_stderr": 0.043749285605997376, + "acc_norm": 0.5343511450381679, + "acc_norm_stderr": 0.043749285605997376 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.6528925619834711, + "acc_stderr": 0.04345724570292534, + "acc_norm": 0.6528925619834711, + "acc_norm_stderr": 0.04345724570292534 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.49074074074074076, + "acc_stderr": 0.04832853553437055, + "acc_norm": 0.49074074074074076, + "acc_norm_stderr": 0.04832853553437055 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.4294478527607362, + "acc_stderr": 0.03889066619112722, + "acc_norm": 0.4294478527607362, + "acc_norm_stderr": 0.03889066619112722 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.36607142857142855, + "acc_stderr": 0.045723723587374296, + "acc_norm": 
0.36607142857142855, + "acc_norm_stderr": 0.045723723587374296 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.5922330097087378, + "acc_stderr": 0.0486577757041077, + "acc_norm": 0.5922330097087378, + "acc_norm_stderr": 0.0486577757041077 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.6495726495726496, + "acc_stderr": 0.0312561082442188, + "acc_norm": 0.6495726495726496, + "acc_norm_stderr": 0.0312561082442188 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.52, + "acc_stderr": 0.050211673156867795, + "acc_norm": 0.52, + "acc_norm_stderr": 0.050211673156867795 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.6206896551724138, + "acc_stderr": 0.01735126811754445, + "acc_norm": 0.6206896551724138, + "acc_norm_stderr": 0.01735126811754445 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.5028901734104047, + "acc_stderr": 0.02691864538323901, + "acc_norm": 0.5028901734104047, + "acc_norm_stderr": 0.02691864538323901 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.2558659217877095, + "acc_stderr": 0.014593620923210756, + "acc_norm": 0.2558659217877095, + "acc_norm_stderr": 0.014593620923210756 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.545751633986928, + "acc_stderr": 0.028509807802626592, + "acc_norm": 0.545751633986928, + "acc_norm_stderr": 0.028509807802626592 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.49517684887459806, + "acc_stderr": 0.028396770444111298, + "acc_norm": 0.49517684887459806, + "acc_norm_stderr": 0.028396770444111298 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.5030864197530864, + "acc_stderr": 0.027820214158594377, + "acc_norm": 0.5030864197530864, + "acc_norm_stderr": 0.027820214158594377 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.3546099290780142, + "acc_stderr": 0.028538650028878638, + "acc_norm": 0.3546099290780142, + "acc_norm_stderr": 0.028538650028878638 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.3324641460234681, + "acc_stderr": 0.01203202233226051, + "acc_norm": 0.3324641460234681, + "acc_norm_stderr": 0.01203202233226051 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.5257352941176471, + "acc_stderr": 0.03033257809455502, + "acc_norm": 0.5257352941176471, + "acc_norm_stderr": 0.03033257809455502 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.46895424836601307, + "acc_stderr": 0.020188804456361887, + "acc_norm": 0.46895424836601307, + "acc_norm_stderr": 0.020188804456361887 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.5636363636363636, + "acc_stderr": 0.04750185058907296, + "acc_norm": 0.5636363636363636, + "acc_norm_stderr": 0.04750185058907296 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.5387755102040817, + "acc_stderr": 0.031912820526692774, + "acc_norm": 0.5387755102040817, + "acc_norm_stderr": 0.031912820526692774 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.6318407960199005, + "acc_stderr": 0.03410410565495302, + "acc_norm": 0.6318407960199005, + "acc_norm_stderr": 0.03410410565495302 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.72, + "acc_stderr": 0.04512608598542127, + "acc_norm": 0.72, + "acc_norm_stderr": 0.04512608598542127 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.42771084337349397, + "acc_stderr": 0.038515976837185335, + "acc_norm": 0.42771084337349397, + "acc_norm_stderr": 0.038515976837185335 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 
0.6549707602339181, + "acc_stderr": 0.03645981377388806, + "acc_norm": 0.6549707602339181, + "acc_norm_stderr": 0.03645981377388806 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.2582619339045288, + "mc1_stderr": 0.0153218216884762, + "mc2": 0.4053787386286284, + "mc2_stderr": 0.013893490031868357 + }, + "all": { + "acc": 0.46685175478824187, + "acc_stderr": 0.03531409019484935, + "acc_norm": 0.47077526563025673, + "acc_norm_stderr": 0.035299387024960424, + "mc1": 0.2582619339045288, + "mc1_stderr": 0.0153218216884762, + "mc2": 0.4053787386286284, + "mc2_stderr": 0.013893490031868357 + } + }, + "versions": { + "harness|arc:challenge|25": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + 
"harness|truthfulqa:mc|0": 1, + "all": 0 + }, + "config_general": { + "model_name": "chaoyi-wu/MedLLaMA_13B", + "model_sha": "893557ef32f98cd01deb1c5d063be6d640ffa657", + "model_dtype": "torch.float16", + "lighteval_sha": "03c2fad20ff7f5334c33cfee459024b8d7e4a109", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null + }, + "config_tasks": { + "harness|arc:challenge": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + 
"harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task" + }, + "summary_tasks": { + "harness|arc:challenge|25": { + "hashes": { + "hash_examples": "17b0cae357c0259e", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "2b0e07d4cdd3b0fe", + "hash_cont_tokens": "52204555b6e39a6e" + }, + "truncated": 0, + "non-truncated": 4687, + "padded": 4687, + "non-padded": 0, + "effective_few_shots": 25.0, + "num_truncated_few_shots": 0 + }, + "harness|hellaswag|10": { + "hashes": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "578edd77107cb2c3", + "hash_cont_tokens": "25c49737526d9f80" + }, + "truncated": 0, + "non-truncated": 40168, + "padded": 40113, + "non-padded": 55, + "effective_few_shots": 10.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hashes": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "6a95a1511f8da075", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-anatomy|5": { + "hashes": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "24a78edc4d9a93aa", + "hash_cont_tokens": "f11971a765cb609f" + }, + "truncated": 0, + "non-truncated": 540, + "padded": 540, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-astronomy|5": { + "hashes": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "b11106668d6c0974", + "hash_cont_tokens": "ebed26cf74a85815" + }, + "truncated": 0, + "non-truncated": 608, + "padded": 608, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-business_ethics|5": { + "hashes": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "10180ba12a075cb0", + "hash_cont_tokens": "6898ac348a7ae442" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hashes": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "73351ef4968750a2", + "hash_cont_tokens": "34a058958a45af94" + }, + "truncated": 0, + "non-truncated": 1060, + "padded": 1060, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_biology|5": { + "hashes": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "a539150af234c668", + "hash_cont_tokens": "875cde3af7a0ee14" + }, + "truncated": 0, + 
"non-truncated": 576, + "padded": 576, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_chemistry|5": { + "hashes": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "52e12e5a43bcee35", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_computer_science|5": { + "hashes": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "d1f3721a5659f7ee", + "hash_cont_tokens": "da408cb12ab08288" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_mathematics|5": { + "hashes": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "f2d78f546b5595c2", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_medicine|5": { + "hashes": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "c9cc19179f63d1d6", + "hash_cont_tokens": "370a1a0ab68d15cd" + }, + "truncated": 0, + "non-truncated": 692, + "padded": 692, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_physics|5": { + "hashes": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "5046144e67e992e8", + "hash_cont_tokens": "f7b8097afc16a47c" + }, + "truncated": 0, + "non-truncated": 408, + "padded": 408, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-computer_security|5": { + "hashes": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "4b14581ba4fc06fc", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hashes": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "1ee52c413b5b4cc4", + "hash_cont_tokens": "aa0e8bc655f2f641" + }, + "truncated": 0, + "non-truncated": 940, + "padded": 940, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-econometrics|5": { + "hashes": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "2914077c4dd3090a", + "hash_cont_tokens": "80dea4d59245cf01" + }, + "truncated": 0, + "non-truncated": 456, + "padded": 456, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hashes": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "0f88a874342378de", + "hash_cont_tokens": "2425a3f084a591ef" + }, + "truncated": 0, + "non-truncated": 580, + "padded": 580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + 
"harness|hendrycksTest-elementary_mathematics|5": { + "hashes": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "9889933f1dd02a23", + "hash_cont_tokens": "309bef1803097408" + }, + "truncated": 0, + "non-truncated": 1512, + "padded": 1512, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-formal_logic|5": { + "hashes": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "dc309a94c4bfdd2f", + "hash_cont_tokens": "5105a3bd1b39b785" + }, + "truncated": 0, + "non-truncated": 504, + "padded": 504, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-global_facts|5": { + "hashes": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "0801a0aebec3ba8c", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_biology|5": { + "hashes": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "5bc4aca8831d9c05", + "hash_cont_tokens": "205c5deee1581b02" + }, + "truncated": 0, + "non-truncated": 1240, + "padded": 1240, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hashes": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "b92bd6b06fc3464c", + "hash_cont_tokens": "272d28867e0ff046" + }, + "truncated": 0, + "non-truncated": 812, + "padded": 812, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hashes": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "a549346cde8165e9", + "hash_cont_tokens": "98b3bf311aa83f0d" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hashes": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "f1f73dd687da18d7", + "hash_cont_tokens": "674fc454bdc5ac93" + }, + "truncated": 660, + "non-truncated": 0, + "padded": 0, + "non-padded": 660, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_geography|5": { + "hashes": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "e7e9cf91f9d6a081", + "hash_cont_tokens": "03a5012b916274ea" + }, + "truncated": 0, + "non-truncated": 792, + "padded": 792, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hashes": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "a61a1670f854d9e1", + "hash_cont_tokens": "d9e66fc7c702b795" + }, + "truncated": 0, + "non-truncated": 772, + "padded": 772, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hashes": { + "hash_examples": 
"59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "8a77cb7763f28110", + "hash_cont_tokens": "c583432ad27fcfe0" + }, + "truncated": 0, + "non-truncated": 1560, + "padded": 1560, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hashes": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "fcfcfae391f8faa1", + "hash_cont_tokens": "d4b1936084c060e0" + }, + "truncated": 0, + "non-truncated": 1080, + "padded": 1080, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hashes": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "a29454cc1feb23ef", + "hash_cont_tokens": "f47f041de50333b9" + }, + "truncated": 0, + "non-truncated": 952, + "padded": 952, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_physics|5": { + "hashes": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "b6734a25556d75dc", + "hash_cont_tokens": "2bf9921a39e901d9" + }, + "truncated": 0, + "non-truncated": 604, + "padded": 604, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hashes": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "5720438e29473426", + "hash_cont_tokens": "cab8b16be9576360" + }, + "truncated": 0, + "non-truncated": 2180, + "padded": 2180, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hashes": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "486321d5858de240", + "hash_cont_tokens": "1c34fbe5a59f1ed1" + }, + "truncated": 0, + "non-truncated": 864, + "padded": 864, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hashes": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "50c9ff438c85a69e", + "hash_cont_tokens": "cdd0b3dc06d933e5" + }, + "truncated": 816, + "non-truncated": 0, + "padded": 0, + "non-padded": 816, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hashes": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "473919e64d1b8c80", + "hash_cont_tokens": "ebd714885a59ef55" + }, + "truncated": 8, + "non-truncated": 940, + "padded": 940, + "non-padded": 8, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_aging|5": { + "hashes": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "47a65c81fd7ed010", + "hash_cont_tokens": "142a4a8a1138a214" + }, + "truncated": 0, + "non-truncated": 892, + "padded": 892, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_sexuality|5": { + "hashes": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "aedfcd41cbd2fcc9", + 
"hash_cont_tokens": "bc54813e809b796d" + }, + "truncated": 0, + "non-truncated": 524, + "padded": 524, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-international_law|5": { + "hashes": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "ed5f2414144d7b72", + "hash_cont_tokens": "aac52fa6a519223b" + }, + "truncated": 0, + "non-truncated": 484, + "padded": 484, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-jurisprudence|5": { + "hashes": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "692eaacb5b747264", + "hash_cont_tokens": "e3a8cd951b6e3469" + }, + "truncated": 0, + "non-truncated": 432, + "padded": 432, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hashes": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "2cbce4edca937588", + "hash_cont_tokens": "697179a0dd47c5c0" + }, + "truncated": 0, + "non-truncated": 652, + "padded": 648, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-machine_learning|5": { + "hashes": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "c2f38b19bab1aa2c", + "hash_cont_tokens": "9b19898e3ecb527f" + }, + "truncated": 0, + "non-truncated": 448, + "padded": 448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-management|5": { + "hashes": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "fde277bc547bc3d8", + "hash_cont_tokens": "a01d6d39a83c4597" + }, + "truncated": 0, + "non-truncated": 412, + "padded": 412, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-marketing|5": { + "hashes": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "87b232bbebce39db", + "hash_cont_tokens": "6aeaed4d823c98aa" + }, + "truncated": 0, + "non-truncated": 936, + "padded": 936, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-medical_genetics|5": { + "hashes": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "58c21af9da3e126e", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-miscellaneous|5": { + "hashes": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "d1f5c770d368e9c6", + "hash_cont_tokens": "9b0ab02a64603081" + }, + "truncated": 0, + "non-truncated": 3132, + "padded": 3132, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_disputes|5": { + "hashes": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "98d6db15a50aaa8e", + "hash_cont_tokens": "1e30d7dedc7588c0" + }, + "truncated": 0, + "non-truncated": 1384, + "padded": 1354, + "non-padded": 30, + "effective_few_shots": 5.0, + 
"num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hashes": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "2aabd8c7337502f8", + "hash_cont_tokens": "ceee291786cbb123" + }, + "truncated": 0, + "non-truncated": 3580, + "padded": 3580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-nutrition|5": { + "hashes": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "17f8c8f2d4a0a9b1", + "hash_cont_tokens": "484df4c25a5460bd" + }, + "truncated": 0, + "non-truncated": 1224, + "padded": 1224, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-philosophy|5": { + "hashes": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "dfc6df491d991966", + "hash_cont_tokens": "9f6ff69d23a48783" + }, + "truncated": 0, + "non-truncated": 1244, + "padded": 1244, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-prehistory|5": { + "hashes": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "cffe8139e00da9dd", + "hash_cont_tokens": "85a9de6c685b7035" + }, + "truncated": 0, + "non-truncated": 1296, + "padded": 1296, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_accounting|5": { + "hashes": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "4a69ed6ee55918fb", + "hash_cont_tokens": "ad7b5a040535bdcf" + }, + "truncated": 0, + "non-truncated": 1128, + "padded": 1128, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_law|5": { + "hashes": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "6cc713f12b5890de", + "hash_cont_tokens": "2e590029ef41fbcd" + }, + "truncated": 604, + "non-truncated": 5532, + "padded": 5524, + "non-padded": 612, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_medicine|5": { + "hashes": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "b4044fc92756c377", + "hash_cont_tokens": "0b7b5aaef574dc78" + }, + "truncated": 0, + "non-truncated": 1088, + "padded": 1088, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_psychology|5": { + "hashes": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "b019784da8db089a", + "hash_cont_tokens": "63a651778e8d72d2" + }, + "truncated": 0, + "non-truncated": 2448, + "padded": 2448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-public_relations|5": { + "hashes": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "f47f37c7c9bfc601", + "hash_cont_tokens": "841583ab707b25d7" + }, + "truncated": 0, + "non-truncated": 440, + "padded": 440, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-security_studies|5": { + "hashes": { + "hash_examples": "62bb8197e63d60d4", + 
"hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "4d282718d6142410", + "hash_cont_tokens": "9c2c01d3214f66b8" + }, + "truncated": 0, + "non-truncated": 980, + "padded": 980, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-sociology|5": { + "hashes": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "fbc6026e500537bc", + "hash_cont_tokens": "c3a3bdfd177eed5b" + }, + "truncated": 0, + "non-truncated": 804, + "padded": 804, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hashes": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "150dd1ff81ff642e", + "hash_cont_tokens": "96353c5969a9028a" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-virology|5": { + "hashes": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "fcbac3e735545969", + "hash_cont_tokens": "a1f8901800ac9b68" + }, + "truncated": 0, + "non-truncated": 664, + "padded": 664, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-world_religions|5": { + "hashes": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "ffc962a38441ef13", + "hash_cont_tokens": "08c0be345e5f1c12" + }, + "truncated": 0, + "non-truncated": 684, + "padded": 684, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|truthfulqa:mc|0": { + "hashes": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "9ffb65d225ae550f", + "hash_cont_tokens": "16c760a491c6f26e" + }, + "truncated": 0, + "non-truncated": 9996, + "padded": 9996, + "non-padded": 0, + "effective_few_shots": 0.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "d84d18e9a963753d", + "hash_full_prompts": "12b540783521a8e6", + "hash_input_tokens": "1c61d6705b299f5c", + "hash_cont_tokens": "868d6f1055fbd51d" + }, + "total_evaluation_time_secondes": "3780.4133019447327", + "truncated": 2088, + "non-truncated": 108931, + "padded": 108834, + "non-padded": 2185, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/chavinlo/alpaca-native/results_2023-08-11T17-43-25.205082.json b/eval-results/chavinlo/alpaca-native/results_2023-08-11T17-43-25.205082.json new file mode 100644 index 0000000000000000000000000000000000000000..f9791422486bd657e5b2c5ef47cadcea562e58d4 --- /dev/null +++ b/eval-results/chavinlo/alpaca-native/results_2023-08-11T17-43-25.205082.json @@ -0,0 +1,1365 @@ +{ + "results": { + "harness|arc:challenge|25": { + "acc": 0.5136518771331058, + "acc_stderr": 0.014605943429860947, + "acc_norm": 0.523037542662116, + "acc_norm_stderr": 0.014595873205358264 + }, + "harness|hellaswag|10": { + "acc": 0.5966938856801434, + "acc_stderr": 0.004895586329401312, + "acc_norm": 0.7708623780123481, + "acc_norm_stderr": 0.0041941904060001055 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.28, + "acc_stderr": 0.04512608598542129, + "acc_norm": 0.28, + "acc_norm_stderr": 0.04512608598542129 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.4666666666666667, 
+ "acc_stderr": 0.043097329010363554, + "acc_norm": 0.4666666666666667, + "acc_norm_stderr": 0.043097329010363554 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.3618421052631579, + "acc_stderr": 0.03910525752849724, + "acc_norm": 0.3618421052631579, + "acc_norm_stderr": 0.03910525752849724 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.48, + "acc_stderr": 0.050211673156867795, + "acc_norm": 0.48, + "acc_norm_stderr": 0.050211673156867795 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.44528301886792454, + "acc_stderr": 0.030588052974270658, + "acc_norm": 0.44528301886792454, + "acc_norm_stderr": 0.030588052974270658 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.3958333333333333, + "acc_stderr": 0.04089465449325582, + "acc_norm": 0.3958333333333333, + "acc_norm_stderr": 0.04089465449325582 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.32, + "acc_stderr": 0.046882617226215034, + "acc_norm": 0.32, + "acc_norm_stderr": 0.046882617226215034 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.4, + "acc_stderr": 0.04923659639173309, + "acc_norm": 0.4, + "acc_norm_stderr": 0.04923659639173309 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.35, + "acc_stderr": 0.047937248544110196, + "acc_norm": 0.35, + "acc_norm_stderr": 0.047937248544110196 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.3872832369942196, + "acc_stderr": 0.03714325906302065, + "acc_norm": 0.3872832369942196, + "acc_norm_stderr": 0.03714325906302065 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.21568627450980393, + "acc_stderr": 0.04092563958237656, + "acc_norm": 0.21568627450980393, + "acc_norm_stderr": 0.04092563958237656 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.54, + "acc_stderr": 0.05009082659620332, + "acc_norm": 0.54, + "acc_norm_stderr": 0.05009082659620332 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.3702127659574468, + "acc_stderr": 0.03156564682236785, + "acc_norm": 0.3702127659574468, + "acc_norm_stderr": 0.03156564682236785 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.2719298245614035, + "acc_stderr": 0.04185774424022055, + "acc_norm": 0.2719298245614035, + "acc_norm_stderr": 0.04185774424022055 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.36551724137931035, + "acc_stderr": 0.04013124195424385, + "acc_norm": 0.36551724137931035, + "acc_norm_stderr": 0.04013124195424385 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.29365079365079366, + "acc_stderr": 0.023456037383982026, + "acc_norm": 0.29365079365079366, + "acc_norm_stderr": 0.023456037383982026 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.24603174603174602, + "acc_stderr": 0.03852273364924314, + "acc_norm": 0.24603174603174602, + "acc_norm_stderr": 0.03852273364924314 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.34, + "acc_stderr": 0.04760952285695236, + "acc_norm": 0.34, + "acc_norm_stderr": 0.04760952285695236 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.43548387096774194, + "acc_stderr": 0.028206225591502737, + "acc_norm": 0.43548387096774194, + "acc_norm_stderr": 0.028206225591502737 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.3054187192118227, + "acc_stderr": 0.03240661565868408, + "acc_norm": 0.3054187192118227, + "acc_norm_stderr": 0.03240661565868408 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.45, 
+ "acc_stderr": 0.05, + "acc_norm": 0.45, + "acc_norm_stderr": 0.05 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.5393939393939394, + "acc_stderr": 0.03892207016552012, + "acc_norm": 0.5393939393939394, + "acc_norm_stderr": 0.03892207016552012 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.4797979797979798, + "acc_stderr": 0.03559443565563919, + "acc_norm": 0.4797979797979798, + "acc_norm_stderr": 0.03559443565563919 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.616580310880829, + "acc_stderr": 0.03508984236295341, + "acc_norm": 0.616580310880829, + "acc_norm_stderr": 0.03508984236295341 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.38974358974358975, + "acc_stderr": 0.024726967886647078, + "acc_norm": 0.38974358974358975, + "acc_norm_stderr": 0.024726967886647078 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.2518518518518518, + "acc_stderr": 0.026466117538959916, + "acc_norm": 0.2518518518518518, + "acc_norm_stderr": 0.026466117538959916 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.35714285714285715, + "acc_stderr": 0.031124619309328177, + "acc_norm": 0.35714285714285715, + "acc_norm_stderr": 0.031124619309328177 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.31125827814569534, + "acc_stderr": 0.03780445850526733, + "acc_norm": 0.31125827814569534, + "acc_norm_stderr": 0.03780445850526733 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.5394495412844037, + "acc_stderr": 0.02137049460999509, + "acc_norm": 0.5394495412844037, + "acc_norm_stderr": 0.02137049460999509 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.37962962962962965, + "acc_stderr": 0.03309682581119035, + "acc_norm": 0.37962962962962965, + "acc_norm_stderr": 0.03309682581119035 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.5343137254901961, + "acc_stderr": 0.03501038327635897, + "acc_norm": 0.5343137254901961, + "acc_norm_stderr": 0.03501038327635897 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.5654008438818565, + "acc_stderr": 0.03226759995510145, + "acc_norm": 0.5654008438818565, + "acc_norm_stderr": 0.03226759995510145 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.5022421524663677, + "acc_stderr": 0.03355746535223263, + "acc_norm": 0.5022421524663677, + "acc_norm_stderr": 0.03355746535223263 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.42748091603053434, + "acc_stderr": 0.04338920305792401, + "acc_norm": 0.42748091603053434, + "acc_norm_stderr": 0.04338920305792401 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.5454545454545454, + "acc_stderr": 0.04545454545454548, + "acc_norm": 0.5454545454545454, + "acc_norm_stderr": 0.04545454545454548 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.4351851851851852, + "acc_stderr": 0.04792898170907062, + "acc_norm": 0.4351851851851852, + "acc_norm_stderr": 0.04792898170907062 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.4049079754601227, + "acc_stderr": 0.038566721635489125, + "acc_norm": 0.4049079754601227, + "acc_norm_stderr": 0.038566721635489125 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.35714285714285715, + "acc_stderr": 0.04547960999764376, + "acc_norm": 0.35714285714285715, + "acc_norm_stderr": 0.04547960999764376 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.47572815533980584, + 
"acc_stderr": 0.049449010929737795, + "acc_norm": 0.47572815533980584, + "acc_norm_stderr": 0.049449010929737795 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.6153846153846154, + "acc_stderr": 0.03187195347942466, + "acc_norm": 0.6153846153846154, + "acc_norm_stderr": 0.03187195347942466 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.43, + "acc_stderr": 0.04975698519562428, + "acc_norm": 0.43, + "acc_norm_stderr": 0.04975698519562428 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.5517241379310345, + "acc_stderr": 0.017784034534992433, + "acc_norm": 0.5517241379310345, + "acc_norm_stderr": 0.017784034534992433 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.430635838150289, + "acc_stderr": 0.026658800273672376, + "acc_norm": 0.430635838150289, + "acc_norm_stderr": 0.026658800273672376 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.2424581005586592, + "acc_stderr": 0.014333522059217889, + "acc_norm": 0.2424581005586592, + "acc_norm_stderr": 0.014333522059217889 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.4084967320261438, + "acc_stderr": 0.02814640599309636, + "acc_norm": 0.4084967320261438, + "acc_norm_stderr": 0.02814640599309636 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.4694533762057878, + "acc_stderr": 0.028345045864840684, + "acc_norm": 0.4694533762057878, + "acc_norm_stderr": 0.028345045864840684 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.4722222222222222, + "acc_stderr": 0.027777777777777804, + "acc_norm": 0.4722222222222222, + "acc_norm_stderr": 0.027777777777777804 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.32978723404255317, + "acc_stderr": 0.028045946942042405, + "acc_norm": 0.32978723404255317, + "acc_norm_stderr": 0.028045946942042405 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.3226857887874837, + "acc_stderr": 0.01194026419319598, + "acc_norm": 0.3226857887874837, + "acc_norm_stderr": 0.01194026419319598 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.39705882352941174, + "acc_stderr": 0.02972215209928006, + "acc_norm": 0.39705882352941174, + "acc_norm_stderr": 0.02972215209928006 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.3790849673202614, + "acc_stderr": 0.019627444748412232, + "acc_norm": 0.3790849673202614, + "acc_norm_stderr": 0.019627444748412232 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.44545454545454544, + "acc_stderr": 0.047605488214603246, + "acc_norm": 0.44545454545454544, + "acc_norm_stderr": 0.047605488214603246 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.39591836734693875, + "acc_stderr": 0.03130802899065686, + "acc_norm": 0.39591836734693875, + "acc_norm_stderr": 0.03130802899065686 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.472636815920398, + "acc_stderr": 0.03530235517334682, + "acc_norm": 0.472636815920398, + "acc_norm_stderr": 0.03530235517334682 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.58, + "acc_stderr": 0.049604496374885836, + "acc_norm": 0.58, + "acc_norm_stderr": 0.049604496374885836 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.40963855421686746, + "acc_stderr": 0.03828401115079023, + "acc_norm": 0.40963855421686746, + "acc_norm_stderr": 0.03828401115079023 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.5263157894736842, + "acc_stderr": 0.03829509868994727, + "acc_norm": 0.5263157894736842, + "acc_norm_stderr": 0.03829509868994727 + }, + 
"harness|truthfulqa:mc|0": { + "mc1": 0.2460220318237454, + "mc1_stderr": 0.015077219200662595, + "mc2": 0.3757902546315026, + "mc2_stderr": 0.015396830401557888 + }, + "all": { + "acc": 0.42072619666274175, + "acc_stderr": 0.035325520397257726, + "acc_norm": 0.42383728408411825, + "acc_norm_stderr": 0.03531346164898817, + "mc1": 0.2460220318237454, + "mc1_stderr": 0.015077219200662595, + "mc2": 0.3757902546315026, + "mc2_stderr": 0.015396830401557888 + } + }, + "versions": { + "harness|arc:challenge|25": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "all": 0 + }, + "config_general": { + "model_name": "chavinlo/alpaca-native", + "model_sha": 
"cc7773cac2478231807c56ef2f02292d98f85cf5", + "model_dtype": "torch.float16", + "lighteval_sha": "efe93333f9f25e7d48cc67a6bf362e6d576f727b", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null + }, + "config_tasks": { + "harness|arc:challenge": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + 
"harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task" + }, + "summary_tasks": { + "harness|arc:challenge|25": { + "hashes": { + "hash_examples": "17b0cae357c0259e", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "2b0e07d4cdd3b0fe", + "hash_cont_tokens": "ede2b335438f08e9" + }, + "truncated": 0, + "non-truncated": 4687, + "padded": 4687, + "non-padded": 0, + "effective_few_shots": 25.0, + "num_truncated_few_shots": 0 + }, + "harness|hellaswag|10": { + "hashes": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "578edd77107cb2c3", + "hash_cont_tokens": "b41cf1ad182d68d5" + }, + "truncated": 0, + "non-truncated": 40168, + "padded": 40113, + "non-padded": 55, + "effective_few_shots": 10.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hashes": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "6a95a1511f8da075", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-anatomy|5": { + "hashes": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "24a78edc4d9a93aa", + "hash_cont_tokens": "f11971a765cb609f" + }, + "truncated": 0, + "non-truncated": 540, + "padded": 540, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-astronomy|5": { + "hashes": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "b11106668d6c0974", + "hash_cont_tokens": "238bd86950544b29" + }, + "truncated": 0, + "non-truncated": 608, + "padded": 608, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-business_ethics|5": { + "hashes": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "10180ba12a075cb0", + "hash_cont_tokens": "f9d6d2a7d7e9a041" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hashes": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "73351ef4968750a2", + "hash_cont_tokens": "6af58623d0d5fbcd" + }, + "truncated": 0, + "non-truncated": 1060, + "padded": 1060, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_biology|5": { + "hashes": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "a539150af234c668", + "hash_cont_tokens": "875cde3af7a0ee14" + }, + "truncated": 0, + "non-truncated": 576, + "padded": 576, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + 
"harness|hendrycksTest-college_chemistry|5": { + "hashes": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "52e12e5a43bcee35", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_computer_science|5": { + "hashes": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "d1f3721a5659f7ee", + "hash_cont_tokens": "1ba0c71186b1505e" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_mathematics|5": { + "hashes": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "f2d78f546b5595c2", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_medicine|5": { + "hashes": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "c9cc19179f63d1d6", + "hash_cont_tokens": "702fb6d82ff0d6ac" + }, + "truncated": 0, + "non-truncated": 692, + "padded": 692, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_physics|5": { + "hashes": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "5046144e67e992e8", + "hash_cont_tokens": "f7b8097afc16a47c" + }, + "truncated": 0, + "non-truncated": 408, + "padded": 408, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-computer_security|5": { + "hashes": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "4b14581ba4fc06fc", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hashes": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "1ee52c413b5b4cc4", + "hash_cont_tokens": "aa0e8bc655f2f641" + }, + "truncated": 0, + "non-truncated": 940, + "padded": 940, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-econometrics|5": { + "hashes": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "2914077c4dd3090a", + "hash_cont_tokens": "a9b1f761089f6acc" + }, + "truncated": 0, + "non-truncated": 456, + "padded": 456, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hashes": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "0f88a874342378de", + "hash_cont_tokens": "2425a3f084a591ef" + }, + "truncated": 0, + "non-truncated": 580, + "padded": 580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hashes": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": 
"5ec274c6c82aca23", + "hash_input_tokens": "9889933f1dd02a23", + "hash_cont_tokens": "eb2d5002052b5bc5" + }, + "truncated": 0, + "non-truncated": 1512, + "padded": 1512, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-formal_logic|5": { + "hashes": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "dc309a94c4bfdd2f", + "hash_cont_tokens": "9b30dc19c9b62f60" + }, + "truncated": 0, + "non-truncated": 504, + "padded": 504, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-global_facts|5": { + "hashes": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "0801a0aebec3ba8c", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_biology|5": { + "hashes": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "5bc4aca8831d9c05", + "hash_cont_tokens": "74217a4e2868536f" + }, + "truncated": 0, + "non-truncated": 1240, + "padded": 1240, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hashes": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "b92bd6b06fc3464c", + "hash_cont_tokens": "bf39544be0ebf000" + }, + "truncated": 0, + "non-truncated": 812, + "padded": 812, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hashes": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "a549346cde8165e9", + "hash_cont_tokens": "43570b3948564b64" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hashes": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "f1f73dd687da18d7", + "hash_cont_tokens": "674fc454bdc5ac93" + }, + "truncated": 660, + "non-truncated": 0, + "padded": 0, + "non-padded": 660, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_geography|5": { + "hashes": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "e7e9cf91f9d6a081", + "hash_cont_tokens": "03a5012b916274ea" + }, + "truncated": 0, + "non-truncated": 792, + "padded": 792, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hashes": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "a61a1670f854d9e1", + "hash_cont_tokens": "50ab225c2f535210" + }, + "truncated": 0, + "non-truncated": 772, + "padded": 772, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hashes": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "8a77cb7763f28110", + "hash_cont_tokens": 
"c583432ad27fcfe0" + }, + "truncated": 0, + "non-truncated": 1560, + "padded": 1560, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hashes": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "fcfcfae391f8faa1", + "hash_cont_tokens": "1194078d4e38c984" + }, + "truncated": 0, + "non-truncated": 1080, + "padded": 1080, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hashes": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "a29454cc1feb23ef", + "hash_cont_tokens": "f47f041de50333b9" + }, + "truncated": 0, + "non-truncated": 952, + "padded": 952, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_physics|5": { + "hashes": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "b6734a25556d75dc", + "hash_cont_tokens": "6296151cf7fee15c" + }, + "truncated": 0, + "non-truncated": 604, + "padded": 604, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hashes": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "5720438e29473426", + "hash_cont_tokens": "a490d3db0ea5935a" + }, + "truncated": 0, + "non-truncated": 2180, + "padded": 2180, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hashes": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "486321d5858de240", + "hash_cont_tokens": "6830ef7d0325d7ef" + }, + "truncated": 0, + "non-truncated": 864, + "padded": 864, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hashes": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "50c9ff438c85a69e", + "hash_cont_tokens": "cdd0b3dc06d933e5" + }, + "truncated": 816, + "non-truncated": 0, + "padded": 0, + "non-padded": 816, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hashes": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "473919e64d1b8c80", + "hash_cont_tokens": "e0203e3fc1bb0500" + }, + "truncated": 8, + "non-truncated": 940, + "padded": 940, + "non-padded": 8, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_aging|5": { + "hashes": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "47a65c81fd7ed010", + "hash_cont_tokens": "142a4a8a1138a214" + }, + "truncated": 0, + "non-truncated": 892, + "padded": 892, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_sexuality|5": { + "hashes": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "aedfcd41cbd2fcc9", + "hash_cont_tokens": "bc54813e809b796d" + }, + "truncated": 0, + "non-truncated": 524, + "padded": 524, + "non-padded": 0, + 
"effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-international_law|5": { + "hashes": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "ed5f2414144d7b72", + "hash_cont_tokens": "63435df622d5437b" + }, + "truncated": 0, + "non-truncated": 484, + "padded": 484, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-jurisprudence|5": { + "hashes": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "692eaacb5b747264", + "hash_cont_tokens": "e3a8cd951b6e3469" + }, + "truncated": 0, + "non-truncated": 432, + "padded": 432, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hashes": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "2cbce4edca937588", + "hash_cont_tokens": "5e6ee2ff0404f23c" + }, + "truncated": 0, + "non-truncated": 652, + "padded": 648, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-machine_learning|5": { + "hashes": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "c2f38b19bab1aa2c", + "hash_cont_tokens": "c81919424db3b267" + }, + "truncated": 0, + "non-truncated": 448, + "padded": 448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-management|5": { + "hashes": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "fde277bc547bc3d8", + "hash_cont_tokens": "a01d6d39a83c4597" + }, + "truncated": 0, + "non-truncated": 412, + "padded": 412, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-marketing|5": { + "hashes": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "87b232bbebce39db", + "hash_cont_tokens": "6aeaed4d823c98aa" + }, + "truncated": 0, + "non-truncated": 936, + "padded": 936, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-medical_genetics|5": { + "hashes": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "58c21af9da3e126e", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-miscellaneous|5": { + "hashes": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "d1f5c770d368e9c6", + "hash_cont_tokens": "9b0ab02a64603081" + }, + "truncated": 0, + "non-truncated": 3132, + "padded": 3132, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_disputes|5": { + "hashes": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "98d6db15a50aaa8e", + "hash_cont_tokens": "3b8bbe9108e55ce9" + }, + "truncated": 0, + "non-truncated": 1384, + "padded": 1354, + "non-padded": 30, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hashes": { + "hash_examples": "9873e077e83e0546", + 
"hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "2aabd8c7337502f8", + "hash_cont_tokens": "2eae753a177d5460" + }, + "truncated": 0, + "non-truncated": 3580, + "padded": 3580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-nutrition|5": { + "hashes": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "17f8c8f2d4a0a9b1", + "hash_cont_tokens": "29771089bd3c65c6" + }, + "truncated": 0, + "non-truncated": 1224, + "padded": 1224, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-philosophy|5": { + "hashes": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "dfc6df491d991966", + "hash_cont_tokens": "9f6ff69d23a48783" + }, + "truncated": 0, + "non-truncated": 1244, + "padded": 1244, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-prehistory|5": { + "hashes": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "cffe8139e00da9dd", + "hash_cont_tokens": "a789a13af22308bf" + }, + "truncated": 0, + "non-truncated": 1296, + "padded": 1296, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_accounting|5": { + "hashes": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "4a69ed6ee55918fb", + "hash_cont_tokens": "5129a9cfb30c5239" + }, + "truncated": 0, + "non-truncated": 1128, + "padded": 1128, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_law|5": { + "hashes": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "6cc713f12b5890de", + "hash_cont_tokens": "2e590029ef41fbcd" + }, + "truncated": 604, + "non-truncated": 5532, + "padded": 5524, + "non-padded": 612, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_medicine|5": { + "hashes": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "b4044fc92756c377", + "hash_cont_tokens": "cd82e108370cece8" + }, + "truncated": 0, + "non-truncated": 1088, + "padded": 1088, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_psychology|5": { + "hashes": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "b019784da8db089a", + "hash_cont_tokens": "61ef0c8a87f9c92d" + }, + "truncated": 0, + "non-truncated": 2448, + "padded": 2448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-public_relations|5": { + "hashes": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "f47f37c7c9bfc601", + "hash_cont_tokens": "568f585a259965c1" + }, + "truncated": 0, + "non-truncated": 440, + "padded": 440, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-security_studies|5": { + "hashes": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "4d282718d6142410", + "hash_cont_tokens": "d70cfe096d4fb7bd" + }, + 
"truncated": 0, + "non-truncated": 980, + "padded": 980, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-sociology|5": { + "hashes": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "fbc6026e500537bc", + "hash_cont_tokens": "c3a3bdfd177eed5b" + }, + "truncated": 0, + "non-truncated": 804, + "padded": 804, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hashes": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "150dd1ff81ff642e", + "hash_cont_tokens": "2568d0e8e36fa959" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-virology|5": { + "hashes": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "fcbac3e735545969", + "hash_cont_tokens": "c178cccd753d9bc5" + }, + "truncated": 0, + "non-truncated": 664, + "padded": 664, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-world_religions|5": { + "hashes": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "ffc962a38441ef13", + "hash_cont_tokens": "0a3a3ea5ef49d19c" + }, + "truncated": 0, + "non-truncated": 684, + "padded": 684, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|truthfulqa:mc|0": { + "hashes": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "9ffb65d225ae550f", + "hash_cont_tokens": "6d1691881e252df0" + }, + "truncated": 0, + "non-truncated": 9996, + "padded": 9996, + "non-padded": 0, + "effective_few_shots": 0.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "d84d18e9a963753d", + "hash_full_prompts": "12b540783521a8e6", + "hash_input_tokens": "1c61d6705b299f5c", + "hash_cont_tokens": "f4b7b7f3a2788768" + }, + "total_evaluation_time_secondes": "4527.759533882141", + "truncated": 2088, + "non-truncated": 108931, + "padded": 108834, + "non-padded": 2185, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/chavinlo/alpaca-native/results_2023-09-17T15-14-48.848140.json b/eval-results/chavinlo/alpaca-native/results_2023-09-17T15-14-48.848140.json new file mode 100644 index 0000000000000000000000000000000000000000..fa46ecf8a2e4488d3c08ce708261652be72ecbef --- /dev/null +++ b/eval-results/chavinlo/alpaca-native/results_2023-09-17T15-14-48.848140.json @@ -0,0 +1,107 @@ +{ + "config_general": { + "model_name": "chavinlo/alpaca-native", + "model_sha": "cc7773cac2478231807c56ef2f02292d98f85cf5", + "model_size": "12.58 GB", + "model_dtype": "torch.float16", + "lighteval_sha": "0f318ecf002208468154899217b3ba7c6ae09374", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|drop|3": { + "em": 0.053376677852348994, + "em_stderr": 0.0023019931199868356, + "f1": 0.14230180369127515, + "f1_stderr": 0.0028220599296221362 + }, + "harness|gsm8k|5": { + "acc": 0.014404852160727824, + "acc_stderr": 0.0032820559171369825 + }, + "harness|winogrande|5": { + "acc": 0.6945540647198106, + "acc_stderr": 
0.01294503863255202 + }, + "all": { + "em": 0.053376677852348994, + "em_stderr": 0.0023019931199868356, + "f1": 0.14230180369127515, + "f1_stderr": 0.0028220599296221362, + "acc": 0.35447945844026924, + "acc_stderr": 0.008113547274844502 + } + }, + "versions": { + "harness|drop|3": 1, + "harness|gsm8k|5": 0, + "harness|winogrande|5": 0, + "all": 0 + }, + "config_tasks": { + "harness|drop": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|drop|3": { + "hashes": { + "hash_examples": "1d27416e8324e9a3", + "hash_full_prompts": "a5513ff9a741b385", + "hash_input_tokens": "f70227603c1b1bfe", + "hash_cont_tokens": "38b90738e6182b06" + }, + "truncated": 1263, + "non-truncated": 8273, + "padded": 0, + "non-padded": 9536, + "effective_few_shots": 3.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "e3d5b3003c52b880", + "hash_cont_tokens": "4a93af2562b06abb" + }, + "truncated": 0, + "non-truncated": 1319, + "padded": 0, + "non-padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "5be2b0947cee07a9", + "hash_cont_tokens": "f08975ad6f2d5864" + }, + "truncated": 0, + "non-truncated": 2534, + "padded": 2432, + "non-padded": 102, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "9b4d8993161e637d", + "hash_full_prompts": "08215e527b7e60a5", + "hash_input_tokens": "ce9af2df9f2847fa", + "hash_cont_tokens": "593f4bf4aa913536" + }, + "total_evaluation_time_secondes": "13735.434634685516", + "truncated": 1263, + "non-truncated": 12126, + "padded": 2432, + "non-padded": 10957, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/chavinlo/alpaca-native/results_2023-09-21T20-23-20.255556.json b/eval-results/chavinlo/alpaca-native/results_2023-09-21T20-23-20.255556.json new file mode 100644 index 0000000000000000000000000000000000000000..9ce3ebd879e4726fdba4c7294c8d209ea8830ef7 --- /dev/null +++ b/eval-results/chavinlo/alpaca-native/results_2023-09-21T20-23-20.255556.json @@ -0,0 +1,1367 @@ +{ + "config_general": { + "model_name": "chavinlo/alpaca-native", + "model_sha": "cc7773cac2478231807c56ef2f02292d98f85cf5", + "model_size": "12.58 GB", + "model_dtype": "torch.bfloat16", + "lighteval_sha": "0f318ecf002208468154899217b3ba7c6ae09374", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|arc:challenge|25": { + "acc": 0.5127986348122867, + "acc_stderr": 0.014606603181012538, + "acc_norm": 0.5204778156996587, + "acc_norm_stderr": 0.01459913135303501 + }, + "harness|hellaswag|10": { + "acc": 0.5959968133837881, + "acc_stderr": 0.004896952378506926, + "acc_norm": 0.7699661422027485, + "acc_norm_stderr": 0.004199941217549452 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.28, + "acc_stderr": 0.04512608598542129, + "acc_norm": 0.28, + "acc_norm_stderr": 0.04512608598542129 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.45925925925925926, + "acc_stderr": 0.04304979692464242, + "acc_norm": 0.45925925925925926, + "acc_norm_stderr": 0.04304979692464242 + }, + "harness|hendrycksTest-astronomy|5": { + 
"acc": 0.3618421052631579, + "acc_stderr": 0.03910525752849724, + "acc_norm": 0.3618421052631579, + "acc_norm_stderr": 0.03910525752849724 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.46, + "acc_stderr": 0.05009082659620333, + "acc_norm": 0.46, + "acc_norm_stderr": 0.05009082659620333 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.44150943396226416, + "acc_stderr": 0.030561590426731837, + "acc_norm": 0.44150943396226416, + "acc_norm_stderr": 0.030561590426731837 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.3819444444444444, + "acc_stderr": 0.040629907841466674, + "acc_norm": 0.3819444444444444, + "acc_norm_stderr": 0.040629907841466674 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.32, + "acc_stderr": 0.046882617226215034, + "acc_norm": 0.32, + "acc_norm_stderr": 0.046882617226215034 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.41, + "acc_stderr": 0.04943110704237102, + "acc_norm": 0.41, + "acc_norm_stderr": 0.04943110704237102 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.35, + "acc_stderr": 0.047937248544110196, + "acc_norm": 0.35, + "acc_norm_stderr": 0.047937248544110196 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.3815028901734104, + "acc_stderr": 0.03703851193099521, + "acc_norm": 0.3815028901734104, + "acc_norm_stderr": 0.03703851193099521 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.21568627450980393, + "acc_stderr": 0.04092563958237656, + "acc_norm": 0.21568627450980393, + "acc_norm_stderr": 0.04092563958237656 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.54, + "acc_stderr": 0.05009082659620332, + "acc_norm": 0.54, + "acc_norm_stderr": 0.05009082659620332 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.37446808510638296, + "acc_stderr": 0.03163910665367291, + "acc_norm": 0.37446808510638296, + "acc_norm_stderr": 0.03163910665367291 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.2543859649122807, + "acc_stderr": 0.040969851398436716, + "acc_norm": 0.2543859649122807, + "acc_norm_stderr": 0.040969851398436716 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.36551724137931035, + "acc_stderr": 0.040131241954243856, + "acc_norm": 0.36551724137931035, + "acc_norm_stderr": 0.040131241954243856 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.28835978835978837, + "acc_stderr": 0.023330654054535903, + "acc_norm": 0.28835978835978837, + "acc_norm_stderr": 0.023330654054535903 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.24603174603174602, + "acc_stderr": 0.03852273364924314, + "acc_norm": 0.24603174603174602, + "acc_norm_stderr": 0.03852273364924314 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.33, + "acc_stderr": 0.047258156262526045, + "acc_norm": 0.33, + "acc_norm_stderr": 0.047258156262526045 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.4290322580645161, + "acc_stderr": 0.02815603653823321, + "acc_norm": 0.4290322580645161, + "acc_norm_stderr": 0.02815603653823321 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.3103448275862069, + "acc_stderr": 0.03255086769970103, + "acc_norm": 0.3103448275862069, + "acc_norm_stderr": 0.03255086769970103 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.45, + "acc_stderr": 0.05, + "acc_norm": 0.45, + "acc_norm_stderr": 0.05 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 
0.5333333333333333, + "acc_stderr": 0.038956580652718446, + "acc_norm": 0.5333333333333333, + "acc_norm_stderr": 0.038956580652718446 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.4797979797979798, + "acc_stderr": 0.035594435655639196, + "acc_norm": 0.4797979797979798, + "acc_norm_stderr": 0.035594435655639196 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.6062176165803109, + "acc_stderr": 0.035260770955482405, + "acc_norm": 0.6062176165803109, + "acc_norm_stderr": 0.035260770955482405 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.3871794871794872, + "acc_stderr": 0.024697216930878948, + "acc_norm": 0.3871794871794872, + "acc_norm_stderr": 0.024697216930878948 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.26296296296296295, + "acc_stderr": 0.02684205787383371, + "acc_norm": 0.26296296296296295, + "acc_norm_stderr": 0.02684205787383371 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.35294117647058826, + "acc_stderr": 0.031041941304059295, + "acc_norm": 0.35294117647058826, + "acc_norm_stderr": 0.031041941304059295 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.31125827814569534, + "acc_stderr": 0.03780445850526733, + "acc_norm": 0.31125827814569534, + "acc_norm_stderr": 0.03780445850526733 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.544954128440367, + "acc_stderr": 0.021350503090925173, + "acc_norm": 0.544954128440367, + "acc_norm_stderr": 0.021350503090925173 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.375, + "acc_stderr": 0.033016908987210894, + "acc_norm": 0.375, + "acc_norm_stderr": 0.033016908987210894 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.5343137254901961, + "acc_stderr": 0.03501038327635897, + "acc_norm": 0.5343137254901961, + "acc_norm_stderr": 0.03501038327635897 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.5654008438818565, + "acc_stderr": 0.03226759995510145, + "acc_norm": 0.5654008438818565, + "acc_norm_stderr": 0.03226759995510145 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.5022421524663677, + "acc_stderr": 0.03355746535223263, + "acc_norm": 0.5022421524663677, + "acc_norm_stderr": 0.03355746535223263 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.4122137404580153, + "acc_stderr": 0.04317171194870254, + "acc_norm": 0.4122137404580153, + "acc_norm_stderr": 0.04317171194870254 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.5454545454545454, + "acc_stderr": 0.045454545454545484, + "acc_norm": 0.5454545454545454, + "acc_norm_stderr": 0.045454545454545484 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.4444444444444444, + "acc_stderr": 0.04803752235190192, + "acc_norm": 0.4444444444444444, + "acc_norm_stderr": 0.04803752235190192 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.3987730061349693, + "acc_stderr": 0.03847021420456025, + "acc_norm": 0.3987730061349693, + "acc_norm_stderr": 0.03847021420456025 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.36607142857142855, + "acc_stderr": 0.0457237235873743, + "acc_norm": 0.36607142857142855, + "acc_norm_stderr": 0.0457237235873743 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.47572815533980584, + "acc_stderr": 0.049449010929737795, + "acc_norm": 0.47572815533980584, + "acc_norm_stderr": 0.049449010929737795 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 
0.6068376068376068, + "acc_stderr": 0.03199957924651047, + "acc_norm": 0.6068376068376068, + "acc_norm_stderr": 0.03199957924651047 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.45, + "acc_stderr": 0.05, + "acc_norm": 0.45, + "acc_norm_stderr": 0.05 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.5504469987228607, + "acc_stderr": 0.017788725283507337, + "acc_norm": 0.5504469987228607, + "acc_norm_stderr": 0.017788725283507337 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.42485549132947975, + "acc_stderr": 0.026613350840261736, + "acc_norm": 0.42485549132947975, + "acc_norm_stderr": 0.026613350840261736 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.2424581005586592, + "acc_stderr": 0.014333522059217889, + "acc_norm": 0.2424581005586592, + "acc_norm_stderr": 0.014333522059217889 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.4117647058823529, + "acc_stderr": 0.028180596328259293, + "acc_norm": 0.4117647058823529, + "acc_norm_stderr": 0.028180596328259293 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.4662379421221865, + "acc_stderr": 0.028333277109562793, + "acc_norm": 0.4662379421221865, + "acc_norm_stderr": 0.028333277109562793 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.4722222222222222, + "acc_stderr": 0.027777777777777804, + "acc_norm": 0.4722222222222222, + "acc_norm_stderr": 0.027777777777777804 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.30851063829787234, + "acc_stderr": 0.027553366165101362, + "acc_norm": 0.30851063829787234, + "acc_norm_stderr": 0.027553366165101362 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.3213820078226858, + "acc_stderr": 0.011927581352265076, + "acc_norm": 0.3213820078226858, + "acc_norm_stderr": 0.011927581352265076 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.40441176470588236, + "acc_stderr": 0.029812630701569743, + "acc_norm": 0.40441176470588236, + "acc_norm_stderr": 0.029812630701569743 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.3790849673202614, + "acc_stderr": 0.019627444748412232, + "acc_norm": 0.3790849673202614, + "acc_norm_stderr": 0.019627444748412232 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.44545454545454544, + "acc_stderr": 0.047605488214603246, + "acc_norm": 0.44545454545454544, + "acc_norm_stderr": 0.047605488214603246 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.40408163265306124, + "acc_stderr": 0.031414708025865885, + "acc_norm": 0.40408163265306124, + "acc_norm_stderr": 0.031414708025865885 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.472636815920398, + "acc_stderr": 0.03530235517334682, + "acc_norm": 0.472636815920398, + "acc_norm_stderr": 0.03530235517334682 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.59, + "acc_stderr": 0.04943110704237102, + "acc_norm": 0.59, + "acc_norm_stderr": 0.04943110704237102 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.4036144578313253, + "acc_stderr": 0.038194861407583984, + "acc_norm": 0.4036144578313253, + "acc_norm_stderr": 0.038194861407583984 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.5263157894736842, + "acc_stderr": 0.03829509868994727, + "acc_norm": 0.5263157894736842, + "acc_norm_stderr": 0.03829509868994727 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.2484700122399021, + "mc1_stderr": 0.015127427096520674, + "mc2": 0.3759916250814691, + "mc2_stderr": 0.015396201572279763 + }, + "all": { + "acc": 
0.41927597389078103, + "acc_stderr": 0.035302205782678654, + "acc_norm": 0.42235476219088836, + "acc_norm_stderr": 0.035290265393035695, + "mc1": 0.2484700122399021, + "mc1_stderr": 0.015127427096520674, + "mc2": 0.3759916250814691, + "mc2_stderr": 0.015396201572279763 + } + }, + "versions": { + "harness|arc:challenge|25": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "all": 0 + }, + "config_tasks": { + "harness|arc:challenge": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + 
"harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task" + }, + "summary_tasks": 
{ + "harness|arc:challenge|25": { + "hashes": { + "hash_examples": "17b0cae357c0259e", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "2b0e07d4cdd3b0fe", + "hash_cont_tokens": "e8abf848493b50f7" + }, + "truncated": 0, + "non-truncated": 4687, + "padded": 4687, + "non-padded": 0, + "effective_few_shots": 25.0, + "num_truncated_few_shots": 0 + }, + "harness|hellaswag|10": { + "hashes": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "578edd77107cb2c3", + "hash_cont_tokens": "9fe0a5c42e1532db" + }, + "truncated": 0, + "non-truncated": 40168, + "padded": 40113, + "non-padded": 55, + "effective_few_shots": 10.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hashes": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "6a95a1511f8da075", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-anatomy|5": { + "hashes": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "24a78edc4d9a93aa", + "hash_cont_tokens": "f11971a765cb609f" + }, + "truncated": 0, + "non-truncated": 540, + "padded": 540, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-astronomy|5": { + "hashes": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "b11106668d6c0974", + "hash_cont_tokens": "440a970fadecdc7b" + }, + "truncated": 0, + "non-truncated": 608, + "padded": 608, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-business_ethics|5": { + "hashes": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "10180ba12a075cb0", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hashes": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "73351ef4968750a2", + "hash_cont_tokens": "7ecd60c25b9bfe5b" + }, + "truncated": 0, + "non-truncated": 1060, + "padded": 1060, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_biology|5": { + "hashes": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "a539150af234c668", + "hash_cont_tokens": "875cde3af7a0ee14" + }, + "truncated": 0, + "non-truncated": 576, + "padded": 576, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_chemistry|5": { + "hashes": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "52e12e5a43bcee35", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_computer_science|5": { + "hashes": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "d1f3721a5659f7ee", + 
"hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_mathematics|5": { + "hashes": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "f2d78f546b5595c2", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_medicine|5": { + "hashes": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "c9cc19179f63d1d6", + "hash_cont_tokens": "702fb6d82ff0d6ac" + }, + "truncated": 0, + "non-truncated": 692, + "padded": 692, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_physics|5": { + "hashes": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "5046144e67e992e8", + "hash_cont_tokens": "f7b8097afc16a47c" + }, + "truncated": 0, + "non-truncated": 408, + "padded": 408, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-computer_security|5": { + "hashes": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "4b14581ba4fc06fc", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hashes": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "1ee52c413b5b4cc4", + "hash_cont_tokens": "aa0e8bc655f2f641" + }, + "truncated": 0, + "non-truncated": 940, + "padded": 940, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-econometrics|5": { + "hashes": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "2914077c4dd3090a", + "hash_cont_tokens": "b1cc6e7e9fcd3827" + }, + "truncated": 0, + "non-truncated": 456, + "padded": 456, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hashes": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "0f88a874342378de", + "hash_cont_tokens": "2425a3f084a591ef" + }, + "truncated": 0, + "non-truncated": 580, + "padded": 580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hashes": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "9889933f1dd02a23", + "hash_cont_tokens": "bd87bf0c060fd925" + }, + "truncated": 0, + "non-truncated": 1512, + "padded": 1512, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-formal_logic|5": { + "hashes": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "dc309a94c4bfdd2f", + "hash_cont_tokens": "eb8932890e0605db" + }, + "truncated": 0, + "non-truncated": 504, + "padded": 504, + "non-padded": 0, + "effective_few_shots": 
5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-global_facts|5": { + "hashes": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "0801a0aebec3ba8c", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_biology|5": { + "hashes": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "5bc4aca8831d9c05", + "hash_cont_tokens": "1ddcb86d28cde266" + }, + "truncated": 0, + "non-truncated": 1240, + "padded": 1240, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hashes": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "b92bd6b06fc3464c", + "hash_cont_tokens": "176c8dcff38c5f8f" + }, + "truncated": 0, + "non-truncated": 812, + "padded": 812, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hashes": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "a549346cde8165e9", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hashes": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "f1f73dd687da18d7", + "hash_cont_tokens": "674fc454bdc5ac93" + }, + "truncated": 660, + "non-truncated": 0, + "padded": 0, + "non-padded": 660, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_geography|5": { + "hashes": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "e7e9cf91f9d6a081", + "hash_cont_tokens": "03a5012b916274ea" + }, + "truncated": 0, + "non-truncated": 792, + "padded": 792, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hashes": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "a61a1670f854d9e1", + "hash_cont_tokens": "873d2aab226ba1d8" + }, + "truncated": 0, + "non-truncated": 772, + "padded": 772, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hashes": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "8a77cb7763f28110", + "hash_cont_tokens": "c583432ad27fcfe0" + }, + "truncated": 0, + "non-truncated": 1560, + "padded": 1560, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hashes": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "fcfcfae391f8faa1", + "hash_cont_tokens": "d7907b61bcb8c123" + }, + "truncated": 0, + "non-truncated": 1080, + "padded": 1080, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + 
"harness|hendrycksTest-high_school_microeconomics|5": { + "hashes": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "a29454cc1feb23ef", + "hash_cont_tokens": "f47f041de50333b9" + }, + "truncated": 0, + "non-truncated": 952, + "padded": 952, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_physics|5": { + "hashes": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "b6734a25556d75dc", + "hash_cont_tokens": "0d56317b3e5eedb5" + }, + "truncated": 0, + "non-truncated": 604, + "padded": 604, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hashes": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "5720438e29473426", + "hash_cont_tokens": "09ba1243e7390c0f" + }, + "truncated": 0, + "non-truncated": 2180, + "padded": 2180, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hashes": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "486321d5858de240", + "hash_cont_tokens": "9cc29889c3d3f77d" + }, + "truncated": 0, + "non-truncated": 864, + "padded": 864, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hashes": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "50c9ff438c85a69e", + "hash_cont_tokens": "cdd0b3dc06d933e5" + }, + "truncated": 816, + "non-truncated": 0, + "padded": 0, + "non-padded": 816, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hashes": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "473919e64d1b8c80", + "hash_cont_tokens": "e02816433ff28daf" + }, + "truncated": 8, + "non-truncated": 940, + "padded": 940, + "non-padded": 8, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_aging|5": { + "hashes": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "47a65c81fd7ed010", + "hash_cont_tokens": "142a4a8a1138a214" + }, + "truncated": 0, + "non-truncated": 892, + "padded": 892, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_sexuality|5": { + "hashes": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "aedfcd41cbd2fcc9", + "hash_cont_tokens": "bc54813e809b796d" + }, + "truncated": 0, + "non-truncated": 524, + "padded": 524, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-international_law|5": { + "hashes": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "ed5f2414144d7b72", + "hash_cont_tokens": "8ea8c5ff76a15bca" + }, + "truncated": 0, + "non-truncated": 484, + "padded": 484, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-jurisprudence|5": { + "hashes": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": 
"0f89ee3fe03d6a21", + "hash_input_tokens": "692eaacb5b747264", + "hash_cont_tokens": "e3a8cd951b6e3469" + }, + "truncated": 0, + "non-truncated": 432, + "padded": 432, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hashes": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "2cbce4edca937588", + "hash_cont_tokens": "3e9e0bdc248fd88a" + }, + "truncated": 0, + "non-truncated": 652, + "padded": 648, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-machine_learning|5": { + "hashes": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "c2f38b19bab1aa2c", + "hash_cont_tokens": "55b12fb138c6a064" + }, + "truncated": 0, + "non-truncated": 448, + "padded": 448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-management|5": { + "hashes": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "fde277bc547bc3d8", + "hash_cont_tokens": "a01d6d39a83c4597" + }, + "truncated": 0, + "non-truncated": 412, + "padded": 412, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-marketing|5": { + "hashes": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "87b232bbebce39db", + "hash_cont_tokens": "6aeaed4d823c98aa" + }, + "truncated": 0, + "non-truncated": 936, + "padded": 936, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-medical_genetics|5": { + "hashes": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "58c21af9da3e126e", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-miscellaneous|5": { + "hashes": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "d1f5c770d368e9c6", + "hash_cont_tokens": "9b0ab02a64603081" + }, + "truncated": 0, + "non-truncated": 3132, + "padded": 3132, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_disputes|5": { + "hashes": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "98d6db15a50aaa8e", + "hash_cont_tokens": "3b8bbe9108e55ce9" + }, + "truncated": 0, + "non-truncated": 1384, + "padded": 1354, + "non-padded": 30, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hashes": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "2aabd8c7337502f8", + "hash_cont_tokens": "3e9bfc0362e97330" + }, + "truncated": 0, + "non-truncated": 3580, + "padded": 3580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-nutrition|5": { + "hashes": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "17f8c8f2d4a0a9b1", + "hash_cont_tokens": "23b2dc6ee2da4cfc" + }, + "truncated": 0, + "non-truncated": 1224, + "padded": 1224, + 
"non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-philosophy|5": { + "hashes": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "dfc6df491d991966", + "hash_cont_tokens": "9f6ff69d23a48783" + }, + "truncated": 0, + "non-truncated": 1244, + "padded": 1244, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-prehistory|5": { + "hashes": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "cffe8139e00da9dd", + "hash_cont_tokens": "d6458d743d875837" + }, + "truncated": 0, + "non-truncated": 1296, + "padded": 1296, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_accounting|5": { + "hashes": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "4a69ed6ee55918fb", + "hash_cont_tokens": "922a195f53a35662" + }, + "truncated": 0, + "non-truncated": 1128, + "padded": 1128, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_law|5": { + "hashes": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "6cc713f12b5890de", + "hash_cont_tokens": "2e590029ef41fbcd" + }, + "truncated": 604, + "non-truncated": 5532, + "padded": 5524, + "non-padded": 612, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_medicine|5": { + "hashes": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "b4044fc92756c377", + "hash_cont_tokens": "7cfee54dbddd5a98" + }, + "truncated": 0, + "non-truncated": 1088, + "padded": 1088, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_psychology|5": { + "hashes": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "b019784da8db089a", + "hash_cont_tokens": "a86677b2a45c20e1" + }, + "truncated": 0, + "non-truncated": 2448, + "padded": 2448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-public_relations|5": { + "hashes": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "f47f37c7c9bfc601", + "hash_cont_tokens": "0d756ccaae031757" + }, + "truncated": 0, + "non-truncated": 440, + "padded": 440, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-security_studies|5": { + "hashes": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "4d282718d6142410", + "hash_cont_tokens": "b2229bc2cfbf594b" + }, + "truncated": 0, + "non-truncated": 980, + "padded": 980, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-sociology|5": { + "hashes": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "fbc6026e500537bc", + "hash_cont_tokens": "c3a3bdfd177eed5b" + }, + "truncated": 0, + "non-truncated": 804, + "padded": 804, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hashes": { 
+ "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "150dd1ff81ff642e", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-virology|5": { + "hashes": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "fcbac3e735545969", + "hash_cont_tokens": "af8b3658088cb37f" + }, + "truncated": 0, + "non-truncated": 664, + "padded": 664, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-world_religions|5": { + "hashes": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "ffc962a38441ef13", + "hash_cont_tokens": "060118bef6de4e0a" + }, + "truncated": 0, + "non-truncated": 684, + "padded": 684, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|truthfulqa:mc|0": { + "hashes": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "9ffb65d225ae550f", + "hash_cont_tokens": "f5da56a132aab151" + }, + "truncated": 0, + "non-truncated": 9996, + "padded": 9996, + "non-padded": 0, + "effective_few_shots": 0.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "d84d18e9a963753d", + "hash_full_prompts": "12b540783521a8e6", + "hash_input_tokens": "1c61d6705b299f5c", + "hash_cont_tokens": "71d56183130fecbd" + }, + "total_evaluation_time_secondes": "5010.6186554431915", + "truncated": 2088, + "non-truncated": 108931, + "padded": 108834, + "non-padded": 2185, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/chavinlo/gpt4-x-alpaca/results_2023-08-11T18-05-47.769359.json b/eval-results/chavinlo/gpt4-x-alpaca/results_2023-08-11T18-05-47.769359.json new file mode 100644 index 0000000000000000000000000000000000000000..fea13fb8aae8229de9ca93d141a6b865b8f47623 --- /dev/null +++ b/eval-results/chavinlo/gpt4-x-alpaca/results_2023-08-11T18-05-47.769359.json @@ -0,0 +1,1365 @@ +{ + "results": { + "harness|arc:challenge|25": { + "acc": 0.5170648464163823, + "acc_stderr": 0.0146028783885366, + "acc_norm": 0.5281569965870307, + "acc_norm_stderr": 0.014588204105102203 + }, + "harness|hellaswag|10": { + "acc": 0.6018721370244972, + "acc_stderr": 0.004885116465550283, + "acc_norm": 0.795857398924517, + "acc_norm_stderr": 0.004022499210760732 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.32, + "acc_stderr": 0.046882617226215034, + "acc_norm": 0.32, + "acc_norm_stderr": 0.046882617226215034 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.5259259259259259, + "acc_stderr": 0.04313531696750575, + "acc_norm": 0.5259259259259259, + "acc_norm_stderr": 0.04313531696750575 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.5131578947368421, + "acc_stderr": 0.04067533136309173, + "acc_norm": 0.5131578947368421, + "acc_norm_stderr": 0.04067533136309173 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.41, + "acc_stderr": 0.049431107042371025, + "acc_norm": 0.41, + "acc_norm_stderr": 0.049431107042371025 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.5320754716981132, + "acc_stderr": 0.03070948699255654, + "acc_norm": 0.5320754716981132, + "acc_norm_stderr": 0.03070948699255654 + }, + 
"harness|hendrycksTest-college_biology|5": { + "acc": 0.4930555555555556, + "acc_stderr": 0.04180806750294938, + "acc_norm": 0.4930555555555556, + "acc_norm_stderr": 0.04180806750294938 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.29, + "acc_stderr": 0.04560480215720684, + "acc_norm": 0.29, + "acc_norm_stderr": 0.04560480215720684 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.45, + "acc_stderr": 0.05, + "acc_norm": 0.45, + "acc_norm_stderr": 0.05 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.31, + "acc_stderr": 0.04648231987117316, + "acc_norm": 0.31, + "acc_norm_stderr": 0.04648231987117316 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.4277456647398844, + "acc_stderr": 0.03772446857518026, + "acc_norm": 0.4277456647398844, + "acc_norm_stderr": 0.03772446857518026 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.17647058823529413, + "acc_stderr": 0.0379328118530781, + "acc_norm": 0.17647058823529413, + "acc_norm_stderr": 0.0379328118530781 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.61, + "acc_stderr": 0.04902071300001974, + "acc_norm": 0.61, + "acc_norm_stderr": 0.04902071300001974 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.4340425531914894, + "acc_stderr": 0.03240038086792747, + "acc_norm": 0.4340425531914894, + "acc_norm_stderr": 0.03240038086792747 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.30701754385964913, + "acc_stderr": 0.04339138322579861, + "acc_norm": 0.30701754385964913, + "acc_norm_stderr": 0.04339138322579861 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.4206896551724138, + "acc_stderr": 0.0411391498118926, + "acc_norm": 0.4206896551724138, + "acc_norm_stderr": 0.0411391498118926 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.2830687830687831, + "acc_stderr": 0.023201392938194978, + "acc_norm": 0.2830687830687831, + "acc_norm_stderr": 0.023201392938194978 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.24603174603174602, + "acc_stderr": 0.038522733649243135, + "acc_norm": 0.24603174603174602, + "acc_norm_stderr": 0.038522733649243135 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.32, + "acc_stderr": 0.04688261722621503, + "acc_norm": 0.32, + "acc_norm_stderr": 0.04688261722621503 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.5129032258064516, + "acc_stderr": 0.028434533152681855, + "acc_norm": 0.5129032258064516, + "acc_norm_stderr": 0.028434533152681855 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.33497536945812806, + "acc_stderr": 0.033208527423483104, + "acc_norm": 0.33497536945812806, + "acc_norm_stderr": 0.033208527423483104 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.52, + "acc_stderr": 0.050211673156867795, + "acc_norm": 0.52, + "acc_norm_stderr": 0.050211673156867795 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.5454545454545454, + "acc_stderr": 0.038881769216741004, + "acc_norm": 0.5454545454545454, + "acc_norm_stderr": 0.038881769216741004 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.6616161616161617, + "acc_stderr": 0.03371124142626303, + "acc_norm": 0.6616161616161617, + "acc_norm_stderr": 0.03371124142626303 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.6683937823834197, + "acc_stderr": 0.03397636541089118, + "acc_norm": 0.6683937823834197, + "acc_norm_stderr": 
0.03397636541089118 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.4717948717948718, + "acc_stderr": 0.0253106392549339, + "acc_norm": 0.4717948717948718, + "acc_norm_stderr": 0.0253106392549339 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.25555555555555554, + "acc_stderr": 0.026593939101844058, + "acc_norm": 0.25555555555555554, + "acc_norm_stderr": 0.026593939101844058 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.4831932773109244, + "acc_stderr": 0.03246013680375308, + "acc_norm": 0.4831932773109244, + "acc_norm_stderr": 0.03246013680375308 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.2582781456953642, + "acc_stderr": 0.035737053147634576, + "acc_norm": 0.2582781456953642, + "acc_norm_stderr": 0.035737053147634576 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.6440366972477064, + "acc_stderr": 0.020528559278244214, + "acc_norm": 0.6440366972477064, + "acc_norm_stderr": 0.020528559278244214 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.35648148148148145, + "acc_stderr": 0.03266478331527272, + "acc_norm": 0.35648148148148145, + "acc_norm_stderr": 0.03266478331527272 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.6372549019607843, + "acc_stderr": 0.03374499356319355, + "acc_norm": 0.6372549019607843, + "acc_norm_stderr": 0.03374499356319355 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.6624472573839663, + "acc_stderr": 0.03078154910202622, + "acc_norm": 0.6624472573839663, + "acc_norm_stderr": 0.03078154910202622 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.5381165919282511, + "acc_stderr": 0.033460150119732274, + "acc_norm": 0.5381165919282511, + "acc_norm_stderr": 0.033460150119732274 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.5343511450381679, + "acc_stderr": 0.04374928560599738, + "acc_norm": 0.5343511450381679, + "acc_norm_stderr": 0.04374928560599738 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.6776859504132231, + "acc_stderr": 0.042664163633521685, + "acc_norm": 0.6776859504132231, + "acc_norm_stderr": 0.042664163633521685 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.5462962962962963, + "acc_stderr": 0.04812917324536823, + "acc_norm": 0.5462962962962963, + "acc_norm_stderr": 0.04812917324536823 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.5521472392638037, + "acc_stderr": 0.03906947479456606, + "acc_norm": 0.5521472392638037, + "acc_norm_stderr": 0.03906947479456606 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.45535714285714285, + "acc_stderr": 0.047268355537191, + "acc_norm": 0.45535714285714285, + "acc_norm_stderr": 0.047268355537191 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.6796116504854369, + "acc_stderr": 0.04620284082280042, + "acc_norm": 0.6796116504854369, + "acc_norm_stderr": 0.04620284082280042 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.7008547008547008, + "acc_stderr": 0.02999695185834948, + "acc_norm": 0.7008547008547008, + "acc_norm_stderr": 0.02999695185834948 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.49, + "acc_stderr": 0.05024183937956911, + "acc_norm": 0.49, + "acc_norm_stderr": 0.05024183937956911 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.669220945083014, + "acc_stderr": 0.016824818462563746, + "acc_norm": 0.669220945083014, + "acc_norm_stderr": 0.016824818462563746 + }, + 
"harness|hendrycksTest-moral_disputes|5": { + "acc": 0.5057803468208093, + "acc_stderr": 0.026917296179149116, + "acc_norm": 0.5057803468208093, + "acc_norm_stderr": 0.026917296179149116 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.29720670391061454, + "acc_stderr": 0.015285313353641602, + "acc_norm": 0.29720670391061454, + "acc_norm_stderr": 0.015285313353641602 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.5032679738562091, + "acc_stderr": 0.02862930519400354, + "acc_norm": 0.5032679738562091, + "acc_norm_stderr": 0.02862930519400354 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.5112540192926045, + "acc_stderr": 0.028390897396863533, + "acc_norm": 0.5112540192926045, + "acc_norm_stderr": 0.028390897396863533 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.5185185185185185, + "acc_stderr": 0.02780165621232366, + "acc_norm": 0.5185185185185185, + "acc_norm_stderr": 0.02780165621232366 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.3723404255319149, + "acc_stderr": 0.028838921471251455, + "acc_norm": 0.3723404255319149, + "acc_norm_stderr": 0.028838921471251455 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.37809647979139505, + "acc_stderr": 0.012384878406798095, + "acc_norm": 0.37809647979139505, + "acc_norm_stderr": 0.012384878406798095 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.44485294117647056, + "acc_stderr": 0.030187532060329387, + "acc_norm": 0.44485294117647056, + "acc_norm_stderr": 0.030187532060329387 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.4591503267973856, + "acc_stderr": 0.020160213617222516, + "acc_norm": 0.4591503267973856, + "acc_norm_stderr": 0.020160213617222516 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.6181818181818182, + "acc_stderr": 0.046534298079135075, + "acc_norm": 0.6181818181818182, + "acc_norm_stderr": 0.046534298079135075 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.44081632653061226, + "acc_stderr": 0.03178419114175364, + "acc_norm": 0.44081632653061226, + "acc_norm_stderr": 0.03178419114175364 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.6119402985074627, + "acc_stderr": 0.03445789964362749, + "acc_norm": 0.6119402985074627, + "acc_norm_stderr": 0.03445789964362749 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.72, + "acc_stderr": 0.045126085985421276, + "acc_norm": 0.72, + "acc_norm_stderr": 0.045126085985421276 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.4578313253012048, + "acc_stderr": 0.0387862677100236, + "acc_norm": 0.4578313253012048, + "acc_norm_stderr": 0.0387862677100236 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.7017543859649122, + "acc_stderr": 0.03508771929824565, + "acc_norm": 0.7017543859649122, + "acc_norm_stderr": 0.03508771929824565 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.31946144430844553, + "mc1_stderr": 0.016322644182960498, + "mc2": 0.48882404667849044, + "mc2_stderr": 0.016077830165514555 + }, + "all": { + "acc": 0.48449554561369323, + "acc_stderr": 0.03506199979132179, + "acc_norm": 0.48797143395387416, + "acc_norm_stderr": 0.0350471304432163, + "mc1": 0.31946144430844553, + "mc1_stderr": 0.016322644182960498, + "mc2": 0.48882404667849044, + "mc2_stderr": 0.016077830165514555 + } + }, + "versions": { + "harness|arc:challenge|25": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + 
"harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "all": 0 + }, + "config_general": { + "model_name": "chavinlo/gpt4-x-alpaca", + "model_sha": "6a571f458cab9a23d14324ec63e0abd1744c8353", + "model_dtype": "torch.float16", + "lighteval_sha": "efe93333f9f25e7d48cc67a6bf362e6d576f727b", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null + }, + "config_tasks": { + "harness|arc:challenge": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness 
task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task" + }, + "summary_tasks": { + "harness|arc:challenge|25": { + "hashes": { + "hash_examples": "17b0cae357c0259e", + "hash_full_prompts": 
"045cbb916e5145c6", + "hash_input_tokens": "2b0e07d4cdd3b0fe", + "hash_cont_tokens": "ede2b335438f08e9" + }, + "truncated": 0, + "non-truncated": 4687, + "padded": 4687, + "non-padded": 0, + "effective_few_shots": 25.0, + "num_truncated_few_shots": 0 + }, + "harness|hellaswag|10": { + "hashes": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "578edd77107cb2c3", + "hash_cont_tokens": "b41cf1ad182d68d5" + }, + "truncated": 0, + "non-truncated": 40168, + "padded": 40113, + "non-padded": 55, + "effective_few_shots": 10.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hashes": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "6a95a1511f8da075", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-anatomy|5": { + "hashes": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "24a78edc4d9a93aa", + "hash_cont_tokens": "f11971a765cb609f" + }, + "truncated": 0, + "non-truncated": 540, + "padded": 540, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-astronomy|5": { + "hashes": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "b11106668d6c0974", + "hash_cont_tokens": "238bd86950544b29" + }, + "truncated": 0, + "non-truncated": 608, + "padded": 608, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-business_ethics|5": { + "hashes": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "10180ba12a075cb0", + "hash_cont_tokens": "f9d6d2a7d7e9a041" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hashes": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "73351ef4968750a2", + "hash_cont_tokens": "6af58623d0d5fbcd" + }, + "truncated": 0, + "non-truncated": 1060, + "padded": 1060, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_biology|5": { + "hashes": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "a539150af234c668", + "hash_cont_tokens": "875cde3af7a0ee14" + }, + "truncated": 0, + "non-truncated": 576, + "padded": 576, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_chemistry|5": { + "hashes": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "52e12e5a43bcee35", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_computer_science|5": { + "hashes": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "d1f3721a5659f7ee", + "hash_cont_tokens": "1ba0c71186b1505e" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + 
"non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_mathematics|5": { + "hashes": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "f2d78f546b5595c2", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_medicine|5": { + "hashes": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "c9cc19179f63d1d6", + "hash_cont_tokens": "702fb6d82ff0d6ac" + }, + "truncated": 0, + "non-truncated": 692, + "padded": 692, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_physics|5": { + "hashes": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "5046144e67e992e8", + "hash_cont_tokens": "f7b8097afc16a47c" + }, + "truncated": 0, + "non-truncated": 408, + "padded": 408, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-computer_security|5": { + "hashes": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "4b14581ba4fc06fc", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hashes": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "1ee52c413b5b4cc4", + "hash_cont_tokens": "aa0e8bc655f2f641" + }, + "truncated": 0, + "non-truncated": 940, + "padded": 940, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-econometrics|5": { + "hashes": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "2914077c4dd3090a", + "hash_cont_tokens": "a9b1f761089f6acc" + }, + "truncated": 0, + "non-truncated": 456, + "padded": 456, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hashes": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "0f88a874342378de", + "hash_cont_tokens": "2425a3f084a591ef" + }, + "truncated": 0, + "non-truncated": 580, + "padded": 580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hashes": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "9889933f1dd02a23", + "hash_cont_tokens": "eb2d5002052b5bc5" + }, + "truncated": 0, + "non-truncated": 1512, + "padded": 1512, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-formal_logic|5": { + "hashes": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "dc309a94c4bfdd2f", + "hash_cont_tokens": "9b30dc19c9b62f60" + }, + "truncated": 0, + "non-truncated": 504, + "padded": 504, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-global_facts|5": { + "hashes": { + 
"hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "0801a0aebec3ba8c", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_biology|5": { + "hashes": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "5bc4aca8831d9c05", + "hash_cont_tokens": "74217a4e2868536f" + }, + "truncated": 0, + "non-truncated": 1240, + "padded": 1240, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hashes": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "b92bd6b06fc3464c", + "hash_cont_tokens": "bf39544be0ebf000" + }, + "truncated": 0, + "non-truncated": 812, + "padded": 812, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hashes": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "a549346cde8165e9", + "hash_cont_tokens": "43570b3948564b64" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hashes": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "f1f73dd687da18d7", + "hash_cont_tokens": "674fc454bdc5ac93" + }, + "truncated": 660, + "non-truncated": 0, + "padded": 0, + "non-padded": 660, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_geography|5": { + "hashes": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "e7e9cf91f9d6a081", + "hash_cont_tokens": "03a5012b916274ea" + }, + "truncated": 0, + "non-truncated": 792, + "padded": 792, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hashes": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "a61a1670f854d9e1", + "hash_cont_tokens": "50ab225c2f535210" + }, + "truncated": 0, + "non-truncated": 772, + "padded": 772, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hashes": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "8a77cb7763f28110", + "hash_cont_tokens": "c583432ad27fcfe0" + }, + "truncated": 0, + "non-truncated": 1560, + "padded": 1560, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hashes": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "fcfcfae391f8faa1", + "hash_cont_tokens": "1194078d4e38c984" + }, + "truncated": 0, + "non-truncated": 1080, + "padded": 1080, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hashes": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": 
"bfc393381298609e", + "hash_input_tokens": "a29454cc1feb23ef", + "hash_cont_tokens": "f47f041de50333b9" + }, + "truncated": 0, + "non-truncated": 952, + "padded": 952, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_physics|5": { + "hashes": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "b6734a25556d75dc", + "hash_cont_tokens": "6296151cf7fee15c" + }, + "truncated": 0, + "non-truncated": 604, + "padded": 604, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hashes": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "5720438e29473426", + "hash_cont_tokens": "a490d3db0ea5935a" + }, + "truncated": 0, + "non-truncated": 2180, + "padded": 2180, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hashes": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "486321d5858de240", + "hash_cont_tokens": "6830ef7d0325d7ef" + }, + "truncated": 0, + "non-truncated": 864, + "padded": 864, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hashes": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "50c9ff438c85a69e", + "hash_cont_tokens": "cdd0b3dc06d933e5" + }, + "truncated": 816, + "non-truncated": 0, + "padded": 0, + "non-padded": 816, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hashes": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "473919e64d1b8c80", + "hash_cont_tokens": "e0203e3fc1bb0500" + }, + "truncated": 8, + "non-truncated": 940, + "padded": 940, + "non-padded": 8, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_aging|5": { + "hashes": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "47a65c81fd7ed010", + "hash_cont_tokens": "142a4a8a1138a214" + }, + "truncated": 0, + "non-truncated": 892, + "padded": 892, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_sexuality|5": { + "hashes": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "aedfcd41cbd2fcc9", + "hash_cont_tokens": "bc54813e809b796d" + }, + "truncated": 0, + "non-truncated": 524, + "padded": 524, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-international_law|5": { + "hashes": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "ed5f2414144d7b72", + "hash_cont_tokens": "63435df622d5437b" + }, + "truncated": 0, + "non-truncated": 484, + "padded": 484, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-jurisprudence|5": { + "hashes": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "692eaacb5b747264", + "hash_cont_tokens": "e3a8cd951b6e3469" + }, + "truncated": 0, + 
"non-truncated": 432, + "padded": 432, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hashes": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "2cbce4edca937588", + "hash_cont_tokens": "5e6ee2ff0404f23c" + }, + "truncated": 0, + "non-truncated": 652, + "padded": 648, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-machine_learning|5": { + "hashes": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "c2f38b19bab1aa2c", + "hash_cont_tokens": "c81919424db3b267" + }, + "truncated": 0, + "non-truncated": 448, + "padded": 448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-management|5": { + "hashes": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "fde277bc547bc3d8", + "hash_cont_tokens": "a01d6d39a83c4597" + }, + "truncated": 0, + "non-truncated": 412, + "padded": 412, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-marketing|5": { + "hashes": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "87b232bbebce39db", + "hash_cont_tokens": "6aeaed4d823c98aa" + }, + "truncated": 0, + "non-truncated": 936, + "padded": 936, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-medical_genetics|5": { + "hashes": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "58c21af9da3e126e", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-miscellaneous|5": { + "hashes": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "d1f5c770d368e9c6", + "hash_cont_tokens": "9b0ab02a64603081" + }, + "truncated": 0, + "non-truncated": 3132, + "padded": 3132, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_disputes|5": { + "hashes": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "98d6db15a50aaa8e", + "hash_cont_tokens": "3b8bbe9108e55ce9" + }, + "truncated": 0, + "non-truncated": 1384, + "padded": 1354, + "non-padded": 30, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hashes": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "2aabd8c7337502f8", + "hash_cont_tokens": "2eae753a177d5460" + }, + "truncated": 0, + "non-truncated": 3580, + "padded": 3580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-nutrition|5": { + "hashes": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "17f8c8f2d4a0a9b1", + "hash_cont_tokens": "29771089bd3c65c6" + }, + "truncated": 0, + "non-truncated": 1224, + "padded": 1224, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-philosophy|5": { + "hashes": 
{ + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "dfc6df491d991966", + "hash_cont_tokens": "9f6ff69d23a48783" + }, + "truncated": 0, + "non-truncated": 1244, + "padded": 1244, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-prehistory|5": { + "hashes": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "cffe8139e00da9dd", + "hash_cont_tokens": "a789a13af22308bf" + }, + "truncated": 0, + "non-truncated": 1296, + "padded": 1296, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_accounting|5": { + "hashes": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "4a69ed6ee55918fb", + "hash_cont_tokens": "5129a9cfb30c5239" + }, + "truncated": 0, + "non-truncated": 1128, + "padded": 1128, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_law|5": { + "hashes": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "6cc713f12b5890de", + "hash_cont_tokens": "2e590029ef41fbcd" + }, + "truncated": 604, + "non-truncated": 5532, + "padded": 5524, + "non-padded": 612, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_medicine|5": { + "hashes": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "b4044fc92756c377", + "hash_cont_tokens": "cd82e108370cece8" + }, + "truncated": 0, + "non-truncated": 1088, + "padded": 1088, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_psychology|5": { + "hashes": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "b019784da8db089a", + "hash_cont_tokens": "61ef0c8a87f9c92d" + }, + "truncated": 0, + "non-truncated": 2448, + "padded": 2448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-public_relations|5": { + "hashes": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "f47f37c7c9bfc601", + "hash_cont_tokens": "568f585a259965c1" + }, + "truncated": 0, + "non-truncated": 440, + "padded": 440, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-security_studies|5": { + "hashes": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "4d282718d6142410", + "hash_cont_tokens": "d70cfe096d4fb7bd" + }, + "truncated": 0, + "non-truncated": 980, + "padded": 980, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-sociology|5": { + "hashes": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "fbc6026e500537bc", + "hash_cont_tokens": "c3a3bdfd177eed5b" + }, + "truncated": 0, + "non-truncated": 804, + "padded": 804, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hashes": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "150dd1ff81ff642e", + 
"hash_cont_tokens": "2568d0e8e36fa959" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-virology|5": { + "hashes": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "fcbac3e735545969", + "hash_cont_tokens": "c178cccd753d9bc5" + }, + "truncated": 0, + "non-truncated": 664, + "padded": 664, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-world_religions|5": { + "hashes": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "ffc962a38441ef13", + "hash_cont_tokens": "0a3a3ea5ef49d19c" + }, + "truncated": 0, + "non-truncated": 684, + "padded": 684, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|truthfulqa:mc|0": { + "hashes": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "9ffb65d225ae550f", + "hash_cont_tokens": "6d1691881e252df0" + }, + "truncated": 0, + "non-truncated": 9996, + "padded": 9996, + "non-padded": 0, + "effective_few_shots": 0.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "d84d18e9a963753d", + "hash_full_prompts": "12b540783521a8e6", + "hash_input_tokens": "1c61d6705b299f5c", + "hash_cont_tokens": "f4b7b7f3a2788768" + }, + "total_evaluation_time_secondes": "7065.078456878662", + "truncated": 2088, + "non-truncated": 108931, + "padded": 108834, + "non-padded": 2185, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/chavinlo/gpt4-x-alpaca/results_2023-09-22T20-56-09.987040.json b/eval-results/chavinlo/gpt4-x-alpaca/results_2023-09-22T20-56-09.987040.json new file mode 100644 index 0000000000000000000000000000000000000000..6c1ca3e7f64a1850f8ad314251545acaaf5b0455 --- /dev/null +++ b/eval-results/chavinlo/gpt4-x-alpaca/results_2023-09-22T20-56-09.987040.json @@ -0,0 +1,107 @@ +{ + "config_general": { + "model_name": "chavinlo/gpt4-x-alpaca", + "model_sha": "6a571f458cab9a23d14324ec63e0abd1744c8353", + "model_size": "24.28 GB", + "model_dtype": "torch.float16", + "lighteval_sha": "0f318ecf002208468154899217b3ba7c6ae09374", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|drop|3": { + "em": 0.15478187919463088, + "em_stderr": 0.003704111989193061, + "f1": 0.24988045302013467, + "f1_stderr": 0.00385619985047934 + }, + "harness|gsm8k|5": { + "acc": 0.028051554207733132, + "acc_stderr": 0.004548229533836362 + }, + "harness|winogrande|5": { + "acc": 0.7016574585635359, + "acc_stderr": 0.012858885010030421 + }, + "all": { + "em": 0.15478187919463088, + "em_stderr": 0.003704111989193061, + "f1": 0.24988045302013467, + "f1_stderr": 0.00385619985047934, + "acc": 0.3648545063856345, + "acc_stderr": 0.008703557271933391 + } + }, + "versions": { + "harness|drop|3": 1, + "harness|gsm8k|5": 0, + "harness|winogrande|5": 0, + "all": 0 + }, + "config_tasks": { + "harness|drop": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|drop|3": { + "hashes": { + "hash_examples": "1d27416e8324e9a3", + "hash_full_prompts": "a5513ff9a741b385", + "hash_input_tokens": "f70227603c1b1bfe", + "hash_cont_tokens": "030717cde54f2dff" + }, + 
"truncated": 1263, + "non-truncated": 8273, + "padded": 0, + "non-padded": 9536, + "effective_few_shots": 3.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "e3d5b3003c52b880", + "hash_cont_tokens": "5d7d3555e4c882cf" + }, + "truncated": 0, + "non-truncated": 1319, + "padded": 0, + "non-padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "5be2b0947cee07a9", + "hash_cont_tokens": "f08975ad6f2d5864" + }, + "truncated": 0, + "non-truncated": 2534, + "padded": 2432, + "non-padded": 102, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "9b4d8993161e637d", + "hash_full_prompts": "08215e527b7e60a5", + "hash_input_tokens": "ce9af2df9f2847fa", + "hash_cont_tokens": "be421a34d9b78c47" + }, + "total_evaluation_time_secondes": "14067.204483509064", + "truncated": 1263, + "non-truncated": 12126, + "padded": 2432, + "non-padded": 10957, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/clibrain/Llama-2-13b-ft-instruct-es/results_2023-08-19T16-28-48.911941.json b/eval-results/clibrain/Llama-2-13b-ft-instruct-es/results_2023-08-19T16-28-48.911941.json new file mode 100644 index 0000000000000000000000000000000000000000..178228a59468f8a6c543c60a5bb93b6a70b1cc45 --- /dev/null +++ b/eval-results/clibrain/Llama-2-13b-ft-instruct-es/results_2023-08-19T16-28-48.911941.json @@ -0,0 +1,1365 @@ +{ + "results": { + "harness|arc:challenge|25": { + "acc": 0.5580204778156996, + "acc_stderr": 0.014512682523128342, + "acc_norm": 0.5938566552901023, + "acc_norm_stderr": 0.014351656690097863 + }, + "harness|hellaswag|10": { + "acc": 0.6141206930890261, + "acc_stderr": 0.004858074013443993, + "acc_norm": 0.8150766779525991, + "acc_norm_stderr": 0.003874419065658617 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.31, + "acc_stderr": 0.04648231987117316, + "acc_norm": 0.31, + "acc_norm_stderr": 0.04648231987117316 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.48148148148148145, + "acc_stderr": 0.043163785995113245, + "acc_norm": 0.48148148148148145, + "acc_norm_stderr": 0.043163785995113245 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.5, + "acc_stderr": 0.04068942293855797, + "acc_norm": 0.5, + "acc_norm_stderr": 0.04068942293855797 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.53, + "acc_stderr": 0.05016135580465919, + "acc_norm": 0.53, + "acc_norm_stderr": 0.05016135580465919 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.5245283018867924, + "acc_stderr": 0.030735822206205608, + "acc_norm": 0.5245283018867924, + "acc_norm_stderr": 0.030735822206205608 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.5486111111111112, + "acc_stderr": 0.04161402398403279, + "acc_norm": 0.5486111111111112, + "acc_norm_stderr": 0.04161402398403279 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.39, + "acc_stderr": 0.04902071300001975, + "acc_norm": 0.39, + "acc_norm_stderr": 0.04902071300001975 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.5, + "acc_stderr": 0.050251890762960605, + "acc_norm": 0.5, + "acc_norm_stderr": 0.050251890762960605 + }, + "harness|hendrycksTest-college_mathematics|5": { + 
"acc": 0.33, + "acc_stderr": 0.047258156262526045, + "acc_norm": 0.33, + "acc_norm_stderr": 0.047258156262526045 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.4797687861271676, + "acc_stderr": 0.03809342081273957, + "acc_norm": 0.4797687861271676, + "acc_norm_stderr": 0.03809342081273957 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.2549019607843137, + "acc_stderr": 0.0433643270799318, + "acc_norm": 0.2549019607843137, + "acc_norm_stderr": 0.0433643270799318 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.71, + "acc_stderr": 0.045604802157206845, + "acc_norm": 0.71, + "acc_norm_stderr": 0.045604802157206845 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.4425531914893617, + "acc_stderr": 0.03246956919789958, + "acc_norm": 0.4425531914893617, + "acc_norm_stderr": 0.03246956919789958 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.30701754385964913, + "acc_stderr": 0.043391383225798615, + "acc_norm": 0.30701754385964913, + "acc_norm_stderr": 0.043391383225798615 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.5172413793103449, + "acc_stderr": 0.04164188720169375, + "acc_norm": 0.5172413793103449, + "acc_norm_stderr": 0.04164188720169375 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.3412698412698413, + "acc_stderr": 0.024419234966819064, + "acc_norm": 0.3412698412698413, + "acc_norm_stderr": 0.024419234966819064 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.38095238095238093, + "acc_stderr": 0.043435254289490965, + "acc_norm": 0.38095238095238093, + "acc_norm_stderr": 0.043435254289490965 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.35, + "acc_stderr": 0.047937248544110196, + "acc_norm": 0.35, + "acc_norm_stderr": 0.047937248544110196 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.6419354838709678, + "acc_stderr": 0.027273890594300645, + "acc_norm": 0.6419354838709678, + "acc_norm_stderr": 0.027273890594300645 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.43349753694581283, + "acc_stderr": 0.03486731727419872, + "acc_norm": 0.43349753694581283, + "acc_norm_stderr": 0.03486731727419872 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.63, + "acc_stderr": 0.048523658709391, + "acc_norm": 0.63, + "acc_norm_stderr": 0.048523658709391 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.6606060606060606, + "acc_stderr": 0.03697442205031596, + "acc_norm": 0.6606060606060606, + "acc_norm_stderr": 0.03697442205031596 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.6767676767676768, + "acc_stderr": 0.03332299921070644, + "acc_norm": 0.6767676767676768, + "acc_norm_stderr": 0.03332299921070644 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.7668393782383419, + "acc_stderr": 0.03051611137147602, + "acc_norm": 0.7668393782383419, + "acc_norm_stderr": 0.03051611137147602 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.5153846153846153, + "acc_stderr": 0.025339003010106515, + "acc_norm": 0.5153846153846153, + "acc_norm_stderr": 0.025339003010106515 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.2777777777777778, + "acc_stderr": 0.02730914058823018, + "acc_norm": 0.2777777777777778, + "acc_norm_stderr": 0.02730914058823018 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.5546218487394958, + "acc_stderr": 0.032284106267163895, + 
"acc_norm": 0.5546218487394958, + "acc_norm_stderr": 0.032284106267163895 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.3509933774834437, + "acc_stderr": 0.03896981964257375, + "acc_norm": 0.3509933774834437, + "acc_norm_stderr": 0.03896981964257375 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.7064220183486238, + "acc_stderr": 0.019525151122639667, + "acc_norm": 0.7064220183486238, + "acc_norm_stderr": 0.019525151122639667 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.44907407407407407, + "acc_stderr": 0.03392238405321616, + "acc_norm": 0.44907407407407407, + "acc_norm_stderr": 0.03392238405321616 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.7401960784313726, + "acc_stderr": 0.03077855467869326, + "acc_norm": 0.7401960784313726, + "acc_norm_stderr": 0.03077855467869326 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.7257383966244726, + "acc_stderr": 0.029041333510598035, + "acc_norm": 0.7257383966244726, + "acc_norm_stderr": 0.029041333510598035 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.6547085201793722, + "acc_stderr": 0.03191100192835794, + "acc_norm": 0.6547085201793722, + "acc_norm_stderr": 0.03191100192835794 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.5572519083969466, + "acc_stderr": 0.0435644720266507, + "acc_norm": 0.5572519083969466, + "acc_norm_stderr": 0.0435644720266507 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.6694214876033058, + "acc_stderr": 0.04294340845212094, + "acc_norm": 0.6694214876033058, + "acc_norm_stderr": 0.04294340845212094 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.6666666666666666, + "acc_stderr": 0.04557239513497751, + "acc_norm": 0.6666666666666666, + "acc_norm_stderr": 0.04557239513497751 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.6073619631901841, + "acc_stderr": 0.03836740907831029, + "acc_norm": 0.6073619631901841, + "acc_norm_stderr": 0.03836740907831029 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.33035714285714285, + "acc_stderr": 0.04464285714285714, + "acc_norm": 0.33035714285714285, + "acc_norm_stderr": 0.04464285714285714 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.7378640776699029, + "acc_stderr": 0.043546310772605956, + "acc_norm": 0.7378640776699029, + "acc_norm_stderr": 0.043546310772605956 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.7521367521367521, + "acc_stderr": 0.028286324075564386, + "acc_norm": 0.7521367521367521, + "acc_norm_stderr": 0.028286324075564386 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.63, + "acc_stderr": 0.04852365870939098, + "acc_norm": 0.63, + "acc_norm_stderr": 0.04852365870939098 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.7266922094508301, + "acc_stderr": 0.015936681062628556, + "acc_norm": 0.7266922094508301, + "acc_norm_stderr": 0.015936681062628556 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.5982658959537572, + "acc_stderr": 0.02639410417764363, + "acc_norm": 0.5982658959537572, + "acc_norm_stderr": 0.02639410417764363 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.3642458100558659, + "acc_stderr": 0.016094338768474596, + "acc_norm": 0.3642458100558659, + "acc_norm_stderr": 0.016094338768474596 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.5947712418300654, + "acc_stderr": 0.028110928492809068, + "acc_norm": 0.5947712418300654, + "acc_norm_stderr": 0.028110928492809068 + 
}, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.6270096463022508, + "acc_stderr": 0.0274666102131401, + "acc_norm": 0.6270096463022508, + "acc_norm_stderr": 0.0274666102131401 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.595679012345679, + "acc_stderr": 0.027306625297327688, + "acc_norm": 0.595679012345679, + "acc_norm_stderr": 0.027306625297327688 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.3723404255319149, + "acc_stderr": 0.02883892147125146, + "acc_norm": 0.3723404255319149, + "acc_norm_stderr": 0.02883892147125146 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.41134289439374183, + "acc_stderr": 0.012567882673803684, + "acc_norm": 0.41134289439374183, + "acc_norm_stderr": 0.012567882673803684 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.5220588235294118, + "acc_stderr": 0.030343264224213514, + "acc_norm": 0.5220588235294118, + "acc_norm_stderr": 0.030343264224213514 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.5441176470588235, + "acc_stderr": 0.020148939420415745, + "acc_norm": 0.5441176470588235, + "acc_norm_stderr": 0.020148939420415745 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.6090909090909091, + "acc_stderr": 0.046737523336702384, + "acc_norm": 0.6090909090909091, + "acc_norm_stderr": 0.046737523336702384 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.6, + "acc_stderr": 0.031362502409358936, + "acc_norm": 0.6, + "acc_norm_stderr": 0.031362502409358936 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.7910447761194029, + "acc_stderr": 0.028748298931728655, + "acc_norm": 0.7910447761194029, + "acc_norm_stderr": 0.028748298931728655 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.82, + "acc_stderr": 0.038612291966536934, + "acc_norm": 0.82, + "acc_norm_stderr": 0.038612291966536934 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.41566265060240964, + "acc_stderr": 0.03836722176598052, + "acc_norm": 0.41566265060240964, + "acc_norm_stderr": 0.03836722176598052 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.7309941520467836, + "acc_stderr": 0.034010526201040885, + "acc_norm": 0.7309941520467836, + "acc_norm_stderr": 0.034010526201040885 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.2484700122399021, + "mc1_stderr": 0.015127427096520677, + "mc2": 0.3781361627899956, + "mc2_stderr": 0.014317410058710695 + }, + "all": { + "acc": 0.5445662048386786, + "acc_stderr": 0.03468064007889854, + "acc_norm": 0.5485796313189832, + "acc_norm_stderr": 0.03466123870973217, + "mc1": 0.2484700122399021, + "mc1_stderr": 0.015127427096520677, + "mc2": 0.3781361627899956, + "mc2_stderr": 0.014317410058710695 + } + }, + "versions": { + "harness|arc:challenge|25": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + 
"harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "all": 0 + }, + "config_general": { + "model_name": "clibrain/Llama-2-13b-ft-instruct-es", + "model_sha": "772b53f64f484fa0d651d453bcefc35a0f52f251", + "model_dtype": "torch.float16", + "lighteval_sha": "2b9e1cf249accf9b8168101189269701a82bfb9c", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null + }, + "config_tasks": { + "harness|arc:challenge": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness 
task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task" + }, + "summary_tasks": { + "harness|arc:challenge|25": { + "hashes": { + "hash_examples": "17b0cae357c0259e", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "3722289b79076c44", + "hash_cont_tokens": "8210decc6ff6f7df" + }, + "truncated": 0, + "non-truncated": 4687, + "padded": 4687, + "non-padded": 0, + "effective_few_shots": 25.0, + "num_truncated_few_shots": 0 + }, + "harness|hellaswag|10": { + "hashes": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "ececd684171f1ef2", + "hash_cont_tokens": "b3b9e9017afa63af" + }, + "truncated": 0, + "non-truncated": 40168, + "padded": 40113, + "non-padded": 55, + "effective_few_shots": 10.0, 
+ "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hashes": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "c54ff61ad0273dd7", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-anatomy|5": { + "hashes": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "be31a1e22aef5f90", + "hash_cont_tokens": "f11971a765cb609f" + }, + "truncated": 0, + "non-truncated": 540, + "padded": 540, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-astronomy|5": { + "hashes": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "277a7b1fad566940", + "hash_cont_tokens": "bf30e5d3f48250cb" + }, + "truncated": 0, + "non-truncated": 608, + "padded": 608, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-business_ethics|5": { + "hashes": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "ba552605bc116de5", + "hash_cont_tokens": "bc1dd9b2d995eb61" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hashes": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "428c7563d0b98ab9", + "hash_cont_tokens": "890a119624b3b935" + }, + "truncated": 0, + "non-truncated": 1060, + "padded": 1060, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_biology|5": { + "hashes": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "da036601573942e2", + "hash_cont_tokens": "875cde3af7a0ee14" + }, + "truncated": 0, + "non-truncated": 576, + "padded": 576, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_chemistry|5": { + "hashes": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "94e0196d6aded13d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_computer_science|5": { + "hashes": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "6e4d0f4a8d36690b", + "hash_cont_tokens": "ffc0fe414cdc4a83" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_mathematics|5": { + "hashes": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "614054d17109a25d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_medicine|5": { + "hashes": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": 
"ad2a53e5250ab46e", + "hash_input_tokens": "081bb2b524defd1c", + "hash_cont_tokens": "1f88b00d41957d82" + }, + "truncated": 0, + "non-truncated": 692, + "padded": 692, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_physics|5": { + "hashes": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "5421d9a1af86cbd4", + "hash_cont_tokens": "f7b8097afc16a47c" + }, + "truncated": 0, + "non-truncated": 408, + "padded": 408, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-computer_security|5": { + "hashes": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "5e6b70ecb333cf18", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hashes": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "c2ef11a87264ceed", + "hash_cont_tokens": "aa0e8bc655f2f641" + }, + "truncated": 0, + "non-truncated": 940, + "padded": 940, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-econometrics|5": { + "hashes": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "ecaccd912a4c3978", + "hash_cont_tokens": "bfb7e3c3c88313f1" + }, + "truncated": 0, + "non-truncated": 456, + "padded": 456, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hashes": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "1590c84291399be8", + "hash_cont_tokens": "2425a3f084a591ef" + }, + "truncated": 0, + "non-truncated": 580, + "padded": 580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hashes": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "3269597f715b0da1", + "hash_cont_tokens": "f52691aef15a407b" + }, + "truncated": 0, + "non-truncated": 1512, + "padded": 1512, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-formal_logic|5": { + "hashes": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "a2800d20f3ab8d7c", + "hash_cont_tokens": "f515d598d9c21263" + }, + "truncated": 0, + "non-truncated": 504, + "padded": 504, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-global_facts|5": { + "hashes": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "94ed44b3772505ad", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_biology|5": { + "hashes": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "24423acb928db768", + "hash_cont_tokens": "bd85a4156a3613ee" + }, + "truncated": 0, + "non-truncated": 1240, + 
"padded": 1240, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hashes": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "831ff35c474e5cef", + "hash_cont_tokens": "a95c97af1c14e068" + }, + "truncated": 0, + "non-truncated": 812, + "padded": 812, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hashes": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "a20a96b44dcc5b30", + "hash_cont_tokens": "8abfedef914e33c9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hashes": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "5002f4ac8b1562ca", + "hash_cont_tokens": "674fc454bdc5ac93" + }, + "truncated": 0, + "non-truncated": 660, + "padded": 656, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_geography|5": { + "hashes": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "7c5547c7da5bc793", + "hash_cont_tokens": "03a5012b916274ea" + }, + "truncated": 0, + "non-truncated": 792, + "padded": 792, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hashes": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "f62991cb6a496b05", + "hash_cont_tokens": "a83effb8f76b7d7c" + }, + "truncated": 0, + "non-truncated": 772, + "padded": 772, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hashes": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "4cef2aff6e3d59ed", + "hash_cont_tokens": "c583432ad27fcfe0" + }, + "truncated": 0, + "non-truncated": 1560, + "padded": 1560, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hashes": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "6e2577ea4082ed2b", + "hash_cont_tokens": "24f5dc613660300b" + }, + "truncated": 0, + "non-truncated": 1080, + "padded": 1080, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hashes": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "c5fc9aeb1079c8e4", + "hash_cont_tokens": "f47f041de50333b9" + }, + "truncated": 0, + "non-truncated": 952, + "padded": 952, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_physics|5": { + "hashes": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "555fc385cffa84ca", + "hash_cont_tokens": "ba2efcd283e938cc" + }, + "truncated": 0, + "non-truncated": 604, + "padded": 604, + "non-padded": 0, + "effective_few_shots": 5.0, + 
"num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hashes": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "febd23cbf9973b7f", + "hash_cont_tokens": "942069cd363844d9" + }, + "truncated": 0, + "non-truncated": 2180, + "padded": 2180, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hashes": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "400e55b56ee6fbd7", + "hash_cont_tokens": "955ed42b6f7fa019" + }, + "truncated": 0, + "non-truncated": 864, + "padded": 864, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hashes": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "c639cce12a46ebad", + "hash_cont_tokens": "cdd0b3dc06d933e5" + }, + "truncated": 0, + "non-truncated": 816, + "padded": 816, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hashes": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "b9762065cce6f3a6", + "hash_cont_tokens": "9a864184946033ac" + }, + "truncated": 0, + "non-truncated": 948, + "padded": 948, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_aging|5": { + "hashes": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "541a75f071dcf579", + "hash_cont_tokens": "142a4a8a1138a214" + }, + "truncated": 0, + "non-truncated": 892, + "padded": 892, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_sexuality|5": { + "hashes": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "04269e5c5a257dd9", + "hash_cont_tokens": "bc54813e809b796d" + }, + "truncated": 0, + "non-truncated": 524, + "padded": 524, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-international_law|5": { + "hashes": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "d93ba9d9d38e4397", + "hash_cont_tokens": "dc45b45fcda18e5d" + }, + "truncated": 0, + "non-truncated": 484, + "padded": 484, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-jurisprudence|5": { + "hashes": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "9eeaccd2698b4f5a", + "hash_cont_tokens": "e3a8cd951b6e3469" + }, + "truncated": 0, + "non-truncated": 432, + "padded": 432, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hashes": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "b4f08f544f2b7576", + "hash_cont_tokens": "1e80dbd30f6453d5" + }, + "truncated": 0, + "non-truncated": 652, + "padded": 648, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-machine_learning|5": { + "hashes": { + "hash_examples": "88f22a636029ae47", + 
"hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "900c2a51f1174b9f", + "hash_cont_tokens": "9b37da7777378ca9" + }, + "truncated": 0, + "non-truncated": 448, + "padded": 448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-management|5": { + "hashes": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "6b36efb4689c6eca", + "hash_cont_tokens": "a01d6d39a83c4597" + }, + "truncated": 0, + "non-truncated": 412, + "padded": 412, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-marketing|5": { + "hashes": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "2aaac78a0cfed47a", + "hash_cont_tokens": "6aeaed4d823c98aa" + }, + "truncated": 0, + "non-truncated": 936, + "padded": 936, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-medical_genetics|5": { + "hashes": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "886ca823b41c094a", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-miscellaneous|5": { + "hashes": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "72fd71de7675e7d0", + "hash_cont_tokens": "9b0ab02a64603081" + }, + "truncated": 0, + "non-truncated": 3132, + "padded": 3132, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_disputes|5": { + "hashes": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "f3ca0dd8e7a1eb09", + "hash_cont_tokens": "8badf768f7b0467a" + }, + "truncated": 0, + "non-truncated": 1384, + "padded": 1354, + "non-padded": 30, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hashes": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "3e793631e951f23c", + "hash_cont_tokens": "32ae620376b2bbba" + }, + "truncated": 0, + "non-truncated": 3580, + "padded": 3580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-nutrition|5": { + "hashes": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "59753c2144ea93af", + "hash_cont_tokens": "3071def75bacc404" + }, + "truncated": 0, + "non-truncated": 1224, + "padded": 1224, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-philosophy|5": { + "hashes": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "bd8d3dbed15a8c34", + "hash_cont_tokens": "9f6ff69d23a48783" + }, + "truncated": 0, + "non-truncated": 1244, + "padded": 1244, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-prehistory|5": { + "hashes": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "3573cd87facbb7c5", + "hash_cont_tokens": "de469d2b981e32a3" + }, + "truncated": 0, + "non-truncated": 1296, + "padded": 
1296, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_accounting|5": { + "hashes": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "17e721bc1a7cbb47", + "hash_cont_tokens": "c46f74d2dfc7b13b" + }, + "truncated": 0, + "non-truncated": 1128, + "padded": 1128, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_law|5": { + "hashes": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "c9f7583fff66d361", + "hash_cont_tokens": "2e590029ef41fbcd" + }, + "truncated": 0, + "non-truncated": 6136, + "padded": 6136, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_medicine|5": { + "hashes": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "40a933f829116f8d", + "hash_cont_tokens": "fe35cfa9c6ca802e" + }, + "truncated": 0, + "non-truncated": 1088, + "padded": 1088, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_psychology|5": { + "hashes": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "0dfb73a8eb3f692c", + "hash_cont_tokens": "f020fbddf72c8652" + }, + "truncated": 0, + "non-truncated": 2448, + "padded": 2448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-public_relations|5": { + "hashes": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "1710c6ba4c9f3cbd", + "hash_cont_tokens": "568f585a259965c1" + }, + "truncated": 0, + "non-truncated": 440, + "padded": 440, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-security_studies|5": { + "hashes": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "32a03f1f22a6e103", + "hash_cont_tokens": "cc6fd7cccd64cd5d" + }, + "truncated": 0, + "non-truncated": 980, + "padded": 980, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-sociology|5": { + "hashes": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "828999f7624cbe7e", + "hash_cont_tokens": "c3a3bdfd177eed5b" + }, + "truncated": 0, + "non-truncated": 804, + "padded": 804, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hashes": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "42054621e718dbee", + "hash_cont_tokens": "2568d0e8e36fa959" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-virology|5": { + "hashes": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "6c4f0aa4dc859c04", + "hash_cont_tokens": "926cf60b0891f374" + }, + "truncated": 0, + "non-truncated": 664, + "padded": 664, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-world_religions|5": { + 
"hashes": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "6c75d44e092ff24f", + "hash_cont_tokens": "c525a5de974c1ea3" + }, + "truncated": 0, + "non-truncated": 684, + "padded": 684, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|truthfulqa:mc|0": { + "hashes": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "2738d7ed7075faa7", + "hash_cont_tokens": "c014154380b74b9e" + }, + "truncated": 0, + "non-truncated": 9996, + "padded": 9996, + "non-padded": 0, + "effective_few_shots": 0.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "d84d18e9a963753d", + "hash_full_prompts": "12b540783521a8e6", + "hash_input_tokens": "5c73a7dce6ccf737", + "hash_cont_tokens": "fb1646e2bdd5fc38" + }, + "total_evaluation_time_secondes": "6265.296881437302", + "truncated": 0, + "non-truncated": 111019, + "padded": 110926, + "non-padded": 93, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/clibrain/Llama-2-13b-ft-instruct-es/results_2023-09-18T03-06-46.998156.json b/eval-results/clibrain/Llama-2-13b-ft-instruct-es/results_2023-09-18T03-06-46.998156.json new file mode 100644 index 0000000000000000000000000000000000000000..d28c345c660317b4a08005ebfc87672f02351925 --- /dev/null +++ b/eval-results/clibrain/Llama-2-13b-ft-instruct-es/results_2023-09-18T03-06-46.998156.json @@ -0,0 +1,107 @@ +{ + "config_general": { + "model_name": "clibrain/Llama-2-13b-ft-instruct-es", + "model_sha": "85863593f69dccb86c18e37973769d1dcab4503f", + "model_size": "24.32 GB", + "model_dtype": "torch.float16", + "lighteval_sha": "0f318ecf002208468154899217b3ba7c6ae09374", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|drop|3": { + "em": 0.0024119127516778523, + "em_stderr": 0.0005023380498893348, + "f1": 0.0655463506711411, + "f1_stderr": 0.0014039891922809947 + }, + "harness|gsm8k|5": { + "acc": 0.08567096285064443, + "acc_stderr": 0.007709218855882782 + }, + "harness|winogrande|5": { + "acc": 0.7576953433307024, + "acc_stderr": 0.012042352526174787 + }, + "all": { + "em": 0.0024119127516778523, + "em_stderr": 0.0005023380498893348, + "f1": 0.0655463506711411, + "f1_stderr": 0.0014039891922809947, + "acc": 0.42168315309067345, + "acc_stderr": 0.009875785691028784 + } + }, + "versions": { + "harness|drop|3": 1, + "harness|gsm8k|5": 0, + "harness|winogrande|5": 0, + "all": 0 + }, + "config_tasks": { + "harness|drop": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|drop|3": { + "hashes": { + "hash_examples": "1d27416e8324e9a3", + "hash_full_prompts": "a5513ff9a741b385", + "hash_input_tokens": "42076f0efbb50aa6", + "hash_cont_tokens": "ec71f85a28aaa113" + }, + "truncated": 3, + "non-truncated": 9533, + "padded": 0, + "non-padded": 9536, + "effective_few_shots": 3.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "bda342e47b5099b2", + "hash_cont_tokens": "9217140379916264" + }, + "truncated": 0, + "non-truncated": 1319, + "padded": 0, + "non-padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + 
"hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "c0bedf98cb040854", + "hash_cont_tokens": "f08975ad6f2d5864" + }, + "truncated": 0, + "non-truncated": 2534, + "padded": 2432, + "non-padded": 102, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "9b4d8993161e637d", + "hash_full_prompts": "08215e527b7e60a5", + "hash_input_tokens": "a12f3e3c934bd78b", + "hash_cont_tokens": "5c1cbe81b111436d" + }, + "total_evaluation_time_secondes": "12715.394479751587", + "truncated": 3, + "non-truncated": 13386, + "padded": 2432, + "non-padded": 10957, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/clibrain/Llama-2-7b-ft-instruct-es/results_2023-08-09T22-51-22.839971.json b/eval-results/clibrain/Llama-2-7b-ft-instruct-es/results_2023-08-09T22-51-22.839971.json new file mode 100644 index 0000000000000000000000000000000000000000..0ee9c1bada4fac4ebd6d3073aa1ec7fc85e40bc3 --- /dev/null +++ b/eval-results/clibrain/Llama-2-7b-ft-instruct-es/results_2023-08-09T22-51-22.839971.json @@ -0,0 +1,1365 @@ +{ + "results": { + "harness|arc:challenge|25": { + "acc": 0.507679180887372, + "acc_stderr": 0.01460966744089257, + "acc_norm": 0.5366894197952219, + "acc_norm_stderr": 0.014572000527756993 + }, + "harness|hellaswag|10": { + "acc": 0.5840470025891257, + "acc_stderr": 0.0049187816623739436, + "acc_norm": 0.7783310097590121, + "acc_norm_stderr": 0.004145206350032313 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.31, + "acc_stderr": 0.04648231987117316, + "acc_norm": 0.31, + "acc_norm_stderr": 0.04648231987117316 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.4740740740740741, + "acc_stderr": 0.04313531696750575, + "acc_norm": 0.4740740740740741, + "acc_norm_stderr": 0.04313531696750575 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.42105263157894735, + "acc_stderr": 0.040179012759817494, + "acc_norm": 0.42105263157894735, + "acc_norm_stderr": 0.040179012759817494 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.47, + "acc_stderr": 0.050161355804659205, + "acc_norm": 0.47, + "acc_norm_stderr": 0.050161355804659205 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.5132075471698113, + "acc_stderr": 0.030762134874500482, + "acc_norm": 0.5132075471698113, + "acc_norm_stderr": 0.030762134874500482 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.4861111111111111, + "acc_stderr": 0.04179596617581, + "acc_norm": 0.4861111111111111, + "acc_norm_stderr": 0.04179596617581 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.35, + "acc_stderr": 0.047937248544110196, + "acc_norm": 0.35, + "acc_norm_stderr": 0.047937248544110196 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.47, + "acc_stderr": 0.05016135580465919, + "acc_norm": 0.47, + "acc_norm_stderr": 0.05016135580465919 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.38, + "acc_stderr": 0.048783173121456316, + "acc_norm": 0.38, + "acc_norm_stderr": 0.048783173121456316 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.41040462427745666, + "acc_stderr": 0.03750757044895536, + "acc_norm": 0.41040462427745666, + "acc_norm_stderr": 0.03750757044895536 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.1568627450980392, + "acc_stderr": 0.036186648199362466, + "acc_norm": 0.1568627450980392, + "acc_norm_stderr": 0.036186648199362466 + }, + 
"harness|hendrycksTest-computer_security|5": { + "acc": 0.6, + "acc_stderr": 0.04923659639173309, + "acc_norm": 0.6, + "acc_norm_stderr": 0.04923659639173309 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.41702127659574467, + "acc_stderr": 0.032232762667117124, + "acc_norm": 0.41702127659574467, + "acc_norm_stderr": 0.032232762667117124 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.2719298245614035, + "acc_stderr": 0.04185774424022056, + "acc_norm": 0.2719298245614035, + "acc_norm_stderr": 0.04185774424022056 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.3931034482758621, + "acc_stderr": 0.0407032901370707, + "acc_norm": 0.3931034482758621, + "acc_norm_stderr": 0.0407032901370707 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.29894179894179895, + "acc_stderr": 0.02357760479165581, + "acc_norm": 0.29894179894179895, + "acc_norm_stderr": 0.02357760479165581 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.35714285714285715, + "acc_stderr": 0.04285714285714281, + "acc_norm": 0.35714285714285715, + "acc_norm_stderr": 0.04285714285714281 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.32, + "acc_stderr": 0.046882617226215034, + "acc_norm": 0.32, + "acc_norm_stderr": 0.046882617226215034 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.5, + "acc_stderr": 0.028444006199428714, + "acc_norm": 0.5, + "acc_norm_stderr": 0.028444006199428714 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.3054187192118227, + "acc_stderr": 0.03240661565868407, + "acc_norm": 0.3054187192118227, + "acc_norm_stderr": 0.03240661565868407 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.41, + "acc_stderr": 0.04943110704237102, + "acc_norm": 0.41, + "acc_norm_stderr": 0.04943110704237102 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.6, + "acc_stderr": 0.03825460278380025, + "acc_norm": 0.6, + "acc_norm_stderr": 0.03825460278380025 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.5656565656565656, + "acc_stderr": 0.03531505879359183, + "acc_norm": 0.5656565656565656, + "acc_norm_stderr": 0.03531505879359183 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.6787564766839378, + "acc_stderr": 0.033699508685490674, + "acc_norm": 0.6787564766839378, + "acc_norm_stderr": 0.033699508685490674 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.441025641025641, + "acc_stderr": 0.025174048384000763, + "acc_norm": 0.441025641025641, + "acc_norm_stderr": 0.025174048384000763 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.26666666666666666, + "acc_stderr": 0.02696242432507384, + "acc_norm": 0.26666666666666666, + "acc_norm_stderr": 0.02696242432507384 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.4369747899159664, + "acc_stderr": 0.03221943636566196, + "acc_norm": 0.4369747899159664, + "acc_norm_stderr": 0.03221943636566196 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.31125827814569534, + "acc_stderr": 0.037804458505267334, + "acc_norm": 0.31125827814569534, + "acc_norm_stderr": 0.037804458505267334 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.6275229357798165, + "acc_stderr": 0.0207283684576385, + "acc_norm": 0.6275229357798165, + "acc_norm_stderr": 0.0207283684576385 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.3101851851851852, + "acc_stderr": 
0.03154696285656628, + "acc_norm": 0.3101851851851852, + "acc_norm_stderr": 0.03154696285656628 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.5882352941176471, + "acc_stderr": 0.03454236585380609, + "acc_norm": 0.5882352941176471, + "acc_norm_stderr": 0.03454236585380609 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.6413502109704642, + "acc_stderr": 0.031219569445301843, + "acc_norm": 0.6413502109704642, + "acc_norm_stderr": 0.031219569445301843 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.5605381165919282, + "acc_stderr": 0.033310925110381785, + "acc_norm": 0.5605381165919282, + "acc_norm_stderr": 0.033310925110381785 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.5267175572519084, + "acc_stderr": 0.04379024936553894, + "acc_norm": 0.5267175572519084, + "acc_norm_stderr": 0.04379024936553894 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.6446280991735537, + "acc_stderr": 0.0436923632657398, + "acc_norm": 0.6446280991735537, + "acc_norm_stderr": 0.0436923632657398 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.5, + "acc_stderr": 0.04833682445228318, + "acc_norm": 0.5, + "acc_norm_stderr": 0.04833682445228318 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.5276073619631901, + "acc_stderr": 0.0392237829061099, + "acc_norm": 0.5276073619631901, + "acc_norm_stderr": 0.0392237829061099 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.35714285714285715, + "acc_stderr": 0.04547960999764376, + "acc_norm": 0.35714285714285715, + "acc_norm_stderr": 0.04547960999764376 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.5922330097087378, + "acc_stderr": 0.04865777570410769, + "acc_norm": 0.5922330097087378, + "acc_norm_stderr": 0.04865777570410769 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.6965811965811965, + "acc_stderr": 0.030118210106942656, + "acc_norm": 0.6965811965811965, + "acc_norm_stderr": 0.030118210106942656 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.49, + "acc_stderr": 0.05024183937956911, + "acc_norm": 0.49, + "acc_norm_stderr": 0.05024183937956911 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.6270753512132823, + "acc_stderr": 0.01729286826945392, + "acc_norm": 0.6270753512132823, + "acc_norm_stderr": 0.01729286826945392 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.4797687861271676, + "acc_stderr": 0.026897049996382868, + "acc_norm": 0.4797687861271676, + "acc_norm_stderr": 0.026897049996382868 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.27150837988826815, + "acc_stderr": 0.014874252168095264, + "acc_norm": 0.27150837988826815, + "acc_norm_stderr": 0.014874252168095264 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.5, + "acc_stderr": 0.028629916715693413, + "acc_norm": 0.5, + "acc_norm_stderr": 0.028629916715693413 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.5434083601286174, + "acc_stderr": 0.0282908690541976, + "acc_norm": 0.5434083601286174, + "acc_norm_stderr": 0.0282908690541976 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.47530864197530864, + "acc_stderr": 0.02778680093142745, + "acc_norm": 0.47530864197530864, + "acc_norm_stderr": 0.02778680093142745 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.36879432624113473, + "acc_stderr": 0.02878222756134724, + "acc_norm": 0.36879432624113473, + "acc_norm_stderr": 0.02878222756134724 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 
0.36114732724902215, + "acc_stderr": 0.012267935477519028, + "acc_norm": 0.36114732724902215, + "acc_norm_stderr": 0.012267935477519028 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.5183823529411765, + "acc_stderr": 0.030352303395351964, + "acc_norm": 0.5183823529411765, + "acc_norm_stderr": 0.030352303395351964 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.4150326797385621, + "acc_stderr": 0.019933627776857418, + "acc_norm": 0.4150326797385621, + "acc_norm_stderr": 0.019933627776857418 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.5545454545454546, + "acc_stderr": 0.047605488214603246, + "acc_norm": 0.5545454545454546, + "acc_norm_stderr": 0.047605488214603246 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.42448979591836733, + "acc_stderr": 0.031642094879429414, + "acc_norm": 0.42448979591836733, + "acc_norm_stderr": 0.031642094879429414 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.6467661691542289, + "acc_stderr": 0.03379790611796777, + "acc_norm": 0.6467661691542289, + "acc_norm_stderr": 0.03379790611796777 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.61, + "acc_stderr": 0.04902071300001974, + "acc_norm": 0.61, + "acc_norm_stderr": 0.04902071300001974 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.41566265060240964, + "acc_stderr": 0.038367221765980515, + "acc_norm": 0.41566265060240964, + "acc_norm_stderr": 0.038367221765980515 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.6608187134502924, + "acc_stderr": 0.03631053496488905, + "acc_norm": 0.6608187134502924, + "acc_norm_stderr": 0.03631053496488905 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.2558139534883721, + "mc1_stderr": 0.015274176219283361, + "mc2": 0.3882239370629839, + "mc2_stderr": 0.014196610467420477 + }, + "all": { + "acc": 0.46852179785168946, + "acc_stderr": 0.03529525820204488, + "acc_norm": 0.4723064460902952, + "acc_norm_stderr": 0.03528150833381645, + "mc1": 0.2558139534883721, + "mc1_stderr": 0.015274176219283361, + "mc2": 0.3882239370629839, + "mc2_stderr": 0.014196610467420477 + } + }, + "versions": { + "harness|arc:challenge|25": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + 
"harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "all": 0 + }, + "config_general": { + "model_name": "clibrain/Llama-2-7b-ft-instruct-es", + "model_sha": "b62f431c88b232204ea7046f9d906ae1daa68437", + "model_dtype": "torch.float16", + "lighteval_sha": "da839e70121267a9bf55a0fbea4fb2fae2948337", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null + }, + "config_tasks": { + "harness|arc:challenge": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", 
+ "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task" + }, + "summary_tasks": { + "harness|arc:challenge|25": { + "hashes": { + "hash_examples": "17b0cae357c0259e", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "3722289b79076c44", + "hash_cont_tokens": "8210decc6ff6f7df" + }, + "truncated": 0, + "non-truncated": 4687, + "padded": 4687, + "non-padded": 0, + "effective_few_shots": 25.0, + "num_truncated_few_shots": 0 + }, + "harness|hellaswag|10": { + "hashes": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "ececd684171f1ef2", + "hash_cont_tokens": "b3b9e9017afa63af" + }, + "truncated": 0, + "non-truncated": 40168, + "padded": 40113, + "non-padded": 55, + "effective_few_shots": 10.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hashes": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "c54ff61ad0273dd7", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-anatomy|5": { + "hashes": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "be31a1e22aef5f90", + "hash_cont_tokens": 
"f11971a765cb609f" + }, + "truncated": 0, + "non-truncated": 540, + "padded": 540, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-astronomy|5": { + "hashes": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "277a7b1fad566940", + "hash_cont_tokens": "bf30e5d3f48250cb" + }, + "truncated": 0, + "non-truncated": 608, + "padded": 608, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-business_ethics|5": { + "hashes": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "ba552605bc116de5", + "hash_cont_tokens": "bc1dd9b2d995eb61" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hashes": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "428c7563d0b98ab9", + "hash_cont_tokens": "890a119624b3b935" + }, + "truncated": 0, + "non-truncated": 1060, + "padded": 1060, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_biology|5": { + "hashes": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "da036601573942e2", + "hash_cont_tokens": "875cde3af7a0ee14" + }, + "truncated": 0, + "non-truncated": 576, + "padded": 576, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_chemistry|5": { + "hashes": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "94e0196d6aded13d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_computer_science|5": { + "hashes": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "6e4d0f4a8d36690b", + "hash_cont_tokens": "ffc0fe414cdc4a83" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_mathematics|5": { + "hashes": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "614054d17109a25d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_medicine|5": { + "hashes": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "081bb2b524defd1c", + "hash_cont_tokens": "1f88b00d41957d82" + }, + "truncated": 0, + "non-truncated": 692, + "padded": 692, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_physics|5": { + "hashes": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "5421d9a1af86cbd4", + "hash_cont_tokens": "f7b8097afc16a47c" + }, + "truncated": 0, + "non-truncated": 408, + "padded": 408, + "non-padded": 0, + "effective_few_shots": 5.0, + 
"num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-computer_security|5": { + "hashes": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "5e6b70ecb333cf18", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hashes": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "c2ef11a87264ceed", + "hash_cont_tokens": "aa0e8bc655f2f641" + }, + "truncated": 0, + "non-truncated": 940, + "padded": 940, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-econometrics|5": { + "hashes": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "ecaccd912a4c3978", + "hash_cont_tokens": "bfb7e3c3c88313f1" + }, + "truncated": 0, + "non-truncated": 456, + "padded": 456, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hashes": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "1590c84291399be8", + "hash_cont_tokens": "2425a3f084a591ef" + }, + "truncated": 0, + "non-truncated": 580, + "padded": 580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hashes": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "3269597f715b0da1", + "hash_cont_tokens": "f52691aef15a407b" + }, + "truncated": 0, + "non-truncated": 1512, + "padded": 1512, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-formal_logic|5": { + "hashes": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "a2800d20f3ab8d7c", + "hash_cont_tokens": "f515d598d9c21263" + }, + "truncated": 0, + "non-truncated": 504, + "padded": 504, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-global_facts|5": { + "hashes": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "94ed44b3772505ad", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_biology|5": { + "hashes": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "24423acb928db768", + "hash_cont_tokens": "bd85a4156a3613ee" + }, + "truncated": 0, + "non-truncated": 1240, + "padded": 1240, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hashes": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "831ff35c474e5cef", + "hash_cont_tokens": "a95c97af1c14e068" + }, + "truncated": 0, + "non-truncated": 812, + "padded": 812, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hashes": { + "hash_examples": "8b8cdb1084f24169", 
+ "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "a20a96b44dcc5b30", + "hash_cont_tokens": "8abfedef914e33c9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hashes": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "5002f4ac8b1562ca", + "hash_cont_tokens": "674fc454bdc5ac93" + }, + "truncated": 0, + "non-truncated": 660, + "padded": 656, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_geography|5": { + "hashes": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "7c5547c7da5bc793", + "hash_cont_tokens": "03a5012b916274ea" + }, + "truncated": 0, + "non-truncated": 792, + "padded": 792, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hashes": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "f62991cb6a496b05", + "hash_cont_tokens": "a83effb8f76b7d7c" + }, + "truncated": 0, + "non-truncated": 772, + "padded": 772, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hashes": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "4cef2aff6e3d59ed", + "hash_cont_tokens": "c583432ad27fcfe0" + }, + "truncated": 0, + "non-truncated": 1560, + "padded": 1560, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hashes": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "6e2577ea4082ed2b", + "hash_cont_tokens": "24f5dc613660300b" + }, + "truncated": 0, + "non-truncated": 1080, + "padded": 1080, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hashes": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "c5fc9aeb1079c8e4", + "hash_cont_tokens": "f47f041de50333b9" + }, + "truncated": 0, + "non-truncated": 952, + "padded": 952, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_physics|5": { + "hashes": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "555fc385cffa84ca", + "hash_cont_tokens": "ba2efcd283e938cc" + }, + "truncated": 0, + "non-truncated": 604, + "padded": 604, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hashes": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "febd23cbf9973b7f", + "hash_cont_tokens": "942069cd363844d9" + }, + "truncated": 0, + "non-truncated": 2180, + "padded": 2180, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hashes": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": 
"400e55b56ee6fbd7", + "hash_cont_tokens": "955ed42b6f7fa019" + }, + "truncated": 0, + "non-truncated": 864, + "padded": 864, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hashes": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "c639cce12a46ebad", + "hash_cont_tokens": "cdd0b3dc06d933e5" + }, + "truncated": 0, + "non-truncated": 816, + "padded": 816, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hashes": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "b9762065cce6f3a6", + "hash_cont_tokens": "9a864184946033ac" + }, + "truncated": 0, + "non-truncated": 948, + "padded": 948, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_aging|5": { + "hashes": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "541a75f071dcf579", + "hash_cont_tokens": "142a4a8a1138a214" + }, + "truncated": 0, + "non-truncated": 892, + "padded": 892, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_sexuality|5": { + "hashes": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "04269e5c5a257dd9", + "hash_cont_tokens": "bc54813e809b796d" + }, + "truncated": 0, + "non-truncated": 524, + "padded": 524, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-international_law|5": { + "hashes": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "d93ba9d9d38e4397", + "hash_cont_tokens": "dc45b45fcda18e5d" + }, + "truncated": 0, + "non-truncated": 484, + "padded": 484, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-jurisprudence|5": { + "hashes": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "9eeaccd2698b4f5a", + "hash_cont_tokens": "e3a8cd951b6e3469" + }, + "truncated": 0, + "non-truncated": 432, + "padded": 432, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hashes": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "b4f08f544f2b7576", + "hash_cont_tokens": "1e80dbd30f6453d5" + }, + "truncated": 0, + "non-truncated": 652, + "padded": 648, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-machine_learning|5": { + "hashes": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "900c2a51f1174b9f", + "hash_cont_tokens": "9b37da7777378ca9" + }, + "truncated": 0, + "non-truncated": 448, + "padded": 448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-management|5": { + "hashes": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "6b36efb4689c6eca", + "hash_cont_tokens": "a01d6d39a83c4597" + }, + "truncated": 0, + "non-truncated": 412, + "padded": 412, + "non-padded": 0, + 
"effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-marketing|5": { + "hashes": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "2aaac78a0cfed47a", + "hash_cont_tokens": "6aeaed4d823c98aa" + }, + "truncated": 0, + "non-truncated": 936, + "padded": 936, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-medical_genetics|5": { + "hashes": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "886ca823b41c094a", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-miscellaneous|5": { + "hashes": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "72fd71de7675e7d0", + "hash_cont_tokens": "9b0ab02a64603081" + }, + "truncated": 0, + "non-truncated": 3132, + "padded": 3132, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_disputes|5": { + "hashes": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "f3ca0dd8e7a1eb09", + "hash_cont_tokens": "8badf768f7b0467a" + }, + "truncated": 0, + "non-truncated": 1384, + "padded": 1354, + "non-padded": 30, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hashes": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "3e793631e951f23c", + "hash_cont_tokens": "32ae620376b2bbba" + }, + "truncated": 0, + "non-truncated": 3580, + "padded": 3580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-nutrition|5": { + "hashes": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "59753c2144ea93af", + "hash_cont_tokens": "3071def75bacc404" + }, + "truncated": 0, + "non-truncated": 1224, + "padded": 1224, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-philosophy|5": { + "hashes": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "bd8d3dbed15a8c34", + "hash_cont_tokens": "9f6ff69d23a48783" + }, + "truncated": 0, + "non-truncated": 1244, + "padded": 1244, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-prehistory|5": { + "hashes": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "3573cd87facbb7c5", + "hash_cont_tokens": "de469d2b981e32a3" + }, + "truncated": 0, + "non-truncated": 1296, + "padded": 1296, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_accounting|5": { + "hashes": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "17e721bc1a7cbb47", + "hash_cont_tokens": "c46f74d2dfc7b13b" + }, + "truncated": 0, + "non-truncated": 1128, + "padded": 1128, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_law|5": { + "hashes": { + "hash_examples": "cd9dbc52b3c932d6", + 
"hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "c9f7583fff66d361", + "hash_cont_tokens": "2e590029ef41fbcd" + }, + "truncated": 0, + "non-truncated": 6136, + "padded": 6136, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_medicine|5": { + "hashes": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "40a933f829116f8d", + "hash_cont_tokens": "fe35cfa9c6ca802e" + }, + "truncated": 0, + "non-truncated": 1088, + "padded": 1088, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_psychology|5": { + "hashes": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "0dfb73a8eb3f692c", + "hash_cont_tokens": "f020fbddf72c8652" + }, + "truncated": 0, + "non-truncated": 2448, + "padded": 2448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-public_relations|5": { + "hashes": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "1710c6ba4c9f3cbd", + "hash_cont_tokens": "568f585a259965c1" + }, + "truncated": 0, + "non-truncated": 440, + "padded": 440, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-security_studies|5": { + "hashes": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "32a03f1f22a6e103", + "hash_cont_tokens": "cc6fd7cccd64cd5d" + }, + "truncated": 0, + "non-truncated": 980, + "padded": 980, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-sociology|5": { + "hashes": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "828999f7624cbe7e", + "hash_cont_tokens": "c3a3bdfd177eed5b" + }, + "truncated": 0, + "non-truncated": 804, + "padded": 804, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hashes": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "42054621e718dbee", + "hash_cont_tokens": "2568d0e8e36fa959" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-virology|5": { + "hashes": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "6c4f0aa4dc859c04", + "hash_cont_tokens": "926cf60b0891f374" + }, + "truncated": 0, + "non-truncated": 664, + "padded": 664, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-world_religions|5": { + "hashes": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "6c75d44e092ff24f", + "hash_cont_tokens": "c525a5de974c1ea3" + }, + "truncated": 0, + "non-truncated": 684, + "padded": 684, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|truthfulqa:mc|0": { + "hashes": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "2738d7ed7075faa7", + "hash_cont_tokens": "c014154380b74b9e" + }, + "truncated": 0, + "non-truncated": 9996, + 
"padded": 9996, + "non-padded": 0, + "effective_few_shots": 0.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "d84d18e9a963753d", + "hash_full_prompts": "12b540783521a8e6", + "hash_input_tokens": "5c73a7dce6ccf737", + "hash_cont_tokens": "fb1646e2bdd5fc38" + }, + "total_evaluation_time_secondes": "4056.2868497371674", + "truncated": 0, + "non-truncated": 111019, + "padded": 110926, + "non-padded": 93, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/clibrain/Llama-2-7b-ft-instruct-es/results_2023-08-09T22-51-27.386813.json b/eval-results/clibrain/Llama-2-7b-ft-instruct-es/results_2023-08-09T22-51-27.386813.json new file mode 100644 index 0000000000000000000000000000000000000000..899b2ad84168602a179d8271bfc1f6f210ec7489 --- /dev/null +++ b/eval-results/clibrain/Llama-2-7b-ft-instruct-es/results_2023-08-09T22-51-27.386813.json @@ -0,0 +1,1365 @@ +{ + "results": { + "harness|arc:challenge|25": { + "acc": 0.507679180887372, + "acc_stderr": 0.01460966744089257, + "acc_norm": 0.5366894197952219, + "acc_norm_stderr": 0.014572000527756993 + }, + "harness|hellaswag|10": { + "acc": 0.5840470025891257, + "acc_stderr": 0.0049187816623739436, + "acc_norm": 0.7783310097590121, + "acc_norm_stderr": 0.004145206350032313 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.31, + "acc_stderr": 0.04648231987117316, + "acc_norm": 0.31, + "acc_norm_stderr": 0.04648231987117316 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.4740740740740741, + "acc_stderr": 0.04313531696750575, + "acc_norm": 0.4740740740740741, + "acc_norm_stderr": 0.04313531696750575 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.42105263157894735, + "acc_stderr": 0.040179012759817494, + "acc_norm": 0.42105263157894735, + "acc_norm_stderr": 0.040179012759817494 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.47, + "acc_stderr": 0.050161355804659205, + "acc_norm": 0.47, + "acc_norm_stderr": 0.050161355804659205 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.5132075471698113, + "acc_stderr": 0.030762134874500482, + "acc_norm": 0.5132075471698113, + "acc_norm_stderr": 0.030762134874500482 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.4861111111111111, + "acc_stderr": 0.04179596617581, + "acc_norm": 0.4861111111111111, + "acc_norm_stderr": 0.04179596617581 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.35, + "acc_stderr": 0.047937248544110196, + "acc_norm": 0.35, + "acc_norm_stderr": 0.047937248544110196 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.47, + "acc_stderr": 0.05016135580465919, + "acc_norm": 0.47, + "acc_norm_stderr": 0.05016135580465919 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.38, + "acc_stderr": 0.048783173121456316, + "acc_norm": 0.38, + "acc_norm_stderr": 0.048783173121456316 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.41040462427745666, + "acc_stderr": 0.03750757044895536, + "acc_norm": 0.41040462427745666, + "acc_norm_stderr": 0.03750757044895536 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.1568627450980392, + "acc_stderr": 0.036186648199362466, + "acc_norm": 0.1568627450980392, + "acc_norm_stderr": 0.036186648199362466 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.6, + "acc_stderr": 0.04923659639173309, + "acc_norm": 0.6, + "acc_norm_stderr": 0.04923659639173309 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 
0.41702127659574467, + "acc_stderr": 0.032232762667117124, + "acc_norm": 0.41702127659574467, + "acc_norm_stderr": 0.032232762667117124 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.2719298245614035, + "acc_stderr": 0.04185774424022056, + "acc_norm": 0.2719298245614035, + "acc_norm_stderr": 0.04185774424022056 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.3931034482758621, + "acc_stderr": 0.0407032901370707, + "acc_norm": 0.3931034482758621, + "acc_norm_stderr": 0.0407032901370707 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.29894179894179895, + "acc_stderr": 0.02357760479165581, + "acc_norm": 0.29894179894179895, + "acc_norm_stderr": 0.02357760479165581 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.35714285714285715, + "acc_stderr": 0.04285714285714281, + "acc_norm": 0.35714285714285715, + "acc_norm_stderr": 0.04285714285714281 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.32, + "acc_stderr": 0.046882617226215034, + "acc_norm": 0.32, + "acc_norm_stderr": 0.046882617226215034 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.5, + "acc_stderr": 0.028444006199428714, + "acc_norm": 0.5, + "acc_norm_stderr": 0.028444006199428714 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.3054187192118227, + "acc_stderr": 0.03240661565868407, + "acc_norm": 0.3054187192118227, + "acc_norm_stderr": 0.03240661565868407 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.41, + "acc_stderr": 0.04943110704237102, + "acc_norm": 0.41, + "acc_norm_stderr": 0.04943110704237102 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.6, + "acc_stderr": 0.03825460278380025, + "acc_norm": 0.6, + "acc_norm_stderr": 0.03825460278380025 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.5656565656565656, + "acc_stderr": 0.03531505879359183, + "acc_norm": 0.5656565656565656, + "acc_norm_stderr": 0.03531505879359183 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.6787564766839378, + "acc_stderr": 0.033699508685490674, + "acc_norm": 0.6787564766839378, + "acc_norm_stderr": 0.033699508685490674 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.441025641025641, + "acc_stderr": 0.025174048384000763, + "acc_norm": 0.441025641025641, + "acc_norm_stderr": 0.025174048384000763 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.26666666666666666, + "acc_stderr": 0.02696242432507384, + "acc_norm": 0.26666666666666666, + "acc_norm_stderr": 0.02696242432507384 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.4369747899159664, + "acc_stderr": 0.03221943636566196, + "acc_norm": 0.4369747899159664, + "acc_norm_stderr": 0.03221943636566196 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.31125827814569534, + "acc_stderr": 0.037804458505267334, + "acc_norm": 0.31125827814569534, + "acc_norm_stderr": 0.037804458505267334 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.6275229357798165, + "acc_stderr": 0.0207283684576385, + "acc_norm": 0.6275229357798165, + "acc_norm_stderr": 0.0207283684576385 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.3101851851851852, + "acc_stderr": 0.03154696285656628, + "acc_norm": 0.3101851851851852, + "acc_norm_stderr": 0.03154696285656628 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.5882352941176471, + "acc_stderr": 0.03454236585380609, + 
"acc_norm": 0.5882352941176471, + "acc_norm_stderr": 0.03454236585380609 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.6413502109704642, + "acc_stderr": 0.031219569445301843, + "acc_norm": 0.6413502109704642, + "acc_norm_stderr": 0.031219569445301843 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.5605381165919282, + "acc_stderr": 0.033310925110381785, + "acc_norm": 0.5605381165919282, + "acc_norm_stderr": 0.033310925110381785 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.5267175572519084, + "acc_stderr": 0.04379024936553894, + "acc_norm": 0.5267175572519084, + "acc_norm_stderr": 0.04379024936553894 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.6446280991735537, + "acc_stderr": 0.0436923632657398, + "acc_norm": 0.6446280991735537, + "acc_norm_stderr": 0.0436923632657398 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.5, + "acc_stderr": 0.04833682445228318, + "acc_norm": 0.5, + "acc_norm_stderr": 0.04833682445228318 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.5276073619631901, + "acc_stderr": 0.0392237829061099, + "acc_norm": 0.5276073619631901, + "acc_norm_stderr": 0.0392237829061099 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.35714285714285715, + "acc_stderr": 0.04547960999764376, + "acc_norm": 0.35714285714285715, + "acc_norm_stderr": 0.04547960999764376 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.5922330097087378, + "acc_stderr": 0.04865777570410769, + "acc_norm": 0.5922330097087378, + "acc_norm_stderr": 0.04865777570410769 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.6965811965811965, + "acc_stderr": 0.030118210106942656, + "acc_norm": 0.6965811965811965, + "acc_norm_stderr": 0.030118210106942656 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.49, + "acc_stderr": 0.05024183937956911, + "acc_norm": 0.49, + "acc_norm_stderr": 0.05024183937956911 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.6270753512132823, + "acc_stderr": 0.01729286826945392, + "acc_norm": 0.6270753512132823, + "acc_norm_stderr": 0.01729286826945392 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.4797687861271676, + "acc_stderr": 0.026897049996382868, + "acc_norm": 0.4797687861271676, + "acc_norm_stderr": 0.026897049996382868 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.27150837988826815, + "acc_stderr": 0.014874252168095264, + "acc_norm": 0.27150837988826815, + "acc_norm_stderr": 0.014874252168095264 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.5, + "acc_stderr": 0.028629916715693413, + "acc_norm": 0.5, + "acc_norm_stderr": 0.028629916715693413 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.5434083601286174, + "acc_stderr": 0.0282908690541976, + "acc_norm": 0.5434083601286174, + "acc_norm_stderr": 0.0282908690541976 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.47530864197530864, + "acc_stderr": 0.02778680093142745, + "acc_norm": 0.47530864197530864, + "acc_norm_stderr": 0.02778680093142745 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.36879432624113473, + "acc_stderr": 0.02878222756134724, + "acc_norm": 0.36879432624113473, + "acc_norm_stderr": 0.02878222756134724 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.36114732724902215, + "acc_stderr": 0.012267935477519028, + "acc_norm": 0.36114732724902215, + "acc_norm_stderr": 0.012267935477519028 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.5183823529411765, + 
"acc_stderr": 0.030352303395351964, + "acc_norm": 0.5183823529411765, + "acc_norm_stderr": 0.030352303395351964 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.4150326797385621, + "acc_stderr": 0.019933627776857418, + "acc_norm": 0.4150326797385621, + "acc_norm_stderr": 0.019933627776857418 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.5545454545454546, + "acc_stderr": 0.047605488214603246, + "acc_norm": 0.5545454545454546, + "acc_norm_stderr": 0.047605488214603246 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.42448979591836733, + "acc_stderr": 0.031642094879429414, + "acc_norm": 0.42448979591836733, + "acc_norm_stderr": 0.031642094879429414 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.6467661691542289, + "acc_stderr": 0.03379790611796777, + "acc_norm": 0.6467661691542289, + "acc_norm_stderr": 0.03379790611796777 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.61, + "acc_stderr": 0.04902071300001974, + "acc_norm": 0.61, + "acc_norm_stderr": 0.04902071300001974 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.41566265060240964, + "acc_stderr": 0.038367221765980515, + "acc_norm": 0.41566265060240964, + "acc_norm_stderr": 0.038367221765980515 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.6608187134502924, + "acc_stderr": 0.03631053496488905, + "acc_norm": 0.6608187134502924, + "acc_norm_stderr": 0.03631053496488905 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.2558139534883721, + "mc1_stderr": 0.015274176219283361, + "mc2": 0.3882239370629839, + "mc2_stderr": 0.014196610467420477 + }, + "all": { + "acc": 0.46852179785168946, + "acc_stderr": 0.03529525820204488, + "acc_norm": 0.4723064460902952, + "acc_norm_stderr": 0.03528150833381645, + "mc1": 0.2558139534883721, + "mc1_stderr": 0.015274176219283361, + "mc2": 0.3882239370629839, + "mc2_stderr": 0.014196610467420477 + } + }, + "versions": { + "harness|arc:challenge|25": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + 
"harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "all": 0 + }, + "config_general": { + "model_name": "clibrain/Llama-2-7b-ft-instruct-es", + "model_sha": "b62f431c88b232204ea7046f9d906ae1daa68437", + "model_dtype": "torch.float16", + "lighteval_sha": "da839e70121267a9bf55a0fbea4fb2fae2948337", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null + }, + "config_tasks": { + "harness|arc:challenge": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness 
task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task" + }, + "summary_tasks": { + "harness|arc:challenge|25": { + "hashes": { + "hash_examples": "17b0cae357c0259e", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "3722289b79076c44", + "hash_cont_tokens": "8210decc6ff6f7df" + }, + "truncated": 0, + "non-truncated": 4687, + "padded": 4687, + "non-padded": 0, + "effective_few_shots": 25.0, + "num_truncated_few_shots": 0 + }, + "harness|hellaswag|10": { + "hashes": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "ececd684171f1ef2", + "hash_cont_tokens": "b3b9e9017afa63af" + }, + "truncated": 0, + "non-truncated": 40168, + "padded": 40113, + "non-padded": 55, + "effective_few_shots": 10.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hashes": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "c54ff61ad0273dd7", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-anatomy|5": { + "hashes": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "be31a1e22aef5f90", + "hash_cont_tokens": "f11971a765cb609f" + }, + "truncated": 0, + "non-truncated": 540, + "padded": 540, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-astronomy|5": { + "hashes": 
{ + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "277a7b1fad566940", + "hash_cont_tokens": "bf30e5d3f48250cb" + }, + "truncated": 0, + "non-truncated": 608, + "padded": 608, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-business_ethics|5": { + "hashes": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "ba552605bc116de5", + "hash_cont_tokens": "bc1dd9b2d995eb61" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hashes": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "428c7563d0b98ab9", + "hash_cont_tokens": "890a119624b3b935" + }, + "truncated": 0, + "non-truncated": 1060, + "padded": 1060, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_biology|5": { + "hashes": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "da036601573942e2", + "hash_cont_tokens": "875cde3af7a0ee14" + }, + "truncated": 0, + "non-truncated": 576, + "padded": 576, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_chemistry|5": { + "hashes": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "94e0196d6aded13d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_computer_science|5": { + "hashes": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "6e4d0f4a8d36690b", + "hash_cont_tokens": "ffc0fe414cdc4a83" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_mathematics|5": { + "hashes": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "614054d17109a25d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_medicine|5": { + "hashes": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "081bb2b524defd1c", + "hash_cont_tokens": "1f88b00d41957d82" + }, + "truncated": 0, + "non-truncated": 692, + "padded": 692, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_physics|5": { + "hashes": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "5421d9a1af86cbd4", + "hash_cont_tokens": "f7b8097afc16a47c" + }, + "truncated": 0, + "non-truncated": 408, + "padded": 408, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-computer_security|5": { + "hashes": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "5e6b70ecb333cf18", + 
"hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hashes": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "c2ef11a87264ceed", + "hash_cont_tokens": "aa0e8bc655f2f641" + }, + "truncated": 0, + "non-truncated": 940, + "padded": 940, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-econometrics|5": { + "hashes": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "ecaccd912a4c3978", + "hash_cont_tokens": "bfb7e3c3c88313f1" + }, + "truncated": 0, + "non-truncated": 456, + "padded": 456, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hashes": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "1590c84291399be8", + "hash_cont_tokens": "2425a3f084a591ef" + }, + "truncated": 0, + "non-truncated": 580, + "padded": 580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hashes": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "3269597f715b0da1", + "hash_cont_tokens": "f52691aef15a407b" + }, + "truncated": 0, + "non-truncated": 1512, + "padded": 1512, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-formal_logic|5": { + "hashes": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "a2800d20f3ab8d7c", + "hash_cont_tokens": "f515d598d9c21263" + }, + "truncated": 0, + "non-truncated": 504, + "padded": 504, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-global_facts|5": { + "hashes": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "94ed44b3772505ad", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_biology|5": { + "hashes": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "24423acb928db768", + "hash_cont_tokens": "bd85a4156a3613ee" + }, + "truncated": 0, + "non-truncated": 1240, + "padded": 1240, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hashes": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "831ff35c474e5cef", + "hash_cont_tokens": "a95c97af1c14e068" + }, + "truncated": 0, + "non-truncated": 812, + "padded": 812, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hashes": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "a20a96b44dcc5b30", + "hash_cont_tokens": "8abfedef914e33c9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + 
"effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hashes": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "5002f4ac8b1562ca", + "hash_cont_tokens": "674fc454bdc5ac93" + }, + "truncated": 0, + "non-truncated": 660, + "padded": 656, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_geography|5": { + "hashes": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "7c5547c7da5bc793", + "hash_cont_tokens": "03a5012b916274ea" + }, + "truncated": 0, + "non-truncated": 792, + "padded": 792, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hashes": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "f62991cb6a496b05", + "hash_cont_tokens": "a83effb8f76b7d7c" + }, + "truncated": 0, + "non-truncated": 772, + "padded": 772, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hashes": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "4cef2aff6e3d59ed", + "hash_cont_tokens": "c583432ad27fcfe0" + }, + "truncated": 0, + "non-truncated": 1560, + "padded": 1560, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hashes": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "6e2577ea4082ed2b", + "hash_cont_tokens": "24f5dc613660300b" + }, + "truncated": 0, + "non-truncated": 1080, + "padded": 1080, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hashes": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "c5fc9aeb1079c8e4", + "hash_cont_tokens": "f47f041de50333b9" + }, + "truncated": 0, + "non-truncated": 952, + "padded": 952, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_physics|5": { + "hashes": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "555fc385cffa84ca", + "hash_cont_tokens": "ba2efcd283e938cc" + }, + "truncated": 0, + "non-truncated": 604, + "padded": 604, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hashes": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "febd23cbf9973b7f", + "hash_cont_tokens": "942069cd363844d9" + }, + "truncated": 0, + "non-truncated": 2180, + "padded": 2180, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hashes": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "400e55b56ee6fbd7", + "hash_cont_tokens": "955ed42b6f7fa019" + }, + "truncated": 0, + "non-truncated": 864, + "padded": 864, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + 
"harness|hendrycksTest-high_school_us_history|5": { + "hashes": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "c639cce12a46ebad", + "hash_cont_tokens": "cdd0b3dc06d933e5" + }, + "truncated": 0, + "non-truncated": 816, + "padded": 816, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hashes": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "b9762065cce6f3a6", + "hash_cont_tokens": "9a864184946033ac" + }, + "truncated": 0, + "non-truncated": 948, + "padded": 948, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_aging|5": { + "hashes": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "541a75f071dcf579", + "hash_cont_tokens": "142a4a8a1138a214" + }, + "truncated": 0, + "non-truncated": 892, + "padded": 892, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_sexuality|5": { + "hashes": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "04269e5c5a257dd9", + "hash_cont_tokens": "bc54813e809b796d" + }, + "truncated": 0, + "non-truncated": 524, + "padded": 524, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-international_law|5": { + "hashes": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "d93ba9d9d38e4397", + "hash_cont_tokens": "dc45b45fcda18e5d" + }, + "truncated": 0, + "non-truncated": 484, + "padded": 484, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-jurisprudence|5": { + "hashes": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "9eeaccd2698b4f5a", + "hash_cont_tokens": "e3a8cd951b6e3469" + }, + "truncated": 0, + "non-truncated": 432, + "padded": 432, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hashes": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "b4f08f544f2b7576", + "hash_cont_tokens": "1e80dbd30f6453d5" + }, + "truncated": 0, + "non-truncated": 652, + "padded": 648, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-machine_learning|5": { + "hashes": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "900c2a51f1174b9f", + "hash_cont_tokens": "9b37da7777378ca9" + }, + "truncated": 0, + "non-truncated": 448, + "padded": 448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-management|5": { + "hashes": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "6b36efb4689c6eca", + "hash_cont_tokens": "a01d6d39a83c4597" + }, + "truncated": 0, + "non-truncated": 412, + "padded": 412, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-marketing|5": { + "hashes": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": 
"2aaac78a0cfed47a", + "hash_cont_tokens": "6aeaed4d823c98aa" + }, + "truncated": 0, + "non-truncated": 936, + "padded": 936, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-medical_genetics|5": { + "hashes": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "886ca823b41c094a", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-miscellaneous|5": { + "hashes": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "72fd71de7675e7d0", + "hash_cont_tokens": "9b0ab02a64603081" + }, + "truncated": 0, + "non-truncated": 3132, + "padded": 3132, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_disputes|5": { + "hashes": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "f3ca0dd8e7a1eb09", + "hash_cont_tokens": "8badf768f7b0467a" + }, + "truncated": 0, + "non-truncated": 1384, + "padded": 1354, + "non-padded": 30, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hashes": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "3e793631e951f23c", + "hash_cont_tokens": "32ae620376b2bbba" + }, + "truncated": 0, + "non-truncated": 3580, + "padded": 3580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-nutrition|5": { + "hashes": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "59753c2144ea93af", + "hash_cont_tokens": "3071def75bacc404" + }, + "truncated": 0, + "non-truncated": 1224, + "padded": 1224, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-philosophy|5": { + "hashes": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "bd8d3dbed15a8c34", + "hash_cont_tokens": "9f6ff69d23a48783" + }, + "truncated": 0, + "non-truncated": 1244, + "padded": 1244, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-prehistory|5": { + "hashes": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "3573cd87facbb7c5", + "hash_cont_tokens": "de469d2b981e32a3" + }, + "truncated": 0, + "non-truncated": 1296, + "padded": 1296, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_accounting|5": { + "hashes": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "17e721bc1a7cbb47", + "hash_cont_tokens": "c46f74d2dfc7b13b" + }, + "truncated": 0, + "non-truncated": 1128, + "padded": 1128, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_law|5": { + "hashes": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "c9f7583fff66d361", + "hash_cont_tokens": "2e590029ef41fbcd" + }, + "truncated": 0, + "non-truncated": 6136, + "padded": 6136, + "non-padded": 0, + 
"effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_medicine|5": { + "hashes": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "40a933f829116f8d", + "hash_cont_tokens": "fe35cfa9c6ca802e" + }, + "truncated": 0, + "non-truncated": 1088, + "padded": 1088, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_psychology|5": { + "hashes": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "0dfb73a8eb3f692c", + "hash_cont_tokens": "f020fbddf72c8652" + }, + "truncated": 0, + "non-truncated": 2448, + "padded": 2448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-public_relations|5": { + "hashes": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "1710c6ba4c9f3cbd", + "hash_cont_tokens": "568f585a259965c1" + }, + "truncated": 0, + "non-truncated": 440, + "padded": 440, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-security_studies|5": { + "hashes": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "32a03f1f22a6e103", + "hash_cont_tokens": "cc6fd7cccd64cd5d" + }, + "truncated": 0, + "non-truncated": 980, + "padded": 980, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-sociology|5": { + "hashes": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "828999f7624cbe7e", + "hash_cont_tokens": "c3a3bdfd177eed5b" + }, + "truncated": 0, + "non-truncated": 804, + "padded": 804, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hashes": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "42054621e718dbee", + "hash_cont_tokens": "2568d0e8e36fa959" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-virology|5": { + "hashes": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "6c4f0aa4dc859c04", + "hash_cont_tokens": "926cf60b0891f374" + }, + "truncated": 0, + "non-truncated": 664, + "padded": 664, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-world_religions|5": { + "hashes": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "6c75d44e092ff24f", + "hash_cont_tokens": "c525a5de974c1ea3" + }, + "truncated": 0, + "non-truncated": 684, + "padded": 684, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|truthfulqa:mc|0": { + "hashes": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "2738d7ed7075faa7", + "hash_cont_tokens": "c014154380b74b9e" + }, + "truncated": 0, + "non-truncated": 9996, + "padded": 9996, + "non-padded": 0, + "effective_few_shots": 0.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "d84d18e9a963753d", + "hash_full_prompts": 
"12b540783521a8e6", + "hash_input_tokens": "5c73a7dce6ccf737", + "hash_cont_tokens": "fb1646e2bdd5fc38" + }, + "total_evaluation_time_secondes": "4060.836770296097", + "truncated": 0, + "non-truncated": 111019, + "padded": 110926, + "non-padded": 93, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/clibrain/Llama-2-7b-ft-instruct-es/results_2023-08-09T22-51-28.194714.json b/eval-results/clibrain/Llama-2-7b-ft-instruct-es/results_2023-08-09T22-51-28.194714.json new file mode 100644 index 0000000000000000000000000000000000000000..02e987e66a4eac2964e92b2fea203253729e8650 --- /dev/null +++ b/eval-results/clibrain/Llama-2-7b-ft-instruct-es/results_2023-08-09T22-51-28.194714.json @@ -0,0 +1,1365 @@ +{ + "results": { + "harness|arc:challenge|25": { + "acc": 0.507679180887372, + "acc_stderr": 0.01460966744089257, + "acc_norm": 0.5366894197952219, + "acc_norm_stderr": 0.014572000527756993 + }, + "harness|hellaswag|10": { + "acc": 0.5840470025891257, + "acc_stderr": 0.0049187816623739436, + "acc_norm": 0.7783310097590121, + "acc_norm_stderr": 0.004145206350032313 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.31, + "acc_stderr": 0.04648231987117316, + "acc_norm": 0.31, + "acc_norm_stderr": 0.04648231987117316 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.4740740740740741, + "acc_stderr": 0.04313531696750575, + "acc_norm": 0.4740740740740741, + "acc_norm_stderr": 0.04313531696750575 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.42105263157894735, + "acc_stderr": 0.040179012759817494, + "acc_norm": 0.42105263157894735, + "acc_norm_stderr": 0.040179012759817494 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.47, + "acc_stderr": 0.050161355804659205, + "acc_norm": 0.47, + "acc_norm_stderr": 0.050161355804659205 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.5132075471698113, + "acc_stderr": 0.030762134874500482, + "acc_norm": 0.5132075471698113, + "acc_norm_stderr": 0.030762134874500482 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.4861111111111111, + "acc_stderr": 0.04179596617581, + "acc_norm": 0.4861111111111111, + "acc_norm_stderr": 0.04179596617581 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.35, + "acc_stderr": 0.047937248544110196, + "acc_norm": 0.35, + "acc_norm_stderr": 0.047937248544110196 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.47, + "acc_stderr": 0.05016135580465919, + "acc_norm": 0.47, + "acc_norm_stderr": 0.05016135580465919 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.38, + "acc_stderr": 0.048783173121456316, + "acc_norm": 0.38, + "acc_norm_stderr": 0.048783173121456316 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.41040462427745666, + "acc_stderr": 0.03750757044895536, + "acc_norm": 0.41040462427745666, + "acc_norm_stderr": 0.03750757044895536 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.1568627450980392, + "acc_stderr": 0.036186648199362466, + "acc_norm": 0.1568627450980392, + "acc_norm_stderr": 0.036186648199362466 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.6, + "acc_stderr": 0.04923659639173309, + "acc_norm": 0.6, + "acc_norm_stderr": 0.04923659639173309 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.41702127659574467, + "acc_stderr": 0.032232762667117124, + "acc_norm": 0.41702127659574467, + "acc_norm_stderr": 0.032232762667117124 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.2719298245614035, 
+ "acc_stderr": 0.04185774424022056, + "acc_norm": 0.2719298245614035, + "acc_norm_stderr": 0.04185774424022056 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.3931034482758621, + "acc_stderr": 0.0407032901370707, + "acc_norm": 0.3931034482758621, + "acc_norm_stderr": 0.0407032901370707 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.29894179894179895, + "acc_stderr": 0.02357760479165581, + "acc_norm": 0.29894179894179895, + "acc_norm_stderr": 0.02357760479165581 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.35714285714285715, + "acc_stderr": 0.04285714285714281, + "acc_norm": 0.35714285714285715, + "acc_norm_stderr": 0.04285714285714281 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.32, + "acc_stderr": 0.046882617226215034, + "acc_norm": 0.32, + "acc_norm_stderr": 0.046882617226215034 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.5, + "acc_stderr": 0.028444006199428714, + "acc_norm": 0.5, + "acc_norm_stderr": 0.028444006199428714 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.3054187192118227, + "acc_stderr": 0.03240661565868407, + "acc_norm": 0.3054187192118227, + "acc_norm_stderr": 0.03240661565868407 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.41, + "acc_stderr": 0.04943110704237102, + "acc_norm": 0.41, + "acc_norm_stderr": 0.04943110704237102 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.6, + "acc_stderr": 0.03825460278380025, + "acc_norm": 0.6, + "acc_norm_stderr": 0.03825460278380025 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.5656565656565656, + "acc_stderr": 0.03531505879359183, + "acc_norm": 0.5656565656565656, + "acc_norm_stderr": 0.03531505879359183 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.6787564766839378, + "acc_stderr": 0.033699508685490674, + "acc_norm": 0.6787564766839378, + "acc_norm_stderr": 0.033699508685490674 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.441025641025641, + "acc_stderr": 0.025174048384000763, + "acc_norm": 0.441025641025641, + "acc_norm_stderr": 0.025174048384000763 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.26666666666666666, + "acc_stderr": 0.02696242432507384, + "acc_norm": 0.26666666666666666, + "acc_norm_stderr": 0.02696242432507384 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.4369747899159664, + "acc_stderr": 0.03221943636566196, + "acc_norm": 0.4369747899159664, + "acc_norm_stderr": 0.03221943636566196 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.31125827814569534, + "acc_stderr": 0.037804458505267334, + "acc_norm": 0.31125827814569534, + "acc_norm_stderr": 0.037804458505267334 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.6275229357798165, + "acc_stderr": 0.0207283684576385, + "acc_norm": 0.6275229357798165, + "acc_norm_stderr": 0.0207283684576385 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.3101851851851852, + "acc_stderr": 0.03154696285656628, + "acc_norm": 0.3101851851851852, + "acc_norm_stderr": 0.03154696285656628 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.5882352941176471, + "acc_stderr": 0.03454236585380609, + "acc_norm": 0.5882352941176471, + "acc_norm_stderr": 0.03454236585380609 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.6413502109704642, + "acc_stderr": 0.031219569445301843, + 
"acc_norm": 0.6413502109704642, + "acc_norm_stderr": 0.031219569445301843 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.5605381165919282, + "acc_stderr": 0.033310925110381785, + "acc_norm": 0.5605381165919282, + "acc_norm_stderr": 0.033310925110381785 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.5267175572519084, + "acc_stderr": 0.04379024936553894, + "acc_norm": 0.5267175572519084, + "acc_norm_stderr": 0.04379024936553894 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.6446280991735537, + "acc_stderr": 0.0436923632657398, + "acc_norm": 0.6446280991735537, + "acc_norm_stderr": 0.0436923632657398 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.5, + "acc_stderr": 0.04833682445228318, + "acc_norm": 0.5, + "acc_norm_stderr": 0.04833682445228318 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.5276073619631901, + "acc_stderr": 0.0392237829061099, + "acc_norm": 0.5276073619631901, + "acc_norm_stderr": 0.0392237829061099 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.35714285714285715, + "acc_stderr": 0.04547960999764376, + "acc_norm": 0.35714285714285715, + "acc_norm_stderr": 0.04547960999764376 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.5922330097087378, + "acc_stderr": 0.04865777570410769, + "acc_norm": 0.5922330097087378, + "acc_norm_stderr": 0.04865777570410769 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.6965811965811965, + "acc_stderr": 0.030118210106942656, + "acc_norm": 0.6965811965811965, + "acc_norm_stderr": 0.030118210106942656 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.49, + "acc_stderr": 0.05024183937956911, + "acc_norm": 0.49, + "acc_norm_stderr": 0.05024183937956911 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.6270753512132823, + "acc_stderr": 0.01729286826945392, + "acc_norm": 0.6270753512132823, + "acc_norm_stderr": 0.01729286826945392 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.4797687861271676, + "acc_stderr": 0.026897049996382868, + "acc_norm": 0.4797687861271676, + "acc_norm_stderr": 0.026897049996382868 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.27150837988826815, + "acc_stderr": 0.014874252168095264, + "acc_norm": 0.27150837988826815, + "acc_norm_stderr": 0.014874252168095264 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.5, + "acc_stderr": 0.028629916715693413, + "acc_norm": 0.5, + "acc_norm_stderr": 0.028629916715693413 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.5434083601286174, + "acc_stderr": 0.0282908690541976, + "acc_norm": 0.5434083601286174, + "acc_norm_stderr": 0.0282908690541976 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.47530864197530864, + "acc_stderr": 0.02778680093142745, + "acc_norm": 0.47530864197530864, + "acc_norm_stderr": 0.02778680093142745 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.36879432624113473, + "acc_stderr": 0.02878222756134724, + "acc_norm": 0.36879432624113473, + "acc_norm_stderr": 0.02878222756134724 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.36114732724902215, + "acc_stderr": 0.012267935477519028, + "acc_norm": 0.36114732724902215, + "acc_norm_stderr": 0.012267935477519028 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.5183823529411765, + "acc_stderr": 0.030352303395351964, + "acc_norm": 0.5183823529411765, + "acc_norm_stderr": 0.030352303395351964 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.4150326797385621, + 
"acc_stderr": 0.019933627776857418, + "acc_norm": 0.4150326797385621, + "acc_norm_stderr": 0.019933627776857418 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.5545454545454546, + "acc_stderr": 0.047605488214603246, + "acc_norm": 0.5545454545454546, + "acc_norm_stderr": 0.047605488214603246 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.42448979591836733, + "acc_stderr": 0.031642094879429414, + "acc_norm": 0.42448979591836733, + "acc_norm_stderr": 0.031642094879429414 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.6467661691542289, + "acc_stderr": 0.03379790611796777, + "acc_norm": 0.6467661691542289, + "acc_norm_stderr": 0.03379790611796777 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.61, + "acc_stderr": 0.04902071300001974, + "acc_norm": 0.61, + "acc_norm_stderr": 0.04902071300001974 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.41566265060240964, + "acc_stderr": 0.038367221765980515, + "acc_norm": 0.41566265060240964, + "acc_norm_stderr": 0.038367221765980515 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.6608187134502924, + "acc_stderr": 0.03631053496488905, + "acc_norm": 0.6608187134502924, + "acc_norm_stderr": 0.03631053496488905 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.2558139534883721, + "mc1_stderr": 0.015274176219283361, + "mc2": 0.3882239370629839, + "mc2_stderr": 0.014196610467420477 + }, + "all": { + "acc": 0.46852179785168946, + "acc_stderr": 0.03529525820204488, + "acc_norm": 0.4723064460902952, + "acc_norm_stderr": 0.03528150833381645, + "mc1": 0.2558139534883721, + "mc1_stderr": 0.015274176219283361, + "mc2": 0.3882239370629839, + "mc2_stderr": 0.014196610467420477 + } + }, + "versions": { + "harness|arc:challenge|25": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + 
"harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "all": 0 + }, + "config_general": { + "model_name": "clibrain/Llama-2-7b-ft-instruct-es", + "model_sha": "b62f431c88b232204ea7046f9d906ae1daa68437", + "model_dtype": "torch.float16", + "lighteval_sha": "da839e70121267a9bf55a0fbea4fb2fae2948337", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null + }, + "config_tasks": { + "harness|arc:challenge": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + 
"harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task" + }, + "summary_tasks": { + "harness|arc:challenge|25": { + "hashes": { + "hash_examples": "17b0cae357c0259e", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "3722289b79076c44", + "hash_cont_tokens": "8210decc6ff6f7df" + }, + "truncated": 0, + "non-truncated": 4687, + "padded": 4687, + "non-padded": 0, + "effective_few_shots": 25.0, + "num_truncated_few_shots": 0 + }, + "harness|hellaswag|10": { + "hashes": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "ececd684171f1ef2", + "hash_cont_tokens": "b3b9e9017afa63af" + }, + "truncated": 0, + "non-truncated": 40168, + "padded": 40113, + "non-padded": 55, + "effective_few_shots": 10.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hashes": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "c54ff61ad0273dd7", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-anatomy|5": { + "hashes": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "be31a1e22aef5f90", + "hash_cont_tokens": "f11971a765cb609f" + }, + "truncated": 0, + "non-truncated": 540, + "padded": 540, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-astronomy|5": { + "hashes": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "277a7b1fad566940", + "hash_cont_tokens": "bf30e5d3f48250cb" + }, + "truncated": 0, + "non-truncated": 608, + 
"padded": 608, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-business_ethics|5": { + "hashes": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "ba552605bc116de5", + "hash_cont_tokens": "bc1dd9b2d995eb61" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hashes": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "428c7563d0b98ab9", + "hash_cont_tokens": "890a119624b3b935" + }, + "truncated": 0, + "non-truncated": 1060, + "padded": 1060, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_biology|5": { + "hashes": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "da036601573942e2", + "hash_cont_tokens": "875cde3af7a0ee14" + }, + "truncated": 0, + "non-truncated": 576, + "padded": 576, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_chemistry|5": { + "hashes": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "94e0196d6aded13d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_computer_science|5": { + "hashes": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "6e4d0f4a8d36690b", + "hash_cont_tokens": "ffc0fe414cdc4a83" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_mathematics|5": { + "hashes": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "614054d17109a25d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_medicine|5": { + "hashes": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "081bb2b524defd1c", + "hash_cont_tokens": "1f88b00d41957d82" + }, + "truncated": 0, + "non-truncated": 692, + "padded": 692, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_physics|5": { + "hashes": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "5421d9a1af86cbd4", + "hash_cont_tokens": "f7b8097afc16a47c" + }, + "truncated": 0, + "non-truncated": 408, + "padded": 408, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-computer_security|5": { + "hashes": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "5e6b70ecb333cf18", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-conceptual_physics|5": 
{ + "hashes": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "c2ef11a87264ceed", + "hash_cont_tokens": "aa0e8bc655f2f641" + }, + "truncated": 0, + "non-truncated": 940, + "padded": 940, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-econometrics|5": { + "hashes": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "ecaccd912a4c3978", + "hash_cont_tokens": "bfb7e3c3c88313f1" + }, + "truncated": 0, + "non-truncated": 456, + "padded": 456, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hashes": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "1590c84291399be8", + "hash_cont_tokens": "2425a3f084a591ef" + }, + "truncated": 0, + "non-truncated": 580, + "padded": 580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hashes": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "3269597f715b0da1", + "hash_cont_tokens": "f52691aef15a407b" + }, + "truncated": 0, + "non-truncated": 1512, + "padded": 1512, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-formal_logic|5": { + "hashes": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "a2800d20f3ab8d7c", + "hash_cont_tokens": "f515d598d9c21263" + }, + "truncated": 0, + "non-truncated": 504, + "padded": 504, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-global_facts|5": { + "hashes": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "94ed44b3772505ad", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_biology|5": { + "hashes": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "24423acb928db768", + "hash_cont_tokens": "bd85a4156a3613ee" + }, + "truncated": 0, + "non-truncated": 1240, + "padded": 1240, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hashes": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "831ff35c474e5cef", + "hash_cont_tokens": "a95c97af1c14e068" + }, + "truncated": 0, + "non-truncated": 812, + "padded": 812, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hashes": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "a20a96b44dcc5b30", + "hash_cont_tokens": "8abfedef914e33c9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hashes": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": 
"5002f4ac8b1562ca", + "hash_cont_tokens": "674fc454bdc5ac93" + }, + "truncated": 0, + "non-truncated": 660, + "padded": 656, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_geography|5": { + "hashes": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "7c5547c7da5bc793", + "hash_cont_tokens": "03a5012b916274ea" + }, + "truncated": 0, + "non-truncated": 792, + "padded": 792, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hashes": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "f62991cb6a496b05", + "hash_cont_tokens": "a83effb8f76b7d7c" + }, + "truncated": 0, + "non-truncated": 772, + "padded": 772, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hashes": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "4cef2aff6e3d59ed", + "hash_cont_tokens": "c583432ad27fcfe0" + }, + "truncated": 0, + "non-truncated": 1560, + "padded": 1560, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hashes": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "6e2577ea4082ed2b", + "hash_cont_tokens": "24f5dc613660300b" + }, + "truncated": 0, + "non-truncated": 1080, + "padded": 1080, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hashes": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "c5fc9aeb1079c8e4", + "hash_cont_tokens": "f47f041de50333b9" + }, + "truncated": 0, + "non-truncated": 952, + "padded": 952, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_physics|5": { + "hashes": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "555fc385cffa84ca", + "hash_cont_tokens": "ba2efcd283e938cc" + }, + "truncated": 0, + "non-truncated": 604, + "padded": 604, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hashes": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "febd23cbf9973b7f", + "hash_cont_tokens": "942069cd363844d9" + }, + "truncated": 0, + "non-truncated": 2180, + "padded": 2180, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hashes": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "400e55b56ee6fbd7", + "hash_cont_tokens": "955ed42b6f7fa019" + }, + "truncated": 0, + "non-truncated": 864, + "padded": 864, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hashes": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "c639cce12a46ebad", + "hash_cont_tokens": "cdd0b3dc06d933e5" + }, + "truncated": 0, 
+ "non-truncated": 816, + "padded": 816, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hashes": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "b9762065cce6f3a6", + "hash_cont_tokens": "9a864184946033ac" + }, + "truncated": 0, + "non-truncated": 948, + "padded": 948, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_aging|5": { + "hashes": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "541a75f071dcf579", + "hash_cont_tokens": "142a4a8a1138a214" + }, + "truncated": 0, + "non-truncated": 892, + "padded": 892, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_sexuality|5": { + "hashes": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "04269e5c5a257dd9", + "hash_cont_tokens": "bc54813e809b796d" + }, + "truncated": 0, + "non-truncated": 524, + "padded": 524, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-international_law|5": { + "hashes": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "d93ba9d9d38e4397", + "hash_cont_tokens": "dc45b45fcda18e5d" + }, + "truncated": 0, + "non-truncated": 484, + "padded": 484, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-jurisprudence|5": { + "hashes": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "9eeaccd2698b4f5a", + "hash_cont_tokens": "e3a8cd951b6e3469" + }, + "truncated": 0, + "non-truncated": 432, + "padded": 432, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hashes": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "b4f08f544f2b7576", + "hash_cont_tokens": "1e80dbd30f6453d5" + }, + "truncated": 0, + "non-truncated": 652, + "padded": 648, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-machine_learning|5": { + "hashes": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "900c2a51f1174b9f", + "hash_cont_tokens": "9b37da7777378ca9" + }, + "truncated": 0, + "non-truncated": 448, + "padded": 448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-management|5": { + "hashes": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "6b36efb4689c6eca", + "hash_cont_tokens": "a01d6d39a83c4597" + }, + "truncated": 0, + "non-truncated": 412, + "padded": 412, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-marketing|5": { + "hashes": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "2aaac78a0cfed47a", + "hash_cont_tokens": "6aeaed4d823c98aa" + }, + "truncated": 0, + "non-truncated": 936, + "padded": 936, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-medical_genetics|5": 
{ + "hashes": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "886ca823b41c094a", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-miscellaneous|5": { + "hashes": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "72fd71de7675e7d0", + "hash_cont_tokens": "9b0ab02a64603081" + }, + "truncated": 0, + "non-truncated": 3132, + "padded": 3132, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_disputes|5": { + "hashes": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "f3ca0dd8e7a1eb09", + "hash_cont_tokens": "8badf768f7b0467a" + }, + "truncated": 0, + "non-truncated": 1384, + "padded": 1354, + "non-padded": 30, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hashes": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "3e793631e951f23c", + "hash_cont_tokens": "32ae620376b2bbba" + }, + "truncated": 0, + "non-truncated": 3580, + "padded": 3580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-nutrition|5": { + "hashes": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "59753c2144ea93af", + "hash_cont_tokens": "3071def75bacc404" + }, + "truncated": 0, + "non-truncated": 1224, + "padded": 1224, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-philosophy|5": { + "hashes": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "bd8d3dbed15a8c34", + "hash_cont_tokens": "9f6ff69d23a48783" + }, + "truncated": 0, + "non-truncated": 1244, + "padded": 1244, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-prehistory|5": { + "hashes": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "3573cd87facbb7c5", + "hash_cont_tokens": "de469d2b981e32a3" + }, + "truncated": 0, + "non-truncated": 1296, + "padded": 1296, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_accounting|5": { + "hashes": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "17e721bc1a7cbb47", + "hash_cont_tokens": "c46f74d2dfc7b13b" + }, + "truncated": 0, + "non-truncated": 1128, + "padded": 1128, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_law|5": { + "hashes": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "c9f7583fff66d361", + "hash_cont_tokens": "2e590029ef41fbcd" + }, + "truncated": 0, + "non-truncated": 6136, + "padded": 6136, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_medicine|5": { + "hashes": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "40a933f829116f8d", + 
"hash_cont_tokens": "fe35cfa9c6ca802e" + }, + "truncated": 0, + "non-truncated": 1088, + "padded": 1088, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_psychology|5": { + "hashes": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "0dfb73a8eb3f692c", + "hash_cont_tokens": "f020fbddf72c8652" + }, + "truncated": 0, + "non-truncated": 2448, + "padded": 2448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-public_relations|5": { + "hashes": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "1710c6ba4c9f3cbd", + "hash_cont_tokens": "568f585a259965c1" + }, + "truncated": 0, + "non-truncated": 440, + "padded": 440, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-security_studies|5": { + "hashes": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "32a03f1f22a6e103", + "hash_cont_tokens": "cc6fd7cccd64cd5d" + }, + "truncated": 0, + "non-truncated": 980, + "padded": 980, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-sociology|5": { + "hashes": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "828999f7624cbe7e", + "hash_cont_tokens": "c3a3bdfd177eed5b" + }, + "truncated": 0, + "non-truncated": 804, + "padded": 804, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hashes": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "42054621e718dbee", + "hash_cont_tokens": "2568d0e8e36fa959" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-virology|5": { + "hashes": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "6c4f0aa4dc859c04", + "hash_cont_tokens": "926cf60b0891f374" + }, + "truncated": 0, + "non-truncated": 664, + "padded": 664, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-world_religions|5": { + "hashes": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "6c75d44e092ff24f", + "hash_cont_tokens": "c525a5de974c1ea3" + }, + "truncated": 0, + "non-truncated": 684, + "padded": 684, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|truthfulqa:mc|0": { + "hashes": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "2738d7ed7075faa7", + "hash_cont_tokens": "c014154380b74b9e" + }, + "truncated": 0, + "non-truncated": 9996, + "padded": 9996, + "non-padded": 0, + "effective_few_shots": 0.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "d84d18e9a963753d", + "hash_full_prompts": "12b540783521a8e6", + "hash_input_tokens": "5c73a7dce6ccf737", + "hash_cont_tokens": "fb1646e2bdd5fc38" + }, + "total_evaluation_time_secondes": "4061.5287792682648", + "truncated": 0, + "non-truncated": 111019, + "padded": 110926, + "non-padded": 93, + 
"num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/clibrain/Llama-2-7b-ft-instruct-es/results_2023-09-17T14-05-09.748904.json b/eval-results/clibrain/Llama-2-7b-ft-instruct-es/results_2023-09-17T14-05-09.748904.json new file mode 100644 index 0000000000000000000000000000000000000000..aa877d18c139c2fdeb3ac9f84f0f39b7d8e5456a --- /dev/null +++ b/eval-results/clibrain/Llama-2-7b-ft-instruct-es/results_2023-09-17T14-05-09.748904.json @@ -0,0 +1,107 @@ +{ + "config_general": { + "model_name": "clibrain/Llama-2-7b-ft-instruct-es", + "model_sha": "b62f431c88b232204ea7046f9d906ae1daa68437", + "model_size": "12.61 GB", + "model_dtype": "torch.float16", + "lighteval_sha": "0f318ecf002208468154899217b3ba7c6ae09374", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|drop|3": { + "em": 0.001363255033557047, + "em_stderr": 0.00037786091964606556, + "f1": 0.059617239932886215, + "f1_stderr": 0.0013507073733013888 + }, + "harness|gsm8k|5": { + "acc": 0.05686125852918878, + "acc_stderr": 0.006378790242099664 + }, + "harness|winogrande|5": { + "acc": 0.7521704814522494, + "acc_stderr": 0.01213438601986535 + }, + "all": { + "em": 0.001363255033557047, + "em_stderr": 0.00037786091964606556, + "f1": 0.059617239932886215, + "f1_stderr": 0.0013507073733013888, + "acc": 0.4045158699907191, + "acc_stderr": 0.009256588130982506 + } + }, + "versions": { + "harness|drop|3": 1, + "harness|gsm8k|5": 0, + "harness|winogrande|5": 0, + "all": 0 + }, + "config_tasks": { + "harness|drop": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|drop|3": { + "hashes": { + "hash_examples": "1d27416e8324e9a3", + "hash_full_prompts": "a5513ff9a741b385", + "hash_input_tokens": "42076f0efbb50aa6", + "hash_cont_tokens": "7b59a930c98053f7" + }, + "truncated": 3, + "non-truncated": 9533, + "padded": 0, + "non-padded": 9536, + "effective_few_shots": 3.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "bda342e47b5099b2", + "hash_cont_tokens": "96b66af63c0361be" + }, + "truncated": 0, + "non-truncated": 1319, + "padded": 0, + "non-padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "c0bedf98cb040854", + "hash_cont_tokens": "f08975ad6f2d5864" + }, + "truncated": 0, + "non-truncated": 2534, + "padded": 2432, + "non-padded": 102, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "9b4d8993161e637d", + "hash_full_prompts": "08215e527b7e60a5", + "hash_input_tokens": "a12f3e3c934bd78b", + "hash_cont_tokens": "04c243a424e6a955" + }, + "total_evaluation_time_secondes": "10256.67344713211", + "truncated": 3, + "non-truncated": 13386, + "padded": 2432, + "non-padded": 10957, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/clibrain/Llama-2-ft-instruct-es/results_2023-08-25T19-36-08.180753.json b/eval-results/clibrain/Llama-2-ft-instruct-es/results_2023-08-25T19-36-08.180753.json new file mode 100644 index 0000000000000000000000000000000000000000..59d0d9f527b82d533a9e5c2888f84f8295605818 --- /dev/null +++ 
b/eval-results/clibrain/Llama-2-ft-instruct-es/results_2023-08-25T19-36-08.180753.json @@ -0,0 +1,1366 @@ +{ + "config_general": { + "model_name": "clibrain/Llama-2-ft-instruct-es", + "model_sha": "42f07d6a86fac5574febb7b8fa13c3b1e14fcebd", + "model_dtype": "torch.float16", + "lighteval_sha": "578835f70c499eaf870208de093513e08f864581", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|arc:challenge|25": { + "acc": 0.22696245733788395, + "acc_stderr": 0.012240491536132861, + "acc_norm": 0.22696245733788395, + "acc_norm_stderr": 0.012240491536132861 + }, + "harness|hellaswag|10": { + "acc": 0.2504481179047998, + "acc_stderr": 0.004323856300539177, + "acc_norm": 0.2504481179047998, + "acc_norm_stderr": 0.004323856300539177 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.22, + "acc_stderr": 0.04163331998932268, + "acc_norm": 0.22, + "acc_norm_stderr": 0.04163331998932268 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.18518518518518517, + "acc_stderr": 0.03355677216313142, + "acc_norm": 0.18518518518518517, + "acc_norm_stderr": 0.03355677216313142 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.17763157894736842, + "acc_stderr": 0.031103182383123398, + "acc_norm": 0.17763157894736842, + "acc_norm_stderr": 0.031103182383123398 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.3, + "acc_stderr": 0.046056618647183814, + "acc_norm": 0.3, + "acc_norm_stderr": 0.046056618647183814 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.21509433962264152, + "acc_stderr": 0.02528839450289137, + "acc_norm": 0.21509433962264152, + "acc_norm_stderr": 0.02528839450289137 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.2569444444444444, + "acc_stderr": 0.03653946969442099, + "acc_norm": 0.2569444444444444, + "acc_norm_stderr": 0.03653946969442099 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.2, + "acc_stderr": 0.04020151261036845, + "acc_norm": 0.2, + "acc_norm_stderr": 0.04020151261036845 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.26, + "acc_stderr": 0.0440844002276808, + "acc_norm": 0.26, + "acc_norm_stderr": 0.0440844002276808 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.21, + "acc_stderr": 0.040936018074033256, + "acc_norm": 0.21, + "acc_norm_stderr": 0.040936018074033256 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.20809248554913296, + "acc_stderr": 0.030952890217749874, + "acc_norm": 0.20809248554913296, + "acc_norm_stderr": 0.030952890217749874 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.21568627450980393, + "acc_stderr": 0.04092563958237654, + "acc_norm": 0.21568627450980393, + "acc_norm_stderr": 0.04092563958237654 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.28, + "acc_stderr": 0.045126085985421276, + "acc_norm": 0.28, + "acc_norm_stderr": 0.045126085985421276 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.26382978723404255, + "acc_stderr": 0.028809989854102973, + "acc_norm": 0.26382978723404255, + "acc_norm_stderr": 0.028809989854102973 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.23684210526315788, + "acc_stderr": 0.039994238792813365, + "acc_norm": 0.23684210526315788, + "acc_norm_stderr": 0.039994238792813365 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.2413793103448276, + "acc_stderr": 0.03565998174135302, + "acc_norm": 0.2413793103448276, 
+ "acc_norm_stderr": 0.03565998174135302 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.20899470899470898, + "acc_stderr": 0.02094048156533486, + "acc_norm": 0.20899470899470898, + "acc_norm_stderr": 0.02094048156533486 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.2857142857142857, + "acc_stderr": 0.04040610178208841, + "acc_norm": 0.2857142857142857, + "acc_norm_stderr": 0.04040610178208841 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.18, + "acc_stderr": 0.038612291966536934, + "acc_norm": 0.18, + "acc_norm_stderr": 0.038612291966536934 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.1774193548387097, + "acc_stderr": 0.02173254068932927, + "acc_norm": 0.1774193548387097, + "acc_norm_stderr": 0.02173254068932927 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.15270935960591134, + "acc_stderr": 0.02530890453938063, + "acc_norm": 0.15270935960591134, + "acc_norm_stderr": 0.02530890453938063 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.25, + "acc_stderr": 0.04351941398892446, + "acc_norm": 0.25, + "acc_norm_stderr": 0.04351941398892446 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.21818181818181817, + "acc_stderr": 0.03225078108306289, + "acc_norm": 0.21818181818181817, + "acc_norm_stderr": 0.03225078108306289 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.17676767676767677, + "acc_stderr": 0.027178752639044915, + "acc_norm": 0.17676767676767677, + "acc_norm_stderr": 0.027178752639044915 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.19689119170984457, + "acc_stderr": 0.028697873971860664, + "acc_norm": 0.19689119170984457, + "acc_norm_stderr": 0.028697873971860664 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.20256410256410257, + "acc_stderr": 0.020377660970371372, + "acc_norm": 0.20256410256410257, + "acc_norm_stderr": 0.020377660970371372 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.2111111111111111, + "acc_stderr": 0.024882116857655075, + "acc_norm": 0.2111111111111111, + "acc_norm_stderr": 0.024882116857655075 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.21008403361344538, + "acc_stderr": 0.026461398717471874, + "acc_norm": 0.21008403361344538, + "acc_norm_stderr": 0.026461398717471874 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.1986754966887417, + "acc_stderr": 0.03257847384436776, + "acc_norm": 0.1986754966887417, + "acc_norm_stderr": 0.03257847384436776 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.1926605504587156, + "acc_stderr": 0.016909276884936094, + "acc_norm": 0.1926605504587156, + "acc_norm_stderr": 0.016909276884936094 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.1527777777777778, + "acc_stderr": 0.024536326026134224, + "acc_norm": 0.1527777777777778, + "acc_norm_stderr": 0.024536326026134224 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.25, + "acc_stderr": 0.03039153369274154, + "acc_norm": 0.25, + "acc_norm_stderr": 0.03039153369274154 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.270042194092827, + "acc_stderr": 0.028900721906293426, + "acc_norm": 0.270042194092827, + "acc_norm_stderr": 0.028900721906293426 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.31390134529147984, + "acc_stderr": 0.031146796482972465, + "acc_norm": 0.31390134529147984, + 
"acc_norm_stderr": 0.031146796482972465 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.2595419847328244, + "acc_stderr": 0.03844876139785271, + "acc_norm": 0.2595419847328244, + "acc_norm_stderr": 0.03844876139785271 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.2396694214876033, + "acc_stderr": 0.03896878985070417, + "acc_norm": 0.2396694214876033, + "acc_norm_stderr": 0.03896878985070417 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.25925925925925924, + "acc_stderr": 0.042365112580946336, + "acc_norm": 0.25925925925925924, + "acc_norm_stderr": 0.042365112580946336 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.22085889570552147, + "acc_stderr": 0.032591773927421776, + "acc_norm": 0.22085889570552147, + "acc_norm_stderr": 0.032591773927421776 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.3125, + "acc_stderr": 0.043994650575715215, + "acc_norm": 0.3125, + "acc_norm_stderr": 0.043994650575715215 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.17475728155339806, + "acc_stderr": 0.037601780060266224, + "acc_norm": 0.17475728155339806, + "acc_norm_stderr": 0.037601780060266224 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.2905982905982906, + "acc_stderr": 0.02974504857267404, + "acc_norm": 0.2905982905982906, + "acc_norm_stderr": 0.02974504857267404 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.3, + "acc_stderr": 0.046056618647183814, + "acc_norm": 0.3, + "acc_norm_stderr": 0.046056618647183814 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.23754789272030652, + "acc_stderr": 0.015218733046150193, + "acc_norm": 0.23754789272030652, + "acc_norm_stderr": 0.015218733046150193 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.24855491329479767, + "acc_stderr": 0.023267528432100174, + "acc_norm": 0.24855491329479767, + "acc_norm_stderr": 0.023267528432100174 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.23798882681564246, + "acc_stderr": 0.014242630070574915, + "acc_norm": 0.23798882681564246, + "acc_norm_stderr": 0.014242630070574915 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.22549019607843138, + "acc_stderr": 0.023929155517351284, + "acc_norm": 0.22549019607843138, + "acc_norm_stderr": 0.023929155517351284 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.1864951768488746, + "acc_stderr": 0.02212243977248077, + "acc_norm": 0.1864951768488746, + "acc_norm_stderr": 0.02212243977248077 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.21604938271604937, + "acc_stderr": 0.022899162918445806, + "acc_norm": 0.21604938271604937, + "acc_norm_stderr": 0.022899162918445806 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.23404255319148937, + "acc_stderr": 0.025257861359432417, + "acc_norm": 0.23404255319148937, + "acc_norm_stderr": 0.025257861359432417 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.2457627118644068, + "acc_stderr": 0.010996156635142692, + "acc_norm": 0.2457627118644068, + "acc_norm_stderr": 0.010996156635142692 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.18382352941176472, + "acc_stderr": 0.023529242185193106, + "acc_norm": 0.18382352941176472, + "acc_norm_stderr": 0.023529242185193106 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.25, + "acc_stderr": 0.01751781884501444, + "acc_norm": 0.25, + "acc_norm_stderr": 0.01751781884501444 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 
0.21818181818181817, + "acc_stderr": 0.03955932861795833, + "acc_norm": 0.21818181818181817, + "acc_norm_stderr": 0.03955932861795833 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.18775510204081633, + "acc_stderr": 0.02500025603954621, + "acc_norm": 0.18775510204081633, + "acc_norm_stderr": 0.02500025603954621 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.24378109452736318, + "acc_stderr": 0.03036049015401465, + "acc_norm": 0.24378109452736318, + "acc_norm_stderr": 0.03036049015401465 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.28, + "acc_stderr": 0.04512608598542128, + "acc_norm": 0.28, + "acc_norm_stderr": 0.04512608598542128 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.28313253012048195, + "acc_stderr": 0.03507295431370518, + "acc_norm": 0.28313253012048195, + "acc_norm_stderr": 0.03507295431370518 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.3216374269005848, + "acc_stderr": 0.03582529442573122, + "acc_norm": 0.3216374269005848, + "acc_norm_stderr": 0.03582529442573122 + }, + "harness|truthfulqa:mc|0": { + "mc1": 1.0, + "mc1_stderr": 0.0, + "mc2": NaN, + "mc2_stderr": NaN + }, + "all": { + "acc": 0.2314240573187148, + "acc_stderr": 0.03071122006512167, + "acc_norm": 0.2314240573187148, + "acc_norm_stderr": 0.03071122006512167, + "mc1": 1.0, + "mc1_stderr": 0.0, + "mc2": NaN, + "mc2_stderr": NaN + } + }, + "versions": { + "harness|arc:challenge|25": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + 
"harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "all": 0 + }, + "config_tasks": { + "harness|arc:challenge": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + 
"harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task" + }, + "summary_tasks": { + "harness|arc:challenge|25": { + "hashes": { + "hash_examples": "17b0cae357c0259e", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "61571bf68d6d89aa", + "hash_cont_tokens": "ede2b335438f08e9" + }, + "truncated": 0, + "non-truncated": 4687, + "padded": 4687, + "non-padded": 0, + "effective_few_shots": 25.0, + "num_truncated_few_shots": 0 + }, + "harness|hellaswag|10": { + "hashes": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "29906669b1c7054a", + "hash_cont_tokens": "b41cf1ad182d68d5" + }, + "truncated": 0, + "non-truncated": 40168, + "padded": 40113, + "non-padded": 55, + "effective_few_shots": 10.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hashes": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "c54ff61ad0273dd7", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-anatomy|5": { + "hashes": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "be31a1e22aef5f90", + "hash_cont_tokens": "f11971a765cb609f" + }, + "truncated": 0, + "non-truncated": 540, + "padded": 540, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-astronomy|5": { + "hashes": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "277a7b1fad566940", + "hash_cont_tokens": "238bd86950544b29" + }, + "truncated": 0, + "non-truncated": 608, + "padded": 608, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-business_ethics|5": { + "hashes": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "ba552605bc116de5", + "hash_cont_tokens": "f9d6d2a7d7e9a041" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hashes": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + 
"hash_input_tokens": "428c7563d0b98ab9", + "hash_cont_tokens": "6af58623d0d5fbcd" + }, + "truncated": 0, + "non-truncated": 1060, + "padded": 1060, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_biology|5": { + "hashes": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "da036601573942e2", + "hash_cont_tokens": "875cde3af7a0ee14" + }, + "truncated": 0, + "non-truncated": 576, + "padded": 576, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_chemistry|5": { + "hashes": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "94e0196d6aded13d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_computer_science|5": { + "hashes": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "6e4d0f4a8d36690b", + "hash_cont_tokens": "1ba0c71186b1505e" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_mathematics|5": { + "hashes": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "614054d17109a25d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_medicine|5": { + "hashes": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "1d633b3cc0524ba8", + "hash_cont_tokens": "702fb6d82ff0d6ac" + }, + "truncated": 0, + "non-truncated": 692, + "padded": 692, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_physics|5": { + "hashes": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "5421d9a1af86cbd4", + "hash_cont_tokens": "f7b8097afc16a47c" + }, + "truncated": 0, + "non-truncated": 408, + "padded": 408, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-computer_security|5": { + "hashes": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "5e6b70ecb333cf18", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hashes": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "c2ef11a87264ceed", + "hash_cont_tokens": "aa0e8bc655f2f641" + }, + "truncated": 0, + "non-truncated": 940, + "padded": 940, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-econometrics|5": { + "hashes": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "ecaccd912a4c3978", + "hash_cont_tokens": "a9b1f761089f6acc" + }, + "truncated": 0, + "non-truncated": 456, + "padded": 456, + 
"non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hashes": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "1590c84291399be8", + "hash_cont_tokens": "2425a3f084a591ef" + }, + "truncated": 0, + "non-truncated": 580, + "padded": 580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hashes": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "3269597f715b0da1", + "hash_cont_tokens": "eb2d5002052b5bc5" + }, + "truncated": 0, + "non-truncated": 1512, + "padded": 1512, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-formal_logic|5": { + "hashes": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "a2800d20f3ab8d7c", + "hash_cont_tokens": "9b30dc19c9b62f60" + }, + "truncated": 0, + "non-truncated": 504, + "padded": 504, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-global_facts|5": { + "hashes": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "94ed44b3772505ad", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_biology|5": { + "hashes": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "24423acb928db768", + "hash_cont_tokens": "74217a4e2868536f" + }, + "truncated": 0, + "non-truncated": 1240, + "padded": 1240, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hashes": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "831ff35c474e5cef", + "hash_cont_tokens": "bf39544be0ebf000" + }, + "truncated": 0, + "non-truncated": 812, + "padded": 812, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hashes": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "8c34e0f2bda77358", + "hash_cont_tokens": "43570b3948564b64" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hashes": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "f1f73dd687da18d7", + "hash_cont_tokens": "674fc454bdc5ac93" + }, + "truncated": 660, + "non-truncated": 0, + "padded": 0, + "non-padded": 660, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_geography|5": { + "hashes": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "7c5547c7da5bc793", + "hash_cont_tokens": "03a5012b916274ea" + }, + "truncated": 0, + "non-truncated": 792, + "padded": 792, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + 
"harness|hendrycksTest-high_school_government_and_politics|5": { + "hashes": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "f62991cb6a496b05", + "hash_cont_tokens": "50ab225c2f535210" + }, + "truncated": 0, + "non-truncated": 772, + "padded": 772, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hashes": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "4cef2aff6e3d59ed", + "hash_cont_tokens": "c583432ad27fcfe0" + }, + "truncated": 0, + "non-truncated": 1560, + "padded": 1560, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hashes": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "6e2577ea4082ed2b", + "hash_cont_tokens": "1194078d4e38c984" + }, + "truncated": 0, + "non-truncated": 1080, + "padded": 1080, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hashes": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "c5fc9aeb1079c8e4", + "hash_cont_tokens": "f47f041de50333b9" + }, + "truncated": 0, + "non-truncated": 952, + "padded": 952, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_physics|5": { + "hashes": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "555fc385cffa84ca", + "hash_cont_tokens": "6296151cf7fee15c" + }, + "truncated": 0, + "non-truncated": 604, + "padded": 604, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hashes": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "febd23cbf9973b7f", + "hash_cont_tokens": "a490d3db0ea5935a" + }, + "truncated": 0, + "non-truncated": 2180, + "padded": 2180, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hashes": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "424b02981230ee83", + "hash_cont_tokens": "6830ef7d0325d7ef" + }, + "truncated": 0, + "non-truncated": 864, + "padded": 864, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hashes": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "50c9ff438c85a69e", + "hash_cont_tokens": "cdd0b3dc06d933e5" + }, + "truncated": 816, + "non-truncated": 0, + "padded": 0, + "non-padded": 816, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hashes": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "054824cc474caef5", + "hash_cont_tokens": "e0203e3fc1bb0500" + }, + "truncated": 8, + "non-truncated": 940, + "padded": 940, + "non-padded": 8, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_aging|5": { + "hashes": { + "hash_examples": 
"1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "541a75f071dcf579", + "hash_cont_tokens": "142a4a8a1138a214" + }, + "truncated": 0, + "non-truncated": 892, + "padded": 892, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_sexuality|5": { + "hashes": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "04269e5c5a257dd9", + "hash_cont_tokens": "bc54813e809b796d" + }, + "truncated": 0, + "non-truncated": 524, + "padded": 524, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-international_law|5": { + "hashes": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "d93ba9d9d38e4397", + "hash_cont_tokens": "63435df622d5437b" + }, + "truncated": 0, + "non-truncated": 484, + "padded": 484, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-jurisprudence|5": { + "hashes": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "9eeaccd2698b4f5a", + "hash_cont_tokens": "e3a8cd951b6e3469" + }, + "truncated": 0, + "non-truncated": 432, + "padded": 432, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hashes": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "b4f08f544f2b7576", + "hash_cont_tokens": "5e6ee2ff0404f23c" + }, + "truncated": 0, + "non-truncated": 652, + "padded": 648, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-machine_learning|5": { + "hashes": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "900c2a51f1174b9f", + "hash_cont_tokens": "c81919424db3b267" + }, + "truncated": 0, + "non-truncated": 448, + "padded": 448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-management|5": { + "hashes": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "6b36efb4689c6eca", + "hash_cont_tokens": "a01d6d39a83c4597" + }, + "truncated": 0, + "non-truncated": 412, + "padded": 412, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-marketing|5": { + "hashes": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "2aaac78a0cfed47a", + "hash_cont_tokens": "6aeaed4d823c98aa" + }, + "truncated": 0, + "non-truncated": 936, + "padded": 936, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-medical_genetics|5": { + "hashes": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "886ca823b41c094a", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-miscellaneous|5": { + "hashes": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "72fd71de7675e7d0", + "hash_cont_tokens": "9b0ab02a64603081" + }, + "truncated": 0, + 
"non-truncated": 3132, + "padded": 3132, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_disputes|5": { + "hashes": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "f3ca0dd8e7a1eb09", + "hash_cont_tokens": "3b8bbe9108e55ce9" + }, + "truncated": 0, + "non-truncated": 1384, + "padded": 1354, + "non-padded": 30, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hashes": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "3e793631e951f23c", + "hash_cont_tokens": "2eae753a177d5460" + }, + "truncated": 0, + "non-truncated": 3580, + "padded": 3580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-nutrition|5": { + "hashes": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "59753c2144ea93af", + "hash_cont_tokens": "29771089bd3c65c6" + }, + "truncated": 0, + "non-truncated": 1224, + "padded": 1224, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-philosophy|5": { + "hashes": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "bd8d3dbed15a8c34", + "hash_cont_tokens": "9f6ff69d23a48783" + }, + "truncated": 0, + "non-truncated": 1244, + "padded": 1244, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-prehistory|5": { + "hashes": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "3573cd87facbb7c5", + "hash_cont_tokens": "a789a13af22308bf" + }, + "truncated": 0, + "non-truncated": 1296, + "padded": 1296, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_accounting|5": { + "hashes": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "17e721bc1a7cbb47", + "hash_cont_tokens": "5129a9cfb30c5239" + }, + "truncated": 0, + "non-truncated": 1128, + "padded": 1128, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_law|5": { + "hashes": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "9178e10bd0763ec4", + "hash_cont_tokens": "2e590029ef41fbcd" + }, + "truncated": 604, + "non-truncated": 5532, + "padded": 5524, + "non-padded": 612, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_medicine|5": { + "hashes": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "f5a22012a54f70ea", + "hash_cont_tokens": "cd82e108370cece8" + }, + "truncated": 0, + "non-truncated": 1088, + "padded": 1088, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_psychology|5": { + "hashes": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "0dfb73a8eb3f692c", + "hash_cont_tokens": "61ef0c8a87f9c92d" + }, + "truncated": 0, + "non-truncated": 2448, + "padded": 2448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + 
"harness|hendrycksTest-public_relations|5": { + "hashes": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "1710c6ba4c9f3cbd", + "hash_cont_tokens": "568f585a259965c1" + }, + "truncated": 0, + "non-truncated": 440, + "padded": 440, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-security_studies|5": { + "hashes": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "d49711415961ced7", + "hash_cont_tokens": "d70cfe096d4fb7bd" + }, + "truncated": 0, + "non-truncated": 980, + "padded": 980, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-sociology|5": { + "hashes": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "828999f7624cbe7e", + "hash_cont_tokens": "c3a3bdfd177eed5b" + }, + "truncated": 0, + "non-truncated": 804, + "padded": 804, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hashes": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "42054621e718dbee", + "hash_cont_tokens": "2568d0e8e36fa959" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-virology|5": { + "hashes": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "6c4f0aa4dc859c04", + "hash_cont_tokens": "c178cccd753d9bc5" + }, + "truncated": 0, + "non-truncated": 664, + "padded": 664, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-world_religions|5": { + "hashes": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "6c75d44e092ff24f", + "hash_cont_tokens": "0a3a3ea5ef49d19c" + }, + "truncated": 0, + "non-truncated": 684, + "padded": 684, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|truthfulqa:mc|0": { + "hashes": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "2738d7ed7075faa7", + "hash_cont_tokens": "6d1691881e252df0" + }, + "truncated": 0, + "non-truncated": 9996, + "padded": 9996, + "non-padded": 0, + "effective_few_shots": 0.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "d84d18e9a963753d", + "hash_full_prompts": "12b540783521a8e6", + "hash_input_tokens": "6fecf578c508db6a", + "hash_cont_tokens": "f4b7b7f3a2788768" + }, + "total_evaluation_time_secondes": "4466.601140499115", + "truncated": 2088, + "non-truncated": 108931, + "padded": 108834, + "non-padded": 2185, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/clibrain/Llama-2-ft-instruct-es/results_2023-09-17T17-59-02.863865.json b/eval-results/clibrain/Llama-2-ft-instruct-es/results_2023-09-17T17-59-02.863865.json new file mode 100644 index 0000000000000000000000000000000000000000..91888b598169d22235e76c33b6290bd49c29d4fc --- /dev/null +++ b/eval-results/clibrain/Llama-2-ft-instruct-es/results_2023-09-17T17-59-02.863865.json @@ -0,0 +1,107 @@ +{ + "config_general": { + "model_name": "clibrain/Llama-2-ft-instruct-es", + "model_sha": 
"42f07d6a86fac5574febb7b8fa13c3b1e14fcebd", + "model_size": "12.58 GB", + "model_dtype": "torch.float16", + "lighteval_sha": "0f318ecf002208468154899217b3ba7c6ae09374", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|drop|3": { + "em": 0.0, + "em_stderr": 0.0, + "f1": 0.0, + "f1_stderr": 0.0 + }, + "harness|gsm8k|5": { + "acc": 0.0, + "acc_stderr": 0.0 + }, + "harness|winogrande|5": { + "acc": 0.4956590370955012, + "acc_stderr": 0.014051956064076911 + }, + "all": { + "em": 0.0, + "em_stderr": 0.0, + "f1": 0.0, + "f1_stderr": 0.0, + "acc": 0.2478295185477506, + "acc_stderr": 0.007025978032038456 + } + }, + "versions": { + "harness|drop|3": 1, + "harness|gsm8k|5": 0, + "harness|winogrande|5": 0, + "all": 0 + }, + "config_tasks": { + "harness|drop": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|drop|3": { + "hashes": { + "hash_examples": "1d27416e8324e9a3", + "hash_full_prompts": "a5513ff9a741b385", + "hash_input_tokens": "61b608e0b5ceed76", + "hash_cont_tokens": "d62a3b26770557a9" + }, + "truncated": 1263, + "non-truncated": 8273, + "padded": 0, + "non-padded": 9536, + "effective_few_shots": 3.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "bda342e47b5099b2", + "hash_cont_tokens": "8401e6188d643544" + }, + "truncated": 0, + "non-truncated": 1319, + "padded": 0, + "non-padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "c0bedf98cb040854", + "hash_cont_tokens": "f08975ad6f2d5864" + }, + "truncated": 0, + "non-truncated": 2534, + "padded": 2432, + "non-padded": 102, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "9b4d8993161e637d", + "hash_full_prompts": "08215e527b7e60a5", + "hash_input_tokens": "80afe720f936f8d2", + "hash_cont_tokens": "f150732b0323f26d" + }, + "total_evaluation_time_secondes": "60270.98185944557", + "truncated": 1263, + "non-truncated": 12126, + "padded": 2432, + "non-padded": 10957, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/clibrain/Llama-2-ft-instruct-es/results_2023-12-02T16-47-05.366390.json b/eval-results/clibrain/Llama-2-ft-instruct-es/results_2023-12-02T16-47-05.366390.json new file mode 100644 index 0000000000000000000000000000000000000000..a392edc16a85fff65578d227bfc1f5f2429a01ec --- /dev/null +++ b/eval-results/clibrain/Llama-2-ft-instruct-es/results_2023-12-02T16-47-05.366390.json @@ -0,0 +1,63 @@ +{ + "config_general": { + "lighteval_sha": "b35d4d84573be82d91c07ea46260f262f72cf69d", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "", + "start_time": 648542.801651075, + "end_time": 651732.578155781, + "total_evaluation_time_secondes": "3189.7765047060093", + "model_name": "clibrain/Llama-2-ft-instruct-es", + "model_sha": "42f07d6a86fac5574febb7b8fa13c3b1e14fcebd", + "model_dtype": "torch.float16", + "model_size": "12.58 GB" + }, + "results": { + "harness|gsm8k|5": { + "acc": 0.0, + "acc_stderr": 0.0 + }, + "all": { + "acc": 0.0, + "acc_stderr": 0.0 + } + }, + "versions": { 
+ "all": 0, + "harness|gsm8k|5": 0 + }, + "config_tasks": { + "harness|gsm8k": "LM Harness task" + }, + "summary_tasks": { + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "bda342e47b5099b2", + "hash_cont_tokens": "8401e6188d643544" + }, + "truncated": 0, + "non_truncated": 1319, + "padded": 0, + "non_padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "18b756b7813d1bdf", + "hash_full_prompts": "deb3b1dff10b95aa", + "hash_input_tokens": "42036645de5ac59d", + "hash_cont_tokens": "95e452ffb745c2ae" + }, + "truncated": 0, + "non_truncated": 1319, + "padded": 0, + "non_padded": 1319, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/concedo/OPT-19M-ChatSalad/results_2023-07-19T13-30-17.272494.json b/eval-results/concedo/OPT-19M-ChatSalad/results_2023-07-19T13-30-17.272494.json new file mode 100644 index 0000000000000000000000000000000000000000..4016885bbfbcc0ae6f13333edb662906c07c9abd --- /dev/null +++ b/eval-results/concedo/OPT-19M-ChatSalad/results_2023-07-19T13-30-17.272494.json @@ -0,0 +1,871 @@ +{ + "results": { + "harness|arc:challenge|25": { + "acc": 0.2030716723549488, + "acc_stderr": 0.011755899303705582, + "acc_norm": 0.2440273037542662, + "acc_norm_stderr": 0.012551447627856255 + }, + "harness|hellaswag|10": { + "acc": 0.2591117307309301, + "acc_stderr": 0.004372516060164751, + "acc_norm": 0.2515435172276439, + "acc_norm_stderr": 0.0043301342197628444 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.22, + "acc_stderr": 0.04163331998932268, + "acc_norm": 0.22, + "acc_norm_stderr": 0.04163331998932268 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.18518518518518517, + "acc_stderr": 0.03355677216313142, + "acc_norm": 0.18518518518518517, + "acc_norm_stderr": 0.03355677216313142 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.17763157894736842, + "acc_stderr": 0.031103182383123398, + "acc_norm": 0.17763157894736842, + "acc_norm_stderr": 0.031103182383123398 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.3, + "acc_stderr": 0.046056618647183814, + "acc_norm": 0.3, + "acc_norm_stderr": 0.046056618647183814 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.21509433962264152, + "acc_stderr": 0.02528839450289137, + "acc_norm": 0.21509433962264152, + "acc_norm_stderr": 0.02528839450289137 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.2569444444444444, + "acc_stderr": 0.03653946969442099, + "acc_norm": 0.2569444444444444, + "acc_norm_stderr": 0.03653946969442099 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.2, + "acc_stderr": 0.04020151261036845, + "acc_norm": 0.2, + "acc_norm_stderr": 0.04020151261036845 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.26, + "acc_stderr": 0.0440844002276808, + "acc_norm": 0.26, + "acc_norm_stderr": 0.0440844002276808 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.21, + "acc_stderr": 0.040936018074033256, + "acc_norm": 0.21, + "acc_norm_stderr": 0.040936018074033256 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.20809248554913296, + "acc_stderr": 0.030952890217749874, + "acc_norm": 0.20809248554913296, + "acc_norm_stderr": 0.030952890217749874 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.21568627450980393, + "acc_stderr": 0.04092563958237654, + "acc_norm": 
0.21568627450980393, + "acc_norm_stderr": 0.04092563958237654 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.28, + "acc_stderr": 0.045126085985421276, + "acc_norm": 0.28, + "acc_norm_stderr": 0.045126085985421276 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.26382978723404255, + "acc_stderr": 0.028809989854102973, + "acc_norm": 0.26382978723404255, + "acc_norm_stderr": 0.028809989854102973 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.23684210526315788, + "acc_stderr": 0.039994238792813365, + "acc_norm": 0.23684210526315788, + "acc_norm_stderr": 0.039994238792813365 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.2413793103448276, + "acc_stderr": 0.03565998174135302, + "acc_norm": 0.2413793103448276, + "acc_norm_stderr": 0.03565998174135302 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.20899470899470898, + "acc_stderr": 0.02094048156533486, + "acc_norm": 0.20899470899470898, + "acc_norm_stderr": 0.02094048156533486 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.2857142857142857, + "acc_stderr": 0.04040610178208841, + "acc_norm": 0.2857142857142857, + "acc_norm_stderr": 0.04040610178208841 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.18, + "acc_stderr": 0.038612291966536934, + "acc_norm": 0.18, + "acc_norm_stderr": 0.038612291966536934 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.1774193548387097, + "acc_stderr": 0.02173254068932927, + "acc_norm": 0.1774193548387097, + "acc_norm_stderr": 0.02173254068932927 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.15270935960591134, + "acc_stderr": 0.02530890453938063, + "acc_norm": 0.15270935960591134, + "acc_norm_stderr": 0.02530890453938063 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.25, + "acc_stderr": 0.04351941398892446, + "acc_norm": 0.25, + "acc_norm_stderr": 0.04351941398892446 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.21818181818181817, + "acc_stderr": 0.03225078108306289, + "acc_norm": 0.21818181818181817, + "acc_norm_stderr": 0.03225078108306289 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.17676767676767677, + "acc_stderr": 0.027178752639044915, + "acc_norm": 0.17676767676767677, + "acc_norm_stderr": 0.027178752639044915 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.19689119170984457, + "acc_stderr": 0.028697873971860664, + "acc_norm": 0.19689119170984457, + "acc_norm_stderr": 0.028697873971860664 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.20256410256410257, + "acc_stderr": 0.020377660970371372, + "acc_norm": 0.20256410256410257, + "acc_norm_stderr": 0.020377660970371372 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.2111111111111111, + "acc_stderr": 0.024882116857655075, + "acc_norm": 0.2111111111111111, + "acc_norm_stderr": 0.024882116857655075 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.21008403361344538, + "acc_stderr": 0.026461398717471874, + "acc_norm": 0.21008403361344538, + "acc_norm_stderr": 0.026461398717471874 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.1986754966887417, + "acc_stderr": 0.03257847384436776, + "acc_norm": 0.1986754966887417, + "acc_norm_stderr": 0.03257847384436776 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.1926605504587156, + "acc_stderr": 0.016909276884936094, + "acc_norm": 
0.1926605504587156, + "acc_norm_stderr": 0.016909276884936094 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.1527777777777778, + "acc_stderr": 0.024536326026134224, + "acc_norm": 0.1527777777777778, + "acc_norm_stderr": 0.024536326026134224 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.25, + "acc_stderr": 0.03039153369274154, + "acc_norm": 0.25, + "acc_norm_stderr": 0.03039153369274154 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.270042194092827, + "acc_stderr": 0.028900721906293426, + "acc_norm": 0.270042194092827, + "acc_norm_stderr": 0.028900721906293426 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.31390134529147984, + "acc_stderr": 0.031146796482972465, + "acc_norm": 0.31390134529147984, + "acc_norm_stderr": 0.031146796482972465 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.2595419847328244, + "acc_stderr": 0.03844876139785271, + "acc_norm": 0.2595419847328244, + "acc_norm_stderr": 0.03844876139785271 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.2396694214876033, + "acc_stderr": 0.03896878985070417, + "acc_norm": 0.2396694214876033, + "acc_norm_stderr": 0.03896878985070417 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.25925925925925924, + "acc_stderr": 0.042365112580946336, + "acc_norm": 0.25925925925925924, + "acc_norm_stderr": 0.042365112580946336 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.22085889570552147, + "acc_stderr": 0.032591773927421776, + "acc_norm": 0.22085889570552147, + "acc_norm_stderr": 0.032591773927421776 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.3125, + "acc_stderr": 0.043994650575715215, + "acc_norm": 0.3125, + "acc_norm_stderr": 0.043994650575715215 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.17475728155339806, + "acc_stderr": 0.037601780060266224, + "acc_norm": 0.17475728155339806, + "acc_norm_stderr": 0.037601780060266224 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.2905982905982906, + "acc_stderr": 0.02974504857267404, + "acc_norm": 0.2905982905982906, + "acc_norm_stderr": 0.02974504857267404 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.3, + "acc_stderr": 0.046056618647183814, + "acc_norm": 0.3, + "acc_norm_stderr": 0.046056618647183814 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.23754789272030652, + "acc_stderr": 0.015218733046150193, + "acc_norm": 0.23754789272030652, + "acc_norm_stderr": 0.015218733046150193 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.24855491329479767, + "acc_stderr": 0.023267528432100174, + "acc_norm": 0.24855491329479767, + "acc_norm_stderr": 0.023267528432100174 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.23798882681564246, + "acc_stderr": 0.014242630070574915, + "acc_norm": 0.23798882681564246, + "acc_norm_stderr": 0.014242630070574915 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.22549019607843138, + "acc_stderr": 0.023929155517351284, + "acc_norm": 0.22549019607843138, + "acc_norm_stderr": 0.023929155517351284 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.1864951768488746, + "acc_stderr": 0.02212243977248077, + "acc_norm": 0.1864951768488746, + "acc_norm_stderr": 0.02212243977248077 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.21604938271604937, + "acc_stderr": 0.022899162918445806, + "acc_norm": 0.21604938271604937, + "acc_norm_stderr": 0.022899162918445806 + }, + "harness|hendrycksTest-professional_accounting|5": { + 
"acc": 0.23404255319148937, + "acc_stderr": 0.025257861359432417, + "acc_norm": 0.23404255319148937, + "acc_norm_stderr": 0.025257861359432417 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.2457627118644068, + "acc_stderr": 0.010996156635142692, + "acc_norm": 0.2457627118644068, + "acc_norm_stderr": 0.010996156635142692 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.18382352941176472, + "acc_stderr": 0.023529242185193106, + "acc_norm": 0.18382352941176472, + "acc_norm_stderr": 0.023529242185193106 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.25, + "acc_stderr": 0.01751781884501444, + "acc_norm": 0.25, + "acc_norm_stderr": 0.01751781884501444 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.21818181818181817, + "acc_stderr": 0.03955932861795833, + "acc_norm": 0.21818181818181817, + "acc_norm_stderr": 0.03955932861795833 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.18775510204081633, + "acc_stderr": 0.02500025603954621, + "acc_norm": 0.18775510204081633, + "acc_norm_stderr": 0.02500025603954621 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.24378109452736318, + "acc_stderr": 0.03036049015401465, + "acc_norm": 0.24378109452736318, + "acc_norm_stderr": 0.03036049015401465 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.28, + "acc_stderr": 0.04512608598542128, + "acc_norm": 0.28, + "acc_norm_stderr": 0.04512608598542128 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.28313253012048195, + "acc_stderr": 0.03507295431370518, + "acc_norm": 0.28313253012048195, + "acc_norm_stderr": 0.03507295431370518 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.3216374269005848, + "acc_stderr": 0.03582529442573122, + "acc_norm": 0.3216374269005848, + "acc_norm_stderr": 0.03582529442573122 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.2484700122399021, + "mc1_stderr": 0.01512742709652069, + "mc2": 0.5135991249714797, + "mc2_stderr": 0.016297704289571835 + }, + "all": { + "acc": 0.2311659696550401, + "acc_stderr": 0.03070383137914198, + "acc_norm": 0.2317318580939559, + "acc_norm_stderr": 0.030716596912764838, + "mc1": 0.2484700122399021, + "mc1_stderr": 0.01512742709652069, + "mc2": 0.5135991249714797, + "mc2_stderr": 0.016297704289571835 + } + }, + "versions": { + "harness|arc:challenge|25": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + 
"harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "all": 0 + }, + "config": { + "model_name": "concedo/OPT-19M-ChatSalad", + "model_sha": "3930ca6bf3976e9b603815403cb373398ae509e5", + "model_dtype": "torch.float16", + "lighteval_sha": "43cff840721bd0214adb4e29236a5e2ca1813937", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null + }, + "task_config": { + "harness|arc:challenge": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + 
"harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task" + }, + "hashes": { + "harness|arc:challenge|25": { + "hash_examples": "fb8c51b1872daeda", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "2e52476df896898b", + "hash_cont_tokens": "28e2701291693338" + }, + "harness|hellaswag|10": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "a5079f2e8402bdc3", + "hash_cont_tokens": "30e348bce778fa10" + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "094c3a171105c12e", + "hash_cont_tokens": "65115fc130126941" + }, + "harness|hendrycksTest-anatomy|5": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "fe68bfcf91b9075e", + "hash_cont_tokens": "705516ff46ec26dc" + }, + "harness|hendrycksTest-astronomy|5": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "4d77ecaf04a26dfe", + "hash_cont_tokens": "881af7bd65854d45" + }, + "harness|hendrycksTest-business_ethics|5": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", 
+ "hash_input_tokens": "7353edcfcf72d221", + "hash_cont_tokens": "e760cc7be5ddbe71" + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "162bb9f7b3cd706e", + "hash_cont_tokens": "37477257cf9eeb0a" + }, + "harness|hendrycksTest-college_biology|5": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "63d442b13b5d85b6", + "hash_cont_tokens": "3f04694ac6f92548" + }, + "harness|hendrycksTest-college_chemistry|5": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "99db48cd6b077b68", + "hash_cont_tokens": "65115fc130126941" + }, + "harness|hendrycksTest-college_computer_science|5": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "4bc7d55623070a07", + "hash_cont_tokens": "15b2112308ef7b2b" + }, + "harness|hendrycksTest-college_mathematics|5": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "e83395ed75fa03d5", + "hash_cont_tokens": "a67ba9facbae0268" + }, + "harness|hendrycksTest-college_medicine|5": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "7f508f7828fe5ba6", + "hash_cont_tokens": "40630b2e3e33ca08" + }, + "harness|hendrycksTest-college_physics|5": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "0fb01b8731db8d81", + "hash_cont_tokens": "4085a0ba4a98cf79" + }, + "harness|hendrycksTest-computer_security|5": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "8c8460fe570b556e", + "hash_cont_tokens": "65115fc130126941" + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "16e0aa20b920aa11", + "hash_cont_tokens": "f15de85dda56bf9a" + }, + "harness|hendrycksTest-econometrics|5": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "bc236ab739e1c15b", + "hash_cont_tokens": "35b673589f562c55" + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "eec634c59e67082e", + "hash_cont_tokens": "1fec337497bf988f" + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "551d76303aaf3f4e", + "hash_cont_tokens": "85d6a2e58f1aa799" + }, + "harness|hendrycksTest-formal_logic|5": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "532728846623b114", + "hash_cont_tokens": "6a362d8f09b66319" + }, + "harness|hendrycksTest-global_facts|5": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "8aaecba1a0475c64", + "hash_cont_tokens": "65115fc130126941" + }, + "harness|hendrycksTest-high_school_biology|5": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "2afe2320ca29933a", + "hash_cont_tokens": "7186426999d40201" + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": 
"2ba3b67fb2446a06", + "hash_cont_tokens": "97e729fbed631d26" + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "10e55771dbb42b2c", + "hash_cont_tokens": "2d5af91609bd4d0d" + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "6d8596e5edbe236d", + "hash_cont_tokens": "2553c38072fe59e9" + }, + "harness|hendrycksTest-high_school_geography|5": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "3fb9fd43f1792a28", + "hash_cont_tokens": "967f1a6377c5dada" + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "51f21e325fe493bc", + "hash_cont_tokens": "5cbe4530fc364ed8" + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "78a8e9b40bc5418c", + "hash_cont_tokens": "3c15870aa9a751c8" + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "44525d3009ded4a4", + "hash_cont_tokens": "75f6aa84e7959e70" + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "76e98460e3320e1c", + "hash_cont_tokens": "7bfc49a85b0e6b0f" + }, + "harness|hendrycksTest-high_school_physics|5": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "f47dbaece0632444", + "hash_cont_tokens": "5ced294bf867b6fa" + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "d685add8792a69d2", + "hash_cont_tokens": "9ffbe637167399d6" + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "10fa751069aea803", + "hash_cont_tokens": "25c58237091f9ea7" + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "2b245a8312dd0ee8", + "hash_cont_tokens": "19500e048c94127a" + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "fa3b5b3bf631cd40", + "hash_cont_tokens": "0135bf601685a8b0" + }, + "harness|hendrycksTest-human_aging|5": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "a7cc14eb97a963c1", + "hash_cont_tokens": "350bc807db8602e4" + }, + "harness|hendrycksTest-human_sexuality|5": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "5a27a3a18e11300c", + "hash_cont_tokens": "944bf06e08c9e841" + }, + "harness|hendrycksTest-international_law|5": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "5355beafda861ea0", + "hash_cont_tokens": "a9ec061d9a865f49" + }, + "harness|hendrycksTest-jurisprudence|5": { + "hash_examples": "083b1e4904c48dc2", + 
"hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "85bf654d3221129b", + "hash_cont_tokens": "3813b356ad4675eb" + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "5f8c6e6a21145296", + "hash_cont_tokens": "4250ef4e0ecec581" + }, + "harness|hendrycksTest-machine_learning|5": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "1cf278ba4dac7b93", + "hash_cont_tokens": "c4fb7cc44b48985a" + }, + "harness|hendrycksTest-management|5": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "67df50e49cb50049", + "hash_cont_tokens": "f6301f26d3421bfe" + }, + "harness|hendrycksTest-marketing|5": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "e254e479a1dd95e6", + "hash_cont_tokens": "4bea1308c2dedd32" + }, + "harness|hendrycksTest-medical_genetics|5": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "836b977dd80307df", + "hash_cont_tokens": "65115fc130126941" + }, + "harness|hendrycksTest-miscellaneous|5": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "3d9d2c0b97a586f9", + "hash_cont_tokens": "d87f2c7e8fda82f9" + }, + "harness|hendrycksTest-moral_disputes|5": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "b354e905172e9a92", + "hash_cont_tokens": "098675117a7f6f77" + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "e0f5580d6e0bd639", + "hash_cont_tokens": "bd59c34597b05651" + }, + "harness|hendrycksTest-nutrition|5": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "e66c2273b0b50f8a", + "hash_cont_tokens": "03bcb0a0f9d4f331" + }, + "harness|hendrycksTest-philosophy|5": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "72c74dca625bae21", + "hash_cont_tokens": "4b9e620ce1055d4a" + }, + "harness|hendrycksTest-prehistory|5": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "139ea332c437abef", + "hash_cont_tokens": "3f04832c8adc4e0a" + }, + "harness|hendrycksTest-professional_accounting|5": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "9e4929005482ae10", + "hash_cont_tokens": "767ed1231cb8e258" + }, + "harness|hendrycksTest-professional_law|5": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "7105767805e28747", + "hash_cont_tokens": "f0b059007537e041" + }, + "harness|hendrycksTest-professional_medicine|5": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "f04f0a03ea895b5b", + "hash_cont_tokens": "3bc5fb58666e5e8b" + }, + "harness|hendrycksTest-professional_psychology|5": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "46fbbd942e3b6db5", + "hash_cont_tokens": "190e8f92d03650fe" + }, + "harness|hendrycksTest-public_relations|5": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": 
"4b9217ec408da4d4", + "hash_cont_tokens": "1bda889eaab363c0" + }, + "harness|hendrycksTest-security_studies|5": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "9eadb993a592c2bf", + "hash_cont_tokens": "859ddf07f8d0ab66" + }, + "harness|hendrycksTest-sociology|5": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "18f0e119974d9136", + "hash_cont_tokens": "7fdcb74bc758e7bd" + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "9a26a58deec29cba", + "hash_cont_tokens": "65115fc130126941" + }, + "harness|hendrycksTest-virology|5": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "4b0d85cf3b0bf65b", + "hash_cont_tokens": "456a90466d8efd2a" + }, + "harness|hendrycksTest-world_religions|5": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "b0e8f149dfd2fa76", + "hash_cont_tokens": "6d21235f853c8d4b" + }, + "harness|truthfulqa:mc|0": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "6e0e57e58e2d03ff", + "hash_cont_tokens": "a67a79a7e9449644" + } + } +} \ No newline at end of file diff --git a/eval-results/concedo/OPT-19M-ChatSalad/results_2023-09-22T13-42-25.445156.json b/eval-results/concedo/OPT-19M-ChatSalad/results_2023-09-22T13-42-25.445156.json new file mode 100644 index 0000000000000000000000000000000000000000..522d0276dad6b524c59c062a56d690f993ff4048 --- /dev/null +++ b/eval-results/concedo/OPT-19M-ChatSalad/results_2023-09-22T13-42-25.445156.json @@ -0,0 +1,107 @@ +{ + "config_general": { + "model_name": "concedo/OPT-19M-ChatSalad", + "model_sha": "3930ca6bf3976e9b603815403cb373398ae509e5", + "model_size": "36.82 MB", + "model_dtype": "torch.float16", + "lighteval_sha": "0f318ecf002208468154899217b3ba7c6ae09374", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|drop|3": { + "em": 0.0, + "em_stderr": 0.0, + "f1": 0.0024863674496644274, + "f1_stderr": 0.0002550496086684011 + }, + "harness|gsm8k|5": { + "acc": 0.0, + "acc_stderr": 0.0 + }, + "harness|winogrande|5": { + "acc": 0.4972375690607735, + "acc_stderr": 0.014052271211616441 + }, + "all": { + "em": 0.0, + "em_stderr": 0.0, + "f1": 0.0024863674496644274, + "f1_stderr": 0.0002550496086684011, + "acc": 0.24861878453038674, + "acc_stderr": 0.007026135605808221 + } + }, + "versions": { + "harness|drop|3": 1, + "harness|gsm8k|5": 0, + "harness|winogrande|5": 0, + "all": 0 + }, + "config_tasks": { + "harness|drop": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|drop|3": { + "hashes": { + "hash_examples": "1d27416e8324e9a3", + "hash_full_prompts": "a5513ff9a741b385", + "hash_input_tokens": "e74b23fd6ab24722", + "hash_cont_tokens": "7b70515926da5675" + }, + "truncated": 384, + "non-truncated": 9152, + "padded": 0, + "non-padded": 9536, + "effective_few_shots": 3.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "a2243014cab6a7a0", + "hash_cont_tokens": "3f4cd0e60d78ef85" + }, + "truncated": 0, + "non-truncated": 1319, + "padded": 0, + 
"non-padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "0a8020a0b9bd626c", + "hash_cont_tokens": "d75b4039559457e2" + }, + "truncated": 0, + "non-truncated": 2534, + "padded": 2426, + "non-padded": 108, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "9b4d8993161e637d", + "hash_full_prompts": "08215e527b7e60a5", + "hash_input_tokens": "409bf3c4619f5fc0", + "hash_cont_tokens": "9adab0480282ae37" + }, + "total_evaluation_time_secondes": "4397.518654346466", + "truncated": 384, + "non-truncated": 13005, + "padded": 2426, + "non-padded": 10963, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/concedo/Pythia-70M-ChatSalad/results_2023-07-19T13-36-47.045814.json b/eval-results/concedo/Pythia-70M-ChatSalad/results_2023-07-19T13-36-47.045814.json new file mode 100644 index 0000000000000000000000000000000000000000..60046263f961bbb045bb6f3f22c7065f34181a9d --- /dev/null +++ b/eval-results/concedo/Pythia-70M-ChatSalad/results_2023-07-19T13-36-47.045814.json @@ -0,0 +1,871 @@ +{ + "results": { + "harness|arc:challenge|25": { + "acc": 0.17747440273037543, + "acc_stderr": 0.011165138769643958, + "acc_norm": 0.2098976109215017, + "acc_norm_stderr": 0.011900548748047444 + }, + "harness|hellaswag|10": { + "acc": 0.2660824536944832, + "acc_stderr": 0.004410047530835031, + "acc_norm": 0.2727544313881697, + "acc_norm_stderr": 0.004444654076550554 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.2, + "acc_stderr": 0.040201512610368466, + "acc_norm": 0.2, + "acc_norm_stderr": 0.040201512610368466 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.3333333333333333, + "acc_stderr": 0.04072314811876837, + "acc_norm": 0.3333333333333333, + "acc_norm_stderr": 0.04072314811876837 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.21052631578947367, + "acc_stderr": 0.03317672787533157, + "acc_norm": 0.21052631578947367, + "acc_norm_stderr": 0.03317672787533157 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.21, + "acc_stderr": 0.04093601807403325, + "acc_norm": 0.21, + "acc_norm_stderr": 0.04093601807403325 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.21132075471698114, + "acc_stderr": 0.02512576648482784, + "acc_norm": 0.21132075471698114, + "acc_norm_stderr": 0.02512576648482784 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.1597222222222222, + "acc_stderr": 0.030635578972093278, + "acc_norm": 0.1597222222222222, + "acc_norm_stderr": 0.030635578972093278 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.26, + "acc_stderr": 0.04408440022768077, + "acc_norm": 0.26, + "acc_norm_stderr": 0.04408440022768077 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.27, + "acc_stderr": 0.0446196043338474, + "acc_norm": 0.27, + "acc_norm_stderr": 0.0446196043338474 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.25, + "acc_stderr": 0.04351941398892446, + "acc_norm": 0.25, + "acc_norm_stderr": 0.04351941398892446 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.2543352601156069, + "acc_stderr": 0.0332055644308557, + "acc_norm": 0.2543352601156069, + "acc_norm_stderr": 0.0332055644308557 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.27450980392156865, + "acc_stderr": 
0.04440521906179327, + "acc_norm": 0.27450980392156865, + "acc_norm_stderr": 0.04440521906179327 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.3, + "acc_stderr": 0.046056618647183814, + "acc_norm": 0.3, + "acc_norm_stderr": 0.046056618647183814 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.26382978723404255, + "acc_stderr": 0.02880998985410296, + "acc_norm": 0.26382978723404255, + "acc_norm_stderr": 0.02880998985410296 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.24561403508771928, + "acc_stderr": 0.04049339297748141, + "acc_norm": 0.24561403508771928, + "acc_norm_stderr": 0.04049339297748141 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.23448275862068965, + "acc_stderr": 0.035306258743465914, + "acc_norm": 0.23448275862068965, + "acc_norm_stderr": 0.035306258743465914 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.25925925925925924, + "acc_stderr": 0.022569897074918424, + "acc_norm": 0.25925925925925924, + "acc_norm_stderr": 0.022569897074918424 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.1984126984126984, + "acc_stderr": 0.03567016675276863, + "acc_norm": 0.1984126984126984, + "acc_norm_stderr": 0.03567016675276863 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.18, + "acc_stderr": 0.038612291966536934, + "acc_norm": 0.18, + "acc_norm_stderr": 0.038612291966536934 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.3161290322580645, + "acc_stderr": 0.026450874489042774, + "acc_norm": 0.3161290322580645, + "acc_norm_stderr": 0.026450874489042774 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.30049261083743845, + "acc_stderr": 0.03225799476233484, + "acc_norm": 0.30049261083743845, + "acc_norm_stderr": 0.03225799476233484 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.3, + "acc_stderr": 0.046056618647183814, + "acc_norm": 0.3, + "acc_norm_stderr": 0.046056618647183814 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.24848484848484848, + "acc_stderr": 0.03374402644139405, + "acc_norm": 0.24848484848484848, + "acc_norm_stderr": 0.03374402644139405 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.2777777777777778, + "acc_stderr": 0.03191178226713547, + "acc_norm": 0.2777777777777778, + "acc_norm_stderr": 0.03191178226713547 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.24352331606217617, + "acc_stderr": 0.030975436386845436, + "acc_norm": 0.24352331606217617, + "acc_norm_stderr": 0.030975436386845436 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.2076923076923077, + "acc_stderr": 0.020567539567246797, + "acc_norm": 0.2076923076923077, + "acc_norm_stderr": 0.020567539567246797 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.25925925925925924, + "acc_stderr": 0.026719240783712166, + "acc_norm": 0.25925925925925924, + "acc_norm_stderr": 0.026719240783712166 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.2184873949579832, + "acc_stderr": 0.026841514322958945, + "acc_norm": 0.2184873949579832, + "acc_norm_stderr": 0.026841514322958945 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.33774834437086093, + "acc_stderr": 0.038615575462551684, + "acc_norm": 0.33774834437086093, + "acc_norm_stderr": 0.038615575462551684 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.21467889908256882, + "acc_stderr": 
0.017604304149256487, + "acc_norm": 0.21467889908256882, + "acc_norm_stderr": 0.017604304149256487 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.2962962962962963, + "acc_stderr": 0.031141447823536023, + "acc_norm": 0.2962962962962963, + "acc_norm_stderr": 0.031141447823536023 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.27450980392156865, + "acc_stderr": 0.031321798030832904, + "acc_norm": 0.27450980392156865, + "acc_norm_stderr": 0.031321798030832904 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.25316455696202533, + "acc_stderr": 0.028304657943035303, + "acc_norm": 0.25316455696202533, + "acc_norm_stderr": 0.028304657943035303 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.2645739910313901, + "acc_stderr": 0.029605103217038315, + "acc_norm": 0.2645739910313901, + "acc_norm_stderr": 0.029605103217038315 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.1984732824427481, + "acc_stderr": 0.0349814938546247, + "acc_norm": 0.1984732824427481, + "acc_norm_stderr": 0.0349814938546247 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.36363636363636365, + "acc_stderr": 0.043913262867240704, + "acc_norm": 0.36363636363636365, + "acc_norm_stderr": 0.043913262867240704 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.23148148148148148, + "acc_stderr": 0.04077494709252628, + "acc_norm": 0.23148148148148148, + "acc_norm_stderr": 0.04077494709252628 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.31901840490797545, + "acc_stderr": 0.03661997551073836, + "acc_norm": 0.31901840490797545, + "acc_norm_stderr": 0.03661997551073836 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.2857142857142857, + "acc_stderr": 0.042878587513404544, + "acc_norm": 0.2857142857142857, + "acc_norm_stderr": 0.042878587513404544 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.17475728155339806, + "acc_stderr": 0.0376017800602662, + "acc_norm": 0.17475728155339806, + "acc_norm_stderr": 0.0376017800602662 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.19230769230769232, + "acc_stderr": 0.025819233256483706, + "acc_norm": 0.19230769230769232, + "acc_norm_stderr": 0.025819233256483706 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.31, + "acc_stderr": 0.04648231987117316, + "acc_norm": 0.31, + "acc_norm_stderr": 0.04648231987117316 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.26309067688378035, + "acc_stderr": 0.015745497169049053, + "acc_norm": 0.26309067688378035, + "acc_norm_stderr": 0.015745497169049053 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.24855491329479767, + "acc_stderr": 0.023267528432100174, + "acc_norm": 0.24855491329479767, + "acc_norm_stderr": 0.023267528432100174 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.22905027932960895, + "acc_stderr": 0.014054314935614579, + "acc_norm": 0.22905027932960895, + "acc_norm_stderr": 0.014054314935614579 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.24183006535947713, + "acc_stderr": 0.024518195641879334, + "acc_norm": 0.24183006535947713, + "acc_norm_stderr": 0.024518195641879334 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.2765273311897106, + "acc_stderr": 0.02540383297817962, + "acc_norm": 0.2765273311897106, + "acc_norm_stderr": 0.02540383297817962 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.26851851851851855, + "acc_stderr": 0.024659685185967284, + "acc_norm": 0.26851851851851855, + 
"acc_norm_stderr": 0.024659685185967284 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.23049645390070922, + "acc_stderr": 0.025123739226872395, + "acc_norm": 0.23049645390070922, + "acc_norm_stderr": 0.025123739226872395 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.24511082138200782, + "acc_stderr": 0.010986307870045517, + "acc_norm": 0.24511082138200782, + "acc_norm_stderr": 0.010986307870045517 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.1875, + "acc_stderr": 0.023709788253811766, + "acc_norm": 0.1875, + "acc_norm_stderr": 0.023709788253811766 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.25, + "acc_stderr": 0.01751781884501444, + "acc_norm": 0.25, + "acc_norm_stderr": 0.01751781884501444 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.19090909090909092, + "acc_stderr": 0.03764425585984924, + "acc_norm": 0.19090909090909092, + "acc_norm_stderr": 0.03764425585984924 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.24489795918367346, + "acc_stderr": 0.027529637440174934, + "acc_norm": 0.24489795918367346, + "acc_norm_stderr": 0.027529637440174934 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.24875621890547264, + "acc_stderr": 0.030567675938916707, + "acc_norm": 0.24875621890547264, + "acc_norm_stderr": 0.030567675938916707 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.23, + "acc_stderr": 0.04229525846816506, + "acc_norm": 0.23, + "acc_norm_stderr": 0.04229525846816506 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.20481927710843373, + "acc_stderr": 0.03141784291663926, + "acc_norm": 0.20481927710843373, + "acc_norm_stderr": 0.03141784291663926 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.16374269005847952, + "acc_stderr": 0.028380919596145866, + "acc_norm": 0.16374269005847952, + "acc_norm_stderr": 0.028380919596145866 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.2778457772337821, + "mc1_stderr": 0.015680929364024643, + "mc2": 0.49740714965139177, + "mc2_stderr": 0.01631085689464895 + }, + "all": { + "acc": 0.24696465444408017, + "acc_stderr": 0.031317602874143216, + "acc_norm": 0.24762728471331427, + "acc_norm_stderr": 0.031330654001670656, + "mc1": 0.2778457772337821, + "mc1_stderr": 0.015680929364024643, + "mc2": 0.49740714965139177, + "mc2_stderr": 0.01631085689464895 + } + }, + "versions": { + "harness|arc:challenge|25": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + 
"harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "all": 0 + }, + "config": { + "model_name": "concedo/Pythia-70M-ChatSalad", + "model_sha": "692289413c47c219cf83b1596783a8e9223541eb", + "model_dtype": "torch.float16", + "lighteval_sha": "43cff840721bd0214adb4e29236a5e2ca1813937", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null + }, + "task_config": { + "harness|arc:challenge": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM 
Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task" + }, + "hashes": { + "harness|arc:challenge|25": { + "hash_examples": "fb8c51b1872daeda", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "573b1b078b6e9deb", + "hash_cont_tokens": "22424bcffb42ecdf" + }, + "harness|hellaswag|10": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "f0fd0caf4d4c1110", + "hash_cont_tokens": "62a15ef112ea07d6" + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "f076ac6b177ca28c", + "hash_cont_tokens": "74c639e56bb475af" + }, + "harness|hendrycksTest-anatomy|5": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "059827606e6b0780", + "hash_cont_tokens": "ec7e2288ab5f1ce9" + }, + "harness|hendrycksTest-astronomy|5": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "1dd0dab88aa9e4b2", + "hash_cont_tokens": "d7e922da5bc6d1bf" + }, + 
"harness|hendrycksTest-business_ethics|5": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "d51eb5246cbe2173", + "hash_cont_tokens": "08933598b321179c" + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "2337a7f17800c6ec", + "hash_cont_tokens": "bc82b3cc5072f164" + }, + "harness|hendrycksTest-college_biology|5": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "e394ebbb8ceace76", + "hash_cont_tokens": "3bc45e0c4b6d612d" + }, + "harness|hendrycksTest-college_chemistry|5": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "9221fbdf710a6f67", + "hash_cont_tokens": "74c639e56bb475af" + }, + "harness|hendrycksTest-college_computer_science|5": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "ebe2748d21b2ba41", + "hash_cont_tokens": "d839b8186e0f3d94" + }, + "harness|hendrycksTest-college_mathematics|5": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "bfecefb08ffb7faa", + "hash_cont_tokens": "3c16f9c45a7a7272" + }, + "harness|hendrycksTest-college_medicine|5": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "2ac8aec9025dc58b", + "hash_cont_tokens": "16f654508cdc19c4" + }, + "harness|hendrycksTest-college_physics|5": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "faf44c77f43368ef", + "hash_cont_tokens": "a3a24586c7218684" + }, + "harness|hendrycksTest-computer_security|5": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "280c7f12abde10a5", + "hash_cont_tokens": "74c639e56bb475af" + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "217a841c86d2d992", + "hash_cont_tokens": "43818b3dc0c7496f" + }, + "harness|hendrycksTest-econometrics|5": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "354267c0f98aad3b", + "hash_cont_tokens": "4f0a3e41169314a8" + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "4f5e8d051d04dde0", + "hash_cont_tokens": "7e14ccd1e2688bb8" + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "cd12bec1d5448dda", + "hash_cont_tokens": "317e29ee6bba387d" + }, + "harness|hendrycksTest-formal_logic|5": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "c549e395850984fe", + "hash_cont_tokens": "c01a9b75f55e32e0" + }, + "harness|hendrycksTest-global_facts|5": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "81b06f5caa221f97", + "hash_cont_tokens": "74c639e56bb475af" + }, + "harness|hendrycksTest-high_school_biology|5": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "ad626d781102fe51", + "hash_cont_tokens": "edb2063e955bd5ca" + }, + 
"harness|hendrycksTest-high_school_chemistry|5": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "2c0d3f2eacc6bbd5", + "hash_cont_tokens": "8000de09bc1dc113" + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "aada51d0571db37b", + "hash_cont_tokens": "dcd6a0ada4ab8e0b" + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "6e47d696116edd01", + "hash_cont_tokens": "47a5e5973f50fe17" + }, + "harness|hendrycksTest-high_school_geography|5": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "0e8ee6c9e572e3c4", + "hash_cont_tokens": "812f79117b9593de" + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "8fa2bf90de3b07e7", + "hash_cont_tokens": "b4c405890ebd3ee1" + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "fabb8f176276af2f", + "hash_cont_tokens": "8d468d84a686647d" + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "3e86d13ef021476a", + "hash_cont_tokens": "e5d02f8f1c5dcf31" + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "a132b5e9c9531b36", + "hash_cont_tokens": "4c32e38c066727bc" + }, + "harness|hendrycksTest-high_school_physics|5": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "f8f6fe5143776cb4", + "hash_cont_tokens": "9416ad85fd6f4a2c" + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "e28121967b27a315", + "hash_cont_tokens": "57cc212706ddcdf4" + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "bdbe90efb4a1c4ce", + "hash_cont_tokens": "8c5c954092a64343" + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "b8f58f05dc082011", + "hash_cont_tokens": "e5ab34a54e3f5b7c" + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "3af911bf93093a85", + "hash_cont_tokens": "f3276c80ce1b205b" + }, + "harness|hendrycksTest-human_aging|5": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "1dd2240eb90b9a70", + "hash_cont_tokens": "7982edf99219e1b0" + }, + "harness|hendrycksTest-human_sexuality|5": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "f3de2f8181824a79", + "hash_cont_tokens": "ed73d516c5552dd0" + }, + "harness|hendrycksTest-international_law|5": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": 
"0c2a1dd63cc74137", + "hash_cont_tokens": "549d9b32b8a90e4e" + }, + "harness|hendrycksTest-jurisprudence|5": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "08e3527985f33aab", + "hash_cont_tokens": "ddf5241e450210d6" + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "bf7216a648529f68", + "hash_cont_tokens": "eb791fcbee9e0682" + }, + "harness|hendrycksTest-machine_learning|5": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "28f5891c956afd65", + "hash_cont_tokens": "c66b1f3b46001b09" + }, + "harness|hendrycksTest-management|5": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "6de88b824d4f64c3", + "hash_cont_tokens": "27795e9c98bdeda8" + }, + "harness|hendrycksTest-marketing|5": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "5ef855d01044fd83", + "hash_cont_tokens": "874c5b0b496cbe8a" + }, + "harness|hendrycksTest-medical_genetics|5": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "1840e0b96d7e619e", + "hash_cont_tokens": "74c639e56bb475af" + }, + "harness|hendrycksTest-miscellaneous|5": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "02483f6b53dc13ac", + "hash_cont_tokens": "313ee361fbdbab3c" + }, + "harness|hendrycksTest-moral_disputes|5": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "93202e79d594dde4", + "hash_cont_tokens": "fe7747dc69c4909e" + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "41c03f41d2ba9fe7", + "hash_cont_tokens": "e0d0ad58a3f1ff22" + }, + "harness|hendrycksTest-nutrition|5": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "d83bcb6dd08809ac", + "hash_cont_tokens": "c55a10a018de0228" + }, + "harness|hendrycksTest-philosophy|5": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "65c70474c8a5d205", + "hash_cont_tokens": "7916d26928435f1a" + }, + "harness|hendrycksTest-prehistory|5": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "4d4126ac9a91ac47", + "hash_cont_tokens": "81836c52a10e6ffd" + }, + "harness|hendrycksTest-professional_accounting|5": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "592f80ad364d686a", + "hash_cont_tokens": "f5d669014a273483" + }, + "harness|hendrycksTest-professional_law|5": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "7f837322b1b62ac1", + "hash_cont_tokens": "6b31cf265df9b81b" + }, + "harness|hendrycksTest-professional_medicine|5": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "05a8ef0dd10b4bba", + "hash_cont_tokens": "4b3ac60441ad14ec" + }, + "harness|hendrycksTest-professional_psychology|5": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "3c7944f0b2c49f64", + "hash_cont_tokens": "f139af481f2a9e74" + }, + 
"harness|hendrycksTest-public_relations|5": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "637e934bb716d5ec", + "hash_cont_tokens": "ca79966b90cda0ea" + }, + "harness|hendrycksTest-security_studies|5": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "3bad229573ed6a9c", + "hash_cont_tokens": "952a2e479fc3a83e" + }, + "harness|hendrycksTest-sociology|5": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "70a479e96d02d5d8", + "hash_cont_tokens": "f49476cf49b37d7c" + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "0d690fc0db462440", + "hash_cont_tokens": "74c639e56bb475af" + }, + "harness|hendrycksTest-virology|5": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "4b0fdf8e692dd640", + "hash_cont_tokens": "0065c4bbe6134c1c" + }, + "harness|hendrycksTest-world_religions|5": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "cfd7092dc8aacd96", + "hash_cont_tokens": "9a178e9ec050bf3e" + }, + "harness|truthfulqa:mc|0": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "e820abadeb7ebfb3", + "hash_cont_tokens": "7f48ddfffa64eb41" + } + } +} \ No newline at end of file diff --git a/eval-results/concedo/Pythia-70M-ChatSalad/results_2023-09-22T19-59-13.355253.json b/eval-results/concedo/Pythia-70M-ChatSalad/results_2023-09-22T19-59-13.355253.json new file mode 100644 index 0000000000000000000000000000000000000000..c4b244a852a349b9c8f75f65eae3e626151291d3 --- /dev/null +++ b/eval-results/concedo/Pythia-70M-ChatSalad/results_2023-09-22T19-59-13.355253.json @@ -0,0 +1,107 @@ +{ + "config_general": { + "model_name": "concedo/Pythia-70M-ChatSalad", + "model_sha": "692289413c47c219cf83b1596783a8e9223541eb", + "model_size": "137.28 MB", + "model_dtype": "torch.float16", + "lighteval_sha": "0f318ecf002208468154899217b3ba7c6ae09374", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|drop|3": { + "em": 0.0014681208053691276, + "em_stderr": 0.00039210421902982634, + "f1": 0.008363045302013424, + "f1_stderr": 0.0006175853648384896 + }, + "harness|gsm8k|5": { + "acc": 0.0, + "acc_stderr": 0.0 + }, + "harness|winogrande|5": { + "acc": 0.5240726124704025, + "acc_stderr": 0.014036189665395129 + }, + "all": { + "em": 0.0014681208053691276, + "em_stderr": 0.00039210421902982634, + "f1": 0.008363045302013424, + "f1_stderr": 0.0006175853648384896, + "acc": 0.26203630623520124, + "acc_stderr": 0.0070180948326975644 + } + }, + "versions": { + "harness|drop|3": 1, + "harness|gsm8k|5": 0, + "harness|winogrande|5": 0, + "all": 0 + }, + "config_tasks": { + "harness|drop": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|drop|3": { + "hashes": { + "hash_examples": "1d27416e8324e9a3", + "hash_full_prompts": "a5513ff9a741b385", + "hash_input_tokens": "4bf3f6ba1bae765a", + "hash_cont_tokens": "f7cd3c3dfa17a644" + }, + "truncated": 439, + "non-truncated": 9097, + "padded": 0, + "non-padded": 9536, + "effective_few_shots": 3.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { 
+ "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "ef516f9ffbe76423", + "hash_cont_tokens": "cc855739daf153e5" + }, + "truncated": 0, + "non-truncated": 1319, + "padded": 0, + "non-padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "c469718508f43cab", + "hash_cont_tokens": "87eeb79172195781" + }, + "truncated": 0, + "non-truncated": 2534, + "padded": 2456, + "non-padded": 78, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "9b4d8993161e637d", + "hash_full_prompts": "08215e527b7e60a5", + "hash_input_tokens": "401c6c49053f17ab", + "hash_cont_tokens": "fc1799fed758533c" + }, + "total_evaluation_time_secondes": "1975.3353481292725", + "truncated": 439, + "non-truncated": 12950, + "padded": 2456, + "non-padded": 10933, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/concedo/Vicuzard-30B-Uncensored/results_2023-07-19T22-20-40.681862.json b/eval-results/concedo/Vicuzard-30B-Uncensored/results_2023-07-19T22-20-40.681862.json new file mode 100644 index 0000000000000000000000000000000000000000..91f7835347e8322bdf5dabab1528661d5ed32d19 --- /dev/null +++ b/eval-results/concedo/Vicuzard-30B-Uncensored/results_2023-07-19T22-20-40.681862.json @@ -0,0 +1,871 @@ +{ + "results": { + "harness|arc:challenge|25": { + "acc": 0.5998293515358362, + "acc_stderr": 0.01431719778780917, + "acc_norm": 0.6296928327645052, + "acc_norm_stderr": 0.01411129875167495 + }, + "harness|hellaswag|10": { + "acc": 0.6445927106154152, + "acc_stderr": 0.004776583530909573, + "acc_norm": 0.8367855008962358, + "acc_norm_stderr": 0.0036880598312390212 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.33, + "acc_stderr": 0.04725815626252606, + "acc_norm": 0.33, + "acc_norm_stderr": 0.04725815626252606 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.4888888888888889, + "acc_stderr": 0.04318275491977976, + "acc_norm": 0.4888888888888889, + "acc_norm_stderr": 0.04318275491977976 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.5723684210526315, + "acc_stderr": 0.040260970832965634, + "acc_norm": 0.5723684210526315, + "acc_norm_stderr": 0.040260970832965634 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.6, + "acc_stderr": 0.049236596391733084, + "acc_norm": 0.6, + "acc_norm_stderr": 0.049236596391733084 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.630188679245283, + "acc_stderr": 0.029711421880107933, + "acc_norm": 0.630188679245283, + "acc_norm_stderr": 0.029711421880107933 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.6041666666666666, + "acc_stderr": 0.04089465449325582, + "acc_norm": 0.6041666666666666, + "acc_norm_stderr": 0.04089465449325582 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.4, + "acc_stderr": 0.04923659639173309, + "acc_norm": 0.4, + "acc_norm_stderr": 0.04923659639173309 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.51, + "acc_stderr": 0.05024183937956912, + "acc_norm": 0.51, + "acc_norm_stderr": 0.05024183937956912 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.37, + "acc_stderr": 0.04852365870939099, + "acc_norm": 0.37, + "acc_norm_stderr": 0.04852365870939099 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 
0.5317919075144508, + "acc_stderr": 0.038047497443647646, + "acc_norm": 0.5317919075144508, + "acc_norm_stderr": 0.038047497443647646 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.29411764705882354, + "acc_stderr": 0.04533838195929776, + "acc_norm": 0.29411764705882354, + "acc_norm_stderr": 0.04533838195929776 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.67, + "acc_stderr": 0.04725815626252607, + "acc_norm": 0.67, + "acc_norm_stderr": 0.04725815626252607 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.502127659574468, + "acc_stderr": 0.03268572658667492, + "acc_norm": 0.502127659574468, + "acc_norm_stderr": 0.03268572658667492 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.3333333333333333, + "acc_stderr": 0.044346007015849245, + "acc_norm": 0.3333333333333333, + "acc_norm_stderr": 0.044346007015849245 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.5103448275862069, + "acc_stderr": 0.041657747757287644, + "acc_norm": 0.5103448275862069, + "acc_norm_stderr": 0.041657747757287644 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.35185185185185186, + "acc_stderr": 0.024594975128920938, + "acc_norm": 0.35185185185185186, + "acc_norm_stderr": 0.024594975128920938 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.3333333333333333, + "acc_stderr": 0.04216370213557835, + "acc_norm": 0.3333333333333333, + "acc_norm_stderr": 0.04216370213557835 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.4, + "acc_stderr": 0.049236596391733084, + "acc_norm": 0.4, + "acc_norm_stderr": 0.049236596391733084 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.6580645161290323, + "acc_stderr": 0.02698528957655274, + "acc_norm": 0.6580645161290323, + "acc_norm_stderr": 0.02698528957655274 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.3694581280788177, + "acc_stderr": 0.03395970381998574, + "acc_norm": 0.3694581280788177, + "acc_norm_stderr": 0.03395970381998574 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.59, + "acc_stderr": 0.04943110704237102, + "acc_norm": 0.59, + "acc_norm_stderr": 0.04943110704237102 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.7090909090909091, + "acc_stderr": 0.03546563019624336, + "acc_norm": 0.7090909090909091, + "acc_norm_stderr": 0.03546563019624336 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.7373737373737373, + "acc_stderr": 0.03135305009533085, + "acc_norm": 0.7373737373737373, + "acc_norm_stderr": 0.03135305009533085 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.8134715025906736, + "acc_stderr": 0.02811209121011746, + "acc_norm": 0.8134715025906736, + "acc_norm_stderr": 0.02811209121011746 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.5538461538461539, + "acc_stderr": 0.02520357177302833, + "acc_norm": 0.5538461538461539, + "acc_norm_stderr": 0.02520357177302833 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.2740740740740741, + "acc_stderr": 0.027195934804085622, + "acc_norm": 0.2740740740740741, + "acc_norm_stderr": 0.027195934804085622 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.6176470588235294, + "acc_stderr": 0.03156663099215416, + "acc_norm": 0.6176470588235294, + "acc_norm_stderr": 0.03156663099215416 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.3443708609271523, + "acc_stderr": 
0.038796870240733264, + "acc_norm": 0.3443708609271523, + "acc_norm_stderr": 0.038796870240733264 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.7761467889908257, + "acc_stderr": 0.017871217767790236, + "acc_norm": 0.7761467889908257, + "acc_norm_stderr": 0.017871217767790236 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.42592592592592593, + "acc_stderr": 0.03372343271653064, + "acc_norm": 0.42592592592592593, + "acc_norm_stderr": 0.03372343271653064 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.7794117647058824, + "acc_stderr": 0.029102254389674082, + "acc_norm": 0.7794117647058824, + "acc_norm_stderr": 0.029102254389674082 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.8016877637130801, + "acc_stderr": 0.025955020841621115, + "acc_norm": 0.8016877637130801, + "acc_norm_stderr": 0.025955020841621115 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.6905829596412556, + "acc_stderr": 0.031024411740572203, + "acc_norm": 0.6905829596412556, + "acc_norm_stderr": 0.031024411740572203 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.6183206106870229, + "acc_stderr": 0.042607351576445594, + "acc_norm": 0.6183206106870229, + "acc_norm_stderr": 0.042607351576445594 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.7520661157024794, + "acc_stderr": 0.03941897526516303, + "acc_norm": 0.7520661157024794, + "acc_norm_stderr": 0.03941897526516303 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.7222222222222222, + "acc_stderr": 0.043300437496507416, + "acc_norm": 0.7222222222222222, + "acc_norm_stderr": 0.043300437496507416 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.7177914110429447, + "acc_stderr": 0.03536117886664743, + "acc_norm": 0.7177914110429447, + "acc_norm_stderr": 0.03536117886664743 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.4732142857142857, + "acc_stderr": 0.047389751192741546, + "acc_norm": 0.4732142857142857, + "acc_norm_stderr": 0.047389751192741546 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.7864077669902912, + "acc_stderr": 0.040580420156460344, + "acc_norm": 0.7864077669902912, + "acc_norm_stderr": 0.040580420156460344 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.8589743589743589, + "acc_stderr": 0.022801382534597542, + "acc_norm": 0.8589743589743589, + "acc_norm_stderr": 0.022801382534597542 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.64, + "acc_stderr": 0.04824181513244218, + "acc_norm": 0.64, + "acc_norm_stderr": 0.04824181513244218 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.7637292464878672, + "acc_stderr": 0.015190473717037495, + "acc_norm": 0.7637292464878672, + "acc_norm_stderr": 0.015190473717037495 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.6473988439306358, + "acc_stderr": 0.025722802200895817, + "acc_norm": 0.6473988439306358, + "acc_norm_stderr": 0.025722802200895817 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.39888268156424583, + "acc_stderr": 0.016376966142610073, + "acc_norm": 0.39888268156424583, + "acc_norm_stderr": 0.016376966142610073 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.6176470588235294, + "acc_stderr": 0.02782610930728369, + "acc_norm": 0.6176470588235294, + "acc_norm_stderr": 0.02782610930728369 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.6881028938906752, + "acc_stderr": 0.02631185807185416, + "acc_norm": 0.6881028938906752, + 
"acc_norm_stderr": 0.02631185807185416 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.6419753086419753, + "acc_stderr": 0.0266756119260371, + "acc_norm": 0.6419753086419753, + "acc_norm_stderr": 0.0266756119260371 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.450354609929078, + "acc_stderr": 0.02968010556502904, + "acc_norm": 0.450354609929078, + "acc_norm_stderr": 0.02968010556502904 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.4595827900912647, + "acc_stderr": 0.012728446067669983, + "acc_norm": 0.4595827900912647, + "acc_norm_stderr": 0.012728446067669983 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.5735294117647058, + "acc_stderr": 0.030042615832714867, + "acc_norm": 0.5735294117647058, + "acc_norm_stderr": 0.030042615832714867 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.6029411764705882, + "acc_stderr": 0.019794488900024106, + "acc_norm": 0.6029411764705882, + "acc_norm_stderr": 0.019794488900024106 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.6454545454545455, + "acc_stderr": 0.045820048415054174, + "acc_norm": 0.6454545454545455, + "acc_norm_stderr": 0.045820048415054174 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.636734693877551, + "acc_stderr": 0.030789051139030806, + "acc_norm": 0.636734693877551, + "acc_norm_stderr": 0.030789051139030806 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.7810945273631841, + "acc_stderr": 0.029239174636647, + "acc_norm": 0.7810945273631841, + "acc_norm_stderr": 0.029239174636647 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.85, + "acc_stderr": 0.03588702812826371, + "acc_norm": 0.85, + "acc_norm_stderr": 0.03588702812826371 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.4819277108433735, + "acc_stderr": 0.038899512528272166, + "acc_norm": 0.4819277108433735, + "acc_norm_stderr": 0.038899512528272166 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.7719298245614035, + "acc_stderr": 0.032180937956023566, + "acc_norm": 0.7719298245614035, + "acc_norm_stderr": 0.032180937956023566 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.3623011015911873, + "mc1_stderr": 0.016826646897262258, + "mc2": 0.5226698924852939, + "mc2_stderr": 0.015695340781638704 + }, + "all": { + "acc": 0.5830232121558723, + "acc_stderr": 0.034094609851246425, + "acc_norm": 0.5867868777746771, + "acc_norm_stderr": 0.03407267048284295, + "mc1": 0.3623011015911873, + "mc1_stderr": 0.016826646897262258, + "mc2": 0.5226698924852939, + "mc2_stderr": 0.015695340781638704 + } + }, + "versions": { + "harness|arc:challenge|25": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + 
"harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "all": 0 + }, + "config": { + "model_name": "concedo/Vicuzard-30B-Uncensored", + "model_sha": "e2329c05a6e59660ba3cbcc01adf30a78f852594", + "model_dtype": "torch.float16", + "lighteval_sha": "43cff840721bd0214adb4e29236a5e2ca1813937", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null + }, + "task_config": { + "harness|arc:challenge": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + 
"harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task" + }, + "hashes": { + "harness|arc:challenge|25": { + "hash_examples": "fb8c51b1872daeda", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "61571bf68d6d89aa", + "hash_cont_tokens": "8210decc6ff6f7df" + }, + "harness|hellaswag|10": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "29906669b1c7054a", + "hash_cont_tokens": "b3b9e9017afa63af" + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "c54ff61ad0273dd7", + "hash_cont_tokens": "50421e30bef398f9" + }, + "harness|hendrycksTest-anatomy|5": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "be31a1e22aef5f90", + "hash_cont_tokens": "f11971a765cb609f" + }, + 
"harness|hendrycksTest-astronomy|5": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "277a7b1fad566940", + "hash_cont_tokens": "bf30e5d3f48250cb" + }, + "harness|hendrycksTest-business_ethics|5": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "ba552605bc116de5", + "hash_cont_tokens": "bc1dd9b2d995eb61" + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "428c7563d0b98ab9", + "hash_cont_tokens": "890a119624b3b935" + }, + "harness|hendrycksTest-college_biology|5": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "da036601573942e2", + "hash_cont_tokens": "875cde3af7a0ee14" + }, + "harness|hendrycksTest-college_chemistry|5": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "94e0196d6aded13d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "harness|hendrycksTest-college_computer_science|5": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "6e4d0f4a8d36690b", + "hash_cont_tokens": "ffc0fe414cdc4a83" + }, + "harness|hendrycksTest-college_mathematics|5": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "614054d17109a25d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "harness|hendrycksTest-college_medicine|5": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "1d633b3cc0524ba8", + "hash_cont_tokens": "1f88b00d41957d82" + }, + "harness|hendrycksTest-college_physics|5": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "5421d9a1af86cbd4", + "hash_cont_tokens": "f7b8097afc16a47c" + }, + "harness|hendrycksTest-computer_security|5": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "5e6b70ecb333cf18", + "hash_cont_tokens": "50421e30bef398f9" + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "c2ef11a87264ceed", + "hash_cont_tokens": "aa0e8bc655f2f641" + }, + "harness|hendrycksTest-econometrics|5": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "ecaccd912a4c3978", + "hash_cont_tokens": "bfb7e3c3c88313f1" + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "1590c84291399be8", + "hash_cont_tokens": "2425a3f084a591ef" + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "3269597f715b0da1", + "hash_cont_tokens": "f52691aef15a407b" + }, + "harness|hendrycksTest-formal_logic|5": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "a2800d20f3ab8d7c", + "hash_cont_tokens": "f515d598d9c21263" + }, + "harness|hendrycksTest-global_facts|5": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "94ed44b3772505ad", + "hash_cont_tokens": "50421e30bef398f9" + }, + 
"harness|hendrycksTest-high_school_biology|5": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "24423acb928db768", + "hash_cont_tokens": "bd85a4156a3613ee" + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "831ff35c474e5cef", + "hash_cont_tokens": "a95c97af1c14e068" + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "8c34e0f2bda77358", + "hash_cont_tokens": "8abfedef914e33c9" + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "f1f73dd687da18d7", + "hash_cont_tokens": "674fc454bdc5ac93" + }, + "harness|hendrycksTest-high_school_geography|5": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "7c5547c7da5bc793", + "hash_cont_tokens": "03a5012b916274ea" + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "f62991cb6a496b05", + "hash_cont_tokens": "a83effb8f76b7d7c" + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "4cef2aff6e3d59ed", + "hash_cont_tokens": "c583432ad27fcfe0" + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "6e2577ea4082ed2b", + "hash_cont_tokens": "24f5dc613660300b" + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "c5fc9aeb1079c8e4", + "hash_cont_tokens": "f47f041de50333b9" + }, + "harness|hendrycksTest-high_school_physics|5": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "555fc385cffa84ca", + "hash_cont_tokens": "ba2efcd283e938cc" + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "febd23cbf9973b7f", + "hash_cont_tokens": "942069cd363844d9" + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "424b02981230ee83", + "hash_cont_tokens": "955ed42b6f7fa019" + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "50c9ff438c85a69e", + "hash_cont_tokens": "cdd0b3dc06d933e5" + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "054824cc474caef5", + "hash_cont_tokens": "9a864184946033ac" + }, + "harness|hendrycksTest-human_aging|5": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "541a75f071dcf579", + "hash_cont_tokens": "142a4a8a1138a214" + }, + "harness|hendrycksTest-human_sexuality|5": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": 
"04269e5c5a257dd9", + "hash_cont_tokens": "bc54813e809b796d" + }, + "harness|hendrycksTest-international_law|5": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "d93ba9d9d38e4397", + "hash_cont_tokens": "dc45b45fcda18e5d" + }, + "harness|hendrycksTest-jurisprudence|5": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "9eeaccd2698b4f5a", + "hash_cont_tokens": "e3a8cd951b6e3469" + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "b4f08f544f2b7576", + "hash_cont_tokens": "1e80dbd30f6453d5" + }, + "harness|hendrycksTest-machine_learning|5": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "900c2a51f1174b9f", + "hash_cont_tokens": "9b37da7777378ca9" + }, + "harness|hendrycksTest-management|5": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "6b36efb4689c6eca", + "hash_cont_tokens": "a01d6d39a83c4597" + }, + "harness|hendrycksTest-marketing|5": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "2aaac78a0cfed47a", + "hash_cont_tokens": "6aeaed4d823c98aa" + }, + "harness|hendrycksTest-medical_genetics|5": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "886ca823b41c094a", + "hash_cont_tokens": "50421e30bef398f9" + }, + "harness|hendrycksTest-miscellaneous|5": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "72fd71de7675e7d0", + "hash_cont_tokens": "9b0ab02a64603081" + }, + "harness|hendrycksTest-moral_disputes|5": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "f3ca0dd8e7a1eb09", + "hash_cont_tokens": "8badf768f7b0467a" + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "3e793631e951f23c", + "hash_cont_tokens": "32ae620376b2bbba" + }, + "harness|hendrycksTest-nutrition|5": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "59753c2144ea93af", + "hash_cont_tokens": "3071def75bacc404" + }, + "harness|hendrycksTest-philosophy|5": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "bd8d3dbed15a8c34", + "hash_cont_tokens": "9f6ff69d23a48783" + }, + "harness|hendrycksTest-prehistory|5": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "3573cd87facbb7c5", + "hash_cont_tokens": "de469d2b981e32a3" + }, + "harness|hendrycksTest-professional_accounting|5": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "17e721bc1a7cbb47", + "hash_cont_tokens": "c46f74d2dfc7b13b" + }, + "harness|hendrycksTest-professional_law|5": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "9178e10bd0763ec4", + "hash_cont_tokens": "2e590029ef41fbcd" + }, + "harness|hendrycksTest-professional_medicine|5": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "f5a22012a54f70ea", + "hash_cont_tokens": "fe35cfa9c6ca802e" + }, + 
"harness|hendrycksTest-professional_psychology|5": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "0dfb73a8eb3f692c", + "hash_cont_tokens": "f020fbddf72c8652" + }, + "harness|hendrycksTest-public_relations|5": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "1710c6ba4c9f3cbd", + "hash_cont_tokens": "568f585a259965c1" + }, + "harness|hendrycksTest-security_studies|5": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "d49711415961ced7", + "hash_cont_tokens": "cc6fd7cccd64cd5d" + }, + "harness|hendrycksTest-sociology|5": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "828999f7624cbe7e", + "hash_cont_tokens": "c3a3bdfd177eed5b" + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "42054621e718dbee", + "hash_cont_tokens": "2568d0e8e36fa959" + }, + "harness|hendrycksTest-virology|5": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "6c4f0aa4dc859c04", + "hash_cont_tokens": "926cf60b0891f374" + }, + "harness|hendrycksTest-world_religions|5": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "6c75d44e092ff24f", + "hash_cont_tokens": "c525a5de974c1ea3" + }, + "harness|truthfulqa:mc|0": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "2738d7ed7075faa7", + "hash_cont_tokens": "c014154380b74b9e" + } + } +} \ No newline at end of file diff --git a/eval-results/concedo/Vicuzard-30B-Uncensored/results_2023-09-23T02-47-37.236097.json b/eval-results/concedo/Vicuzard-30B-Uncensored/results_2023-09-23T02-47-37.236097.json new file mode 100644 index 0000000000000000000000000000000000000000..302de2ab60d5f17845eb1172aac605799350738c --- /dev/null +++ b/eval-results/concedo/Vicuzard-30B-Uncensored/results_2023-09-23T02-47-37.236097.json @@ -0,0 +1,107 @@ +{ + "config_general": { + "model_name": "concedo/Vicuzard-30B-Uncensored", + "model_sha": "e2329c05a6e59660ba3cbcc01adf30a78f852594", + "model_size": "60.65 GB", + "model_dtype": "torch.float16", + "lighteval_sha": "0f318ecf002208468154899217b3ba7c6ae09374", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|drop|3": { + "em": 0.17365771812080538, + "em_stderr": 0.003879418958892462, + "f1": 0.2676352768456391, + "f1_stderr": 0.003979938331768844 + }, + "harness|gsm8k|5": { + "acc": 0.15390447308567096, + "acc_stderr": 0.009939799304049 + }, + "harness|winogrande|5": { + "acc": 0.771112865035517, + "acc_stderr": 0.011807360224025395 + }, + "all": { + "em": 0.17365771812080538, + "em_stderr": 0.003879418958892462, + "f1": 0.2676352768456391, + "f1_stderr": 0.003979938331768844, + "acc": 0.46250866906059396, + "acc_stderr": 0.010873579764037198 + } + }, + "versions": { + "harness|drop|3": 1, + "harness|gsm8k|5": 0, + "harness|winogrande|5": 0, + "all": 0 + }, + "config_tasks": { + "harness|drop": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|drop|3": { + "hashes": { + "hash_examples": "1d27416e8324e9a3", + "hash_full_prompts": "a5513ff9a741b385", + "hash_input_tokens": 
"61b608e0b5ceed76", + "hash_cont_tokens": "ef1cc179ce56dded" + }, + "truncated": 1263, + "non-truncated": 8273, + "padded": 0, + "non-padded": 9536, + "effective_few_shots": 3.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "bda342e47b5099b2", + "hash_cont_tokens": "32288649456cd8e2" + }, + "truncated": 0, + "non-truncated": 1319, + "padded": 0, + "non-padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "c0bedf98cb040854", + "hash_cont_tokens": "f08975ad6f2d5864" + }, + "truncated": 0, + "non-truncated": 2534, + "padded": 2432, + "non-padded": 102, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "9b4d8993161e637d", + "hash_full_prompts": "08215e527b7e60a5", + "hash_input_tokens": "80afe720f936f8d2", + "hash_cont_tokens": "84ffd68e293fb097" + }, + "total_evaluation_time_secondes": "19863.771782398224", + "truncated": 1263, + "non-truncated": 12126, + "padded": 2432, + "non-padded": 10957, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/deepnight-research/lil-c3po/results_2023-12-16T17-28-57.885828.json b/eval-results/deepnight-research/lil-c3po/results_2023-12-16T17-28-57.885828.json new file mode 100644 index 0000000000000000000000000000000000000000..738fc617b6fca555e7e11853c7ca9eb2bfbcc825 --- /dev/null +++ b/eval-results/deepnight-research/lil-c3po/results_2023-12-16T17-28-57.885828.json @@ -0,0 +1,1409 @@ +{ + "config_general": { + "lighteval_sha": "0e4607eff593f6f842aeaa0e5fa6760f58b9d1e9", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "", + "start_time": 371261.788468432, + "end_time": 378695.776369421, + "total_evaluation_time_secondes": "7433.987900989014", + "model_name": "deepnight-research/lil-c3po", + "model_sha": "7888318c72df9f668df20b2916b651b94a6ed77c", + "model_dtype": "torch.float16", + "model_size": "13.99 GB" + }, + "results": { + "harness|arc:challenge|25": { + "acc": 0.6262798634812287, + "acc_stderr": 0.014137708601759091, + "acc_norm": 0.6501706484641638, + "acc_norm_stderr": 0.01393680921215829 + }, + "harness|hellaswag|10": { + "acc": 0.6699860585540729, + "acc_stderr": 0.004692567655961763, + "acc_norm": 0.8444532961561442, + "acc_norm_stderr": 0.0036168436913607627 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.32, + "acc_stderr": 0.04688261722621502, + "acc_norm": 0.32, + "acc_norm_stderr": 0.04688261722621502 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.6, + "acc_stderr": 0.042320736951515885, + "acc_norm": 0.6, + "acc_norm_stderr": 0.042320736951515885 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.6513157894736842, + "acc_stderr": 0.0387813988879761, + "acc_norm": 0.6513157894736842, + "acc_norm_stderr": 0.0387813988879761 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.59, + "acc_stderr": 0.049431107042371025, + "acc_norm": 0.59, + "acc_norm_stderr": 0.049431107042371025 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.7018867924528301, + "acc_stderr": 0.028152837942493864, + "acc_norm": 0.7018867924528301, + "acc_norm_stderr": 0.028152837942493864 + }, + "harness|hendrycksTest-college_biology|5": 
{ + "acc": 0.7013888888888888, + "acc_stderr": 0.03827052357950756, + "acc_norm": 0.7013888888888888, + "acc_norm_stderr": 0.03827052357950756 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.42, + "acc_stderr": 0.049604496374885836, + "acc_norm": 0.42, + "acc_norm_stderr": 0.049604496374885836 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.56, + "acc_stderr": 0.049888765156985884, + "acc_norm": 0.56, + "acc_norm_stderr": 0.049888765156985884 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.34, + "acc_stderr": 0.04760952285695235, + "acc_norm": 0.34, + "acc_norm_stderr": 0.04760952285695235 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.6184971098265896, + "acc_stderr": 0.03703851193099521, + "acc_norm": 0.6184971098265896, + "acc_norm_stderr": 0.03703851193099521 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.4117647058823529, + "acc_stderr": 0.04897104952726367, + "acc_norm": 0.4117647058823529, + "acc_norm_stderr": 0.04897104952726367 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.73, + "acc_stderr": 0.04461960433384739, + "acc_norm": 0.73, + "acc_norm_stderr": 0.04461960433384739 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.5617021276595745, + "acc_stderr": 0.03243618636108101, + "acc_norm": 0.5617021276595745, + "acc_norm_stderr": 0.03243618636108101 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.4473684210526316, + "acc_stderr": 0.04677473004491199, + "acc_norm": 0.4473684210526316, + "acc_norm_stderr": 0.04677473004491199 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.593103448275862, + "acc_stderr": 0.04093793981266236, + "acc_norm": 0.593103448275862, + "acc_norm_stderr": 0.04093793981266236 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.37566137566137564, + "acc_stderr": 0.02494236893115979, + "acc_norm": 0.37566137566137564, + "acc_norm_stderr": 0.02494236893115979 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.4523809523809524, + "acc_stderr": 0.044518079590553275, + "acc_norm": 0.4523809523809524, + "acc_norm_stderr": 0.044518079590553275 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.37, + "acc_stderr": 0.04852365870939099, + "acc_norm": 0.37, + "acc_norm_stderr": 0.04852365870939099 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.7290322580645161, + "acc_stderr": 0.025284416114900156, + "acc_norm": 0.7290322580645161, + "acc_norm_stderr": 0.025284416114900156 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.5172413793103449, + "acc_stderr": 0.03515895551165698, + "acc_norm": 0.5172413793103449, + "acc_norm_stderr": 0.03515895551165698 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.69, + "acc_stderr": 0.04648231987117316, + "acc_norm": 0.69, + "acc_norm_stderr": 0.04648231987117316 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.7575757575757576, + "acc_stderr": 0.03346409881055953, + "acc_norm": 0.7575757575757576, + "acc_norm_stderr": 0.03346409881055953 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.803030303030303, + "acc_stderr": 0.028335609732463362, + "acc_norm": 0.803030303030303, + "acc_norm_stderr": 0.028335609732463362 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.8497409326424871, + "acc_stderr": 0.02578772318072388, + "acc_norm": 0.8497409326424871, + "acc_norm_stderr": 0.02578772318072388 + }, + 
"harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.6256410256410256, + "acc_stderr": 0.024537591572830506, + "acc_norm": 0.6256410256410256, + "acc_norm_stderr": 0.024537591572830506 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.362962962962963, + "acc_stderr": 0.02931820364520686, + "acc_norm": 0.362962962962963, + "acc_norm_stderr": 0.02931820364520686 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.6554621848739496, + "acc_stderr": 0.030868682604121622, + "acc_norm": 0.6554621848739496, + "acc_norm_stderr": 0.030868682604121622 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.37748344370860926, + "acc_stderr": 0.03958027231121569, + "acc_norm": 0.37748344370860926, + "acc_norm_stderr": 0.03958027231121569 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.8110091743119267, + "acc_stderr": 0.016785481159203624, + "acc_norm": 0.8110091743119267, + "acc_norm_stderr": 0.016785481159203624 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.5324074074074074, + "acc_stderr": 0.03402801581358966, + "acc_norm": 0.5324074074074074, + "acc_norm_stderr": 0.03402801581358966 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.7647058823529411, + "acc_stderr": 0.029771775228145628, + "acc_norm": 0.7647058823529411, + "acc_norm_stderr": 0.029771775228145628 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.7468354430379747, + "acc_stderr": 0.028304657943035286, + "acc_norm": 0.7468354430379747, + "acc_norm_stderr": 0.028304657943035286 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.6860986547085202, + "acc_stderr": 0.031146796482972465, + "acc_norm": 0.6860986547085202, + "acc_norm_stderr": 0.031146796482972465 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.7175572519083969, + "acc_stderr": 0.03948406125768361, + "acc_norm": 0.7175572519083969, + "acc_norm_stderr": 0.03948406125768361 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.8016528925619835, + "acc_stderr": 0.036401182719909456, + "acc_norm": 0.8016528925619835, + "acc_norm_stderr": 0.036401182719909456 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.6944444444444444, + "acc_stderr": 0.04453197507374984, + "acc_norm": 0.6944444444444444, + "acc_norm_stderr": 0.04453197507374984 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.7484662576687117, + "acc_stderr": 0.03408997886857529, + "acc_norm": 0.7484662576687117, + "acc_norm_stderr": 0.03408997886857529 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.44642857142857145, + "acc_stderr": 0.047184714852195886, + "acc_norm": 0.44642857142857145, + "acc_norm_stderr": 0.047184714852195886 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.7475728155339806, + "acc_stderr": 0.04301250399690878, + "acc_norm": 0.7475728155339806, + "acc_norm_stderr": 0.04301250399690878 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.8632478632478633, + "acc_stderr": 0.022509033937077802, + "acc_norm": 0.8632478632478633, + "acc_norm_stderr": 0.022509033937077802 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.68, + "acc_stderr": 0.04688261722621504, + "acc_norm": 0.68, + "acc_norm_stderr": 0.04688261722621504 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.7943805874840357, + "acc_stderr": 0.01445250045678583, + "acc_norm": 0.7943805874840357, + "acc_norm_stderr": 0.01445250045678583 + }, + 
"harness|hendrycksTest-moral_disputes|5": { + "acc": 0.6705202312138728, + "acc_stderr": 0.025305258131879702, + "acc_norm": 0.6705202312138728, + "acc_norm_stderr": 0.025305258131879702 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.48268156424581005, + "acc_stderr": 0.016712467441702517, + "acc_norm": 0.48268156424581005, + "acc_norm_stderr": 0.016712467441702517 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.6928104575163399, + "acc_stderr": 0.02641560191438899, + "acc_norm": 0.6928104575163399, + "acc_norm_stderr": 0.02641560191438899 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.6688102893890675, + "acc_stderr": 0.02673062072800491, + "acc_norm": 0.6688102893890675, + "acc_norm_stderr": 0.02673062072800491 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.6759259259259259, + "acc_stderr": 0.02604176620271716, + "acc_norm": 0.6759259259259259, + "acc_norm_stderr": 0.02604176620271716 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.46099290780141844, + "acc_stderr": 0.029736592526424438, + "acc_norm": 0.46099290780141844, + "acc_norm_stderr": 0.029736592526424438 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.4406779661016949, + "acc_stderr": 0.012680037994097074, + "acc_norm": 0.4406779661016949, + "acc_norm_stderr": 0.012680037994097074 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.6323529411764706, + "acc_stderr": 0.02928941340940319, + "acc_norm": 0.6323529411764706, + "acc_norm_stderr": 0.02928941340940319 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.6225490196078431, + "acc_stderr": 0.01961085147488029, + "acc_norm": 0.6225490196078431, + "acc_norm_stderr": 0.01961085147488029 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.6818181818181818, + "acc_stderr": 0.04461272175910509, + "acc_norm": 0.6818181818181818, + "acc_norm_stderr": 0.04461272175910509 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.7061224489795919, + "acc_stderr": 0.02916273841024977, + "acc_norm": 0.7061224489795919, + "acc_norm_stderr": 0.02916273841024977 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.7910447761194029, + "acc_stderr": 0.028748298931728655, + "acc_norm": 0.7910447761194029, + "acc_norm_stderr": 0.028748298931728655 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.84, + "acc_stderr": 0.03684529491774708, + "acc_norm": 0.84, + "acc_norm_stderr": 0.03684529491774708 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.4759036144578313, + "acc_stderr": 0.03887971849597264, + "acc_norm": 0.4759036144578313, + "acc_norm_stderr": 0.03887971849597264 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.8538011695906432, + "acc_stderr": 0.027097290118070806, + "acc_norm": 0.8538011695906432, + "acc_norm_stderr": 0.027097290118070806 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.5238678090575275, + "mc1_stderr": 0.017483547156961567, + "mc2": 0.6873119394140667, + "mc2_stderr": 0.0149863398321527 + }, + "harness|winogrande|5": { + "acc": 0.7916337805840569, + "acc_stderr": 0.011414554399987745 + }, + "harness|gsm8k|5": { + "acc": 0.4844579226686884, + "acc_stderr": 0.013765829454512893 + }, + "all": { + "acc": 0.6248592823720264, + "acc_stderr": 0.032934207150823985, + "acc_norm": 0.627774280407218, + "acc_norm_stderr": 0.03360219710155188, + "mc1": 0.5238678090575275, + "mc1_stderr": 0.017483547156961567, + "mc2": 0.6873119394140667, + "mc2_stderr": 0.0149863398321527 + } + }, + "versions": { + 
"all": 0, + "harness|arc:challenge|25": 0, + "harness|gsm8k|5": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "harness|winogrande|5": 0 + }, + "config_tasks": { + "harness|arc:challenge": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + 
"harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|arc:challenge|25": { + "hashes": { + "hash_examples": "17b0cae357c0259e", + "hash_full_prompts": "045cbb916e5145c6", + 
"hash_input_tokens": "9bcd0d1d37471713", + "hash_cont_tokens": "289aa98c400841d8" + }, + "truncated": 0, + "non_truncated": 1172, + "padded": 4670, + "non_padded": 17, + "effective_few_shots": 25.0, + "num_truncated_few_shots": 0 + }, + "harness|hellaswag|10": { + "hashes": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "80b8c6d79740318e", + "hash_cont_tokens": "ac460260c3e6efc9" + }, + "truncated": 0, + "non_truncated": 10042, + "padded": 40101, + "non_padded": 67, + "effective_few_shots": 10.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hashes": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "b813d36287c6556c", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-anatomy|5": { + "hashes": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "09dc2380497f7a47", + "hash_cont_tokens": "a52a4f60d98cbe5c" + }, + "truncated": 0, + "non_truncated": 135, + "padded": 540, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-astronomy|5": { + "hashes": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "68ca3220b0fdd1f3", + "hash_cont_tokens": "10f7d8eeba97841d" + }, + "truncated": 0, + "non_truncated": 152, + "padded": 608, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-business_ethics|5": { + "hashes": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "bd14ef1320de241e", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hashes": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "d96186ab98017c43", + "hash_cont_tokens": "edef9975ba9165b5" + }, + "truncated": 0, + "non_truncated": 265, + "padded": 1060, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_biology|5": { + "hashes": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "424136b34e95b200", + "hash_cont_tokens": "0aa103ec6602280b" + }, + "truncated": 0, + "non_truncated": 144, + "padded": 576, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_chemistry|5": { + "hashes": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "8dd8b80e336bbe54", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_computer_science|5": { + "hashes": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "145d4cef8ca2261d", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + 
"effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_mathematics|5": { + "hashes": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "561995d32d2b25c4", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_medicine|5": { + "hashes": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "6a258a9d4418599c", + "hash_cont_tokens": "1979021dbc698754" + }, + "truncated": 0, + "non_truncated": 173, + "padded": 692, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_physics|5": { + "hashes": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "fa5e0d5b5f97b66a", + "hash_cont_tokens": "7cf7fe2bab00acbd" + }, + "truncated": 0, + "non_truncated": 102, + "padded": 408, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-computer_security|5": { + "hashes": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "07d27397edfae492", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hashes": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "da5e6c3c8eb17da6", + "hash_cont_tokens": "903f64eed2b0d217" + }, + "truncated": 0, + "non_truncated": 235, + "padded": 940, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-econometrics|5": { + "hashes": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "f6ba8e358bdb523e", + "hash_cont_tokens": "721ae6c5302c4bf2" + }, + "truncated": 0, + "non_truncated": 114, + "padded": 456, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hashes": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "b2459da4c5ca8590", + "hash_cont_tokens": "15a738960ed3e587" + }, + "truncated": 0, + "non_truncated": 145, + "padded": 575, + "non_padded": 5, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hashes": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "0b969d9ad706a13a", + "hash_cont_tokens": "c96470462fc71683" + }, + "truncated": 0, + "non_truncated": 378, + "padded": 1512, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-formal_logic|5": { + "hashes": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "02bc3eb5f90da86e", + "hash_cont_tokens": "0e1ce025c9d6ee7e" + }, + "truncated": 0, + "non_truncated": 126, + "padded": 504, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-global_facts|5": { + "hashes": { + "hash_examples": 
"371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "3d5106918bcbeb43", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_biology|5": { + "hashes": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "7b089392db2dabbd", + "hash_cont_tokens": "e34d57f7d3c4ca16" + }, + "truncated": 0, + "non_truncated": 310, + "padded": 1240, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hashes": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "ba90b2ffed1c067d", + "hash_cont_tokens": "e8482d44df4b3740" + }, + "truncated": 0, + "non_truncated": 203, + "padded": 812, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hashes": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "60eeec309ef0717f", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hashes": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "5e5e8bf3808e0ead", + "hash_cont_tokens": "d63e679a49418339" + }, + "truncated": 0, + "non_truncated": 165, + "padded": 656, + "non_padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_geography|5": { + "hashes": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "4da9b741d4e7ea78", + "hash_cont_tokens": "d78483e286d06f1a" + }, + "truncated": 0, + "non_truncated": 198, + "padded": 792, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hashes": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "acb4bc872ac86ed7", + "hash_cont_tokens": "691cdff71ff5fe57" + }, + "truncated": 0, + "non_truncated": 193, + "padded": 772, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hashes": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "840fc6403eb69ab0", + "hash_cont_tokens": "d5ad4c5bdca967ad" + }, + "truncated": 0, + "non_truncated": 390, + "padded": 1560, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hashes": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "3629a7f2cd17faeb", + "hash_cont_tokens": "8f631ca5687dd0d4" + }, + "truncated": 0, + "non_truncated": 270, + "padded": 1080, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hashes": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + 
"hash_input_tokens": "6846f684260e3997", + "hash_cont_tokens": "7321048a28451473" + }, + "truncated": 0, + "non_truncated": 238, + "padded": 952, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_physics|5": { + "hashes": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "85aee25d6bdad94a", + "hash_cont_tokens": "bb137581f269861c" + }, + "truncated": 0, + "non_truncated": 151, + "padded": 604, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hashes": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "290b66d6d666a35f", + "hash_cont_tokens": "b455cab2675bd863" + }, + "truncated": 0, + "non_truncated": 545, + "padded": 2180, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hashes": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "a77a7668b437bc82", + "hash_cont_tokens": "1b3196fec7e58037" + }, + "truncated": 0, + "non_truncated": 216, + "padded": 864, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hashes": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "63548c7fa9ba7a78", + "hash_cont_tokens": "a331dedc2aa01b3e" + }, + "truncated": 0, + "non_truncated": 204, + "padded": 816, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hashes": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "83c5da18bfa50812", + "hash_cont_tokens": "d0fbe030b8c8c2bf" + }, + "truncated": 0, + "non_truncated": 237, + "padded": 948, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_aging|5": { + "hashes": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "bebbd11f22006685", + "hash_cont_tokens": "1dd29c3755494850" + }, + "truncated": 0, + "non_truncated": 223, + "padded": 892, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_sexuality|5": { + "hashes": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "7b85ee9b8ee54f4f", + "hash_cont_tokens": "c85573f663c10691" + }, + "truncated": 0, + "non_truncated": 131, + "padded": 524, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-international_law|5": { + "hashes": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "7bfc55ab7065943e", + "hash_cont_tokens": "d263804ba918154f" + }, + "truncated": 0, + "non_truncated": 121, + "padded": 484, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-jurisprudence|5": { + "hashes": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "69573f1675e053c6", + "hash_cont_tokens": "581986691a84ece8" + }, + "truncated": 0, + "non_truncated": 108, + 
"padded": 432, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hashes": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "552324ef20094bdc", + "hash_cont_tokens": "55a858b28bbda458" + }, + "truncated": 0, + "non_truncated": 163, + "padded": 652, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-machine_learning|5": { + "hashes": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "96449357a7318905", + "hash_cont_tokens": "e99d3d3efd4ac7a3" + }, + "truncated": 0, + "non_truncated": 112, + "padded": 448, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-management|5": { + "hashes": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "3b849249168e3b88", + "hash_cont_tokens": "13d9dc56bca34726" + }, + "truncated": 0, + "non_truncated": 103, + "padded": 412, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-marketing|5": { + "hashes": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "af0e186f2756b70d", + "hash_cont_tokens": "2700ea26933916a2" + }, + "truncated": 0, + "non_truncated": 234, + "padded": 936, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-medical_genetics|5": { + "hashes": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "9f6a6de16509b6d9", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-miscellaneous|5": { + "hashes": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "9194406d589f7c10", + "hash_cont_tokens": "7bf4341c79587250" + }, + "truncated": 0, + "non_truncated": 783, + "padded": 3132, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_disputes|5": { + "hashes": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "769486efc74d9f8e", + "hash_cont_tokens": "38a48e9de6976f00" + }, + "truncated": 0, + "non_truncated": 346, + "padded": 1384, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hashes": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "a90fd4dd90959dad", + "hash_cont_tokens": "761c4dc187689d89" + }, + "truncated": 0, + "non_truncated": 895, + "padded": 3580, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-nutrition|5": { + "hashes": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "1a3b843e66efd29b", + "hash_cont_tokens": "65005bd7d6f6012a" + }, + "truncated": 0, + "non_truncated": 306, + "padded": 1224, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-philosophy|5": { + "hashes": { + "hash_examples": 
"9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "09820001a3d00013", + "hash_cont_tokens": "0b47934fb6314dec" + }, + "truncated": 0, + "non_truncated": 311, + "padded": 1244, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-prehistory|5": { + "hashes": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "7c4ec364ce2768c7", + "hash_cont_tokens": "3f20acd855ee0a29" + }, + "truncated": 0, + "non_truncated": 324, + "padded": 1296, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_accounting|5": { + "hashes": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "ced0534574d0ae3f", + "hash_cont_tokens": "8f122ba881355d4b" + }, + "truncated": 0, + "non_truncated": 282, + "padded": 1128, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_law|5": { + "hashes": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "bcbdbbde22ec73e3", + "hash_cont_tokens": "90d5df417c4d3fd3" + }, + "truncated": 0, + "non_truncated": 1534, + "padded": 6136, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_medicine|5": { + "hashes": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "c54d753563114d45", + "hash_cont_tokens": "4a2d2988884f7f70" + }, + "truncated": 0, + "non_truncated": 272, + "padded": 1088, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_psychology|5": { + "hashes": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "b75dc55c0e32fa52", + "hash_cont_tokens": "e0a952cb8a9c81de" + }, + "truncated": 0, + "non_truncated": 612, + "padded": 2448, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-public_relations|5": { + "hashes": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "5ccdc8ec8db99622", + "hash_cont_tokens": "1fa77a8dff3922b8" + }, + "truncated": 0, + "non_truncated": 110, + "padded": 440, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-security_studies|5": { + "hashes": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "ca8497342e5b1d57", + "hash_cont_tokens": "81fc9cb3cbdd52db" + }, + "truncated": 0, + "non_truncated": 245, + "padded": 980, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-sociology|5": { + "hashes": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "069c76424fbd3dab", + "hash_cont_tokens": "2a0493252ed2cf43" + }, + "truncated": 0, + "non_truncated": 201, + "padded": 804, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hashes": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "a7e393a626169576", + "hash_cont_tokens": "17b868b63507f9a3" 
+ }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-virology|5": { + "hashes": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "bf99dc973e3a650d", + "hash_cont_tokens": "5ab892d003b00c98" + }, + "truncated": 0, + "non_truncated": 166, + "padded": 664, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-world_religions|5": { + "hashes": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "1761cfaf21797065", + "hash_cont_tokens": "15a5e5dbdfbb8568" + }, + "truncated": 0, + "non_truncated": 171, + "padded": 684, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|truthfulqa:mc|0": { + "hashes": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "298b43914bbdf4ca", + "hash_cont_tokens": "5a8d4bb398b1c3c0" + }, + "truncated": 0, + "non_truncated": 817, + "padded": 9996, + "non_padded": 0, + "effective_few_shots": 0.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "31aa3477d959f771", + "hash_cont_tokens": "618558fb93c0f288" + }, + "truncated": 0, + "non_truncated": 1267, + "padded": 2534, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "6af0ae8cfe684f50", + "hash_cont_tokens": "93aa5e22c86c6501" + }, + "truncated": 0, + "non_truncated": 1319, + "padded": 0, + "non_padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "3b7fa57a057f9415", + "hash_full_prompts": "63615fc50fc9417c", + "hash_input_tokens": "9c04e828ae29cacc", + "hash_cont_tokens": "8bba71397999438a" + }, + "truncated": 0, + "non_truncated": 28659, + "padded": 113460, + "non_padded": 1412, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/deepnight-research/zsc-text/results_2023-08-20T14-23-47.276985.json b/eval-results/deepnight-research/zsc-text/results_2023-08-20T14-23-47.276985.json new file mode 100644 index 0000000000000000000000000000000000000000..bdaa93802044cc5397d580c3ef1975d248ee4617 --- /dev/null +++ b/eval-results/deepnight-research/zsc-text/results_2023-08-20T14-23-47.276985.json @@ -0,0 +1,1365 @@ +{ + "results": { + "harness|arc:challenge|25": { + "acc": 0.22013651877133106, + "acc_stderr": 0.01210812488346098, + "acc_norm": 0.26706484641638223, + "acc_norm_stderr": 0.012928933196496328 + }, + "harness|hellaswag|10": { + "acc": 0.25473013343955386, + "acc_stderr": 0.004348189459336533, + "acc_norm": 0.25761800438159727, + "acc_norm_stderr": 0.004364287353415472 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.22, + "acc_stderr": 0.04163331998932268, + "acc_norm": 0.22, + "acc_norm_stderr": 0.04163331998932268 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.18518518518518517, + "acc_stderr": 0.03355677216313142, + "acc_norm": 0.18518518518518517, + "acc_norm_stderr": 0.03355677216313142 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.17763157894736842, + 
"acc_stderr": 0.031103182383123398, + "acc_norm": 0.17763157894736842, + "acc_norm_stderr": 0.031103182383123398 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.3, + "acc_stderr": 0.046056618647183814, + "acc_norm": 0.3, + "acc_norm_stderr": 0.046056618647183814 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.21509433962264152, + "acc_stderr": 0.02528839450289137, + "acc_norm": 0.21509433962264152, + "acc_norm_stderr": 0.02528839450289137 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.2569444444444444, + "acc_stderr": 0.03653946969442099, + "acc_norm": 0.2569444444444444, + "acc_norm_stderr": 0.03653946969442099 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.2, + "acc_stderr": 0.04020151261036845, + "acc_norm": 0.2, + "acc_norm_stderr": 0.04020151261036845 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.26, + "acc_stderr": 0.0440844002276808, + "acc_norm": 0.26, + "acc_norm_stderr": 0.0440844002276808 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.21, + "acc_stderr": 0.040936018074033256, + "acc_norm": 0.21, + "acc_norm_stderr": 0.040936018074033256 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.20809248554913296, + "acc_stderr": 0.030952890217749874, + "acc_norm": 0.20809248554913296, + "acc_norm_stderr": 0.030952890217749874 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.21568627450980393, + "acc_stderr": 0.04092563958237654, + "acc_norm": 0.21568627450980393, + "acc_norm_stderr": 0.04092563958237654 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.28, + "acc_stderr": 0.045126085985421276, + "acc_norm": 0.28, + "acc_norm_stderr": 0.045126085985421276 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.26382978723404255, + "acc_stderr": 0.028809989854102973, + "acc_norm": 0.26382978723404255, + "acc_norm_stderr": 0.028809989854102973 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.23684210526315788, + "acc_stderr": 0.039994238792813365, + "acc_norm": 0.23684210526315788, + "acc_norm_stderr": 0.039994238792813365 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.2413793103448276, + "acc_stderr": 0.03565998174135302, + "acc_norm": 0.2413793103448276, + "acc_norm_stderr": 0.03565998174135302 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.20899470899470898, + "acc_stderr": 0.02094048156533486, + "acc_norm": 0.20899470899470898, + "acc_norm_stderr": 0.02094048156533486 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.2857142857142857, + "acc_stderr": 0.04040610178208841, + "acc_norm": 0.2857142857142857, + "acc_norm_stderr": 0.04040610178208841 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.18, + "acc_stderr": 0.038612291966536934, + "acc_norm": 0.18, + "acc_norm_stderr": 0.038612291966536934 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.1774193548387097, + "acc_stderr": 0.02173254068932927, + "acc_norm": 0.1774193548387097, + "acc_norm_stderr": 0.02173254068932927 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.15270935960591134, + "acc_stderr": 0.02530890453938063, + "acc_norm": 0.15270935960591134, + "acc_norm_stderr": 0.02530890453938063 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.25, + "acc_stderr": 0.04351941398892446, + "acc_norm": 0.25, + "acc_norm_stderr": 0.04351941398892446 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 
0.21818181818181817, + "acc_stderr": 0.03225078108306289, + "acc_norm": 0.21818181818181817, + "acc_norm_stderr": 0.03225078108306289 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.17676767676767677, + "acc_stderr": 0.027178752639044915, + "acc_norm": 0.17676767676767677, + "acc_norm_stderr": 0.027178752639044915 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.19689119170984457, + "acc_stderr": 0.028697873971860664, + "acc_norm": 0.19689119170984457, + "acc_norm_stderr": 0.028697873971860664 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.20256410256410257, + "acc_stderr": 0.020377660970371372, + "acc_norm": 0.20256410256410257, + "acc_norm_stderr": 0.020377660970371372 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.2111111111111111, + "acc_stderr": 0.024882116857655075, + "acc_norm": 0.2111111111111111, + "acc_norm_stderr": 0.024882116857655075 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.21008403361344538, + "acc_stderr": 0.026461398717471874, + "acc_norm": 0.21008403361344538, + "acc_norm_stderr": 0.026461398717471874 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.1986754966887417, + "acc_stderr": 0.03257847384436776, + "acc_norm": 0.1986754966887417, + "acc_norm_stderr": 0.03257847384436776 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.1926605504587156, + "acc_stderr": 0.016909276884936094, + "acc_norm": 0.1926605504587156, + "acc_norm_stderr": 0.016909276884936094 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.1527777777777778, + "acc_stderr": 0.024536326026134224, + "acc_norm": 0.1527777777777778, + "acc_norm_stderr": 0.024536326026134224 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.25, + "acc_stderr": 0.03039153369274154, + "acc_norm": 0.25, + "acc_norm_stderr": 0.03039153369274154 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.270042194092827, + "acc_stderr": 0.028900721906293426, + "acc_norm": 0.270042194092827, + "acc_norm_stderr": 0.028900721906293426 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.31390134529147984, + "acc_stderr": 0.031146796482972465, + "acc_norm": 0.31390134529147984, + "acc_norm_stderr": 0.031146796482972465 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.2595419847328244, + "acc_stderr": 0.03844876139785271, + "acc_norm": 0.2595419847328244, + "acc_norm_stderr": 0.03844876139785271 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.2396694214876033, + "acc_stderr": 0.03896878985070417, + "acc_norm": 0.2396694214876033, + "acc_norm_stderr": 0.03896878985070417 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.25925925925925924, + "acc_stderr": 0.042365112580946336, + "acc_norm": 0.25925925925925924, + "acc_norm_stderr": 0.042365112580946336 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.22085889570552147, + "acc_stderr": 0.032591773927421776, + "acc_norm": 0.22085889570552147, + "acc_norm_stderr": 0.032591773927421776 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.3125, + "acc_stderr": 0.043994650575715215, + "acc_norm": 0.3125, + "acc_norm_stderr": 0.043994650575715215 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.17475728155339806, + "acc_stderr": 0.037601780060266224, + "acc_norm": 0.17475728155339806, + "acc_norm_stderr": 0.037601780060266224 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 
0.2905982905982906, + "acc_stderr": 0.02974504857267404, + "acc_norm": 0.2905982905982906, + "acc_norm_stderr": 0.02974504857267404 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.3, + "acc_stderr": 0.046056618647183814, + "acc_norm": 0.3, + "acc_norm_stderr": 0.046056618647183814 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.23754789272030652, + "acc_stderr": 0.015218733046150193, + "acc_norm": 0.23754789272030652, + "acc_norm_stderr": 0.015218733046150193 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.24855491329479767, + "acc_stderr": 0.023267528432100174, + "acc_norm": 0.24855491329479767, + "acc_norm_stderr": 0.023267528432100174 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.23798882681564246, + "acc_stderr": 0.014242630070574915, + "acc_norm": 0.23798882681564246, + "acc_norm_stderr": 0.014242630070574915 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.22549019607843138, + "acc_stderr": 0.023929155517351284, + "acc_norm": 0.22549019607843138, + "acc_norm_stderr": 0.023929155517351284 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.1864951768488746, + "acc_stderr": 0.02212243977248077, + "acc_norm": 0.1864951768488746, + "acc_norm_stderr": 0.02212243977248077 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.21604938271604937, + "acc_stderr": 0.022899162918445806, + "acc_norm": 0.21604938271604937, + "acc_norm_stderr": 0.022899162918445806 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.23404255319148937, + "acc_stderr": 0.025257861359432417, + "acc_norm": 0.23404255319148937, + "acc_norm_stderr": 0.025257861359432417 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.2457627118644068, + "acc_stderr": 0.010996156635142692, + "acc_norm": 0.2457627118644068, + "acc_norm_stderr": 0.010996156635142692 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.18382352941176472, + "acc_stderr": 0.023529242185193106, + "acc_norm": 0.18382352941176472, + "acc_norm_stderr": 0.023529242185193106 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.25, + "acc_stderr": 0.01751781884501444, + "acc_norm": 0.25, + "acc_norm_stderr": 0.01751781884501444 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.21818181818181817, + "acc_stderr": 0.03955932861795833, + "acc_norm": 0.21818181818181817, + "acc_norm_stderr": 0.03955932861795833 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.18775510204081633, + "acc_stderr": 0.02500025603954621, + "acc_norm": 0.18775510204081633, + "acc_norm_stderr": 0.02500025603954621 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.24378109452736318, + "acc_stderr": 0.03036049015401465, + "acc_norm": 0.24378109452736318, + "acc_norm_stderr": 0.03036049015401465 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.28, + "acc_stderr": 0.04512608598542128, + "acc_norm": 0.28, + "acc_norm_stderr": 0.04512608598542128 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.28313253012048195, + "acc_stderr": 0.03507295431370518, + "acc_norm": 0.28313253012048195, + "acc_norm_stderr": 0.03507295431370518 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.3216374269005848, + "acc_stderr": 0.03582529442573122, + "acc_norm": 0.3216374269005848, + "acc_norm_stderr": 0.03582529442573122 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.2423500611995104, + "mc1_stderr": 0.015000674373570345, + "mc2": 0.48353441310003753, + "mc2_stderr": 0.016429450805956303 + }, + "all": { + 
"acc": 0.23138093997919276, + "acc_stderr": 0.030709388988954305, + "acc_norm": 0.23222528232812656, + "acc_norm_stderr": 0.030723573839922344, + "mc1": 0.2423500611995104, + "mc1_stderr": 0.015000674373570345, + "mc2": 0.48353441310003753, + "mc2_stderr": 0.016429450805956303 + } + }, + "versions": { + "harness|arc:challenge|25": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "all": 0 + }, + "config_general": { + "model_name": "deepnight-research/zsc-text", + "model_sha": "9b1c704ac76968dbd61597c22610084b975ef576", + "model_dtype": "torch.float16", + "lighteval_sha": "2b9e1cf249accf9b8168101189269701a82bfb9c", + "num_few_shot_default": 0, + "num_fewshot_seeds": 
1, + "override_batch_size": 1, + "max_samples": null + }, + "config_tasks": { + "harness|arc:challenge": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + 
"harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task" + }, + "summary_tasks": { + "harness|arc:challenge|25": { + "hashes": { + "hash_examples": "17b0cae357c0259e", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "c08c26f3b959fada", + "hash_cont_tokens": "05c929f31923ff6a" + }, + "truncated": 1621, + "non-truncated": 3066, + "padded": 3036, + "non-padded": 1651, + "effective_few_shots": 25.0, + "num_truncated_few_shots": 0 + }, + "harness|hellaswag|10": { + "hashes": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "acf77dd0930cb868", + "hash_cont_tokens": "5cea47f1559b26a7" + }, + "truncated": 2081, + "non-truncated": 38087, + "padded": 37827, + "non-padded": 2341, + "effective_few_shots": 10.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hashes": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "026baba4f1855d52", + "hash_cont_tokens": "315867450f870798" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-anatomy|5": { + "hashes": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "380fe8d83ea3c9b2", + "hash_cont_tokens": "7e3688bb7ad2b2e2" + }, + "truncated": 0, + "non-truncated": 540, + "padded": 540, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-astronomy|5": { + "hashes": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "98d1d46752d29d15", + "hash_cont_tokens": "7b109b7aedd1de06" + }, + "truncated": 0, + "non-truncated": 608, + "padded": 608, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-business_ethics|5": { + "hashes": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "627fcc4e1a739e4c", + "hash_cont_tokens": "482818b9b9298a4a" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hashes": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "a553b87955b42e6f", + "hash_cont_tokens": "b576db639e0f3142" + }, + "truncated": 0, + "non-truncated": 1060, + "padded": 1060, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_biology|5": { + "hashes": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "8c0690ebe221add6", + "hash_cont_tokens": "cd0640e579e32ce4" + }, + "truncated": 0, + "non-truncated": 576, + "padded": 576, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_chemistry|5": { + "hashes": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "05e8af54937e98a6", + 
"hash_cont_tokens": "22ec0b82218e5be0" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_computer_science|5": { + "hashes": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "f3c3a14d5969c94c", + "hash_cont_tokens": "45af687c0fd428cd" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_mathematics|5": { + "hashes": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "a0c4f365335cd943", + "hash_cont_tokens": "4c7dc0b55346b3f3" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_medicine|5": { + "hashes": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "df45e8c2ca51f638", + "hash_cont_tokens": "d170cfdd736319e9" + }, + "truncated": 20, + "non-truncated": 672, + "padded": 672, + "non-padded": 20, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_physics|5": { + "hashes": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "ee3c8d69704f4eab", + "hash_cont_tokens": "2a816d0db2b2dad2" + }, + "truncated": 0, + "non-truncated": 408, + "padded": 408, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-computer_security|5": { + "hashes": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "4e793304fa40946a", + "hash_cont_tokens": "60d628e697a248fd" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hashes": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "4941098afc0a54b8", + "hash_cont_tokens": "c2c34d63c18eeb77" + }, + "truncated": 0, + "non-truncated": 940, + "padded": 940, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-econometrics|5": { + "hashes": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "baa172bae1b94c65", + "hash_cont_tokens": "3f107af6d9ff2dcd" + }, + "truncated": 0, + "non-truncated": 456, + "padded": 456, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hashes": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "4b0a609742a9e908", + "hash_cont_tokens": "3f847bca444eb260" + }, + "truncated": 0, + "non-truncated": 580, + "padded": 580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hashes": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "d90639331a8320f7", + "hash_cont_tokens": "c57831bd282cea61" + }, + "truncated": 0, + "non-truncated": 1512, + "padded": 1512, + "non-padded": 0, + 
"effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-formal_logic|5": { + "hashes": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "fe07863f8f0a0bd7", + "hash_cont_tokens": "387b1818a1601610" + }, + "truncated": 0, + "non-truncated": 504, + "padded": 504, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-global_facts|5": { + "hashes": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "e9dfcf2a004f401d", + "hash_cont_tokens": "315867450f870798" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_biology|5": { + "hashes": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "d0a3559db4ec26e1", + "hash_cont_tokens": "f459de237de504e9" + }, + "truncated": 0, + "non-truncated": 1240, + "padded": 1240, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hashes": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "9c8a2731d2f3380f", + "hash_cont_tokens": "49ec1deb042dbebe" + }, + "truncated": 0, + "non-truncated": 812, + "padded": 812, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hashes": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "f7bb5008371e1527", + "hash_cont_tokens": "fd1878b8e9c42423" + }, + "truncated": 16, + "non-truncated": 384, + "padded": 384, + "non-padded": 16, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hashes": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "fd4617eecbb18559", + "hash_cont_tokens": "73746f95aea6d401" + }, + "truncated": 660, + "non-truncated": 0, + "padded": 0, + "non-padded": 660, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_geography|5": { + "hashes": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "b3343e70f308a668", + "hash_cont_tokens": "607a1eae3d001215" + }, + "truncated": 0, + "non-truncated": 792, + "padded": 792, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hashes": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "063871e6fa526348", + "hash_cont_tokens": "fdeb368b2af27d82" + }, + "truncated": 0, + "non-truncated": 772, + "padded": 772, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hashes": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "78bd6589d0703076", + "hash_cont_tokens": "08f9e7c813838329" + }, + "truncated": 0, + "non-truncated": 1560, + "padded": 1560, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + 
"harness|hendrycksTest-high_school_mathematics|5": { + "hashes": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "2088f17ee1e7ac69", + "hash_cont_tokens": "3a9b67e2168a3919" + }, + "truncated": 0, + "non-truncated": 1080, + "padded": 1080, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hashes": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "0ad739227a37ce19", + "hash_cont_tokens": "18c3a115dde0d4a0" + }, + "truncated": 0, + "non-truncated": 952, + "padded": 952, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_physics|5": { + "hashes": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "6fd1a2da278b67ea", + "hash_cont_tokens": "d635b2adb3a26fc8" + }, + "truncated": 0, + "non-truncated": 604, + "padded": 604, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hashes": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "e6214bbed98ad866", + "hash_cont_tokens": "88d1c004c35ab299" + }, + "truncated": 0, + "non-truncated": 2180, + "padded": 2180, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hashes": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "819a1a92d862f61e", + "hash_cont_tokens": "9206c6335f47e2d8" + }, + "truncated": 4, + "non-truncated": 860, + "padded": 860, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hashes": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "d83d66a8ad169421", + "hash_cont_tokens": "3ea7eb381dd53a9b" + }, + "truncated": 816, + "non-truncated": 0, + "padded": 0, + "non-padded": 816, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hashes": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "f07275ca22fee79d", + "hash_cont_tokens": "5c8f2a6eed682cc1" + }, + "truncated": 948, + "non-truncated": 0, + "padded": 0, + "non-padded": 948, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_aging|5": { + "hashes": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "ac7d6c64ac9157db", + "hash_cont_tokens": "0d2410a9a6f7c215" + }, + "truncated": 0, + "non-truncated": 892, + "padded": 892, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_sexuality|5": { + "hashes": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "b8c08b86431c12ce", + "hash_cont_tokens": "27fc0321d0209a57" + }, + "truncated": 0, + "non-truncated": 524, + "padded": 524, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-international_law|5": { + "hashes": { + "hash_examples": "1300bfd0dfc59114", + 
"hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "935d8485a3e48113", + "hash_cont_tokens": "ac3124b7ae555857" + }, + "truncated": 0, + "non-truncated": 484, + "padded": 484, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-jurisprudence|5": { + "hashes": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "8c4714004d2f719f", + "hash_cont_tokens": "85f53f71b5bf59cc" + }, + "truncated": 0, + "non-truncated": 432, + "padded": 432, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hashes": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "4102585761849833", + "hash_cont_tokens": "7cc253a6e9ffe732" + }, + "truncated": 0, + "non-truncated": 652, + "padded": 652, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-machine_learning|5": { + "hashes": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "e41308a69396d427", + "hash_cont_tokens": "37044b9b52fbbd04" + }, + "truncated": 0, + "non-truncated": 448, + "padded": 448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-management|5": { + "hashes": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "45c08329de270f7c", + "hash_cont_tokens": "c49d804680a184aa" + }, + "truncated": 0, + "non-truncated": 412, + "padded": 412, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-marketing|5": { + "hashes": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "ac27d41d3262e8a4", + "hash_cont_tokens": "627aef25b6e292c0" + }, + "truncated": 0, + "non-truncated": 936, + "padded": 936, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-medical_genetics|5": { + "hashes": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "31f84690ba4f408c", + "hash_cont_tokens": "315867450f870798" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-miscellaneous|5": { + "hashes": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "41837ddf5cb248b2", + "hash_cont_tokens": "47ce21d22a7e5dc0" + }, + "truncated": 0, + "non-truncated": 3132, + "padded": 3132, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_disputes|5": { + "hashes": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "50818afe85700057", + "hash_cont_tokens": "893da4d07dd1b2bd" + }, + "truncated": 0, + "non-truncated": 1384, + "padded": 1384, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hashes": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "06ed985dd195fbb6", + "hash_cont_tokens": "0689a16840f8f6e8" + }, + "truncated": 0, + "non-truncated": 3580, + 
"padded": 3524, + "non-padded": 56, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-nutrition|5": { + "hashes": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "bc64a1c57f25606d", + "hash_cont_tokens": "7b837ac99342a9d9" + }, + "truncated": 0, + "non-truncated": 1224, + "padded": 1220, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-philosophy|5": { + "hashes": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "61dbc9148dd34ebc", + "hash_cont_tokens": "8f9fa377c2228a93" + }, + "truncated": 0, + "non-truncated": 1244, + "padded": 1244, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-prehistory|5": { + "hashes": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "98cae05c6c60ddf5", + "hash_cont_tokens": "082e5da2b5c610bc" + }, + "truncated": 0, + "non-truncated": 1296, + "padded": 1296, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_accounting|5": { + "hashes": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "c0c48830a9cfd1c0", + "hash_cont_tokens": "dd234b0e519fb7a0" + }, + "truncated": 0, + "non-truncated": 1128, + "padded": 1128, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_law|5": { + "hashes": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "b37c174e67f89fdf", + "hash_cont_tokens": "9f7692e6cf1fbaba" + }, + "truncated": 6136, + "non-truncated": 0, + "padded": 0, + "non-padded": 6136, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_medicine|5": { + "hashes": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "c34b7fc1e099adc9", + "hash_cont_tokens": "d9c68bab9ea25a63" + }, + "truncated": 1052, + "non-truncated": 36, + "padded": 36, + "non-padded": 1052, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_psychology|5": { + "hashes": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "861ac2ebf1ea8163", + "hash_cont_tokens": "61248e7f4ced24ed" + }, + "truncated": 0, + "non-truncated": 2448, + "padded": 2448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-public_relations|5": { + "hashes": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "ecb1fe6940b2660f", + "hash_cont_tokens": "458592fb64cdebd0" + }, + "truncated": 0, + "non-truncated": 440, + "padded": 440, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-security_studies|5": { + "hashes": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "dec99c55c33ad5c4", + "hash_cont_tokens": "266e50f726b20ed4" + }, + "truncated": 980, + "non-truncated": 0, + "padded": 0, + "non-padded": 980, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-sociology|5": { + 
"hashes": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "67d6f59e4c194367", + "hash_cont_tokens": "b33e8c419b968eea" + }, + "truncated": 0, + "non-truncated": 804, + "padded": 804, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hashes": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "f895efb381f53972", + "hash_cont_tokens": "315867450f870798" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-virology|5": { + "hashes": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "8838a6ea7b9dc1dd", + "hash_cont_tokens": "8c5130cfcf880044" + }, + "truncated": 0, + "non-truncated": 664, + "padded": 664, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-world_religions|5": { + "hashes": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "60595e3de43c3e85", + "hash_cont_tokens": "7b7fa11b779c55a8" + }, + "truncated": 0, + "non-truncated": 684, + "padded": 684, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|truthfulqa:mc|0": { + "hashes": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "34aeea4bb9abe314", + "hash_cont_tokens": "dbc111204ee4260b" + }, + "truncated": 0, + "non-truncated": 9996, + "padded": 9996, + "non-padded": 0, + "effective_few_shots": 0.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "d84d18e9a963753d", + "hash_full_prompts": "12b540783521a8e6", + "hash_input_tokens": "6285b5efe540a36a", + "hash_cont_tokens": "c6023fb155322c80" + }, + "total_evaluation_time_secondes": "815.3381953239441", + "truncated": 14334, + "non-truncated": 96685, + "padded": 96335, + "non-padded": 14684, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/digitous/13B-Chimera/results_2023-08-17T15-36-44.224352.json b/eval-results/digitous/13B-Chimera/results_2023-08-17T15-36-44.224352.json new file mode 100644 index 0000000000000000000000000000000000000000..0cb8d16ddf36f1f7f7370eb519fc1ecb178a8dab --- /dev/null +++ b/eval-results/digitous/13B-Chimera/results_2023-08-17T15-36-44.224352.json @@ -0,0 +1,1365 @@ +{ + "results": { + "harness|arc:challenge|25": { + "acc": 0.5614334470989761, + "acc_stderr": 0.014500682618212864, + "acc_norm": 0.575938566552901, + "acc_norm_stderr": 0.014441889627464398 + }, + "harness|hellaswag|10": { + "acc": 0.6163114917347142, + "acc_stderr": 0.004852896681736759, + "acc_norm": 0.8149770961959769, + "acc_norm_stderr": 0.0038752253693657315 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.36, + "acc_stderr": 0.04824181513244218, + "acc_norm": 0.36, + "acc_norm_stderr": 0.04824181513244218 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.4962962962962963, + "acc_stderr": 0.04319223625811331, + "acc_norm": 0.4962962962962963, + "acc_norm_stderr": 0.04319223625811331 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.4605263157894737, + "acc_stderr": 0.04056242252249034, + "acc_norm": 0.4605263157894737, + "acc_norm_stderr": 0.04056242252249034 + }, + 
"harness|hendrycksTest-business_ethics|5": { + "acc": 0.49, + "acc_stderr": 0.05024183937956912, + "acc_norm": 0.49, + "acc_norm_stderr": 0.05024183937956912 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.5094339622641509, + "acc_stderr": 0.030767394707808093, + "acc_norm": 0.5094339622641509, + "acc_norm_stderr": 0.030767394707808093 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.5208333333333334, + "acc_stderr": 0.041775789507399935, + "acc_norm": 0.5208333333333334, + "acc_norm_stderr": 0.041775789507399935 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.35, + "acc_stderr": 0.04793724854411019, + "acc_norm": 0.35, + "acc_norm_stderr": 0.04793724854411019 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.42, + "acc_stderr": 0.049604496374885836, + "acc_norm": 0.42, + "acc_norm_stderr": 0.049604496374885836 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.37, + "acc_stderr": 0.048523658709391, + "acc_norm": 0.37, + "acc_norm_stderr": 0.048523658709391 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.42196531791907516, + "acc_stderr": 0.0376574669386515, + "acc_norm": 0.42196531791907516, + "acc_norm_stderr": 0.0376574669386515 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.21568627450980393, + "acc_stderr": 0.040925639582376556, + "acc_norm": 0.21568627450980393, + "acc_norm_stderr": 0.040925639582376556 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.62, + "acc_stderr": 0.048783173121456316, + "acc_norm": 0.62, + "acc_norm_stderr": 0.048783173121456316 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.41702127659574467, + "acc_stderr": 0.032232762667117124, + "acc_norm": 0.41702127659574467, + "acc_norm_stderr": 0.032232762667117124 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.2807017543859649, + "acc_stderr": 0.04227054451232199, + "acc_norm": 0.2807017543859649, + "acc_norm_stderr": 0.04227054451232199 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.43448275862068964, + "acc_stderr": 0.041307408795554966, + "acc_norm": 0.43448275862068964, + "acc_norm_stderr": 0.041307408795554966 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.2857142857142857, + "acc_stderr": 0.023266512213730575, + "acc_norm": 0.2857142857142857, + "acc_norm_stderr": 0.023266512213730575 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.38095238095238093, + "acc_stderr": 0.04343525428949098, + "acc_norm": 0.38095238095238093, + "acc_norm_stderr": 0.04343525428949098 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.31, + "acc_stderr": 0.04648231987117316, + "acc_norm": 0.31, + "acc_norm_stderr": 0.04648231987117316 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.5548387096774193, + "acc_stderr": 0.028272410186214906, + "acc_norm": 0.5548387096774193, + "acc_norm_stderr": 0.028272410186214906 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.32019704433497537, + "acc_stderr": 0.032826493853041504, + "acc_norm": 0.32019704433497537, + "acc_norm_stderr": 0.032826493853041504 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.48, + "acc_stderr": 0.050211673156867795, + "acc_norm": 0.48, + "acc_norm_stderr": 0.050211673156867795 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.6363636363636364, + "acc_stderr": 0.037563357751878974, + "acc_norm": 0.6363636363636364, + "acc_norm_stderr": 
0.037563357751878974 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.6161616161616161, + "acc_stderr": 0.0346488167501634, + "acc_norm": 0.6161616161616161, + "acc_norm_stderr": 0.0346488167501634 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.689119170984456, + "acc_stderr": 0.03340361906276587, + "acc_norm": 0.689119170984456, + "acc_norm_stderr": 0.03340361906276587 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.4794871794871795, + "acc_stderr": 0.02532966316348994, + "acc_norm": 0.4794871794871795, + "acc_norm_stderr": 0.02532966316348994 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.2222222222222222, + "acc_stderr": 0.025348097468097856, + "acc_norm": 0.2222222222222222, + "acc_norm_stderr": 0.025348097468097856 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.453781512605042, + "acc_stderr": 0.03233943468182088, + "acc_norm": 0.453781512605042, + "acc_norm_stderr": 0.03233943468182088 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.2582781456953642, + "acc_stderr": 0.035737053147634576, + "acc_norm": 0.2582781456953642, + "acc_norm_stderr": 0.035737053147634576 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.689908256880734, + "acc_stderr": 0.019830849684439752, + "acc_norm": 0.689908256880734, + "acc_norm_stderr": 0.019830849684439752 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.3101851851851852, + "acc_stderr": 0.0315469628565663, + "acc_norm": 0.3101851851851852, + "acc_norm_stderr": 0.0315469628565663 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.6715686274509803, + "acc_stderr": 0.03296245110172229, + "acc_norm": 0.6715686274509803, + "acc_norm_stderr": 0.03296245110172229 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.6962025316455697, + "acc_stderr": 0.029936696387138608, + "acc_norm": 0.6962025316455697, + "acc_norm_stderr": 0.029936696387138608 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.5874439461883408, + "acc_stderr": 0.03304062175449297, + "acc_norm": 0.5874439461883408, + "acc_norm_stderr": 0.03304062175449297 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.5801526717557252, + "acc_stderr": 0.04328577215262971, + "acc_norm": 0.5801526717557252, + "acc_norm_stderr": 0.04328577215262971 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.6611570247933884, + "acc_stderr": 0.043207678075366705, + "acc_norm": 0.6611570247933884, + "acc_norm_stderr": 0.043207678075366705 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.5833333333333334, + "acc_stderr": 0.04766075165356461, + "acc_norm": 0.5833333333333334, + "acc_norm_stderr": 0.04766075165356461 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.5276073619631901, + "acc_stderr": 0.03922378290610991, + "acc_norm": 0.5276073619631901, + "acc_norm_stderr": 0.03922378290610991 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.36607142857142855, + "acc_stderr": 0.045723723587374296, + "acc_norm": 0.36607142857142855, + "acc_norm_stderr": 0.045723723587374296 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.7281553398058253, + "acc_stderr": 0.044052680241409216, + "acc_norm": 0.7281553398058253, + "acc_norm_stderr": 0.044052680241409216 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.7606837606837606, + "acc_stderr": 0.027951826808924333, + "acc_norm": 0.7606837606837606, + 
"acc_norm_stderr": 0.027951826808924333 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.56, + "acc_stderr": 0.04988876515698589, + "acc_norm": 0.56, + "acc_norm_stderr": 0.04988876515698589 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.7062579821200511, + "acc_stderr": 0.016287759388491654, + "acc_norm": 0.7062579821200511, + "acc_norm_stderr": 0.016287759388491654 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.5317919075144508, + "acc_stderr": 0.026864624366756646, + "acc_norm": 0.5317919075144508, + "acc_norm_stderr": 0.026864624366756646 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.23798882681564246, + "acc_stderr": 0.014242630070574917, + "acc_norm": 0.23798882681564246, + "acc_norm_stderr": 0.014242630070574917 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.545751633986928, + "acc_stderr": 0.028509807802626595, + "acc_norm": 0.545751633986928, + "acc_norm_stderr": 0.028509807802626595 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.5305466237942122, + "acc_stderr": 0.028345045864840625, + "acc_norm": 0.5305466237942122, + "acc_norm_stderr": 0.028345045864840625 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.5432098765432098, + "acc_stderr": 0.02771666165019404, + "acc_norm": 0.5432098765432098, + "acc_norm_stderr": 0.02771666165019404 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.3829787234042553, + "acc_stderr": 0.02899908090480618, + "acc_norm": 0.3829787234042553, + "acc_norm_stderr": 0.02899908090480618 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.4015645371577575, + "acc_stderr": 0.012520315120147106, + "acc_norm": 0.4015645371577575, + "acc_norm_stderr": 0.012520315120147106 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.46691176470588236, + "acc_stderr": 0.03030625772246832, + "acc_norm": 0.46691176470588236, + "acc_norm_stderr": 0.03030625772246832 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.5228758169934641, + "acc_stderr": 0.020206653187884786, + "acc_norm": 0.5228758169934641, + "acc_norm_stderr": 0.020206653187884786 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.5545454545454546, + "acc_stderr": 0.047605488214603246, + "acc_norm": 0.5545454545454546, + "acc_norm_stderr": 0.047605488214603246 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.5387755102040817, + "acc_stderr": 0.031912820526692774, + "acc_norm": 0.5387755102040817, + "acc_norm_stderr": 0.031912820526692774 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.681592039800995, + "acc_stderr": 0.032941184790540944, + "acc_norm": 0.681592039800995, + "acc_norm_stderr": 0.032941184790540944 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.8, + "acc_stderr": 0.04020151261036846, + "acc_norm": 0.8, + "acc_norm_stderr": 0.04020151261036846 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.4457831325301205, + "acc_stderr": 0.03869543323472101, + "acc_norm": 0.4457831325301205, + "acc_norm_stderr": 0.03869543323472101 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.7543859649122807, + "acc_stderr": 0.03301405946987249, + "acc_norm": 0.7543859649122807, + "acc_norm_stderr": 0.03301405946987249 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.3598531211750306, + "mc1_stderr": 0.016801860466677147, + "mc2": 0.5259120317801959, + "mc2_stderr": 0.015140404580264173 + }, + "all": { + "acc": 0.501681989763272, + "acc_stderr": 0.03489701828715803, + "acc_norm": 
0.5052950528804787, + "acc_norm_stderr": 0.03487945109557973, + "mc1": 0.3598531211750306, + "mc1_stderr": 0.016801860466677147, + "mc2": 0.5259120317801959, + "mc2_stderr": 0.015140404580264173 + } + }, + "versions": { + "harness|arc:challenge|25": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "all": 0 + }, + "config_general": { + "model_name": "digitous/13B-Chimera", + "model_sha": "85cfe8e6db2bee804873cfdb48955696cc5b0689", + "model_dtype": "torch.float16", + "lighteval_sha": "8bab069fee0c6e75ffa4c1ef8a9591c28ee0e049", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null + }, + "config_tasks": { + 
"harness|arc:challenge": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM 
Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task" + }, + "summary_tasks": { + "harness|arc:challenge|25": { + "hashes": { + "hash_examples": "17b0cae357c0259e", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "61571bf68d6d89aa", + "hash_cont_tokens": "8210decc6ff6f7df" + }, + "truncated": 0, + "non-truncated": 4687, + "padded": 4687, + "non-padded": 0, + "effective_few_shots": 25.0, + "num_truncated_few_shots": 0 + }, + "harness|hellaswag|10": { + "hashes": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "29906669b1c7054a", + "hash_cont_tokens": "b3b9e9017afa63af" + }, + "truncated": 0, + "non-truncated": 40168, + "padded": 40113, + "non-padded": 55, + "effective_few_shots": 10.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hashes": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "c54ff61ad0273dd7", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-anatomy|5": { + "hashes": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "be31a1e22aef5f90", + "hash_cont_tokens": "f11971a765cb609f" + }, + "truncated": 0, + "non-truncated": 540, + "padded": 540, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-astronomy|5": { + "hashes": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "277a7b1fad566940", + "hash_cont_tokens": "bf30e5d3f48250cb" + }, + "truncated": 0, + "non-truncated": 608, + "padded": 608, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-business_ethics|5": { + "hashes": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "ba552605bc116de5", + "hash_cont_tokens": "bc1dd9b2d995eb61" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hashes": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "428c7563d0b98ab9", + "hash_cont_tokens": "890a119624b3b935" + }, + "truncated": 0, + "non-truncated": 1060, + "padded": 1060, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_biology|5": { + "hashes": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "da036601573942e2", + "hash_cont_tokens": "875cde3af7a0ee14" + }, + "truncated": 0, + "non-truncated": 576, + "padded": 576, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_chemistry|5": { + "hashes": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "94e0196d6aded13d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, 
+ "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_computer_science|5": { + "hashes": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "6e4d0f4a8d36690b", + "hash_cont_tokens": "ffc0fe414cdc4a83" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_mathematics|5": { + "hashes": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "614054d17109a25d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_medicine|5": { + "hashes": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "1d633b3cc0524ba8", + "hash_cont_tokens": "1f88b00d41957d82" + }, + "truncated": 0, + "non-truncated": 692, + "padded": 692, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_physics|5": { + "hashes": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "5421d9a1af86cbd4", + "hash_cont_tokens": "f7b8097afc16a47c" + }, + "truncated": 0, + "non-truncated": 408, + "padded": 408, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-computer_security|5": { + "hashes": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "5e6b70ecb333cf18", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hashes": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "c2ef11a87264ceed", + "hash_cont_tokens": "aa0e8bc655f2f641" + }, + "truncated": 0, + "non-truncated": 940, + "padded": 940, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-econometrics|5": { + "hashes": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "ecaccd912a4c3978", + "hash_cont_tokens": "bfb7e3c3c88313f1" + }, + "truncated": 0, + "non-truncated": 456, + "padded": 456, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hashes": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "1590c84291399be8", + "hash_cont_tokens": "2425a3f084a591ef" + }, + "truncated": 0, + "non-truncated": 580, + "padded": 580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hashes": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "3269597f715b0da1", + "hash_cont_tokens": "f52691aef15a407b" + }, + "truncated": 0, + "non-truncated": 1512, + "padded": 1512, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-formal_logic|5": { + "hashes": { + 
"hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "a2800d20f3ab8d7c", + "hash_cont_tokens": "f515d598d9c21263" + }, + "truncated": 0, + "non-truncated": 504, + "padded": 504, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-global_facts|5": { + "hashes": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "94ed44b3772505ad", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_biology|5": { + "hashes": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "24423acb928db768", + "hash_cont_tokens": "bd85a4156a3613ee" + }, + "truncated": 0, + "non-truncated": 1240, + "padded": 1240, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hashes": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "831ff35c474e5cef", + "hash_cont_tokens": "a95c97af1c14e068" + }, + "truncated": 0, + "non-truncated": 812, + "padded": 812, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hashes": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "8c34e0f2bda77358", + "hash_cont_tokens": "8abfedef914e33c9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hashes": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "f1f73dd687da18d7", + "hash_cont_tokens": "674fc454bdc5ac93" + }, + "truncated": 660, + "non-truncated": 0, + "padded": 0, + "non-padded": 660, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_geography|5": { + "hashes": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "7c5547c7da5bc793", + "hash_cont_tokens": "03a5012b916274ea" + }, + "truncated": 0, + "non-truncated": 792, + "padded": 792, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hashes": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "f62991cb6a496b05", + "hash_cont_tokens": "a83effb8f76b7d7c" + }, + "truncated": 0, + "non-truncated": 772, + "padded": 772, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hashes": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "4cef2aff6e3d59ed", + "hash_cont_tokens": "c583432ad27fcfe0" + }, + "truncated": 0, + "non-truncated": 1560, + "padded": 1560, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hashes": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + 
"hash_input_tokens": "6e2577ea4082ed2b", + "hash_cont_tokens": "24f5dc613660300b" + }, + "truncated": 0, + "non-truncated": 1080, + "padded": 1080, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hashes": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "c5fc9aeb1079c8e4", + "hash_cont_tokens": "f47f041de50333b9" + }, + "truncated": 0, + "non-truncated": 952, + "padded": 952, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_physics|5": { + "hashes": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "555fc385cffa84ca", + "hash_cont_tokens": "ba2efcd283e938cc" + }, + "truncated": 0, + "non-truncated": 604, + "padded": 604, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hashes": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "febd23cbf9973b7f", + "hash_cont_tokens": "942069cd363844d9" + }, + "truncated": 0, + "non-truncated": 2180, + "padded": 2180, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hashes": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "424b02981230ee83", + "hash_cont_tokens": "955ed42b6f7fa019" + }, + "truncated": 0, + "non-truncated": 864, + "padded": 864, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hashes": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "50c9ff438c85a69e", + "hash_cont_tokens": "cdd0b3dc06d933e5" + }, + "truncated": 816, + "non-truncated": 0, + "padded": 0, + "non-padded": 816, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hashes": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "054824cc474caef5", + "hash_cont_tokens": "9a864184946033ac" + }, + "truncated": 8, + "non-truncated": 940, + "padded": 940, + "non-padded": 8, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_aging|5": { + "hashes": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "541a75f071dcf579", + "hash_cont_tokens": "142a4a8a1138a214" + }, + "truncated": 0, + "non-truncated": 892, + "padded": 892, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_sexuality|5": { + "hashes": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "04269e5c5a257dd9", + "hash_cont_tokens": "bc54813e809b796d" + }, + "truncated": 0, + "non-truncated": 524, + "padded": 524, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-international_law|5": { + "hashes": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "d93ba9d9d38e4397", + "hash_cont_tokens": "dc45b45fcda18e5d" + }, + "truncated": 0, + 
"non-truncated": 484, + "padded": 484, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-jurisprudence|5": { + "hashes": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "9eeaccd2698b4f5a", + "hash_cont_tokens": "e3a8cd951b6e3469" + }, + "truncated": 0, + "non-truncated": 432, + "padded": 432, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hashes": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "b4f08f544f2b7576", + "hash_cont_tokens": "1e80dbd30f6453d5" + }, + "truncated": 0, + "non-truncated": 652, + "padded": 648, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-machine_learning|5": { + "hashes": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "900c2a51f1174b9f", + "hash_cont_tokens": "9b37da7777378ca9" + }, + "truncated": 0, + "non-truncated": 448, + "padded": 448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-management|5": { + "hashes": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "6b36efb4689c6eca", + "hash_cont_tokens": "a01d6d39a83c4597" + }, + "truncated": 0, + "non-truncated": 412, + "padded": 412, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-marketing|5": { + "hashes": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "2aaac78a0cfed47a", + "hash_cont_tokens": "6aeaed4d823c98aa" + }, + "truncated": 0, + "non-truncated": 936, + "padded": 936, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-medical_genetics|5": { + "hashes": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "886ca823b41c094a", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-miscellaneous|5": { + "hashes": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "72fd71de7675e7d0", + "hash_cont_tokens": "9b0ab02a64603081" + }, + "truncated": 0, + "non-truncated": 3132, + "padded": 3132, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_disputes|5": { + "hashes": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "f3ca0dd8e7a1eb09", + "hash_cont_tokens": "8badf768f7b0467a" + }, + "truncated": 0, + "non-truncated": 1384, + "padded": 1354, + "non-padded": 30, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hashes": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "3e793631e951f23c", + "hash_cont_tokens": "32ae620376b2bbba" + }, + "truncated": 0, + "non-truncated": 3580, + "padded": 3580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-nutrition|5": { + 
"hashes": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "59753c2144ea93af", + "hash_cont_tokens": "3071def75bacc404" + }, + "truncated": 0, + "non-truncated": 1224, + "padded": 1224, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-philosophy|5": { + "hashes": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "bd8d3dbed15a8c34", + "hash_cont_tokens": "9f6ff69d23a48783" + }, + "truncated": 0, + "non-truncated": 1244, + "padded": 1244, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-prehistory|5": { + "hashes": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "3573cd87facbb7c5", + "hash_cont_tokens": "de469d2b981e32a3" + }, + "truncated": 0, + "non-truncated": 1296, + "padded": 1296, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_accounting|5": { + "hashes": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "17e721bc1a7cbb47", + "hash_cont_tokens": "c46f74d2dfc7b13b" + }, + "truncated": 0, + "non-truncated": 1128, + "padded": 1128, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_law|5": { + "hashes": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "9178e10bd0763ec4", + "hash_cont_tokens": "2e590029ef41fbcd" + }, + "truncated": 604, + "non-truncated": 5532, + "padded": 5524, + "non-padded": 612, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_medicine|5": { + "hashes": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "f5a22012a54f70ea", + "hash_cont_tokens": "fe35cfa9c6ca802e" + }, + "truncated": 0, + "non-truncated": 1088, + "padded": 1088, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_psychology|5": { + "hashes": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "0dfb73a8eb3f692c", + "hash_cont_tokens": "f020fbddf72c8652" + }, + "truncated": 0, + "non-truncated": 2448, + "padded": 2448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-public_relations|5": { + "hashes": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "1710c6ba4c9f3cbd", + "hash_cont_tokens": "568f585a259965c1" + }, + "truncated": 0, + "non-truncated": 440, + "padded": 440, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-security_studies|5": { + "hashes": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "d49711415961ced7", + "hash_cont_tokens": "cc6fd7cccd64cd5d" + }, + "truncated": 0, + "non-truncated": 980, + "padded": 980, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-sociology|5": { + "hashes": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "828999f7624cbe7e", + 
"hash_cont_tokens": "c3a3bdfd177eed5b" + }, + "truncated": 0, + "non-truncated": 804, + "padded": 804, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hashes": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "42054621e718dbee", + "hash_cont_tokens": "2568d0e8e36fa959" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-virology|5": { + "hashes": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "6c4f0aa4dc859c04", + "hash_cont_tokens": "926cf60b0891f374" + }, + "truncated": 0, + "non-truncated": 664, + "padded": 664, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-world_religions|5": { + "hashes": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "6c75d44e092ff24f", + "hash_cont_tokens": "c525a5de974c1ea3" + }, + "truncated": 0, + "non-truncated": 684, + "padded": 684, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|truthfulqa:mc|0": { + "hashes": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "2738d7ed7075faa7", + "hash_cont_tokens": "c014154380b74b9e" + }, + "truncated": 0, + "non-truncated": 9996, + "padded": 9996, + "non-padded": 0, + "effective_few_shots": 0.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "d84d18e9a963753d", + "hash_full_prompts": "12b540783521a8e6", + "hash_input_tokens": "6fecf578c508db6a", + "hash_cont_tokens": "fb1646e2bdd5fc38" + }, + "total_evaluation_time_secondes": "3836.3561160564423", + "truncated": 2088, + "non-truncated": 108931, + "padded": 108834, + "non-padded": 2185, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/digitous/13B-Chimera/results_2023-10-21T22-03-30.588181.json b/eval-results/digitous/13B-Chimera/results_2023-10-21T22-03-30.588181.json new file mode 100644 index 0000000000000000000000000000000000000000..1b9fb5a867da1ffd82bcc58c83914c33c047cad9 --- /dev/null +++ b/eval-results/digitous/13B-Chimera/results_2023-10-21T22-03-30.588181.json @@ -0,0 +1,107 @@ +{ + "config_general": { + "model_name": "digitous/13B-Chimera", + "model_sha": "85cfe8e6db2bee804873cfdb48955696cc5b0689", + "model_size": "24.28 GB", + "model_dtype": "torch.float16", + "lighteval_sha": "0f318ecf002208468154899217b3ba7c6ae09374", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|drop|3": { + "em": 0.2860738255033557, + "em_stderr": 0.004628128039725735, + "f1": 0.35844274328859277, + "f1_stderr": 0.004563129120809242 + }, + "harness|gsm8k|5": { + "acc": 0.1068991660348749, + "acc_stderr": 0.008510982565520481 + }, + "harness|winogrande|5": { + "acc": 0.7726913970007893, + "acc_stderr": 0.011778612167091088 + }, + "all": { + "em": 0.2860738255033557, + "em_stderr": 0.004628128039725735, + "f1": 0.35844274328859277, + "f1_stderr": 0.004563129120809242, + "acc": 0.4397952815178321, + "acc_stderr": 0.010144797366305785 + } + }, + "versions": { + "harness|drop|3": 1, + "harness|gsm8k|5": 0, + "harness|winogrande|5": 0, + "all": 0 + }, + 
"config_tasks": { + "harness|drop": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|drop|3": { + "hashes": { + "hash_examples": "1d27416e8324e9a3", + "hash_full_prompts": "a5513ff9a741b385", + "hash_input_tokens": "61b608e0b5ceed76", + "hash_cont_tokens": "13ce471b4f6a8d46" + }, + "truncated": 1263, + "non-truncated": 8273, + "padded": 0, + "non-padded": 9536, + "effective_few_shots": 3.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "bda342e47b5099b2", + "hash_cont_tokens": "73c0cc60dedf0dfb" + }, + "truncated": 0, + "non-truncated": 1319, + "padded": 0, + "non-padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "c0bedf98cb040854", + "hash_cont_tokens": "f08975ad6f2d5864" + }, + "truncated": 0, + "non-truncated": 2534, + "padded": 2432, + "non-padded": 102, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "9b4d8993161e637d", + "hash_full_prompts": "08215e527b7e60a5", + "hash_input_tokens": "80afe720f936f8d2", + "hash_cont_tokens": "55343e0d5bdc7e89" + }, + "total_evaluation_time_secondes": "9635.125216007233", + "truncated": 1263, + "non-truncated": 12126, + "padded": 2432, + "non-padded": 10957, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/digitous/13B-HyperMantis/results_2023-07-19T19-30-10.108453.json b/eval-results/digitous/13B-HyperMantis/results_2023-07-19T19-30-10.108453.json new file mode 100644 index 0000000000000000000000000000000000000000..68bdba0c1e4bcc55fa70e0011834a796aff728b2 --- /dev/null +++ b/eval-results/digitous/13B-HyperMantis/results_2023-07-19T19-30-10.108453.json @@ -0,0 +1,871 @@ +{ + "results": { + "harness|arc:challenge|25": { + "acc": 0.5554607508532423, + "acc_stderr": 0.014521226405627077, + "acc_norm": 0.5853242320819113, + "acc_norm_stderr": 0.014397070564409172 + }, + "harness|hellaswag|10": { + "acc": 0.6267675761800439, + "acc_stderr": 0.0048267461608301815, + "acc_norm": 0.8220474009161521, + "acc_norm_stderr": 0.0038169117116791705 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.34, + "acc_stderr": 0.04760952285695235, + "acc_norm": 0.34, + "acc_norm_stderr": 0.04760952285695235 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.4888888888888889, + "acc_stderr": 0.04318275491977976, + "acc_norm": 0.4888888888888889, + "acc_norm_stderr": 0.04318275491977976 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.4934210526315789, + "acc_stderr": 0.040685900502249704, + "acc_norm": 0.4934210526315789, + "acc_norm_stderr": 0.040685900502249704 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.47, + "acc_stderr": 0.050161355804659205, + "acc_norm": 0.47, + "acc_norm_stderr": 0.050161355804659205 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.5056603773584906, + "acc_stderr": 0.03077090076385131, + "acc_norm": 0.5056603773584906, + "acc_norm_stderr": 0.03077090076385131 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.5277777777777778, + "acc_stderr": 0.04174752578923185, + "acc_norm": 0.5277777777777778, + "acc_norm_stderr": 0.04174752578923185 + }, + 
"harness|hendrycksTest-college_chemistry|5": { + "acc": 0.34, + "acc_stderr": 0.04760952285695235, + "acc_norm": 0.34, + "acc_norm_stderr": 0.04760952285695235 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.41, + "acc_stderr": 0.049431107042371025, + "acc_norm": 0.41, + "acc_norm_stderr": 0.049431107042371025 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.33, + "acc_stderr": 0.047258156262526045, + "acc_norm": 0.33, + "acc_norm_stderr": 0.047258156262526045 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.4161849710982659, + "acc_stderr": 0.03758517775404948, + "acc_norm": 0.4161849710982659, + "acc_norm_stderr": 0.03758517775404948 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.19607843137254902, + "acc_stderr": 0.039505818611799616, + "acc_norm": 0.19607843137254902, + "acc_norm_stderr": 0.039505818611799616 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.62, + "acc_stderr": 0.04878317312145632, + "acc_norm": 0.62, + "acc_norm_stderr": 0.04878317312145632 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.41702127659574467, + "acc_stderr": 0.032232762667117124, + "acc_norm": 0.41702127659574467, + "acc_norm_stderr": 0.032232762667117124 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.2807017543859649, + "acc_stderr": 0.042270544512322, + "acc_norm": 0.2807017543859649, + "acc_norm_stderr": 0.042270544512322 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.43448275862068964, + "acc_stderr": 0.041307408795554966, + "acc_norm": 0.43448275862068964, + "acc_norm_stderr": 0.041307408795554966 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.2619047619047619, + "acc_stderr": 0.02264421261552521, + "acc_norm": 0.2619047619047619, + "acc_norm_stderr": 0.02264421261552521 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.3968253968253968, + "acc_stderr": 0.043758884927270605, + "acc_norm": 0.3968253968253968, + "acc_norm_stderr": 0.043758884927270605 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.37, + "acc_stderr": 0.04852365870939099, + "acc_norm": 0.37, + "acc_norm_stderr": 0.04852365870939099 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.5548387096774193, + "acc_stderr": 0.028272410186214906, + "acc_norm": 0.5548387096774193, + "acc_norm_stderr": 0.028272410186214906 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.3497536945812808, + "acc_stderr": 0.03355400904969565, + "acc_norm": 0.3497536945812808, + "acc_norm_stderr": 0.03355400904969565 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.53, + "acc_stderr": 0.05016135580465919, + "acc_norm": 0.53, + "acc_norm_stderr": 0.05016135580465919 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.6424242424242425, + "acc_stderr": 0.037425970438065864, + "acc_norm": 0.6424242424242425, + "acc_norm_stderr": 0.037425970438065864 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.6464646464646465, + "acc_stderr": 0.03406086723547155, + "acc_norm": 0.6464646464646465, + "acc_norm_stderr": 0.03406086723547155 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.6683937823834197, + "acc_stderr": 0.03397636541089118, + "acc_norm": 0.6683937823834197, + "acc_norm_stderr": 0.03397636541089118 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.47692307692307695, + "acc_stderr": 0.025323990861736118, + "acc_norm": 
0.47692307692307695, + "acc_norm_stderr": 0.025323990861736118 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.23703703703703705, + "acc_stderr": 0.025928876132766097, + "acc_norm": 0.23703703703703705, + "acc_norm_stderr": 0.025928876132766097 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.47058823529411764, + "acc_stderr": 0.03242225027115006, + "acc_norm": 0.47058823529411764, + "acc_norm_stderr": 0.03242225027115006 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.2913907284768212, + "acc_stderr": 0.03710185726119995, + "acc_norm": 0.2913907284768212, + "acc_norm_stderr": 0.03710185726119995 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.691743119266055, + "acc_stderr": 0.019798366698367233, + "acc_norm": 0.691743119266055, + "acc_norm_stderr": 0.019798366698367233 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.3425925925925926, + "acc_stderr": 0.03236585252602159, + "acc_norm": 0.3425925925925926, + "acc_norm_stderr": 0.03236585252602159 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.6862745098039216, + "acc_stderr": 0.032566854844603886, + "acc_norm": 0.6862745098039216, + "acc_norm_stderr": 0.032566854844603886 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.7130801687763713, + "acc_stderr": 0.029443773022594693, + "acc_norm": 0.7130801687763713, + "acc_norm_stderr": 0.029443773022594693 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.5874439461883408, + "acc_stderr": 0.03304062175449297, + "acc_norm": 0.5874439461883408, + "acc_norm_stderr": 0.03304062175449297 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.6106870229007634, + "acc_stderr": 0.04276486542814591, + "acc_norm": 0.6106870229007634, + "acc_norm_stderr": 0.04276486542814591 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.6859504132231405, + "acc_stderr": 0.04236964753041019, + "acc_norm": 0.6859504132231405, + "acc_norm_stderr": 0.04236964753041019 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.5648148148148148, + "acc_stderr": 0.04792898170907061, + "acc_norm": 0.5648148148148148, + "acc_norm_stderr": 0.04792898170907061 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.558282208588957, + "acc_stderr": 0.03901591825836184, + "acc_norm": 0.558282208588957, + "acc_norm_stderr": 0.03901591825836184 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.39285714285714285, + "acc_stderr": 0.04635550135609976, + "acc_norm": 0.39285714285714285, + "acc_norm_stderr": 0.04635550135609976 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.7184466019417476, + "acc_stderr": 0.044532548363264673, + "acc_norm": 0.7184466019417476, + "acc_norm_stderr": 0.044532548363264673 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.7692307692307693, + "acc_stderr": 0.027601921381417597, + "acc_norm": 0.7692307692307693, + "acc_norm_stderr": 0.027601921381417597 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.54, + "acc_stderr": 0.05009082659620332, + "acc_norm": 0.54, + "acc_norm_stderr": 0.05009082659620332 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.7088122605363985, + "acc_stderr": 0.0162460870697014, + "acc_norm": 0.7088122605363985, + "acc_norm_stderr": 0.0162460870697014 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.5520231213872833, + "acc_stderr": 0.02677299065336182, + "acc_norm": 0.5520231213872833, + "acc_norm_stderr": 
0.02677299065336182 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.25139664804469275, + "acc_stderr": 0.014508979453553974, + "acc_norm": 0.25139664804469275, + "acc_norm_stderr": 0.014508979453553974 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.5555555555555556, + "acc_stderr": 0.02845263998508801, + "acc_norm": 0.5555555555555556, + "acc_norm_stderr": 0.02845263998508801 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.5691318327974276, + "acc_stderr": 0.028125340983972704, + "acc_norm": 0.5691318327974276, + "acc_norm_stderr": 0.028125340983972704 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.558641975308642, + "acc_stderr": 0.027628737155668773, + "acc_norm": 0.558641975308642, + "acc_norm_stderr": 0.027628737155668773 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.38652482269503546, + "acc_stderr": 0.02904919034254346, + "acc_norm": 0.38652482269503546, + "acc_norm_stderr": 0.02904919034254346 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.4002607561929596, + "acc_stderr": 0.012513582529136211, + "acc_norm": 0.4002607561929596, + "acc_norm_stderr": 0.012513582529136211 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.5073529411764706, + "acc_stderr": 0.030369552523902173, + "acc_norm": 0.5073529411764706, + "acc_norm_stderr": 0.030369552523902173 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.5228758169934641, + "acc_stderr": 0.020206653187884786, + "acc_norm": 0.5228758169934641, + "acc_norm_stderr": 0.020206653187884786 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.5909090909090909, + "acc_stderr": 0.04709306978661896, + "acc_norm": 0.5909090909090909, + "acc_norm_stderr": 0.04709306978661896 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.5469387755102041, + "acc_stderr": 0.03186785930004128, + "acc_norm": 0.5469387755102041, + "acc_norm_stderr": 0.03186785930004128 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.681592039800995, + "acc_stderr": 0.032941184790540944, + "acc_norm": 0.681592039800995, + "acc_norm_stderr": 0.032941184790540944 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.8, + "acc_stderr": 0.04020151261036846, + "acc_norm": 0.8, + "acc_norm_stderr": 0.04020151261036846 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.45180722891566266, + "acc_stderr": 0.03874371556587953, + "acc_norm": 0.45180722891566266, + "acc_norm_stderr": 0.03874371556587953 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.7368421052631579, + "acc_stderr": 0.03377310252209205, + "acc_norm": 0.7368421052631579, + "acc_norm_stderr": 0.03377310252209205 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.3243574051407589, + "mc1_stderr": 0.016387976779647935, + "mc2": 0.4749758518603797, + "mc2_stderr": 0.014913887028736786 + }, + "all": { + "acc": 0.5090009865936799, + "acc_stderr": 0.03489854562136909, + "acc_norm": 0.512816974830371, + "acc_norm_stderr": 0.034879325446956065, + "mc1": 0.3243574051407589, + "mc1_stderr": 0.016387976779647935, + "mc2": 0.4749758518603797, + "mc2_stderr": 0.014913887028736786 + } + }, + "versions": { + "harness|arc:challenge|25": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + 
"harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "all": 0 + }, + "config": { + "model_name": "digitous/13B-HyperMantis", + "model_sha": "aa828ef92c363a5577ffd7d29e678277b9d2eb3c", + "model_dtype": "torch.float16", + "lighteval_sha": "43cff840721bd0214adb4e29236a5e2ca1813937", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null + }, + "task_config": { + "harness|arc:challenge": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness 
task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task" + }, + "hashes": { + "harness|arc:challenge|25": { + "hash_examples": "fb8c51b1872daeda", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "61571bf68d6d89aa", + "hash_cont_tokens": "8210decc6ff6f7df" + }, + "harness|hellaswag|10": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + 
"hash_input_tokens": "29906669b1c7054a", + "hash_cont_tokens": "b3b9e9017afa63af" + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "c54ff61ad0273dd7", + "hash_cont_tokens": "50421e30bef398f9" + }, + "harness|hendrycksTest-anatomy|5": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "be31a1e22aef5f90", + "hash_cont_tokens": "f11971a765cb609f" + }, + "harness|hendrycksTest-astronomy|5": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "277a7b1fad566940", + "hash_cont_tokens": "bf30e5d3f48250cb" + }, + "harness|hendrycksTest-business_ethics|5": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "ba552605bc116de5", + "hash_cont_tokens": "bc1dd9b2d995eb61" + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "428c7563d0b98ab9", + "hash_cont_tokens": "890a119624b3b935" + }, + "harness|hendrycksTest-college_biology|5": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "da036601573942e2", + "hash_cont_tokens": "875cde3af7a0ee14" + }, + "harness|hendrycksTest-college_chemistry|5": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "94e0196d6aded13d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "harness|hendrycksTest-college_computer_science|5": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "6e4d0f4a8d36690b", + "hash_cont_tokens": "ffc0fe414cdc4a83" + }, + "harness|hendrycksTest-college_mathematics|5": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "614054d17109a25d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "harness|hendrycksTest-college_medicine|5": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "1d633b3cc0524ba8", + "hash_cont_tokens": "1f88b00d41957d82" + }, + "harness|hendrycksTest-college_physics|5": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "5421d9a1af86cbd4", + "hash_cont_tokens": "f7b8097afc16a47c" + }, + "harness|hendrycksTest-computer_security|5": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "5e6b70ecb333cf18", + "hash_cont_tokens": "50421e30bef398f9" + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "c2ef11a87264ceed", + "hash_cont_tokens": "aa0e8bc655f2f641" + }, + "harness|hendrycksTest-econometrics|5": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "ecaccd912a4c3978", + "hash_cont_tokens": "bfb7e3c3c88313f1" + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "1590c84291399be8", + "hash_cont_tokens": "2425a3f084a591ef" + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "3269597f715b0da1", + 
"hash_cont_tokens": "f52691aef15a407b" + }, + "harness|hendrycksTest-formal_logic|5": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "a2800d20f3ab8d7c", + "hash_cont_tokens": "f515d598d9c21263" + }, + "harness|hendrycksTest-global_facts|5": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "94ed44b3772505ad", + "hash_cont_tokens": "50421e30bef398f9" + }, + "harness|hendrycksTest-high_school_biology|5": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "24423acb928db768", + "hash_cont_tokens": "bd85a4156a3613ee" + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "831ff35c474e5cef", + "hash_cont_tokens": "a95c97af1c14e068" + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "8c34e0f2bda77358", + "hash_cont_tokens": "8abfedef914e33c9" + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "f1f73dd687da18d7", + "hash_cont_tokens": "674fc454bdc5ac93" + }, + "harness|hendrycksTest-high_school_geography|5": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "7c5547c7da5bc793", + "hash_cont_tokens": "03a5012b916274ea" + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "f62991cb6a496b05", + "hash_cont_tokens": "a83effb8f76b7d7c" + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "4cef2aff6e3d59ed", + "hash_cont_tokens": "c583432ad27fcfe0" + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "6e2577ea4082ed2b", + "hash_cont_tokens": "24f5dc613660300b" + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "c5fc9aeb1079c8e4", + "hash_cont_tokens": "f47f041de50333b9" + }, + "harness|hendrycksTest-high_school_physics|5": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "555fc385cffa84ca", + "hash_cont_tokens": "ba2efcd283e938cc" + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "febd23cbf9973b7f", + "hash_cont_tokens": "942069cd363844d9" + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "424b02981230ee83", + "hash_cont_tokens": "955ed42b6f7fa019" + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "50c9ff438c85a69e", + "hash_cont_tokens": "cdd0b3dc06d933e5" + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": 
"11845057459afd72", + "hash_input_tokens": "054824cc474caef5", + "hash_cont_tokens": "9a864184946033ac" + }, + "harness|hendrycksTest-human_aging|5": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "541a75f071dcf579", + "hash_cont_tokens": "142a4a8a1138a214" + }, + "harness|hendrycksTest-human_sexuality|5": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "04269e5c5a257dd9", + "hash_cont_tokens": "bc54813e809b796d" + }, + "harness|hendrycksTest-international_law|5": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "d93ba9d9d38e4397", + "hash_cont_tokens": "dc45b45fcda18e5d" + }, + "harness|hendrycksTest-jurisprudence|5": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "9eeaccd2698b4f5a", + "hash_cont_tokens": "e3a8cd951b6e3469" + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "b4f08f544f2b7576", + "hash_cont_tokens": "1e80dbd30f6453d5" + }, + "harness|hendrycksTest-machine_learning|5": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "900c2a51f1174b9f", + "hash_cont_tokens": "9b37da7777378ca9" + }, + "harness|hendrycksTest-management|5": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "6b36efb4689c6eca", + "hash_cont_tokens": "a01d6d39a83c4597" + }, + "harness|hendrycksTest-marketing|5": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "2aaac78a0cfed47a", + "hash_cont_tokens": "6aeaed4d823c98aa" + }, + "harness|hendrycksTest-medical_genetics|5": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "886ca823b41c094a", + "hash_cont_tokens": "50421e30bef398f9" + }, + "harness|hendrycksTest-miscellaneous|5": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "72fd71de7675e7d0", + "hash_cont_tokens": "9b0ab02a64603081" + }, + "harness|hendrycksTest-moral_disputes|5": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "f3ca0dd8e7a1eb09", + "hash_cont_tokens": "8badf768f7b0467a" + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "3e793631e951f23c", + "hash_cont_tokens": "32ae620376b2bbba" + }, + "harness|hendrycksTest-nutrition|5": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "59753c2144ea93af", + "hash_cont_tokens": "3071def75bacc404" + }, + "harness|hendrycksTest-philosophy|5": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "bd8d3dbed15a8c34", + "hash_cont_tokens": "9f6ff69d23a48783" + }, + "harness|hendrycksTest-prehistory|5": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "3573cd87facbb7c5", + "hash_cont_tokens": "de469d2b981e32a3" + }, + "harness|hendrycksTest-professional_accounting|5": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "17e721bc1a7cbb47", + "hash_cont_tokens": 
"c46f74d2dfc7b13b" + }, + "harness|hendrycksTest-professional_law|5": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "9178e10bd0763ec4", + "hash_cont_tokens": "2e590029ef41fbcd" + }, + "harness|hendrycksTest-professional_medicine|5": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "f5a22012a54f70ea", + "hash_cont_tokens": "fe35cfa9c6ca802e" + }, + "harness|hendrycksTest-professional_psychology|5": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "0dfb73a8eb3f692c", + "hash_cont_tokens": "f020fbddf72c8652" + }, + "harness|hendrycksTest-public_relations|5": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "1710c6ba4c9f3cbd", + "hash_cont_tokens": "568f585a259965c1" + }, + "harness|hendrycksTest-security_studies|5": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "d49711415961ced7", + "hash_cont_tokens": "cc6fd7cccd64cd5d" + }, + "harness|hendrycksTest-sociology|5": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "828999f7624cbe7e", + "hash_cont_tokens": "c3a3bdfd177eed5b" + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "42054621e718dbee", + "hash_cont_tokens": "2568d0e8e36fa959" + }, + "harness|hendrycksTest-virology|5": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "6c4f0aa4dc859c04", + "hash_cont_tokens": "926cf60b0891f374" + }, + "harness|hendrycksTest-world_religions|5": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "6c75d44e092ff24f", + "hash_cont_tokens": "c525a5de974c1ea3" + }, + "harness|truthfulqa:mc|0": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "2738d7ed7075faa7", + "hash_cont_tokens": "c014154380b74b9e" + } + } +} \ No newline at end of file diff --git a/eval-results/digitous/13B-HyperMantis/results_2023-10-16T02-31-52.338573.json b/eval-results/digitous/13B-HyperMantis/results_2023-10-16T02-31-52.338573.json new file mode 100644 index 0000000000000000000000000000000000000000..4a63fa7743e1ed2e087d747ac3ce1de744d921cd --- /dev/null +++ b/eval-results/digitous/13B-HyperMantis/results_2023-10-16T02-31-52.338573.json @@ -0,0 +1,107 @@ +{ + "config_general": { + "model_name": "digitous/13B-HyperMantis", + "model_sha": "aa828ef92c363a5577ffd7d29e678277b9d2eb3c", + "model_size": "24.28 GB", + "model_dtype": "torch.float16", + "lighteval_sha": "0f318ecf002208468154899217b3ba7c6ae09374", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|drop|3": { + "em": 0.12804110738255034, + "em_stderr": 0.0034218610287585043, + "f1": 0.195454068791946, + "f1_stderr": 0.0035590395888605362 + }, + "harness|gsm8k|5": { + "acc": 0.10386656557998483, + "acc_stderr": 0.008403622228924032 + }, + "harness|winogrande|5": { + "acc": 0.7624309392265194, + "acc_stderr": 0.01196129890580314 + }, + "all": { + "em": 0.12804110738255034, + "em_stderr": 0.0034218610287585043, + "f1": 0.195454068791946, + "f1_stderr": 0.0035590395888605362, + "acc": 0.4331487524032521, + 
"acc_stderr": 0.010182460567363586 + } + }, + "versions": { + "harness|drop|3": 1, + "harness|gsm8k|5": 0, + "harness|winogrande|5": 0, + "all": 0 + }, + "config_tasks": { + "harness|drop": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|drop|3": { + "hashes": { + "hash_examples": "1d27416e8324e9a3", + "hash_full_prompts": "a5513ff9a741b385", + "hash_input_tokens": "61b608e0b5ceed76", + "hash_cont_tokens": "9b470307deac873f" + }, + "truncated": 1263, + "non-truncated": 8273, + "padded": 0, + "non-padded": 9536, + "effective_few_shots": 3.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "bda342e47b5099b2", + "hash_cont_tokens": "944ef55a41d1295d" + }, + "truncated": 0, + "non-truncated": 1319, + "padded": 0, + "non-padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "c0bedf98cb040854", + "hash_cont_tokens": "f08975ad6f2d5864" + }, + "truncated": 0, + "non-truncated": 2534, + "padded": 2432, + "non-padded": 102, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "9b4d8993161e637d", + "hash_full_prompts": "08215e527b7e60a5", + "hash_input_tokens": "80afe720f936f8d2", + "hash_cont_tokens": "c2b9da8dcc05e77a" + }, + "total_evaluation_time_secondes": "11393.866104841232", + "truncated": 1263, + "non-truncated": 12126, + "padded": 2432, + "non-padded": 10957, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/digitous/Adventien-GPTJ/results_2023-07-19T20-04-02.923110.json b/eval-results/digitous/Adventien-GPTJ/results_2023-07-19T20-04-02.923110.json new file mode 100644 index 0000000000000000000000000000000000000000..76264df30e8a433d6d4a52787ad76d2f961fdf12 --- /dev/null +++ b/eval-results/digitous/Adventien-GPTJ/results_2023-07-19T20-04-02.923110.json @@ -0,0 +1,871 @@ +{ + "results": { + "harness|arc:challenge|25": { + "acc": 0.38822525597269625, + "acc_stderr": 0.014241614207414037, + "acc_norm": 0.4249146757679181, + "acc_norm_stderr": 0.014445698968520777 + }, + "harness|hellaswag|10": { + "acc": 0.5256920932085242, + "acc_stderr": 0.00498318971120851, + "acc_norm": 0.6920932085241984, + "acc_norm_stderr": 0.0046068433445174885 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.25, + "acc_stderr": 0.04351941398892446, + "acc_norm": 0.25, + "acc_norm_stderr": 0.04351941398892446 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.23703703703703705, + "acc_stderr": 0.03673731683969506, + "acc_norm": 0.23703703703703705, + "acc_norm_stderr": 0.03673731683969506 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.24342105263157895, + "acc_stderr": 0.034923496688842384, + "acc_norm": 0.24342105263157895, + "acc_norm_stderr": 0.034923496688842384 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.26, + "acc_stderr": 0.044084400227680794, + "acc_norm": 0.26, + "acc_norm_stderr": 0.044084400227680794 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.2641509433962264, + "acc_stderr": 0.02713429162874173, + "acc_norm": 0.2641509433962264, + "acc_norm_stderr": 0.02713429162874173 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 
0.2708333333333333, + "acc_stderr": 0.03716177437566018, + "acc_norm": 0.2708333333333333, + "acc_norm_stderr": 0.03716177437566018 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.24, + "acc_stderr": 0.04292346959909282, + "acc_norm": 0.24, + "acc_norm_stderr": 0.04292346959909282 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.18, + "acc_stderr": 0.03861229196653695, + "acc_norm": 0.18, + "acc_norm_stderr": 0.03861229196653695 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.28, + "acc_stderr": 0.04512608598542126, + "acc_norm": 0.28, + "acc_norm_stderr": 0.04512608598542126 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.19653179190751446, + "acc_stderr": 0.03029957466478815, + "acc_norm": 0.19653179190751446, + "acc_norm_stderr": 0.03029957466478815 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.20588235294117646, + "acc_stderr": 0.04023382273617749, + "acc_norm": 0.20588235294117646, + "acc_norm_stderr": 0.04023382273617749 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.28, + "acc_stderr": 0.04512608598542128, + "acc_norm": 0.28, + "acc_norm_stderr": 0.04512608598542128 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.3148936170212766, + "acc_stderr": 0.030363582197238167, + "acc_norm": 0.3148936170212766, + "acc_norm_stderr": 0.030363582197238167 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.2982456140350877, + "acc_stderr": 0.04303684033537315, + "acc_norm": 0.2982456140350877, + "acc_norm_stderr": 0.04303684033537315 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.2206896551724138, + "acc_stderr": 0.03455930201924811, + "acc_norm": 0.2206896551724138, + "acc_norm_stderr": 0.03455930201924811 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.23809523809523808, + "acc_stderr": 0.02193587808118476, + "acc_norm": 0.23809523809523808, + "acc_norm_stderr": 0.02193587808118476 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.21428571428571427, + "acc_stderr": 0.03670066451047181, + "acc_norm": 0.21428571428571427, + "acc_norm_stderr": 0.03670066451047181 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.29, + "acc_stderr": 0.045604802157206845, + "acc_norm": 0.29, + "acc_norm_stderr": 0.045604802157206845 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.22903225806451613, + "acc_stderr": 0.023904914311782648, + "acc_norm": 0.22903225806451613, + "acc_norm_stderr": 0.023904914311782648 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.26108374384236455, + "acc_stderr": 0.030903796952114485, + "acc_norm": 0.26108374384236455, + "acc_norm_stderr": 0.030903796952114485 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.19, + "acc_stderr": 0.03942772444036623, + "acc_norm": 0.19, + "acc_norm_stderr": 0.03942772444036623 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.23636363636363636, + "acc_stderr": 0.033175059300091805, + "acc_norm": 0.23636363636363636, + "acc_norm_stderr": 0.033175059300091805 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.21212121212121213, + "acc_stderr": 0.029126522834586818, + "acc_norm": 0.21212121212121213, + "acc_norm_stderr": 0.029126522834586818 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.21761658031088082, + "acc_stderr": 0.02977866303775295, + "acc_norm": 0.21761658031088082, + "acc_norm_stderr": 
0.02977866303775295 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.23846153846153847, + "acc_stderr": 0.021606294494647727, + "acc_norm": 0.23846153846153847, + "acc_norm_stderr": 0.021606294494647727 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.2518518518518518, + "acc_stderr": 0.026466117538959916, + "acc_norm": 0.2518518518518518, + "acc_norm_stderr": 0.026466117538959916 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.23109243697478993, + "acc_stderr": 0.027381406927868966, + "acc_norm": 0.23109243697478993, + "acc_norm_stderr": 0.027381406927868966 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.2251655629139073, + "acc_stderr": 0.03410435282008936, + "acc_norm": 0.2251655629139073, + "acc_norm_stderr": 0.03410435282008936 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.24036697247706423, + "acc_stderr": 0.01832060732096407, + "acc_norm": 0.24036697247706423, + "acc_norm_stderr": 0.01832060732096407 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.14814814814814814, + "acc_stderr": 0.02422762927372836, + "acc_norm": 0.14814814814814814, + "acc_norm_stderr": 0.02422762927372836 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.28431372549019607, + "acc_stderr": 0.031660096793998116, + "acc_norm": 0.28431372549019607, + "acc_norm_stderr": 0.031660096793998116 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.29957805907172996, + "acc_stderr": 0.029818024749753102, + "acc_norm": 0.29957805907172996, + "acc_norm_stderr": 0.029818024749753102 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.38565022421524664, + "acc_stderr": 0.03266842214289201, + "acc_norm": 0.38565022421524664, + "acc_norm_stderr": 0.03266842214289201 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.2595419847328244, + "acc_stderr": 0.03844876139785271, + "acc_norm": 0.2595419847328244, + "acc_norm_stderr": 0.03844876139785271 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.2892561983471074, + "acc_stderr": 0.04139112727635463, + "acc_norm": 0.2892561983471074, + "acc_norm_stderr": 0.04139112727635463 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.3148148148148148, + "acc_stderr": 0.04489931073591312, + "acc_norm": 0.3148148148148148, + "acc_norm_stderr": 0.04489931073591312 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.2392638036809816, + "acc_stderr": 0.033519538795212696, + "acc_norm": 0.2392638036809816, + "acc_norm_stderr": 0.033519538795212696 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.2857142857142857, + "acc_stderr": 0.04287858751340456, + "acc_norm": 0.2857142857142857, + "acc_norm_stderr": 0.04287858751340456 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.1941747572815534, + "acc_stderr": 0.03916667762822584, + "acc_norm": 0.1941747572815534, + "acc_norm_stderr": 0.03916667762822584 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.2905982905982906, + "acc_stderr": 0.029745048572674057, + "acc_norm": 0.2905982905982906, + "acc_norm_stderr": 0.029745048572674057 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.25, + "acc_stderr": 0.04351941398892446, + "acc_norm": 0.25, + "acc_norm_stderr": 0.04351941398892446 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.2707535121328225, + "acc_stderr": 0.01588988836256049, + "acc_norm": 0.2707535121328225, + "acc_norm_stderr": 0.01588988836256049 + }, + 
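
Many of the hendrycksTest accuracies in this report sit near 0.25, the chance level for four-option multiple choice, so most of the differences from guessing are within noise. A hedged sketch that flags tasks whose "acc" lies within two standard errors of chance (the two-sigma threshold is an editorial choice, not part of the harness):

    import json

    with open("eval-results/digitous/Adventien-GPTJ/results_2023-07-19T20-04-02.923110.json") as f:
        results = json.load(f)["results"]

    CHANCE = 0.25  # hendrycksTest (MMLU) tasks are 4-way multiple choice
    for task, m in sorted(results.items()):
        if not task.startswith("harness|hendrycksTest-"):
            continue
        # Flag scores statistically indistinguishable from random guessing (+/- 2 SE).
        if abs(m["acc"] - CHANCE) <= 2 * m["acc_stderr"]:
            print(task, m["acc"])
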
"harness|hendrycksTest-moral_disputes|5": { + "acc": 0.2658959537572254, + "acc_stderr": 0.023786203255508287, + "acc_norm": 0.2658959537572254, + "acc_norm_stderr": 0.023786203255508287 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.2424581005586592, + "acc_stderr": 0.014333522059217889, + "acc_norm": 0.2424581005586592, + "acc_norm_stderr": 0.014333522059217889 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.24836601307189543, + "acc_stderr": 0.024739981355113592, + "acc_norm": 0.24836601307189543, + "acc_norm_stderr": 0.024739981355113592 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.2765273311897106, + "acc_stderr": 0.02540383297817962, + "acc_norm": 0.2765273311897106, + "acc_norm_stderr": 0.02540383297817962 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.2716049382716049, + "acc_stderr": 0.02474862449053737, + "acc_norm": 0.2716049382716049, + "acc_norm_stderr": 0.02474862449053737 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.2730496453900709, + "acc_stderr": 0.026577860943307854, + "acc_norm": 0.2730496453900709, + "acc_norm_stderr": 0.026577860943307854 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.2633637548891786, + "acc_stderr": 0.011249506403605293, + "acc_norm": 0.2633637548891786, + "acc_norm_stderr": 0.011249506403605293 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.20955882352941177, + "acc_stderr": 0.024723110407677055, + "acc_norm": 0.20955882352941177, + "acc_norm_stderr": 0.024723110407677055 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.2565359477124183, + "acc_stderr": 0.017667841612378988, + "acc_norm": 0.2565359477124183, + "acc_norm_stderr": 0.017667841612378988 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.3090909090909091, + "acc_stderr": 0.044262946482000985, + "acc_norm": 0.3090909090909091, + "acc_norm_stderr": 0.044262946482000985 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.24897959183673468, + "acc_stderr": 0.02768297952296024, + "acc_norm": 0.24897959183673468, + "acc_norm_stderr": 0.02768297952296024 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.25870646766169153, + "acc_stderr": 0.030965903123573026, + "acc_norm": 0.25870646766169153, + "acc_norm_stderr": 0.030965903123573026 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.24, + "acc_stderr": 0.04292346959909283, + "acc_norm": 0.24, + "acc_norm_stderr": 0.04292346959909283 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.3192771084337349, + "acc_stderr": 0.03629335329947861, + "acc_norm": 0.3192771084337349, + "acc_norm_stderr": 0.03629335329947861 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.2631578947368421, + "acc_stderr": 0.033773102522091945, + "acc_norm": 0.2631578947368421, + "acc_norm_stderr": 0.033773102522091945 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.2178702570379437, + "mc1_stderr": 0.014450846714123892, + "mc2": 0.3694690724525972, + "mc2_stderr": 0.014557861662400237 + }, + "all": { + "acc": 0.26083931825046897, + "acc_stderr": 0.03166895124019951, + "acc_norm": 0.2642815307099757, + "acc_norm_stderr": 0.031666031551969266, + "mc1": 0.2178702570379437, + "mc1_stderr": 0.014450846714123892, + "mc2": 0.3694690724525972, + "mc2_stderr": 0.014557861662400237 + } + }, + "versions": { + "harness|arc:challenge|25": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + 
"harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "all": 0 + }, + "config": { + "model_name": "digitous/Adventien-GPTJ", + "model_sha": "4fbfe9eae03a1d6ecf60fda8cf39c4123f0438bd", + "model_dtype": "torch.float16", + "lighteval_sha": "43cff840721bd0214adb4e29236a5e2ca1813937", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null + }, + "task_config": { + "harness|arc:challenge": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + 
"harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task" + }, + "hashes": { + "harness|arc:challenge|25": { + "hash_examples": "fb8c51b1872daeda", + "hash_full_prompts": "045cbb916e5145c6", + 
"hash_input_tokens": "1b78325b154497a6", + "hash_cont_tokens": "c6e2e25e2b25a621" + }, + "harness|hellaswag|10": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "97de5fb5652ec7fa", + "hash_cont_tokens": "8ad5f1a3e4068f36" + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "38f6980885e34dfd", + "hash_cont_tokens": "844bd0bf669e8136" + }, + "harness|hendrycksTest-anatomy|5": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "3ed9431cd09b2a53", + "hash_cont_tokens": "aa3ffb1a6e4356f5" + }, + "harness|hendrycksTest-astronomy|5": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "a79fd75ecff4dacc", + "hash_cont_tokens": "ca7527d5bdfd389a" + }, + "harness|hendrycksTest-business_ethics|5": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "178d5666661bf5e1", + "hash_cont_tokens": "08a1fa6c8dde9a82" + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "c926698f7ff06973", + "hash_cont_tokens": "cd61f7de0830a75a" + }, + "harness|hendrycksTest-college_biology|5": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "242f772c5e78312a", + "hash_cont_tokens": "b0c14ed86adbcb56" + }, + "harness|hendrycksTest-college_chemistry|5": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "8502d8627d2d7aad", + "hash_cont_tokens": "844bd0bf669e8136" + }, + "harness|hendrycksTest-college_computer_science|5": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "8bf46ce3a98e6e3f", + "hash_cont_tokens": "3cf1924b14cbf906" + }, + "harness|hendrycksTest-college_mathematics|5": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "ff09ef7f164943cd", + "hash_cont_tokens": "d09bf08193410dfa" + }, + "harness|hendrycksTest-college_medicine|5": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "af38d1bbc0517ac5", + "hash_cont_tokens": "62bb469d2a319d91" + }, + "harness|hendrycksTest-college_physics|5": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "c4240f372187f487", + "hash_cont_tokens": "bf103c9a1f61ec12" + }, + "harness|hendrycksTest-computer_security|5": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "70a866a1c6ae11ae", + "hash_cont_tokens": "844bd0bf669e8136" + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "29b68a5b3f3afa5f", + "hash_cont_tokens": "ff5ca3d84bb47a0b" + }, + "harness|hendrycksTest-econometrics|5": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "a4a0fc579875cdf9", + "hash_cont_tokens": "f3ed369e135c0e74" + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "e1c0ec634eb17ebd", + "hash_cont_tokens": 
"35bf6c0c1a7ee403" + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "542453ad0f99dacf", + "hash_cont_tokens": "e69647d0f0359a4e" + }, + "harness|hendrycksTest-formal_logic|5": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "dacff0458f665ef2", + "hash_cont_tokens": "2ef491ecaa0b411b" + }, + "harness|hendrycksTest-global_facts|5": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "61dec75d557c2e93", + "hash_cont_tokens": "844bd0bf669e8136" + }, + "harness|hendrycksTest-high_school_biology|5": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "d0afdf91820cacc8", + "hash_cont_tokens": "2f65e8345a68d860" + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "75cd47b5490da17b", + "hash_cont_tokens": "c3deabee1deab3a3" + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "6c6256000dbf914a", + "hash_cont_tokens": "ec161287ac6222f4" + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "3e24478a8854bd77", + "hash_cont_tokens": "c4f2565ca36881d5" + }, + "harness|hendrycksTest-high_school_geography|5": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "a4866b51f8a7a60e", + "hash_cont_tokens": "780e569058de22be" + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "90f755f89d9fdf5e", + "hash_cont_tokens": "9da45062757ae791" + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "fb590ff6d9d11883", + "hash_cont_tokens": "8f5c8baf02161f10" + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "551dbc75535ad2b8", + "hash_cont_tokens": "fdea101837ab4409" + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "d86fdf5706ec717c", + "hash_cont_tokens": "985403b262df21a4" + }, + "harness|hendrycksTest-high_school_physics|5": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "a81bca26abd92c41", + "hash_cont_tokens": "56be0c12b78c81a3" + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "9c10077b5cda495b", + "hash_cont_tokens": "f524cf6fe64b2a7f" + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "092923836e135996", + "hash_cont_tokens": "421b3dc903711e3d" + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + 
"hash_input_tokens": "4ab213491f557f31", + "hash_cont_tokens": "eab825cf8fbdd085" + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "2a04fb615e6717ea", + "hash_cont_tokens": "e1610a0b694e7b3a" + }, + "harness|hendrycksTest-human_aging|5": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "39da19ee58ce07e6", + "hash_cont_tokens": "38eafdb22e9fca11" + }, + "harness|hendrycksTest-human_sexuality|5": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "f7e0441ab1c223e0", + "hash_cont_tokens": "11de075f88fc7cd2" + }, + "harness|hendrycksTest-international_law|5": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "119859c5b8103d0b", + "hash_cont_tokens": "0229c63f045574c2" + }, + "harness|hendrycksTest-jurisprudence|5": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "6ec4910e741606cb", + "hash_cont_tokens": "5c77c6f472688075" + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "96d8b2554f777e3a", + "hash_cont_tokens": "25a46284b3589e0d" + }, + "harness|hendrycksTest-machine_learning|5": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "249811a7d891a411", + "hash_cont_tokens": "d11f2c877fe691dc" + }, + "harness|hendrycksTest-management|5": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "e54df495ffeb4f92", + "hash_cont_tokens": "d37808f586a9e9b5" + }, + "harness|hendrycksTest-marketing|5": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "e9110fe64f420eb5", + "hash_cont_tokens": "95faf210efa02f90" + }, + "harness|hendrycksTest-medical_genetics|5": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "743df5701590c1c5", + "hash_cont_tokens": "844bd0bf669e8136" + }, + "harness|hendrycksTest-miscellaneous|5": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "4a20a40ea36bad2d", + "hash_cont_tokens": "ef1ae838a09a7521" + }, + "harness|hendrycksTest-moral_disputes|5": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "10886977e5516586", + "hash_cont_tokens": "05c35d0e7dd2c7d4" + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "66f56ab7c3b9d662", + "hash_cont_tokens": "f1e9e326e9540108" + }, + "harness|hendrycksTest-nutrition|5": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "c05c54560499ea35", + "hash_cont_tokens": "027ac34198453c9e" + }, + "harness|hendrycksTest-philosophy|5": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "9639c3d92ff98a28", + "hash_cont_tokens": "dddff9925c9b675a" + }, + "harness|hendrycksTest-prehistory|5": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "91e98834c3a8d8d9", + "hash_cont_tokens": "030e5bb46551865c" + }, 
+ "harness|hendrycksTest-professional_accounting|5": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "569fa47691c73088", + "hash_cont_tokens": "42b23299e8bae480" + }, + "harness|hendrycksTest-professional_law|5": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "999e8c7cf55b590c", + "hash_cont_tokens": "a2de48df0afbaff7" + }, + "harness|hendrycksTest-professional_medicine|5": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "cb68733b835e69f0", + "hash_cont_tokens": "33dc7eccd5de31ae" + }, + "harness|hendrycksTest-professional_psychology|5": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "3aa766c029099569", + "hash_cont_tokens": "2a666dc39f1f52ac" + }, + "harness|hendrycksTest-public_relations|5": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "87b924f88832986f", + "hash_cont_tokens": "cf3600a50782c6c5" + }, + "harness|hendrycksTest-security_studies|5": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "c2b75c24a925a416", + "hash_cont_tokens": "2e9916279a4cae95" + }, + "harness|hendrycksTest-sociology|5": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "fb555df6139eb2c8", + "hash_cont_tokens": "555f7a55738bbf37" + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "56cf1eebb25eccb1", + "hash_cont_tokens": "844bd0bf669e8136" + }, + "harness|hendrycksTest-virology|5": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "c6affac16ec860be", + "hash_cont_tokens": "30d4fa4828c5468f" + }, + "harness|hendrycksTest-world_religions|5": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "d2c5da5a69a6312e", + "hash_cont_tokens": "984061eb58124367" + }, + "harness|truthfulqa:mc|0": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "21ee2f46c9c3649e", + "hash_cont_tokens": "f41d0880e9a23f4e" + } + } +} \ No newline at end of file diff --git a/eval-results/digitous/Adventien-GPTJ/results_2023-09-22T18-30-15.376611.json b/eval-results/digitous/Adventien-GPTJ/results_2023-09-22T18-30-15.376611.json new file mode 100644 index 0000000000000000000000000000000000000000..0e664fc754912170a391056fecf6996d62ee9693 --- /dev/null +++ b/eval-results/digitous/Adventien-GPTJ/results_2023-09-22T18-30-15.376611.json @@ -0,0 +1,107 @@ +{ + "config_general": { + "model_name": "digitous/Adventien-GPTJ", + "model_sha": "4fbfe9eae03a1d6ecf60fda8cf39c4123f0438bd", + "model_size": "11.28 GB", + "model_dtype": "torch.float16", + "lighteval_sha": "0f318ecf002208468154899217b3ba7c6ae09374", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|drop|3": { + "em": 0.0008389261744966443, + "em_stderr": 0.0002964962989801232, + "f1": 0.04690331375838923, + "f1_stderr": 0.0011372681519599575 + }, + "harness|gsm8k|5": { + "acc": 0.01592115238817286, + "acc_stderr": 0.003447819272389025 + }, + "harness|winogrande|5": { + "acc": 0.6022099447513812, + "acc_stderr": 
0.013755743513749022 + }, + "all": { + "em": 0.0008389261744966443, + "em_stderr": 0.0002964962989801232, + "f1": 0.04690331375838923, + "f1_stderr": 0.0011372681519599575, + "acc": 0.309065548569777, + "acc_stderr": 0.008601781393069023 + } + }, + "versions": { + "harness|drop|3": 1, + "harness|gsm8k|5": 0, + "harness|winogrande|5": 0, + "all": 0 + }, + "config_tasks": { + "harness|drop": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|drop|3": { + "hashes": { + "hash_examples": "1d27416e8324e9a3", + "hash_full_prompts": "a5513ff9a741b385", + "hash_input_tokens": "f21277d2c2d2e06c", + "hash_cont_tokens": "d5f75b41a355138c" + }, + "truncated": 382, + "non-truncated": 9154, + "padded": 0, + "non-padded": 9536, + "effective_few_shots": 3.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "3ab9b4c5105492a3", + "hash_cont_tokens": "783a746d769d12e5" + }, + "truncated": 0, + "non-truncated": 1319, + "padded": 0, + "non-padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "84cacac1590bb0a5", + "hash_cont_tokens": "64ca3ed9b5dacc6e" + }, + "truncated": 0, + "non-truncated": 2534, + "padded": 2426, + "non-padded": 108, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "9b4d8993161e637d", + "hash_full_prompts": "08215e527b7e60a5", + "hash_input_tokens": "26cd3631535039d0", + "hash_cont_tokens": "f066fe8f9b012304" + }, + "total_evaluation_time_secondes": "16189.165263414383", + "truncated": 382, + "non-truncated": 13007, + "padded": 2426, + "non-padded": 10963, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/digitous/Alpacino13b/results_2023-07-19T19-38-18.713837.json b/eval-results/digitous/Alpacino13b/results_2023-07-19T19-38-18.713837.json new file mode 100644 index 0000000000000000000000000000000000000000..0ac9e643c8140b6ad6242367cd8dd96c6596800f --- /dev/null +++ b/eval-results/digitous/Alpacino13b/results_2023-07-19T19-38-18.713837.json @@ -0,0 +1,871 @@ +{ + "results": { + "harness|arc:challenge|25": { + "acc": 0.5503412969283277, + "acc_stderr": 0.014537144444284738, + "acc_norm": 0.5853242320819113, + "acc_norm_stderr": 0.014397070564409174 + }, + "harness|hellaswag|10": { + "acc": 0.6141206930890261, + "acc_stderr": 0.004858074013443993, + "acc_norm": 0.8130850428201554, + "acc_norm_stderr": 0.003890465158271809 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.38, + "acc_stderr": 0.048783173121456316, + "acc_norm": 0.38, + "acc_norm_stderr": 0.048783173121456316 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.4740740740740741, + "acc_stderr": 0.04313531696750574, + "acc_norm": 0.4740740740740741, + "acc_norm_stderr": 0.04313531696750574 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.4342105263157895, + "acc_stderr": 0.04033565667848319, + "acc_norm": 0.4342105263157895, + "acc_norm_stderr": 0.04033565667848319 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.47, + "acc_stderr": 0.050161355804659205, + "acc_norm": 0.47, + "acc_norm_stderr": 0.050161355804659205 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 
0.4716981132075472, + "acc_stderr": 0.0307235352490061, + "acc_norm": 0.4716981132075472, + "acc_norm_stderr": 0.0307235352490061 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.4652777777777778, + "acc_stderr": 0.04171115858181618, + "acc_norm": 0.4652777777777778, + "acc_norm_stderr": 0.04171115858181618 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.31, + "acc_stderr": 0.04648231987117316, + "acc_norm": 0.31, + "acc_norm_stderr": 0.04648231987117316 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.41, + "acc_stderr": 0.049431107042371025, + "acc_norm": 0.41, + "acc_norm_stderr": 0.049431107042371025 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.36, + "acc_stderr": 0.048241815132442176, + "acc_norm": 0.36, + "acc_norm_stderr": 0.048241815132442176 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.43352601156069365, + "acc_stderr": 0.03778621079092055, + "acc_norm": 0.43352601156069365, + "acc_norm_stderr": 0.03778621079092055 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.20588235294117646, + "acc_stderr": 0.04023382273617747, + "acc_norm": 0.20588235294117646, + "acc_norm_stderr": 0.04023382273617747 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.61, + "acc_stderr": 0.04902071300001975, + "acc_norm": 0.61, + "acc_norm_stderr": 0.04902071300001975 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.4297872340425532, + "acc_stderr": 0.03236214467715563, + "acc_norm": 0.4297872340425532, + "acc_norm_stderr": 0.03236214467715563 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.2894736842105263, + "acc_stderr": 0.04266339443159394, + "acc_norm": 0.2894736842105263, + "acc_norm_stderr": 0.04266339443159394 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.3931034482758621, + "acc_stderr": 0.0407032901370707, + "acc_norm": 0.3931034482758621, + "acc_norm_stderr": 0.0407032901370707 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.24603174603174602, + "acc_stderr": 0.022182037202948365, + "acc_norm": 0.24603174603174602, + "acc_norm_stderr": 0.022182037202948365 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.3492063492063492, + "acc_stderr": 0.04263906892795133, + "acc_norm": 0.3492063492063492, + "acc_norm_stderr": 0.04263906892795133 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.34, + "acc_stderr": 0.04760952285695235, + "acc_norm": 0.34, + "acc_norm_stderr": 0.04760952285695235 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.5161290322580645, + "acc_stderr": 0.028429203176724555, + "acc_norm": 0.5161290322580645, + "acc_norm_stderr": 0.028429203176724555 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.3054187192118227, + "acc_stderr": 0.03240661565868408, + "acc_norm": 0.3054187192118227, + "acc_norm_stderr": 0.03240661565868408 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.49, + "acc_stderr": 0.05024183937956911, + "acc_norm": 0.49, + "acc_norm_stderr": 0.05024183937956911 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.6121212121212121, + "acc_stderr": 0.038049136539710114, + "acc_norm": 0.6121212121212121, + "acc_norm_stderr": 0.038049136539710114 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.5303030303030303, + "acc_stderr": 0.0355580405176393, + "acc_norm": 0.5303030303030303, + "acc_norm_stderr": 0.0355580405176393 + }, + 
"harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.6476683937823834, + "acc_stderr": 0.03447478286414358, + "acc_norm": 0.6476683937823834, + "acc_norm_stderr": 0.03447478286414358 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.43846153846153846, + "acc_stderr": 0.025158266016868568, + "acc_norm": 0.43846153846153846, + "acc_norm_stderr": 0.025158266016868568 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.2740740740740741, + "acc_stderr": 0.027195934804085626, + "acc_norm": 0.2740740740740741, + "acc_norm_stderr": 0.027195934804085626 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.47058823529411764, + "acc_stderr": 0.03242225027115006, + "acc_norm": 0.47058823529411764, + "acc_norm_stderr": 0.03242225027115006 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.271523178807947, + "acc_stderr": 0.03631329803969654, + "acc_norm": 0.271523178807947, + "acc_norm_stderr": 0.03631329803969654 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.6293577981651376, + "acc_stderr": 0.02070745816435298, + "acc_norm": 0.6293577981651376, + "acc_norm_stderr": 0.02070745816435298 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.27314814814814814, + "acc_stderr": 0.030388051301678116, + "acc_norm": 0.27314814814814814, + "acc_norm_stderr": 0.030388051301678116 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.5931372549019608, + "acc_stderr": 0.03447891136353382, + "acc_norm": 0.5931372549019608, + "acc_norm_stderr": 0.03447891136353382 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.6877637130801688, + "acc_stderr": 0.030165137867847008, + "acc_norm": 0.6877637130801688, + "acc_norm_stderr": 0.030165137867847008 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.5650224215246636, + "acc_stderr": 0.033272833702713445, + "acc_norm": 0.5650224215246636, + "acc_norm_stderr": 0.033272833702713445 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.5648854961832062, + "acc_stderr": 0.04348208051644858, + "acc_norm": 0.5648854961832062, + "acc_norm_stderr": 0.04348208051644858 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.6528925619834711, + "acc_stderr": 0.04345724570292534, + "acc_norm": 0.6528925619834711, + "acc_norm_stderr": 0.04345724570292534 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.5370370370370371, + "acc_stderr": 0.04820403072760627, + "acc_norm": 0.5370370370370371, + "acc_norm_stderr": 0.04820403072760627 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.5644171779141104, + "acc_stderr": 0.03895632464138937, + "acc_norm": 0.5644171779141104, + "acc_norm_stderr": 0.03895632464138937 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.3392857142857143, + "acc_stderr": 0.044939490686135376, + "acc_norm": 0.3392857142857143, + "acc_norm_stderr": 0.044939490686135376 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.6504854368932039, + "acc_stderr": 0.047211885060971716, + "acc_norm": 0.6504854368932039, + "acc_norm_stderr": 0.047211885060971716 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.7264957264957265, + "acc_stderr": 0.029202540153431183, + "acc_norm": 0.7264957264957265, + "acc_norm_stderr": 0.029202540153431183 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.51, + "acc_stderr": 0.05024183937956912, + "acc_norm": 0.51, + "acc_norm_stderr": 0.05024183937956912 + }, + 
"harness|hendrycksTest-miscellaneous|5": { + "acc": 0.6666666666666666, + "acc_stderr": 0.01685739124747255, + "acc_norm": 0.6666666666666666, + "acc_norm_stderr": 0.01685739124747255 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.5144508670520231, + "acc_stderr": 0.026907849856282532, + "acc_norm": 0.5144508670520231, + "acc_norm_stderr": 0.026907849856282532 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.2435754189944134, + "acc_stderr": 0.014355911964767867, + "acc_norm": 0.2435754189944134, + "acc_norm_stderr": 0.014355911964767867 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.5130718954248366, + "acc_stderr": 0.028620130800700246, + "acc_norm": 0.5130718954248366, + "acc_norm_stderr": 0.028620130800700246 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.5434083601286174, + "acc_stderr": 0.028290869054197608, + "acc_norm": 0.5434083601286174, + "acc_norm_stderr": 0.028290869054197608 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.5339506172839507, + "acc_stderr": 0.027756535257347663, + "acc_norm": 0.5339506172839507, + "acc_norm_stderr": 0.027756535257347663 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.35106382978723405, + "acc_stderr": 0.02847350127296376, + "acc_norm": 0.35106382978723405, + "acc_norm_stderr": 0.02847350127296376 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.37157757496740546, + "acc_stderr": 0.012341828514528289, + "acc_norm": 0.37157757496740546, + "acc_norm_stderr": 0.012341828514528289 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.5147058823529411, + "acc_stderr": 0.03035969707904611, + "acc_norm": 0.5147058823529411, + "acc_norm_stderr": 0.03035969707904611 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.4852941176470588, + "acc_stderr": 0.020219083895133924, + "acc_norm": 0.4852941176470588, + "acc_norm_stderr": 0.020219083895133924 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.5909090909090909, + "acc_stderr": 0.04709306978661896, + "acc_norm": 0.5909090909090909, + "acc_norm_stderr": 0.04709306978661896 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.5387755102040817, + "acc_stderr": 0.031912820526692774, + "acc_norm": 0.5387755102040817, + "acc_norm_stderr": 0.031912820526692774 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.6567164179104478, + "acc_stderr": 0.03357379665433431, + "acc_norm": 0.6567164179104478, + "acc_norm_stderr": 0.03357379665433431 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.76, + "acc_stderr": 0.042923469599092816, + "acc_norm": 0.76, + "acc_norm_stderr": 0.042923469599092816 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.4397590361445783, + "acc_stderr": 0.03864139923699122, + "acc_norm": 0.4397590361445783, + "acc_norm_stderr": 0.03864139923699122 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.6666666666666666, + "acc_stderr": 0.03615507630310936, + "acc_norm": 0.6666666666666666, + "acc_norm_stderr": 0.03615507630310936 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.28151774785801714, + "mc1_stderr": 0.01574402724825605, + "mc2": 0.41663528664025434, + "mc2_stderr": 0.014080694661593746 + }, + "all": { + "acc": 0.4826701891648848, + "acc_stderr": 0.035136279478874335, + "acc_norm": 0.4866353973832698, + "acc_norm_stderr": 0.03511750519522946, + "mc1": 0.28151774785801714, + "mc1_stderr": 0.01574402724825605, + "mc2": 0.41663528664025434, + "mc2_stderr": 0.014080694661593746 + } + }, + 
"versions": { + "harness|arc:challenge|25": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "all": 0 + }, + "config": { + "model_name": "digitous/Alpacino13b", + "model_sha": "7092a5c8dec649694dd66ff8cfe5452ce52e6a40", + "model_dtype": "torch.float16", + "lighteval_sha": "43cff840721bd0214adb4e29236a5e2ca1813937", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null + }, + "task_config": { + "harness|arc:challenge": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + 
"harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + 
"harness|truthfulqa:mc": "LM Harness task" + }, + "hashes": { + "harness|arc:challenge|25": { + "hash_examples": "fb8c51b1872daeda", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "61571bf68d6d89aa", + "hash_cont_tokens": "8210decc6ff6f7df" + }, + "harness|hellaswag|10": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "29906669b1c7054a", + "hash_cont_tokens": "b3b9e9017afa63af" + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "c54ff61ad0273dd7", + "hash_cont_tokens": "50421e30bef398f9" + }, + "harness|hendrycksTest-anatomy|5": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "be31a1e22aef5f90", + "hash_cont_tokens": "f11971a765cb609f" + }, + "harness|hendrycksTest-astronomy|5": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "277a7b1fad566940", + "hash_cont_tokens": "bf30e5d3f48250cb" + }, + "harness|hendrycksTest-business_ethics|5": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "ba552605bc116de5", + "hash_cont_tokens": "bc1dd9b2d995eb61" + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "428c7563d0b98ab9", + "hash_cont_tokens": "890a119624b3b935" + }, + "harness|hendrycksTest-college_biology|5": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "da036601573942e2", + "hash_cont_tokens": "875cde3af7a0ee14" + }, + "harness|hendrycksTest-college_chemistry|5": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "94e0196d6aded13d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "harness|hendrycksTest-college_computer_science|5": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "6e4d0f4a8d36690b", + "hash_cont_tokens": "ffc0fe414cdc4a83" + }, + "harness|hendrycksTest-college_mathematics|5": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "614054d17109a25d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "harness|hendrycksTest-college_medicine|5": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "1d633b3cc0524ba8", + "hash_cont_tokens": "1f88b00d41957d82" + }, + "harness|hendrycksTest-college_physics|5": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "5421d9a1af86cbd4", + "hash_cont_tokens": "f7b8097afc16a47c" + }, + "harness|hendrycksTest-computer_security|5": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "5e6b70ecb333cf18", + "hash_cont_tokens": "50421e30bef398f9" + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "c2ef11a87264ceed", + "hash_cont_tokens": "aa0e8bc655f2f641" + }, + "harness|hendrycksTest-econometrics|5": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "ecaccd912a4c3978", + "hash_cont_tokens": "bfb7e3c3c88313f1" + }, + 
"harness|hendrycksTest-electrical_engineering|5": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "1590c84291399be8", + "hash_cont_tokens": "2425a3f084a591ef" + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "3269597f715b0da1", + "hash_cont_tokens": "f52691aef15a407b" + }, + "harness|hendrycksTest-formal_logic|5": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "a2800d20f3ab8d7c", + "hash_cont_tokens": "f515d598d9c21263" + }, + "harness|hendrycksTest-global_facts|5": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "94ed44b3772505ad", + "hash_cont_tokens": "50421e30bef398f9" + }, + "harness|hendrycksTest-high_school_biology|5": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "24423acb928db768", + "hash_cont_tokens": "bd85a4156a3613ee" + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "831ff35c474e5cef", + "hash_cont_tokens": "a95c97af1c14e068" + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "8c34e0f2bda77358", + "hash_cont_tokens": "8abfedef914e33c9" + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "f1f73dd687da18d7", + "hash_cont_tokens": "674fc454bdc5ac93" + }, + "harness|hendrycksTest-high_school_geography|5": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "7c5547c7da5bc793", + "hash_cont_tokens": "03a5012b916274ea" + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "f62991cb6a496b05", + "hash_cont_tokens": "a83effb8f76b7d7c" + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "4cef2aff6e3d59ed", + "hash_cont_tokens": "c583432ad27fcfe0" + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "6e2577ea4082ed2b", + "hash_cont_tokens": "24f5dc613660300b" + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "c5fc9aeb1079c8e4", + "hash_cont_tokens": "f47f041de50333b9" + }, + "harness|hendrycksTest-high_school_physics|5": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "555fc385cffa84ca", + "hash_cont_tokens": "ba2efcd283e938cc" + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "febd23cbf9973b7f", + "hash_cont_tokens": "942069cd363844d9" + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": 
"424b02981230ee83", + "hash_cont_tokens": "955ed42b6f7fa019" + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "50c9ff438c85a69e", + "hash_cont_tokens": "cdd0b3dc06d933e5" + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "054824cc474caef5", + "hash_cont_tokens": "9a864184946033ac" + }, + "harness|hendrycksTest-human_aging|5": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "541a75f071dcf579", + "hash_cont_tokens": "142a4a8a1138a214" + }, + "harness|hendrycksTest-human_sexuality|5": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "04269e5c5a257dd9", + "hash_cont_tokens": "bc54813e809b796d" + }, + "harness|hendrycksTest-international_law|5": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "d93ba9d9d38e4397", + "hash_cont_tokens": "dc45b45fcda18e5d" + }, + "harness|hendrycksTest-jurisprudence|5": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "9eeaccd2698b4f5a", + "hash_cont_tokens": "e3a8cd951b6e3469" + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "b4f08f544f2b7576", + "hash_cont_tokens": "1e80dbd30f6453d5" + }, + "harness|hendrycksTest-machine_learning|5": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "900c2a51f1174b9f", + "hash_cont_tokens": "9b37da7777378ca9" + }, + "harness|hendrycksTest-management|5": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "6b36efb4689c6eca", + "hash_cont_tokens": "a01d6d39a83c4597" + }, + "harness|hendrycksTest-marketing|5": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "2aaac78a0cfed47a", + "hash_cont_tokens": "6aeaed4d823c98aa" + }, + "harness|hendrycksTest-medical_genetics|5": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "886ca823b41c094a", + "hash_cont_tokens": "50421e30bef398f9" + }, + "harness|hendrycksTest-miscellaneous|5": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "72fd71de7675e7d0", + "hash_cont_tokens": "9b0ab02a64603081" + }, + "harness|hendrycksTest-moral_disputes|5": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "f3ca0dd8e7a1eb09", + "hash_cont_tokens": "8badf768f7b0467a" + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "3e793631e951f23c", + "hash_cont_tokens": "32ae620376b2bbba" + }, + "harness|hendrycksTest-nutrition|5": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "59753c2144ea93af", + "hash_cont_tokens": "3071def75bacc404" + }, + "harness|hendrycksTest-philosophy|5": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "bd8d3dbed15a8c34", + "hash_cont_tokens": "9f6ff69d23a48783" + }, + 
"harness|hendrycksTest-prehistory|5": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "3573cd87facbb7c5", + "hash_cont_tokens": "de469d2b981e32a3" + }, + "harness|hendrycksTest-professional_accounting|5": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "17e721bc1a7cbb47", + "hash_cont_tokens": "c46f74d2dfc7b13b" + }, + "harness|hendrycksTest-professional_law|5": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "9178e10bd0763ec4", + "hash_cont_tokens": "2e590029ef41fbcd" + }, + "harness|hendrycksTest-professional_medicine|5": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "f5a22012a54f70ea", + "hash_cont_tokens": "fe35cfa9c6ca802e" + }, + "harness|hendrycksTest-professional_psychology|5": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "0dfb73a8eb3f692c", + "hash_cont_tokens": "f020fbddf72c8652" + }, + "harness|hendrycksTest-public_relations|5": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "1710c6ba4c9f3cbd", + "hash_cont_tokens": "568f585a259965c1" + }, + "harness|hendrycksTest-security_studies|5": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "d49711415961ced7", + "hash_cont_tokens": "cc6fd7cccd64cd5d" + }, + "harness|hendrycksTest-sociology|5": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "828999f7624cbe7e", + "hash_cont_tokens": "c3a3bdfd177eed5b" + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "42054621e718dbee", + "hash_cont_tokens": "2568d0e8e36fa959" + }, + "harness|hendrycksTest-virology|5": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "6c4f0aa4dc859c04", + "hash_cont_tokens": "926cf60b0891f374" + }, + "harness|hendrycksTest-world_religions|5": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "6c75d44e092ff24f", + "hash_cont_tokens": "c525a5de974c1ea3" + }, + "harness|truthfulqa:mc|0": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "2738d7ed7075faa7", + "hash_cont_tokens": "c014154380b74b9e" + } + } +} \ No newline at end of file diff --git a/eval-results/digitous/Alpacino13b/results_2023-10-15T02-53-57.964177.json b/eval-results/digitous/Alpacino13b/results_2023-10-15T02-53-57.964177.json new file mode 100644 index 0000000000000000000000000000000000000000..87c55e610e4867f1ada7474e08f9a2edd1023e2c --- /dev/null +++ b/eval-results/digitous/Alpacino13b/results_2023-10-15T02-53-57.964177.json @@ -0,0 +1,107 @@ +{ + "config_general": { + "model_name": "digitous/Alpacino13b", + "model_sha": "7092a5c8dec649694dd66ff8cfe5452ce52e6a40", + "model_size": "24.28 GB", + "model_dtype": "torch.float16", + "lighteval_sha": "0f318ecf002208468154899217b3ba7c6ae09374", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|drop|3": { + "em": 0.002726510067114094, + "em_stderr": 0.0005340111700415912, + "f1": 0.060902894295302074, + "f1_stderr": 
0.0013623642919700313 + }, + "harness|gsm8k|5": { + "acc": 0.07960576194086429, + "acc_stderr": 0.007455924338676276 + }, + "harness|winogrande|5": { + "acc": 0.7695343330702447, + "acc_stderr": 0.011835872164836676 + }, + "all": { + "em": 0.002726510067114094, + "em_stderr": 0.0005340111700415912, + "f1": 0.060902894295302074, + "f1_stderr": 0.0013623642919700313, + "acc": 0.4245700475055545, + "acc_stderr": 0.009645898251756477 + } + }, + "versions": { + "harness|drop|3": 1, + "harness|gsm8k|5": 0, + "harness|winogrande|5": 0, + "all": 0 + }, + "config_tasks": { + "harness|drop": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|drop|3": { + "hashes": { + "hash_examples": "1d27416e8324e9a3", + "hash_full_prompts": "a5513ff9a741b385", + "hash_input_tokens": "61b608e0b5ceed76", + "hash_cont_tokens": "af2381cb9475c64b" + }, + "truncated": 1263, + "non-truncated": 8273, + "padded": 0, + "non-padded": 9536, + "effective_few_shots": 3.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "bda342e47b5099b2", + "hash_cont_tokens": "4ec36a82fd67dce3" + }, + "truncated": 0, + "non-truncated": 1319, + "padded": 0, + "non-padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "c0bedf98cb040854", + "hash_cont_tokens": "f08975ad6f2d5864" + }, + "truncated": 0, + "non-truncated": 2534, + "padded": 2432, + "non-padded": 102, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "9b4d8993161e637d", + "hash_full_prompts": "08215e527b7e60a5", + "hash_input_tokens": "80afe720f936f8d2", + "hash_cont_tokens": "639080f4517cfb00" + }, + "total_evaluation_time_secondes": "12575.613512277603", + "truncated": 1263, + "non-truncated": 12126, + "padded": 2432, + "non-padded": 10957, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/digitous/Alpacino30b/results_2023-07-19T22-46-17.057330.json b/eval-results/digitous/Alpacino30b/results_2023-07-19T22-46-17.057330.json new file mode 100644 index 0000000000000000000000000000000000000000..66208e2bc8f8ae81927fce451b11919a28c813cf --- /dev/null +++ b/eval-results/digitous/Alpacino30b/results_2023-07-19T22-46-17.057330.json @@ -0,0 +1,871 @@ +{ + "results": { + "harness|arc:challenge|25": { + "acc": 0.5998293515358362, + "acc_stderr": 0.014317197787809169, + "acc_norm": 0.6271331058020477, + "acc_norm_stderr": 0.014131176760131169 + }, + "harness|hellaswag|10": { + "acc": 0.6537542322246565, + "acc_stderr": 0.004748003276466209, + "acc_norm": 0.8504282015534754, + "acc_norm_stderr": 0.003559223015610494 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.37, + "acc_stderr": 0.04852365870939099, + "acc_norm": 0.37, + "acc_norm_stderr": 0.04852365870939099 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.4888888888888889, + "acc_stderr": 0.04318275491977976, + "acc_norm": 0.4888888888888889, + "acc_norm_stderr": 0.04318275491977976 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.6381578947368421, + "acc_stderr": 0.03910525752849723, + "acc_norm": 0.6381578947368421, + "acc_norm_stderr": 0.03910525752849723 + }, + "harness|hendrycksTest-business_ethics|5": { + 
"acc": 0.57, + "acc_stderr": 0.049756985195624284, + "acc_norm": 0.57, + "acc_norm_stderr": 0.049756985195624284 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.5886792452830188, + "acc_stderr": 0.03028500925900979, + "acc_norm": 0.5886792452830188, + "acc_norm_stderr": 0.03028500925900979 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.6111111111111112, + "acc_stderr": 0.04076663253918567, + "acc_norm": 0.6111111111111112, + "acc_norm_stderr": 0.04076663253918567 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.4, + "acc_stderr": 0.049236596391733084, + "acc_norm": 0.4, + "acc_norm_stderr": 0.049236596391733084 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.48, + "acc_stderr": 0.050211673156867795, + "acc_norm": 0.48, + "acc_norm_stderr": 0.050211673156867795 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.37, + "acc_stderr": 0.04852365870939099, + "acc_norm": 0.37, + "acc_norm_stderr": 0.04852365870939099 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.5317919075144508, + "acc_stderr": 0.03804749744364764, + "acc_norm": 0.5317919075144508, + "acc_norm_stderr": 0.03804749744364764 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.29411764705882354, + "acc_stderr": 0.04533838195929775, + "acc_norm": 0.29411764705882354, + "acc_norm_stderr": 0.04533838195929775 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.69, + "acc_stderr": 0.04648231987117316, + "acc_norm": 0.69, + "acc_norm_stderr": 0.04648231987117316 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.5148936170212766, + "acc_stderr": 0.032671518489247764, + "acc_norm": 0.5148936170212766, + "acc_norm_stderr": 0.032671518489247764 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.37719298245614036, + "acc_stderr": 0.04559522141958216, + "acc_norm": 0.37719298245614036, + "acc_norm_stderr": 0.04559522141958216 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.5241379310344828, + "acc_stderr": 0.0416180850350153, + "acc_norm": 0.5241379310344828, + "acc_norm_stderr": 0.0416180850350153 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.36772486772486773, + "acc_stderr": 0.024833839825562417, + "acc_norm": 0.36772486772486773, + "acc_norm_stderr": 0.024833839825562417 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.3412698412698413, + "acc_stderr": 0.04240799327574925, + "acc_norm": 0.3412698412698413, + "acc_norm_stderr": 0.04240799327574925 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.37, + "acc_stderr": 0.04852365870939099, + "acc_norm": 0.37, + "acc_norm_stderr": 0.04852365870939099 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.6935483870967742, + "acc_stderr": 0.026226485652553883, + "acc_norm": 0.6935483870967742, + "acc_norm_stderr": 0.026226485652553883 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.4088669950738916, + "acc_stderr": 0.034590588158832314, + "acc_norm": 0.4088669950738916, + "acc_norm_stderr": 0.034590588158832314 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.6, + "acc_stderr": 0.04923659639173309, + "acc_norm": 0.6, + "acc_norm_stderr": 0.04923659639173309 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.7090909090909091, + "acc_stderr": 0.03546563019624336, + "acc_norm": 0.7090909090909091, + "acc_norm_stderr": 0.03546563019624336 + }, + "harness|hendrycksTest-high_school_geography|5": { 
+ "acc": 0.7222222222222222, + "acc_stderr": 0.031911782267135466, + "acc_norm": 0.7222222222222222, + "acc_norm_stderr": 0.031911782267135466 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.8186528497409327, + "acc_stderr": 0.02780703236068609, + "acc_norm": 0.8186528497409327, + "acc_norm_stderr": 0.02780703236068609 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.5743589743589743, + "acc_stderr": 0.025069094387296532, + "acc_norm": 0.5743589743589743, + "acc_norm_stderr": 0.025069094387296532 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.2851851851851852, + "acc_stderr": 0.027528599210340496, + "acc_norm": 0.2851851851851852, + "acc_norm_stderr": 0.027528599210340496 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.5588235294117647, + "acc_stderr": 0.0322529423239964, + "acc_norm": 0.5588235294117647, + "acc_norm_stderr": 0.0322529423239964 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.3443708609271523, + "acc_stderr": 0.038796870240733264, + "acc_norm": 0.3443708609271523, + "acc_norm_stderr": 0.038796870240733264 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.7541284403669725, + "acc_stderr": 0.01846194096870843, + "acc_norm": 0.7541284403669725, + "acc_norm_stderr": 0.01846194096870843 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.4398148148148148, + "acc_stderr": 0.03385177976044811, + "acc_norm": 0.4398148148148148, + "acc_norm_stderr": 0.03385177976044811 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.7941176470588235, + "acc_stderr": 0.028379449451588663, + "acc_norm": 0.7941176470588235, + "acc_norm_stderr": 0.028379449451588663 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.7974683544303798, + "acc_stderr": 0.026160568246601453, + "acc_norm": 0.7974683544303798, + "acc_norm_stderr": 0.026160568246601453 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.6547085201793722, + "acc_stderr": 0.03191100192835795, + "acc_norm": 0.6547085201793722, + "acc_norm_stderr": 0.03191100192835795 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.6335877862595419, + "acc_stderr": 0.04225875451969638, + "acc_norm": 0.6335877862595419, + "acc_norm_stderr": 0.04225875451969638 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.7355371900826446, + "acc_stderr": 0.040261875275912073, + "acc_norm": 0.7355371900826446, + "acc_norm_stderr": 0.040261875275912073 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.6851851851851852, + "acc_stderr": 0.04489931073591312, + "acc_norm": 0.6851851851851852, + "acc_norm_stderr": 0.04489931073591312 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.6871165644171779, + "acc_stderr": 0.03642914578292406, + "acc_norm": 0.6871165644171779, + "acc_norm_stderr": 0.03642914578292406 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.42857142857142855, + "acc_stderr": 0.04697113923010212, + "acc_norm": 0.42857142857142855, + "acc_norm_stderr": 0.04697113923010212 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.7572815533980582, + "acc_stderr": 0.04245022486384495, + "acc_norm": 0.7572815533980582, + "acc_norm_stderr": 0.04245022486384495 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.8675213675213675, + "acc_stderr": 0.022209309073165616, + "acc_norm": 0.8675213675213675, + "acc_norm_stderr": 0.022209309073165616 + }, + 
"harness|hendrycksTest-medical_genetics|5": { + "acc": 0.65, + "acc_stderr": 0.0479372485441102, + "acc_norm": 0.65, + "acc_norm_stderr": 0.0479372485441102 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.7752234993614304, + "acc_stderr": 0.014927447101937153, + "acc_norm": 0.7752234993614304, + "acc_norm_stderr": 0.014927447101937153 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.6647398843930635, + "acc_stderr": 0.025416003773165555, + "acc_norm": 0.6647398843930635, + "acc_norm_stderr": 0.025416003773165555 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.42681564245810055, + "acc_stderr": 0.01654240195463191, + "acc_norm": 0.42681564245810055, + "acc_norm_stderr": 0.01654240195463191 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.6045751633986928, + "acc_stderr": 0.02799672318063145, + "acc_norm": 0.6045751633986928, + "acc_norm_stderr": 0.02799672318063145 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.6816720257234726, + "acc_stderr": 0.026457225067811032, + "acc_norm": 0.6816720257234726, + "acc_norm_stderr": 0.026457225067811032 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.654320987654321, + "acc_stderr": 0.02646248777700187, + "acc_norm": 0.654320987654321, + "acc_norm_stderr": 0.02646248777700187 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.450354609929078, + "acc_stderr": 0.029680105565029036, + "acc_norm": 0.450354609929078, + "acc_norm_stderr": 0.029680105565029036 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.4661016949152542, + "acc_stderr": 0.012740853872949823, + "acc_norm": 0.4661016949152542, + "acc_norm_stderr": 0.012740853872949823 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.5514705882352942, + "acc_stderr": 0.0302114796091216, + "acc_norm": 0.5514705882352942, + "acc_norm_stderr": 0.0302114796091216 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.6241830065359477, + "acc_stderr": 0.019594021136577443, + "acc_norm": 0.6241830065359477, + "acc_norm_stderr": 0.019594021136577443 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.7090909090909091, + "acc_stderr": 0.04350271442923243, + "acc_norm": 0.7090909090909091, + "acc_norm_stderr": 0.04350271442923243 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.6571428571428571, + "acc_stderr": 0.030387262919547724, + "acc_norm": 0.6571428571428571, + "acc_norm_stderr": 0.030387262919547724 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.8059701492537313, + "acc_stderr": 0.027962677604768907, + "acc_norm": 0.8059701492537313, + "acc_norm_stderr": 0.027962677604768907 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.85, + "acc_stderr": 0.0358870281282637, + "acc_norm": 0.85, + "acc_norm_stderr": 0.0358870281282637 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.4819277108433735, + "acc_stderr": 0.038899512528272166, + "acc_norm": 0.4819277108433735, + "acc_norm_stderr": 0.038899512528272166 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.8011695906432749, + "acc_stderr": 0.030611116557432528, + "acc_norm": 0.8011695906432749, + "acc_norm_stderr": 0.030611116557432528 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.2864137086903305, + "mc1_stderr": 0.015826142439502353, + "mc2": 0.44228547305989546, + "mc2_stderr": 0.01422728533131314 + }, + "all": { + "acc": 0.5862112983530949, + "acc_stderr": 0.03411054871660542, + "acc_norm": 0.5900075309564006, + "acc_norm_stderr": 0.03408724699985061, + 
"mc1": 0.2864137086903305, + "mc1_stderr": 0.015826142439502353, + "mc2": 0.44228547305989546, + "mc2_stderr": 0.01422728533131314 + } + }, + "versions": { + "harness|arc:challenge|25": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "all": 0 + }, + "config": { + "model_name": "digitous/Alpacino30b", + "model_sha": "300bc5f3dc129a3d17adf059394e381eff7fbd55", + "model_dtype": "torch.float16", + "lighteval_sha": "43cff840721bd0214adb4e29236a5e2ca1813937", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null + }, + "task_config": { + "harness|arc:challenge": "LM Harness task", + "harness|hellaswag": "LM Harness 
task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + 
"harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task" + }, + "hashes": { + "harness|arc:challenge|25": { + "hash_examples": "fb8c51b1872daeda", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "61571bf68d6d89aa", + "hash_cont_tokens": "8210decc6ff6f7df" + }, + "harness|hellaswag|10": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "29906669b1c7054a", + "hash_cont_tokens": "b3b9e9017afa63af" + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "c54ff61ad0273dd7", + "hash_cont_tokens": "50421e30bef398f9" + }, + "harness|hendrycksTest-anatomy|5": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "be31a1e22aef5f90", + "hash_cont_tokens": "f11971a765cb609f" + }, + "harness|hendrycksTest-astronomy|5": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "277a7b1fad566940", + "hash_cont_tokens": "bf30e5d3f48250cb" + }, + "harness|hendrycksTest-business_ethics|5": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "ba552605bc116de5", + "hash_cont_tokens": "bc1dd9b2d995eb61" + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "428c7563d0b98ab9", + "hash_cont_tokens": "890a119624b3b935" + }, + "harness|hendrycksTest-college_biology|5": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "da036601573942e2", + "hash_cont_tokens": "875cde3af7a0ee14" + }, + "harness|hendrycksTest-college_chemistry|5": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "94e0196d6aded13d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "harness|hendrycksTest-college_computer_science|5": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "6e4d0f4a8d36690b", + "hash_cont_tokens": "ffc0fe414cdc4a83" + }, + "harness|hendrycksTest-college_mathematics|5": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "614054d17109a25d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "harness|hendrycksTest-college_medicine|5": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "1d633b3cc0524ba8", + "hash_cont_tokens": "1f88b00d41957d82" + }, + "harness|hendrycksTest-college_physics|5": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "5421d9a1af86cbd4", + "hash_cont_tokens": "f7b8097afc16a47c" + }, + "harness|hendrycksTest-computer_security|5": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "5e6b70ecb333cf18", + "hash_cont_tokens": "50421e30bef398f9" + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "c2ef11a87264ceed", + "hash_cont_tokens": "aa0e8bc655f2f641" + }, + "harness|hendrycksTest-econometrics|5": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + 
"hash_input_tokens": "ecaccd912a4c3978", + "hash_cont_tokens": "bfb7e3c3c88313f1" + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "1590c84291399be8", + "hash_cont_tokens": "2425a3f084a591ef" + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "3269597f715b0da1", + "hash_cont_tokens": "f52691aef15a407b" + }, + "harness|hendrycksTest-formal_logic|5": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "a2800d20f3ab8d7c", + "hash_cont_tokens": "f515d598d9c21263" + }, + "harness|hendrycksTest-global_facts|5": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "94ed44b3772505ad", + "hash_cont_tokens": "50421e30bef398f9" + }, + "harness|hendrycksTest-high_school_biology|5": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "24423acb928db768", + "hash_cont_tokens": "bd85a4156a3613ee" + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "831ff35c474e5cef", + "hash_cont_tokens": "a95c97af1c14e068" + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "8c34e0f2bda77358", + "hash_cont_tokens": "8abfedef914e33c9" + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "f1f73dd687da18d7", + "hash_cont_tokens": "674fc454bdc5ac93" + }, + "harness|hendrycksTest-high_school_geography|5": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "7c5547c7da5bc793", + "hash_cont_tokens": "03a5012b916274ea" + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "f62991cb6a496b05", + "hash_cont_tokens": "a83effb8f76b7d7c" + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "4cef2aff6e3d59ed", + "hash_cont_tokens": "c583432ad27fcfe0" + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "6e2577ea4082ed2b", + "hash_cont_tokens": "24f5dc613660300b" + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "c5fc9aeb1079c8e4", + "hash_cont_tokens": "f47f041de50333b9" + }, + "harness|hendrycksTest-high_school_physics|5": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "555fc385cffa84ca", + "hash_cont_tokens": "ba2efcd283e938cc" + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "febd23cbf9973b7f", + "hash_cont_tokens": "942069cd363844d9" + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hash_examples": 
"2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "424b02981230ee83", + "hash_cont_tokens": "955ed42b6f7fa019" + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "50c9ff438c85a69e", + "hash_cont_tokens": "cdd0b3dc06d933e5" + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "054824cc474caef5", + "hash_cont_tokens": "9a864184946033ac" + }, + "harness|hendrycksTest-human_aging|5": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "541a75f071dcf579", + "hash_cont_tokens": "142a4a8a1138a214" + }, + "harness|hendrycksTest-human_sexuality|5": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "04269e5c5a257dd9", + "hash_cont_tokens": "bc54813e809b796d" + }, + "harness|hendrycksTest-international_law|5": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "d93ba9d9d38e4397", + "hash_cont_tokens": "dc45b45fcda18e5d" + }, + "harness|hendrycksTest-jurisprudence|5": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "9eeaccd2698b4f5a", + "hash_cont_tokens": "e3a8cd951b6e3469" + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "b4f08f544f2b7576", + "hash_cont_tokens": "1e80dbd30f6453d5" + }, + "harness|hendrycksTest-machine_learning|5": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "900c2a51f1174b9f", + "hash_cont_tokens": "9b37da7777378ca9" + }, + "harness|hendrycksTest-management|5": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "6b36efb4689c6eca", + "hash_cont_tokens": "a01d6d39a83c4597" + }, + "harness|hendrycksTest-marketing|5": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "2aaac78a0cfed47a", + "hash_cont_tokens": "6aeaed4d823c98aa" + }, + "harness|hendrycksTest-medical_genetics|5": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "886ca823b41c094a", + "hash_cont_tokens": "50421e30bef398f9" + }, + "harness|hendrycksTest-miscellaneous|5": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "72fd71de7675e7d0", + "hash_cont_tokens": "9b0ab02a64603081" + }, + "harness|hendrycksTest-moral_disputes|5": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "f3ca0dd8e7a1eb09", + "hash_cont_tokens": "8badf768f7b0467a" + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "3e793631e951f23c", + "hash_cont_tokens": "32ae620376b2bbba" + }, + "harness|hendrycksTest-nutrition|5": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "59753c2144ea93af", + "hash_cont_tokens": "3071def75bacc404" + }, + "harness|hendrycksTest-philosophy|5": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + 
"hash_input_tokens": "bd8d3dbed15a8c34", + "hash_cont_tokens": "9f6ff69d23a48783" + }, + "harness|hendrycksTest-prehistory|5": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "3573cd87facbb7c5", + "hash_cont_tokens": "de469d2b981e32a3" + }, + "harness|hendrycksTest-professional_accounting|5": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "17e721bc1a7cbb47", + "hash_cont_tokens": "c46f74d2dfc7b13b" + }, + "harness|hendrycksTest-professional_law|5": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "9178e10bd0763ec4", + "hash_cont_tokens": "2e590029ef41fbcd" + }, + "harness|hendrycksTest-professional_medicine|5": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "f5a22012a54f70ea", + "hash_cont_tokens": "fe35cfa9c6ca802e" + }, + "harness|hendrycksTest-professional_psychology|5": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "0dfb73a8eb3f692c", + "hash_cont_tokens": "f020fbddf72c8652" + }, + "harness|hendrycksTest-public_relations|5": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "1710c6ba4c9f3cbd", + "hash_cont_tokens": "568f585a259965c1" + }, + "harness|hendrycksTest-security_studies|5": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "d49711415961ced7", + "hash_cont_tokens": "cc6fd7cccd64cd5d" + }, + "harness|hendrycksTest-sociology|5": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "828999f7624cbe7e", + "hash_cont_tokens": "c3a3bdfd177eed5b" + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "42054621e718dbee", + "hash_cont_tokens": "2568d0e8e36fa959" + }, + "harness|hendrycksTest-virology|5": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "6c4f0aa4dc859c04", + "hash_cont_tokens": "926cf60b0891f374" + }, + "harness|hendrycksTest-world_religions|5": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "6c75d44e092ff24f", + "hash_cont_tokens": "c525a5de974c1ea3" + }, + "harness|truthfulqa:mc|0": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "2738d7ed7075faa7", + "hash_cont_tokens": "c014154380b74b9e" + } + } +} \ No newline at end of file diff --git a/eval-results/digitous/Alpacino30b/results_2023-09-17T12-47-23.707315.json b/eval-results/digitous/Alpacino30b/results_2023-09-17T12-47-23.707315.json new file mode 100644 index 0000000000000000000000000000000000000000..d88be64a2bebafe758fcbcbd054d8a26b6f9fe75 --- /dev/null +++ b/eval-results/digitous/Alpacino30b/results_2023-09-17T12-47-23.707315.json @@ -0,0 +1,107 @@ +{ + "config_general": { + "model_name": "digitous/Alpacino30b", + "model_sha": "300bc5f3dc129a3d17adf059394e381eff7fbd55", + "model_size": "60.65 GB", + "model_dtype": "torch.float16", + "lighteval_sha": "0f318ecf002208468154899217b3ba7c6ae09374", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|drop|3": { + "em": 
0.001363255033557047, + "em_stderr": 0.00037786091964607055, + "f1": 0.06650901845637598, + "f1_stderr": 0.0014161819077838128 + }, + "harness|gsm8k|5": { + "acc": 0.15769522365428354, + "acc_stderr": 0.01003890157506138 + }, + "harness|winogrande|5": { + "acc": 0.797947908445146, + "acc_stderr": 0.011285013754047434 + }, + "all": { + "em": 0.001363255033557047, + "em_stderr": 0.00037786091964607055, + "f1": 0.06650901845637598, + "f1_stderr": 0.0014161819077838128, + "acc": 0.47782156604971476, + "acc_stderr": 0.010661957664554408 + } + }, + "versions": { + "harness|drop|3": 1, + "harness|gsm8k|5": 0, + "harness|winogrande|5": 0, + "all": 0 + }, + "config_tasks": { + "harness|drop": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|drop|3": { + "hashes": { + "hash_examples": "1d27416e8324e9a3", + "hash_full_prompts": "a5513ff9a741b385", + "hash_input_tokens": "61b608e0b5ceed76", + "hash_cont_tokens": "633a25710730bafa" + }, + "truncated": 1263, + "non-truncated": 8273, + "padded": 0, + "non-padded": 9536, + "effective_few_shots": 3.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "bda342e47b5099b2", + "hash_cont_tokens": "41e86ec4bc3315bf" + }, + "truncated": 0, + "non-truncated": 1319, + "padded": 0, + "non-padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "c0bedf98cb040854", + "hash_cont_tokens": "f08975ad6f2d5864" + }, + "truncated": 0, + "non-truncated": 2534, + "padded": 2432, + "non-padded": 102, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "9b4d8993161e637d", + "hash_full_prompts": "08215e527b7e60a5", + "hash_input_tokens": "80afe720f936f8d2", + "hash_cont_tokens": "9d1572ae09517235" + }, + "total_evaluation_time_secondes": "20718.29528069496", + "truncated": 1263, + "non-truncated": 12126, + "padded": 2432, + "non-padded": 10957, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/digitous/GPT-R/results_2023-07-19T20-10-48.990479.json b/eval-results/digitous/GPT-R/results_2023-07-19T20-10-48.990479.json new file mode 100644 index 0000000000000000000000000000000000000000..0a5d48fe8ae50bdef4f193072e94959e742ded09 --- /dev/null +++ b/eval-results/digitous/GPT-R/results_2023-07-19T20-10-48.990479.json @@ -0,0 +1,871 @@ +{ + "results": { + "harness|arc:challenge|25": { + "acc": 0.38310580204778155, + "acc_stderr": 0.014206472661672881, + "acc_norm": 0.4121160409556314, + "acc_norm_stderr": 0.0143839153022254 + }, + "harness|hellaswag|10": { + "acc": 0.4980083648675563, + "acc_stderr": 0.004989741826250389, + "acc_norm": 0.6688906592312288, + "acc_norm_stderr": 0.00469650510121741 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.32, + "acc_stderr": 0.046882617226215034, + "acc_norm": 0.32, + "acc_norm_stderr": 0.046882617226215034 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.4666666666666667, + "acc_stderr": 0.043097329010363554, + "acc_norm": 0.4666666666666667, + "acc_norm_stderr": 0.043097329010363554 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.4934210526315789, + "acc_stderr": 0.040685900502249704, + "acc_norm": 0.4934210526315789, + 
"acc_norm_stderr": 0.040685900502249704 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.33, + "acc_stderr": 0.047258156262526045, + "acc_norm": 0.33, + "acc_norm_stderr": 0.047258156262526045 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.3886792452830189, + "acc_stderr": 0.03000048544867599, + "acc_norm": 0.3886792452830189, + "acc_norm_stderr": 0.03000048544867599 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.3472222222222222, + "acc_stderr": 0.039812405437178615, + "acc_norm": 0.3472222222222222, + "acc_norm_stderr": 0.039812405437178615 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.35, + "acc_stderr": 0.04793724854411022, + "acc_norm": 0.35, + "acc_norm_stderr": 0.04793724854411022 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.26, + "acc_stderr": 0.04408440022768077, + "acc_norm": 0.26, + "acc_norm_stderr": 0.04408440022768077 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.38, + "acc_stderr": 0.04878317312145633, + "acc_norm": 0.38, + "acc_norm_stderr": 0.04878317312145633 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.34104046242774566, + "acc_stderr": 0.03614665424180826, + "acc_norm": 0.34104046242774566, + "acc_norm_stderr": 0.03614665424180826 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.27450980392156865, + "acc_stderr": 0.04440521906179326, + "acc_norm": 0.27450980392156865, + "acc_norm_stderr": 0.04440521906179326 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.47, + "acc_stderr": 0.05016135580465919, + "acc_norm": 0.47, + "acc_norm_stderr": 0.05016135580465919 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.32340425531914896, + "acc_stderr": 0.03057944277361034, + "acc_norm": 0.32340425531914896, + "acc_norm_stderr": 0.03057944277361034 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.32456140350877194, + "acc_stderr": 0.04404556157374767, + "acc_norm": 0.32456140350877194, + "acc_norm_stderr": 0.04404556157374767 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.3931034482758621, + "acc_stderr": 0.040703290137070705, + "acc_norm": 0.3931034482758621, + "acc_norm_stderr": 0.040703290137070705 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.2962962962962963, + "acc_stderr": 0.023517294335963283, + "acc_norm": 0.2962962962962963, + "acc_norm_stderr": 0.023517294335963283 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.2222222222222222, + "acc_stderr": 0.037184890068181146, + "acc_norm": 0.2222222222222222, + "acc_norm_stderr": 0.037184890068181146 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.32, + "acc_stderr": 0.046882617226215034, + "acc_norm": 0.32, + "acc_norm_stderr": 0.046882617226215034 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.38387096774193546, + "acc_stderr": 0.027666182075539635, + "acc_norm": 0.38387096774193546, + "acc_norm_stderr": 0.027666182075539635 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.3251231527093596, + "acc_stderr": 0.032957975663112704, + "acc_norm": 0.3251231527093596, + "acc_norm_stderr": 0.032957975663112704 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.32, + "acc_stderr": 0.046882617226215034, + "acc_norm": 0.32, + "acc_norm_stderr": 0.046882617226215034 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.34545454545454546, + "acc_stderr": 0.037131580674819115, + "acc_norm": 
0.34545454545454546, + "acc_norm_stderr": 0.037131580674819115 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.4292929292929293, + "acc_stderr": 0.03526552724601199, + "acc_norm": 0.4292929292929293, + "acc_norm_stderr": 0.03526552724601199 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.46113989637305697, + "acc_stderr": 0.03597524411734578, + "acc_norm": 0.46113989637305697, + "acc_norm_stderr": 0.03597524411734578 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.3641025641025641, + "acc_stderr": 0.02439667298509478, + "acc_norm": 0.3641025641025641, + "acc_norm_stderr": 0.02439667298509478 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.25555555555555554, + "acc_stderr": 0.026593939101844065, + "acc_norm": 0.25555555555555554, + "acc_norm_stderr": 0.026593939101844065 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.36554621848739494, + "acc_stderr": 0.031282177063684614, + "acc_norm": 0.36554621848739494, + "acc_norm_stderr": 0.031282177063684614 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.32450331125827814, + "acc_stderr": 0.038227469376587525, + "acc_norm": 0.32450331125827814, + "acc_norm_stderr": 0.038227469376587525 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.3541284403669725, + "acc_stderr": 0.020504729013829125, + "acc_norm": 0.3541284403669725, + "acc_norm_stderr": 0.020504729013829125 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.2777777777777778, + "acc_stderr": 0.03054674526495318, + "acc_norm": 0.2777777777777778, + "acc_norm_stderr": 0.03054674526495318 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.3627450980392157, + "acc_stderr": 0.03374499356319354, + "acc_norm": 0.3627450980392157, + "acc_norm_stderr": 0.03374499356319354 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.35443037974683544, + "acc_stderr": 0.0311373042971858, + "acc_norm": 0.35443037974683544, + "acc_norm_stderr": 0.0311373042971858 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.4080717488789238, + "acc_stderr": 0.03298574607842821, + "acc_norm": 0.4080717488789238, + "acc_norm_stderr": 0.03298574607842821 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.3816793893129771, + "acc_stderr": 0.04260735157644561, + "acc_norm": 0.3816793893129771, + "acc_norm_stderr": 0.04260735157644561 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.5041322314049587, + "acc_stderr": 0.04564198767432754, + "acc_norm": 0.5041322314049587, + "acc_norm_stderr": 0.04564198767432754 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.3611111111111111, + "acc_stderr": 0.04643454608906275, + "acc_norm": 0.3611111111111111, + "acc_norm_stderr": 0.04643454608906275 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.37423312883435583, + "acc_stderr": 0.03802068102899616, + "acc_norm": 0.37423312883435583, + "acc_norm_stderr": 0.03802068102899616 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.25892857142857145, + "acc_stderr": 0.041577515398656284, + "acc_norm": 0.25892857142857145, + "acc_norm_stderr": 0.041577515398656284 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.3592233009708738, + "acc_stderr": 0.047504583990416946, + "acc_norm": 0.3592233009708738, + "acc_norm_stderr": 0.047504583990416946 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.42735042735042733, + "acc_stderr": 
0.03240847393516327, + "acc_norm": 0.42735042735042733, + "acc_norm_stderr": 0.03240847393516327 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.41, + "acc_stderr": 0.04943110704237102, + "acc_norm": 0.41, + "acc_norm_stderr": 0.04943110704237102 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.3780332056194125, + "acc_stderr": 0.0173398444621046, + "acc_norm": 0.3780332056194125, + "acc_norm_stderr": 0.0173398444621046 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.36416184971098264, + "acc_stderr": 0.025906632631016124, + "acc_norm": 0.36416184971098264, + "acc_norm_stderr": 0.025906632631016124 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.2446927374301676, + "acc_stderr": 0.014378169884098398, + "acc_norm": 0.2446927374301676, + "acc_norm_stderr": 0.014378169884098398 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.42810457516339867, + "acc_stderr": 0.028332397483664274, + "acc_norm": 0.42810457516339867, + "acc_norm_stderr": 0.028332397483664274 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.3858520900321543, + "acc_stderr": 0.02764814959975147, + "acc_norm": 0.3858520900321543, + "acc_norm_stderr": 0.02764814959975147 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.38271604938271603, + "acc_stderr": 0.027044538138402612, + "acc_norm": 0.38271604938271603, + "acc_norm_stderr": 0.027044538138402612 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.2765957446808511, + "acc_stderr": 0.026684564340460997, + "acc_norm": 0.2765957446808511, + "acc_norm_stderr": 0.026684564340460997 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.30378096479791394, + "acc_stderr": 0.011745787720472458, + "acc_norm": 0.30378096479791394, + "acc_norm_stderr": 0.011745787720472458 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.3602941176470588, + "acc_stderr": 0.029163128570670736, + "acc_norm": 0.3602941176470588, + "acc_norm_stderr": 0.029163128570670736 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.315359477124183, + "acc_stderr": 0.018798086284886887, + "acc_norm": 0.315359477124183, + "acc_norm_stderr": 0.018798086284886887 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.43636363636363634, + "acc_stderr": 0.04750185058907297, + "acc_norm": 0.43636363636363634, + "acc_norm_stderr": 0.04750185058907297 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.4448979591836735, + "acc_stderr": 0.031814251181977865, + "acc_norm": 0.4448979591836735, + "acc_norm_stderr": 0.031814251181977865 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.5174129353233831, + "acc_stderr": 0.03533389234739245, + "acc_norm": 0.5174129353233831, + "acc_norm_stderr": 0.03533389234739245 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.5, + "acc_stderr": 0.050251890762960605, + "acc_norm": 0.5, + "acc_norm_stderr": 0.050251890762960605 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.3433734939759036, + "acc_stderr": 0.03696584317010601, + "acc_norm": 0.3433734939759036, + "acc_norm_stderr": 0.03696584317010601 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.4152046783625731, + "acc_stderr": 0.03779275945503201, + "acc_norm": 0.4152046783625731, + "acc_norm_stderr": 0.03779275945503201 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.22276621787025705, + "mc1_stderr": 0.014566506961396743, + "mc2": 0.34219171977394097, + "mc2_stderr": 0.014199441494649892 + }, + "all": { + "acc": 
0.36758450011334015, + "acc_stderr": 0.034864257501640405, + "acc_norm": 0.3709725091518405, + "acc_norm_stderr": 0.03486229489003904, + "mc1": 0.22276621787025705, + "mc1_stderr": 0.014566506961396743, + "mc2": 0.34219171977394097, + "mc2_stderr": 0.014199441494649892 + } + }, + "versions": { + "harness|arc:challenge|25": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "all": 0 + }, + "config": { + "model_name": "digitous/GPT-R", + "model_sha": "92b955a3ff74aa577fa0d8517dfc314847ef60af", + "model_dtype": "torch.float16", + "lighteval_sha": "43cff840721bd0214adb4e29236a5e2ca1813937", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, 
+ "max_samples": null + }, + "task_config": { + "harness|arc:challenge": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness 
task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task" + }, + "hashes": { + "harness|arc:challenge|25": { + "hash_examples": "fb8c51b1872daeda", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "1b78325b154497a6", + "hash_cont_tokens": "c6e2e25e2b25a621" + }, + "harness|hellaswag|10": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "97de5fb5652ec7fa", + "hash_cont_tokens": "8ad5f1a3e4068f36" + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "38f6980885e34dfd", + "hash_cont_tokens": "844bd0bf669e8136" + }, + "harness|hendrycksTest-anatomy|5": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "3ed9431cd09b2a53", + "hash_cont_tokens": "aa3ffb1a6e4356f5" + }, + "harness|hendrycksTest-astronomy|5": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "a79fd75ecff4dacc", + "hash_cont_tokens": "ca7527d5bdfd389a" + }, + "harness|hendrycksTest-business_ethics|5": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "178d5666661bf5e1", + "hash_cont_tokens": "08a1fa6c8dde9a82" + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "c926698f7ff06973", + "hash_cont_tokens": "cd61f7de0830a75a" + }, + "harness|hendrycksTest-college_biology|5": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "242f772c5e78312a", + "hash_cont_tokens": "b0c14ed86adbcb56" + }, + "harness|hendrycksTest-college_chemistry|5": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "8502d8627d2d7aad", + "hash_cont_tokens": "844bd0bf669e8136" + }, + "harness|hendrycksTest-college_computer_science|5": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "8bf46ce3a98e6e3f", + "hash_cont_tokens": "3cf1924b14cbf906" + }, + "harness|hendrycksTest-college_mathematics|5": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "ff09ef7f164943cd", + "hash_cont_tokens": "d09bf08193410dfa" + }, + "harness|hendrycksTest-college_medicine|5": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "af38d1bbc0517ac5", + "hash_cont_tokens": "62bb469d2a319d91" + }, + "harness|hendrycksTest-college_physics|5": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "c4240f372187f487", + "hash_cont_tokens": "bf103c9a1f61ec12" + }, + "harness|hendrycksTest-computer_security|5": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "70a866a1c6ae11ae", + "hash_cont_tokens": "844bd0bf669e8136" + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "29b68a5b3f3afa5f", + "hash_cont_tokens": "ff5ca3d84bb47a0b" + }, 
+ "harness|hendrycksTest-econometrics|5": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "a4a0fc579875cdf9", + "hash_cont_tokens": "f3ed369e135c0e74" + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "e1c0ec634eb17ebd", + "hash_cont_tokens": "35bf6c0c1a7ee403" + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "542453ad0f99dacf", + "hash_cont_tokens": "e69647d0f0359a4e" + }, + "harness|hendrycksTest-formal_logic|5": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "dacff0458f665ef2", + "hash_cont_tokens": "2ef491ecaa0b411b" + }, + "harness|hendrycksTest-global_facts|5": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "61dec75d557c2e93", + "hash_cont_tokens": "844bd0bf669e8136" + }, + "harness|hendrycksTest-high_school_biology|5": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "d0afdf91820cacc8", + "hash_cont_tokens": "2f65e8345a68d860" + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "75cd47b5490da17b", + "hash_cont_tokens": "c3deabee1deab3a3" + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "6c6256000dbf914a", + "hash_cont_tokens": "ec161287ac6222f4" + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "3e24478a8854bd77", + "hash_cont_tokens": "c4f2565ca36881d5" + }, + "harness|hendrycksTest-high_school_geography|5": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "a4866b51f8a7a60e", + "hash_cont_tokens": "780e569058de22be" + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "90f755f89d9fdf5e", + "hash_cont_tokens": "9da45062757ae791" + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "fb590ff6d9d11883", + "hash_cont_tokens": "8f5c8baf02161f10" + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "551dbc75535ad2b8", + "hash_cont_tokens": "fdea101837ab4409" + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "d86fdf5706ec717c", + "hash_cont_tokens": "985403b262df21a4" + }, + "harness|hendrycksTest-high_school_physics|5": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "a81bca26abd92c41", + "hash_cont_tokens": "56be0c12b78c81a3" + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "9c10077b5cda495b", + 
"hash_cont_tokens": "f524cf6fe64b2a7f" + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "092923836e135996", + "hash_cont_tokens": "421b3dc903711e3d" + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "4ab213491f557f31", + "hash_cont_tokens": "eab825cf8fbdd085" + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "2a04fb615e6717ea", + "hash_cont_tokens": "e1610a0b694e7b3a" + }, + "harness|hendrycksTest-human_aging|5": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "39da19ee58ce07e6", + "hash_cont_tokens": "38eafdb22e9fca11" + }, + "harness|hendrycksTest-human_sexuality|5": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "f7e0441ab1c223e0", + "hash_cont_tokens": "11de075f88fc7cd2" + }, + "harness|hendrycksTest-international_law|5": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "119859c5b8103d0b", + "hash_cont_tokens": "0229c63f045574c2" + }, + "harness|hendrycksTest-jurisprudence|5": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "6ec4910e741606cb", + "hash_cont_tokens": "5c77c6f472688075" + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "96d8b2554f777e3a", + "hash_cont_tokens": "25a46284b3589e0d" + }, + "harness|hendrycksTest-machine_learning|5": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "249811a7d891a411", + "hash_cont_tokens": "d11f2c877fe691dc" + }, + "harness|hendrycksTest-management|5": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "e54df495ffeb4f92", + "hash_cont_tokens": "d37808f586a9e9b5" + }, + "harness|hendrycksTest-marketing|5": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "e9110fe64f420eb5", + "hash_cont_tokens": "95faf210efa02f90" + }, + "harness|hendrycksTest-medical_genetics|5": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "743df5701590c1c5", + "hash_cont_tokens": "844bd0bf669e8136" + }, + "harness|hendrycksTest-miscellaneous|5": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "4a20a40ea36bad2d", + "hash_cont_tokens": "ef1ae838a09a7521" + }, + "harness|hendrycksTest-moral_disputes|5": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "10886977e5516586", + "hash_cont_tokens": "05c35d0e7dd2c7d4" + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "66f56ab7c3b9d662", + "hash_cont_tokens": "f1e9e326e9540108" + }, + "harness|hendrycksTest-nutrition|5": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "c05c54560499ea35", + "hash_cont_tokens": "027ac34198453c9e" + }, + 
"harness|hendrycksTest-philosophy|5": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "9639c3d92ff98a28", + "hash_cont_tokens": "dddff9925c9b675a" + }, + "harness|hendrycksTest-prehistory|5": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "91e98834c3a8d8d9", + "hash_cont_tokens": "030e5bb46551865c" + }, + "harness|hendrycksTest-professional_accounting|5": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "569fa47691c73088", + "hash_cont_tokens": "42b23299e8bae480" + }, + "harness|hendrycksTest-professional_law|5": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "999e8c7cf55b590c", + "hash_cont_tokens": "a2de48df0afbaff7" + }, + "harness|hendrycksTest-professional_medicine|5": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "cb68733b835e69f0", + "hash_cont_tokens": "33dc7eccd5de31ae" + }, + "harness|hendrycksTest-professional_psychology|5": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "3aa766c029099569", + "hash_cont_tokens": "2a666dc39f1f52ac" + }, + "harness|hendrycksTest-public_relations|5": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "87b924f88832986f", + "hash_cont_tokens": "cf3600a50782c6c5" + }, + "harness|hendrycksTest-security_studies|5": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "c2b75c24a925a416", + "hash_cont_tokens": "2e9916279a4cae95" + }, + "harness|hendrycksTest-sociology|5": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "fb555df6139eb2c8", + "hash_cont_tokens": "555f7a55738bbf37" + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "56cf1eebb25eccb1", + "hash_cont_tokens": "844bd0bf669e8136" + }, + "harness|hendrycksTest-virology|5": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "c6affac16ec860be", + "hash_cont_tokens": "30d4fa4828c5468f" + }, + "harness|hendrycksTest-world_religions|5": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "d2c5da5a69a6312e", + "hash_cont_tokens": "984061eb58124367" + }, + "harness|truthfulqa:mc|0": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "21ee2f46c9c3649e", + "hash_cont_tokens": "f41d0880e9a23f4e" + } + } +} \ No newline at end of file diff --git a/eval-results/digitous/GPT-R/results_2023-10-21T16-59-10.441941.json b/eval-results/digitous/GPT-R/results_2023-10-21T16-59-10.441941.json new file mode 100644 index 0000000000000000000000000000000000000000..7f0caaf4c80df9440e8f12fe43723d4c68d53152 --- /dev/null +++ b/eval-results/digitous/GPT-R/results_2023-10-21T16-59-10.441941.json @@ -0,0 +1,107 @@ +{ + "config_general": { + "model_name": "digitous/GPT-R", + "model_sha": "92b955a3ff74aa577fa0d8517dfc314847ef60af", + "model_size": "11.28 GB", + "model_dtype": "torch.float16", + "lighteval_sha": "0f318ecf002208468154899217b3ba7c6ae09374", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + 
"max_samples": null, + "job_id": "" + }, + "results": { + "harness|drop|3": { + "em": 0.0012583892617449664, + "em_stderr": 0.00036305608931189593, + "f1": 0.05138632550335586, + "f1_stderr": 0.0012400453401352261 + }, + "harness|gsm8k|5": { + "acc": 0.01592115238817286, + "acc_stderr": 0.0034478192723890067 + }, + "harness|winogrande|5": { + "acc": 0.6440410418310971, + "acc_stderr": 0.013456740656273952 + }, + "all": { + "em": 0.0012583892617449664, + "em_stderr": 0.00036305608931189593, + "f1": 0.05138632550335586, + "f1_stderr": 0.0012400453401352261, + "acc": 0.32998109710963497, + "acc_stderr": 0.00845227996433148 + } + }, + "versions": { + "harness|drop|3": 1, + "harness|gsm8k|5": 0, + "harness|winogrande|5": 0, + "all": 0 + }, + "config_tasks": { + "harness|drop": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|drop|3": { + "hashes": { + "hash_examples": "1d27416e8324e9a3", + "hash_full_prompts": "a5513ff9a741b385", + "hash_input_tokens": "f21277d2c2d2e06c", + "hash_cont_tokens": "5ccc05487a4356b1" + }, + "truncated": 382, + "non-truncated": 9154, + "padded": 0, + "non-padded": 9536, + "effective_few_shots": 3.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "3ab9b4c5105492a3", + "hash_cont_tokens": "523f0f0f3e62faf1" + }, + "truncated": 0, + "non-truncated": 1319, + "padded": 0, + "non-padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "84cacac1590bb0a5", + "hash_cont_tokens": "64ca3ed9b5dacc6e" + }, + "truncated": 0, + "non-truncated": 2534, + "padded": 2426, + "non-padded": 108, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "9b4d8993161e637d", + "hash_full_prompts": "08215e527b7e60a5", + "hash_input_tokens": "26cd3631535039d0", + "hash_cont_tokens": "44fa059b7e7c4a1b" + }, + "total_evaluation_time_secondes": "9902.361578941345", + "truncated": 382, + "non-truncated": 13007, + "padded": 2426, + "non-padded": 10963, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/digitous/Janin-GPTJ/results_2023-07-19T20-10-14.286796.json b/eval-results/digitous/Janin-GPTJ/results_2023-07-19T20-10-14.286796.json new file mode 100644 index 0000000000000000000000000000000000000000..22174072618ce621366f39e937094ad7b2cd72ef --- /dev/null +++ b/eval-results/digitous/Janin-GPTJ/results_2023-07-19T20-10-14.286796.json @@ -0,0 +1,871 @@ +{ + "results": { + "harness|arc:challenge|25": { + "acc": 0.36006825938566556, + "acc_stderr": 0.014027516814585186, + "acc_norm": 0.4087030716723549, + "acc_norm_stderr": 0.014365750345427005 + }, + "harness|hellaswag|10": { + "acc": 0.4953196574387572, + "acc_stderr": 0.004989562798280521, + "acc_norm": 0.6728739294961164, + "acc_norm_stderr": 0.004682048906622314 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.24, + "acc_stderr": 0.04292346959909283, + "acc_norm": 0.24, + "acc_norm_stderr": 0.04292346959909283 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.28888888888888886, + "acc_stderr": 0.0391545063041425, + "acc_norm": 0.28888888888888886, + "acc_norm_stderr": 0.0391545063041425 + }, + "harness|hendrycksTest-astronomy|5": { + 
"acc": 0.2894736842105263, + "acc_stderr": 0.03690677986137283, + "acc_norm": 0.2894736842105263, + "acc_norm_stderr": 0.03690677986137283 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.29, + "acc_stderr": 0.045604802157206824, + "acc_norm": 0.29, + "acc_norm_stderr": 0.045604802157206824 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.3018867924528302, + "acc_stderr": 0.028254200344438655, + "acc_norm": 0.3018867924528302, + "acc_norm_stderr": 0.028254200344438655 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.2638888888888889, + "acc_stderr": 0.03685651095897532, + "acc_norm": 0.2638888888888889, + "acc_norm_stderr": 0.03685651095897532 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.15, + "acc_stderr": 0.03588702812826371, + "acc_norm": 0.15, + "acc_norm_stderr": 0.03588702812826371 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.23, + "acc_stderr": 0.04229525846816508, + "acc_norm": 0.23, + "acc_norm_stderr": 0.04229525846816508 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.31, + "acc_stderr": 0.04648231987117316, + "acc_norm": 0.31, + "acc_norm_stderr": 0.04648231987117316 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.27167630057803466, + "acc_stderr": 0.03391750322321659, + "acc_norm": 0.27167630057803466, + "acc_norm_stderr": 0.03391750322321659 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.22549019607843138, + "acc_stderr": 0.041583075330832865, + "acc_norm": 0.22549019607843138, + "acc_norm_stderr": 0.041583075330832865 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.34, + "acc_stderr": 0.04760952285695235, + "acc_norm": 0.34, + "acc_norm_stderr": 0.04760952285695235 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.33617021276595743, + "acc_stderr": 0.030881618520676942, + "acc_norm": 0.33617021276595743, + "acc_norm_stderr": 0.030881618520676942 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.3333333333333333, + "acc_stderr": 0.044346007015849245, + "acc_norm": 0.3333333333333333, + "acc_norm_stderr": 0.044346007015849245 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.2689655172413793, + "acc_stderr": 0.03695183311650232, + "acc_norm": 0.2689655172413793, + "acc_norm_stderr": 0.03695183311650232 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.2671957671957672, + "acc_stderr": 0.022789673145776564, + "acc_norm": 0.2671957671957672, + "acc_norm_stderr": 0.022789673145776564 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.23015873015873015, + "acc_stderr": 0.03764950879790605, + "acc_norm": 0.23015873015873015, + "acc_norm_stderr": 0.03764950879790605 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.29, + "acc_stderr": 0.04560480215720684, + "acc_norm": 0.29, + "acc_norm_stderr": 0.04560480215720684 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.22580645161290322, + "acc_stderr": 0.023785577884181012, + "acc_norm": 0.22580645161290322, + "acc_norm_stderr": 0.023785577884181012 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.2413793103448276, + "acc_stderr": 0.030108330718011625, + "acc_norm": 0.2413793103448276, + "acc_norm_stderr": 0.030108330718011625 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.24, + "acc_stderr": 0.042923469599092816, + "acc_norm": 0.24, + "acc_norm_stderr": 0.042923469599092816 + }, + 
"harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.2606060606060606, + "acc_stderr": 0.03427743175816524, + "acc_norm": 0.2606060606060606, + "acc_norm_stderr": 0.03427743175816524 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.2222222222222222, + "acc_stderr": 0.029620227874790465, + "acc_norm": 0.2222222222222222, + "acc_norm_stderr": 0.029620227874790465 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.22279792746113988, + "acc_stderr": 0.03003114797764154, + "acc_norm": 0.22279792746113988, + "acc_norm_stderr": 0.03003114797764154 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.2717948717948718, + "acc_stderr": 0.022556551010132344, + "acc_norm": 0.2717948717948718, + "acc_norm_stderr": 0.022556551010132344 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.2518518518518518, + "acc_stderr": 0.02646611753895991, + "acc_norm": 0.2518518518518518, + "acc_norm_stderr": 0.02646611753895991 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.25630252100840334, + "acc_stderr": 0.02835962087053395, + "acc_norm": 0.25630252100840334, + "acc_norm_stderr": 0.02835962087053395 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.2582781456953642, + "acc_stderr": 0.035737053147634576, + "acc_norm": 0.2582781456953642, + "acc_norm_stderr": 0.035737053147634576 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.24587155963302754, + "acc_stderr": 0.01846194096870845, + "acc_norm": 0.24587155963302754, + "acc_norm_stderr": 0.01846194096870845 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.19444444444444445, + "acc_stderr": 0.026991454502036733, + "acc_norm": 0.19444444444444445, + "acc_norm_stderr": 0.026991454502036733 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.28431372549019607, + "acc_stderr": 0.031660096793998116, + "acc_norm": 0.28431372549019607, + "acc_norm_stderr": 0.031660096793998116 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.2616033755274262, + "acc_stderr": 0.028609516716994934, + "acc_norm": 0.2616033755274262, + "acc_norm_stderr": 0.028609516716994934 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.32286995515695066, + "acc_stderr": 0.03138147637575499, + "acc_norm": 0.32286995515695066, + "acc_norm_stderr": 0.03138147637575499 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.22900763358778625, + "acc_stderr": 0.036853466317118506, + "acc_norm": 0.22900763358778625, + "acc_norm_stderr": 0.036853466317118506 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.2727272727272727, + "acc_stderr": 0.04065578140908705, + "acc_norm": 0.2727272727272727, + "acc_norm_stderr": 0.04065578140908705 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.3148148148148148, + "acc_stderr": 0.04489931073591312, + "acc_norm": 0.3148148148148148, + "acc_norm_stderr": 0.04489931073591312 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.24539877300613497, + "acc_stderr": 0.03380939813943354, + "acc_norm": 0.24539877300613497, + "acc_norm_stderr": 0.03380939813943354 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.3392857142857143, + "acc_stderr": 0.04493949068613539, + "acc_norm": 0.3392857142857143, + "acc_norm_stderr": 0.04493949068613539 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.20388349514563106, + "acc_stderr": 0.03989139859531772, + "acc_norm": 
0.20388349514563106, + "acc_norm_stderr": 0.03989139859531772 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.2777777777777778, + "acc_stderr": 0.029343114798094476, + "acc_norm": 0.2777777777777778, + "acc_norm_stderr": 0.029343114798094476 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.26, + "acc_stderr": 0.0440844002276808, + "acc_norm": 0.26, + "acc_norm_stderr": 0.0440844002276808 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.30268199233716475, + "acc_stderr": 0.016428781581749364, + "acc_norm": 0.30268199233716475, + "acc_norm_stderr": 0.016428781581749364 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.28901734104046245, + "acc_stderr": 0.02440517393578324, + "acc_norm": 0.28901734104046245, + "acc_norm_stderr": 0.02440517393578324 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.23798882681564246, + "acc_stderr": 0.014242630070574875, + "acc_norm": 0.23798882681564246, + "acc_norm_stderr": 0.014242630070574875 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.2777777777777778, + "acc_stderr": 0.025646863097137904, + "acc_norm": 0.2777777777777778, + "acc_norm_stderr": 0.025646863097137904 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.2765273311897106, + "acc_stderr": 0.025403832978179604, + "acc_norm": 0.2765273311897106, + "acc_norm_stderr": 0.025403832978179604 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.30246913580246915, + "acc_stderr": 0.02555765398186805, + "acc_norm": 0.30246913580246915, + "acc_norm_stderr": 0.02555765398186805 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.28368794326241137, + "acc_stderr": 0.02689170942834396, + "acc_norm": 0.28368794326241137, + "acc_norm_stderr": 0.02689170942834396 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.3109517601043025, + "acc_stderr": 0.011822252917799207, + "acc_norm": 0.3109517601043025, + "acc_norm_stderr": 0.011822252917799207 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.25735294117647056, + "acc_stderr": 0.02655651947004152, + "acc_norm": 0.25735294117647056, + "acc_norm_stderr": 0.02655651947004152 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.272875816993464, + "acc_stderr": 0.01802047414839358, + "acc_norm": 0.272875816993464, + "acc_norm_stderr": 0.01802047414839358 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.33636363636363636, + "acc_stderr": 0.04525393596302505, + "acc_norm": 0.33636363636363636, + "acc_norm_stderr": 0.04525393596302505 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.4122448979591837, + "acc_stderr": 0.0315123604467428, + "acc_norm": 0.4122448979591837, + "acc_norm_stderr": 0.0315123604467428 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.35323383084577115, + "acc_stderr": 0.03379790611796776, + "acc_norm": 0.35323383084577115, + "acc_norm_stderr": 0.03379790611796776 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.29, + "acc_stderr": 0.045604802157206845, + "acc_norm": 0.29, + "acc_norm_stderr": 0.045604802157206845 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.27710843373493976, + "acc_stderr": 0.03484331592680588, + "acc_norm": 0.27710843373493976, + "acc_norm_stderr": 0.03484331592680588 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.30994152046783624, + "acc_stderr": 0.035469769593931624, + "acc_norm": 0.30994152046783624, + "acc_norm_stderr": 0.035469769593931624 + }, + "harness|truthfulqa:mc|0": { + "mc1": 
0.211750305997552, + "mc1_stderr": 0.014302068353925612, + "mc2": 0.36245167469649814, + "mc2_stderr": 0.01346595361722477 + }, + "all": { + "acc": 0.2792504630285298, + "acc_stderr": 0.0324003365412133, + "acc_norm": 0.28308417632249677, + "acc_norm_stderr": 0.03240085721306387, + "mc1": 0.211750305997552, + "mc1_stderr": 0.014302068353925612, + "mc2": 0.36245167469649814, + "mc2_stderr": 0.01346595361722477 + } + }, + "versions": { + "harness|arc:challenge|25": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "all": 0 + }, + "config": { + "model_name": "digitous/Janin-GPTJ", + "model_sha": "a6773861798f2abea3849514aa6f60961518af9c", + "model_dtype": 
"torch.float16", + "lighteval_sha": "43cff840721bd0214adb4e29236a5e2ca1813937", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null + }, + "task_config": { + "harness|arc:challenge": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + 
"harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task" + }, + "hashes": { + "harness|arc:challenge|25": { + "hash_examples": "fb8c51b1872daeda", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "1b78325b154497a6", + "hash_cont_tokens": "c6e2e25e2b25a621" + }, + "harness|hellaswag|10": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "97de5fb5652ec7fa", + "hash_cont_tokens": "8ad5f1a3e4068f36" + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "38f6980885e34dfd", + "hash_cont_tokens": "844bd0bf669e8136" + }, + "harness|hendrycksTest-anatomy|5": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "3ed9431cd09b2a53", + "hash_cont_tokens": "aa3ffb1a6e4356f5" + }, + "harness|hendrycksTest-astronomy|5": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "a79fd75ecff4dacc", + "hash_cont_tokens": "ca7527d5bdfd389a" + }, + "harness|hendrycksTest-business_ethics|5": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "178d5666661bf5e1", + "hash_cont_tokens": "08a1fa6c8dde9a82" + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "c926698f7ff06973", + "hash_cont_tokens": "cd61f7de0830a75a" + }, + "harness|hendrycksTest-college_biology|5": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "242f772c5e78312a", + "hash_cont_tokens": "b0c14ed86adbcb56" + }, + "harness|hendrycksTest-college_chemistry|5": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "8502d8627d2d7aad", + "hash_cont_tokens": "844bd0bf669e8136" + }, + "harness|hendrycksTest-college_computer_science|5": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "8bf46ce3a98e6e3f", + "hash_cont_tokens": "3cf1924b14cbf906" + }, + "harness|hendrycksTest-college_mathematics|5": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "ff09ef7f164943cd", + "hash_cont_tokens": "d09bf08193410dfa" + }, + "harness|hendrycksTest-college_medicine|5": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "af38d1bbc0517ac5", + "hash_cont_tokens": "62bb469d2a319d91" + }, + "harness|hendrycksTest-college_physics|5": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "c4240f372187f487", + "hash_cont_tokens": "bf103c9a1f61ec12" + }, + "harness|hendrycksTest-computer_security|5": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "70a866a1c6ae11ae", + "hash_cont_tokens": "844bd0bf669e8136" + }, + 
"harness|hendrycksTest-conceptual_physics|5": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "29b68a5b3f3afa5f", + "hash_cont_tokens": "ff5ca3d84bb47a0b" + }, + "harness|hendrycksTest-econometrics|5": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "a4a0fc579875cdf9", + "hash_cont_tokens": "f3ed369e135c0e74" + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "e1c0ec634eb17ebd", + "hash_cont_tokens": "35bf6c0c1a7ee403" + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "542453ad0f99dacf", + "hash_cont_tokens": "e69647d0f0359a4e" + }, + "harness|hendrycksTest-formal_logic|5": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "dacff0458f665ef2", + "hash_cont_tokens": "2ef491ecaa0b411b" + }, + "harness|hendrycksTest-global_facts|5": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "61dec75d557c2e93", + "hash_cont_tokens": "844bd0bf669e8136" + }, + "harness|hendrycksTest-high_school_biology|5": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "d0afdf91820cacc8", + "hash_cont_tokens": "2f65e8345a68d860" + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "75cd47b5490da17b", + "hash_cont_tokens": "c3deabee1deab3a3" + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "6c6256000dbf914a", + "hash_cont_tokens": "ec161287ac6222f4" + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "3e24478a8854bd77", + "hash_cont_tokens": "c4f2565ca36881d5" + }, + "harness|hendrycksTest-high_school_geography|5": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "a4866b51f8a7a60e", + "hash_cont_tokens": "780e569058de22be" + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "90f755f89d9fdf5e", + "hash_cont_tokens": "9da45062757ae791" + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "fb590ff6d9d11883", + "hash_cont_tokens": "8f5c8baf02161f10" + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "551dbc75535ad2b8", + "hash_cont_tokens": "fdea101837ab4409" + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "d86fdf5706ec717c", + "hash_cont_tokens": "985403b262df21a4" + }, + "harness|hendrycksTest-high_school_physics|5": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "a81bca26abd92c41", + 
"hash_cont_tokens": "56be0c12b78c81a3" + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "9c10077b5cda495b", + "hash_cont_tokens": "f524cf6fe64b2a7f" + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "092923836e135996", + "hash_cont_tokens": "421b3dc903711e3d" + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "4ab213491f557f31", + "hash_cont_tokens": "eab825cf8fbdd085" + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "2a04fb615e6717ea", + "hash_cont_tokens": "e1610a0b694e7b3a" + }, + "harness|hendrycksTest-human_aging|5": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "39da19ee58ce07e6", + "hash_cont_tokens": "38eafdb22e9fca11" + }, + "harness|hendrycksTest-human_sexuality|5": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "f7e0441ab1c223e0", + "hash_cont_tokens": "11de075f88fc7cd2" + }, + "harness|hendrycksTest-international_law|5": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "119859c5b8103d0b", + "hash_cont_tokens": "0229c63f045574c2" + }, + "harness|hendrycksTest-jurisprudence|5": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "6ec4910e741606cb", + "hash_cont_tokens": "5c77c6f472688075" + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "96d8b2554f777e3a", + "hash_cont_tokens": "25a46284b3589e0d" + }, + "harness|hendrycksTest-machine_learning|5": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "249811a7d891a411", + "hash_cont_tokens": "d11f2c877fe691dc" + }, + "harness|hendrycksTest-management|5": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "e54df495ffeb4f92", + "hash_cont_tokens": "d37808f586a9e9b5" + }, + "harness|hendrycksTest-marketing|5": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "e9110fe64f420eb5", + "hash_cont_tokens": "95faf210efa02f90" + }, + "harness|hendrycksTest-medical_genetics|5": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "743df5701590c1c5", + "hash_cont_tokens": "844bd0bf669e8136" + }, + "harness|hendrycksTest-miscellaneous|5": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "4a20a40ea36bad2d", + "hash_cont_tokens": "ef1ae838a09a7521" + }, + "harness|hendrycksTest-moral_disputes|5": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "10886977e5516586", + "hash_cont_tokens": "05c35d0e7dd2c7d4" + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "66f56ab7c3b9d662", + "hash_cont_tokens": "f1e9e326e9540108" + }, + 
"harness|hendrycksTest-nutrition|5": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "c05c54560499ea35", + "hash_cont_tokens": "027ac34198453c9e" + }, + "harness|hendrycksTest-philosophy|5": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "9639c3d92ff98a28", + "hash_cont_tokens": "dddff9925c9b675a" + }, + "harness|hendrycksTest-prehistory|5": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "91e98834c3a8d8d9", + "hash_cont_tokens": "030e5bb46551865c" + }, + "harness|hendrycksTest-professional_accounting|5": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "569fa47691c73088", + "hash_cont_tokens": "42b23299e8bae480" + }, + "harness|hendrycksTest-professional_law|5": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "999e8c7cf55b590c", + "hash_cont_tokens": "a2de48df0afbaff7" + }, + "harness|hendrycksTest-professional_medicine|5": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "cb68733b835e69f0", + "hash_cont_tokens": "33dc7eccd5de31ae" + }, + "harness|hendrycksTest-professional_psychology|5": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "3aa766c029099569", + "hash_cont_tokens": "2a666dc39f1f52ac" + }, + "harness|hendrycksTest-public_relations|5": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "87b924f88832986f", + "hash_cont_tokens": "cf3600a50782c6c5" + }, + "harness|hendrycksTest-security_studies|5": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "c2b75c24a925a416", + "hash_cont_tokens": "2e9916279a4cae95" + }, + "harness|hendrycksTest-sociology|5": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "fb555df6139eb2c8", + "hash_cont_tokens": "555f7a55738bbf37" + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "56cf1eebb25eccb1", + "hash_cont_tokens": "844bd0bf669e8136" + }, + "harness|hendrycksTest-virology|5": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "c6affac16ec860be", + "hash_cont_tokens": "30d4fa4828c5468f" + }, + "harness|hendrycksTest-world_religions|5": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "d2c5da5a69a6312e", + "hash_cont_tokens": "984061eb58124367" + }, + "harness|truthfulqa:mc|0": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "21ee2f46c9c3649e", + "hash_cont_tokens": "f41d0880e9a23f4e" + } + } +} \ No newline at end of file diff --git a/eval-results/digitous/Janin-GPTJ/results_2023-10-13T01-41-17.922398.json b/eval-results/digitous/Janin-GPTJ/results_2023-10-13T01-41-17.922398.json new file mode 100644 index 0000000000000000000000000000000000000000..68db4d19d6255c4ec680ade77b80422df209b67a --- /dev/null +++ b/eval-results/digitous/Janin-GPTJ/results_2023-10-13T01-41-17.922398.json @@ -0,0 +1,107 @@ +{ + "config_general": { + "model_name": "digitous/Janin-GPTJ", + "model_sha": 
"a6773861798f2abea3849514aa6f60961518af9c", + "model_size": "11.28 GB", + "model_dtype": "torch.float16", + "lighteval_sha": "0f318ecf002208468154899217b3ba7c6ae09374", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|drop|3": { + "em": 0.0010486577181208054, + "em_stderr": 0.00033145814652192754, + "f1": 0.04561451342281894, + "f1_stderr": 0.0011266864813108584 + }, + "harness|gsm8k|5": { + "acc": 0.019711902956785442, + "acc_stderr": 0.0038289829787357143 + }, + "harness|winogrande|5": { + "acc": 0.6424625098658248, + "acc_stderr": 0.013470007443920688 + }, + "all": { + "em": 0.0010486577181208054, + "em_stderr": 0.00033145814652192754, + "f1": 0.04561451342281894, + "f1_stderr": 0.0011266864813108584, + "acc": 0.3310872064113051, + "acc_stderr": 0.008649495211328202 + } + }, + "versions": { + "harness|drop|3": 1, + "harness|gsm8k|5": 0, + "harness|winogrande|5": 0, + "all": 0 + }, + "config_tasks": { + "harness|drop": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|drop|3": { + "hashes": { + "hash_examples": "1d27416e8324e9a3", + "hash_full_prompts": "a5513ff9a741b385", + "hash_input_tokens": "f21277d2c2d2e06c", + "hash_cont_tokens": "ed77db0a3e4e3665" + }, + "truncated": 382, + "non-truncated": 9154, + "padded": 0, + "non-padded": 9536, + "effective_few_shots": 3.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "3ab9b4c5105492a3", + "hash_cont_tokens": "0f59640500fb7885" + }, + "truncated": 0, + "non-truncated": 1319, + "padded": 0, + "non-padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "84cacac1590bb0a5", + "hash_cont_tokens": "64ca3ed9b5dacc6e" + }, + "truncated": 0, + "non-truncated": 2534, + "padded": 2426, + "non-padded": 108, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "9b4d8993161e637d", + "hash_full_prompts": "08215e527b7e60a5", + "hash_input_tokens": "26cd3631535039d0", + "hash_cont_tokens": "c03d5436981acfba" + }, + "total_evaluation_time_secondes": "16086.180728197098", + "truncated": 382, + "non-truncated": 13007, + "padded": 2426, + "non-padded": 10963, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/digitous/Janin-R/results_2023-07-19T19-29-39.251365.json b/eval-results/digitous/Janin-R/results_2023-07-19T19-29-39.251365.json new file mode 100644 index 0000000000000000000000000000000000000000..5694049e37690a49b76ab3434b223a11d032f2b4 --- /dev/null +++ b/eval-results/digitous/Janin-R/results_2023-07-19T19-29-39.251365.json @@ -0,0 +1,871 @@ +{ + "results": { + "harness|arc:challenge|25": { + "acc": 0.3728668941979522, + "acc_stderr": 0.014131176760131165, + "acc_norm": 0.4044368600682594, + "acc_norm_stderr": 0.014342036483436175 + }, + "harness|hellaswag|10": { + "acc": 0.4986058554072894, + "acc_stderr": 0.004989762014739187, + "acc_norm": 0.6735710017924716, + "acc_norm_stderr": 0.004679479763516773 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.27, + "acc_stderr": 0.0446196043338474, + "acc_norm": 0.27, + "acc_norm_stderr": 
0.0446196043338474 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.34814814814814815, + "acc_stderr": 0.041153246103369526, + "acc_norm": 0.34814814814814815, + "acc_norm_stderr": 0.041153246103369526 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.3355263157894737, + "acc_stderr": 0.038424985593952694, + "acc_norm": 0.3355263157894737, + "acc_norm_stderr": 0.038424985593952694 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.26, + "acc_stderr": 0.04408440022768077, + "acc_norm": 0.26, + "acc_norm_stderr": 0.04408440022768077 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.3660377358490566, + "acc_stderr": 0.02964781353936523, + "acc_norm": 0.3660377358490566, + "acc_norm_stderr": 0.02964781353936523 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.2916666666666667, + "acc_stderr": 0.038009680605548574, + "acc_norm": 0.2916666666666667, + "acc_norm_stderr": 0.038009680605548574 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.23, + "acc_stderr": 0.04229525846816503, + "acc_norm": 0.23, + "acc_norm_stderr": 0.04229525846816503 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.23, + "acc_stderr": 0.04229525846816508, + "acc_norm": 0.23, + "acc_norm_stderr": 0.04229525846816508 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.35, + "acc_stderr": 0.0479372485441102, + "acc_norm": 0.35, + "acc_norm_stderr": 0.0479372485441102 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.30057803468208094, + "acc_stderr": 0.03496101481191181, + "acc_norm": 0.30057803468208094, + "acc_norm_stderr": 0.03496101481191181 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.21568627450980393, + "acc_stderr": 0.04092563958237655, + "acc_norm": 0.21568627450980393, + "acc_norm_stderr": 0.04092563958237655 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.37, + "acc_stderr": 0.04852365870939098, + "acc_norm": 0.37, + "acc_norm_stderr": 0.04852365870939098 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.3404255319148936, + "acc_stderr": 0.03097669299853443, + "acc_norm": 0.3404255319148936, + "acc_norm_stderr": 0.03097669299853443 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.3157894736842105, + "acc_stderr": 0.04372748290278007, + "acc_norm": 0.3157894736842105, + "acc_norm_stderr": 0.04372748290278007 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.31724137931034485, + "acc_stderr": 0.03878352372138622, + "acc_norm": 0.31724137931034485, + "acc_norm_stderr": 0.03878352372138622 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.2724867724867725, + "acc_stderr": 0.022930973071633356, + "acc_norm": 0.2724867724867725, + "acc_norm_stderr": 0.022930973071633356 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.1746031746031746, + "acc_stderr": 0.03395490020856109, + "acc_norm": 0.1746031746031746, + "acc_norm_stderr": 0.03395490020856109 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.33, + "acc_stderr": 0.04725815626252604, + "acc_norm": 0.33, + "acc_norm_stderr": 0.04725815626252604 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.25806451612903225, + "acc_stderr": 0.024892469172462843, + "acc_norm": 0.25806451612903225, + "acc_norm_stderr": 0.024892469172462843 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.27586206896551724, + "acc_stderr": 0.03144712581678241, + "acc_norm": 0.27586206896551724, + "acc_norm_stderr": 
0.03144712581678241 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.19, + "acc_stderr": 0.03942772444036624, + "acc_norm": 0.19, + "acc_norm_stderr": 0.03942772444036624 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.3212121212121212, + "acc_stderr": 0.03646204963253812, + "acc_norm": 0.3212121212121212, + "acc_norm_stderr": 0.03646204963253812 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.30303030303030304, + "acc_stderr": 0.03274287914026867, + "acc_norm": 0.30303030303030304, + "acc_norm_stderr": 0.03274287914026867 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.35233160621761656, + "acc_stderr": 0.034474782864143565, + "acc_norm": 0.35233160621761656, + "acc_norm_stderr": 0.034474782864143565 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.3282051282051282, + "acc_stderr": 0.023807633198657273, + "acc_norm": 0.3282051282051282, + "acc_norm_stderr": 0.023807633198657273 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.24444444444444444, + "acc_stderr": 0.026202766534652148, + "acc_norm": 0.24444444444444444, + "acc_norm_stderr": 0.026202766534652148 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.3025210084033613, + "acc_stderr": 0.029837962388291936, + "acc_norm": 0.3025210084033613, + "acc_norm_stderr": 0.029837962388291936 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.2781456953642384, + "acc_stderr": 0.03658603262763743, + "acc_norm": 0.2781456953642384, + "acc_norm_stderr": 0.03658603262763743 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.28807339449541286, + "acc_stderr": 0.019416445892636025, + "acc_norm": 0.28807339449541286, + "acc_norm_stderr": 0.019416445892636025 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.2222222222222222, + "acc_stderr": 0.028353212866863434, + "acc_norm": 0.2222222222222222, + "acc_norm_stderr": 0.028353212866863434 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.3137254901960784, + "acc_stderr": 0.032566854844603886, + "acc_norm": 0.3137254901960784, + "acc_norm_stderr": 0.032566854844603886 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.3459915611814346, + "acc_stderr": 0.03096481058878671, + "acc_norm": 0.3459915611814346, + "acc_norm_stderr": 0.03096481058878671 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.37668161434977576, + "acc_stderr": 0.03252113489929187, + "acc_norm": 0.37668161434977576, + "acc_norm_stderr": 0.03252113489929187 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.2748091603053435, + "acc_stderr": 0.03915345408847835, + "acc_norm": 0.2748091603053435, + "acc_norm_stderr": 0.03915345408847835 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.4297520661157025, + "acc_stderr": 0.04519082021319773, + "acc_norm": 0.4297520661157025, + "acc_norm_stderr": 0.04519082021319773 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.3333333333333333, + "acc_stderr": 0.04557239513497751, + "acc_norm": 0.3333333333333333, + "acc_norm_stderr": 0.04557239513497751 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.294478527607362, + "acc_stderr": 0.03581165790474082, + "acc_norm": 0.294478527607362, + "acc_norm_stderr": 0.03581165790474082 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.29464285714285715, + "acc_stderr": 0.04327040932578729, + "acc_norm": 
0.29464285714285715, + "acc_norm_stderr": 0.04327040932578729 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.33980582524271846, + "acc_stderr": 0.046897659372781335, + "acc_norm": 0.33980582524271846, + "acc_norm_stderr": 0.046897659372781335 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.31196581196581197, + "acc_stderr": 0.030351527323344944, + "acc_norm": 0.31196581196581197, + "acc_norm_stderr": 0.030351527323344944 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.32, + "acc_stderr": 0.046882617226215034, + "acc_norm": 0.32, + "acc_norm_stderr": 0.046882617226215034 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.31800766283524906, + "acc_stderr": 0.0166534862756154, + "acc_norm": 0.31800766283524906, + "acc_norm_stderr": 0.0166534862756154 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.3236994219653179, + "acc_stderr": 0.02519018132760841, + "acc_norm": 0.3236994219653179, + "acc_norm_stderr": 0.02519018132760841 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.2446927374301676, + "acc_stderr": 0.014378169884098405, + "acc_norm": 0.2446927374301676, + "acc_norm_stderr": 0.014378169884098405 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.3758169934640523, + "acc_stderr": 0.027732834353363937, + "acc_norm": 0.3758169934640523, + "acc_norm_stderr": 0.027732834353363937 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.3440514469453376, + "acc_stderr": 0.026981478043648043, + "acc_norm": 0.3440514469453376, + "acc_norm_stderr": 0.026981478043648043 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.32407407407407407, + "acc_stderr": 0.026041766202717163, + "acc_norm": 0.32407407407407407, + "acc_norm_stderr": 0.026041766202717163 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.2765957446808511, + "acc_stderr": 0.026684564340460997, + "acc_norm": 0.2765957446808511, + "acc_norm_stderr": 0.026684564340460997 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.318122555410691, + "acc_stderr": 0.011895407281104085, + "acc_norm": 0.318122555410691, + "acc_norm_stderr": 0.011895407281104085 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.34191176470588236, + "acc_stderr": 0.028814722422254177, + "acc_norm": 0.34191176470588236, + "acc_norm_stderr": 0.028814722422254177 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.2973856209150327, + "acc_stderr": 0.01849259653639695, + "acc_norm": 0.2973856209150327, + "acc_norm_stderr": 0.01849259653639695 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.36363636363636365, + "acc_stderr": 0.04607582090719976, + "acc_norm": 0.36363636363636365, + "acc_norm_stderr": 0.04607582090719976 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.4530612244897959, + "acc_stderr": 0.03186785930004128, + "acc_norm": 0.4530612244897959, + "acc_norm_stderr": 0.03186785930004128 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.44776119402985076, + "acc_stderr": 0.035161847729521654, + "acc_norm": 0.44776119402985076, + "acc_norm_stderr": 0.035161847729521654 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.38, + "acc_stderr": 0.048783173121456316, + "acc_norm": 0.38, + "acc_norm_stderr": 0.048783173121456316 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.3373493975903614, + "acc_stderr": 0.03680783690727581, + "acc_norm": 0.3373493975903614, + "acc_norm_stderr": 0.03680783690727581 + }, + "harness|hendrycksTest-world_religions|5": { + 
"acc": 0.34502923976608185, + "acc_stderr": 0.03645981377388807, + "acc_norm": 0.34502923976608185, + "acc_norm_stderr": 0.03645981377388807 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.21664626682986537, + "mc1_stderr": 0.01442146845250698, + "mc2": 0.34493913847379143, + "mc2_stderr": 0.013813267543937122 + }, + "all": { + "acc": 0.3166128212765898, + "acc_stderr": 0.03356756713275138, + "acc_norm": 0.32011341639956414, + "acc_norm_stderr": 0.033565882005159565, + "mc1": 0.21664626682986537, + "mc1_stderr": 0.01442146845250698, + "mc2": 0.34493913847379143, + "mc2_stderr": 0.013813267543937122 + } + }, + "versions": { + "harness|arc:challenge|25": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, 
+ "harness|truthfulqa:mc|0": 1, + "all": 0 + }, + "config": { + "model_name": "digitous/Janin-R", + "model_sha": "f6963f77098d8421ff4a1cf4d36f1e94c6c8f44b", + "model_dtype": "torch.float16", + "lighteval_sha": "43cff840721bd0214adb4e29236a5e2ca1813937", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null + }, + "task_config": { + "harness|arc:challenge": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM 
Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task" + }, + "hashes": { + "harness|arc:challenge|25": { + "hash_examples": "fb8c51b1872daeda", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "1b78325b154497a6", + "hash_cont_tokens": "c6e2e25e2b25a621" + }, + "harness|hellaswag|10": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "97de5fb5652ec7fa", + "hash_cont_tokens": "8ad5f1a3e4068f36" + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "38f6980885e34dfd", + "hash_cont_tokens": "844bd0bf669e8136" + }, + "harness|hendrycksTest-anatomy|5": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "3ed9431cd09b2a53", + "hash_cont_tokens": "aa3ffb1a6e4356f5" + }, + "harness|hendrycksTest-astronomy|5": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "a79fd75ecff4dacc", + "hash_cont_tokens": "ca7527d5bdfd389a" + }, + "harness|hendrycksTest-business_ethics|5": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "178d5666661bf5e1", + "hash_cont_tokens": "08a1fa6c8dde9a82" + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "c926698f7ff06973", + "hash_cont_tokens": "cd61f7de0830a75a" + }, + "harness|hendrycksTest-college_biology|5": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "242f772c5e78312a", + "hash_cont_tokens": "b0c14ed86adbcb56" + }, + "harness|hendrycksTest-college_chemistry|5": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "8502d8627d2d7aad", + "hash_cont_tokens": "844bd0bf669e8136" + }, + "harness|hendrycksTest-college_computer_science|5": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "8bf46ce3a98e6e3f", + "hash_cont_tokens": "3cf1924b14cbf906" + }, + "harness|hendrycksTest-college_mathematics|5": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "ff09ef7f164943cd", + "hash_cont_tokens": "d09bf08193410dfa" + }, + "harness|hendrycksTest-college_medicine|5": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "af38d1bbc0517ac5", + "hash_cont_tokens": "62bb469d2a319d91" + }, + "harness|hendrycksTest-college_physics|5": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "c4240f372187f487", + "hash_cont_tokens": "bf103c9a1f61ec12" + }, + "harness|hendrycksTest-computer_security|5": { + "hash_examples": "006451eedc0ededb", + 
"hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "70a866a1c6ae11ae", + "hash_cont_tokens": "844bd0bf669e8136" + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "29b68a5b3f3afa5f", + "hash_cont_tokens": "ff5ca3d84bb47a0b" + }, + "harness|hendrycksTest-econometrics|5": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "a4a0fc579875cdf9", + "hash_cont_tokens": "f3ed369e135c0e74" + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "e1c0ec634eb17ebd", + "hash_cont_tokens": "35bf6c0c1a7ee403" + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "542453ad0f99dacf", + "hash_cont_tokens": "e69647d0f0359a4e" + }, + "harness|hendrycksTest-formal_logic|5": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "dacff0458f665ef2", + "hash_cont_tokens": "2ef491ecaa0b411b" + }, + "harness|hendrycksTest-global_facts|5": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "61dec75d557c2e93", + "hash_cont_tokens": "844bd0bf669e8136" + }, + "harness|hendrycksTest-high_school_biology|5": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "d0afdf91820cacc8", + "hash_cont_tokens": "2f65e8345a68d860" + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "75cd47b5490da17b", + "hash_cont_tokens": "c3deabee1deab3a3" + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "6c6256000dbf914a", + "hash_cont_tokens": "ec161287ac6222f4" + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "3e24478a8854bd77", + "hash_cont_tokens": "c4f2565ca36881d5" + }, + "harness|hendrycksTest-high_school_geography|5": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "a4866b51f8a7a60e", + "hash_cont_tokens": "780e569058de22be" + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "90f755f89d9fdf5e", + "hash_cont_tokens": "9da45062757ae791" + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "fb590ff6d9d11883", + "hash_cont_tokens": "8f5c8baf02161f10" + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "551dbc75535ad2b8", + "hash_cont_tokens": "fdea101837ab4409" + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "d86fdf5706ec717c", + "hash_cont_tokens": "985403b262df21a4" + }, + "harness|hendrycksTest-high_school_physics|5": { + 
"hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "a81bca26abd92c41", + "hash_cont_tokens": "56be0c12b78c81a3" + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "9c10077b5cda495b", + "hash_cont_tokens": "f524cf6fe64b2a7f" + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "092923836e135996", + "hash_cont_tokens": "421b3dc903711e3d" + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "4ab213491f557f31", + "hash_cont_tokens": "eab825cf8fbdd085" + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "2a04fb615e6717ea", + "hash_cont_tokens": "e1610a0b694e7b3a" + }, + "harness|hendrycksTest-human_aging|5": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "39da19ee58ce07e6", + "hash_cont_tokens": "38eafdb22e9fca11" + }, + "harness|hendrycksTest-human_sexuality|5": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "f7e0441ab1c223e0", + "hash_cont_tokens": "11de075f88fc7cd2" + }, + "harness|hendrycksTest-international_law|5": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "119859c5b8103d0b", + "hash_cont_tokens": "0229c63f045574c2" + }, + "harness|hendrycksTest-jurisprudence|5": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "6ec4910e741606cb", + "hash_cont_tokens": "5c77c6f472688075" + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "96d8b2554f777e3a", + "hash_cont_tokens": "25a46284b3589e0d" + }, + "harness|hendrycksTest-machine_learning|5": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "249811a7d891a411", + "hash_cont_tokens": "d11f2c877fe691dc" + }, + "harness|hendrycksTest-management|5": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "e54df495ffeb4f92", + "hash_cont_tokens": "d37808f586a9e9b5" + }, + "harness|hendrycksTest-marketing|5": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "e9110fe64f420eb5", + "hash_cont_tokens": "95faf210efa02f90" + }, + "harness|hendrycksTest-medical_genetics|5": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "743df5701590c1c5", + "hash_cont_tokens": "844bd0bf669e8136" + }, + "harness|hendrycksTest-miscellaneous|5": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "4a20a40ea36bad2d", + "hash_cont_tokens": "ef1ae838a09a7521" + }, + "harness|hendrycksTest-moral_disputes|5": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "10886977e5516586", + "hash_cont_tokens": "05c35d0e7dd2c7d4" + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hash_examples": "9873e077e83e0546", + 
"hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "66f56ab7c3b9d662", + "hash_cont_tokens": "f1e9e326e9540108" + }, + "harness|hendrycksTest-nutrition|5": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "c05c54560499ea35", + "hash_cont_tokens": "027ac34198453c9e" + }, + "harness|hendrycksTest-philosophy|5": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "9639c3d92ff98a28", + "hash_cont_tokens": "dddff9925c9b675a" + }, + "harness|hendrycksTest-prehistory|5": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "91e98834c3a8d8d9", + "hash_cont_tokens": "030e5bb46551865c" + }, + "harness|hendrycksTest-professional_accounting|5": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "569fa47691c73088", + "hash_cont_tokens": "42b23299e8bae480" + }, + "harness|hendrycksTest-professional_law|5": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "999e8c7cf55b590c", + "hash_cont_tokens": "a2de48df0afbaff7" + }, + "harness|hendrycksTest-professional_medicine|5": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "cb68733b835e69f0", + "hash_cont_tokens": "33dc7eccd5de31ae" + }, + "harness|hendrycksTest-professional_psychology|5": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "3aa766c029099569", + "hash_cont_tokens": "2a666dc39f1f52ac" + }, + "harness|hendrycksTest-public_relations|5": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "87b924f88832986f", + "hash_cont_tokens": "cf3600a50782c6c5" + }, + "harness|hendrycksTest-security_studies|5": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "c2b75c24a925a416", + "hash_cont_tokens": "2e9916279a4cae95" + }, + "harness|hendrycksTest-sociology|5": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "fb555df6139eb2c8", + "hash_cont_tokens": "555f7a55738bbf37" + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "56cf1eebb25eccb1", + "hash_cont_tokens": "844bd0bf669e8136" + }, + "harness|hendrycksTest-virology|5": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "c6affac16ec860be", + "hash_cont_tokens": "30d4fa4828c5468f" + }, + "harness|hendrycksTest-world_religions|5": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "d2c5da5a69a6312e", + "hash_cont_tokens": "984061eb58124367" + }, + "harness|truthfulqa:mc|0": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "21ee2f46c9c3649e", + "hash_cont_tokens": "f41d0880e9a23f4e" + } + } +} \ No newline at end of file diff --git a/eval-results/digitous/Janin-R/results_2023-09-17T03-14-06.115114.json b/eval-results/digitous/Janin-R/results_2023-09-17T03-14-06.115114.json new file mode 100644 index 0000000000000000000000000000000000000000..54c85cccbd7328bc50a38e302f5e0bd21f6cd687 --- /dev/null +++ b/eval-results/digitous/Janin-R/results_2023-09-17T03-14-06.115114.json 
@@ -0,0 +1,107 @@ +{ + "config_general": { + "model_name": "digitous/Janin-R", + "model_sha": "f6963f77098d8421ff4a1cf4d36f1e94c6c8f44b", + "model_size": "11.28 GB", + "model_dtype": "torch.float16", + "lighteval_sha": "0f318ecf002208468154899217b3ba7c6ae09374", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|drop|3": { + "em": 0.001153523489932886, + "em_stderr": 0.0003476179896857095, + "f1": 0.04803796140939615, + "f1_stderr": 0.0011624552972241407 + }, + "harness|gsm8k|5": { + "acc": 0.022744503411675512, + "acc_stderr": 0.004106620637749676 + }, + "harness|winogrande|5": { + "acc": 0.6535122336227308, + "acc_stderr": 0.013373773411685646 + }, + "all": { + "em": 0.001153523489932886, + "em_stderr": 0.0003476179896857095, + "f1": 0.04803796140939615, + "f1_stderr": 0.0011624552972241407, + "acc": 0.3381283685172032, + "acc_stderr": 0.00874019702471766 + } + }, + "versions": { + "harness|drop|3": 1, + "harness|gsm8k|5": 0, + "harness|winogrande|5": 0, + "all": 0 + }, + "config_tasks": { + "harness|drop": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|drop|3": { + "hashes": { + "hash_examples": "1d27416e8324e9a3", + "hash_full_prompts": "a5513ff9a741b385", + "hash_input_tokens": "f21277d2c2d2e06c", + "hash_cont_tokens": "3710a9442da0a51d" + }, + "truncated": 382, + "non-truncated": 9154, + "padded": 0, + "non-padded": 9536, + "effective_few_shots": 3.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "3ab9b4c5105492a3", + "hash_cont_tokens": "de146e222ddc9f33" + }, + "truncated": 0, + "non-truncated": 1319, + "padded": 0, + "non-padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "84cacac1590bb0a5", + "hash_cont_tokens": "64ca3ed9b5dacc6e" + }, + "truncated": 0, + "non-truncated": 2534, + "padded": 2426, + "non-padded": 108, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "9b4d8993161e637d", + "hash_full_prompts": "08215e527b7e60a5", + "hash_input_tokens": "26cd3631535039d0", + "hash_cont_tokens": "35bb286cd424cfa6" + }, + "total_evaluation_time_secondes": "16244.010826587677", + "truncated": 382, + "non-truncated": 13007, + "padded": 2426, + "non-padded": 10963, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/digitous/Javalion-GPTJ/results_1982e56_3ce176b.json b/eval-results/digitous/Javalion-GPTJ/results_1982e56_3ce176b.json new file mode 100644 index 0000000000000000000000000000000000000000..8775abf14de1cb94e0cbfc9403b42cca2ffaa732 --- /dev/null +++ b/eval-results/digitous/Javalion-GPTJ/results_1982e56_3ce176b.json @@ -0,0 +1,509 @@ +{ + "results": { + "harness|arc:challenge|25": { + "acc": 0.3856655290102389, + "acc_stderr": 0.01422425097325717, + "acc_norm": 0.4189419795221843, + "acc_norm_stderr": 0.014418106953639013 + }, + "harness|hellaswag|10": { + "acc": 0.504282015534754, + "acc_stderr": 0.004989598426249545, + "acc_norm": 0.6869149571798446, + "acc_norm_stderr": 0.004628008661955065 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.28, + "acc_stderr": 
0.04512608598542129, + "acc_norm": 0.28, + "acc_norm_stderr": 0.04512608598542129 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.2962962962962963, + "acc_stderr": 0.03944624162501116, + "acc_norm": 0.2962962962962963, + "acc_norm_stderr": 0.03944624162501116 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.2565789473684211, + "acc_stderr": 0.0355418036802569, + "acc_norm": 0.2565789473684211, + "acc_norm_stderr": 0.0355418036802569 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.3, + "acc_stderr": 0.046056618647183814, + "acc_norm": 0.3, + "acc_norm_stderr": 0.046056618647183814 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.2943396226415094, + "acc_stderr": 0.028049186315695248, + "acc_norm": 0.2943396226415094, + "acc_norm_stderr": 0.028049186315695248 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.24305555555555555, + "acc_stderr": 0.03586879280080342, + "acc_norm": 0.24305555555555555, + "acc_norm_stderr": 0.03586879280080342 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.2, + "acc_stderr": 0.040201512610368445, + "acc_norm": 0.2, + "acc_norm_stderr": 0.040201512610368445 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.2, + "acc_stderr": 0.04020151261036846, + "acc_norm": 0.2, + "acc_norm_stderr": 0.04020151261036846 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.29, + "acc_stderr": 0.045604802157206845, + "acc_norm": 0.29, + "acc_norm_stderr": 0.045604802157206845 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.2543352601156069, + "acc_stderr": 0.0332055644308557, + "acc_norm": 0.2543352601156069, + "acc_norm_stderr": 0.0332055644308557 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.21568627450980393, + "acc_stderr": 0.04092563958237655, + "acc_norm": 0.21568627450980393, + "acc_norm_stderr": 0.04092563958237655 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.28, + "acc_stderr": 0.045126085985421276, + "acc_norm": 0.28, + "acc_norm_stderr": 0.045126085985421276 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.3276595744680851, + "acc_stderr": 0.030683020843231008, + "acc_norm": 0.3276595744680851, + "acc_norm_stderr": 0.030683020843231008 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.2719298245614035, + "acc_stderr": 0.04185774424022057, + "acc_norm": 0.2719298245614035, + "acc_norm_stderr": 0.04185774424022057 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.2620689655172414, + "acc_stderr": 0.036646663372252565, + "acc_norm": 0.2620689655172414, + "acc_norm_stderr": 0.036646663372252565 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.2724867724867725, + "acc_stderr": 0.02293097307163335, + "acc_norm": 0.2724867724867725, + "acc_norm_stderr": 0.02293097307163335 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.20634920634920634, + "acc_stderr": 0.036196045241242515, + "acc_norm": 0.20634920634920634, + "acc_norm_stderr": 0.036196045241242515 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.32, + "acc_stderr": 0.046882617226215034, + "acc_norm": 0.32, + "acc_norm_stderr": 0.046882617226215034 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.24193548387096775, + "acc_stderr": 0.024362599693031107, + "acc_norm": 0.24193548387096775, + "acc_norm_stderr": 0.024362599693031107 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.2660098522167488, + "acc_stderr": 0.03108982600293753, + 
"acc_norm": 0.2660098522167488, + "acc_norm_stderr": 0.03108982600293753 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.17, + "acc_stderr": 0.0377525168068637, + "acc_norm": 0.17, + "acc_norm_stderr": 0.0377525168068637 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.2909090909090909, + "acc_stderr": 0.03546563019624336, + "acc_norm": 0.2909090909090909, + "acc_norm_stderr": 0.03546563019624336 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.22727272727272727, + "acc_stderr": 0.02985751567338641, + "acc_norm": 0.22727272727272727, + "acc_norm_stderr": 0.02985751567338641 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.23316062176165803, + "acc_stderr": 0.030516111371476005, + "acc_norm": 0.23316062176165803, + "acc_norm_stderr": 0.030516111371476005 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.26153846153846155, + "acc_stderr": 0.02228214120420442, + "acc_norm": 0.26153846153846155, + "acc_norm_stderr": 0.02228214120420442 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.26296296296296295, + "acc_stderr": 0.026842057873833706, + "acc_norm": 0.26296296296296295, + "acc_norm_stderr": 0.026842057873833706 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.25210084033613445, + "acc_stderr": 0.028205545033277723, + "acc_norm": 0.25210084033613445, + "acc_norm_stderr": 0.028205545033277723 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.25165562913907286, + "acc_stderr": 0.03543304234389985, + "acc_norm": 0.25165562913907286, + "acc_norm_stderr": 0.03543304234389985 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.24403669724770644, + "acc_stderr": 0.018415286351416413, + "acc_norm": 0.24403669724770644, + "acc_norm_stderr": 0.018415286351416413 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.14814814814814814, + "acc_stderr": 0.02422762927372836, + "acc_norm": 0.14814814814814814, + "acc_norm_stderr": 0.02422762927372836 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.2647058823529412, + "acc_stderr": 0.03096451792692339, + "acc_norm": 0.2647058823529412, + "acc_norm_stderr": 0.03096451792692339 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.26582278481012656, + "acc_stderr": 0.028756799629658342, + "acc_norm": 0.26582278481012656, + "acc_norm_stderr": 0.028756799629658342 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.36771300448430494, + "acc_stderr": 0.03236198350928275, + "acc_norm": 0.36771300448430494, + "acc_norm_stderr": 0.03236198350928275 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.22900763358778625, + "acc_stderr": 0.036853466317118506, + "acc_norm": 0.22900763358778625, + "acc_norm_stderr": 0.036853466317118506 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.35537190082644626, + "acc_stderr": 0.04369236326573981, + "acc_norm": 0.35537190082644626, + "acc_norm_stderr": 0.04369236326573981 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.3055555555555556, + "acc_stderr": 0.04453197507374984, + "acc_norm": 0.3055555555555556, + "acc_norm_stderr": 0.04453197507374984 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.22085889570552147, + "acc_stderr": 0.032591773927421776, + "acc_norm": 0.22085889570552147, + "acc_norm_stderr": 0.032591773927421776 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 
0.33035714285714285, + "acc_stderr": 0.04464285714285714, + "acc_norm": 0.33035714285714285, + "acc_norm_stderr": 0.04464285714285714 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.20388349514563106, + "acc_stderr": 0.039891398595317706, + "acc_norm": 0.20388349514563106, + "acc_norm_stderr": 0.039891398595317706 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.28205128205128205, + "acc_stderr": 0.029480360549541194, + "acc_norm": 0.28205128205128205, + "acc_norm_stderr": 0.029480360549541194 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.27, + "acc_stderr": 0.0446196043338474, + "acc_norm": 0.27, + "acc_norm_stderr": 0.0446196043338474 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.3231162196679438, + "acc_stderr": 0.016723726512343048, + "acc_norm": 0.3231162196679438, + "acc_norm_stderr": 0.016723726512343048 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.2832369942196532, + "acc_stderr": 0.024257901705323374, + "acc_norm": 0.2832369942196532, + "acc_norm_stderr": 0.024257901705323374 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.2424581005586592, + "acc_stderr": 0.014333522059217889, + "acc_norm": 0.2424581005586592, + "acc_norm_stderr": 0.014333522059217889 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.2875816993464052, + "acc_stderr": 0.02591780611714716, + "acc_norm": 0.2875816993464052, + "acc_norm_stderr": 0.02591780611714716 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.2990353697749196, + "acc_stderr": 0.026003301117885142, + "acc_norm": 0.2990353697749196, + "acc_norm_stderr": 0.026003301117885142 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.2839506172839506, + "acc_stderr": 0.025089478523765134, + "acc_norm": 0.2839506172839506, + "acc_norm_stderr": 0.025089478523765134 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.2765957446808511, + "acc_stderr": 0.02668456434046099, + "acc_norm": 0.2765957446808511, + "acc_norm_stderr": 0.02668456434046099 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.29139504563233376, + "acc_stderr": 0.011605720214257596, + "acc_norm": 0.29139504563233376, + "acc_norm_stderr": 0.011605720214257596 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.1948529411764706, + "acc_stderr": 0.024060599423487417, + "acc_norm": 0.1948529411764706, + "acc_norm_stderr": 0.024060599423487417 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.25980392156862747, + "acc_stderr": 0.01774089950917779, + "acc_norm": 0.25980392156862747, + "acc_norm_stderr": 0.01774089950917779 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.3, + "acc_stderr": 0.04389311454644286, + "acc_norm": 0.3, + "acc_norm_stderr": 0.04389311454644286 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.35918367346938773, + "acc_stderr": 0.03071356045510849, + "acc_norm": 0.35918367346938773, + "acc_norm_stderr": 0.03071356045510849 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.31840796019900497, + "acc_stderr": 0.032941184790540944, + "acc_norm": 0.31840796019900497, + "acc_norm_stderr": 0.032941184790540944 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.28, + "acc_stderr": 0.045126085985421276, + "acc_norm": 0.28, + "acc_norm_stderr": 0.045126085985421276 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.30120481927710846, + "acc_stderr": 0.03571609230053481, + "acc_norm": 0.30120481927710846, + "acc_norm_stderr": 0.03571609230053481 + }, + 
"harness|hendrycksTest-world_religions|5": { + "acc": 0.28654970760233917, + "acc_stderr": 0.034678266857038266, + "acc_norm": 0.28654970760233917, + "acc_norm_stderr": 0.034678266857038266 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.21297429620563035, + "mc1_stderr": 0.01433220378705968, + "mc2": 0.35438024747556346, + "mc2_stderr": 0.013458662849986456 + }, + "all": { + "acc": 0.27446041951954186, + "acc_stderr": 0.03217097604032509, + "acc_norm": 0.2781199007425425, + "acc_norm_stderr": 0.032168133094835044, + "mc1": 0.21297429620563035, + "mc1_stderr": 0.01433220378705968, + "mc2": 0.35438024747556346, + "mc2_stderr": 0.013458662849986456 + } + }, + "versions": { + "harness|arc:challenge|25": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 
1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "all": 0 + }, + "config": { + "model_name": "digitous/Javalion-GPTJ", + "model_sha": "3ce176bc0f91cae416c78e99f964f54b12472de0", + "model_dtype": "torch.float16", + "lighteval_sha": "1982e5669ed61622a77b3a79436ff5d00583e4ff", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null + }, + "task_config": { + "harness|arc:challenge": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", 
+ "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task" + } +} \ No newline at end of file diff --git a/eval-results/digitous/Javalion-GPTJ/results_2023-10-17T16-30-40.510452.json b/eval-results/digitous/Javalion-GPTJ/results_2023-10-17T16-30-40.510452.json new file mode 100644 index 0000000000000000000000000000000000000000..8d397b3ee06f43e9afb179c03869e4855ce8d457 --- /dev/null +++ b/eval-results/digitous/Javalion-GPTJ/results_2023-10-17T16-30-40.510452.json @@ -0,0 +1,107 @@ +{ + "config_general": { + "model_name": "digitous/Javalion-GPTJ", + "model_sha": "3ce176bc0f91cae416c78e99f964f54b12472de0", + "model_size": "11.28 GB", + "model_dtype": "torch.float16", + "lighteval_sha": "0f318ecf002208468154899217b3ba7c6ae09374", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|drop|3": { + "em": 0.0008389261744966443, + "em_stderr": 0.0002964962989801232, + "f1": 0.04887374161073851, + "f1_stderr": 0.0012121662940147047 + }, + "harness|gsm8k|5": { + "acc": 0.016679302501895376, + "acc_stderr": 0.0035275958887224543 + }, + "harness|winogrande|5": { + "acc": 0.6527229676400947, + "acc_stderr": 0.013380909249751237 + }, + "all": { + "em": 0.0008389261744966443, + "em_stderr": 0.0002964962989801232, + "f1": 0.04887374161073851, + "f1_stderr": 0.0012121662940147047, + "acc": 0.3347011350709951, + "acc_stderr": 0.008454252569236846 + } + }, + "versions": { + "harness|drop|3": 1, + "harness|gsm8k|5": 0, + "harness|winogrande|5": 0, + "all": 0 + }, + "config_tasks": { + "harness|drop": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|drop|3": { + "hashes": { + "hash_examples": "1d27416e8324e9a3", + "hash_full_prompts": "a5513ff9a741b385", + "hash_input_tokens": "f21277d2c2d2e06c", + "hash_cont_tokens": "f64e5f070dd73c6c" + }, + "truncated": 382, + "non-truncated": 9154, + "padded": 0, + "non-padded": 9536, + "effective_few_shots": 3.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "3ab9b4c5105492a3", + "hash_cont_tokens": "897eed5361b07f1a" + }, + "truncated": 0, + "non-truncated": 1319, + "padded": 0, + "non-padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "84cacac1590bb0a5", + "hash_cont_tokens": "64ca3ed9b5dacc6e" + }, + "truncated": 0, + "non-truncated": 2534, + "padded": 2426, + "non-padded": 108, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "9b4d8993161e637d", + "hash_full_prompts": "08215e527b7e60a5", + 
"hash_input_tokens": "26cd3631535039d0", + "hash_cont_tokens": "98e85a8971844270" + }, + "total_evaluation_time_secondes": "9722.790392875671", + "truncated": 382, + "non-truncated": 13007, + "padded": 2426, + "non-padded": 10963, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/digitous/Javalion-R/results_2023-07-19T14-00-54.512853.json b/eval-results/digitous/Javalion-R/results_2023-07-19T14-00-54.512853.json new file mode 100644 index 0000000000000000000000000000000000000000..f8310ad04695a6136ba8c019ea3c6d2822bbc776 --- /dev/null +++ b/eval-results/digitous/Javalion-R/results_2023-07-19T14-00-54.512853.json @@ -0,0 +1,871 @@ +{ + "results": { + "harness|arc:challenge|25": { + "acc": 0.378839590443686, + "acc_stderr": 0.01417591549000032, + "acc_norm": 0.41723549488054607, + "acc_norm_stderr": 0.014409825518403079 + }, + "harness|hellaswag|10": { + "acc": 0.5032861979685321, + "acc_stderr": 0.004989673640014259, + "acc_norm": 0.6802429794861581, + "acc_norm_stderr": 0.004654291661255928 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.28, + "acc_stderr": 0.04512608598542129, + "acc_norm": 0.28, + "acc_norm_stderr": 0.04512608598542129 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.34814814814814815, + "acc_stderr": 0.041153246103369526, + "acc_norm": 0.34814814814814815, + "acc_norm_stderr": 0.041153246103369526 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.3223684210526316, + "acc_stderr": 0.03803510248351585, + "acc_norm": 0.3223684210526316, + "acc_norm_stderr": 0.03803510248351585 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.26, + "acc_stderr": 0.04408440022768077, + "acc_norm": 0.26, + "acc_norm_stderr": 0.04408440022768077 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.33962264150943394, + "acc_stderr": 0.02914690474779832, + "acc_norm": 0.33962264150943394, + "acc_norm_stderr": 0.02914690474779832 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.2847222222222222, + "acc_stderr": 0.03773809990686935, + "acc_norm": 0.2847222222222222, + "acc_norm_stderr": 0.03773809990686935 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.23, + "acc_stderr": 0.042295258468165044, + "acc_norm": 0.23, + "acc_norm_stderr": 0.042295258468165044 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.22, + "acc_stderr": 0.04163331998932269, + "acc_norm": 0.22, + "acc_norm_stderr": 0.04163331998932269 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.33, + "acc_stderr": 0.047258156262526045, + "acc_norm": 0.33, + "acc_norm_stderr": 0.047258156262526045 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.30057803468208094, + "acc_stderr": 0.03496101481191181, + "acc_norm": 0.30057803468208094, + "acc_norm_stderr": 0.03496101481191181 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.20588235294117646, + "acc_stderr": 0.040233822736177476, + "acc_norm": 0.20588235294117646, + "acc_norm_stderr": 0.040233822736177476 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.31, + "acc_stderr": 0.04648231987117316, + "acc_norm": 0.31, + "acc_norm_stderr": 0.04648231987117316 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.3617021276595745, + "acc_stderr": 0.03141082197596241, + "acc_norm": 0.3617021276595745, + "acc_norm_stderr": 0.03141082197596241 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.3157894736842105, + "acc_stderr": 0.04372748290278007, + "acc_norm": 
0.3157894736842105, + "acc_norm_stderr": 0.04372748290278007 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.31724137931034485, + "acc_stderr": 0.038783523721386215, + "acc_norm": 0.31724137931034485, + "acc_norm_stderr": 0.038783523721386215 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.2698412698412698, + "acc_stderr": 0.02286083830923207, + "acc_norm": 0.2698412698412698, + "acc_norm_stderr": 0.02286083830923207 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.19047619047619047, + "acc_stderr": 0.0351220741230205, + "acc_norm": 0.19047619047619047, + "acc_norm_stderr": 0.0351220741230205 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.35, + "acc_stderr": 0.047937248544110196, + "acc_norm": 0.35, + "acc_norm_stderr": 0.047937248544110196 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.2903225806451613, + "acc_stderr": 0.025822106119415884, + "acc_norm": 0.2903225806451613, + "acc_norm_stderr": 0.025822106119415884 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.30049261083743845, + "acc_stderr": 0.03225799476233485, + "acc_norm": 0.30049261083743845, + "acc_norm_stderr": 0.03225799476233485 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.24, + "acc_stderr": 0.04292346959909282, + "acc_norm": 0.24, + "acc_norm_stderr": 0.04292346959909282 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.30303030303030304, + "acc_stderr": 0.035886248000917075, + "acc_norm": 0.30303030303030304, + "acc_norm_stderr": 0.035886248000917075 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.29797979797979796, + "acc_stderr": 0.03258630383836556, + "acc_norm": 0.29797979797979796, + "acc_norm_stderr": 0.03258630383836556 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.32124352331606215, + "acc_stderr": 0.033699508685490674, + "acc_norm": 0.32124352331606215, + "acc_norm_stderr": 0.033699508685490674 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.31025641025641026, + "acc_stderr": 0.023454674889404288, + "acc_norm": 0.31025641025641026, + "acc_norm_stderr": 0.023454674889404288 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.2518518518518518, + "acc_stderr": 0.02646611753895991, + "acc_norm": 0.2518518518518518, + "acc_norm_stderr": 0.02646611753895991 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.2773109243697479, + "acc_stderr": 0.029079374539480007, + "acc_norm": 0.2773109243697479, + "acc_norm_stderr": 0.029079374539480007 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.24503311258278146, + "acc_stderr": 0.03511807571804724, + "acc_norm": 0.24503311258278146, + "acc_norm_stderr": 0.03511807571804724 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.28073394495412846, + "acc_stderr": 0.019266055045871616, + "acc_norm": 0.28073394495412846, + "acc_norm_stderr": 0.019266055045871616 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.17592592592592593, + "acc_stderr": 0.02596742095825853, + "acc_norm": 0.17592592592592593, + "acc_norm_stderr": 0.02596742095825853 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.3235294117647059, + "acc_stderr": 0.03283472056108566, + "acc_norm": 0.3235294117647059, + "acc_norm_stderr": 0.03283472056108566 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.350210970464135, + "acc_stderr": 
0.031052391937584353, + "acc_norm": 0.350210970464135, + "acc_norm_stderr": 0.031052391937584353 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.3991031390134529, + "acc_stderr": 0.03286745312567961, + "acc_norm": 0.3991031390134529, + "acc_norm_stderr": 0.03286745312567961 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.2595419847328244, + "acc_stderr": 0.03844876139785271, + "acc_norm": 0.2595419847328244, + "acc_norm_stderr": 0.03844876139785271 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.3884297520661157, + "acc_stderr": 0.04449270350068383, + "acc_norm": 0.3884297520661157, + "acc_norm_stderr": 0.04449270350068383 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.32407407407407407, + "acc_stderr": 0.04524596007030049, + "acc_norm": 0.32407407407407407, + "acc_norm_stderr": 0.04524596007030049 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.25153374233128833, + "acc_stderr": 0.03408997886857529, + "acc_norm": 0.25153374233128833, + "acc_norm_stderr": 0.03408997886857529 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.3125, + "acc_stderr": 0.043994650575715215, + "acc_norm": 0.3125, + "acc_norm_stderr": 0.043994650575715215 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.36893203883495146, + "acc_stderr": 0.04777615181156739, + "acc_norm": 0.36893203883495146, + "acc_norm_stderr": 0.04777615181156739 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.31196581196581197, + "acc_stderr": 0.030351527323344944, + "acc_norm": 0.31196581196581197, + "acc_norm_stderr": 0.030351527323344944 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.32, + "acc_stderr": 0.046882617226215034, + "acc_norm": 0.32, + "acc_norm_stderr": 0.046882617226215034 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.351213282247765, + "acc_stderr": 0.017069982051499434, + "acc_norm": 0.351213282247765, + "acc_norm_stderr": 0.017069982051499434 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.3092485549132948, + "acc_stderr": 0.024883140570071755, + "acc_norm": 0.3092485549132948, + "acc_norm_stderr": 0.024883140570071755 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.2446927374301676, + "acc_stderr": 0.014378169884098405, + "acc_norm": 0.2446927374301676, + "acc_norm_stderr": 0.014378169884098405 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.369281045751634, + "acc_stderr": 0.02763417668960266, + "acc_norm": 0.369281045751634, + "acc_norm_stderr": 0.02763417668960266 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.3215434083601286, + "acc_stderr": 0.026527724079528872, + "acc_norm": 0.3215434083601286, + "acc_norm_stderr": 0.026527724079528872 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.31790123456790126, + "acc_stderr": 0.02591006352824087, + "acc_norm": 0.31790123456790126, + "acc_norm_stderr": 0.02591006352824087 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.28368794326241137, + "acc_stderr": 0.02689170942834396, + "acc_norm": 0.28368794326241137, + "acc_norm_stderr": 0.02689170942834396 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.32529335071707954, + "acc_stderr": 0.011965311536571528, + "acc_norm": 0.32529335071707954, + "acc_norm_stderr": 0.011965311536571528 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.3125, + "acc_stderr": 0.02815637344037142, + "acc_norm": 0.3125, + "acc_norm_stderr": 0.02815637344037142 + }, + 
"harness|hendrycksTest-professional_psychology|5": { + "acc": 0.2973856209150327, + "acc_stderr": 0.01849259653639695, + "acc_norm": 0.2973856209150327, + "acc_norm_stderr": 0.01849259653639695 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.37272727272727274, + "acc_stderr": 0.04631381319425464, + "acc_norm": 0.37272727272727274, + "acc_norm_stderr": 0.04631381319425464 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.44081632653061226, + "acc_stderr": 0.03178419114175363, + "acc_norm": 0.44081632653061226, + "acc_norm_stderr": 0.03178419114175363 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.40298507462686567, + "acc_stderr": 0.034683432951111266, + "acc_norm": 0.40298507462686567, + "acc_norm_stderr": 0.034683432951111266 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.39, + "acc_stderr": 0.04902071300001975, + "acc_norm": 0.39, + "acc_norm_stderr": 0.04902071300001975 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.3493975903614458, + "acc_stderr": 0.0371172519074075, + "acc_norm": 0.3493975903614458, + "acc_norm_stderr": 0.0371172519074075 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.3333333333333333, + "acc_stderr": 0.036155076303109344, + "acc_norm": 0.3333333333333333, + "acc_norm_stderr": 0.036155076303109344 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.2215422276621787, + "mc1_stderr": 0.01453786760130114, + "mc2": 0.34436864491547176, + "mc2_stderr": 0.013839905350988124 + }, + "all": { + "acc": 0.3126187582488375, + "acc_stderr": 0.03346954750232309, + "acc_norm": 0.31626880377348976, + "acc_norm_stderr": 0.03346782763875774, + "mc1": 0.2215422276621787, + "mc1_stderr": 0.01453786760130114, + "mc2": 0.34436864491547176, + "mc2_stderr": 0.013839905350988124 + } + }, + "versions": { + "harness|arc:challenge|25": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + 
"harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "all": 0 + }, + "config": { + "model_name": "digitous/Javalion-R", + "model_sha": "b881231ab6ea85da2a9a139f282df85d1d18b002", + "model_dtype": "torch.float16", + "lighteval_sha": "43cff840721bd0214adb4e29236a5e2ca1813937", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null + }, + "task_config": { + "harness|arc:challenge": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": 
"LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task" + }, + "hashes": { + "harness|arc:challenge|25": { + "hash_examples": "fb8c51b1872daeda", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "1b78325b154497a6", + "hash_cont_tokens": "ed17e576dbafa5da" + }, + "harness|hellaswag|10": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "97de5fb5652ec7fa", + "hash_cont_tokens": "0875c25c8fc0a94d" + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "38f6980885e34dfd", + "hash_cont_tokens": "844bd0bf669e8136" + }, + "harness|hendrycksTest-anatomy|5": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "3ed9431cd09b2a53", + "hash_cont_tokens": "aa3ffb1a6e4356f5" + }, + "harness|hendrycksTest-astronomy|5": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "a79fd75ecff4dacc", + "hash_cont_tokens": "18cfffb76bc8f0d1" + }, + "harness|hendrycksTest-business_ethics|5": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "178d5666661bf5e1", + "hash_cont_tokens": "844bd0bf669e8136" + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "c926698f7ff06973", + "hash_cont_tokens": "cd61f7de0830a75a" + }, + "harness|hendrycksTest-college_biology|5": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "242f772c5e78312a", + "hash_cont_tokens": "16b3626c8a5e3797" + }, + 
"harness|hendrycksTest-college_chemistry|5": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "8502d8627d2d7aad", + "hash_cont_tokens": "844bd0bf669e8136" + }, + "harness|hendrycksTest-college_computer_science|5": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "8bf46ce3a98e6e3f", + "hash_cont_tokens": "844bd0bf669e8136" + }, + "harness|hendrycksTest-college_mathematics|5": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "ff09ef7f164943cd", + "hash_cont_tokens": "844bd0bf669e8136" + }, + "harness|hendrycksTest-college_medicine|5": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "af38d1bbc0517ac5", + "hash_cont_tokens": "62bb469d2a319d91" + }, + "harness|hendrycksTest-college_physics|5": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "c4240f372187f487", + "hash_cont_tokens": "bf103c9a1f61ec12" + }, + "harness|hendrycksTest-computer_security|5": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "70a866a1c6ae11ae", + "hash_cont_tokens": "844bd0bf669e8136" + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "29b68a5b3f3afa5f", + "hash_cont_tokens": "ff5ca3d84bb47a0b" + }, + "harness|hendrycksTest-econometrics|5": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "a4a0fc579875cdf9", + "hash_cont_tokens": "21f0989f5760198a" + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "e1c0ec634eb17ebd", + "hash_cont_tokens": "35bf6c0c1a7ee403" + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "542453ad0f99dacf", + "hash_cont_tokens": "f7d801bfd913884d" + }, + "harness|hendrycksTest-formal_logic|5": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "dacff0458f665ef2", + "hash_cont_tokens": "23f9089575432d5a" + }, + "harness|hendrycksTest-global_facts|5": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "61dec75d557c2e93", + "hash_cont_tokens": "844bd0bf669e8136" + }, + "harness|hendrycksTest-high_school_biology|5": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "d0afdf91820cacc8", + "hash_cont_tokens": "04b8293f2ab7fbbf" + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "75cd47b5490da17b", + "hash_cont_tokens": "c3deabee1deab3a3" + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "6c6256000dbf914a", + "hash_cont_tokens": "844bd0bf669e8136" + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "3e24478a8854bd77", + "hash_cont_tokens": "c4f2565ca36881d5" + }, + 
"harness|hendrycksTest-high_school_geography|5": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "a4866b51f8a7a60e", + "hash_cont_tokens": "780e569058de22be" + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "90f755f89d9fdf5e", + "hash_cont_tokens": "7994d94bfa36d003" + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "fb590ff6d9d11883", + "hash_cont_tokens": "8f5c8baf02161f10" + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "551dbc75535ad2b8", + "hash_cont_tokens": "a2c91752be5b1798" + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "d86fdf5706ec717c", + "hash_cont_tokens": "985403b262df21a4" + }, + "harness|hendrycksTest-high_school_physics|5": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "a81bca26abd92c41", + "hash_cont_tokens": "db71da66ed82b921" + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "9c10077b5cda495b", + "hash_cont_tokens": "e81cf9738ad7e157" + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "092923836e135996", + "hash_cont_tokens": "4a2d5f00cb00d9b7" + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "4ab213491f557f31", + "hash_cont_tokens": "eab825cf8fbdd085" + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "2a04fb615e6717ea", + "hash_cont_tokens": "e9bcfaa6beefb456" + }, + "harness|hendrycksTest-human_aging|5": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "39da19ee58ce07e6", + "hash_cont_tokens": "38eafdb22e9fca11" + }, + "harness|hendrycksTest-human_sexuality|5": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "f7e0441ab1c223e0", + "hash_cont_tokens": "11de075f88fc7cd2" + }, + "harness|hendrycksTest-international_law|5": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "119859c5b8103d0b", + "hash_cont_tokens": "6f8215a3de7eebd1" + }, + "harness|hendrycksTest-jurisprudence|5": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "6ec4910e741606cb", + "hash_cont_tokens": "5c77c6f472688075" + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "96d8b2554f777e3a", + "hash_cont_tokens": "25a46284b3589e0d" + }, + "harness|hendrycksTest-machine_learning|5": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "249811a7d891a411", + "hash_cont_tokens": 
"aacac708cd4c5a61" + }, + "harness|hendrycksTest-management|5": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "e54df495ffeb4f92", + "hash_cont_tokens": "d37808f586a9e9b5" + }, + "harness|hendrycksTest-marketing|5": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "e9110fe64f420eb5", + "hash_cont_tokens": "95faf210efa02f90" + }, + "harness|hendrycksTest-medical_genetics|5": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "743df5701590c1c5", + "hash_cont_tokens": "844bd0bf669e8136" + }, + "harness|hendrycksTest-miscellaneous|5": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "4a20a40ea36bad2d", + "hash_cont_tokens": "ef1ae838a09a7521" + }, + "harness|hendrycksTest-moral_disputes|5": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "10886977e5516586", + "hash_cont_tokens": "16b6c6e390eb7cea" + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "66f56ab7c3b9d662", + "hash_cont_tokens": "4130880a19c4edb0" + }, + "harness|hendrycksTest-nutrition|5": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "c05c54560499ea35", + "hash_cont_tokens": "96b81f570a84328b" + }, + "harness|hendrycksTest-philosophy|5": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "9639c3d92ff98a28", + "hash_cont_tokens": "dddff9925c9b675a" + }, + "harness|hendrycksTest-prehistory|5": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "91e98834c3a8d8d9", + "hash_cont_tokens": "e3a7592f84b44888" + }, + "harness|hendrycksTest-professional_accounting|5": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "569fa47691c73088", + "hash_cont_tokens": "f9edf462e8201551" + }, + "harness|hendrycksTest-professional_law|5": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "999e8c7cf55b590c", + "hash_cont_tokens": "a2de48df0afbaff7" + }, + "harness|hendrycksTest-professional_medicine|5": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "cb68733b835e69f0", + "hash_cont_tokens": "ecf7754754c2bb76" + }, + "harness|hendrycksTest-professional_psychology|5": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "3aa766c029099569", + "hash_cont_tokens": "30b07e31cf9b5c6f" + }, + "harness|hendrycksTest-public_relations|5": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "87b924f88832986f", + "hash_cont_tokens": "cf3600a50782c6c5" + }, + "harness|hendrycksTest-security_studies|5": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "c2b75c24a925a416", + "hash_cont_tokens": "4d1dc7c4ad251829" + }, + "harness|hendrycksTest-sociology|5": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "fb555df6139eb2c8", + "hash_cont_tokens": "d36b9d9f0f4424fe" + }, + "harness|hendrycksTest-us_foreign_policy|5": { + 
"hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "56cf1eebb25eccb1", + "hash_cont_tokens": "844bd0bf669e8136" + }, + "harness|hendrycksTest-virology|5": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "c6affac16ec860be", + "hash_cont_tokens": "30d4fa4828c5468f" + }, + "harness|hendrycksTest-world_religions|5": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "d2c5da5a69a6312e", + "hash_cont_tokens": "a0a7af55ac7ae037" + }, + "harness|truthfulqa:mc|0": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "21ee2f46c9c3649e", + "hash_cont_tokens": "84fd36aa004c8578" + } + } +} \ No newline at end of file diff --git a/eval-results/digitous/Javalion-R/results_2023-10-12T21-07-25.804829.json b/eval-results/digitous/Javalion-R/results_2023-10-12T21-07-25.804829.json new file mode 100644 index 0000000000000000000000000000000000000000..d9ede1143cbd247797c9f8232941fef96a05e6ba --- /dev/null +++ b/eval-results/digitous/Javalion-R/results_2023-10-12T21-07-25.804829.json @@ -0,0 +1,107 @@ +{ + "config_general": { + "model_name": "digitous/Javalion-R", + "model_sha": "b881231ab6ea85da2a9a139f282df85d1d18b002", + "model_size": "11.28 GB", + "model_dtype": "torch.float16", + "lighteval_sha": "0f318ecf002208468154899217b3ba7c6ae09374", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|drop|3": { + "em": 0.0010486577181208054, + "em_stderr": 0.0003314581465219256, + "f1": 0.04845847315436258, + "f1_stderr": 0.0011637240305010866 + }, + "harness|gsm8k|5": { + "acc": 0.026535253980288095, + "acc_stderr": 0.004427045987265169 + }, + "harness|winogrande|5": { + "acc": 0.654301499605367, + "acc_stderr": 0.013366596951934376 + }, + "all": { + "em": 0.0010486577181208054, + "em_stderr": 0.0003314581465219256, + "f1": 0.04845847315436258, + "f1_stderr": 0.0011637240305010866, + "acc": 0.34041837679282755, + "acc_stderr": 0.008896821469599773 + } + }, + "versions": { + "harness|drop|3": 1, + "harness|gsm8k|5": 0, + "harness|winogrande|5": 0, + "all": 0 + }, + "config_tasks": { + "harness|drop": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|drop|3": { + "hashes": { + "hash_examples": "1d27416e8324e9a3", + "hash_full_prompts": "a5513ff9a741b385", + "hash_input_tokens": "f21277d2c2d2e06c", + "hash_cont_tokens": "362ddf043e5d1da7" + }, + "truncated": 382, + "non-truncated": 9154, + "padded": 0, + "non-padded": 9536, + "effective_few_shots": 3.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "3ab9b4c5105492a3", + "hash_cont_tokens": "1191171cf35f1532" + }, + "truncated": 0, + "non-truncated": 1319, + "padded": 0, + "non-padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "84cacac1590bb0a5", + "hash_cont_tokens": "64ca3ed9b5dacc6e" + }, + "truncated": 0, + "non-truncated": 2534, + "padded": 2426, + "non-padded": 108, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + 
"hashes": { + "hash_examples": "9b4d8993161e637d", + "hash_full_prompts": "08215e527b7e60a5", + "hash_input_tokens": "26cd3631535039d0", + "hash_cont_tokens": "3eabff4ea993cdbf" + }, + "total_evaluation_time_secondes": "15949.516452550888", + "truncated": 382, + "non-truncated": 13007, + "padded": 2426, + "non-padded": 10963, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/digitous/Javelin-GPTJ/results_2023-07-19T14-13-27.511337.json b/eval-results/digitous/Javelin-GPTJ/results_2023-07-19T14-13-27.511337.json new file mode 100644 index 0000000000000000000000000000000000000000..d4289ff8a69372332e81b389dfc2265e318e6498 --- /dev/null +++ b/eval-results/digitous/Javelin-GPTJ/results_2023-07-19T14-13-27.511337.json @@ -0,0 +1,871 @@ +{ + "results": { + "harness|arc:challenge|25": { + "acc": 0.39078498293515357, + "acc_stderr": 0.014258563880513777, + "acc_norm": 0.42662116040955633, + "acc_norm_stderr": 0.014453185592920293 + }, + "harness|hellaswag|10": { + "acc": 0.5243975303724357, + "acc_stderr": 0.004983837641502894, + "acc_norm": 0.7045409281019717, + "acc_norm_stderr": 0.004553164013379556 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.26, + "acc_stderr": 0.0440844002276808, + "acc_norm": 0.26, + "acc_norm_stderr": 0.0440844002276808 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.28888888888888886, + "acc_stderr": 0.03915450630414251, + "acc_norm": 0.28888888888888886, + "acc_norm_stderr": 0.03915450630414251 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.25, + "acc_stderr": 0.03523807393012047, + "acc_norm": 0.25, + "acc_norm_stderr": 0.03523807393012047 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.27, + "acc_stderr": 0.0446196043338474, + "acc_norm": 0.27, + "acc_norm_stderr": 0.0446196043338474 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.27547169811320754, + "acc_stderr": 0.027495663683724088, + "acc_norm": 0.27547169811320754, + "acc_norm_stderr": 0.027495663683724088 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.24305555555555555, + "acc_stderr": 0.03586879280080341, + "acc_norm": 0.24305555555555555, + "acc_norm_stderr": 0.03586879280080341 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.23, + "acc_stderr": 0.04229525846816503, + "acc_norm": 0.23, + "acc_norm_stderr": 0.04229525846816503 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.18, + "acc_stderr": 0.03861229196653695, + "acc_norm": 0.18, + "acc_norm_stderr": 0.03861229196653695 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.33, + "acc_stderr": 0.047258156262526045, + "acc_norm": 0.33, + "acc_norm_stderr": 0.047258156262526045 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.24277456647398843, + "acc_stderr": 0.0326926380614177, + "acc_norm": 0.24277456647398843, + "acc_norm_stderr": 0.0326926380614177 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.20588235294117646, + "acc_stderr": 0.040233822736177476, + "acc_norm": 0.20588235294117646, + "acc_norm_stderr": 0.040233822736177476 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.3, + "acc_stderr": 0.046056618647183814, + "acc_norm": 0.3, + "acc_norm_stderr": 0.046056618647183814 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.32340425531914896, + "acc_stderr": 0.030579442773610334, + "acc_norm": 0.32340425531914896, + "acc_norm_stderr": 0.030579442773610334 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 
0.2719298245614035, + "acc_stderr": 0.04185774424022057, + "acc_norm": 0.2719298245614035, + "acc_norm_stderr": 0.04185774424022057 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.23448275862068965, + "acc_stderr": 0.035306258743465914, + "acc_norm": 0.23448275862068965, + "acc_norm_stderr": 0.035306258743465914 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.2619047619047619, + "acc_stderr": 0.022644212615525214, + "acc_norm": 0.2619047619047619, + "acc_norm_stderr": 0.022644212615525214 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.21428571428571427, + "acc_stderr": 0.03670066451047182, + "acc_norm": 0.21428571428571427, + "acc_norm_stderr": 0.03670066451047182 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.31, + "acc_stderr": 0.04648231987117316, + "acc_norm": 0.31, + "acc_norm_stderr": 0.04648231987117316 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.24838709677419354, + "acc_stderr": 0.024580028921481003, + "acc_norm": 0.24838709677419354, + "acc_norm_stderr": 0.024580028921481003 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.2660098522167488, + "acc_stderr": 0.03108982600293753, + "acc_norm": 0.2660098522167488, + "acc_norm_stderr": 0.03108982600293753 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.14, + "acc_stderr": 0.034873508801977704, + "acc_norm": 0.14, + "acc_norm_stderr": 0.034873508801977704 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.2909090909090909, + "acc_stderr": 0.03546563019624336, + "acc_norm": 0.2909090909090909, + "acc_norm_stderr": 0.03546563019624336 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.22727272727272727, + "acc_stderr": 0.02985751567338641, + "acc_norm": 0.22727272727272727, + "acc_norm_stderr": 0.02985751567338641 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.22279792746113988, + "acc_stderr": 0.03003114797764154, + "acc_norm": 0.22279792746113988, + "acc_norm_stderr": 0.03003114797764154 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.2512820512820513, + "acc_stderr": 0.021992016662370554, + "acc_norm": 0.2512820512820513, + "acc_norm_stderr": 0.021992016662370554 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.26296296296296295, + "acc_stderr": 0.026842057873833706, + "acc_norm": 0.26296296296296295, + "acc_norm_stderr": 0.026842057873833706 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.23949579831932774, + "acc_stderr": 0.02772206549336127, + "acc_norm": 0.23949579831932774, + "acc_norm_stderr": 0.02772206549336127 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.25165562913907286, + "acc_stderr": 0.03543304234389985, + "acc_norm": 0.25165562913907286, + "acc_norm_stderr": 0.03543304234389985 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.23486238532110093, + "acc_stderr": 0.018175110510343578, + "acc_norm": 0.23486238532110093, + "acc_norm_stderr": 0.018175110510343578 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.12962962962962962, + "acc_stderr": 0.022907883151288604, + "acc_norm": 0.12962962962962962, + "acc_norm_stderr": 0.022907883151288604 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.2549019607843137, + "acc_stderr": 0.03058759135160425, + "acc_norm": 0.2549019607843137, + "acc_norm_stderr": 0.03058759135160425 + }, + 
"harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.2911392405063291, + "acc_stderr": 0.029571601065753374, + "acc_norm": 0.2911392405063291, + "acc_norm_stderr": 0.029571601065753374 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.35874439461883406, + "acc_stderr": 0.032190792004199956, + "acc_norm": 0.35874439461883406, + "acc_norm_stderr": 0.032190792004199956 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.22137404580152673, + "acc_stderr": 0.03641297081313729, + "acc_norm": 0.22137404580152673, + "acc_norm_stderr": 0.03641297081313729 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.3140495867768595, + "acc_stderr": 0.04236964753041017, + "acc_norm": 0.3140495867768595, + "acc_norm_stderr": 0.04236964753041017 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.2962962962962963, + "acc_stderr": 0.04414343666854933, + "acc_norm": 0.2962962962962963, + "acc_norm_stderr": 0.04414343666854933 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.22699386503067484, + "acc_stderr": 0.032910995786157686, + "acc_norm": 0.22699386503067484, + "acc_norm_stderr": 0.032910995786157686 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.3392857142857143, + "acc_stderr": 0.04493949068613539, + "acc_norm": 0.3392857142857143, + "acc_norm_stderr": 0.04493949068613539 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.1941747572815534, + "acc_stderr": 0.03916667762822584, + "acc_norm": 0.1941747572815534, + "acc_norm_stderr": 0.03916667762822584 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.28205128205128205, + "acc_stderr": 0.02948036054954119, + "acc_norm": 0.28205128205128205, + "acc_norm_stderr": 0.02948036054954119 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.26, + "acc_stderr": 0.044084400227680794, + "acc_norm": 0.26, + "acc_norm_stderr": 0.044084400227680794 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.3065134099616858, + "acc_stderr": 0.01648695289304151, + "acc_norm": 0.3065134099616858, + "acc_norm_stderr": 0.01648695289304151 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.27167630057803466, + "acc_stderr": 0.023948512905468365, + "acc_norm": 0.27167630057803466, + "acc_norm_stderr": 0.023948512905468365 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.2424581005586592, + "acc_stderr": 0.014333522059217889, + "acc_norm": 0.2424581005586592, + "acc_norm_stderr": 0.014333522059217889 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.2647058823529412, + "acc_stderr": 0.025261691219729494, + "acc_norm": 0.2647058823529412, + "acc_norm_stderr": 0.025261691219729494 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.2829581993569132, + "acc_stderr": 0.02558306248998482, + "acc_norm": 0.2829581993569132, + "acc_norm_stderr": 0.02558306248998482 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.27469135802469136, + "acc_stderr": 0.02483605786829468, + "acc_norm": 0.27469135802469136, + "acc_norm_stderr": 0.02483605786829468 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.2801418439716312, + "acc_stderr": 0.026789172351140242, + "acc_norm": 0.2801418439716312, + "acc_norm_stderr": 0.026789172351140242 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.2861799217731421, + "acc_stderr": 0.011543642878150757, + "acc_norm": 0.2861799217731421, + "acc_norm_stderr": 0.011543642878150757 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.19852941176470587, + 
"acc_stderr": 0.02423101337054111, + "acc_norm": 0.19852941176470587, + "acc_norm_stderr": 0.02423101337054111 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.25326797385620914, + "acc_stderr": 0.01759348689536683, + "acc_norm": 0.25326797385620914, + "acc_norm_stderr": 0.01759348689536683 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.32727272727272727, + "acc_stderr": 0.04494290866252088, + "acc_norm": 0.32727272727272727, + "acc_norm_stderr": 0.04494290866252088 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.27755102040816326, + "acc_stderr": 0.028666857790274655, + "acc_norm": 0.27755102040816326, + "acc_norm_stderr": 0.028666857790274655 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.2935323383084577, + "acc_stderr": 0.03220024104534204, + "acc_norm": 0.2935323383084577, + "acc_norm_stderr": 0.03220024104534204 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.3, + "acc_stderr": 0.046056618647183814, + "acc_norm": 0.3, + "acc_norm_stderr": 0.046056618647183814 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.3253012048192771, + "acc_stderr": 0.036471685236832266, + "acc_norm": 0.3253012048192771, + "acc_norm_stderr": 0.036471685236832266 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.25146198830409355, + "acc_stderr": 0.033275044238468436, + "acc_norm": 0.25146198830409355, + "acc_norm_stderr": 0.033275044238468436 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.21297429620563035, + "mc1_stderr": 0.014332203787059683, + "mc2": 0.36081837896921476, + "mc2_stderr": 0.013694636113835743 + }, + "all": { + "acc": 0.2686064553560151, + "acc_stderr": 0.03192323966390725, + "acc_norm": 0.2722671261221835, + "acc_norm_stderr": 0.03191923878397985, + "mc1": 0.21297429620563035, + "mc1_stderr": 0.014332203787059683, + "mc2": 0.36081837896921476, + "mc2_stderr": 0.013694636113835743 + } + }, + "versions": { + "harness|arc:challenge|25": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + 
"harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "all": 0 + }, + "config": { + "model_name": "digitous/Javelin-GPTJ", + "model_sha": "bee7068ab002784420a1a30170db3906185359f2", + "model_dtype": "torch.float16", + "lighteval_sha": "43cff840721bd0214adb4e29236a5e2ca1813937", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null + }, + "task_config": { + "harness|arc:challenge": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + 
"harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task" + }, + "hashes": { + "harness|arc:challenge|25": { + "hash_examples": "fb8c51b1872daeda", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "1b78325b154497a6", + "hash_cont_tokens": "ed17e576dbafa5da" + }, + "harness|hellaswag|10": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "97de5fb5652ec7fa", + "hash_cont_tokens": "0875c25c8fc0a94d" + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "38f6980885e34dfd", + "hash_cont_tokens": "844bd0bf669e8136" + }, + "harness|hendrycksTest-anatomy|5": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "3ed9431cd09b2a53", + "hash_cont_tokens": "aa3ffb1a6e4356f5" + }, + "harness|hendrycksTest-astronomy|5": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "a79fd75ecff4dacc", + "hash_cont_tokens": "18cfffb76bc8f0d1" + }, + "harness|hendrycksTest-business_ethics|5": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "178d5666661bf5e1", + "hash_cont_tokens": "844bd0bf669e8136" + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "c926698f7ff06973", + "hash_cont_tokens": "cd61f7de0830a75a" + }, + "harness|hendrycksTest-college_biology|5": { + 
"hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "242f772c5e78312a", + "hash_cont_tokens": "16b3626c8a5e3797" + }, + "harness|hendrycksTest-college_chemistry|5": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "8502d8627d2d7aad", + "hash_cont_tokens": "844bd0bf669e8136" + }, + "harness|hendrycksTest-college_computer_science|5": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "8bf46ce3a98e6e3f", + "hash_cont_tokens": "844bd0bf669e8136" + }, + "harness|hendrycksTest-college_mathematics|5": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "ff09ef7f164943cd", + "hash_cont_tokens": "844bd0bf669e8136" + }, + "harness|hendrycksTest-college_medicine|5": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "af38d1bbc0517ac5", + "hash_cont_tokens": "62bb469d2a319d91" + }, + "harness|hendrycksTest-college_physics|5": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "c4240f372187f487", + "hash_cont_tokens": "bf103c9a1f61ec12" + }, + "harness|hendrycksTest-computer_security|5": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "70a866a1c6ae11ae", + "hash_cont_tokens": "844bd0bf669e8136" + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "29b68a5b3f3afa5f", + "hash_cont_tokens": "ff5ca3d84bb47a0b" + }, + "harness|hendrycksTest-econometrics|5": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "a4a0fc579875cdf9", + "hash_cont_tokens": "21f0989f5760198a" + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "e1c0ec634eb17ebd", + "hash_cont_tokens": "35bf6c0c1a7ee403" + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "542453ad0f99dacf", + "hash_cont_tokens": "f7d801bfd913884d" + }, + "harness|hendrycksTest-formal_logic|5": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "dacff0458f665ef2", + "hash_cont_tokens": "23f9089575432d5a" + }, + "harness|hendrycksTest-global_facts|5": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "61dec75d557c2e93", + "hash_cont_tokens": "844bd0bf669e8136" + }, + "harness|hendrycksTest-high_school_biology|5": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "d0afdf91820cacc8", + "hash_cont_tokens": "04b8293f2ab7fbbf" + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "75cd47b5490da17b", + "hash_cont_tokens": "c3deabee1deab3a3" + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "6c6256000dbf914a", + "hash_cont_tokens": "844bd0bf669e8136" + }, + "harness|hendrycksTest-high_school_european_history|5": { + 
"hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "3e24478a8854bd77", + "hash_cont_tokens": "c4f2565ca36881d5" + }, + "harness|hendrycksTest-high_school_geography|5": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "a4866b51f8a7a60e", + "hash_cont_tokens": "780e569058de22be" + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "90f755f89d9fdf5e", + "hash_cont_tokens": "7994d94bfa36d003" + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "fb590ff6d9d11883", + "hash_cont_tokens": "8f5c8baf02161f10" + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "551dbc75535ad2b8", + "hash_cont_tokens": "a2c91752be5b1798" + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "d86fdf5706ec717c", + "hash_cont_tokens": "985403b262df21a4" + }, + "harness|hendrycksTest-high_school_physics|5": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "a81bca26abd92c41", + "hash_cont_tokens": "db71da66ed82b921" + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "9c10077b5cda495b", + "hash_cont_tokens": "e81cf9738ad7e157" + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "092923836e135996", + "hash_cont_tokens": "4a2d5f00cb00d9b7" + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "4ab213491f557f31", + "hash_cont_tokens": "eab825cf8fbdd085" + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "2a04fb615e6717ea", + "hash_cont_tokens": "e9bcfaa6beefb456" + }, + "harness|hendrycksTest-human_aging|5": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "39da19ee58ce07e6", + "hash_cont_tokens": "38eafdb22e9fca11" + }, + "harness|hendrycksTest-human_sexuality|5": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "f7e0441ab1c223e0", + "hash_cont_tokens": "11de075f88fc7cd2" + }, + "harness|hendrycksTest-international_law|5": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "119859c5b8103d0b", + "hash_cont_tokens": "6f8215a3de7eebd1" + }, + "harness|hendrycksTest-jurisprudence|5": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "6ec4910e741606cb", + "hash_cont_tokens": "5c77c6f472688075" + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "96d8b2554f777e3a", + "hash_cont_tokens": "25a46284b3589e0d" + }, + 
"harness|hendrycksTest-machine_learning|5": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "249811a7d891a411", + "hash_cont_tokens": "aacac708cd4c5a61" + }, + "harness|hendrycksTest-management|5": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "e54df495ffeb4f92", + "hash_cont_tokens": "d37808f586a9e9b5" + }, + "harness|hendrycksTest-marketing|5": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "e9110fe64f420eb5", + "hash_cont_tokens": "95faf210efa02f90" + }, + "harness|hendrycksTest-medical_genetics|5": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "743df5701590c1c5", + "hash_cont_tokens": "844bd0bf669e8136" + }, + "harness|hendrycksTest-miscellaneous|5": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "4a20a40ea36bad2d", + "hash_cont_tokens": "ef1ae838a09a7521" + }, + "harness|hendrycksTest-moral_disputes|5": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "10886977e5516586", + "hash_cont_tokens": "16b6c6e390eb7cea" + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "66f56ab7c3b9d662", + "hash_cont_tokens": "4130880a19c4edb0" + }, + "harness|hendrycksTest-nutrition|5": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "c05c54560499ea35", + "hash_cont_tokens": "96b81f570a84328b" + }, + "harness|hendrycksTest-philosophy|5": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "9639c3d92ff98a28", + "hash_cont_tokens": "dddff9925c9b675a" + }, + "harness|hendrycksTest-prehistory|5": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "91e98834c3a8d8d9", + "hash_cont_tokens": "e3a7592f84b44888" + }, + "harness|hendrycksTest-professional_accounting|5": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "569fa47691c73088", + "hash_cont_tokens": "f9edf462e8201551" + }, + "harness|hendrycksTest-professional_law|5": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "999e8c7cf55b590c", + "hash_cont_tokens": "a2de48df0afbaff7" + }, + "harness|hendrycksTest-professional_medicine|5": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "cb68733b835e69f0", + "hash_cont_tokens": "ecf7754754c2bb76" + }, + "harness|hendrycksTest-professional_psychology|5": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "3aa766c029099569", + "hash_cont_tokens": "30b07e31cf9b5c6f" + }, + "harness|hendrycksTest-public_relations|5": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "87b924f88832986f", + "hash_cont_tokens": "cf3600a50782c6c5" + }, + "harness|hendrycksTest-security_studies|5": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "c2b75c24a925a416", + "hash_cont_tokens": "4d1dc7c4ad251829" + }, + "harness|hendrycksTest-sociology|5": { + "hash_examples": 
"e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "fb555df6139eb2c8", + "hash_cont_tokens": "d36b9d9f0f4424fe" + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "56cf1eebb25eccb1", + "hash_cont_tokens": "844bd0bf669e8136" + }, + "harness|hendrycksTest-virology|5": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "c6affac16ec860be", + "hash_cont_tokens": "30d4fa4828c5468f" + }, + "harness|hendrycksTest-world_religions|5": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "d2c5da5a69a6312e", + "hash_cont_tokens": "a0a7af55ac7ae037" + }, + "harness|truthfulqa:mc|0": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "21ee2f46c9c3649e", + "hash_cont_tokens": "84fd36aa004c8578" + } + } +} \ No newline at end of file diff --git a/eval-results/digitous/Javelin-GPTJ/results_2023-10-16T01-31-09.179674.json b/eval-results/digitous/Javelin-GPTJ/results_2023-10-16T01-31-09.179674.json new file mode 100644 index 0000000000000000000000000000000000000000..3893d1c6744947d9d583d638c0e1376a18f185d1 --- /dev/null +++ b/eval-results/digitous/Javelin-GPTJ/results_2023-10-16T01-31-09.179674.json @@ -0,0 +1,107 @@ +{ + "config_general": { + "model_name": "digitous/Javelin-GPTJ", + "model_sha": "bee7068ab002784420a1a30170db3906185359f2", + "model_size": "11.28 GB", + "model_dtype": "torch.float16", + "lighteval_sha": "0f318ecf002208468154899217b3ba7c6ae09374", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|drop|3": { + "em": 0.0008389261744966443, + "em_stderr": 0.0002964962989801232, + "f1": 0.04767722315436259, + "f1_stderr": 0.0011834240833723825 + }, + "harness|gsm8k|5": { + "acc": 0.01819560272934041, + "acc_stderr": 0.0036816118940738727 + }, + "harness|winogrande|5": { + "acc": 0.6416732438831886, + "acc_stderr": 0.01347658117256753 + }, + "all": { + "em": 0.0008389261744966443, + "em_stderr": 0.0002964962989801232, + "f1": 0.04767722315436259, + "f1_stderr": 0.0011834240833723825, + "acc": 0.3299344233062645, + "acc_stderr": 0.008579096533320701 + } + }, + "versions": { + "harness|drop|3": 1, + "harness|gsm8k|5": 0, + "harness|winogrande|5": 0, + "all": 0 + }, + "config_tasks": { + "harness|drop": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|drop|3": { + "hashes": { + "hash_examples": "1d27416e8324e9a3", + "hash_full_prompts": "a5513ff9a741b385", + "hash_input_tokens": "f21277d2c2d2e06c", + "hash_cont_tokens": "0a7b2d379c75c334" + }, + "truncated": 382, + "non-truncated": 9154, + "padded": 0, + "non-padded": 9536, + "effective_few_shots": 3.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "3ab9b4c5105492a3", + "hash_cont_tokens": "7b532b871babf8b0" + }, + "truncated": 0, + "non-truncated": 1319, + "padded": 0, + "non-padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "84cacac1590bb0a5", + 
"hash_cont_tokens": "64ca3ed9b5dacc6e" + }, + "truncated": 0, + "non-truncated": 2534, + "padded": 2426, + "non-padded": 108, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "9b4d8993161e637d", + "hash_full_prompts": "08215e527b7e60a5", + "hash_input_tokens": "26cd3631535039d0", + "hash_cont_tokens": "c25009e69e24f63d" + }, + "total_evaluation_time_secondes": "9340.589137792587", + "truncated": 382, + "non-truncated": 13007, + "padded": 2426, + "non-padded": 10963, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/digitous/Javelin-R/results_2023-07-19T19-50-05.826283.json b/eval-results/digitous/Javelin-R/results_2023-07-19T19-50-05.826283.json new file mode 100644 index 0000000000000000000000000000000000000000..8bd41577c2d13625a9de37bfc1b7cec2834effbf --- /dev/null +++ b/eval-results/digitous/Javelin-R/results_2023-07-19T19-50-05.826283.json @@ -0,0 +1,871 @@ +{ + "results": { + "harness|arc:challenge|25": { + "acc": 0.386518771331058, + "acc_stderr": 0.01423008476191048, + "acc_norm": 0.41638225255972694, + "acc_norm_stderr": 0.014405618279436178 + }, + "harness|hellaswag|10": { + "acc": 0.5119498107946624, + "acc_stderr": 0.004988356146499033, + "acc_norm": 0.6901015733917546, + "acc_norm_stderr": 0.004615063817741851 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.28, + "acc_stderr": 0.04512608598542129, + "acc_norm": 0.28, + "acc_norm_stderr": 0.04512608598542129 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.34814814814814815, + "acc_stderr": 0.041153246103369526, + "acc_norm": 0.34814814814814815, + "acc_norm_stderr": 0.041153246103369526 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.3223684210526316, + "acc_stderr": 0.03803510248351585, + "acc_norm": 0.3223684210526316, + "acc_norm_stderr": 0.03803510248351585 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.27, + "acc_stderr": 0.0446196043338474, + "acc_norm": 0.27, + "acc_norm_stderr": 0.0446196043338474 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.33584905660377357, + "acc_stderr": 0.02906722014664483, + "acc_norm": 0.33584905660377357, + "acc_norm_stderr": 0.02906722014664483 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.2708333333333333, + "acc_stderr": 0.03716177437566016, + "acc_norm": 0.2708333333333333, + "acc_norm_stderr": 0.03716177437566016 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.22, + "acc_stderr": 0.041633319989322695, + "acc_norm": 0.22, + "acc_norm_stderr": 0.041633319989322695 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.24, + "acc_stderr": 0.042923469599092816, + "acc_norm": 0.24, + "acc_norm_stderr": 0.042923469599092816 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.31, + "acc_stderr": 0.04648231987117316, + "acc_norm": 0.31, + "acc_norm_stderr": 0.04648231987117316 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.3063583815028902, + "acc_stderr": 0.03514942551267437, + "acc_norm": 0.3063583815028902, + "acc_norm_stderr": 0.03514942551267437 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.21568627450980393, + "acc_stderr": 0.04092563958237655, + "acc_norm": 0.21568627450980393, + "acc_norm_stderr": 0.04092563958237655 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.34, + "acc_stderr": 0.04760952285695235, + "acc_norm": 0.34, + "acc_norm_stderr": 0.04760952285695235 + }, + 
"harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.3574468085106383, + "acc_stderr": 0.03132941789476425, + "acc_norm": 0.3574468085106383, + "acc_norm_stderr": 0.03132941789476425 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.30701754385964913, + "acc_stderr": 0.0433913832257986, + "acc_norm": 0.30701754385964913, + "acc_norm_stderr": 0.0433913832257986 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.296551724137931, + "acc_stderr": 0.03806142687309994, + "acc_norm": 0.296551724137931, + "acc_norm_stderr": 0.03806142687309994 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.26455026455026454, + "acc_stderr": 0.022717467897708617, + "acc_norm": 0.26455026455026454, + "acc_norm_stderr": 0.022717467897708617 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.16666666666666666, + "acc_stderr": 0.03333333333333337, + "acc_norm": 0.16666666666666666, + "acc_norm_stderr": 0.03333333333333337 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.35, + "acc_stderr": 0.047937248544110196, + "acc_norm": 0.35, + "acc_norm_stderr": 0.047937248544110196 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.2838709677419355, + "acc_stderr": 0.02564938106302928, + "acc_norm": 0.2838709677419355, + "acc_norm_stderr": 0.02564938106302928 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.29064039408866993, + "acc_stderr": 0.03194740072265541, + "acc_norm": 0.29064039408866993, + "acc_norm_stderr": 0.03194740072265541 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.22, + "acc_stderr": 0.041633319989322695, + "acc_norm": 0.22, + "acc_norm_stderr": 0.041633319989322695 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.3212121212121212, + "acc_stderr": 0.0364620496325381, + "acc_norm": 0.3212121212121212, + "acc_norm_stderr": 0.0364620496325381 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.30303030303030304, + "acc_stderr": 0.032742879140268674, + "acc_norm": 0.30303030303030304, + "acc_norm_stderr": 0.032742879140268674 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.31088082901554404, + "acc_stderr": 0.03340361906276585, + "acc_norm": 0.31088082901554404, + "acc_norm_stderr": 0.03340361906276585 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.30256410256410254, + "acc_stderr": 0.02329088805377274, + "acc_norm": 0.30256410256410254, + "acc_norm_stderr": 0.02329088805377274 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.2518518518518518, + "acc_stderr": 0.02646611753895991, + "acc_norm": 0.2518518518518518, + "acc_norm_stderr": 0.02646611753895991 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.31092436974789917, + "acc_stderr": 0.030066761582977927, + "acc_norm": 0.31092436974789917, + "acc_norm_stderr": 0.030066761582977927 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.25165562913907286, + "acc_stderr": 0.03543304234389985, + "acc_norm": 0.25165562913907286, + "acc_norm_stderr": 0.03543304234389985 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.29541284403669726, + "acc_stderr": 0.019560619182975997, + "acc_norm": 0.29541284403669726, + "acc_norm_stderr": 0.019560619182975997 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.16203703703703703, + "acc_stderr": 0.025130453652268455, + "acc_norm": 0.16203703703703703, + "acc_norm_stderr": 
0.025130453652268455 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.3382352941176471, + "acc_stderr": 0.03320574612945432, + "acc_norm": 0.3382352941176471, + "acc_norm_stderr": 0.03320574612945432 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.3670886075949367, + "acc_stderr": 0.03137624072561619, + "acc_norm": 0.3670886075949367, + "acc_norm_stderr": 0.03137624072561619 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.39461883408071746, + "acc_stderr": 0.03280400504755291, + "acc_norm": 0.39461883408071746, + "acc_norm_stderr": 0.03280400504755291 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.2824427480916031, + "acc_stderr": 0.03948406125768361, + "acc_norm": 0.2824427480916031, + "acc_norm_stderr": 0.03948406125768361 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.3884297520661157, + "acc_stderr": 0.04449270350068383, + "acc_norm": 0.3884297520661157, + "acc_norm_stderr": 0.04449270350068383 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.35185185185185186, + "acc_stderr": 0.046166311118017125, + "acc_norm": 0.35185185185185186, + "acc_norm_stderr": 0.046166311118017125 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.25153374233128833, + "acc_stderr": 0.03408997886857529, + "acc_norm": 0.25153374233128833, + "acc_norm_stderr": 0.03408997886857529 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.2767857142857143, + "acc_stderr": 0.042466243366976256, + "acc_norm": 0.2767857142857143, + "acc_norm_stderr": 0.042466243366976256 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.3592233009708738, + "acc_stderr": 0.04750458399041694, + "acc_norm": 0.3592233009708738, + "acc_norm_stderr": 0.04750458399041694 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.29914529914529914, + "acc_stderr": 0.029996951858349483, + "acc_norm": 0.29914529914529914, + "acc_norm_stderr": 0.029996951858349483 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.31, + "acc_stderr": 0.04648231987117316, + "acc_norm": 0.31, + "acc_norm_stderr": 0.04648231987117316 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.36015325670498083, + "acc_stderr": 0.017166362471369306, + "acc_norm": 0.36015325670498083, + "acc_norm_stderr": 0.017166362471369306 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.30346820809248554, + "acc_stderr": 0.024752411960917212, + "acc_norm": 0.30346820809248554, + "acc_norm_stderr": 0.024752411960917212 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.2424581005586592, + "acc_stderr": 0.014333522059217889, + "acc_norm": 0.2424581005586592, + "acc_norm_stderr": 0.014333522059217889 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.369281045751634, + "acc_stderr": 0.02763417668960266, + "acc_norm": 0.369281045751634, + "acc_norm_stderr": 0.02763417668960266 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.3086816720257235, + "acc_stderr": 0.026236965881153262, + "acc_norm": 0.3086816720257235, + "acc_norm_stderr": 0.026236965881153262 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.3117283950617284, + "acc_stderr": 0.02577311116963045, + "acc_norm": 0.3117283950617284, + "acc_norm_stderr": 0.02577311116963045 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.2872340425531915, + "acc_stderr": 0.026992199173064356, + "acc_norm": 0.2872340425531915, + "acc_norm_stderr": 0.026992199173064356 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 
0.32529335071707954, + "acc_stderr": 0.011965311536571528, + "acc_norm": 0.32529335071707954, + "acc_norm_stderr": 0.011965311536571528 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.3014705882352941, + "acc_stderr": 0.027875982114273168, + "acc_norm": 0.3014705882352941, + "acc_norm_stderr": 0.027875982114273168 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.2875816993464052, + "acc_stderr": 0.018311653053648222, + "acc_norm": 0.2875816993464052, + "acc_norm_stderr": 0.018311653053648222 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.36363636363636365, + "acc_stderr": 0.04607582090719976, + "acc_norm": 0.36363636363636365, + "acc_norm_stderr": 0.04607582090719976 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.42857142857142855, + "acc_stderr": 0.031680911612338825, + "acc_norm": 0.42857142857142855, + "acc_norm_stderr": 0.031680911612338825 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.40298507462686567, + "acc_stderr": 0.034683432951111266, + "acc_norm": 0.40298507462686567, + "acc_norm_stderr": 0.034683432951111266 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.39, + "acc_stderr": 0.04902071300001975, + "acc_norm": 0.39, + "acc_norm_stderr": 0.04902071300001975 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.35542168674698793, + "acc_stderr": 0.03726214354322415, + "acc_norm": 0.35542168674698793, + "acc_norm_stderr": 0.03726214354322415 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.3333333333333333, + "acc_stderr": 0.036155076303109344, + "acc_norm": 0.3333333333333333, + "acc_norm_stderr": 0.036155076303109344 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.21664626682986537, + "mc1_stderr": 0.014421468452506978, + "mc2": 0.34500578848693414, + "mc2_stderr": 0.013869193515420503 + }, + "all": { + "acc": 0.31177422110109976, + "acc_stderr": 0.033417812231248484, + "acc_norm": 0.31529990319984147, + "acc_norm_stderr": 0.03341446038699032, + "mc1": 0.21664626682986537, + "mc1_stderr": 0.014421468452506978, + "mc2": 0.34500578848693414, + "mc2_stderr": 0.013869193515420503 + } + }, + "versions": { + "harness|arc:challenge|25": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + 
"harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "all": 0 + }, + "config": { + "model_name": "digitous/Javelin-R", + "model_sha": "4c4a5caf5d9049a47f5565b72e5a53dede08ac8b", + "model_dtype": "torch.float16", + "lighteval_sha": "43cff840721bd0214adb4e29236a5e2ca1813937", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null + }, + "task_config": { + "harness|arc:challenge": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + 
"harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task" + }, + "hashes": { + "harness|arc:challenge|25": { + "hash_examples": "fb8c51b1872daeda", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "1b78325b154497a6", + "hash_cont_tokens": "c6e2e25e2b25a621" + }, + "harness|hellaswag|10": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "97de5fb5652ec7fa", + "hash_cont_tokens": "8ad5f1a3e4068f36" + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "38f6980885e34dfd", + "hash_cont_tokens": "844bd0bf669e8136" + }, + "harness|hendrycksTest-anatomy|5": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "3ed9431cd09b2a53", + "hash_cont_tokens": "aa3ffb1a6e4356f5" + }, + "harness|hendrycksTest-astronomy|5": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "a79fd75ecff4dacc", + "hash_cont_tokens": "ca7527d5bdfd389a" + }, + "harness|hendrycksTest-business_ethics|5": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "178d5666661bf5e1", + "hash_cont_tokens": "08a1fa6c8dde9a82" + }, + "harness|hendrycksTest-clinical_knowledge|5": { + 
"hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "c926698f7ff06973", + "hash_cont_tokens": "cd61f7de0830a75a" + }, + "harness|hendrycksTest-college_biology|5": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "242f772c5e78312a", + "hash_cont_tokens": "b0c14ed86adbcb56" + }, + "harness|hendrycksTest-college_chemistry|5": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "8502d8627d2d7aad", + "hash_cont_tokens": "844bd0bf669e8136" + }, + "harness|hendrycksTest-college_computer_science|5": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "8bf46ce3a98e6e3f", + "hash_cont_tokens": "3cf1924b14cbf906" + }, + "harness|hendrycksTest-college_mathematics|5": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "ff09ef7f164943cd", + "hash_cont_tokens": "d09bf08193410dfa" + }, + "harness|hendrycksTest-college_medicine|5": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "af38d1bbc0517ac5", + "hash_cont_tokens": "62bb469d2a319d91" + }, + "harness|hendrycksTest-college_physics|5": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "c4240f372187f487", + "hash_cont_tokens": "bf103c9a1f61ec12" + }, + "harness|hendrycksTest-computer_security|5": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "70a866a1c6ae11ae", + "hash_cont_tokens": "844bd0bf669e8136" + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "29b68a5b3f3afa5f", + "hash_cont_tokens": "ff5ca3d84bb47a0b" + }, + "harness|hendrycksTest-econometrics|5": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "a4a0fc579875cdf9", + "hash_cont_tokens": "f3ed369e135c0e74" + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "e1c0ec634eb17ebd", + "hash_cont_tokens": "35bf6c0c1a7ee403" + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "542453ad0f99dacf", + "hash_cont_tokens": "e69647d0f0359a4e" + }, + "harness|hendrycksTest-formal_logic|5": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "dacff0458f665ef2", + "hash_cont_tokens": "2ef491ecaa0b411b" + }, + "harness|hendrycksTest-global_facts|5": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "61dec75d557c2e93", + "hash_cont_tokens": "844bd0bf669e8136" + }, + "harness|hendrycksTest-high_school_biology|5": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "d0afdf91820cacc8", + "hash_cont_tokens": "2f65e8345a68d860" + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "75cd47b5490da17b", + "hash_cont_tokens": "c3deabee1deab3a3" + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hash_examples": 
"8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "6c6256000dbf914a", + "hash_cont_tokens": "ec161287ac6222f4" + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "3e24478a8854bd77", + "hash_cont_tokens": "c4f2565ca36881d5" + }, + "harness|hendrycksTest-high_school_geography|5": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "a4866b51f8a7a60e", + "hash_cont_tokens": "780e569058de22be" + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "90f755f89d9fdf5e", + "hash_cont_tokens": "9da45062757ae791" + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "fb590ff6d9d11883", + "hash_cont_tokens": "8f5c8baf02161f10" + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "551dbc75535ad2b8", + "hash_cont_tokens": "fdea101837ab4409" + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "d86fdf5706ec717c", + "hash_cont_tokens": "985403b262df21a4" + }, + "harness|hendrycksTest-high_school_physics|5": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "a81bca26abd92c41", + "hash_cont_tokens": "56be0c12b78c81a3" + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "9c10077b5cda495b", + "hash_cont_tokens": "f524cf6fe64b2a7f" + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "092923836e135996", + "hash_cont_tokens": "421b3dc903711e3d" + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "4ab213491f557f31", + "hash_cont_tokens": "eab825cf8fbdd085" + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "2a04fb615e6717ea", + "hash_cont_tokens": "e1610a0b694e7b3a" + }, + "harness|hendrycksTest-human_aging|5": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "39da19ee58ce07e6", + "hash_cont_tokens": "38eafdb22e9fca11" + }, + "harness|hendrycksTest-human_sexuality|5": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "f7e0441ab1c223e0", + "hash_cont_tokens": "11de075f88fc7cd2" + }, + "harness|hendrycksTest-international_law|5": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "119859c5b8103d0b", + "hash_cont_tokens": "0229c63f045574c2" + }, + "harness|hendrycksTest-jurisprudence|5": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "6ec4910e741606cb", + "hash_cont_tokens": "5c77c6f472688075" + }, + 
"harness|hendrycksTest-logical_fallacies|5": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "96d8b2554f777e3a", + "hash_cont_tokens": "25a46284b3589e0d" + }, + "harness|hendrycksTest-machine_learning|5": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "249811a7d891a411", + "hash_cont_tokens": "d11f2c877fe691dc" + }, + "harness|hendrycksTest-management|5": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "e54df495ffeb4f92", + "hash_cont_tokens": "d37808f586a9e9b5" + }, + "harness|hendrycksTest-marketing|5": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "e9110fe64f420eb5", + "hash_cont_tokens": "95faf210efa02f90" + }, + "harness|hendrycksTest-medical_genetics|5": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "743df5701590c1c5", + "hash_cont_tokens": "844bd0bf669e8136" + }, + "harness|hendrycksTest-miscellaneous|5": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "4a20a40ea36bad2d", + "hash_cont_tokens": "ef1ae838a09a7521" + }, + "harness|hendrycksTest-moral_disputes|5": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "10886977e5516586", + "hash_cont_tokens": "05c35d0e7dd2c7d4" + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "66f56ab7c3b9d662", + "hash_cont_tokens": "f1e9e326e9540108" + }, + "harness|hendrycksTest-nutrition|5": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "c05c54560499ea35", + "hash_cont_tokens": "027ac34198453c9e" + }, + "harness|hendrycksTest-philosophy|5": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "9639c3d92ff98a28", + "hash_cont_tokens": "dddff9925c9b675a" + }, + "harness|hendrycksTest-prehistory|5": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "91e98834c3a8d8d9", + "hash_cont_tokens": "030e5bb46551865c" + }, + "harness|hendrycksTest-professional_accounting|5": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "569fa47691c73088", + "hash_cont_tokens": "42b23299e8bae480" + }, + "harness|hendrycksTest-professional_law|5": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "999e8c7cf55b590c", + "hash_cont_tokens": "a2de48df0afbaff7" + }, + "harness|hendrycksTest-professional_medicine|5": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "cb68733b835e69f0", + "hash_cont_tokens": "33dc7eccd5de31ae" + }, + "harness|hendrycksTest-professional_psychology|5": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "3aa766c029099569", + "hash_cont_tokens": "2a666dc39f1f52ac" + }, + "harness|hendrycksTest-public_relations|5": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "87b924f88832986f", + "hash_cont_tokens": "cf3600a50782c6c5" + }, + "harness|hendrycksTest-security_studies|5": { + "hash_examples": 
"62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "c2b75c24a925a416", + "hash_cont_tokens": "2e9916279a4cae95" + }, + "harness|hendrycksTest-sociology|5": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "fb555df6139eb2c8", + "hash_cont_tokens": "555f7a55738bbf37" + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "56cf1eebb25eccb1", + "hash_cont_tokens": "844bd0bf669e8136" + }, + "harness|hendrycksTest-virology|5": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "c6affac16ec860be", + "hash_cont_tokens": "30d4fa4828c5468f" + }, + "harness|hendrycksTest-world_religions|5": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "d2c5da5a69a6312e", + "hash_cont_tokens": "984061eb58124367" + }, + "harness|truthfulqa:mc|0": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "21ee2f46c9c3649e", + "hash_cont_tokens": "f41d0880e9a23f4e" + } + } +} \ No newline at end of file diff --git a/eval-results/digitous/Javelin-R/results_2023-10-17T16-47-23.562896.json b/eval-results/digitous/Javelin-R/results_2023-10-17T16-47-23.562896.json new file mode 100644 index 0000000000000000000000000000000000000000..567ab11dc1764b5c06c7bac89fa603393841243f --- /dev/null +++ b/eval-results/digitous/Javelin-R/results_2023-10-17T16-47-23.562896.json @@ -0,0 +1,107 @@ +{ + "config_general": { + "model_name": "digitous/Javelin-R", + "model_sha": "4c4a5caf5d9049a47f5565b72e5a53dede08ac8b", + "model_size": "11.28 GB", + "model_dtype": "torch.float16", + "lighteval_sha": "0f318ecf002208468154899217b3ba7c6ae09374", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|drop|3": { + "em": 0.0010486577181208054, + "em_stderr": 0.0003314581465219256, + "f1": 0.05006606543624186, + "f1_stderr": 0.001221286433761839 + }, + "harness|gsm8k|5": { + "acc": 0.016679302501895376, + "acc_stderr": 0.0035275958887224313 + }, + "harness|winogrande|5": { + "acc": 0.6479873717442778, + "acc_stderr": 0.01342287482492972 + }, + "all": { + "em": 0.0010486577181208054, + "em_stderr": 0.0003314581465219256, + "f1": 0.05006606543624186, + "f1_stderr": 0.001221286433761839, + "acc": 0.3323333371230866, + "acc_stderr": 0.008475235356826075 + } + }, + "versions": { + "harness|drop|3": 1, + "harness|gsm8k|5": 0, + "harness|winogrande|5": 0, + "all": 0 + }, + "config_tasks": { + "harness|drop": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|drop|3": { + "hashes": { + "hash_examples": "1d27416e8324e9a3", + "hash_full_prompts": "a5513ff9a741b385", + "hash_input_tokens": "f21277d2c2d2e06c", + "hash_cont_tokens": "b7c474f896e3c847" + }, + "truncated": 382, + "non-truncated": 9154, + "padded": 0, + "non-padded": 9536, + "effective_few_shots": 3.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "3ab9b4c5105492a3", + "hash_cont_tokens": "92e74bc0820f3cda" + }, + "truncated": 0, + "non-truncated": 1319, + "padded": 0, + "non-padded": 1319, + "effective_few_shots": 5.0, + 
"num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "84cacac1590bb0a5", + "hash_cont_tokens": "64ca3ed9b5dacc6e" + }, + "truncated": 0, + "non-truncated": 2534, + "padded": 2426, + "non-padded": 108, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "9b4d8993161e637d", + "hash_full_prompts": "08215e527b7e60a5", + "hash_input_tokens": "26cd3631535039d0", + "hash_cont_tokens": "76296e50ae953a1e" + }, + "total_evaluation_time_secondes": "9818.065600633621", + "truncated": 382, + "non-truncated": 13007, + "padded": 2426, + "non-padded": 10963, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/digitous/Skegma-GPTJ/results_2023-07-19T19-58-51.471216.json b/eval-results/digitous/Skegma-GPTJ/results_2023-07-19T19-58-51.471216.json new file mode 100644 index 0000000000000000000000000000000000000000..ab333c03b0c7777905e756785822dbcbae95439a --- /dev/null +++ b/eval-results/digitous/Skegma-GPTJ/results_2023-07-19T19-58-51.471216.json @@ -0,0 +1,871 @@ +{ + "results": { + "harness|arc:challenge|25": { + "acc": 0.39505119453924914, + "acc_stderr": 0.014285898292938174, + "acc_norm": 0.4377133105802048, + "acc_norm_stderr": 0.014497573881108282 + }, + "harness|hellaswag|10": { + "acc": 0.5075682135032862, + "acc_stderr": 0.004989209770743236, + "acc_norm": 0.6921927902808206, + "acc_norm_stderr": 0.004606429684604543 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.26, + "acc_stderr": 0.04408440022768081, + "acc_norm": 0.26, + "acc_norm_stderr": 0.04408440022768081 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.2740740740740741, + "acc_stderr": 0.03853254836552003, + "acc_norm": 0.2740740740740741, + "acc_norm_stderr": 0.03853254836552003 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.23684210526315788, + "acc_stderr": 0.034597776068105365, + "acc_norm": 0.23684210526315788, + "acc_norm_stderr": 0.034597776068105365 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.28, + "acc_stderr": 0.04512608598542127, + "acc_norm": 0.28, + "acc_norm_stderr": 0.04512608598542127 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.27547169811320754, + "acc_stderr": 0.027495663683724077, + "acc_norm": 0.27547169811320754, + "acc_norm_stderr": 0.027495663683724077 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.22916666666666666, + "acc_stderr": 0.035146974678623884, + "acc_norm": 0.22916666666666666, + "acc_norm_stderr": 0.035146974678623884 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.22, + "acc_stderr": 0.041633319989322695, + "acc_norm": 0.22, + "acc_norm_stderr": 0.041633319989322695 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.15, + "acc_stderr": 0.035887028128263714, + "acc_norm": 0.15, + "acc_norm_stderr": 0.035887028128263714 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.24, + "acc_stderr": 0.042923469599092816, + "acc_norm": 0.24, + "acc_norm_stderr": 0.042923469599092816 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.2138728323699422, + "acc_stderr": 0.031265112061730424, + "acc_norm": 0.2138728323699422, + "acc_norm_stderr": 0.031265112061730424 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.19607843137254902, + "acc_stderr": 0.03950581861179961, + "acc_norm": 0.19607843137254902, + 
"acc_norm_stderr": 0.03950581861179961 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.28, + "acc_stderr": 0.04512608598542127, + "acc_norm": 0.28, + "acc_norm_stderr": 0.04512608598542127 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.32340425531914896, + "acc_stderr": 0.030579442773610334, + "acc_norm": 0.32340425531914896, + "acc_norm_stderr": 0.030579442773610334 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.2719298245614035, + "acc_stderr": 0.04185774424022056, + "acc_norm": 0.2719298245614035, + "acc_norm_stderr": 0.04185774424022056 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.21379310344827587, + "acc_stderr": 0.03416520447747548, + "acc_norm": 0.21379310344827587, + "acc_norm_stderr": 0.03416520447747548 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.2566137566137566, + "acc_stderr": 0.022494510767503154, + "acc_norm": 0.2566137566137566, + "acc_norm_stderr": 0.022494510767503154 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.20634920634920634, + "acc_stderr": 0.036196045241242515, + "acc_norm": 0.20634920634920634, + "acc_norm_stderr": 0.036196045241242515 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.31, + "acc_stderr": 0.04648231987117316, + "acc_norm": 0.31, + "acc_norm_stderr": 0.04648231987117316 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.25483870967741934, + "acc_stderr": 0.024790118459332208, + "acc_norm": 0.25483870967741934, + "acc_norm_stderr": 0.024790118459332208 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.27586206896551724, + "acc_stderr": 0.031447125816782405, + "acc_norm": 0.27586206896551724, + "acc_norm_stderr": 0.031447125816782405 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.23, + "acc_stderr": 0.04229525846816505, + "acc_norm": 0.23, + "acc_norm_stderr": 0.04229525846816505 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.2606060606060606, + "acc_stderr": 0.034277431758165236, + "acc_norm": 0.2606060606060606, + "acc_norm_stderr": 0.034277431758165236 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.22727272727272727, + "acc_stderr": 0.02985751567338641, + "acc_norm": 0.22727272727272727, + "acc_norm_stderr": 0.02985751567338641 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.21243523316062177, + "acc_stderr": 0.029519282616817244, + "acc_norm": 0.21243523316062177, + "acc_norm_stderr": 0.029519282616817244 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.23333333333333334, + "acc_stderr": 0.021444547301560493, + "acc_norm": 0.23333333333333334, + "acc_norm_stderr": 0.021444547301560493 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.24814814814814815, + "acc_stderr": 0.0263357394040558, + "acc_norm": 0.24814814814814815, + "acc_norm_stderr": 0.0263357394040558 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.23949579831932774, + "acc_stderr": 0.02772206549336127, + "acc_norm": 0.23949579831932774, + "acc_norm_stderr": 0.02772206549336127 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.2185430463576159, + "acc_stderr": 0.03374235550425694, + "acc_norm": 0.2185430463576159, + "acc_norm_stderr": 0.03374235550425694 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.23669724770642203, + "acc_stderr": 0.018224078117299085, + "acc_norm": 0.23669724770642203, + 
"acc_norm_stderr": 0.018224078117299085 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.14814814814814814, + "acc_stderr": 0.024227629273728356, + "acc_norm": 0.14814814814814814, + "acc_norm_stderr": 0.024227629273728356 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.27450980392156865, + "acc_stderr": 0.031321798030832904, + "acc_norm": 0.27450980392156865, + "acc_norm_stderr": 0.031321798030832904 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.24472573839662448, + "acc_stderr": 0.02798569938703642, + "acc_norm": 0.24472573839662448, + "acc_norm_stderr": 0.02798569938703642 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.3632286995515695, + "acc_stderr": 0.03227790442850499, + "acc_norm": 0.3632286995515695, + "acc_norm_stderr": 0.03227790442850499 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.22137404580152673, + "acc_stderr": 0.03641297081313729, + "acc_norm": 0.22137404580152673, + "acc_norm_stderr": 0.03641297081313729 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.2809917355371901, + "acc_stderr": 0.04103203830514512, + "acc_norm": 0.2809917355371901, + "acc_norm_stderr": 0.04103203830514512 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.2962962962962963, + "acc_stderr": 0.04414343666854933, + "acc_norm": 0.2962962962962963, + "acc_norm_stderr": 0.04414343666854933 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.24539877300613497, + "acc_stderr": 0.03380939813943354, + "acc_norm": 0.24539877300613497, + "acc_norm_stderr": 0.03380939813943354 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.29464285714285715, + "acc_stderr": 0.04327040932578728, + "acc_norm": 0.29464285714285715, + "acc_norm_stderr": 0.04327040932578728 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.22330097087378642, + "acc_stderr": 0.04123553189891431, + "acc_norm": 0.22330097087378642, + "acc_norm_stderr": 0.04123553189891431 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.27350427350427353, + "acc_stderr": 0.029202540153431183, + "acc_norm": 0.27350427350427353, + "acc_norm_stderr": 0.029202540153431183 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.27, + "acc_stderr": 0.04461960433384741, + "acc_norm": 0.27, + "acc_norm_stderr": 0.04461960433384741 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.3103448275862069, + "acc_stderr": 0.01654378502604832, + "acc_norm": 0.3103448275862069, + "acc_norm_stderr": 0.01654378502604832 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.23121387283236994, + "acc_stderr": 0.022698657167855716, + "acc_norm": 0.23121387283236994, + "acc_norm_stderr": 0.022698657167855716 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.2424581005586592, + "acc_stderr": 0.014333522059217889, + "acc_norm": 0.2424581005586592, + "acc_norm_stderr": 0.014333522059217889 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.27124183006535946, + "acc_stderr": 0.025457756696667874, + "acc_norm": 0.27124183006535946, + "acc_norm_stderr": 0.025457756696667874 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.2958199356913183, + "acc_stderr": 0.025922371788818777, + "acc_norm": 0.2958199356913183, + "acc_norm_stderr": 0.025922371788818777 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.2777777777777778, + "acc_stderr": 0.024922001168886335, + "acc_norm": 0.2777777777777778, + "acc_norm_stderr": 0.024922001168886335 + }, + 
"harness|hendrycksTest-professional_accounting|5": { + "acc": 0.2765957446808511, + "acc_stderr": 0.026684564340460994, + "acc_norm": 0.2765957446808511, + "acc_norm_stderr": 0.026684564340460994 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.2529335071707953, + "acc_stderr": 0.011102268713839987, + "acc_norm": 0.2529335071707953, + "acc_norm_stderr": 0.011102268713839987 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.20220588235294118, + "acc_stderr": 0.02439819298665492, + "acc_norm": 0.20220588235294118, + "acc_norm_stderr": 0.02439819298665492 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.26143790849673204, + "acc_stderr": 0.017776947157528037, + "acc_norm": 0.26143790849673204, + "acc_norm_stderr": 0.017776947157528037 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.3181818181818182, + "acc_stderr": 0.04461272175910508, + "acc_norm": 0.3181818181818182, + "acc_norm_stderr": 0.04461272175910508 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.23673469387755103, + "acc_stderr": 0.027212835884073153, + "acc_norm": 0.23673469387755103, + "acc_norm_stderr": 0.027212835884073153 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.2537313432835821, + "acc_stderr": 0.030769444967296024, + "acc_norm": 0.2537313432835821, + "acc_norm_stderr": 0.030769444967296024 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.25, + "acc_stderr": 0.04351941398892446, + "acc_norm": 0.25, + "acc_norm_stderr": 0.04351941398892446 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.3253012048192771, + "acc_stderr": 0.03647168523683226, + "acc_norm": 0.3253012048192771, + "acc_norm_stderr": 0.03647168523683226 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.24561403508771928, + "acc_stderr": 0.0330140594698725, + "acc_norm": 0.24561403508771928, + "acc_norm_stderr": 0.0330140594698725 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.21297429620563035, + "mc1_stderr": 0.014332203787059686, + "mc2": 0.34673353659204126, + "mc2_stderr": 0.01346681957873724 + }, + "all": { + "acc": 0.26042646712515355, + "acc_stderr": 0.03164422655259705, + "acc_norm": 0.2642787839525856, + "acc_norm_stderr": 0.03164132647636029, + "mc1": 0.21297429620563035, + "mc1_stderr": 0.014332203787059686, + "mc2": 0.34673353659204126, + "mc2_stderr": 0.01346681957873724 + } + }, + "versions": { + "harness|arc:challenge|25": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + 
"harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "all": 0 + }, + "config": { + "model_name": "digitous/Skegma-GPTJ", + "model_sha": "4dff006b2ea7e8d9b067dfe8af8ca1a16bc44dce", + "model_dtype": "torch.float16", + "lighteval_sha": "43cff840721bd0214adb4e29236a5e2ca1813937", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null + }, + "task_config": { + "harness|arc:challenge": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", 
+ "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task" + }, + "hashes": { + "harness|arc:challenge|25": { + "hash_examples": "fb8c51b1872daeda", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "1b78325b154497a6", + "hash_cont_tokens": "c6e2e25e2b25a621" + }, + "harness|hellaswag|10": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "97de5fb5652ec7fa", + "hash_cont_tokens": "8ad5f1a3e4068f36" + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "38f6980885e34dfd", + "hash_cont_tokens": "844bd0bf669e8136" + }, + "harness|hendrycksTest-anatomy|5": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "3ed9431cd09b2a53", + "hash_cont_tokens": "aa3ffb1a6e4356f5" + }, + "harness|hendrycksTest-astronomy|5": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "a79fd75ecff4dacc", + "hash_cont_tokens": "ca7527d5bdfd389a" + }, + "harness|hendrycksTest-business_ethics|5": { + 
"hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "178d5666661bf5e1", + "hash_cont_tokens": "08a1fa6c8dde9a82" + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "c926698f7ff06973", + "hash_cont_tokens": "cd61f7de0830a75a" + }, + "harness|hendrycksTest-college_biology|5": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "242f772c5e78312a", + "hash_cont_tokens": "b0c14ed86adbcb56" + }, + "harness|hendrycksTest-college_chemistry|5": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "8502d8627d2d7aad", + "hash_cont_tokens": "844bd0bf669e8136" + }, + "harness|hendrycksTest-college_computer_science|5": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "8bf46ce3a98e6e3f", + "hash_cont_tokens": "3cf1924b14cbf906" + }, + "harness|hendrycksTest-college_mathematics|5": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "ff09ef7f164943cd", + "hash_cont_tokens": "d09bf08193410dfa" + }, + "harness|hendrycksTest-college_medicine|5": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "af38d1bbc0517ac5", + "hash_cont_tokens": "62bb469d2a319d91" + }, + "harness|hendrycksTest-college_physics|5": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "c4240f372187f487", + "hash_cont_tokens": "bf103c9a1f61ec12" + }, + "harness|hendrycksTest-computer_security|5": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "70a866a1c6ae11ae", + "hash_cont_tokens": "844bd0bf669e8136" + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "29b68a5b3f3afa5f", + "hash_cont_tokens": "ff5ca3d84bb47a0b" + }, + "harness|hendrycksTest-econometrics|5": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "a4a0fc579875cdf9", + "hash_cont_tokens": "f3ed369e135c0e74" + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "e1c0ec634eb17ebd", + "hash_cont_tokens": "35bf6c0c1a7ee403" + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "542453ad0f99dacf", + "hash_cont_tokens": "e69647d0f0359a4e" + }, + "harness|hendrycksTest-formal_logic|5": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "dacff0458f665ef2", + "hash_cont_tokens": "2ef491ecaa0b411b" + }, + "harness|hendrycksTest-global_facts|5": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "61dec75d557c2e93", + "hash_cont_tokens": "844bd0bf669e8136" + }, + "harness|hendrycksTest-high_school_biology|5": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "d0afdf91820cacc8", + "hash_cont_tokens": "2f65e8345a68d860" + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hash_examples": 
"44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "75cd47b5490da17b", + "hash_cont_tokens": "c3deabee1deab3a3" + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "6c6256000dbf914a", + "hash_cont_tokens": "ec161287ac6222f4" + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "3e24478a8854bd77", + "hash_cont_tokens": "c4f2565ca36881d5" + }, + "harness|hendrycksTest-high_school_geography|5": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "a4866b51f8a7a60e", + "hash_cont_tokens": "780e569058de22be" + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "90f755f89d9fdf5e", + "hash_cont_tokens": "9da45062757ae791" + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "fb590ff6d9d11883", + "hash_cont_tokens": "8f5c8baf02161f10" + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "551dbc75535ad2b8", + "hash_cont_tokens": "fdea101837ab4409" + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "d86fdf5706ec717c", + "hash_cont_tokens": "985403b262df21a4" + }, + "harness|hendrycksTest-high_school_physics|5": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "a81bca26abd92c41", + "hash_cont_tokens": "56be0c12b78c81a3" + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "9c10077b5cda495b", + "hash_cont_tokens": "f524cf6fe64b2a7f" + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "092923836e135996", + "hash_cont_tokens": "421b3dc903711e3d" + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "4ab213491f557f31", + "hash_cont_tokens": "eab825cf8fbdd085" + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "2a04fb615e6717ea", + "hash_cont_tokens": "e1610a0b694e7b3a" + }, + "harness|hendrycksTest-human_aging|5": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "39da19ee58ce07e6", + "hash_cont_tokens": "38eafdb22e9fca11" + }, + "harness|hendrycksTest-human_sexuality|5": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "f7e0441ab1c223e0", + "hash_cont_tokens": "11de075f88fc7cd2" + }, + "harness|hendrycksTest-international_law|5": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "119859c5b8103d0b", + "hash_cont_tokens": "0229c63f045574c2" + }, + 
"harness|hendrycksTest-jurisprudence|5": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "6ec4910e741606cb", + "hash_cont_tokens": "5c77c6f472688075" + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "96d8b2554f777e3a", + "hash_cont_tokens": "25a46284b3589e0d" + }, + "harness|hendrycksTest-machine_learning|5": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "249811a7d891a411", + "hash_cont_tokens": "d11f2c877fe691dc" + }, + "harness|hendrycksTest-management|5": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "e54df495ffeb4f92", + "hash_cont_tokens": "d37808f586a9e9b5" + }, + "harness|hendrycksTest-marketing|5": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "e9110fe64f420eb5", + "hash_cont_tokens": "95faf210efa02f90" + }, + "harness|hendrycksTest-medical_genetics|5": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "743df5701590c1c5", + "hash_cont_tokens": "844bd0bf669e8136" + }, + "harness|hendrycksTest-miscellaneous|5": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "4a20a40ea36bad2d", + "hash_cont_tokens": "ef1ae838a09a7521" + }, + "harness|hendrycksTest-moral_disputes|5": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "10886977e5516586", + "hash_cont_tokens": "05c35d0e7dd2c7d4" + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "66f56ab7c3b9d662", + "hash_cont_tokens": "f1e9e326e9540108" + }, + "harness|hendrycksTest-nutrition|5": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "c05c54560499ea35", + "hash_cont_tokens": "027ac34198453c9e" + }, + "harness|hendrycksTest-philosophy|5": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "9639c3d92ff98a28", + "hash_cont_tokens": "dddff9925c9b675a" + }, + "harness|hendrycksTest-prehistory|5": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "91e98834c3a8d8d9", + "hash_cont_tokens": "030e5bb46551865c" + }, + "harness|hendrycksTest-professional_accounting|5": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "569fa47691c73088", + "hash_cont_tokens": "42b23299e8bae480" + }, + "harness|hendrycksTest-professional_law|5": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "999e8c7cf55b590c", + "hash_cont_tokens": "a2de48df0afbaff7" + }, + "harness|hendrycksTest-professional_medicine|5": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "cb68733b835e69f0", + "hash_cont_tokens": "33dc7eccd5de31ae" + }, + "harness|hendrycksTest-professional_psychology|5": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "3aa766c029099569", + "hash_cont_tokens": "2a666dc39f1f52ac" + }, + "harness|hendrycksTest-public_relations|5": { + "hash_examples": 
"0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "87b924f88832986f", + "hash_cont_tokens": "cf3600a50782c6c5" + }, + "harness|hendrycksTest-security_studies|5": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "c2b75c24a925a416", + "hash_cont_tokens": "2e9916279a4cae95" + }, + "harness|hendrycksTest-sociology|5": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "fb555df6139eb2c8", + "hash_cont_tokens": "555f7a55738bbf37" + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "56cf1eebb25eccb1", + "hash_cont_tokens": "844bd0bf669e8136" + }, + "harness|hendrycksTest-virology|5": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "c6affac16ec860be", + "hash_cont_tokens": "30d4fa4828c5468f" + }, + "harness|hendrycksTest-world_religions|5": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "d2c5da5a69a6312e", + "hash_cont_tokens": "984061eb58124367" + }, + "harness|truthfulqa:mc|0": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "21ee2f46c9c3649e", + "hash_cont_tokens": "f41d0880e9a23f4e" + } + } +} \ No newline at end of file diff --git a/eval-results/digitous/Skegma-GPTJ/results_2023-10-22T01-11-06.361461.json b/eval-results/digitous/Skegma-GPTJ/results_2023-10-22T01-11-06.361461.json new file mode 100644 index 0000000000000000000000000000000000000000..5772200092d749b72d89cd61aa8b8a70b68df208 --- /dev/null +++ b/eval-results/digitous/Skegma-GPTJ/results_2023-10-22T01-11-06.361461.json @@ -0,0 +1,107 @@ +{ + "config_general": { + "model_name": "digitous/Skegma-GPTJ", + "model_sha": "4dff006b2ea7e8d9b067dfe8af8ca1a16bc44dce", + "model_size": "11.28 GB", + "model_dtype": "torch.float16", + "lighteval_sha": "0f318ecf002208468154899217b3ba7c6ae09374", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|drop|3": { + "em": 0.0006291946308724832, + "em_stderr": 0.0002568002749723976, + "f1": 0.04913485738255054, + "f1_stderr": 0.0012043047173197863 + }, + "harness|gsm8k|5": { + "acc": 0.015163002274450341, + "acc_stderr": 0.0033660229497263316 + }, + "harness|winogrande|5": { + "acc": 0.6464088397790055, + "acc_stderr": 0.013436541262599952 + }, + "all": { + "em": 0.0006291946308724832, + "em_stderr": 0.0002568002749723976, + "f1": 0.04913485738255054, + "f1_stderr": 0.0012043047173197863, + "acc": 0.33078592102672794, + "acc_stderr": 0.008401282106163142 + } + }, + "versions": { + "harness|drop|3": 1, + "harness|gsm8k|5": 0, + "harness|winogrande|5": 0, + "all": 0 + }, + "config_tasks": { + "harness|drop": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|drop|3": { + "hashes": { + "hash_examples": "1d27416e8324e9a3", + "hash_full_prompts": "a5513ff9a741b385", + "hash_input_tokens": "f21277d2c2d2e06c", + "hash_cont_tokens": "fa66018574331dfd" + }, + "truncated": 382, + "non-truncated": 9154, + "padded": 0, + "non-padded": 9536, + "effective_few_shots": 3.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": 
"41d55e83abc0e02d", + "hash_input_tokens": "3ab9b4c5105492a3", + "hash_cont_tokens": "7f543dcec7904b66" + }, + "truncated": 0, + "non-truncated": 1319, + "padded": 0, + "non-padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "84cacac1590bb0a5", + "hash_cont_tokens": "64ca3ed9b5dacc6e" + }, + "truncated": 0, + "non-truncated": 2534, + "padded": 2426, + "non-padded": 108, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "9b4d8993161e637d", + "hash_full_prompts": "08215e527b7e60a5", + "hash_input_tokens": "26cd3631535039d0", + "hash_cont_tokens": "e1c6111526a64608" + }, + "total_evaluation_time_secondes": "9804.932166337967", + "truncated": 382, + "non-truncated": 13007, + "padded": 2426, + "non-padded": 10963, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/dotvignesh/perry-7b/results_2023-10-04T00-15-19.939384.json b/eval-results/dotvignesh/perry-7b/results_2023-10-04T00-15-19.939384.json new file mode 100644 index 0000000000000000000000000000000000000000..b9fd3a84f7c143d04641ab569a8db0e1e67534bf --- /dev/null +++ b/eval-results/dotvignesh/perry-7b/results_2023-10-04T00-15-19.939384.json @@ -0,0 +1,1367 @@ +{ + "config_general": { + "model_name": "dotvignesh/perry-7b", + "model_sha": "f35ae37b436637cd3e14d086324ccdaccfd69045", + "model_size": "12.61 GB", + "model_dtype": "torch.float16", + "lighteval_sha": "0f318ecf002208468154899217b3ba7c6ae09374", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|arc:challenge|25": { + "acc": 0.46928327645051193, + "acc_stderr": 0.014583792546304038, + "acc_norm": 0.5179180887372014, + "acc_norm_stderr": 0.014602005585490975 + }, + "harness|hellaswag|10": { + "acc": 0.5706034654451304, + "acc_stderr": 0.004939784311448985, + "acc_norm": 0.7642899820752838, + "acc_norm_stderr": 0.004235743182042551 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.28, + "acc_stderr": 0.045126085985421296, + "acc_norm": 0.28, + "acc_norm_stderr": 0.045126085985421296 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.4, + "acc_stderr": 0.04232073695151589, + "acc_norm": 0.4, + "acc_norm_stderr": 0.04232073695151589 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.5, + "acc_stderr": 0.04068942293855797, + "acc_norm": 0.5, + "acc_norm_stderr": 0.04068942293855797 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.41, + "acc_stderr": 0.049431107042371025, + "acc_norm": 0.41, + "acc_norm_stderr": 0.049431107042371025 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.5018867924528302, + "acc_stderr": 0.030772653642075657, + "acc_norm": 0.5018867924528302, + "acc_norm_stderr": 0.030772653642075657 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.4722222222222222, + "acc_stderr": 0.04174752578923185, + "acc_norm": 0.4722222222222222, + "acc_norm_stderr": 0.04174752578923185 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.27, + "acc_stderr": 0.044619604333847394, + "acc_norm": 0.27, + "acc_norm_stderr": 0.044619604333847394 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.39, + "acc_stderr": 0.04902071300001975, + "acc_norm": 0.39, + "acc_norm_stderr": 0.04902071300001975 + }, + 
"harness|hendrycksTest-college_mathematics|5": { + "acc": 0.31, + "acc_stderr": 0.04648231987117316, + "acc_norm": 0.31, + "acc_norm_stderr": 0.04648231987117316 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.3872832369942196, + "acc_stderr": 0.03714325906302065, + "acc_norm": 0.3872832369942196, + "acc_norm_stderr": 0.03714325906302065 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.19607843137254902, + "acc_stderr": 0.03950581861179963, + "acc_norm": 0.19607843137254902, + "acc_norm_stderr": 0.03950581861179963 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.54, + "acc_stderr": 0.05009082659620332, + "acc_norm": 0.54, + "acc_norm_stderr": 0.05009082659620332 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.40425531914893614, + "acc_stderr": 0.03208115750788684, + "acc_norm": 0.40425531914893614, + "acc_norm_stderr": 0.03208115750788684 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.35964912280701755, + "acc_stderr": 0.045144961328736334, + "acc_norm": 0.35964912280701755, + "acc_norm_stderr": 0.045144961328736334 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.503448275862069, + "acc_stderr": 0.04166567577101579, + "acc_norm": 0.503448275862069, + "acc_norm_stderr": 0.04166567577101579 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.2671957671957672, + "acc_stderr": 0.02278967314577656, + "acc_norm": 0.2671957671957672, + "acc_norm_stderr": 0.02278967314577656 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.2857142857142857, + "acc_stderr": 0.0404061017820884, + "acc_norm": 0.2857142857142857, + "acc_norm_stderr": 0.0404061017820884 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.34, + "acc_stderr": 0.04760952285695236, + "acc_norm": 0.34, + "acc_norm_stderr": 0.04760952285695236 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.5096774193548387, + "acc_stderr": 0.02843867799890955, + "acc_norm": 0.5096774193548387, + "acc_norm_stderr": 0.02843867799890955 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.31527093596059114, + "acc_stderr": 0.03269080871970186, + "acc_norm": 0.31527093596059114, + "acc_norm_stderr": 0.03269080871970186 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.45, + "acc_stderr": 0.05, + "acc_norm": 0.45, + "acc_norm_stderr": 0.05 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.593939393939394, + "acc_stderr": 0.03834816355401181, + "acc_norm": 0.593939393939394, + "acc_norm_stderr": 0.03834816355401181 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.5808080808080808, + "acc_stderr": 0.03515520728670417, + "acc_norm": 0.5808080808080808, + "acc_norm_stderr": 0.03515520728670417 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.6839378238341969, + "acc_stderr": 0.033553973696861736, + "acc_norm": 0.6839378238341969, + "acc_norm_stderr": 0.033553973696861736 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.4256410256410256, + "acc_stderr": 0.025069094387296532, + "acc_norm": 0.4256410256410256, + "acc_norm_stderr": 0.025069094387296532 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.25555555555555554, + "acc_stderr": 0.026593939101844065, + "acc_norm": 0.25555555555555554, + "acc_norm_stderr": 0.026593939101844065 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.3865546218487395, + "acc_stderr": 
0.0316314580755238, + "acc_norm": 0.3865546218487395, + "acc_norm_stderr": 0.0316314580755238 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.304635761589404, + "acc_stderr": 0.03757949922943343, + "acc_norm": 0.304635761589404, + "acc_norm_stderr": 0.03757949922943343 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.6073394495412844, + "acc_stderr": 0.020937505161201093, + "acc_norm": 0.6073394495412844, + "acc_norm_stderr": 0.020937505161201093 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.30092592592592593, + "acc_stderr": 0.03128039084329883, + "acc_norm": 0.30092592592592593, + "acc_norm_stderr": 0.03128039084329883 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.5882352941176471, + "acc_stderr": 0.03454236585380608, + "acc_norm": 0.5882352941176471, + "acc_norm_stderr": 0.03454236585380608 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.6118143459915611, + "acc_stderr": 0.031722950043323275, + "acc_norm": 0.6118143459915611, + "acc_norm_stderr": 0.031722950043323275 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.5381165919282511, + "acc_stderr": 0.033460150119732274, + "acc_norm": 0.5381165919282511, + "acc_norm_stderr": 0.033460150119732274 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.5419847328244275, + "acc_stderr": 0.04369802690578756, + "acc_norm": 0.5419847328244275, + "acc_norm_stderr": 0.04369802690578756 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.6115702479338843, + "acc_stderr": 0.04449270350068382, + "acc_norm": 0.6115702479338843, + "acc_norm_stderr": 0.04449270350068382 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.5555555555555556, + "acc_stderr": 0.04803752235190192, + "acc_norm": 0.5555555555555556, + "acc_norm_stderr": 0.04803752235190192 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.44171779141104295, + "acc_stderr": 0.039015918258361836, + "acc_norm": 0.44171779141104295, + "acc_norm_stderr": 0.039015918258361836 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.3482142857142857, + "acc_stderr": 0.04521829902833586, + "acc_norm": 0.3482142857142857, + "acc_norm_stderr": 0.04521829902833586 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.6601941747572816, + "acc_stderr": 0.04689765937278135, + "acc_norm": 0.6601941747572816, + "acc_norm_stderr": 0.04689765937278135 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.7136752136752137, + "acc_stderr": 0.02961432369045665, + "acc_norm": 0.7136752136752137, + "acc_norm_stderr": 0.02961432369045665 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.56, + "acc_stderr": 0.0498887651569859, + "acc_norm": 0.56, + "acc_norm_stderr": 0.0498887651569859 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.6500638569604087, + "acc_stderr": 0.017055679797150426, + "acc_norm": 0.6500638569604087, + "acc_norm_stderr": 0.017055679797150426 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.48265895953757226, + "acc_stderr": 0.026902900458666647, + "acc_norm": 0.48265895953757226, + "acc_norm_stderr": 0.026902900458666647 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.24022346368715083, + "acc_stderr": 0.014288343803925296, + "acc_norm": 0.24022346368715083, + "acc_norm_stderr": 0.014288343803925296 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.4803921568627451, + "acc_stderr": 0.028607893699576066, + "acc_norm": 0.4803921568627451, + "acc_norm_stderr": 
0.028607893699576066 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.5434083601286174, + "acc_stderr": 0.028290869054197608, + "acc_norm": 0.5434083601286174, + "acc_norm_stderr": 0.028290869054197608 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.49074074074074076, + "acc_stderr": 0.027815973433878014, + "acc_norm": 0.49074074074074076, + "acc_norm_stderr": 0.027815973433878014 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.33687943262411346, + "acc_stderr": 0.02819553487396673, + "acc_norm": 0.33687943262411346, + "acc_norm_stderr": 0.02819553487396673 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.3226857887874837, + "acc_stderr": 0.01194026419319599, + "acc_norm": 0.3226857887874837, + "acc_norm_stderr": 0.01194026419319599 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.4227941176470588, + "acc_stderr": 0.03000856284500348, + "acc_norm": 0.4227941176470588, + "acc_norm_stderr": 0.03000856284500348 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.4199346405228758, + "acc_stderr": 0.019966811178256487, + "acc_norm": 0.4199346405228758, + "acc_norm_stderr": 0.019966811178256487 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.509090909090909, + "acc_stderr": 0.0478833976870286, + "acc_norm": 0.509090909090909, + "acc_norm_stderr": 0.0478833976870286 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.5877551020408164, + "acc_stderr": 0.03151236044674268, + "acc_norm": 0.5877551020408164, + "acc_norm_stderr": 0.03151236044674268 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.6119402985074627, + "acc_stderr": 0.034457899643627506, + "acc_norm": 0.6119402985074627, + "acc_norm_stderr": 0.034457899643627506 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.72, + "acc_stderr": 0.045126085985421296, + "acc_norm": 0.72, + "acc_norm_stderr": 0.045126085985421296 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.43373493975903615, + "acc_stderr": 0.03858158940685517, + "acc_norm": 0.43373493975903615, + "acc_norm_stderr": 0.03858158940685517 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.6666666666666666, + "acc_stderr": 0.03615507630310936, + "acc_norm": 0.6666666666666666, + "acc_norm_stderr": 0.03615507630310936 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.25703794369645044, + "mc1_stderr": 0.01529807750948508, + "mc2": 0.4008031328552837, + "mc2_stderr": 0.014310534656953405 + }, + "all": { + "acc": 0.46376192978198955, + "acc_stderr": 0.03509876929191512, + "acc_norm": 0.4678690709500716, + "acc_norm_stderr": 0.03508714508699615, + "mc1": 0.25703794369645044, + "mc1_stderr": 0.01529807750948508, + "mc2": 0.4008031328552837, + "mc2_stderr": 0.014310534656953405 + } + }, + "versions": { + "harness|arc:challenge|25": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + 
"harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "all": 0 + }, + "config_tasks": { + "harness|arc:challenge": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + 
"harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task" + }, + "summary_tasks": { + "harness|arc:challenge|25": { + "hashes": { + "hash_examples": "17b0cae357c0259e", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "3722289b79076c44", + "hash_cont_tokens": "e8abf848493b50f7" + }, + "truncated": 0, + "non-truncated": 4687, + "padded": 4687, + "non-padded": 0, + "effective_few_shots": 25.0, + "num_truncated_few_shots": 0 + }, + "harness|hellaswag|10": { + "hashes": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "ececd684171f1ef2", + "hash_cont_tokens": "9fe0a5c42e1532db" + }, + "truncated": 0, + "non-truncated": 40168, + "padded": 40113, + "non-padded": 55, + "effective_few_shots": 10.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hashes": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "c54ff61ad0273dd7", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + 
"non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-anatomy|5": { + "hashes": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "be31a1e22aef5f90", + "hash_cont_tokens": "f11971a765cb609f" + }, + "truncated": 0, + "non-truncated": 540, + "padded": 540, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-astronomy|5": { + "hashes": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "277a7b1fad566940", + "hash_cont_tokens": "440a970fadecdc7b" + }, + "truncated": 0, + "non-truncated": 608, + "padded": 608, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-business_ethics|5": { + "hashes": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "ba552605bc116de5", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hashes": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "428c7563d0b98ab9", + "hash_cont_tokens": "7ecd60c25b9bfe5b" + }, + "truncated": 0, + "non-truncated": 1060, + "padded": 1060, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_biology|5": { + "hashes": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "da036601573942e2", + "hash_cont_tokens": "875cde3af7a0ee14" + }, + "truncated": 0, + "non-truncated": 576, + "padded": 576, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_chemistry|5": { + "hashes": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "94e0196d6aded13d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_computer_science|5": { + "hashes": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "6e4d0f4a8d36690b", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_mathematics|5": { + "hashes": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "614054d17109a25d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_medicine|5": { + "hashes": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "081bb2b524defd1c", + "hash_cont_tokens": "702fb6d82ff0d6ac" + }, + "truncated": 0, + "non-truncated": 692, + "padded": 692, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_physics|5": { + "hashes": { + "hash_examples": 
"875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "5421d9a1af86cbd4", + "hash_cont_tokens": "f7b8097afc16a47c" + }, + "truncated": 0, + "non-truncated": 408, + "padded": 408, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-computer_security|5": { + "hashes": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "5e6b70ecb333cf18", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hashes": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "c2ef11a87264ceed", + "hash_cont_tokens": "aa0e8bc655f2f641" + }, + "truncated": 0, + "non-truncated": 940, + "padded": 940, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-econometrics|5": { + "hashes": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "ecaccd912a4c3978", + "hash_cont_tokens": "b1cc6e7e9fcd3827" + }, + "truncated": 0, + "non-truncated": 456, + "padded": 456, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hashes": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "1590c84291399be8", + "hash_cont_tokens": "2425a3f084a591ef" + }, + "truncated": 0, + "non-truncated": 580, + "padded": 580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hashes": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "3269597f715b0da1", + "hash_cont_tokens": "bd87bf0c060fd925" + }, + "truncated": 0, + "non-truncated": 1512, + "padded": 1512, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-formal_logic|5": { + "hashes": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "a2800d20f3ab8d7c", + "hash_cont_tokens": "eb8932890e0605db" + }, + "truncated": 0, + "non-truncated": 504, + "padded": 504, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-global_facts|5": { + "hashes": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "94ed44b3772505ad", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_biology|5": { + "hashes": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "24423acb928db768", + "hash_cont_tokens": "1ddcb86d28cde266" + }, + "truncated": 0, + "non-truncated": 1240, + "padded": 1240, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hashes": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "831ff35c474e5cef", + "hash_cont_tokens": "176c8dcff38c5f8f" 
+ }, + "truncated": 0, + "non-truncated": 812, + "padded": 812, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hashes": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "a20a96b44dcc5b30", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hashes": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "5002f4ac8b1562ca", + "hash_cont_tokens": "674fc454bdc5ac93" + }, + "truncated": 0, + "non-truncated": 660, + "padded": 656, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_geography|5": { + "hashes": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "7c5547c7da5bc793", + "hash_cont_tokens": "03a5012b916274ea" + }, + "truncated": 0, + "non-truncated": 792, + "padded": 792, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hashes": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "f62991cb6a496b05", + "hash_cont_tokens": "873d2aab226ba1d8" + }, + "truncated": 0, + "non-truncated": 772, + "padded": 772, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hashes": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "4cef2aff6e3d59ed", + "hash_cont_tokens": "c583432ad27fcfe0" + }, + "truncated": 0, + "non-truncated": 1560, + "padded": 1560, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hashes": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "6e2577ea4082ed2b", + "hash_cont_tokens": "d7907b61bcb8c123" + }, + "truncated": 0, + "non-truncated": 1080, + "padded": 1080, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hashes": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "c5fc9aeb1079c8e4", + "hash_cont_tokens": "f47f041de50333b9" + }, + "truncated": 0, + "non-truncated": 952, + "padded": 952, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_physics|5": { + "hashes": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "555fc385cffa84ca", + "hash_cont_tokens": "0d56317b3e5eedb5" + }, + "truncated": 0, + "non-truncated": 604, + "padded": 604, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hashes": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "febd23cbf9973b7f", + "hash_cont_tokens": "09ba1243e7390c0f" + }, + "truncated": 0, + "non-truncated": 2180, + "padded": 2180, + 
"non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hashes": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "400e55b56ee6fbd7", + "hash_cont_tokens": "9cc29889c3d3f77d" + }, + "truncated": 0, + "non-truncated": 864, + "padded": 864, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hashes": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "c639cce12a46ebad", + "hash_cont_tokens": "cdd0b3dc06d933e5" + }, + "truncated": 0, + "non-truncated": 816, + "padded": 816, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hashes": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "b9762065cce6f3a6", + "hash_cont_tokens": "e02816433ff28daf" + }, + "truncated": 0, + "non-truncated": 948, + "padded": 948, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_aging|5": { + "hashes": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "541a75f071dcf579", + "hash_cont_tokens": "142a4a8a1138a214" + }, + "truncated": 0, + "non-truncated": 892, + "padded": 892, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_sexuality|5": { + "hashes": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "04269e5c5a257dd9", + "hash_cont_tokens": "bc54813e809b796d" + }, + "truncated": 0, + "non-truncated": 524, + "padded": 524, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-international_law|5": { + "hashes": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "d93ba9d9d38e4397", + "hash_cont_tokens": "8ea8c5ff76a15bca" + }, + "truncated": 0, + "non-truncated": 484, + "padded": 484, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-jurisprudence|5": { + "hashes": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "9eeaccd2698b4f5a", + "hash_cont_tokens": "e3a8cd951b6e3469" + }, + "truncated": 0, + "non-truncated": 432, + "padded": 432, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hashes": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "b4f08f544f2b7576", + "hash_cont_tokens": "3e9e0bdc248fd88a" + }, + "truncated": 0, + "non-truncated": 652, + "padded": 648, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-machine_learning|5": { + "hashes": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "900c2a51f1174b9f", + "hash_cont_tokens": "55b12fb138c6a064" + }, + "truncated": 0, + "non-truncated": 448, + "padded": 448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-management|5": { + "hashes": { + 
"hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "6b36efb4689c6eca", + "hash_cont_tokens": "a01d6d39a83c4597" + }, + "truncated": 0, + "non-truncated": 412, + "padded": 412, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-marketing|5": { + "hashes": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "2aaac78a0cfed47a", + "hash_cont_tokens": "6aeaed4d823c98aa" + }, + "truncated": 0, + "non-truncated": 936, + "padded": 936, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-medical_genetics|5": { + "hashes": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "886ca823b41c094a", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-miscellaneous|5": { + "hashes": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "72fd71de7675e7d0", + "hash_cont_tokens": "9b0ab02a64603081" + }, + "truncated": 0, + "non-truncated": 3132, + "padded": 3132, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_disputes|5": { + "hashes": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "f3ca0dd8e7a1eb09", + "hash_cont_tokens": "3b8bbe9108e55ce9" + }, + "truncated": 0, + "non-truncated": 1384, + "padded": 1354, + "non-padded": 30, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hashes": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "3e793631e951f23c", + "hash_cont_tokens": "3e9bfc0362e97330" + }, + "truncated": 0, + "non-truncated": 3580, + "padded": 3580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-nutrition|5": { + "hashes": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "59753c2144ea93af", + "hash_cont_tokens": "23b2dc6ee2da4cfc" + }, + "truncated": 0, + "non-truncated": 1224, + "padded": 1224, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-philosophy|5": { + "hashes": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "bd8d3dbed15a8c34", + "hash_cont_tokens": "9f6ff69d23a48783" + }, + "truncated": 0, + "non-truncated": 1244, + "padded": 1244, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-prehistory|5": { + "hashes": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "3573cd87facbb7c5", + "hash_cont_tokens": "d6458d743d875837" + }, + "truncated": 0, + "non-truncated": 1296, + "padded": 1296, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_accounting|5": { + "hashes": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "17e721bc1a7cbb47", + "hash_cont_tokens": "922a195f53a35662" + }, + 
"truncated": 0, + "non-truncated": 1128, + "padded": 1128, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_law|5": { + "hashes": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "c9f7583fff66d361", + "hash_cont_tokens": "2e590029ef41fbcd" + }, + "truncated": 0, + "non-truncated": 6136, + "padded": 6136, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_medicine|5": { + "hashes": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "40a933f829116f8d", + "hash_cont_tokens": "7cfee54dbddd5a98" + }, + "truncated": 0, + "non-truncated": 1088, + "padded": 1088, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_psychology|5": { + "hashes": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "0dfb73a8eb3f692c", + "hash_cont_tokens": "a86677b2a45c20e1" + }, + "truncated": 0, + "non-truncated": 2448, + "padded": 2448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-public_relations|5": { + "hashes": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "1710c6ba4c9f3cbd", + "hash_cont_tokens": "0d756ccaae031757" + }, + "truncated": 0, + "non-truncated": 440, + "padded": 440, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-security_studies|5": { + "hashes": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "32a03f1f22a6e103", + "hash_cont_tokens": "b2229bc2cfbf594b" + }, + "truncated": 0, + "non-truncated": 980, + "padded": 980, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-sociology|5": { + "hashes": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "828999f7624cbe7e", + "hash_cont_tokens": "c3a3bdfd177eed5b" + }, + "truncated": 0, + "non-truncated": 804, + "padded": 804, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hashes": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "42054621e718dbee", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-virology|5": { + "hashes": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "6c4f0aa4dc859c04", + "hash_cont_tokens": "af8b3658088cb37f" + }, + "truncated": 0, + "non-truncated": 664, + "padded": 664, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-world_religions|5": { + "hashes": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "6c75d44e092ff24f", + "hash_cont_tokens": "060118bef6de4e0a" + }, + "truncated": 0, + "non-truncated": 684, + "padded": 684, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + 
"harness|truthfulqa:mc|0": { + "hashes": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "2738d7ed7075faa7", + "hash_cont_tokens": "f5da56a132aab151" + }, + "truncated": 0, + "non-truncated": 9996, + "padded": 9996, + "non-padded": 0, + "effective_few_shots": 0.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "d84d18e9a963753d", + "hash_full_prompts": "12b540783521a8e6", + "hash_input_tokens": "5c73a7dce6ccf737", + "hash_cont_tokens": "71d56183130fecbd" + }, + "total_evaluation_time_secondes": "4183.307262182236", + "truncated": 0, + "non-truncated": 111019, + "padded": 110926, + "non-padded": 93, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/dotvignesh/perry-7b/results_2023-10-23T10-51-37.935635.json b/eval-results/dotvignesh/perry-7b/results_2023-10-23T10-51-37.935635.json new file mode 100644 index 0000000000000000000000000000000000000000..00a3ac12b43cd47bf0c5d7a39188d3052b8d362e --- /dev/null +++ b/eval-results/dotvignesh/perry-7b/results_2023-10-23T10-51-37.935635.json @@ -0,0 +1,107 @@ +{ + "config_general": { + "model_name": "dotvignesh/perry-7b", + "model_sha": "f35ae37b436637cd3e14d086324ccdaccfd69045", + "model_size": "12.61 GB", + "model_dtype": "torch.float16", + "lighteval_sha": "0f318ecf002208468154899217b3ba7c6ae09374", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|drop|3": { + "em": 0.0008389261744966443, + "em_stderr": 0.0002964962989801269, + "f1": 0.05790478187919471, + "f1_stderr": 0.0013248182101283533 + }, + "harness|gsm8k|5": { + "acc": 0.10310841546626232, + "acc_stderr": 0.008376436987507811 + }, + "harness|winogrande|5": { + "acc": 0.7253354380426204, + "acc_stderr": 0.012544516005117188 + }, + "all": { + "em": 0.0008389261744966443, + "em_stderr": 0.0002964962989801269, + "f1": 0.05790478187919471, + "f1_stderr": 0.0013248182101283533, + "acc": 0.41422192675444136, + "acc_stderr": 0.0104604764963125 + } + }, + "versions": { + "harness|drop|3": 1, + "harness|gsm8k|5": 0, + "harness|winogrande|5": 0, + "all": 0 + }, + "config_tasks": { + "harness|drop": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|drop|3": { + "hashes": { + "hash_examples": "1d27416e8324e9a3", + "hash_full_prompts": "a5513ff9a741b385", + "hash_input_tokens": "42076f0efbb50aa6", + "hash_cont_tokens": "da63084aedecdde1" + }, + "truncated": 3, + "non-truncated": 9533, + "padded": 0, + "non-padded": 9536, + "effective_few_shots": 3.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "bda342e47b5099b2", + "hash_cont_tokens": "2949df1f1b2a1758" + }, + "truncated": 0, + "non-truncated": 1319, + "padded": 0, + "non-padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "c0bedf98cb040854", + "hash_cont_tokens": "f08975ad6f2d5864" + }, + "truncated": 0, + "non-truncated": 2534, + "padded": 2432, + "non-padded": 102, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "9b4d8993161e637d", + 
"hash_full_prompts": "08215e527b7e60a5", + "hash_input_tokens": "a12f3e3c934bd78b", + "hash_cont_tokens": "386cc643af7dfeb2" + }, + "total_evaluation_time_secondes": "10732.093454122543", + "truncated": 3, + "non-truncated": 13386, + "padded": 2432, + "non-padded": 10957, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/dsvv-cair/alpaca-cleaned-llama-30b-bf16/results_2023-07-19T22-14-09.885019.json b/eval-results/dsvv-cair/alpaca-cleaned-llama-30b-bf16/results_2023-07-19T22-14-09.885019.json new file mode 100644 index 0000000000000000000000000000000000000000..05bfe9d509d532e348bc9af81c342ffe267c9a89 --- /dev/null +++ b/eval-results/dsvv-cair/alpaca-cleaned-llama-30b-bf16/results_2023-07-19T22-14-09.885019.json @@ -0,0 +1,871 @@ +{ + "results": { + "harness|arc:challenge|25": { + "acc": 0.5802047781569966, + "acc_stderr": 0.01442218122630303, + "acc_norm": 0.6177474402730375, + "acc_norm_stderr": 0.014200454049979279 + }, + "harness|hellaswag|10": { + "acc": 0.6389165504879506, + "acc_stderr": 0.00479333052565621, + "acc_norm": 0.8506273650667198, + "acc_norm_stderr": 0.0035572690393421828 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.33, + "acc_stderr": 0.04725815626252605, + "acc_norm": 0.33, + "acc_norm_stderr": 0.04725815626252605 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.48148148148148145, + "acc_stderr": 0.043163785995113245, + "acc_norm": 0.48148148148148145, + "acc_norm_stderr": 0.043163785995113245 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.6513157894736842, + "acc_stderr": 0.0387813988879761, + "acc_norm": 0.6513157894736842, + "acc_norm_stderr": 0.0387813988879761 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.61, + "acc_stderr": 0.04902071300001974, + "acc_norm": 0.61, + "acc_norm_stderr": 0.04902071300001974 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.5962264150943396, + "acc_stderr": 0.030197611600197946, + "acc_norm": 0.5962264150943396, + "acc_norm_stderr": 0.030197611600197946 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.6041666666666666, + "acc_stderr": 0.04089465449325582, + "acc_norm": 0.6041666666666666, + "acc_norm_stderr": 0.04089465449325582 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.39, + "acc_stderr": 0.04902071300001974, + "acc_norm": 0.39, + "acc_norm_stderr": 0.04902071300001974 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.47, + "acc_stderr": 0.050161355804659205, + "acc_norm": 0.47, + "acc_norm_stderr": 0.050161355804659205 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.35, + "acc_stderr": 0.0479372485441102, + "acc_norm": 0.35, + "acc_norm_stderr": 0.0479372485441102 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.48554913294797686, + "acc_stderr": 0.03810871630454764, + "acc_norm": 0.48554913294797686, + "acc_norm_stderr": 0.03810871630454764 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.35294117647058826, + "acc_stderr": 0.04755129616062946, + "acc_norm": 0.35294117647058826, + "acc_norm_stderr": 0.04755129616062946 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.68, + "acc_stderr": 0.04688261722621505, + "acc_norm": 0.68, + "acc_norm_stderr": 0.04688261722621505 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.4553191489361702, + "acc_stderr": 0.03255525359340355, + "acc_norm": 0.4553191489361702, + "acc_norm_stderr": 0.03255525359340355 + }, + 
"harness|hendrycksTest-econometrics|5": { + "acc": 0.3684210526315789, + "acc_stderr": 0.04537815354939392, + "acc_norm": 0.3684210526315789, + "acc_norm_stderr": 0.04537815354939392 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.5103448275862069, + "acc_stderr": 0.04165774775728763, + "acc_norm": 0.5103448275862069, + "acc_norm_stderr": 0.04165774775728763 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.37037037037037035, + "acc_stderr": 0.02487081525105709, + "acc_norm": 0.37037037037037035, + "acc_norm_stderr": 0.02487081525105709 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.3412698412698413, + "acc_stderr": 0.04240799327574924, + "acc_norm": 0.3412698412698413, + "acc_norm_stderr": 0.04240799327574924 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.37, + "acc_stderr": 0.048523658709391, + "acc_norm": 0.37, + "acc_norm_stderr": 0.048523658709391 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.7161290322580646, + "acc_stderr": 0.02564938106302926, + "acc_norm": 0.7161290322580646, + "acc_norm_stderr": 0.02564938106302926 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.43842364532019706, + "acc_stderr": 0.03491207857486519, + "acc_norm": 0.43842364532019706, + "acc_norm_stderr": 0.03491207857486519 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.53, + "acc_stderr": 0.050161355804659205, + "acc_norm": 0.53, + "acc_norm_stderr": 0.050161355804659205 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.6909090909090909, + "acc_stderr": 0.036085410115739666, + "acc_norm": 0.6909090909090909, + "acc_norm_stderr": 0.036085410115739666 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.7373737373737373, + "acc_stderr": 0.031353050095330855, + "acc_norm": 0.7373737373737373, + "acc_norm_stderr": 0.031353050095330855 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.8082901554404145, + "acc_stderr": 0.02840895362624527, + "acc_norm": 0.8082901554404145, + "acc_norm_stderr": 0.02840895362624527 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.558974358974359, + "acc_stderr": 0.025174048384000752, + "acc_norm": 0.558974358974359, + "acc_norm_stderr": 0.025174048384000752 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.26666666666666666, + "acc_stderr": 0.026962424325073828, + "acc_norm": 0.26666666666666666, + "acc_norm_stderr": 0.026962424325073828 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.5546218487394958, + "acc_stderr": 0.032284106267163895, + "acc_norm": 0.5546218487394958, + "acc_norm_stderr": 0.032284106267163895 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.3443708609271523, + "acc_stderr": 0.03879687024073327, + "acc_norm": 0.3443708609271523, + "acc_norm_stderr": 0.03879687024073327 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.7541284403669725, + "acc_stderr": 0.018461940968708436, + "acc_norm": 0.7541284403669725, + "acc_norm_stderr": 0.018461940968708436 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.46296296296296297, + "acc_stderr": 0.03400603625538271, + "acc_norm": 0.46296296296296297, + "acc_norm_stderr": 0.03400603625538271 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.7892156862745098, + "acc_stderr": 0.02862654791243739, + "acc_norm": 0.7892156862745098, + "acc_norm_stderr": 0.02862654791243739 + }, 
+ "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.7848101265822784, + "acc_stderr": 0.026750826994676166, + "acc_norm": 0.7848101265822784, + "acc_norm_stderr": 0.026750826994676166 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.6278026905829597, + "acc_stderr": 0.032443052830087304, + "acc_norm": 0.6278026905829597, + "acc_norm_stderr": 0.032443052830087304 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.6412213740458015, + "acc_stderr": 0.04206739313864908, + "acc_norm": 0.6412213740458015, + "acc_norm_stderr": 0.04206739313864908 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.7272727272727273, + "acc_stderr": 0.04065578140908705, + "acc_norm": 0.7272727272727273, + "acc_norm_stderr": 0.04065578140908705 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.7222222222222222, + "acc_stderr": 0.043300437496507416, + "acc_norm": 0.7222222222222222, + "acc_norm_stderr": 0.043300437496507416 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.6809815950920245, + "acc_stderr": 0.03661997551073836, + "acc_norm": 0.6809815950920245, + "acc_norm_stderr": 0.03661997551073836 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.36607142857142855, + "acc_stderr": 0.0457237235873743, + "acc_norm": 0.36607142857142855, + "acc_norm_stderr": 0.0457237235873743 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.7378640776699029, + "acc_stderr": 0.043546310772605956, + "acc_norm": 0.7378640776699029, + "acc_norm_stderr": 0.043546310772605956 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.8247863247863247, + "acc_stderr": 0.02490443909891823, + "acc_norm": 0.8247863247863247, + "acc_norm_stderr": 0.02490443909891823 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.63, + "acc_stderr": 0.04852365870939099, + "acc_norm": 0.63, + "acc_norm_stderr": 0.04852365870939099 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.7522349936143039, + "acc_stderr": 0.015438083080568965, + "acc_norm": 0.7522349936143039, + "acc_norm_stderr": 0.015438083080568965 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.6647398843930635, + "acc_stderr": 0.025416003773165555, + "acc_norm": 0.6647398843930635, + "acc_norm_stderr": 0.025416003773165555 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.394413407821229, + "acc_stderr": 0.01634538676210397, + "acc_norm": 0.394413407821229, + "acc_norm_stderr": 0.01634538676210397 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.6078431372549019, + "acc_stderr": 0.027956046165424516, + "acc_norm": 0.6078431372549019, + "acc_norm_stderr": 0.027956046165424516 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.6784565916398714, + "acc_stderr": 0.026527724079528872, + "acc_norm": 0.6784565916398714, + "acc_norm_stderr": 0.026527724079528872 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.6851851851851852, + "acc_stderr": 0.02584224870090217, + "acc_norm": 0.6851851851851852, + "acc_norm_stderr": 0.02584224870090217 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.46099290780141844, + "acc_stderr": 0.029736592526424438, + "acc_norm": 0.46099290780141844, + "acc_norm_stderr": 0.029736592526424438 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.42959582790091266, + "acc_stderr": 0.012643004623790206, + "acc_norm": 0.42959582790091266, + "acc_norm_stderr": 0.012643004623790206 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.5661764705882353, + "acc_stderr": 
0.03010563657001664, + "acc_norm": 0.5661764705882353, + "acc_norm_stderr": 0.03010563657001664 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.5866013071895425, + "acc_stderr": 0.019922115682786685, + "acc_norm": 0.5866013071895425, + "acc_norm_stderr": 0.019922115682786685 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.6636363636363637, + "acc_stderr": 0.04525393596302506, + "acc_norm": 0.6636363636363637, + "acc_norm_stderr": 0.04525393596302506 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.6040816326530613, + "acc_stderr": 0.03130802899065686, + "acc_norm": 0.6040816326530613, + "acc_norm_stderr": 0.03130802899065686 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.7512437810945274, + "acc_stderr": 0.030567675938916718, + "acc_norm": 0.7512437810945274, + "acc_norm_stderr": 0.030567675938916718 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.84, + "acc_stderr": 0.03684529491774709, + "acc_norm": 0.84, + "acc_norm_stderr": 0.03684529491774709 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.5180722891566265, + "acc_stderr": 0.03889951252827216, + "acc_norm": 0.5180722891566265, + "acc_norm_stderr": 0.03889951252827216 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.7719298245614035, + "acc_stderr": 0.03218093795602357, + "acc_norm": 0.7719298245614035, + "acc_norm_stderr": 0.03218093795602357 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.3463892288861689, + "mc1_stderr": 0.016656997109125153, + "mc2": 0.5149066221951282, + "mc2_stderr": 0.015549975999002924 + }, + "all": { + "acc": 0.5763869472729457, + "acc_stderr": 0.03430497343403847, + "acc_norm": 0.5806115824711628, + "acc_norm_stderr": 0.03428026515162088, + "mc1": 0.3463892288861689, + "mc1_stderr": 0.016656997109125153, + "mc2": 0.5149066221951282, + "mc2_stderr": 0.015549975999002924 + } + }, + "versions": { + "harness|arc:challenge|25": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + 
"harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "all": 0 + }, + "config": { + "model_name": "dsvv-cair/alpaca-cleaned-llama-30b-bf16", + "model_sha": "2424b6346e9e8fd749b9a6734f5d7125b5926daf", + "model_dtype": "torch.float16", + "lighteval_sha": "43cff840721bd0214adb4e29236a5e2ca1813937", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null + }, + "task_config": { + "harness|arc:challenge": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": 
"LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task" + }, + "hashes": { + "harness|arc:challenge|25": { + "hash_examples": "fb8c51b1872daeda", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "61571bf68d6d89aa", + "hash_cont_tokens": "8210decc6ff6f7df" + }, + "harness|hellaswag|10": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "29906669b1c7054a", + "hash_cont_tokens": "b3b9e9017afa63af" + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "c54ff61ad0273dd7", + "hash_cont_tokens": "50421e30bef398f9" + }, + "harness|hendrycksTest-anatomy|5": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "be31a1e22aef5f90", + "hash_cont_tokens": "f11971a765cb609f" + }, + "harness|hendrycksTest-astronomy|5": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "277a7b1fad566940", + "hash_cont_tokens": "bf30e5d3f48250cb" + }, + "harness|hendrycksTest-business_ethics|5": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "ba552605bc116de5", + "hash_cont_tokens": "bc1dd9b2d995eb61" + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "428c7563d0b98ab9", + "hash_cont_tokens": "890a119624b3b935" + }, + "harness|hendrycksTest-college_biology|5": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": 
"2b460b75f1fdfefd", + "hash_input_tokens": "da036601573942e2", + "hash_cont_tokens": "875cde3af7a0ee14" + }, + "harness|hendrycksTest-college_chemistry|5": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "94e0196d6aded13d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "harness|hendrycksTest-college_computer_science|5": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "6e4d0f4a8d36690b", + "hash_cont_tokens": "ffc0fe414cdc4a83" + }, + "harness|hendrycksTest-college_mathematics|5": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "614054d17109a25d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "harness|hendrycksTest-college_medicine|5": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "1d633b3cc0524ba8", + "hash_cont_tokens": "1f88b00d41957d82" + }, + "harness|hendrycksTest-college_physics|5": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "5421d9a1af86cbd4", + "hash_cont_tokens": "f7b8097afc16a47c" + }, + "harness|hendrycksTest-computer_security|5": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "5e6b70ecb333cf18", + "hash_cont_tokens": "50421e30bef398f9" + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "c2ef11a87264ceed", + "hash_cont_tokens": "aa0e8bc655f2f641" + }, + "harness|hendrycksTest-econometrics|5": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "ecaccd912a4c3978", + "hash_cont_tokens": "bfb7e3c3c88313f1" + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "1590c84291399be8", + "hash_cont_tokens": "2425a3f084a591ef" + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "3269597f715b0da1", + "hash_cont_tokens": "f52691aef15a407b" + }, + "harness|hendrycksTest-formal_logic|5": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "a2800d20f3ab8d7c", + "hash_cont_tokens": "f515d598d9c21263" + }, + "harness|hendrycksTest-global_facts|5": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "94ed44b3772505ad", + "hash_cont_tokens": "50421e30bef398f9" + }, + "harness|hendrycksTest-high_school_biology|5": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "24423acb928db768", + "hash_cont_tokens": "bd85a4156a3613ee" + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "831ff35c474e5cef", + "hash_cont_tokens": "a95c97af1c14e068" + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "8c34e0f2bda77358", + "hash_cont_tokens": "8abfedef914e33c9" + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": 
"318f4513c537c6bf", + "hash_input_tokens": "f1f73dd687da18d7", + "hash_cont_tokens": "674fc454bdc5ac93" + }, + "harness|hendrycksTest-high_school_geography|5": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "7c5547c7da5bc793", + "hash_cont_tokens": "03a5012b916274ea" + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "f62991cb6a496b05", + "hash_cont_tokens": "a83effb8f76b7d7c" + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "4cef2aff6e3d59ed", + "hash_cont_tokens": "c583432ad27fcfe0" + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "6e2577ea4082ed2b", + "hash_cont_tokens": "24f5dc613660300b" + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "c5fc9aeb1079c8e4", + "hash_cont_tokens": "f47f041de50333b9" + }, + "harness|hendrycksTest-high_school_physics|5": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "555fc385cffa84ca", + "hash_cont_tokens": "ba2efcd283e938cc" + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "febd23cbf9973b7f", + "hash_cont_tokens": "942069cd363844d9" + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "424b02981230ee83", + "hash_cont_tokens": "955ed42b6f7fa019" + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "50c9ff438c85a69e", + "hash_cont_tokens": "cdd0b3dc06d933e5" + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "054824cc474caef5", + "hash_cont_tokens": "9a864184946033ac" + }, + "harness|hendrycksTest-human_aging|5": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "541a75f071dcf579", + "hash_cont_tokens": "142a4a8a1138a214" + }, + "harness|hendrycksTest-human_sexuality|5": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "04269e5c5a257dd9", + "hash_cont_tokens": "bc54813e809b796d" + }, + "harness|hendrycksTest-international_law|5": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "d93ba9d9d38e4397", + "hash_cont_tokens": "dc45b45fcda18e5d" + }, + "harness|hendrycksTest-jurisprudence|5": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "9eeaccd2698b4f5a", + "hash_cont_tokens": "e3a8cd951b6e3469" + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "b4f08f544f2b7576", + "hash_cont_tokens": "1e80dbd30f6453d5" + }, + "harness|hendrycksTest-machine_learning|5": { + "hash_examples": "88f22a636029ae47", 
+ "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "900c2a51f1174b9f", + "hash_cont_tokens": "9b37da7777378ca9" + }, + "harness|hendrycksTest-management|5": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "6b36efb4689c6eca", + "hash_cont_tokens": "a01d6d39a83c4597" + }, + "harness|hendrycksTest-marketing|5": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "2aaac78a0cfed47a", + "hash_cont_tokens": "6aeaed4d823c98aa" + }, + "harness|hendrycksTest-medical_genetics|5": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "886ca823b41c094a", + "hash_cont_tokens": "50421e30bef398f9" + }, + "harness|hendrycksTest-miscellaneous|5": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "72fd71de7675e7d0", + "hash_cont_tokens": "9b0ab02a64603081" + }, + "harness|hendrycksTest-moral_disputes|5": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "f3ca0dd8e7a1eb09", + "hash_cont_tokens": "8badf768f7b0467a" + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "3e793631e951f23c", + "hash_cont_tokens": "32ae620376b2bbba" + }, + "harness|hendrycksTest-nutrition|5": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "59753c2144ea93af", + "hash_cont_tokens": "3071def75bacc404" + }, + "harness|hendrycksTest-philosophy|5": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "bd8d3dbed15a8c34", + "hash_cont_tokens": "9f6ff69d23a48783" + }, + "harness|hendrycksTest-prehistory|5": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "3573cd87facbb7c5", + "hash_cont_tokens": "de469d2b981e32a3" + }, + "harness|hendrycksTest-professional_accounting|5": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "17e721bc1a7cbb47", + "hash_cont_tokens": "c46f74d2dfc7b13b" + }, + "harness|hendrycksTest-professional_law|5": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "9178e10bd0763ec4", + "hash_cont_tokens": "2e590029ef41fbcd" + }, + "harness|hendrycksTest-professional_medicine|5": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "f5a22012a54f70ea", + "hash_cont_tokens": "fe35cfa9c6ca802e" + }, + "harness|hendrycksTest-professional_psychology|5": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "0dfb73a8eb3f692c", + "hash_cont_tokens": "f020fbddf72c8652" + }, + "harness|hendrycksTest-public_relations|5": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "1710c6ba4c9f3cbd", + "hash_cont_tokens": "568f585a259965c1" + }, + "harness|hendrycksTest-security_studies|5": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "d49711415961ced7", + "hash_cont_tokens": "cc6fd7cccd64cd5d" + }, + "harness|hendrycksTest-sociology|5": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": 
"828999f7624cbe7e", + "hash_cont_tokens": "c3a3bdfd177eed5b" + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "42054621e718dbee", + "hash_cont_tokens": "2568d0e8e36fa959" + }, + "harness|hendrycksTest-virology|5": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "6c4f0aa4dc859c04", + "hash_cont_tokens": "926cf60b0891f374" + }, + "harness|hendrycksTest-world_religions|5": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "6c75d44e092ff24f", + "hash_cont_tokens": "c525a5de974c1ea3" + }, + "harness|truthfulqa:mc|0": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "2738d7ed7075faa7", + "hash_cont_tokens": "c014154380b74b9e" + } + } +} \ No newline at end of file diff --git a/eval-results/dsvv-cair/alpaca-cleaned-llama-30b-bf16/results_2023-09-22T20-32-42.598667.json b/eval-results/dsvv-cair/alpaca-cleaned-llama-30b-bf16/results_2023-09-22T20-32-42.598667.json new file mode 100644 index 0000000000000000000000000000000000000000..e137a4259846c01efec3046b17f27e9b66f5af07 --- /dev/null +++ b/eval-results/dsvv-cair/alpaca-cleaned-llama-30b-bf16/results_2023-09-22T20-32-42.598667.json @@ -0,0 +1,107 @@ +{ + "config_general": { + "model_name": "dsvv-cair/alpaca-cleaned-llama-30b-bf16", + "model_sha": "2424b6346e9e8fd749b9a6734f5d7125b5926daf", + "model_size": "60.65 GB", + "model_dtype": "torch.float16", + "lighteval_sha": "0f318ecf002208468154899217b3ba7c6ae09374", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|drop|3": { + "em": 0.032508389261744965, + "em_stderr": 0.0018161887490111502, + "f1": 0.09911283557047008, + "f1_stderr": 0.0022624364500590114 + }, + "harness|gsm8k|5": { + "acc": 0.07733131159969674, + "acc_stderr": 0.007357713523222344 + }, + "harness|winogrande|5": { + "acc": 0.7734806629834254, + "acc_stderr": 0.011764149054698332 + }, + "all": { + "em": 0.032508389261744965, + "em_stderr": 0.0018161887490111502, + "f1": 0.09911283557047008, + "f1_stderr": 0.0022624364500590114, + "acc": 0.4254059872915611, + "acc_stderr": 0.009560931288960338 + } + }, + "versions": { + "harness|drop|3": 1, + "harness|gsm8k|5": 0, + "harness|winogrande|5": 0, + "all": 0 + }, + "config_tasks": { + "harness|drop": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|drop|3": { + "hashes": { + "hash_examples": "1d27416e8324e9a3", + "hash_full_prompts": "a5513ff9a741b385", + "hash_input_tokens": "61b608e0b5ceed76", + "hash_cont_tokens": "b0a152f50a3868a7" + }, + "truncated": 1263, + "non-truncated": 8273, + "padded": 0, + "non-padded": 9536, + "effective_few_shots": 3.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "bda342e47b5099b2", + "hash_cont_tokens": "1fd68749ec9f6faf" + }, + "truncated": 0, + "non-truncated": 1319, + "padded": 0, + "non-padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "c0bedf98cb040854", + "hash_cont_tokens": 
"f08975ad6f2d5864" + }, + "truncated": 0, + "non-truncated": 2534, + "padded": 2432, + "non-padded": 102, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "9b4d8993161e637d", + "hash_full_prompts": "08215e527b7e60a5", + "hash_input_tokens": "80afe720f936f8d2", + "hash_cont_tokens": "e600c29e6ae824a8" + }, + "total_evaluation_time_secondes": "26453.59212398529", + "truncated": 1263, + "non-truncated": 12126, + "padded": 2432, + "non-padded": 10957, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/edor/Hermes-Platypus2-mini-7B/results_2023-08-16T10-47-02.037059.json b/eval-results/edor/Hermes-Platypus2-mini-7B/results_2023-08-16T10-47-02.037059.json new file mode 100644 index 0000000000000000000000000000000000000000..255517a98ce0b1e52552c00a532dcacde0c52ccf --- /dev/null +++ b/eval-results/edor/Hermes-Platypus2-mini-7B/results_2023-08-16T10-47-02.037059.json @@ -0,0 +1,1365 @@ +{ + "results": { + "harness|arc:challenge|25": { + "acc": 0.523037542662116, + "acc_stderr": 0.014595873205358269, + "acc_norm": 0.537542662116041, + "acc_norm_stderr": 0.014570144495075581 + }, + "harness|hellaswag|10": { + "acc": 0.6015733917546305, + "acc_stderr": 0.004885735963346904, + "acc_norm": 0.7923720374427405, + "acc_norm_stderr": 0.0040477996462346365 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.33, + "acc_stderr": 0.04725815626252604, + "acc_norm": 0.33, + "acc_norm_stderr": 0.04725815626252604 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.4888888888888889, + "acc_stderr": 0.04318275491977976, + "acc_norm": 0.4888888888888889, + "acc_norm_stderr": 0.04318275491977976 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.42105263157894735, + "acc_stderr": 0.040179012759817494, + "acc_norm": 0.42105263157894735, + "acc_norm_stderr": 0.040179012759817494 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.48, + "acc_stderr": 0.050211673156867795, + "acc_norm": 0.48, + "acc_norm_stderr": 0.050211673156867795 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.5056603773584906, + "acc_stderr": 0.030770900763851316, + "acc_norm": 0.5056603773584906, + "acc_norm_stderr": 0.030770900763851316 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.5, + "acc_stderr": 0.04181210050035455, + "acc_norm": 0.5, + "acc_norm_stderr": 0.04181210050035455 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.3, + "acc_stderr": 0.046056618647183814, + "acc_norm": 0.3, + "acc_norm_stderr": 0.046056618647183814 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.39, + "acc_stderr": 0.04902071300001975, + "acc_norm": 0.39, + "acc_norm_stderr": 0.04902071300001975 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.31, + "acc_stderr": 0.04648231987117316, + "acc_norm": 0.31, + "acc_norm_stderr": 0.04648231987117316 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.4161849710982659, + "acc_stderr": 0.03758517775404947, + "acc_norm": 0.4161849710982659, + "acc_norm_stderr": 0.03758517775404947 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.19607843137254902, + "acc_stderr": 0.03950581861179962, + "acc_norm": 0.19607843137254902, + "acc_norm_stderr": 0.03950581861179962 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.57, + "acc_stderr": 0.049756985195624284, + "acc_norm": 0.57, + "acc_norm_stderr": 0.049756985195624284 + }, + 
"harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.4, + "acc_stderr": 0.03202563076101735, + "acc_norm": 0.4, + "acc_norm_stderr": 0.03202563076101735 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.2631578947368421, + "acc_stderr": 0.041424397194893624, + "acc_norm": 0.2631578947368421, + "acc_norm_stderr": 0.041424397194893624 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.43448275862068964, + "acc_stderr": 0.04130740879555497, + "acc_norm": 0.43448275862068964, + "acc_norm_stderr": 0.04130740879555497 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.30158730158730157, + "acc_stderr": 0.0236369759961018, + "acc_norm": 0.30158730158730157, + "acc_norm_stderr": 0.0236369759961018 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.3333333333333333, + "acc_stderr": 0.042163702135578345, + "acc_norm": 0.3333333333333333, + "acc_norm_stderr": 0.042163702135578345 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.31, + "acc_stderr": 0.04648231987117316, + "acc_norm": 0.31, + "acc_norm_stderr": 0.04648231987117316 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.5225806451612903, + "acc_stderr": 0.02841498501970786, + "acc_norm": 0.5225806451612903, + "acc_norm_stderr": 0.02841498501970786 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.33004926108374383, + "acc_stderr": 0.033085304262282574, + "acc_norm": 0.33004926108374383, + "acc_norm_stderr": 0.033085304262282574 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.46, + "acc_stderr": 0.05009082659620332, + "acc_norm": 0.46, + "acc_norm_stderr": 0.05009082659620332 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.6181818181818182, + "acc_stderr": 0.03793713171165635, + "acc_norm": 0.6181818181818182, + "acc_norm_stderr": 0.03793713171165635 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.5707070707070707, + "acc_stderr": 0.035265527246012, + "acc_norm": 0.5707070707070707, + "acc_norm_stderr": 0.035265527246012 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.6683937823834197, + "acc_stderr": 0.03397636541089118, + "acc_norm": 0.6683937823834197, + "acc_norm_stderr": 0.03397636541089118 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.4307692307692308, + "acc_stderr": 0.02510682066053975, + "acc_norm": 0.4307692307692308, + "acc_norm_stderr": 0.02510682066053975 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.2518518518518518, + "acc_stderr": 0.026466117538959912, + "acc_norm": 0.2518518518518518, + "acc_norm_stderr": 0.026466117538959912 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.4117647058823529, + "acc_stderr": 0.031968769891957786, + "acc_norm": 0.4117647058823529, + "acc_norm_stderr": 0.031968769891957786 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.26490066225165565, + "acc_stderr": 0.036030385453603826, + "acc_norm": 0.26490066225165565, + "acc_norm_stderr": 0.036030385453603826 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.6440366972477064, + "acc_stderr": 0.020528559278244214, + "acc_norm": 0.6440366972477064, + "acc_norm_stderr": 0.020528559278244214 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.27314814814814814, + "acc_stderr": 0.030388051301678116, + "acc_norm": 0.27314814814814814, + "acc_norm_stderr": 0.030388051301678116 + }, + 
"harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.6323529411764706, + "acc_stderr": 0.03384132045674119, + "acc_norm": 0.6323529411764706, + "acc_norm_stderr": 0.03384132045674119 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.6666666666666666, + "acc_stderr": 0.030685820596610805, + "acc_norm": 0.6666666666666666, + "acc_norm_stderr": 0.030685820596610805 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.5515695067264574, + "acc_stderr": 0.03337883736255098, + "acc_norm": 0.5515695067264574, + "acc_norm_stderr": 0.03337883736255098 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.5190839694656488, + "acc_stderr": 0.04382094705550988, + "acc_norm": 0.5190839694656488, + "acc_norm_stderr": 0.04382094705550988 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.6611570247933884, + "acc_stderr": 0.043207678075366705, + "acc_norm": 0.6611570247933884, + "acc_norm_stderr": 0.043207678075366705 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.5370370370370371, + "acc_stderr": 0.04820403072760628, + "acc_norm": 0.5370370370370371, + "acc_norm_stderr": 0.04820403072760628 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.4601226993865031, + "acc_stderr": 0.03915857291436971, + "acc_norm": 0.4601226993865031, + "acc_norm_stderr": 0.03915857291436971 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.4375, + "acc_stderr": 0.04708567521880525, + "acc_norm": 0.4375, + "acc_norm_stderr": 0.04708567521880525 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.5825242718446602, + "acc_stderr": 0.048828405482122375, + "acc_norm": 0.5825242718446602, + "acc_norm_stderr": 0.048828405482122375 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.7222222222222222, + "acc_stderr": 0.02934311479809444, + "acc_norm": 0.7222222222222222, + "acc_norm_stderr": 0.02934311479809444 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.55, + "acc_stderr": 0.04999999999999999, + "acc_norm": 0.55, + "acc_norm_stderr": 0.04999999999999999 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.6577266922094508, + "acc_stderr": 0.016967031766413624, + "acc_norm": 0.6577266922094508, + "acc_norm_stderr": 0.016967031766413624 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.5346820809248555, + "acc_stderr": 0.026854257928258875, + "acc_norm": 0.5346820809248555, + "acc_norm_stderr": 0.026854257928258875 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.25251396648044694, + "acc_stderr": 0.014530330201468636, + "acc_norm": 0.25251396648044694, + "acc_norm_stderr": 0.014530330201468636 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.49673202614379086, + "acc_stderr": 0.028629305194003543, + "acc_norm": 0.49673202614379086, + "acc_norm_stderr": 0.028629305194003543 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.5691318327974276, + "acc_stderr": 0.028125340983972714, + "acc_norm": 0.5691318327974276, + "acc_norm_stderr": 0.028125340983972714 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.5061728395061729, + "acc_stderr": 0.027818623962583295, + "acc_norm": 0.5061728395061729, + "acc_norm_stderr": 0.027818623962583295 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.3900709219858156, + "acc_stderr": 0.029097675599463926, + "acc_norm": 0.3900709219858156, + "acc_norm_stderr": 0.029097675599463926 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.3539765319426336, + "acc_stderr": 0.012213504731731637, 
+ "acc_norm": 0.3539765319426336, + "acc_norm_stderr": 0.012213504731731637 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.47058823529411764, + "acc_stderr": 0.030320243265004137, + "acc_norm": 0.47058823529411764, + "acc_norm_stderr": 0.030320243265004137 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.44607843137254904, + "acc_stderr": 0.02010986454718136, + "acc_norm": 0.44607843137254904, + "acc_norm_stderr": 0.02010986454718136 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.5181818181818182, + "acc_stderr": 0.04785964010794916, + "acc_norm": 0.5181818181818182, + "acc_norm_stderr": 0.04785964010794916 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.563265306122449, + "acc_stderr": 0.031751952375833226, + "acc_norm": 0.563265306122449, + "acc_norm_stderr": 0.031751952375833226 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.6218905472636815, + "acc_stderr": 0.034288678487786564, + "acc_norm": 0.6218905472636815, + "acc_norm_stderr": 0.034288678487786564 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.68, + "acc_stderr": 0.04688261722621504, + "acc_norm": 0.68, + "acc_norm_stderr": 0.04688261722621504 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.42168674698795183, + "acc_stderr": 0.03844453181770917, + "acc_norm": 0.42168674698795183, + "acc_norm_stderr": 0.03844453181770917 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.6374269005847953, + "acc_stderr": 0.0368713061556206, + "acc_norm": 0.6374269005847953, + "acc_norm_stderr": 0.0368713061556206 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.3329253365973072, + "mc1_stderr": 0.016497402382012055, + "mc2": 0.49276058409873585, + "mc2_stderr": 0.01516224977207343 + }, + "all": { + "acc": 0.4739285188775824, + "acc_stderr": 0.035185125877572575, + "acc_norm": 0.4774082437104984, + "acc_norm_stderr": 0.035170487487277746, + "mc1": 0.3329253365973072, + "mc1_stderr": 0.016497402382012055, + "mc2": 0.49276058409873585, + "mc2_stderr": 0.01516224977207343 + } + }, + "versions": { + "harness|arc:challenge|25": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 
1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "all": 0 + }, + "config_general": { + "model_name": "edor/Hermes-Platypus2-mini-7B", + "model_sha": "2797c255626b396cc89c416110a4d785aa5cbe25", + "model_dtype": "torch.float16", + "lighteval_sha": "efe93333f9f25e7d48cc67a6bf362e6d576f727b", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null + }, + "config_tasks": { + "harness|arc:challenge": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + 
"harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task" + }, + "summary_tasks": { + "harness|arc:challenge|25": { + "hashes": { + "hash_examples": "17b0cae357c0259e", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "3722289b79076c44", + "hash_cont_tokens": "8210decc6ff6f7df" + }, + "truncated": 0, + "non-truncated": 4687, + "padded": 4687, + "non-padded": 0, + "effective_few_shots": 25.0, + "num_truncated_few_shots": 0 + }, + "harness|hellaswag|10": { + "hashes": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "ececd684171f1ef2", + "hash_cont_tokens": "b3b9e9017afa63af" + }, + "truncated": 0, + "non-truncated": 40168, + "padded": 40113, + "non-padded": 55, + "effective_few_shots": 10.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hashes": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "c54ff61ad0273dd7", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-anatomy|5": { + "hashes": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "be31a1e22aef5f90", + "hash_cont_tokens": "f11971a765cb609f" + }, + "truncated": 0, + "non-truncated": 540, + "padded": 540, + 
"non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-astronomy|5": { + "hashes": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "277a7b1fad566940", + "hash_cont_tokens": "bf30e5d3f48250cb" + }, + "truncated": 0, + "non-truncated": 608, + "padded": 608, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-business_ethics|5": { + "hashes": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "ba552605bc116de5", + "hash_cont_tokens": "bc1dd9b2d995eb61" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hashes": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "428c7563d0b98ab9", + "hash_cont_tokens": "890a119624b3b935" + }, + "truncated": 0, + "non-truncated": 1060, + "padded": 1060, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_biology|5": { + "hashes": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "da036601573942e2", + "hash_cont_tokens": "875cde3af7a0ee14" + }, + "truncated": 0, + "non-truncated": 576, + "padded": 576, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_chemistry|5": { + "hashes": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "94e0196d6aded13d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_computer_science|5": { + "hashes": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "6e4d0f4a8d36690b", + "hash_cont_tokens": "ffc0fe414cdc4a83" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_mathematics|5": { + "hashes": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "614054d17109a25d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_medicine|5": { + "hashes": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "081bb2b524defd1c", + "hash_cont_tokens": "1f88b00d41957d82" + }, + "truncated": 0, + "non-truncated": 692, + "padded": 692, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_physics|5": { + "hashes": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "5421d9a1af86cbd4", + "hash_cont_tokens": "f7b8097afc16a47c" + }, + "truncated": 0, + "non-truncated": 408, + "padded": 408, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-computer_security|5": { + "hashes": { + 
"hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "5e6b70ecb333cf18", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hashes": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "c2ef11a87264ceed", + "hash_cont_tokens": "aa0e8bc655f2f641" + }, + "truncated": 0, + "non-truncated": 940, + "padded": 940, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-econometrics|5": { + "hashes": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "ecaccd912a4c3978", + "hash_cont_tokens": "bfb7e3c3c88313f1" + }, + "truncated": 0, + "non-truncated": 456, + "padded": 456, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hashes": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "1590c84291399be8", + "hash_cont_tokens": "2425a3f084a591ef" + }, + "truncated": 0, + "non-truncated": 580, + "padded": 580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hashes": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "3269597f715b0da1", + "hash_cont_tokens": "f52691aef15a407b" + }, + "truncated": 0, + "non-truncated": 1512, + "padded": 1512, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-formal_logic|5": { + "hashes": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "a2800d20f3ab8d7c", + "hash_cont_tokens": "f515d598d9c21263" + }, + "truncated": 0, + "non-truncated": 504, + "padded": 504, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-global_facts|5": { + "hashes": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "94ed44b3772505ad", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_biology|5": { + "hashes": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "24423acb928db768", + "hash_cont_tokens": "bd85a4156a3613ee" + }, + "truncated": 0, + "non-truncated": 1240, + "padded": 1240, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hashes": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "831ff35c474e5cef", + "hash_cont_tokens": "a95c97af1c14e068" + }, + "truncated": 0, + "non-truncated": 812, + "padded": 812, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hashes": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "a20a96b44dcc5b30", + 
"hash_cont_tokens": "8abfedef914e33c9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hashes": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "5002f4ac8b1562ca", + "hash_cont_tokens": "674fc454bdc5ac93" + }, + "truncated": 0, + "non-truncated": 660, + "padded": 656, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_geography|5": { + "hashes": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "7c5547c7da5bc793", + "hash_cont_tokens": "03a5012b916274ea" + }, + "truncated": 0, + "non-truncated": 792, + "padded": 792, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hashes": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "f62991cb6a496b05", + "hash_cont_tokens": "a83effb8f76b7d7c" + }, + "truncated": 0, + "non-truncated": 772, + "padded": 772, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hashes": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "4cef2aff6e3d59ed", + "hash_cont_tokens": "c583432ad27fcfe0" + }, + "truncated": 0, + "non-truncated": 1560, + "padded": 1560, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hashes": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "6e2577ea4082ed2b", + "hash_cont_tokens": "24f5dc613660300b" + }, + "truncated": 0, + "non-truncated": 1080, + "padded": 1080, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hashes": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "c5fc9aeb1079c8e4", + "hash_cont_tokens": "f47f041de50333b9" + }, + "truncated": 0, + "non-truncated": 952, + "padded": 952, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_physics|5": { + "hashes": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "555fc385cffa84ca", + "hash_cont_tokens": "ba2efcd283e938cc" + }, + "truncated": 0, + "non-truncated": 604, + "padded": 604, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hashes": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "febd23cbf9973b7f", + "hash_cont_tokens": "942069cd363844d9" + }, + "truncated": 0, + "non-truncated": 2180, + "padded": 2180, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hashes": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "400e55b56ee6fbd7", + "hash_cont_tokens": "955ed42b6f7fa019" + }, + "truncated": 0, + 
"non-truncated": 864, + "padded": 864, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hashes": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "c639cce12a46ebad", + "hash_cont_tokens": "cdd0b3dc06d933e5" + }, + "truncated": 0, + "non-truncated": 816, + "padded": 816, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hashes": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "b9762065cce6f3a6", + "hash_cont_tokens": "9a864184946033ac" + }, + "truncated": 0, + "non-truncated": 948, + "padded": 948, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_aging|5": { + "hashes": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "541a75f071dcf579", + "hash_cont_tokens": "142a4a8a1138a214" + }, + "truncated": 0, + "non-truncated": 892, + "padded": 892, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_sexuality|5": { + "hashes": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "04269e5c5a257dd9", + "hash_cont_tokens": "bc54813e809b796d" + }, + "truncated": 0, + "non-truncated": 524, + "padded": 524, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-international_law|5": { + "hashes": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "d93ba9d9d38e4397", + "hash_cont_tokens": "dc45b45fcda18e5d" + }, + "truncated": 0, + "non-truncated": 484, + "padded": 484, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-jurisprudence|5": { + "hashes": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "9eeaccd2698b4f5a", + "hash_cont_tokens": "e3a8cd951b6e3469" + }, + "truncated": 0, + "non-truncated": 432, + "padded": 432, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hashes": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "b4f08f544f2b7576", + "hash_cont_tokens": "1e80dbd30f6453d5" + }, + "truncated": 0, + "non-truncated": 652, + "padded": 648, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-machine_learning|5": { + "hashes": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "900c2a51f1174b9f", + "hash_cont_tokens": "9b37da7777378ca9" + }, + "truncated": 0, + "non-truncated": 448, + "padded": 448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-management|5": { + "hashes": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "6b36efb4689c6eca", + "hash_cont_tokens": "a01d6d39a83c4597" + }, + "truncated": 0, + "non-truncated": 412, + "padded": 412, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + 
"harness|hendrycksTest-marketing|5": { + "hashes": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "2aaac78a0cfed47a", + "hash_cont_tokens": "6aeaed4d823c98aa" + }, + "truncated": 0, + "non-truncated": 936, + "padded": 936, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-medical_genetics|5": { + "hashes": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "886ca823b41c094a", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-miscellaneous|5": { + "hashes": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "72fd71de7675e7d0", + "hash_cont_tokens": "9b0ab02a64603081" + }, + "truncated": 0, + "non-truncated": 3132, + "padded": 3132, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_disputes|5": { + "hashes": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "f3ca0dd8e7a1eb09", + "hash_cont_tokens": "8badf768f7b0467a" + }, + "truncated": 0, + "non-truncated": 1384, + "padded": 1354, + "non-padded": 30, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hashes": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "3e793631e951f23c", + "hash_cont_tokens": "32ae620376b2bbba" + }, + "truncated": 0, + "non-truncated": 3580, + "padded": 3580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-nutrition|5": { + "hashes": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "59753c2144ea93af", + "hash_cont_tokens": "3071def75bacc404" + }, + "truncated": 0, + "non-truncated": 1224, + "padded": 1224, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-philosophy|5": { + "hashes": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "bd8d3dbed15a8c34", + "hash_cont_tokens": "9f6ff69d23a48783" + }, + "truncated": 0, + "non-truncated": 1244, + "padded": 1244, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-prehistory|5": { + "hashes": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "3573cd87facbb7c5", + "hash_cont_tokens": "de469d2b981e32a3" + }, + "truncated": 0, + "non-truncated": 1296, + "padded": 1296, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_accounting|5": { + "hashes": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "17e721bc1a7cbb47", + "hash_cont_tokens": "c46f74d2dfc7b13b" + }, + "truncated": 0, + "non-truncated": 1128, + "padded": 1128, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_law|5": { + "hashes": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": 
"c9f7583fff66d361", + "hash_cont_tokens": "2e590029ef41fbcd" + }, + "truncated": 0, + "non-truncated": 6136, + "padded": 6136, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_medicine|5": { + "hashes": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "40a933f829116f8d", + "hash_cont_tokens": "fe35cfa9c6ca802e" + }, + "truncated": 0, + "non-truncated": 1088, + "padded": 1088, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_psychology|5": { + "hashes": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "0dfb73a8eb3f692c", + "hash_cont_tokens": "f020fbddf72c8652" + }, + "truncated": 0, + "non-truncated": 2448, + "padded": 2448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-public_relations|5": { + "hashes": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "1710c6ba4c9f3cbd", + "hash_cont_tokens": "568f585a259965c1" + }, + "truncated": 0, + "non-truncated": 440, + "padded": 440, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-security_studies|5": { + "hashes": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "32a03f1f22a6e103", + "hash_cont_tokens": "cc6fd7cccd64cd5d" + }, + "truncated": 0, + "non-truncated": 980, + "padded": 980, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-sociology|5": { + "hashes": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "828999f7624cbe7e", + "hash_cont_tokens": "c3a3bdfd177eed5b" + }, + "truncated": 0, + "non-truncated": 804, + "padded": 804, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hashes": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "42054621e718dbee", + "hash_cont_tokens": "2568d0e8e36fa959" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-virology|5": { + "hashes": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "6c4f0aa4dc859c04", + "hash_cont_tokens": "926cf60b0891f374" + }, + "truncated": 0, + "non-truncated": 664, + "padded": 664, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-world_religions|5": { + "hashes": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "6c75d44e092ff24f", + "hash_cont_tokens": "c525a5de974c1ea3" + }, + "truncated": 0, + "non-truncated": 684, + "padded": 684, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|truthfulqa:mc|0": { + "hashes": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "2738d7ed7075faa7", + "hash_cont_tokens": "c014154380b74b9e" + }, + "truncated": 0, + "non-truncated": 9996, + "padded": 9996, + "non-padded": 0, + "effective_few_shots": 0.0, 
+ "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "d84d18e9a963753d", + "hash_full_prompts": "12b540783521a8e6", + "hash_input_tokens": "5c73a7dce6ccf737", + "hash_cont_tokens": "fb1646e2bdd5fc38" + }, + "total_evaluation_time_secondes": "4051.8374376296997", + "truncated": 0, + "non-truncated": 111019, + "padded": 110926, + "non-padded": 93, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/edor/Platypus2-mini-7B/results_2023-08-16T02-34-42.873458.json b/eval-results/edor/Platypus2-mini-7B/results_2023-08-16T02-34-42.873458.json new file mode 100644 index 0000000000000000000000000000000000000000..662b0f31babf303ff82327debc46c703840b3427 --- /dev/null +++ b/eval-results/edor/Platypus2-mini-7B/results_2023-08-16T02-34-42.873458.json @@ -0,0 +1,1365 @@ +{ + "results": { + "harness|arc:challenge|25": { + "acc": 0.4948805460750853, + "acc_stderr": 0.014610624890309157, + "acc_norm": 0.5332764505119454, + "acc_norm_stderr": 0.014578995859605808 + }, + "harness|hellaswag|10": { + "acc": 0.589523999203346, + "acc_stderr": 0.004909148239488275, + "acc_norm": 0.7880900219079865, + "acc_norm_stderr": 0.004078262107595545 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.24, + "acc_stderr": 0.04292346959909283, + "acc_norm": 0.24, + "acc_norm_stderr": 0.04292346959909283 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.48148148148148145, + "acc_stderr": 0.043163785995113245, + "acc_norm": 0.48148148148148145, + "acc_norm_stderr": 0.043163785995113245 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.4342105263157895, + "acc_stderr": 0.040335656678483205, + "acc_norm": 0.4342105263157895, + "acc_norm_stderr": 0.040335656678483205 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.48, + "acc_stderr": 0.050211673156867795, + "acc_norm": 0.48, + "acc_norm_stderr": 0.050211673156867795 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.4377358490566038, + "acc_stderr": 0.030533338430467516, + "acc_norm": 0.4377358490566038, + "acc_norm_stderr": 0.030533338430467516 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.4722222222222222, + "acc_stderr": 0.04174752578923185, + "acc_norm": 0.4722222222222222, + "acc_norm_stderr": 0.04174752578923185 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.28, + "acc_stderr": 0.045126085985421276, + "acc_norm": 0.28, + "acc_norm_stderr": 0.045126085985421276 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.34, + "acc_stderr": 0.04760952285695235, + "acc_norm": 0.34, + "acc_norm_stderr": 0.04760952285695235 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.31, + "acc_stderr": 0.04648231987117316, + "acc_norm": 0.31, + "acc_norm_stderr": 0.04648231987117316 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.37572254335260113, + "acc_stderr": 0.036928207672648664, + "acc_norm": 0.37572254335260113, + "acc_norm_stderr": 0.036928207672648664 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.19607843137254902, + "acc_stderr": 0.03950581861179963, + "acc_norm": 0.19607843137254902, + "acc_norm_stderr": 0.03950581861179963 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.61, + "acc_stderr": 0.04902071300001975, + "acc_norm": 0.61, + "acc_norm_stderr": 0.04902071300001975 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.39574468085106385, + "acc_stderr": 0.03196758697835363, + "acc_norm": 0.39574468085106385, + 
"acc_norm_stderr": 0.03196758697835363 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.32456140350877194, + "acc_stderr": 0.04404556157374767, + "acc_norm": 0.32456140350877194, + "acc_norm_stderr": 0.04404556157374767 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.3724137931034483, + "acc_stderr": 0.04028731532947559, + "acc_norm": 0.3724137931034483, + "acc_norm_stderr": 0.04028731532947559 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.25925925925925924, + "acc_stderr": 0.022569897074918407, + "acc_norm": 0.25925925925925924, + "acc_norm_stderr": 0.022569897074918407 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.3253968253968254, + "acc_stderr": 0.04190596438871135, + "acc_norm": 0.3253968253968254, + "acc_norm_stderr": 0.04190596438871135 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.26, + "acc_stderr": 0.0440844002276808, + "acc_norm": 0.26, + "acc_norm_stderr": 0.0440844002276808 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.46774193548387094, + "acc_stderr": 0.02838474778881333, + "acc_norm": 0.46774193548387094, + "acc_norm_stderr": 0.02838474778881333 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.32019704433497537, + "acc_stderr": 0.032826493853041504, + "acc_norm": 0.32019704433497537, + "acc_norm_stderr": 0.032826493853041504 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.48, + "acc_stderr": 0.050211673156867795, + "acc_norm": 0.48, + "acc_norm_stderr": 0.050211673156867795 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.593939393939394, + "acc_stderr": 0.03834816355401181, + "acc_norm": 0.593939393939394, + "acc_norm_stderr": 0.03834816355401181 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.4898989898989899, + "acc_stderr": 0.035616254886737454, + "acc_norm": 0.4898989898989899, + "acc_norm_stderr": 0.035616254886737454 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.6839378238341969, + "acc_stderr": 0.03355397369686173, + "acc_norm": 0.6839378238341969, + "acc_norm_stderr": 0.03355397369686173 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.38974358974358975, + "acc_stderr": 0.024726967886647074, + "acc_norm": 0.38974358974358975, + "acc_norm_stderr": 0.024726967886647074 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.2740740740740741, + "acc_stderr": 0.027195934804085622, + "acc_norm": 0.2740740740740741, + "acc_norm_stderr": 0.027195934804085622 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.4117647058823529, + "acc_stderr": 0.031968769891957786, + "acc_norm": 0.4117647058823529, + "acc_norm_stderr": 0.031968769891957786 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.2582781456953642, + "acc_stderr": 0.035737053147634576, + "acc_norm": 0.2582781456953642, + "acc_norm_stderr": 0.035737053147634576 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.5779816513761468, + "acc_stderr": 0.021174991407763175, + "acc_norm": 0.5779816513761468, + "acc_norm_stderr": 0.021174991407763175 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.25, + "acc_stderr": 0.029531221160930918, + "acc_norm": 0.25, + "acc_norm_stderr": 0.029531221160930918 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.5931372549019608, + "acc_stderr": 0.03447891136353382, + "acc_norm": 0.5931372549019608, + 
"acc_norm_stderr": 0.03447891136353382 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.6582278481012658, + "acc_stderr": 0.030874537537553617, + "acc_norm": 0.6582278481012658, + "acc_norm_stderr": 0.030874537537553617 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.5381165919282511, + "acc_stderr": 0.033460150119732274, + "acc_norm": 0.5381165919282511, + "acc_norm_stderr": 0.033460150119732274 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.5267175572519084, + "acc_stderr": 0.04379024936553894, + "acc_norm": 0.5267175572519084, + "acc_norm_stderr": 0.04379024936553894 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.6611570247933884, + "acc_stderr": 0.0432076780753667, + "acc_norm": 0.6611570247933884, + "acc_norm_stderr": 0.0432076780753667 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.5555555555555556, + "acc_stderr": 0.04803752235190192, + "acc_norm": 0.5555555555555556, + "acc_norm_stderr": 0.04803752235190192 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.4785276073619632, + "acc_stderr": 0.0392474687675113, + "acc_norm": 0.4785276073619632, + "acc_norm_stderr": 0.0392474687675113 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.4017857142857143, + "acc_stderr": 0.04653333146973646, + "acc_norm": 0.4017857142857143, + "acc_norm_stderr": 0.04653333146973646 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.5145631067961165, + "acc_stderr": 0.049486373240266356, + "acc_norm": 0.5145631067961165, + "acc_norm_stderr": 0.049486373240266356 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.717948717948718, + "acc_stderr": 0.029480360549541194, + "acc_norm": 0.717948717948718, + "acc_norm_stderr": 0.029480360549541194 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.55, + "acc_stderr": 0.04999999999999999, + "acc_norm": 0.55, + "acc_norm_stderr": 0.04999999999999999 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.6245210727969349, + "acc_stderr": 0.01731661319718279, + "acc_norm": 0.6245210727969349, + "acc_norm_stderr": 0.01731661319718279 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.5057803468208093, + "acc_stderr": 0.02691729617914911, + "acc_norm": 0.5057803468208093, + "acc_norm_stderr": 0.02691729617914911 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.23798882681564246, + "acc_stderr": 0.014242630070574915, + "acc_norm": 0.23798882681564246, + "acc_norm_stderr": 0.014242630070574915 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.4673202614379085, + "acc_stderr": 0.02856869975222588, + "acc_norm": 0.4673202614379085, + "acc_norm_stderr": 0.02856869975222588 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.5530546623794212, + "acc_stderr": 0.028237769422085335, + "acc_norm": 0.5530546623794212, + "acc_norm_stderr": 0.028237769422085335 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.5061728395061729, + "acc_stderr": 0.027818623962583295, + "acc_norm": 0.5061728395061729, + "acc_norm_stderr": 0.027818623962583295 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.3900709219858156, + "acc_stderr": 0.02909767559946393, + "acc_norm": 0.3900709219858156, + "acc_norm_stderr": 0.02909767559946393 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.3774445893089961, + "acc_stderr": 0.012380680911165813, + "acc_norm": 0.3774445893089961, + "acc_norm_stderr": 0.012380680911165813 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 
0.4852941176470588, + "acc_stderr": 0.03035969707904611, + "acc_norm": 0.4852941176470588, + "acc_norm_stderr": 0.03035969707904611 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.4542483660130719, + "acc_stderr": 0.02014297455379519, + "acc_norm": 0.4542483660130719, + "acc_norm_stderr": 0.02014297455379519 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.5181818181818182, + "acc_stderr": 0.04785964010794916, + "acc_norm": 0.5181818181818182, + "acc_norm_stderr": 0.04785964010794916 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.5102040816326531, + "acc_stderr": 0.03200255347893782, + "acc_norm": 0.5102040816326531, + "acc_norm_stderr": 0.03200255347893782 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.6268656716417911, + "acc_stderr": 0.03419832608176008, + "acc_norm": 0.6268656716417911, + "acc_norm_stderr": 0.03419832608176008 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.64, + "acc_stderr": 0.04824181513244218, + "acc_norm": 0.64, + "acc_norm_stderr": 0.04824181513244218 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.4036144578313253, + "acc_stderr": 0.038194861407583984, + "acc_norm": 0.4036144578313253, + "acc_norm_stderr": 0.038194861407583984 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.6900584795321637, + "acc_stderr": 0.035469769593931624, + "acc_norm": 0.6900584795321637, + "acc_norm_stderr": 0.035469769593931624 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.27906976744186046, + "mc1_stderr": 0.01570210709062791, + "mc2": 0.42004433319337015, + "mc2_stderr": 0.01412081278861875 + }, + "all": { + "acc": 0.4587007865748887, + "acc_stderr": 0.03503205205009946, + "acc_norm": 0.46271709042474474, + "acc_norm_stderr": 0.03501743314903851, + "mc1": 0.27906976744186046, + "mc1_stderr": 0.01570210709062791, + "mc2": 0.42004433319337015, + "mc2_stderr": 0.01412081278861875 + } + }, + "versions": { + "harness|arc:challenge|25": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + 
"harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "all": 0 + }, + "config_general": { + "model_name": "edor/Platypus2-mini-7B", + "model_sha": "4ede4a6f8a8d6cc3bfff8b98837116c74c280f63", + "model_dtype": "torch.float16", + "lighteval_sha": "efe93333f9f25e7d48cc67a6bf362e6d576f727b", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null + }, + "config_tasks": { + "harness|arc:challenge": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + 
"harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task" + }, + "summary_tasks": { + "harness|arc:challenge|25": { + "hashes": { + "hash_examples": "17b0cae357c0259e", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "3722289b79076c44", + "hash_cont_tokens": "8210decc6ff6f7df" + }, + "truncated": 0, + "non-truncated": 4687, + "padded": 4687, + "non-padded": 0, + "effective_few_shots": 25.0, + "num_truncated_few_shots": 0 + }, + "harness|hellaswag|10": { + "hashes": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "ececd684171f1ef2", + "hash_cont_tokens": "b3b9e9017afa63af" + }, + "truncated": 0, + "non-truncated": 40168, + "padded": 40113, + "non-padded": 55, + "effective_few_shots": 10.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hashes": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "c54ff61ad0273dd7", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-anatomy|5": { + "hashes": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "be31a1e22aef5f90", + "hash_cont_tokens": "f11971a765cb609f" + }, + "truncated": 0, + "non-truncated": 540, + "padded": 540, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-astronomy|5": { + "hashes": { + 
"hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "277a7b1fad566940", + "hash_cont_tokens": "bf30e5d3f48250cb" + }, + "truncated": 0, + "non-truncated": 608, + "padded": 608, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-business_ethics|5": { + "hashes": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "ba552605bc116de5", + "hash_cont_tokens": "bc1dd9b2d995eb61" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hashes": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "428c7563d0b98ab9", + "hash_cont_tokens": "890a119624b3b935" + }, + "truncated": 0, + "non-truncated": 1060, + "padded": 1060, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_biology|5": { + "hashes": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "da036601573942e2", + "hash_cont_tokens": "875cde3af7a0ee14" + }, + "truncated": 0, + "non-truncated": 576, + "padded": 576, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_chemistry|5": { + "hashes": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "94e0196d6aded13d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_computer_science|5": { + "hashes": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "6e4d0f4a8d36690b", + "hash_cont_tokens": "ffc0fe414cdc4a83" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_mathematics|5": { + "hashes": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "614054d17109a25d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_medicine|5": { + "hashes": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "081bb2b524defd1c", + "hash_cont_tokens": "1f88b00d41957d82" + }, + "truncated": 0, + "non-truncated": 692, + "padded": 692, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_physics|5": { + "hashes": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "5421d9a1af86cbd4", + "hash_cont_tokens": "f7b8097afc16a47c" + }, + "truncated": 0, + "non-truncated": 408, + "padded": 408, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-computer_security|5": { + "hashes": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "5e6b70ecb333cf18", + "hash_cont_tokens": 
"50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hashes": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "c2ef11a87264ceed", + "hash_cont_tokens": "aa0e8bc655f2f641" + }, + "truncated": 0, + "non-truncated": 940, + "padded": 940, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-econometrics|5": { + "hashes": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "ecaccd912a4c3978", + "hash_cont_tokens": "bfb7e3c3c88313f1" + }, + "truncated": 0, + "non-truncated": 456, + "padded": 456, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hashes": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "1590c84291399be8", + "hash_cont_tokens": "2425a3f084a591ef" + }, + "truncated": 0, + "non-truncated": 580, + "padded": 580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hashes": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "3269597f715b0da1", + "hash_cont_tokens": "f52691aef15a407b" + }, + "truncated": 0, + "non-truncated": 1512, + "padded": 1512, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-formal_logic|5": { + "hashes": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "a2800d20f3ab8d7c", + "hash_cont_tokens": "f515d598d9c21263" + }, + "truncated": 0, + "non-truncated": 504, + "padded": 504, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-global_facts|5": { + "hashes": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "94ed44b3772505ad", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_biology|5": { + "hashes": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "24423acb928db768", + "hash_cont_tokens": "bd85a4156a3613ee" + }, + "truncated": 0, + "non-truncated": 1240, + "padded": 1240, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hashes": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "831ff35c474e5cef", + "hash_cont_tokens": "a95c97af1c14e068" + }, + "truncated": 0, + "non-truncated": 812, + "padded": 812, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hashes": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "a20a96b44dcc5b30", + "hash_cont_tokens": "8abfedef914e33c9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + 
"num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hashes": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "5002f4ac8b1562ca", + "hash_cont_tokens": "674fc454bdc5ac93" + }, + "truncated": 0, + "non-truncated": 660, + "padded": 656, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_geography|5": { + "hashes": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "7c5547c7da5bc793", + "hash_cont_tokens": "03a5012b916274ea" + }, + "truncated": 0, + "non-truncated": 792, + "padded": 792, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hashes": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "f62991cb6a496b05", + "hash_cont_tokens": "a83effb8f76b7d7c" + }, + "truncated": 0, + "non-truncated": 772, + "padded": 772, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hashes": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "4cef2aff6e3d59ed", + "hash_cont_tokens": "c583432ad27fcfe0" + }, + "truncated": 0, + "non-truncated": 1560, + "padded": 1560, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hashes": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "6e2577ea4082ed2b", + "hash_cont_tokens": "24f5dc613660300b" + }, + "truncated": 0, + "non-truncated": 1080, + "padded": 1080, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hashes": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "c5fc9aeb1079c8e4", + "hash_cont_tokens": "f47f041de50333b9" + }, + "truncated": 0, + "non-truncated": 952, + "padded": 952, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_physics|5": { + "hashes": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "555fc385cffa84ca", + "hash_cont_tokens": "ba2efcd283e938cc" + }, + "truncated": 0, + "non-truncated": 604, + "padded": 604, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hashes": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "febd23cbf9973b7f", + "hash_cont_tokens": "942069cd363844d9" + }, + "truncated": 0, + "non-truncated": 2180, + "padded": 2180, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hashes": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "400e55b56ee6fbd7", + "hash_cont_tokens": "955ed42b6f7fa019" + }, + "truncated": 0, + "non-truncated": 864, + "padded": 864, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + 
"harness|hendrycksTest-high_school_us_history|5": { + "hashes": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "c639cce12a46ebad", + "hash_cont_tokens": "cdd0b3dc06d933e5" + }, + "truncated": 0, + "non-truncated": 816, + "padded": 816, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hashes": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "b9762065cce6f3a6", + "hash_cont_tokens": "9a864184946033ac" + }, + "truncated": 0, + "non-truncated": 948, + "padded": 948, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_aging|5": { + "hashes": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "541a75f071dcf579", + "hash_cont_tokens": "142a4a8a1138a214" + }, + "truncated": 0, + "non-truncated": 892, + "padded": 892, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_sexuality|5": { + "hashes": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "04269e5c5a257dd9", + "hash_cont_tokens": "bc54813e809b796d" + }, + "truncated": 0, + "non-truncated": 524, + "padded": 524, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-international_law|5": { + "hashes": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "d93ba9d9d38e4397", + "hash_cont_tokens": "dc45b45fcda18e5d" + }, + "truncated": 0, + "non-truncated": 484, + "padded": 484, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-jurisprudence|5": { + "hashes": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "9eeaccd2698b4f5a", + "hash_cont_tokens": "e3a8cd951b6e3469" + }, + "truncated": 0, + "non-truncated": 432, + "padded": 432, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hashes": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "b4f08f544f2b7576", + "hash_cont_tokens": "1e80dbd30f6453d5" + }, + "truncated": 0, + "non-truncated": 652, + "padded": 648, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-machine_learning|5": { + "hashes": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "900c2a51f1174b9f", + "hash_cont_tokens": "9b37da7777378ca9" + }, + "truncated": 0, + "non-truncated": 448, + "padded": 448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-management|5": { + "hashes": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "6b36efb4689c6eca", + "hash_cont_tokens": "a01d6d39a83c4597" + }, + "truncated": 0, + "non-truncated": 412, + "padded": 412, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-marketing|5": { + "hashes": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": 
"2aaac78a0cfed47a", + "hash_cont_tokens": "6aeaed4d823c98aa" + }, + "truncated": 0, + "non-truncated": 936, + "padded": 936, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-medical_genetics|5": { + "hashes": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "886ca823b41c094a", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-miscellaneous|5": { + "hashes": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "72fd71de7675e7d0", + "hash_cont_tokens": "9b0ab02a64603081" + }, + "truncated": 0, + "non-truncated": 3132, + "padded": 3132, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_disputes|5": { + "hashes": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "f3ca0dd8e7a1eb09", + "hash_cont_tokens": "8badf768f7b0467a" + }, + "truncated": 0, + "non-truncated": 1384, + "padded": 1354, + "non-padded": 30, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hashes": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "3e793631e951f23c", + "hash_cont_tokens": "32ae620376b2bbba" + }, + "truncated": 0, + "non-truncated": 3580, + "padded": 3580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-nutrition|5": { + "hashes": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "59753c2144ea93af", + "hash_cont_tokens": "3071def75bacc404" + }, + "truncated": 0, + "non-truncated": 1224, + "padded": 1224, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-philosophy|5": { + "hashes": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "bd8d3dbed15a8c34", + "hash_cont_tokens": "9f6ff69d23a48783" + }, + "truncated": 0, + "non-truncated": 1244, + "padded": 1244, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-prehistory|5": { + "hashes": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "3573cd87facbb7c5", + "hash_cont_tokens": "de469d2b981e32a3" + }, + "truncated": 0, + "non-truncated": 1296, + "padded": 1296, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_accounting|5": { + "hashes": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "17e721bc1a7cbb47", + "hash_cont_tokens": "c46f74d2dfc7b13b" + }, + "truncated": 0, + "non-truncated": 1128, + "padded": 1128, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_law|5": { + "hashes": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "c9f7583fff66d361", + "hash_cont_tokens": "2e590029ef41fbcd" + }, + "truncated": 0, + "non-truncated": 6136, + "padded": 6136, + "non-padded": 0, + 
"effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_medicine|5": { + "hashes": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "40a933f829116f8d", + "hash_cont_tokens": "fe35cfa9c6ca802e" + }, + "truncated": 0, + "non-truncated": 1088, + "padded": 1088, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_psychology|5": { + "hashes": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "0dfb73a8eb3f692c", + "hash_cont_tokens": "f020fbddf72c8652" + }, + "truncated": 0, + "non-truncated": 2448, + "padded": 2448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-public_relations|5": { + "hashes": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "1710c6ba4c9f3cbd", + "hash_cont_tokens": "568f585a259965c1" + }, + "truncated": 0, + "non-truncated": 440, + "padded": 440, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-security_studies|5": { + "hashes": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "32a03f1f22a6e103", + "hash_cont_tokens": "cc6fd7cccd64cd5d" + }, + "truncated": 0, + "non-truncated": 980, + "padded": 980, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-sociology|5": { + "hashes": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "828999f7624cbe7e", + "hash_cont_tokens": "c3a3bdfd177eed5b" + }, + "truncated": 0, + "non-truncated": 804, + "padded": 804, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hashes": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "42054621e718dbee", + "hash_cont_tokens": "2568d0e8e36fa959" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-virology|5": { + "hashes": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "6c4f0aa4dc859c04", + "hash_cont_tokens": "926cf60b0891f374" + }, + "truncated": 0, + "non-truncated": 664, + "padded": 664, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-world_religions|5": { + "hashes": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "6c75d44e092ff24f", + "hash_cont_tokens": "c525a5de974c1ea3" + }, + "truncated": 0, + "non-truncated": 684, + "padded": 684, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|truthfulqa:mc|0": { + "hashes": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "2738d7ed7075faa7", + "hash_cont_tokens": "c014154380b74b9e" + }, + "truncated": 0, + "non-truncated": 9996, + "padded": 9996, + "non-padded": 0, + "effective_few_shots": 0.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "d84d18e9a963753d", + "hash_full_prompts": 
"12b540783521a8e6", + "hash_input_tokens": "5c73a7dce6ccf737", + "hash_cont_tokens": "fb1646e2bdd5fc38" + }, + "total_evaluation_time_secondes": "4045.8140304088593", + "truncated": 0, + "non-truncated": 111019, + "padded": 110926, + "non-padded": 93, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/edor/Platypus2-mini-7B/results_2023-10-15T00-02-17.687247.json b/eval-results/edor/Platypus2-mini-7B/results_2023-10-15T00-02-17.687247.json new file mode 100644 index 0000000000000000000000000000000000000000..c948eb242a62983ab43afcc330d3d6b1c51074ea --- /dev/null +++ b/eval-results/edor/Platypus2-mini-7B/results_2023-10-15T00-02-17.687247.json @@ -0,0 +1,107 @@ +{ + "config_general": { + "model_name": "edor/Platypus2-mini-7B", + "model_sha": "d7eb8e16b5367c9c036ca18459277d3d2ada6e10", + "model_size": "12.61 GB", + "model_dtype": "torch.float16", + "lighteval_sha": "0f318ecf002208468154899217b3ba7c6ae09374", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|drop|3": { + "em": 0.037856543624161076, + "em_stderr": 0.0019544746699158705, + "f1": 0.09621854026845604, + "f1_stderr": 0.00226406080927082 + }, + "harness|gsm8k|5": { + "acc": 0.0621683093252464, + "acc_stderr": 0.006651035644531692 + }, + "harness|winogrande|5": { + "acc": 0.7513812154696132, + "acc_stderr": 0.012147314713403108 + }, + "all": { + "em": 0.037856543624161076, + "em_stderr": 0.0019544746699158705, + "f1": 0.09621854026845604, + "f1_stderr": 0.00226406080927082, + "acc": 0.4067747623974298, + "acc_stderr": 0.0093991751789674 + } + }, + "versions": { + "harness|drop|3": 1, + "harness|gsm8k|5": 0, + "harness|winogrande|5": 0, + "all": 0 + }, + "config_tasks": { + "harness|drop": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|drop|3": { + "hashes": { + "hash_examples": "1d27416e8324e9a3", + "hash_full_prompts": "a5513ff9a741b385", + "hash_input_tokens": "42076f0efbb50aa6", + "hash_cont_tokens": "ed979c2268ac2c32" + }, + "truncated": 3, + "non-truncated": 9533, + "padded": 0, + "non-padded": 9536, + "effective_few_shots": 3.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "bda342e47b5099b2", + "hash_cont_tokens": "83cfd9d96d6f29e4" + }, + "truncated": 0, + "non-truncated": 1319, + "padded": 0, + "non-padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "c0bedf98cb040854", + "hash_cont_tokens": "f08975ad6f2d5864" + }, + "truncated": 0, + "non-truncated": 2534, + "padded": 2432, + "non-padded": 102, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "9b4d8993161e637d", + "hash_full_prompts": "08215e527b7e60a5", + "hash_input_tokens": "a12f3e3c934bd78b", + "hash_cont_tokens": "6f64bd6e93b6bf7e" + }, + "total_evaluation_time_secondes": "10213.96265912056", + "truncated": 3, + "non-truncated": 13386, + "padded": 2432, + "non-padded": 10957, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/edor/Stable-Platypus2-mini-7B/results_2023-08-16T10-44-20.574252.json 
b/eval-results/edor/Stable-Platypus2-mini-7B/results_2023-08-16T10-44-20.574252.json new file mode 100644 index 0000000000000000000000000000000000000000..0569260eef0d1cf998d0f6360ca7745efd6c50a2 --- /dev/null +++ b/eval-results/edor/Stable-Platypus2-mini-7B/results_2023-08-16T10-44-20.574252.json @@ -0,0 +1,1365 @@ +{ + "results": { + "harness|arc:challenge|25": { + "acc": 0.5238907849829352, + "acc_stderr": 0.014594701798071654, + "acc_norm": 0.5486348122866894, + "acc_norm_stderr": 0.014542104569955267 + }, + "harness|hellaswag|10": { + "acc": 0.5965943039235212, + "acc_stderr": 0.004895782107786497, + "acc_norm": 0.7894841665006971, + "acc_norm_stderr": 0.0040684184172756635 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.37, + "acc_stderr": 0.04852365870939099, + "acc_norm": 0.37, + "acc_norm_stderr": 0.04852365870939099 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.4888888888888889, + "acc_stderr": 0.04318275491977976, + "acc_norm": 0.4888888888888889, + "acc_norm_stderr": 0.04318275491977976 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.40789473684210525, + "acc_stderr": 0.03999309712777471, + "acc_norm": 0.40789473684210525, + "acc_norm_stderr": 0.03999309712777471 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.5, + "acc_stderr": 0.050251890762960605, + "acc_norm": 0.5, + "acc_norm_stderr": 0.050251890762960605 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.5924528301886792, + "acc_stderr": 0.03024223380085449, + "acc_norm": 0.5924528301886792, + "acc_norm_stderr": 0.03024223380085449 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.5416666666666666, + "acc_stderr": 0.04166666666666666, + "acc_norm": 0.5416666666666666, + "acc_norm_stderr": 0.04166666666666666 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.38, + "acc_stderr": 0.04878317312145632, + "acc_norm": 0.38, + "acc_norm_stderr": 0.04878317312145632 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.39, + "acc_stderr": 0.04902071300001974, + "acc_norm": 0.39, + "acc_norm_stderr": 0.04902071300001974 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.32, + "acc_stderr": 0.04688261722621504, + "acc_norm": 0.32, + "acc_norm_stderr": 0.04688261722621504 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.4682080924855491, + "acc_stderr": 0.03804749744364764, + "acc_norm": 0.4682080924855491, + "acc_norm_stderr": 0.03804749744364764 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.27450980392156865, + "acc_stderr": 0.04440521906179327, + "acc_norm": 0.27450980392156865, + "acc_norm_stderr": 0.04440521906179327 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.61, + "acc_stderr": 0.04902071300001975, + "acc_norm": 0.61, + "acc_norm_stderr": 0.04902071300001975 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.4765957446808511, + "acc_stderr": 0.03265019475033582, + "acc_norm": 0.4765957446808511, + "acc_norm_stderr": 0.03265019475033582 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.2631578947368421, + "acc_stderr": 0.041424397194893624, + "acc_norm": 0.2631578947368421, + "acc_norm_stderr": 0.041424397194893624 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.4896551724137931, + "acc_stderr": 0.04165774775728763, + "acc_norm": 0.4896551724137931, + "acc_norm_stderr": 0.04165774775728763 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.30158730158730157, + "acc_stderr": 
0.0236369759961018, + "acc_norm": 0.30158730158730157, + "acc_norm_stderr": 0.0236369759961018 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.31746031746031744, + "acc_stderr": 0.04163453031302859, + "acc_norm": 0.31746031746031744, + "acc_norm_stderr": 0.04163453031302859 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.32, + "acc_stderr": 0.046882617226215034, + "acc_norm": 0.32, + "acc_norm_stderr": 0.046882617226215034 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.5645161290322581, + "acc_stderr": 0.02820622559150274, + "acc_norm": 0.5645161290322581, + "acc_norm_stderr": 0.02820622559150274 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.3448275862068966, + "acc_stderr": 0.033442837442804574, + "acc_norm": 0.3448275862068966, + "acc_norm_stderr": 0.033442837442804574 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.44, + "acc_stderr": 0.04988876515698589, + "acc_norm": 0.44, + "acc_norm_stderr": 0.04988876515698589 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.7212121212121212, + "acc_stderr": 0.03501438706296781, + "acc_norm": 0.7212121212121212, + "acc_norm_stderr": 0.03501438706296781 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.6414141414141414, + "acc_stderr": 0.034169036403915214, + "acc_norm": 0.6414141414141414, + "acc_norm_stderr": 0.034169036403915214 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.7772020725388601, + "acc_stderr": 0.030031147977641538, + "acc_norm": 0.7772020725388601, + "acc_norm_stderr": 0.030031147977641538 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.4948717948717949, + "acc_stderr": 0.02534967290683866, + "acc_norm": 0.4948717948717949, + "acc_norm_stderr": 0.02534967290683866 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.24814814814814815, + "acc_stderr": 0.0263357394040558, + "acc_norm": 0.24814814814814815, + "acc_norm_stderr": 0.0263357394040558 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.5168067226890757, + "acc_stderr": 0.03246013680375308, + "acc_norm": 0.5168067226890757, + "acc_norm_stderr": 0.03246013680375308 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.2980132450331126, + "acc_stderr": 0.037345356767871984, + "acc_norm": 0.2980132450331126, + "acc_norm_stderr": 0.037345356767871984 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.7321100917431193, + "acc_stderr": 0.018987462257978652, + "acc_norm": 0.7321100917431193, + "acc_norm_stderr": 0.018987462257978652 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.42592592592592593, + "acc_stderr": 0.03372343271653063, + "acc_norm": 0.42592592592592593, + "acc_norm_stderr": 0.03372343271653063 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.696078431372549, + "acc_stderr": 0.03228210387037893, + "acc_norm": 0.696078431372549, + "acc_norm_stderr": 0.03228210387037893 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.7130801687763713, + "acc_stderr": 0.029443773022594693, + "acc_norm": 0.7130801687763713, + "acc_norm_stderr": 0.029443773022594693 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.6143497757847534, + "acc_stderr": 0.03266842214289201, + "acc_norm": 0.6143497757847534, + "acc_norm_stderr": 0.03266842214289201 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.6106870229007634, + "acc_stderr": 
0.04276486542814591, + "acc_norm": 0.6106870229007634, + "acc_norm_stderr": 0.04276486542814591 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.6859504132231405, + "acc_stderr": 0.042369647530410184, + "acc_norm": 0.6859504132231405, + "acc_norm_stderr": 0.042369647530410184 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.5833333333333334, + "acc_stderr": 0.04766075165356461, + "acc_norm": 0.5833333333333334, + "acc_norm_stderr": 0.04766075165356461 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.5460122699386503, + "acc_stderr": 0.0391170190467718, + "acc_norm": 0.5460122699386503, + "acc_norm_stderr": 0.0391170190467718 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.4107142857142857, + "acc_stderr": 0.04669510663875191, + "acc_norm": 0.4107142857142857, + "acc_norm_stderr": 0.04669510663875191 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.7281553398058253, + "acc_stderr": 0.044052680241409216, + "acc_norm": 0.7281553398058253, + "acc_norm_stderr": 0.044052680241409216 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.782051282051282, + "acc_stderr": 0.02704685763071669, + "acc_norm": 0.782051282051282, + "acc_norm_stderr": 0.02704685763071669 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.64, + "acc_stderr": 0.04824181513244218, + "acc_norm": 0.64, + "acc_norm_stderr": 0.04824181513244218 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.7164750957854407, + "acc_stderr": 0.01611731816683227, + "acc_norm": 0.7164750957854407, + "acc_norm_stderr": 0.01611731816683227 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.5780346820809249, + "acc_stderr": 0.026589231142174263, + "acc_norm": 0.5780346820809249, + "acc_norm_stderr": 0.026589231142174263 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.2569832402234637, + "acc_stderr": 0.01461446582196633, + "acc_norm": 0.2569832402234637, + "acc_norm_stderr": 0.01461446582196633 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.5424836601307189, + "acc_stderr": 0.028526383452142635, + "acc_norm": 0.5424836601307189, + "acc_norm_stderr": 0.028526383452142635 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.5852090032154341, + "acc_stderr": 0.027982680459759563, + "acc_norm": 0.5852090032154341, + "acc_norm_stderr": 0.027982680459759563 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.5370370370370371, + "acc_stderr": 0.027744313443376536, + "acc_norm": 0.5370370370370371, + "acc_norm_stderr": 0.027744313443376536 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.3900709219858156, + "acc_stderr": 0.029097675599463926, + "acc_norm": 0.3900709219858156, + "acc_norm_stderr": 0.029097675599463926 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.3917861799217731, + "acc_stderr": 0.01246756441814513, + "acc_norm": 0.3917861799217731, + "acc_norm_stderr": 0.01246756441814513 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.5183823529411765, + "acc_stderr": 0.03035230339535197, + "acc_norm": 0.5183823529411765, + "acc_norm_stderr": 0.03035230339535197 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.5098039215686274, + "acc_stderr": 0.0202239460050743, + "acc_norm": 0.5098039215686274, + "acc_norm_stderr": 0.0202239460050743 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.6181818181818182, + "acc_stderr": 0.046534298079135075, + "acc_norm": 0.6181818181818182, + "acc_norm_stderr": 0.046534298079135075 + }, 
+ "harness|hendrycksTest-security_studies|5": { + "acc": 0.6571428571428571, + "acc_stderr": 0.030387262919547735, + "acc_norm": 0.6571428571428571, + "acc_norm_stderr": 0.030387262919547735 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.6318407960199005, + "acc_stderr": 0.03410410565495302, + "acc_norm": 0.6318407960199005, + "acc_norm_stderr": 0.03410410565495302 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.73, + "acc_stderr": 0.0446196043338474, + "acc_norm": 0.73, + "acc_norm_stderr": 0.0446196043338474 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.42771084337349397, + "acc_stderr": 0.03851597683718534, + "acc_norm": 0.42771084337349397, + "acc_norm_stderr": 0.03851597683718534 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.695906432748538, + "acc_stderr": 0.03528211258245229, + "acc_norm": 0.695906432748538, + "acc_norm_stderr": 0.03528211258245229 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.3561811505507956, + "mc1_stderr": 0.01676379072844634, + "mc2": 0.5106039601116779, + "mc2_stderr": 0.015454187246822623 + }, + "all": { + "acc": 0.519238503099194, + "acc_stderr": 0.03487887571401071, + "acc_norm": 0.5229272130971759, + "acc_norm_stderr": 0.03486396112216957, + "mc1": 0.3561811505507956, + "mc1_stderr": 0.01676379072844634, + "mc2": 0.5106039601116779, + "mc2_stderr": 0.015454187246822623 + } + }, + "versions": { + "harness|arc:challenge|25": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + 
"harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "all": 0 + }, + "config_general": { + "model_name": "edor/Stable-Platypus2-mini-7B", + "model_sha": "a595cdcbee7562e5ff13ff720245a8c5cf26ffdf", + "model_dtype": "torch.float16", + "lighteval_sha": "efe93333f9f25e7d48cc67a6bf362e6d576f727b", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null + }, + "config_tasks": { + "harness|arc:challenge": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + 
"harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task" + }, + "summary_tasks": { + "harness|arc:challenge|25": { + "hashes": { + "hash_examples": "17b0cae357c0259e", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "3722289b79076c44", + "hash_cont_tokens": "8210decc6ff6f7df" + }, + "truncated": 0, + "non-truncated": 4687, + "padded": 4687, + "non-padded": 0, + "effective_few_shots": 25.0, + "num_truncated_few_shots": 0 + }, + "harness|hellaswag|10": { + "hashes": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "ececd684171f1ef2", + "hash_cont_tokens": "b3b9e9017afa63af" + }, + "truncated": 0, + "non-truncated": 40168, + "padded": 40113, + "non-padded": 55, + "effective_few_shots": 10.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hashes": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "c54ff61ad0273dd7", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-anatomy|5": { + "hashes": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "be31a1e22aef5f90", + "hash_cont_tokens": "f11971a765cb609f" + }, + "truncated": 0, + "non-truncated": 540, + "padded": 540, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-astronomy|5": { + "hashes": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "277a7b1fad566940", + "hash_cont_tokens": "bf30e5d3f48250cb" + }, + "truncated": 0, + "non-truncated": 608, + "padded": 608, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-business_ethics|5": { + "hashes": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "ba552605bc116de5", + "hash_cont_tokens": "bc1dd9b2d995eb61" + 
}, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hashes": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "428c7563d0b98ab9", + "hash_cont_tokens": "890a119624b3b935" + }, + "truncated": 0, + "non-truncated": 1060, + "padded": 1060, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_biology|5": { + "hashes": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "da036601573942e2", + "hash_cont_tokens": "875cde3af7a0ee14" + }, + "truncated": 0, + "non-truncated": 576, + "padded": 576, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_chemistry|5": { + "hashes": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "94e0196d6aded13d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_computer_science|5": { + "hashes": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "6e4d0f4a8d36690b", + "hash_cont_tokens": "ffc0fe414cdc4a83" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_mathematics|5": { + "hashes": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "614054d17109a25d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_medicine|5": { + "hashes": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "081bb2b524defd1c", + "hash_cont_tokens": "1f88b00d41957d82" + }, + "truncated": 0, + "non-truncated": 692, + "padded": 692, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_physics|5": { + "hashes": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "5421d9a1af86cbd4", + "hash_cont_tokens": "f7b8097afc16a47c" + }, + "truncated": 0, + "non-truncated": 408, + "padded": 408, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-computer_security|5": { + "hashes": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "5e6b70ecb333cf18", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hashes": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "c2ef11a87264ceed", + "hash_cont_tokens": "aa0e8bc655f2f641" + }, + "truncated": 0, + "non-truncated": 940, + "padded": 940, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + 
}, + "harness|hendrycksTest-econometrics|5": { + "hashes": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "ecaccd912a4c3978", + "hash_cont_tokens": "bfb7e3c3c88313f1" + }, + "truncated": 0, + "non-truncated": 456, + "padded": 456, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hashes": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "1590c84291399be8", + "hash_cont_tokens": "2425a3f084a591ef" + }, + "truncated": 0, + "non-truncated": 580, + "padded": 580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hashes": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "3269597f715b0da1", + "hash_cont_tokens": "f52691aef15a407b" + }, + "truncated": 0, + "non-truncated": 1512, + "padded": 1512, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-formal_logic|5": { + "hashes": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "a2800d20f3ab8d7c", + "hash_cont_tokens": "f515d598d9c21263" + }, + "truncated": 0, + "non-truncated": 504, + "padded": 504, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-global_facts|5": { + "hashes": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "94ed44b3772505ad", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_biology|5": { + "hashes": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "24423acb928db768", + "hash_cont_tokens": "bd85a4156a3613ee" + }, + "truncated": 0, + "non-truncated": 1240, + "padded": 1240, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hashes": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "831ff35c474e5cef", + "hash_cont_tokens": "a95c97af1c14e068" + }, + "truncated": 0, + "non-truncated": 812, + "padded": 812, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hashes": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "a20a96b44dcc5b30", + "hash_cont_tokens": "8abfedef914e33c9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hashes": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "5002f4ac8b1562ca", + "hash_cont_tokens": "674fc454bdc5ac93" + }, + "truncated": 0, + "non-truncated": 660, + "padded": 656, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_geography|5": { + "hashes": { + "hash_examples": "b60019b9e80b642f", + 
"hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "7c5547c7da5bc793", + "hash_cont_tokens": "03a5012b916274ea" + }, + "truncated": 0, + "non-truncated": 792, + "padded": 792, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hashes": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "f62991cb6a496b05", + "hash_cont_tokens": "a83effb8f76b7d7c" + }, + "truncated": 0, + "non-truncated": 772, + "padded": 772, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hashes": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "4cef2aff6e3d59ed", + "hash_cont_tokens": "c583432ad27fcfe0" + }, + "truncated": 0, + "non-truncated": 1560, + "padded": 1560, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hashes": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "6e2577ea4082ed2b", + "hash_cont_tokens": "24f5dc613660300b" + }, + "truncated": 0, + "non-truncated": 1080, + "padded": 1080, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hashes": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "c5fc9aeb1079c8e4", + "hash_cont_tokens": "f47f041de50333b9" + }, + "truncated": 0, + "non-truncated": 952, + "padded": 952, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_physics|5": { + "hashes": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "555fc385cffa84ca", + "hash_cont_tokens": "ba2efcd283e938cc" + }, + "truncated": 0, + "non-truncated": 604, + "padded": 604, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hashes": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "febd23cbf9973b7f", + "hash_cont_tokens": "942069cd363844d9" + }, + "truncated": 0, + "non-truncated": 2180, + "padded": 2180, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hashes": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "400e55b56ee6fbd7", + "hash_cont_tokens": "955ed42b6f7fa019" + }, + "truncated": 0, + "non-truncated": 864, + "padded": 864, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hashes": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "c639cce12a46ebad", + "hash_cont_tokens": "cdd0b3dc06d933e5" + }, + "truncated": 0, + "non-truncated": 816, + "padded": 816, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hashes": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": 
"b9762065cce6f3a6", + "hash_cont_tokens": "9a864184946033ac" + }, + "truncated": 0, + "non-truncated": 948, + "padded": 948, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_aging|5": { + "hashes": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "541a75f071dcf579", + "hash_cont_tokens": "142a4a8a1138a214" + }, + "truncated": 0, + "non-truncated": 892, + "padded": 892, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_sexuality|5": { + "hashes": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "04269e5c5a257dd9", + "hash_cont_tokens": "bc54813e809b796d" + }, + "truncated": 0, + "non-truncated": 524, + "padded": 524, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-international_law|5": { + "hashes": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "d93ba9d9d38e4397", + "hash_cont_tokens": "dc45b45fcda18e5d" + }, + "truncated": 0, + "non-truncated": 484, + "padded": 484, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-jurisprudence|5": { + "hashes": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "9eeaccd2698b4f5a", + "hash_cont_tokens": "e3a8cd951b6e3469" + }, + "truncated": 0, + "non-truncated": 432, + "padded": 432, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hashes": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "b4f08f544f2b7576", + "hash_cont_tokens": "1e80dbd30f6453d5" + }, + "truncated": 0, + "non-truncated": 652, + "padded": 648, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-machine_learning|5": { + "hashes": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "900c2a51f1174b9f", + "hash_cont_tokens": "9b37da7777378ca9" + }, + "truncated": 0, + "non-truncated": 448, + "padded": 448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-management|5": { + "hashes": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "6b36efb4689c6eca", + "hash_cont_tokens": "a01d6d39a83c4597" + }, + "truncated": 0, + "non-truncated": 412, + "padded": 412, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-marketing|5": { + "hashes": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "2aaac78a0cfed47a", + "hash_cont_tokens": "6aeaed4d823c98aa" + }, + "truncated": 0, + "non-truncated": 936, + "padded": 936, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-medical_genetics|5": { + "hashes": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "886ca823b41c094a", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + 
"num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-miscellaneous|5": { + "hashes": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "72fd71de7675e7d0", + "hash_cont_tokens": "9b0ab02a64603081" + }, + "truncated": 0, + "non-truncated": 3132, + "padded": 3132, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_disputes|5": { + "hashes": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "f3ca0dd8e7a1eb09", + "hash_cont_tokens": "8badf768f7b0467a" + }, + "truncated": 0, + "non-truncated": 1384, + "padded": 1354, + "non-padded": 30, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hashes": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "3e793631e951f23c", + "hash_cont_tokens": "32ae620376b2bbba" + }, + "truncated": 0, + "non-truncated": 3580, + "padded": 3580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-nutrition|5": { + "hashes": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "59753c2144ea93af", + "hash_cont_tokens": "3071def75bacc404" + }, + "truncated": 0, + "non-truncated": 1224, + "padded": 1224, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-philosophy|5": { + "hashes": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "bd8d3dbed15a8c34", + "hash_cont_tokens": "9f6ff69d23a48783" + }, + "truncated": 0, + "non-truncated": 1244, + "padded": 1244, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-prehistory|5": { + "hashes": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "3573cd87facbb7c5", + "hash_cont_tokens": "de469d2b981e32a3" + }, + "truncated": 0, + "non-truncated": 1296, + "padded": 1296, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_accounting|5": { + "hashes": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "17e721bc1a7cbb47", + "hash_cont_tokens": "c46f74d2dfc7b13b" + }, + "truncated": 0, + "non-truncated": 1128, + "padded": 1128, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_law|5": { + "hashes": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "c9f7583fff66d361", + "hash_cont_tokens": "2e590029ef41fbcd" + }, + "truncated": 0, + "non-truncated": 6136, + "padded": 6136, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_medicine|5": { + "hashes": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "40a933f829116f8d", + "hash_cont_tokens": "fe35cfa9c6ca802e" + }, + "truncated": 0, + "non-truncated": 1088, + "padded": 1088, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_psychology|5": { + "hashes": { + "hash_examples": "d45b73b22f9cc039", + 
"hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "0dfb73a8eb3f692c", + "hash_cont_tokens": "f020fbddf72c8652" + }, + "truncated": 0, + "non-truncated": 2448, + "padded": 2448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-public_relations|5": { + "hashes": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "1710c6ba4c9f3cbd", + "hash_cont_tokens": "568f585a259965c1" + }, + "truncated": 0, + "non-truncated": 440, + "padded": 440, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-security_studies|5": { + "hashes": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "32a03f1f22a6e103", + "hash_cont_tokens": "cc6fd7cccd64cd5d" + }, + "truncated": 0, + "non-truncated": 980, + "padded": 980, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-sociology|5": { + "hashes": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "828999f7624cbe7e", + "hash_cont_tokens": "c3a3bdfd177eed5b" + }, + "truncated": 0, + "non-truncated": 804, + "padded": 804, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hashes": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "42054621e718dbee", + "hash_cont_tokens": "2568d0e8e36fa959" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-virology|5": { + "hashes": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "6c4f0aa4dc859c04", + "hash_cont_tokens": "926cf60b0891f374" + }, + "truncated": 0, + "non-truncated": 664, + "padded": 664, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-world_religions|5": { + "hashes": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "6c75d44e092ff24f", + "hash_cont_tokens": "c525a5de974c1ea3" + }, + "truncated": 0, + "non-truncated": 684, + "padded": 684, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|truthfulqa:mc|0": { + "hashes": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "2738d7ed7075faa7", + "hash_cont_tokens": "c014154380b74b9e" + }, + "truncated": 0, + "non-truncated": 9996, + "padded": 9996, + "non-padded": 0, + "effective_few_shots": 0.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "d84d18e9a963753d", + "hash_full_prompts": "12b540783521a8e6", + "hash_input_tokens": "5c73a7dce6ccf737", + "hash_cont_tokens": "fb1646e2bdd5fc38" + }, + "total_evaluation_time_secondes": "4039.3992779254913", + "truncated": 0, + "non-truncated": 111019, + "padded": 110926, + "non-padded": 93, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/gpt2-medium/results_2023-07-24T09-55-17.325605.json b/eval-results/gpt2-medium/results_2023-07-24T09-55-17.325605.json new file mode 100644 index 
0000000000000000000000000000000000000000..5daaab3116777fff3da04b79089d3875be0f0440 --- /dev/null +++ b/eval-results/gpt2-medium/results_2023-07-24T09-55-17.325605.json @@ -0,0 +1,1365 @@ +{ + "results": { + "harness|arc:challenge|25": { + "acc": 0.22098976109215018, + "acc_stderr": 0.012124929206818258, + "acc_norm": 0.27047781569965873, + "acc_norm_stderr": 0.012980954547659554 + }, + "harness|hellaswag|10": { + "acc": 0.33061143198566023, + "acc_stderr": 0.004694718918225759, + "acc_norm": 0.4017128062139016, + "acc_norm_stderr": 0.004892425356375716 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.26, + "acc_stderr": 0.04408440022768081, + "acc_norm": 0.26, + "acc_norm_stderr": 0.04408440022768081 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.24444444444444444, + "acc_stderr": 0.03712537833614867, + "acc_norm": 0.24444444444444444, + "acc_norm_stderr": 0.03712537833614867 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.3026315789473684, + "acc_stderr": 0.03738520676119668, + "acc_norm": 0.3026315789473684, + "acc_norm_stderr": 0.03738520676119668 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.15, + "acc_stderr": 0.035887028128263686, + "acc_norm": 0.15, + "acc_norm_stderr": 0.035887028128263686 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.3018867924528302, + "acc_stderr": 0.02825420034443867, + "acc_norm": 0.3018867924528302, + "acc_norm_stderr": 0.02825420034443867 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.2708333333333333, + "acc_stderr": 0.03716177437566016, + "acc_norm": 0.2708333333333333, + "acc_norm_stderr": 0.03716177437566016 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.31, + "acc_stderr": 0.046482319871173156, + "acc_norm": 0.31, + "acc_norm_stderr": 0.046482319871173156 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.36, + "acc_stderr": 0.04824181513244218, + "acc_norm": 0.36, + "acc_norm_stderr": 0.04824181513244218 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.22, + "acc_stderr": 0.04163331998932269, + "acc_norm": 0.22, + "acc_norm_stderr": 0.04163331998932269 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.2658959537572254, + "acc_stderr": 0.033687629322594316, + "acc_norm": 0.2658959537572254, + "acc_norm_stderr": 0.033687629322594316 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.1568627450980392, + "acc_stderr": 0.036186648199362466, + "acc_norm": 0.1568627450980392, + "acc_norm_stderr": 0.036186648199362466 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.23, + "acc_stderr": 0.042295258468165044, + "acc_norm": 0.23, + "acc_norm_stderr": 0.042295258468165044 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.2936170212765957, + "acc_stderr": 0.029771642712491227, + "acc_norm": 0.2936170212765957, + "acc_norm_stderr": 0.029771642712491227 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.30701754385964913, + "acc_stderr": 0.0433913832257986, + "acc_norm": 0.30701754385964913, + "acc_norm_stderr": 0.0433913832257986 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.2482758620689655, + "acc_stderr": 0.03600105692727771, + "acc_norm": 0.2482758620689655, + "acc_norm_stderr": 0.03600105692727771 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.25925925925925924, + "acc_stderr": 0.022569897074918424, + "acc_norm": 0.25925925925925924, + "acc_norm_stderr": 0.022569897074918424 + }, + 
"harness|hendrycksTest-formal_logic|5": { + "acc": 0.15873015873015872, + "acc_stderr": 0.03268454013011742, + "acc_norm": 0.15873015873015872, + "acc_norm_stderr": 0.03268454013011742 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.31, + "acc_stderr": 0.04648231987117316, + "acc_norm": 0.31, + "acc_norm_stderr": 0.04648231987117316 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.24193548387096775, + "acc_stderr": 0.0243625996930311, + "acc_norm": 0.24193548387096775, + "acc_norm_stderr": 0.0243625996930311 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.3054187192118227, + "acc_stderr": 0.03240661565868408, + "acc_norm": 0.3054187192118227, + "acc_norm_stderr": 0.03240661565868408 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.28, + "acc_stderr": 0.04512608598542127, + "acc_norm": 0.28, + "acc_norm_stderr": 0.04512608598542127 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.26666666666666666, + "acc_stderr": 0.03453131801885415, + "acc_norm": 0.26666666666666666, + "acc_norm_stderr": 0.03453131801885415 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.35858585858585856, + "acc_stderr": 0.03416903640391521, + "acc_norm": 0.35858585858585856, + "acc_norm_stderr": 0.03416903640391521 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.3160621761658031, + "acc_stderr": 0.033553973696861736, + "acc_norm": 0.3160621761658031, + "acc_norm_stderr": 0.033553973696861736 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.3076923076923077, + "acc_stderr": 0.023400928918310502, + "acc_norm": 0.3076923076923077, + "acc_norm_stderr": 0.023400928918310502 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.26296296296296295, + "acc_stderr": 0.026842057873833706, + "acc_norm": 0.26296296296296295, + "acc_norm_stderr": 0.026842057873833706 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.3025210084033613, + "acc_stderr": 0.02983796238829192, + "acc_norm": 0.3025210084033613, + "acc_norm_stderr": 0.02983796238829192 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.3443708609271523, + "acc_stderr": 0.038796870240733264, + "acc_norm": 0.3443708609271523, + "acc_norm_stderr": 0.038796870240733264 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.3357798165137615, + "acc_stderr": 0.020248081396752937, + "acc_norm": 0.3357798165137615, + "acc_norm_stderr": 0.020248081396752937 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.4074074074074074, + "acc_stderr": 0.033509916046960436, + "acc_norm": 0.4074074074074074, + "acc_norm_stderr": 0.033509916046960436 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.22549019607843138, + "acc_stderr": 0.02933116229425173, + "acc_norm": 0.22549019607843138, + "acc_norm_stderr": 0.02933116229425173 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.2109704641350211, + "acc_stderr": 0.02655837250266192, + "acc_norm": 0.2109704641350211, + "acc_norm_stderr": 0.02655837250266192 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.2062780269058296, + "acc_stderr": 0.02715715047956382, + "acc_norm": 0.2062780269058296, + "acc_norm_stderr": 0.02715715047956382 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.2595419847328244, + "acc_stderr": 0.03844876139785271, + "acc_norm": 0.2595419847328244, + "acc_norm_stderr": 0.03844876139785271 + }, 
+ "harness|hendrycksTest-international_law|5": { + "acc": 0.1652892561983471, + "acc_stderr": 0.03390780612972777, + "acc_norm": 0.1652892561983471, + "acc_norm_stderr": 0.03390780612972777 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.2037037037037037, + "acc_stderr": 0.03893542518824847, + "acc_norm": 0.2037037037037037, + "acc_norm_stderr": 0.03893542518824847 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.26380368098159507, + "acc_stderr": 0.03462419931615624, + "acc_norm": 0.26380368098159507, + "acc_norm_stderr": 0.03462419931615624 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.19642857142857142, + "acc_stderr": 0.03770970049347019, + "acc_norm": 0.19642857142857142, + "acc_norm_stderr": 0.03770970049347019 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.3592233009708738, + "acc_stderr": 0.04750458399041692, + "acc_norm": 0.3592233009708738, + "acc_norm_stderr": 0.04750458399041692 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.20512820512820512, + "acc_stderr": 0.026453508054040353, + "acc_norm": 0.20512820512820512, + "acc_norm_stderr": 0.026453508054040353 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.28, + "acc_stderr": 0.045126085985421296, + "acc_norm": 0.28, + "acc_norm_stderr": 0.045126085985421296 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.24010217113665389, + "acc_stderr": 0.015274685213734191, + "acc_norm": 0.24010217113665389, + "acc_norm_stderr": 0.015274685213734191 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.24277456647398843, + "acc_stderr": 0.0230836585869842, + "acc_norm": 0.24277456647398843, + "acc_norm_stderr": 0.0230836585869842 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.2424581005586592, + "acc_stderr": 0.014333522059217889, + "acc_norm": 0.2424581005586592, + "acc_norm_stderr": 0.014333522059217889 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.2679738562091503, + "acc_stderr": 0.025360603796242557, + "acc_norm": 0.2679738562091503, + "acc_norm_stderr": 0.025360603796242557 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.24115755627009647, + "acc_stderr": 0.02429659403476343, + "acc_norm": 0.24115755627009647, + "acc_norm_stderr": 0.02429659403476343 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.2191358024691358, + "acc_stderr": 0.023016705640262196, + "acc_norm": 0.2191358024691358, + "acc_norm_stderr": 0.023016705640262196 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.25886524822695034, + "acc_stderr": 0.02612957252718085, + "acc_norm": 0.25886524822695034, + "acc_norm_stderr": 0.02612957252718085 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.24511082138200782, + "acc_stderr": 0.010986307870045524, + "acc_norm": 0.24511082138200782, + "acc_norm_stderr": 0.010986307870045524 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.4522058823529412, + "acc_stderr": 0.030233758551596455, + "acc_norm": 0.4522058823529412, + "acc_norm_stderr": 0.030233758551596455 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.23529411764705882, + "acc_stderr": 0.01716058723504635, + "acc_norm": 0.23529411764705882, + "acc_norm_stderr": 0.01716058723504635 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.2636363636363636, + "acc_stderr": 0.04220224692971987, + "acc_norm": 0.2636363636363636, + "acc_norm_stderr": 0.04220224692971987 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.3346938775510204, + 
"acc_stderr": 0.030209235226242307, + "acc_norm": 0.3346938775510204, + "acc_norm_stderr": 0.030209235226242307 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.22885572139303484, + "acc_stderr": 0.029705284056772432, + "acc_norm": 0.22885572139303484, + "acc_norm_stderr": 0.029705284056772432 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.28, + "acc_stderr": 0.045126085985421276, + "acc_norm": 0.28, + "acc_norm_stderr": 0.045126085985421276 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.20481927710843373, + "acc_stderr": 0.03141784291663926, + "acc_norm": 0.20481927710843373, + "acc_norm_stderr": 0.03141784291663926 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.25146198830409355, + "acc_stderr": 0.033275044238468436, + "acc_norm": 0.25146198830409355, + "acc_norm_stderr": 0.033275044238468436 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.2252141982864137, + "mc1_stderr": 0.014623240768023505, + "mc2": 0.4075602335796246, + "mc2_stderr": 0.014596763158762415 + }, + "all": { + "acc": 0.26635359145248716, + "acc_stderr": 0.031872871258390986, + "acc_norm": 0.2683974800768219, + "acc_norm_stderr": 0.03189073111905186, + "mc1": 0.2252141982864137, + "mc1_stderr": 0.014623240768023505, + "mc2": 0.4075602335796246, + "mc2_stderr": 0.014596763158762415 + } + }, + "versions": { + "harness|arc:challenge|25": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + 
"harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "all": 0 + }, + "config_general": { + "model_name": "gpt2-medium", + "model_sha": "f65d4965d1221eff2bcf34f53a2ba12120e18f24", + "model_dtype": "torch.float16", + "lighteval_sha": "03c2fad20ff7f5334c33cfee459024b8d7e4a109", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null + }, + "config_tasks": { + "harness|arc:challenge": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness 
task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task" + }, + "summary_tasks": { + "harness|arc:challenge|25": { + "hashes": { + "hash_examples": "17b0cae357c0259e", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "e641be907f06d33d", + "hash_cont_tokens": "c6e2e25e2b25a621" + }, + "truncated": 1568, + "non-truncated": 3119, + "padded": 3087, + "non-padded": 1600, + "effective_few_shots": 25.0, + "num_truncated_few_shots": 0 + }, + "harness|hellaswag|10": { + "hashes": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "faab28c8a52792fc", + "hash_cont_tokens": "8ad5f1a3e4068f36" + }, + "truncated": 1975, + "non-truncated": 38193, + "padded": 38021, + "non-padded": 2147, + "effective_few_shots": 10.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hashes": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "38f6980885e34dfd", + "hash_cont_tokens": "844bd0bf669e8136" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-anatomy|5": { + "hashes": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "3ed9431cd09b2a53", + "hash_cont_tokens": "aa3ffb1a6e4356f5" + }, + "truncated": 0, + "non-truncated": 540, + "padded": 540, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-astronomy|5": { + "hashes": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "a79fd75ecff4dacc", + "hash_cont_tokens": "ca7527d5bdfd389a" + }, + "truncated": 0, + "non-truncated": 608, + "padded": 608, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-business_ethics|5": { + "hashes": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "178d5666661bf5e1", + "hash_cont_tokens": "08a1fa6c8dde9a82" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + 
"effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hashes": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "c926698f7ff06973", + "hash_cont_tokens": "cd61f7de0830a75a" + }, + "truncated": 0, + "non-truncated": 1060, + "padded": 1060, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_biology|5": { + "hashes": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "242f772c5e78312a", + "hash_cont_tokens": "b0c14ed86adbcb56" + }, + "truncated": 0, + "non-truncated": 576, + "padded": 568, + "non-padded": 8, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_chemistry|5": { + "hashes": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "8502d8627d2d7aad", + "hash_cont_tokens": "844bd0bf669e8136" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_computer_science|5": { + "hashes": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "a0d705ea2c235707", + "hash_cont_tokens": "3cf1924b14cbf906" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_mathematics|5": { + "hashes": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "ff09ef7f164943cd", + "hash_cont_tokens": "d09bf08193410dfa" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_medicine|5": { + "hashes": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "aca3949388066394", + "hash_cont_tokens": "62bb469d2a319d91" + }, + "truncated": 20, + "non-truncated": 672, + "padded": 660, + "non-padded": 32, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_physics|5": { + "hashes": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "c4240f372187f487", + "hash_cont_tokens": "bf103c9a1f61ec12" + }, + "truncated": 0, + "non-truncated": 408, + "padded": 404, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-computer_security|5": { + "hashes": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "70a866a1c6ae11ae", + "hash_cont_tokens": "844bd0bf669e8136" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hashes": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "29b68a5b3f3afa5f", + "hash_cont_tokens": "ff5ca3d84bb47a0b" + }, + "truncated": 0, + "non-truncated": 940, + "padded": 940, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-econometrics|5": { + "hashes": { + "hash_examples": 
"64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "a4a0fc579875cdf9", + "hash_cont_tokens": "f3ed369e135c0e74" + }, + "truncated": 0, + "non-truncated": 456, + "padded": 456, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hashes": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "e1c0ec634eb17ebd", + "hash_cont_tokens": "35bf6c0c1a7ee403" + }, + "truncated": 0, + "non-truncated": 580, + "padded": 580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hashes": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "542453ad0f99dacf", + "hash_cont_tokens": "e69647d0f0359a4e" + }, + "truncated": 0, + "non-truncated": 1512, + "padded": 1488, + "non-padded": 24, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-formal_logic|5": { + "hashes": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "dacff0458f665ef2", + "hash_cont_tokens": "2ef491ecaa0b411b" + }, + "truncated": 0, + "non-truncated": 504, + "padded": 504, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-global_facts|5": { + "hashes": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "61dec75d557c2e93", + "hash_cont_tokens": "844bd0bf669e8136" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_biology|5": { + "hashes": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "d0afdf91820cacc8", + "hash_cont_tokens": "2f65e8345a68d860" + }, + "truncated": 0, + "non-truncated": 1240, + "padded": 1240, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hashes": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "75cd47b5490da17b", + "hash_cont_tokens": "c3deabee1deab3a3" + }, + "truncated": 0, + "non-truncated": 812, + "padded": 796, + "non-padded": 16, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hashes": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "e369e98a1d0a7424", + "hash_cont_tokens": "ec161287ac6222f4" + }, + "truncated": 16, + "non-truncated": 384, + "padded": 384, + "non-padded": 16, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hashes": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "502376958174bf81", + "hash_cont_tokens": "c4f2565ca36881d5" + }, + "truncated": 660, + "non-truncated": 0, + "padded": 0, + "non-padded": 660, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_geography|5": { + "hashes": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "a4866b51f8a7a60e", + 
"hash_cont_tokens": "780e569058de22be" + }, + "truncated": 0, + "non-truncated": 792, + "padded": 792, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hashes": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "90f755f89d9fdf5e", + "hash_cont_tokens": "9da45062757ae791" + }, + "truncated": 0, + "non-truncated": 772, + "padded": 772, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hashes": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "fb590ff6d9d11883", + "hash_cont_tokens": "8f5c8baf02161f10" + }, + "truncated": 0, + "non-truncated": 1560, + "padded": 1560, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hashes": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "551dbc75535ad2b8", + "hash_cont_tokens": "fdea101837ab4409" + }, + "truncated": 0, + "non-truncated": 1080, + "padded": 1080, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hashes": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "d86fdf5706ec717c", + "hash_cont_tokens": "985403b262df21a4" + }, + "truncated": 0, + "non-truncated": 952, + "padded": 940, + "non-padded": 12, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_physics|5": { + "hashes": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "a81bca26abd92c41", + "hash_cont_tokens": "56be0c12b78c81a3" + }, + "truncated": 0, + "non-truncated": 604, + "padded": 604, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hashes": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "9c10077b5cda495b", + "hash_cont_tokens": "f524cf6fe64b2a7f" + }, + "truncated": 0, + "non-truncated": 2180, + "padded": 2180, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hashes": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "da0c215d66d16d3e", + "hash_cont_tokens": "421b3dc903711e3d" + }, + "truncated": 4, + "non-truncated": 860, + "padded": 860, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hashes": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "4885a382517deebf", + "hash_cont_tokens": "eab825cf8fbdd085" + }, + "truncated": 816, + "non-truncated": 0, + "padded": 0, + "non-padded": 816, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hashes": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "c1d80e899c4c8872", + "hash_cont_tokens": "e1610a0b694e7b3a" + }, + "truncated": 948, + 
"non-truncated": 0, + "padded": 0, + "non-padded": 948, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_aging|5": { + "hashes": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "39da19ee58ce07e6", + "hash_cont_tokens": "38eafdb22e9fca11" + }, + "truncated": 0, + "non-truncated": 892, + "padded": 892, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_sexuality|5": { + "hashes": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "f7e0441ab1c223e0", + "hash_cont_tokens": "11de075f88fc7cd2" + }, + "truncated": 0, + "non-truncated": 524, + "padded": 524, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-international_law|5": { + "hashes": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "119859c5b8103d0b", + "hash_cont_tokens": "0229c63f045574c2" + }, + "truncated": 0, + "non-truncated": 484, + "padded": 484, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-jurisprudence|5": { + "hashes": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "6ec4910e741606cb", + "hash_cont_tokens": "5c77c6f472688075" + }, + "truncated": 0, + "non-truncated": 432, + "padded": 432, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hashes": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "96d8b2554f777e3a", + "hash_cont_tokens": "25a46284b3589e0d" + }, + "truncated": 0, + "non-truncated": 652, + "padded": 636, + "non-padded": 16, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-machine_learning|5": { + "hashes": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "249811a7d891a411", + "hash_cont_tokens": "d11f2c877fe691dc" + }, + "truncated": 0, + "non-truncated": 448, + "padded": 448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-management|5": { + "hashes": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "e54df495ffeb4f92", + "hash_cont_tokens": "d37808f586a9e9b5" + }, + "truncated": 0, + "non-truncated": 412, + "padded": 412, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-marketing|5": { + "hashes": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "e9110fe64f420eb5", + "hash_cont_tokens": "95faf210efa02f90" + }, + "truncated": 0, + "non-truncated": 936, + "padded": 936, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-medical_genetics|5": { + "hashes": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "743df5701590c1c5", + "hash_cont_tokens": "844bd0bf669e8136" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-miscellaneous|5": { + "hashes": { 
+ "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "4a20a40ea36bad2d", + "hash_cont_tokens": "ef1ae838a09a7521" + }, + "truncated": 0, + "non-truncated": 3132, + "padded": 3132, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_disputes|5": { + "hashes": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "10886977e5516586", + "hash_cont_tokens": "05c35d0e7dd2c7d4" + }, + "truncated": 0, + "non-truncated": 1384, + "padded": 1372, + "non-padded": 12, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hashes": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "66f56ab7c3b9d662", + "hash_cont_tokens": "f1e9e326e9540108" + }, + "truncated": 0, + "non-truncated": 3580, + "padded": 3580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-nutrition|5": { + "hashes": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "c05c54560499ea35", + "hash_cont_tokens": "027ac34198453c9e" + }, + "truncated": 0, + "non-truncated": 1224, + "padded": 1224, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-philosophy|5": { + "hashes": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "9639c3d92ff98a28", + "hash_cont_tokens": "dddff9925c9b675a" + }, + "truncated": 0, + "non-truncated": 1244, + "padded": 1244, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-prehistory|5": { + "hashes": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "91e98834c3a8d8d9", + "hash_cont_tokens": "030e5bb46551865c" + }, + "truncated": 0, + "non-truncated": 1296, + "padded": 1296, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_accounting|5": { + "hashes": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "569fa47691c73088", + "hash_cont_tokens": "42b23299e8bae480" + }, + "truncated": 0, + "non-truncated": 1128, + "padded": 1124, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_law|5": { + "hashes": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "d93d397bd5db1db6", + "hash_cont_tokens": "a2de48df0afbaff7" + }, + "truncated": 6136, + "non-truncated": 0, + "padded": 0, + "non-padded": 6136, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_medicine|5": { + "hashes": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "7f8acbbde12cfb6b", + "hash_cont_tokens": "33dc7eccd5de31ae" + }, + "truncated": 1032, + "non-truncated": 56, + "padded": 48, + "non-padded": 1040, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_psychology|5": { + "hashes": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "3aa766c029099569", + 
"hash_cont_tokens": "2a666dc39f1f52ac" + }, + "truncated": 0, + "non-truncated": 2448, + "padded": 2448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-public_relations|5": { + "hashes": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "87b924f88832986f", + "hash_cont_tokens": "cf3600a50782c6c5" + }, + "truncated": 0, + "non-truncated": 440, + "padded": 440, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-security_studies|5": { + "hashes": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "1aaa84da588878a6", + "hash_cont_tokens": "2e9916279a4cae95" + }, + "truncated": 980, + "non-truncated": 0, + "padded": 0, + "non-padded": 980, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-sociology|5": { + "hashes": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "fb555df6139eb2c8", + "hash_cont_tokens": "555f7a55738bbf37" + }, + "truncated": 0, + "non-truncated": 804, + "padded": 800, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hashes": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "56cf1eebb25eccb1", + "hash_cont_tokens": "844bd0bf669e8136" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-virology|5": { + "hashes": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "c6affac16ec860be", + "hash_cont_tokens": "30d4fa4828c5468f" + }, + "truncated": 0, + "non-truncated": 664, + "padded": 664, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-world_religions|5": { + "hashes": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "d2c5da5a69a6312e", + "hash_cont_tokens": "984061eb58124367" + }, + "truncated": 0, + "non-truncated": 684, + "padded": 684, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|truthfulqa:mc|0": { + "hashes": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "21ee2f46c9c3649e", + "hash_cont_tokens": "f41d0880e9a23f4e" + }, + "truncated": 0, + "non-truncated": 9996, + "padded": 9996, + "non-padded": 0, + "effective_few_shots": 0.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "d84d18e9a963753d", + "hash_full_prompts": "12b540783521a8e6", + "hash_input_tokens": "18a3fbefef0c4910", + "hash_cont_tokens": "6159bf1904a8c8fb" + }, + "total_evaluation_time_secondes": "1297.7707328796387", + "truncated": 14155, + "non-truncated": 96864, + "padded": 96540, + "non-padded": 14479, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/grantprice/Cerebras-GPT-590M-finetuned-DND/results_2023-08-12T08-00-51.924323.json b/eval-results/grantprice/Cerebras-GPT-590M-finetuned-DND/results_2023-08-12T08-00-51.924323.json new file mode 100644 index 0000000000000000000000000000000000000000..11721174c3f2c67c9084dcacc58399b5bea99c13 --- 
/dev/null +++ b/eval-results/grantprice/Cerebras-GPT-590M-finetuned-DND/results_2023-08-12T08-00-51.924323.json @@ -0,0 +1,1365 @@ +{ + "results": { + "harness|arc:challenge|25": { + "acc": 0.21245733788395904, + "acc_stderr": 0.011953482906582954, + "acc_norm": 0.24744027303754265, + "acc_norm_stderr": 0.01261035266329267 + }, + "harness|hellaswag|10": { + "acc": 0.26090420235012945, + "acc_stderr": 0.004382303181183646, + "acc_norm": 0.2784305915156343, + "acc_norm_stderr": 0.00447310453702692 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.22, + "acc_stderr": 0.04163331998932268, + "acc_norm": 0.22, + "acc_norm_stderr": 0.04163331998932268 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.18518518518518517, + "acc_stderr": 0.03355677216313142, + "acc_norm": 0.18518518518518517, + "acc_norm_stderr": 0.03355677216313142 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.17763157894736842, + "acc_stderr": 0.031103182383123398, + "acc_norm": 0.17763157894736842, + "acc_norm_stderr": 0.031103182383123398 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.3, + "acc_stderr": 0.046056618647183814, + "acc_norm": 0.3, + "acc_norm_stderr": 0.046056618647183814 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.21509433962264152, + "acc_stderr": 0.02528839450289137, + "acc_norm": 0.21509433962264152, + "acc_norm_stderr": 0.02528839450289137 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.2569444444444444, + "acc_stderr": 0.03653946969442099, + "acc_norm": 0.2569444444444444, + "acc_norm_stderr": 0.03653946969442099 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.2, + "acc_stderr": 0.04020151261036845, + "acc_norm": 0.2, + "acc_norm_stderr": 0.04020151261036845 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.26, + "acc_stderr": 0.0440844002276808, + "acc_norm": 0.26, + "acc_norm_stderr": 0.0440844002276808 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.21, + "acc_stderr": 0.040936018074033256, + "acc_norm": 0.21, + "acc_norm_stderr": 0.040936018074033256 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.20809248554913296, + "acc_stderr": 0.030952890217749874, + "acc_norm": 0.20809248554913296, + "acc_norm_stderr": 0.030952890217749874 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.21568627450980393, + "acc_stderr": 0.04092563958237654, + "acc_norm": 0.21568627450980393, + "acc_norm_stderr": 0.04092563958237654 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.28, + "acc_stderr": 0.045126085985421276, + "acc_norm": 0.28, + "acc_norm_stderr": 0.045126085985421276 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.26382978723404255, + "acc_stderr": 0.028809989854102973, + "acc_norm": 0.26382978723404255, + "acc_norm_stderr": 0.028809989854102973 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.23684210526315788, + "acc_stderr": 0.039994238792813365, + "acc_norm": 0.23684210526315788, + "acc_norm_stderr": 0.039994238792813365 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.2413793103448276, + "acc_stderr": 0.03565998174135302, + "acc_norm": 0.2413793103448276, + "acc_norm_stderr": 0.03565998174135302 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.20899470899470898, + "acc_stderr": 0.02094048156533486, + "acc_norm": 0.20899470899470898, + "acc_norm_stderr": 0.02094048156533486 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.2857142857142857, + 
"acc_stderr": 0.04040610178208841, + "acc_norm": 0.2857142857142857, + "acc_norm_stderr": 0.04040610178208841 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.18, + "acc_stderr": 0.038612291966536934, + "acc_norm": 0.18, + "acc_norm_stderr": 0.038612291966536934 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.1774193548387097, + "acc_stderr": 0.02173254068932927, + "acc_norm": 0.1774193548387097, + "acc_norm_stderr": 0.02173254068932927 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.15270935960591134, + "acc_stderr": 0.02530890453938063, + "acc_norm": 0.15270935960591134, + "acc_norm_stderr": 0.02530890453938063 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.25, + "acc_stderr": 0.04351941398892446, + "acc_norm": 0.25, + "acc_norm_stderr": 0.04351941398892446 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.21818181818181817, + "acc_stderr": 0.03225078108306289, + "acc_norm": 0.21818181818181817, + "acc_norm_stderr": 0.03225078108306289 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.17676767676767677, + "acc_stderr": 0.027178752639044915, + "acc_norm": 0.17676767676767677, + "acc_norm_stderr": 0.027178752639044915 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.19689119170984457, + "acc_stderr": 0.028697873971860664, + "acc_norm": 0.19689119170984457, + "acc_norm_stderr": 0.028697873971860664 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.20256410256410257, + "acc_stderr": 0.020377660970371372, + "acc_norm": 0.20256410256410257, + "acc_norm_stderr": 0.020377660970371372 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.2111111111111111, + "acc_stderr": 0.024882116857655075, + "acc_norm": 0.2111111111111111, + "acc_norm_stderr": 0.024882116857655075 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.21008403361344538, + "acc_stderr": 0.026461398717471874, + "acc_norm": 0.21008403361344538, + "acc_norm_stderr": 0.026461398717471874 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.1986754966887417, + "acc_stderr": 0.03257847384436776, + "acc_norm": 0.1986754966887417, + "acc_norm_stderr": 0.03257847384436776 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.1926605504587156, + "acc_stderr": 0.016909276884936094, + "acc_norm": 0.1926605504587156, + "acc_norm_stderr": 0.016909276884936094 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.1527777777777778, + "acc_stderr": 0.024536326026134224, + "acc_norm": 0.1527777777777778, + "acc_norm_stderr": 0.024536326026134224 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.25, + "acc_stderr": 0.03039153369274154, + "acc_norm": 0.25, + "acc_norm_stderr": 0.03039153369274154 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.270042194092827, + "acc_stderr": 0.028900721906293426, + "acc_norm": 0.270042194092827, + "acc_norm_stderr": 0.028900721906293426 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.31390134529147984, + "acc_stderr": 0.031146796482972465, + "acc_norm": 0.31390134529147984, + "acc_norm_stderr": 0.031146796482972465 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.2595419847328244, + "acc_stderr": 0.03844876139785271, + "acc_norm": 0.2595419847328244, + "acc_norm_stderr": 0.03844876139785271 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.2396694214876033, + 
"acc_stderr": 0.03896878985070417, + "acc_norm": 0.2396694214876033, + "acc_norm_stderr": 0.03896878985070417 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.25925925925925924, + "acc_stderr": 0.042365112580946336, + "acc_norm": 0.25925925925925924, + "acc_norm_stderr": 0.042365112580946336 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.22085889570552147, + "acc_stderr": 0.032591773927421776, + "acc_norm": 0.22085889570552147, + "acc_norm_stderr": 0.032591773927421776 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.3125, + "acc_stderr": 0.043994650575715215, + "acc_norm": 0.3125, + "acc_norm_stderr": 0.043994650575715215 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.17475728155339806, + "acc_stderr": 0.037601780060266224, + "acc_norm": 0.17475728155339806, + "acc_norm_stderr": 0.037601780060266224 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.2905982905982906, + "acc_stderr": 0.02974504857267404, + "acc_norm": 0.2905982905982906, + "acc_norm_stderr": 0.02974504857267404 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.3, + "acc_stderr": 0.046056618647183814, + "acc_norm": 0.3, + "acc_norm_stderr": 0.046056618647183814 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.23754789272030652, + "acc_stderr": 0.015218733046150193, + "acc_norm": 0.23754789272030652, + "acc_norm_stderr": 0.015218733046150193 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.24855491329479767, + "acc_stderr": 0.023267528432100174, + "acc_norm": 0.24855491329479767, + "acc_norm_stderr": 0.023267528432100174 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.23798882681564246, + "acc_stderr": 0.014242630070574915, + "acc_norm": 0.23798882681564246, + "acc_norm_stderr": 0.014242630070574915 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.22549019607843138, + "acc_stderr": 0.023929155517351284, + "acc_norm": 0.22549019607843138, + "acc_norm_stderr": 0.023929155517351284 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.1864951768488746, + "acc_stderr": 0.02212243977248077, + "acc_norm": 0.1864951768488746, + "acc_norm_stderr": 0.02212243977248077 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.21604938271604937, + "acc_stderr": 0.022899162918445806, + "acc_norm": 0.21604938271604937, + "acc_norm_stderr": 0.022899162918445806 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.23404255319148937, + "acc_stderr": 0.025257861359432417, + "acc_norm": 0.23404255319148937, + "acc_norm_stderr": 0.025257861359432417 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.2457627118644068, + "acc_stderr": 0.010996156635142692, + "acc_norm": 0.2457627118644068, + "acc_norm_stderr": 0.010996156635142692 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.18382352941176472, + "acc_stderr": 0.023529242185193106, + "acc_norm": 0.18382352941176472, + "acc_norm_stderr": 0.023529242185193106 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.25, + "acc_stderr": 0.01751781884501444, + "acc_norm": 0.25, + "acc_norm_stderr": 0.01751781884501444 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.21818181818181817, + "acc_stderr": 0.03955932861795833, + "acc_norm": 0.21818181818181817, + "acc_norm_stderr": 0.03955932861795833 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.18775510204081633, + "acc_stderr": 0.02500025603954621, + "acc_norm": 0.18775510204081633, + "acc_norm_stderr": 0.02500025603954621 + }, 
+ "harness|hendrycksTest-sociology|5": { + "acc": 0.24378109452736318, + "acc_stderr": 0.03036049015401465, + "acc_norm": 0.24378109452736318, + "acc_norm_stderr": 0.03036049015401465 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.28, + "acc_stderr": 0.04512608598542128, + "acc_norm": 0.28, + "acc_norm_stderr": 0.04512608598542128 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.28313253012048195, + "acc_stderr": 0.03507295431370518, + "acc_norm": 0.28313253012048195, + "acc_norm_stderr": 0.03507295431370518 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.3216374269005848, + "acc_stderr": 0.03582529442573122, + "acc_norm": 0.3216374269005848, + "acc_norm_stderr": 0.03582529442573122 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.2558139534883721, + "mc1_stderr": 0.015274176219283349, + "mc2": 0.4826115841429237, + "mc2_stderr": 0.016350418098570588 + }, + "all": { + "acc": 0.2313554296067047, + "acc_stderr": 0.03070734613717412, + "acc_norm": 0.23224541815448585, + "acc_norm_stderr": 0.030720018528912305, + "mc1": 0.2558139534883721, + "mc1_stderr": 0.015274176219283349, + "mc2": 0.4826115841429237, + "mc2_stderr": 0.016350418098570588 + } + }, + "versions": { + "harness|arc:challenge|25": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + 
"harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "all": 0 + }, + "config_general": { + "model_name": "grantprice/Cerebras-GPT-590M-finetuned-DND", + "model_sha": "a0a2fbe342cdc86433913ba5f96978e4703ff672", + "model_dtype": "torch.float16", + "lighteval_sha": "efe93333f9f25e7d48cc67a6bf362e6d576f727b", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null + }, + "config_tasks": { + "harness|arc:challenge": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness 
task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task" + }, + "summary_tasks": { + "harness|arc:challenge|25": { + "hashes": { + "hash_examples": "17b0cae357c0259e", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "fcb10622c87b0ef1", + "hash_cont_tokens": "166e563f6182f0d2" + }, + "truncated": 0, + "non-truncated": 4687, + "padded": 4670, + "non-padded": 17, + "effective_few_shots": 25.0, + "num_truncated_few_shots": 0 + }, + "harness|hellaswag|10": { + "hashes": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "81ae13648c4b529e", + "hash_cont_tokens": "68e90da93a0937a9" + }, + "truncated": 0, + "non-truncated": 40168, + "padded": 40023, + "non-padded": 145, + "effective_few_shots": 10.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hashes": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "1276dc2e762abca8", + "hash_cont_tokens": "28a44907f2213e2f" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-anatomy|5": { + "hashes": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "d50af8a8a98216cd", + "hash_cont_tokens": "2d729a159dfdbddd" + }, + "truncated": 0, + "non-truncated": 540, + "padded": 540, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-astronomy|5": { + "hashes": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "69cc784c2eb33af9", + "hash_cont_tokens": "f9c76a754de95ca3" + }, + "truncated": 0, + "non-truncated": 608, + "padded": 608, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-business_ethics|5": { + "hashes": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "300a468610c9bdc9", + "hash_cont_tokens": "5adc884740f2259d" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + 
"harness|hendrycksTest-clinical_knowledge|5": { + "hashes": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "3a02b678147e3e34", + "hash_cont_tokens": "3e1d3b2458c62b77" + }, + "truncated": 0, + "non-truncated": 1060, + "padded": 1060, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_biology|5": { + "hashes": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "703481671acaac45", + "hash_cont_tokens": "cdd13fb83b6f5282" + }, + "truncated": 0, + "non-truncated": 576, + "padded": 576, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_chemistry|5": { + "hashes": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "e804e0acf0782cf6", + "hash_cont_tokens": "1cb7b79c20973e70" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_computer_science|5": { + "hashes": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "b0a670f33f050e85", + "hash_cont_tokens": "bcdc038d490c74e2" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_mathematics|5": { + "hashes": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "64554d663159d56d", + "hash_cont_tokens": "3a67316bd922aac1" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_medicine|5": { + "hashes": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "dc45be2053e37a73", + "hash_cont_tokens": "1964ef941691fef5" + }, + "truncated": 0, + "non-truncated": 692, + "padded": 692, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_physics|5": { + "hashes": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "bcfed99948f4fdc8", + "hash_cont_tokens": "fd8755c4e6593833" + }, + "truncated": 0, + "non-truncated": 408, + "padded": 408, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-computer_security|5": { + "hashes": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "ae9da12eb434043e", + "hash_cont_tokens": "28a44907f2213e2f" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hashes": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "37a932005e796f96", + "hash_cont_tokens": "627b6ca2a6cffe2d" + }, + "truncated": 0, + "non-truncated": 940, + "padded": 940, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-econometrics|5": { + "hashes": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + 
"hash_input_tokens": "6d4f769f01976034", + "hash_cont_tokens": "dd7e98ab2f3f2b92" + }, + "truncated": 0, + "non-truncated": 456, + "padded": 456, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hashes": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "16bff5a080814aea", + "hash_cont_tokens": "6fd38645b3266a45" + }, + "truncated": 0, + "non-truncated": 580, + "padded": 580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hashes": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "bd01d3c835e5f99b", + "hash_cont_tokens": "da77efdd68a563cd" + }, + "truncated": 0, + "non-truncated": 1512, + "padded": 1512, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-formal_logic|5": { + "hashes": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "aaa95191496e5710", + "hash_cont_tokens": "531801cebecb253b" + }, + "truncated": 0, + "non-truncated": 504, + "padded": 504, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-global_facts|5": { + "hashes": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "20f3a123fd282360", + "hash_cont_tokens": "28a44907f2213e2f" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_biology|5": { + "hashes": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "d86dbb90e1984e58", + "hash_cont_tokens": "4b2c32a3c8172a3d" + }, + "truncated": 0, + "non-truncated": 1240, + "padded": 1240, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hashes": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "304f40ab5951ed20", + "hash_cont_tokens": "afcef330efc395ca" + }, + "truncated": 0, + "non-truncated": 812, + "padded": 812, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hashes": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "c718cb35f110bb14", + "hash_cont_tokens": "d53c31dc13663a78" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hashes": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "94c1059972b4cc17", + "hash_cont_tokens": "0752fe326b0c2c83" + }, + "truncated": 660, + "non-truncated": 0, + "padded": 0, + "non-padded": 660, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_geography|5": { + "hashes": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "386fcb8c4f6ec746", + "hash_cont_tokens": "1aa2cd9416ef8451" + }, + "truncated": 0, + 
"non-truncated": 792, + "padded": 792, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hashes": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "d4c44f0db9da605e", + "hash_cont_tokens": "dabf42ed09412d49" + }, + "truncated": 0, + "non-truncated": 772, + "padded": 772, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hashes": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "dabcd052010ca20f", + "hash_cont_tokens": "b73c767ca8255e20" + }, + "truncated": 0, + "non-truncated": 1560, + "padded": 1560, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hashes": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "b8345503aeb8ea82", + "hash_cont_tokens": "e436735fbe758af4" + }, + "truncated": 0, + "non-truncated": 1080, + "padded": 1080, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hashes": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "0bcb272060f95419", + "hash_cont_tokens": "c1dd2a3517293e30" + }, + "truncated": 0, + "non-truncated": 952, + "padded": 952, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_physics|5": { + "hashes": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "13246d3e5e96a240", + "hash_cont_tokens": "f54ce6f45d96c4b0" + }, + "truncated": 0, + "non-truncated": 604, + "padded": 604, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hashes": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "a4a32b9642c9108b", + "hash_cont_tokens": "8a4ff8848bd1ffab" + }, + "truncated": 0, + "non-truncated": 2180, + "padded": 2180, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hashes": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "a6dc30dedd478862", + "hash_cont_tokens": "482070326dcc14db" + }, + "truncated": 0, + "non-truncated": 864, + "padded": 864, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hashes": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "18de952aa41892a3", + "hash_cont_tokens": "19c2d0435300ae0f" + }, + "truncated": 752, + "non-truncated": 64, + "padded": 64, + "non-padded": 752, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hashes": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "f2e86562e92c2188", + "hash_cont_tokens": "b1749710c852be30" + }, + "truncated": 0, + "non-truncated": 948, + "padded": 948, + "non-padded": 0, + 
"effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_aging|5": { + "hashes": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "905b3010aeeaeaea", + "hash_cont_tokens": "882ac84335f03103" + }, + "truncated": 0, + "non-truncated": 892, + "padded": 892, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_sexuality|5": { + "hashes": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "81b890343872b198", + "hash_cont_tokens": "9e8d46a7d94e4be4" + }, + "truncated": 0, + "non-truncated": 524, + "padded": 524, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-international_law|5": { + "hashes": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "a3bb91882629bd8e", + "hash_cont_tokens": "bcdc0bc7f84d6e75" + }, + "truncated": 0, + "non-truncated": 484, + "padded": 484, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-jurisprudence|5": { + "hashes": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "b60cd31165f87cde", + "hash_cont_tokens": "1d65eb48106d5599" + }, + "truncated": 0, + "non-truncated": 432, + "padded": 432, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hashes": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "014c7e3c5c12029c", + "hash_cont_tokens": "0606992ac742d870" + }, + "truncated": 0, + "non-truncated": 652, + "padded": 652, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-machine_learning|5": { + "hashes": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "7d7ba4c856b71e64", + "hash_cont_tokens": "4e00e86c4726cf28" + }, + "truncated": 0, + "non-truncated": 448, + "padded": 448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-management|5": { + "hashes": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "a40f647b089bb6de", + "hash_cont_tokens": "78137ebe6b139023" + }, + "truncated": 0, + "non-truncated": 412, + "padded": 412, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-marketing|5": { + "hashes": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "2569465ebc548f50", + "hash_cont_tokens": "855b392646799efa" + }, + "truncated": 0, + "non-truncated": 936, + "padded": 936, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-medical_genetics|5": { + "hashes": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "dd50c39bfe67722e", + "hash_cont_tokens": "28a44907f2213e2f" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-miscellaneous|5": { + "hashes": { + "hash_examples": "41adb694024809c2", + 
"hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "ac469ddc6c3fcb0f", + "hash_cont_tokens": "98cfde288cd74ea4" + }, + "truncated": 0, + "non-truncated": 3132, + "padded": 3132, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_disputes|5": { + "hashes": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "2e1f78780d6c4f7d", + "hash_cont_tokens": "0d6be5d663ede340" + }, + "truncated": 0, + "non-truncated": 1384, + "padded": 1384, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hashes": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "29a5db63c59b5ef5", + "hash_cont_tokens": "8d3ae0cade822aa4" + }, + "truncated": 0, + "non-truncated": 3580, + "padded": 3572, + "non-padded": 8, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-nutrition|5": { + "hashes": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "a1b61f7601aaba07", + "hash_cont_tokens": "573881c181c32793" + }, + "truncated": 0, + "non-truncated": 1224, + "padded": 1224, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-philosophy|5": { + "hashes": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "1b76f7a56032eefa", + "hash_cont_tokens": "6abd62a3edc8b9b7" + }, + "truncated": 0, + "non-truncated": 1244, + "padded": 1244, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-prehistory|5": { + "hashes": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "07869ff72846d700", + "hash_cont_tokens": "6445402688120367" + }, + "truncated": 0, + "non-truncated": 1296, + "padded": 1296, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_accounting|5": { + "hashes": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "dd69bf3e23340f0a", + "hash_cont_tokens": "90f6e759fafc0b4d" + }, + "truncated": 0, + "non-truncated": 1128, + "padded": 1128, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_law|5": { + "hashes": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "5ff8c596c29e5743", + "hash_cont_tokens": "1455afdc087891c1" + }, + "truncated": 20, + "non-truncated": 6116, + "padded": 6116, + "non-padded": 20, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_medicine|5": { + "hashes": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "771ede064dc78dee", + "hash_cont_tokens": "bc2727170b1f95cb" + }, + "truncated": 0, + "non-truncated": 1088, + "padded": 1088, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_psychology|5": { + "hashes": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "4cc77eba23343164", + "hash_cont_tokens": "326e10e67f79b051" + }, + "truncated": 
0, + "non-truncated": 2448, + "padded": 2448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-public_relations|5": { + "hashes": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "98d1df69c059f587", + "hash_cont_tokens": "b6f271e075accaf6" + }, + "truncated": 0, + "non-truncated": 440, + "padded": 440, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-security_studies|5": { + "hashes": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "b166e0ae09f633c6", + "hash_cont_tokens": "f3cf9986ceb9cb43" + }, + "truncated": 0, + "non-truncated": 980, + "padded": 980, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-sociology|5": { + "hashes": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "015015c3dd725fac", + "hash_cont_tokens": "ae2e74635d355669" + }, + "truncated": 0, + "non-truncated": 804, + "padded": 804, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hashes": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "faebeec3508ecb29", + "hash_cont_tokens": "28a44907f2213e2f" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-virology|5": { + "hashes": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "42d550320a174619", + "hash_cont_tokens": "5ad382f31b6e8167" + }, + "truncated": 0, + "non-truncated": 664, + "padded": 664, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-world_religions|5": { + "hashes": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "5dea5c824a42dbba", + "hash_cont_tokens": "c27f3566a7d479ab" + }, + "truncated": 0, + "non-truncated": 684, + "padded": 684, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|truthfulqa:mc|0": { + "hashes": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "8f3671a8e2614556", + "hash_cont_tokens": "eb503caaaae2fee8" + }, + "truncated": 0, + "non-truncated": 9996, + "padded": 9996, + "non-padded": 0, + "effective_few_shots": 0.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "d84d18e9a963753d", + "hash_full_prompts": "12b540783521a8e6", + "hash_input_tokens": "13cb38d3cc2fe8ee", + "hash_cont_tokens": "432adfb748f55312" + }, + "total_evaluation_time_secondes": "898.1719336509705", + "truncated": 1432, + "non-truncated": 109587, + "padded": 109417, + "non-padded": 1602, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/hakurei/instruct-12b/results_2023-07-19T18-10-16.385807.json b/eval-results/hakurei/instruct-12b/results_2023-07-19T18-10-16.385807.json new file mode 100644 index 0000000000000000000000000000000000000000..ef29167b2d7dbb883228dc49a8775d375f667d4d --- /dev/null +++ b/eval-results/hakurei/instruct-12b/results_2023-07-19T18-10-16.385807.json @@ -0,0 +1,871 @@ +{ + 
"results": { + "harness|arc:challenge|25": { + "acc": 0.3856655290102389, + "acc_stderr": 0.01422425097325717, + "acc_norm": 0.4257679180887372, + "acc_norm_stderr": 0.014449464278868802 + }, + "harness|hellaswag|10": { + "acc": 0.5104560844453296, + "acc_stderr": 0.004988690229505662, + "acc_norm": 0.6675960963951404, + "acc_norm_stderr": 0.004701121421805423 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.35, + "acc_stderr": 0.0479372485441102, + "acc_norm": 0.35, + "acc_norm_stderr": 0.0479372485441102 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.35555555555555557, + "acc_stderr": 0.04135176749720385, + "acc_norm": 0.35555555555555557, + "acc_norm_stderr": 0.04135176749720385 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.26973684210526316, + "acc_stderr": 0.03611780560284898, + "acc_norm": 0.26973684210526316, + "acc_norm_stderr": 0.03611780560284898 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.24, + "acc_stderr": 0.04292346959909281, + "acc_norm": 0.24, + "acc_norm_stderr": 0.04292346959909281 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.24150943396226415, + "acc_stderr": 0.026341480371118352, + "acc_norm": 0.24150943396226415, + "acc_norm_stderr": 0.026341480371118352 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.2361111111111111, + "acc_stderr": 0.03551446610810826, + "acc_norm": 0.2361111111111111, + "acc_norm_stderr": 0.03551446610810826 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.23, + "acc_stderr": 0.04229525846816506, + "acc_norm": 0.23, + "acc_norm_stderr": 0.04229525846816506 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.29, + "acc_stderr": 0.045604802157206845, + "acc_norm": 0.29, + "acc_norm_stderr": 0.045604802157206845 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.3, + "acc_stderr": 0.046056618647183814, + "acc_norm": 0.3, + "acc_norm_stderr": 0.046056618647183814 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.23121387283236994, + "acc_stderr": 0.03214737302029471, + "acc_norm": 0.23121387283236994, + "acc_norm_stderr": 0.03214737302029471 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.16666666666666666, + "acc_stderr": 0.037082846624165444, + "acc_norm": 0.16666666666666666, + "acc_norm_stderr": 0.037082846624165444 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.3, + "acc_stderr": 0.046056618647183814, + "acc_norm": 0.3, + "acc_norm_stderr": 0.046056618647183814 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.2425531914893617, + "acc_stderr": 0.028020226271200217, + "acc_norm": 0.2425531914893617, + "acc_norm_stderr": 0.028020226271200217 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.2631578947368421, + "acc_stderr": 0.0414243971948936, + "acc_norm": 0.2631578947368421, + "acc_norm_stderr": 0.0414243971948936 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.25517241379310346, + "acc_stderr": 0.03632984052707842, + "acc_norm": 0.25517241379310346, + "acc_norm_stderr": 0.03632984052707842 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.24338624338624337, + "acc_stderr": 0.02210112878741543, + "acc_norm": 0.24338624338624337, + "acc_norm_stderr": 0.02210112878741543 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.1746031746031746, + "acc_stderr": 0.03395490020856112, + "acc_norm": 0.1746031746031746, + "acc_norm_stderr": 0.03395490020856112 + }, + 
"harness|hendrycksTest-global_facts|5": { + "acc": 0.3, + "acc_stderr": 0.046056618647183814, + "acc_norm": 0.3, + "acc_norm_stderr": 0.046056618647183814 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.2870967741935484, + "acc_stderr": 0.025736542745594528, + "acc_norm": 0.2870967741935484, + "acc_norm_stderr": 0.025736542745594528 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.2955665024630542, + "acc_stderr": 0.032104944337514575, + "acc_norm": 0.2955665024630542, + "acc_norm_stderr": 0.032104944337514575 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.3, + "acc_stderr": 0.046056618647183814, + "acc_norm": 0.3, + "acc_norm_stderr": 0.046056618647183814 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.26666666666666666, + "acc_stderr": 0.03453131801885415, + "acc_norm": 0.26666666666666666, + "acc_norm_stderr": 0.03453131801885415 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.3686868686868687, + "acc_stderr": 0.034373055019806184, + "acc_norm": 0.3686868686868687, + "acc_norm_stderr": 0.034373055019806184 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.23316062176165803, + "acc_stderr": 0.030516111371476008, + "acc_norm": 0.23316062176165803, + "acc_norm_stderr": 0.030516111371476008 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.2205128205128205, + "acc_stderr": 0.021020672680827912, + "acc_norm": 0.2205128205128205, + "acc_norm_stderr": 0.021020672680827912 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.2740740740740741, + "acc_stderr": 0.027195934804085626, + "acc_norm": 0.2740740740740741, + "acc_norm_stderr": 0.027195934804085626 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.21428571428571427, + "acc_stderr": 0.02665353159671548, + "acc_norm": 0.21428571428571427, + "acc_norm_stderr": 0.02665353159671548 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.2781456953642384, + "acc_stderr": 0.03658603262763743, + "acc_norm": 0.2781456953642384, + "acc_norm_stderr": 0.03658603262763743 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.24770642201834864, + "acc_stderr": 0.018508143602547805, + "acc_norm": 0.24770642201834864, + "acc_norm_stderr": 0.018508143602547805 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.25, + "acc_stderr": 0.029531221160930918, + "acc_norm": 0.25, + "acc_norm_stderr": 0.029531221160930918 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.25, + "acc_stderr": 0.03039153369274154, + "acc_norm": 0.25, + "acc_norm_stderr": 0.03039153369274154 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.29535864978902954, + "acc_stderr": 0.02969633871342289, + "acc_norm": 0.29535864978902954, + "acc_norm_stderr": 0.02969633871342289 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.23766816143497757, + "acc_stderr": 0.028568079464714277, + "acc_norm": 0.23766816143497757, + "acc_norm_stderr": 0.028568079464714277 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.2595419847328244, + "acc_stderr": 0.03844876139785271, + "acc_norm": 0.2595419847328244, + "acc_norm_stderr": 0.03844876139785271 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.45454545454545453, + "acc_stderr": 0.045454545454545456, + "acc_norm": 0.45454545454545453, + "acc_norm_stderr": 0.045454545454545456 + }, + 
"harness|hendrycksTest-jurisprudence|5": { + "acc": 0.2777777777777778, + "acc_stderr": 0.04330043749650741, + "acc_norm": 0.2777777777777778, + "acc_norm_stderr": 0.04330043749650741 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.294478527607362, + "acc_stderr": 0.03581165790474082, + "acc_norm": 0.294478527607362, + "acc_norm_stderr": 0.03581165790474082 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.24107142857142858, + "acc_stderr": 0.04059867246952687, + "acc_norm": 0.24107142857142858, + "acc_norm_stderr": 0.04059867246952687 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.18446601941747573, + "acc_stderr": 0.03840423627288276, + "acc_norm": 0.18446601941747573, + "acc_norm_stderr": 0.03840423627288276 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.24358974358974358, + "acc_stderr": 0.028120966503914397, + "acc_norm": 0.24358974358974358, + "acc_norm_stderr": 0.028120966503914397 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.24, + "acc_stderr": 0.04292346959909284, + "acc_norm": 0.24, + "acc_norm_stderr": 0.04292346959909284 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.2988505747126437, + "acc_stderr": 0.016369256815093117, + "acc_norm": 0.2988505747126437, + "acc_norm_stderr": 0.016369256815093117 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.29190751445086704, + "acc_stderr": 0.024476994076247337, + "acc_norm": 0.29190751445086704, + "acc_norm_stderr": 0.024476994076247337 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.2424581005586592, + "acc_stderr": 0.014333522059217889, + "acc_norm": 0.2424581005586592, + "acc_norm_stderr": 0.014333522059217889 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.2875816993464052, + "acc_stderr": 0.02591780611714716, + "acc_norm": 0.2875816993464052, + "acc_norm_stderr": 0.02591780611714716 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.3215434083601286, + "acc_stderr": 0.026527724079528872, + "acc_norm": 0.3215434083601286, + "acc_norm_stderr": 0.026527724079528872 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.2716049382716049, + "acc_stderr": 0.024748624490537375, + "acc_norm": 0.2716049382716049, + "acc_norm_stderr": 0.024748624490537375 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.2553191489361702, + "acc_stderr": 0.026011992930902, + "acc_norm": 0.2553191489361702, + "acc_norm_stderr": 0.026011992930902 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.2816166883963494, + "acc_stderr": 0.011487783272786696, + "acc_norm": 0.2816166883963494, + "acc_norm_stderr": 0.011487783272786696 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.20220588235294118, + "acc_stderr": 0.02439819298665492, + "acc_norm": 0.20220588235294118, + "acc_norm_stderr": 0.02439819298665492 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.2875816993464052, + "acc_stderr": 0.018311653053648222, + "acc_norm": 0.2875816993464052, + "acc_norm_stderr": 0.018311653053648222 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.2818181818181818, + "acc_stderr": 0.0430911870994646, + "acc_norm": 0.2818181818181818, + "acc_norm_stderr": 0.0430911870994646 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.27755102040816326, + "acc_stderr": 0.028666857790274648, + "acc_norm": 0.27755102040816326, + "acc_norm_stderr": 0.028666857790274648 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.26865671641791045, + "acc_stderr": 
0.03134328358208955, + "acc_norm": 0.26865671641791045, + "acc_norm_stderr": 0.03134328358208955 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.23, + "acc_stderr": 0.04229525846816505, + "acc_norm": 0.23, + "acc_norm_stderr": 0.04229525846816505 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.25903614457831325, + "acc_stderr": 0.03410646614071855, + "acc_norm": 0.25903614457831325, + "acc_norm_stderr": 0.03410646614071855 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.30994152046783624, + "acc_stderr": 0.03546976959393163, + "acc_norm": 0.30994152046783624, + "acc_norm_stderr": 0.03546976959393163 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.21664626682986537, + "mc1_stderr": 0.014421468452506983, + "mc2": 0.3196486720150373, + "mc2_stderr": 0.013605255058273893 + }, + "all": { + "acc": 0.27403154507818933, + "acc_stderr": 0.032179983157039, + "acc_norm": 0.277374636621042, + "acc_norm_stderr": 0.03217892628412224, + "mc1": 0.21664626682986537, + "mc1_stderr": 0.014421468452506983, + "mc2": 0.3196486720150373, + "mc2_stderr": 0.013605255058273893 + } + }, + "versions": { + "harness|arc:challenge|25": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + 
"harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "all": 0 + }, + "config": { + "model_name": "hakurei/instruct-12b", + "model_sha": "ff4699b502b79c716330b6f761002588a65dcba6", + "model_dtype": "torch.float16", + "lighteval_sha": "43cff840721bd0214adb4e29236a5e2ca1813937", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null + }, + "task_config": { + "harness|arc:challenge": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM 
Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task" + }, + "hashes": { + "harness|arc:challenge|25": { + "hash_examples": "fb8c51b1872daeda", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "573b1b078b6e9deb", + "hash_cont_tokens": "22424bcffb42ecdf" + }, + "harness|hellaswag|10": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "f0fd0caf4d4c1110", + "hash_cont_tokens": "62a15ef112ea07d6" + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "f076ac6b177ca28c", + "hash_cont_tokens": "74c639e56bb475af" + }, + "harness|hendrycksTest-anatomy|5": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "059827606e6b0780", + "hash_cont_tokens": "ec7e2288ab5f1ce9" + }, + "harness|hendrycksTest-astronomy|5": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "1dd0dab88aa9e4b2", + "hash_cont_tokens": "d7e922da5bc6d1bf" + }, + "harness|hendrycksTest-business_ethics|5": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "d51eb5246cbe2173", + "hash_cont_tokens": "08933598b321179c" + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "2337a7f17800c6ec", + "hash_cont_tokens": "bc82b3cc5072f164" + }, + "harness|hendrycksTest-college_biology|5": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "e394ebbb8ceace76", + "hash_cont_tokens": "3bc45e0c4b6d612d" + }, + "harness|hendrycksTest-college_chemistry|5": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "9221fbdf710a6f67", + "hash_cont_tokens": "74c639e56bb475af" + }, + "harness|hendrycksTest-college_computer_science|5": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "ebe2748d21b2ba41", + "hash_cont_tokens": "d839b8186e0f3d94" + }, + "harness|hendrycksTest-college_mathematics|5": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "bfecefb08ffb7faa", + "hash_cont_tokens": "3c16f9c45a7a7272" + }, 
+ "harness|hendrycksTest-college_medicine|5": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "2ac8aec9025dc58b", + "hash_cont_tokens": "16f654508cdc19c4" + }, + "harness|hendrycksTest-college_physics|5": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "faf44c77f43368ef", + "hash_cont_tokens": "a3a24586c7218684" + }, + "harness|hendrycksTest-computer_security|5": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "280c7f12abde10a5", + "hash_cont_tokens": "74c639e56bb475af" + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "217a841c86d2d992", + "hash_cont_tokens": "43818b3dc0c7496f" + }, + "harness|hendrycksTest-econometrics|5": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "354267c0f98aad3b", + "hash_cont_tokens": "4f0a3e41169314a8" + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "4f5e8d051d04dde0", + "hash_cont_tokens": "7e14ccd1e2688bb8" + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "cd12bec1d5448dda", + "hash_cont_tokens": "317e29ee6bba387d" + }, + "harness|hendrycksTest-formal_logic|5": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "c549e395850984fe", + "hash_cont_tokens": "c01a9b75f55e32e0" + }, + "harness|hendrycksTest-global_facts|5": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "81b06f5caa221f97", + "hash_cont_tokens": "74c639e56bb475af" + }, + "harness|hendrycksTest-high_school_biology|5": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "ad626d781102fe51", + "hash_cont_tokens": "edb2063e955bd5ca" + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "2c0d3f2eacc6bbd5", + "hash_cont_tokens": "8000de09bc1dc113" + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "aada51d0571db37b", + "hash_cont_tokens": "dcd6a0ada4ab8e0b" + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "6e47d696116edd01", + "hash_cont_tokens": "47a5e5973f50fe17" + }, + "harness|hendrycksTest-high_school_geography|5": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "0e8ee6c9e572e3c4", + "hash_cont_tokens": "812f79117b9593de" + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "8fa2bf90de3b07e7", + "hash_cont_tokens": "b4c405890ebd3ee1" + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "fabb8f176276af2f", + "hash_cont_tokens": 
"8d468d84a686647d" + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "3e86d13ef021476a", + "hash_cont_tokens": "e5d02f8f1c5dcf31" + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "a132b5e9c9531b36", + "hash_cont_tokens": "4c32e38c066727bc" + }, + "harness|hendrycksTest-high_school_physics|5": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "f8f6fe5143776cb4", + "hash_cont_tokens": "9416ad85fd6f4a2c" + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "e28121967b27a315", + "hash_cont_tokens": "57cc212706ddcdf4" + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "bdbe90efb4a1c4ce", + "hash_cont_tokens": "8c5c954092a64343" + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "b8f58f05dc082011", + "hash_cont_tokens": "e5ab34a54e3f5b7c" + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "3af911bf93093a85", + "hash_cont_tokens": "f3276c80ce1b205b" + }, + "harness|hendrycksTest-human_aging|5": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "1dd2240eb90b9a70", + "hash_cont_tokens": "7982edf99219e1b0" + }, + "harness|hendrycksTest-human_sexuality|5": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "f3de2f8181824a79", + "hash_cont_tokens": "ed73d516c5552dd0" + }, + "harness|hendrycksTest-international_law|5": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "0c2a1dd63cc74137", + "hash_cont_tokens": "549d9b32b8a90e4e" + }, + "harness|hendrycksTest-jurisprudence|5": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "08e3527985f33aab", + "hash_cont_tokens": "ddf5241e450210d6" + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "bf7216a648529f68", + "hash_cont_tokens": "eb791fcbee9e0682" + }, + "harness|hendrycksTest-machine_learning|5": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "28f5891c956afd65", + "hash_cont_tokens": "c66b1f3b46001b09" + }, + "harness|hendrycksTest-management|5": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "6de88b824d4f64c3", + "hash_cont_tokens": "27795e9c98bdeda8" + }, + "harness|hendrycksTest-marketing|5": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "5ef855d01044fd83", + "hash_cont_tokens": "874c5b0b496cbe8a" + }, + "harness|hendrycksTest-medical_genetics|5": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "1840e0b96d7e619e", + "hash_cont_tokens": "74c639e56bb475af" + }, 
+ "harness|hendrycksTest-miscellaneous|5": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "02483f6b53dc13ac", + "hash_cont_tokens": "313ee361fbdbab3c" + }, + "harness|hendrycksTest-moral_disputes|5": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "93202e79d594dde4", + "hash_cont_tokens": "fe7747dc69c4909e" + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "41c03f41d2ba9fe7", + "hash_cont_tokens": "e0d0ad58a3f1ff22" + }, + "harness|hendrycksTest-nutrition|5": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "d83bcb6dd08809ac", + "hash_cont_tokens": "c55a10a018de0228" + }, + "harness|hendrycksTest-philosophy|5": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "65c70474c8a5d205", + "hash_cont_tokens": "7916d26928435f1a" + }, + "harness|hendrycksTest-prehistory|5": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "4d4126ac9a91ac47", + "hash_cont_tokens": "81836c52a10e6ffd" + }, + "harness|hendrycksTest-professional_accounting|5": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "592f80ad364d686a", + "hash_cont_tokens": "f5d669014a273483" + }, + "harness|hendrycksTest-professional_law|5": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "7f837322b1b62ac1", + "hash_cont_tokens": "6b31cf265df9b81b" + }, + "harness|hendrycksTest-professional_medicine|5": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "05a8ef0dd10b4bba", + "hash_cont_tokens": "4b3ac60441ad14ec" + }, + "harness|hendrycksTest-professional_psychology|5": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "3c7944f0b2c49f64", + "hash_cont_tokens": "f139af481f2a9e74" + }, + "harness|hendrycksTest-public_relations|5": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "637e934bb716d5ec", + "hash_cont_tokens": "ca79966b90cda0ea" + }, + "harness|hendrycksTest-security_studies|5": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "3bad229573ed6a9c", + "hash_cont_tokens": "952a2e479fc3a83e" + }, + "harness|hendrycksTest-sociology|5": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "70a479e96d02d5d8", + "hash_cont_tokens": "f49476cf49b37d7c" + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "0d690fc0db462440", + "hash_cont_tokens": "74c639e56bb475af" + }, + "harness|hendrycksTest-virology|5": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "4b0fdf8e692dd640", + "hash_cont_tokens": "0065c4bbe6134c1c" + }, + "harness|hendrycksTest-world_religions|5": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "cfd7092dc8aacd96", + "hash_cont_tokens": "9a178e9ec050bf3e" + }, + "harness|truthfulqa:mc|0": { + "hash_examples": "23176c0531c7b867", + 
"hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "e820abadeb7ebfb3", + "hash_cont_tokens": "7f48ddfffa64eb41" + } + } +} \ No newline at end of file diff --git a/eval-results/hakurei/instruct-12b/results_2023-10-13T03-15-41.124238.json b/eval-results/hakurei/instruct-12b/results_2023-10-13T03-15-41.124238.json new file mode 100644 index 0000000000000000000000000000000000000000..85f5d2cd8985d7013070d3145cfa5a462f7ba746 --- /dev/null +++ b/eval-results/hakurei/instruct-12b/results_2023-10-13T03-15-41.124238.json @@ -0,0 +1,107 @@ +{ + "config_general": { + "model_name": "hakurei/instruct-12b", + "model_sha": "ff4699b502b79c716330b6f761002588a65dcba6", + "model_size": "22.07 GB", + "model_dtype": "torch.float16", + "lighteval_sha": "0f318ecf002208468154899217b3ba7c6ae09374", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|drop|3": { + "em": 0.16977768456375839, + "em_stderr": 0.0038448301206051574, + "f1": 0.22378880033557078, + "f1_stderr": 0.00395143426678314 + }, + "harness|gsm8k|5": { + "acc": 0.002274450341167551, + "acc_stderr": 0.0013121578148674003 + }, + "harness|winogrande|5": { + "acc": 0.6345698500394633, + "acc_stderr": 0.013533965097638788 + }, + "all": { + "em": 0.16977768456375839, + "em_stderr": 0.0038448301206051574, + "f1": 0.22378880033557078, + "f1_stderr": 0.00395143426678314, + "acc": 0.31842215019031544, + "acc_stderr": 0.007423061456253094 + } + }, + "versions": { + "harness|drop|3": 1, + "harness|gsm8k|5": 0, + "harness|winogrande|5": 0, + "all": 0 + }, + "config_tasks": { + "harness|drop": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|drop|3": { + "hashes": { + "hash_examples": "1d27416e8324e9a3", + "hash_full_prompts": "a5513ff9a741b385", + "hash_input_tokens": "4bf3f6ba1bae765a", + "hash_cont_tokens": "aa1e376f3416f5ff" + }, + "truncated": 439, + "non-truncated": 9097, + "padded": 0, + "non-padded": 9536, + "effective_few_shots": 3.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "ef516f9ffbe76423", + "hash_cont_tokens": "7016bcf551149761" + }, + "truncated": 0, + "non-truncated": 1319, + "padded": 0, + "non-padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "c469718508f43cab", + "hash_cont_tokens": "87eeb79172195781" + }, + "truncated": 0, + "non-truncated": 2534, + "padded": 2456, + "non-padded": 78, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "9b4d8993161e637d", + "hash_full_prompts": "08215e527b7e60a5", + "hash_input_tokens": "401c6c49053f17ab", + "hash_cont_tokens": "e233d163791a140c" + }, + "total_evaluation_time_secondes": "8561.643742799759", + "truncated": 439, + "non-truncated": 12950, + "padded": 2456, + "non-padded": 10933, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/hakurei/lotus-12B/results_2023-07-18T13-41-37.836572.json b/eval-results/hakurei/lotus-12B/results_2023-07-18T13-41-37.836572.json new file mode 100644 index 0000000000000000000000000000000000000000..8fa5a63163082cda4fb2e4851670693c1fe5ef09 --- /dev/null 
+++ b/eval-results/hakurei/lotus-12B/results_2023-07-18T13-41-37.836572.json @@ -0,0 +1,871 @@ +{ + "results": { + "harness|arc:challenge|25": { + "acc": 0.26535836177474403, + "acc_stderr": 0.012902554762313962, + "acc_norm": 0.30716723549488056, + "acc_norm_stderr": 0.013481034054980945 + }, + "harness|hellaswag|10": { + "acc": 0.4054969129655447, + "acc_stderr": 0.0048998450871831105, + "acc_norm": 0.5270862378012349, + "acc_norm_stderr": 0.004982454383162063 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.25, + "acc_stderr": 0.04351941398892446, + "acc_norm": 0.25, + "acc_norm_stderr": 0.04351941398892446 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.25925925925925924, + "acc_stderr": 0.03785714465066653, + "acc_norm": 0.25925925925925924, + "acc_norm_stderr": 0.03785714465066653 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.19736842105263158, + "acc_stderr": 0.03238981601699397, + "acc_norm": 0.19736842105263158, + "acc_norm_stderr": 0.03238981601699397 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.28, + "acc_stderr": 0.04512608598542127, + "acc_norm": 0.28, + "acc_norm_stderr": 0.04512608598542127 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.27169811320754716, + "acc_stderr": 0.02737770662467071, + "acc_norm": 0.27169811320754716, + "acc_norm_stderr": 0.02737770662467071 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.2222222222222222, + "acc_stderr": 0.03476590104304134, + "acc_norm": 0.2222222222222222, + "acc_norm_stderr": 0.03476590104304134 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.23, + "acc_stderr": 0.04229525846816505, + "acc_norm": 0.23, + "acc_norm_stderr": 0.04229525846816505 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.17, + "acc_stderr": 0.0377525168068637, + "acc_norm": 0.17, + "acc_norm_stderr": 0.0377525168068637 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.2, + "acc_stderr": 0.04020151261036846, + "acc_norm": 0.2, + "acc_norm_stderr": 0.04020151261036846 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.21965317919075145, + "acc_stderr": 0.031568093627031744, + "acc_norm": 0.21965317919075145, + "acc_norm_stderr": 0.031568093627031744 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.24509803921568626, + "acc_stderr": 0.04280105837364396, + "acc_norm": 0.24509803921568626, + "acc_norm_stderr": 0.04280105837364396 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.17, + "acc_stderr": 0.0377525168068637, + "acc_norm": 0.17, + "acc_norm_stderr": 0.0377525168068637 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.3021276595744681, + "acc_stderr": 0.03001755447188055, + "acc_norm": 0.3021276595744681, + "acc_norm_stderr": 0.03001755447188055 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.2894736842105263, + "acc_stderr": 0.04266339443159394, + "acc_norm": 0.2894736842105263, + "acc_norm_stderr": 0.04266339443159394 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.23448275862068965, + "acc_stderr": 0.035306258743465914, + "acc_norm": 0.23448275862068965, + "acc_norm_stderr": 0.035306258743465914 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.24867724867724866, + "acc_stderr": 0.02226181769240017, + "acc_norm": 0.24867724867724866, + "acc_norm_stderr": 0.02226181769240017 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.18253968253968253, + "acc_stderr": 0.03455071019102148, + "acc_norm": 
0.18253968253968253, + "acc_norm_stderr": 0.03455071019102148 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.18, + "acc_stderr": 0.03861229196653694, + "acc_norm": 0.18, + "acc_norm_stderr": 0.03861229196653694 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.25806451612903225, + "acc_stderr": 0.024892469172462833, + "acc_norm": 0.25806451612903225, + "acc_norm_stderr": 0.024892469172462833 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.29064039408866993, + "acc_stderr": 0.0319474007226554, + "acc_norm": 0.29064039408866993, + "acc_norm_stderr": 0.0319474007226554 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.22, + "acc_stderr": 0.041633319989322695, + "acc_norm": 0.22, + "acc_norm_stderr": 0.041633319989322695 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.26666666666666666, + "acc_stderr": 0.03453131801885416, + "acc_norm": 0.26666666666666666, + "acc_norm_stderr": 0.03453131801885416 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.24242424242424243, + "acc_stderr": 0.030532892233932036, + "acc_norm": 0.24242424242424243, + "acc_norm_stderr": 0.030532892233932036 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.21761658031088082, + "acc_stderr": 0.029778663037752954, + "acc_norm": 0.21761658031088082, + "acc_norm_stderr": 0.029778663037752954 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.22564102564102564, + "acc_stderr": 0.021193632525148533, + "acc_norm": 0.22564102564102564, + "acc_norm_stderr": 0.021193632525148533 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.23703703703703705, + "acc_stderr": 0.02592887613276612, + "acc_norm": 0.23703703703703705, + "acc_norm_stderr": 0.02592887613276612 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.18907563025210083, + "acc_stderr": 0.02543511943810535, + "acc_norm": 0.18907563025210083, + "acc_norm_stderr": 0.02543511943810535 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.19205298013245034, + "acc_stderr": 0.032162984205936135, + "acc_norm": 0.19205298013245034, + "acc_norm_stderr": 0.032162984205936135 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.24403669724770644, + "acc_stderr": 0.018415286351416416, + "acc_norm": 0.24403669724770644, + "acc_norm_stderr": 0.018415286351416416 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.2962962962962963, + "acc_stderr": 0.031141447823536023, + "acc_norm": 0.2962962962962963, + "acc_norm_stderr": 0.031141447823536023 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.2647058823529412, + "acc_stderr": 0.030964517926923403, + "acc_norm": 0.2647058823529412, + "acc_norm_stderr": 0.030964517926923403 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.270042194092827, + "acc_stderr": 0.028900721906293426, + "acc_norm": 0.270042194092827, + "acc_norm_stderr": 0.028900721906293426 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.3452914798206278, + "acc_stderr": 0.031911001928357954, + "acc_norm": 0.3452914798206278, + "acc_norm_stderr": 0.031911001928357954 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.2366412213740458, + "acc_stderr": 0.03727673575596918, + "acc_norm": 0.2366412213740458, + "acc_norm_stderr": 0.03727673575596918 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.2644628099173554, + "acc_stderr": 
0.04026187527591206, + "acc_norm": 0.2644628099173554, + "acc_norm_stderr": 0.04026187527591206 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.25, + "acc_stderr": 0.04186091791394607, + "acc_norm": 0.25, + "acc_norm_stderr": 0.04186091791394607 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.26993865030674846, + "acc_stderr": 0.03487825168497892, + "acc_norm": 0.26993865030674846, + "acc_norm_stderr": 0.03487825168497892 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.26785714285714285, + "acc_stderr": 0.04203277291467764, + "acc_norm": 0.26785714285714285, + "acc_norm_stderr": 0.04203277291467764 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.3300970873786408, + "acc_stderr": 0.04656147110012352, + "acc_norm": 0.3300970873786408, + "acc_norm_stderr": 0.04656147110012352 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.2264957264957265, + "acc_stderr": 0.027421007295392916, + "acc_norm": 0.2264957264957265, + "acc_norm_stderr": 0.027421007295392916 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.35, + "acc_stderr": 0.0479372485441102, + "acc_norm": 0.35, + "acc_norm_stderr": 0.0479372485441102 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.28735632183908044, + "acc_stderr": 0.0161824107306827, + "acc_norm": 0.28735632183908044, + "acc_norm_stderr": 0.0161824107306827 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.2254335260115607, + "acc_stderr": 0.022497230190967547, + "acc_norm": 0.2254335260115607, + "acc_norm_stderr": 0.022497230190967547 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.2424581005586592, + "acc_stderr": 0.014333522059217889, + "acc_norm": 0.2424581005586592, + "acc_norm_stderr": 0.014333522059217889 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.23529411764705882, + "acc_stderr": 0.02428861946604612, + "acc_norm": 0.23529411764705882, + "acc_norm_stderr": 0.02428861946604612 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.2540192926045016, + "acc_stderr": 0.02472386150477169, + "acc_norm": 0.2540192926045016, + "acc_norm_stderr": 0.02472386150477169 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.2808641975308642, + "acc_stderr": 0.025006469755799204, + "acc_norm": 0.2808641975308642, + "acc_norm_stderr": 0.025006469755799204 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.2765957446808511, + "acc_stderr": 0.026684564340461004, + "acc_norm": 0.2765957446808511, + "acc_norm_stderr": 0.026684564340461004 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.23859191655801826, + "acc_stderr": 0.010885929742002205, + "acc_norm": 0.23859191655801826, + "acc_norm_stderr": 0.010885929742002205 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.20588235294117646, + "acc_stderr": 0.024562204314142317, + "acc_norm": 0.20588235294117646, + "acc_norm_stderr": 0.024562204314142317 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.26143790849673204, + "acc_stderr": 0.017776947157528037, + "acc_norm": 0.26143790849673204, + "acc_norm_stderr": 0.017776947157528037 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.2545454545454545, + "acc_stderr": 0.04172343038705383, + "acc_norm": 0.2545454545454545, + "acc_norm_stderr": 0.04172343038705383 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.17551020408163265, + "acc_stderr": 0.024352800722970015, + "acc_norm": 0.17551020408163265, + "acc_norm_stderr": 0.024352800722970015 + }, + 
"harness|hendrycksTest-sociology|5": { + "acc": 0.23383084577114427, + "acc_stderr": 0.029929415408348387, + "acc_norm": 0.23383084577114427, + "acc_norm_stderr": 0.029929415408348387 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.22, + "acc_stderr": 0.041633319989322695, + "acc_norm": 0.22, + "acc_norm_stderr": 0.041633319989322695 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.2710843373493976, + "acc_stderr": 0.03460579907553026, + "acc_norm": 0.2710843373493976, + "acc_norm_stderr": 0.03460579907553026 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.2222222222222222, + "acc_stderr": 0.03188578017686399, + "acc_norm": 0.2222222222222222, + "acc_norm_stderr": 0.03188578017686399 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.22643818849449204, + "mc1_stderr": 0.014651337324602574, + "mc2": 0.40115476804436745, + "mc2_stderr": 0.014756133562988513 + }, + "all": { + "acc": 0.2485367177317206, + "acc_stderr": 0.03124226591981919, + "acc_norm": 0.25130617872419225, + "acc_norm_stderr": 0.03125347081115218, + "mc1": 0.22643818849449204, + "mc1_stderr": 0.014651337324602574, + "mc2": 0.40115476804436745, + "mc2_stderr": 0.014756133562988513 + } + }, + "versions": { + "harness|arc:challenge|25": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + 
"harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "all": 0 + }, + "config": { + "model_name": "hakurei/lotus-12B", + "model_sha": "f212b695aabf5dafb5dccf5013ddb765ba1e47d7", + "model_dtype": "torch.float16", + "lighteval_sha": "43cff840721bd0214adb4e29236a5e2ca1813937", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null + }, + "task_config": { + "harness|arc:challenge": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + 
"harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task" + }, + "hashes": { + "harness|arc:challenge|25": { + "hash_examples": "fb8c51b1872daeda", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "573b1b078b6e9deb", + "hash_cont_tokens": "22424bcffb42ecdf" + }, + "harness|hellaswag|10": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "f0fd0caf4d4c1110", + "hash_cont_tokens": "62a15ef112ea07d6" + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "f076ac6b177ca28c", + "hash_cont_tokens": "74c639e56bb475af" + }, + "harness|hendrycksTest-anatomy|5": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "059827606e6b0780", + "hash_cont_tokens": "ec7e2288ab5f1ce9" + }, + "harness|hendrycksTest-astronomy|5": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "1dd0dab88aa9e4b2", + "hash_cont_tokens": "d7e922da5bc6d1bf" + }, + "harness|hendrycksTest-business_ethics|5": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "d51eb5246cbe2173", + "hash_cont_tokens": "08933598b321179c" + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "2337a7f17800c6ec", + "hash_cont_tokens": "bc82b3cc5072f164" + }, + "harness|hendrycksTest-college_biology|5": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "e394ebbb8ceace76", + "hash_cont_tokens": "3bc45e0c4b6d612d" + }, + "harness|hendrycksTest-college_chemistry|5": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "9221fbdf710a6f67", + "hash_cont_tokens": "74c639e56bb475af" + }, + "harness|hendrycksTest-college_computer_science|5": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "ebe2748d21b2ba41", + "hash_cont_tokens": "d839b8186e0f3d94" + }, + "harness|hendrycksTest-college_mathematics|5": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": 
"770bc4281c973190", + "hash_input_tokens": "bfecefb08ffb7faa", + "hash_cont_tokens": "3c16f9c45a7a7272" + }, + "harness|hendrycksTest-college_medicine|5": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "2ac8aec9025dc58b", + "hash_cont_tokens": "16f654508cdc19c4" + }, + "harness|hendrycksTest-college_physics|5": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "faf44c77f43368ef", + "hash_cont_tokens": "a3a24586c7218684" + }, + "harness|hendrycksTest-computer_security|5": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "280c7f12abde10a5", + "hash_cont_tokens": "74c639e56bb475af" + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "217a841c86d2d992", + "hash_cont_tokens": "43818b3dc0c7496f" + }, + "harness|hendrycksTest-econometrics|5": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "354267c0f98aad3b", + "hash_cont_tokens": "4f0a3e41169314a8" + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "4f5e8d051d04dde0", + "hash_cont_tokens": "7e14ccd1e2688bb8" + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "cd12bec1d5448dda", + "hash_cont_tokens": "317e29ee6bba387d" + }, + "harness|hendrycksTest-formal_logic|5": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "c549e395850984fe", + "hash_cont_tokens": "c01a9b75f55e32e0" + }, + "harness|hendrycksTest-global_facts|5": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "81b06f5caa221f97", + "hash_cont_tokens": "74c639e56bb475af" + }, + "harness|hendrycksTest-high_school_biology|5": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "ad626d781102fe51", + "hash_cont_tokens": "edb2063e955bd5ca" + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "2c0d3f2eacc6bbd5", + "hash_cont_tokens": "8000de09bc1dc113" + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "aada51d0571db37b", + "hash_cont_tokens": "dcd6a0ada4ab8e0b" + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "6e47d696116edd01", + "hash_cont_tokens": "47a5e5973f50fe17" + }, + "harness|hendrycksTest-high_school_geography|5": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "0e8ee6c9e572e3c4", + "hash_cont_tokens": "812f79117b9593de" + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "8fa2bf90de3b07e7", + "hash_cont_tokens": "b4c405890ebd3ee1" + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hash_examples": "59c2915cacfd3fbb", + 
"hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "fabb8f176276af2f", + "hash_cont_tokens": "8d468d84a686647d" + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "3e86d13ef021476a", + "hash_cont_tokens": "e5d02f8f1c5dcf31" + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "a132b5e9c9531b36", + "hash_cont_tokens": "4c32e38c066727bc" + }, + "harness|hendrycksTest-high_school_physics|5": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "f8f6fe5143776cb4", + "hash_cont_tokens": "9416ad85fd6f4a2c" + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "e28121967b27a315", + "hash_cont_tokens": "57cc212706ddcdf4" + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "bdbe90efb4a1c4ce", + "hash_cont_tokens": "8c5c954092a64343" + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "b8f58f05dc082011", + "hash_cont_tokens": "e5ab34a54e3f5b7c" + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "3af911bf93093a85", + "hash_cont_tokens": "f3276c80ce1b205b" + }, + "harness|hendrycksTest-human_aging|5": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "1dd2240eb90b9a70", + "hash_cont_tokens": "7982edf99219e1b0" + }, + "harness|hendrycksTest-human_sexuality|5": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "f3de2f8181824a79", + "hash_cont_tokens": "ed73d516c5552dd0" + }, + "harness|hendrycksTest-international_law|5": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "0c2a1dd63cc74137", + "hash_cont_tokens": "549d9b32b8a90e4e" + }, + "harness|hendrycksTest-jurisprudence|5": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "08e3527985f33aab", + "hash_cont_tokens": "ddf5241e450210d6" + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "bf7216a648529f68", + "hash_cont_tokens": "eb791fcbee9e0682" + }, + "harness|hendrycksTest-machine_learning|5": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "28f5891c956afd65", + "hash_cont_tokens": "c66b1f3b46001b09" + }, + "harness|hendrycksTest-management|5": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "6de88b824d4f64c3", + "hash_cont_tokens": "27795e9c98bdeda8" + }, + "harness|hendrycksTest-marketing|5": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "5ef855d01044fd83", + "hash_cont_tokens": "874c5b0b496cbe8a" + }, + "harness|hendrycksTest-medical_genetics|5": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": 
"202139046daa118f", + "hash_input_tokens": "1840e0b96d7e619e", + "hash_cont_tokens": "74c639e56bb475af" + }, + "harness|hendrycksTest-miscellaneous|5": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "02483f6b53dc13ac", + "hash_cont_tokens": "313ee361fbdbab3c" + }, + "harness|hendrycksTest-moral_disputes|5": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "93202e79d594dde4", + "hash_cont_tokens": "fe7747dc69c4909e" + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "41c03f41d2ba9fe7", + "hash_cont_tokens": "e0d0ad58a3f1ff22" + }, + "harness|hendrycksTest-nutrition|5": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "d83bcb6dd08809ac", + "hash_cont_tokens": "c55a10a018de0228" + }, + "harness|hendrycksTest-philosophy|5": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "65c70474c8a5d205", + "hash_cont_tokens": "7916d26928435f1a" + }, + "harness|hendrycksTest-prehistory|5": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "4d4126ac9a91ac47", + "hash_cont_tokens": "81836c52a10e6ffd" + }, + "harness|hendrycksTest-professional_accounting|5": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "592f80ad364d686a", + "hash_cont_tokens": "f5d669014a273483" + }, + "harness|hendrycksTest-professional_law|5": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "7f837322b1b62ac1", + "hash_cont_tokens": "6b31cf265df9b81b" + }, + "harness|hendrycksTest-professional_medicine|5": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "05a8ef0dd10b4bba", + "hash_cont_tokens": "4b3ac60441ad14ec" + }, + "harness|hendrycksTest-professional_psychology|5": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "3c7944f0b2c49f64", + "hash_cont_tokens": "f139af481f2a9e74" + }, + "harness|hendrycksTest-public_relations|5": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "637e934bb716d5ec", + "hash_cont_tokens": "ca79966b90cda0ea" + }, + "harness|hendrycksTest-security_studies|5": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "3bad229573ed6a9c", + "hash_cont_tokens": "952a2e479fc3a83e" + }, + "harness|hendrycksTest-sociology|5": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "70a479e96d02d5d8", + "hash_cont_tokens": "f49476cf49b37d7c" + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "0d690fc0db462440", + "hash_cont_tokens": "74c639e56bb475af" + }, + "harness|hendrycksTest-virology|5": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "4b0fdf8e692dd640", + "hash_cont_tokens": "0065c4bbe6134c1c" + }, + "harness|hendrycksTest-world_religions|5": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "cfd7092dc8aacd96", + 
"hash_cont_tokens": "9a178e9ec050bf3e" + }, + "harness|truthfulqa:mc|0": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "e820abadeb7ebfb3", + "hash_cont_tokens": "7f48ddfffa64eb41" + } + } +} \ No newline at end of file diff --git a/eval-results/hakurei/mommygpt-3B/results_2023-11-28T02-00-43.800415.json b/eval-results/hakurei/mommygpt-3B/results_2023-11-28T02-00-43.800415.json new file mode 100644 index 0000000000000000000000000000000000000000..933faf1248c69688608e85933e55c25daf8ca88a --- /dev/null +++ b/eval-results/hakurei/mommygpt-3B/results_2023-11-28T02-00-43.800415.json @@ -0,0 +1,1435 @@ +{ + "config_general": { + "lighteval_sha": "9ffc410f6c40b8cfefe7167cb47aefe69ced61e1", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "", + "start_time": 930526.2028628, + "end_time": 942757.91778338, + "total_evaluation_time_secondes": "12231.714920580038", + "model_name": "hakurei/mommygpt-3B", + "model_sha": "0369335d693b753774050ae44dbaf73bac39e9eb", + "model_dtype": "torch.float16", + "model_size": "6.4 GB" + }, + "results": { + "harness|arc:challenge|25": { + "acc": 0.39419795221843, + "acc_stderr": 0.014280522667467325, + "acc_norm": 0.4189419795221843, + "acc_norm_stderr": 0.014418106953639013 + }, + "harness|hellaswag|10": { + "acc": 0.5411272654849631, + "acc_stderr": 0.004972872811662297, + "acc_norm": 0.7168890659231228, + "acc_norm_stderr": 0.004495891440519415 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.24, + "acc_stderr": 0.042923469599092816, + "acc_norm": 0.24, + "acc_norm_stderr": 0.042923469599092816 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.2814814814814815, + "acc_stderr": 0.03885004245800254, + "acc_norm": 0.2814814814814815, + "acc_norm_stderr": 0.03885004245800254 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.32894736842105265, + "acc_stderr": 0.03823428969926604, + "acc_norm": 0.32894736842105265, + "acc_norm_stderr": 0.03823428969926604 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.33, + "acc_stderr": 0.04725815626252606, + "acc_norm": 0.33, + "acc_norm_stderr": 0.04725815626252606 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.27169811320754716, + "acc_stderr": 0.027377706624670713, + "acc_norm": 0.27169811320754716, + "acc_norm_stderr": 0.027377706624670713 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.2638888888888889, + "acc_stderr": 0.03685651095897532, + "acc_norm": 0.2638888888888889, + "acc_norm_stderr": 0.03685651095897532 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.24, + "acc_stderr": 0.04292346959909283, + "acc_norm": 0.24, + "acc_norm_stderr": 0.04292346959909283 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.4, + "acc_stderr": 0.049236596391733084, + "acc_norm": 0.4, + "acc_norm_stderr": 0.049236596391733084 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.36, + "acc_stderr": 0.04824181513244218, + "acc_norm": 0.36, + "acc_norm_stderr": 0.04824181513244218 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.24277456647398843, + "acc_stderr": 0.0326926380614177, + "acc_norm": 0.24277456647398843, + "acc_norm_stderr": 0.0326926380614177 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.30392156862745096, + "acc_stderr": 0.04576665403207762, + "acc_norm": 0.30392156862745096, + "acc_norm_stderr": 0.04576665403207762 + }, + 
"harness|hendrycksTest-computer_security|5": { + "acc": 0.34, + "acc_stderr": 0.04760952285695235, + "acc_norm": 0.34, + "acc_norm_stderr": 0.04760952285695235 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.2936170212765957, + "acc_stderr": 0.02977164271249123, + "acc_norm": 0.2936170212765957, + "acc_norm_stderr": 0.02977164271249123 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.22807017543859648, + "acc_stderr": 0.03947152782669415, + "acc_norm": 0.22807017543859648, + "acc_norm_stderr": 0.03947152782669415 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.2206896551724138, + "acc_stderr": 0.0345593020192481, + "acc_norm": 0.2206896551724138, + "acc_norm_stderr": 0.0345593020192481 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.2857142857142857, + "acc_stderr": 0.023266512213730575, + "acc_norm": 0.2857142857142857, + "acc_norm_stderr": 0.023266512213730575 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.3492063492063492, + "acc_stderr": 0.04263906892795132, + "acc_norm": 0.3492063492063492, + "acc_norm_stderr": 0.04263906892795132 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.38, + "acc_stderr": 0.04878317312145632, + "acc_norm": 0.38, + "acc_norm_stderr": 0.04878317312145632 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.20967741935483872, + "acc_stderr": 0.023157879349083522, + "acc_norm": 0.20967741935483872, + "acc_norm_stderr": 0.023157879349083522 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.26108374384236455, + "acc_stderr": 0.03090379695211447, + "acc_norm": 0.26108374384236455, + "acc_norm_stderr": 0.03090379695211447 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.33, + "acc_stderr": 0.04725815626252605, + "acc_norm": 0.33, + "acc_norm_stderr": 0.04725815626252605 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.296969696969697, + "acc_stderr": 0.03567969772268049, + "acc_norm": 0.296969696969697, + "acc_norm_stderr": 0.03567969772268049 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.2828282828282828, + "acc_stderr": 0.03208779558786751, + "acc_norm": 0.2828282828282828, + "acc_norm_stderr": 0.03208779558786751 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.20725388601036268, + "acc_stderr": 0.029252823291803617, + "acc_norm": 0.20725388601036268, + "acc_norm_stderr": 0.029252823291803617 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.28717948717948716, + "acc_stderr": 0.022939925418530627, + "acc_norm": 0.28717948717948716, + "acc_norm_stderr": 0.022939925418530627 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.24074074074074073, + "acc_stderr": 0.026067159222275794, + "acc_norm": 0.24074074074074073, + "acc_norm_stderr": 0.026067159222275794 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.23529411764705882, + "acc_stderr": 0.02755361446786381, + "acc_norm": 0.23529411764705882, + "acc_norm_stderr": 0.02755361446786381 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.31788079470198677, + "acc_stderr": 0.03802039760107903, + "acc_norm": 0.31788079470198677, + "acc_norm_stderr": 0.03802039760107903 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.26605504587155965, + "acc_stderr": 0.018946022322225593, + "acc_norm": 0.26605504587155965, + "acc_norm_stderr": 0.018946022322225593 + }, + 
"harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.3287037037037037, + "acc_stderr": 0.032036140846700596, + "acc_norm": 0.3287037037037037, + "acc_norm_stderr": 0.032036140846700596 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.25, + "acc_stderr": 0.03039153369274154, + "acc_norm": 0.25, + "acc_norm_stderr": 0.03039153369274154 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.270042194092827, + "acc_stderr": 0.028900721906293426, + "acc_norm": 0.270042194092827, + "acc_norm_stderr": 0.028900721906293426 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.34080717488789236, + "acc_stderr": 0.031811497470553604, + "acc_norm": 0.34080717488789236, + "acc_norm_stderr": 0.031811497470553604 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.24427480916030533, + "acc_stderr": 0.037683359597287434, + "acc_norm": 0.24427480916030533, + "acc_norm_stderr": 0.037683359597287434 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.38016528925619836, + "acc_stderr": 0.04431324501968432, + "acc_norm": 0.38016528925619836, + "acc_norm_stderr": 0.04431324501968432 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.2777777777777778, + "acc_stderr": 0.043300437496507416, + "acc_norm": 0.2777777777777778, + "acc_norm_stderr": 0.043300437496507416 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.2392638036809816, + "acc_stderr": 0.03351953879521269, + "acc_norm": 0.2392638036809816, + "acc_norm_stderr": 0.03351953879521269 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.24107142857142858, + "acc_stderr": 0.04059867246952688, + "acc_norm": 0.24107142857142858, + "acc_norm_stderr": 0.04059867246952688 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.2524271844660194, + "acc_stderr": 0.04301250399690878, + "acc_norm": 0.2524271844660194, + "acc_norm_stderr": 0.04301250399690878 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.31196581196581197, + "acc_stderr": 0.03035152732334494, + "acc_norm": 0.31196581196581197, + "acc_norm_stderr": 0.03035152732334494 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.29, + "acc_stderr": 0.045604802157206845, + "acc_norm": 0.29, + "acc_norm_stderr": 0.045604802157206845 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.2822477650063857, + "acc_stderr": 0.016095302969878544, + "acc_norm": 0.2822477650063857, + "acc_norm_stderr": 0.016095302969878544 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.26878612716763006, + "acc_stderr": 0.023868003262500104, + "acc_norm": 0.26878612716763006, + "acc_norm_stderr": 0.023868003262500104 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.21452513966480447, + "acc_stderr": 0.013728923407828855, + "acc_norm": 0.21452513966480447, + "acc_norm_stderr": 0.013728923407828855 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.24183006535947713, + "acc_stderr": 0.024518195641879334, + "acc_norm": 0.24183006535947713, + "acc_norm_stderr": 0.024518195641879334 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.3183279742765273, + "acc_stderr": 0.02645722506781102, + "acc_norm": 0.3183279742765273, + "acc_norm_stderr": 0.02645722506781102 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.28703703703703703, + "acc_stderr": 0.025171041915309684, + "acc_norm": 0.28703703703703703, + "acc_norm_stderr": 0.025171041915309684 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.2801418439716312, + "acc_stderr": 
0.02678917235114023, + "acc_norm": 0.2801418439716312, + "acc_norm_stderr": 0.02678917235114023 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.2711864406779661, + "acc_stderr": 0.011354581451622985, + "acc_norm": 0.2711864406779661, + "acc_norm_stderr": 0.011354581451622985 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.29044117647058826, + "acc_stderr": 0.027576468622740522, + "acc_norm": 0.29044117647058826, + "acc_norm_stderr": 0.027576468622740522 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.29411764705882354, + "acc_stderr": 0.018433427649401896, + "acc_norm": 0.29411764705882354, + "acc_norm_stderr": 0.018433427649401896 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.3090909090909091, + "acc_stderr": 0.044262946482000985, + "acc_norm": 0.3090909090909091, + "acc_norm_stderr": 0.044262946482000985 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.39591836734693875, + "acc_stderr": 0.03130802899065686, + "acc_norm": 0.39591836734693875, + "acc_norm_stderr": 0.03130802899065686 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.2935323383084577, + "acc_stderr": 0.03220024104534205, + "acc_norm": 0.2935323383084577, + "acc_norm_stderr": 0.03220024104534205 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.27, + "acc_stderr": 0.044619604333847394, + "acc_norm": 0.27, + "acc_norm_stderr": 0.044619604333847394 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.3072289156626506, + "acc_stderr": 0.03591566797824664, + "acc_norm": 0.3072289156626506, + "acc_norm_stderr": 0.03591566797824664 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.3333333333333333, + "acc_stderr": 0.03615507630310933, + "acc_norm": 0.3333333333333333, + "acc_norm_stderr": 0.03615507630310933 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.24724602203182375, + "mc1_stderr": 0.015102404797359652, + "mc2": 0.37904442323905513, + "mc2_stderr": 0.014134668871302195 + }, + "harness|winogrande|5": { + "acc": 0.6582478295185478, + "acc_stderr": 0.013330103018622854 + }, + "harness|drop|3": { + "em": 0.0017827181208053692, + "em_stderr": 0.0004320097346038873, + "f1": 0.06302642617449686, + "f1_stderr": 0.0014610915084093902 + }, + "harness|gsm8k|5": { + "acc": 0.009855951478392721, + "acc_stderr": 0.002721076577041663 + }, + "all": { + "acc": 0.2947974745205159, + "acc_stderr": 0.032190357869113936, + "acc_norm": 0.29686522021180417, + "acc_norm_stderr": 0.03300375005653076, + "mc1": 0.24724602203182375, + "mc1_stderr": 0.015102404797359652, + "mc2": 0.37904442323905513, + "mc2_stderr": 0.014134668871302195, + "em": 0.0017827181208053692, + "em_stderr": 0.0004320097346038873, + "f1": 0.06302642617449686, + "f1_stderr": 0.0014610915084093902 + } + }, + "versions": { + "all": 0, + "harness|arc:challenge|25": 0, + "harness|drop|3": 1, + "harness|gsm8k|5": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + 
"harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "harness|winogrande|5": 0 + }, + "config_tasks": { + "harness|arc:challenge": "LM Harness task", + "harness|drop": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + 
"harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|arc:challenge|25": { + "hashes": { + "hash_examples": "17b0cae357c0259e", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "59c328d432da064f", + "hash_cont_tokens": "2e8835aa03b9c2cf" + }, + "truncated": 0, + "non_truncated": 1172, + "padded": 4676, + "non_padded": 11, + "effective_few_shots": 25.0, + "num_truncated_few_shots": 0 + }, + "harness|hellaswag|10": { + "hashes": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "9eaa83dae54ba52a", + "hash_cont_tokens": "18a48de3edcef462" + }, + "truncated": 0, + "non_truncated": 10042, + "padded": 39987, + "non_padded": 181, + "effective_few_shots": 10.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hashes": { + 
"hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "4129e579fbf0ebc2", + "hash_cont_tokens": "ce26aac83e938006" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-anatomy|5": { + "hashes": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "85c455354ae2ebd0", + "hash_cont_tokens": "1d81fa80e3039a08" + }, + "truncated": 0, + "non_truncated": 135, + "padded": 540, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-astronomy|5": { + "hashes": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "221506ab8405000a", + "hash_cont_tokens": "247dc44c6b578728" + }, + "truncated": 0, + "non_truncated": 152, + "padded": 608, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-business_ethics|5": { + "hashes": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "16c21dd1ddd4ee38", + "hash_cont_tokens": "ce26aac83e938006" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hashes": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "24b21e9d78658e4d", + "hash_cont_tokens": "26e3b69d5fb27bb2" + }, + "truncated": 0, + "non_truncated": 265, + "padded": 1060, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_biology|5": { + "hashes": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "770d74c6a8c9c0b7", + "hash_cont_tokens": "bbda31842f3930d5" + }, + "truncated": 0, + "non_truncated": 144, + "padded": 568, + "non_padded": 8, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_chemistry|5": { + "hashes": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "7dea1631558d65ac", + "hash_cont_tokens": "ce26aac83e938006" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_computer_science|5": { + "hashes": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "22600976f0f9ffc6", + "hash_cont_tokens": "ce26aac83e938006" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_mathematics|5": { + "hashes": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "564ae334c5a56510", + "hash_cont_tokens": "ce26aac83e938006" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_medicine|5": { + "hashes": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "bce86eecdc3bb76a", + "hash_cont_tokens": "894854ed7bec57f7" 
+ }, + "truncated": 0, + "non_truncated": 173, + "padded": 688, + "non_padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_physics|5": { + "hashes": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "1188d9d525ab28e7", + "hash_cont_tokens": "13130ec6de384bbb" + }, + "truncated": 0, + "non_truncated": 102, + "padded": 408, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-computer_security|5": { + "hashes": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "692856445804bec5", + "hash_cont_tokens": "ce26aac83e938006" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hashes": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "5ade2ffc8b9f5d4a", + "hash_cont_tokens": "29089b8b7020611e" + }, + "truncated": 0, + "non_truncated": 235, + "padded": 940, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-econometrics|5": { + "hashes": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "9b766b5e103ce426", + "hash_cont_tokens": "efc596dfa1a1f073" + }, + "truncated": 0, + "non_truncated": 114, + "padded": 456, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hashes": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "dd9935cf301e82f9", + "hash_cont_tokens": "70817a7ac9f44af2" + }, + "truncated": 0, + "non_truncated": 145, + "padded": 560, + "non_padded": 20, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hashes": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "78c8ba2ecf6e0dc2", + "hash_cont_tokens": "937cd53d06cc6e16" + }, + "truncated": 0, + "non_truncated": 378, + "padded": 1512, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-formal_logic|5": { + "hashes": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "661893e4f7f37eba", + "hash_cont_tokens": "eec972abe0fc0f5a" + }, + "truncated": 0, + "non_truncated": 126, + "padded": 504, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-global_facts|5": { + "hashes": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "4a8d10395fdc21f0", + "hash_cont_tokens": "ce26aac83e938006" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_biology|5": { + "hashes": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "816c7d936dbe01da", + "hash_cont_tokens": "94971ccfe8e59c25" + }, + "truncated": 0, + "non_truncated": 310, + "padded": 1240, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + 
"harness|hendrycksTest-high_school_chemistry|5": { + "hashes": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "769ab5386fedf26e", + "hash_cont_tokens": "a78e38b59778a04c" + }, + "truncated": 0, + "non_truncated": 203, + "padded": 812, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hashes": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "5b6bcda94f3ca2df", + "hash_cont_tokens": "ce26aac83e938006" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hashes": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "142b719c7d7d4fe0", + "hash_cont_tokens": "91dc522e4e4e91c3" + }, + "truncated": 660, + "non_truncated": -495, + "padded": 0, + "non_padded": 660, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_geography|5": { + "hashes": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "281dcc445ad0af4a", + "hash_cont_tokens": "f275c901b3d285f9" + }, + "truncated": 0, + "non_truncated": 198, + "padded": 792, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hashes": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "bb8f5852975ec963", + "hash_cont_tokens": "85eb58f423437cce" + }, + "truncated": 0, + "non_truncated": 193, + "padded": 770, + "non_padded": 2, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hashes": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "e769357a349b7644", + "hash_cont_tokens": "39a93706184f896b" + }, + "truncated": 0, + "non_truncated": 390, + "padded": 1560, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hashes": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "4ab345e3c0507320", + "hash_cont_tokens": "d41065d20b689af3" + }, + "truncated": 0, + "non_truncated": 270, + "padded": 1080, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hashes": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "52ec665069da063e", + "hash_cont_tokens": "28c1f7c11bf85409" + }, + "truncated": 0, + "non_truncated": 238, + "padded": 952, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_physics|5": { + "hashes": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "f23b89453c7c6050", + "hash_cont_tokens": "78c510e6c5d316ac" + }, + "truncated": 0, + "non_truncated": 151, + "padded": 604, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hashes": { + 
"hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "bb0f46fa5669c46e", + "hash_cont_tokens": "0ba4ecffc67603c5" + }, + "truncated": 0, + "non_truncated": 545, + "padded": 2180, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hashes": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "db3276d6935c41ac", + "hash_cont_tokens": "4a0339e9ad3efa6d" + }, + "truncated": 0, + "non_truncated": 216, + "padded": 864, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hashes": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "a54112084a848a44", + "hash_cont_tokens": "2529d55ec490f81f" + }, + "truncated": 816, + "non_truncated": -612, + "padded": 0, + "non_padded": 816, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hashes": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "89cf33fb840f27be", + "hash_cont_tokens": "21808b54f5df97b2" + }, + "truncated": 0, + "non_truncated": 237, + "padded": 948, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_aging|5": { + "hashes": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "ecf9f32ac289d1be", + "hash_cont_tokens": "92acdd467ed943e1" + }, + "truncated": 0, + "non_truncated": 223, + "padded": 892, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_sexuality|5": { + "hashes": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "ebf05f3ed8d69562", + "hash_cont_tokens": "a6034ed95a124315" + }, + "truncated": 0, + "non_truncated": 131, + "padded": 524, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-international_law|5": { + "hashes": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "b0d9e6f90b58599e", + "hash_cont_tokens": "223fbf3fd106c04b" + }, + "truncated": 0, + "non_truncated": 121, + "padded": 484, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-jurisprudence|5": { + "hashes": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "ddb8c4eaa3d71594", + "hash_cont_tokens": "7c8e30f486ff156a" + }, + "truncated": 0, + "non_truncated": 108, + "padded": 428, + "non_padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hashes": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "a04883884a711ebf", + "hash_cont_tokens": "b4cc4a8d31bbaa03" + }, + "truncated": 0, + "non_truncated": 163, + "padded": 636, + "non_padded": 16, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-machine_learning|5": { + "hashes": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "d5511967956880ea", + 
"hash_cont_tokens": "7f0e1289ec188e82" + }, + "truncated": 0, + "non_truncated": 112, + "padded": 448, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-management|5": { + "hashes": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "8c35c18f5a96b3b3", + "hash_cont_tokens": "66b726b356a02feb" + }, + "truncated": 0, + "non_truncated": 103, + "padded": 412, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-marketing|5": { + "hashes": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "a80e346390d1f88c", + "hash_cont_tokens": "f08457005b652d25" + }, + "truncated": 0, + "non_truncated": 234, + "padded": 932, + "non_padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-medical_genetics|5": { + "hashes": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "5caf5eb895cd3ccd", + "hash_cont_tokens": "ce26aac83e938006" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-miscellaneous|5": { + "hashes": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "795c466e9f87e4c1", + "hash_cont_tokens": "647bcbd68f292558" + }, + "truncated": 0, + "non_truncated": 783, + "padded": 3132, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_disputes|5": { + "hashes": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "505a224f2325b0ec", + "hash_cont_tokens": "6849b7fe56c50dda" + }, + "truncated": 0, + "non_truncated": 346, + "padded": 1368, + "non_padded": 16, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hashes": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "3f767d07e9ec8662", + "hash_cont_tokens": "81585ec455b1e3e5" + }, + "truncated": 0, + "non_truncated": 895, + "padded": 3580, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-nutrition|5": { + "hashes": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "0bc8cefb3f763640", + "hash_cont_tokens": "471b68eb20e5d34b" + }, + "truncated": 0, + "non_truncated": 306, + "padded": 1224, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-philosophy|5": { + "hashes": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "36e85ac3fd3f3c64", + "hash_cont_tokens": "6e39384b9c0a8cc2" + }, + "truncated": 0, + "non_truncated": 311, + "padded": 1244, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-prehistory|5": { + "hashes": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "1b04a90b19ce0623", + "hash_cont_tokens": "bfe513578190093f" + }, + "truncated": 0, + "non_truncated": 324, + "padded": 1296, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + 
"harness|hendrycksTest-professional_accounting|5": { + "hashes": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "8db39e7efe9edb93", + "hash_cont_tokens": "9ce431b67350b312" + }, + "truncated": 0, + "non_truncated": 282, + "padded": 1128, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_law|5": { + "hashes": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "f638aace411a0bd9", + "hash_cont_tokens": "0ff990d9cc38024d" + }, + "truncated": 168, + "non_truncated": 1366, + "padded": 5968, + "non_padded": 168, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_medicine|5": { + "hashes": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "c0f160879d378d4d", + "hash_cont_tokens": "bc3c70e15bc7dce0" + }, + "truncated": 0, + "non_truncated": 272, + "padded": 1088, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_psychology|5": { + "hashes": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "a66dcd2d6795f6ec", + "hash_cont_tokens": "58464ea26d81f908" + }, + "truncated": 0, + "non_truncated": 612, + "padded": 2448, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-public_relations|5": { + "hashes": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "5263b25641f9702c", + "hash_cont_tokens": "eaf6a5d3ddd39a12" + }, + "truncated": 0, + "non_truncated": 110, + "padded": 440, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-security_studies|5": { + "hashes": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "0350ab02a3d50c5f", + "hash_cont_tokens": "618fd4f954253134" + }, + "truncated": 0, + "non_truncated": 245, + "padded": 980, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-sociology|5": { + "hashes": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "2c8688ec4c1a1673", + "hash_cont_tokens": "b4962d9e583b12c0" + }, + "truncated": 0, + "non_truncated": 201, + "padded": 800, + "non_padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hashes": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "c24ed5c990a2b92c", + "hash_cont_tokens": "ce26aac83e938006" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-virology|5": { + "hashes": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "59ca81fd3abf68b3", + "hash_cont_tokens": "397a75462a9735e3" + }, + "truncated": 0, + "non_truncated": 166, + "padded": 664, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-world_religions|5": { + "hashes": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + 
"hash_input_tokens": "4cebe9a8da92320d", + "hash_cont_tokens": "de629d1414e01de8" + }, + "truncated": 0, + "non_truncated": 171, + "padded": 684, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|truthfulqa:mc|0": { + "hashes": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "3e6036a8ea87ff4f", + "hash_cont_tokens": "df48bc66e06781f2" + }, + "truncated": 0, + "non_truncated": 817, + "padded": 9996, + "non_padded": 0, + "effective_few_shots": 0.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "0591af93c06ece74", + "hash_cont_tokens": "828897df1f4f08a1" + }, + "truncated": 0, + "non_truncated": 1267, + "padded": 2534, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|drop|3": { + "hashes": { + "hash_examples": "1d27416e8324e9a3", + "hash_full_prompts": "a5513ff9a741b385", + "hash_input_tokens": "a65c9eacad86ea52", + "hash_cont_tokens": "fd1608cc81c56741" + }, + "truncated": 980, + "non_truncated": 8556, + "padded": 0, + "non_padded": 9536, + "effective_few_shots": 3.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "bf7d8c6b5e4f7948", + "hash_cont_tokens": "225a8c22e9a37cb2" + }, + "truncated": 0, + "non_truncated": 1319, + "padded": 0, + "non_padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "4eb459f19fc0f29d", + "hash_full_prompts": "21653ed56f202b4e", + "hash_input_tokens": "0cc44b083394b097", + "hash_cont_tokens": "4a4ac9c4b8fd0fde" + }, + "truncated": 2624, + "non_truncated": 35571, + "padded": 111639, + "non_padded": 12769, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/hakurei/mommygpt-3B/results_2023-12-02T15-58-26.242540.json b/eval-results/hakurei/mommygpt-3B/results_2023-12-02T15-58-26.242540.json new file mode 100644 index 0000000000000000000000000000000000000000..ea487d67c581ba5e7bed66201cfa4b2b02e0574b --- /dev/null +++ b/eval-results/hakurei/mommygpt-3B/results_2023-12-02T15-58-26.242540.json @@ -0,0 +1,63 @@ +{ + "config_general": { + "lighteval_sha": "b35d4d84573be82d91c07ea46260f262f72cf69d", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "", + "start_time": 1401599.378324041, + "end_time": 1404271.36015377, + "total_evaluation_time_secondes": "2671.981829729164", + "model_name": "hakurei/mommygpt-3B", + "model_sha": "0369335d693b753774050ae44dbaf73bac39e9eb", + "model_dtype": "torch.float16", + "model_size": "6.4 GB" + }, + "results": { + "harness|gsm8k|5": { + "acc": 0.02122820318423048, + "acc_stderr": 0.003970449129848636 + }, + "all": { + "acc": 0.02122820318423048, + "acc_stderr": 0.003970449129848636 + } + }, + "versions": { + "all": 0, + "harness|gsm8k|5": 0 + }, + "config_tasks": { + "harness|gsm8k": "LM Harness task" + }, + "summary_tasks": { + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "bf7d8c6b5e4f7948", + "hash_cont_tokens": "225a8c22e9a37cb2" + }, + "truncated": 0, + "non_truncated": 1319, + "padded": 0, + "non_padded": 1319, + "effective_few_shots": 
5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "18b756b7813d1bdf", + "hash_full_prompts": "deb3b1dff10b95aa", + "hash_input_tokens": "554ff890feb7b7d6", + "hash_cont_tokens": "7dd7ad471510b94a" + }, + "truncated": 0, + "non_truncated": 1319, + "padded": 0, + "non_padded": 1319, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/harborwater/open-llama-3b-claude-30k/results_2023-11-21T06-37-40.765216.json b/eval-results/harborwater/open-llama-3b-claude-30k/results_2023-11-21T06-37-40.765216.json new file mode 100644 index 0000000000000000000000000000000000000000..1d3bd7acc46e91451220f1937ad16f0f82d5d3bb --- /dev/null +++ b/eval-results/harborwater/open-llama-3b-claude-30k/results_2023-11-21T06-37-40.765216.json @@ -0,0 +1,1435 @@ +{ + "config_general": { + "lighteval_sha": "9ffc410f6c40b8cfefe7167cb47aefe69ced61e1", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "", + "start_time": 318693.35694302, + "end_time": 336283.636153515, + "total_evaluation_time_secondes": "17590.279210495006", + "model_name": "harborwater/open-llama-3b-claude-30k", + "model_sha": "049db7fda44e5ce1e8febf5c3f45e3a93aaaa859", + "model_dtype": "torch.float16", + "model_size": "6.4 GB" + }, + "results": { + "harness|arc:challenge|25": { + "acc": 0.3984641638225256, + "acc_stderr": 0.014306946052735562, + "acc_norm": 0.41723549488054607, + "acc_norm_stderr": 0.014409825518403082 + }, + "harness|hellaswag|10": { + "acc": 0.5435172276438957, + "acc_stderr": 0.00497084669755231, + "acc_norm": 0.7264489145588529, + "acc_norm_stderr": 0.004448701611795089 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.22, + "acc_stderr": 0.04163331998932269, + "acc_norm": 0.22, + "acc_norm_stderr": 0.04163331998932269 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.18518518518518517, + "acc_stderr": 0.03355677216313142, + "acc_norm": 0.18518518518518517, + "acc_norm_stderr": 0.03355677216313142 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.21052631578947367, + "acc_stderr": 0.03317672787533157, + "acc_norm": 0.21052631578947367, + "acc_norm_stderr": 0.03317672787533157 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.31, + "acc_stderr": 0.04648231987117316, + "acc_norm": 0.31, + "acc_norm_stderr": 0.04648231987117316 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.23018867924528302, + "acc_stderr": 0.02590789712240817, + "acc_norm": 0.23018867924528302, + "acc_norm_stderr": 0.02590789712240817 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.25, + "acc_stderr": 0.03621034121889507, + "acc_norm": 0.25, + "acc_norm_stderr": 0.03621034121889507 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.19, + "acc_stderr": 0.03942772444036623, + "acc_norm": 0.19, + "acc_norm_stderr": 0.03942772444036623 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.28, + "acc_stderr": 0.04512608598542127, + "acc_norm": 0.28, + "acc_norm_stderr": 0.04512608598542127 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.21, + "acc_stderr": 0.040936018074033256, + "acc_norm": 0.21, + "acc_norm_stderr": 0.040936018074033256 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.21965317919075145, + "acc_stderr": 0.031568093627031744, + "acc_norm": 0.21965317919075145, + "acc_norm_stderr": 0.031568093627031744 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 
0.24509803921568626, + "acc_stderr": 0.042801058373643966, + "acc_norm": 0.24509803921568626, + "acc_norm_stderr": 0.042801058373643966 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.29, + "acc_stderr": 0.045604802157206845, + "acc_norm": 0.29, + "acc_norm_stderr": 0.045604802157206845 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.31063829787234043, + "acc_stderr": 0.03025123757921317, + "acc_norm": 0.31063829787234043, + "acc_norm_stderr": 0.03025123757921317 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.24561403508771928, + "acc_stderr": 0.0404933929774814, + "acc_norm": 0.24561403508771928, + "acc_norm_stderr": 0.0404933929774814 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.20689655172413793, + "acc_stderr": 0.03375672449560554, + "acc_norm": 0.20689655172413793, + "acc_norm_stderr": 0.03375672449560554 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.20899470899470898, + "acc_stderr": 0.020940481565334866, + "acc_norm": 0.20899470899470898, + "acc_norm_stderr": 0.020940481565334866 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.2857142857142857, + "acc_stderr": 0.04040610178208841, + "acc_norm": 0.2857142857142857, + "acc_norm_stderr": 0.04040610178208841 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.32, + "acc_stderr": 0.046882617226215034, + "acc_norm": 0.32, + "acc_norm_stderr": 0.046882617226215034 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.1774193548387097, + "acc_stderr": 0.021732540689329262, + "acc_norm": 0.1774193548387097, + "acc_norm_stderr": 0.021732540689329262 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.15763546798029557, + "acc_stderr": 0.025639014131172404, + "acc_norm": 0.15763546798029557, + "acc_norm_stderr": 0.025639014131172404 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.3, + "acc_stderr": 0.046056618647183814, + "acc_norm": 0.3, + "acc_norm_stderr": 0.046056618647183814 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.24242424242424243, + "acc_stderr": 0.03346409881055953, + "acc_norm": 0.24242424242424243, + "acc_norm_stderr": 0.03346409881055953 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.17676767676767677, + "acc_stderr": 0.027178752639044915, + "acc_norm": 0.17676767676767677, + "acc_norm_stderr": 0.027178752639044915 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.20207253886010362, + "acc_stderr": 0.02897908979429673, + "acc_norm": 0.20207253886010362, + "acc_norm_stderr": 0.02897908979429673 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.20256410256410257, + "acc_stderr": 0.020377660970371372, + "acc_norm": 0.20256410256410257, + "acc_norm_stderr": 0.020377660970371372 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.21851851851851853, + "acc_stderr": 0.02519575225182379, + "acc_norm": 0.21851851851851853, + "acc_norm_stderr": 0.02519575225182379 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.226890756302521, + "acc_stderr": 0.02720537153827946, + "acc_norm": 0.226890756302521, + "acc_norm_stderr": 0.02720537153827946 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.2119205298013245, + "acc_stderr": 0.033367670865679766, + "acc_norm": 0.2119205298013245, + "acc_norm_stderr": 0.033367670865679766 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 
0.20550458715596331, + "acc_stderr": 0.017324352325016012, + "acc_norm": 0.20550458715596331, + "acc_norm_stderr": 0.017324352325016012 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.1527777777777778, + "acc_stderr": 0.024536326026134224, + "acc_norm": 0.1527777777777778, + "acc_norm_stderr": 0.024536326026134224 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.22549019607843138, + "acc_stderr": 0.02933116229425174, + "acc_norm": 0.22549019607843138, + "acc_norm_stderr": 0.02933116229425174 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.270042194092827, + "acc_stderr": 0.028900721906293426, + "acc_norm": 0.270042194092827, + "acc_norm_stderr": 0.028900721906293426 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.33183856502242154, + "acc_stderr": 0.03160295143776679, + "acc_norm": 0.33183856502242154, + "acc_norm_stderr": 0.03160295143776679 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.2748091603053435, + "acc_stderr": 0.03915345408847836, + "acc_norm": 0.2748091603053435, + "acc_norm_stderr": 0.03915345408847836 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.2231404958677686, + "acc_stderr": 0.03800754475228733, + "acc_norm": 0.2231404958677686, + "acc_norm_stderr": 0.03800754475228733 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.26851851851851855, + "acc_stderr": 0.04284467968052192, + "acc_norm": 0.26851851851851855, + "acc_norm_stderr": 0.04284467968052192 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.22085889570552147, + "acc_stderr": 0.032591773927421776, + "acc_norm": 0.22085889570552147, + "acc_norm_stderr": 0.032591773927421776 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.2767857142857143, + "acc_stderr": 0.04246624336697624, + "acc_norm": 0.2767857142857143, + "acc_norm_stderr": 0.04246624336697624 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.14563106796116504, + "acc_stderr": 0.0349260647662379, + "acc_norm": 0.14563106796116504, + "acc_norm_stderr": 0.0349260647662379 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.3076923076923077, + "acc_stderr": 0.030236389942173116, + "acc_norm": 0.3076923076923077, + "acc_norm_stderr": 0.030236389942173116 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.31, + "acc_stderr": 0.04648231987117316, + "acc_norm": 0.31, + "acc_norm_stderr": 0.04648231987117316 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.24521072796934865, + "acc_stderr": 0.015384352284543936, + "acc_norm": 0.24521072796934865, + "acc_norm_stderr": 0.015384352284543936 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.28034682080924855, + "acc_stderr": 0.024182427496577612, + "acc_norm": 0.28034682080924855, + "acc_norm_stderr": 0.024182427496577612 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.2245810055865922, + "acc_stderr": 0.01395680366654464, + "acc_norm": 0.2245810055865922, + "acc_norm_stderr": 0.01395680366654464 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.21568627450980393, + "acc_stderr": 0.02355083135199509, + "acc_norm": 0.21568627450980393, + "acc_norm_stderr": 0.02355083135199509 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.2315112540192926, + "acc_stderr": 0.02395653276663914, + "acc_norm": 0.2315112540192926, + "acc_norm_stderr": 0.02395653276663914 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.25617283950617287, + "acc_stderr": 0.024288533637726095, + "acc_norm": 
0.25617283950617287, + "acc_norm_stderr": 0.024288533637726095 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.24822695035460993, + "acc_stderr": 0.025770015644290392, + "acc_norm": 0.24822695035460993, + "acc_norm_stderr": 0.025770015644290392 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.22816166883963493, + "acc_stderr": 0.010717992192047882, + "acc_norm": 0.22816166883963493, + "acc_norm_stderr": 0.010717992192047882 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.1875, + "acc_stderr": 0.023709788253811766, + "acc_norm": 0.1875, + "acc_norm_stderr": 0.023709788253811766 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.24673202614379086, + "acc_stderr": 0.0174408203674025, + "acc_norm": 0.24673202614379086, + "acc_norm_stderr": 0.0174408203674025 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.22727272727272727, + "acc_stderr": 0.04013964554072775, + "acc_norm": 0.22727272727272727, + "acc_norm_stderr": 0.04013964554072775 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.2163265306122449, + "acc_stderr": 0.026358916334904038, + "acc_norm": 0.2163265306122449, + "acc_norm_stderr": 0.026358916334904038 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.23383084577114427, + "acc_stderr": 0.029929415408348384, + "acc_norm": 0.23383084577114427, + "acc_norm_stderr": 0.029929415408348384 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.29, + "acc_stderr": 0.045604802157206845, + "acc_norm": 0.29, + "acc_norm_stderr": 0.045604802157206845 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.29518072289156627, + "acc_stderr": 0.035509201856896294, + "acc_norm": 0.29518072289156627, + "acc_norm_stderr": 0.035509201856896294 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.3216374269005848, + "acc_stderr": 0.03582529442573122, + "acc_norm": 0.3216374269005848, + "acc_norm_stderr": 0.03582529442573122 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.24479804161566707, + "mc1_stderr": 0.01505186948671501, + "mc2": 0.38459449683775515, + "mc2_stderr": 0.013974794796020382 + }, + "harness|winogrande|5": { + "acc": 0.665351223362273, + "acc_stderr": 0.013261823629558366 + }, + "harness|drop|3": { + "em": 0.003145973154362416, + "em_stderr": 0.0005734993648436403, + "f1": 0.061020343959731986, + "f1_stderr": 0.0014337915290486725 + }, + "harness|gsm8k|5": { + "acc": 0.01061410159211524, + "acc_stderr": 0.002822713322387704 + }, + "all": { + "acc": 0.2510514177729245, + "acc_stderr": 0.030400819251377123, + "acc_norm": 0.2515232911384234, + "acc_norm_stderr": 0.031151630029863882, + "mc1": 0.24479804161566707, + "mc1_stderr": 0.01505186948671501, + "mc2": 0.38459449683775515, + "mc2_stderr": 0.013974794796020382, + "em": 0.003145973154362416, + "em_stderr": 0.0005734993648436403, + "f1": 0.061020343959731986, + "f1_stderr": 0.0014337915290486725 + } + }, + "versions": { + "all": 0, + "harness|arc:challenge|25": 0, + "harness|drop|3": 1, + "harness|gsm8k|5": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + 
"harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "harness|winogrande|5": 0 + }, + "config_tasks": { + "harness|arc:challenge": "LM Harness task", + "harness|drop": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + 
"harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|arc:challenge|25": { + "hashes": { + "hash_examples": "17b0cae357c0259e", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "59c328d432da064f", + "hash_cont_tokens": "2e8835aa03b9c2cf" + }, + "truncated": 0, + "non_truncated": 1172, + "padded": 4676, + "non_padded": 11, + "effective_few_shots": 25.0, + "num_truncated_few_shots": 0 + }, + "harness|hellaswag|10": { + "hashes": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "9eaa83dae54ba52a", + "hash_cont_tokens": "18a48de3edcef462" + }, + "truncated": 0, + "non_truncated": 10042, + "padded": 39987, + "non_padded": 
181, + "effective_few_shots": 10.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hashes": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "4129e579fbf0ebc2", + "hash_cont_tokens": "ce26aac83e938006" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-anatomy|5": { + "hashes": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "85c455354ae2ebd0", + "hash_cont_tokens": "1d81fa80e3039a08" + }, + "truncated": 0, + "non_truncated": 135, + "padded": 540, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-astronomy|5": { + "hashes": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "221506ab8405000a", + "hash_cont_tokens": "247dc44c6b578728" + }, + "truncated": 0, + "non_truncated": 152, + "padded": 608, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-business_ethics|5": { + "hashes": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "16c21dd1ddd4ee38", + "hash_cont_tokens": "ce26aac83e938006" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hashes": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "24b21e9d78658e4d", + "hash_cont_tokens": "26e3b69d5fb27bb2" + }, + "truncated": 0, + "non_truncated": 265, + "padded": 1060, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_biology|5": { + "hashes": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "770d74c6a8c9c0b7", + "hash_cont_tokens": "bbda31842f3930d5" + }, + "truncated": 0, + "non_truncated": 144, + "padded": 568, + "non_padded": 8, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_chemistry|5": { + "hashes": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "7dea1631558d65ac", + "hash_cont_tokens": "ce26aac83e938006" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_computer_science|5": { + "hashes": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "22600976f0f9ffc6", + "hash_cont_tokens": "ce26aac83e938006" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_mathematics|5": { + "hashes": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "564ae334c5a56510", + "hash_cont_tokens": "ce26aac83e938006" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_medicine|5": { + "hashes": { + "hash_examples": 
"dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "bce86eecdc3bb76a", + "hash_cont_tokens": "894854ed7bec57f7" + }, + "truncated": 0, + "non_truncated": 173, + "padded": 688, + "non_padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_physics|5": { + "hashes": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "1188d9d525ab28e7", + "hash_cont_tokens": "13130ec6de384bbb" + }, + "truncated": 0, + "non_truncated": 102, + "padded": 408, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-computer_security|5": { + "hashes": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "692856445804bec5", + "hash_cont_tokens": "ce26aac83e938006" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hashes": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "5ade2ffc8b9f5d4a", + "hash_cont_tokens": "29089b8b7020611e" + }, + "truncated": 0, + "non_truncated": 235, + "padded": 940, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-econometrics|5": { + "hashes": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "9b766b5e103ce426", + "hash_cont_tokens": "efc596dfa1a1f073" + }, + "truncated": 0, + "non_truncated": 114, + "padded": 456, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hashes": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "dd9935cf301e82f9", + "hash_cont_tokens": "70817a7ac9f44af2" + }, + "truncated": 0, + "non_truncated": 145, + "padded": 560, + "non_padded": 20, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hashes": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "78c8ba2ecf6e0dc2", + "hash_cont_tokens": "937cd53d06cc6e16" + }, + "truncated": 0, + "non_truncated": 378, + "padded": 1512, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-formal_logic|5": { + "hashes": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "661893e4f7f37eba", + "hash_cont_tokens": "eec972abe0fc0f5a" + }, + "truncated": 0, + "non_truncated": 126, + "padded": 504, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-global_facts|5": { + "hashes": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "4a8d10395fdc21f0", + "hash_cont_tokens": "ce26aac83e938006" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_biology|5": { + "hashes": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "816c7d936dbe01da", + "hash_cont_tokens": "94971ccfe8e59c25" + }, + 
"truncated": 0, + "non_truncated": 310, + "padded": 1240, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hashes": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "769ab5386fedf26e", + "hash_cont_tokens": "a78e38b59778a04c" + }, + "truncated": 0, + "non_truncated": 203, + "padded": 812, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hashes": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "5b6bcda94f3ca2df", + "hash_cont_tokens": "ce26aac83e938006" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hashes": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "142b719c7d7d4fe0", + "hash_cont_tokens": "91dc522e4e4e91c3" + }, + "truncated": 660, + "non_truncated": -495, + "padded": 0, + "non_padded": 660, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_geography|5": { + "hashes": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "281dcc445ad0af4a", + "hash_cont_tokens": "f275c901b3d285f9" + }, + "truncated": 0, + "non_truncated": 198, + "padded": 792, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hashes": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "bb8f5852975ec963", + "hash_cont_tokens": "85eb58f423437cce" + }, + "truncated": 0, + "non_truncated": 193, + "padded": 770, + "non_padded": 2, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hashes": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "e769357a349b7644", + "hash_cont_tokens": "39a93706184f896b" + }, + "truncated": 0, + "non_truncated": 390, + "padded": 1560, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hashes": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "4ab345e3c0507320", + "hash_cont_tokens": "d41065d20b689af3" + }, + "truncated": 0, + "non_truncated": 270, + "padded": 1080, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hashes": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "52ec665069da063e", + "hash_cont_tokens": "28c1f7c11bf85409" + }, + "truncated": 0, + "non_truncated": 238, + "padded": 952, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_physics|5": { + "hashes": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "f23b89453c7c6050", + "hash_cont_tokens": "78c510e6c5d316ac" + }, + "truncated": 0, + "non_truncated": 151, + "padded": 604, + "non_padded": 
0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hashes": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "bb0f46fa5669c46e", + "hash_cont_tokens": "0ba4ecffc67603c5" + }, + "truncated": 0, + "non_truncated": 545, + "padded": 2180, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hashes": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "db3276d6935c41ac", + "hash_cont_tokens": "4a0339e9ad3efa6d" + }, + "truncated": 0, + "non_truncated": 216, + "padded": 864, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hashes": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "a54112084a848a44", + "hash_cont_tokens": "2529d55ec490f81f" + }, + "truncated": 816, + "non_truncated": -612, + "padded": 0, + "non_padded": 816, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hashes": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "89cf33fb840f27be", + "hash_cont_tokens": "21808b54f5df97b2" + }, + "truncated": 0, + "non_truncated": 237, + "padded": 948, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_aging|5": { + "hashes": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "ecf9f32ac289d1be", + "hash_cont_tokens": "92acdd467ed943e1" + }, + "truncated": 0, + "non_truncated": 223, + "padded": 892, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_sexuality|5": { + "hashes": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "ebf05f3ed8d69562", + "hash_cont_tokens": "a6034ed95a124315" + }, + "truncated": 0, + "non_truncated": 131, + "padded": 524, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-international_law|5": { + "hashes": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "b0d9e6f90b58599e", + "hash_cont_tokens": "223fbf3fd106c04b" + }, + "truncated": 0, + "non_truncated": 121, + "padded": 484, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-jurisprudence|5": { + "hashes": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "ddb8c4eaa3d71594", + "hash_cont_tokens": "7c8e30f486ff156a" + }, + "truncated": 0, + "non_truncated": 108, + "padded": 428, + "non_padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hashes": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "a04883884a711ebf", + "hash_cont_tokens": "b4cc4a8d31bbaa03" + }, + "truncated": 0, + "non_truncated": 163, + "padded": 636, + "non_padded": 16, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-machine_learning|5": { + "hashes": { + 
"hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "d5511967956880ea", + "hash_cont_tokens": "7f0e1289ec188e82" + }, + "truncated": 0, + "non_truncated": 112, + "padded": 448, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-management|5": { + "hashes": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "8c35c18f5a96b3b3", + "hash_cont_tokens": "66b726b356a02feb" + }, + "truncated": 0, + "non_truncated": 103, + "padded": 412, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-marketing|5": { + "hashes": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "a80e346390d1f88c", + "hash_cont_tokens": "f08457005b652d25" + }, + "truncated": 0, + "non_truncated": 234, + "padded": 932, + "non_padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-medical_genetics|5": { + "hashes": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "5caf5eb895cd3ccd", + "hash_cont_tokens": "ce26aac83e938006" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-miscellaneous|5": { + "hashes": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "795c466e9f87e4c1", + "hash_cont_tokens": "647bcbd68f292558" + }, + "truncated": 0, + "non_truncated": 783, + "padded": 3132, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_disputes|5": { + "hashes": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "505a224f2325b0ec", + "hash_cont_tokens": "6849b7fe56c50dda" + }, + "truncated": 0, + "non_truncated": 346, + "padded": 1368, + "non_padded": 16, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hashes": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "3f767d07e9ec8662", + "hash_cont_tokens": "81585ec455b1e3e5" + }, + "truncated": 0, + "non_truncated": 895, + "padded": 3580, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-nutrition|5": { + "hashes": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "0bc8cefb3f763640", + "hash_cont_tokens": "471b68eb20e5d34b" + }, + "truncated": 0, + "non_truncated": 306, + "padded": 1224, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-philosophy|5": { + "hashes": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "36e85ac3fd3f3c64", + "hash_cont_tokens": "6e39384b9c0a8cc2" + }, + "truncated": 0, + "non_truncated": 311, + "padded": 1244, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-prehistory|5": { + "hashes": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "1b04a90b19ce0623", + "hash_cont_tokens": "bfe513578190093f" + }, + "truncated": 0, + 
"non_truncated": 324, + "padded": 1296, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_accounting|5": { + "hashes": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "8db39e7efe9edb93", + "hash_cont_tokens": "9ce431b67350b312" + }, + "truncated": 0, + "non_truncated": 282, + "padded": 1128, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_law|5": { + "hashes": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "f638aace411a0bd9", + "hash_cont_tokens": "0ff990d9cc38024d" + }, + "truncated": 168, + "non_truncated": 1366, + "padded": 5968, + "non_padded": 168, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_medicine|5": { + "hashes": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "c0f160879d378d4d", + "hash_cont_tokens": "bc3c70e15bc7dce0" + }, + "truncated": 0, + "non_truncated": 272, + "padded": 1088, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_psychology|5": { + "hashes": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "a66dcd2d6795f6ec", + "hash_cont_tokens": "58464ea26d81f908" + }, + "truncated": 0, + "non_truncated": 612, + "padded": 2448, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-public_relations|5": { + "hashes": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "5263b25641f9702c", + "hash_cont_tokens": "eaf6a5d3ddd39a12" + }, + "truncated": 0, + "non_truncated": 110, + "padded": 440, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-security_studies|5": { + "hashes": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "0350ab02a3d50c5f", + "hash_cont_tokens": "618fd4f954253134" + }, + "truncated": 0, + "non_truncated": 245, + "padded": 980, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-sociology|5": { + "hashes": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "2c8688ec4c1a1673", + "hash_cont_tokens": "b4962d9e583b12c0" + }, + "truncated": 0, + "non_truncated": 201, + "padded": 800, + "non_padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hashes": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "c24ed5c990a2b92c", + "hash_cont_tokens": "ce26aac83e938006" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-virology|5": { + "hashes": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "59ca81fd3abf68b3", + "hash_cont_tokens": "397a75462a9735e3" + }, + "truncated": 0, + "non_truncated": 166, + "padded": 664, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + 
"harness|hendrycksTest-world_religions|5": { + "hashes": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "4cebe9a8da92320d", + "hash_cont_tokens": "de629d1414e01de8" + }, + "truncated": 0, + "non_truncated": 171, + "padded": 684, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|truthfulqa:mc|0": { + "hashes": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "3e6036a8ea87ff4f", + "hash_cont_tokens": "df48bc66e06781f2" + }, + "truncated": 0, + "non_truncated": 817, + "padded": 9996, + "non_padded": 0, + "effective_few_shots": 0.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "0591af93c06ece74", + "hash_cont_tokens": "828897df1f4f08a1" + }, + "truncated": 0, + "non_truncated": 1267, + "padded": 2534, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|drop|3": { + "hashes": { + "hash_examples": "1d27416e8324e9a3", + "hash_full_prompts": "a5513ff9a741b385", + "hash_input_tokens": "a65c9eacad86ea52", + "hash_cont_tokens": "76cb9f7fd091b5f4" + }, + "truncated": 980, + "non_truncated": 8556, + "padded": 0, + "non_padded": 9536, + "effective_few_shots": 3.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "bf7d8c6b5e4f7948", + "hash_cont_tokens": "d8fb3065df1af037" + }, + "truncated": 0, + "non_truncated": 1319, + "padded": 0, + "non_padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "4eb459f19fc0f29d", + "hash_full_prompts": "21653ed56f202b4e", + "hash_input_tokens": "0cc44b083394b097", + "hash_cont_tokens": "872d659641a386c7" + }, + "truncated": 2624, + "non_truncated": 35571, + "padded": 111639, + "non_padded": 12769, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/harborwater/open-llama-3b-claude-30k/results_2023-12-02T22-19-50.317589.json b/eval-results/harborwater/open-llama-3b-claude-30k/results_2023-12-02T22-19-50.317589.json new file mode 100644 index 0000000000000000000000000000000000000000..5f8971f6ce84e5d05a6b2f35a56cbfd3a7013edc --- /dev/null +++ b/eval-results/harborwater/open-llama-3b-claude-30k/results_2023-12-02T22-19-50.317589.json @@ -0,0 +1,63 @@ +{ + "config_general": { + "lighteval_sha": "b35d4d84573be82d91c07ea46260f262f72cf69d", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "", + "start_time": 6554.977977486, + "end_time": 10461.362136199, + "total_evaluation_time_secondes": "3906.384158713001", + "model_name": "harborwater/open-llama-3b-claude-30k", + "model_sha": "2e59888f967c0d3fa645fc6b0fbdb9455b94dede", + "model_dtype": "torch.float16", + "model_size": "6.4 GB" + }, + "results": { + "harness|gsm8k|5": { + "acc": 0.021986353297952996, + "acc_stderr": 0.004039162758110046 + }, + "all": { + "acc": 0.021986353297952996, + "acc_stderr": 0.004039162758110046 + } + }, + "versions": { + "all": 0, + "harness|gsm8k|5": 0 + }, + "config_tasks": { + "harness|gsm8k": "LM Harness task" + }, + "summary_tasks": { + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": 
"41d55e83abc0e02d", + "hash_input_tokens": "bf7d8c6b5e4f7948", + "hash_cont_tokens": "d8fb3065df1af037" + }, + "truncated": 0, + "non_truncated": 1319, + "padded": 0, + "non_padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "18b756b7813d1bdf", + "hash_full_prompts": "deb3b1dff10b95aa", + "hash_input_tokens": "554ff890feb7b7d6", + "hash_cont_tokens": "cb641f09af9676e9" + }, + "truncated": 0, + "non_truncated": 1319, + "padded": 0, + "non_padded": 1319, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/harborwater/open-llama-3b-everything-v2/results_2023-10-12T09-37-10.252705.json b/eval-results/harborwater/open-llama-3b-everything-v2/results_2023-10-12T09-37-10.252705.json new file mode 100644 index 0000000000000000000000000000000000000000..02bc05f3f72169e5dad42595d47e16f0930d2132 --- /dev/null +++ b/eval-results/harborwater/open-llama-3b-everything-v2/results_2023-10-12T09-37-10.252705.json @@ -0,0 +1,1367 @@ +{ + "config_general": { + "model_name": "harborwater/open-llama-3b-everything-v2", + "model_sha": "31ce2c1611d9f7d56184ceb5bff6a7e95a180c03", + "model_size": "6.4 GB", + "model_dtype": "torch.float16", + "lighteval_sha": "0f318ecf002208468154899217b3ba7c6ae09374", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|arc:challenge|25": { + "acc": 0.39505119453924914, + "acc_stderr": 0.014285898292938177, + "acc_norm": 0.4283276450511945, + "acc_norm_stderr": 0.014460496367599017 + }, + "harness|hellaswag|10": { + "acc": 0.5509858593905597, + "acc_stderr": 0.004963771168672078, + "acc_norm": 0.7328221469826728, + "acc_norm_stderr": 0.00441581669630307 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.25, + "acc_stderr": 0.04351941398892446, + "acc_norm": 0.25, + "acc_norm_stderr": 0.04351941398892446 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.2740740740740741, + "acc_stderr": 0.03853254836552003, + "acc_norm": 0.2740740740740741, + "acc_norm_stderr": 0.03853254836552003 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.3092105263157895, + "acc_stderr": 0.037610708698674805, + "acc_norm": 0.3092105263157895, + "acc_norm_stderr": 0.037610708698674805 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.29, + "acc_stderr": 0.045604802157206845, + "acc_norm": 0.29, + "acc_norm_stderr": 0.045604802157206845 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.28679245283018867, + "acc_stderr": 0.027834912527544057, + "acc_norm": 0.28679245283018867, + "acc_norm_stderr": 0.027834912527544057 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.2152777777777778, + "acc_stderr": 0.03437079344106136, + "acc_norm": 0.2152777777777778, + "acc_norm_stderr": 0.03437079344106136 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.23, + "acc_stderr": 0.04229525846816505, + "acc_norm": 0.23, + "acc_norm_stderr": 0.04229525846816505 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.22, + "acc_stderr": 0.04163331998932269, + "acc_norm": 0.22, + "acc_norm_stderr": 0.04163331998932269 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.34, + "acc_stderr": 0.04760952285695236, + "acc_norm": 0.34, + "acc_norm_stderr": 0.04760952285695236 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.2023121387283237, + "acc_stderr": 0.03063114553919882, + 
"acc_norm": 0.2023121387283237, + "acc_norm_stderr": 0.03063114553919882 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.19607843137254902, + "acc_stderr": 0.03950581861179961, + "acc_norm": 0.19607843137254902, + "acc_norm_stderr": 0.03950581861179961 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.3, + "acc_stderr": 0.046056618647183814, + "acc_norm": 0.3, + "acc_norm_stderr": 0.046056618647183814 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.32340425531914896, + "acc_stderr": 0.030579442773610334, + "acc_norm": 0.32340425531914896, + "acc_norm_stderr": 0.030579442773610334 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.22807017543859648, + "acc_stderr": 0.03947152782669416, + "acc_norm": 0.22807017543859648, + "acc_norm_stderr": 0.03947152782669416 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.2, + "acc_stderr": 0.03333333333333329, + "acc_norm": 0.2, + "acc_norm_stderr": 0.03333333333333329 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.291005291005291, + "acc_stderr": 0.02339382650048487, + "acc_norm": 0.291005291005291, + "acc_norm_stderr": 0.02339382650048487 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.2777777777777778, + "acc_stderr": 0.04006168083848877, + "acc_norm": 0.2777777777777778, + "acc_norm_stderr": 0.04006168083848877 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.31, + "acc_stderr": 0.04648231987117316, + "acc_norm": 0.31, + "acc_norm_stderr": 0.04648231987117316 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.25161290322580643, + "acc_stderr": 0.02468597928623997, + "acc_norm": 0.25161290322580643, + "acc_norm_stderr": 0.02468597928623997 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.2660098522167488, + "acc_stderr": 0.03108982600293753, + "acc_norm": 0.2660098522167488, + "acc_norm_stderr": 0.03108982600293753 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.29, + "acc_stderr": 0.045604802157206845, + "acc_norm": 0.29, + "acc_norm_stderr": 0.045604802157206845 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.23636363636363636, + "acc_stderr": 0.03317505930009181, + "acc_norm": 0.23636363636363636, + "acc_norm_stderr": 0.03317505930009181 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.2474747474747475, + "acc_stderr": 0.03074630074212451, + "acc_norm": 0.2474747474747475, + "acc_norm_stderr": 0.03074630074212451 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.24870466321243523, + "acc_stderr": 0.031195840877700304, + "acc_norm": 0.24870466321243523, + "acc_norm_stderr": 0.031195840877700304 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.24102564102564103, + "acc_stderr": 0.021685546665333205, + "acc_norm": 0.24102564102564103, + "acc_norm_stderr": 0.021685546665333205 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.26296296296296295, + "acc_stderr": 0.02684205787383371, + "acc_norm": 0.26296296296296295, + "acc_norm_stderr": 0.02684205787383371 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.2815126050420168, + "acc_stderr": 0.029213549414372177, + "acc_norm": 0.2815126050420168, + "acc_norm_stderr": 0.029213549414372177 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.3509933774834437, + "acc_stderr": 0.03896981964257374, + "acc_norm": 0.3509933774834437, + "acc_norm_stderr": 
0.03896981964257374 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.24220183486238533, + "acc_stderr": 0.018368176306598618, + "acc_norm": 0.24220183486238533, + "acc_norm_stderr": 0.018368176306598618 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.17592592592592593, + "acc_stderr": 0.025967420958258526, + "acc_norm": 0.17592592592592593, + "acc_norm_stderr": 0.025967420958258526 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.22058823529411764, + "acc_stderr": 0.029102254389674082, + "acc_norm": 0.22058823529411764, + "acc_norm_stderr": 0.029102254389674082 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.270042194092827, + "acc_stderr": 0.028900721906293426, + "acc_norm": 0.270042194092827, + "acc_norm_stderr": 0.028900721906293426 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.3811659192825112, + "acc_stderr": 0.032596251184168264, + "acc_norm": 0.3811659192825112, + "acc_norm_stderr": 0.032596251184168264 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.22900763358778625, + "acc_stderr": 0.036853466317118506, + "acc_norm": 0.22900763358778625, + "acc_norm_stderr": 0.036853466317118506 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.371900826446281, + "acc_stderr": 0.04412015806624504, + "acc_norm": 0.371900826446281, + "acc_norm_stderr": 0.04412015806624504 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.2777777777777778, + "acc_stderr": 0.04330043749650743, + "acc_norm": 0.2777777777777778, + "acc_norm_stderr": 0.04330043749650743 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.26380368098159507, + "acc_stderr": 0.03462419931615623, + "acc_norm": 0.26380368098159507, + "acc_norm_stderr": 0.03462419931615623 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.3125, + "acc_stderr": 0.043994650575715215, + "acc_norm": 0.3125, + "acc_norm_stderr": 0.043994650575715215 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.2621359223300971, + "acc_stderr": 0.04354631077260597, + "acc_norm": 0.2621359223300971, + "acc_norm_stderr": 0.04354631077260597 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.2905982905982906, + "acc_stderr": 0.029745048572674057, + "acc_norm": 0.2905982905982906, + "acc_norm_stderr": 0.029745048572674057 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.22, + "acc_stderr": 0.0416333199893227, + "acc_norm": 0.22, + "acc_norm_stderr": 0.0416333199893227 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.28735632183908044, + "acc_stderr": 0.0161824107306827, + "acc_norm": 0.28735632183908044, + "acc_norm_stderr": 0.0161824107306827 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.2658959537572254, + "acc_stderr": 0.023786203255508283, + "acc_norm": 0.2658959537572254, + "acc_norm_stderr": 0.023786203255508283 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.2346368715083799, + "acc_stderr": 0.014173044098303679, + "acc_norm": 0.2346368715083799, + "acc_norm_stderr": 0.014173044098303679 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.23529411764705882, + "acc_stderr": 0.024288619466046123, + "acc_norm": 0.23529411764705882, + "acc_norm_stderr": 0.024288619466046123 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.27009646302250806, + "acc_stderr": 0.025218040373410626, + "acc_norm": 0.27009646302250806, + "acc_norm_stderr": 0.025218040373410626 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 
0.2623456790123457, + "acc_stderr": 0.024477222856135107, + "acc_norm": 0.2623456790123457, + "acc_norm_stderr": 0.024477222856135107 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.28368794326241137, + "acc_stderr": 0.026891709428343957, + "acc_norm": 0.28368794326241137, + "acc_norm_stderr": 0.026891709428343957 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.24902216427640156, + "acc_stderr": 0.01104489226404077, + "acc_norm": 0.24902216427640156, + "acc_norm_stderr": 0.01104489226404077 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.1948529411764706, + "acc_stderr": 0.024060599423487414, + "acc_norm": 0.1948529411764706, + "acc_norm_stderr": 0.024060599423487414 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.2679738562091503, + "acc_stderr": 0.017917974069594722, + "acc_norm": 0.2679738562091503, + "acc_norm_stderr": 0.017917974069594722 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.32727272727272727, + "acc_stderr": 0.04494290866252088, + "acc_norm": 0.32727272727272727, + "acc_norm_stderr": 0.04494290866252088 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.2857142857142857, + "acc_stderr": 0.028920583220675596, + "acc_norm": 0.2857142857142857, + "acc_norm_stderr": 0.028920583220675596 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.24875621890547264, + "acc_stderr": 0.030567675938916718, + "acc_norm": 0.24875621890547264, + "acc_norm_stderr": 0.030567675938916718 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.32, + "acc_stderr": 0.04688261722621504, + "acc_norm": 0.32, + "acc_norm_stderr": 0.04688261722621504 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.3253012048192771, + "acc_stderr": 0.03647168523683227, + "acc_norm": 0.3253012048192771, + "acc_norm_stderr": 0.03647168523683227 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.3216374269005848, + "acc_stderr": 0.03582529442573122, + "acc_norm": 0.3216374269005848, + "acc_norm_stderr": 0.03582529442573122 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.23255813953488372, + "mc1_stderr": 0.014789157531080514, + "mc2": 0.3725883051731077, + "mc2_stderr": 0.013630659233484554 + }, + "all": { + "acc": 0.2756642840691482, + "acc_stderr": 0.032294734616731276, + "acc_norm": 0.2793102626810814, + "acc_norm_stderr": 0.03228840654219385, + "mc1": 0.23255813953488372, + "mc1_stderr": 0.014789157531080514, + "mc2": 0.3725883051731077, + "mc2_stderr": 0.013630659233484554 + } + }, + "versions": { + "harness|arc:challenge|25": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + 
"harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "all": 0 + }, + "config_tasks": { + "harness|arc:challenge": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM 
Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task" + }, + "summary_tasks": { + "harness|arc:challenge|25": { + "hashes": { + "hash_examples": "17b0cae357c0259e", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "7cefb32e2563a8e3", + "hash_cont_tokens": "2e8835aa03b9c2cf" + }, + "truncated": 0, + "non-truncated": 4687, + "padded": 4687, + "non-padded": 0, + "effective_few_shots": 25.0, + "num_truncated_few_shots": 0 + }, + "harness|hellaswag|10": { + "hashes": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "e4a72fc2bbea66ff", + "hash_cont_tokens": "18a48de3edcef462" + }, + "truncated": 0, + "non-truncated": 40168, + "padded": 40144, + "non-padded": 24, + "effective_few_shots": 10.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hashes": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "1430bf2cb1d054e2", + "hash_cont_tokens": "ce26aac83e938006" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-anatomy|5": { + "hashes": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "c4f45f8ebf944893", + 
"hash_cont_tokens": "1d81fa80e3039a08" + }, + "truncated": 0, + "non-truncated": 540, + "padded": 540, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-astronomy|5": { + "hashes": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "7b6c0659a104d6af", + "hash_cont_tokens": "247dc44c6b578728" + }, + "truncated": 0, + "non-truncated": 608, + "padded": 608, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-business_ethics|5": { + "hashes": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "ca33ffee63980ac1", + "hash_cont_tokens": "ce26aac83e938006" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hashes": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "a6aba95384c46b37", + "hash_cont_tokens": "26e3b69d5fb27bb2" + }, + "truncated": 0, + "non-truncated": 1060, + "padded": 1060, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_biology|5": { + "hashes": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "95d92a1a2c158e2c", + "hash_cont_tokens": "bbda31842f3930d5" + }, + "truncated": 0, + "non-truncated": 576, + "padded": 576, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_chemistry|5": { + "hashes": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "70284e3c06933186", + "hash_cont_tokens": "ce26aac83e938006" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_computer_science|5": { + "hashes": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "028608b4301fcfd2", + "hash_cont_tokens": "ce26aac83e938006" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_mathematics|5": { + "hashes": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "02619f96ae20cf1e", + "hash_cont_tokens": "ce26aac83e938006" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_medicine|5": { + "hashes": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "0282a73e02cf4b34", + "hash_cont_tokens": "894854ed7bec57f7" + }, + "truncated": 0, + "non-truncated": 692, + "padded": 692, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_physics|5": { + "hashes": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "5d0425cf2abddd51", + "hash_cont_tokens": "13130ec6de384bbb" + }, + "truncated": 0, + "non-truncated": 408, + "padded": 408, + "non-padded": 0, + "effective_few_shots": 5.0, + 
"num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-computer_security|5": { + "hashes": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "560574f683641143", + "hash_cont_tokens": "ce26aac83e938006" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hashes": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "dc3987c35bc329e5", + "hash_cont_tokens": "29089b8b7020611e" + }, + "truncated": 0, + "non-truncated": 940, + "padded": 940, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-econometrics|5": { + "hashes": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "be83fdd674b48356", + "hash_cont_tokens": "efc596dfa1a1f073" + }, + "truncated": 0, + "non-truncated": 456, + "padded": 456, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hashes": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "00155bf1a1a1ebc7", + "hash_cont_tokens": "70817a7ac9f44af2" + }, + "truncated": 0, + "non-truncated": 580, + "padded": 580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hashes": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "ce05b52b00498cf6", + "hash_cont_tokens": "937cd53d06cc6e16" + }, + "truncated": 0, + "non-truncated": 1512, + "padded": 1512, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-formal_logic|5": { + "hashes": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "728bd41242158358", + "hash_cont_tokens": "eec972abe0fc0f5a" + }, + "truncated": 0, + "non-truncated": 504, + "padded": 504, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-global_facts|5": { + "hashes": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "190511206bf21530", + "hash_cont_tokens": "ce26aac83e938006" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_biology|5": { + "hashes": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "2bc219567947ac68", + "hash_cont_tokens": "94971ccfe8e59c25" + }, + "truncated": 0, + "non-truncated": 1240, + "padded": 1240, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hashes": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "8477b93b8643d23f", + "hash_cont_tokens": "a78e38b59778a04c" + }, + "truncated": 0, + "non-truncated": 812, + "padded": 812, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hashes": { + "hash_examples": "8b8cdb1084f24169", 
+ "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "0e15ea7b43890b3c", + "hash_cont_tokens": "ce26aac83e938006" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hashes": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "142b719c7d7d4fe0", + "hash_cont_tokens": "91dc522e4e4e91c3" + }, + "truncated": 660, + "non-truncated": 0, + "padded": 0, + "non-padded": 660, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_geography|5": { + "hashes": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "4bf76efe7796945e", + "hash_cont_tokens": "f275c901b3d285f9" + }, + "truncated": 0, + "non-truncated": 792, + "padded": 792, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hashes": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "e3a453e5fb044f52", + "hash_cont_tokens": "85eb58f423437cce" + }, + "truncated": 0, + "non-truncated": 772, + "padded": 772, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hashes": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "f47a1c2b0c018aff", + "hash_cont_tokens": "39a93706184f896b" + }, + "truncated": 0, + "non-truncated": 1560, + "padded": 1560, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hashes": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "35bc9ee85a563c15", + "hash_cont_tokens": "d41065d20b689af3" + }, + "truncated": 0, + "non-truncated": 1080, + "padded": 1080, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hashes": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "62a083d4ceb83864", + "hash_cont_tokens": "28c1f7c11bf85409" + }, + "truncated": 0, + "non-truncated": 952, + "padded": 952, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_physics|5": { + "hashes": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "cd96d409604783e4", + "hash_cont_tokens": "78c510e6c5d316ac" + }, + "truncated": 0, + "non-truncated": 604, + "padded": 604, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hashes": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "3c716ffc27f83e15", + "hash_cont_tokens": "0ba4ecffc67603c5" + }, + "truncated": 0, + "non-truncated": 2180, + "padded": 2180, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hashes": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": 
"fd8217f7edf722f8", + "hash_cont_tokens": "4a0339e9ad3efa6d" + }, + "truncated": 0, + "non-truncated": 864, + "padded": 864, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hashes": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "a54112084a848a44", + "hash_cont_tokens": "2529d55ec490f81f" + }, + "truncated": 816, + "non-truncated": 0, + "padded": 0, + "non-padded": 816, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hashes": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "89cf33fb840f27be", + "hash_cont_tokens": "21808b54f5df97b2" + }, + "truncated": 0, + "non-truncated": 948, + "padded": 948, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_aging|5": { + "hashes": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "0a2b6ab3ae0e3b7c", + "hash_cont_tokens": "92acdd467ed943e1" + }, + "truncated": 0, + "non-truncated": 892, + "padded": 892, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_sexuality|5": { + "hashes": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "f28777a6fdce1d2b", + "hash_cont_tokens": "a6034ed95a124315" + }, + "truncated": 0, + "non-truncated": 524, + "padded": 524, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-international_law|5": { + "hashes": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "8282921a7a07bd5a", + "hash_cont_tokens": "223fbf3fd106c04b" + }, + "truncated": 0, + "non-truncated": 484, + "padded": 484, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-jurisprudence|5": { + "hashes": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "3aa62568b80ee7ca", + "hash_cont_tokens": "7c8e30f486ff156a" + }, + "truncated": 0, + "non-truncated": 432, + "padded": 432, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hashes": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "731b1d04f2da3d9a", + "hash_cont_tokens": "b4cc4a8d31bbaa03" + }, + "truncated": 0, + "non-truncated": 652, + "padded": 652, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-machine_learning|5": { + "hashes": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "96e1af14c8358ac2", + "hash_cont_tokens": "7f0e1289ec188e82" + }, + "truncated": 0, + "non-truncated": 448, + "padded": 448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-management|5": { + "hashes": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "bc2e4bf4e7cf5c39", + "hash_cont_tokens": "66b726b356a02feb" + }, + "truncated": 0, + "non-truncated": 412, + "padded": 412, + "non-padded": 0, + 
"effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-marketing|5": { + "hashes": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "abed130d5c3867a4", + "hash_cont_tokens": "f08457005b652d25" + }, + "truncated": 0, + "non-truncated": 936, + "padded": 936, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-medical_genetics|5": { + "hashes": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "83d7d50bc2ebab43", + "hash_cont_tokens": "ce26aac83e938006" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-miscellaneous|5": { + "hashes": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "57004a232a08258a", + "hash_cont_tokens": "647bcbd68f292558" + }, + "truncated": 0, + "non-truncated": 3132, + "padded": 3132, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_disputes|5": { + "hashes": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "bb9518d436087f70", + "hash_cont_tokens": "6849b7fe56c50dda" + }, + "truncated": 0, + "non-truncated": 1384, + "padded": 1365, + "non-padded": 19, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hashes": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "3edebd0b46a85682", + "hash_cont_tokens": "81585ec455b1e3e5" + }, + "truncated": 0, + "non-truncated": 3580, + "padded": 3580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-nutrition|5": { + "hashes": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "815607301732a13f", + "hash_cont_tokens": "471b68eb20e5d34b" + }, + "truncated": 0, + "non-truncated": 1224, + "padded": 1224, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-philosophy|5": { + "hashes": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "952254859587db3e", + "hash_cont_tokens": "6e39384b9c0a8cc2" + }, + "truncated": 0, + "non-truncated": 1244, + "padded": 1244, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-prehistory|5": { + "hashes": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "1429d150f124f76e", + "hash_cont_tokens": "bfe513578190093f" + }, + "truncated": 0, + "non-truncated": 1296, + "padded": 1296, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_accounting|5": { + "hashes": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "9f8bfa3b87b58a38", + "hash_cont_tokens": "9ce431b67350b312" + }, + "truncated": 0, + "non-truncated": 1128, + "padded": 1128, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_law|5": { + "hashes": { + "hash_examples": "cd9dbc52b3c932d6", + 
"hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "f638aace411a0bd9", + "hash_cont_tokens": "0ff990d9cc38024d" + }, + "truncated": 168, + "non-truncated": 5968, + "padded": 5968, + "non-padded": 168, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_medicine|5": { + "hashes": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "c0f160879d378d4d", + "hash_cont_tokens": "bc3c70e15bc7dce0" + }, + "truncated": 0, + "non-truncated": 1088, + "padded": 1088, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_psychology|5": { + "hashes": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "548450e483004f15", + "hash_cont_tokens": "58464ea26d81f908" + }, + "truncated": 0, + "non-truncated": 2448, + "padded": 2448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-public_relations|5": { + "hashes": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "47f43ebfaa773712", + "hash_cont_tokens": "eaf6a5d3ddd39a12" + }, + "truncated": 0, + "non-truncated": 440, + "padded": 440, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-security_studies|5": { + "hashes": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "0350ab02a3d50c5f", + "hash_cont_tokens": "618fd4f954253134" + }, + "truncated": 0, + "non-truncated": 980, + "padded": 980, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-sociology|5": { + "hashes": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "e010003b38f6d86a", + "hash_cont_tokens": "b4962d9e583b12c0" + }, + "truncated": 0, + "non-truncated": 804, + "padded": 804, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hashes": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "99959731e92e9eb1", + "hash_cont_tokens": "ce26aac83e938006" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-virology|5": { + "hashes": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "841a69043fcd7645", + "hash_cont_tokens": "397a75462a9735e3" + }, + "truncated": 0, + "non-truncated": 664, + "padded": 664, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-world_religions|5": { + "hashes": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "6faa0998b440e497", + "hash_cont_tokens": "de629d1414e01de8" + }, + "truncated": 0, + "non-truncated": 684, + "padded": 684, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|truthfulqa:mc|0": { + "hashes": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "fe347abbeff2a4c1", + "hash_cont_tokens": "df48bc66e06781f2" + }, + "truncated": 0, + "non-truncated": 
9996, + "padded": 9996, + "non-padded": 0, + "effective_few_shots": 0.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "d84d18e9a963753d", + "hash_full_prompts": "12b540783521a8e6", + "hash_input_tokens": "3f79e8edf26f0efd", + "hash_cont_tokens": "4a4fb8e86dc2fb9d" + }, + "total_evaluation_time_secondes": "2105.2144939899445", + "truncated": 1644, + "non-truncated": 109375, + "padded": 109332, + "non-padded": 1687, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/harborwater/open-llama-3b-everything-v2/results_2023-10-29T00-43-57.732775.json b/eval-results/harborwater/open-llama-3b-everything-v2/results_2023-10-29T00-43-57.732775.json new file mode 100644 index 0000000000000000000000000000000000000000..08815d33b1f5799dada83758de43b7fbf3093526 --- /dev/null +++ b/eval-results/harborwater/open-llama-3b-everything-v2/results_2023-10-29T00-43-57.732775.json @@ -0,0 +1,107 @@ +{ + "config_general": { + "model_name": "harborwater/open-llama-3b-everything-v2", + "model_sha": "1298ae5eb581d628ea03c92df12d30cc26e720ad", + "model_size": "6.4 GB", + "model_dtype": "torch.float16", + "lighteval_sha": "0f318ecf002208468154899217b3ba7c6ae09374", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|drop|3": { + "em": 0.0020973154362416107, + "em_stderr": 0.0004685065030368325, + "f1": 0.0560864093959733, + "f1_stderr": 0.0013597729822813858 + }, + "harness|gsm8k|5": { + "acc": 0.01592115238817286, + "acc_stderr": 0.0034478192723889915 + }, + "harness|winogrande|5": { + "acc": 0.6661404893449092, + "acc_stderr": 0.013254029695143358 + }, + "all": { + "em": 0.0020973154362416107, + "em_stderr": 0.0004685065030368325, + "f1": 0.0560864093959733, + "f1_stderr": 0.0013597729822813858, + "acc": 0.341030820866541, + "acc_stderr": 0.008350924483766176 + } + }, + "versions": { + "harness|drop|3": 1, + "harness|gsm8k|5": 0, + "harness|winogrande|5": 0, + "all": 0 + }, + "config_tasks": { + "harness|drop": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|drop|3": { + "hashes": { + "hash_examples": "1d27416e8324e9a3", + "hash_full_prompts": "a5513ff9a741b385", + "hash_input_tokens": "a65c9eacad86ea52", + "hash_cont_tokens": "94784cffceeab3e1" + }, + "truncated": 980, + "non-truncated": 8556, + "padded": 0, + "non-padded": 9536, + "effective_few_shots": 3.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "bf7d8c6b5e4f7948", + "hash_cont_tokens": "de7676ce9d753ad7" + }, + "truncated": 0, + "non-truncated": 1319, + "padded": 0, + "non-padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "647d8b2cafc100bc", + "hash_cont_tokens": "828897df1f4f08a1" + }, + "truncated": 0, + "non-truncated": 2534, + "padded": 2433, + "non-padded": 101, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "9b4d8993161e637d", + "hash_full_prompts": "08215e527b7e60a5", + "hash_input_tokens": "a65e1c92b9137d17", + "hash_cont_tokens": "b321dd8dc68ae276" + }, + "total_evaluation_time_secondes": 
"9512.847876310349", + "truncated": 980, + "non-truncated": 12409, + "padded": 2433, + "non-padded": 10956, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/harborwater/open-llama-3b-everythingLM-2048/results_2023-10-04T08-05-25.924210.json b/eval-results/harborwater/open-llama-3b-everythingLM-2048/results_2023-10-04T08-05-25.924210.json new file mode 100644 index 0000000000000000000000000000000000000000..1f3bc3cd9e4b66d9c5700dee0f5c7a3424bff7e7 --- /dev/null +++ b/eval-results/harborwater/open-llama-3b-everythingLM-2048/results_2023-10-04T08-05-25.924210.json @@ -0,0 +1,1367 @@ +{ + "config_general": { + "model_name": "harborwater/open-llama-3b-everythingLM-2048", + "model_sha": "1f9e8d48163feb63ed190eaa982f393542a75d30", + "model_size": "6.4 GB", + "model_dtype": "torch.float16", + "lighteval_sha": "0f318ecf002208468154899217b3ba7c6ae09374", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|arc:challenge|25": { + "acc": 0.3856655290102389, + "acc_stderr": 0.01422425097325717, + "acc_norm": 0.4274744027303754, + "acc_norm_stderr": 0.014456862944650649 + }, + "harness|hellaswag|10": { + "acc": 0.5400318661621191, + "acc_stderr": 0.004973762948302805, + "acc_norm": 0.7171878111929895, + "acc_norm_stderr": 0.004494454911844637 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.24, + "acc_stderr": 0.042923469599092816, + "acc_norm": 0.24, + "acc_norm_stderr": 0.042923469599092816 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.25925925925925924, + "acc_stderr": 0.037857144650666544, + "acc_norm": 0.25925925925925924, + "acc_norm_stderr": 0.037857144650666544 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.29605263157894735, + "acc_stderr": 0.03715062154998905, + "acc_norm": 0.29605263157894735, + "acc_norm_stderr": 0.03715062154998905 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.32, + "acc_stderr": 0.04688261722621504, + "acc_norm": 0.32, + "acc_norm_stderr": 0.04688261722621504 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.2981132075471698, + "acc_stderr": 0.028152837942493878, + "acc_norm": 0.2981132075471698, + "acc_norm_stderr": 0.028152837942493878 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.2638888888888889, + "acc_stderr": 0.03685651095897532, + "acc_norm": 0.2638888888888889, + "acc_norm_stderr": 0.03685651095897532 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.23, + "acc_stderr": 0.04229525846816505, + "acc_norm": 0.23, + "acc_norm_stderr": 0.04229525846816505 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.22, + "acc_stderr": 0.04163331998932269, + "acc_norm": 0.22, + "acc_norm_stderr": 0.04163331998932269 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.33, + "acc_stderr": 0.047258156262526045, + "acc_norm": 0.33, + "acc_norm_stderr": 0.047258156262526045 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.23121387283236994, + "acc_stderr": 0.032147373020294696, + "acc_norm": 0.23121387283236994, + "acc_norm_stderr": 0.032147373020294696 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.19607843137254902, + "acc_stderr": 0.03950581861179961, + "acc_norm": 0.19607843137254902, + "acc_norm_stderr": 0.03950581861179961 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.28, + "acc_stderr": 0.04512608598542127, + "acc_norm": 0.28, + "acc_norm_stderr": 
0.04512608598542127 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.3276595744680851, + "acc_stderr": 0.030683020843231008, + "acc_norm": 0.3276595744680851, + "acc_norm_stderr": 0.030683020843231008 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.24561403508771928, + "acc_stderr": 0.04049339297748143, + "acc_norm": 0.24561403508771928, + "acc_norm_stderr": 0.04049339297748143 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.2, + "acc_stderr": 0.0333333333333333, + "acc_norm": 0.2, + "acc_norm_stderr": 0.0333333333333333 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.2777777777777778, + "acc_stderr": 0.023068188848261107, + "acc_norm": 0.2777777777777778, + "acc_norm_stderr": 0.023068188848261107 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.23015873015873015, + "acc_stderr": 0.03764950879790606, + "acc_norm": 0.23015873015873015, + "acc_norm_stderr": 0.03764950879790606 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.31, + "acc_stderr": 0.04648231987117316, + "acc_norm": 0.31, + "acc_norm_stderr": 0.04648231987117316 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.267741935483871, + "acc_stderr": 0.02518900666021238, + "acc_norm": 0.267741935483871, + "acc_norm_stderr": 0.02518900666021238 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.2660098522167488, + "acc_stderr": 0.03108982600293753, + "acc_norm": 0.2660098522167488, + "acc_norm_stderr": 0.03108982600293753 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.3, + "acc_stderr": 0.046056618647183814, + "acc_norm": 0.3, + "acc_norm_stderr": 0.046056618647183814 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.2727272727272727, + "acc_stderr": 0.0347769116216366, + "acc_norm": 0.2727272727272727, + "acc_norm_stderr": 0.0347769116216366 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.2474747474747475, + "acc_stderr": 0.03074630074212451, + "acc_norm": 0.2474747474747475, + "acc_norm_stderr": 0.03074630074212451 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.24870466321243523, + "acc_stderr": 0.0311958408777003, + "acc_norm": 0.24870466321243523, + "acc_norm_stderr": 0.0311958408777003 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.24871794871794872, + "acc_stderr": 0.0219169577092138, + "acc_norm": 0.24871794871794872, + "acc_norm_stderr": 0.0219169577092138 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.23703703703703705, + "acc_stderr": 0.02592887613276611, + "acc_norm": 0.23703703703703705, + "acc_norm_stderr": 0.02592887613276611 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.2815126050420168, + "acc_stderr": 0.029213549414372177, + "acc_norm": 0.2815126050420168, + "acc_norm_stderr": 0.029213549414372177 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.32450331125827814, + "acc_stderr": 0.03822746937658753, + "acc_norm": 0.32450331125827814, + "acc_norm_stderr": 0.03822746937658753 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.24587155963302754, + "acc_stderr": 0.018461940968708446, + "acc_norm": 0.24587155963302754, + "acc_norm_stderr": 0.018461940968708446 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.18981481481481483, + "acc_stderr": 0.026744714834691936, + "acc_norm": 0.18981481481481483, + "acc_norm_stderr": 0.026744714834691936 + }, + 
"harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.23529411764705882, + "acc_stderr": 0.02977177522814565, + "acc_norm": 0.23529411764705882, + "acc_norm_stderr": 0.02977177522814565 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.26582278481012656, + "acc_stderr": 0.028756799629658335, + "acc_norm": 0.26582278481012656, + "acc_norm_stderr": 0.028756799629658335 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.3901345291479821, + "acc_stderr": 0.03273766725459156, + "acc_norm": 0.3901345291479821, + "acc_norm_stderr": 0.03273766725459156 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.24427480916030533, + "acc_stderr": 0.03768335959728745, + "acc_norm": 0.24427480916030533, + "acc_norm_stderr": 0.03768335959728745 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.3305785123966942, + "acc_stderr": 0.04294340845212095, + "acc_norm": 0.3305785123966942, + "acc_norm_stderr": 0.04294340845212095 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.2962962962962963, + "acc_stderr": 0.044143436668549335, + "acc_norm": 0.2962962962962963, + "acc_norm_stderr": 0.044143436668549335 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.25153374233128833, + "acc_stderr": 0.034089978868575295, + "acc_norm": 0.25153374233128833, + "acc_norm_stderr": 0.034089978868575295 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.2857142857142857, + "acc_stderr": 0.04287858751340456, + "acc_norm": 0.2857142857142857, + "acc_norm_stderr": 0.04287858751340456 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.2912621359223301, + "acc_stderr": 0.04498676320572921, + "acc_norm": 0.2912621359223301, + "acc_norm_stderr": 0.04498676320572921 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.2905982905982906, + "acc_stderr": 0.02974504857267406, + "acc_norm": 0.2905982905982906, + "acc_norm_stderr": 0.02974504857267406 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.21, + "acc_stderr": 0.040936018074033256, + "acc_norm": 0.21, + "acc_norm_stderr": 0.040936018074033256 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.30140485312899107, + "acc_stderr": 0.016409091097268787, + "acc_norm": 0.30140485312899107, + "acc_norm_stderr": 0.016409091097268787 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.26878612716763006, + "acc_stderr": 0.023868003262500107, + "acc_norm": 0.26878612716763006, + "acc_norm_stderr": 0.023868003262500107 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.24134078212290502, + "acc_stderr": 0.014310999547961455, + "acc_norm": 0.24134078212290502, + "acc_norm_stderr": 0.014310999547961455 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.2581699346405229, + "acc_stderr": 0.025058503316958154, + "acc_norm": 0.2581699346405229, + "acc_norm_stderr": 0.025058503316958154 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.2797427652733119, + "acc_stderr": 0.025494259350694905, + "acc_norm": 0.2797427652733119, + "acc_norm_stderr": 0.025494259350694905 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.27469135802469136, + "acc_stderr": 0.02483605786829468, + "acc_norm": 0.27469135802469136, + "acc_norm_stderr": 0.02483605786829468 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.2872340425531915, + "acc_stderr": 0.026992199173064356, + "acc_norm": 0.2872340425531915, + "acc_norm_stderr": 0.026992199173064356 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.2457627118644068, + 
"acc_stderr": 0.010996156635142692, + "acc_norm": 0.2457627118644068, + "acc_norm_stderr": 0.010996156635142692 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.19117647058823528, + "acc_stderr": 0.02388688192244036, + "acc_norm": 0.19117647058823528, + "acc_norm_stderr": 0.02388688192244036 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.26633986928104575, + "acc_stderr": 0.017883188134667192, + "acc_norm": 0.26633986928104575, + "acc_norm_stderr": 0.017883188134667192 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.32727272727272727, + "acc_stderr": 0.04494290866252088, + "acc_norm": 0.32727272727272727, + "acc_norm_stderr": 0.04494290866252088 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.3142857142857143, + "acc_stderr": 0.02971932942241747, + "acc_norm": 0.3142857142857143, + "acc_norm_stderr": 0.02971932942241747 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.2736318407960199, + "acc_stderr": 0.031524391865554016, + "acc_norm": 0.2736318407960199, + "acc_norm_stderr": 0.031524391865554016 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.3, + "acc_stderr": 0.046056618647183814, + "acc_norm": 0.3, + "acc_norm_stderr": 0.046056618647183814 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.3253012048192771, + "acc_stderr": 0.03647168523683227, + "acc_norm": 0.3253012048192771, + "acc_norm_stderr": 0.03647168523683227 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.3508771929824561, + "acc_stderr": 0.03660298834049162, + "acc_norm": 0.3508771929824561, + "acc_norm_stderr": 0.03660298834049162 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.22643818849449204, + "mc1_stderr": 0.014651337324602587, + "mc2": 0.3426250755220841, + "mc2_stderr": 0.013487279265594353 + }, + "all": { + "acc": 0.2781212991625057, + "acc_stderr": 0.03240629002364077, + "acc_norm": 0.2818325672769296, + "acc_norm_stderr": 0.032402108734402385, + "mc1": 0.22643818849449204, + "mc1_stderr": 0.014651337324602587, + "mc2": 0.3426250755220841, + "mc2_stderr": 0.013487279265594353 + } + }, + "versions": { + "harness|arc:challenge|25": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + 
"harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "all": 0 + }, + "config_tasks": { + "harness|arc:challenge": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + 
"harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task" + }, + "summary_tasks": { + "harness|arc:challenge|25": { + "hashes": { + "hash_examples": "17b0cae357c0259e", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "7cefb32e2563a8e3", + "hash_cont_tokens": "2e8835aa03b9c2cf" + }, + "truncated": 0, + "non-truncated": 4687, + "padded": 4687, + "non-padded": 0, + "effective_few_shots": 25.0, + "num_truncated_few_shots": 0 + }, + "harness|hellaswag|10": { + "hashes": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "e4a72fc2bbea66ff", + "hash_cont_tokens": "18a48de3edcef462" + }, + "truncated": 0, + "non-truncated": 40168, + "padded": 40144, + "non-padded": 24, + "effective_few_shots": 10.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hashes": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "1430bf2cb1d054e2", + "hash_cont_tokens": "ce26aac83e938006" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-anatomy|5": { + "hashes": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "c4f45f8ebf944893", + "hash_cont_tokens": "1d81fa80e3039a08" + }, + "truncated": 0, + "non-truncated": 540, + "padded": 540, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-astronomy|5": { + "hashes": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "7b6c0659a104d6af", + 
"hash_cont_tokens": "247dc44c6b578728" + }, + "truncated": 0, + "non-truncated": 608, + "padded": 608, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-business_ethics|5": { + "hashes": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "ca33ffee63980ac1", + "hash_cont_tokens": "ce26aac83e938006" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hashes": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "a6aba95384c46b37", + "hash_cont_tokens": "26e3b69d5fb27bb2" + }, + "truncated": 0, + "non-truncated": 1060, + "padded": 1060, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_biology|5": { + "hashes": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "95d92a1a2c158e2c", + "hash_cont_tokens": "bbda31842f3930d5" + }, + "truncated": 0, + "non-truncated": 576, + "padded": 576, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_chemistry|5": { + "hashes": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "70284e3c06933186", + "hash_cont_tokens": "ce26aac83e938006" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_computer_science|5": { + "hashes": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "028608b4301fcfd2", + "hash_cont_tokens": "ce26aac83e938006" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_mathematics|5": { + "hashes": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "02619f96ae20cf1e", + "hash_cont_tokens": "ce26aac83e938006" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_medicine|5": { + "hashes": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "0282a73e02cf4b34", + "hash_cont_tokens": "894854ed7bec57f7" + }, + "truncated": 0, + "non-truncated": 692, + "padded": 692, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_physics|5": { + "hashes": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "5d0425cf2abddd51", + "hash_cont_tokens": "13130ec6de384bbb" + }, + "truncated": 0, + "non-truncated": 408, + "padded": 408, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-computer_security|5": { + "hashes": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "560574f683641143", + "hash_cont_tokens": "ce26aac83e938006" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 
5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hashes": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "dc3987c35bc329e5", + "hash_cont_tokens": "29089b8b7020611e" + }, + "truncated": 0, + "non-truncated": 940, + "padded": 940, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-econometrics|5": { + "hashes": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "be83fdd674b48356", + "hash_cont_tokens": "efc596dfa1a1f073" + }, + "truncated": 0, + "non-truncated": 456, + "padded": 456, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hashes": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "00155bf1a1a1ebc7", + "hash_cont_tokens": "70817a7ac9f44af2" + }, + "truncated": 0, + "non-truncated": 580, + "padded": 580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hashes": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "ce05b52b00498cf6", + "hash_cont_tokens": "937cd53d06cc6e16" + }, + "truncated": 0, + "non-truncated": 1512, + "padded": 1512, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-formal_logic|5": { + "hashes": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "728bd41242158358", + "hash_cont_tokens": "eec972abe0fc0f5a" + }, + "truncated": 0, + "non-truncated": 504, + "padded": 504, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-global_facts|5": { + "hashes": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "190511206bf21530", + "hash_cont_tokens": "ce26aac83e938006" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_biology|5": { + "hashes": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "2bc219567947ac68", + "hash_cont_tokens": "94971ccfe8e59c25" + }, + "truncated": 0, + "non-truncated": 1240, + "padded": 1240, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hashes": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "8477b93b8643d23f", + "hash_cont_tokens": "a78e38b59778a04c" + }, + "truncated": 0, + "non-truncated": 812, + "padded": 812, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hashes": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "0e15ea7b43890b3c", + "hash_cont_tokens": "ce26aac83e938006" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hashes": { + "hash_examples": 
"11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "142b719c7d7d4fe0", + "hash_cont_tokens": "91dc522e4e4e91c3" + }, + "truncated": 660, + "non-truncated": 0, + "padded": 0, + "non-padded": 660, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_geography|5": { + "hashes": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "4bf76efe7796945e", + "hash_cont_tokens": "f275c901b3d285f9" + }, + "truncated": 0, + "non-truncated": 792, + "padded": 792, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hashes": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "e3a453e5fb044f52", + "hash_cont_tokens": "85eb58f423437cce" + }, + "truncated": 0, + "non-truncated": 772, + "padded": 772, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hashes": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "f47a1c2b0c018aff", + "hash_cont_tokens": "39a93706184f896b" + }, + "truncated": 0, + "non-truncated": 1560, + "padded": 1560, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hashes": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "35bc9ee85a563c15", + "hash_cont_tokens": "d41065d20b689af3" + }, + "truncated": 0, + "non-truncated": 1080, + "padded": 1080, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hashes": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "62a083d4ceb83864", + "hash_cont_tokens": "28c1f7c11bf85409" + }, + "truncated": 0, + "non-truncated": 952, + "padded": 952, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_physics|5": { + "hashes": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "cd96d409604783e4", + "hash_cont_tokens": "78c510e6c5d316ac" + }, + "truncated": 0, + "non-truncated": 604, + "padded": 604, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hashes": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "3c716ffc27f83e15", + "hash_cont_tokens": "0ba4ecffc67603c5" + }, + "truncated": 0, + "non-truncated": 2180, + "padded": 2180, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hashes": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "fd8217f7edf722f8", + "hash_cont_tokens": "4a0339e9ad3efa6d" + }, + "truncated": 0, + "non-truncated": 864, + "padded": 864, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hashes": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + 
"hash_input_tokens": "a54112084a848a44", + "hash_cont_tokens": "2529d55ec490f81f" + }, + "truncated": 816, + "non-truncated": 0, + "padded": 0, + "non-padded": 816, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hashes": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "89cf33fb840f27be", + "hash_cont_tokens": "21808b54f5df97b2" + }, + "truncated": 0, + "non-truncated": 948, + "padded": 948, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_aging|5": { + "hashes": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "0a2b6ab3ae0e3b7c", + "hash_cont_tokens": "92acdd467ed943e1" + }, + "truncated": 0, + "non-truncated": 892, + "padded": 892, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_sexuality|5": { + "hashes": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "f28777a6fdce1d2b", + "hash_cont_tokens": "a6034ed95a124315" + }, + "truncated": 0, + "non-truncated": 524, + "padded": 524, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-international_law|5": { + "hashes": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "8282921a7a07bd5a", + "hash_cont_tokens": "223fbf3fd106c04b" + }, + "truncated": 0, + "non-truncated": 484, + "padded": 484, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-jurisprudence|5": { + "hashes": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "3aa62568b80ee7ca", + "hash_cont_tokens": "7c8e30f486ff156a" + }, + "truncated": 0, + "non-truncated": 432, + "padded": 432, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hashes": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "731b1d04f2da3d9a", + "hash_cont_tokens": "b4cc4a8d31bbaa03" + }, + "truncated": 0, + "non-truncated": 652, + "padded": 652, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-machine_learning|5": { + "hashes": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "96e1af14c8358ac2", + "hash_cont_tokens": "7f0e1289ec188e82" + }, + "truncated": 0, + "non-truncated": 448, + "padded": 448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-management|5": { + "hashes": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "bc2e4bf4e7cf5c39", + "hash_cont_tokens": "66b726b356a02feb" + }, + "truncated": 0, + "non-truncated": 412, + "padded": 412, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-marketing|5": { + "hashes": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "abed130d5c3867a4", + "hash_cont_tokens": "f08457005b652d25" + }, + "truncated": 0, + "non-truncated": 936, + "padded": 936, + "non-padded": 0, + 
"effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-medical_genetics|5": { + "hashes": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "83d7d50bc2ebab43", + "hash_cont_tokens": "ce26aac83e938006" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-miscellaneous|5": { + "hashes": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "57004a232a08258a", + "hash_cont_tokens": "647bcbd68f292558" + }, + "truncated": 0, + "non-truncated": 3132, + "padded": 3132, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_disputes|5": { + "hashes": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "bb9518d436087f70", + "hash_cont_tokens": "6849b7fe56c50dda" + }, + "truncated": 0, + "non-truncated": 1384, + "padded": 1365, + "non-padded": 19, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hashes": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "3edebd0b46a85682", + "hash_cont_tokens": "81585ec455b1e3e5" + }, + "truncated": 0, + "non-truncated": 3580, + "padded": 3580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-nutrition|5": { + "hashes": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "815607301732a13f", + "hash_cont_tokens": "471b68eb20e5d34b" + }, + "truncated": 0, + "non-truncated": 1224, + "padded": 1224, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-philosophy|5": { + "hashes": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "952254859587db3e", + "hash_cont_tokens": "6e39384b9c0a8cc2" + }, + "truncated": 0, + "non-truncated": 1244, + "padded": 1244, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-prehistory|5": { + "hashes": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "1429d150f124f76e", + "hash_cont_tokens": "bfe513578190093f" + }, + "truncated": 0, + "non-truncated": 1296, + "padded": 1296, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_accounting|5": { + "hashes": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "9f8bfa3b87b58a38", + "hash_cont_tokens": "9ce431b67350b312" + }, + "truncated": 0, + "non-truncated": 1128, + "padded": 1128, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_law|5": { + "hashes": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "f638aace411a0bd9", + "hash_cont_tokens": "0ff990d9cc38024d" + }, + "truncated": 168, + "non-truncated": 5968, + "padded": 5968, + "non-padded": 168, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_medicine|5": { + "hashes": { + "hash_examples": 
"b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "c0f160879d378d4d", + "hash_cont_tokens": "bc3c70e15bc7dce0" + }, + "truncated": 0, + "non-truncated": 1088, + "padded": 1088, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_psychology|5": { + "hashes": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "548450e483004f15", + "hash_cont_tokens": "58464ea26d81f908" + }, + "truncated": 0, + "non-truncated": 2448, + "padded": 2448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-public_relations|5": { + "hashes": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "47f43ebfaa773712", + "hash_cont_tokens": "eaf6a5d3ddd39a12" + }, + "truncated": 0, + "non-truncated": 440, + "padded": 440, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-security_studies|5": { + "hashes": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "0350ab02a3d50c5f", + "hash_cont_tokens": "618fd4f954253134" + }, + "truncated": 0, + "non-truncated": 980, + "padded": 980, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-sociology|5": { + "hashes": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "e010003b38f6d86a", + "hash_cont_tokens": "b4962d9e583b12c0" + }, + "truncated": 0, + "non-truncated": 804, + "padded": 804, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hashes": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "99959731e92e9eb1", + "hash_cont_tokens": "ce26aac83e938006" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-virology|5": { + "hashes": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "841a69043fcd7645", + "hash_cont_tokens": "397a75462a9735e3" + }, + "truncated": 0, + "non-truncated": 664, + "padded": 664, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-world_religions|5": { + "hashes": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "6faa0998b440e497", + "hash_cont_tokens": "de629d1414e01de8" + }, + "truncated": 0, + "non-truncated": 684, + "padded": 684, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|truthfulqa:mc|0": { + "hashes": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "fe347abbeff2a4c1", + "hash_cont_tokens": "df48bc66e06781f2" + }, + "truncated": 0, + "non-truncated": 9996, + "padded": 9996, + "non-padded": 0, + "effective_few_shots": 0.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "d84d18e9a963753d", + "hash_full_prompts": "12b540783521a8e6", + "hash_input_tokens": "3f79e8edf26f0efd", + "hash_cont_tokens": "4a4fb8e86dc2fb9d" + }, + "total_evaluation_time_secondes": 
"2137.769516468048", + "truncated": 1644, + "non-truncated": 109375, + "padded": 109332, + "non-padded": 1687, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/harborwater/open-llama-3b-everythingLM-2048/results_2023-10-24T01-01-11.414021.json b/eval-results/harborwater/open-llama-3b-everythingLM-2048/results_2023-10-24T01-01-11.414021.json new file mode 100644 index 0000000000000000000000000000000000000000..1a4981b246b5d1bc5de2ac0e8a1873a7fd64d07e --- /dev/null +++ b/eval-results/harborwater/open-llama-3b-everythingLM-2048/results_2023-10-24T01-01-11.414021.json @@ -0,0 +1,107 @@ +{ + "config_general": { + "model_name": "harborwater/open-llama-3b-everythingLM-2048", + "model_sha": "f033291d1e2da897d39742b1ef19dc5148d2f1cd", + "model_size": "6.4 GB", + "model_dtype": "torch.float16", + "lighteval_sha": "0f318ecf002208468154899217b3ba7c6ae09374", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|drop|3": { + "em": 0.0014681208053691276, + "em_stderr": 0.00039210421902986076, + "f1": 0.053537122483221615, + "f1_stderr": 0.0012879336042021898 + }, + "harness|gsm8k|5": { + "acc": 0.015163002274450341, + "acc_stderr": 0.003366022949726365 + }, + "harness|winogrande|5": { + "acc": 0.6629834254143646, + "acc_stderr": 0.01328495576939525 + }, + "all": { + "em": 0.0014681208053691276, + "em_stderr": 0.00039210421902986076, + "f1": 0.053537122483221615, + "f1_stderr": 0.0012879336042021898, + "acc": 0.3390732138444075, + "acc_stderr": 0.008325489359560807 + } + }, + "versions": { + "harness|drop|3": 1, + "harness|gsm8k|5": 0, + "harness|winogrande|5": 0, + "all": 0 + }, + "config_tasks": { + "harness|drop": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|drop|3": { + "hashes": { + "hash_examples": "1d27416e8324e9a3", + "hash_full_prompts": "a5513ff9a741b385", + "hash_input_tokens": "a65c9eacad86ea52", + "hash_cont_tokens": "780a172d939b658a" + }, + "truncated": 980, + "non-truncated": 8556, + "padded": 0, + "non-padded": 9536, + "effective_few_shots": 3.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "bf7d8c6b5e4f7948", + "hash_cont_tokens": "a2bbc8c442849176" + }, + "truncated": 0, + "non-truncated": 1319, + "padded": 0, + "non-padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "647d8b2cafc100bc", + "hash_cont_tokens": "828897df1f4f08a1" + }, + "truncated": 0, + "non-truncated": 2534, + "padded": 2433, + "non-padded": 101, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "9b4d8993161e637d", + "hash_full_prompts": "08215e527b7e60a5", + "hash_input_tokens": "a65e1c92b9137d17", + "hash_cont_tokens": "6b1934bf817ba18d" + }, + "total_evaluation_time_secondes": "9077.927297830582", + "truncated": 980, + "non-truncated": 12409, + "padded": 2433, + "non-padded": 10956, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/harborwater/open-llama-3b-v2-wizard-evol-instuct-v2-196k/results_2023-09-13T12-33-59.724911.json 
b/eval-results/harborwater/open-llama-3b-v2-wizard-evol-instuct-v2-196k/results_2023-09-13T12-33-59.724911.json new file mode 100644 index 0000000000000000000000000000000000000000..819faf5c68f872056448705d3a731af76ca46f72 --- /dev/null +++ b/eval-results/harborwater/open-llama-3b-v2-wizard-evol-instuct-v2-196k/results_2023-09-13T12-33-59.724911.json @@ -0,0 +1,1367 @@ +{ + "config_general": { + "model_name": "harborwater/open-llama-3b-v2-wizard-evol-instuct-v2-196k", + "model_sha": "4da0c661e6df1235c9997b996c8e395b87248406", + "model_size": "3.4 GB", + "model_dtype": "8bit", + "lighteval_sha": "ff467795ccc45b291b69333c263d5f16abd1fcd9", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|arc:challenge|25": { + "acc": 0.38310580204778155, + "acc_stderr": 0.014206472661672883, + "acc_norm": 0.4121160409556314, + "acc_norm_stderr": 0.014383915302225403 + }, + "harness|hellaswag|10": { + "acc": 0.5536745668193587, + "acc_stderr": 0.004960947388535103, + "acc_norm": 0.7288388767177854, + "acc_norm_stderr": 0.004436505187567003 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.26, + "acc_stderr": 0.04408440022768081, + "acc_norm": 0.26, + "acc_norm_stderr": 0.04408440022768081 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.21481481481481482, + "acc_stderr": 0.035478541985608236, + "acc_norm": 0.21481481481481482, + "acc_norm_stderr": 0.035478541985608236 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.20394736842105263, + "acc_stderr": 0.03279000406310051, + "acc_norm": 0.20394736842105263, + "acc_norm_stderr": 0.03279000406310051 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.36, + "acc_stderr": 0.04824181513244218, + "acc_norm": 0.36, + "acc_norm_stderr": 0.04824181513244218 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.25660377358490566, + "acc_stderr": 0.026880647889051968, + "acc_norm": 0.25660377358490566, + "acc_norm_stderr": 0.026880647889051968 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.2708333333333333, + "acc_stderr": 0.037161774375660185, + "acc_norm": 0.2708333333333333, + "acc_norm_stderr": 0.037161774375660185 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.21, + "acc_stderr": 0.040936018074033256, + "acc_norm": 0.21, + "acc_norm_stderr": 0.040936018074033256 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.17, + "acc_stderr": 0.0377525168068637, + "acc_norm": 0.17, + "acc_norm_stderr": 0.0377525168068637 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.27, + "acc_stderr": 0.044619604333847394, + "acc_norm": 0.27, + "acc_norm_stderr": 0.044619604333847394 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.17341040462427745, + "acc_stderr": 0.02886810787497064, + "acc_norm": 0.17341040462427745, + "acc_norm_stderr": 0.02886810787497064 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.21568627450980393, + "acc_stderr": 0.04092563958237656, + "acc_norm": 0.21568627450980393, + "acc_norm_stderr": 0.04092563958237656 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.24, + "acc_stderr": 0.042923469599092816, + "acc_norm": 0.24, + "acc_norm_stderr": 0.042923469599092816 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.3276595744680851, + "acc_stderr": 0.030683020843231004, + "acc_norm": 0.3276595744680851, + "acc_norm_stderr": 0.030683020843231004 + }, + 
"harness|hendrycksTest-econometrics|5": { + "acc": 0.24561403508771928, + "acc_stderr": 0.04049339297748143, + "acc_norm": 0.24561403508771928, + "acc_norm_stderr": 0.04049339297748143 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.1793103448275862, + "acc_stderr": 0.03196766433373187, + "acc_norm": 0.1793103448275862, + "acc_norm_stderr": 0.03196766433373187 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.24603174603174602, + "acc_stderr": 0.022182037202948365, + "acc_norm": 0.24603174603174602, + "acc_norm_stderr": 0.022182037202948365 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.2698412698412698, + "acc_stderr": 0.03970158273235173, + "acc_norm": 0.2698412698412698, + "acc_norm_stderr": 0.03970158273235173 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.31, + "acc_stderr": 0.04648231987117316, + "acc_norm": 0.31, + "acc_norm_stderr": 0.04648231987117316 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.2129032258064516, + "acc_stderr": 0.02328766512726854, + "acc_norm": 0.2129032258064516, + "acc_norm_stderr": 0.02328766512726854 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.26108374384236455, + "acc_stderr": 0.030903796952114482, + "acc_norm": 0.26108374384236455, + "acc_norm_stderr": 0.030903796952114482 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.25, + "acc_stderr": 0.04351941398892446, + "acc_norm": 0.25, + "acc_norm_stderr": 0.04351941398892446 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.24242424242424243, + "acc_stderr": 0.03346409881055953, + "acc_norm": 0.24242424242424243, + "acc_norm_stderr": 0.03346409881055953 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.21212121212121213, + "acc_stderr": 0.029126522834586818, + "acc_norm": 0.21212121212121213, + "acc_norm_stderr": 0.029126522834586818 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.22279792746113988, + "acc_stderr": 0.03003114797764154, + "acc_norm": 0.22279792746113988, + "acc_norm_stderr": 0.03003114797764154 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.21025641025641026, + "acc_stderr": 0.020660597485026924, + "acc_norm": 0.21025641025641026, + "acc_norm_stderr": 0.020660597485026924 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.2518518518518518, + "acc_stderr": 0.026466117538959912, + "acc_norm": 0.2518518518518518, + "acc_norm_stderr": 0.026466117538959912 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.24789915966386555, + "acc_stderr": 0.028047967224176892, + "acc_norm": 0.24789915966386555, + "acc_norm_stderr": 0.028047967224176892 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.24503311258278146, + "acc_stderr": 0.035118075718047245, + "acc_norm": 0.24503311258278146, + "acc_norm_stderr": 0.035118075718047245 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.23669724770642203, + "acc_stderr": 0.01822407811729908, + "acc_norm": 0.23669724770642203, + "acc_norm_stderr": 0.01822407811729908 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.1574074074074074, + "acc_stderr": 0.02483717351824239, + "acc_norm": 0.1574074074074074, + "acc_norm_stderr": 0.02483717351824239 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.2647058823529412, + "acc_stderr": 0.030964517926923403, + "acc_norm": 0.2647058823529412, + "acc_norm_stderr": 
0.030964517926923403 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.28270042194092826, + "acc_stderr": 0.029312814153955927, + "acc_norm": 0.28270042194092826, + "acc_norm_stderr": 0.029312814153955927 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.38565022421524664, + "acc_stderr": 0.03266842214289202, + "acc_norm": 0.38565022421524664, + "acc_norm_stderr": 0.03266842214289202 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.22900763358778625, + "acc_stderr": 0.036853466317118506, + "acc_norm": 0.22900763358778625, + "acc_norm_stderr": 0.036853466317118506 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.2066115702479339, + "acc_stderr": 0.03695980128098825, + "acc_norm": 0.2066115702479339, + "acc_norm_stderr": 0.03695980128098825 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.2777777777777778, + "acc_stderr": 0.043300437496507437, + "acc_norm": 0.2777777777777778, + "acc_norm_stderr": 0.043300437496507437 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.2331288343558282, + "acc_stderr": 0.033220157957767414, + "acc_norm": 0.2331288343558282, + "acc_norm_stderr": 0.033220157957767414 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.25, + "acc_stderr": 0.04109974682633932, + "acc_norm": 0.25, + "acc_norm_stderr": 0.04109974682633932 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.2524271844660194, + "acc_stderr": 0.04301250399690875, + "acc_norm": 0.2524271844660194, + "acc_norm_stderr": 0.04301250399690875 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.27350427350427353, + "acc_stderr": 0.02920254015343116, + "acc_norm": 0.27350427350427353, + "acc_norm_stderr": 0.02920254015343116 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.26, + "acc_stderr": 0.044084400227680794, + "acc_norm": 0.26, + "acc_norm_stderr": 0.044084400227680794 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.26053639846743293, + "acc_stderr": 0.015696008563807092, + "acc_norm": 0.26053639846743293, + "acc_norm_stderr": 0.015696008563807092 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.24855491329479767, + "acc_stderr": 0.023267528432100174, + "acc_norm": 0.24855491329479767, + "acc_norm_stderr": 0.023267528432100174 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.2435754189944134, + "acc_stderr": 0.01435591196476786, + "acc_norm": 0.2435754189944134, + "acc_norm_stderr": 0.01435591196476786 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.23202614379084968, + "acc_stderr": 0.024170840879341016, + "acc_norm": 0.23202614379084968, + "acc_norm_stderr": 0.024170840879341016 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.2733118971061093, + "acc_stderr": 0.02531176597542612, + "acc_norm": 0.2733118971061093, + "acc_norm_stderr": 0.02531176597542612 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.2716049382716049, + "acc_stderr": 0.024748624490537365, + "acc_norm": 0.2716049382716049, + "acc_norm_stderr": 0.024748624490537365 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.3049645390070922, + "acc_stderr": 0.027464708442022145, + "acc_norm": 0.3049645390070922, + "acc_norm_stderr": 0.027464708442022145 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.24119947848761408, + "acc_stderr": 0.010926496102034965, + "acc_norm": 0.24119947848761408, + "acc_norm_stderr": 0.010926496102034965 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.20588235294117646, 
+ "acc_stderr": 0.024562204314142317, + "acc_norm": 0.20588235294117646, + "acc_norm_stderr": 0.024562204314142317 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.2581699346405229, + "acc_stderr": 0.017704531653250075, + "acc_norm": 0.2581699346405229, + "acc_norm_stderr": 0.017704531653250075 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.36363636363636365, + "acc_stderr": 0.04607582090719976, + "acc_norm": 0.36363636363636365, + "acc_norm_stderr": 0.04607582090719976 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.23673469387755103, + "acc_stderr": 0.027212835884073142, + "acc_norm": 0.23673469387755103, + "acc_norm_stderr": 0.027212835884073142 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.2537313432835821, + "acc_stderr": 0.03076944496729602, + "acc_norm": 0.2537313432835821, + "acc_norm_stderr": 0.03076944496729602 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.34, + "acc_stderr": 0.04760952285695235, + "acc_norm": 0.34, + "acc_norm_stderr": 0.04760952285695235 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.3313253012048193, + "acc_stderr": 0.036643147772880864, + "acc_norm": 0.3313253012048193, + "acc_norm_stderr": 0.036643147772880864 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.3157894736842105, + "acc_stderr": 0.035650796707083106, + "acc_norm": 0.3157894736842105, + "acc_norm_stderr": 0.035650796707083106 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.2558139534883721, + "mc1_stderr": 0.01527417621928336, + "mc2": 0.38869602803116626, + "mc2_stderr": 0.014079845925819514 + }, + "all": { + "acc": 0.26114518377411455, + "acc_stderr": 0.031658909010883536, + "acc_norm": 0.26460576934710234, + "acc_norm_stderr": 0.03165302766240192, + "mc1": 0.2558139534883721, + "mc1_stderr": 0.01527417621928336, + "mc2": 0.38869602803116626, + "mc2_stderr": 0.014079845925819514 + } + }, + "versions": { + "harness|arc:challenge|25": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + 
"harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "all": 0 + }, + "config_tasks": { + "harness|arc:challenge": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness 
task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task" + }, + "summary_tasks": { + "harness|arc:challenge|25": { + "hashes": { + "hash_examples": "17b0cae357c0259e", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "7cefb32e2563a8e3", + "hash_cont_tokens": "2e8835aa03b9c2cf" + }, + "truncated": 0, + "non-truncated": 4687, + "padded": 4687, + "non-padded": 0, + "effective_few_shots": 25.0, + "num_truncated_few_shots": 0 + }, + "harness|hellaswag|10": { + "hashes": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "e4a72fc2bbea66ff", + "hash_cont_tokens": "18a48de3edcef462" + }, + "truncated": 0, + "non-truncated": 40168, + "padded": 40144, + "non-padded": 24, + "effective_few_shots": 10.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hashes": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "1430bf2cb1d054e2", + "hash_cont_tokens": "ce26aac83e938006" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-anatomy|5": { + "hashes": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "c4f45f8ebf944893", + "hash_cont_tokens": "1d81fa80e3039a08" + }, + "truncated": 0, + "non-truncated": 540, + "padded": 540, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-astronomy|5": { + "hashes": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "7b6c0659a104d6af", + "hash_cont_tokens": "247dc44c6b578728" + }, + "truncated": 0, + "non-truncated": 608, + "padded": 608, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + 
"harness|hendrycksTest-business_ethics|5": { + "hashes": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "ca33ffee63980ac1", + "hash_cont_tokens": "ce26aac83e938006" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hashes": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "a6aba95384c46b37", + "hash_cont_tokens": "26e3b69d5fb27bb2" + }, + "truncated": 0, + "non-truncated": 1060, + "padded": 1060, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_biology|5": { + "hashes": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "95d92a1a2c158e2c", + "hash_cont_tokens": "bbda31842f3930d5" + }, + "truncated": 0, + "non-truncated": 576, + "padded": 576, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_chemistry|5": { + "hashes": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "70284e3c06933186", + "hash_cont_tokens": "ce26aac83e938006" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_computer_science|5": { + "hashes": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "028608b4301fcfd2", + "hash_cont_tokens": "ce26aac83e938006" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_mathematics|5": { + "hashes": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "02619f96ae20cf1e", + "hash_cont_tokens": "ce26aac83e938006" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_medicine|5": { + "hashes": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "0282a73e02cf4b34", + "hash_cont_tokens": "894854ed7bec57f7" + }, + "truncated": 0, + "non-truncated": 692, + "padded": 692, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_physics|5": { + "hashes": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "5d0425cf2abddd51", + "hash_cont_tokens": "13130ec6de384bbb" + }, + "truncated": 0, + "non-truncated": 408, + "padded": 408, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-computer_security|5": { + "hashes": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "560574f683641143", + "hash_cont_tokens": "ce26aac83e938006" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hashes": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + 
"hash_input_tokens": "dc3987c35bc329e5", + "hash_cont_tokens": "29089b8b7020611e" + }, + "truncated": 0, + "non-truncated": 940, + "padded": 940, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-econometrics|5": { + "hashes": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "be83fdd674b48356", + "hash_cont_tokens": "efc596dfa1a1f073" + }, + "truncated": 0, + "non-truncated": 456, + "padded": 456, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hashes": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "00155bf1a1a1ebc7", + "hash_cont_tokens": "70817a7ac9f44af2" + }, + "truncated": 0, + "non-truncated": 580, + "padded": 580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hashes": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "ce05b52b00498cf6", + "hash_cont_tokens": "937cd53d06cc6e16" + }, + "truncated": 0, + "non-truncated": 1512, + "padded": 1512, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-formal_logic|5": { + "hashes": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "728bd41242158358", + "hash_cont_tokens": "eec972abe0fc0f5a" + }, + "truncated": 0, + "non-truncated": 504, + "padded": 504, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-global_facts|5": { + "hashes": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "190511206bf21530", + "hash_cont_tokens": "ce26aac83e938006" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_biology|5": { + "hashes": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "2bc219567947ac68", + "hash_cont_tokens": "94971ccfe8e59c25" + }, + "truncated": 0, + "non-truncated": 1240, + "padded": 1240, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hashes": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "8477b93b8643d23f", + "hash_cont_tokens": "a78e38b59778a04c" + }, + "truncated": 0, + "non-truncated": 812, + "padded": 812, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hashes": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "0e15ea7b43890b3c", + "hash_cont_tokens": "ce26aac83e938006" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hashes": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "142b719c7d7d4fe0", + "hash_cont_tokens": "91dc522e4e4e91c3" + }, + "truncated": 660, + "non-truncated": 
0, + "padded": 0, + "non-padded": 660, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_geography|5": { + "hashes": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "4bf76efe7796945e", + "hash_cont_tokens": "f275c901b3d285f9" + }, + "truncated": 0, + "non-truncated": 792, + "padded": 792, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hashes": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "e3a453e5fb044f52", + "hash_cont_tokens": "85eb58f423437cce" + }, + "truncated": 0, + "non-truncated": 772, + "padded": 772, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hashes": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "f47a1c2b0c018aff", + "hash_cont_tokens": "39a93706184f896b" + }, + "truncated": 0, + "non-truncated": 1560, + "padded": 1560, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hashes": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "35bc9ee85a563c15", + "hash_cont_tokens": "d41065d20b689af3" + }, + "truncated": 0, + "non-truncated": 1080, + "padded": 1080, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hashes": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "62a083d4ceb83864", + "hash_cont_tokens": "28c1f7c11bf85409" + }, + "truncated": 0, + "non-truncated": 952, + "padded": 952, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_physics|5": { + "hashes": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "cd96d409604783e4", + "hash_cont_tokens": "78c510e6c5d316ac" + }, + "truncated": 0, + "non-truncated": 604, + "padded": 604, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hashes": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "3c716ffc27f83e15", + "hash_cont_tokens": "0ba4ecffc67603c5" + }, + "truncated": 0, + "non-truncated": 2180, + "padded": 2180, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hashes": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "fd8217f7edf722f8", + "hash_cont_tokens": "4a0339e9ad3efa6d" + }, + "truncated": 0, + "non-truncated": 864, + "padded": 864, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hashes": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "a54112084a848a44", + "hash_cont_tokens": "2529d55ec490f81f" + }, + "truncated": 816, + "non-truncated": 0, + "padded": 0, + "non-padded": 816, + "effective_few_shots": 5.0, + 
"num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hashes": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "89cf33fb840f27be", + "hash_cont_tokens": "21808b54f5df97b2" + }, + "truncated": 0, + "non-truncated": 948, + "padded": 948, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_aging|5": { + "hashes": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "0a2b6ab3ae0e3b7c", + "hash_cont_tokens": "92acdd467ed943e1" + }, + "truncated": 0, + "non-truncated": 892, + "padded": 892, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_sexuality|5": { + "hashes": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "f28777a6fdce1d2b", + "hash_cont_tokens": "a6034ed95a124315" + }, + "truncated": 0, + "non-truncated": 524, + "padded": 524, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-international_law|5": { + "hashes": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "8282921a7a07bd5a", + "hash_cont_tokens": "223fbf3fd106c04b" + }, + "truncated": 0, + "non-truncated": 484, + "padded": 484, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-jurisprudence|5": { + "hashes": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "3aa62568b80ee7ca", + "hash_cont_tokens": "7c8e30f486ff156a" + }, + "truncated": 0, + "non-truncated": 432, + "padded": 432, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hashes": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "731b1d04f2da3d9a", + "hash_cont_tokens": "b4cc4a8d31bbaa03" + }, + "truncated": 0, + "non-truncated": 652, + "padded": 652, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-machine_learning|5": { + "hashes": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "96e1af14c8358ac2", + "hash_cont_tokens": "7f0e1289ec188e82" + }, + "truncated": 0, + "non-truncated": 448, + "padded": 448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-management|5": { + "hashes": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "bc2e4bf4e7cf5c39", + "hash_cont_tokens": "66b726b356a02feb" + }, + "truncated": 0, + "non-truncated": 412, + "padded": 412, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-marketing|5": { + "hashes": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "abed130d5c3867a4", + "hash_cont_tokens": "f08457005b652d25" + }, + "truncated": 0, + "non-truncated": 936, + "padded": 936, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-medical_genetics|5": { + "hashes": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": 
"202139046daa118f", + "hash_input_tokens": "83d7d50bc2ebab43", + "hash_cont_tokens": "ce26aac83e938006" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-miscellaneous|5": { + "hashes": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "57004a232a08258a", + "hash_cont_tokens": "647bcbd68f292558" + }, + "truncated": 0, + "non-truncated": 3132, + "padded": 3132, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_disputes|5": { + "hashes": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "bb9518d436087f70", + "hash_cont_tokens": "6849b7fe56c50dda" + }, + "truncated": 0, + "non-truncated": 1384, + "padded": 1365, + "non-padded": 19, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hashes": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "3edebd0b46a85682", + "hash_cont_tokens": "81585ec455b1e3e5" + }, + "truncated": 0, + "non-truncated": 3580, + "padded": 3580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-nutrition|5": { + "hashes": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "815607301732a13f", + "hash_cont_tokens": "471b68eb20e5d34b" + }, + "truncated": 0, + "non-truncated": 1224, + "padded": 1224, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-philosophy|5": { + "hashes": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "952254859587db3e", + "hash_cont_tokens": "6e39384b9c0a8cc2" + }, + "truncated": 0, + "non-truncated": 1244, + "padded": 1244, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-prehistory|5": { + "hashes": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "1429d150f124f76e", + "hash_cont_tokens": "bfe513578190093f" + }, + "truncated": 0, + "non-truncated": 1296, + "padded": 1296, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_accounting|5": { + "hashes": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "9f8bfa3b87b58a38", + "hash_cont_tokens": "9ce431b67350b312" + }, + "truncated": 0, + "non-truncated": 1128, + "padded": 1128, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_law|5": { + "hashes": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "f638aace411a0bd9", + "hash_cont_tokens": "0ff990d9cc38024d" + }, + "truncated": 168, + "non-truncated": 5968, + "padded": 5968, + "non-padded": 168, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_medicine|5": { + "hashes": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "c0f160879d378d4d", + "hash_cont_tokens": "bc3c70e15bc7dce0" + }, + "truncated": 0, + "non-truncated": 1088, + 
"padded": 1088, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_psychology|5": { + "hashes": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "548450e483004f15", + "hash_cont_tokens": "58464ea26d81f908" + }, + "truncated": 0, + "non-truncated": 2448, + "padded": 2448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-public_relations|5": { + "hashes": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "47f43ebfaa773712", + "hash_cont_tokens": "eaf6a5d3ddd39a12" + }, + "truncated": 0, + "non-truncated": 440, + "padded": 440, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-security_studies|5": { + "hashes": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "0350ab02a3d50c5f", + "hash_cont_tokens": "618fd4f954253134" + }, + "truncated": 0, + "non-truncated": 980, + "padded": 980, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-sociology|5": { + "hashes": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "e010003b38f6d86a", + "hash_cont_tokens": "b4962d9e583b12c0" + }, + "truncated": 0, + "non-truncated": 804, + "padded": 804, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hashes": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "99959731e92e9eb1", + "hash_cont_tokens": "ce26aac83e938006" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-virology|5": { + "hashes": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "841a69043fcd7645", + "hash_cont_tokens": "397a75462a9735e3" + }, + "truncated": 0, + "non-truncated": 664, + "padded": 664, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-world_religions|5": { + "hashes": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "6faa0998b440e497", + "hash_cont_tokens": "de629d1414e01de8" + }, + "truncated": 0, + "non-truncated": 684, + "padded": 684, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|truthfulqa:mc|0": { + "hashes": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "fe347abbeff2a4c1", + "hash_cont_tokens": "df48bc66e06781f2" + }, + "truncated": 0, + "non-truncated": 9996, + "padded": 9996, + "non-padded": 0, + "effective_few_shots": 0.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "d84d18e9a963753d", + "hash_full_prompts": "12b540783521a8e6", + "hash_input_tokens": "3f79e8edf26f0efd", + "hash_cont_tokens": "4a4fb8e86dc2fb9d" + }, + "total_evaluation_time_secondes": "3889.953780889511", + "truncated": 1644, + "non-truncated": 109375, + "padded": 109332, + "non-padded": 1687, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git 
a/eval-results/harborwater/open-llama-3b-v2-wizard-evol-instuct-v2-196k/results_2023-09-13T15-10-23.173150.json b/eval-results/harborwater/open-llama-3b-v2-wizard-evol-instuct-v2-196k/results_2023-09-13T15-10-23.173150.json new file mode 100644 index 0000000000000000000000000000000000000000..0e1413c2ce19ad5251286424c4022236139729b4 --- /dev/null +++ b/eval-results/harborwater/open-llama-3b-v2-wizard-evol-instuct-v2-196k/results_2023-09-13T15-10-23.173150.json @@ -0,0 +1,1367 @@ +{ + "config_general": { + "model_name": "harborwater/open-llama-3b-v2-wizard-evol-instuct-v2-196k", + "model_sha": "4da0c661e6df1235c9997b996c8e395b87248406", + "model_size": "6.4 GB", + "model_dtype": "torch.float16", + "lighteval_sha": "ff467795ccc45b291b69333c263d5f16abd1fcd9", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|arc:challenge|25": { + "acc": 0.39078498293515357, + "acc_stderr": 0.01425856388051378, + "acc_norm": 0.4180887372013652, + "acc_norm_stderr": 0.014413988396996077 + }, + "harness|hellaswag|10": { + "acc": 0.552778331009759, + "acc_stderr": 0.004961904949171394, + "acc_norm": 0.7301334395538738, + "acc_norm_stderr": 0.00442983115291468 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.26, + "acc_stderr": 0.04408440022768081, + "acc_norm": 0.26, + "acc_norm_stderr": 0.04408440022768081 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.2222222222222222, + "acc_stderr": 0.0359144408419697, + "acc_norm": 0.2222222222222222, + "acc_norm_stderr": 0.0359144408419697 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.21710526315789475, + "acc_stderr": 0.03355045304882923, + "acc_norm": 0.21710526315789475, + "acc_norm_stderr": 0.03355045304882923 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.38, + "acc_stderr": 0.04878317312145632, + "acc_norm": 0.38, + "acc_norm_stderr": 0.04878317312145632 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.2943396226415094, + "acc_stderr": 0.028049186315695248, + "acc_norm": 0.2943396226415094, + "acc_norm_stderr": 0.028049186315695248 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.2569444444444444, + "acc_stderr": 0.03653946969442099, + "acc_norm": 0.2569444444444444, + "acc_norm_stderr": 0.03653946969442099 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.22, + "acc_stderr": 0.041633319989322695, + "acc_norm": 0.22, + "acc_norm_stderr": 0.041633319989322695 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.2, + "acc_stderr": 0.04020151261036846, + "acc_norm": 0.2, + "acc_norm_stderr": 0.04020151261036846 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.33, + "acc_stderr": 0.047258156262526045, + "acc_norm": 0.33, + "acc_norm_stderr": 0.047258156262526045 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.2138728323699422, + "acc_stderr": 0.03126511206173043, + "acc_norm": 0.2138728323699422, + "acc_norm_stderr": 0.03126511206173043 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.21568627450980393, + "acc_stderr": 0.04092563958237656, + "acc_norm": 0.21568627450980393, + "acc_norm_stderr": 0.04092563958237656 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.29, + "acc_stderr": 0.045604802157206845, + "acc_norm": 0.29, + "acc_norm_stderr": 0.045604802157206845 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.33617021276595743, + "acc_stderr": 0.030881618520676942, + 
"acc_norm": 0.33617021276595743, + "acc_norm_stderr": 0.030881618520676942 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.23684210526315788, + "acc_stderr": 0.03999423879281333, + "acc_norm": 0.23684210526315788, + "acc_norm_stderr": 0.03999423879281333 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.19310344827586207, + "acc_stderr": 0.032894455221273995, + "acc_norm": 0.19310344827586207, + "acc_norm_stderr": 0.032894455221273995 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.23809523809523808, + "acc_stderr": 0.021935878081184756, + "acc_norm": 0.23809523809523808, + "acc_norm_stderr": 0.021935878081184756 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.2777777777777778, + "acc_stderr": 0.040061680838488746, + "acc_norm": 0.2777777777777778, + "acc_norm_stderr": 0.040061680838488746 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.31, + "acc_stderr": 0.04648231987117316, + "acc_norm": 0.31, + "acc_norm_stderr": 0.04648231987117316 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.23548387096774193, + "acc_stderr": 0.02413763242933771, + "acc_norm": 0.23548387096774193, + "acc_norm_stderr": 0.02413763242933771 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.270935960591133, + "acc_stderr": 0.031270907132977, + "acc_norm": 0.270935960591133, + "acc_norm_stderr": 0.031270907132977 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.27, + "acc_stderr": 0.044619604333847415, + "acc_norm": 0.27, + "acc_norm_stderr": 0.044619604333847415 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.23636363636363636, + "acc_stderr": 0.03317505930009181, + "acc_norm": 0.23636363636363636, + "acc_norm_stderr": 0.03317505930009181 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.2222222222222222, + "acc_stderr": 0.02962022787479047, + "acc_norm": 0.2222222222222222, + "acc_norm_stderr": 0.02962022787479047 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.23316062176165803, + "acc_stderr": 0.03051611137147601, + "acc_norm": 0.23316062176165803, + "acc_norm_stderr": 0.03051611137147601 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.2205128205128205, + "acc_stderr": 0.02102067268082791, + "acc_norm": 0.2205128205128205, + "acc_norm_stderr": 0.02102067268082791 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.2518518518518518, + "acc_stderr": 0.02646611753895991, + "acc_norm": 0.2518518518518518, + "acc_norm_stderr": 0.02646611753895991 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.24789915966386555, + "acc_stderr": 0.028047967224176892, + "acc_norm": 0.24789915966386555, + "acc_norm_stderr": 0.028047967224176892 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.23178807947019867, + "acc_stderr": 0.03445406271987053, + "acc_norm": 0.23178807947019867, + "acc_norm_stderr": 0.03445406271987053 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.23486238532110093, + "acc_stderr": 0.018175110510343585, + "acc_norm": 0.23486238532110093, + "acc_norm_stderr": 0.018175110510343585 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.17592592592592593, + "acc_stderr": 0.02596742095825853, + "acc_norm": 0.17592592592592593, + "acc_norm_stderr": 0.02596742095825853 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.25, + "acc_stderr": 0.03039153369274154, + 
"acc_norm": 0.25, + "acc_norm_stderr": 0.03039153369274154 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.3080168776371308, + "acc_stderr": 0.0300523893356057, + "acc_norm": 0.3080168776371308, + "acc_norm_stderr": 0.0300523893356057 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.40358744394618834, + "acc_stderr": 0.03292802819330313, + "acc_norm": 0.40358744394618834, + "acc_norm_stderr": 0.03292802819330313 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.22137404580152673, + "acc_stderr": 0.036412970813137276, + "acc_norm": 0.22137404580152673, + "acc_norm_stderr": 0.036412970813137276 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.2644628099173554, + "acc_stderr": 0.04026187527591204, + "acc_norm": 0.2644628099173554, + "acc_norm_stderr": 0.04026187527591204 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.28703703703703703, + "acc_stderr": 0.043733130409147614, + "acc_norm": 0.28703703703703703, + "acc_norm_stderr": 0.043733130409147614 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.2392638036809816, + "acc_stderr": 0.033519538795212696, + "acc_norm": 0.2392638036809816, + "acc_norm_stderr": 0.033519538795212696 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.30357142857142855, + "acc_stderr": 0.04364226155841044, + "acc_norm": 0.30357142857142855, + "acc_norm_stderr": 0.04364226155841044 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.2524271844660194, + "acc_stderr": 0.04301250399690875, + "acc_norm": 0.2524271844660194, + "acc_norm_stderr": 0.04301250399690875 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.2863247863247863, + "acc_stderr": 0.02961432369045665, + "acc_norm": 0.2863247863247863, + "acc_norm_stderr": 0.02961432369045665 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.26, + "acc_stderr": 0.044084400227680794, + "acc_norm": 0.26, + "acc_norm_stderr": 0.044084400227680794 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.2822477650063857, + "acc_stderr": 0.016095302969878548, + "acc_norm": 0.2822477650063857, + "acc_norm_stderr": 0.016095302969878548 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.2543352601156069, + "acc_stderr": 0.023445826276545543, + "acc_norm": 0.2543352601156069, + "acc_norm_stderr": 0.023445826276545543 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.2435754189944134, + "acc_stderr": 0.01435591196476786, + "acc_norm": 0.2435754189944134, + "acc_norm_stderr": 0.01435591196476786 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.24183006535947713, + "acc_stderr": 0.024518195641879334, + "acc_norm": 0.24183006535947713, + "acc_norm_stderr": 0.024518195641879334 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.2733118971061093, + "acc_stderr": 0.02531176597542612, + "acc_norm": 0.2733118971061093, + "acc_norm_stderr": 0.02531176597542612 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.2777777777777778, + "acc_stderr": 0.02492200116888633, + "acc_norm": 0.2777777777777778, + "acc_norm_stderr": 0.02492200116888633 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.30851063829787234, + "acc_stderr": 0.027553366165101362, + "acc_norm": 0.30851063829787234, + "acc_norm_stderr": 0.027553366165101362 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.24119947848761408, + "acc_stderr": 0.010926496102034965, + "acc_norm": 0.24119947848761408, + "acc_norm_stderr": 0.010926496102034965 + }, + 
"harness|hendrycksTest-professional_medicine|5": { + "acc": 0.20955882352941177, + "acc_stderr": 0.02472311040767705, + "acc_norm": 0.20955882352941177, + "acc_norm_stderr": 0.02472311040767705 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.2565359477124183, + "acc_stderr": 0.01766784161237899, + "acc_norm": 0.2565359477124183, + "acc_norm_stderr": 0.01766784161237899 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.35454545454545455, + "acc_stderr": 0.04582004841505416, + "acc_norm": 0.35454545454545455, + "acc_norm_stderr": 0.04582004841505416 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.2530612244897959, + "acc_stderr": 0.027833023871399683, + "acc_norm": 0.2530612244897959, + "acc_norm_stderr": 0.027833023871399683 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.23383084577114427, + "acc_stderr": 0.029929415408348398, + "acc_norm": 0.23383084577114427, + "acc_norm_stderr": 0.029929415408348398 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.33, + "acc_stderr": 0.04725815626252604, + "acc_norm": 0.33, + "acc_norm_stderr": 0.04725815626252604 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.3433734939759036, + "acc_stderr": 0.03696584317010601, + "acc_norm": 0.3433734939759036, + "acc_norm_stderr": 0.03696584317010601 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.3216374269005848, + "acc_stderr": 0.03582529442573122, + "acc_norm": 0.3216374269005848, + "acc_norm_stderr": 0.03582529442573122 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.25458996328029376, + "mc1_stderr": 0.015250117079156494, + "mc2": 0.3899306177235812, + "mc2_stderr": 0.014108077614456916 + }, + "all": { + "acc": 0.2706462072131069, + "acc_stderr": 0.0320941656944084, + "acc_norm": 0.27411500149802764, + "acc_norm_stderr": 0.032087781808310525, + "mc1": 0.25458996328029376, + "mc1_stderr": 0.015250117079156494, + "mc2": 0.3899306177235812, + "mc2_stderr": 0.014108077614456916 + } + }, + "versions": { + "harness|arc:challenge|25": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + 
"harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "all": 0 + }, + "config_tasks": { + "harness|arc:challenge": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + 
"harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task" + }, + "summary_tasks": { + "harness|arc:challenge|25": { + "hashes": { + "hash_examples": "17b0cae357c0259e", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "7cefb32e2563a8e3", + "hash_cont_tokens": "2e8835aa03b9c2cf" + }, + "truncated": 0, + "non-truncated": 4687, + "padded": 4687, + "non-padded": 0, + "effective_few_shots": 25.0, + "num_truncated_few_shots": 0 + }, + "harness|hellaswag|10": { + "hashes": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "e4a72fc2bbea66ff", + "hash_cont_tokens": "18a48de3edcef462" + }, + "truncated": 0, + "non-truncated": 40168, + "padded": 40144, + "non-padded": 24, + "effective_few_shots": 10.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hashes": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "1430bf2cb1d054e2", + "hash_cont_tokens": "ce26aac83e938006" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-anatomy|5": { + "hashes": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "c4f45f8ebf944893", + "hash_cont_tokens": "1d81fa80e3039a08" + }, + "truncated": 0, + "non-truncated": 540, + "padded": 540, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-astronomy|5": { + "hashes": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "7b6c0659a104d6af", + "hash_cont_tokens": "247dc44c6b578728" + }, + "truncated": 0, + "non-truncated": 608, + "padded": 608, + "non-padded": 0, + "effective_few_shots": 5.0, + 
"num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-business_ethics|5": { + "hashes": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "ca33ffee63980ac1", + "hash_cont_tokens": "ce26aac83e938006" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hashes": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "a6aba95384c46b37", + "hash_cont_tokens": "26e3b69d5fb27bb2" + }, + "truncated": 0, + "non-truncated": 1060, + "padded": 1060, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_biology|5": { + "hashes": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "95d92a1a2c158e2c", + "hash_cont_tokens": "bbda31842f3930d5" + }, + "truncated": 0, + "non-truncated": 576, + "padded": 576, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_chemistry|5": { + "hashes": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "70284e3c06933186", + "hash_cont_tokens": "ce26aac83e938006" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_computer_science|5": { + "hashes": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "028608b4301fcfd2", + "hash_cont_tokens": "ce26aac83e938006" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_mathematics|5": { + "hashes": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "02619f96ae20cf1e", + "hash_cont_tokens": "ce26aac83e938006" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_medicine|5": { + "hashes": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "0282a73e02cf4b34", + "hash_cont_tokens": "894854ed7bec57f7" + }, + "truncated": 0, + "non-truncated": 692, + "padded": 692, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_physics|5": { + "hashes": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "5d0425cf2abddd51", + "hash_cont_tokens": "13130ec6de384bbb" + }, + "truncated": 0, + "non-truncated": 408, + "padded": 408, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-computer_security|5": { + "hashes": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "560574f683641143", + "hash_cont_tokens": "ce26aac83e938006" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hashes": { + "hash_examples": "8874ece872d2ca4c", + 
"hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "dc3987c35bc329e5", + "hash_cont_tokens": "29089b8b7020611e" + }, + "truncated": 0, + "non-truncated": 940, + "padded": 940, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-econometrics|5": { + "hashes": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "be83fdd674b48356", + "hash_cont_tokens": "efc596dfa1a1f073" + }, + "truncated": 0, + "non-truncated": 456, + "padded": 456, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hashes": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "00155bf1a1a1ebc7", + "hash_cont_tokens": "70817a7ac9f44af2" + }, + "truncated": 0, + "non-truncated": 580, + "padded": 580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hashes": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "ce05b52b00498cf6", + "hash_cont_tokens": "937cd53d06cc6e16" + }, + "truncated": 0, + "non-truncated": 1512, + "padded": 1512, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-formal_logic|5": { + "hashes": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "728bd41242158358", + "hash_cont_tokens": "eec972abe0fc0f5a" + }, + "truncated": 0, + "non-truncated": 504, + "padded": 504, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-global_facts|5": { + "hashes": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "190511206bf21530", + "hash_cont_tokens": "ce26aac83e938006" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_biology|5": { + "hashes": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "2bc219567947ac68", + "hash_cont_tokens": "94971ccfe8e59c25" + }, + "truncated": 0, + "non-truncated": 1240, + "padded": 1240, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hashes": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "8477b93b8643d23f", + "hash_cont_tokens": "a78e38b59778a04c" + }, + "truncated": 0, + "non-truncated": 812, + "padded": 812, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hashes": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "0e15ea7b43890b3c", + "hash_cont_tokens": "ce26aac83e938006" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hashes": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "142b719c7d7d4fe0", + "hash_cont_tokens": "91dc522e4e4e91c3" 
+ }, + "truncated": 660, + "non-truncated": 0, + "padded": 0, + "non-padded": 660, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_geography|5": { + "hashes": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "4bf76efe7796945e", + "hash_cont_tokens": "f275c901b3d285f9" + }, + "truncated": 0, + "non-truncated": 792, + "padded": 792, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hashes": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "e3a453e5fb044f52", + "hash_cont_tokens": "85eb58f423437cce" + }, + "truncated": 0, + "non-truncated": 772, + "padded": 772, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hashes": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "f47a1c2b0c018aff", + "hash_cont_tokens": "39a93706184f896b" + }, + "truncated": 0, + "non-truncated": 1560, + "padded": 1560, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hashes": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "35bc9ee85a563c15", + "hash_cont_tokens": "d41065d20b689af3" + }, + "truncated": 0, + "non-truncated": 1080, + "padded": 1080, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hashes": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "62a083d4ceb83864", + "hash_cont_tokens": "28c1f7c11bf85409" + }, + "truncated": 0, + "non-truncated": 952, + "padded": 952, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_physics|5": { + "hashes": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "cd96d409604783e4", + "hash_cont_tokens": "78c510e6c5d316ac" + }, + "truncated": 0, + "non-truncated": 604, + "padded": 604, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hashes": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "3c716ffc27f83e15", + "hash_cont_tokens": "0ba4ecffc67603c5" + }, + "truncated": 0, + "non-truncated": 2180, + "padded": 2180, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hashes": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "fd8217f7edf722f8", + "hash_cont_tokens": "4a0339e9ad3efa6d" + }, + "truncated": 0, + "non-truncated": 864, + "padded": 864, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hashes": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "a54112084a848a44", + "hash_cont_tokens": "2529d55ec490f81f" + }, + "truncated": 816, + "non-truncated": 0, + "padded": 0, + "non-padded": 816, 
+ "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hashes": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "89cf33fb840f27be", + "hash_cont_tokens": "21808b54f5df97b2" + }, + "truncated": 0, + "non-truncated": 948, + "padded": 948, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_aging|5": { + "hashes": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "0a2b6ab3ae0e3b7c", + "hash_cont_tokens": "92acdd467ed943e1" + }, + "truncated": 0, + "non-truncated": 892, + "padded": 892, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_sexuality|5": { + "hashes": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "f28777a6fdce1d2b", + "hash_cont_tokens": "a6034ed95a124315" + }, + "truncated": 0, + "non-truncated": 524, + "padded": 524, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-international_law|5": { + "hashes": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "8282921a7a07bd5a", + "hash_cont_tokens": "223fbf3fd106c04b" + }, + "truncated": 0, + "non-truncated": 484, + "padded": 484, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-jurisprudence|5": { + "hashes": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "3aa62568b80ee7ca", + "hash_cont_tokens": "7c8e30f486ff156a" + }, + "truncated": 0, + "non-truncated": 432, + "padded": 432, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hashes": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "731b1d04f2da3d9a", + "hash_cont_tokens": "b4cc4a8d31bbaa03" + }, + "truncated": 0, + "non-truncated": 652, + "padded": 652, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-machine_learning|5": { + "hashes": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "96e1af14c8358ac2", + "hash_cont_tokens": "7f0e1289ec188e82" + }, + "truncated": 0, + "non-truncated": 448, + "padded": 448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-management|5": { + "hashes": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "bc2e4bf4e7cf5c39", + "hash_cont_tokens": "66b726b356a02feb" + }, + "truncated": 0, + "non-truncated": 412, + "padded": 412, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-marketing|5": { + "hashes": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "abed130d5c3867a4", + "hash_cont_tokens": "f08457005b652d25" + }, + "truncated": 0, + "non-truncated": 936, + "padded": 936, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-medical_genetics|5": { + "hashes": { + "hash_examples": "9c2dda34a2ea4fd2", + 
"hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "83d7d50bc2ebab43", + "hash_cont_tokens": "ce26aac83e938006" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-miscellaneous|5": { + "hashes": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "57004a232a08258a", + "hash_cont_tokens": "647bcbd68f292558" + }, + "truncated": 0, + "non-truncated": 3132, + "padded": 3132, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_disputes|5": { + "hashes": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "bb9518d436087f70", + "hash_cont_tokens": "6849b7fe56c50dda" + }, + "truncated": 0, + "non-truncated": 1384, + "padded": 1365, + "non-padded": 19, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hashes": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "3edebd0b46a85682", + "hash_cont_tokens": "81585ec455b1e3e5" + }, + "truncated": 0, + "non-truncated": 3580, + "padded": 3580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-nutrition|5": { + "hashes": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "815607301732a13f", + "hash_cont_tokens": "471b68eb20e5d34b" + }, + "truncated": 0, + "non-truncated": 1224, + "padded": 1224, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-philosophy|5": { + "hashes": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "952254859587db3e", + "hash_cont_tokens": "6e39384b9c0a8cc2" + }, + "truncated": 0, + "non-truncated": 1244, + "padded": 1244, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-prehistory|5": { + "hashes": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "1429d150f124f76e", + "hash_cont_tokens": "bfe513578190093f" + }, + "truncated": 0, + "non-truncated": 1296, + "padded": 1296, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_accounting|5": { + "hashes": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "9f8bfa3b87b58a38", + "hash_cont_tokens": "9ce431b67350b312" + }, + "truncated": 0, + "non-truncated": 1128, + "padded": 1128, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_law|5": { + "hashes": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "f638aace411a0bd9", + "hash_cont_tokens": "0ff990d9cc38024d" + }, + "truncated": 168, + "non-truncated": 5968, + "padded": 5968, + "non-padded": 168, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_medicine|5": { + "hashes": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "c0f160879d378d4d", + "hash_cont_tokens": "bc3c70e15bc7dce0" + }, + "truncated": 0, + 
"non-truncated": 1088, + "padded": 1088, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_psychology|5": { + "hashes": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "548450e483004f15", + "hash_cont_tokens": "58464ea26d81f908" + }, + "truncated": 0, + "non-truncated": 2448, + "padded": 2448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-public_relations|5": { + "hashes": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "47f43ebfaa773712", + "hash_cont_tokens": "eaf6a5d3ddd39a12" + }, + "truncated": 0, + "non-truncated": 440, + "padded": 440, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-security_studies|5": { + "hashes": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "0350ab02a3d50c5f", + "hash_cont_tokens": "618fd4f954253134" + }, + "truncated": 0, + "non-truncated": 980, + "padded": 980, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-sociology|5": { + "hashes": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "e010003b38f6d86a", + "hash_cont_tokens": "b4962d9e583b12c0" + }, + "truncated": 0, + "non-truncated": 804, + "padded": 804, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hashes": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "99959731e92e9eb1", + "hash_cont_tokens": "ce26aac83e938006" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-virology|5": { + "hashes": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "841a69043fcd7645", + "hash_cont_tokens": "397a75462a9735e3" + }, + "truncated": 0, + "non-truncated": 664, + "padded": 664, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-world_religions|5": { + "hashes": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "6faa0998b440e497", + "hash_cont_tokens": "de629d1414e01de8" + }, + "truncated": 0, + "non-truncated": 684, + "padded": 684, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|truthfulqa:mc|0": { + "hashes": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "fe347abbeff2a4c1", + "hash_cont_tokens": "df48bc66e06781f2" + }, + "truncated": 0, + "non-truncated": 9996, + "padded": 9996, + "non-padded": 0, + "effective_few_shots": 0.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "d84d18e9a963753d", + "hash_full_prompts": "12b540783521a8e6", + "hash_input_tokens": "3f79e8edf26f0efd", + "hash_cont_tokens": "4a4fb8e86dc2fb9d" + }, + "total_evaluation_time_secondes": "2110.653804063797", + "truncated": 1644, + "non-truncated": 109375, + "padded": 109332, + "non-padded": 1687, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff 
--git a/eval-results/harborwater/open-llama-3b-v2-wizard-evol-instuct-v2-196k/results_2023-10-24T09-06-59.427518.json b/eval-results/harborwater/open-llama-3b-v2-wizard-evol-instuct-v2-196k/results_2023-10-24T09-06-59.427518.json new file mode 100644 index 0000000000000000000000000000000000000000..3ffb53b08c8dd15c05b230ccc6cbc27e3202f40d --- /dev/null +++ b/eval-results/harborwater/open-llama-3b-v2-wizard-evol-instuct-v2-196k/results_2023-10-24T09-06-59.427518.json @@ -0,0 +1,107 @@ +{ + "config_general": { + "model_name": "harborwater/open-llama-3b-v2-wizard-evol-instuct-v2-196k", + "model_sha": "5bc8d583b73a7ebc664a6fd97a2c51cb0565bccb", + "model_size": "3.4 GB", + "model_dtype": "8bit", + "lighteval_sha": "0f318ecf002208468154899217b3ba7c6ae09374", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|drop|3": { + "em": 0.0024119127516778523, + "em_stderr": 0.0005023380498893439, + "f1": 0.05683829697986595, + "f1_stderr": 0.0013625469192287039 + }, + "harness|gsm8k|5": { + "acc": 0.01592115238817286, + "acc_stderr": 0.003447819272389002 + }, + "harness|winogrande|5": { + "acc": 0.6661404893449092, + "acc_stderr": 0.013254029695143358 + }, + "all": { + "em": 0.0024119127516778523, + "em_stderr": 0.0005023380498893439, + "f1": 0.05683829697986595, + "f1_stderr": 0.0013625469192287039, + "acc": 0.341030820866541, + "acc_stderr": 0.00835092448376618 + } + }, + "versions": { + "harness|drop|3": 1, + "harness|gsm8k|5": 0, + "harness|winogrande|5": 0, + "all": 0 + }, + "config_tasks": { + "harness|drop": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|drop|3": { + "hashes": { + "hash_examples": "1d27416e8324e9a3", + "hash_full_prompts": "a5513ff9a741b385", + "hash_input_tokens": "a65c9eacad86ea52", + "hash_cont_tokens": "a3a220f53570a73e" + }, + "truncated": 980, + "non-truncated": 8556, + "padded": 0, + "non-padded": 9536, + "effective_few_shots": 3.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "bf7d8c6b5e4f7948", + "hash_cont_tokens": "5b9bbb958d3fcf91" + }, + "truncated": 0, + "non-truncated": 1319, + "padded": 0, + "non-padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "647d8b2cafc100bc", + "hash_cont_tokens": "828897df1f4f08a1" + }, + "truncated": 0, + "non-truncated": 2534, + "padded": 2433, + "non-padded": 101, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "9b4d8993161e637d", + "hash_full_prompts": "08215e527b7e60a5", + "hash_input_tokens": "a65e1c92b9137d17", + "hash_cont_tokens": "ab1a73d483b84763" + }, + "total_evaluation_time_secondes": "18191.103542804718", + "truncated": 980, + "non-truncated": 12409, + "padded": 2433, + "non-padded": 10956, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/harborwater/open-llama-3b-v2-wizard-evol-instuct-v2-196k/results_2023-10-25T23-35-48.720340.json b/eval-results/harborwater/open-llama-3b-v2-wizard-evol-instuct-v2-196k/results_2023-10-25T23-35-48.720340.json new file mode 100644 index 
0000000000000000000000000000000000000000..221c00043e60738e7cf144e3d8d2f3b54ba5fe02 --- /dev/null +++ b/eval-results/harborwater/open-llama-3b-v2-wizard-evol-instuct-v2-196k/results_2023-10-25T23-35-48.720340.json @@ -0,0 +1,107 @@ +{ + "config_general": { + "model_name": "harborwater/open-llama-3b-v2-wizard-evol-instuct-v2-196k", + "model_sha": "5bc8d583b73a7ebc664a6fd97a2c51cb0565bccb", + "model_size": "6.4 GB", + "model_dtype": "torch.float16", + "lighteval_sha": "0f318ecf002208468154899217b3ba7c6ae09374", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|drop|3": { + "em": 0.0024119127516778523, + "em_stderr": 0.0005023380498893348, + "f1": 0.055686870805369305, + "f1_stderr": 0.0013493803185445354 + }, + "harness|gsm8k|5": { + "acc": 0.018953752843062926, + "acc_stderr": 0.0037560783410314704 + }, + "harness|winogrande|5": { + "acc": 0.6669297553275454, + "acc_stderr": 0.013246194028070658 + }, + "all": { + "em": 0.0024119127516778523, + "em_stderr": 0.0005023380498893348, + "f1": 0.055686870805369305, + "f1_stderr": 0.0013493803185445354, + "acc": 0.34294175408530414, + "acc_stderr": 0.008501136184551065 + } + }, + "versions": { + "harness|drop|3": 1, + "harness|gsm8k|5": 0, + "harness|winogrande|5": 0, + "all": 0 + }, + "config_tasks": { + "harness|drop": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|drop|3": { + "hashes": { + "hash_examples": "1d27416e8324e9a3", + "hash_full_prompts": "a5513ff9a741b385", + "hash_input_tokens": "a65c9eacad86ea52", + "hash_cont_tokens": "08b3c5565fb8b357" + }, + "truncated": 980, + "non-truncated": 8556, + "padded": 0, + "non-padded": 9536, + "effective_few_shots": 3.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "bf7d8c6b5e4f7948", + "hash_cont_tokens": "ac7c827fc19bbfa1" + }, + "truncated": 0, + "non-truncated": 1319, + "padded": 0, + "non-padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "647d8b2cafc100bc", + "hash_cont_tokens": "828897df1f4f08a1" + }, + "truncated": 0, + "non-truncated": 2534, + "padded": 2433, + "non-padded": 101, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "9b4d8993161e637d", + "hash_full_prompts": "08215e527b7e60a5", + "hash_input_tokens": "a65e1c92b9137d17", + "hash_cont_tokens": "a2d34d22a4753c0d" + }, + "total_evaluation_time_secondes": "8248.279017210007", + "truncated": 980, + "non-truncated": 12409, + "padded": 2433, + "non-padded": 10956, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/harborwater/wizard-orca-3b/results_2023-10-08T19-21-18.723038.json b/eval-results/harborwater/wizard-orca-3b/results_2023-10-08T19-21-18.723038.json new file mode 100644 index 0000000000000000000000000000000000000000..feba3e43b9aebfb681dfabe5d69958b8debce8cb --- /dev/null +++ b/eval-results/harborwater/wizard-orca-3b/results_2023-10-08T19-21-18.723038.json @@ -0,0 +1,1367 @@ +{ + "config_general": { + "model_name": "harborwater/wizard-orca-3b", + "model_sha": "ffc81b58375342f12e38a67272d95458a72e8d09", + "model_size": 
"6.4 GB", + "model_dtype": "torch.float16", + "lighteval_sha": "0f318ecf002208468154899217b3ba7c6ae09374", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|arc:challenge|25": { + "acc": 0.3848122866894198, + "acc_stderr": 0.014218371065251102, + "acc_norm": 0.41723549488054607, + "acc_norm_stderr": 0.01440982551840308 + }, + "harness|hellaswag|10": { + "acc": 0.5467038438558056, + "acc_stderr": 0.00496796581019999, + "acc_norm": 0.7177853017327226, + "acc_norm_stderr": 0.004491574539441884 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.16, + "acc_stderr": 0.03684529491774709, + "acc_norm": 0.16, + "acc_norm_stderr": 0.03684529491774709 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.17777777777777778, + "acc_stderr": 0.033027898599017176, + "acc_norm": 0.17777777777777778, + "acc_norm_stderr": 0.033027898599017176 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.21710526315789475, + "acc_stderr": 0.03355045304882924, + "acc_norm": 0.21710526315789475, + "acc_norm_stderr": 0.03355045304882924 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.33, + "acc_stderr": 0.04725815626252604, + "acc_norm": 0.33, + "acc_norm_stderr": 0.04725815626252604 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.2339622641509434, + "acc_stderr": 0.02605529690115292, + "acc_norm": 0.2339622641509434, + "acc_norm_stderr": 0.02605529690115292 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.25, + "acc_stderr": 0.03621034121889507, + "acc_norm": 0.25, + "acc_norm_stderr": 0.03621034121889507 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.22, + "acc_stderr": 0.041633319989322695, + "acc_norm": 0.22, + "acc_norm_stderr": 0.041633319989322695 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.31, + "acc_stderr": 0.04648231987117316, + "acc_norm": 0.31, + "acc_norm_stderr": 0.04648231987117316 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.29, + "acc_stderr": 0.04560480215720684, + "acc_norm": 0.29, + "acc_norm_stderr": 0.04560480215720684 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.2023121387283237, + "acc_stderr": 0.03063114553919882, + "acc_norm": 0.2023121387283237, + "acc_norm_stderr": 0.03063114553919882 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.21568627450980393, + "acc_stderr": 0.04092563958237655, + "acc_norm": 0.21568627450980393, + "acc_norm_stderr": 0.04092563958237655 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.34, + "acc_stderr": 0.04760952285695235, + "acc_norm": 0.34, + "acc_norm_stderr": 0.04760952285695235 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.31063829787234043, + "acc_stderr": 0.03025123757921317, + "acc_norm": 0.31063829787234043, + "acc_norm_stderr": 0.03025123757921317 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.21052631578947367, + "acc_stderr": 0.038351539543994194, + "acc_norm": 0.21052631578947367, + "acc_norm_stderr": 0.038351539543994194 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.2620689655172414, + "acc_stderr": 0.036646663372252565, + "acc_norm": 0.2620689655172414, + "acc_norm_stderr": 0.036646663372252565 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.21693121693121692, + "acc_stderr": 0.02122708244944508, + "acc_norm": 0.21693121693121692, + "acc_norm_stderr": 0.02122708244944508 + }, + 
"harness|hendrycksTest-formal_logic|5": { + "acc": 0.2777777777777778, + "acc_stderr": 0.04006168083848876, + "acc_norm": 0.2777777777777778, + "acc_norm_stderr": 0.04006168083848876 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.28, + "acc_stderr": 0.04512608598542128, + "acc_norm": 0.28, + "acc_norm_stderr": 0.04512608598542128 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.1870967741935484, + "acc_stderr": 0.02218571009225225, + "acc_norm": 0.1870967741935484, + "acc_norm_stderr": 0.02218571009225225 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.15270935960591134, + "acc_stderr": 0.02530890453938063, + "acc_norm": 0.15270935960591134, + "acc_norm_stderr": 0.02530890453938063 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.3, + "acc_stderr": 0.046056618647183814, + "acc_norm": 0.3, + "acc_norm_stderr": 0.046056618647183814 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.24242424242424243, + "acc_stderr": 0.03346409881055953, + "acc_norm": 0.24242424242424243, + "acc_norm_stderr": 0.03346409881055953 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.16666666666666666, + "acc_stderr": 0.02655220782821529, + "acc_norm": 0.16666666666666666, + "acc_norm_stderr": 0.02655220782821529 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.20207253886010362, + "acc_stderr": 0.02897908979429673, + "acc_norm": 0.20207253886010362, + "acc_norm_stderr": 0.02897908979429673 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.2205128205128205, + "acc_stderr": 0.021020672680827912, + "acc_norm": 0.2205128205128205, + "acc_norm_stderr": 0.021020672680827912 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.2111111111111111, + "acc_stderr": 0.024882116857655075, + "acc_norm": 0.2111111111111111, + "acc_norm_stderr": 0.024882116857655075 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.24789915966386555, + "acc_stderr": 0.028047967224176892, + "acc_norm": 0.24789915966386555, + "acc_norm_stderr": 0.028047967224176892 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.24503311258278146, + "acc_stderr": 0.03511807571804726, + "acc_norm": 0.24503311258278146, + "acc_norm_stderr": 0.03511807571804726 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.21834862385321102, + "acc_stderr": 0.01771260052872273, + "acc_norm": 0.21834862385321102, + "acc_norm_stderr": 0.01771260052872273 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.18055555555555555, + "acc_stderr": 0.026232878971491656, + "acc_norm": 0.18055555555555555, + "acc_norm_stderr": 0.026232878971491656 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.19117647058823528, + "acc_stderr": 0.027599174300640766, + "acc_norm": 0.19117647058823528, + "acc_norm_stderr": 0.027599174300640766 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.2911392405063291, + "acc_stderr": 0.029571601065753374, + "acc_norm": 0.2911392405063291, + "acc_norm_stderr": 0.029571601065753374 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.336322869955157, + "acc_stderr": 0.031708824268455, + "acc_norm": 0.336322869955157, + "acc_norm_stderr": 0.031708824268455 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.2824427480916031, + "acc_stderr": 0.03948406125768361, + "acc_norm": 0.2824427480916031, + "acc_norm_stderr": 0.03948406125768361 + 
}, + "harness|hendrycksTest-international_law|5": { + "acc": 0.23140495867768596, + "acc_stderr": 0.03849856098794089, + "acc_norm": 0.23140495867768596, + "acc_norm_stderr": 0.03849856098794089 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.24074074074074073, + "acc_stderr": 0.04133119440243839, + "acc_norm": 0.24074074074074073, + "acc_norm_stderr": 0.04133119440243839 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.2147239263803681, + "acc_stderr": 0.03226219377286774, + "acc_norm": 0.2147239263803681, + "acc_norm_stderr": 0.03226219377286774 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.30357142857142855, + "acc_stderr": 0.04364226155841043, + "acc_norm": 0.30357142857142855, + "acc_norm_stderr": 0.04364226155841043 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.13592233009708737, + "acc_stderr": 0.03393295729761013, + "acc_norm": 0.13592233009708737, + "acc_norm_stderr": 0.03393295729761013 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.3076923076923077, + "acc_stderr": 0.030236389942173116, + "acc_norm": 0.3076923076923077, + "acc_norm_stderr": 0.030236389942173116 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.28, + "acc_stderr": 0.04512608598542129, + "acc_norm": 0.28, + "acc_norm_stderr": 0.04512608598542129 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.24265644955300128, + "acc_stderr": 0.01532988894089987, + "acc_norm": 0.24265644955300128, + "acc_norm_stderr": 0.01532988894089987 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.25722543352601157, + "acc_stderr": 0.02353292543104428, + "acc_norm": 0.25722543352601157, + "acc_norm_stderr": 0.02353292543104428 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.23687150837988827, + "acc_stderr": 0.014219570788103982, + "acc_norm": 0.23687150837988827, + "acc_norm_stderr": 0.014219570788103982 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.21568627450980393, + "acc_stderr": 0.02355083135199509, + "acc_norm": 0.21568627450980393, + "acc_norm_stderr": 0.02355083135199509 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.27009646302250806, + "acc_stderr": 0.02521804037341063, + "acc_norm": 0.27009646302250806, + "acc_norm_stderr": 0.02521804037341063 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.24074074074074073, + "acc_stderr": 0.023788583551658533, + "acc_norm": 0.24074074074074073, + "acc_norm_stderr": 0.023788583551658533 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.2730496453900709, + "acc_stderr": 0.026577860943307857, + "acc_norm": 0.2730496453900709, + "acc_norm_stderr": 0.026577860943307857 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.2392438070404172, + "acc_stderr": 0.01089612365267665, + "acc_norm": 0.2392438070404172, + "acc_norm_stderr": 0.01089612365267665 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.18382352941176472, + "acc_stderr": 0.023529242185193106, + "acc_norm": 0.18382352941176472, + "acc_norm_stderr": 0.023529242185193106 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.25163398692810457, + "acc_stderr": 0.017555818091322263, + "acc_norm": 0.25163398692810457, + "acc_norm_stderr": 0.017555818091322263 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.2636363636363636, + "acc_stderr": 0.04220224692971987, + "acc_norm": 0.2636363636363636, + "acc_norm_stderr": 0.04220224692971987 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 
0.21224489795918366, + "acc_stderr": 0.026176967197866767, + "acc_norm": 0.21224489795918366, + "acc_norm_stderr": 0.026176967197866767 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.24378109452736318, + "acc_stderr": 0.030360490154014652, + "acc_norm": 0.24378109452736318, + "acc_norm_stderr": 0.030360490154014652 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.3, + "acc_stderr": 0.046056618647183814, + "acc_norm": 0.3, + "acc_norm_stderr": 0.046056618647183814 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.3132530120481928, + "acc_stderr": 0.03610805018031023, + "acc_norm": 0.3132530120481928, + "acc_norm_stderr": 0.03610805018031023 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.3216374269005848, + "acc_stderr": 0.03582529442573122, + "acc_norm": 0.3216374269005848, + "acc_norm_stderr": 0.03582529442573122 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.26193390452876375, + "mc1_stderr": 0.015392118805015027, + "mc2": 0.40035714074695034, + "mc2_stderr": 0.014298393496028232 + }, + "all": { + "acc": 0.2523974583502166, + "acc_stderr": 0.03129775621975096, + "acc_norm": 0.25584668997848853, + "acc_norm_stderr": 0.031292926782164415, + "mc1": 0.26193390452876375, + "mc1_stderr": 0.015392118805015027, + "mc2": 0.40035714074695034, + "mc2_stderr": 0.014298393496028232 + } + }, + "versions": { + "harness|arc:challenge|25": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + 
"harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "all": 0 + }, + "config_tasks": { + "harness|arc:challenge": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + 
"harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task" + }, + "summary_tasks": { + "harness|arc:challenge|25": { + "hashes": { + "hash_examples": "17b0cae357c0259e", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "7cefb32e2563a8e3", + "hash_cont_tokens": "2e8835aa03b9c2cf" + }, + "truncated": 0, + "non-truncated": 4687, + "padded": 4687, + "non-padded": 0, + "effective_few_shots": 25.0, + "num_truncated_few_shots": 0 + }, + "harness|hellaswag|10": { + "hashes": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "e4a72fc2bbea66ff", + "hash_cont_tokens": "18a48de3edcef462" + }, + "truncated": 0, + "non-truncated": 40168, + "padded": 40144, + "non-padded": 24, + "effective_few_shots": 10.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hashes": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "1430bf2cb1d054e2", + "hash_cont_tokens": "ce26aac83e938006" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-anatomy|5": { + "hashes": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "c4f45f8ebf944893", + "hash_cont_tokens": "1d81fa80e3039a08" + }, + "truncated": 0, + "non-truncated": 540, + "padded": 540, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-astronomy|5": { + "hashes": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "7b6c0659a104d6af", + "hash_cont_tokens": "247dc44c6b578728" + }, + "truncated": 0, + "non-truncated": 608, + "padded": 608, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-business_ethics|5": { + "hashes": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "ca33ffee63980ac1", + "hash_cont_tokens": "ce26aac83e938006" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hashes": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "a6aba95384c46b37", + "hash_cont_tokens": 
"26e3b69d5fb27bb2" + }, + "truncated": 0, + "non-truncated": 1060, + "padded": 1060, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_biology|5": { + "hashes": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "95d92a1a2c158e2c", + "hash_cont_tokens": "bbda31842f3930d5" + }, + "truncated": 0, + "non-truncated": 576, + "padded": 576, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_chemistry|5": { + "hashes": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "70284e3c06933186", + "hash_cont_tokens": "ce26aac83e938006" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_computer_science|5": { + "hashes": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "028608b4301fcfd2", + "hash_cont_tokens": "ce26aac83e938006" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_mathematics|5": { + "hashes": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "02619f96ae20cf1e", + "hash_cont_tokens": "ce26aac83e938006" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_medicine|5": { + "hashes": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "0282a73e02cf4b34", + "hash_cont_tokens": "894854ed7bec57f7" + }, + "truncated": 0, + "non-truncated": 692, + "padded": 692, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_physics|5": { + "hashes": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "5d0425cf2abddd51", + "hash_cont_tokens": "13130ec6de384bbb" + }, + "truncated": 0, + "non-truncated": 408, + "padded": 408, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-computer_security|5": { + "hashes": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "560574f683641143", + "hash_cont_tokens": "ce26aac83e938006" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hashes": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "dc3987c35bc329e5", + "hash_cont_tokens": "29089b8b7020611e" + }, + "truncated": 0, + "non-truncated": 940, + "padded": 940, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-econometrics|5": { + "hashes": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "be83fdd674b48356", + "hash_cont_tokens": "efc596dfa1a1f073" + }, + "truncated": 0, + "non-truncated": 456, + "padded": 456, + "non-padded": 0, + "effective_few_shots": 5.0, + 
"num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hashes": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "00155bf1a1a1ebc7", + "hash_cont_tokens": "70817a7ac9f44af2" + }, + "truncated": 0, + "non-truncated": 580, + "padded": 580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hashes": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "ce05b52b00498cf6", + "hash_cont_tokens": "937cd53d06cc6e16" + }, + "truncated": 0, + "non-truncated": 1512, + "padded": 1512, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-formal_logic|5": { + "hashes": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "728bd41242158358", + "hash_cont_tokens": "eec972abe0fc0f5a" + }, + "truncated": 0, + "non-truncated": 504, + "padded": 504, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-global_facts|5": { + "hashes": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "190511206bf21530", + "hash_cont_tokens": "ce26aac83e938006" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_biology|5": { + "hashes": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "2bc219567947ac68", + "hash_cont_tokens": "94971ccfe8e59c25" + }, + "truncated": 0, + "non-truncated": 1240, + "padded": 1240, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hashes": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "8477b93b8643d23f", + "hash_cont_tokens": "a78e38b59778a04c" + }, + "truncated": 0, + "non-truncated": 812, + "padded": 812, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hashes": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "0e15ea7b43890b3c", + "hash_cont_tokens": "ce26aac83e938006" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hashes": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "142b719c7d7d4fe0", + "hash_cont_tokens": "91dc522e4e4e91c3" + }, + "truncated": 660, + "non-truncated": 0, + "padded": 0, + "non-padded": 660, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_geography|5": { + "hashes": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "4bf76efe7796945e", + "hash_cont_tokens": "f275c901b3d285f9" + }, + "truncated": 0, + "non-truncated": 792, + "padded": 792, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hashes": { + 
"hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "e3a453e5fb044f52", + "hash_cont_tokens": "85eb58f423437cce" + }, + "truncated": 0, + "non-truncated": 772, + "padded": 772, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hashes": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "f47a1c2b0c018aff", + "hash_cont_tokens": "39a93706184f896b" + }, + "truncated": 0, + "non-truncated": 1560, + "padded": 1560, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hashes": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "35bc9ee85a563c15", + "hash_cont_tokens": "d41065d20b689af3" + }, + "truncated": 0, + "non-truncated": 1080, + "padded": 1080, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hashes": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "62a083d4ceb83864", + "hash_cont_tokens": "28c1f7c11bf85409" + }, + "truncated": 0, + "non-truncated": 952, + "padded": 952, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_physics|5": { + "hashes": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "cd96d409604783e4", + "hash_cont_tokens": "78c510e6c5d316ac" + }, + "truncated": 0, + "non-truncated": 604, + "padded": 604, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hashes": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "3c716ffc27f83e15", + "hash_cont_tokens": "0ba4ecffc67603c5" + }, + "truncated": 0, + "non-truncated": 2180, + "padded": 2180, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hashes": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "fd8217f7edf722f8", + "hash_cont_tokens": "4a0339e9ad3efa6d" + }, + "truncated": 0, + "non-truncated": 864, + "padded": 864, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hashes": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "a54112084a848a44", + "hash_cont_tokens": "2529d55ec490f81f" + }, + "truncated": 816, + "non-truncated": 0, + "padded": 0, + "non-padded": 816, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hashes": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "89cf33fb840f27be", + "hash_cont_tokens": "21808b54f5df97b2" + }, + "truncated": 0, + "non-truncated": 948, + "padded": 948, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_aging|5": { + "hashes": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": 
"0a2b6ab3ae0e3b7c", + "hash_cont_tokens": "92acdd467ed943e1" + }, + "truncated": 0, + "non-truncated": 892, + "padded": 892, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_sexuality|5": { + "hashes": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "f28777a6fdce1d2b", + "hash_cont_tokens": "a6034ed95a124315" + }, + "truncated": 0, + "non-truncated": 524, + "padded": 524, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-international_law|5": { + "hashes": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "8282921a7a07bd5a", + "hash_cont_tokens": "223fbf3fd106c04b" + }, + "truncated": 0, + "non-truncated": 484, + "padded": 484, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-jurisprudence|5": { + "hashes": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "3aa62568b80ee7ca", + "hash_cont_tokens": "7c8e30f486ff156a" + }, + "truncated": 0, + "non-truncated": 432, + "padded": 432, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hashes": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "731b1d04f2da3d9a", + "hash_cont_tokens": "b4cc4a8d31bbaa03" + }, + "truncated": 0, + "non-truncated": 652, + "padded": 652, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-machine_learning|5": { + "hashes": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "96e1af14c8358ac2", + "hash_cont_tokens": "7f0e1289ec188e82" + }, + "truncated": 0, + "non-truncated": 448, + "padded": 448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-management|5": { + "hashes": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "bc2e4bf4e7cf5c39", + "hash_cont_tokens": "66b726b356a02feb" + }, + "truncated": 0, + "non-truncated": 412, + "padded": 412, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-marketing|5": { + "hashes": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "abed130d5c3867a4", + "hash_cont_tokens": "f08457005b652d25" + }, + "truncated": 0, + "non-truncated": 936, + "padded": 936, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-medical_genetics|5": { + "hashes": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "83d7d50bc2ebab43", + "hash_cont_tokens": "ce26aac83e938006" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-miscellaneous|5": { + "hashes": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "57004a232a08258a", + "hash_cont_tokens": "647bcbd68f292558" + }, + "truncated": 0, + "non-truncated": 3132, + "padded": 3132, + "non-padded": 0, + "effective_few_shots": 5.0, + 
"num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_disputes|5": { + "hashes": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "bb9518d436087f70", + "hash_cont_tokens": "6849b7fe56c50dda" + }, + "truncated": 0, + "non-truncated": 1384, + "padded": 1365, + "non-padded": 19, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hashes": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "3edebd0b46a85682", + "hash_cont_tokens": "81585ec455b1e3e5" + }, + "truncated": 0, + "non-truncated": 3580, + "padded": 3580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-nutrition|5": { + "hashes": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "815607301732a13f", + "hash_cont_tokens": "471b68eb20e5d34b" + }, + "truncated": 0, + "non-truncated": 1224, + "padded": 1224, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-philosophy|5": { + "hashes": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "952254859587db3e", + "hash_cont_tokens": "6e39384b9c0a8cc2" + }, + "truncated": 0, + "non-truncated": 1244, + "padded": 1244, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-prehistory|5": { + "hashes": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "1429d150f124f76e", + "hash_cont_tokens": "bfe513578190093f" + }, + "truncated": 0, + "non-truncated": 1296, + "padded": 1296, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_accounting|5": { + "hashes": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "9f8bfa3b87b58a38", + "hash_cont_tokens": "9ce431b67350b312" + }, + "truncated": 0, + "non-truncated": 1128, + "padded": 1128, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_law|5": { + "hashes": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "f638aace411a0bd9", + "hash_cont_tokens": "0ff990d9cc38024d" + }, + "truncated": 168, + "non-truncated": 5968, + "padded": 5968, + "non-padded": 168, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_medicine|5": { + "hashes": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "c0f160879d378d4d", + "hash_cont_tokens": "bc3c70e15bc7dce0" + }, + "truncated": 0, + "non-truncated": 1088, + "padded": 1088, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_psychology|5": { + "hashes": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "548450e483004f15", + "hash_cont_tokens": "58464ea26d81f908" + }, + "truncated": 0, + "non-truncated": 2448, + "padded": 2448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-public_relations|5": { + "hashes": { + "hash_examples": "0d25072e1761652a", + 
"hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "47f43ebfaa773712", + "hash_cont_tokens": "eaf6a5d3ddd39a12" + }, + "truncated": 0, + "non-truncated": 440, + "padded": 440, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-security_studies|5": { + "hashes": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "0350ab02a3d50c5f", + "hash_cont_tokens": "618fd4f954253134" + }, + "truncated": 0, + "non-truncated": 980, + "padded": 980, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-sociology|5": { + "hashes": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "e010003b38f6d86a", + "hash_cont_tokens": "b4962d9e583b12c0" + }, + "truncated": 0, + "non-truncated": 804, + "padded": 804, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hashes": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "99959731e92e9eb1", + "hash_cont_tokens": "ce26aac83e938006" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-virology|5": { + "hashes": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "841a69043fcd7645", + "hash_cont_tokens": "397a75462a9735e3" + }, + "truncated": 0, + "non-truncated": 664, + "padded": 664, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-world_religions|5": { + "hashes": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "6faa0998b440e497", + "hash_cont_tokens": "de629d1414e01de8" + }, + "truncated": 0, + "non-truncated": 684, + "padded": 684, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|truthfulqa:mc|0": { + "hashes": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "fe347abbeff2a4c1", + "hash_cont_tokens": "df48bc66e06781f2" + }, + "truncated": 0, + "non-truncated": 9996, + "padded": 9996, + "non-padded": 0, + "effective_few_shots": 0.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "d84d18e9a963753d", + "hash_full_prompts": "12b540783521a8e6", + "hash_input_tokens": "3f79e8edf26f0efd", + "hash_cont_tokens": "4a4fb8e86dc2fb9d" + }, + "total_evaluation_time_secondes": "2100.5906295776367", + "truncated": 1644, + "non-truncated": 109375, + "padded": 109332, + "non-padded": 1687, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/harborwater/wizard-orca-3b/results_2023-10-24T08-46-00.865464.json b/eval-results/harborwater/wizard-orca-3b/results_2023-10-24T08-46-00.865464.json new file mode 100644 index 0000000000000000000000000000000000000000..e8d98e7e7f5e6fef65a642a50a38e614a0f095e2 --- /dev/null +++ b/eval-results/harborwater/wizard-orca-3b/results_2023-10-24T08-46-00.865464.json @@ -0,0 +1,107 @@ +{ + "config_general": { + "model_name": "harborwater/wizard-orca-3b", + "model_sha": "ffc81b58375342f12e38a67272d95458a72e8d09", + "model_size": "6.4 GB", + "model_dtype": "torch.float16", + "lighteval_sha": 
"0f318ecf002208468154899217b3ba7c6ae09374", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|drop|3": { + "em": 0.0019924496644295304, + "em_stderr": 0.00045666764626669333, + "f1": 0.05503670302013434, + "f1_stderr": 0.0013533156474354355 + }, + "harness|gsm8k|5": { + "acc": 0.01061410159211524, + "acc_stderr": 0.002822713322387704 + }, + "harness|winogrande|5": { + "acc": 0.6692975532754538, + "acc_stderr": 0.013222435887002691 + }, + "all": { + "em": 0.0019924496644295304, + "em_stderr": 0.00045666764626669333, + "f1": 0.05503670302013434, + "f1_stderr": 0.0013533156474354355, + "acc": 0.33995582743378455, + "acc_stderr": 0.008022574604695198 + } + }, + "versions": { + "harness|drop|3": 1, + "harness|gsm8k|5": 0, + "harness|winogrande|5": 0, + "all": 0 + }, + "config_tasks": { + "harness|drop": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|drop|3": { + "hashes": { + "hash_examples": "1d27416e8324e9a3", + "hash_full_prompts": "a5513ff9a741b385", + "hash_input_tokens": "a65c9eacad86ea52", + "hash_cont_tokens": "dd42d5c9269230fe" + }, + "truncated": 980, + "non-truncated": 8556, + "padded": 0, + "non-padded": 9536, + "effective_few_shots": 3.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "bf7d8c6b5e4f7948", + "hash_cont_tokens": "b39d931100fb2f5f" + }, + "truncated": 0, + "non-truncated": 1319, + "padded": 0, + "non-padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "647d8b2cafc100bc", + "hash_cont_tokens": "828897df1f4f08a1" + }, + "truncated": 0, + "non-truncated": 2534, + "padded": 2433, + "non-padded": 101, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "9b4d8993161e637d", + "hash_full_prompts": "08215e527b7e60a5", + "hash_input_tokens": "a65e1c92b9137d17", + "hash_cont_tokens": "71c81335c159379e" + }, + "total_evaluation_time_secondes": "9051.011059761047", + "truncated": 980, + "non-truncated": 12409, + "padded": 2433, + "non-padded": 10956, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/hfl/chinese-alpaca-2-13b-16k/results_2023-12-09T15-53-33.265685.json b/eval-results/hfl/chinese-alpaca-2-13b-16k/results_2023-12-09T15-53-33.265685.json new file mode 100644 index 0000000000000000000000000000000000000000..852c2abe24fdb3f54c37bbea546d9389efa2983a --- /dev/null +++ b/eval-results/hfl/chinese-alpaca-2-13b-16k/results_2023-12-09T15-53-33.265685.json @@ -0,0 +1,1409 @@ +{ + "config_general": { + "lighteval_sha": "0e4607eff593f6f842aeaa0e5fa6760f58b9d1e9", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "", + "start_time": 582019.58061195, + "end_time": 592083.439415912, + "total_evaluation_time_secondes": "10063.858803962008", + "model_name": "hfl/chinese-alpaca-2-13b-16k", + "model_sha": "ba4536aed022c49bda60e1b56a0dbefc2ea6a30a", + "model_dtype": "torch.float16", + "model_size": "25.0 GB" + }, + "results": { + "harness|arc:challenge|25": { + "acc": 0.5213310580204779, + "acc_stderr": 
0.014598087973127106, + "acc_norm": 0.5503412969283277, + "acc_norm_stderr": 0.014537144444284738 + }, + "harness|hellaswag|10": { + "acc": 0.5728938458474407, + "acc_stderr": 0.004936470085238487, + "acc_norm": 0.7741485759808803, + "acc_norm_stderr": 0.0041728722829842005 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.33, + "acc_stderr": 0.04725815626252605, + "acc_norm": 0.33, + "acc_norm_stderr": 0.04725815626252605 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.4666666666666667, + "acc_stderr": 0.043097329010363554, + "acc_norm": 0.4666666666666667, + "acc_norm_stderr": 0.043097329010363554 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.5131578947368421, + "acc_stderr": 0.04067533136309173, + "acc_norm": 0.5131578947368421, + "acc_norm_stderr": 0.04067533136309173 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.54, + "acc_stderr": 0.05009082659620332, + "acc_norm": 0.54, + "acc_norm_stderr": 0.05009082659620332 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.5509433962264151, + "acc_stderr": 0.030612730713641095, + "acc_norm": 0.5509433962264151, + "acc_norm_stderr": 0.030612730713641095 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.5069444444444444, + "acc_stderr": 0.04180806750294938, + "acc_norm": 0.5069444444444444, + "acc_norm_stderr": 0.04180806750294938 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.39, + "acc_stderr": 0.04902071300001974, + "acc_norm": 0.39, + "acc_norm_stderr": 0.04902071300001974 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.46, + "acc_stderr": 0.05009082659620332, + "acc_norm": 0.46, + "acc_norm_stderr": 0.05009082659620332 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.29, + "acc_stderr": 0.045604802157206845, + "acc_norm": 0.29, + "acc_norm_stderr": 0.045604802157206845 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.5086705202312138, + "acc_stderr": 0.038118909889404126, + "acc_norm": 0.5086705202312138, + "acc_norm_stderr": 0.038118909889404126 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.29411764705882354, + "acc_stderr": 0.04533838195929775, + "acc_norm": 0.29411764705882354, + "acc_norm_stderr": 0.04533838195929775 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.61, + "acc_stderr": 0.04902071300001975, + "acc_norm": 0.61, + "acc_norm_stderr": 0.04902071300001975 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.3446808510638298, + "acc_stderr": 0.03106898596312215, + "acc_norm": 0.3446808510638298, + "acc_norm_stderr": 0.03106898596312215 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.2719298245614035, + "acc_stderr": 0.04185774424022056, + "acc_norm": 0.2719298245614035, + "acc_norm_stderr": 0.04185774424022056 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.503448275862069, + "acc_stderr": 0.04166567577101579, + "acc_norm": 0.503448275862069, + "acc_norm_stderr": 0.04166567577101579 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.30952380952380953, + "acc_stderr": 0.023809523809523853, + "acc_norm": 0.30952380952380953, + "acc_norm_stderr": 0.023809523809523853 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.31746031746031744, + "acc_stderr": 0.041634530313028585, + "acc_norm": 0.31746031746031744, + "acc_norm_stderr": 0.041634530313028585 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.28, + "acc_stderr": 0.04512608598542128, + "acc_norm": 0.28, + 
"acc_norm_stderr": 0.04512608598542128 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.5741935483870968, + "acc_stderr": 0.028129112709165904, + "acc_norm": 0.5741935483870968, + "acc_norm_stderr": 0.028129112709165904 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.39901477832512317, + "acc_stderr": 0.03445487686264715, + "acc_norm": 0.39901477832512317, + "acc_norm_stderr": 0.03445487686264715 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.57, + "acc_stderr": 0.04975698519562428, + "acc_norm": 0.57, + "acc_norm_stderr": 0.04975698519562428 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.6606060606060606, + "acc_stderr": 0.03697442205031596, + "acc_norm": 0.6606060606060606, + "acc_norm_stderr": 0.03697442205031596 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.6666666666666666, + "acc_stderr": 0.033586181457325226, + "acc_norm": 0.6666666666666666, + "acc_norm_stderr": 0.033586181457325226 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.7357512953367875, + "acc_stderr": 0.03182155050916646, + "acc_norm": 0.7357512953367875, + "acc_norm_stderr": 0.03182155050916646 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.44358974358974357, + "acc_stderr": 0.0251891498947642, + "acc_norm": 0.44358974358974357, + "acc_norm_stderr": 0.0251891498947642 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.3074074074074074, + "acc_stderr": 0.02813325257881563, + "acc_norm": 0.3074074074074074, + "acc_norm_stderr": 0.02813325257881563 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.5210084033613446, + "acc_stderr": 0.03244980849990029, + "acc_norm": 0.5210084033613446, + "acc_norm_stderr": 0.03244980849990029 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.32450331125827814, + "acc_stderr": 0.038227469376587525, + "acc_norm": 0.32450331125827814, + "acc_norm_stderr": 0.038227469376587525 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.7045871559633028, + "acc_stderr": 0.019560619182976, + "acc_norm": 0.7045871559633028, + "acc_norm_stderr": 0.019560619182976 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.39814814814814814, + "acc_stderr": 0.033384734032074016, + "acc_norm": 0.39814814814814814, + "acc_norm_stderr": 0.033384734032074016 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.7009803921568627, + "acc_stderr": 0.03213325717373617, + "acc_norm": 0.7009803921568627, + "acc_norm_stderr": 0.03213325717373617 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.70042194092827, + "acc_stderr": 0.02981802474975309, + "acc_norm": 0.70042194092827, + "acc_norm_stderr": 0.02981802474975309 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.600896860986547, + "acc_stderr": 0.03286745312567961, + "acc_norm": 0.600896860986547, + "acc_norm_stderr": 0.03286745312567961 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.5648854961832062, + "acc_stderr": 0.04348208051644858, + "acc_norm": 0.5648854961832062, + "acc_norm_stderr": 0.04348208051644858 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.7355371900826446, + "acc_stderr": 0.04026187527591207, + "acc_norm": 0.7355371900826446, + "acc_norm_stderr": 0.04026187527591207 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.6666666666666666, + "acc_stderr": 0.04557239513497751, + "acc_norm": 
0.6666666666666666, + "acc_norm_stderr": 0.04557239513497751 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.5766871165644172, + "acc_stderr": 0.03881891213334383, + "acc_norm": 0.5766871165644172, + "acc_norm_stderr": 0.03881891213334383 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.29464285714285715, + "acc_stderr": 0.04327040932578729, + "acc_norm": 0.29464285714285715, + "acc_norm_stderr": 0.04327040932578729 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.6796116504854369, + "acc_stderr": 0.04620284082280042, + "acc_norm": 0.6796116504854369, + "acc_norm_stderr": 0.04620284082280042 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.7863247863247863, + "acc_stderr": 0.02685345037700914, + "acc_norm": 0.7863247863247863, + "acc_norm_stderr": 0.02685345037700914 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.51, + "acc_stderr": 0.05024183937956914, + "acc_norm": 0.51, + "acc_norm_stderr": 0.05024183937956914 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.7151979565772669, + "acc_stderr": 0.016139174096522546, + "acc_norm": 0.7151979565772669, + "acc_norm_stderr": 0.016139174096522546 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.5867052023121387, + "acc_stderr": 0.02651126136940924, + "acc_norm": 0.5867052023121387, + "acc_norm_stderr": 0.02651126136940924 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.24134078212290502, + "acc_stderr": 0.014310999547961443, + "acc_norm": 0.24134078212290502, + "acc_norm_stderr": 0.014310999547961443 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.5522875816993464, + "acc_stderr": 0.02847293847803353, + "acc_norm": 0.5522875816993464, + "acc_norm_stderr": 0.02847293847803353 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.5980707395498392, + "acc_stderr": 0.027846476005930473, + "acc_norm": 0.5980707395498392, + "acc_norm_stderr": 0.027846476005930473 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.5648148148148148, + "acc_stderr": 0.027586006221607708, + "acc_norm": 0.5648148148148148, + "acc_norm_stderr": 0.027586006221607708 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.4078014184397163, + "acc_stderr": 0.029316011776343555, + "acc_norm": 0.4078014184397163, + "acc_norm_stderr": 0.029316011776343555 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.39895697522816165, + "acc_stderr": 0.01250675765529367, + "acc_norm": 0.39895697522816165, + "acc_norm_stderr": 0.01250675765529367 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.4338235294117647, + "acc_stderr": 0.030105636570016636, + "acc_norm": 0.4338235294117647, + "acc_norm_stderr": 0.030105636570016636 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.49019607843137253, + "acc_stderr": 0.020223946005074305, + "acc_norm": 0.49019607843137253, + "acc_norm_stderr": 0.020223946005074305 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.5818181818181818, + "acc_stderr": 0.04724577405731572, + "acc_norm": 0.5818181818181818, + "acc_norm_stderr": 0.04724577405731572 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.6244897959183674, + "acc_stderr": 0.03100120903989484, + "acc_norm": 0.6244897959183674, + "acc_norm_stderr": 0.03100120903989484 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.6218905472636815, + "acc_stderr": 0.034288678487786564, + "acc_norm": 0.6218905472636815, + "acc_norm_stderr": 0.034288678487786564 + }, + 
"harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.68, + "acc_stderr": 0.04688261722621504, + "acc_norm": 0.68, + "acc_norm_stderr": 0.04688261722621504 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.42771084337349397, + "acc_stderr": 0.038515976837185335, + "acc_norm": 0.42771084337349397, + "acc_norm_stderr": 0.038515976837185335 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.695906432748538, + "acc_stderr": 0.03528211258245231, + "acc_norm": 0.695906432748538, + "acc_norm_stderr": 0.03528211258245231 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.33047735618115054, + "mc1_stderr": 0.016466769613698307, + "mc2": 0.46496694797516, + "mc2_stderr": 0.015236674932834036 + }, + "harness|winogrande|5": { + "acc": 0.734017363851618, + "acc_stderr": 0.01241832315305105 + }, + "harness|gsm8k|5": { + "acc": 0.21076573161485973, + "acc_stderr": 0.011234280469030465 + }, + "all": { + "acc": 0.5126179344828111, + "acc_stderr": 0.0342051274120513, + "acc_norm": 0.5178843368987507, + "acc_norm_stderr": 0.034949756392914415, + "mc1": 0.33047735618115054, + "mc1_stderr": 0.016466769613698307, + "mc2": 0.46496694797516, + "mc2_stderr": 0.015236674932834036 + } + }, + "versions": { + "all": 0, + "harness|arc:challenge|25": 0, + "harness|gsm8k|5": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, 
+ "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "harness|winogrande|5": 0 + }, + "config_tasks": { + "harness|arc:challenge": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM 
Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|arc:challenge|25": { + "hashes": { + "hash_examples": "17b0cae357c0259e", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "ca48d52265c0051f", + "hash_cont_tokens": "e8abf848493b50f7" + }, + "truncated": 0, + "non_truncated": 1172, + "padded": 4687, + "non_padded": 0, + "effective_few_shots": 25.0, + "num_truncated_few_shots": 0 + }, + "harness|hellaswag|10": { + "hashes": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "4975ded0ed31f702", + "hash_cont_tokens": "9fe0a5c42e1532db" + }, + "truncated": 0, + "non_truncated": 10042, + "padded": 40019, + "non_padded": 149, + "effective_few_shots": 10.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hashes": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "8ff523ec326d5d55", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-anatomy|5": { + "hashes": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "742bd6a389a8ef40", + "hash_cont_tokens": "f11971a765cb609f" + }, + "truncated": 0, + "non_truncated": 135, + "padded": 540, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-astronomy|5": { + "hashes": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "e602902c123c2c7f", + "hash_cont_tokens": "440a970fadecdc7b" + }, + "truncated": 0, + "non_truncated": 152, + "padded": 608, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-business_ethics|5": { + "hashes": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "04d2b2c4fd859912", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hashes": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "6080d9f3c5930be0", + "hash_cont_tokens": "7ecd60c25b9bfe5b" + }, + "truncated": 0, + "non_truncated": 265, + 
"padded": 1060, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_biology|5": { + "hashes": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "873319724ad65589", + "hash_cont_tokens": "875cde3af7a0ee14" + }, + "truncated": 0, + "non_truncated": 144, + "padded": 564, + "non_padded": 12, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_chemistry|5": { + "hashes": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "8366d04d12b154a7", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_computer_science|5": { + "hashes": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "1724a282fb269fd7", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_mathematics|5": { + "hashes": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "b7aa815781eae172", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_medicine|5": { + "hashes": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "0003d13e86bc8c1a", + "hash_cont_tokens": "702fb6d82ff0d6ac" + }, + "truncated": 0, + "non_truncated": 173, + "padded": 692, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_physics|5": { + "hashes": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "32b28762dd077c78", + "hash_cont_tokens": "f7b8097afc16a47c" + }, + "truncated": 0, + "non_truncated": 102, + "padded": 404, + "non_padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-computer_security|5": { + "hashes": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "c44c8100ac118ab8", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hashes": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "761c7ce187b3338a", + "hash_cont_tokens": "aa0e8bc655f2f641" + }, + "truncated": 0, + "non_truncated": 235, + "padded": 940, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-econometrics|5": { + "hashes": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "dae74024ebc12b2b", + "hash_cont_tokens": "b1cc6e7e9fcd3827" + }, + "truncated": 0, + "non_truncated": 114, + "padded": 456, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + 
"harness|hendrycksTest-electrical_engineering|5": { + "hashes": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "2471bd9b6de2f391", + "hash_cont_tokens": "2425a3f084a591ef" + }, + "truncated": 0, + "non_truncated": 145, + "padded": 580, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hashes": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "2da3f8d7d1515cc6", + "hash_cont_tokens": "bd87bf0c060fd925" + }, + "truncated": 0, + "non_truncated": 378, + "padded": 1512, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-formal_logic|5": { + "hashes": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "907de61bbe46dada", + "hash_cont_tokens": "eb8932890e0605db" + }, + "truncated": 0, + "non_truncated": 126, + "padded": 504, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-global_facts|5": { + "hashes": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "d7549fe9ac133643", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_biology|5": { + "hashes": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "b449ae8cd622fb96", + "hash_cont_tokens": "1ddcb86d28cde266" + }, + "truncated": 0, + "non_truncated": 310, + "padded": 1240, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hashes": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "a447bd1574b5e26c", + "hash_cont_tokens": "176c8dcff38c5f8f" + }, + "truncated": 0, + "non_truncated": 203, + "padded": 812, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hashes": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "56312a0c3d85ae90", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hashes": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "5002f4ac8b1562ca", + "hash_cont_tokens": "674fc454bdc5ac93" + }, + "truncated": 0, + "non_truncated": 165, + "padded": 656, + "non_padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_geography|5": { + "hashes": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "b4f9efd054b0149d", + "hash_cont_tokens": "03a5012b916274ea" + }, + "truncated": 0, + "non_truncated": 198, + "padded": 792, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hashes": { + "hash_examples": "d221ec983d143dc3", 
+ "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "6e010d01707b5a01", + "hash_cont_tokens": "873d2aab226ba1d8" + }, + "truncated": 0, + "non_truncated": 193, + "padded": 772, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hashes": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "8011eab91a4417a2", + "hash_cont_tokens": "c583432ad27fcfe0" + }, + "truncated": 0, + "non_truncated": 390, + "padded": 1560, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hashes": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "3a485a40c8432ece", + "hash_cont_tokens": "d7907b61bcb8c123" + }, + "truncated": 0, + "non_truncated": 270, + "padded": 1080, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hashes": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "a7dd9ca4bbda3752", + "hash_cont_tokens": "f47f041de50333b9" + }, + "truncated": 0, + "non_truncated": 238, + "padded": 952, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_physics|5": { + "hashes": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "d7ea631399a73865", + "hash_cont_tokens": "0d56317b3e5eedb5" + }, + "truncated": 0, + "non_truncated": 151, + "padded": 604, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hashes": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "c265a8ab28fdfd92", + "hash_cont_tokens": "09ba1243e7390c0f" + }, + "truncated": 0, + "non_truncated": 545, + "padded": 2180, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hashes": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "9763ecaef4814c21", + "hash_cont_tokens": "9cc29889c3d3f77d" + }, + "truncated": 0, + "non_truncated": 216, + "padded": 864, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hashes": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "c639cce12a46ebad", + "hash_cont_tokens": "cdd0b3dc06d933e5" + }, + "truncated": 0, + "non_truncated": 204, + "padded": 816, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hashes": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "b9762065cce6f3a6", + "hash_cont_tokens": "e02816433ff28daf" + }, + "truncated": 0, + "non_truncated": 237, + "padded": 948, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_aging|5": { + "hashes": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "0d068c05d1befefa", + 
"hash_cont_tokens": "142a4a8a1138a214" + }, + "truncated": 0, + "non_truncated": 223, + "padded": 892, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_sexuality|5": { + "hashes": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "e699adc64e7c4216", + "hash_cont_tokens": "bc54813e809b796d" + }, + "truncated": 0, + "non_truncated": 131, + "padded": 524, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-international_law|5": { + "hashes": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "e5482e1c23c23d35", + "hash_cont_tokens": "8ea8c5ff76a15bca" + }, + "truncated": 0, + "non_truncated": 121, + "padded": 484, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-jurisprudence|5": { + "hashes": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "045fbf083ca82902", + "hash_cont_tokens": "e3a8cd951b6e3469" + }, + "truncated": 0, + "non_truncated": 108, + "padded": 432, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hashes": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "e6b5271422ecbaa8", + "hash_cont_tokens": "3e9e0bdc248fd88a" + }, + "truncated": 0, + "non_truncated": 163, + "padded": 644, + "non_padded": 8, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-machine_learning|5": { + "hashes": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "e719cb83196977d8", + "hash_cont_tokens": "55b12fb138c6a064" + }, + "truncated": 0, + "non_truncated": 112, + "padded": 448, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-management|5": { + "hashes": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "155da0e62b39e804", + "hash_cont_tokens": "a01d6d39a83c4597" + }, + "truncated": 0, + "non_truncated": 103, + "padded": 412, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-marketing|5": { + "hashes": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "0ceac4d4d139f844", + "hash_cont_tokens": "6aeaed4d823c98aa" + }, + "truncated": 0, + "non_truncated": 234, + "padded": 932, + "non_padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-medical_genetics|5": { + "hashes": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "c78cdb3bf161a170", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-miscellaneous|5": { + "hashes": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "d108a883fc3e022f", + "hash_cont_tokens": "9b0ab02a64603081" + }, + "truncated": 0, + "non_truncated": 783, + "padded": 3132, + "non_padded": 0, + "effective_few_shots": 5.0, + 
"num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_disputes|5": { + "hashes": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "0e7b7df82884a2d5", + "hash_cont_tokens": "3b8bbe9108e55ce9" + }, + "truncated": 0, + "non_truncated": 346, + "padded": 1364, + "non_padded": 20, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hashes": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "7c220f5613cd8426", + "hash_cont_tokens": "3e9bfc0362e97330" + }, + "truncated": 0, + "non_truncated": 895, + "padded": 3580, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-nutrition|5": { + "hashes": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "33ea33a584e53dff", + "hash_cont_tokens": "23b2dc6ee2da4cfc" + }, + "truncated": 0, + "non_truncated": 306, + "padded": 1224, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-philosophy|5": { + "hashes": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "42a5f4e298135117", + "hash_cont_tokens": "9f6ff69d23a48783" + }, + "truncated": 0, + "non_truncated": 311, + "padded": 1244, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-prehistory|5": { + "hashes": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "b71af05030cd3f49", + "hash_cont_tokens": "d6458d743d875837" + }, + "truncated": 0, + "non_truncated": 324, + "padded": 1296, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_accounting|5": { + "hashes": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "e9df32a33f85290c", + "hash_cont_tokens": "922a195f53a35662" + }, + "truncated": 0, + "non_truncated": 282, + "padded": 1128, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_law|5": { + "hashes": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "c9f7583fff66d361", + "hash_cont_tokens": "2e590029ef41fbcd" + }, + "truncated": 0, + "non_truncated": 1534, + "padded": 6136, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_medicine|5": { + "hashes": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "40a933f829116f8d", + "hash_cont_tokens": "7cfee54dbddd5a98" + }, + "truncated": 0, + "non_truncated": 272, + "padded": 1088, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_psychology|5": { + "hashes": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "a1398d54792f4b6d", + "hash_cont_tokens": "a86677b2a45c20e1" + }, + "truncated": 0, + "non_truncated": 612, + "padded": 2448, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-public_relations|5": { + "hashes": { + "hash_examples": "0d25072e1761652a", + 
"hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "6d10e7f09fccb09b", + "hash_cont_tokens": "0d756ccaae031757" + }, + "truncated": 0, + "non_truncated": 110, + "padded": 440, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-security_studies|5": { + "hashes": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "32a03f1f22a6e103", + "hash_cont_tokens": "b2229bc2cfbf594b" + }, + "truncated": 0, + "non_truncated": 245, + "padded": 980, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-sociology|5": { + "hashes": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "1de5c52d2b2831d7", + "hash_cont_tokens": "c3a3bdfd177eed5b" + }, + "truncated": 0, + "non_truncated": 201, + "padded": 800, + "non_padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hashes": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "add924961f7f4146", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-virology|5": { + "hashes": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "e0653601c466b1bc", + "hash_cont_tokens": "af8b3658088cb37f" + }, + "truncated": 0, + "non_truncated": 166, + "padded": 664, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-world_religions|5": { + "hashes": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "ac600d612445156d", + "hash_cont_tokens": "060118bef6de4e0a" + }, + "truncated": 0, + "non_truncated": 171, + "padded": 684, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|truthfulqa:mc|0": { + "hashes": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "a03ce28b7fd06aa7", + "hash_cont_tokens": "f5da56a132aab151" + }, + "truncated": 0, + "non_truncated": 817, + "padded": 9996, + "non_padded": 0, + "effective_few_shots": 0.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "72067255e368e24e", + "hash_cont_tokens": "f08975ad6f2d5864" + }, + "truncated": 0, + "non_truncated": 1267, + "padded": 2534, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "bda342e47b5099b2", + "hash_cont_tokens": "63c2290fd89c9ed7" + }, + "truncated": 0, + "non_truncated": 1319, + "padded": 0, + "non_padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "3b7fa57a057f9415", + "hash_full_prompts": "63615fc50fc9417c", + "hash_input_tokens": "6b82379e3861993c", + "hash_cont_tokens": "5938cbc2945154c0" + }, + "truncated": 0, + "non_truncated": 28659, + "padded": 113348, + "non_padded": 1524, + "num_truncated_few_shots": 0 + } +} 
\ No newline at end of file diff --git a/eval-results/hfl/chinese-alpaca-2-13b/results_2023-12-09T16-00-55.681332.json b/eval-results/hfl/chinese-alpaca-2-13b/results_2023-12-09T16-00-55.681332.json new file mode 100644 index 0000000000000000000000000000000000000000..a56f06f52848c2299b65387102645476eed4ad01 --- /dev/null +++ b/eval-results/hfl/chinese-alpaca-2-13b/results_2023-12-09T16-00-55.681332.json @@ -0,0 +1,1409 @@ +{ + "config_general": { + "lighteval_sha": "0e4607eff593f6f842aeaa0e5fa6760f58b9d1e9", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "", + "start_time": 582437.087808415, + "end_time": 592523.650778783, + "total_evaluation_time_secondes": "10086.562970368075", + "model_name": "hfl/chinese-alpaca-2-13b", + "model_sha": "3b2e3895ff83c8892ab20fb8f98754d947879186", + "model_dtype": "torch.float16", + "model_size": "24.77 GB" + }, + "results": { + "harness|arc:challenge|25": { + "acc": 0.5418088737201365, + "acc_stderr": 0.014560220308714697, + "acc_norm": 0.5870307167235495, + "acc_norm_stderr": 0.014388344935398329 + }, + "harness|hellaswag|10": { + "acc": 0.59699263095001, + "acc_stderr": 0.004894997736719051, + "acc_norm": 0.7975502887870942, + "acc_norm_stderr": 0.004010043978333125 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.27, + "acc_stderr": 0.04461960433384741, + "acc_norm": 0.27, + "acc_norm_stderr": 0.04461960433384741 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.4888888888888889, + "acc_stderr": 0.04318275491977976, + "acc_norm": 0.4888888888888889, + "acc_norm_stderr": 0.04318275491977976 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.5921052631578947, + "acc_stderr": 0.039993097127774734, + "acc_norm": 0.5921052631578947, + "acc_norm_stderr": 0.039993097127774734 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.57, + "acc_stderr": 0.049756985195624284, + "acc_norm": 0.57, + "acc_norm_stderr": 0.049756985195624284 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.5886792452830188, + "acc_stderr": 0.030285009259009794, + "acc_norm": 0.5886792452830188, + "acc_norm_stderr": 0.030285009259009794 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.5625, + "acc_stderr": 0.04148415739394154, + "acc_norm": 0.5625, + "acc_norm_stderr": 0.04148415739394154 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.42, + "acc_stderr": 0.049604496374885836, + "acc_norm": 0.42, + "acc_norm_stderr": 0.049604496374885836 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.5, + "acc_stderr": 0.050251890762960605, + "acc_norm": 0.5, + "acc_norm_stderr": 0.050251890762960605 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.31, + "acc_stderr": 0.04648231987117316, + "acc_norm": 0.31, + "acc_norm_stderr": 0.04648231987117316 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.5260115606936416, + "acc_stderr": 0.038073017265045125, + "acc_norm": 0.5260115606936416, + "acc_norm_stderr": 0.038073017265045125 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.35294117647058826, + "acc_stderr": 0.04755129616062946, + "acc_norm": 0.35294117647058826, + "acc_norm_stderr": 0.04755129616062946 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.66, + "acc_stderr": 0.04760952285695237, + "acc_norm": 0.66, + "acc_norm_stderr": 0.04760952285695237 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.37872340425531914, + "acc_stderr": 
0.03170995606040655, + "acc_norm": 0.37872340425531914, + "acc_norm_stderr": 0.03170995606040655 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.2807017543859649, + "acc_stderr": 0.042270544512322, + "acc_norm": 0.2807017543859649, + "acc_norm_stderr": 0.042270544512322 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.5448275862068965, + "acc_stderr": 0.04149886942192117, + "acc_norm": 0.5448275862068965, + "acc_norm_stderr": 0.04149886942192117 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.32275132275132273, + "acc_stderr": 0.024078943243597016, + "acc_norm": 0.32275132275132273, + "acc_norm_stderr": 0.024078943243597016 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.3492063492063492, + "acc_stderr": 0.04263906892795132, + "acc_norm": 0.3492063492063492, + "acc_norm_stderr": 0.04263906892795132 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.33, + "acc_stderr": 0.04725815626252604, + "acc_norm": 0.33, + "acc_norm_stderr": 0.04725815626252604 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.667741935483871, + "acc_stderr": 0.0267955608481228, + "acc_norm": 0.667741935483871, + "acc_norm_stderr": 0.0267955608481228 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.4482758620689655, + "acc_stderr": 0.03499113137676744, + "acc_norm": 0.4482758620689655, + "acc_norm_stderr": 0.03499113137676744 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.62, + "acc_stderr": 0.048783173121456316, + "acc_norm": 0.62, + "acc_norm_stderr": 0.048783173121456316 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.6606060606060606, + "acc_stderr": 0.03697442205031596, + "acc_norm": 0.6606060606060606, + "acc_norm_stderr": 0.03697442205031596 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.696969696969697, + "acc_stderr": 0.032742879140268674, + "acc_norm": 0.696969696969697, + "acc_norm_stderr": 0.032742879140268674 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.7979274611398963, + "acc_stderr": 0.02897908979429673, + "acc_norm": 0.7979274611398963, + "acc_norm_stderr": 0.02897908979429673 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.5102564102564102, + "acc_stderr": 0.025345672221942374, + "acc_norm": 0.5102564102564102, + "acc_norm_stderr": 0.025345672221942374 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.2851851851851852, + "acc_stderr": 0.027528599210340492, + "acc_norm": 0.2851851851851852, + "acc_norm_stderr": 0.027528599210340492 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.5798319327731093, + "acc_stderr": 0.03206183783236152, + "acc_norm": 0.5798319327731093, + "acc_norm_stderr": 0.03206183783236152 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.33112582781456956, + "acc_stderr": 0.038425817186598696, + "acc_norm": 0.33112582781456956, + "acc_norm_stderr": 0.038425817186598696 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.7541284403669725, + "acc_stderr": 0.01846194096870843, + "acc_norm": 0.7541284403669725, + "acc_norm_stderr": 0.01846194096870843 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.4722222222222222, + "acc_stderr": 0.0340470532865388, + "acc_norm": 0.4722222222222222, + "acc_norm_stderr": 0.0340470532865388 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.7549019607843137, + "acc_stderr": 
0.030190282453501943, + "acc_norm": 0.7549019607843137, + "acc_norm_stderr": 0.030190282453501943 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.7088607594936709, + "acc_stderr": 0.029571601065753374, + "acc_norm": 0.7088607594936709, + "acc_norm_stderr": 0.029571601065753374 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.6278026905829597, + "acc_stderr": 0.03244305283008731, + "acc_norm": 0.6278026905829597, + "acc_norm_stderr": 0.03244305283008731 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.6030534351145038, + "acc_stderr": 0.04291135671009224, + "acc_norm": 0.6030534351145038, + "acc_norm_stderr": 0.04291135671009224 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.7355371900826446, + "acc_stderr": 0.04026187527591205, + "acc_norm": 0.7355371900826446, + "acc_norm_stderr": 0.04026187527591205 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.7407407407407407, + "acc_stderr": 0.042365112580946336, + "acc_norm": 0.7407407407407407, + "acc_norm_stderr": 0.042365112580946336 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.6134969325153374, + "acc_stderr": 0.038258255488486076, + "acc_norm": 0.6134969325153374, + "acc_norm_stderr": 0.038258255488486076 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.36607142857142855, + "acc_stderr": 0.0457237235873743, + "acc_norm": 0.36607142857142855, + "acc_norm_stderr": 0.0457237235873743 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.6990291262135923, + "acc_stderr": 0.045416094465039476, + "acc_norm": 0.6990291262135923, + "acc_norm_stderr": 0.045416094465039476 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.811965811965812, + "acc_stderr": 0.025598193686652244, + "acc_norm": 0.811965811965812, + "acc_norm_stderr": 0.025598193686652244 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.53, + "acc_stderr": 0.05016135580465918, + "acc_norm": 0.53, + "acc_norm_stderr": 0.05016135580465918 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.7522349936143039, + "acc_stderr": 0.015438083080568972, + "acc_norm": 0.7522349936143039, + "acc_norm_stderr": 0.015438083080568972 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.6242774566473989, + "acc_stderr": 0.02607431485165708, + "acc_norm": 0.6242774566473989, + "acc_norm_stderr": 0.02607431485165708 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.3865921787709497, + "acc_stderr": 0.016286674879101026, + "acc_norm": 0.3865921787709497, + "acc_norm_stderr": 0.016286674879101026 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.5947712418300654, + "acc_stderr": 0.028110928492809075, + "acc_norm": 0.5947712418300654, + "acc_norm_stderr": 0.028110928492809075 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.6366559485530546, + "acc_stderr": 0.02731684767419271, + "acc_norm": 0.6366559485530546, + "acc_norm_stderr": 0.02731684767419271 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.6018518518518519, + "acc_stderr": 0.027237415094592474, + "acc_norm": 0.6018518518518519, + "acc_norm_stderr": 0.027237415094592474 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.40425531914893614, + "acc_stderr": 0.029275532159704725, + "acc_norm": 0.40425531914893614, + "acc_norm_stderr": 0.029275532159704725 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.4230769230769231, + "acc_stderr": 0.01261820406658839, + "acc_norm": 0.4230769230769231, + "acc_norm_stderr": 0.01261820406658839 + 
}, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.47794117647058826, + "acc_stderr": 0.030343264224213535, + "acc_norm": 0.47794117647058826, + "acc_norm_stderr": 0.030343264224213535 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.5245098039215687, + "acc_stderr": 0.020203517280261436, + "acc_norm": 0.5245098039215687, + "acc_norm_stderr": 0.020203517280261436 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.6090909090909091, + "acc_stderr": 0.046737523336702384, + "acc_norm": 0.6090909090909091, + "acc_norm_stderr": 0.046737523336702384 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.5959183673469388, + "acc_stderr": 0.03141470802586589, + "acc_norm": 0.5959183673469388, + "acc_norm_stderr": 0.03141470802586589 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.6865671641791045, + "acc_stderr": 0.032801882053486456, + "acc_norm": 0.6865671641791045, + "acc_norm_stderr": 0.032801882053486456 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.82, + "acc_stderr": 0.038612291966536934, + "acc_norm": 0.82, + "acc_norm_stderr": 0.038612291966536934 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.463855421686747, + "acc_stderr": 0.03882310850890593, + "acc_norm": 0.463855421686747, + "acc_norm_stderr": 0.03882310850890593 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.7543859649122807, + "acc_stderr": 0.03301405946987251, + "acc_norm": 0.7543859649122807, + "acc_norm_stderr": 0.03301405946987251 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.34761321909424725, + "mc1_stderr": 0.016670769188897303, + "mc2": 0.5022258550236057, + "mc2_stderr": 0.015284175194421176 + }, + "harness|winogrande|5": { + "acc": 0.7561168113654302, + "acc_stderr": 0.012068923278908189 + }, + "harness|gsm8k|5": { + "acc": 0.25018953752843065, + "acc_stderr": 0.011930334350873352 + }, + "all": { + "acc": 0.5502321350314341, + "acc_stderr": 0.033838534455358144, + "acc_norm": 0.5559937862519342, + "acc_norm_stderr": 0.03456092398331123, + "mc1": 0.34761321909424725, + "mc1_stderr": 0.016670769188897303, + "mc2": 0.5022258550236057, + "mc2_stderr": 0.015284175194421176 + } + }, + "versions": { + "all": 0, + "harness|arc:challenge|25": 0, + "harness|gsm8k|5": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + 
"harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "harness|winogrande|5": 0 + }, + "config_tasks": { + "harness|arc:challenge": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + 
"harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|arc:challenge|25": { + "hashes": { + "hash_examples": "17b0cae357c0259e", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "ca48d52265c0051f", + "hash_cont_tokens": "e8abf848493b50f7" + }, + "truncated": 0, + "non_truncated": 1172, + "padded": 4687, + "non_padded": 0, + "effective_few_shots": 25.0, + "num_truncated_few_shots": 0 + }, + "harness|hellaswag|10": { + "hashes": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "4975ded0ed31f702", + "hash_cont_tokens": "9fe0a5c42e1532db" + }, + "truncated": 0, + "non_truncated": 10042, + "padded": 40019, + "non_padded": 149, + "effective_few_shots": 10.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hashes": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "8ff523ec326d5d55", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-anatomy|5": { + "hashes": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "742bd6a389a8ef40", + "hash_cont_tokens": "f11971a765cb609f" + }, + "truncated": 0, + "non_truncated": 135, + "padded": 540, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + 
"harness|hendrycksTest-astronomy|5": { + "hashes": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "e602902c123c2c7f", + "hash_cont_tokens": "440a970fadecdc7b" + }, + "truncated": 0, + "non_truncated": 152, + "padded": 608, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-business_ethics|5": { + "hashes": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "04d2b2c4fd859912", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hashes": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "6080d9f3c5930be0", + "hash_cont_tokens": "7ecd60c25b9bfe5b" + }, + "truncated": 0, + "non_truncated": 265, + "padded": 1060, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_biology|5": { + "hashes": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "873319724ad65589", + "hash_cont_tokens": "875cde3af7a0ee14" + }, + "truncated": 0, + "non_truncated": 144, + "padded": 564, + "non_padded": 12, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_chemistry|5": { + "hashes": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "8366d04d12b154a7", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_computer_science|5": { + "hashes": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "1724a282fb269fd7", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_mathematics|5": { + "hashes": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "b7aa815781eae172", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_medicine|5": { + "hashes": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "0003d13e86bc8c1a", + "hash_cont_tokens": "702fb6d82ff0d6ac" + }, + "truncated": 0, + "non_truncated": 173, + "padded": 692, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_physics|5": { + "hashes": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "32b28762dd077c78", + "hash_cont_tokens": "f7b8097afc16a47c" + }, + "truncated": 0, + "non_truncated": 102, + "padded": 404, + "non_padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-computer_security|5": { + "hashes": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + 
"hash_input_tokens": "c44c8100ac118ab8", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hashes": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "761c7ce187b3338a", + "hash_cont_tokens": "aa0e8bc655f2f641" + }, + "truncated": 0, + "non_truncated": 235, + "padded": 940, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-econometrics|5": { + "hashes": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "dae74024ebc12b2b", + "hash_cont_tokens": "b1cc6e7e9fcd3827" + }, + "truncated": 0, + "non_truncated": 114, + "padded": 456, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hashes": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "2471bd9b6de2f391", + "hash_cont_tokens": "2425a3f084a591ef" + }, + "truncated": 0, + "non_truncated": 145, + "padded": 580, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hashes": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "2da3f8d7d1515cc6", + "hash_cont_tokens": "bd87bf0c060fd925" + }, + "truncated": 0, + "non_truncated": 378, + "padded": 1512, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-formal_logic|5": { + "hashes": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "907de61bbe46dada", + "hash_cont_tokens": "eb8932890e0605db" + }, + "truncated": 0, + "non_truncated": 126, + "padded": 504, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-global_facts|5": { + "hashes": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "d7549fe9ac133643", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_biology|5": { + "hashes": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "b449ae8cd622fb96", + "hash_cont_tokens": "1ddcb86d28cde266" + }, + "truncated": 0, + "non_truncated": 310, + "padded": 1240, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hashes": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "a447bd1574b5e26c", + "hash_cont_tokens": "176c8dcff38c5f8f" + }, + "truncated": 0, + "non_truncated": 203, + "padded": 812, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hashes": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "56312a0c3d85ae90", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non_truncated": 100, + 
"padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hashes": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "5002f4ac8b1562ca", + "hash_cont_tokens": "674fc454bdc5ac93" + }, + "truncated": 0, + "non_truncated": 165, + "padded": 656, + "non_padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_geography|5": { + "hashes": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "b4f9efd054b0149d", + "hash_cont_tokens": "03a5012b916274ea" + }, + "truncated": 0, + "non_truncated": 198, + "padded": 792, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hashes": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "6e010d01707b5a01", + "hash_cont_tokens": "873d2aab226ba1d8" + }, + "truncated": 0, + "non_truncated": 193, + "padded": 772, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hashes": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "8011eab91a4417a2", + "hash_cont_tokens": "c583432ad27fcfe0" + }, + "truncated": 0, + "non_truncated": 390, + "padded": 1560, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hashes": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "3a485a40c8432ece", + "hash_cont_tokens": "d7907b61bcb8c123" + }, + "truncated": 0, + "non_truncated": 270, + "padded": 1080, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hashes": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "a7dd9ca4bbda3752", + "hash_cont_tokens": "f47f041de50333b9" + }, + "truncated": 0, + "non_truncated": 238, + "padded": 952, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_physics|5": { + "hashes": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "d7ea631399a73865", + "hash_cont_tokens": "0d56317b3e5eedb5" + }, + "truncated": 0, + "non_truncated": 151, + "padded": 604, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hashes": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "c265a8ab28fdfd92", + "hash_cont_tokens": "09ba1243e7390c0f" + }, + "truncated": 0, + "non_truncated": 545, + "padded": 2180, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hashes": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "9763ecaef4814c21", + "hash_cont_tokens": "9cc29889c3d3f77d" + }, + "truncated": 0, + "non_truncated": 216, + "padded": 864, + "non_padded": 0, + "effective_few_shots": 5.0, + 
"num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hashes": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "c639cce12a46ebad", + "hash_cont_tokens": "cdd0b3dc06d933e5" + }, + "truncated": 0, + "non_truncated": 204, + "padded": 816, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hashes": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "b9762065cce6f3a6", + "hash_cont_tokens": "e02816433ff28daf" + }, + "truncated": 0, + "non_truncated": 237, + "padded": 948, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_aging|5": { + "hashes": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "0d068c05d1befefa", + "hash_cont_tokens": "142a4a8a1138a214" + }, + "truncated": 0, + "non_truncated": 223, + "padded": 892, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_sexuality|5": { + "hashes": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "e699adc64e7c4216", + "hash_cont_tokens": "bc54813e809b796d" + }, + "truncated": 0, + "non_truncated": 131, + "padded": 524, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-international_law|5": { + "hashes": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "e5482e1c23c23d35", + "hash_cont_tokens": "8ea8c5ff76a15bca" + }, + "truncated": 0, + "non_truncated": 121, + "padded": 484, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-jurisprudence|5": { + "hashes": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "045fbf083ca82902", + "hash_cont_tokens": "e3a8cd951b6e3469" + }, + "truncated": 0, + "non_truncated": 108, + "padded": 432, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hashes": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "e6b5271422ecbaa8", + "hash_cont_tokens": "3e9e0bdc248fd88a" + }, + "truncated": 0, + "non_truncated": 163, + "padded": 644, + "non_padded": 8, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-machine_learning|5": { + "hashes": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "e719cb83196977d8", + "hash_cont_tokens": "55b12fb138c6a064" + }, + "truncated": 0, + "non_truncated": 112, + "padded": 448, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-management|5": { + "hashes": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "155da0e62b39e804", + "hash_cont_tokens": "a01d6d39a83c4597" + }, + "truncated": 0, + "non_truncated": 103, + "padded": 412, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-marketing|5": { + "hashes": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": 
"77562bef997c7650", + "hash_input_tokens": "0ceac4d4d139f844", + "hash_cont_tokens": "6aeaed4d823c98aa" + }, + "truncated": 0, + "non_truncated": 234, + "padded": 932, + "non_padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-medical_genetics|5": { + "hashes": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "c78cdb3bf161a170", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-miscellaneous|5": { + "hashes": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "d108a883fc3e022f", + "hash_cont_tokens": "9b0ab02a64603081" + }, + "truncated": 0, + "non_truncated": 783, + "padded": 3132, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_disputes|5": { + "hashes": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "0e7b7df82884a2d5", + "hash_cont_tokens": "3b8bbe9108e55ce9" + }, + "truncated": 0, + "non_truncated": 346, + "padded": 1364, + "non_padded": 20, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hashes": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "7c220f5613cd8426", + "hash_cont_tokens": "3e9bfc0362e97330" + }, + "truncated": 0, + "non_truncated": 895, + "padded": 3580, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-nutrition|5": { + "hashes": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "33ea33a584e53dff", + "hash_cont_tokens": "23b2dc6ee2da4cfc" + }, + "truncated": 0, + "non_truncated": 306, + "padded": 1224, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-philosophy|5": { + "hashes": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "42a5f4e298135117", + "hash_cont_tokens": "9f6ff69d23a48783" + }, + "truncated": 0, + "non_truncated": 311, + "padded": 1244, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-prehistory|5": { + "hashes": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "b71af05030cd3f49", + "hash_cont_tokens": "d6458d743d875837" + }, + "truncated": 0, + "non_truncated": 324, + "padded": 1296, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_accounting|5": { + "hashes": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "e9df32a33f85290c", + "hash_cont_tokens": "922a195f53a35662" + }, + "truncated": 0, + "non_truncated": 282, + "padded": 1128, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_law|5": { + "hashes": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "c9f7583fff66d361", + "hash_cont_tokens": "2e590029ef41fbcd" + }, + "truncated": 0, + "non_truncated": 1534, + "padded": 6136, + 
"non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_medicine|5": { + "hashes": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "40a933f829116f8d", + "hash_cont_tokens": "7cfee54dbddd5a98" + }, + "truncated": 0, + "non_truncated": 272, + "padded": 1088, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_psychology|5": { + "hashes": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "a1398d54792f4b6d", + "hash_cont_tokens": "a86677b2a45c20e1" + }, + "truncated": 0, + "non_truncated": 612, + "padded": 2448, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-public_relations|5": { + "hashes": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "6d10e7f09fccb09b", + "hash_cont_tokens": "0d756ccaae031757" + }, + "truncated": 0, + "non_truncated": 110, + "padded": 440, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-security_studies|5": { + "hashes": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "32a03f1f22a6e103", + "hash_cont_tokens": "b2229bc2cfbf594b" + }, + "truncated": 0, + "non_truncated": 245, + "padded": 980, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-sociology|5": { + "hashes": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "1de5c52d2b2831d7", + "hash_cont_tokens": "c3a3bdfd177eed5b" + }, + "truncated": 0, + "non_truncated": 201, + "padded": 800, + "non_padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hashes": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "add924961f7f4146", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-virology|5": { + "hashes": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "e0653601c466b1bc", + "hash_cont_tokens": "af8b3658088cb37f" + }, + "truncated": 0, + "non_truncated": 166, + "padded": 664, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-world_religions|5": { + "hashes": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "ac600d612445156d", + "hash_cont_tokens": "060118bef6de4e0a" + }, + "truncated": 0, + "non_truncated": 171, + "padded": 684, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|truthfulqa:mc|0": { + "hashes": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "a03ce28b7fd06aa7", + "hash_cont_tokens": "f5da56a132aab151" + }, + "truncated": 0, + "non_truncated": 817, + "padded": 9996, + "non_padded": 0, + "effective_few_shots": 0.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + 
"hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "72067255e368e24e", + "hash_cont_tokens": "f08975ad6f2d5864" + }, + "truncated": 0, + "non_truncated": 1267, + "padded": 2534, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "bda342e47b5099b2", + "hash_cont_tokens": "3b02c7e344922b12" + }, + "truncated": 0, + "non_truncated": 1319, + "padded": 0, + "non_padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "3b7fa57a057f9415", + "hash_full_prompts": "63615fc50fc9417c", + "hash_input_tokens": "6b82379e3861993c", + "hash_cont_tokens": "7f5d51332bb0f298" + }, + "truncated": 0, + "non_truncated": 28659, + "padded": 113348, + "non_padded": 1524, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/hyunseoki/ko-en-llama2-13b/results_2023-10-04T07-33-17.210034.json b/eval-results/hyunseoki/ko-en-llama2-13b/results_2023-10-04T07-33-17.210034.json new file mode 100644 index 0000000000000000000000000000000000000000..3a8b68af680afc636dfc6fce92f2e362e435f87f --- /dev/null +++ b/eval-results/hyunseoki/ko-en-llama2-13b/results_2023-10-04T07-33-17.210034.json @@ -0,0 +1,1367 @@ +{ + "config_general": { + "model_name": "hyunseoki/ko-en-llama2-13b", + "model_sha": "2768cf6f955b65868ccbb20658e2cc444b2f3be9", + "model_size": "24.28 GB", + "model_dtype": "torch.float16", + "lighteval_sha": "0f318ecf002208468154899217b3ba7c6ae09374", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|arc:challenge|25": { + "acc": 0.5392491467576792, + "acc_stderr": 0.014566303676636588, + "acc_norm": 0.5819112627986348, + "acc_norm_stderr": 0.014413988396996077 + }, + "harness|hellaswag|10": { + "acc": 0.6091416052579167, + "acc_stderr": 0.004869455150933827, + "acc_norm": 0.8188607847042422, + "acc_norm_stderr": 0.0038434637920379223 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.35, + "acc_stderr": 0.0479372485441102, + "acc_norm": 0.35, + "acc_norm_stderr": 0.0479372485441102 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.562962962962963, + "acc_stderr": 0.04284958639753401, + "acc_norm": 0.562962962962963, + "acc_norm_stderr": 0.04284958639753401 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.5526315789473685, + "acc_stderr": 0.040463368839782514, + "acc_norm": 0.5526315789473685, + "acc_norm_stderr": 0.040463368839782514 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.51, + "acc_stderr": 0.05024183937956912, + "acc_norm": 0.51, + "acc_norm_stderr": 0.05024183937956912 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.5433962264150943, + "acc_stderr": 0.03065674869673943, + "acc_norm": 0.5433962264150943, + "acc_norm_stderr": 0.03065674869673943 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.5902777777777778, + "acc_stderr": 0.04112490974670787, + "acc_norm": 0.5902777777777778, + "acc_norm_stderr": 0.04112490974670787 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.38, + "acc_stderr": 0.048783173121456316, + "acc_norm": 0.38, + "acc_norm_stderr": 0.048783173121456316 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.39, + "acc_stderr": 0.04902071300001974, + "acc_norm": 0.39, + 
"acc_norm_stderr": 0.04902071300001974 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.29, + "acc_stderr": 0.04560480215720684, + "acc_norm": 0.29, + "acc_norm_stderr": 0.04560480215720684 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.44508670520231214, + "acc_stderr": 0.03789401760283648, + "acc_norm": 0.44508670520231214, + "acc_norm_stderr": 0.03789401760283648 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.22549019607843138, + "acc_stderr": 0.041583075330832865, + "acc_norm": 0.22549019607843138, + "acc_norm_stderr": 0.041583075330832865 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.63, + "acc_stderr": 0.04852365870939099, + "acc_norm": 0.63, + "acc_norm_stderr": 0.04852365870939099 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.4127659574468085, + "acc_stderr": 0.03218471141400351, + "acc_norm": 0.4127659574468085, + "acc_norm_stderr": 0.03218471141400351 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.30701754385964913, + "acc_stderr": 0.0433913832257986, + "acc_norm": 0.30701754385964913, + "acc_norm_stderr": 0.0433913832257986 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.4827586206896552, + "acc_stderr": 0.04164188720169377, + "acc_norm": 0.4827586206896552, + "acc_norm_stderr": 0.04164188720169377 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.3201058201058201, + "acc_stderr": 0.024026846392873506, + "acc_norm": 0.3201058201058201, + "acc_norm_stderr": 0.024026846392873506 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.25396825396825395, + "acc_stderr": 0.038932596106046755, + "acc_norm": 0.25396825396825395, + "acc_norm_stderr": 0.038932596106046755 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.28, + "acc_stderr": 0.045126085985421255, + "acc_norm": 0.28, + "acc_norm_stderr": 0.045126085985421255 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.6193548387096774, + "acc_stderr": 0.027621717832907036, + "acc_norm": 0.6193548387096774, + "acc_norm_stderr": 0.027621717832907036 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.4433497536945813, + "acc_stderr": 0.03495334582162934, + "acc_norm": 0.4433497536945813, + "acc_norm_stderr": 0.03495334582162934 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.46, + "acc_stderr": 0.05009082659620333, + "acc_norm": 0.46, + "acc_norm_stderr": 0.05009082659620333 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.6181818181818182, + "acc_stderr": 0.037937131711656344, + "acc_norm": 0.6181818181818182, + "acc_norm_stderr": 0.037937131711656344 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.702020202020202, + "acc_stderr": 0.03258630383836556, + "acc_norm": 0.702020202020202, + "acc_norm_stderr": 0.03258630383836556 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.7564766839378239, + "acc_stderr": 0.030975436386845436, + "acc_norm": 0.7564766839378239, + "acc_norm_stderr": 0.030975436386845436 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.47692307692307695, + "acc_stderr": 0.025323990861736118, + "acc_norm": 0.47692307692307695, + "acc_norm_stderr": 0.025323990861736118 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.2814814814814815, + "acc_stderr": 0.027420019350945277, + "acc_norm": 0.2814814814814815, + "acc_norm_stderr": 0.027420019350945277 + }, + 
"harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.542016806722689, + "acc_stderr": 0.03236361111951941, + "acc_norm": 0.542016806722689, + "acc_norm_stderr": 0.03236361111951941 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.33774834437086093, + "acc_stderr": 0.03861557546255169, + "acc_norm": 0.33774834437086093, + "acc_norm_stderr": 0.03861557546255169 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.7009174311926606, + "acc_stderr": 0.019630417285415175, + "acc_norm": 0.7009174311926606, + "acc_norm_stderr": 0.019630417285415175 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.41203703703703703, + "acc_stderr": 0.03356787758160835, + "acc_norm": 0.41203703703703703, + "acc_norm_stderr": 0.03356787758160835 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.6813725490196079, + "acc_stderr": 0.0327028718148208, + "acc_norm": 0.6813725490196079, + "acc_norm_stderr": 0.0327028718148208 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.6624472573839663, + "acc_stderr": 0.030781549102026223, + "acc_norm": 0.6624472573839663, + "acc_norm_stderr": 0.030781549102026223 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.5964125560538116, + "acc_stderr": 0.03292802819330315, + "acc_norm": 0.5964125560538116, + "acc_norm_stderr": 0.03292802819330315 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.6564885496183206, + "acc_stderr": 0.041649760719448786, + "acc_norm": 0.6564885496183206, + "acc_norm_stderr": 0.041649760719448786 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.6446280991735537, + "acc_stderr": 0.0436923632657398, + "acc_norm": 0.6446280991735537, + "acc_norm_stderr": 0.0436923632657398 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.6296296296296297, + "acc_stderr": 0.04668408033024931, + "acc_norm": 0.6296296296296297, + "acc_norm_stderr": 0.04668408033024931 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.6380368098159509, + "acc_stderr": 0.037757007291414416, + "acc_norm": 0.6380368098159509, + "acc_norm_stderr": 0.037757007291414416 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.2857142857142857, + "acc_stderr": 0.04287858751340456, + "acc_norm": 0.2857142857142857, + "acc_norm_stderr": 0.04287858751340456 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.7475728155339806, + "acc_stderr": 0.04301250399690878, + "acc_norm": 0.7475728155339806, + "acc_norm_stderr": 0.04301250399690878 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.7478632478632479, + "acc_stderr": 0.02844796547623102, + "acc_norm": 0.7478632478632479, + "acc_norm_stderr": 0.02844796547623102 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.51, + "acc_stderr": 0.05024183937956912, + "acc_norm": 0.51, + "acc_norm_stderr": 0.05024183937956912 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.7292464878671775, + "acc_stderr": 0.015889888362560486, + "acc_norm": 0.7292464878671775, + "acc_norm_stderr": 0.015889888362560486 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.6040462427745664, + "acc_stderr": 0.02632981334194625, + "acc_norm": 0.6040462427745664, + "acc_norm_stderr": 0.02632981334194625 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.2670391061452514, + "acc_stderr": 0.014796502622562557, + "acc_norm": 0.2670391061452514, + "acc_norm_stderr": 0.014796502622562557 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 
0.6045751633986928, + "acc_stderr": 0.02799672318063146, + "acc_norm": 0.6045751633986928, + "acc_norm_stderr": 0.02799672318063146 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.6141479099678456, + "acc_stderr": 0.027648149599751464, + "acc_norm": 0.6141479099678456, + "acc_norm_stderr": 0.027648149599751464 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.6141975308641975, + "acc_stderr": 0.027085401226132143, + "acc_norm": 0.6141975308641975, + "acc_norm_stderr": 0.027085401226132143 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.4078014184397163, + "acc_stderr": 0.029316011776343555, + "acc_norm": 0.4078014184397163, + "acc_norm_stderr": 0.029316011776343555 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.38461538461538464, + "acc_stderr": 0.012425548416302942, + "acc_norm": 0.38461538461538464, + "acc_norm_stderr": 0.012425548416302942 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.4375, + "acc_stderr": 0.030134614954403924, + "acc_norm": 0.4375, + "acc_norm_stderr": 0.030134614954403924 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.477124183006536, + "acc_stderr": 0.02020665318788479, + "acc_norm": 0.477124183006536, + "acc_norm_stderr": 0.02020665318788479 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.5727272727272728, + "acc_stderr": 0.04738198703545483, + "acc_norm": 0.5727272727272728, + "acc_norm_stderr": 0.04738198703545483 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.5755102040816327, + "acc_stderr": 0.031642094879429414, + "acc_norm": 0.5755102040816327, + "acc_norm_stderr": 0.031642094879429414 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.7263681592039801, + "acc_stderr": 0.03152439186555402, + "acc_norm": 0.7263681592039801, + "acc_norm_stderr": 0.03152439186555402 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.77, + "acc_stderr": 0.042295258468165065, + "acc_norm": 0.77, + "acc_norm_stderr": 0.042295258468165065 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.4397590361445783, + "acc_stderr": 0.03864139923699122, + "acc_norm": 0.4397590361445783, + "acc_norm_stderr": 0.03864139923699122 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.7485380116959064, + "acc_stderr": 0.033275044238468436, + "acc_norm": 0.7485380116959064, + "acc_norm_stderr": 0.033275044238468436 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.26193390452876375, + "mc1_stderr": 0.015392118805015023, + "mc2": 0.3996110091058917, + "mc2_stderr": 0.013538590385255279 + }, + "all": { + "acc": 0.5220122335674698, + "acc_stderr": 0.03470974074584197, + "acc_norm": 0.5262898826435255, + "acc_norm_stderr": 0.03468976944688372, + "mc1": 0.26193390452876375, + "mc1_stderr": 0.015392118805015023, + "mc2": 0.3996110091058917, + "mc2_stderr": 0.013538590385255279 + } + }, + "versions": { + "harness|arc:challenge|25": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + 
"harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "all": 0 + }, + "config_tasks": { + "harness|arc:challenge": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM 
Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task" + }, + "summary_tasks": { + "harness|arc:challenge|25": { + "hashes": { + "hash_examples": "17b0cae357c0259e", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "61571bf68d6d89aa", + "hash_cont_tokens": "e8abf848493b50f7" + }, + "truncated": 0, + "non-truncated": 4687, + "padded": 4687, + "non-padded": 0, + "effective_few_shots": 25.0, + "num_truncated_few_shots": 0 + }, + "harness|hellaswag|10": { + "hashes": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "29906669b1c7054a", + "hash_cont_tokens": "9fe0a5c42e1532db" + }, + "truncated": 0, + "non-truncated": 40168, + "padded": 40113, + "non-padded": 55, + "effective_few_shots": 10.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hashes": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "c54ff61ad0273dd7", + "hash_cont_tokens": 
"50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-anatomy|5": { + "hashes": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "be31a1e22aef5f90", + "hash_cont_tokens": "f11971a765cb609f" + }, + "truncated": 0, + "non-truncated": 540, + "padded": 540, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-astronomy|5": { + "hashes": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "277a7b1fad566940", + "hash_cont_tokens": "440a970fadecdc7b" + }, + "truncated": 0, + "non-truncated": 608, + "padded": 608, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-business_ethics|5": { + "hashes": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "ba552605bc116de5", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hashes": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "428c7563d0b98ab9", + "hash_cont_tokens": "7ecd60c25b9bfe5b" + }, + "truncated": 0, + "non-truncated": 1060, + "padded": 1060, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_biology|5": { + "hashes": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "da036601573942e2", + "hash_cont_tokens": "875cde3af7a0ee14" + }, + "truncated": 0, + "non-truncated": 576, + "padded": 576, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_chemistry|5": { + "hashes": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "94e0196d6aded13d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_computer_science|5": { + "hashes": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "6e4d0f4a8d36690b", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_mathematics|5": { + "hashes": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "614054d17109a25d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_medicine|5": { + "hashes": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "1d633b3cc0524ba8", + "hash_cont_tokens": "702fb6d82ff0d6ac" + }, + "truncated": 0, + "non-truncated": 692, + "padded": 692, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 
+ }, + "harness|hendrycksTest-college_physics|5": { + "hashes": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "5421d9a1af86cbd4", + "hash_cont_tokens": "f7b8097afc16a47c" + }, + "truncated": 0, + "non-truncated": 408, + "padded": 408, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-computer_security|5": { + "hashes": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "5e6b70ecb333cf18", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hashes": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "c2ef11a87264ceed", + "hash_cont_tokens": "aa0e8bc655f2f641" + }, + "truncated": 0, + "non-truncated": 940, + "padded": 940, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-econometrics|5": { + "hashes": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "ecaccd912a4c3978", + "hash_cont_tokens": "b1cc6e7e9fcd3827" + }, + "truncated": 0, + "non-truncated": 456, + "padded": 456, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hashes": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "1590c84291399be8", + "hash_cont_tokens": "2425a3f084a591ef" + }, + "truncated": 0, + "non-truncated": 580, + "padded": 580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hashes": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "3269597f715b0da1", + "hash_cont_tokens": "bd87bf0c060fd925" + }, + "truncated": 0, + "non-truncated": 1512, + "padded": 1512, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-formal_logic|5": { + "hashes": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "a2800d20f3ab8d7c", + "hash_cont_tokens": "eb8932890e0605db" + }, + "truncated": 0, + "non-truncated": 504, + "padded": 504, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-global_facts|5": { + "hashes": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "94ed44b3772505ad", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_biology|5": { + "hashes": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "24423acb928db768", + "hash_cont_tokens": "1ddcb86d28cde266" + }, + "truncated": 0, + "non-truncated": 1240, + "padded": 1240, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hashes": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": 
"0e3e5f5d9246482a", + "hash_input_tokens": "831ff35c474e5cef", + "hash_cont_tokens": "176c8dcff38c5f8f" + }, + "truncated": 0, + "non-truncated": 812, + "padded": 812, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hashes": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "8c34e0f2bda77358", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hashes": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "f1f73dd687da18d7", + "hash_cont_tokens": "674fc454bdc5ac93" + }, + "truncated": 660, + "non-truncated": 0, + "padded": 0, + "non-padded": 660, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_geography|5": { + "hashes": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "7c5547c7da5bc793", + "hash_cont_tokens": "03a5012b916274ea" + }, + "truncated": 0, + "non-truncated": 792, + "padded": 792, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hashes": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "f62991cb6a496b05", + "hash_cont_tokens": "873d2aab226ba1d8" + }, + "truncated": 0, + "non-truncated": 772, + "padded": 772, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hashes": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "4cef2aff6e3d59ed", + "hash_cont_tokens": "c583432ad27fcfe0" + }, + "truncated": 0, + "non-truncated": 1560, + "padded": 1560, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hashes": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "6e2577ea4082ed2b", + "hash_cont_tokens": "d7907b61bcb8c123" + }, + "truncated": 0, + "non-truncated": 1080, + "padded": 1080, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hashes": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "c5fc9aeb1079c8e4", + "hash_cont_tokens": "f47f041de50333b9" + }, + "truncated": 0, + "non-truncated": 952, + "padded": 952, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_physics|5": { + "hashes": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "555fc385cffa84ca", + "hash_cont_tokens": "0d56317b3e5eedb5" + }, + "truncated": 0, + "non-truncated": 604, + "padded": 604, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hashes": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "febd23cbf9973b7f", + 
"hash_cont_tokens": "09ba1243e7390c0f" + }, + "truncated": 0, + "non-truncated": 2180, + "padded": 2180, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hashes": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "424b02981230ee83", + "hash_cont_tokens": "9cc29889c3d3f77d" + }, + "truncated": 0, + "non-truncated": 864, + "padded": 864, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hashes": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "50c9ff438c85a69e", + "hash_cont_tokens": "cdd0b3dc06d933e5" + }, + "truncated": 816, + "non-truncated": 0, + "padded": 0, + "non-padded": 816, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hashes": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "054824cc474caef5", + "hash_cont_tokens": "e02816433ff28daf" + }, + "truncated": 8, + "non-truncated": 940, + "padded": 940, + "non-padded": 8, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_aging|5": { + "hashes": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "541a75f071dcf579", + "hash_cont_tokens": "142a4a8a1138a214" + }, + "truncated": 0, + "non-truncated": 892, + "padded": 892, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_sexuality|5": { + "hashes": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "04269e5c5a257dd9", + "hash_cont_tokens": "bc54813e809b796d" + }, + "truncated": 0, + "non-truncated": 524, + "padded": 524, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-international_law|5": { + "hashes": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "d93ba9d9d38e4397", + "hash_cont_tokens": "8ea8c5ff76a15bca" + }, + "truncated": 0, + "non-truncated": 484, + "padded": 484, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-jurisprudence|5": { + "hashes": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "9eeaccd2698b4f5a", + "hash_cont_tokens": "e3a8cd951b6e3469" + }, + "truncated": 0, + "non-truncated": 432, + "padded": 432, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hashes": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "b4f08f544f2b7576", + "hash_cont_tokens": "3e9e0bdc248fd88a" + }, + "truncated": 0, + "non-truncated": 652, + "padded": 648, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-machine_learning|5": { + "hashes": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "900c2a51f1174b9f", + "hash_cont_tokens": "55b12fb138c6a064" + }, + "truncated": 0, + "non-truncated": 448, + "padded": 448, + "non-padded": 0, + 
"effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-management|5": { + "hashes": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "6b36efb4689c6eca", + "hash_cont_tokens": "a01d6d39a83c4597" + }, + "truncated": 0, + "non-truncated": 412, + "padded": 412, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-marketing|5": { + "hashes": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "2aaac78a0cfed47a", + "hash_cont_tokens": "6aeaed4d823c98aa" + }, + "truncated": 0, + "non-truncated": 936, + "padded": 936, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-medical_genetics|5": { + "hashes": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "886ca823b41c094a", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-miscellaneous|5": { + "hashes": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "72fd71de7675e7d0", + "hash_cont_tokens": "9b0ab02a64603081" + }, + "truncated": 0, + "non-truncated": 3132, + "padded": 3132, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_disputes|5": { + "hashes": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "f3ca0dd8e7a1eb09", + "hash_cont_tokens": "3b8bbe9108e55ce9" + }, + "truncated": 0, + "non-truncated": 1384, + "padded": 1354, + "non-padded": 30, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hashes": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "3e793631e951f23c", + "hash_cont_tokens": "3e9bfc0362e97330" + }, + "truncated": 0, + "non-truncated": 3580, + "padded": 3580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-nutrition|5": { + "hashes": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "59753c2144ea93af", + "hash_cont_tokens": "23b2dc6ee2da4cfc" + }, + "truncated": 0, + "non-truncated": 1224, + "padded": 1224, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-philosophy|5": { + "hashes": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "bd8d3dbed15a8c34", + "hash_cont_tokens": "9f6ff69d23a48783" + }, + "truncated": 0, + "non-truncated": 1244, + "padded": 1244, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-prehistory|5": { + "hashes": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "3573cd87facbb7c5", + "hash_cont_tokens": "d6458d743d875837" + }, + "truncated": 0, + "non-truncated": 1296, + "padded": 1296, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_accounting|5": { + "hashes": { + "hash_examples": "8d377597916cd07e", + 
"hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "17e721bc1a7cbb47", + "hash_cont_tokens": "922a195f53a35662" + }, + "truncated": 0, + "non-truncated": 1128, + "padded": 1128, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_law|5": { + "hashes": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "9178e10bd0763ec4", + "hash_cont_tokens": "2e590029ef41fbcd" + }, + "truncated": 604, + "non-truncated": 5532, + "padded": 5524, + "non-padded": 612, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_medicine|5": { + "hashes": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "f5a22012a54f70ea", + "hash_cont_tokens": "7cfee54dbddd5a98" + }, + "truncated": 0, + "non-truncated": 1088, + "padded": 1088, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_psychology|5": { + "hashes": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "0dfb73a8eb3f692c", + "hash_cont_tokens": "a86677b2a45c20e1" + }, + "truncated": 0, + "non-truncated": 2448, + "padded": 2448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-public_relations|5": { + "hashes": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "1710c6ba4c9f3cbd", + "hash_cont_tokens": "0d756ccaae031757" + }, + "truncated": 0, + "non-truncated": 440, + "padded": 440, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-security_studies|5": { + "hashes": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "d49711415961ced7", + "hash_cont_tokens": "b2229bc2cfbf594b" + }, + "truncated": 0, + "non-truncated": 980, + "padded": 980, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-sociology|5": { + "hashes": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "828999f7624cbe7e", + "hash_cont_tokens": "c3a3bdfd177eed5b" + }, + "truncated": 0, + "non-truncated": 804, + "padded": 804, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hashes": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "42054621e718dbee", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-virology|5": { + "hashes": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "6c4f0aa4dc859c04", + "hash_cont_tokens": "af8b3658088cb37f" + }, + "truncated": 0, + "non-truncated": 664, + "padded": 664, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-world_religions|5": { + "hashes": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "6c75d44e092ff24f", + "hash_cont_tokens": "060118bef6de4e0a" + }, + "truncated": 0, + 
"non-truncated": 684, + "padded": 684, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|truthfulqa:mc|0": { + "hashes": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "2738d7ed7075faa7", + "hash_cont_tokens": "f5da56a132aab151" + }, + "truncated": 0, + "non-truncated": 9996, + "padded": 9996, + "non-padded": 0, + "effective_few_shots": 0.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "d84d18e9a963753d", + "hash_full_prompts": "12b540783521a8e6", + "hash_input_tokens": "6fecf578c508db6a", + "hash_cont_tokens": "71d56183130fecbd" + }, + "total_evaluation_time_secondes": "3940.88108253479", + "truncated": 2088, + "non-truncated": 108931, + "padded": 108834, + "non-padded": 2185, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/hyunseoki/ko-en-llama2-13b/results_2023-10-27T07-23-26.353656.json b/eval-results/hyunseoki/ko-en-llama2-13b/results_2023-10-27T07-23-26.353656.json new file mode 100644 index 0000000000000000000000000000000000000000..f250feb62a03206b2957e3e6175006ccc083af7b --- /dev/null +++ b/eval-results/hyunseoki/ko-en-llama2-13b/results_2023-10-27T07-23-26.353656.json @@ -0,0 +1,107 @@ +{ + "config_general": { + "model_name": "hyunseoki/ko-en-llama2-13b", + "model_sha": "2768cf6f955b65868ccbb20658e2cc444b2f3be9", + "model_size": "24.28 GB", + "model_dtype": "torch.float16", + "lighteval_sha": "0f318ecf002208468154899217b3ba7c6ae09374", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|drop|3": { + "em": 0.28114513422818793, + "em_stderr": 0.004603896433799628, + "f1": 0.3260591442953026, + "f1_stderr": 0.004539391567050269 + }, + "harness|gsm8k|5": { + "acc": 0.0075815011372251705, + "acc_stderr": 0.002389281512077218 + }, + "harness|winogrande|5": { + "acc": 0.7482241515390686, + "acc_stderr": 0.012198489100259776 + }, + "all": { + "em": 0.28114513422818793, + "em_stderr": 0.004603896433799628, + "f1": 0.3260591442953026, + "f1_stderr": 0.004539391567050269, + "acc": 0.3779028263381469, + "acc_stderr": 0.007293885306168497 + } + }, + "versions": { + "harness|drop|3": 1, + "harness|gsm8k|5": 0, + "harness|winogrande|5": 0, + "all": 0 + }, + "config_tasks": { + "harness|drop": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|drop|3": { + "hashes": { + "hash_examples": "1d27416e8324e9a3", + "hash_full_prompts": "a5513ff9a741b385", + "hash_input_tokens": "61b608e0b5ceed76", + "hash_cont_tokens": "3c68514fc59a97e3" + }, + "truncated": 1263, + "non-truncated": 8273, + "padded": 0, + "non-padded": 9536, + "effective_few_shots": 3.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "bda342e47b5099b2", + "hash_cont_tokens": "873a249ce873391d" + }, + "truncated": 0, + "non-truncated": 1319, + "padded": 0, + "non-padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "c0bedf98cb040854", + "hash_cont_tokens": "f08975ad6f2d5864" + }, + "truncated": 0, + "non-truncated": 2534, + "padded": 2432, + "non-padded": 102, 
+ "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "9b4d8993161e637d", + "hash_full_prompts": "08215e527b7e60a5", + "hash_input_tokens": "80afe720f936f8d2", + "hash_cont_tokens": "9bbc632d087a5651" + }, + "total_evaluation_time_secondes": "5774.394629240036", + "truncated": 1263, + "non-truncated": 12126, + "padded": 2432, + "non-padded": 10957, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/hyunseoki/ko-ref-llama2-13b/results_2023-10-04T09-36-39.103374.json b/eval-results/hyunseoki/ko-ref-llama2-13b/results_2023-10-04T09-36-39.103374.json new file mode 100644 index 0000000000000000000000000000000000000000..12983049067935b3ed82677a6732801cd70af740 --- /dev/null +++ b/eval-results/hyunseoki/ko-ref-llama2-13b/results_2023-10-04T09-36-39.103374.json @@ -0,0 +1,1367 @@ +{ + "config_general": { + "model_name": "hyunseoki/ko-ref-llama2-13b", + "model_sha": "c5d09631c88ab5012b48187ecd90ae773cd4bbd9", + "model_size": "24.28 GB", + "model_dtype": "torch.float16", + "lighteval_sha": "0f318ecf002208468154899217b3ba7c6ae09374", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|arc:challenge|25": { + "acc": 0.4590443686006826, + "acc_stderr": 0.014562291073601241, + "acc_norm": 0.48378839590443684, + "acc_norm_stderr": 0.014603708567414947 + }, + "harness|hellaswag|10": { + "acc": 0.5243975303724357, + "acc_stderr": 0.004983837641502895, + "acc_norm": 0.735610436168094, + "acc_norm_stderr": 0.0044010632658032095 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.29, + "acc_stderr": 0.045604802157206845, + "acc_norm": 0.29, + "acc_norm_stderr": 0.045604802157206845 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.37777777777777777, + "acc_stderr": 0.04188307537595852, + "acc_norm": 0.37777777777777777, + "acc_norm_stderr": 0.04188307537595852 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.3618421052631579, + "acc_stderr": 0.03910525752849724, + "acc_norm": 0.3618421052631579, + "acc_norm_stderr": 0.03910525752849724 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.31, + "acc_stderr": 0.04648231987117316, + "acc_norm": 0.31, + "acc_norm_stderr": 0.04648231987117316 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.37735849056603776, + "acc_stderr": 0.029832808114796005, + "acc_norm": 0.37735849056603776, + "acc_norm_stderr": 0.029832808114796005 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.3611111111111111, + "acc_stderr": 0.040166600304512336, + "acc_norm": 0.3611111111111111, + "acc_norm_stderr": 0.040166600304512336 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.2, + "acc_stderr": 0.04020151261036844, + "acc_norm": 0.2, + "acc_norm_stderr": 0.04020151261036844 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.25, + "acc_stderr": 0.04351941398892446, + "acc_norm": 0.25, + "acc_norm_stderr": 0.04351941398892446 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.25, + "acc_stderr": 0.04351941398892446, + "acc_norm": 0.25, + "acc_norm_stderr": 0.04351941398892446 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.3352601156069364, + "acc_stderr": 0.03599586301247077, + "acc_norm": 0.3352601156069364, + "acc_norm_stderr": 0.03599586301247077 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.22549019607843138, + "acc_stderr": 
0.041583075330832865, + "acc_norm": 0.22549019607843138, + "acc_norm_stderr": 0.041583075330832865 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.51, + "acc_stderr": 0.05024183937956912, + "acc_norm": 0.51, + "acc_norm_stderr": 0.05024183937956912 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.30638297872340425, + "acc_stderr": 0.030135906478517563, + "acc_norm": 0.30638297872340425, + "acc_norm_stderr": 0.030135906478517563 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.2631578947368421, + "acc_stderr": 0.0414243971948936, + "acc_norm": 0.2631578947368421, + "acc_norm_stderr": 0.0414243971948936 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.32413793103448274, + "acc_stderr": 0.03900432069185553, + "acc_norm": 0.32413793103448274, + "acc_norm_stderr": 0.03900432069185553 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.30423280423280424, + "acc_stderr": 0.02369541500946309, + "acc_norm": 0.30423280423280424, + "acc_norm_stderr": 0.02369541500946309 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.19047619047619047, + "acc_stderr": 0.035122074123020514, + "acc_norm": 0.19047619047619047, + "acc_norm_stderr": 0.035122074123020514 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.32, + "acc_stderr": 0.04688261722621503, + "acc_norm": 0.32, + "acc_norm_stderr": 0.04688261722621503 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.3419354838709677, + "acc_stderr": 0.026985289576552732, + "acc_norm": 0.3419354838709677, + "acc_norm_stderr": 0.026985289576552732 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.31527093596059114, + "acc_stderr": 0.03269080871970186, + "acc_norm": 0.31527093596059114, + "acc_norm_stderr": 0.03269080871970186 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.37, + "acc_stderr": 0.04852365870939098, + "acc_norm": 0.37, + "acc_norm_stderr": 0.04852365870939098 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.30303030303030304, + "acc_stderr": 0.03588624800091709, + "acc_norm": 0.30303030303030304, + "acc_norm_stderr": 0.03588624800091709 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.494949494949495, + "acc_stderr": 0.035621707606254015, + "acc_norm": 0.494949494949495, + "acc_norm_stderr": 0.035621707606254015 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.41450777202072536, + "acc_stderr": 0.03555300319557673, + "acc_norm": 0.41450777202072536, + "acc_norm_stderr": 0.03555300319557673 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.3230769230769231, + "acc_stderr": 0.02371088850197057, + "acc_norm": 0.3230769230769231, + "acc_norm_stderr": 0.02371088850197057 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.26296296296296295, + "acc_stderr": 0.02684205787383371, + "acc_norm": 0.26296296296296295, + "acc_norm_stderr": 0.02684205787383371 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.3235294117647059, + "acc_stderr": 0.03038835355188684, + "acc_norm": 0.3235294117647059, + "acc_norm_stderr": 0.03038835355188684 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.2847682119205298, + "acc_stderr": 0.03684881521389023, + "acc_norm": 0.2847682119205298, + "acc_norm_stderr": 0.03684881521389023 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.47522935779816516, + "acc_stderr": 0.021410999753635914, + 
"acc_norm": 0.47522935779816516, + "acc_norm_stderr": 0.021410999753635914 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.2361111111111111, + "acc_stderr": 0.02896370257079102, + "acc_norm": 0.2361111111111111, + "acc_norm_stderr": 0.02896370257079102 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.3137254901960784, + "acc_stderr": 0.032566854844603886, + "acc_norm": 0.3137254901960784, + "acc_norm_stderr": 0.032566854844603886 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.39662447257383965, + "acc_stderr": 0.03184399873811225, + "acc_norm": 0.39662447257383965, + "acc_norm_stderr": 0.03184399873811225 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.4304932735426009, + "acc_stderr": 0.033231973029429394, + "acc_norm": 0.4304932735426009, + "acc_norm_stderr": 0.033231973029429394 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.366412213740458, + "acc_stderr": 0.042258754519696386, + "acc_norm": 0.366412213740458, + "acc_norm_stderr": 0.042258754519696386 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.48760330578512395, + "acc_stderr": 0.045629515481807666, + "acc_norm": 0.48760330578512395, + "acc_norm_stderr": 0.045629515481807666 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.4537037037037037, + "acc_stderr": 0.048129173245368216, + "acc_norm": 0.4537037037037037, + "acc_norm_stderr": 0.048129173245368216 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.3374233128834356, + "acc_stderr": 0.03714908409935573, + "acc_norm": 0.3374233128834356, + "acc_norm_stderr": 0.03714908409935573 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.2857142857142857, + "acc_stderr": 0.042878587513404544, + "acc_norm": 0.2857142857142857, + "acc_norm_stderr": 0.042878587513404544 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.3592233009708738, + "acc_stderr": 0.04750458399041694, + "acc_norm": 0.3592233009708738, + "acc_norm_stderr": 0.04750458399041694 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.5, + "acc_stderr": 0.03275608910402091, + "acc_norm": 0.5, + "acc_norm_stderr": 0.03275608910402091 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.29, + "acc_stderr": 0.04560480215720684, + "acc_norm": 0.29, + "acc_norm_stderr": 0.04560480215720684 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.4789272030651341, + "acc_stderr": 0.017864076786212903, + "acc_norm": 0.4789272030651341, + "acc_norm_stderr": 0.017864076786212903 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.4421965317919075, + "acc_stderr": 0.026738603643807403, + "acc_norm": 0.4421965317919075, + "acc_norm_stderr": 0.026738603643807403 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.24804469273743016, + "acc_stderr": 0.014444157808261431, + "acc_norm": 0.24804469273743016, + "acc_norm_stderr": 0.014444157808261431 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.39215686274509803, + "acc_stderr": 0.027956046165424513, + "acc_norm": 0.39215686274509803, + "acc_norm_stderr": 0.027956046165424513 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.4405144694533762, + "acc_stderr": 0.02819640057419743, + "acc_norm": 0.4405144694533762, + "acc_norm_stderr": 0.02819640057419743 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.39197530864197533, + "acc_stderr": 0.027163686038271222, + "acc_norm": 0.39197530864197533, + "acc_norm_stderr": 0.027163686038271222 + }, + 
"harness|hendrycksTest-professional_accounting|5": { + "acc": 0.2801418439716312, + "acc_stderr": 0.026789172351140242, + "acc_norm": 0.2801418439716312, + "acc_norm_stderr": 0.026789172351140242 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.32529335071707954, + "acc_stderr": 0.011965311536571531, + "acc_norm": 0.32529335071707954, + "acc_norm_stderr": 0.011965311536571531 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.22794117647058823, + "acc_stderr": 0.025483081468029804, + "acc_norm": 0.22794117647058823, + "acc_norm_stderr": 0.025483081468029804 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.3284313725490196, + "acc_stderr": 0.018999707383162655, + "acc_norm": 0.3284313725490196, + "acc_norm_stderr": 0.018999707383162655 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.2909090909090909, + "acc_stderr": 0.04350271442923243, + "acc_norm": 0.2909090909090909, + "acc_norm_stderr": 0.04350271442923243 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.33877551020408164, + "acc_stderr": 0.030299506562154185, + "acc_norm": 0.33877551020408164, + "acc_norm_stderr": 0.030299506562154185 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.40298507462686567, + "acc_stderr": 0.03468343295111126, + "acc_norm": 0.40298507462686567, + "acc_norm_stderr": 0.03468343295111126 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.48, + "acc_stderr": 0.05021167315686781, + "acc_norm": 0.48, + "acc_norm_stderr": 0.05021167315686781 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.3433734939759036, + "acc_stderr": 0.03696584317010601, + "acc_norm": 0.3433734939759036, + "acc_norm_stderr": 0.03696584317010601 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.52046783625731, + "acc_stderr": 0.0383161053282193, + "acc_norm": 0.52046783625731, + "acc_norm_stderr": 0.0383161053282193 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.23623011015911874, + "mc1_stderr": 0.014869755015871117, + "mc2": 0.35820119309520254, + "mc2_stderr": 0.013639920153786636 + }, + "all": { + "acc": 0.3532050616995535, + "acc_stderr": 0.03417805610937114, + "acc_norm": 0.35720433175208594, + "acc_norm_stderr": 0.0341688805690002, + "mc1": 0.23623011015911874, + "mc1_stderr": 0.014869755015871117, + "mc2": 0.35820119309520254, + "mc2_stderr": 0.013639920153786636 + } + }, + "versions": { + "harness|arc:challenge|25": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + 
"harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "all": 0 + }, + "config_tasks": { + "harness|arc:challenge": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + 
"harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task" + }, + "summary_tasks": { + "harness|arc:challenge|25": { + "hashes": { + "hash_examples": "17b0cae357c0259e", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "61571bf68d6d89aa", + "hash_cont_tokens": "e8abf848493b50f7" + }, + "truncated": 0, + "non-truncated": 4687, + "padded": 4687, + "non-padded": 0, + "effective_few_shots": 25.0, + "num_truncated_few_shots": 0 + }, + "harness|hellaswag|10": { + "hashes": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "29906669b1c7054a", + "hash_cont_tokens": "9fe0a5c42e1532db" + }, + "truncated": 0, + "non-truncated": 40168, + "padded": 40113, + "non-padded": 55, + "effective_few_shots": 10.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hashes": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "c54ff61ad0273dd7", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-anatomy|5": { + "hashes": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "be31a1e22aef5f90", + "hash_cont_tokens": "f11971a765cb609f" + }, + "truncated": 0, + "non-truncated": 540, + "padded": 540, + 
"non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-astronomy|5": { + "hashes": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "277a7b1fad566940", + "hash_cont_tokens": "440a970fadecdc7b" + }, + "truncated": 0, + "non-truncated": 608, + "padded": 608, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-business_ethics|5": { + "hashes": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "ba552605bc116de5", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hashes": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "428c7563d0b98ab9", + "hash_cont_tokens": "7ecd60c25b9bfe5b" + }, + "truncated": 0, + "non-truncated": 1060, + "padded": 1060, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_biology|5": { + "hashes": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "da036601573942e2", + "hash_cont_tokens": "875cde3af7a0ee14" + }, + "truncated": 0, + "non-truncated": 576, + "padded": 576, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_chemistry|5": { + "hashes": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "94e0196d6aded13d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_computer_science|5": { + "hashes": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "6e4d0f4a8d36690b", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_mathematics|5": { + "hashes": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "614054d17109a25d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_medicine|5": { + "hashes": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "1d633b3cc0524ba8", + "hash_cont_tokens": "702fb6d82ff0d6ac" + }, + "truncated": 0, + "non-truncated": 692, + "padded": 692, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_physics|5": { + "hashes": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "5421d9a1af86cbd4", + "hash_cont_tokens": "f7b8097afc16a47c" + }, + "truncated": 0, + "non-truncated": 408, + "padded": 408, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-computer_security|5": { + "hashes": { + 
"hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "5e6b70ecb333cf18", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hashes": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "c2ef11a87264ceed", + "hash_cont_tokens": "aa0e8bc655f2f641" + }, + "truncated": 0, + "non-truncated": 940, + "padded": 940, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-econometrics|5": { + "hashes": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "ecaccd912a4c3978", + "hash_cont_tokens": "b1cc6e7e9fcd3827" + }, + "truncated": 0, + "non-truncated": 456, + "padded": 456, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hashes": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "1590c84291399be8", + "hash_cont_tokens": "2425a3f084a591ef" + }, + "truncated": 0, + "non-truncated": 580, + "padded": 580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hashes": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "3269597f715b0da1", + "hash_cont_tokens": "bd87bf0c060fd925" + }, + "truncated": 0, + "non-truncated": 1512, + "padded": 1512, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-formal_logic|5": { + "hashes": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "a2800d20f3ab8d7c", + "hash_cont_tokens": "eb8932890e0605db" + }, + "truncated": 0, + "non-truncated": 504, + "padded": 504, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-global_facts|5": { + "hashes": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "94ed44b3772505ad", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_biology|5": { + "hashes": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "24423acb928db768", + "hash_cont_tokens": "1ddcb86d28cde266" + }, + "truncated": 0, + "non-truncated": 1240, + "padded": 1240, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hashes": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "831ff35c474e5cef", + "hash_cont_tokens": "176c8dcff38c5f8f" + }, + "truncated": 0, + "non-truncated": 812, + "padded": 812, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hashes": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "8c34e0f2bda77358", + 
"hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hashes": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "f1f73dd687da18d7", + "hash_cont_tokens": "674fc454bdc5ac93" + }, + "truncated": 660, + "non-truncated": 0, + "padded": 0, + "non-padded": 660, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_geography|5": { + "hashes": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "7c5547c7da5bc793", + "hash_cont_tokens": "03a5012b916274ea" + }, + "truncated": 0, + "non-truncated": 792, + "padded": 792, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hashes": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "f62991cb6a496b05", + "hash_cont_tokens": "873d2aab226ba1d8" + }, + "truncated": 0, + "non-truncated": 772, + "padded": 772, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hashes": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "4cef2aff6e3d59ed", + "hash_cont_tokens": "c583432ad27fcfe0" + }, + "truncated": 0, + "non-truncated": 1560, + "padded": 1560, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hashes": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "6e2577ea4082ed2b", + "hash_cont_tokens": "d7907b61bcb8c123" + }, + "truncated": 0, + "non-truncated": 1080, + "padded": 1080, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hashes": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "c5fc9aeb1079c8e4", + "hash_cont_tokens": "f47f041de50333b9" + }, + "truncated": 0, + "non-truncated": 952, + "padded": 952, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_physics|5": { + "hashes": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "555fc385cffa84ca", + "hash_cont_tokens": "0d56317b3e5eedb5" + }, + "truncated": 0, + "non-truncated": 604, + "padded": 604, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hashes": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "febd23cbf9973b7f", + "hash_cont_tokens": "09ba1243e7390c0f" + }, + "truncated": 0, + "non-truncated": 2180, + "padded": 2180, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hashes": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "424b02981230ee83", + "hash_cont_tokens": "9cc29889c3d3f77d" + }, + "truncated": 0, + 
"non-truncated": 864, + "padded": 864, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hashes": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "50c9ff438c85a69e", + "hash_cont_tokens": "cdd0b3dc06d933e5" + }, + "truncated": 816, + "non-truncated": 0, + "padded": 0, + "non-padded": 816, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hashes": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "054824cc474caef5", + "hash_cont_tokens": "e02816433ff28daf" + }, + "truncated": 8, + "non-truncated": 940, + "padded": 940, + "non-padded": 8, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_aging|5": { + "hashes": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "541a75f071dcf579", + "hash_cont_tokens": "142a4a8a1138a214" + }, + "truncated": 0, + "non-truncated": 892, + "padded": 892, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_sexuality|5": { + "hashes": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "04269e5c5a257dd9", + "hash_cont_tokens": "bc54813e809b796d" + }, + "truncated": 0, + "non-truncated": 524, + "padded": 524, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-international_law|5": { + "hashes": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "d93ba9d9d38e4397", + "hash_cont_tokens": "8ea8c5ff76a15bca" + }, + "truncated": 0, + "non-truncated": 484, + "padded": 484, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-jurisprudence|5": { + "hashes": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "9eeaccd2698b4f5a", + "hash_cont_tokens": "e3a8cd951b6e3469" + }, + "truncated": 0, + "non-truncated": 432, + "padded": 432, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hashes": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "b4f08f544f2b7576", + "hash_cont_tokens": "3e9e0bdc248fd88a" + }, + "truncated": 0, + "non-truncated": 652, + "padded": 648, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-machine_learning|5": { + "hashes": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "900c2a51f1174b9f", + "hash_cont_tokens": "55b12fb138c6a064" + }, + "truncated": 0, + "non-truncated": 448, + "padded": 448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-management|5": { + "hashes": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "6b36efb4689c6eca", + "hash_cont_tokens": "a01d6d39a83c4597" + }, + "truncated": 0, + "non-truncated": 412, + "padded": 412, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + 
"harness|hendrycksTest-marketing|5": { + "hashes": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "2aaac78a0cfed47a", + "hash_cont_tokens": "6aeaed4d823c98aa" + }, + "truncated": 0, + "non-truncated": 936, + "padded": 936, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-medical_genetics|5": { + "hashes": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "886ca823b41c094a", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-miscellaneous|5": { + "hashes": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "72fd71de7675e7d0", + "hash_cont_tokens": "9b0ab02a64603081" + }, + "truncated": 0, + "non-truncated": 3132, + "padded": 3132, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_disputes|5": { + "hashes": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "f3ca0dd8e7a1eb09", + "hash_cont_tokens": "3b8bbe9108e55ce9" + }, + "truncated": 0, + "non-truncated": 1384, + "padded": 1354, + "non-padded": 30, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hashes": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "3e793631e951f23c", + "hash_cont_tokens": "3e9bfc0362e97330" + }, + "truncated": 0, + "non-truncated": 3580, + "padded": 3580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-nutrition|5": { + "hashes": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "59753c2144ea93af", + "hash_cont_tokens": "23b2dc6ee2da4cfc" + }, + "truncated": 0, + "non-truncated": 1224, + "padded": 1224, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-philosophy|5": { + "hashes": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "bd8d3dbed15a8c34", + "hash_cont_tokens": "9f6ff69d23a48783" + }, + "truncated": 0, + "non-truncated": 1244, + "padded": 1244, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-prehistory|5": { + "hashes": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "3573cd87facbb7c5", + "hash_cont_tokens": "d6458d743d875837" + }, + "truncated": 0, + "non-truncated": 1296, + "padded": 1296, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_accounting|5": { + "hashes": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "17e721bc1a7cbb47", + "hash_cont_tokens": "922a195f53a35662" + }, + "truncated": 0, + "non-truncated": 1128, + "padded": 1128, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_law|5": { + "hashes": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": 
"9178e10bd0763ec4", + "hash_cont_tokens": "2e590029ef41fbcd" + }, + "truncated": 604, + "non-truncated": 5532, + "padded": 5524, + "non-padded": 612, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_medicine|5": { + "hashes": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "f5a22012a54f70ea", + "hash_cont_tokens": "7cfee54dbddd5a98" + }, + "truncated": 0, + "non-truncated": 1088, + "padded": 1088, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_psychology|5": { + "hashes": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "0dfb73a8eb3f692c", + "hash_cont_tokens": "a86677b2a45c20e1" + }, + "truncated": 0, + "non-truncated": 2448, + "padded": 2448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-public_relations|5": { + "hashes": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "1710c6ba4c9f3cbd", + "hash_cont_tokens": "0d756ccaae031757" + }, + "truncated": 0, + "non-truncated": 440, + "padded": 440, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-security_studies|5": { + "hashes": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "d49711415961ced7", + "hash_cont_tokens": "b2229bc2cfbf594b" + }, + "truncated": 0, + "non-truncated": 980, + "padded": 980, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-sociology|5": { + "hashes": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "828999f7624cbe7e", + "hash_cont_tokens": "c3a3bdfd177eed5b" + }, + "truncated": 0, + "non-truncated": 804, + "padded": 804, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hashes": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "42054621e718dbee", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-virology|5": { + "hashes": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "6c4f0aa4dc859c04", + "hash_cont_tokens": "af8b3658088cb37f" + }, + "truncated": 0, + "non-truncated": 664, + "padded": 664, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-world_religions|5": { + "hashes": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "6c75d44e092ff24f", + "hash_cont_tokens": "060118bef6de4e0a" + }, + "truncated": 0, + "non-truncated": 684, + "padded": 684, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|truthfulqa:mc|0": { + "hashes": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "2738d7ed7075faa7", + "hash_cont_tokens": "f5da56a132aab151" + }, + "truncated": 0, + "non-truncated": 9996, + "padded": 9996, + "non-padded": 0, + "effective_few_shots": 
0.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "d84d18e9a963753d", + "hash_full_prompts": "12b540783521a8e6", + "hash_input_tokens": "6fecf578c508db6a", + "hash_cont_tokens": "71d56183130fecbd" + }, + "total_evaluation_time_secondes": "3898.793873310089", + "truncated": 2088, + "non-truncated": 108931, + "padded": 108834, + "non-padded": 2185, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/hyunseoki/ko-ref-llama2-13b/results_2023-10-23T20-48-08.405984.json b/eval-results/hyunseoki/ko-ref-llama2-13b/results_2023-10-23T20-48-08.405984.json new file mode 100644 index 0000000000000000000000000000000000000000..2c9523b617b3357348fcf4af4807740d05f3c21b --- /dev/null +++ b/eval-results/hyunseoki/ko-ref-llama2-13b/results_2023-10-23T20-48-08.405984.json @@ -0,0 +1,107 @@ +{ + "config_general": { + "model_name": "hyunseoki/ko-ref-llama2-13b", + "model_sha": "c5d09631c88ab5012b48187ecd90ae773cd4bbd9", + "model_size": "24.28 GB", + "model_dtype": "torch.float16", + "lighteval_sha": "0f318ecf002208468154899217b3ba7c6ae09374", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|drop|3": { + "em": 0.23804530201342283, + "em_stderr": 0.00436148149592577, + "f1": 0.2753114513422822, + "f1_stderr": 0.004376593977288765 + }, + "harness|gsm8k|5": { + "acc": 0.0, + "acc_stderr": 0.0 + }, + "harness|winogrande|5": { + "acc": 0.691397000789266, + "acc_stderr": 0.012982160200926574 + }, + "all": { + "em": 0.23804530201342283, + "em_stderr": 0.00436148149592577, + "f1": 0.2753114513422822, + "f1_stderr": 0.004376593977288765, + "acc": 0.345698500394633, + "acc_stderr": 0.006491080100463287 + } + }, + "versions": { + "harness|drop|3": 1, + "harness|gsm8k|5": 0, + "harness|winogrande|5": 0, + "all": 0 + }, + "config_tasks": { + "harness|drop": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|drop|3": { + "hashes": { + "hash_examples": "1d27416e8324e9a3", + "hash_full_prompts": "a5513ff9a741b385", + "hash_input_tokens": "61b608e0b5ceed76", + "hash_cont_tokens": "9beae1da9299177d" + }, + "truncated": 1263, + "non-truncated": 8273, + "padded": 0, + "non-padded": 9536, + "effective_few_shots": 3.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "bda342e47b5099b2", + "hash_cont_tokens": "08540bf1bf18e1ae" + }, + "truncated": 0, + "non-truncated": 1319, + "padded": 0, + "non-padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "c0bedf98cb040854", + "hash_cont_tokens": "f08975ad6f2d5864" + }, + "truncated": 0, + "non-truncated": 2534, + "padded": 2432, + "non-padded": 102, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "9b4d8993161e637d", + "hash_full_prompts": "08215e527b7e60a5", + "hash_input_tokens": "80afe720f936f8d2", + "hash_cont_tokens": "6d6d6022e89b3dc2" + }, + "total_evaluation_time_secondes": "7202.1130385398865", + "truncated": 1263, + "non-truncated": 12126, + "padded": 2432, + "non-padded": 10957, + "num_truncated_few_shots": 0 + } +} \ No newline at 
end of file diff --git a/eval-results/hyunseoki/ko-ref-llama2-7b/results_2023-10-04T09-16-09.367375.json b/eval-results/hyunseoki/ko-ref-llama2-7b/results_2023-10-04T09-16-09.367375.json new file mode 100644 index 0000000000000000000000000000000000000000..2e0aad0f90ba3aa5d53b67d25e8317031b5d9da2 --- /dev/null +++ b/eval-results/hyunseoki/ko-ref-llama2-7b/results_2023-10-04T09-16-09.367375.json @@ -0,0 +1,1367 @@ +{ + "config_general": { + "model_name": "hyunseoki/ko-ref-llama2-7b", + "model_sha": "1ee08c79ae7393473754b77e82b1472ef63d5dd2", + "model_size": "12.58 GB", + "model_dtype": "torch.float16", + "lighteval_sha": "0f318ecf002208468154899217b3ba7c6ae09374", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|arc:challenge|25": { + "acc": 0.3796928327645051, + "acc_stderr": 0.014182119866974872, + "acc_norm": 0.42662116040955633, + "acc_norm_stderr": 0.014453185592920293 + }, + "harness|hellaswag|10": { + "acc": 0.4761999601672974, + "acc_stderr": 0.004984125363319067, + "acc_norm": 0.665803624775941, + "acc_norm_stderr": 0.0047074472442006285 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.33, + "acc_stderr": 0.047258156262526045, + "acc_norm": 0.33, + "acc_norm_stderr": 0.047258156262526045 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.2962962962962963, + "acc_stderr": 0.03944624162501117, + "acc_norm": 0.2962962962962963, + "acc_norm_stderr": 0.03944624162501117 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.2631578947368421, + "acc_stderr": 0.035834961763610625, + "acc_norm": 0.2631578947368421, + "acc_norm_stderr": 0.035834961763610625 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.36, + "acc_stderr": 0.04824181513244218, + "acc_norm": 0.36, + "acc_norm_stderr": 0.04824181513244218 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.28679245283018867, + "acc_stderr": 0.027834912527544078, + "acc_norm": 0.28679245283018867, + "acc_norm_stderr": 0.027834912527544078 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.2847222222222222, + "acc_stderr": 0.03773809990686934, + "acc_norm": 0.2847222222222222, + "acc_norm_stderr": 0.03773809990686934 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.34, + "acc_stderr": 0.04760952285695235, + "acc_norm": 0.34, + "acc_norm_stderr": 0.04760952285695235 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.39, + "acc_stderr": 0.04902071300001975, + "acc_norm": 0.39, + "acc_norm_stderr": 0.04902071300001975 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.28, + "acc_stderr": 0.04512608598542127, + "acc_norm": 0.28, + "acc_norm_stderr": 0.04512608598542127 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.27167630057803466, + "acc_stderr": 0.03391750322321658, + "acc_norm": 0.27167630057803466, + "acc_norm_stderr": 0.03391750322321658 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.27450980392156865, + "acc_stderr": 0.044405219061793254, + "acc_norm": 0.27450980392156865, + "acc_norm_stderr": 0.044405219061793254 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.37, + "acc_stderr": 0.048523658709391, + "acc_norm": 0.37, + "acc_norm_stderr": 0.048523658709391 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.31063829787234043, + "acc_stderr": 0.03025123757921317, + "acc_norm": 0.31063829787234043, + "acc_norm_stderr": 0.03025123757921317 + }, + 
"harness|hendrycksTest-econometrics|5": { + "acc": 0.18421052631578946, + "acc_stderr": 0.03646758875075566, + "acc_norm": 0.18421052631578946, + "acc_norm_stderr": 0.03646758875075566 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.296551724137931, + "acc_stderr": 0.038061426873099935, + "acc_norm": 0.296551724137931, + "acc_norm_stderr": 0.038061426873099935 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.25396825396825395, + "acc_stderr": 0.022418042891113946, + "acc_norm": 0.25396825396825395, + "acc_norm_stderr": 0.022418042891113946 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.18253968253968253, + "acc_stderr": 0.034550710191021496, + "acc_norm": 0.18253968253968253, + "acc_norm_stderr": 0.034550710191021496 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.18, + "acc_stderr": 0.03861229196653694, + "acc_norm": 0.18, + "acc_norm_stderr": 0.03861229196653694 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.3, + "acc_stderr": 0.02606936229533513, + "acc_norm": 0.3, + "acc_norm_stderr": 0.02606936229533513 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.26108374384236455, + "acc_stderr": 0.030903796952114485, + "acc_norm": 0.26108374384236455, + "acc_norm_stderr": 0.030903796952114485 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.28, + "acc_stderr": 0.04512608598542127, + "acc_norm": 0.28, + "acc_norm_stderr": 0.04512608598542127 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.28484848484848485, + "acc_stderr": 0.03524390844511784, + "acc_norm": 0.28484848484848485, + "acc_norm_stderr": 0.03524390844511784 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.3333333333333333, + "acc_stderr": 0.033586181457325226, + "acc_norm": 0.3333333333333333, + "acc_norm_stderr": 0.033586181457325226 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.35233160621761656, + "acc_stderr": 0.03447478286414358, + "acc_norm": 0.35233160621761656, + "acc_norm_stderr": 0.03447478286414358 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.31025641025641026, + "acc_stderr": 0.0234546748894043, + "acc_norm": 0.31025641025641026, + "acc_norm_stderr": 0.0234546748894043 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.26296296296296295, + "acc_stderr": 0.02684205787383371, + "acc_norm": 0.26296296296296295, + "acc_norm_stderr": 0.02684205787383371 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.2815126050420168, + "acc_stderr": 0.029213549414372167, + "acc_norm": 0.2815126050420168, + "acc_norm_stderr": 0.029213549414372167 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.2781456953642384, + "acc_stderr": 0.03658603262763743, + "acc_norm": 0.2781456953642384, + "acc_norm_stderr": 0.03658603262763743 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.3669724770642202, + "acc_stderr": 0.020664675659520532, + "acc_norm": 0.3669724770642202, + "acc_norm_stderr": 0.020664675659520532 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.4675925925925926, + "acc_stderr": 0.03402801581358966, + "acc_norm": 0.4675925925925926, + "acc_norm_stderr": 0.03402801581358966 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.30392156862745096, + "acc_stderr": 0.03228210387037892, + "acc_norm": 0.30392156862745096, + "acc_norm_stderr": 0.03228210387037892 + }, + 
"harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.2911392405063291, + "acc_stderr": 0.029571601065753374, + "acc_norm": 0.2911392405063291, + "acc_norm_stderr": 0.029571601065753374 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.32286995515695066, + "acc_stderr": 0.03138147637575499, + "acc_norm": 0.32286995515695066, + "acc_norm_stderr": 0.03138147637575499 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.3053435114503817, + "acc_stderr": 0.04039314978724561, + "acc_norm": 0.3053435114503817, + "acc_norm_stderr": 0.04039314978724561 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.2231404958677686, + "acc_stderr": 0.03800754475228733, + "acc_norm": 0.2231404958677686, + "acc_norm_stderr": 0.03800754475228733 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.35185185185185186, + "acc_stderr": 0.046166311118017125, + "acc_norm": 0.35185185185185186, + "acc_norm_stderr": 0.046166311118017125 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.31901840490797545, + "acc_stderr": 0.03661997551073836, + "acc_norm": 0.31901840490797545, + "acc_norm_stderr": 0.03661997551073836 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.16964285714285715, + "acc_stderr": 0.0356236785009539, + "acc_norm": 0.16964285714285715, + "acc_norm_stderr": 0.0356236785009539 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.39805825242718446, + "acc_stderr": 0.04846748253977239, + "acc_norm": 0.39805825242718446, + "acc_norm_stderr": 0.04846748253977239 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.29914529914529914, + "acc_stderr": 0.029996951858349476, + "acc_norm": 0.29914529914529914, + "acc_norm_stderr": 0.029996951858349476 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.32, + "acc_stderr": 0.04688261722621505, + "acc_norm": 0.32, + "acc_norm_stderr": 0.04688261722621505 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.3205619412515964, + "acc_stderr": 0.016688893310803782, + "acc_norm": 0.3205619412515964, + "acc_norm_stderr": 0.016688893310803782 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.2745664739884393, + "acc_stderr": 0.02402774515526501, + "acc_norm": 0.2745664739884393, + "acc_norm_stderr": 0.02402774515526501 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.26256983240223464, + "acc_stderr": 0.014716824273017761, + "acc_norm": 0.26256983240223464, + "acc_norm_stderr": 0.014716824273017761 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.3562091503267974, + "acc_stderr": 0.027420477662629252, + "acc_norm": 0.3562091503267974, + "acc_norm_stderr": 0.027420477662629252 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.33440514469453375, + "acc_stderr": 0.026795422327893937, + "acc_norm": 0.33440514469453375, + "acc_norm_stderr": 0.026795422327893937 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.25617283950617287, + "acc_stderr": 0.024288533637726095, + "acc_norm": 0.25617283950617287, + "acc_norm_stderr": 0.024288533637726095 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.2801418439716312, + "acc_stderr": 0.026789172351140242, + "acc_norm": 0.2801418439716312, + "acc_norm_stderr": 0.026789172351140242 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.28096479791395046, + "acc_stderr": 0.011479684550077704, + "acc_norm": 0.28096479791395046, + "acc_norm_stderr": 0.011479684550077704 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.375, + 
"acc_stderr": 0.029408372932278746, + "acc_norm": 0.375, + "acc_norm_stderr": 0.029408372932278746 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.24183006535947713, + "acc_stderr": 0.017322789207784326, + "acc_norm": 0.24183006535947713, + "acc_norm_stderr": 0.017322789207784326 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.23636363636363636, + "acc_stderr": 0.04069306319721376, + "acc_norm": 0.23636363636363636, + "acc_norm_stderr": 0.04069306319721376 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.40816326530612246, + "acc_stderr": 0.03146465712827423, + "acc_norm": 0.40816326530612246, + "acc_norm_stderr": 0.03146465712827423 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.373134328358209, + "acc_stderr": 0.034198326081760065, + "acc_norm": 0.373134328358209, + "acc_norm_stderr": 0.034198326081760065 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.4, + "acc_stderr": 0.04923659639173309, + "acc_norm": 0.4, + "acc_norm_stderr": 0.04923659639173309 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.29518072289156627, + "acc_stderr": 0.0355092018568963, + "acc_norm": 0.29518072289156627, + "acc_norm_stderr": 0.0355092018568963 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.39766081871345027, + "acc_stderr": 0.0375363895576169, + "acc_norm": 0.39766081871345027, + "acc_norm_stderr": 0.0375363895576169 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.23133414932680538, + "mc1_stderr": 0.014761945174862673, + "mc2": 0.3861615273057621, + "mc2_stderr": 0.014427973069673701 + }, + "all": { + "acc": 0.3082551264068994, + "acc_stderr": 0.033282145763427556, + "acc_norm": 0.31226414322475854, + "acc_norm_stderr": 0.033282050638119544, + "mc1": 0.23133414932680538, + "mc1_stderr": 0.014761945174862673, + "mc2": 0.3861615273057621, + "mc2_stderr": 0.014427973069673701 + } + }, + "versions": { + "harness|arc:challenge|25": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + 
"harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "all": 0 + }, + "config_tasks": { + "harness|arc:challenge": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + 
"harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task" + }, + "summary_tasks": { + "harness|arc:challenge|25": { + "hashes": { + "hash_examples": "17b0cae357c0259e", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "61571bf68d6d89aa", + "hash_cont_tokens": "e8abf848493b50f7" + }, + "truncated": 0, + "non-truncated": 4687, + "padded": 4687, + "non-padded": 0, + "effective_few_shots": 25.0, + "num_truncated_few_shots": 0 + }, + "harness|hellaswag|10": { + "hashes": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "29906669b1c7054a", + "hash_cont_tokens": "9fe0a5c42e1532db" + }, + "truncated": 0, + "non-truncated": 40168, + "padded": 40113, + "non-padded": 55, + "effective_few_shots": 10.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hashes": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "c54ff61ad0273dd7", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-anatomy|5": { + "hashes": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "be31a1e22aef5f90", + "hash_cont_tokens": "f11971a765cb609f" + }, + "truncated": 0, + "non-truncated": 540, + "padded": 540, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-astronomy|5": { + "hashes": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "277a7b1fad566940", + "hash_cont_tokens": "440a970fadecdc7b" + }, + "truncated": 0, + "non-truncated": 608, + "padded": 608, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + 
"harness|hendrycksTest-business_ethics|5": { + "hashes": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "ba552605bc116de5", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hashes": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "428c7563d0b98ab9", + "hash_cont_tokens": "7ecd60c25b9bfe5b" + }, + "truncated": 0, + "non-truncated": 1060, + "padded": 1060, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_biology|5": { + "hashes": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "da036601573942e2", + "hash_cont_tokens": "875cde3af7a0ee14" + }, + "truncated": 0, + "non-truncated": 576, + "padded": 576, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_chemistry|5": { + "hashes": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "94e0196d6aded13d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_computer_science|5": { + "hashes": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "6e4d0f4a8d36690b", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_mathematics|5": { + "hashes": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "614054d17109a25d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_medicine|5": { + "hashes": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "1d633b3cc0524ba8", + "hash_cont_tokens": "702fb6d82ff0d6ac" + }, + "truncated": 0, + "non-truncated": 692, + "padded": 692, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_physics|5": { + "hashes": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "5421d9a1af86cbd4", + "hash_cont_tokens": "f7b8097afc16a47c" + }, + "truncated": 0, + "non-truncated": 408, + "padded": 408, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-computer_security|5": { + "hashes": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "5e6b70ecb333cf18", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hashes": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + 
"hash_input_tokens": "c2ef11a87264ceed", + "hash_cont_tokens": "aa0e8bc655f2f641" + }, + "truncated": 0, + "non-truncated": 940, + "padded": 940, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-econometrics|5": { + "hashes": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "ecaccd912a4c3978", + "hash_cont_tokens": "b1cc6e7e9fcd3827" + }, + "truncated": 0, + "non-truncated": 456, + "padded": 456, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hashes": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "1590c84291399be8", + "hash_cont_tokens": "2425a3f084a591ef" + }, + "truncated": 0, + "non-truncated": 580, + "padded": 580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hashes": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "3269597f715b0da1", + "hash_cont_tokens": "bd87bf0c060fd925" + }, + "truncated": 0, + "non-truncated": 1512, + "padded": 1512, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-formal_logic|5": { + "hashes": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "a2800d20f3ab8d7c", + "hash_cont_tokens": "eb8932890e0605db" + }, + "truncated": 0, + "non-truncated": 504, + "padded": 504, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-global_facts|5": { + "hashes": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "94ed44b3772505ad", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_biology|5": { + "hashes": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "24423acb928db768", + "hash_cont_tokens": "1ddcb86d28cde266" + }, + "truncated": 0, + "non-truncated": 1240, + "padded": 1240, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hashes": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "831ff35c474e5cef", + "hash_cont_tokens": "176c8dcff38c5f8f" + }, + "truncated": 0, + "non-truncated": 812, + "padded": 812, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hashes": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "8c34e0f2bda77358", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hashes": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "f1f73dd687da18d7", + "hash_cont_tokens": "674fc454bdc5ac93" + }, + "truncated": 660, + "non-truncated": 
0, + "padded": 0, + "non-padded": 660, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_geography|5": { + "hashes": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "7c5547c7da5bc793", + "hash_cont_tokens": "03a5012b916274ea" + }, + "truncated": 0, + "non-truncated": 792, + "padded": 792, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hashes": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "f62991cb6a496b05", + "hash_cont_tokens": "873d2aab226ba1d8" + }, + "truncated": 0, + "non-truncated": 772, + "padded": 772, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hashes": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "4cef2aff6e3d59ed", + "hash_cont_tokens": "c583432ad27fcfe0" + }, + "truncated": 0, + "non-truncated": 1560, + "padded": 1560, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hashes": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "6e2577ea4082ed2b", + "hash_cont_tokens": "d7907b61bcb8c123" + }, + "truncated": 0, + "non-truncated": 1080, + "padded": 1080, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hashes": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "c5fc9aeb1079c8e4", + "hash_cont_tokens": "f47f041de50333b9" + }, + "truncated": 0, + "non-truncated": 952, + "padded": 952, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_physics|5": { + "hashes": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "555fc385cffa84ca", + "hash_cont_tokens": "0d56317b3e5eedb5" + }, + "truncated": 0, + "non-truncated": 604, + "padded": 604, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hashes": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "febd23cbf9973b7f", + "hash_cont_tokens": "09ba1243e7390c0f" + }, + "truncated": 0, + "non-truncated": 2180, + "padded": 2180, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hashes": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "424b02981230ee83", + "hash_cont_tokens": "9cc29889c3d3f77d" + }, + "truncated": 0, + "non-truncated": 864, + "padded": 864, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hashes": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "50c9ff438c85a69e", + "hash_cont_tokens": "cdd0b3dc06d933e5" + }, + "truncated": 816, + "non-truncated": 0, + "padded": 0, + "non-padded": 816, + "effective_few_shots": 5.0, + 
"num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hashes": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "054824cc474caef5", + "hash_cont_tokens": "e02816433ff28daf" + }, + "truncated": 8, + "non-truncated": 940, + "padded": 940, + "non-padded": 8, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_aging|5": { + "hashes": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "541a75f071dcf579", + "hash_cont_tokens": "142a4a8a1138a214" + }, + "truncated": 0, + "non-truncated": 892, + "padded": 892, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_sexuality|5": { + "hashes": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "04269e5c5a257dd9", + "hash_cont_tokens": "bc54813e809b796d" + }, + "truncated": 0, + "non-truncated": 524, + "padded": 524, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-international_law|5": { + "hashes": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "d93ba9d9d38e4397", + "hash_cont_tokens": "8ea8c5ff76a15bca" + }, + "truncated": 0, + "non-truncated": 484, + "padded": 484, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-jurisprudence|5": { + "hashes": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "9eeaccd2698b4f5a", + "hash_cont_tokens": "e3a8cd951b6e3469" + }, + "truncated": 0, + "non-truncated": 432, + "padded": 432, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hashes": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "b4f08f544f2b7576", + "hash_cont_tokens": "3e9e0bdc248fd88a" + }, + "truncated": 0, + "non-truncated": 652, + "padded": 648, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-machine_learning|5": { + "hashes": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "900c2a51f1174b9f", + "hash_cont_tokens": "55b12fb138c6a064" + }, + "truncated": 0, + "non-truncated": 448, + "padded": 448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-management|5": { + "hashes": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "6b36efb4689c6eca", + "hash_cont_tokens": "a01d6d39a83c4597" + }, + "truncated": 0, + "non-truncated": 412, + "padded": 412, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-marketing|5": { + "hashes": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "2aaac78a0cfed47a", + "hash_cont_tokens": "6aeaed4d823c98aa" + }, + "truncated": 0, + "non-truncated": 936, + "padded": 936, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-medical_genetics|5": { + "hashes": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": 
"202139046daa118f", + "hash_input_tokens": "886ca823b41c094a", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-miscellaneous|5": { + "hashes": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "72fd71de7675e7d0", + "hash_cont_tokens": "9b0ab02a64603081" + }, + "truncated": 0, + "non-truncated": 3132, + "padded": 3132, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_disputes|5": { + "hashes": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "f3ca0dd8e7a1eb09", + "hash_cont_tokens": "3b8bbe9108e55ce9" + }, + "truncated": 0, + "non-truncated": 1384, + "padded": 1354, + "non-padded": 30, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hashes": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "3e793631e951f23c", + "hash_cont_tokens": "3e9bfc0362e97330" + }, + "truncated": 0, + "non-truncated": 3580, + "padded": 3580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-nutrition|5": { + "hashes": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "59753c2144ea93af", + "hash_cont_tokens": "23b2dc6ee2da4cfc" + }, + "truncated": 0, + "non-truncated": 1224, + "padded": 1224, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-philosophy|5": { + "hashes": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "bd8d3dbed15a8c34", + "hash_cont_tokens": "9f6ff69d23a48783" + }, + "truncated": 0, + "non-truncated": 1244, + "padded": 1244, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-prehistory|5": { + "hashes": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "3573cd87facbb7c5", + "hash_cont_tokens": "d6458d743d875837" + }, + "truncated": 0, + "non-truncated": 1296, + "padded": 1296, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_accounting|5": { + "hashes": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "17e721bc1a7cbb47", + "hash_cont_tokens": "922a195f53a35662" + }, + "truncated": 0, + "non-truncated": 1128, + "padded": 1128, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_law|5": { + "hashes": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "9178e10bd0763ec4", + "hash_cont_tokens": "2e590029ef41fbcd" + }, + "truncated": 604, + "non-truncated": 5532, + "padded": 5524, + "non-padded": 612, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_medicine|5": { + "hashes": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "f5a22012a54f70ea", + "hash_cont_tokens": "7cfee54dbddd5a98" + }, + "truncated": 0, + "non-truncated": 1088, + 
"padded": 1088, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_psychology|5": { + "hashes": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "0dfb73a8eb3f692c", + "hash_cont_tokens": "a86677b2a45c20e1" + }, + "truncated": 0, + "non-truncated": 2448, + "padded": 2448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-public_relations|5": { + "hashes": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "1710c6ba4c9f3cbd", + "hash_cont_tokens": "0d756ccaae031757" + }, + "truncated": 0, + "non-truncated": 440, + "padded": 440, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-security_studies|5": { + "hashes": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "d49711415961ced7", + "hash_cont_tokens": "b2229bc2cfbf594b" + }, + "truncated": 0, + "non-truncated": 980, + "padded": 980, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-sociology|5": { + "hashes": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "828999f7624cbe7e", + "hash_cont_tokens": "c3a3bdfd177eed5b" + }, + "truncated": 0, + "non-truncated": 804, + "padded": 804, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hashes": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "42054621e718dbee", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-virology|5": { + "hashes": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "6c4f0aa4dc859c04", + "hash_cont_tokens": "af8b3658088cb37f" + }, + "truncated": 0, + "non-truncated": 664, + "padded": 664, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-world_religions|5": { + "hashes": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "6c75d44e092ff24f", + "hash_cont_tokens": "060118bef6de4e0a" + }, + "truncated": 0, + "non-truncated": 684, + "padded": 684, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|truthfulqa:mc|0": { + "hashes": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "2738d7ed7075faa7", + "hash_cont_tokens": "f5da56a132aab151" + }, + "truncated": 0, + "non-truncated": 9996, + "padded": 9996, + "non-padded": 0, + "effective_few_shots": 0.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "d84d18e9a963753d", + "hash_full_prompts": "12b540783521a8e6", + "hash_input_tokens": "6fecf578c508db6a", + "hash_cont_tokens": "71d56183130fecbd" + }, + "total_evaluation_time_secondes": "2664.0070703029633", + "truncated": 2088, + "non-truncated": 108931, + "padded": 108834, + "non-padded": 2185, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git 
a/eval-results/hyunseoki/ko-ref-llama2-7b/results_2023-10-25T06-27-40.666893.json b/eval-results/hyunseoki/ko-ref-llama2-7b/results_2023-10-25T06-27-40.666893.json new file mode 100644 index 0000000000000000000000000000000000000000..aadfdfb715f2480d4b5af70d80b265a9c198cfe7 --- /dev/null +++ b/eval-results/hyunseoki/ko-ref-llama2-7b/results_2023-10-25T06-27-40.666893.json @@ -0,0 +1,107 @@ +{ + "config_general": { + "model_name": "hyunseoki/ko-ref-llama2-7b", + "model_sha": "1ee08c79ae7393473754b77e82b1472ef63d5dd2", + "model_size": "12.58 GB", + "model_dtype": "torch.float16", + "lighteval_sha": "0f318ecf002208468154899217b3ba7c6ae09374", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|drop|3": { + "em": 0.20343959731543623, + "em_stderr": 0.004122557786324279, + "f1": 0.24051069630872504, + "f1_stderr": 0.00417330845396371 + }, + "harness|gsm8k|5": { + "acc": 0.0, + "acc_stderr": 0.0 + }, + "harness|winogrande|5": { + "acc": 0.6621941594317285, + "acc_stderr": 0.013292583502910888 + }, + "all": { + "em": 0.20343959731543623, + "em_stderr": 0.004122557786324279, + "f1": 0.24051069630872504, + "f1_stderr": 0.00417330845396371, + "acc": 0.3310970797158643, + "acc_stderr": 0.006646291751455444 + } + }, + "versions": { + "harness|drop|3": 1, + "harness|gsm8k|5": 0, + "harness|winogrande|5": 0, + "all": 0 + }, + "config_tasks": { + "harness|drop": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|drop|3": { + "hashes": { + "hash_examples": "1d27416e8324e9a3", + "hash_full_prompts": "a5513ff9a741b385", + "hash_input_tokens": "61b608e0b5ceed76", + "hash_cont_tokens": "9a53eaadb9e3e524" + }, + "truncated": 1263, + "non-truncated": 8273, + "padded": 0, + "non-padded": 9536, + "effective_few_shots": 3.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "bda342e47b5099b2", + "hash_cont_tokens": "4b5fd9e8b121c921" + }, + "truncated": 0, + "non-truncated": 1319, + "padded": 0, + "non-padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "c0bedf98cb040854", + "hash_cont_tokens": "f08975ad6f2d5864" + }, + "truncated": 0, + "non-truncated": 2534, + "padded": 2432, + "non-padded": 102, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "9b4d8993161e637d", + "hash_full_prompts": "08215e527b7e60a5", + "hash_input_tokens": "80afe720f936f8d2", + "hash_cont_tokens": "a8277d186a8c58f5" + }, + "total_evaluation_time_secondes": "4902.9846930503845", + "truncated": 1263, + "non-truncated": 12126, + "padded": 2432, + "non-padded": 10957, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/ibranze/araproje-llama2-7b-hf/results_2023-10-08T20-04-34.106747.json b/eval-results/ibranze/araproje-llama2-7b-hf/results_2023-10-08T20-04-34.106747.json new file mode 100644 index 0000000000000000000000000000000000000000..1154307262a483fde9dcfaadabca5bf1bec61994 --- /dev/null +++ b/eval-results/ibranze/araproje-llama2-7b-hf/results_2023-10-08T20-04-34.106747.json @@ -0,0 +1,1367 @@ +{ + "config_general": { + "model_name": 
"ibranze/araproje-llama2-7b-hf", + "model_sha": "7fe54f507e762b0f62265813aef908765b1298c0", + "model_size": "12.61 GB", + "model_dtype": "torch.float16", + "lighteval_sha": "0f318ecf002208468154899217b3ba7c6ae09374", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|arc:challenge|25": { + "acc": 0.49146757679180886, + "acc_stderr": 0.01460926316563219, + "acc_norm": 0.5307167235494881, + "acc_norm_stderr": 0.014583792546304037 + }, + "harness|hellaswag|10": { + "acc": 0.5884285998805019, + "acc_stderr": 0.0049111251010646425, + "acc_norm": 0.785700059749054, + "acc_norm_stderr": 0.004094971980892084 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.3, + "acc_stderr": 0.046056618647183814, + "acc_norm": 0.3, + "acc_norm_stderr": 0.046056618647183814 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.48148148148148145, + "acc_stderr": 0.043163785995113245, + "acc_norm": 0.48148148148148145, + "acc_norm_stderr": 0.043163785995113245 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.40789473684210525, + "acc_stderr": 0.03999309712777471, + "acc_norm": 0.40789473684210525, + "acc_norm_stderr": 0.03999309712777471 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.53, + "acc_stderr": 0.05016135580465919, + "acc_norm": 0.53, + "acc_norm_stderr": 0.05016135580465919 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.4641509433962264, + "acc_stderr": 0.030693675018458003, + "acc_norm": 0.4641509433962264, + "acc_norm_stderr": 0.030693675018458003 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.4652777777777778, + "acc_stderr": 0.04171115858181618, + "acc_norm": 0.4652777777777778, + "acc_norm_stderr": 0.04171115858181618 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.35, + "acc_stderr": 0.0479372485441102, + "acc_norm": 0.35, + "acc_norm_stderr": 0.0479372485441102 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.33, + "acc_stderr": 0.047258156262526045, + "acc_norm": 0.33, + "acc_norm_stderr": 0.047258156262526045 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.33, + "acc_stderr": 0.047258156262526045, + "acc_norm": 0.33, + "acc_norm_stderr": 0.047258156262526045 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.4277456647398844, + "acc_stderr": 0.037724468575180255, + "acc_norm": 0.4277456647398844, + "acc_norm_stderr": 0.037724468575180255 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.24509803921568626, + "acc_stderr": 0.04280105837364395, + "acc_norm": 0.24509803921568626, + "acc_norm_stderr": 0.04280105837364395 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.6, + "acc_stderr": 0.04923659639173309, + "acc_norm": 0.6, + "acc_norm_stderr": 0.04923659639173309 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.42127659574468085, + "acc_stderr": 0.03227834510146267, + "acc_norm": 0.42127659574468085, + "acc_norm_stderr": 0.03227834510146267 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.2807017543859649, + "acc_stderr": 0.042270544512322004, + "acc_norm": 0.2807017543859649, + "acc_norm_stderr": 0.042270544512322004 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.47586206896551725, + "acc_stderr": 0.041618085035015295, + "acc_norm": 0.47586206896551725, + "acc_norm_stderr": 0.041618085035015295 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 
0.2671957671957672, + "acc_stderr": 0.02278967314577656, + "acc_norm": 0.2671957671957672, + "acc_norm_stderr": 0.02278967314577656 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.2857142857142857, + "acc_stderr": 0.0404061017820884, + "acc_norm": 0.2857142857142857, + "acc_norm_stderr": 0.0404061017820884 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.32, + "acc_stderr": 0.04688261722621503, + "acc_norm": 0.32, + "acc_norm_stderr": 0.04688261722621503 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.5, + "acc_stderr": 0.028444006199428714, + "acc_norm": 0.5, + "acc_norm_stderr": 0.028444006199428714 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.3645320197044335, + "acc_stderr": 0.033864057460620905, + "acc_norm": 0.3645320197044335, + "acc_norm_stderr": 0.033864057460620905 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.4, + "acc_stderr": 0.049236596391733084, + "acc_norm": 0.4, + "acc_norm_stderr": 0.049236596391733084 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.593939393939394, + "acc_stderr": 0.03834816355401181, + "acc_norm": 0.593939393939394, + "acc_norm_stderr": 0.03834816355401181 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.4898989898989899, + "acc_stderr": 0.03561625488673745, + "acc_norm": 0.4898989898989899, + "acc_norm_stderr": 0.03561625488673745 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.6787564766839378, + "acc_stderr": 0.033699508685490674, + "acc_norm": 0.6787564766839378, + "acc_norm_stderr": 0.033699508685490674 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.45897435897435895, + "acc_stderr": 0.025265525491284295, + "acc_norm": 0.45897435897435895, + "acc_norm_stderr": 0.025265525491284295 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.3, + "acc_stderr": 0.027940457136228416, + "acc_norm": 0.3, + "acc_norm_stderr": 0.027940457136228416 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.4411764705882353, + "acc_stderr": 0.0322529423239964, + "acc_norm": 0.4411764705882353, + "acc_norm_stderr": 0.0322529423239964 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.31125827814569534, + "acc_stderr": 0.037804458505267334, + "acc_norm": 0.31125827814569534, + "acc_norm_stderr": 0.037804458505267334 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.6311926605504588, + "acc_stderr": 0.020686227560729555, + "acc_norm": 0.6311926605504588, + "acc_norm_stderr": 0.020686227560729555 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.27314814814814814, + "acc_stderr": 0.03038805130167812, + "acc_norm": 0.27314814814814814, + "acc_norm_stderr": 0.03038805130167812 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.5441176470588235, + "acc_stderr": 0.03495624522015476, + "acc_norm": 0.5441176470588235, + "acc_norm_stderr": 0.03495624522015476 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.6329113924050633, + "acc_stderr": 0.031376240725616185, + "acc_norm": 0.6329113924050633, + "acc_norm_stderr": 0.031376240725616185 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.5650224215246636, + "acc_stderr": 0.033272833702713445, + "acc_norm": 0.5650224215246636, + "acc_norm_stderr": 0.033272833702713445 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.5648854961832062, + "acc_stderr": 0.04348208051644858, + 
"acc_norm": 0.5648854961832062, + "acc_norm_stderr": 0.04348208051644858 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.6528925619834711, + "acc_stderr": 0.043457245702925335, + "acc_norm": 0.6528925619834711, + "acc_norm_stderr": 0.043457245702925335 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.5370370370370371, + "acc_stderr": 0.04820403072760628, + "acc_norm": 0.5370370370370371, + "acc_norm_stderr": 0.04820403072760628 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.5153374233128835, + "acc_stderr": 0.03926522378708843, + "acc_norm": 0.5153374233128835, + "acc_norm_stderr": 0.03926522378708843 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.38392857142857145, + "acc_stderr": 0.04616143075028547, + "acc_norm": 0.38392857142857145, + "acc_norm_stderr": 0.04616143075028547 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.5533980582524272, + "acc_stderr": 0.04922424153458933, + "acc_norm": 0.5533980582524272, + "acc_norm_stderr": 0.04922424153458933 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.6923076923076923, + "acc_stderr": 0.030236389942173085, + "acc_norm": 0.6923076923076923, + "acc_norm_stderr": 0.030236389942173085 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.55, + "acc_stderr": 0.04999999999999999, + "acc_norm": 0.55, + "acc_norm_stderr": 0.04999999999999999 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.6398467432950191, + "acc_stderr": 0.017166362471369306, + "acc_norm": 0.6398467432950191, + "acc_norm_stderr": 0.017166362471369306 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.49421965317919075, + "acc_stderr": 0.026917296179149116, + "acc_norm": 0.49421965317919075, + "acc_norm_stderr": 0.026917296179149116 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.23910614525139665, + "acc_stderr": 0.014265554192331144, + "acc_norm": 0.23910614525139665, + "acc_norm_stderr": 0.014265554192331144 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.49673202614379086, + "acc_stderr": 0.02862930519400354, + "acc_norm": 0.49673202614379086, + "acc_norm_stderr": 0.02862930519400354 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.6012861736334405, + "acc_stderr": 0.0278093225857745, + "acc_norm": 0.6012861736334405, + "acc_norm_stderr": 0.0278093225857745 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.4876543209876543, + "acc_stderr": 0.027812262269327228, + "acc_norm": 0.4876543209876543, + "acc_norm_stderr": 0.027812262269327228 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.3617021276595745, + "acc_stderr": 0.028663820147199492, + "acc_norm": 0.3617021276595745, + "acc_norm_stderr": 0.028663820147199492 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.36114732724902215, + "acc_stderr": 0.01226793547751903, + "acc_norm": 0.36114732724902215, + "acc_norm_stderr": 0.01226793547751903 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.5257352941176471, + "acc_stderr": 0.03033257809455504, + "acc_norm": 0.5257352941176471, + "acc_norm_stderr": 0.03033257809455504 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.4411764705882353, + "acc_stderr": 0.020087362076702857, + "acc_norm": 0.4411764705882353, + "acc_norm_stderr": 0.020087362076702857 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.5272727272727272, + "acc_stderr": 0.04782001791380061, + "acc_norm": 0.5272727272727272, + "acc_norm_stderr": 0.04782001791380061 + }, + 
"harness|hendrycksTest-security_studies|5": { + "acc": 0.4775510204081633, + "acc_stderr": 0.031976941187136725, + "acc_norm": 0.4775510204081633, + "acc_norm_stderr": 0.031976941187136725 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.6318407960199005, + "acc_stderr": 0.03410410565495301, + "acc_norm": 0.6318407960199005, + "acc_norm_stderr": 0.03410410565495301 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.65, + "acc_stderr": 0.047937248544110196, + "acc_norm": 0.65, + "acc_norm_stderr": 0.047937248544110196 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.42168674698795183, + "acc_stderr": 0.03844453181770917, + "acc_norm": 0.42168674698795183, + "acc_norm_stderr": 0.03844453181770917 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.7017543859649122, + "acc_stderr": 0.03508771929824563, + "acc_norm": 0.7017543859649122, + "acc_norm_stderr": 0.03508771929824563 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.2484700122399021, + "mc1_stderr": 0.01512742709652068, + "mc2": 0.3875084099562216, + "mc2_stderr": 0.013510147651392562 + }, + "all": { + "acc": 0.47043597201725107, + "acc_stderr": 0.03529263908245757, + "acc_norm": 0.47444479585837357, + "acc_norm_stderr": 0.03527837427331349, + "mc1": 0.2484700122399021, + "mc1_stderr": 0.01512742709652068, + "mc2": 0.3875084099562216, + "mc2_stderr": 0.013510147651392562 + } + }, + "versions": { + "harness|arc:challenge|25": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + 
"harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "all": 0 + }, + "config_tasks": { + "harness|arc:challenge": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + 
"harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task" + }, + "summary_tasks": { + "harness|arc:challenge|25": { + "hashes": { + "hash_examples": "17b0cae357c0259e", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "3722289b79076c44", + "hash_cont_tokens": "e8abf848493b50f7" + }, + "truncated": 0, + "non-truncated": 4687, + "padded": 4687, + "non-padded": 0, + "effective_few_shots": 25.0, + "num_truncated_few_shots": 0 + }, + "harness|hellaswag|10": { + "hashes": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "ececd684171f1ef2", + "hash_cont_tokens": "9fe0a5c42e1532db" + }, + "truncated": 0, + "non-truncated": 40168, + "padded": 40113, + "non-padded": 55, + "effective_few_shots": 10.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hashes": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "c54ff61ad0273dd7", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-anatomy|5": { + "hashes": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "be31a1e22aef5f90", + "hash_cont_tokens": "f11971a765cb609f" + }, + "truncated": 0, + "non-truncated": 540, + "padded": 540, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-astronomy|5": { + "hashes": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "277a7b1fad566940", + "hash_cont_tokens": "440a970fadecdc7b" + }, + "truncated": 0, + "non-truncated": 608, + "padded": 608, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-business_ethics|5": { + "hashes": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "ba552605bc116de5", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hashes": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + 
"hash_input_tokens": "428c7563d0b98ab9", + "hash_cont_tokens": "7ecd60c25b9bfe5b" + }, + "truncated": 0, + "non-truncated": 1060, + "padded": 1060, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_biology|5": { + "hashes": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "da036601573942e2", + "hash_cont_tokens": "875cde3af7a0ee14" + }, + "truncated": 0, + "non-truncated": 576, + "padded": 576, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_chemistry|5": { + "hashes": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "94e0196d6aded13d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_computer_science|5": { + "hashes": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "6e4d0f4a8d36690b", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_mathematics|5": { + "hashes": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "614054d17109a25d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_medicine|5": { + "hashes": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "081bb2b524defd1c", + "hash_cont_tokens": "702fb6d82ff0d6ac" + }, + "truncated": 0, + "non-truncated": 692, + "padded": 692, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_physics|5": { + "hashes": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "5421d9a1af86cbd4", + "hash_cont_tokens": "f7b8097afc16a47c" + }, + "truncated": 0, + "non-truncated": 408, + "padded": 408, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-computer_security|5": { + "hashes": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "5e6b70ecb333cf18", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hashes": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "c2ef11a87264ceed", + "hash_cont_tokens": "aa0e8bc655f2f641" + }, + "truncated": 0, + "non-truncated": 940, + "padded": 940, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-econometrics|5": { + "hashes": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "ecaccd912a4c3978", + "hash_cont_tokens": "b1cc6e7e9fcd3827" + }, + "truncated": 0, + "non-truncated": 456, + "padded": 456, + 
"non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hashes": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "1590c84291399be8", + "hash_cont_tokens": "2425a3f084a591ef" + }, + "truncated": 0, + "non-truncated": 580, + "padded": 580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hashes": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "3269597f715b0da1", + "hash_cont_tokens": "bd87bf0c060fd925" + }, + "truncated": 0, + "non-truncated": 1512, + "padded": 1512, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-formal_logic|5": { + "hashes": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "a2800d20f3ab8d7c", + "hash_cont_tokens": "eb8932890e0605db" + }, + "truncated": 0, + "non-truncated": 504, + "padded": 504, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-global_facts|5": { + "hashes": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "94ed44b3772505ad", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_biology|5": { + "hashes": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "24423acb928db768", + "hash_cont_tokens": "1ddcb86d28cde266" + }, + "truncated": 0, + "non-truncated": 1240, + "padded": 1240, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hashes": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "831ff35c474e5cef", + "hash_cont_tokens": "176c8dcff38c5f8f" + }, + "truncated": 0, + "non-truncated": 812, + "padded": 812, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hashes": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "a20a96b44dcc5b30", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hashes": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "5002f4ac8b1562ca", + "hash_cont_tokens": "674fc454bdc5ac93" + }, + "truncated": 0, + "non-truncated": 660, + "padded": 656, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_geography|5": { + "hashes": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "7c5547c7da5bc793", + "hash_cont_tokens": "03a5012b916274ea" + }, + "truncated": 0, + "non-truncated": 792, + "padded": 792, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + 
"harness|hendrycksTest-high_school_government_and_politics|5": { + "hashes": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "f62991cb6a496b05", + "hash_cont_tokens": "873d2aab226ba1d8" + }, + "truncated": 0, + "non-truncated": 772, + "padded": 772, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hashes": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "4cef2aff6e3d59ed", + "hash_cont_tokens": "c583432ad27fcfe0" + }, + "truncated": 0, + "non-truncated": 1560, + "padded": 1560, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hashes": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "6e2577ea4082ed2b", + "hash_cont_tokens": "d7907b61bcb8c123" + }, + "truncated": 0, + "non-truncated": 1080, + "padded": 1080, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hashes": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "c5fc9aeb1079c8e4", + "hash_cont_tokens": "f47f041de50333b9" + }, + "truncated": 0, + "non-truncated": 952, + "padded": 952, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_physics|5": { + "hashes": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "555fc385cffa84ca", + "hash_cont_tokens": "0d56317b3e5eedb5" + }, + "truncated": 0, + "non-truncated": 604, + "padded": 604, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hashes": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "febd23cbf9973b7f", + "hash_cont_tokens": "09ba1243e7390c0f" + }, + "truncated": 0, + "non-truncated": 2180, + "padded": 2180, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hashes": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "400e55b56ee6fbd7", + "hash_cont_tokens": "9cc29889c3d3f77d" + }, + "truncated": 0, + "non-truncated": 864, + "padded": 864, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hashes": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "c639cce12a46ebad", + "hash_cont_tokens": "cdd0b3dc06d933e5" + }, + "truncated": 0, + "non-truncated": 816, + "padded": 816, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hashes": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "b9762065cce6f3a6", + "hash_cont_tokens": "e02816433ff28daf" + }, + "truncated": 0, + "non-truncated": 948, + "padded": 948, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_aging|5": { + "hashes": { + "hash_examples": 
"1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "541a75f071dcf579", + "hash_cont_tokens": "142a4a8a1138a214" + }, + "truncated": 0, + "non-truncated": 892, + "padded": 892, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_sexuality|5": { + "hashes": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "04269e5c5a257dd9", + "hash_cont_tokens": "bc54813e809b796d" + }, + "truncated": 0, + "non-truncated": 524, + "padded": 524, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-international_law|5": { + "hashes": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "d93ba9d9d38e4397", + "hash_cont_tokens": "8ea8c5ff76a15bca" + }, + "truncated": 0, + "non-truncated": 484, + "padded": 484, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-jurisprudence|5": { + "hashes": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "9eeaccd2698b4f5a", + "hash_cont_tokens": "e3a8cd951b6e3469" + }, + "truncated": 0, + "non-truncated": 432, + "padded": 432, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hashes": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "b4f08f544f2b7576", + "hash_cont_tokens": "3e9e0bdc248fd88a" + }, + "truncated": 0, + "non-truncated": 652, + "padded": 648, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-machine_learning|5": { + "hashes": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "900c2a51f1174b9f", + "hash_cont_tokens": "55b12fb138c6a064" + }, + "truncated": 0, + "non-truncated": 448, + "padded": 448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-management|5": { + "hashes": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "6b36efb4689c6eca", + "hash_cont_tokens": "a01d6d39a83c4597" + }, + "truncated": 0, + "non-truncated": 412, + "padded": 412, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-marketing|5": { + "hashes": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "2aaac78a0cfed47a", + "hash_cont_tokens": "6aeaed4d823c98aa" + }, + "truncated": 0, + "non-truncated": 936, + "padded": 936, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-medical_genetics|5": { + "hashes": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "886ca823b41c094a", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-miscellaneous|5": { + "hashes": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "72fd71de7675e7d0", + "hash_cont_tokens": "9b0ab02a64603081" + }, + "truncated": 0, + 
"non-truncated": 3132, + "padded": 3132, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_disputes|5": { + "hashes": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "f3ca0dd8e7a1eb09", + "hash_cont_tokens": "3b8bbe9108e55ce9" + }, + "truncated": 0, + "non-truncated": 1384, + "padded": 1354, + "non-padded": 30, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hashes": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "3e793631e951f23c", + "hash_cont_tokens": "3e9bfc0362e97330" + }, + "truncated": 0, + "non-truncated": 3580, + "padded": 3580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-nutrition|5": { + "hashes": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "59753c2144ea93af", + "hash_cont_tokens": "23b2dc6ee2da4cfc" + }, + "truncated": 0, + "non-truncated": 1224, + "padded": 1224, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-philosophy|5": { + "hashes": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "bd8d3dbed15a8c34", + "hash_cont_tokens": "9f6ff69d23a48783" + }, + "truncated": 0, + "non-truncated": 1244, + "padded": 1244, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-prehistory|5": { + "hashes": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "3573cd87facbb7c5", + "hash_cont_tokens": "d6458d743d875837" + }, + "truncated": 0, + "non-truncated": 1296, + "padded": 1296, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_accounting|5": { + "hashes": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "17e721bc1a7cbb47", + "hash_cont_tokens": "922a195f53a35662" + }, + "truncated": 0, + "non-truncated": 1128, + "padded": 1128, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_law|5": { + "hashes": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "c9f7583fff66d361", + "hash_cont_tokens": "2e590029ef41fbcd" + }, + "truncated": 0, + "non-truncated": 6136, + "padded": 6136, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_medicine|5": { + "hashes": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "40a933f829116f8d", + "hash_cont_tokens": "7cfee54dbddd5a98" + }, + "truncated": 0, + "non-truncated": 1088, + "padded": 1088, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_psychology|5": { + "hashes": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "0dfb73a8eb3f692c", + "hash_cont_tokens": "a86677b2a45c20e1" + }, + "truncated": 0, + "non-truncated": 2448, + "padded": 2448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + 
"harness|hendrycksTest-public_relations|5": { + "hashes": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "1710c6ba4c9f3cbd", + "hash_cont_tokens": "0d756ccaae031757" + }, + "truncated": 0, + "non-truncated": 440, + "padded": 440, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-security_studies|5": { + "hashes": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "32a03f1f22a6e103", + "hash_cont_tokens": "b2229bc2cfbf594b" + }, + "truncated": 0, + "non-truncated": 980, + "padded": 980, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-sociology|5": { + "hashes": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "828999f7624cbe7e", + "hash_cont_tokens": "c3a3bdfd177eed5b" + }, + "truncated": 0, + "non-truncated": 804, + "padded": 804, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hashes": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "42054621e718dbee", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-virology|5": { + "hashes": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "6c4f0aa4dc859c04", + "hash_cont_tokens": "af8b3658088cb37f" + }, + "truncated": 0, + "non-truncated": 664, + "padded": 664, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-world_religions|5": { + "hashes": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "6c75d44e092ff24f", + "hash_cont_tokens": "060118bef6de4e0a" + }, + "truncated": 0, + "non-truncated": 684, + "padded": 684, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|truthfulqa:mc|0": { + "hashes": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "2738d7ed7075faa7", + "hash_cont_tokens": "f5da56a132aab151" + }, + "truncated": 0, + "non-truncated": 9996, + "padded": 9996, + "non-padded": 0, + "effective_few_shots": 0.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "d84d18e9a963753d", + "hash_full_prompts": "12b540783521a8e6", + "hash_input_tokens": "5c73a7dce6ccf737", + "hash_cont_tokens": "71d56183130fecbd" + }, + "total_evaluation_time_secondes": "4275.146588087082", + "truncated": 0, + "non-truncated": 111019, + "padded": 110926, + "non-padded": 93, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/ibranze/araproje-llama2-7b-hf/results_2023-10-26T05-17-54.107073.json b/eval-results/ibranze/araproje-llama2-7b-hf/results_2023-10-26T05-17-54.107073.json new file mode 100644 index 0000000000000000000000000000000000000000..eb6cd08453f20d3a99caca05c13abfdef1d518a8 --- /dev/null +++ b/eval-results/ibranze/araproje-llama2-7b-hf/results_2023-10-26T05-17-54.107073.json @@ -0,0 +1,107 @@ +{ + "config_general": { + "model_name": "ibranze/araproje-llama2-7b-hf", + "model_sha": 
"7fe54f507e762b0f62265813aef908765b1298c0", + "model_size": "12.61 GB", + "model_dtype": "torch.float16", + "lighteval_sha": "0f318ecf002208468154899217b3ba7c6ae09374", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|drop|3": { + "em": 0.0012583892617449664, + "em_stderr": 0.00036305608931194434, + "f1": 0.055925964765100665, + "f1_stderr": 0.0013181664771628632 + }, + "harness|gsm8k|5": { + "acc": 0.0712661106899166, + "acc_stderr": 0.007086462127954491 + }, + "harness|winogrande|5": { + "acc": 0.7403314917127072, + "acc_stderr": 0.012322700705552667 + }, + "all": { + "em": 0.0012583892617449664, + "em_stderr": 0.00036305608931194434, + "f1": 0.055925964765100665, + "f1_stderr": 0.0013181664771628632, + "acc": 0.4057988012013119, + "acc_stderr": 0.00970458141675358 + } + }, + "versions": { + "harness|drop|3": 1, + "harness|gsm8k|5": 0, + "harness|winogrande|5": 0, + "all": 0 + }, + "config_tasks": { + "harness|drop": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|drop|3": { + "hashes": { + "hash_examples": "1d27416e8324e9a3", + "hash_full_prompts": "a5513ff9a741b385", + "hash_input_tokens": "42076f0efbb50aa6", + "hash_cont_tokens": "ef74ade15eb78da6" + }, + "truncated": 3, + "non-truncated": 9533, + "padded": 0, + "non-padded": 9536, + "effective_few_shots": 3.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "bda342e47b5099b2", + "hash_cont_tokens": "542d7b742ca594d0" + }, + "truncated": 0, + "non-truncated": 1319, + "padded": 0, + "non-padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "c0bedf98cb040854", + "hash_cont_tokens": "f08975ad6f2d5864" + }, + "truncated": 0, + "non-truncated": 2534, + "padded": 2432, + "non-padded": 102, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "9b4d8993161e637d", + "hash_full_prompts": "08215e527b7e60a5", + "hash_input_tokens": "a12f3e3c934bd78b", + "hash_cont_tokens": "58a2c19976e6dde8" + }, + "total_evaluation_time_secondes": "9980.373613119125", + "truncated": 3, + "non-truncated": 13386, + "padded": 2432, + "non-padded": 10957, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/internlm/internlm-20b-chat/results_2023-09-18T17-16-38.542229.json b/eval-results/internlm/internlm-20b-chat/results_2023-09-18T17-16-38.542229.json new file mode 100644 index 0000000000000000000000000000000000000000..1f3850476feece47025ceacdbf6cd012c306997c --- /dev/null +++ b/eval-results/internlm/internlm-20b-chat/results_2023-09-18T17-16-38.542229.json @@ -0,0 +1,1367 @@ +{ + "config_general": { + "model_name": "internlm/internlm-20b-chat", + "model_sha": "79946225fa7a215e0ebcf4440a9cce88e475deaa", + "model_size": "37.54 GB", + "model_dtype": "torch.float16", + "lighteval_sha": "0f318ecf002208468154899217b3ba7c6ae09374", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|arc:challenge|25": { + "acc": 0.5051194539249146, + "acc_stderr": 0.014610624890309157, + 
"acc_norm": 0.5537542662116041, + "acc_norm_stderr": 0.014526705548539982 + }, + "harness|hellaswag|10": { + "acc": 0.5822545309699263, + "acc_stderr": 0.0049217984926087826, + "acc_norm": 0.7857996415056762, + "acc_norm_stderr": 0.004094279871733679 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.33, + "acc_stderr": 0.04725815626252605, + "acc_norm": 0.33, + "acc_norm_stderr": 0.04725815626252605 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.5407407407407407, + "acc_stderr": 0.04304979692464242, + "acc_norm": 0.5407407407407407, + "acc_norm_stderr": 0.04304979692464242 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.5789473684210527, + "acc_stderr": 0.04017901275981748, + "acc_norm": 0.5789473684210527, + "acc_norm_stderr": 0.04017901275981748 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.63, + "acc_stderr": 0.04852365870939099, + "acc_norm": 0.63, + "acc_norm_stderr": 0.04852365870939099 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.6, + "acc_stderr": 0.030151134457776285, + "acc_norm": 0.6, + "acc_norm_stderr": 0.030151134457776285 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.625, + "acc_stderr": 0.04048439222695598, + "acc_norm": 0.625, + "acc_norm_stderr": 0.04048439222695598 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.41, + "acc_stderr": 0.049431107042371025, + "acc_norm": 0.41, + "acc_norm_stderr": 0.049431107042371025 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.51, + "acc_stderr": 0.05024183937956912, + "acc_norm": 0.51, + "acc_norm_stderr": 0.05024183937956912 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.38, + "acc_stderr": 0.04878317312145633, + "acc_norm": 0.38, + "acc_norm_stderr": 0.04878317312145633 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.5260115606936416, + "acc_stderr": 0.03807301726504514, + "acc_norm": 0.5260115606936416, + "acc_norm_stderr": 0.03807301726504514 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.4215686274509804, + "acc_stderr": 0.049135952012744975, + "acc_norm": 0.4215686274509804, + "acc_norm_stderr": 0.049135952012744975 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.68, + "acc_stderr": 0.046882617226215055, + "acc_norm": 0.68, + "acc_norm_stderr": 0.046882617226215055 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.451063829787234, + "acc_stderr": 0.032529096196131965, + "acc_norm": 0.451063829787234, + "acc_norm_stderr": 0.032529096196131965 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.32456140350877194, + "acc_stderr": 0.044045561573747664, + "acc_norm": 0.32456140350877194, + "acc_norm_stderr": 0.044045561573747664 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.5586206896551724, + "acc_stderr": 0.04137931034482757, + "acc_norm": 0.5586206896551724, + "acc_norm_stderr": 0.04137931034482757 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.3783068783068783, + "acc_stderr": 0.02497695405315523, + "acc_norm": 0.3783068783068783, + "acc_norm_stderr": 0.02497695405315523 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.3888888888888889, + "acc_stderr": 0.04360314860077459, + "acc_norm": 0.3888888888888889, + "acc_norm_stderr": 0.04360314860077459 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.35, + "acc_stderr": 0.047937248544110196, + "acc_norm": 0.35, + "acc_norm_stderr": 0.047937248544110196 + }, + 
"harness|hendrycksTest-high_school_biology|5": { + "acc": 0.6612903225806451, + "acc_stderr": 0.026923446059302834, + "acc_norm": 0.6612903225806451, + "acc_norm_stderr": 0.026923446059302834 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.43349753694581283, + "acc_stderr": 0.03486731727419872, + "acc_norm": 0.43349753694581283, + "acc_norm_stderr": 0.03486731727419872 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.66, + "acc_stderr": 0.04760952285695237, + "acc_norm": 0.66, + "acc_norm_stderr": 0.04760952285695237 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.7636363636363637, + "acc_stderr": 0.03317505930009182, + "acc_norm": 0.7636363636363637, + "acc_norm_stderr": 0.03317505930009182 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.7525252525252525, + "acc_stderr": 0.030746300742124498, + "acc_norm": 0.7525252525252525, + "acc_norm_stderr": 0.030746300742124498 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.8549222797927462, + "acc_stderr": 0.025416343096306443, + "acc_norm": 0.8549222797927462, + "acc_norm_stderr": 0.025416343096306443 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.5692307692307692, + "acc_stderr": 0.02510682066053975, + "acc_norm": 0.5692307692307692, + "acc_norm_stderr": 0.02510682066053975 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.3037037037037037, + "acc_stderr": 0.028037929969114993, + "acc_norm": 0.3037037037037037, + "acc_norm_stderr": 0.028037929969114993 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.5630252100840336, + "acc_stderr": 0.03221943636566196, + "acc_norm": 0.5630252100840336, + "acc_norm_stderr": 0.03221943636566196 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.36423841059602646, + "acc_stderr": 0.03929111781242741, + "acc_norm": 0.36423841059602646, + "acc_norm_stderr": 0.03929111781242741 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.7908256880733945, + "acc_stderr": 0.017437937173343233, + "acc_norm": 0.7908256880733945, + "acc_norm_stderr": 0.017437937173343233 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.4537037037037037, + "acc_stderr": 0.033953227263757976, + "acc_norm": 0.4537037037037037, + "acc_norm_stderr": 0.033953227263757976 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.7598039215686274, + "acc_stderr": 0.02998373305591361, + "acc_norm": 0.7598039215686274, + "acc_norm_stderr": 0.02998373305591361 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.7890295358649789, + "acc_stderr": 0.02655837250266192, + "acc_norm": 0.7890295358649789, + "acc_norm_stderr": 0.02655837250266192 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.6547085201793722, + "acc_stderr": 0.031911001928357954, + "acc_norm": 0.6547085201793722, + "acc_norm_stderr": 0.031911001928357954 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.6641221374045801, + "acc_stderr": 0.041423137719966634, + "acc_norm": 0.6641221374045801, + "acc_norm_stderr": 0.041423137719966634 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.743801652892562, + "acc_stderr": 0.039849796533028725, + "acc_norm": 0.743801652892562, + "acc_norm_stderr": 0.039849796533028725 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.7222222222222222, + "acc_stderr": 0.04330043749650744, + "acc_norm": 0.7222222222222222, + 
"acc_norm_stderr": 0.04330043749650744 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.6871165644171779, + "acc_stderr": 0.03642914578292406, + "acc_norm": 0.6871165644171779, + "acc_norm_stderr": 0.03642914578292406 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.4107142857142857, + "acc_stderr": 0.04669510663875191, + "acc_norm": 0.4107142857142857, + "acc_norm_stderr": 0.04669510663875191 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.7281553398058253, + "acc_stderr": 0.044052680241409216, + "acc_norm": 0.7281553398058253, + "acc_norm_stderr": 0.044052680241409216 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.8504273504273504, + "acc_stderr": 0.023365051491753715, + "acc_norm": 0.8504273504273504, + "acc_norm_stderr": 0.023365051491753715 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.61, + "acc_stderr": 0.04902071300001975, + "acc_norm": 0.61, + "acc_norm_stderr": 0.04902071300001975 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.756066411238825, + "acc_stderr": 0.015357212665829468, + "acc_norm": 0.756066411238825, + "acc_norm_stderr": 0.015357212665829468 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.6473988439306358, + "acc_stderr": 0.025722802200895803, + "acc_norm": 0.6473988439306358, + "acc_norm_stderr": 0.025722802200895803 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.39888268156424583, + "acc_stderr": 0.01637696614261008, + "acc_norm": 0.39888268156424583, + "acc_norm_stderr": 0.01637696614261008 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.6437908496732027, + "acc_stderr": 0.027420477662629245, + "acc_norm": 0.6437908496732027, + "acc_norm_stderr": 0.027420477662629245 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.6334405144694534, + "acc_stderr": 0.02736807824397165, + "acc_norm": 0.6334405144694534, + "acc_norm_stderr": 0.02736807824397165 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.6172839506172839, + "acc_stderr": 0.02704453813840259, + "acc_norm": 0.6172839506172839, + "acc_norm_stderr": 0.02704453813840259 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.42907801418439717, + "acc_stderr": 0.02952591430255856, + "acc_norm": 0.42907801418439717, + "acc_norm_stderr": 0.02952591430255856 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.43741851368970014, + "acc_stderr": 0.012669813464935726, + "acc_norm": 0.43741851368970014, + "acc_norm_stderr": 0.012669813464935726 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.5551470588235294, + "acc_stderr": 0.03018753206032938, + "acc_norm": 0.5551470588235294, + "acc_norm_stderr": 0.03018753206032938 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.5964052287581699, + "acc_stderr": 0.019848280168401157, + "acc_norm": 0.5964052287581699, + "acc_norm_stderr": 0.019848280168401157 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.6545454545454545, + "acc_stderr": 0.04554619617541054, + "acc_norm": 0.6545454545454545, + "acc_norm_stderr": 0.04554619617541054 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.689795918367347, + "acc_stderr": 0.029613459872484378, + "acc_norm": 0.689795918367347, + "acc_norm_stderr": 0.029613459872484378 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.7810945273631841, + "acc_stderr": 0.029239174636647, + "acc_norm": 0.7810945273631841, + "acc_norm_stderr": 0.029239174636647 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 
0.79, + "acc_stderr": 0.040936018074033256, + "acc_norm": 0.79, + "acc_norm_stderr": 0.040936018074033256 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.4879518072289157, + "acc_stderr": 0.0389136449583582, + "acc_norm": 0.4879518072289157, + "acc_norm_stderr": 0.0389136449583582 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.7719298245614035, + "acc_stderr": 0.032180937956023566, + "acc_norm": 0.7719298245614035, + "acc_norm_stderr": 0.032180937956023566 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.29498164014687883, + "mc1_stderr": 0.015964400965589667, + "mc2": 0.43223188090265946, + "mc2_stderr": 0.014453414586538413 + }, + "all": { + "acc": 0.5839324464870065, + "acc_stderr": 0.034263089521573094, + "acc_norm": 0.5882066824670478, + "acc_norm_stderr": 0.03424764142051133, + "mc1": 0.29498164014687883, + "mc1_stderr": 0.015964400965589667, + "mc2": 0.43223188090265946, + "mc2_stderr": 0.014453414586538413 + } + }, + "versions": { + "harness|arc:challenge|25": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + 
"harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "all": 0 + }, + "config_tasks": { + "harness|arc:challenge": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM 
Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task" + }, + "summary_tasks": { + "harness|arc:challenge|25": { + "hashes": { + "hash_examples": "17b0cae357c0259e", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "ab6d1c3d70fc1049", + "hash_cont_tokens": "3b5ae8a16b9ca33e" + }, + "truncated": 0, + "non-truncated": 4687, + "padded": 4687, + "non-padded": 0, + "effective_few_shots": 25.0, + "num_truncated_few_shots": 0 + }, + "harness|hellaswag|10": { + "hashes": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "4adeb76b031573a1", + "hash_cont_tokens": "e07ac1a581cd0c0c" + }, + "truncated": 0, + "non-truncated": 40168, + "padded": 40137, + "non-padded": 31, + "effective_few_shots": 10.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hashes": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "6481c2fba7190478", + "hash_cont_tokens": "e58c016de340de83" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-anatomy|5": { + "hashes": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "e07c9163ac5081a3", + "hash_cont_tokens": "3448d00acc7a11c6" + }, + "truncated": 0, + "non-truncated": 540, + "padded": 540, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-astronomy|5": { + "hashes": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "539ef21f133e917c", + "hash_cont_tokens": "9b5285416fa903e2" + }, + "truncated": 0, + "non-truncated": 608, + "padded": 608, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-business_ethics|5": { + "hashes": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "bd43555e5256f484", + "hash_cont_tokens": "e58c016de340de83" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hashes": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "15663115b477062a", + "hash_cont_tokens": "0382995cfcc24e3e" + }, + "truncated": 0, + "non-truncated": 1060, + "padded": 1060, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_biology|5": { + "hashes": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "81e10086d9a67a89", + "hash_cont_tokens": "09ef20d27e0286fe" + }, + 
"truncated": 0, + "non-truncated": 576, + "padded": 572, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_chemistry|5": { + "hashes": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "822ee9d85dd90211", + "hash_cont_tokens": "e58c016de340de83" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_computer_science|5": { + "hashes": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "30010413b5397d2a", + "hash_cont_tokens": "e58c016de340de83" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_mathematics|5": { + "hashes": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "29b48c03b0b8f848", + "hash_cont_tokens": "e58c016de340de83" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_medicine|5": { + "hashes": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "02ae47c6ad730c28", + "hash_cont_tokens": "2115091b39764e96" + }, + "truncated": 0, + "non-truncated": 692, + "padded": 680, + "non-padded": 12, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_physics|5": { + "hashes": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "ba5fa0250e3d6c30", + "hash_cont_tokens": "253e8f65a34d2f2b" + }, + "truncated": 0, + "non-truncated": 408, + "padded": 404, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-computer_security|5": { + "hashes": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "9505189fb7c44b4e", + "hash_cont_tokens": "e58c016de340de83" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hashes": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "53c3c332289e7b04", + "hash_cont_tokens": "863770146d3e3341" + }, + "truncated": 0, + "non-truncated": 940, + "padded": 940, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-econometrics|5": { + "hashes": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "f92b13ed95e3c111", + "hash_cont_tokens": "547784fe0135a15c" + }, + "truncated": 0, + "non-truncated": 456, + "padded": 456, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hashes": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "e71d172f814a80c2", + "hash_cont_tokens": "545e7978a9a2e921" + }, + "truncated": 0, + "non-truncated": 580, + "padded": 576, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + 
"harness|hendrycksTest-elementary_mathematics|5": { + "hashes": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "5e819b51881bb2f0", + "hash_cont_tokens": "6220dafecd3e71a1" + }, + "truncated": 0, + "non-truncated": 1512, + "padded": 1512, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-formal_logic|5": { + "hashes": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "f12da3535e4dc175", + "hash_cont_tokens": "9e1c83b748056f05" + }, + "truncated": 0, + "non-truncated": 504, + "padded": 504, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-global_facts|5": { + "hashes": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "bfc70762fbc3c00a", + "hash_cont_tokens": "e58c016de340de83" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_biology|5": { + "hashes": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "1cd3d68c399c9798", + "hash_cont_tokens": "c0ee938431d4cce1" + }, + "truncated": 0, + "non-truncated": 1240, + "padded": 1240, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hashes": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "4bf07860f9a05ea0", + "hash_cont_tokens": "2fd86b22bfa1c8cb" + }, + "truncated": 0, + "non-truncated": 812, + "padded": 812, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hashes": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "f18950fe2d6b0852", + "hash_cont_tokens": "e58c016de340de83" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hashes": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "ff596bb41009df48", + "hash_cont_tokens": "8d52dfdbe7373dec" + }, + "truncated": 660, + "non-truncated": 0, + "padded": 0, + "non-padded": 660, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_geography|5": { + "hashes": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "94857240d8fb30ec", + "hash_cont_tokens": "7daa2bbedae272e1" + }, + "truncated": 0, + "non-truncated": 792, + "padded": 792, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hashes": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "04cd00103bdeb0e8", + "hash_cont_tokens": "530e7985f90589ad" + }, + "truncated": 0, + "non-truncated": 772, + "padded": 772, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hashes": { + "hash_examples": 
"59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "be211f365573815f", + "hash_cont_tokens": "8abfdac40b0aa157" + }, + "truncated": 0, + "non-truncated": 1560, + "padded": 1560, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hashes": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "61b56ac65f52bd9f", + "hash_cont_tokens": "0450a3d8e715e926" + }, + "truncated": 0, + "non-truncated": 1080, + "padded": 1080, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hashes": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "2d068a1f6fa6447c", + "hash_cont_tokens": "3e477b8a15ec619c" + }, + "truncated": 0, + "non-truncated": 952, + "padded": 952, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_physics|5": { + "hashes": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "751c9d3d45725100", + "hash_cont_tokens": "f0648b1ae17e3c3f" + }, + "truncated": 0, + "non-truncated": 604, + "padded": 604, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hashes": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "e67193bef752e0a2", + "hash_cont_tokens": "71a621b85c8384ec" + }, + "truncated": 0, + "non-truncated": 2180, + "padded": 2179, + "non-padded": 1, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hashes": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "a2a440e523363a80", + "hash_cont_tokens": "507dec89f16c35ea" + }, + "truncated": 0, + "non-truncated": 864, + "padded": 864, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hashes": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "9ccd9b23a7e8b60b", + "hash_cont_tokens": "fe66e65deac902bb" + }, + "truncated": 816, + "non-truncated": 0, + "padded": 0, + "non-padded": 816, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hashes": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "a39dd570f7b916b5", + "hash_cont_tokens": "7fe519011d639dc8" + }, + "truncated": 0, + "non-truncated": 948, + "padded": 948, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_aging|5": { + "hashes": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "7f5bf8be5e049ee7", + "hash_cont_tokens": "77ba99656e04ddd0" + }, + "truncated": 0, + "non-truncated": 892, + "padded": 892, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_sexuality|5": { + "hashes": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "54bd52aca1127ab7", + 
"hash_cont_tokens": "bc8f34ada52ca31e" + }, + "truncated": 0, + "non-truncated": 524, + "padded": 524, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-international_law|5": { + "hashes": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "3f43f1b61ea3ce0a", + "hash_cont_tokens": "d4b66c0f10b911b8" + }, + "truncated": 0, + "non-truncated": 484, + "padded": 484, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-jurisprudence|5": { + "hashes": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "44ed1d9fa69f8186", + "hash_cont_tokens": "f7ea9e092aff54a4" + }, + "truncated": 0, + "non-truncated": 432, + "padded": 432, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hashes": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "bfab6f0ecaac1ebf", + "hash_cont_tokens": "9e305ec3d994de5c" + }, + "truncated": 0, + "non-truncated": 652, + "padded": 648, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-machine_learning|5": { + "hashes": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "5f80df97c3c4cc0c", + "hash_cont_tokens": "85f6ff4f34ded537" + }, + "truncated": 0, + "non-truncated": 448, + "padded": 448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-management|5": { + "hashes": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "ee080212a243ddeb", + "hash_cont_tokens": "1f24f5bf907f5f28" + }, + "truncated": 0, + "non-truncated": 412, + "padded": 412, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-marketing|5": { + "hashes": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "3a52fb801992c276", + "hash_cont_tokens": "37062ffd1e129b49" + }, + "truncated": 0, + "non-truncated": 936, + "padded": 936, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-medical_genetics|5": { + "hashes": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "a619808a23365e33", + "hash_cont_tokens": "e58c016de340de83" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-miscellaneous|5": { + "hashes": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "5e4a6c9f25859265", + "hash_cont_tokens": "64725e71e0bff006" + }, + "truncated": 0, + "non-truncated": 3132, + "padded": 3132, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_disputes|5": { + "hashes": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "7aaba52ab7fcd1b5", + "hash_cont_tokens": "d73b7e792a1de62d" + }, + "truncated": 0, + "non-truncated": 1384, + "padded": 1332, + "non-padded": 52, + "effective_few_shots": 5.0, + 
"num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hashes": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "5fab9a0922852979", + "hash_cont_tokens": "291bc548e95ea24c" + }, + "truncated": 0, + "non-truncated": 3580, + "padded": 3580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-nutrition|5": { + "hashes": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "6ee5db1fa41a7d98", + "hash_cont_tokens": "4159368fbefa62ba" + }, + "truncated": 0, + "non-truncated": 1224, + "padded": 1224, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-philosophy|5": { + "hashes": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "020fc3b1cbcbd5d2", + "hash_cont_tokens": "b3758c79335b5e25" + }, + "truncated": 0, + "non-truncated": 1244, + "padded": 1240, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-prehistory|5": { + "hashes": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "75f96699a1420f83", + "hash_cont_tokens": "c7aff90b52b3c210" + }, + "truncated": 0, + "non-truncated": 1296, + "padded": 1296, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_accounting|5": { + "hashes": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "17a1e44b80641b10", + "hash_cont_tokens": "8fd4fe19db20b33f" + }, + "truncated": 0, + "non-truncated": 1128, + "padded": 1128, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_law|5": { + "hashes": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "1ef195957d6ce2d8", + "hash_cont_tokens": "70fdfc3a3cdab2b2" + }, + "truncated": 24, + "non-truncated": 6112, + "padded": 6112, + "non-padded": 24, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_medicine|5": { + "hashes": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "daf8d8353ee75752", + "hash_cont_tokens": "2662c15f3eee1572" + }, + "truncated": 0, + "non-truncated": 1088, + "padded": 1088, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_psychology|5": { + "hashes": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "dc25cc58ce67d22c", + "hash_cont_tokens": "7b998c3f691a5888" + }, + "truncated": 0, + "non-truncated": 2448, + "padded": 2448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-public_relations|5": { + "hashes": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "a6acb474e3c26b82", + "hash_cont_tokens": "9884d7f2589a4eec" + }, + "truncated": 0, + "non-truncated": 440, + "padded": 440, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-security_studies|5": { + "hashes": { + "hash_examples": "62bb8197e63d60d4", + 
"hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "2a8612c9136b36ea", + "hash_cont_tokens": "87576f25f4731ef0" + }, + "truncated": 0, + "non-truncated": 980, + "padded": 980, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-sociology|5": { + "hashes": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "8eb4cf86c2ce8485", + "hash_cont_tokens": "bba9af89c33fad2f" + }, + "truncated": 0, + "non-truncated": 804, + "padded": 800, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hashes": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "b76e3c3b93b3f8bf", + "hash_cont_tokens": "e58c016de340de83" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-virology|5": { + "hashes": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "232b5d2c3d25603b", + "hash_cont_tokens": "16a5fb37a6047671" + }, + "truncated": 0, + "non-truncated": 664, + "padded": 664, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-world_religions|5": { + "hashes": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "c9c5e1d965ab2da3", + "hash_cont_tokens": "65fd69dde784be8d" + }, + "truncated": 0, + "non-truncated": 684, + "padded": 684, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|truthfulqa:mc|0": { + "hashes": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "6d63a24c235684df", + "hash_cont_tokens": "c2881c2ce51fc82e" + }, + "truncated": 0, + "non-truncated": 9996, + "padded": 9996, + "non-padded": 0, + "effective_few_shots": 0.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "d84d18e9a963753d", + "hash_full_prompts": "12b540783521a8e6", + "hash_input_tokens": "5748e7db7a468b88", + "hash_cont_tokens": "d1899133b79ddfc3" + }, + "total_evaluation_time_secondes": "6323.855161905289", + "truncated": 1500, + "non-truncated": 109519, + "padded": 109399, + "non-padded": 1620, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/internlm/internlm-20b-chat/results_2023-11-08T17-10-14.815999.json b/eval-results/internlm/internlm-20b-chat/results_2023-11-08T17-10-14.815999.json new file mode 100644 index 0000000000000000000000000000000000000000..4df47cf3188b16d3bd41b158d3f4d22d272ffefc --- /dev/null +++ b/eval-results/internlm/internlm-20b-chat/results_2023-11-08T17-10-14.815999.json @@ -0,0 +1,107 @@ +{ + "config_general": { + "lighteval_sha": "167773f1d5d1647c60dadc31c9e731ab7dbcbbad", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "", + "model_name": "internlm/internlm-20b-chat", + "model_sha": "d82fc5509ec3d0ef7cca9847ab1397bc9781faaf", + "model_dtype": "torch.float16", + "model_size": "37.65 GB" + }, + "results": { + "harness|drop|3": { + "em": 0.016149328859060404, + "em_stderr": 0.001290866955681033, + "f1": 0.10652579697986504, + "f1_stderr": 0.0024253657455140664 + }, + "harness|gsm8k|5": { + "acc": 
0.18726307808946172, + "acc_stderr": 0.01074591419951081 + }, + "harness|winogrande|5": { + "acc": 0.7876874506708761, + "acc_stderr": 0.011493384687249779 + }, + "all": { + "em": 0.016149328859060404, + "em_stderr": 0.001290866955681033, + "f1": 0.10652579697986504, + "f1_stderr": 0.0024253657455140664, + "acc": 0.4874752643801689, + "acc_stderr": 0.011119649443380293 + } + }, + "versions": { + "all": 0, + "harness|drop|3": 1, + "harness|gsm8k|5": 0, + "harness|winogrande|5": 0 + }, + "config_tasks": { + "harness|drop": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|drop|3": { + "hashes": { + "hash_examples": "1d27416e8324e9a3", + "hash_full_prompts": "a5513ff9a741b385", + "hash_input_tokens": "c8087562d3d7677b", + "hash_cont_tokens": "493d3ed66c66e6c3" + }, + "truncated": 0, + "non_truncated": 9536, + "padded": 0, + "non_padded": 9536, + "effective_few_shots": 3.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "50087df0eb7e53b4", + "hash_cont_tokens": "5d99dacd175ae5c1" + }, + "truncated": 0, + "non_truncated": 1319, + "padded": 0, + "non_padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "df07f13dc1768541", + "hash_cont_tokens": "d6a6dcb7ee4ab350" + }, + "truncated": 0, + "non_truncated": 1267, + "padded": 2375, + "non_padded": 159, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "9b4d8993161e637d", + "hash_full_prompts": "08215e527b7e60a5", + "hash_input_tokens": "c9cc2ba13c63638d", + "hash_cont_tokens": "eb32b1d3613799e3" + }, + "truncated": 0, + "non_truncated": 12122, + "padded": 2375, + "non_padded": 11014, + "num_truncated_few_shots": 0, + "total_evaluation_time_secondes": 0 + } +} \ No newline at end of file diff --git a/eval-results/internlm/internlm-20b/results_2023-09-18T17-21-43.333495.json b/eval-results/internlm/internlm-20b/results_2023-09-18T17-21-43.333495.json new file mode 100644 index 0000000000000000000000000000000000000000..ac9f2a8d0f01995502cd5330e14467d3c248ab99 --- /dev/null +++ b/eval-results/internlm/internlm-20b/results_2023-09-18T17-21-43.333495.json @@ -0,0 +1,1367 @@ +{ + "config_general": { + "model_name": "internlm/internlm-20b", + "model_sha": "b8825fe3394608fe84f0f5eb6471454384fb83aa", + "model_size": "37.54 GB", + "model_dtype": "torch.float16", + "lighteval_sha": "0f318ecf002208468154899217b3ba7c6ae09374", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|arc:challenge|25": { + "acc": 0.5537542662116041, + "acc_stderr": 0.014526705548539982, + "acc_norm": 0.6049488054607508, + "acc_norm_stderr": 0.014285898292938167 + }, + "harness|hellaswag|10": { + "acc": 0.6195976897032464, + "acc_stderr": 0.004844935327599204, + "acc_norm": 0.8212507468631747, + "acc_norm_stderr": 0.0038235918141330317 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.27, + "acc_stderr": 0.04461960433384739, + "acc_norm": 0.27, + "acc_norm_stderr": 0.04461960433384739 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.5333333333333333, + "acc_stderr": 0.043097329010363554, + "acc_norm": 
0.5333333333333333, + "acc_norm_stderr": 0.043097329010363554 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.6710526315789473, + "acc_stderr": 0.038234289699266046, + "acc_norm": 0.6710526315789473, + "acc_norm_stderr": 0.038234289699266046 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.64, + "acc_stderr": 0.04824181513244218, + "acc_norm": 0.64, + "acc_norm_stderr": 0.04824181513244218 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.6226415094339622, + "acc_stderr": 0.029832808114796, + "acc_norm": 0.6226415094339622, + "acc_norm_stderr": 0.029832808114796 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.7361111111111112, + "acc_stderr": 0.03685651095897532, + "acc_norm": 0.7361111111111112, + "acc_norm_stderr": 0.03685651095897532 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.43, + "acc_stderr": 0.049756985195624284, + "acc_norm": 0.43, + "acc_norm_stderr": 0.049756985195624284 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.53, + "acc_stderr": 0.05016135580465919, + "acc_norm": 0.53, + "acc_norm_stderr": 0.05016135580465919 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.4, + "acc_stderr": 0.049236596391733084, + "acc_norm": 0.4, + "acc_norm_stderr": 0.049236596391733084 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.6127167630057804, + "acc_stderr": 0.037143259063020656, + "acc_norm": 0.6127167630057804, + "acc_norm_stderr": 0.037143259063020656 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.4019607843137255, + "acc_stderr": 0.04878608714466996, + "acc_norm": 0.4019607843137255, + "acc_norm_stderr": 0.04878608714466996 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.71, + "acc_stderr": 0.045604802157206845, + "acc_norm": 0.71, + "acc_norm_stderr": 0.045604802157206845 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.5319148936170213, + "acc_stderr": 0.03261936918467383, + "acc_norm": 0.5319148936170213, + "acc_norm_stderr": 0.03261936918467383 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.3684210526315789, + "acc_stderr": 0.04537815354939392, + "acc_norm": 0.3684210526315789, + "acc_norm_stderr": 0.04537815354939392 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.5517241379310345, + "acc_stderr": 0.04144311810878152, + "acc_norm": 0.5517241379310345, + "acc_norm_stderr": 0.04144311810878152 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.3994708994708995, + "acc_stderr": 0.025225450284067877, + "acc_norm": 0.3994708994708995, + "acc_norm_stderr": 0.025225450284067877 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.373015873015873, + "acc_stderr": 0.04325506042017086, + "acc_norm": 0.373015873015873, + "acc_norm_stderr": 0.04325506042017086 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.31, + "acc_stderr": 0.04648231987117316, + "acc_norm": 0.31, + "acc_norm_stderr": 0.04648231987117316 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.7580645161290323, + "acc_stderr": 0.024362599693031096, + "acc_norm": 0.7580645161290323, + "acc_norm_stderr": 0.024362599693031096 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.4827586206896552, + "acc_stderr": 0.035158955511656986, + "acc_norm": 0.4827586206896552, + "acc_norm_stderr": 0.035158955511656986 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.71, + "acc_stderr": 0.045604802157206845, + "acc_norm": 0.71, + 
"acc_norm_stderr": 0.045604802157206845 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.7878787878787878, + "acc_stderr": 0.03192271569548302, + "acc_norm": 0.7878787878787878, + "acc_norm_stderr": 0.03192271569548302 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.7727272727272727, + "acc_stderr": 0.02985751567338642, + "acc_norm": 0.7727272727272727, + "acc_norm_stderr": 0.02985751567338642 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.9015544041450777, + "acc_stderr": 0.021500249576033467, + "acc_norm": 0.9015544041450777, + "acc_norm_stderr": 0.021500249576033467 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.6, + "acc_stderr": 0.02483881198803316, + "acc_norm": 0.6, + "acc_norm_stderr": 0.02483881198803316 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.3296296296296296, + "acc_stderr": 0.028661201116524575, + "acc_norm": 0.3296296296296296, + "acc_norm_stderr": 0.028661201116524575 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.5840336134453782, + "acc_stderr": 0.032016501007396114, + "acc_norm": 0.5840336134453782, + "acc_norm_stderr": 0.032016501007396114 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.31788079470198677, + "acc_stderr": 0.038020397601079024, + "acc_norm": 0.31788079470198677, + "acc_norm_stderr": 0.038020397601079024 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.8330275229357799, + "acc_stderr": 0.015990154885073382, + "acc_norm": 0.8330275229357799, + "acc_norm_stderr": 0.015990154885073382 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.5879629629629629, + "acc_stderr": 0.03356787758160831, + "acc_norm": 0.5879629629629629, + "acc_norm_stderr": 0.03356787758160831 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.7843137254901961, + "acc_stderr": 0.028867431449849313, + "acc_norm": 0.7843137254901961, + "acc_norm_stderr": 0.028867431449849313 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.8059071729957806, + "acc_stderr": 0.025744902532290934, + "acc_norm": 0.8059071729957806, + "acc_norm_stderr": 0.025744902532290934 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.6636771300448431, + "acc_stderr": 0.03170882426845501, + "acc_norm": 0.6636771300448431, + "acc_norm_stderr": 0.03170882426845501 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.6564885496183206, + "acc_stderr": 0.041649760719448786, + "acc_norm": 0.6564885496183206, + "acc_norm_stderr": 0.041649760719448786 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.8099173553719008, + "acc_stderr": 0.03581796951709282, + "acc_norm": 0.8099173553719008, + "acc_norm_stderr": 0.03581796951709282 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.7870370370370371, + "acc_stderr": 0.03957835471980981, + "acc_norm": 0.7870370370370371, + "acc_norm_stderr": 0.03957835471980981 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.7239263803680982, + "acc_stderr": 0.03512385283705046, + "acc_norm": 0.7239263803680982, + "acc_norm_stderr": 0.03512385283705046 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.49107142857142855, + "acc_stderr": 0.04745033255489123, + "acc_norm": 0.49107142857142855, + "acc_norm_stderr": 0.04745033255489123 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.7766990291262136, + "acc_stderr": 0.04123553189891431, + "acc_norm": 
0.7766990291262136, + "acc_norm_stderr": 0.04123553189891431 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.8589743589743589, + "acc_stderr": 0.022801382534597528, + "acc_norm": 0.8589743589743589, + "acc_norm_stderr": 0.022801382534597528 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.72, + "acc_stderr": 0.045126085985421276, + "acc_norm": 0.72, + "acc_norm_stderr": 0.045126085985421276 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.7854406130268199, + "acc_stderr": 0.014680033956893346, + "acc_norm": 0.7854406130268199, + "acc_norm_stderr": 0.014680033956893346 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.6994219653179191, + "acc_stderr": 0.0246853168672578, + "acc_norm": 0.6994219653179191, + "acc_norm_stderr": 0.0246853168672578 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.3340782122905028, + "acc_stderr": 0.015774911422381636, + "acc_norm": 0.3340782122905028, + "acc_norm_stderr": 0.015774911422381636 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.696078431372549, + "acc_stderr": 0.026336613469046633, + "acc_norm": 0.696078431372549, + "acc_norm_stderr": 0.026336613469046633 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.6495176848874598, + "acc_stderr": 0.027098652621301754, + "acc_norm": 0.6495176848874598, + "acc_norm_stderr": 0.027098652621301754 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.7006172839506173, + "acc_stderr": 0.02548311560119545, + "acc_norm": 0.7006172839506173, + "acc_norm_stderr": 0.02548311560119545 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.4574468085106383, + "acc_stderr": 0.029719281272236844, + "acc_norm": 0.4574468085106383, + "acc_norm_stderr": 0.029719281272236844 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.4706649282920469, + "acc_stderr": 0.01274823839736555, + "acc_norm": 0.4706649282920469, + "acc_norm_stderr": 0.01274823839736555 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.5955882352941176, + "acc_stderr": 0.02981263070156974, + "acc_norm": 0.5955882352941176, + "acc_norm_stderr": 0.02981263070156974 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.6339869281045751, + "acc_stderr": 0.01948802574552967, + "acc_norm": 0.6339869281045751, + "acc_norm_stderr": 0.01948802574552967 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.6545454545454545, + "acc_stderr": 0.04554619617541054, + "acc_norm": 0.6545454545454545, + "acc_norm_stderr": 0.04554619617541054 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.7142857142857143, + "acc_stderr": 0.028920583220675606, + "acc_norm": 0.7142857142857143, + "acc_norm_stderr": 0.028920583220675606 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.8159203980099502, + "acc_stderr": 0.027403859410786834, + "acc_norm": 0.8159203980099502, + "acc_norm_stderr": 0.027403859410786834 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.9, + "acc_stderr": 0.030151134457776334, + "acc_norm": 0.9, + "acc_norm_stderr": 0.030151134457776334 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.5120481927710844, + "acc_stderr": 0.03891364495835816, + "acc_norm": 0.5120481927710844, + "acc_norm_stderr": 0.03891364495835816 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.8011695906432749, + "acc_stderr": 0.030611116557432528, + "acc_norm": 0.8011695906432749, + "acc_norm_stderr": 0.030611116557432528 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.3659730722154223, + 
"mc1_stderr": 0.01686294168408837, + "mc2": 0.526105452742239, + "mc2_stderr": 0.015063646699857039 + }, + "all": { + "acc": 0.6174585708390606, + "acc_stderr": 0.03320891731594165, + "acc_norm": 0.6217441233205704, + "acc_norm_stderr": 0.03318752493002525, + "mc1": 0.3659730722154223, + "mc1_stderr": 0.01686294168408837, + "mc2": 0.526105452742239, + "mc2_stderr": 0.015063646699857039 + } + }, + "versions": { + "harness|arc:challenge|25": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "all": 0 + }, + "config_tasks": { + "harness|arc:challenge": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness 
task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + 
"harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task" + }, + "summary_tasks": { + "harness|arc:challenge|25": { + "hashes": { + "hash_examples": "17b0cae357c0259e", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "ab6d1c3d70fc1049", + "hash_cont_tokens": "3b5ae8a16b9ca33e" + }, + "truncated": 0, + "non-truncated": 4687, + "padded": 4687, + "non-padded": 0, + "effective_few_shots": 25.0, + "num_truncated_few_shots": 0 + }, + "harness|hellaswag|10": { + "hashes": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "4adeb76b031573a1", + "hash_cont_tokens": "e07ac1a581cd0c0c" + }, + "truncated": 0, + "non-truncated": 40168, + "padded": 40137, + "non-padded": 31, + "effective_few_shots": 10.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hashes": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "6481c2fba7190478", + "hash_cont_tokens": "e58c016de340de83" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-anatomy|5": { + "hashes": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "e07c9163ac5081a3", + "hash_cont_tokens": "3448d00acc7a11c6" + }, + "truncated": 0, + "non-truncated": 540, + "padded": 540, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-astronomy|5": { + "hashes": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "539ef21f133e917c", + "hash_cont_tokens": "9b5285416fa903e2" + }, + "truncated": 0, + "non-truncated": 608, + "padded": 608, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-business_ethics|5": { + "hashes": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "bd43555e5256f484", + "hash_cont_tokens": "e58c016de340de83" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hashes": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "15663115b477062a", + "hash_cont_tokens": "0382995cfcc24e3e" + }, + "truncated": 0, + "non-truncated": 1060, + "padded": 1060, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_biology|5": { + "hashes": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "81e10086d9a67a89", + "hash_cont_tokens": "09ef20d27e0286fe" + }, + "truncated": 0, + "non-truncated": 576, + "padded": 572, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_chemistry|5": { + "hashes": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "822ee9d85dd90211", + "hash_cont_tokens": "e58c016de340de83" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_computer_science|5": { + "hashes": { 
+ "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "30010413b5397d2a", + "hash_cont_tokens": "e58c016de340de83" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_mathematics|5": { + "hashes": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "29b48c03b0b8f848", + "hash_cont_tokens": "e58c016de340de83" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_medicine|5": { + "hashes": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "02ae47c6ad730c28", + "hash_cont_tokens": "2115091b39764e96" + }, + "truncated": 0, + "non-truncated": 692, + "padded": 680, + "non-padded": 12, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_physics|5": { + "hashes": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "ba5fa0250e3d6c30", + "hash_cont_tokens": "253e8f65a34d2f2b" + }, + "truncated": 0, + "non-truncated": 408, + "padded": 404, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-computer_security|5": { + "hashes": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "9505189fb7c44b4e", + "hash_cont_tokens": "e58c016de340de83" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hashes": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "53c3c332289e7b04", + "hash_cont_tokens": "863770146d3e3341" + }, + "truncated": 0, + "non-truncated": 940, + "padded": 940, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-econometrics|5": { + "hashes": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "f92b13ed95e3c111", + "hash_cont_tokens": "547784fe0135a15c" + }, + "truncated": 0, + "non-truncated": 456, + "padded": 456, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hashes": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "e71d172f814a80c2", + "hash_cont_tokens": "545e7978a9a2e921" + }, + "truncated": 0, + "non-truncated": 580, + "padded": 576, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hashes": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "5e819b51881bb2f0", + "hash_cont_tokens": "6220dafecd3e71a1" + }, + "truncated": 0, + "non-truncated": 1512, + "padded": 1512, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-formal_logic|5": { + "hashes": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "f12da3535e4dc175", + "hash_cont_tokens": 
"9e1c83b748056f05" + }, + "truncated": 0, + "non-truncated": 504, + "padded": 504, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-global_facts|5": { + "hashes": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "bfc70762fbc3c00a", + "hash_cont_tokens": "e58c016de340de83" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_biology|5": { + "hashes": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "1cd3d68c399c9798", + "hash_cont_tokens": "c0ee938431d4cce1" + }, + "truncated": 0, + "non-truncated": 1240, + "padded": 1240, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hashes": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "4bf07860f9a05ea0", + "hash_cont_tokens": "2fd86b22bfa1c8cb" + }, + "truncated": 0, + "non-truncated": 812, + "padded": 812, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hashes": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "f18950fe2d6b0852", + "hash_cont_tokens": "e58c016de340de83" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hashes": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "ff596bb41009df48", + "hash_cont_tokens": "8d52dfdbe7373dec" + }, + "truncated": 660, + "non-truncated": 0, + "padded": 0, + "non-padded": 660, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_geography|5": { + "hashes": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "94857240d8fb30ec", + "hash_cont_tokens": "7daa2bbedae272e1" + }, + "truncated": 0, + "non-truncated": 792, + "padded": 792, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hashes": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "04cd00103bdeb0e8", + "hash_cont_tokens": "530e7985f90589ad" + }, + "truncated": 0, + "non-truncated": 772, + "padded": 772, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hashes": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "be211f365573815f", + "hash_cont_tokens": "8abfdac40b0aa157" + }, + "truncated": 0, + "non-truncated": 1560, + "padded": 1560, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hashes": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "61b56ac65f52bd9f", + "hash_cont_tokens": "0450a3d8e715e926" + }, + "truncated": 0, + "non-truncated": 1080, + "padded": 1080, + 
"non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hashes": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "2d068a1f6fa6447c", + "hash_cont_tokens": "3e477b8a15ec619c" + }, + "truncated": 0, + "non-truncated": 952, + "padded": 952, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_physics|5": { + "hashes": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "751c9d3d45725100", + "hash_cont_tokens": "f0648b1ae17e3c3f" + }, + "truncated": 0, + "non-truncated": 604, + "padded": 604, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hashes": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "e67193bef752e0a2", + "hash_cont_tokens": "71a621b85c8384ec" + }, + "truncated": 0, + "non-truncated": 2180, + "padded": 2179, + "non-padded": 1, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hashes": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "a2a440e523363a80", + "hash_cont_tokens": "507dec89f16c35ea" + }, + "truncated": 0, + "non-truncated": 864, + "padded": 864, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hashes": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "9ccd9b23a7e8b60b", + "hash_cont_tokens": "fe66e65deac902bb" + }, + "truncated": 816, + "non-truncated": 0, + "padded": 0, + "non-padded": 816, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hashes": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "a39dd570f7b916b5", + "hash_cont_tokens": "7fe519011d639dc8" + }, + "truncated": 0, + "non-truncated": 948, + "padded": 948, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_aging|5": { + "hashes": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "7f5bf8be5e049ee7", + "hash_cont_tokens": "77ba99656e04ddd0" + }, + "truncated": 0, + "non-truncated": 892, + "padded": 892, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_sexuality|5": { + "hashes": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "54bd52aca1127ab7", + "hash_cont_tokens": "bc8f34ada52ca31e" + }, + "truncated": 0, + "non-truncated": 524, + "padded": 524, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-international_law|5": { + "hashes": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "3f43f1b61ea3ce0a", + "hash_cont_tokens": "d4b66c0f10b911b8" + }, + "truncated": 0, + "non-truncated": 484, + "padded": 484, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + 
"harness|hendrycksTest-jurisprudence|5": { + "hashes": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "44ed1d9fa69f8186", + "hash_cont_tokens": "f7ea9e092aff54a4" + }, + "truncated": 0, + "non-truncated": 432, + "padded": 432, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hashes": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "bfab6f0ecaac1ebf", + "hash_cont_tokens": "9e305ec3d994de5c" + }, + "truncated": 0, + "non-truncated": 652, + "padded": 648, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-machine_learning|5": { + "hashes": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "5f80df97c3c4cc0c", + "hash_cont_tokens": "85f6ff4f34ded537" + }, + "truncated": 0, + "non-truncated": 448, + "padded": 448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-management|5": { + "hashes": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "ee080212a243ddeb", + "hash_cont_tokens": "1f24f5bf907f5f28" + }, + "truncated": 0, + "non-truncated": 412, + "padded": 412, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-marketing|5": { + "hashes": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "3a52fb801992c276", + "hash_cont_tokens": "37062ffd1e129b49" + }, + "truncated": 0, + "non-truncated": 936, + "padded": 936, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-medical_genetics|5": { + "hashes": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "a619808a23365e33", + "hash_cont_tokens": "e58c016de340de83" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-miscellaneous|5": { + "hashes": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "5e4a6c9f25859265", + "hash_cont_tokens": "64725e71e0bff006" + }, + "truncated": 0, + "non-truncated": 3132, + "padded": 3132, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_disputes|5": { + "hashes": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "7aaba52ab7fcd1b5", + "hash_cont_tokens": "d73b7e792a1de62d" + }, + "truncated": 0, + "non-truncated": 1384, + "padded": 1332, + "non-padded": 52, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hashes": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "5fab9a0922852979", + "hash_cont_tokens": "291bc548e95ea24c" + }, + "truncated": 0, + "non-truncated": 3580, + "padded": 3580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-nutrition|5": { + "hashes": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": 
"6ee5db1fa41a7d98", + "hash_cont_tokens": "4159368fbefa62ba" + }, + "truncated": 0, + "non-truncated": 1224, + "padded": 1224, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-philosophy|5": { + "hashes": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "020fc3b1cbcbd5d2", + "hash_cont_tokens": "b3758c79335b5e25" + }, + "truncated": 0, + "non-truncated": 1244, + "padded": 1240, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-prehistory|5": { + "hashes": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "75f96699a1420f83", + "hash_cont_tokens": "c7aff90b52b3c210" + }, + "truncated": 0, + "non-truncated": 1296, + "padded": 1296, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_accounting|5": { + "hashes": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "17a1e44b80641b10", + "hash_cont_tokens": "8fd4fe19db20b33f" + }, + "truncated": 0, + "non-truncated": 1128, + "padded": 1128, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_law|5": { + "hashes": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "1ef195957d6ce2d8", + "hash_cont_tokens": "70fdfc3a3cdab2b2" + }, + "truncated": 24, + "non-truncated": 6112, + "padded": 6112, + "non-padded": 24, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_medicine|5": { + "hashes": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "daf8d8353ee75752", + "hash_cont_tokens": "2662c15f3eee1572" + }, + "truncated": 0, + "non-truncated": 1088, + "padded": 1088, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_psychology|5": { + "hashes": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "dc25cc58ce67d22c", + "hash_cont_tokens": "7b998c3f691a5888" + }, + "truncated": 0, + "non-truncated": 2448, + "padded": 2448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-public_relations|5": { + "hashes": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "a6acb474e3c26b82", + "hash_cont_tokens": "9884d7f2589a4eec" + }, + "truncated": 0, + "non-truncated": 440, + "padded": 440, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-security_studies|5": { + "hashes": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "2a8612c9136b36ea", + "hash_cont_tokens": "87576f25f4731ef0" + }, + "truncated": 0, + "non-truncated": 980, + "padded": 980, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-sociology|5": { + "hashes": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "8eb4cf86c2ce8485", + "hash_cont_tokens": "bba9af89c33fad2f" + }, + "truncated": 0, + "non-truncated": 804, + "padded": 800, + "non-padded": 4, + 
"effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hashes": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "b76e3c3b93b3f8bf", + "hash_cont_tokens": "e58c016de340de83" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-virology|5": { + "hashes": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "232b5d2c3d25603b", + "hash_cont_tokens": "16a5fb37a6047671" + }, + "truncated": 0, + "non-truncated": 664, + "padded": 664, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-world_religions|5": { + "hashes": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "c9c5e1d965ab2da3", + "hash_cont_tokens": "65fd69dde784be8d" + }, + "truncated": 0, + "non-truncated": 684, + "padded": 684, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|truthfulqa:mc|0": { + "hashes": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "6d63a24c235684df", + "hash_cont_tokens": "c2881c2ce51fc82e" + }, + "truncated": 0, + "non-truncated": 9996, + "padded": 9996, + "non-padded": 0, + "effective_few_shots": 0.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "d84d18e9a963753d", + "hash_full_prompts": "12b540783521a8e6", + "hash_input_tokens": "5748e7db7a468b88", + "hash_cont_tokens": "d1899133b79ddfc3" + }, + "total_evaluation_time_secondes": "6371.967739105225", + "truncated": 1500, + "non-truncated": 109519, + "padded": 109399, + "non-padded": 1620, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/internlm/internlm-20b/results_2023-11-08T17-07-38.003322.json b/eval-results/internlm/internlm-20b/results_2023-11-08T17-07-38.003322.json new file mode 100644 index 0000000000000000000000000000000000000000..ccffaa357f5cf3f95c47477b449c505499fa270c --- /dev/null +++ b/eval-results/internlm/internlm-20b/results_2023-11-08T17-07-38.003322.json @@ -0,0 +1,107 @@ +{ + "config_general": { + "lighteval_sha": "167773f1d5d1647c60dadc31c9e731ab7dbcbbad", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "", + "model_name": "internlm/internlm-20b", + "model_sha": "2d83118d863d24565da1f9c6c0fe99d3e882f25c", + "model_dtype": "torch.float16", + "model_size": "37.65 GB" + }, + "results": { + "harness|drop|3": { + "em": 0.446623322147651, + "em_stderr": 0.005091207245611711, + "f1": 0.4853261325503364, + "f1_stderr": 0.004942779780816972 + }, + "harness|gsm8k|5": { + "acc": 0.2350265352539803, + "acc_stderr": 0.011679491349994874 + }, + "harness|winogrande|5": { + "acc": 0.7671665351223362, + "acc_stderr": 0.011878201073856542 + }, + "all": { + "em": 0.446623322147651, + "em_stderr": 0.005091207245611711, + "f1": 0.4853261325503364, + "f1_stderr": 0.004942779780816972, + "acc": 0.5010965351881582, + "acc_stderr": 0.011778846211925709 + } + }, + "versions": { + "all": 0, + "harness|drop|3": 1, + "harness|gsm8k|5": 0, + "harness|winogrande|5": 0 + }, + "config_tasks": { + "harness|drop": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|winogrande": "LM Harness 
task" + }, + "summary_tasks": { + "harness|drop|3": { + "hashes": { + "hash_examples": "1d27416e8324e9a3", + "hash_full_prompts": "a5513ff9a741b385", + "hash_input_tokens": "c8087562d3d7677b", + "hash_cont_tokens": "b487569d81182210" + }, + "truncated": 0, + "non_truncated": 9536, + "padded": 0, + "non_padded": 9536, + "effective_few_shots": 3.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "50087df0eb7e53b4", + "hash_cont_tokens": "37ad23e0d73bbf84" + }, + "truncated": 0, + "non_truncated": 1319, + "padded": 0, + "non_padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "df07f13dc1768541", + "hash_cont_tokens": "d6a6dcb7ee4ab350" + }, + "truncated": 0, + "non_truncated": 1267, + "padded": 2375, + "non_padded": 159, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "9b4d8993161e637d", + "hash_full_prompts": "08215e527b7e60a5", + "hash_input_tokens": "c9cc2ba13c63638d", + "hash_cont_tokens": "6b4bb04d1b648c44" + }, + "truncated": 0, + "non_truncated": 12122, + "padded": 2375, + "non_padded": 11014, + "num_truncated_few_shots": 0, + "total_evaluation_time_secondes": 0 + } +} \ No newline at end of file diff --git a/eval-results/itsliupeng/llama2_7b_code/results_2023-10-08T20-46-27.226805.json b/eval-results/itsliupeng/llama2_7b_code/results_2023-10-08T20-46-27.226805.json new file mode 100644 index 0000000000000000000000000000000000000000..f86fbdc4e5e961cdd7ab3b05510b61f450edee54 --- /dev/null +++ b/eval-results/itsliupeng/llama2_7b_code/results_2023-10-08T20-46-27.226805.json @@ -0,0 +1,1367 @@ +{ + "config_general": { + "model_name": "itsliupeng/llama2_7b_code", + "model_sha": "0e6d1edd87c8753b55d280179c8fb0e65ebf5fa2", + "model_size": "12.61 GB", + "model_dtype": "torch.bfloat16", + "lighteval_sha": "0f318ecf002208468154899217b3ba7c6ae09374", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|arc:challenge|25": { + "acc": 0.4872013651877133, + "acc_stderr": 0.014606603181012538, + "acc_norm": 0.5213310580204779, + "acc_norm_stderr": 0.014598087973127108 + }, + "harness|hellaswag|10": { + "acc": 0.5621390161322446, + "acc_stderr": 0.00495109780277595, + "acc_norm": 0.7571200955984864, + "acc_norm_stderr": 0.0042794671285607475 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.27, + "acc_stderr": 0.044619604333847415, + "acc_norm": 0.27, + "acc_norm_stderr": 0.044619604333847415 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.4888888888888889, + "acc_stderr": 0.04318275491977976, + "acc_norm": 0.4888888888888889, + "acc_norm_stderr": 0.04318275491977976 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.4605263157894737, + "acc_stderr": 0.04056242252249034, + "acc_norm": 0.4605263157894737, + "acc_norm_stderr": 0.04056242252249034 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.37, + "acc_stderr": 0.04852365870939099, + "acc_norm": 0.37, + "acc_norm_stderr": 0.04852365870939099 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.4679245283018868, + "acc_stderr": 0.030709486992556545, + "acc_norm": 0.4679245283018868, + "acc_norm_stderr": 0.030709486992556545 + 
}, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.4375, + "acc_stderr": 0.04148415739394154, + "acc_norm": 0.4375, + "acc_norm_stderr": 0.04148415739394154 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.42, + "acc_stderr": 0.049604496374885836, + "acc_norm": 0.42, + "acc_norm_stderr": 0.049604496374885836 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.42, + "acc_stderr": 0.049604496374885836, + "acc_norm": 0.42, + "acc_norm_stderr": 0.049604496374885836 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.38, + "acc_stderr": 0.048783173121456316, + "acc_norm": 0.38, + "acc_norm_stderr": 0.048783173121456316 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.42196531791907516, + "acc_stderr": 0.0376574669386515, + "acc_norm": 0.42196531791907516, + "acc_norm_stderr": 0.0376574669386515 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.27450980392156865, + "acc_stderr": 0.044405219061793275, + "acc_norm": 0.27450980392156865, + "acc_norm_stderr": 0.044405219061793275 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.56, + "acc_stderr": 0.04988876515698589, + "acc_norm": 0.56, + "acc_norm_stderr": 0.04988876515698589 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.4340425531914894, + "acc_stderr": 0.032400380867927465, + "acc_norm": 0.4340425531914894, + "acc_norm_stderr": 0.032400380867927465 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.32456140350877194, + "acc_stderr": 0.04404556157374767, + "acc_norm": 0.32456140350877194, + "acc_norm_stderr": 0.04404556157374767 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.503448275862069, + "acc_stderr": 0.04166567577101579, + "acc_norm": 0.503448275862069, + "acc_norm_stderr": 0.04166567577101579 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.31216931216931215, + "acc_stderr": 0.023865206836972595, + "acc_norm": 0.31216931216931215, + "acc_norm_stderr": 0.023865206836972595 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.30158730158730157, + "acc_stderr": 0.04104947269903394, + "acc_norm": 0.30158730158730157, + "acc_norm_stderr": 0.04104947269903394 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.32, + "acc_stderr": 0.046882617226215034, + "acc_norm": 0.32, + "acc_norm_stderr": 0.046882617226215034 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.5193548387096775, + "acc_stderr": 0.02842268740431211, + "acc_norm": 0.5193548387096775, + "acc_norm_stderr": 0.02842268740431211 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.30049261083743845, + "acc_stderr": 0.03225799476233484, + "acc_norm": 0.30049261083743845, + "acc_norm_stderr": 0.03225799476233484 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.53, + "acc_stderr": 0.05016135580465919, + "acc_norm": 0.53, + "acc_norm_stderr": 0.05016135580465919 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.6303030303030303, + "acc_stderr": 0.03769430314512567, + "acc_norm": 0.6303030303030303, + "acc_norm_stderr": 0.03769430314512567 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.5808080808080808, + "acc_stderr": 0.03515520728670417, + "acc_norm": 0.5808080808080808, + "acc_norm_stderr": 0.03515520728670417 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.6528497409326425, + "acc_stderr": 0.03435696168361355, + "acc_norm": 0.6528497409326425, + 
"acc_norm_stderr": 0.03435696168361355 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.41794871794871796, + "acc_stderr": 0.02500732988246122, + "acc_norm": 0.41794871794871796, + "acc_norm_stderr": 0.02500732988246122 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.3111111111111111, + "acc_stderr": 0.02822644674968352, + "acc_norm": 0.3111111111111111, + "acc_norm_stderr": 0.02822644674968352 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.42016806722689076, + "acc_stderr": 0.03206183783236152, + "acc_norm": 0.42016806722689076, + "acc_norm_stderr": 0.03206183783236152 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.3841059602649007, + "acc_stderr": 0.03971301814719198, + "acc_norm": 0.3841059602649007, + "acc_norm_stderr": 0.03971301814719198 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.636697247706422, + "acc_stderr": 0.020620603919625804, + "acc_norm": 0.636697247706422, + "acc_norm_stderr": 0.020620603919625804 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.30092592592592593, + "acc_stderr": 0.03128039084329882, + "acc_norm": 0.30092592592592593, + "acc_norm_stderr": 0.03128039084329882 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.5882352941176471, + "acc_stderr": 0.03454236585380608, + "acc_norm": 0.5882352941176471, + "acc_norm_stderr": 0.03454236585380608 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.6708860759493671, + "acc_stderr": 0.030587326294702365, + "acc_norm": 0.6708860759493671, + "acc_norm_stderr": 0.030587326294702365 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.5112107623318386, + "acc_stderr": 0.033549366530984746, + "acc_norm": 0.5112107623318386, + "acc_norm_stderr": 0.033549366530984746 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.5877862595419847, + "acc_stderr": 0.04317171194870254, + "acc_norm": 0.5877862595419847, + "acc_norm_stderr": 0.04317171194870254 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.6115702479338843, + "acc_stderr": 0.04449270350068382, + "acc_norm": 0.6115702479338843, + "acc_norm_stderr": 0.04449270350068382 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.5370370370370371, + "acc_stderr": 0.04820403072760627, + "acc_norm": 0.5370370370370371, + "acc_norm_stderr": 0.04820403072760627 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.5276073619631901, + "acc_stderr": 0.0392237829061099, + "acc_norm": 0.5276073619631901, + "acc_norm_stderr": 0.0392237829061099 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.39285714285714285, + "acc_stderr": 0.04635550135609976, + "acc_norm": 0.39285714285714285, + "acc_norm_stderr": 0.04635550135609976 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.6116504854368932, + "acc_stderr": 0.048257293373563895, + "acc_norm": 0.6116504854368932, + "acc_norm_stderr": 0.048257293373563895 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.7051282051282052, + "acc_stderr": 0.029872577708891183, + "acc_norm": 0.7051282051282052, + "acc_norm_stderr": 0.029872577708891183 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.56, + "acc_stderr": 0.04988876515698589, + "acc_norm": 0.56, + "acc_norm_stderr": 0.04988876515698589 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.6309067688378033, + "acc_stderr": 0.017256283109124616, + "acc_norm": 0.6309067688378033, + "acc_norm_stderr": 0.017256283109124616 + 
}, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.5491329479768786, + "acc_stderr": 0.02678881193156275, + "acc_norm": 0.5491329479768786, + "acc_norm_stderr": 0.02678881193156275 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.31843575418994413, + "acc_stderr": 0.015581008080360276, + "acc_norm": 0.31843575418994413, + "acc_norm_stderr": 0.015581008080360276 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.5555555555555556, + "acc_stderr": 0.02845263998508801, + "acc_norm": 0.5555555555555556, + "acc_norm_stderr": 0.02845263998508801 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.5916398713826366, + "acc_stderr": 0.02791705074848462, + "acc_norm": 0.5916398713826366, + "acc_norm_stderr": 0.02791705074848462 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.5216049382716049, + "acc_stderr": 0.02779476010500873, + "acc_norm": 0.5216049382716049, + "acc_norm_stderr": 0.02779476010500873 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.3475177304964539, + "acc_stderr": 0.02840662780959095, + "acc_norm": 0.3475177304964539, + "acc_norm_stderr": 0.02840662780959095 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.34810951760104303, + "acc_stderr": 0.012166738993698203, + "acc_norm": 0.34810951760104303, + "acc_norm_stderr": 0.012166738993698203 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.5036764705882353, + "acc_stderr": 0.030372015885428195, + "acc_norm": 0.5036764705882353, + "acc_norm_stderr": 0.030372015885428195 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.42483660130718953, + "acc_stderr": 0.01999797303545833, + "acc_norm": 0.42483660130718953, + "acc_norm_stderr": 0.01999797303545833 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.5181818181818182, + "acc_stderr": 0.04785964010794916, + "acc_norm": 0.5181818181818182, + "acc_norm_stderr": 0.04785964010794916 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.49387755102040815, + "acc_stderr": 0.03200682020163908, + "acc_norm": 0.49387755102040815, + "acc_norm_stderr": 0.03200682020163908 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.6417910447761194, + "acc_stderr": 0.03390393042268814, + "acc_norm": 0.6417910447761194, + "acc_norm_stderr": 0.03390393042268814 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.7, + "acc_stderr": 0.046056618647183814, + "acc_norm": 0.7, + "acc_norm_stderr": 0.046056618647183814 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.4036144578313253, + "acc_stderr": 0.038194861407583984, + "acc_norm": 0.4036144578313253, + "acc_norm_stderr": 0.038194861407583984 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.6842105263157895, + "acc_stderr": 0.03565079670708312, + "acc_norm": 0.6842105263157895, + "acc_norm_stderr": 0.03565079670708312 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.2521419828641371, + "mc1_stderr": 0.01520152224629997, + "mc2": 0.3875988110597644, + "mc2_stderr": 0.014080197843816668 + }, + "all": { + "acc": 0.48200491255702765, + "acc_stderr": 0.03542377047214926, + "acc_norm": 0.48588814598582436, + "acc_norm_stderr": 0.03541224257584247, + "mc1": 0.2521419828641371, + "mc1_stderr": 0.01520152224629997, + "mc2": 0.3875988110597644, + "mc2_stderr": 0.014080197843816668 + } + }, + "versions": { + "harness|arc:challenge|25": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 
1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "all": 0 + }, + "config_tasks": { + "harness|arc:challenge": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + 
"harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task" + }, + "summary_tasks": { + "harness|arc:challenge|25": { + "hashes": { + "hash_examples": "17b0cae357c0259e", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "a3dd90b7fa78c46f", + "hash_cont_tokens": "e8abf848493b50f7" + }, + "truncated": 0, + "non-truncated": 4687, + "padded": 4687, + "non-padded": 0, + "effective_few_shots": 25.0, + "num_truncated_few_shots": 0 + }, + "harness|hellaswag|10": { + "hashes": { + "hash_examples": "e1768ecb99d7ecf0", + 
"hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "c1668546a52ba7ee", + "hash_cont_tokens": "9fe0a5c42e1532db" + }, + "truncated": 0, + "non-truncated": 40168, + "padded": 40113, + "non-padded": 55, + "effective_few_shots": 10.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hashes": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "aba8d0543ba39185", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-anatomy|5": { + "hashes": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "94447c0f9df8a2f2", + "hash_cont_tokens": "f11971a765cb609f" + }, + "truncated": 0, + "non-truncated": 540, + "padded": 540, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-astronomy|5": { + "hashes": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "86fcff08a2687aef", + "hash_cont_tokens": "440a970fadecdc7b" + }, + "truncated": 0, + "non-truncated": 608, + "padded": 608, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-business_ethics|5": { + "hashes": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "d8ae3328b34fcbe9", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hashes": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "2b4300f9c5301d87", + "hash_cont_tokens": "7ecd60c25b9bfe5b" + }, + "truncated": 0, + "non-truncated": 1060, + "padded": 1060, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_biology|5": { + "hashes": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "d8c3c755b7d10c36", + "hash_cont_tokens": "875cde3af7a0ee14" + }, + "truncated": 0, + "non-truncated": 576, + "padded": 576, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_chemistry|5": { + "hashes": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "b923fc66124f3a53", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_computer_science|5": { + "hashes": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "ab78cf0511ea8f9f", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_mathematics|5": { + "hashes": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "d4b0037b2649a3ea", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + 
"non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_medicine|5": { + "hashes": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "a0fad6bb4fe6f18b", + "hash_cont_tokens": "702fb6d82ff0d6ac" + }, + "truncated": 0, + "non-truncated": 692, + "padded": 692, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_physics|5": { + "hashes": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "85d996d3692166d3", + "hash_cont_tokens": "f7b8097afc16a47c" + }, + "truncated": 0, + "non-truncated": 408, + "padded": 408, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-computer_security|5": { + "hashes": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "84be3f4596b5b001", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hashes": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "f1854cd8319271d0", + "hash_cont_tokens": "aa0e8bc655f2f641" + }, + "truncated": 0, + "non-truncated": 940, + "padded": 940, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-econometrics|5": { + "hashes": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "113f3f725bce334d", + "hash_cont_tokens": "b1cc6e7e9fcd3827" + }, + "truncated": 0, + "non-truncated": 456, + "padded": 456, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hashes": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "5fe907a266e06f2d", + "hash_cont_tokens": "2425a3f084a591ef" + }, + "truncated": 0, + "non-truncated": 580, + "padded": 580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hashes": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "a1a11666961da140", + "hash_cont_tokens": "bd87bf0c060fd925" + }, + "truncated": 0, + "non-truncated": 1512, + "padded": 1512, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-formal_logic|5": { + "hashes": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "2f1eb352e149b62e", + "hash_cont_tokens": "eb8932890e0605db" + }, + "truncated": 0, + "non-truncated": 504, + "padded": 504, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-global_facts|5": { + "hashes": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "3042e08feedf5aef", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + 
"harness|hendrycksTest-high_school_biology|5": { + "hashes": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "8045260dd79ca91b", + "hash_cont_tokens": "1ddcb86d28cde266" + }, + "truncated": 0, + "non-truncated": 1240, + "padded": 1240, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hashes": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "59fbfffd8c47bcfe", + "hash_cont_tokens": "176c8dcff38c5f8f" + }, + "truncated": 0, + "non-truncated": 812, + "padded": 812, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hashes": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "bfb30129ff73368d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hashes": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "b5538d7bea55a393", + "hash_cont_tokens": "674fc454bdc5ac93" + }, + "truncated": 0, + "non-truncated": 660, + "padded": 656, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_geography|5": { + "hashes": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "2e6f2ab11754f52d", + "hash_cont_tokens": "03a5012b916274ea" + }, + "truncated": 0, + "non-truncated": 792, + "padded": 792, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hashes": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "18f4d06508cdc25a", + "hash_cont_tokens": "873d2aab226ba1d8" + }, + "truncated": 0, + "non-truncated": 772, + "padded": 772, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hashes": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "a8c56eaff54bcd8f", + "hash_cont_tokens": "c583432ad27fcfe0" + }, + "truncated": 0, + "non-truncated": 1560, + "padded": 1560, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hashes": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "dfd920166bafbeab", + "hash_cont_tokens": "d7907b61bcb8c123" + }, + "truncated": 0, + "non-truncated": 1080, + "padded": 1080, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hashes": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "83941ae6969f09da", + "hash_cont_tokens": "f47f041de50333b9" + }, + "truncated": 0, + "non-truncated": 952, + "padded": 952, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_physics|5": { + "hashes": { + 
"hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "83035f00ce4d3f71", + "hash_cont_tokens": "0d56317b3e5eedb5" + }, + "truncated": 0, + "non-truncated": 604, + "padded": 604, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hashes": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "c0b4f291c85e3e0c", + "hash_cont_tokens": "09ba1243e7390c0f" + }, + "truncated": 0, + "non-truncated": 2180, + "padded": 2180, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hashes": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "50f48b0a08d05109", + "hash_cont_tokens": "9cc29889c3d3f77d" + }, + "truncated": 0, + "non-truncated": 864, + "padded": 864, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hashes": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "64a6e2e2c0d4f2dc", + "hash_cont_tokens": "cdd0b3dc06d933e5" + }, + "truncated": 0, + "non-truncated": 816, + "padded": 816, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hashes": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "fe2da14f3cf5fab6", + "hash_cont_tokens": "e02816433ff28daf" + }, + "truncated": 0, + "non-truncated": 948, + "padded": 948, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_aging|5": { + "hashes": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "9d05936c3bf80340", + "hash_cont_tokens": "142a4a8a1138a214" + }, + "truncated": 0, + "non-truncated": 892, + "padded": 892, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_sexuality|5": { + "hashes": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "5dcfb47bad3e54e3", + "hash_cont_tokens": "bc54813e809b796d" + }, + "truncated": 0, + "non-truncated": 524, + "padded": 524, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-international_law|5": { + "hashes": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "1d720d65d9a4b09e", + "hash_cont_tokens": "8ea8c5ff76a15bca" + }, + "truncated": 0, + "non-truncated": 484, + "padded": 484, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-jurisprudence|5": { + "hashes": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "a81edfe831ddb4d7", + "hash_cont_tokens": "e3a8cd951b6e3469" + }, + "truncated": 0, + "non-truncated": 432, + "padded": 432, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hashes": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "0a100ec9e60e9845", + 
"hash_cont_tokens": "3e9e0bdc248fd88a" + }, + "truncated": 0, + "non-truncated": 652, + "padded": 648, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-machine_learning|5": { + "hashes": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "f7de62a8844d30e5", + "hash_cont_tokens": "55b12fb138c6a064" + }, + "truncated": 0, + "non-truncated": 448, + "padded": 448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-management|5": { + "hashes": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "84909c12af4fb9cb", + "hash_cont_tokens": "a01d6d39a83c4597" + }, + "truncated": 0, + "non-truncated": 412, + "padded": 412, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-marketing|5": { + "hashes": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "e15625e1426fb6a6", + "hash_cont_tokens": "6aeaed4d823c98aa" + }, + "truncated": 0, + "non-truncated": 936, + "padded": 936, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-medical_genetics|5": { + "hashes": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "d8ef4a775d5af214", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-miscellaneous|5": { + "hashes": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "bdd7d35001579262", + "hash_cont_tokens": "9b0ab02a64603081" + }, + "truncated": 0, + "non-truncated": 3132, + "padded": 3132, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_disputes|5": { + "hashes": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "e6e4cbb039c9987a", + "hash_cont_tokens": "3b8bbe9108e55ce9" + }, + "truncated": 0, + "non-truncated": 1384, + "padded": 1354, + "non-padded": 30, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hashes": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "0bbd072dc36eccdf", + "hash_cont_tokens": "3e9bfc0362e97330" + }, + "truncated": 0, + "non-truncated": 3580, + "padded": 3580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-nutrition|5": { + "hashes": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "6d94c283eed90938", + "hash_cont_tokens": "23b2dc6ee2da4cfc" + }, + "truncated": 0, + "non-truncated": 1224, + "padded": 1224, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-philosophy|5": { + "hashes": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "3848642365489c57", + "hash_cont_tokens": "9f6ff69d23a48783" + }, + "truncated": 0, + "non-truncated": 1244, + "padded": 1244, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 
+ }, + "harness|hendrycksTest-prehistory|5": { + "hashes": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "6da2f6b69cef14a0", + "hash_cont_tokens": "d6458d743d875837" + }, + "truncated": 0, + "non-truncated": 1296, + "padded": 1296, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_accounting|5": { + "hashes": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "f038b6fd176c7aeb", + "hash_cont_tokens": "922a195f53a35662" + }, + "truncated": 0, + "non-truncated": 1128, + "padded": 1128, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_law|5": { + "hashes": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "e09cf8fe8428777e", + "hash_cont_tokens": "2e590029ef41fbcd" + }, + "truncated": 0, + "non-truncated": 6136, + "padded": 6136, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_medicine|5": { + "hashes": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "5e57d5e23aebd728", + "hash_cont_tokens": "7cfee54dbddd5a98" + }, + "truncated": 0, + "non-truncated": 1088, + "padded": 1088, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_psychology|5": { + "hashes": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "842ecf72c3bc61e5", + "hash_cont_tokens": "a86677b2a45c20e1" + }, + "truncated": 0, + "non-truncated": 2448, + "padded": 2448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-public_relations|5": { + "hashes": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "d2bae896a9be38dd", + "hash_cont_tokens": "0d756ccaae031757" + }, + "truncated": 0, + "non-truncated": 440, + "padded": 440, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-security_studies|5": { + "hashes": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "06fa00ff41daa5e1", + "hash_cont_tokens": "b2229bc2cfbf594b" + }, + "truncated": 0, + "non-truncated": 980, + "padded": 980, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-sociology|5": { + "hashes": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "06fe4b96d35f7c14", + "hash_cont_tokens": "c3a3bdfd177eed5b" + }, + "truncated": 0, + "non-truncated": 804, + "padded": 804, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hashes": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "51fefdfe0806e527", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-virology|5": { + "hashes": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + 
"hash_input_tokens": "506320b3b19190d9", + "hash_cont_tokens": "af8b3658088cb37f" + }, + "truncated": 0, + "non-truncated": 664, + "padded": 664, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-world_religions|5": { + "hashes": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "6cb71dd631bb845e", + "hash_cont_tokens": "060118bef6de4e0a" + }, + "truncated": 0, + "non-truncated": 684, + "padded": 684, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|truthfulqa:mc|0": { + "hashes": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "d76ec491098584f7", + "hash_cont_tokens": "f5da56a132aab151" + }, + "truncated": 0, + "non-truncated": 9996, + "padded": 9996, + "non-padded": 0, + "effective_few_shots": 0.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "d84d18e9a963753d", + "hash_full_prompts": "12b540783521a8e6", + "hash_input_tokens": "afff47ae9be41bdc", + "hash_cont_tokens": "71d56183130fecbd" + }, + "total_evaluation_time_secondes": "4201.1513686180115", + "truncated": 0, + "non-truncated": 111019, + "padded": 110926, + "non-padded": 93, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/itsliupeng/llama2_7b_code/results_2023-10-26T11-17-28.829100.json b/eval-results/itsliupeng/llama2_7b_code/results_2023-10-26T11-17-28.829100.json new file mode 100644 index 0000000000000000000000000000000000000000..a2a1289d5c3ed531be48f075f7901cb3eed11ec4 --- /dev/null +++ b/eval-results/itsliupeng/llama2_7b_code/results_2023-10-26T11-17-28.829100.json @@ -0,0 +1,107 @@ +{ + "config_general": { + "model_name": "itsliupeng/llama2_7b_code", + "model_sha": "0e6d1edd87c8753b55d280179c8fb0e65ebf5fa2", + "model_size": "12.61 GB", + "model_dtype": "torch.bfloat16", + "lighteval_sha": "0f318ecf002208468154899217b3ba7c6ae09374", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|drop|3": { + "em": 0.0009437919463087249, + "em_stderr": 0.00031446531194130476, + "f1": 0.05393036912751694, + "f1_stderr": 0.0012935627430820335 + }, + "harness|gsm8k|5": { + "acc": 0.08112206216830932, + "acc_stderr": 0.007520395797922653 + }, + "harness|winogrande|5": { + "acc": 0.7150749802683505, + "acc_stderr": 0.012685986125141227 + }, + "all": { + "em": 0.0009437919463087249, + "em_stderr": 0.00031446531194130476, + "f1": 0.05393036912751694, + "f1_stderr": 0.0012935627430820335, + "acc": 0.3980985212183299, + "acc_stderr": 0.01010319096153194 + } + }, + "versions": { + "harness|drop|3": 1, + "harness|gsm8k|5": 0, + "harness|winogrande|5": 0, + "all": 0 + }, + "config_tasks": { + "harness|drop": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|drop|3": { + "hashes": { + "hash_examples": "1d27416e8324e9a3", + "hash_full_prompts": "a5513ff9a741b385", + "hash_input_tokens": "642ac625f2ed4a8a", + "hash_cont_tokens": "7d3e0ad73f6a3001" + }, + "truncated": 3, + "non-truncated": 9533, + "padded": 0, + "non-padded": 9536, + "effective_few_shots": 3.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "7181cf25f7c30bc0", + 
"hash_cont_tokens": "6e0d5922000e4f12" + }, + "truncated": 0, + "non-truncated": 1319, + "padded": 0, + "non-padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "10915604fe04c545", + "hash_cont_tokens": "f08975ad6f2d5864" + }, + "truncated": 0, + "non-truncated": 2534, + "padded": 2432, + "non-padded": 102, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "9b4d8993161e637d", + "hash_full_prompts": "08215e527b7e60a5", + "hash_input_tokens": "0dfd32103d4321d0", + "hash_cont_tokens": "afc47a875f1f4575" + }, + "total_evaluation_time_secondes": "9598.109056949615", + "truncated": 3, + "non-truncated": 13386, + "padded": 2432, + "non-padded": 10957, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/itsliupeng/llama2_7b_mmlu/results_2023-10-10T15-25-23.413789.json b/eval-results/itsliupeng/llama2_7b_mmlu/results_2023-10-10T15-25-23.413789.json new file mode 100644 index 0000000000000000000000000000000000000000..0d0e23aa432ec869c4277173b3e9d4f6750c7428 --- /dev/null +++ b/eval-results/itsliupeng/llama2_7b_mmlu/results_2023-10-10T15-25-23.413789.json @@ -0,0 +1,1367 @@ +{ + "config_general": { + "model_name": "itsliupeng/llama2_7b_mmlu", + "model_sha": "553178f8d5d69eb1dfa5b9503d2ce0c1e481e5b1", + "model_size": "12.61 GB", + "model_dtype": "torch.bfloat16", + "lighteval_sha": "0f318ecf002208468154899217b3ba7c6ae09374", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|arc:challenge|25": { + "acc": 0.5179180887372014, + "acc_stderr": 0.014602005585490973, + "acc_norm": 0.5614334470989761, + "acc_norm_stderr": 0.014500682618212865 + }, + "harness|hellaswag|10": { + "acc": 0.5918143796056562, + "acc_stderr": 0.004904933500255873, + "acc_norm": 0.7912766381198965, + "acc_norm_stderr": 0.004055657006965434 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.34, + "acc_stderr": 0.047609522856952365, + "acc_norm": 0.34, + "acc_norm_stderr": 0.047609522856952365 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.4962962962962963, + "acc_stderr": 0.04319223625811331, + "acc_norm": 0.4962962962962963, + "acc_norm_stderr": 0.04319223625811331 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.5855263157894737, + "acc_stderr": 0.04008973785779206, + "acc_norm": 0.5855263157894737, + "acc_norm_stderr": 0.04008973785779206 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.61, + "acc_stderr": 0.04902071300001974, + "acc_norm": 0.61, + "acc_norm_stderr": 0.04902071300001974 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.6490566037735849, + "acc_stderr": 0.02937364625323469, + "acc_norm": 0.6490566037735849, + "acc_norm_stderr": 0.02937364625323469 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.6527777777777778, + "acc_stderr": 0.039812405437178615, + "acc_norm": 0.6527777777777778, + "acc_norm_stderr": 0.039812405437178615 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.41, + "acc_stderr": 0.049431107042371025, + "acc_norm": 0.41, + "acc_norm_stderr": 0.049431107042371025 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.47, + "acc_stderr": 0.050161355804659205, + "acc_norm": 0.47, + "acc_norm_stderr": 
0.050161355804659205 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.36, + "acc_stderr": 0.04824181513244218, + "acc_norm": 0.36, + "acc_norm_stderr": 0.04824181513244218 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.6358381502890174, + "acc_stderr": 0.03669072477416907, + "acc_norm": 0.6358381502890174, + "acc_norm_stderr": 0.03669072477416907 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.3627450980392157, + "acc_stderr": 0.04784060704105654, + "acc_norm": 0.3627450980392157, + "acc_norm_stderr": 0.04784060704105654 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.77, + "acc_stderr": 0.04229525846816506, + "acc_norm": 0.77, + "acc_norm_stderr": 0.04229525846816506 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.5191489361702127, + "acc_stderr": 0.032662042990646796, + "acc_norm": 0.5191489361702127, + "acc_norm_stderr": 0.032662042990646796 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.32456140350877194, + "acc_stderr": 0.044045561573747664, + "acc_norm": 0.32456140350877194, + "acc_norm_stderr": 0.044045561573747664 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.5793103448275863, + "acc_stderr": 0.0411391498118926, + "acc_norm": 0.5793103448275863, + "acc_norm_stderr": 0.0411391498118926 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.3888888888888889, + "acc_stderr": 0.02510742548113729, + "acc_norm": 0.3888888888888889, + "acc_norm_stderr": 0.02510742548113729 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.36507936507936506, + "acc_stderr": 0.043062412591271526, + "acc_norm": 0.36507936507936506, + "acc_norm_stderr": 0.043062412591271526 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.33, + "acc_stderr": 0.047258156262526045, + "acc_norm": 0.33, + "acc_norm_stderr": 0.047258156262526045 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.6903225806451613, + "acc_stderr": 0.026302774983517418, + "acc_norm": 0.6903225806451613, + "acc_norm_stderr": 0.026302774983517418 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.46798029556650245, + "acc_stderr": 0.035107665979592154, + "acc_norm": 0.46798029556650245, + "acc_norm_stderr": 0.035107665979592154 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.59, + "acc_stderr": 0.049431107042371025, + "acc_norm": 0.59, + "acc_norm_stderr": 0.049431107042371025 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.7575757575757576, + "acc_stderr": 0.03346409881055953, + "acc_norm": 0.7575757575757576, + "acc_norm_stderr": 0.03346409881055953 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.7424242424242424, + "acc_stderr": 0.03115626951964683, + "acc_norm": 0.7424242424242424, + "acc_norm_stderr": 0.03115626951964683 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.8808290155440415, + "acc_stderr": 0.023381935348121437, + "acc_norm": 0.8808290155440415, + "acc_norm_stderr": 0.023381935348121437 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.6, + "acc_stderr": 0.02483881198803316, + "acc_norm": 0.6, + "acc_norm_stderr": 0.02483881198803316 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.3111111111111111, + "acc_stderr": 0.02822644674968352, + "acc_norm": 0.3111111111111111, + "acc_norm_stderr": 0.02822644674968352 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 
0.6260504201680672, + "acc_stderr": 0.031429466378837076, + "acc_norm": 0.6260504201680672, + "acc_norm_stderr": 0.031429466378837076 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.3576158940397351, + "acc_stderr": 0.03913453431177258, + "acc_norm": 0.3576158940397351, + "acc_norm_stderr": 0.03913453431177258 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.7853211009174312, + "acc_stderr": 0.01760430414925648, + "acc_norm": 0.7853211009174312, + "acc_norm_stderr": 0.01760430414925648 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.4166666666666667, + "acc_stderr": 0.03362277436608043, + "acc_norm": 0.4166666666666667, + "acc_norm_stderr": 0.03362277436608043 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.7303921568627451, + "acc_stderr": 0.031145570659486782, + "acc_norm": 0.7303921568627451, + "acc_norm_stderr": 0.031145570659486782 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.8059071729957806, + "acc_stderr": 0.02574490253229093, + "acc_norm": 0.8059071729957806, + "acc_norm_stderr": 0.02574490253229093 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.6681614349775785, + "acc_stderr": 0.031602951437766785, + "acc_norm": 0.6681614349775785, + "acc_norm_stderr": 0.031602951437766785 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.7022900763358778, + "acc_stderr": 0.040103589424622034, + "acc_norm": 0.7022900763358778, + "acc_norm_stderr": 0.040103589424622034 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.8016528925619835, + "acc_stderr": 0.03640118271990944, + "acc_norm": 0.8016528925619835, + "acc_norm_stderr": 0.03640118271990944 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.75, + "acc_stderr": 0.04186091791394607, + "acc_norm": 0.75, + "acc_norm_stderr": 0.04186091791394607 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.7423312883435583, + "acc_stderr": 0.03436150827846917, + "acc_norm": 0.7423312883435583, + "acc_norm_stderr": 0.03436150827846917 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.3482142857142857, + "acc_stderr": 0.04521829902833585, + "acc_norm": 0.3482142857142857, + "acc_norm_stderr": 0.04521829902833585 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.8058252427184466, + "acc_stderr": 0.03916667762822584, + "acc_norm": 0.8058252427184466, + "acc_norm_stderr": 0.03916667762822584 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.8290598290598291, + "acc_stderr": 0.024662496845209807, + "acc_norm": 0.8290598290598291, + "acc_norm_stderr": 0.024662496845209807 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.71, + "acc_stderr": 0.045604802157206845, + "acc_norm": 0.71, + "acc_norm_stderr": 0.045604802157206845 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.776500638569604, + "acc_stderr": 0.01489723522945071, + "acc_norm": 0.776500638569604, + "acc_norm_stderr": 0.01489723522945071 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.6965317919075145, + "acc_stderr": 0.024752411960917205, + "acc_norm": 0.6965317919075145, + "acc_norm_stderr": 0.024752411960917205 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.3407821229050279, + "acc_stderr": 0.015852002449862096, + "acc_norm": 0.3407821229050279, + "acc_norm_stderr": 0.015852002449862096 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.6895424836601307, + "acc_stderr": 0.026493033225145898, + "acc_norm": 0.6895424836601307, + 
"acc_norm_stderr": 0.026493033225145898 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.7170418006430869, + "acc_stderr": 0.02558306248998482, + "acc_norm": 0.7170418006430869, + "acc_norm_stderr": 0.02558306248998482 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.6759259259259259, + "acc_stderr": 0.02604176620271716, + "acc_norm": 0.6759259259259259, + "acc_norm_stderr": 0.02604176620271716 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.4716312056737589, + "acc_stderr": 0.029779450957303062, + "acc_norm": 0.4716312056737589, + "acc_norm_stderr": 0.029779450957303062 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.43285528031290743, + "acc_stderr": 0.012654565234622864, + "acc_norm": 0.43285528031290743, + "acc_norm_stderr": 0.012654565234622864 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.5808823529411765, + "acc_stderr": 0.02997280717046462, + "acc_norm": 0.5808823529411765, + "acc_norm_stderr": 0.02997280717046462 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.6209150326797386, + "acc_stderr": 0.019627444748412236, + "acc_norm": 0.6209150326797386, + "acc_norm_stderr": 0.019627444748412236 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.6454545454545455, + "acc_stderr": 0.04582004841505417, + "acc_norm": 0.6454545454545455, + "acc_norm_stderr": 0.04582004841505417 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.710204081632653, + "acc_stderr": 0.02904308868330433, + "acc_norm": 0.710204081632653, + "acc_norm_stderr": 0.02904308868330433 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.7960199004975125, + "acc_stderr": 0.02849317624532607, + "acc_norm": 0.7960199004975125, + "acc_norm_stderr": 0.02849317624532607 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.84, + "acc_stderr": 0.03684529491774707, + "acc_norm": 0.84, + "acc_norm_stderr": 0.03684529491774707 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.463855421686747, + "acc_stderr": 0.03882310850890594, + "acc_norm": 0.463855421686747, + "acc_norm_stderr": 0.03882310850890594 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.7953216374269005, + "acc_stderr": 0.030944459778533193, + "acc_norm": 0.7953216374269005, + "acc_norm_stderr": 0.030944459778533193 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.27050183598531213, + "mc1_stderr": 0.015550778332842888, + "mc2": 0.40950657377856753, + "mc2_stderr": 0.013879529639480087 + }, + "all": { + "acc": 0.5988501243208318, + "acc_stderr": 0.03358876037616636, + "acc_norm": 0.6029683890136457, + "acc_norm_stderr": 0.03357264852090248, + "mc1": 0.27050183598531213, + "mc1_stderr": 0.015550778332842888, + "mc2": 0.40950657377856753, + "mc2_stderr": 0.013879529639480087 + } + }, + "versions": { + "harness|arc:challenge|25": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + 
"harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "all": 0 + }, + "config_tasks": { + "harness|arc:challenge": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + 
"harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task" + }, + "summary_tasks": { + "harness|arc:challenge|25": { + "hashes": { + "hash_examples": "17b0cae357c0259e", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "a3dd90b7fa78c46f", + "hash_cont_tokens": "e8abf848493b50f7" + }, + "truncated": 0, + "non-truncated": 4687, + "padded": 4687, + "non-padded": 0, + "effective_few_shots": 25.0, + "num_truncated_few_shots": 0 + }, + "harness|hellaswag|10": { + "hashes": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "c1668546a52ba7ee", + "hash_cont_tokens": "9fe0a5c42e1532db" + }, + "truncated": 0, + "non-truncated": 40168, + "padded": 40113, + "non-padded": 55, + "effective_few_shots": 10.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hashes": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "aba8d0543ba39185", + "hash_cont_tokens": "50421e30bef398f9" + 
}, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-anatomy|5": { + "hashes": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "94447c0f9df8a2f2", + "hash_cont_tokens": "f11971a765cb609f" + }, + "truncated": 0, + "non-truncated": 540, + "padded": 540, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-astronomy|5": { + "hashes": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "86fcff08a2687aef", + "hash_cont_tokens": "440a970fadecdc7b" + }, + "truncated": 0, + "non-truncated": 608, + "padded": 608, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-business_ethics|5": { + "hashes": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "d8ae3328b34fcbe9", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hashes": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "2b4300f9c5301d87", + "hash_cont_tokens": "7ecd60c25b9bfe5b" + }, + "truncated": 0, + "non-truncated": 1060, + "padded": 1060, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_biology|5": { + "hashes": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "d8c3c755b7d10c36", + "hash_cont_tokens": "875cde3af7a0ee14" + }, + "truncated": 0, + "non-truncated": 576, + "padded": 576, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_chemistry|5": { + "hashes": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "b923fc66124f3a53", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_computer_science|5": { + "hashes": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "ab78cf0511ea8f9f", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_mathematics|5": { + "hashes": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "d4b0037b2649a3ea", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_medicine|5": { + "hashes": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "a0fad6bb4fe6f18b", + "hash_cont_tokens": "702fb6d82ff0d6ac" + }, + "truncated": 0, + "non-truncated": 692, + "padded": 692, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + 
"harness|hendrycksTest-college_physics|5": { + "hashes": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "85d996d3692166d3", + "hash_cont_tokens": "f7b8097afc16a47c" + }, + "truncated": 0, + "non-truncated": 408, + "padded": 408, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-computer_security|5": { + "hashes": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "84be3f4596b5b001", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hashes": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "f1854cd8319271d0", + "hash_cont_tokens": "aa0e8bc655f2f641" + }, + "truncated": 0, + "non-truncated": 940, + "padded": 940, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-econometrics|5": { + "hashes": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "113f3f725bce334d", + "hash_cont_tokens": "b1cc6e7e9fcd3827" + }, + "truncated": 0, + "non-truncated": 456, + "padded": 456, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hashes": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "5fe907a266e06f2d", + "hash_cont_tokens": "2425a3f084a591ef" + }, + "truncated": 0, + "non-truncated": 580, + "padded": 580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hashes": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "a1a11666961da140", + "hash_cont_tokens": "bd87bf0c060fd925" + }, + "truncated": 0, + "non-truncated": 1512, + "padded": 1512, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-formal_logic|5": { + "hashes": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "2f1eb352e149b62e", + "hash_cont_tokens": "eb8932890e0605db" + }, + "truncated": 0, + "non-truncated": 504, + "padded": 504, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-global_facts|5": { + "hashes": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "3042e08feedf5aef", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_biology|5": { + "hashes": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "8045260dd79ca91b", + "hash_cont_tokens": "1ddcb86d28cde266" + }, + "truncated": 0, + "non-truncated": 1240, + "padded": 1240, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hashes": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + 
"hash_input_tokens": "59fbfffd8c47bcfe", + "hash_cont_tokens": "176c8dcff38c5f8f" + }, + "truncated": 0, + "non-truncated": 812, + "padded": 812, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hashes": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "bfb30129ff73368d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hashes": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "b5538d7bea55a393", + "hash_cont_tokens": "674fc454bdc5ac93" + }, + "truncated": 0, + "non-truncated": 660, + "padded": 656, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_geography|5": { + "hashes": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "2e6f2ab11754f52d", + "hash_cont_tokens": "03a5012b916274ea" + }, + "truncated": 0, + "non-truncated": 792, + "padded": 792, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hashes": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "18f4d06508cdc25a", + "hash_cont_tokens": "873d2aab226ba1d8" + }, + "truncated": 0, + "non-truncated": 772, + "padded": 772, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hashes": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "a8c56eaff54bcd8f", + "hash_cont_tokens": "c583432ad27fcfe0" + }, + "truncated": 0, + "non-truncated": 1560, + "padded": 1560, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hashes": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "dfd920166bafbeab", + "hash_cont_tokens": "d7907b61bcb8c123" + }, + "truncated": 0, + "non-truncated": 1080, + "padded": 1080, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hashes": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "83941ae6969f09da", + "hash_cont_tokens": "f47f041de50333b9" + }, + "truncated": 0, + "non-truncated": 952, + "padded": 952, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_physics|5": { + "hashes": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "83035f00ce4d3f71", + "hash_cont_tokens": "0d56317b3e5eedb5" + }, + "truncated": 0, + "non-truncated": 604, + "padded": 604, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hashes": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "c0b4f291c85e3e0c", + "hash_cont_tokens": 
"09ba1243e7390c0f" + }, + "truncated": 0, + "non-truncated": 2180, + "padded": 2180, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hashes": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "50f48b0a08d05109", + "hash_cont_tokens": "9cc29889c3d3f77d" + }, + "truncated": 0, + "non-truncated": 864, + "padded": 864, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hashes": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "64a6e2e2c0d4f2dc", + "hash_cont_tokens": "cdd0b3dc06d933e5" + }, + "truncated": 0, + "non-truncated": 816, + "padded": 816, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hashes": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "fe2da14f3cf5fab6", + "hash_cont_tokens": "e02816433ff28daf" + }, + "truncated": 0, + "non-truncated": 948, + "padded": 948, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_aging|5": { + "hashes": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "9d05936c3bf80340", + "hash_cont_tokens": "142a4a8a1138a214" + }, + "truncated": 0, + "non-truncated": 892, + "padded": 892, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_sexuality|5": { + "hashes": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "5dcfb47bad3e54e3", + "hash_cont_tokens": "bc54813e809b796d" + }, + "truncated": 0, + "non-truncated": 524, + "padded": 524, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-international_law|5": { + "hashes": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "1d720d65d9a4b09e", + "hash_cont_tokens": "8ea8c5ff76a15bca" + }, + "truncated": 0, + "non-truncated": 484, + "padded": 484, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-jurisprudence|5": { + "hashes": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "a81edfe831ddb4d7", + "hash_cont_tokens": "e3a8cd951b6e3469" + }, + "truncated": 0, + "non-truncated": 432, + "padded": 432, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hashes": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "0a100ec9e60e9845", + "hash_cont_tokens": "3e9e0bdc248fd88a" + }, + "truncated": 0, + "non-truncated": 652, + "padded": 648, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-machine_learning|5": { + "hashes": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "f7de62a8844d30e5", + "hash_cont_tokens": "55b12fb138c6a064" + }, + "truncated": 0, + "non-truncated": 448, + "padded": 448, + "non-padded": 0, + "effective_few_shots": 5.0, + 
"num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-management|5": { + "hashes": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "84909c12af4fb9cb", + "hash_cont_tokens": "a01d6d39a83c4597" + }, + "truncated": 0, + "non-truncated": 412, + "padded": 412, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-marketing|5": { + "hashes": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "e15625e1426fb6a6", + "hash_cont_tokens": "6aeaed4d823c98aa" + }, + "truncated": 0, + "non-truncated": 936, + "padded": 936, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-medical_genetics|5": { + "hashes": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "d8ef4a775d5af214", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-miscellaneous|5": { + "hashes": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "bdd7d35001579262", + "hash_cont_tokens": "9b0ab02a64603081" + }, + "truncated": 0, + "non-truncated": 3132, + "padded": 3132, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_disputes|5": { + "hashes": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "e6e4cbb039c9987a", + "hash_cont_tokens": "3b8bbe9108e55ce9" + }, + "truncated": 0, + "non-truncated": 1384, + "padded": 1354, + "non-padded": 30, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hashes": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "0bbd072dc36eccdf", + "hash_cont_tokens": "3e9bfc0362e97330" + }, + "truncated": 0, + "non-truncated": 3580, + "padded": 3580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-nutrition|5": { + "hashes": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "6d94c283eed90938", + "hash_cont_tokens": "23b2dc6ee2da4cfc" + }, + "truncated": 0, + "non-truncated": 1224, + "padded": 1224, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-philosophy|5": { + "hashes": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "3848642365489c57", + "hash_cont_tokens": "9f6ff69d23a48783" + }, + "truncated": 0, + "non-truncated": 1244, + "padded": 1244, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-prehistory|5": { + "hashes": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "6da2f6b69cef14a0", + "hash_cont_tokens": "d6458d743d875837" + }, + "truncated": 0, + "non-truncated": 1296, + "padded": 1296, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_accounting|5": { + "hashes": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + 
"hash_input_tokens": "f038b6fd176c7aeb", + "hash_cont_tokens": "922a195f53a35662" + }, + "truncated": 0, + "non-truncated": 1128, + "padded": 1128, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_law|5": { + "hashes": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "e09cf8fe8428777e", + "hash_cont_tokens": "2e590029ef41fbcd" + }, + "truncated": 0, + "non-truncated": 6136, + "padded": 6136, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_medicine|5": { + "hashes": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "5e57d5e23aebd728", + "hash_cont_tokens": "7cfee54dbddd5a98" + }, + "truncated": 0, + "non-truncated": 1088, + "padded": 1088, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_psychology|5": { + "hashes": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "842ecf72c3bc61e5", + "hash_cont_tokens": "a86677b2a45c20e1" + }, + "truncated": 0, + "non-truncated": 2448, + "padded": 2448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-public_relations|5": { + "hashes": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "d2bae896a9be38dd", + "hash_cont_tokens": "0d756ccaae031757" + }, + "truncated": 0, + "non-truncated": 440, + "padded": 440, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-security_studies|5": { + "hashes": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "06fa00ff41daa5e1", + "hash_cont_tokens": "b2229bc2cfbf594b" + }, + "truncated": 0, + "non-truncated": 980, + "padded": 980, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-sociology|5": { + "hashes": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "06fe4b96d35f7c14", + "hash_cont_tokens": "c3a3bdfd177eed5b" + }, + "truncated": 0, + "non-truncated": 804, + "padded": 804, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hashes": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "51fefdfe0806e527", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-virology|5": { + "hashes": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "506320b3b19190d9", + "hash_cont_tokens": "af8b3658088cb37f" + }, + "truncated": 0, + "non-truncated": 664, + "padded": 664, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-world_religions|5": { + "hashes": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "6cb71dd631bb845e", + "hash_cont_tokens": "060118bef6de4e0a" + }, + "truncated": 0, + "non-truncated": 684, + "padded": 684, + 
"non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|truthfulqa:mc|0": { + "hashes": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "d76ec491098584f7", + "hash_cont_tokens": "f5da56a132aab151" + }, + "truncated": 0, + "non-truncated": 9996, + "padded": 9996, + "non-padded": 0, + "effective_few_shots": 0.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "d84d18e9a963753d", + "hash_full_prompts": "12b540783521a8e6", + "hash_input_tokens": "afff47ae9be41bdc", + "hash_cont_tokens": "71d56183130fecbd" + }, + "total_evaluation_time_secondes": "4349.653074026108", + "truncated": 0, + "non-truncated": 111019, + "padded": 110926, + "non-padded": 93, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/itsliupeng/llama2_7b_mmlu/results_2023-10-25T10-05-20.920502.json b/eval-results/itsliupeng/llama2_7b_mmlu/results_2023-10-25T10-05-20.920502.json new file mode 100644 index 0000000000000000000000000000000000000000..b5827e37dc659cd43e2358cb4ae81f1af5409664 --- /dev/null +++ b/eval-results/itsliupeng/llama2_7b_mmlu/results_2023-10-25T10-05-20.920502.json @@ -0,0 +1,107 @@ +{ + "config_general": { + "model_name": "itsliupeng/llama2_7b_mmlu", + "model_sha": "553178f8d5d69eb1dfa5b9503d2ce0c1e481e5b1", + "model_size": "12.61 GB", + "model_dtype": "torch.bfloat16", + "lighteval_sha": "0f318ecf002208468154899217b3ba7c6ae09374", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|drop|3": { + "em": 0.0012583892617449664, + "em_stderr": 0.0003630560893119021, + "f1": 0.05594588926174501, + "f1_stderr": 0.0013036425627808016 + }, + "harness|gsm8k|5": { + "acc": 0.07884761182714177, + "acc_stderr": 0.00742339051987324 + }, + "harness|winogrande|5": { + "acc": 0.744277821625888, + "acc_stderr": 0.012261253845440473 + }, + "all": { + "em": 0.0012583892617449664, + "em_stderr": 0.0003630560893119021, + "f1": 0.05594588926174501, + "f1_stderr": 0.0013036425627808016, + "acc": 0.41156271672651484, + "acc_stderr": 0.009842322182656855 + } + }, + "versions": { + "harness|drop|3": 1, + "harness|gsm8k|5": 0, + "harness|winogrande|5": 0, + "all": 0 + }, + "config_tasks": { + "harness|drop": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|drop|3": { + "hashes": { + "hash_examples": "1d27416e8324e9a3", + "hash_full_prompts": "a5513ff9a741b385", + "hash_input_tokens": "642ac625f2ed4a8a", + "hash_cont_tokens": "2182f91e6fd1b8b8" + }, + "truncated": 3, + "non-truncated": 9533, + "padded": 0, + "non-padded": 9536, + "effective_few_shots": 3.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "7181cf25f7c30bc0", + "hash_cont_tokens": "7f8551c7c2d43593" + }, + "truncated": 0, + "non-truncated": 1319, + "padded": 0, + "non-padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "10915604fe04c545", + "hash_cont_tokens": "f08975ad6f2d5864" + }, + "truncated": 0, + "non-truncated": 2534, + "padded": 2432, + "non-padded": 102, + "effective_few_shots": 5.0, + 
"num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "9b4d8993161e637d", + "hash_full_prompts": "08215e527b7e60a5", + "hash_input_tokens": "0dfd32103d4321d0", + "hash_cont_tokens": "671d95d527ba0e3d" + }, + "total_evaluation_time_secondes": "10330.61654472351", + "truncated": 3, + "non-truncated": 13386, + "padded": 2432, + "non-padded": 10957, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/itsliupeng/llama2_7b_zh/results_2023-11-15T10-51-37.128756.json b/eval-results/itsliupeng/llama2_7b_zh/results_2023-11-15T10-51-37.128756.json new file mode 100644 index 0000000000000000000000000000000000000000..f1ca3461bcf8d9b9bf64d98bb619e755126d1902 --- /dev/null +++ b/eval-results/itsliupeng/llama2_7b_zh/results_2023-11-15T10-51-37.128756.json @@ -0,0 +1,1435 @@ +{ + "config_general": { + "lighteval_sha": "9ffc410f6c40b8cfefe7167cb47aefe69ced61e1", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "", + "start_time": 171198.264014494, + "end_time": 184290.244039052, + "total_evaluation_time_secondes": "13091.980024557997", + "model_name": "itsliupeng/llama2_7b_zh", + "model_sha": "410711781d2e24226c0d62959e4990d1de851c3c", + "model_dtype": "torch.bfloat16", + "model_size": "11.35 GB" + }, + "results": { + "harness|arc:challenge|25": { + "acc": 0.47952218430034127, + "acc_stderr": 0.01459913135303501, + "acc_norm": 0.5204778156996587, + "acc_norm_stderr": 0.01459913135303501 + }, + "harness|hellaswag|10": { + "acc": 0.5608444532961562, + "acc_stderr": 0.004952698802275648, + "acc_norm": 0.7487552280422227, + "acc_norm_stderr": 0.004328425700998689 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.3, + "acc_stderr": 0.046056618647183814, + "acc_norm": 0.3, + "acc_norm_stderr": 0.046056618647183814 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.5703703703703704, + "acc_stderr": 0.042763494943765995, + "acc_norm": 0.5703703703703704, + "acc_norm_stderr": 0.042763494943765995 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.6907894736842105, + "acc_stderr": 0.03761070869867479, + "acc_norm": 0.6907894736842105, + "acc_norm_stderr": 0.03761070869867479 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.64, + "acc_stderr": 0.04824181513244218, + "acc_norm": 0.64, + "acc_norm_stderr": 0.04824181513244218 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.6566037735849056, + "acc_stderr": 0.029224526469124792, + "acc_norm": 0.6566037735849056, + "acc_norm_stderr": 0.029224526469124792 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.7013888888888888, + "acc_stderr": 0.03827052357950756, + "acc_norm": 0.7013888888888888, + "acc_norm_stderr": 0.03827052357950756 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.41, + "acc_stderr": 0.049431107042371025, + "acc_norm": 0.41, + "acc_norm_stderr": 0.049431107042371025 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.5, + "acc_stderr": 0.050251890762960605, + "acc_norm": 0.5, + "acc_norm_stderr": 0.050251890762960605 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.36, + "acc_stderr": 0.048241815132442176, + "acc_norm": 0.36, + "acc_norm_stderr": 0.048241815132442176 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.6011560693641619, + "acc_stderr": 0.0373362665538351, + "acc_norm": 0.6011560693641619, + "acc_norm_stderr": 0.0373362665538351 + }, + 
"harness|hendrycksTest-college_physics|5": { + "acc": 0.37254901960784315, + "acc_stderr": 0.04810840148082634, + "acc_norm": 0.37254901960784315, + "acc_norm_stderr": 0.04810840148082634 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.72, + "acc_stderr": 0.04512608598542128, + "acc_norm": 0.72, + "acc_norm_stderr": 0.04512608598542128 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.548936170212766, + "acc_stderr": 0.03252909619613197, + "acc_norm": 0.548936170212766, + "acc_norm_stderr": 0.03252909619613197 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.34210526315789475, + "acc_stderr": 0.044629175353369355, + "acc_norm": 0.34210526315789475, + "acc_norm_stderr": 0.044629175353369355 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.5724137931034483, + "acc_stderr": 0.04122737111370333, + "acc_norm": 0.5724137931034483, + "acc_norm_stderr": 0.04122737111370333 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.3888888888888889, + "acc_stderr": 0.025107425481137285, + "acc_norm": 0.3888888888888889, + "acc_norm_stderr": 0.025107425481137285 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.36507936507936506, + "acc_stderr": 0.04306241259127153, + "acc_norm": 0.36507936507936506, + "acc_norm_stderr": 0.04306241259127153 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.39, + "acc_stderr": 0.04902071300001975, + "acc_norm": 0.39, + "acc_norm_stderr": 0.04902071300001975 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.7064516129032258, + "acc_stderr": 0.025906087021319295, + "acc_norm": 0.7064516129032258, + "acc_norm_stderr": 0.025906087021319295 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.49261083743842365, + "acc_stderr": 0.035176035403610084, + "acc_norm": 0.49261083743842365, + "acc_norm_stderr": 0.035176035403610084 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.62, + "acc_stderr": 0.048783173121456316, + "acc_norm": 0.62, + "acc_norm_stderr": 0.048783173121456316 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.7393939393939394, + "acc_stderr": 0.034277431758165236, + "acc_norm": 0.7393939393939394, + "acc_norm_stderr": 0.034277431758165236 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.7727272727272727, + "acc_stderr": 0.029857515673386414, + "acc_norm": 0.7727272727272727, + "acc_norm_stderr": 0.029857515673386414 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.8393782383419689, + "acc_stderr": 0.02649905770139744, + "acc_norm": 0.8393782383419689, + "acc_norm_stderr": 0.02649905770139744 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.5871794871794872, + "acc_stderr": 0.024962683564331796, + "acc_norm": 0.5871794871794872, + "acc_norm_stderr": 0.024962683564331796 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.3111111111111111, + "acc_stderr": 0.02822644674968352, + "acc_norm": 0.3111111111111111, + "acc_norm_stderr": 0.02822644674968352 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.6764705882352942, + "acc_stderr": 0.0303883535518868, + "acc_norm": 0.6764705882352942, + "acc_norm_stderr": 0.0303883535518868 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.3443708609271523, + "acc_stderr": 0.038796870240733264, + "acc_norm": 0.3443708609271523, + "acc_norm_stderr": 0.038796870240733264 + }, + 
"harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.8091743119266055, + "acc_stderr": 0.01684767640009109, + "acc_norm": 0.8091743119266055, + "acc_norm_stderr": 0.01684767640009109 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.5185185185185185, + "acc_stderr": 0.03407632093854051, + "acc_norm": 0.5185185185185185, + "acc_norm_stderr": 0.03407632093854051 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.7303921568627451, + "acc_stderr": 0.031145570659486782, + "acc_norm": 0.7303921568627451, + "acc_norm_stderr": 0.031145570659486782 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.7383966244725738, + "acc_stderr": 0.028609516716994934, + "acc_norm": 0.7383966244725738, + "acc_norm_stderr": 0.028609516716994934 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.6591928251121076, + "acc_stderr": 0.0318114974705536, + "acc_norm": 0.6591928251121076, + "acc_norm_stderr": 0.0318114974705536 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.6870229007633588, + "acc_stderr": 0.04066962905677697, + "acc_norm": 0.6870229007633588, + "acc_norm_stderr": 0.04066962905677697 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.768595041322314, + "acc_stderr": 0.03849856098794088, + "acc_norm": 0.768595041322314, + "acc_norm_stderr": 0.03849856098794088 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.7685185185185185, + "acc_stderr": 0.04077494709252626, + "acc_norm": 0.7685185185185185, + "acc_norm_stderr": 0.04077494709252626 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.7116564417177914, + "acc_stderr": 0.03559039531617342, + "acc_norm": 0.7116564417177914, + "acc_norm_stderr": 0.03559039531617342 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.4017857142857143, + "acc_stderr": 0.04653333146973646, + "acc_norm": 0.4017857142857143, + "acc_norm_stderr": 0.04653333146973646 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.7766990291262136, + "acc_stderr": 0.04123553189891431, + "acc_norm": 0.7766990291262136, + "acc_norm_stderr": 0.04123553189891431 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.8461538461538461, + "acc_stderr": 0.023636873317489298, + "acc_norm": 0.8461538461538461, + "acc_norm_stderr": 0.023636873317489298 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.74, + "acc_stderr": 0.044084400227680794, + "acc_norm": 0.74, + "acc_norm_stderr": 0.044084400227680794 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.7726692209450831, + "acc_stderr": 0.014987270640946012, + "acc_norm": 0.7726692209450831, + "acc_norm_stderr": 0.014987270640946012 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.6820809248554913, + "acc_stderr": 0.025070713719153176, + "acc_norm": 0.6820809248554913, + "acc_norm_stderr": 0.025070713719153176 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.3217877094972067, + "acc_stderr": 0.015624236160792582, + "acc_norm": 0.3217877094972067, + "acc_norm_stderr": 0.015624236160792582 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.6633986928104575, + "acc_stderr": 0.02705797462449438, + "acc_norm": 0.6633986928104575, + "acc_norm_stderr": 0.02705797462449438 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.684887459807074, + "acc_stderr": 0.026385273703464496, + "acc_norm": 0.684887459807074, + "acc_norm_stderr": 0.026385273703464496 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.654320987654321, + "acc_stderr": 
0.02646248777700187, + "acc_norm": 0.654320987654321, + "acc_norm_stderr": 0.02646248777700187 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.4645390070921986, + "acc_stderr": 0.029752389657427047, + "acc_norm": 0.4645390070921986, + "acc_norm_stderr": 0.029752389657427047 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.455019556714472, + "acc_stderr": 0.012718456618701763, + "acc_norm": 0.455019556714472, + "acc_norm_stderr": 0.012718456618701763 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.6286764705882353, + "acc_stderr": 0.02934980313976587, + "acc_norm": 0.6286764705882353, + "acc_norm_stderr": 0.02934980313976587 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.6160130718954249, + "acc_stderr": 0.01967580813528151, + "acc_norm": 0.6160130718954249, + "acc_norm_stderr": 0.01967580813528151 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.6454545454545455, + "acc_stderr": 0.045820048415054174, + "acc_norm": 0.6454545454545455, + "acc_norm_stderr": 0.045820048415054174 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.7224489795918367, + "acc_stderr": 0.02866685779027465, + "acc_norm": 0.7224489795918367, + "acc_norm_stderr": 0.02866685779027465 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.8009950248756219, + "acc_stderr": 0.028231365092758406, + "acc_norm": 0.8009950248756219, + "acc_norm_stderr": 0.028231365092758406 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.79, + "acc_stderr": 0.040936018074033256, + "acc_norm": 0.79, + "acc_norm_stderr": 0.040936018074033256 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.5180722891566265, + "acc_stderr": 0.03889951252827216, + "acc_norm": 0.5180722891566265, + "acc_norm_stderr": 0.03889951252827216 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.7953216374269005, + "acc_stderr": 0.030944459778533207, + "acc_norm": 0.7953216374269005, + "acc_norm_stderr": 0.030944459778533207 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.2766217870257038, + "mc1_stderr": 0.015659605755326912, + "mc2": 0.42858587749612026, + "mc2_stderr": 0.014059235435250938 + }, + "harness|winogrande|5": { + "acc": 0.7174427782162589, + "acc_stderr": 0.01265406285097139 + }, + "harness|drop|3": { + "em": 0.18791946308724833, + "em_stderr": 0.004000599568072892, + "f1": 0.23667890100671124, + "f1_stderr": 0.003992615682814011 + }, + "harness|gsm8k|5": { + "acc": 0.06444275966641395, + "acc_stderr": 0.006763391728488265 + }, + "all": { + "acc": 0.5969511263414031, + "acc_stderr": 0.0329865461490785, + "acc_norm": 0.6078135521201408, + "acc_norm_stderr": 0.03376504385445851, + "mc1": 0.2766217870257038, + "mc1_stderr": 0.015659605755326912, + "mc2": 0.42858587749612026, + "mc2_stderr": 0.014059235435250938, + "em": 0.18791946308724833, + "em_stderr": 0.004000599568072892, + "f1": 0.23667890100671124, + "f1_stderr": 0.003992615682814011 + } + }, + "versions": { + "all": 0, + "harness|arc:challenge|25": 0, + "harness|drop|3": 1, + "harness|gsm8k|5": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + 
"harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "harness|winogrande|5": 0 + }, + "config_tasks": { + "harness|arc:challenge": "LM Harness task", + "harness|drop": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + 
"harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|arc:challenge|25": { + "hashes": { + "hash_examples": "17b0cae357c0259e", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "c84bbabff7655573", + "hash_cont_tokens": "e23c779c4c2dd1ec" + }, + "truncated": 0, + "non_truncated": 1172, + "padded": 4682, + "non_padded": 5, + "effective_few_shots": 25.0, + "num_truncated_few_shots": 0 + }, + "harness|hellaswag|10": { + "hashes": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "52e70aa3670e3695", + "hash_cont_tokens": "55da5ba61989a8fe" + }, + "truncated": 0, + "non_truncated": 10042, + "padded": 40097, + "non_padded": 
71, + "effective_few_shots": 10.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hashes": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "085f405a873c9f87", + "hash_cont_tokens": "bcc22fd85dcc85e9" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-anatomy|5": { + "hashes": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "3b492ddc5de3f57a", + "hash_cont_tokens": "5cc800feae9fa1ad" + }, + "truncated": 0, + "non_truncated": 135, + "padded": 540, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-astronomy|5": { + "hashes": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "aa55e6645b3f3526", + "hash_cont_tokens": "655dbb90034f484a" + }, + "truncated": 0, + "non_truncated": 152, + "padded": 608, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-business_ethics|5": { + "hashes": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "5f80d5327a047022", + "hash_cont_tokens": "bcc22fd85dcc85e9" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hashes": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "c0a3ae71b5506278", + "hash_cont_tokens": "f77b74d946d7fc02" + }, + "truncated": 0, + "non_truncated": 265, + "padded": 1060, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_biology|5": { + "hashes": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "6fcc5fb2ad3a62b5", + "hash_cont_tokens": "1ba4b1a158d8bf3f" + }, + "truncated": 0, + "non_truncated": 144, + "padded": 576, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_chemistry|5": { + "hashes": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "b3c5950ef0ab5b9f", + "hash_cont_tokens": "bcc22fd85dcc85e9" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_computer_science|5": { + "hashes": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "d4b18e1debc64387", + "hash_cont_tokens": "bcc22fd85dcc85e9" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_mathematics|5": { + "hashes": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "78289261a74f39aa", + "hash_cont_tokens": "bcc22fd85dcc85e9" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_medicine|5": { + "hashes": { + "hash_examples": 
"dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "5449a8e432780f7f", + "hash_cont_tokens": "78a0ebf66d91c5cf" + }, + "truncated": 0, + "non_truncated": 173, + "padded": 692, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_physics|5": { + "hashes": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "b55be981de130fed", + "hash_cont_tokens": "5a030c95824fdbe5" + }, + "truncated": 0, + "non_truncated": 102, + "padded": 408, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-computer_security|5": { + "hashes": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "b39d36783fd07415", + "hash_cont_tokens": "bcc22fd85dcc85e9" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hashes": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "90db261ac05081a8", + "hash_cont_tokens": "2326dc60d0bc41b6" + }, + "truncated": 0, + "non_truncated": 235, + "padded": 940, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-econometrics|5": { + "hashes": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "3b6ab5e66082a68d", + "hash_cont_tokens": "be908364b6f14dd6" + }, + "truncated": 0, + "non_truncated": 114, + "padded": 456, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hashes": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "a8e0453f990ff5aa", + "hash_cont_tokens": "179280ef597fe1bf" + }, + "truncated": 0, + "non_truncated": 145, + "padded": 564, + "non_padded": 16, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hashes": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "9e30d3a741143c4a", + "hash_cont_tokens": "95cdcdaf1abd0bd2" + }, + "truncated": 0, + "non_truncated": 378, + "padded": 1512, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-formal_logic|5": { + "hashes": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "06838690ab0d64b9", + "hash_cont_tokens": "6a4818f3c307c346" + }, + "truncated": 0, + "non_truncated": 126, + "padded": 504, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-global_facts|5": { + "hashes": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "50dc8670e216ba78", + "hash_cont_tokens": "bcc22fd85dcc85e9" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_biology|5": { + "hashes": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "0097a3c431b4fc51", + "hash_cont_tokens": "36d0d84455f0bdba" + }, + 
"truncated": 0, + "non_truncated": 310, + "padded": 1240, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hashes": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "75f3de0dad7830bc", + "hash_cont_tokens": "c678f794a9b8ee74" + }, + "truncated": 0, + "non_truncated": 203, + "padded": 812, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hashes": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "bc373cd584fa942b", + "hash_cont_tokens": "bcc22fd85dcc85e9" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hashes": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "507c0abd3d17fd8f", + "hash_cont_tokens": "e9c94304326d875c" + }, + "truncated": 0, + "non_truncated": 165, + "padded": 656, + "non_padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_geography|5": { + "hashes": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "a8ab4dfafa4f65b4", + "hash_cont_tokens": "f937a1349eb483eb" + }, + "truncated": 0, + "non_truncated": 198, + "padded": 792, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hashes": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "e33171fd6e0b4a9c", + "hash_cont_tokens": "8b27dd3907d25b4e" + }, + "truncated": 0, + "non_truncated": 193, + "padded": 772, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hashes": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "f3319223cf191987", + "hash_cont_tokens": "3763cae29e2f938c" + }, + "truncated": 0, + "non_truncated": 390, + "padded": 1560, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hashes": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "2f08fbb89a3a31b0", + "hash_cont_tokens": "fd7b555352d765a4" + }, + "truncated": 0, + "non_truncated": 270, + "padded": 1080, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hashes": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "d2ff2b6e81f3e039", + "hash_cont_tokens": "61f46d4a209b9aa2" + }, + "truncated": 0, + "non_truncated": 238, + "padded": 952, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_physics|5": { + "hashes": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "dd50a9b81a6e14a2", + "hash_cont_tokens": "4e7053e7c19d680d" + }, + "truncated": 0, + "non_truncated": 151, + "padded": 604, + "non_padded": 0, 
+ "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hashes": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "d5f514e075b8a310", + "hash_cont_tokens": "84d19ae8790476bb" + }, + "truncated": 0, + "non_truncated": 545, + "padded": 2180, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hashes": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "3faf848f9d19cb14", + "hash_cont_tokens": "b119c7b668213a4e" + }, + "truncated": 0, + "non_truncated": 216, + "padded": 864, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hashes": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "dafa7c29ee53148d", + "hash_cont_tokens": "a3b126bc622d571f" + }, + "truncated": 0, + "non_truncated": 204, + "padded": 816, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hashes": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "f3f7c0cb054a9101", + "hash_cont_tokens": "9abf19ceb76331ff" + }, + "truncated": 0, + "non_truncated": 237, + "padded": 948, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_aging|5": { + "hashes": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "ee334f2be12733c8", + "hash_cont_tokens": "0e2e725ae9a898da" + }, + "truncated": 0, + "non_truncated": 223, + "padded": 892, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_sexuality|5": { + "hashes": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "a9997011eacb1c14", + "hash_cont_tokens": "a94c1dea6d775249" + }, + "truncated": 0, + "non_truncated": 131, + "padded": 524, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-international_law|5": { + "hashes": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "5e065bb834e5eb5f", + "hash_cont_tokens": "3832f860859bb86b" + }, + "truncated": 0, + "non_truncated": 121, + "padded": 484, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-jurisprudence|5": { + "hashes": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "6694a4e4327a0eee", + "hash_cont_tokens": "9fac5a0c364fca8a" + }, + "truncated": 0, + "non_truncated": 108, + "padded": 432, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hashes": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "630193f0a85c4db4", + "hash_cont_tokens": "dc53ed31134ddf3a" + }, + "truncated": 0, + "non_truncated": 163, + "padded": 644, + "non_padded": 8, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-machine_learning|5": { + "hashes": { + 
"hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "481eec60fca7d379", + "hash_cont_tokens": "e272b5456d5552d6" + }, + "truncated": 0, + "non_truncated": 112, + "padded": 448, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-management|5": { + "hashes": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "5e29b566e42d5c49", + "hash_cont_tokens": "7119d4642957b1f0" + }, + "truncated": 0, + "non_truncated": 103, + "padded": 412, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-marketing|5": { + "hashes": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "abc950328f30685d", + "hash_cont_tokens": "099d58c66ece3f11" + }, + "truncated": 0, + "non_truncated": 234, + "padded": 936, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-medical_genetics|5": { + "hashes": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "7b7f0526063c20bd", + "hash_cont_tokens": "bcc22fd85dcc85e9" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-miscellaneous|5": { + "hashes": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "2f35d509e71e13d9", + "hash_cont_tokens": "bae342d4e82ba8f7" + }, + "truncated": 0, + "non_truncated": 783, + "padded": 3132, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_disputes|5": { + "hashes": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "a1fe66c367aec9a4", + "hash_cont_tokens": "578c64cbdbb1e0d4" + }, + "truncated": 0, + "non_truncated": 346, + "padded": 1384, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hashes": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "477794fff20bb51b", + "hash_cont_tokens": "79b25f42b3fce0f9" + }, + "truncated": 0, + "non_truncated": 895, + "padded": 3580, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-nutrition|5": { + "hashes": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "f0035147162e2914", + "hash_cont_tokens": "9d1f3b976417156c" + }, + "truncated": 0, + "non_truncated": 306, + "padded": 1224, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-philosophy|5": { + "hashes": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "afde0a4bb78262a8", + "hash_cont_tokens": "88dab560e1e06d97" + }, + "truncated": 0, + "non_truncated": 311, + "padded": 1244, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-prehistory|5": { + "hashes": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "80cbaf9c72217b9b", + "hash_cont_tokens": "04ea847139fe9393" + }, + "truncated": 0, + 
"non_truncated": 324, + "padded": 1296, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_accounting|5": { + "hashes": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "34fa03402fe143e2", + "hash_cont_tokens": "0435ff692ad17e68" + }, + "truncated": 0, + "non_truncated": 282, + "padded": 1124, + "non_padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_law|5": { + "hashes": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "970559d2709d7dfb", + "hash_cont_tokens": "b852c74e9f8801bd" + }, + "truncated": 0, + "non_truncated": 1534, + "padded": 6136, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_medicine|5": { + "hashes": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "e6bad9d3d227482c", + "hash_cont_tokens": "5db0f6460652d063" + }, + "truncated": 0, + "non_truncated": 272, + "padded": 1088, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_psychology|5": { + "hashes": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "5915ac075f743cd6", + "hash_cont_tokens": "c960676ef7f3dbe5" + }, + "truncated": 0, + "non_truncated": 612, + "padded": 2448, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-public_relations|5": { + "hashes": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "abdaa0333725e504", + "hash_cont_tokens": "3320565f412c4b01" + }, + "truncated": 0, + "non_truncated": 110, + "padded": 440, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-security_studies|5": { + "hashes": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "5e5e21ce02813577", + "hash_cont_tokens": "218ed775ef60aab9" + }, + "truncated": 0, + "non_truncated": 245, + "padded": 980, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-sociology|5": { + "hashes": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "74f6e50f8da04eb6", + "hash_cont_tokens": "20babf5cc4cc7f3d" + }, + "truncated": 0, + "non_truncated": 201, + "padded": 804, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hashes": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "4234573f54827f4f", + "hash_cont_tokens": "bcc22fd85dcc85e9" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-virology|5": { + "hashes": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "d8f9c3d810f8d6f2", + "hash_cont_tokens": "dc6d57296bea0882" + }, + "truncated": 0, + "non_truncated": 166, + "padded": 664, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + 
"harness|hendrycksTest-world_religions|5": { + "hashes": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "a96ae58b7a2f1010", + "hash_cont_tokens": "37f53444db289ed3" + }, + "truncated": 0, + "non_truncated": 171, + "padded": 684, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|truthfulqa:mc|0": { + "hashes": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "4214b9bf45e97067", + "hash_cont_tokens": "71a67034827cd30e" + }, + "truncated": 0, + "non_truncated": 817, + "padded": 9996, + "non_padded": 0, + "effective_few_shots": 0.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "a7eeaad96f70499b", + "hash_cont_tokens": "c93e9c22fa3077a0" + }, + "truncated": 0, + "non_truncated": 1267, + "padded": 2534, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|drop|3": { + "hashes": { + "hash_examples": "1d27416e8324e9a3", + "hash_full_prompts": "a5513ff9a741b385", + "hash_input_tokens": "0e6ecbc56f7e5009", + "hash_cont_tokens": "1998fae1651f30c4" + }, + "truncated": 1, + "non_truncated": 9535, + "padded": 0, + "non_padded": 9536, + "effective_few_shots": 3.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "d488b9ef001d40f5", + "hash_cont_tokens": "35ffaa3d11f6dc08" + }, + "truncated": 0, + "non_truncated": 1319, + "padded": 0, + "non_padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "4eb459f19fc0f29d", + "hash_full_prompts": "21653ed56f202b4e", + "hash_input_tokens": "30bfead6e298fa54", + "hash_cont_tokens": "e521263859f17353" + }, + "truncated": 1, + "non_truncated": 38194, + "padded": 113445, + "non_padded": 10963, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/itsliupeng/openllama-7b-base/results_2023-12-09T17-41-52.346369.json b/eval-results/itsliupeng/openllama-7b-base/results_2023-12-09T17-41-52.346369.json new file mode 100644 index 0000000000000000000000000000000000000000..aa5a4ac495ba8ea2d25b2b685d2d7d5c7979af1c --- /dev/null +++ b/eval-results/itsliupeng/openllama-7b-base/results_2023-12-09T17-41-52.346369.json @@ -0,0 +1,1409 @@ +{ + "config_general": { + "lighteval_sha": "0e4607eff593f6f842aeaa0e5fa6760f58b9d1e9", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "", + "start_time": 592937.035189291, + "end_time": 598583.213599166, + "total_evaluation_time_secondes": "5646.178409875021", + "model_name": "itsliupeng/openllama-7b-base", + "model_sha": "24d98f339fabfa479e3c85404f5e4dda9e43dcd1", + "model_dtype": "torch.bfloat16", + "model_size": "12.58 GB" + }, + "results": { + "harness|arc:challenge|25": { + "acc": 0.44197952218430037, + "acc_stderr": 0.014512682523128343, + "acc_norm": 0.4616040955631399, + "acc_norm_stderr": 0.01456824555029636 + }, + "harness|hellaswag|10": { + "acc": 0.5703047201752639, + "acc_stderr": 0.004940208641372079, + "acc_norm": 0.7639912368054173, + "acc_norm_stderr": 0.0042375981420072475 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.28, + "acc_stderr": 
0.045126085985421296, + "acc_norm": 0.28, + "acc_norm_stderr": 0.045126085985421296 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.43703703703703706, + "acc_stderr": 0.042849586397533994, + "acc_norm": 0.43703703703703706, + "acc_norm_stderr": 0.042849586397533994 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.42105263157894735, + "acc_stderr": 0.04017901275981749, + "acc_norm": 0.42105263157894735, + "acc_norm_stderr": 0.04017901275981749 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.37, + "acc_stderr": 0.04852365870939099, + "acc_norm": 0.37, + "acc_norm_stderr": 0.04852365870939099 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.4830188679245283, + "acc_stderr": 0.030755120364119905, + "acc_norm": 0.4830188679245283, + "acc_norm_stderr": 0.030755120364119905 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.4722222222222222, + "acc_stderr": 0.04174752578923185, + "acc_norm": 0.4722222222222222, + "acc_norm_stderr": 0.04174752578923185 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.31, + "acc_stderr": 0.04648231987117316, + "acc_norm": 0.31, + "acc_norm_stderr": 0.04648231987117316 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.38, + "acc_stderr": 0.048783173121456316, + "acc_norm": 0.38, + "acc_norm_stderr": 0.048783173121456316 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.3, + "acc_stderr": 0.046056618647183814, + "acc_norm": 0.3, + "acc_norm_stderr": 0.046056618647183814 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.3930635838150289, + "acc_stderr": 0.03724249595817729, + "acc_norm": 0.3930635838150289, + "acc_norm_stderr": 0.03724249595817729 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.19607843137254902, + "acc_stderr": 0.039505818611799616, + "acc_norm": 0.19607843137254902, + "acc_norm_stderr": 0.039505818611799616 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.54, + "acc_stderr": 0.05009082659620332, + "acc_norm": 0.54, + "acc_norm_stderr": 0.05009082659620332 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.3702127659574468, + "acc_stderr": 0.03156564682236784, + "acc_norm": 0.3702127659574468, + "acc_norm_stderr": 0.03156564682236784 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.30701754385964913, + "acc_stderr": 0.04339138322579861, + "acc_norm": 0.30701754385964913, + "acc_norm_stderr": 0.04339138322579861 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.5241379310344828, + "acc_stderr": 0.041618085035015295, + "acc_norm": 0.5241379310344828, + "acc_norm_stderr": 0.041618085035015295 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.28835978835978837, + "acc_stderr": 0.0233306540545359, + "acc_norm": 0.28835978835978837, + "acc_norm_stderr": 0.0233306540545359 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.2698412698412698, + "acc_stderr": 0.03970158273235172, + "acc_norm": 0.2698412698412698, + "acc_norm_stderr": 0.03970158273235172 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.34, + "acc_stderr": 0.04760952285695235, + "acc_norm": 0.34, + "acc_norm_stderr": 0.04760952285695235 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.45806451612903226, + "acc_stderr": 0.028343787250540618, + "acc_norm": 0.45806451612903226, + "acc_norm_stderr": 0.028343787250540618 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.3103448275862069, + "acc_stderr": 0.03255086769970103, 
+ "acc_norm": 0.3103448275862069, + "acc_norm_stderr": 0.03255086769970103 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.42, + "acc_stderr": 0.04960449637488584, + "acc_norm": 0.42, + "acc_norm_stderr": 0.04960449637488584 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.4909090909090909, + "acc_stderr": 0.0390369864774844, + "acc_norm": 0.4909090909090909, + "acc_norm_stderr": 0.0390369864774844 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.4898989898989899, + "acc_stderr": 0.035616254886737454, + "acc_norm": 0.4898989898989899, + "acc_norm_stderr": 0.035616254886737454 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.6321243523316062, + "acc_stderr": 0.034801756684660366, + "acc_norm": 0.6321243523316062, + "acc_norm_stderr": 0.034801756684660366 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.4076923076923077, + "acc_stderr": 0.024915243985987837, + "acc_norm": 0.4076923076923077, + "acc_norm_stderr": 0.024915243985987837 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.25925925925925924, + "acc_stderr": 0.026719240783712163, + "acc_norm": 0.25925925925925924, + "acc_norm_stderr": 0.026719240783712163 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.36134453781512604, + "acc_stderr": 0.031204691225150013, + "acc_norm": 0.36134453781512604, + "acc_norm_stderr": 0.031204691225150013 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.271523178807947, + "acc_stderr": 0.03631329803969653, + "acc_norm": 0.271523178807947, + "acc_norm_stderr": 0.03631329803969653 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.5614678899082569, + "acc_stderr": 0.021274713073954565, + "acc_norm": 0.5614678899082569, + "acc_norm_stderr": 0.021274713073954565 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.3148148148148148, + "acc_stderr": 0.0316746870682898, + "acc_norm": 0.3148148148148148, + "acc_norm_stderr": 0.0316746870682898 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.45098039215686275, + "acc_stderr": 0.03492406104163613, + "acc_norm": 0.45098039215686275, + "acc_norm_stderr": 0.03492406104163613 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.5569620253164557, + "acc_stderr": 0.032335327775334835, + "acc_norm": 0.5569620253164557, + "acc_norm_stderr": 0.032335327775334835 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.4260089686098655, + "acc_stderr": 0.033188332862172806, + "acc_norm": 0.4260089686098655, + "acc_norm_stderr": 0.033188332862172806 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.48854961832061067, + "acc_stderr": 0.043841400240780176, + "acc_norm": 0.48854961832061067, + "acc_norm_stderr": 0.043841400240780176 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.5537190082644629, + "acc_stderr": 0.0453793517794788, + "acc_norm": 0.5537190082644629, + "acc_norm_stderr": 0.0453793517794788 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.48148148148148145, + "acc_stderr": 0.04830366024635331, + "acc_norm": 0.48148148148148145, + "acc_norm_stderr": 0.04830366024635331 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.48466257668711654, + "acc_stderr": 0.039265223787088424, + "acc_norm": 0.48466257668711654, + "acc_norm_stderr": 0.039265223787088424 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.375, + "acc_stderr": 
0.04595091388086298, + "acc_norm": 0.375, + "acc_norm_stderr": 0.04595091388086298 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.5242718446601942, + "acc_stderr": 0.049449010929737795, + "acc_norm": 0.5242718446601942, + "acc_norm_stderr": 0.049449010929737795 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.6111111111111112, + "acc_stderr": 0.03193705726200293, + "acc_norm": 0.6111111111111112, + "acc_norm_stderr": 0.03193705726200293 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.43, + "acc_stderr": 0.049756985195624284, + "acc_norm": 0.43, + "acc_norm_stderr": 0.049756985195624284 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.5925925925925926, + "acc_stderr": 0.017570705239256558, + "acc_norm": 0.5925925925925926, + "acc_norm_stderr": 0.017570705239256558 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.4884393063583815, + "acc_stderr": 0.02691189868637792, + "acc_norm": 0.4884393063583815, + "acc_norm_stderr": 0.02691189868637792 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.2435754189944134, + "acc_stderr": 0.01435591196476786, + "acc_norm": 0.2435754189944134, + "acc_norm_stderr": 0.01435591196476786 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.477124183006536, + "acc_stderr": 0.028599936776089786, + "acc_norm": 0.477124183006536, + "acc_norm_stderr": 0.028599936776089786 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.45980707395498394, + "acc_stderr": 0.028306190403305693, + "acc_norm": 0.45980707395498394, + "acc_norm_stderr": 0.028306190403305693 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.4876543209876543, + "acc_stderr": 0.027812262269327242, + "acc_norm": 0.4876543209876543, + "acc_norm_stderr": 0.027812262269327242 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.3546099290780142, + "acc_stderr": 0.02853865002887864, + "acc_norm": 0.3546099290780142, + "acc_norm_stderr": 0.02853865002887864 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.3389830508474576, + "acc_stderr": 0.012089941857584477, + "acc_norm": 0.3389830508474576, + "acc_norm_stderr": 0.012089941857584477 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.41544117647058826, + "acc_stderr": 0.02993534270787775, + "acc_norm": 0.41544117647058826, + "acc_norm_stderr": 0.02993534270787775 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.4133986928104575, + "acc_stderr": 0.019922115682786682, + "acc_norm": 0.4133986928104575, + "acc_norm_stderr": 0.019922115682786682 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.5, + "acc_stderr": 0.04789131426105757, + "acc_norm": 0.5, + "acc_norm_stderr": 0.04789131426105757 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.45714285714285713, + "acc_stderr": 0.031891418324213966, + "acc_norm": 0.45714285714285713, + "acc_norm_stderr": 0.031891418324213966 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.5771144278606966, + "acc_stderr": 0.034932317774212816, + "acc_norm": 0.5771144278606966, + "acc_norm_stderr": 0.034932317774212816 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.6, + "acc_stderr": 0.04923659639173309, + "acc_norm": 0.6, + "acc_norm_stderr": 0.04923659639173309 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.3614457831325301, + "acc_stderr": 0.037400593820293204, + "acc_norm": 0.3614457831325301, + "acc_norm_stderr": 0.037400593820293204 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 
0.5964912280701754, + "acc_stderr": 0.03762738699917057, + "acc_norm": 0.5964912280701754, + "acc_norm_stderr": 0.03762738699917057 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.23133414932680538, + "mc1_stderr": 0.014761945174862677, + "mc2": 0.3664912047351792, + "mc2_stderr": 0.01364656500793206 + }, + "harness|winogrande|5": { + "acc": 0.7087608524072613, + "acc_stderr": 0.012769029305370702 + }, + "harness|gsm8k|5": { + "acc": 0.09628506444275967, + "acc_stderr": 0.008125264128215908 + }, + "all": { + "acc": 0.42989152566033884, + "acc_stderr": 0.03449698744058074, + "acc_norm": 0.43443471590575655, + "acc_norm_stderr": 0.03530126937236681, + "mc1": 0.23133414932680538, + "mc1_stderr": 0.014761945174862677, + "mc2": 0.3664912047351792, + "mc2_stderr": 0.01364656500793206 + } + }, + "versions": { + "all": 0, + "harness|arc:challenge|25": 0, + "harness|gsm8k|5": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + 
"harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "harness|winogrande|5": 0 + }, + "config_tasks": { + "harness|arc:challenge": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + 
"harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|arc:challenge|25": { + "hashes": { + "hash_examples": "17b0cae357c0259e", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "041985237c885d54", + "hash_cont_tokens": "2e8835aa03b9c2cf" + }, + "truncated": 0, + "non_truncated": 1172, + "padded": 4676, + "non_padded": 11, + "effective_few_shots": 25.0, + "num_truncated_few_shots": 0 + }, + "harness|hellaswag|10": { + "hashes": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "21d693dbbc77cc0b", + "hash_cont_tokens": "18a48de3edcef462" + }, + "truncated": 0, + "non_truncated": 10042, + "padded": 39987, + "non_padded": 181, + "effective_few_shots": 10.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hashes": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "001ebf934ceb5cfc", + "hash_cont_tokens": "ce26aac83e938006" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-anatomy|5": { + "hashes": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "d6e588681ccff46f", + "hash_cont_tokens": "1d81fa80e3039a08" + }, + "truncated": 0, + "non_truncated": 135, + "padded": 540, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-astronomy|5": { + "hashes": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "f3e9971a61ea9d00", + "hash_cont_tokens": "247dc44c6b578728" + }, + "truncated": 0, + "non_truncated": 152, + "padded": 608, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-business_ethics|5": { + "hashes": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "ae5001e99fd680cc", + "hash_cont_tokens": "ce26aac83e938006" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hashes": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "1ce6abe4e260f248", + "hash_cont_tokens": "26e3b69d5fb27bb2" + }, + "truncated": 0, + "non_truncated": 265, + "padded": 1060, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_biology|5": { + "hashes": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "f3b3a09b96723203", + "hash_cont_tokens": "bbda31842f3930d5" + }, + "truncated": 0, + "non_truncated": 144, + 
"padded": 568, + "non_padded": 8, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_chemistry|5": { + "hashes": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "cf267ebed654ee2a", + "hash_cont_tokens": "ce26aac83e938006" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_computer_science|5": { + "hashes": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "63a2fce9e8a6c4ce", + "hash_cont_tokens": "ce26aac83e938006" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_mathematics|5": { + "hashes": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "bbefbc4ba4160e4d", + "hash_cont_tokens": "ce26aac83e938006" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_medicine|5": { + "hashes": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "ba7af09ead7df486", + "hash_cont_tokens": "894854ed7bec57f7" + }, + "truncated": 0, + "non_truncated": 173, + "padded": 688, + "non_padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_physics|5": { + "hashes": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "2f5303106dc26337", + "hash_cont_tokens": "13130ec6de384bbb" + }, + "truncated": 0, + "non_truncated": 102, + "padded": 408, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-computer_security|5": { + "hashes": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "151396dced83ec83", + "hash_cont_tokens": "ce26aac83e938006" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hashes": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "e623a4a7f37c694c", + "hash_cont_tokens": "29089b8b7020611e" + }, + "truncated": 0, + "non_truncated": 235, + "padded": 940, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-econometrics|5": { + "hashes": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "406b5663640b79e3", + "hash_cont_tokens": "efc596dfa1a1f073" + }, + "truncated": 0, + "non_truncated": 114, + "padded": 456, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hashes": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "082aaa5548e614aa", + "hash_cont_tokens": "70817a7ac9f44af2" + }, + "truncated": 0, + "non_truncated": 145, + "padded": 560, + "non_padded": 20, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + 
"harness|hendrycksTest-elementary_mathematics|5": { + "hashes": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "3e411365794de91f", + "hash_cont_tokens": "937cd53d06cc6e16" + }, + "truncated": 0, + "non_truncated": 378, + "padded": 1512, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-formal_logic|5": { + "hashes": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "8620b3e66c880f2b", + "hash_cont_tokens": "eec972abe0fc0f5a" + }, + "truncated": 0, + "non_truncated": 126, + "padded": 504, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-global_facts|5": { + "hashes": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "0a9061121f4cfd2e", + "hash_cont_tokens": "ce26aac83e938006" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_biology|5": { + "hashes": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "4cd23dba5ca30403", + "hash_cont_tokens": "94971ccfe8e59c25" + }, + "truncated": 0, + "non_truncated": 310, + "padded": 1240, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hashes": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "83cfc43951998dbd", + "hash_cont_tokens": "a78e38b59778a04c" + }, + "truncated": 0, + "non_truncated": 203, + "padded": 812, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hashes": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "ca1be9098cb9250d", + "hash_cont_tokens": "ce26aac83e938006" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hashes": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "142b719c7d7d4fe0", + "hash_cont_tokens": "91dc522e4e4e91c3" + }, + "truncated": 660, + "non_truncated": -495, + "padded": 0, + "non_padded": 660, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_geography|5": { + "hashes": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "2da66fb3a3c8439a", + "hash_cont_tokens": "f275c901b3d285f9" + }, + "truncated": 0, + "non_truncated": 198, + "padded": 792, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hashes": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "cd855470d1b7aafc", + "hash_cont_tokens": "85eb58f423437cce" + }, + "truncated": 0, + "non_truncated": 193, + "padded": 770, + "non_padded": 2, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hashes": { + "hash_examples": 
"59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "0172a82e3f172856", + "hash_cont_tokens": "39a93706184f896b" + }, + "truncated": 0, + "non_truncated": 390, + "padded": 1560, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hashes": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "9c0fb5b04cbaf714", + "hash_cont_tokens": "d41065d20b689af3" + }, + "truncated": 0, + "non_truncated": 270, + "padded": 1080, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hashes": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "40c8cb775f0d9b69", + "hash_cont_tokens": "28c1f7c11bf85409" + }, + "truncated": 0, + "non_truncated": 238, + "padded": 952, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_physics|5": { + "hashes": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "da586570f7046bae", + "hash_cont_tokens": "78c510e6c5d316ac" + }, + "truncated": 0, + "non_truncated": 151, + "padded": 604, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hashes": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "26e52e1bb05450ab", + "hash_cont_tokens": "0ba4ecffc67603c5" + }, + "truncated": 0, + "non_truncated": 545, + "padded": 2180, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hashes": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "c46477e30932b082", + "hash_cont_tokens": "4a0339e9ad3efa6d" + }, + "truncated": 0, + "non_truncated": 216, + "padded": 864, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hashes": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "a54112084a848a44", + "hash_cont_tokens": "2529d55ec490f81f" + }, + "truncated": 816, + "non_truncated": -612, + "padded": 0, + "non_padded": 816, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hashes": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "2f52872fc891c1b5", + "hash_cont_tokens": "21808b54f5df97b2" + }, + "truncated": 0, + "non_truncated": 237, + "padded": 948, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_aging|5": { + "hashes": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "7cf58bc03f52313c", + "hash_cont_tokens": "92acdd467ed943e1" + }, + "truncated": 0, + "non_truncated": 223, + "padded": 892, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_sexuality|5": { + "hashes": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "687501ddb0f34488", + 
"hash_cont_tokens": "a6034ed95a124315" + }, + "truncated": 0, + "non_truncated": 131, + "padded": 524, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-international_law|5": { + "hashes": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "cf8eea3d44d90bb4", + "hash_cont_tokens": "223fbf3fd106c04b" + }, + "truncated": 0, + "non_truncated": 121, + "padded": 484, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-jurisprudence|5": { + "hashes": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "0d6405105760514c", + "hash_cont_tokens": "7c8e30f486ff156a" + }, + "truncated": 0, + "non_truncated": 108, + "padded": 428, + "non_padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hashes": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "16c5482fad1409bc", + "hash_cont_tokens": "b4cc4a8d31bbaa03" + }, + "truncated": 0, + "non_truncated": 163, + "padded": 636, + "non_padded": 16, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-machine_learning|5": { + "hashes": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "85014eabaa9b54b8", + "hash_cont_tokens": "7f0e1289ec188e82" + }, + "truncated": 0, + "non_truncated": 112, + "padded": 448, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-management|5": { + "hashes": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "715fb569d7974854", + "hash_cont_tokens": "66b726b356a02feb" + }, + "truncated": 0, + "non_truncated": 103, + "padded": 412, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-marketing|5": { + "hashes": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "b806bfa21345fe52", + "hash_cont_tokens": "f08457005b652d25" + }, + "truncated": 0, + "non_truncated": 234, + "padded": 932, + "non_padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-medical_genetics|5": { + "hashes": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "a6776c4e0d94e35b", + "hash_cont_tokens": "ce26aac83e938006" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-miscellaneous|5": { + "hashes": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "abfe6a461b28c61e", + "hash_cont_tokens": "647bcbd68f292558" + }, + "truncated": 0, + "non_truncated": 783, + "padded": 3132, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_disputes|5": { + "hashes": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "9200dabf7e9b9381", + "hash_cont_tokens": "6849b7fe56c50dda" + }, + "truncated": 0, + "non_truncated": 346, + "padded": 1368, + "non_padded": 16, + "effective_few_shots": 5.0, + 
"num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hashes": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "f4faa207af1056b7", + "hash_cont_tokens": "81585ec455b1e3e5" + }, + "truncated": 0, + "non_truncated": 895, + "padded": 3580, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-nutrition|5": { + "hashes": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "5e66c4cf4bc67baf", + "hash_cont_tokens": "471b68eb20e5d34b" + }, + "truncated": 0, + "non_truncated": 306, + "padded": 1224, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-philosophy|5": { + "hashes": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "d3a50f2bd3b5ee5a", + "hash_cont_tokens": "6e39384b9c0a8cc2" + }, + "truncated": 0, + "non_truncated": 311, + "padded": 1244, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-prehistory|5": { + "hashes": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "8fc689b6555cec5f", + "hash_cont_tokens": "bfe513578190093f" + }, + "truncated": 0, + "non_truncated": 324, + "padded": 1296, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_accounting|5": { + "hashes": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "591835aff4e06a52", + "hash_cont_tokens": "9ce431b67350b312" + }, + "truncated": 0, + "non_truncated": 282, + "padded": 1128, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_law|5": { + "hashes": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "8704dadd4fe41dab", + "hash_cont_tokens": "0ff990d9cc38024d" + }, + "truncated": 168, + "non_truncated": 1366, + "padded": 5968, + "non_padded": 168, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_medicine|5": { + "hashes": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "94302bb70b3a5b6f", + "hash_cont_tokens": "bc3c70e15bc7dce0" + }, + "truncated": 0, + "non_truncated": 272, + "padded": 1088, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_psychology|5": { + "hashes": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "42664d142176093f", + "hash_cont_tokens": "58464ea26d81f908" + }, + "truncated": 0, + "non_truncated": 612, + "padded": 2448, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-public_relations|5": { + "hashes": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "09153181a50fd286", + "hash_cont_tokens": "eaf6a5d3ddd39a12" + }, + "truncated": 0, + "non_truncated": 110, + "padded": 440, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-security_studies|5": { + "hashes": { + "hash_examples": "62bb8197e63d60d4", + 
"hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "35a2d3d422ab5a6c", + "hash_cont_tokens": "618fd4f954253134" + }, + "truncated": 0, + "non_truncated": 245, + "padded": 980, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-sociology|5": { + "hashes": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "2d783f62b6a1e880", + "hash_cont_tokens": "b4962d9e583b12c0" + }, + "truncated": 0, + "non_truncated": 201, + "padded": 800, + "non_padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hashes": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "ac91f7d04c420c12", + "hash_cont_tokens": "ce26aac83e938006" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-virology|5": { + "hashes": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "e635cd0ac96a79ab", + "hash_cont_tokens": "397a75462a9735e3" + }, + "truncated": 0, + "non_truncated": 166, + "padded": 664, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-world_religions|5": { + "hashes": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "6f462544d7b17456", + "hash_cont_tokens": "de629d1414e01de8" + }, + "truncated": 0, + "non_truncated": 171, + "padded": 684, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|truthfulqa:mc|0": { + "hashes": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "227f58882e48effa", + "hash_cont_tokens": "df48bc66e06781f2" + }, + "truncated": 0, + "non_truncated": 817, + "padded": 9996, + "non_padded": 0, + "effective_few_shots": 0.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "1553528548a3ed49", + "hash_cont_tokens": "828897df1f4f08a1" + }, + "truncated": 0, + "non_truncated": 1267, + "padded": 2534, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "b1f8ec5d7ba1cfc5", + "hash_cont_tokens": "d0fde0aea5effbc3" + }, + "truncated": 0, + "non_truncated": 1319, + "padded": 0, + "non_padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "3b7fa57a057f9415", + "hash_full_prompts": "63615fc50fc9417c", + "hash_input_tokens": "01d7ee9ddcda2dca", + "hash_cont_tokens": "a10706ccf0020566" + }, + "truncated": 1644, + "non_truncated": 27015, + "padded": 111639, + "non_padded": 3233, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/itsliupeng/openllama-7b-icl/results_2023-12-09T17-48-05.024924.json b/eval-results/itsliupeng/openllama-7b-icl/results_2023-12-09T17-48-05.024924.json new file mode 100644 index 0000000000000000000000000000000000000000..5199111394ce78ced3737419cb99cde16d6159ca --- /dev/null +++ 
b/eval-results/itsliupeng/openllama-7b-icl/results_2023-12-09T17-48-05.024924.json @@ -0,0 +1,1409 @@ +{ + "config_general": { + "lighteval_sha": "0e4607eff593f6f842aeaa0e5fa6760f58b9d1e9", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "", + "start_time": 593038.861762763, + "end_time": 598952.97761519, + "total_evaluation_time_secondes": "5914.11585242697", + "model_name": "itsliupeng/openllama-7b-icl", + "model_sha": "d6317fed3b190cc4d4c27b9f27ccf7c77f0b2e3b", + "model_dtype": "torch.bfloat16", + "model_size": "12.58 GB" + }, + "results": { + "harness|arc:challenge|25": { + "acc": 0.44197952218430037, + "acc_stderr": 0.014512682523128345, + "acc_norm": 0.47952218430034127, + "acc_norm_stderr": 0.014599131353035007 + }, + "harness|hellaswag|10": { + "acc": 0.5676160127464649, + "acc_stderr": 0.0049439450696114546, + "acc_norm": 0.7703644692292372, + "acc_norm_stderr": 0.004197388626940065 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.29, + "acc_stderr": 0.045604802157206845, + "acc_norm": 0.29, + "acc_norm_stderr": 0.045604802157206845 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.4666666666666667, + "acc_stderr": 0.043097329010363554, + "acc_norm": 0.4666666666666667, + "acc_norm_stderr": 0.043097329010363554 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.46710526315789475, + "acc_stderr": 0.040601270352363966, + "acc_norm": 0.46710526315789475, + "acc_norm_stderr": 0.040601270352363966 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.46, + "acc_stderr": 0.05009082659620332, + "acc_norm": 0.46, + "acc_norm_stderr": 0.05009082659620332 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.4867924528301887, + "acc_stderr": 0.030762134874500482, + "acc_norm": 0.4867924528301887, + "acc_norm_stderr": 0.030762134874500482 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.4930555555555556, + "acc_stderr": 0.04180806750294938, + "acc_norm": 0.4930555555555556, + "acc_norm_stderr": 0.04180806750294938 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.27, + "acc_stderr": 0.044619604333847394, + "acc_norm": 0.27, + "acc_norm_stderr": 0.044619604333847394 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.41, + "acc_stderr": 0.049431107042371025, + "acc_norm": 0.41, + "acc_norm_stderr": 0.049431107042371025 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.28, + "acc_stderr": 0.04512608598542127, + "acc_norm": 0.28, + "acc_norm_stderr": 0.04512608598542127 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.47398843930635837, + "acc_stderr": 0.038073017265045105, + "acc_norm": 0.47398843930635837, + "acc_norm_stderr": 0.038073017265045105 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.22549019607843138, + "acc_stderr": 0.041583075330832865, + "acc_norm": 0.22549019607843138, + "acc_norm_stderr": 0.041583075330832865 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.63, + "acc_stderr": 0.048523658709390974, + "acc_norm": 0.63, + "acc_norm_stderr": 0.048523658709390974 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.34893617021276596, + "acc_stderr": 0.03115852213135778, + "acc_norm": 0.34893617021276596, + "acc_norm_stderr": 0.03115852213135778 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.22807017543859648, + "acc_stderr": 0.03947152782669415, + "acc_norm": 0.22807017543859648, + "acc_norm_stderr": 0.03947152782669415 + }, + 
"harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.5448275862068965, + "acc_stderr": 0.04149886942192118, + "acc_norm": 0.5448275862068965, + "acc_norm_stderr": 0.04149886942192118 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.2804232804232804, + "acc_stderr": 0.02313528797432563, + "acc_norm": 0.2804232804232804, + "acc_norm_stderr": 0.02313528797432563 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.1984126984126984, + "acc_stderr": 0.03567016675276864, + "acc_norm": 0.1984126984126984, + "acc_norm_stderr": 0.03567016675276864 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.3, + "acc_stderr": 0.046056618647183814, + "acc_norm": 0.3, + "acc_norm_stderr": 0.046056618647183814 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.45161290322580644, + "acc_stderr": 0.028310500348568392, + "acc_norm": 0.45161290322580644, + "acc_norm_stderr": 0.028310500348568392 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.33497536945812806, + "acc_stderr": 0.033208527423483104, + "acc_norm": 0.33497536945812806, + "acc_norm_stderr": 0.033208527423483104 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.47, + "acc_stderr": 0.05016135580465919, + "acc_norm": 0.47, + "acc_norm_stderr": 0.05016135580465919 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.509090909090909, + "acc_stderr": 0.0390369864774844, + "acc_norm": 0.509090909090909, + "acc_norm_stderr": 0.0390369864774844 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.5202020202020202, + "acc_stderr": 0.03559443565563918, + "acc_norm": 0.5202020202020202, + "acc_norm_stderr": 0.03559443565563918 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.616580310880829, + "acc_stderr": 0.03508984236295342, + "acc_norm": 0.616580310880829, + "acc_norm_stderr": 0.03508984236295342 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.4, + "acc_stderr": 0.024838811988033158, + "acc_norm": 0.4, + "acc_norm_stderr": 0.024838811988033158 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.2222222222222222, + "acc_stderr": 0.02534809746809784, + "acc_norm": 0.2222222222222222, + "acc_norm_stderr": 0.02534809746809784 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.3487394957983193, + "acc_stderr": 0.030956636328566548, + "acc_norm": 0.3487394957983193, + "acc_norm_stderr": 0.030956636328566548 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.3509933774834437, + "acc_stderr": 0.03896981964257375, + "acc_norm": 0.3509933774834437, + "acc_norm_stderr": 0.03896981964257375 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.6110091743119266, + "acc_stderr": 0.020902300887392866, + "acc_norm": 0.6110091743119266, + "acc_norm_stderr": 0.020902300887392866 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.24074074074074073, + "acc_stderr": 0.029157522184605607, + "acc_norm": 0.24074074074074073, + "acc_norm_stderr": 0.029157522184605607 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.5, + "acc_stderr": 0.03509312031717982, + "acc_norm": 0.5, + "acc_norm_stderr": 0.03509312031717982 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.6160337552742616, + "acc_stderr": 0.031658678064106674, + "acc_norm": 0.6160337552742616, + "acc_norm_stderr": 0.031658678064106674 + }, + "harness|hendrycksTest-human_aging|5": { + 
"acc": 0.5022421524663677, + "acc_stderr": 0.03355746535223265, + "acc_norm": 0.5022421524663677, + "acc_norm_stderr": 0.03355746535223265 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.48854961832061067, + "acc_stderr": 0.043841400240780176, + "acc_norm": 0.48854961832061067, + "acc_norm_stderr": 0.043841400240780176 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.5537190082644629, + "acc_stderr": 0.0453793517794788, + "acc_norm": 0.5537190082644629, + "acc_norm_stderr": 0.0453793517794788 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.5277777777777778, + "acc_stderr": 0.048262172941398944, + "acc_norm": 0.5277777777777778, + "acc_norm_stderr": 0.048262172941398944 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.5521472392638037, + "acc_stderr": 0.03906947479456605, + "acc_norm": 0.5521472392638037, + "acc_norm_stderr": 0.03906947479456605 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.32142857142857145, + "acc_stderr": 0.04432804055291519, + "acc_norm": 0.32142857142857145, + "acc_norm_stderr": 0.04432804055291519 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.5825242718446602, + "acc_stderr": 0.048828405482122375, + "acc_norm": 0.5825242718446602, + "acc_norm_stderr": 0.048828405482122375 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.6239316239316239, + "acc_stderr": 0.03173393632969481, + "acc_norm": 0.6239316239316239, + "acc_norm_stderr": 0.03173393632969481 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.42, + "acc_stderr": 0.049604496374885836, + "acc_norm": 0.42, + "acc_norm_stderr": 0.049604496374885836 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.5977011494252874, + "acc_stderr": 0.01753529452906895, + "acc_norm": 0.5977011494252874, + "acc_norm_stderr": 0.01753529452906895 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.4682080924855491, + "acc_stderr": 0.026864624366756653, + "acc_norm": 0.4682080924855491, + "acc_norm_stderr": 0.026864624366756653 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.23910614525139665, + "acc_stderr": 0.014265554192331161, + "acc_norm": 0.23910614525139665, + "acc_norm_stderr": 0.014265554192331161 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.48366013071895425, + "acc_stderr": 0.028614624752805413, + "acc_norm": 0.48366013071895425, + "acc_norm_stderr": 0.028614624752805413 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.49517684887459806, + "acc_stderr": 0.02839677044411129, + "acc_norm": 0.49517684887459806, + "acc_norm_stderr": 0.02839677044411129 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.5030864197530864, + "acc_stderr": 0.02782021415859437, + "acc_norm": 0.5030864197530864, + "acc_norm_stderr": 0.02782021415859437 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.35815602836879434, + "acc_stderr": 0.02860208586275942, + "acc_norm": 0.35815602836879434, + "acc_norm_stderr": 0.02860208586275942 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.3474576271186441, + "acc_stderr": 0.012161417729749798, + "acc_norm": 0.3474576271186441, + "acc_norm_stderr": 0.012161417729749798 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.41544117647058826, + "acc_stderr": 0.029935342707877753, + "acc_norm": 0.41544117647058826, + "acc_norm_stderr": 0.029935342707877753 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.42320261437908496, + "acc_stderr": 0.01998780976948207, + "acc_norm": 
0.42320261437908496, + "acc_norm_stderr": 0.01998780976948207 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.5909090909090909, + "acc_stderr": 0.04709306978661896, + "acc_norm": 0.5909090909090909, + "acc_norm_stderr": 0.04709306978661896 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.4816326530612245, + "acc_stderr": 0.03198761546763126, + "acc_norm": 0.4816326530612245, + "acc_norm_stderr": 0.03198761546763126 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.5472636815920398, + "acc_stderr": 0.035197027175769155, + "acc_norm": 0.5472636815920398, + "acc_norm_stderr": 0.035197027175769155 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.63, + "acc_stderr": 0.04852365870939099, + "acc_norm": 0.63, + "acc_norm_stderr": 0.04852365870939099 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.4397590361445783, + "acc_stderr": 0.03864139923699121, + "acc_norm": 0.4397590361445783, + "acc_norm_stderr": 0.03864139923699121 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.6491228070175439, + "acc_stderr": 0.03660298834049163, + "acc_norm": 0.6491228070175439, + "acc_norm_stderr": 0.03660298834049163 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.23745410036719705, + "mc1_stderr": 0.014896277441041836, + "mc2": 0.3706359177223847, + "mc2_stderr": 0.01391522805511699 + }, + "harness|winogrande|5": { + "acc": 0.7016574585635359, + "acc_stderr": 0.012858885010030421 + }, + "harness|gsm8k|5": { + "acc": 0.10993176648976498, + "acc_stderr": 0.008616195587865418 + }, + "all": { + "acc": 0.44441569324312047, + "acc_stderr": 0.03430171403503658, + "acc_norm": 0.4497976132436587, + "acc_norm_stderr": 0.035089311320789345, + "mc1": 0.23745410036719705, + "mc1_stderr": 0.014896277441041836, + "mc2": 0.3706359177223847, + "mc2_stderr": 0.01391522805511699 + } + }, + "versions": { + "all": 0, + "harness|arc:challenge|25": 0, + "harness|gsm8k|5": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + 
"harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "harness|winogrande|5": 0 + }, + "config_tasks": { + "harness|arc:challenge": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness 
task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|arc:challenge|25": { + "hashes": { + "hash_examples": "17b0cae357c0259e", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "041985237c885d54", + "hash_cont_tokens": "2e8835aa03b9c2cf" + }, + "truncated": 0, + "non_truncated": 1172, + "padded": 4676, + "non_padded": 11, + "effective_few_shots": 25.0, + "num_truncated_few_shots": 0 + }, + "harness|hellaswag|10": { + "hashes": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "21d693dbbc77cc0b", + "hash_cont_tokens": "18a48de3edcef462" + }, + "truncated": 0, + "non_truncated": 10042, + "padded": 39987, + "non_padded": 181, + "effective_few_shots": 10.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hashes": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "001ebf934ceb5cfc", + "hash_cont_tokens": "ce26aac83e938006" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-anatomy|5": { + "hashes": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "d6e588681ccff46f", + "hash_cont_tokens": "1d81fa80e3039a08" + }, + "truncated": 0, + "non_truncated": 135, + "padded": 540, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-astronomy|5": { + "hashes": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "f3e9971a61ea9d00", + "hash_cont_tokens": "247dc44c6b578728" + }, + "truncated": 0, + "non_truncated": 152, + "padded": 608, + "non_padded": 0, + "effective_few_shots": 5.0, + 
"num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-business_ethics|5": { + "hashes": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "ae5001e99fd680cc", + "hash_cont_tokens": "ce26aac83e938006" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hashes": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "1ce6abe4e260f248", + "hash_cont_tokens": "26e3b69d5fb27bb2" + }, + "truncated": 0, + "non_truncated": 265, + "padded": 1060, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_biology|5": { + "hashes": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "f3b3a09b96723203", + "hash_cont_tokens": "bbda31842f3930d5" + }, + "truncated": 0, + "non_truncated": 144, + "padded": 568, + "non_padded": 8, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_chemistry|5": { + "hashes": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "cf267ebed654ee2a", + "hash_cont_tokens": "ce26aac83e938006" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_computer_science|5": { + "hashes": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "63a2fce9e8a6c4ce", + "hash_cont_tokens": "ce26aac83e938006" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_mathematics|5": { + "hashes": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "bbefbc4ba4160e4d", + "hash_cont_tokens": "ce26aac83e938006" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_medicine|5": { + "hashes": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "ba7af09ead7df486", + "hash_cont_tokens": "894854ed7bec57f7" + }, + "truncated": 0, + "non_truncated": 173, + "padded": 688, + "non_padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_physics|5": { + "hashes": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "2f5303106dc26337", + "hash_cont_tokens": "13130ec6de384bbb" + }, + "truncated": 0, + "non_truncated": 102, + "padded": 408, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-computer_security|5": { + "hashes": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "151396dced83ec83", + "hash_cont_tokens": "ce26aac83e938006" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hashes": { + "hash_examples": "8874ece872d2ca4c", + 
"hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "e623a4a7f37c694c", + "hash_cont_tokens": "29089b8b7020611e" + }, + "truncated": 0, + "non_truncated": 235, + "padded": 940, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-econometrics|5": { + "hashes": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "406b5663640b79e3", + "hash_cont_tokens": "efc596dfa1a1f073" + }, + "truncated": 0, + "non_truncated": 114, + "padded": 456, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hashes": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "082aaa5548e614aa", + "hash_cont_tokens": "70817a7ac9f44af2" + }, + "truncated": 0, + "non_truncated": 145, + "padded": 560, + "non_padded": 20, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hashes": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "3e411365794de91f", + "hash_cont_tokens": "937cd53d06cc6e16" + }, + "truncated": 0, + "non_truncated": 378, + "padded": 1512, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-formal_logic|5": { + "hashes": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "8620b3e66c880f2b", + "hash_cont_tokens": "eec972abe0fc0f5a" + }, + "truncated": 0, + "non_truncated": 126, + "padded": 504, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-global_facts|5": { + "hashes": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "0a9061121f4cfd2e", + "hash_cont_tokens": "ce26aac83e938006" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_biology|5": { + "hashes": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "4cd23dba5ca30403", + "hash_cont_tokens": "94971ccfe8e59c25" + }, + "truncated": 0, + "non_truncated": 310, + "padded": 1240, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hashes": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "83cfc43951998dbd", + "hash_cont_tokens": "a78e38b59778a04c" + }, + "truncated": 0, + "non_truncated": 203, + "padded": 812, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hashes": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "ca1be9098cb9250d", + "hash_cont_tokens": "ce26aac83e938006" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hashes": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "142b719c7d7d4fe0", + "hash_cont_tokens": "91dc522e4e4e91c3" + 
}, + "truncated": 660, + "non_truncated": -495, + "padded": 0, + "non_padded": 660, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_geography|5": { + "hashes": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "2da66fb3a3c8439a", + "hash_cont_tokens": "f275c901b3d285f9" + }, + "truncated": 0, + "non_truncated": 198, + "padded": 792, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hashes": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "cd855470d1b7aafc", + "hash_cont_tokens": "85eb58f423437cce" + }, + "truncated": 0, + "non_truncated": 193, + "padded": 770, + "non_padded": 2, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hashes": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "0172a82e3f172856", + "hash_cont_tokens": "39a93706184f896b" + }, + "truncated": 0, + "non_truncated": 390, + "padded": 1560, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hashes": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "9c0fb5b04cbaf714", + "hash_cont_tokens": "d41065d20b689af3" + }, + "truncated": 0, + "non_truncated": 270, + "padded": 1080, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hashes": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "40c8cb775f0d9b69", + "hash_cont_tokens": "28c1f7c11bf85409" + }, + "truncated": 0, + "non_truncated": 238, + "padded": 952, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_physics|5": { + "hashes": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "da586570f7046bae", + "hash_cont_tokens": "78c510e6c5d316ac" + }, + "truncated": 0, + "non_truncated": 151, + "padded": 604, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hashes": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "26e52e1bb05450ab", + "hash_cont_tokens": "0ba4ecffc67603c5" + }, + "truncated": 0, + "non_truncated": 545, + "padded": 2180, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hashes": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "c46477e30932b082", + "hash_cont_tokens": "4a0339e9ad3efa6d" + }, + "truncated": 0, + "non_truncated": 216, + "padded": 864, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hashes": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "a54112084a848a44", + "hash_cont_tokens": "2529d55ec490f81f" + }, + "truncated": 816, + "non_truncated": -612, + "padded": 0, + "non_padded": 816, 
+ "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hashes": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "2f52872fc891c1b5", + "hash_cont_tokens": "21808b54f5df97b2" + }, + "truncated": 0, + "non_truncated": 237, + "padded": 948, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_aging|5": { + "hashes": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "7cf58bc03f52313c", + "hash_cont_tokens": "92acdd467ed943e1" + }, + "truncated": 0, + "non_truncated": 223, + "padded": 892, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_sexuality|5": { + "hashes": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "687501ddb0f34488", + "hash_cont_tokens": "a6034ed95a124315" + }, + "truncated": 0, + "non_truncated": 131, + "padded": 524, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-international_law|5": { + "hashes": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "cf8eea3d44d90bb4", + "hash_cont_tokens": "223fbf3fd106c04b" + }, + "truncated": 0, + "non_truncated": 121, + "padded": 484, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-jurisprudence|5": { + "hashes": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "0d6405105760514c", + "hash_cont_tokens": "7c8e30f486ff156a" + }, + "truncated": 0, + "non_truncated": 108, + "padded": 428, + "non_padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hashes": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "16c5482fad1409bc", + "hash_cont_tokens": "b4cc4a8d31bbaa03" + }, + "truncated": 0, + "non_truncated": 163, + "padded": 636, + "non_padded": 16, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-machine_learning|5": { + "hashes": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "85014eabaa9b54b8", + "hash_cont_tokens": "7f0e1289ec188e82" + }, + "truncated": 0, + "non_truncated": 112, + "padded": 448, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-management|5": { + "hashes": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "715fb569d7974854", + "hash_cont_tokens": "66b726b356a02feb" + }, + "truncated": 0, + "non_truncated": 103, + "padded": 412, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-marketing|5": { + "hashes": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "b806bfa21345fe52", + "hash_cont_tokens": "f08457005b652d25" + }, + "truncated": 0, + "non_truncated": 234, + "padded": 932, + "non_padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-medical_genetics|5": { + "hashes": { + "hash_examples": "9c2dda34a2ea4fd2", + 
"hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "a6776c4e0d94e35b", + "hash_cont_tokens": "ce26aac83e938006" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-miscellaneous|5": { + "hashes": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "abfe6a461b28c61e", + "hash_cont_tokens": "647bcbd68f292558" + }, + "truncated": 0, + "non_truncated": 783, + "padded": 3132, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_disputes|5": { + "hashes": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "9200dabf7e9b9381", + "hash_cont_tokens": "6849b7fe56c50dda" + }, + "truncated": 0, + "non_truncated": 346, + "padded": 1368, + "non_padded": 16, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hashes": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "f4faa207af1056b7", + "hash_cont_tokens": "81585ec455b1e3e5" + }, + "truncated": 0, + "non_truncated": 895, + "padded": 3580, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-nutrition|5": { + "hashes": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "5e66c4cf4bc67baf", + "hash_cont_tokens": "471b68eb20e5d34b" + }, + "truncated": 0, + "non_truncated": 306, + "padded": 1224, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-philosophy|5": { + "hashes": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "d3a50f2bd3b5ee5a", + "hash_cont_tokens": "6e39384b9c0a8cc2" + }, + "truncated": 0, + "non_truncated": 311, + "padded": 1244, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-prehistory|5": { + "hashes": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "8fc689b6555cec5f", + "hash_cont_tokens": "bfe513578190093f" + }, + "truncated": 0, + "non_truncated": 324, + "padded": 1296, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_accounting|5": { + "hashes": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "591835aff4e06a52", + "hash_cont_tokens": "9ce431b67350b312" + }, + "truncated": 0, + "non_truncated": 282, + "padded": 1128, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_law|5": { + "hashes": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "8704dadd4fe41dab", + "hash_cont_tokens": "0ff990d9cc38024d" + }, + "truncated": 168, + "non_truncated": 1366, + "padded": 5968, + "non_padded": 168, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_medicine|5": { + "hashes": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "94302bb70b3a5b6f", + "hash_cont_tokens": "bc3c70e15bc7dce0" + }, + "truncated": 0, + 
"non_truncated": 272, + "padded": 1088, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_psychology|5": { + "hashes": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "42664d142176093f", + "hash_cont_tokens": "58464ea26d81f908" + }, + "truncated": 0, + "non_truncated": 612, + "padded": 2448, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-public_relations|5": { + "hashes": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "09153181a50fd286", + "hash_cont_tokens": "eaf6a5d3ddd39a12" + }, + "truncated": 0, + "non_truncated": 110, + "padded": 440, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-security_studies|5": { + "hashes": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "35a2d3d422ab5a6c", + "hash_cont_tokens": "618fd4f954253134" + }, + "truncated": 0, + "non_truncated": 245, + "padded": 980, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-sociology|5": { + "hashes": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "2d783f62b6a1e880", + "hash_cont_tokens": "b4962d9e583b12c0" + }, + "truncated": 0, + "non_truncated": 201, + "padded": 800, + "non_padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hashes": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "ac91f7d04c420c12", + "hash_cont_tokens": "ce26aac83e938006" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-virology|5": { + "hashes": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "e635cd0ac96a79ab", + "hash_cont_tokens": "397a75462a9735e3" + }, + "truncated": 0, + "non_truncated": 166, + "padded": 664, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-world_religions|5": { + "hashes": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "6f462544d7b17456", + "hash_cont_tokens": "de629d1414e01de8" + }, + "truncated": 0, + "non_truncated": 171, + "padded": 684, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|truthfulqa:mc|0": { + "hashes": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "227f58882e48effa", + "hash_cont_tokens": "df48bc66e06781f2" + }, + "truncated": 0, + "non_truncated": 817, + "padded": 9996, + "non_padded": 0, + "effective_few_shots": 0.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "1553528548a3ed49", + "hash_cont_tokens": "828897df1f4f08a1" + }, + "truncated": 0, + "non_truncated": 1267, + "padded": 2534, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + 
"hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "b1f8ec5d7ba1cfc5", + "hash_cont_tokens": "baf84b6a523fcd81" + }, + "truncated": 0, + "non_truncated": 1319, + "padded": 0, + "non_padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "3b7fa57a057f9415", + "hash_full_prompts": "63615fc50fc9417c", + "hash_input_tokens": "01d7ee9ddcda2dca", + "hash_cont_tokens": "e3736d374f7c892d" + }, + "truncated": 1644, + "non_truncated": 27015, + "padded": 111639, + "non_padded": 3233, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/jarradh/llama2_70b_chat_uncensored/results_2023-08-09T17-41-26.455015.json b/eval-results/jarradh/llama2_70b_chat_uncensored/results_2023-08-09T17-41-26.455015.json new file mode 100644 index 0000000000000000000000000000000000000000..4e841518767d7786a1e514c92fe98bfc440c61fe --- /dev/null +++ b/eval-results/jarradh/llama2_70b_chat_uncensored/results_2023-08-09T17-41-26.455015.json @@ -0,0 +1,1365 @@ +{ + "results": { + "harness|arc:challenge|25": { + "acc": 0.6424914675767918, + "acc_stderr": 0.014005494275916576, + "acc_norm": 0.6843003412969283, + "acc_norm_stderr": 0.01358257109581529 + }, + "harness|hellaswag|10": { + "acc": 0.6794463254331806, + "acc_stderr": 0.004657356402226453, + "acc_norm": 0.8676558454491137, + "acc_norm_stderr": 0.0033817200071652002 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.33, + "acc_stderr": 0.04725815626252606, + "acc_norm": 0.33, + "acc_norm_stderr": 0.04725815626252606 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.6222222222222222, + "acc_stderr": 0.04188307537595853, + "acc_norm": 0.6222222222222222, + "acc_norm_stderr": 0.04188307537595853 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.8026315789473685, + "acc_stderr": 0.03238981601699397, + "acc_norm": 0.8026315789473685, + "acc_norm_stderr": 0.03238981601699397 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.73, + "acc_stderr": 0.0446196043338474, + "acc_norm": 0.73, + "acc_norm_stderr": 0.0446196043338474 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.7245283018867924, + "acc_stderr": 0.027495663683724053, + "acc_norm": 0.7245283018867924, + "acc_norm_stderr": 0.027495663683724053 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.7986111111111112, + "acc_stderr": 0.033536474697138406, + "acc_norm": 0.7986111111111112, + "acc_norm_stderr": 0.033536474697138406 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.44, + "acc_stderr": 0.049888765156985884, + "acc_norm": 0.44, + "acc_norm_stderr": 0.049888765156985884 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.56, + "acc_stderr": 0.04988876515698589, + "acc_norm": 0.56, + "acc_norm_stderr": 0.04988876515698589 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.43, + "acc_stderr": 0.049756985195624284, + "acc_norm": 0.43, + "acc_norm_stderr": 0.049756985195624284 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.6416184971098265, + "acc_stderr": 0.036563436533531585, + "acc_norm": 0.6416184971098265, + "acc_norm_stderr": 0.036563436533531585 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.35294117647058826, + "acc_stderr": 0.04755129616062946, + "acc_norm": 0.35294117647058826, + "acc_norm_stderr": 0.04755129616062946 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.8, + "acc_stderr": 0.04020151261036846, + 
"acc_norm": 0.8, + "acc_norm_stderr": 0.04020151261036846 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.6127659574468085, + "acc_stderr": 0.03184389265339526, + "acc_norm": 0.6127659574468085, + "acc_norm_stderr": 0.03184389265339526 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.43859649122807015, + "acc_stderr": 0.04668000738510455, + "acc_norm": 0.43859649122807015, + "acc_norm_stderr": 0.04668000738510455 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.6, + "acc_stderr": 0.040824829046386284, + "acc_norm": 0.6, + "acc_norm_stderr": 0.040824829046386284 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.4444444444444444, + "acc_stderr": 0.02559185776138218, + "acc_norm": 0.4444444444444444, + "acc_norm_stderr": 0.02559185776138218 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.4523809523809524, + "acc_stderr": 0.044518079590553275, + "acc_norm": 0.4523809523809524, + "acc_norm_stderr": 0.044518079590553275 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.47, + "acc_stderr": 0.05016135580465919, + "acc_norm": 0.47, + "acc_norm_stderr": 0.05016135580465919 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.8161290322580645, + "acc_stderr": 0.022037217340267822, + "acc_norm": 0.8161290322580645, + "acc_norm_stderr": 0.022037217340267822 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.5270935960591133, + "acc_stderr": 0.03512819077876106, + "acc_norm": 0.5270935960591133, + "acc_norm_stderr": 0.03512819077876106 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.73, + "acc_stderr": 0.044619604333847394, + "acc_norm": 0.73, + "acc_norm_stderr": 0.044619604333847394 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.8484848484848485, + "acc_stderr": 0.027998073798781675, + "acc_norm": 0.8484848484848485, + "acc_norm_stderr": 0.027998073798781675 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.8535353535353535, + "acc_stderr": 0.025190921114603918, + "acc_norm": 0.8535353535353535, + "acc_norm_stderr": 0.025190921114603918 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.927461139896373, + "acc_stderr": 0.018718998520678178, + "acc_norm": 0.927461139896373, + "acc_norm_stderr": 0.018718998520678178 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.7102564102564103, + "acc_stderr": 0.02300062824368797, + "acc_norm": 0.7102564102564103, + "acc_norm_stderr": 0.02300062824368797 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.31851851851851853, + "acc_stderr": 0.02840653309060846, + "acc_norm": 0.31851851851851853, + "acc_norm_stderr": 0.02840653309060846 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.773109243697479, + "acc_stderr": 0.027205371538279472, + "acc_norm": 0.773109243697479, + "acc_norm_stderr": 0.027205371538279472 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.4304635761589404, + "acc_stderr": 0.04042809961395634, + "acc_norm": 0.4304635761589404, + "acc_norm_stderr": 0.04042809961395634 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.8844036697247707, + "acc_stderr": 0.01370874953417264, + "acc_norm": 0.8844036697247707, + "acc_norm_stderr": 0.01370874953417264 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.5925925925925926, + "acc_stderr": 0.03350991604696044, + "acc_norm": 0.5925925925925926, + 
"acc_norm_stderr": 0.03350991604696044 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.9117647058823529, + "acc_stderr": 0.019907399791316945, + "acc_norm": 0.9117647058823529, + "acc_norm_stderr": 0.019907399791316945 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.8734177215189873, + "acc_stderr": 0.021644195727955173, + "acc_norm": 0.8734177215189873, + "acc_norm_stderr": 0.021644195727955173 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.7802690582959642, + "acc_stderr": 0.02779017706438359, + "acc_norm": 0.7802690582959642, + "acc_norm_stderr": 0.02779017706438359 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.8549618320610687, + "acc_stderr": 0.030884661089515375, + "acc_norm": 0.8549618320610687, + "acc_norm_stderr": 0.030884661089515375 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.859504132231405, + "acc_stderr": 0.03172233426002158, + "acc_norm": 0.859504132231405, + "acc_norm_stderr": 0.03172233426002158 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.8055555555555556, + "acc_stderr": 0.038260763248848646, + "acc_norm": 0.8055555555555556, + "acc_norm_stderr": 0.038260763248848646 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.8159509202453987, + "acc_stderr": 0.03044677768797173, + "acc_norm": 0.8159509202453987, + "acc_norm_stderr": 0.03044677768797173 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.5535714285714286, + "acc_stderr": 0.04718471485219588, + "acc_norm": 0.5535714285714286, + "acc_norm_stderr": 0.04718471485219588 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.8155339805825242, + "acc_stderr": 0.03840423627288276, + "acc_norm": 0.8155339805825242, + "acc_norm_stderr": 0.03840423627288276 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.9017094017094017, + "acc_stderr": 0.019503444900757567, + "acc_norm": 0.9017094017094017, + "acc_norm_stderr": 0.019503444900757567 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.7, + "acc_stderr": 0.046056618647183814, + "acc_norm": 0.7, + "acc_norm_stderr": 0.046056618647183814 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.8467432950191571, + "acc_stderr": 0.012881968968303271, + "acc_norm": 0.8467432950191571, + "acc_norm_stderr": 0.012881968968303271 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.7716763005780347, + "acc_stderr": 0.022598703804321635, + "acc_norm": 0.7716763005780347, + "acc_norm_stderr": 0.022598703804321635 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.4134078212290503, + "acc_stderr": 0.016469814928406164, + "acc_norm": 0.4134078212290503, + "acc_norm_stderr": 0.016469814928406164 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.7516339869281046, + "acc_stderr": 0.02473998135511359, + "acc_norm": 0.7516339869281046, + "acc_norm_stderr": 0.02473998135511359 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.7813504823151125, + "acc_stderr": 0.023475581417861113, + "acc_norm": 0.7813504823151125, + "acc_norm_stderr": 0.023475581417861113 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.8117283950617284, + "acc_stderr": 0.021751866060815882, + "acc_norm": 0.8117283950617284, + "acc_norm_stderr": 0.021751866060815882 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.5319148936170213, + "acc_stderr": 0.02976667507587387, + "acc_norm": 0.5319148936170213, + "acc_norm_stderr": 0.02976667507587387 + }, + "harness|hendrycksTest-professional_law|5": { + 
"acc": 0.560625814863103, + "acc_stderr": 0.012676014778580214, + "acc_norm": 0.560625814863103, + "acc_norm_stderr": 0.012676014778580214 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.7205882352941176, + "acc_stderr": 0.027257202606114948, + "acc_norm": 0.7205882352941176, + "acc_norm_stderr": 0.027257202606114948 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.7516339869281046, + "acc_stderr": 0.017479487001364764, + "acc_norm": 0.7516339869281046, + "acc_norm_stderr": 0.017479487001364764 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.7363636363636363, + "acc_stderr": 0.04220224692971987, + "acc_norm": 0.7363636363636363, + "acc_norm_stderr": 0.04220224692971987 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.7959183673469388, + "acc_stderr": 0.0258012834750905, + "acc_norm": 0.7959183673469388, + "acc_norm_stderr": 0.0258012834750905 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.8557213930348259, + "acc_stderr": 0.02484575321230604, + "acc_norm": 0.8557213930348259, + "acc_norm_stderr": 0.02484575321230604 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.91, + "acc_stderr": 0.02876234912646613, + "acc_norm": 0.91, + "acc_norm_stderr": 0.02876234912646613 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.5542168674698795, + "acc_stderr": 0.03869543323472101, + "acc_norm": 0.5542168674698795, + "acc_norm_stderr": 0.03869543323472101 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.8654970760233918, + "acc_stderr": 0.026168221344662297, + "acc_norm": 0.8654970760233918, + "acc_norm_stderr": 0.026168221344662297 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.3708690330477356, + "mc1_stderr": 0.016909693580248828, + "mc2": 0.5250068580538624, + "mc2_stderr": 0.014948189232416583 + }, + "all": { + "acc": 0.6867116236638459, + "acc_stderr": 0.031299095507140634, + "acc_norm": 0.6906102405068301, + "acc_norm_stderr": 0.031270306361798894, + "mc1": 0.3708690330477356, + "mc1_stderr": 0.016909693580248828, + "mc2": 0.5250068580538624, + "mc2_stderr": 0.014948189232416583 + } + }, + "versions": { + "harness|arc:challenge|25": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 
1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "all": 0 + }, + "config_general": { + "model_name": "jarradh/llama2_70b_chat_uncensored", + "model_sha": "34b23982a9a996adc8f45c4c2eac7245c4e251b3", + "model_dtype": "torch.float16", + "lighteval_sha": "da839e70121267a9bf55a0fbea4fb2fae2948337", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null + }, + "config_tasks": { + "harness|arc:challenge": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + 
"harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task" + }, + "summary_tasks": { + "harness|arc:challenge|25": { + "hashes": { + "hash_examples": "17b0cae357c0259e", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "61571bf68d6d89aa", + "hash_cont_tokens": "ede2b335438f08e9" + }, + "truncated": 0, + "non-truncated": 4687, + "padded": 4687, + "non-padded": 0, + "effective_few_shots": 25.0, + "num_truncated_few_shots": 0 + }, + "harness|hellaswag|10": { + "hashes": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "29906669b1c7054a", + "hash_cont_tokens": "b41cf1ad182d68d5" + }, + "truncated": 0, + "non-truncated": 40168, + "padded": 40113, + "non-padded": 55, + "effective_few_shots": 10.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hashes": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "c54ff61ad0273dd7", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-anatomy|5": { + "hashes": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "be31a1e22aef5f90", + "hash_cont_tokens": 
"f11971a765cb609f" + }, + "truncated": 0, + "non-truncated": 540, + "padded": 540, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-astronomy|5": { + "hashes": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "277a7b1fad566940", + "hash_cont_tokens": "238bd86950544b29" + }, + "truncated": 0, + "non-truncated": 608, + "padded": 608, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-business_ethics|5": { + "hashes": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "ba552605bc116de5", + "hash_cont_tokens": "f9d6d2a7d7e9a041" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hashes": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "428c7563d0b98ab9", + "hash_cont_tokens": "6af58623d0d5fbcd" + }, + "truncated": 0, + "non-truncated": 1060, + "padded": 1060, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_biology|5": { + "hashes": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "da036601573942e2", + "hash_cont_tokens": "875cde3af7a0ee14" + }, + "truncated": 0, + "non-truncated": 576, + "padded": 576, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_chemistry|5": { + "hashes": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "94e0196d6aded13d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_computer_science|5": { + "hashes": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "6e4d0f4a8d36690b", + "hash_cont_tokens": "1ba0c71186b1505e" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_mathematics|5": { + "hashes": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "614054d17109a25d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_medicine|5": { + "hashes": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "1d633b3cc0524ba8", + "hash_cont_tokens": "702fb6d82ff0d6ac" + }, + "truncated": 0, + "non-truncated": 692, + "padded": 692, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_physics|5": { + "hashes": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "5421d9a1af86cbd4", + "hash_cont_tokens": "f7b8097afc16a47c" + }, + "truncated": 0, + "non-truncated": 408, + "padded": 408, + "non-padded": 0, + "effective_few_shots": 5.0, + 
"num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-computer_security|5": { + "hashes": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "5e6b70ecb333cf18", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hashes": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "c2ef11a87264ceed", + "hash_cont_tokens": "aa0e8bc655f2f641" + }, + "truncated": 0, + "non-truncated": 940, + "padded": 940, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-econometrics|5": { + "hashes": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "ecaccd912a4c3978", + "hash_cont_tokens": "a9b1f761089f6acc" + }, + "truncated": 0, + "non-truncated": 456, + "padded": 456, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hashes": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "1590c84291399be8", + "hash_cont_tokens": "2425a3f084a591ef" + }, + "truncated": 0, + "non-truncated": 580, + "padded": 580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hashes": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "3269597f715b0da1", + "hash_cont_tokens": "eb2d5002052b5bc5" + }, + "truncated": 0, + "non-truncated": 1512, + "padded": 1512, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-formal_logic|5": { + "hashes": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "a2800d20f3ab8d7c", + "hash_cont_tokens": "9b30dc19c9b62f60" + }, + "truncated": 0, + "non-truncated": 504, + "padded": 504, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-global_facts|5": { + "hashes": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "94ed44b3772505ad", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_biology|5": { + "hashes": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "24423acb928db768", + "hash_cont_tokens": "74217a4e2868536f" + }, + "truncated": 0, + "non-truncated": 1240, + "padded": 1240, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hashes": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "831ff35c474e5cef", + "hash_cont_tokens": "bf39544be0ebf000" + }, + "truncated": 0, + "non-truncated": 812, + "padded": 812, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hashes": { + "hash_examples": "8b8cdb1084f24169", 
+ "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "8c34e0f2bda77358", + "hash_cont_tokens": "43570b3948564b64" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hashes": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "f1f73dd687da18d7", + "hash_cont_tokens": "674fc454bdc5ac93" + }, + "truncated": 660, + "non-truncated": 0, + "padded": 0, + "non-padded": 660, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_geography|5": { + "hashes": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "7c5547c7da5bc793", + "hash_cont_tokens": "03a5012b916274ea" + }, + "truncated": 0, + "non-truncated": 792, + "padded": 792, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hashes": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "f62991cb6a496b05", + "hash_cont_tokens": "50ab225c2f535210" + }, + "truncated": 0, + "non-truncated": 772, + "padded": 772, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hashes": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "4cef2aff6e3d59ed", + "hash_cont_tokens": "c583432ad27fcfe0" + }, + "truncated": 0, + "non-truncated": 1560, + "padded": 1560, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hashes": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "6e2577ea4082ed2b", + "hash_cont_tokens": "1194078d4e38c984" + }, + "truncated": 0, + "non-truncated": 1080, + "padded": 1080, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hashes": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "c5fc9aeb1079c8e4", + "hash_cont_tokens": "f47f041de50333b9" + }, + "truncated": 0, + "non-truncated": 952, + "padded": 952, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_physics|5": { + "hashes": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "555fc385cffa84ca", + "hash_cont_tokens": "6296151cf7fee15c" + }, + "truncated": 0, + "non-truncated": 604, + "padded": 604, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hashes": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "febd23cbf9973b7f", + "hash_cont_tokens": "a490d3db0ea5935a" + }, + "truncated": 0, + "non-truncated": 2180, + "padded": 2180, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hashes": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": 
"424b02981230ee83", + "hash_cont_tokens": "6830ef7d0325d7ef" + }, + "truncated": 0, + "non-truncated": 864, + "padded": 864, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hashes": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "50c9ff438c85a69e", + "hash_cont_tokens": "cdd0b3dc06d933e5" + }, + "truncated": 816, + "non-truncated": 0, + "padded": 0, + "non-padded": 816, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hashes": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "054824cc474caef5", + "hash_cont_tokens": "e0203e3fc1bb0500" + }, + "truncated": 8, + "non-truncated": 940, + "padded": 940, + "non-padded": 8, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_aging|5": { + "hashes": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "541a75f071dcf579", + "hash_cont_tokens": "142a4a8a1138a214" + }, + "truncated": 0, + "non-truncated": 892, + "padded": 892, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_sexuality|5": { + "hashes": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "04269e5c5a257dd9", + "hash_cont_tokens": "bc54813e809b796d" + }, + "truncated": 0, + "non-truncated": 524, + "padded": 524, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-international_law|5": { + "hashes": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "d93ba9d9d38e4397", + "hash_cont_tokens": "63435df622d5437b" + }, + "truncated": 0, + "non-truncated": 484, + "padded": 484, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-jurisprudence|5": { + "hashes": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "9eeaccd2698b4f5a", + "hash_cont_tokens": "e3a8cd951b6e3469" + }, + "truncated": 0, + "non-truncated": 432, + "padded": 432, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hashes": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "b4f08f544f2b7576", + "hash_cont_tokens": "5e6ee2ff0404f23c" + }, + "truncated": 0, + "non-truncated": 652, + "padded": 648, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-machine_learning|5": { + "hashes": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "900c2a51f1174b9f", + "hash_cont_tokens": "c81919424db3b267" + }, + "truncated": 0, + "non-truncated": 448, + "padded": 448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-management|5": { + "hashes": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "6b36efb4689c6eca", + "hash_cont_tokens": "a01d6d39a83c4597" + }, + "truncated": 0, + "non-truncated": 412, + "padded": 412, + "non-padded": 0, + 
"effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-marketing|5": { + "hashes": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "2aaac78a0cfed47a", + "hash_cont_tokens": "6aeaed4d823c98aa" + }, + "truncated": 0, + "non-truncated": 936, + "padded": 936, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-medical_genetics|5": { + "hashes": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "886ca823b41c094a", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-miscellaneous|5": { + "hashes": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "72fd71de7675e7d0", + "hash_cont_tokens": "9b0ab02a64603081" + }, + "truncated": 0, + "non-truncated": 3132, + "padded": 3132, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_disputes|5": { + "hashes": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "f3ca0dd8e7a1eb09", + "hash_cont_tokens": "3b8bbe9108e55ce9" + }, + "truncated": 0, + "non-truncated": 1384, + "padded": 1354, + "non-padded": 30, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hashes": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "3e793631e951f23c", + "hash_cont_tokens": "2eae753a177d5460" + }, + "truncated": 0, + "non-truncated": 3580, + "padded": 3580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-nutrition|5": { + "hashes": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "59753c2144ea93af", + "hash_cont_tokens": "29771089bd3c65c6" + }, + "truncated": 0, + "non-truncated": 1224, + "padded": 1224, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-philosophy|5": { + "hashes": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "bd8d3dbed15a8c34", + "hash_cont_tokens": "9f6ff69d23a48783" + }, + "truncated": 0, + "non-truncated": 1244, + "padded": 1244, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-prehistory|5": { + "hashes": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "3573cd87facbb7c5", + "hash_cont_tokens": "a789a13af22308bf" + }, + "truncated": 0, + "non-truncated": 1296, + "padded": 1296, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_accounting|5": { + "hashes": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "17e721bc1a7cbb47", + "hash_cont_tokens": "5129a9cfb30c5239" + }, + "truncated": 0, + "non-truncated": 1128, + "padded": 1128, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_law|5": { + "hashes": { + "hash_examples": "cd9dbc52b3c932d6", + 
"hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "9178e10bd0763ec4", + "hash_cont_tokens": "2e590029ef41fbcd" + }, + "truncated": 604, + "non-truncated": 5532, + "padded": 5524, + "non-padded": 612, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_medicine|5": { + "hashes": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "f5a22012a54f70ea", + "hash_cont_tokens": "cd82e108370cece8" + }, + "truncated": 0, + "non-truncated": 1088, + "padded": 1088, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_psychology|5": { + "hashes": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "0dfb73a8eb3f692c", + "hash_cont_tokens": "61ef0c8a87f9c92d" + }, + "truncated": 0, + "non-truncated": 2448, + "padded": 2448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-public_relations|5": { + "hashes": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "1710c6ba4c9f3cbd", + "hash_cont_tokens": "568f585a259965c1" + }, + "truncated": 0, + "non-truncated": 440, + "padded": 440, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-security_studies|5": { + "hashes": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "d49711415961ced7", + "hash_cont_tokens": "d70cfe096d4fb7bd" + }, + "truncated": 0, + "non-truncated": 980, + "padded": 980, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-sociology|5": { + "hashes": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "828999f7624cbe7e", + "hash_cont_tokens": "c3a3bdfd177eed5b" + }, + "truncated": 0, + "non-truncated": 804, + "padded": 804, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hashes": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "42054621e718dbee", + "hash_cont_tokens": "2568d0e8e36fa959" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-virology|5": { + "hashes": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "6c4f0aa4dc859c04", + "hash_cont_tokens": "c178cccd753d9bc5" + }, + "truncated": 0, + "non-truncated": 664, + "padded": 664, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-world_religions|5": { + "hashes": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "6c75d44e092ff24f", + "hash_cont_tokens": "0a3a3ea5ef49d19c" + }, + "truncated": 0, + "non-truncated": 684, + "padded": 684, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|truthfulqa:mc|0": { + "hashes": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "2738d7ed7075faa7", + "hash_cont_tokens": "6d1691881e252df0" + }, + "truncated": 0, + "non-truncated": 
9996, + "padded": 9996, + "non-padded": 0, + "effective_few_shots": 0.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "d84d18e9a963753d", + "hash_full_prompts": "12b540783521a8e6", + "hash_input_tokens": "6fecf578c508db6a", + "hash_cont_tokens": "f4b7b7f3a2788768" + }, + "total_evaluation_time_secondes": "27633.710366249084", + "truncated": 2088, + "non-truncated": 108931, + "padded": 108834, + "non-padded": 2185, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/jarradh/llama2_70b_chat_uncensored/results_2023-10-13T07-51-05.565296.json b/eval-results/jarradh/llama2_70b_chat_uncensored/results_2023-10-13T07-51-05.565296.json new file mode 100644 index 0000000000000000000000000000000000000000..0650f574efee3819091c3e70d9e7d8c2366f1106 --- /dev/null +++ b/eval-results/jarradh/llama2_70b_chat_uncensored/results_2023-10-13T07-51-05.565296.json @@ -0,0 +1,107 @@ +{ + "config_general": { + "model_name": "jarradh/llama2_70b_chat_uncensored", + "model_sha": "8d04aecaed6b2ecbcdfa74238767a18a91bb88f4", + "model_size": "128.56 GB", + "model_dtype": "torch.float16", + "lighteval_sha": "0f318ecf002208468154899217b3ba7c6ae09374", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|drop|3": { + "em": 0.14586828859060402, + "em_stderr": 0.003614785389347219, + "f1": 0.2008619966442949, + "f1_stderr": 0.0036435562383754947 + }, + "harness|gsm8k|5": { + "acc": 0.3025018953752843, + "acc_stderr": 0.012652544133186129 + }, + "harness|winogrande|5": { + "acc": 0.8255722178374112, + "acc_stderr": 0.010665187902498442 + }, + "all": { + "em": 0.14586828859060402, + "em_stderr": 0.003614785389347219, + "f1": 0.2008619966442949, + "f1_stderr": 0.0036435562383754947, + "acc": 0.5640370566063477, + "acc_stderr": 0.011658866017842285 + } + }, + "versions": { + "harness|drop|3": 1, + "harness|gsm8k|5": 0, + "harness|winogrande|5": 0, + "all": 0 + }, + "config_tasks": { + "harness|drop": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|drop|3": { + "hashes": { + "hash_examples": "1d27416e8324e9a3", + "hash_full_prompts": "a5513ff9a741b385", + "hash_input_tokens": "61b608e0b5ceed76", + "hash_cont_tokens": "8afa75f70fa46110" + }, + "truncated": 1263, + "non-truncated": 8273, + "padded": 0, + "non-padded": 9536, + "effective_few_shots": 3.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "bda342e47b5099b2", + "hash_cont_tokens": "62781f03cdbaec99" + }, + "truncated": 0, + "non-truncated": 1319, + "padded": 0, + "non-padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "c0bedf98cb040854", + "hash_cont_tokens": "f08975ad6f2d5864" + }, + "truncated": 0, + "non-truncated": 2534, + "padded": 2432, + "non-padded": 102, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "9b4d8993161e637d", + "hash_full_prompts": "08215e527b7e60a5", + "hash_input_tokens": "80afe720f936f8d2", + "hash_cont_tokens": "af9ff49b74475579" + }, + "total_evaluation_time_secondes": "45310.05150437355", + 
"truncated": 1263, + "non-truncated": 12126, + "padded": 2432, + "non-padded": 10957, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/jphme/Llama-2-13b-chat-german/results_2023-08-12T09-25-25.222755.json b/eval-results/jphme/Llama-2-13b-chat-german/results_2023-08-12T09-25-25.222755.json new file mode 100644 index 0000000000000000000000000000000000000000..27d7b431a01c1515d074ebf0e48844c85449aed9 --- /dev/null +++ b/eval-results/jphme/Llama-2-13b-chat-german/results_2023-08-12T09-25-25.222755.json @@ -0,0 +1,1365 @@ +{ + "results": { + "harness|arc:challenge|25": { + "acc": 0.5349829351535836, + "acc_stderr": 0.01457558392201967, + "acc_norm": 0.5784982935153583, + "acc_norm_stderr": 0.014430197069326023 + }, + "harness|hellaswag|10": { + "acc": 0.616211909978092, + "acc_stderr": 0.004853134271547765, + "acc_norm": 0.8165704043019318, + "acc_norm_stderr": 0.003862273626504547 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.28, + "acc_stderr": 0.045126085985421296, + "acc_norm": 0.28, + "acc_norm_stderr": 0.045126085985421296 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.4962962962962963, + "acc_stderr": 0.04319223625811331, + "acc_norm": 0.4962962962962963, + "acc_norm_stderr": 0.04319223625811331 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.5723684210526315, + "acc_stderr": 0.04026097083296564, + "acc_norm": 0.5723684210526315, + "acc_norm_stderr": 0.04026097083296564 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.51, + "acc_stderr": 0.05024183937956912, + "acc_norm": 0.51, + "acc_norm_stderr": 0.05024183937956912 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.5811320754716981, + "acc_stderr": 0.030365050829115205, + "acc_norm": 0.5811320754716981, + "acc_norm_stderr": 0.030365050829115205 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.5833333333333334, + "acc_stderr": 0.041227287076512825, + "acc_norm": 0.5833333333333334, + "acc_norm_stderr": 0.041227287076512825 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.37, + "acc_stderr": 0.04852365870939099, + "acc_norm": 0.37, + "acc_norm_stderr": 0.04852365870939099 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.47, + "acc_stderr": 0.05016135580465919, + "acc_norm": 0.47, + "acc_norm_stderr": 0.05016135580465919 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.32, + "acc_stderr": 0.046882617226215034, + "acc_norm": 0.32, + "acc_norm_stderr": 0.046882617226215034 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.4682080924855491, + "acc_stderr": 0.03804749744364763, + "acc_norm": 0.4682080924855491, + "acc_norm_stderr": 0.03804749744364763 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.28431372549019607, + "acc_stderr": 0.04488482852329017, + "acc_norm": 0.28431372549019607, + "acc_norm_stderr": 0.04488482852329017 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.69, + "acc_stderr": 0.04648231987117316, + "acc_norm": 0.69, + "acc_norm_stderr": 0.04648231987117316 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.3829787234042553, + "acc_stderr": 0.03177821250236922, + "acc_norm": 0.3829787234042553, + "acc_norm_stderr": 0.03177821250236922 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.2894736842105263, + "acc_stderr": 0.04266339443159394, + "acc_norm": 0.2894736842105263, + "acc_norm_stderr": 0.04266339443159394 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 
0.5586206896551724, + "acc_stderr": 0.04137931034482757, + "acc_norm": 0.5586206896551724, + "acc_norm_stderr": 0.04137931034482757 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.3386243386243386, + "acc_stderr": 0.024373197867983056, + "acc_norm": 0.3386243386243386, + "acc_norm_stderr": 0.024373197867983056 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.30158730158730157, + "acc_stderr": 0.041049472699033945, + "acc_norm": 0.30158730158730157, + "acc_norm_stderr": 0.041049472699033945 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.34, + "acc_stderr": 0.04760952285695235, + "acc_norm": 0.34, + "acc_norm_stderr": 0.04760952285695235 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.6064516129032258, + "acc_stderr": 0.027791878753132267, + "acc_norm": 0.6064516129032258, + "acc_norm_stderr": 0.027791878753132267 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.4433497536945813, + "acc_stderr": 0.03495334582162934, + "acc_norm": 0.4433497536945813, + "acc_norm_stderr": 0.03495334582162934 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.58, + "acc_stderr": 0.049604496374885836, + "acc_norm": 0.58, + "acc_norm_stderr": 0.049604496374885836 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.6727272727272727, + "acc_stderr": 0.03663974994391244, + "acc_norm": 0.6727272727272727, + "acc_norm_stderr": 0.03663974994391244 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.6818181818181818, + "acc_stderr": 0.03318477333845331, + "acc_norm": 0.6818181818181818, + "acc_norm_stderr": 0.03318477333845331 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.7927461139896373, + "acc_stderr": 0.029252823291803638, + "acc_norm": 0.7927461139896373, + "acc_norm_stderr": 0.029252823291803638 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.48205128205128206, + "acc_stderr": 0.025334667080954935, + "acc_norm": 0.48205128205128206, + "acc_norm_stderr": 0.025334667080954935 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.29259259259259257, + "acc_stderr": 0.02773896963217609, + "acc_norm": 0.29259259259259257, + "acc_norm_stderr": 0.02773896963217609 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.5168067226890757, + "acc_stderr": 0.03246013680375308, + "acc_norm": 0.5168067226890757, + "acc_norm_stderr": 0.03246013680375308 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.3443708609271523, + "acc_stderr": 0.038796870240733264, + "acc_norm": 0.3443708609271523, + "acc_norm_stderr": 0.038796870240733264 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.7412844036697248, + "acc_stderr": 0.018776052319619627, + "acc_norm": 0.7412844036697248, + "acc_norm_stderr": 0.018776052319619627 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.39814814814814814, + "acc_stderr": 0.033384734032074016, + "acc_norm": 0.39814814814814814, + "acc_norm_stderr": 0.033384734032074016 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.7549019607843137, + "acc_stderr": 0.03019028245350195, + "acc_norm": 0.7549019607843137, + "acc_norm_stderr": 0.03019028245350195 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.7426160337552743, + "acc_stderr": 0.02845882099146029, + "acc_norm": 0.7426160337552743, + "acc_norm_stderr": 0.02845882099146029 + }, + 
"harness|hendrycksTest-human_aging|5": { + "acc": 0.6322869955156951, + "acc_stderr": 0.03236198350928275, + "acc_norm": 0.6322869955156951, + "acc_norm_stderr": 0.03236198350928275 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.6335877862595419, + "acc_stderr": 0.04225875451969638, + "acc_norm": 0.6335877862595419, + "acc_norm_stderr": 0.04225875451969638 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.768595041322314, + "acc_stderr": 0.03849856098794088, + "acc_norm": 0.768595041322314, + "acc_norm_stderr": 0.03849856098794088 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.7037037037037037, + "acc_stderr": 0.04414343666854933, + "acc_norm": 0.7037037037037037, + "acc_norm_stderr": 0.04414343666854933 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.6625766871165644, + "acc_stderr": 0.03714908409935574, + "acc_norm": 0.6625766871165644, + "acc_norm_stderr": 0.03714908409935574 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.3482142857142857, + "acc_stderr": 0.04521829902833587, + "acc_norm": 0.3482142857142857, + "acc_norm_stderr": 0.04521829902833587 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.7378640776699029, + "acc_stderr": 0.04354631077260595, + "acc_norm": 0.7378640776699029, + "acc_norm_stderr": 0.04354631077260595 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.7777777777777778, + "acc_stderr": 0.027236013946196708, + "acc_norm": 0.7777777777777778, + "acc_norm_stderr": 0.027236013946196708 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.56, + "acc_stderr": 0.04988876515698589, + "acc_norm": 0.56, + "acc_norm_stderr": 0.04988876515698589 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.7458492975734355, + "acc_stderr": 0.015569254692045755, + "acc_norm": 0.7458492975734355, + "acc_norm_stderr": 0.015569254692045755 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.6069364161849711, + "acc_stderr": 0.02629622791561367, + "acc_norm": 0.6069364161849711, + "acc_norm_stderr": 0.02629622791561367 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.28938547486033517, + "acc_stderr": 0.015166544550490317, + "acc_norm": 0.28938547486033517, + "acc_norm_stderr": 0.015166544550490317 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.6143790849673203, + "acc_stderr": 0.027870745278290275, + "acc_norm": 0.6143790849673203, + "acc_norm_stderr": 0.027870745278290275 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.6045016077170418, + "acc_stderr": 0.027770918531427838, + "acc_norm": 0.6045016077170418, + "acc_norm_stderr": 0.027770918531427838 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.6141975308641975, + "acc_stderr": 0.027085401226132146, + "acc_norm": 0.6141975308641975, + "acc_norm_stderr": 0.027085401226132146 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.3971631205673759, + "acc_stderr": 0.0291898056735871, + "acc_norm": 0.3971631205673759, + "acc_norm_stderr": 0.0291898056735871 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.39113428943937417, + "acc_stderr": 0.012463861839982063, + "acc_norm": 0.39113428943937417, + "acc_norm_stderr": 0.012463861839982063 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.4963235294117647, + "acc_stderr": 0.030372015885428195, + "acc_norm": 0.4963235294117647, + "acc_norm_stderr": 0.030372015885428195 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.5310457516339869, + "acc_stderr": 
0.020188804456361894, + "acc_norm": 0.5310457516339869, + "acc_norm_stderr": 0.020188804456361894 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.6545454545454545, + "acc_stderr": 0.04554619617541054, + "acc_norm": 0.6545454545454545, + "acc_norm_stderr": 0.04554619617541054 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.6448979591836734, + "acc_stderr": 0.030635655150387638, + "acc_norm": 0.6448979591836734, + "acc_norm_stderr": 0.030635655150387638 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.7263681592039801, + "acc_stderr": 0.031524391865554016, + "acc_norm": 0.7263681592039801, + "acc_norm_stderr": 0.031524391865554016 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.79, + "acc_stderr": 0.040936018074033256, + "acc_norm": 0.79, + "acc_norm_stderr": 0.040936018074033256 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.46987951807228917, + "acc_stderr": 0.03885425420866766, + "acc_norm": 0.46987951807228917, + "acc_norm_stderr": 0.03885425420866766 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.7485380116959064, + "acc_stderr": 0.033275044238468436, + "acc_norm": 0.7485380116959064, + "acc_norm_stderr": 0.033275044238468436 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.30599755201958384, + "mc1_stderr": 0.016132229728155048, + "mc2": 0.4631805504949388, + "mc2_stderr": 0.01494024022328432 + }, + "all": { + "acc": 0.5455550513134634, + "acc_stderr": 0.03446256824348003, + "acc_norm": 0.5496885064437279, + "acc_norm_stderr": 0.034443309811315, + "mc1": 0.30599755201958384, + "mc1_stderr": 0.016132229728155048, + "mc2": 0.4631805504949388, + "mc2_stderr": 0.01494024022328432 + } + }, + "versions": { + "harness|arc:challenge|25": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + 
"harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "all": 0 + }, + "config_general": { + "model_name": "jphme/Llama-2-13b-chat-german", + "model_sha": "d72667bd92fd6f76835466d302563d213e0b1ee1", + "model_dtype": "torch.float16", + "lighteval_sha": "efe93333f9f25e7d48cc67a6bf362e6d576f727b", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null + }, + "config_tasks": { + "harness|arc:challenge": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM 
Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task" + }, + "summary_tasks": { + "harness|arc:challenge|25": { + "hashes": { + "hash_examples": "17b0cae357c0259e", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "3722289b79076c44", + "hash_cont_tokens": "8210decc6ff6f7df" + }, + "truncated": 0, + "non-truncated": 4687, + "padded": 4687, + "non-padded": 0, + "effective_few_shots": 25.0, + "num_truncated_few_shots": 0 + }, + "harness|hellaswag|10": { + "hashes": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "ececd684171f1ef2", + "hash_cont_tokens": "b3b9e9017afa63af" + }, + "truncated": 0, + "non-truncated": 40168, + "padded": 40113, + "non-padded": 55, + "effective_few_shots": 10.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hashes": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "c54ff61ad0273dd7", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-anatomy|5": { + "hashes": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "be31a1e22aef5f90", + "hash_cont_tokens": "f11971a765cb609f" + }, + "truncated": 0, + "non-truncated": 540, + "padded": 540, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-astronomy|5": { + "hashes": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "277a7b1fad566940", + "hash_cont_tokens": "bf30e5d3f48250cb" + }, + "truncated": 0, + "non-truncated": 608, + "padded": 608, + "non-padded": 0, + 
"effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-business_ethics|5": { + "hashes": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "ba552605bc116de5", + "hash_cont_tokens": "bc1dd9b2d995eb61" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hashes": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "428c7563d0b98ab9", + "hash_cont_tokens": "890a119624b3b935" + }, + "truncated": 0, + "non-truncated": 1060, + "padded": 1060, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_biology|5": { + "hashes": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "da036601573942e2", + "hash_cont_tokens": "875cde3af7a0ee14" + }, + "truncated": 0, + "non-truncated": 576, + "padded": 576, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_chemistry|5": { + "hashes": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "94e0196d6aded13d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_computer_science|5": { + "hashes": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "6e4d0f4a8d36690b", + "hash_cont_tokens": "ffc0fe414cdc4a83" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_mathematics|5": { + "hashes": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "614054d17109a25d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_medicine|5": { + "hashes": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "081bb2b524defd1c", + "hash_cont_tokens": "1f88b00d41957d82" + }, + "truncated": 0, + "non-truncated": 692, + "padded": 692, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_physics|5": { + "hashes": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "5421d9a1af86cbd4", + "hash_cont_tokens": "f7b8097afc16a47c" + }, + "truncated": 0, + "non-truncated": 408, + "padded": 408, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-computer_security|5": { + "hashes": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "5e6b70ecb333cf18", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hashes": { + "hash_examples": 
"8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "c2ef11a87264ceed", + "hash_cont_tokens": "aa0e8bc655f2f641" + }, + "truncated": 0, + "non-truncated": 940, + "padded": 940, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-econometrics|5": { + "hashes": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "ecaccd912a4c3978", + "hash_cont_tokens": "bfb7e3c3c88313f1" + }, + "truncated": 0, + "non-truncated": 456, + "padded": 456, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hashes": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "1590c84291399be8", + "hash_cont_tokens": "2425a3f084a591ef" + }, + "truncated": 0, + "non-truncated": 580, + "padded": 580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hashes": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "3269597f715b0da1", + "hash_cont_tokens": "f52691aef15a407b" + }, + "truncated": 0, + "non-truncated": 1512, + "padded": 1512, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-formal_logic|5": { + "hashes": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "a2800d20f3ab8d7c", + "hash_cont_tokens": "f515d598d9c21263" + }, + "truncated": 0, + "non-truncated": 504, + "padded": 504, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-global_facts|5": { + "hashes": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "94ed44b3772505ad", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_biology|5": { + "hashes": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "24423acb928db768", + "hash_cont_tokens": "bd85a4156a3613ee" + }, + "truncated": 0, + "non-truncated": 1240, + "padded": 1240, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hashes": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "831ff35c474e5cef", + "hash_cont_tokens": "a95c97af1c14e068" + }, + "truncated": 0, + "non-truncated": 812, + "padded": 812, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hashes": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "a20a96b44dcc5b30", + "hash_cont_tokens": "8abfedef914e33c9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hashes": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "5002f4ac8b1562ca", + 
"hash_cont_tokens": "674fc454bdc5ac93" + }, + "truncated": 0, + "non-truncated": 660, + "padded": 656, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_geography|5": { + "hashes": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "7c5547c7da5bc793", + "hash_cont_tokens": "03a5012b916274ea" + }, + "truncated": 0, + "non-truncated": 792, + "padded": 792, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hashes": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "f62991cb6a496b05", + "hash_cont_tokens": "a83effb8f76b7d7c" + }, + "truncated": 0, + "non-truncated": 772, + "padded": 772, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hashes": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "4cef2aff6e3d59ed", + "hash_cont_tokens": "c583432ad27fcfe0" + }, + "truncated": 0, + "non-truncated": 1560, + "padded": 1560, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hashes": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "6e2577ea4082ed2b", + "hash_cont_tokens": "24f5dc613660300b" + }, + "truncated": 0, + "non-truncated": 1080, + "padded": 1080, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hashes": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "c5fc9aeb1079c8e4", + "hash_cont_tokens": "f47f041de50333b9" + }, + "truncated": 0, + "non-truncated": 952, + "padded": 952, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_physics|5": { + "hashes": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "555fc385cffa84ca", + "hash_cont_tokens": "ba2efcd283e938cc" + }, + "truncated": 0, + "non-truncated": 604, + "padded": 604, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hashes": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "febd23cbf9973b7f", + "hash_cont_tokens": "942069cd363844d9" + }, + "truncated": 0, + "non-truncated": 2180, + "padded": 2180, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hashes": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "400e55b56ee6fbd7", + "hash_cont_tokens": "955ed42b6f7fa019" + }, + "truncated": 0, + "non-truncated": 864, + "padded": 864, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hashes": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "c639cce12a46ebad", + "hash_cont_tokens": "cdd0b3dc06d933e5" + }, + "truncated": 0, + "non-truncated": 
816, + "padded": 816, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hashes": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "b9762065cce6f3a6", + "hash_cont_tokens": "9a864184946033ac" + }, + "truncated": 0, + "non-truncated": 948, + "padded": 948, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_aging|5": { + "hashes": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "541a75f071dcf579", + "hash_cont_tokens": "142a4a8a1138a214" + }, + "truncated": 0, + "non-truncated": 892, + "padded": 892, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_sexuality|5": { + "hashes": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "04269e5c5a257dd9", + "hash_cont_tokens": "bc54813e809b796d" + }, + "truncated": 0, + "non-truncated": 524, + "padded": 524, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-international_law|5": { + "hashes": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "d93ba9d9d38e4397", + "hash_cont_tokens": "dc45b45fcda18e5d" + }, + "truncated": 0, + "non-truncated": 484, + "padded": 484, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-jurisprudence|5": { + "hashes": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "9eeaccd2698b4f5a", + "hash_cont_tokens": "e3a8cd951b6e3469" + }, + "truncated": 0, + "non-truncated": 432, + "padded": 432, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hashes": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "b4f08f544f2b7576", + "hash_cont_tokens": "1e80dbd30f6453d5" + }, + "truncated": 0, + "non-truncated": 652, + "padded": 648, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-machine_learning|5": { + "hashes": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "900c2a51f1174b9f", + "hash_cont_tokens": "9b37da7777378ca9" + }, + "truncated": 0, + "non-truncated": 448, + "padded": 448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-management|5": { + "hashes": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "6b36efb4689c6eca", + "hash_cont_tokens": "a01d6d39a83c4597" + }, + "truncated": 0, + "non-truncated": 412, + "padded": 412, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-marketing|5": { + "hashes": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "2aaac78a0cfed47a", + "hash_cont_tokens": "6aeaed4d823c98aa" + }, + "truncated": 0, + "non-truncated": 936, + "padded": 936, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-medical_genetics|5": { + "hashes": { + 
"hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "886ca823b41c094a", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-miscellaneous|5": { + "hashes": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "72fd71de7675e7d0", + "hash_cont_tokens": "9b0ab02a64603081" + }, + "truncated": 0, + "non-truncated": 3132, + "padded": 3132, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_disputes|5": { + "hashes": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "f3ca0dd8e7a1eb09", + "hash_cont_tokens": "8badf768f7b0467a" + }, + "truncated": 0, + "non-truncated": 1384, + "padded": 1354, + "non-padded": 30, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hashes": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "3e793631e951f23c", + "hash_cont_tokens": "32ae620376b2bbba" + }, + "truncated": 0, + "non-truncated": 3580, + "padded": 3580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-nutrition|5": { + "hashes": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "59753c2144ea93af", + "hash_cont_tokens": "3071def75bacc404" + }, + "truncated": 0, + "non-truncated": 1224, + "padded": 1224, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-philosophy|5": { + "hashes": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "bd8d3dbed15a8c34", + "hash_cont_tokens": "9f6ff69d23a48783" + }, + "truncated": 0, + "non-truncated": 1244, + "padded": 1244, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-prehistory|5": { + "hashes": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "3573cd87facbb7c5", + "hash_cont_tokens": "de469d2b981e32a3" + }, + "truncated": 0, + "non-truncated": 1296, + "padded": 1296, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_accounting|5": { + "hashes": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "17e721bc1a7cbb47", + "hash_cont_tokens": "c46f74d2dfc7b13b" + }, + "truncated": 0, + "non-truncated": 1128, + "padded": 1128, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_law|5": { + "hashes": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "c9f7583fff66d361", + "hash_cont_tokens": "2e590029ef41fbcd" + }, + "truncated": 0, + "non-truncated": 6136, + "padded": 6136, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_medicine|5": { + "hashes": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "40a933f829116f8d", + "hash_cont_tokens": 
"fe35cfa9c6ca802e" + }, + "truncated": 0, + "non-truncated": 1088, + "padded": 1088, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_psychology|5": { + "hashes": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "0dfb73a8eb3f692c", + "hash_cont_tokens": "f020fbddf72c8652" + }, + "truncated": 0, + "non-truncated": 2448, + "padded": 2448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-public_relations|5": { + "hashes": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "1710c6ba4c9f3cbd", + "hash_cont_tokens": "568f585a259965c1" + }, + "truncated": 0, + "non-truncated": 440, + "padded": 440, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-security_studies|5": { + "hashes": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "32a03f1f22a6e103", + "hash_cont_tokens": "cc6fd7cccd64cd5d" + }, + "truncated": 0, + "non-truncated": 980, + "padded": 980, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-sociology|5": { + "hashes": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "828999f7624cbe7e", + "hash_cont_tokens": "c3a3bdfd177eed5b" + }, + "truncated": 0, + "non-truncated": 804, + "padded": 804, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hashes": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "42054621e718dbee", + "hash_cont_tokens": "2568d0e8e36fa959" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-virology|5": { + "hashes": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "6c4f0aa4dc859c04", + "hash_cont_tokens": "926cf60b0891f374" + }, + "truncated": 0, + "non-truncated": 664, + "padded": 664, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-world_religions|5": { + "hashes": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "6c75d44e092ff24f", + "hash_cont_tokens": "c525a5de974c1ea3" + }, + "truncated": 0, + "non-truncated": 684, + "padded": 684, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|truthfulqa:mc|0": { + "hashes": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "2738d7ed7075faa7", + "hash_cont_tokens": "c014154380b74b9e" + }, + "truncated": 0, + "non-truncated": 9996, + "padded": 9996, + "non-padded": 0, + "effective_few_shots": 0.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "d84d18e9a963753d", + "hash_full_prompts": "12b540783521a8e6", + "hash_input_tokens": "5c73a7dce6ccf737", + "hash_cont_tokens": "fb1646e2bdd5fc38" + }, + "total_evaluation_time_secondes": "6259.172531604767", + "truncated": 0, + "non-truncated": 111019, + "padded": 110926, + "non-padded": 93, + "num_truncated_few_shots": 0 + } 
+} \ No newline at end of file diff --git a/eval-results/jphme/Llama-2-13b-chat-german/results_2023-09-17T15-03-11.382260.json b/eval-results/jphme/Llama-2-13b-chat-german/results_2023-09-17T15-03-11.382260.json new file mode 100644 index 0000000000000000000000000000000000000000..29d110722ee45286c66b8654db16b04702178ea6 --- /dev/null +++ b/eval-results/jphme/Llama-2-13b-chat-german/results_2023-09-17T15-03-11.382260.json @@ -0,0 +1,107 @@ +{ + "config_general": { + "model_name": "jphme/Llama-2-13b-chat-german", + "model_sha": "d72667bd92fd6f76835466d302563d213e0b1ee1", + "model_size": "24.32 GB", + "model_dtype": "torch.float16", + "lighteval_sha": "0f318ecf002208468154899217b3ba7c6ae09374", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|drop|3": { + "em": 0.006606543624161074, + "em_stderr": 0.000829635738992222, + "f1": 0.06547399328859073, + "f1_stderr": 0.0015176277275461638 + }, + "harness|gsm8k|5": { + "acc": 0.13646702047005307, + "acc_stderr": 0.00945574199881554 + }, + "harness|winogrande|5": { + "acc": 0.7647987371744278, + "acc_stderr": 0.01192000816365088 + }, + "all": { + "em": 0.006606543624161074, + "em_stderr": 0.000829635738992222, + "f1": 0.06547399328859073, + "f1_stderr": 0.0015176277275461638, + "acc": 0.45063287882224046, + "acc_stderr": 0.01068787508123321 + } + }, + "versions": { + "harness|drop|3": 1, + "harness|gsm8k|5": 0, + "harness|winogrande|5": 0, + "all": 0 + }, + "config_tasks": { + "harness|drop": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|drop|3": { + "hashes": { + "hash_examples": "1d27416e8324e9a3", + "hash_full_prompts": "a5513ff9a741b385", + "hash_input_tokens": "42076f0efbb50aa6", + "hash_cont_tokens": "745dec159cb84e4f" + }, + "truncated": 3, + "non-truncated": 9533, + "padded": 0, + "non-padded": 9536, + "effective_few_shots": 3.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "bda342e47b5099b2", + "hash_cont_tokens": "39776148f0a30b9c" + }, + "truncated": 0, + "non-truncated": 1319, + "padded": 0, + "non-padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "c0bedf98cb040854", + "hash_cont_tokens": "f08975ad6f2d5864" + }, + "truncated": 0, + "non-truncated": 2534, + "padded": 2432, + "non-padded": 102, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "9b4d8993161e637d", + "hash_full_prompts": "08215e527b7e60a5", + "hash_input_tokens": "a12f3e3c934bd78b", + "hash_cont_tokens": "92346969dde44fd9" + }, + "total_evaluation_time_secondes": "12202.891308784485", + "truncated": 3, + "non-truncated": 13386, + "padded": 2432, + "non-padded": 10957, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/jphme/em_german_leo_mistral/results_2023-10-11T17-57-34.404631.json b/eval-results/jphme/em_german_leo_mistral/results_2023-10-11T17-57-34.404631.json new file mode 100644 index 0000000000000000000000000000000000000000..8f9f0f26513f2f0ef3a4a2e5e111c996e6339c9f --- /dev/null +++ 
b/eval-results/jphme/em_german_leo_mistral/results_2023-10-11T17-57-34.404631.json @@ -0,0 +1,1367 @@ +{ + "config_general": { + "model_name": "jphme/em_german_leo_mistral", + "model_sha": "aa63a32154923034fb89b1408d3d7ffa994d3327", + "model_size": "13.99 GB", + "model_dtype": "torch.float16", + "lighteval_sha": "0f318ecf002208468154899217b3ba7c6ae09374", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|arc:challenge|25": { + "acc": 0.4854948805460751, + "acc_stderr": 0.014605241081370053, + "acc_norm": 0.5281569965870307, + "acc_norm_stderr": 0.014588204105102203 + }, + "harness|hellaswag|10": { + "acc": 0.5837482573192591, + "acc_stderr": 0.004919289113027508, + "acc_norm": 0.7803226448914559, + "acc_norm_stderr": 0.004131818797713872 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.29, + "acc_stderr": 0.045604802157206845, + "acc_norm": 0.29, + "acc_norm_stderr": 0.045604802157206845 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.45185185185185184, + "acc_stderr": 0.04299268905480863, + "acc_norm": 0.45185185185185184, + "acc_norm_stderr": 0.04299268905480863 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.48026315789473684, + "acc_stderr": 0.040657710025626036, + "acc_norm": 0.48026315789473684, + "acc_norm_stderr": 0.040657710025626036 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.47, + "acc_stderr": 0.05016135580465919, + "acc_norm": 0.47, + "acc_norm_stderr": 0.05016135580465919 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.5584905660377358, + "acc_stderr": 0.030561590426731837, + "acc_norm": 0.5584905660377358, + "acc_norm_stderr": 0.030561590426731837 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.5486111111111112, + "acc_stderr": 0.04161402398403279, + "acc_norm": 0.5486111111111112, + "acc_norm_stderr": 0.04161402398403279 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.43, + "acc_stderr": 0.04975698519562428, + "acc_norm": 0.43, + "acc_norm_stderr": 0.04975698519562428 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.43, + "acc_stderr": 0.04975698519562428, + "acc_norm": 0.43, + "acc_norm_stderr": 0.04975698519562428 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.29, + "acc_stderr": 0.04560480215720683, + "acc_norm": 0.29, + "acc_norm_stderr": 0.04560480215720683 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.5202312138728323, + "acc_stderr": 0.03809342081273957, + "acc_norm": 0.5202312138728323, + "acc_norm_stderr": 0.03809342081273957 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.3137254901960784, + "acc_stderr": 0.04617034827006717, + "acc_norm": 0.3137254901960784, + "acc_norm_stderr": 0.04617034827006717 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.62, + "acc_stderr": 0.04878317312145633, + "acc_norm": 0.62, + "acc_norm_stderr": 0.04878317312145633 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.37446808510638296, + "acc_stderr": 0.031639106653672915, + "acc_norm": 0.37446808510638296, + "acc_norm_stderr": 0.031639106653672915 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.3508771929824561, + "acc_stderr": 0.044895393502706986, + "acc_norm": 0.3508771929824561, + "acc_norm_stderr": 0.044895393502706986 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.36551724137931035, + "acc_stderr": 0.04013124195424386, + "acc_norm": 
0.36551724137931035, + "acc_norm_stderr": 0.04013124195424386 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.35714285714285715, + "acc_stderr": 0.024677862841332783, + "acc_norm": 0.35714285714285715, + "acc_norm_stderr": 0.024677862841332783 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.373015873015873, + "acc_stderr": 0.04325506042017086, + "acc_norm": 0.373015873015873, + "acc_norm_stderr": 0.04325506042017086 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.34, + "acc_stderr": 0.04760952285695235, + "acc_norm": 0.34, + "acc_norm_stderr": 0.04760952285695235 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.5806451612903226, + "acc_stderr": 0.02807158890109184, + "acc_norm": 0.5806451612903226, + "acc_norm_stderr": 0.02807158890109184 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.35960591133004927, + "acc_stderr": 0.03376458246509567, + "acc_norm": 0.35960591133004927, + "acc_norm_stderr": 0.03376458246509567 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.5, + "acc_stderr": 0.050251890762960605, + "acc_norm": 0.5, + "acc_norm_stderr": 0.050251890762960605 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.6121212121212121, + "acc_stderr": 0.038049136539710114, + "acc_norm": 0.6121212121212121, + "acc_norm_stderr": 0.038049136539710114 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.6666666666666666, + "acc_stderr": 0.033586181457325226, + "acc_norm": 0.6666666666666666, + "acc_norm_stderr": 0.033586181457325226 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.7409326424870466, + "acc_stderr": 0.031618779179354094, + "acc_norm": 0.7409326424870466, + "acc_norm_stderr": 0.031618779179354094 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.48717948717948717, + "acc_stderr": 0.025342671293807257, + "acc_norm": 0.48717948717948717, + "acc_norm_stderr": 0.025342671293807257 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.2777777777777778, + "acc_stderr": 0.02730914058823018, + "acc_norm": 0.2777777777777778, + "acc_norm_stderr": 0.02730914058823018 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.5336134453781513, + "acc_stderr": 0.03240501447690071, + "acc_norm": 0.5336134453781513, + "acc_norm_stderr": 0.03240501447690071 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.2582781456953642, + "acc_stderr": 0.035737053147634576, + "acc_norm": 0.2582781456953642, + "acc_norm_stderr": 0.035737053147634576 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.6495412844036698, + "acc_stderr": 0.020456077599824464, + "acc_norm": 0.6495412844036698, + "acc_norm_stderr": 0.020456077599824464 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.4861111111111111, + "acc_stderr": 0.03408655867977748, + "acc_norm": 0.4861111111111111, + "acc_norm_stderr": 0.03408655867977748 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.6519607843137255, + "acc_stderr": 0.03343311240488419, + "acc_norm": 0.6519607843137255, + "acc_norm_stderr": 0.03343311240488419 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.6455696202531646, + "acc_stderr": 0.031137304297185812, + "acc_norm": 0.6455696202531646, + "acc_norm_stderr": 0.031137304297185812 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.5739910313901345, + "acc_stderr": 0.033188332862172806, + 
"acc_norm": 0.5739910313901345, + "acc_norm_stderr": 0.033188332862172806 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.5190839694656488, + "acc_stderr": 0.04382094705550988, + "acc_norm": 0.5190839694656488, + "acc_norm_stderr": 0.04382094705550988 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.7107438016528925, + "acc_stderr": 0.04139112727635463, + "acc_norm": 0.7107438016528925, + "acc_norm_stderr": 0.04139112727635463 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.5648148148148148, + "acc_stderr": 0.047928981709070624, + "acc_norm": 0.5648148148148148, + "acc_norm_stderr": 0.047928981709070624 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.6196319018404908, + "acc_stderr": 0.03814269893261837, + "acc_norm": 0.6196319018404908, + "acc_norm_stderr": 0.03814269893261837 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.30357142857142855, + "acc_stderr": 0.04364226155841044, + "acc_norm": 0.30357142857142855, + "acc_norm_stderr": 0.04364226155841044 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.6893203883495146, + "acc_stderr": 0.04582124160161551, + "acc_norm": 0.6893203883495146, + "acc_norm_stderr": 0.04582124160161551 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.7649572649572649, + "acc_stderr": 0.027778835904935437, + "acc_norm": 0.7649572649572649, + "acc_norm_stderr": 0.027778835904935437 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.54, + "acc_stderr": 0.05009082659620332, + "acc_norm": 0.54, + "acc_norm_stderr": 0.05009082659620332 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.6704980842911877, + "acc_stderr": 0.016808322261740467, + "acc_norm": 0.6704980842911877, + "acc_norm_stderr": 0.016808322261740467 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.5115606936416185, + "acc_stderr": 0.026911898686377913, + "acc_norm": 0.5115606936416185, + "acc_norm_stderr": 0.026911898686377913 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.30502793296089387, + "acc_stderr": 0.015398723510916718, + "acc_norm": 0.30502793296089387, + "acc_norm_stderr": 0.015398723510916718 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.5032679738562091, + "acc_stderr": 0.028629305194003543, + "acc_norm": 0.5032679738562091, + "acc_norm_stderr": 0.028629305194003543 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.5819935691318328, + "acc_stderr": 0.028013651891995072, + "acc_norm": 0.5819935691318328, + "acc_norm_stderr": 0.028013651891995072 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.5833333333333334, + "acc_stderr": 0.027431623722415012, + "acc_norm": 0.5833333333333334, + "acc_norm_stderr": 0.027431623722415012 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.40070921985815605, + "acc_stderr": 0.02923346574557309, + "acc_norm": 0.40070921985815605, + "acc_norm_stderr": 0.02923346574557309 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.37157757496740546, + "acc_stderr": 0.012341828514528284, + "acc_norm": 0.37157757496740546, + "acc_norm_stderr": 0.012341828514528284 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.5, + "acc_stderr": 0.030372836961539352, + "acc_norm": 0.5, + "acc_norm_stderr": 0.030372836961539352 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.4673202614379085, + "acc_stderr": 0.020184583359102202, + "acc_norm": 0.4673202614379085, + "acc_norm_stderr": 0.020184583359102202 + }, + 
"harness|hendrycksTest-public_relations|5": { + "acc": 0.5545454545454546, + "acc_stderr": 0.047605488214603246, + "acc_norm": 0.5545454545454546, + "acc_norm_stderr": 0.047605488214603246 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.4163265306122449, + "acc_stderr": 0.031557828165561644, + "acc_norm": 0.4163265306122449, + "acc_norm_stderr": 0.031557828165561644 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.6567164179104478, + "acc_stderr": 0.03357379665433431, + "acc_norm": 0.6567164179104478, + "acc_norm_stderr": 0.03357379665433431 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.68, + "acc_stderr": 0.046882617226215034, + "acc_norm": 0.68, + "acc_norm_stderr": 0.046882617226215034 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.3674698795180723, + "acc_stderr": 0.03753267402120575, + "acc_norm": 0.3674698795180723, + "acc_norm_stderr": 0.03753267402120575 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.6491228070175439, + "acc_stderr": 0.036602988340491645, + "acc_norm": 0.6491228070175439, + "acc_norm_stderr": 0.036602988340491645 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.31946144430844553, + "mc1_stderr": 0.016322644182960498, + "mc2": 0.5019055515288229, + "mc2_stderr": 0.015328774012087968 + }, + "all": { + "acc": 0.5015089044506577, + "acc_stderr": 0.035214009871026054, + "acc_norm": 0.505563760444101, + "acc_norm_stderr": 0.03520037415421959, + "mc1": 0.31946144430844553, + "mc1_stderr": 0.016322644182960498, + "mc2": 0.5019055515288229, + "mc2_stderr": 0.015328774012087968 + } + }, + "versions": { + "harness|arc:challenge|25": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + 
"harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "all": 0 + }, + "config_tasks": { + "harness|arc:challenge": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + 
"harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task" + }, + "summary_tasks": { + "harness|arc:challenge|25": { + "hashes": { + "hash_examples": "17b0cae357c0259e", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "e43adcaa871b1364", + "hash_cont_tokens": "289aa98c400841d8" + }, + "truncated": 0, + "non-truncated": 4687, + "padded": 4684, + "non-padded": 3, + "effective_few_shots": 25.0, + "num_truncated_few_shots": 0 + }, + "harness|hellaswag|10": { + "hashes": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "08da6b3d0798f3e5", + "hash_cont_tokens": "ac460260c3e6efc9" + }, + "truncated": 0, + "non-truncated": 40168, + "padded": 40039, + "non-padded": 129, + "effective_few_shots": 10.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hashes": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "5e2b26eb9b4d08bf", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-anatomy|5": { + "hashes": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "d33cda9df28030eb", + "hash_cont_tokens": "a52a4f60d98cbe5c" + }, + "truncated": 0, + "non-truncated": 540, + "padded": 540, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-astronomy|5": { + "hashes": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "0dd50c500d64c57d", + "hash_cont_tokens": "10f7d8eeba97841d" + }, + "truncated": 0, + "non-truncated": 608, + "padded": 608, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-business_ethics|5": { + "hashes": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "40b524d0df3defc2", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + 
"num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hashes": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "1f87d12d677e0dfd", + "hash_cont_tokens": "edef9975ba9165b5" + }, + "truncated": 0, + "non-truncated": 1060, + "padded": 1056, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_biology|5": { + "hashes": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "dd6d69d8b13afbeb", + "hash_cont_tokens": "0aa103ec6602280b" + }, + "truncated": 0, + "non-truncated": 576, + "padded": 572, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_chemistry|5": { + "hashes": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "d45f3c401a00e97e", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_computer_science|5": { + "hashes": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "c04f21d954ae67b2", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_mathematics|5": { + "hashes": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "e7de03b4e1a407d8", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_medicine|5": { + "hashes": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "9ce9516475f0b09c", + "hash_cont_tokens": "1979021dbc698754" + }, + "truncated": 0, + "non-truncated": 692, + "padded": 684, + "non-padded": 8, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_physics|5": { + "hashes": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "f749592a0d6c967d", + "hash_cont_tokens": "7cf7fe2bab00acbd" + }, + "truncated": 0, + "non-truncated": 408, + "padded": 404, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-computer_security|5": { + "hashes": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "1a6dccf2066f3598", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hashes": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "6ce98c8aec8e7514", + "hash_cont_tokens": "903f64eed2b0d217" + }, + "truncated": 0, + "non-truncated": 940, + "padded": 940, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-econometrics|5": { + "hashes": { + "hash_examples": "64d3623b0bfaa43f", + 
"hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "7794b03bf6b9bb11", + "hash_cont_tokens": "721ae6c5302c4bf2" + }, + "truncated": 0, + "non-truncated": 456, + "padded": 456, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hashes": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "e47ff85e05850517", + "hash_cont_tokens": "15a738960ed3e587" + }, + "truncated": 0, + "non-truncated": 580, + "padded": 580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hashes": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "2ce6901704311790", + "hash_cont_tokens": "c96470462fc71683" + }, + "truncated": 0, + "non-truncated": 1512, + "padded": 1512, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-formal_logic|5": { + "hashes": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "fa49c3faa72a3955", + "hash_cont_tokens": "0e1ce025c9d6ee7e" + }, + "truncated": 0, + "non-truncated": 504, + "padded": 504, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-global_facts|5": { + "hashes": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "38992a391c7040d5", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 396, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_biology|5": { + "hashes": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "4944fad6e0578120", + "hash_cont_tokens": "e34d57f7d3c4ca16" + }, + "truncated": 0, + "non-truncated": 1240, + "padded": 1240, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hashes": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "bec955dfccee0331", + "hash_cont_tokens": "e8482d44df4b3740" + }, + "truncated": 0, + "non-truncated": 812, + "padded": 796, + "non-padded": 16, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hashes": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "2ccfe020e0a8e824", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hashes": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "5e5e8bf3808e0ead", + "hash_cont_tokens": "d63e679a49418339" + }, + "truncated": 0, + "non-truncated": 660, + "padded": 656, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_geography|5": { + "hashes": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "6a624d76e1b40f9d", + "hash_cont_tokens": 
"d78483e286d06f1a" + }, + "truncated": 0, + "non-truncated": 792, + "padded": 792, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hashes": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "8340aed0285230f4", + "hash_cont_tokens": "691cdff71ff5fe57" + }, + "truncated": 0, + "non-truncated": 772, + "padded": 772, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hashes": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "ca47137b1f3a769c", + "hash_cont_tokens": "d5ad4c5bdca967ad" + }, + "truncated": 0, + "non-truncated": 1560, + "padded": 1560, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hashes": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "c9d341ab62890f30", + "hash_cont_tokens": "8f631ca5687dd0d4" + }, + "truncated": 0, + "non-truncated": 1080, + "padded": 1080, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hashes": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "62573d06618ae7df", + "hash_cont_tokens": "7321048a28451473" + }, + "truncated": 0, + "non-truncated": 952, + "padded": 952, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_physics|5": { + "hashes": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "ddddcaae96263221", + "hash_cont_tokens": "bb137581f269861c" + }, + "truncated": 0, + "non-truncated": 604, + "padded": 604, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hashes": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "ef9c1ae343139fdd", + "hash_cont_tokens": "b455cab2675bd863" + }, + "truncated": 0, + "non-truncated": 2180, + "padded": 2161, + "non-padded": 19, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hashes": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "eb4abd87b0e863cc", + "hash_cont_tokens": "1b3196fec7e58037" + }, + "truncated": 0, + "non-truncated": 864, + "padded": 864, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hashes": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "63548c7fa9ba7a78", + "hash_cont_tokens": "a331dedc2aa01b3e" + }, + "truncated": 0, + "non-truncated": 816, + "padded": 816, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hashes": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "83c5da18bfa50812", + "hash_cont_tokens": "d0fbe030b8c8c2bf" + }, + "truncated": 0, + "non-truncated": 948, + "padded": 
948, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_aging|5": { + "hashes": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "c93c778cb8c58a32", + "hash_cont_tokens": "1dd29c3755494850" + }, + "truncated": 0, + "non-truncated": 892, + "padded": 892, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_sexuality|5": { + "hashes": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "1daed91f54b42f7d", + "hash_cont_tokens": "c85573f663c10691" + }, + "truncated": 0, + "non-truncated": 524, + "padded": 524, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-international_law|5": { + "hashes": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "cfdae69f75ee8670", + "hash_cont_tokens": "d263804ba918154f" + }, + "truncated": 0, + "non-truncated": 484, + "padded": 484, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-jurisprudence|5": { + "hashes": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "173979adbb5ab44e", + "hash_cont_tokens": "581986691a84ece8" + }, + "truncated": 0, + "non-truncated": 432, + "padded": 432, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hashes": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "7b7d06271aff55ff", + "hash_cont_tokens": "55a858b28bbda458" + }, + "truncated": 0, + "non-truncated": 652, + "padded": 652, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-machine_learning|5": { + "hashes": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "ca062cfd7c7fddcb", + "hash_cont_tokens": "e99d3d3efd4ac7a3" + }, + "truncated": 0, + "non-truncated": 448, + "padded": 445, + "non-padded": 3, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-management|5": { + "hashes": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "fc47171ffb714da3", + "hash_cont_tokens": "13d9dc56bca34726" + }, + "truncated": 0, + "non-truncated": 412, + "padded": 412, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-marketing|5": { + "hashes": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "aa29e9d883670c8f", + "hash_cont_tokens": "2700ea26933916a2" + }, + "truncated": 0, + "non-truncated": 936, + "padded": 936, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-medical_genetics|5": { + "hashes": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "88ad044b653ecaa5", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-miscellaneous|5": { + "hashes": { + "hash_examples": 
"41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "f9e7e01573277484", + "hash_cont_tokens": "7bf4341c79587250" + }, + "truncated": 0, + "non-truncated": 3132, + "padded": 3132, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_disputes|5": { + "hashes": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "03728b9e48594c28", + "hash_cont_tokens": "38a48e9de6976f00" + }, + "truncated": 0, + "non-truncated": 1384, + "padded": 1360, + "non-padded": 24, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hashes": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "04a903966514d177", + "hash_cont_tokens": "761c4dc187689d89" + }, + "truncated": 0, + "non-truncated": 3580, + "padded": 3580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-nutrition|5": { + "hashes": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "a2176d3ac6f01cf0", + "hash_cont_tokens": "65005bd7d6f6012a" + }, + "truncated": 0, + "non-truncated": 1224, + "padded": 1224, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-philosophy|5": { + "hashes": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "a96dc872948245a8", + "hash_cont_tokens": "0b47934fb6314dec" + }, + "truncated": 0, + "non-truncated": 1244, + "padded": 1244, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-prehistory|5": { + "hashes": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "e0b03637947e9efa", + "hash_cont_tokens": "3f20acd855ee0a29" + }, + "truncated": 0, + "non-truncated": 1296, + "padded": 1296, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_accounting|5": { + "hashes": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "0b4c6d0e49c47ab4", + "hash_cont_tokens": "8f122ba881355d4b" + }, + "truncated": 0, + "non-truncated": 1128, + "padded": 1128, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_law|5": { + "hashes": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "bcbdbbde22ec73e3", + "hash_cont_tokens": "90d5df417c4d3fd3" + }, + "truncated": 0, + "non-truncated": 6136, + "padded": 6136, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_medicine|5": { + "hashes": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "c54d753563114d45", + "hash_cont_tokens": "4a2d2988884f7f70" + }, + "truncated": 0, + "non-truncated": 1088, + "padded": 1088, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_psychology|5": { + "hashes": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "9e6e34f48034edc0", + "hash_cont_tokens": 
"e0a952cb8a9c81de" + }, + "truncated": 0, + "non-truncated": 2448, + "padded": 2448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-public_relations|5": { + "hashes": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "634feb3f97d1064d", + "hash_cont_tokens": "1fa77a8dff3922b8" + }, + "truncated": 0, + "non-truncated": 440, + "padded": 440, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-security_studies|5": { + "hashes": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "ca8497342e5b1d57", + "hash_cont_tokens": "81fc9cb3cbdd52db" + }, + "truncated": 0, + "non-truncated": 980, + "padded": 980, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-sociology|5": { + "hashes": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "ae361375c940a0fb", + "hash_cont_tokens": "2a0493252ed2cf43" + }, + "truncated": 0, + "non-truncated": 804, + "padded": 800, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hashes": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "e8bdf33cf82d89f5", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-virology|5": { + "hashes": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "32ce831e0ba2d2e2", + "hash_cont_tokens": "5ab892d003b00c98" + }, + "truncated": 0, + "non-truncated": 664, + "padded": 664, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-world_religions|5": { + "hashes": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "4ed9b68c5694211b", + "hash_cont_tokens": "15a5e5dbdfbb8568" + }, + "truncated": 0, + "non-truncated": 684, + "padded": 684, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|truthfulqa:mc|0": { + "hashes": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "a30fbd9af05d717a", + "hash_cont_tokens": "5a8d4bb398b1c3c0" + }, + "truncated": 0, + "non-truncated": 9996, + "padded": 9996, + "non-padded": 0, + "effective_few_shots": 0.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "d84d18e9a963753d", + "hash_full_prompts": "12b540783521a8e6", + "hash_input_tokens": "3d86ffeb7677bd9d", + "hash_cont_tokens": "35527140510ee91a" + }, + "total_evaluation_time_secondes": "4154.051306962967", + "truncated": 0, + "non-truncated": 111019, + "padded": 110793, + "non-padded": 226, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/jphme/em_german_leo_mistral/results_2023-10-26T05-35-49.227572.json b/eval-results/jphme/em_german_leo_mistral/results_2023-10-26T05-35-49.227572.json new file mode 100644 index 0000000000000000000000000000000000000000..6a174f85e727a7f759fae9361123fda25329a44f --- /dev/null +++ 
b/eval-results/jphme/em_german_leo_mistral/results_2023-10-26T05-35-49.227572.json @@ -0,0 +1,107 @@ +{ + "config_general": { + "model_name": "jphme/em_german_leo_mistral", + "model_sha": "b457e761fb5e95fcaddd2177ca8db4d682d10f73", + "model_size": "13.99 GB", + "model_dtype": "torch.float16", + "lighteval_sha": "0f318ecf002208468154899217b3ba7c6ae09374", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|drop|3": { + "em": 0.2305998322147651, + "em_stderr": 0.004313653760724557, + "f1": 0.2864733640939601, + "f1_stderr": 0.004317447810452205 + }, + "harness|gsm8k|5": { + "acc": 0.056103108415466264, + "acc_stderr": 0.00633866843132188 + }, + "harness|winogrande|5": { + "acc": 0.7348066298342542, + "acc_stderr": 0.012406549466192858 + }, + "all": { + "em": 0.2305998322147651, + "em_stderr": 0.004313653760724557, + "f1": 0.2864733640939601, + "f1_stderr": 0.004317447810452205, + "acc": 0.3954548691248602, + "acc_stderr": 0.009372608948757369 + } + }, + "versions": { + "harness|drop|3": 1, + "harness|gsm8k|5": 0, + "harness|winogrande|5": 0, + "all": 0 + }, + "config_tasks": { + "harness|drop": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|drop|3": { + "hashes": { + "hash_examples": "1d27416e8324e9a3", + "hash_full_prompts": "a5513ff9a741b385", + "hash_input_tokens": "a4fb946366902edf", + "hash_cont_tokens": "c36a251a5730cbbf" + }, + "truncated": 0, + "non-truncated": 9536, + "padded": 0, + "non-padded": 9536, + "effective_few_shots": 3.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "6af0ae8cfe684f50", + "hash_cont_tokens": "cad456feb2c7b6c6" + }, + "truncated": 0, + "non-truncated": 1319, + "padded": 0, + "non-padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "6bf335f26fed6442", + "hash_cont_tokens": "618558fb93c0f288" + }, + "truncated": 0, + "non-truncated": 2534, + "padded": 2397, + "non-padded": 137, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "9b4d8993161e637d", + "hash_full_prompts": "08215e527b7e60a5", + "hash_input_tokens": "7c6dc027ca91c1a8", + "hash_cont_tokens": "94b792bcf7911679" + }, + "total_evaluation_time_secondes": "8887.143160104752", + "truncated": 0, + "non-truncated": 13389, + "padded": 2397, + "non-padded": 10992, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/jphme/orca_mini_v2_ger_7b/results_2023-07-19T17-09-14.589500.json b/eval-results/jphme/orca_mini_v2_ger_7b/results_2023-07-19T17-09-14.589500.json new file mode 100644 index 0000000000000000000000000000000000000000..13e6075812936b52669f4d79df4ee5e7f3086f1d --- /dev/null +++ b/eval-results/jphme/orca_mini_v2_ger_7b/results_2023-07-19T17-09-14.589500.json @@ -0,0 +1,871 @@ +{ + "results": { + "harness|arc:challenge|25": { + "acc": 0.46501706484641636, + "acc_stderr": 0.014575583922019669, + "acc_norm": 0.49829351535836175, + "acc_norm_stderr": 0.01461130570505699 + }, + "harness|hellaswag|10": { + "acc": 0.5635331607249552, + "acc_stderr": 0.004949335356881857, + "acc_norm": 0.7550288787094205, + 
"acc_norm_stderr": 0.004291911350430704 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.27, + "acc_stderr": 0.044619604333847415, + "acc_norm": 0.27, + "acc_norm_stderr": 0.044619604333847415 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.3925925925925926, + "acc_stderr": 0.04218506215368879, + "acc_norm": 0.3925925925925926, + "acc_norm_stderr": 0.04218506215368879 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.3355263157894737, + "acc_stderr": 0.038424985593952694, + "acc_norm": 0.3355263157894737, + "acc_norm_stderr": 0.038424985593952694 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.44, + "acc_stderr": 0.04988876515698589, + "acc_norm": 0.44, + "acc_norm_stderr": 0.04988876515698589 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.4679245283018868, + "acc_stderr": 0.030709486992556545, + "acc_norm": 0.4679245283018868, + "acc_norm_stderr": 0.030709486992556545 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.3611111111111111, + "acc_stderr": 0.040166600304512336, + "acc_norm": 0.3611111111111111, + "acc_norm_stderr": 0.040166600304512336 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.26, + "acc_stderr": 0.04408440022768077, + "acc_norm": 0.26, + "acc_norm_stderr": 0.04408440022768077 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.37, + "acc_stderr": 0.048523658709391, + "acc_norm": 0.37, + "acc_norm_stderr": 0.048523658709391 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.26, + "acc_stderr": 0.04408440022768077, + "acc_norm": 0.26, + "acc_norm_stderr": 0.04408440022768077 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.2832369942196532, + "acc_stderr": 0.034355680560478746, + "acc_norm": 0.2832369942196532, + "acc_norm_stderr": 0.034355680560478746 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.18627450980392157, + "acc_stderr": 0.038739587141493524, + "acc_norm": 0.18627450980392157, + "acc_norm_stderr": 0.038739587141493524 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.61, + "acc_stderr": 0.04902071300001975, + "acc_norm": 0.61, + "acc_norm_stderr": 0.04902071300001975 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.3659574468085106, + "acc_stderr": 0.03148955829745529, + "acc_norm": 0.3659574468085106, + "acc_norm_stderr": 0.03148955829745529 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.24561403508771928, + "acc_stderr": 0.04049339297748142, + "acc_norm": 0.24561403508771928, + "acc_norm_stderr": 0.04049339297748142 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.3724137931034483, + "acc_stderr": 0.040287315329475576, + "acc_norm": 0.3724137931034483, + "acc_norm_stderr": 0.040287315329475576 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.25925925925925924, + "acc_stderr": 0.02256989707491842, + "acc_norm": 0.25925925925925924, + "acc_norm_stderr": 0.02256989707491842 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.2619047619047619, + "acc_stderr": 0.03932537680392871, + "acc_norm": 0.2619047619047619, + "acc_norm_stderr": 0.03932537680392871 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.35, + "acc_stderr": 0.047937248544110196, + "acc_norm": 0.35, + "acc_norm_stderr": 0.047937248544110196 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.3741935483870968, + "acc_stderr": 0.02752890429984578, + "acc_norm": 0.3741935483870968, + "acc_norm_stderr": 0.02752890429984578 + }, 
+ "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.22167487684729065, + "acc_stderr": 0.029225575892489593, + "acc_norm": 0.22167487684729065, + "acc_norm_stderr": 0.029225575892489593 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.42, + "acc_stderr": 0.049604496374885836, + "acc_norm": 0.42, + "acc_norm_stderr": 0.049604496374885836 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.4727272727272727, + "acc_stderr": 0.0389853160557942, + "acc_norm": 0.4727272727272727, + "acc_norm_stderr": 0.0389853160557942 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.4292929292929293, + "acc_stderr": 0.035265527246011986, + "acc_norm": 0.4292929292929293, + "acc_norm_stderr": 0.035265527246011986 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.5440414507772021, + "acc_stderr": 0.03594413711272438, + "acc_norm": 0.5440414507772021, + "acc_norm_stderr": 0.03594413711272438 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.3871794871794872, + "acc_stderr": 0.024697216930878944, + "acc_norm": 0.3871794871794872, + "acc_norm_stderr": 0.024697216930878944 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.25555555555555554, + "acc_stderr": 0.026593939101844065, + "acc_norm": 0.25555555555555554, + "acc_norm_stderr": 0.026593939101844065 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.3403361344537815, + "acc_stderr": 0.030778057422931673, + "acc_norm": 0.3403361344537815, + "acc_norm_stderr": 0.030778057422931673 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.2781456953642384, + "acc_stderr": 0.03658603262763743, + "acc_norm": 0.2781456953642384, + "acc_norm_stderr": 0.03658603262763743 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.48807339449541287, + "acc_stderr": 0.021431223617362227, + "acc_norm": 0.48807339449541287, + "acc_norm_stderr": 0.021431223617362227 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.3101851851851852, + "acc_stderr": 0.031546962856566295, + "acc_norm": 0.3101851851851852, + "acc_norm_stderr": 0.031546962856566295 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.46078431372549017, + "acc_stderr": 0.03498501649369527, + "acc_norm": 0.46078431372549017, + "acc_norm_stderr": 0.03498501649369527 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.48523206751054854, + "acc_stderr": 0.032533028078777386, + "acc_norm": 0.48523206751054854, + "acc_norm_stderr": 0.032533028078777386 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.484304932735426, + "acc_stderr": 0.0335412657542081, + "acc_norm": 0.484304932735426, + "acc_norm_stderr": 0.0335412657542081 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.3893129770992366, + "acc_stderr": 0.04276486542814591, + "acc_norm": 0.3893129770992366, + "acc_norm_stderr": 0.04276486542814591 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.5537190082644629, + "acc_stderr": 0.04537935177947879, + "acc_norm": 0.5537190082644629, + "acc_norm_stderr": 0.04537935177947879 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.4444444444444444, + "acc_stderr": 0.04803752235190193, + "acc_norm": 0.4444444444444444, + "acc_norm_stderr": 0.04803752235190193 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.4539877300613497, + "acc_stderr": 0.0391170190467718, + "acc_norm": 0.4539877300613497, + 
"acc_norm_stderr": 0.0391170190467718 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.24107142857142858, + "acc_stderr": 0.040598672469526864, + "acc_norm": 0.24107142857142858, + "acc_norm_stderr": 0.040598672469526864 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.39805825242718446, + "acc_stderr": 0.04846748253977238, + "acc_norm": 0.39805825242718446, + "acc_norm_stderr": 0.04846748253977238 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.594017094017094, + "acc_stderr": 0.03217180182641086, + "acc_norm": 0.594017094017094, + "acc_norm_stderr": 0.03217180182641086 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.45, + "acc_stderr": 0.05, + "acc_norm": 0.45, + "acc_norm_stderr": 0.05 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.5185185185185185, + "acc_stderr": 0.017867695938429774, + "acc_norm": 0.5185185185185185, + "acc_norm_stderr": 0.017867695938429774 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.430635838150289, + "acc_stderr": 0.026658800273672376, + "acc_norm": 0.430635838150289, + "acc_norm_stderr": 0.026658800273672376 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.22793296089385476, + "acc_stderr": 0.014030149950805097, + "acc_norm": 0.22793296089385476, + "acc_norm_stderr": 0.014030149950805097 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.4084967320261438, + "acc_stderr": 0.028146405993096358, + "acc_norm": 0.4084967320261438, + "acc_norm_stderr": 0.028146405993096358 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.42443729903536975, + "acc_stderr": 0.028071928247946205, + "acc_norm": 0.42443729903536975, + "acc_norm_stderr": 0.028071928247946205 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.45987654320987653, + "acc_stderr": 0.02773102275353927, + "acc_norm": 0.45987654320987653, + "acc_norm_stderr": 0.02773102275353927 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.3262411347517731, + "acc_stderr": 0.027968453043563164, + "acc_norm": 0.3262411347517731, + "acc_norm_stderr": 0.027968453043563164 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.31421121251629724, + "acc_stderr": 0.011855911587048224, + "acc_norm": 0.31421121251629724, + "acc_norm_stderr": 0.011855911587048224 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.4411764705882353, + "acc_stderr": 0.030161911930767102, + "acc_norm": 0.4411764705882353, + "acc_norm_stderr": 0.030161911930767102 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.4068627450980392, + "acc_stderr": 0.019873802005061173, + "acc_norm": 0.4068627450980392, + "acc_norm_stderr": 0.019873802005061173 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.4, + "acc_stderr": 0.0469237132203465, + "acc_norm": 0.4, + "acc_norm_stderr": 0.0469237132203465 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.39591836734693875, + "acc_stderr": 0.03130802899065686, + "acc_norm": 0.39591836734693875, + "acc_norm_stderr": 0.03130802899065686 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.48756218905472637, + "acc_stderr": 0.0353443984853958, + "acc_norm": 0.48756218905472637, + "acc_norm_stderr": 0.0353443984853958 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.6, + "acc_stderr": 0.049236596391733084, + "acc_norm": 0.6, + "acc_norm_stderr": 0.049236596391733084 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.3132530120481928, + "acc_stderr": 0.036108050180310235, + "acc_norm": 
0.3132530120481928, + "acc_norm_stderr": 0.036108050180310235 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.5614035087719298, + "acc_stderr": 0.0380579750559046, + "acc_norm": 0.5614035087719298, + "acc_norm_stderr": 0.0380579750559046 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.2998776009791922, + "mc1_stderr": 0.01604035296671363, + "mc2": 0.4573591687927265, + "mc2_stderr": 0.01493808311431925 + }, + "all": { + "acc": 0.3951394895709494, + "acc_stderr": 0.034772184950245706, + "acc_norm": 0.39894918734207496, + "acc_norm_stderr": 0.034761647624425136, + "mc1": 0.2998776009791922, + "mc1_stderr": 0.01604035296671363, + "mc2": 0.4573591687927265, + "mc2_stderr": 0.01493808311431925 + } + }, + "versions": { + "harness|arc:challenge|25": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + 
"harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "all": 0 + }, + "config": { + "model_name": "jphme/orca_mini_v2_ger_7b", + "model_sha": "175965f50907c6a8cd40f1a4b10d28342969c066", + "model_dtype": "torch.float16", + "lighteval_sha": "43cff840721bd0214adb4e29236a5e2ca1813937", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null + }, + "task_config": { + "harness|arc:challenge": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + 
"harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task" + }, + "hashes": { + "harness|arc:challenge|25": { + "hash_examples": "fb8c51b1872daeda", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "61571bf68d6d89aa", + "hash_cont_tokens": "8210decc6ff6f7df" + }, + "harness|hellaswag|10": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "29906669b1c7054a", + "hash_cont_tokens": "b3b9e9017afa63af" + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "c54ff61ad0273dd7", + "hash_cont_tokens": "50421e30bef398f9" + }, + "harness|hendrycksTest-anatomy|5": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "be31a1e22aef5f90", + "hash_cont_tokens": "f11971a765cb609f" + }, + "harness|hendrycksTest-astronomy|5": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "277a7b1fad566940", + "hash_cont_tokens": "bf30e5d3f48250cb" + }, + "harness|hendrycksTest-business_ethics|5": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "ba552605bc116de5", + "hash_cont_tokens": "bc1dd9b2d995eb61" + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "428c7563d0b98ab9", + "hash_cont_tokens": "890a119624b3b935" + }, + "harness|hendrycksTest-college_biology|5": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "da036601573942e2", + "hash_cont_tokens": "875cde3af7a0ee14" + }, + "harness|hendrycksTest-college_chemistry|5": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "94e0196d6aded13d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "harness|hendrycksTest-college_computer_science|5": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "6e4d0f4a8d36690b", + "hash_cont_tokens": "ffc0fe414cdc4a83" + }, + "harness|hendrycksTest-college_mathematics|5": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "614054d17109a25d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "harness|hendrycksTest-college_medicine|5": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "1d633b3cc0524ba8", + "hash_cont_tokens": "1f88b00d41957d82" + }, + "harness|hendrycksTest-college_physics|5": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": 
"5421d9a1af86cbd4", + "hash_cont_tokens": "f7b8097afc16a47c" + }, + "harness|hendrycksTest-computer_security|5": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "5e6b70ecb333cf18", + "hash_cont_tokens": "50421e30bef398f9" + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "c2ef11a87264ceed", + "hash_cont_tokens": "aa0e8bc655f2f641" + }, + "harness|hendrycksTest-econometrics|5": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "ecaccd912a4c3978", + "hash_cont_tokens": "bfb7e3c3c88313f1" + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "1590c84291399be8", + "hash_cont_tokens": "2425a3f084a591ef" + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "3269597f715b0da1", + "hash_cont_tokens": "f52691aef15a407b" + }, + "harness|hendrycksTest-formal_logic|5": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "a2800d20f3ab8d7c", + "hash_cont_tokens": "f515d598d9c21263" + }, + "harness|hendrycksTest-global_facts|5": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "94ed44b3772505ad", + "hash_cont_tokens": "50421e30bef398f9" + }, + "harness|hendrycksTest-high_school_biology|5": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "24423acb928db768", + "hash_cont_tokens": "bd85a4156a3613ee" + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "831ff35c474e5cef", + "hash_cont_tokens": "a95c97af1c14e068" + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "8c34e0f2bda77358", + "hash_cont_tokens": "8abfedef914e33c9" + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "f1f73dd687da18d7", + "hash_cont_tokens": "674fc454bdc5ac93" + }, + "harness|hendrycksTest-high_school_geography|5": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "7c5547c7da5bc793", + "hash_cont_tokens": "03a5012b916274ea" + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "f62991cb6a496b05", + "hash_cont_tokens": "a83effb8f76b7d7c" + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "4cef2aff6e3d59ed", + "hash_cont_tokens": "c583432ad27fcfe0" + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "6e2577ea4082ed2b", + "hash_cont_tokens": "24f5dc613660300b" + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": 
"bfc393381298609e", + "hash_input_tokens": "c5fc9aeb1079c8e4", + "hash_cont_tokens": "f47f041de50333b9" + }, + "harness|hendrycksTest-high_school_physics|5": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "555fc385cffa84ca", + "hash_cont_tokens": "ba2efcd283e938cc" + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "febd23cbf9973b7f", + "hash_cont_tokens": "942069cd363844d9" + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "424b02981230ee83", + "hash_cont_tokens": "955ed42b6f7fa019" + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "50c9ff438c85a69e", + "hash_cont_tokens": "cdd0b3dc06d933e5" + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "054824cc474caef5", + "hash_cont_tokens": "9a864184946033ac" + }, + "harness|hendrycksTest-human_aging|5": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "541a75f071dcf579", + "hash_cont_tokens": "142a4a8a1138a214" + }, + "harness|hendrycksTest-human_sexuality|5": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "04269e5c5a257dd9", + "hash_cont_tokens": "bc54813e809b796d" + }, + "harness|hendrycksTest-international_law|5": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "d93ba9d9d38e4397", + "hash_cont_tokens": "dc45b45fcda18e5d" + }, + "harness|hendrycksTest-jurisprudence|5": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "9eeaccd2698b4f5a", + "hash_cont_tokens": "e3a8cd951b6e3469" + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "b4f08f544f2b7576", + "hash_cont_tokens": "1e80dbd30f6453d5" + }, + "harness|hendrycksTest-machine_learning|5": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "900c2a51f1174b9f", + "hash_cont_tokens": "9b37da7777378ca9" + }, + "harness|hendrycksTest-management|5": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "6b36efb4689c6eca", + "hash_cont_tokens": "a01d6d39a83c4597" + }, + "harness|hendrycksTest-marketing|5": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "2aaac78a0cfed47a", + "hash_cont_tokens": "6aeaed4d823c98aa" + }, + "harness|hendrycksTest-medical_genetics|5": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "886ca823b41c094a", + "hash_cont_tokens": "50421e30bef398f9" + }, + "harness|hendrycksTest-miscellaneous|5": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "72fd71de7675e7d0", + "hash_cont_tokens": "9b0ab02a64603081" + }, + "harness|hendrycksTest-moral_disputes|5": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": 
"f3ca0dd8e7a1eb09", + "hash_cont_tokens": "8badf768f7b0467a" + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "3e793631e951f23c", + "hash_cont_tokens": "32ae620376b2bbba" + }, + "harness|hendrycksTest-nutrition|5": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "59753c2144ea93af", + "hash_cont_tokens": "3071def75bacc404" + }, + "harness|hendrycksTest-philosophy|5": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "bd8d3dbed15a8c34", + "hash_cont_tokens": "9f6ff69d23a48783" + }, + "harness|hendrycksTest-prehistory|5": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "3573cd87facbb7c5", + "hash_cont_tokens": "de469d2b981e32a3" + }, + "harness|hendrycksTest-professional_accounting|5": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "17e721bc1a7cbb47", + "hash_cont_tokens": "c46f74d2dfc7b13b" + }, + "harness|hendrycksTest-professional_law|5": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "9178e10bd0763ec4", + "hash_cont_tokens": "2e590029ef41fbcd" + }, + "harness|hendrycksTest-professional_medicine|5": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "f5a22012a54f70ea", + "hash_cont_tokens": "fe35cfa9c6ca802e" + }, + "harness|hendrycksTest-professional_psychology|5": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "0dfb73a8eb3f692c", + "hash_cont_tokens": "f020fbddf72c8652" + }, + "harness|hendrycksTest-public_relations|5": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "1710c6ba4c9f3cbd", + "hash_cont_tokens": "568f585a259965c1" + }, + "harness|hendrycksTest-security_studies|5": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "d49711415961ced7", + "hash_cont_tokens": "cc6fd7cccd64cd5d" + }, + "harness|hendrycksTest-sociology|5": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "828999f7624cbe7e", + "hash_cont_tokens": "c3a3bdfd177eed5b" + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "42054621e718dbee", + "hash_cont_tokens": "2568d0e8e36fa959" + }, + "harness|hendrycksTest-virology|5": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "6c4f0aa4dc859c04", + "hash_cont_tokens": "926cf60b0891f374" + }, + "harness|hendrycksTest-world_religions|5": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "6c75d44e092ff24f", + "hash_cont_tokens": "c525a5de974c1ea3" + }, + "harness|truthfulqa:mc|0": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "2738d7ed7075faa7", + "hash_cont_tokens": "c014154380b74b9e" + } + } +} \ No newline at end of file diff --git a/eval-results/jphme/orca_mini_v2_ger_7b/results_2023-09-17T20-22-22.461526.json b/eval-results/jphme/orca_mini_v2_ger_7b/results_2023-09-17T20-22-22.461526.json new file mode 100644 index 
0000000000000000000000000000000000000000..d3478db1e58c12f1e008e0dee7a2dbbc41bc591a --- /dev/null +++ b/eval-results/jphme/orca_mini_v2_ger_7b/results_2023-09-17T20-22-22.461526.json @@ -0,0 +1,107 @@ +{ + "config_general": { + "model_name": "jphme/orca_mini_v2_ger_7b", + "model_sha": "799677173fb506c6307123a3686af4e65cc2f38f", + "model_size": "12.58 GB", + "model_dtype": "torch.float16", + "lighteval_sha": "0f318ecf002208468154899217b3ba7c6ae09374", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|drop|3": { + "em": 0.05180369127516778, + "em_stderr": 0.002269703538491734, + "f1": 0.10419043624161092, + "f1_stderr": 0.0025209765448865502 + }, + "harness|gsm8k|5": { + "acc": 0.04169825625473844, + "acc_stderr": 0.005506205058175767 + }, + "harness|winogrande|5": { + "acc": 0.7158642462509865, + "acc_stderr": 0.012675392786772722 + }, + "all": { + "em": 0.05180369127516778, + "em_stderr": 0.002269703538491734, + "f1": 0.10419043624161092, + "f1_stderr": 0.0025209765448865502, + "acc": 0.3787812512528625, + "acc_stderr": 0.009090798922474245 + } + }, + "versions": { + "harness|drop|3": 1, + "harness|gsm8k|5": 0, + "harness|winogrande|5": 0, + "all": 0 + }, + "config_tasks": { + "harness|drop": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|drop|3": { + "hashes": { + "hash_examples": "1d27416e8324e9a3", + "hash_full_prompts": "a5513ff9a741b385", + "hash_input_tokens": "61b608e0b5ceed76", + "hash_cont_tokens": "de2e62368a199410" + }, + "truncated": 1263, + "non-truncated": 8273, + "padded": 0, + "non-padded": 9536, + "effective_few_shots": 3.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "bda342e47b5099b2", + "hash_cont_tokens": "106a435525cbb671" + }, + "truncated": 0, + "non-truncated": 1319, + "padded": 0, + "non-padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "c0bedf98cb040854", + "hash_cont_tokens": "f08975ad6f2d5864" + }, + "truncated": 0, + "non-truncated": 2534, + "padded": 2432, + "non-padded": 102, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "9b4d8993161e637d", + "hash_full_prompts": "08215e527b7e60a5", + "hash_input_tokens": "80afe720f936f8d2", + "hash_cont_tokens": "47d9961f20ce6ea5" + }, + "total_evaluation_time_secondes": "24324.262102127075", + "truncated": 1263, + "non-truncated": 12126, + "padded": 2432, + "non-padded": 10957, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/jslin09/bloom-560m-finetuned-fraud/results_2023-08-17T18-20-24.088120.json b/eval-results/jslin09/bloom-560m-finetuned-fraud/results_2023-08-17T18-20-24.088120.json new file mode 100644 index 0000000000000000000000000000000000000000..522c933575f3ddbf06b3b62febd2a6e9360fedb4 --- /dev/null +++ b/eval-results/jslin09/bloom-560m-finetuned-fraud/results_2023-08-17T18-20-24.088120.json @@ -0,0 +1,1365 @@ +{ + "results": { + "harness|arc:challenge|25": { + "acc": 0.22866894197952217, + "acc_stderr": 0.012272853582540802, + "acc_norm": 0.2696245733788396, + "acc_norm_stderr": 0.012968040686869166 
+ }, + "harness|hellaswag|10": { + "acc": 0.26289583748257317, + "acc_stderr": 0.0043930667609168245, + "acc_norm": 0.28868751244771956, + "acc_norm_stderr": 0.0045222621281770055 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.23, + "acc_stderr": 0.042295258468165065, + "acc_norm": 0.23, + "acc_norm_stderr": 0.042295258468165065 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.2074074074074074, + "acc_stderr": 0.03502553170678316, + "acc_norm": 0.2074074074074074, + "acc_norm_stderr": 0.03502553170678316 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.17763157894736842, + "acc_stderr": 0.031103182383123398, + "acc_norm": 0.17763157894736842, + "acc_norm_stderr": 0.031103182383123398 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.31, + "acc_stderr": 0.04648231987117316, + "acc_norm": 0.31, + "acc_norm_stderr": 0.04648231987117316 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.2188679245283019, + "acc_stderr": 0.02544786382510863, + "acc_norm": 0.2188679245283019, + "acc_norm_stderr": 0.02544786382510863 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.2847222222222222, + "acc_stderr": 0.037738099906869334, + "acc_norm": 0.2847222222222222, + "acc_norm_stderr": 0.037738099906869334 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.2, + "acc_stderr": 0.04020151261036845, + "acc_norm": 0.2, + "acc_norm_stderr": 0.04020151261036845 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.21, + "acc_stderr": 0.040936018074033256, + "acc_norm": 0.21, + "acc_norm_stderr": 0.040936018074033256 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.26, + "acc_stderr": 0.04408440022768078, + "acc_norm": 0.26, + "acc_norm_stderr": 0.04408440022768078 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.21965317919075145, + "acc_stderr": 0.03156809362703173, + "acc_norm": 0.21965317919075145, + "acc_norm_stderr": 0.03156809362703173 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.21568627450980393, + "acc_stderr": 0.04092563958237654, + "acc_norm": 0.21568627450980393, + "acc_norm_stderr": 0.04092563958237654 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.27, + "acc_stderr": 0.044619604333847394, + "acc_norm": 0.27, + "acc_norm_stderr": 0.044619604333847394 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.26382978723404255, + "acc_stderr": 0.028809989854102973, + "acc_norm": 0.26382978723404255, + "acc_norm_stderr": 0.028809989854102973 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.21052631578947367, + "acc_stderr": 0.038351539543994194, + "acc_norm": 0.21052631578947367, + "acc_norm_stderr": 0.038351539543994194 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.2482758620689655, + "acc_stderr": 0.036001056927277716, + "acc_norm": 0.2482758620689655, + "acc_norm_stderr": 0.036001056927277716 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.20899470899470898, + "acc_stderr": 0.02094048156533485, + "acc_norm": 0.20899470899470898, + "acc_norm_stderr": 0.02094048156533485 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.2619047619047619, + "acc_stderr": 0.039325376803928724, + "acc_norm": 0.2619047619047619, + "acc_norm_stderr": 0.039325376803928724 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.18, + "acc_stderr": 0.038612291966536934, + "acc_norm": 0.18, + "acc_norm_stderr": 0.038612291966536934 + }, + 
"harness|hendrycksTest-high_school_biology|5": { + "acc": 0.1774193548387097, + "acc_stderr": 0.02173254068932927, + "acc_norm": 0.1774193548387097, + "acc_norm_stderr": 0.02173254068932927 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.1625615763546798, + "acc_stderr": 0.025960300064605576, + "acc_norm": 0.1625615763546798, + "acc_norm_stderr": 0.025960300064605576 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.26, + "acc_stderr": 0.04408440022768079, + "acc_norm": 0.26, + "acc_norm_stderr": 0.04408440022768079 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.23030303030303031, + "acc_stderr": 0.03287666758603488, + "acc_norm": 0.23030303030303031, + "acc_norm_stderr": 0.03287666758603488 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.2676767676767677, + "acc_stderr": 0.031544498882702866, + "acc_norm": 0.2676767676767677, + "acc_norm_stderr": 0.031544498882702866 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.19689119170984457, + "acc_stderr": 0.028697873971860664, + "acc_norm": 0.19689119170984457, + "acc_norm_stderr": 0.028697873971860664 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.2512820512820513, + "acc_stderr": 0.02199201666237056, + "acc_norm": 0.2512820512820513, + "acc_norm_stderr": 0.02199201666237056 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.24814814814814815, + "acc_stderr": 0.0263357394040558, + "acc_norm": 0.24814814814814815, + "acc_norm_stderr": 0.0263357394040558 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.20168067226890757, + "acc_stderr": 0.02606431340630452, + "acc_norm": 0.20168067226890757, + "acc_norm_stderr": 0.02606431340630452 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.2185430463576159, + "acc_stderr": 0.033742355504256936, + "acc_norm": 0.2185430463576159, + "acc_norm_stderr": 0.033742355504256936 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.1853211009174312, + "acc_stderr": 0.01665927970029583, + "acc_norm": 0.1853211009174312, + "acc_norm_stderr": 0.01665927970029583 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.21296296296296297, + "acc_stderr": 0.027920963147993676, + "acc_norm": 0.21296296296296297, + "acc_norm_stderr": 0.027920963147993676 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.24509803921568626, + "acc_stderr": 0.030190282453501947, + "acc_norm": 0.24509803921568626, + "acc_norm_stderr": 0.030190282453501947 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.25316455696202533, + "acc_stderr": 0.028304657943035282, + "acc_norm": 0.25316455696202533, + "acc_norm_stderr": 0.028304657943035282 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.3094170403587444, + "acc_stderr": 0.031024411740572203, + "acc_norm": 0.3094170403587444, + "acc_norm_stderr": 0.031024411740572203 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.2595419847328244, + "acc_stderr": 0.03844876139785271, + "acc_norm": 0.2595419847328244, + "acc_norm_stderr": 0.03844876139785271 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.2396694214876033, + "acc_stderr": 0.03896878985070416, + "acc_norm": 0.2396694214876033, + "acc_norm_stderr": 0.03896878985070416 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.25925925925925924, + "acc_stderr": 0.042365112580946336, + "acc_norm": 0.25925925925925924, + 
"acc_norm_stderr": 0.042365112580946336 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.2392638036809816, + "acc_stderr": 0.03351953879521269, + "acc_norm": 0.2392638036809816, + "acc_norm_stderr": 0.03351953879521269 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.3125, + "acc_stderr": 0.043994650575715215, + "acc_norm": 0.3125, + "acc_norm_stderr": 0.043994650575715215 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.17475728155339806, + "acc_stderr": 0.037601780060266224, + "acc_norm": 0.17475728155339806, + "acc_norm_stderr": 0.037601780060266224 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.2905982905982906, + "acc_stderr": 0.02974504857267404, + "acc_norm": 0.2905982905982906, + "acc_norm_stderr": 0.02974504857267404 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.31, + "acc_stderr": 0.04648231987117316, + "acc_norm": 0.31, + "acc_norm_stderr": 0.04648231987117316 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.2413793103448276, + "acc_stderr": 0.015302380123542085, + "acc_norm": 0.2413793103448276, + "acc_norm_stderr": 0.015302380123542085 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.24855491329479767, + "acc_stderr": 0.023267528432100174, + "acc_norm": 0.24855491329479767, + "acc_norm_stderr": 0.023267528432100174 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.23798882681564246, + "acc_stderr": 0.014242630070574915, + "acc_norm": 0.23798882681564246, + "acc_norm_stderr": 0.014242630070574915 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.2581699346405229, + "acc_stderr": 0.025058503316958147, + "acc_norm": 0.2581699346405229, + "acc_norm_stderr": 0.025058503316958147 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.1864951768488746, + "acc_stderr": 0.02212243977248077, + "acc_norm": 0.1864951768488746, + "acc_norm_stderr": 0.02212243977248077 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.21604938271604937, + "acc_stderr": 0.022899162918445806, + "acc_norm": 0.21604938271604937, + "acc_norm_stderr": 0.022899162918445806 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.2375886524822695, + "acc_stderr": 0.025389512552729906, + "acc_norm": 0.2375886524822695, + "acc_norm_stderr": 0.025389512552729906 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.24511082138200782, + "acc_stderr": 0.010986307870045517, + "acc_norm": 0.24511082138200782, + "acc_norm_stderr": 0.010986307870045517 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.30514705882352944, + "acc_stderr": 0.027971541370170595, + "acc_norm": 0.30514705882352944, + "acc_norm_stderr": 0.027971541370170595 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.25, + "acc_stderr": 0.01751781884501444, + "acc_norm": 0.25, + "acc_norm_stderr": 0.01751781884501444 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.2545454545454545, + "acc_stderr": 0.041723430387053825, + "acc_norm": 0.2545454545454545, + "acc_norm_stderr": 0.041723430387053825 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.20408163265306123, + "acc_stderr": 0.025801283475090503, + "acc_norm": 0.20408163265306123, + "acc_norm_stderr": 0.025801283475090503 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.24378109452736318, + "acc_stderr": 0.03036049015401465, + "acc_norm": 0.24378109452736318, + "acc_norm_stderr": 0.03036049015401465 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.28, + "acc_stderr": 
0.04512608598542128, + "acc_norm": 0.28, + "acc_norm_stderr": 0.04512608598542128 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.3072289156626506, + "acc_stderr": 0.035915667978246635, + "acc_norm": 0.3072289156626506, + "acc_norm_stderr": 0.035915667978246635 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.2982456140350877, + "acc_stderr": 0.03508771929824565, + "acc_norm": 0.2982456140350877, + "acc_norm_stderr": 0.03508771929824565 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.2141982864137087, + "mc1_stderr": 0.014362148155690478, + "mc2": NaN, + "mc2_stderr": NaN + }, + "all": { + "acc": 0.24049985036781363, + "acc_stderr": 0.031121406563736853, + "acc_norm": 0.24163116064517742, + "acc_norm_stderr": 0.031135379148001067, + "mc1": 0.2141982864137087, + "mc1_stderr": 0.014362148155690478, + "mc2": NaN, + "mc2_stderr": NaN + } + }, + "versions": { + "harness|arc:challenge|25": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + 
"harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "all": 0 + }, + "config_general": { + "model_name": "jslin09/bloom-560m-finetuned-fraud", + "model_sha": "5571f87f557b909e863005c6e3870bc2e77341a7", + "model_dtype": "torch.float16", + "lighteval_sha": "8bab069fee0c6e75ffa4c1ef8a9591c28ee0e049", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null + }, + "config_tasks": { + "harness|arc:challenge": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM 
Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task" + }, + "summary_tasks": { + "harness|arc:challenge|25": { + "hashes": { + "hash_examples": "17b0cae357c0259e", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "35156985b5b768ac", + "hash_cont_tokens": "eb79e0ee08afd9c5" + }, + "truncated": 0, + "non-truncated": 4687, + "padded": 4665, + "non-padded": 22, + "effective_few_shots": 25.0, + "num_truncated_few_shots": 0 + }, + "harness|hellaswag|10": { + "hashes": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "802113368d0f1a61", + "hash_cont_tokens": "aa1808c68f0c2e9e" + }, + "truncated": 0, + "non-truncated": 40168, + "padded": 40120, + "non-padded": 48, + "effective_few_shots": 10.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hashes": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "1c87c27110cefaac", + "hash_cont_tokens": "ff3cb34539bb5f2b" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-anatomy|5": { + "hashes": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "13f665d423f8fb55", + "hash_cont_tokens": "a00708302b3b327f" + }, + "truncated": 0, + "non-truncated": 540, + "padded": 540, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-astronomy|5": { + "hashes": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "3163a7c233d03aa0", + "hash_cont_tokens": "afca3891736a057c" + }, + "truncated": 0, + "non-truncated": 608, + "padded": 608, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-business_ethics|5": { + "hashes": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "fcff53892cac8af7", + "hash_cont_tokens": "623c2802bff933bd" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hashes": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "25f9d69a12a3d6fb", + "hash_cont_tokens": "60ad657330ea2669" + }, + "truncated": 0, + "non-truncated": 1060, + "padded": 1060, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + 
"harness|hendrycksTest-college_biology|5": { + "hashes": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "53210da63551cfa9", + "hash_cont_tokens": "ff8f8d720a714978" + }, + "truncated": 0, + "non-truncated": 576, + "padded": 572, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_chemistry|5": { + "hashes": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "a8a101eecc3c7ae4", + "hash_cont_tokens": "06c8db5079b94b18" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_computer_science|5": { + "hashes": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "f5fab1f263660b00", + "hash_cont_tokens": "5c5503282d075007" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_mathematics|5": { + "hashes": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "984b14d4503f8332", + "hash_cont_tokens": "31a3cad8f65201ce" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_medicine|5": { + "hashes": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "93a35d459b1816f2", + "hash_cont_tokens": "5f3d7568e997541d" + }, + "truncated": 0, + "non-truncated": 692, + "padded": 684, + "non-padded": 8, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_physics|5": { + "hashes": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "13ae5cfcd25132dc", + "hash_cont_tokens": "cb71c9206ba66d39" + }, + "truncated": 0, + "non-truncated": 408, + "padded": 408, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-computer_security|5": { + "hashes": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "0d4319aeeced7337", + "hash_cont_tokens": "ff3cb34539bb5f2b" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hashes": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "a284181382c79188", + "hash_cont_tokens": "cedbf5e586ec0b01" + }, + "truncated": 0, + "non-truncated": 940, + "padded": 940, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-econometrics|5": { + "hashes": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "552a88b73073790e", + "hash_cont_tokens": "a8d41562d030b943" + }, + "truncated": 0, + "non-truncated": 456, + "padded": 456, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hashes": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + 
"hash_input_tokens": "527861508133911b", + "hash_cont_tokens": "745f338915acaec1" + }, + "truncated": 0, + "non-truncated": 580, + "padded": 580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hashes": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "840360a71e722f2f", + "hash_cont_tokens": "644b3e35adfdda03" + }, + "truncated": 0, + "non-truncated": 1512, + "padded": 1500, + "non-padded": 12, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-formal_logic|5": { + "hashes": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "4983fa5c8766c5c3", + "hash_cont_tokens": "b1d1befd309fb284" + }, + "truncated": 0, + "non-truncated": 504, + "padded": 504, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-global_facts|5": { + "hashes": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "f8a39d91e1a5f312", + "hash_cont_tokens": "ff3cb34539bb5f2b" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_biology|5": { + "hashes": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "34cc1cdbbd5aea32", + "hash_cont_tokens": "95493e0017e65777" + }, + "truncated": 0, + "non-truncated": 1240, + "padded": 1240, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hashes": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "b6fab2a0a58c488c", + "hash_cont_tokens": "a0e0e297040d4db0" + }, + "truncated": 0, + "non-truncated": 812, + "padded": 792, + "non-padded": 20, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hashes": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "d41cd4a4ab99eb7b", + "hash_cont_tokens": "f43af0d07377a42f" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hashes": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "1ebccad4a595b975", + "hash_cont_tokens": "6cbba303f6f147d6" + }, + "truncated": 0, + "non-truncated": 660, + "padded": 656, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_geography|5": { + "hashes": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "de5bf0d293a94631", + "hash_cont_tokens": "4f8383d2638c80b7" + }, + "truncated": 0, + "non-truncated": 792, + "padded": 792, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hashes": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "144bb2cb47e5f030", + "hash_cont_tokens": "6541cee4c2ca2d3a" + }, + "truncated": 
0, + "non-truncated": 772, + "padded": 769, + "non-padded": 3, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hashes": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "c6a09f0272f5bace", + "hash_cont_tokens": "d417bab46172bb21" + }, + "truncated": 0, + "non-truncated": 1560, + "padded": 1560, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hashes": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "2024f3f2d057a290", + "hash_cont_tokens": "58c97e5a823a3277" + }, + "truncated": 0, + "non-truncated": 1080, + "padded": 1064, + "non-padded": 16, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hashes": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "68c5fe2cee06f876", + "hash_cont_tokens": "c502355d3ae25590" + }, + "truncated": 0, + "non-truncated": 952, + "padded": 952, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_physics|5": { + "hashes": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "20a2a1ae249b0427", + "hash_cont_tokens": "5ad557521091a307" + }, + "truncated": 0, + "non-truncated": 604, + "padded": 604, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hashes": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "ef86d0e501611b56", + "hash_cont_tokens": "b322ff20aeb4639a" + }, + "truncated": 0, + "non-truncated": 2180, + "padded": 2180, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hashes": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "c2a12d76b5c5b16e", + "hash_cont_tokens": "cb3c1184c996a0fd" + }, + "truncated": 0, + "non-truncated": 864, + "padded": 864, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hashes": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "f4d8cc6ff1561ffb", + "hash_cont_tokens": "02c8e41eeaaf0fb6" + }, + "truncated": 0, + "non-truncated": 816, + "padded": 816, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hashes": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "781c9de2e152f1f7", + "hash_cont_tokens": "67762569c3ca674b" + }, + "truncated": 0, + "non-truncated": 948, + "padded": 948, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_aging|5": { + "hashes": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "ca19e8c8dea6a330", + "hash_cont_tokens": "d7d143f4200a154b" + }, + "truncated": 0, + "non-truncated": 892, + "padded": 892, + "non-padded": 0, + "effective_few_shots": 5.0, + 
"num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_sexuality|5": { + "hashes": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "be219a621e44079f", + "hash_cont_tokens": "e250079c7cb1e43a" + }, + "truncated": 0, + "non-truncated": 524, + "padded": 524, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-international_law|5": { + "hashes": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "ef2b37dd79b408c3", + "hash_cont_tokens": "7c12f7710db74df2" + }, + "truncated": 0, + "non-truncated": 484, + "padded": 484, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-jurisprudence|5": { + "hashes": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "4034ec807e309e30", + "hash_cont_tokens": "005a8757e3cea1a0" + }, + "truncated": 0, + "non-truncated": 432, + "padded": 432, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hashes": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "a5d2fb9bcabf25bc", + "hash_cont_tokens": "da13f7058d48bfb8" + }, + "truncated": 0, + "non-truncated": 652, + "padded": 648, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-machine_learning|5": { + "hashes": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "45b6f1bd5dfc50f9", + "hash_cont_tokens": "c5577e2f23277e8d" + }, + "truncated": 0, + "non-truncated": 448, + "padded": 448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-management|5": { + "hashes": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "1deb3fa175f7db94", + "hash_cont_tokens": "ffb9bfe9d93f2805" + }, + "truncated": 0, + "non-truncated": 412, + "padded": 412, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-marketing|5": { + "hashes": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "abb0936d70e4898e", + "hash_cont_tokens": "d1937472945127ac" + }, + "truncated": 0, + "non-truncated": 936, + "padded": 936, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-medical_genetics|5": { + "hashes": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "8b358fa6f9360089", + "hash_cont_tokens": "ff3cb34539bb5f2b" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-miscellaneous|5": { + "hashes": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "7d5760ded8c8b193", + "hash_cont_tokens": "ec3426e0c715eba0" + }, + "truncated": 0, + "non-truncated": 3132, + "padded": 3132, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_disputes|5": { + "hashes": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + 
"hash_input_tokens": "9fff95292ccc387e", + "hash_cont_tokens": "3c698eb5d068ae91" + }, + "truncated": 0, + "non-truncated": 1384, + "padded": 1348, + "non-padded": 36, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hashes": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "dc4d8d7eb5d86c76", + "hash_cont_tokens": "931a0dc420ffe5d2" + }, + "truncated": 0, + "non-truncated": 3580, + "padded": 3580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-nutrition|5": { + "hashes": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "c1eebfd25d5b3b21", + "hash_cont_tokens": "e71b28eed09ee0db" + }, + "truncated": 0, + "non-truncated": 1224, + "padded": 1224, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-philosophy|5": { + "hashes": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "b681c1548971cbd5", + "hash_cont_tokens": "86081de9fe78d617" + }, + "truncated": 0, + "non-truncated": 1244, + "padded": 1244, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-prehistory|5": { + "hashes": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "22271d3665ffc490", + "hash_cont_tokens": "d19da4ee78c53502" + }, + "truncated": 0, + "non-truncated": 1296, + "padded": 1296, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_accounting|5": { + "hashes": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "3ee14b573131427a", + "hash_cont_tokens": "48e35b1976b2a5f3" + }, + "truncated": 0, + "non-truncated": 1128, + "padded": 1128, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_law|5": { + "hashes": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "68008bded9a55e2d", + "hash_cont_tokens": "24274a72dafedc6d" + }, + "truncated": 0, + "non-truncated": 6136, + "padded": 6136, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_medicine|5": { + "hashes": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "314bcd23b7e0cd0e", + "hash_cont_tokens": "1e361f2e4210e370" + }, + "truncated": 0, + "non-truncated": 1088, + "padded": 1088, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_psychology|5": { + "hashes": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "36aa8145c6ec38cc", + "hash_cont_tokens": "c7537c3a213e806a" + }, + "truncated": 0, + "non-truncated": 2448, + "padded": 2448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-public_relations|5": { + "hashes": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "392fc1a6a1ad66ac", + "hash_cont_tokens": "3e87cc044c17ee28" + }, + "truncated": 0, + "non-truncated": 440, + "padded": 
440, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-security_studies|5": { + "hashes": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "8f0bd8aaa62f7905", + "hash_cont_tokens": "74153f80677eefee" + }, + "truncated": 0, + "non-truncated": 980, + "padded": 980, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-sociology|5": { + "hashes": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "2f2f613b67ddc7af", + "hash_cont_tokens": "a6d31805e7a3e5be" + }, + "truncated": 0, + "non-truncated": 804, + "padded": 788, + "non-padded": 16, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hashes": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "afaa8bd0a88549df", + "hash_cont_tokens": "ff3cb34539bb5f2b" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-virology|5": { + "hashes": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "0b70128defa17577", + "hash_cont_tokens": "cff0acf2094548e5" + }, + "truncated": 0, + "non-truncated": 664, + "padded": 664, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-world_religions|5": { + "hashes": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "274b72144201a976", + "hash_cont_tokens": "a6a664cc2f4c4875" + }, + "truncated": 0, + "non-truncated": 684, + "padded": 684, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|truthfulqa:mc|0": { + "hashes": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "af91ab67a7877659", + "hash_cont_tokens": "0e161fac767f71bd" + }, + "truncated": 0, + "non-truncated": 9996, + "padded": 9996, + "non-padded": 0, + "effective_few_shots": 0.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "d84d18e9a963753d", + "hash_full_prompts": "12b540783521a8e6", + "hash_input_tokens": "cdfe8c24f95ba6c6", + "hash_cont_tokens": "7321f751ca655ff8" + }, + "total_evaluation_time_secondes": "1536.9232699871063", + "truncated": 0, + "non-truncated": 111019, + "padded": 110826, + "non-padded": 193, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/jslin09/bloom-560m-finetuned-fraud/results_2023-09-17T09-10-48.065151.json b/eval-results/jslin09/bloom-560m-finetuned-fraud/results_2023-09-17T09-10-48.065151.json new file mode 100644 index 0000000000000000000000000000000000000000..e303b25069ae1f113ac9cdaf1dc3a52ea1b42ba5 --- /dev/null +++ b/eval-results/jslin09/bloom-560m-finetuned-fraud/results_2023-09-17T09-10-48.065151.json @@ -0,0 +1,107 @@ +{ + "config_general": { + "model_name": "jslin09/bloom-560m-finetuned-fraud", + "model_sha": "fea91fefb89bd616c410100997fc63749272a77a", + "model_size": "1.04 GB", + "model_dtype": "torch.float16", + "lighteval_sha": "0f318ecf002208468154899217b3ba7c6ae09374", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" 
+ }, + "results": { + "harness|drop|3": { + "em": 0.0026216442953020135, + "em_stderr": 0.0005236685642965815, + "f1": 0.0032707634228187916, + "f1_stderr": 0.0005552444547661462 + }, + "harness|gsm8k|5": { + "acc": 0.0, + "acc_stderr": 0.0 + }, + "harness|winogrande|5": { + "acc": 0.48382004735595896, + "acc_stderr": 0.014045126130978601 + }, + "all": { + "em": 0.0026216442953020135, + "em_stderr": 0.0005236685642965815, + "f1": 0.0032707634228187916, + "f1_stderr": 0.0005552444547661462, + "acc": 0.24191002367797948, + "acc_stderr": 0.0070225630654893005 + } + }, + "versions": { + "harness|drop|3": 1, + "harness|gsm8k|5": 0, + "harness|winogrande|5": 0, + "all": 0 + }, + "config_tasks": { + "harness|drop": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|drop|3": { + "hashes": { + "hash_examples": "1d27416e8324e9a3", + "hash_full_prompts": "a5513ff9a741b385", + "hash_input_tokens": "caecfb00a2a35349", + "hash_cont_tokens": "c43a08219cf96493" + }, + "truncated": 0, + "non-truncated": 9536, + "padded": 0, + "non-padded": 9536, + "effective_few_shots": 3.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "427483208f27a631", + "hash_cont_tokens": "36bdb59c98d6ad22" + }, + "truncated": 0, + "non-truncated": 1319, + "padded": 0, + "non-padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "08911ae783a65566", + "hash_cont_tokens": "45c5d4a60b277db2" + }, + "truncated": 0, + "non-truncated": 2534, + "padded": 2355, + "non-padded": 179, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "9b4d8993161e637d", + "hash_full_prompts": "08215e527b7e60a5", + "hash_input_tokens": "5b93bc2ffb1a417d", + "hash_cont_tokens": "0d41ec2eca27fdca" + }, + "total_evaluation_time_secondes": "10947.60011601448", + "truncated": 0, + "non-truncated": 13389, + "padded": 2355, + "non-padded": 11034, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/kevinpro/Vicuna-13B-CoT/results_2023-07-19T18-33-25.891730.json b/eval-results/kevinpro/Vicuna-13B-CoT/results_2023-07-19T18-33-25.891730.json new file mode 100644 index 0000000000000000000000000000000000000000..ba0c298e0d98fcf48132c1bcdaae9d01d328d211 --- /dev/null +++ b/eval-results/kevinpro/Vicuna-13B-CoT/results_2023-07-19T18-33-25.891730.json @@ -0,0 +1,871 @@ +{ + "results": { + "harness|arc:challenge|25": { + "acc": 0.5196245733788396, + "acc_stderr": 0.014600132075947094, + "acc_norm": 0.5273037542662116, + "acc_norm_stderr": 0.014589589101985996 + }, + "harness|hellaswag|10": { + "acc": 0.6007767377016531, + "acc_stderr": 0.004887378682406532, + "acc_norm": 0.8013343955387373, + "acc_norm_stderr": 0.003981802822377587 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.28, + "acc_stderr": 0.045126085985421296, + "acc_norm": 0.28, + "acc_norm_stderr": 0.045126085985421296 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.4888888888888889, + "acc_stderr": 0.04318275491977976, + "acc_norm": 0.4888888888888889, + "acc_norm_stderr": 0.04318275491977976 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.506578947368421, + "acc_stderr": 
0.040685900502249704, + "acc_norm": 0.506578947368421, + "acc_norm_stderr": 0.040685900502249704 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.54, + "acc_stderr": 0.05009082659620332, + "acc_norm": 0.54, + "acc_norm_stderr": 0.05009082659620332 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.4981132075471698, + "acc_stderr": 0.030772653642075664, + "acc_norm": 0.4981132075471698, + "acc_norm_stderr": 0.030772653642075664 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.5972222222222222, + "acc_stderr": 0.04101405519842426, + "acc_norm": 0.5972222222222222, + "acc_norm_stderr": 0.04101405519842426 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.38, + "acc_stderr": 0.04878317312145633, + "acc_norm": 0.38, + "acc_norm_stderr": 0.04878317312145633 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.46, + "acc_stderr": 0.05009082659620332, + "acc_norm": 0.46, + "acc_norm_stderr": 0.05009082659620332 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.31, + "acc_stderr": 0.04648231987117316, + "acc_norm": 0.31, + "acc_norm_stderr": 0.04648231987117316 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.4161849710982659, + "acc_stderr": 0.03758517775404948, + "acc_norm": 0.4161849710982659, + "acc_norm_stderr": 0.03758517775404948 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.27450980392156865, + "acc_stderr": 0.044405219061793254, + "acc_norm": 0.27450980392156865, + "acc_norm_stderr": 0.044405219061793254 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.64, + "acc_stderr": 0.04824181513244218, + "acc_norm": 0.64, + "acc_norm_stderr": 0.04824181513244218 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.39574468085106385, + "acc_stderr": 0.031967586978353627, + "acc_norm": 0.39574468085106385, + "acc_norm_stderr": 0.031967586978353627 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.30701754385964913, + "acc_stderr": 0.04339138322579861, + "acc_norm": 0.30701754385964913, + "acc_norm_stderr": 0.04339138322579861 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.47586206896551725, + "acc_stderr": 0.041618085035015295, + "acc_norm": 0.47586206896551725, + "acc_norm_stderr": 0.041618085035015295 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.3386243386243386, + "acc_stderr": 0.02437319786798306, + "acc_norm": 0.3386243386243386, + "acc_norm_stderr": 0.02437319786798306 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.40476190476190477, + "acc_stderr": 0.04390259265377562, + "acc_norm": 0.40476190476190477, + "acc_norm_stderr": 0.04390259265377562 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.26, + "acc_stderr": 0.044084400227680794, + "acc_norm": 0.26, + "acc_norm_stderr": 0.044084400227680794 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.567741935483871, + "acc_stderr": 0.028181739720019416, + "acc_norm": 0.567741935483871, + "acc_norm_stderr": 0.028181739720019416 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.4039408866995074, + "acc_stderr": 0.03452453903822039, + "acc_norm": 0.4039408866995074, + "acc_norm_stderr": 0.03452453903822039 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.54, + "acc_stderr": 0.05009082659620332, + "acc_norm": 0.54, + "acc_norm_stderr": 0.05009082659620332 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.6666666666666666, + 
"acc_stderr": 0.0368105086916155, + "acc_norm": 0.6666666666666666, + "acc_norm_stderr": 0.0368105086916155 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.6565656565656566, + "acc_stderr": 0.03383201223244441, + "acc_norm": 0.6565656565656566, + "acc_norm_stderr": 0.03383201223244441 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.694300518134715, + "acc_stderr": 0.033248379397581594, + "acc_norm": 0.694300518134715, + "acc_norm_stderr": 0.033248379397581594 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.4717948717948718, + "acc_stderr": 0.0253106392549339, + "acc_norm": 0.4717948717948718, + "acc_norm_stderr": 0.0253106392549339 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.3, + "acc_stderr": 0.027940457136228416, + "acc_norm": 0.3, + "acc_norm_stderr": 0.027940457136228416 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.4495798319327731, + "acc_stderr": 0.03231293497137707, + "acc_norm": 0.4495798319327731, + "acc_norm_stderr": 0.03231293497137707 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.31125827814569534, + "acc_stderr": 0.03780445850526733, + "acc_norm": 0.31125827814569534, + "acc_norm_stderr": 0.03780445850526733 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.6862385321100918, + "acc_stderr": 0.019894723341469116, + "acc_norm": 0.6862385321100918, + "acc_norm_stderr": 0.019894723341469116 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.37037037037037035, + "acc_stderr": 0.03293377139415191, + "acc_norm": 0.37037037037037035, + "acc_norm_stderr": 0.03293377139415191 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.6862745098039216, + "acc_stderr": 0.03256685484460388, + "acc_norm": 0.6862745098039216, + "acc_norm_stderr": 0.03256685484460388 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.7088607594936709, + "acc_stderr": 0.02957160106575337, + "acc_norm": 0.7088607594936709, + "acc_norm_stderr": 0.02957160106575337 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.5874439461883408, + "acc_stderr": 0.03304062175449297, + "acc_norm": 0.5874439461883408, + "acc_norm_stderr": 0.03304062175449297 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.6717557251908397, + "acc_stderr": 0.04118438565806298, + "acc_norm": 0.6717557251908397, + "acc_norm_stderr": 0.04118438565806298 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.6859504132231405, + "acc_stderr": 0.042369647530410184, + "acc_norm": 0.6859504132231405, + "acc_norm_stderr": 0.042369647530410184 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.6203703703703703, + "acc_stderr": 0.04691521224077742, + "acc_norm": 0.6203703703703703, + "acc_norm_stderr": 0.04691521224077742 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.6380368098159509, + "acc_stderr": 0.037757007291414416, + "acc_norm": 0.6380368098159509, + "acc_norm_stderr": 0.037757007291414416 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.39285714285714285, + "acc_stderr": 0.04635550135609976, + "acc_norm": 0.39285714285714285, + "acc_norm_stderr": 0.04635550135609976 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.6990291262135923, + "acc_stderr": 0.04541609446503947, + "acc_norm": 0.6990291262135923, + "acc_norm_stderr": 0.04541609446503947 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.7521367521367521, + "acc_stderr": 
0.028286324075564386, + "acc_norm": 0.7521367521367521, + "acc_norm_stderr": 0.028286324075564386 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.59, + "acc_stderr": 0.049431107042371025, + "acc_norm": 0.59, + "acc_norm_stderr": 0.049431107042371025 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.6922094508301405, + "acc_stderr": 0.016506045045155637, + "acc_norm": 0.6922094508301405, + "acc_norm_stderr": 0.016506045045155637 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.546242774566474, + "acc_stderr": 0.026803720583206177, + "acc_norm": 0.546242774566474, + "acc_norm_stderr": 0.026803720583206177 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.3307262569832402, + "acc_stderr": 0.01573502625896612, + "acc_norm": 0.3307262569832402, + "acc_norm_stderr": 0.01573502625896612 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.5555555555555556, + "acc_stderr": 0.02845263998508801, + "acc_norm": 0.5555555555555556, + "acc_norm_stderr": 0.02845263998508801 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.5241157556270096, + "acc_stderr": 0.028365041542564577, + "acc_norm": 0.5241157556270096, + "acc_norm_stderr": 0.028365041542564577 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.5462962962962963, + "acc_stderr": 0.0277012284685426, + "acc_norm": 0.5462962962962963, + "acc_norm_stderr": 0.0277012284685426 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.39361702127659576, + "acc_stderr": 0.029144544781596154, + "acc_norm": 0.39361702127659576, + "acc_norm_stderr": 0.029144544781596154 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.4165580182529335, + "acc_stderr": 0.012591153245057383, + "acc_norm": 0.4165580182529335, + "acc_norm_stderr": 0.012591153245057383 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.4889705882352941, + "acc_stderr": 0.030365446477275675, + "acc_norm": 0.4889705882352941, + "acc_norm_stderr": 0.030365446477275675 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.5212418300653595, + "acc_stderr": 0.020209572388600248, + "acc_norm": 0.5212418300653595, + "acc_norm_stderr": 0.020209572388600248 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.5545454545454546, + "acc_stderr": 0.047605488214603246, + "acc_norm": 0.5545454545454546, + "acc_norm_stderr": 0.047605488214603246 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.6244897959183674, + "acc_stderr": 0.03100120903989484, + "acc_norm": 0.6244897959183674, + "acc_norm_stderr": 0.03100120903989484 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.7711442786069652, + "acc_stderr": 0.029705284056772436, + "acc_norm": 0.7711442786069652, + "acc_norm_stderr": 0.029705284056772436 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.76, + "acc_stderr": 0.04292346959909281, + "acc_norm": 0.76, + "acc_norm_stderr": 0.04292346959909281 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.4397590361445783, + "acc_stderr": 0.03864139923699121, + "acc_norm": 0.4397590361445783, + "acc_norm_stderr": 0.03864139923699121 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.7134502923976608, + "acc_stderr": 0.03467826685703826, + "acc_norm": 0.7134502923976608, + "acc_norm_stderr": 0.03467826685703826 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.3623011015911873, + "mc1_stderr": 0.016826646897262255, + "mc2": 0.5207836984948891, + "mc2_stderr": 0.01580678689190342 + }, + "all": { + "acc": 0.5207458541981249, + 
"acc_stderr": 0.03494058387309796, + "acc_norm": 0.5242752921426072, + "acc_norm_stderr": 0.03492505643523372, + "mc1": 0.3623011015911873, + "mc1_stderr": 0.016826646897262255, + "mc2": 0.5207836984948891, + "mc2_stderr": 0.01580678689190342 + } + }, + "versions": { + "harness|arc:challenge|25": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "all": 0 + }, + "config": { + "model_name": "kevinpro/Vicuna-13B-CoT", + "model_sha": "346e3c46959cf9f1e03feffa761afe020c0fb6a8", + "model_dtype": "torch.float16", + "lighteval_sha": "43cff840721bd0214adb4e29236a5e2ca1813937", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": 
null + }, + "task_config": { + "harness|arc:challenge": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + 
"harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task" + }, + "hashes": { + "harness|arc:challenge|25": { + "hash_examples": "fb8c51b1872daeda", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "61571bf68d6d89aa", + "hash_cont_tokens": "8210decc6ff6f7df" + }, + "harness|hellaswag|10": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "29906669b1c7054a", + "hash_cont_tokens": "b3b9e9017afa63af" + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "c54ff61ad0273dd7", + "hash_cont_tokens": "50421e30bef398f9" + }, + "harness|hendrycksTest-anatomy|5": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "be31a1e22aef5f90", + "hash_cont_tokens": "f11971a765cb609f" + }, + "harness|hendrycksTest-astronomy|5": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "277a7b1fad566940", + "hash_cont_tokens": "bf30e5d3f48250cb" + }, + "harness|hendrycksTest-business_ethics|5": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "ba552605bc116de5", + "hash_cont_tokens": "bc1dd9b2d995eb61" + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "428c7563d0b98ab9", + "hash_cont_tokens": "890a119624b3b935" + }, + "harness|hendrycksTest-college_biology|5": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "da036601573942e2", + "hash_cont_tokens": "875cde3af7a0ee14" + }, + "harness|hendrycksTest-college_chemistry|5": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "94e0196d6aded13d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "harness|hendrycksTest-college_computer_science|5": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "6e4d0f4a8d36690b", + "hash_cont_tokens": "ffc0fe414cdc4a83" + }, + "harness|hendrycksTest-college_mathematics|5": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "614054d17109a25d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "harness|hendrycksTest-college_medicine|5": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "1d633b3cc0524ba8", + "hash_cont_tokens": "1f88b00d41957d82" + }, + "harness|hendrycksTest-college_physics|5": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "5421d9a1af86cbd4", + "hash_cont_tokens": "f7b8097afc16a47c" + }, + "harness|hendrycksTest-computer_security|5": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "5e6b70ecb333cf18", + "hash_cont_tokens": "50421e30bef398f9" + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "c2ef11a87264ceed", + "hash_cont_tokens": "aa0e8bc655f2f641" + }, + 
"harness|hendrycksTest-econometrics|5": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "ecaccd912a4c3978", + "hash_cont_tokens": "bfb7e3c3c88313f1" + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "1590c84291399be8", + "hash_cont_tokens": "2425a3f084a591ef" + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "3269597f715b0da1", + "hash_cont_tokens": "f52691aef15a407b" + }, + "harness|hendrycksTest-formal_logic|5": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "a2800d20f3ab8d7c", + "hash_cont_tokens": "f515d598d9c21263" + }, + "harness|hendrycksTest-global_facts|5": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "94ed44b3772505ad", + "hash_cont_tokens": "50421e30bef398f9" + }, + "harness|hendrycksTest-high_school_biology|5": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "24423acb928db768", + "hash_cont_tokens": "bd85a4156a3613ee" + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "831ff35c474e5cef", + "hash_cont_tokens": "a95c97af1c14e068" + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "8c34e0f2bda77358", + "hash_cont_tokens": "8abfedef914e33c9" + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "f1f73dd687da18d7", + "hash_cont_tokens": "674fc454bdc5ac93" + }, + "harness|hendrycksTest-high_school_geography|5": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "7c5547c7da5bc793", + "hash_cont_tokens": "03a5012b916274ea" + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "f62991cb6a496b05", + "hash_cont_tokens": "a83effb8f76b7d7c" + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "4cef2aff6e3d59ed", + "hash_cont_tokens": "c583432ad27fcfe0" + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "6e2577ea4082ed2b", + "hash_cont_tokens": "24f5dc613660300b" + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "c5fc9aeb1079c8e4", + "hash_cont_tokens": "f47f041de50333b9" + }, + "harness|hendrycksTest-high_school_physics|5": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "555fc385cffa84ca", + "hash_cont_tokens": "ba2efcd283e938cc" + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "febd23cbf9973b7f", + 
"hash_cont_tokens": "942069cd363844d9" + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "424b02981230ee83", + "hash_cont_tokens": "955ed42b6f7fa019" + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "50c9ff438c85a69e", + "hash_cont_tokens": "cdd0b3dc06d933e5" + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "054824cc474caef5", + "hash_cont_tokens": "9a864184946033ac" + }, + "harness|hendrycksTest-human_aging|5": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "541a75f071dcf579", + "hash_cont_tokens": "142a4a8a1138a214" + }, + "harness|hendrycksTest-human_sexuality|5": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "04269e5c5a257dd9", + "hash_cont_tokens": "bc54813e809b796d" + }, + "harness|hendrycksTest-international_law|5": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "d93ba9d9d38e4397", + "hash_cont_tokens": "dc45b45fcda18e5d" + }, + "harness|hendrycksTest-jurisprudence|5": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "9eeaccd2698b4f5a", + "hash_cont_tokens": "e3a8cd951b6e3469" + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "b4f08f544f2b7576", + "hash_cont_tokens": "1e80dbd30f6453d5" + }, + "harness|hendrycksTest-machine_learning|5": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "900c2a51f1174b9f", + "hash_cont_tokens": "9b37da7777378ca9" + }, + "harness|hendrycksTest-management|5": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "6b36efb4689c6eca", + "hash_cont_tokens": "a01d6d39a83c4597" + }, + "harness|hendrycksTest-marketing|5": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "2aaac78a0cfed47a", + "hash_cont_tokens": "6aeaed4d823c98aa" + }, + "harness|hendrycksTest-medical_genetics|5": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "886ca823b41c094a", + "hash_cont_tokens": "50421e30bef398f9" + }, + "harness|hendrycksTest-miscellaneous|5": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "72fd71de7675e7d0", + "hash_cont_tokens": "9b0ab02a64603081" + }, + "harness|hendrycksTest-moral_disputes|5": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "f3ca0dd8e7a1eb09", + "hash_cont_tokens": "8badf768f7b0467a" + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "3e793631e951f23c", + "hash_cont_tokens": "32ae620376b2bbba" + }, + "harness|hendrycksTest-nutrition|5": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "59753c2144ea93af", + "hash_cont_tokens": "3071def75bacc404" + }, + 
"harness|hendrycksTest-philosophy|5": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "bd8d3dbed15a8c34", + "hash_cont_tokens": "9f6ff69d23a48783" + }, + "harness|hendrycksTest-prehistory|5": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "3573cd87facbb7c5", + "hash_cont_tokens": "de469d2b981e32a3" + }, + "harness|hendrycksTest-professional_accounting|5": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "17e721bc1a7cbb47", + "hash_cont_tokens": "c46f74d2dfc7b13b" + }, + "harness|hendrycksTest-professional_law|5": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "9178e10bd0763ec4", + "hash_cont_tokens": "2e590029ef41fbcd" + }, + "harness|hendrycksTest-professional_medicine|5": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "f5a22012a54f70ea", + "hash_cont_tokens": "fe35cfa9c6ca802e" + }, + "harness|hendrycksTest-professional_psychology|5": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "0dfb73a8eb3f692c", + "hash_cont_tokens": "f020fbddf72c8652" + }, + "harness|hendrycksTest-public_relations|5": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "1710c6ba4c9f3cbd", + "hash_cont_tokens": "568f585a259965c1" + }, + "harness|hendrycksTest-security_studies|5": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "d49711415961ced7", + "hash_cont_tokens": "cc6fd7cccd64cd5d" + }, + "harness|hendrycksTest-sociology|5": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "828999f7624cbe7e", + "hash_cont_tokens": "c3a3bdfd177eed5b" + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "42054621e718dbee", + "hash_cont_tokens": "2568d0e8e36fa959" + }, + "harness|hendrycksTest-virology|5": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "6c4f0aa4dc859c04", + "hash_cont_tokens": "926cf60b0891f374" + }, + "harness|hendrycksTest-world_religions|5": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "6c75d44e092ff24f", + "hash_cont_tokens": "c525a5de974c1ea3" + }, + "harness|truthfulqa:mc|0": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "2738d7ed7075faa7", + "hash_cont_tokens": "c014154380b74b9e" + } + } +} \ No newline at end of file diff --git a/eval-results/kevinpro/Vicuna-13B-CoT/results_2023-09-17T13-31-22.626797.json b/eval-results/kevinpro/Vicuna-13B-CoT/results_2023-09-17T13-31-22.626797.json new file mode 100644 index 0000000000000000000000000000000000000000..bc9f95d27f31dea323e3f57c4a736356bc4b5219 --- /dev/null +++ b/eval-results/kevinpro/Vicuna-13B-CoT/results_2023-09-17T13-31-22.626797.json @@ -0,0 +1,107 @@ +{ + "config_general": { + "model_name": "kevinpro/Vicuna-13B-CoT", + "model_sha": "346e3c46959cf9f1e03feffa761afe020c0fb6a8", + "model_size": "24.28 GB", + "model_dtype": "torch.float16", + "lighteval_sha": "0f318ecf002208468154899217b3ba7c6ae09374", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, 
+ "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|drop|3": { + "em": 0.029677013422818792, + "em_stderr": 0.0017378324714143493, + "f1": 0.09310612416107406, + "f1_stderr": 0.002167792401176146 + }, + "harness|gsm8k|5": { + "acc": 0.08642911296436695, + "acc_stderr": 0.00774004433710381 + }, + "harness|winogrande|5": { + "acc": 0.7419100236779794, + "acc_stderr": 0.012298278833972384 + }, + "all": { + "em": 0.029677013422818792, + "em_stderr": 0.0017378324714143493, + "f1": 0.09310612416107406, + "f1_stderr": 0.002167792401176146, + "acc": 0.4141695683211732, + "acc_stderr": 0.010019161585538096 + } + }, + "versions": { + "harness|drop|3": 1, + "harness|gsm8k|5": 0, + "harness|winogrande|5": 0, + "all": 0 + }, + "config_tasks": { + "harness|drop": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|drop|3": { + "hashes": { + "hash_examples": "1d27416e8324e9a3", + "hash_full_prompts": "a5513ff9a741b385", + "hash_input_tokens": "61b608e0b5ceed76", + "hash_cont_tokens": "ac752e2682fcf21e" + }, + "truncated": 1263, + "non-truncated": 8273, + "padded": 0, + "non-padded": 9536, + "effective_few_shots": 3.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "bda342e47b5099b2", + "hash_cont_tokens": "6a30e0a9abfde216" + }, + "truncated": 0, + "non-truncated": 1319, + "padded": 0, + "non-padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "c0bedf98cb040854", + "hash_cont_tokens": "f08975ad6f2d5864" + }, + "truncated": 0, + "non-truncated": 2534, + "padded": 2432, + "non-padded": 102, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "9b4d8993161e637d", + "hash_full_prompts": "08215e527b7e60a5", + "hash_input_tokens": "80afe720f936f8d2", + "hash_cont_tokens": "3120c9f83854444f" + }, + "total_evaluation_time_secondes": "12804.959372282028", + "truncated": 1263, + "non-truncated": 12126, + "padded": 2432, + "non-padded": 10957, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/kfkas/Llama-2-ko-7b-Chat/results_2023-07-27T10-54-54.901743.json b/eval-results/kfkas/Llama-2-ko-7b-Chat/results_2023-07-27T10-54-54.901743.json new file mode 100644 index 0000000000000000000000000000000000000000..4f6d8712443d8d47f98db36b2e625bbb6af62061 --- /dev/null +++ b/eval-results/kfkas/Llama-2-ko-7b-Chat/results_2023-07-27T10-54-54.901743.json @@ -0,0 +1,1365 @@ +{ + "results": { + "harness|arc:challenge|25": { + "acc": 0.36689419795221845, + "acc_stderr": 0.01408413311810429, + "acc_norm": 0.4044368600682594, + "acc_norm_stderr": 0.014342036483436175 + }, + "harness|hellaswag|10": { + "acc": 0.49083847839075884, + "acc_stderr": 0.004988943721711207, + "acc_norm": 0.6715793666600279, + "acc_norm_stderr": 0.004686789042445377 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.27, + "acc_stderr": 0.044619604333847415, + "acc_norm": 0.27, + "acc_norm_stderr": 0.044619604333847415 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.3851851851851852, + "acc_stderr": 0.04203921040156279, + "acc_norm": 0.3851851851851852, + "acc_norm_stderr": 0.04203921040156279 + }, + 
"harness|hendrycksTest-astronomy|5": { + "acc": 0.27631578947368424, + "acc_stderr": 0.03639057569952925, + "acc_norm": 0.27631578947368424, + "acc_norm_stderr": 0.03639057569952925 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.29, + "acc_stderr": 0.04560480215720683, + "acc_norm": 0.29, + "acc_norm_stderr": 0.04560480215720683 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.3169811320754717, + "acc_stderr": 0.02863723563980092, + "acc_norm": 0.3169811320754717, + "acc_norm_stderr": 0.02863723563980092 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.22916666666666666, + "acc_stderr": 0.035146974678623884, + "acc_norm": 0.22916666666666666, + "acc_norm_stderr": 0.035146974678623884 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.23, + "acc_stderr": 0.04229525846816505, + "acc_norm": 0.23, + "acc_norm_stderr": 0.04229525846816505 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.26, + "acc_stderr": 0.0440844002276808, + "acc_norm": 0.26, + "acc_norm_stderr": 0.0440844002276808 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.25, + "acc_stderr": 0.04351941398892446, + "acc_norm": 0.25, + "acc_norm_stderr": 0.04351941398892446 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.26011560693641617, + "acc_stderr": 0.03345036916788991, + "acc_norm": 0.26011560693641617, + "acc_norm_stderr": 0.03345036916788991 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.19607843137254902, + "acc_stderr": 0.03950581861179961, + "acc_norm": 0.19607843137254902, + "acc_norm_stderr": 0.03950581861179961 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.33, + "acc_stderr": 0.04725815626252605, + "acc_norm": 0.33, + "acc_norm_stderr": 0.04725815626252605 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.33617021276595743, + "acc_stderr": 0.030881618520676942, + "acc_norm": 0.33617021276595743, + "acc_norm_stderr": 0.030881618520676942 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.30701754385964913, + "acc_stderr": 0.0433913832257986, + "acc_norm": 0.30701754385964913, + "acc_norm_stderr": 0.0433913832257986 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.23448275862068965, + "acc_stderr": 0.035306258743465914, + "acc_norm": 0.23448275862068965, + "acc_norm_stderr": 0.035306258743465914 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.2566137566137566, + "acc_stderr": 0.022494510767503154, + "acc_norm": 0.2566137566137566, + "acc_norm_stderr": 0.022494510767503154 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.18253968253968253, + "acc_stderr": 0.034550710191021496, + "acc_norm": 0.18253968253968253, + "acc_norm_stderr": 0.034550710191021496 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.33, + "acc_stderr": 0.047258156262526045, + "acc_norm": 0.33, + "acc_norm_stderr": 0.047258156262526045 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.3193548387096774, + "acc_stderr": 0.02652270967466777, + "acc_norm": 0.3193548387096774, + "acc_norm_stderr": 0.02652270967466777 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.2955665024630542, + "acc_stderr": 0.032104944337514575, + "acc_norm": 0.2955665024630542, + "acc_norm_stderr": 0.032104944337514575 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.24, + "acc_stderr": 0.042923469599092816, + "acc_norm": 0.24, + "acc_norm_stderr": 0.042923469599092816 + }, + 
"harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.2787878787878788, + "acc_stderr": 0.03501438706296781, + "acc_norm": 0.2787878787878788, + "acc_norm_stderr": 0.03501438706296781 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.35858585858585856, + "acc_stderr": 0.034169036403915214, + "acc_norm": 0.35858585858585856, + "acc_norm_stderr": 0.034169036403915214 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.27461139896373055, + "acc_stderr": 0.032210245080411544, + "acc_norm": 0.27461139896373055, + "acc_norm_stderr": 0.032210245080411544 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.258974358974359, + "acc_stderr": 0.022211106810061658, + "acc_norm": 0.258974358974359, + "acc_norm_stderr": 0.022211106810061658 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.25925925925925924, + "acc_stderr": 0.026719240783712177, + "acc_norm": 0.25925925925925924, + "acc_norm_stderr": 0.026719240783712177 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.3235294117647059, + "acc_stderr": 0.030388353551886838, + "acc_norm": 0.3235294117647059, + "acc_norm_stderr": 0.030388353551886838 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.2781456953642384, + "acc_stderr": 0.03658603262763744, + "acc_norm": 0.2781456953642384, + "acc_norm_stderr": 0.03658603262763744 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.3412844036697248, + "acc_stderr": 0.02032861281659244, + "acc_norm": 0.3412844036697248, + "acc_norm_stderr": 0.02032861281659244 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.2037037037037037, + "acc_stderr": 0.027467401804058014, + "acc_norm": 0.2037037037037037, + "acc_norm_stderr": 0.027467401804058014 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.22058823529411764, + "acc_stderr": 0.02910225438967408, + "acc_norm": 0.22058823529411764, + "acc_norm_stderr": 0.02910225438967408 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.3037974683544304, + "acc_stderr": 0.029936696387138594, + "acc_norm": 0.3037974683544304, + "acc_norm_stderr": 0.029936696387138594 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.3721973094170404, + "acc_stderr": 0.032443052830087304, + "acc_norm": 0.3721973094170404, + "acc_norm_stderr": 0.032443052830087304 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.31297709923664124, + "acc_stderr": 0.04066962905677697, + "acc_norm": 0.31297709923664124, + "acc_norm_stderr": 0.04066962905677697 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.3305785123966942, + "acc_stderr": 0.04294340845212094, + "acc_norm": 0.3305785123966942, + "acc_norm_stderr": 0.04294340845212094 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.37037037037037035, + "acc_stderr": 0.04668408033024931, + "acc_norm": 0.37037037037037035, + "acc_norm_stderr": 0.04668408033024931 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.26993865030674846, + "acc_stderr": 0.034878251684978906, + "acc_norm": 0.26993865030674846, + "acc_norm_stderr": 0.034878251684978906 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.3125, + "acc_stderr": 0.043994650575715215, + "acc_norm": 0.3125, + "acc_norm_stderr": 0.043994650575715215 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.3883495145631068, + "acc_stderr": 0.048257293373563895, + "acc_norm": 0.3883495145631068, + "acc_norm_stderr": 
0.048257293373563895 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.4230769230769231, + "acc_stderr": 0.032366121762202014, + "acc_norm": 0.4230769230769231, + "acc_norm_stderr": 0.032366121762202014 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.29, + "acc_stderr": 0.04560480215720684, + "acc_norm": 0.29, + "acc_norm_stderr": 0.04560480215720684 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.40102171136653897, + "acc_stderr": 0.017526133150124572, + "acc_norm": 0.40102171136653897, + "acc_norm_stderr": 0.017526133150124572 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.3092485549132948, + "acc_stderr": 0.024883140570071748, + "acc_norm": 0.3092485549132948, + "acc_norm_stderr": 0.024883140570071748 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.2424581005586592, + "acc_stderr": 0.014333522059217889, + "acc_norm": 0.2424581005586592, + "acc_norm_stderr": 0.014333522059217889 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.3660130718954248, + "acc_stderr": 0.027582811415159603, + "acc_norm": 0.3660130718954248, + "acc_norm_stderr": 0.027582811415159603 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.3408360128617363, + "acc_stderr": 0.02692084126077616, + "acc_norm": 0.3408360128617363, + "acc_norm_stderr": 0.02692084126077616 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.32098765432098764, + "acc_stderr": 0.025976566010862737, + "acc_norm": 0.32098765432098764, + "acc_norm_stderr": 0.025976566010862737 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.2730496453900709, + "acc_stderr": 0.026577860943307854, + "acc_norm": 0.2730496453900709, + "acc_norm_stderr": 0.026577860943307854 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.25097783572359844, + "acc_stderr": 0.011073730299187234, + "acc_norm": 0.25097783572359844, + "acc_norm_stderr": 0.011073730299187234 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.4338235294117647, + "acc_stderr": 0.030105636570016643, + "acc_norm": 0.4338235294117647, + "acc_norm_stderr": 0.030105636570016643 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.2973856209150327, + "acc_stderr": 0.018492596536396955, + "acc_norm": 0.2973856209150327, + "acc_norm_stderr": 0.018492596536396955 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.4, + "acc_stderr": 0.0469237132203465, + "acc_norm": 0.4, + "acc_norm_stderr": 0.0469237132203465 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.2979591836734694, + "acc_stderr": 0.029279567411065657, + "acc_norm": 0.2979591836734694, + "acc_norm_stderr": 0.029279567411065657 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.26865671641791045, + "acc_stderr": 0.03134328358208954, + "acc_norm": 0.26865671641791045, + "acc_norm_stderr": 0.03134328358208954 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.41, + "acc_stderr": 0.049431107042371025, + "acc_norm": 0.41, + "acc_norm_stderr": 0.049431107042371025 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.35542168674698793, + "acc_stderr": 0.03726214354322415, + "acc_norm": 0.35542168674698793, + "acc_norm_stderr": 0.03726214354322415 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.391812865497076, + "acc_stderr": 0.037439798259264, + "acc_norm": 0.391812865497076, + "acc_norm_stderr": 0.037439798259264 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.2141982864137087, + "mc1_stderr": 0.014362148155690466, + "mc2": 
0.3547558879371776, + "mc2_stderr": 0.013677330543053609 + }, + "all": { + "acc": 0.3082072716170763, + "acc_stderr": 0.0332912841968488, + "acc_norm": 0.3119069928100476, + "acc_norm_stderr": 0.03329053417457873, + "mc1": 0.2141982864137087, + "mc1_stderr": 0.014362148155690466, + "mc2": 0.3547558879371776, + "mc2_stderr": 0.013677330543053609 + } + }, + "versions": { + "harness|arc:challenge|25": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "all": 0 + }, + "config_general": { + "model_name": "kfkas/Llama-2-ko-7b-Chat", + "model_sha": "3293b98cd8204371988f898dafa9b5a297555cbe", + "model_dtype": "torch.float16", + "lighteval_sha": 
"03c2fad20ff7f5334c33cfee459024b8d7e4a109", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null + }, + "config_tasks": { + "harness|arc:challenge": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness 
task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task" + }, + "summary_tasks": { + "harness|arc:challenge|25": { + "hashes": { + "hash_examples": "17b0cae357c0259e", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "61571bf68d6d89aa", + "hash_cont_tokens": "8210decc6ff6f7df" + }, + "truncated": 0, + "non-truncated": 4687, + "padded": 4687, + "non-padded": 0, + "effective_few_shots": 25.0, + "num_truncated_few_shots": 0 + }, + "harness|hellaswag|10": { + "hashes": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "29906669b1c7054a", + "hash_cont_tokens": "b3b9e9017afa63af" + }, + "truncated": 0, + "non-truncated": 40168, + "padded": 40113, + "non-padded": 55, + "effective_few_shots": 10.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hashes": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "c54ff61ad0273dd7", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-anatomy|5": { + "hashes": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "be31a1e22aef5f90", + "hash_cont_tokens": "f11971a765cb609f" + }, + "truncated": 0, + "non-truncated": 540, + "padded": 540, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-astronomy|5": { + "hashes": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "277a7b1fad566940", + "hash_cont_tokens": "bf30e5d3f48250cb" + }, + "truncated": 0, + "non-truncated": 608, + "padded": 608, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-business_ethics|5": { + "hashes": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "ba552605bc116de5", + "hash_cont_tokens": "bc1dd9b2d995eb61" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hashes": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "428c7563d0b98ab9", + "hash_cont_tokens": "890a119624b3b935" + }, + "truncated": 0, + "non-truncated": 1060, + "padded": 1060, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_biology|5": { + "hashes": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "da036601573942e2", + "hash_cont_tokens": "875cde3af7a0ee14" + }, + "truncated": 0, + "non-truncated": 576, + "padded": 576, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_chemistry|5": { + "hashes": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": 
"242c9be6da583e95", + "hash_input_tokens": "94e0196d6aded13d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_computer_science|5": { + "hashes": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "6e4d0f4a8d36690b", + "hash_cont_tokens": "ffc0fe414cdc4a83" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_mathematics|5": { + "hashes": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "614054d17109a25d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_medicine|5": { + "hashes": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "1d633b3cc0524ba8", + "hash_cont_tokens": "1f88b00d41957d82" + }, + "truncated": 0, + "non-truncated": 692, + "padded": 692, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_physics|5": { + "hashes": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "5421d9a1af86cbd4", + "hash_cont_tokens": "f7b8097afc16a47c" + }, + "truncated": 0, + "non-truncated": 408, + "padded": 408, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-computer_security|5": { + "hashes": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "5e6b70ecb333cf18", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hashes": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "c2ef11a87264ceed", + "hash_cont_tokens": "aa0e8bc655f2f641" + }, + "truncated": 0, + "non-truncated": 940, + "padded": 940, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-econometrics|5": { + "hashes": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "ecaccd912a4c3978", + "hash_cont_tokens": "bfb7e3c3c88313f1" + }, + "truncated": 0, + "non-truncated": 456, + "padded": 456, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hashes": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "1590c84291399be8", + "hash_cont_tokens": "2425a3f084a591ef" + }, + "truncated": 0, + "non-truncated": 580, + "padded": 580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hashes": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "3269597f715b0da1", + "hash_cont_tokens": "f52691aef15a407b" + }, + "truncated": 0, + 
"non-truncated": 1512, + "padded": 1512, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-formal_logic|5": { + "hashes": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "a2800d20f3ab8d7c", + "hash_cont_tokens": "f515d598d9c21263" + }, + "truncated": 0, + "non-truncated": 504, + "padded": 504, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-global_facts|5": { + "hashes": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "94ed44b3772505ad", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_biology|5": { + "hashes": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "24423acb928db768", + "hash_cont_tokens": "bd85a4156a3613ee" + }, + "truncated": 0, + "non-truncated": 1240, + "padded": 1240, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hashes": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "831ff35c474e5cef", + "hash_cont_tokens": "a95c97af1c14e068" + }, + "truncated": 0, + "non-truncated": 812, + "padded": 812, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hashes": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "8c34e0f2bda77358", + "hash_cont_tokens": "8abfedef914e33c9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hashes": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "f1f73dd687da18d7", + "hash_cont_tokens": "674fc454bdc5ac93" + }, + "truncated": 660, + "non-truncated": 0, + "padded": 0, + "non-padded": 660, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_geography|5": { + "hashes": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "7c5547c7da5bc793", + "hash_cont_tokens": "03a5012b916274ea" + }, + "truncated": 0, + "non-truncated": 792, + "padded": 792, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hashes": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "f62991cb6a496b05", + "hash_cont_tokens": "a83effb8f76b7d7c" + }, + "truncated": 0, + "non-truncated": 772, + "padded": 772, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hashes": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "4cef2aff6e3d59ed", + "hash_cont_tokens": "c583432ad27fcfe0" + }, + "truncated": 0, + "non-truncated": 1560, + "padded": 1560, + "non-padded": 0, + "effective_few_shots": 5.0, + 
"num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hashes": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "6e2577ea4082ed2b", + "hash_cont_tokens": "24f5dc613660300b" + }, + "truncated": 0, + "non-truncated": 1080, + "padded": 1080, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hashes": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "c5fc9aeb1079c8e4", + "hash_cont_tokens": "f47f041de50333b9" + }, + "truncated": 0, + "non-truncated": 952, + "padded": 952, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_physics|5": { + "hashes": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "555fc385cffa84ca", + "hash_cont_tokens": "ba2efcd283e938cc" + }, + "truncated": 0, + "non-truncated": 604, + "padded": 604, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hashes": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "febd23cbf9973b7f", + "hash_cont_tokens": "942069cd363844d9" + }, + "truncated": 0, + "non-truncated": 2180, + "padded": 2180, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hashes": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "424b02981230ee83", + "hash_cont_tokens": "955ed42b6f7fa019" + }, + "truncated": 0, + "non-truncated": 864, + "padded": 864, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hashes": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "50c9ff438c85a69e", + "hash_cont_tokens": "cdd0b3dc06d933e5" + }, + "truncated": 816, + "non-truncated": 0, + "padded": 0, + "non-padded": 816, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hashes": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "054824cc474caef5", + "hash_cont_tokens": "9a864184946033ac" + }, + "truncated": 8, + "non-truncated": 940, + "padded": 940, + "non-padded": 8, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_aging|5": { + "hashes": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "541a75f071dcf579", + "hash_cont_tokens": "142a4a8a1138a214" + }, + "truncated": 0, + "non-truncated": 892, + "padded": 892, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_sexuality|5": { + "hashes": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "04269e5c5a257dd9", + "hash_cont_tokens": "bc54813e809b796d" + }, + "truncated": 0, + "non-truncated": 524, + "padded": 524, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-international_law|5": { + "hashes": { + "hash_examples": 
"1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "d93ba9d9d38e4397", + "hash_cont_tokens": "dc45b45fcda18e5d" + }, + "truncated": 0, + "non-truncated": 484, + "padded": 484, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-jurisprudence|5": { + "hashes": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "9eeaccd2698b4f5a", + "hash_cont_tokens": "e3a8cd951b6e3469" + }, + "truncated": 0, + "non-truncated": 432, + "padded": 432, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hashes": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "b4f08f544f2b7576", + "hash_cont_tokens": "1e80dbd30f6453d5" + }, + "truncated": 0, + "non-truncated": 652, + "padded": 648, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-machine_learning|5": { + "hashes": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "900c2a51f1174b9f", + "hash_cont_tokens": "9b37da7777378ca9" + }, + "truncated": 0, + "non-truncated": 448, + "padded": 448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-management|5": { + "hashes": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "6b36efb4689c6eca", + "hash_cont_tokens": "a01d6d39a83c4597" + }, + "truncated": 0, + "non-truncated": 412, + "padded": 412, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-marketing|5": { + "hashes": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "2aaac78a0cfed47a", + "hash_cont_tokens": "6aeaed4d823c98aa" + }, + "truncated": 0, + "non-truncated": 936, + "padded": 936, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-medical_genetics|5": { + "hashes": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "886ca823b41c094a", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-miscellaneous|5": { + "hashes": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "72fd71de7675e7d0", + "hash_cont_tokens": "9b0ab02a64603081" + }, + "truncated": 0, + "non-truncated": 3132, + "padded": 3132, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_disputes|5": { + "hashes": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "f3ca0dd8e7a1eb09", + "hash_cont_tokens": "8badf768f7b0467a" + }, + "truncated": 0, + "non-truncated": 1384, + "padded": 1354, + "non-padded": 30, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hashes": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "3e793631e951f23c", + "hash_cont_tokens": "32ae620376b2bbba" + }, + "truncated": 0, + 
"non-truncated": 3580, + "padded": 3580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-nutrition|5": { + "hashes": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "59753c2144ea93af", + "hash_cont_tokens": "3071def75bacc404" + }, + "truncated": 0, + "non-truncated": 1224, + "padded": 1224, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-philosophy|5": { + "hashes": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "bd8d3dbed15a8c34", + "hash_cont_tokens": "9f6ff69d23a48783" + }, + "truncated": 0, + "non-truncated": 1244, + "padded": 1244, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-prehistory|5": { + "hashes": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "3573cd87facbb7c5", + "hash_cont_tokens": "de469d2b981e32a3" + }, + "truncated": 0, + "non-truncated": 1296, + "padded": 1296, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_accounting|5": { + "hashes": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "17e721bc1a7cbb47", + "hash_cont_tokens": "c46f74d2dfc7b13b" + }, + "truncated": 0, + "non-truncated": 1128, + "padded": 1128, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_law|5": { + "hashes": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "9178e10bd0763ec4", + "hash_cont_tokens": "2e590029ef41fbcd" + }, + "truncated": 604, + "non-truncated": 5532, + "padded": 5524, + "non-padded": 612, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_medicine|5": { + "hashes": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "f5a22012a54f70ea", + "hash_cont_tokens": "fe35cfa9c6ca802e" + }, + "truncated": 0, + "non-truncated": 1088, + "padded": 1088, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_psychology|5": { + "hashes": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "0dfb73a8eb3f692c", + "hash_cont_tokens": "f020fbddf72c8652" + }, + "truncated": 0, + "non-truncated": 2448, + "padded": 2448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-public_relations|5": { + "hashes": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "1710c6ba4c9f3cbd", + "hash_cont_tokens": "568f585a259965c1" + }, + "truncated": 0, + "non-truncated": 440, + "padded": 440, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-security_studies|5": { + "hashes": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "d49711415961ced7", + "hash_cont_tokens": "cc6fd7cccd64cd5d" + }, + "truncated": 0, + "non-truncated": 980, + "padded": 980, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + 
"harness|hendrycksTest-sociology|5": { + "hashes": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "828999f7624cbe7e", + "hash_cont_tokens": "c3a3bdfd177eed5b" + }, + "truncated": 0, + "non-truncated": 804, + "padded": 804, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hashes": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "42054621e718dbee", + "hash_cont_tokens": "2568d0e8e36fa959" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-virology|5": { + "hashes": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "6c4f0aa4dc859c04", + "hash_cont_tokens": "926cf60b0891f374" + }, + "truncated": 0, + "non-truncated": 664, + "padded": 664, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-world_religions|5": { + "hashes": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "6c75d44e092ff24f", + "hash_cont_tokens": "c525a5de974c1ea3" + }, + "truncated": 0, + "non-truncated": 684, + "padded": 684, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|truthfulqa:mc|0": { + "hashes": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "2738d7ed7075faa7", + "hash_cont_tokens": "c014154380b74b9e" + }, + "truncated": 0, + "non-truncated": 9996, + "padded": 9996, + "non-padded": 0, + "effective_few_shots": 0.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "d84d18e9a963753d", + "hash_full_prompts": "12b540783521a8e6", + "hash_input_tokens": "6fecf578c508db6a", + "hash_cont_tokens": "fb1646e2bdd5fc38" + }, + "total_evaluation_time_secondes": "2546.925250530243", + "truncated": 2088, + "non-truncated": 108931, + "padded": 108834, + "non-padded": 2185, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/kfkas/Llama-2-ko-7b-Chat/results_2023-07-27T16-15-02.960730.json b/eval-results/kfkas/Llama-2-ko-7b-Chat/results_2023-07-27T16-15-02.960730.json new file mode 100644 index 0000000000000000000000000000000000000000..08309c567ee6b6223bf5f1c8f51ae62fd42440f9 --- /dev/null +++ b/eval-results/kfkas/Llama-2-ko-7b-Chat/results_2023-07-27T16-15-02.960730.json @@ -0,0 +1,1365 @@ +{ + "results": { + "harness|arc:challenge|25": { + "acc": 0.371160409556314, + "acc_stderr": 0.014117971901142808, + "acc_norm": 0.4044368600682594, + "acc_norm_stderr": 0.014342036483436175 + }, + "harness|hellaswag|10": { + "acc": 0.4907388966341366, + "acc_stderr": 0.004988925410522775, + "acc_norm": 0.6711810396335391, + "acc_norm_stderr": 0.004688239419302083 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.27, + "acc_stderr": 0.044619604333847415, + "acc_norm": 0.27, + "acc_norm_stderr": 0.044619604333847415 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.37037037037037035, + "acc_stderr": 0.041716541613545426, + "acc_norm": 0.37037037037037035, + "acc_norm_stderr": 0.041716541613545426 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.27631578947368424, + "acc_stderr": 0.03639057569952925, + "acc_norm": 0.27631578947368424, + 
"acc_norm_stderr": 0.03639057569952925 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.28, + "acc_stderr": 0.04512608598542127, + "acc_norm": 0.28, + "acc_norm_stderr": 0.04512608598542127 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.32452830188679244, + "acc_stderr": 0.028815615713432115, + "acc_norm": 0.32452830188679244, + "acc_norm_stderr": 0.028815615713432115 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.2361111111111111, + "acc_stderr": 0.03551446610810826, + "acc_norm": 0.2361111111111111, + "acc_norm_stderr": 0.03551446610810826 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.23, + "acc_stderr": 0.04229525846816505, + "acc_norm": 0.23, + "acc_norm_stderr": 0.04229525846816505 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.25, + "acc_stderr": 0.04351941398892446, + "acc_norm": 0.25, + "acc_norm_stderr": 0.04351941398892446 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.26, + "acc_stderr": 0.044084400227680794, + "acc_norm": 0.26, + "acc_norm_stderr": 0.044084400227680794 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.24855491329479767, + "acc_stderr": 0.03295304696818317, + "acc_norm": 0.24855491329479767, + "acc_norm_stderr": 0.03295304696818317 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.19607843137254902, + "acc_stderr": 0.03950581861179961, + "acc_norm": 0.19607843137254902, + "acc_norm_stderr": 0.03950581861179961 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.33, + "acc_stderr": 0.04725815626252605, + "acc_norm": 0.33, + "acc_norm_stderr": 0.04725815626252605 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.33617021276595743, + "acc_stderr": 0.030881618520676942, + "acc_norm": 0.33617021276595743, + "acc_norm_stderr": 0.030881618520676942 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.2982456140350877, + "acc_stderr": 0.04303684033537315, + "acc_norm": 0.2982456140350877, + "acc_norm_stderr": 0.04303684033537315 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.2413793103448276, + "acc_stderr": 0.03565998174135302, + "acc_norm": 0.2413793103448276, + "acc_norm_stderr": 0.03565998174135302 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.2619047619047619, + "acc_stderr": 0.022644212615525214, + "acc_norm": 0.2619047619047619, + "acc_norm_stderr": 0.022644212615525214 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.18253968253968253, + "acc_stderr": 0.034550710191021496, + "acc_norm": 0.18253968253968253, + "acc_norm_stderr": 0.034550710191021496 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.33, + "acc_stderr": 0.047258156262526045, + "acc_norm": 0.33, + "acc_norm_stderr": 0.047258156262526045 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.3096774193548387, + "acc_stderr": 0.026302774983517414, + "acc_norm": 0.3096774193548387, + "acc_norm_stderr": 0.026302774983517414 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.2857142857142857, + "acc_stderr": 0.03178529710642749, + "acc_norm": 0.2857142857142857, + "acc_norm_stderr": 0.03178529710642749 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.24, + "acc_stderr": 0.042923469599092816, + "acc_norm": 0.24, + "acc_norm_stderr": 0.042923469599092816 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.2606060606060606, + "acc_stderr": 0.03427743175816524, + "acc_norm": 
0.2606060606060606, + "acc_norm_stderr": 0.03427743175816524 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.35858585858585856, + "acc_stderr": 0.034169036403915214, + "acc_norm": 0.35858585858585856, + "acc_norm_stderr": 0.034169036403915214 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.2694300518134715, + "acc_stderr": 0.03201867122877794, + "acc_norm": 0.2694300518134715, + "acc_norm_stderr": 0.03201867122877794 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.2564102564102564, + "acc_stderr": 0.02213908110397154, + "acc_norm": 0.2564102564102564, + "acc_norm_stderr": 0.02213908110397154 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.26296296296296295, + "acc_stderr": 0.026842057873833706, + "acc_norm": 0.26296296296296295, + "acc_norm_stderr": 0.026842057873833706 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.31932773109243695, + "acc_stderr": 0.0302839955258844, + "acc_norm": 0.31932773109243695, + "acc_norm_stderr": 0.0302839955258844 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.2781456953642384, + "acc_stderr": 0.03658603262763744, + "acc_norm": 0.2781456953642384, + "acc_norm_stderr": 0.03658603262763744 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.3302752293577982, + "acc_stderr": 0.02016446633634298, + "acc_norm": 0.3302752293577982, + "acc_norm_stderr": 0.02016446633634298 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.18981481481481483, + "acc_stderr": 0.026744714834691943, + "acc_norm": 0.18981481481481483, + "acc_norm_stderr": 0.026744714834691943 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.22058823529411764, + "acc_stderr": 0.02910225438967409, + "acc_norm": 0.22058823529411764, + "acc_norm_stderr": 0.02910225438967409 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.29957805907172996, + "acc_stderr": 0.0298180247497531, + "acc_norm": 0.29957805907172996, + "acc_norm_stderr": 0.0298180247497531 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.36771300448430494, + "acc_stderr": 0.03236198350928276, + "acc_norm": 0.36771300448430494, + "acc_norm_stderr": 0.03236198350928276 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.31297709923664124, + "acc_stderr": 0.04066962905677697, + "acc_norm": 0.31297709923664124, + "acc_norm_stderr": 0.04066962905677697 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.32231404958677684, + "acc_stderr": 0.04266416363352167, + "acc_norm": 0.32231404958677684, + "acc_norm_stderr": 0.04266416363352167 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.37962962962962965, + "acc_stderr": 0.04691521224077742, + "acc_norm": 0.37962962962962965, + "acc_norm_stderr": 0.04691521224077742 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.26993865030674846, + "acc_stderr": 0.034878251684978906, + "acc_norm": 0.26993865030674846, + "acc_norm_stderr": 0.034878251684978906 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.3125, + "acc_stderr": 0.043994650575715215, + "acc_norm": 0.3125, + "acc_norm_stderr": 0.043994650575715215 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.3883495145631068, + "acc_stderr": 0.048257293373563895, + "acc_norm": 0.3883495145631068, + "acc_norm_stderr": 0.048257293373563895 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.42735042735042733, + "acc_stderr": 0.03240847393516327, + 
"acc_norm": 0.42735042735042733, + "acc_norm_stderr": 0.03240847393516327 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.3, + "acc_stderr": 0.046056618647183814, + "acc_norm": 0.3, + "acc_norm_stderr": 0.046056618647183814 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.3959131545338442, + "acc_stderr": 0.017488247006979263, + "acc_norm": 0.3959131545338442, + "acc_norm_stderr": 0.017488247006979263 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.3063583815028902, + "acc_stderr": 0.024818350129436593, + "acc_norm": 0.3063583815028902, + "acc_norm_stderr": 0.024818350129436593 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.2424581005586592, + "acc_stderr": 0.014333522059217889, + "acc_norm": 0.2424581005586592, + "acc_norm_stderr": 0.014333522059217889 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.369281045751634, + "acc_stderr": 0.027634176689602653, + "acc_norm": 0.369281045751634, + "acc_norm_stderr": 0.027634176689602653 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.3408360128617363, + "acc_stderr": 0.02692084126077616, + "acc_norm": 0.3408360128617363, + "acc_norm_stderr": 0.02692084126077616 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.3117283950617284, + "acc_stderr": 0.025773111169630453, + "acc_norm": 0.3117283950617284, + "acc_norm_stderr": 0.025773111169630453 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.2730496453900709, + "acc_stderr": 0.026577860943307854, + "acc_norm": 0.2730496453900709, + "acc_norm_stderr": 0.026577860943307854 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.24771838331160365, + "acc_stderr": 0.011025499291443742, + "acc_norm": 0.24771838331160365, + "acc_norm_stderr": 0.011025499291443742 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.4338235294117647, + "acc_stderr": 0.030105636570016643, + "acc_norm": 0.4338235294117647, + "acc_norm_stderr": 0.030105636570016643 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.29248366013071897, + "acc_stderr": 0.01840341571010979, + "acc_norm": 0.29248366013071897, + "acc_norm_stderr": 0.01840341571010979 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.38181818181818183, + "acc_stderr": 0.04653429807913509, + "acc_norm": 0.38181818181818183, + "acc_norm_stderr": 0.04653429807913509 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.2979591836734694, + "acc_stderr": 0.029279567411065657, + "acc_norm": 0.2979591836734694, + "acc_norm_stderr": 0.029279567411065657 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.26865671641791045, + "acc_stderr": 0.03134328358208954, + "acc_norm": 0.26865671641791045, + "acc_norm_stderr": 0.03134328358208954 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.41, + "acc_stderr": 0.04943110704237102, + "acc_norm": 0.41, + "acc_norm_stderr": 0.04943110704237102 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.3493975903614458, + "acc_stderr": 0.0371172519074075, + "acc_norm": 0.3493975903614458, + "acc_norm_stderr": 0.0371172519074075 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.40350877192982454, + "acc_stderr": 0.03762738699917055, + "acc_norm": 0.40350877192982454, + "acc_norm_stderr": 0.03762738699917055 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.21297429620563035, + "mc1_stderr": 0.014332203787059685, + "mc2": 0.354461555693958, + "mc2_stderr": 0.013669397819228006 + }, + "all": { + "acc": 0.30628775582383155, + "acc_stderr": 
0.03323244986423244, + "acc_norm": 0.3099101048663968, + "acc_norm_stderr": 0.03323115119628452, + "mc1": 0.21297429620563035, + "mc1_stderr": 0.014332203787059685, + "mc2": 0.354461555693958, + "mc2_stderr": 0.013669397819228006 + } + }, + "versions": { + "harness|arc:challenge|25": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "all": 0 + }, + "config_general": { + "model_name": "kfkas/Llama-2-ko-7b-Chat", + "model_sha": "3293b98cd8204371988f898dafa9b5a297555cbe", + "model_dtype": "torch.bfloat16", + "lighteval_sha": "03c2fad20ff7f5334c33cfee459024b8d7e4a109", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": 
null + }, + "config_tasks": { + "harness|arc:challenge": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + 
"harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task" + }, + "summary_tasks": { + "harness|arc:challenge|25": { + "hashes": { + "hash_examples": "17b0cae357c0259e", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "61571bf68d6d89aa", + "hash_cont_tokens": "8210decc6ff6f7df" + }, + "truncated": 0, + "non-truncated": 4687, + "padded": 4687, + "non-padded": 0, + "effective_few_shots": 25.0, + "num_truncated_few_shots": 0 + }, + "harness|hellaswag|10": { + "hashes": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "29906669b1c7054a", + "hash_cont_tokens": "b3b9e9017afa63af" + }, + "truncated": 0, + "non-truncated": 40168, + "padded": 40113, + "non-padded": 55, + "effective_few_shots": 10.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hashes": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "c54ff61ad0273dd7", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-anatomy|5": { + "hashes": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "be31a1e22aef5f90", + "hash_cont_tokens": "f11971a765cb609f" + }, + "truncated": 0, + "non-truncated": 540, + "padded": 540, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-astronomy|5": { + "hashes": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "277a7b1fad566940", + "hash_cont_tokens": "bf30e5d3f48250cb" + }, + "truncated": 0, + "non-truncated": 608, + "padded": 608, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-business_ethics|5": { + "hashes": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "ba552605bc116de5", + "hash_cont_tokens": "bc1dd9b2d995eb61" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hashes": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "428c7563d0b98ab9", + "hash_cont_tokens": "890a119624b3b935" + }, + "truncated": 0, + "non-truncated": 1060, + "padded": 1060, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_biology|5": { + "hashes": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "da036601573942e2", + "hash_cont_tokens": "875cde3af7a0ee14" + }, + "truncated": 0, + "non-truncated": 576, + "padded": 576, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_chemistry|5": { + "hashes": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "94e0196d6aded13d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 
400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_computer_science|5": { + "hashes": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "6e4d0f4a8d36690b", + "hash_cont_tokens": "ffc0fe414cdc4a83" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_mathematics|5": { + "hashes": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "614054d17109a25d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_medicine|5": { + "hashes": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "1d633b3cc0524ba8", + "hash_cont_tokens": "1f88b00d41957d82" + }, + "truncated": 0, + "non-truncated": 692, + "padded": 692, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_physics|5": { + "hashes": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "5421d9a1af86cbd4", + "hash_cont_tokens": "f7b8097afc16a47c" + }, + "truncated": 0, + "non-truncated": 408, + "padded": 408, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-computer_security|5": { + "hashes": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "5e6b70ecb333cf18", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hashes": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "c2ef11a87264ceed", + "hash_cont_tokens": "aa0e8bc655f2f641" + }, + "truncated": 0, + "non-truncated": 940, + "padded": 940, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-econometrics|5": { + "hashes": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "ecaccd912a4c3978", + "hash_cont_tokens": "bfb7e3c3c88313f1" + }, + "truncated": 0, + "non-truncated": 456, + "padded": 456, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hashes": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "1590c84291399be8", + "hash_cont_tokens": "2425a3f084a591ef" + }, + "truncated": 0, + "non-truncated": 580, + "padded": 580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hashes": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "3269597f715b0da1", + "hash_cont_tokens": "f52691aef15a407b" + }, + "truncated": 0, + "non-truncated": 1512, + "padded": 1512, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + 
"harness|hendrycksTest-formal_logic|5": { + "hashes": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "a2800d20f3ab8d7c", + "hash_cont_tokens": "f515d598d9c21263" + }, + "truncated": 0, + "non-truncated": 504, + "padded": 504, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-global_facts|5": { + "hashes": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "94ed44b3772505ad", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_biology|5": { + "hashes": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "24423acb928db768", + "hash_cont_tokens": "bd85a4156a3613ee" + }, + "truncated": 0, + "non-truncated": 1240, + "padded": 1240, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hashes": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "831ff35c474e5cef", + "hash_cont_tokens": "a95c97af1c14e068" + }, + "truncated": 0, + "non-truncated": 812, + "padded": 812, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hashes": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "8c34e0f2bda77358", + "hash_cont_tokens": "8abfedef914e33c9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hashes": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "f1f73dd687da18d7", + "hash_cont_tokens": "674fc454bdc5ac93" + }, + "truncated": 660, + "non-truncated": 0, + "padded": 0, + "non-padded": 660, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_geography|5": { + "hashes": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "7c5547c7da5bc793", + "hash_cont_tokens": "03a5012b916274ea" + }, + "truncated": 0, + "non-truncated": 792, + "padded": 792, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hashes": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "f62991cb6a496b05", + "hash_cont_tokens": "a83effb8f76b7d7c" + }, + "truncated": 0, + "non-truncated": 772, + "padded": 772, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hashes": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "4cef2aff6e3d59ed", + "hash_cont_tokens": "c583432ad27fcfe0" + }, + "truncated": 0, + "non-truncated": 1560, + "padded": 1560, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hashes": { + "hash_examples": 
"1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "6e2577ea4082ed2b", + "hash_cont_tokens": "24f5dc613660300b" + }, + "truncated": 0, + "non-truncated": 1080, + "padded": 1080, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hashes": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "c5fc9aeb1079c8e4", + "hash_cont_tokens": "f47f041de50333b9" + }, + "truncated": 0, + "non-truncated": 952, + "padded": 952, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_physics|5": { + "hashes": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "555fc385cffa84ca", + "hash_cont_tokens": "ba2efcd283e938cc" + }, + "truncated": 0, + "non-truncated": 604, + "padded": 604, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hashes": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "febd23cbf9973b7f", + "hash_cont_tokens": "942069cd363844d9" + }, + "truncated": 0, + "non-truncated": 2180, + "padded": 2180, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hashes": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "424b02981230ee83", + "hash_cont_tokens": "955ed42b6f7fa019" + }, + "truncated": 0, + "non-truncated": 864, + "padded": 864, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hashes": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "50c9ff438c85a69e", + "hash_cont_tokens": "cdd0b3dc06d933e5" + }, + "truncated": 816, + "non-truncated": 0, + "padded": 0, + "non-padded": 816, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hashes": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "054824cc474caef5", + "hash_cont_tokens": "9a864184946033ac" + }, + "truncated": 8, + "non-truncated": 940, + "padded": 940, + "non-padded": 8, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_aging|5": { + "hashes": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "541a75f071dcf579", + "hash_cont_tokens": "142a4a8a1138a214" + }, + "truncated": 0, + "non-truncated": 892, + "padded": 892, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_sexuality|5": { + "hashes": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "04269e5c5a257dd9", + "hash_cont_tokens": "bc54813e809b796d" + }, + "truncated": 0, + "non-truncated": 524, + "padded": 524, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-international_law|5": { + "hashes": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "d93ba9d9d38e4397", + 
"hash_cont_tokens": "dc45b45fcda18e5d" + }, + "truncated": 0, + "non-truncated": 484, + "padded": 484, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-jurisprudence|5": { + "hashes": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "9eeaccd2698b4f5a", + "hash_cont_tokens": "e3a8cd951b6e3469" + }, + "truncated": 0, + "non-truncated": 432, + "padded": 432, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hashes": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "b4f08f544f2b7576", + "hash_cont_tokens": "1e80dbd30f6453d5" + }, + "truncated": 0, + "non-truncated": 652, + "padded": 648, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-machine_learning|5": { + "hashes": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "900c2a51f1174b9f", + "hash_cont_tokens": "9b37da7777378ca9" + }, + "truncated": 0, + "non-truncated": 448, + "padded": 448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-management|5": { + "hashes": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "6b36efb4689c6eca", + "hash_cont_tokens": "a01d6d39a83c4597" + }, + "truncated": 0, + "non-truncated": 412, + "padded": 412, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-marketing|5": { + "hashes": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "2aaac78a0cfed47a", + "hash_cont_tokens": "6aeaed4d823c98aa" + }, + "truncated": 0, + "non-truncated": 936, + "padded": 936, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-medical_genetics|5": { + "hashes": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "886ca823b41c094a", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-miscellaneous|5": { + "hashes": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "72fd71de7675e7d0", + "hash_cont_tokens": "9b0ab02a64603081" + }, + "truncated": 0, + "non-truncated": 3132, + "padded": 3132, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_disputes|5": { + "hashes": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "f3ca0dd8e7a1eb09", + "hash_cont_tokens": "8badf768f7b0467a" + }, + "truncated": 0, + "non-truncated": 1384, + "padded": 1354, + "non-padded": 30, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hashes": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "3e793631e951f23c", + "hash_cont_tokens": "32ae620376b2bbba" + }, + "truncated": 0, + "non-truncated": 3580, + "padded": 3580, + "non-padded": 0, + "effective_few_shots": 5.0, + 
"num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-nutrition|5": { + "hashes": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "59753c2144ea93af", + "hash_cont_tokens": "3071def75bacc404" + }, + "truncated": 0, + "non-truncated": 1224, + "padded": 1224, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-philosophy|5": { + "hashes": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "bd8d3dbed15a8c34", + "hash_cont_tokens": "9f6ff69d23a48783" + }, + "truncated": 0, + "non-truncated": 1244, + "padded": 1244, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-prehistory|5": { + "hashes": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "3573cd87facbb7c5", + "hash_cont_tokens": "de469d2b981e32a3" + }, + "truncated": 0, + "non-truncated": 1296, + "padded": 1296, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_accounting|5": { + "hashes": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "17e721bc1a7cbb47", + "hash_cont_tokens": "c46f74d2dfc7b13b" + }, + "truncated": 0, + "non-truncated": 1128, + "padded": 1128, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_law|5": { + "hashes": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "9178e10bd0763ec4", + "hash_cont_tokens": "2e590029ef41fbcd" + }, + "truncated": 604, + "non-truncated": 5532, + "padded": 5524, + "non-padded": 612, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_medicine|5": { + "hashes": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "f5a22012a54f70ea", + "hash_cont_tokens": "fe35cfa9c6ca802e" + }, + "truncated": 0, + "non-truncated": 1088, + "padded": 1088, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_psychology|5": { + "hashes": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "0dfb73a8eb3f692c", + "hash_cont_tokens": "f020fbddf72c8652" + }, + "truncated": 0, + "non-truncated": 2448, + "padded": 2448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-public_relations|5": { + "hashes": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "1710c6ba4c9f3cbd", + "hash_cont_tokens": "568f585a259965c1" + }, + "truncated": 0, + "non-truncated": 440, + "padded": 440, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-security_studies|5": { + "hashes": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "d49711415961ced7", + "hash_cont_tokens": "cc6fd7cccd64cd5d" + }, + "truncated": 0, + "non-truncated": 980, + "padded": 980, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-sociology|5": { + "hashes": { + "hash_examples": "e7959df87dea8672", + 
"hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "828999f7624cbe7e", + "hash_cont_tokens": "c3a3bdfd177eed5b" + }, + "truncated": 0, + "non-truncated": 804, + "padded": 804, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hashes": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "42054621e718dbee", + "hash_cont_tokens": "2568d0e8e36fa959" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-virology|5": { + "hashes": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "6c4f0aa4dc859c04", + "hash_cont_tokens": "926cf60b0891f374" + }, + "truncated": 0, + "non-truncated": 664, + "padded": 664, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-world_religions|5": { + "hashes": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "6c75d44e092ff24f", + "hash_cont_tokens": "c525a5de974c1ea3" + }, + "truncated": 0, + "non-truncated": 684, + "padded": 684, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|truthfulqa:mc|0": { + "hashes": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "2738d7ed7075faa7", + "hash_cont_tokens": "c014154380b74b9e" + }, + "truncated": 0, + "non-truncated": 9996, + "padded": 9996, + "non-padded": 0, + "effective_few_shots": 0.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "d84d18e9a963753d", + "hash_full_prompts": "12b540783521a8e6", + "hash_input_tokens": "6fecf578c508db6a", + "hash_cont_tokens": "fb1646e2bdd5fc38" + }, + "total_evaluation_time_secondes": "2723.627646923065", + "truncated": 2088, + "non-truncated": 108931, + "padded": 108834, + "non-padded": 2185, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/kfkas/Llama-2-ko-7b-Chat/results_2023-09-17T05-11-56.274160.json b/eval-results/kfkas/Llama-2-ko-7b-Chat/results_2023-09-17T05-11-56.274160.json new file mode 100644 index 0000000000000000000000000000000000000000..321aac759ccb2768c517658da8a46efb9a13ee9e --- /dev/null +++ b/eval-results/kfkas/Llama-2-ko-7b-Chat/results_2023-09-17T05-11-56.274160.json @@ -0,0 +1,107 @@ +{ + "config_general": { + "model_name": "kfkas/Llama-2-ko-7b-Chat", + "model_sha": "e6b29e983cfdee9d289034ed28ac6b95f36e599a", + "model_size": "12.8 GB", + "model_dtype": "torch.bfloat16", + "lighteval_sha": "0f318ecf002208468154899217b3ba7c6ae09374", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|drop|3": { + "em": 0.030620805369127518, + "em_stderr": 0.0017643908749744859, + "f1": 0.11281669463087184, + "f1_stderr": 0.0025982439722939026 + }, + "harness|gsm8k|5": { + "acc": 0.016679302501895376, + "acc_stderr": 0.0035275958887224534 + }, + "harness|winogrande|5": { + "acc": 0.6661404893449092, + "acc_stderr": 0.013254029695143348 + }, + "all": { + "em": 0.030620805369127518, + "em_stderr": 0.0017643908749744859, + "f1": 0.11281669463087184, + "f1_stderr": 0.0025982439722939026, + "acc": 0.3414098959234023, + "acc_stderr": 0.0083908127919329 + } + }, + 
"versions": { + "harness|drop|3": 1, + "harness|gsm8k|5": 0, + "harness|winogrande|5": 0, + "all": 0 + }, + "config_tasks": { + "harness|drop": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|drop|3": { + "hashes": { + "hash_examples": "1d27416e8324e9a3", + "hash_full_prompts": "a5513ff9a741b385", + "hash_input_tokens": "61b608e0b5ceed76", + "hash_cont_tokens": "ca29bfc7c5678194" + }, + "truncated": 1263, + "non-truncated": 8273, + "padded": 0, + "non-padded": 9536, + "effective_few_shots": 3.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "bda342e47b5099b2", + "hash_cont_tokens": "f94cce231e1090ce" + }, + "truncated": 0, + "non-truncated": 1319, + "padded": 0, + "non-padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "c0bedf98cb040854", + "hash_cont_tokens": "f08975ad6f2d5864" + }, + "truncated": 0, + "non-truncated": 2534, + "padded": 2432, + "non-padded": 102, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "9b4d8993161e637d", + "hash_full_prompts": "08215e527b7e60a5", + "hash_input_tokens": "80afe720f936f8d2", + "hash_cont_tokens": "d7de2f14b3513a0f" + }, + "total_evaluation_time_secondes": "25955.403176784515", + "truncated": 1263, + "non-truncated": 12126, + "padded": 2432, + "non-padded": 10957, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/kfkas/Llama-2-ko-7b-Chat/results_2023-09-18T06-20-53.119467.json b/eval-results/kfkas/Llama-2-ko-7b-Chat/results_2023-09-18T06-20-53.119467.json new file mode 100644 index 0000000000000000000000000000000000000000..dcc7d20b13e68e769b4c0cb7b53f0d1cfacdacc7 --- /dev/null +++ b/eval-results/kfkas/Llama-2-ko-7b-Chat/results_2023-09-18T06-20-53.119467.json @@ -0,0 +1,107 @@ +{ + "config_general": { + "model_name": "kfkas/Llama-2-ko-7b-Chat", + "model_sha": "e6b29e983cfdee9d289034ed28ac6b95f36e599a", + "model_size": "12.8 GB", + "model_dtype": "torch.float16", + "lighteval_sha": "0f318ecf002208468154899217b3ba7c6ae09374", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|drop|3": { + "em": 0.029886744966442953, + "em_stderr": 0.0017437739254467523, + "f1": 0.11206061241610675, + "f1_stderr": 0.002589360675643281 + }, + "harness|gsm8k|5": { + "acc": 0.01288855193328279, + "acc_stderr": 0.003106901266499642 + }, + "harness|winogrande|5": { + "acc": 0.6685082872928176, + "acc_stderr": 0.01323039719896465 + }, + "all": { + "em": 0.029886744966442953, + "em_stderr": 0.0017437739254467523, + "f1": 0.11206061241610675, + "f1_stderr": 0.002589360675643281, + "acc": 0.3406984196130502, + "acc_stderr": 0.008168649232732146 + } + }, + "versions": { + "harness|drop|3": 1, + "harness|gsm8k|5": 0, + "harness|winogrande|5": 0, + "all": 0 + }, + "config_tasks": { + "harness|drop": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|drop|3": { + "hashes": { + "hash_examples": "1d27416e8324e9a3", + "hash_full_prompts": "a5513ff9a741b385", + "hash_input_tokens": 
"61b608e0b5ceed76", + "hash_cont_tokens": "c33a187a765b3460" + }, + "truncated": 1263, + "non-truncated": 8273, + "padded": 0, + "non-padded": 9536, + "effective_few_shots": 3.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "bda342e47b5099b2", + "hash_cont_tokens": "8f08b7abe98c7992" + }, + "truncated": 0, + "non-truncated": 1319, + "padded": 0, + "non-padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "c0bedf98cb040854", + "hash_cont_tokens": "f08975ad6f2d5864" + }, + "truncated": 0, + "non-truncated": 2534, + "padded": 2432, + "non-padded": 102, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "9b4d8993161e637d", + "hash_full_prompts": "08215e527b7e60a5", + "hash_input_tokens": "80afe720f936f8d2", + "hash_cont_tokens": "bb5bc2df1f57c700" + }, + "total_evaluation_time_secondes": "24708.188175678253", + "truncated": 1263, + "non-truncated": 12126, + "padded": 2432, + "non-padded": 10957, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/khoantap/wizard-limarp/results_2023-10-01T15-39-54.493965.json b/eval-results/khoantap/wizard-limarp/results_2023-10-01T15-39-54.493965.json new file mode 100644 index 0000000000000000000000000000000000000000..47c1bdeb025c5e75b5ea06ec9bf95b2c4f9a794a --- /dev/null +++ b/eval-results/khoantap/wizard-limarp/results_2023-10-01T15-39-54.493965.json @@ -0,0 +1,1367 @@ +{ + "config_general": { + "model_name": "khoantap/wizard-limarp", + "model_sha": "7301565c37edfe74296dbb280c69aab05e82d39a", + "model_size": "24.28 GB", + "model_dtype": "torch.float16", + "lighteval_sha": "0f318ecf002208468154899217b3ba7c6ae09374", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|arc:challenge|25": { + "acc": 0.5452218430034129, + "acc_stderr": 0.014551507060836357, + "acc_norm": 0.5861774744027304, + "acc_norm_stderr": 0.014392730009221005 + }, + "harness|hellaswag|10": { + "acc": 0.6244771957777335, + "acc_stderr": 0.004832679188788789, + "acc_norm": 0.8186616211909978, + "acc_norm_stderr": 0.003845108476401298 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.3, + "acc_stderr": 0.046056618647183814, + "acc_norm": 0.3, + "acc_norm_stderr": 0.046056618647183814 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.5037037037037037, + "acc_stderr": 0.04319223625811331, + "acc_norm": 0.5037037037037037, + "acc_norm_stderr": 0.04319223625811331 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.5394736842105263, + "acc_stderr": 0.04056242252249034, + "acc_norm": 0.5394736842105263, + "acc_norm_stderr": 0.04056242252249034 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.54, + "acc_stderr": 0.05009082659620332, + "acc_norm": 0.54, + "acc_norm_stderr": 0.05009082659620332 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.6075471698113207, + "acc_stderr": 0.030052580579557845, + "acc_norm": 0.6075471698113207, + "acc_norm_stderr": 0.030052580579557845 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.5625, + "acc_stderr": 0.04148415739394154, + "acc_norm": 0.5625, + "acc_norm_stderr": 
0.04148415739394154 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.41, + "acc_stderr": 0.049431107042371025, + "acc_norm": 0.41, + "acc_norm_stderr": 0.049431107042371025 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.46, + "acc_stderr": 0.05009082659620332, + "acc_norm": 0.46, + "acc_norm_stderr": 0.05009082659620332 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.3, + "acc_stderr": 0.046056618647183814, + "acc_norm": 0.3, + "acc_norm_stderr": 0.046056618647183814 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.5086705202312138, + "acc_stderr": 0.03811890988940412, + "acc_norm": 0.5086705202312138, + "acc_norm_stderr": 0.03811890988940412 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.23529411764705882, + "acc_stderr": 0.04220773659171451, + "acc_norm": 0.23529411764705882, + "acc_norm_stderr": 0.04220773659171451 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.63, + "acc_stderr": 0.04852365870939099, + "acc_norm": 0.63, + "acc_norm_stderr": 0.04852365870939099 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.4340425531914894, + "acc_stderr": 0.032400380867927465, + "acc_norm": 0.4340425531914894, + "acc_norm_stderr": 0.032400380867927465 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.2807017543859649, + "acc_stderr": 0.042270544512322004, + "acc_norm": 0.2807017543859649, + "acc_norm_stderr": 0.042270544512322004 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.4689655172413793, + "acc_stderr": 0.04158632762097828, + "acc_norm": 0.4689655172413793, + "acc_norm_stderr": 0.04158632762097828 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.3544973544973545, + "acc_stderr": 0.024636830602842, + "acc_norm": 0.3544973544973545, + "acc_norm_stderr": 0.024636830602842 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.40476190476190477, + "acc_stderr": 0.04390259265377562, + "acc_norm": 0.40476190476190477, + "acc_norm_stderr": 0.04390259265377562 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.34, + "acc_stderr": 0.04760952285695236, + "acc_norm": 0.34, + "acc_norm_stderr": 0.04760952285695236 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.6387096774193548, + "acc_stderr": 0.027327548447957532, + "acc_norm": 0.6387096774193548, + "acc_norm_stderr": 0.027327548447957532 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.458128078817734, + "acc_stderr": 0.03505630140785741, + "acc_norm": 0.458128078817734, + "acc_norm_stderr": 0.03505630140785741 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.61, + "acc_stderr": 0.04902071300001975, + "acc_norm": 0.61, + "acc_norm_stderr": 0.04902071300001975 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.6727272727272727, + "acc_stderr": 0.03663974994391244, + "acc_norm": 0.6727272727272727, + "acc_norm_stderr": 0.03663974994391244 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.702020202020202, + "acc_stderr": 0.03258630383836556, + "acc_norm": 0.702020202020202, + "acc_norm_stderr": 0.03258630383836556 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.7927461139896373, + "acc_stderr": 0.029252823291803638, + "acc_norm": 0.7927461139896373, + "acc_norm_stderr": 0.029252823291803638 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.5205128205128206, + "acc_stderr": 0.02532966316348994, + 
"acc_norm": 0.5205128205128206, + "acc_norm_stderr": 0.02532966316348994 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.3037037037037037, + "acc_stderr": 0.02803792996911499, + "acc_norm": 0.3037037037037037, + "acc_norm_stderr": 0.02803792996911499 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.5840336134453782, + "acc_stderr": 0.032016501007396114, + "acc_norm": 0.5840336134453782, + "acc_norm_stderr": 0.032016501007396114 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.32450331125827814, + "acc_stderr": 0.03822746937658753, + "acc_norm": 0.32450331125827814, + "acc_norm_stderr": 0.03822746937658753 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.728440366972477, + "acc_stderr": 0.01906909836319143, + "acc_norm": 0.728440366972477, + "acc_norm_stderr": 0.01906909836319143 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.3888888888888889, + "acc_stderr": 0.03324708911809117, + "acc_norm": 0.3888888888888889, + "acc_norm_stderr": 0.03324708911809117 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.75, + "acc_stderr": 0.03039153369274154, + "acc_norm": 0.75, + "acc_norm_stderr": 0.03039153369274154 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.7257383966244726, + "acc_stderr": 0.029041333510598028, + "acc_norm": 0.7257383966244726, + "acc_norm_stderr": 0.029041333510598028 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.6502242152466368, + "acc_stderr": 0.03200736719484503, + "acc_norm": 0.6502242152466368, + "acc_norm_stderr": 0.03200736719484503 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.6183206106870229, + "acc_stderr": 0.042607351576445594, + "acc_norm": 0.6183206106870229, + "acc_norm_stderr": 0.042607351576445594 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.7768595041322314, + "acc_stderr": 0.03800754475228732, + "acc_norm": 0.7768595041322314, + "acc_norm_stderr": 0.03800754475228732 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.7407407407407407, + "acc_stderr": 0.04236511258094633, + "acc_norm": 0.7407407407407407, + "acc_norm_stderr": 0.04236511258094633 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.6380368098159509, + "acc_stderr": 0.037757007291414416, + "acc_norm": 0.6380368098159509, + "acc_norm_stderr": 0.037757007291414416 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.33035714285714285, + "acc_stderr": 0.04464285714285712, + "acc_norm": 0.33035714285714285, + "acc_norm_stderr": 0.04464285714285712 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.6699029126213593, + "acc_stderr": 0.04656147110012351, + "acc_norm": 0.6699029126213593, + "acc_norm_stderr": 0.04656147110012351 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.811965811965812, + "acc_stderr": 0.02559819368665224, + "acc_norm": 0.811965811965812, + "acc_norm_stderr": 0.02559819368665224 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.6, + "acc_stderr": 0.04923659639173309, + "acc_norm": 0.6, + "acc_norm_stderr": 0.04923659639173309 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.7279693486590039, + "acc_stderr": 0.015913367447500503, + "acc_norm": 0.7279693486590039, + "acc_norm_stderr": 0.015913367447500503 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.5924855491329479, + "acc_stderr": 0.026454578146931505, + "acc_norm": 0.5924855491329479, + "acc_norm_stderr": 0.026454578146931505 + }, + 
"harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.30726256983240224, + "acc_stderr": 0.015430158846469609, + "acc_norm": 0.30726256983240224, + "acc_norm_stderr": 0.015430158846469609 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.6013071895424836, + "acc_stderr": 0.028036092273891776, + "acc_norm": 0.6013071895424836, + "acc_norm_stderr": 0.028036092273891776 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.6045016077170418, + "acc_stderr": 0.02777091853142784, + "acc_norm": 0.6045016077170418, + "acc_norm_stderr": 0.02777091853142784 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.5771604938271605, + "acc_stderr": 0.027487472980871595, + "acc_norm": 0.5771604938271605, + "acc_norm_stderr": 0.027487472980871595 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.41843971631205673, + "acc_stderr": 0.02942799403941999, + "acc_norm": 0.41843971631205673, + "acc_norm_stderr": 0.02942799403941999 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.41460234680573665, + "acc_stderr": 0.012582597058908284, + "acc_norm": 0.41460234680573665, + "acc_norm_stderr": 0.012582597058908284 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.5551470588235294, + "acc_stderr": 0.030187532060329387, + "acc_norm": 0.5551470588235294, + "acc_norm_stderr": 0.030187532060329387 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.5359477124183006, + "acc_stderr": 0.02017548876548404, + "acc_norm": 0.5359477124183006, + "acc_norm_stderr": 0.02017548876548404 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.6909090909090909, + "acc_stderr": 0.044262946482000985, + "acc_norm": 0.6909090909090909, + "acc_norm_stderr": 0.044262946482000985 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.6693877551020408, + "acc_stderr": 0.030116426296540603, + "acc_norm": 0.6693877551020408, + "acc_norm_stderr": 0.030116426296540603 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.7014925373134329, + "acc_stderr": 0.03235743789355042, + "acc_norm": 0.7014925373134329, + "acc_norm_stderr": 0.03235743789355042 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.79, + "acc_stderr": 0.040936018074033256, + "acc_norm": 0.79, + "acc_norm_stderr": 0.040936018074033256 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.46987951807228917, + "acc_stderr": 0.03885425420866766, + "acc_norm": 0.46987951807228917, + "acc_norm_stderr": 0.03885425420866766 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.7543859649122807, + "acc_stderr": 0.0330140594698725, + "acc_norm": 0.7543859649122807, + "acc_norm_stderr": 0.0330140594698725 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.33414932680538556, + "mc1_stderr": 0.016512530677150538, + "mc2": 0.482777527677442, + "mc2_stderr": 0.015184988472523642 + }, + "all": { + "acc": 0.5508016266489828, + "acc_stderr": 0.03448632181800869, + "acc_norm": 0.5547870513407215, + "acc_norm_stderr": 0.03446689219489, + "mc1": 0.33414932680538556, + "mc1_stderr": 0.016512530677150538, + "mc2": 0.482777527677442, + "mc2_stderr": 0.015184988472523642 + } + }, + "versions": { + "harness|arc:challenge|25": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + 
"harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "all": 0 + }, + "config_tasks": { + "harness|arc:challenge": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness 
task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task" + }, + "summary_tasks": { + "harness|arc:challenge|25": { + "hashes": { + "hash_examples": "17b0cae357c0259e", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "61571bf68d6d89aa", + "hash_cont_tokens": "e8abf848493b50f7" + }, + "truncated": 0, + "non-truncated": 4687, + "padded": 4687, + "non-padded": 0, + "effective_few_shots": 25.0, + "num_truncated_few_shots": 0 + }, + "harness|hellaswag|10": { + "hashes": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "29906669b1c7054a", + "hash_cont_tokens": "9fe0a5c42e1532db" + }, + "truncated": 0, + "non-truncated": 40168, + "padded": 
40113, + "non-padded": 55, + "effective_few_shots": 10.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hashes": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "c54ff61ad0273dd7", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-anatomy|5": { + "hashes": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "be31a1e22aef5f90", + "hash_cont_tokens": "f11971a765cb609f" + }, + "truncated": 0, + "non-truncated": 540, + "padded": 540, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-astronomy|5": { + "hashes": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "277a7b1fad566940", + "hash_cont_tokens": "440a970fadecdc7b" + }, + "truncated": 0, + "non-truncated": 608, + "padded": 608, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-business_ethics|5": { + "hashes": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "ba552605bc116de5", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hashes": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "428c7563d0b98ab9", + "hash_cont_tokens": "7ecd60c25b9bfe5b" + }, + "truncated": 0, + "non-truncated": 1060, + "padded": 1060, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_biology|5": { + "hashes": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "da036601573942e2", + "hash_cont_tokens": "875cde3af7a0ee14" + }, + "truncated": 0, + "non-truncated": 576, + "padded": 576, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_chemistry|5": { + "hashes": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "94e0196d6aded13d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_computer_science|5": { + "hashes": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "6e4d0f4a8d36690b", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_mathematics|5": { + "hashes": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "614054d17109a25d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_medicine|5": { + "hashes": { + 
"hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "1d633b3cc0524ba8", + "hash_cont_tokens": "702fb6d82ff0d6ac" + }, + "truncated": 0, + "non-truncated": 692, + "padded": 692, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_physics|5": { + "hashes": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "5421d9a1af86cbd4", + "hash_cont_tokens": "f7b8097afc16a47c" + }, + "truncated": 0, + "non-truncated": 408, + "padded": 408, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-computer_security|5": { + "hashes": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "5e6b70ecb333cf18", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hashes": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "c2ef11a87264ceed", + "hash_cont_tokens": "aa0e8bc655f2f641" + }, + "truncated": 0, + "non-truncated": 940, + "padded": 940, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-econometrics|5": { + "hashes": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "ecaccd912a4c3978", + "hash_cont_tokens": "b1cc6e7e9fcd3827" + }, + "truncated": 0, + "non-truncated": 456, + "padded": 456, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hashes": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "1590c84291399be8", + "hash_cont_tokens": "2425a3f084a591ef" + }, + "truncated": 0, + "non-truncated": 580, + "padded": 580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hashes": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "3269597f715b0da1", + "hash_cont_tokens": "bd87bf0c060fd925" + }, + "truncated": 0, + "non-truncated": 1512, + "padded": 1512, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-formal_logic|5": { + "hashes": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "a2800d20f3ab8d7c", + "hash_cont_tokens": "eb8932890e0605db" + }, + "truncated": 0, + "non-truncated": 504, + "padded": 504, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-global_facts|5": { + "hashes": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "94ed44b3772505ad", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_biology|5": { + "hashes": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "24423acb928db768", + "hash_cont_tokens": 
"1ddcb86d28cde266" + }, + "truncated": 0, + "non-truncated": 1240, + "padded": 1240, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hashes": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "831ff35c474e5cef", + "hash_cont_tokens": "176c8dcff38c5f8f" + }, + "truncated": 0, + "non-truncated": 812, + "padded": 812, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hashes": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "8c34e0f2bda77358", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hashes": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "f1f73dd687da18d7", + "hash_cont_tokens": "674fc454bdc5ac93" + }, + "truncated": 660, + "non-truncated": 0, + "padded": 0, + "non-padded": 660, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_geography|5": { + "hashes": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "7c5547c7da5bc793", + "hash_cont_tokens": "03a5012b916274ea" + }, + "truncated": 0, + "non-truncated": 792, + "padded": 792, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hashes": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "f62991cb6a496b05", + "hash_cont_tokens": "873d2aab226ba1d8" + }, + "truncated": 0, + "non-truncated": 772, + "padded": 772, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hashes": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "4cef2aff6e3d59ed", + "hash_cont_tokens": "c583432ad27fcfe0" + }, + "truncated": 0, + "non-truncated": 1560, + "padded": 1560, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hashes": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "6e2577ea4082ed2b", + "hash_cont_tokens": "d7907b61bcb8c123" + }, + "truncated": 0, + "non-truncated": 1080, + "padded": 1080, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hashes": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "c5fc9aeb1079c8e4", + "hash_cont_tokens": "f47f041de50333b9" + }, + "truncated": 0, + "non-truncated": 952, + "padded": 952, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_physics|5": { + "hashes": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "555fc385cffa84ca", + "hash_cont_tokens": "0d56317b3e5eedb5" + }, + "truncated": 0, + "non-truncated": 604, + 
"padded": 604, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hashes": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "febd23cbf9973b7f", + "hash_cont_tokens": "09ba1243e7390c0f" + }, + "truncated": 0, + "non-truncated": 2180, + "padded": 2180, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hashes": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "424b02981230ee83", + "hash_cont_tokens": "9cc29889c3d3f77d" + }, + "truncated": 0, + "non-truncated": 864, + "padded": 864, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hashes": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "50c9ff438c85a69e", + "hash_cont_tokens": "cdd0b3dc06d933e5" + }, + "truncated": 816, + "non-truncated": 0, + "padded": 0, + "non-padded": 816, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hashes": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "054824cc474caef5", + "hash_cont_tokens": "e02816433ff28daf" + }, + "truncated": 8, + "non-truncated": 940, + "padded": 940, + "non-padded": 8, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_aging|5": { + "hashes": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "541a75f071dcf579", + "hash_cont_tokens": "142a4a8a1138a214" + }, + "truncated": 0, + "non-truncated": 892, + "padded": 892, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_sexuality|5": { + "hashes": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "04269e5c5a257dd9", + "hash_cont_tokens": "bc54813e809b796d" + }, + "truncated": 0, + "non-truncated": 524, + "padded": 524, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-international_law|5": { + "hashes": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "d93ba9d9d38e4397", + "hash_cont_tokens": "8ea8c5ff76a15bca" + }, + "truncated": 0, + "non-truncated": 484, + "padded": 484, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-jurisprudence|5": { + "hashes": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "9eeaccd2698b4f5a", + "hash_cont_tokens": "e3a8cd951b6e3469" + }, + "truncated": 0, + "non-truncated": 432, + "padded": 432, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hashes": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "b4f08f544f2b7576", + "hash_cont_tokens": "3e9e0bdc248fd88a" + }, + "truncated": 0, + "non-truncated": 652, + "padded": 648, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + 
"harness|hendrycksTest-machine_learning|5": { + "hashes": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "900c2a51f1174b9f", + "hash_cont_tokens": "55b12fb138c6a064" + }, + "truncated": 0, + "non-truncated": 448, + "padded": 448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-management|5": { + "hashes": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "6b36efb4689c6eca", + "hash_cont_tokens": "a01d6d39a83c4597" + }, + "truncated": 0, + "non-truncated": 412, + "padded": 412, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-marketing|5": { + "hashes": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "2aaac78a0cfed47a", + "hash_cont_tokens": "6aeaed4d823c98aa" + }, + "truncated": 0, + "non-truncated": 936, + "padded": 936, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-medical_genetics|5": { + "hashes": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "886ca823b41c094a", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-miscellaneous|5": { + "hashes": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "72fd71de7675e7d0", + "hash_cont_tokens": "9b0ab02a64603081" + }, + "truncated": 0, + "non-truncated": 3132, + "padded": 3132, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_disputes|5": { + "hashes": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "f3ca0dd8e7a1eb09", + "hash_cont_tokens": "3b8bbe9108e55ce9" + }, + "truncated": 0, + "non-truncated": 1384, + "padded": 1354, + "non-padded": 30, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hashes": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "3e793631e951f23c", + "hash_cont_tokens": "3e9bfc0362e97330" + }, + "truncated": 0, + "non-truncated": 3580, + "padded": 3580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-nutrition|5": { + "hashes": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "59753c2144ea93af", + "hash_cont_tokens": "23b2dc6ee2da4cfc" + }, + "truncated": 0, + "non-truncated": 1224, + "padded": 1224, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-philosophy|5": { + "hashes": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "bd8d3dbed15a8c34", + "hash_cont_tokens": "9f6ff69d23a48783" + }, + "truncated": 0, + "non-truncated": 1244, + "padded": 1244, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-prehistory|5": { + "hashes": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "3573cd87facbb7c5", + 
"hash_cont_tokens": "d6458d743d875837" + }, + "truncated": 0, + "non-truncated": 1296, + "padded": 1296, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_accounting|5": { + "hashes": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "17e721bc1a7cbb47", + "hash_cont_tokens": "922a195f53a35662" + }, + "truncated": 0, + "non-truncated": 1128, + "padded": 1128, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_law|5": { + "hashes": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "9178e10bd0763ec4", + "hash_cont_tokens": "2e590029ef41fbcd" + }, + "truncated": 604, + "non-truncated": 5532, + "padded": 5524, + "non-padded": 612, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_medicine|5": { + "hashes": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "f5a22012a54f70ea", + "hash_cont_tokens": "7cfee54dbddd5a98" + }, + "truncated": 0, + "non-truncated": 1088, + "padded": 1088, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_psychology|5": { + "hashes": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "0dfb73a8eb3f692c", + "hash_cont_tokens": "a86677b2a45c20e1" + }, + "truncated": 0, + "non-truncated": 2448, + "padded": 2448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-public_relations|5": { + "hashes": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "1710c6ba4c9f3cbd", + "hash_cont_tokens": "0d756ccaae031757" + }, + "truncated": 0, + "non-truncated": 440, + "padded": 440, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-security_studies|5": { + "hashes": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "d49711415961ced7", + "hash_cont_tokens": "b2229bc2cfbf594b" + }, + "truncated": 0, + "non-truncated": 980, + "padded": 980, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-sociology|5": { + "hashes": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "828999f7624cbe7e", + "hash_cont_tokens": "c3a3bdfd177eed5b" + }, + "truncated": 0, + "non-truncated": 804, + "padded": 804, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hashes": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "42054621e718dbee", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-virology|5": { + "hashes": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "6c4f0aa4dc859c04", + "hash_cont_tokens": "af8b3658088cb37f" + }, + "truncated": 0, + "non-truncated": 664, + "padded": 664, + "non-padded": 0, + 
"effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-world_religions|5": { + "hashes": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "6c75d44e092ff24f", + "hash_cont_tokens": "060118bef6de4e0a" + }, + "truncated": 0, + "non-truncated": 684, + "padded": 684, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|truthfulqa:mc|0": { + "hashes": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "2738d7ed7075faa7", + "hash_cont_tokens": "f5da56a132aab151" + }, + "truncated": 0, + "non-truncated": 9996, + "padded": 9996, + "non-padded": 0, + "effective_few_shots": 0.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "d84d18e9a963753d", + "hash_full_prompts": "12b540783521a8e6", + "hash_input_tokens": "6fecf578c508db6a", + "hash_cont_tokens": "71d56183130fecbd" + }, + "total_evaluation_time_secondes": "7487.086874723434", + "truncated": 2088, + "non-truncated": 108931, + "padded": 108834, + "non-padded": 2185, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/lgaalves/gpt-2-xl_camel-ai-physics/results_2023-09-21T19-46-11.375703.json b/eval-results/lgaalves/gpt-2-xl_camel-ai-physics/results_2023-09-21T19-46-11.375703.json new file mode 100644 index 0000000000000000000000000000000000000000..39c1b5c8aac232e42c19c3bc44e7086f0d9603c4 --- /dev/null +++ b/eval-results/lgaalves/gpt-2-xl_camel-ai-physics/results_2023-09-21T19-46-11.375703.json @@ -0,0 +1,1367 @@ +{ + "config_general": { + "model_name": "lgaalves/gpt-2-xl_camel-ai-physics", + "model_sha": "e20cf5a8c89441f4dc15fd2af12dbe72b7df8e60", + "model_size": "2.91 GB", + "model_dtype": "torch.float16", + "lighteval_sha": "0f318ecf002208468154899217b3ba7c6ae09374", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|arc:challenge|25": { + "acc": 0.27474402730375425, + "acc_stderr": 0.013044617212771227, + "acc_norm": 0.295221843003413, + "acc_norm_stderr": 0.013329750293382316 + }, + "harness|hellaswag|10": { + "acc": 0.39842660824536946, + "acc_stderr": 0.004885735963346902, + "acc_norm": 0.5061740689105756, + "acc_norm_stderr": 0.004989400984722232 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.24, + "acc_stderr": 0.04292346959909284, + "acc_norm": 0.24, + "acc_norm_stderr": 0.04292346959909284 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.28888888888888886, + "acc_stderr": 0.03915450630414251, + "acc_norm": 0.28888888888888886, + "acc_norm_stderr": 0.03915450630414251 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.21052631578947367, + "acc_stderr": 0.03317672787533157, + "acc_norm": 0.21052631578947367, + "acc_norm_stderr": 0.03317672787533157 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.19, + "acc_stderr": 0.03942772444036624, + "acc_norm": 0.19, + "acc_norm_stderr": 0.03942772444036624 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.32075471698113206, + "acc_stderr": 0.028727502957880263, + "acc_norm": 0.32075471698113206, + "acc_norm_stderr": 0.028727502957880263 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.2638888888888889, + "acc_stderr": 0.03685651095897532, + "acc_norm": 0.2638888888888889, + "acc_norm_stderr": 0.03685651095897532 + }, + 
"harness|hendrycksTest-college_chemistry|5": { + "acc": 0.24, + "acc_stderr": 0.042923469599092816, + "acc_norm": 0.24, + "acc_norm_stderr": 0.042923469599092816 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.33, + "acc_stderr": 0.047258156262526045, + "acc_norm": 0.33, + "acc_norm_stderr": 0.047258156262526045 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.31, + "acc_stderr": 0.04648231987117316, + "acc_norm": 0.31, + "acc_norm_stderr": 0.04648231987117316 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.3352601156069364, + "acc_stderr": 0.03599586301247078, + "acc_norm": 0.3352601156069364, + "acc_norm_stderr": 0.03599586301247078 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.3137254901960784, + "acc_stderr": 0.04617034827006718, + "acc_norm": 0.3137254901960784, + "acc_norm_stderr": 0.04617034827006718 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.36, + "acc_stderr": 0.04824181513244218, + "acc_norm": 0.36, + "acc_norm_stderr": 0.04824181513244218 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.19574468085106383, + "acc_stderr": 0.025937853139977148, + "acc_norm": 0.19574468085106383, + "acc_norm_stderr": 0.025937853139977148 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.23684210526315788, + "acc_stderr": 0.039994238792813344, + "acc_norm": 0.23684210526315788, + "acc_norm_stderr": 0.039994238792813344 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.3103448275862069, + "acc_stderr": 0.03855289616378947, + "acc_norm": 0.3103448275862069, + "acc_norm_stderr": 0.03855289616378947 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.25132275132275134, + "acc_stderr": 0.022340482339643898, + "acc_norm": 0.25132275132275134, + "acc_norm_stderr": 0.022340482339643898 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.3492063492063492, + "acc_stderr": 0.04263906892795132, + "acc_norm": 0.3492063492063492, + "acc_norm_stderr": 0.04263906892795132 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.18, + "acc_stderr": 0.03861229196653694, + "acc_norm": 0.18, + "acc_norm_stderr": 0.03861229196653694 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.24838709677419354, + "acc_stderr": 0.02458002892148101, + "acc_norm": 0.24838709677419354, + "acc_norm_stderr": 0.02458002892148101 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.270935960591133, + "acc_stderr": 0.031270907132976984, + "acc_norm": 0.270935960591133, + "acc_norm_stderr": 0.031270907132976984 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.21, + "acc_stderr": 0.040936018074033256, + "acc_norm": 0.21, + "acc_norm_stderr": 0.040936018074033256 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.3151515151515151, + "acc_stderr": 0.0362773057502241, + "acc_norm": 0.3151515151515151, + "acc_norm_stderr": 0.0362773057502241 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.3333333333333333, + "acc_stderr": 0.03358618145732523, + "acc_norm": 0.3333333333333333, + "acc_norm_stderr": 0.03358618145732523 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.29015544041450775, + "acc_stderr": 0.03275264467791516, + "acc_norm": 0.29015544041450775, + "acc_norm_stderr": 0.03275264467791516 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.3487179487179487, + "acc_stderr": 0.02416278028401772, + "acc_norm": 
0.3487179487179487, + "acc_norm_stderr": 0.02416278028401772 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.2777777777777778, + "acc_stderr": 0.02730914058823018, + "acc_norm": 0.2777777777777778, + "acc_norm_stderr": 0.02730914058823018 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.2184873949579832, + "acc_stderr": 0.026841514322958948, + "acc_norm": 0.2184873949579832, + "acc_norm_stderr": 0.026841514322958948 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.2781456953642384, + "acc_stderr": 0.03658603262763743, + "acc_norm": 0.2781456953642384, + "acc_norm_stderr": 0.03658603262763743 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.30825688073394497, + "acc_stderr": 0.019798366698367268, + "acc_norm": 0.30825688073394497, + "acc_norm_stderr": 0.019798366698367268 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.4444444444444444, + "acc_stderr": 0.03388857118502325, + "acc_norm": 0.4444444444444444, + "acc_norm_stderr": 0.03388857118502325 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.21568627450980393, + "acc_stderr": 0.028867431449849313, + "acc_norm": 0.21568627450980393, + "acc_norm_stderr": 0.028867431449849313 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.189873417721519, + "acc_stderr": 0.025530100460233504, + "acc_norm": 0.189873417721519, + "acc_norm_stderr": 0.025530100460233504 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.13004484304932734, + "acc_stderr": 0.02257451942417487, + "acc_norm": 0.13004484304932734, + "acc_norm_stderr": 0.02257451942417487 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.26717557251908397, + "acc_stderr": 0.03880848301082396, + "acc_norm": 0.26717557251908397, + "acc_norm_stderr": 0.03880848301082396 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.2231404958677686, + "acc_stderr": 0.03800754475228733, + "acc_norm": 0.2231404958677686, + "acc_norm_stderr": 0.03800754475228733 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.26851851851851855, + "acc_stderr": 0.04284467968052192, + "acc_norm": 0.26851851851851855, + "acc_norm_stderr": 0.04284467968052192 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.25766871165644173, + "acc_stderr": 0.03436150827846917, + "acc_norm": 0.25766871165644173, + "acc_norm_stderr": 0.03436150827846917 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.25892857142857145, + "acc_stderr": 0.04157751539865629, + "acc_norm": 0.25892857142857145, + "acc_norm_stderr": 0.04157751539865629 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.3592233009708738, + "acc_stderr": 0.04750458399041693, + "acc_norm": 0.3592233009708738, + "acc_norm_stderr": 0.04750458399041693 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.2777777777777778, + "acc_stderr": 0.02934311479809445, + "acc_norm": 0.2777777777777778, + "acc_norm_stderr": 0.02934311479809445 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.24, + "acc_stderr": 0.04292346959909282, + "acc_norm": 0.24, + "acc_norm_stderr": 0.04292346959909282 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.2707535121328225, + "acc_stderr": 0.01588988836256049, + "acc_norm": 0.2707535121328225, + "acc_norm_stderr": 0.01588988836256049 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.25722543352601157, + "acc_stderr": 0.023532925431044276, + "acc_norm": 0.25722543352601157, + "acc_norm_stderr": 
0.023532925431044276 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.24804469273743016, + "acc_stderr": 0.014444157808261469, + "acc_norm": 0.24804469273743016, + "acc_norm_stderr": 0.014444157808261469 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.26143790849673204, + "acc_stderr": 0.025160998214292456, + "acc_norm": 0.26143790849673204, + "acc_norm_stderr": 0.025160998214292456 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.18006430868167203, + "acc_stderr": 0.021823422857744953, + "acc_norm": 0.18006430868167203, + "acc_norm_stderr": 0.021823422857744953 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.24691358024691357, + "acc_stderr": 0.02399350170904211, + "acc_norm": 0.24691358024691357, + "acc_norm_stderr": 0.02399350170904211 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.23404255319148937, + "acc_stderr": 0.025257861359432407, + "acc_norm": 0.23404255319148937, + "acc_norm_stderr": 0.025257861359432407 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.2503259452411995, + "acc_stderr": 0.011064151027165434, + "acc_norm": 0.2503259452411995, + "acc_norm_stderr": 0.011064151027165434 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.34191176470588236, + "acc_stderr": 0.02881472242225417, + "acc_norm": 0.34191176470588236, + "acc_norm_stderr": 0.02881472242225417 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.24509803921568626, + "acc_stderr": 0.017401816711427657, + "acc_norm": 0.24509803921568626, + "acc_norm_stderr": 0.017401816711427657 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.21818181818181817, + "acc_stderr": 0.03955932861795833, + "acc_norm": 0.21818181818181817, + "acc_norm_stderr": 0.03955932861795833 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.33877551020408164, + "acc_stderr": 0.030299506562154185, + "acc_norm": 0.33877551020408164, + "acc_norm_stderr": 0.030299506562154185 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.23880597014925373, + "acc_stderr": 0.030147775935409217, + "acc_norm": 0.23880597014925373, + "acc_norm_stderr": 0.030147775935409217 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.21, + "acc_stderr": 0.040936018074033256, + "acc_norm": 0.21, + "acc_norm_stderr": 0.040936018074033256 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.2710843373493976, + "acc_stderr": 0.03460579907553027, + "acc_norm": 0.2710843373493976, + "acc_norm_stderr": 0.03460579907553027 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.2982456140350877, + "acc_stderr": 0.03508771929824563, + "acc_norm": 0.2982456140350877, + "acc_norm_stderr": 0.03508771929824563 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.2350061199510404, + "mc1_stderr": 0.01484306150773162, + "mc2": 0.3911759819393894, + "mc2_stderr": 0.014209348354116707 + }, + "all": { + "acc": 0.2702104699407836, + "acc_stderr": 0.03206267177660607, + "acc_norm": 0.2723837797096797, + "acc_norm_stderr": 0.03206926157494483, + "mc1": 0.2350061199510404, + "mc1_stderr": 0.01484306150773162, + "mc2": 0.3911759819393894, + "mc2_stderr": 0.014209348354116707 + } + }, + "versions": { + "harness|arc:challenge|25": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, 
+ "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "all": 0 + }, + "config_tasks": { + "harness|arc:challenge": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness 
task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task" + }, + "summary_tasks": { + "harness|arc:challenge|25": { + "hashes": { + "hash_examples": "17b0cae357c0259e", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "e641be907f06d33d", + "hash_cont_tokens": "ed17e576dbafa5da" + }, + "truncated": 1568, + "non-truncated": 3119, + "padded": 3087, + "non-padded": 1600, + "effective_few_shots": 25.0, + "num_truncated_few_shots": 0 + }, + "harness|hellaswag|10": { + "hashes": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "faab28c8a52792fc", + "hash_cont_tokens": "0875c25c8fc0a94d" + }, + "truncated": 1975, + "non-truncated": 38193, + 
"padded": 38021, + "non-padded": 2147, + "effective_few_shots": 10.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hashes": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "38f6980885e34dfd", + "hash_cont_tokens": "844bd0bf669e8136" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-anatomy|5": { + "hashes": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "3ed9431cd09b2a53", + "hash_cont_tokens": "aa3ffb1a6e4356f5" + }, + "truncated": 0, + "non-truncated": 540, + "padded": 540, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-astronomy|5": { + "hashes": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "a79fd75ecff4dacc", + "hash_cont_tokens": "18cfffb76bc8f0d1" + }, + "truncated": 0, + "non-truncated": 608, + "padded": 608, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-business_ethics|5": { + "hashes": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "178d5666661bf5e1", + "hash_cont_tokens": "844bd0bf669e8136" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hashes": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "c926698f7ff06973", + "hash_cont_tokens": "cd61f7de0830a75a" + }, + "truncated": 0, + "non-truncated": 1060, + "padded": 1060, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_biology|5": { + "hashes": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "242f772c5e78312a", + "hash_cont_tokens": "16b3626c8a5e3797" + }, + "truncated": 0, + "non-truncated": 576, + "padded": 568, + "non-padded": 8, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_chemistry|5": { + "hashes": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "8502d8627d2d7aad", + "hash_cont_tokens": "844bd0bf669e8136" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_computer_science|5": { + "hashes": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "a0d705ea2c235707", + "hash_cont_tokens": "844bd0bf669e8136" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_mathematics|5": { + "hashes": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "ff09ef7f164943cd", + "hash_cont_tokens": "844bd0bf669e8136" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_medicine|5": { + 
"hashes": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "aca3949388066394", + "hash_cont_tokens": "62bb469d2a319d91" + }, + "truncated": 20, + "non-truncated": 672, + "padded": 660, + "non-padded": 32, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_physics|5": { + "hashes": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "c4240f372187f487", + "hash_cont_tokens": "bf103c9a1f61ec12" + }, + "truncated": 0, + "non-truncated": 408, + "padded": 404, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-computer_security|5": { + "hashes": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "70a866a1c6ae11ae", + "hash_cont_tokens": "844bd0bf669e8136" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hashes": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "29b68a5b3f3afa5f", + "hash_cont_tokens": "ff5ca3d84bb47a0b" + }, + "truncated": 0, + "non-truncated": 940, + "padded": 940, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-econometrics|5": { + "hashes": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "a4a0fc579875cdf9", + "hash_cont_tokens": "21f0989f5760198a" + }, + "truncated": 0, + "non-truncated": 456, + "padded": 456, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hashes": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "e1c0ec634eb17ebd", + "hash_cont_tokens": "35bf6c0c1a7ee403" + }, + "truncated": 0, + "non-truncated": 580, + "padded": 580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hashes": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "542453ad0f99dacf", + "hash_cont_tokens": "f7d801bfd913884d" + }, + "truncated": 0, + "non-truncated": 1512, + "padded": 1488, + "non-padded": 24, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-formal_logic|5": { + "hashes": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "dacff0458f665ef2", + "hash_cont_tokens": "23f9089575432d5a" + }, + "truncated": 0, + "non-truncated": 504, + "padded": 504, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-global_facts|5": { + "hashes": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "61dec75d557c2e93", + "hash_cont_tokens": "844bd0bf669e8136" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_biology|5": { + "hashes": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "d0afdf91820cacc8", + 
"hash_cont_tokens": "04b8293f2ab7fbbf" + }, + "truncated": 0, + "non-truncated": 1240, + "padded": 1240, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hashes": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "75cd47b5490da17b", + "hash_cont_tokens": "c3deabee1deab3a3" + }, + "truncated": 0, + "non-truncated": 812, + "padded": 796, + "non-padded": 16, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hashes": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "e369e98a1d0a7424", + "hash_cont_tokens": "844bd0bf669e8136" + }, + "truncated": 16, + "non-truncated": 384, + "padded": 384, + "non-padded": 16, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hashes": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "502376958174bf81", + "hash_cont_tokens": "c4f2565ca36881d5" + }, + "truncated": 660, + "non-truncated": 0, + "padded": 0, + "non-padded": 660, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_geography|5": { + "hashes": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "a4866b51f8a7a60e", + "hash_cont_tokens": "780e569058de22be" + }, + "truncated": 0, + "non-truncated": 792, + "padded": 792, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hashes": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "90f755f89d9fdf5e", + "hash_cont_tokens": "7994d94bfa36d003" + }, + "truncated": 0, + "non-truncated": 772, + "padded": 772, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hashes": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "fb590ff6d9d11883", + "hash_cont_tokens": "8f5c8baf02161f10" + }, + "truncated": 0, + "non-truncated": 1560, + "padded": 1560, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hashes": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "551dbc75535ad2b8", + "hash_cont_tokens": "a2c91752be5b1798" + }, + "truncated": 0, + "non-truncated": 1080, + "padded": 1080, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hashes": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "d86fdf5706ec717c", + "hash_cont_tokens": "985403b262df21a4" + }, + "truncated": 0, + "non-truncated": 952, + "padded": 940, + "non-padded": 12, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_physics|5": { + "hashes": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "a81bca26abd92c41", + "hash_cont_tokens": "db71da66ed82b921" + }, + "truncated": 0, + 
"non-truncated": 604, + "padded": 604, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hashes": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "9c10077b5cda495b", + "hash_cont_tokens": "e81cf9738ad7e157" + }, + "truncated": 0, + "non-truncated": 2180, + "padded": 2180, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hashes": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "da0c215d66d16d3e", + "hash_cont_tokens": "4a2d5f00cb00d9b7" + }, + "truncated": 4, + "non-truncated": 860, + "padded": 860, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hashes": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "4885a382517deebf", + "hash_cont_tokens": "eab825cf8fbdd085" + }, + "truncated": 816, + "non-truncated": 0, + "padded": 0, + "non-padded": 816, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hashes": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "c1d80e899c4c8872", + "hash_cont_tokens": "e9bcfaa6beefb456" + }, + "truncated": 948, + "non-truncated": 0, + "padded": 0, + "non-padded": 948, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_aging|5": { + "hashes": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "39da19ee58ce07e6", + "hash_cont_tokens": "38eafdb22e9fca11" + }, + "truncated": 0, + "non-truncated": 892, + "padded": 892, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_sexuality|5": { + "hashes": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "f7e0441ab1c223e0", + "hash_cont_tokens": "11de075f88fc7cd2" + }, + "truncated": 0, + "non-truncated": 524, + "padded": 524, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-international_law|5": { + "hashes": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "119859c5b8103d0b", + "hash_cont_tokens": "6f8215a3de7eebd1" + }, + "truncated": 0, + "non-truncated": 484, + "padded": 484, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-jurisprudence|5": { + "hashes": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "6ec4910e741606cb", + "hash_cont_tokens": "5c77c6f472688075" + }, + "truncated": 0, + "non-truncated": 432, + "padded": 432, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hashes": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "96d8b2554f777e3a", + "hash_cont_tokens": "25a46284b3589e0d" + }, + "truncated": 0, + "non-truncated": 652, + "padded": 636, + "non-padded": 16, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + 
"harness|hendrycksTest-machine_learning|5": { + "hashes": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "249811a7d891a411", + "hash_cont_tokens": "aacac708cd4c5a61" + }, + "truncated": 0, + "non-truncated": 448, + "padded": 448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-management|5": { + "hashes": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "e54df495ffeb4f92", + "hash_cont_tokens": "d37808f586a9e9b5" + }, + "truncated": 0, + "non-truncated": 412, + "padded": 412, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-marketing|5": { + "hashes": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "e9110fe64f420eb5", + "hash_cont_tokens": "95faf210efa02f90" + }, + "truncated": 0, + "non-truncated": 936, + "padded": 936, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-medical_genetics|5": { + "hashes": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "743df5701590c1c5", + "hash_cont_tokens": "844bd0bf669e8136" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-miscellaneous|5": { + "hashes": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "4a20a40ea36bad2d", + "hash_cont_tokens": "ef1ae838a09a7521" + }, + "truncated": 0, + "non-truncated": 3132, + "padded": 3132, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_disputes|5": { + "hashes": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "10886977e5516586", + "hash_cont_tokens": "16b6c6e390eb7cea" + }, + "truncated": 0, + "non-truncated": 1384, + "padded": 1372, + "non-padded": 12, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hashes": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "66f56ab7c3b9d662", + "hash_cont_tokens": "4130880a19c4edb0" + }, + "truncated": 0, + "non-truncated": 3580, + "padded": 3580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-nutrition|5": { + "hashes": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "c05c54560499ea35", + "hash_cont_tokens": "96b81f570a84328b" + }, + "truncated": 0, + "non-truncated": 1224, + "padded": 1224, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-philosophy|5": { + "hashes": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "9639c3d92ff98a28", + "hash_cont_tokens": "dddff9925c9b675a" + }, + "truncated": 0, + "non-truncated": 1244, + "padded": 1244, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-prehistory|5": { + "hashes": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "91e98834c3a8d8d9", + 
"hash_cont_tokens": "e3a7592f84b44888" + }, + "truncated": 0, + "non-truncated": 1296, + "padded": 1296, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_accounting|5": { + "hashes": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "569fa47691c73088", + "hash_cont_tokens": "f9edf462e8201551" + }, + "truncated": 0, + "non-truncated": 1128, + "padded": 1124, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_law|5": { + "hashes": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "d93d397bd5db1db6", + "hash_cont_tokens": "a2de48df0afbaff7" + }, + "truncated": 6136, + "non-truncated": 0, + "padded": 0, + "non-padded": 6136, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_medicine|5": { + "hashes": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "7f8acbbde12cfb6b", + "hash_cont_tokens": "ecf7754754c2bb76" + }, + "truncated": 1032, + "non-truncated": 56, + "padded": 48, + "non-padded": 1040, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_psychology|5": { + "hashes": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "3aa766c029099569", + "hash_cont_tokens": "30b07e31cf9b5c6f" + }, + "truncated": 0, + "non-truncated": 2448, + "padded": 2448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-public_relations|5": { + "hashes": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "87b924f88832986f", + "hash_cont_tokens": "cf3600a50782c6c5" + }, + "truncated": 0, + "non-truncated": 440, + "padded": 440, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-security_studies|5": { + "hashes": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "1aaa84da588878a6", + "hash_cont_tokens": "4d1dc7c4ad251829" + }, + "truncated": 980, + "non-truncated": 0, + "padded": 0, + "non-padded": 980, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-sociology|5": { + "hashes": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "fb555df6139eb2c8", + "hash_cont_tokens": "d36b9d9f0f4424fe" + }, + "truncated": 0, + "non-truncated": 804, + "padded": 800, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hashes": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "56cf1eebb25eccb1", + "hash_cont_tokens": "844bd0bf669e8136" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-virology|5": { + "hashes": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "c6affac16ec860be", + "hash_cont_tokens": "30d4fa4828c5468f" + }, + "truncated": 0, + "non-truncated": 664, + "padded": 664, + "non-padded": 0, + 
"effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-world_religions|5": { + "hashes": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "d2c5da5a69a6312e", + "hash_cont_tokens": "a0a7af55ac7ae037" + }, + "truncated": 0, + "non-truncated": 684, + "padded": 684, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|truthfulqa:mc|0": { + "hashes": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "21ee2f46c9c3649e", + "hash_cont_tokens": "84fd36aa004c8578" + }, + "truncated": 0, + "non-truncated": 9996, + "padded": 9996, + "non-padded": 0, + "effective_few_shots": 0.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "d84d18e9a963753d", + "hash_full_prompts": "12b540783521a8e6", + "hash_input_tokens": "18a3fbefef0c4910", + "hash_cont_tokens": "24012b7d40528568" + }, + "total_evaluation_time_secondes": "2005.0589463710785", + "truncated": 14155, + "non-truncated": 96864, + "padded": 96540, + "non-padded": 14479, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/lgaalves/gpt-2-xl_camel-ai-physics/results_2023-10-25T20-38-31.656182.json b/eval-results/lgaalves/gpt-2-xl_camel-ai-physics/results_2023-10-25T20-38-31.656182.json new file mode 100644 index 0000000000000000000000000000000000000000..24733d1ff75d1049abba2d948c0e5682ec259887 --- /dev/null +++ b/eval-results/lgaalves/gpt-2-xl_camel-ai-physics/results_2023-10-25T20-38-31.656182.json @@ -0,0 +1,107 @@ +{ + "config_general": { + "model_name": "lgaalves/gpt-2-xl_camel-ai-physics", + "model_sha": "87439b2c38586865acc3135d4c189efc7746763a", + "model_size": "2.91 GB", + "model_dtype": "torch.float16", + "lighteval_sha": "0f318ecf002208468154899217b3ba7c6ae09374", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|drop|3": { + "em": 0.002202181208053691, + "em_stderr": 0.0004800510816619256, + "f1": 0.05571623322147659, + "f1_stderr": 0.001366603872793856 + }, + "harness|gsm8k|5": { + "acc": 0.001516300227445034, + "acc_stderr": 0.001071779348549263 + }, + "harness|winogrande|5": { + "acc": 0.5753749013417522, + "acc_stderr": 0.013891893150264225 + }, + "all": { + "em": 0.002202181208053691, + "em_stderr": 0.0004800510816619256, + "f1": 0.05571623322147659, + "f1_stderr": 0.001366603872793856, + "acc": 0.28844560078459863, + "acc_stderr": 0.007481836249406744 + } + }, + "versions": { + "harness|drop|3": 1, + "harness|gsm8k|5": 0, + "harness|winogrande|5": 0, + "all": 0 + }, + "config_tasks": { + "harness|drop": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|drop|3": { + "hashes": { + "hash_examples": "1d27416e8324e9a3", + "hash_full_prompts": "a5513ff9a741b385", + "hash_input_tokens": "fa7a2e45b0104bc4", + "hash_cont_tokens": "30025ac5d87613de" + }, + "truncated": 9290, + "non-truncated": 246, + "padded": 0, + "non-padded": 9536, + "effective_few_shots": 3.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "52733972d41ebb11", + "hash_cont_tokens": "696da1602df4120d" + }, + "truncated": 917, + "non-truncated": 402, + "padded": 0, + "non-padded": 1319, + 
"effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "84cacac1590bb0a5", + "hash_cont_tokens": "64ca3ed9b5dacc6e" + }, + "truncated": 0, + "non-truncated": 2534, + "padded": 2426, + "non-padded": 108, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "9b4d8993161e637d", + "hash_full_prompts": "08215e527b7e60a5", + "hash_input_tokens": "6b3de69e2f87348c", + "hash_cont_tokens": "c2e5f61e8ec5f4d4" + }, + "total_evaluation_time_secondes": "17725.59523510933", + "truncated": 10207, + "non-truncated": 3182, + "padded": 2426, + "non-padded": 10963, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/lgaalves/gpt2-dolly/results_2023-08-09T12-04-01.298115.json b/eval-results/lgaalves/gpt2-dolly/results_2023-08-09T12-04-01.298115.json new file mode 100644 index 0000000000000000000000000000000000000000..6223303323a8933debf41f8b6324af6ba231f6bf --- /dev/null +++ b/eval-results/lgaalves/gpt2-dolly/results_2023-08-09T12-04-01.298115.json @@ -0,0 +1,1365 @@ +{ + "results": { + "harness|arc:challenge|25": { + "acc": 0.19795221843003413, + "acc_stderr": 0.011643990971573393, + "acc_norm": 0.2175767918088737, + "acc_norm_stderr": 0.0120572620209725 + }, + "harness|hellaswag|10": { + "acc": 0.29028082055367455, + "acc_stderr": 0.00452964282854641, + "acc_norm": 0.30770762796255724, + "acc_norm_stderr": 0.004606015773125624 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.2, + "acc_stderr": 0.04020151261036846, + "acc_norm": 0.2, + "acc_norm_stderr": 0.04020151261036846 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.2740740740740741, + "acc_stderr": 0.03853254836552004, + "acc_norm": 0.2740740740740741, + "acc_norm_stderr": 0.03853254836552004 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.17763157894736842, + "acc_stderr": 0.031103182383123398, + "acc_norm": 0.17763157894736842, + "acc_norm_stderr": 0.031103182383123398 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.21, + "acc_stderr": 0.04093601807403326, + "acc_norm": 0.21, + "acc_norm_stderr": 0.04093601807403326 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.21509433962264152, + "acc_stderr": 0.025288394502891366, + "acc_norm": 0.21509433962264152, + "acc_norm_stderr": 0.025288394502891366 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.2569444444444444, + "acc_stderr": 0.03653946969442099, + "acc_norm": 0.2569444444444444, + "acc_norm_stderr": 0.03653946969442099 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.18, + "acc_stderr": 0.038612291966536955, + "acc_norm": 0.18, + "acc_norm_stderr": 0.038612291966536955 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.23, + "acc_stderr": 0.04229525846816506, + "acc_norm": 0.23, + "acc_norm_stderr": 0.04229525846816506 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.21, + "acc_stderr": 0.040936018074033256, + "acc_norm": 0.21, + "acc_norm_stderr": 0.040936018074033256 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.1791907514450867, + "acc_stderr": 0.02924251305906329, + "acc_norm": 0.1791907514450867, + "acc_norm_stderr": 0.02924251305906329 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.22549019607843138, + "acc_stderr": 0.041583075330832865, + "acc_norm": 
0.22549019607843138, + "acc_norm_stderr": 0.041583075330832865 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.25, + "acc_stderr": 0.04351941398892446, + "acc_norm": 0.25, + "acc_norm_stderr": 0.04351941398892446 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.2680851063829787, + "acc_stderr": 0.028957342788342347, + "acc_norm": 0.2680851063829787, + "acc_norm_stderr": 0.028957342788342347 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.2894736842105263, + "acc_stderr": 0.04266339443159394, + "acc_norm": 0.2894736842105263, + "acc_norm_stderr": 0.04266339443159394 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.2413793103448276, + "acc_stderr": 0.03565998174135303, + "acc_norm": 0.2413793103448276, + "acc_norm_stderr": 0.03565998174135303 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.23015873015873015, + "acc_stderr": 0.021679219663693145, + "acc_norm": 0.23015873015873015, + "acc_norm_stderr": 0.021679219663693145 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.1349206349206349, + "acc_stderr": 0.030557101589417508, + "acc_norm": 0.1349206349206349, + "acc_norm_stderr": 0.030557101589417508 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.18, + "acc_stderr": 0.038612291966536934, + "acc_norm": 0.18, + "acc_norm_stderr": 0.038612291966536934 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.2967741935483871, + "acc_stderr": 0.025988500792411898, + "acc_norm": 0.2967741935483871, + "acc_norm_stderr": 0.025988500792411898 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.24630541871921183, + "acc_stderr": 0.030315099285617715, + "acc_norm": 0.24630541871921183, + "acc_norm_stderr": 0.030315099285617715 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.15, + "acc_stderr": 0.035887028128263714, + "acc_norm": 0.15, + "acc_norm_stderr": 0.035887028128263714 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.22424242424242424, + "acc_stderr": 0.03256866661681102, + "acc_norm": 0.22424242424242424, + "acc_norm_stderr": 0.03256866661681102 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.36363636363636365, + "acc_stderr": 0.034273086529999344, + "acc_norm": 0.36363636363636365, + "acc_norm_stderr": 0.034273086529999344 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.29015544041450775, + "acc_stderr": 0.03275264467791516, + "acc_norm": 0.29015544041450775, + "acc_norm_stderr": 0.03275264467791516 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.2692307692307692, + "acc_stderr": 0.022489389793654817, + "acc_norm": 0.2692307692307692, + "acc_norm_stderr": 0.022489389793654817 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.2518518518518518, + "acc_stderr": 0.026466117538959916, + "acc_norm": 0.2518518518518518, + "acc_norm_stderr": 0.026466117538959916 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.2689075630252101, + "acc_stderr": 0.028801392193631273, + "acc_norm": 0.2689075630252101, + "acc_norm_stderr": 0.028801392193631273 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.2185430463576159, + "acc_stderr": 0.03374235550425694, + "acc_norm": 0.2185430463576159, + "acc_norm_stderr": 0.03374235550425694 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.3155963302752294, + "acc_stderr": 0.019926117513869666, + "acc_norm": 
0.3155963302752294, + "acc_norm_stderr": 0.019926117513869666 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.4351851851851852, + "acc_stderr": 0.03381200005643525, + "acc_norm": 0.4351851851851852, + "acc_norm_stderr": 0.03381200005643525 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.2696078431372549, + "acc_stderr": 0.031145570659486782, + "acc_norm": 0.2696078431372549, + "acc_norm_stderr": 0.031145570659486782 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.27848101265822783, + "acc_stderr": 0.029178682304842538, + "acc_norm": 0.27848101265822783, + "acc_norm_stderr": 0.029178682304842538 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.2600896860986547, + "acc_stderr": 0.029442495585857483, + "acc_norm": 0.2600896860986547, + "acc_norm_stderr": 0.029442495585857483 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.2595419847328244, + "acc_stderr": 0.03844876139785271, + "acc_norm": 0.2595419847328244, + "acc_norm_stderr": 0.03844876139785271 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.2396694214876033, + "acc_stderr": 0.03896878985070417, + "acc_norm": 0.2396694214876033, + "acc_norm_stderr": 0.03896878985070417 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.25, + "acc_stderr": 0.04186091791394607, + "acc_norm": 0.25, + "acc_norm_stderr": 0.04186091791394607 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.3006134969325153, + "acc_stderr": 0.03602511318806771, + "acc_norm": 0.3006134969325153, + "acc_norm_stderr": 0.03602511318806771 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.24107142857142858, + "acc_stderr": 0.04059867246952687, + "acc_norm": 0.24107142857142858, + "acc_norm_stderr": 0.04059867246952687 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.17475728155339806, + "acc_stderr": 0.037601780060266224, + "acc_norm": 0.17475728155339806, + "acc_norm_stderr": 0.037601780060266224 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.23504273504273504, + "acc_stderr": 0.027778835904935427, + "acc_norm": 0.23504273504273504, + "acc_norm_stderr": 0.027778835904935427 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.3, + "acc_stderr": 0.046056618647183814, + "acc_norm": 0.3, + "acc_norm_stderr": 0.046056618647183814 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.24010217113665389, + "acc_stderr": 0.015274685213734195, + "acc_norm": 0.24010217113665389, + "acc_norm_stderr": 0.015274685213734195 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.23699421965317918, + "acc_stderr": 0.02289408248992599, + "acc_norm": 0.23699421965317918, + "acc_norm_stderr": 0.02289408248992599 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.2424581005586592, + "acc_stderr": 0.014333522059217889, + "acc_norm": 0.2424581005586592, + "acc_norm_stderr": 0.014333522059217889 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.24836601307189543, + "acc_stderr": 0.02473998135511359, + "acc_norm": 0.24836601307189543, + "acc_norm_stderr": 0.02473998135511359 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.19614147909967847, + "acc_stderr": 0.022552447780478026, + "acc_norm": 0.19614147909967847, + "acc_norm_stderr": 0.022552447780478026 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.20679012345679013, + "acc_stderr": 0.022535006705942825, + "acc_norm": 0.20679012345679013, + "acc_norm_stderr": 0.022535006705942825 + }, + 
"harness|hendrycksTest-professional_accounting|5": { + "acc": 0.24113475177304963, + "acc_stderr": 0.025518731049537773, + "acc_norm": 0.24113475177304963, + "acc_norm_stderr": 0.025518731049537773 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.2542372881355932, + "acc_stderr": 0.011121129007840671, + "acc_norm": 0.2542372881355932, + "acc_norm_stderr": 0.011121129007840671 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.3492647058823529, + "acc_stderr": 0.02895975519682486, + "acc_norm": 0.3492647058823529, + "acc_norm_stderr": 0.02895975519682486 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.25326797385620914, + "acc_stderr": 0.01759348689536683, + "acc_norm": 0.25326797385620914, + "acc_norm_stderr": 0.01759348689536683 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.20909090909090908, + "acc_stderr": 0.038950910157241364, + "acc_norm": 0.20909090909090908, + "acc_norm_stderr": 0.038950910157241364 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.2612244897959184, + "acc_stderr": 0.02812342933514279, + "acc_norm": 0.2612244897959184, + "acc_norm_stderr": 0.02812342933514279 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.23880597014925373, + "acc_stderr": 0.030147775935409217, + "acc_norm": 0.23880597014925373, + "acc_norm_stderr": 0.030147775935409217 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.31, + "acc_stderr": 0.04648231987117316, + "acc_norm": 0.31, + "acc_norm_stderr": 0.04648231987117316 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.2289156626506024, + "acc_stderr": 0.03270745277352477, + "acc_norm": 0.2289156626506024, + "acc_norm_stderr": 0.03270745277352477 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.24561403508771928, + "acc_stderr": 0.03301405946987249, + "acc_norm": 0.24561403508771928, + "acc_norm_stderr": 0.03301405946987249 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.2423500611995104, + "mc1_stderr": 0.01500067437357034, + "mc2": 0.42224597537560177, + "mc2_stderr": 0.014881649016730558 + }, + "all": { + "acc": 0.24648113999514087, + "acc_stderr": 0.031053730074742582, + "acc_norm": 0.24710912950001754, + "acc_norm_stderr": 0.03106202912548798, + "mc1": 0.2423500611995104, + "mc1_stderr": 0.01500067437357034, + "mc2": 0.42224597537560177, + "mc2_stderr": 0.014881649016730558 + } + }, + "versions": { + "harness|arc:challenge|25": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + 
"harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "all": 0 + }, + "config_general": { + "model_name": "lgaalves/gpt2-dolly", + "model_sha": "52fcf61a8eef255a981be6efde187481086e1a48", + "model_dtype": "torch.float16", + "lighteval_sha": "da839e70121267a9bf55a0fbea4fb2fae2948337", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null + }, + "config_tasks": { + "harness|arc:challenge": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM 
Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task" + }, + "summary_tasks": { + "harness|arc:challenge|25": { + "hashes": { + "hash_examples": "17b0cae357c0259e", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "e641be907f06d33d", + "hash_cont_tokens": "c6e2e25e2b25a621" + }, + "truncated": 1568, + "non-truncated": 3119, + "padded": 3087, + "non-padded": 1600, + "effective_few_shots": 25.0, + "num_truncated_few_shots": 0 + }, + "harness|hellaswag|10": { + "hashes": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "faab28c8a52792fc", + "hash_cont_tokens": "8ad5f1a3e4068f36" + }, + "truncated": 1975, + "non-truncated": 38193, + "padded": 38021, + "non-padded": 2147, + "effective_few_shots": 10.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hashes": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "38f6980885e34dfd", + "hash_cont_tokens": "844bd0bf669e8136" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 
5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-anatomy|5": { + "hashes": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "3ed9431cd09b2a53", + "hash_cont_tokens": "aa3ffb1a6e4356f5" + }, + "truncated": 0, + "non-truncated": 540, + "padded": 540, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-astronomy|5": { + "hashes": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "a79fd75ecff4dacc", + "hash_cont_tokens": "ca7527d5bdfd389a" + }, + "truncated": 0, + "non-truncated": 608, + "padded": 608, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-business_ethics|5": { + "hashes": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "178d5666661bf5e1", + "hash_cont_tokens": "08a1fa6c8dde9a82" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hashes": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "c926698f7ff06973", + "hash_cont_tokens": "cd61f7de0830a75a" + }, + "truncated": 0, + "non-truncated": 1060, + "padded": 1060, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_biology|5": { + "hashes": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "242f772c5e78312a", + "hash_cont_tokens": "b0c14ed86adbcb56" + }, + "truncated": 0, + "non-truncated": 576, + "padded": 568, + "non-padded": 8, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_chemistry|5": { + "hashes": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "8502d8627d2d7aad", + "hash_cont_tokens": "844bd0bf669e8136" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_computer_science|5": { + "hashes": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "a0d705ea2c235707", + "hash_cont_tokens": "3cf1924b14cbf906" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_mathematics|5": { + "hashes": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "ff09ef7f164943cd", + "hash_cont_tokens": "d09bf08193410dfa" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_medicine|5": { + "hashes": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "aca3949388066394", + "hash_cont_tokens": "62bb469d2a319d91" + }, + "truncated": 20, + "non-truncated": 672, + "padded": 660, + "non-padded": 32, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_physics|5": { + "hashes": { + "hash_examples": "875dd26d22655b0d", + 
"hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "c4240f372187f487", + "hash_cont_tokens": "bf103c9a1f61ec12" + }, + "truncated": 0, + "non-truncated": 408, + "padded": 404, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-computer_security|5": { + "hashes": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "70a866a1c6ae11ae", + "hash_cont_tokens": "844bd0bf669e8136" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hashes": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "29b68a5b3f3afa5f", + "hash_cont_tokens": "ff5ca3d84bb47a0b" + }, + "truncated": 0, + "non-truncated": 940, + "padded": 940, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-econometrics|5": { + "hashes": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "a4a0fc579875cdf9", + "hash_cont_tokens": "f3ed369e135c0e74" + }, + "truncated": 0, + "non-truncated": 456, + "padded": 456, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hashes": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "e1c0ec634eb17ebd", + "hash_cont_tokens": "35bf6c0c1a7ee403" + }, + "truncated": 0, + "non-truncated": 580, + "padded": 580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hashes": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "542453ad0f99dacf", + "hash_cont_tokens": "e69647d0f0359a4e" + }, + "truncated": 0, + "non-truncated": 1512, + "padded": 1488, + "non-padded": 24, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-formal_logic|5": { + "hashes": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "dacff0458f665ef2", + "hash_cont_tokens": "2ef491ecaa0b411b" + }, + "truncated": 0, + "non-truncated": 504, + "padded": 504, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-global_facts|5": { + "hashes": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "61dec75d557c2e93", + "hash_cont_tokens": "844bd0bf669e8136" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_biology|5": { + "hashes": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "d0afdf91820cacc8", + "hash_cont_tokens": "2f65e8345a68d860" + }, + "truncated": 0, + "non-truncated": 1240, + "padded": 1240, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hashes": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "75cd47b5490da17b", + "hash_cont_tokens": "c3deabee1deab3a3" + }, + "truncated": 
0, + "non-truncated": 812, + "padded": 796, + "non-padded": 16, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hashes": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "e369e98a1d0a7424", + "hash_cont_tokens": "ec161287ac6222f4" + }, + "truncated": 16, + "non-truncated": 384, + "padded": 384, + "non-padded": 16, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hashes": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "502376958174bf81", + "hash_cont_tokens": "c4f2565ca36881d5" + }, + "truncated": 660, + "non-truncated": 0, + "padded": 0, + "non-padded": 660, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_geography|5": { + "hashes": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "a4866b51f8a7a60e", + "hash_cont_tokens": "780e569058de22be" + }, + "truncated": 0, + "non-truncated": 792, + "padded": 792, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hashes": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "90f755f89d9fdf5e", + "hash_cont_tokens": "9da45062757ae791" + }, + "truncated": 0, + "non-truncated": 772, + "padded": 772, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hashes": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "fb590ff6d9d11883", + "hash_cont_tokens": "8f5c8baf02161f10" + }, + "truncated": 0, + "non-truncated": 1560, + "padded": 1560, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hashes": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "551dbc75535ad2b8", + "hash_cont_tokens": "fdea101837ab4409" + }, + "truncated": 0, + "non-truncated": 1080, + "padded": 1080, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hashes": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "d86fdf5706ec717c", + "hash_cont_tokens": "985403b262df21a4" + }, + "truncated": 0, + "non-truncated": 952, + "padded": 940, + "non-padded": 12, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_physics|5": { + "hashes": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "a81bca26abd92c41", + "hash_cont_tokens": "56be0c12b78c81a3" + }, + "truncated": 0, + "non-truncated": 604, + "padded": 604, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hashes": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "9c10077b5cda495b", + "hash_cont_tokens": "f524cf6fe64b2a7f" + }, + "truncated": 0, + "non-truncated": 2180, + "padded": 2180, + "non-padded": 0, + 
"effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hashes": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "da0c215d66d16d3e", + "hash_cont_tokens": "421b3dc903711e3d" + }, + "truncated": 4, + "non-truncated": 860, + "padded": 860, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hashes": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "4885a382517deebf", + "hash_cont_tokens": "eab825cf8fbdd085" + }, + "truncated": 816, + "non-truncated": 0, + "padded": 0, + "non-padded": 816, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hashes": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "c1d80e899c4c8872", + "hash_cont_tokens": "e1610a0b694e7b3a" + }, + "truncated": 948, + "non-truncated": 0, + "padded": 0, + "non-padded": 948, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_aging|5": { + "hashes": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "39da19ee58ce07e6", + "hash_cont_tokens": "38eafdb22e9fca11" + }, + "truncated": 0, + "non-truncated": 892, + "padded": 892, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_sexuality|5": { + "hashes": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "f7e0441ab1c223e0", + "hash_cont_tokens": "11de075f88fc7cd2" + }, + "truncated": 0, + "non-truncated": 524, + "padded": 524, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-international_law|5": { + "hashes": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "119859c5b8103d0b", + "hash_cont_tokens": "0229c63f045574c2" + }, + "truncated": 0, + "non-truncated": 484, + "padded": 484, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-jurisprudence|5": { + "hashes": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "6ec4910e741606cb", + "hash_cont_tokens": "5c77c6f472688075" + }, + "truncated": 0, + "non-truncated": 432, + "padded": 432, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hashes": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "96d8b2554f777e3a", + "hash_cont_tokens": "25a46284b3589e0d" + }, + "truncated": 0, + "non-truncated": 652, + "padded": 636, + "non-padded": 16, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-machine_learning|5": { + "hashes": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "249811a7d891a411", + "hash_cont_tokens": "d11f2c877fe691dc" + }, + "truncated": 0, + "non-truncated": 448, + "padded": 448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-management|5": { + "hashes": { + "hash_examples": 
"8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "e54df495ffeb4f92", + "hash_cont_tokens": "d37808f586a9e9b5" + }, + "truncated": 0, + "non-truncated": 412, + "padded": 412, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-marketing|5": { + "hashes": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "e9110fe64f420eb5", + "hash_cont_tokens": "95faf210efa02f90" + }, + "truncated": 0, + "non-truncated": 936, + "padded": 936, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-medical_genetics|5": { + "hashes": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "743df5701590c1c5", + "hash_cont_tokens": "844bd0bf669e8136" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-miscellaneous|5": { + "hashes": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "4a20a40ea36bad2d", + "hash_cont_tokens": "ef1ae838a09a7521" + }, + "truncated": 0, + "non-truncated": 3132, + "padded": 3132, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_disputes|5": { + "hashes": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "10886977e5516586", + "hash_cont_tokens": "05c35d0e7dd2c7d4" + }, + "truncated": 0, + "non-truncated": 1384, + "padded": 1372, + "non-padded": 12, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hashes": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "66f56ab7c3b9d662", + "hash_cont_tokens": "f1e9e326e9540108" + }, + "truncated": 0, + "non-truncated": 3580, + "padded": 3580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-nutrition|5": { + "hashes": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "c05c54560499ea35", + "hash_cont_tokens": "027ac34198453c9e" + }, + "truncated": 0, + "non-truncated": 1224, + "padded": 1224, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-philosophy|5": { + "hashes": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "9639c3d92ff98a28", + "hash_cont_tokens": "dddff9925c9b675a" + }, + "truncated": 0, + "non-truncated": 1244, + "padded": 1244, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-prehistory|5": { + "hashes": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "91e98834c3a8d8d9", + "hash_cont_tokens": "030e5bb46551865c" + }, + "truncated": 0, + "non-truncated": 1296, + "padded": 1296, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_accounting|5": { + "hashes": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "569fa47691c73088", + "hash_cont_tokens": "42b23299e8bae480" + }, + "truncated": 0, + 
"non-truncated": 1128, + "padded": 1124, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_law|5": { + "hashes": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "d93d397bd5db1db6", + "hash_cont_tokens": "a2de48df0afbaff7" + }, + "truncated": 6136, + "non-truncated": 0, + "padded": 0, + "non-padded": 6136, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_medicine|5": { + "hashes": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "7f8acbbde12cfb6b", + "hash_cont_tokens": "33dc7eccd5de31ae" + }, + "truncated": 1032, + "non-truncated": 56, + "padded": 48, + "non-padded": 1040, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_psychology|5": { + "hashes": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "3aa766c029099569", + "hash_cont_tokens": "2a666dc39f1f52ac" + }, + "truncated": 0, + "non-truncated": 2448, + "padded": 2448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-public_relations|5": { + "hashes": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "87b924f88832986f", + "hash_cont_tokens": "cf3600a50782c6c5" + }, + "truncated": 0, + "non-truncated": 440, + "padded": 440, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-security_studies|5": { + "hashes": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "1aaa84da588878a6", + "hash_cont_tokens": "2e9916279a4cae95" + }, + "truncated": 980, + "non-truncated": 0, + "padded": 0, + "non-padded": 980, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-sociology|5": { + "hashes": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "fb555df6139eb2c8", + "hash_cont_tokens": "555f7a55738bbf37" + }, + "truncated": 0, + "non-truncated": 804, + "padded": 800, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hashes": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "56cf1eebb25eccb1", + "hash_cont_tokens": "844bd0bf669e8136" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-virology|5": { + "hashes": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "c6affac16ec860be", + "hash_cont_tokens": "30d4fa4828c5468f" + }, + "truncated": 0, + "non-truncated": 664, + "padded": 664, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-world_religions|5": { + "hashes": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "d2c5da5a69a6312e", + "hash_cont_tokens": "984061eb58124367" + }, + "truncated": 0, + "non-truncated": 684, + "padded": 684, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|truthfulqa:mc|0": { 
+ "hashes": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "21ee2f46c9c3649e", + "hash_cont_tokens": "f41d0880e9a23f4e" + }, + "truncated": 0, + "non-truncated": 9996, + "padded": 9996, + "non-padded": 0, + "effective_few_shots": 0.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "d84d18e9a963753d", + "hash_full_prompts": "12b540783521a8e6", + "hash_input_tokens": "18a3fbefef0c4910", + "hash_cont_tokens": "6159bf1904a8c8fb" + }, + "total_evaluation_time_secondes": "882.9626545906067", + "truncated": 14155, + "non-truncated": 96864, + "padded": 96540, + "non-padded": 14479, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/lgaalves/gpt2-dolly/results_2023-09-21T18-57-43.248355.json b/eval-results/lgaalves/gpt2-dolly/results_2023-09-21T18-57-43.248355.json new file mode 100644 index 0000000000000000000000000000000000000000..182f7ac6a8f475b8264901513786487008fed7ec --- /dev/null +++ b/eval-results/lgaalves/gpt2-dolly/results_2023-09-21T18-57-43.248355.json @@ -0,0 +1,1367 @@ +{ + "config_general": { + "model_name": "lgaalves/gpt2-dolly", + "model_sha": "7e75e6f4626437305e4d3e7b2aa36f617c517247", + "model_size": "117.35 MB", + "model_dtype": "4bit", + "lighteval_sha": "0f318ecf002208468154899217b3ba7c6ae09374", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|arc:challenge|25": { + "acc": 0.1945392491467577, + "acc_stderr": 0.011567709174648727, + "acc_norm": 0.22696245733788395, + "acc_norm_stderr": 0.01224049153613287 + }, + "harness|hellaswag|10": { + "acc": 0.2889862577175861, + "acc_stderr": 0.004523651184016274, + "acc_norm": 0.3015335590519817, + "acc_norm_stderr": 0.004579859084500792 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.21, + "acc_stderr": 0.04093601807403326, + "acc_norm": 0.21, + "acc_norm_stderr": 0.04093601807403326 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.2740740740740741, + "acc_stderr": 0.03853254836552004, + "acc_norm": 0.2740740740740741, + "acc_norm_stderr": 0.03853254836552004 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.17763157894736842, + "acc_stderr": 0.031103182383123398, + "acc_norm": 0.17763157894736842, + "acc_norm_stderr": 0.031103182383123398 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.32, + "acc_stderr": 0.046882617226215034, + "acc_norm": 0.32, + "acc_norm_stderr": 0.046882617226215034 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.2037735849056604, + "acc_stderr": 0.024790784501775406, + "acc_norm": 0.2037735849056604, + "acc_norm_stderr": 0.024790784501775406 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.22916666666666666, + "acc_stderr": 0.035146974678623884, + "acc_norm": 0.22916666666666666, + "acc_norm_stderr": 0.035146974678623884 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.22, + "acc_stderr": 0.04163331998932269, + "acc_norm": 0.22, + "acc_norm_stderr": 0.04163331998932269 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.31, + "acc_stderr": 0.04648231987117316, + "acc_norm": 0.31, + "acc_norm_stderr": 0.04648231987117316 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.26, + "acc_stderr": 0.044084400227680794, + "acc_norm": 0.26, + "acc_norm_stderr": 0.044084400227680794 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 
0.20809248554913296, + "acc_stderr": 0.030952890217749874, + "acc_norm": 0.20809248554913296, + "acc_norm_stderr": 0.030952890217749874 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.21568627450980393, + "acc_stderr": 0.04092563958237654, + "acc_norm": 0.21568627450980393, + "acc_norm_stderr": 0.04092563958237654 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.25, + "acc_stderr": 0.04351941398892446, + "acc_norm": 0.25, + "acc_norm_stderr": 0.04351941398892446 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.2680851063829787, + "acc_stderr": 0.028957342788342347, + "acc_norm": 0.2680851063829787, + "acc_norm_stderr": 0.028957342788342347 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.23684210526315788, + "acc_stderr": 0.039994238792813365, + "acc_norm": 0.23684210526315788, + "acc_norm_stderr": 0.039994238792813365 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.30344827586206896, + "acc_stderr": 0.038312260488503336, + "acc_norm": 0.30344827586206896, + "acc_norm_stderr": 0.038312260488503336 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.19047619047619047, + "acc_stderr": 0.020223880317923854, + "acc_norm": 0.19047619047619047, + "acc_norm_stderr": 0.020223880317923854 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.14285714285714285, + "acc_stderr": 0.03129843185743808, + "acc_norm": 0.14285714285714285, + "acc_norm_stderr": 0.03129843185743808 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.18, + "acc_stderr": 0.038612291966536934, + "acc_norm": 0.18, + "acc_norm_stderr": 0.038612291966536934 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.2064516129032258, + "acc_stderr": 0.02302589961718872, + "acc_norm": 0.2064516129032258, + "acc_norm_stderr": 0.02302589961718872 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.16748768472906403, + "acc_stderr": 0.026273086047535407, + "acc_norm": 0.16748768472906403, + "acc_norm_stderr": 0.026273086047535407 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.2, + "acc_stderr": 0.04020151261036845, + "acc_norm": 0.2, + "acc_norm_stderr": 0.04020151261036845 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.23030303030303031, + "acc_stderr": 0.03287666758603488, + "acc_norm": 0.23030303030303031, + "acc_norm_stderr": 0.03287666758603488 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.36363636363636365, + "acc_stderr": 0.034273086529999344, + "acc_norm": 0.36363636363636365, + "acc_norm_stderr": 0.034273086529999344 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.3626943005181347, + "acc_stderr": 0.03469713791704371, + "acc_norm": 0.3626943005181347, + "acc_norm_stderr": 0.03469713791704371 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.3153846153846154, + "acc_stderr": 0.02355964698318994, + "acc_norm": 0.3153846153846154, + "acc_norm_stderr": 0.02355964698318994 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.2518518518518518, + "acc_stderr": 0.02646611753895991, + "acc_norm": 0.2518518518518518, + "acc_norm_stderr": 0.02646611753895991 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.31932773109243695, + "acc_stderr": 0.030283995525884396, + "acc_norm": 0.31932773109243695, + "acc_norm_stderr": 0.030283995525884396 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.31788079470198677, + 
"acc_stderr": 0.038020397601079024, + "acc_norm": 0.31788079470198677, + "acc_norm_stderr": 0.038020397601079024 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.26972477064220185, + "acc_stderr": 0.019028486711115452, + "acc_norm": 0.26972477064220185, + "acc_norm_stderr": 0.019028486711115452 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.4722222222222222, + "acc_stderr": 0.0340470532865388, + "acc_norm": 0.4722222222222222, + "acc_norm_stderr": 0.0340470532865388 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.24509803921568626, + "acc_stderr": 0.030190282453501943, + "acc_norm": 0.24509803921568626, + "acc_norm_stderr": 0.030190282453501943 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.29535864978902954, + "acc_stderr": 0.029696338713422882, + "acc_norm": 0.29535864978902954, + "acc_norm_stderr": 0.029696338713422882 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.19282511210762332, + "acc_stderr": 0.02647824096048936, + "acc_norm": 0.19282511210762332, + "acc_norm_stderr": 0.02647824096048936 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.2595419847328244, + "acc_stderr": 0.03844876139785271, + "acc_norm": 0.2595419847328244, + "acc_norm_stderr": 0.03844876139785271 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.2396694214876033, + "acc_stderr": 0.03896878985070417, + "acc_norm": 0.2396694214876033, + "acc_norm_stderr": 0.03896878985070417 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.25925925925925924, + "acc_stderr": 0.042365112580946336, + "acc_norm": 0.25925925925925924, + "acc_norm_stderr": 0.042365112580946336 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.3006134969325153, + "acc_stderr": 0.03602511318806771, + "acc_norm": 0.3006134969325153, + "acc_norm_stderr": 0.03602511318806771 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.32142857142857145, + "acc_stderr": 0.04432804055291519, + "acc_norm": 0.32142857142857145, + "acc_norm_stderr": 0.04432804055291519 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.17475728155339806, + "acc_stderr": 0.037601780060266224, + "acc_norm": 0.17475728155339806, + "acc_norm_stderr": 0.037601780060266224 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.2905982905982906, + "acc_stderr": 0.02974504857267404, + "acc_norm": 0.2905982905982906, + "acc_norm_stderr": 0.02974504857267404 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.3, + "acc_stderr": 0.046056618647183814, + "acc_norm": 0.3, + "acc_norm_stderr": 0.046056618647183814 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.2247765006385696, + "acc_stderr": 0.014927447101937164, + "acc_norm": 0.2247765006385696, + "acc_norm_stderr": 0.014927447101937164 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.24855491329479767, + "acc_stderr": 0.023267528432100174, + "acc_norm": 0.24855491329479767, + "acc_norm_stderr": 0.023267528432100174 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.24022346368715083, + "acc_stderr": 0.014288343803925293, + "acc_norm": 0.24022346368715083, + "acc_norm_stderr": 0.014288343803925293 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.2549019607843137, + "acc_stderr": 0.02495418432487991, + "acc_norm": 0.2549019607843137, + "acc_norm_stderr": 0.02495418432487991 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.2282958199356913, + "acc_stderr": 0.0238393033113982, + "acc_norm": 0.2282958199356913, 
+ "acc_norm_stderr": 0.0238393033113982 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.21604938271604937, + "acc_stderr": 0.022899162918445806, + "acc_norm": 0.21604938271604937, + "acc_norm_stderr": 0.022899162918445806 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.23049645390070922, + "acc_stderr": 0.025123739226872405, + "acc_norm": 0.23049645390070922, + "acc_norm_stderr": 0.025123739226872405 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.24641460234680573, + "acc_stderr": 0.011005971399927232, + "acc_norm": 0.24641460234680573, + "acc_norm_stderr": 0.011005971399927232 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.4227941176470588, + "acc_stderr": 0.03000856284500347, + "acc_norm": 0.4227941176470588, + "acc_norm_stderr": 0.03000856284500347 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.2679738562091503, + "acc_stderr": 0.017917974069594726, + "acc_norm": 0.2679738562091503, + "acc_norm_stderr": 0.017917974069594726 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.23636363636363636, + "acc_stderr": 0.04069306319721377, + "acc_norm": 0.23636363636363636, + "acc_norm_stderr": 0.04069306319721377 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.2816326530612245, + "acc_stderr": 0.028795185574291282, + "acc_norm": 0.2816326530612245, + "acc_norm_stderr": 0.028795185574291282 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.24378109452736318, + "acc_stderr": 0.03036049015401465, + "acc_norm": 0.24378109452736318, + "acc_norm_stderr": 0.03036049015401465 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.27, + "acc_stderr": 0.0446196043338474, + "acc_norm": 0.27, + "acc_norm_stderr": 0.0446196043338474 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.2891566265060241, + "acc_stderr": 0.03529486801511115, + "acc_norm": 0.2891566265060241, + "acc_norm_stderr": 0.03529486801511115 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.25146198830409355, + "acc_stderr": 0.033275044238468436, + "acc_norm": 0.25146198830409355, + "acc_norm_stderr": 0.033275044238468436 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.2607099143206854, + "mc1_stderr": 0.015368841620766373, + "mc2": 0.44967367968919597, + "mc2_stderr": 0.015050441244044107 + }, + "all": { + "acc": 0.25750256181985026, + "acc_stderr": 0.031566230568758215, + "acc_norm": 0.2582647738457065, + "acc_norm_stderr": 0.03157858633591023, + "mc1": 0.2607099143206854, + "mc1_stderr": 0.015368841620766373, + "mc2": 0.44967367968919597, + "mc2_stderr": 0.015050441244044107 + } + }, + "versions": { + "harness|arc:challenge|25": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + 
"harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "all": 0 + }, + "config_tasks": { + "harness|arc:challenge": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + 
"harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task" + }, + "summary_tasks": { + "harness|arc:challenge|25": { + "hashes": { + "hash_examples": "17b0cae357c0259e", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "e641be907f06d33d", + "hash_cont_tokens": "ed17e576dbafa5da" + }, + "truncated": 1568, + "non-truncated": 3119, + "padded": 3087, + "non-padded": 1600, + "effective_few_shots": 25.0, + "num_truncated_few_shots": 0 + }, + "harness|hellaswag|10": { + "hashes": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "faab28c8a52792fc", + "hash_cont_tokens": "0875c25c8fc0a94d" + }, + "truncated": 1975, + "non-truncated": 38193, + "padded": 38021, + "non-padded": 2147, + "effective_few_shots": 10.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hashes": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "38f6980885e34dfd", + "hash_cont_tokens": "844bd0bf669e8136" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-anatomy|5": { + "hashes": 
{ + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "3ed9431cd09b2a53", + "hash_cont_tokens": "aa3ffb1a6e4356f5" + }, + "truncated": 0, + "non-truncated": 540, + "padded": 540, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-astronomy|5": { + "hashes": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "a79fd75ecff4dacc", + "hash_cont_tokens": "18cfffb76bc8f0d1" + }, + "truncated": 0, + "non-truncated": 608, + "padded": 608, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-business_ethics|5": { + "hashes": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "178d5666661bf5e1", + "hash_cont_tokens": "844bd0bf669e8136" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hashes": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "c926698f7ff06973", + "hash_cont_tokens": "cd61f7de0830a75a" + }, + "truncated": 0, + "non-truncated": 1060, + "padded": 1060, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_biology|5": { + "hashes": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "242f772c5e78312a", + "hash_cont_tokens": "16b3626c8a5e3797" + }, + "truncated": 0, + "non-truncated": 576, + "padded": 568, + "non-padded": 8, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_chemistry|5": { + "hashes": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "8502d8627d2d7aad", + "hash_cont_tokens": "844bd0bf669e8136" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_computer_science|5": { + "hashes": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "a0d705ea2c235707", + "hash_cont_tokens": "844bd0bf669e8136" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_mathematics|5": { + "hashes": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "ff09ef7f164943cd", + "hash_cont_tokens": "844bd0bf669e8136" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_medicine|5": { + "hashes": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "aca3949388066394", + "hash_cont_tokens": "62bb469d2a319d91" + }, + "truncated": 20, + "non-truncated": 672, + "padded": 660, + "non-padded": 32, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_physics|5": { + "hashes": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "c4240f372187f487", + "hash_cont_tokens": 
"bf103c9a1f61ec12" + }, + "truncated": 0, + "non-truncated": 408, + "padded": 404, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-computer_security|5": { + "hashes": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "70a866a1c6ae11ae", + "hash_cont_tokens": "844bd0bf669e8136" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hashes": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "29b68a5b3f3afa5f", + "hash_cont_tokens": "ff5ca3d84bb47a0b" + }, + "truncated": 0, + "non-truncated": 940, + "padded": 940, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-econometrics|5": { + "hashes": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "a4a0fc579875cdf9", + "hash_cont_tokens": "21f0989f5760198a" + }, + "truncated": 0, + "non-truncated": 456, + "padded": 456, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hashes": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "e1c0ec634eb17ebd", + "hash_cont_tokens": "35bf6c0c1a7ee403" + }, + "truncated": 0, + "non-truncated": 580, + "padded": 580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hashes": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "542453ad0f99dacf", + "hash_cont_tokens": "f7d801bfd913884d" + }, + "truncated": 0, + "non-truncated": 1512, + "padded": 1488, + "non-padded": 24, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-formal_logic|5": { + "hashes": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "dacff0458f665ef2", + "hash_cont_tokens": "23f9089575432d5a" + }, + "truncated": 0, + "non-truncated": 504, + "padded": 504, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-global_facts|5": { + "hashes": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "61dec75d557c2e93", + "hash_cont_tokens": "844bd0bf669e8136" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_biology|5": { + "hashes": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "d0afdf91820cacc8", + "hash_cont_tokens": "04b8293f2ab7fbbf" + }, + "truncated": 0, + "non-truncated": 1240, + "padded": 1240, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hashes": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "75cd47b5490da17b", + "hash_cont_tokens": "c3deabee1deab3a3" + }, + "truncated": 0, + "non-truncated": 812, + "padded": 796, + "non-padded": 16, + "effective_few_shots": 5.0, + 
"num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hashes": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "e369e98a1d0a7424", + "hash_cont_tokens": "844bd0bf669e8136" + }, + "truncated": 16, + "non-truncated": 384, + "padded": 384, + "non-padded": 16, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hashes": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "502376958174bf81", + "hash_cont_tokens": "c4f2565ca36881d5" + }, + "truncated": 660, + "non-truncated": 0, + "padded": 0, + "non-padded": 660, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_geography|5": { + "hashes": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "a4866b51f8a7a60e", + "hash_cont_tokens": "780e569058de22be" + }, + "truncated": 0, + "non-truncated": 792, + "padded": 792, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hashes": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "90f755f89d9fdf5e", + "hash_cont_tokens": "7994d94bfa36d003" + }, + "truncated": 0, + "non-truncated": 772, + "padded": 772, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hashes": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "fb590ff6d9d11883", + "hash_cont_tokens": "8f5c8baf02161f10" + }, + "truncated": 0, + "non-truncated": 1560, + "padded": 1560, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hashes": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "551dbc75535ad2b8", + "hash_cont_tokens": "a2c91752be5b1798" + }, + "truncated": 0, + "non-truncated": 1080, + "padded": 1080, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hashes": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "d86fdf5706ec717c", + "hash_cont_tokens": "985403b262df21a4" + }, + "truncated": 0, + "non-truncated": 952, + "padded": 940, + "non-padded": 12, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_physics|5": { + "hashes": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "a81bca26abd92c41", + "hash_cont_tokens": "db71da66ed82b921" + }, + "truncated": 0, + "non-truncated": 604, + "padded": 604, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hashes": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "9c10077b5cda495b", + "hash_cont_tokens": "e81cf9738ad7e157" + }, + "truncated": 0, + "non-truncated": 2180, + "padded": 2180, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + 
"harness|hendrycksTest-high_school_statistics|5": { + "hashes": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "da0c215d66d16d3e", + "hash_cont_tokens": "4a2d5f00cb00d9b7" + }, + "truncated": 4, + "non-truncated": 860, + "padded": 860, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hashes": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "4885a382517deebf", + "hash_cont_tokens": "eab825cf8fbdd085" + }, + "truncated": 816, + "non-truncated": 0, + "padded": 0, + "non-padded": 816, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hashes": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "c1d80e899c4c8872", + "hash_cont_tokens": "e9bcfaa6beefb456" + }, + "truncated": 948, + "non-truncated": 0, + "padded": 0, + "non-padded": 948, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_aging|5": { + "hashes": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "39da19ee58ce07e6", + "hash_cont_tokens": "38eafdb22e9fca11" + }, + "truncated": 0, + "non-truncated": 892, + "padded": 892, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_sexuality|5": { + "hashes": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "f7e0441ab1c223e0", + "hash_cont_tokens": "11de075f88fc7cd2" + }, + "truncated": 0, + "non-truncated": 524, + "padded": 524, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-international_law|5": { + "hashes": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "119859c5b8103d0b", + "hash_cont_tokens": "6f8215a3de7eebd1" + }, + "truncated": 0, + "non-truncated": 484, + "padded": 484, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-jurisprudence|5": { + "hashes": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "6ec4910e741606cb", + "hash_cont_tokens": "5c77c6f472688075" + }, + "truncated": 0, + "non-truncated": 432, + "padded": 432, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hashes": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "96d8b2554f777e3a", + "hash_cont_tokens": "25a46284b3589e0d" + }, + "truncated": 0, + "non-truncated": 652, + "padded": 636, + "non-padded": 16, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-machine_learning|5": { + "hashes": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "249811a7d891a411", + "hash_cont_tokens": "aacac708cd4c5a61" + }, + "truncated": 0, + "non-truncated": 448, + "padded": 448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-management|5": { + "hashes": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + 
"hash_input_tokens": "e54df495ffeb4f92", + "hash_cont_tokens": "d37808f586a9e9b5" + }, + "truncated": 0, + "non-truncated": 412, + "padded": 412, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-marketing|5": { + "hashes": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "e9110fe64f420eb5", + "hash_cont_tokens": "95faf210efa02f90" + }, + "truncated": 0, + "non-truncated": 936, + "padded": 936, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-medical_genetics|5": { + "hashes": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "743df5701590c1c5", + "hash_cont_tokens": "844bd0bf669e8136" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-miscellaneous|5": { + "hashes": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "4a20a40ea36bad2d", + "hash_cont_tokens": "ef1ae838a09a7521" + }, + "truncated": 0, + "non-truncated": 3132, + "padded": 3132, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_disputes|5": { + "hashes": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "10886977e5516586", + "hash_cont_tokens": "16b6c6e390eb7cea" + }, + "truncated": 0, + "non-truncated": 1384, + "padded": 1372, + "non-padded": 12, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hashes": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "66f56ab7c3b9d662", + "hash_cont_tokens": "4130880a19c4edb0" + }, + "truncated": 0, + "non-truncated": 3580, + "padded": 3580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-nutrition|5": { + "hashes": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "c05c54560499ea35", + "hash_cont_tokens": "96b81f570a84328b" + }, + "truncated": 0, + "non-truncated": 1224, + "padded": 1224, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-philosophy|5": { + "hashes": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "9639c3d92ff98a28", + "hash_cont_tokens": "dddff9925c9b675a" + }, + "truncated": 0, + "non-truncated": 1244, + "padded": 1244, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-prehistory|5": { + "hashes": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "91e98834c3a8d8d9", + "hash_cont_tokens": "e3a7592f84b44888" + }, + "truncated": 0, + "non-truncated": 1296, + "padded": 1296, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_accounting|5": { + "hashes": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "569fa47691c73088", + "hash_cont_tokens": "f9edf462e8201551" + }, + "truncated": 0, + "non-truncated": 1128, + "padded": 1124, + "non-padded": 4, + 
"effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_law|5": { + "hashes": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "d93d397bd5db1db6", + "hash_cont_tokens": "a2de48df0afbaff7" + }, + "truncated": 6136, + "non-truncated": 0, + "padded": 0, + "non-padded": 6136, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_medicine|5": { + "hashes": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "7f8acbbde12cfb6b", + "hash_cont_tokens": "ecf7754754c2bb76" + }, + "truncated": 1032, + "non-truncated": 56, + "padded": 48, + "non-padded": 1040, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_psychology|5": { + "hashes": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "3aa766c029099569", + "hash_cont_tokens": "30b07e31cf9b5c6f" + }, + "truncated": 0, + "non-truncated": 2448, + "padded": 2448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-public_relations|5": { + "hashes": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "87b924f88832986f", + "hash_cont_tokens": "cf3600a50782c6c5" + }, + "truncated": 0, + "non-truncated": 440, + "padded": 440, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-security_studies|5": { + "hashes": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "1aaa84da588878a6", + "hash_cont_tokens": "4d1dc7c4ad251829" + }, + "truncated": 980, + "non-truncated": 0, + "padded": 0, + "non-padded": 980, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-sociology|5": { + "hashes": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "fb555df6139eb2c8", + "hash_cont_tokens": "d36b9d9f0f4424fe" + }, + "truncated": 0, + "non-truncated": 804, + "padded": 800, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hashes": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "56cf1eebb25eccb1", + "hash_cont_tokens": "844bd0bf669e8136" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-virology|5": { + "hashes": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "c6affac16ec860be", + "hash_cont_tokens": "30d4fa4828c5468f" + }, + "truncated": 0, + "non-truncated": 664, + "padded": 664, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-world_religions|5": { + "hashes": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "d2c5da5a69a6312e", + "hash_cont_tokens": "a0a7af55ac7ae037" + }, + "truncated": 0, + "non-truncated": 684, + "padded": 684, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|truthfulqa:mc|0": { + "hashes": { + "hash_examples": "23176c0531c7b867", + 
"hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "21ee2f46c9c3649e", + "hash_cont_tokens": "84fd36aa004c8578" + }, + "truncated": 0, + "non-truncated": 9996, + "padded": 9996, + "non-padded": 0, + "effective_few_shots": 0.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "d84d18e9a963753d", + "hash_full_prompts": "12b540783521a8e6", + "hash_input_tokens": "18a3fbefef0c4910", + "hash_cont_tokens": "24012b7d40528568" + }, + "total_evaluation_time_secondes": "1296.2403497695923", + "truncated": 14155, + "non-truncated": 96864, + "padded": 96540, + "non-padded": 14479, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/lgaalves/gpt2-dolly/results_2023-10-16T15-21-03.106621.json b/eval-results/lgaalves/gpt2-dolly/results_2023-10-16T15-21-03.106621.json new file mode 100644 index 0000000000000000000000000000000000000000..b260639c51f667eaaea7abd67a0ffd0b3aedc91f --- /dev/null +++ b/eval-results/lgaalves/gpt2-dolly/results_2023-10-16T15-21-03.106621.json @@ -0,0 +1,107 @@ +{ + "config_general": { + "model_name": "lgaalves/gpt2-dolly", + "model_sha": "cea99e267a084e71a5296bea24e00fecc2506cf8", + "model_size": "238.85 MB", + "model_dtype": "torch.float16", + "lighteval_sha": "0f318ecf002208468154899217b3ba7c6ae09374", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|drop|3": { + "em": 0.001572986577181208, + "em_stderr": 0.00040584511324177414, + "f1": 0.04078963926174505, + "f1_stderr": 0.0012036734759647571 + }, + "harness|gsm8k|5": { + "acc": 0.000758150113722517, + "acc_stderr": 0.0007581501137225257 + }, + "harness|winogrande|5": { + "acc": 0.4956590370955012, + "acc_stderr": 0.014051956064076911 + }, + "all": { + "em": 0.001572986577181208, + "em_stderr": 0.00040584511324177414, + "f1": 0.04078963926174505, + "f1_stderr": 0.0012036734759647571, + "acc": 0.24820859360461187, + "acc_stderr": 0.007405053088899718 + } + }, + "versions": { + "harness|drop|3": 1, + "harness|gsm8k|5": 0, + "harness|winogrande|5": 0, + "all": 0 + }, + "config_tasks": { + "harness|drop": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|drop|3": { + "hashes": { + "hash_examples": "1d27416e8324e9a3", + "hash_full_prompts": "a5513ff9a741b385", + "hash_input_tokens": "fa7a2e45b0104bc4", + "hash_cont_tokens": "b25384df728a1126" + }, + "truncated": 9290, + "non-truncated": 246, + "padded": 0, + "non-padded": 9536, + "effective_few_shots": 3.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "52733972d41ebb11", + "hash_cont_tokens": "3c7390841d777c12" + }, + "truncated": 917, + "non-truncated": 402, + "padded": 0, + "non-padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "84cacac1590bb0a5", + "hash_cont_tokens": "64ca3ed9b5dacc6e" + }, + "truncated": 0, + "non-truncated": 2534, + "padded": 2426, + "non-padded": 108, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "9b4d8993161e637d", + "hash_full_prompts": "08215e527b7e60a5", + "hash_input_tokens": 
"6b3de69e2f87348c", + "hash_cont_tokens": "0736a5de6484c2c3" + }, + "total_evaluation_time_secondes": "3876.526220560074", + "truncated": 10207, + "non-truncated": 3182, + "padded": 2426, + "non-padded": 10963, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/lgaalves/gpt2-dolly/results_2023-10-26T15-16-18.909977.json b/eval-results/lgaalves/gpt2-dolly/results_2023-10-26T15-16-18.909977.json new file mode 100644 index 0000000000000000000000000000000000000000..fc7cdac9a51bf92ffc5628a5e5c8dd9ae49d1f4b --- /dev/null +++ b/eval-results/lgaalves/gpt2-dolly/results_2023-10-26T15-16-18.909977.json @@ -0,0 +1,107 @@ +{ + "config_general": { + "model_name": "lgaalves/gpt2-dolly", + "model_sha": "cea99e267a084e71a5296bea24e00fecc2506cf8", + "model_size": "117.35 MB", + "model_dtype": "4bit", + "lighteval_sha": "0f318ecf002208468154899217b3ba7c6ae09374", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|drop|3": { + "em": 0.0008389261744966443, + "em_stderr": 0.00029649629898012396, + "f1": 0.034500838926174546, + "f1_stderr": 0.0010901499685640162 + }, + "harness|gsm8k|5": { + "acc": 0.001516300227445034, + "acc_stderr": 0.0010717793485492627 + }, + "harness|winogrande|5": { + "acc": 0.5146014206787688, + "acc_stderr": 0.01404649238327583 + }, + "all": { + "em": 0.0008389261744966443, + "em_stderr": 0.00029649629898012396, + "f1": 0.034500838926174546, + "f1_stderr": 0.0010901499685640162, + "acc": 0.25805886045310694, + "acc_stderr": 0.007559135865912546 + } + }, + "versions": { + "harness|drop|3": 1, + "harness|gsm8k|5": 0, + "harness|winogrande|5": 0, + "all": 0 + }, + "config_tasks": { + "harness|drop": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|drop|3": { + "hashes": { + "hash_examples": "1d27416e8324e9a3", + "hash_full_prompts": "a5513ff9a741b385", + "hash_input_tokens": "fa7a2e45b0104bc4", + "hash_cont_tokens": "aded1f7ee99e71f9" + }, + "truncated": 9290, + "non-truncated": 246, + "padded": 0, + "non-padded": 9536, + "effective_few_shots": 3.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "52733972d41ebb11", + "hash_cont_tokens": "c725091b6b9c332b" + }, + "truncated": 917, + "non-truncated": 402, + "padded": 0, + "non-padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "84cacac1590bb0a5", + "hash_cont_tokens": "64ca3ed9b5dacc6e" + }, + "truncated": 0, + "non-truncated": 2534, + "padded": 2426, + "non-padded": 108, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "9b4d8993161e637d", + "hash_full_prompts": "08215e527b7e60a5", + "hash_input_tokens": "6b3de69e2f87348c", + "hash_cont_tokens": "479b5d7ee7627fc9" + }, + "total_evaluation_time_secondes": "3895.11074757576", + "truncated": 10207, + "non-truncated": 3182, + "padded": 2426, + "non-padded": 10963, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/lgaalves/gpt2-xl_lima/results_2023-11-15T03-46-31.104311.json b/eval-results/lgaalves/gpt2-xl_lima/results_2023-11-15T03-46-31.104311.json 
new file mode 100644 index 0000000000000000000000000000000000000000..0fa8b919884335518a86ac70f679e22e201737dd --- /dev/null +++ b/eval-results/lgaalves/gpt2-xl_lima/results_2023-11-15T03-46-31.104311.json @@ -0,0 +1,1435 @@ +{ + "config_general": { + "lighteval_sha": "9ffc410f6c40b8cfefe7167cb47aefe69ced61e1", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "", + "start_time": 581.279875294, + "end_time": 23493.985214598, + "total_evaluation_time_secondes": "22912.705339304", + "model_name": "lgaalves/gpt2-xl_lima", + "model_sha": "f7db5b1db521abd7578b95138e737637e0037ca5", + "model_dtype": "torch.float16", + "model_size": "2.91 GB" + }, + "results": { + "harness|arc:challenge|25": { + "acc": 0.2645051194539249, + "acc_stderr": 0.012889272949313368, + "acc_norm": 0.31143344709897613, + "acc_norm_stderr": 0.013532472099850949 + }, + "harness|hellaswag|10": { + "acc": 0.39842660824536946, + "acc_stderr": 0.004885735963346903, + "acc_norm": 0.5128460466042621, + "acc_norm_stderr": 0.004988134303021793 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.26, + "acc_stderr": 0.04408440022768081, + "acc_norm": 0.26, + "acc_norm_stderr": 0.04408440022768081 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.24444444444444444, + "acc_stderr": 0.03712537833614865, + "acc_norm": 0.24444444444444444, + "acc_norm_stderr": 0.03712537833614865 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.2236842105263158, + "acc_stderr": 0.033911609343436025, + "acc_norm": 0.2236842105263158, + "acc_norm_stderr": 0.033911609343436025 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.22, + "acc_stderr": 0.04163331998932269, + "acc_norm": 0.22, + "acc_norm_stderr": 0.04163331998932269 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.30566037735849055, + "acc_stderr": 0.028353298073322666, + "acc_norm": 0.30566037735849055, + "acc_norm_stderr": 0.028353298073322666 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.2777777777777778, + "acc_stderr": 0.037455547914624576, + "acc_norm": 0.2777777777777778, + "acc_norm_stderr": 0.037455547914624576 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.22, + "acc_stderr": 0.041633319989322695, + "acc_norm": 0.22, + "acc_norm_stderr": 0.041633319989322695 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.3, + "acc_stderr": 0.046056618647183814, + "acc_norm": 0.3, + "acc_norm_stderr": 0.046056618647183814 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.32, + "acc_stderr": 0.04688261722621503, + "acc_norm": 0.32, + "acc_norm_stderr": 0.04688261722621503 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.3352601156069364, + "acc_stderr": 0.03599586301247078, + "acc_norm": 0.3352601156069364, + "acc_norm_stderr": 0.03599586301247078 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.13725490196078433, + "acc_stderr": 0.03424084669891522, + "acc_norm": 0.13725490196078433, + "acc_norm_stderr": 0.03424084669891522 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.26, + "acc_stderr": 0.0440844002276808, + "acc_norm": 0.26, + "acc_norm_stderr": 0.0440844002276808 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.251063829787234, + "acc_stderr": 0.028346963777162445, + "acc_norm": 0.251063829787234, + "acc_norm_stderr": 0.028346963777162445 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.23684210526315788, + "acc_stderr": 
0.039994238792813344, + "acc_norm": 0.23684210526315788, + "acc_norm_stderr": 0.039994238792813344 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.2827586206896552, + "acc_stderr": 0.03752833958003336, + "acc_norm": 0.2827586206896552, + "acc_norm_stderr": 0.03752833958003336 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.20105820105820105, + "acc_stderr": 0.020641810782370165, + "acc_norm": 0.20105820105820105, + "acc_norm_stderr": 0.020641810782370165 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.30158730158730157, + "acc_stderr": 0.041049472699033945, + "acc_norm": 0.30158730158730157, + "acc_norm_stderr": 0.041049472699033945 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.17, + "acc_stderr": 0.0377525168068637, + "acc_norm": 0.17, + "acc_norm_stderr": 0.0377525168068637 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.20967741935483872, + "acc_stderr": 0.02315787934908352, + "acc_norm": 0.20967741935483872, + "acc_norm_stderr": 0.02315787934908352 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.15763546798029557, + "acc_stderr": 0.025639014131172408, + "acc_norm": 0.15763546798029557, + "acc_norm_stderr": 0.025639014131172408 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.29, + "acc_stderr": 0.04560480215720684, + "acc_norm": 0.29, + "acc_norm_stderr": 0.04560480215720684 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.3151515151515151, + "acc_stderr": 0.0362773057502241, + "acc_norm": 0.3151515151515151, + "acc_norm_stderr": 0.0362773057502241 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.3383838383838384, + "acc_stderr": 0.03371124142626302, + "acc_norm": 0.3383838383838384, + "acc_norm_stderr": 0.03371124142626302 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.22797927461139897, + "acc_stderr": 0.03027690994517825, + "acc_norm": 0.22797927461139897, + "acc_norm_stderr": 0.03027690994517825 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.3564102564102564, + "acc_stderr": 0.024283140529467295, + "acc_norm": 0.3564102564102564, + "acc_norm_stderr": 0.024283140529467295 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.2111111111111111, + "acc_stderr": 0.024882116857655078, + "acc_norm": 0.2111111111111111, + "acc_norm_stderr": 0.024882116857655078 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.226890756302521, + "acc_stderr": 0.02720537153827948, + "acc_norm": 0.226890756302521, + "acc_norm_stderr": 0.02720537153827948 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.2980132450331126, + "acc_stderr": 0.03734535676787198, + "acc_norm": 0.2980132450331126, + "acc_norm_stderr": 0.03734535676787198 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.3522935779816514, + "acc_stderr": 0.020480568843999, + "acc_norm": 0.3522935779816514, + "acc_norm_stderr": 0.020480568843999 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.4537037037037037, + "acc_stderr": 0.033953227263757976, + "acc_norm": 0.4537037037037037, + "acc_norm_stderr": 0.033953227263757976 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.20098039215686275, + "acc_stderr": 0.028125972265654373, + "acc_norm": 0.20098039215686275, + "acc_norm_stderr": 0.028125972265654373 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 
0.20675105485232068, + "acc_stderr": 0.026361651668389094, + "acc_norm": 0.20675105485232068, + "acc_norm_stderr": 0.026361651668389094 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.14349775784753363, + "acc_stderr": 0.0235293712696182, + "acc_norm": 0.14349775784753363, + "acc_norm_stderr": 0.0235293712696182 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.24427480916030533, + "acc_stderr": 0.037683359597287434, + "acc_norm": 0.24427480916030533, + "acc_norm_stderr": 0.037683359597287434 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.15702479338842976, + "acc_stderr": 0.0332124484254713, + "acc_norm": 0.15702479338842976, + "acc_norm_stderr": 0.0332124484254713 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.25925925925925924, + "acc_stderr": 0.042365112580946336, + "acc_norm": 0.25925925925925924, + "acc_norm_stderr": 0.042365112580946336 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.27607361963190186, + "acc_stderr": 0.0351238528370505, + "acc_norm": 0.27607361963190186, + "acc_norm_stderr": 0.0351238528370505 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.2857142857142857, + "acc_stderr": 0.042878587513404565, + "acc_norm": 0.2857142857142857, + "acc_norm_stderr": 0.042878587513404565 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.27184466019417475, + "acc_stderr": 0.044052680241409216, + "acc_norm": 0.27184466019417475, + "acc_norm_stderr": 0.044052680241409216 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.23931623931623933, + "acc_stderr": 0.02795182680892433, + "acc_norm": 0.23931623931623933, + "acc_norm_stderr": 0.02795182680892433 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.3, + "acc_stderr": 0.046056618647183814, + "acc_norm": 0.3, + "acc_norm_stderr": 0.046056618647183814 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.26181353767560667, + "acc_stderr": 0.015720838678445256, + "acc_norm": 0.26181353767560667, + "acc_norm_stderr": 0.015720838678445256 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.24855491329479767, + "acc_stderr": 0.023267528432100174, + "acc_norm": 0.24855491329479767, + "acc_norm_stderr": 0.023267528432100174 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.2424581005586592, + "acc_stderr": 0.014333522059217889, + "acc_norm": 0.2424581005586592, + "acc_norm_stderr": 0.014333522059217889 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.22549019607843138, + "acc_stderr": 0.023929155517351294, + "acc_norm": 0.22549019607843138, + "acc_norm_stderr": 0.023929155517351294 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.2057877813504823, + "acc_stderr": 0.022961339906764244, + "acc_norm": 0.2057877813504823, + "acc_norm_stderr": 0.022961339906764244 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.25617283950617287, + "acc_stderr": 0.0242885336377261, + "acc_norm": 0.25617283950617287, + "acc_norm_stderr": 0.0242885336377261 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.23049645390070922, + "acc_stderr": 0.02512373922687241, + "acc_norm": 0.23049645390070922, + "acc_norm_stderr": 0.02512373922687241 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.24119947848761408, + "acc_stderr": 0.010926496102034956, + "acc_norm": 0.24119947848761408, + "acc_norm_stderr": 0.010926496102034956 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.19852941176470587, + "acc_stderr": 0.024231013370541107, + "acc_norm": 
0.19852941176470587, + "acc_norm_stderr": 0.024231013370541107 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.2434640522875817, + "acc_stderr": 0.017362473762146634, + "acc_norm": 0.2434640522875817, + "acc_norm_stderr": 0.017362473762146634 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.2727272727272727, + "acc_stderr": 0.04265792110940588, + "acc_norm": 0.2727272727272727, + "acc_norm_stderr": 0.04265792110940588 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.2163265306122449, + "acc_stderr": 0.02635891633490403, + "acc_norm": 0.2163265306122449, + "acc_norm_stderr": 0.02635891633490403 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.2537313432835821, + "acc_stderr": 0.03076944496729602, + "acc_norm": 0.2537313432835821, + "acc_norm_stderr": 0.03076944496729602 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.24, + "acc_stderr": 0.04292346959909282, + "acc_norm": 0.24, + "acc_norm_stderr": 0.04292346959909282 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.26506024096385544, + "acc_stderr": 0.03436024037944967, + "acc_norm": 0.26506024096385544, + "acc_norm_stderr": 0.03436024037944967 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.3216374269005848, + "acc_stderr": 0.03582529442573122, + "acc_norm": 0.3216374269005848, + "acc_norm_stderr": 0.03582529442573122 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.2252141982864137, + "mc1_stderr": 0.014623240768023507, + "mc2": 0.3874325444900457, + "mc2_stderr": 0.014089660369122726 + }, + "harness|winogrande|5": { + "acc": 0.5722178374112076, + "acc_stderr": 0.013905134013839943 + }, + "harness|drop|3": { + "em": 0.002726510067114094, + "em_stderr": 0.0005340111700415908, + "f1": 0.04890100671140956, + "f1_stderr": 0.0013085576550093093 + }, + "harness|gsm8k|5": { + "acc": 0.009097801364670205, + "acc_stderr": 0.002615326510775673 + }, + "all": { + "acc": 0.2579848503192349, + "acc_stderr": 0.030758432385023834, + "acc_norm": 0.25961199994409145, + "acc_norm_stderr": 0.03153372055003476, + "mc1": 0.2252141982864137, + "mc1_stderr": 0.014623240768023507, + "mc2": 0.3874325444900457, + "mc2_stderr": 0.014089660369122726, + "em": 0.002726510067114094, + "em_stderr": 0.0005340111700415908, + "f1": 0.04890100671140956, + "f1_stderr": 0.0013085576550093093 + } + }, + "versions": { + "all": 0, + "harness|arc:challenge|25": 0, + "harness|drop|3": 1, + "harness|gsm8k|5": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + 
"harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "harness|winogrande|5": 0 + }, + "config_tasks": { + "harness|arc:challenge": "LM Harness task", + "harness|drop": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + 
"harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|arc:challenge|25": { + "hashes": { + "hash_examples": "17b0cae357c0259e", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "7cfba60314e9471b", + "hash_cont_tokens": "ed17e576dbafa5da" + }, + "truncated": 1568, + "non_truncated": -396, + "padded": 3089, + "non_padded": 1598, + "effective_few_shots": 25.0, + "num_truncated_few_shots": 0 + }, + "harness|hellaswag|10": { + "hashes": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "35470ee46d7b21a4", + "hash_cont_tokens": "0875c25c8fc0a94d" + }, + "truncated": 1975, + "non_truncated": 8067, + "padded": 38120, + "non_padded": 2048, + "effective_few_shots": 10.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hashes": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "02b38e65730b4712", + "hash_cont_tokens": "844bd0bf669e8136" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-anatomy|5": { + "hashes": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + 
"hash_input_tokens": "1fbed4b4bb27d865", + "hash_cont_tokens": "aa3ffb1a6e4356f5" + }, + "truncated": 0, + "non_truncated": 135, + "padded": 540, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-astronomy|5": { + "hashes": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "98497e888319b56e", + "hash_cont_tokens": "18cfffb76bc8f0d1" + }, + "truncated": 0, + "non_truncated": 152, + "padded": 608, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-business_ethics|5": { + "hashes": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "034541338d86a1f8", + "hash_cont_tokens": "844bd0bf669e8136" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hashes": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "e1e150bdc850c136", + "hash_cont_tokens": "cd61f7de0830a75a" + }, + "truncated": 0, + "non_truncated": 265, + "padded": 1052, + "non_padded": 8, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_biology|5": { + "hashes": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "20a799d5f9c9a1a9", + "hash_cont_tokens": "16b3626c8a5e3797" + }, + "truncated": 0, + "non_truncated": 144, + "padded": 576, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_chemistry|5": { + "hashes": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "41a4597e36c19ef0", + "hash_cont_tokens": "844bd0bf669e8136" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_computer_science|5": { + "hashes": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "1c1131a4944856d5", + "hash_cont_tokens": "844bd0bf669e8136" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_mathematics|5": { + "hashes": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "d41bd2267dc69a8e", + "hash_cont_tokens": "844bd0bf669e8136" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_medicine|5": { + "hashes": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "649b58bd4b394223", + "hash_cont_tokens": "62bb469d2a319d91" + }, + "truncated": 20, + "non_truncated": 153, + "padded": 664, + "non_padded": 28, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_physics|5": { + "hashes": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "407265e46dfeaf24", + "hash_cont_tokens": "bf103c9a1f61ec12" + }, + "truncated": 0, + "non_truncated": 102, + "padded": 400, + 
"non_padded": 8, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-computer_security|5": { + "hashes": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "189af9a9e0c85513", + "hash_cont_tokens": "844bd0bf669e8136" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hashes": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "e23e27a5cb5fade6", + "hash_cont_tokens": "ff5ca3d84bb47a0b" + }, + "truncated": 0, + "non_truncated": 235, + "padded": 940, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-econometrics|5": { + "hashes": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "8b2cbba16cd354a4", + "hash_cont_tokens": "21f0989f5760198a" + }, + "truncated": 0, + "non_truncated": 114, + "padded": 456, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hashes": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "8007052787e63032", + "hash_cont_tokens": "35bf6c0c1a7ee403" + }, + "truncated": 0, + "non_truncated": 145, + "padded": 580, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hashes": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "2e17edbbe8c5aa19", + "hash_cont_tokens": "f7d801bfd913884d" + }, + "truncated": 0, + "non_truncated": 378, + "padded": 1512, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-formal_logic|5": { + "hashes": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "956704efed2d3de9", + "hash_cont_tokens": "23f9089575432d5a" + }, + "truncated": 0, + "non_truncated": 126, + "padded": 504, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-global_facts|5": { + "hashes": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "56e24a6936981317", + "hash_cont_tokens": "844bd0bf669e8136" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_biology|5": { + "hashes": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "9280d83ca94167a7", + "hash_cont_tokens": "04b8293f2ab7fbbf" + }, + "truncated": 0, + "non_truncated": 310, + "padded": 1240, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hashes": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "a5c6dfe388cd8931", + "hash_cont_tokens": "c3deabee1deab3a3" + }, + "truncated": 0, + "non_truncated": 203, + "padded": 800, + "non_padded": 12, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + 
"hashes": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "a939a884c6c8d887", + "hash_cont_tokens": "844bd0bf669e8136" + }, + "truncated": 16, + "non_truncated": 84, + "padded": 384, + "non_padded": 16, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hashes": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "502376958174bf81", + "hash_cont_tokens": "c4f2565ca36881d5" + }, + "truncated": 660, + "non_truncated": -495, + "padded": 0, + "non_padded": 660, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_geography|5": { + "hashes": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "5d284ce4c7b0ca9a", + "hash_cont_tokens": "780e569058de22be" + }, + "truncated": 0, + "non_truncated": 198, + "padded": 788, + "non_padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hashes": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "2dd840e14eacd6bd", + "hash_cont_tokens": "7994d94bfa36d003" + }, + "truncated": 0, + "non_truncated": 193, + "padded": 772, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hashes": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "562915cf47265af9", + "hash_cont_tokens": "8f5c8baf02161f10" + }, + "truncated": 0, + "non_truncated": 390, + "padded": 1560, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hashes": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "563fd8cde62df13f", + "hash_cont_tokens": "a2c91752be5b1798" + }, + "truncated": 0, + "non_truncated": 270, + "padded": 1080, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hashes": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "0310fb471b15978e", + "hash_cont_tokens": "985403b262df21a4" + }, + "truncated": 0, + "non_truncated": 238, + "padded": 952, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_physics|5": { + "hashes": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "ccf86436451daecc", + "hash_cont_tokens": "db71da66ed82b921" + }, + "truncated": 0, + "non_truncated": 151, + "padded": 604, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hashes": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "ec2f001bd307f9a5", + "hash_cont_tokens": "e81cf9738ad7e157" + }, + "truncated": 0, + "non_truncated": 545, + "padded": 2180, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hashes": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": 
"4c5c8be5aafac432", + "hash_input_tokens": "ea68c7722d8f3a52", + "hash_cont_tokens": "4a2d5f00cb00d9b7" + }, + "truncated": 4, + "non_truncated": 212, + "padded": 860, + "non_padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hashes": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "4885a382517deebf", + "hash_cont_tokens": "eab825cf8fbdd085" + }, + "truncated": 816, + "non_truncated": -612, + "padded": 0, + "non_padded": 816, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hashes": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "c1d80e899c4c8872", + "hash_cont_tokens": "e9bcfaa6beefb456" + }, + "truncated": 948, + "non_truncated": -711, + "padded": 0, + "non_padded": 948, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_aging|5": { + "hashes": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "e1878600f1df37c7", + "hash_cont_tokens": "38eafdb22e9fca11" + }, + "truncated": 0, + "non_truncated": 223, + "padded": 892, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_sexuality|5": { + "hashes": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "0fdde6eb0830bf5f", + "hash_cont_tokens": "11de075f88fc7cd2" + }, + "truncated": 0, + "non_truncated": 131, + "padded": 524, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-international_law|5": { + "hashes": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "6dc5ed9fa471d27d", + "hash_cont_tokens": "6f8215a3de7eebd1" + }, + "truncated": 0, + "non_truncated": 121, + "padded": 484, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-jurisprudence|5": { + "hashes": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "8a0d33cb57eadb93", + "hash_cont_tokens": "5c77c6f472688075" + }, + "truncated": 0, + "non_truncated": 108, + "padded": 428, + "non_padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hashes": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "33bc8cbaf4b148b6", + "hash_cont_tokens": "25a46284b3589e0d" + }, + "truncated": 0, + "non_truncated": 163, + "padded": 644, + "non_padded": 8, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-machine_learning|5": { + "hashes": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "a0e12130e19d9a02", + "hash_cont_tokens": "aacac708cd4c5a61" + }, + "truncated": 0, + "non_truncated": 112, + "padded": 448, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-management|5": { + "hashes": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "e6b0b33a41fda02f", + "hash_cont_tokens": "d37808f586a9e9b5" + }, + "truncated": 0, + "non_truncated": 103, + 
"padded": 412, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-marketing|5": { + "hashes": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "c1d59b968d6d5787", + "hash_cont_tokens": "95faf210efa02f90" + }, + "truncated": 0, + "non_truncated": 234, + "padded": 932, + "non_padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-medical_genetics|5": { + "hashes": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "95a56c538b0a74ae", + "hash_cont_tokens": "844bd0bf669e8136" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-miscellaneous|5": { + "hashes": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "0734c11b6c0450c2", + "hash_cont_tokens": "ef1ae838a09a7521" + }, + "truncated": 0, + "non_truncated": 783, + "padded": 3132, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_disputes|5": { + "hashes": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "12b681baaab8e9c9", + "hash_cont_tokens": "16b6c6e390eb7cea" + }, + "truncated": 0, + "non_truncated": 346, + "padded": 1384, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hashes": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "d4f3662defa0365d", + "hash_cont_tokens": "4130880a19c4edb0" + }, + "truncated": 0, + "non_truncated": 895, + "padded": 3580, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-nutrition|5": { + "hashes": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "224661463bd8aae6", + "hash_cont_tokens": "96b81f570a84328b" + }, + "truncated": 0, + "non_truncated": 306, + "padded": 1224, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-philosophy|5": { + "hashes": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "ca40d870dd2c13f9", + "hash_cont_tokens": "dddff9925c9b675a" + }, + "truncated": 0, + "non_truncated": 311, + "padded": 1240, + "non_padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-prehistory|5": { + "hashes": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "06681ff31df5feac", + "hash_cont_tokens": "e3a7592f84b44888" + }, + "truncated": 0, + "non_truncated": 324, + "padded": 1296, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_accounting|5": { + "hashes": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "b2c1589afc80dbdd", + "hash_cont_tokens": "f9edf462e8201551" + }, + "truncated": 0, + "non_truncated": 282, + "padded": 1128, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_law|5": { + "hashes": { + 
"hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "d93d397bd5db1db6", + "hash_cont_tokens": "a2de48df0afbaff7" + }, + "truncated": 6136, + "non_truncated": -4602, + "padded": 0, + "non_padded": 6136, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_medicine|5": { + "hashes": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "7f8acbbde12cfb6b", + "hash_cont_tokens": "ecf7754754c2bb76" + }, + "truncated": 1032, + "non_truncated": -760, + "padded": 48, + "non_padded": 1040, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_psychology|5": { + "hashes": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "a428fe3d64b0ef43", + "hash_cont_tokens": "30b07e31cf9b5c6f" + }, + "truncated": 0, + "non_truncated": 612, + "padded": 2448, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-public_relations|5": { + "hashes": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "2c0e453c0a702736", + "hash_cont_tokens": "cf3600a50782c6c5" + }, + "truncated": 0, + "non_truncated": 110, + "padded": 436, + "non_padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-security_studies|5": { + "hashes": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "1aaa84da588878a6", + "hash_cont_tokens": "4d1dc7c4ad251829" + }, + "truncated": 980, + "non_truncated": -735, + "padded": 0, + "non_padded": 980, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-sociology|5": { + "hashes": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "52d02a4f41926abc", + "hash_cont_tokens": "d36b9d9f0f4424fe" + }, + "truncated": 0, + "non_truncated": 201, + "padded": 792, + "non_padded": 12, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hashes": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "00c4ee3a60217a8b", + "hash_cont_tokens": "844bd0bf669e8136" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-virology|5": { + "hashes": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "728002327bd9798a", + "hash_cont_tokens": "30d4fa4828c5468f" + }, + "truncated": 0, + "non_truncated": 166, + "padded": 664, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-world_religions|5": { + "hashes": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "3b8028edcd45c58b", + "hash_cont_tokens": "a0a7af55ac7ae037" + }, + "truncated": 0, + "non_truncated": 171, + "padded": 684, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|truthfulqa:mc|0": { + "hashes": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "70a938aa2b5afaa9", + "hash_cont_tokens": "84fd36aa004c8578" 
+ }, + "truncated": 0, + "non_truncated": 817, + "padded": 9996, + "non_padded": 0, + "effective_few_shots": 0.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "0c6a4d96ca45d712", + "hash_cont_tokens": "64ca3ed9b5dacc6e" + }, + "truncated": 0, + "non_truncated": 1267, + "padded": 2534, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|drop|3": { + "hashes": { + "hash_examples": "1d27416e8324e9a3", + "hash_full_prompts": "a5513ff9a741b385", + "hash_input_tokens": "fa7a2e45b0104bc4", + "hash_cont_tokens": "d7ceb336f05a010b" + }, + "truncated": 9290, + "non_truncated": 246, + "padded": 0, + "non_padded": 9536, + "effective_few_shots": 3.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "52733972d41ebb11", + "hash_cont_tokens": "e7ccac1aef09bef6" + }, + "truncated": 917, + "non_truncated": 402, + "padded": 0, + "non_padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "4eb459f19fc0f29d", + "hash_full_prompts": "21653ed56f202b4e", + "hash_input_tokens": "92a502fb67af5e45", + "hash_cont_tokens": "46455de97855fd58" + }, + "truncated": 24362, + "non_truncated": 13833, + "padded": 99211, + "non_padded": 25197, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/lgaalves/gpt2_camel_physics-platypus/results_2023-09-11T15-53-04.413591.json b/eval-results/lgaalves/gpt2_camel_physics-platypus/results_2023-09-11T15-53-04.413591.json new file mode 100644 index 0000000000000000000000000000000000000000..fc3de9aff731bc4c5b255ad1ca8a32492be949ca --- /dev/null +++ b/eval-results/lgaalves/gpt2_camel_physics-platypus/results_2023-09-11T15-53-04.413591.json @@ -0,0 +1,1367 @@ +{ + "config_general": { + "model_name": "lgaalves/gpt2_camel_physics-platypus", + "model_sha": "66165ff32ed8de6c39f3524a810f5e97ba6d3347", + "model_size": "238.85 MB", + "model_dtype": "torch.float16", + "lighteval_sha": "ff467795ccc45b291b69333c263d5f16abd1fcd9", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|arc:challenge|25": { + "acc": 0.19795221843003413, + "acc_stderr": 0.011643990971573405, + "acc_norm": 0.23037542662116042, + "acc_norm_stderr": 0.01230492841874761 + }, + "harness|hellaswag|10": { + "acc": 0.29187412865962953, + "acc_stderr": 0.004536955796510544, + "acc_norm": 0.31318462457677754, + "acc_norm_stderr": 0.0046284090842187535 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.22, + "acc_stderr": 0.0416333199893227, + "acc_norm": 0.22, + "acc_norm_stderr": 0.0416333199893227 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.28888888888888886, + "acc_stderr": 0.0391545063041425, + "acc_norm": 0.28888888888888886, + "acc_norm_stderr": 0.0391545063041425 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.23026315789473684, + "acc_stderr": 0.03426059424403165, + "acc_norm": 0.23026315789473684, + "acc_norm_stderr": 0.03426059424403165 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.24, + "acc_stderr": 0.042923469599092816, + "acc_norm": 0.24, + "acc_norm_stderr": 0.042923469599092816 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 
0.2792452830188679, + "acc_stderr": 0.02761116340239972, + "acc_norm": 0.2792452830188679, + "acc_norm_stderr": 0.02761116340239972 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.2708333333333333, + "acc_stderr": 0.03716177437566015, + "acc_norm": 0.2708333333333333, + "acc_norm_stderr": 0.03716177437566015 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.25, + "acc_stderr": 0.04351941398892446, + "acc_norm": 0.25, + "acc_norm_stderr": 0.04351941398892446 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.36, + "acc_stderr": 0.04824181513244218, + "acc_norm": 0.36, + "acc_norm_stderr": 0.04824181513244218 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.27, + "acc_stderr": 0.04461960433384739, + "acc_norm": 0.27, + "acc_norm_stderr": 0.04461960433384739 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.2543352601156069, + "acc_stderr": 0.0332055644308557, + "acc_norm": 0.2543352601156069, + "acc_norm_stderr": 0.0332055644308557 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.24509803921568626, + "acc_stderr": 0.04280105837364396, + "acc_norm": 0.24509803921568626, + "acc_norm_stderr": 0.04280105837364396 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.23, + "acc_stderr": 0.042295258468165044, + "acc_norm": 0.23, + "acc_norm_stderr": 0.042295258468165044 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.25957446808510637, + "acc_stderr": 0.02865917937429232, + "acc_norm": 0.25957446808510637, + "acc_norm_stderr": 0.02865917937429232 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.24561403508771928, + "acc_stderr": 0.04049339297748142, + "acc_norm": 0.24561403508771928, + "acc_norm_stderr": 0.04049339297748142 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.3103448275862069, + "acc_stderr": 0.03855289616378948, + "acc_norm": 0.3103448275862069, + "acc_norm_stderr": 0.03855289616378948 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.24867724867724866, + "acc_stderr": 0.022261817692400175, + "acc_norm": 0.24867724867724866, + "acc_norm_stderr": 0.022261817692400175 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.23015873015873015, + "acc_stderr": 0.03764950879790605, + "acc_norm": 0.23015873015873015, + "acc_norm_stderr": 0.03764950879790605 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.16, + "acc_stderr": 0.03684529491774709, + "acc_norm": 0.16, + "acc_norm_stderr": 0.03684529491774709 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.2870967741935484, + "acc_stderr": 0.025736542745594528, + "acc_norm": 0.2870967741935484, + "acc_norm_stderr": 0.025736542745594528 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.28078817733990147, + "acc_stderr": 0.03161856335358611, + "acc_norm": 0.28078817733990147, + "acc_norm_stderr": 0.03161856335358611 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.3, + "acc_stderr": 0.046056618647183814, + "acc_norm": 0.3, + "acc_norm_stderr": 0.046056618647183814 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.26666666666666666, + "acc_stderr": 0.03453131801885415, + "acc_norm": 0.26666666666666666, + "acc_norm_stderr": 0.03453131801885415 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.35353535353535354, + "acc_stderr": 0.03406086723547153, + "acc_norm": 0.35353535353535354, + "acc_norm_stderr": 0.03406086723547153 + }, + 
"harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.37305699481865284, + "acc_stderr": 0.03490205592048573, + "acc_norm": 0.37305699481865284, + "acc_norm_stderr": 0.03490205592048573 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.34102564102564104, + "acc_stderr": 0.02403548967633507, + "acc_norm": 0.34102564102564104, + "acc_norm_stderr": 0.02403548967633507 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.25925925925925924, + "acc_stderr": 0.026719240783712163, + "acc_norm": 0.25925925925925924, + "acc_norm_stderr": 0.026719240783712163 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.21428571428571427, + "acc_stderr": 0.026653531596715473, + "acc_norm": 0.21428571428571427, + "acc_norm_stderr": 0.026653531596715473 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.25165562913907286, + "acc_stderr": 0.03543304234389985, + "acc_norm": 0.25165562913907286, + "acc_norm_stderr": 0.03543304234389985 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.3486238532110092, + "acc_stderr": 0.020431254090714328, + "acc_norm": 0.3486238532110092, + "acc_norm_stderr": 0.020431254090714328 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.4722222222222222, + "acc_stderr": 0.0340470532865388, + "acc_norm": 0.4722222222222222, + "acc_norm_stderr": 0.0340470532865388 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.25, + "acc_stderr": 0.03039153369274154, + "acc_norm": 0.25, + "acc_norm_stderr": 0.03039153369274154 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.26582278481012656, + "acc_stderr": 0.028756799629658335, + "acc_norm": 0.26582278481012656, + "acc_norm_stderr": 0.028756799629658335 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.10762331838565023, + "acc_stderr": 0.020799400082879997, + "acc_norm": 0.10762331838565023, + "acc_norm_stderr": 0.020799400082879997 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.2595419847328244, + "acc_stderr": 0.03844876139785271, + "acc_norm": 0.2595419847328244, + "acc_norm_stderr": 0.03844876139785271 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.371900826446281, + "acc_stderr": 0.044120158066245044, + "acc_norm": 0.371900826446281, + "acc_norm_stderr": 0.044120158066245044 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.2037037037037037, + "acc_stderr": 0.03893542518824847, + "acc_norm": 0.2037037037037037, + "acc_norm_stderr": 0.03893542518824847 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.3006134969325153, + "acc_stderr": 0.03602511318806771, + "acc_norm": 0.3006134969325153, + "acc_norm_stderr": 0.03602511318806771 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.17857142857142858, + "acc_stderr": 0.036352091215778065, + "acc_norm": 0.17857142857142858, + "acc_norm_stderr": 0.036352091215778065 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.3786407766990291, + "acc_stderr": 0.04802694698258972, + "acc_norm": 0.3786407766990291, + "acc_norm_stderr": 0.04802694698258972 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.19658119658119658, + "acc_stderr": 0.02603538609895129, + "acc_norm": 0.19658119658119658, + "acc_norm_stderr": 0.02603538609895129 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.31, + "acc_stderr": 0.04648231987117316, + "acc_norm": 0.31, + "acc_norm_stderr": 0.04648231987117316 + }, + 
"harness|hendrycksTest-miscellaneous|5": { + "acc": 0.20306513409961685, + "acc_stderr": 0.014385525076611578, + "acc_norm": 0.20306513409961685, + "acc_norm_stderr": 0.014385525076611578 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.21098265895953758, + "acc_stderr": 0.021966309947043124, + "acc_norm": 0.21098265895953758, + "acc_norm_stderr": 0.021966309947043124 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.2435754189944134, + "acc_stderr": 0.01435591196476786, + "acc_norm": 0.2435754189944134, + "acc_norm_stderr": 0.01435591196476786 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.25163398692810457, + "acc_stderr": 0.024848018263875195, + "acc_norm": 0.25163398692810457, + "acc_norm_stderr": 0.024848018263875195 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.28938906752411575, + "acc_stderr": 0.025755865922632924, + "acc_norm": 0.28938906752411575, + "acc_norm_stderr": 0.025755865922632924 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.22530864197530864, + "acc_stderr": 0.023246202647819746, + "acc_norm": 0.22530864197530864, + "acc_norm_stderr": 0.023246202647819746 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.2695035460992908, + "acc_stderr": 0.026469036818590638, + "acc_norm": 0.2695035460992908, + "acc_norm_stderr": 0.026469036818590638 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.24511082138200782, + "acc_stderr": 0.010986307870045514, + "acc_norm": 0.24511082138200782, + "acc_norm_stderr": 0.010986307870045514 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.44485294117647056, + "acc_stderr": 0.030187532060329376, + "acc_norm": 0.44485294117647056, + "acc_norm_stderr": 0.030187532060329376 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.26143790849673204, + "acc_stderr": 0.017776947157528023, + "acc_norm": 0.26143790849673204, + "acc_norm_stderr": 0.017776947157528023 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.20909090909090908, + "acc_stderr": 0.03895091015724137, + "acc_norm": 0.20909090909090908, + "acc_norm_stderr": 0.03895091015724137 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.4, + "acc_stderr": 0.031362502409358936, + "acc_norm": 0.4, + "acc_norm_stderr": 0.031362502409358936 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.21890547263681592, + "acc_stderr": 0.029239174636647, + "acc_norm": 0.21890547263681592, + "acc_norm_stderr": 0.029239174636647 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.26, + "acc_stderr": 0.04408440022768078, + "acc_norm": 0.26, + "acc_norm_stderr": 0.04408440022768078 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.1927710843373494, + "acc_stderr": 0.030709824050565274, + "acc_norm": 0.1927710843373494, + "acc_norm_stderr": 0.030709824050565274 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.25146198830409355, + "acc_stderr": 0.033275044238468436, + "acc_norm": 0.25146198830409355, + "acc_norm_stderr": 0.033275044238468436 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.22766217870257038, + "mc1_stderr": 0.01467925503211107, + "mc2": 0.3955559845281961, + "mc2_stderr": 0.014839540193741688 + }, + "all": { + "acc": 0.2683248305375654, + "acc_stderr": 0.03193677298130021, + "acc_norm": 0.2692355712851633, + "acc_norm_stderr": 0.0319495253666372, + "mc1": 0.22766217870257038, + "mc1_stderr": 0.01467925503211107, + "mc2": 0.3955559845281961, + "mc2_stderr": 0.014839540193741688 + } + }, + "versions": { + 
"harness|arc:challenge|25": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "all": 0 + }, + "config_tasks": { + "harness|arc:challenge": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + 
"harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task" + }, + "summary_tasks": { + "harness|arc:challenge|25": { + "hashes": { + "hash_examples": "17b0cae357c0259e", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "e641be907f06d33d", + "hash_cont_tokens": "ed17e576dbafa5da" + }, + "truncated": 1568, + 
"non-truncated": 3119, + "padded": 3087, + "non-padded": 1600, + "effective_few_shots": 25.0, + "num_truncated_few_shots": 0 + }, + "harness|hellaswag|10": { + "hashes": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "faab28c8a52792fc", + "hash_cont_tokens": "0875c25c8fc0a94d" + }, + "truncated": 1975, + "non-truncated": 38193, + "padded": 38021, + "non-padded": 2147, + "effective_few_shots": 10.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hashes": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "38f6980885e34dfd", + "hash_cont_tokens": "844bd0bf669e8136" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-anatomy|5": { + "hashes": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "3ed9431cd09b2a53", + "hash_cont_tokens": "aa3ffb1a6e4356f5" + }, + "truncated": 0, + "non-truncated": 540, + "padded": 540, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-astronomy|5": { + "hashes": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "a79fd75ecff4dacc", + "hash_cont_tokens": "18cfffb76bc8f0d1" + }, + "truncated": 0, + "non-truncated": 608, + "padded": 608, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-business_ethics|5": { + "hashes": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "178d5666661bf5e1", + "hash_cont_tokens": "844bd0bf669e8136" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hashes": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "c926698f7ff06973", + "hash_cont_tokens": "cd61f7de0830a75a" + }, + "truncated": 0, + "non-truncated": 1060, + "padded": 1060, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_biology|5": { + "hashes": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "242f772c5e78312a", + "hash_cont_tokens": "16b3626c8a5e3797" + }, + "truncated": 0, + "non-truncated": 576, + "padded": 568, + "non-padded": 8, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_chemistry|5": { + "hashes": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "8502d8627d2d7aad", + "hash_cont_tokens": "844bd0bf669e8136" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_computer_science|5": { + "hashes": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "a0d705ea2c235707", + "hash_cont_tokens": "844bd0bf669e8136" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + 
"harness|hendrycksTest-college_mathematics|5": { + "hashes": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "ff09ef7f164943cd", + "hash_cont_tokens": "844bd0bf669e8136" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_medicine|5": { + "hashes": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "aca3949388066394", + "hash_cont_tokens": "62bb469d2a319d91" + }, + "truncated": 20, + "non-truncated": 672, + "padded": 660, + "non-padded": 32, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_physics|5": { + "hashes": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "c4240f372187f487", + "hash_cont_tokens": "bf103c9a1f61ec12" + }, + "truncated": 0, + "non-truncated": 408, + "padded": 404, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-computer_security|5": { + "hashes": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "70a866a1c6ae11ae", + "hash_cont_tokens": "844bd0bf669e8136" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hashes": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "29b68a5b3f3afa5f", + "hash_cont_tokens": "ff5ca3d84bb47a0b" + }, + "truncated": 0, + "non-truncated": 940, + "padded": 940, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-econometrics|5": { + "hashes": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "a4a0fc579875cdf9", + "hash_cont_tokens": "21f0989f5760198a" + }, + "truncated": 0, + "non-truncated": 456, + "padded": 456, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hashes": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "e1c0ec634eb17ebd", + "hash_cont_tokens": "35bf6c0c1a7ee403" + }, + "truncated": 0, + "non-truncated": 580, + "padded": 580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hashes": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "542453ad0f99dacf", + "hash_cont_tokens": "f7d801bfd913884d" + }, + "truncated": 0, + "non-truncated": 1512, + "padded": 1488, + "non-padded": 24, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-formal_logic|5": { + "hashes": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "dacff0458f665ef2", + "hash_cont_tokens": "23f9089575432d5a" + }, + "truncated": 0, + "non-truncated": 504, + "padded": 504, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-global_facts|5": { + "hashes": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + 
"hash_input_tokens": "61dec75d557c2e93", + "hash_cont_tokens": "844bd0bf669e8136" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_biology|5": { + "hashes": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "d0afdf91820cacc8", + "hash_cont_tokens": "04b8293f2ab7fbbf" + }, + "truncated": 0, + "non-truncated": 1240, + "padded": 1240, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hashes": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "75cd47b5490da17b", + "hash_cont_tokens": "c3deabee1deab3a3" + }, + "truncated": 0, + "non-truncated": 812, + "padded": 796, + "non-padded": 16, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hashes": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "e369e98a1d0a7424", + "hash_cont_tokens": "844bd0bf669e8136" + }, + "truncated": 16, + "non-truncated": 384, + "padded": 384, + "non-padded": 16, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hashes": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "502376958174bf81", + "hash_cont_tokens": "c4f2565ca36881d5" + }, + "truncated": 660, + "non-truncated": 0, + "padded": 0, + "non-padded": 660, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_geography|5": { + "hashes": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "a4866b51f8a7a60e", + "hash_cont_tokens": "780e569058de22be" + }, + "truncated": 0, + "non-truncated": 792, + "padded": 792, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hashes": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "90f755f89d9fdf5e", + "hash_cont_tokens": "7994d94bfa36d003" + }, + "truncated": 0, + "non-truncated": 772, + "padded": 772, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hashes": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "fb590ff6d9d11883", + "hash_cont_tokens": "8f5c8baf02161f10" + }, + "truncated": 0, + "non-truncated": 1560, + "padded": 1560, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hashes": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "551dbc75535ad2b8", + "hash_cont_tokens": "a2c91752be5b1798" + }, + "truncated": 0, + "non-truncated": 1080, + "padded": 1080, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hashes": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "d86fdf5706ec717c", + "hash_cont_tokens": 
"985403b262df21a4" + }, + "truncated": 0, + "non-truncated": 952, + "padded": 940, + "non-padded": 12, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_physics|5": { + "hashes": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "a81bca26abd92c41", + "hash_cont_tokens": "db71da66ed82b921" + }, + "truncated": 0, + "non-truncated": 604, + "padded": 604, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hashes": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "9c10077b5cda495b", + "hash_cont_tokens": "e81cf9738ad7e157" + }, + "truncated": 0, + "non-truncated": 2180, + "padded": 2180, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hashes": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "da0c215d66d16d3e", + "hash_cont_tokens": "4a2d5f00cb00d9b7" + }, + "truncated": 4, + "non-truncated": 860, + "padded": 860, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hashes": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "4885a382517deebf", + "hash_cont_tokens": "eab825cf8fbdd085" + }, + "truncated": 816, + "non-truncated": 0, + "padded": 0, + "non-padded": 816, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hashes": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "c1d80e899c4c8872", + "hash_cont_tokens": "e9bcfaa6beefb456" + }, + "truncated": 948, + "non-truncated": 0, + "padded": 0, + "non-padded": 948, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_aging|5": { + "hashes": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "39da19ee58ce07e6", + "hash_cont_tokens": "38eafdb22e9fca11" + }, + "truncated": 0, + "non-truncated": 892, + "padded": 892, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_sexuality|5": { + "hashes": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "f7e0441ab1c223e0", + "hash_cont_tokens": "11de075f88fc7cd2" + }, + "truncated": 0, + "non-truncated": 524, + "padded": 524, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-international_law|5": { + "hashes": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "119859c5b8103d0b", + "hash_cont_tokens": "6f8215a3de7eebd1" + }, + "truncated": 0, + "non-truncated": 484, + "padded": 484, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-jurisprudence|5": { + "hashes": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "6ec4910e741606cb", + "hash_cont_tokens": "5c77c6f472688075" + }, + "truncated": 0, + "non-truncated": 432, + "padded": 432, + "non-padded": 0, + "effective_few_shots": 5.0, + 
"num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hashes": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "96d8b2554f777e3a", + "hash_cont_tokens": "25a46284b3589e0d" + }, + "truncated": 0, + "non-truncated": 652, + "padded": 636, + "non-padded": 16, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-machine_learning|5": { + "hashes": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "249811a7d891a411", + "hash_cont_tokens": "aacac708cd4c5a61" + }, + "truncated": 0, + "non-truncated": 448, + "padded": 448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-management|5": { + "hashes": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "e54df495ffeb4f92", + "hash_cont_tokens": "d37808f586a9e9b5" + }, + "truncated": 0, + "non-truncated": 412, + "padded": 412, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-marketing|5": { + "hashes": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "e9110fe64f420eb5", + "hash_cont_tokens": "95faf210efa02f90" + }, + "truncated": 0, + "non-truncated": 936, + "padded": 936, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-medical_genetics|5": { + "hashes": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "743df5701590c1c5", + "hash_cont_tokens": "844bd0bf669e8136" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-miscellaneous|5": { + "hashes": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "4a20a40ea36bad2d", + "hash_cont_tokens": "ef1ae838a09a7521" + }, + "truncated": 0, + "non-truncated": 3132, + "padded": 3132, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_disputes|5": { + "hashes": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "10886977e5516586", + "hash_cont_tokens": "16b6c6e390eb7cea" + }, + "truncated": 0, + "non-truncated": 1384, + "padded": 1372, + "non-padded": 12, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hashes": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "66f56ab7c3b9d662", + "hash_cont_tokens": "4130880a19c4edb0" + }, + "truncated": 0, + "non-truncated": 3580, + "padded": 3580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-nutrition|5": { + "hashes": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "c05c54560499ea35", + "hash_cont_tokens": "96b81f570a84328b" + }, + "truncated": 0, + "non-truncated": 1224, + "padded": 1224, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-philosophy|5": { + "hashes": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + 
"hash_input_tokens": "9639c3d92ff98a28", + "hash_cont_tokens": "dddff9925c9b675a" + }, + "truncated": 0, + "non-truncated": 1244, + "padded": 1244, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-prehistory|5": { + "hashes": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "91e98834c3a8d8d9", + "hash_cont_tokens": "e3a7592f84b44888" + }, + "truncated": 0, + "non-truncated": 1296, + "padded": 1296, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_accounting|5": { + "hashes": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "569fa47691c73088", + "hash_cont_tokens": "f9edf462e8201551" + }, + "truncated": 0, + "non-truncated": 1128, + "padded": 1124, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_law|5": { + "hashes": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "d93d397bd5db1db6", + "hash_cont_tokens": "a2de48df0afbaff7" + }, + "truncated": 6136, + "non-truncated": 0, + "padded": 0, + "non-padded": 6136, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_medicine|5": { + "hashes": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "7f8acbbde12cfb6b", + "hash_cont_tokens": "ecf7754754c2bb76" + }, + "truncated": 1032, + "non-truncated": 56, + "padded": 48, + "non-padded": 1040, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_psychology|5": { + "hashes": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "3aa766c029099569", + "hash_cont_tokens": "30b07e31cf9b5c6f" + }, + "truncated": 0, + "non-truncated": 2448, + "padded": 2448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-public_relations|5": { + "hashes": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "87b924f88832986f", + "hash_cont_tokens": "cf3600a50782c6c5" + }, + "truncated": 0, + "non-truncated": 440, + "padded": 440, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-security_studies|5": { + "hashes": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "1aaa84da588878a6", + "hash_cont_tokens": "4d1dc7c4ad251829" + }, + "truncated": 980, + "non-truncated": 0, + "padded": 0, + "non-padded": 980, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-sociology|5": { + "hashes": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "fb555df6139eb2c8", + "hash_cont_tokens": "d36b9d9f0f4424fe" + }, + "truncated": 0, + "non-truncated": 804, + "padded": 800, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hashes": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "56cf1eebb25eccb1", + "hash_cont_tokens": "844bd0bf669e8136" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 
400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-virology|5": { + "hashes": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "c6affac16ec860be", + "hash_cont_tokens": "30d4fa4828c5468f" + }, + "truncated": 0, + "non-truncated": 664, + "padded": 664, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-world_religions|5": { + "hashes": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "d2c5da5a69a6312e", + "hash_cont_tokens": "a0a7af55ac7ae037" + }, + "truncated": 0, + "non-truncated": 684, + "padded": 684, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|truthfulqa:mc|0": { + "hashes": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "21ee2f46c9c3649e", + "hash_cont_tokens": "84fd36aa004c8578" + }, + "truncated": 0, + "non-truncated": 9996, + "padded": 9996, + "non-padded": 0, + "effective_few_shots": 0.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "d84d18e9a963753d", + "hash_full_prompts": "12b540783521a8e6", + "hash_input_tokens": "18a3fbefef0c4910", + "hash_cont_tokens": "24012b7d40528568" + }, + "total_evaluation_time_secondes": "1422.1547689437866", + "truncated": 14155, + "non-truncated": 96864, + "padded": 96540, + "non-padded": 14479, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/lgaalves/gpt2_camel_physics-platypus/results_2023-10-25T17-38-39.020163.json b/eval-results/lgaalves/gpt2_camel_physics-platypus/results_2023-10-25T17-38-39.020163.json new file mode 100644 index 0000000000000000000000000000000000000000..4f378bfd8e295ee918533e3110823462f92e3aeb --- /dev/null +++ b/eval-results/lgaalves/gpt2_camel_physics-platypus/results_2023-10-25T17-38-39.020163.json @@ -0,0 +1,107 @@ +{ + "config_general": { + "model_name": "lgaalves/gpt2_camel_physics-platypus", + "model_sha": "e6ce889b6929b9dc612a65f7ec6fddf04499ce4d", + "model_size": "238.85 MB", + "model_dtype": "torch.float16", + "lighteval_sha": "0f318ecf002208468154899217b3ba7c6ae09374", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|drop|3": { + "em": 0.002307046979865772, + "em_stderr": 0.0004913221265094493, + "f1": 0.04785339765100675, + "f1_stderr": 0.001366270058429369 + }, + "harness|gsm8k|5": { + "acc": 0.0, + "acc_stderr": 0.0 + }, + "harness|winogrande|5": { + "acc": 0.4964483030781373, + "acc_stderr": 0.014052131146915873 + }, + "all": { + "em": 0.002307046979865772, + "em_stderr": 0.0004913221265094493, + "f1": 0.04785339765100675, + "f1_stderr": 0.001366270058429369, + "acc": 0.24822415153906865, + "acc_stderr": 0.007026065573457936 + } + }, + "versions": { + "harness|drop|3": 1, + "harness|gsm8k|5": 0, + "harness|winogrande|5": 0, + "all": 0 + }, + "config_tasks": { + "harness|drop": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|drop|3": { + "hashes": { + "hash_examples": "1d27416e8324e9a3", + "hash_full_prompts": "a5513ff9a741b385", + "hash_input_tokens": "fa7a2e45b0104bc4", + "hash_cont_tokens": "9882c82804b094be" + }, + "truncated": 9290, + "non-truncated": 246, + "padded": 0, + 
"non-padded": 9536, + "effective_few_shots": 3.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "52733972d41ebb11", + "hash_cont_tokens": "da3b06f6923fe854" + }, + "truncated": 917, + "non-truncated": 402, + "padded": 0, + "non-padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "84cacac1590bb0a5", + "hash_cont_tokens": "64ca3ed9b5dacc6e" + }, + "truncated": 0, + "non-truncated": 2534, + "padded": 2426, + "non-padded": 108, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "9b4d8993161e637d", + "hash_full_prompts": "08215e527b7e60a5", + "hash_input_tokens": "6b3de69e2f87348c", + "hash_cont_tokens": "942fba3ebb9519e4" + }, + "total_evaluation_time_secondes": "5304.164784669876", + "truncated": 10207, + "non-truncated": 3182, + "padded": 2426, + "non-padded": 10963, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/lgaalves/gpt2_guanaco-dolly-platypus/results_2023-08-31T23-17-05.227048.json b/eval-results/lgaalves/gpt2_guanaco-dolly-platypus/results_2023-08-31T23-17-05.227048.json new file mode 100644 index 0000000000000000000000000000000000000000..d96f46fc6e7792e47b2f2057f5b9822577f6d853 --- /dev/null +++ b/eval-results/lgaalves/gpt2_guanaco-dolly-platypus/results_2023-08-31T23-17-05.227048.json @@ -0,0 +1,1366 @@ +{ + "config_general": { + "model_name": "lgaalves/gpt2_guanaco-dolly-platypus", + "model_sha": "6bf0a8146cf255c829ec2ad83926c8b80945b431", + "model_dtype": "torch.float16", + "lighteval_sha": "4b9a38c2102259daeb17d1d0294fc2b4ce7f4e63", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|arc:challenge|25": { + "acc": 0.20136518771331058, + "acc_stderr": 0.01171892747744427, + "acc_norm": 0.2354948805460751, + "acc_norm_stderr": 0.012399451855004748 + }, + "harness|hellaswag|10": { + "acc": 0.29187412865962953, + "acc_stderr": 0.004536955796510544, + "acc_norm": 0.31029675363473413, + "acc_norm_stderr": 0.004616695887762062 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.22, + "acc_stderr": 0.0416333199893227, + "acc_norm": 0.22, + "acc_norm_stderr": 0.0416333199893227 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.3037037037037037, + "acc_stderr": 0.03972552884785139, + "acc_norm": 0.3037037037037037, + "acc_norm_stderr": 0.03972552884785139 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.20394736842105263, + "acc_stderr": 0.03279000406310053, + "acc_norm": 0.20394736842105263, + "acc_norm_stderr": 0.03279000406310053 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.2, + "acc_stderr": 0.04020151261036846, + "acc_norm": 0.2, + "acc_norm_stderr": 0.04020151261036846 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.2943396226415094, + "acc_stderr": 0.028049186315695248, + "acc_norm": 0.2943396226415094, + "acc_norm_stderr": 0.028049186315695248 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.2916666666666667, + "acc_stderr": 0.03800968060554857, + "acc_norm": 0.2916666666666667, + "acc_norm_stderr": 0.03800968060554857 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.18, + 
"acc_stderr": 0.03861229196653695, + "acc_norm": 0.18, + "acc_norm_stderr": 0.03861229196653695 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.37, + "acc_stderr": 0.04852365870939099, + "acc_norm": 0.37, + "acc_norm_stderr": 0.04852365870939099 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.25, + "acc_stderr": 0.04351941398892446, + "acc_norm": 0.25, + "acc_norm_stderr": 0.04351941398892446 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.23699421965317918, + "acc_stderr": 0.03242414757483098, + "acc_norm": 0.23699421965317918, + "acc_norm_stderr": 0.03242414757483098 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.23529411764705882, + "acc_stderr": 0.04220773659171453, + "acc_norm": 0.23529411764705882, + "acc_norm_stderr": 0.04220773659171453 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.17, + "acc_stderr": 0.03775251680686371, + "acc_norm": 0.17, + "acc_norm_stderr": 0.03775251680686371 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.25957446808510637, + "acc_stderr": 0.028659179374292323, + "acc_norm": 0.25957446808510637, + "acc_norm_stderr": 0.028659179374292323 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.20175438596491227, + "acc_stderr": 0.037752050135836386, + "acc_norm": 0.20175438596491227, + "acc_norm_stderr": 0.037752050135836386 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.2827586206896552, + "acc_stderr": 0.037528339580033376, + "acc_norm": 0.2827586206896552, + "acc_norm_stderr": 0.037528339580033376 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.24867724867724866, + "acc_stderr": 0.022261817692400168, + "acc_norm": 0.24867724867724866, + "acc_norm_stderr": 0.022261817692400168 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.2698412698412698, + "acc_stderr": 0.039701582732351734, + "acc_norm": 0.2698412698412698, + "acc_norm_stderr": 0.039701582732351734 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.15, + "acc_stderr": 0.0358870281282637, + "acc_norm": 0.15, + "acc_norm_stderr": 0.0358870281282637 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.27741935483870966, + "acc_stderr": 0.025470196835900055, + "acc_norm": 0.27741935483870966, + "acc_norm_stderr": 0.025470196835900055 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.28078817733990147, + "acc_stderr": 0.03161856335358611, + "acc_norm": 0.28078817733990147, + "acc_norm_stderr": 0.03161856335358611 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.31, + "acc_stderr": 0.04648231987117316, + "acc_norm": 0.31, + "acc_norm_stderr": 0.04648231987117316 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.24848484848484848, + "acc_stderr": 0.03374402644139406, + "acc_norm": 0.24848484848484848, + "acc_norm_stderr": 0.03374402644139406 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.35353535353535354, + "acc_stderr": 0.03406086723547153, + "acc_norm": 0.35353535353535354, + "acc_norm_stderr": 0.03406086723547153 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.36787564766839376, + "acc_stderr": 0.03480175668466036, + "acc_norm": 0.36787564766839376, + "acc_norm_stderr": 0.03480175668466036 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.3230769230769231, + "acc_stderr": 0.02371088850197057, + "acc_norm": 0.3230769230769231, + "acc_norm_stderr": 
0.02371088850197057 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.26296296296296295, + "acc_stderr": 0.02684205787383371, + "acc_norm": 0.26296296296296295, + "acc_norm_stderr": 0.02684205787383371 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.20588235294117646, + "acc_stderr": 0.026265024608275886, + "acc_norm": 0.20588235294117646, + "acc_norm_stderr": 0.026265024608275886 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.23841059602649006, + "acc_stderr": 0.0347918557259966, + "acc_norm": 0.23841059602649006, + "acc_norm_stderr": 0.0347918557259966 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.3486238532110092, + "acc_stderr": 0.020431254090714328, + "acc_norm": 0.3486238532110092, + "acc_norm_stderr": 0.020431254090714328 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.4722222222222222, + "acc_stderr": 0.0340470532865388, + "acc_norm": 0.4722222222222222, + "acc_norm_stderr": 0.0340470532865388 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.24509803921568626, + "acc_stderr": 0.030190282453501943, + "acc_norm": 0.24509803921568626, + "acc_norm_stderr": 0.030190282453501943 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.26582278481012656, + "acc_stderr": 0.028756799629658335, + "acc_norm": 0.26582278481012656, + "acc_norm_stderr": 0.028756799629658335 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.15695067264573992, + "acc_stderr": 0.024413587174907415, + "acc_norm": 0.15695067264573992, + "acc_norm_stderr": 0.024413587174907415 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.21374045801526717, + "acc_stderr": 0.0359546161177469, + "acc_norm": 0.21374045801526717, + "acc_norm_stderr": 0.0359546161177469 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.39669421487603307, + "acc_stderr": 0.04465869780531009, + "acc_norm": 0.39669421487603307, + "acc_norm_stderr": 0.04465869780531009 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.17592592592592593, + "acc_stderr": 0.036809181416738807, + "acc_norm": 0.17592592592592593, + "acc_norm_stderr": 0.036809181416738807 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.26993865030674846, + "acc_stderr": 0.034878251684978906, + "acc_norm": 0.26993865030674846, + "acc_norm_stderr": 0.034878251684978906 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.17857142857142858, + "acc_stderr": 0.036352091215778065, + "acc_norm": 0.17857142857142858, + "acc_norm_stderr": 0.036352091215778065 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.3786407766990291, + "acc_stderr": 0.048026946982589726, + "acc_norm": 0.3786407766990291, + "acc_norm_stderr": 0.048026946982589726 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.1794871794871795, + "acc_stderr": 0.02514093595033544, + "acc_norm": 0.1794871794871795, + "acc_norm_stderr": 0.02514093595033544 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.3, + "acc_stderr": 0.046056618647183814, + "acc_norm": 0.3, + "acc_norm_stderr": 0.046056618647183814 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.20945083014048532, + "acc_stderr": 0.014551310568143709, + "acc_norm": 0.20945083014048532, + "acc_norm_stderr": 0.014551310568143709 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.24277456647398843, + "acc_stderr": 0.023083658586984204, + "acc_norm": 0.24277456647398843, + "acc_norm_stderr": 0.023083658586984204 + }, + 
"harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.24916201117318434, + "acc_stderr": 0.014465893829859933, + "acc_norm": 0.24916201117318434, + "acc_norm_stderr": 0.014465893829859933 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.25163398692810457, + "acc_stderr": 0.024848018263875195, + "acc_norm": 0.25163398692810457, + "acc_norm_stderr": 0.024848018263875195 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.2604501607717042, + "acc_stderr": 0.024926723224845553, + "acc_norm": 0.2604501607717042, + "acc_norm_stderr": 0.024926723224845553 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.22530864197530864, + "acc_stderr": 0.023246202647819746, + "acc_norm": 0.22530864197530864, + "acc_norm_stderr": 0.023246202647819746 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.2695035460992908, + "acc_stderr": 0.026469036818590638, + "acc_norm": 0.2695035460992908, + "acc_norm_stderr": 0.026469036818590638 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.26010430247718386, + "acc_stderr": 0.01120438288782383, + "acc_norm": 0.26010430247718386, + "acc_norm_stderr": 0.01120438288782383 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.4485294117647059, + "acc_stderr": 0.030211479609121593, + "acc_norm": 0.4485294117647059, + "acc_norm_stderr": 0.030211479609121593 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.2679738562091503, + "acc_stderr": 0.017917974069594726, + "acc_norm": 0.2679738562091503, + "acc_norm_stderr": 0.017917974069594726 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.21818181818181817, + "acc_stderr": 0.03955932861795833, + "acc_norm": 0.21818181818181817, + "acc_norm_stderr": 0.03955932861795833 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.4, + "acc_stderr": 0.031362502409358936, + "acc_norm": 0.4, + "acc_norm_stderr": 0.031362502409358936 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.22885572139303484, + "acc_stderr": 0.029705284056772426, + "acc_norm": 0.22885572139303484, + "acc_norm_stderr": 0.029705284056772426 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.26, + "acc_stderr": 0.04408440022768078, + "acc_norm": 0.26, + "acc_norm_stderr": 0.04408440022768078 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.1927710843373494, + "acc_stderr": 0.030709824050565274, + "acc_norm": 0.1927710843373494, + "acc_norm_stderr": 0.030709824050565274 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.24561403508771928, + "acc_stderr": 0.0330140594698725, + "acc_norm": 0.24561403508771928, + "acc_norm_stderr": 0.0330140594698725 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.22643818849449204, + "mc1_stderr": 0.014651337324602574, + "mc2": 0.400227595117968, + "mc2_stderr": 0.014706588296799038 + }, + "all": { + "acc": 0.2634280915972625, + "acc_stderr": 0.03159930223583716, + "acc_norm": 0.2643188088482433, + "acc_norm_stderr": 0.0316121880742916, + "mc1": 0.22643818849449204, + "mc1_stderr": 0.014651337324602574, + "mc2": 0.400227595117968, + "mc2_stderr": 0.014706588296799038 + } + }, + "versions": { + "harness|arc:challenge|25": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + 
"harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "all": 0 + }, + "config_tasks": { + "harness|arc:challenge": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM 
Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task" + }, + "summary_tasks": { + "harness|arc:challenge|25": { + "hashes": { + "hash_examples": "17b0cae357c0259e", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "e641be907f06d33d", + "hash_cont_tokens": "d57e59a4130853e0" + }, + "truncated": 1568, + "non-truncated": 3119, + "padded": 3087, + "non-padded": 1600, + "effective_few_shots": 25.0, + "num_truncated_few_shots": 0 + }, + "harness|hellaswag|10": { + "hashes": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "faab28c8a52792fc", + "hash_cont_tokens": "d8973ec3a510d4bc" + }, + "truncated": 1975, + "non-truncated": 38193, + "padded": 38021, + "non-padded": 2147, + 
"effective_few_shots": 10.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hashes": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "38f6980885e34dfd", + "hash_cont_tokens": "844bd0bf669e8136" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-anatomy|5": { + "hashes": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "3ed9431cd09b2a53", + "hash_cont_tokens": "aa3ffb1a6e4356f5" + }, + "truncated": 0, + "non-truncated": 540, + "padded": 540, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-astronomy|5": { + "hashes": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "a79fd75ecff4dacc", + "hash_cont_tokens": "4a75531cbfd07f95" + }, + "truncated": 0, + "non-truncated": 608, + "padded": 608, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-business_ethics|5": { + "hashes": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "178d5666661bf5e1", + "hash_cont_tokens": "accb7cef363cf18e" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hashes": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "c926698f7ff06973", + "hash_cont_tokens": "cd61f7de0830a75a" + }, + "truncated": 0, + "non-truncated": 1060, + "padded": 1060, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_biology|5": { + "hashes": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "242f772c5e78312a", + "hash_cont_tokens": "16b3626c8a5e3797" + }, + "truncated": 0, + "non-truncated": 576, + "padded": 568, + "non-padded": 8, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_chemistry|5": { + "hashes": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "8502d8627d2d7aad", + "hash_cont_tokens": "844bd0bf669e8136" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_computer_science|5": { + "hashes": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "a0d705ea2c235707", + "hash_cont_tokens": "14362f67beb028ba" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_mathematics|5": { + "hashes": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "ff09ef7f164943cd", + "hash_cont_tokens": "69d91a3fd2e4511e" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_medicine|5": { + "hashes": { + "hash_examples": 
"dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "aca3949388066394", + "hash_cont_tokens": "62bb469d2a319d91" + }, + "truncated": 20, + "non-truncated": 672, + "padded": 660, + "non-padded": 32, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_physics|5": { + "hashes": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "c4240f372187f487", + "hash_cont_tokens": "bf103c9a1f61ec12" + }, + "truncated": 0, + "non-truncated": 408, + "padded": 404, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-computer_security|5": { + "hashes": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "70a866a1c6ae11ae", + "hash_cont_tokens": "844bd0bf669e8136" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hashes": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "29b68a5b3f3afa5f", + "hash_cont_tokens": "ff5ca3d84bb47a0b" + }, + "truncated": 0, + "non-truncated": 940, + "padded": 940, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-econometrics|5": { + "hashes": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "a4a0fc579875cdf9", + "hash_cont_tokens": "4468714c283b10f9" + }, + "truncated": 0, + "non-truncated": 456, + "padded": 456, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hashes": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "e1c0ec634eb17ebd", + "hash_cont_tokens": "35bf6c0c1a7ee403" + }, + "truncated": 0, + "non-truncated": 580, + "padded": 580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hashes": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "542453ad0f99dacf", + "hash_cont_tokens": "8d66c298f1a52c46" + }, + "truncated": 0, + "non-truncated": 1512, + "padded": 1488, + "non-padded": 24, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-formal_logic|5": { + "hashes": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "dacff0458f665ef2", + "hash_cont_tokens": "f23c2d0723d2f830" + }, + "truncated": 0, + "non-truncated": 504, + "padded": 504, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-global_facts|5": { + "hashes": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "61dec75d557c2e93", + "hash_cont_tokens": "844bd0bf669e8136" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_biology|5": { + "hashes": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "d0afdf91820cacc8", + "hash_cont_tokens": "9cf4df701a8e97ca" + }, 
+ "truncated": 0, + "non-truncated": 1240, + "padded": 1240, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hashes": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "75cd47b5490da17b", + "hash_cont_tokens": "c3deabee1deab3a3" + }, + "truncated": 0, + "non-truncated": 812, + "padded": 796, + "non-padded": 16, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hashes": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "e369e98a1d0a7424", + "hash_cont_tokens": "120b77ffae8b0591" + }, + "truncated": 16, + "non-truncated": 384, + "padded": 384, + "non-padded": 16, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hashes": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "502376958174bf81", + "hash_cont_tokens": "c4f2565ca36881d5" + }, + "truncated": 660, + "non-truncated": 0, + "padded": 0, + "non-padded": 660, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_geography|5": { + "hashes": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "a4866b51f8a7a60e", + "hash_cont_tokens": "780e569058de22be" + }, + "truncated": 0, + "non-truncated": 792, + "padded": 792, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hashes": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "90f755f89d9fdf5e", + "hash_cont_tokens": "1ba11ec0fba0a4bb" + }, + "truncated": 0, + "non-truncated": 772, + "padded": 772, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hashes": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "fb590ff6d9d11883", + "hash_cont_tokens": "8f5c8baf02161f10" + }, + "truncated": 0, + "non-truncated": 1560, + "padded": 1560, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hashes": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "551dbc75535ad2b8", + "hash_cont_tokens": "822c5217a581c95f" + }, + "truncated": 0, + "non-truncated": 1080, + "padded": 1080, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hashes": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "d86fdf5706ec717c", + "hash_cont_tokens": "985403b262df21a4" + }, + "truncated": 0, + "non-truncated": 952, + "padded": 940, + "non-padded": 12, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_physics|5": { + "hashes": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "a81bca26abd92c41", + "hash_cont_tokens": "a745b56725d20832" + }, + "truncated": 0, + "non-truncated": 604, + "padded": 604, + 
"non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hashes": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "9c10077b5cda495b", + "hash_cont_tokens": "969464bbd6828346" + }, + "truncated": 0, + "non-truncated": 2180, + "padded": 2180, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hashes": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "da0c215d66d16d3e", + "hash_cont_tokens": "f00cfc03022d559a" + }, + "truncated": 4, + "non-truncated": 860, + "padded": 860, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hashes": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "4885a382517deebf", + "hash_cont_tokens": "eab825cf8fbdd085" + }, + "truncated": 816, + "non-truncated": 0, + "padded": 0, + "non-padded": 816, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hashes": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "c1d80e899c4c8872", + "hash_cont_tokens": "f6dd7cf291429cd9" + }, + "truncated": 948, + "non-truncated": 0, + "padded": 0, + "non-padded": 948, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_aging|5": { + "hashes": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "39da19ee58ce07e6", + "hash_cont_tokens": "38eafdb22e9fca11" + }, + "truncated": 0, + "non-truncated": 892, + "padded": 892, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_sexuality|5": { + "hashes": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "f7e0441ab1c223e0", + "hash_cont_tokens": "11de075f88fc7cd2" + }, + "truncated": 0, + "non-truncated": 524, + "padded": 524, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-international_law|5": { + "hashes": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "119859c5b8103d0b", + "hash_cont_tokens": "ad79993e5e453770" + }, + "truncated": 0, + "non-truncated": 484, + "padded": 484, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-jurisprudence|5": { + "hashes": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "6ec4910e741606cb", + "hash_cont_tokens": "5c77c6f472688075" + }, + "truncated": 0, + "non-truncated": 432, + "padded": 432, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hashes": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "96d8b2554f777e3a", + "hash_cont_tokens": "25a46284b3589e0d" + }, + "truncated": 0, + "non-truncated": 652, + "padded": 636, + "non-padded": 16, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-machine_learning|5": { + 
"hashes": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "249811a7d891a411", + "hash_cont_tokens": "5904fef477924132" + }, + "truncated": 0, + "non-truncated": 448, + "padded": 448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-management|5": { + "hashes": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "e54df495ffeb4f92", + "hash_cont_tokens": "d37808f586a9e9b5" + }, + "truncated": 0, + "non-truncated": 412, + "padded": 412, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-marketing|5": { + "hashes": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "e9110fe64f420eb5", + "hash_cont_tokens": "95faf210efa02f90" + }, + "truncated": 0, + "non-truncated": 936, + "padded": 936, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-medical_genetics|5": { + "hashes": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "743df5701590c1c5", + "hash_cont_tokens": "844bd0bf669e8136" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-miscellaneous|5": { + "hashes": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "4a20a40ea36bad2d", + "hash_cont_tokens": "ef1ae838a09a7521" + }, + "truncated": 0, + "non-truncated": 3132, + "padded": 3132, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_disputes|5": { + "hashes": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "10886977e5516586", + "hash_cont_tokens": "201895f1be790f02" + }, + "truncated": 0, + "non-truncated": 1384, + "padded": 1372, + "non-padded": 12, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hashes": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "66f56ab7c3b9d662", + "hash_cont_tokens": "38fadc6201499c0e" + }, + "truncated": 0, + "non-truncated": 3580, + "padded": 3580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-nutrition|5": { + "hashes": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "c05c54560499ea35", + "hash_cont_tokens": "dcdd301556b5df9e" + }, + "truncated": 0, + "non-truncated": 1224, + "padded": 1224, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-philosophy|5": { + "hashes": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "9639c3d92ff98a28", + "hash_cont_tokens": "dddff9925c9b675a" + }, + "truncated": 0, + "non-truncated": 1244, + "padded": 1244, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-prehistory|5": { + "hashes": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "91e98834c3a8d8d9", + "hash_cont_tokens": "67c525ef797587ce" + }, + 
"truncated": 0, + "non-truncated": 1296, + "padded": 1296, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_accounting|5": { + "hashes": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "569fa47691c73088", + "hash_cont_tokens": "0d9fbe99f871c5c5" + }, + "truncated": 0, + "non-truncated": 1128, + "padded": 1124, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_law|5": { + "hashes": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "d93d397bd5db1db6", + "hash_cont_tokens": "a2de48df0afbaff7" + }, + "truncated": 6136, + "non-truncated": 0, + "padded": 0, + "non-padded": 6136, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_medicine|5": { + "hashes": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "7f8acbbde12cfb6b", + "hash_cont_tokens": "01ddc79c7e1f2f6d" + }, + "truncated": 1032, + "non-truncated": 56, + "padded": 48, + "non-padded": 1040, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_psychology|5": { + "hashes": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "3aa766c029099569", + "hash_cont_tokens": "fa0fc10c4bdd757c" + }, + "truncated": 0, + "non-truncated": 2448, + "padded": 2448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-public_relations|5": { + "hashes": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "87b924f88832986f", + "hash_cont_tokens": "cf3600a50782c6c5" + }, + "truncated": 0, + "non-truncated": 440, + "padded": 440, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-security_studies|5": { + "hashes": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "1aaa84da588878a6", + "hash_cont_tokens": "6483ae9688e0a0d6" + }, + "truncated": 980, + "non-truncated": 0, + "padded": 0, + "non-padded": 980, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-sociology|5": { + "hashes": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "fb555df6139eb2c8", + "hash_cont_tokens": "9ec52ea7962c54f5" + }, + "truncated": 0, + "non-truncated": 804, + "padded": 800, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hashes": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "56cf1eebb25eccb1", + "hash_cont_tokens": "844bd0bf669e8136" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-virology|5": { + "hashes": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "c6affac16ec860be", + "hash_cont_tokens": "30d4fa4828c5468f" + }, + "truncated": 0, + "non-truncated": 664, + "padded": 664, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + 
"harness|hendrycksTest-world_religions|5": { + "hashes": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "d2c5da5a69a6312e", + "hash_cont_tokens": "bc42db2c568e27d6" + }, + "truncated": 0, + "non-truncated": 684, + "padded": 684, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|truthfulqa:mc|0": { + "hashes": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "21ee2f46c9c3649e", + "hash_cont_tokens": "c8f2395107c4b82b" + }, + "truncated": 0, + "non-truncated": 9996, + "padded": 9996, + "non-padded": 0, + "effective_few_shots": 0.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "d84d18e9a963753d", + "hash_full_prompts": "12b540783521a8e6", + "hash_input_tokens": "18a3fbefef0c4910", + "hash_cont_tokens": "f1f2fb65023f2668" + }, + "total_evaluation_time_secondes": "1153.3299033641815", + "truncated": 14155, + "non-truncated": 96864, + "padded": 96540, + "non-padded": 14479, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/lgaalves/gpt2_guanaco-dolly-platypus/results_2023-10-15T17-11-56.219131.json b/eval-results/lgaalves/gpt2_guanaco-dolly-platypus/results_2023-10-15T17-11-56.219131.json new file mode 100644 index 0000000000000000000000000000000000000000..48f4216d67f0d3e68feb3313fd72d7421648a58d --- /dev/null +++ b/eval-results/lgaalves/gpt2_guanaco-dolly-platypus/results_2023-10-15T17-11-56.219131.json @@ -0,0 +1,107 @@ +{ + "config_general": { + "model_name": "lgaalves/gpt2_guanaco-dolly-platypus", + "model_sha": "72fe7fd1e4a2c313f15945b9894e0f5813d248d8", + "model_size": "238.85 MB", + "model_dtype": "torch.float16", + "lighteval_sha": "0f318ecf002208468154899217b3ba7c6ae09374", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|drop|3": { + "em": 0.0026216442953020135, + "em_stderr": 0.0005236685642965757, + "f1": 0.04961304530201346, + "f1_stderr": 0.001421455981669693 + }, + "harness|gsm8k|5": { + "acc": 0.0, + "acc_stderr": 0.0 + }, + "harness|winogrande|5": { + "acc": 0.5011838989739542, + "acc_stderr": 0.014052446290529012 + }, + "all": { + "em": 0.0026216442953020135, + "em_stderr": 0.0005236685642965757, + "f1": 0.04961304530201346, + "f1_stderr": 0.001421455981669693, + "acc": 0.2505919494869771, + "acc_stderr": 0.007026223145264506 + } + }, + "versions": { + "harness|drop|3": 1, + "harness|gsm8k|5": 0, + "harness|winogrande|5": 0, + "all": 0 + }, + "config_tasks": { + "harness|drop": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|drop|3": { + "hashes": { + "hash_examples": "1d27416e8324e9a3", + "hash_full_prompts": "a5513ff9a741b385", + "hash_input_tokens": "fa7a2e45b0104bc4", + "hash_cont_tokens": "66ee3e799c67d679" + }, + "truncated": 9290, + "non-truncated": 246, + "padded": 0, + "non-padded": 9536, + "effective_few_shots": 3.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "52733972d41ebb11", + "hash_cont_tokens": "7fd536076038ba9e" + }, + "truncated": 917, + "non-truncated": 402, + "padded": 0, + "non-padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { 
+ "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "84cacac1590bb0a5", + "hash_cont_tokens": "64ca3ed9b5dacc6e" + }, + "truncated": 0, + "non-truncated": 2534, + "padded": 2426, + "non-padded": 108, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "9b4d8993161e637d", + "hash_full_prompts": "08215e527b7e60a5", + "hash_input_tokens": "6b3de69e2f87348c", + "hash_cont_tokens": "2926f119c704ff04" + }, + "total_evaluation_time_secondes": "5504.074031352997", + "truncated": 10207, + "non-truncated": 3182, + "padded": 2426, + "non-padded": 10963, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/lgaalves/gpt2_open-platypus/results_2023-08-31T17-11-08.445217.json b/eval-results/lgaalves/gpt2_open-platypus/results_2023-08-31T17-11-08.445217.json new file mode 100644 index 0000000000000000000000000000000000000000..ce48d624241616a2be521a92546f7ecf4dd45762 --- /dev/null +++ b/eval-results/lgaalves/gpt2_open-platypus/results_2023-08-31T17-11-08.445217.json @@ -0,0 +1,1366 @@ +{ + "config_general": { + "model_name": "lgaalves/gpt2_open-platypus", + "model_sha": "745c1864b752525789cad2b75166c519a327325e", + "model_dtype": "torch.float16", + "lighteval_sha": "4b9a38c2102259daeb17d1d0294fc2b4ce7f4e63", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|arc:challenge|25": { + "acc": 0.18771331058020477, + "acc_stderr": 0.011411001314155128, + "acc_norm": 0.22184300341296928, + "acc_norm_stderr": 0.012141659068147887 + }, + "harness|hellaswag|10": { + "acc": 0.29376618203545113, + "acc_stderr": 0.004545552424153374, + "acc_norm": 0.312885879306911, + "acc_norm_stderr": 0.004627207073171274 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.2, + "acc_stderr": 0.04020151261036843, + "acc_norm": 0.2, + "acc_norm_stderr": 0.04020151261036843 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.2962962962962963, + "acc_stderr": 0.03944624162501117, + "acc_norm": 0.2962962962962963, + "acc_norm_stderr": 0.03944624162501117 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.25, + "acc_stderr": 0.03523807393012047, + "acc_norm": 0.25, + "acc_norm_stderr": 0.03523807393012047 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.15, + "acc_stderr": 0.03588702812826372, + "acc_norm": 0.15, + "acc_norm_stderr": 0.03588702812826372 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.2981132075471698, + "acc_stderr": 0.028152837942493857, + "acc_norm": 0.2981132075471698, + "acc_norm_stderr": 0.028152837942493857 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.2569444444444444, + "acc_stderr": 0.03653946969442099, + "acc_norm": 0.2569444444444444, + "acc_norm_stderr": 0.03653946969442099 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.33, + "acc_stderr": 0.047258156262526045, + "acc_norm": 0.33, + "acc_norm_stderr": 0.047258156262526045 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.32, + "acc_stderr": 0.046882617226215034, + "acc_norm": 0.32, + "acc_norm_stderr": 0.046882617226215034 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.29, + "acc_stderr": 0.04560480215720684, + "acc_norm": 0.29, + "acc_norm_stderr": 0.04560480215720684 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.24277456647398843, + 
"acc_stderr": 0.0326926380614177, + "acc_norm": 0.24277456647398843, + "acc_norm_stderr": 0.0326926380614177 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.18627450980392157, + "acc_stderr": 0.038739587141493524, + "acc_norm": 0.18627450980392157, + "acc_norm_stderr": 0.038739587141493524 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.18, + "acc_stderr": 0.038612291966536955, + "acc_norm": 0.18, + "acc_norm_stderr": 0.038612291966536955 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.251063829787234, + "acc_stderr": 0.02834696377716245, + "acc_norm": 0.251063829787234, + "acc_norm_stderr": 0.02834696377716245 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.23684210526315788, + "acc_stderr": 0.039994238792813365, + "acc_norm": 0.23684210526315788, + "acc_norm_stderr": 0.039994238792813365 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.296551724137931, + "acc_stderr": 0.03806142687309994, + "acc_norm": 0.296551724137931, + "acc_norm_stderr": 0.03806142687309994 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.25132275132275134, + "acc_stderr": 0.022340482339643898, + "acc_norm": 0.25132275132275134, + "acc_norm_stderr": 0.022340482339643898 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.25396825396825395, + "acc_stderr": 0.03893259610604675, + "acc_norm": 0.25396825396825395, + "acc_norm_stderr": 0.03893259610604675 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.14, + "acc_stderr": 0.03487350880197771, + "acc_norm": 0.14, + "acc_norm_stderr": 0.03487350880197771 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.24516129032258063, + "acc_stderr": 0.024472243840895525, + "acc_norm": 0.24516129032258063, + "acc_norm_stderr": 0.024472243840895525 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.2660098522167488, + "acc_stderr": 0.03108982600293752, + "acc_norm": 0.2660098522167488, + "acc_norm_stderr": 0.03108982600293752 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.21, + "acc_stderr": 0.040936018074033256, + "acc_norm": 0.21, + "acc_norm_stderr": 0.040936018074033256 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.23030303030303031, + "acc_stderr": 0.03287666758603488, + "acc_norm": 0.23030303030303031, + "acc_norm_stderr": 0.03287666758603488 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.35353535353535354, + "acc_stderr": 0.03406086723547153, + "acc_norm": 0.35353535353535354, + "acc_norm_stderr": 0.03406086723547153 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.36787564766839376, + "acc_stderr": 0.03480175668466036, + "acc_norm": 0.36787564766839376, + "acc_norm_stderr": 0.03480175668466036 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.36153846153846153, + "acc_stderr": 0.024359581465396987, + "acc_norm": 0.36153846153846153, + "acc_norm_stderr": 0.024359581465396987 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.26296296296296295, + "acc_stderr": 0.026842057873833706, + "acc_norm": 0.26296296296296295, + "acc_norm_stderr": 0.026842057873833706 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.2184873949579832, + "acc_stderr": 0.026841514322958945, + "acc_norm": 0.2184873949579832, + "acc_norm_stderr": 0.026841514322958945 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.271523178807947, + "acc_stderr": 
0.03631329803969654, + "acc_norm": 0.271523178807947, + "acc_norm_stderr": 0.03631329803969654 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.3486238532110092, + "acc_stderr": 0.020431254090714328, + "acc_norm": 0.3486238532110092, + "acc_norm_stderr": 0.020431254090714328 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.4722222222222222, + "acc_stderr": 0.0340470532865388, + "acc_norm": 0.4722222222222222, + "acc_norm_stderr": 0.0340470532865388 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.25980392156862747, + "acc_stderr": 0.030778554678693268, + "acc_norm": 0.25980392156862747, + "acc_norm_stderr": 0.030778554678693268 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.270042194092827, + "acc_stderr": 0.028900721906293426, + "acc_norm": 0.270042194092827, + "acc_norm_stderr": 0.028900721906293426 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.10762331838565023, + "acc_stderr": 0.020799400082879997, + "acc_norm": 0.10762331838565023, + "acc_norm_stderr": 0.020799400082879997 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.29770992366412213, + "acc_stderr": 0.040103589424622034, + "acc_norm": 0.29770992366412213, + "acc_norm_stderr": 0.040103589424622034 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.256198347107438, + "acc_stderr": 0.03984979653302871, + "acc_norm": 0.256198347107438, + "acc_norm_stderr": 0.03984979653302871 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.18518518518518517, + "acc_stderr": 0.03755265865037181, + "acc_norm": 0.18518518518518517, + "acc_norm_stderr": 0.03755265865037181 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.2883435582822086, + "acc_stderr": 0.035590395316173425, + "acc_norm": 0.2883435582822086, + "acc_norm_stderr": 0.035590395316173425 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.15178571428571427, + "acc_stderr": 0.034057028381856945, + "acc_norm": 0.15178571428571427, + "acc_norm_stderr": 0.034057028381856945 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.36893203883495146, + "acc_stderr": 0.047776151811567386, + "acc_norm": 0.36893203883495146, + "acc_norm_stderr": 0.047776151811567386 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.2222222222222222, + "acc_stderr": 0.027236013946196697, + "acc_norm": 0.2222222222222222, + "acc_norm_stderr": 0.027236013946196697 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.28, + "acc_stderr": 0.04512608598542129, + "acc_norm": 0.28, + "acc_norm_stderr": 0.04512608598542129 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.2081736909323116, + "acc_stderr": 0.014518592248904033, + "acc_norm": 0.2081736909323116, + "acc_norm_stderr": 0.014518592248904033 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.2514450867052023, + "acc_stderr": 0.02335736578587404, + "acc_norm": 0.2514450867052023, + "acc_norm_stderr": 0.02335736578587404 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.2446927374301676, + "acc_stderr": 0.014378169884098442, + "acc_norm": 0.2446927374301676, + "acc_norm_stderr": 0.014378169884098442 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.2581699346405229, + "acc_stderr": 0.025058503316958154, + "acc_norm": 0.2581699346405229, + "acc_norm_stderr": 0.025058503316958154 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.24115755627009647, + "acc_stderr": 0.024296594034763426, + "acc_norm": 0.24115755627009647, + 
"acc_norm_stderr": 0.024296594034763426 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.22530864197530864, + "acc_stderr": 0.023246202647819746, + "acc_norm": 0.22530864197530864, + "acc_norm_stderr": 0.023246202647819746 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.2624113475177305, + "acc_stderr": 0.026244920349843014, + "acc_norm": 0.2624113475177305, + "acc_norm_stderr": 0.026244920349843014 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.25684485006518903, + "acc_stderr": 0.011158455853098846, + "acc_norm": 0.25684485006518903, + "acc_norm_stderr": 0.011158455853098846 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.4485294117647059, + "acc_stderr": 0.030211479609121593, + "acc_norm": 0.4485294117647059, + "acc_norm_stderr": 0.030211479609121593 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.22549019607843138, + "acc_stderr": 0.016906615927288145, + "acc_norm": 0.22549019607843138, + "acc_norm_stderr": 0.016906615927288145 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.23636363636363636, + "acc_stderr": 0.04069306319721377, + "acc_norm": 0.23636363636363636, + "acc_norm_stderr": 0.04069306319721377 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.4, + "acc_stderr": 0.031362502409358936, + "acc_norm": 0.4, + "acc_norm_stderr": 0.031362502409358936 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.1890547263681592, + "acc_stderr": 0.02768691358801302, + "acc_norm": 0.1890547263681592, + "acc_norm_stderr": 0.02768691358801302 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.26, + "acc_stderr": 0.04408440022768078, + "acc_norm": 0.26, + "acc_norm_stderr": 0.04408440022768078 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.1927710843373494, + "acc_stderr": 0.030709824050565274, + "acc_norm": 0.1927710843373494, + "acc_norm_stderr": 0.030709824050565274 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.30409356725146197, + "acc_stderr": 0.03528211258245233, + "acc_norm": 0.30409356725146197, + "acc_norm_stderr": 0.03528211258245233 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.23255813953488372, + "mc1_stderr": 0.014789157531080501, + "mc2": 0.4035485299373991, + "mc2_stderr": 0.014943399102893608 + }, + "all": { + "acc": 0.26122419187689344, + "acc_stderr": 0.031488123357770466, + "acc_norm": 0.26212672391255826, + "acc_norm_stderr": 0.03150189136460116, + "mc1": 0.23255813953488372, + "mc1_stderr": 0.014789157531080501, + "mc2": 0.4035485299373991, + "mc2_stderr": 0.014943399102893608 + } + }, + "versions": { + "harness|arc:challenge|25": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + 
"harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "all": 0 + }, + "config_tasks": { + "harness|arc:challenge": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + 
"harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task" + }, + "summary_tasks": { + "harness|arc:challenge|25": { + "hashes": { + "hash_examples": "17b0cae357c0259e", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "e641be907f06d33d", + "hash_cont_tokens": "d57e59a4130853e0" + }, + "truncated": 1568, + "non-truncated": 3119, + "padded": 3087, + "non-padded": 1600, + "effective_few_shots": 25.0, + "num_truncated_few_shots": 0 + }, + "harness|hellaswag|10": { + "hashes": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "faab28c8a52792fc", + "hash_cont_tokens": "d8973ec3a510d4bc" + }, + "truncated": 1975, + "non-truncated": 38193, + "padded": 38021, + "non-padded": 2147, + "effective_few_shots": 10.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hashes": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "38f6980885e34dfd", + "hash_cont_tokens": "844bd0bf669e8136" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-anatomy|5": { + "hashes": 
{ + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "3ed9431cd09b2a53", + "hash_cont_tokens": "aa3ffb1a6e4356f5" + }, + "truncated": 0, + "non-truncated": 540, + "padded": 540, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-astronomy|5": { + "hashes": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "a79fd75ecff4dacc", + "hash_cont_tokens": "4a75531cbfd07f95" + }, + "truncated": 0, + "non-truncated": 608, + "padded": 608, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-business_ethics|5": { + "hashes": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "178d5666661bf5e1", + "hash_cont_tokens": "accb7cef363cf18e" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hashes": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "c926698f7ff06973", + "hash_cont_tokens": "cd61f7de0830a75a" + }, + "truncated": 0, + "non-truncated": 1060, + "padded": 1060, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_biology|5": { + "hashes": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "242f772c5e78312a", + "hash_cont_tokens": "16b3626c8a5e3797" + }, + "truncated": 0, + "non-truncated": 576, + "padded": 568, + "non-padded": 8, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_chemistry|5": { + "hashes": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "8502d8627d2d7aad", + "hash_cont_tokens": "844bd0bf669e8136" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_computer_science|5": { + "hashes": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "a0d705ea2c235707", + "hash_cont_tokens": "14362f67beb028ba" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_mathematics|5": { + "hashes": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "ff09ef7f164943cd", + "hash_cont_tokens": "69d91a3fd2e4511e" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_medicine|5": { + "hashes": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "aca3949388066394", + "hash_cont_tokens": "62bb469d2a319d91" + }, + "truncated": 20, + "non-truncated": 672, + "padded": 660, + "non-padded": 32, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_physics|5": { + "hashes": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "c4240f372187f487", + "hash_cont_tokens": 
"bf103c9a1f61ec12" + }, + "truncated": 0, + "non-truncated": 408, + "padded": 404, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-computer_security|5": { + "hashes": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "70a866a1c6ae11ae", + "hash_cont_tokens": "844bd0bf669e8136" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hashes": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "29b68a5b3f3afa5f", + "hash_cont_tokens": "ff5ca3d84bb47a0b" + }, + "truncated": 0, + "non-truncated": 940, + "padded": 940, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-econometrics|5": { + "hashes": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "a4a0fc579875cdf9", + "hash_cont_tokens": "4468714c283b10f9" + }, + "truncated": 0, + "non-truncated": 456, + "padded": 456, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hashes": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "e1c0ec634eb17ebd", + "hash_cont_tokens": "35bf6c0c1a7ee403" + }, + "truncated": 0, + "non-truncated": 580, + "padded": 580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hashes": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "542453ad0f99dacf", + "hash_cont_tokens": "8d66c298f1a52c46" + }, + "truncated": 0, + "non-truncated": 1512, + "padded": 1488, + "non-padded": 24, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-formal_logic|5": { + "hashes": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "dacff0458f665ef2", + "hash_cont_tokens": "f23c2d0723d2f830" + }, + "truncated": 0, + "non-truncated": 504, + "padded": 504, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-global_facts|5": { + "hashes": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "61dec75d557c2e93", + "hash_cont_tokens": "844bd0bf669e8136" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_biology|5": { + "hashes": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "d0afdf91820cacc8", + "hash_cont_tokens": "9cf4df701a8e97ca" + }, + "truncated": 0, + "non-truncated": 1240, + "padded": 1240, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hashes": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "75cd47b5490da17b", + "hash_cont_tokens": "c3deabee1deab3a3" + }, + "truncated": 0, + "non-truncated": 812, + "padded": 796, + "non-padded": 16, + "effective_few_shots": 5.0, + 
"num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hashes": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "e369e98a1d0a7424", + "hash_cont_tokens": "120b77ffae8b0591" + }, + "truncated": 16, + "non-truncated": 384, + "padded": 384, + "non-padded": 16, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hashes": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "502376958174bf81", + "hash_cont_tokens": "c4f2565ca36881d5" + }, + "truncated": 660, + "non-truncated": 0, + "padded": 0, + "non-padded": 660, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_geography|5": { + "hashes": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "a4866b51f8a7a60e", + "hash_cont_tokens": "780e569058de22be" + }, + "truncated": 0, + "non-truncated": 792, + "padded": 792, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hashes": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "90f755f89d9fdf5e", + "hash_cont_tokens": "1ba11ec0fba0a4bb" + }, + "truncated": 0, + "non-truncated": 772, + "padded": 772, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hashes": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "fb590ff6d9d11883", + "hash_cont_tokens": "8f5c8baf02161f10" + }, + "truncated": 0, + "non-truncated": 1560, + "padded": 1560, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hashes": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "551dbc75535ad2b8", + "hash_cont_tokens": "822c5217a581c95f" + }, + "truncated": 0, + "non-truncated": 1080, + "padded": 1080, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hashes": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "d86fdf5706ec717c", + "hash_cont_tokens": "985403b262df21a4" + }, + "truncated": 0, + "non-truncated": 952, + "padded": 940, + "non-padded": 12, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_physics|5": { + "hashes": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "a81bca26abd92c41", + "hash_cont_tokens": "a745b56725d20832" + }, + "truncated": 0, + "non-truncated": 604, + "padded": 604, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hashes": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "9c10077b5cda495b", + "hash_cont_tokens": "969464bbd6828346" + }, + "truncated": 0, + "non-truncated": 2180, + "padded": 2180, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + 
"harness|hendrycksTest-high_school_statistics|5": { + "hashes": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "da0c215d66d16d3e", + "hash_cont_tokens": "f00cfc03022d559a" + }, + "truncated": 4, + "non-truncated": 860, + "padded": 860, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hashes": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "4885a382517deebf", + "hash_cont_tokens": "eab825cf8fbdd085" + }, + "truncated": 816, + "non-truncated": 0, + "padded": 0, + "non-padded": 816, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hashes": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "c1d80e899c4c8872", + "hash_cont_tokens": "f6dd7cf291429cd9" + }, + "truncated": 948, + "non-truncated": 0, + "padded": 0, + "non-padded": 948, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_aging|5": { + "hashes": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "39da19ee58ce07e6", + "hash_cont_tokens": "38eafdb22e9fca11" + }, + "truncated": 0, + "non-truncated": 892, + "padded": 892, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_sexuality|5": { + "hashes": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "f7e0441ab1c223e0", + "hash_cont_tokens": "11de075f88fc7cd2" + }, + "truncated": 0, + "non-truncated": 524, + "padded": 524, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-international_law|5": { + "hashes": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "119859c5b8103d0b", + "hash_cont_tokens": "ad79993e5e453770" + }, + "truncated": 0, + "non-truncated": 484, + "padded": 484, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-jurisprudence|5": { + "hashes": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "6ec4910e741606cb", + "hash_cont_tokens": "5c77c6f472688075" + }, + "truncated": 0, + "non-truncated": 432, + "padded": 432, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hashes": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "96d8b2554f777e3a", + "hash_cont_tokens": "25a46284b3589e0d" + }, + "truncated": 0, + "non-truncated": 652, + "padded": 636, + "non-padded": 16, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-machine_learning|5": { + "hashes": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "249811a7d891a411", + "hash_cont_tokens": "5904fef477924132" + }, + "truncated": 0, + "non-truncated": 448, + "padded": 448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-management|5": { + "hashes": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + 
"hash_input_tokens": "e54df495ffeb4f92", + "hash_cont_tokens": "d37808f586a9e9b5" + }, + "truncated": 0, + "non-truncated": 412, + "padded": 412, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-marketing|5": { + "hashes": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "e9110fe64f420eb5", + "hash_cont_tokens": "95faf210efa02f90" + }, + "truncated": 0, + "non-truncated": 936, + "padded": 936, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-medical_genetics|5": { + "hashes": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "743df5701590c1c5", + "hash_cont_tokens": "844bd0bf669e8136" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-miscellaneous|5": { + "hashes": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "4a20a40ea36bad2d", + "hash_cont_tokens": "ef1ae838a09a7521" + }, + "truncated": 0, + "non-truncated": 3132, + "padded": 3132, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_disputes|5": { + "hashes": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "10886977e5516586", + "hash_cont_tokens": "201895f1be790f02" + }, + "truncated": 0, + "non-truncated": 1384, + "padded": 1372, + "non-padded": 12, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hashes": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "66f56ab7c3b9d662", + "hash_cont_tokens": "38fadc6201499c0e" + }, + "truncated": 0, + "non-truncated": 3580, + "padded": 3580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-nutrition|5": { + "hashes": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "c05c54560499ea35", + "hash_cont_tokens": "dcdd301556b5df9e" + }, + "truncated": 0, + "non-truncated": 1224, + "padded": 1224, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-philosophy|5": { + "hashes": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "9639c3d92ff98a28", + "hash_cont_tokens": "dddff9925c9b675a" + }, + "truncated": 0, + "non-truncated": 1244, + "padded": 1244, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-prehistory|5": { + "hashes": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "91e98834c3a8d8d9", + "hash_cont_tokens": "67c525ef797587ce" + }, + "truncated": 0, + "non-truncated": 1296, + "padded": 1296, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_accounting|5": { + "hashes": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "569fa47691c73088", + "hash_cont_tokens": "0d9fbe99f871c5c5" + }, + "truncated": 0, + "non-truncated": 1128, + "padded": 1124, + "non-padded": 4, + 
"effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_law|5": { + "hashes": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "d93d397bd5db1db6", + "hash_cont_tokens": "a2de48df0afbaff7" + }, + "truncated": 6136, + "non-truncated": 0, + "padded": 0, + "non-padded": 6136, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_medicine|5": { + "hashes": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "7f8acbbde12cfb6b", + "hash_cont_tokens": "01ddc79c7e1f2f6d" + }, + "truncated": 1032, + "non-truncated": 56, + "padded": 48, + "non-padded": 1040, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_psychology|5": { + "hashes": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "3aa766c029099569", + "hash_cont_tokens": "fa0fc10c4bdd757c" + }, + "truncated": 0, + "non-truncated": 2448, + "padded": 2448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-public_relations|5": { + "hashes": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "87b924f88832986f", + "hash_cont_tokens": "cf3600a50782c6c5" + }, + "truncated": 0, + "non-truncated": 440, + "padded": 440, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-security_studies|5": { + "hashes": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "1aaa84da588878a6", + "hash_cont_tokens": "6483ae9688e0a0d6" + }, + "truncated": 980, + "non-truncated": 0, + "padded": 0, + "non-padded": 980, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-sociology|5": { + "hashes": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "fb555df6139eb2c8", + "hash_cont_tokens": "9ec52ea7962c54f5" + }, + "truncated": 0, + "non-truncated": 804, + "padded": 800, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hashes": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "56cf1eebb25eccb1", + "hash_cont_tokens": "844bd0bf669e8136" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-virology|5": { + "hashes": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "c6affac16ec860be", + "hash_cont_tokens": "30d4fa4828c5468f" + }, + "truncated": 0, + "non-truncated": 664, + "padded": 664, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-world_religions|5": { + "hashes": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "d2c5da5a69a6312e", + "hash_cont_tokens": "bc42db2c568e27d6" + }, + "truncated": 0, + "non-truncated": 684, + "padded": 684, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|truthfulqa:mc|0": { + "hashes": { + "hash_examples": "23176c0531c7b867", + 
"hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "21ee2f46c9c3649e", + "hash_cont_tokens": "c8f2395107c4b82b" + }, + "truncated": 0, + "non-truncated": 9996, + "padded": 9996, + "non-padded": 0, + "effective_few_shots": 0.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "d84d18e9a963753d", + "hash_full_prompts": "12b540783521a8e6", + "hash_input_tokens": "18a3fbefef0c4910", + "hash_cont_tokens": "f1f2fb65023f2668" + }, + "total_evaluation_time_secondes": "1169.302691936493", + "truncated": 14155, + "non-truncated": 96864, + "padded": 96540, + "non-padded": 14479, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/lgaalves/gpt2_open-platypus/results_2023-10-15T13-45-26.230063.json b/eval-results/lgaalves/gpt2_open-platypus/results_2023-10-15T13-45-26.230063.json new file mode 100644 index 0000000000000000000000000000000000000000..89c788e7a294a81b122a3b8552182e8cc18f9165 --- /dev/null +++ b/eval-results/lgaalves/gpt2_open-platypus/results_2023-10-15T13-45-26.230063.json @@ -0,0 +1,107 @@ +{ + "config_general": { + "model_name": "lgaalves/gpt2_open-platypus", + "model_sha": "b36291867368309c2cc94e5077d5ae5a917b3326", + "model_size": "238.85 MB", + "model_dtype": "torch.float16", + "lighteval_sha": "0f318ecf002208468154899217b3ba7c6ae09374", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|drop|3": { + "em": 0.001363255033557047, + "em_stderr": 0.00037786091964607695, + "f1": 0.04636010906040263, + "f1_stderr": 0.0012972722820894797 + }, + "harness|gsm8k|5": { + "acc": 0.001516300227445034, + "acc_stderr": 0.0010717793485492632 + }, + "harness|winogrande|5": { + "acc": 0.5130228887134964, + "acc_stderr": 0.01404771839399767 + }, + "all": { + "em": 0.001363255033557047, + "em_stderr": 0.00037786091964607695, + "f1": 0.04636010906040263, + "f1_stderr": 0.0012972722820894797, + "acc": 0.25726959447047076, + "acc_stderr": 0.007559748871273466 + } + }, + "versions": { + "harness|drop|3": 1, + "harness|gsm8k|5": 0, + "harness|winogrande|5": 0, + "all": 0 + }, + "config_tasks": { + "harness|drop": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|drop|3": { + "hashes": { + "hash_examples": "1d27416e8324e9a3", + "hash_full_prompts": "a5513ff9a741b385", + "hash_input_tokens": "fa7a2e45b0104bc4", + "hash_cont_tokens": "ab5c1a26dc81d3db" + }, + "truncated": 9290, + "non-truncated": 246, + "padded": 0, + "non-padded": 9536, + "effective_few_shots": 3.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "52733972d41ebb11", + "hash_cont_tokens": "b09fa1d611cd76a9" + }, + "truncated": 917, + "non-truncated": 402, + "padded": 0, + "non-padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "84cacac1590bb0a5", + "hash_cont_tokens": "64ca3ed9b5dacc6e" + }, + "truncated": 0, + "non-truncated": 2534, + "padded": 2426, + "non-padded": 108, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "9b4d8993161e637d", + "hash_full_prompts": "08215e527b7e60a5", + 
"hash_input_tokens": "6b3de69e2f87348c", + "hash_cont_tokens": "d2d2844282737759" + }, + "total_evaluation_time_secondes": "5292.907732248306", + "truncated": 10207, + "non-truncated": 3182, + "padded": 2426, + "non-padded": 10963, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/lgaalves/gpt2_platypus-camel_physics/results_2023-09-11T15-51-24.784876.json b/eval-results/lgaalves/gpt2_platypus-camel_physics/results_2023-09-11T15-51-24.784876.json new file mode 100644 index 0000000000000000000000000000000000000000..6d635da93a011e1985addbb36eec1fc46f1f8bce --- /dev/null +++ b/eval-results/lgaalves/gpt2_platypus-camel_physics/results_2023-09-11T15-51-24.784876.json @@ -0,0 +1,1367 @@ +{ + "config_general": { + "model_name": "lgaalves/gpt2_platypus-camel_physics", + "model_sha": "66165ff32ed8de6c39f3524a810f5e97ba6d3347", + "model_size": "238.85 MB", + "model_dtype": "torch.float16", + "lighteval_sha": "ff467795ccc45b291b69333c263d5f16abd1fcd9", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|arc:challenge|25": { + "acc": 0.19795221843003413, + "acc_stderr": 0.011643990971573405, + "acc_norm": 0.23037542662116042, + "acc_norm_stderr": 0.01230492841874761 + }, + "harness|hellaswag|10": { + "acc": 0.29187412865962953, + "acc_stderr": 0.004536955796510544, + "acc_norm": 0.31318462457677754, + "acc_norm_stderr": 0.0046284090842187535 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.22, + "acc_stderr": 0.0416333199893227, + "acc_norm": 0.22, + "acc_norm_stderr": 0.0416333199893227 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.28888888888888886, + "acc_stderr": 0.0391545063041425, + "acc_norm": 0.28888888888888886, + "acc_norm_stderr": 0.0391545063041425 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.23026315789473684, + "acc_stderr": 0.03426059424403165, + "acc_norm": 0.23026315789473684, + "acc_norm_stderr": 0.03426059424403165 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.24, + "acc_stderr": 0.042923469599092816, + "acc_norm": 0.24, + "acc_norm_stderr": 0.042923469599092816 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.2792452830188679, + "acc_stderr": 0.02761116340239972, + "acc_norm": 0.2792452830188679, + "acc_norm_stderr": 0.02761116340239972 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.2708333333333333, + "acc_stderr": 0.03716177437566015, + "acc_norm": 0.2708333333333333, + "acc_norm_stderr": 0.03716177437566015 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.25, + "acc_stderr": 0.04351941398892446, + "acc_norm": 0.25, + "acc_norm_stderr": 0.04351941398892446 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.36, + "acc_stderr": 0.04824181513244218, + "acc_norm": 0.36, + "acc_norm_stderr": 0.04824181513244218 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.27, + "acc_stderr": 0.04461960433384739, + "acc_norm": 0.27, + "acc_norm_stderr": 0.04461960433384739 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.2543352601156069, + "acc_stderr": 0.0332055644308557, + "acc_norm": 0.2543352601156069, + "acc_norm_stderr": 0.0332055644308557 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.24509803921568626, + "acc_stderr": 0.04280105837364396, + "acc_norm": 0.24509803921568626, + "acc_norm_stderr": 0.04280105837364396 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.23, 
+ "acc_stderr": 0.042295258468165044, + "acc_norm": 0.23, + "acc_norm_stderr": 0.042295258468165044 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.25957446808510637, + "acc_stderr": 0.02865917937429232, + "acc_norm": 0.25957446808510637, + "acc_norm_stderr": 0.02865917937429232 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.24561403508771928, + "acc_stderr": 0.04049339297748142, + "acc_norm": 0.24561403508771928, + "acc_norm_stderr": 0.04049339297748142 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.3103448275862069, + "acc_stderr": 0.03855289616378948, + "acc_norm": 0.3103448275862069, + "acc_norm_stderr": 0.03855289616378948 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.24867724867724866, + "acc_stderr": 0.022261817692400175, + "acc_norm": 0.24867724867724866, + "acc_norm_stderr": 0.022261817692400175 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.23015873015873015, + "acc_stderr": 0.03764950879790605, + "acc_norm": 0.23015873015873015, + "acc_norm_stderr": 0.03764950879790605 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.16, + "acc_stderr": 0.03684529491774709, + "acc_norm": 0.16, + "acc_norm_stderr": 0.03684529491774709 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.2870967741935484, + "acc_stderr": 0.025736542745594528, + "acc_norm": 0.2870967741935484, + "acc_norm_stderr": 0.025736542745594528 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.28078817733990147, + "acc_stderr": 0.03161856335358611, + "acc_norm": 0.28078817733990147, + "acc_norm_stderr": 0.03161856335358611 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.3, + "acc_stderr": 0.046056618647183814, + "acc_norm": 0.3, + "acc_norm_stderr": 0.046056618647183814 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.26666666666666666, + "acc_stderr": 0.03453131801885415, + "acc_norm": 0.26666666666666666, + "acc_norm_stderr": 0.03453131801885415 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.35353535353535354, + "acc_stderr": 0.03406086723547153, + "acc_norm": 0.35353535353535354, + "acc_norm_stderr": 0.03406086723547153 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.37305699481865284, + "acc_stderr": 0.03490205592048573, + "acc_norm": 0.37305699481865284, + "acc_norm_stderr": 0.03490205592048573 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.34102564102564104, + "acc_stderr": 0.02403548967633507, + "acc_norm": 0.34102564102564104, + "acc_norm_stderr": 0.02403548967633507 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.25925925925925924, + "acc_stderr": 0.026719240783712163, + "acc_norm": 0.25925925925925924, + "acc_norm_stderr": 0.026719240783712163 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.21428571428571427, + "acc_stderr": 0.026653531596715473, + "acc_norm": 0.21428571428571427, + "acc_norm_stderr": 0.026653531596715473 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.25165562913907286, + "acc_stderr": 0.03543304234389985, + "acc_norm": 0.25165562913907286, + "acc_norm_stderr": 0.03543304234389985 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.3486238532110092, + "acc_stderr": 0.020431254090714328, + "acc_norm": 0.3486238532110092, + "acc_norm_stderr": 0.020431254090714328 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 
0.4722222222222222, + "acc_stderr": 0.0340470532865388, + "acc_norm": 0.4722222222222222, + "acc_norm_stderr": 0.0340470532865388 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.25, + "acc_stderr": 0.03039153369274154, + "acc_norm": 0.25, + "acc_norm_stderr": 0.03039153369274154 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.26582278481012656, + "acc_stderr": 0.028756799629658335, + "acc_norm": 0.26582278481012656, + "acc_norm_stderr": 0.028756799629658335 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.10762331838565023, + "acc_stderr": 0.020799400082879997, + "acc_norm": 0.10762331838565023, + "acc_norm_stderr": 0.020799400082879997 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.2595419847328244, + "acc_stderr": 0.03844876139785271, + "acc_norm": 0.2595419847328244, + "acc_norm_stderr": 0.03844876139785271 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.371900826446281, + "acc_stderr": 0.044120158066245044, + "acc_norm": 0.371900826446281, + "acc_norm_stderr": 0.044120158066245044 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.2037037037037037, + "acc_stderr": 0.03893542518824847, + "acc_norm": 0.2037037037037037, + "acc_norm_stderr": 0.03893542518824847 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.3006134969325153, + "acc_stderr": 0.03602511318806771, + "acc_norm": 0.3006134969325153, + "acc_norm_stderr": 0.03602511318806771 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.17857142857142858, + "acc_stderr": 0.036352091215778065, + "acc_norm": 0.17857142857142858, + "acc_norm_stderr": 0.036352091215778065 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.3786407766990291, + "acc_stderr": 0.04802694698258972, + "acc_norm": 0.3786407766990291, + "acc_norm_stderr": 0.04802694698258972 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.19658119658119658, + "acc_stderr": 0.02603538609895129, + "acc_norm": 0.19658119658119658, + "acc_norm_stderr": 0.02603538609895129 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.31, + "acc_stderr": 0.04648231987117316, + "acc_norm": 0.31, + "acc_norm_stderr": 0.04648231987117316 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.20306513409961685, + "acc_stderr": 0.014385525076611578, + "acc_norm": 0.20306513409961685, + "acc_norm_stderr": 0.014385525076611578 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.21098265895953758, + "acc_stderr": 0.021966309947043124, + "acc_norm": 0.21098265895953758, + "acc_norm_stderr": 0.021966309947043124 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.2435754189944134, + "acc_stderr": 0.01435591196476786, + "acc_norm": 0.2435754189944134, + "acc_norm_stderr": 0.01435591196476786 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.25163398692810457, + "acc_stderr": 0.024848018263875195, + "acc_norm": 0.25163398692810457, + "acc_norm_stderr": 0.024848018263875195 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.28938906752411575, + "acc_stderr": 0.025755865922632924, + "acc_norm": 0.28938906752411575, + "acc_norm_stderr": 0.025755865922632924 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.22530864197530864, + "acc_stderr": 0.023246202647819746, + "acc_norm": 0.22530864197530864, + "acc_norm_stderr": 0.023246202647819746 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.2695035460992908, + "acc_stderr": 0.026469036818590638, + "acc_norm": 0.2695035460992908, + 
"acc_norm_stderr": 0.026469036818590638 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.24511082138200782, + "acc_stderr": 0.010986307870045514, + "acc_norm": 0.24511082138200782, + "acc_norm_stderr": 0.010986307870045514 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.44485294117647056, + "acc_stderr": 0.030187532060329376, + "acc_norm": 0.44485294117647056, + "acc_norm_stderr": 0.030187532060329376 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.26143790849673204, + "acc_stderr": 0.017776947157528023, + "acc_norm": 0.26143790849673204, + "acc_norm_stderr": 0.017776947157528023 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.20909090909090908, + "acc_stderr": 0.03895091015724137, + "acc_norm": 0.20909090909090908, + "acc_norm_stderr": 0.03895091015724137 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.4, + "acc_stderr": 0.031362502409358936, + "acc_norm": 0.4, + "acc_norm_stderr": 0.031362502409358936 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.21890547263681592, + "acc_stderr": 0.029239174636647, + "acc_norm": 0.21890547263681592, + "acc_norm_stderr": 0.029239174636647 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.26, + "acc_stderr": 0.04408440022768078, + "acc_norm": 0.26, + "acc_norm_stderr": 0.04408440022768078 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.1927710843373494, + "acc_stderr": 0.030709824050565274, + "acc_norm": 0.1927710843373494, + "acc_norm_stderr": 0.030709824050565274 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.25146198830409355, + "acc_stderr": 0.033275044238468436, + "acc_norm": 0.25146198830409355, + "acc_norm_stderr": 0.033275044238468436 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.22766217870257038, + "mc1_stderr": 0.01467925503211107, + "mc2": 0.3955559845281961, + "mc2_stderr": 0.014839540193741688 + }, + "all": { + "acc": 0.2683248305375654, + "acc_stderr": 0.03193677298130021, + "acc_norm": 0.2692355712851633, + "acc_norm_stderr": 0.0319495253666372, + "mc1": 0.22766217870257038, + "mc1_stderr": 0.01467925503211107, + "mc2": 0.3955559845281961, + "mc2_stderr": 0.014839540193741688 + } + }, + "versions": { + "harness|arc:challenge|25": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + 
"harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "all": 0 + }, + "config_tasks": { + "harness|arc:challenge": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM 
Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task" + }, + "summary_tasks": { + "harness|arc:challenge|25": { + "hashes": { + "hash_examples": "17b0cae357c0259e", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "e641be907f06d33d", + "hash_cont_tokens": "ed17e576dbafa5da" + }, + "truncated": 1568, + "non-truncated": 3119, + "padded": 3087, + "non-padded": 1600, + "effective_few_shots": 25.0, + "num_truncated_few_shots": 0 + }, + "harness|hellaswag|10": { + "hashes": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "faab28c8a52792fc", + "hash_cont_tokens": "0875c25c8fc0a94d" + }, + "truncated": 1975, + "non-truncated": 38193, + "padded": 38021, + "non-padded": 2147, + "effective_few_shots": 10.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hashes": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "38f6980885e34dfd", + "hash_cont_tokens": "844bd0bf669e8136" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-anatomy|5": { + "hashes": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "3ed9431cd09b2a53", + "hash_cont_tokens": "aa3ffb1a6e4356f5" + }, + "truncated": 0, + "non-truncated": 540, + "padded": 540, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-astronomy|5": { + "hashes": { + "hash_examples": "7d587b908da4d762", + 
"hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "a79fd75ecff4dacc", + "hash_cont_tokens": "18cfffb76bc8f0d1" + }, + "truncated": 0, + "non-truncated": 608, + "padded": 608, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-business_ethics|5": { + "hashes": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "178d5666661bf5e1", + "hash_cont_tokens": "844bd0bf669e8136" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hashes": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "c926698f7ff06973", + "hash_cont_tokens": "cd61f7de0830a75a" + }, + "truncated": 0, + "non-truncated": 1060, + "padded": 1060, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_biology|5": { + "hashes": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "242f772c5e78312a", + "hash_cont_tokens": "16b3626c8a5e3797" + }, + "truncated": 0, + "non-truncated": 576, + "padded": 568, + "non-padded": 8, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_chemistry|5": { + "hashes": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "8502d8627d2d7aad", + "hash_cont_tokens": "844bd0bf669e8136" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_computer_science|5": { + "hashes": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "a0d705ea2c235707", + "hash_cont_tokens": "844bd0bf669e8136" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_mathematics|5": { + "hashes": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "ff09ef7f164943cd", + "hash_cont_tokens": "844bd0bf669e8136" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_medicine|5": { + "hashes": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "aca3949388066394", + "hash_cont_tokens": "62bb469d2a319d91" + }, + "truncated": 20, + "non-truncated": 672, + "padded": 660, + "non-padded": 32, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_physics|5": { + "hashes": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "c4240f372187f487", + "hash_cont_tokens": "bf103c9a1f61ec12" + }, + "truncated": 0, + "non-truncated": 408, + "padded": 404, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-computer_security|5": { + "hashes": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "70a866a1c6ae11ae", + "hash_cont_tokens": "844bd0bf669e8136" + }, + "truncated": 
0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hashes": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "29b68a5b3f3afa5f", + "hash_cont_tokens": "ff5ca3d84bb47a0b" + }, + "truncated": 0, + "non-truncated": 940, + "padded": 940, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-econometrics|5": { + "hashes": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "a4a0fc579875cdf9", + "hash_cont_tokens": "21f0989f5760198a" + }, + "truncated": 0, + "non-truncated": 456, + "padded": 456, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hashes": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "e1c0ec634eb17ebd", + "hash_cont_tokens": "35bf6c0c1a7ee403" + }, + "truncated": 0, + "non-truncated": 580, + "padded": 580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hashes": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "542453ad0f99dacf", + "hash_cont_tokens": "f7d801bfd913884d" + }, + "truncated": 0, + "non-truncated": 1512, + "padded": 1488, + "non-padded": 24, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-formal_logic|5": { + "hashes": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "dacff0458f665ef2", + "hash_cont_tokens": "23f9089575432d5a" + }, + "truncated": 0, + "non-truncated": 504, + "padded": 504, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-global_facts|5": { + "hashes": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "61dec75d557c2e93", + "hash_cont_tokens": "844bd0bf669e8136" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_biology|5": { + "hashes": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "d0afdf91820cacc8", + "hash_cont_tokens": "04b8293f2ab7fbbf" + }, + "truncated": 0, + "non-truncated": 1240, + "padded": 1240, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hashes": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "75cd47b5490da17b", + "hash_cont_tokens": "c3deabee1deab3a3" + }, + "truncated": 0, + "non-truncated": 812, + "padded": 796, + "non-padded": 16, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hashes": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "e369e98a1d0a7424", + "hash_cont_tokens": "844bd0bf669e8136" + }, + "truncated": 16, + "non-truncated": 384, + "padded": 384, + "non-padded": 16, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + 
"harness|hendrycksTest-high_school_european_history|5": { + "hashes": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "502376958174bf81", + "hash_cont_tokens": "c4f2565ca36881d5" + }, + "truncated": 660, + "non-truncated": 0, + "padded": 0, + "non-padded": 660, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_geography|5": { + "hashes": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "a4866b51f8a7a60e", + "hash_cont_tokens": "780e569058de22be" + }, + "truncated": 0, + "non-truncated": 792, + "padded": 792, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hashes": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "90f755f89d9fdf5e", + "hash_cont_tokens": "7994d94bfa36d003" + }, + "truncated": 0, + "non-truncated": 772, + "padded": 772, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hashes": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "fb590ff6d9d11883", + "hash_cont_tokens": "8f5c8baf02161f10" + }, + "truncated": 0, + "non-truncated": 1560, + "padded": 1560, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hashes": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "551dbc75535ad2b8", + "hash_cont_tokens": "a2c91752be5b1798" + }, + "truncated": 0, + "non-truncated": 1080, + "padded": 1080, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hashes": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "d86fdf5706ec717c", + "hash_cont_tokens": "985403b262df21a4" + }, + "truncated": 0, + "non-truncated": 952, + "padded": 940, + "non-padded": 12, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_physics|5": { + "hashes": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "a81bca26abd92c41", + "hash_cont_tokens": "db71da66ed82b921" + }, + "truncated": 0, + "non-truncated": 604, + "padded": 604, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hashes": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "9c10077b5cda495b", + "hash_cont_tokens": "e81cf9738ad7e157" + }, + "truncated": 0, + "non-truncated": 2180, + "padded": 2180, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hashes": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "da0c215d66d16d3e", + "hash_cont_tokens": "4a2d5f00cb00d9b7" + }, + "truncated": 4, + "non-truncated": 860, + "padded": 860, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hashes": { + 
"hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "4885a382517deebf", + "hash_cont_tokens": "eab825cf8fbdd085" + }, + "truncated": 816, + "non-truncated": 0, + "padded": 0, + "non-padded": 816, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hashes": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "c1d80e899c4c8872", + "hash_cont_tokens": "e9bcfaa6beefb456" + }, + "truncated": 948, + "non-truncated": 0, + "padded": 0, + "non-padded": 948, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_aging|5": { + "hashes": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "39da19ee58ce07e6", + "hash_cont_tokens": "38eafdb22e9fca11" + }, + "truncated": 0, + "non-truncated": 892, + "padded": 892, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_sexuality|5": { + "hashes": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "f7e0441ab1c223e0", + "hash_cont_tokens": "11de075f88fc7cd2" + }, + "truncated": 0, + "non-truncated": 524, + "padded": 524, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-international_law|5": { + "hashes": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "119859c5b8103d0b", + "hash_cont_tokens": "6f8215a3de7eebd1" + }, + "truncated": 0, + "non-truncated": 484, + "padded": 484, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-jurisprudence|5": { + "hashes": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "6ec4910e741606cb", + "hash_cont_tokens": "5c77c6f472688075" + }, + "truncated": 0, + "non-truncated": 432, + "padded": 432, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hashes": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "96d8b2554f777e3a", + "hash_cont_tokens": "25a46284b3589e0d" + }, + "truncated": 0, + "non-truncated": 652, + "padded": 636, + "non-padded": 16, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-machine_learning|5": { + "hashes": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "249811a7d891a411", + "hash_cont_tokens": "aacac708cd4c5a61" + }, + "truncated": 0, + "non-truncated": 448, + "padded": 448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-management|5": { + "hashes": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "e54df495ffeb4f92", + "hash_cont_tokens": "d37808f586a9e9b5" + }, + "truncated": 0, + "non-truncated": 412, + "padded": 412, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-marketing|5": { + "hashes": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "e9110fe64f420eb5", + "hash_cont_tokens": "95faf210efa02f90" + }, + 
"truncated": 0, + "non-truncated": 936, + "padded": 936, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-medical_genetics|5": { + "hashes": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "743df5701590c1c5", + "hash_cont_tokens": "844bd0bf669e8136" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-miscellaneous|5": { + "hashes": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "4a20a40ea36bad2d", + "hash_cont_tokens": "ef1ae838a09a7521" + }, + "truncated": 0, + "non-truncated": 3132, + "padded": 3132, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_disputes|5": { + "hashes": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "10886977e5516586", + "hash_cont_tokens": "16b6c6e390eb7cea" + }, + "truncated": 0, + "non-truncated": 1384, + "padded": 1372, + "non-padded": 12, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hashes": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "66f56ab7c3b9d662", + "hash_cont_tokens": "4130880a19c4edb0" + }, + "truncated": 0, + "non-truncated": 3580, + "padded": 3580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-nutrition|5": { + "hashes": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "c05c54560499ea35", + "hash_cont_tokens": "96b81f570a84328b" + }, + "truncated": 0, + "non-truncated": 1224, + "padded": 1224, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-philosophy|5": { + "hashes": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "9639c3d92ff98a28", + "hash_cont_tokens": "dddff9925c9b675a" + }, + "truncated": 0, + "non-truncated": 1244, + "padded": 1244, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-prehistory|5": { + "hashes": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "91e98834c3a8d8d9", + "hash_cont_tokens": "e3a7592f84b44888" + }, + "truncated": 0, + "non-truncated": 1296, + "padded": 1296, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_accounting|5": { + "hashes": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "569fa47691c73088", + "hash_cont_tokens": "f9edf462e8201551" + }, + "truncated": 0, + "non-truncated": 1128, + "padded": 1124, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_law|5": { + "hashes": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "d93d397bd5db1db6", + "hash_cont_tokens": "a2de48df0afbaff7" + }, + "truncated": 6136, + "non-truncated": 0, + "padded": 0, + "non-padded": 6136, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + 
"harness|hendrycksTest-professional_medicine|5": { + "hashes": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "7f8acbbde12cfb6b", + "hash_cont_tokens": "ecf7754754c2bb76" + }, + "truncated": 1032, + "non-truncated": 56, + "padded": 48, + "non-padded": 1040, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_psychology|5": { + "hashes": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "3aa766c029099569", + "hash_cont_tokens": "30b07e31cf9b5c6f" + }, + "truncated": 0, + "non-truncated": 2448, + "padded": 2448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-public_relations|5": { + "hashes": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "87b924f88832986f", + "hash_cont_tokens": "cf3600a50782c6c5" + }, + "truncated": 0, + "non-truncated": 440, + "padded": 440, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-security_studies|5": { + "hashes": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "1aaa84da588878a6", + "hash_cont_tokens": "4d1dc7c4ad251829" + }, + "truncated": 980, + "non-truncated": 0, + "padded": 0, + "non-padded": 980, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-sociology|5": { + "hashes": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "fb555df6139eb2c8", + "hash_cont_tokens": "d36b9d9f0f4424fe" + }, + "truncated": 0, + "non-truncated": 804, + "padded": 800, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hashes": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "56cf1eebb25eccb1", + "hash_cont_tokens": "844bd0bf669e8136" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-virology|5": { + "hashes": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "c6affac16ec860be", + "hash_cont_tokens": "30d4fa4828c5468f" + }, + "truncated": 0, + "non-truncated": 664, + "padded": 664, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-world_religions|5": { + "hashes": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "d2c5da5a69a6312e", + "hash_cont_tokens": "a0a7af55ac7ae037" + }, + "truncated": 0, + "non-truncated": 684, + "padded": 684, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|truthfulqa:mc|0": { + "hashes": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "21ee2f46c9c3649e", + "hash_cont_tokens": "84fd36aa004c8578" + }, + "truncated": 0, + "non-truncated": 9996, + "padded": 9996, + "non-padded": 0, + "effective_few_shots": 0.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "d84d18e9a963753d", + "hash_full_prompts": "12b540783521a8e6", + "hash_input_tokens": "18a3fbefef0c4910", + 
"hash_cont_tokens": "24012b7d40528568" + }, + "total_evaluation_time_secondes": "1514.718344926834", + "truncated": 14155, + "non-truncated": 96864, + "padded": 96540, + "non-padded": 14479, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/lgaalves/gpt2_platypus-camel_physics/results_2023-10-25T13-50-32.288438.json b/eval-results/lgaalves/gpt2_platypus-camel_physics/results_2023-10-25T13-50-32.288438.json new file mode 100644 index 0000000000000000000000000000000000000000..b59b7ba3efcb4701a05eaf342a673678b258945e --- /dev/null +++ b/eval-results/lgaalves/gpt2_platypus-camel_physics/results_2023-10-25T13-50-32.288438.json @@ -0,0 +1,107 @@ +{ + "config_general": { + "model_name": "lgaalves/gpt2_platypus-camel_physics", + "model_sha": "e6ce889b6929b9dc612a65f7ec6fddf04499ce4d", + "model_size": "238.85 MB", + "model_dtype": "torch.float16", + "lighteval_sha": "0f318ecf002208468154899217b3ba7c6ae09374", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|drop|3": { + "em": 0.002307046979865772, + "em_stderr": 0.0004913221265094493, + "f1": 0.04785339765100675, + "f1_stderr": 0.001366270058429369 + }, + "harness|gsm8k|5": { + "acc": 0.0, + "acc_stderr": 0.0 + }, + "harness|winogrande|5": { + "acc": 0.4964483030781373, + "acc_stderr": 0.014052131146915873 + }, + "all": { + "em": 0.002307046979865772, + "em_stderr": 0.0004913221265094493, + "f1": 0.04785339765100675, + "f1_stderr": 0.001366270058429369, + "acc": 0.24822415153906865, + "acc_stderr": 0.007026065573457936 + } + }, + "versions": { + "harness|drop|3": 1, + "harness|gsm8k|5": 0, + "harness|winogrande|5": 0, + "all": 0 + }, + "config_tasks": { + "harness|drop": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|drop|3": { + "hashes": { + "hash_examples": "1d27416e8324e9a3", + "hash_full_prompts": "a5513ff9a741b385", + "hash_input_tokens": "fa7a2e45b0104bc4", + "hash_cont_tokens": "9882c82804b094be" + }, + "truncated": 9290, + "non-truncated": 246, + "padded": 0, + "non-padded": 9536, + "effective_few_shots": 3.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "52733972d41ebb11", + "hash_cont_tokens": "da3b06f6923fe854" + }, + "truncated": 917, + "non-truncated": 402, + "padded": 0, + "non-padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "84cacac1590bb0a5", + "hash_cont_tokens": "64ca3ed9b5dacc6e" + }, + "truncated": 0, + "non-truncated": 2534, + "padded": 2426, + "non-padded": 108, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "9b4d8993161e637d", + "hash_full_prompts": "08215e527b7e60a5", + "hash_input_tokens": "6b3de69e2f87348c", + "hash_cont_tokens": "942fba3ebb9519e4" + }, + "total_evaluation_time_secondes": "5286.984570741653", + "truncated": 10207, + "non-truncated": 3182, + "padded": 2426, + "non-padded": 10963, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/lgaalves/gpt2_platypus-dolly-guanaco/results_2023-08-31T20-05-00.341927.json 
b/eval-results/lgaalves/gpt2_platypus-dolly-guanaco/results_2023-08-31T20-05-00.341927.json new file mode 100644 index 0000000000000000000000000000000000000000..3f6fb205547994f11666f9d8e3eee71191564a95 --- /dev/null +++ b/eval-results/lgaalves/gpt2_platypus-dolly-guanaco/results_2023-08-31T20-05-00.341927.json @@ -0,0 +1,1366 @@ +{ + "config_general": { + "model_name": "lgaalves/gpt2_platypus-dolly-guanaco", + "model_sha": "bfa144d3eb087e54f1798fd2e2fb17e894cc39d3", + "model_dtype": "torch.float16", + "lighteval_sha": "4b9a38c2102259daeb17d1d0294fc2b4ce7f4e63", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|arc:challenge|25": { + "acc": 0.197098976109215, + "acc_stderr": 0.011625047669880638, + "acc_norm": 0.23208191126279865, + "acc_norm_stderr": 0.012336718284948854 + }, + "harness|hellaswag|10": { + "acc": 0.29087831109340767, + "acc_stderr": 0.0045323931112486865, + "acc_norm": 0.3103963353913563, + "acc_norm_stderr": 0.004617103280372034 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.21, + "acc_stderr": 0.040936018074033256, + "acc_norm": 0.21, + "acc_norm_stderr": 0.040936018074033256 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.3111111111111111, + "acc_stderr": 0.039992628766177235, + "acc_norm": 0.3111111111111111, + "acc_norm_stderr": 0.039992628766177235 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.19736842105263158, + "acc_stderr": 0.03238981601699397, + "acc_norm": 0.19736842105263158, + "acc_norm_stderr": 0.03238981601699397 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.16, + "acc_stderr": 0.03684529491774708, + "acc_norm": 0.16, + "acc_norm_stderr": 0.03684529491774708 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.27169811320754716, + "acc_stderr": 0.027377706624670713, + "acc_norm": 0.27169811320754716, + "acc_norm_stderr": 0.027377706624670713 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.2777777777777778, + "acc_stderr": 0.03745554791462458, + "acc_norm": 0.2777777777777778, + "acc_norm_stderr": 0.03745554791462458 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.18, + "acc_stderr": 0.03861229196653696, + "acc_norm": 0.18, + "acc_norm_stderr": 0.03861229196653696 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.38, + "acc_stderr": 0.048783173121456316, + "acc_norm": 0.38, + "acc_norm_stderr": 0.048783173121456316 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.24, + "acc_stderr": 0.042923469599092816, + "acc_norm": 0.24, + "acc_norm_stderr": 0.042923469599092816 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.23699421965317918, + "acc_stderr": 0.03242414757483098, + "acc_norm": 0.23699421965317918, + "acc_norm_stderr": 0.03242414757483098 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.23529411764705882, + "acc_stderr": 0.04220773659171453, + "acc_norm": 0.23529411764705882, + "acc_norm_stderr": 0.04220773659171453 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.2, + "acc_stderr": 0.04020151261036843, + "acc_norm": 0.2, + "acc_norm_stderr": 0.04020151261036843 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.2297872340425532, + "acc_stderr": 0.027501752944412424, + "acc_norm": 0.2297872340425532, + "acc_norm_stderr": 0.027501752944412424 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.21929824561403508, + "acc_stderr": 0.03892431106518752, + 
"acc_norm": 0.21929824561403508, + "acc_norm_stderr": 0.03892431106518752 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.2896551724137931, + "acc_stderr": 0.037800192304380135, + "acc_norm": 0.2896551724137931, + "acc_norm_stderr": 0.037800192304380135 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.24603174603174602, + "acc_stderr": 0.022182037202948368, + "acc_norm": 0.24603174603174602, + "acc_norm_stderr": 0.022182037202948368 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.23809523809523808, + "acc_stderr": 0.038095238095238106, + "acc_norm": 0.23809523809523808, + "acc_norm_stderr": 0.038095238095238106 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.16, + "acc_stderr": 0.03684529491774708, + "acc_norm": 0.16, + "acc_norm_stderr": 0.03684529491774708 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.3032258064516129, + "acc_stderr": 0.02614868593067175, + "acc_norm": 0.3032258064516129, + "acc_norm_stderr": 0.02614868593067175 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.28078817733990147, + "acc_stderr": 0.0316185633535861, + "acc_norm": 0.28078817733990147, + "acc_norm_stderr": 0.0316185633535861 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.33, + "acc_stderr": 0.04725815626252604, + "acc_norm": 0.33, + "acc_norm_stderr": 0.04725815626252604 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.2545454545454545, + "acc_stderr": 0.0340150671524904, + "acc_norm": 0.2545454545454545, + "acc_norm_stderr": 0.0340150671524904 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.35353535353535354, + "acc_stderr": 0.03406086723547153, + "acc_norm": 0.35353535353535354, + "acc_norm_stderr": 0.03406086723547153 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.36787564766839376, + "acc_stderr": 0.03480175668466036, + "acc_norm": 0.36787564766839376, + "acc_norm_stderr": 0.03480175668466036 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.2846153846153846, + "acc_stderr": 0.022878322799706294, + "acc_norm": 0.2846153846153846, + "acc_norm_stderr": 0.022878322799706294 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.26666666666666666, + "acc_stderr": 0.026962424325073828, + "acc_norm": 0.26666666666666666, + "acc_norm_stderr": 0.026962424325073828 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.21008403361344538, + "acc_stderr": 0.026461398717471874, + "acc_norm": 0.21008403361344538, + "acc_norm_stderr": 0.026461398717471874 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.25165562913907286, + "acc_stderr": 0.03543304234389985, + "acc_norm": 0.25165562913907286, + "acc_norm_stderr": 0.03543304234389985 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.3486238532110092, + "acc_stderr": 0.020431254090714328, + "acc_norm": 0.3486238532110092, + "acc_norm_stderr": 0.020431254090714328 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.4722222222222222, + "acc_stderr": 0.0340470532865388, + "acc_norm": 0.4722222222222222, + "acc_norm_stderr": 0.0340470532865388 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.23039215686274508, + "acc_stderr": 0.029554292605695066, + "acc_norm": 0.23039215686274508, + "acc_norm_stderr": 0.029554292605695066 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.26582278481012656, + 
"acc_stderr": 0.028756799629658335, + "acc_norm": 0.26582278481012656, + "acc_norm_stderr": 0.028756799629658335 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.15246636771300448, + "acc_stderr": 0.024126204813252883, + "acc_norm": 0.15246636771300448, + "acc_norm_stderr": 0.024126204813252883 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.2366412213740458, + "acc_stderr": 0.03727673575596918, + "acc_norm": 0.2366412213740458, + "acc_norm_stderr": 0.03727673575596918 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.35537190082644626, + "acc_stderr": 0.04369236326573981, + "acc_norm": 0.35537190082644626, + "acc_norm_stderr": 0.04369236326573981 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.21296296296296297, + "acc_stderr": 0.039578354719809805, + "acc_norm": 0.21296296296296297, + "acc_norm_stderr": 0.039578354719809805 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.26380368098159507, + "acc_stderr": 0.034624199316156234, + "acc_norm": 0.26380368098159507, + "acc_norm_stderr": 0.034624199316156234 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.19642857142857142, + "acc_stderr": 0.03770970049347019, + "acc_norm": 0.19642857142857142, + "acc_norm_stderr": 0.03770970049347019 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.33980582524271846, + "acc_stderr": 0.046897659372781356, + "acc_norm": 0.33980582524271846, + "acc_norm_stderr": 0.046897659372781356 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.17094017094017094, + "acc_stderr": 0.02466249684520981, + "acc_norm": 0.17094017094017094, + "acc_norm_stderr": 0.02466249684520981 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.32, + "acc_stderr": 0.04688261722621504, + "acc_norm": 0.32, + "acc_norm_stderr": 0.04688261722621504 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.20689655172413793, + "acc_stderr": 0.014485656041669168, + "acc_norm": 0.20689655172413793, + "acc_norm_stderr": 0.014485656041669168 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.23410404624277456, + "acc_stderr": 0.022797110278071138, + "acc_norm": 0.23410404624277456, + "acc_norm_stderr": 0.022797110278071138 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.24804469273743016, + "acc_stderr": 0.014444157808261462, + "acc_norm": 0.24804469273743016, + "acc_norm_stderr": 0.014444157808261462 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.25163398692810457, + "acc_stderr": 0.024848018263875195, + "acc_norm": 0.25163398692810457, + "acc_norm_stderr": 0.024848018263875195 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.2508038585209003, + "acc_stderr": 0.02461977195669716, + "acc_norm": 0.2508038585209003, + "acc_norm_stderr": 0.02461977195669716 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.22530864197530864, + "acc_stderr": 0.023246202647819746, + "acc_norm": 0.22530864197530864, + "acc_norm_stderr": 0.023246202647819746 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.2695035460992908, + "acc_stderr": 0.026469036818590638, + "acc_norm": 0.2695035460992908, + "acc_norm_stderr": 0.026469036818590638 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.2620599739243807, + "acc_stderr": 0.011231552795890394, + "acc_norm": 0.2620599739243807, + "acc_norm_stderr": 0.011231552795890394 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.4485294117647059, + "acc_stderr": 0.030211479609121593, + "acc_norm": 0.4485294117647059, + 
"acc_norm_stderr": 0.030211479609121593 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.2581699346405229, + "acc_stderr": 0.017704531653250075, + "acc_norm": 0.2581699346405229, + "acc_norm_stderr": 0.017704531653250075 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.21818181818181817, + "acc_stderr": 0.03955932861795833, + "acc_norm": 0.21818181818181817, + "acc_norm_stderr": 0.03955932861795833 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.4, + "acc_stderr": 0.031362502409358936, + "acc_norm": 0.4, + "acc_norm_stderr": 0.031362502409358936 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.20398009950248755, + "acc_stderr": 0.028493176245326088, + "acc_norm": 0.20398009950248755, + "acc_norm_stderr": 0.028493176245326088 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.26, + "acc_stderr": 0.04408440022768078, + "acc_norm": 0.26, + "acc_norm_stderr": 0.04408440022768078 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.1927710843373494, + "acc_stderr": 0.030709824050565274, + "acc_norm": 0.1927710843373494, + "acc_norm_stderr": 0.030709824050565274 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.22807017543859648, + "acc_stderr": 0.03218093795602357, + "acc_norm": 0.22807017543859648, + "acc_norm_stderr": 0.03218093795602357 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.23011015911872704, + "mc1_stderr": 0.014734557959807765, + "mc2": 0.40309393921030356, + "mc2_stderr": 0.014704133902737452 + }, + "all": { + "acc": 0.2609766165765764, + "acc_stderr": 0.031559005333834204, + "acc_norm": 0.2619003616520261, + "acc_norm_stderr": 0.03157250331322728, + "mc1": 0.23011015911872704, + "mc1_stderr": 0.014734557959807765, + "mc2": 0.40309393921030356, + "mc2_stderr": 0.014704133902737452 + } + }, + "versions": { + "harness|arc:challenge|25": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + 
"harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "all": 0 + }, + "config_tasks": { + "harness|arc:challenge": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + 
"harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task" + }, + "summary_tasks": { + "harness|arc:challenge|25": { + "hashes": { + "hash_examples": "17b0cae357c0259e", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "e641be907f06d33d", + "hash_cont_tokens": "d57e59a4130853e0" + }, + "truncated": 1568, + "non-truncated": 3119, + "padded": 3087, + "non-padded": 1600, + "effective_few_shots": 25.0, + "num_truncated_few_shots": 0 + }, + "harness|hellaswag|10": { + "hashes": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "faab28c8a52792fc", + "hash_cont_tokens": "d8973ec3a510d4bc" + }, + "truncated": 1975, + "non-truncated": 38193, + "padded": 38021, + "non-padded": 2147, + "effective_few_shots": 10.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hashes": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "38f6980885e34dfd", + "hash_cont_tokens": "844bd0bf669e8136" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-anatomy|5": { + "hashes": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "3ed9431cd09b2a53", + "hash_cont_tokens": "aa3ffb1a6e4356f5" + }, + "truncated": 0, + "non-truncated": 540, + "padded": 540, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-astronomy|5": { + "hashes": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "a79fd75ecff4dacc", + "hash_cont_tokens": "4a75531cbfd07f95" + }, + "truncated": 0, + "non-truncated": 608, + "padded": 608, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-business_ethics|5": { + "hashes": { + "hash_examples": 
"33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "178d5666661bf5e1", + "hash_cont_tokens": "accb7cef363cf18e" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hashes": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "c926698f7ff06973", + "hash_cont_tokens": "cd61f7de0830a75a" + }, + "truncated": 0, + "non-truncated": 1060, + "padded": 1060, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_biology|5": { + "hashes": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "242f772c5e78312a", + "hash_cont_tokens": "16b3626c8a5e3797" + }, + "truncated": 0, + "non-truncated": 576, + "padded": 568, + "non-padded": 8, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_chemistry|5": { + "hashes": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "8502d8627d2d7aad", + "hash_cont_tokens": "844bd0bf669e8136" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_computer_science|5": { + "hashes": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "a0d705ea2c235707", + "hash_cont_tokens": "14362f67beb028ba" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_mathematics|5": { + "hashes": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "ff09ef7f164943cd", + "hash_cont_tokens": "69d91a3fd2e4511e" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_medicine|5": { + "hashes": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "aca3949388066394", + "hash_cont_tokens": "62bb469d2a319d91" + }, + "truncated": 20, + "non-truncated": 672, + "padded": 660, + "non-padded": 32, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_physics|5": { + "hashes": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "c4240f372187f487", + "hash_cont_tokens": "bf103c9a1f61ec12" + }, + "truncated": 0, + "non-truncated": 408, + "padded": 404, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-computer_security|5": { + "hashes": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "70a866a1c6ae11ae", + "hash_cont_tokens": "844bd0bf669e8136" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hashes": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "29b68a5b3f3afa5f", + "hash_cont_tokens": 
"ff5ca3d84bb47a0b" + }, + "truncated": 0, + "non-truncated": 940, + "padded": 940, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-econometrics|5": { + "hashes": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "a4a0fc579875cdf9", + "hash_cont_tokens": "4468714c283b10f9" + }, + "truncated": 0, + "non-truncated": 456, + "padded": 456, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hashes": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "e1c0ec634eb17ebd", + "hash_cont_tokens": "35bf6c0c1a7ee403" + }, + "truncated": 0, + "non-truncated": 580, + "padded": 580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hashes": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "542453ad0f99dacf", + "hash_cont_tokens": "8d66c298f1a52c46" + }, + "truncated": 0, + "non-truncated": 1512, + "padded": 1488, + "non-padded": 24, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-formal_logic|5": { + "hashes": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "dacff0458f665ef2", + "hash_cont_tokens": "f23c2d0723d2f830" + }, + "truncated": 0, + "non-truncated": 504, + "padded": 504, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-global_facts|5": { + "hashes": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "61dec75d557c2e93", + "hash_cont_tokens": "844bd0bf669e8136" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_biology|5": { + "hashes": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "d0afdf91820cacc8", + "hash_cont_tokens": "9cf4df701a8e97ca" + }, + "truncated": 0, + "non-truncated": 1240, + "padded": 1240, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hashes": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "75cd47b5490da17b", + "hash_cont_tokens": "c3deabee1deab3a3" + }, + "truncated": 0, + "non-truncated": 812, + "padded": 796, + "non-padded": 16, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hashes": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "e369e98a1d0a7424", + "hash_cont_tokens": "120b77ffae8b0591" + }, + "truncated": 16, + "non-truncated": 384, + "padded": 384, + "non-padded": 16, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hashes": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "502376958174bf81", + "hash_cont_tokens": "c4f2565ca36881d5" + }, + "truncated": 660, + "non-truncated": 0, + "padded": 0, + "non-padded": 660, + 
"effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_geography|5": { + "hashes": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "a4866b51f8a7a60e", + "hash_cont_tokens": "780e569058de22be" + }, + "truncated": 0, + "non-truncated": 792, + "padded": 792, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hashes": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "90f755f89d9fdf5e", + "hash_cont_tokens": "1ba11ec0fba0a4bb" + }, + "truncated": 0, + "non-truncated": 772, + "padded": 772, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hashes": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "fb590ff6d9d11883", + "hash_cont_tokens": "8f5c8baf02161f10" + }, + "truncated": 0, + "non-truncated": 1560, + "padded": 1560, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hashes": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "551dbc75535ad2b8", + "hash_cont_tokens": "822c5217a581c95f" + }, + "truncated": 0, + "non-truncated": 1080, + "padded": 1080, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hashes": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "d86fdf5706ec717c", + "hash_cont_tokens": "985403b262df21a4" + }, + "truncated": 0, + "non-truncated": 952, + "padded": 940, + "non-padded": 12, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_physics|5": { + "hashes": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "a81bca26abd92c41", + "hash_cont_tokens": "a745b56725d20832" + }, + "truncated": 0, + "non-truncated": 604, + "padded": 604, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hashes": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "9c10077b5cda495b", + "hash_cont_tokens": "969464bbd6828346" + }, + "truncated": 0, + "non-truncated": 2180, + "padded": 2180, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hashes": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "da0c215d66d16d3e", + "hash_cont_tokens": "f00cfc03022d559a" + }, + "truncated": 4, + "non-truncated": 860, + "padded": 860, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hashes": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "4885a382517deebf", + "hash_cont_tokens": "eab825cf8fbdd085" + }, + "truncated": 816, + "non-truncated": 0, + "padded": 0, + "non-padded": 816, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + 
"harness|hendrycksTest-high_school_world_history|5": { + "hashes": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "c1d80e899c4c8872", + "hash_cont_tokens": "f6dd7cf291429cd9" + }, + "truncated": 948, + "non-truncated": 0, + "padded": 0, + "non-padded": 948, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_aging|5": { + "hashes": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "39da19ee58ce07e6", + "hash_cont_tokens": "38eafdb22e9fca11" + }, + "truncated": 0, + "non-truncated": 892, + "padded": 892, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_sexuality|5": { + "hashes": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "f7e0441ab1c223e0", + "hash_cont_tokens": "11de075f88fc7cd2" + }, + "truncated": 0, + "non-truncated": 524, + "padded": 524, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-international_law|5": { + "hashes": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "119859c5b8103d0b", + "hash_cont_tokens": "ad79993e5e453770" + }, + "truncated": 0, + "non-truncated": 484, + "padded": 484, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-jurisprudence|5": { + "hashes": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "6ec4910e741606cb", + "hash_cont_tokens": "5c77c6f472688075" + }, + "truncated": 0, + "non-truncated": 432, + "padded": 432, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hashes": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "96d8b2554f777e3a", + "hash_cont_tokens": "25a46284b3589e0d" + }, + "truncated": 0, + "non-truncated": 652, + "padded": 636, + "non-padded": 16, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-machine_learning|5": { + "hashes": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "249811a7d891a411", + "hash_cont_tokens": "5904fef477924132" + }, + "truncated": 0, + "non-truncated": 448, + "padded": 448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-management|5": { + "hashes": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "e54df495ffeb4f92", + "hash_cont_tokens": "d37808f586a9e9b5" + }, + "truncated": 0, + "non-truncated": 412, + "padded": 412, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-marketing|5": { + "hashes": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "e9110fe64f420eb5", + "hash_cont_tokens": "95faf210efa02f90" + }, + "truncated": 0, + "non-truncated": 936, + "padded": 936, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-medical_genetics|5": { + "hashes": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": 
"743df5701590c1c5", + "hash_cont_tokens": "844bd0bf669e8136" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-miscellaneous|5": { + "hashes": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "4a20a40ea36bad2d", + "hash_cont_tokens": "ef1ae838a09a7521" + }, + "truncated": 0, + "non-truncated": 3132, + "padded": 3132, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_disputes|5": { + "hashes": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "10886977e5516586", + "hash_cont_tokens": "201895f1be790f02" + }, + "truncated": 0, + "non-truncated": 1384, + "padded": 1372, + "non-padded": 12, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hashes": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "66f56ab7c3b9d662", + "hash_cont_tokens": "38fadc6201499c0e" + }, + "truncated": 0, + "non-truncated": 3580, + "padded": 3580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-nutrition|5": { + "hashes": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "c05c54560499ea35", + "hash_cont_tokens": "dcdd301556b5df9e" + }, + "truncated": 0, + "non-truncated": 1224, + "padded": 1224, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-philosophy|5": { + "hashes": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "9639c3d92ff98a28", + "hash_cont_tokens": "dddff9925c9b675a" + }, + "truncated": 0, + "non-truncated": 1244, + "padded": 1244, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-prehistory|5": { + "hashes": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "91e98834c3a8d8d9", + "hash_cont_tokens": "67c525ef797587ce" + }, + "truncated": 0, + "non-truncated": 1296, + "padded": 1296, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_accounting|5": { + "hashes": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "569fa47691c73088", + "hash_cont_tokens": "0d9fbe99f871c5c5" + }, + "truncated": 0, + "non-truncated": 1128, + "padded": 1124, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_law|5": { + "hashes": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "d93d397bd5db1db6", + "hash_cont_tokens": "a2de48df0afbaff7" + }, + "truncated": 6136, + "non-truncated": 0, + "padded": 0, + "non-padded": 6136, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_medicine|5": { + "hashes": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "7f8acbbde12cfb6b", + "hash_cont_tokens": "01ddc79c7e1f2f6d" + }, + "truncated": 1032, + "non-truncated": 56, + "padded": 48, + "non-padded": 1040, + 
"effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_psychology|5": { + "hashes": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "3aa766c029099569", + "hash_cont_tokens": "fa0fc10c4bdd757c" + }, + "truncated": 0, + "non-truncated": 2448, + "padded": 2448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-public_relations|5": { + "hashes": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "87b924f88832986f", + "hash_cont_tokens": "cf3600a50782c6c5" + }, + "truncated": 0, + "non-truncated": 440, + "padded": 440, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-security_studies|5": { + "hashes": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "1aaa84da588878a6", + "hash_cont_tokens": "6483ae9688e0a0d6" + }, + "truncated": 980, + "non-truncated": 0, + "padded": 0, + "non-padded": 980, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-sociology|5": { + "hashes": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "fb555df6139eb2c8", + "hash_cont_tokens": "9ec52ea7962c54f5" + }, + "truncated": 0, + "non-truncated": 804, + "padded": 800, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hashes": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "56cf1eebb25eccb1", + "hash_cont_tokens": "844bd0bf669e8136" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-virology|5": { + "hashes": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "c6affac16ec860be", + "hash_cont_tokens": "30d4fa4828c5468f" + }, + "truncated": 0, + "non-truncated": 664, + "padded": 664, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-world_religions|5": { + "hashes": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "d2c5da5a69a6312e", + "hash_cont_tokens": "bc42db2c568e27d6" + }, + "truncated": 0, + "non-truncated": 684, + "padded": 684, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|truthfulqa:mc|0": { + "hashes": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "21ee2f46c9c3649e", + "hash_cont_tokens": "c8f2395107c4b82b" + }, + "truncated": 0, + "non-truncated": 9996, + "padded": 9996, + "non-padded": 0, + "effective_few_shots": 0.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "d84d18e9a963753d", + "hash_full_prompts": "12b540783521a8e6", + "hash_input_tokens": "18a3fbefef0c4910", + "hash_cont_tokens": "f1f2fb65023f2668" + }, + "total_evaluation_time_secondes": "1179.568171262741", + "truncated": 14155, + "non-truncated": 96864, + "padded": 96540, + "non-padded": 14479, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git 
a/eval-results/lgaalves/gpt2_platypus-dolly-guanaco/results_2023-09-28T14-27-44.520216.json b/eval-results/lgaalves/gpt2_platypus-dolly-guanaco/results_2023-09-28T14-27-44.520216.json new file mode 100644 index 0000000000000000000000000000000000000000..b2ed5e608576f4ceb952f0b3388c12fdd93be5b0 --- /dev/null +++ b/eval-results/lgaalves/gpt2_platypus-dolly-guanaco/results_2023-09-28T14-27-44.520216.json @@ -0,0 +1,107 @@ +{ + "config_general": { + "model_name": "lgaalves/gpt2_platypus-dolly-guanaco", + "model_sha": "1f376d80f5833cbddc6fd37c73803d50bd5ae18b", + "model_size": "238.85 MB", + "model_dtype": "torch.float16", + "lighteval_sha": "0f318ecf002208468154899217b3ba7c6ae09374", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|drop|3": { + "em": 0.002307046979865772, + "em_stderr": 0.0004913221265094559, + "f1": 0.04980704697986585, + "f1_stderr": 0.0013966099124026671 + }, + "harness|gsm8k|5": { + "acc": 0.0, + "acc_stderr": 0.0 + }, + "harness|winogrande|5": { + "acc": 0.5035516969218626, + "acc_stderr": 0.014052131146915848 + }, + "all": { + "em": 0.002307046979865772, + "em_stderr": 0.0004913221265094559, + "f1": 0.04980704697986585, + "f1_stderr": 0.0013966099124026671, + "acc": 0.2517758484609313, + "acc_stderr": 0.007026065573457924 + } + }, + "versions": { + "harness|drop|3": 1, + "harness|gsm8k|5": 0, + "harness|winogrande|5": 0, + "all": 0 + }, + "config_tasks": { + "harness|drop": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|drop|3": { + "hashes": { + "hash_examples": "1d27416e8324e9a3", + "hash_full_prompts": "a5513ff9a741b385", + "hash_input_tokens": "fa7a2e45b0104bc4", + "hash_cont_tokens": "79568493ed43b54b" + }, + "truncated": 9290, + "non-truncated": 246, + "padded": 0, + "non-padded": 9536, + "effective_few_shots": 3.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "52733972d41ebb11", + "hash_cont_tokens": "723868d6194c29b2" + }, + "truncated": 917, + "non-truncated": 402, + "padded": 0, + "non-padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "84cacac1590bb0a5", + "hash_cont_tokens": "64ca3ed9b5dacc6e" + }, + "truncated": 0, + "non-truncated": 2534, + "padded": 2426, + "non-padded": 108, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "9b4d8993161e637d", + "hash_full_prompts": "08215e527b7e60a5", + "hash_input_tokens": "6b3de69e2f87348c", + "hash_cont_tokens": "3cbca39cc08db01e" + }, + "total_evaluation_time_secondes": "5515.193847894669", + "truncated": 10207, + "non-truncated": 3182, + "padded": 2426, + "non-padded": 10963, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/lgaalves/llama-2-13b-chat-platypus/results_2023-09-12T04-54-55.763898.json b/eval-results/lgaalves/llama-2-13b-chat-platypus/results_2023-09-12T04-54-55.763898.json new file mode 100644 index 0000000000000000000000000000000000000000..d7af88583f51dd2da99698948025bb38cb7cd704 --- /dev/null +++ b/eval-results/lgaalves/llama-2-13b-chat-platypus/results_2023-09-12T04-54-55.763898.json @@ -0,0 
+1,1367 @@ +{ + "config_general": { + "model_name": "lgaalves/llama-2-13b-chat-platypus", + "model_sha": "828aa1020fc7d394fe8ee2c596e3211df7656eac", + "model_size": "24.32 GB", + "model_dtype": "torch.float16", + "lighteval_sha": "ff467795ccc45b291b69333c263d5f16abd1fcd9", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|arc:challenge|25": { + "acc": 0.49402730375426623, + "acc_stderr": 0.014610348300255795, + "acc_norm": 0.53839590443686, + "acc_norm_stderr": 0.014568245550296356 + }, + "harness|hellaswag|10": { + "acc": 0.6008763194582752, + "acc_stderr": 0.004887174080003032, + "acc_norm": 0.8067118103963354, + "acc_norm_stderr": 0.003940700084503099 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.28, + "acc_stderr": 0.045126085985421296, + "acc_norm": 0.28, + "acc_norm_stderr": 0.045126085985421296 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.4888888888888889, + "acc_stderr": 0.04318275491977976, + "acc_norm": 0.4888888888888889, + "acc_norm_stderr": 0.04318275491977976 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.5394736842105263, + "acc_stderr": 0.04056242252249033, + "acc_norm": 0.5394736842105263, + "acc_norm_stderr": 0.04056242252249033 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.48, + "acc_stderr": 0.050211673156867795, + "acc_norm": 0.48, + "acc_norm_stderr": 0.050211673156867795 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.5584905660377358, + "acc_stderr": 0.030561590426731833, + "acc_norm": 0.5584905660377358, + "acc_norm_stderr": 0.030561590426731833 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.5138888888888888, + "acc_stderr": 0.04179596617581, + "acc_norm": 0.5138888888888888, + "acc_norm_stderr": 0.04179596617581 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.39, + "acc_stderr": 0.04902071300001975, + "acc_norm": 0.39, + "acc_norm_stderr": 0.04902071300001975 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.46, + "acc_stderr": 0.05009082659620332, + "acc_norm": 0.46, + "acc_norm_stderr": 0.05009082659620332 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.32, + "acc_stderr": 0.04688261722621504, + "acc_norm": 0.32, + "acc_norm_stderr": 0.04688261722621504 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.43352601156069365, + "acc_stderr": 0.03778621079092056, + "acc_norm": 0.43352601156069365, + "acc_norm_stderr": 0.03778621079092056 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.3235294117647059, + "acc_stderr": 0.046550104113196177, + "acc_norm": 0.3235294117647059, + "acc_norm_stderr": 0.046550104113196177 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.68, + "acc_stderr": 0.04688261722621505, + "acc_norm": 0.68, + "acc_norm_stderr": 0.04688261722621505 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.4085106382978723, + "acc_stderr": 0.03213418026701576, + "acc_norm": 0.4085106382978723, + "acc_norm_stderr": 0.03213418026701576 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.2807017543859649, + "acc_stderr": 0.042270544512322, + "acc_norm": 0.2807017543859649, + "acc_norm_stderr": 0.042270544512322 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.503448275862069, + "acc_stderr": 0.04166567577101579, + "acc_norm": 0.503448275862069, + "acc_norm_stderr": 0.04166567577101579 + }, + 
"harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.36507936507936506, + "acc_stderr": 0.024796060602699947, + "acc_norm": 0.36507936507936506, + "acc_norm_stderr": 0.024796060602699947 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.35714285714285715, + "acc_stderr": 0.042857142857142816, + "acc_norm": 0.35714285714285715, + "acc_norm_stderr": 0.042857142857142816 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.3, + "acc_stderr": 0.046056618647183814, + "acc_norm": 0.3, + "acc_norm_stderr": 0.046056618647183814 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.6290322580645161, + "acc_stderr": 0.02748054188795359, + "acc_norm": 0.6290322580645161, + "acc_norm_stderr": 0.02748054188795359 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.3497536945812808, + "acc_stderr": 0.03355400904969566, + "acc_norm": 0.3497536945812808, + "acc_norm_stderr": 0.03355400904969566 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.55, + "acc_stderr": 0.049999999999999996, + "acc_norm": 0.55, + "acc_norm_stderr": 0.049999999999999996 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.6909090909090909, + "acc_stderr": 0.036085410115739666, + "acc_norm": 0.6909090909090909, + "acc_norm_stderr": 0.036085410115739666 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.6767676767676768, + "acc_stderr": 0.03332299921070644, + "acc_norm": 0.6767676767676768, + "acc_norm_stderr": 0.03332299921070644 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.8134715025906736, + "acc_stderr": 0.028112091210117474, + "acc_norm": 0.8134715025906736, + "acc_norm_stderr": 0.028112091210117474 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.5128205128205128, + "acc_stderr": 0.025342671293807257, + "acc_norm": 0.5128205128205128, + "acc_norm_stderr": 0.025342671293807257 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.28888888888888886, + "acc_stderr": 0.027634907264178544, + "acc_norm": 0.28888888888888886, + "acc_norm_stderr": 0.027634907264178544 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.5336134453781513, + "acc_stderr": 0.03240501447690071, + "acc_norm": 0.5336134453781513, + "acc_norm_stderr": 0.03240501447690071 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.3841059602649007, + "acc_stderr": 0.03971301814719198, + "acc_norm": 0.3841059602649007, + "acc_norm_stderr": 0.03971301814719198 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.7247706422018348, + "acc_stderr": 0.019149093743155203, + "acc_norm": 0.7247706422018348, + "acc_norm_stderr": 0.019149093743155203 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.4351851851851852, + "acc_stderr": 0.03381200005643525, + "acc_norm": 0.4351851851851852, + "acc_norm_stderr": 0.03381200005643525 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.7058823529411765, + "acc_stderr": 0.03198001660115071, + "acc_norm": 0.7058823529411765, + "acc_norm_stderr": 0.03198001660115071 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.7215189873417721, + "acc_stderr": 0.029178682304842555, + "acc_norm": 0.7215189873417721, + "acc_norm_stderr": 0.029178682304842555 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.6457399103139013, + "acc_stderr": 0.032100621541349864, + "acc_norm": 0.6457399103139013, + "acc_norm_stderr": 
0.032100621541349864 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.6259541984732825, + "acc_stderr": 0.04243869242230524, + "acc_norm": 0.6259541984732825, + "acc_norm_stderr": 0.04243869242230524 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.768595041322314, + "acc_stderr": 0.038498560987940876, + "acc_norm": 0.768595041322314, + "acc_norm_stderr": 0.038498560987940876 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.6666666666666666, + "acc_stderr": 0.04557239513497752, + "acc_norm": 0.6666666666666666, + "acc_norm_stderr": 0.04557239513497752 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.6625766871165644, + "acc_stderr": 0.03714908409935574, + "acc_norm": 0.6625766871165644, + "acc_norm_stderr": 0.03714908409935574 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.33035714285714285, + "acc_stderr": 0.04464285714285714, + "acc_norm": 0.33035714285714285, + "acc_norm_stderr": 0.04464285714285714 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.7281553398058253, + "acc_stderr": 0.044052680241409216, + "acc_norm": 0.7281553398058253, + "acc_norm_stderr": 0.044052680241409216 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.7991452991452992, + "acc_stderr": 0.02624677294689048, + "acc_norm": 0.7991452991452992, + "acc_norm_stderr": 0.02624677294689048 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.64, + "acc_stderr": 0.04824181513244218, + "acc_norm": 0.64, + "acc_norm_stderr": 0.04824181513244218 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.7407407407407407, + "acc_stderr": 0.015671006009339586, + "acc_norm": 0.7407407407407407, + "acc_norm_stderr": 0.015671006009339586 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.5838150289017341, + "acc_stderr": 0.02653818910470548, + "acc_norm": 0.5838150289017341, + "acc_norm_stderr": 0.02653818910470548 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.3195530726256983, + "acc_stderr": 0.0155955202941474, + "acc_norm": 0.3195530726256983, + "acc_norm_stderr": 0.0155955202941474 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.6111111111111112, + "acc_stderr": 0.027914055510468008, + "acc_norm": 0.6111111111111112, + "acc_norm_stderr": 0.027914055510468008 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.6045016077170418, + "acc_stderr": 0.027770918531427838, + "acc_norm": 0.6045016077170418, + "acc_norm_stderr": 0.027770918531427838 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.5833333333333334, + "acc_stderr": 0.027431623722415012, + "acc_norm": 0.5833333333333334, + "acc_norm_stderr": 0.027431623722415012 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.4078014184397163, + "acc_stderr": 0.02931601177634356, + "acc_norm": 0.4078014184397163, + "acc_norm_stderr": 0.02931601177634356 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.38070404172099087, + "acc_stderr": 0.012401430654645888, + "acc_norm": 0.38070404172099087, + "acc_norm_stderr": 0.012401430654645888 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.5441176470588235, + "acc_stderr": 0.030254372573976715, + "acc_norm": 0.5441176470588235, + "acc_norm_stderr": 0.030254372573976715 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.5375816993464052, + "acc_stderr": 0.020170614974969758, + "acc_norm": 0.5375816993464052, + "acc_norm_stderr": 0.020170614974969758 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 
0.7090909090909091, + "acc_stderr": 0.04350271442923243, + "acc_norm": 0.7090909090909091, + "acc_norm_stderr": 0.04350271442923243 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.6530612244897959, + "acc_stderr": 0.030472526026726496, + "acc_norm": 0.6530612244897959, + "acc_norm_stderr": 0.030472526026726496 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.7313432835820896, + "acc_stderr": 0.03134328358208954, + "acc_norm": 0.7313432835820896, + "acc_norm_stderr": 0.03134328358208954 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.8, + "acc_stderr": 0.04020151261036845, + "acc_norm": 0.8, + "acc_norm_stderr": 0.04020151261036845 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.5, + "acc_stderr": 0.03892494720807614, + "acc_norm": 0.5, + "acc_norm_stderr": 0.03892494720807614 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.7602339181286549, + "acc_stderr": 0.03274485211946956, + "acc_norm": 0.7602339181286549, + "acc_norm_stderr": 0.03274485211946956 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.3108935128518972, + "mc1_stderr": 0.016203316673559696, + "mc2": 0.46229050042733816, + "mc2_stderr": 0.014768134860028896 + }, + "all": { + "acc": 0.5445505675467086, + "acc_stderr": 0.03448981086978501, + "acc_norm": 0.548791314862313, + "acc_norm_stderr": 0.03447305533172637, + "mc1": 0.3108935128518972, + "mc1_stderr": 0.016203316673559696, + "mc2": 0.46229050042733816, + "mc2_stderr": 0.014768134860028896 + } + }, + "versions": { + "harness|arc:challenge|25": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + 
"harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "all": 0 + }, + "config_tasks": { + "harness|arc:challenge": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + 
"harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task" + }, + "summary_tasks": { + "harness|arc:challenge|25": { + "hashes": { + "hash_examples": "17b0cae357c0259e", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "3722289b79076c44", + "hash_cont_tokens": "e8abf848493b50f7" + }, + "truncated": 0, + "non-truncated": 4687, + "padded": 4687, + "non-padded": 0, + "effective_few_shots": 25.0, + "num_truncated_few_shots": 0 + }, + "harness|hellaswag|10": { + "hashes": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "ececd684171f1ef2", + "hash_cont_tokens": "9fe0a5c42e1532db" + }, + "truncated": 0, + "non-truncated": 40168, + "padded": 40113, + "non-padded": 55, + "effective_few_shots": 10.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hashes": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "c54ff61ad0273dd7", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-anatomy|5": { + "hashes": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "be31a1e22aef5f90", + "hash_cont_tokens": "f11971a765cb609f" + }, + "truncated": 0, + "non-truncated": 540, + "padded": 540, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-astronomy|5": { + "hashes": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "277a7b1fad566940", + "hash_cont_tokens": "440a970fadecdc7b" + }, + "truncated": 0, + "non-truncated": 608, + "padded": 608, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-business_ethics|5": { + "hashes": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "ba552605bc116de5", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hashes": { + "hash_examples": "f3366dbe7eefffa4", + 
"hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "428c7563d0b98ab9", + "hash_cont_tokens": "7ecd60c25b9bfe5b" + }, + "truncated": 0, + "non-truncated": 1060, + "padded": 1060, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_biology|5": { + "hashes": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "da036601573942e2", + "hash_cont_tokens": "875cde3af7a0ee14" + }, + "truncated": 0, + "non-truncated": 576, + "padded": 576, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_chemistry|5": { + "hashes": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "94e0196d6aded13d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_computer_science|5": { + "hashes": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "6e4d0f4a8d36690b", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_mathematics|5": { + "hashes": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "614054d17109a25d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_medicine|5": { + "hashes": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "081bb2b524defd1c", + "hash_cont_tokens": "702fb6d82ff0d6ac" + }, + "truncated": 0, + "non-truncated": 692, + "padded": 692, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_physics|5": { + "hashes": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "5421d9a1af86cbd4", + "hash_cont_tokens": "f7b8097afc16a47c" + }, + "truncated": 0, + "non-truncated": 408, + "padded": 408, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-computer_security|5": { + "hashes": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "5e6b70ecb333cf18", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hashes": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "c2ef11a87264ceed", + "hash_cont_tokens": "aa0e8bc655f2f641" + }, + "truncated": 0, + "non-truncated": 940, + "padded": 940, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-econometrics|5": { + "hashes": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "ecaccd912a4c3978", + "hash_cont_tokens": "b1cc6e7e9fcd3827" + }, + "truncated": 0, + 
"non-truncated": 456, + "padded": 456, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hashes": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "1590c84291399be8", + "hash_cont_tokens": "2425a3f084a591ef" + }, + "truncated": 0, + "non-truncated": 580, + "padded": 580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hashes": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "3269597f715b0da1", + "hash_cont_tokens": "bd87bf0c060fd925" + }, + "truncated": 0, + "non-truncated": 1512, + "padded": 1512, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-formal_logic|5": { + "hashes": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "a2800d20f3ab8d7c", + "hash_cont_tokens": "eb8932890e0605db" + }, + "truncated": 0, + "non-truncated": 504, + "padded": 504, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-global_facts|5": { + "hashes": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "94ed44b3772505ad", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_biology|5": { + "hashes": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "24423acb928db768", + "hash_cont_tokens": "1ddcb86d28cde266" + }, + "truncated": 0, + "non-truncated": 1240, + "padded": 1240, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hashes": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "831ff35c474e5cef", + "hash_cont_tokens": "176c8dcff38c5f8f" + }, + "truncated": 0, + "non-truncated": 812, + "padded": 812, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hashes": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "a20a96b44dcc5b30", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hashes": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "5002f4ac8b1562ca", + "hash_cont_tokens": "674fc454bdc5ac93" + }, + "truncated": 0, + "non-truncated": 660, + "padded": 656, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_geography|5": { + "hashes": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "7c5547c7da5bc793", + "hash_cont_tokens": "03a5012b916274ea" + }, + "truncated": 0, + "non-truncated": 792, + "padded": 792, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 
0 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hashes": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "f62991cb6a496b05", + "hash_cont_tokens": "873d2aab226ba1d8" + }, + "truncated": 0, + "non-truncated": 772, + "padded": 772, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hashes": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "4cef2aff6e3d59ed", + "hash_cont_tokens": "c583432ad27fcfe0" + }, + "truncated": 0, + "non-truncated": 1560, + "padded": 1560, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hashes": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "6e2577ea4082ed2b", + "hash_cont_tokens": "d7907b61bcb8c123" + }, + "truncated": 0, + "non-truncated": 1080, + "padded": 1080, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hashes": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "c5fc9aeb1079c8e4", + "hash_cont_tokens": "f47f041de50333b9" + }, + "truncated": 0, + "non-truncated": 952, + "padded": 952, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_physics|5": { + "hashes": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "555fc385cffa84ca", + "hash_cont_tokens": "0d56317b3e5eedb5" + }, + "truncated": 0, + "non-truncated": 604, + "padded": 604, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hashes": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "febd23cbf9973b7f", + "hash_cont_tokens": "09ba1243e7390c0f" + }, + "truncated": 0, + "non-truncated": 2180, + "padded": 2180, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hashes": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "400e55b56ee6fbd7", + "hash_cont_tokens": "9cc29889c3d3f77d" + }, + "truncated": 0, + "non-truncated": 864, + "padded": 864, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hashes": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "c639cce12a46ebad", + "hash_cont_tokens": "cdd0b3dc06d933e5" + }, + "truncated": 0, + "non-truncated": 816, + "padded": 816, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hashes": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "b9762065cce6f3a6", + "hash_cont_tokens": "e02816433ff28daf" + }, + "truncated": 0, + "non-truncated": 948, + "padded": 948, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_aging|5": { + "hashes": { + 
"hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "541a75f071dcf579", + "hash_cont_tokens": "142a4a8a1138a214" + }, + "truncated": 0, + "non-truncated": 892, + "padded": 892, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_sexuality|5": { + "hashes": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "04269e5c5a257dd9", + "hash_cont_tokens": "bc54813e809b796d" + }, + "truncated": 0, + "non-truncated": 524, + "padded": 524, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-international_law|5": { + "hashes": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "d93ba9d9d38e4397", + "hash_cont_tokens": "8ea8c5ff76a15bca" + }, + "truncated": 0, + "non-truncated": 484, + "padded": 484, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-jurisprudence|5": { + "hashes": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "9eeaccd2698b4f5a", + "hash_cont_tokens": "e3a8cd951b6e3469" + }, + "truncated": 0, + "non-truncated": 432, + "padded": 432, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hashes": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "b4f08f544f2b7576", + "hash_cont_tokens": "3e9e0bdc248fd88a" + }, + "truncated": 0, + "non-truncated": 652, + "padded": 648, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-machine_learning|5": { + "hashes": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "900c2a51f1174b9f", + "hash_cont_tokens": "55b12fb138c6a064" + }, + "truncated": 0, + "non-truncated": 448, + "padded": 448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-management|5": { + "hashes": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "6b36efb4689c6eca", + "hash_cont_tokens": "a01d6d39a83c4597" + }, + "truncated": 0, + "non-truncated": 412, + "padded": 412, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-marketing|5": { + "hashes": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "2aaac78a0cfed47a", + "hash_cont_tokens": "6aeaed4d823c98aa" + }, + "truncated": 0, + "non-truncated": 936, + "padded": 936, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-medical_genetics|5": { + "hashes": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "886ca823b41c094a", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-miscellaneous|5": { + "hashes": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "72fd71de7675e7d0", + "hash_cont_tokens": "9b0ab02a64603081" + }, + 
"truncated": 0, + "non-truncated": 3132, + "padded": 3132, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_disputes|5": { + "hashes": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "f3ca0dd8e7a1eb09", + "hash_cont_tokens": "3b8bbe9108e55ce9" + }, + "truncated": 0, + "non-truncated": 1384, + "padded": 1354, + "non-padded": 30, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hashes": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "3e793631e951f23c", + "hash_cont_tokens": "3e9bfc0362e97330" + }, + "truncated": 0, + "non-truncated": 3580, + "padded": 3580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-nutrition|5": { + "hashes": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "59753c2144ea93af", + "hash_cont_tokens": "23b2dc6ee2da4cfc" + }, + "truncated": 0, + "non-truncated": 1224, + "padded": 1224, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-philosophy|5": { + "hashes": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "bd8d3dbed15a8c34", + "hash_cont_tokens": "9f6ff69d23a48783" + }, + "truncated": 0, + "non-truncated": 1244, + "padded": 1244, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-prehistory|5": { + "hashes": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "3573cd87facbb7c5", + "hash_cont_tokens": "d6458d743d875837" + }, + "truncated": 0, + "non-truncated": 1296, + "padded": 1296, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_accounting|5": { + "hashes": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "17e721bc1a7cbb47", + "hash_cont_tokens": "922a195f53a35662" + }, + "truncated": 0, + "non-truncated": 1128, + "padded": 1128, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_law|5": { + "hashes": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "c9f7583fff66d361", + "hash_cont_tokens": "2e590029ef41fbcd" + }, + "truncated": 0, + "non-truncated": 6136, + "padded": 6136, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_medicine|5": { + "hashes": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "40a933f829116f8d", + "hash_cont_tokens": "7cfee54dbddd5a98" + }, + "truncated": 0, + "non-truncated": 1088, + "padded": 1088, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_psychology|5": { + "hashes": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "0dfb73a8eb3f692c", + "hash_cont_tokens": "a86677b2a45c20e1" + }, + "truncated": 0, + "non-truncated": 2448, + "padded": 2448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + 
"harness|hendrycksTest-public_relations|5": { + "hashes": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "1710c6ba4c9f3cbd", + "hash_cont_tokens": "0d756ccaae031757" + }, + "truncated": 0, + "non-truncated": 440, + "padded": 440, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-security_studies|5": { + "hashes": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "32a03f1f22a6e103", + "hash_cont_tokens": "b2229bc2cfbf594b" + }, + "truncated": 0, + "non-truncated": 980, + "padded": 980, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-sociology|5": { + "hashes": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "828999f7624cbe7e", + "hash_cont_tokens": "c3a3bdfd177eed5b" + }, + "truncated": 0, + "non-truncated": 804, + "padded": 804, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hashes": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "42054621e718dbee", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-virology|5": { + "hashes": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "6c4f0aa4dc859c04", + "hash_cont_tokens": "af8b3658088cb37f" + }, + "truncated": 0, + "non-truncated": 664, + "padded": 664, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-world_religions|5": { + "hashes": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "6c75d44e092ff24f", + "hash_cont_tokens": "060118bef6de4e0a" + }, + "truncated": 0, + "non-truncated": 684, + "padded": 684, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|truthfulqa:mc|0": { + "hashes": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "2738d7ed7075faa7", + "hash_cont_tokens": "f5da56a132aab151" + }, + "truncated": 0, + "non-truncated": 9996, + "padded": 9996, + "non-padded": 0, + "effective_few_shots": 0.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "d84d18e9a963753d", + "hash_full_prompts": "12b540783521a8e6", + "hash_input_tokens": "5c73a7dce6ccf737", + "hash_cont_tokens": "71d56183130fecbd" + }, + "total_evaluation_time_secondes": "6332.267200708389", + "truncated": 0, + "non-truncated": 111019, + "padded": 110926, + "non-padded": 93, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/lgaalves/llama-2-13b-chat-platypus/results_2023-10-27T20-27-56.260953.json b/eval-results/lgaalves/llama-2-13b-chat-platypus/results_2023-10-27T20-27-56.260953.json new file mode 100644 index 0000000000000000000000000000000000000000..0a72963948da32cdc2b2dea80afcacfbe597dfbf --- /dev/null +++ b/eval-results/lgaalves/llama-2-13b-chat-platypus/results_2023-10-27T20-27-56.260953.json @@ -0,0 +1,107 @@ +{ + "config_general": { + "model_name": "lgaalves/llama-2-13b-chat-platypus", + "model_sha": 
"6170e7f94c4694af105ecb42a1195a80440f86d5", + "model_size": "24.32 GB", + "model_dtype": "torch.float16", + "lighteval_sha": "0f318ecf002208468154899217b3ba7c6ae09374", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|drop|3": { + "em": 0.0035654362416107383, + "em_stderr": 0.0006104082299890483, + "f1": 0.06259542785234914, + "f1_stderr": 0.001452272347431231 + }, + "harness|gsm8k|5": { + "acc": 0.12357846853677028, + "acc_stderr": 0.009065050306776914 + }, + "harness|winogrande|5": { + "acc": 0.7600631412786109, + "acc_stderr": 0.01200207862948574 + }, + "all": { + "em": 0.0035654362416107383, + "em_stderr": 0.0006104082299890483, + "f1": 0.06259542785234914, + "f1_stderr": 0.001452272347431231, + "acc": 0.44182080490769055, + "acc_stderr": 0.010533564468131328 + } + }, + "versions": { + "harness|drop|3": 1, + "harness|gsm8k|5": 0, + "harness|winogrande|5": 0, + "all": 0 + }, + "config_tasks": { + "harness|drop": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|drop|3": { + "hashes": { + "hash_examples": "1d27416e8324e9a3", + "hash_full_prompts": "a5513ff9a741b385", + "hash_input_tokens": "42076f0efbb50aa6", + "hash_cont_tokens": "0348034aec4eb3be" + }, + "truncated": 3, + "non-truncated": 9533, + "padded": 0, + "non-padded": 9536, + "effective_few_shots": 3.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "bda342e47b5099b2", + "hash_cont_tokens": "2dbe3af9b8a9074c" + }, + "truncated": 0, + "non-truncated": 1319, + "padded": 0, + "non-padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "c0bedf98cb040854", + "hash_cont_tokens": "f08975ad6f2d5864" + }, + "truncated": 0, + "non-truncated": 2534, + "padded": 2432, + "non-padded": 102, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "9b4d8993161e637d", + "hash_full_prompts": "08215e527b7e60a5", + "hash_input_tokens": "a12f3e3c934bd78b", + "hash_cont_tokens": "4eec92a69128ad17" + }, + "total_evaluation_time_secondes": "12935.740574598312", + "truncated": 3, + "non-truncated": 13386, + "padded": 2432, + "non-padded": 10957, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/lgaalves/llama-2-13b-hf-platypus/results_2023-09-18T14-15-46.670153.json b/eval-results/lgaalves/llama-2-13b-hf-platypus/results_2023-09-18T14-15-46.670153.json new file mode 100644 index 0000000000000000000000000000000000000000..f00225c9a8c0517a78f701a00e564172c4b4485e --- /dev/null +++ b/eval-results/lgaalves/llama-2-13b-hf-platypus/results_2023-09-18T14-15-46.670153.json @@ -0,0 +1,1367 @@ +{ + "config_general": { + "model_name": "lgaalves/llama-2-13b-hf-platypus", + "model_sha": "39e07f6213a64d79cf31e9c0773dea6224f7f021", + "model_size": "24.32 GB", + "model_dtype": "torch.float16", + "lighteval_sha": "0f318ecf002208468154899217b3ba7c6ae09374", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|arc:challenge|25": { + "acc": 0.5511945392491467, + "acc_stderr": 
0.014534599585097664, + "acc_norm": 0.5887372013651877, + "acc_norm_stderr": 0.014379441068522082 + }, + "harness|hellaswag|10": { + "acc": 0.6149173471420036, + "acc_stderr": 0.004856203374715453, + "acc_norm": 0.8213503286197968, + "acc_norm_stderr": 0.003822758343922915 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.35, + "acc_stderr": 0.04793724854411021, + "acc_norm": 0.35, + "acc_norm_stderr": 0.04793724854411021 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.48148148148148145, + "acc_stderr": 0.043163785995113245, + "acc_norm": 0.48148148148148145, + "acc_norm_stderr": 0.043163785995113245 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.5263157894736842, + "acc_stderr": 0.04063302731486671, + "acc_norm": 0.5263157894736842, + "acc_norm_stderr": 0.04063302731486671 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.55, + "acc_stderr": 0.049999999999999996, + "acc_norm": 0.55, + "acc_norm_stderr": 0.049999999999999996 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.5886792452830188, + "acc_stderr": 0.030285009259009794, + "acc_norm": 0.5886792452830188, + "acc_norm_stderr": 0.030285009259009794 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.5902777777777778, + "acc_stderr": 0.04112490974670787, + "acc_norm": 0.5902777777777778, + "acc_norm_stderr": 0.04112490974670787 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.39, + "acc_stderr": 0.04902071300001975, + "acc_norm": 0.39, + "acc_norm_stderr": 0.04902071300001975 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.48, + "acc_stderr": 0.050211673156867795, + "acc_norm": 0.48, + "acc_norm_stderr": 0.050211673156867795 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.32, + "acc_stderr": 0.04688261722621504, + "acc_norm": 0.32, + "acc_norm_stderr": 0.04688261722621504 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.5202312138728323, + "acc_stderr": 0.03809342081273957, + "acc_norm": 0.5202312138728323, + "acc_norm_stderr": 0.03809342081273957 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.22549019607843138, + "acc_stderr": 0.041583075330832865, + "acc_norm": 0.22549019607843138, + "acc_norm_stderr": 0.041583075330832865 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.73, + "acc_stderr": 0.04461960433384739, + "acc_norm": 0.73, + "acc_norm_stderr": 0.04461960433384739 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.4340425531914894, + "acc_stderr": 0.032400380867927465, + "acc_norm": 0.4340425531914894, + "acc_norm_stderr": 0.032400380867927465 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.30701754385964913, + "acc_stderr": 0.04339138322579861, + "acc_norm": 0.30701754385964913, + "acc_norm_stderr": 0.04339138322579861 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.4896551724137931, + "acc_stderr": 0.041657747757287644, + "acc_norm": 0.4896551724137931, + "acc_norm_stderr": 0.041657747757287644 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.3439153439153439, + "acc_stderr": 0.024464426625596433, + "acc_norm": 0.3439153439153439, + "acc_norm_stderr": 0.024464426625596433 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.29365079365079366, + "acc_stderr": 0.04073524322147126, + "acc_norm": 0.29365079365079366, + "acc_norm_stderr": 0.04073524322147126 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.34, + "acc_stderr": 0.04760952285695236, + "acc_norm": 0.34, + 
"acc_norm_stderr": 0.04760952285695236 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.6612903225806451, + "acc_stderr": 0.026923446059302844, + "acc_norm": 0.6612903225806451, + "acc_norm_stderr": 0.026923446059302844 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.45320197044334976, + "acc_stderr": 0.035025446508458714, + "acc_norm": 0.45320197044334976, + "acc_norm_stderr": 0.035025446508458714 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.57, + "acc_stderr": 0.04975698519562428, + "acc_norm": 0.57, + "acc_norm_stderr": 0.04975698519562428 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.6545454545454545, + "acc_stderr": 0.03713158067481913, + "acc_norm": 0.6545454545454545, + "acc_norm_stderr": 0.03713158067481913 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.696969696969697, + "acc_stderr": 0.032742879140268674, + "acc_norm": 0.696969696969697, + "acc_norm_stderr": 0.032742879140268674 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.7979274611398963, + "acc_stderr": 0.02897908979429673, + "acc_norm": 0.7979274611398963, + "acc_norm_stderr": 0.02897908979429673 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.5153846153846153, + "acc_stderr": 0.025339003010106515, + "acc_norm": 0.5153846153846153, + "acc_norm_stderr": 0.025339003010106515 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.2851851851851852, + "acc_stderr": 0.027528599210340496, + "acc_norm": 0.2851851851851852, + "acc_norm_stderr": 0.027528599210340496 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.5672268907563025, + "acc_stderr": 0.03218358107742613, + "acc_norm": 0.5672268907563025, + "acc_norm_stderr": 0.03218358107742613 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.3576158940397351, + "acc_stderr": 0.03913453431177258, + "acc_norm": 0.3576158940397351, + "acc_norm_stderr": 0.03913453431177258 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.7577981651376147, + "acc_stderr": 0.01836817630659862, + "acc_norm": 0.7577981651376147, + "acc_norm_stderr": 0.01836817630659862 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.4074074074074074, + "acc_stderr": 0.033509916046960415, + "acc_norm": 0.4074074074074074, + "acc_norm_stderr": 0.033509916046960415 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.7696078431372549, + "acc_stderr": 0.029554292605695053, + "acc_norm": 0.7696078431372549, + "acc_norm_stderr": 0.029554292605695053 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.7383966244725738, + "acc_stderr": 0.028609516716994934, + "acc_norm": 0.7383966244725738, + "acc_norm_stderr": 0.028609516716994934 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.6816143497757847, + "acc_stderr": 0.03126580522513713, + "acc_norm": 0.6816143497757847, + "acc_norm_stderr": 0.03126580522513713 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.5725190839694656, + "acc_stderr": 0.04338920305792401, + "acc_norm": 0.5725190839694656, + "acc_norm_stderr": 0.04338920305792401 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.7355371900826446, + "acc_stderr": 0.04026187527591207, + "acc_norm": 0.7355371900826446, + "acc_norm_stderr": 0.04026187527591207 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.6851851851851852, + "acc_stderr": 0.04489931073591312, + 
"acc_norm": 0.6851851851851852, + "acc_norm_stderr": 0.04489931073591312 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.6748466257668712, + "acc_stderr": 0.0368035037128646, + "acc_norm": 0.6748466257668712, + "acc_norm_stderr": 0.0368035037128646 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.33035714285714285, + "acc_stderr": 0.04464285714285713, + "acc_norm": 0.33035714285714285, + "acc_norm_stderr": 0.04464285714285713 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.7378640776699029, + "acc_stderr": 0.04354631077260595, + "acc_norm": 0.7378640776699029, + "acc_norm_stderr": 0.04354631077260595 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.7948717948717948, + "acc_stderr": 0.02645350805404033, + "acc_norm": 0.7948717948717948, + "acc_norm_stderr": 0.02645350805404033 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.5, + "acc_stderr": 0.050251890762960605, + "acc_norm": 0.5, + "acc_norm_stderr": 0.050251890762960605 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.7509578544061303, + "acc_stderr": 0.015464676163395958, + "acc_norm": 0.7509578544061303, + "acc_norm_stderr": 0.015464676163395958 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.6242774566473989, + "acc_stderr": 0.02607431485165708, + "acc_norm": 0.6242774566473989, + "acc_norm_stderr": 0.02607431485165708 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.36983240223463687, + "acc_stderr": 0.016145881256056215, + "acc_norm": 0.36983240223463687, + "acc_norm_stderr": 0.016145881256056215 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.6013071895424836, + "acc_stderr": 0.028036092273891776, + "acc_norm": 0.6013071895424836, + "acc_norm_stderr": 0.028036092273891776 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.6334405144694534, + "acc_stderr": 0.027368078243971635, + "acc_norm": 0.6334405144694534, + "acc_norm_stderr": 0.027368078243971635 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.6296296296296297, + "acc_stderr": 0.026869490744815257, + "acc_norm": 0.6296296296296297, + "acc_norm_stderr": 0.026869490744815257 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.40070921985815605, + "acc_stderr": 0.029233465745573086, + "acc_norm": 0.40070921985815605, + "acc_norm_stderr": 0.029233465745573086 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.4041720990873533, + "acc_stderr": 0.012533504046491362, + "acc_norm": 0.4041720990873533, + "acc_norm_stderr": 0.012533504046491362 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.5073529411764706, + "acc_stderr": 0.030369552523902173, + "acc_norm": 0.5073529411764706, + "acc_norm_stderr": 0.030369552523902173 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.5588235294117647, + "acc_stderr": 0.020087362076702857, + "acc_norm": 0.5588235294117647, + "acc_norm_stderr": 0.020087362076702857 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.6545454545454545, + "acc_stderr": 0.04554619617541054, + "acc_norm": 0.6545454545454545, + "acc_norm_stderr": 0.04554619617541054 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.5714285714285714, + "acc_stderr": 0.03168091161233882, + "acc_norm": 0.5714285714285714, + "acc_norm_stderr": 0.03168091161233882 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.736318407960199, + "acc_stderr": 0.031157150869355568, + "acc_norm": 0.736318407960199, + "acc_norm_stderr": 0.031157150869355568 + }, + 
"harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.8, + "acc_stderr": 0.04020151261036846, + "acc_norm": 0.8, + "acc_norm_stderr": 0.04020151261036846 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.4578313253012048, + "acc_stderr": 0.038786267710023595, + "acc_norm": 0.4578313253012048, + "acc_norm_stderr": 0.038786267710023595 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.7309941520467836, + "acc_stderr": 0.03401052620104089, + "acc_norm": 0.7309941520467836, + "acc_norm_stderr": 0.03401052620104089 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.2876376988984088, + "mc1_stderr": 0.015846315101394802, + "mc2": 0.4284193316007184, + "mc2_stderr": 0.014486178746194435 + }, + "all": { + "acc": 0.5509125885849774, + "acc_stderr": 0.0344588285887975, + "acc_norm": 0.555047768984873, + "acc_norm_stderr": 0.03443868276596075, + "mc1": 0.2876376988984088, + "mc1_stderr": 0.015846315101394802, + "mc2": 0.4284193316007184, + "mc2_stderr": 0.014486178746194435 + } + }, + "versions": { + "harness|arc:challenge|25": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 
1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "all": 0 + }, + "config_tasks": { + "harness|arc:challenge": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + 
"harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task" + }, + "summary_tasks": { + "harness|arc:challenge|25": { + "hashes": { + "hash_examples": "17b0cae357c0259e", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "3722289b79076c44", + "hash_cont_tokens": "e8abf848493b50f7" + }, + "truncated": 0, + "non-truncated": 4687, + "padded": 4687, + "non-padded": 0, + "effective_few_shots": 25.0, + "num_truncated_few_shots": 0 + }, + "harness|hellaswag|10": { + "hashes": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "ececd684171f1ef2", + "hash_cont_tokens": "9fe0a5c42e1532db" + }, + "truncated": 0, + "non-truncated": 40168, + "padded": 40113, + "non-padded": 55, + "effective_few_shots": 10.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hashes": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "c54ff61ad0273dd7", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-anatomy|5": { + "hashes": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "be31a1e22aef5f90", + "hash_cont_tokens": "f11971a765cb609f" + }, + "truncated": 0, + "non-truncated": 540, + "padded": 540, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-astronomy|5": { + "hashes": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "277a7b1fad566940", + "hash_cont_tokens": "440a970fadecdc7b" + }, + "truncated": 0, + "non-truncated": 608, + "padded": 608, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-business_ethics|5": { + "hashes": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "ba552605bc116de5", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hashes": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "428c7563d0b98ab9", + "hash_cont_tokens": "7ecd60c25b9bfe5b" + }, + "truncated": 0, + "non-truncated": 1060, + "padded": 1060, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_biology|5": { + "hashes": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "da036601573942e2", + 
"hash_cont_tokens": "875cde3af7a0ee14" + }, + "truncated": 0, + "non-truncated": 576, + "padded": 576, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_chemistry|5": { + "hashes": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "94e0196d6aded13d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_computer_science|5": { + "hashes": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "6e4d0f4a8d36690b", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_mathematics|5": { + "hashes": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "614054d17109a25d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_medicine|5": { + "hashes": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "081bb2b524defd1c", + "hash_cont_tokens": "702fb6d82ff0d6ac" + }, + "truncated": 0, + "non-truncated": 692, + "padded": 692, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_physics|5": { + "hashes": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "5421d9a1af86cbd4", + "hash_cont_tokens": "f7b8097afc16a47c" + }, + "truncated": 0, + "non-truncated": 408, + "padded": 408, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-computer_security|5": { + "hashes": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "5e6b70ecb333cf18", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hashes": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "c2ef11a87264ceed", + "hash_cont_tokens": "aa0e8bc655f2f641" + }, + "truncated": 0, + "non-truncated": 940, + "padded": 940, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-econometrics|5": { + "hashes": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "ecaccd912a4c3978", + "hash_cont_tokens": "b1cc6e7e9fcd3827" + }, + "truncated": 0, + "non-truncated": 456, + "padded": 456, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hashes": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "1590c84291399be8", + "hash_cont_tokens": "2425a3f084a591ef" + }, + "truncated": 0, + "non-truncated": 580, + "padded": 580, + "non-padded": 0, + 
"effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hashes": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "3269597f715b0da1", + "hash_cont_tokens": "bd87bf0c060fd925" + }, + "truncated": 0, + "non-truncated": 1512, + "padded": 1512, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-formal_logic|5": { + "hashes": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "a2800d20f3ab8d7c", + "hash_cont_tokens": "eb8932890e0605db" + }, + "truncated": 0, + "non-truncated": 504, + "padded": 504, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-global_facts|5": { + "hashes": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "94ed44b3772505ad", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_biology|5": { + "hashes": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "24423acb928db768", + "hash_cont_tokens": "1ddcb86d28cde266" + }, + "truncated": 0, + "non-truncated": 1240, + "padded": 1240, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hashes": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "831ff35c474e5cef", + "hash_cont_tokens": "176c8dcff38c5f8f" + }, + "truncated": 0, + "non-truncated": 812, + "padded": 812, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hashes": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "a20a96b44dcc5b30", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hashes": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "5002f4ac8b1562ca", + "hash_cont_tokens": "674fc454bdc5ac93" + }, + "truncated": 0, + "non-truncated": 660, + "padded": 656, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_geography|5": { + "hashes": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "7c5547c7da5bc793", + "hash_cont_tokens": "03a5012b916274ea" + }, + "truncated": 0, + "non-truncated": 792, + "padded": 792, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hashes": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "f62991cb6a496b05", + "hash_cont_tokens": "873d2aab226ba1d8" + }, + "truncated": 0, + "non-truncated": 772, + "padded": 772, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + 
"harness|hendrycksTest-high_school_macroeconomics|5": { + "hashes": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "4cef2aff6e3d59ed", + "hash_cont_tokens": "c583432ad27fcfe0" + }, + "truncated": 0, + "non-truncated": 1560, + "padded": 1560, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hashes": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "6e2577ea4082ed2b", + "hash_cont_tokens": "d7907b61bcb8c123" + }, + "truncated": 0, + "non-truncated": 1080, + "padded": 1080, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hashes": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "c5fc9aeb1079c8e4", + "hash_cont_tokens": "f47f041de50333b9" + }, + "truncated": 0, + "non-truncated": 952, + "padded": 952, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_physics|5": { + "hashes": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "555fc385cffa84ca", + "hash_cont_tokens": "0d56317b3e5eedb5" + }, + "truncated": 0, + "non-truncated": 604, + "padded": 604, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hashes": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "febd23cbf9973b7f", + "hash_cont_tokens": "09ba1243e7390c0f" + }, + "truncated": 0, + "non-truncated": 2180, + "padded": 2180, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hashes": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "400e55b56ee6fbd7", + "hash_cont_tokens": "9cc29889c3d3f77d" + }, + "truncated": 0, + "non-truncated": 864, + "padded": 864, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hashes": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "c639cce12a46ebad", + "hash_cont_tokens": "cdd0b3dc06d933e5" + }, + "truncated": 0, + "non-truncated": 816, + "padded": 816, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hashes": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "b9762065cce6f3a6", + "hash_cont_tokens": "e02816433ff28daf" + }, + "truncated": 0, + "non-truncated": 948, + "padded": 948, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_aging|5": { + "hashes": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "541a75f071dcf579", + "hash_cont_tokens": "142a4a8a1138a214" + }, + "truncated": 0, + "non-truncated": 892, + "padded": 892, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_sexuality|5": { + "hashes": { + "hash_examples": "7acb8fdad97f88a6", + 
"hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "04269e5c5a257dd9", + "hash_cont_tokens": "bc54813e809b796d" + }, + "truncated": 0, + "non-truncated": 524, + "padded": 524, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-international_law|5": { + "hashes": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "d93ba9d9d38e4397", + "hash_cont_tokens": "8ea8c5ff76a15bca" + }, + "truncated": 0, + "non-truncated": 484, + "padded": 484, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-jurisprudence|5": { + "hashes": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "9eeaccd2698b4f5a", + "hash_cont_tokens": "e3a8cd951b6e3469" + }, + "truncated": 0, + "non-truncated": 432, + "padded": 432, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hashes": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "b4f08f544f2b7576", + "hash_cont_tokens": "3e9e0bdc248fd88a" + }, + "truncated": 0, + "non-truncated": 652, + "padded": 648, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-machine_learning|5": { + "hashes": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "900c2a51f1174b9f", + "hash_cont_tokens": "55b12fb138c6a064" + }, + "truncated": 0, + "non-truncated": 448, + "padded": 448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-management|5": { + "hashes": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "6b36efb4689c6eca", + "hash_cont_tokens": "a01d6d39a83c4597" + }, + "truncated": 0, + "non-truncated": 412, + "padded": 412, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-marketing|5": { + "hashes": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "2aaac78a0cfed47a", + "hash_cont_tokens": "6aeaed4d823c98aa" + }, + "truncated": 0, + "non-truncated": 936, + "padded": 936, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-medical_genetics|5": { + "hashes": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "886ca823b41c094a", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-miscellaneous|5": { + "hashes": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "72fd71de7675e7d0", + "hash_cont_tokens": "9b0ab02a64603081" + }, + "truncated": 0, + "non-truncated": 3132, + "padded": 3132, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_disputes|5": { + "hashes": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "f3ca0dd8e7a1eb09", + "hash_cont_tokens": "3b8bbe9108e55ce9" + }, + "truncated": 0, + "non-truncated": 1384, + 
"padded": 1354, + "non-padded": 30, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hashes": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "3e793631e951f23c", + "hash_cont_tokens": "3e9bfc0362e97330" + }, + "truncated": 0, + "non-truncated": 3580, + "padded": 3580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-nutrition|5": { + "hashes": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "59753c2144ea93af", + "hash_cont_tokens": "23b2dc6ee2da4cfc" + }, + "truncated": 0, + "non-truncated": 1224, + "padded": 1224, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-philosophy|5": { + "hashes": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "bd8d3dbed15a8c34", + "hash_cont_tokens": "9f6ff69d23a48783" + }, + "truncated": 0, + "non-truncated": 1244, + "padded": 1244, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-prehistory|5": { + "hashes": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "3573cd87facbb7c5", + "hash_cont_tokens": "d6458d743d875837" + }, + "truncated": 0, + "non-truncated": 1296, + "padded": 1296, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_accounting|5": { + "hashes": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "17e721bc1a7cbb47", + "hash_cont_tokens": "922a195f53a35662" + }, + "truncated": 0, + "non-truncated": 1128, + "padded": 1128, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_law|5": { + "hashes": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "c9f7583fff66d361", + "hash_cont_tokens": "2e590029ef41fbcd" + }, + "truncated": 0, + "non-truncated": 6136, + "padded": 6136, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_medicine|5": { + "hashes": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "40a933f829116f8d", + "hash_cont_tokens": "7cfee54dbddd5a98" + }, + "truncated": 0, + "non-truncated": 1088, + "padded": 1088, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_psychology|5": { + "hashes": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "0dfb73a8eb3f692c", + "hash_cont_tokens": "a86677b2a45c20e1" + }, + "truncated": 0, + "non-truncated": 2448, + "padded": 2448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-public_relations|5": { + "hashes": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "1710c6ba4c9f3cbd", + "hash_cont_tokens": "0d756ccaae031757" + }, + "truncated": 0, + "non-truncated": 440, + "padded": 440, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + 
"harness|hendrycksTest-security_studies|5": { + "hashes": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "32a03f1f22a6e103", + "hash_cont_tokens": "b2229bc2cfbf594b" + }, + "truncated": 0, + "non-truncated": 980, + "padded": 980, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-sociology|5": { + "hashes": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "828999f7624cbe7e", + "hash_cont_tokens": "c3a3bdfd177eed5b" + }, + "truncated": 0, + "non-truncated": 804, + "padded": 804, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hashes": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "42054621e718dbee", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-virology|5": { + "hashes": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "6c4f0aa4dc859c04", + "hash_cont_tokens": "af8b3658088cb37f" + }, + "truncated": 0, + "non-truncated": 664, + "padded": 664, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-world_religions|5": { + "hashes": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "6c75d44e092ff24f", + "hash_cont_tokens": "060118bef6de4e0a" + }, + "truncated": 0, + "non-truncated": 684, + "padded": 684, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|truthfulqa:mc|0": { + "hashes": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "2738d7ed7075faa7", + "hash_cont_tokens": "f5da56a132aab151" + }, + "truncated": 0, + "non-truncated": 9996, + "padded": 9996, + "non-padded": 0, + "effective_few_shots": 0.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "d84d18e9a963753d", + "hash_full_prompts": "12b540783521a8e6", + "hash_input_tokens": "5c73a7dce6ccf737", + "hash_cont_tokens": "71d56183130fecbd" + }, + "total_evaluation_time_secondes": "6334.0009253025055", + "truncated": 0, + "non-truncated": 111019, + "padded": 110926, + "non-padded": 93, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/lgaalves/llama-2-13b-hf-platypus/results_2023-10-28T00-17-42.072889.json b/eval-results/lgaalves/llama-2-13b-hf-platypus/results_2023-10-28T00-17-42.072889.json new file mode 100644 index 0000000000000000000000000000000000000000..3b9e7d643c8464d6e23e43254243f14738ae3e64 --- /dev/null +++ b/eval-results/lgaalves/llama-2-13b-hf-platypus/results_2023-10-28T00-17-42.072889.json @@ -0,0 +1,107 @@ +{ + "config_general": { + "model_name": "lgaalves/llama-2-13b-hf-platypus", + "model_sha": "39e07f6213a64d79cf31e9c0773dea6224f7f021", + "model_size": "24.32 GB", + "model_dtype": "torch.float16", + "lighteval_sha": "0f318ecf002208468154899217b3ba7c6ae09374", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|drop|3": { + "em": 0.0017827181208053692, + "em_stderr": 
0.00043200973460388544, + "f1": 0.05985213926174496, + "f1_stderr": 0.0013641672120704657 + }, + "harness|gsm8k|5": { + "acc": 0.09401061410159212, + "acc_stderr": 0.00803881981887246 + }, + "harness|winogrande|5": { + "acc": 0.771112865035517, + "acc_stderr": 0.011807360224025398 + }, + "all": { + "em": 0.0017827181208053692, + "em_stderr": 0.00043200973460388544, + "f1": 0.05985213926174496, + "f1_stderr": 0.0013641672120704657, + "acc": 0.4325617395685546, + "acc_stderr": 0.009923090021448928 + } + }, + "versions": { + "harness|drop|3": 1, + "harness|gsm8k|5": 0, + "harness|winogrande|5": 0, + "all": 0 + }, + "config_tasks": { + "harness|drop": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|drop|3": { + "hashes": { + "hash_examples": "1d27416e8324e9a3", + "hash_full_prompts": "a5513ff9a741b385", + "hash_input_tokens": "42076f0efbb50aa6", + "hash_cont_tokens": "9df2e1a9b50d351e" + }, + "truncated": 3, + "non-truncated": 9533, + "padded": 0, + "non-padded": 9536, + "effective_few_shots": 3.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "bda342e47b5099b2", + "hash_cont_tokens": "126a4e312bc62a12" + }, + "truncated": 0, + "non-truncated": 1319, + "padded": 0, + "non-padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "c0bedf98cb040854", + "hash_cont_tokens": "f08975ad6f2d5864" + }, + "truncated": 0, + "non-truncated": 2534, + "padded": 2432, + "non-padded": 102, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "9b4d8993161e637d", + "hash_full_prompts": "08215e527b7e60a5", + "hash_input_tokens": "a12f3e3c934bd78b", + "hash_cont_tokens": "19e255ef48f988ea" + }, + "total_evaluation_time_secondes": "13158.295419931412", + "truncated": 3, + "non-truncated": 13386, + "padded": 2432, + "non-padded": 10957, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/lgaalves/llama-2-13b-hf-platypus/results_2023-10-28T02-33-59.939371.json b/eval-results/lgaalves/llama-2-13b-hf-platypus/results_2023-10-28T02-33-59.939371.json new file mode 100644 index 0000000000000000000000000000000000000000..ad35dcae868429b3c06775d447b7ae963d9f18c8 --- /dev/null +++ b/eval-results/lgaalves/llama-2-13b-hf-platypus/results_2023-10-28T02-33-59.939371.json @@ -0,0 +1,107 @@ +{ + "config_general": { + "model_name": "lgaalves/llama-2-13b-hf-platypus", + "model_sha": "39e07f6213a64d79cf31e9c0773dea6224f7f021", + "model_size": "24.32 GB", + "model_dtype": "torch.float16", + "lighteval_sha": "0f318ecf002208468154899217b3ba7c6ae09374", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|drop|3": { + "em": 0.0017827181208053692, + "em_stderr": 0.00043200973460388544, + "f1": 0.05985213926174496, + "f1_stderr": 0.0013641672120704657 + }, + "harness|gsm8k|5": { + "acc": 0.09401061410159212, + "acc_stderr": 0.00803881981887246 + }, + "harness|winogrande|5": { + "acc": 0.771112865035517, + "acc_stderr": 0.011807360224025398 + }, + "all": { + "em": 0.0017827181208053692, + "em_stderr": 0.00043200973460388544, + "f1": 
0.05985213926174496, + "f1_stderr": 0.0013641672120704657, + "acc": 0.4325617395685546, + "acc_stderr": 0.009923090021448928 + } + }, + "versions": { + "harness|drop|3": 1, + "harness|gsm8k|5": 0, + "harness|winogrande|5": 0, + "all": 0 + }, + "config_tasks": { + "harness|drop": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|drop|3": { + "hashes": { + "hash_examples": "1d27416e8324e9a3", + "hash_full_prompts": "a5513ff9a741b385", + "hash_input_tokens": "42076f0efbb50aa6", + "hash_cont_tokens": "9df2e1a9b50d351e" + }, + "truncated": 3, + "non-truncated": 9533, + "padded": 0, + "non-padded": 9536, + "effective_few_shots": 3.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "bda342e47b5099b2", + "hash_cont_tokens": "126a4e312bc62a12" + }, + "truncated": 0, + "non-truncated": 1319, + "padded": 0, + "non-padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "c0bedf98cb040854", + "hash_cont_tokens": "f08975ad6f2d5864" + }, + "truncated": 0, + "non-truncated": 2534, + "padded": 2432, + "non-padded": 102, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "9b4d8993161e637d", + "hash_full_prompts": "08215e527b7e60a5", + "hash_input_tokens": "a12f3e3c934bd78b", + "hash_cont_tokens": "19e255ef48f988ea" + }, + "total_evaluation_time_secondes": "13486.26902103424", + "truncated": 3, + "non-truncated": 13386, + "padded": 2432, + "non-padded": 10957, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/lgaalves/llama-2-7b-hf_open-platypus/results_2023-08-31T14-20-30.830996.json b/eval-results/lgaalves/llama-2-7b-hf_open-platypus/results_2023-08-31T14-20-30.830996.json new file mode 100644 index 0000000000000000000000000000000000000000..d44a69842d03691efbb83c392f128d2448445ef7 --- /dev/null +++ b/eval-results/lgaalves/llama-2-7b-hf_open-platypus/results_2023-08-31T14-20-30.830996.json @@ -0,0 +1,1366 @@ +{ + "config_general": { + "model_name": "lgaalves/llama-2-7b-hf_open-platypus", + "model_sha": "c7e776f3f3afc0fa22cb7aff0d00522e571e9b29", + "model_dtype": "torch.float16", + "lighteval_sha": "4b9a38c2102259daeb17d1d0294fc2b4ce7f4e63", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|arc:challenge|25": { + "acc": 0.48293515358361777, + "acc_stderr": 0.014602878388536598, + "acc_norm": 0.514505119453925, + "acc_norm_stderr": 0.014605241081370056 + }, + "harness|hellaswag|10": { + "acc": 0.5880302728540131, + "acc_stderr": 0.004911837730582202, + "acc_norm": 0.7862975502887871, + "acc_norm_stderr": 0.004090813948220233 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.32, + "acc_stderr": 0.046882617226215034, + "acc_norm": 0.32, + "acc_norm_stderr": 0.046882617226215034 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.4666666666666667, + "acc_stderr": 0.043097329010363554, + "acc_norm": 0.4666666666666667, + "acc_norm_stderr": 0.043097329010363554 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.3684210526315789, + "acc_stderr": 0.03925523381052932, + "acc_norm": 
0.3684210526315789, + "acc_norm_stderr": 0.03925523381052932 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.45, + "acc_stderr": 0.05, + "acc_norm": 0.45, + "acc_norm_stderr": 0.05 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.4, + "acc_stderr": 0.030151134457776278, + "acc_norm": 0.4, + "acc_norm_stderr": 0.030151134457776278 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.4097222222222222, + "acc_stderr": 0.04112490974670787, + "acc_norm": 0.4097222222222222, + "acc_norm_stderr": 0.04112490974670787 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.22, + "acc_stderr": 0.04163331998932269, + "acc_norm": 0.22, + "acc_norm_stderr": 0.04163331998932269 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.39, + "acc_stderr": 0.04902071300001975, + "acc_norm": 0.39, + "acc_norm_stderr": 0.04902071300001975 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.34, + "acc_stderr": 0.047609522856952344, + "acc_norm": 0.34, + "acc_norm_stderr": 0.047609522856952344 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.36416184971098264, + "acc_stderr": 0.03669072477416906, + "acc_norm": 0.36416184971098264, + "acc_norm_stderr": 0.03669072477416906 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.19607843137254902, + "acc_stderr": 0.03950581861179964, + "acc_norm": 0.19607843137254902, + "acc_norm_stderr": 0.03950581861179964 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.57, + "acc_stderr": 0.049756985195624284, + "acc_norm": 0.57, + "acc_norm_stderr": 0.049756985195624284 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.39574468085106385, + "acc_stderr": 0.031967586978353627, + "acc_norm": 0.39574468085106385, + "acc_norm_stderr": 0.031967586978353627 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.30701754385964913, + "acc_stderr": 0.04339138322579861, + "acc_norm": 0.30701754385964913, + "acc_norm_stderr": 0.04339138322579861 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.41379310344827586, + "acc_stderr": 0.04104269211806232, + "acc_norm": 0.41379310344827586, + "acc_norm_stderr": 0.04104269211806232 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.24867724867724866, + "acc_stderr": 0.022261817692400168, + "acc_norm": 0.24867724867724866, + "acc_norm_stderr": 0.022261817692400168 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.35714285714285715, + "acc_stderr": 0.04285714285714281, + "acc_norm": 0.35714285714285715, + "acc_norm_stderr": 0.04285714285714281 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.3, + "acc_stderr": 0.046056618647183814, + "acc_norm": 0.3, + "acc_norm_stderr": 0.046056618647183814 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.432258064516129, + "acc_stderr": 0.02818173972001941, + "acc_norm": 0.432258064516129, + "acc_norm_stderr": 0.02818173972001941 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.2955665024630542, + "acc_stderr": 0.032104944337514575, + "acc_norm": 0.2955665024630542, + "acc_norm_stderr": 0.032104944337514575 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.44, + "acc_stderr": 0.04988876515698589, + "acc_norm": 0.44, + "acc_norm_stderr": 0.04988876515698589 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.5696969696969697, + "acc_stderr": 0.03866225962879077, + "acc_norm": 0.5696969696969697, + "acc_norm_stderr": 
0.03866225962879077 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.4696969696969697, + "acc_stderr": 0.03555804051763929, + "acc_norm": 0.4696969696969697, + "acc_norm_stderr": 0.03555804051763929 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.6424870466321243, + "acc_stderr": 0.03458816042181012, + "acc_norm": 0.6424870466321243, + "acc_norm_stderr": 0.03458816042181012 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.37948717948717947, + "acc_stderr": 0.024603626924097417, + "acc_norm": 0.37948717948717947, + "acc_norm_stderr": 0.024603626924097417 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.22592592592592592, + "acc_stderr": 0.02549753263960955, + "acc_norm": 0.22592592592592592, + "acc_norm_stderr": 0.02549753263960955 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.4117647058823529, + "acc_stderr": 0.031968769891957786, + "acc_norm": 0.4117647058823529, + "acc_norm_stderr": 0.031968769891957786 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.2251655629139073, + "acc_stderr": 0.03410435282008936, + "acc_norm": 0.2251655629139073, + "acc_norm_stderr": 0.03410435282008936 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.563302752293578, + "acc_stderr": 0.021264820158714205, + "acc_norm": 0.563302752293578, + "acc_norm_stderr": 0.021264820158714205 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.2638888888888889, + "acc_stderr": 0.030058202704309846, + "acc_norm": 0.2638888888888889, + "acc_norm_stderr": 0.030058202704309846 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.46568627450980393, + "acc_stderr": 0.03501038327635897, + "acc_norm": 0.46568627450980393, + "acc_norm_stderr": 0.03501038327635897 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.5232067510548524, + "acc_stderr": 0.032512152011410174, + "acc_norm": 0.5232067510548524, + "acc_norm_stderr": 0.032512152011410174 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.5112107623318386, + "acc_stderr": 0.033549366530984746, + "acc_norm": 0.5112107623318386, + "acc_norm_stderr": 0.033549366530984746 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.48854961832061067, + "acc_stderr": 0.043841400240780176, + "acc_norm": 0.48854961832061067, + "acc_norm_stderr": 0.043841400240780176 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.6446280991735537, + "acc_stderr": 0.0436923632657398, + "acc_norm": 0.6446280991735537, + "acc_norm_stderr": 0.0436923632657398 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.48148148148148145, + "acc_stderr": 0.04830366024635331, + "acc_norm": 0.48148148148148145, + "acc_norm_stderr": 0.04830366024635331 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.4785276073619632, + "acc_stderr": 0.0392474687675113, + "acc_norm": 0.4785276073619632, + "acc_norm_stderr": 0.0392474687675113 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.36607142857142855, + "acc_stderr": 0.0457237235873743, + "acc_norm": 0.36607142857142855, + "acc_norm_stderr": 0.0457237235873743 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.5339805825242718, + "acc_stderr": 0.0493929144727348, + "acc_norm": 0.5339805825242718, + "acc_norm_stderr": 0.0493929144727348 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.7051282051282052, + "acc_stderr": 0.029872577708891204, + "acc_norm": 0.7051282051282052, + 
"acc_norm_stderr": 0.029872577708891204 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.52, + "acc_stderr": 0.05021167315686779, + "acc_norm": 0.52, + "acc_norm_stderr": 0.05021167315686779 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.6245210727969349, + "acc_stderr": 0.017316613197182786, + "acc_norm": 0.6245210727969349, + "acc_norm_stderr": 0.017316613197182786 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.4479768786127168, + "acc_stderr": 0.02677299065336183, + "acc_norm": 0.4479768786127168, + "acc_norm_stderr": 0.02677299065336183 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.23798882681564246, + "acc_stderr": 0.014242630070574915, + "acc_norm": 0.23798882681564246, + "acc_norm_stderr": 0.014242630070574915 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.47058823529411764, + "acc_stderr": 0.028580341065138293, + "acc_norm": 0.47058823529411764, + "acc_norm_stderr": 0.028580341065138293 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.5434083601286174, + "acc_stderr": 0.0282908690541976, + "acc_norm": 0.5434083601286174, + "acc_norm_stderr": 0.0282908690541976 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.47530864197530864, + "acc_stderr": 0.02778680093142745, + "acc_norm": 0.47530864197530864, + "acc_norm_stderr": 0.02778680093142745 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.3546099290780142, + "acc_stderr": 0.02853865002887864, + "acc_norm": 0.3546099290780142, + "acc_norm_stderr": 0.02853865002887864 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.3428943937418514, + "acc_stderr": 0.012123463271585892, + "acc_norm": 0.3428943937418514, + "acc_norm_stderr": 0.012123463271585892 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.48161764705882354, + "acc_stderr": 0.03035230339535196, + "acc_norm": 0.48161764705882354, + "acc_norm_stderr": 0.03035230339535196 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.434640522875817, + "acc_stderr": 0.02005426920072646, + "acc_norm": 0.434640522875817, + "acc_norm_stderr": 0.02005426920072646 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.4909090909090909, + "acc_stderr": 0.04788339768702861, + "acc_norm": 0.4909090909090909, + "acc_norm_stderr": 0.04788339768702861 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.46530612244897956, + "acc_stderr": 0.03193207024425314, + "acc_norm": 0.46530612244897956, + "acc_norm_stderr": 0.03193207024425314 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.6069651741293532, + "acc_stderr": 0.0345368246603156, + "acc_norm": 0.6069651741293532, + "acc_norm_stderr": 0.0345368246603156 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.65, + "acc_stderr": 0.047937248544110196, + "acc_norm": 0.65, + "acc_norm_stderr": 0.047937248544110196 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.40963855421686746, + "acc_stderr": 0.03828401115079022, + "acc_norm": 0.40963855421686746, + "acc_norm_stderr": 0.03828401115079022 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.6608187134502924, + "acc_stderr": 0.03631053496488905, + "acc_norm": 0.6608187134502924, + "acc_norm_stderr": 0.03631053496488905 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.29498164014687883, + "mc1_stderr": 0.015964400965589657, + "mc2": 0.43705224040583207, + "mc2_stderr": 0.014401937881119722 + }, + "all": { + "acc": 0.4393778750069193, + "acc_stderr": 0.03509495224786267, + "acc_norm": 
0.4432734215036156, + "acc_norm_stderr": 0.03508107663617574, + "mc1": 0.29498164014687883, + "mc1_stderr": 0.015964400965589657, + "mc2": 0.43705224040583207, + "mc2_stderr": 0.014401937881119722 + } + }, + "versions": { + "harness|arc:challenge|25": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "all": 0 + }, + "config_tasks": { + "harness|arc:challenge": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + 
"harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task" + }, + "summary_tasks": { + "harness|arc:challenge|25": { + "hashes": { + 
"hash_examples": "17b0cae357c0259e", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "3722289b79076c44", + "hash_cont_tokens": "8210decc6ff6f7df" + }, + "truncated": 0, + "non-truncated": 4687, + "padded": 4687, + "non-padded": 0, + "effective_few_shots": 25.0, + "num_truncated_few_shots": 0 + }, + "harness|hellaswag|10": { + "hashes": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "ececd684171f1ef2", + "hash_cont_tokens": "b3b9e9017afa63af" + }, + "truncated": 0, + "non-truncated": 40168, + "padded": 40113, + "non-padded": 55, + "effective_few_shots": 10.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hashes": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "c54ff61ad0273dd7", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-anatomy|5": { + "hashes": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "be31a1e22aef5f90", + "hash_cont_tokens": "f11971a765cb609f" + }, + "truncated": 0, + "non-truncated": 540, + "padded": 540, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-astronomy|5": { + "hashes": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "277a7b1fad566940", + "hash_cont_tokens": "bf30e5d3f48250cb" + }, + "truncated": 0, + "non-truncated": 608, + "padded": 608, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-business_ethics|5": { + "hashes": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "ba552605bc116de5", + "hash_cont_tokens": "bc1dd9b2d995eb61" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hashes": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "428c7563d0b98ab9", + "hash_cont_tokens": "890a119624b3b935" + }, + "truncated": 0, + "non-truncated": 1060, + "padded": 1060, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_biology|5": { + "hashes": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "da036601573942e2", + "hash_cont_tokens": "875cde3af7a0ee14" + }, + "truncated": 0, + "non-truncated": 576, + "padded": 576, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_chemistry|5": { + "hashes": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "94e0196d6aded13d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_computer_science|5": { + "hashes": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "6e4d0f4a8d36690b", + "hash_cont_tokens": "ffc0fe414cdc4a83" + }, + 
"truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_mathematics|5": { + "hashes": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "614054d17109a25d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_medicine|5": { + "hashes": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "081bb2b524defd1c", + "hash_cont_tokens": "1f88b00d41957d82" + }, + "truncated": 0, + "non-truncated": 692, + "padded": 692, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_physics|5": { + "hashes": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "5421d9a1af86cbd4", + "hash_cont_tokens": "f7b8097afc16a47c" + }, + "truncated": 0, + "non-truncated": 408, + "padded": 408, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-computer_security|5": { + "hashes": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "5e6b70ecb333cf18", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hashes": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "c2ef11a87264ceed", + "hash_cont_tokens": "aa0e8bc655f2f641" + }, + "truncated": 0, + "non-truncated": 940, + "padded": 940, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-econometrics|5": { + "hashes": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "ecaccd912a4c3978", + "hash_cont_tokens": "bfb7e3c3c88313f1" + }, + "truncated": 0, + "non-truncated": 456, + "padded": 456, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hashes": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "1590c84291399be8", + "hash_cont_tokens": "2425a3f084a591ef" + }, + "truncated": 0, + "non-truncated": 580, + "padded": 580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hashes": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "3269597f715b0da1", + "hash_cont_tokens": "f52691aef15a407b" + }, + "truncated": 0, + "non-truncated": 1512, + "padded": 1512, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-formal_logic|5": { + "hashes": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "a2800d20f3ab8d7c", + "hash_cont_tokens": "f515d598d9c21263" + }, + "truncated": 0, + "non-truncated": 504, + "padded": 504, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + 
"harness|hendrycksTest-global_facts|5": { + "hashes": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "94ed44b3772505ad", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_biology|5": { + "hashes": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "24423acb928db768", + "hash_cont_tokens": "bd85a4156a3613ee" + }, + "truncated": 0, + "non-truncated": 1240, + "padded": 1240, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hashes": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "831ff35c474e5cef", + "hash_cont_tokens": "a95c97af1c14e068" + }, + "truncated": 0, + "non-truncated": 812, + "padded": 812, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hashes": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "a20a96b44dcc5b30", + "hash_cont_tokens": "8abfedef914e33c9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hashes": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "5002f4ac8b1562ca", + "hash_cont_tokens": "674fc454bdc5ac93" + }, + "truncated": 0, + "non-truncated": 660, + "padded": 656, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_geography|5": { + "hashes": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "7c5547c7da5bc793", + "hash_cont_tokens": "03a5012b916274ea" + }, + "truncated": 0, + "non-truncated": 792, + "padded": 792, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hashes": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "f62991cb6a496b05", + "hash_cont_tokens": "a83effb8f76b7d7c" + }, + "truncated": 0, + "non-truncated": 772, + "padded": 772, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hashes": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "4cef2aff6e3d59ed", + "hash_cont_tokens": "c583432ad27fcfe0" + }, + "truncated": 0, + "non-truncated": 1560, + "padded": 1560, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hashes": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "6e2577ea4082ed2b", + "hash_cont_tokens": "24f5dc613660300b" + }, + "truncated": 0, + "non-truncated": 1080, + "padded": 1080, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hashes": { + 
"hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "c5fc9aeb1079c8e4", + "hash_cont_tokens": "f47f041de50333b9" + }, + "truncated": 0, + "non-truncated": 952, + "padded": 952, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_physics|5": { + "hashes": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "555fc385cffa84ca", + "hash_cont_tokens": "ba2efcd283e938cc" + }, + "truncated": 0, + "non-truncated": 604, + "padded": 604, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hashes": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "febd23cbf9973b7f", + "hash_cont_tokens": "942069cd363844d9" + }, + "truncated": 0, + "non-truncated": 2180, + "padded": 2180, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hashes": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "400e55b56ee6fbd7", + "hash_cont_tokens": "955ed42b6f7fa019" + }, + "truncated": 0, + "non-truncated": 864, + "padded": 864, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hashes": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "c639cce12a46ebad", + "hash_cont_tokens": "cdd0b3dc06d933e5" + }, + "truncated": 0, + "non-truncated": 816, + "padded": 816, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hashes": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "b9762065cce6f3a6", + "hash_cont_tokens": "9a864184946033ac" + }, + "truncated": 0, + "non-truncated": 948, + "padded": 948, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_aging|5": { + "hashes": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "541a75f071dcf579", + "hash_cont_tokens": "142a4a8a1138a214" + }, + "truncated": 0, + "non-truncated": 892, + "padded": 892, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_sexuality|5": { + "hashes": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "04269e5c5a257dd9", + "hash_cont_tokens": "bc54813e809b796d" + }, + "truncated": 0, + "non-truncated": 524, + "padded": 524, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-international_law|5": { + "hashes": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "d93ba9d9d38e4397", + "hash_cont_tokens": "dc45b45fcda18e5d" + }, + "truncated": 0, + "non-truncated": 484, + "padded": 484, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-jurisprudence|5": { + "hashes": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "9eeaccd2698b4f5a", + 
"hash_cont_tokens": "e3a8cd951b6e3469" + }, + "truncated": 0, + "non-truncated": 432, + "padded": 432, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hashes": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "b4f08f544f2b7576", + "hash_cont_tokens": "1e80dbd30f6453d5" + }, + "truncated": 0, + "non-truncated": 652, + "padded": 648, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-machine_learning|5": { + "hashes": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "900c2a51f1174b9f", + "hash_cont_tokens": "9b37da7777378ca9" + }, + "truncated": 0, + "non-truncated": 448, + "padded": 448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-management|5": { + "hashes": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "6b36efb4689c6eca", + "hash_cont_tokens": "a01d6d39a83c4597" + }, + "truncated": 0, + "non-truncated": 412, + "padded": 412, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-marketing|5": { + "hashes": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "2aaac78a0cfed47a", + "hash_cont_tokens": "6aeaed4d823c98aa" + }, + "truncated": 0, + "non-truncated": 936, + "padded": 936, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-medical_genetics|5": { + "hashes": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "886ca823b41c094a", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-miscellaneous|5": { + "hashes": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "72fd71de7675e7d0", + "hash_cont_tokens": "9b0ab02a64603081" + }, + "truncated": 0, + "non-truncated": 3132, + "padded": 3132, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_disputes|5": { + "hashes": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "f3ca0dd8e7a1eb09", + "hash_cont_tokens": "8badf768f7b0467a" + }, + "truncated": 0, + "non-truncated": 1384, + "padded": 1354, + "non-padded": 30, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hashes": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "3e793631e951f23c", + "hash_cont_tokens": "32ae620376b2bbba" + }, + "truncated": 0, + "non-truncated": 3580, + "padded": 3580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-nutrition|5": { + "hashes": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "59753c2144ea93af", + "hash_cont_tokens": "3071def75bacc404" + }, + "truncated": 0, + "non-truncated": 1224, + "padded": 1224, + "non-padded": 0, + "effective_few_shots": 5.0, + 
"num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-philosophy|5": { + "hashes": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "bd8d3dbed15a8c34", + "hash_cont_tokens": "9f6ff69d23a48783" + }, + "truncated": 0, + "non-truncated": 1244, + "padded": 1244, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-prehistory|5": { + "hashes": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "3573cd87facbb7c5", + "hash_cont_tokens": "de469d2b981e32a3" + }, + "truncated": 0, + "non-truncated": 1296, + "padded": 1296, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_accounting|5": { + "hashes": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "17e721bc1a7cbb47", + "hash_cont_tokens": "c46f74d2dfc7b13b" + }, + "truncated": 0, + "non-truncated": 1128, + "padded": 1128, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_law|5": { + "hashes": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "c9f7583fff66d361", + "hash_cont_tokens": "2e590029ef41fbcd" + }, + "truncated": 0, + "non-truncated": 6136, + "padded": 6136, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_medicine|5": { + "hashes": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "40a933f829116f8d", + "hash_cont_tokens": "fe35cfa9c6ca802e" + }, + "truncated": 0, + "non-truncated": 1088, + "padded": 1088, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_psychology|5": { + "hashes": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "0dfb73a8eb3f692c", + "hash_cont_tokens": "f020fbddf72c8652" + }, + "truncated": 0, + "non-truncated": 2448, + "padded": 2448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-public_relations|5": { + "hashes": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "1710c6ba4c9f3cbd", + "hash_cont_tokens": "568f585a259965c1" + }, + "truncated": 0, + "non-truncated": 440, + "padded": 440, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-security_studies|5": { + "hashes": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "32a03f1f22a6e103", + "hash_cont_tokens": "cc6fd7cccd64cd5d" + }, + "truncated": 0, + "non-truncated": 980, + "padded": 980, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-sociology|5": { + "hashes": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "828999f7624cbe7e", + "hash_cont_tokens": "c3a3bdfd177eed5b" + }, + "truncated": 0, + "non-truncated": 804, + "padded": 804, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hashes": { + "hash_examples": "4a56a01ddca44dca", + 
"hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "42054621e718dbee", + "hash_cont_tokens": "2568d0e8e36fa959" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-virology|5": { + "hashes": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "6c4f0aa4dc859c04", + "hash_cont_tokens": "926cf60b0891f374" + }, + "truncated": 0, + "non-truncated": 664, + "padded": 664, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-world_religions|5": { + "hashes": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "6c75d44e092ff24f", + "hash_cont_tokens": "c525a5de974c1ea3" + }, + "truncated": 0, + "non-truncated": 684, + "padded": 684, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|truthfulqa:mc|0": { + "hashes": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "2738d7ed7075faa7", + "hash_cont_tokens": "c014154380b74b9e" + }, + "truncated": 0, + "non-truncated": 9996, + "padded": 9996, + "non-padded": 0, + "effective_few_shots": 0.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "d84d18e9a963753d", + "hash_full_prompts": "12b540783521a8e6", + "hash_input_tokens": "5c73a7dce6ccf737", + "hash_cont_tokens": "fb1646e2bdd5fc38" + }, + "total_evaluation_time_secondes": "4118.240209579468", + "truncated": 0, + "non-truncated": 111019, + "padded": 110926, + "non-padded": 93, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/lgaalves/llama-2-7b-hf_open-platypus/results_2023-10-16T18-18-23.592235.json b/eval-results/lgaalves/llama-2-7b-hf_open-platypus/results_2023-10-16T18-18-23.592235.json new file mode 100644 index 0000000000000000000000000000000000000000..5c0591fe5937c25f33d1e9699872d1e5ff1e6aa5 --- /dev/null +++ b/eval-results/lgaalves/llama-2-7b-hf_open-platypus/results_2023-10-16T18-18-23.592235.json @@ -0,0 +1,107 @@ +{ + "config_general": { + "model_name": "lgaalves/llama-2-7b-hf_open-platypus", + "model_sha": "846f8b6fe96607cda16d9d85547ffab61a1c96ab", + "model_size": "12.61 GB", + "model_dtype": "torch.float16", + "lighteval_sha": "0f318ecf002208468154899217b3ba7c6ae09374", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|drop|3": { + "em": 0.0012583892617449664, + "em_stderr": 0.0003630560893118953, + "f1": 0.05986052852348985, + "f1_stderr": 0.0013631018920376853 + }, + "harness|gsm8k|5": { + "acc": 0.06595905989385899, + "acc_stderr": 0.006836951192034225 + }, + "harness|winogrande|5": { + "acc": 0.744277821625888, + "acc_stderr": 0.012261253845440474 + }, + "all": { + "em": 0.0012583892617449664, + "em_stderr": 0.0003630560893118953, + "f1": 0.05986052852348985, + "f1_stderr": 0.0013631018920376853, + "acc": 0.40511844075987347, + "acc_stderr": 0.00954910251873735 + } + }, + "versions": { + "harness|drop|3": 1, + "harness|gsm8k|5": 0, + "harness|winogrande|5": 0, + "all": 0 + }, + "config_tasks": { + "harness|drop": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|drop|3": { + "hashes": { + "hash_examples": 
"1d27416e8324e9a3", + "hash_full_prompts": "a5513ff9a741b385", + "hash_input_tokens": "42076f0efbb50aa6", + "hash_cont_tokens": "adecec7bf5d9932f" + }, + "truncated": 3, + "non-truncated": 9533, + "padded": 0, + "non-padded": 9536, + "effective_few_shots": 3.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "bda342e47b5099b2", + "hash_cont_tokens": "72c5a185cba56385" + }, + "truncated": 0, + "non-truncated": 1319, + "padded": 0, + "non-padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "c0bedf98cb040854", + "hash_cont_tokens": "f08975ad6f2d5864" + }, + "truncated": 0, + "non-truncated": 2534, + "padded": 2432, + "non-padded": 102, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "9b4d8993161e637d", + "hash_full_prompts": "08215e527b7e60a5", + "hash_input_tokens": "a12f3e3c934bd78b", + "hash_cont_tokens": "fe9b7760455fe66f" + }, + "total_evaluation_time_secondes": "10499.36984038353", + "truncated": 3, + "non-truncated": 13386, + "padded": 2432, + "non-padded": 10957, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/lgaalves/mistral-7b-platypus1k/results_2023-10-11T12-58-49.551109.json b/eval-results/lgaalves/mistral-7b-platypus1k/results_2023-10-11T12-58-49.551109.json new file mode 100644 index 0000000000000000000000000000000000000000..727caf5a11824ce7fa66fbb0780b81e5a308e2cb --- /dev/null +++ b/eval-results/lgaalves/mistral-7b-platypus1k/results_2023-10-11T12-58-49.551109.json @@ -0,0 +1,1367 @@ +{ + "config_general": { + "model_name": "lgaalves/mistral-7b-platypus1k", + "model_sha": "c34c4a249ecf0cc391beba142a1f9cb23154fcd1", + "model_size": "13.99 GB", + "model_dtype": "torch.float16", + "lighteval_sha": "0f318ecf002208468154899217b3ba7c6ae09374", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|arc:challenge|25": { + "acc": 0.5708191126279863, + "acc_stderr": 0.014464085894870653, + "acc_norm": 0.6160409556313993, + "acc_norm_stderr": 0.01421244498065189 + }, + "harness|hellaswag|10": { + "acc": 0.6269667396932882, + "acc_stderr": 0.004826224784850442, + "acc_norm": 0.8293168691495718, + "acc_norm_stderr": 0.003754629313275163 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.29, + "acc_stderr": 0.045604802157206845, + "acc_norm": 0.29, + "acc_norm_stderr": 0.045604802157206845 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.6074074074074074, + "acc_stderr": 0.0421850621536888, + "acc_norm": 0.6074074074074074, + "acc_norm_stderr": 0.0421850621536888 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.6118421052631579, + "acc_stderr": 0.03965842097512744, + "acc_norm": 0.6118421052631579, + "acc_norm_stderr": 0.03965842097512744 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.6, + "acc_stderr": 0.04923659639173309, + "acc_norm": 0.6, + "acc_norm_stderr": 0.04923659639173309 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.6415094339622641, + "acc_stderr": 0.029514703583981762, + "acc_norm": 0.6415094339622641, + "acc_norm_stderr": 0.029514703583981762 + }, + "harness|hendrycksTest-college_biology|5": { + 
"acc": 0.7013888888888888, + "acc_stderr": 0.03827052357950756, + "acc_norm": 0.7013888888888888, + "acc_norm_stderr": 0.03827052357950756 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.48, + "acc_stderr": 0.050211673156867795, + "acc_norm": 0.48, + "acc_norm_stderr": 0.050211673156867795 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.59, + "acc_stderr": 0.04943110704237102, + "acc_norm": 0.59, + "acc_norm_stderr": 0.04943110704237102 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.41, + "acc_stderr": 0.04943110704237102, + "acc_norm": 0.41, + "acc_norm_stderr": 0.04943110704237102 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.6416184971098265, + "acc_stderr": 0.03656343653353159, + "acc_norm": 0.6416184971098265, + "acc_norm_stderr": 0.03656343653353159 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.4215686274509804, + "acc_stderr": 0.04913595201274498, + "acc_norm": 0.4215686274509804, + "acc_norm_stderr": 0.04913595201274498 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.79, + "acc_stderr": 0.04093601807403326, + "acc_norm": 0.79, + "acc_norm_stderr": 0.04093601807403326 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.5659574468085107, + "acc_stderr": 0.03240038086792747, + "acc_norm": 0.5659574468085107, + "acc_norm_stderr": 0.03240038086792747 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.4649122807017544, + "acc_stderr": 0.04692008381368909, + "acc_norm": 0.4649122807017544, + "acc_norm_stderr": 0.04692008381368909 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.5379310344827586, + "acc_stderr": 0.04154659671707548, + "acc_norm": 0.5379310344827586, + "acc_norm_stderr": 0.04154659671707548 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.3968253968253968, + "acc_stderr": 0.025197101074246483, + "acc_norm": 0.3968253968253968, + "acc_norm_stderr": 0.025197101074246483 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.42857142857142855, + "acc_stderr": 0.04426266681379909, + "acc_norm": 0.42857142857142855, + "acc_norm_stderr": 0.04426266681379909 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.3, + "acc_stderr": 0.046056618647183814, + "acc_norm": 0.3, + "acc_norm_stderr": 0.046056618647183814 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.7419354838709677, + "acc_stderr": 0.02489246917246284, + "acc_norm": 0.7419354838709677, + "acc_norm_stderr": 0.02489246917246284 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.5073891625615764, + "acc_stderr": 0.0351760354036101, + "acc_norm": 0.5073891625615764, + "acc_norm_stderr": 0.0351760354036101 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.69, + "acc_stderr": 0.04648231987117316, + "acc_norm": 0.69, + "acc_norm_stderr": 0.04648231987117316 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.7696969696969697, + "acc_stderr": 0.0328766675860349, + "acc_norm": 0.7696969696969697, + "acc_norm_stderr": 0.0328766675860349 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.7727272727272727, + "acc_stderr": 0.029857515673386417, + "acc_norm": 0.7727272727272727, + "acc_norm_stderr": 0.029857515673386417 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.8652849740932642, + "acc_stderr": 0.02463978909770944, + "acc_norm": 0.8652849740932642, + "acc_norm_stderr": 0.02463978909770944 + }, + 
"harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.6717948717948717, + "acc_stderr": 0.023807633198657262, + "acc_norm": 0.6717948717948717, + "acc_norm_stderr": 0.023807633198657262 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.32592592592592595, + "acc_stderr": 0.028578348365473065, + "acc_norm": 0.32592592592592595, + "acc_norm_stderr": 0.028578348365473065 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.6512605042016807, + "acc_stderr": 0.030956636328566548, + "acc_norm": 0.6512605042016807, + "acc_norm_stderr": 0.030956636328566548 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.2980132450331126, + "acc_stderr": 0.037345356767871984, + "acc_norm": 0.2980132450331126, + "acc_norm_stderr": 0.037345356767871984 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.8201834862385321, + "acc_stderr": 0.016465345467391538, + "acc_norm": 0.8201834862385321, + "acc_norm_stderr": 0.016465345467391538 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.5833333333333334, + "acc_stderr": 0.033622774366080424, + "acc_norm": 0.5833333333333334, + "acc_norm_stderr": 0.033622774366080424 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.7647058823529411, + "acc_stderr": 0.02977177522814562, + "acc_norm": 0.7647058823529411, + "acc_norm_stderr": 0.02977177522814562 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.7552742616033755, + "acc_stderr": 0.02798569938703643, + "acc_norm": 0.7552742616033755, + "acc_norm_stderr": 0.02798569938703643 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.6547085201793722, + "acc_stderr": 0.03191100192835794, + "acc_norm": 0.6547085201793722, + "acc_norm_stderr": 0.03191100192835794 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.7862595419847328, + "acc_stderr": 0.0359546161177469, + "acc_norm": 0.7862595419847328, + "acc_norm_stderr": 0.0359546161177469 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.768595041322314, + "acc_stderr": 0.03849856098794088, + "acc_norm": 0.768595041322314, + "acc_norm_stderr": 0.03849856098794088 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.7314814814814815, + "acc_stderr": 0.042844679680521934, + "acc_norm": 0.7314814814814815, + "acc_norm_stderr": 0.042844679680521934 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.7852760736196319, + "acc_stderr": 0.03226219377286775, + "acc_norm": 0.7852760736196319, + "acc_norm_stderr": 0.03226219377286775 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.48214285714285715, + "acc_stderr": 0.047427623612430116, + "acc_norm": 0.48214285714285715, + "acc_norm_stderr": 0.047427623612430116 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.7864077669902912, + "acc_stderr": 0.040580420156460344, + "acc_norm": 0.7864077669902912, + "acc_norm_stderr": 0.040580420156460344 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.8846153846153846, + "acc_stderr": 0.02093019318517933, + "acc_norm": 0.8846153846153846, + "acc_norm_stderr": 0.02093019318517933 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.73, + "acc_stderr": 0.044619604333847394, + "acc_norm": 0.73, + "acc_norm_stderr": 0.044619604333847394 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.8109833971902938, + "acc_stderr": 0.014000791294407003, + "acc_norm": 0.8109833971902938, + "acc_norm_stderr": 0.014000791294407003 + }, + 
"harness|hendrycksTest-moral_disputes|5": { + "acc": 0.6994219653179191, + "acc_stderr": 0.0246853168672578, + "acc_norm": 0.6994219653179191, + "acc_norm_stderr": 0.0246853168672578 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.3229050279329609, + "acc_stderr": 0.015638440380241488, + "acc_norm": 0.3229050279329609, + "acc_norm_stderr": 0.015638440380241488 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.7254901960784313, + "acc_stderr": 0.02555316999182652, + "acc_norm": 0.7254901960784313, + "acc_norm_stderr": 0.02555316999182652 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.6945337620578779, + "acc_stderr": 0.02616058445014045, + "acc_norm": 0.6945337620578779, + "acc_norm_stderr": 0.02616058445014045 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.7129629629629629, + "acc_stderr": 0.02517104191530968, + "acc_norm": 0.7129629629629629, + "acc_norm_stderr": 0.02517104191530968 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.46808510638297873, + "acc_stderr": 0.029766675075873866, + "acc_norm": 0.46808510638297873, + "acc_norm_stderr": 0.029766675075873866 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.4367666232073012, + "acc_stderr": 0.012667701919603662, + "acc_norm": 0.4367666232073012, + "acc_norm_stderr": 0.012667701919603662 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.6617647058823529, + "acc_stderr": 0.028739328513983572, + "acc_norm": 0.6617647058823529, + "acc_norm_stderr": 0.028739328513983572 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.6568627450980392, + "acc_stderr": 0.019206606848825362, + "acc_norm": 0.6568627450980392, + "acc_norm_stderr": 0.019206606848825362 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.6363636363636364, + "acc_stderr": 0.04607582090719976, + "acc_norm": 0.6363636363636364, + "acc_norm_stderr": 0.04607582090719976 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.710204081632653, + "acc_stderr": 0.02904308868330433, + "acc_norm": 0.710204081632653, + "acc_norm_stderr": 0.02904308868330433 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.8407960199004975, + "acc_stderr": 0.025870646766169146, + "acc_norm": 0.8407960199004975, + "acc_norm_stderr": 0.025870646766169146 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.91, + "acc_stderr": 0.028762349126466125, + "acc_norm": 0.91, + "acc_norm_stderr": 0.028762349126466125 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.5542168674698795, + "acc_stderr": 0.038695433234721015, + "acc_norm": 0.5542168674698795, + "acc_norm_stderr": 0.038695433234721015 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.8538011695906432, + "acc_stderr": 0.027097290118070806, + "acc_norm": 0.8538011695906432, + "acc_norm_stderr": 0.027097290118070806 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.3182374541003672, + "mc1_stderr": 0.016305988648920626, + "mc2": 0.4695906948194394, + "mc2_stderr": 0.01494642651529255 + }, + "all": { + "acc": 0.6304998086039316, + "acc_stderr": 0.033058893340663746, + "acc_norm": 0.6346959437303671, + "acc_norm_stderr": 0.0330364656053113, + "mc1": 0.3182374541003672, + "mc1_stderr": 0.016305988648920626, + "mc2": 0.4695906948194394, + "mc2_stderr": 0.01494642651529255 + } + }, + "versions": { + "harness|arc:challenge|25": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + 
"harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "all": 0 + }, + "config_tasks": { + "harness|arc:challenge": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": 
"LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task" + }, + "summary_tasks": { + "harness|arc:challenge|25": { + "hashes": { + "hash_examples": "17b0cae357c0259e", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "e43adcaa871b1364", + "hash_cont_tokens": "289aa98c400841d8" + }, + "truncated": 0, + "non-truncated": 4687, + "padded": 4684, + "non-padded": 3, + "effective_few_shots": 25.0, + "num_truncated_few_shots": 0 + }, + "harness|hellaswag|10": { + "hashes": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + 
"hash_input_tokens": "08da6b3d0798f3e5", + "hash_cont_tokens": "ac460260c3e6efc9" + }, + "truncated": 0, + "non-truncated": 40168, + "padded": 40039, + "non-padded": 129, + "effective_few_shots": 10.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hashes": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "5e2b26eb9b4d08bf", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-anatomy|5": { + "hashes": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "d33cda9df28030eb", + "hash_cont_tokens": "a52a4f60d98cbe5c" + }, + "truncated": 0, + "non-truncated": 540, + "padded": 540, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-astronomy|5": { + "hashes": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "0dd50c500d64c57d", + "hash_cont_tokens": "10f7d8eeba97841d" + }, + "truncated": 0, + "non-truncated": 608, + "padded": 608, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-business_ethics|5": { + "hashes": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "40b524d0df3defc2", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hashes": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "1f87d12d677e0dfd", + "hash_cont_tokens": "edef9975ba9165b5" + }, + "truncated": 0, + "non-truncated": 1060, + "padded": 1056, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_biology|5": { + "hashes": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "dd6d69d8b13afbeb", + "hash_cont_tokens": "0aa103ec6602280b" + }, + "truncated": 0, + "non-truncated": 576, + "padded": 572, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_chemistry|5": { + "hashes": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "d45f3c401a00e97e", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_computer_science|5": { + "hashes": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "c04f21d954ae67b2", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_mathematics|5": { + "hashes": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "e7de03b4e1a407d8", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + 
"non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_medicine|5": { + "hashes": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "9ce9516475f0b09c", + "hash_cont_tokens": "1979021dbc698754" + }, + "truncated": 0, + "non-truncated": 692, + "padded": 684, + "non-padded": 8, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_physics|5": { + "hashes": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "f749592a0d6c967d", + "hash_cont_tokens": "7cf7fe2bab00acbd" + }, + "truncated": 0, + "non-truncated": 408, + "padded": 404, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-computer_security|5": { + "hashes": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "1a6dccf2066f3598", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hashes": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "6ce98c8aec8e7514", + "hash_cont_tokens": "903f64eed2b0d217" + }, + "truncated": 0, + "non-truncated": 940, + "padded": 940, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-econometrics|5": { + "hashes": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "7794b03bf6b9bb11", + "hash_cont_tokens": "721ae6c5302c4bf2" + }, + "truncated": 0, + "non-truncated": 456, + "padded": 456, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hashes": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "e47ff85e05850517", + "hash_cont_tokens": "15a738960ed3e587" + }, + "truncated": 0, + "non-truncated": 580, + "padded": 580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hashes": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "2ce6901704311790", + "hash_cont_tokens": "c96470462fc71683" + }, + "truncated": 0, + "non-truncated": 1512, + "padded": 1512, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-formal_logic|5": { + "hashes": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "fa49c3faa72a3955", + "hash_cont_tokens": "0e1ce025c9d6ee7e" + }, + "truncated": 0, + "non-truncated": 504, + "padded": 504, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-global_facts|5": { + "hashes": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "38992a391c7040d5", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 396, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_biology|5": { + "hashes": { + 
"hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "4944fad6e0578120", + "hash_cont_tokens": "e34d57f7d3c4ca16" + }, + "truncated": 0, + "non-truncated": 1240, + "padded": 1240, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hashes": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "bec955dfccee0331", + "hash_cont_tokens": "e8482d44df4b3740" + }, + "truncated": 0, + "non-truncated": 812, + "padded": 796, + "non-padded": 16, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hashes": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "2ccfe020e0a8e824", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hashes": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "5e5e8bf3808e0ead", + "hash_cont_tokens": "d63e679a49418339" + }, + "truncated": 0, + "non-truncated": 660, + "padded": 656, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_geography|5": { + "hashes": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "6a624d76e1b40f9d", + "hash_cont_tokens": "d78483e286d06f1a" + }, + "truncated": 0, + "non-truncated": 792, + "padded": 792, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hashes": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "8340aed0285230f4", + "hash_cont_tokens": "691cdff71ff5fe57" + }, + "truncated": 0, + "non-truncated": 772, + "padded": 772, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hashes": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "ca47137b1f3a769c", + "hash_cont_tokens": "d5ad4c5bdca967ad" + }, + "truncated": 0, + "non-truncated": 1560, + "padded": 1560, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hashes": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "c9d341ab62890f30", + "hash_cont_tokens": "8f631ca5687dd0d4" + }, + "truncated": 0, + "non-truncated": 1080, + "padded": 1080, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hashes": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "62573d06618ae7df", + "hash_cont_tokens": "7321048a28451473" + }, + "truncated": 0, + "non-truncated": 952, + "padded": 952, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_physics|5": { + "hashes": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": 
"fc78b4997e436734", + "hash_input_tokens": "ddddcaae96263221", + "hash_cont_tokens": "bb137581f269861c" + }, + "truncated": 0, + "non-truncated": 604, + "padded": 604, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hashes": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "ef9c1ae343139fdd", + "hash_cont_tokens": "b455cab2675bd863" + }, + "truncated": 0, + "non-truncated": 2180, + "padded": 2161, + "non-padded": 19, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hashes": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "eb4abd87b0e863cc", + "hash_cont_tokens": "1b3196fec7e58037" + }, + "truncated": 0, + "non-truncated": 864, + "padded": 864, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hashes": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "63548c7fa9ba7a78", + "hash_cont_tokens": "a331dedc2aa01b3e" + }, + "truncated": 0, + "non-truncated": 816, + "padded": 816, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hashes": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "83c5da18bfa50812", + "hash_cont_tokens": "d0fbe030b8c8c2bf" + }, + "truncated": 0, + "non-truncated": 948, + "padded": 948, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_aging|5": { + "hashes": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "c93c778cb8c58a32", + "hash_cont_tokens": "1dd29c3755494850" + }, + "truncated": 0, + "non-truncated": 892, + "padded": 892, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_sexuality|5": { + "hashes": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "1daed91f54b42f7d", + "hash_cont_tokens": "c85573f663c10691" + }, + "truncated": 0, + "non-truncated": 524, + "padded": 524, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-international_law|5": { + "hashes": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "cfdae69f75ee8670", + "hash_cont_tokens": "d263804ba918154f" + }, + "truncated": 0, + "non-truncated": 484, + "padded": 484, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-jurisprudence|5": { + "hashes": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "173979adbb5ab44e", + "hash_cont_tokens": "581986691a84ece8" + }, + "truncated": 0, + "non-truncated": 432, + "padded": 432, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hashes": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "7b7d06271aff55ff", + "hash_cont_tokens": "55a858b28bbda458" + }, + "truncated": 0, + 
"non-truncated": 652, + "padded": 652, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-machine_learning|5": { + "hashes": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "ca062cfd7c7fddcb", + "hash_cont_tokens": "e99d3d3efd4ac7a3" + }, + "truncated": 0, + "non-truncated": 448, + "padded": 445, + "non-padded": 3, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-management|5": { + "hashes": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "fc47171ffb714da3", + "hash_cont_tokens": "13d9dc56bca34726" + }, + "truncated": 0, + "non-truncated": 412, + "padded": 412, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-marketing|5": { + "hashes": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "aa29e9d883670c8f", + "hash_cont_tokens": "2700ea26933916a2" + }, + "truncated": 0, + "non-truncated": 936, + "padded": 936, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-medical_genetics|5": { + "hashes": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "88ad044b653ecaa5", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-miscellaneous|5": { + "hashes": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "f9e7e01573277484", + "hash_cont_tokens": "7bf4341c79587250" + }, + "truncated": 0, + "non-truncated": 3132, + "padded": 3132, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_disputes|5": { + "hashes": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "03728b9e48594c28", + "hash_cont_tokens": "38a48e9de6976f00" + }, + "truncated": 0, + "non-truncated": 1384, + "padded": 1360, + "non-padded": 24, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hashes": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "04a903966514d177", + "hash_cont_tokens": "761c4dc187689d89" + }, + "truncated": 0, + "non-truncated": 3580, + "padded": 3580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-nutrition|5": { + "hashes": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "a2176d3ac6f01cf0", + "hash_cont_tokens": "65005bd7d6f6012a" + }, + "truncated": 0, + "non-truncated": 1224, + "padded": 1224, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-philosophy|5": { + "hashes": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "a96dc872948245a8", + "hash_cont_tokens": "0b47934fb6314dec" + }, + "truncated": 0, + "non-truncated": 1244, + "padded": 1244, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-prehistory|5": { + "hashes": { + 
"hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "e0b03637947e9efa", + "hash_cont_tokens": "3f20acd855ee0a29" + }, + "truncated": 0, + "non-truncated": 1296, + "padded": 1296, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_accounting|5": { + "hashes": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "0b4c6d0e49c47ab4", + "hash_cont_tokens": "8f122ba881355d4b" + }, + "truncated": 0, + "non-truncated": 1128, + "padded": 1128, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_law|5": { + "hashes": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "bcbdbbde22ec73e3", + "hash_cont_tokens": "90d5df417c4d3fd3" + }, + "truncated": 0, + "non-truncated": 6136, + "padded": 6136, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_medicine|5": { + "hashes": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "c54d753563114d45", + "hash_cont_tokens": "4a2d2988884f7f70" + }, + "truncated": 0, + "non-truncated": 1088, + "padded": 1088, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_psychology|5": { + "hashes": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "9e6e34f48034edc0", + "hash_cont_tokens": "e0a952cb8a9c81de" + }, + "truncated": 0, + "non-truncated": 2448, + "padded": 2448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-public_relations|5": { + "hashes": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "634feb3f97d1064d", + "hash_cont_tokens": "1fa77a8dff3922b8" + }, + "truncated": 0, + "non-truncated": 440, + "padded": 440, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-security_studies|5": { + "hashes": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "ca8497342e5b1d57", + "hash_cont_tokens": "81fc9cb3cbdd52db" + }, + "truncated": 0, + "non-truncated": 980, + "padded": 980, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-sociology|5": { + "hashes": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "ae361375c940a0fb", + "hash_cont_tokens": "2a0493252ed2cf43" + }, + "truncated": 0, + "non-truncated": 804, + "padded": 800, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hashes": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "e8bdf33cf82d89f5", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-virology|5": { + "hashes": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "32ce831e0ba2d2e2", + "hash_cont_tokens": 
"5ab892d003b00c98" + }, + "truncated": 0, + "non-truncated": 664, + "padded": 664, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-world_religions|5": { + "hashes": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "4ed9b68c5694211b", + "hash_cont_tokens": "15a5e5dbdfbb8568" + }, + "truncated": 0, + "non-truncated": 684, + "padded": 684, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|truthfulqa:mc|0": { + "hashes": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "a30fbd9af05d717a", + "hash_cont_tokens": "5a8d4bb398b1c3c0" + }, + "truncated": 0, + "non-truncated": 9996, + "padded": 9996, + "non-padded": 0, + "effective_few_shots": 0.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "d84d18e9a963753d", + "hash_full_prompts": "12b540783521a8e6", + "hash_input_tokens": "3d86ffeb7677bd9d", + "hash_cont_tokens": "35527140510ee91a" + }, + "total_evaluation_time_secondes": "4117.690157651901", + "truncated": 0, + "non-truncated": 111019, + "padded": 110793, + "non-padded": 226, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/lgaalves/mistral-7b-platypus1k/results_2023-11-04T18-46-29.797939.json b/eval-results/lgaalves/mistral-7b-platypus1k/results_2023-11-04T18-46-29.797939.json new file mode 100644 index 0000000000000000000000000000000000000000..f202fe4deeb79ec4b3c36fe4e7b849ce104e46b1 --- /dev/null +++ b/eval-results/lgaalves/mistral-7b-platypus1k/results_2023-11-04T18-46-29.797939.json @@ -0,0 +1,107 @@ +{ + "config_general": { + "lighteval_sha": "167773f1d5d1647c60dadc31c9e731ab7dbcbbad", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "", + "model_name": "lgaalves/mistral-7b-platypus1k", + "model_sha": "5c931e9f83d8e956758d6d9fb69434fda7eb7997", + "model_dtype": "torch.float16", + "model_size": "13.99 GB" + }, + "results": { + "harness|drop|3": { + "em": 0.0018875838926174498, + "em_stderr": 0.0004445109990558977, + "f1": 0.05987311241610734, + "f1_stderr": 0.001362358723340712 + }, + "harness|gsm8k|5": { + "acc": 0.16376042456406367, + "acc_stderr": 0.010193237214420947 + }, + "harness|winogrande|5": { + "acc": 0.7813733228097869, + "acc_stderr": 0.011616198215773223 + }, + "all": { + "em": 0.0018875838926174498, + "em_stderr": 0.0004445109990558977, + "f1": 0.05987311241610734, + "f1_stderr": 0.001362358723340712, + "acc": 0.4725668736869253, + "acc_stderr": 0.010904717715097085 + } + }, + "versions": { + "all": 0, + "harness|drop|3": 1, + "harness|gsm8k|5": 0, + "harness|winogrande|5": 0 + }, + "config_tasks": { + "harness|drop": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|drop|3": { + "hashes": { + "hash_examples": "1d27416e8324e9a3", + "hash_full_prompts": "a5513ff9a741b385", + "hash_input_tokens": "a4fb946366902edf", + "hash_cont_tokens": "02e8ef9a5a2b84a0" + }, + "truncated": 0, + "non_truncated": 9536, + "padded": 0, + "non_padded": 9536, + "effective_few_shots": 3.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "6af0ae8cfe684f50", + "hash_cont_tokens": "e5bcf7a1367ecb4e" + }, + 
"truncated": 0, + "non_truncated": 1319, + "padded": 0, + "non_padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "6bf335f26fed6442", + "hash_cont_tokens": "618558fb93c0f288" + }, + "truncated": 0, + "non_truncated": 1267, + "padded": 2397, + "non_padded": 137, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "9b4d8993161e637d", + "hash_full_prompts": "08215e527b7e60a5", + "hash_input_tokens": "7c6dc027ca91c1a8", + "hash_cont_tokens": "d6b42609900d5203" + }, + "truncated": 0, + "non_truncated": 12122, + "padded": 2397, + "non_padded": 10992, + "num_truncated_few_shots": 0, + "total_evaluation_time_secondes": 0 + } +} \ No newline at end of file diff --git a/eval-results/lgaalves/mistral-7b-platypus1k/results_2023-11-06T16-30-05.854824.json b/eval-results/lgaalves/mistral-7b-platypus1k/results_2023-11-06T16-30-05.854824.json new file mode 100644 index 0000000000000000000000000000000000000000..f202fe4deeb79ec4b3c36fe4e7b849ce104e46b1 --- /dev/null +++ b/eval-results/lgaalves/mistral-7b-platypus1k/results_2023-11-06T16-30-05.854824.json @@ -0,0 +1,107 @@ +{ + "config_general": { + "lighteval_sha": "167773f1d5d1647c60dadc31c9e731ab7dbcbbad", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "", + "model_name": "lgaalves/mistral-7b-platypus1k", + "model_sha": "5c931e9f83d8e956758d6d9fb69434fda7eb7997", + "model_dtype": "torch.float16", + "model_size": "13.99 GB" + }, + "results": { + "harness|drop|3": { + "em": 0.0018875838926174498, + "em_stderr": 0.0004445109990558977, + "f1": 0.05987311241610734, + "f1_stderr": 0.001362358723340712 + }, + "harness|gsm8k|5": { + "acc": 0.16376042456406367, + "acc_stderr": 0.010193237214420947 + }, + "harness|winogrande|5": { + "acc": 0.7813733228097869, + "acc_stderr": 0.011616198215773223 + }, + "all": { + "em": 0.0018875838926174498, + "em_stderr": 0.0004445109990558977, + "f1": 0.05987311241610734, + "f1_stderr": 0.001362358723340712, + "acc": 0.4725668736869253, + "acc_stderr": 0.010904717715097085 + } + }, + "versions": { + "all": 0, + "harness|drop|3": 1, + "harness|gsm8k|5": 0, + "harness|winogrande|5": 0 + }, + "config_tasks": { + "harness|drop": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|drop|3": { + "hashes": { + "hash_examples": "1d27416e8324e9a3", + "hash_full_prompts": "a5513ff9a741b385", + "hash_input_tokens": "a4fb946366902edf", + "hash_cont_tokens": "02e8ef9a5a2b84a0" + }, + "truncated": 0, + "non_truncated": 9536, + "padded": 0, + "non_padded": 9536, + "effective_few_shots": 3.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "6af0ae8cfe684f50", + "hash_cont_tokens": "e5bcf7a1367ecb4e" + }, + "truncated": 0, + "non_truncated": 1319, + "padded": 0, + "non_padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "6bf335f26fed6442", + "hash_cont_tokens": "618558fb93c0f288" + }, + "truncated": 0, + "non_truncated": 1267, + "padded": 2397, + 
"non_padded": 137, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "9b4d8993161e637d", + "hash_full_prompts": "08215e527b7e60a5", + "hash_input_tokens": "7c6dc027ca91c1a8", + "hash_cont_tokens": "d6b42609900d5203" + }, + "truncated": 0, + "non_truncated": 12122, + "padded": 2397, + "non_padded": 10992, + "num_truncated_few_shots": 0, + "total_evaluation_time_secondes": 0 + } +} \ No newline at end of file diff --git a/eval-results/lgaalves/mistral-7b_open_platypus/results_2023-11-18T19-20-26.136874.json b/eval-results/lgaalves/mistral-7b_open_platypus/results_2023-11-18T19-20-26.136874.json new file mode 100644 index 0000000000000000000000000000000000000000..bb798b72bc99e6477a52fa7f87bc34c0a925c082 --- /dev/null +++ b/eval-results/lgaalves/mistral-7b_open_platypus/results_2023-11-18T19-20-26.136874.json @@ -0,0 +1,1435 @@ +{ + "config_general": { + "lighteval_sha": "9ffc410f6c40b8cfefe7167cb47aefe69ced61e1", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "", + "start_time": 156020.961315997, + "end_time": 170190.092480873, + "total_evaluation_time_secondes": "14169.131164875987", + "model_name": "lgaalves/mistral-7b_open_platypus", + "model_sha": "b9a60b9ad0fe06bd314ffe99d543f1df6ecd10da", + "model_dtype": "torch.float16", + "model_size": "13.99 GB" + }, + "results": { + "harness|arc:challenge|25": { + "acc": 0.5332764505119454, + "acc_stderr": 0.014578995859605808, + "acc_norm": 0.5580204778156996, + "acc_norm_stderr": 0.014512682523128343 + }, + "harness|hellaswag|10": { + "acc": 0.6120294761999602, + "acc_stderr": 0.004862919176408075, + "acc_norm": 0.8212507468631747, + "acc_norm_stderr": 0.003823591814133036 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.27, + "acc_stderr": 0.044619604333847394, + "acc_norm": 0.27, + "acc_norm_stderr": 0.044619604333847394 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.5703703703703704, + "acc_stderr": 0.042763494943765995, + "acc_norm": 0.5703703703703704, + "acc_norm_stderr": 0.042763494943765995 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.6381578947368421, + "acc_stderr": 0.03910525752849724, + "acc_norm": 0.6381578947368421, + "acc_norm_stderr": 0.03910525752849724 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.55, + "acc_stderr": 0.05, + "acc_norm": 0.55, + "acc_norm_stderr": 0.05 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.6566037735849056, + "acc_stderr": 0.02922452646912479, + "acc_norm": 0.6566037735849056, + "acc_norm_stderr": 0.02922452646912479 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.6458333333333334, + "acc_stderr": 0.039994111357535424, + "acc_norm": 0.6458333333333334, + "acc_norm_stderr": 0.039994111357535424 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.46, + "acc_stderr": 0.05009082659620333, + "acc_norm": 0.46, + "acc_norm_stderr": 0.05009082659620333 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.51, + "acc_stderr": 0.05024183937956911, + "acc_norm": 0.51, + "acc_norm_stderr": 0.05024183937956911 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.37, + "acc_stderr": 0.048523658709390974, + "acc_norm": 0.37, + "acc_norm_stderr": 0.048523658709390974 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.5895953757225434, + "acc_stderr": 0.03750757044895536, + "acc_norm": 0.5895953757225434, + "acc_norm_stderr": 
0.03750757044895536 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.3137254901960784, + "acc_stderr": 0.04617034827006717, + "acc_norm": 0.3137254901960784, + "acc_norm_stderr": 0.04617034827006717 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.77, + "acc_stderr": 0.04229525846816506, + "acc_norm": 0.77, + "acc_norm_stderr": 0.04229525846816506 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.4978723404255319, + "acc_stderr": 0.03268572658667492, + "acc_norm": 0.4978723404255319, + "acc_norm_stderr": 0.03268572658667492 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.4649122807017544, + "acc_stderr": 0.04692008381368909, + "acc_norm": 0.4649122807017544, + "acc_norm_stderr": 0.04692008381368909 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.5103448275862069, + "acc_stderr": 0.041657747757287644, + "acc_norm": 0.5103448275862069, + "acc_norm_stderr": 0.041657747757287644 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.42328042328042326, + "acc_stderr": 0.025446365634406776, + "acc_norm": 0.42328042328042326, + "acc_norm_stderr": 0.025446365634406776 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.373015873015873, + "acc_stderr": 0.04325506042017086, + "acc_norm": 0.373015873015873, + "acc_norm_stderr": 0.04325506042017086 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.36, + "acc_stderr": 0.04824181513244218, + "acc_norm": 0.36, + "acc_norm_stderr": 0.04824181513244218 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.6806451612903226, + "acc_stderr": 0.026522709674667765, + "acc_norm": 0.6806451612903226, + "acc_norm_stderr": 0.026522709674667765 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.46798029556650245, + "acc_stderr": 0.03510766597959217, + "acc_norm": 0.46798029556650245, + "acc_norm_stderr": 0.03510766597959217 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.64, + "acc_stderr": 0.04824181513244218, + "acc_norm": 0.64, + "acc_norm_stderr": 0.04824181513244218 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.7393939393939394, + "acc_stderr": 0.034277431758165236, + "acc_norm": 0.7393939393939394, + "acc_norm_stderr": 0.034277431758165236 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.7272727272727273, + "acc_stderr": 0.03173071239071724, + "acc_norm": 0.7272727272727273, + "acc_norm_stderr": 0.03173071239071724 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.8497409326424871, + "acc_stderr": 0.025787723180723872, + "acc_norm": 0.8497409326424871, + "acc_norm_stderr": 0.025787723180723872 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.5512820512820513, + "acc_stderr": 0.025217315184846486, + "acc_norm": 0.5512820512820513, + "acc_norm_stderr": 0.025217315184846486 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.2814814814814815, + "acc_stderr": 0.02742001935094528, + "acc_norm": 0.2814814814814815, + "acc_norm_stderr": 0.02742001935094528 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.5672268907563025, + "acc_stderr": 0.032183581077426124, + "acc_norm": 0.5672268907563025, + "acc_norm_stderr": 0.032183581077426124 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.3443708609271523, + "acc_stderr": 0.03879687024073327, + "acc_norm": 0.3443708609271523, + "acc_norm_stderr": 0.03879687024073327 + }, + 
"harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.7724770642201835, + "acc_stderr": 0.017974463578776502, + "acc_norm": 0.7724770642201835, + "acc_norm_stderr": 0.017974463578776502 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.39814814814814814, + "acc_stderr": 0.033384734032074016, + "acc_norm": 0.39814814814814814, + "acc_norm_stderr": 0.033384734032074016 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.7696078431372549, + "acc_stderr": 0.02955429260569507, + "acc_norm": 0.7696078431372549, + "acc_norm_stderr": 0.02955429260569507 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.7848101265822784, + "acc_stderr": 0.026750826994676177, + "acc_norm": 0.7848101265822784, + "acc_norm_stderr": 0.026750826994676177 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.726457399103139, + "acc_stderr": 0.029918586707798834, + "acc_norm": 0.726457399103139, + "acc_norm_stderr": 0.029918586707798834 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.6564885496183206, + "acc_stderr": 0.041649760719448786, + "acc_norm": 0.6564885496183206, + "acc_norm_stderr": 0.041649760719448786 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.8429752066115702, + "acc_stderr": 0.03321244842547128, + "acc_norm": 0.8429752066115702, + "acc_norm_stderr": 0.03321244842547128 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.7037037037037037, + "acc_stderr": 0.044143436668549335, + "acc_norm": 0.7037037037037037, + "acc_norm_stderr": 0.044143436668549335 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.7668711656441718, + "acc_stderr": 0.0332201579577674, + "acc_norm": 0.7668711656441718, + "acc_norm_stderr": 0.0332201579577674 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.4642857142857143, + "acc_stderr": 0.04733667890053756, + "acc_norm": 0.4642857142857143, + "acc_norm_stderr": 0.04733667890053756 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.6990291262135923, + "acc_stderr": 0.04541609446503948, + "acc_norm": 0.6990291262135923, + "acc_norm_stderr": 0.04541609446503948 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.8333333333333334, + "acc_stderr": 0.024414947304543678, + "acc_norm": 0.8333333333333334, + "acc_norm_stderr": 0.024414947304543678 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.69, + "acc_stderr": 0.04648231987117316, + "acc_norm": 0.69, + "acc_norm_stderr": 0.04648231987117316 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.7956577266922095, + "acc_stderr": 0.014419123980931895, + "acc_norm": 0.7956577266922095, + "acc_norm_stderr": 0.014419123980931895 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.6936416184971098, + "acc_stderr": 0.024818350129436593, + "acc_norm": 0.6936416184971098, + "acc_norm_stderr": 0.024818350129436593 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.376536312849162, + "acc_stderr": 0.016204672385106603, + "acc_norm": 0.376536312849162, + "acc_norm_stderr": 0.016204672385106603 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.6470588235294118, + "acc_stderr": 0.027363593284684972, + "acc_norm": 0.6470588235294118, + "acc_norm_stderr": 0.027363593284684972 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.7041800643086816, + "acc_stderr": 0.025922371788818763, + "acc_norm": 0.7041800643086816, + "acc_norm_stderr": 0.025922371788818763 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.7129629629629629, + 
"acc_stderr": 0.02517104191530968, + "acc_norm": 0.7129629629629629, + "acc_norm_stderr": 0.02517104191530968 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.4858156028368794, + "acc_stderr": 0.02981549448368206, + "acc_norm": 0.4858156028368794, + "acc_norm_stderr": 0.02981549448368206 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.44589308996088656, + "acc_stderr": 0.012695244711379774, + "acc_norm": 0.44589308996088656, + "acc_norm_stderr": 0.012695244711379774 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.5919117647058824, + "acc_stderr": 0.029855261393483924, + "acc_norm": 0.5919117647058824, + "acc_norm_stderr": 0.029855261393483924 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.6209150326797386, + "acc_stderr": 0.01962744474841223, + "acc_norm": 0.6209150326797386, + "acc_norm_stderr": 0.01962744474841223 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.6272727272727273, + "acc_stderr": 0.04631381319425465, + "acc_norm": 0.6272727272727273, + "acc_norm_stderr": 0.04631381319425465 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.6244897959183674, + "acc_stderr": 0.03100120903989484, + "acc_norm": 0.6244897959183674, + "acc_norm_stderr": 0.03100120903989484 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.7512437810945274, + "acc_stderr": 0.030567675938916714, + "acc_norm": 0.7512437810945274, + "acc_norm_stderr": 0.030567675938916714 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.79, + "acc_stderr": 0.040936018074033256, + "acc_norm": 0.79, + "acc_norm_stderr": 0.040936018074033256 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.4879518072289157, + "acc_stderr": 0.0389136449583582, + "acc_norm": 0.4879518072289157, + "acc_norm_stderr": 0.0389136449583582 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.7777777777777778, + "acc_stderr": 0.031885780176863984, + "acc_norm": 0.7777777777777778, + "acc_norm_stderr": 0.031885780176863984 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.3292533659730722, + "mc1_stderr": 0.016451264440068232, + "mc2": 0.48869138188349615, + "mc2_stderr": 0.0147358552004315 + }, + "harness|winogrande|5": { + "acc": 0.7861089187056038, + "acc_stderr": 0.011524466954090254 + }, + "harness|drop|3": { + "em": 0.0036703020134228187, + "em_stderr": 0.0006192871806511272, + "f1": 0.06589450503355675, + "f1_stderr": 0.0014663770308574477 + }, + "harness|gsm8k|5": { + "acc": 0.12585291887793784, + "acc_stderr": 0.009136212598406307 + }, + "all": { + "acc": 0.5921618091275235, + "acc_stderr": 0.033165593817109554, + "acc_norm": 0.6007436240197009, + "acc_norm_stderr": 0.03392093055241413, + "mc1": 0.3292533659730722, + "mc1_stderr": 0.016451264440068232, + "mc2": 0.48869138188349615, + "mc2_stderr": 0.0147358552004315, + "em": 0.0036703020134228187, + "em_stderr": 0.0006192871806511272, + "f1": 0.06589450503355675, + "f1_stderr": 0.0014663770308574477 + } + }, + "versions": { + "all": 0, + "harness|arc:challenge|25": 0, + "harness|drop|3": 1, + "harness|gsm8k|5": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + 
"harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "harness|winogrande|5": 0 + }, + "config_tasks": { + "harness|arc:challenge": "LM Harness task", + "harness|drop": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + 
"harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|arc:challenge|25": { + "hashes": { + "hash_examples": "17b0cae357c0259e", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "9bcd0d1d37471713", + "hash_cont_tokens": "289aa98c400841d8" + }, + "truncated": 0, + "non_truncated": 1172, + "padded": 4670, + "non_padded": 17, + "effective_few_shots": 25.0, + "num_truncated_few_shots": 0 + }, + "harness|hellaswag|10": { + "hashes": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "80b8c6d79740318e", + "hash_cont_tokens": "ac460260c3e6efc9" + }, + "truncated": 0, + 
"non_truncated": 10042, + "padded": 40101, + "non_padded": 67, + "effective_few_shots": 10.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hashes": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "b813d36287c6556c", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-anatomy|5": { + "hashes": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "09dc2380497f7a47", + "hash_cont_tokens": "a52a4f60d98cbe5c" + }, + "truncated": 0, + "non_truncated": 135, + "padded": 540, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-astronomy|5": { + "hashes": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "68ca3220b0fdd1f3", + "hash_cont_tokens": "10f7d8eeba97841d" + }, + "truncated": 0, + "non_truncated": 152, + "padded": 608, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-business_ethics|5": { + "hashes": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "bd14ef1320de241e", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hashes": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "d96186ab98017c43", + "hash_cont_tokens": "edef9975ba9165b5" + }, + "truncated": 0, + "non_truncated": 265, + "padded": 1060, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_biology|5": { + "hashes": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "424136b34e95b200", + "hash_cont_tokens": "0aa103ec6602280b" + }, + "truncated": 0, + "non_truncated": 144, + "padded": 576, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_chemistry|5": { + "hashes": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "8dd8b80e336bbe54", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_computer_science|5": { + "hashes": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "145d4cef8ca2261d", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_mathematics|5": { + "hashes": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "561995d32d2b25c4", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + 
"harness|hendrycksTest-college_medicine|5": { + "hashes": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "6a258a9d4418599c", + "hash_cont_tokens": "1979021dbc698754" + }, + "truncated": 0, + "non_truncated": 173, + "padded": 692, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_physics|5": { + "hashes": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "fa5e0d5b5f97b66a", + "hash_cont_tokens": "7cf7fe2bab00acbd" + }, + "truncated": 0, + "non_truncated": 102, + "padded": 408, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-computer_security|5": { + "hashes": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "07d27397edfae492", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hashes": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "da5e6c3c8eb17da6", + "hash_cont_tokens": "903f64eed2b0d217" + }, + "truncated": 0, + "non_truncated": 235, + "padded": 940, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-econometrics|5": { + "hashes": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "f6ba8e358bdb523e", + "hash_cont_tokens": "721ae6c5302c4bf2" + }, + "truncated": 0, + "non_truncated": 114, + "padded": 456, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hashes": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "b2459da4c5ca8590", + "hash_cont_tokens": "15a738960ed3e587" + }, + "truncated": 0, + "non_truncated": 145, + "padded": 575, + "non_padded": 5, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hashes": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "0b969d9ad706a13a", + "hash_cont_tokens": "c96470462fc71683" + }, + "truncated": 0, + "non_truncated": 378, + "padded": 1512, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-formal_logic|5": { + "hashes": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "02bc3eb5f90da86e", + "hash_cont_tokens": "0e1ce025c9d6ee7e" + }, + "truncated": 0, + "non_truncated": 126, + "padded": 504, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-global_facts|5": { + "hashes": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "3d5106918bcbeb43", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_biology|5": { + "hashes": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + 
"hash_input_tokens": "7b089392db2dabbd", + "hash_cont_tokens": "e34d57f7d3c4ca16" + }, + "truncated": 0, + "non_truncated": 310, + "padded": 1240, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hashes": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "ba90b2ffed1c067d", + "hash_cont_tokens": "e8482d44df4b3740" + }, + "truncated": 0, + "non_truncated": 203, + "padded": 812, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hashes": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "60eeec309ef0717f", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hashes": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "5e5e8bf3808e0ead", + "hash_cont_tokens": "d63e679a49418339" + }, + "truncated": 0, + "non_truncated": 165, + "padded": 656, + "non_padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_geography|5": { + "hashes": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "4da9b741d4e7ea78", + "hash_cont_tokens": "d78483e286d06f1a" + }, + "truncated": 0, + "non_truncated": 198, + "padded": 792, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hashes": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "acb4bc872ac86ed7", + "hash_cont_tokens": "691cdff71ff5fe57" + }, + "truncated": 0, + "non_truncated": 193, + "padded": 772, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hashes": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "840fc6403eb69ab0", + "hash_cont_tokens": "d5ad4c5bdca967ad" + }, + "truncated": 0, + "non_truncated": 390, + "padded": 1560, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hashes": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "3629a7f2cd17faeb", + "hash_cont_tokens": "8f631ca5687dd0d4" + }, + "truncated": 0, + "non_truncated": 270, + "padded": 1080, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hashes": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "6846f684260e3997", + "hash_cont_tokens": "7321048a28451473" + }, + "truncated": 0, + "non_truncated": 238, + "padded": 952, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_physics|5": { + "hashes": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "85aee25d6bdad94a", + "hash_cont_tokens": 
"bb137581f269861c" + }, + "truncated": 0, + "non_truncated": 151, + "padded": 604, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hashes": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "290b66d6d666a35f", + "hash_cont_tokens": "b455cab2675bd863" + }, + "truncated": 0, + "non_truncated": 545, + "padded": 2180, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hashes": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "a77a7668b437bc82", + "hash_cont_tokens": "1b3196fec7e58037" + }, + "truncated": 0, + "non_truncated": 216, + "padded": 864, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hashes": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "63548c7fa9ba7a78", + "hash_cont_tokens": "a331dedc2aa01b3e" + }, + "truncated": 0, + "non_truncated": 204, + "padded": 816, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hashes": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "83c5da18bfa50812", + "hash_cont_tokens": "d0fbe030b8c8c2bf" + }, + "truncated": 0, + "non_truncated": 237, + "padded": 948, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_aging|5": { + "hashes": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "bebbd11f22006685", + "hash_cont_tokens": "1dd29c3755494850" + }, + "truncated": 0, + "non_truncated": 223, + "padded": 892, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_sexuality|5": { + "hashes": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "7b85ee9b8ee54f4f", + "hash_cont_tokens": "c85573f663c10691" + }, + "truncated": 0, + "non_truncated": 131, + "padded": 524, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-international_law|5": { + "hashes": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "7bfc55ab7065943e", + "hash_cont_tokens": "d263804ba918154f" + }, + "truncated": 0, + "non_truncated": 121, + "padded": 484, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-jurisprudence|5": { + "hashes": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "69573f1675e053c6", + "hash_cont_tokens": "581986691a84ece8" + }, + "truncated": 0, + "non_truncated": 108, + "padded": 432, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hashes": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "552324ef20094bdc", + "hash_cont_tokens": "55a858b28bbda458" + }, + "truncated": 0, + "non_truncated": 163, + "padded": 652, + "non_padded": 0, + "effective_few_shots": 5.0, + 
"num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-machine_learning|5": { + "hashes": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "96449357a7318905", + "hash_cont_tokens": "e99d3d3efd4ac7a3" + }, + "truncated": 0, + "non_truncated": 112, + "padded": 448, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-management|5": { + "hashes": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "3b849249168e3b88", + "hash_cont_tokens": "13d9dc56bca34726" + }, + "truncated": 0, + "non_truncated": 103, + "padded": 412, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-marketing|5": { + "hashes": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "af0e186f2756b70d", + "hash_cont_tokens": "2700ea26933916a2" + }, + "truncated": 0, + "non_truncated": 234, + "padded": 936, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-medical_genetics|5": { + "hashes": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "9f6a6de16509b6d9", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-miscellaneous|5": { + "hashes": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "9194406d589f7c10", + "hash_cont_tokens": "7bf4341c79587250" + }, + "truncated": 0, + "non_truncated": 783, + "padded": 3132, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_disputes|5": { + "hashes": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "769486efc74d9f8e", + "hash_cont_tokens": "38a48e9de6976f00" + }, + "truncated": 0, + "non_truncated": 346, + "padded": 1384, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hashes": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "a90fd4dd90959dad", + "hash_cont_tokens": "761c4dc187689d89" + }, + "truncated": 0, + "non_truncated": 895, + "padded": 3580, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-nutrition|5": { + "hashes": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "1a3b843e66efd29b", + "hash_cont_tokens": "65005bd7d6f6012a" + }, + "truncated": 0, + "non_truncated": 306, + "padded": 1224, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-philosophy|5": { + "hashes": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "09820001a3d00013", + "hash_cont_tokens": "0b47934fb6314dec" + }, + "truncated": 0, + "non_truncated": 311, + "padded": 1244, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-prehistory|5": { + "hashes": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + 
"hash_input_tokens": "7c4ec364ce2768c7", + "hash_cont_tokens": "3f20acd855ee0a29" + }, + "truncated": 0, + "non_truncated": 324, + "padded": 1296, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_accounting|5": { + "hashes": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "ced0534574d0ae3f", + "hash_cont_tokens": "8f122ba881355d4b" + }, + "truncated": 0, + "non_truncated": 282, + "padded": 1128, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_law|5": { + "hashes": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "bcbdbbde22ec73e3", + "hash_cont_tokens": "90d5df417c4d3fd3" + }, + "truncated": 0, + "non_truncated": 1534, + "padded": 6136, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_medicine|5": { + "hashes": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "c54d753563114d45", + "hash_cont_tokens": "4a2d2988884f7f70" + }, + "truncated": 0, + "non_truncated": 272, + "padded": 1088, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_psychology|5": { + "hashes": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "b75dc55c0e32fa52", + "hash_cont_tokens": "e0a952cb8a9c81de" + }, + "truncated": 0, + "non_truncated": 612, + "padded": 2448, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-public_relations|5": { + "hashes": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "5ccdc8ec8db99622", + "hash_cont_tokens": "1fa77a8dff3922b8" + }, + "truncated": 0, + "non_truncated": 110, + "padded": 440, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-security_studies|5": { + "hashes": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "ca8497342e5b1d57", + "hash_cont_tokens": "81fc9cb3cbdd52db" + }, + "truncated": 0, + "non_truncated": 245, + "padded": 980, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-sociology|5": { + "hashes": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "069c76424fbd3dab", + "hash_cont_tokens": "2a0493252ed2cf43" + }, + "truncated": 0, + "non_truncated": 201, + "padded": 804, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hashes": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "a7e393a626169576", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-virology|5": { + "hashes": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "bf99dc973e3a650d", + "hash_cont_tokens": "5ab892d003b00c98" + }, + "truncated": 0, + "non_truncated": 166, + "padded": 664, + 
"non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-world_religions|5": { + "hashes": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "1761cfaf21797065", + "hash_cont_tokens": "15a5e5dbdfbb8568" + }, + "truncated": 0, + "non_truncated": 171, + "padded": 684, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|truthfulqa:mc|0": { + "hashes": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "298b43914bbdf4ca", + "hash_cont_tokens": "5a8d4bb398b1c3c0" + }, + "truncated": 0, + "non_truncated": 817, + "padded": 9996, + "non_padded": 0, + "effective_few_shots": 0.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "31aa3477d959f771", + "hash_cont_tokens": "618558fb93c0f288" + }, + "truncated": 0, + "non_truncated": 1267, + "padded": 2534, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|drop|3": { + "hashes": { + "hash_examples": "1d27416e8324e9a3", + "hash_full_prompts": "a5513ff9a741b385", + "hash_input_tokens": "a4fb946366902edf", + "hash_cont_tokens": "3b04c126e1a2a14a" + }, + "truncated": 0, + "non_truncated": 9536, + "padded": 0, + "non_padded": 9536, + "effective_few_shots": 3.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "6af0ae8cfe684f50", + "hash_cont_tokens": "4e6a989c3cde06b3" + }, + "truncated": 0, + "non_truncated": 1319, + "padded": 0, + "non_padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "4eb459f19fc0f29d", + "hash_full_prompts": "21653ed56f202b4e", + "hash_input_tokens": "0ce409b3d436569d", + "hash_cont_tokens": "ca7fcf9306d84aeb" + }, + "truncated": 0, + "non_truncated": 38195, + "padded": 113460, + "non_padded": 10948, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/lgaalves/tinyllama-1.1b-chat-v0.3_platypus/results_2023-10-10T14-53-56.428911.json b/eval-results/lgaalves/tinyllama-1.1b-chat-v0.3_platypus/results_2023-10-10T14-53-56.428911.json new file mode 100644 index 0000000000000000000000000000000000000000..7d30a8ef44b165f375a0f99a3c210bc9a4283c87 --- /dev/null +++ b/eval-results/lgaalves/tinyllama-1.1b-chat-v0.3_platypus/results_2023-10-10T14-53-56.428911.json @@ -0,0 +1,1367 @@ +{ + "config_general": { + "model_name": "lgaalves/tinyllama-1.1b-chat-v0.3_platypus", + "model_sha": "0bb6ebe1d41d394bae0ed9107ec8d776d9d76a68", + "model_size": "2.06 GB", + "model_dtype": "torch.float16", + "lighteval_sha": "0f318ecf002208468154899217b3ba7c6ae09374", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|arc:challenge|25": { + "acc": 0.2764505119453925, + "acc_stderr": 0.013069662474252428, + "acc_norm": 0.302901023890785, + "acc_norm_stderr": 0.013428241573185349 + }, + "harness|hellaswag|10": { + "acc": 0.41963752240589525, + "acc_stderr": 0.0049249104331063566, + "acc_norm": 0.551185022903804, + "acc_norm_stderr": 0.00496356702912906 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.35, + "acc_stderr": 
0.0479372485441102, + "acc_norm": 0.35, + "acc_norm_stderr": 0.0479372485441102 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.3333333333333333, + "acc_stderr": 0.04072314811876837, + "acc_norm": 0.3333333333333333, + "acc_norm_stderr": 0.04072314811876837 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.32894736842105265, + "acc_stderr": 0.03823428969926604, + "acc_norm": 0.32894736842105265, + "acc_norm_stderr": 0.03823428969926604 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.27, + "acc_stderr": 0.04461960433384739, + "acc_norm": 0.27, + "acc_norm_stderr": 0.04461960433384739 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.20754716981132076, + "acc_stderr": 0.024959918028911277, + "acc_norm": 0.20754716981132076, + "acc_norm_stderr": 0.024959918028911277 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.2638888888888889, + "acc_stderr": 0.03685651095897532, + "acc_norm": 0.2638888888888889, + "acc_norm_stderr": 0.03685651095897532 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.19, + "acc_stderr": 0.039427724440366255, + "acc_norm": 0.19, + "acc_norm_stderr": 0.039427724440366255 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.3, + "acc_stderr": 0.046056618647183814, + "acc_norm": 0.3, + "acc_norm_stderr": 0.046056618647183814 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.26, + "acc_stderr": 0.04408440022768079, + "acc_norm": 0.26, + "acc_norm_stderr": 0.04408440022768079 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.2543352601156069, + "acc_stderr": 0.0332055644308557, + "acc_norm": 0.2543352601156069, + "acc_norm_stderr": 0.0332055644308557 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.21568627450980393, + "acc_stderr": 0.04092563958237655, + "acc_norm": 0.21568627450980393, + "acc_norm_stderr": 0.04092563958237655 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.32, + "acc_stderr": 0.04688261722621504, + "acc_norm": 0.32, + "acc_norm_stderr": 0.04688261722621504 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.2170212765957447, + "acc_stderr": 0.026947483121496234, + "acc_norm": 0.2170212765957447, + "acc_norm_stderr": 0.026947483121496234 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.24561403508771928, + "acc_stderr": 0.040493392977481404, + "acc_norm": 0.24561403508771928, + "acc_norm_stderr": 0.040493392977481404 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.30344827586206896, + "acc_stderr": 0.038312260488503336, + "acc_norm": 0.30344827586206896, + "acc_norm_stderr": 0.038312260488503336 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.25396825396825395, + "acc_stderr": 0.022418042891113942, + "acc_norm": 0.25396825396825395, + "acc_norm_stderr": 0.022418042891113942 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.1984126984126984, + "acc_stderr": 0.03567016675276863, + "acc_norm": 0.1984126984126984, + "acc_norm_stderr": 0.03567016675276863 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.33, + "acc_stderr": 0.04725815626252604, + "acc_norm": 0.33, + "acc_norm_stderr": 0.04725815626252604 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.23870967741935484, + "acc_stderr": 0.024251071262208834, + "acc_norm": 0.23870967741935484, + "acc_norm_stderr": 0.024251071262208834 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.23645320197044334, + "acc_stderr": 
0.029896114291733545, + "acc_norm": 0.23645320197044334, + "acc_norm_stderr": 0.029896114291733545 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.33, + "acc_stderr": 0.047258156262526045, + "acc_norm": 0.33, + "acc_norm_stderr": 0.047258156262526045 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.2606060606060606, + "acc_stderr": 0.03427743175816525, + "acc_norm": 0.2606060606060606, + "acc_norm_stderr": 0.03427743175816525 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.25757575757575757, + "acc_stderr": 0.03115626951964686, + "acc_norm": 0.25757575757575757, + "acc_norm_stderr": 0.03115626951964686 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.24352331606217617, + "acc_stderr": 0.030975436386845447, + "acc_norm": 0.24352331606217617, + "acc_norm_stderr": 0.030975436386845447 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.2358974358974359, + "acc_stderr": 0.021525965407408726, + "acc_norm": 0.2358974358974359, + "acc_norm_stderr": 0.021525965407408726 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.27037037037037037, + "acc_stderr": 0.02708037281514566, + "acc_norm": 0.27037037037037037, + "acc_norm_stderr": 0.02708037281514566 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.20168067226890757, + "acc_stderr": 0.026064313406304527, + "acc_norm": 0.20168067226890757, + "acc_norm_stderr": 0.026064313406304527 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.2980132450331126, + "acc_stderr": 0.037345356767871984, + "acc_norm": 0.2980132450331126, + "acc_norm_stderr": 0.037345356767871984 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.23302752293577983, + "acc_stderr": 0.01812566918086148, + "acc_norm": 0.23302752293577983, + "acc_norm_stderr": 0.01812566918086148 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.2777777777777778, + "acc_stderr": 0.0305467452649532, + "acc_norm": 0.2777777777777778, + "acc_norm_stderr": 0.0305467452649532 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.29411764705882354, + "acc_stderr": 0.03198001660115071, + "acc_norm": 0.29411764705882354, + "acc_norm_stderr": 0.03198001660115071 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.270042194092827, + "acc_stderr": 0.028900721906293426, + "acc_norm": 0.270042194092827, + "acc_norm_stderr": 0.028900721906293426 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.2062780269058296, + "acc_stderr": 0.02715715047956382, + "acc_norm": 0.2062780269058296, + "acc_norm_stderr": 0.02715715047956382 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.21374045801526717, + "acc_stderr": 0.0359546161177469, + "acc_norm": 0.21374045801526717, + "acc_norm_stderr": 0.0359546161177469 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.371900826446281, + "acc_stderr": 0.044120158066245044, + "acc_norm": 0.371900826446281, + "acc_norm_stderr": 0.044120158066245044 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.28703703703703703, + "acc_stderr": 0.043733130409147614, + "acc_norm": 0.28703703703703703, + "acc_norm_stderr": 0.043733130409147614 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.3128834355828221, + "acc_stderr": 0.036429145782924034, + "acc_norm": 0.3128834355828221, + "acc_norm_stderr": 0.036429145782924034 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 
0.21428571428571427, + "acc_stderr": 0.038946411200447915, + "acc_norm": 0.21428571428571427, + "acc_norm_stderr": 0.038946411200447915 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.22330097087378642, + "acc_stderr": 0.04123553189891431, + "acc_norm": 0.22330097087378642, + "acc_norm_stderr": 0.04123553189891431 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.3034188034188034, + "acc_stderr": 0.030118210106942635, + "acc_norm": 0.3034188034188034, + "acc_norm_stderr": 0.030118210106942635 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.17, + "acc_stderr": 0.03775251680686371, + "acc_norm": 0.17, + "acc_norm_stderr": 0.03775251680686371 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.26181353767560667, + "acc_stderr": 0.015720838678445266, + "acc_norm": 0.26181353767560667, + "acc_norm_stderr": 0.015720838678445266 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.2947976878612717, + "acc_stderr": 0.024547617794803835, + "acc_norm": 0.2947976878612717, + "acc_norm_stderr": 0.024547617794803835 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.24692737430167597, + "acc_stderr": 0.014422292204808835, + "acc_norm": 0.24692737430167597, + "acc_norm_stderr": 0.014422292204808835 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.25163398692810457, + "acc_stderr": 0.024848018263875195, + "acc_norm": 0.25163398692810457, + "acc_norm_stderr": 0.024848018263875195 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.28938906752411575, + "acc_stderr": 0.025755865922632924, + "acc_norm": 0.28938906752411575, + "acc_norm_stderr": 0.025755865922632924 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.2932098765432099, + "acc_stderr": 0.02532988817190092, + "acc_norm": 0.2932098765432099, + "acc_norm_stderr": 0.02532988817190092 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.22695035460992907, + "acc_stderr": 0.02498710636564297, + "acc_norm": 0.22695035460992907, + "acc_norm_stderr": 0.02498710636564297 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.2666232073011734, + "acc_stderr": 0.01129383603161214, + "acc_norm": 0.2666232073011734, + "acc_norm_stderr": 0.01129383603161214 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.23529411764705882, + "acc_stderr": 0.025767252010855956, + "acc_norm": 0.23529411764705882, + "acc_norm_stderr": 0.025767252010855956 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.25980392156862747, + "acc_stderr": 0.017740899509177795, + "acc_norm": 0.25980392156862747, + "acc_norm_stderr": 0.017740899509177795 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.21818181818181817, + "acc_stderr": 0.03955932861795833, + "acc_norm": 0.21818181818181817, + "acc_norm_stderr": 0.03955932861795833 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.24897959183673468, + "acc_stderr": 0.027682979522960227, + "acc_norm": 0.24897959183673468, + "acc_norm_stderr": 0.027682979522960227 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.24378109452736318, + "acc_stderr": 0.030360490154014652, + "acc_norm": 0.24378109452736318, + "acc_norm_stderr": 0.030360490154014652 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.24, + "acc_stderr": 0.04292346959909284, + "acc_norm": 0.24, + "acc_norm_stderr": 0.04292346959909284 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.25301204819277107, + "acc_stderr": 0.03384429155233135, + "acc_norm": 0.25301204819277107, + 
"acc_norm_stderr": 0.03384429155233135 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.26900584795321636, + "acc_stderr": 0.0340105262010409, + "acc_norm": 0.26900584795321636, + "acc_norm_stderr": 0.0340105262010409 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.23133414932680538, + "mc1_stderr": 0.01476194517486267, + "mc2": 0.39153421911238995, + "mc2_stderr": 0.014139728525871488 + }, + "all": { + "acc": 0.2642090602312682, + "acc_stderr": 0.03197682325323511, + "acc_norm": 0.2668869926455615, + "acc_norm_stderr": 0.03198355606162418, + "mc1": 0.23133414932680538, + "mc1_stderr": 0.01476194517486267, + "mc2": 0.39153421911238995, + "mc2_stderr": 0.014139728525871488 + } + }, + "versions": { + "harness|arc:challenge|25": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, 
+ "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "all": 0 + }, + "config_tasks": { + "harness|arc:challenge": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + 
"harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task" + }, + "summary_tasks": { + "harness|arc:challenge|25": { + "hashes": { + "hash_examples": "17b0cae357c0259e", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "61571bf68d6d89aa", + "hash_cont_tokens": "e8abf848493b50f7" + }, + "truncated": 0, + "non-truncated": 4687, + "padded": 4687, + "non-padded": 0, + "effective_few_shots": 25.0, + "num_truncated_few_shots": 0 + }, + "harness|hellaswag|10": { + "hashes": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "29906669b1c7054a", + "hash_cont_tokens": "9fe0a5c42e1532db" + }, + "truncated": 0, + "non-truncated": 40168, + "padded": 40113, + "non-padded": 55, + "effective_few_shots": 10.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hashes": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "c54ff61ad0273dd7", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-anatomy|5": { + "hashes": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "be31a1e22aef5f90", + "hash_cont_tokens": "f11971a765cb609f" + }, + "truncated": 0, + "non-truncated": 540, + "padded": 540, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-astronomy|5": { + "hashes": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "277a7b1fad566940", + "hash_cont_tokens": "440a970fadecdc7b" + }, + "truncated": 0, + "non-truncated": 608, + "padded": 608, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-business_ethics|5": { + "hashes": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "ba552605bc116de5", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hashes": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "428c7563d0b98ab9", + "hash_cont_tokens": "7ecd60c25b9bfe5b" + }, + "truncated": 0, + "non-truncated": 1060, + "padded": 1060, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_biology|5": { + "hashes": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "da036601573942e2", + "hash_cont_tokens": "875cde3af7a0ee14" + }, + "truncated": 0, + "non-truncated": 576, + "padded": 576, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_chemistry|5": { + "hashes": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + 
"hash_input_tokens": "94e0196d6aded13d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_computer_science|5": { + "hashes": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "6e4d0f4a8d36690b", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_mathematics|5": { + "hashes": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "614054d17109a25d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_medicine|5": { + "hashes": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "1d633b3cc0524ba8", + "hash_cont_tokens": "702fb6d82ff0d6ac" + }, + "truncated": 0, + "non-truncated": 692, + "padded": 692, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_physics|5": { + "hashes": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "5421d9a1af86cbd4", + "hash_cont_tokens": "f7b8097afc16a47c" + }, + "truncated": 0, + "non-truncated": 408, + "padded": 408, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-computer_security|5": { + "hashes": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "5e6b70ecb333cf18", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hashes": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "c2ef11a87264ceed", + "hash_cont_tokens": "aa0e8bc655f2f641" + }, + "truncated": 0, + "non-truncated": 940, + "padded": 940, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-econometrics|5": { + "hashes": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "ecaccd912a4c3978", + "hash_cont_tokens": "b1cc6e7e9fcd3827" + }, + "truncated": 0, + "non-truncated": 456, + "padded": 456, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hashes": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "1590c84291399be8", + "hash_cont_tokens": "2425a3f084a591ef" + }, + "truncated": 0, + "non-truncated": 580, + "padded": 580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hashes": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "3269597f715b0da1", + "hash_cont_tokens": "bd87bf0c060fd925" + }, + "truncated": 0, + "non-truncated": 1512, + 
"padded": 1512, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-formal_logic|5": { + "hashes": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "a2800d20f3ab8d7c", + "hash_cont_tokens": "eb8932890e0605db" + }, + "truncated": 0, + "non-truncated": 504, + "padded": 504, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-global_facts|5": { + "hashes": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "94ed44b3772505ad", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_biology|5": { + "hashes": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "24423acb928db768", + "hash_cont_tokens": "1ddcb86d28cde266" + }, + "truncated": 0, + "non-truncated": 1240, + "padded": 1240, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hashes": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "831ff35c474e5cef", + "hash_cont_tokens": "176c8dcff38c5f8f" + }, + "truncated": 0, + "non-truncated": 812, + "padded": 812, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hashes": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "8c34e0f2bda77358", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hashes": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "f1f73dd687da18d7", + "hash_cont_tokens": "674fc454bdc5ac93" + }, + "truncated": 660, + "non-truncated": 0, + "padded": 0, + "non-padded": 660, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_geography|5": { + "hashes": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "7c5547c7da5bc793", + "hash_cont_tokens": "03a5012b916274ea" + }, + "truncated": 0, + "non-truncated": 792, + "padded": 792, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hashes": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "f62991cb6a496b05", + "hash_cont_tokens": "873d2aab226ba1d8" + }, + "truncated": 0, + "non-truncated": 772, + "padded": 772, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hashes": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "4cef2aff6e3d59ed", + "hash_cont_tokens": "c583432ad27fcfe0" + }, + "truncated": 0, + "non-truncated": 1560, + "padded": 1560, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + 
}, + "harness|hendrycksTest-high_school_mathematics|5": { + "hashes": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "6e2577ea4082ed2b", + "hash_cont_tokens": "d7907b61bcb8c123" + }, + "truncated": 0, + "non-truncated": 1080, + "padded": 1080, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hashes": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "c5fc9aeb1079c8e4", + "hash_cont_tokens": "f47f041de50333b9" + }, + "truncated": 0, + "non-truncated": 952, + "padded": 952, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_physics|5": { + "hashes": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "555fc385cffa84ca", + "hash_cont_tokens": "0d56317b3e5eedb5" + }, + "truncated": 0, + "non-truncated": 604, + "padded": 604, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hashes": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "febd23cbf9973b7f", + "hash_cont_tokens": "09ba1243e7390c0f" + }, + "truncated": 0, + "non-truncated": 2180, + "padded": 2180, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hashes": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "424b02981230ee83", + "hash_cont_tokens": "9cc29889c3d3f77d" + }, + "truncated": 0, + "non-truncated": 864, + "padded": 864, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hashes": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "50c9ff438c85a69e", + "hash_cont_tokens": "cdd0b3dc06d933e5" + }, + "truncated": 816, + "non-truncated": 0, + "padded": 0, + "non-padded": 816, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hashes": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "054824cc474caef5", + "hash_cont_tokens": "e02816433ff28daf" + }, + "truncated": 8, + "non-truncated": 940, + "padded": 940, + "non-padded": 8, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_aging|5": { + "hashes": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "541a75f071dcf579", + "hash_cont_tokens": "142a4a8a1138a214" + }, + "truncated": 0, + "non-truncated": 892, + "padded": 892, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_sexuality|5": { + "hashes": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "04269e5c5a257dd9", + "hash_cont_tokens": "bc54813e809b796d" + }, + "truncated": 0, + "non-truncated": 524, + "padded": 524, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-international_law|5": { + "hashes": { + "hash_examples": "1300bfd0dfc59114", + 
"hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "d93ba9d9d38e4397", + "hash_cont_tokens": "8ea8c5ff76a15bca" + }, + "truncated": 0, + "non-truncated": 484, + "padded": 484, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-jurisprudence|5": { + "hashes": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "9eeaccd2698b4f5a", + "hash_cont_tokens": "e3a8cd951b6e3469" + }, + "truncated": 0, + "non-truncated": 432, + "padded": 432, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hashes": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "b4f08f544f2b7576", + "hash_cont_tokens": "3e9e0bdc248fd88a" + }, + "truncated": 0, + "non-truncated": 652, + "padded": 648, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-machine_learning|5": { + "hashes": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "900c2a51f1174b9f", + "hash_cont_tokens": "55b12fb138c6a064" + }, + "truncated": 0, + "non-truncated": 448, + "padded": 448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-management|5": { + "hashes": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "6b36efb4689c6eca", + "hash_cont_tokens": "a01d6d39a83c4597" + }, + "truncated": 0, + "non-truncated": 412, + "padded": 412, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-marketing|5": { + "hashes": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "2aaac78a0cfed47a", + "hash_cont_tokens": "6aeaed4d823c98aa" + }, + "truncated": 0, + "non-truncated": 936, + "padded": 936, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-medical_genetics|5": { + "hashes": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "886ca823b41c094a", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-miscellaneous|5": { + "hashes": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "72fd71de7675e7d0", + "hash_cont_tokens": "9b0ab02a64603081" + }, + "truncated": 0, + "non-truncated": 3132, + "padded": 3132, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_disputes|5": { + "hashes": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "f3ca0dd8e7a1eb09", + "hash_cont_tokens": "3b8bbe9108e55ce9" + }, + "truncated": 0, + "non-truncated": 1384, + "padded": 1354, + "non-padded": 30, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hashes": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "3e793631e951f23c", + "hash_cont_tokens": "3e9bfc0362e97330" + }, + "truncated": 0, + "non-truncated": 3580, + 
"padded": 3580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-nutrition|5": { + "hashes": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "59753c2144ea93af", + "hash_cont_tokens": "23b2dc6ee2da4cfc" + }, + "truncated": 0, + "non-truncated": 1224, + "padded": 1224, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-philosophy|5": { + "hashes": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "bd8d3dbed15a8c34", + "hash_cont_tokens": "9f6ff69d23a48783" + }, + "truncated": 0, + "non-truncated": 1244, + "padded": 1244, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-prehistory|5": { + "hashes": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "3573cd87facbb7c5", + "hash_cont_tokens": "d6458d743d875837" + }, + "truncated": 0, + "non-truncated": 1296, + "padded": 1296, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_accounting|5": { + "hashes": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "17e721bc1a7cbb47", + "hash_cont_tokens": "922a195f53a35662" + }, + "truncated": 0, + "non-truncated": 1128, + "padded": 1128, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_law|5": { + "hashes": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "9178e10bd0763ec4", + "hash_cont_tokens": "2e590029ef41fbcd" + }, + "truncated": 604, + "non-truncated": 5532, + "padded": 5524, + "non-padded": 612, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_medicine|5": { + "hashes": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "f5a22012a54f70ea", + "hash_cont_tokens": "7cfee54dbddd5a98" + }, + "truncated": 0, + "non-truncated": 1088, + "padded": 1088, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_psychology|5": { + "hashes": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "0dfb73a8eb3f692c", + "hash_cont_tokens": "a86677b2a45c20e1" + }, + "truncated": 0, + "non-truncated": 2448, + "padded": 2448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-public_relations|5": { + "hashes": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "1710c6ba4c9f3cbd", + "hash_cont_tokens": "0d756ccaae031757" + }, + "truncated": 0, + "non-truncated": 440, + "padded": 440, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-security_studies|5": { + "hashes": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "d49711415961ced7", + "hash_cont_tokens": "b2229bc2cfbf594b" + }, + "truncated": 0, + "non-truncated": 980, + "padded": 980, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-sociology|5": { + 
"hashes": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "828999f7624cbe7e", + "hash_cont_tokens": "c3a3bdfd177eed5b" + }, + "truncated": 0, + "non-truncated": 804, + "padded": 804, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hashes": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "42054621e718dbee", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-virology|5": { + "hashes": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "6c4f0aa4dc859c04", + "hash_cont_tokens": "af8b3658088cb37f" + }, + "truncated": 0, + "non-truncated": 664, + "padded": 664, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-world_religions|5": { + "hashes": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "6c75d44e092ff24f", + "hash_cont_tokens": "060118bef6de4e0a" + }, + "truncated": 0, + "non-truncated": 684, + "padded": 684, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|truthfulqa:mc|0": { + "hashes": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "2738d7ed7075faa7", + "hash_cont_tokens": "f5da56a132aab151" + }, + "truncated": 0, + "non-truncated": 9996, + "padded": 9996, + "non-padded": 0, + "effective_few_shots": 0.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "d84d18e9a963753d", + "hash_full_prompts": "12b540783521a8e6", + "hash_input_tokens": "6fecf578c508db6a", + "hash_cont_tokens": "71d56183130fecbd" + }, + "total_evaluation_time_secondes": "2024.2723507881165", + "truncated": 2088, + "non-truncated": 108931, + "padded": 108834, + "non-padded": 2185, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/lgaalves/tinyllama-1.1b-chat-v0.3_platypus/results_2023-10-23T23-05-04.270048.json b/eval-results/lgaalves/tinyllama-1.1b-chat-v0.3_platypus/results_2023-10-23T23-05-04.270048.json new file mode 100644 index 0000000000000000000000000000000000000000..6ef31172dfa86ed033fefcaa9b08743e2c213e49 --- /dev/null +++ b/eval-results/lgaalves/tinyllama-1.1b-chat-v0.3_platypus/results_2023-10-23T23-05-04.270048.json @@ -0,0 +1,107 @@ +{ + "config_general": { + "model_name": "lgaalves/tinyllama-1.1b-chat-v0.3_platypus", + "model_sha": "e53612cfbe610205e6f4d5de7397783056231d8e", + "model_size": "2.06 GB", + "model_dtype": "torch.float16", + "lighteval_sha": "0f318ecf002208468154899217b3ba7c6ae09374", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|drop|3": { + "em": 0.0025167785234899327, + "em_stderr": 0.0005131152834514911, + "f1": 0.049414848993288615, + "f1_stderr": 0.0012773102707031435 + }, + "harness|gsm8k|5": { + "acc": 0.00530705079605762, + "acc_stderr": 0.002001305720948044 + }, + "harness|winogrande|5": { + "acc": 0.5580110497237569, + "acc_stderr": 0.013957584079108994 + }, + "all": { + "em": 0.0025167785234899327, + "em_stderr": 0.0005131152834514911, + "f1": 
0.049414848993288615, + "f1_stderr": 0.0012773102707031435, + "acc": 0.2816590502599073, + "acc_stderr": 0.00797944490002852 + } + }, + "versions": { + "harness|drop|3": 1, + "harness|gsm8k|5": 0, + "harness|winogrande|5": 0, + "all": 0 + }, + "config_tasks": { + "harness|drop": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|drop|3": { + "hashes": { + "hash_examples": "1d27416e8324e9a3", + "hash_full_prompts": "a5513ff9a741b385", + "hash_input_tokens": "61b608e0b5ceed76", + "hash_cont_tokens": "b2fd463886a18b22" + }, + "truncated": 1263, + "non-truncated": 8273, + "padded": 0, + "non-padded": 9536, + "effective_few_shots": 3.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "bda342e47b5099b2", + "hash_cont_tokens": "8ef96a9211c4f3c1" + }, + "truncated": 0, + "non-truncated": 1319, + "padded": 0, + "non-padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "c0bedf98cb040854", + "hash_cont_tokens": "f08975ad6f2d5864" + }, + "truncated": 0, + "non-truncated": 2534, + "padded": 2432, + "non-padded": 102, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "9b4d8993161e637d", + "hash_full_prompts": "08215e527b7e60a5", + "hash_input_tokens": "80afe720f936f8d2", + "hash_cont_tokens": "5134ebbc1457b850" + }, + "total_evaluation_time_secondes": "8470.12617778778", + "truncated": 1263, + "non-truncated": 12126, + "padded": 2432, + "non-padded": 10957, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/lilloukas/GPlatty-30B/results_2023-07-19T13-09-17.218494.json b/eval-results/lilloukas/GPlatty-30B/results_2023-07-19T13-09-17.218494.json new file mode 100644 index 0000000000000000000000000000000000000000..d8dc8692c01928256fce36b1b8edf68a77a2eca8 --- /dev/null +++ b/eval-results/lilloukas/GPlatty-30B/results_2023-07-19T13-09-17.218494.json @@ -0,0 +1,871 @@ +{ + "results": { + "harness|arc:challenge|25": { + "acc": 0.636518771331058, + "acc_stderr": 0.014056207319068283, + "acc_norm": 0.6578498293515358, + "acc_norm_stderr": 0.013864152159177278 + }, + "harness|hellaswag|10": { + "acc": 0.6501692889862577, + "acc_stderr": 0.004759416464201141, + "acc_norm": 0.8479386576379208, + "acc_norm_stderr": 0.0035834648107534767 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.31, + "acc_stderr": 0.04648231987117316, + "acc_norm": 0.31, + "acc_norm_stderr": 0.04648231987117316 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.5407407407407407, + "acc_stderr": 0.04304979692464242, + "acc_norm": 0.5407407407407407, + "acc_norm_stderr": 0.04304979692464242 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.6776315789473685, + "acc_stderr": 0.03803510248351585, + "acc_norm": 0.6776315789473685, + "acc_norm_stderr": 0.03803510248351585 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.64, + "acc_stderr": 0.048241815132442176, + "acc_norm": 0.64, + "acc_norm_stderr": 0.048241815132442176 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.6566037735849056, + "acc_stderr": 0.02922452646912479, + "acc_norm": 0.6566037735849056, + "acc_norm_stderr": 
0.02922452646912479 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.6944444444444444, + "acc_stderr": 0.03852084696008534, + "acc_norm": 0.6944444444444444, + "acc_norm_stderr": 0.03852084696008534 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.45, + "acc_stderr": 0.05, + "acc_norm": 0.45, + "acc_norm_stderr": 0.05 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.52, + "acc_stderr": 0.05021167315686779, + "acc_norm": 0.52, + "acc_norm_stderr": 0.05021167315686779 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.36, + "acc_stderr": 0.04824181513244218, + "acc_norm": 0.36, + "acc_norm_stderr": 0.04824181513244218 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.5549132947976878, + "acc_stderr": 0.03789401760283647, + "acc_norm": 0.5549132947976878, + "acc_norm_stderr": 0.03789401760283647 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.3627450980392157, + "acc_stderr": 0.04784060704105655, + "acc_norm": 0.3627450980392157, + "acc_norm_stderr": 0.04784060704105655 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.72, + "acc_stderr": 0.04512608598542128, + "acc_norm": 0.72, + "acc_norm_stderr": 0.04512608598542128 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.548936170212766, + "acc_stderr": 0.032529096196131965, + "acc_norm": 0.548936170212766, + "acc_norm_stderr": 0.032529096196131965 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.34210526315789475, + "acc_stderr": 0.04462917535336936, + "acc_norm": 0.34210526315789475, + "acc_norm_stderr": 0.04462917535336936 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.5793103448275863, + "acc_stderr": 0.0411391498118926, + "acc_norm": 0.5793103448275863, + "acc_norm_stderr": 0.0411391498118926 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.3968253968253968, + "acc_stderr": 0.025197101074246494, + "acc_norm": 0.3968253968253968, + "acc_norm_stderr": 0.025197101074246494 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.42063492063492064, + "acc_stderr": 0.04415438226743743, + "acc_norm": 0.42063492063492064, + "acc_norm_stderr": 0.04415438226743743 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.36, + "acc_stderr": 0.048241815132442176, + "acc_norm": 0.36, + "acc_norm_stderr": 0.048241815132442176 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.7483870967741936, + "acc_stderr": 0.024685979286239956, + "acc_norm": 0.7483870967741936, + "acc_norm_stderr": 0.024685979286239956 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.47783251231527096, + "acc_stderr": 0.0351452856217501, + "acc_norm": 0.47783251231527096, + "acc_norm_stderr": 0.0351452856217501 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.69, + "acc_stderr": 0.04648231987117316, + "acc_norm": 0.69, + "acc_norm_stderr": 0.04648231987117316 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.793939393939394, + "acc_stderr": 0.0315841532404771, + "acc_norm": 0.793939393939394, + "acc_norm_stderr": 0.0315841532404771 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.8181818181818182, + "acc_stderr": 0.027479603010538804, + "acc_norm": 0.8181818181818182, + "acc_norm_stderr": 0.027479603010538804 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.8756476683937824, + "acc_stderr": 0.023814477086593552, + "acc_norm": 0.8756476683937824, + 
"acc_norm_stderr": 0.023814477086593552 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.6333333333333333, + "acc_stderr": 0.02443301646605246, + "acc_norm": 0.6333333333333333, + "acc_norm_stderr": 0.02443301646605246 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.31851851851851853, + "acc_stderr": 0.028406533090608463, + "acc_norm": 0.31851851851851853, + "acc_norm_stderr": 0.028406533090608463 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.6974789915966386, + "acc_stderr": 0.029837962388291926, + "acc_norm": 0.6974789915966386, + "acc_norm_stderr": 0.029837962388291926 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.3443708609271523, + "acc_stderr": 0.038796870240733264, + "acc_norm": 0.3443708609271523, + "acc_norm_stderr": 0.038796870240733264 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.8495412844036697, + "acc_stderr": 0.015328563932669235, + "acc_norm": 0.8495412844036697, + "acc_norm_stderr": 0.015328563932669235 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.5231481481481481, + "acc_stderr": 0.03406315360711507, + "acc_norm": 0.5231481481481481, + "acc_norm_stderr": 0.03406315360711507 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.8578431372549019, + "acc_stderr": 0.024509803921568627, + "acc_norm": 0.8578431372549019, + "acc_norm_stderr": 0.024509803921568627 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.8523206751054853, + "acc_stderr": 0.023094329582595684, + "acc_norm": 0.8523206751054853, + "acc_norm_stderr": 0.023094329582595684 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.7219730941704036, + "acc_stderr": 0.030069584874494033, + "acc_norm": 0.7219730941704036, + "acc_norm_stderr": 0.030069584874494033 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.6946564885496184, + "acc_stderr": 0.04039314978724561, + "acc_norm": 0.6946564885496184, + "acc_norm_stderr": 0.04039314978724561 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.8181818181818182, + "acc_stderr": 0.035208939510976534, + "acc_norm": 0.8181818181818182, + "acc_norm_stderr": 0.035208939510976534 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.7407407407407407, + "acc_stderr": 0.042365112580946336, + "acc_norm": 0.7407407407407407, + "acc_norm_stderr": 0.042365112580946336 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.754601226993865, + "acc_stderr": 0.03380939813943354, + "acc_norm": 0.754601226993865, + "acc_norm_stderr": 0.03380939813943354 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.5089285714285714, + "acc_stderr": 0.04745033255489123, + "acc_norm": 0.5089285714285714, + "acc_norm_stderr": 0.04745033255489123 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.8155339805825242, + "acc_stderr": 0.03840423627288276, + "acc_norm": 0.8155339805825242, + "acc_norm_stderr": 0.03840423627288276 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.8760683760683761, + "acc_stderr": 0.021586494001281348, + "acc_norm": 0.8760683760683761, + "acc_norm_stderr": 0.021586494001281348 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.7, + "acc_stderr": 0.046056618647183814, + "acc_norm": 0.7, + "acc_norm_stderr": 0.046056618647183814 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.7969348659003831, + "acc_stderr": 0.014385525076611578, + "acc_norm": 0.7969348659003831, + "acc_norm_stderr": 
0.014385525076611578 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.7023121387283237, + "acc_stderr": 0.024617055388677003, + "acc_norm": 0.7023121387283237, + "acc_norm_stderr": 0.024617055388677003 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.48379888268156424, + "acc_stderr": 0.016713720729501023, + "acc_norm": 0.48379888268156424, + "acc_norm_stderr": 0.016713720729501023 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.6764705882352942, + "acc_stderr": 0.026787453111906497, + "acc_norm": 0.6764705882352942, + "acc_norm_stderr": 0.026787453111906497 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.729903536977492, + "acc_stderr": 0.02521804037341063, + "acc_norm": 0.729903536977492, + "acc_norm_stderr": 0.02521804037341063 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.7469135802469136, + "acc_stderr": 0.024191808600713, + "acc_norm": 0.7469135802469136, + "acc_norm_stderr": 0.024191808600713 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.5106382978723404, + "acc_stderr": 0.02982074719142244, + "acc_norm": 0.5106382978723404, + "acc_norm_stderr": 0.02982074719142244 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.515645371577575, + "acc_stderr": 0.012763982838120956, + "acc_norm": 0.515645371577575, + "acc_norm_stderr": 0.012763982838120956 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.5882352941176471, + "acc_stderr": 0.029896163033125474, + "acc_norm": 0.5882352941176471, + "acc_norm_stderr": 0.029896163033125474 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.6764705882352942, + "acc_stderr": 0.018926082916083376, + "acc_norm": 0.6764705882352942, + "acc_norm_stderr": 0.018926082916083376 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.6454545454545455, + "acc_stderr": 0.04582004841505417, + "acc_norm": 0.6454545454545455, + "acc_norm_stderr": 0.04582004841505417 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.7755102040816326, + "acc_stderr": 0.02671143055553842, + "acc_norm": 0.7755102040816326, + "acc_norm_stderr": 0.02671143055553842 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.8308457711442786, + "acc_stderr": 0.026508590656233264, + "acc_norm": 0.8308457711442786, + "acc_norm_stderr": 0.026508590656233264 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.92, + "acc_stderr": 0.027265992434429093, + "acc_norm": 0.92, + "acc_norm_stderr": 0.027265992434429093 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.5481927710843374, + "acc_stderr": 0.03874371556587953, + "acc_norm": 0.5481927710843374, + "acc_norm_stderr": 0.03874371556587953 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.7953216374269005, + "acc_stderr": 0.03094445977853321, + "acc_norm": 0.7953216374269005, + "acc_norm_stderr": 0.03094445977853321 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.35495716034271724, + "mc1_stderr": 0.0167508623813759, + "mc2": 0.5244810329463128, + "mc2_stderr": 0.014783170145320674 + }, + "all": { + "acc": 0.6351772016254745, + "acc_stderr": 0.03287958787727761, + "acc_norm": 0.6388907681792397, + "acc_norm_stderr": 0.03285640132111933, + "mc1": 0.35495716034271724, + "mc1_stderr": 0.0167508623813759, + "mc2": 0.5244810329463128, + "mc2_stderr": 0.014783170145320674 + } + }, + "versions": { + "harness|arc:challenge|25": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + 
"harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "all": 0 + }, + "config": { + "model_name": "lilloukas/GPlatty-30B", + "model_sha": "836cf4dcd60ebe2ff09415c72f809d94639e8d35", + "model_dtype": "torch.float16", + "lighteval_sha": "43cff840721bd0214adb4e29236a5e2ca1813937", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null + }, + "task_config": { + "harness|arc:challenge": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + 
"harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task" + }, + "hashes": { + "harness|arc:challenge|25": { + "hash_examples": "fb8c51b1872daeda", + "hash_full_prompts": "045cbb916e5145c6", + 
"hash_input_tokens": "61571bf68d6d89aa", + "hash_cont_tokens": "8210decc6ff6f7df" + }, + "harness|hellaswag|10": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "29906669b1c7054a", + "hash_cont_tokens": "b3b9e9017afa63af" + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "c54ff61ad0273dd7", + "hash_cont_tokens": "50421e30bef398f9" + }, + "harness|hendrycksTest-anatomy|5": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "be31a1e22aef5f90", + "hash_cont_tokens": "f11971a765cb609f" + }, + "harness|hendrycksTest-astronomy|5": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "277a7b1fad566940", + "hash_cont_tokens": "bf30e5d3f48250cb" + }, + "harness|hendrycksTest-business_ethics|5": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "ba552605bc116de5", + "hash_cont_tokens": "bc1dd9b2d995eb61" + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "428c7563d0b98ab9", + "hash_cont_tokens": "890a119624b3b935" + }, + "harness|hendrycksTest-college_biology|5": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "da036601573942e2", + "hash_cont_tokens": "875cde3af7a0ee14" + }, + "harness|hendrycksTest-college_chemistry|5": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "94e0196d6aded13d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "harness|hendrycksTest-college_computer_science|5": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "6e4d0f4a8d36690b", + "hash_cont_tokens": "ffc0fe414cdc4a83" + }, + "harness|hendrycksTest-college_mathematics|5": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "614054d17109a25d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "harness|hendrycksTest-college_medicine|5": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "1d633b3cc0524ba8", + "hash_cont_tokens": "1f88b00d41957d82" + }, + "harness|hendrycksTest-college_physics|5": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "5421d9a1af86cbd4", + "hash_cont_tokens": "f7b8097afc16a47c" + }, + "harness|hendrycksTest-computer_security|5": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "5e6b70ecb333cf18", + "hash_cont_tokens": "50421e30bef398f9" + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "c2ef11a87264ceed", + "hash_cont_tokens": "aa0e8bc655f2f641" + }, + "harness|hendrycksTest-econometrics|5": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "ecaccd912a4c3978", + "hash_cont_tokens": "bfb7e3c3c88313f1" + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "1590c84291399be8", + "hash_cont_tokens": 
"2425a3f084a591ef" + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "3269597f715b0da1", + "hash_cont_tokens": "f52691aef15a407b" + }, + "harness|hendrycksTest-formal_logic|5": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "a2800d20f3ab8d7c", + "hash_cont_tokens": "f515d598d9c21263" + }, + "harness|hendrycksTest-global_facts|5": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "94ed44b3772505ad", + "hash_cont_tokens": "50421e30bef398f9" + }, + "harness|hendrycksTest-high_school_biology|5": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "24423acb928db768", + "hash_cont_tokens": "bd85a4156a3613ee" + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "831ff35c474e5cef", + "hash_cont_tokens": "a95c97af1c14e068" + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "8c34e0f2bda77358", + "hash_cont_tokens": "8abfedef914e33c9" + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "f1f73dd687da18d7", + "hash_cont_tokens": "674fc454bdc5ac93" + }, + "harness|hendrycksTest-high_school_geography|5": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "7c5547c7da5bc793", + "hash_cont_tokens": "03a5012b916274ea" + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "f62991cb6a496b05", + "hash_cont_tokens": "a83effb8f76b7d7c" + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "4cef2aff6e3d59ed", + "hash_cont_tokens": "c583432ad27fcfe0" + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "6e2577ea4082ed2b", + "hash_cont_tokens": "24f5dc613660300b" + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "c5fc9aeb1079c8e4", + "hash_cont_tokens": "f47f041de50333b9" + }, + "harness|hendrycksTest-high_school_physics|5": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "555fc385cffa84ca", + "hash_cont_tokens": "ba2efcd283e938cc" + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "febd23cbf9973b7f", + "hash_cont_tokens": "942069cd363844d9" + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "424b02981230ee83", + "hash_cont_tokens": "955ed42b6f7fa019" + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + 
"hash_input_tokens": "50c9ff438c85a69e", + "hash_cont_tokens": "cdd0b3dc06d933e5" + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "054824cc474caef5", + "hash_cont_tokens": "9a864184946033ac" + }, + "harness|hendrycksTest-human_aging|5": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "541a75f071dcf579", + "hash_cont_tokens": "142a4a8a1138a214" + }, + "harness|hendrycksTest-human_sexuality|5": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "04269e5c5a257dd9", + "hash_cont_tokens": "bc54813e809b796d" + }, + "harness|hendrycksTest-international_law|5": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "d93ba9d9d38e4397", + "hash_cont_tokens": "dc45b45fcda18e5d" + }, + "harness|hendrycksTest-jurisprudence|5": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "9eeaccd2698b4f5a", + "hash_cont_tokens": "e3a8cd951b6e3469" + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "b4f08f544f2b7576", + "hash_cont_tokens": "1e80dbd30f6453d5" + }, + "harness|hendrycksTest-machine_learning|5": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "900c2a51f1174b9f", + "hash_cont_tokens": "9b37da7777378ca9" + }, + "harness|hendrycksTest-management|5": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "6b36efb4689c6eca", + "hash_cont_tokens": "a01d6d39a83c4597" + }, + "harness|hendrycksTest-marketing|5": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "2aaac78a0cfed47a", + "hash_cont_tokens": "6aeaed4d823c98aa" + }, + "harness|hendrycksTest-medical_genetics|5": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "886ca823b41c094a", + "hash_cont_tokens": "50421e30bef398f9" + }, + "harness|hendrycksTest-miscellaneous|5": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "72fd71de7675e7d0", + "hash_cont_tokens": "9b0ab02a64603081" + }, + "harness|hendrycksTest-moral_disputes|5": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "f3ca0dd8e7a1eb09", + "hash_cont_tokens": "8badf768f7b0467a" + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "3e793631e951f23c", + "hash_cont_tokens": "32ae620376b2bbba" + }, + "harness|hendrycksTest-nutrition|5": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "59753c2144ea93af", + "hash_cont_tokens": "3071def75bacc404" + }, + "harness|hendrycksTest-philosophy|5": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "bd8d3dbed15a8c34", + "hash_cont_tokens": "9f6ff69d23a48783" + }, + "harness|hendrycksTest-prehistory|5": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "3573cd87facbb7c5", + "hash_cont_tokens": "de469d2b981e32a3" + }, 
+ "harness|hendrycksTest-professional_accounting|5": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "17e721bc1a7cbb47", + "hash_cont_tokens": "c46f74d2dfc7b13b" + }, + "harness|hendrycksTest-professional_law|5": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "9178e10bd0763ec4", + "hash_cont_tokens": "2e590029ef41fbcd" + }, + "harness|hendrycksTest-professional_medicine|5": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "f5a22012a54f70ea", + "hash_cont_tokens": "fe35cfa9c6ca802e" + }, + "harness|hendrycksTest-professional_psychology|5": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "0dfb73a8eb3f692c", + "hash_cont_tokens": "f020fbddf72c8652" + }, + "harness|hendrycksTest-public_relations|5": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "1710c6ba4c9f3cbd", + "hash_cont_tokens": "568f585a259965c1" + }, + "harness|hendrycksTest-security_studies|5": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "d49711415961ced7", + "hash_cont_tokens": "cc6fd7cccd64cd5d" + }, + "harness|hendrycksTest-sociology|5": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "828999f7624cbe7e", + "hash_cont_tokens": "c3a3bdfd177eed5b" + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "42054621e718dbee", + "hash_cont_tokens": "2568d0e8e36fa959" + }, + "harness|hendrycksTest-virology|5": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "6c4f0aa4dc859c04", + "hash_cont_tokens": "926cf60b0891f374" + }, + "harness|hendrycksTest-world_religions|5": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "6c75d44e092ff24f", + "hash_cont_tokens": "c525a5de974c1ea3" + }, + "harness|truthfulqa:mc|0": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "2738d7ed7075faa7", + "hash_cont_tokens": "c014154380b74b9e" + } + } +} \ No newline at end of file diff --git a/eval-results/lilloukas/GPlatty-30B/results_2023-07-19T22-25-28.445280.json b/eval-results/lilloukas/GPlatty-30B/results_2023-07-19T22-25-28.445280.json new file mode 100644 index 0000000000000000000000000000000000000000..d8dc8692c01928256fce36b1b8edf68a77a2eca8 --- /dev/null +++ b/eval-results/lilloukas/GPlatty-30B/results_2023-07-19T22-25-28.445280.json @@ -0,0 +1,871 @@ +{ + "results": { + "harness|arc:challenge|25": { + "acc": 0.636518771331058, + "acc_stderr": 0.014056207319068283, + "acc_norm": 0.6578498293515358, + "acc_norm_stderr": 0.013864152159177278 + }, + "harness|hellaswag|10": { + "acc": 0.6501692889862577, + "acc_stderr": 0.004759416464201141, + "acc_norm": 0.8479386576379208, + "acc_norm_stderr": 0.0035834648107534767 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.31, + "acc_stderr": 0.04648231987117316, + "acc_norm": 0.31, + "acc_norm_stderr": 0.04648231987117316 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.5407407407407407, + "acc_stderr": 0.04304979692464242, + "acc_norm": 0.5407407407407407, + "acc_norm_stderr": 0.04304979692464242 + }, + 
"harness|hendrycksTest-astronomy|5": { + "acc": 0.6776315789473685, + "acc_stderr": 0.03803510248351585, + "acc_norm": 0.6776315789473685, + "acc_norm_stderr": 0.03803510248351585 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.64, + "acc_stderr": 0.048241815132442176, + "acc_norm": 0.64, + "acc_norm_stderr": 0.048241815132442176 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.6566037735849056, + "acc_stderr": 0.02922452646912479, + "acc_norm": 0.6566037735849056, + "acc_norm_stderr": 0.02922452646912479 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.6944444444444444, + "acc_stderr": 0.03852084696008534, + "acc_norm": 0.6944444444444444, + "acc_norm_stderr": 0.03852084696008534 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.45, + "acc_stderr": 0.05, + "acc_norm": 0.45, + "acc_norm_stderr": 0.05 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.52, + "acc_stderr": 0.05021167315686779, + "acc_norm": 0.52, + "acc_norm_stderr": 0.05021167315686779 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.36, + "acc_stderr": 0.04824181513244218, + "acc_norm": 0.36, + "acc_norm_stderr": 0.04824181513244218 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.5549132947976878, + "acc_stderr": 0.03789401760283647, + "acc_norm": 0.5549132947976878, + "acc_norm_stderr": 0.03789401760283647 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.3627450980392157, + "acc_stderr": 0.04784060704105655, + "acc_norm": 0.3627450980392157, + "acc_norm_stderr": 0.04784060704105655 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.72, + "acc_stderr": 0.04512608598542128, + "acc_norm": 0.72, + "acc_norm_stderr": 0.04512608598542128 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.548936170212766, + "acc_stderr": 0.032529096196131965, + "acc_norm": 0.548936170212766, + "acc_norm_stderr": 0.032529096196131965 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.34210526315789475, + "acc_stderr": 0.04462917535336936, + "acc_norm": 0.34210526315789475, + "acc_norm_stderr": 0.04462917535336936 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.5793103448275863, + "acc_stderr": 0.0411391498118926, + "acc_norm": 0.5793103448275863, + "acc_norm_stderr": 0.0411391498118926 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.3968253968253968, + "acc_stderr": 0.025197101074246494, + "acc_norm": 0.3968253968253968, + "acc_norm_stderr": 0.025197101074246494 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.42063492063492064, + "acc_stderr": 0.04415438226743743, + "acc_norm": 0.42063492063492064, + "acc_norm_stderr": 0.04415438226743743 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.36, + "acc_stderr": 0.048241815132442176, + "acc_norm": 0.36, + "acc_norm_stderr": 0.048241815132442176 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.7483870967741936, + "acc_stderr": 0.024685979286239956, + "acc_norm": 0.7483870967741936, + "acc_norm_stderr": 0.024685979286239956 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.47783251231527096, + "acc_stderr": 0.0351452856217501, + "acc_norm": 0.47783251231527096, + "acc_norm_stderr": 0.0351452856217501 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.69, + "acc_stderr": 0.04648231987117316, + "acc_norm": 0.69, + "acc_norm_stderr": 0.04648231987117316 + }, + "harness|hendrycksTest-high_school_european_history|5": { 
+ "acc": 0.793939393939394, + "acc_stderr": 0.0315841532404771, + "acc_norm": 0.793939393939394, + "acc_norm_stderr": 0.0315841532404771 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.8181818181818182, + "acc_stderr": 0.027479603010538804, + "acc_norm": 0.8181818181818182, + "acc_norm_stderr": 0.027479603010538804 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.8756476683937824, + "acc_stderr": 0.023814477086593552, + "acc_norm": 0.8756476683937824, + "acc_norm_stderr": 0.023814477086593552 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.6333333333333333, + "acc_stderr": 0.02443301646605246, + "acc_norm": 0.6333333333333333, + "acc_norm_stderr": 0.02443301646605246 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.31851851851851853, + "acc_stderr": 0.028406533090608463, + "acc_norm": 0.31851851851851853, + "acc_norm_stderr": 0.028406533090608463 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.6974789915966386, + "acc_stderr": 0.029837962388291926, + "acc_norm": 0.6974789915966386, + "acc_norm_stderr": 0.029837962388291926 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.3443708609271523, + "acc_stderr": 0.038796870240733264, + "acc_norm": 0.3443708609271523, + "acc_norm_stderr": 0.038796870240733264 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.8495412844036697, + "acc_stderr": 0.015328563932669235, + "acc_norm": 0.8495412844036697, + "acc_norm_stderr": 0.015328563932669235 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.5231481481481481, + "acc_stderr": 0.03406315360711507, + "acc_norm": 0.5231481481481481, + "acc_norm_stderr": 0.03406315360711507 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.8578431372549019, + "acc_stderr": 0.024509803921568627, + "acc_norm": 0.8578431372549019, + "acc_norm_stderr": 0.024509803921568627 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.8523206751054853, + "acc_stderr": 0.023094329582595684, + "acc_norm": 0.8523206751054853, + "acc_norm_stderr": 0.023094329582595684 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.7219730941704036, + "acc_stderr": 0.030069584874494033, + "acc_norm": 0.7219730941704036, + "acc_norm_stderr": 0.030069584874494033 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.6946564885496184, + "acc_stderr": 0.04039314978724561, + "acc_norm": 0.6946564885496184, + "acc_norm_stderr": 0.04039314978724561 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.8181818181818182, + "acc_stderr": 0.035208939510976534, + "acc_norm": 0.8181818181818182, + "acc_norm_stderr": 0.035208939510976534 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.7407407407407407, + "acc_stderr": 0.042365112580946336, + "acc_norm": 0.7407407407407407, + "acc_norm_stderr": 0.042365112580946336 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.754601226993865, + "acc_stderr": 0.03380939813943354, + "acc_norm": 0.754601226993865, + "acc_norm_stderr": 0.03380939813943354 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.5089285714285714, + "acc_stderr": 0.04745033255489123, + "acc_norm": 0.5089285714285714, + "acc_norm_stderr": 0.04745033255489123 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.8155339805825242, + "acc_stderr": 0.03840423627288276, + "acc_norm": 0.8155339805825242, + "acc_norm_stderr": 0.03840423627288276 + }, + 
"harness|hendrycksTest-marketing|5": { + "acc": 0.8760683760683761, + "acc_stderr": 0.021586494001281348, + "acc_norm": 0.8760683760683761, + "acc_norm_stderr": 0.021586494001281348 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.7, + "acc_stderr": 0.046056618647183814, + "acc_norm": 0.7, + "acc_norm_stderr": 0.046056618647183814 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.7969348659003831, + "acc_stderr": 0.014385525076611578, + "acc_norm": 0.7969348659003831, + "acc_norm_stderr": 0.014385525076611578 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.7023121387283237, + "acc_stderr": 0.024617055388677003, + "acc_norm": 0.7023121387283237, + "acc_norm_stderr": 0.024617055388677003 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.48379888268156424, + "acc_stderr": 0.016713720729501023, + "acc_norm": 0.48379888268156424, + "acc_norm_stderr": 0.016713720729501023 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.6764705882352942, + "acc_stderr": 0.026787453111906497, + "acc_norm": 0.6764705882352942, + "acc_norm_stderr": 0.026787453111906497 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.729903536977492, + "acc_stderr": 0.02521804037341063, + "acc_norm": 0.729903536977492, + "acc_norm_stderr": 0.02521804037341063 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.7469135802469136, + "acc_stderr": 0.024191808600713, + "acc_norm": 0.7469135802469136, + "acc_norm_stderr": 0.024191808600713 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.5106382978723404, + "acc_stderr": 0.02982074719142244, + "acc_norm": 0.5106382978723404, + "acc_norm_stderr": 0.02982074719142244 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.515645371577575, + "acc_stderr": 0.012763982838120956, + "acc_norm": 0.515645371577575, + "acc_norm_stderr": 0.012763982838120956 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.5882352941176471, + "acc_stderr": 0.029896163033125474, + "acc_norm": 0.5882352941176471, + "acc_norm_stderr": 0.029896163033125474 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.6764705882352942, + "acc_stderr": 0.018926082916083376, + "acc_norm": 0.6764705882352942, + "acc_norm_stderr": 0.018926082916083376 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.6454545454545455, + "acc_stderr": 0.04582004841505417, + "acc_norm": 0.6454545454545455, + "acc_norm_stderr": 0.04582004841505417 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.7755102040816326, + "acc_stderr": 0.02671143055553842, + "acc_norm": 0.7755102040816326, + "acc_norm_stderr": 0.02671143055553842 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.8308457711442786, + "acc_stderr": 0.026508590656233264, + "acc_norm": 0.8308457711442786, + "acc_norm_stderr": 0.026508590656233264 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.92, + "acc_stderr": 0.027265992434429093, + "acc_norm": 0.92, + "acc_norm_stderr": 0.027265992434429093 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.5481927710843374, + "acc_stderr": 0.03874371556587953, + "acc_norm": 0.5481927710843374, + "acc_norm_stderr": 0.03874371556587953 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.7953216374269005, + "acc_stderr": 0.03094445977853321, + "acc_norm": 0.7953216374269005, + "acc_norm_stderr": 0.03094445977853321 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.35495716034271724, + "mc1_stderr": 0.0167508623813759, + "mc2": 0.5244810329463128, + 
"mc2_stderr": 0.014783170145320674 + }, + "all": { + "acc": 0.6351772016254745, + "acc_stderr": 0.03287958787727761, + "acc_norm": 0.6388907681792397, + "acc_norm_stderr": 0.03285640132111933, + "mc1": 0.35495716034271724, + "mc1_stderr": 0.0167508623813759, + "mc2": 0.5244810329463128, + "mc2_stderr": 0.014783170145320674 + } + }, + "versions": { + "harness|arc:challenge|25": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "all": 0 + }, + "config": { + "model_name": "lilloukas/GPlatty-30B", + "model_sha": "836cf4dcd60ebe2ff09415c72f809d94639e8d35", + "model_dtype": "torch.float16", + "lighteval_sha": "43cff840721bd0214adb4e29236a5e2ca1813937", + 
"num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null + }, + "task_config": { + "harness|arc:challenge": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + 
"harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task" + }, + "hashes": { + "harness|arc:challenge|25": { + "hash_examples": "fb8c51b1872daeda", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "61571bf68d6d89aa", + "hash_cont_tokens": "8210decc6ff6f7df" + }, + "harness|hellaswag|10": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "29906669b1c7054a", + "hash_cont_tokens": "b3b9e9017afa63af" + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "c54ff61ad0273dd7", + "hash_cont_tokens": "50421e30bef398f9" + }, + "harness|hendrycksTest-anatomy|5": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "be31a1e22aef5f90", + "hash_cont_tokens": "f11971a765cb609f" + }, + "harness|hendrycksTest-astronomy|5": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "277a7b1fad566940", + "hash_cont_tokens": "bf30e5d3f48250cb" + }, + "harness|hendrycksTest-business_ethics|5": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "ba552605bc116de5", + "hash_cont_tokens": "bc1dd9b2d995eb61" + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "428c7563d0b98ab9", + "hash_cont_tokens": "890a119624b3b935" + }, + "harness|hendrycksTest-college_biology|5": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "da036601573942e2", + "hash_cont_tokens": "875cde3af7a0ee14" + }, + "harness|hendrycksTest-college_chemistry|5": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "94e0196d6aded13d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "harness|hendrycksTest-college_computer_science|5": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "6e4d0f4a8d36690b", + "hash_cont_tokens": "ffc0fe414cdc4a83" + }, + "harness|hendrycksTest-college_mathematics|5": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "614054d17109a25d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "harness|hendrycksTest-college_medicine|5": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "1d633b3cc0524ba8", + "hash_cont_tokens": "1f88b00d41957d82" + }, + "harness|hendrycksTest-college_physics|5": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "5421d9a1af86cbd4", + "hash_cont_tokens": "f7b8097afc16a47c" + }, + "harness|hendrycksTest-computer_security|5": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "5e6b70ecb333cf18", + "hash_cont_tokens": "50421e30bef398f9" + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hash_examples": "8874ece872d2ca4c", + 
"hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "c2ef11a87264ceed", + "hash_cont_tokens": "aa0e8bc655f2f641" + }, + "harness|hendrycksTest-econometrics|5": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "ecaccd912a4c3978", + "hash_cont_tokens": "bfb7e3c3c88313f1" + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "1590c84291399be8", + "hash_cont_tokens": "2425a3f084a591ef" + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "3269597f715b0da1", + "hash_cont_tokens": "f52691aef15a407b" + }, + "harness|hendrycksTest-formal_logic|5": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "a2800d20f3ab8d7c", + "hash_cont_tokens": "f515d598d9c21263" + }, + "harness|hendrycksTest-global_facts|5": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "94ed44b3772505ad", + "hash_cont_tokens": "50421e30bef398f9" + }, + "harness|hendrycksTest-high_school_biology|5": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "24423acb928db768", + "hash_cont_tokens": "bd85a4156a3613ee" + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "831ff35c474e5cef", + "hash_cont_tokens": "a95c97af1c14e068" + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "8c34e0f2bda77358", + "hash_cont_tokens": "8abfedef914e33c9" + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "f1f73dd687da18d7", + "hash_cont_tokens": "674fc454bdc5ac93" + }, + "harness|hendrycksTest-high_school_geography|5": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "7c5547c7da5bc793", + "hash_cont_tokens": "03a5012b916274ea" + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "f62991cb6a496b05", + "hash_cont_tokens": "a83effb8f76b7d7c" + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "4cef2aff6e3d59ed", + "hash_cont_tokens": "c583432ad27fcfe0" + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "6e2577ea4082ed2b", + "hash_cont_tokens": "24f5dc613660300b" + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "c5fc9aeb1079c8e4", + "hash_cont_tokens": "f47f041de50333b9" + }, + "harness|hendrycksTest-high_school_physics|5": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "555fc385cffa84ca", + "hash_cont_tokens": "ba2efcd283e938cc" + }, + "harness|hendrycksTest-high_school_psychology|5": { 
+ "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "febd23cbf9973b7f", + "hash_cont_tokens": "942069cd363844d9" + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "424b02981230ee83", + "hash_cont_tokens": "955ed42b6f7fa019" + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "50c9ff438c85a69e", + "hash_cont_tokens": "cdd0b3dc06d933e5" + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "054824cc474caef5", + "hash_cont_tokens": "9a864184946033ac" + }, + "harness|hendrycksTest-human_aging|5": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "541a75f071dcf579", + "hash_cont_tokens": "142a4a8a1138a214" + }, + "harness|hendrycksTest-human_sexuality|5": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "04269e5c5a257dd9", + "hash_cont_tokens": "bc54813e809b796d" + }, + "harness|hendrycksTest-international_law|5": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "d93ba9d9d38e4397", + "hash_cont_tokens": "dc45b45fcda18e5d" + }, + "harness|hendrycksTest-jurisprudence|5": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "9eeaccd2698b4f5a", + "hash_cont_tokens": "e3a8cd951b6e3469" + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "b4f08f544f2b7576", + "hash_cont_tokens": "1e80dbd30f6453d5" + }, + "harness|hendrycksTest-machine_learning|5": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "900c2a51f1174b9f", + "hash_cont_tokens": "9b37da7777378ca9" + }, + "harness|hendrycksTest-management|5": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "6b36efb4689c6eca", + "hash_cont_tokens": "a01d6d39a83c4597" + }, + "harness|hendrycksTest-marketing|5": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "2aaac78a0cfed47a", + "hash_cont_tokens": "6aeaed4d823c98aa" + }, + "harness|hendrycksTest-medical_genetics|5": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "886ca823b41c094a", + "hash_cont_tokens": "50421e30bef398f9" + }, + "harness|hendrycksTest-miscellaneous|5": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "72fd71de7675e7d0", + "hash_cont_tokens": "9b0ab02a64603081" + }, + "harness|hendrycksTest-moral_disputes|5": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "f3ca0dd8e7a1eb09", + "hash_cont_tokens": "8badf768f7b0467a" + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "3e793631e951f23c", + "hash_cont_tokens": "32ae620376b2bbba" + }, + "harness|hendrycksTest-nutrition|5": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": 
"4c0e68e3586cb453", + "hash_input_tokens": "59753c2144ea93af", + "hash_cont_tokens": "3071def75bacc404" + }, + "harness|hendrycksTest-philosophy|5": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "bd8d3dbed15a8c34", + "hash_cont_tokens": "9f6ff69d23a48783" + }, + "harness|hendrycksTest-prehistory|5": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "3573cd87facbb7c5", + "hash_cont_tokens": "de469d2b981e32a3" + }, + "harness|hendrycksTest-professional_accounting|5": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "17e721bc1a7cbb47", + "hash_cont_tokens": "c46f74d2dfc7b13b" + }, + "harness|hendrycksTest-professional_law|5": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "9178e10bd0763ec4", + "hash_cont_tokens": "2e590029ef41fbcd" + }, + "harness|hendrycksTest-professional_medicine|5": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "f5a22012a54f70ea", + "hash_cont_tokens": "fe35cfa9c6ca802e" + }, + "harness|hendrycksTest-professional_psychology|5": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "0dfb73a8eb3f692c", + "hash_cont_tokens": "f020fbddf72c8652" + }, + "harness|hendrycksTest-public_relations|5": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "1710c6ba4c9f3cbd", + "hash_cont_tokens": "568f585a259965c1" + }, + "harness|hendrycksTest-security_studies|5": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "d49711415961ced7", + "hash_cont_tokens": "cc6fd7cccd64cd5d" + }, + "harness|hendrycksTest-sociology|5": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "828999f7624cbe7e", + "hash_cont_tokens": "c3a3bdfd177eed5b" + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "42054621e718dbee", + "hash_cont_tokens": "2568d0e8e36fa959" + }, + "harness|hendrycksTest-virology|5": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "6c4f0aa4dc859c04", + "hash_cont_tokens": "926cf60b0891f374" + }, + "harness|hendrycksTest-world_religions|5": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "6c75d44e092ff24f", + "hash_cont_tokens": "c525a5de974c1ea3" + }, + "harness|truthfulqa:mc|0": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "2738d7ed7075faa7", + "hash_cont_tokens": "c014154380b74b9e" + } + } +} \ No newline at end of file diff --git a/eval-results/lilloukas/GPlatty-30B/results_2023-09-23T00-38-16.456797.json b/eval-results/lilloukas/GPlatty-30B/results_2023-09-23T00-38-16.456797.json new file mode 100644 index 0000000000000000000000000000000000000000..19b04661dfc42425452c69f9d8a355e6bdfd37ad --- /dev/null +++ b/eval-results/lilloukas/GPlatty-30B/results_2023-09-23T00-38-16.456797.json @@ -0,0 +1,107 @@ +{ + "config_general": { + "model_name": "lilloukas/GPlatty-30B", + "model_sha": "a06dc5e381f2987749f0a559ffcaf44401df2239", + "model_size": "60.65 GB", + "model_dtype": "torch.float16", + 
"lighteval_sha": "0f318ecf002208468154899217b3ba7c6ae09374", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|drop|3": { + "em": 0.4629823825503356, + "em_stderr": 0.005106415513013176, + "f1": 0.5073416526845649, + "f1_stderr": 0.004906633817362961 + }, + "harness|gsm8k|5": { + "acc": 0.13874147081122062, + "acc_stderr": 0.009521649920798146 + }, + "harness|winogrande|5": { + "acc": 0.8097868981846882, + "acc_stderr": 0.01103033579861744 + }, + "all": { + "em": 0.4629823825503356, + "em_stderr": 0.005106415513013176, + "f1": 0.5073416526845649, + "f1_stderr": 0.004906633817362961, + "acc": 0.4742641844979544, + "acc_stderr": 0.010275992859707792 + } + }, + "versions": { + "harness|drop|3": 1, + "harness|gsm8k|5": 0, + "harness|winogrande|5": 0, + "all": 0 + }, + "config_tasks": { + "harness|drop": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|drop|3": { + "hashes": { + "hash_examples": "1d27416e8324e9a3", + "hash_full_prompts": "a5513ff9a741b385", + "hash_input_tokens": "61b608e0b5ceed76", + "hash_cont_tokens": "d1387bf776fa9571" + }, + "truncated": 1263, + "non-truncated": 8273, + "padded": 0, + "non-padded": 9536, + "effective_few_shots": 3.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "bda342e47b5099b2", + "hash_cont_tokens": "cdef97fb070de180" + }, + "truncated": 0, + "non-truncated": 1319, + "padded": 0, + "non-padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "c0bedf98cb040854", + "hash_cont_tokens": "f08975ad6f2d5864" + }, + "truncated": 0, + "non-truncated": 2534, + "padded": 2432, + "non-padded": 102, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "9b4d8993161e637d", + "hash_full_prompts": "08215e527b7e60a5", + "hash_input_tokens": "80afe720f936f8d2", + "hash_cont_tokens": "7460b8cb37632084" + }, + "total_evaluation_time_secondes": "16943.983157634735", + "truncated": 1263, + "non-truncated": 12126, + "padded": 2432, + "non-padded": 10957, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/lilloukas/Platypus-30B/results_2023-07-19T22-45-02.696603.json b/eval-results/lilloukas/Platypus-30B/results_2023-07-19T22-45-02.696603.json new file mode 100644 index 0000000000000000000000000000000000000000..5e68c1741deb87a85086b4b66a0ba097bef55dbe --- /dev/null +++ b/eval-results/lilloukas/Platypus-30B/results_2023-07-19T22-45-02.696603.json @@ -0,0 +1,871 @@ +{ + "results": { + "harness|arc:challenge|25": { + "acc": 0.6126279863481229, + "acc_stderr": 0.014235872487909865, + "acc_norm": 0.6459044368600683, + "acc_norm_stderr": 0.013975454122756558 + }, + "harness|hellaswag|10": { + "acc": 0.6362278430591516, + "acc_stderr": 0.00480100965769044, + "acc_norm": 0.8423620792670783, + "acc_norm_stderr": 0.003636564286352681 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.32, + "acc_stderr": 0.04688261722621504, + "acc_norm": 0.32, + "acc_norm_stderr": 0.04688261722621504 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.5407407407407407, + "acc_stderr": 
0.04304979692464242, + "acc_norm": 0.5407407407407407, + "acc_norm_stderr": 0.04304979692464242 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.6973684210526315, + "acc_stderr": 0.037385206761196686, + "acc_norm": 0.6973684210526315, + "acc_norm_stderr": 0.037385206761196686 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.64, + "acc_stderr": 0.048241815132442176, + "acc_norm": 0.64, + "acc_norm_stderr": 0.048241815132442176 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.6415094339622641, + "acc_stderr": 0.029514703583981762, + "acc_norm": 0.6415094339622641, + "acc_norm_stderr": 0.029514703583981762 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.7222222222222222, + "acc_stderr": 0.03745554791462457, + "acc_norm": 0.7222222222222222, + "acc_norm_stderr": 0.03745554791462457 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.48, + "acc_stderr": 0.050211673156867795, + "acc_norm": 0.48, + "acc_norm_stderr": 0.050211673156867795 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.52, + "acc_stderr": 0.050211673156867795, + "acc_norm": 0.52, + "acc_norm_stderr": 0.050211673156867795 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.34, + "acc_stderr": 0.047609522856952365, + "acc_norm": 0.34, + "acc_norm_stderr": 0.047609522856952365 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.5722543352601156, + "acc_stderr": 0.03772446857518026, + "acc_norm": 0.5722543352601156, + "acc_norm_stderr": 0.03772446857518026 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.38235294117647056, + "acc_stderr": 0.04835503696107224, + "acc_norm": 0.38235294117647056, + "acc_norm_stderr": 0.04835503696107224 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.73, + "acc_stderr": 0.044619604333847394, + "acc_norm": 0.73, + "acc_norm_stderr": 0.044619604333847394 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.5702127659574469, + "acc_stderr": 0.03236214467715564, + "acc_norm": 0.5702127659574469, + "acc_norm_stderr": 0.03236214467715564 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.35964912280701755, + "acc_stderr": 0.045144961328736334, + "acc_norm": 0.35964912280701755, + "acc_norm_stderr": 0.045144961328736334 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.593103448275862, + "acc_stderr": 0.04093793981266236, + "acc_norm": 0.593103448275862, + "acc_norm_stderr": 0.04093793981266236 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.42063492063492064, + "acc_stderr": 0.025424835086923992, + "acc_norm": 0.42063492063492064, + "acc_norm_stderr": 0.025424835086923992 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.4444444444444444, + "acc_stderr": 0.04444444444444449, + "acc_norm": 0.4444444444444444, + "acc_norm_stderr": 0.04444444444444449 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.35, + "acc_stderr": 0.0479372485441102, + "acc_norm": 0.35, + "acc_norm_stderr": 0.0479372485441102 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.7516129032258064, + "acc_stderr": 0.024580028921481003, + "acc_norm": 0.7516129032258064, + "acc_norm_stderr": 0.024580028921481003 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.4975369458128079, + "acc_stderr": 0.03517945038691063, + "acc_norm": 0.4975369458128079, + "acc_norm_stderr": 0.03517945038691063 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.72, + "acc_stderr": 
0.045126085985421296, + "acc_norm": 0.72, + "acc_norm_stderr": 0.045126085985421296 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.8, + "acc_stderr": 0.03123475237772117, + "acc_norm": 0.8, + "acc_norm_stderr": 0.03123475237772117 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.8232323232323232, + "acc_stderr": 0.027178752639044915, + "acc_norm": 0.8232323232323232, + "acc_norm_stderr": 0.027178752639044915 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.8860103626943006, + "acc_stderr": 0.022935144053919443, + "acc_norm": 0.8860103626943006, + "acc_norm_stderr": 0.022935144053919443 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.6333333333333333, + "acc_stderr": 0.024433016466052455, + "acc_norm": 0.6333333333333333, + "acc_norm_stderr": 0.024433016466052455 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.35185185185185186, + "acc_stderr": 0.029116617606083025, + "acc_norm": 0.35185185185185186, + "acc_norm_stderr": 0.029116617606083025 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.7100840336134454, + "acc_stderr": 0.029472485833136084, + "acc_norm": 0.7100840336134454, + "acc_norm_stderr": 0.029472485833136084 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.3443708609271523, + "acc_stderr": 0.038796870240733264, + "acc_norm": 0.3443708609271523, + "acc_norm_stderr": 0.038796870240733264 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.8495412844036697, + "acc_stderr": 0.015328563932669235, + "acc_norm": 0.8495412844036697, + "acc_norm_stderr": 0.015328563932669235 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.5231481481481481, + "acc_stderr": 0.03406315360711507, + "acc_norm": 0.5231481481481481, + "acc_norm_stderr": 0.03406315360711507 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.8578431372549019, + "acc_stderr": 0.02450980392156861, + "acc_norm": 0.8578431372549019, + "acc_norm_stderr": 0.02450980392156861 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.8438818565400844, + "acc_stderr": 0.023627159460318688, + "acc_norm": 0.8438818565400844, + "acc_norm_stderr": 0.023627159460318688 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.7174887892376681, + "acc_stderr": 0.03021683101150878, + "acc_norm": 0.7174887892376681, + "acc_norm_stderr": 0.03021683101150878 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.7251908396946565, + "acc_stderr": 0.039153454088478354, + "acc_norm": 0.7251908396946565, + "acc_norm_stderr": 0.039153454088478354 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.8347107438016529, + "acc_stderr": 0.03390780612972776, + "acc_norm": 0.8347107438016529, + "acc_norm_stderr": 0.03390780612972776 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.75, + "acc_stderr": 0.04186091791394607, + "acc_norm": 0.75, + "acc_norm_stderr": 0.04186091791394607 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.7484662576687117, + "acc_stderr": 0.03408997886857529, + "acc_norm": 0.7484662576687117, + "acc_norm_stderr": 0.03408997886857529 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.5178571428571429, + "acc_stderr": 0.04742762361243011, + "acc_norm": 0.5178571428571429, + "acc_norm_stderr": 0.04742762361243011 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.7961165048543689, + "acc_stderr": 0.039891398595317706, + 
"acc_norm": 0.7961165048543689, + "acc_norm_stderr": 0.039891398595317706 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.8760683760683761, + "acc_stderr": 0.02158649400128137, + "acc_norm": 0.8760683760683761, + "acc_norm_stderr": 0.02158649400128137 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.7, + "acc_stderr": 0.046056618647183814, + "acc_norm": 0.7, + "acc_norm_stderr": 0.046056618647183814 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.8007662835249042, + "acc_stderr": 0.01428337804429641, + "acc_norm": 0.8007662835249042, + "acc_norm_stderr": 0.01428337804429641 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.7225433526011561, + "acc_stderr": 0.024105712607754307, + "acc_norm": 0.7225433526011561, + "acc_norm_stderr": 0.024105712607754307 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.4893854748603352, + "acc_stderr": 0.01671873294119211, + "acc_norm": 0.4893854748603352, + "acc_norm_stderr": 0.01671873294119211 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.6993464052287581, + "acc_stderr": 0.026256053835718964, + "acc_norm": 0.6993464052287581, + "acc_norm_stderr": 0.026256053835718964 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.729903536977492, + "acc_stderr": 0.02521804037341063, + "acc_norm": 0.729903536977492, + "acc_norm_stderr": 0.02521804037341063 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.7469135802469136, + "acc_stderr": 0.024191808600712995, + "acc_norm": 0.7469135802469136, + "acc_norm_stderr": 0.024191808600712995 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.5106382978723404, + "acc_stderr": 0.02982074719142244, + "acc_norm": 0.5106382978723404, + "acc_norm_stderr": 0.02982074719142244 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.5273794002607562, + "acc_stderr": 0.012751075788015065, + "acc_norm": 0.5273794002607562, + "acc_norm_stderr": 0.012751075788015065 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.5955882352941176, + "acc_stderr": 0.02981263070156974, + "acc_norm": 0.5955882352941176, + "acc_norm_stderr": 0.02981263070156974 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.6928104575163399, + "acc_stderr": 0.018663359671463667, + "acc_norm": 0.6928104575163399, + "acc_norm_stderr": 0.018663359671463667 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.6636363636363637, + "acc_stderr": 0.04525393596302505, + "acc_norm": 0.6636363636363637, + "acc_norm_stderr": 0.04525393596302505 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.7714285714285715, + "acc_stderr": 0.026882144922307744, + "acc_norm": 0.7714285714285715, + "acc_norm_stderr": 0.026882144922307744 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.8258706467661692, + "acc_stderr": 0.026814951200421603, + "acc_norm": 0.8258706467661692, + "acc_norm_stderr": 0.026814951200421603 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.9, + "acc_stderr": 0.030151134457776348, + "acc_norm": 0.9, + "acc_norm_stderr": 0.030151134457776348 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.536144578313253, + "acc_stderr": 0.038823108508905954, + "acc_norm": 0.536144578313253, + "acc_norm_stderr": 0.038823108508905954 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.7953216374269005, + "acc_stderr": 0.03094445977853321, + "acc_norm": 0.7953216374269005, + "acc_norm_stderr": 0.03094445977853321 + }, + "harness|truthfulqa:mc|0": { + "mc1": 
0.28886168910648713, + "mc1_stderr": 0.01586634640138431, + "mc2": 0.45354446138413185, + "mc2_stderr": 0.014302928533725114 + }, + "all": { + "acc": 0.6413452736121387, + "acc_stderr": 0.03283075263577414, + "acc_norm": 0.645403081861628, + "acc_norm_stderr": 0.03280660240295225, + "mc1": 0.28886168910648713, + "mc1_stderr": 0.01586634640138431, + "mc2": 0.45354446138413185, + "mc2_stderr": 0.014302928533725114 + } + }, + "versions": { + "harness|arc:challenge|25": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "all": 0 + }, + "config": { + "model_name": "lilloukas/Platypus-30B", + "model_sha": "979ad39b58a8e4a9419b7bc7a0dc8419f3912e71", + "model_dtype": 
"torch.float16", + "lighteval_sha": "43cff840721bd0214adb4e29236a5e2ca1813937", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null + }, + "task_config": { + "harness|arc:challenge": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + 
"harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task" + }, + "hashes": { + "harness|arc:challenge|25": { + "hash_examples": "fb8c51b1872daeda", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "61571bf68d6d89aa", + "hash_cont_tokens": "8210decc6ff6f7df" + }, + "harness|hellaswag|10": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "29906669b1c7054a", + "hash_cont_tokens": "b3b9e9017afa63af" + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "c54ff61ad0273dd7", + "hash_cont_tokens": "50421e30bef398f9" + }, + "harness|hendrycksTest-anatomy|5": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "be31a1e22aef5f90", + "hash_cont_tokens": "f11971a765cb609f" + }, + "harness|hendrycksTest-astronomy|5": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "277a7b1fad566940", + "hash_cont_tokens": "bf30e5d3f48250cb" + }, + "harness|hendrycksTest-business_ethics|5": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "ba552605bc116de5", + "hash_cont_tokens": "bc1dd9b2d995eb61" + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "428c7563d0b98ab9", + "hash_cont_tokens": "890a119624b3b935" + }, + "harness|hendrycksTest-college_biology|5": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "da036601573942e2", + "hash_cont_tokens": "875cde3af7a0ee14" + }, + "harness|hendrycksTest-college_chemistry|5": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "94e0196d6aded13d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "harness|hendrycksTest-college_computer_science|5": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "6e4d0f4a8d36690b", + "hash_cont_tokens": "ffc0fe414cdc4a83" + }, + "harness|hendrycksTest-college_mathematics|5": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "614054d17109a25d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "harness|hendrycksTest-college_medicine|5": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "1d633b3cc0524ba8", + "hash_cont_tokens": "1f88b00d41957d82" + }, + "harness|hendrycksTest-college_physics|5": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "5421d9a1af86cbd4", + "hash_cont_tokens": "f7b8097afc16a47c" + }, + "harness|hendrycksTest-computer_security|5": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "5e6b70ecb333cf18", + "hash_cont_tokens": "50421e30bef398f9" + }, + 
"harness|hendrycksTest-conceptual_physics|5": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "c2ef11a87264ceed", + "hash_cont_tokens": "aa0e8bc655f2f641" + }, + "harness|hendrycksTest-econometrics|5": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "ecaccd912a4c3978", + "hash_cont_tokens": "bfb7e3c3c88313f1" + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "1590c84291399be8", + "hash_cont_tokens": "2425a3f084a591ef" + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "3269597f715b0da1", + "hash_cont_tokens": "f52691aef15a407b" + }, + "harness|hendrycksTest-formal_logic|5": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "a2800d20f3ab8d7c", + "hash_cont_tokens": "f515d598d9c21263" + }, + "harness|hendrycksTest-global_facts|5": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "94ed44b3772505ad", + "hash_cont_tokens": "50421e30bef398f9" + }, + "harness|hendrycksTest-high_school_biology|5": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "24423acb928db768", + "hash_cont_tokens": "bd85a4156a3613ee" + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "831ff35c474e5cef", + "hash_cont_tokens": "a95c97af1c14e068" + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "8c34e0f2bda77358", + "hash_cont_tokens": "8abfedef914e33c9" + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "f1f73dd687da18d7", + "hash_cont_tokens": "674fc454bdc5ac93" + }, + "harness|hendrycksTest-high_school_geography|5": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "7c5547c7da5bc793", + "hash_cont_tokens": "03a5012b916274ea" + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "f62991cb6a496b05", + "hash_cont_tokens": "a83effb8f76b7d7c" + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "4cef2aff6e3d59ed", + "hash_cont_tokens": "c583432ad27fcfe0" + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "6e2577ea4082ed2b", + "hash_cont_tokens": "24f5dc613660300b" + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "c5fc9aeb1079c8e4", + "hash_cont_tokens": "f47f041de50333b9" + }, + "harness|hendrycksTest-high_school_physics|5": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "555fc385cffa84ca", + 
"hash_cont_tokens": "ba2efcd283e938cc" + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "febd23cbf9973b7f", + "hash_cont_tokens": "942069cd363844d9" + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "424b02981230ee83", + "hash_cont_tokens": "955ed42b6f7fa019" + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "50c9ff438c85a69e", + "hash_cont_tokens": "cdd0b3dc06d933e5" + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "054824cc474caef5", + "hash_cont_tokens": "9a864184946033ac" + }, + "harness|hendrycksTest-human_aging|5": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "541a75f071dcf579", + "hash_cont_tokens": "142a4a8a1138a214" + }, + "harness|hendrycksTest-human_sexuality|5": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "04269e5c5a257dd9", + "hash_cont_tokens": "bc54813e809b796d" + }, + "harness|hendrycksTest-international_law|5": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "d93ba9d9d38e4397", + "hash_cont_tokens": "dc45b45fcda18e5d" + }, + "harness|hendrycksTest-jurisprudence|5": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "9eeaccd2698b4f5a", + "hash_cont_tokens": "e3a8cd951b6e3469" + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "b4f08f544f2b7576", + "hash_cont_tokens": "1e80dbd30f6453d5" + }, + "harness|hendrycksTest-machine_learning|5": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "900c2a51f1174b9f", + "hash_cont_tokens": "9b37da7777378ca9" + }, + "harness|hendrycksTest-management|5": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "6b36efb4689c6eca", + "hash_cont_tokens": "a01d6d39a83c4597" + }, + "harness|hendrycksTest-marketing|5": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "2aaac78a0cfed47a", + "hash_cont_tokens": "6aeaed4d823c98aa" + }, + "harness|hendrycksTest-medical_genetics|5": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "886ca823b41c094a", + "hash_cont_tokens": "50421e30bef398f9" + }, + "harness|hendrycksTest-miscellaneous|5": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "72fd71de7675e7d0", + "hash_cont_tokens": "9b0ab02a64603081" + }, + "harness|hendrycksTest-moral_disputes|5": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "f3ca0dd8e7a1eb09", + "hash_cont_tokens": "8badf768f7b0467a" + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "3e793631e951f23c", + "hash_cont_tokens": "32ae620376b2bbba" + }, + 
"harness|hendrycksTest-nutrition|5": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "59753c2144ea93af", + "hash_cont_tokens": "3071def75bacc404" + }, + "harness|hendrycksTest-philosophy|5": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "bd8d3dbed15a8c34", + "hash_cont_tokens": "9f6ff69d23a48783" + }, + "harness|hendrycksTest-prehistory|5": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "3573cd87facbb7c5", + "hash_cont_tokens": "de469d2b981e32a3" + }, + "harness|hendrycksTest-professional_accounting|5": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "17e721bc1a7cbb47", + "hash_cont_tokens": "c46f74d2dfc7b13b" + }, + "harness|hendrycksTest-professional_law|5": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "9178e10bd0763ec4", + "hash_cont_tokens": "2e590029ef41fbcd" + }, + "harness|hendrycksTest-professional_medicine|5": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "f5a22012a54f70ea", + "hash_cont_tokens": "fe35cfa9c6ca802e" + }, + "harness|hendrycksTest-professional_psychology|5": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "0dfb73a8eb3f692c", + "hash_cont_tokens": "f020fbddf72c8652" + }, + "harness|hendrycksTest-public_relations|5": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "1710c6ba4c9f3cbd", + "hash_cont_tokens": "568f585a259965c1" + }, + "harness|hendrycksTest-security_studies|5": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "d49711415961ced7", + "hash_cont_tokens": "cc6fd7cccd64cd5d" + }, + "harness|hendrycksTest-sociology|5": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "828999f7624cbe7e", + "hash_cont_tokens": "c3a3bdfd177eed5b" + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "42054621e718dbee", + "hash_cont_tokens": "2568d0e8e36fa959" + }, + "harness|hendrycksTest-virology|5": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "6c4f0aa4dc859c04", + "hash_cont_tokens": "926cf60b0891f374" + }, + "harness|hendrycksTest-world_religions|5": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "6c75d44e092ff24f", + "hash_cont_tokens": "c525a5de974c1ea3" + }, + "harness|truthfulqa:mc|0": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "2738d7ed7075faa7", + "hash_cont_tokens": "c014154380b74b9e" + } + } +} \ No newline at end of file diff --git a/eval-results/lilloukas/Platypus-30B/results_2023-09-17T05-57-25.138979.json b/eval-results/lilloukas/Platypus-30B/results_2023-09-17T05-57-25.138979.json new file mode 100644 index 0000000000000000000000000000000000000000..687e79eb3ec7f0233de1929f75cdeeb842aacaa0 --- /dev/null +++ b/eval-results/lilloukas/Platypus-30B/results_2023-09-17T05-57-25.138979.json @@ -0,0 +1,107 @@ +{ + "config_general": { + "model_name": "lilloukas/Platypus-30B", + "model_sha": 
"c5d21054f8dd71099696bd7790df07ac54990f29", + "model_size": "60.65 GB", + "model_dtype": "torch.float16", + "lighteval_sha": "0f318ecf002208468154899217b3ba7c6ae09374", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|drop|3": { + "em": 0.4152684563758389, + "em_stderr": 0.005046408282247135, + "f1": 0.4565257969798663, + "f1_stderr": 0.004890389225361096 + }, + "harness|gsm8k|5": { + "acc": 0.14404852160727824, + "acc_stderr": 0.009672110973065282 + }, + "harness|winogrande|5": { + "acc": 0.813733228097869, + "acc_stderr": 0.010941877955676211 + }, + "all": { + "em": 0.4152684563758389, + "em_stderr": 0.005046408282247135, + "f1": 0.4565257969798663, + "f1_stderr": 0.004890389225361096, + "acc": 0.4788908748525736, + "acc_stderr": 0.010306994464370747 + } + }, + "versions": { + "harness|drop|3": 1, + "harness|gsm8k|5": 0, + "harness|winogrande|5": 0, + "all": 0 + }, + "config_tasks": { + "harness|drop": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|drop|3": { + "hashes": { + "hash_examples": "1d27416e8324e9a3", + "hash_full_prompts": "a5513ff9a741b385", + "hash_input_tokens": "61b608e0b5ceed76", + "hash_cont_tokens": "271f73b75104c642" + }, + "truncated": 1263, + "non-truncated": 8273, + "padded": 0, + "non-padded": 9536, + "effective_few_shots": 3.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "bda342e47b5099b2", + "hash_cont_tokens": "fb0f0a02cd1a4b11" + }, + "truncated": 0, + "non-truncated": 1319, + "padded": 0, + "non-padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "c0bedf98cb040854", + "hash_cont_tokens": "f08975ad6f2d5864" + }, + "truncated": 0, + "non-truncated": 2534, + "padded": 2432, + "non-padded": 102, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "9b4d8993161e637d", + "hash_full_prompts": "08215e527b7e60a5", + "hash_input_tokens": "80afe720f936f8d2", + "hash_cont_tokens": "db36d97895e0d382" + }, + "total_evaluation_time_secondes": "18774.01301765442", + "truncated": 1263, + "non-truncated": 12126, + "padded": 2432, + "non-padded": 10957, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/llm-agents/tora-13b-v1.0/results_2023-10-10T15-17-02.134278.json b/eval-results/llm-agents/tora-13b-v1.0/results_2023-10-10T15-17-02.134278.json new file mode 100644 index 0000000000000000000000000000000000000000..ed4202a42d2a074941be27fd50575a556849d800 --- /dev/null +++ b/eval-results/llm-agents/tora-13b-v1.0/results_2023-10-10T15-17-02.134278.json @@ -0,0 +1,1367 @@ +{ + "config_general": { + "model_name": "llm-agents/tora-13b-v1.0", + "model_sha": "0636c1f582c979a5a292cc5f3dc293800b1494e2", + "model_size": "24.32 GB", + "model_dtype": "torch.float16", + "lighteval_sha": "0f318ecf002208468154899217b3ba7c6ae09374", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|arc:challenge|25": { + "acc": 0.5580204778156996, + "acc_stderr": 0.014512682523128342, + "acc_norm": 
0.5895904436860068, + "acc_norm_stderr": 0.014374922192642664 + }, + "harness|hellaswag|10": { + "acc": 0.6360286795459071, + "acc_stderr": 0.004801572028920793, + "acc_norm": 0.8231428002389962, + "acc_norm_stderr": 0.0038076803311729033 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.32, + "acc_stderr": 0.04688261722621503, + "acc_norm": 0.32, + "acc_norm_stderr": 0.04688261722621503 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.5037037037037037, + "acc_stderr": 0.04319223625811331, + "acc_norm": 0.5037037037037037, + "acc_norm_stderr": 0.04319223625811331 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.5197368421052632, + "acc_stderr": 0.040657710025626036, + "acc_norm": 0.5197368421052632, + "acc_norm_stderr": 0.040657710025626036 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.54, + "acc_stderr": 0.05009082659620332, + "acc_norm": 0.54, + "acc_norm_stderr": 0.05009082659620332 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.6188679245283019, + "acc_stderr": 0.029890609686286637, + "acc_norm": 0.6188679245283019, + "acc_norm_stderr": 0.029890609686286637 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.5763888888888888, + "acc_stderr": 0.0413212501972337, + "acc_norm": 0.5763888888888888, + "acc_norm_stderr": 0.0413212501972337 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.45, + "acc_stderr": 0.05, + "acc_norm": 0.45, + "acc_norm_stderr": 0.05 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.49, + "acc_stderr": 0.05024183937956912, + "acc_norm": 0.49, + "acc_norm_stderr": 0.05024183937956912 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.31, + "acc_stderr": 0.04648231987117316, + "acc_norm": 0.31, + "acc_norm_stderr": 0.04648231987117316 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.5144508670520231, + "acc_stderr": 0.03810871630454764, + "acc_norm": 0.5144508670520231, + "acc_norm_stderr": 0.03810871630454764 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.2647058823529412, + "acc_stderr": 0.04389869956808778, + "acc_norm": 0.2647058823529412, + "acc_norm_stderr": 0.04389869956808778 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.72, + "acc_stderr": 0.04512608598542128, + "acc_norm": 0.72, + "acc_norm_stderr": 0.04512608598542128 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.4, + "acc_stderr": 0.03202563076101735, + "acc_norm": 0.4, + "acc_norm_stderr": 0.03202563076101735 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.2982456140350877, + "acc_stderr": 0.04303684033537315, + "acc_norm": 0.2982456140350877, + "acc_norm_stderr": 0.04303684033537315 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.4689655172413793, + "acc_stderr": 0.04158632762097828, + "acc_norm": 0.4689655172413793, + "acc_norm_stderr": 0.04158632762097828 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.36507936507936506, + "acc_stderr": 0.02479606060269995, + "acc_norm": 0.36507936507936506, + "acc_norm_stderr": 0.02479606060269995 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.40476190476190477, + "acc_stderr": 0.04390259265377562, + "acc_norm": 0.40476190476190477, + "acc_norm_stderr": 0.04390259265377562 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.31, + "acc_stderr": 0.04648231987117316, + "acc_norm": 0.31, + "acc_norm_stderr": 0.04648231987117316 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 
0.6419354838709678, + "acc_stderr": 0.027273890594300645, + "acc_norm": 0.6419354838709678, + "acc_norm_stderr": 0.027273890594300645 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.4236453201970443, + "acc_stderr": 0.03476725747649037, + "acc_norm": 0.4236453201970443, + "acc_norm_stderr": 0.03476725747649037 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.53, + "acc_stderr": 0.050161355804659205, + "acc_norm": 0.53, + "acc_norm_stderr": 0.050161355804659205 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.6606060606060606, + "acc_stderr": 0.03697442205031596, + "acc_norm": 0.6606060606060606, + "acc_norm_stderr": 0.03697442205031596 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.7121212121212122, + "acc_stderr": 0.03225883512300992, + "acc_norm": 0.7121212121212122, + "acc_norm_stderr": 0.03225883512300992 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.7875647668393783, + "acc_stderr": 0.029519282616817234, + "acc_norm": 0.7875647668393783, + "acc_norm_stderr": 0.029519282616817234 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.5333333333333333, + "acc_stderr": 0.02529460802398647, + "acc_norm": 0.5333333333333333, + "acc_norm_stderr": 0.02529460802398647 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.31851851851851853, + "acc_stderr": 0.02840653309060846, + "acc_norm": 0.31851851851851853, + "acc_norm_stderr": 0.02840653309060846 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.542016806722689, + "acc_stderr": 0.03236361111951941, + "acc_norm": 0.542016806722689, + "acc_norm_stderr": 0.03236361111951941 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.37748344370860926, + "acc_stderr": 0.03958027231121569, + "acc_norm": 0.37748344370860926, + "acc_norm_stderr": 0.03958027231121569 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.7412844036697248, + "acc_stderr": 0.018776052319619627, + "acc_norm": 0.7412844036697248, + "acc_norm_stderr": 0.018776052319619627 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.4074074074074074, + "acc_stderr": 0.03350991604696042, + "acc_norm": 0.4074074074074074, + "acc_norm_stderr": 0.03350991604696042 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.7549019607843137, + "acc_stderr": 0.030190282453501947, + "acc_norm": 0.7549019607843137, + "acc_norm_stderr": 0.030190282453501947 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.70042194092827, + "acc_stderr": 0.029818024749753095, + "acc_norm": 0.70042194092827, + "acc_norm_stderr": 0.029818024749753095 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.6278026905829597, + "acc_stderr": 0.03244305283008731, + "acc_norm": 0.6278026905829597, + "acc_norm_stderr": 0.03244305283008731 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.6412213740458015, + "acc_stderr": 0.04206739313864908, + "acc_norm": 0.6412213740458015, + "acc_norm_stderr": 0.04206739313864908 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.7024793388429752, + "acc_stderr": 0.04173349148083499, + "acc_norm": 0.7024793388429752, + "acc_norm_stderr": 0.04173349148083499 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.7407407407407407, + "acc_stderr": 0.042365112580946336, + "acc_norm": 0.7407407407407407, + "acc_norm_stderr": 0.042365112580946336 + }, + 
"harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.6134969325153374, + "acc_stderr": 0.03825825548848607, + "acc_norm": 0.6134969325153374, + "acc_norm_stderr": 0.03825825548848607 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.2857142857142857, + "acc_stderr": 0.042878587513404565, + "acc_norm": 0.2857142857142857, + "acc_norm_stderr": 0.042878587513404565 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.7572815533980582, + "acc_stderr": 0.04245022486384495, + "acc_norm": 0.7572815533980582, + "acc_norm_stderr": 0.04245022486384495 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.8247863247863247, + "acc_stderr": 0.024904439098918214, + "acc_norm": 0.8247863247863247, + "acc_norm_stderr": 0.024904439098918214 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.51, + "acc_stderr": 0.05024183937956914, + "acc_norm": 0.51, + "acc_norm_stderr": 0.05024183937956914 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.735632183908046, + "acc_stderr": 0.015769984840690518, + "acc_norm": 0.735632183908046, + "acc_norm_stderr": 0.015769984840690518 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.6329479768786127, + "acc_stderr": 0.02595005433765407, + "acc_norm": 0.6329479768786127, + "acc_norm_stderr": 0.02595005433765407 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.27150837988826815, + "acc_stderr": 0.014874252168095277, + "acc_norm": 0.27150837988826815, + "acc_norm_stderr": 0.014874252168095277 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.5947712418300654, + "acc_stderr": 0.02811092849280907, + "acc_norm": 0.5947712418300654, + "acc_norm_stderr": 0.02811092849280907 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.6205787781350482, + "acc_stderr": 0.027559949802347813, + "acc_norm": 0.6205787781350482, + "acc_norm_stderr": 0.027559949802347813 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.6141975308641975, + "acc_stderr": 0.027085401226132143, + "acc_norm": 0.6141975308641975, + "acc_norm_stderr": 0.027085401226132143 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.40070921985815605, + "acc_stderr": 0.029233465745573083, + "acc_norm": 0.40070921985815605, + "acc_norm_stderr": 0.029233465745573083 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.408735332464146, + "acc_stderr": 0.012555701346703384, + "acc_norm": 0.408735332464146, + "acc_norm_stderr": 0.012555701346703384 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.4889705882352941, + "acc_stderr": 0.030365446477275675, + "acc_norm": 0.4889705882352941, + "acc_norm_stderr": 0.030365446477275675 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.5326797385620915, + "acc_stderr": 0.0201845833591022, + "acc_norm": 0.5326797385620915, + "acc_norm_stderr": 0.0201845833591022 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.6636363636363637, + "acc_stderr": 0.04525393596302506, + "acc_norm": 0.6636363636363637, + "acc_norm_stderr": 0.04525393596302506 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.5877551020408164, + "acc_stderr": 0.03151236044674269, + "acc_norm": 0.5877551020408164, + "acc_norm_stderr": 0.03151236044674269 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.7213930348258707, + "acc_stderr": 0.031700561834973086, + "acc_norm": 0.7213930348258707, + "acc_norm_stderr": 0.031700561834973086 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.82, + "acc_stderr": 
0.038612291966536934, + "acc_norm": 0.82, + "acc_norm_stderr": 0.038612291966536934 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.43373493975903615, + "acc_stderr": 0.03858158940685517, + "acc_norm": 0.43373493975903615, + "acc_norm_stderr": 0.03858158940685517 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.7602339181286549, + "acc_stderr": 0.03274485211946956, + "acc_norm": 0.7602339181286549, + "acc_norm_stderr": 0.03274485211946956 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.2802937576499388, + "mc1_stderr": 0.015723139524608763, + "mc2": 0.4025446800568436, + "mc2_stderr": 0.015003901494005132 + }, + "all": { + "acc": 0.5490034558552397, + "acc_stderr": 0.03446388198618693, + "acc_norm": 0.5527099657969922, + "acc_norm_stderr": 0.0344447014433355, + "mc1": 0.2802937576499388, + "mc1_stderr": 0.015723139524608763, + "mc2": 0.4025446800568436, + "mc2_stderr": 0.015003901494005132 + } + }, + "versions": { + "harness|arc:challenge|25": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + 
"harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "all": 0 + }, + "config_tasks": { + "harness|arc:challenge": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + 
"harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task" + }, + "summary_tasks": { + "harness|arc:challenge|25": { + "hashes": { + "hash_examples": "17b0cae357c0259e", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "3722289b79076c44", + "hash_cont_tokens": "e8abf848493b50f7" + }, + "truncated": 0, + "non-truncated": 4687, + "padded": 4687, + "non-padded": 0, + "effective_few_shots": 25.0, + "num_truncated_few_shots": 0 + }, + "harness|hellaswag|10": { + "hashes": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "ececd684171f1ef2", + "hash_cont_tokens": "9fe0a5c42e1532db" + }, + "truncated": 0, + "non-truncated": 40168, + "padded": 40113, + "non-padded": 55, + "effective_few_shots": 10.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hashes": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "c54ff61ad0273dd7", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-anatomy|5": { + "hashes": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "be31a1e22aef5f90", + "hash_cont_tokens": "f11971a765cb609f" + }, + "truncated": 0, + "non-truncated": 540, + "padded": 540, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-astronomy|5": { + "hashes": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "277a7b1fad566940", + "hash_cont_tokens": "440a970fadecdc7b" + }, + "truncated": 0, + "non-truncated": 608, + "padded": 608, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-business_ethics|5": { + "hashes": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "ba552605bc116de5", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hashes": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "428c7563d0b98ab9", + "hash_cont_tokens": "7ecd60c25b9bfe5b" + }, + "truncated": 0, + "non-truncated": 1060, + "padded": 1060, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_biology|5": { + "hashes": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "da036601573942e2", + "hash_cont_tokens": "875cde3af7a0ee14" + }, + "truncated": 0, + 
"non-truncated": 576, + "padded": 576, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_chemistry|5": { + "hashes": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "94e0196d6aded13d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_computer_science|5": { + "hashes": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "6e4d0f4a8d36690b", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_mathematics|5": { + "hashes": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "614054d17109a25d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_medicine|5": { + "hashes": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "081bb2b524defd1c", + "hash_cont_tokens": "702fb6d82ff0d6ac" + }, + "truncated": 0, + "non-truncated": 692, + "padded": 692, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_physics|5": { + "hashes": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "5421d9a1af86cbd4", + "hash_cont_tokens": "f7b8097afc16a47c" + }, + "truncated": 0, + "non-truncated": 408, + "padded": 408, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-computer_security|5": { + "hashes": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "5e6b70ecb333cf18", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hashes": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "c2ef11a87264ceed", + "hash_cont_tokens": "aa0e8bc655f2f641" + }, + "truncated": 0, + "non-truncated": 940, + "padded": 940, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-econometrics|5": { + "hashes": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "ecaccd912a4c3978", + "hash_cont_tokens": "b1cc6e7e9fcd3827" + }, + "truncated": 0, + "non-truncated": 456, + "padded": 456, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hashes": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "1590c84291399be8", + "hash_cont_tokens": "2425a3f084a591ef" + }, + "truncated": 0, + "non-truncated": 580, + "padded": 580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + 
"harness|hendrycksTest-elementary_mathematics|5": { + "hashes": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "3269597f715b0da1", + "hash_cont_tokens": "bd87bf0c060fd925" + }, + "truncated": 0, + "non-truncated": 1512, + "padded": 1512, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-formal_logic|5": { + "hashes": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "a2800d20f3ab8d7c", + "hash_cont_tokens": "eb8932890e0605db" + }, + "truncated": 0, + "non-truncated": 504, + "padded": 504, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-global_facts|5": { + "hashes": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "94ed44b3772505ad", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_biology|5": { + "hashes": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "24423acb928db768", + "hash_cont_tokens": "1ddcb86d28cde266" + }, + "truncated": 0, + "non-truncated": 1240, + "padded": 1240, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hashes": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "831ff35c474e5cef", + "hash_cont_tokens": "176c8dcff38c5f8f" + }, + "truncated": 0, + "non-truncated": 812, + "padded": 812, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hashes": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "a20a96b44dcc5b30", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hashes": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "5002f4ac8b1562ca", + "hash_cont_tokens": "674fc454bdc5ac93" + }, + "truncated": 0, + "non-truncated": 660, + "padded": 656, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_geography|5": { + "hashes": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "7c5547c7da5bc793", + "hash_cont_tokens": "03a5012b916274ea" + }, + "truncated": 0, + "non-truncated": 792, + "padded": 792, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hashes": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "f62991cb6a496b05", + "hash_cont_tokens": "873d2aab226ba1d8" + }, + "truncated": 0, + "non-truncated": 772, + "padded": 772, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hashes": { + "hash_examples": 
"59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "4cef2aff6e3d59ed", + "hash_cont_tokens": "c583432ad27fcfe0" + }, + "truncated": 0, + "non-truncated": 1560, + "padded": 1560, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hashes": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "6e2577ea4082ed2b", + "hash_cont_tokens": "d7907b61bcb8c123" + }, + "truncated": 0, + "non-truncated": 1080, + "padded": 1080, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hashes": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "c5fc9aeb1079c8e4", + "hash_cont_tokens": "f47f041de50333b9" + }, + "truncated": 0, + "non-truncated": 952, + "padded": 952, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_physics|5": { + "hashes": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "555fc385cffa84ca", + "hash_cont_tokens": "0d56317b3e5eedb5" + }, + "truncated": 0, + "non-truncated": 604, + "padded": 604, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hashes": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "febd23cbf9973b7f", + "hash_cont_tokens": "09ba1243e7390c0f" + }, + "truncated": 0, + "non-truncated": 2180, + "padded": 2180, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hashes": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "400e55b56ee6fbd7", + "hash_cont_tokens": "9cc29889c3d3f77d" + }, + "truncated": 0, + "non-truncated": 864, + "padded": 864, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hashes": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "c639cce12a46ebad", + "hash_cont_tokens": "cdd0b3dc06d933e5" + }, + "truncated": 0, + "non-truncated": 816, + "padded": 816, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hashes": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "b9762065cce6f3a6", + "hash_cont_tokens": "e02816433ff28daf" + }, + "truncated": 0, + "non-truncated": 948, + "padded": 948, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_aging|5": { + "hashes": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "541a75f071dcf579", + "hash_cont_tokens": "142a4a8a1138a214" + }, + "truncated": 0, + "non-truncated": 892, + "padded": 892, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_sexuality|5": { + "hashes": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "04269e5c5a257dd9", + 
"hash_cont_tokens": "bc54813e809b796d" + }, + "truncated": 0, + "non-truncated": 524, + "padded": 524, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-international_law|5": { + "hashes": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "d93ba9d9d38e4397", + "hash_cont_tokens": "8ea8c5ff76a15bca" + }, + "truncated": 0, + "non-truncated": 484, + "padded": 484, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-jurisprudence|5": { + "hashes": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "9eeaccd2698b4f5a", + "hash_cont_tokens": "e3a8cd951b6e3469" + }, + "truncated": 0, + "non-truncated": 432, + "padded": 432, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hashes": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "b4f08f544f2b7576", + "hash_cont_tokens": "3e9e0bdc248fd88a" + }, + "truncated": 0, + "non-truncated": 652, + "padded": 648, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-machine_learning|5": { + "hashes": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "900c2a51f1174b9f", + "hash_cont_tokens": "55b12fb138c6a064" + }, + "truncated": 0, + "non-truncated": 448, + "padded": 448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-management|5": { + "hashes": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "6b36efb4689c6eca", + "hash_cont_tokens": "a01d6d39a83c4597" + }, + "truncated": 0, + "non-truncated": 412, + "padded": 412, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-marketing|5": { + "hashes": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "2aaac78a0cfed47a", + "hash_cont_tokens": "6aeaed4d823c98aa" + }, + "truncated": 0, + "non-truncated": 936, + "padded": 936, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-medical_genetics|5": { + "hashes": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "886ca823b41c094a", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-miscellaneous|5": { + "hashes": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "72fd71de7675e7d0", + "hash_cont_tokens": "9b0ab02a64603081" + }, + "truncated": 0, + "non-truncated": 3132, + "padded": 3132, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_disputes|5": { + "hashes": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "f3ca0dd8e7a1eb09", + "hash_cont_tokens": "3b8bbe9108e55ce9" + }, + "truncated": 0, + "non-truncated": 1384, + "padded": 1354, + "non-padded": 30, + "effective_few_shots": 5.0, + 
"num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hashes": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "3e793631e951f23c", + "hash_cont_tokens": "3e9bfc0362e97330" + }, + "truncated": 0, + "non-truncated": 3580, + "padded": 3580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-nutrition|5": { + "hashes": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "59753c2144ea93af", + "hash_cont_tokens": "23b2dc6ee2da4cfc" + }, + "truncated": 0, + "non-truncated": 1224, + "padded": 1224, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-philosophy|5": { + "hashes": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "bd8d3dbed15a8c34", + "hash_cont_tokens": "9f6ff69d23a48783" + }, + "truncated": 0, + "non-truncated": 1244, + "padded": 1244, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-prehistory|5": { + "hashes": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "3573cd87facbb7c5", + "hash_cont_tokens": "d6458d743d875837" + }, + "truncated": 0, + "non-truncated": 1296, + "padded": 1296, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_accounting|5": { + "hashes": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "17e721bc1a7cbb47", + "hash_cont_tokens": "922a195f53a35662" + }, + "truncated": 0, + "non-truncated": 1128, + "padded": 1128, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_law|5": { + "hashes": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "c9f7583fff66d361", + "hash_cont_tokens": "2e590029ef41fbcd" + }, + "truncated": 0, + "non-truncated": 6136, + "padded": 6136, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_medicine|5": { + "hashes": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "40a933f829116f8d", + "hash_cont_tokens": "7cfee54dbddd5a98" + }, + "truncated": 0, + "non-truncated": 1088, + "padded": 1088, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_psychology|5": { + "hashes": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "0dfb73a8eb3f692c", + "hash_cont_tokens": "a86677b2a45c20e1" + }, + "truncated": 0, + "non-truncated": 2448, + "padded": 2448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-public_relations|5": { + "hashes": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "1710c6ba4c9f3cbd", + "hash_cont_tokens": "0d756ccaae031757" + }, + "truncated": 0, + "non-truncated": 440, + "padded": 440, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-security_studies|5": { + "hashes": { + "hash_examples": "62bb8197e63d60d4", + 
"hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "32a03f1f22a6e103", + "hash_cont_tokens": "b2229bc2cfbf594b" + }, + "truncated": 0, + "non-truncated": 980, + "padded": 980, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-sociology|5": { + "hashes": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "828999f7624cbe7e", + "hash_cont_tokens": "c3a3bdfd177eed5b" + }, + "truncated": 0, + "non-truncated": 804, + "padded": 804, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hashes": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "42054621e718dbee", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-virology|5": { + "hashes": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "6c4f0aa4dc859c04", + "hash_cont_tokens": "af8b3658088cb37f" + }, + "truncated": 0, + "non-truncated": 664, + "padded": 664, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-world_religions|5": { + "hashes": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "6c75d44e092ff24f", + "hash_cont_tokens": "060118bef6de4e0a" + }, + "truncated": 0, + "non-truncated": 684, + "padded": 684, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|truthfulqa:mc|0": { + "hashes": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "2738d7ed7075faa7", + "hash_cont_tokens": "f5da56a132aab151" + }, + "truncated": 0, + "non-truncated": 9996, + "padded": 9996, + "non-padded": 0, + "effective_few_shots": 0.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "d84d18e9a963753d", + "hash_full_prompts": "12b540783521a8e6", + "hash_input_tokens": "5c73a7dce6ccf737", + "hash_cont_tokens": "71d56183130fecbd" + }, + "total_evaluation_time_secondes": "6502.844645023346", + "truncated": 0, + "non-truncated": 111019, + "padded": 110926, + "non-padded": 93, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/llm-agents/tora-13b-v1.0/results_2023-10-29T06-57-18.434824.json b/eval-results/llm-agents/tora-13b-v1.0/results_2023-10-29T06-57-18.434824.json new file mode 100644 index 0000000000000000000000000000000000000000..542f2a43d2a3561dc413f1c02925289ddc692b12 --- /dev/null +++ b/eval-results/llm-agents/tora-13b-v1.0/results_2023-10-29T06-57-18.434824.json @@ -0,0 +1,107 @@ +{ + "config_general": { + "model_name": "llm-agents/tora-13b-v1.0", + "model_sha": "0636c1f582c979a5a292cc5f3dc293800b1494e2", + "model_size": "24.32 GB", + "model_dtype": "torch.float16", + "lighteval_sha": "0f318ecf002208468154899217b3ba7c6ae09374", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|drop|3": { + "em": 0.0024119127516778523, + "em_stderr": 0.0005023380498893349, + "f1": 0.06216652684563757, + "f1_stderr": 0.0014129871021706449 + }, + "harness|gsm8k|5": { + "acc": 0.09855951478392722, 
+ "acc_stderr": 0.008210320350946335 + }, + "harness|winogrande|5": { + "acc": 0.7561168113654302, + "acc_stderr": 0.012068923278908189 + }, + "all": { + "em": 0.0024119127516778523, + "em_stderr": 0.0005023380498893349, + "f1": 0.06216652684563757, + "f1_stderr": 0.0014129871021706449, + "acc": 0.4273381630746787, + "acc_stderr": 0.010139621814927263 + } + }, + "versions": { + "harness|drop|3": 1, + "harness|gsm8k|5": 0, + "harness|winogrande|5": 0, + "all": 0 + }, + "config_tasks": { + "harness|drop": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|drop|3": { + "hashes": { + "hash_examples": "1d27416e8324e9a3", + "hash_full_prompts": "a5513ff9a741b385", + "hash_input_tokens": "42076f0efbb50aa6", + "hash_cont_tokens": "757868298b4853db" + }, + "truncated": 3, + "non-truncated": 9533, + "padded": 0, + "non-padded": 9536, + "effective_few_shots": 3.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "bda342e47b5099b2", + "hash_cont_tokens": "802b4fc9bb4ada2c" + }, + "truncated": 0, + "non-truncated": 1319, + "padded": 0, + "non-padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "c0bedf98cb040854", + "hash_cont_tokens": "f08975ad6f2d5864" + }, + "truncated": 0, + "non-truncated": 2534, + "padded": 2432, + "non-padded": 102, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "9b4d8993161e637d", + "hash_full_prompts": "08215e527b7e60a5", + "hash_input_tokens": "a12f3e3c934bd78b", + "hash_cont_tokens": "4091e1328d0333fb" + }, + "total_evaluation_time_secondes": "11880.883741617203", + "truncated": 3, + "non-truncated": 13386, + "padded": 2432, + "non-padded": 10957, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/llm-agents/tora-13b-v1.0/results_2023-10-29T07-05-06.186132.json b/eval-results/llm-agents/tora-13b-v1.0/results_2023-10-29T07-05-06.186132.json new file mode 100644 index 0000000000000000000000000000000000000000..f097200bc6d0f5297372b1b6454bb2604c0fab06 --- /dev/null +++ b/eval-results/llm-agents/tora-13b-v1.0/results_2023-10-29T07-05-06.186132.json @@ -0,0 +1,107 @@ +{ + "config_general": { + "model_name": "llm-agents/tora-13b-v1.0", + "model_sha": "0636c1f582c979a5a292cc5f3dc293800b1494e2", + "model_size": "24.32 GB", + "model_dtype": "torch.float16", + "lighteval_sha": "0f318ecf002208468154899217b3ba7c6ae09374", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|drop|3": { + "em": 0.0024119127516778523, + "em_stderr": 0.0005023380498893349, + "f1": 0.06216652684563757, + "f1_stderr": 0.0014129871021706449 + }, + "harness|gsm8k|5": { + "acc": 0.09855951478392722, + "acc_stderr": 0.008210320350946335 + }, + "harness|winogrande|5": { + "acc": 0.7561168113654302, + "acc_stderr": 0.012068923278908189 + }, + "all": { + "em": 0.0024119127516778523, + "em_stderr": 0.0005023380498893349, + "f1": 0.06216652684563757, + "f1_stderr": 0.0014129871021706449, + "acc": 0.4273381630746787, + "acc_stderr": 0.010139621814927263 + } + }, + "versions": { + "harness|drop|3": 1, + "harness|gsm8k|5": 0, 
+ "harness|winogrande|5": 0, + "all": 0 + }, + "config_tasks": { + "harness|drop": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|drop|3": { + "hashes": { + "hash_examples": "1d27416e8324e9a3", + "hash_full_prompts": "a5513ff9a741b385", + "hash_input_tokens": "42076f0efbb50aa6", + "hash_cont_tokens": "757868298b4853db" + }, + "truncated": 3, + "non-truncated": 9533, + "padded": 0, + "non-padded": 9536, + "effective_few_shots": 3.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "bda342e47b5099b2", + "hash_cont_tokens": "802b4fc9bb4ada2c" + }, + "truncated": 0, + "non-truncated": 1319, + "padded": 0, + "non-padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "c0bedf98cb040854", + "hash_cont_tokens": "f08975ad6f2d5864" + }, + "truncated": 0, + "non-truncated": 2534, + "padded": 2432, + "non-padded": 102, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "9b4d8993161e637d", + "hash_full_prompts": "08215e527b7e60a5", + "hash_input_tokens": "a12f3e3c934bd78b", + "hash_cont_tokens": "4091e1328d0333fb" + }, + "total_evaluation_time_secondes": "11854.715583562851", + "truncated": 3, + "non-truncated": 13386, + "padded": 2432, + "non-padded": 10957, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/llm-agents/tora-70b-v1.0/results_2023-10-11T01-55-12.712768.json b/eval-results/llm-agents/tora-70b-v1.0/results_2023-10-11T01-55-12.712768.json new file mode 100644 index 0000000000000000000000000000000000000000..63a3c6ffd7fdf83c19765347e76fdeac2839a4a4 --- /dev/null +++ b/eval-results/llm-agents/tora-70b-v1.0/results_2023-10-11T01-55-12.712768.json @@ -0,0 +1,1367 @@ +{ + "config_general": { + "model_name": "llm-agents/tora-70b-v1.0", + "model_sha": "e95fd7daf017e7c414ec07ebef4ddf013c16f9a4", + "model_size": "128.64 GB", + "model_dtype": "torch.float16", + "lighteval_sha": "0f318ecf002208468154899217b3ba7c6ae09374", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|arc:challenge|25": { + "acc": 0.6424914675767918, + "acc_stderr": 0.014005494275916576, + "acc_norm": 0.6774744027303754, + "acc_norm_stderr": 0.013659980894277376 + }, + "harness|hellaswag|10": { + "acc": 0.6677952599083847, + "acc_stderr": 0.004700413824942559, + "acc_norm": 0.8582951603266281, + "acc_norm_stderr": 0.003480344142139512 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.34, + "acc_stderr": 0.04760952285695235, + "acc_norm": 0.34, + "acc_norm_stderr": 0.04760952285695235 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.6296296296296297, + "acc_stderr": 0.04171654161354543, + "acc_norm": 0.6296296296296297, + "acc_norm_stderr": 0.04171654161354543 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.8026315789473685, + "acc_stderr": 0.03238981601699397, + "acc_norm": 0.8026315789473685, + "acc_norm_stderr": 0.03238981601699397 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.73, + "acc_stderr": 0.04461960433384741, + "acc_norm": 0.73, + "acc_norm_stderr": 0.04461960433384741 + }, + 
"harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.7132075471698113, + "acc_stderr": 0.027834912527544074, + "acc_norm": 0.7132075471698113, + "acc_norm_stderr": 0.027834912527544074 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.8194444444444444, + "acc_stderr": 0.03216600808802267, + "acc_norm": 0.8194444444444444, + "acc_norm_stderr": 0.03216600808802267 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.51, + "acc_stderr": 0.05024183937956912, + "acc_norm": 0.51, + "acc_norm_stderr": 0.05024183937956912 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.57, + "acc_stderr": 0.049756985195624284, + "acc_norm": 0.57, + "acc_norm_stderr": 0.049756985195624284 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.41, + "acc_stderr": 0.049431107042371025, + "acc_norm": 0.41, + "acc_norm_stderr": 0.049431107042371025 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.6416184971098265, + "acc_stderr": 0.03656343653353159, + "acc_norm": 0.6416184971098265, + "acc_norm_stderr": 0.03656343653353159 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.39215686274509803, + "acc_stderr": 0.04858083574266345, + "acc_norm": 0.39215686274509803, + "acc_norm_stderr": 0.04858083574266345 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.78, + "acc_stderr": 0.04163331998932262, + "acc_norm": 0.78, + "acc_norm_stderr": 0.04163331998932262 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.6680851063829787, + "acc_stderr": 0.030783736757745657, + "acc_norm": 0.6680851063829787, + "acc_norm_stderr": 0.030783736757745657 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.40350877192982454, + "acc_stderr": 0.046151869625837026, + "acc_norm": 0.40350877192982454, + "acc_norm_stderr": 0.046151869625837026 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.5862068965517241, + "acc_stderr": 0.04104269211806232, + "acc_norm": 0.5862068965517241, + "acc_norm_stderr": 0.04104269211806232 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.42592592592592593, + "acc_stderr": 0.025467149045469546, + "acc_norm": 0.42592592592592593, + "acc_norm_stderr": 0.025467149045469546 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.5238095238095238, + "acc_stderr": 0.04467062628403273, + "acc_norm": 0.5238095238095238, + "acc_norm_stderr": 0.04467062628403273 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.47, + "acc_stderr": 0.05016135580465919, + "acc_norm": 0.47, + "acc_norm_stderr": 0.05016135580465919 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.8225806451612904, + "acc_stderr": 0.021732540689329286, + "acc_norm": 0.8225806451612904, + "acc_norm_stderr": 0.021732540689329286 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.5369458128078818, + "acc_stderr": 0.03508370520442665, + "acc_norm": 0.5369458128078818, + "acc_norm_stderr": 0.03508370520442665 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.7, + "acc_stderr": 0.046056618647183814, + "acc_norm": 0.7, + "acc_norm_stderr": 0.046056618647183814 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.8303030303030303, + "acc_stderr": 0.029311188674983134, + "acc_norm": 0.8303030303030303, + "acc_norm_stderr": 0.029311188674983134 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.8888888888888888, + "acc_stderr": 0.022390787638216763, + "acc_norm": 0.8888888888888888, + 
"acc_norm_stderr": 0.022390787638216763 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.9326424870466321, + "acc_stderr": 0.018088393839078912, + "acc_norm": 0.9326424870466321, + "acc_norm_stderr": 0.018088393839078912 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.7128205128205128, + "acc_stderr": 0.022939925418530616, + "acc_norm": 0.7128205128205128, + "acc_norm_stderr": 0.022939925418530616 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.3333333333333333, + "acc_stderr": 0.028742040903948496, + "acc_norm": 0.3333333333333333, + "acc_norm_stderr": 0.028742040903948496 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.773109243697479, + "acc_stderr": 0.027205371538279472, + "acc_norm": 0.773109243697479, + "acc_norm_stderr": 0.027205371538279472 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.48344370860927155, + "acc_stderr": 0.0408024418562897, + "acc_norm": 0.48344370860927155, + "acc_norm_stderr": 0.0408024418562897 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.8899082568807339, + "acc_stderr": 0.0134199390186812, + "acc_norm": 0.8899082568807339, + "acc_norm_stderr": 0.0134199390186812 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.5972222222222222, + "acc_stderr": 0.03344887382997865, + "acc_norm": 0.5972222222222222, + "acc_norm_stderr": 0.03344887382997865 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.9019607843137255, + "acc_stderr": 0.0208711184555521, + "acc_norm": 0.9019607843137255, + "acc_norm_stderr": 0.0208711184555521 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.8649789029535865, + "acc_stderr": 0.022245776632003694, + "acc_norm": 0.8649789029535865, + "acc_norm_stderr": 0.022245776632003694 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.8071748878923767, + "acc_stderr": 0.026478240960489365, + "acc_norm": 0.8071748878923767, + "acc_norm_stderr": 0.026478240960489365 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.8778625954198473, + "acc_stderr": 0.028718776889342337, + "acc_norm": 0.8778625954198473, + "acc_norm_stderr": 0.028718776889342337 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.8842975206611571, + "acc_stderr": 0.029199802455622804, + "acc_norm": 0.8842975206611571, + "acc_norm_stderr": 0.029199802455622804 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.7962962962962963, + "acc_stderr": 0.03893542518824847, + "acc_norm": 0.7962962962962963, + "acc_norm_stderr": 0.03893542518824847 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.7975460122699386, + "acc_stderr": 0.031570650789119005, + "acc_norm": 0.7975460122699386, + "acc_norm_stderr": 0.031570650789119005 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.4642857142857143, + "acc_stderr": 0.04733667890053756, + "acc_norm": 0.4642857142857143, + "acc_norm_stderr": 0.04733667890053756 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.8252427184466019, + "acc_stderr": 0.037601780060266196, + "acc_norm": 0.8252427184466019, + "acc_norm_stderr": 0.037601780060266196 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.9102564102564102, + "acc_stderr": 0.018724301741941635, + "acc_norm": 0.9102564102564102, + "acc_norm_stderr": 0.018724301741941635 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.67, + "acc_stderr": 0.04725815626252607, + "acc_norm": 0.67, + 
"acc_norm_stderr": 0.04725815626252607 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.8620689655172413, + "acc_stderr": 0.012331009307795661, + "acc_norm": 0.8620689655172413, + "acc_norm_stderr": 0.012331009307795661 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.7803468208092486, + "acc_stderr": 0.022289638852617893, + "acc_norm": 0.7803468208092486, + "acc_norm_stderr": 0.022289638852617893 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.4201117318435754, + "acc_stderr": 0.016507671073256402, + "acc_norm": 0.4201117318435754, + "acc_norm_stderr": 0.016507671073256402 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.7483660130718954, + "acc_stderr": 0.0248480182638752, + "acc_norm": 0.7483660130718954, + "acc_norm_stderr": 0.0248480182638752 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.797427652733119, + "acc_stderr": 0.022827317491059686, + "acc_norm": 0.797427652733119, + "acc_norm_stderr": 0.022827317491059686 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.8240740740740741, + "acc_stderr": 0.02118589361522518, + "acc_norm": 0.8240740740740741, + "acc_norm_stderr": 0.02118589361522518 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.5425531914893617, + "acc_stderr": 0.029719281272236834, + "acc_norm": 0.5425531914893617, + "acc_norm_stderr": 0.029719281272236834 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.5365058670143416, + "acc_stderr": 0.012736153390214968, + "acc_norm": 0.5365058670143416, + "acc_norm_stderr": 0.012736153390214968 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.7536764705882353, + "acc_stderr": 0.02617343857052, + "acc_norm": 0.7536764705882353, + "acc_norm_stderr": 0.02617343857052 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.7516339869281046, + "acc_stderr": 0.017479487001364764, + "acc_norm": 0.7516339869281046, + "acc_norm_stderr": 0.017479487001364764 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.7363636363636363, + "acc_stderr": 0.04220224692971987, + "acc_norm": 0.7363636363636363, + "acc_norm_stderr": 0.04220224692971987 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.7959183673469388, + "acc_stderr": 0.025801283475090496, + "acc_norm": 0.7959183673469388, + "acc_norm_stderr": 0.025801283475090496 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.8855721393034826, + "acc_stderr": 0.022509345325101706, + "acc_norm": 0.8855721393034826, + "acc_norm_stderr": 0.022509345325101706 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.91, + "acc_stderr": 0.028762349126466125, + "acc_norm": 0.91, + "acc_norm_stderr": 0.028762349126466125 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.5421686746987951, + "acc_stderr": 0.0387862677100236, + "acc_norm": 0.5421686746987951, + "acc_norm_stderr": 0.0387862677100236 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.8538011695906432, + "acc_stderr": 0.027097290118070806, + "acc_norm": 0.8538011695906432, + "acc_norm_stderr": 0.027097290118070806 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.35862913096695226, + "mc1_stderr": 0.016789289499502022, + "mc2": 0.5178570244121696, + "mc2_stderr": 0.0147299196907601 + }, + "all": { + "acc": 0.6909864450520502, + "acc_stderr": 0.031168584647014733, + "acc_norm": 0.6948081880278438, + "acc_norm_stderr": 0.03114204934083774, + "mc1": 0.35862913096695226, + "mc1_stderr": 0.016789289499502022, + "mc2": 0.5178570244121696, + "mc2_stderr": 
0.0147299196907601 + } + }, + "versions": { + "harness|arc:challenge|25": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "all": 0 + }, + "config_tasks": { + "harness|arc:challenge": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + 
"harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task" + }, + "summary_tasks": { + "harness|arc:challenge|25": { + "hashes": { + "hash_examples": "17b0cae357c0259e", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "3722289b79076c44", + "hash_cont_tokens": "e8abf848493b50f7" + }, + "truncated": 0, + 
"non-truncated": 4687, + "padded": 4687, + "non-padded": 0, + "effective_few_shots": 25.0, + "num_truncated_few_shots": 0 + }, + "harness|hellaswag|10": { + "hashes": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "ececd684171f1ef2", + "hash_cont_tokens": "9fe0a5c42e1532db" + }, + "truncated": 0, + "non-truncated": 40168, + "padded": 40113, + "non-padded": 55, + "effective_few_shots": 10.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hashes": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "c54ff61ad0273dd7", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-anatomy|5": { + "hashes": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "be31a1e22aef5f90", + "hash_cont_tokens": "f11971a765cb609f" + }, + "truncated": 0, + "non-truncated": 540, + "padded": 540, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-astronomy|5": { + "hashes": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "277a7b1fad566940", + "hash_cont_tokens": "440a970fadecdc7b" + }, + "truncated": 0, + "non-truncated": 608, + "padded": 608, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-business_ethics|5": { + "hashes": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "ba552605bc116de5", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hashes": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "428c7563d0b98ab9", + "hash_cont_tokens": "7ecd60c25b9bfe5b" + }, + "truncated": 0, + "non-truncated": 1060, + "padded": 1060, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_biology|5": { + "hashes": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "da036601573942e2", + "hash_cont_tokens": "875cde3af7a0ee14" + }, + "truncated": 0, + "non-truncated": 576, + "padded": 576, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_chemistry|5": { + "hashes": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "94e0196d6aded13d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_computer_science|5": { + "hashes": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "6e4d0f4a8d36690b", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_mathematics|5": { + 
"hashes": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "614054d17109a25d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_medicine|5": { + "hashes": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "081bb2b524defd1c", + "hash_cont_tokens": "702fb6d82ff0d6ac" + }, + "truncated": 0, + "non-truncated": 692, + "padded": 692, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_physics|5": { + "hashes": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "5421d9a1af86cbd4", + "hash_cont_tokens": "f7b8097afc16a47c" + }, + "truncated": 0, + "non-truncated": 408, + "padded": 408, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-computer_security|5": { + "hashes": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "5e6b70ecb333cf18", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hashes": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "c2ef11a87264ceed", + "hash_cont_tokens": "aa0e8bc655f2f641" + }, + "truncated": 0, + "non-truncated": 940, + "padded": 940, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-econometrics|5": { + "hashes": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "ecaccd912a4c3978", + "hash_cont_tokens": "b1cc6e7e9fcd3827" + }, + "truncated": 0, + "non-truncated": 456, + "padded": 456, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hashes": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "1590c84291399be8", + "hash_cont_tokens": "2425a3f084a591ef" + }, + "truncated": 0, + "non-truncated": 580, + "padded": 580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hashes": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "3269597f715b0da1", + "hash_cont_tokens": "bd87bf0c060fd925" + }, + "truncated": 0, + "non-truncated": 1512, + "padded": 1512, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-formal_logic|5": { + "hashes": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "a2800d20f3ab8d7c", + "hash_cont_tokens": "eb8932890e0605db" + }, + "truncated": 0, + "non-truncated": 504, + "padded": 504, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-global_facts|5": { + "hashes": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "94ed44b3772505ad", + 
"hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_biology|5": { + "hashes": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "24423acb928db768", + "hash_cont_tokens": "1ddcb86d28cde266" + }, + "truncated": 0, + "non-truncated": 1240, + "padded": 1240, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hashes": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "831ff35c474e5cef", + "hash_cont_tokens": "176c8dcff38c5f8f" + }, + "truncated": 0, + "non-truncated": 812, + "padded": 812, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hashes": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "a20a96b44dcc5b30", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hashes": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "5002f4ac8b1562ca", + "hash_cont_tokens": "674fc454bdc5ac93" + }, + "truncated": 0, + "non-truncated": 660, + "padded": 656, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_geography|5": { + "hashes": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "7c5547c7da5bc793", + "hash_cont_tokens": "03a5012b916274ea" + }, + "truncated": 0, + "non-truncated": 792, + "padded": 792, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hashes": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "f62991cb6a496b05", + "hash_cont_tokens": "873d2aab226ba1d8" + }, + "truncated": 0, + "non-truncated": 772, + "padded": 772, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hashes": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "4cef2aff6e3d59ed", + "hash_cont_tokens": "c583432ad27fcfe0" + }, + "truncated": 0, + "non-truncated": 1560, + "padded": 1560, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hashes": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "6e2577ea4082ed2b", + "hash_cont_tokens": "d7907b61bcb8c123" + }, + "truncated": 0, + "non-truncated": 1080, + "padded": 1080, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hashes": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "c5fc9aeb1079c8e4", + "hash_cont_tokens": "f47f041de50333b9" + }, + "truncated": 0, + 
"non-truncated": 952, + "padded": 952, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_physics|5": { + "hashes": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "555fc385cffa84ca", + "hash_cont_tokens": "0d56317b3e5eedb5" + }, + "truncated": 0, + "non-truncated": 604, + "padded": 604, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hashes": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "febd23cbf9973b7f", + "hash_cont_tokens": "09ba1243e7390c0f" + }, + "truncated": 0, + "non-truncated": 2180, + "padded": 2180, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hashes": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "400e55b56ee6fbd7", + "hash_cont_tokens": "9cc29889c3d3f77d" + }, + "truncated": 0, + "non-truncated": 864, + "padded": 864, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hashes": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "c639cce12a46ebad", + "hash_cont_tokens": "cdd0b3dc06d933e5" + }, + "truncated": 0, + "non-truncated": 816, + "padded": 816, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hashes": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "b9762065cce6f3a6", + "hash_cont_tokens": "e02816433ff28daf" + }, + "truncated": 0, + "non-truncated": 948, + "padded": 948, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_aging|5": { + "hashes": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "541a75f071dcf579", + "hash_cont_tokens": "142a4a8a1138a214" + }, + "truncated": 0, + "non-truncated": 892, + "padded": 892, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_sexuality|5": { + "hashes": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "04269e5c5a257dd9", + "hash_cont_tokens": "bc54813e809b796d" + }, + "truncated": 0, + "non-truncated": 524, + "padded": 524, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-international_law|5": { + "hashes": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "d93ba9d9d38e4397", + "hash_cont_tokens": "8ea8c5ff76a15bca" + }, + "truncated": 0, + "non-truncated": 484, + "padded": 484, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-jurisprudence|5": { + "hashes": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "9eeaccd2698b4f5a", + "hash_cont_tokens": "e3a8cd951b6e3469" + }, + "truncated": 0, + "non-truncated": 432, + "padded": 432, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + 
"harness|hendrycksTest-logical_fallacies|5": { + "hashes": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "b4f08f544f2b7576", + "hash_cont_tokens": "3e9e0bdc248fd88a" + }, + "truncated": 0, + "non-truncated": 652, + "padded": 648, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-machine_learning|5": { + "hashes": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "900c2a51f1174b9f", + "hash_cont_tokens": "55b12fb138c6a064" + }, + "truncated": 0, + "non-truncated": 448, + "padded": 448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-management|5": { + "hashes": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "6b36efb4689c6eca", + "hash_cont_tokens": "a01d6d39a83c4597" + }, + "truncated": 0, + "non-truncated": 412, + "padded": 412, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-marketing|5": { + "hashes": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "2aaac78a0cfed47a", + "hash_cont_tokens": "6aeaed4d823c98aa" + }, + "truncated": 0, + "non-truncated": 936, + "padded": 936, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-medical_genetics|5": { + "hashes": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "886ca823b41c094a", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-miscellaneous|5": { + "hashes": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "72fd71de7675e7d0", + "hash_cont_tokens": "9b0ab02a64603081" + }, + "truncated": 0, + "non-truncated": 3132, + "padded": 3132, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_disputes|5": { + "hashes": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "f3ca0dd8e7a1eb09", + "hash_cont_tokens": "3b8bbe9108e55ce9" + }, + "truncated": 0, + "non-truncated": 1384, + "padded": 1354, + "non-padded": 30, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hashes": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "3e793631e951f23c", + "hash_cont_tokens": "3e9bfc0362e97330" + }, + "truncated": 0, + "non-truncated": 3580, + "padded": 3580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-nutrition|5": { + "hashes": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "59753c2144ea93af", + "hash_cont_tokens": "23b2dc6ee2da4cfc" + }, + "truncated": 0, + "non-truncated": 1224, + "padded": 1224, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-philosophy|5": { + "hashes": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": 
"bd8d3dbed15a8c34", + "hash_cont_tokens": "9f6ff69d23a48783" + }, + "truncated": 0, + "non-truncated": 1244, + "padded": 1244, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-prehistory|5": { + "hashes": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "3573cd87facbb7c5", + "hash_cont_tokens": "d6458d743d875837" + }, + "truncated": 0, + "non-truncated": 1296, + "padded": 1296, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_accounting|5": { + "hashes": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "17e721bc1a7cbb47", + "hash_cont_tokens": "922a195f53a35662" + }, + "truncated": 0, + "non-truncated": 1128, + "padded": 1128, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_law|5": { + "hashes": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "c9f7583fff66d361", + "hash_cont_tokens": "2e590029ef41fbcd" + }, + "truncated": 0, + "non-truncated": 6136, + "padded": 6136, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_medicine|5": { + "hashes": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "40a933f829116f8d", + "hash_cont_tokens": "7cfee54dbddd5a98" + }, + "truncated": 0, + "non-truncated": 1088, + "padded": 1088, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_psychology|5": { + "hashes": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "0dfb73a8eb3f692c", + "hash_cont_tokens": "a86677b2a45c20e1" + }, + "truncated": 0, + "non-truncated": 2448, + "padded": 2448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-public_relations|5": { + "hashes": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "1710c6ba4c9f3cbd", + "hash_cont_tokens": "0d756ccaae031757" + }, + "truncated": 0, + "non-truncated": 440, + "padded": 440, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-security_studies|5": { + "hashes": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "32a03f1f22a6e103", + "hash_cont_tokens": "b2229bc2cfbf594b" + }, + "truncated": 0, + "non-truncated": 980, + "padded": 980, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-sociology|5": { + "hashes": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "828999f7624cbe7e", + "hash_cont_tokens": "c3a3bdfd177eed5b" + }, + "truncated": 0, + "non-truncated": 804, + "padded": 804, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hashes": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "42054621e718dbee", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 
0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-virology|5": { + "hashes": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "6c4f0aa4dc859c04", + "hash_cont_tokens": "af8b3658088cb37f" + }, + "truncated": 0, + "non-truncated": 664, + "padded": 664, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-world_religions|5": { + "hashes": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "6c75d44e092ff24f", + "hash_cont_tokens": "060118bef6de4e0a" + }, + "truncated": 0, + "non-truncated": 684, + "padded": 684, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|truthfulqa:mc|0": { + "hashes": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "2738d7ed7075faa7", + "hash_cont_tokens": "f5da56a132aab151" + }, + "truncated": 0, + "non-truncated": 9996, + "padded": 9996, + "non-padded": 0, + "effective_few_shots": 0.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "d84d18e9a963753d", + "hash_full_prompts": "12b540783521a8e6", + "hash_input_tokens": "5c73a7dce6ccf737", + "hash_cont_tokens": "71d56183130fecbd" + }, + "total_evaluation_time_secondes": "44377.33392930031", + "truncated": 0, + "non-truncated": 111019, + "padded": 110926, + "non-padded": 93, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/llm-agents/tora-70b-v1.0/results_2023-10-28T23-04-49.210564.json b/eval-results/llm-agents/tora-70b-v1.0/results_2023-10-28T23-04-49.210564.json new file mode 100644 index 0000000000000000000000000000000000000000..f404d25ad53c010b603ae35214dcaf73e713027f --- /dev/null +++ b/eval-results/llm-agents/tora-70b-v1.0/results_2023-10-28T23-04-49.210564.json @@ -0,0 +1,107 @@ +{ + "config_general": { + "model_name": "llm-agents/tora-70b-v1.0", + "model_sha": "e95fd7daf017e7c414ec07ebef4ddf013c16f9a4", + "model_size": "128.64 GB", + "model_dtype": "torch.float16", + "lighteval_sha": "0f318ecf002208468154899217b3ba7c6ae09374", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|drop|3": { + "em": 0.3409186241610738, + "em_stderr": 0.004854388549221249, + "f1": 0.40523280201342454, + "f1_stderr": 0.004724035643302926 + }, + "harness|gsm8k|5": { + "acc": 0.23805913570887036, + "acc_stderr": 0.011731278748420892 + }, + "harness|winogrande|5": { + "acc": 0.819258089976322, + "acc_stderr": 0.010814911009613978 + }, + "all": { + "em": 0.3409186241610738, + "em_stderr": 0.004854388549221249, + "f1": 0.40523280201342454, + "f1_stderr": 0.004724035643302926, + "acc": 0.5286586128425962, + "acc_stderr": 0.011273094879017436 + } + }, + "versions": { + "harness|drop|3": 1, + "harness|gsm8k|5": 0, + "harness|winogrande|5": 0, + "all": 0 + }, + "config_tasks": { + "harness|drop": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|drop|3": { + "hashes": { + "hash_examples": "1d27416e8324e9a3", + "hash_full_prompts": "a5513ff9a741b385", + "hash_input_tokens": "42076f0efbb50aa6", + "hash_cont_tokens": "d8625799a66963de" + }, + "truncated": 3, + "non-truncated": 9533, + "padded": 0, + "non-padded": 9536, + "effective_few_shots": 3.0, + 
"num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "bda342e47b5099b2", + "hash_cont_tokens": "03ae1631527b7366" + }, + "truncated": 0, + "non-truncated": 1319, + "padded": 0, + "non-padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "c0bedf98cb040854", + "hash_cont_tokens": "f08975ad6f2d5864" + }, + "truncated": 0, + "non-truncated": 2534, + "padded": 2432, + "non-padded": 102, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "9b4d8993161e637d", + "hash_full_prompts": "08215e527b7e60a5", + "hash_input_tokens": "a12f3e3c934bd78b", + "hash_cont_tokens": "d7e4ca85db148ea2" + }, + "total_evaluation_time_secondes": "32840.96991086006", + "truncated": 3, + "non-truncated": 13386, + "padded": 2432, + "non-padded": 10957, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/llm-agents/tora-7b-v1.0/results_2023-10-10T14-34-11.685092.json b/eval-results/llm-agents/tora-7b-v1.0/results_2023-10-10T14-34-11.685092.json new file mode 100644 index 0000000000000000000000000000000000000000..90c172e30c0932a4a272a8648a4abdce9d564679 --- /dev/null +++ b/eval-results/llm-agents/tora-7b-v1.0/results_2023-10-10T14-34-11.685092.json @@ -0,0 +1,1367 @@ +{ + "config_general": { + "model_name": "llm-agents/tora-7b-v1.0", + "model_sha": "717edbee98945192b1a396fc9c337c5b32d6c79c", + "model_size": "12.61 GB", + "model_dtype": "torch.float16", + "lighteval_sha": "0f318ecf002208468154899217b3ba7c6ae09374", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|arc:challenge|25": { + "acc": 0.49402730375426623, + "acc_stderr": 0.014610348300255793, + "acc_norm": 0.5247440273037542, + "acc_norm_stderr": 0.014593487694937735 + }, + "harness|hellaswag|10": { + "acc": 0.6056562437761402, + "acc_stderr": 0.004877104939356237, + "acc_norm": 0.7867954590718981, + "acc_norm_stderr": 0.0040873390451062995 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.3, + "acc_stderr": 0.046056618647183814, + "acc_norm": 0.3, + "acc_norm_stderr": 0.046056618647183814 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.4740740740740741, + "acc_stderr": 0.04313531696750575, + "acc_norm": 0.4740740740740741, + "acc_norm_stderr": 0.04313531696750575 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.4407894736842105, + "acc_stderr": 0.04040311062490436, + "acc_norm": 0.4407894736842105, + "acc_norm_stderr": 0.04040311062490436 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.48, + "acc_stderr": 0.050211673156867795, + "acc_norm": 0.48, + "acc_norm_stderr": 0.050211673156867795 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.49433962264150944, + "acc_stderr": 0.030770900763851302, + "acc_norm": 0.49433962264150944, + "acc_norm_stderr": 0.030770900763851302 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.4652777777777778, + "acc_stderr": 0.04171115858181617, + "acc_norm": 0.4652777777777778, + "acc_norm_stderr": 0.04171115858181617 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.38, + "acc_stderr": 0.04878317312145633, + "acc_norm": 0.38, + "acc_norm_stderr": 
0.04878317312145633 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.36, + "acc_stderr": 0.04824181513244218, + "acc_norm": 0.36, + "acc_norm_stderr": 0.04824181513244218 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.31, + "acc_stderr": 0.04648231987117316, + "acc_norm": 0.31, + "acc_norm_stderr": 0.04648231987117316 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.4277456647398844, + "acc_stderr": 0.03772446857518026, + "acc_norm": 0.4277456647398844, + "acc_norm_stderr": 0.03772446857518026 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.20588235294117646, + "acc_stderr": 0.04023382273617746, + "acc_norm": 0.20588235294117646, + "acc_norm_stderr": 0.04023382273617746 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.58, + "acc_stderr": 0.049604496374885836, + "acc_norm": 0.58, + "acc_norm_stderr": 0.049604496374885836 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.37872340425531914, + "acc_stderr": 0.03170995606040655, + "acc_norm": 0.37872340425531914, + "acc_norm_stderr": 0.03170995606040655 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.2807017543859649, + "acc_stderr": 0.042270544512322004, + "acc_norm": 0.2807017543859649, + "acc_norm_stderr": 0.042270544512322004 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.5103448275862069, + "acc_stderr": 0.04165774775728763, + "acc_norm": 0.5103448275862069, + "acc_norm_stderr": 0.04165774775728763 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.30952380952380953, + "acc_stderr": 0.023809523809523857, + "acc_norm": 0.30952380952380953, + "acc_norm_stderr": 0.023809523809523857 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.23015873015873015, + "acc_stderr": 0.037649508797906045, + "acc_norm": 0.23015873015873015, + "acc_norm_stderr": 0.037649508797906045 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.3, + "acc_stderr": 0.046056618647183814, + "acc_norm": 0.3, + "acc_norm_stderr": 0.046056618647183814 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.49032258064516127, + "acc_stderr": 0.028438677998909558, + "acc_norm": 0.49032258064516127, + "acc_norm_stderr": 0.028438677998909558 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.3399014778325123, + "acc_stderr": 0.033327690684107895, + "acc_norm": 0.3399014778325123, + "acc_norm_stderr": 0.033327690684107895 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.39, + "acc_stderr": 0.04902071300001974, + "acc_norm": 0.39, + "acc_norm_stderr": 0.04902071300001974 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.6, + "acc_stderr": 0.03825460278380026, + "acc_norm": 0.6, + "acc_norm_stderr": 0.03825460278380026 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.5555555555555556, + "acc_stderr": 0.035402943770953675, + "acc_norm": 0.5555555555555556, + "acc_norm_stderr": 0.035402943770953675 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.6839378238341969, + "acc_stderr": 0.033553973696861736, + "acc_norm": 0.6839378238341969, + "acc_norm_stderr": 0.033553973696861736 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.4205128205128205, + "acc_stderr": 0.025028610276710862, + "acc_norm": 0.4205128205128205, + "acc_norm_stderr": 0.025028610276710862 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.2740740740740741, + "acc_stderr": 
0.027195934804085622, + "acc_norm": 0.2740740740740741, + "acc_norm_stderr": 0.027195934804085622 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.4411764705882353, + "acc_stderr": 0.0322529423239964, + "acc_norm": 0.4411764705882353, + "acc_norm_stderr": 0.0322529423239964 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.271523178807947, + "acc_stderr": 0.036313298039696545, + "acc_norm": 0.271523178807947, + "acc_norm_stderr": 0.036313298039696545 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.6587155963302752, + "acc_stderr": 0.020328612816592446, + "acc_norm": 0.6587155963302752, + "acc_norm_stderr": 0.020328612816592446 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.2777777777777778, + "acc_stderr": 0.030546745264953195, + "acc_norm": 0.2777777777777778, + "acc_norm_stderr": 0.030546745264953195 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.6078431372549019, + "acc_stderr": 0.03426712349247273, + "acc_norm": 0.6078431372549019, + "acc_norm_stderr": 0.03426712349247273 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.6033755274261603, + "acc_stderr": 0.03184399873811225, + "acc_norm": 0.6033755274261603, + "acc_norm_stderr": 0.03184399873811225 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.5426008968609866, + "acc_stderr": 0.033435777055830646, + "acc_norm": 0.5426008968609866, + "acc_norm_stderr": 0.033435777055830646 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.5267175572519084, + "acc_stderr": 0.04379024936553894, + "acc_norm": 0.5267175572519084, + "acc_norm_stderr": 0.04379024936553894 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.6446280991735537, + "acc_stderr": 0.0436923632657398, + "acc_norm": 0.6446280991735537, + "acc_norm_stderr": 0.0436923632657398 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.5648148148148148, + "acc_stderr": 0.04792898170907061, + "acc_norm": 0.5648148148148148, + "acc_norm_stderr": 0.04792898170907061 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.4723926380368098, + "acc_stderr": 0.039223782906109894, + "acc_norm": 0.4723926380368098, + "acc_norm_stderr": 0.039223782906109894 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.38392857142857145, + "acc_stderr": 0.04616143075028547, + "acc_norm": 0.38392857142857145, + "acc_norm_stderr": 0.04616143075028547 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.5728155339805825, + "acc_stderr": 0.04897957737781168, + "acc_norm": 0.5728155339805825, + "acc_norm_stderr": 0.04897957737781168 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.6837606837606838, + "acc_stderr": 0.030463656747340265, + "acc_norm": 0.6837606837606838, + "acc_norm_stderr": 0.030463656747340265 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.49, + "acc_stderr": 0.05024183937956911, + "acc_norm": 0.49, + "acc_norm_stderr": 0.05024183937956911 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.6232439335887612, + "acc_stderr": 0.017328292907303047, + "acc_norm": 0.6232439335887612, + "acc_norm_stderr": 0.017328292907303047 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.48265895953757226, + "acc_stderr": 0.026902900458666647, + "acc_norm": 0.48265895953757226, + "acc_norm_stderr": 0.026902900458666647 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.23910614525139665, + "acc_stderr": 0.014265554192331144, + "acc_norm": 0.23910614525139665, + 
"acc_norm_stderr": 0.014265554192331144 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.4673202614379085, + "acc_stderr": 0.028568699752225868, + "acc_norm": 0.4673202614379085, + "acc_norm_stderr": 0.028568699752225868 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.572347266881029, + "acc_stderr": 0.028099240775809553, + "acc_norm": 0.572347266881029, + "acc_norm_stderr": 0.028099240775809553 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.4783950617283951, + "acc_stderr": 0.027794760105008746, + "acc_norm": 0.4783950617283951, + "acc_norm_stderr": 0.027794760105008746 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.3404255319148936, + "acc_stderr": 0.028267657482650144, + "acc_norm": 0.3404255319148936, + "acc_norm_stderr": 0.028267657482650144 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.35658409387222945, + "acc_stderr": 0.01223364298927389, + "acc_norm": 0.35658409387222945, + "acc_norm_stderr": 0.01223364298927389 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.4742647058823529, + "acc_stderr": 0.030332578094555033, + "acc_norm": 0.4742647058823529, + "acc_norm_stderr": 0.030332578094555033 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.4297385620915033, + "acc_stderr": 0.020027122784928547, + "acc_norm": 0.4297385620915033, + "acc_norm_stderr": 0.020027122784928547 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.5181818181818182, + "acc_stderr": 0.04785964010794916, + "acc_norm": 0.5181818181818182, + "acc_norm_stderr": 0.04785964010794916 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.46122448979591835, + "acc_stderr": 0.03191282052669277, + "acc_norm": 0.46122448979591835, + "acc_norm_stderr": 0.03191282052669277 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.6169154228855721, + "acc_stderr": 0.0343751933733825, + "acc_norm": 0.6169154228855721, + "acc_norm_stderr": 0.0343751933733825 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.61, + "acc_stderr": 0.04902071300001974, + "acc_norm": 0.61, + "acc_norm_stderr": 0.04902071300001974 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.39759036144578314, + "acc_stderr": 0.038099730845402184, + "acc_norm": 0.39759036144578314, + "acc_norm_stderr": 0.038099730845402184 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.672514619883041, + "acc_stderr": 0.035993357714560276, + "acc_norm": 0.672514619883041, + "acc_norm_stderr": 0.035993357714560276 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.26193390452876375, + "mc1_stderr": 0.015392118805015023, + "mc2": 0.3789924465917188, + "mc2_stderr": 0.014709754655502841 + }, + "all": { + "acc": 0.46210391677330115, + "acc_stderr": 0.0351995874362206, + "acc_norm": 0.4656946953977969, + "acc_norm_stderr": 0.035185915800634696, + "mc1": 0.26193390452876375, + "mc1_stderr": 0.015392118805015023, + "mc2": 0.3789924465917188, + "mc2_stderr": 0.014709754655502841 + } + }, + "versions": { + "harness|arc:challenge|25": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + 
"harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "all": 0 + }, + "config_tasks": { + "harness|arc:challenge": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + 
"harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task" + }, + "summary_tasks": { + "harness|arc:challenge|25": { + "hashes": { + "hash_examples": "17b0cae357c0259e", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "3722289b79076c44", + "hash_cont_tokens": "e8abf848493b50f7" + }, + "truncated": 0, + "non-truncated": 4687, + "padded": 4687, + "non-padded": 0, + "effective_few_shots": 25.0, + "num_truncated_few_shots": 0 + }, + "harness|hellaswag|10": { + "hashes": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "ececd684171f1ef2", + "hash_cont_tokens": "9fe0a5c42e1532db" + }, + "truncated": 0, + "non-truncated": 40168, + "padded": 40113, + "non-padded": 55, + "effective_few_shots": 10.0, + "num_truncated_few_shots": 0 + }, + 
"harness|hendrycksTest-abstract_algebra|5": { + "hashes": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "c54ff61ad0273dd7", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-anatomy|5": { + "hashes": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "be31a1e22aef5f90", + "hash_cont_tokens": "f11971a765cb609f" + }, + "truncated": 0, + "non-truncated": 540, + "padded": 540, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-astronomy|5": { + "hashes": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "277a7b1fad566940", + "hash_cont_tokens": "440a970fadecdc7b" + }, + "truncated": 0, + "non-truncated": 608, + "padded": 608, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-business_ethics|5": { + "hashes": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "ba552605bc116de5", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hashes": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "428c7563d0b98ab9", + "hash_cont_tokens": "7ecd60c25b9bfe5b" + }, + "truncated": 0, + "non-truncated": 1060, + "padded": 1060, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_biology|5": { + "hashes": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "da036601573942e2", + "hash_cont_tokens": "875cde3af7a0ee14" + }, + "truncated": 0, + "non-truncated": 576, + "padded": 576, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_chemistry|5": { + "hashes": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "94e0196d6aded13d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_computer_science|5": { + "hashes": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "6e4d0f4a8d36690b", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_mathematics|5": { + "hashes": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "614054d17109a25d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_medicine|5": { + "hashes": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + 
"hash_input_tokens": "081bb2b524defd1c", + "hash_cont_tokens": "702fb6d82ff0d6ac" + }, + "truncated": 0, + "non-truncated": 692, + "padded": 692, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_physics|5": { + "hashes": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "5421d9a1af86cbd4", + "hash_cont_tokens": "f7b8097afc16a47c" + }, + "truncated": 0, + "non-truncated": 408, + "padded": 408, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-computer_security|5": { + "hashes": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "5e6b70ecb333cf18", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hashes": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "c2ef11a87264ceed", + "hash_cont_tokens": "aa0e8bc655f2f641" + }, + "truncated": 0, + "non-truncated": 940, + "padded": 940, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-econometrics|5": { + "hashes": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "ecaccd912a4c3978", + "hash_cont_tokens": "b1cc6e7e9fcd3827" + }, + "truncated": 0, + "non-truncated": 456, + "padded": 456, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hashes": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "1590c84291399be8", + "hash_cont_tokens": "2425a3f084a591ef" + }, + "truncated": 0, + "non-truncated": 580, + "padded": 580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hashes": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "3269597f715b0da1", + "hash_cont_tokens": "bd87bf0c060fd925" + }, + "truncated": 0, + "non-truncated": 1512, + "padded": 1512, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-formal_logic|5": { + "hashes": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "a2800d20f3ab8d7c", + "hash_cont_tokens": "eb8932890e0605db" + }, + "truncated": 0, + "non-truncated": 504, + "padded": 504, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-global_facts|5": { + "hashes": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "94ed44b3772505ad", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_biology|5": { + "hashes": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "24423acb928db768", + "hash_cont_tokens": "1ddcb86d28cde266" + }, + "truncated": 0, + "non-truncated": 1240, + "padded": 1240, + 
"non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hashes": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "831ff35c474e5cef", + "hash_cont_tokens": "176c8dcff38c5f8f" + }, + "truncated": 0, + "non-truncated": 812, + "padded": 812, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hashes": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "a20a96b44dcc5b30", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hashes": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "5002f4ac8b1562ca", + "hash_cont_tokens": "674fc454bdc5ac93" + }, + "truncated": 0, + "non-truncated": 660, + "padded": 656, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_geography|5": { + "hashes": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "7c5547c7da5bc793", + "hash_cont_tokens": "03a5012b916274ea" + }, + "truncated": 0, + "non-truncated": 792, + "padded": 792, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hashes": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "f62991cb6a496b05", + "hash_cont_tokens": "873d2aab226ba1d8" + }, + "truncated": 0, + "non-truncated": 772, + "padded": 772, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hashes": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "4cef2aff6e3d59ed", + "hash_cont_tokens": "c583432ad27fcfe0" + }, + "truncated": 0, + "non-truncated": 1560, + "padded": 1560, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hashes": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "6e2577ea4082ed2b", + "hash_cont_tokens": "d7907b61bcb8c123" + }, + "truncated": 0, + "non-truncated": 1080, + "padded": 1080, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hashes": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "c5fc9aeb1079c8e4", + "hash_cont_tokens": "f47f041de50333b9" + }, + "truncated": 0, + "non-truncated": 952, + "padded": 952, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_physics|5": { + "hashes": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "555fc385cffa84ca", + "hash_cont_tokens": "0d56317b3e5eedb5" + }, + "truncated": 0, + "non-truncated": 604, + "padded": 604, + "non-padded": 0, + "effective_few_shots": 5.0, + 
"num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hashes": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "febd23cbf9973b7f", + "hash_cont_tokens": "09ba1243e7390c0f" + }, + "truncated": 0, + "non-truncated": 2180, + "padded": 2180, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hashes": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "400e55b56ee6fbd7", + "hash_cont_tokens": "9cc29889c3d3f77d" + }, + "truncated": 0, + "non-truncated": 864, + "padded": 864, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hashes": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "c639cce12a46ebad", + "hash_cont_tokens": "cdd0b3dc06d933e5" + }, + "truncated": 0, + "non-truncated": 816, + "padded": 816, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hashes": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "b9762065cce6f3a6", + "hash_cont_tokens": "e02816433ff28daf" + }, + "truncated": 0, + "non-truncated": 948, + "padded": 948, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_aging|5": { + "hashes": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "541a75f071dcf579", + "hash_cont_tokens": "142a4a8a1138a214" + }, + "truncated": 0, + "non-truncated": 892, + "padded": 892, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_sexuality|5": { + "hashes": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "04269e5c5a257dd9", + "hash_cont_tokens": "bc54813e809b796d" + }, + "truncated": 0, + "non-truncated": 524, + "padded": 524, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-international_law|5": { + "hashes": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "d93ba9d9d38e4397", + "hash_cont_tokens": "8ea8c5ff76a15bca" + }, + "truncated": 0, + "non-truncated": 484, + "padded": 484, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-jurisprudence|5": { + "hashes": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "9eeaccd2698b4f5a", + "hash_cont_tokens": "e3a8cd951b6e3469" + }, + "truncated": 0, + "non-truncated": 432, + "padded": 432, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hashes": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "b4f08f544f2b7576", + "hash_cont_tokens": "3e9e0bdc248fd88a" + }, + "truncated": 0, + "non-truncated": 652, + "padded": 648, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-machine_learning|5": { + "hashes": { + "hash_examples": "88f22a636029ae47", + 
"hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "900c2a51f1174b9f", + "hash_cont_tokens": "55b12fb138c6a064" + }, + "truncated": 0, + "non-truncated": 448, + "padded": 448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-management|5": { + "hashes": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "6b36efb4689c6eca", + "hash_cont_tokens": "a01d6d39a83c4597" + }, + "truncated": 0, + "non-truncated": 412, + "padded": 412, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-marketing|5": { + "hashes": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "2aaac78a0cfed47a", + "hash_cont_tokens": "6aeaed4d823c98aa" + }, + "truncated": 0, + "non-truncated": 936, + "padded": 936, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-medical_genetics|5": { + "hashes": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "886ca823b41c094a", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-miscellaneous|5": { + "hashes": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "72fd71de7675e7d0", + "hash_cont_tokens": "9b0ab02a64603081" + }, + "truncated": 0, + "non-truncated": 3132, + "padded": 3132, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_disputes|5": { + "hashes": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "f3ca0dd8e7a1eb09", + "hash_cont_tokens": "3b8bbe9108e55ce9" + }, + "truncated": 0, + "non-truncated": 1384, + "padded": 1354, + "non-padded": 30, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hashes": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "3e793631e951f23c", + "hash_cont_tokens": "3e9bfc0362e97330" + }, + "truncated": 0, + "non-truncated": 3580, + "padded": 3580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-nutrition|5": { + "hashes": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "59753c2144ea93af", + "hash_cont_tokens": "23b2dc6ee2da4cfc" + }, + "truncated": 0, + "non-truncated": 1224, + "padded": 1224, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-philosophy|5": { + "hashes": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "bd8d3dbed15a8c34", + "hash_cont_tokens": "9f6ff69d23a48783" + }, + "truncated": 0, + "non-truncated": 1244, + "padded": 1244, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-prehistory|5": { + "hashes": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "3573cd87facbb7c5", + "hash_cont_tokens": "d6458d743d875837" + }, + "truncated": 0, + "non-truncated": 1296, + "padded": 
1296, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_accounting|5": { + "hashes": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "17e721bc1a7cbb47", + "hash_cont_tokens": "922a195f53a35662" + }, + "truncated": 0, + "non-truncated": 1128, + "padded": 1128, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_law|5": { + "hashes": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "c9f7583fff66d361", + "hash_cont_tokens": "2e590029ef41fbcd" + }, + "truncated": 0, + "non-truncated": 6136, + "padded": 6136, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_medicine|5": { + "hashes": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "40a933f829116f8d", + "hash_cont_tokens": "7cfee54dbddd5a98" + }, + "truncated": 0, + "non-truncated": 1088, + "padded": 1088, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_psychology|5": { + "hashes": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "0dfb73a8eb3f692c", + "hash_cont_tokens": "a86677b2a45c20e1" + }, + "truncated": 0, + "non-truncated": 2448, + "padded": 2448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-public_relations|5": { + "hashes": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "1710c6ba4c9f3cbd", + "hash_cont_tokens": "0d756ccaae031757" + }, + "truncated": 0, + "non-truncated": 440, + "padded": 440, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-security_studies|5": { + "hashes": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "32a03f1f22a6e103", + "hash_cont_tokens": "b2229bc2cfbf594b" + }, + "truncated": 0, + "non-truncated": 980, + "padded": 980, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-sociology|5": { + "hashes": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "828999f7624cbe7e", + "hash_cont_tokens": "c3a3bdfd177eed5b" + }, + "truncated": 0, + "non-truncated": 804, + "padded": 804, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hashes": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "42054621e718dbee", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-virology|5": { + "hashes": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "6c4f0aa4dc859c04", + "hash_cont_tokens": "af8b3658088cb37f" + }, + "truncated": 0, + "non-truncated": 664, + "padded": 664, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-world_religions|5": { + 
"hashes": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "6c75d44e092ff24f", + "hash_cont_tokens": "060118bef6de4e0a" + }, + "truncated": 0, + "non-truncated": 684, + "padded": 684, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|truthfulqa:mc|0": { + "hashes": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "2738d7ed7075faa7", + "hash_cont_tokens": "f5da56a132aab151" + }, + "truncated": 0, + "non-truncated": 9996, + "padded": 9996, + "non-padded": 0, + "effective_few_shots": 0.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "d84d18e9a963753d", + "hash_full_prompts": "12b540783521a8e6", + "hash_input_tokens": "5c73a7dce6ccf737", + "hash_cont_tokens": "71d56183130fecbd" + }, + "total_evaluation_time_secondes": "4303.967225551605", + "truncated": 0, + "non-truncated": 111019, + "padded": 110926, + "non-padded": 93, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/llm-agents/tora-7b-v1.0/results_2023-10-27T12-52-31.057587.json b/eval-results/llm-agents/tora-7b-v1.0/results_2023-10-27T12-52-31.057587.json new file mode 100644 index 0000000000000000000000000000000000000000..0a2f6950fe303824d2612c93b75e042666a28439 --- /dev/null +++ b/eval-results/llm-agents/tora-7b-v1.0/results_2023-10-27T12-52-31.057587.json @@ -0,0 +1,107 @@ +{ + "config_general": { + "model_name": "llm-agents/tora-7b-v1.0", + "model_sha": "717edbee98945192b1a396fc9c337c5b32d6c79c", + "model_size": "12.61 GB", + "model_dtype": "torch.float16", + "lighteval_sha": "0f318ecf002208468154899217b3ba7c6ae09374", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|drop|3": { + "em": 0.03166946308724832, + "em_stderr": 0.001793377907859907, + "f1": 0.0924370805369127, + "f1_stderr": 0.002203336567209257 + }, + "harness|gsm8k|5": { + "acc": 0.025018953752843062, + "acc_stderr": 0.0043020450465642845 + }, + "harness|winogrande|5": { + "acc": 0.7355958958168903, + "acc_stderr": 0.012394724896983799 + }, + "all": { + "em": 0.03166946308724832, + "em_stderr": 0.001793377907859907, + "f1": 0.0924370805369127, + "f1_stderr": 0.002203336567209257, + "acc": 0.3803074247848667, + "acc_stderr": 0.008348384971774042 + } + }, + "versions": { + "harness|drop|3": 1, + "harness|gsm8k|5": 0, + "harness|winogrande|5": 0, + "all": 0 + }, + "config_tasks": { + "harness|drop": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|drop|3": { + "hashes": { + "hash_examples": "1d27416e8324e9a3", + "hash_full_prompts": "a5513ff9a741b385", + "hash_input_tokens": "42076f0efbb50aa6", + "hash_cont_tokens": "81e5aa6a2d6acb8a" + }, + "truncated": 3, + "non-truncated": 9533, + "padded": 0, + "non-padded": 9536, + "effective_few_shots": 3.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "bda342e47b5099b2", + "hash_cont_tokens": "a2cc913feeb21277" + }, + "truncated": 0, + "non-truncated": 1319, + "padded": 0, + "non-padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": 
"c8655cbd12de8409", + "hash_input_tokens": "c0bedf98cb040854", + "hash_cont_tokens": "f08975ad6f2d5864" + }, + "truncated": 0, + "non-truncated": 2534, + "padded": 2432, + "non-padded": 102, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "9b4d8993161e637d", + "hash_full_prompts": "08215e527b7e60a5", + "hash_input_tokens": "a12f3e3c934bd78b", + "hash_cont_tokens": "3686cca8020eddc2" + }, + "total_evaluation_time_secondes": "9350.089348316193", + "truncated": 3, + "non-truncated": 13386, + "padded": 2432, + "non-padded": 10957, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/llm-agents/tora-code-13b-v1.0/results_2023-10-10T14-56-19.008780.json b/eval-results/llm-agents/tora-code-13b-v1.0/results_2023-10-10T14-56-19.008780.json new file mode 100644 index 0000000000000000000000000000000000000000..018f695bf6c605efc2eefaafb7874014334276c3 --- /dev/null +++ b/eval-results/llm-agents/tora-code-13b-v1.0/results_2023-10-10T14-56-19.008780.json @@ -0,0 +1,1367 @@ +{ + "config_general": { + "model_name": "llm-agents/tora-code-13b-v1.0", + "model_sha": "4bf5b528d95a507b435c24a8986afe80d5951782", + "model_size": "24.56 GB", + "model_dtype": "torch.float16", + "lighteval_sha": "0f318ecf002208468154899217b3ba7c6ae09374", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|arc:challenge|25": { + "acc": 0.4206484641638225, + "acc_stderr": 0.014426211252508406, + "acc_norm": 0.4445392491467577, + "acc_norm_stderr": 0.014521226405627077 + }, + "harness|hellaswag|10": { + "acc": 0.522903804023103, + "acc_stderr": 0.004984543540932333, + "acc_norm": 0.6928898625771759, + "acc_norm_stderr": 0.004603527017557854 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.23, + "acc_stderr": 0.04229525846816507, + "acc_norm": 0.23, + "acc_norm_stderr": 0.04229525846816507 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.34814814814814815, + "acc_stderr": 0.041153246103369526, + "acc_norm": 0.34814814814814815, + "acc_norm_stderr": 0.041153246103369526 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.40131578947368424, + "acc_stderr": 0.039889037033362836, + "acc_norm": 0.40131578947368424, + "acc_norm_stderr": 0.039889037033362836 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.4, + "acc_stderr": 0.049236596391733084, + "acc_norm": 0.4, + "acc_norm_stderr": 0.049236596391733084 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.3660377358490566, + "acc_stderr": 0.029647813539365245, + "acc_norm": 0.3660377358490566, + "acc_norm_stderr": 0.029647813539365245 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.3472222222222222, + "acc_stderr": 0.039812405437178615, + "acc_norm": 0.3472222222222222, + "acc_norm_stderr": 0.039812405437178615 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.22, + "acc_stderr": 0.041633319989322695, + "acc_norm": 0.22, + "acc_norm_stderr": 0.041633319989322695 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.33, + "acc_stderr": 0.047258156262526045, + "acc_norm": 0.33, + "acc_norm_stderr": 0.047258156262526045 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.31, + "acc_stderr": 0.04648231987117316, + "acc_norm": 0.31, + "acc_norm_stderr": 0.04648231987117316 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.3236994219653179, + "acc_stderr": 
0.0356760379963917, + "acc_norm": 0.3236994219653179, + "acc_norm_stderr": 0.0356760379963917 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.17647058823529413, + "acc_stderr": 0.03793281185307809, + "acc_norm": 0.17647058823529413, + "acc_norm_stderr": 0.03793281185307809 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.59, + "acc_stderr": 0.049431107042371025, + "acc_norm": 0.59, + "acc_norm_stderr": 0.049431107042371025 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.3446808510638298, + "acc_stderr": 0.03106898596312215, + "acc_norm": 0.3446808510638298, + "acc_norm_stderr": 0.03106898596312215 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.2719298245614035, + "acc_stderr": 0.04185774424022056, + "acc_norm": 0.2719298245614035, + "acc_norm_stderr": 0.04185774424022056 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.33793103448275863, + "acc_stderr": 0.0394170763206489, + "acc_norm": 0.33793103448275863, + "acc_norm_stderr": 0.0394170763206489 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.291005291005291, + "acc_stderr": 0.02339382650048487, + "acc_norm": 0.291005291005291, + "acc_norm_stderr": 0.02339382650048487 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.31746031746031744, + "acc_stderr": 0.04163453031302859, + "acc_norm": 0.31746031746031744, + "acc_norm_stderr": 0.04163453031302859 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.21, + "acc_stderr": 0.040936018074033256, + "acc_norm": 0.21, + "acc_norm_stderr": 0.040936018074033256 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.36451612903225805, + "acc_stderr": 0.027379871229943252, + "acc_norm": 0.36451612903225805, + "acc_norm_stderr": 0.027379871229943252 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.24630541871921183, + "acc_stderr": 0.030315099285617715, + "acc_norm": 0.24630541871921183, + "acc_norm_stderr": 0.030315099285617715 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.5, + "acc_stderr": 0.050251890762960605, + "acc_norm": 0.5, + "acc_norm_stderr": 0.050251890762960605 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.4909090909090909, + "acc_stderr": 0.0390369864774844, + "acc_norm": 0.4909090909090909, + "acc_norm_stderr": 0.0390369864774844 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.3838383838383838, + "acc_stderr": 0.03464881675016338, + "acc_norm": 0.3838383838383838, + "acc_norm_stderr": 0.03464881675016338 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.39896373056994816, + "acc_stderr": 0.035339990940656964, + "acc_norm": 0.39896373056994816, + "acc_norm_stderr": 0.035339990940656964 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.2743589743589744, + "acc_stderr": 0.022622765767493207, + "acc_norm": 0.2743589743589744, + "acc_norm_stderr": 0.022622765767493207 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.2740740740740741, + "acc_stderr": 0.027195934804085622, + "acc_norm": 0.2740740740740741, + "acc_norm_stderr": 0.027195934804085622 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.2815126050420168, + "acc_stderr": 0.029213549414372184, + "acc_norm": 0.2815126050420168, + "acc_norm_stderr": 0.029213549414372184 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.2781456953642384, + "acc_stderr": 0.03658603262763743, + "acc_norm": 
0.2781456953642384, + "acc_norm_stderr": 0.03658603262763743 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.41284403669724773, + "acc_stderr": 0.021109128133413906, + "acc_norm": 0.41284403669724773, + "acc_norm_stderr": 0.021109128133413906 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.25, + "acc_stderr": 0.029531221160930918, + "acc_norm": 0.25, + "acc_norm_stderr": 0.029531221160930918 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.46568627450980393, + "acc_stderr": 0.03501038327635897, + "acc_norm": 0.46568627450980393, + "acc_norm_stderr": 0.03501038327635897 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.5147679324894515, + "acc_stderr": 0.032533028078777386, + "acc_norm": 0.5147679324894515, + "acc_norm_stderr": 0.032533028078777386 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.42152466367713004, + "acc_stderr": 0.033141902221106564, + "acc_norm": 0.42152466367713004, + "acc_norm_stderr": 0.033141902221106564 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.37404580152671757, + "acc_stderr": 0.04243869242230524, + "acc_norm": 0.37404580152671757, + "acc_norm_stderr": 0.04243869242230524 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.4628099173553719, + "acc_stderr": 0.04551711196104218, + "acc_norm": 0.4628099173553719, + "acc_norm_stderr": 0.04551711196104218 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.5092592592592593, + "acc_stderr": 0.04832853553437056, + "acc_norm": 0.5092592592592593, + "acc_norm_stderr": 0.04832853553437056 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.36809815950920244, + "acc_stderr": 0.03789213935838396, + "acc_norm": 0.36809815950920244, + "acc_norm_stderr": 0.03789213935838396 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.26785714285714285, + "acc_stderr": 0.04203277291467762, + "acc_norm": 0.26785714285714285, + "acc_norm_stderr": 0.04203277291467762 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.4174757281553398, + "acc_stderr": 0.04882840548212238, + "acc_norm": 0.4174757281553398, + "acc_norm_stderr": 0.04882840548212238 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.6111111111111112, + "acc_stderr": 0.031937057262002924, + "acc_norm": 0.6111111111111112, + "acc_norm_stderr": 0.031937057262002924 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.4, + "acc_stderr": 0.049236596391733084, + "acc_norm": 0.4, + "acc_norm_stderr": 0.049236596391733084 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.47126436781609193, + "acc_stderr": 0.01785041079438017, + "acc_norm": 0.47126436781609193, + "acc_norm_stderr": 0.01785041079438017 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.40173410404624277, + "acc_stderr": 0.026394104177643627, + "acc_norm": 0.40173410404624277, + "acc_norm_stderr": 0.026394104177643627 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.2424581005586592, + "acc_stderr": 0.014333522059217892, + "acc_norm": 0.2424581005586592, + "acc_norm_stderr": 0.014333522059217892 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.4084967320261438, + "acc_stderr": 0.02814640599309636, + "acc_norm": 0.4084967320261438, + "acc_norm_stderr": 0.02814640599309636 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.43086816720257237, + "acc_stderr": 0.028125340983972714, + "acc_norm": 0.43086816720257237, + "acc_norm_stderr": 0.028125340983972714 + }, + 
"harness|hendrycksTest-prehistory|5": { + "acc": 0.3734567901234568, + "acc_stderr": 0.02691500301138015, + "acc_norm": 0.3734567901234568, + "acc_norm_stderr": 0.02691500301138015 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.29432624113475175, + "acc_stderr": 0.027187127011503793, + "acc_norm": 0.29432624113475175, + "acc_norm_stderr": 0.027187127011503793 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.29595827900912647, + "acc_stderr": 0.01165851852527704, + "acc_norm": 0.29595827900912647, + "acc_norm_stderr": 0.01165851852527704 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.22794117647058823, + "acc_stderr": 0.025483081468029804, + "acc_norm": 0.22794117647058823, + "acc_norm_stderr": 0.025483081468029804 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.35294117647058826, + "acc_stderr": 0.019333142020797077, + "acc_norm": 0.35294117647058826, + "acc_norm_stderr": 0.019333142020797077 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.42727272727272725, + "acc_stderr": 0.04738198703545483, + "acc_norm": 0.42727272727272725, + "acc_norm_stderr": 0.04738198703545483 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.40816326530612246, + "acc_stderr": 0.03146465712827424, + "acc_norm": 0.40816326530612246, + "acc_norm_stderr": 0.03146465712827424 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.43283582089552236, + "acc_stderr": 0.03503490923673281, + "acc_norm": 0.43283582089552236, + "acc_norm_stderr": 0.03503490923673281 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.55, + "acc_stderr": 0.05, + "acc_norm": 0.55, + "acc_norm_stderr": 0.05 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.3674698795180723, + "acc_stderr": 0.03753267402120574, + "acc_norm": 0.3674698795180723, + "acc_norm_stderr": 0.03753267402120574 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.4327485380116959, + "acc_stderr": 0.037999786443706066, + "acc_norm": 0.4327485380116959, + "acc_norm_stderr": 0.037999786443706066 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.2178702570379437, + "mc1_stderr": 0.014450846714123899, + "mc2": 0.3498430573399945, + "mc2_stderr": 0.01469641873096921 + }, + "all": { + "acc": 0.37026259300970477, + "acc_stderr": 0.03445992590442932, + "acc_norm": 0.3735486412052473, + "acc_norm_stderr": 0.03445507842357752, + "mc1": 0.2178702570379437, + "mc1_stderr": 0.014450846714123899, + "mc2": 0.3498430573399945, + "mc2_stderr": 0.01469641873096921 + } + }, + "versions": { + "harness|arc:challenge|25": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + 
"harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "all": 0 + }, + "config_tasks": { + "harness|arc:challenge": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + 
"harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task" + }, + "summary_tasks": { + "harness|arc:challenge|25": { + "hashes": { + "hash_examples": "17b0cae357c0259e", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "3722289b79076c44", + "hash_cont_tokens": "e8abf848493b50f7" + }, + "truncated": 0, + "non-truncated": 4687, + "padded": 4687, + "non-padded": 0, + "effective_few_shots": 25.0, + "num_truncated_few_shots": 0 + }, + "harness|hellaswag|10": { + "hashes": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "ececd684171f1ef2", + "hash_cont_tokens": "9fe0a5c42e1532db" + }, + "truncated": 0, + "non-truncated": 40168, + "padded": 40113, + "non-padded": 55, + "effective_few_shots": 10.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hashes": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "c54ff61ad0273dd7", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-anatomy|5": { + "hashes": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + 
"hash_input_tokens": "be31a1e22aef5f90", + "hash_cont_tokens": "f11971a765cb609f" + }, + "truncated": 0, + "non-truncated": 540, + "padded": 540, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-astronomy|5": { + "hashes": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "277a7b1fad566940", + "hash_cont_tokens": "440a970fadecdc7b" + }, + "truncated": 0, + "non-truncated": 608, + "padded": 608, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-business_ethics|5": { + "hashes": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "ba552605bc116de5", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hashes": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "428c7563d0b98ab9", + "hash_cont_tokens": "7ecd60c25b9bfe5b" + }, + "truncated": 0, + "non-truncated": 1060, + "padded": 1060, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_biology|5": { + "hashes": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "da036601573942e2", + "hash_cont_tokens": "875cde3af7a0ee14" + }, + "truncated": 0, + "non-truncated": 576, + "padded": 576, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_chemistry|5": { + "hashes": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "94e0196d6aded13d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_computer_science|5": { + "hashes": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "6e4d0f4a8d36690b", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_mathematics|5": { + "hashes": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "614054d17109a25d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_medicine|5": { + "hashes": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "081bb2b524defd1c", + "hash_cont_tokens": "702fb6d82ff0d6ac" + }, + "truncated": 0, + "non-truncated": 692, + "padded": 692, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_physics|5": { + "hashes": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "5421d9a1af86cbd4", + "hash_cont_tokens": "f7b8097afc16a47c" + }, + "truncated": 0, + "non-truncated": 408, + "padded": 408, + 
"non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-computer_security|5": { + "hashes": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "5e6b70ecb333cf18", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hashes": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "c2ef11a87264ceed", + "hash_cont_tokens": "aa0e8bc655f2f641" + }, + "truncated": 0, + "non-truncated": 940, + "padded": 940, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-econometrics|5": { + "hashes": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "ecaccd912a4c3978", + "hash_cont_tokens": "b1cc6e7e9fcd3827" + }, + "truncated": 0, + "non-truncated": 456, + "padded": 456, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hashes": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "1590c84291399be8", + "hash_cont_tokens": "2425a3f084a591ef" + }, + "truncated": 0, + "non-truncated": 580, + "padded": 580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hashes": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "3269597f715b0da1", + "hash_cont_tokens": "bd87bf0c060fd925" + }, + "truncated": 0, + "non-truncated": 1512, + "padded": 1512, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-formal_logic|5": { + "hashes": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "a2800d20f3ab8d7c", + "hash_cont_tokens": "eb8932890e0605db" + }, + "truncated": 0, + "non-truncated": 504, + "padded": 504, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-global_facts|5": { + "hashes": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "94ed44b3772505ad", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_biology|5": { + "hashes": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "24423acb928db768", + "hash_cont_tokens": "1ddcb86d28cde266" + }, + "truncated": 0, + "non-truncated": 1240, + "padded": 1240, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hashes": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "831ff35c474e5cef", + "hash_cont_tokens": "176c8dcff38c5f8f" + }, + "truncated": 0, + "non-truncated": 812, + "padded": 812, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + 
"hashes": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "a20a96b44dcc5b30", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hashes": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "5002f4ac8b1562ca", + "hash_cont_tokens": "674fc454bdc5ac93" + }, + "truncated": 0, + "non-truncated": 660, + "padded": 656, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_geography|5": { + "hashes": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "7c5547c7da5bc793", + "hash_cont_tokens": "03a5012b916274ea" + }, + "truncated": 0, + "non-truncated": 792, + "padded": 792, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hashes": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "f62991cb6a496b05", + "hash_cont_tokens": "873d2aab226ba1d8" + }, + "truncated": 0, + "non-truncated": 772, + "padded": 772, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hashes": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "4cef2aff6e3d59ed", + "hash_cont_tokens": "c583432ad27fcfe0" + }, + "truncated": 0, + "non-truncated": 1560, + "padded": 1560, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hashes": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "6e2577ea4082ed2b", + "hash_cont_tokens": "d7907b61bcb8c123" + }, + "truncated": 0, + "non-truncated": 1080, + "padded": 1080, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hashes": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "c5fc9aeb1079c8e4", + "hash_cont_tokens": "f47f041de50333b9" + }, + "truncated": 0, + "non-truncated": 952, + "padded": 952, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_physics|5": { + "hashes": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "555fc385cffa84ca", + "hash_cont_tokens": "0d56317b3e5eedb5" + }, + "truncated": 0, + "non-truncated": 604, + "padded": 604, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hashes": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "febd23cbf9973b7f", + "hash_cont_tokens": "09ba1243e7390c0f" + }, + "truncated": 0, + "non-truncated": 2180, + "padded": 2180, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hashes": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": 
"4c5c8be5aafac432", + "hash_input_tokens": "400e55b56ee6fbd7", + "hash_cont_tokens": "9cc29889c3d3f77d" + }, + "truncated": 0, + "non-truncated": 864, + "padded": 864, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hashes": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "c639cce12a46ebad", + "hash_cont_tokens": "cdd0b3dc06d933e5" + }, + "truncated": 0, + "non-truncated": 816, + "padded": 816, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hashes": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "b9762065cce6f3a6", + "hash_cont_tokens": "e02816433ff28daf" + }, + "truncated": 0, + "non-truncated": 948, + "padded": 948, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_aging|5": { + "hashes": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "541a75f071dcf579", + "hash_cont_tokens": "142a4a8a1138a214" + }, + "truncated": 0, + "non-truncated": 892, + "padded": 892, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_sexuality|5": { + "hashes": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "04269e5c5a257dd9", + "hash_cont_tokens": "bc54813e809b796d" + }, + "truncated": 0, + "non-truncated": 524, + "padded": 524, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-international_law|5": { + "hashes": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "d93ba9d9d38e4397", + "hash_cont_tokens": "8ea8c5ff76a15bca" + }, + "truncated": 0, + "non-truncated": 484, + "padded": 484, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-jurisprudence|5": { + "hashes": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "9eeaccd2698b4f5a", + "hash_cont_tokens": "e3a8cd951b6e3469" + }, + "truncated": 0, + "non-truncated": 432, + "padded": 432, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hashes": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "b4f08f544f2b7576", + "hash_cont_tokens": "3e9e0bdc248fd88a" + }, + "truncated": 0, + "non-truncated": 652, + "padded": 648, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-machine_learning|5": { + "hashes": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "900c2a51f1174b9f", + "hash_cont_tokens": "55b12fb138c6a064" + }, + "truncated": 0, + "non-truncated": 448, + "padded": 448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-management|5": { + "hashes": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "6b36efb4689c6eca", + "hash_cont_tokens": "a01d6d39a83c4597" + }, + "truncated": 0, + "non-truncated": 412, + 
"padded": 412, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-marketing|5": { + "hashes": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "2aaac78a0cfed47a", + "hash_cont_tokens": "6aeaed4d823c98aa" + }, + "truncated": 0, + "non-truncated": 936, + "padded": 936, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-medical_genetics|5": { + "hashes": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "886ca823b41c094a", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-miscellaneous|5": { + "hashes": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "72fd71de7675e7d0", + "hash_cont_tokens": "9b0ab02a64603081" + }, + "truncated": 0, + "non-truncated": 3132, + "padded": 3132, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_disputes|5": { + "hashes": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "f3ca0dd8e7a1eb09", + "hash_cont_tokens": "3b8bbe9108e55ce9" + }, + "truncated": 0, + "non-truncated": 1384, + "padded": 1354, + "non-padded": 30, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hashes": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "3e793631e951f23c", + "hash_cont_tokens": "3e9bfc0362e97330" + }, + "truncated": 0, + "non-truncated": 3580, + "padded": 3580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-nutrition|5": { + "hashes": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "59753c2144ea93af", + "hash_cont_tokens": "23b2dc6ee2da4cfc" + }, + "truncated": 0, + "non-truncated": 1224, + "padded": 1224, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-philosophy|5": { + "hashes": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "bd8d3dbed15a8c34", + "hash_cont_tokens": "9f6ff69d23a48783" + }, + "truncated": 0, + "non-truncated": 1244, + "padded": 1244, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-prehistory|5": { + "hashes": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "3573cd87facbb7c5", + "hash_cont_tokens": "d6458d743d875837" + }, + "truncated": 0, + "non-truncated": 1296, + "padded": 1296, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_accounting|5": { + "hashes": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "17e721bc1a7cbb47", + "hash_cont_tokens": "922a195f53a35662" + }, + "truncated": 0, + "non-truncated": 1128, + "padded": 1128, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_law|5": { + "hashes": { + 
"hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "c9f7583fff66d361", + "hash_cont_tokens": "2e590029ef41fbcd" + }, + "truncated": 0, + "non-truncated": 6136, + "padded": 6136, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_medicine|5": { + "hashes": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "40a933f829116f8d", + "hash_cont_tokens": "7cfee54dbddd5a98" + }, + "truncated": 0, + "non-truncated": 1088, + "padded": 1088, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_psychology|5": { + "hashes": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "0dfb73a8eb3f692c", + "hash_cont_tokens": "a86677b2a45c20e1" + }, + "truncated": 0, + "non-truncated": 2448, + "padded": 2448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-public_relations|5": { + "hashes": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "1710c6ba4c9f3cbd", + "hash_cont_tokens": "0d756ccaae031757" + }, + "truncated": 0, + "non-truncated": 440, + "padded": 440, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-security_studies|5": { + "hashes": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "32a03f1f22a6e103", + "hash_cont_tokens": "b2229bc2cfbf594b" + }, + "truncated": 0, + "non-truncated": 980, + "padded": 980, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-sociology|5": { + "hashes": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "828999f7624cbe7e", + "hash_cont_tokens": "c3a3bdfd177eed5b" + }, + "truncated": 0, + "non-truncated": 804, + "padded": 804, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hashes": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "42054621e718dbee", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-virology|5": { + "hashes": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "6c4f0aa4dc859c04", + "hash_cont_tokens": "af8b3658088cb37f" + }, + "truncated": 0, + "non-truncated": 664, + "padded": 664, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-world_religions|5": { + "hashes": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "6c75d44e092ff24f", + "hash_cont_tokens": "060118bef6de4e0a" + }, + "truncated": 0, + "non-truncated": 684, + "padded": 684, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|truthfulqa:mc|0": { + "hashes": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "2738d7ed7075faa7", + "hash_cont_tokens": "f5da56a132aab151" + }, + 
"truncated": 0, + "non-truncated": 9996, + "padded": 9996, + "non-padded": 0, + "effective_few_shots": 0.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "d84d18e9a963753d", + "hash_full_prompts": "12b540783521a8e6", + "hash_input_tokens": "5c73a7dce6ccf737", + "hash_cont_tokens": "71d56183130fecbd" + }, + "total_evaluation_time_secondes": "6445.540361881256", + "truncated": 0, + "non-truncated": 111019, + "padded": 110926, + "non-padded": 93, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/llm-agents/tora-code-13b-v1.0/results_2023-10-23T13-29-53.824155.json b/eval-results/llm-agents/tora-code-13b-v1.0/results_2023-10-23T13-29-53.824155.json new file mode 100644 index 0000000000000000000000000000000000000000..ecedc33f25568df30fcb3a79c284f59221727332 --- /dev/null +++ b/eval-results/llm-agents/tora-code-13b-v1.0/results_2023-10-23T13-29-53.824155.json @@ -0,0 +1,107 @@ +{ + "config_general": { + "model_name": "llm-agents/tora-code-13b-v1.0", + "model_sha": "4bf5b528d95a507b435c24a8986afe80d5951782", + "model_size": "24.56 GB", + "model_dtype": "torch.float16", + "lighteval_sha": "0f318ecf002208468154899217b3ba7c6ae09374", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|drop|3": { + "em": 0.0019924496644295304, + "em_stderr": 0.00045666764626670027, + "f1": 0.0450398489932886, + "f1_stderr": 0.0010718150921397497 + }, + "harness|gsm8k|5": { + "acc": 0.08188021228203184, + "acc_stderr": 0.007552338527716949 + }, + "harness|winogrande|5": { + "acc": 0.6258879242304657, + "acc_stderr": 0.013599792958329823 + }, + "all": { + "em": 0.0019924496644295304, + "em_stderr": 0.00045666764626670027, + "f1": 0.0450398489932886, + "f1_stderr": 0.0010718150921397497, + "acc": 0.35388406825624874, + "acc_stderr": 0.010576065743023385 + } + }, + "versions": { + "harness|drop|3": 1, + "harness|gsm8k|5": 0, + "harness|winogrande|5": 0, + "all": 0 + }, + "config_tasks": { + "harness|drop": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|drop|3": { + "hashes": { + "hash_examples": "1d27416e8324e9a3", + "hash_full_prompts": "a5513ff9a741b385", + "hash_input_tokens": "fbe785c62d941897", + "hash_cont_tokens": "65f0964d558fc435" + }, + "truncated": 0, + "non-truncated": 9536, + "padded": 0, + "non-padded": 9536, + "effective_few_shots": 3.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "bda342e47b5099b2", + "hash_cont_tokens": "200fce9bb1916218" + }, + "truncated": 0, + "non-truncated": 1319, + "padded": 0, + "non-padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "c0bedf98cb040854", + "hash_cont_tokens": "f08975ad6f2d5864" + }, + "truncated": 0, + "non-truncated": 2534, + "padded": 2432, + "non-padded": 102, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "9b4d8993161e637d", + "hash_full_prompts": "08215e527b7e60a5", + "hash_input_tokens": "9f72a6bc6743d18f", + "hash_cont_tokens": "b1ea91622d333551" + }, + "total_evaluation_time_secondes": 
"12656.947103500366", + "truncated": 0, + "non-truncated": 13389, + "padded": 2432, + "non-padded": 10957, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/llm-agents/tora-code-34b-v1.0/results_2023-10-10T19-58-46.874384.json b/eval-results/llm-agents/tora-code-34b-v1.0/results_2023-10-10T19-58-46.874384.json new file mode 100644 index 0000000000000000000000000000000000000000..77fb40ccc6804c1118fe2b1991383074b5449544 --- /dev/null +++ b/eval-results/llm-agents/tora-code-34b-v1.0/results_2023-10-10T19-58-46.874384.json @@ -0,0 +1,1367 @@ +{ + "config_general": { + "model_name": "llm-agents/tora-code-34b-v1.0", + "model_sha": "cbb33eea774cc03d4363c424d81e8c9d58332274", + "model_size": "63.23 GB", + "model_dtype": "torch.float16", + "lighteval_sha": "0f318ecf002208468154899217b3ba7c6ae09374", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|arc:challenge|25": { + "acc": 0.4735494880546075, + "acc_stderr": 0.014590931358120174, + "acc_norm": 0.5042662116040956, + "acc_norm_stderr": 0.014610858923956952 + }, + "harness|hellaswag|10": { + "acc": 0.5691097390957977, + "acc_stderr": 0.004941887610849033, + "acc_norm": 0.7554272057359092, + "acc_norm_stderr": 0.004289551633772027 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.3, + "acc_stderr": 0.04605661864718381, + "acc_norm": 0.3, + "acc_norm_stderr": 0.04605661864718381 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.3851851851851852, + "acc_stderr": 0.042039210401562783, + "acc_norm": 0.3851851851851852, + "acc_norm_stderr": 0.042039210401562783 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.46710526315789475, + "acc_stderr": 0.040601270352363966, + "acc_norm": 0.46710526315789475, + "acc_norm_stderr": 0.040601270352363966 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.5, + "acc_stderr": 0.050251890762960605, + "acc_norm": 0.5, + "acc_norm_stderr": 0.050251890762960605 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.49056603773584906, + "acc_stderr": 0.0307673947078081, + "acc_norm": 0.49056603773584906, + "acc_norm_stderr": 0.0307673947078081 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.4583333333333333, + "acc_stderr": 0.04166666666666665, + "acc_norm": 0.4583333333333333, + "acc_norm_stderr": 0.04166666666666665 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.26, + "acc_stderr": 0.04408440022768079, + "acc_norm": 0.26, + "acc_norm_stderr": 0.04408440022768079 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.43, + "acc_stderr": 0.04975698519562428, + "acc_norm": 0.43, + "acc_norm_stderr": 0.04975698519562428 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.31, + "acc_stderr": 0.04648231987117316, + "acc_norm": 0.31, + "acc_norm_stderr": 0.04648231987117316 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.4161849710982659, + "acc_stderr": 0.03758517775404948, + "acc_norm": 0.4161849710982659, + "acc_norm_stderr": 0.03758517775404948 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.2647058823529412, + "acc_stderr": 0.04389869956808778, + "acc_norm": 0.2647058823529412, + "acc_norm_stderr": 0.04389869956808778 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.69, + "acc_stderr": 0.046482319871173156, + "acc_norm": 0.69, + "acc_norm_stderr": 0.046482319871173156 + }, + "harness|hendrycksTest-conceptual_physics|5": { + 
"acc": 0.3872340425531915, + "acc_stderr": 0.03184389265339525, + "acc_norm": 0.3872340425531915, + "acc_norm_stderr": 0.03184389265339525 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.35964912280701755, + "acc_stderr": 0.04514496132873633, + "acc_norm": 0.35964912280701755, + "acc_norm_stderr": 0.04514496132873633 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.47586206896551725, + "acc_stderr": 0.04161808503501528, + "acc_norm": 0.47586206896551725, + "acc_norm_stderr": 0.04161808503501528 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.328042328042328, + "acc_stderr": 0.024180497164376896, + "acc_norm": 0.328042328042328, + "acc_norm_stderr": 0.024180497164376896 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.3492063492063492, + "acc_stderr": 0.04263906892795133, + "acc_norm": 0.3492063492063492, + "acc_norm_stderr": 0.04263906892795133 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.26, + "acc_stderr": 0.0440844002276808, + "acc_norm": 0.26, + "acc_norm_stderr": 0.0440844002276808 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.5064516129032258, + "acc_stderr": 0.02844163823354051, + "acc_norm": 0.5064516129032258, + "acc_norm_stderr": 0.02844163823354051 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.33497536945812806, + "acc_stderr": 0.033208527423483104, + "acc_norm": 0.33497536945812806, + "acc_norm_stderr": 0.033208527423483104 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.61, + "acc_stderr": 0.04902071300001975, + "acc_norm": 0.61, + "acc_norm_stderr": 0.04902071300001975 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.5757575757575758, + "acc_stderr": 0.03859268142070264, + "acc_norm": 0.5757575757575758, + "acc_norm_stderr": 0.03859268142070264 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.5707070707070707, + "acc_stderr": 0.035265527246011986, + "acc_norm": 0.5707070707070707, + "acc_norm_stderr": 0.035265527246011986 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.6113989637305699, + "acc_stderr": 0.03517739796373132, + "acc_norm": 0.6113989637305699, + "acc_norm_stderr": 0.03517739796373132 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.3717948717948718, + "acc_stderr": 0.024503472557110932, + "acc_norm": 0.3717948717948718, + "acc_norm_stderr": 0.024503472557110932 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.3333333333333333, + "acc_stderr": 0.0287420409039485, + "acc_norm": 0.3333333333333333, + "acc_norm_stderr": 0.0287420409039485 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.42016806722689076, + "acc_stderr": 0.03206183783236152, + "acc_norm": 0.42016806722689076, + "acc_norm_stderr": 0.03206183783236152 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.33112582781456956, + "acc_stderr": 0.038425817186598696, + "acc_norm": 0.33112582781456956, + "acc_norm_stderr": 0.038425817186598696 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.6403669724770642, + "acc_stderr": 0.020575234660123776, + "acc_norm": 0.6403669724770642, + "acc_norm_stderr": 0.020575234660123776 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.3148148148148148, + "acc_stderr": 0.03167468706828978, + "acc_norm": 0.3148148148148148, + "acc_norm_stderr": 0.03167468706828978 + }, + "harness|hendrycksTest-high_school_us_history|5": { 
+ "acc": 0.6127450980392157, + "acc_stderr": 0.034189312338333444, + "acc_norm": 0.6127450980392157, + "acc_norm_stderr": 0.034189312338333444 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.6286919831223629, + "acc_stderr": 0.03145068600744859, + "acc_norm": 0.6286919831223629, + "acc_norm_stderr": 0.03145068600744859 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.452914798206278, + "acc_stderr": 0.03340867501923324, + "acc_norm": 0.452914798206278, + "acc_norm_stderr": 0.03340867501923324 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.4961832061068702, + "acc_stderr": 0.043851623256015534, + "acc_norm": 0.4961832061068702, + "acc_norm_stderr": 0.043851623256015534 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.6694214876033058, + "acc_stderr": 0.04294340845212093, + "acc_norm": 0.6694214876033058, + "acc_norm_stderr": 0.04294340845212093 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.5555555555555556, + "acc_stderr": 0.04803752235190192, + "acc_norm": 0.5555555555555556, + "acc_norm_stderr": 0.04803752235190192 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.5828220858895705, + "acc_stderr": 0.038741028598180814, + "acc_norm": 0.5828220858895705, + "acc_norm_stderr": 0.038741028598180814 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.3482142857142857, + "acc_stderr": 0.04521829902833585, + "acc_norm": 0.3482142857142857, + "acc_norm_stderr": 0.04521829902833585 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.6990291262135923, + "acc_stderr": 0.04541609446503948, + "acc_norm": 0.6990291262135923, + "acc_norm_stderr": 0.04541609446503948 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.7222222222222222, + "acc_stderr": 0.02934311479809446, + "acc_norm": 0.7222222222222222, + "acc_norm_stderr": 0.02934311479809446 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.57, + "acc_stderr": 0.049756985195624284, + "acc_norm": 0.57, + "acc_norm_stderr": 0.049756985195624284 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.5951468710089399, + "acc_stderr": 0.01755324646772026, + "acc_norm": 0.5951468710089399, + "acc_norm_stderr": 0.01755324646772026 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.5115606936416185, + "acc_stderr": 0.02691189868637792, + "acc_norm": 0.5115606936416185, + "acc_norm_stderr": 0.02691189868637792 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.2424581005586592, + "acc_stderr": 0.014333522059217889, + "acc_norm": 0.2424581005586592, + "acc_norm_stderr": 0.014333522059217889 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.48366013071895425, + "acc_stderr": 0.028614624752805407, + "acc_norm": 0.48366013071895425, + "acc_norm_stderr": 0.028614624752805407 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.5144694533762058, + "acc_stderr": 0.02838619808417768, + "acc_norm": 0.5144694533762058, + "acc_norm_stderr": 0.02838619808417768 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.49074074074074076, + "acc_stderr": 0.027815973433878014, + "acc_norm": 0.49074074074074076, + "acc_norm_stderr": 0.027815973433878014 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.3546099290780142, + "acc_stderr": 0.02853865002887864, + "acc_norm": 0.3546099290780142, + "acc_norm_stderr": 0.02853865002887864 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.34615384615384615, + "acc_stderr": 0.012150699768228565, + "acc_norm": 0.34615384615384615, 
+ "acc_norm_stderr": 0.012150699768228565 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.27205882352941174, + "acc_stderr": 0.02703304115168146, + "acc_norm": 0.27205882352941174, + "acc_norm_stderr": 0.02703304115168146 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.4166666666666667, + "acc_stderr": 0.01994491413687358, + "acc_norm": 0.4166666666666667, + "acc_norm_stderr": 0.01994491413687358 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.5181818181818182, + "acc_stderr": 0.04785964010794916, + "acc_norm": 0.5181818181818182, + "acc_norm_stderr": 0.04785964010794916 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.5959183673469388, + "acc_stderr": 0.031414708025865885, + "acc_norm": 0.5959183673469388, + "acc_norm_stderr": 0.031414708025865885 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.582089552238806, + "acc_stderr": 0.034875586404620636, + "acc_norm": 0.582089552238806, + "acc_norm_stderr": 0.034875586404620636 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.62, + "acc_stderr": 0.04878317312145633, + "acc_norm": 0.62, + "acc_norm_stderr": 0.04878317312145633 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.40963855421686746, + "acc_stderr": 0.03828401115079021, + "acc_norm": 0.40963855421686746, + "acc_norm_stderr": 0.03828401115079021 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.6198830409356725, + "acc_stderr": 0.037229657413855394, + "acc_norm": 0.6198830409356725, + "acc_norm_stderr": 0.037229657413855394 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.26560587515299877, + "mc1_stderr": 0.015461027627253595, + "mc2": 0.3966399813178778, + "mc2_stderr": 0.015001622827420584 + }, + "all": { + "acc": 0.46960448692665785, + "acc_stderr": 0.03519478895140827, + "acc_norm": 0.4732830325230917, + "acc_norm_stderr": 0.03518407016477708, + "mc1": 0.26560587515299877, + "mc1_stderr": 0.015461027627253595, + "mc2": 0.3966399813178778, + "mc2_stderr": 0.015001622827420584 + } + }, + "versions": { + "harness|arc:challenge|25": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + 
"harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "all": 0 + }, + "config_tasks": { + "harness|arc:challenge": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + 
"harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task" + }, + "summary_tasks": { + "harness|arc:challenge|25": { + "hashes": { + "hash_examples": "17b0cae357c0259e", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "3722289b79076c44", + "hash_cont_tokens": "e8abf848493b50f7" + }, + "truncated": 0, + "non-truncated": 4687, + "padded": 4687, + "non-padded": 0, + "effective_few_shots": 25.0, + "num_truncated_few_shots": 0 + }, + "harness|hellaswag|10": { + "hashes": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "ececd684171f1ef2", + "hash_cont_tokens": "9fe0a5c42e1532db" + }, + "truncated": 0, + "non-truncated": 40168, + "padded": 40113, + "non-padded": 55, + "effective_few_shots": 10.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hashes": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "c54ff61ad0273dd7", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-anatomy|5": { + "hashes": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "be31a1e22aef5f90", + "hash_cont_tokens": "f11971a765cb609f" + }, + "truncated": 0, + "non-truncated": 540, + "padded": 540, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-astronomy|5": { + "hashes": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "277a7b1fad566940", + "hash_cont_tokens": "440a970fadecdc7b" + }, + "truncated": 0, + "non-truncated": 608, + 
"padded": 608, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-business_ethics|5": { + "hashes": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "ba552605bc116de5", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hashes": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "428c7563d0b98ab9", + "hash_cont_tokens": "7ecd60c25b9bfe5b" + }, + "truncated": 0, + "non-truncated": 1060, + "padded": 1060, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_biology|5": { + "hashes": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "da036601573942e2", + "hash_cont_tokens": "875cde3af7a0ee14" + }, + "truncated": 0, + "non-truncated": 576, + "padded": 576, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_chemistry|5": { + "hashes": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "94e0196d6aded13d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_computer_science|5": { + "hashes": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "6e4d0f4a8d36690b", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_mathematics|5": { + "hashes": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "614054d17109a25d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_medicine|5": { + "hashes": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "081bb2b524defd1c", + "hash_cont_tokens": "702fb6d82ff0d6ac" + }, + "truncated": 0, + "non-truncated": 692, + "padded": 692, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_physics|5": { + "hashes": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "5421d9a1af86cbd4", + "hash_cont_tokens": "f7b8097afc16a47c" + }, + "truncated": 0, + "non-truncated": 408, + "padded": 408, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-computer_security|5": { + "hashes": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "5e6b70ecb333cf18", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-conceptual_physics|5": 
{ + "hashes": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "c2ef11a87264ceed", + "hash_cont_tokens": "aa0e8bc655f2f641" + }, + "truncated": 0, + "non-truncated": 940, + "padded": 940, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-econometrics|5": { + "hashes": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "ecaccd912a4c3978", + "hash_cont_tokens": "b1cc6e7e9fcd3827" + }, + "truncated": 0, + "non-truncated": 456, + "padded": 456, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hashes": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "1590c84291399be8", + "hash_cont_tokens": "2425a3f084a591ef" + }, + "truncated": 0, + "non-truncated": 580, + "padded": 580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hashes": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "3269597f715b0da1", + "hash_cont_tokens": "bd87bf0c060fd925" + }, + "truncated": 0, + "non-truncated": 1512, + "padded": 1512, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-formal_logic|5": { + "hashes": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "a2800d20f3ab8d7c", + "hash_cont_tokens": "eb8932890e0605db" + }, + "truncated": 0, + "non-truncated": 504, + "padded": 504, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-global_facts|5": { + "hashes": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "94ed44b3772505ad", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_biology|5": { + "hashes": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "24423acb928db768", + "hash_cont_tokens": "1ddcb86d28cde266" + }, + "truncated": 0, + "non-truncated": 1240, + "padded": 1240, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hashes": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "831ff35c474e5cef", + "hash_cont_tokens": "176c8dcff38c5f8f" + }, + "truncated": 0, + "non-truncated": 812, + "padded": 812, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hashes": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "a20a96b44dcc5b30", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hashes": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": 
"5002f4ac8b1562ca", + "hash_cont_tokens": "674fc454bdc5ac93" + }, + "truncated": 0, + "non-truncated": 660, + "padded": 656, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_geography|5": { + "hashes": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "7c5547c7da5bc793", + "hash_cont_tokens": "03a5012b916274ea" + }, + "truncated": 0, + "non-truncated": 792, + "padded": 792, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hashes": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "f62991cb6a496b05", + "hash_cont_tokens": "873d2aab226ba1d8" + }, + "truncated": 0, + "non-truncated": 772, + "padded": 772, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hashes": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "4cef2aff6e3d59ed", + "hash_cont_tokens": "c583432ad27fcfe0" + }, + "truncated": 0, + "non-truncated": 1560, + "padded": 1560, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hashes": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "6e2577ea4082ed2b", + "hash_cont_tokens": "d7907b61bcb8c123" + }, + "truncated": 0, + "non-truncated": 1080, + "padded": 1080, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hashes": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "c5fc9aeb1079c8e4", + "hash_cont_tokens": "f47f041de50333b9" + }, + "truncated": 0, + "non-truncated": 952, + "padded": 952, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_physics|5": { + "hashes": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "555fc385cffa84ca", + "hash_cont_tokens": "0d56317b3e5eedb5" + }, + "truncated": 0, + "non-truncated": 604, + "padded": 604, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hashes": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "febd23cbf9973b7f", + "hash_cont_tokens": "09ba1243e7390c0f" + }, + "truncated": 0, + "non-truncated": 2180, + "padded": 2180, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hashes": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "400e55b56ee6fbd7", + "hash_cont_tokens": "9cc29889c3d3f77d" + }, + "truncated": 0, + "non-truncated": 864, + "padded": 864, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hashes": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "c639cce12a46ebad", + "hash_cont_tokens": "cdd0b3dc06d933e5" + }, + "truncated": 0, 
+ "non-truncated": 816, + "padded": 816, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hashes": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "b9762065cce6f3a6", + "hash_cont_tokens": "e02816433ff28daf" + }, + "truncated": 0, + "non-truncated": 948, + "padded": 948, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_aging|5": { + "hashes": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "541a75f071dcf579", + "hash_cont_tokens": "142a4a8a1138a214" + }, + "truncated": 0, + "non-truncated": 892, + "padded": 892, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_sexuality|5": { + "hashes": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "04269e5c5a257dd9", + "hash_cont_tokens": "bc54813e809b796d" + }, + "truncated": 0, + "non-truncated": 524, + "padded": 524, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-international_law|5": { + "hashes": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "d93ba9d9d38e4397", + "hash_cont_tokens": "8ea8c5ff76a15bca" + }, + "truncated": 0, + "non-truncated": 484, + "padded": 484, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-jurisprudence|5": { + "hashes": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "9eeaccd2698b4f5a", + "hash_cont_tokens": "e3a8cd951b6e3469" + }, + "truncated": 0, + "non-truncated": 432, + "padded": 432, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hashes": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "b4f08f544f2b7576", + "hash_cont_tokens": "3e9e0bdc248fd88a" + }, + "truncated": 0, + "non-truncated": 652, + "padded": 648, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-machine_learning|5": { + "hashes": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "900c2a51f1174b9f", + "hash_cont_tokens": "55b12fb138c6a064" + }, + "truncated": 0, + "non-truncated": 448, + "padded": 448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-management|5": { + "hashes": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "6b36efb4689c6eca", + "hash_cont_tokens": "a01d6d39a83c4597" + }, + "truncated": 0, + "non-truncated": 412, + "padded": 412, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-marketing|5": { + "hashes": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "2aaac78a0cfed47a", + "hash_cont_tokens": "6aeaed4d823c98aa" + }, + "truncated": 0, + "non-truncated": 936, + "padded": 936, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-medical_genetics|5": 
{ + "hashes": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "886ca823b41c094a", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-miscellaneous|5": { + "hashes": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "72fd71de7675e7d0", + "hash_cont_tokens": "9b0ab02a64603081" + }, + "truncated": 0, + "non-truncated": 3132, + "padded": 3132, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_disputes|5": { + "hashes": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "f3ca0dd8e7a1eb09", + "hash_cont_tokens": "3b8bbe9108e55ce9" + }, + "truncated": 0, + "non-truncated": 1384, + "padded": 1354, + "non-padded": 30, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hashes": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "3e793631e951f23c", + "hash_cont_tokens": "3e9bfc0362e97330" + }, + "truncated": 0, + "non-truncated": 3580, + "padded": 3580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-nutrition|5": { + "hashes": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "59753c2144ea93af", + "hash_cont_tokens": "23b2dc6ee2da4cfc" + }, + "truncated": 0, + "non-truncated": 1224, + "padded": 1224, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-philosophy|5": { + "hashes": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "bd8d3dbed15a8c34", + "hash_cont_tokens": "9f6ff69d23a48783" + }, + "truncated": 0, + "non-truncated": 1244, + "padded": 1244, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-prehistory|5": { + "hashes": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "3573cd87facbb7c5", + "hash_cont_tokens": "d6458d743d875837" + }, + "truncated": 0, + "non-truncated": 1296, + "padded": 1296, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_accounting|5": { + "hashes": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "17e721bc1a7cbb47", + "hash_cont_tokens": "922a195f53a35662" + }, + "truncated": 0, + "non-truncated": 1128, + "padded": 1128, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_law|5": { + "hashes": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "c9f7583fff66d361", + "hash_cont_tokens": "2e590029ef41fbcd" + }, + "truncated": 0, + "non-truncated": 6136, + "padded": 6136, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_medicine|5": { + "hashes": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "40a933f829116f8d", + 
"hash_cont_tokens": "7cfee54dbddd5a98" + }, + "truncated": 0, + "non-truncated": 1088, + "padded": 1088, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_psychology|5": { + "hashes": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "0dfb73a8eb3f692c", + "hash_cont_tokens": "a86677b2a45c20e1" + }, + "truncated": 0, + "non-truncated": 2448, + "padded": 2448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-public_relations|5": { + "hashes": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "1710c6ba4c9f3cbd", + "hash_cont_tokens": "0d756ccaae031757" + }, + "truncated": 0, + "non-truncated": 440, + "padded": 440, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-security_studies|5": { + "hashes": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "32a03f1f22a6e103", + "hash_cont_tokens": "b2229bc2cfbf594b" + }, + "truncated": 0, + "non-truncated": 980, + "padded": 980, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-sociology|5": { + "hashes": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "828999f7624cbe7e", + "hash_cont_tokens": "c3a3bdfd177eed5b" + }, + "truncated": 0, + "non-truncated": 804, + "padded": 804, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hashes": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "42054621e718dbee", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-virology|5": { + "hashes": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "6c4f0aa4dc859c04", + "hash_cont_tokens": "af8b3658088cb37f" + }, + "truncated": 0, + "non-truncated": 664, + "padded": 664, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-world_religions|5": { + "hashes": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "6c75d44e092ff24f", + "hash_cont_tokens": "060118bef6de4e0a" + }, + "truncated": 0, + "non-truncated": 684, + "padded": 684, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|truthfulqa:mc|0": { + "hashes": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "2738d7ed7075faa7", + "hash_cont_tokens": "f5da56a132aab151" + }, + "truncated": 0, + "non-truncated": 9996, + "padded": 9996, + "non-padded": 0, + "effective_few_shots": 0.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "d84d18e9a963753d", + "hash_full_prompts": "12b540783521a8e6", + "hash_input_tokens": "5c73a7dce6ccf737", + "hash_cont_tokens": "71d56183130fecbd" + }, + "total_evaluation_time_secondes": "24175.409123420715", + "truncated": 0, + "non-truncated": 111019, + "padded": 110926, + "non-padded": 93, + 
"num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/llm-agents/tora-code-34b-v1.0/results_2023-10-29T14-45-33.469419.json b/eval-results/llm-agents/tora-code-34b-v1.0/results_2023-10-29T14-45-33.469419.json new file mode 100644 index 0000000000000000000000000000000000000000..cb46a677dbd5c48143a13277a347a7b076bb9738 --- /dev/null +++ b/eval-results/llm-agents/tora-code-34b-v1.0/results_2023-10-29T14-45-33.469419.json @@ -0,0 +1,107 @@ +{ + "config_general": { + "model_name": "llm-agents/tora-code-34b-v1.0", + "model_sha": "cbb33eea774cc03d4363c424d81e8c9d58332274", + "model_size": "63.23 GB", + "model_dtype": "torch.float16", + "lighteval_sha": "0f318ecf002208468154899217b3ba7c6ae09374", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|drop|3": { + "em": 0.0012583892617449664, + "em_stderr": 0.00036305608931189816, + "f1": 0.04579802852349004, + "f1_stderr": 0.0010433016886932766 + }, + "harness|gsm8k|5": { + "acc": 0.13115996967399546, + "acc_stderr": 0.009298499235587867 + }, + "harness|winogrande|5": { + "acc": 0.6819258089976322, + "acc_stderr": 0.013089285079884681 + }, + "all": { + "em": 0.0012583892617449664, + "em_stderr": 0.00036305608931189816, + "f1": 0.04579802852349004, + "f1_stderr": 0.0010433016886932766, + "acc": 0.40654288933581384, + "acc_stderr": 0.011193892157736274 + } + }, + "versions": { + "harness|drop|3": 1, + "harness|gsm8k|5": 0, + "harness|winogrande|5": 0, + "all": 0 + }, + "config_tasks": { + "harness|drop": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|drop|3": { + "hashes": { + "hash_examples": "1d27416e8324e9a3", + "hash_full_prompts": "a5513ff9a741b385", + "hash_input_tokens": "fbe785c62d941897", + "hash_cont_tokens": "a2e3d8261cadebbd" + }, + "truncated": 0, + "non-truncated": 9536, + "padded": 0, + "non-padded": 9536, + "effective_few_shots": 3.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "bda342e47b5099b2", + "hash_cont_tokens": "ecc13f47e6cfb89c" + }, + "truncated": 0, + "non-truncated": 1319, + "padded": 0, + "non-padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "c0bedf98cb040854", + "hash_cont_tokens": "f08975ad6f2d5864" + }, + "truncated": 0, + "non-truncated": 2534, + "padded": 2432, + "non-padded": 102, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "9b4d8993161e637d", + "hash_full_prompts": "08215e527b7e60a5", + "hash_input_tokens": "9f72a6bc6743d18f", + "hash_cont_tokens": "e773c5fc533e6c39" + }, + "total_evaluation_time_secondes": "34804.35703897476", + "truncated": 0, + "non-truncated": 13389, + "padded": 2432, + "non-padded": 10957, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/llm-agents/tora-code-7b-v1.0/results_2023-10-10T14-12-45.914011.json b/eval-results/llm-agents/tora-code-7b-v1.0/results_2023-10-10T14-12-45.914011.json new file mode 100644 index 0000000000000000000000000000000000000000..908bc73c1cfa33b7a7529cda34b2bf74a709bada --- /dev/null +++ 
b/eval-results/llm-agents/tora-code-7b-v1.0/results_2023-10-10T14-12-45.914011.json @@ -0,0 +1,1367 @@ +{ + "config_general": { + "model_name": "llm-agents/tora-code-7b-v1.0", + "model_sha": "777501b69bb0ba2675abdcaf7b1309ab05320c2e", + "model_size": "12.8 GB", + "model_dtype": "torch.float16", + "lighteval_sha": "0f318ecf002208468154899217b3ba7c6ae09374", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|arc:challenge|25": { + "acc": 0.378839590443686, + "acc_stderr": 0.01417591549000032, + "acc_norm": 0.4069965870307167, + "acc_norm_stderr": 0.014356399418009126 + }, + "harness|hellaswag|10": { + "acc": 0.5027882891854212, + "acc_stderr": 0.004989703824167102, + "acc_norm": 0.6586337382991436, + "acc_norm_stderr": 0.004731989816563668 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.3, + "acc_stderr": 0.04605661864718381, + "acc_norm": 0.3, + "acc_norm_stderr": 0.04605661864718381 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.3037037037037037, + "acc_stderr": 0.03972552884785137, + "acc_norm": 0.3037037037037037, + "acc_norm_stderr": 0.03972552884785137 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.26973684210526316, + "acc_stderr": 0.03611780560284898, + "acc_norm": 0.26973684210526316, + "acc_norm_stderr": 0.03611780560284898 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.46, + "acc_stderr": 0.05009082659620333, + "acc_norm": 0.46, + "acc_norm_stderr": 0.05009082659620333 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.30566037735849055, + "acc_stderr": 0.028353298073322666, + "acc_norm": 0.30566037735849055, + "acc_norm_stderr": 0.028353298073322666 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.3472222222222222, + "acc_stderr": 0.039812405437178615, + "acc_norm": 0.3472222222222222, + "acc_norm_stderr": 0.039812405437178615 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.21, + "acc_stderr": 0.040936018074033256, + "acc_norm": 0.21, + "acc_norm_stderr": 0.040936018074033256 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.32, + "acc_stderr": 0.04688261722621504, + "acc_norm": 0.32, + "acc_norm_stderr": 0.04688261722621504 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.26, + "acc_stderr": 0.04408440022768078, + "acc_norm": 0.26, + "acc_norm_stderr": 0.04408440022768078 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.28901734104046245, + "acc_stderr": 0.034564257450869995, + "acc_norm": 0.28901734104046245, + "acc_norm_stderr": 0.034564257450869995 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.20588235294117646, + "acc_stderr": 0.04023382273617749, + "acc_norm": 0.20588235294117646, + "acc_norm_stderr": 0.04023382273617749 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.5, + "acc_stderr": 0.050251890762960605, + "acc_norm": 0.5, + "acc_norm_stderr": 0.050251890762960605 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.3148936170212766, + "acc_stderr": 0.030363582197238167, + "acc_norm": 0.3148936170212766, + "acc_norm_stderr": 0.030363582197238167 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.2894736842105263, + "acc_stderr": 0.042663394431593935, + "acc_norm": 0.2894736842105263, + "acc_norm_stderr": 0.042663394431593935 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.3103448275862069, + "acc_stderr": 0.03855289616378948, + "acc_norm": 
0.3103448275862069, + "acc_norm_stderr": 0.03855289616378948 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.28835978835978837, + "acc_stderr": 0.0233306540545359, + "acc_norm": 0.28835978835978837, + "acc_norm_stderr": 0.0233306540545359 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.2777777777777778, + "acc_stderr": 0.04006168083848878, + "acc_norm": 0.2777777777777778, + "acc_norm_stderr": 0.04006168083848878 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.36, + "acc_stderr": 0.048241815132442176, + "acc_norm": 0.36, + "acc_norm_stderr": 0.048241815132442176 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.34516129032258064, + "acc_stderr": 0.027045746573534323, + "acc_norm": 0.34516129032258064, + "acc_norm_stderr": 0.027045746573534323 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.22660098522167488, + "acc_stderr": 0.029454863835292975, + "acc_norm": 0.22660098522167488, + "acc_norm_stderr": 0.029454863835292975 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.37, + "acc_stderr": 0.04852365870939099, + "acc_norm": 0.37, + "acc_norm_stderr": 0.04852365870939099 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.48484848484848486, + "acc_stderr": 0.03902551007374448, + "acc_norm": 0.48484848484848486, + "acc_norm_stderr": 0.03902551007374448 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.29797979797979796, + "acc_stderr": 0.032586303838365555, + "acc_norm": 0.29797979797979796, + "acc_norm_stderr": 0.032586303838365555 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.32124352331606215, + "acc_stderr": 0.033699508685490674, + "acc_norm": 0.32124352331606215, + "acc_norm_stderr": 0.033699508685490674 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.30512820512820515, + "acc_stderr": 0.023346335293325887, + "acc_norm": 0.30512820512820515, + "acc_norm_stderr": 0.023346335293325887 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.24444444444444444, + "acc_stderr": 0.02620276653465215, + "acc_norm": 0.24444444444444444, + "acc_norm_stderr": 0.02620276653465215 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.2857142857142857, + "acc_stderr": 0.029344572500634335, + "acc_norm": 0.2857142857142857, + "acc_norm_stderr": 0.029344572500634335 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.2582781456953642, + "acc_stderr": 0.035737053147634576, + "acc_norm": 0.2582781456953642, + "acc_norm_stderr": 0.035737053147634576 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.3504587155963303, + "acc_stderr": 0.020456077599824457, + "acc_norm": 0.3504587155963303, + "acc_norm_stderr": 0.020456077599824457 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.2777777777777778, + "acc_stderr": 0.03054674526495318, + "acc_norm": 0.2777777777777778, + "acc_norm_stderr": 0.03054674526495318 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.37254901960784315, + "acc_stderr": 0.033933885849584046, + "acc_norm": 0.37254901960784315, + "acc_norm_stderr": 0.033933885849584046 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.4978902953586498, + "acc_stderr": 0.032546938018020076, + "acc_norm": 0.4978902953586498, + "acc_norm_stderr": 0.032546938018020076 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.36771300448430494, + "acc_stderr": 
0.03236198350928275, + "acc_norm": 0.36771300448430494, + "acc_norm_stderr": 0.03236198350928275 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.3816793893129771, + "acc_stderr": 0.0426073515764456, + "acc_norm": 0.3816793893129771, + "acc_norm_stderr": 0.0426073515764456 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.33884297520661155, + "acc_stderr": 0.04320767807536669, + "acc_norm": 0.33884297520661155, + "acc_norm_stderr": 0.04320767807536669 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.39814814814814814, + "acc_stderr": 0.047323326159788154, + "acc_norm": 0.39814814814814814, + "acc_norm_stderr": 0.047323326159788154 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.27607361963190186, + "acc_stderr": 0.0351238528370505, + "acc_norm": 0.27607361963190186, + "acc_norm_stderr": 0.0351238528370505 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.22321428571428573, + "acc_stderr": 0.039523019677025116, + "acc_norm": 0.22321428571428573, + "acc_norm_stderr": 0.039523019677025116 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.3300970873786408, + "acc_stderr": 0.0465614711001235, + "acc_norm": 0.3300970873786408, + "acc_norm_stderr": 0.0465614711001235 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.5042735042735043, + "acc_stderr": 0.032754892643821316, + "acc_norm": 0.5042735042735043, + "acc_norm_stderr": 0.032754892643821316 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.4, + "acc_stderr": 0.04923659639173309, + "acc_norm": 0.4, + "acc_norm_stderr": 0.04923659639173309 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.421455938697318, + "acc_stderr": 0.017657976412654857, + "acc_norm": 0.421455938697318, + "acc_norm_stderr": 0.017657976412654857 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.34104046242774566, + "acc_stderr": 0.025522474632121612, + "acc_norm": 0.34104046242774566, + "acc_norm_stderr": 0.025522474632121612 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.23910614525139665, + "acc_stderr": 0.014265554192331144, + "acc_norm": 0.23910614525139665, + "acc_norm_stderr": 0.014265554192331144 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.3562091503267974, + "acc_stderr": 0.02742047766262925, + "acc_norm": 0.3562091503267974, + "acc_norm_stderr": 0.02742047766262925 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.36977491961414793, + "acc_stderr": 0.027417996705631, + "acc_norm": 0.36977491961414793, + "acc_norm_stderr": 0.027417996705631 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.3271604938271605, + "acc_stderr": 0.026105673861409818, + "acc_norm": 0.3271604938271605, + "acc_norm_stderr": 0.026105673861409818 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.2765957446808511, + "acc_stderr": 0.026684564340461004, + "acc_norm": 0.2765957446808511, + "acc_norm_stderr": 0.026684564340461004 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.2926988265971317, + "acc_stderr": 0.011620949195849535, + "acc_norm": 0.2926988265971317, + "acc_norm_stderr": 0.011620949195849535 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.22794117647058823, + "acc_stderr": 0.025483081468029807, + "acc_norm": 0.22794117647058823, + "acc_norm_stderr": 0.025483081468029807 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.3055555555555556, + "acc_stderr": 0.018635594034423976, + "acc_norm": 0.3055555555555556, + "acc_norm_stderr": 
0.018635594034423976 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.39090909090909093, + "acc_stderr": 0.046737523336702363, + "acc_norm": 0.39090909090909093, + "acc_norm_stderr": 0.046737523336702363 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.4, + "acc_stderr": 0.03136250240935893, + "acc_norm": 0.4, + "acc_norm_stderr": 0.03136250240935893 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.4129353233830846, + "acc_stderr": 0.034815208033673474, + "acc_norm": 0.4129353233830846, + "acc_norm_stderr": 0.034815208033673474 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.4, + "acc_stderr": 0.049236596391733084, + "acc_norm": 0.4, + "acc_norm_stderr": 0.049236596391733084 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.3433734939759036, + "acc_stderr": 0.036965843170106004, + "acc_norm": 0.3433734939759036, + "acc_norm_stderr": 0.036965843170106004 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.4269005847953216, + "acc_stderr": 0.03793620616529917, + "acc_norm": 0.4269005847953216, + "acc_norm_stderr": 0.03793620616529917 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.22276621787025705, + "mc1_stderr": 0.014566506961396743, + "mc2": 0.3484016861988524, + "mc2_stderr": 0.014498737856096499 + }, + "all": { + "acc": 0.3370080390784741, + "acc_stderr": 0.03404249985969968, + "acc_norm": 0.34012672459882587, + "acc_norm_stderr": 0.03404119087529977, + "mc1": 0.22276621787025705, + "mc1_stderr": 0.014566506961396743, + "mc2": 0.3484016861988524, + "mc2_stderr": 0.014498737856096499 + } + }, + "versions": { + "harness|arc:challenge|25": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + 
"harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "all": 0 + }, + "config_tasks": { + "harness|arc:challenge": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + 
"harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task" + }, + "summary_tasks": { + "harness|arc:challenge|25": { + "hashes": { + "hash_examples": "17b0cae357c0259e", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "3722289b79076c44", + "hash_cont_tokens": "e8abf848493b50f7" + }, + "truncated": 0, + "non-truncated": 4687, + "padded": 4687, + "non-padded": 0, + "effective_few_shots": 25.0, + "num_truncated_few_shots": 0 + }, + "harness|hellaswag|10": { + "hashes": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "ececd684171f1ef2", + "hash_cont_tokens": "9fe0a5c42e1532db" + }, + "truncated": 0, + "non-truncated": 40168, + "padded": 40113, + "non-padded": 55, + "effective_few_shots": 10.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hashes": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "c54ff61ad0273dd7", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-anatomy|5": { + "hashes": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "be31a1e22aef5f90", + "hash_cont_tokens": "f11971a765cb609f" + }, + "truncated": 0, + "non-truncated": 540, + "padded": 540, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-astronomy|5": { + "hashes": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "277a7b1fad566940", + "hash_cont_tokens": "440a970fadecdc7b" + }, + "truncated": 0, + "non-truncated": 608, + "padded": 608, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-business_ethics|5": { + "hashes": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "ba552605bc116de5", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + 
"num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hashes": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "428c7563d0b98ab9", + "hash_cont_tokens": "7ecd60c25b9bfe5b" + }, + "truncated": 0, + "non-truncated": 1060, + "padded": 1060, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_biology|5": { + "hashes": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "da036601573942e2", + "hash_cont_tokens": "875cde3af7a0ee14" + }, + "truncated": 0, + "non-truncated": 576, + "padded": 576, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_chemistry|5": { + "hashes": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "94e0196d6aded13d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_computer_science|5": { + "hashes": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "6e4d0f4a8d36690b", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_mathematics|5": { + "hashes": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "614054d17109a25d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_medicine|5": { + "hashes": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "081bb2b524defd1c", + "hash_cont_tokens": "702fb6d82ff0d6ac" + }, + "truncated": 0, + "non-truncated": 692, + "padded": 692, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_physics|5": { + "hashes": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "5421d9a1af86cbd4", + "hash_cont_tokens": "f7b8097afc16a47c" + }, + "truncated": 0, + "non-truncated": 408, + "padded": 408, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-computer_security|5": { + "hashes": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "5e6b70ecb333cf18", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hashes": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "c2ef11a87264ceed", + "hash_cont_tokens": "aa0e8bc655f2f641" + }, + "truncated": 0, + "non-truncated": 940, + "padded": 940, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-econometrics|5": { + "hashes": { + "hash_examples": "64d3623b0bfaa43f", + 
"hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "ecaccd912a4c3978", + "hash_cont_tokens": "b1cc6e7e9fcd3827" + }, + "truncated": 0, + "non-truncated": 456, + "padded": 456, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hashes": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "1590c84291399be8", + "hash_cont_tokens": "2425a3f084a591ef" + }, + "truncated": 0, + "non-truncated": 580, + "padded": 580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hashes": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "3269597f715b0da1", + "hash_cont_tokens": "bd87bf0c060fd925" + }, + "truncated": 0, + "non-truncated": 1512, + "padded": 1512, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-formal_logic|5": { + "hashes": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "a2800d20f3ab8d7c", + "hash_cont_tokens": "eb8932890e0605db" + }, + "truncated": 0, + "non-truncated": 504, + "padded": 504, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-global_facts|5": { + "hashes": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "94ed44b3772505ad", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_biology|5": { + "hashes": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "24423acb928db768", + "hash_cont_tokens": "1ddcb86d28cde266" + }, + "truncated": 0, + "non-truncated": 1240, + "padded": 1240, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hashes": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "831ff35c474e5cef", + "hash_cont_tokens": "176c8dcff38c5f8f" + }, + "truncated": 0, + "non-truncated": 812, + "padded": 812, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hashes": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "a20a96b44dcc5b30", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hashes": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "5002f4ac8b1562ca", + "hash_cont_tokens": "674fc454bdc5ac93" + }, + "truncated": 0, + "non-truncated": 660, + "padded": 656, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_geography|5": { + "hashes": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "7c5547c7da5bc793", + "hash_cont_tokens": 
"03a5012b916274ea" + }, + "truncated": 0, + "non-truncated": 792, + "padded": 792, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hashes": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "f62991cb6a496b05", + "hash_cont_tokens": "873d2aab226ba1d8" + }, + "truncated": 0, + "non-truncated": 772, + "padded": 772, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hashes": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "4cef2aff6e3d59ed", + "hash_cont_tokens": "c583432ad27fcfe0" + }, + "truncated": 0, + "non-truncated": 1560, + "padded": 1560, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hashes": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "6e2577ea4082ed2b", + "hash_cont_tokens": "d7907b61bcb8c123" + }, + "truncated": 0, + "non-truncated": 1080, + "padded": 1080, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hashes": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "c5fc9aeb1079c8e4", + "hash_cont_tokens": "f47f041de50333b9" + }, + "truncated": 0, + "non-truncated": 952, + "padded": 952, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_physics|5": { + "hashes": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "555fc385cffa84ca", + "hash_cont_tokens": "0d56317b3e5eedb5" + }, + "truncated": 0, + "non-truncated": 604, + "padded": 604, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hashes": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "febd23cbf9973b7f", + "hash_cont_tokens": "09ba1243e7390c0f" + }, + "truncated": 0, + "non-truncated": 2180, + "padded": 2180, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hashes": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "400e55b56ee6fbd7", + "hash_cont_tokens": "9cc29889c3d3f77d" + }, + "truncated": 0, + "non-truncated": 864, + "padded": 864, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hashes": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "c639cce12a46ebad", + "hash_cont_tokens": "cdd0b3dc06d933e5" + }, + "truncated": 0, + "non-truncated": 816, + "padded": 816, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hashes": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "b9762065cce6f3a6", + "hash_cont_tokens": "e02816433ff28daf" + }, + "truncated": 0, + "non-truncated": 948, + "padded": 
948, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_aging|5": { + "hashes": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "541a75f071dcf579", + "hash_cont_tokens": "142a4a8a1138a214" + }, + "truncated": 0, + "non-truncated": 892, + "padded": 892, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_sexuality|5": { + "hashes": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "04269e5c5a257dd9", + "hash_cont_tokens": "bc54813e809b796d" + }, + "truncated": 0, + "non-truncated": 524, + "padded": 524, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-international_law|5": { + "hashes": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "d93ba9d9d38e4397", + "hash_cont_tokens": "8ea8c5ff76a15bca" + }, + "truncated": 0, + "non-truncated": 484, + "padded": 484, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-jurisprudence|5": { + "hashes": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "9eeaccd2698b4f5a", + "hash_cont_tokens": "e3a8cd951b6e3469" + }, + "truncated": 0, + "non-truncated": 432, + "padded": 432, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hashes": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "b4f08f544f2b7576", + "hash_cont_tokens": "3e9e0bdc248fd88a" + }, + "truncated": 0, + "non-truncated": 652, + "padded": 648, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-machine_learning|5": { + "hashes": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "900c2a51f1174b9f", + "hash_cont_tokens": "55b12fb138c6a064" + }, + "truncated": 0, + "non-truncated": 448, + "padded": 448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-management|5": { + "hashes": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "6b36efb4689c6eca", + "hash_cont_tokens": "a01d6d39a83c4597" + }, + "truncated": 0, + "non-truncated": 412, + "padded": 412, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-marketing|5": { + "hashes": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "2aaac78a0cfed47a", + "hash_cont_tokens": "6aeaed4d823c98aa" + }, + "truncated": 0, + "non-truncated": 936, + "padded": 936, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-medical_genetics|5": { + "hashes": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "886ca823b41c094a", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-miscellaneous|5": { + "hashes": { + "hash_examples": 
"41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "72fd71de7675e7d0", + "hash_cont_tokens": "9b0ab02a64603081" + }, + "truncated": 0, + "non-truncated": 3132, + "padded": 3132, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_disputes|5": { + "hashes": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "f3ca0dd8e7a1eb09", + "hash_cont_tokens": "3b8bbe9108e55ce9" + }, + "truncated": 0, + "non-truncated": 1384, + "padded": 1354, + "non-padded": 30, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hashes": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "3e793631e951f23c", + "hash_cont_tokens": "3e9bfc0362e97330" + }, + "truncated": 0, + "non-truncated": 3580, + "padded": 3580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-nutrition|5": { + "hashes": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "59753c2144ea93af", + "hash_cont_tokens": "23b2dc6ee2da4cfc" + }, + "truncated": 0, + "non-truncated": 1224, + "padded": 1224, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-philosophy|5": { + "hashes": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "bd8d3dbed15a8c34", + "hash_cont_tokens": "9f6ff69d23a48783" + }, + "truncated": 0, + "non-truncated": 1244, + "padded": 1244, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-prehistory|5": { + "hashes": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "3573cd87facbb7c5", + "hash_cont_tokens": "d6458d743d875837" + }, + "truncated": 0, + "non-truncated": 1296, + "padded": 1296, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_accounting|5": { + "hashes": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "17e721bc1a7cbb47", + "hash_cont_tokens": "922a195f53a35662" + }, + "truncated": 0, + "non-truncated": 1128, + "padded": 1128, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_law|5": { + "hashes": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "c9f7583fff66d361", + "hash_cont_tokens": "2e590029ef41fbcd" + }, + "truncated": 0, + "non-truncated": 6136, + "padded": 6136, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_medicine|5": { + "hashes": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "40a933f829116f8d", + "hash_cont_tokens": "7cfee54dbddd5a98" + }, + "truncated": 0, + "non-truncated": 1088, + "padded": 1088, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_psychology|5": { + "hashes": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "0dfb73a8eb3f692c", + "hash_cont_tokens": 
"a86677b2a45c20e1" + }, + "truncated": 0, + "non-truncated": 2448, + "padded": 2448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-public_relations|5": { + "hashes": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "1710c6ba4c9f3cbd", + "hash_cont_tokens": "0d756ccaae031757" + }, + "truncated": 0, + "non-truncated": 440, + "padded": 440, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-security_studies|5": { + "hashes": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "32a03f1f22a6e103", + "hash_cont_tokens": "b2229bc2cfbf594b" + }, + "truncated": 0, + "non-truncated": 980, + "padded": 980, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-sociology|5": { + "hashes": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "828999f7624cbe7e", + "hash_cont_tokens": "c3a3bdfd177eed5b" + }, + "truncated": 0, + "non-truncated": 804, + "padded": 804, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hashes": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "42054621e718dbee", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-virology|5": { + "hashes": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "6c4f0aa4dc859c04", + "hash_cont_tokens": "af8b3658088cb37f" + }, + "truncated": 0, + "non-truncated": 664, + "padded": 664, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-world_religions|5": { + "hashes": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "6c75d44e092ff24f", + "hash_cont_tokens": "060118bef6de4e0a" + }, + "truncated": 0, + "non-truncated": 684, + "padded": 684, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|truthfulqa:mc|0": { + "hashes": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "2738d7ed7075faa7", + "hash_cont_tokens": "f5da56a132aab151" + }, + "truncated": 0, + "non-truncated": 9996, + "padded": 9996, + "non-padded": 0, + "effective_few_shots": 0.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "d84d18e9a963753d", + "hash_full_prompts": "12b540783521a8e6", + "hash_input_tokens": "5c73a7dce6ccf737", + "hash_cont_tokens": "71d56183130fecbd" + }, + "total_evaluation_time_secondes": "4246.858571052551", + "truncated": 0, + "non-truncated": 111019, + "padded": 110926, + "non-padded": 93, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/llm-agents/tora-code-7b-v1.0/results_2023-10-28T11-50-58.128612.json b/eval-results/llm-agents/tora-code-7b-v1.0/results_2023-10-28T11-50-58.128612.json new file mode 100644 index 0000000000000000000000000000000000000000..c33696a6a9921f50d1ce231623dd912032116366 --- /dev/null +++ 
b/eval-results/llm-agents/tora-code-7b-v1.0/results_2023-10-28T11-50-58.128612.json @@ -0,0 +1,107 @@ +{ + "config_general": { + "model_name": "llm-agents/tora-code-7b-v1.0", + "model_sha": "777501b69bb0ba2675abdcaf7b1309ab05320c2e", + "model_size": "12.8 GB", + "model_dtype": "torch.float16", + "lighteval_sha": "0f318ecf002208468154899217b3ba7c6ae09374", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|drop|3": { + "em": 0.0010486577181208054, + "em_stderr": 0.00033145814652192884, + "f1": 0.04895343959731551, + "f1_stderr": 0.0011757746481772687 + }, + "harness|gsm8k|5": { + "acc": 0.04927975739196361, + "acc_stderr": 0.005962150655812473 + }, + "harness|winogrande|5": { + "acc": 0.6156274664561957, + "acc_stderr": 0.013671567600836194 + }, + "all": { + "em": 0.0010486577181208054, + "em_stderr": 0.00033145814652192884, + "f1": 0.04895343959731551, + "f1_stderr": 0.0011757746481772687, + "acc": 0.33245361192407963, + "acc_stderr": 0.009816859128324334 + } + }, + "versions": { + "harness|drop|3": 1, + "harness|gsm8k|5": 0, + "harness|winogrande|5": 0, + "all": 0 + }, + "config_tasks": { + "harness|drop": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|drop|3": { + "hashes": { + "hash_examples": "1d27416e8324e9a3", + "hash_full_prompts": "a5513ff9a741b385", + "hash_input_tokens": "fbe785c62d941897", + "hash_cont_tokens": "f408589cd6912259" + }, + "truncated": 0, + "non-truncated": 9536, + "padded": 0, + "non-padded": 9536, + "effective_few_shots": 3.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "bda342e47b5099b2", + "hash_cont_tokens": "aebd21abde351377" + }, + "truncated": 0, + "non-truncated": 1319, + "padded": 0, + "non-padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "c0bedf98cb040854", + "hash_cont_tokens": "f08975ad6f2d5864" + }, + "truncated": 0, + "non-truncated": 2534, + "padded": 2432, + "non-padded": 102, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "9b4d8993161e637d", + "hash_full_prompts": "08215e527b7e60a5", + "hash_input_tokens": "9f72a6bc6743d18f", + "hash_cont_tokens": "7fb763c6234a2b4c" + }, + "total_evaluation_time_secondes": "9477.29330420494", + "truncated": 0, + "non-truncated": 13389, + "padded": 2432, + "non-padded": 10957, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/medalpaca/medalpaca-7b/results_2023-07-19T16-30-25.304813.json b/eval-results/medalpaca/medalpaca-7b/results_2023-07-19T16-30-25.304813.json new file mode 100644 index 0000000000000000000000000000000000000000..ecd6be89d4e8e56731fd70b0d5d5d5fd5a7be67e --- /dev/null +++ b/eval-results/medalpaca/medalpaca-7b/results_2023-07-19T16-30-25.304813.json @@ -0,0 +1,871 @@ +{ + "results": { + "harness|arc:challenge|25": { + "acc": 0.48976109215017066, + "acc_stderr": 0.014608326906285019, + "acc_norm": 0.5409556313993175, + "acc_norm_stderr": 0.01456229107360123 + }, + "harness|hellaswag|10": { + "acc": 0.6155148376817368, + "acc_stderr": 0.004854791378656995, + "acc_norm": 
0.8042222664807808, + "acc_norm_stderr": 0.003959872578165267 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.26, + "acc_stderr": 0.044084400227680814, + "acc_norm": 0.26, + "acc_norm_stderr": 0.044084400227680814 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.4962962962962963, + "acc_stderr": 0.04319223625811331, + "acc_norm": 0.4962962962962963, + "acc_norm_stderr": 0.04319223625811331 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.2894736842105263, + "acc_stderr": 0.03690677986137282, + "acc_norm": 0.2894736842105263, + "acc_norm_stderr": 0.03690677986137282 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.42, + "acc_stderr": 0.049604496374885836, + "acc_norm": 0.42, + "acc_norm_stderr": 0.049604496374885836 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.4716981132075472, + "acc_stderr": 0.0307235352490061, + "acc_norm": 0.4716981132075472, + "acc_norm_stderr": 0.0307235352490061 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.4513888888888889, + "acc_stderr": 0.04161402398403279, + "acc_norm": 0.4513888888888889, + "acc_norm_stderr": 0.04161402398403279 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.27, + "acc_stderr": 0.0446196043338474, + "acc_norm": 0.27, + "acc_norm_stderr": 0.0446196043338474 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.21, + "acc_stderr": 0.040936018074033256, + "acc_norm": 0.21, + "acc_norm_stderr": 0.040936018074033256 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.23, + "acc_stderr": 0.042295258468165065, + "acc_norm": 0.23, + "acc_norm_stderr": 0.042295258468165065 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.42196531791907516, + "acc_stderr": 0.0376574669386515, + "acc_norm": 0.42196531791907516, + "acc_norm_stderr": 0.0376574669386515 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.27450980392156865, + "acc_stderr": 0.04440521906179328, + "acc_norm": 0.27450980392156865, + "acc_norm_stderr": 0.04440521906179328 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.48, + "acc_stderr": 0.050211673156867795, + "acc_norm": 0.48, + "acc_norm_stderr": 0.050211673156867795 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.4127659574468085, + "acc_stderr": 0.03218471141400352, + "acc_norm": 0.4127659574468085, + "acc_norm_stderr": 0.03218471141400352 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.23684210526315788, + "acc_stderr": 0.039994238792813365, + "acc_norm": 0.23684210526315788, + "acc_norm_stderr": 0.039994238792813365 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.35172413793103446, + "acc_stderr": 0.0397923663749741, + "acc_norm": 0.35172413793103446, + "acc_norm_stderr": 0.0397923663749741 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.24867724867724866, + "acc_stderr": 0.02226181769240018, + "acc_norm": 0.24867724867724866, + "acc_norm_stderr": 0.02226181769240018 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.19047619047619047, + "acc_stderr": 0.035122074123020514, + "acc_norm": 0.19047619047619047, + "acc_norm_stderr": 0.035122074123020514 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.34, + "acc_stderr": 0.047609522856952365, + "acc_norm": 0.34, + "acc_norm_stderr": 0.047609522856952365 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.5032258064516129, + "acc_stderr": 0.02844341422643833, + "acc_norm": 0.5032258064516129, + "acc_norm_stderr": 
0.02844341422643833 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.3399014778325123, + "acc_stderr": 0.033327690684107895, + "acc_norm": 0.3399014778325123, + "acc_norm_stderr": 0.033327690684107895 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.38, + "acc_stderr": 0.048783173121456316, + "acc_norm": 0.38, + "acc_norm_stderr": 0.048783173121456316 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.6060606060606061, + "acc_stderr": 0.038154943086889305, + "acc_norm": 0.6060606060606061, + "acc_norm_stderr": 0.038154943086889305 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.35858585858585856, + "acc_stderr": 0.03416903640391521, + "acc_norm": 0.35858585858585856, + "acc_norm_stderr": 0.03416903640391521 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.5129533678756477, + "acc_stderr": 0.036072280610477486, + "acc_norm": 0.5129533678756477, + "acc_norm_stderr": 0.036072280610477486 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.32564102564102565, + "acc_stderr": 0.02375966576741229, + "acc_norm": 0.32564102564102565, + "acc_norm_stderr": 0.02375966576741229 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.23333333333333334, + "acc_stderr": 0.025787874220959333, + "acc_norm": 0.23333333333333334, + "acc_norm_stderr": 0.025787874220959333 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.3067226890756303, + "acc_stderr": 0.029953823891887037, + "acc_norm": 0.3067226890756303, + "acc_norm_stderr": 0.029953823891887037 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.25165562913907286, + "acc_stderr": 0.035433042343899844, + "acc_norm": 0.25165562913907286, + "acc_norm_stderr": 0.035433042343899844 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.6275229357798165, + "acc_stderr": 0.020728368457638497, + "acc_norm": 0.6275229357798165, + "acc_norm_stderr": 0.020728368457638497 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.21296296296296297, + "acc_stderr": 0.027920963147993662, + "acc_norm": 0.21296296296296297, + "acc_norm_stderr": 0.027920963147993662 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.6127450980392157, + "acc_stderr": 0.03418931233833344, + "acc_norm": 0.6127450980392157, + "acc_norm_stderr": 0.03418931233833344 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.5780590717299579, + "acc_stderr": 0.032148146302403695, + "acc_norm": 0.5780590717299579, + "acc_norm_stderr": 0.032148146302403695 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.5381165919282511, + "acc_stderr": 0.033460150119732274, + "acc_norm": 0.5381165919282511, + "acc_norm_stderr": 0.033460150119732274 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.5419847328244275, + "acc_stderr": 0.04369802690578756, + "acc_norm": 0.5419847328244275, + "acc_norm_stderr": 0.04369802690578756 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.5619834710743802, + "acc_stderr": 0.04529146804435792, + "acc_norm": 0.5619834710743802, + "acc_norm_stderr": 0.04529146804435792 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.46296296296296297, + "acc_stderr": 0.04820403072760627, + "acc_norm": 0.46296296296296297, + "acc_norm_stderr": 0.04820403072760627 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.4294478527607362, + "acc_stderr": 0.038890666191127216, + 
"acc_norm": 0.4294478527607362, + "acc_norm_stderr": 0.038890666191127216 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.3392857142857143, + "acc_stderr": 0.044939490686135376, + "acc_norm": 0.3392857142857143, + "acc_norm_stderr": 0.044939490686135376 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.4077669902912621, + "acc_stderr": 0.048657775704107675, + "acc_norm": 0.4077669902912621, + "acc_norm_stderr": 0.048657775704107675 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.6196581196581197, + "acc_stderr": 0.03180425204384099, + "acc_norm": 0.6196581196581197, + "acc_norm_stderr": 0.03180425204384099 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.56, + "acc_stderr": 0.04988876515698589, + "acc_norm": 0.56, + "acc_norm_stderr": 0.04988876515698589 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.5504469987228607, + "acc_stderr": 0.017788725283507337, + "acc_norm": 0.5504469987228607, + "acc_norm_stderr": 0.017788725283507337 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.430635838150289, + "acc_stderr": 0.026658800273672373, + "acc_norm": 0.430635838150289, + "acc_norm_stderr": 0.026658800273672373 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.2424581005586592, + "acc_stderr": 0.014333522059217889, + "acc_norm": 0.2424581005586592, + "acc_norm_stderr": 0.014333522059217889 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.5196078431372549, + "acc_stderr": 0.028607893699576066, + "acc_norm": 0.5196078431372549, + "acc_norm_stderr": 0.028607893699576066 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.4340836012861736, + "acc_stderr": 0.028150232244535608, + "acc_norm": 0.4340836012861736, + "acc_norm_stderr": 0.028150232244535608 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.4444444444444444, + "acc_stderr": 0.02764847787741332, + "acc_norm": 0.4444444444444444, + "acc_norm_stderr": 0.02764847787741332 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.31560283687943264, + "acc_stderr": 0.027724989449509314, + "acc_norm": 0.31560283687943264, + "acc_norm_stderr": 0.027724989449509314 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.3428943937418514, + "acc_stderr": 0.012123463271585895, + "acc_norm": 0.3428943937418514, + "acc_norm_stderr": 0.012123463271585895 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.5992647058823529, + "acc_stderr": 0.029768263528933105, + "acc_norm": 0.5992647058823529, + "acc_norm_stderr": 0.029768263528933105 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.5, + "acc_stderr": 0.020227834851568375, + "acc_norm": 0.5, + "acc_norm_stderr": 0.020227834851568375 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.509090909090909, + "acc_stderr": 0.04788339768702861, + "acc_norm": 0.509090909090909, + "acc_norm_stderr": 0.04788339768702861 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.3877551020408163, + "acc_stderr": 0.031192230726795656, + "acc_norm": 0.3877551020408163, + "acc_norm_stderr": 0.031192230726795656 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.4577114427860697, + "acc_stderr": 0.035228658640995975, + "acc_norm": 0.4577114427860697, + "acc_norm_stderr": 0.035228658640995975 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.61, + "acc_stderr": 0.04902071300001974, + "acc_norm": 0.61, + "acc_norm_stderr": 0.04902071300001974 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.4939759036144578, + 
"acc_stderr": 0.03892212195333045, + "acc_norm": 0.4939759036144578, + "acc_norm_stderr": 0.03892212195333045 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.4327485380116959, + "acc_stderr": 0.03799978644370608, + "acc_norm": 0.4327485380116959, + "acc_norm_stderr": 0.03799978644370608 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.25703794369645044, + "mc1_stderr": 0.015298077509485076, + "mc2": 0.4046224421319521, + "mc2_stderr": 0.015012572023050848 + }, + "all": { + "acc": 0.4193625530628919, + "acc_stderr": 0.03474006835088891, + "acc_norm": 0.42342868811455614, + "acc_norm_stderr": 0.034724119967275764, + "mc1": 0.25703794369645044, + "mc1_stderr": 0.015298077509485076, + "mc2": 0.4046224421319521, + "mc2_stderr": 0.015012572023050848 + } + }, + "versions": { + "harness|arc:challenge|25": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + 
"harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "all": 0 + }, + "config": { + "model_name": "medalpaca/medalpaca-7b", + "model_sha": "b57b9f5ff34059e485b769973d023021fc66a8f7", + "model_dtype": "torch.float16", + "lighteval_sha": "43cff840721bd0214adb4e29236a5e2ca1813937", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null + }, + "task_config": { + "harness|arc:challenge": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM 
Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task" + }, + "hashes": { + "harness|arc:challenge|25": { + "hash_examples": "fb8c51b1872daeda", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "4d41ef08f7f15a87", + "hash_cont_tokens": "8210decc6ff6f7df" + }, + "harness|hellaswag|10": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "e5cdbeaabd59fe25", + "hash_cont_tokens": "b3b9e9017afa63af" + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "f761c98a583630b0", + "hash_cont_tokens": "50421e30bef398f9" + }, + "harness|hendrycksTest-anatomy|5": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "80e0c48ccffa00ed", + "hash_cont_tokens": "f11971a765cb609f" + }, + "harness|hendrycksTest-astronomy|5": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "2b329c6ca67607dc", + "hash_cont_tokens": "bf30e5d3f48250cb" + }, + "harness|hendrycksTest-business_ethics|5": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "bde478206654aa12", + "hash_cont_tokens": "bc1dd9b2d995eb61" + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "de5e1db4f0637b77", + "hash_cont_tokens": "890a119624b3b935" + }, + "harness|hendrycksTest-college_biology|5": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "0150fa239f4db10c", + "hash_cont_tokens": "875cde3af7a0ee14" + }, + "harness|hendrycksTest-college_chemistry|5": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "0bee7f47bee63c79", + "hash_cont_tokens": "50421e30bef398f9" + }, + "harness|hendrycksTest-college_computer_science|5": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "e718ffe7615f023a", + "hash_cont_tokens": "ffc0fe414cdc4a83" + }, + "harness|hendrycksTest-college_mathematics|5": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "786c85630e928fad", + "hash_cont_tokens": "50421e30bef398f9" + }, + "harness|hendrycksTest-college_medicine|5": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "3c2f62f5c1fe6a2e", + "hash_cont_tokens": "1f88b00d41957d82" + }, + "harness|hendrycksTest-college_physics|5": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + 
"hash_input_tokens": "f4deb8123d2bfd4a", + "hash_cont_tokens": "f7b8097afc16a47c" + }, + "harness|hendrycksTest-computer_security|5": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "c0e1e1b475ae50f7", + "hash_cont_tokens": "50421e30bef398f9" + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "42a3769ad3670df3", + "hash_cont_tokens": "aa0e8bc655f2f641" + }, + "harness|hendrycksTest-econometrics|5": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "907d28b129d51d56", + "hash_cont_tokens": "bfb7e3c3c88313f1" + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "e456dc04a081add9", + "hash_cont_tokens": "2425a3f084a591ef" + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "c1ff30907d03d949", + "hash_cont_tokens": "f52691aef15a407b" + }, + "harness|hendrycksTest-formal_logic|5": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "b83e70701038ee89", + "hash_cont_tokens": "f515d598d9c21263" + }, + "harness|hendrycksTest-global_facts|5": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "690d1342b56ec0c0", + "hash_cont_tokens": "50421e30bef398f9" + }, + "harness|hendrycksTest-high_school_biology|5": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "fd7dcca51bc36ed3", + "hash_cont_tokens": "bd85a4156a3613ee" + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "73042d776e504db9", + "hash_cont_tokens": "a95c97af1c14e068" + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "ae788fb4bea00ecf", + "hash_cont_tokens": "8abfedef914e33c9" + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "f1f73dd687da18d7", + "hash_cont_tokens": "674fc454bdc5ac93" + }, + "harness|hendrycksTest-high_school_geography|5": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "58b516ecb02f2e40", + "hash_cont_tokens": "03a5012b916274ea" + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "0e66256865617a7e", + "hash_cont_tokens": "a83effb8f76b7d7c" + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "ef84b9228c4a8a14", + "hash_cont_tokens": "c583432ad27fcfe0" + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "21e09eeedb3bd8d1", + "hash_cont_tokens": "24f5dc613660300b" + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hash_examples": "ead6a0f2f6c83370", + 
"hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "c5c67d84b5de728e", + "hash_cont_tokens": "f47f041de50333b9" + }, + "harness|hendrycksTest-high_school_physics|5": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "a8adfaec234b6241", + "hash_cont_tokens": "ba2efcd283e938cc" + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "20cb425a37d2f8a3", + "hash_cont_tokens": "942069cd363844d9" + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "083b9ab4a12ec287", + "hash_cont_tokens": "955ed42b6f7fa019" + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "50c9ff438c85a69e", + "hash_cont_tokens": "cdd0b3dc06d933e5" + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "ec8bbb3a1fd686ab", + "hash_cont_tokens": "9a864184946033ac" + }, + "harness|hendrycksTest-human_aging|5": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "3913b529e3c7d97c", + "hash_cont_tokens": "142a4a8a1138a214" + }, + "harness|hendrycksTest-human_sexuality|5": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "8dd5a70368b6dbaf", + "hash_cont_tokens": "bc54813e809b796d" + }, + "harness|hendrycksTest-international_law|5": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "a3507093fa9ca1ac", + "hash_cont_tokens": "dc45b45fcda18e5d" + }, + "harness|hendrycksTest-jurisprudence|5": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "808f16991ac2375a", + "hash_cont_tokens": "e3a8cd951b6e3469" + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "8f047748092d60d7", + "hash_cont_tokens": "1e80dbd30f6453d5" + }, + "harness|hendrycksTest-machine_learning|5": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "93059d5f2bb285ab", + "hash_cont_tokens": "9b37da7777378ca9" + }, + "harness|hendrycksTest-management|5": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "f252fe7542a09428", + "hash_cont_tokens": "a01d6d39a83c4597" + }, + "harness|hendrycksTest-marketing|5": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "af226d6094825afe", + "hash_cont_tokens": "6aeaed4d823c98aa" + }, + "harness|hendrycksTest-medical_genetics|5": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "fe5ba86eaf1086de", + "hash_cont_tokens": "50421e30bef398f9" + }, + "harness|hendrycksTest-miscellaneous|5": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "475603bced2f3d7f", + "hash_cont_tokens": "9b0ab02a64603081" + }, + "harness|hendrycksTest-moral_disputes|5": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + 
"hash_input_tokens": "418f52434a830ea3", + "hash_cont_tokens": "8badf768f7b0467a" + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "7faa6a56dccc3884", + "hash_cont_tokens": "32ae620376b2bbba" + }, + "harness|hendrycksTest-nutrition|5": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "3ed3790cd29fec29", + "hash_cont_tokens": "3071def75bacc404" + }, + "harness|hendrycksTest-philosophy|5": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "fc91e4b12993bad0", + "hash_cont_tokens": "9f6ff69d23a48783" + }, + "harness|hendrycksTest-prehistory|5": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "39719c3315549563", + "hash_cont_tokens": "de469d2b981e32a3" + }, + "harness|hendrycksTest-professional_accounting|5": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "be1b34046d49026f", + "hash_cont_tokens": "c46f74d2dfc7b13b" + }, + "harness|hendrycksTest-professional_law|5": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "df4a6015393b6489", + "hash_cont_tokens": "2e590029ef41fbcd" + }, + "harness|hendrycksTest-professional_medicine|5": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "31b608e60b8f9a26", + "hash_cont_tokens": "fe35cfa9c6ca802e" + }, + "harness|hendrycksTest-professional_psychology|5": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "f04dcc40aa61dff1", + "hash_cont_tokens": "f020fbddf72c8652" + }, + "harness|hendrycksTest-public_relations|5": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "924078ed7bcb5027", + "hash_cont_tokens": "568f585a259965c1" + }, + "harness|hendrycksTest-security_studies|5": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "34ef07b913df3c85", + "hash_cont_tokens": "cc6fd7cccd64cd5d" + }, + "harness|hendrycksTest-sociology|5": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "7910870d0c39ccf4", + "hash_cont_tokens": "c3a3bdfd177eed5b" + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "3acc80abbdef09b7", + "hash_cont_tokens": "2568d0e8e36fa959" + }, + "harness|hendrycksTest-virology|5": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "7a0a1a457d1b44b3", + "hash_cont_tokens": "926cf60b0891f374" + }, + "harness|hendrycksTest-world_religions|5": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "fdd0a4eda46435e3", + "hash_cont_tokens": "c525a5de974c1ea3" + }, + "harness|truthfulqa:mc|0": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "e8d34404f8d50781", + "hash_cont_tokens": "c014154380b74b9e" + } + } +} \ No newline at end of file diff --git a/eval-results/medalpaca/medalpaca-7b/results_2023-10-13T02-37-55.174881.json b/eval-results/medalpaca/medalpaca-7b/results_2023-10-13T02-37-55.174881.json new file mode 
100644 index 0000000000000000000000000000000000000000..fc7872983db01dcb67c0980a95c6ccef1c154053 --- /dev/null +++ b/eval-results/medalpaca/medalpaca-7b/results_2023-10-13T02-37-55.174881.json @@ -0,0 +1,107 @@ +{ + "config_general": { + "model_name": "medalpaca/medalpaca-7b", + "model_sha": "b57b9f5ff34059e485b769973d023021fc66a8f7", + "model_size": "12.58 GB", + "model_dtype": "torch.float16", + "lighteval_sha": "0f318ecf002208468154899217b3ba7c6ae09374", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|drop|3": { + "em": 0.1761744966442953, + "em_stderr": 0.003901474629801755, + "f1": 0.24214345637583887, + "f1_stderr": 0.003972046949089224 + }, + "harness|gsm8k|5": { + "acc": 0.030326004548900682, + "acc_stderr": 0.004723487465514772 + }, + "harness|winogrande|5": { + "acc": 0.7119179163378059, + "acc_stderr": 0.012727884724248115 + }, + "all": { + "em": 0.1761744966442953, + "em_stderr": 0.003901474629801755, + "f1": 0.24214345637583887, + "f1_stderr": 0.003972046949089224, + "acc": 0.37112196044335327, + "acc_stderr": 0.008725686094881443 + } + }, + "versions": { + "harness|drop|3": 1, + "harness|gsm8k|5": 0, + "harness|winogrande|5": 0, + "all": 0 + }, + "config_tasks": { + "harness|drop": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|drop|3": { + "hashes": { + "hash_examples": "1d27416e8324e9a3", + "hash_full_prompts": "a5513ff9a741b385", + "hash_input_tokens": "afa3f956b5946008", + "hash_cont_tokens": "b9f6431c5af3ed59" + }, + "truncated": 1263, + "non-truncated": 8273, + "padded": 0, + "non-padded": 9536, + "effective_few_shots": 3.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "6f81fd8346219949", + "hash_cont_tokens": "053f407ba115fbb5" + }, + "truncated": 0, + "non-truncated": 1319, + "padded": 0, + "non-padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "b84988137a00c1f4", + "hash_cont_tokens": "f08975ad6f2d5864" + }, + "truncated": 0, + "non-truncated": 2534, + "padded": 2432, + "non-padded": 102, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "9b4d8993161e637d", + "hash_full_prompts": "08215e527b7e60a5", + "hash_input_tokens": "fb75bd68fb756923", + "hash_cont_tokens": "bfebfbaf274eb23d" + }, + "total_evaluation_time_secondes": "8227.470661401749", + "truncated": 1263, + "non-truncated": 12126, + "padded": 2432, + "non-padded": 10957, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/mediocredev/open-llama-3b-v2-instruct/results_2023-12-16T15-28-20.399841.json b/eval-results/mediocredev/open-llama-3b-v2-instruct/results_2023-12-16T15-28-20.399841.json new file mode 100644 index 0000000000000000000000000000000000000000..50dce12c21d462c3b625dbe97b773e11552900a8 --- /dev/null +++ b/eval-results/mediocredev/open-llama-3b-v2-instruct/results_2023-12-16T15-28-20.399841.json @@ -0,0 +1,1409 @@ +{ + "config_general": { + "lighteval_sha": "0e4607eff593f6f842aeaa0e5fa6760f58b9d1e9", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + 
"max_samples": null, + "job_id": "", + "start_time": 369290.578910306, + "end_time": 371457.294649277, + "total_evaluation_time_secondes": "2166.715738971019", + "model_name": "mediocredev/open-llama-3b-v2-instruct", + "model_sha": "4d50e134af1d9806cbdf6bc90795b44ae689deca", + "model_dtype": "torch.float16", + "model_size": "6.4 GB" + }, + "results": { + "harness|arc:challenge|25": { + "acc": 0.35409556313993173, + "acc_stderr": 0.01397545412275655, + "acc_norm": 0.3848122866894198, + "acc_norm_stderr": 0.014218371065251104 + }, + "harness|hellaswag|10": { + "acc": 0.5142401911969727, + "acc_stderr": 0.0049877573147698445, + "acc_norm": 0.7024497112129058, + "acc_norm_stderr": 0.004562462665505218 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.27, + "acc_stderr": 0.044619604333847415, + "acc_norm": 0.27, + "acc_norm_stderr": 0.044619604333847415 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.45185185185185184, + "acc_stderr": 0.04299268905480864, + "acc_norm": 0.45185185185185184, + "acc_norm_stderr": 0.04299268905480864 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.3815789473684211, + "acc_stderr": 0.03953173377749194, + "acc_norm": 0.3815789473684211, + "acc_norm_stderr": 0.03953173377749194 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.41, + "acc_stderr": 0.049431107042371025, + "acc_norm": 0.41, + "acc_norm_stderr": 0.049431107042371025 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.4377358490566038, + "acc_stderr": 0.03053333843046751, + "acc_norm": 0.4377358490566038, + "acc_norm_stderr": 0.03053333843046751 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.3680555555555556, + "acc_stderr": 0.04032999053960719, + "acc_norm": 0.3680555555555556, + "acc_norm_stderr": 0.04032999053960719 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.34, + "acc_stderr": 0.04760952285695235, + "acc_norm": 0.34, + "acc_norm_stderr": 0.04760952285695235 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.35, + "acc_stderr": 0.0479372485441102, + "acc_norm": 0.35, + "acc_norm_stderr": 0.0479372485441102 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.29, + "acc_stderr": 0.045604802157206845, + "acc_norm": 0.29, + "acc_norm_stderr": 0.045604802157206845 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.43352601156069365, + "acc_stderr": 0.03778621079092055, + "acc_norm": 0.43352601156069365, + "acc_norm_stderr": 0.03778621079092055 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.19607843137254902, + "acc_stderr": 0.03950581861179961, + "acc_norm": 0.19607843137254902, + "acc_norm_stderr": 0.03950581861179961 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.48, + "acc_stderr": 0.050211673156867795, + "acc_norm": 0.48, + "acc_norm_stderr": 0.050211673156867795 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.3659574468085106, + "acc_stderr": 0.0314895582974553, + "acc_norm": 0.3659574468085106, + "acc_norm_stderr": 0.0314895582974553 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.2894736842105263, + "acc_stderr": 0.042663394431593935, + "acc_norm": 0.2894736842105263, + "acc_norm_stderr": 0.042663394431593935 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.3931034482758621, + "acc_stderr": 0.0407032901370707, + "acc_norm": 0.3931034482758621, + "acc_norm_stderr": 0.0407032901370707 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.2698412698412698, + 
"acc_stderr": 0.022860838309232072, + "acc_norm": 0.2698412698412698, + "acc_norm_stderr": 0.022860838309232072 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.30158730158730157, + "acc_stderr": 0.041049472699033945, + "acc_norm": 0.30158730158730157, + "acc_norm_stderr": 0.041049472699033945 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.34, + "acc_stderr": 0.047609522856952365, + "acc_norm": 0.34, + "acc_norm_stderr": 0.047609522856952365 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.4161290322580645, + "acc_stderr": 0.028040981380761547, + "acc_norm": 0.4161290322580645, + "acc_norm_stderr": 0.028040981380761547 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.2660098522167488, + "acc_stderr": 0.03108982600293753, + "acc_norm": 0.2660098522167488, + "acc_norm_stderr": 0.03108982600293753 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.37, + "acc_stderr": 0.04852365870939098, + "acc_norm": 0.37, + "acc_norm_stderr": 0.04852365870939098 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.4484848484848485, + "acc_stderr": 0.038835659779569286, + "acc_norm": 0.4484848484848485, + "acc_norm_stderr": 0.038835659779569286 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.5454545454545454, + "acc_stderr": 0.03547601494006937, + "acc_norm": 0.5454545454545454, + "acc_norm_stderr": 0.03547601494006937 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.48186528497409326, + "acc_stderr": 0.03606065001832919, + "acc_norm": 0.48186528497409326, + "acc_norm_stderr": 0.03606065001832919 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.35128205128205126, + "acc_stderr": 0.024203665177902796, + "acc_norm": 0.35128205128205126, + "acc_norm_stderr": 0.024203665177902796 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.25925925925925924, + "acc_stderr": 0.026719240783712166, + "acc_norm": 0.25925925925925924, + "acc_norm_stderr": 0.026719240783712166 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.35714285714285715, + "acc_stderr": 0.031124619309328177, + "acc_norm": 0.35714285714285715, + "acc_norm_stderr": 0.031124619309328177 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.2980132450331126, + "acc_stderr": 0.037345356767871984, + "acc_norm": 0.2980132450331126, + "acc_norm_stderr": 0.037345356767871984 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.4972477064220184, + "acc_stderr": 0.02143699835976532, + "acc_norm": 0.4972477064220184, + "acc_norm_stderr": 0.02143699835976532 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.3055555555555556, + "acc_stderr": 0.03141554629402544, + "acc_norm": 0.3055555555555556, + "acc_norm_stderr": 0.03141554629402544 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.46078431372549017, + "acc_stderr": 0.03498501649369527, + "acc_norm": 0.46078431372549017, + "acc_norm_stderr": 0.03498501649369527 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.5611814345991561, + "acc_stderr": 0.032302649315470375, + "acc_norm": 0.5611814345991561, + "acc_norm_stderr": 0.032302649315470375 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.4977578475336323, + "acc_stderr": 0.033557465352232634, + "acc_norm": 0.4977578475336323, + "acc_norm_stderr": 0.033557465352232634 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 
0.46564885496183206, + "acc_stderr": 0.04374928560599738, + "acc_norm": 0.46564885496183206, + "acc_norm_stderr": 0.04374928560599738 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.47107438016528924, + "acc_stderr": 0.04556710331269498, + "acc_norm": 0.47107438016528924, + "acc_norm_stderr": 0.04556710331269498 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.4166666666666667, + "acc_stderr": 0.04766075165356461, + "acc_norm": 0.4166666666666667, + "acc_norm_stderr": 0.04766075165356461 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.4110429447852761, + "acc_stderr": 0.038656978537853624, + "acc_norm": 0.4110429447852761, + "acc_norm_stderr": 0.038656978537853624 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.3125, + "acc_stderr": 0.043994650575715215, + "acc_norm": 0.3125, + "acc_norm_stderr": 0.043994650575715215 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.5533980582524272, + "acc_stderr": 0.04922424153458933, + "acc_norm": 0.5533980582524272, + "acc_norm_stderr": 0.04922424153458933 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.5341880341880342, + "acc_stderr": 0.03267942734081228, + "acc_norm": 0.5341880341880342, + "acc_norm_stderr": 0.03267942734081228 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.4, + "acc_stderr": 0.049236596391733084, + "acc_norm": 0.4, + "acc_norm_stderr": 0.049236596391733084 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.5261813537675607, + "acc_stderr": 0.01785543455404199, + "acc_norm": 0.5261813537675607, + "acc_norm_stderr": 0.01785543455404199 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.4277456647398844, + "acc_stderr": 0.02663653974111608, + "acc_norm": 0.4277456647398844, + "acc_norm_stderr": 0.02663653974111608 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.25139664804469275, + "acc_stderr": 0.014508979453553984, + "acc_norm": 0.25139664804469275, + "acc_norm_stderr": 0.014508979453553984 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.42810457516339867, + "acc_stderr": 0.028332397483664274, + "acc_norm": 0.42810457516339867, + "acc_norm_stderr": 0.028332397483664274 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.40192926045016075, + "acc_stderr": 0.027846476005930473, + "acc_norm": 0.40192926045016075, + "acc_norm_stderr": 0.027846476005930473 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.41358024691358025, + "acc_stderr": 0.027402042040269952, + "acc_norm": 0.41358024691358025, + "acc_norm_stderr": 0.027402042040269952 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.3049645390070922, + "acc_stderr": 0.027464708442022128, + "acc_norm": 0.3049645390070922, + "acc_norm_stderr": 0.027464708442022128 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.30964797913950454, + "acc_stderr": 0.01180859826250332, + "acc_norm": 0.30964797913950454, + "acc_norm_stderr": 0.01180859826250332 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.3786764705882353, + "acc_stderr": 0.029465133639776132, + "acc_norm": 0.3786764705882353, + "acc_norm_stderr": 0.029465133639776132 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.36437908496732024, + "acc_stderr": 0.019469518221573702, + "acc_norm": 0.36437908496732024, + "acc_norm_stderr": 0.019469518221573702 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.5, + "acc_stderr": 0.04789131426105757, + "acc_norm": 0.5, + "acc_norm_stderr": 
0.04789131426105757 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.32653061224489793, + "acc_stderr": 0.030021056238440286, + "acc_norm": 0.32653061224489793, + "acc_norm_stderr": 0.030021056238440286 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.4527363184079602, + "acc_stderr": 0.035197027175769155, + "acc_norm": 0.4527363184079602, + "acc_norm_stderr": 0.035197027175769155 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.6, + "acc_stderr": 0.049236596391733084, + "acc_norm": 0.6, + "acc_norm_stderr": 0.049236596391733084 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.39156626506024095, + "acc_stderr": 0.03799857454479636, + "acc_norm": 0.39156626506024095, + "acc_norm_stderr": 0.03799857454479636 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.49707602339181284, + "acc_stderr": 0.03834759370936839, + "acc_norm": 0.49707602339181284, + "acc_norm_stderr": 0.03834759370936839 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.2252141982864137, + "mc1_stderr": 0.014623240768023498, + "mc2": 0.3795634078796446, + "mc2_stderr": 0.014273839655133331 + }, + "harness|winogrande|5": { + "acc": 0.6574585635359116, + "acc_stderr": 0.013337483579075923 + }, + "harness|gsm8k|5": { + "acc": 0.0, + "acc_stderr": 0.0 + }, + "all": { + "acc": 0.3958981300034306, + "acc_stderr": 0.034198998112262805, + "acc_norm": 0.4018856544108267, + "acc_norm_stderr": 0.035129135992579406, + "mc1": 0.2252141982864137, + "mc1_stderr": 0.014623240768023498, + "mc2": 0.3795634078796446, + "mc2_stderr": 0.014273839655133331 + } + }, + "versions": { + "all": 0, + "harness|arc:challenge|25": 0, + "harness|gsm8k|5": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + 
"harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "harness|winogrande|5": 0 + }, + "config_tasks": { + "harness|arc:challenge": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness 
task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|arc:challenge|25": { + "hashes": { + "hash_examples": "17b0cae357c0259e", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "59c328d432da064f", + "hash_cont_tokens": "2e8835aa03b9c2cf" + }, + "truncated": 0, + "non_truncated": 1172, + "padded": 4676, + "non_padded": 11, + "effective_few_shots": 25.0, + "num_truncated_few_shots": 0 + }, + "harness|hellaswag|10": { + "hashes": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "9eaa83dae54ba52a", + "hash_cont_tokens": "18a48de3edcef462" + }, + "truncated": 0, + "non_truncated": 10042, + "padded": 39987, + "non_padded": 181, + "effective_few_shots": 10.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hashes": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "4129e579fbf0ebc2", + "hash_cont_tokens": "ce26aac83e938006" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-anatomy|5": { + "hashes": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "85c455354ae2ebd0", + "hash_cont_tokens": "1d81fa80e3039a08" + }, + "truncated": 0, + "non_truncated": 135, + "padded": 540, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-astronomy|5": { + "hashes": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "221506ab8405000a", + "hash_cont_tokens": "247dc44c6b578728" + }, + "truncated": 0, + "non_truncated": 152, + "padded": 608, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-business_ethics|5": { + "hashes": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "16c21dd1ddd4ee38", + "hash_cont_tokens": "ce26aac83e938006" + }, + 
"truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hashes": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "24b21e9d78658e4d", + "hash_cont_tokens": "26e3b69d5fb27bb2" + }, + "truncated": 0, + "non_truncated": 265, + "padded": 1060, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_biology|5": { + "hashes": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "770d74c6a8c9c0b7", + "hash_cont_tokens": "bbda31842f3930d5" + }, + "truncated": 0, + "non_truncated": 144, + "padded": 568, + "non_padded": 8, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_chemistry|5": { + "hashes": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "7dea1631558d65ac", + "hash_cont_tokens": "ce26aac83e938006" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_computer_science|5": { + "hashes": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "22600976f0f9ffc6", + "hash_cont_tokens": "ce26aac83e938006" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_mathematics|5": { + "hashes": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "564ae334c5a56510", + "hash_cont_tokens": "ce26aac83e938006" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_medicine|5": { + "hashes": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "bce86eecdc3bb76a", + "hash_cont_tokens": "894854ed7bec57f7" + }, + "truncated": 0, + "non_truncated": 173, + "padded": 688, + "non_padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_physics|5": { + "hashes": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "1188d9d525ab28e7", + "hash_cont_tokens": "13130ec6de384bbb" + }, + "truncated": 0, + "non_truncated": 102, + "padded": 408, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-computer_security|5": { + "hashes": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "692856445804bec5", + "hash_cont_tokens": "ce26aac83e938006" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hashes": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "5ade2ffc8b9f5d4a", + "hash_cont_tokens": "29089b8b7020611e" + }, + "truncated": 0, + "non_truncated": 235, + "padded": 940, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + 
"harness|hendrycksTest-econometrics|5": { + "hashes": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "9b766b5e103ce426", + "hash_cont_tokens": "efc596dfa1a1f073" + }, + "truncated": 0, + "non_truncated": 114, + "padded": 456, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hashes": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "dd9935cf301e82f9", + "hash_cont_tokens": "70817a7ac9f44af2" + }, + "truncated": 0, + "non_truncated": 145, + "padded": 560, + "non_padded": 20, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hashes": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "78c8ba2ecf6e0dc2", + "hash_cont_tokens": "937cd53d06cc6e16" + }, + "truncated": 0, + "non_truncated": 378, + "padded": 1512, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-formal_logic|5": { + "hashes": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "661893e4f7f37eba", + "hash_cont_tokens": "eec972abe0fc0f5a" + }, + "truncated": 0, + "non_truncated": 126, + "padded": 504, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-global_facts|5": { + "hashes": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "4a8d10395fdc21f0", + "hash_cont_tokens": "ce26aac83e938006" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_biology|5": { + "hashes": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "816c7d936dbe01da", + "hash_cont_tokens": "94971ccfe8e59c25" + }, + "truncated": 0, + "non_truncated": 310, + "padded": 1240, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hashes": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "769ab5386fedf26e", + "hash_cont_tokens": "a78e38b59778a04c" + }, + "truncated": 0, + "non_truncated": 203, + "padded": 812, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hashes": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "5b6bcda94f3ca2df", + "hash_cont_tokens": "ce26aac83e938006" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hashes": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "142b719c7d7d4fe0", + "hash_cont_tokens": "91dc522e4e4e91c3" + }, + "truncated": 660, + "non_truncated": -495, + "padded": 0, + "non_padded": 660, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_geography|5": { + "hashes": { + "hash_examples": "b60019b9e80b642f", + 
"hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "281dcc445ad0af4a", + "hash_cont_tokens": "f275c901b3d285f9" + }, + "truncated": 0, + "non_truncated": 198, + "padded": 792, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hashes": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "bb8f5852975ec963", + "hash_cont_tokens": "85eb58f423437cce" + }, + "truncated": 0, + "non_truncated": 193, + "padded": 770, + "non_padded": 2, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hashes": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "e769357a349b7644", + "hash_cont_tokens": "39a93706184f896b" + }, + "truncated": 0, + "non_truncated": 390, + "padded": 1560, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hashes": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "4ab345e3c0507320", + "hash_cont_tokens": "d41065d20b689af3" + }, + "truncated": 0, + "non_truncated": 270, + "padded": 1080, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hashes": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "52ec665069da063e", + "hash_cont_tokens": "28c1f7c11bf85409" + }, + "truncated": 0, + "non_truncated": 238, + "padded": 952, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_physics|5": { + "hashes": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "f23b89453c7c6050", + "hash_cont_tokens": "78c510e6c5d316ac" + }, + "truncated": 0, + "non_truncated": 151, + "padded": 604, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hashes": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "bb0f46fa5669c46e", + "hash_cont_tokens": "0ba4ecffc67603c5" + }, + "truncated": 0, + "non_truncated": 545, + "padded": 2180, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hashes": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "db3276d6935c41ac", + "hash_cont_tokens": "4a0339e9ad3efa6d" + }, + "truncated": 0, + "non_truncated": 216, + "padded": 864, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hashes": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "a54112084a848a44", + "hash_cont_tokens": "2529d55ec490f81f" + }, + "truncated": 816, + "non_truncated": -612, + "padded": 0, + "non_padded": 816, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hashes": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": 
"89cf33fb840f27be", + "hash_cont_tokens": "21808b54f5df97b2" + }, + "truncated": 0, + "non_truncated": 237, + "padded": 948, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_aging|5": { + "hashes": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "ecf9f32ac289d1be", + "hash_cont_tokens": "92acdd467ed943e1" + }, + "truncated": 0, + "non_truncated": 223, + "padded": 892, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_sexuality|5": { + "hashes": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "ebf05f3ed8d69562", + "hash_cont_tokens": "a6034ed95a124315" + }, + "truncated": 0, + "non_truncated": 131, + "padded": 524, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-international_law|5": { + "hashes": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "b0d9e6f90b58599e", + "hash_cont_tokens": "223fbf3fd106c04b" + }, + "truncated": 0, + "non_truncated": 121, + "padded": 484, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-jurisprudence|5": { + "hashes": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "ddb8c4eaa3d71594", + "hash_cont_tokens": "7c8e30f486ff156a" + }, + "truncated": 0, + "non_truncated": 108, + "padded": 428, + "non_padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hashes": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "a04883884a711ebf", + "hash_cont_tokens": "b4cc4a8d31bbaa03" + }, + "truncated": 0, + "non_truncated": 163, + "padded": 636, + "non_padded": 16, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-machine_learning|5": { + "hashes": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "d5511967956880ea", + "hash_cont_tokens": "7f0e1289ec188e82" + }, + "truncated": 0, + "non_truncated": 112, + "padded": 448, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-management|5": { + "hashes": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "8c35c18f5a96b3b3", + "hash_cont_tokens": "66b726b356a02feb" + }, + "truncated": 0, + "non_truncated": 103, + "padded": 412, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-marketing|5": { + "hashes": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "a80e346390d1f88c", + "hash_cont_tokens": "f08457005b652d25" + }, + "truncated": 0, + "non_truncated": 234, + "padded": 932, + "non_padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-medical_genetics|5": { + "hashes": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "5caf5eb895cd3ccd", + "hash_cont_tokens": "ce26aac83e938006" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + 
"num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-miscellaneous|5": { + "hashes": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "795c466e9f87e4c1", + "hash_cont_tokens": "647bcbd68f292558" + }, + "truncated": 0, + "non_truncated": 783, + "padded": 3132, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_disputes|5": { + "hashes": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "505a224f2325b0ec", + "hash_cont_tokens": "6849b7fe56c50dda" + }, + "truncated": 0, + "non_truncated": 346, + "padded": 1368, + "non_padded": 16, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hashes": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "3f767d07e9ec8662", + "hash_cont_tokens": "81585ec455b1e3e5" + }, + "truncated": 0, + "non_truncated": 895, + "padded": 3580, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-nutrition|5": { + "hashes": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "0bc8cefb3f763640", + "hash_cont_tokens": "471b68eb20e5d34b" + }, + "truncated": 0, + "non_truncated": 306, + "padded": 1224, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-philosophy|5": { + "hashes": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "36e85ac3fd3f3c64", + "hash_cont_tokens": "6e39384b9c0a8cc2" + }, + "truncated": 0, + "non_truncated": 311, + "padded": 1244, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-prehistory|5": { + "hashes": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "1b04a90b19ce0623", + "hash_cont_tokens": "bfe513578190093f" + }, + "truncated": 0, + "non_truncated": 324, + "padded": 1296, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_accounting|5": { + "hashes": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "8db39e7efe9edb93", + "hash_cont_tokens": "9ce431b67350b312" + }, + "truncated": 0, + "non_truncated": 282, + "padded": 1128, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_law|5": { + "hashes": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "f638aace411a0bd9", + "hash_cont_tokens": "0ff990d9cc38024d" + }, + "truncated": 168, + "non_truncated": 1366, + "padded": 5968, + "non_padded": 168, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_medicine|5": { + "hashes": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "c0f160879d378d4d", + "hash_cont_tokens": "bc3c70e15bc7dce0" + }, + "truncated": 0, + "non_truncated": 272, + "padded": 1088, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_psychology|5": { + "hashes": { + "hash_examples": "d45b73b22f9cc039", + 
"hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "a66dcd2d6795f6ec", + "hash_cont_tokens": "58464ea26d81f908" + }, + "truncated": 0, + "non_truncated": 612, + "padded": 2448, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-public_relations|5": { + "hashes": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "5263b25641f9702c", + "hash_cont_tokens": "eaf6a5d3ddd39a12" + }, + "truncated": 0, + "non_truncated": 110, + "padded": 440, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-security_studies|5": { + "hashes": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "0350ab02a3d50c5f", + "hash_cont_tokens": "618fd4f954253134" + }, + "truncated": 0, + "non_truncated": 245, + "padded": 980, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-sociology|5": { + "hashes": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "2c8688ec4c1a1673", + "hash_cont_tokens": "b4962d9e583b12c0" + }, + "truncated": 0, + "non_truncated": 201, + "padded": 800, + "non_padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hashes": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "c24ed5c990a2b92c", + "hash_cont_tokens": "ce26aac83e938006" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-virology|5": { + "hashes": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "59ca81fd3abf68b3", + "hash_cont_tokens": "397a75462a9735e3" + }, + "truncated": 0, + "non_truncated": 166, + "padded": 664, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-world_religions|5": { + "hashes": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "4cebe9a8da92320d", + "hash_cont_tokens": "de629d1414e01de8" + }, + "truncated": 0, + "non_truncated": 171, + "padded": 684, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|truthfulqa:mc|0": { + "hashes": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "3e6036a8ea87ff4f", + "hash_cont_tokens": "df48bc66e06781f2" + }, + "truncated": 0, + "non_truncated": 817, + "padded": 9996, + "non_padded": 0, + "effective_few_shots": 0.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "0591af93c06ece74", + "hash_cont_tokens": "828897df1f4f08a1" + }, + "truncated": 0, + "non_truncated": 1267, + "padded": 2534, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "bf7d8c6b5e4f7948", + "hash_cont_tokens": "ebd28527a3bdf34a" + }, + "truncated": 0, + "non_truncated": 1319, + "padded": 0, + "non_padded": 1319, + "effective_few_shots": 
5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "3b7fa57a057f9415", + "hash_full_prompts": "63615fc50fc9417c", + "hash_input_tokens": "8864448e1d4b68e8", + "hash_cont_tokens": "8ee9b9af48a45616" + }, + "truncated": 1644, + "non_truncated": 27015, + "padded": 111639, + "non_padded": 3233, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/meta-math/MetaMath-13B-V1.0/results_2023-10-03T19-47-07.095350.json b/eval-results/meta-math/MetaMath-13B-V1.0/results_2023-10-03T19-47-07.095350.json new file mode 100644 index 0000000000000000000000000000000000000000..136a9cb92662b337cd75e1e8a4caa10dc0c5de9a --- /dev/null +++ b/eval-results/meta-math/MetaMath-13B-V1.0/results_2023-10-03T19-47-07.095350.json @@ -0,0 +1,1367 @@ +{ + "config_general": { + "model_name": "meta-math/MetaMath-13B-V1.0", + "model_sha": "0b448f6f64808f8bca94dc871e96a3eae7e95621", + "model_size": "24.32 GB", + "model_dtype": "torch.float16", + "lighteval_sha": "0f318ecf002208468154899217b3ba7c6ae09374", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|arc:challenge|25": { + "acc": 0.4667235494880546, + "acc_stderr": 0.014578995859605811, + "acc_norm": 0.4948805460750853, + "acc_norm_stderr": 0.014610624890309157 + }, + "harness|hellaswag|10": { + "acc": 0.5875323640709023, + "acc_stderr": 0.004912723848944791, + "acc_norm": 0.7647878908583947, + "acc_norm_stderr": 0.004232645108976139 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.27, + "acc_stderr": 0.044619604333847415, + "acc_norm": 0.27, + "acc_norm_stderr": 0.044619604333847415 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.45185185185185184, + "acc_stderr": 0.04299268905480864, + "acc_norm": 0.45185185185185184, + "acc_norm_stderr": 0.04299268905480864 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.4605263157894737, + "acc_stderr": 0.04056242252249034, + "acc_norm": 0.4605263157894737, + "acc_norm_stderr": 0.04056242252249034 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.53, + "acc_stderr": 0.05016135580465919, + "acc_norm": 0.53, + "acc_norm_stderr": 0.05016135580465919 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.5245283018867924, + "acc_stderr": 0.030735822206205608, + "acc_norm": 0.5245283018867924, + "acc_norm_stderr": 0.030735822206205608 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.5277777777777778, + "acc_stderr": 0.04174752578923185, + "acc_norm": 0.5277777777777778, + "acc_norm_stderr": 0.04174752578923185 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.31, + "acc_stderr": 0.04648231987117316, + "acc_norm": 0.31, + "acc_norm_stderr": 0.04648231987117316 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.28, + "acc_stderr": 0.04512608598542127, + "acc_norm": 0.28, + "acc_norm_stderr": 0.04512608598542127 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.27, + "acc_stderr": 0.044619604333847394, + "acc_norm": 0.27, + "acc_norm_stderr": 0.044619604333847394 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.3930635838150289, + "acc_stderr": 0.0372424959581773, + "acc_norm": 0.3930635838150289, + "acc_norm_stderr": 0.0372424959581773 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.20588235294117646, + "acc_stderr": 0.04023382273617748, + "acc_norm": 0.20588235294117646, + "acc_norm_stderr": 
0.04023382273617748 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.61, + "acc_stderr": 0.04902071300001974, + "acc_norm": 0.61, + "acc_norm_stderr": 0.04902071300001974 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.4, + "acc_stderr": 0.03202563076101735, + "acc_norm": 0.4, + "acc_norm_stderr": 0.03202563076101735 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.3508771929824561, + "acc_stderr": 0.044895393502707, + "acc_norm": 0.3508771929824561, + "acc_norm_stderr": 0.044895393502707 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.4206896551724138, + "acc_stderr": 0.0411391498118926, + "acc_norm": 0.4206896551724138, + "acc_norm_stderr": 0.0411391498118926 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.3492063492063492, + "acc_stderr": 0.02455229220934265, + "acc_norm": 0.3492063492063492, + "acc_norm_stderr": 0.02455229220934265 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.2619047619047619, + "acc_stderr": 0.0393253768039287, + "acc_norm": 0.2619047619047619, + "acc_norm_stderr": 0.0393253768039287 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.34, + "acc_stderr": 0.04760952285695236, + "acc_norm": 0.34, + "acc_norm_stderr": 0.04760952285695236 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.5548387096774193, + "acc_stderr": 0.028272410186214906, + "acc_norm": 0.5548387096774193, + "acc_norm_stderr": 0.028272410186214906 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.3842364532019704, + "acc_stderr": 0.03422398565657551, + "acc_norm": 0.3842364532019704, + "acc_norm_stderr": 0.03422398565657551 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.44, + "acc_stderr": 0.04988876515698589, + "acc_norm": 0.44, + "acc_norm_stderr": 0.04988876515698589 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.6060606060606061, + "acc_stderr": 0.0381549430868893, + "acc_norm": 0.6060606060606061, + "acc_norm_stderr": 0.0381549430868893 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.6717171717171717, + "acc_stderr": 0.03345678422756776, + "acc_norm": 0.6717171717171717, + "acc_norm_stderr": 0.03345678422756776 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.7150259067357513, + "acc_stderr": 0.032577140777096614, + "acc_norm": 0.7150259067357513, + "acc_norm_stderr": 0.032577140777096614 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.44358974358974357, + "acc_stderr": 0.0251891498947642, + "acc_norm": 0.44358974358974357, + "acc_norm_stderr": 0.0251891498947642 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.24074074074074073, + "acc_stderr": 0.026067159222275798, + "acc_norm": 0.24074074074074073, + "acc_norm_stderr": 0.026067159222275798 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.47478991596638653, + "acc_stderr": 0.0324371805513741, + "acc_norm": 0.47478991596638653, + "acc_norm_stderr": 0.0324371805513741 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.31125827814569534, + "acc_stderr": 0.03780445850526733, + "acc_norm": 0.31125827814569534, + "acc_norm_stderr": 0.03780445850526733 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.671559633027523, + "acc_stderr": 0.02013590279729841, + "acc_norm": 0.671559633027523, + "acc_norm_stderr": 0.02013590279729841 + }, + "harness|hendrycksTest-high_school_statistics|5": { + 
"acc": 0.35185185185185186, + "acc_stderr": 0.03256850570293647, + "acc_norm": 0.35185185185185186, + "acc_norm_stderr": 0.03256850570293647 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.6862745098039216, + "acc_stderr": 0.032566854844603886, + "acc_norm": 0.6862745098039216, + "acc_norm_stderr": 0.032566854844603886 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.6160337552742616, + "acc_stderr": 0.031658678064106674, + "acc_norm": 0.6160337552742616, + "acc_norm_stderr": 0.031658678064106674 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.5695067264573991, + "acc_stderr": 0.0332319730294294, + "acc_norm": 0.5695067264573991, + "acc_norm_stderr": 0.0332319730294294 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.5038167938931297, + "acc_stderr": 0.04385162325601553, + "acc_norm": 0.5038167938931297, + "acc_norm_stderr": 0.04385162325601553 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.6198347107438017, + "acc_stderr": 0.04431324501968431, + "acc_norm": 0.6198347107438017, + "acc_norm_stderr": 0.04431324501968431 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.6018518518518519, + "acc_stderr": 0.04732332615978814, + "acc_norm": 0.6018518518518519, + "acc_norm_stderr": 0.04732332615978814 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.5153374233128835, + "acc_stderr": 0.039265223787088424, + "acc_norm": 0.5153374233128835, + "acc_norm_stderr": 0.039265223787088424 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.3125, + "acc_stderr": 0.043994650575715215, + "acc_norm": 0.3125, + "acc_norm_stderr": 0.043994650575715215 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.6601941747572816, + "acc_stderr": 0.04689765937278135, + "acc_norm": 0.6601941747572816, + "acc_norm_stderr": 0.04689765937278135 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.7264957264957265, + "acc_stderr": 0.029202540153431166, + "acc_norm": 0.7264957264957265, + "acc_norm_stderr": 0.029202540153431166 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.48, + "acc_stderr": 0.050211673156867795, + "acc_norm": 0.48, + "acc_norm_stderr": 0.050211673156867795 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.6845466155810983, + "acc_stderr": 0.01661750173876338, + "acc_norm": 0.6845466155810983, + "acc_norm_stderr": 0.01661750173876338 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.5491329479768786, + "acc_stderr": 0.02678881193156275, + "acc_norm": 0.5491329479768786, + "acc_norm_stderr": 0.02678881193156275 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.30726256983240224, + "acc_stderr": 0.015430158846469621, + "acc_norm": 0.30726256983240224, + "acc_norm_stderr": 0.015430158846469621 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.4738562091503268, + "acc_stderr": 0.028590752958852394, + "acc_norm": 0.4738562091503268, + "acc_norm_stderr": 0.028590752958852394 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.5466237942122186, + "acc_stderr": 0.028274359854894248, + "acc_norm": 0.5466237942122186, + "acc_norm_stderr": 0.028274359854894248 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.5432098765432098, + "acc_stderr": 0.027716661650194038, + "acc_norm": 0.5432098765432098, + "acc_norm_stderr": 0.027716661650194038 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.3049645390070922, + "acc_stderr": 0.027464708442022128, + "acc_norm": 0.3049645390070922, + 
"acc_norm_stderr": 0.027464708442022128 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.333116036505867, + "acc_stderr": 0.012037930451512054, + "acc_norm": 0.333116036505867, + "acc_norm_stderr": 0.012037930451512054 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.36764705882352944, + "acc_stderr": 0.029289413409403192, + "acc_norm": 0.36764705882352944, + "acc_norm_stderr": 0.029289413409403192 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.434640522875817, + "acc_stderr": 0.02005426920072646, + "acc_norm": 0.434640522875817, + "acc_norm_stderr": 0.02005426920072646 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.6, + "acc_stderr": 0.0469237132203465, + "acc_norm": 0.6, + "acc_norm_stderr": 0.0469237132203465 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.46530612244897956, + "acc_stderr": 0.03193207024425314, + "acc_norm": 0.46530612244897956, + "acc_norm_stderr": 0.03193207024425314 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.6417910447761194, + "acc_stderr": 0.03390393042268813, + "acc_norm": 0.6417910447761194, + "acc_norm_stderr": 0.03390393042268813 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.73, + "acc_stderr": 0.044619604333847394, + "acc_norm": 0.73, + "acc_norm_stderr": 0.044619604333847394 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.43373493975903615, + "acc_stderr": 0.03858158940685517, + "acc_norm": 0.43373493975903615, + "acc_norm_stderr": 0.03858158940685517 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.6842105263157895, + "acc_stderr": 0.03565079670708311, + "acc_norm": 0.6842105263157895, + "acc_norm_stderr": 0.03565079670708311 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.2827417380660955, + "mc1_stderr": 0.01576477083677731, + "mc2": 0.41575339609808976, + "mc2_stderr": 0.01560446973515796 + }, + "all": { + "acc": 0.47912070413560215, + "acc_stderr": 0.03491107886872343, + "acc_norm": 0.48260227232839065, + "acc_norm_stderr": 0.03490008819568503, + "mc1": 0.2827417380660955, + "mc1_stderr": 0.01576477083677731, + "mc2": 0.41575339609808976, + "mc2_stderr": 0.01560446973515796 + } + }, + "versions": { + "harness|arc:challenge|25": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + 
"harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "all": 0 + }, + "config_tasks": { + "harness|arc:challenge": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM 
Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task" + }, + "summary_tasks": { + "harness|arc:challenge|25": { + "hashes": { + "hash_examples": "17b0cae357c0259e", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "3722289b79076c44", + "hash_cont_tokens": "e8abf848493b50f7" + }, + "truncated": 0, + "non-truncated": 4687, + "padded": 4687, + "non-padded": 0, + "effective_few_shots": 25.0, + "num_truncated_few_shots": 0 + }, + "harness|hellaswag|10": { + "hashes": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "ececd684171f1ef2", + "hash_cont_tokens": "9fe0a5c42e1532db" + }, + "truncated": 0, + "non-truncated": 40168, + "padded": 40113, + "non-padded": 55, + "effective_few_shots": 10.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hashes": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "c54ff61ad0273dd7", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-anatomy|5": { + "hashes": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "be31a1e22aef5f90", + "hash_cont_tokens": "f11971a765cb609f" + }, + "truncated": 0, + "non-truncated": 540, + "padded": 540, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-astronomy|5": { + "hashes": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": 
"faf4e80f65de93ca", + "hash_input_tokens": "277a7b1fad566940", + "hash_cont_tokens": "440a970fadecdc7b" + }, + "truncated": 0, + "non-truncated": 608, + "padded": 608, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-business_ethics|5": { + "hashes": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "ba552605bc116de5", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hashes": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "428c7563d0b98ab9", + "hash_cont_tokens": "7ecd60c25b9bfe5b" + }, + "truncated": 0, + "non-truncated": 1060, + "padded": 1060, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_biology|5": { + "hashes": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "da036601573942e2", + "hash_cont_tokens": "875cde3af7a0ee14" + }, + "truncated": 0, + "non-truncated": 576, + "padded": 576, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_chemistry|5": { + "hashes": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "94e0196d6aded13d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_computer_science|5": { + "hashes": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "6e4d0f4a8d36690b", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_mathematics|5": { + "hashes": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "614054d17109a25d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_medicine|5": { + "hashes": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "081bb2b524defd1c", + "hash_cont_tokens": "702fb6d82ff0d6ac" + }, + "truncated": 0, + "non-truncated": 692, + "padded": 692, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_physics|5": { + "hashes": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "5421d9a1af86cbd4", + "hash_cont_tokens": "f7b8097afc16a47c" + }, + "truncated": 0, + "non-truncated": 408, + "padded": 408, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-computer_security|5": { + "hashes": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "5e6b70ecb333cf18", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 
400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hashes": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "c2ef11a87264ceed", + "hash_cont_tokens": "aa0e8bc655f2f641" + }, + "truncated": 0, + "non-truncated": 940, + "padded": 940, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-econometrics|5": { + "hashes": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "ecaccd912a4c3978", + "hash_cont_tokens": "b1cc6e7e9fcd3827" + }, + "truncated": 0, + "non-truncated": 456, + "padded": 456, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hashes": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "1590c84291399be8", + "hash_cont_tokens": "2425a3f084a591ef" + }, + "truncated": 0, + "non-truncated": 580, + "padded": 580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hashes": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "3269597f715b0da1", + "hash_cont_tokens": "bd87bf0c060fd925" + }, + "truncated": 0, + "non-truncated": 1512, + "padded": 1512, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-formal_logic|5": { + "hashes": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "a2800d20f3ab8d7c", + "hash_cont_tokens": "eb8932890e0605db" + }, + "truncated": 0, + "non-truncated": 504, + "padded": 504, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-global_facts|5": { + "hashes": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "94ed44b3772505ad", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_biology|5": { + "hashes": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "24423acb928db768", + "hash_cont_tokens": "1ddcb86d28cde266" + }, + "truncated": 0, + "non-truncated": 1240, + "padded": 1240, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hashes": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "831ff35c474e5cef", + "hash_cont_tokens": "176c8dcff38c5f8f" + }, + "truncated": 0, + "non-truncated": 812, + "padded": 812, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hashes": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "a20a96b44dcc5b30", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + 
"harness|hendrycksTest-high_school_european_history|5": { + "hashes": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "5002f4ac8b1562ca", + "hash_cont_tokens": "674fc454bdc5ac93" + }, + "truncated": 0, + "non-truncated": 660, + "padded": 656, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_geography|5": { + "hashes": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "7c5547c7da5bc793", + "hash_cont_tokens": "03a5012b916274ea" + }, + "truncated": 0, + "non-truncated": 792, + "padded": 792, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hashes": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "f62991cb6a496b05", + "hash_cont_tokens": "873d2aab226ba1d8" + }, + "truncated": 0, + "non-truncated": 772, + "padded": 772, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hashes": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "4cef2aff6e3d59ed", + "hash_cont_tokens": "c583432ad27fcfe0" + }, + "truncated": 0, + "non-truncated": 1560, + "padded": 1560, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hashes": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "6e2577ea4082ed2b", + "hash_cont_tokens": "d7907b61bcb8c123" + }, + "truncated": 0, + "non-truncated": 1080, + "padded": 1080, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hashes": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "c5fc9aeb1079c8e4", + "hash_cont_tokens": "f47f041de50333b9" + }, + "truncated": 0, + "non-truncated": 952, + "padded": 952, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_physics|5": { + "hashes": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "555fc385cffa84ca", + "hash_cont_tokens": "0d56317b3e5eedb5" + }, + "truncated": 0, + "non-truncated": 604, + "padded": 604, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hashes": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "febd23cbf9973b7f", + "hash_cont_tokens": "09ba1243e7390c0f" + }, + "truncated": 0, + "non-truncated": 2180, + "padded": 2180, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hashes": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "400e55b56ee6fbd7", + "hash_cont_tokens": "9cc29889c3d3f77d" + }, + "truncated": 0, + "non-truncated": 864, + "padded": 864, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hashes": { + 
"hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "c639cce12a46ebad", + "hash_cont_tokens": "cdd0b3dc06d933e5" + }, + "truncated": 0, + "non-truncated": 816, + "padded": 816, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hashes": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "b9762065cce6f3a6", + "hash_cont_tokens": "e02816433ff28daf" + }, + "truncated": 0, + "non-truncated": 948, + "padded": 948, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_aging|5": { + "hashes": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "541a75f071dcf579", + "hash_cont_tokens": "142a4a8a1138a214" + }, + "truncated": 0, + "non-truncated": 892, + "padded": 892, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_sexuality|5": { + "hashes": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "04269e5c5a257dd9", + "hash_cont_tokens": "bc54813e809b796d" + }, + "truncated": 0, + "non-truncated": 524, + "padded": 524, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-international_law|5": { + "hashes": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "d93ba9d9d38e4397", + "hash_cont_tokens": "8ea8c5ff76a15bca" + }, + "truncated": 0, + "non-truncated": 484, + "padded": 484, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-jurisprudence|5": { + "hashes": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "9eeaccd2698b4f5a", + "hash_cont_tokens": "e3a8cd951b6e3469" + }, + "truncated": 0, + "non-truncated": 432, + "padded": 432, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hashes": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "b4f08f544f2b7576", + "hash_cont_tokens": "3e9e0bdc248fd88a" + }, + "truncated": 0, + "non-truncated": 652, + "padded": 648, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-machine_learning|5": { + "hashes": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "900c2a51f1174b9f", + "hash_cont_tokens": "55b12fb138c6a064" + }, + "truncated": 0, + "non-truncated": 448, + "padded": 448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-management|5": { + "hashes": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "6b36efb4689c6eca", + "hash_cont_tokens": "a01d6d39a83c4597" + }, + "truncated": 0, + "non-truncated": 412, + "padded": 412, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-marketing|5": { + "hashes": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "2aaac78a0cfed47a", + "hash_cont_tokens": "6aeaed4d823c98aa" + }, + 
"truncated": 0, + "non-truncated": 936, + "padded": 936, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-medical_genetics|5": { + "hashes": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "886ca823b41c094a", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-miscellaneous|5": { + "hashes": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "72fd71de7675e7d0", + "hash_cont_tokens": "9b0ab02a64603081" + }, + "truncated": 0, + "non-truncated": 3132, + "padded": 3132, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_disputes|5": { + "hashes": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "f3ca0dd8e7a1eb09", + "hash_cont_tokens": "3b8bbe9108e55ce9" + }, + "truncated": 0, + "non-truncated": 1384, + "padded": 1354, + "non-padded": 30, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hashes": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "3e793631e951f23c", + "hash_cont_tokens": "3e9bfc0362e97330" + }, + "truncated": 0, + "non-truncated": 3580, + "padded": 3580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-nutrition|5": { + "hashes": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "59753c2144ea93af", + "hash_cont_tokens": "23b2dc6ee2da4cfc" + }, + "truncated": 0, + "non-truncated": 1224, + "padded": 1224, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-philosophy|5": { + "hashes": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "bd8d3dbed15a8c34", + "hash_cont_tokens": "9f6ff69d23a48783" + }, + "truncated": 0, + "non-truncated": 1244, + "padded": 1244, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-prehistory|5": { + "hashes": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "3573cd87facbb7c5", + "hash_cont_tokens": "d6458d743d875837" + }, + "truncated": 0, + "non-truncated": 1296, + "padded": 1296, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_accounting|5": { + "hashes": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "17e721bc1a7cbb47", + "hash_cont_tokens": "922a195f53a35662" + }, + "truncated": 0, + "non-truncated": 1128, + "padded": 1128, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_law|5": { + "hashes": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "c9f7583fff66d361", + "hash_cont_tokens": "2e590029ef41fbcd" + }, + "truncated": 0, + "non-truncated": 6136, + "padded": 6136, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + 
"harness|hendrycksTest-professional_medicine|5": { + "hashes": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "40a933f829116f8d", + "hash_cont_tokens": "7cfee54dbddd5a98" + }, + "truncated": 0, + "non-truncated": 1088, + "padded": 1088, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_psychology|5": { + "hashes": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "0dfb73a8eb3f692c", + "hash_cont_tokens": "a86677b2a45c20e1" + }, + "truncated": 0, + "non-truncated": 2448, + "padded": 2448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-public_relations|5": { + "hashes": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "1710c6ba4c9f3cbd", + "hash_cont_tokens": "0d756ccaae031757" + }, + "truncated": 0, + "non-truncated": 440, + "padded": 440, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-security_studies|5": { + "hashes": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "32a03f1f22a6e103", + "hash_cont_tokens": "b2229bc2cfbf594b" + }, + "truncated": 0, + "non-truncated": 980, + "padded": 980, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-sociology|5": { + "hashes": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "828999f7624cbe7e", + "hash_cont_tokens": "c3a3bdfd177eed5b" + }, + "truncated": 0, + "non-truncated": 804, + "padded": 804, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hashes": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "42054621e718dbee", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-virology|5": { + "hashes": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "6c4f0aa4dc859c04", + "hash_cont_tokens": "af8b3658088cb37f" + }, + "truncated": 0, + "non-truncated": 664, + "padded": 664, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-world_religions|5": { + "hashes": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "6c75d44e092ff24f", + "hash_cont_tokens": "060118bef6de4e0a" + }, + "truncated": 0, + "non-truncated": 684, + "padded": 684, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|truthfulqa:mc|0": { + "hashes": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "2738d7ed7075faa7", + "hash_cont_tokens": "f5da56a132aab151" + }, + "truncated": 0, + "non-truncated": 9996, + "padded": 9996, + "non-padded": 0, + "effective_few_shots": 0.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "d84d18e9a963753d", + "hash_full_prompts": "12b540783521a8e6", + "hash_input_tokens": "5c73a7dce6ccf737", + 
"hash_cont_tokens": "71d56183130fecbd" + }, + "total_evaluation_time_secondes": "6604.0977602005005", + "truncated": 0, + "non-truncated": 111019, + "padded": 110926, + "non-padded": 93, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/meta-math/MetaMath-13B-V1.0/results_2023-10-24T08-44-27.100360.json b/eval-results/meta-math/MetaMath-13B-V1.0/results_2023-10-24T08-44-27.100360.json new file mode 100644 index 0000000000000000000000000000000000000000..b68f54abf21acc151d7a7d323295dde592f794eb --- /dev/null +++ b/eval-results/meta-math/MetaMath-13B-V1.0/results_2023-10-24T08-44-27.100360.json @@ -0,0 +1,107 @@ +{ + "config_general": { + "model_name": "meta-math/MetaMath-13B-V1.0", + "model_sha": "4f7ca097739f741fccdbfea14928bd0699737fd5", + "model_size": "24.32 GB", + "model_dtype": "torch.float16", + "lighteval_sha": "0f318ecf002208468154899217b3ba7c6ae09374", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|drop|3": { + "em": 0.0010486577181208054, + "em_stderr": 0.0003314581465219155, + "f1": 0.05377516778523499, + "f1_stderr": 0.0012884573852120769 + }, + "harness|gsm8k|5": { + "acc": 0.2850644427596664, + "acc_stderr": 0.012435042334904002 + }, + "harness|winogrande|5": { + "acc": 0.7245461720599842, + "acc_stderr": 0.012555690055709527 + }, + "all": { + "em": 0.0010486577181208054, + "em_stderr": 0.0003314581465219155, + "f1": 0.05377516778523499, + "f1_stderr": 0.0012884573852120769, + "acc": 0.5048053074098253, + "acc_stderr": 0.012495366195306765 + } + }, + "versions": { + "harness|drop|3": 1, + "harness|gsm8k|5": 0, + "harness|winogrande|5": 0, + "all": 0 + }, + "config_tasks": { + "harness|drop": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|drop|3": { + "hashes": { + "hash_examples": "1d27416e8324e9a3", + "hash_full_prompts": "a5513ff9a741b385", + "hash_input_tokens": "42076f0efbb50aa6", + "hash_cont_tokens": "7d4a4f916047a168" + }, + "truncated": 3, + "non-truncated": 9533, + "padded": 0, + "non-padded": 9536, + "effective_few_shots": 3.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "bda342e47b5099b2", + "hash_cont_tokens": "9520d05f275fa5aa" + }, + "truncated": 0, + "non-truncated": 1319, + "padded": 0, + "non-padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "c0bedf98cb040854", + "hash_cont_tokens": "f08975ad6f2d5864" + }, + "truncated": 0, + "non-truncated": 2534, + "padded": 2432, + "non-padded": 102, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "9b4d8993161e637d", + "hash_full_prompts": "08215e527b7e60a5", + "hash_input_tokens": "a12f3e3c934bd78b", + "hash_cont_tokens": "2c5a621d80c351ce" + }, + "total_evaluation_time_secondes": "13786.425826787949", + "truncated": 3, + "non-truncated": 13386, + "padded": 2432, + "non-padded": 10957, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/meta-math/MetaMath-70B-V1.0/results_2023-10-04T06-01-20.870650.json 
b/eval-results/meta-math/MetaMath-70B-V1.0/results_2023-10-04T06-01-20.870650.json new file mode 100644 index 0000000000000000000000000000000000000000..dc568234afa9aa27ebb80c4d68213ad427507f27 --- /dev/null +++ b/eval-results/meta-math/MetaMath-70B-V1.0/results_2023-10-04T06-01-20.870650.json @@ -0,0 +1,1367 @@ +{ + "config_general": { + "model_name": "meta-math/MetaMath-70B-V1.0", + "model_sha": "783a3c7d5d0a75e6e11074f2577b90dd219ef7b1", + "model_size": "128.64 GB", + "model_dtype": "torch.float16", + "lighteval_sha": "0f318ecf002208468154899217b3ba7c6ae09374", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|arc:challenge|25": { + "acc": 0.6416382252559727, + "acc_stderr": 0.014012883334859857, + "acc_norm": 0.6800341296928327, + "acc_norm_stderr": 0.013631345807016193 + }, + "harness|hellaswag|10": { + "acc": 0.6786496713802032, + "acc_stderr": 0.004660405565338756, + "acc_norm": 0.8684524995020912, + "acc_norm_stderr": 0.003373073863582288 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.29, + "acc_stderr": 0.04560480215720684, + "acc_norm": 0.29, + "acc_norm_stderr": 0.04560480215720684 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.6148148148148148, + "acc_stderr": 0.042039210401562783, + "acc_norm": 0.6148148148148148, + "acc_norm_stderr": 0.042039210401562783 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.8223684210526315, + "acc_stderr": 0.03110318238312338, + "acc_norm": 0.8223684210526315, + "acc_norm_stderr": 0.03110318238312338 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.75, + "acc_stderr": 0.04351941398892446, + "acc_norm": 0.75, + "acc_norm_stderr": 0.04351941398892446 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.7283018867924528, + "acc_stderr": 0.027377706624670713, + "acc_norm": 0.7283018867924528, + "acc_norm_stderr": 0.027377706624670713 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.8541666666666666, + "acc_stderr": 0.029514245964291766, + "acc_norm": 0.8541666666666666, + "acc_norm_stderr": 0.029514245964291766 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.5, + "acc_stderr": 0.050251890762960605, + "acc_norm": 0.5, + "acc_norm_stderr": 0.050251890762960605 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.59, + "acc_stderr": 0.04943110704237102, + "acc_norm": 0.59, + "acc_norm_stderr": 0.04943110704237102 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.38, + "acc_stderr": 0.04878317312145633, + "acc_norm": 0.38, + "acc_norm_stderr": 0.04878317312145633 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.6705202312138728, + "acc_stderr": 0.03583901754736412, + "acc_norm": 0.6705202312138728, + "acc_norm_stderr": 0.03583901754736412 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.3627450980392157, + "acc_stderr": 0.04784060704105653, + "acc_norm": 0.3627450980392157, + "acc_norm_stderr": 0.04784060704105653 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.76, + "acc_stderr": 0.04292346959909281, + "acc_norm": 0.76, + "acc_norm_stderr": 0.04292346959909281 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.6808510638297872, + "acc_stderr": 0.030472973363380035, + "acc_norm": 0.6808510638297872, + "acc_norm_stderr": 0.030472973363380035 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.42105263157894735, + "acc_stderr": 0.046446020912223177, + "acc_norm": 
0.42105263157894735, + "acc_norm_stderr": 0.046446020912223177 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.593103448275862, + "acc_stderr": 0.04093793981266236, + "acc_norm": 0.593103448275862, + "acc_norm_stderr": 0.04093793981266236 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.4603174603174603, + "acc_stderr": 0.02567008063690919, + "acc_norm": 0.4603174603174603, + "acc_norm_stderr": 0.02567008063690919 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.4523809523809524, + "acc_stderr": 0.044518079590553275, + "acc_norm": 0.4523809523809524, + "acc_norm_stderr": 0.044518079590553275 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.41, + "acc_stderr": 0.04943110704237102, + "acc_norm": 0.41, + "acc_norm_stderr": 0.04943110704237102 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.8032258064516129, + "acc_stderr": 0.02261640942074202, + "acc_norm": 0.8032258064516129, + "acc_norm_stderr": 0.02261640942074202 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.5221674876847291, + "acc_stderr": 0.03514528562175008, + "acc_norm": 0.5221674876847291, + "acc_norm_stderr": 0.03514528562175008 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.75, + "acc_stderr": 0.04351941398892446, + "acc_norm": 0.75, + "acc_norm_stderr": 0.04351941398892446 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.8363636363636363, + "acc_stderr": 0.02888787239548795, + "acc_norm": 0.8363636363636363, + "acc_norm_stderr": 0.02888787239548795 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.8787878787878788, + "acc_stderr": 0.023253157951942095, + "acc_norm": 0.8787878787878788, + "acc_norm_stderr": 0.023253157951942095 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.9430051813471503, + "acc_stderr": 0.016731085293607555, + "acc_norm": 0.9430051813471503, + "acc_norm_stderr": 0.016731085293607555 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.7076923076923077, + "acc_stderr": 0.023060438380857733, + "acc_norm": 0.7076923076923077, + "acc_norm_stderr": 0.023060438380857733 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.3333333333333333, + "acc_stderr": 0.028742040903948485, + "acc_norm": 0.3333333333333333, + "acc_norm_stderr": 0.028742040903948485 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.7647058823529411, + "acc_stderr": 0.02755361446786381, + "acc_norm": 0.7647058823529411, + "acc_norm_stderr": 0.02755361446786381 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.423841059602649, + "acc_stderr": 0.04034846678603397, + "acc_norm": 0.423841059602649, + "acc_norm_stderr": 0.04034846678603397 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.8880733944954129, + "acc_stderr": 0.013517352714958788, + "acc_norm": 0.8880733944954129, + "acc_norm_stderr": 0.013517352714958788 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.5740740740740741, + "acc_stderr": 0.033723432716530624, + "acc_norm": 0.5740740740740741, + "acc_norm_stderr": 0.033723432716530624 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.9313725490196079, + "acc_stderr": 0.017744453647073312, + "acc_norm": 0.9313725490196079, + "acc_norm_stderr": 0.017744453647073312 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.8734177215189873, + "acc_stderr": 0.021644195727955173, 
+ "acc_norm": 0.8734177215189873, + "acc_norm_stderr": 0.021644195727955173 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.7982062780269058, + "acc_stderr": 0.02693611191280227, + "acc_norm": 0.7982062780269058, + "acc_norm_stderr": 0.02693611191280227 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.8625954198473282, + "acc_stderr": 0.030194823996804475, + "acc_norm": 0.8625954198473282, + "acc_norm_stderr": 0.030194823996804475 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.8677685950413223, + "acc_stderr": 0.03092278832044579, + "acc_norm": 0.8677685950413223, + "acc_norm_stderr": 0.03092278832044579 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.8333333333333334, + "acc_stderr": 0.03602814176392645, + "acc_norm": 0.8333333333333334, + "acc_norm_stderr": 0.03602814176392645 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.8159509202453987, + "acc_stderr": 0.030446777687971726, + "acc_norm": 0.8159509202453987, + "acc_norm_stderr": 0.030446777687971726 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.5267857142857143, + "acc_stderr": 0.047389751192741546, + "acc_norm": 0.5267857142857143, + "acc_norm_stderr": 0.047389751192741546 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.8446601941747572, + "acc_stderr": 0.03586594738573974, + "acc_norm": 0.8446601941747572, + "acc_norm_stderr": 0.03586594738573974 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.9017094017094017, + "acc_stderr": 0.019503444900757567, + "acc_norm": 0.9017094017094017, + "acc_norm_stderr": 0.019503444900757567 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.71, + "acc_stderr": 0.045604802157206845, + "acc_norm": 0.71, + "acc_norm_stderr": 0.045604802157206845 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.8659003831417624, + "acc_stderr": 0.012185528166499978, + "acc_norm": 0.8659003831417624, + "acc_norm_stderr": 0.012185528166499978 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.7832369942196532, + "acc_stderr": 0.022183477668412856, + "acc_norm": 0.7832369942196532, + "acc_norm_stderr": 0.022183477668412856 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.41787709497206704, + "acc_stderr": 0.016495400635820084, + "acc_norm": 0.41787709497206704, + "acc_norm_stderr": 0.016495400635820084 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.7418300653594772, + "acc_stderr": 0.025058503316958154, + "acc_norm": 0.7418300653594772, + "acc_norm_stderr": 0.025058503316958154 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.797427652733119, + "acc_stderr": 0.022827317491059686, + "acc_norm": 0.797427652733119, + "acc_norm_stderr": 0.022827317491059686 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.8395061728395061, + "acc_stderr": 0.020423955354778034, + "acc_norm": 0.8395061728395061, + "acc_norm_stderr": 0.020423955354778034 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.5460992907801419, + "acc_stderr": 0.029700453247291474, + "acc_norm": 0.5460992907801419, + "acc_norm_stderr": 0.029700453247291474 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.5371577574967406, + "acc_stderr": 0.01273492357953206, + "acc_norm": 0.5371577574967406, + "acc_norm_stderr": 0.01273492357953206 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.7389705882352942, + "acc_stderr": 0.026679252270103128, + "acc_norm": 0.7389705882352942, + "acc_norm_stderr": 0.026679252270103128 + }, + 
"harness|hendrycksTest-professional_psychology|5": { + "acc": 0.7401960784313726, + "acc_stderr": 0.01774089950917779, + "acc_norm": 0.7401960784313726, + "acc_norm_stderr": 0.01774089950917779 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.7363636363636363, + "acc_stderr": 0.04220224692971987, + "acc_norm": 0.7363636363636363, + "acc_norm_stderr": 0.04220224692971987 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.8081632653061225, + "acc_stderr": 0.025206963154225395, + "acc_norm": 0.8081632653061225, + "acc_norm_stderr": 0.025206963154225395 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.8955223880597015, + "acc_stderr": 0.021628920516700637, + "acc_norm": 0.8955223880597015, + "acc_norm_stderr": 0.021628920516700637 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.93, + "acc_stderr": 0.0256432399976243, + "acc_norm": 0.93, + "acc_norm_stderr": 0.0256432399976243 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.536144578313253, + "acc_stderr": 0.03882310850890594, + "acc_norm": 0.536144578313253, + "acc_norm_stderr": 0.03882310850890594 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.8596491228070176, + "acc_stderr": 0.0266405825391332, + "acc_norm": 0.8596491228070176, + "acc_norm_stderr": 0.0266405825391332 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.3390452876376989, + "mc1_stderr": 0.016571797910626615, + "mc2": 0.5097969029790534, + "mc2_stderr": 0.014915889066271937 + }, + "all": { + "acc": 0.6919665391533253, + "acc_stderr": 0.03077850676465074, + "acc_norm": 0.6958343142814397, + "acc_norm_stderr": 0.030750220845504973, + "mc1": 0.3390452876376989, + "mc1_stderr": 0.016571797910626615, + "mc2": 0.5097969029790534, + "mc2_stderr": 0.014915889066271937 + } + }, + "versions": { + "harness|arc:challenge|25": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + 
"harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "all": 0 + }, + "config_tasks": { + "harness|arc:challenge": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + 
"harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task" + }, + "summary_tasks": { + "harness|arc:challenge|25": { + "hashes": { + "hash_examples": "17b0cae357c0259e", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "3722289b79076c44", + "hash_cont_tokens": "e8abf848493b50f7" + }, + "truncated": 0, + "non-truncated": 4687, + "padded": 4687, + "non-padded": 0, + "effective_few_shots": 25.0, + "num_truncated_few_shots": 0 + }, + "harness|hellaswag|10": { + "hashes": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "ececd684171f1ef2", + "hash_cont_tokens": "9fe0a5c42e1532db" + }, + "truncated": 0, + "non-truncated": 40168, + "padded": 40113, + "non-padded": 55, + "effective_few_shots": 10.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hashes": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "c54ff61ad0273dd7", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-anatomy|5": { + "hashes": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "be31a1e22aef5f90", + "hash_cont_tokens": "f11971a765cb609f" + }, + "truncated": 0, + "non-truncated": 540, + "padded": 540, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-astronomy|5": { + "hashes": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "277a7b1fad566940", + "hash_cont_tokens": "440a970fadecdc7b" + }, + "truncated": 0, + "non-truncated": 608, + "padded": 608, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-business_ethics|5": { + "hashes": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": 
"ba552605bc116de5", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hashes": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "428c7563d0b98ab9", + "hash_cont_tokens": "7ecd60c25b9bfe5b" + }, + "truncated": 0, + "non-truncated": 1060, + "padded": 1060, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_biology|5": { + "hashes": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "da036601573942e2", + "hash_cont_tokens": "875cde3af7a0ee14" + }, + "truncated": 0, + "non-truncated": 576, + "padded": 576, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_chemistry|5": { + "hashes": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "94e0196d6aded13d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_computer_science|5": { + "hashes": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "6e4d0f4a8d36690b", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_mathematics|5": { + "hashes": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "614054d17109a25d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_medicine|5": { + "hashes": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "081bb2b524defd1c", + "hash_cont_tokens": "702fb6d82ff0d6ac" + }, + "truncated": 0, + "non-truncated": 692, + "padded": 692, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_physics|5": { + "hashes": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "5421d9a1af86cbd4", + "hash_cont_tokens": "f7b8097afc16a47c" + }, + "truncated": 0, + "non-truncated": 408, + "padded": 408, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-computer_security|5": { + "hashes": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "5e6b70ecb333cf18", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hashes": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "c2ef11a87264ceed", + "hash_cont_tokens": "aa0e8bc655f2f641" + }, + "truncated": 0, + "non-truncated": 940, + "padded": 940, + "non-padded": 0, 
+ "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-econometrics|5": { + "hashes": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "ecaccd912a4c3978", + "hash_cont_tokens": "b1cc6e7e9fcd3827" + }, + "truncated": 0, + "non-truncated": 456, + "padded": 456, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hashes": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "1590c84291399be8", + "hash_cont_tokens": "2425a3f084a591ef" + }, + "truncated": 0, + "non-truncated": 580, + "padded": 580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hashes": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "3269597f715b0da1", + "hash_cont_tokens": "bd87bf0c060fd925" + }, + "truncated": 0, + "non-truncated": 1512, + "padded": 1512, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-formal_logic|5": { + "hashes": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "a2800d20f3ab8d7c", + "hash_cont_tokens": "eb8932890e0605db" + }, + "truncated": 0, + "non-truncated": 504, + "padded": 504, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-global_facts|5": { + "hashes": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "94ed44b3772505ad", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_biology|5": { + "hashes": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "24423acb928db768", + "hash_cont_tokens": "1ddcb86d28cde266" + }, + "truncated": 0, + "non-truncated": 1240, + "padded": 1240, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hashes": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "831ff35c474e5cef", + "hash_cont_tokens": "176c8dcff38c5f8f" + }, + "truncated": 0, + "non-truncated": 812, + "padded": 812, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hashes": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "a20a96b44dcc5b30", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hashes": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "5002f4ac8b1562ca", + "hash_cont_tokens": "674fc454bdc5ac93" + }, + "truncated": 0, + "non-truncated": 660, + "padded": 656, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_geography|5": { + 
"hashes": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "7c5547c7da5bc793", + "hash_cont_tokens": "03a5012b916274ea" + }, + "truncated": 0, + "non-truncated": 792, + "padded": 792, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hashes": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "f62991cb6a496b05", + "hash_cont_tokens": "873d2aab226ba1d8" + }, + "truncated": 0, + "non-truncated": 772, + "padded": 772, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hashes": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "4cef2aff6e3d59ed", + "hash_cont_tokens": "c583432ad27fcfe0" + }, + "truncated": 0, + "non-truncated": 1560, + "padded": 1560, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hashes": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "6e2577ea4082ed2b", + "hash_cont_tokens": "d7907b61bcb8c123" + }, + "truncated": 0, + "non-truncated": 1080, + "padded": 1080, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hashes": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "c5fc9aeb1079c8e4", + "hash_cont_tokens": "f47f041de50333b9" + }, + "truncated": 0, + "non-truncated": 952, + "padded": 952, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_physics|5": { + "hashes": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "555fc385cffa84ca", + "hash_cont_tokens": "0d56317b3e5eedb5" + }, + "truncated": 0, + "non-truncated": 604, + "padded": 604, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hashes": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "febd23cbf9973b7f", + "hash_cont_tokens": "09ba1243e7390c0f" + }, + "truncated": 0, + "non-truncated": 2180, + "padded": 2180, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hashes": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "400e55b56ee6fbd7", + "hash_cont_tokens": "9cc29889c3d3f77d" + }, + "truncated": 0, + "non-truncated": 864, + "padded": 864, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hashes": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "c639cce12a46ebad", + "hash_cont_tokens": "cdd0b3dc06d933e5" + }, + "truncated": 0, + "non-truncated": 816, + "padded": 816, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hashes": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": 
"11845057459afd72", + "hash_input_tokens": "b9762065cce6f3a6", + "hash_cont_tokens": "e02816433ff28daf" + }, + "truncated": 0, + "non-truncated": 948, + "padded": 948, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_aging|5": { + "hashes": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "541a75f071dcf579", + "hash_cont_tokens": "142a4a8a1138a214" + }, + "truncated": 0, + "non-truncated": 892, + "padded": 892, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_sexuality|5": { + "hashes": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "04269e5c5a257dd9", + "hash_cont_tokens": "bc54813e809b796d" + }, + "truncated": 0, + "non-truncated": 524, + "padded": 524, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-international_law|5": { + "hashes": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "d93ba9d9d38e4397", + "hash_cont_tokens": "8ea8c5ff76a15bca" + }, + "truncated": 0, + "non-truncated": 484, + "padded": 484, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-jurisprudence|5": { + "hashes": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "9eeaccd2698b4f5a", + "hash_cont_tokens": "e3a8cd951b6e3469" + }, + "truncated": 0, + "non-truncated": 432, + "padded": 432, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hashes": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "b4f08f544f2b7576", + "hash_cont_tokens": "3e9e0bdc248fd88a" + }, + "truncated": 0, + "non-truncated": 652, + "padded": 648, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-machine_learning|5": { + "hashes": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "900c2a51f1174b9f", + "hash_cont_tokens": "55b12fb138c6a064" + }, + "truncated": 0, + "non-truncated": 448, + "padded": 448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-management|5": { + "hashes": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "6b36efb4689c6eca", + "hash_cont_tokens": "a01d6d39a83c4597" + }, + "truncated": 0, + "non-truncated": 412, + "padded": 412, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-marketing|5": { + "hashes": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "2aaac78a0cfed47a", + "hash_cont_tokens": "6aeaed4d823c98aa" + }, + "truncated": 0, + "non-truncated": 936, + "padded": 936, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-medical_genetics|5": { + "hashes": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "886ca823b41c094a", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + 
"non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-miscellaneous|5": { + "hashes": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "72fd71de7675e7d0", + "hash_cont_tokens": "9b0ab02a64603081" + }, + "truncated": 0, + "non-truncated": 3132, + "padded": 3132, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_disputes|5": { + "hashes": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "f3ca0dd8e7a1eb09", + "hash_cont_tokens": "3b8bbe9108e55ce9" + }, + "truncated": 0, + "non-truncated": 1384, + "padded": 1354, + "non-padded": 30, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hashes": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "3e793631e951f23c", + "hash_cont_tokens": "3e9bfc0362e97330" + }, + "truncated": 0, + "non-truncated": 3580, + "padded": 3580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-nutrition|5": { + "hashes": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "59753c2144ea93af", + "hash_cont_tokens": "23b2dc6ee2da4cfc" + }, + "truncated": 0, + "non-truncated": 1224, + "padded": 1224, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-philosophy|5": { + "hashes": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "bd8d3dbed15a8c34", + "hash_cont_tokens": "9f6ff69d23a48783" + }, + "truncated": 0, + "non-truncated": 1244, + "padded": 1244, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-prehistory|5": { + "hashes": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "3573cd87facbb7c5", + "hash_cont_tokens": "d6458d743d875837" + }, + "truncated": 0, + "non-truncated": 1296, + "padded": 1296, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_accounting|5": { + "hashes": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "17e721bc1a7cbb47", + "hash_cont_tokens": "922a195f53a35662" + }, + "truncated": 0, + "non-truncated": 1128, + "padded": 1128, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_law|5": { + "hashes": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "c9f7583fff66d361", + "hash_cont_tokens": "2e590029ef41fbcd" + }, + "truncated": 0, + "non-truncated": 6136, + "padded": 6136, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_medicine|5": { + "hashes": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "40a933f829116f8d", + "hash_cont_tokens": "7cfee54dbddd5a98" + }, + "truncated": 0, + "non-truncated": 1088, + "padded": 1088, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_psychology|5": { + "hashes": { + 
"hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "0dfb73a8eb3f692c", + "hash_cont_tokens": "a86677b2a45c20e1" + }, + "truncated": 0, + "non-truncated": 2448, + "padded": 2448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-public_relations|5": { + "hashes": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "1710c6ba4c9f3cbd", + "hash_cont_tokens": "0d756ccaae031757" + }, + "truncated": 0, + "non-truncated": 440, + "padded": 440, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-security_studies|5": { + "hashes": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "32a03f1f22a6e103", + "hash_cont_tokens": "b2229bc2cfbf594b" + }, + "truncated": 0, + "non-truncated": 980, + "padded": 980, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-sociology|5": { + "hashes": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "828999f7624cbe7e", + "hash_cont_tokens": "c3a3bdfd177eed5b" + }, + "truncated": 0, + "non-truncated": 804, + "padded": 804, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hashes": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "42054621e718dbee", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-virology|5": { + "hashes": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "6c4f0aa4dc859c04", + "hash_cont_tokens": "af8b3658088cb37f" + }, + "truncated": 0, + "non-truncated": 664, + "padded": 664, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-world_religions|5": { + "hashes": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "6c75d44e092ff24f", + "hash_cont_tokens": "060118bef6de4e0a" + }, + "truncated": 0, + "non-truncated": 684, + "padded": 684, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|truthfulqa:mc|0": { + "hashes": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "2738d7ed7075faa7", + "hash_cont_tokens": "f5da56a132aab151" + }, + "truncated": 0, + "non-truncated": 9996, + "padded": 9996, + "non-padded": 0, + "effective_few_shots": 0.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "d84d18e9a963753d", + "hash_full_prompts": "12b540783521a8e6", + "hash_input_tokens": "5c73a7dce6ccf737", + "hash_cont_tokens": "71d56183130fecbd" + }, + "total_evaluation_time_secondes": "43875.36873936653", + "truncated": 0, + "non-truncated": 111019, + "padded": 110926, + "non-padded": 93, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/meta-math/MetaMath-70B-V1.0/results_2023-10-27T06-53-02.758124.json b/eval-results/meta-math/MetaMath-70B-V1.0/results_2023-10-27T06-53-02.758124.json new file mode 100644 index 
0000000000000000000000000000000000000000..b3850e658779f2a60a1e08f83ddb781944bb9d11 --- /dev/null +++ b/eval-results/meta-math/MetaMath-70B-V1.0/results_2023-10-27T06-53-02.758124.json @@ -0,0 +1,107 @@ +{ + "config_general": { + "model_name": "meta-math/MetaMath-70B-V1.0", + "model_sha": "a5419673321fef896aeca32fbbc9a4f345ca7d1e", + "model_size": "128.64 GB", + "model_dtype": "torch.float16", + "lighteval_sha": "0f318ecf002208468154899217b3ba7c6ae09374", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|drop|3": { + "em": 0.035968959731543626, + "em_stderr": 0.0019069930004768872, + "f1": 0.13366401006711418, + "f1_stderr": 0.0024535730972056486 + }, + "harness|gsm8k|5": { + "acc": 0.44655041698256254, + "acc_stderr": 0.013693566549743144 + }, + "harness|winogrande|5": { + "acc": 0.8232044198895028, + "acc_stderr": 0.010721923287918735 + }, + "all": { + "em": 0.035968959731543626, + "em_stderr": 0.0019069930004768872, + "f1": 0.13366401006711418, + "f1_stderr": 0.0024535730972056486, + "acc": 0.6348774184360326, + "acc_stderr": 0.01220774491883094 + } + }, + "versions": { + "harness|drop|3": 1, + "harness|gsm8k|5": 0, + "harness|winogrande|5": 0, + "all": 0 + }, + "config_tasks": { + "harness|drop": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|drop|3": { + "hashes": { + "hash_examples": "1d27416e8324e9a3", + "hash_full_prompts": "a5513ff9a741b385", + "hash_input_tokens": "42076f0efbb50aa6", + "hash_cont_tokens": "8827ad4c6289de1d" + }, + "truncated": 3, + "non-truncated": 9533, + "padded": 0, + "non-padded": 9536, + "effective_few_shots": 3.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "bda342e47b5099b2", + "hash_cont_tokens": "1985a64b7a98a5f5" + }, + "truncated": 0, + "non-truncated": 1319, + "padded": 0, + "non-padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "c0bedf98cb040854", + "hash_cont_tokens": "f08975ad6f2d5864" + }, + "truncated": 0, + "non-truncated": 2534, + "padded": 2432, + "non-padded": 102, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "9b4d8993161e637d", + "hash_full_prompts": "08215e527b7e60a5", + "hash_input_tokens": "a12f3e3c934bd78b", + "hash_cont_tokens": "47433173599d70d5" + }, + "total_evaluation_time_secondes": "47122.08239245415", + "truncated": 3, + "non-truncated": 13386, + "padded": 2432, + "non-padded": 10957, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/meta-math/MetaMath-Llemma-7B/results_2023-12-10T10-48-07.737490.json b/eval-results/meta-math/MetaMath-Llemma-7B/results_2023-12-10T10-48-07.737490.json new file mode 100644 index 0000000000000000000000000000000000000000..ca3906d219c1ca7c9e056d3dedea896e1ccfe635 --- /dev/null +++ b/eval-results/meta-math/MetaMath-Llemma-7B/results_2023-12-10T10-48-07.737490.json @@ -0,0 +1,1409 @@ +{ + "config_general": { + "lighteval_sha": "0e4607eff593f6f842aeaa0e5fa6760f58b9d1e9", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "", 
+ "start_time": 652661.633250162, + "end_time": 660161.642052035, + "total_evaluation_time_secondes": "7500.008801872958", + "model_name": "meta-math/MetaMath-Llemma-7B", + "model_sha": "e31ec61dccd8fa24f44f0592a518491ef76a2235", + "model_dtype": "torch.float16", + "model_size": "12.61 GB" + }, + "results": { + "harness|arc:challenge|25": { + "acc": 0.439419795221843, + "acc_stderr": 0.014503747823580125, + "acc_norm": 0.46501706484641636, + "acc_norm_stderr": 0.01457558392201967 + }, + "harness|hellaswag|10": { + "acc": 0.4731129257120096, + "acc_stderr": 0.004982561815214125, + "acc_norm": 0.6169089822744473, + "acc_norm_stderr": 0.004851466623601442 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.31, + "acc_stderr": 0.04648231987117316, + "acc_norm": 0.31, + "acc_norm_stderr": 0.04648231987117316 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.4074074074074074, + "acc_stderr": 0.04244633238353228, + "acc_norm": 0.4074074074074074, + "acc_norm_stderr": 0.04244633238353228 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.5263157894736842, + "acc_stderr": 0.04063302731486671, + "acc_norm": 0.5263157894736842, + "acc_norm_stderr": 0.04063302731486671 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.52, + "acc_stderr": 0.050211673156867795, + "acc_norm": 0.52, + "acc_norm_stderr": 0.050211673156867795 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.47924528301886793, + "acc_stderr": 0.030746349975723463, + "acc_norm": 0.47924528301886793, + "acc_norm_stderr": 0.030746349975723463 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.4930555555555556, + "acc_stderr": 0.04180806750294938, + "acc_norm": 0.4930555555555556, + "acc_norm_stderr": 0.04180806750294938 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.46, + "acc_stderr": 0.05009082659620333, + "acc_norm": 0.46, + "acc_norm_stderr": 0.05009082659620333 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.44, + "acc_stderr": 0.04988876515698589, + "acc_norm": 0.44, + "acc_norm_stderr": 0.04988876515698589 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.33, + "acc_stderr": 0.047258156262526045, + "acc_norm": 0.33, + "acc_norm_stderr": 0.047258156262526045 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.4682080924855491, + "acc_stderr": 0.03804749744364763, + "acc_norm": 0.4682080924855491, + "acc_norm_stderr": 0.03804749744364763 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.39215686274509803, + "acc_stderr": 0.048580835742663434, + "acc_norm": 0.39215686274509803, + "acc_norm_stderr": 0.048580835742663434 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.65, + "acc_stderr": 0.047937248544110196, + "acc_norm": 0.65, + "acc_norm_stderr": 0.047937248544110196 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.4808510638297872, + "acc_stderr": 0.032662042990646796, + "acc_norm": 0.4808510638297872, + "acc_norm_stderr": 0.032662042990646796 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.32456140350877194, + "acc_stderr": 0.044045561573747664, + "acc_norm": 0.32456140350877194, + "acc_norm_stderr": 0.044045561573747664 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.5241379310344828, + "acc_stderr": 0.0416180850350153, + "acc_norm": 0.5241379310344828, + "acc_norm_stderr": 0.0416180850350153 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.41005291005291006, + "acc_stderr": 0.025331202438944423, + 
"acc_norm": 0.41005291005291006, + "acc_norm_stderr": 0.025331202438944423 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.3968253968253968, + "acc_stderr": 0.04375888492727061, + "acc_norm": 0.3968253968253968, + "acc_norm_stderr": 0.04375888492727061 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.28, + "acc_stderr": 0.04512608598542127, + "acc_norm": 0.28, + "acc_norm_stderr": 0.04512608598542127 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.5193548387096775, + "acc_stderr": 0.028422687404312107, + "acc_norm": 0.5193548387096775, + "acc_norm_stderr": 0.028422687404312107 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.41379310344827586, + "acc_stderr": 0.03465304488406795, + "acc_norm": 0.41379310344827586, + "acc_norm_stderr": 0.03465304488406795 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.59, + "acc_stderr": 0.049431107042371025, + "acc_norm": 0.59, + "acc_norm_stderr": 0.049431107042371025 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.5636363636363636, + "acc_stderr": 0.03872592983524754, + "acc_norm": 0.5636363636363636, + "acc_norm_stderr": 0.03872592983524754 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.5757575757575758, + "acc_stderr": 0.035212249088415845, + "acc_norm": 0.5757575757575758, + "acc_norm_stderr": 0.035212249088415845 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.5492227979274611, + "acc_stderr": 0.03590910952235524, + "acc_norm": 0.5492227979274611, + "acc_norm_stderr": 0.03590910952235524 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.5, + "acc_stderr": 0.02535100632816969, + "acc_norm": 0.5, + "acc_norm_stderr": 0.02535100632816969 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.27037037037037037, + "acc_stderr": 0.027080372815145665, + "acc_norm": 0.27037037037037037, + "acc_norm_stderr": 0.027080372815145665 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.4831932773109244, + "acc_stderr": 0.03246013680375308, + "acc_norm": 0.4831932773109244, + "acc_norm_stderr": 0.03246013680375308 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.33774834437086093, + "acc_stderr": 0.038615575462551684, + "acc_norm": 0.33774834437086093, + "acc_norm_stderr": 0.038615575462551684 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.6091743119266055, + "acc_stderr": 0.02092005834611106, + "acc_norm": 0.6091743119266055, + "acc_norm_stderr": 0.02092005834611106 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.4583333333333333, + "acc_stderr": 0.033981108902946366, + "acc_norm": 0.4583333333333333, + "acc_norm_stderr": 0.033981108902946366 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.5049019607843137, + "acc_stderr": 0.03509143375606785, + "acc_norm": 0.5049019607843137, + "acc_norm_stderr": 0.03509143375606785 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.5654008438818565, + "acc_stderr": 0.03226759995510145, + "acc_norm": 0.5654008438818565, + "acc_norm_stderr": 0.03226759995510145 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.39461883408071746, + "acc_stderr": 0.03280400504755291, + "acc_norm": 0.39461883408071746, + "acc_norm_stderr": 0.03280400504755291 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.5114503816793893, + "acc_stderr": 0.04384140024078016, + "acc_norm": 
0.5114503816793893, + "acc_norm_stderr": 0.04384140024078016 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.6115702479338843, + "acc_stderr": 0.04449270350068384, + "acc_norm": 0.6115702479338843, + "acc_norm_stderr": 0.04449270350068384 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.49074074074074076, + "acc_stderr": 0.04832853553437055, + "acc_norm": 0.49074074074074076, + "acc_norm_stderr": 0.04832853553437055 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.5398773006134969, + "acc_stderr": 0.039158572914369714, + "acc_norm": 0.5398773006134969, + "acc_norm_stderr": 0.039158572914369714 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.33035714285714285, + "acc_stderr": 0.04464285714285713, + "acc_norm": 0.33035714285714285, + "acc_norm_stderr": 0.04464285714285713 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.6407766990291263, + "acc_stderr": 0.04750458399041697, + "acc_norm": 0.6407766990291263, + "acc_norm_stderr": 0.04750458399041697 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.6794871794871795, + "acc_stderr": 0.030572811310299607, + "acc_norm": 0.6794871794871795, + "acc_norm_stderr": 0.030572811310299607 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.51, + "acc_stderr": 0.05024183937956912, + "acc_norm": 0.51, + "acc_norm_stderr": 0.05024183937956912 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.561941251596424, + "acc_stderr": 0.01774223223825723, + "acc_norm": 0.561941251596424, + "acc_norm_stderr": 0.01774223223825723 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.49710982658959535, + "acc_stderr": 0.026918645383239022, + "acc_norm": 0.49710982658959535, + "acc_norm_stderr": 0.026918645383239022 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.30837988826815643, + "acc_stderr": 0.015445716910998893, + "acc_norm": 0.30837988826815643, + "acc_norm_stderr": 0.015445716910998893 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.5163398692810458, + "acc_stderr": 0.028614624752805434, + "acc_norm": 0.5163398692810458, + "acc_norm_stderr": 0.028614624752805434 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.4983922829581994, + "acc_stderr": 0.02839794490780661, + "acc_norm": 0.4983922829581994, + "acc_norm_stderr": 0.02839794490780661 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.44753086419753085, + "acc_stderr": 0.02766713856942271, + "acc_norm": 0.44753086419753085, + "acc_norm_stderr": 0.02766713856942271 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.35815602836879434, + "acc_stderr": 0.028602085862759422, + "acc_norm": 0.35815602836879434, + "acc_norm_stderr": 0.028602085862759422 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.3396349413298566, + "acc_stderr": 0.012095592506931967, + "acc_norm": 0.3396349413298566, + "acc_norm_stderr": 0.012095592506931967 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.41544117647058826, + "acc_stderr": 0.02993534270787775, + "acc_norm": 0.41544117647058826, + "acc_norm_stderr": 0.02993534270787775 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.4068627450980392, + "acc_stderr": 0.019873802005061177, + "acc_norm": 0.4068627450980392, + "acc_norm_stderr": 0.019873802005061177 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.5, + "acc_stderr": 0.04789131426105757, + "acc_norm": 0.5, + "acc_norm_stderr": 0.04789131426105757 + }, + "harness|hendrycksTest-security_studies|5": 
{ + "acc": 0.5551020408163265, + "acc_stderr": 0.031814251181977865, + "acc_norm": 0.5551020408163265, + "acc_norm_stderr": 0.031814251181977865 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.6368159203980099, + "acc_stderr": 0.034005985055990146, + "acc_norm": 0.6368159203980099, + "acc_norm_stderr": 0.034005985055990146 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.64, + "acc_stderr": 0.048241815132442176, + "acc_norm": 0.64, + "acc_norm_stderr": 0.048241815132442176 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.3855421686746988, + "acc_stderr": 0.037891344246115496, + "acc_norm": 0.3855421686746988, + "acc_norm_stderr": 0.037891344246115496 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.5555555555555556, + "acc_stderr": 0.03811079669833531, + "acc_norm": 0.5555555555555556, + "acc_norm_stderr": 0.03811079669833531 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.2594859241126071, + "mc1_stderr": 0.015345409485557994, + "mc2": 0.39610018025256144, + "mc2_stderr": 0.015159247351087708 + }, + "harness|winogrande|5": { + "acc": 0.6274664561957379, + "acc_stderr": 0.013588173888522445 + }, + "harness|gsm8k|5": { + "acc": 0.6095526914329037, + "acc_stderr": 0.013437829864668582 + }, + "all": { + "acc": 0.4805727831472479, + "acc_stderr": 0.03501873176922748, + "acc_norm": 0.47876803306000837, + "acc_norm_stderr": 0.03574673517078834, + "mc1": 0.2594859241126071, + "mc1_stderr": 0.015345409485557994, + "mc2": 0.39610018025256144, + "mc2_stderr": 0.015159247351087708 + } + }, + "versions": { + "all": 0, + "harness|arc:challenge|25": 0, + "harness|gsm8k|5": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + 
"harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "harness|winogrande|5": 0 + }, + "config_tasks": { + "harness|arc:challenge": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + 
"harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|arc:challenge|25": { + "hashes": { + "hash_examples": "17b0cae357c0259e", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "ca48d52265c0051f", + "hash_cont_tokens": "e8abf848493b50f7" + }, + "truncated": 0, + "non_truncated": 1172, + "padded": 4687, + "non_padded": 0, + "effective_few_shots": 25.0, + "num_truncated_few_shots": 0 + }, + "harness|hellaswag|10": { + "hashes": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "4975ded0ed31f702", + "hash_cont_tokens": "9fe0a5c42e1532db" + }, + "truncated": 0, + "non_truncated": 10042, + "padded": 40019, + "non_padded": 149, + "effective_few_shots": 10.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hashes": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "8ff523ec326d5d55", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-anatomy|5": { + "hashes": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "742bd6a389a8ef40", + "hash_cont_tokens": "f11971a765cb609f" + }, + "truncated": 0, + "non_truncated": 135, + "padded": 540, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-astronomy|5": { + "hashes": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "aa9743839c83bd9f", + "hash_cont_tokens": "440a970fadecdc7b" + }, + "truncated": 0, + "non_truncated": 152, + "padded": 608, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-business_ethics|5": { + "hashes": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "60f6ed52e2a2987a", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, 
+ "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hashes": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "6080d9f3c5930be0", + "hash_cont_tokens": "7ecd60c25b9bfe5b" + }, + "truncated": 0, + "non_truncated": 265, + "padded": 1060, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_biology|5": { + "hashes": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "873319724ad65589", + "hash_cont_tokens": "875cde3af7a0ee14" + }, + "truncated": 0, + "non_truncated": 144, + "padded": 564, + "non_padded": 12, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_chemistry|5": { + "hashes": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "8366d04d12b154a7", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_computer_science|5": { + "hashes": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "1724a282fb269fd7", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_mathematics|5": { + "hashes": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "b7aa815781eae172", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_medicine|5": { + "hashes": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "0003d13e86bc8c1a", + "hash_cont_tokens": "702fb6d82ff0d6ac" + }, + "truncated": 0, + "non_truncated": 173, + "padded": 692, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_physics|5": { + "hashes": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "32b28762dd077c78", + "hash_cont_tokens": "f7b8097afc16a47c" + }, + "truncated": 0, + "non_truncated": 102, + "padded": 404, + "non_padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-computer_security|5": { + "hashes": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "19dd0e1895125d49", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hashes": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "761c7ce187b3338a", + "hash_cont_tokens": "aa0e8bc655f2f641" + }, + "truncated": 0, + "non_truncated": 235, + "padded": 940, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + 
"harness|hendrycksTest-econometrics|5": { + "hashes": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "dae74024ebc12b2b", + "hash_cont_tokens": "b1cc6e7e9fcd3827" + }, + "truncated": 0, + "non_truncated": 114, + "padded": 456, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hashes": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "5fa8050688a246ed", + "hash_cont_tokens": "2425a3f084a591ef" + }, + "truncated": 0, + "non_truncated": 145, + "padded": 580, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hashes": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "2da3f8d7d1515cc6", + "hash_cont_tokens": "bd87bf0c060fd925" + }, + "truncated": 0, + "non_truncated": 378, + "padded": 1512, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-formal_logic|5": { + "hashes": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "907de61bbe46dada", + "hash_cont_tokens": "eb8932890e0605db" + }, + "truncated": 0, + "non_truncated": 126, + "padded": 504, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-global_facts|5": { + "hashes": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "d7549fe9ac133643", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_biology|5": { + "hashes": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "b449ae8cd622fb96", + "hash_cont_tokens": "1ddcb86d28cde266" + }, + "truncated": 0, + "non_truncated": 310, + "padded": 1240, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hashes": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "a447bd1574b5e26c", + "hash_cont_tokens": "176c8dcff38c5f8f" + }, + "truncated": 0, + "non_truncated": 203, + "padded": 812, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hashes": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "56312a0c3d85ae90", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hashes": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "5002f4ac8b1562ca", + "hash_cont_tokens": "674fc454bdc5ac93" + }, + "truncated": 0, + "non_truncated": 165, + "padded": 656, + "non_padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_geography|5": { + "hashes": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": 
"ee5789fcc1a81b1e", + "hash_input_tokens": "b4f9efd054b0149d", + "hash_cont_tokens": "03a5012b916274ea" + }, + "truncated": 0, + "non_truncated": 198, + "padded": 792, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hashes": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "6e010d01707b5a01", + "hash_cont_tokens": "873d2aab226ba1d8" + }, + "truncated": 0, + "non_truncated": 193, + "padded": 772, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hashes": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "fc1f6e824ba386d7", + "hash_cont_tokens": "c583432ad27fcfe0" + }, + "truncated": 0, + "non_truncated": 390, + "padded": 1560, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hashes": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "3a485a40c8432ece", + "hash_cont_tokens": "d7907b61bcb8c123" + }, + "truncated": 0, + "non_truncated": 270, + "padded": 1080, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hashes": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "a7dd9ca4bbda3752", + "hash_cont_tokens": "f47f041de50333b9" + }, + "truncated": 0, + "non_truncated": 238, + "padded": 952, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_physics|5": { + "hashes": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "d7ea631399a73865", + "hash_cont_tokens": "0d56317b3e5eedb5" + }, + "truncated": 0, + "non_truncated": 151, + "padded": 604, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hashes": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "d12816cf88146011", + "hash_cont_tokens": "09ba1243e7390c0f" + }, + "truncated": 0, + "non_truncated": 545, + "padded": 2180, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hashes": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "9763ecaef4814c21", + "hash_cont_tokens": "9cc29889c3d3f77d" + }, + "truncated": 0, + "non_truncated": 216, + "padded": 864, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hashes": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "c639cce12a46ebad", + "hash_cont_tokens": "cdd0b3dc06d933e5" + }, + "truncated": 0, + "non_truncated": 204, + "padded": 816, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hashes": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "b9762065cce6f3a6", + 
"hash_cont_tokens": "e02816433ff28daf" + }, + "truncated": 0, + "non_truncated": 237, + "padded": 948, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_aging|5": { + "hashes": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "84157fee0b6d0f3c", + "hash_cont_tokens": "142a4a8a1138a214" + }, + "truncated": 0, + "non_truncated": 223, + "padded": 892, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_sexuality|5": { + "hashes": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "ade303e1ae3c016f", + "hash_cont_tokens": "bc54813e809b796d" + }, + "truncated": 0, + "non_truncated": 131, + "padded": 524, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-international_law|5": { + "hashes": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "e5482e1c23c23d35", + "hash_cont_tokens": "8ea8c5ff76a15bca" + }, + "truncated": 0, + "non_truncated": 121, + "padded": 484, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-jurisprudence|5": { + "hashes": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "4415eeb9bad0507b", + "hash_cont_tokens": "e3a8cd951b6e3469" + }, + "truncated": 0, + "non_truncated": 108, + "padded": 432, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hashes": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "e6b5271422ecbaa8", + "hash_cont_tokens": "3e9e0bdc248fd88a" + }, + "truncated": 0, + "non_truncated": 163, + "padded": 644, + "non_padded": 8, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-machine_learning|5": { + "hashes": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "e719cb83196977d8", + "hash_cont_tokens": "55b12fb138c6a064" + }, + "truncated": 0, + "non_truncated": 112, + "padded": 448, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-management|5": { + "hashes": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "155da0e62b39e804", + "hash_cont_tokens": "a01d6d39a83c4597" + }, + "truncated": 0, + "non_truncated": 103, + "padded": 412, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-marketing|5": { + "hashes": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "38466c242259e6d3", + "hash_cont_tokens": "6aeaed4d823c98aa" + }, + "truncated": 0, + "non_truncated": 234, + "padded": 932, + "non_padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-medical_genetics|5": { + "hashes": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "0dd129e92538a7f6", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 
0 + }, + "harness|hendrycksTest-miscellaneous|5": { + "hashes": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "d108a883fc3e022f", + "hash_cont_tokens": "9b0ab02a64603081" + }, + "truncated": 0, + "non_truncated": 783, + "padded": 3132, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_disputes|5": { + "hashes": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "0e7b7df82884a2d5", + "hash_cont_tokens": "3b8bbe9108e55ce9" + }, + "truncated": 0, + "non_truncated": 346, + "padded": 1364, + "non_padded": 20, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hashes": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "7c220f5613cd8426", + "hash_cont_tokens": "3e9bfc0362e97330" + }, + "truncated": 0, + "non_truncated": 895, + "padded": 3580, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-nutrition|5": { + "hashes": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "35de1609a9a763a9", + "hash_cont_tokens": "23b2dc6ee2da4cfc" + }, + "truncated": 0, + "non_truncated": 306, + "padded": 1224, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-philosophy|5": { + "hashes": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "a1dcfa9c80490d06", + "hash_cont_tokens": "9f6ff69d23a48783" + }, + "truncated": 0, + "non_truncated": 311, + "padded": 1244, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-prehistory|5": { + "hashes": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "a091cf645d2415e0", + "hash_cont_tokens": "d6458d743d875837" + }, + "truncated": 0, + "non_truncated": 324, + "padded": 1296, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_accounting|5": { + "hashes": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "e9df32a33f85290c", + "hash_cont_tokens": "922a195f53a35662" + }, + "truncated": 0, + "non_truncated": 282, + "padded": 1128, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_law|5": { + "hashes": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "c9f7583fff66d361", + "hash_cont_tokens": "2e590029ef41fbcd" + }, + "truncated": 0, + "non_truncated": 1534, + "padded": 6136, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_medicine|5": { + "hashes": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "40a933f829116f8d", + "hash_cont_tokens": "7cfee54dbddd5a98" + }, + "truncated": 0, + "non_truncated": 272, + "padded": 1088, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_psychology|5": { + "hashes": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + 
"hash_input_tokens": "0f6a92c3a2062b48", + "hash_cont_tokens": "a86677b2a45c20e1" + }, + "truncated": 0, + "non_truncated": 612, + "padded": 2448, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-public_relations|5": { + "hashes": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "29a08e9bfbe9b2f0", + "hash_cont_tokens": "0d756ccaae031757" + }, + "truncated": 0, + "non_truncated": 110, + "padded": 440, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-security_studies|5": { + "hashes": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "32a03f1f22a6e103", + "hash_cont_tokens": "b2229bc2cfbf594b" + }, + "truncated": 0, + "non_truncated": 245, + "padded": 980, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-sociology|5": { + "hashes": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "1de5c52d2b2831d7", + "hash_cont_tokens": "c3a3bdfd177eed5b" + }, + "truncated": 0, + "non_truncated": 201, + "padded": 800, + "non_padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hashes": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "add924961f7f4146", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-virology|5": { + "hashes": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "e0653601c466b1bc", + "hash_cont_tokens": "af8b3658088cb37f" + }, + "truncated": 0, + "non_truncated": 166, + "padded": 664, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-world_religions|5": { + "hashes": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "ac600d612445156d", + "hash_cont_tokens": "060118bef6de4e0a" + }, + "truncated": 0, + "non_truncated": 171, + "padded": 684, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|truthfulqa:mc|0": { + "hashes": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "a03ce28b7fd06aa7", + "hash_cont_tokens": "f5da56a132aab151" + }, + "truncated": 0, + "non_truncated": 817, + "padded": 9996, + "non_padded": 0, + "effective_few_shots": 0.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "72067255e368e24e", + "hash_cont_tokens": "f08975ad6f2d5864" + }, + "truncated": 0, + "non_truncated": 1267, + "padded": 2534, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "bda342e47b5099b2", + "hash_cont_tokens": "c4bc3ecc584dd03e" + }, + "truncated": 0, + "non_truncated": 1319, + "padded": 0, + "non_padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + 
}, + "summary_general": { + "hashes": { + "hash_examples": "3b7fa57a057f9415", + "hash_full_prompts": "63615fc50fc9417c", + "hash_input_tokens": "a8fa53915153e1db", + "hash_cont_tokens": "2919bcd76cfabec8" + }, + "truncated": 0, + "non_truncated": 28659, + "padded": 113348, + "non_padded": 1524, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/meta-math/MetaMath-Mistral-7B/results_2023-12-04T19-35-59.251082.json b/eval-results/meta-math/MetaMath-Mistral-7B/results_2023-12-04T19-35-59.251082.json new file mode 100644 index 0000000000000000000000000000000000000000..2747017e3ee72cf0f3a89ca40b2e311b722b8b04 --- /dev/null +++ b/eval-results/meta-math/MetaMath-Mistral-7B/results_2023-12-04T19-35-59.251082.json @@ -0,0 +1,1409 @@ +{ + "config_general": { + "lighteval_sha": "0e4607eff593f6f842aeaa0e5fa6760f58b9d1e9", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "", + "start_time": 163417.380914755, + "end_time": 173434.289010139, + "total_evaluation_time_secondes": "10016.908095384017", + "model_name": "meta-math/MetaMath-Mistral-7B", + "model_sha": "016a7bb03bfcd953860357e1a16d5b333b887d26", + "model_dtype": "torch.float16", + "model_size": "13.99 GB" + }, + "results": { + "harness|arc:challenge|25": { + "acc": 0.5699658703071673, + "acc_stderr": 0.01446763155913799, + "acc_norm": 0.606655290102389, + "acc_norm_stderr": 0.014275101465693024 + }, + "harness|hellaswag|10": { + "acc": 0.6437960565624378, + "acc_stderr": 0.004778978031389641, + "acc_norm": 0.8258315076677952, + "acc_norm_stderr": 0.0037847921724660652 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.23, + "acc_stderr": 0.042295258468165065, + "acc_norm": 0.23, + "acc_norm_stderr": 0.042295258468165065 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.5925925925925926, + "acc_stderr": 0.04244633238353227, + "acc_norm": 0.5925925925925926, + "acc_norm_stderr": 0.04244633238353227 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.625, + "acc_stderr": 0.039397364351956274, + "acc_norm": 0.625, + "acc_norm_stderr": 0.039397364351956274 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.57, + "acc_stderr": 0.049756985195624284, + "acc_norm": 0.57, + "acc_norm_stderr": 0.049756985195624284 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.690566037735849, + "acc_stderr": 0.028450154794118637, + "acc_norm": 0.690566037735849, + "acc_norm_stderr": 0.028450154794118637 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.7083333333333334, + "acc_stderr": 0.038009680605548594, + "acc_norm": 0.7083333333333334, + "acc_norm_stderr": 0.038009680605548594 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.46, + "acc_stderr": 0.05009082659620333, + "acc_norm": 0.46, + "acc_norm_stderr": 0.05009082659620333 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.54, + "acc_stderr": 0.05009082659620333, + "acc_norm": 0.54, + "acc_norm_stderr": 0.05009082659620333 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.35, + "acc_stderr": 0.04793724854411019, + "acc_norm": 0.35, + "acc_norm_stderr": 0.04793724854411019 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.6184971098265896, + "acc_stderr": 0.03703851193099521, + "acc_norm": 0.6184971098265896, + "acc_norm_stderr": 0.03703851193099521 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.3431372549019608, + "acc_stderr": 0.04724007352383887, + 
"acc_norm": 0.3431372549019608, + "acc_norm_stderr": 0.04724007352383887 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.79, + "acc_stderr": 0.040936018074033256, + "acc_norm": 0.79, + "acc_norm_stderr": 0.040936018074033256 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.5617021276595745, + "acc_stderr": 0.03243618636108101, + "acc_norm": 0.5617021276595745, + "acc_norm_stderr": 0.03243618636108101 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.4649122807017544, + "acc_stderr": 0.046920083813689104, + "acc_norm": 0.4649122807017544, + "acc_norm_stderr": 0.046920083813689104 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.5379310344827586, + "acc_stderr": 0.04154659671707548, + "acc_norm": 0.5379310344827586, + "acc_norm_stderr": 0.04154659671707548 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.40476190476190477, + "acc_stderr": 0.0252798503974049, + "acc_norm": 0.40476190476190477, + "acc_norm_stderr": 0.0252798503974049 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.373015873015873, + "acc_stderr": 0.04325506042017086, + "acc_norm": 0.373015873015873, + "acc_norm_stderr": 0.04325506042017086 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.33, + "acc_stderr": 0.047258156262526045, + "acc_norm": 0.33, + "acc_norm_stderr": 0.047258156262526045 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.7225806451612903, + "acc_stderr": 0.025470196835900055, + "acc_norm": 0.7225806451612903, + "acc_norm_stderr": 0.025470196835900055 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.47783251231527096, + "acc_stderr": 0.03514528562175007, + "acc_norm": 0.47783251231527096, + "acc_norm_stderr": 0.03514528562175007 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.63, + "acc_stderr": 0.04852365870939099, + "acc_norm": 0.63, + "acc_norm_stderr": 0.04852365870939099 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.7515151515151515, + "acc_stderr": 0.033744026441394036, + "acc_norm": 0.7515151515151515, + "acc_norm_stderr": 0.033744026441394036 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.7575757575757576, + "acc_stderr": 0.030532892233932022, + "acc_norm": 0.7575757575757576, + "acc_norm_stderr": 0.030532892233932022 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.8549222797927462, + "acc_stderr": 0.025416343096306433, + "acc_norm": 0.8549222797927462, + "acc_norm_stderr": 0.025416343096306433 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.6076923076923076, + "acc_stderr": 0.024756000382130956, + "acc_norm": 0.6076923076923076, + "acc_norm_stderr": 0.024756000382130956 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.3888888888888889, + "acc_stderr": 0.029723278961476664, + "acc_norm": 0.3888888888888889, + "acc_norm_stderr": 0.029723278961476664 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.6428571428571429, + "acc_stderr": 0.031124619309328177, + "acc_norm": 0.6428571428571429, + "acc_norm_stderr": 0.031124619309328177 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.3509933774834437, + "acc_stderr": 0.03896981964257375, + "acc_norm": 0.3509933774834437, + "acc_norm_stderr": 0.03896981964257375 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.8036697247706422, + "acc_stderr": 0.017030719339154343, + "acc_norm": 
0.8036697247706422, + "acc_norm_stderr": 0.017030719339154343 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.4398148148148148, + "acc_stderr": 0.03385177976044811, + "acc_norm": 0.4398148148148148, + "acc_norm_stderr": 0.03385177976044811 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.7941176470588235, + "acc_stderr": 0.028379449451588667, + "acc_norm": 0.7941176470588235, + "acc_norm_stderr": 0.028379449451588667 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.7721518987341772, + "acc_stderr": 0.02730348459906943, + "acc_norm": 0.7721518987341772, + "acc_norm_stderr": 0.02730348459906943 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.6636771300448431, + "acc_stderr": 0.031708824268455, + "acc_norm": 0.6636771300448431, + "acc_norm_stderr": 0.031708824268455 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.7557251908396947, + "acc_stderr": 0.037683359597287434, + "acc_norm": 0.7557251908396947, + "acc_norm_stderr": 0.037683359597287434 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.7768595041322314, + "acc_stderr": 0.03800754475228733, + "acc_norm": 0.7768595041322314, + "acc_norm_stderr": 0.03800754475228733 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.7592592592592593, + "acc_stderr": 0.04133119440243839, + "acc_norm": 0.7592592592592593, + "acc_norm_stderr": 0.04133119440243839 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.7484662576687117, + "acc_stderr": 0.034089978868575295, + "acc_norm": 0.7484662576687117, + "acc_norm_stderr": 0.034089978868575295 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.45535714285714285, + "acc_stderr": 0.047268355537191, + "acc_norm": 0.45535714285714285, + "acc_norm_stderr": 0.047268355537191 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.7961165048543689, + "acc_stderr": 0.039891398595317706, + "acc_norm": 0.7961165048543689, + "acc_norm_stderr": 0.039891398595317706 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.8846153846153846, + "acc_stderr": 0.020930193185179333, + "acc_norm": 0.8846153846153846, + "acc_norm_stderr": 0.020930193185179333 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.7, + "acc_stderr": 0.046056618647183814, + "acc_norm": 0.7, + "acc_norm_stderr": 0.046056618647183814 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.7943805874840357, + "acc_stderr": 0.01445250045678583, + "acc_norm": 0.7943805874840357, + "acc_norm_stderr": 0.01445250045678583 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.7023121387283237, + "acc_stderr": 0.024617055388677, + "acc_norm": 0.7023121387283237, + "acc_norm_stderr": 0.024617055388677 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.36089385474860336, + "acc_stderr": 0.01606229067111046, + "acc_norm": 0.36089385474860336, + "acc_norm_stderr": 0.01606229067111046 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.7549019607843137, + "acc_stderr": 0.024630048979824775, + "acc_norm": 0.7549019607843137, + "acc_norm_stderr": 0.024630048979824775 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.6881028938906752, + "acc_stderr": 0.02631185807185416, + "acc_norm": 0.6881028938906752, + "acc_norm_stderr": 0.02631185807185416 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.7037037037037037, + "acc_stderr": 0.025407197798890162, + "acc_norm": 0.7037037037037037, + "acc_norm_stderr": 0.025407197798890162 + }, + 
"harness|hendrycksTest-professional_accounting|5": { + "acc": 0.4645390070921986, + "acc_stderr": 0.029752389657427047, + "acc_norm": 0.4645390070921986, + "acc_norm_stderr": 0.029752389657427047 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.4491525423728814, + "acc_stderr": 0.012704030518851488, + "acc_norm": 0.4491525423728814, + "acc_norm_stderr": 0.012704030518851488 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.6213235294117647, + "acc_stderr": 0.02946513363977613, + "acc_norm": 0.6213235294117647, + "acc_norm_stderr": 0.02946513363977613 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.6486928104575164, + "acc_stderr": 0.01931267606578655, + "acc_norm": 0.6486928104575164, + "acc_norm_stderr": 0.01931267606578655 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.6818181818181818, + "acc_stderr": 0.04461272175910509, + "acc_norm": 0.6818181818181818, + "acc_norm_stderr": 0.04461272175910509 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.6857142857142857, + "acc_stderr": 0.029719329422417475, + "acc_norm": 0.6857142857142857, + "acc_norm_stderr": 0.029719329422417475 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.835820895522388, + "acc_stderr": 0.026193923544454132, + "acc_norm": 0.835820895522388, + "acc_norm_stderr": 0.026193923544454132 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.84, + "acc_stderr": 0.03684529491774711, + "acc_norm": 0.84, + "acc_norm_stderr": 0.03684529491774711 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.5421686746987951, + "acc_stderr": 0.0387862677100236, + "acc_norm": 0.5421686746987951, + "acc_norm_stderr": 0.0387862677100236 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.8128654970760234, + "acc_stderr": 0.029913127232368043, + "acc_norm": 0.8128654970760234, + "acc_norm_stderr": 0.029913127232368043 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.3047735618115055, + "mc1_stderr": 0.016114124156882455, + "mc2": 0.4489052122445318, + "mc2_stderr": 0.01547532303838066 + }, + "harness|winogrande|5": { + "acc": 0.7576953433307024, + "acc_stderr": 0.012042352526174787 + }, + "harness|gsm8k|5": { + "acc": 0.6884003032600455, + "acc_stderr": 0.012757375376754941 + }, + "all": { + "acc": 0.6224817411296446, + "acc_stderr": 0.03262551509185562, + "acc_norm": 0.6227799225969178, + "acc_norm_stderr": 0.033291016555049055, + "mc1": 0.3047735618115055, + "mc1_stderr": 0.016114124156882455, + "mc2": 0.4489052122445318, + "mc2_stderr": 0.01547532303838066 + } + }, + "versions": { + "all": 0, + "harness|arc:challenge|25": 0, + "harness|gsm8k|5": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + 
"harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "harness|winogrande|5": 0 + }, + "config_tasks": { + "harness|arc:challenge": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + 
"harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|arc:challenge|25": { + "hashes": { + "hash_examples": "17b0cae357c0259e", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "9bcd0d1d37471713", + "hash_cont_tokens": "289aa98c400841d8" + }, + "truncated": 0, + "non_truncated": 1172, + "padded": 4670, + "non_padded": 17, + "effective_few_shots": 25.0, + "num_truncated_few_shots": 0 + }, + "harness|hellaswag|10": { + "hashes": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "80b8c6d79740318e", + "hash_cont_tokens": "ac460260c3e6efc9" + }, + "truncated": 0, + "non_truncated": 10042, + "padded": 40101, + "non_padded": 67, + "effective_few_shots": 10.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hashes": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "b813d36287c6556c", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + 
"effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-anatomy|5": { + "hashes": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "09dc2380497f7a47", + "hash_cont_tokens": "a52a4f60d98cbe5c" + }, + "truncated": 0, + "non_truncated": 135, + "padded": 540, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-astronomy|5": { + "hashes": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "68ca3220b0fdd1f3", + "hash_cont_tokens": "10f7d8eeba97841d" + }, + "truncated": 0, + "non_truncated": 152, + "padded": 608, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-business_ethics|5": { + "hashes": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "bd14ef1320de241e", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hashes": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "d96186ab98017c43", + "hash_cont_tokens": "edef9975ba9165b5" + }, + "truncated": 0, + "non_truncated": 265, + "padded": 1060, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_biology|5": { + "hashes": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "424136b34e95b200", + "hash_cont_tokens": "0aa103ec6602280b" + }, + "truncated": 0, + "non_truncated": 144, + "padded": 576, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_chemistry|5": { + "hashes": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "8dd8b80e336bbe54", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_computer_science|5": { + "hashes": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "145d4cef8ca2261d", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_mathematics|5": { + "hashes": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "561995d32d2b25c4", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_medicine|5": { + "hashes": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "6a258a9d4418599c", + "hash_cont_tokens": "1979021dbc698754" + }, + "truncated": 0, + "non_truncated": 173, + "padded": 692, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_physics|5": { + "hashes": { + "hash_examples": "875dd26d22655b0d", 
+ "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "fa5e0d5b5f97b66a", + "hash_cont_tokens": "7cf7fe2bab00acbd" + }, + "truncated": 0, + "non_truncated": 102, + "padded": 408, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-computer_security|5": { + "hashes": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "07d27397edfae492", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hashes": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "da5e6c3c8eb17da6", + "hash_cont_tokens": "903f64eed2b0d217" + }, + "truncated": 0, + "non_truncated": 235, + "padded": 940, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-econometrics|5": { + "hashes": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "f6ba8e358bdb523e", + "hash_cont_tokens": "721ae6c5302c4bf2" + }, + "truncated": 0, + "non_truncated": 114, + "padded": 456, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hashes": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "b2459da4c5ca8590", + "hash_cont_tokens": "15a738960ed3e587" + }, + "truncated": 0, + "non_truncated": 145, + "padded": 575, + "non_padded": 5, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hashes": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "0b969d9ad706a13a", + "hash_cont_tokens": "c96470462fc71683" + }, + "truncated": 0, + "non_truncated": 378, + "padded": 1512, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-formal_logic|5": { + "hashes": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "02bc3eb5f90da86e", + "hash_cont_tokens": "0e1ce025c9d6ee7e" + }, + "truncated": 0, + "non_truncated": 126, + "padded": 504, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-global_facts|5": { + "hashes": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "3d5106918bcbeb43", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_biology|5": { + "hashes": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "7b089392db2dabbd", + "hash_cont_tokens": "e34d57f7d3c4ca16" + }, + "truncated": 0, + "non_truncated": 310, + "padded": 1240, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hashes": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "ba90b2ffed1c067d", + "hash_cont_tokens": "e8482d44df4b3740" + }, + "truncated": 
0, + "non_truncated": 203, + "padded": 812, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hashes": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "60eeec309ef0717f", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hashes": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "5e5e8bf3808e0ead", + "hash_cont_tokens": "d63e679a49418339" + }, + "truncated": 0, + "non_truncated": 165, + "padded": 656, + "non_padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_geography|5": { + "hashes": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "4da9b741d4e7ea78", + "hash_cont_tokens": "d78483e286d06f1a" + }, + "truncated": 0, + "non_truncated": 198, + "padded": 792, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hashes": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "acb4bc872ac86ed7", + "hash_cont_tokens": "691cdff71ff5fe57" + }, + "truncated": 0, + "non_truncated": 193, + "padded": 772, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hashes": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "840fc6403eb69ab0", + "hash_cont_tokens": "d5ad4c5bdca967ad" + }, + "truncated": 0, + "non_truncated": 390, + "padded": 1560, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hashes": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "3629a7f2cd17faeb", + "hash_cont_tokens": "8f631ca5687dd0d4" + }, + "truncated": 0, + "non_truncated": 270, + "padded": 1080, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hashes": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "6846f684260e3997", + "hash_cont_tokens": "7321048a28451473" + }, + "truncated": 0, + "non_truncated": 238, + "padded": 952, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_physics|5": { + "hashes": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "85aee25d6bdad94a", + "hash_cont_tokens": "bb137581f269861c" + }, + "truncated": 0, + "non_truncated": 151, + "padded": 604, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hashes": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "290b66d6d666a35f", + "hash_cont_tokens": "b455cab2675bd863" + }, + "truncated": 0, + "non_truncated": 545, + "padded": 2180, + "non_padded": 0, + 
"effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hashes": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "a77a7668b437bc82", + "hash_cont_tokens": "1b3196fec7e58037" + }, + "truncated": 0, + "non_truncated": 216, + "padded": 864, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hashes": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "63548c7fa9ba7a78", + "hash_cont_tokens": "a331dedc2aa01b3e" + }, + "truncated": 0, + "non_truncated": 204, + "padded": 816, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hashes": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "83c5da18bfa50812", + "hash_cont_tokens": "d0fbe030b8c8c2bf" + }, + "truncated": 0, + "non_truncated": 237, + "padded": 948, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_aging|5": { + "hashes": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "bebbd11f22006685", + "hash_cont_tokens": "1dd29c3755494850" + }, + "truncated": 0, + "non_truncated": 223, + "padded": 892, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_sexuality|5": { + "hashes": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "7b85ee9b8ee54f4f", + "hash_cont_tokens": "c85573f663c10691" + }, + "truncated": 0, + "non_truncated": 131, + "padded": 524, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-international_law|5": { + "hashes": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "7bfc55ab7065943e", + "hash_cont_tokens": "d263804ba918154f" + }, + "truncated": 0, + "non_truncated": 121, + "padded": 484, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-jurisprudence|5": { + "hashes": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "69573f1675e053c6", + "hash_cont_tokens": "581986691a84ece8" + }, + "truncated": 0, + "non_truncated": 108, + "padded": 432, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hashes": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "552324ef20094bdc", + "hash_cont_tokens": "55a858b28bbda458" + }, + "truncated": 0, + "non_truncated": 163, + "padded": 652, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-machine_learning|5": { + "hashes": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "96449357a7318905", + "hash_cont_tokens": "e99d3d3efd4ac7a3" + }, + "truncated": 0, + "non_truncated": 112, + "padded": 448, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-management|5": { + "hashes": { + "hash_examples": 
"8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "3b849249168e3b88", + "hash_cont_tokens": "13d9dc56bca34726" + }, + "truncated": 0, + "non_truncated": 103, + "padded": 412, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-marketing|5": { + "hashes": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "af0e186f2756b70d", + "hash_cont_tokens": "2700ea26933916a2" + }, + "truncated": 0, + "non_truncated": 234, + "padded": 936, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-medical_genetics|5": { + "hashes": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "9f6a6de16509b6d9", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-miscellaneous|5": { + "hashes": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "9194406d589f7c10", + "hash_cont_tokens": "7bf4341c79587250" + }, + "truncated": 0, + "non_truncated": 783, + "padded": 3132, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_disputes|5": { + "hashes": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "769486efc74d9f8e", + "hash_cont_tokens": "38a48e9de6976f00" + }, + "truncated": 0, + "non_truncated": 346, + "padded": 1384, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hashes": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "a90fd4dd90959dad", + "hash_cont_tokens": "761c4dc187689d89" + }, + "truncated": 0, + "non_truncated": 895, + "padded": 3580, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-nutrition|5": { + "hashes": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "1a3b843e66efd29b", + "hash_cont_tokens": "65005bd7d6f6012a" + }, + "truncated": 0, + "non_truncated": 306, + "padded": 1224, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-philosophy|5": { + "hashes": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "09820001a3d00013", + "hash_cont_tokens": "0b47934fb6314dec" + }, + "truncated": 0, + "non_truncated": 311, + "padded": 1244, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-prehistory|5": { + "hashes": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "7c4ec364ce2768c7", + "hash_cont_tokens": "3f20acd855ee0a29" + }, + "truncated": 0, + "non_truncated": 324, + "padded": 1296, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_accounting|5": { + "hashes": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "ced0534574d0ae3f", + "hash_cont_tokens": "8f122ba881355d4b" + }, + "truncated": 0, + 
"non_truncated": 282, + "padded": 1128, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_law|5": { + "hashes": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "bcbdbbde22ec73e3", + "hash_cont_tokens": "90d5df417c4d3fd3" + }, + "truncated": 0, + "non_truncated": 1534, + "padded": 6136, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_medicine|5": { + "hashes": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "c54d753563114d45", + "hash_cont_tokens": "4a2d2988884f7f70" + }, + "truncated": 0, + "non_truncated": 272, + "padded": 1088, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_psychology|5": { + "hashes": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "b75dc55c0e32fa52", + "hash_cont_tokens": "e0a952cb8a9c81de" + }, + "truncated": 0, + "non_truncated": 612, + "padded": 2448, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-public_relations|5": { + "hashes": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "5ccdc8ec8db99622", + "hash_cont_tokens": "1fa77a8dff3922b8" + }, + "truncated": 0, + "non_truncated": 110, + "padded": 440, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-security_studies|5": { + "hashes": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "ca8497342e5b1d57", + "hash_cont_tokens": "81fc9cb3cbdd52db" + }, + "truncated": 0, + "non_truncated": 245, + "padded": 980, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-sociology|5": { + "hashes": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "069c76424fbd3dab", + "hash_cont_tokens": "2a0493252ed2cf43" + }, + "truncated": 0, + "non_truncated": 201, + "padded": 804, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hashes": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "a7e393a626169576", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-virology|5": { + "hashes": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "bf99dc973e3a650d", + "hash_cont_tokens": "5ab892d003b00c98" + }, + "truncated": 0, + "non_truncated": 166, + "padded": 664, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-world_religions|5": { + "hashes": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "1761cfaf21797065", + "hash_cont_tokens": "15a5e5dbdfbb8568" + }, + "truncated": 0, + "non_truncated": 171, + "padded": 684, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|truthfulqa:mc|0": { + 
"hashes": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "298b43914bbdf4ca", + "hash_cont_tokens": "5a8d4bb398b1c3c0" + }, + "truncated": 0, + "non_truncated": 817, + "padded": 9996, + "non_padded": 0, + "effective_few_shots": 0.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "31aa3477d959f771", + "hash_cont_tokens": "618558fb93c0f288" + }, + "truncated": 0, + "non_truncated": 1267, + "padded": 2534, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "6af0ae8cfe684f50", + "hash_cont_tokens": "1498ff42b06e4878" + }, + "truncated": 0, + "non_truncated": 1319, + "padded": 0, + "non_padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "3b7fa57a057f9415", + "hash_full_prompts": "63615fc50fc9417c", + "hash_input_tokens": "9c04e828ae29cacc", + "hash_cont_tokens": "0624cf39f11abcd7" + }, + "truncated": 0, + "non_truncated": 28659, + "padded": 113460, + "non_padded": 1412, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/migtissera/SynthIA-7B-v1.3/results_2023-10-09T11-48-18.823660.json b/eval-results/migtissera/SynthIA-7B-v1.3/results_2023-10-09T11-48-18.823660.json new file mode 100644 index 0000000000000000000000000000000000000000..8545b12460bebde938d8843f0c1e055f9904ff10 --- /dev/null +++ b/eval-results/migtissera/SynthIA-7B-v1.3/results_2023-10-09T11-48-18.823660.json @@ -0,0 +1,1367 @@ +{ + "config_general": { + "model_name": "migtissera/SynthIA-7B-v1.3", + "model_sha": "8e6d0b18be876e0ebfff47d6c4f33d776f189971", + "model_size": "13.99 GB", + "model_dtype": "torch.float16", + "lighteval_sha": "0f318ecf002208468154899217b3ba7c6ae09374", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|arc:challenge|25": { + "acc": 0.5853242320819113, + "acc_stderr": 0.014397070564409174, + "acc_norm": 0.621160409556314, + "acc_norm_stderr": 0.014175915490000326 + }, + "harness|hellaswag|10": { + "acc": 0.6429994025094603, + "acc_stderr": 0.004781358113341955, + "acc_norm": 0.8344951204939255, + "acc_norm_stderr": 0.003708760752685524 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.32, + "acc_stderr": 0.046882617226215034, + "acc_norm": 0.32, + "acc_norm_stderr": 0.046882617226215034 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.6, + "acc_stderr": 0.04232073695151589, + "acc_norm": 0.6, + "acc_norm_stderr": 0.04232073695151589 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.6644736842105263, + "acc_stderr": 0.038424985593952694, + "acc_norm": 0.6644736842105263, + "acc_norm_stderr": 0.038424985593952694 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.6, + "acc_stderr": 0.04923659639173309, + "acc_norm": 0.6, + "acc_norm_stderr": 0.04923659639173309 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.6792452830188679, + "acc_stderr": 0.028727502957880267, + "acc_norm": 0.6792452830188679, + "acc_norm_stderr": 0.028727502957880267 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.7013888888888888, + "acc_stderr": 0.03827052357950756, + 
"acc_norm": 0.7013888888888888, + "acc_norm_stderr": 0.03827052357950756 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.46, + "acc_stderr": 0.05009082659620333, + "acc_norm": 0.46, + "acc_norm_stderr": 0.05009082659620333 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.51, + "acc_stderr": 0.05024183937956911, + "acc_norm": 0.51, + "acc_norm_stderr": 0.05024183937956911 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.35, + "acc_stderr": 0.047937248544110196, + "acc_norm": 0.35, + "acc_norm_stderr": 0.047937248544110196 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.630057803468208, + "acc_stderr": 0.0368122963339432, + "acc_norm": 0.630057803468208, + "acc_norm_stderr": 0.0368122963339432 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.3431372549019608, + "acc_stderr": 0.047240073523838876, + "acc_norm": 0.3431372549019608, + "acc_norm_stderr": 0.047240073523838876 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.79, + "acc_stderr": 0.04093601807403326, + "acc_norm": 0.79, + "acc_norm_stderr": 0.04093601807403326 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.5361702127659574, + "acc_stderr": 0.03260038511835771, + "acc_norm": 0.5361702127659574, + "acc_norm_stderr": 0.03260038511835771 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.4473684210526316, + "acc_stderr": 0.04677473004491199, + "acc_norm": 0.4473684210526316, + "acc_norm_stderr": 0.04677473004491199 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.5379310344827586, + "acc_stderr": 0.04154659671707548, + "acc_norm": 0.5379310344827586, + "acc_norm_stderr": 0.04154659671707548 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.4021164021164021, + "acc_stderr": 0.02525303255499769, + "acc_norm": 0.4021164021164021, + "acc_norm_stderr": 0.02525303255499769 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.42857142857142855, + "acc_stderr": 0.04426266681379909, + "acc_norm": 0.42857142857142855, + "acc_norm_stderr": 0.04426266681379909 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.31, + "acc_stderr": 0.04648231987117316, + "acc_norm": 0.31, + "acc_norm_stderr": 0.04648231987117316 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.7290322580645161, + "acc_stderr": 0.025284416114900156, + "acc_norm": 0.7290322580645161, + "acc_norm_stderr": 0.025284416114900156 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.4482758620689655, + "acc_stderr": 0.03499113137676744, + "acc_norm": 0.4482758620689655, + "acc_norm_stderr": 0.03499113137676744 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.69, + "acc_stderr": 0.04648231987117316, + "acc_norm": 0.69, + "acc_norm_stderr": 0.04648231987117316 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.7636363636363637, + "acc_stderr": 0.03317505930009181, + "acc_norm": 0.7636363636363637, + "acc_norm_stderr": 0.03317505930009181 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.7878787878787878, + "acc_stderr": 0.029126522834586815, + "acc_norm": 0.7878787878787878, + "acc_norm_stderr": 0.029126522834586815 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.8652849740932642, + "acc_stderr": 0.024639789097709443, + "acc_norm": 0.8652849740932642, + "acc_norm_stderr": 0.024639789097709443 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 
0.6358974358974359, + "acc_stderr": 0.024396672985094767, + "acc_norm": 0.6358974358974359, + "acc_norm_stderr": 0.024396672985094767 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.337037037037037, + "acc_stderr": 0.028820884666253255, + "acc_norm": 0.337037037037037, + "acc_norm_stderr": 0.028820884666253255 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.6428571428571429, + "acc_stderr": 0.031124619309328177, + "acc_norm": 0.6428571428571429, + "acc_norm_stderr": 0.031124619309328177 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.33774834437086093, + "acc_stderr": 0.03861557546255169, + "acc_norm": 0.33774834437086093, + "acc_norm_stderr": 0.03861557546255169 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.8201834862385321, + "acc_stderr": 0.016465345467391528, + "acc_norm": 0.8201834862385321, + "acc_norm_stderr": 0.016465345467391528 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.48148148148148145, + "acc_stderr": 0.034076320938540516, + "acc_norm": 0.48148148148148145, + "acc_norm_stderr": 0.034076320938540516 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.7892156862745098, + "acc_stderr": 0.0286265479124374, + "acc_norm": 0.7892156862745098, + "acc_norm_stderr": 0.0286265479124374 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.7805907172995781, + "acc_stderr": 0.026939106581553945, + "acc_norm": 0.7805907172995781, + "acc_norm_stderr": 0.026939106581553945 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.6905829596412556, + "acc_stderr": 0.03102441174057221, + "acc_norm": 0.6905829596412556, + "acc_norm_stderr": 0.03102441174057221 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.7862595419847328, + "acc_stderr": 0.0359546161177469, + "acc_norm": 0.7862595419847328, + "acc_norm_stderr": 0.0359546161177469 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.7933884297520661, + "acc_stderr": 0.03695980128098824, + "acc_norm": 0.7933884297520661, + "acc_norm_stderr": 0.03695980128098824 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.8148148148148148, + "acc_stderr": 0.03755265865037181, + "acc_norm": 0.8148148148148148, + "acc_norm_stderr": 0.03755265865037181 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.7423312883435583, + "acc_stderr": 0.03436150827846917, + "acc_norm": 0.7423312883435583, + "acc_norm_stderr": 0.03436150827846917 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.4732142857142857, + "acc_stderr": 0.047389751192741546, + "acc_norm": 0.4732142857142857, + "acc_norm_stderr": 0.047389751192741546 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.7864077669902912, + "acc_stderr": 0.040580420156460344, + "acc_norm": 0.7864077669902912, + "acc_norm_stderr": 0.040580420156460344 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.8675213675213675, + "acc_stderr": 0.022209309073165612, + "acc_norm": 0.8675213675213675, + "acc_norm_stderr": 0.022209309073165612 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.7, + "acc_stderr": 0.046056618647183814, + "acc_norm": 0.7, + "acc_norm_stderr": 0.046056618647183814 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.8071519795657727, + "acc_stderr": 0.014108533515757431, + "acc_norm": 0.8071519795657727, + "acc_norm_stderr": 0.014108533515757431 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.7023121387283237, + "acc_stderr": 
0.024617055388677, + "acc_norm": 0.7023121387283237, + "acc_norm_stderr": 0.024617055388677 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.4111731843575419, + "acc_stderr": 0.016456498033977512, + "acc_norm": 0.4111731843575419, + "acc_norm_stderr": 0.016456498033977512 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.6830065359477124, + "acc_stderr": 0.026643278474508755, + "acc_norm": 0.6830065359477124, + "acc_norm_stderr": 0.026643278474508755 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.7041800643086816, + "acc_stderr": 0.025922371788818763, + "acc_norm": 0.7041800643086816, + "acc_norm_stderr": 0.025922371788818763 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.7191358024691358, + "acc_stderr": 0.02500646975579921, + "acc_norm": 0.7191358024691358, + "acc_norm_stderr": 0.02500646975579921 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.4397163120567376, + "acc_stderr": 0.029609912075594106, + "acc_norm": 0.4397163120567376, + "acc_norm_stderr": 0.029609912075594106 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.46479791395045633, + "acc_stderr": 0.01273854737130396, + "acc_norm": 0.46479791395045633, + "acc_norm_stderr": 0.01273854737130396 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.6764705882352942, + "acc_stderr": 0.028418208619406755, + "acc_norm": 0.6764705882352942, + "acc_norm_stderr": 0.028418208619406755 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.6715686274509803, + "acc_stderr": 0.018999707383162666, + "acc_norm": 0.6715686274509803, + "acc_norm_stderr": 0.018999707383162666 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.6909090909090909, + "acc_stderr": 0.044262946482000985, + "acc_norm": 0.6909090909090909, + "acc_norm_stderr": 0.044262946482000985 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.6857142857142857, + "acc_stderr": 0.029719329422417475, + "acc_norm": 0.6857142857142857, + "acc_norm_stderr": 0.029719329422417475 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.845771144278607, + "acc_stderr": 0.025538433368578337, + "acc_norm": 0.845771144278607, + "acc_norm_stderr": 0.025538433368578337 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.82, + "acc_stderr": 0.038612291966536955, + "acc_norm": 0.82, + "acc_norm_stderr": 0.038612291966536955 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.5060240963855421, + "acc_stderr": 0.03892212195333045, + "acc_norm": 0.5060240963855421, + "acc_norm_stderr": 0.03892212195333045 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.8070175438596491, + "acc_stderr": 0.030267457554898458, + "acc_norm": 0.8070175438596491, + "acc_norm_stderr": 0.030267457554898458 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.34149326805385555, + "mc1_stderr": 0.016600688619950826, + "mc2": 0.5136907901080823, + "mc2_stderr": 0.015224284656166093 + }, + "all": { + "acc": 0.6260571760719941, + "acc_stderr": 0.03318501686714267, + "acc_norm": 0.6299102590458732, + "acc_norm_stderr": 0.03316308885976868, + "mc1": 0.34149326805385555, + "mc1_stderr": 0.016600688619950826, + "mc2": 0.5136907901080823, + "mc2_stderr": 0.015224284656166093 + } + }, + "versions": { + "harness|arc:challenge|25": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + 
"harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "all": 0 + }, + "config_tasks": { + "harness|arc:challenge": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + 
"harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task" + }, + "summary_tasks": { + "harness|arc:challenge|25": { + "hashes": { + "hash_examples": "17b0cae357c0259e", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "e43adcaa871b1364", + "hash_cont_tokens": "289aa98c400841d8" + }, + "truncated": 0, + "non-truncated": 4687, + "padded": 4684, + "non-padded": 3, + "effective_few_shots": 25.0, + "num_truncated_few_shots": 0 + }, + "harness|hellaswag|10": { + "hashes": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": 
"08da6b3d0798f3e5", + "hash_cont_tokens": "ac460260c3e6efc9" + }, + "truncated": 0, + "non-truncated": 40168, + "padded": 40039, + "non-padded": 129, + "effective_few_shots": 10.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hashes": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "5e2b26eb9b4d08bf", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-anatomy|5": { + "hashes": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "d33cda9df28030eb", + "hash_cont_tokens": "a52a4f60d98cbe5c" + }, + "truncated": 0, + "non-truncated": 540, + "padded": 540, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-astronomy|5": { + "hashes": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "0dd50c500d64c57d", + "hash_cont_tokens": "10f7d8eeba97841d" + }, + "truncated": 0, + "non-truncated": 608, + "padded": 608, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-business_ethics|5": { + "hashes": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "40b524d0df3defc2", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hashes": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "1f87d12d677e0dfd", + "hash_cont_tokens": "edef9975ba9165b5" + }, + "truncated": 0, + "non-truncated": 1060, + "padded": 1056, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_biology|5": { + "hashes": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "dd6d69d8b13afbeb", + "hash_cont_tokens": "0aa103ec6602280b" + }, + "truncated": 0, + "non-truncated": 576, + "padded": 572, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_chemistry|5": { + "hashes": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "d45f3c401a00e97e", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_computer_science|5": { + "hashes": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "c04f21d954ae67b2", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_mathematics|5": { + "hashes": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "e7de03b4e1a407d8", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + 
"effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_medicine|5": { + "hashes": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "9ce9516475f0b09c", + "hash_cont_tokens": "1979021dbc698754" + }, + "truncated": 0, + "non-truncated": 692, + "padded": 684, + "non-padded": 8, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_physics|5": { + "hashes": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "f749592a0d6c967d", + "hash_cont_tokens": "7cf7fe2bab00acbd" + }, + "truncated": 0, + "non-truncated": 408, + "padded": 404, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-computer_security|5": { + "hashes": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "1a6dccf2066f3598", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hashes": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "6ce98c8aec8e7514", + "hash_cont_tokens": "903f64eed2b0d217" + }, + "truncated": 0, + "non-truncated": 940, + "padded": 940, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-econometrics|5": { + "hashes": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "7794b03bf6b9bb11", + "hash_cont_tokens": "721ae6c5302c4bf2" + }, + "truncated": 0, + "non-truncated": 456, + "padded": 456, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hashes": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "e47ff85e05850517", + "hash_cont_tokens": "15a738960ed3e587" + }, + "truncated": 0, + "non-truncated": 580, + "padded": 580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hashes": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "2ce6901704311790", + "hash_cont_tokens": "c96470462fc71683" + }, + "truncated": 0, + "non-truncated": 1512, + "padded": 1512, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-formal_logic|5": { + "hashes": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "fa49c3faa72a3955", + "hash_cont_tokens": "0e1ce025c9d6ee7e" + }, + "truncated": 0, + "non-truncated": 504, + "padded": 504, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-global_facts|5": { + "hashes": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "38992a391c7040d5", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 396, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_biology|5": { + "hashes": { + "hash_examples": 
"a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "4944fad6e0578120", + "hash_cont_tokens": "e34d57f7d3c4ca16" + }, + "truncated": 0, + "non-truncated": 1240, + "padded": 1240, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hashes": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "bec955dfccee0331", + "hash_cont_tokens": "e8482d44df4b3740" + }, + "truncated": 0, + "non-truncated": 812, + "padded": 796, + "non-padded": 16, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hashes": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "2ccfe020e0a8e824", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hashes": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "5e5e8bf3808e0ead", + "hash_cont_tokens": "d63e679a49418339" + }, + "truncated": 0, + "non-truncated": 660, + "padded": 656, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_geography|5": { + "hashes": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "6a624d76e1b40f9d", + "hash_cont_tokens": "d78483e286d06f1a" + }, + "truncated": 0, + "non-truncated": 792, + "padded": 792, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hashes": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "8340aed0285230f4", + "hash_cont_tokens": "691cdff71ff5fe57" + }, + "truncated": 0, + "non-truncated": 772, + "padded": 772, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hashes": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "ca47137b1f3a769c", + "hash_cont_tokens": "d5ad4c5bdca967ad" + }, + "truncated": 0, + "non-truncated": 1560, + "padded": 1560, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hashes": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "c9d341ab62890f30", + "hash_cont_tokens": "8f631ca5687dd0d4" + }, + "truncated": 0, + "non-truncated": 1080, + "padded": 1080, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hashes": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "62573d06618ae7df", + "hash_cont_tokens": "7321048a28451473" + }, + "truncated": 0, + "non-truncated": 952, + "padded": 952, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_physics|5": { + "hashes": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + 
"hash_input_tokens": "ddddcaae96263221", + "hash_cont_tokens": "bb137581f269861c" + }, + "truncated": 0, + "non-truncated": 604, + "padded": 604, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hashes": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "ef9c1ae343139fdd", + "hash_cont_tokens": "b455cab2675bd863" + }, + "truncated": 0, + "non-truncated": 2180, + "padded": 2161, + "non-padded": 19, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hashes": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "eb4abd87b0e863cc", + "hash_cont_tokens": "1b3196fec7e58037" + }, + "truncated": 0, + "non-truncated": 864, + "padded": 864, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hashes": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "63548c7fa9ba7a78", + "hash_cont_tokens": "a331dedc2aa01b3e" + }, + "truncated": 0, + "non-truncated": 816, + "padded": 816, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hashes": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "83c5da18bfa50812", + "hash_cont_tokens": "d0fbe030b8c8c2bf" + }, + "truncated": 0, + "non-truncated": 948, + "padded": 948, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_aging|5": { + "hashes": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "c93c778cb8c58a32", + "hash_cont_tokens": "1dd29c3755494850" + }, + "truncated": 0, + "non-truncated": 892, + "padded": 892, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_sexuality|5": { + "hashes": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "1daed91f54b42f7d", + "hash_cont_tokens": "c85573f663c10691" + }, + "truncated": 0, + "non-truncated": 524, + "padded": 524, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-international_law|5": { + "hashes": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "cfdae69f75ee8670", + "hash_cont_tokens": "d263804ba918154f" + }, + "truncated": 0, + "non-truncated": 484, + "padded": 484, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-jurisprudence|5": { + "hashes": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "173979adbb5ab44e", + "hash_cont_tokens": "581986691a84ece8" + }, + "truncated": 0, + "non-truncated": 432, + "padded": 432, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hashes": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "7b7d06271aff55ff", + "hash_cont_tokens": "55a858b28bbda458" + }, + "truncated": 0, + "non-truncated": 652, + 
"padded": 652, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-machine_learning|5": { + "hashes": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "ca062cfd7c7fddcb", + "hash_cont_tokens": "e99d3d3efd4ac7a3" + }, + "truncated": 0, + "non-truncated": 448, + "padded": 445, + "non-padded": 3, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-management|5": { + "hashes": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "fc47171ffb714da3", + "hash_cont_tokens": "13d9dc56bca34726" + }, + "truncated": 0, + "non-truncated": 412, + "padded": 412, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-marketing|5": { + "hashes": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "aa29e9d883670c8f", + "hash_cont_tokens": "2700ea26933916a2" + }, + "truncated": 0, + "non-truncated": 936, + "padded": 936, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-medical_genetics|5": { + "hashes": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "88ad044b653ecaa5", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-miscellaneous|5": { + "hashes": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "f9e7e01573277484", + "hash_cont_tokens": "7bf4341c79587250" + }, + "truncated": 0, + "non-truncated": 3132, + "padded": 3132, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_disputes|5": { + "hashes": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "03728b9e48594c28", + "hash_cont_tokens": "38a48e9de6976f00" + }, + "truncated": 0, + "non-truncated": 1384, + "padded": 1360, + "non-padded": 24, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hashes": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "04a903966514d177", + "hash_cont_tokens": "761c4dc187689d89" + }, + "truncated": 0, + "non-truncated": 3580, + "padded": 3580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-nutrition|5": { + "hashes": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "a2176d3ac6f01cf0", + "hash_cont_tokens": "65005bd7d6f6012a" + }, + "truncated": 0, + "non-truncated": 1224, + "padded": 1224, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-philosophy|5": { + "hashes": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "a96dc872948245a8", + "hash_cont_tokens": "0b47934fb6314dec" + }, + "truncated": 0, + "non-truncated": 1244, + "padded": 1244, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-prehistory|5": { + "hashes": { + "hash_examples": 
"8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "e0b03637947e9efa", + "hash_cont_tokens": "3f20acd855ee0a29" + }, + "truncated": 0, + "non-truncated": 1296, + "padded": 1296, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_accounting|5": { + "hashes": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "0b4c6d0e49c47ab4", + "hash_cont_tokens": "8f122ba881355d4b" + }, + "truncated": 0, + "non-truncated": 1128, + "padded": 1128, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_law|5": { + "hashes": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "bcbdbbde22ec73e3", + "hash_cont_tokens": "90d5df417c4d3fd3" + }, + "truncated": 0, + "non-truncated": 6136, + "padded": 6136, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_medicine|5": { + "hashes": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "c54d753563114d45", + "hash_cont_tokens": "4a2d2988884f7f70" + }, + "truncated": 0, + "non-truncated": 1088, + "padded": 1088, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_psychology|5": { + "hashes": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "9e6e34f48034edc0", + "hash_cont_tokens": "e0a952cb8a9c81de" + }, + "truncated": 0, + "non-truncated": 2448, + "padded": 2448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-public_relations|5": { + "hashes": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "634feb3f97d1064d", + "hash_cont_tokens": "1fa77a8dff3922b8" + }, + "truncated": 0, + "non-truncated": 440, + "padded": 440, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-security_studies|5": { + "hashes": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "ca8497342e5b1d57", + "hash_cont_tokens": "81fc9cb3cbdd52db" + }, + "truncated": 0, + "non-truncated": 980, + "padded": 980, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-sociology|5": { + "hashes": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "ae361375c940a0fb", + "hash_cont_tokens": "2a0493252ed2cf43" + }, + "truncated": 0, + "non-truncated": 804, + "padded": 800, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hashes": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "e8bdf33cf82d89f5", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-virology|5": { + "hashes": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "32ce831e0ba2d2e2", + "hash_cont_tokens": "5ab892d003b00c98" 
+ }, + "truncated": 0, + "non-truncated": 664, + "padded": 664, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-world_religions|5": { + "hashes": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "4ed9b68c5694211b", + "hash_cont_tokens": "15a5e5dbdfbb8568" + }, + "truncated": 0, + "non-truncated": 684, + "padded": 684, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|truthfulqa:mc|0": { + "hashes": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "a30fbd9af05d717a", + "hash_cont_tokens": "5a8d4bb398b1c3c0" + }, + "truncated": 0, + "non-truncated": 9996, + "padded": 9996, + "non-padded": 0, + "effective_few_shots": 0.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "d84d18e9a963753d", + "hash_full_prompts": "12b540783521a8e6", + "hash_input_tokens": "3d86ffeb7677bd9d", + "hash_cont_tokens": "35527140510ee91a" + }, + "total_evaluation_time_secondes": "4130.939360141754", + "truncated": 0, + "non-truncated": 111019, + "padded": 110793, + "non-padded": 226, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/migtissera/SynthIA-7B-v1.3/results_2023-10-09T11-58-55.532772.json b/eval-results/migtissera/SynthIA-7B-v1.3/results_2023-10-09T11-58-55.532772.json new file mode 100644 index 0000000000000000000000000000000000000000..1bdff434aa20ba4e721b278a941e1f1e5d7cc43c --- /dev/null +++ b/eval-results/migtissera/SynthIA-7B-v1.3/results_2023-10-09T11-58-55.532772.json @@ -0,0 +1,1367 @@ +{ + "config_general": { + "model_name": "migtissera/SynthIA-7B-v1.3", + "model_sha": "8e6d0b18be876e0ebfff47d6c4f33d776f189971", + "model_size": "13.99 GB", + "model_dtype": "torch.float16", + "lighteval_sha": "0f318ecf002208468154899217b3ba7c6ae09374", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|arc:challenge|25": { + "acc": 0.5853242320819113, + "acc_stderr": 0.014397070564409174, + "acc_norm": 0.621160409556314, + "acc_norm_stderr": 0.014175915490000326 + }, + "harness|hellaswag|10": { + "acc": 0.6429994025094603, + "acc_stderr": 0.004781358113341955, + "acc_norm": 0.8344951204939255, + "acc_norm_stderr": 0.003708760752685524 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.32, + "acc_stderr": 0.046882617226215034, + "acc_norm": 0.32, + "acc_norm_stderr": 0.046882617226215034 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.6, + "acc_stderr": 0.04232073695151589, + "acc_norm": 0.6, + "acc_norm_stderr": 0.04232073695151589 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.6644736842105263, + "acc_stderr": 0.038424985593952694, + "acc_norm": 0.6644736842105263, + "acc_norm_stderr": 0.038424985593952694 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.6, + "acc_stderr": 0.04923659639173309, + "acc_norm": 0.6, + "acc_norm_stderr": 0.04923659639173309 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.6792452830188679, + "acc_stderr": 0.028727502957880267, + "acc_norm": 0.6792452830188679, + "acc_norm_stderr": 0.028727502957880267 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.7013888888888888, + "acc_stderr": 0.03827052357950756, + "acc_norm": 0.7013888888888888, + "acc_norm_stderr": 0.03827052357950756 + }, + 
"harness|hendrycksTest-college_chemistry|5": { + "acc": 0.46, + "acc_stderr": 0.05009082659620333, + "acc_norm": 0.46, + "acc_norm_stderr": 0.05009082659620333 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.51, + "acc_stderr": 0.05024183937956911, + "acc_norm": 0.51, + "acc_norm_stderr": 0.05024183937956911 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.35, + "acc_stderr": 0.047937248544110196, + "acc_norm": 0.35, + "acc_norm_stderr": 0.047937248544110196 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.630057803468208, + "acc_stderr": 0.0368122963339432, + "acc_norm": 0.630057803468208, + "acc_norm_stderr": 0.0368122963339432 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.3431372549019608, + "acc_stderr": 0.047240073523838876, + "acc_norm": 0.3431372549019608, + "acc_norm_stderr": 0.047240073523838876 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.79, + "acc_stderr": 0.04093601807403326, + "acc_norm": 0.79, + "acc_norm_stderr": 0.04093601807403326 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.5361702127659574, + "acc_stderr": 0.03260038511835771, + "acc_norm": 0.5361702127659574, + "acc_norm_stderr": 0.03260038511835771 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.4473684210526316, + "acc_stderr": 0.04677473004491199, + "acc_norm": 0.4473684210526316, + "acc_norm_stderr": 0.04677473004491199 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.5379310344827586, + "acc_stderr": 0.04154659671707548, + "acc_norm": 0.5379310344827586, + "acc_norm_stderr": 0.04154659671707548 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.4021164021164021, + "acc_stderr": 0.02525303255499769, + "acc_norm": 0.4021164021164021, + "acc_norm_stderr": 0.02525303255499769 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.42857142857142855, + "acc_stderr": 0.04426266681379909, + "acc_norm": 0.42857142857142855, + "acc_norm_stderr": 0.04426266681379909 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.31, + "acc_stderr": 0.04648231987117316, + "acc_norm": 0.31, + "acc_norm_stderr": 0.04648231987117316 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.7290322580645161, + "acc_stderr": 0.025284416114900156, + "acc_norm": 0.7290322580645161, + "acc_norm_stderr": 0.025284416114900156 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.4482758620689655, + "acc_stderr": 0.03499113137676744, + "acc_norm": 0.4482758620689655, + "acc_norm_stderr": 0.03499113137676744 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.69, + "acc_stderr": 0.04648231987117316, + "acc_norm": 0.69, + "acc_norm_stderr": 0.04648231987117316 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.7636363636363637, + "acc_stderr": 0.03317505930009181, + "acc_norm": 0.7636363636363637, + "acc_norm_stderr": 0.03317505930009181 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.7878787878787878, + "acc_stderr": 0.029126522834586815, + "acc_norm": 0.7878787878787878, + "acc_norm_stderr": 0.029126522834586815 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.8652849740932642, + "acc_stderr": 0.024639789097709443, + "acc_norm": 0.8652849740932642, + "acc_norm_stderr": 0.024639789097709443 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.6358974358974359, + "acc_stderr": 0.024396672985094767, + "acc_norm": 
0.6358974358974359, + "acc_norm_stderr": 0.024396672985094767 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.337037037037037, + "acc_stderr": 0.028820884666253255, + "acc_norm": 0.337037037037037, + "acc_norm_stderr": 0.028820884666253255 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.6428571428571429, + "acc_stderr": 0.031124619309328177, + "acc_norm": 0.6428571428571429, + "acc_norm_stderr": 0.031124619309328177 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.33774834437086093, + "acc_stderr": 0.03861557546255169, + "acc_norm": 0.33774834437086093, + "acc_norm_stderr": 0.03861557546255169 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.8201834862385321, + "acc_stderr": 0.016465345467391528, + "acc_norm": 0.8201834862385321, + "acc_norm_stderr": 0.016465345467391528 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.48148148148148145, + "acc_stderr": 0.034076320938540516, + "acc_norm": 0.48148148148148145, + "acc_norm_stderr": 0.034076320938540516 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.7892156862745098, + "acc_stderr": 0.0286265479124374, + "acc_norm": 0.7892156862745098, + "acc_norm_stderr": 0.0286265479124374 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.7805907172995781, + "acc_stderr": 0.026939106581553945, + "acc_norm": 0.7805907172995781, + "acc_norm_stderr": 0.026939106581553945 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.6905829596412556, + "acc_stderr": 0.03102441174057221, + "acc_norm": 0.6905829596412556, + "acc_norm_stderr": 0.03102441174057221 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.7862595419847328, + "acc_stderr": 0.0359546161177469, + "acc_norm": 0.7862595419847328, + "acc_norm_stderr": 0.0359546161177469 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.7933884297520661, + "acc_stderr": 0.03695980128098824, + "acc_norm": 0.7933884297520661, + "acc_norm_stderr": 0.03695980128098824 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.8148148148148148, + "acc_stderr": 0.03755265865037181, + "acc_norm": 0.8148148148148148, + "acc_norm_stderr": 0.03755265865037181 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.7423312883435583, + "acc_stderr": 0.03436150827846917, + "acc_norm": 0.7423312883435583, + "acc_norm_stderr": 0.03436150827846917 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.4732142857142857, + "acc_stderr": 0.047389751192741546, + "acc_norm": 0.4732142857142857, + "acc_norm_stderr": 0.047389751192741546 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.7864077669902912, + "acc_stderr": 0.040580420156460344, + "acc_norm": 0.7864077669902912, + "acc_norm_stderr": 0.040580420156460344 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.8675213675213675, + "acc_stderr": 0.022209309073165612, + "acc_norm": 0.8675213675213675, + "acc_norm_stderr": 0.022209309073165612 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.7, + "acc_stderr": 0.046056618647183814, + "acc_norm": 0.7, + "acc_norm_stderr": 0.046056618647183814 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.8071519795657727, + "acc_stderr": 0.014108533515757431, + "acc_norm": 0.8071519795657727, + "acc_norm_stderr": 0.014108533515757431 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.7023121387283237, + "acc_stderr": 0.024617055388677, + "acc_norm": 0.7023121387283237, + "acc_norm_stderr": 
0.024617055388677 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.4111731843575419, + "acc_stderr": 0.016456498033977512, + "acc_norm": 0.4111731843575419, + "acc_norm_stderr": 0.016456498033977512 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.6830065359477124, + "acc_stderr": 0.026643278474508755, + "acc_norm": 0.6830065359477124, + "acc_norm_stderr": 0.026643278474508755 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.7041800643086816, + "acc_stderr": 0.025922371788818763, + "acc_norm": 0.7041800643086816, + "acc_norm_stderr": 0.025922371788818763 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.7191358024691358, + "acc_stderr": 0.02500646975579921, + "acc_norm": 0.7191358024691358, + "acc_norm_stderr": 0.02500646975579921 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.4397163120567376, + "acc_stderr": 0.029609912075594106, + "acc_norm": 0.4397163120567376, + "acc_norm_stderr": 0.029609912075594106 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.46479791395045633, + "acc_stderr": 0.01273854737130396, + "acc_norm": 0.46479791395045633, + "acc_norm_stderr": 0.01273854737130396 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.6764705882352942, + "acc_stderr": 0.028418208619406755, + "acc_norm": 0.6764705882352942, + "acc_norm_stderr": 0.028418208619406755 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.6715686274509803, + "acc_stderr": 0.018999707383162666, + "acc_norm": 0.6715686274509803, + "acc_norm_stderr": 0.018999707383162666 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.6909090909090909, + "acc_stderr": 0.044262946482000985, + "acc_norm": 0.6909090909090909, + "acc_norm_stderr": 0.044262946482000985 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.6857142857142857, + "acc_stderr": 0.029719329422417475, + "acc_norm": 0.6857142857142857, + "acc_norm_stderr": 0.029719329422417475 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.845771144278607, + "acc_stderr": 0.025538433368578337, + "acc_norm": 0.845771144278607, + "acc_norm_stderr": 0.025538433368578337 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.82, + "acc_stderr": 0.038612291966536955, + "acc_norm": 0.82, + "acc_norm_stderr": 0.038612291966536955 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.5060240963855421, + "acc_stderr": 0.03892212195333045, + "acc_norm": 0.5060240963855421, + "acc_norm_stderr": 0.03892212195333045 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.8070175438596491, + "acc_stderr": 0.030267457554898458, + "acc_norm": 0.8070175438596491, + "acc_norm_stderr": 0.030267457554898458 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.34149326805385555, + "mc1_stderr": 0.016600688619950826, + "mc2": 0.5136907901080823, + "mc2_stderr": 0.015224284656166093 + }, + "all": { + "acc": 0.6260571760719941, + "acc_stderr": 0.03318501686714267, + "acc_norm": 0.6299102590458732, + "acc_norm_stderr": 0.03316308885976868, + "mc1": 0.34149326805385555, + "mc1_stderr": 0.016600688619950826, + "mc2": 0.5136907901080823, + "mc2_stderr": 0.015224284656166093 + } + }, + "versions": { + "harness|arc:challenge|25": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + 
"harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "all": 0 + }, + "config_tasks": { + "harness|arc:challenge": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness 
task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task" + }, + "summary_tasks": { + "harness|arc:challenge|25": { + "hashes": { + "hash_examples": "17b0cae357c0259e", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "e43adcaa871b1364", + "hash_cont_tokens": "289aa98c400841d8" + }, + "truncated": 0, + "non-truncated": 4687, + "padded": 4684, + "non-padded": 3, + "effective_few_shots": 25.0, + "num_truncated_few_shots": 0 + }, + "harness|hellaswag|10": { + "hashes": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "08da6b3d0798f3e5", + "hash_cont_tokens": "ac460260c3e6efc9" + }, + "truncated": 0, + "non-truncated": 40168, + "padded": 
40039, + "non-padded": 129, + "effective_few_shots": 10.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hashes": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "5e2b26eb9b4d08bf", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-anatomy|5": { + "hashes": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "d33cda9df28030eb", + "hash_cont_tokens": "a52a4f60d98cbe5c" + }, + "truncated": 0, + "non-truncated": 540, + "padded": 540, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-astronomy|5": { + "hashes": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "0dd50c500d64c57d", + "hash_cont_tokens": "10f7d8eeba97841d" + }, + "truncated": 0, + "non-truncated": 608, + "padded": 608, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-business_ethics|5": { + "hashes": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "40b524d0df3defc2", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hashes": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "1f87d12d677e0dfd", + "hash_cont_tokens": "edef9975ba9165b5" + }, + "truncated": 0, + "non-truncated": 1060, + "padded": 1056, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_biology|5": { + "hashes": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "dd6d69d8b13afbeb", + "hash_cont_tokens": "0aa103ec6602280b" + }, + "truncated": 0, + "non-truncated": 576, + "padded": 572, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_chemistry|5": { + "hashes": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "d45f3c401a00e97e", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_computer_science|5": { + "hashes": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "c04f21d954ae67b2", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_mathematics|5": { + "hashes": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "e7de03b4e1a407d8", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_medicine|5": { + "hashes": { + 
"hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "9ce9516475f0b09c", + "hash_cont_tokens": "1979021dbc698754" + }, + "truncated": 0, + "non-truncated": 692, + "padded": 684, + "non-padded": 8, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_physics|5": { + "hashes": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "f749592a0d6c967d", + "hash_cont_tokens": "7cf7fe2bab00acbd" + }, + "truncated": 0, + "non-truncated": 408, + "padded": 404, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-computer_security|5": { + "hashes": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "1a6dccf2066f3598", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hashes": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "6ce98c8aec8e7514", + "hash_cont_tokens": "903f64eed2b0d217" + }, + "truncated": 0, + "non-truncated": 940, + "padded": 940, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-econometrics|5": { + "hashes": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "7794b03bf6b9bb11", + "hash_cont_tokens": "721ae6c5302c4bf2" + }, + "truncated": 0, + "non-truncated": 456, + "padded": 456, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hashes": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "e47ff85e05850517", + "hash_cont_tokens": "15a738960ed3e587" + }, + "truncated": 0, + "non-truncated": 580, + "padded": 580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hashes": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "2ce6901704311790", + "hash_cont_tokens": "c96470462fc71683" + }, + "truncated": 0, + "non-truncated": 1512, + "padded": 1512, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-formal_logic|5": { + "hashes": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "fa49c3faa72a3955", + "hash_cont_tokens": "0e1ce025c9d6ee7e" + }, + "truncated": 0, + "non-truncated": 504, + "padded": 504, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-global_facts|5": { + "hashes": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "38992a391c7040d5", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 396, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_biology|5": { + "hashes": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "4944fad6e0578120", + "hash_cont_tokens": 
"e34d57f7d3c4ca16" + }, + "truncated": 0, + "non-truncated": 1240, + "padded": 1240, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hashes": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "bec955dfccee0331", + "hash_cont_tokens": "e8482d44df4b3740" + }, + "truncated": 0, + "non-truncated": 812, + "padded": 796, + "non-padded": 16, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hashes": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "2ccfe020e0a8e824", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hashes": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "5e5e8bf3808e0ead", + "hash_cont_tokens": "d63e679a49418339" + }, + "truncated": 0, + "non-truncated": 660, + "padded": 656, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_geography|5": { + "hashes": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "6a624d76e1b40f9d", + "hash_cont_tokens": "d78483e286d06f1a" + }, + "truncated": 0, + "non-truncated": 792, + "padded": 792, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hashes": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "8340aed0285230f4", + "hash_cont_tokens": "691cdff71ff5fe57" + }, + "truncated": 0, + "non-truncated": 772, + "padded": 772, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hashes": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "ca47137b1f3a769c", + "hash_cont_tokens": "d5ad4c5bdca967ad" + }, + "truncated": 0, + "non-truncated": 1560, + "padded": 1560, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hashes": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "c9d341ab62890f30", + "hash_cont_tokens": "8f631ca5687dd0d4" + }, + "truncated": 0, + "non-truncated": 1080, + "padded": 1080, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hashes": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "62573d06618ae7df", + "hash_cont_tokens": "7321048a28451473" + }, + "truncated": 0, + "non-truncated": 952, + "padded": 952, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_physics|5": { + "hashes": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "ddddcaae96263221", + "hash_cont_tokens": "bb137581f269861c" + }, + "truncated": 0, + "non-truncated": 604, + 
"padded": 604, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hashes": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "ef9c1ae343139fdd", + "hash_cont_tokens": "b455cab2675bd863" + }, + "truncated": 0, + "non-truncated": 2180, + "padded": 2161, + "non-padded": 19, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hashes": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "eb4abd87b0e863cc", + "hash_cont_tokens": "1b3196fec7e58037" + }, + "truncated": 0, + "non-truncated": 864, + "padded": 864, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hashes": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "63548c7fa9ba7a78", + "hash_cont_tokens": "a331dedc2aa01b3e" + }, + "truncated": 0, + "non-truncated": 816, + "padded": 816, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hashes": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "83c5da18bfa50812", + "hash_cont_tokens": "d0fbe030b8c8c2bf" + }, + "truncated": 0, + "non-truncated": 948, + "padded": 948, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_aging|5": { + "hashes": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "c93c778cb8c58a32", + "hash_cont_tokens": "1dd29c3755494850" + }, + "truncated": 0, + "non-truncated": 892, + "padded": 892, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_sexuality|5": { + "hashes": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "1daed91f54b42f7d", + "hash_cont_tokens": "c85573f663c10691" + }, + "truncated": 0, + "non-truncated": 524, + "padded": 524, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-international_law|5": { + "hashes": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "cfdae69f75ee8670", + "hash_cont_tokens": "d263804ba918154f" + }, + "truncated": 0, + "non-truncated": 484, + "padded": 484, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-jurisprudence|5": { + "hashes": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "173979adbb5ab44e", + "hash_cont_tokens": "581986691a84ece8" + }, + "truncated": 0, + "non-truncated": 432, + "padded": 432, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hashes": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "7b7d06271aff55ff", + "hash_cont_tokens": "55a858b28bbda458" + }, + "truncated": 0, + "non-truncated": 652, + "padded": 652, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + 
"harness|hendrycksTest-machine_learning|5": { + "hashes": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "ca062cfd7c7fddcb", + "hash_cont_tokens": "e99d3d3efd4ac7a3" + }, + "truncated": 0, + "non-truncated": 448, + "padded": 445, + "non-padded": 3, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-management|5": { + "hashes": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "fc47171ffb714da3", + "hash_cont_tokens": "13d9dc56bca34726" + }, + "truncated": 0, + "non-truncated": 412, + "padded": 412, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-marketing|5": { + "hashes": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "aa29e9d883670c8f", + "hash_cont_tokens": "2700ea26933916a2" + }, + "truncated": 0, + "non-truncated": 936, + "padded": 936, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-medical_genetics|5": { + "hashes": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "88ad044b653ecaa5", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-miscellaneous|5": { + "hashes": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "f9e7e01573277484", + "hash_cont_tokens": "7bf4341c79587250" + }, + "truncated": 0, + "non-truncated": 3132, + "padded": 3132, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_disputes|5": { + "hashes": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "03728b9e48594c28", + "hash_cont_tokens": "38a48e9de6976f00" + }, + "truncated": 0, + "non-truncated": 1384, + "padded": 1360, + "non-padded": 24, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hashes": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "04a903966514d177", + "hash_cont_tokens": "761c4dc187689d89" + }, + "truncated": 0, + "non-truncated": 3580, + "padded": 3580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-nutrition|5": { + "hashes": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "a2176d3ac6f01cf0", + "hash_cont_tokens": "65005bd7d6f6012a" + }, + "truncated": 0, + "non-truncated": 1224, + "padded": 1224, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-philosophy|5": { + "hashes": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "a96dc872948245a8", + "hash_cont_tokens": "0b47934fb6314dec" + }, + "truncated": 0, + "non-truncated": 1244, + "padded": 1244, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-prehistory|5": { + "hashes": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "e0b03637947e9efa", + 
"hash_cont_tokens": "3f20acd855ee0a29" + }, + "truncated": 0, + "non-truncated": 1296, + "padded": 1296, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_accounting|5": { + "hashes": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "0b4c6d0e49c47ab4", + "hash_cont_tokens": "8f122ba881355d4b" + }, + "truncated": 0, + "non-truncated": 1128, + "padded": 1128, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_law|5": { + "hashes": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "bcbdbbde22ec73e3", + "hash_cont_tokens": "90d5df417c4d3fd3" + }, + "truncated": 0, + "non-truncated": 6136, + "padded": 6136, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_medicine|5": { + "hashes": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "c54d753563114d45", + "hash_cont_tokens": "4a2d2988884f7f70" + }, + "truncated": 0, + "non-truncated": 1088, + "padded": 1088, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_psychology|5": { + "hashes": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "9e6e34f48034edc0", + "hash_cont_tokens": "e0a952cb8a9c81de" + }, + "truncated": 0, + "non-truncated": 2448, + "padded": 2448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-public_relations|5": { + "hashes": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "634feb3f97d1064d", + "hash_cont_tokens": "1fa77a8dff3922b8" + }, + "truncated": 0, + "non-truncated": 440, + "padded": 440, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-security_studies|5": { + "hashes": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "ca8497342e5b1d57", + "hash_cont_tokens": "81fc9cb3cbdd52db" + }, + "truncated": 0, + "non-truncated": 980, + "padded": 980, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-sociology|5": { + "hashes": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "ae361375c940a0fb", + "hash_cont_tokens": "2a0493252ed2cf43" + }, + "truncated": 0, + "non-truncated": 804, + "padded": 800, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hashes": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "e8bdf33cf82d89f5", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-virology|5": { + "hashes": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "32ce831e0ba2d2e2", + "hash_cont_tokens": "5ab892d003b00c98" + }, + "truncated": 0, + "non-truncated": 664, + "padded": 664, + "non-padded": 0, + "effective_few_shots": 
5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-world_religions|5": { + "hashes": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "4ed9b68c5694211b", + "hash_cont_tokens": "15a5e5dbdfbb8568" + }, + "truncated": 0, + "non-truncated": 684, + "padded": 684, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|truthfulqa:mc|0": { + "hashes": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "a30fbd9af05d717a", + "hash_cont_tokens": "5a8d4bb398b1c3c0" + }, + "truncated": 0, + "non-truncated": 9996, + "padded": 9996, + "non-padded": 0, + "effective_few_shots": 0.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "d84d18e9a963753d", + "hash_full_prompts": "12b540783521a8e6", + "hash_input_tokens": "3d86ffeb7677bd9d", + "hash_cont_tokens": "35527140510ee91a" + }, + "total_evaluation_time_secondes": "4139.346848726273", + "truncated": 0, + "non-truncated": 111019, + "padded": 110793, + "non-padded": 226, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/migtissera/SynthIA-7B-v1.3/results_2023-10-28T14-24-19.449160.json b/eval-results/migtissera/SynthIA-7B-v1.3/results_2023-10-28T14-24-19.449160.json new file mode 100644 index 0000000000000000000000000000000000000000..2db3e5fbf660529aefcc066ee693cf18625b5aa2 --- /dev/null +++ b/eval-results/migtissera/SynthIA-7B-v1.3/results_2023-10-28T14-24-19.449160.json @@ -0,0 +1,107 @@ +{ + "config_general": { + "model_name": "migtissera/SynthIA-7B-v1.3", + "model_sha": "9ffb6b55202b887084f33e34dd4dbf97e4e928c6", + "model_size": "13.99 GB", + "model_dtype": "torch.float16", + "lighteval_sha": "0f318ecf002208468154899217b3ba7c6ae09374", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|drop|3": { + "em": 0.34375, + "em_stderr": 0.004864023482291936, + "f1": 0.43760067114094225, + "f1_stderr": 0.004666454920595155 + }, + "harness|gsm8k|5": { + "acc": 0.17589082638362397, + "acc_stderr": 0.010487120635539617 + }, + "harness|winogrande|5": { + "acc": 0.7884767166535123, + "acc_stderr": 0.011477747684223188 + }, + "all": { + "em": 0.34375, + "em_stderr": 0.004864023482291936, + "f1": 0.43760067114094225, + "f1_stderr": 0.004666454920595155, + "acc": 0.4821837715185681, + "acc_stderr": 0.010982434159881403 + } + }, + "versions": { + "harness|drop|3": 1, + "harness|gsm8k|5": 0, + "harness|winogrande|5": 0, + "all": 0 + }, + "config_tasks": { + "harness|drop": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|drop|3": { + "hashes": { + "hash_examples": "1d27416e8324e9a3", + "hash_full_prompts": "a5513ff9a741b385", + "hash_input_tokens": "a4fb946366902edf", + "hash_cont_tokens": "d8b53154ad06b31b" + }, + "truncated": 0, + "non-truncated": 9536, + "padded": 0, + "non-padded": 9536, + "effective_few_shots": 3.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "6af0ae8cfe684f50", + "hash_cont_tokens": "7b59719cad19dceb" + }, + "truncated": 0, + "non-truncated": 1319, + "padded": 0, + "non-padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + 
"hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "6bf335f26fed6442", + "hash_cont_tokens": "618558fb93c0f288" + }, + "truncated": 0, + "non-truncated": 2534, + "padded": 2397, + "non-padded": 137, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "9b4d8993161e637d", + "hash_full_prompts": "08215e527b7e60a5", + "hash_input_tokens": "7c6dc027ca91c1a8", + "hash_cont_tokens": "a74412cea07571f4" + }, + "total_evaluation_time_secondes": "8112.961089849472", + "truncated": 0, + "non-truncated": 13389, + "padded": 2397, + "non-padded": 10992, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/migtissera/SynthIA-7B-v1.5/results_2023-11-09T14-41-56.883085.json b/eval-results/migtissera/SynthIA-7B-v1.5/results_2023-11-09T14-41-56.883085.json new file mode 100644 index 0000000000000000000000000000000000000000..d117d89bd3c7984001a91a755936ec121706daff --- /dev/null +++ b/eval-results/migtissera/SynthIA-7B-v1.5/results_2023-11-09T14-41-56.883085.json @@ -0,0 +1,1433 @@ +{ + "config_general": { + "lighteval_sha": "167773f1d5d1647c60dadc31c9e731ab7dbcbbad", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "", + "model_name": "migtissera/SynthIA-7B-v1.5", + "model_sha": "5a9912ef90a0efc1aaea327e5cf3e9554c8bd897", + "model_dtype": "torch.float16", + "model_size": "13.99 GB" + }, + "results": { + "harness|arc:challenge|25": { + "acc": 0.5870307167235495, + "acc_stderr": 0.014388344935398324, + "acc_norm": 0.6271331058020477, + "acc_norm_stderr": 0.014131176760131172 + }, + "harness|hellaswag|10": { + "acc": 0.6432981477793268, + "acc_stderr": 0.0047804672709117705, + "acc_norm": 0.833698466440948, + "acc_norm_stderr": 0.0037159010850549967 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.34, + "acc_stderr": 0.04760952285695236, + "acc_norm": 0.34, + "acc_norm_stderr": 0.04760952285695236 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.6148148148148148, + "acc_stderr": 0.04203921040156279, + "acc_norm": 0.6148148148148148, + "acc_norm_stderr": 0.04203921040156279 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.6644736842105263, + "acc_stderr": 0.038424985593952694, + "acc_norm": 0.6644736842105263, + "acc_norm_stderr": 0.038424985593952694 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.61, + "acc_stderr": 0.04902071300001975, + "acc_norm": 0.61, + "acc_norm_stderr": 0.04902071300001975 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.6981132075471698, + "acc_stderr": 0.02825420034443866, + "acc_norm": 0.6981132075471698, + "acc_norm_stderr": 0.02825420034443866 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.6944444444444444, + "acc_stderr": 0.03852084696008534, + "acc_norm": 0.6944444444444444, + "acc_norm_stderr": 0.03852084696008534 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.49, + "acc_stderr": 0.05024183937956912, + "acc_norm": 0.49, + "acc_norm_stderr": 0.05024183937956912 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.52, + "acc_stderr": 0.050211673156867795, + "acc_norm": 0.52, + "acc_norm_stderr": 0.050211673156867795 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.4, + "acc_stderr": 0.04923659639173309, + "acc_norm": 0.4, + "acc_norm_stderr": 0.04923659639173309 + }, + 
"harness|hendrycksTest-college_medicine|5": { + "acc": 0.6473988439306358, + "acc_stderr": 0.036430371689585475, + "acc_norm": 0.6473988439306358, + "acc_norm_stderr": 0.036430371689585475 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.38235294117647056, + "acc_stderr": 0.04835503696107223, + "acc_norm": 0.38235294117647056, + "acc_norm_stderr": 0.04835503696107223 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.76, + "acc_stderr": 0.042923469599092816, + "acc_norm": 0.76, + "acc_norm_stderr": 0.042923469599092816 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.5957446808510638, + "acc_stderr": 0.03208115750788684, + "acc_norm": 0.5957446808510638, + "acc_norm_stderr": 0.03208115750788684 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.43859649122807015, + "acc_stderr": 0.04668000738510455, + "acc_norm": 0.43859649122807015, + "acc_norm_stderr": 0.04668000738510455 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.5448275862068965, + "acc_stderr": 0.04149886942192117, + "acc_norm": 0.5448275862068965, + "acc_norm_stderr": 0.04149886942192117 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.3941798941798942, + "acc_stderr": 0.02516798233389414, + "acc_norm": 0.3941798941798942, + "acc_norm_stderr": 0.02516798233389414 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.4126984126984127, + "acc_stderr": 0.04403438954768176, + "acc_norm": 0.4126984126984127, + "acc_norm_stderr": 0.04403438954768176 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.39, + "acc_stderr": 0.04902071300001975, + "acc_norm": 0.39, + "acc_norm_stderr": 0.04902071300001975 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.7645161290322581, + "acc_stderr": 0.02413763242933771, + "acc_norm": 0.7645161290322581, + "acc_norm_stderr": 0.02413763242933771 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.4975369458128079, + "acc_stderr": 0.03517945038691063, + "acc_norm": 0.4975369458128079, + "acc_norm_stderr": 0.03517945038691063 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.68, + "acc_stderr": 0.04688261722621505, + "acc_norm": 0.68, + "acc_norm_stderr": 0.04688261722621505 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.7575757575757576, + "acc_stderr": 0.03346409881055953, + "acc_norm": 0.7575757575757576, + "acc_norm_stderr": 0.03346409881055953 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.7878787878787878, + "acc_stderr": 0.029126522834586818, + "acc_norm": 0.7878787878787878, + "acc_norm_stderr": 0.029126522834586818 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.8652849740932642, + "acc_stderr": 0.02463978909770944, + "acc_norm": 0.8652849740932642, + "acc_norm_stderr": 0.02463978909770944 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.6641025641025641, + "acc_stderr": 0.023946724741563973, + "acc_norm": 0.6641025641025641, + "acc_norm_stderr": 0.023946724741563973 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.3592592592592593, + "acc_stderr": 0.029252905927251976, + "acc_norm": 0.3592592592592593, + "acc_norm_stderr": 0.029252905927251976 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.7058823529411765, + "acc_stderr": 0.02959732973097809, + "acc_norm": 0.7058823529411765, + "acc_norm_stderr": 0.02959732973097809 + }, + "harness|hendrycksTest-high_school_physics|5": 
{ + "acc": 0.32450331125827814, + "acc_stderr": 0.03822746937658753, + "acc_norm": 0.32450331125827814, + "acc_norm_stderr": 0.03822746937658753 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.818348623853211, + "acc_stderr": 0.016530617409266875, + "acc_norm": 0.818348623853211, + "acc_norm_stderr": 0.016530617409266875 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.5231481481481481, + "acc_stderr": 0.03406315360711507, + "acc_norm": 0.5231481481481481, + "acc_norm_stderr": 0.03406315360711507 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.7990196078431373, + "acc_stderr": 0.02812597226565438, + "acc_norm": 0.7990196078431373, + "acc_norm_stderr": 0.02812597226565438 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.7932489451476793, + "acc_stderr": 0.026361651668389094, + "acc_norm": 0.7932489451476793, + "acc_norm_stderr": 0.026361651668389094 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.6816143497757847, + "acc_stderr": 0.03126580522513713, + "acc_norm": 0.6816143497757847, + "acc_norm_stderr": 0.03126580522513713 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.7633587786259542, + "acc_stderr": 0.03727673575596914, + "acc_norm": 0.7633587786259542, + "acc_norm_stderr": 0.03727673575596914 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.7851239669421488, + "acc_stderr": 0.037494924487096966, + "acc_norm": 0.7851239669421488, + "acc_norm_stderr": 0.037494924487096966 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.7870370370370371, + "acc_stderr": 0.0395783547198098, + "acc_norm": 0.7870370370370371, + "acc_norm_stderr": 0.0395783547198098 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.7607361963190185, + "acc_stderr": 0.03351953879521272, + "acc_norm": 0.7607361963190185, + "acc_norm_stderr": 0.03351953879521272 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.44642857142857145, + "acc_stderr": 0.047184714852195886, + "acc_norm": 0.44642857142857145, + "acc_norm_stderr": 0.047184714852195886 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.7669902912621359, + "acc_stderr": 0.04185832598928315, + "acc_norm": 0.7669902912621359, + "acc_norm_stderr": 0.04185832598928315 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.8760683760683761, + "acc_stderr": 0.02158649400128138, + "acc_norm": 0.8760683760683761, + "acc_norm_stderr": 0.02158649400128138 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.69, + "acc_stderr": 0.04648231987117316, + "acc_norm": 0.69, + "acc_norm_stderr": 0.04648231987117316 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.8199233716475096, + "acc_stderr": 0.013740797258579828, + "acc_norm": 0.8199233716475096, + "acc_norm_stderr": 0.013740797258579828 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.6994219653179191, + "acc_stderr": 0.0246853168672578, + "acc_norm": 0.6994219653179191, + "acc_norm_stderr": 0.0246853168672578 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.376536312849162, + "acc_stderr": 0.016204672385106596, + "acc_norm": 0.376536312849162, + "acc_norm_stderr": 0.016204672385106596 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.7156862745098039, + "acc_stderr": 0.02582916327275748, + "acc_norm": 0.7156862745098039, + "acc_norm_stderr": 0.02582916327275748 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.7041800643086816, + "acc_stderr": 0.02592237178881876, + "acc_norm": 
0.7041800643086816, + "acc_norm_stderr": 0.02592237178881876 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.7283950617283951, + "acc_stderr": 0.02474862449053737, + "acc_norm": 0.7283950617283951, + "acc_norm_stderr": 0.02474862449053737 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.4716312056737589, + "acc_stderr": 0.029779450957303062, + "acc_norm": 0.4716312056737589, + "acc_norm_stderr": 0.029779450957303062 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.4589308996088657, + "acc_stderr": 0.012727084826799797, + "acc_norm": 0.4589308996088657, + "acc_norm_stderr": 0.012727084826799797 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.6580882352941176, + "acc_stderr": 0.028814722422254184, + "acc_norm": 0.6580882352941176, + "acc_norm_stderr": 0.028814722422254184 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.6584967320261438, + "acc_stderr": 0.019184639328092487, + "acc_norm": 0.6584967320261438, + "acc_norm_stderr": 0.019184639328092487 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.6727272727272727, + "acc_stderr": 0.0449429086625209, + "acc_norm": 0.6727272727272727, + "acc_norm_stderr": 0.0449429086625209 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.6938775510204082, + "acc_stderr": 0.02950489645459596, + "acc_norm": 0.6938775510204082, + "acc_norm_stderr": 0.02950489645459596 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.835820895522388, + "acc_stderr": 0.026193923544454132, + "acc_norm": 0.835820895522388, + "acc_norm_stderr": 0.026193923544454132 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.84, + "acc_stderr": 0.03684529491774711, + "acc_norm": 0.84, + "acc_norm_stderr": 0.03684529491774711 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.536144578313253, + "acc_stderr": 0.03882310850890594, + "acc_norm": 0.536144578313253, + "acc_norm_stderr": 0.03882310850890594 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.8187134502923976, + "acc_stderr": 0.029547741687640038, + "acc_norm": 0.8187134502923976, + "acc_norm_stderr": 0.029547741687640038 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.35128518971848227, + "mc1_stderr": 0.016711358163544403, + "mc2": 0.5131996962275648, + "mc2_stderr": 0.015337988977122931 + }, + "harness|winogrande|5": { + "acc": 0.7924230465666929, + "acc_stderr": 0.01139859341938678 + }, + "harness|drop|3": { + "em": 0.1875, + "em_stderr": 0.003997164044486006, + "f1": 0.26010591442953035, + "f1_stderr": 0.004042449995216609 + }, + "harness|gsm8k|5": { + "acc": 0.17437452615617893, + "acc_stderr": 0.010451421361976231 + }, + "all": { + "acc": 0.6291968571108129, + "acc_stderr": 0.03252538162461919, + "acc_norm": 0.63804599014876, + "acc_norm_stderr": 0.03323519542303871, + "mc1": 0.35128518971848227, + "mc1_stderr": 0.016711358163544403, + "mc2": 0.5131996962275648, + "mc2_stderr": 0.015337988977122931, + "em": 0.1875, + "em_stderr": 0.003997164044486006, + "f1": 0.26010591442953035, + "f1_stderr": 0.004042449995216609 + } + }, + "versions": { + "all": 0, + "harness|arc:challenge|25": 0, + "harness|drop|3": 1, + "harness|gsm8k|5": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + 
"harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "harness|winogrande|5": 0 + }, + "config_tasks": { + "harness|arc:challenge": "LM Harness task", + "harness|drop": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + 
"harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|arc:challenge|25": { + "hashes": { + "hash_examples": "17b0cae357c0259e", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "9bcd0d1d37471713", + "hash_cont_tokens": "289aa98c400841d8" + }, + "truncated": 0, + "non_truncated": 1172, + "padded": 4670, + "non_padded": 17, + "effective_few_shots": 25.0, + "num_truncated_few_shots": 0 + }, + "harness|hellaswag|10": { + "hashes": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": 
"0b4c16983130f84f", + "hash_input_tokens": "80b8c6d79740318e", + "hash_cont_tokens": "ac460260c3e6efc9" + }, + "truncated": 0, + "non_truncated": 10042, + "padded": 40101, + "non_padded": 67, + "effective_few_shots": 10.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hashes": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "b813d36287c6556c", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-anatomy|5": { + "hashes": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "09dc2380497f7a47", + "hash_cont_tokens": "a52a4f60d98cbe5c" + }, + "truncated": 0, + "non_truncated": 135, + "padded": 540, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-astronomy|5": { + "hashes": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "68ca3220b0fdd1f3", + "hash_cont_tokens": "10f7d8eeba97841d" + }, + "truncated": 0, + "non_truncated": 152, + "padded": 608, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-business_ethics|5": { + "hashes": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "bd14ef1320de241e", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hashes": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "d96186ab98017c43", + "hash_cont_tokens": "edef9975ba9165b5" + }, + "truncated": 0, + "non_truncated": 265, + "padded": 1060, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_biology|5": { + "hashes": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "424136b34e95b200", + "hash_cont_tokens": "0aa103ec6602280b" + }, + "truncated": 0, + "non_truncated": 144, + "padded": 576, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_chemistry|5": { + "hashes": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "8dd8b80e336bbe54", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_computer_science|5": { + "hashes": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "145d4cef8ca2261d", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_mathematics|5": { + "hashes": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "561995d32d2b25c4", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + 
"padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_medicine|5": { + "hashes": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "6a258a9d4418599c", + "hash_cont_tokens": "1979021dbc698754" + }, + "truncated": 0, + "non_truncated": 173, + "padded": 692, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_physics|5": { + "hashes": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "fa5e0d5b5f97b66a", + "hash_cont_tokens": "7cf7fe2bab00acbd" + }, + "truncated": 0, + "non_truncated": 102, + "padded": 408, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-computer_security|5": { + "hashes": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "07d27397edfae492", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hashes": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "da5e6c3c8eb17da6", + "hash_cont_tokens": "903f64eed2b0d217" + }, + "truncated": 0, + "non_truncated": 235, + "padded": 940, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-econometrics|5": { + "hashes": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "f6ba8e358bdb523e", + "hash_cont_tokens": "721ae6c5302c4bf2" + }, + "truncated": 0, + "non_truncated": 114, + "padded": 456, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hashes": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "b2459da4c5ca8590", + "hash_cont_tokens": "15a738960ed3e587" + }, + "truncated": 0, + "non_truncated": 145, + "padded": 575, + "non_padded": 5, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hashes": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "0b969d9ad706a13a", + "hash_cont_tokens": "c96470462fc71683" + }, + "truncated": 0, + "non_truncated": 378, + "padded": 1512, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-formal_logic|5": { + "hashes": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "02bc3eb5f90da86e", + "hash_cont_tokens": "0e1ce025c9d6ee7e" + }, + "truncated": 0, + "non_truncated": 126, + "padded": 504, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-global_facts|5": { + "hashes": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "3d5106918bcbeb43", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_biology|5": { + 
"hashes": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "7b089392db2dabbd", + "hash_cont_tokens": "e34d57f7d3c4ca16" + }, + "truncated": 0, + "non_truncated": 310, + "padded": 1240, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hashes": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "ba90b2ffed1c067d", + "hash_cont_tokens": "e8482d44df4b3740" + }, + "truncated": 0, + "non_truncated": 203, + "padded": 812, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hashes": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "60eeec309ef0717f", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hashes": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "5e5e8bf3808e0ead", + "hash_cont_tokens": "d63e679a49418339" + }, + "truncated": 0, + "non_truncated": 165, + "padded": 656, + "non_padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_geography|5": { + "hashes": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "4da9b741d4e7ea78", + "hash_cont_tokens": "d78483e286d06f1a" + }, + "truncated": 0, + "non_truncated": 198, + "padded": 792, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hashes": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "acb4bc872ac86ed7", + "hash_cont_tokens": "691cdff71ff5fe57" + }, + "truncated": 0, + "non_truncated": 193, + "padded": 772, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hashes": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "840fc6403eb69ab0", + "hash_cont_tokens": "d5ad4c5bdca967ad" + }, + "truncated": 0, + "non_truncated": 390, + "padded": 1560, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hashes": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "3629a7f2cd17faeb", + "hash_cont_tokens": "8f631ca5687dd0d4" + }, + "truncated": 0, + "non_truncated": 270, + "padded": 1080, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hashes": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "6846f684260e3997", + "hash_cont_tokens": "7321048a28451473" + }, + "truncated": 0, + "non_truncated": 238, + "padded": 952, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_physics|5": { + "hashes": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": 
"fc78b4997e436734", + "hash_input_tokens": "85aee25d6bdad94a", + "hash_cont_tokens": "bb137581f269861c" + }, + "truncated": 0, + "non_truncated": 151, + "padded": 604, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hashes": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "290b66d6d666a35f", + "hash_cont_tokens": "b455cab2675bd863" + }, + "truncated": 0, + "non_truncated": 545, + "padded": 2180, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hashes": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "a77a7668b437bc82", + "hash_cont_tokens": "1b3196fec7e58037" + }, + "truncated": 0, + "non_truncated": 216, + "padded": 864, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hashes": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "63548c7fa9ba7a78", + "hash_cont_tokens": "a331dedc2aa01b3e" + }, + "truncated": 0, + "non_truncated": 204, + "padded": 816, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hashes": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "83c5da18bfa50812", + "hash_cont_tokens": "d0fbe030b8c8c2bf" + }, + "truncated": 0, + "non_truncated": 237, + "padded": 948, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_aging|5": { + "hashes": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "bebbd11f22006685", + "hash_cont_tokens": "1dd29c3755494850" + }, + "truncated": 0, + "non_truncated": 223, + "padded": 892, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_sexuality|5": { + "hashes": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "7b85ee9b8ee54f4f", + "hash_cont_tokens": "c85573f663c10691" + }, + "truncated": 0, + "non_truncated": 131, + "padded": 524, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-international_law|5": { + "hashes": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "7bfc55ab7065943e", + "hash_cont_tokens": "d263804ba918154f" + }, + "truncated": 0, + "non_truncated": 121, + "padded": 484, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-jurisprudence|5": { + "hashes": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "69573f1675e053c6", + "hash_cont_tokens": "581986691a84ece8" + }, + "truncated": 0, + "non_truncated": 108, + "padded": 432, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hashes": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "552324ef20094bdc", + "hash_cont_tokens": "55a858b28bbda458" + }, + "truncated": 0, + 
"non_truncated": 163, + "padded": 652, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-machine_learning|5": { + "hashes": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "96449357a7318905", + "hash_cont_tokens": "e99d3d3efd4ac7a3" + }, + "truncated": 0, + "non_truncated": 112, + "padded": 448, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-management|5": { + "hashes": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "3b849249168e3b88", + "hash_cont_tokens": "13d9dc56bca34726" + }, + "truncated": 0, + "non_truncated": 103, + "padded": 412, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-marketing|5": { + "hashes": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "af0e186f2756b70d", + "hash_cont_tokens": "2700ea26933916a2" + }, + "truncated": 0, + "non_truncated": 234, + "padded": 936, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-medical_genetics|5": { + "hashes": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "9f6a6de16509b6d9", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-miscellaneous|5": { + "hashes": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "9194406d589f7c10", + "hash_cont_tokens": "7bf4341c79587250" + }, + "truncated": 0, + "non_truncated": 783, + "padded": 3132, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_disputes|5": { + "hashes": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "769486efc74d9f8e", + "hash_cont_tokens": "38a48e9de6976f00" + }, + "truncated": 0, + "non_truncated": 346, + "padded": 1384, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hashes": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "a90fd4dd90959dad", + "hash_cont_tokens": "761c4dc187689d89" + }, + "truncated": 0, + "non_truncated": 895, + "padded": 3580, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-nutrition|5": { + "hashes": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "1a3b843e66efd29b", + "hash_cont_tokens": "65005bd7d6f6012a" + }, + "truncated": 0, + "non_truncated": 306, + "padded": 1224, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-philosophy|5": { + "hashes": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "09820001a3d00013", + "hash_cont_tokens": "0b47934fb6314dec" + }, + "truncated": 0, + "non_truncated": 311, + "padded": 1244, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-prehistory|5": { + "hashes": { + 
"hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "7c4ec364ce2768c7", + "hash_cont_tokens": "3f20acd855ee0a29" + }, + "truncated": 0, + "non_truncated": 324, + "padded": 1296, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_accounting|5": { + "hashes": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "ced0534574d0ae3f", + "hash_cont_tokens": "8f122ba881355d4b" + }, + "truncated": 0, + "non_truncated": 282, + "padded": 1128, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_law|5": { + "hashes": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "bcbdbbde22ec73e3", + "hash_cont_tokens": "90d5df417c4d3fd3" + }, + "truncated": 0, + "non_truncated": 1534, + "padded": 6136, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_medicine|5": { + "hashes": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "c54d753563114d45", + "hash_cont_tokens": "4a2d2988884f7f70" + }, + "truncated": 0, + "non_truncated": 272, + "padded": 1088, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_psychology|5": { + "hashes": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "b75dc55c0e32fa52", + "hash_cont_tokens": "e0a952cb8a9c81de" + }, + "truncated": 0, + "non_truncated": 612, + "padded": 2448, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-public_relations|5": { + "hashes": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "5ccdc8ec8db99622", + "hash_cont_tokens": "1fa77a8dff3922b8" + }, + "truncated": 0, + "non_truncated": 110, + "padded": 440, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-security_studies|5": { + "hashes": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "ca8497342e5b1d57", + "hash_cont_tokens": "81fc9cb3cbdd52db" + }, + "truncated": 0, + "non_truncated": 245, + "padded": 980, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-sociology|5": { + "hashes": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "069c76424fbd3dab", + "hash_cont_tokens": "2a0493252ed2cf43" + }, + "truncated": 0, + "non_truncated": 201, + "padded": 804, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hashes": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "a7e393a626169576", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-virology|5": { + "hashes": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "bf99dc973e3a650d", + "hash_cont_tokens": 
"5ab892d003b00c98" + }, + "truncated": 0, + "non_truncated": 166, + "padded": 664, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-world_religions|5": { + "hashes": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "1761cfaf21797065", + "hash_cont_tokens": "15a5e5dbdfbb8568" + }, + "truncated": 0, + "non_truncated": 171, + "padded": 684, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|truthfulqa:mc|0": { + "hashes": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "298b43914bbdf4ca", + "hash_cont_tokens": "5a8d4bb398b1c3c0" + }, + "truncated": 0, + "non_truncated": 817, + "padded": 9996, + "non_padded": 0, + "effective_few_shots": 0.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "31aa3477d959f771", + "hash_cont_tokens": "618558fb93c0f288" + }, + "truncated": 0, + "non_truncated": 1267, + "padded": 2534, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|drop|3": { + "hashes": { + "hash_examples": "1d27416e8324e9a3", + "hash_full_prompts": "a5513ff9a741b385", + "hash_input_tokens": "a4fb946366902edf", + "hash_cont_tokens": "ac3794c3bc7cf9ca" + }, + "truncated": 0, + "non_truncated": 9536, + "padded": 0, + "non_padded": 9536, + "effective_few_shots": 3.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "6af0ae8cfe684f50", + "hash_cont_tokens": "f9000e2b16ad58e0" + }, + "truncated": 0, + "non_truncated": 1319, + "padded": 0, + "non_padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "4eb459f19fc0f29d", + "hash_full_prompts": "21653ed56f202b4e", + "hash_input_tokens": "0ce409b3d436569d", + "hash_cont_tokens": "8ad5d96ccfbd4317" + }, + "truncated": 0, + "non_truncated": 38195, + "padded": 113460, + "non_padded": 10948, + "num_truncated_few_shots": 0, + "total_evaluation_time_secondes": 0 + } +} \ No newline at end of file diff --git a/eval-results/migtissera/Synthia-13B-v1.2/results_2023-10-03T11-41-50.925709.json b/eval-results/migtissera/Synthia-13B-v1.2/results_2023-10-03T11-41-50.925709.json new file mode 100644 index 0000000000000000000000000000000000000000..25abda02d725b3e7c2baba1ed9a3473806889017 --- /dev/null +++ b/eval-results/migtissera/Synthia-13B-v1.2/results_2023-10-03T11-41-50.925709.json @@ -0,0 +1,1367 @@ +{ + "config_general": { + "model_name": "migtissera/Synthia-13B-v1.2", + "model_sha": "60d4937ac3c4dcb84c40bbf7265c5cc7f5f3d4f9", + "model_size": "24.32 GB", + "model_dtype": "torch.float16", + "lighteval_sha": "0f318ecf002208468154899217b3ba7c6ae09374", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|arc:challenge|25": { + "acc": 0.5767918088737202, + "acc_stderr": 0.014438036220848027, + "acc_norm": 0.6126279863481229, + "acc_norm_stderr": 0.01423587248790987 + }, + "harness|hellaswag|10": { + "acc": 0.629555865365465, + "acc_stderr": 0.004819367172685959, + "acc_norm": 0.8293168691495718, + "acc_norm_stderr": 0.0037546293132751604 + }, + 
"harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.33, + "acc_stderr": 0.04725815626252606, + "acc_norm": 0.33, + "acc_norm_stderr": 0.04725815626252606 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.45925925925925926, + "acc_stderr": 0.04304979692464243, + "acc_norm": 0.45925925925925926, + "acc_norm_stderr": 0.04304979692464243 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.5460526315789473, + "acc_stderr": 0.04051646342874142, + "acc_norm": 0.5460526315789473, + "acc_norm_stderr": 0.04051646342874142 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.55, + "acc_stderr": 0.04999999999999999, + "acc_norm": 0.55, + "acc_norm_stderr": 0.04999999999999999 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.6226415094339622, + "acc_stderr": 0.029832808114796, + "acc_norm": 0.6226415094339622, + "acc_norm_stderr": 0.029832808114796 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.5763888888888888, + "acc_stderr": 0.0413212501972337, + "acc_norm": 0.5763888888888888, + "acc_norm_stderr": 0.0413212501972337 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.47, + "acc_stderr": 0.050161355804659205, + "acc_norm": 0.47, + "acc_norm_stderr": 0.050161355804659205 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.45, + "acc_stderr": 0.049999999999999996, + "acc_norm": 0.45, + "acc_norm_stderr": 0.049999999999999996 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.37, + "acc_stderr": 0.04852365870939099, + "acc_norm": 0.37, + "acc_norm_stderr": 0.04852365870939099 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.5086705202312138, + "acc_stderr": 0.03811890988940412, + "acc_norm": 0.5086705202312138, + "acc_norm_stderr": 0.03811890988940412 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.28431372549019607, + "acc_stderr": 0.04488482852329017, + "acc_norm": 0.28431372549019607, + "acc_norm_stderr": 0.04488482852329017 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.68, + "acc_stderr": 0.04688261722621505, + "acc_norm": 0.68, + "acc_norm_stderr": 0.04688261722621505 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.4127659574468085, + "acc_stderr": 0.03218471141400351, + "acc_norm": 0.4127659574468085, + "acc_norm_stderr": 0.03218471141400351 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.3508771929824561, + "acc_stderr": 0.044895393502706986, + "acc_norm": 0.3508771929824561, + "acc_norm_stderr": 0.044895393502706986 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.5103448275862069, + "acc_stderr": 0.04165774775728763, + "acc_norm": 0.5103448275862069, + "acc_norm_stderr": 0.04165774775728763 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.335978835978836, + "acc_stderr": 0.024326310529149138, + "acc_norm": 0.335978835978836, + "acc_norm_stderr": 0.024326310529149138 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.35714285714285715, + "acc_stderr": 0.042857142857142816, + "acc_norm": 0.35714285714285715, + "acc_norm_stderr": 0.042857142857142816 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.4, + "acc_stderr": 0.04923659639173309, + "acc_norm": 0.4, + "acc_norm_stderr": 0.04923659639173309 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.6612903225806451, + "acc_stderr": 0.026923446059302844, + "acc_norm": 0.6612903225806451, + "acc_norm_stderr": 0.026923446059302844 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + 
"acc": 0.4630541871921182, + "acc_stderr": 0.03508370520442666, + "acc_norm": 0.4630541871921182, + "acc_norm_stderr": 0.03508370520442666 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.59, + "acc_stderr": 0.04943110704237101, + "acc_norm": 0.59, + "acc_norm_stderr": 0.04943110704237101 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.6787878787878788, + "acc_stderr": 0.0364620496325381, + "acc_norm": 0.6787878787878788, + "acc_norm_stderr": 0.0364620496325381 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.7070707070707071, + "acc_stderr": 0.032424979581788166, + "acc_norm": 0.7070707070707071, + "acc_norm_stderr": 0.032424979581788166 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.8290155440414507, + "acc_stderr": 0.027171213683164545, + "acc_norm": 0.8290155440414507, + "acc_norm_stderr": 0.027171213683164545 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.5333333333333333, + "acc_stderr": 0.025294608023986472, + "acc_norm": 0.5333333333333333, + "acc_norm_stderr": 0.025294608023986472 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.28888888888888886, + "acc_stderr": 0.02763490726417854, + "acc_norm": 0.28888888888888886, + "acc_norm_stderr": 0.02763490726417854 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.5588235294117647, + "acc_stderr": 0.032252942323996406, + "acc_norm": 0.5588235294117647, + "acc_norm_stderr": 0.032252942323996406 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.304635761589404, + "acc_stderr": 0.03757949922943342, + "acc_norm": 0.304635761589404, + "acc_norm_stderr": 0.03757949922943342 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.7596330275229358, + "acc_stderr": 0.01832060732096407, + "acc_norm": 0.7596330275229358, + "acc_norm_stderr": 0.01832060732096407 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.38425925925925924, + "acc_stderr": 0.03317354514310742, + "acc_norm": 0.38425925925925924, + "acc_norm_stderr": 0.03317354514310742 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.7549019607843137, + "acc_stderr": 0.03019028245350195, + "acc_norm": 0.7549019607843137, + "acc_norm_stderr": 0.03019028245350195 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.7763713080168776, + "acc_stderr": 0.027123298205229966, + "acc_norm": 0.7763713080168776, + "acc_norm_stderr": 0.027123298205229966 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.6636771300448431, + "acc_stderr": 0.031708824268455005, + "acc_norm": 0.6636771300448431, + "acc_norm_stderr": 0.031708824268455005 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.5877862595419847, + "acc_stderr": 0.04317171194870254, + "acc_norm": 0.5877862595419847, + "acc_norm_stderr": 0.04317171194870254 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.7355371900826446, + "acc_stderr": 0.040261875275912073, + "acc_norm": 0.7355371900826446, + "acc_norm_stderr": 0.040261875275912073 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.7407407407407407, + "acc_stderr": 0.04236511258094633, + "acc_norm": 0.7407407407407407, + "acc_norm_stderr": 0.04236511258094633 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.6748466257668712, + "acc_stderr": 0.03680350371286461, + "acc_norm": 0.6748466257668712, + "acc_norm_stderr": 0.03680350371286461 + }, + 
"harness|hendrycksTest-machine_learning|5": { + "acc": 0.32142857142857145, + "acc_stderr": 0.04432804055291519, + "acc_norm": 0.32142857142857145, + "acc_norm_stderr": 0.04432804055291519 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.7669902912621359, + "acc_stderr": 0.041858325989283136, + "acc_norm": 0.7669902912621359, + "acc_norm_stderr": 0.041858325989283136 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.8333333333333334, + "acc_stderr": 0.024414947304543678, + "acc_norm": 0.8333333333333334, + "acc_norm_stderr": 0.024414947304543678 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.61, + "acc_stderr": 0.04902071300001974, + "acc_norm": 0.61, + "acc_norm_stderr": 0.04902071300001974 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.7713920817369093, + "acc_stderr": 0.015016884698539871, + "acc_norm": 0.7713920817369093, + "acc_norm_stderr": 0.015016884698539871 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.6445086705202312, + "acc_stderr": 0.025770292082977243, + "acc_norm": 0.6445086705202312, + "acc_norm_stderr": 0.025770292082977243 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.4670391061452514, + "acc_stderr": 0.016686126653013937, + "acc_norm": 0.4670391061452514, + "acc_norm_stderr": 0.016686126653013937 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.5915032679738562, + "acc_stderr": 0.028146405993096358, + "acc_norm": 0.5915032679738562, + "acc_norm_stderr": 0.028146405993096358 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.6334405144694534, + "acc_stderr": 0.02736807824397164, + "acc_norm": 0.6334405144694534, + "acc_norm_stderr": 0.02736807824397164 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.6358024691358025, + "acc_stderr": 0.026774929899722334, + "acc_norm": 0.6358024691358025, + "acc_norm_stderr": 0.026774929899722334 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.4432624113475177, + "acc_stderr": 0.029634838473766002, + "acc_norm": 0.4432624113475177, + "acc_norm_stderr": 0.029634838473766002 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.41395045632333766, + "acc_stderr": 0.012579699631289264, + "acc_norm": 0.41395045632333766, + "acc_norm_stderr": 0.012579699631289264 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.5477941176470589, + "acc_stderr": 0.03023375855159644, + "acc_norm": 0.5477941176470589, + "acc_norm_stderr": 0.03023375855159644 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.5620915032679739, + "acc_stderr": 0.020071257886886525, + "acc_norm": 0.5620915032679739, + "acc_norm_stderr": 0.020071257886886525 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.6454545454545455, + "acc_stderr": 0.045820048415054174, + "acc_norm": 0.6454545454545455, + "acc_norm_stderr": 0.045820048415054174 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.6326530612244898, + "acc_stderr": 0.03086214492108756, + "acc_norm": 0.6326530612244898, + "acc_norm_stderr": 0.03086214492108756 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.7313432835820896, + "acc_stderr": 0.03134328358208954, + "acc_norm": 0.7313432835820896, + "acc_norm_stderr": 0.03134328358208954 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.82, + "acc_stderr": 0.038612291966536934, + "acc_norm": 0.82, + "acc_norm_stderr": 0.038612291966536934 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.43373493975903615, + "acc_stderr": 0.03858158940685517, + 
"acc_norm": 0.43373493975903615, + "acc_norm_stderr": 0.03858158940685517 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.7777777777777778, + "acc_stderr": 0.03188578017686398, + "acc_norm": 0.7777777777777778, + "acc_norm_stderr": 0.03188578017686398 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.33659730722154224, + "mc1_stderr": 0.01654241280949489, + "mc2": 0.4727191424035362, + "mc2_stderr": 0.015128415623267133 + }, + "all": { + "acc": 0.5660497955475398, + "acc_stderr": 0.03436189503917685, + "acc_norm": 0.5700429681112433, + "acc_norm_stderr": 0.03434042213083196, + "mc1": 0.33659730722154224, + "mc1_stderr": 0.01654241280949489, + "mc2": 0.4727191424035362, + "mc2_stderr": 0.015128415623267133 + } + }, + "versions": { + "harness|arc:challenge|25": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + 
"harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "all": 0 + }, + "config_tasks": { + "harness|arc:challenge": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + 
"harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task" + }, + "summary_tasks": { + "harness|arc:challenge|25": { + "hashes": { + "hash_examples": "17b0cae357c0259e", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "3722289b79076c44", + "hash_cont_tokens": "e8abf848493b50f7" + }, + "truncated": 0, + "non-truncated": 4687, + "padded": 4687, + "non-padded": 0, + "effective_few_shots": 25.0, + "num_truncated_few_shots": 0 + }, + "harness|hellaswag|10": { + "hashes": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "ececd684171f1ef2", + "hash_cont_tokens": "9fe0a5c42e1532db" + }, + "truncated": 0, + "non-truncated": 40168, + "padded": 40113, + "non-padded": 55, + "effective_few_shots": 10.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hashes": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "c54ff61ad0273dd7", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-anatomy|5": { + "hashes": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "be31a1e22aef5f90", + "hash_cont_tokens": "f11971a765cb609f" + }, + "truncated": 0, + "non-truncated": 540, + "padded": 540, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-astronomy|5": { + "hashes": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "277a7b1fad566940", + "hash_cont_tokens": "440a970fadecdc7b" + }, + "truncated": 0, + "non-truncated": 608, + "padded": 608, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-business_ethics|5": { + "hashes": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "ba552605bc116de5", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hashes": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "428c7563d0b98ab9", + "hash_cont_tokens": "7ecd60c25b9bfe5b" + }, + "truncated": 0, + "non-truncated": 1060, + "padded": 1060, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_biology|5": { + "hashes": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "da036601573942e2", + "hash_cont_tokens": "875cde3af7a0ee14" + }, + "truncated": 0, + "non-truncated": 576, + "padded": 576, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_chemistry|5": { + "hashes": { + 
"hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "94e0196d6aded13d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_computer_science|5": { + "hashes": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "6e4d0f4a8d36690b", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_mathematics|5": { + "hashes": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "614054d17109a25d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_medicine|5": { + "hashes": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "081bb2b524defd1c", + "hash_cont_tokens": "702fb6d82ff0d6ac" + }, + "truncated": 0, + "non-truncated": 692, + "padded": 692, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_physics|5": { + "hashes": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "5421d9a1af86cbd4", + "hash_cont_tokens": "f7b8097afc16a47c" + }, + "truncated": 0, + "non-truncated": 408, + "padded": 408, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-computer_security|5": { + "hashes": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "5e6b70ecb333cf18", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hashes": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "c2ef11a87264ceed", + "hash_cont_tokens": "aa0e8bc655f2f641" + }, + "truncated": 0, + "non-truncated": 940, + "padded": 940, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-econometrics|5": { + "hashes": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "ecaccd912a4c3978", + "hash_cont_tokens": "b1cc6e7e9fcd3827" + }, + "truncated": 0, + "non-truncated": 456, + "padded": 456, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hashes": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "1590c84291399be8", + "hash_cont_tokens": "2425a3f084a591ef" + }, + "truncated": 0, + "non-truncated": 580, + "padded": 580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hashes": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "3269597f715b0da1", + 
"hash_cont_tokens": "bd87bf0c060fd925" + }, + "truncated": 0, + "non-truncated": 1512, + "padded": 1512, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-formal_logic|5": { + "hashes": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "a2800d20f3ab8d7c", + "hash_cont_tokens": "eb8932890e0605db" + }, + "truncated": 0, + "non-truncated": 504, + "padded": 504, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-global_facts|5": { + "hashes": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "94ed44b3772505ad", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_biology|5": { + "hashes": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "24423acb928db768", + "hash_cont_tokens": "1ddcb86d28cde266" + }, + "truncated": 0, + "non-truncated": 1240, + "padded": 1240, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hashes": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "831ff35c474e5cef", + "hash_cont_tokens": "176c8dcff38c5f8f" + }, + "truncated": 0, + "non-truncated": 812, + "padded": 812, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hashes": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "a20a96b44dcc5b30", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hashes": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "5002f4ac8b1562ca", + "hash_cont_tokens": "674fc454bdc5ac93" + }, + "truncated": 0, + "non-truncated": 660, + "padded": 656, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_geography|5": { + "hashes": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "7c5547c7da5bc793", + "hash_cont_tokens": "03a5012b916274ea" + }, + "truncated": 0, + "non-truncated": 792, + "padded": 792, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hashes": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "f62991cb6a496b05", + "hash_cont_tokens": "873d2aab226ba1d8" + }, + "truncated": 0, + "non-truncated": 772, + "padded": 772, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hashes": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "4cef2aff6e3d59ed", + "hash_cont_tokens": "c583432ad27fcfe0" + }, + "truncated": 0, + "non-truncated": 1560, + "padded": 
1560, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hashes": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "6e2577ea4082ed2b", + "hash_cont_tokens": "d7907b61bcb8c123" + }, + "truncated": 0, + "non-truncated": 1080, + "padded": 1080, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hashes": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "c5fc9aeb1079c8e4", + "hash_cont_tokens": "f47f041de50333b9" + }, + "truncated": 0, + "non-truncated": 952, + "padded": 952, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_physics|5": { + "hashes": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "555fc385cffa84ca", + "hash_cont_tokens": "0d56317b3e5eedb5" + }, + "truncated": 0, + "non-truncated": 604, + "padded": 604, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hashes": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "febd23cbf9973b7f", + "hash_cont_tokens": "09ba1243e7390c0f" + }, + "truncated": 0, + "non-truncated": 2180, + "padded": 2180, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hashes": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "400e55b56ee6fbd7", + "hash_cont_tokens": "9cc29889c3d3f77d" + }, + "truncated": 0, + "non-truncated": 864, + "padded": 864, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hashes": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "c639cce12a46ebad", + "hash_cont_tokens": "cdd0b3dc06d933e5" + }, + "truncated": 0, + "non-truncated": 816, + "padded": 816, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hashes": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "b9762065cce6f3a6", + "hash_cont_tokens": "e02816433ff28daf" + }, + "truncated": 0, + "non-truncated": 948, + "padded": 948, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_aging|5": { + "hashes": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "541a75f071dcf579", + "hash_cont_tokens": "142a4a8a1138a214" + }, + "truncated": 0, + "non-truncated": 892, + "padded": 892, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_sexuality|5": { + "hashes": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "04269e5c5a257dd9", + "hash_cont_tokens": "bc54813e809b796d" + }, + "truncated": 0, + "non-truncated": 524, + "padded": 524, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + 
"harness|hendrycksTest-international_law|5": { + "hashes": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "d93ba9d9d38e4397", + "hash_cont_tokens": "8ea8c5ff76a15bca" + }, + "truncated": 0, + "non-truncated": 484, + "padded": 484, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-jurisprudence|5": { + "hashes": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "9eeaccd2698b4f5a", + "hash_cont_tokens": "e3a8cd951b6e3469" + }, + "truncated": 0, + "non-truncated": 432, + "padded": 432, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hashes": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "b4f08f544f2b7576", + "hash_cont_tokens": "3e9e0bdc248fd88a" + }, + "truncated": 0, + "non-truncated": 652, + "padded": 648, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-machine_learning|5": { + "hashes": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "900c2a51f1174b9f", + "hash_cont_tokens": "55b12fb138c6a064" + }, + "truncated": 0, + "non-truncated": 448, + "padded": 448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-management|5": { + "hashes": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "6b36efb4689c6eca", + "hash_cont_tokens": "a01d6d39a83c4597" + }, + "truncated": 0, + "non-truncated": 412, + "padded": 412, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-marketing|5": { + "hashes": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "2aaac78a0cfed47a", + "hash_cont_tokens": "6aeaed4d823c98aa" + }, + "truncated": 0, + "non-truncated": 936, + "padded": 936, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-medical_genetics|5": { + "hashes": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "886ca823b41c094a", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-miscellaneous|5": { + "hashes": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "72fd71de7675e7d0", + "hash_cont_tokens": "9b0ab02a64603081" + }, + "truncated": 0, + "non-truncated": 3132, + "padded": 3132, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_disputes|5": { + "hashes": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "f3ca0dd8e7a1eb09", + "hash_cont_tokens": "3b8bbe9108e55ce9" + }, + "truncated": 0, + "non-truncated": 1384, + "padded": 1354, + "non-padded": 30, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hashes": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": 
"3e793631e951f23c", + "hash_cont_tokens": "3e9bfc0362e97330" + }, + "truncated": 0, + "non-truncated": 3580, + "padded": 3580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-nutrition|5": { + "hashes": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "59753c2144ea93af", + "hash_cont_tokens": "23b2dc6ee2da4cfc" + }, + "truncated": 0, + "non-truncated": 1224, + "padded": 1224, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-philosophy|5": { + "hashes": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "bd8d3dbed15a8c34", + "hash_cont_tokens": "9f6ff69d23a48783" + }, + "truncated": 0, + "non-truncated": 1244, + "padded": 1244, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-prehistory|5": { + "hashes": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "3573cd87facbb7c5", + "hash_cont_tokens": "d6458d743d875837" + }, + "truncated": 0, + "non-truncated": 1296, + "padded": 1296, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_accounting|5": { + "hashes": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "17e721bc1a7cbb47", + "hash_cont_tokens": "922a195f53a35662" + }, + "truncated": 0, + "non-truncated": 1128, + "padded": 1128, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_law|5": { + "hashes": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "c9f7583fff66d361", + "hash_cont_tokens": "2e590029ef41fbcd" + }, + "truncated": 0, + "non-truncated": 6136, + "padded": 6136, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_medicine|5": { + "hashes": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "40a933f829116f8d", + "hash_cont_tokens": "7cfee54dbddd5a98" + }, + "truncated": 0, + "non-truncated": 1088, + "padded": 1088, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_psychology|5": { + "hashes": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "0dfb73a8eb3f692c", + "hash_cont_tokens": "a86677b2a45c20e1" + }, + "truncated": 0, + "non-truncated": 2448, + "padded": 2448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-public_relations|5": { + "hashes": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "1710c6ba4c9f3cbd", + "hash_cont_tokens": "0d756ccaae031757" + }, + "truncated": 0, + "non-truncated": 440, + "padded": 440, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-security_studies|5": { + "hashes": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "32a03f1f22a6e103", + "hash_cont_tokens": "b2229bc2cfbf594b" + }, + "truncated": 0, + "non-truncated": 980, + "padded": 980, + "non-padded": 0, + 
"effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-sociology|5": { + "hashes": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "828999f7624cbe7e", + "hash_cont_tokens": "c3a3bdfd177eed5b" + }, + "truncated": 0, + "non-truncated": 804, + "padded": 804, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hashes": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "42054621e718dbee", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-virology|5": { + "hashes": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "6c4f0aa4dc859c04", + "hash_cont_tokens": "af8b3658088cb37f" + }, + "truncated": 0, + "non-truncated": 664, + "padded": 664, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-world_religions|5": { + "hashes": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "6c75d44e092ff24f", + "hash_cont_tokens": "060118bef6de4e0a" + }, + "truncated": 0, + "non-truncated": 684, + "padded": 684, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|truthfulqa:mc|0": { + "hashes": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "2738d7ed7075faa7", + "hash_cont_tokens": "f5da56a132aab151" + }, + "truncated": 0, + "non-truncated": 9996, + "padded": 9996, + "non-padded": 0, + "effective_few_shots": 0.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "d84d18e9a963753d", + "hash_full_prompts": "12b540783521a8e6", + "hash_input_tokens": "5c73a7dce6ccf737", + "hash_cont_tokens": "71d56183130fecbd" + }, + "total_evaluation_time_secondes": "6372.084060668945", + "truncated": 0, + "non-truncated": 111019, + "padded": 110926, + "non-padded": 93, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/migtissera/Synthia-13B-v1.2/results_2023-11-04T18-02-40.204522.json b/eval-results/migtissera/Synthia-13B-v1.2/results_2023-11-04T18-02-40.204522.json new file mode 100644 index 0000000000000000000000000000000000000000..e04e832d3cfd495fd1843ba75b3d0ec98987e154 --- /dev/null +++ b/eval-results/migtissera/Synthia-13B-v1.2/results_2023-11-04T18-02-40.204522.json @@ -0,0 +1,107 @@ +{ + "config_general": { + "lighteval_sha": "167773f1d5d1647c60dadc31c9e731ab7dbcbbad", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "", + "model_name": "migtissera/Synthia-13B-v1.2", + "model_sha": "897268239bf7329b30977ea1beb319b856b578e6", + "model_dtype": "torch.float16", + "model_size": "24.32 GB" + }, + "results": { + "harness|drop|3": { + "em": 0.1863464765100671, + "em_stderr": 0.003987677232655252, + "f1": 0.2547860738255037, + "f1_stderr": 0.004029636733616552 + }, + "harness|gsm8k|5": { + "acc": 0.10993176648976498, + "acc_stderr": 0.008616195587865394 + }, + "harness|winogrande|5": { + "acc": 0.7647987371744278, + "acc_stderr": 0.011920008163650877 + }, + "all": { + "em": 0.1863464765100671, + "em_stderr": 
0.003987677232655252, + "f1": 0.2547860738255037, + "f1_stderr": 0.004029636733616552, + "acc": 0.4373652518320964, + "acc_stderr": 0.010268101875758134 + } + }, + "versions": { + "all": 0, + "harness|drop|3": 1, + "harness|gsm8k|5": 0, + "harness|winogrande|5": 0 + }, + "config_tasks": { + "harness|drop": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|drop|3": { + "hashes": { + "hash_examples": "1d27416e8324e9a3", + "hash_full_prompts": "a5513ff9a741b385", + "hash_input_tokens": "42076f0efbb50aa6", + "hash_cont_tokens": "33e1c02d184c25be" + }, + "truncated": 3, + "non_truncated": 9533, + "padded": 0, + "non_padded": 9536, + "effective_few_shots": 3.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "bda342e47b5099b2", + "hash_cont_tokens": "24379bfe5cef3a6f" + }, + "truncated": 0, + "non_truncated": 1319, + "padded": 0, + "non_padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "c0bedf98cb040854", + "hash_cont_tokens": "f08975ad6f2d5864" + }, + "truncated": 0, + "non_truncated": 1267, + "padded": 2432, + "non_padded": 102, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "9b4d8993161e637d", + "hash_full_prompts": "08215e527b7e60a5", + "hash_input_tokens": "a12f3e3c934bd78b", + "hash_cont_tokens": "4ef8b15e51415fae" + }, + "truncated": 3, + "non_truncated": 12119, + "padded": 2432, + "non_padded": 10957, + "num_truncated_few_shots": 0, + "total_evaluation_time_secondes": 0 + } +} \ No newline at end of file diff --git a/eval-results/migtissera/Synthia-13B-v1.2/results_2023-11-06T21-31-35.338838.json b/eval-results/migtissera/Synthia-13B-v1.2/results_2023-11-06T21-31-35.338838.json new file mode 100644 index 0000000000000000000000000000000000000000..e04e832d3cfd495fd1843ba75b3d0ec98987e154 --- /dev/null +++ b/eval-results/migtissera/Synthia-13B-v1.2/results_2023-11-06T21-31-35.338838.json @@ -0,0 +1,107 @@ +{ + "config_general": { + "lighteval_sha": "167773f1d5d1647c60dadc31c9e731ab7dbcbbad", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "", + "model_name": "migtissera/Synthia-13B-v1.2", + "model_sha": "897268239bf7329b30977ea1beb319b856b578e6", + "model_dtype": "torch.float16", + "model_size": "24.32 GB" + }, + "results": { + "harness|drop|3": { + "em": 0.1863464765100671, + "em_stderr": 0.003987677232655252, + "f1": 0.2547860738255037, + "f1_stderr": 0.004029636733616552 + }, + "harness|gsm8k|5": { + "acc": 0.10993176648976498, + "acc_stderr": 0.008616195587865394 + }, + "harness|winogrande|5": { + "acc": 0.7647987371744278, + "acc_stderr": 0.011920008163650877 + }, + "all": { + "em": 0.1863464765100671, + "em_stderr": 0.003987677232655252, + "f1": 0.2547860738255037, + "f1_stderr": 0.004029636733616552, + "acc": 0.4373652518320964, + "acc_stderr": 0.010268101875758134 + } + }, + "versions": { + "all": 0, + "harness|drop|3": 1, + "harness|gsm8k|5": 0, + "harness|winogrande|5": 0 + }, + "config_tasks": { + "harness|drop": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + 
"harness|drop|3": { + "hashes": { + "hash_examples": "1d27416e8324e9a3", + "hash_full_prompts": "a5513ff9a741b385", + "hash_input_tokens": "42076f0efbb50aa6", + "hash_cont_tokens": "33e1c02d184c25be" + }, + "truncated": 3, + "non_truncated": 9533, + "padded": 0, + "non_padded": 9536, + "effective_few_shots": 3.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "bda342e47b5099b2", + "hash_cont_tokens": "24379bfe5cef3a6f" + }, + "truncated": 0, + "non_truncated": 1319, + "padded": 0, + "non_padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "c0bedf98cb040854", + "hash_cont_tokens": "f08975ad6f2d5864" + }, + "truncated": 0, + "non_truncated": 1267, + "padded": 2432, + "non_padded": 102, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "9b4d8993161e637d", + "hash_full_prompts": "08215e527b7e60a5", + "hash_input_tokens": "a12f3e3c934bd78b", + "hash_cont_tokens": "4ef8b15e51415fae" + }, + "truncated": 3, + "non_truncated": 12119, + "padded": 2432, + "non_padded": 10957, + "num_truncated_few_shots": 0, + "total_evaluation_time_secondes": 0 + } +} \ No newline at end of file diff --git a/eval-results/migtissera/Synthia-13B/results_2023-08-18T07-48-14.366837.json b/eval-results/migtissera/Synthia-13B/results_2023-08-18T07-48-14.366837.json new file mode 100644 index 0000000000000000000000000000000000000000..dd27a8897fbdb847f000a91bf9dfbfa8e6b86f09 --- /dev/null +++ b/eval-results/migtissera/Synthia-13B/results_2023-08-18T07-48-14.366837.json @@ -0,0 +1,1365 @@ +{ + "results": { + "harness|arc:challenge|25": { + "acc": 0.5546075085324232, + "acc_stderr": 0.014523987638344081, + "acc_norm": 0.5998293515358362, + "acc_norm_stderr": 0.014317197787809181 + }, + "harness|hellaswag|10": { + "acc": 0.6237801234813782, + "acc_stderr": 0.004834461997944859, + "acc_norm": 0.8185620394343757, + "acc_norm_stderr": 0.003845930169643794 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.33, + "acc_stderr": 0.04725815626252606, + "acc_norm": 0.33, + "acc_norm_stderr": 0.04725815626252606 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.45925925925925926, + "acc_stderr": 0.04304979692464243, + "acc_norm": 0.45925925925925926, + "acc_norm_stderr": 0.04304979692464243 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.5657894736842105, + "acc_stderr": 0.04033565667848319, + "acc_norm": 0.5657894736842105, + "acc_norm_stderr": 0.04033565667848319 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.54, + "acc_stderr": 0.05009082659620332, + "acc_norm": 0.54, + "acc_norm_stderr": 0.05009082659620332 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.5849056603773585, + "acc_stderr": 0.03032594578928611, + "acc_norm": 0.5849056603773585, + "acc_norm_stderr": 0.03032594578928611 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.5972222222222222, + "acc_stderr": 0.04101405519842425, + "acc_norm": 0.5972222222222222, + "acc_norm_stderr": 0.04101405519842425 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.36, + "acc_stderr": 0.048241815132442176, + "acc_norm": 0.36, + "acc_norm_stderr": 0.048241815132442176 + }, + "harness|hendrycksTest-college_computer_science|5": { + 
"acc": 0.48, + "acc_stderr": 0.050211673156867795, + "acc_norm": 0.48, + "acc_norm_stderr": 0.050211673156867795 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.41, + "acc_stderr": 0.049431107042371025, + "acc_norm": 0.41, + "acc_norm_stderr": 0.049431107042371025 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.5086705202312138, + "acc_stderr": 0.03811890988940412, + "acc_norm": 0.5086705202312138, + "acc_norm_stderr": 0.03811890988940412 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.28431372549019607, + "acc_stderr": 0.04488482852329017, + "acc_norm": 0.28431372549019607, + "acc_norm_stderr": 0.04488482852329017 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.69, + "acc_stderr": 0.04648231987117316, + "acc_norm": 0.69, + "acc_norm_stderr": 0.04648231987117316 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.4127659574468085, + "acc_stderr": 0.03218471141400351, + "acc_norm": 0.4127659574468085, + "acc_norm_stderr": 0.03218471141400351 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.30701754385964913, + "acc_stderr": 0.04339138322579861, + "acc_norm": 0.30701754385964913, + "acc_norm_stderr": 0.04339138322579861 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.5448275862068965, + "acc_stderr": 0.04149886942192118, + "acc_norm": 0.5448275862068965, + "acc_norm_stderr": 0.04149886942192118 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.3412698412698413, + "acc_stderr": 0.024419234966819064, + "acc_norm": 0.3412698412698413, + "acc_norm_stderr": 0.024419234966819064 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.3968253968253968, + "acc_stderr": 0.04375888492727061, + "acc_norm": 0.3968253968253968, + "acc_norm_stderr": 0.04375888492727061 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.34, + "acc_stderr": 0.04760952285695235, + "acc_norm": 0.34, + "acc_norm_stderr": 0.04760952285695235 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.6548387096774193, + "acc_stderr": 0.027045746573534327, + "acc_norm": 0.6548387096774193, + "acc_norm_stderr": 0.027045746573534327 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.4039408866995074, + "acc_stderr": 0.03452453903822039, + "acc_norm": 0.4039408866995074, + "acc_norm_stderr": 0.03452453903822039 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.57, + "acc_stderr": 0.04975698519562428, + "acc_norm": 0.57, + "acc_norm_stderr": 0.04975698519562428 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.6909090909090909, + "acc_stderr": 0.036085410115739666, + "acc_norm": 0.6909090909090909, + "acc_norm_stderr": 0.036085410115739666 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.702020202020202, + "acc_stderr": 0.03258630383836557, + "acc_norm": 0.702020202020202, + "acc_norm_stderr": 0.03258630383836557 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.7979274611398963, + "acc_stderr": 0.028979089794296732, + "acc_norm": 0.7979274611398963, + "acc_norm_stderr": 0.028979089794296732 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.5076923076923077, + "acc_stderr": 0.025348006031534778, + "acc_norm": 0.5076923076923077, + "acc_norm_stderr": 0.025348006031534778 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.2962962962962963, + "acc_stderr": 0.027840811495871923, + "acc_norm": 0.2962962962962963, + 
"acc_norm_stderr": 0.027840811495871923 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.5630252100840336, + "acc_stderr": 0.032219436365661956, + "acc_norm": 0.5630252100840336, + "acc_norm_stderr": 0.032219436365661956 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.3443708609271523, + "acc_stderr": 0.038796870240733264, + "acc_norm": 0.3443708609271523, + "acc_norm_stderr": 0.038796870240733264 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.7651376146788991, + "acc_stderr": 0.018175110510343574, + "acc_norm": 0.7651376146788991, + "acc_norm_stderr": 0.018175110510343574 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.44907407407407407, + "acc_stderr": 0.03392238405321616, + "acc_norm": 0.44907407407407407, + "acc_norm_stderr": 0.03392238405321616 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.7696078431372549, + "acc_stderr": 0.029554292605695066, + "acc_norm": 0.7696078431372549, + "acc_norm_stderr": 0.029554292605695066 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.7679324894514767, + "acc_stderr": 0.02747974455080851, + "acc_norm": 0.7679324894514767, + "acc_norm_stderr": 0.02747974455080851 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.6547085201793722, + "acc_stderr": 0.03191100192835794, + "acc_norm": 0.6547085201793722, + "acc_norm_stderr": 0.03191100192835794 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.6106870229007634, + "acc_stderr": 0.04276486542814591, + "acc_norm": 0.6106870229007634, + "acc_norm_stderr": 0.04276486542814591 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.743801652892562, + "acc_stderr": 0.03984979653302872, + "acc_norm": 0.743801652892562, + "acc_norm_stderr": 0.03984979653302872 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.7314814814814815, + "acc_stderr": 0.042844679680521934, + "acc_norm": 0.7314814814814815, + "acc_norm_stderr": 0.042844679680521934 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.7055214723926381, + "acc_stderr": 0.03581165790474082, + "acc_norm": 0.7055214723926381, + "acc_norm_stderr": 0.03581165790474082 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.35714285714285715, + "acc_stderr": 0.04547960999764377, + "acc_norm": 0.35714285714285715, + "acc_norm_stderr": 0.04547960999764377 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.6796116504854369, + "acc_stderr": 0.04620284082280041, + "acc_norm": 0.6796116504854369, + "acc_norm_stderr": 0.04620284082280041 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.811965811965812, + "acc_stderr": 0.025598193686652247, + "acc_norm": 0.811965811965812, + "acc_norm_stderr": 0.025598193686652247 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.58, + "acc_stderr": 0.049604496374885836, + "acc_norm": 0.58, + "acc_norm_stderr": 0.049604496374885836 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.7624521072796935, + "acc_stderr": 0.015218733046150193, + "acc_norm": 0.7624521072796935, + "acc_norm_stderr": 0.015218733046150193 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.615606936416185, + "acc_stderr": 0.02618966696627204, + "acc_norm": 0.615606936416185, + "acc_norm_stderr": 0.02618966696627204 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.4759776536312849, + "acc_stderr": 0.016703190189300186, + "acc_norm": 0.4759776536312849, + "acc_norm_stderr": 0.016703190189300186 + }, + 
"harness|hendrycksTest-nutrition|5": { + "acc": 0.5882352941176471, + "acc_stderr": 0.02818059632825929, + "acc_norm": 0.5882352941176471, + "acc_norm_stderr": 0.02818059632825929 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.6141479099678456, + "acc_stderr": 0.027648149599751464, + "acc_norm": 0.6141479099678456, + "acc_norm_stderr": 0.027648149599751464 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.6604938271604939, + "acc_stderr": 0.026348564412011624, + "acc_norm": 0.6604938271604939, + "acc_norm_stderr": 0.026348564412011624 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.41134751773049644, + "acc_stderr": 0.02935491115994099, + "acc_norm": 0.41134751773049644, + "acc_norm_stderr": 0.02935491115994099 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.4002607561929596, + "acc_stderr": 0.012513582529136213, + "acc_norm": 0.4002607561929596, + "acc_norm_stderr": 0.012513582529136213 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.5330882352941176, + "acc_stderr": 0.030306257722468307, + "acc_norm": 0.5330882352941176, + "acc_norm_stderr": 0.030306257722468307 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.5784313725490197, + "acc_stderr": 0.01997742260022747, + "acc_norm": 0.5784313725490197, + "acc_norm_stderr": 0.01997742260022747 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.6363636363636364, + "acc_stderr": 0.046075820907199756, + "acc_norm": 0.6363636363636364, + "acc_norm_stderr": 0.046075820907199756 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.6244897959183674, + "acc_stderr": 0.03100120903989484, + "acc_norm": 0.6244897959183674, + "acc_norm_stderr": 0.03100120903989484 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.7213930348258707, + "acc_stderr": 0.031700561834973086, + "acc_norm": 0.7213930348258707, + "acc_norm_stderr": 0.031700561834973086 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.82, + "acc_stderr": 0.03861229196653693, + "acc_norm": 0.82, + "acc_norm_stderr": 0.03861229196653693 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.43373493975903615, + "acc_stderr": 0.03858158940685517, + "acc_norm": 0.43373493975903615, + "acc_norm_stderr": 0.03858158940685517 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.7953216374269005, + "acc_stderr": 0.03094445977853321, + "acc_norm": 0.7953216374269005, + "acc_norm_stderr": 0.03094445977853321 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.3378212974296206, + "mc1_stderr": 0.016557167322516882, + "mc2": 0.4741242810932586, + "mc2_stderr": 0.015240307440730938 + }, + "all": { + "acc": 0.562038864233033, + "acc_stderr": 0.034464491996069525, + "acc_norm": 0.5661067245543279, + "acc_norm_stderr": 0.03444423230659774, + "mc1": 0.3378212974296206, + "mc1_stderr": 0.016557167322516882, + "mc2": 0.4741242810932586, + "mc2_stderr": 0.015240307440730938 + } + }, + "versions": { + "harness|arc:challenge|25": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + 
"harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "all": 0 + }, + "config_general": { + "model_name": "migtissera/Synthia-13B", + "model_sha": "fbb23bc41438b016f1df1e9180c6c350a03557ea", + "model_dtype": "torch.float16", + "lighteval_sha": "8bab069fee0c6e75ffa4c1ef8a9591c28ee0e049", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null + }, + "config_tasks": { + "harness|arc:challenge": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + 
"harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task" + }, + "summary_tasks": { + "harness|arc:challenge|25": { + "hashes": { + "hash_examples": "17b0cae357c0259e", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "3722289b79076c44", + "hash_cont_tokens": "8210decc6ff6f7df" + }, + "truncated": 0, + "non-truncated": 4687, + "padded": 4687, + "non-padded": 0, + "effective_few_shots": 25.0, + "num_truncated_few_shots": 0 + }, + "harness|hellaswag|10": { + "hashes": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": 
"ececd684171f1ef2", + "hash_cont_tokens": "b3b9e9017afa63af" + }, + "truncated": 0, + "non-truncated": 40168, + "padded": 40113, + "non-padded": 55, + "effective_few_shots": 10.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hashes": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "c54ff61ad0273dd7", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-anatomy|5": { + "hashes": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "be31a1e22aef5f90", + "hash_cont_tokens": "f11971a765cb609f" + }, + "truncated": 0, + "non-truncated": 540, + "padded": 540, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-astronomy|5": { + "hashes": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "277a7b1fad566940", + "hash_cont_tokens": "bf30e5d3f48250cb" + }, + "truncated": 0, + "non-truncated": 608, + "padded": 608, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-business_ethics|5": { + "hashes": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "ba552605bc116de5", + "hash_cont_tokens": "bc1dd9b2d995eb61" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hashes": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "428c7563d0b98ab9", + "hash_cont_tokens": "890a119624b3b935" + }, + "truncated": 0, + "non-truncated": 1060, + "padded": 1060, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_biology|5": { + "hashes": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "da036601573942e2", + "hash_cont_tokens": "875cde3af7a0ee14" + }, + "truncated": 0, + "non-truncated": 576, + "padded": 576, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_chemistry|5": { + "hashes": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "94e0196d6aded13d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_computer_science|5": { + "hashes": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "6e4d0f4a8d36690b", + "hash_cont_tokens": "ffc0fe414cdc4a83" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_mathematics|5": { + "hashes": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "614054d17109a25d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + 
"effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_medicine|5": { + "hashes": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "081bb2b524defd1c", + "hash_cont_tokens": "1f88b00d41957d82" + }, + "truncated": 0, + "non-truncated": 692, + "padded": 692, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_physics|5": { + "hashes": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "5421d9a1af86cbd4", + "hash_cont_tokens": "f7b8097afc16a47c" + }, + "truncated": 0, + "non-truncated": 408, + "padded": 408, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-computer_security|5": { + "hashes": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "5e6b70ecb333cf18", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hashes": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "c2ef11a87264ceed", + "hash_cont_tokens": "aa0e8bc655f2f641" + }, + "truncated": 0, + "non-truncated": 940, + "padded": 940, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-econometrics|5": { + "hashes": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "ecaccd912a4c3978", + "hash_cont_tokens": "bfb7e3c3c88313f1" + }, + "truncated": 0, + "non-truncated": 456, + "padded": 456, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hashes": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "1590c84291399be8", + "hash_cont_tokens": "2425a3f084a591ef" + }, + "truncated": 0, + "non-truncated": 580, + "padded": 580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hashes": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "3269597f715b0da1", + "hash_cont_tokens": "f52691aef15a407b" + }, + "truncated": 0, + "non-truncated": 1512, + "padded": 1512, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-formal_logic|5": { + "hashes": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "a2800d20f3ab8d7c", + "hash_cont_tokens": "f515d598d9c21263" + }, + "truncated": 0, + "non-truncated": 504, + "padded": 504, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-global_facts|5": { + "hashes": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "94ed44b3772505ad", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_biology|5": { + "hashes": { + "hash_examples": 
"a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "24423acb928db768", + "hash_cont_tokens": "bd85a4156a3613ee" + }, + "truncated": 0, + "non-truncated": 1240, + "padded": 1240, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hashes": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "831ff35c474e5cef", + "hash_cont_tokens": "a95c97af1c14e068" + }, + "truncated": 0, + "non-truncated": 812, + "padded": 812, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hashes": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "a20a96b44dcc5b30", + "hash_cont_tokens": "8abfedef914e33c9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hashes": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "5002f4ac8b1562ca", + "hash_cont_tokens": "674fc454bdc5ac93" + }, + "truncated": 0, + "non-truncated": 660, + "padded": 656, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_geography|5": { + "hashes": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "7c5547c7da5bc793", + "hash_cont_tokens": "03a5012b916274ea" + }, + "truncated": 0, + "non-truncated": 792, + "padded": 792, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hashes": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "f62991cb6a496b05", + "hash_cont_tokens": "a83effb8f76b7d7c" + }, + "truncated": 0, + "non-truncated": 772, + "padded": 772, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hashes": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "4cef2aff6e3d59ed", + "hash_cont_tokens": "c583432ad27fcfe0" + }, + "truncated": 0, + "non-truncated": 1560, + "padded": 1560, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hashes": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "6e2577ea4082ed2b", + "hash_cont_tokens": "24f5dc613660300b" + }, + "truncated": 0, + "non-truncated": 1080, + "padded": 1080, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hashes": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "c5fc9aeb1079c8e4", + "hash_cont_tokens": "f47f041de50333b9" + }, + "truncated": 0, + "non-truncated": 952, + "padded": 952, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_physics|5": { + "hashes": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + 
"hash_input_tokens": "555fc385cffa84ca", + "hash_cont_tokens": "ba2efcd283e938cc" + }, + "truncated": 0, + "non-truncated": 604, + "padded": 604, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hashes": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "febd23cbf9973b7f", + "hash_cont_tokens": "942069cd363844d9" + }, + "truncated": 0, + "non-truncated": 2180, + "padded": 2180, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hashes": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "400e55b56ee6fbd7", + "hash_cont_tokens": "955ed42b6f7fa019" + }, + "truncated": 0, + "non-truncated": 864, + "padded": 864, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hashes": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "c639cce12a46ebad", + "hash_cont_tokens": "cdd0b3dc06d933e5" + }, + "truncated": 0, + "non-truncated": 816, + "padded": 816, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hashes": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "b9762065cce6f3a6", + "hash_cont_tokens": "9a864184946033ac" + }, + "truncated": 0, + "non-truncated": 948, + "padded": 948, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_aging|5": { + "hashes": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "541a75f071dcf579", + "hash_cont_tokens": "142a4a8a1138a214" + }, + "truncated": 0, + "non-truncated": 892, + "padded": 892, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_sexuality|5": { + "hashes": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "04269e5c5a257dd9", + "hash_cont_tokens": "bc54813e809b796d" + }, + "truncated": 0, + "non-truncated": 524, + "padded": 524, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-international_law|5": { + "hashes": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "d93ba9d9d38e4397", + "hash_cont_tokens": "dc45b45fcda18e5d" + }, + "truncated": 0, + "non-truncated": 484, + "padded": 484, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-jurisprudence|5": { + "hashes": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "9eeaccd2698b4f5a", + "hash_cont_tokens": "e3a8cd951b6e3469" + }, + "truncated": 0, + "non-truncated": 432, + "padded": 432, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hashes": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "b4f08f544f2b7576", + "hash_cont_tokens": "1e80dbd30f6453d5" + }, + "truncated": 0, + "non-truncated": 652, + 
"padded": 648, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-machine_learning|5": { + "hashes": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "900c2a51f1174b9f", + "hash_cont_tokens": "9b37da7777378ca9" + }, + "truncated": 0, + "non-truncated": 448, + "padded": 448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-management|5": { + "hashes": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "6b36efb4689c6eca", + "hash_cont_tokens": "a01d6d39a83c4597" + }, + "truncated": 0, + "non-truncated": 412, + "padded": 412, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-marketing|5": { + "hashes": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "2aaac78a0cfed47a", + "hash_cont_tokens": "6aeaed4d823c98aa" + }, + "truncated": 0, + "non-truncated": 936, + "padded": 936, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-medical_genetics|5": { + "hashes": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "886ca823b41c094a", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-miscellaneous|5": { + "hashes": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "72fd71de7675e7d0", + "hash_cont_tokens": "9b0ab02a64603081" + }, + "truncated": 0, + "non-truncated": 3132, + "padded": 3132, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_disputes|5": { + "hashes": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "f3ca0dd8e7a1eb09", + "hash_cont_tokens": "8badf768f7b0467a" + }, + "truncated": 0, + "non-truncated": 1384, + "padded": 1354, + "non-padded": 30, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hashes": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "3e793631e951f23c", + "hash_cont_tokens": "32ae620376b2bbba" + }, + "truncated": 0, + "non-truncated": 3580, + "padded": 3580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-nutrition|5": { + "hashes": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "59753c2144ea93af", + "hash_cont_tokens": "3071def75bacc404" + }, + "truncated": 0, + "non-truncated": 1224, + "padded": 1224, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-philosophy|5": { + "hashes": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "bd8d3dbed15a8c34", + "hash_cont_tokens": "9f6ff69d23a48783" + }, + "truncated": 0, + "non-truncated": 1244, + "padded": 1244, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-prehistory|5": { + "hashes": { + "hash_examples": 
"8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "3573cd87facbb7c5", + "hash_cont_tokens": "de469d2b981e32a3" + }, + "truncated": 0, + "non-truncated": 1296, + "padded": 1296, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_accounting|5": { + "hashes": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "17e721bc1a7cbb47", + "hash_cont_tokens": "c46f74d2dfc7b13b" + }, + "truncated": 0, + "non-truncated": 1128, + "padded": 1128, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_law|5": { + "hashes": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "c9f7583fff66d361", + "hash_cont_tokens": "2e590029ef41fbcd" + }, + "truncated": 0, + "non-truncated": 6136, + "padded": 6136, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_medicine|5": { + "hashes": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "40a933f829116f8d", + "hash_cont_tokens": "fe35cfa9c6ca802e" + }, + "truncated": 0, + "non-truncated": 1088, + "padded": 1088, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_psychology|5": { + "hashes": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "0dfb73a8eb3f692c", + "hash_cont_tokens": "f020fbddf72c8652" + }, + "truncated": 0, + "non-truncated": 2448, + "padded": 2448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-public_relations|5": { + "hashes": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "1710c6ba4c9f3cbd", + "hash_cont_tokens": "568f585a259965c1" + }, + "truncated": 0, + "non-truncated": 440, + "padded": 440, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-security_studies|5": { + "hashes": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "32a03f1f22a6e103", + "hash_cont_tokens": "cc6fd7cccd64cd5d" + }, + "truncated": 0, + "non-truncated": 980, + "padded": 980, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-sociology|5": { + "hashes": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "828999f7624cbe7e", + "hash_cont_tokens": "c3a3bdfd177eed5b" + }, + "truncated": 0, + "non-truncated": 804, + "padded": 804, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hashes": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "42054621e718dbee", + "hash_cont_tokens": "2568d0e8e36fa959" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-virology|5": { + "hashes": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "6c4f0aa4dc859c04", + "hash_cont_tokens": "926cf60b0891f374" 
+ }, + "truncated": 0, + "non-truncated": 664, + "padded": 664, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-world_religions|5": { + "hashes": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "6c75d44e092ff24f", + "hash_cont_tokens": "c525a5de974c1ea3" + }, + "truncated": 0, + "non-truncated": 684, + "padded": 684, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|truthfulqa:mc|0": { + "hashes": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "2738d7ed7075faa7", + "hash_cont_tokens": "c014154380b74b9e" + }, + "truncated": 0, + "non-truncated": 9996, + "padded": 9996, + "non-padded": 0, + "effective_few_shots": 0.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "d84d18e9a963753d", + "hash_full_prompts": "12b540783521a8e6", + "hash_input_tokens": "5c73a7dce6ccf737", + "hash_cont_tokens": "fb1646e2bdd5fc38" + }, + "total_evaluation_time_secondes": "6280.784925222397", + "truncated": 0, + "non-truncated": 111019, + "padded": 110926, + "non-padded": 93, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/migtissera/Synthia-13B/results_2023-10-15T08-11-39.705325.json b/eval-results/migtissera/Synthia-13B/results_2023-10-15T08-11-39.705325.json new file mode 100644 index 0000000000000000000000000000000000000000..0bc156e81cc796ba46dc06c86132a0fc8e76d5e4 --- /dev/null +++ b/eval-results/migtissera/Synthia-13B/results_2023-10-15T08-11-39.705325.json @@ -0,0 +1,107 @@ +{ + "config_general": { + "model_name": "migtissera/Synthia-13B", + "model_sha": "41a2e61653dbc55d04516f201e36f6b0fdf20445", + "model_size": "24.32 GB", + "model_dtype": "torch.float16", + "lighteval_sha": "0f318ecf002208468154899217b3ba7c6ae09374", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|drop|3": { + "em": 0.007130872483221477, + "em_stderr": 0.0008617017796718602, + "f1": 0.07447776845637605, + "f1_stderr": 0.0016286126770648315 + }, + "harness|gsm8k|5": { + "acc": 0.10993176648976498, + "acc_stderr": 0.008616195587865404 + }, + "harness|winogrande|5": { + "acc": 0.760852407261247, + "acc_stderr": 0.011988541844843905 + }, + "all": { + "em": 0.007130872483221477, + "em_stderr": 0.0008617017796718602, + "f1": 0.07447776845637605, + "f1_stderr": 0.0016286126770648315, + "acc": 0.435392086875506, + "acc_stderr": 0.010302368716354655 + } + }, + "versions": { + "harness|drop|3": 1, + "harness|gsm8k|5": 0, + "harness|winogrande|5": 0, + "all": 0 + }, + "config_tasks": { + "harness|drop": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|drop|3": { + "hashes": { + "hash_examples": "1d27416e8324e9a3", + "hash_full_prompts": "a5513ff9a741b385", + "hash_input_tokens": "42076f0efbb50aa6", + "hash_cont_tokens": "d78762a7aff0a651" + }, + "truncated": 3, + "non-truncated": 9533, + "padded": 0, + "non-padded": 9536, + "effective_few_shots": 3.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "bda342e47b5099b2", + "hash_cont_tokens": "46f938ba357dd590" + }, + "truncated": 0, + "non-truncated": 1319, + "padded": 0, 
+ "non-padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "c0bedf98cb040854", + "hash_cont_tokens": "f08975ad6f2d5864" + }, + "truncated": 0, + "non-truncated": 2534, + "padded": 2432, + "non-padded": 102, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "9b4d8993161e637d", + "hash_full_prompts": "08215e527b7e60a5", + "hash_input_tokens": "a12f3e3c934bd78b", + "hash_cont_tokens": "01a5f2d8878c7e6f" + }, + "total_evaluation_time_secondes": "13025.633740901947", + "truncated": 3, + "non-truncated": 13386, + "padded": 2432, + "non-padded": 10957, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/migtissera/Synthia-34B-v1.2/results_2023-09-18T20-05-34.645170.json b/eval-results/migtissera/Synthia-34B-v1.2/results_2023-09-18T20-05-34.645170.json new file mode 100644 index 0000000000000000000000000000000000000000..3abed403f039d6ff524416d3939bca3a5f430e3d --- /dev/null +++ b/eval-results/migtissera/Synthia-34B-v1.2/results_2023-09-18T20-05-34.645170.json @@ -0,0 +1,1367 @@ +{ + "config_general": { + "model_name": "migtissera/Synthia-34B-v1.2", + "model_sha": "42c2e521c1de5f83f2d3f537ceac71ede63e988d", + "model_size": "63.23 GB", + "model_dtype": "torch.float16", + "lighteval_sha": "0f318ecf002208468154899217b3ba7c6ae09374", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|arc:challenge|25": { + "acc": 0.5119453924914675, + "acc_stderr": 0.014607220340597171, + "acc_norm": 0.5486348122866894, + "acc_norm_stderr": 0.01454210456995527 + }, + "harness|hellaswag|10": { + "acc": 0.5587532364070902, + "acc_stderr": 0.00495521278783238, + "acc_norm": 0.7432782314280024, + "acc_norm_stderr": 0.004359318206428689 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.26, + "acc_stderr": 0.0440844002276808, + "acc_norm": 0.26, + "acc_norm_stderr": 0.0440844002276808 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.43703703703703706, + "acc_stderr": 0.04284958639753399, + "acc_norm": 0.43703703703703706, + "acc_norm_stderr": 0.04284958639753399 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.506578947368421, + "acc_stderr": 0.040685900502249704, + "acc_norm": 0.506578947368421, + "acc_norm_stderr": 0.040685900502249704 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.55, + "acc_stderr": 0.05, + "acc_norm": 0.55, + "acc_norm_stderr": 0.05 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.5433962264150943, + "acc_stderr": 0.030656748696739428, + "acc_norm": 0.5433962264150943, + "acc_norm_stderr": 0.030656748696739428 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.5694444444444444, + "acc_stderr": 0.04140685639111502, + "acc_norm": 0.5694444444444444, + "acc_norm_stderr": 0.04140685639111502 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.46, + "acc_stderr": 0.05009082659620332, + "acc_norm": 0.46, + "acc_norm_stderr": 0.05009082659620332 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.47, + "acc_stderr": 0.050161355804659205, + "acc_norm": 0.47, + "acc_norm_stderr": 0.050161355804659205 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.3, + "acc_stderr": 0.046056618647183814, + "acc_norm": 
0.3, + "acc_norm_stderr": 0.046056618647183814 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.5028901734104047, + "acc_stderr": 0.03812400565974834, + "acc_norm": 0.5028901734104047, + "acc_norm_stderr": 0.03812400565974834 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.37254901960784315, + "acc_stderr": 0.04810840148082635, + "acc_norm": 0.37254901960784315, + "acc_norm_stderr": 0.04810840148082635 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.72, + "acc_stderr": 0.045126085985421296, + "acc_norm": 0.72, + "acc_norm_stderr": 0.045126085985421296 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.41702127659574467, + "acc_stderr": 0.03223276266711712, + "acc_norm": 0.41702127659574467, + "acc_norm_stderr": 0.03223276266711712 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.39473684210526316, + "acc_stderr": 0.045981880578165414, + "acc_norm": 0.39473684210526316, + "acc_norm_stderr": 0.045981880578165414 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.46206896551724136, + "acc_stderr": 0.04154659671707548, + "acc_norm": 0.46206896551724136, + "acc_norm_stderr": 0.04154659671707548 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.4021164021164021, + "acc_stderr": 0.025253032554997692, + "acc_norm": 0.4021164021164021, + "acc_norm_stderr": 0.025253032554997692 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.4365079365079365, + "acc_stderr": 0.04435932892851466, + "acc_norm": 0.4365079365079365, + "acc_norm_stderr": 0.04435932892851466 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.38, + "acc_stderr": 0.04878317312145634, + "acc_norm": 0.38, + "acc_norm_stderr": 0.04878317312145634 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.6225806451612903, + "acc_stderr": 0.027575960723278243, + "acc_norm": 0.6225806451612903, + "acc_norm_stderr": 0.027575960723278243 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.3842364532019704, + "acc_stderr": 0.0342239856565755, + "acc_norm": 0.3842364532019704, + "acc_norm_stderr": 0.0342239856565755 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.67, + "acc_stderr": 0.04725815626252606, + "acc_norm": 0.67, + "acc_norm_stderr": 0.04725815626252606 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.6606060606060606, + "acc_stderr": 0.03697442205031596, + "acc_norm": 0.6606060606060606, + "acc_norm_stderr": 0.03697442205031596 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.6666666666666666, + "acc_stderr": 0.033586181457325226, + "acc_norm": 0.6666666666666666, + "acc_norm_stderr": 0.033586181457325226 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.7305699481865285, + "acc_stderr": 0.032018671228777947, + "acc_norm": 0.7305699481865285, + "acc_norm_stderr": 0.032018671228777947 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.5128205128205128, + "acc_stderr": 0.02534267129380725, + "acc_norm": 0.5128205128205128, + "acc_norm_stderr": 0.02534267129380725 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.3037037037037037, + "acc_stderr": 0.028037929969114982, + "acc_norm": 0.3037037037037037, + "acc_norm_stderr": 0.028037929969114982 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.5672268907563025, + "acc_stderr": 0.032183581077426124, + "acc_norm": 0.5672268907563025, + "acc_norm_stderr": 
0.032183581077426124 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.39072847682119205, + "acc_stderr": 0.03983798306659806, + "acc_norm": 0.39072847682119205, + "acc_norm_stderr": 0.03983798306659806 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.6954128440366972, + "acc_stderr": 0.019732299420354052, + "acc_norm": 0.6954128440366972, + "acc_norm_stderr": 0.019732299420354052 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.4027777777777778, + "acc_stderr": 0.033448873829978666, + "acc_norm": 0.4027777777777778, + "acc_norm_stderr": 0.033448873829978666 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.7107843137254902, + "acc_stderr": 0.031822318676475544, + "acc_norm": 0.7107843137254902, + "acc_norm_stderr": 0.031822318676475544 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.7172995780590717, + "acc_stderr": 0.02931281415395593, + "acc_norm": 0.7172995780590717, + "acc_norm_stderr": 0.02931281415395593 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.5426008968609866, + "acc_stderr": 0.033435777055830646, + "acc_norm": 0.5426008968609866, + "acc_norm_stderr": 0.033435777055830646 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.5343511450381679, + "acc_stderr": 0.043749285605997376, + "acc_norm": 0.5343511450381679, + "acc_norm_stderr": 0.043749285605997376 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.6942148760330579, + "acc_stderr": 0.04205953933884123, + "acc_norm": 0.6942148760330579, + "acc_norm_stderr": 0.04205953933884123 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.6481481481481481, + "acc_stderr": 0.04616631111801714, + "acc_norm": 0.6481481481481481, + "acc_norm_stderr": 0.04616631111801714 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.6380368098159509, + "acc_stderr": 0.037757007291414416, + "acc_norm": 0.6380368098159509, + "acc_norm_stderr": 0.037757007291414416 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.38392857142857145, + "acc_stderr": 0.04616143075028547, + "acc_norm": 0.38392857142857145, + "acc_norm_stderr": 0.04616143075028547 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.6601941747572816, + "acc_stderr": 0.046897659372781335, + "acc_norm": 0.6601941747572816, + "acc_norm_stderr": 0.046897659372781335 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.7692307692307693, + "acc_stderr": 0.0276019213814176, + "acc_norm": 0.7692307692307693, + "acc_norm_stderr": 0.0276019213814176 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.51, + "acc_stderr": 0.05024183937956912, + "acc_norm": 0.51, + "acc_norm_stderr": 0.05024183937956912 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.6743295019157088, + "acc_stderr": 0.016757989458549675, + "acc_norm": 0.6743295019157088, + "acc_norm_stderr": 0.016757989458549675 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.5867052023121387, + "acc_stderr": 0.02651126136940925, + "acc_norm": 0.5867052023121387, + "acc_norm_stderr": 0.02651126136940925 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.35977653631284917, + "acc_stderr": 0.016051419760310263, + "acc_norm": 0.35977653631284917, + "acc_norm_stderr": 0.016051419760310263 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.5294117647058824, + "acc_stderr": 0.028580341065138296, + "acc_norm": 0.5294117647058824, + "acc_norm_stderr": 0.028580341065138296 + }, + "harness|hendrycksTest-philosophy|5": { + 
"acc": 0.6141479099678456, + "acc_stderr": 0.027648149599751464, + "acc_norm": 0.6141479099678456, + "acc_norm_stderr": 0.027648149599751464 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.5524691358024691, + "acc_stderr": 0.027667138569422708, + "acc_norm": 0.5524691358024691, + "acc_norm_stderr": 0.027667138569422708 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.37943262411347517, + "acc_stderr": 0.028947338851614105, + "acc_norm": 0.37943262411347517, + "acc_norm_stderr": 0.028947338851614105 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.378748370273794, + "acc_stderr": 0.012389052105003732, + "acc_norm": 0.378748370273794, + "acc_norm_stderr": 0.012389052105003732 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.40808823529411764, + "acc_stderr": 0.029855261393483924, + "acc_norm": 0.40808823529411764, + "acc_norm_stderr": 0.029855261393483924 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.48366013071895425, + "acc_stderr": 0.020217030653186457, + "acc_norm": 0.48366013071895425, + "acc_norm_stderr": 0.020217030653186457 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.5727272727272728, + "acc_stderr": 0.047381987035454834, + "acc_norm": 0.5727272727272728, + "acc_norm_stderr": 0.047381987035454834 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.6448979591836734, + "acc_stderr": 0.030635655150387638, + "acc_norm": 0.6448979591836734, + "acc_norm_stderr": 0.030635655150387638 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.7313432835820896, + "acc_stderr": 0.031343283582089536, + "acc_norm": 0.7313432835820896, + "acc_norm_stderr": 0.031343283582089536 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.7, + "acc_stderr": 0.046056618647183814, + "acc_norm": 0.7, + "acc_norm_stderr": 0.046056618647183814 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.4397590361445783, + "acc_stderr": 0.03864139923699122, + "acc_norm": 0.4397590361445783, + "acc_norm_stderr": 0.03864139923699122 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.6666666666666666, + "acc_stderr": 0.03615507630310935, + "acc_norm": 0.6666666666666666, + "acc_norm_stderr": 0.03615507630310935 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.2998776009791922, + "mc1_stderr": 0.01604035296671362, + "mc2": 0.4467341818408572, + "mc2_stderr": 0.014969799807071376 + }, + "all": { + "acc": 0.5320903185183409, + "acc_stderr": 0.03517517994960793, + "acc_norm": 0.5358397153796313, + "acc_norm_stderr": 0.03516397638431902, + "mc1": 0.2998776009791922, + "mc1_stderr": 0.01604035296671362, + "mc2": 0.4467341818408572, + "mc2_stderr": 0.014969799807071376 + } + }, + "versions": { + "harness|arc:challenge|25": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + 
"harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "all": 0 + }, + "config_tasks": { + "harness|arc:challenge": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + 
"harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task" + }, + "summary_tasks": { + "harness|arc:challenge|25": { + "hashes": { + "hash_examples": "17b0cae357c0259e", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "3722289b79076c44", + "hash_cont_tokens": "e8abf848493b50f7" + }, + "truncated": 0, + "non-truncated": 4687, + "padded": 4687, + "non-padded": 0, + "effective_few_shots": 25.0, + "num_truncated_few_shots": 0 + }, + "harness|hellaswag|10": { + "hashes": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "ececd684171f1ef2", + "hash_cont_tokens": "9fe0a5c42e1532db" + }, + "truncated": 0, + "non-truncated": 40168, + "padded": 40113, + "non-padded": 55, + "effective_few_shots": 10.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hashes": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "c54ff61ad0273dd7", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + 
"num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-anatomy|5": { + "hashes": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "be31a1e22aef5f90", + "hash_cont_tokens": "f11971a765cb609f" + }, + "truncated": 0, + "non-truncated": 540, + "padded": 540, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-astronomy|5": { + "hashes": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "277a7b1fad566940", + "hash_cont_tokens": "440a970fadecdc7b" + }, + "truncated": 0, + "non-truncated": 608, + "padded": 608, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-business_ethics|5": { + "hashes": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "ba552605bc116de5", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hashes": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "428c7563d0b98ab9", + "hash_cont_tokens": "7ecd60c25b9bfe5b" + }, + "truncated": 0, + "non-truncated": 1060, + "padded": 1060, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_biology|5": { + "hashes": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "da036601573942e2", + "hash_cont_tokens": "875cde3af7a0ee14" + }, + "truncated": 0, + "non-truncated": 576, + "padded": 576, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_chemistry|5": { + "hashes": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "94e0196d6aded13d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_computer_science|5": { + "hashes": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "6e4d0f4a8d36690b", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_mathematics|5": { + "hashes": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "614054d17109a25d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_medicine|5": { + "hashes": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "081bb2b524defd1c", + "hash_cont_tokens": "702fb6d82ff0d6ac" + }, + "truncated": 0, + "non-truncated": 692, + "padded": 692, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_physics|5": { + "hashes": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": 
"833a0d7b55aed500", + "hash_input_tokens": "5421d9a1af86cbd4", + "hash_cont_tokens": "f7b8097afc16a47c" + }, + "truncated": 0, + "non-truncated": 408, + "padded": 408, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-computer_security|5": { + "hashes": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "5e6b70ecb333cf18", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hashes": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "c2ef11a87264ceed", + "hash_cont_tokens": "aa0e8bc655f2f641" + }, + "truncated": 0, + "non-truncated": 940, + "padded": 940, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-econometrics|5": { + "hashes": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "ecaccd912a4c3978", + "hash_cont_tokens": "b1cc6e7e9fcd3827" + }, + "truncated": 0, + "non-truncated": 456, + "padded": 456, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hashes": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "1590c84291399be8", + "hash_cont_tokens": "2425a3f084a591ef" + }, + "truncated": 0, + "non-truncated": 580, + "padded": 580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hashes": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "3269597f715b0da1", + "hash_cont_tokens": "bd87bf0c060fd925" + }, + "truncated": 0, + "non-truncated": 1512, + "padded": 1512, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-formal_logic|5": { + "hashes": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "a2800d20f3ab8d7c", + "hash_cont_tokens": "eb8932890e0605db" + }, + "truncated": 0, + "non-truncated": 504, + "padded": 504, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-global_facts|5": { + "hashes": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "94ed44b3772505ad", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_biology|5": { + "hashes": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "24423acb928db768", + "hash_cont_tokens": "1ddcb86d28cde266" + }, + "truncated": 0, + "non-truncated": 1240, + "padded": 1240, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hashes": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "831ff35c474e5cef", + "hash_cont_tokens": "176c8dcff38c5f8f" + }, + "truncated": 0, + "non-truncated": 
812, + "padded": 812, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hashes": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "a20a96b44dcc5b30", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hashes": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "5002f4ac8b1562ca", + "hash_cont_tokens": "674fc454bdc5ac93" + }, + "truncated": 0, + "non-truncated": 660, + "padded": 656, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_geography|5": { + "hashes": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "7c5547c7da5bc793", + "hash_cont_tokens": "03a5012b916274ea" + }, + "truncated": 0, + "non-truncated": 792, + "padded": 792, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hashes": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "f62991cb6a496b05", + "hash_cont_tokens": "873d2aab226ba1d8" + }, + "truncated": 0, + "non-truncated": 772, + "padded": 772, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hashes": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "4cef2aff6e3d59ed", + "hash_cont_tokens": "c583432ad27fcfe0" + }, + "truncated": 0, + "non-truncated": 1560, + "padded": 1560, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hashes": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "6e2577ea4082ed2b", + "hash_cont_tokens": "d7907b61bcb8c123" + }, + "truncated": 0, + "non-truncated": 1080, + "padded": 1080, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hashes": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "c5fc9aeb1079c8e4", + "hash_cont_tokens": "f47f041de50333b9" + }, + "truncated": 0, + "non-truncated": 952, + "padded": 952, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_physics|5": { + "hashes": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "555fc385cffa84ca", + "hash_cont_tokens": "0d56317b3e5eedb5" + }, + "truncated": 0, + "non-truncated": 604, + "padded": 604, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hashes": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "febd23cbf9973b7f", + "hash_cont_tokens": "09ba1243e7390c0f" + }, + "truncated": 0, + "non-truncated": 2180, + "padded": 2180, + "non-padded": 0, + "effective_few_shots": 5.0, 
+ "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hashes": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "400e55b56ee6fbd7", + "hash_cont_tokens": "9cc29889c3d3f77d" + }, + "truncated": 0, + "non-truncated": 864, + "padded": 864, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hashes": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "c639cce12a46ebad", + "hash_cont_tokens": "cdd0b3dc06d933e5" + }, + "truncated": 0, + "non-truncated": 816, + "padded": 816, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hashes": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "b9762065cce6f3a6", + "hash_cont_tokens": "e02816433ff28daf" + }, + "truncated": 0, + "non-truncated": 948, + "padded": 948, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_aging|5": { + "hashes": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "541a75f071dcf579", + "hash_cont_tokens": "142a4a8a1138a214" + }, + "truncated": 0, + "non-truncated": 892, + "padded": 892, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_sexuality|5": { + "hashes": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "04269e5c5a257dd9", + "hash_cont_tokens": "bc54813e809b796d" + }, + "truncated": 0, + "non-truncated": 524, + "padded": 524, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-international_law|5": { + "hashes": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "d93ba9d9d38e4397", + "hash_cont_tokens": "8ea8c5ff76a15bca" + }, + "truncated": 0, + "non-truncated": 484, + "padded": 484, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-jurisprudence|5": { + "hashes": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "9eeaccd2698b4f5a", + "hash_cont_tokens": "e3a8cd951b6e3469" + }, + "truncated": 0, + "non-truncated": 432, + "padded": 432, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hashes": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "b4f08f544f2b7576", + "hash_cont_tokens": "3e9e0bdc248fd88a" + }, + "truncated": 0, + "non-truncated": 652, + "padded": 648, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-machine_learning|5": { + "hashes": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "900c2a51f1174b9f", + "hash_cont_tokens": "55b12fb138c6a064" + }, + "truncated": 0, + "non-truncated": 448, + "padded": 448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-management|5": { + "hashes": { + "hash_examples": "8c8a1e07a2151dca", + 
"hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "6b36efb4689c6eca", + "hash_cont_tokens": "a01d6d39a83c4597" + }, + "truncated": 0, + "non-truncated": 412, + "padded": 412, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-marketing|5": { + "hashes": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "2aaac78a0cfed47a", + "hash_cont_tokens": "6aeaed4d823c98aa" + }, + "truncated": 0, + "non-truncated": 936, + "padded": 936, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-medical_genetics|5": { + "hashes": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "886ca823b41c094a", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-miscellaneous|5": { + "hashes": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "72fd71de7675e7d0", + "hash_cont_tokens": "9b0ab02a64603081" + }, + "truncated": 0, + "non-truncated": 3132, + "padded": 3132, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_disputes|5": { + "hashes": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "f3ca0dd8e7a1eb09", + "hash_cont_tokens": "3b8bbe9108e55ce9" + }, + "truncated": 0, + "non-truncated": 1384, + "padded": 1354, + "non-padded": 30, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hashes": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "3e793631e951f23c", + "hash_cont_tokens": "3e9bfc0362e97330" + }, + "truncated": 0, + "non-truncated": 3580, + "padded": 3580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-nutrition|5": { + "hashes": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "59753c2144ea93af", + "hash_cont_tokens": "23b2dc6ee2da4cfc" + }, + "truncated": 0, + "non-truncated": 1224, + "padded": 1224, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-philosophy|5": { + "hashes": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "bd8d3dbed15a8c34", + "hash_cont_tokens": "9f6ff69d23a48783" + }, + "truncated": 0, + "non-truncated": 1244, + "padded": 1244, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-prehistory|5": { + "hashes": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "3573cd87facbb7c5", + "hash_cont_tokens": "d6458d743d875837" + }, + "truncated": 0, + "non-truncated": 1296, + "padded": 1296, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_accounting|5": { + "hashes": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "17e721bc1a7cbb47", + "hash_cont_tokens": "922a195f53a35662" + }, + "truncated": 0, + "non-truncated": 1128, 
+ "padded": 1128, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_law|5": { + "hashes": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "c9f7583fff66d361", + "hash_cont_tokens": "2e590029ef41fbcd" + }, + "truncated": 0, + "non-truncated": 6136, + "padded": 6136, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_medicine|5": { + "hashes": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "40a933f829116f8d", + "hash_cont_tokens": "7cfee54dbddd5a98" + }, + "truncated": 0, + "non-truncated": 1088, + "padded": 1088, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_psychology|5": { + "hashes": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "0dfb73a8eb3f692c", + "hash_cont_tokens": "a86677b2a45c20e1" + }, + "truncated": 0, + "non-truncated": 2448, + "padded": 2448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-public_relations|5": { + "hashes": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "1710c6ba4c9f3cbd", + "hash_cont_tokens": "0d756ccaae031757" + }, + "truncated": 0, + "non-truncated": 440, + "padded": 440, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-security_studies|5": { + "hashes": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "32a03f1f22a6e103", + "hash_cont_tokens": "b2229bc2cfbf594b" + }, + "truncated": 0, + "non-truncated": 980, + "padded": 980, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-sociology|5": { + "hashes": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "828999f7624cbe7e", + "hash_cont_tokens": "c3a3bdfd177eed5b" + }, + "truncated": 0, + "non-truncated": 804, + "padded": 804, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hashes": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "42054621e718dbee", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-virology|5": { + "hashes": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "6c4f0aa4dc859c04", + "hash_cont_tokens": "af8b3658088cb37f" + }, + "truncated": 0, + "non-truncated": 664, + "padded": 664, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-world_religions|5": { + "hashes": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "6c75d44e092ff24f", + "hash_cont_tokens": "060118bef6de4e0a" + }, + "truncated": 0, + "non-truncated": 684, + "padded": 684, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|truthfulqa:mc|0": { + "hashes": { + 
"hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "2738d7ed7075faa7", + "hash_cont_tokens": "f5da56a132aab151" + }, + "truncated": 0, + "non-truncated": 9996, + "padded": 9996, + "non-padded": 0, + "effective_few_shots": 0.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "d84d18e9a963753d", + "hash_full_prompts": "12b540783521a8e6", + "hash_input_tokens": "5c73a7dce6ccf737", + "hash_cont_tokens": "71d56183130fecbd" + }, + "total_evaluation_time_secondes": "23922.954869508743", + "truncated": 0, + "non-truncated": 111019, + "padded": 110926, + "non-padded": 93, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/migtissera/Synthia-70B-v1.1/results_2023-08-29T08-55-05.432450.json b/eval-results/migtissera/Synthia-70B-v1.1/results_2023-08-29T08-55-05.432450.json new file mode 100644 index 0000000000000000000000000000000000000000..82106dee8351e2dd79965a2ba17caef817634c70 --- /dev/null +++ b/eval-results/migtissera/Synthia-70B-v1.1/results_2023-08-29T08-55-05.432450.json @@ -0,0 +1,1366 @@ +{ + "config_general": { + "model_name": "migtissera/Synthia-70B-v1.1", + "model_sha": "05a13f6adfe95a713dff04dc2eaa214c77c2512a", + "model_dtype": "torch.float16", + "lighteval_sha": "80ee8075eb45365510e77cda067c7896f7f676aa", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|arc:challenge|25": { + "acc": 0.64419795221843, + "acc_stderr": 0.013990571137918762, + "acc_norm": 0.7005119453924915, + "acc_norm_stderr": 0.013385021637313572 + }, + "harness|hellaswag|10": { + "acc": 0.6804421429994025, + "acc_stderr": 0.00465352303836937, + "acc_norm": 0.8712407886875124, + "acc_norm_stderr": 0.003342487333262269 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.33, + "acc_stderr": 0.04725815626252605, + "acc_norm": 0.33, + "acc_norm_stderr": 0.04725815626252605 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.6518518518518519, + "acc_stderr": 0.041153246103369526, + "acc_norm": 0.6518518518518519, + "acc_norm_stderr": 0.041153246103369526 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.8355263157894737, + "acc_stderr": 0.03016753346863271, + "acc_norm": 0.8355263157894737, + "acc_norm_stderr": 0.03016753346863271 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.75, + "acc_stderr": 0.04351941398892446, + "acc_norm": 0.75, + "acc_norm_stderr": 0.04351941398892446 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.7245283018867924, + "acc_stderr": 0.02749566368372406, + "acc_norm": 0.7245283018867924, + "acc_norm_stderr": 0.02749566368372406 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.8263888888888888, + "acc_stderr": 0.03167473383795718, + "acc_norm": 0.8263888888888888, + "acc_norm_stderr": 0.03167473383795718 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.53, + "acc_stderr": 0.050161355804659205, + "acc_norm": 0.53, + "acc_norm_stderr": 0.050161355804659205 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.59, + "acc_stderr": 0.04943110704237102, + "acc_norm": 0.59, + "acc_norm_stderr": 0.04943110704237102 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.42, + "acc_stderr": 0.049604496374885836, + "acc_norm": 0.42, + "acc_norm_stderr": 0.049604496374885836 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.6878612716763006, + 
"acc_stderr": 0.035331333893236574, + "acc_norm": 0.6878612716763006, + "acc_norm_stderr": 0.035331333893236574 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.4019607843137255, + "acc_stderr": 0.04878608714466996, + "acc_norm": 0.4019607843137255, + "acc_norm_stderr": 0.04878608714466996 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.77, + "acc_stderr": 0.04229525846816506, + "acc_norm": 0.77, + "acc_norm_stderr": 0.04229525846816506 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.6638297872340425, + "acc_stderr": 0.030881618520676942, + "acc_norm": 0.6638297872340425, + "acc_norm_stderr": 0.030881618520676942 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.40350877192982454, + "acc_stderr": 0.04615186962583703, + "acc_norm": 0.40350877192982454, + "acc_norm_stderr": 0.04615186962583703 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.6620689655172414, + "acc_stderr": 0.039417076320648906, + "acc_norm": 0.6620689655172414, + "acc_norm_stderr": 0.039417076320648906 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.43386243386243384, + "acc_stderr": 0.02552503438247489, + "acc_norm": 0.43386243386243384, + "acc_norm_stderr": 0.02552503438247489 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.5079365079365079, + "acc_stderr": 0.044715725362943486, + "acc_norm": 0.5079365079365079, + "acc_norm_stderr": 0.044715725362943486 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.5, + "acc_stderr": 0.050251890762960605, + "acc_norm": 0.5, + "acc_norm_stderr": 0.050251890762960605 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.8225806451612904, + "acc_stderr": 0.021732540689329293, + "acc_norm": 0.8225806451612904, + "acc_norm_stderr": 0.021732540689329293 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.5467980295566502, + "acc_stderr": 0.03502544650845872, + "acc_norm": 0.5467980295566502, + "acc_norm_stderr": 0.03502544650845872 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.79, + "acc_stderr": 0.040936018074033256, + "acc_norm": 0.79, + "acc_norm_stderr": 0.040936018074033256 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.8242424242424242, + "acc_stderr": 0.02972094300622445, + "acc_norm": 0.8242424242424242, + "acc_norm_stderr": 0.02972094300622445 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.898989898989899, + "acc_stderr": 0.021469735576055343, + "acc_norm": 0.898989898989899, + "acc_norm_stderr": 0.021469735576055343 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.9326424870466321, + "acc_stderr": 0.018088393839078912, + "acc_norm": 0.9326424870466321, + "acc_norm_stderr": 0.018088393839078912 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.717948717948718, + "acc_stderr": 0.022815813098896597, + "acc_norm": 0.717948717948718, + "acc_norm_stderr": 0.022815813098896597 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.3148148148148148, + "acc_stderr": 0.02831753349606648, + "acc_norm": 0.3148148148148148, + "acc_norm_stderr": 0.02831753349606648 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.7521008403361344, + "acc_stderr": 0.028047967224176892, + "acc_norm": 0.7521008403361344, + "acc_norm_stderr": 0.028047967224176892 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.48344370860927155, + "acc_stderr": 0.040802441856289715, + 
"acc_norm": 0.48344370860927155, + "acc_norm_stderr": 0.040802441856289715 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.8899082568807339, + "acc_stderr": 0.013419939018681203, + "acc_norm": 0.8899082568807339, + "acc_norm_stderr": 0.013419939018681203 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.6111111111111112, + "acc_stderr": 0.033247089118091176, + "acc_norm": 0.6111111111111112, + "acc_norm_stderr": 0.033247089118091176 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.9166666666666666, + "acc_stderr": 0.019398452135813902, + "acc_norm": 0.9166666666666666, + "acc_norm_stderr": 0.019398452135813902 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.890295358649789, + "acc_stderr": 0.020343400734868834, + "acc_norm": 0.890295358649789, + "acc_norm_stderr": 0.020343400734868834 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.7802690582959642, + "acc_stderr": 0.027790177064383595, + "acc_norm": 0.7802690582959642, + "acc_norm_stderr": 0.027790177064383595 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.8549618320610687, + "acc_stderr": 0.030884661089515375, + "acc_norm": 0.8549618320610687, + "acc_norm_stderr": 0.030884661089515375 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.8760330578512396, + "acc_stderr": 0.030083098716035202, + "acc_norm": 0.8760330578512396, + "acc_norm_stderr": 0.030083098716035202 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.8333333333333334, + "acc_stderr": 0.03602814176392645, + "acc_norm": 0.8333333333333334, + "acc_norm_stderr": 0.03602814176392645 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.8098159509202454, + "acc_stderr": 0.03083349114628123, + "acc_norm": 0.8098159509202454, + "acc_norm_stderr": 0.03083349114628123 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.48214285714285715, + "acc_stderr": 0.047427623612430116, + "acc_norm": 0.48214285714285715, + "acc_norm_stderr": 0.047427623612430116 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.8349514563106796, + "acc_stderr": 0.03675668832233188, + "acc_norm": 0.8349514563106796, + "acc_norm_stderr": 0.03675668832233188 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.905982905982906, + "acc_stderr": 0.019119892798924974, + "acc_norm": 0.905982905982906, + "acc_norm_stderr": 0.019119892798924974 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.68, + "acc_stderr": 0.046882617226215034, + "acc_norm": 0.68, + "acc_norm_stderr": 0.046882617226215034 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.8748403575989783, + "acc_stderr": 0.011832954239305728, + "acc_norm": 0.8748403575989783, + "acc_norm_stderr": 0.011832954239305728 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.8005780346820809, + "acc_stderr": 0.021511900654252562, + "acc_norm": 0.8005780346820809, + "acc_norm_stderr": 0.021511900654252562 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.5597765363128492, + "acc_stderr": 0.01660256461504993, + "acc_norm": 0.5597765363128492, + "acc_norm_stderr": 0.01660256461504993 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.7647058823529411, + "acc_stderr": 0.02428861946604611, + "acc_norm": 0.7647058823529411, + "acc_norm_stderr": 0.02428861946604611 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.7781350482315113, + "acc_stderr": 0.02359885829286305, + "acc_norm": 0.7781350482315113, + "acc_norm_stderr": 0.02359885829286305 + }, 
+ "harness|hendrycksTest-prehistory|5": { + "acc": 0.8271604938271605, + "acc_stderr": 0.02103851777015738, + "acc_norm": 0.8271604938271605, + "acc_norm_stderr": 0.02103851777015738 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.574468085106383, + "acc_stderr": 0.02949482760014436, + "acc_norm": 0.574468085106383, + "acc_norm_stderr": 0.02949482760014436 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.5560625814863103, + "acc_stderr": 0.012689708167787679, + "acc_norm": 0.5560625814863103, + "acc_norm_stderr": 0.012689708167787679 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.7389705882352942, + "acc_stderr": 0.02667925227010313, + "acc_norm": 0.7389705882352942, + "acc_norm_stderr": 0.02667925227010313 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.7467320261437909, + "acc_stderr": 0.01759348689536683, + "acc_norm": 0.7467320261437909, + "acc_norm_stderr": 0.01759348689536683 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.7363636363636363, + "acc_stderr": 0.04220224692971987, + "acc_norm": 0.7363636363636363, + "acc_norm_stderr": 0.04220224692971987 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.8163265306122449, + "acc_stderr": 0.024789071332007636, + "acc_norm": 0.8163265306122449, + "acc_norm_stderr": 0.024789071332007636 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.8805970149253731, + "acc_stderr": 0.02292879327721974, + "acc_norm": 0.8805970149253731, + "acc_norm_stderr": 0.02292879327721974 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.91, + "acc_stderr": 0.028762349126466125, + "acc_norm": 0.91, + "acc_norm_stderr": 0.028762349126466125 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.5180722891566265, + "acc_stderr": 0.038899512528272166, + "acc_norm": 0.5180722891566265, + "acc_norm_stderr": 0.038899512528272166 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.8713450292397661, + "acc_stderr": 0.025679342723276915, + "acc_norm": 0.8713450292397661, + "acc_norm_stderr": 0.025679342723276915 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.41370869033047736, + "mc1_stderr": 0.0172408618120998, + "mc2": 0.5783691579924066, + "mc2_stderr": 0.015010896867718398 + }, + "all": { + "acc": 0.7019852800388521, + "acc_stderr": 0.03085135266489998, + "acc_norm": 0.7061736298500754, + "acc_norm_stderr": 0.030818868169887912, + "mc1": 0.41370869033047736, + "mc1_stderr": 0.0172408618120998, + "mc2": 0.5783691579924066, + "mc2_stderr": 0.015010896867718398 + } + }, + "versions": { + "harness|arc:challenge|25": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + 
"harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "all": 0 + }, + "config_tasks": { + "harness|arc:challenge": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + 
"harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task" + }, + "summary_tasks": { + "harness|arc:challenge|25": { + "hashes": { + "hash_examples": "17b0cae357c0259e", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "61571bf68d6d89aa", + "hash_cont_tokens": "ede2b335438f08e9" + }, + "truncated": 0, + "non-truncated": 4687, + "padded": 4687, + "non-padded": 0, + "effective_few_shots": 25.0, + "num_truncated_few_shots": 0 + }, + "harness|hellaswag|10": { + "hashes": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "29906669b1c7054a", + "hash_cont_tokens": "b41cf1ad182d68d5" + }, + "truncated": 0, + "non-truncated": 40168, + "padded": 40113, + "non-padded": 55, + "effective_few_shots": 10.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hashes": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "c54ff61ad0273dd7", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-anatomy|5": { + "hashes": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + 
"hash_input_tokens": "be31a1e22aef5f90", + "hash_cont_tokens": "f11971a765cb609f" + }, + "truncated": 0, + "non-truncated": 540, + "padded": 540, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-astronomy|5": { + "hashes": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "277a7b1fad566940", + "hash_cont_tokens": "238bd86950544b29" + }, + "truncated": 0, + "non-truncated": 608, + "padded": 608, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-business_ethics|5": { + "hashes": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "ba552605bc116de5", + "hash_cont_tokens": "f9d6d2a7d7e9a041" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hashes": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "428c7563d0b98ab9", + "hash_cont_tokens": "6af58623d0d5fbcd" + }, + "truncated": 0, + "non-truncated": 1060, + "padded": 1060, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_biology|5": { + "hashes": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "da036601573942e2", + "hash_cont_tokens": "875cde3af7a0ee14" + }, + "truncated": 0, + "non-truncated": 576, + "padded": 576, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_chemistry|5": { + "hashes": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "94e0196d6aded13d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_computer_science|5": { + "hashes": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "6e4d0f4a8d36690b", + "hash_cont_tokens": "1ba0c71186b1505e" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_mathematics|5": { + "hashes": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "614054d17109a25d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_medicine|5": { + "hashes": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "1d633b3cc0524ba8", + "hash_cont_tokens": "702fb6d82ff0d6ac" + }, + "truncated": 0, + "non-truncated": 692, + "padded": 692, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_physics|5": { + "hashes": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "5421d9a1af86cbd4", + "hash_cont_tokens": "f7b8097afc16a47c" + }, + "truncated": 0, + "non-truncated": 408, + "padded": 408, + 
"non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-computer_security|5": { + "hashes": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "5e6b70ecb333cf18", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hashes": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "c2ef11a87264ceed", + "hash_cont_tokens": "aa0e8bc655f2f641" + }, + "truncated": 0, + "non-truncated": 940, + "padded": 940, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-econometrics|5": { + "hashes": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "ecaccd912a4c3978", + "hash_cont_tokens": "a9b1f761089f6acc" + }, + "truncated": 0, + "non-truncated": 456, + "padded": 456, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hashes": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "1590c84291399be8", + "hash_cont_tokens": "2425a3f084a591ef" + }, + "truncated": 0, + "non-truncated": 580, + "padded": 580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hashes": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "3269597f715b0da1", + "hash_cont_tokens": "eb2d5002052b5bc5" + }, + "truncated": 0, + "non-truncated": 1512, + "padded": 1512, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-formal_logic|5": { + "hashes": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "a2800d20f3ab8d7c", + "hash_cont_tokens": "9b30dc19c9b62f60" + }, + "truncated": 0, + "non-truncated": 504, + "padded": 504, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-global_facts|5": { + "hashes": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "94ed44b3772505ad", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_biology|5": { + "hashes": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "24423acb928db768", + "hash_cont_tokens": "74217a4e2868536f" + }, + "truncated": 0, + "non-truncated": 1240, + "padded": 1240, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hashes": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "831ff35c474e5cef", + "hash_cont_tokens": "bf39544be0ebf000" + }, + "truncated": 0, + "non-truncated": 812, + "padded": 812, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + 
"hashes": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "8c34e0f2bda77358", + "hash_cont_tokens": "43570b3948564b64" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hashes": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "f1f73dd687da18d7", + "hash_cont_tokens": "674fc454bdc5ac93" + }, + "truncated": 660, + "non-truncated": 0, + "padded": 0, + "non-padded": 660, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_geography|5": { + "hashes": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "7c5547c7da5bc793", + "hash_cont_tokens": "03a5012b916274ea" + }, + "truncated": 0, + "non-truncated": 792, + "padded": 792, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hashes": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "f62991cb6a496b05", + "hash_cont_tokens": "50ab225c2f535210" + }, + "truncated": 0, + "non-truncated": 772, + "padded": 772, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hashes": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "4cef2aff6e3d59ed", + "hash_cont_tokens": "c583432ad27fcfe0" + }, + "truncated": 0, + "non-truncated": 1560, + "padded": 1560, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hashes": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "6e2577ea4082ed2b", + "hash_cont_tokens": "1194078d4e38c984" + }, + "truncated": 0, + "non-truncated": 1080, + "padded": 1080, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hashes": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "c5fc9aeb1079c8e4", + "hash_cont_tokens": "f47f041de50333b9" + }, + "truncated": 0, + "non-truncated": 952, + "padded": 952, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_physics|5": { + "hashes": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "555fc385cffa84ca", + "hash_cont_tokens": "6296151cf7fee15c" + }, + "truncated": 0, + "non-truncated": 604, + "padded": 604, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hashes": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "febd23cbf9973b7f", + "hash_cont_tokens": "a490d3db0ea5935a" + }, + "truncated": 0, + "non-truncated": 2180, + "padded": 2180, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hashes": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": 
"4c5c8be5aafac432", + "hash_input_tokens": "424b02981230ee83", + "hash_cont_tokens": "6830ef7d0325d7ef" + }, + "truncated": 0, + "non-truncated": 864, + "padded": 864, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hashes": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "50c9ff438c85a69e", + "hash_cont_tokens": "cdd0b3dc06d933e5" + }, + "truncated": 816, + "non-truncated": 0, + "padded": 0, + "non-padded": 816, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hashes": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "054824cc474caef5", + "hash_cont_tokens": "e0203e3fc1bb0500" + }, + "truncated": 8, + "non-truncated": 940, + "padded": 940, + "non-padded": 8, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_aging|5": { + "hashes": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "541a75f071dcf579", + "hash_cont_tokens": "142a4a8a1138a214" + }, + "truncated": 0, + "non-truncated": 892, + "padded": 892, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_sexuality|5": { + "hashes": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "04269e5c5a257dd9", + "hash_cont_tokens": "bc54813e809b796d" + }, + "truncated": 0, + "non-truncated": 524, + "padded": 524, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-international_law|5": { + "hashes": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "d93ba9d9d38e4397", + "hash_cont_tokens": "63435df622d5437b" + }, + "truncated": 0, + "non-truncated": 484, + "padded": 484, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-jurisprudence|5": { + "hashes": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "9eeaccd2698b4f5a", + "hash_cont_tokens": "e3a8cd951b6e3469" + }, + "truncated": 0, + "non-truncated": 432, + "padded": 432, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hashes": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "b4f08f544f2b7576", + "hash_cont_tokens": "5e6ee2ff0404f23c" + }, + "truncated": 0, + "non-truncated": 652, + "padded": 648, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-machine_learning|5": { + "hashes": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "900c2a51f1174b9f", + "hash_cont_tokens": "c81919424db3b267" + }, + "truncated": 0, + "non-truncated": 448, + "padded": 448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-management|5": { + "hashes": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "6b36efb4689c6eca", + "hash_cont_tokens": "a01d6d39a83c4597" + }, + "truncated": 0, + "non-truncated": 412, + 
"padded": 412, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-marketing|5": { + "hashes": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "2aaac78a0cfed47a", + "hash_cont_tokens": "6aeaed4d823c98aa" + }, + "truncated": 0, + "non-truncated": 936, + "padded": 936, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-medical_genetics|5": { + "hashes": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "886ca823b41c094a", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-miscellaneous|5": { + "hashes": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "72fd71de7675e7d0", + "hash_cont_tokens": "9b0ab02a64603081" + }, + "truncated": 0, + "non-truncated": 3132, + "padded": 3132, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_disputes|5": { + "hashes": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "f3ca0dd8e7a1eb09", + "hash_cont_tokens": "3b8bbe9108e55ce9" + }, + "truncated": 0, + "non-truncated": 1384, + "padded": 1354, + "non-padded": 30, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hashes": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "3e793631e951f23c", + "hash_cont_tokens": "2eae753a177d5460" + }, + "truncated": 0, + "non-truncated": 3580, + "padded": 3580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-nutrition|5": { + "hashes": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "59753c2144ea93af", + "hash_cont_tokens": "29771089bd3c65c6" + }, + "truncated": 0, + "non-truncated": 1224, + "padded": 1224, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-philosophy|5": { + "hashes": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "bd8d3dbed15a8c34", + "hash_cont_tokens": "9f6ff69d23a48783" + }, + "truncated": 0, + "non-truncated": 1244, + "padded": 1244, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-prehistory|5": { + "hashes": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "3573cd87facbb7c5", + "hash_cont_tokens": "a789a13af22308bf" + }, + "truncated": 0, + "non-truncated": 1296, + "padded": 1296, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_accounting|5": { + "hashes": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "17e721bc1a7cbb47", + "hash_cont_tokens": "5129a9cfb30c5239" + }, + "truncated": 0, + "non-truncated": 1128, + "padded": 1128, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_law|5": { + "hashes": { + 
"hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "9178e10bd0763ec4", + "hash_cont_tokens": "2e590029ef41fbcd" + }, + "truncated": 604, + "non-truncated": 5532, + "padded": 5524, + "non-padded": 612, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_medicine|5": { + "hashes": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "f5a22012a54f70ea", + "hash_cont_tokens": "cd82e108370cece8" + }, + "truncated": 0, + "non-truncated": 1088, + "padded": 1088, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_psychology|5": { + "hashes": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "0dfb73a8eb3f692c", + "hash_cont_tokens": "61ef0c8a87f9c92d" + }, + "truncated": 0, + "non-truncated": 2448, + "padded": 2448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-public_relations|5": { + "hashes": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "1710c6ba4c9f3cbd", + "hash_cont_tokens": "568f585a259965c1" + }, + "truncated": 0, + "non-truncated": 440, + "padded": 440, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-security_studies|5": { + "hashes": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "d49711415961ced7", + "hash_cont_tokens": "d70cfe096d4fb7bd" + }, + "truncated": 0, + "non-truncated": 980, + "padded": 980, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-sociology|5": { + "hashes": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "828999f7624cbe7e", + "hash_cont_tokens": "c3a3bdfd177eed5b" + }, + "truncated": 0, + "non-truncated": 804, + "padded": 804, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hashes": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "42054621e718dbee", + "hash_cont_tokens": "2568d0e8e36fa959" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-virology|5": { + "hashes": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "6c4f0aa4dc859c04", + "hash_cont_tokens": "c178cccd753d9bc5" + }, + "truncated": 0, + "non-truncated": 664, + "padded": 664, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-world_religions|5": { + "hashes": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "6c75d44e092ff24f", + "hash_cont_tokens": "0a3a3ea5ef49d19c" + }, + "truncated": 0, + "non-truncated": 684, + "padded": 684, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|truthfulqa:mc|0": { + "hashes": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "2738d7ed7075faa7", + "hash_cont_tokens": "6d1691881e252df0" + }, + 
"truncated": 0, + "non-truncated": 9996, + "padded": 9996, + "non-padded": 0, + "effective_few_shots": 0.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "d84d18e9a963753d", + "hash_full_prompts": "12b540783521a8e6", + "hash_input_tokens": "6fecf578c508db6a", + "hash_cont_tokens": "f4b7b7f3a2788768" + }, + "total_evaluation_time_secondes": "31629.506284475327", + "truncated": 2088, + "non-truncated": 108931, + "padded": 108834, + "non-padded": 2185, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/migtissera/Synthia-70B-v1.1/results_2023-09-23T19-08-11.059191.json b/eval-results/migtissera/Synthia-70B-v1.1/results_2023-09-23T19-08-11.059191.json new file mode 100644 index 0000000000000000000000000000000000000000..e7fd48afa71ef8d4abdee90447073ba36578b3e1 --- /dev/null +++ b/eval-results/migtissera/Synthia-70B-v1.1/results_2023-09-23T19-08-11.059191.json @@ -0,0 +1,107 @@ +{ + "config_general": { + "model_name": "migtissera/Synthia-70B-v1.1", + "model_sha": "c87658a2bb2e7aadc8ec6b57be17a6a5e9a407c7", + "model_size": "128.56 GB", + "model_dtype": "torch.float16", + "lighteval_sha": "0f318ecf002208468154899217b3ba7c6ae09374", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|drop|3": { + "em": 0.33326342281879195, + "em_stderr": 0.004827370333271099, + "f1": 0.39018036912751786, + "f1_stderr": 0.004711418943333287 + }, + "harness|gsm8k|5": { + "acc": 0.31842304776345715, + "acc_stderr": 0.012832225723075403 + }, + "harness|winogrande|5": { + "acc": 0.8366219415943172, + "acc_stderr": 0.010390695970273763 + }, + "all": { + "em": 0.33326342281879195, + "em_stderr": 0.004827370333271099, + "f1": 0.39018036912751786, + "f1_stderr": 0.004711418943333287, + "acc": 0.5775224946788872, + "acc_stderr": 0.011611460846674582 + } + }, + "versions": { + "harness|drop|3": 1, + "harness|gsm8k|5": 0, + "harness|winogrande|5": 0, + "all": 0 + }, + "config_tasks": { + "harness|drop": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|drop|3": { + "hashes": { + "hash_examples": "1d27416e8324e9a3", + "hash_full_prompts": "a5513ff9a741b385", + "hash_input_tokens": "61b608e0b5ceed76", + "hash_cont_tokens": "07a67164336d963c" + }, + "truncated": 1263, + "non-truncated": 8273, + "padded": 0, + "non-padded": 9536, + "effective_few_shots": 3.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "bda342e47b5099b2", + "hash_cont_tokens": "9d642d94ee96a257" + }, + "truncated": 0, + "non-truncated": 1319, + "padded": 0, + "non-padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "c0bedf98cb040854", + "hash_cont_tokens": "f08975ad6f2d5864" + }, + "truncated": 0, + "non-truncated": 2534, + "padded": 2432, + "non-padded": 102, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "9b4d8993161e637d", + "hash_full_prompts": "08215e527b7e60a5", + "hash_input_tokens": "80afe720f936f8d2", + "hash_cont_tokens": "f678a1d72d31a314" + }, + "total_evaluation_time_secondes": "38009.54581594467", 
+ "truncated": 1263, + "non-truncated": 12126, + "padded": 2432, + "non-padded": 10957, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/migtissera/Synthia-70B-v1.2/results_2023-09-02T17-59-05.420313.json b/eval-results/migtissera/Synthia-70B-v1.2/results_2023-09-02T17-59-05.420313.json new file mode 100644 index 0000000000000000000000000000000000000000..602a779ff03a23a5802252c745455103bec17693 --- /dev/null +++ b/eval-results/migtissera/Synthia-70B-v1.2/results_2023-09-02T17-59-05.420313.json @@ -0,0 +1,1366 @@ +{ + "config_general": { + "model_name": "migtissera/Synthia-70B-v1.2", + "model_sha": "9b92ee1093b125035ba1649dca6f4ceb9d86a656", + "model_dtype": "torch.float16", + "lighteval_sha": "4b9a38c2102259daeb17d1d0294fc2b4ce7f4e63", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|arc:challenge|25": { + "acc": 0.6578498293515358, + "acc_stderr": 0.013864152159177278, + "acc_norm": 0.7047781569965871, + "acc_norm_stderr": 0.01332975029338232 + }, + "harness|hellaswag|10": { + "acc": 0.6822346146186019, + "acc_stderr": 0.004646561453031608, + "acc_norm": 0.8698466440948018, + "acc_norm_stderr": 0.0033578442491239554 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.32, + "acc_stderr": 0.046882617226215034, + "acc_norm": 0.32, + "acc_norm_stderr": 0.046882617226215034 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.6444444444444445, + "acc_stderr": 0.04135176749720385, + "acc_norm": 0.6444444444444445, + "acc_norm_stderr": 0.04135176749720385 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.8486842105263158, + "acc_stderr": 0.02916263159684399, + "acc_norm": 0.8486842105263158, + "acc_norm_stderr": 0.02916263159684399 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.77, + "acc_stderr": 0.04229525846816505, + "acc_norm": 0.77, + "acc_norm_stderr": 0.04229525846816505 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.7169811320754716, + "acc_stderr": 0.027724236492700918, + "acc_norm": 0.7169811320754716, + "acc_norm_stderr": 0.027724236492700918 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.8125, + "acc_stderr": 0.032639560491693344, + "acc_norm": 0.8125, + "acc_norm_stderr": 0.032639560491693344 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.53, + "acc_stderr": 0.050161355804659205, + "acc_norm": 0.53, + "acc_norm_stderr": 0.050161355804659205 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.58, + "acc_stderr": 0.049604496374885836, + "acc_norm": 0.58, + "acc_norm_stderr": 0.049604496374885836 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.4, + "acc_stderr": 0.049236596391733084, + "acc_norm": 0.4, + "acc_norm_stderr": 0.049236596391733084 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.6820809248554913, + "acc_stderr": 0.035506839891655796, + "acc_norm": 0.6820809248554913, + "acc_norm_stderr": 0.035506839891655796 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.4019607843137255, + "acc_stderr": 0.04878608714466996, + "acc_norm": 0.4019607843137255, + "acc_norm_stderr": 0.04878608714466996 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.75, + "acc_stderr": 0.04351941398892446, + "acc_norm": 0.75, + "acc_norm_stderr": 0.04351941398892446 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.6808510638297872, + "acc_stderr": 0.030472973363380045, + 
"acc_norm": 0.6808510638297872, + "acc_norm_stderr": 0.030472973363380045 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.4298245614035088, + "acc_stderr": 0.04657047260594963, + "acc_norm": 0.4298245614035088, + "acc_norm_stderr": 0.04657047260594963 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.6482758620689655, + "acc_stderr": 0.0397923663749741, + "acc_norm": 0.6482758620689655, + "acc_norm_stderr": 0.0397923663749741 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.4444444444444444, + "acc_stderr": 0.02559185776138218, + "acc_norm": 0.4444444444444444, + "acc_norm_stderr": 0.02559185776138218 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.48412698412698413, + "acc_stderr": 0.04469881854072606, + "acc_norm": 0.48412698412698413, + "acc_norm_stderr": 0.04469881854072606 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.46, + "acc_stderr": 0.05009082659620332, + "acc_norm": 0.46, + "acc_norm_stderr": 0.05009082659620332 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.8129032258064516, + "acc_stderr": 0.022185710092252252, + "acc_norm": 0.8129032258064516, + "acc_norm_stderr": 0.022185710092252252 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.5369458128078818, + "acc_stderr": 0.035083705204426656, + "acc_norm": 0.5369458128078818, + "acc_norm_stderr": 0.035083705204426656 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.81, + "acc_stderr": 0.039427724440366234, + "acc_norm": 0.81, + "acc_norm_stderr": 0.039427724440366234 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.8242424242424242, + "acc_stderr": 0.02972094300622445, + "acc_norm": 0.8242424242424242, + "acc_norm_stderr": 0.02972094300622445 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.8888888888888888, + "acc_stderr": 0.022390787638216773, + "acc_norm": 0.8888888888888888, + "acc_norm_stderr": 0.022390787638216773 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.9378238341968912, + "acc_stderr": 0.017426974154240528, + "acc_norm": 0.9378238341968912, + "acc_norm_stderr": 0.017426974154240528 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.717948717948718, + "acc_stderr": 0.0228158130988966, + "acc_norm": 0.717948717948718, + "acc_norm_stderr": 0.0228158130988966 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.3148148148148148, + "acc_stderr": 0.02831753349606648, + "acc_norm": 0.3148148148148148, + "acc_norm_stderr": 0.02831753349606648 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.7436974789915967, + "acc_stderr": 0.02835962087053395, + "acc_norm": 0.7436974789915967, + "acc_norm_stderr": 0.02835962087053395 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.4768211920529801, + "acc_stderr": 0.04078093859163083, + "acc_norm": 0.4768211920529801, + "acc_norm_stderr": 0.04078093859163083 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.8899082568807339, + "acc_stderr": 0.013419939018681203, + "acc_norm": 0.8899082568807339, + "acc_norm_stderr": 0.013419939018681203 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.5972222222222222, + "acc_stderr": 0.03344887382997866, + "acc_norm": 0.5972222222222222, + "acc_norm_stderr": 0.03344887382997866 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.9166666666666666, + "acc_stderr": 0.019398452135813902, + 
"acc_norm": 0.9166666666666666, + "acc_norm_stderr": 0.019398452135813902 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.890295358649789, + "acc_stderr": 0.020343400734868834, + "acc_norm": 0.890295358649789, + "acc_norm_stderr": 0.020343400734868834 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.8116591928251121, + "acc_stderr": 0.026241132996407256, + "acc_norm": 0.8116591928251121, + "acc_norm_stderr": 0.026241132996407256 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.8396946564885496, + "acc_stderr": 0.032178294207446306, + "acc_norm": 0.8396946564885496, + "acc_norm_stderr": 0.032178294207446306 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.859504132231405, + "acc_stderr": 0.031722334260021585, + "acc_norm": 0.859504132231405, + "acc_norm_stderr": 0.031722334260021585 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.8333333333333334, + "acc_stderr": 0.03602814176392645, + "acc_norm": 0.8333333333333334, + "acc_norm_stderr": 0.03602814176392645 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.803680981595092, + "acc_stderr": 0.031207970394709218, + "acc_norm": 0.803680981595092, + "acc_norm_stderr": 0.031207970394709218 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.5089285714285714, + "acc_stderr": 0.04745033255489122, + "acc_norm": 0.5089285714285714, + "acc_norm_stderr": 0.04745033255489122 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.8252427184466019, + "acc_stderr": 0.0376017800602662, + "acc_norm": 0.8252427184466019, + "acc_norm_stderr": 0.0376017800602662 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.9017094017094017, + "acc_stderr": 0.019503444900757567, + "acc_norm": 0.9017094017094017, + "acc_norm_stderr": 0.019503444900757567 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.69, + "acc_stderr": 0.04648231987117316, + "acc_norm": 0.69, + "acc_norm_stderr": 0.04648231987117316 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.8773946360153256, + "acc_stderr": 0.011728672144131565, + "acc_norm": 0.8773946360153256, + "acc_norm_stderr": 0.011728672144131565 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.7890173410404624, + "acc_stderr": 0.02196630994704311, + "acc_norm": 0.7890173410404624, + "acc_norm_stderr": 0.02196630994704311 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.5452513966480447, + "acc_stderr": 0.016653875777524, + "acc_norm": 0.5452513966480447, + "acc_norm_stderr": 0.016653875777524 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.7679738562091504, + "acc_stderr": 0.024170840879340873, + "acc_norm": 0.7679738562091504, + "acc_norm_stderr": 0.024170840879340873 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.77491961414791, + "acc_stderr": 0.023720088516179027, + "acc_norm": 0.77491961414791, + "acc_norm_stderr": 0.023720088516179027 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.8302469135802469, + "acc_stderr": 0.020888690414093865, + "acc_norm": 0.8302469135802469, + "acc_norm_stderr": 0.020888690414093865 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.574468085106383, + "acc_stderr": 0.02949482760014436, + "acc_norm": 0.574468085106383, + "acc_norm_stderr": 0.02949482760014436 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.5514993481095176, + "acc_stderr": 0.012702317490559821, + "acc_norm": 0.5514993481095176, + "acc_norm_stderr": 0.012702317490559821 + }, + 
"harness|hendrycksTest-professional_medicine|5": { + "acc": 0.7389705882352942, + "acc_stderr": 0.026679252270103128, + "acc_norm": 0.7389705882352942, + "acc_norm_stderr": 0.026679252270103128 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.7532679738562091, + "acc_stderr": 0.017440820367402503, + "acc_norm": 0.7532679738562091, + "acc_norm_stderr": 0.017440820367402503 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.7363636363636363, + "acc_stderr": 0.04220224692971987, + "acc_norm": 0.7363636363636363, + "acc_norm_stderr": 0.04220224692971987 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.8081632653061225, + "acc_stderr": 0.02520696315422538, + "acc_norm": 0.8081632653061225, + "acc_norm_stderr": 0.02520696315422538 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.8955223880597015, + "acc_stderr": 0.021628920516700643, + "acc_norm": 0.8955223880597015, + "acc_norm_stderr": 0.021628920516700643 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.89, + "acc_stderr": 0.03144660377352203, + "acc_norm": 0.89, + "acc_norm_stderr": 0.03144660377352203 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.536144578313253, + "acc_stderr": 0.03882310850890594, + "acc_norm": 0.536144578313253, + "acc_norm_stderr": 0.03882310850890594 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.8713450292397661, + "acc_stderr": 0.025679342723276915, + "acc_norm": 0.8713450292397661, + "acc_norm_stderr": 0.025679342723276915 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.4222766217870257, + "mc1_stderr": 0.017290733254248174, + "mc2": 0.5863515695677809, + "mc2_stderr": 0.015002713147024338 + }, + "all": { + "acc": 0.700283718449465, + "acc_stderr": 0.030924880314556678, + "acc_norm": 0.7042589787396556, + "acc_norm_stderr": 0.030893979991341382, + "mc1": 0.4222766217870257, + "mc1_stderr": 0.017290733254248174, + "mc2": 0.5863515695677809, + "mc2_stderr": 0.015002713147024338 + } + }, + "versions": { + "harness|arc:challenge|25": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + 
"harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "all": 0 + }, + "config_tasks": { + "harness|arc:challenge": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + 
"harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task" + }, + "summary_tasks": { + "harness|arc:challenge|25": { + "hashes": { + "hash_examples": "17b0cae357c0259e", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "61571bf68d6d89aa", + "hash_cont_tokens": "ede2b335438f08e9" + }, + "truncated": 0, + "non-truncated": 4687, + "padded": 4687, + "non-padded": 0, + "effective_few_shots": 25.0, + "num_truncated_few_shots": 0 + }, + "harness|hellaswag|10": { + "hashes": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "29906669b1c7054a", + "hash_cont_tokens": "b41cf1ad182d68d5" + }, + "truncated": 0, + "non-truncated": 40168, + "padded": 40113, + "non-padded": 55, + "effective_few_shots": 10.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hashes": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "c54ff61ad0273dd7", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-anatomy|5": { + "hashes": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "be31a1e22aef5f90", + "hash_cont_tokens": "f11971a765cb609f" + }, + "truncated": 0, + "non-truncated": 540, + "padded": 540, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-astronomy|5": { + "hashes": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "277a7b1fad566940", + "hash_cont_tokens": "238bd86950544b29" + }, + "truncated": 0, + "non-truncated": 608, + "padded": 608, + "non-padded": 0, + "effective_few_shots": 5.0, + 
"num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-business_ethics|5": { + "hashes": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "ba552605bc116de5", + "hash_cont_tokens": "f9d6d2a7d7e9a041" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hashes": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "428c7563d0b98ab9", + "hash_cont_tokens": "6af58623d0d5fbcd" + }, + "truncated": 0, + "non-truncated": 1060, + "padded": 1060, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_biology|5": { + "hashes": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "da036601573942e2", + "hash_cont_tokens": "875cde3af7a0ee14" + }, + "truncated": 0, + "non-truncated": 576, + "padded": 576, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_chemistry|5": { + "hashes": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "94e0196d6aded13d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_computer_science|5": { + "hashes": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "6e4d0f4a8d36690b", + "hash_cont_tokens": "1ba0c71186b1505e" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_mathematics|5": { + "hashes": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "614054d17109a25d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_medicine|5": { + "hashes": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "1d633b3cc0524ba8", + "hash_cont_tokens": "702fb6d82ff0d6ac" + }, + "truncated": 0, + "non-truncated": 692, + "padded": 692, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_physics|5": { + "hashes": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "5421d9a1af86cbd4", + "hash_cont_tokens": "f7b8097afc16a47c" + }, + "truncated": 0, + "non-truncated": 408, + "padded": 408, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-computer_security|5": { + "hashes": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "5e6b70ecb333cf18", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hashes": { + "hash_examples": "8874ece872d2ca4c", + 
"hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "c2ef11a87264ceed", + "hash_cont_tokens": "aa0e8bc655f2f641" + }, + "truncated": 0, + "non-truncated": 940, + "padded": 940, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-econometrics|5": { + "hashes": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "ecaccd912a4c3978", + "hash_cont_tokens": "a9b1f761089f6acc" + }, + "truncated": 0, + "non-truncated": 456, + "padded": 456, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hashes": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "1590c84291399be8", + "hash_cont_tokens": "2425a3f084a591ef" + }, + "truncated": 0, + "non-truncated": 580, + "padded": 580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hashes": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "3269597f715b0da1", + "hash_cont_tokens": "eb2d5002052b5bc5" + }, + "truncated": 0, + "non-truncated": 1512, + "padded": 1512, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-formal_logic|5": { + "hashes": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "a2800d20f3ab8d7c", + "hash_cont_tokens": "9b30dc19c9b62f60" + }, + "truncated": 0, + "non-truncated": 504, + "padded": 504, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-global_facts|5": { + "hashes": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "94ed44b3772505ad", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_biology|5": { + "hashes": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "24423acb928db768", + "hash_cont_tokens": "74217a4e2868536f" + }, + "truncated": 0, + "non-truncated": 1240, + "padded": 1240, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hashes": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "831ff35c474e5cef", + "hash_cont_tokens": "bf39544be0ebf000" + }, + "truncated": 0, + "non-truncated": 812, + "padded": 812, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hashes": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "8c34e0f2bda77358", + "hash_cont_tokens": "43570b3948564b64" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hashes": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "f1f73dd687da18d7", + "hash_cont_tokens": "674fc454bdc5ac93" 
+ }, + "truncated": 660, + "non-truncated": 0, + "padded": 0, + "non-padded": 660, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_geography|5": { + "hashes": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "7c5547c7da5bc793", + "hash_cont_tokens": "03a5012b916274ea" + }, + "truncated": 0, + "non-truncated": 792, + "padded": 792, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hashes": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "f62991cb6a496b05", + "hash_cont_tokens": "50ab225c2f535210" + }, + "truncated": 0, + "non-truncated": 772, + "padded": 772, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hashes": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "4cef2aff6e3d59ed", + "hash_cont_tokens": "c583432ad27fcfe0" + }, + "truncated": 0, + "non-truncated": 1560, + "padded": 1560, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hashes": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "6e2577ea4082ed2b", + "hash_cont_tokens": "1194078d4e38c984" + }, + "truncated": 0, + "non-truncated": 1080, + "padded": 1080, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hashes": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "c5fc9aeb1079c8e4", + "hash_cont_tokens": "f47f041de50333b9" + }, + "truncated": 0, + "non-truncated": 952, + "padded": 952, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_physics|5": { + "hashes": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "555fc385cffa84ca", + "hash_cont_tokens": "6296151cf7fee15c" + }, + "truncated": 0, + "non-truncated": 604, + "padded": 604, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hashes": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "febd23cbf9973b7f", + "hash_cont_tokens": "a490d3db0ea5935a" + }, + "truncated": 0, + "non-truncated": 2180, + "padded": 2180, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hashes": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "424b02981230ee83", + "hash_cont_tokens": "6830ef7d0325d7ef" + }, + "truncated": 0, + "non-truncated": 864, + "padded": 864, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hashes": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "50c9ff438c85a69e", + "hash_cont_tokens": "cdd0b3dc06d933e5" + }, + "truncated": 816, + "non-truncated": 0, + "padded": 0, + "non-padded": 816, 
+ "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hashes": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "054824cc474caef5", + "hash_cont_tokens": "e0203e3fc1bb0500" + }, + "truncated": 8, + "non-truncated": 940, + "padded": 940, + "non-padded": 8, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_aging|5": { + "hashes": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "541a75f071dcf579", + "hash_cont_tokens": "142a4a8a1138a214" + }, + "truncated": 0, + "non-truncated": 892, + "padded": 892, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_sexuality|5": { + "hashes": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "04269e5c5a257dd9", + "hash_cont_tokens": "bc54813e809b796d" + }, + "truncated": 0, + "non-truncated": 524, + "padded": 524, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-international_law|5": { + "hashes": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "d93ba9d9d38e4397", + "hash_cont_tokens": "63435df622d5437b" + }, + "truncated": 0, + "non-truncated": 484, + "padded": 484, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-jurisprudence|5": { + "hashes": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "9eeaccd2698b4f5a", + "hash_cont_tokens": "e3a8cd951b6e3469" + }, + "truncated": 0, + "non-truncated": 432, + "padded": 432, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hashes": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "b4f08f544f2b7576", + "hash_cont_tokens": "5e6ee2ff0404f23c" + }, + "truncated": 0, + "non-truncated": 652, + "padded": 648, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-machine_learning|5": { + "hashes": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "900c2a51f1174b9f", + "hash_cont_tokens": "c81919424db3b267" + }, + "truncated": 0, + "non-truncated": 448, + "padded": 448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-management|5": { + "hashes": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "6b36efb4689c6eca", + "hash_cont_tokens": "a01d6d39a83c4597" + }, + "truncated": 0, + "non-truncated": 412, + "padded": 412, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-marketing|5": { + "hashes": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "2aaac78a0cfed47a", + "hash_cont_tokens": "6aeaed4d823c98aa" + }, + "truncated": 0, + "non-truncated": 936, + "padded": 936, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-medical_genetics|5": { + "hashes": { + "hash_examples": "9c2dda34a2ea4fd2", + 
"hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "886ca823b41c094a", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-miscellaneous|5": { + "hashes": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "72fd71de7675e7d0", + "hash_cont_tokens": "9b0ab02a64603081" + }, + "truncated": 0, + "non-truncated": 3132, + "padded": 3132, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_disputes|5": { + "hashes": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "f3ca0dd8e7a1eb09", + "hash_cont_tokens": "3b8bbe9108e55ce9" + }, + "truncated": 0, + "non-truncated": 1384, + "padded": 1354, + "non-padded": 30, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hashes": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "3e793631e951f23c", + "hash_cont_tokens": "2eae753a177d5460" + }, + "truncated": 0, + "non-truncated": 3580, + "padded": 3580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-nutrition|5": { + "hashes": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "59753c2144ea93af", + "hash_cont_tokens": "29771089bd3c65c6" + }, + "truncated": 0, + "non-truncated": 1224, + "padded": 1224, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-philosophy|5": { + "hashes": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "bd8d3dbed15a8c34", + "hash_cont_tokens": "9f6ff69d23a48783" + }, + "truncated": 0, + "non-truncated": 1244, + "padded": 1244, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-prehistory|5": { + "hashes": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "3573cd87facbb7c5", + "hash_cont_tokens": "a789a13af22308bf" + }, + "truncated": 0, + "non-truncated": 1296, + "padded": 1296, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_accounting|5": { + "hashes": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "17e721bc1a7cbb47", + "hash_cont_tokens": "5129a9cfb30c5239" + }, + "truncated": 0, + "non-truncated": 1128, + "padded": 1128, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_law|5": { + "hashes": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "9178e10bd0763ec4", + "hash_cont_tokens": "2e590029ef41fbcd" + }, + "truncated": 604, + "non-truncated": 5532, + "padded": 5524, + "non-padded": 612, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_medicine|5": { + "hashes": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "f5a22012a54f70ea", + "hash_cont_tokens": "cd82e108370cece8" + }, + "truncated": 0, + 
"non-truncated": 1088, + "padded": 1088, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_psychology|5": { + "hashes": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "0dfb73a8eb3f692c", + "hash_cont_tokens": "61ef0c8a87f9c92d" + }, + "truncated": 0, + "non-truncated": 2448, + "padded": 2448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-public_relations|5": { + "hashes": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "1710c6ba4c9f3cbd", + "hash_cont_tokens": "568f585a259965c1" + }, + "truncated": 0, + "non-truncated": 440, + "padded": 440, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-security_studies|5": { + "hashes": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "d49711415961ced7", + "hash_cont_tokens": "d70cfe096d4fb7bd" + }, + "truncated": 0, + "non-truncated": 980, + "padded": 980, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-sociology|5": { + "hashes": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "828999f7624cbe7e", + "hash_cont_tokens": "c3a3bdfd177eed5b" + }, + "truncated": 0, + "non-truncated": 804, + "padded": 804, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hashes": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "42054621e718dbee", + "hash_cont_tokens": "2568d0e8e36fa959" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-virology|5": { + "hashes": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "6c4f0aa4dc859c04", + "hash_cont_tokens": "c178cccd753d9bc5" + }, + "truncated": 0, + "non-truncated": 664, + "padded": 664, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-world_religions|5": { + "hashes": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "6c75d44e092ff24f", + "hash_cont_tokens": "0a3a3ea5ef49d19c" + }, + "truncated": 0, + "non-truncated": 684, + "padded": 684, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|truthfulqa:mc|0": { + "hashes": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "2738d7ed7075faa7", + "hash_cont_tokens": "6d1691881e252df0" + }, + "truncated": 0, + "non-truncated": 9996, + "padded": 9996, + "non-padded": 0, + "effective_few_shots": 0.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "d84d18e9a963753d", + "hash_full_prompts": "12b540783521a8e6", + "hash_input_tokens": "6fecf578c508db6a", + "hash_cont_tokens": "f4b7b7f3a2788768" + }, + "total_evaluation_time_secondes": "26508.604179620743", + "truncated": 2088, + "non-truncated": 108931, + "padded": 108834, + "non-padded": 2185, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff 
--git a/eval-results/migtissera/Synthia-70B-v1.2/results_2023-10-17T14-51-20.480254.json b/eval-results/migtissera/Synthia-70B-v1.2/results_2023-10-17T14-51-20.480254.json new file mode 100644 index 0000000000000000000000000000000000000000..5cc04fc848f836b89e959901a7c6994135c4682f --- /dev/null +++ b/eval-results/migtissera/Synthia-70B-v1.2/results_2023-10-17T14-51-20.480254.json @@ -0,0 +1,107 @@ +{ + "config_general": { + "model_name": "migtissera/Synthia-70B-v1.2", + "model_sha": "9877da4fec22ee28ba99bb5e48f8dc4a3bce01e5", + "model_size": "128.56 GB", + "model_dtype": "torch.float16", + "lighteval_sha": "0f318ecf002208468154899217b3ba7c6ae09374", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|drop|3": { + "em": 0.364618288590604, + "em_stderr": 0.004929197624393639, + "f1": 0.42417365771812215, + "f1_stderr": 0.004776577842624861 + }, + "harness|gsm8k|5": { + "acc": 0.3191811978771797, + "acc_stderr": 0.012840345676251651 + }, + "harness|winogrande|5": { + "acc": 0.8326756116811366, + "acc_stderr": 0.010490608806828079 + }, + "all": { + "em": 0.364618288590604, + "em_stderr": 0.004929197624393639, + "f1": 0.42417365771812215, + "f1_stderr": 0.004776577842624861, + "acc": 0.5759284047791582, + "acc_stderr": 0.011665477241539865 + } + }, + "versions": { + "harness|drop|3": 1, + "harness|gsm8k|5": 0, + "harness|winogrande|5": 0, + "all": 0 + }, + "config_tasks": { + "harness|drop": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|drop|3": { + "hashes": { + "hash_examples": "1d27416e8324e9a3", + "hash_full_prompts": "a5513ff9a741b385", + "hash_input_tokens": "61b608e0b5ceed76", + "hash_cont_tokens": "97db72e8a33950e6" + }, + "truncated": 1263, + "non-truncated": 8273, + "padded": 0, + "non-padded": 9536, + "effective_few_shots": 3.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "bda342e47b5099b2", + "hash_cont_tokens": "6be96637921bb09e" + }, + "truncated": 0, + "non-truncated": 1319, + "padded": 0, + "non-padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "c0bedf98cb040854", + "hash_cont_tokens": "f08975ad6f2d5864" + }, + "truncated": 0, + "non-truncated": 2534, + "padded": 2432, + "non-padded": 102, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "9b4d8993161e637d", + "hash_full_prompts": "08215e527b7e60a5", + "hash_input_tokens": "80afe720f936f8d2", + "hash_cont_tokens": "7374798b278a9ea2" + }, + "total_evaluation_time_secondes": "36170.71736717224", + "truncated": 1263, + "non-truncated": 12126, + "padded": 2432, + "non-padded": 10957, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/migtissera/Synthia-70B-v1.2b/results_2023-09-13T14-25-34.731307.json b/eval-results/migtissera/Synthia-70B-v1.2b/results_2023-09-13T14-25-34.731307.json new file mode 100644 index 0000000000000000000000000000000000000000..daa4a7e4ba76fcba5ef164aa79a980c2d85c1bb3 --- /dev/null +++ b/eval-results/migtissera/Synthia-70B-v1.2b/results_2023-09-13T14-25-34.731307.json @@ -0,0 +1,1367 @@ +{ + 
"config_general": { + "model_name": "migtissera/Synthia-70B-v1.2b", + "model_sha": "7b687d6e4101b8bb8cc4062f8a318d639098a55d", + "model_size": "128.64 GB", + "model_dtype": "torch.float16", + "lighteval_sha": "ff467795ccc45b291b69333c263d5f16abd1fcd9", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|arc:challenge|25": { + "acc": 0.6501706484641638, + "acc_stderr": 0.013936809212158289, + "acc_norm": 0.6877133105802048, + "acc_norm_stderr": 0.013542598541688067 + }, + "harness|hellaswag|10": { + "acc": 0.6886078470424218, + "acc_stderr": 0.004621163476949207, + "acc_norm": 0.8757219677355108, + "acc_norm_stderr": 0.0032922425436373404 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.32, + "acc_stderr": 0.046882617226215034, + "acc_norm": 0.32, + "acc_norm_stderr": 0.046882617226215034 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.6444444444444445, + "acc_stderr": 0.04135176749720385, + "acc_norm": 0.6444444444444445, + "acc_norm_stderr": 0.04135176749720385 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.7828947368421053, + "acc_stderr": 0.03355045304882924, + "acc_norm": 0.7828947368421053, + "acc_norm_stderr": 0.03355045304882924 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.72, + "acc_stderr": 0.04512608598542129, + "acc_norm": 0.72, + "acc_norm_stderr": 0.04512608598542129 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.7169811320754716, + "acc_stderr": 0.027724236492700918, + "acc_norm": 0.7169811320754716, + "acc_norm_stderr": 0.027724236492700918 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.8125, + "acc_stderr": 0.032639560491693344, + "acc_norm": 0.8125, + "acc_norm_stderr": 0.032639560491693344 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.5, + "acc_stderr": 0.050251890762960605, + "acc_norm": 0.5, + "acc_norm_stderr": 0.050251890762960605 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.54, + "acc_stderr": 0.05009082659620332, + "acc_norm": 0.54, + "acc_norm_stderr": 0.05009082659620332 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.37, + "acc_stderr": 0.048523658709391, + "acc_norm": 0.37, + "acc_norm_stderr": 0.048523658709391 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.653179190751445, + "acc_stderr": 0.036291466701596636, + "acc_norm": 0.653179190751445, + "acc_norm_stderr": 0.036291466701596636 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.38235294117647056, + "acc_stderr": 0.04835503696107223, + "acc_norm": 0.38235294117647056, + "acc_norm_stderr": 0.04835503696107223 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.76, + "acc_stderr": 0.042923469599092816, + "acc_norm": 0.76, + "acc_norm_stderr": 0.042923469599092816 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.6680851063829787, + "acc_stderr": 0.030783736757745653, + "acc_norm": 0.6680851063829787, + "acc_norm_stderr": 0.030783736757745653 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.4298245614035088, + "acc_stderr": 0.046570472605949625, + "acc_norm": 0.4298245614035088, + "acc_norm_stderr": 0.046570472605949625 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.5862068965517241, + "acc_stderr": 0.04104269211806232, + "acc_norm": 0.5862068965517241, + "acc_norm_stderr": 0.04104269211806232 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 
0.43915343915343913, + "acc_stderr": 0.02555992055053101, + "acc_norm": 0.43915343915343913, + "acc_norm_stderr": 0.02555992055053101 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.5158730158730159, + "acc_stderr": 0.044698818540726076, + "acc_norm": 0.5158730158730159, + "acc_norm_stderr": 0.044698818540726076 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.48, + "acc_stderr": 0.050211673156867795, + "acc_norm": 0.48, + "acc_norm_stderr": 0.050211673156867795 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.8064516129032258, + "acc_stderr": 0.022475258525536057, + "acc_norm": 0.8064516129032258, + "acc_norm_stderr": 0.022475258525536057 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.5320197044334976, + "acc_stderr": 0.035107665979592154, + "acc_norm": 0.5320197044334976, + "acc_norm_stderr": 0.035107665979592154 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.74, + "acc_stderr": 0.044084400227680794, + "acc_norm": 0.74, + "acc_norm_stderr": 0.044084400227680794 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.8242424242424242, + "acc_stderr": 0.02972094300622445, + "acc_norm": 0.8242424242424242, + "acc_norm_stderr": 0.02972094300622445 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.8787878787878788, + "acc_stderr": 0.02325315795194208, + "acc_norm": 0.8787878787878788, + "acc_norm_stderr": 0.02325315795194208 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.927461139896373, + "acc_stderr": 0.018718998520678178, + "acc_norm": 0.927461139896373, + "acc_norm_stderr": 0.018718998520678178 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.7025641025641025, + "acc_stderr": 0.023177408131465942, + "acc_norm": 0.7025641025641025, + "acc_norm_stderr": 0.023177408131465942 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.3037037037037037, + "acc_stderr": 0.028037929969114996, + "acc_norm": 0.3037037037037037, + "acc_norm_stderr": 0.028037929969114996 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.7478991596638656, + "acc_stderr": 0.028205545033277726, + "acc_norm": 0.7478991596638656, + "acc_norm_stderr": 0.028205545033277726 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.47019867549668876, + "acc_stderr": 0.040752249922169775, + "acc_norm": 0.47019867549668876, + "acc_norm_stderr": 0.040752249922169775 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.8880733944954129, + "acc_stderr": 0.013517352714958788, + "acc_norm": 0.8880733944954129, + "acc_norm_stderr": 0.013517352714958788 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.5370370370370371, + "acc_stderr": 0.03400603625538271, + "acc_norm": 0.5370370370370371, + "acc_norm_stderr": 0.03400603625538271 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.9117647058823529, + "acc_stderr": 0.01990739979131694, + "acc_norm": 0.9117647058823529, + "acc_norm_stderr": 0.01990739979131694 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.8607594936708861, + "acc_stderr": 0.022535526352692705, + "acc_norm": 0.8607594936708861, + "acc_norm_stderr": 0.022535526352692705 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.8071748878923767, + "acc_stderr": 0.026478240960489365, + "acc_norm": 0.8071748878923767, + "acc_norm_stderr": 0.026478240960489365 + }, + "harness|hendrycksTest-human_sexuality|5": { 
+ "acc": 0.8396946564885496, + "acc_stderr": 0.03217829420744633, + "acc_norm": 0.8396946564885496, + "acc_norm_stderr": 0.03217829420744633 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.8677685950413223, + "acc_stderr": 0.03092278832044579, + "acc_norm": 0.8677685950413223, + "acc_norm_stderr": 0.03092278832044579 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.8148148148148148, + "acc_stderr": 0.03755265865037181, + "acc_norm": 0.8148148148148148, + "acc_norm_stderr": 0.03755265865037181 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.803680981595092, + "acc_stderr": 0.031207970394709218, + "acc_norm": 0.803680981595092, + "acc_norm_stderr": 0.031207970394709218 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.5089285714285714, + "acc_stderr": 0.04745033255489122, + "acc_norm": 0.5089285714285714, + "acc_norm_stderr": 0.04745033255489122 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.8252427184466019, + "acc_stderr": 0.0376017800602662, + "acc_norm": 0.8252427184466019, + "acc_norm_stderr": 0.0376017800602662 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.9102564102564102, + "acc_stderr": 0.018724301741941635, + "acc_norm": 0.9102564102564102, + "acc_norm_stderr": 0.018724301741941635 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.72, + "acc_stderr": 0.04512608598542127, + "acc_norm": 0.72, + "acc_norm_stderr": 0.04512608598542127 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.8646232439335888, + "acc_stderr": 0.012234384586856491, + "acc_norm": 0.8646232439335888, + "acc_norm_stderr": 0.012234384586856491 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.791907514450867, + "acc_stderr": 0.021855255263421795, + "acc_norm": 0.791907514450867, + "acc_norm_stderr": 0.021855255263421795 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.49162011173184356, + "acc_stderr": 0.01672015279467255, + "acc_norm": 0.49162011173184356, + "acc_norm_stderr": 0.01672015279467255 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.7287581699346405, + "acc_stderr": 0.025457756696667874, + "acc_norm": 0.7287581699346405, + "acc_norm_stderr": 0.025457756696667874 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.7845659163987139, + "acc_stderr": 0.023350225475471442, + "acc_norm": 0.7845659163987139, + "acc_norm_stderr": 0.023350225475471442 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.808641975308642, + "acc_stderr": 0.021887704613396154, + "acc_norm": 0.808641975308642, + "acc_norm_stderr": 0.021887704613396154 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.5390070921985816, + "acc_stderr": 0.02973659252642444, + "acc_norm": 0.5390070921985816, + "acc_norm_stderr": 0.02973659252642444 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.5410691003911343, + "acc_stderr": 0.012727084826799807, + "acc_norm": 0.5410691003911343, + "acc_norm_stderr": 0.012727084826799807 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.6985294117647058, + "acc_stderr": 0.027875982114273168, + "acc_norm": 0.6985294117647058, + "acc_norm_stderr": 0.027875982114273168 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.7369281045751634, + "acc_stderr": 0.017812676542320657, + "acc_norm": 0.7369281045751634, + "acc_norm_stderr": 0.017812676542320657 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.7181818181818181, + "acc_stderr": 0.043091187099464585, + "acc_norm": 
0.7181818181818181, + "acc_norm_stderr": 0.043091187099464585 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.7959183673469388, + "acc_stderr": 0.025801283475090496, + "acc_norm": 0.7959183673469388, + "acc_norm_stderr": 0.025801283475090496 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.8557213930348259, + "acc_stderr": 0.02484575321230604, + "acc_norm": 0.8557213930348259, + "acc_norm_stderr": 0.02484575321230604 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.91, + "acc_stderr": 0.028762349126466125, + "acc_norm": 0.91, + "acc_norm_stderr": 0.028762349126466125 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.5542168674698795, + "acc_stderr": 0.03869543323472101, + "acc_norm": 0.5542168674698795, + "acc_norm_stderr": 0.03869543323472101 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.8538011695906432, + "acc_stderr": 0.027097290118070796, + "acc_norm": 0.8538011695906432, + "acc_norm_stderr": 0.027097290118070796 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.412484700122399, + "mc1_stderr": 0.017233299399571217, + "mc2": 0.5769082726326263, + "mc2_stderr": 0.014903392770102011 + }, + "all": { + "acc": 0.6874964217832943, + "acc_stderr": 0.03135304734670862, + "acc_norm": 0.6913041638648051, + "acc_norm_stderr": 0.031323841726305535, + "mc1": 0.412484700122399, + "mc1_stderr": 0.017233299399571217, + "mc2": 0.5769082726326263, + "mc2_stderr": 0.014903392770102011 + } + }, + "versions": { + "harness|arc:challenge|25": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + 
"harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "all": 0 + }, + "config_tasks": { + "harness|arc:challenge": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM 
Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task" + }, + "summary_tasks": { + "harness|arc:challenge|25": { + "hashes": { + "hash_examples": "17b0cae357c0259e", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "3722289b79076c44", + "hash_cont_tokens": "e8abf848493b50f7" + }, + "truncated": 0, + "non-truncated": 4687, + "padded": 4687, + "non-padded": 0, + "effective_few_shots": 25.0, + "num_truncated_few_shots": 0 + }, + "harness|hellaswag|10": { + "hashes": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "ececd684171f1ef2", + "hash_cont_tokens": "9fe0a5c42e1532db" + }, + "truncated": 0, + "non-truncated": 40168, + "padded": 40113, + "non-padded": 55, + "effective_few_shots": 10.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hashes": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "c54ff61ad0273dd7", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-anatomy|5": { + "hashes": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "be31a1e22aef5f90", + "hash_cont_tokens": "f11971a765cb609f" + }, + "truncated": 0, + "non-truncated": 540, + "padded": 540, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-astronomy|5": { + "hashes": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "277a7b1fad566940", + "hash_cont_tokens": "440a970fadecdc7b" + }, + "truncated": 0, + "non-truncated": 608, + "padded": 608, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-business_ethics|5": { + "hashes": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "ba552605bc116de5", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hashes": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": 
"49654f71d94b65c3", + "hash_input_tokens": "428c7563d0b98ab9", + "hash_cont_tokens": "7ecd60c25b9bfe5b" + }, + "truncated": 0, + "non-truncated": 1060, + "padded": 1060, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_biology|5": { + "hashes": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "da036601573942e2", + "hash_cont_tokens": "875cde3af7a0ee14" + }, + "truncated": 0, + "non-truncated": 576, + "padded": 576, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_chemistry|5": { + "hashes": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "94e0196d6aded13d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_computer_science|5": { + "hashes": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "6e4d0f4a8d36690b", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_mathematics|5": { + "hashes": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "614054d17109a25d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_medicine|5": { + "hashes": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "081bb2b524defd1c", + "hash_cont_tokens": "702fb6d82ff0d6ac" + }, + "truncated": 0, + "non-truncated": 692, + "padded": 692, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_physics|5": { + "hashes": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "5421d9a1af86cbd4", + "hash_cont_tokens": "f7b8097afc16a47c" + }, + "truncated": 0, + "non-truncated": 408, + "padded": 408, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-computer_security|5": { + "hashes": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "5e6b70ecb333cf18", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hashes": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "c2ef11a87264ceed", + "hash_cont_tokens": "aa0e8bc655f2f641" + }, + "truncated": 0, + "non-truncated": 940, + "padded": 940, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-econometrics|5": { + "hashes": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "ecaccd912a4c3978", + "hash_cont_tokens": "b1cc6e7e9fcd3827" + }, + "truncated": 0, + "non-truncated": 456, 
+ "padded": 456, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hashes": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "1590c84291399be8", + "hash_cont_tokens": "2425a3f084a591ef" + }, + "truncated": 0, + "non-truncated": 580, + "padded": 580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hashes": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "3269597f715b0da1", + "hash_cont_tokens": "bd87bf0c060fd925" + }, + "truncated": 0, + "non-truncated": 1512, + "padded": 1512, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-formal_logic|5": { + "hashes": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "a2800d20f3ab8d7c", + "hash_cont_tokens": "eb8932890e0605db" + }, + "truncated": 0, + "non-truncated": 504, + "padded": 504, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-global_facts|5": { + "hashes": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "94ed44b3772505ad", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_biology|5": { + "hashes": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "24423acb928db768", + "hash_cont_tokens": "1ddcb86d28cde266" + }, + "truncated": 0, + "non-truncated": 1240, + "padded": 1240, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hashes": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "831ff35c474e5cef", + "hash_cont_tokens": "176c8dcff38c5f8f" + }, + "truncated": 0, + "non-truncated": 812, + "padded": 812, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hashes": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "a20a96b44dcc5b30", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hashes": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "5002f4ac8b1562ca", + "hash_cont_tokens": "674fc454bdc5ac93" + }, + "truncated": 0, + "non-truncated": 660, + "padded": 656, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_geography|5": { + "hashes": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "7c5547c7da5bc793", + "hash_cont_tokens": "03a5012b916274ea" + }, + "truncated": 0, + "non-truncated": 792, + "padded": 792, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + 
"harness|hendrycksTest-high_school_government_and_politics|5": { + "hashes": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "f62991cb6a496b05", + "hash_cont_tokens": "873d2aab226ba1d8" + }, + "truncated": 0, + "non-truncated": 772, + "padded": 772, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hashes": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "4cef2aff6e3d59ed", + "hash_cont_tokens": "c583432ad27fcfe0" + }, + "truncated": 0, + "non-truncated": 1560, + "padded": 1560, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hashes": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "6e2577ea4082ed2b", + "hash_cont_tokens": "d7907b61bcb8c123" + }, + "truncated": 0, + "non-truncated": 1080, + "padded": 1080, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hashes": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "c5fc9aeb1079c8e4", + "hash_cont_tokens": "f47f041de50333b9" + }, + "truncated": 0, + "non-truncated": 952, + "padded": 952, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_physics|5": { + "hashes": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "555fc385cffa84ca", + "hash_cont_tokens": "0d56317b3e5eedb5" + }, + "truncated": 0, + "non-truncated": 604, + "padded": 604, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hashes": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "febd23cbf9973b7f", + "hash_cont_tokens": "09ba1243e7390c0f" + }, + "truncated": 0, + "non-truncated": 2180, + "padded": 2180, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hashes": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "400e55b56ee6fbd7", + "hash_cont_tokens": "9cc29889c3d3f77d" + }, + "truncated": 0, + "non-truncated": 864, + "padded": 864, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hashes": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "c639cce12a46ebad", + "hash_cont_tokens": "cdd0b3dc06d933e5" + }, + "truncated": 0, + "non-truncated": 816, + "padded": 816, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hashes": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "b9762065cce6f3a6", + "hash_cont_tokens": "e02816433ff28daf" + }, + "truncated": 0, + "non-truncated": 948, + "padded": 948, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_aging|5": { + "hashes": { + "hash_examples": 
"1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "541a75f071dcf579", + "hash_cont_tokens": "142a4a8a1138a214" + }, + "truncated": 0, + "non-truncated": 892, + "padded": 892, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_sexuality|5": { + "hashes": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "04269e5c5a257dd9", + "hash_cont_tokens": "bc54813e809b796d" + }, + "truncated": 0, + "non-truncated": 524, + "padded": 524, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-international_law|5": { + "hashes": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "d93ba9d9d38e4397", + "hash_cont_tokens": "8ea8c5ff76a15bca" + }, + "truncated": 0, + "non-truncated": 484, + "padded": 484, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-jurisprudence|5": { + "hashes": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "9eeaccd2698b4f5a", + "hash_cont_tokens": "e3a8cd951b6e3469" + }, + "truncated": 0, + "non-truncated": 432, + "padded": 432, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hashes": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "b4f08f544f2b7576", + "hash_cont_tokens": "3e9e0bdc248fd88a" + }, + "truncated": 0, + "non-truncated": 652, + "padded": 648, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-machine_learning|5": { + "hashes": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "900c2a51f1174b9f", + "hash_cont_tokens": "55b12fb138c6a064" + }, + "truncated": 0, + "non-truncated": 448, + "padded": 448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-management|5": { + "hashes": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "6b36efb4689c6eca", + "hash_cont_tokens": "a01d6d39a83c4597" + }, + "truncated": 0, + "non-truncated": 412, + "padded": 412, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-marketing|5": { + "hashes": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "2aaac78a0cfed47a", + "hash_cont_tokens": "6aeaed4d823c98aa" + }, + "truncated": 0, + "non-truncated": 936, + "padded": 936, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-medical_genetics|5": { + "hashes": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "886ca823b41c094a", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-miscellaneous|5": { + "hashes": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "72fd71de7675e7d0", + "hash_cont_tokens": "9b0ab02a64603081" + }, + "truncated": 0, + 
"non-truncated": 3132, + "padded": 3132, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_disputes|5": { + "hashes": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "f3ca0dd8e7a1eb09", + "hash_cont_tokens": "3b8bbe9108e55ce9" + }, + "truncated": 0, + "non-truncated": 1384, + "padded": 1354, + "non-padded": 30, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hashes": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "3e793631e951f23c", + "hash_cont_tokens": "3e9bfc0362e97330" + }, + "truncated": 0, + "non-truncated": 3580, + "padded": 3580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-nutrition|5": { + "hashes": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "59753c2144ea93af", + "hash_cont_tokens": "23b2dc6ee2da4cfc" + }, + "truncated": 0, + "non-truncated": 1224, + "padded": 1224, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-philosophy|5": { + "hashes": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "bd8d3dbed15a8c34", + "hash_cont_tokens": "9f6ff69d23a48783" + }, + "truncated": 0, + "non-truncated": 1244, + "padded": 1244, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-prehistory|5": { + "hashes": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "3573cd87facbb7c5", + "hash_cont_tokens": "d6458d743d875837" + }, + "truncated": 0, + "non-truncated": 1296, + "padded": 1296, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_accounting|5": { + "hashes": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "17e721bc1a7cbb47", + "hash_cont_tokens": "922a195f53a35662" + }, + "truncated": 0, + "non-truncated": 1128, + "padded": 1128, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_law|5": { + "hashes": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "c9f7583fff66d361", + "hash_cont_tokens": "2e590029ef41fbcd" + }, + "truncated": 0, + "non-truncated": 6136, + "padded": 6136, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_medicine|5": { + "hashes": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "40a933f829116f8d", + "hash_cont_tokens": "7cfee54dbddd5a98" + }, + "truncated": 0, + "non-truncated": 1088, + "padded": 1088, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_psychology|5": { + "hashes": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "0dfb73a8eb3f692c", + "hash_cont_tokens": "a86677b2a45c20e1" + }, + "truncated": 0, + "non-truncated": 2448, + "padded": 2448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + 
"harness|hendrycksTest-public_relations|5": { + "hashes": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "1710c6ba4c9f3cbd", + "hash_cont_tokens": "0d756ccaae031757" + }, + "truncated": 0, + "non-truncated": 440, + "padded": 440, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-security_studies|5": { + "hashes": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "32a03f1f22a6e103", + "hash_cont_tokens": "b2229bc2cfbf594b" + }, + "truncated": 0, + "non-truncated": 980, + "padded": 980, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-sociology|5": { + "hashes": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "828999f7624cbe7e", + "hash_cont_tokens": "c3a3bdfd177eed5b" + }, + "truncated": 0, + "non-truncated": 804, + "padded": 804, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hashes": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "42054621e718dbee", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-virology|5": { + "hashes": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "6c4f0aa4dc859c04", + "hash_cont_tokens": "af8b3658088cb37f" + }, + "truncated": 0, + "non-truncated": 664, + "padded": 664, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-world_religions|5": { + "hashes": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "6c75d44e092ff24f", + "hash_cont_tokens": "060118bef6de4e0a" + }, + "truncated": 0, + "non-truncated": 684, + "padded": 684, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|truthfulqa:mc|0": { + "hashes": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "2738d7ed7075faa7", + "hash_cont_tokens": "f5da56a132aab151" + }, + "truncated": 0, + "non-truncated": 9996, + "padded": 9996, + "non-padded": 0, + "effective_few_shots": 0.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "d84d18e9a963753d", + "hash_full_prompts": "12b540783521a8e6", + "hash_input_tokens": "5c73a7dce6ccf737", + "hash_cont_tokens": "71d56183130fecbd" + }, + "total_evaluation_time_secondes": "43885.92614364624", + "truncated": 0, + "non-truncated": 111019, + "padded": 110926, + "non-padded": 93, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/migtissera/Synthia-70B-v1.2b/results_2023-10-24T18-54-59.551883.json b/eval-results/migtissera/Synthia-70B-v1.2b/results_2023-10-24T18-54-59.551883.json new file mode 100644 index 0000000000000000000000000000000000000000..bc379cbad80282a69468f5fc105a3f16a9487ac6 --- /dev/null +++ b/eval-results/migtissera/Synthia-70B-v1.2b/results_2023-10-24T18-54-59.551883.json @@ -0,0 +1,107 @@ +{ + "config_general": { + "model_name": "migtissera/Synthia-70B-v1.2b", + "model_sha": 
"5af92a0ed2136f96e24a31ac8b76932d1868d454", + "model_size": "128.64 GB", + "model_dtype": "torch.float16", + "lighteval_sha": "0f318ecf002208468154899217b3ba7c6ae09374", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|drop|3": { + "em": 0.44190436241610737, + "em_stderr": 0.00508578632439048, + "f1": 0.5040551593959751, + "f1_stderr": 0.00484284160320387 + }, + "harness|gsm8k|5": { + "acc": 0.3525398028809704, + "acc_stderr": 0.013159909755930321 + }, + "harness|winogrande|5": { + "acc": 0.8389897395422258, + "acc_stderr": 0.010329712832785717 + }, + "all": { + "em": 0.44190436241610737, + "em_stderr": 0.00508578632439048, + "f1": 0.5040551593959751, + "f1_stderr": 0.00484284160320387, + "acc": 0.5957647712115981, + "acc_stderr": 0.011744811294358018 + } + }, + "versions": { + "harness|drop|3": 1, + "harness|gsm8k|5": 0, + "harness|winogrande|5": 0, + "all": 0 + }, + "config_tasks": { + "harness|drop": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|drop|3": { + "hashes": { + "hash_examples": "1d27416e8324e9a3", + "hash_full_prompts": "a5513ff9a741b385", + "hash_input_tokens": "42076f0efbb50aa6", + "hash_cont_tokens": "1b33297f43d39231" + }, + "truncated": 3, + "non-truncated": 9533, + "padded": 0, + "non-padded": 9536, + "effective_few_shots": 3.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "bda342e47b5099b2", + "hash_cont_tokens": "5e9c855c3040ab8e" + }, + "truncated": 0, + "non-truncated": 1319, + "padded": 0, + "non-padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "c0bedf98cb040854", + "hash_cont_tokens": "f08975ad6f2d5864" + }, + "truncated": 0, + "non-truncated": 2534, + "padded": 2432, + "non-padded": 102, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "9b4d8993161e637d", + "hash_full_prompts": "08215e527b7e60a5", + "hash_input_tokens": "a12f3e3c934bd78b", + "hash_cont_tokens": "f3c468601910e9a6" + }, + "total_evaluation_time_secondes": "35365.17928361893", + "truncated": 3, + "non-truncated": 13386, + "padded": 2432, + "non-padded": 10957, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/migtissera/Synthia-70B/results_2023-08-23T05-19-54.133935.json b/eval-results/migtissera/Synthia-70B/results_2023-08-23T05-19-54.133935.json new file mode 100644 index 0000000000000000000000000000000000000000..78ded52061f6b33e5be486df937f417f4682f445 --- /dev/null +++ b/eval-results/migtissera/Synthia-70B/results_2023-08-23T05-19-54.133935.json @@ -0,0 +1,1365 @@ +{ + "results": { + "harness|arc:challenge|25": { + "acc": 0.658703071672355, + "acc_stderr": 0.013855831287497723, + "acc_norm": 0.6945392491467577, + "acc_norm_stderr": 0.013460080478002503 + }, + "harness|hellaswag|10": { + "acc": 0.6826329416450906, + "acc_stderr": 0.004645003662067883, + "acc_norm": 0.8711412069308903, + "acc_norm_stderr": 0.003343588514866123 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.31, + "acc_stderr": 0.04648231987117316, + "acc_norm": 0.31, + "acc_norm_stderr": 
0.04648231987117316 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.6222222222222222, + "acc_stderr": 0.04188307537595853, + "acc_norm": 0.6222222222222222, + "acc_norm_stderr": 0.04188307537595853 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.7960526315789473, + "acc_stderr": 0.0327900040631005, + "acc_norm": 0.7960526315789473, + "acc_norm_stderr": 0.0327900040631005 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.75, + "acc_stderr": 0.04351941398892446, + "acc_norm": 0.75, + "acc_norm_stderr": 0.04351941398892446 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.720754716981132, + "acc_stderr": 0.027611163402399715, + "acc_norm": 0.720754716981132, + "acc_norm_stderr": 0.027611163402399715 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.8055555555555556, + "acc_stderr": 0.03309615177059007, + "acc_norm": 0.8055555555555556, + "acc_norm_stderr": 0.03309615177059007 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.48, + "acc_stderr": 0.050211673156867795, + "acc_norm": 0.48, + "acc_norm_stderr": 0.050211673156867795 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.55, + "acc_stderr": 0.05, + "acc_norm": 0.55, + "acc_norm_stderr": 0.05 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.44, + "acc_stderr": 0.04988876515698589, + "acc_norm": 0.44, + "acc_norm_stderr": 0.04988876515698589 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.6242774566473989, + "acc_stderr": 0.03692820767264866, + "acc_norm": 0.6242774566473989, + "acc_norm_stderr": 0.03692820767264866 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.3627450980392157, + "acc_stderr": 0.04784060704105654, + "acc_norm": 0.3627450980392157, + "acc_norm_stderr": 0.04784060704105654 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.76, + "acc_stderr": 0.042923469599092816, + "acc_norm": 0.76, + "acc_norm_stderr": 0.042923469599092816 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.6468085106382979, + "acc_stderr": 0.031245325202761926, + "acc_norm": 0.6468085106382979, + "acc_norm_stderr": 0.031245325202761926 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.45614035087719296, + "acc_stderr": 0.04685473041907789, + "acc_norm": 0.45614035087719296, + "acc_norm_stderr": 0.04685473041907789 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.593103448275862, + "acc_stderr": 0.04093793981266237, + "acc_norm": 0.593103448275862, + "acc_norm_stderr": 0.04093793981266237 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.43915343915343913, + "acc_stderr": 0.02555992055053101, + "acc_norm": 0.43915343915343913, + "acc_norm_stderr": 0.02555992055053101 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.46825396825396826, + "acc_stderr": 0.04463112720677173, + "acc_norm": 0.46825396825396826, + "acc_norm_stderr": 0.04463112720677173 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.48, + "acc_stderr": 0.050211673156867795, + "acc_norm": 0.48, + "acc_norm_stderr": 0.050211673156867795 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.8064516129032258, + "acc_stderr": 0.022475258525536057, + "acc_norm": 0.8064516129032258, + "acc_norm_stderr": 0.022475258525536057 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.5172413793103449, + "acc_stderr": 0.035158955511656986, + "acc_norm": 0.5172413793103449, + "acc_norm_stderr": 0.035158955511656986 + }, + 
"harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.77, + "acc_stderr": 0.04229525846816506, + "acc_norm": 0.77, + "acc_norm_stderr": 0.04229525846816506 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.8181818181818182, + "acc_stderr": 0.030117688929503592, + "acc_norm": 0.8181818181818182, + "acc_norm_stderr": 0.030117688929503592 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.8585858585858586, + "acc_stderr": 0.024825909793343336, + "acc_norm": 0.8585858585858586, + "acc_norm_stderr": 0.024825909793343336 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.927461139896373, + "acc_stderr": 0.018718998520678175, + "acc_norm": 0.927461139896373, + "acc_norm_stderr": 0.018718998520678175 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.6871794871794872, + "acc_stderr": 0.023507579020645365, + "acc_norm": 0.6871794871794872, + "acc_norm_stderr": 0.023507579020645365 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.34814814814814815, + "acc_stderr": 0.02904560029061626, + "acc_norm": 0.34814814814814815, + "acc_norm_stderr": 0.02904560029061626 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.7563025210084033, + "acc_stderr": 0.027886828078380575, + "acc_norm": 0.7563025210084033, + "acc_norm_stderr": 0.027886828078380575 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.4768211920529801, + "acc_stderr": 0.04078093859163083, + "acc_norm": 0.4768211920529801, + "acc_norm_stderr": 0.04078093859163083 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.8880733944954129, + "acc_stderr": 0.013517352714958788, + "acc_norm": 0.8880733944954129, + "acc_norm_stderr": 0.013517352714958788 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.5833333333333334, + "acc_stderr": 0.033622774366080424, + "acc_norm": 0.5833333333333334, + "acc_norm_stderr": 0.033622774366080424 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.9166666666666666, + "acc_stderr": 0.019398452135813905, + "acc_norm": 0.9166666666666666, + "acc_norm_stderr": 0.019398452135813905 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.869198312236287, + "acc_stderr": 0.02194876605947076, + "acc_norm": 0.869198312236287, + "acc_norm_stderr": 0.02194876605947076 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.8161434977578476, + "acc_stderr": 0.025998379092356513, + "acc_norm": 0.8161434977578476, + "acc_norm_stderr": 0.025998379092356513 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.8244274809160306, + "acc_stderr": 0.03336820338476078, + "acc_norm": 0.8244274809160306, + "acc_norm_stderr": 0.03336820338476078 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.8512396694214877, + "acc_stderr": 0.03248470083807194, + "acc_norm": 0.8512396694214877, + "acc_norm_stderr": 0.03248470083807194 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.8148148148148148, + "acc_stderr": 0.03755265865037181, + "acc_norm": 0.8148148148148148, + "acc_norm_stderr": 0.03755265865037181 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.7975460122699386, + "acc_stderr": 0.03157065078911901, + "acc_norm": 0.7975460122699386, + "acc_norm_stderr": 0.03157065078911901 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.5535714285714286, + "acc_stderr": 0.04718471485219588, + "acc_norm": 0.5535714285714286, + "acc_norm_stderr": 
0.04718471485219588 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.8058252427184466, + "acc_stderr": 0.03916667762822583, + "acc_norm": 0.8058252427184466, + "acc_norm_stderr": 0.03916667762822583 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.9188034188034188, + "acc_stderr": 0.017893784904018533, + "acc_norm": 0.9188034188034188, + "acc_norm_stderr": 0.017893784904018533 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.74, + "acc_stderr": 0.04408440022768078, + "acc_norm": 0.74, + "acc_norm_stderr": 0.04408440022768078 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.8722860791826309, + "acc_stderr": 0.011935626313999878, + "acc_norm": 0.8722860791826309, + "acc_norm_stderr": 0.011935626313999878 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.7716763005780347, + "acc_stderr": 0.022598703804321628, + "acc_norm": 0.7716763005780347, + "acc_norm_stderr": 0.022598703804321628 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.43910614525139663, + "acc_stderr": 0.016598022120580428, + "acc_norm": 0.43910614525139663, + "acc_norm_stderr": 0.016598022120580428 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.7450980392156863, + "acc_stderr": 0.024954184324879905, + "acc_norm": 0.7450980392156863, + "acc_norm_stderr": 0.024954184324879905 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.7588424437299035, + "acc_stderr": 0.024296594034763426, + "acc_norm": 0.7588424437299035, + "acc_norm_stderr": 0.024296594034763426 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.808641975308642, + "acc_stderr": 0.02188770461339615, + "acc_norm": 0.808641975308642, + "acc_norm_stderr": 0.02188770461339615 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.5425531914893617, + "acc_stderr": 0.029719281272236837, + "acc_norm": 0.5425531914893617, + "acc_norm_stderr": 0.029719281272236837 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.5528031290743155, + "acc_stderr": 0.012698825252435117, + "acc_norm": 0.5528031290743155, + "acc_norm_stderr": 0.012698825252435117 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.7132352941176471, + "acc_stderr": 0.027472274473233815, + "acc_norm": 0.7132352941176471, + "acc_norm_stderr": 0.027472274473233815 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.7516339869281046, + "acc_stderr": 0.017479487001364764, + "acc_norm": 0.7516339869281046, + "acc_norm_stderr": 0.017479487001364764 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.7363636363636363, + "acc_stderr": 0.04220224692971987, + "acc_norm": 0.7363636363636363, + "acc_norm_stderr": 0.04220224692971987 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.7795918367346939, + "acc_stderr": 0.026537045312145298, + "acc_norm": 0.7795918367346939, + "acc_norm_stderr": 0.026537045312145298 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.8855721393034826, + "acc_stderr": 0.022509345325101716, + "acc_norm": 0.8855721393034826, + "acc_norm_stderr": 0.022509345325101716 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.88, + "acc_stderr": 0.03265986323710906, + "acc_norm": 0.88, + "acc_norm_stderr": 0.03265986323710906 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.5301204819277109, + "acc_stderr": 0.03885425420866767, + "acc_norm": 0.5301204819277109, + "acc_norm_stderr": 0.03885425420866767 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.8596491228070176, + "acc_stderr": 
0.0266405825391332, + "acc_norm": 0.8596491228070176, + "acc_norm_stderr": 0.0266405825391332 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.43451652386780903, + "mc1_stderr": 0.017352738749259564, + "mc2": 0.5978847833710849, + "mc2_stderr": 0.014931476744751782 + }, + "all": { + "acc": 0.6884676888614354, + "acc_stderr": 0.03140279617853231, + "acc_norm": 0.6922701370438118, + "acc_norm_stderr": 0.03137403065384253, + "mc1": 0.43451652386780903, + "mc1_stderr": 0.017352738749259564, + "mc2": 0.5978847833710849, + "mc2_stderr": 0.014931476744751782 + } + }, + "versions": { + "harness|arc:challenge|25": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "all": 0 + }, 
+ "config_general": { + "model_name": "migtissera/Synthia-70B", + "model_sha": "d63dfdd0baed756981f5f78f7419fd822c572362", + "model_dtype": "torch.float16", + "lighteval_sha": "2d7f9b0219a3536f201c55d7e8126251127b731c", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null + }, + "config_tasks": { + "harness|arc:challenge": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + 
"harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task" + }, + "summary_tasks": { + "harness|arc:challenge|25": { + "hashes": { + "hash_examples": "17b0cae357c0259e", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "61571bf68d6d89aa", + "hash_cont_tokens": "ede2b335438f08e9" + }, + "truncated": 0, + "non-truncated": 4687, + "padded": 4687, + "non-padded": 0, + "effective_few_shots": 25.0, + "num_truncated_few_shots": 0 + }, + "harness|hellaswag|10": { + "hashes": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "29906669b1c7054a", + "hash_cont_tokens": "b41cf1ad182d68d5" + }, + "truncated": 0, + "non-truncated": 40168, + "padded": 40113, + "non-padded": 55, + "effective_few_shots": 10.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hashes": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "c54ff61ad0273dd7", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-anatomy|5": { + "hashes": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "be31a1e22aef5f90", + "hash_cont_tokens": "f11971a765cb609f" + }, + "truncated": 0, + "non-truncated": 540, + "padded": 540, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-astronomy|5": { + "hashes": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "277a7b1fad566940", + "hash_cont_tokens": "238bd86950544b29" + }, + "truncated": 0, + "non-truncated": 608, + "padded": 608, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-business_ethics|5": { + "hashes": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "ba552605bc116de5", + "hash_cont_tokens": "f9d6d2a7d7e9a041" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hashes": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "428c7563d0b98ab9", + "hash_cont_tokens": "6af58623d0d5fbcd" + }, + "truncated": 0, + "non-truncated": 1060, + "padded": 1060, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_biology|5": { + "hashes": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "da036601573942e2", + "hash_cont_tokens": "875cde3af7a0ee14" + }, + "truncated": 0, + "non-truncated": 576, + "padded": 576, + "non-padded": 0, + 
"effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_chemistry|5": { + "hashes": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "94e0196d6aded13d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_computer_science|5": { + "hashes": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "6e4d0f4a8d36690b", + "hash_cont_tokens": "1ba0c71186b1505e" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_mathematics|5": { + "hashes": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "614054d17109a25d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_medicine|5": { + "hashes": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "1d633b3cc0524ba8", + "hash_cont_tokens": "702fb6d82ff0d6ac" + }, + "truncated": 0, + "non-truncated": 692, + "padded": 692, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_physics|5": { + "hashes": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "5421d9a1af86cbd4", + "hash_cont_tokens": "f7b8097afc16a47c" + }, + "truncated": 0, + "non-truncated": 408, + "padded": 408, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-computer_security|5": { + "hashes": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "5e6b70ecb333cf18", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hashes": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "c2ef11a87264ceed", + "hash_cont_tokens": "aa0e8bc655f2f641" + }, + "truncated": 0, + "non-truncated": 940, + "padded": 940, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-econometrics|5": { + "hashes": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "ecaccd912a4c3978", + "hash_cont_tokens": "a9b1f761089f6acc" + }, + "truncated": 0, + "non-truncated": 456, + "padded": 456, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hashes": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "1590c84291399be8", + "hash_cont_tokens": "2425a3f084a591ef" + }, + "truncated": 0, + "non-truncated": 580, + "padded": 580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hashes": { + 
"hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "3269597f715b0da1", + "hash_cont_tokens": "eb2d5002052b5bc5" + }, + "truncated": 0, + "non-truncated": 1512, + "padded": 1512, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-formal_logic|5": { + "hashes": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "a2800d20f3ab8d7c", + "hash_cont_tokens": "9b30dc19c9b62f60" + }, + "truncated": 0, + "non-truncated": 504, + "padded": 504, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-global_facts|5": { + "hashes": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "94ed44b3772505ad", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_biology|5": { + "hashes": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "24423acb928db768", + "hash_cont_tokens": "74217a4e2868536f" + }, + "truncated": 0, + "non-truncated": 1240, + "padded": 1240, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hashes": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "831ff35c474e5cef", + "hash_cont_tokens": "bf39544be0ebf000" + }, + "truncated": 0, + "non-truncated": 812, + "padded": 812, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hashes": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "8c34e0f2bda77358", + "hash_cont_tokens": "43570b3948564b64" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hashes": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "f1f73dd687da18d7", + "hash_cont_tokens": "674fc454bdc5ac93" + }, + "truncated": 660, + "non-truncated": 0, + "padded": 0, + "non-padded": 660, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_geography|5": { + "hashes": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "7c5547c7da5bc793", + "hash_cont_tokens": "03a5012b916274ea" + }, + "truncated": 0, + "non-truncated": 792, + "padded": 792, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hashes": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "f62991cb6a496b05", + "hash_cont_tokens": "50ab225c2f535210" + }, + "truncated": 0, + "non-truncated": 772, + "padded": 772, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hashes": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + 
"hash_input_tokens": "4cef2aff6e3d59ed", + "hash_cont_tokens": "c583432ad27fcfe0" + }, + "truncated": 0, + "non-truncated": 1560, + "padded": 1560, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hashes": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "6e2577ea4082ed2b", + "hash_cont_tokens": "1194078d4e38c984" + }, + "truncated": 0, + "non-truncated": 1080, + "padded": 1080, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hashes": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "c5fc9aeb1079c8e4", + "hash_cont_tokens": "f47f041de50333b9" + }, + "truncated": 0, + "non-truncated": 952, + "padded": 952, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_physics|5": { + "hashes": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "555fc385cffa84ca", + "hash_cont_tokens": "6296151cf7fee15c" + }, + "truncated": 0, + "non-truncated": 604, + "padded": 604, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hashes": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "febd23cbf9973b7f", + "hash_cont_tokens": "a490d3db0ea5935a" + }, + "truncated": 0, + "non-truncated": 2180, + "padded": 2180, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hashes": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "424b02981230ee83", + "hash_cont_tokens": "6830ef7d0325d7ef" + }, + "truncated": 0, + "non-truncated": 864, + "padded": 864, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hashes": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "50c9ff438c85a69e", + "hash_cont_tokens": "cdd0b3dc06d933e5" + }, + "truncated": 816, + "non-truncated": 0, + "padded": 0, + "non-padded": 816, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hashes": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "054824cc474caef5", + "hash_cont_tokens": "e0203e3fc1bb0500" + }, + "truncated": 8, + "non-truncated": 940, + "padded": 940, + "non-padded": 8, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_aging|5": { + "hashes": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "541a75f071dcf579", + "hash_cont_tokens": "142a4a8a1138a214" + }, + "truncated": 0, + "non-truncated": 892, + "padded": 892, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_sexuality|5": { + "hashes": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "04269e5c5a257dd9", + "hash_cont_tokens": "bc54813e809b796d" + }, + "truncated": 0, + 
"non-truncated": 524, + "padded": 524, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-international_law|5": { + "hashes": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "d93ba9d9d38e4397", + "hash_cont_tokens": "63435df622d5437b" + }, + "truncated": 0, + "non-truncated": 484, + "padded": 484, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-jurisprudence|5": { + "hashes": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "9eeaccd2698b4f5a", + "hash_cont_tokens": "e3a8cd951b6e3469" + }, + "truncated": 0, + "non-truncated": 432, + "padded": 432, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hashes": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "b4f08f544f2b7576", + "hash_cont_tokens": "5e6ee2ff0404f23c" + }, + "truncated": 0, + "non-truncated": 652, + "padded": 648, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-machine_learning|5": { + "hashes": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "900c2a51f1174b9f", + "hash_cont_tokens": "c81919424db3b267" + }, + "truncated": 0, + "non-truncated": 448, + "padded": 448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-management|5": { + "hashes": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "6b36efb4689c6eca", + "hash_cont_tokens": "a01d6d39a83c4597" + }, + "truncated": 0, + "non-truncated": 412, + "padded": 412, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-marketing|5": { + "hashes": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "2aaac78a0cfed47a", + "hash_cont_tokens": "6aeaed4d823c98aa" + }, + "truncated": 0, + "non-truncated": 936, + "padded": 936, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-medical_genetics|5": { + "hashes": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "886ca823b41c094a", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-miscellaneous|5": { + "hashes": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "72fd71de7675e7d0", + "hash_cont_tokens": "9b0ab02a64603081" + }, + "truncated": 0, + "non-truncated": 3132, + "padded": 3132, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_disputes|5": { + "hashes": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "f3ca0dd8e7a1eb09", + "hash_cont_tokens": "3b8bbe9108e55ce9" + }, + "truncated": 0, + "non-truncated": 1384, + "padded": 1354, + "non-padded": 30, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_scenarios|5": { + 
"hashes": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "3e793631e951f23c", + "hash_cont_tokens": "2eae753a177d5460" + }, + "truncated": 0, + "non-truncated": 3580, + "padded": 3580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-nutrition|5": { + "hashes": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "59753c2144ea93af", + "hash_cont_tokens": "29771089bd3c65c6" + }, + "truncated": 0, + "non-truncated": 1224, + "padded": 1224, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-philosophy|5": { + "hashes": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "bd8d3dbed15a8c34", + "hash_cont_tokens": "9f6ff69d23a48783" + }, + "truncated": 0, + "non-truncated": 1244, + "padded": 1244, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-prehistory|5": { + "hashes": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "3573cd87facbb7c5", + "hash_cont_tokens": "a789a13af22308bf" + }, + "truncated": 0, + "non-truncated": 1296, + "padded": 1296, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_accounting|5": { + "hashes": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "17e721bc1a7cbb47", + "hash_cont_tokens": "5129a9cfb30c5239" + }, + "truncated": 0, + "non-truncated": 1128, + "padded": 1128, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_law|5": { + "hashes": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "9178e10bd0763ec4", + "hash_cont_tokens": "2e590029ef41fbcd" + }, + "truncated": 604, + "non-truncated": 5532, + "padded": 5524, + "non-padded": 612, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_medicine|5": { + "hashes": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "f5a22012a54f70ea", + "hash_cont_tokens": "cd82e108370cece8" + }, + "truncated": 0, + "non-truncated": 1088, + "padded": 1088, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_psychology|5": { + "hashes": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "0dfb73a8eb3f692c", + "hash_cont_tokens": "61ef0c8a87f9c92d" + }, + "truncated": 0, + "non-truncated": 2448, + "padded": 2448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-public_relations|5": { + "hashes": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "1710c6ba4c9f3cbd", + "hash_cont_tokens": "568f585a259965c1" + }, + "truncated": 0, + "non-truncated": 440, + "padded": 440, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-security_studies|5": { + "hashes": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "d49711415961ced7", + 
"hash_cont_tokens": "d70cfe096d4fb7bd" + }, + "truncated": 0, + "non-truncated": 980, + "padded": 980, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-sociology|5": { + "hashes": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "828999f7624cbe7e", + "hash_cont_tokens": "c3a3bdfd177eed5b" + }, + "truncated": 0, + "non-truncated": 804, + "padded": 804, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hashes": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "42054621e718dbee", + "hash_cont_tokens": "2568d0e8e36fa959" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-virology|5": { + "hashes": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "6c4f0aa4dc859c04", + "hash_cont_tokens": "c178cccd753d9bc5" + }, + "truncated": 0, + "non-truncated": 664, + "padded": 664, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-world_religions|5": { + "hashes": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "6c75d44e092ff24f", + "hash_cont_tokens": "0a3a3ea5ef49d19c" + }, + "truncated": 0, + "non-truncated": 684, + "padded": 684, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|truthfulqa:mc|0": { + "hashes": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "2738d7ed7075faa7", + "hash_cont_tokens": "6d1691881e252df0" + }, + "truncated": 0, + "non-truncated": 9996, + "padded": 9996, + "non-padded": 0, + "effective_few_shots": 0.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "d84d18e9a963753d", + "hash_full_prompts": "12b540783521a8e6", + "hash_input_tokens": "6fecf578c508db6a", + "hash_cont_tokens": "f4b7b7f3a2788768" + }, + "total_evaluation_time_secondes": "26855.382113933563", + "truncated": 2088, + "non-truncated": 108931, + "padded": 108834, + "non-padded": 2185, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/migtissera/Synthia-70B/results_2023-10-15T22-51-19.251335.json b/eval-results/migtissera/Synthia-70B/results_2023-10-15T22-51-19.251335.json new file mode 100644 index 0000000000000000000000000000000000000000..3f51edb59e742d299b7647844bb18d7d3b38903a --- /dev/null +++ b/eval-results/migtissera/Synthia-70B/results_2023-10-15T22-51-19.251335.json @@ -0,0 +1,107 @@ +{ + "config_general": { + "model_name": "migtissera/Synthia-70B", + "model_sha": "277ec4def836d3432f880d3e560203fe2c1cc236", + "model_size": "128.56 GB", + "model_dtype": "torch.float16", + "lighteval_sha": "0f318ecf002208468154899217b3ba7c6ae09374", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|drop|3": { + "em": 0.15100671140939598, + "em_stderr": 0.0036668226447704277, + "f1": 0.21747168624161078, + "f1_stderr": 0.0037439821226941702 + }, + "harness|gsm8k|5": { + "acc": 0.31387414708112205, + "acc_stderr": 0.012782681251053207 + }, + "harness|winogrande|5": { + "acc": 
0.8366219415943172, + "acc_stderr": 0.010390695970273763 + }, + "all": { + "em": 0.15100671140939598, + "em_stderr": 0.0036668226447704277, + "f1": 0.21747168624161078, + "f1_stderr": 0.0037439821226941702, + "acc": 0.5752480443377197, + "acc_stderr": 0.011586688610663485 + } + }, + "versions": { + "harness|drop|3": 1, + "harness|gsm8k|5": 0, + "harness|winogrande|5": 0, + "all": 0 + }, + "config_tasks": { + "harness|drop": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|drop|3": { + "hashes": { + "hash_examples": "1d27416e8324e9a3", + "hash_full_prompts": "a5513ff9a741b385", + "hash_input_tokens": "61b608e0b5ceed76", + "hash_cont_tokens": "774a5af357d7d4a3" + }, + "truncated": 1263, + "non-truncated": 8273, + "padded": 0, + "non-padded": 9536, + "effective_few_shots": 3.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "bda342e47b5099b2", + "hash_cont_tokens": "12b4b43a32b2a062" + }, + "truncated": 0, + "non-truncated": 1319, + "padded": 0, + "non-padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "c0bedf98cb040854", + "hash_cont_tokens": "f08975ad6f2d5864" + }, + "truncated": 0, + "non-truncated": 2534, + "padded": 2432, + "non-padded": 102, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "9b4d8993161e637d", + "hash_full_prompts": "08215e527b7e60a5", + "hash_input_tokens": "80afe720f936f8d2", + "hash_cont_tokens": "f26ca5ed312c3724" + }, + "total_evaluation_time_secondes": "41978.55783319473", + "truncated": 1263, + "non-truncated": 12126, + "padded": 2432, + "non-padded": 10957, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/migtissera/Synthia-7B-v1.2/results_2023-09-22T05-35-25.402553.json b/eval-results/migtissera/Synthia-7B-v1.2/results_2023-09-22T05-35-25.402553.json new file mode 100644 index 0000000000000000000000000000000000000000..4aa0d1f33db59acff2bfabaa4de0987acc1174c2 --- /dev/null +++ b/eval-results/migtissera/Synthia-7B-v1.2/results_2023-09-22T05-35-25.402553.json @@ -0,0 +1,1367 @@ +{ + "config_general": { + "model_name": "migtissera/Synthia-7B-v1.2", + "model_sha": "85ea4f4818478084eedd01e958ac5cc7cf64b3bb", + "model_size": "12.58 GB", + "model_dtype": "torch.float16", + "lighteval_sha": "0f318ecf002208468154899217b3ba7c6ae09374", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|arc:challenge|25": { + "acc": 0.514505119453925, + "acc_stderr": 0.014605241081370056, + "acc_norm": 0.5435153583617748, + "acc_norm_stderr": 0.01455594976049644 + }, + "harness|hellaswag|10": { + "acc": 0.5989842660824537, + "acc_stderr": 0.004891025533633028, + "acc_norm": 0.7928699462258514, + "acc_norm_stderr": 0.004044213304049367 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.28, + "acc_stderr": 0.04512608598542128, + "acc_norm": 0.28, + "acc_norm_stderr": 0.04512608598542128 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.42962962962962964, + "acc_stderr": 0.04276349494376599, + "acc_norm": 0.42962962962962964, + "acc_norm_stderr": 0.04276349494376599 + }, 
+ "harness|hendrycksTest-astronomy|5": { + "acc": 0.47368421052631576, + "acc_stderr": 0.04063302731486671, + "acc_norm": 0.47368421052631576, + "acc_norm_stderr": 0.04063302731486671 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.5, + "acc_stderr": 0.050251890762960605, + "acc_norm": 0.5, + "acc_norm_stderr": 0.050251890762960605 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.5132075471698113, + "acc_stderr": 0.030762134874500476, + "acc_norm": 0.5132075471698113, + "acc_norm_stderr": 0.030762134874500476 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.5069444444444444, + "acc_stderr": 0.04180806750294938, + "acc_norm": 0.5069444444444444, + "acc_norm_stderr": 0.04180806750294938 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.41, + "acc_stderr": 0.04943110704237102, + "acc_norm": 0.41, + "acc_norm_stderr": 0.04943110704237102 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.38, + "acc_stderr": 0.04878317312145633, + "acc_norm": 0.38, + "acc_norm_stderr": 0.04878317312145633 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.36, + "acc_stderr": 0.04824181513244218, + "acc_norm": 0.36, + "acc_norm_stderr": 0.04824181513244218 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.43352601156069365, + "acc_stderr": 0.03778621079092056, + "acc_norm": 0.43352601156069365, + "acc_norm_stderr": 0.03778621079092056 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.21568627450980393, + "acc_stderr": 0.04092563958237655, + "acc_norm": 0.21568627450980393, + "acc_norm_stderr": 0.04092563958237655 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.61, + "acc_stderr": 0.04902071300001975, + "acc_norm": 0.61, + "acc_norm_stderr": 0.04902071300001975 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.42127659574468085, + "acc_stderr": 0.03227834510146267, + "acc_norm": 0.42127659574468085, + "acc_norm_stderr": 0.03227834510146267 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.24561403508771928, + "acc_stderr": 0.04049339297748142, + "acc_norm": 0.24561403508771928, + "acc_norm_stderr": 0.04049339297748142 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.46206896551724136, + "acc_stderr": 0.041546596717075474, + "acc_norm": 0.46206896551724136, + "acc_norm_stderr": 0.041546596717075474 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.29894179894179895, + "acc_stderr": 0.023577604791655802, + "acc_norm": 0.29894179894179895, + "acc_norm_stderr": 0.023577604791655802 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.29365079365079366, + "acc_stderr": 0.04073524322147125, + "acc_norm": 0.29365079365079366, + "acc_norm_stderr": 0.04073524322147125 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.39, + "acc_stderr": 0.04902071300001974, + "acc_norm": 0.39, + "acc_norm_stderr": 0.04902071300001974 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.5290322580645161, + "acc_stderr": 0.028396016402761005, + "acc_norm": 0.5290322580645161, + "acc_norm_stderr": 0.028396016402761005 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.3842364532019704, + "acc_stderr": 0.0342239856565755, + "acc_norm": 0.3842364532019704, + "acc_norm_stderr": 0.0342239856565755 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.42, + "acc_stderr": 0.04960449637488584, + "acc_norm": 0.42, + "acc_norm_stderr": 0.04960449637488584 + }, + 
"harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.7151515151515152, + "acc_stderr": 0.03524390844511781, + "acc_norm": 0.7151515151515152, + "acc_norm_stderr": 0.03524390844511781 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.6060606060606061, + "acc_stderr": 0.03481285338232963, + "acc_norm": 0.6060606060606061, + "acc_norm_stderr": 0.03481285338232963 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.7512953367875648, + "acc_stderr": 0.031195840877700286, + "acc_norm": 0.7512953367875648, + "acc_norm_stderr": 0.031195840877700286 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.44358974358974357, + "acc_stderr": 0.025189149894764194, + "acc_norm": 0.44358974358974357, + "acc_norm_stderr": 0.025189149894764194 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.2777777777777778, + "acc_stderr": 0.0273091405882302, + "acc_norm": 0.2777777777777778, + "acc_norm_stderr": 0.0273091405882302 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.46218487394957986, + "acc_stderr": 0.032385469487589795, + "acc_norm": 0.46218487394957986, + "acc_norm_stderr": 0.032385469487589795 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.2913907284768212, + "acc_stderr": 0.03710185726119995, + "acc_norm": 0.2913907284768212, + "acc_norm_stderr": 0.03710185726119995 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.636697247706422, + "acc_stderr": 0.020620603919625804, + "acc_norm": 0.636697247706422, + "acc_norm_stderr": 0.020620603919625804 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.38425925925925924, + "acc_stderr": 0.03317354514310742, + "acc_norm": 0.38425925925925924, + "acc_norm_stderr": 0.03317354514310742 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.7058823529411765, + "acc_stderr": 0.03198001660115071, + "acc_norm": 0.7058823529411765, + "acc_norm_stderr": 0.03198001660115071 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.6962025316455697, + "acc_stderr": 0.02993669638713861, + "acc_norm": 0.6962025316455697, + "acc_norm_stderr": 0.02993669638713861 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.5919282511210763, + "acc_stderr": 0.03298574607842822, + "acc_norm": 0.5919282511210763, + "acc_norm_stderr": 0.03298574607842822 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.5572519083969466, + "acc_stderr": 0.04356447202665069, + "acc_norm": 0.5572519083969466, + "acc_norm_stderr": 0.04356447202665069 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.6115702479338843, + "acc_stderr": 0.04449270350068383, + "acc_norm": 0.6115702479338843, + "acc_norm_stderr": 0.04449270350068383 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.5462962962962963, + "acc_stderr": 0.04812917324536823, + "acc_norm": 0.5462962962962963, + "acc_norm_stderr": 0.04812917324536823 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.5030674846625767, + "acc_stderr": 0.03928297078179663, + "acc_norm": 0.5030674846625767, + "acc_norm_stderr": 0.03928297078179663 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.4642857142857143, + "acc_stderr": 0.04733667890053756, + "acc_norm": 0.4642857142857143, + "acc_norm_stderr": 0.04733667890053756 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.6504854368932039, + "acc_stderr": 0.04721188506097174, + "acc_norm": 0.6504854368932039, + "acc_norm_stderr": 
0.04721188506097174 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.7606837606837606, + "acc_stderr": 0.027951826808924333, + "acc_norm": 0.7606837606837606, + "acc_norm_stderr": 0.027951826808924333 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.52, + "acc_stderr": 0.050211673156867795, + "acc_norm": 0.52, + "acc_norm_stderr": 0.050211673156867795 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.6717752234993615, + "acc_stderr": 0.016791685640192892, + "acc_norm": 0.6717752234993615, + "acc_norm_stderr": 0.016791685640192892 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.5375722543352601, + "acc_stderr": 0.026842985519615375, + "acc_norm": 0.5375722543352601, + "acc_norm_stderr": 0.026842985519615375 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.23798882681564246, + "acc_stderr": 0.014242630070574915, + "acc_norm": 0.23798882681564246, + "acc_norm_stderr": 0.014242630070574915 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.5326797385620915, + "acc_stderr": 0.02856869975222587, + "acc_norm": 0.5326797385620915, + "acc_norm_stderr": 0.02856869975222587 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.5755627009646302, + "acc_stderr": 0.028071928247946208, + "acc_norm": 0.5755627009646302, + "acc_norm_stderr": 0.028071928247946208 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.5524691358024691, + "acc_stderr": 0.027667138569422708, + "acc_norm": 0.5524691358024691, + "acc_norm_stderr": 0.027667138569422708 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.38652482269503546, + "acc_stderr": 0.029049190342543465, + "acc_norm": 0.38652482269503546, + "acc_norm_stderr": 0.029049190342543465 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.38070404172099087, + "acc_stderr": 0.012401430654645891, + "acc_norm": 0.38070404172099087, + "acc_norm_stderr": 0.012401430654645891 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.4889705882352941, + "acc_stderr": 0.030365446477275675, + "acc_norm": 0.4889705882352941, + "acc_norm_stderr": 0.030365446477275675 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.49673202614379086, + "acc_stderr": 0.020227402794434867, + "acc_norm": 0.49673202614379086, + "acc_norm_stderr": 0.020227402794434867 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.6, + "acc_stderr": 0.0469237132203465, + "acc_norm": 0.6, + "acc_norm_stderr": 0.0469237132203465 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.6, + "acc_stderr": 0.03136250240935893, + "acc_norm": 0.6, + "acc_norm_stderr": 0.03136250240935893 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.6069651741293532, + "acc_stderr": 0.0345368246603156, + "acc_norm": 0.6069651741293532, + "acc_norm_stderr": 0.0345368246603156 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.66, + "acc_stderr": 0.04760952285695238, + "acc_norm": 0.66, + "acc_norm_stderr": 0.04760952285695238 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.3855421686746988, + "acc_stderr": 0.0378913442461155, + "acc_norm": 0.3855421686746988, + "acc_norm_stderr": 0.0378913442461155 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.672514619883041, + "acc_stderr": 0.03599335771456027, + "acc_norm": 0.672514619883041, + "acc_norm_stderr": 0.03599335771456027 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.3353733170134639, + "mc1_stderr": 0.01652753403966899, + "mc2": 0.48916296496506734, + "mc2_stderr": 
0.01533111941565817 + }, + "all": { + "acc": 0.49547543726554166, + "acc_stderr": 0.03514522094300258, + "acc_norm": 0.4992533341986137, + "acc_norm_stderr": 0.03513003274723212, + "mc1": 0.3353733170134639, + "mc1_stderr": 0.01652753403966899, + "mc2": 0.48916296496506734, + "mc2_stderr": 0.01533111941565817 + } + }, + "versions": { + "harness|arc:challenge|25": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "all": 0 + }, + "config_tasks": { + "harness|arc:challenge": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + 
"harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + 
"harness|truthfulqa:mc": "LM Harness task" + }, + "summary_tasks": { + "harness|arc:challenge|25": { + "hashes": { + "hash_examples": "17b0cae357c0259e", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "61571bf68d6d89aa", + "hash_cont_tokens": "e8abf848493b50f7" + }, + "truncated": 0, + "non-truncated": 4687, + "padded": 4687, + "non-padded": 0, + "effective_few_shots": 25.0, + "num_truncated_few_shots": 0 + }, + "harness|hellaswag|10": { + "hashes": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "29906669b1c7054a", + "hash_cont_tokens": "9fe0a5c42e1532db" + }, + "truncated": 0, + "non-truncated": 40168, + "padded": 40113, + "non-padded": 55, + "effective_few_shots": 10.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hashes": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "c54ff61ad0273dd7", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-anatomy|5": { + "hashes": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "be31a1e22aef5f90", + "hash_cont_tokens": "f11971a765cb609f" + }, + "truncated": 0, + "non-truncated": 540, + "padded": 540, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-astronomy|5": { + "hashes": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "277a7b1fad566940", + "hash_cont_tokens": "440a970fadecdc7b" + }, + "truncated": 0, + "non-truncated": 608, + "padded": 608, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-business_ethics|5": { + "hashes": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "ba552605bc116de5", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hashes": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "428c7563d0b98ab9", + "hash_cont_tokens": "7ecd60c25b9bfe5b" + }, + "truncated": 0, + "non-truncated": 1060, + "padded": 1060, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_biology|5": { + "hashes": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "da036601573942e2", + "hash_cont_tokens": "875cde3af7a0ee14" + }, + "truncated": 0, + "non-truncated": 576, + "padded": 576, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_chemistry|5": { + "hashes": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "94e0196d6aded13d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_computer_science|5": { + "hashes": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": 
"ed2bdb4e87c4b371", + "hash_input_tokens": "6e4d0f4a8d36690b", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_mathematics|5": { + "hashes": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "614054d17109a25d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_medicine|5": { + "hashes": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "1d633b3cc0524ba8", + "hash_cont_tokens": "702fb6d82ff0d6ac" + }, + "truncated": 0, + "non-truncated": 692, + "padded": 692, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_physics|5": { + "hashes": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "5421d9a1af86cbd4", + "hash_cont_tokens": "f7b8097afc16a47c" + }, + "truncated": 0, + "non-truncated": 408, + "padded": 408, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-computer_security|5": { + "hashes": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "5e6b70ecb333cf18", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hashes": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "c2ef11a87264ceed", + "hash_cont_tokens": "aa0e8bc655f2f641" + }, + "truncated": 0, + "non-truncated": 940, + "padded": 940, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-econometrics|5": { + "hashes": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "ecaccd912a4c3978", + "hash_cont_tokens": "b1cc6e7e9fcd3827" + }, + "truncated": 0, + "non-truncated": 456, + "padded": 456, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hashes": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "1590c84291399be8", + "hash_cont_tokens": "2425a3f084a591ef" + }, + "truncated": 0, + "non-truncated": 580, + "padded": 580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hashes": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "3269597f715b0da1", + "hash_cont_tokens": "bd87bf0c060fd925" + }, + "truncated": 0, + "non-truncated": 1512, + "padded": 1512, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-formal_logic|5": { + "hashes": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "a2800d20f3ab8d7c", + "hash_cont_tokens": "eb8932890e0605db" + }, + "truncated": 0, + "non-truncated": 504, 
+ "padded": 504, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-global_facts|5": { + "hashes": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "94ed44b3772505ad", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_biology|5": { + "hashes": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "24423acb928db768", + "hash_cont_tokens": "1ddcb86d28cde266" + }, + "truncated": 0, + "non-truncated": 1240, + "padded": 1240, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hashes": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "831ff35c474e5cef", + "hash_cont_tokens": "176c8dcff38c5f8f" + }, + "truncated": 0, + "non-truncated": 812, + "padded": 812, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hashes": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "8c34e0f2bda77358", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hashes": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "f1f73dd687da18d7", + "hash_cont_tokens": "674fc454bdc5ac93" + }, + "truncated": 660, + "non-truncated": 0, + "padded": 0, + "non-padded": 660, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_geography|5": { + "hashes": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "7c5547c7da5bc793", + "hash_cont_tokens": "03a5012b916274ea" + }, + "truncated": 0, + "non-truncated": 792, + "padded": 792, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hashes": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "f62991cb6a496b05", + "hash_cont_tokens": "873d2aab226ba1d8" + }, + "truncated": 0, + "non-truncated": 772, + "padded": 772, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hashes": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "4cef2aff6e3d59ed", + "hash_cont_tokens": "c583432ad27fcfe0" + }, + "truncated": 0, + "non-truncated": 1560, + "padded": 1560, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hashes": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "6e2577ea4082ed2b", + "hash_cont_tokens": "d7907b61bcb8c123" + }, + "truncated": 0, + "non-truncated": 1080, + "padded": 1080, + "non-padded": 0, + "effective_few_shots": 5.0, + 
"num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hashes": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "c5fc9aeb1079c8e4", + "hash_cont_tokens": "f47f041de50333b9" + }, + "truncated": 0, + "non-truncated": 952, + "padded": 952, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_physics|5": { + "hashes": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "555fc385cffa84ca", + "hash_cont_tokens": "0d56317b3e5eedb5" + }, + "truncated": 0, + "non-truncated": 604, + "padded": 604, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hashes": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "febd23cbf9973b7f", + "hash_cont_tokens": "09ba1243e7390c0f" + }, + "truncated": 0, + "non-truncated": 2180, + "padded": 2180, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hashes": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "424b02981230ee83", + "hash_cont_tokens": "9cc29889c3d3f77d" + }, + "truncated": 0, + "non-truncated": 864, + "padded": 864, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hashes": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "50c9ff438c85a69e", + "hash_cont_tokens": "cdd0b3dc06d933e5" + }, + "truncated": 816, + "non-truncated": 0, + "padded": 0, + "non-padded": 816, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hashes": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "054824cc474caef5", + "hash_cont_tokens": "e02816433ff28daf" + }, + "truncated": 8, + "non-truncated": 940, + "padded": 940, + "non-padded": 8, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_aging|5": { + "hashes": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "541a75f071dcf579", + "hash_cont_tokens": "142a4a8a1138a214" + }, + "truncated": 0, + "non-truncated": 892, + "padded": 892, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_sexuality|5": { + "hashes": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "04269e5c5a257dd9", + "hash_cont_tokens": "bc54813e809b796d" + }, + "truncated": 0, + "non-truncated": 524, + "padded": 524, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-international_law|5": { + "hashes": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "d93ba9d9d38e4397", + "hash_cont_tokens": "8ea8c5ff76a15bca" + }, + "truncated": 0, + "non-truncated": 484, + "padded": 484, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-jurisprudence|5": { + "hashes": { + "hash_examples": 
"083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "9eeaccd2698b4f5a", + "hash_cont_tokens": "e3a8cd951b6e3469" + }, + "truncated": 0, + "non-truncated": 432, + "padded": 432, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hashes": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "b4f08f544f2b7576", + "hash_cont_tokens": "3e9e0bdc248fd88a" + }, + "truncated": 0, + "non-truncated": 652, + "padded": 648, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-machine_learning|5": { + "hashes": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "900c2a51f1174b9f", + "hash_cont_tokens": "55b12fb138c6a064" + }, + "truncated": 0, + "non-truncated": 448, + "padded": 448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-management|5": { + "hashes": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "6b36efb4689c6eca", + "hash_cont_tokens": "a01d6d39a83c4597" + }, + "truncated": 0, + "non-truncated": 412, + "padded": 412, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-marketing|5": { + "hashes": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "2aaac78a0cfed47a", + "hash_cont_tokens": "6aeaed4d823c98aa" + }, + "truncated": 0, + "non-truncated": 936, + "padded": 936, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-medical_genetics|5": { + "hashes": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "886ca823b41c094a", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-miscellaneous|5": { + "hashes": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "72fd71de7675e7d0", + "hash_cont_tokens": "9b0ab02a64603081" + }, + "truncated": 0, + "non-truncated": 3132, + "padded": 3132, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_disputes|5": { + "hashes": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "f3ca0dd8e7a1eb09", + "hash_cont_tokens": "3b8bbe9108e55ce9" + }, + "truncated": 0, + "non-truncated": 1384, + "padded": 1354, + "non-padded": 30, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hashes": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "3e793631e951f23c", + "hash_cont_tokens": "3e9bfc0362e97330" + }, + "truncated": 0, + "non-truncated": 3580, + "padded": 3580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-nutrition|5": { + "hashes": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "59753c2144ea93af", + "hash_cont_tokens": "23b2dc6ee2da4cfc" + }, + "truncated": 0, + 
"non-truncated": 1224, + "padded": 1224, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-philosophy|5": { + "hashes": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "bd8d3dbed15a8c34", + "hash_cont_tokens": "9f6ff69d23a48783" + }, + "truncated": 0, + "non-truncated": 1244, + "padded": 1244, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-prehistory|5": { + "hashes": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "3573cd87facbb7c5", + "hash_cont_tokens": "d6458d743d875837" + }, + "truncated": 0, + "non-truncated": 1296, + "padded": 1296, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_accounting|5": { + "hashes": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "17e721bc1a7cbb47", + "hash_cont_tokens": "922a195f53a35662" + }, + "truncated": 0, + "non-truncated": 1128, + "padded": 1128, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_law|5": { + "hashes": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "9178e10bd0763ec4", + "hash_cont_tokens": "2e590029ef41fbcd" + }, + "truncated": 604, + "non-truncated": 5532, + "padded": 5524, + "non-padded": 612, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_medicine|5": { + "hashes": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "f5a22012a54f70ea", + "hash_cont_tokens": "7cfee54dbddd5a98" + }, + "truncated": 0, + "non-truncated": 1088, + "padded": 1088, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_psychology|5": { + "hashes": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "0dfb73a8eb3f692c", + "hash_cont_tokens": "a86677b2a45c20e1" + }, + "truncated": 0, + "non-truncated": 2448, + "padded": 2448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-public_relations|5": { + "hashes": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "1710c6ba4c9f3cbd", + "hash_cont_tokens": "0d756ccaae031757" + }, + "truncated": 0, + "non-truncated": 440, + "padded": 440, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-security_studies|5": { + "hashes": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "d49711415961ced7", + "hash_cont_tokens": "b2229bc2cfbf594b" + }, + "truncated": 0, + "non-truncated": 980, + "padded": 980, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-sociology|5": { + "hashes": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "828999f7624cbe7e", + "hash_cont_tokens": "c3a3bdfd177eed5b" + }, + "truncated": 0, + "non-truncated": 804, + "padded": 804, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + 
"harness|hendrycksTest-us_foreign_policy|5": { + "hashes": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "42054621e718dbee", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-virology|5": { + "hashes": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "6c4f0aa4dc859c04", + "hash_cont_tokens": "af8b3658088cb37f" + }, + "truncated": 0, + "non-truncated": 664, + "padded": 664, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-world_religions|5": { + "hashes": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "6c75d44e092ff24f", + "hash_cont_tokens": "060118bef6de4e0a" + }, + "truncated": 0, + "non-truncated": 684, + "padded": 684, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|truthfulqa:mc|0": { + "hashes": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "2738d7ed7075faa7", + "hash_cont_tokens": "f5da56a132aab151" + }, + "truncated": 0, + "non-truncated": 9996, + "padded": 9996, + "non-padded": 0, + "effective_few_shots": 0.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "d84d18e9a963753d", + "hash_full_prompts": "12b540783521a8e6", + "hash_input_tokens": "6fecf578c508db6a", + "hash_cont_tokens": "71d56183130fecbd" + }, + "total_evaluation_time_secondes": "2751.026495695114", + "truncated": 2088, + "non-truncated": 108931, + "padded": 108834, + "non-padded": 2185, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/migtissera/Synthia-7B-v1.2/results_2023-10-25T08-51-48.447096.json b/eval-results/migtissera/Synthia-7B-v1.2/results_2023-10-25T08-51-48.447096.json new file mode 100644 index 0000000000000000000000000000000000000000..18ca7dc08026653150ebb98c3ee2256dba73f055 --- /dev/null +++ b/eval-results/migtissera/Synthia-7B-v1.2/results_2023-10-25T08-51-48.447096.json @@ -0,0 +1,107 @@ +{ + "config_general": { + "model_name": "migtissera/Synthia-7B-v1.2", + "model_sha": "236c177131ca287e5194ebed23ede18dbdf44f57", + "model_size": "12.58 GB", + "model_dtype": "torch.float16", + "lighteval_sha": "0f318ecf002208468154899217b3ba7c6ae09374", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|drop|3": { + "em": 0.08913590604026846, + "em_stderr": 0.0029180503705090555, + "f1": 0.16236577181208006, + "f1_stderr": 0.003176440216561889 + }, + "harness|gsm8k|5": { + "acc": 0.10841546626231995, + "acc_stderr": 0.00856385250662748 + }, + "harness|winogrande|5": { + "acc": 0.7355958958168903, + "acc_stderr": 0.012394724896983799 + }, + "all": { + "em": 0.08913590604026846, + "em_stderr": 0.0029180503705090555, + "f1": 0.16236577181208006, + "f1_stderr": 0.003176440216561889, + "acc": 0.4220056810396051, + "acc_stderr": 0.01047928870180564 + } + }, + "versions": { + "harness|drop|3": 1, + "harness|gsm8k|5": 0, + "harness|winogrande|5": 0, + "all": 0 + }, + "config_tasks": { + "harness|drop": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + 
"harness|drop|3": { + "hashes": { + "hash_examples": "1d27416e8324e9a3", + "hash_full_prompts": "a5513ff9a741b385", + "hash_input_tokens": "61b608e0b5ceed76", + "hash_cont_tokens": "ad2153bda986d612" + }, + "truncated": 1263, + "non-truncated": 8273, + "padded": 0, + "non-padded": 9536, + "effective_few_shots": 3.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "bda342e47b5099b2", + "hash_cont_tokens": "6cb096e5eb4dd1c7" + }, + "truncated": 0, + "non-truncated": 1319, + "padded": 0, + "non-padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "c0bedf98cb040854", + "hash_cont_tokens": "f08975ad6f2d5864" + }, + "truncated": 0, + "non-truncated": 2534, + "padded": 2432, + "non-padded": 102, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "9b4d8993161e637d", + "hash_full_prompts": "08215e527b7e60a5", + "hash_input_tokens": "80afe720f936f8d2", + "hash_cont_tokens": "823c2933dd3ee24c" + }, + "total_evaluation_time_secondes": "9167.631344079971", + "truncated": 1263, + "non-truncated": 12126, + "padded": 2432, + "non-padded": 10957, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/migtissera/Synthia-7B/results_2023-08-17T17-21-07.158534.json b/eval-results/migtissera/Synthia-7B/results_2023-08-17T17-21-07.158534.json new file mode 100644 index 0000000000000000000000000000000000000000..6aafba423ac30f38882b45563b012cc5a2c8ac94 --- /dev/null +++ b/eval-results/migtissera/Synthia-7B/results_2023-08-17T17-21-07.158534.json @@ -0,0 +1,1365 @@ +{ + "results": { + "harness|arc:challenge|25": { + "acc": 0.5366894197952219, + "acc_stderr": 0.014572000527756994, + "acc_norm": 0.5614334470989761, + "acc_norm_stderr": 0.014500682618212864 + }, + "harness|hellaswag|10": { + "acc": 0.5997809201354312, + "acc_stderr": 0.004889413126208774, + "acc_norm": 0.7859988050189205, + "acc_norm_stderr": 0.004092894578418982 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.31, + "acc_stderr": 0.046482319871173156, + "acc_norm": 0.31, + "acc_norm_stderr": 0.046482319871173156 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.5037037037037037, + "acc_stderr": 0.04319223625811331, + "acc_norm": 0.5037037037037037, + "acc_norm_stderr": 0.04319223625811331 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.4605263157894737, + "acc_stderr": 0.04056242252249034, + "acc_norm": 0.4605263157894737, + "acc_norm_stderr": 0.04056242252249034 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.5, + "acc_stderr": 0.050251890762960605, + "acc_norm": 0.5, + "acc_norm_stderr": 0.050251890762960605 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.5283018867924528, + "acc_stderr": 0.030723535249006107, + "acc_norm": 0.5283018867924528, + "acc_norm_stderr": 0.030723535249006107 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.4861111111111111, + "acc_stderr": 0.041795966175810016, + "acc_norm": 0.4861111111111111, + "acc_norm_stderr": 0.041795966175810016 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.33, + "acc_stderr": 0.047258156262526045, + "acc_norm": 0.33, + "acc_norm_stderr": 0.047258156262526045 + }, + 
"harness|hendrycksTest-college_computer_science|5": { + "acc": 0.45, + "acc_stderr": 0.05, + "acc_norm": 0.45, + "acc_norm_stderr": 0.05 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.38, + "acc_stderr": 0.048783173121456316, + "acc_norm": 0.38, + "acc_norm_stderr": 0.048783173121456316 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.4624277456647399, + "acc_stderr": 0.0380168510452446, + "acc_norm": 0.4624277456647399, + "acc_norm_stderr": 0.0380168510452446 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.17647058823529413, + "acc_stderr": 0.0379328118530781, + "acc_norm": 0.17647058823529413, + "acc_norm_stderr": 0.0379328118530781 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.59, + "acc_stderr": 0.049431107042371025, + "acc_norm": 0.59, + "acc_norm_stderr": 0.049431107042371025 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.425531914893617, + "acc_stderr": 0.03232146916224468, + "acc_norm": 0.425531914893617, + "acc_norm_stderr": 0.03232146916224468 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.3508771929824561, + "acc_stderr": 0.044895393502707, + "acc_norm": 0.3508771929824561, + "acc_norm_stderr": 0.044895393502707 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.47586206896551725, + "acc_stderr": 0.041618085035015295, + "acc_norm": 0.47586206896551725, + "acc_norm_stderr": 0.041618085035015295 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.30952380952380953, + "acc_stderr": 0.023809523809523864, + "acc_norm": 0.30952380952380953, + "acc_norm_stderr": 0.023809523809523864 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.29365079365079366, + "acc_stderr": 0.04073524322147125, + "acc_norm": 0.29365079365079366, + "acc_norm_stderr": 0.04073524322147125 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.41, + "acc_stderr": 0.049431107042371025, + "acc_norm": 0.41, + "acc_norm_stderr": 0.049431107042371025 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.532258064516129, + "acc_stderr": 0.028384747788813332, + "acc_norm": 0.532258064516129, + "acc_norm_stderr": 0.028384747788813332 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.39901477832512317, + "acc_stderr": 0.03445487686264716, + "acc_norm": 0.39901477832512317, + "acc_norm_stderr": 0.03445487686264716 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.52, + "acc_stderr": 0.050211673156867795, + "acc_norm": 0.52, + "acc_norm_stderr": 0.050211673156867795 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.6545454545454545, + "acc_stderr": 0.037131580674819135, + "acc_norm": 0.6545454545454545, + "acc_norm_stderr": 0.037131580674819135 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.6161616161616161, + "acc_stderr": 0.034648816750163396, + "acc_norm": 0.6161616161616161, + "acc_norm_stderr": 0.034648816750163396 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.7305699481865285, + "acc_stderr": 0.03201867122877794, + "acc_norm": 0.7305699481865285, + "acc_norm_stderr": 0.03201867122877794 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.441025641025641, + "acc_stderr": 0.025174048384000752, + "acc_norm": 0.441025641025641, + "acc_norm_stderr": 0.025174048384000752 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.28888888888888886, + "acc_stderr": 0.027634907264178544, + "acc_norm": 
0.28888888888888886, + "acc_norm_stderr": 0.027634907264178544 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.44537815126050423, + "acc_stderr": 0.0322841062671639, + "acc_norm": 0.44537815126050423, + "acc_norm_stderr": 0.0322841062671639 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.3708609271523179, + "acc_stderr": 0.03943966699183629, + "acc_norm": 0.3708609271523179, + "acc_norm_stderr": 0.03943966699183629 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.673394495412844, + "acc_stderr": 0.0201069908899373, + "acc_norm": 0.673394495412844, + "acc_norm_stderr": 0.0201069908899373 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.3888888888888889, + "acc_stderr": 0.033247089118091176, + "acc_norm": 0.3888888888888889, + "acc_norm_stderr": 0.033247089118091176 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.6764705882352942, + "acc_stderr": 0.032834720561085606, + "acc_norm": 0.6764705882352942, + "acc_norm_stderr": 0.032834720561085606 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.7172995780590717, + "acc_stderr": 0.029312814153955934, + "acc_norm": 0.7172995780590717, + "acc_norm_stderr": 0.029312814153955934 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.5874439461883408, + "acc_stderr": 0.03304062175449296, + "acc_norm": 0.5874439461883408, + "acc_norm_stderr": 0.03304062175449296 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.5954198473282443, + "acc_stderr": 0.043046937953806645, + "acc_norm": 0.5954198473282443, + "acc_norm_stderr": 0.043046937953806645 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.6033057851239669, + "acc_stderr": 0.044658697805310094, + "acc_norm": 0.6033057851239669, + "acc_norm_stderr": 0.044658697805310094 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.6018518518518519, + "acc_stderr": 0.04732332615978813, + "acc_norm": 0.6018518518518519, + "acc_norm_stderr": 0.04732332615978813 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.5398773006134969, + "acc_stderr": 0.03915857291436971, + "acc_norm": 0.5398773006134969, + "acc_norm_stderr": 0.03915857291436971 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.4375, + "acc_stderr": 0.04708567521880525, + "acc_norm": 0.4375, + "acc_norm_stderr": 0.04708567521880525 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.6893203883495146, + "acc_stderr": 0.04582124160161549, + "acc_norm": 0.6893203883495146, + "acc_norm_stderr": 0.04582124160161549 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.717948717948718, + "acc_stderr": 0.029480360549541194, + "acc_norm": 0.717948717948718, + "acc_norm_stderr": 0.029480360549541194 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.58, + "acc_stderr": 0.04960449637488583, + "acc_norm": 0.58, + "acc_norm_stderr": 0.04960449637488583 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.698595146871009, + "acc_stderr": 0.016409091097268784, + "acc_norm": 0.698595146871009, + "acc_norm_stderr": 0.016409091097268784 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.5404624277456648, + "acc_stderr": 0.026830805998952236, + "acc_norm": 0.5404624277456648, + "acc_norm_stderr": 0.026830805998952236 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.25251396648044694, + "acc_stderr": 0.014530330201468636, + "acc_norm": 0.25251396648044694, + "acc_norm_stderr": 0.014530330201468636 + }, + 
"harness|hendrycksTest-nutrition|5": { + "acc": 0.5490196078431373, + "acc_stderr": 0.02849199358617157, + "acc_norm": 0.5490196078431373, + "acc_norm_stderr": 0.02849199358617157 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.5884244372990354, + "acc_stderr": 0.027950481494401266, + "acc_norm": 0.5884244372990354, + "acc_norm_stderr": 0.027950481494401266 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.5277777777777778, + "acc_stderr": 0.027777777777777797, + "acc_norm": 0.5277777777777778, + "acc_norm_stderr": 0.027777777777777797 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.3829787234042553, + "acc_stderr": 0.028999080904806185, + "acc_norm": 0.3829787234042553, + "acc_norm_stderr": 0.028999080904806185 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.3559322033898305, + "acc_stderr": 0.01222864553727757, + "acc_norm": 0.3559322033898305, + "acc_norm_stderr": 0.01222864553727757 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.5, + "acc_stderr": 0.030372836961539352, + "acc_norm": 0.5, + "acc_norm_stderr": 0.030372836961539352 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.5163398692810458, + "acc_stderr": 0.02021703065318646, + "acc_norm": 0.5163398692810458, + "acc_norm_stderr": 0.02021703065318646 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.5727272727272728, + "acc_stderr": 0.047381987035454834, + "acc_norm": 0.5727272727272728, + "acc_norm_stderr": 0.047381987035454834 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.5591836734693878, + "acc_stderr": 0.03178419114175363, + "acc_norm": 0.5591836734693878, + "acc_norm_stderr": 0.03178419114175363 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.6119402985074627, + "acc_stderr": 0.0344578996436275, + "acc_norm": 0.6119402985074627, + "acc_norm_stderr": 0.0344578996436275 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.67, + "acc_stderr": 0.04725815626252607, + "acc_norm": 0.67, + "acc_norm_stderr": 0.04725815626252607 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.40963855421686746, + "acc_stderr": 0.03828401115079022, + "acc_norm": 0.40963855421686746, + "acc_norm_stderr": 0.03828401115079022 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.7134502923976608, + "acc_stderr": 0.03467826685703826, + "acc_norm": 0.7134502923976608, + "acc_norm_stderr": 0.03467826685703826 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.3023255813953488, + "mc1_stderr": 0.016077509266133026, + "mc2": 0.4503268940046918, + "mc2_stderr": 0.015174573803698735 + }, + "all": { + "acc": 0.5056926539104564, + "acc_stderr": 0.03526203275119529, + "acc_norm": 0.5092682795407486, + "acc_norm_stderr": 0.03524732365869811, + "mc1": 0.3023255813953488, + "mc1_stderr": 0.016077509266133026, + "mc2": 0.4503268940046918, + "mc2_stderr": 0.015174573803698735 + } + }, + "versions": { + "harness|arc:challenge|25": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + 
"harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "all": 0 + }, + "config_general": { + "model_name": "migtissera/Synthia-7B", + "model_sha": "4f9e95665d95b4c692910190ff77257216e476f1", + "model_dtype": "torch.float16", + "lighteval_sha": "8bab069fee0c6e75ffa4c1ef8a9591c28ee0e049", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null + }, + "config_tasks": { + "harness|arc:challenge": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM 
Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task" + }, + "summary_tasks": { + "harness|arc:challenge|25": { + "hashes": { + "hash_examples": "17b0cae357c0259e", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "61571bf68d6d89aa", + "hash_cont_tokens": "8210decc6ff6f7df" + }, + "truncated": 0, + "non-truncated": 4687, + "padded": 4687, + "non-padded": 0, + "effective_few_shots": 25.0, + "num_truncated_few_shots": 0 + }, + "harness|hellaswag|10": { + "hashes": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "29906669b1c7054a", + "hash_cont_tokens": 
"b3b9e9017afa63af" + }, + "truncated": 0, + "non-truncated": 40168, + "padded": 40113, + "non-padded": 55, + "effective_few_shots": 10.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hashes": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "c54ff61ad0273dd7", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-anatomy|5": { + "hashes": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "be31a1e22aef5f90", + "hash_cont_tokens": "f11971a765cb609f" + }, + "truncated": 0, + "non-truncated": 540, + "padded": 540, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-astronomy|5": { + "hashes": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "277a7b1fad566940", + "hash_cont_tokens": "bf30e5d3f48250cb" + }, + "truncated": 0, + "non-truncated": 608, + "padded": 608, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-business_ethics|5": { + "hashes": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "ba552605bc116de5", + "hash_cont_tokens": "bc1dd9b2d995eb61" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hashes": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "428c7563d0b98ab9", + "hash_cont_tokens": "890a119624b3b935" + }, + "truncated": 0, + "non-truncated": 1060, + "padded": 1060, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_biology|5": { + "hashes": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "da036601573942e2", + "hash_cont_tokens": "875cde3af7a0ee14" + }, + "truncated": 0, + "non-truncated": 576, + "padded": 576, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_chemistry|5": { + "hashes": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "94e0196d6aded13d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_computer_science|5": { + "hashes": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "6e4d0f4a8d36690b", + "hash_cont_tokens": "ffc0fe414cdc4a83" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_mathematics|5": { + "hashes": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "614054d17109a25d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + 
"num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_medicine|5": { + "hashes": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "1d633b3cc0524ba8", + "hash_cont_tokens": "1f88b00d41957d82" + }, + "truncated": 0, + "non-truncated": 692, + "padded": 692, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_physics|5": { + "hashes": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "5421d9a1af86cbd4", + "hash_cont_tokens": "f7b8097afc16a47c" + }, + "truncated": 0, + "non-truncated": 408, + "padded": 408, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-computer_security|5": { + "hashes": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "5e6b70ecb333cf18", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hashes": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "c2ef11a87264ceed", + "hash_cont_tokens": "aa0e8bc655f2f641" + }, + "truncated": 0, + "non-truncated": 940, + "padded": 940, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-econometrics|5": { + "hashes": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "ecaccd912a4c3978", + "hash_cont_tokens": "bfb7e3c3c88313f1" + }, + "truncated": 0, + "non-truncated": 456, + "padded": 456, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hashes": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "1590c84291399be8", + "hash_cont_tokens": "2425a3f084a591ef" + }, + "truncated": 0, + "non-truncated": 580, + "padded": 580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hashes": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "3269597f715b0da1", + "hash_cont_tokens": "f52691aef15a407b" + }, + "truncated": 0, + "non-truncated": 1512, + "padded": 1512, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-formal_logic|5": { + "hashes": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "a2800d20f3ab8d7c", + "hash_cont_tokens": "f515d598d9c21263" + }, + "truncated": 0, + "non-truncated": 504, + "padded": 504, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-global_facts|5": { + "hashes": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "94ed44b3772505ad", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_biology|5": { + "hashes": { + "hash_examples": "a79e1018b1674052", + 
"hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "24423acb928db768", + "hash_cont_tokens": "bd85a4156a3613ee" + }, + "truncated": 0, + "non-truncated": 1240, + "padded": 1240, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hashes": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "831ff35c474e5cef", + "hash_cont_tokens": "a95c97af1c14e068" + }, + "truncated": 0, + "non-truncated": 812, + "padded": 812, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hashes": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "8c34e0f2bda77358", + "hash_cont_tokens": "8abfedef914e33c9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hashes": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "f1f73dd687da18d7", + "hash_cont_tokens": "674fc454bdc5ac93" + }, + "truncated": 660, + "non-truncated": 0, + "padded": 0, + "non-padded": 660, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_geography|5": { + "hashes": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "7c5547c7da5bc793", + "hash_cont_tokens": "03a5012b916274ea" + }, + "truncated": 0, + "non-truncated": 792, + "padded": 792, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hashes": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "f62991cb6a496b05", + "hash_cont_tokens": "a83effb8f76b7d7c" + }, + "truncated": 0, + "non-truncated": 772, + "padded": 772, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hashes": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "4cef2aff6e3d59ed", + "hash_cont_tokens": "c583432ad27fcfe0" + }, + "truncated": 0, + "non-truncated": 1560, + "padded": 1560, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hashes": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "6e2577ea4082ed2b", + "hash_cont_tokens": "24f5dc613660300b" + }, + "truncated": 0, + "non-truncated": 1080, + "padded": 1080, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hashes": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "c5fc9aeb1079c8e4", + "hash_cont_tokens": "f47f041de50333b9" + }, + "truncated": 0, + "non-truncated": 952, + "padded": 952, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_physics|5": { + "hashes": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": 
"555fc385cffa84ca", + "hash_cont_tokens": "ba2efcd283e938cc" + }, + "truncated": 0, + "non-truncated": 604, + "padded": 604, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hashes": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "febd23cbf9973b7f", + "hash_cont_tokens": "942069cd363844d9" + }, + "truncated": 0, + "non-truncated": 2180, + "padded": 2180, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hashes": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "424b02981230ee83", + "hash_cont_tokens": "955ed42b6f7fa019" + }, + "truncated": 0, + "non-truncated": 864, + "padded": 864, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hashes": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "50c9ff438c85a69e", + "hash_cont_tokens": "cdd0b3dc06d933e5" + }, + "truncated": 816, + "non-truncated": 0, + "padded": 0, + "non-padded": 816, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hashes": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "054824cc474caef5", + "hash_cont_tokens": "9a864184946033ac" + }, + "truncated": 8, + "non-truncated": 940, + "padded": 940, + "non-padded": 8, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_aging|5": { + "hashes": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "541a75f071dcf579", + "hash_cont_tokens": "142a4a8a1138a214" + }, + "truncated": 0, + "non-truncated": 892, + "padded": 892, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_sexuality|5": { + "hashes": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "04269e5c5a257dd9", + "hash_cont_tokens": "bc54813e809b796d" + }, + "truncated": 0, + "non-truncated": 524, + "padded": 524, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-international_law|5": { + "hashes": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "d93ba9d9d38e4397", + "hash_cont_tokens": "dc45b45fcda18e5d" + }, + "truncated": 0, + "non-truncated": 484, + "padded": 484, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-jurisprudence|5": { + "hashes": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "9eeaccd2698b4f5a", + "hash_cont_tokens": "e3a8cd951b6e3469" + }, + "truncated": 0, + "non-truncated": 432, + "padded": 432, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hashes": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "b4f08f544f2b7576", + "hash_cont_tokens": "1e80dbd30f6453d5" + }, + "truncated": 0, + "non-truncated": 652, + "padded": 648, + 
"non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-machine_learning|5": { + "hashes": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "900c2a51f1174b9f", + "hash_cont_tokens": "9b37da7777378ca9" + }, + "truncated": 0, + "non-truncated": 448, + "padded": 448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-management|5": { + "hashes": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "6b36efb4689c6eca", + "hash_cont_tokens": "a01d6d39a83c4597" + }, + "truncated": 0, + "non-truncated": 412, + "padded": 412, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-marketing|5": { + "hashes": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "2aaac78a0cfed47a", + "hash_cont_tokens": "6aeaed4d823c98aa" + }, + "truncated": 0, + "non-truncated": 936, + "padded": 936, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-medical_genetics|5": { + "hashes": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "886ca823b41c094a", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-miscellaneous|5": { + "hashes": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "72fd71de7675e7d0", + "hash_cont_tokens": "9b0ab02a64603081" + }, + "truncated": 0, + "non-truncated": 3132, + "padded": 3132, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_disputes|5": { + "hashes": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "f3ca0dd8e7a1eb09", + "hash_cont_tokens": "8badf768f7b0467a" + }, + "truncated": 0, + "non-truncated": 1384, + "padded": 1354, + "non-padded": 30, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hashes": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "3e793631e951f23c", + "hash_cont_tokens": "32ae620376b2bbba" + }, + "truncated": 0, + "non-truncated": 3580, + "padded": 3580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-nutrition|5": { + "hashes": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "59753c2144ea93af", + "hash_cont_tokens": "3071def75bacc404" + }, + "truncated": 0, + "non-truncated": 1224, + "padded": 1224, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-philosophy|5": { + "hashes": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "bd8d3dbed15a8c34", + "hash_cont_tokens": "9f6ff69d23a48783" + }, + "truncated": 0, + "non-truncated": 1244, + "padded": 1244, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-prehistory|5": { + "hashes": { + "hash_examples": "8be90d0f538f1560", + 
"hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "3573cd87facbb7c5", + "hash_cont_tokens": "de469d2b981e32a3" + }, + "truncated": 0, + "non-truncated": 1296, + "padded": 1296, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_accounting|5": { + "hashes": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "17e721bc1a7cbb47", + "hash_cont_tokens": "c46f74d2dfc7b13b" + }, + "truncated": 0, + "non-truncated": 1128, + "padded": 1128, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_law|5": { + "hashes": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "9178e10bd0763ec4", + "hash_cont_tokens": "2e590029ef41fbcd" + }, + "truncated": 604, + "non-truncated": 5532, + "padded": 5524, + "non-padded": 612, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_medicine|5": { + "hashes": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "f5a22012a54f70ea", + "hash_cont_tokens": "fe35cfa9c6ca802e" + }, + "truncated": 0, + "non-truncated": 1088, + "padded": 1088, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_psychology|5": { + "hashes": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "0dfb73a8eb3f692c", + "hash_cont_tokens": "f020fbddf72c8652" + }, + "truncated": 0, + "non-truncated": 2448, + "padded": 2448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-public_relations|5": { + "hashes": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "1710c6ba4c9f3cbd", + "hash_cont_tokens": "568f585a259965c1" + }, + "truncated": 0, + "non-truncated": 440, + "padded": 440, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-security_studies|5": { + "hashes": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "d49711415961ced7", + "hash_cont_tokens": "cc6fd7cccd64cd5d" + }, + "truncated": 0, + "non-truncated": 980, + "padded": 980, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-sociology|5": { + "hashes": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "828999f7624cbe7e", + "hash_cont_tokens": "c3a3bdfd177eed5b" + }, + "truncated": 0, + "non-truncated": 804, + "padded": 804, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hashes": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "42054621e718dbee", + "hash_cont_tokens": "2568d0e8e36fa959" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-virology|5": { + "hashes": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "6c4f0aa4dc859c04", + "hash_cont_tokens": "926cf60b0891f374" + }, + 
"truncated": 0, + "non-truncated": 664, + "padded": 664, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-world_religions|5": { + "hashes": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "6c75d44e092ff24f", + "hash_cont_tokens": "c525a5de974c1ea3" + }, + "truncated": 0, + "non-truncated": 684, + "padded": 684, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|truthfulqa:mc|0": { + "hashes": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "2738d7ed7075faa7", + "hash_cont_tokens": "c014154380b74b9e" + }, + "truncated": 0, + "non-truncated": 9996, + "padded": 9996, + "non-padded": 0, + "effective_few_shots": 0.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "d84d18e9a963753d", + "hash_full_prompts": "12b540783521a8e6", + "hash_input_tokens": "6fecf578c508db6a", + "hash_cont_tokens": "fb1646e2bdd5fc38" + }, + "total_evaluation_time_secondes": "2564.749200820923", + "truncated": 2088, + "non-truncated": 108931, + "padded": 108834, + "non-padded": 2185, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/migtissera/Synthia-7B/results_2023-10-15T06-07-54.738296.json b/eval-results/migtissera/Synthia-7B/results_2023-10-15T06-07-54.738296.json new file mode 100644 index 0000000000000000000000000000000000000000..12cdebb4d2d80d31b609449bc9627d4e31f07cb5 --- /dev/null +++ b/eval-results/migtissera/Synthia-7B/results_2023-10-15T06-07-54.738296.json @@ -0,0 +1,107 @@ +{ + "config_general": { + "model_name": "migtissera/Synthia-7B", + "model_sha": "c0a3bc17604b11f252806013ad52e6172569816f", + "model_size": "12.58 GB", + "model_dtype": "torch.float16", + "lighteval_sha": "0f318ecf002208468154899217b3ba7c6ae09374", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|drop|3": { + "em": 0.07151845637583892, + "em_stderr": 0.00263897548039012, + "f1": 0.14513737416107345, + "f1_stderr": 0.0029452435334875074 + }, + "harness|gsm8k|5": { + "acc": 0.06595905989385899, + "acc_stderr": 0.006836951192034222 + }, + "harness|winogrande|5": { + "acc": 0.7426992896606156, + "acc_stderr": 0.012285989618865708 + }, + "all": { + "em": 0.07151845637583892, + "em_stderr": 0.00263897548039012, + "f1": 0.14513737416107345, + "f1_stderr": 0.0029452435334875074, + "acc": 0.4043291747772373, + "acc_stderr": 0.009561470405449964 + } + }, + "versions": { + "harness|drop|3": 1, + "harness|gsm8k|5": 0, + "harness|winogrande|5": 0, + "all": 0 + }, + "config_tasks": { + "harness|drop": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|drop|3": { + "hashes": { + "hash_examples": "1d27416e8324e9a3", + "hash_full_prompts": "a5513ff9a741b385", + "hash_input_tokens": "61b608e0b5ceed76", + "hash_cont_tokens": "ed048a9f86bc900f" + }, + "truncated": 1263, + "non-truncated": 8273, + "padded": 0, + "non-padded": 9536, + "effective_few_shots": 3.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "bda342e47b5099b2", + "hash_cont_tokens": "73a7eab5e982e39d" + }, + "truncated": 0, + "non-truncated": 1319, + "padded": 0, + 
"non-padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "c0bedf98cb040854", + "hash_cont_tokens": "f08975ad6f2d5864" + }, + "truncated": 0, + "non-truncated": 2534, + "padded": 2432, + "non-padded": 102, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "9b4d8993161e637d", + "hash_full_prompts": "08215e527b7e60a5", + "hash_input_tokens": "80afe720f936f8d2", + "hash_cont_tokens": "8c2af79fab6f311e" + }, + "total_evaluation_time_secondes": "9282.341347694397", + "truncated": 1263, + "non-truncated": 12126, + "padded": 2432, + "non-padded": 10957, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/migtissera/Tess-34B-v1.4/results_2023-12-08T21-42-14.185157.json b/eval-results/migtissera/Tess-34B-v1.4/results_2023-12-08T21-42-14.185157.json new file mode 100644 index 0000000000000000000000000000000000000000..98009c9f0625dbf27178e83939dd11ca7a2b3426 --- /dev/null +++ b/eval-results/migtissera/Tess-34B-v1.4/results_2023-12-08T21-42-14.185157.json @@ -0,0 +1,1409 @@ +{ + "config_general": { + "lighteval_sha": "0e4607eff593f6f842aeaa0e5fa6760f58b9d1e9", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "", + "start_time": 432821.085591086, + "end_time": 526610.355072111, + "total_evaluation_time_secondes": "93789.269481025", + "model_name": "migtissera/Tess-34B-v1.4", + "model_sha": "173d834656c3965cbaa49be6aab0772c3ce57821", + "model_dtype": "torch.float16", + "model_size": "69.78 GB" + }, + "results": { + "harness|arc:challenge|25": { + "acc": 0.6228668941979523, + "acc_stderr": 0.0141633668961926, + "acc_norm": 0.6459044368600683, + "acc_norm_stderr": 0.01397545412275656 + }, + "harness|hellaswag|10": { + "acc": 0.6419040031866162, + "acc_stderr": 0.004784607222774645, + "acc_norm": 0.833698466440948, + "acc_norm_stderr": 0.0037159010850549854 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.47, + "acc_stderr": 0.05016135580465919, + "acc_norm": 0.47, + "acc_norm_stderr": 0.05016135580465919 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.6962962962962963, + "acc_stderr": 0.03972552884785136, + "acc_norm": 0.6962962962962963, + "acc_norm_stderr": 0.03972552884785136 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.8618421052631579, + "acc_stderr": 0.028081042939576552, + "acc_norm": 0.8618421052631579, + "acc_norm_stderr": 0.028081042939576552 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.77, + "acc_stderr": 0.042295258468165044, + "acc_norm": 0.77, + "acc_norm_stderr": 0.042295258468165044 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.8037735849056604, + "acc_stderr": 0.02444238813110082, + "acc_norm": 0.8037735849056604, + "acc_norm_stderr": 0.02444238813110082 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.875, + "acc_stderr": 0.02765610492929436, + "acc_norm": 0.875, + "acc_norm_stderr": 0.02765610492929436 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.51, + "acc_stderr": 0.05024183937956912, + "acc_norm": 0.51, + "acc_norm_stderr": 0.05024183937956912 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.6, + "acc_stderr": 0.049236596391733084, + "acc_norm": 0.6, + "acc_norm_stderr": 0.049236596391733084 + }, + 
"harness|hendrycksTest-college_mathematics|5": { + "acc": 0.45, + "acc_stderr": 0.04999999999999999, + "acc_norm": 0.45, + "acc_norm_stderr": 0.04999999999999999 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.7398843930635838, + "acc_stderr": 0.033450369167889904, + "acc_norm": 0.7398843930635838, + "acc_norm_stderr": 0.033450369167889904 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.5392156862745098, + "acc_stderr": 0.04959859966384181, + "acc_norm": 0.5392156862745098, + "acc_norm_stderr": 0.04959859966384181 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.8, + "acc_stderr": 0.04020151261036845, + "acc_norm": 0.8, + "acc_norm_stderr": 0.04020151261036845 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.7702127659574468, + "acc_stderr": 0.027501752944412417, + "acc_norm": 0.7702127659574468, + "acc_norm_stderr": 0.027501752944412417 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.6228070175438597, + "acc_stderr": 0.04559522141958216, + "acc_norm": 0.6228070175438597, + "acc_norm_stderr": 0.04559522141958216 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.7379310344827587, + "acc_stderr": 0.03664666337225257, + "acc_norm": 0.7379310344827587, + "acc_norm_stderr": 0.03664666337225257 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.6851851851851852, + "acc_stderr": 0.023919984164047736, + "acc_norm": 0.6851851851851852, + "acc_norm_stderr": 0.023919984164047736 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.5476190476190477, + "acc_stderr": 0.044518079590553275, + "acc_norm": 0.5476190476190477, + "acc_norm_stderr": 0.044518079590553275 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.55, + "acc_stderr": 0.049999999999999996, + "acc_norm": 0.55, + "acc_norm_stderr": 0.049999999999999996 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.9032258064516129, + "acc_stderr": 0.016818943416345197, + "acc_norm": 0.9032258064516129, + "acc_norm_stderr": 0.016818943416345197 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.6748768472906403, + "acc_stderr": 0.032957975663112704, + "acc_norm": 0.6748768472906403, + "acc_norm_stderr": 0.032957975663112704 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.81, + "acc_stderr": 0.03942772444036625, + "acc_norm": 0.81, + "acc_norm_stderr": 0.03942772444036625 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.8363636363636363, + "acc_stderr": 0.028887872395487946, + "acc_norm": 0.8363636363636363, + "acc_norm_stderr": 0.028887872395487946 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.9191919191919192, + "acc_stderr": 0.019417681889724536, + "acc_norm": 0.9191919191919192, + "acc_norm_stderr": 0.019417681889724536 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.9689119170984456, + "acc_stderr": 0.012525310625527041, + "acc_norm": 0.9689119170984456, + "acc_norm_stderr": 0.012525310625527041 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.7974358974358975, + "acc_stderr": 0.020377660970371393, + "acc_norm": 0.7974358974358975, + "acc_norm_stderr": 0.020377660970371393 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.4148148148148148, + "acc_stderr": 0.03003984245406929, + "acc_norm": 0.4148148148148148, + "acc_norm_stderr": 0.03003984245406929 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 
0.8235294117647058, + "acc_stderr": 0.02476290267805793, + "acc_norm": 0.8235294117647058, + "acc_norm_stderr": 0.02476290267805793 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.48344370860927155, + "acc_stderr": 0.0408024418562897, + "acc_norm": 0.48344370860927155, + "acc_norm_stderr": 0.0408024418562897 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.9100917431192661, + "acc_stderr": 0.012264304540230446, + "acc_norm": 0.9100917431192661, + "acc_norm_stderr": 0.012264304540230446 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.625, + "acc_stderr": 0.033016908987210894, + "acc_norm": 0.625, + "acc_norm_stderr": 0.033016908987210894 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.9313725490196079, + "acc_stderr": 0.017744453647073315, + "acc_norm": 0.9313725490196079, + "acc_norm_stderr": 0.017744453647073315 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.8945147679324894, + "acc_stderr": 0.019995560723758535, + "acc_norm": 0.8945147679324894, + "acc_norm_stderr": 0.019995560723758535 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.7892376681614349, + "acc_stderr": 0.027373095500540186, + "acc_norm": 0.7892376681614349, + "acc_norm_stderr": 0.027373095500540186 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.8396946564885496, + "acc_stderr": 0.03217829420744631, + "acc_norm": 0.8396946564885496, + "acc_norm_stderr": 0.03217829420744631 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.8760330578512396, + "acc_stderr": 0.030083098716035202, + "acc_norm": 0.8760330578512396, + "acc_norm_stderr": 0.030083098716035202 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.8518518518518519, + "acc_stderr": 0.03434300243631001, + "acc_norm": 0.8518518518518519, + "acc_norm_stderr": 0.03434300243631001 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.8650306748466258, + "acc_stderr": 0.02684576505455386, + "acc_norm": 0.8650306748466258, + "acc_norm_stderr": 0.02684576505455386 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.5, + "acc_stderr": 0.04745789978762494, + "acc_norm": 0.5, + "acc_norm_stderr": 0.04745789978762494 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.8446601941747572, + "acc_stderr": 0.03586594738573974, + "acc_norm": 0.8446601941747572, + "acc_norm_stderr": 0.03586594738573974 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.9230769230769231, + "acc_stderr": 0.017456987872436186, + "acc_norm": 0.9230769230769231, + "acc_norm_stderr": 0.017456987872436186 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.83, + "acc_stderr": 0.03775251680686371, + "acc_norm": 0.83, + "acc_norm_stderr": 0.03775251680686371 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.9042145593869731, + "acc_stderr": 0.01052403107905584, + "acc_norm": 0.9042145593869731, + "acc_norm_stderr": 0.01052403107905584 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.8208092485549133, + "acc_stderr": 0.020647590029679332, + "acc_norm": 0.8208092485549133, + "acc_norm_stderr": 0.020647590029679332 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.6737430167597765, + "acc_stderr": 0.01568044151888918, + "acc_norm": 0.6737430167597765, + "acc_norm_stderr": 0.01568044151888918 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.7973856209150327, + "acc_stderr": 0.023015446877985665, + "acc_norm": 0.7973856209150327, + "acc_norm_stderr": 0.023015446877985665 
+ }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.7845659163987139, + "acc_stderr": 0.023350225475471442, + "acc_norm": 0.7845659163987139, + "acc_norm_stderr": 0.023350225475471442 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.8734567901234568, + "acc_stderr": 0.018498600558790906, + "acc_norm": 0.8734567901234568, + "acc_norm_stderr": 0.018498600558790906 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.6134751773049646, + "acc_stderr": 0.029049190342543465, + "acc_norm": 0.6134751773049646, + "acc_norm_stderr": 0.029049190342543465 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.5795306388526728, + "acc_stderr": 0.012607654553832701, + "acc_norm": 0.5795306388526728, + "acc_norm_stderr": 0.012607654553832701 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.8014705882352942, + "acc_stderr": 0.02423101337054108, + "acc_norm": 0.8014705882352942, + "acc_norm_stderr": 0.02423101337054108 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.8022875816993464, + "acc_stderr": 0.016112443369726732, + "acc_norm": 0.8022875816993464, + "acc_norm_stderr": 0.016112443369726732 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.7181818181818181, + "acc_stderr": 0.04309118709946458, + "acc_norm": 0.7181818181818181, + "acc_norm_stderr": 0.04309118709946458 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.8367346938775511, + "acc_stderr": 0.023661699177098615, + "acc_norm": 0.8367346938775511, + "acc_norm_stderr": 0.023661699177098615 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.8855721393034826, + "acc_stderr": 0.022509345325101706, + "acc_norm": 0.8855721393034826, + "acc_norm_stderr": 0.022509345325101706 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.92, + "acc_stderr": 0.027265992434429103, + "acc_norm": 0.92, + "acc_norm_stderr": 0.027265992434429103 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.5301204819277109, + "acc_stderr": 0.03885425420866767, + "acc_norm": 0.5301204819277109, + "acc_norm_stderr": 0.03885425420866767 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.8771929824561403, + "acc_stderr": 0.02517298435015577, + "acc_norm": 0.8771929824561403, + "acc_norm_stderr": 0.02517298435015577 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.412484700122399, + "mc1_stderr": 0.01723329939957122, + "mc2": 0.5679144363917273, + "mc2_stderr": 0.01582118053131118 + }, + "harness|winogrande|5": { + "acc": 0.8121546961325967, + "acc_stderr": 0.010977481103435091 + }, + "harness|gsm8k|5": { + "acc": 0.5966641394996209, + "acc_stderr": 0.013512654781814706 + }, + "all": { + "acc": 0.744827149985735, + "acc_stderr": 0.029006208191077444, + "acc_norm": 0.7498384630409163, + "acc_norm_stderr": 0.02955308381117489, + "mc1": 0.412484700122399, + "mc1_stderr": 0.01723329939957122, + "mc2": 0.5679144363917273, + "mc2_stderr": 0.01582118053131118 + } + }, + "versions": { + "all": 0, + "harness|arc:challenge|25": 0, + "harness|gsm8k|5": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + 
"harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "harness|winogrande|5": 0 + }, + "config_tasks": { + "harness|arc:challenge": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + 
"harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|arc:challenge|25": { + "hashes": { + "hash_examples": "17b0cae357c0259e", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "40489066810a8901", + "hash_cont_tokens": "e23c779c4c2dd1ec" + }, + "truncated": 0, + "non_truncated": 1172, + "padded": 4682, + "non_padded": 5, + "effective_few_shots": 25.0, + "num_truncated_few_shots": 0 + }, + "harness|hellaswag|10": { + "hashes": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "ef60cabaa9013478", + "hash_cont_tokens": "55da5ba61989a8fe" + }, + "truncated": 0, + "non_truncated": 10042, + "padded": 40097, + "non_padded": 
71, + "effective_few_shots": 10.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hashes": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "57ace135b7466127", + "hash_cont_tokens": "bcc22fd85dcc85e9" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-anatomy|5": { + "hashes": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "42dc1ea1fa6d82c1", + "hash_cont_tokens": "5cc800feae9fa1ad" + }, + "truncated": 0, + "non_truncated": 135, + "padded": 540, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-astronomy|5": { + "hashes": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "8aebf432b444ab39", + "hash_cont_tokens": "655dbb90034f484a" + }, + "truncated": 0, + "non_truncated": 152, + "padded": 608, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-business_ethics|5": { + "hashes": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "d7f1ea50cc3bbeb3", + "hash_cont_tokens": "bcc22fd85dcc85e9" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hashes": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "b478c0c5b9db6649", + "hash_cont_tokens": "f77b74d946d7fc02" + }, + "truncated": 0, + "non_truncated": 265, + "padded": 1060, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_biology|5": { + "hashes": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "b1c627c6eb96a0c5", + "hash_cont_tokens": "1ba4b1a158d8bf3f" + }, + "truncated": 0, + "non_truncated": 144, + "padded": 576, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_chemistry|5": { + "hashes": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "9288d8baf6845601", + "hash_cont_tokens": "bcc22fd85dcc85e9" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_computer_science|5": { + "hashes": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "aed771600d6a99f9", + "hash_cont_tokens": "bcc22fd85dcc85e9" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_mathematics|5": { + "hashes": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "d894b8fd73f824bf", + "hash_cont_tokens": "bcc22fd85dcc85e9" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_medicine|5": { + "hashes": { + "hash_examples": 
"dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "82c1d19db93b0d1a", + "hash_cont_tokens": "78a0ebf66d91c5cf" + }, + "truncated": 0, + "non_truncated": 173, + "padded": 692, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_physics|5": { + "hashes": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "8fc19b050e0980c9", + "hash_cont_tokens": "5a030c95824fdbe5" + }, + "truncated": 0, + "non_truncated": 102, + "padded": 408, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-computer_security|5": { + "hashes": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "4bee692ef11ce74d", + "hash_cont_tokens": "bcc22fd85dcc85e9" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hashes": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "36ec55566216bf8c", + "hash_cont_tokens": "2326dc60d0bc41b6" + }, + "truncated": 0, + "non_truncated": 235, + "padded": 940, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-econometrics|5": { + "hashes": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "5f8a6ca26af4fa24", + "hash_cont_tokens": "be908364b6f14dd6" + }, + "truncated": 0, + "non_truncated": 114, + "padded": 456, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hashes": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "f27fa41a971012bc", + "hash_cont_tokens": "179280ef597fe1bf" + }, + "truncated": 0, + "non_truncated": 145, + "padded": 564, + "non_padded": 16, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hashes": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "a01362f218292970", + "hash_cont_tokens": "95cdcdaf1abd0bd2" + }, + "truncated": 0, + "non_truncated": 378, + "padded": 1512, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-formal_logic|5": { + "hashes": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "52298ae27ce37603", + "hash_cont_tokens": "6a4818f3c307c346" + }, + "truncated": 0, + "non_truncated": 126, + "padded": 504, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-global_facts|5": { + "hashes": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "8a8ad22aa325ceab", + "hash_cont_tokens": "bcc22fd85dcc85e9" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_biology|5": { + "hashes": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "dffa06152b2f8493", + "hash_cont_tokens": "36d0d84455f0bdba" + }, + 
"truncated": 0, + "non_truncated": 310, + "padded": 1240, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hashes": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "b2b45cc2ae4ceb25", + "hash_cont_tokens": "c678f794a9b8ee74" + }, + "truncated": 0, + "non_truncated": 203, + "padded": 812, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hashes": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "1e5e2a1ef482ec75", + "hash_cont_tokens": "bcc22fd85dcc85e9" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hashes": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "97fdb399c1648085", + "hash_cont_tokens": "e9c94304326d875c" + }, + "truncated": 0, + "non_truncated": 165, + "padded": 656, + "non_padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_geography|5": { + "hashes": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "6c6a6e35fc178837", + "hash_cont_tokens": "f937a1349eb483eb" + }, + "truncated": 0, + "non_truncated": 198, + "padded": 792, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hashes": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "2b67aa85a6bf6a30", + "hash_cont_tokens": "8b27dd3907d25b4e" + }, + "truncated": 0, + "non_truncated": 193, + "padded": 772, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hashes": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "d62be00061c87ef4", + "hash_cont_tokens": "3763cae29e2f938c" + }, + "truncated": 0, + "non_truncated": 390, + "padded": 1560, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hashes": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "5793fc23ddb8a65d", + "hash_cont_tokens": "fd7b555352d765a4" + }, + "truncated": 0, + "non_truncated": 270, + "padded": 1080, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hashes": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "f87090bf0f08ad42", + "hash_cont_tokens": "61f46d4a209b9aa2" + }, + "truncated": 0, + "non_truncated": 238, + "padded": 952, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_physics|5": { + "hashes": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "685c468b5589f463", + "hash_cont_tokens": "4e7053e7c19d680d" + }, + "truncated": 0, + "non_truncated": 151, + "padded": 604, + "non_padded": 0, 
+ "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hashes": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "e77a448734066d49", + "hash_cont_tokens": "84d19ae8790476bb" + }, + "truncated": 0, + "non_truncated": 545, + "padded": 2180, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hashes": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "66db7c250b356884", + "hash_cont_tokens": "b119c7b668213a4e" + }, + "truncated": 0, + "non_truncated": 216, + "padded": 864, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hashes": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "584802bdd1a87b58", + "hash_cont_tokens": "a3b126bc622d571f" + }, + "truncated": 0, + "non_truncated": 204, + "padded": 816, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hashes": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "94e130c89c7d75f6", + "hash_cont_tokens": "9abf19ceb76331ff" + }, + "truncated": 0, + "non_truncated": 237, + "padded": 948, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_aging|5": { + "hashes": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "c6d293e76bcfb74a", + "hash_cont_tokens": "0e2e725ae9a898da" + }, + "truncated": 0, + "non_truncated": 223, + "padded": 892, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_sexuality|5": { + "hashes": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "9a7d2880571d0567", + "hash_cont_tokens": "a94c1dea6d775249" + }, + "truncated": 0, + "non_truncated": 131, + "padded": 524, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-international_law|5": { + "hashes": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "cd30ba63d38ab1f6", + "hash_cont_tokens": "3832f860859bb86b" + }, + "truncated": 0, + "non_truncated": 121, + "padded": 484, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-jurisprudence|5": { + "hashes": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "0e68d6fc5a38055b", + "hash_cont_tokens": "9fac5a0c364fca8a" + }, + "truncated": 0, + "non_truncated": 108, + "padded": 432, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hashes": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "29871cd209a1497d", + "hash_cont_tokens": "dc53ed31134ddf3a" + }, + "truncated": 0, + "non_truncated": 163, + "padded": 644, + "non_padded": 8, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-machine_learning|5": { + "hashes": { + 
"hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "0eaeff5a26043f24", + "hash_cont_tokens": "e272b5456d5552d6" + }, + "truncated": 0, + "non_truncated": 112, + "padded": 448, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-management|5": { + "hashes": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "ef74b35eddcc2485", + "hash_cont_tokens": "7119d4642957b1f0" + }, + "truncated": 0, + "non_truncated": 103, + "padded": 412, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-marketing|5": { + "hashes": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "9be1a8c16522eab9", + "hash_cont_tokens": "099d58c66ece3f11" + }, + "truncated": 0, + "non_truncated": 234, + "padded": 936, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-medical_genetics|5": { + "hashes": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "eb07817458c8a2e6", + "hash_cont_tokens": "bcc22fd85dcc85e9" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-miscellaneous|5": { + "hashes": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "d3e380b9452ad897", + "hash_cont_tokens": "bae342d4e82ba8f7" + }, + "truncated": 0, + "non_truncated": 783, + "padded": 3132, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_disputes|5": { + "hashes": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "08b0b14c698b0352", + "hash_cont_tokens": "578c64cbdbb1e0d4" + }, + "truncated": 0, + "non_truncated": 346, + "padded": 1384, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hashes": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "d86956f0d743889d", + "hash_cont_tokens": "79b25f42b3fce0f9" + }, + "truncated": 0, + "non_truncated": 895, + "padded": 3580, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-nutrition|5": { + "hashes": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "61e00d762a54dbc7", + "hash_cont_tokens": "9d1f3b976417156c" + }, + "truncated": 0, + "non_truncated": 306, + "padded": 1224, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-philosophy|5": { + "hashes": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "bd9ffbd919a90135", + "hash_cont_tokens": "88dab560e1e06d97" + }, + "truncated": 0, + "non_truncated": 311, + "padded": 1244, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-prehistory|5": { + "hashes": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "3e1813f766721f1e", + "hash_cont_tokens": "04ea847139fe9393" + }, + "truncated": 0, + 
"non_truncated": 324, + "padded": 1296, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_accounting|5": { + "hashes": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "7586f4ed3328692c", + "hash_cont_tokens": "0435ff692ad17e68" + }, + "truncated": 0, + "non_truncated": 282, + "padded": 1124, + "non_padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_law|5": { + "hashes": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "83ae4dc0f5d2d065", + "hash_cont_tokens": "b852c74e9f8801bd" + }, + "truncated": 0, + "non_truncated": 1534, + "padded": 6136, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_medicine|5": { + "hashes": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "0db2792632b8d50c", + "hash_cont_tokens": "5db0f6460652d063" + }, + "truncated": 0, + "non_truncated": 272, + "padded": 1088, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_psychology|5": { + "hashes": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "c3a7c2a8f5a3c728", + "hash_cont_tokens": "c960676ef7f3dbe5" + }, + "truncated": 0, + "non_truncated": 612, + "padded": 2448, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-public_relations|5": { + "hashes": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "152b0e047ae22138", + "hash_cont_tokens": "3320565f412c4b01" + }, + "truncated": 0, + "non_truncated": 110, + "padded": 440, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-security_studies|5": { + "hashes": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "6f999fb3e43c58fb", + "hash_cont_tokens": "218ed775ef60aab9" + }, + "truncated": 0, + "non_truncated": 245, + "padded": 980, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-sociology|5": { + "hashes": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "fc84e0b3f5d27284", + "hash_cont_tokens": "20babf5cc4cc7f3d" + }, + "truncated": 0, + "non_truncated": 201, + "padded": 804, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hashes": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "22fb144e223c4380", + "hash_cont_tokens": "bcc22fd85dcc85e9" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-virology|5": { + "hashes": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "f79f449601363c0f", + "hash_cont_tokens": "dc6d57296bea0882" + }, + "truncated": 0, + "non_truncated": 166, + "padded": 664, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + 
"harness|hendrycksTest-world_religions|5": { + "hashes": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "2e3848d7ccb89a7a", + "hash_cont_tokens": "37f53444db289ed3" + }, + "truncated": 0, + "non_truncated": 171, + "padded": 684, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|truthfulqa:mc|0": { + "hashes": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "b67103f10d8a2f4d", + "hash_cont_tokens": "71a67034827cd30e" + }, + "truncated": 0, + "non_truncated": 817, + "padded": 9996, + "non_padded": 0, + "effective_few_shots": 0.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "b062d01ca34fb467", + "hash_cont_tokens": "c93e9c22fa3077a0" + }, + "truncated": 0, + "non_truncated": 1267, + "padded": 2534, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "ef77dd2b59b34427", + "hash_cont_tokens": "cd688ab990eace4f" + }, + "truncated": 0, + "non_truncated": 1319, + "padded": 0, + "non_padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "3b7fa57a057f9415", + "hash_full_prompts": "63615fc50fc9417c", + "hash_input_tokens": "b6e3fb9b64a22343", + "hash_cont_tokens": "1f7c866226b38a0e" + }, + "truncated": 0, + "non_truncated": 28659, + "padded": 113445, + "non_padded": 1427, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/migtissera/Tess-7B-v1.4/results_2023-12-07T22-12-44.585661.json b/eval-results/migtissera/Tess-7B-v1.4/results_2023-12-07T22-12-44.585661.json new file mode 100644 index 0000000000000000000000000000000000000000..73c6e7af1e46aa72200f2d81be461c13c6878d4c --- /dev/null +++ b/eval-results/migtissera/Tess-7B-v1.4/results_2023-12-07T22-12-44.585661.json @@ -0,0 +1,1409 @@ +{ + "config_general": { + "lighteval_sha": "0e4607eff593f6f842aeaa0e5fa6760f58b9d1e9", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "", + "start_time": 432966.569413404, + "end_time": 442039.080144496, + "total_evaluation_time_secondes": "9072.510731092014", + "model_name": "migtissera/Tess-7B-v1.4", + "model_sha": "53a5249ee9e5b2327de81f09c26a4577dea9260b", + "model_dtype": "torch.float16", + "model_size": "13.99 GB" + }, + "results": { + "harness|arc:challenge|25": { + "acc": 0.5691126279863481, + "acc_stderr": 0.014471133392642473, + "acc_norm": 0.6040955631399317, + "acc_norm_stderr": 0.014291228393536587 + }, + "harness|hellaswag|10": { + "acc": 0.6409081856203943, + "acc_stderr": 0.0047875373851530055, + "acc_norm": 0.8287193786098387, + "acc_norm_stderr": 0.0037598401271507057 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.35, + "acc_stderr": 0.0479372485441102, + "acc_norm": 0.35, + "acc_norm_stderr": 0.0479372485441102 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.5777777777777777, + "acc_stderr": 0.04266763404099582, + "acc_norm": 0.5777777777777777, + "acc_norm_stderr": 0.04266763404099582 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.6710526315789473, + "acc_stderr": 0.03823428969926604, + "acc_norm": 
0.6710526315789473, + "acc_norm_stderr": 0.03823428969926604 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.57, + "acc_stderr": 0.049756985195624284, + "acc_norm": 0.57, + "acc_norm_stderr": 0.049756985195624284 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.6641509433962264, + "acc_stderr": 0.029067220146644823, + "acc_norm": 0.6641509433962264, + "acc_norm_stderr": 0.029067220146644823 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.7013888888888888, + "acc_stderr": 0.03827052357950756, + "acc_norm": 0.7013888888888888, + "acc_norm_stderr": 0.03827052357950756 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.44, + "acc_stderr": 0.04988876515698589, + "acc_norm": 0.44, + "acc_norm_stderr": 0.04988876515698589 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.52, + "acc_stderr": 0.050211673156867795, + "acc_norm": 0.52, + "acc_norm_stderr": 0.050211673156867795 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.3, + "acc_stderr": 0.046056618647183814, + "acc_norm": 0.3, + "acc_norm_stderr": 0.046056618647183814 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.6011560693641619, + "acc_stderr": 0.0373362665538351, + "acc_norm": 0.6011560693641619, + "acc_norm_stderr": 0.0373362665538351 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.4215686274509804, + "acc_stderr": 0.04913595201274498, + "acc_norm": 0.4215686274509804, + "acc_norm_stderr": 0.04913595201274498 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.75, + "acc_stderr": 0.04351941398892446, + "acc_norm": 0.75, + "acc_norm_stderr": 0.04351941398892446 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.5531914893617021, + "acc_stderr": 0.032500536843658404, + "acc_norm": 0.5531914893617021, + "acc_norm_stderr": 0.032500536843658404 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.4473684210526316, + "acc_stderr": 0.04677473004491199, + "acc_norm": 0.4473684210526316, + "acc_norm_stderr": 0.04677473004491199 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.5241379310344828, + "acc_stderr": 0.0416180850350153, + "acc_norm": 0.5241379310344828, + "acc_norm_stderr": 0.0416180850350153 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.40476190476190477, + "acc_stderr": 0.025279850397404897, + "acc_norm": 0.40476190476190477, + "acc_norm_stderr": 0.025279850397404897 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.3888888888888889, + "acc_stderr": 0.0436031486007746, + "acc_norm": 0.3888888888888889, + "acc_norm_stderr": 0.0436031486007746 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.41, + "acc_stderr": 0.04943110704237102, + "acc_norm": 0.41, + "acc_norm_stderr": 0.04943110704237102 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.7225806451612903, + "acc_stderr": 0.025470196835900055, + "acc_norm": 0.7225806451612903, + "acc_norm_stderr": 0.025470196835900055 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.5123152709359606, + "acc_stderr": 0.035169204442208966, + "acc_norm": 0.5123152709359606, + "acc_norm_stderr": 0.035169204442208966 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.66, + "acc_stderr": 0.04760952285695237, + "acc_norm": 0.66, + "acc_norm_stderr": 0.04760952285695237 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.7454545454545455, + "acc_stderr": 0.03401506715249039, + "acc_norm": 
0.7454545454545455, + "acc_norm_stderr": 0.03401506715249039 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.7727272727272727, + "acc_stderr": 0.02985751567338641, + "acc_norm": 0.7727272727272727, + "acc_norm_stderr": 0.02985751567338641 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.8393782383419689, + "acc_stderr": 0.026499057701397443, + "acc_norm": 0.8393782383419689, + "acc_norm_stderr": 0.026499057701397443 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.6128205128205129, + "acc_stderr": 0.024697216930878937, + "acc_norm": 0.6128205128205129, + "acc_norm_stderr": 0.024697216930878937 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.32222222222222224, + "acc_stderr": 0.028493465091028597, + "acc_norm": 0.32222222222222224, + "acc_norm_stderr": 0.028493465091028597 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.6302521008403361, + "acc_stderr": 0.03135709599613591, + "acc_norm": 0.6302521008403361, + "acc_norm_stderr": 0.03135709599613591 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.33774834437086093, + "acc_stderr": 0.038615575462551684, + "acc_norm": 0.33774834437086093, + "acc_norm_stderr": 0.038615575462551684 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.8036697247706422, + "acc_stderr": 0.01703071933915434, + "acc_norm": 0.8036697247706422, + "acc_norm_stderr": 0.01703071933915434 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.44907407407407407, + "acc_stderr": 0.03392238405321616, + "acc_norm": 0.44907407407407407, + "acc_norm_stderr": 0.03392238405321616 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.7598039215686274, + "acc_stderr": 0.02998373305591361, + "acc_norm": 0.7598039215686274, + "acc_norm_stderr": 0.02998373305591361 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.759493670886076, + "acc_stderr": 0.027820781981149685, + "acc_norm": 0.759493670886076, + "acc_norm_stderr": 0.027820781981149685 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.6636771300448431, + "acc_stderr": 0.031708824268455005, + "acc_norm": 0.6636771300448431, + "acc_norm_stderr": 0.031708824268455005 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.7099236641221374, + "acc_stderr": 0.03980066246467766, + "acc_norm": 0.7099236641221374, + "acc_norm_stderr": 0.03980066246467766 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.768595041322314, + "acc_stderr": 0.03849856098794087, + "acc_norm": 0.768595041322314, + "acc_norm_stderr": 0.03849856098794087 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.7222222222222222, + "acc_stderr": 0.04330043749650742, + "acc_norm": 0.7222222222222222, + "acc_norm_stderr": 0.04330043749650742 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.7239263803680982, + "acc_stderr": 0.035123852837050475, + "acc_norm": 0.7239263803680982, + "acc_norm_stderr": 0.035123852837050475 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.3482142857142857, + "acc_stderr": 0.045218299028335865, + "acc_norm": 0.3482142857142857, + "acc_norm_stderr": 0.045218299028335865 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.8058252427184466, + "acc_stderr": 0.03916667762822585, + "acc_norm": 0.8058252427184466, + "acc_norm_stderr": 0.03916667762822585 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.8504273504273504, + "acc_stderr": 
0.023365051491753715, + "acc_norm": 0.8504273504273504, + "acc_norm_stderr": 0.023365051491753715 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.62, + "acc_stderr": 0.048783173121456316, + "acc_norm": 0.62, + "acc_norm_stderr": 0.048783173121456316 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.8033205619412516, + "acc_stderr": 0.014214138556913912, + "acc_norm": 0.8033205619412516, + "acc_norm_stderr": 0.014214138556913912 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.6907514450867052, + "acc_stderr": 0.024883140570071762, + "acc_norm": 0.6907514450867052, + "acc_norm_stderr": 0.024883140570071762 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.3564245810055866, + "acc_stderr": 0.016018239710513398, + "acc_norm": 0.3564245810055866, + "acc_norm_stderr": 0.016018239710513398 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.696078431372549, + "acc_stderr": 0.026336613469046626, + "acc_norm": 0.696078431372549, + "acc_norm_stderr": 0.026336613469046626 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.6559485530546624, + "acc_stderr": 0.02698147804364805, + "acc_norm": 0.6559485530546624, + "acc_norm_stderr": 0.02698147804364805 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.6635802469135802, + "acc_stderr": 0.02628973494595293, + "acc_norm": 0.6635802469135802, + "acc_norm_stderr": 0.02628973494595293 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.44680851063829785, + "acc_stderr": 0.029658235097666907, + "acc_norm": 0.44680851063829785, + "acc_norm_stderr": 0.029658235097666907 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.4602346805736636, + "acc_stderr": 0.01272978538659856, + "acc_norm": 0.4602346805736636, + "acc_norm_stderr": 0.01272978538659856 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.6102941176470589, + "acc_stderr": 0.0296246635811597, + "acc_norm": 0.6102941176470589, + "acc_norm_stderr": 0.0296246635811597 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.6388888888888888, + "acc_stderr": 0.019431775677037313, + "acc_norm": 0.6388888888888888, + "acc_norm_stderr": 0.019431775677037313 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.7090909090909091, + "acc_stderr": 0.04350271442923243, + "acc_norm": 0.7090909090909091, + "acc_norm_stderr": 0.04350271442923243 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.6857142857142857, + "acc_stderr": 0.029719329422417482, + "acc_norm": 0.6857142857142857, + "acc_norm_stderr": 0.029719329422417482 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.7711442786069652, + "acc_stderr": 0.029705284056772432, + "acc_norm": 0.7711442786069652, + "acc_norm_stderr": 0.029705284056772432 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.8, + "acc_stderr": 0.040201512610368445, + "acc_norm": 0.8, + "acc_norm_stderr": 0.040201512610368445 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.536144578313253, + "acc_stderr": 0.03882310850890594, + "acc_norm": 0.536144578313253, + "acc_norm_stderr": 0.03882310850890594 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.8011695906432749, + "acc_stderr": 0.030611116557432528, + "acc_norm": 0.8011695906432749, + "acc_norm_stderr": 0.030611116557432528 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.3623011015911873, + "mc1_stderr": 0.016826646897262258, + "mc2": 0.5187917410145858, + "mc2_stderr": 0.015933996625694287 + }, + "harness|winogrande|5": { + "acc": 
0.7482241515390686, + "acc_stderr": 0.012198489100259764 + }, + "harness|gsm8k|5": { + "acc": 0.42153146322971946, + "acc_stderr": 0.013601824409483272 + }, + "all": { + "acc": 0.6088715659432319, + "acc_stderr": 0.03312437671303067, + "acc_norm": 0.6134610853544302, + "acc_norm_stderr": 0.03378947565640712, + "mc1": 0.3623011015911873, + "mc1_stderr": 0.016826646897262258, + "mc2": 0.5187917410145858, + "mc2_stderr": 0.015933996625694287 + } + }, + "versions": { + "all": 0, + "harness|arc:challenge|25": 0, + "harness|gsm8k|5": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "harness|winogrande|5": 0 + }, + "config_tasks": { + "harness|arc:challenge": "LM Harness 
task", + "harness|gsm8k": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM 
Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|arc:challenge|25": { + "hashes": { + "hash_examples": "17b0cae357c0259e", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "9bcd0d1d37471713", + "hash_cont_tokens": "289aa98c400841d8" + }, + "truncated": 0, + "non_truncated": 1172, + "padded": 4670, + "non_padded": 17, + "effective_few_shots": 25.0, + "num_truncated_few_shots": 0 + }, + "harness|hellaswag|10": { + "hashes": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "80b8c6d79740318e", + "hash_cont_tokens": "ac460260c3e6efc9" + }, + "truncated": 0, + "non_truncated": 10042, + "padded": 40101, + "non_padded": 67, + "effective_few_shots": 10.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hashes": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "b813d36287c6556c", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-anatomy|5": { + "hashes": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "09dc2380497f7a47", + "hash_cont_tokens": "a52a4f60d98cbe5c" + }, + "truncated": 0, + "non_truncated": 135, + "padded": 540, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-astronomy|5": { + "hashes": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "68ca3220b0fdd1f3", + "hash_cont_tokens": "10f7d8eeba97841d" + }, + "truncated": 0, + "non_truncated": 152, + "padded": 608, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-business_ethics|5": { + "hashes": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "bd14ef1320de241e", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hashes": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "d96186ab98017c43", + "hash_cont_tokens": "edef9975ba9165b5" + }, + "truncated": 0, + "non_truncated": 265, + "padded": 1060, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_biology|5": { + "hashes": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "424136b34e95b200", + "hash_cont_tokens": "0aa103ec6602280b" + }, + "truncated": 0, + "non_truncated": 144, + "padded": 576, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_chemistry|5": { + "hashes": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "8dd8b80e336bbe54", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + 
"non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_computer_science|5": { + "hashes": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "145d4cef8ca2261d", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_mathematics|5": { + "hashes": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "561995d32d2b25c4", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_medicine|5": { + "hashes": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "6a258a9d4418599c", + "hash_cont_tokens": "1979021dbc698754" + }, + "truncated": 0, + "non_truncated": 173, + "padded": 692, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_physics|5": { + "hashes": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "fa5e0d5b5f97b66a", + "hash_cont_tokens": "7cf7fe2bab00acbd" + }, + "truncated": 0, + "non_truncated": 102, + "padded": 408, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-computer_security|5": { + "hashes": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "07d27397edfae492", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hashes": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "da5e6c3c8eb17da6", + "hash_cont_tokens": "903f64eed2b0d217" + }, + "truncated": 0, + "non_truncated": 235, + "padded": 940, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-econometrics|5": { + "hashes": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "f6ba8e358bdb523e", + "hash_cont_tokens": "721ae6c5302c4bf2" + }, + "truncated": 0, + "non_truncated": 114, + "padded": 456, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hashes": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "b2459da4c5ca8590", + "hash_cont_tokens": "15a738960ed3e587" + }, + "truncated": 0, + "non_truncated": 145, + "padded": 575, + "non_padded": 5, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hashes": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "0b969d9ad706a13a", + "hash_cont_tokens": "c96470462fc71683" + }, + "truncated": 0, + "non_truncated": 378, + "padded": 1512, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + 
"harness|hendrycksTest-formal_logic|5": { + "hashes": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "02bc3eb5f90da86e", + "hash_cont_tokens": "0e1ce025c9d6ee7e" + }, + "truncated": 0, + "non_truncated": 126, + "padded": 504, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-global_facts|5": { + "hashes": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "3d5106918bcbeb43", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_biology|5": { + "hashes": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "7b089392db2dabbd", + "hash_cont_tokens": "e34d57f7d3c4ca16" + }, + "truncated": 0, + "non_truncated": 310, + "padded": 1240, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hashes": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "ba90b2ffed1c067d", + "hash_cont_tokens": "e8482d44df4b3740" + }, + "truncated": 0, + "non_truncated": 203, + "padded": 812, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hashes": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "60eeec309ef0717f", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hashes": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "5e5e8bf3808e0ead", + "hash_cont_tokens": "d63e679a49418339" + }, + "truncated": 0, + "non_truncated": 165, + "padded": 656, + "non_padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_geography|5": { + "hashes": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "4da9b741d4e7ea78", + "hash_cont_tokens": "d78483e286d06f1a" + }, + "truncated": 0, + "non_truncated": 198, + "padded": 792, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hashes": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "acb4bc872ac86ed7", + "hash_cont_tokens": "691cdff71ff5fe57" + }, + "truncated": 0, + "non_truncated": 193, + "padded": 772, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hashes": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "840fc6403eb69ab0", + "hash_cont_tokens": "d5ad4c5bdca967ad" + }, + "truncated": 0, + "non_truncated": 390, + "padded": 1560, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hashes": { + "hash_examples": 
"1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "3629a7f2cd17faeb", + "hash_cont_tokens": "8f631ca5687dd0d4" + }, + "truncated": 0, + "non_truncated": 270, + "padded": 1080, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hashes": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "6846f684260e3997", + "hash_cont_tokens": "7321048a28451473" + }, + "truncated": 0, + "non_truncated": 238, + "padded": 952, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_physics|5": { + "hashes": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "85aee25d6bdad94a", + "hash_cont_tokens": "bb137581f269861c" + }, + "truncated": 0, + "non_truncated": 151, + "padded": 604, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hashes": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "290b66d6d666a35f", + "hash_cont_tokens": "b455cab2675bd863" + }, + "truncated": 0, + "non_truncated": 545, + "padded": 2180, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hashes": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "a77a7668b437bc82", + "hash_cont_tokens": "1b3196fec7e58037" + }, + "truncated": 0, + "non_truncated": 216, + "padded": 864, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hashes": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "63548c7fa9ba7a78", + "hash_cont_tokens": "a331dedc2aa01b3e" + }, + "truncated": 0, + "non_truncated": 204, + "padded": 816, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hashes": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "83c5da18bfa50812", + "hash_cont_tokens": "d0fbe030b8c8c2bf" + }, + "truncated": 0, + "non_truncated": 237, + "padded": 948, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_aging|5": { + "hashes": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "bebbd11f22006685", + "hash_cont_tokens": "1dd29c3755494850" + }, + "truncated": 0, + "non_truncated": 223, + "padded": 892, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_sexuality|5": { + "hashes": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "7b85ee9b8ee54f4f", + "hash_cont_tokens": "c85573f663c10691" + }, + "truncated": 0, + "non_truncated": 131, + "padded": 524, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-international_law|5": { + "hashes": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "7bfc55ab7065943e", + 
"hash_cont_tokens": "d263804ba918154f" + }, + "truncated": 0, + "non_truncated": 121, + "padded": 484, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-jurisprudence|5": { + "hashes": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "69573f1675e053c6", + "hash_cont_tokens": "581986691a84ece8" + }, + "truncated": 0, + "non_truncated": 108, + "padded": 432, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hashes": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "552324ef20094bdc", + "hash_cont_tokens": "55a858b28bbda458" + }, + "truncated": 0, + "non_truncated": 163, + "padded": 652, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-machine_learning|5": { + "hashes": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "96449357a7318905", + "hash_cont_tokens": "e99d3d3efd4ac7a3" + }, + "truncated": 0, + "non_truncated": 112, + "padded": 448, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-management|5": { + "hashes": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "3b849249168e3b88", + "hash_cont_tokens": "13d9dc56bca34726" + }, + "truncated": 0, + "non_truncated": 103, + "padded": 412, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-marketing|5": { + "hashes": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "af0e186f2756b70d", + "hash_cont_tokens": "2700ea26933916a2" + }, + "truncated": 0, + "non_truncated": 234, + "padded": 936, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-medical_genetics|5": { + "hashes": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "9f6a6de16509b6d9", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-miscellaneous|5": { + "hashes": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "9194406d589f7c10", + "hash_cont_tokens": "7bf4341c79587250" + }, + "truncated": 0, + "non_truncated": 783, + "padded": 3132, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_disputes|5": { + "hashes": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "769486efc74d9f8e", + "hash_cont_tokens": "38a48e9de6976f00" + }, + "truncated": 0, + "non_truncated": 346, + "padded": 1384, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hashes": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "a90fd4dd90959dad", + "hash_cont_tokens": "761c4dc187689d89" + }, + "truncated": 0, + "non_truncated": 895, + "padded": 3580, + "non_padded": 0, + "effective_few_shots": 5.0, + 
"num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-nutrition|5": { + "hashes": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "1a3b843e66efd29b", + "hash_cont_tokens": "65005bd7d6f6012a" + }, + "truncated": 0, + "non_truncated": 306, + "padded": 1224, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-philosophy|5": { + "hashes": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "09820001a3d00013", + "hash_cont_tokens": "0b47934fb6314dec" + }, + "truncated": 0, + "non_truncated": 311, + "padded": 1244, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-prehistory|5": { + "hashes": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "7c4ec364ce2768c7", + "hash_cont_tokens": "3f20acd855ee0a29" + }, + "truncated": 0, + "non_truncated": 324, + "padded": 1296, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_accounting|5": { + "hashes": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "ced0534574d0ae3f", + "hash_cont_tokens": "8f122ba881355d4b" + }, + "truncated": 0, + "non_truncated": 282, + "padded": 1128, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_law|5": { + "hashes": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "bcbdbbde22ec73e3", + "hash_cont_tokens": "90d5df417c4d3fd3" + }, + "truncated": 0, + "non_truncated": 1534, + "padded": 6136, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_medicine|5": { + "hashes": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "c54d753563114d45", + "hash_cont_tokens": "4a2d2988884f7f70" + }, + "truncated": 0, + "non_truncated": 272, + "padded": 1088, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_psychology|5": { + "hashes": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "b75dc55c0e32fa52", + "hash_cont_tokens": "e0a952cb8a9c81de" + }, + "truncated": 0, + "non_truncated": 612, + "padded": 2448, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-public_relations|5": { + "hashes": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "5ccdc8ec8db99622", + "hash_cont_tokens": "1fa77a8dff3922b8" + }, + "truncated": 0, + "non_truncated": 110, + "padded": 440, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-security_studies|5": { + "hashes": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "ca8497342e5b1d57", + "hash_cont_tokens": "81fc9cb3cbdd52db" + }, + "truncated": 0, + "non_truncated": 245, + "padded": 980, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-sociology|5": { + "hashes": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": 
"1a1fc00e17b3a52a", + "hash_input_tokens": "069c76424fbd3dab", + "hash_cont_tokens": "2a0493252ed2cf43" + }, + "truncated": 0, + "non_truncated": 201, + "padded": 804, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hashes": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "a7e393a626169576", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-virology|5": { + "hashes": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "bf99dc973e3a650d", + "hash_cont_tokens": "5ab892d003b00c98" + }, + "truncated": 0, + "non_truncated": 166, + "padded": 664, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-world_religions|5": { + "hashes": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "1761cfaf21797065", + "hash_cont_tokens": "15a5e5dbdfbb8568" + }, + "truncated": 0, + "non_truncated": 171, + "padded": 684, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|truthfulqa:mc|0": { + "hashes": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "298b43914bbdf4ca", + "hash_cont_tokens": "5a8d4bb398b1c3c0" + }, + "truncated": 0, + "non_truncated": 817, + "padded": 9996, + "non_padded": 0, + "effective_few_shots": 0.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "31aa3477d959f771", + "hash_cont_tokens": "618558fb93c0f288" + }, + "truncated": 0, + "non_truncated": 1267, + "padded": 2534, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "6af0ae8cfe684f50", + "hash_cont_tokens": "183f2e9171cd0ac9" + }, + "truncated": 0, + "non_truncated": 1319, + "padded": 0, + "non_padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "3b7fa57a057f9415", + "hash_full_prompts": "63615fc50fc9417c", + "hash_input_tokens": "9c04e828ae29cacc", + "hash_cont_tokens": "89eeac8f2298387f" + }, + "truncated": 0, + "non_truncated": 28659, + "padded": 113460, + "non_padded": 1412, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/migtissera/Tess-M-Creative-v1.0/results_2023-12-05T03-45-38.672992.json b/eval-results/migtissera/Tess-M-Creative-v1.0/results_2023-12-05T03-45-38.672992.json new file mode 100644 index 0000000000000000000000000000000000000000..a6c3223ca93b156b5a7d4d36633eda8ec99789a5 --- /dev/null +++ b/eval-results/migtissera/Tess-M-Creative-v1.0/results_2023-12-05T03-45-38.672992.json @@ -0,0 +1,1409 @@ +{ + "config_general": { + "lighteval_sha": "0e4607eff593f6f842aeaa0e5fa6760f58b9d1e9", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "", + "start_time": 171688.743192176, + "end_time": 202807.446346256, + "total_evaluation_time_secondes": 
"31118.703154079994", + "model_name": "migtissera/Tess-M-Creative-v1.0", + "model_sha": "26923a2648b9864e2ec6f0cc66b8b6fcfbbdd491", + "model_dtype": "torch.float16", + "model_size": "69.78 GB" + }, + "results": { + "harness|arc:challenge|25": { + "acc": 0.6331058020477816, + "acc_stderr": 0.014084133118104296, + "acc_norm": 0.6680887372013652, + "acc_norm_stderr": 0.01376098820088053 + }, + "harness|hellaswag|10": { + "acc": 0.6496713802031467, + "acc_stderr": 0.004760978203023324, + "acc_norm": 0.8514240191196972, + "acc_norm_stderr": 0.003549431247907371 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.38, + "acc_stderr": 0.048783173121456316, + "acc_norm": 0.38, + "acc_norm_stderr": 0.048783173121456316 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.6888888888888889, + "acc_stderr": 0.039992628766177214, + "acc_norm": 0.6888888888888889, + "acc_norm_stderr": 0.039992628766177214 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.875, + "acc_stderr": 0.026913523521537846, + "acc_norm": 0.875, + "acc_norm_stderr": 0.026913523521537846 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.76, + "acc_stderr": 0.04292346959909283, + "acc_norm": 0.76, + "acc_norm_stderr": 0.04292346959909283 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.8113207547169812, + "acc_stderr": 0.024079995130062246, + "acc_norm": 0.8113207547169812, + "acc_norm_stderr": 0.024079995130062246 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.8888888888888888, + "acc_stderr": 0.026280550932848062, + "acc_norm": 0.8888888888888888, + "acc_norm_stderr": 0.026280550932848062 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.51, + "acc_stderr": 0.05024183937956912, + "acc_norm": 0.51, + "acc_norm_stderr": 0.05024183937956912 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.6, + "acc_stderr": 0.04923659639173309, + "acc_norm": 0.6, + "acc_norm_stderr": 0.04923659639173309 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.39, + "acc_stderr": 0.04902071300001974, + "acc_norm": 0.39, + "acc_norm_stderr": 0.04902071300001974 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.7167630057803468, + "acc_stderr": 0.034355680560478746, + "acc_norm": 0.7167630057803468, + "acc_norm_stderr": 0.034355680560478746 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.5490196078431373, + "acc_stderr": 0.049512182523962604, + "acc_norm": 0.5490196078431373, + "acc_norm_stderr": 0.049512182523962604 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.83, + "acc_stderr": 0.03775251680686371, + "acc_norm": 0.83, + "acc_norm_stderr": 0.03775251680686371 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.774468085106383, + "acc_stderr": 0.027321078417387533, + "acc_norm": 0.774468085106383, + "acc_norm_stderr": 0.027321078417387533 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.5789473684210527, + "acc_stderr": 0.046446020912223177, + "acc_norm": 0.5789473684210527, + "acc_norm_stderr": 0.046446020912223177 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.7103448275862069, + "acc_stderr": 0.03780019230438015, + "acc_norm": 0.7103448275862069, + "acc_norm_stderr": 0.03780019230438015 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.6931216931216931, + "acc_stderr": 0.02375292871211214, + "acc_norm": 0.6931216931216931, + "acc_norm_stderr": 0.02375292871211214 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 
0.5317460317460317, + "acc_stderr": 0.04463112720677173, + "acc_norm": 0.5317460317460317, + "acc_norm_stderr": 0.04463112720677173 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.58, + "acc_stderr": 0.049604496374885836, + "acc_norm": 0.58, + "acc_norm_stderr": 0.049604496374885836 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.9, + "acc_stderr": 0.017066403719657255, + "acc_norm": 0.9, + "acc_norm_stderr": 0.017066403719657255 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.6847290640394089, + "acc_stderr": 0.03269080871970186, + "acc_norm": 0.6847290640394089, + "acc_norm_stderr": 0.03269080871970186 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.79, + "acc_stderr": 0.040936018074033256, + "acc_norm": 0.79, + "acc_norm_stderr": 0.040936018074033256 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.8484848484848485, + "acc_stderr": 0.027998073798781668, + "acc_norm": 0.8484848484848485, + "acc_norm_stderr": 0.027998073798781668 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.9292929292929293, + "acc_stderr": 0.01826310542019949, + "acc_norm": 0.9292929292929293, + "acc_norm_stderr": 0.01826310542019949 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.9740932642487047, + "acc_stderr": 0.01146452335695318, + "acc_norm": 0.9740932642487047, + "acc_norm_stderr": 0.01146452335695318 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.823076923076923, + "acc_stderr": 0.019348070174396985, + "acc_norm": 0.823076923076923, + "acc_norm_stderr": 0.019348070174396985 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.3888888888888889, + "acc_stderr": 0.029723278961476668, + "acc_norm": 0.3888888888888889, + "acc_norm_stderr": 0.029723278961476668 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.8487394957983193, + "acc_stderr": 0.023274255898707946, + "acc_norm": 0.8487394957983193, + "acc_norm_stderr": 0.023274255898707946 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.48344370860927155, + "acc_stderr": 0.0408024418562897, + "acc_norm": 0.48344370860927155, + "acc_norm_stderr": 0.0408024418562897 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.9284403669724771, + "acc_stderr": 0.011051255247815453, + "acc_norm": 0.9284403669724771, + "acc_norm_stderr": 0.011051255247815453 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.6435185185185185, + "acc_stderr": 0.032664783315272714, + "acc_norm": 0.6435185185185185, + "acc_norm_stderr": 0.032664783315272714 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.9117647058823529, + "acc_stderr": 0.01990739979131695, + "acc_norm": 0.9117647058823529, + "acc_norm_stderr": 0.01990739979131695 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.9156118143459916, + "acc_stderr": 0.01809424711647332, + "acc_norm": 0.9156118143459916, + "acc_norm_stderr": 0.01809424711647332 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.8116591928251121, + "acc_stderr": 0.026241132996407252, + "acc_norm": 0.8116591928251121, + "acc_norm_stderr": 0.026241132996407252 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.8702290076335878, + "acc_stderr": 0.029473649496907065, + "acc_norm": 0.8702290076335878, + "acc_norm_stderr": 0.029473649496907065 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.9008264462809917, + 
"acc_stderr": 0.02728524631275896, + "acc_norm": 0.9008264462809917, + "acc_norm_stderr": 0.02728524631275896 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.8796296296296297, + "acc_stderr": 0.0314570385430625, + "acc_norm": 0.8796296296296297, + "acc_norm_stderr": 0.0314570385430625 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.8588957055214724, + "acc_stderr": 0.027351605518389752, + "acc_norm": 0.8588957055214724, + "acc_norm_stderr": 0.027351605518389752 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.5803571428571429, + "acc_stderr": 0.04684099321077106, + "acc_norm": 0.5803571428571429, + "acc_norm_stderr": 0.04684099321077106 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.8543689320388349, + "acc_stderr": 0.0349260647662379, + "acc_norm": 0.8543689320388349, + "acc_norm_stderr": 0.0349260647662379 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.9273504273504274, + "acc_stderr": 0.01700436856813234, + "acc_norm": 0.9273504273504274, + "acc_norm_stderr": 0.01700436856813234 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.84, + "acc_stderr": 0.03684529491774709, + "acc_norm": 0.84, + "acc_norm_stderr": 0.03684529491774709 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.9067688378033205, + "acc_stderr": 0.010397417087292849, + "acc_norm": 0.9067688378033205, + "acc_norm_stderr": 0.010397417087292849 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.8092485549132948, + "acc_stderr": 0.021152676966575284, + "acc_norm": 0.8092485549132948, + "acc_norm_stderr": 0.021152676966575284 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.6972067039106146, + "acc_stderr": 0.015366860386397112, + "acc_norm": 0.6972067039106146, + "acc_norm_stderr": 0.015366860386397112 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.8366013071895425, + "acc_stderr": 0.021170623011213516, + "acc_norm": 0.8366013071895425, + "acc_norm_stderr": 0.021170623011213516 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.8006430868167203, + "acc_stderr": 0.022691033780549656, + "acc_norm": 0.8006430868167203, + "acc_norm_stderr": 0.022691033780549656 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.8580246913580247, + "acc_stderr": 0.019420260109438293, + "acc_norm": 0.8580246913580247, + "acc_norm_stderr": 0.019420260109438293 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.6205673758865248, + "acc_stderr": 0.02894733885161409, + "acc_norm": 0.6205673758865248, + "acc_norm_stderr": 0.02894733885161409 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.5951760104302477, + "acc_stderr": 0.012536743830953979, + "acc_norm": 0.5951760104302477, + "acc_norm_stderr": 0.012536743830953979 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.8308823529411765, + "acc_stderr": 0.022770868010113004, + "acc_norm": 0.8308823529411765, + "acc_norm_stderr": 0.022770868010113004 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.815359477124183, + "acc_stderr": 0.01569702924075778, + "acc_norm": 0.815359477124183, + "acc_norm_stderr": 0.01569702924075778 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.7181818181818181, + "acc_stderr": 0.04309118709946458, + "acc_norm": 0.7181818181818181, + "acc_norm_stderr": 0.04309118709946458 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.8408163265306122, + "acc_stderr": 0.02342097206916635, + "acc_norm": 0.8408163265306122, + "acc_norm_stderr": 
0.02342097206916635 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.8905472636815921, + "acc_stderr": 0.022076326101824657, + "acc_norm": 0.8905472636815921, + "acc_norm_stderr": 0.022076326101824657 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.92, + "acc_stderr": 0.0272659924344291, + "acc_norm": 0.92, + "acc_norm_stderr": 0.0272659924344291 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.5843373493975904, + "acc_stderr": 0.03836722176598053, + "acc_norm": 0.5843373493975904, + "acc_norm_stderr": 0.03836722176598053 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.8713450292397661, + "acc_stderr": 0.025679342723276908, + "acc_norm": 0.8713450292397661, + "acc_norm_stderr": 0.025679342723276908 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.41982864137086906, + "mc1_stderr": 0.01727703030177577, + "mc2": 0.5768450076180885, + "mc2_stderr": 0.014925146586405758 + }, + "harness|winogrande|5": { + "acc": 0.8310970797158642, + "acc_stderr": 0.01052998141183891 + }, + "harness|gsm8k|5": { + "acc": 0.6209249431387415, + "acc_stderr": 0.013363630295088356 + }, + "all": { + "acc": 0.7506953369656723, + "acc_stderr": 0.028559826064592703, + "acc_norm": 0.755544561120704, + "acc_norm_stderr": 0.029096967565438774, + "mc1": 0.41982864137086906, + "mc1_stderr": 0.01727703030177577, + "mc2": 0.5768450076180885, + "mc2_stderr": 0.014925146586405758 + } + }, + "versions": { + "all": 0, + "harness|arc:challenge|25": 0, + "harness|gsm8k|5": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + 
"harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "harness|winogrande|5": 0 + }, + "config_tasks": { + "harness|arc:challenge": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness 
task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|arc:challenge|25": { + "hashes": { + "hash_examples": "17b0cae357c0259e", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "c84bbabff7655573", + "hash_cont_tokens": "e23c779c4c2dd1ec" + }, + "truncated": 0, + "non_truncated": 1172, + "padded": 4682, + "non_padded": 5, + "effective_few_shots": 25.0, + "num_truncated_few_shots": 0 + }, + "harness|hellaswag|10": { + "hashes": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "52e70aa3670e3695", + "hash_cont_tokens": "55da5ba61989a8fe" + }, + "truncated": 0, + "non_truncated": 10042, + "padded": 40097, + "non_padded": 71, + "effective_few_shots": 10.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hashes": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "085f405a873c9f87", + "hash_cont_tokens": "bcc22fd85dcc85e9" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-anatomy|5": { + "hashes": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "3b492ddc5de3f57a", + "hash_cont_tokens": "5cc800feae9fa1ad" + }, + "truncated": 0, + "non_truncated": 135, + "padded": 540, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-astronomy|5": { + "hashes": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "aa55e6645b3f3526", + "hash_cont_tokens": "655dbb90034f484a" + }, + "truncated": 0, + "non_truncated": 152, + "padded": 608, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-business_ethics|5": { + "hashes": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "5f80d5327a047022", + "hash_cont_tokens": "bcc22fd85dcc85e9" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + 
"hashes": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "c0a3ae71b5506278", + "hash_cont_tokens": "f77b74d946d7fc02" + }, + "truncated": 0, + "non_truncated": 265, + "padded": 1060, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_biology|5": { + "hashes": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "6fcc5fb2ad3a62b5", + "hash_cont_tokens": "1ba4b1a158d8bf3f" + }, + "truncated": 0, + "non_truncated": 144, + "padded": 576, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_chemistry|5": { + "hashes": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "b3c5950ef0ab5b9f", + "hash_cont_tokens": "bcc22fd85dcc85e9" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_computer_science|5": { + "hashes": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "d4b18e1debc64387", + "hash_cont_tokens": "bcc22fd85dcc85e9" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_mathematics|5": { + "hashes": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "78289261a74f39aa", + "hash_cont_tokens": "bcc22fd85dcc85e9" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_medicine|5": { + "hashes": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "5449a8e432780f7f", + "hash_cont_tokens": "78a0ebf66d91c5cf" + }, + "truncated": 0, + "non_truncated": 173, + "padded": 692, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_physics|5": { + "hashes": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "b55be981de130fed", + "hash_cont_tokens": "5a030c95824fdbe5" + }, + "truncated": 0, + "non_truncated": 102, + "padded": 408, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-computer_security|5": { + "hashes": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "b39d36783fd07415", + "hash_cont_tokens": "bcc22fd85dcc85e9" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hashes": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "90db261ac05081a8", + "hash_cont_tokens": "2326dc60d0bc41b6" + }, + "truncated": 0, + "non_truncated": 235, + "padded": 940, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-econometrics|5": { + "hashes": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "3b6ab5e66082a68d", + 
"hash_cont_tokens": "be908364b6f14dd6" + }, + "truncated": 0, + "non_truncated": 114, + "padded": 456, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hashes": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "a8e0453f990ff5aa", + "hash_cont_tokens": "179280ef597fe1bf" + }, + "truncated": 0, + "non_truncated": 145, + "padded": 564, + "non_padded": 16, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hashes": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "9e30d3a741143c4a", + "hash_cont_tokens": "95cdcdaf1abd0bd2" + }, + "truncated": 0, + "non_truncated": 378, + "padded": 1512, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-formal_logic|5": { + "hashes": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "06838690ab0d64b9", + "hash_cont_tokens": "6a4818f3c307c346" + }, + "truncated": 0, + "non_truncated": 126, + "padded": 504, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-global_facts|5": { + "hashes": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "50dc8670e216ba78", + "hash_cont_tokens": "bcc22fd85dcc85e9" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_biology|5": { + "hashes": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "0097a3c431b4fc51", + "hash_cont_tokens": "36d0d84455f0bdba" + }, + "truncated": 0, + "non_truncated": 310, + "padded": 1240, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hashes": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "75f3de0dad7830bc", + "hash_cont_tokens": "c678f794a9b8ee74" + }, + "truncated": 0, + "non_truncated": 203, + "padded": 812, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hashes": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "bc373cd584fa942b", + "hash_cont_tokens": "bcc22fd85dcc85e9" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hashes": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "507c0abd3d17fd8f", + "hash_cont_tokens": "e9c94304326d875c" + }, + "truncated": 0, + "non_truncated": 165, + "padded": 656, + "non_padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_geography|5": { + "hashes": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "a8ab4dfafa4f65b4", + "hash_cont_tokens": "f937a1349eb483eb" + }, + "truncated": 0, + "non_truncated": 198, + "padded": 792, + 
"non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hashes": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "e33171fd6e0b4a9c", + "hash_cont_tokens": "8b27dd3907d25b4e" + }, + "truncated": 0, + "non_truncated": 193, + "padded": 772, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hashes": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "f3319223cf191987", + "hash_cont_tokens": "3763cae29e2f938c" + }, + "truncated": 0, + "non_truncated": 390, + "padded": 1560, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hashes": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "2f08fbb89a3a31b0", + "hash_cont_tokens": "fd7b555352d765a4" + }, + "truncated": 0, + "non_truncated": 270, + "padded": 1080, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hashes": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "d2ff2b6e81f3e039", + "hash_cont_tokens": "61f46d4a209b9aa2" + }, + "truncated": 0, + "non_truncated": 238, + "padded": 952, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_physics|5": { + "hashes": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "dd50a9b81a6e14a2", + "hash_cont_tokens": "4e7053e7c19d680d" + }, + "truncated": 0, + "non_truncated": 151, + "padded": 604, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hashes": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "d5f514e075b8a310", + "hash_cont_tokens": "84d19ae8790476bb" + }, + "truncated": 0, + "non_truncated": 545, + "padded": 2180, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hashes": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "3faf848f9d19cb14", + "hash_cont_tokens": "b119c7b668213a4e" + }, + "truncated": 0, + "non_truncated": 216, + "padded": 864, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hashes": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "dafa7c29ee53148d", + "hash_cont_tokens": "a3b126bc622d571f" + }, + "truncated": 0, + "non_truncated": 204, + "padded": 816, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hashes": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "f3f7c0cb054a9101", + "hash_cont_tokens": "9abf19ceb76331ff" + }, + "truncated": 0, + "non_truncated": 237, + "padded": 948, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, 
+ "harness|hendrycksTest-human_aging|5": { + "hashes": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "ee334f2be12733c8", + "hash_cont_tokens": "0e2e725ae9a898da" + }, + "truncated": 0, + "non_truncated": 223, + "padded": 892, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_sexuality|5": { + "hashes": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "a9997011eacb1c14", + "hash_cont_tokens": "a94c1dea6d775249" + }, + "truncated": 0, + "non_truncated": 131, + "padded": 524, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-international_law|5": { + "hashes": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "5e065bb834e5eb5f", + "hash_cont_tokens": "3832f860859bb86b" + }, + "truncated": 0, + "non_truncated": 121, + "padded": 484, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-jurisprudence|5": { + "hashes": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "6694a4e4327a0eee", + "hash_cont_tokens": "9fac5a0c364fca8a" + }, + "truncated": 0, + "non_truncated": 108, + "padded": 432, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hashes": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "630193f0a85c4db4", + "hash_cont_tokens": "dc53ed31134ddf3a" + }, + "truncated": 0, + "non_truncated": 163, + "padded": 644, + "non_padded": 8, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-machine_learning|5": { + "hashes": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "481eec60fca7d379", + "hash_cont_tokens": "e272b5456d5552d6" + }, + "truncated": 0, + "non_truncated": 112, + "padded": 448, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-management|5": { + "hashes": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "5e29b566e42d5c49", + "hash_cont_tokens": "7119d4642957b1f0" + }, + "truncated": 0, + "non_truncated": 103, + "padded": 412, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-marketing|5": { + "hashes": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "abc950328f30685d", + "hash_cont_tokens": "099d58c66ece3f11" + }, + "truncated": 0, + "non_truncated": 234, + "padded": 936, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-medical_genetics|5": { + "hashes": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "7b7f0526063c20bd", + "hash_cont_tokens": "bcc22fd85dcc85e9" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-miscellaneous|5": { + "hashes": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": 
"2f35d509e71e13d9", + "hash_cont_tokens": "bae342d4e82ba8f7" + }, + "truncated": 0, + "non_truncated": 783, + "padded": 3132, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_disputes|5": { + "hashes": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "a1fe66c367aec9a4", + "hash_cont_tokens": "578c64cbdbb1e0d4" + }, + "truncated": 0, + "non_truncated": 346, + "padded": 1384, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hashes": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "477794fff20bb51b", + "hash_cont_tokens": "79b25f42b3fce0f9" + }, + "truncated": 0, + "non_truncated": 895, + "padded": 3580, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-nutrition|5": { + "hashes": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "f0035147162e2914", + "hash_cont_tokens": "9d1f3b976417156c" + }, + "truncated": 0, + "non_truncated": 306, + "padded": 1224, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-philosophy|5": { + "hashes": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "afde0a4bb78262a8", + "hash_cont_tokens": "88dab560e1e06d97" + }, + "truncated": 0, + "non_truncated": 311, + "padded": 1244, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-prehistory|5": { + "hashes": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "80cbaf9c72217b9b", + "hash_cont_tokens": "04ea847139fe9393" + }, + "truncated": 0, + "non_truncated": 324, + "padded": 1296, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_accounting|5": { + "hashes": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "34fa03402fe143e2", + "hash_cont_tokens": "0435ff692ad17e68" + }, + "truncated": 0, + "non_truncated": 282, + "padded": 1124, + "non_padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_law|5": { + "hashes": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "970559d2709d7dfb", + "hash_cont_tokens": "b852c74e9f8801bd" + }, + "truncated": 0, + "non_truncated": 1534, + "padded": 6136, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_medicine|5": { + "hashes": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "e6bad9d3d227482c", + "hash_cont_tokens": "5db0f6460652d063" + }, + "truncated": 0, + "non_truncated": 272, + "padded": 1088, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_psychology|5": { + "hashes": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "5915ac075f743cd6", + "hash_cont_tokens": "c960676ef7f3dbe5" + }, + "truncated": 0, + "non_truncated": 612, + "padded": 2448, + "non_padded": 0, + 
"effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-public_relations|5": { + "hashes": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "abdaa0333725e504", + "hash_cont_tokens": "3320565f412c4b01" + }, + "truncated": 0, + "non_truncated": 110, + "padded": 440, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-security_studies|5": { + "hashes": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "5e5e21ce02813577", + "hash_cont_tokens": "218ed775ef60aab9" + }, + "truncated": 0, + "non_truncated": 245, + "padded": 980, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-sociology|5": { + "hashes": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "74f6e50f8da04eb6", + "hash_cont_tokens": "20babf5cc4cc7f3d" + }, + "truncated": 0, + "non_truncated": 201, + "padded": 804, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hashes": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "4234573f54827f4f", + "hash_cont_tokens": "bcc22fd85dcc85e9" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-virology|5": { + "hashes": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "d8f9c3d810f8d6f2", + "hash_cont_tokens": "dc6d57296bea0882" + }, + "truncated": 0, + "non_truncated": 166, + "padded": 664, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-world_religions|5": { + "hashes": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "a96ae58b7a2f1010", + "hash_cont_tokens": "37f53444db289ed3" + }, + "truncated": 0, + "non_truncated": 171, + "padded": 684, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|truthfulqa:mc|0": { + "hashes": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "4214b9bf45e97067", + "hash_cont_tokens": "71a67034827cd30e" + }, + "truncated": 0, + "non_truncated": 817, + "padded": 9996, + "non_padded": 0, + "effective_few_shots": 0.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "a7eeaad96f70499b", + "hash_cont_tokens": "c93e9c22fa3077a0" + }, + "truncated": 0, + "non_truncated": 1267, + "padded": 2534, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "d488b9ef001d40f5", + "hash_cont_tokens": "7c53c2445afbab6b" + }, + "truncated": 0, + "non_truncated": 1319, + "padded": 0, + "non_padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "3b7fa57a057f9415", + "hash_full_prompts": "63615fc50fc9417c", + "hash_input_tokens": "f525e4fba6bb528d", + 
"hash_cont_tokens": "9e4395c5cfc76726" + }, + "truncated": 0, + "non_truncated": 28659, + "padded": 113445, + "non_padded": 1427, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/migtissera/Tess-M-v1.1/results_2023-11-27T10-17-48.853373.json b/eval-results/migtissera/Tess-M-v1.1/results_2023-11-27T10-17-48.853373.json new file mode 100644 index 0000000000000000000000000000000000000000..c033c7aa83b4419636a2ac20318b8ee516236dd5 --- /dev/null +++ b/eval-results/migtissera/Tess-M-v1.1/results_2023-11-27T10-17-48.853373.json @@ -0,0 +1,1435 @@ +{ + "config_general": { + "lighteval_sha": "9ffc410f6c40b8cfefe7167cb47aefe69ced61e1", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "", + "start_time": 775466.87349868, + "end_time": 858169.892432161, + "total_evaluation_time_secondes": "82703.01893348107", + "model_name": "migtissera/Tess-M-v1.1", + "model_sha": "e5a016b08aa507fe9db45436074016928bf6f939", + "model_dtype": "torch.float16", + "model_size": "69.78 GB" + }, + "results": { + "harness|arc:challenge|25": { + "acc": 0.643344709897611, + "acc_stderr": 0.013998056902620194, + "acc_norm": 0.6715017064846417, + "acc_norm_stderr": 0.013724978465537295 + }, + "harness|hellaswag|10": { + "acc": 0.6500697072296355, + "acc_stderr": 0.004759729267943188, + "acc_norm": 0.8476399123680541, + "acc_norm_stderr": 0.0035863512488635914 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.39, + "acc_stderr": 0.04902071300001975, + "acc_norm": 0.39, + "acc_norm_stderr": 0.04902071300001975 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.674074074074074, + "acc_stderr": 0.040491220417025055, + "acc_norm": 0.674074074074074, + "acc_norm_stderr": 0.040491220417025055 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.875, + "acc_stderr": 0.026913523521537846, + "acc_norm": 0.875, + "acc_norm_stderr": 0.026913523521537846 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.77, + "acc_stderr": 0.04229525846816505, + "acc_norm": 0.77, + "acc_norm_stderr": 0.04229525846816505 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.8075471698113208, + "acc_stderr": 0.024262979839372274, + "acc_norm": 0.8075471698113208, + "acc_norm_stderr": 0.024262979839372274 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.8888888888888888, + "acc_stderr": 0.02628055093284806, + "acc_norm": 0.8888888888888888, + "acc_norm_stderr": 0.02628055093284806 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.53, + "acc_stderr": 0.05016135580465919, + "acc_norm": 0.53, + "acc_norm_stderr": 0.05016135580465919 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.62, + "acc_stderr": 0.04878317312145632, + "acc_norm": 0.62, + "acc_norm_stderr": 0.04878317312145632 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.4, + "acc_stderr": 0.049236596391733084, + "acc_norm": 0.4, + "acc_norm_stderr": 0.049236596391733084 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.6936416184971098, + "acc_stderr": 0.035149425512674394, + "acc_norm": 0.6936416184971098, + "acc_norm_stderr": 0.035149425512674394 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.5294117647058824, + "acc_stderr": 0.049665709039785295, + "acc_norm": 0.5294117647058824, + "acc_norm_stderr": 0.049665709039785295 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.82, + "acc_stderr": 0.03861229196653694, + "acc_norm": 0.82, + 
"acc_norm_stderr": 0.03861229196653694 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.7404255319148936, + "acc_stderr": 0.028659179374292326, + "acc_norm": 0.7404255319148936, + "acc_norm_stderr": 0.028659179374292326 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.5964912280701754, + "acc_stderr": 0.04615186962583707, + "acc_norm": 0.5964912280701754, + "acc_norm_stderr": 0.04615186962583707 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.7586206896551724, + "acc_stderr": 0.03565998174135302, + "acc_norm": 0.7586206896551724, + "acc_norm_stderr": 0.03565998174135302 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.6243386243386243, + "acc_stderr": 0.024942368931159777, + "acc_norm": 0.6243386243386243, + "acc_norm_stderr": 0.024942368931159777 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.5158730158730159, + "acc_stderr": 0.044698818540726076, + "acc_norm": 0.5158730158730159, + "acc_norm_stderr": 0.044698818540726076 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.53, + "acc_stderr": 0.050161355804659205, + "acc_norm": 0.53, + "acc_norm_stderr": 0.050161355804659205 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.8870967741935484, + "acc_stderr": 0.01800360332586362, + "acc_norm": 0.8870967741935484, + "acc_norm_stderr": 0.01800360332586362 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.6748768472906403, + "acc_stderr": 0.032957975663112704, + "acc_norm": 0.6748768472906403, + "acc_norm_stderr": 0.032957975663112704 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.78, + "acc_stderr": 0.04163331998932261, + "acc_norm": 0.78, + "acc_norm_stderr": 0.04163331998932261 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.8424242424242424, + "acc_stderr": 0.028450388805284343, + "acc_norm": 0.8424242424242424, + "acc_norm_stderr": 0.028450388805284343 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.9242424242424242, + "acc_stderr": 0.018852670234993107, + "acc_norm": 0.9242424242424242, + "acc_norm_stderr": 0.018852670234993107 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.9792746113989638, + "acc_stderr": 0.010281417011909039, + "acc_norm": 0.9792746113989638, + "acc_norm_stderr": 0.010281417011909039 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.7769230769230769, + "acc_stderr": 0.02110773012724401, + "acc_norm": 0.7769230769230769, + "acc_norm_stderr": 0.02110773012724401 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.362962962962963, + "acc_stderr": 0.02931820364520686, + "acc_norm": 0.362962962962963, + "acc_norm_stderr": 0.02931820364520686 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.8277310924369747, + "acc_stderr": 0.024528664971305424, + "acc_norm": 0.8277310924369747, + "acc_norm_stderr": 0.024528664971305424 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.46357615894039733, + "acc_stderr": 0.04071636065944215, + "acc_norm": 0.46357615894039733, + "acc_norm_stderr": 0.04071636065944215 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.9064220183486239, + "acc_stderr": 0.012486841824601963, + "acc_norm": 0.9064220183486239, + "acc_norm_stderr": 0.012486841824601963 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.6388888888888888, + "acc_stderr": 0.032757734861009996, + "acc_norm": 0.6388888888888888, 
+ "acc_norm_stderr": 0.032757734861009996 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.9068627450980392, + "acc_stderr": 0.020397853969426998, + "acc_norm": 0.9068627450980392, + "acc_norm_stderr": 0.020397853969426998 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.8818565400843882, + "acc_stderr": 0.021011052659878446, + "acc_norm": 0.8818565400843882, + "acc_norm_stderr": 0.021011052659878446 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.7847533632286996, + "acc_stderr": 0.027584066602208274, + "acc_norm": 0.7847533632286996, + "acc_norm_stderr": 0.027584066602208274 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.8244274809160306, + "acc_stderr": 0.03336820338476073, + "acc_norm": 0.8244274809160306, + "acc_norm_stderr": 0.03336820338476073 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.9173553719008265, + "acc_stderr": 0.025135382356604227, + "acc_norm": 0.9173553719008265, + "acc_norm_stderr": 0.025135382356604227 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.8518518518518519, + "acc_stderr": 0.03434300243631, + "acc_norm": 0.8518518518518519, + "acc_norm_stderr": 0.03434300243631 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.852760736196319, + "acc_stderr": 0.027839915278339653, + "acc_norm": 0.852760736196319, + "acc_norm_stderr": 0.027839915278339653 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.5892857142857143, + "acc_stderr": 0.04669510663875191, + "acc_norm": 0.5892857142857143, + "acc_norm_stderr": 0.04669510663875191 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.8737864077669902, + "acc_stderr": 0.03288180278808628, + "acc_norm": 0.8737864077669902, + "acc_norm_stderr": 0.03288180278808628 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.9188034188034188, + "acc_stderr": 0.017893784904018543, + "acc_norm": 0.9188034188034188, + "acc_norm_stderr": 0.017893784904018543 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.81, + "acc_stderr": 0.039427724440366234, + "acc_norm": 0.81, + "acc_norm_stderr": 0.039427724440366234 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.896551724137931, + "acc_stderr": 0.010890452544691504, + "acc_norm": 0.896551724137931, + "acc_norm_stderr": 0.010890452544691504 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.8063583815028902, + "acc_stderr": 0.021274230317515564, + "acc_norm": 0.8063583815028902, + "acc_norm_stderr": 0.021274230317515564 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.7385474860335196, + "acc_stderr": 0.014696599650364552, + "acc_norm": 0.7385474860335196, + "acc_norm_stderr": 0.014696599650364552 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.8333333333333334, + "acc_stderr": 0.021339479988816027, + "acc_norm": 0.8333333333333334, + "acc_norm_stderr": 0.021339479988816027 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.8006430868167203, + "acc_stderr": 0.022691033780549656, + "acc_norm": 0.8006430868167203, + "acc_norm_stderr": 0.022691033780549656 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.845679012345679, + "acc_stderr": 0.020100830999851, + "acc_norm": 0.845679012345679, + "acc_norm_stderr": 0.020100830999851 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.6063829787234043, + "acc_stderr": 0.029144544781596157, + "acc_norm": 0.6063829787234043, + "acc_norm_stderr": 0.029144544781596157 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 
0.5645371577574967, + "acc_stderr": 0.012663412101248344, + "acc_norm": 0.5645371577574967, + "acc_norm_stderr": 0.012663412101248344 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.7941176470588235, + "acc_stderr": 0.02456220431414231, + "acc_norm": 0.7941176470588235, + "acc_norm_stderr": 0.02456220431414231 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.8055555555555556, + "acc_stderr": 0.01601123799633693, + "acc_norm": 0.8055555555555556, + "acc_norm_stderr": 0.01601123799633693 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.7090909090909091, + "acc_stderr": 0.04350271442923243, + "acc_norm": 0.7090909090909091, + "acc_norm_stderr": 0.04350271442923243 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.8163265306122449, + "acc_stderr": 0.02478907133200765, + "acc_norm": 0.8163265306122449, + "acc_norm_stderr": 0.02478907133200765 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.8855721393034826, + "acc_stderr": 0.022509345325101706, + "acc_norm": 0.8855721393034826, + "acc_norm_stderr": 0.022509345325101706 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.9, + "acc_stderr": 0.030151134457776334, + "acc_norm": 0.9, + "acc_norm_stderr": 0.030151134457776334 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.572289156626506, + "acc_stderr": 0.03851597683718533, + "acc_norm": 0.572289156626506, + "acc_norm_stderr": 0.03851597683718533 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.847953216374269, + "acc_stderr": 0.027539122889061452, + "acc_norm": 0.847953216374269, + "acc_norm_stderr": 0.027539122889061452 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.397796817625459, + "mc1_stderr": 0.017133934248559635, + "mc2": 0.5479512734029787, + "mc2_stderr": 0.015226252471194385 + }, + "harness|winogrande|5": { + "acc": 0.8287292817679558, + "acc_stderr": 0.010588417294962524 + }, + "harness|drop|3": { + "em": 0.3674496644295302, + "em_stderr": 0.004937261432204727, + "f1": 0.4201719798657712, + "f1_stderr": 0.004750991171404277 + }, + "harness|gsm8k|5": { + "acc": 0.43896891584533737, + "acc_stderr": 0.013669500369036216 + }, + "all": { + "acc": 0.7381069879348441, + "acc_stderr": 0.02905272580642503, + "acc_norm": 0.745467038443011, + "acc_norm_stderr": 0.02960189661138585, + "mc1": 0.397796817625459, + "mc1_stderr": 0.017133934248559635, + "mc2": 0.5479512734029787, + "mc2_stderr": 0.015226252471194385, + "em": 0.3674496644295302, + "em_stderr": 0.004937261432204727, + "f1": 0.4201719798657712, + "f1_stderr": 0.004750991171404277 + } + }, + "versions": { + "all": 0, + "harness|arc:challenge|25": 0, + "harness|drop|3": 1, + "harness|gsm8k|5": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + 
"harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "harness|winogrande|5": 0 + }, + "config_tasks": { + "harness|arc:challenge": "LM Harness task", + "harness|drop": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + 
"harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|arc:challenge|25": { + "hashes": { + "hash_examples": "17b0cae357c0259e", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "f52f7134dd4e8235", + "hash_cont_tokens": "e23c779c4c2dd1ec" + }, + "truncated": 0, + "non_truncated": 1172, + "padded": 4682, + "non_padded": 5, + "effective_few_shots": 25.0, + "num_truncated_few_shots": 0 + }, + "harness|hellaswag|10": { + "hashes": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "8380af90422a117e", + "hash_cont_tokens": "55da5ba61989a8fe" + }, + "truncated": 0, + "non_truncated": 10042, + "padded": 40097, + "non_padded": 71, + "effective_few_shots": 10.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hashes": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "9185dc38dcc328ea", + "hash_cont_tokens": "bcc22fd85dcc85e9" + }, + "truncated": 0, + 
"non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-anatomy|5": { + "hashes": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "90fdbbaaf0213cec", + "hash_cont_tokens": "5cc800feae9fa1ad" + }, + "truncated": 0, + "non_truncated": 135, + "padded": 540, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-astronomy|5": { + "hashes": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "cbe1c711494076b6", + "hash_cont_tokens": "655dbb90034f484a" + }, + "truncated": 0, + "non_truncated": 152, + "padded": 608, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-business_ethics|5": { + "hashes": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "09397035a4a73e5f", + "hash_cont_tokens": "bcc22fd85dcc85e9" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hashes": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "90c311de52544438", + "hash_cont_tokens": "f77b74d946d7fc02" + }, + "truncated": 0, + "non_truncated": 265, + "padded": 1060, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_biology|5": { + "hashes": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "d8fd4e3af4ae46c3", + "hash_cont_tokens": "1ba4b1a158d8bf3f" + }, + "truncated": 0, + "non_truncated": 144, + "padded": 576, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_chemistry|5": { + "hashes": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "da514a10083e8e97", + "hash_cont_tokens": "bcc22fd85dcc85e9" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_computer_science|5": { + "hashes": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "7ccea65975bb46d4", + "hash_cont_tokens": "bcc22fd85dcc85e9" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_mathematics|5": { + "hashes": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "8ea8585f6adc2650", + "hash_cont_tokens": "bcc22fd85dcc85e9" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_medicine|5": { + "hashes": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "9d07c6e852253252", + "hash_cont_tokens": "78a0ebf66d91c5cf" + }, + "truncated": 0, + "non_truncated": 173, + "padded": 692, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + 
"harness|hendrycksTest-college_physics|5": { + "hashes": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "0d3d540477f9eddb", + "hash_cont_tokens": "5a030c95824fdbe5" + }, + "truncated": 0, + "non_truncated": 102, + "padded": 408, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-computer_security|5": { + "hashes": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "5ebc754afaa1fac8", + "hash_cont_tokens": "bcc22fd85dcc85e9" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hashes": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "7780b9cde8badacb", + "hash_cont_tokens": "2326dc60d0bc41b6" + }, + "truncated": 0, + "non_truncated": 235, + "padded": 940, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-econometrics|5": { + "hashes": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "8acec1576892f7ab", + "hash_cont_tokens": "be908364b6f14dd6" + }, + "truncated": 0, + "non_truncated": 114, + "padded": 456, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hashes": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "e0321889f63f18d7", + "hash_cont_tokens": "179280ef597fe1bf" + }, + "truncated": 0, + "non_truncated": 145, + "padded": 564, + "non_padded": 16, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hashes": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "60e497887b9e2608", + "hash_cont_tokens": "95cdcdaf1abd0bd2" + }, + "truncated": 0, + "non_truncated": 378, + "padded": 1512, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-formal_logic|5": { + "hashes": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "53adc0607e358206", + "hash_cont_tokens": "6a4818f3c307c346" + }, + "truncated": 0, + "non_truncated": 126, + "padded": 504, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-global_facts|5": { + "hashes": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "34682f752c1a1ac4", + "hash_cont_tokens": "bcc22fd85dcc85e9" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_biology|5": { + "hashes": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "bb5cc287970e5c14", + "hash_cont_tokens": "36d0d84455f0bdba" + }, + "truncated": 0, + "non_truncated": 310, + "padded": 1240, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hashes": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + 
"hash_input_tokens": "b12197fdbc9a45f0", + "hash_cont_tokens": "c678f794a9b8ee74" + }, + "truncated": 0, + "non_truncated": 203, + "padded": 812, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hashes": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "36408b638d9d7a8d", + "hash_cont_tokens": "bcc22fd85dcc85e9" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hashes": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "652bd20e505a2826", + "hash_cont_tokens": "e9c94304326d875c" + }, + "truncated": 0, + "non_truncated": 165, + "padded": 656, + "non_padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_geography|5": { + "hashes": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "8f4cd01faf05c6f1", + "hash_cont_tokens": "f937a1349eb483eb" + }, + "truncated": 0, + "non_truncated": 198, + "padded": 792, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hashes": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "217861435fcb5576", + "hash_cont_tokens": "8b27dd3907d25b4e" + }, + "truncated": 0, + "non_truncated": 193, + "padded": 772, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hashes": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "bcedb3cf953f812f", + "hash_cont_tokens": "3763cae29e2f938c" + }, + "truncated": 0, + "non_truncated": 390, + "padded": 1560, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hashes": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "52affce916d66c97", + "hash_cont_tokens": "fd7b555352d765a4" + }, + "truncated": 0, + "non_truncated": 270, + "padded": 1080, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hashes": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "b9d29201856d353d", + "hash_cont_tokens": "61f46d4a209b9aa2" + }, + "truncated": 0, + "non_truncated": 238, + "padded": 952, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_physics|5": { + "hashes": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "9c27af329cb41097", + "hash_cont_tokens": "4e7053e7c19d680d" + }, + "truncated": 0, + "non_truncated": 151, + "padded": 604, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hashes": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "192aef17a8956826", + "hash_cont_tokens": 
"84d19ae8790476bb" + }, + "truncated": 0, + "non_truncated": 545, + "padded": 2180, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hashes": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "a9bc6c02c6f83983", + "hash_cont_tokens": "b119c7b668213a4e" + }, + "truncated": 0, + "non_truncated": 216, + "padded": 864, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hashes": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "14741fa2bd2a4414", + "hash_cont_tokens": "a3b126bc622d571f" + }, + "truncated": 0, + "non_truncated": 204, + "padded": 816, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hashes": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "67f306eb2bf3d2cb", + "hash_cont_tokens": "9abf19ceb76331ff" + }, + "truncated": 0, + "non_truncated": 237, + "padded": 948, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_aging|5": { + "hashes": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "e5cc30c46358588f", + "hash_cont_tokens": "0e2e725ae9a898da" + }, + "truncated": 0, + "non_truncated": 223, + "padded": 892, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_sexuality|5": { + "hashes": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "10a6536adeac8632", + "hash_cont_tokens": "a94c1dea6d775249" + }, + "truncated": 0, + "non_truncated": 131, + "padded": 524, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-international_law|5": { + "hashes": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "d9015aba41ce0d5c", + "hash_cont_tokens": "3832f860859bb86b" + }, + "truncated": 0, + "non_truncated": 121, + "padded": 484, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-jurisprudence|5": { + "hashes": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "d5f2109de63c3402", + "hash_cont_tokens": "9fac5a0c364fca8a" + }, + "truncated": 0, + "non_truncated": 108, + "padded": 432, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hashes": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "e0b39eb7c9788cfe", + "hash_cont_tokens": "dc53ed31134ddf3a" + }, + "truncated": 0, + "non_truncated": 163, + "padded": 644, + "non_padded": 8, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-machine_learning|5": { + "hashes": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "643a872ad0f99bb0", + "hash_cont_tokens": "e272b5456d5552d6" + }, + "truncated": 0, + "non_truncated": 112, + "padded": 448, + "non_padded": 0, + "effective_few_shots": 5.0, + 
"num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-management|5": { + "hashes": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "1232c5b0f524b151", + "hash_cont_tokens": "7119d4642957b1f0" + }, + "truncated": 0, + "non_truncated": 103, + "padded": 412, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-marketing|5": { + "hashes": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "f1d76d4a1e08e901", + "hash_cont_tokens": "099d58c66ece3f11" + }, + "truncated": 0, + "non_truncated": 234, + "padded": 936, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-medical_genetics|5": { + "hashes": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "cd181ff20fe83b83", + "hash_cont_tokens": "bcc22fd85dcc85e9" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-miscellaneous|5": { + "hashes": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "a3d90d10e2efc569", + "hash_cont_tokens": "bae342d4e82ba8f7" + }, + "truncated": 0, + "non_truncated": 783, + "padded": 3132, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_disputes|5": { + "hashes": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "4b35576715cc147a", + "hash_cont_tokens": "578c64cbdbb1e0d4" + }, + "truncated": 0, + "non_truncated": 346, + "padded": 1384, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hashes": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "1b93703ae85294ee", + "hash_cont_tokens": "79b25f42b3fce0f9" + }, + "truncated": 0, + "non_truncated": 895, + "padded": 3580, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-nutrition|5": { + "hashes": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "6741a26253bd4258", + "hash_cont_tokens": "9d1f3b976417156c" + }, + "truncated": 0, + "non_truncated": 306, + "padded": 1224, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-philosophy|5": { + "hashes": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "730a52e273f8fcf5", + "hash_cont_tokens": "88dab560e1e06d97" + }, + "truncated": 0, + "non_truncated": 311, + "padded": 1244, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-prehistory|5": { + "hashes": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "9e211e939e14b414", + "hash_cont_tokens": "04ea847139fe9393" + }, + "truncated": 0, + "non_truncated": 324, + "padded": 1296, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_accounting|5": { + "hashes": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + 
"hash_input_tokens": "d5761e6be99ed835", + "hash_cont_tokens": "0435ff692ad17e68" + }, + "truncated": 0, + "non_truncated": 282, + "padded": 1124, + "non_padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_law|5": { + "hashes": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "fcbc59834dbaa06c", + "hash_cont_tokens": "b852c74e9f8801bd" + }, + "truncated": 0, + "non_truncated": 1534, + "padded": 6136, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_medicine|5": { + "hashes": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "ba5999ee85a41b08", + "hash_cont_tokens": "5db0f6460652d063" + }, + "truncated": 0, + "non_truncated": 272, + "padded": 1088, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_psychology|5": { + "hashes": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "35652463c3b2d9c6", + "hash_cont_tokens": "c960676ef7f3dbe5" + }, + "truncated": 0, + "non_truncated": 612, + "padded": 2448, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-public_relations|5": { + "hashes": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "af501bc2c58d000f", + "hash_cont_tokens": "3320565f412c4b01" + }, + "truncated": 0, + "non_truncated": 110, + "padded": 440, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-security_studies|5": { + "hashes": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "5df7af45226ffc3a", + "hash_cont_tokens": "218ed775ef60aab9" + }, + "truncated": 0, + "non_truncated": 245, + "padded": 980, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-sociology|5": { + "hashes": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "5dc2e3734f4dd402", + "hash_cont_tokens": "20babf5cc4cc7f3d" + }, + "truncated": 0, + "non_truncated": 201, + "padded": 804, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hashes": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "ed972b660c40d1e4", + "hash_cont_tokens": "bcc22fd85dcc85e9" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-virology|5": { + "hashes": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "ed703c55cc114c98", + "hash_cont_tokens": "dc6d57296bea0882" + }, + "truncated": 0, + "non_truncated": 166, + "padded": 664, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-world_religions|5": { + "hashes": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "00cf9f5943b1480b", + "hash_cont_tokens": "37f53444db289ed3" + }, + "truncated": 0, + "non_truncated": 171, + "padded": 684, + 
"non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|truthfulqa:mc|0": { + "hashes": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "5e931dfc6ab75011", + "hash_cont_tokens": "71a67034827cd30e" + }, + "truncated": 0, + "non_truncated": 817, + "padded": 9996, + "non_padded": 0, + "effective_few_shots": 0.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "bd055e8ba456ab4a", + "hash_cont_tokens": "c93e9c22fa3077a0" + }, + "truncated": 0, + "non_truncated": 1267, + "padded": 2534, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|drop|3": { + "hashes": { + "hash_examples": "1d27416e8324e9a3", + "hash_full_prompts": "a5513ff9a741b385", + "hash_input_tokens": "38e6103f4f5a5085", + "hash_cont_tokens": "1266eb0af2714377" + }, + "truncated": 0, + "non_truncated": 9536, + "padded": 0, + "non_padded": 9536, + "effective_few_shots": 3.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "5cae6c4034435931", + "hash_cont_tokens": "a94448e865e1e677" + }, + "truncated": 0, + "non_truncated": 1319, + "padded": 0, + "non_padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "4eb459f19fc0f29d", + "hash_full_prompts": "21653ed56f202b4e", + "hash_input_tokens": "8f81e6c651620b85", + "hash_cont_tokens": "2056d140918b7f4b" + }, + "truncated": 0, + "non_truncated": 38195, + "padded": 113445, + "non_padded": 10963, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/migtissera/Tess-M-v1.1/results_2023-12-03T00-02-00.527557.json b/eval-results/migtissera/Tess-M-v1.1/results_2023-12-03T00-02-00.527557.json new file mode 100644 index 0000000000000000000000000000000000000000..6c020d0abce09dcfa2cab0d85704025d5f5eb5a3 --- /dev/null +++ b/eval-results/migtissera/Tess-M-v1.1/results_2023-12-03T00-02-00.527557.json @@ -0,0 +1,63 @@ +{ + "config_general": { + "lighteval_sha": "b35d4d84573be82d91c07ea46260f262f72cf69d", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "", + "start_time": 6555.609323487, + "end_time": 16593.536951618, + "total_evaluation_time_secondes": "10037.927628131", + "model_name": "migtissera/Tess-M-v1.1", + "model_sha": "e5a016b08aa507fe9db45436074016928bf6f939", + "model_dtype": "torch.float16", + "model_size": "69.78 GB" + }, + "results": { + "harness|gsm8k|5": { + "acc": 0.5466262319939348, + "acc_stderr": 0.013712471049515444 + }, + "all": { + "acc": 0.5466262319939348, + "acc_stderr": 0.013712471049515444 + } + }, + "versions": { + "all": 0, + "harness|gsm8k|5": 0 + }, + "config_tasks": { + "harness|gsm8k": "LM Harness task" + }, + "summary_tasks": { + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "5cae6c4034435931", + "hash_cont_tokens": "a94448e865e1e677" + }, + "truncated": 0, + "non_truncated": 1319, + "padded": 0, + "non_padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "18b756b7813d1bdf", + "hash_full_prompts": 
"deb3b1dff10b95aa", + "hash_input_tokens": "e91836b3786e690b", + "hash_cont_tokens": "69885acce147c768" + }, + "truncated": 0, + "non_truncated": 1319, + "padded": 0, + "non_padded": 1319, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/migtissera/Tess-M-v1.3/results_2023-12-04T23-32-51.712332.json b/eval-results/migtissera/Tess-M-v1.3/results_2023-12-04T23-32-51.712332.json new file mode 100644 index 0000000000000000000000000000000000000000..e32f754ecea99d491f779639cc7e0b640d840ea3 --- /dev/null +++ b/eval-results/migtissera/Tess-M-v1.3/results_2023-12-04T23-32-51.712332.json @@ -0,0 +1,1409 @@ +{ + "config_general": { + "lighteval_sha": "0e4607eff593f6f842aeaa0e5fa6760f58b9d1e9", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "", + "start_time": 135323.57183851, + "end_time": 187640.710671092, + "total_evaluation_time_secondes": "52317.138832582015", + "model_name": "migtissera/Tess-M-v1.3", + "model_sha": "7d733ec8449ec0219a9f499084a94a4248846f7e", + "model_dtype": "torch.float16", + "model_size": "69.78 GB" + }, + "results": { + "harness|arc:challenge|25": { + "acc": 0.5921501706484642, + "acc_stderr": 0.014361097288449705, + "acc_norm": 0.6254266211604096, + "acc_norm_stderr": 0.014144193471893456 + }, + "harness|hellaswag|10": { + "acc": 0.6494722166899024, + "acc_stderr": 0.004761601303258892, + "acc_norm": 0.8394742083250348, + "acc_norm_stderr": 0.0036634275361781586 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.42, + "acc_stderr": 0.049604496374885836, + "acc_norm": 0.42, + "acc_norm_stderr": 0.049604496374885836 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.7111111111111111, + "acc_stderr": 0.03915450630414251, + "acc_norm": 0.7111111111111111, + "acc_norm_stderr": 0.03915450630414251 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.8618421052631579, + "acc_stderr": 0.028081042939576552, + "acc_norm": 0.8618421052631579, + "acc_norm_stderr": 0.028081042939576552 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.76, + "acc_stderr": 0.04292346959909283, + "acc_norm": 0.76, + "acc_norm_stderr": 0.04292346959909283 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.8377358490566038, + "acc_stderr": 0.022691482872035353, + "acc_norm": 0.8377358490566038, + "acc_norm_stderr": 0.022691482872035353 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.8888888888888888, + "acc_stderr": 0.026280550932848062, + "acc_norm": 0.8888888888888888, + "acc_norm_stderr": 0.026280550932848062 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.5, + "acc_stderr": 0.050251890762960605, + "acc_norm": 0.5, + "acc_norm_stderr": 0.050251890762960605 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.54, + "acc_stderr": 0.05009082659620333, + "acc_norm": 0.54, + "acc_norm_stderr": 0.05009082659620333 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.48, + "acc_stderr": 0.05021167315686779, + "acc_norm": 0.48, + "acc_norm_stderr": 0.05021167315686779 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.7109826589595376, + "acc_stderr": 0.03456425745086999, + "acc_norm": 0.7109826589595376, + "acc_norm_stderr": 0.03456425745086999 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.43137254901960786, + "acc_stderr": 0.04928099597287534, + "acc_norm": 0.43137254901960786, + "acc_norm_stderr": 0.04928099597287534 + }, + 
"harness|hendrycksTest-computer_security|5": { + "acc": 0.82, + "acc_stderr": 0.03861229196653695, + "acc_norm": 0.82, + "acc_norm_stderr": 0.03861229196653695 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.7829787234042553, + "acc_stderr": 0.026947483121496228, + "acc_norm": 0.7829787234042553, + "acc_norm_stderr": 0.026947483121496228 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.5789473684210527, + "acc_stderr": 0.046446020912223177, + "acc_norm": 0.5789473684210527, + "acc_norm_stderr": 0.046446020912223177 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.7448275862068966, + "acc_stderr": 0.03632984052707842, + "acc_norm": 0.7448275862068966, + "acc_norm_stderr": 0.03632984052707842 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.6640211640211641, + "acc_stderr": 0.02432631052914915, + "acc_norm": 0.6640211640211641, + "acc_norm_stderr": 0.02432631052914915 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.5714285714285714, + "acc_stderr": 0.04426266681379909, + "acc_norm": 0.5714285714285714, + "acc_norm_stderr": 0.04426266681379909 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.6, + "acc_stderr": 0.049236596391733084, + "acc_norm": 0.6, + "acc_norm_stderr": 0.049236596391733084 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.9, + "acc_stderr": 0.017066403719657248, + "acc_norm": 0.9, + "acc_norm_stderr": 0.017066403719657248 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.6650246305418719, + "acc_stderr": 0.033208527423483104, + "acc_norm": 0.6650246305418719, + "acc_norm_stderr": 0.033208527423483104 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.79, + "acc_stderr": 0.040936018074033256, + "acc_norm": 0.79, + "acc_norm_stderr": 0.040936018074033256 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.8666666666666667, + "acc_stderr": 0.026544435312706467, + "acc_norm": 0.8666666666666667, + "acc_norm_stderr": 0.026544435312706467 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.9040404040404041, + "acc_stderr": 0.02098480861004794, + "acc_norm": 0.9040404040404041, + "acc_norm_stderr": 0.02098480861004794 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.9637305699481865, + "acc_stderr": 0.01349265975129514, + "acc_norm": 0.9637305699481865, + "acc_norm_stderr": 0.01349265975129514 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.8025641025641026, + "acc_stderr": 0.02018264696867483, + "acc_norm": 0.8025641025641026, + "acc_norm_stderr": 0.02018264696867483 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.3925925925925926, + "acc_stderr": 0.02977384701253297, + "acc_norm": 0.3925925925925926, + "acc_norm_stderr": 0.02977384701253297 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.8445378151260504, + "acc_stderr": 0.023536818625398904, + "acc_norm": 0.8445378151260504, + "acc_norm_stderr": 0.023536818625398904 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.47019867549668876, + "acc_stderr": 0.04075224992216979, + "acc_norm": 0.47019867549668876, + "acc_norm_stderr": 0.04075224992216979 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.9155963302752294, + "acc_stderr": 0.011918819327334889, + "acc_norm": 0.9155963302752294, + "acc_norm_stderr": 0.011918819327334889 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 
0.6203703703703703, + "acc_stderr": 0.03309682581119035, + "acc_norm": 0.6203703703703703, + "acc_norm_stderr": 0.03309682581119035 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.9362745098039216, + "acc_stderr": 0.01714392165552496, + "acc_norm": 0.9362745098039216, + "acc_norm_stderr": 0.01714392165552496 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.9156118143459916, + "acc_stderr": 0.01809424711647332, + "acc_norm": 0.9156118143459916, + "acc_norm_stderr": 0.01809424711647332 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.8071748878923767, + "acc_stderr": 0.026478240960489365, + "acc_norm": 0.8071748878923767, + "acc_norm_stderr": 0.026478240960489365 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.8625954198473282, + "acc_stderr": 0.030194823996804468, + "acc_norm": 0.8625954198473282, + "acc_norm_stderr": 0.030194823996804468 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.8760330578512396, + "acc_stderr": 0.030083098716035216, + "acc_norm": 0.8760330578512396, + "acc_norm_stderr": 0.030083098716035216 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.8796296296296297, + "acc_stderr": 0.03145703854306251, + "acc_norm": 0.8796296296296297, + "acc_norm_stderr": 0.03145703854306251 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.8588957055214724, + "acc_stderr": 0.027351605518389752, + "acc_norm": 0.8588957055214724, + "acc_norm_stderr": 0.027351605518389752 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.6071428571428571, + "acc_stderr": 0.04635550135609976, + "acc_norm": 0.6071428571428571, + "acc_norm_stderr": 0.04635550135609976 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.8640776699029126, + "acc_stderr": 0.033932957297610096, + "acc_norm": 0.8640776699029126, + "acc_norm_stderr": 0.033932957297610096 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.9273504273504274, + "acc_stderr": 0.01700436856813234, + "acc_norm": 0.9273504273504274, + "acc_norm_stderr": 0.01700436856813234 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.84, + "acc_stderr": 0.03684529491774709, + "acc_norm": 0.84, + "acc_norm_stderr": 0.03684529491774709 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.9067688378033205, + "acc_stderr": 0.010397417087292847, + "acc_norm": 0.9067688378033205, + "acc_norm_stderr": 0.010397417087292847 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.8034682080924855, + "acc_stderr": 0.021393961404363847, + "acc_norm": 0.8034682080924855, + "acc_norm_stderr": 0.021393961404363847 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.6871508379888268, + "acc_stderr": 0.015506892594647258, + "acc_norm": 0.6871508379888268, + "acc_norm_stderr": 0.015506892594647258 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.8333333333333334, + "acc_stderr": 0.021339479988816024, + "acc_norm": 0.8333333333333334, + "acc_norm_stderr": 0.021339479988816024 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.819935691318328, + "acc_stderr": 0.021823422857744943, + "acc_norm": 0.819935691318328, + "acc_norm_stderr": 0.021823422857744943 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.8765432098765432, + "acc_stderr": 0.01830386880689179, + "acc_norm": 0.8765432098765432, + "acc_norm_stderr": 0.01830386880689179 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.6312056737588653, + "acc_stderr": 0.02878222756134726, + "acc_norm": 0.6312056737588653, + 
"acc_norm_stderr": 0.02878222756134726 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.5984354628422425, + "acc_stderr": 0.01252031512014712, + "acc_norm": 0.5984354628422425, + "acc_norm_stderr": 0.01252031512014712 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.8272058823529411, + "acc_stderr": 0.022966067585581795, + "acc_norm": 0.8272058823529411, + "acc_norm_stderr": 0.022966067585581795 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.8186274509803921, + "acc_stderr": 0.015588643495370463, + "acc_norm": 0.8186274509803921, + "acc_norm_stderr": 0.015588643495370463 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.6909090909090909, + "acc_stderr": 0.044262946482000985, + "acc_norm": 0.6909090909090909, + "acc_norm_stderr": 0.044262946482000985 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.8081632653061225, + "acc_stderr": 0.02520696315422539, + "acc_norm": 0.8081632653061225, + "acc_norm_stderr": 0.02520696315422539 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.8805970149253731, + "acc_stderr": 0.02292879327721974, + "acc_norm": 0.8805970149253731, + "acc_norm_stderr": 0.02292879327721974 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.91, + "acc_stderr": 0.02876234912646613, + "acc_norm": 0.91, + "acc_norm_stderr": 0.02876234912646613 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.5843373493975904, + "acc_stderr": 0.03836722176598053, + "acc_norm": 0.5843373493975904, + "acc_norm_stderr": 0.03836722176598053 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.8654970760233918, + "acc_stderr": 0.0261682213446623, + "acc_norm": 0.8654970760233918, + "acc_norm_stderr": 0.0261682213446623 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.397796817625459, + "mc1_stderr": 0.017133934248559638, + "mc2": 0.5603469779031626, + "mc2_stderr": 0.015661408014010857 + }, + "harness|winogrande|5": { + "acc": 0.8113654301499605, + "acc_stderr": 0.010995172318019799 + }, + "harness|gsm8k|5": { + "acc": 0.5921152388172858, + "acc_stderr": 0.013536742075643088 + }, + "all": { + "acc": 0.747566002523043, + "acc_stderr": 0.028749261755203245, + "acc_norm": 0.7529037953743296, + "acc_norm_stderr": 0.029285728391357593, + "mc1": 0.397796817625459, + "mc1_stderr": 0.017133934248559638, + "mc2": 0.5603469779031626, + "mc2_stderr": 0.015661408014010857 + } + }, + "versions": { + "all": 0, + "harness|arc:challenge|25": 0, + "harness|gsm8k|5": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + 
"harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "harness|winogrande|5": 0 + }, + "config_tasks": { + "harness|arc:challenge": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM 
Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|arc:challenge|25": { + "hashes": { + "hash_examples": "17b0cae357c0259e", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "40489066810a8901", + "hash_cont_tokens": "e23c779c4c2dd1ec" + }, + "truncated": 0, + "non_truncated": 1172, + "padded": 4682, + "non_padded": 5, + "effective_few_shots": 25.0, + "num_truncated_few_shots": 0 + }, + "harness|hellaswag|10": { + "hashes": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "ef60cabaa9013478", + "hash_cont_tokens": "55da5ba61989a8fe" + }, + "truncated": 0, + "non_truncated": 10042, + "padded": 40097, + "non_padded": 71, + "effective_few_shots": 10.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hashes": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "57ace135b7466127", + "hash_cont_tokens": "bcc22fd85dcc85e9" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-anatomy|5": { + "hashes": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + 
"hash_input_tokens": "42dc1ea1fa6d82c1", + "hash_cont_tokens": "5cc800feae9fa1ad" + }, + "truncated": 0, + "non_truncated": 135, + "padded": 540, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-astronomy|5": { + "hashes": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "8aebf432b444ab39", + "hash_cont_tokens": "655dbb90034f484a" + }, + "truncated": 0, + "non_truncated": 152, + "padded": 608, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-business_ethics|5": { + "hashes": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "d7f1ea50cc3bbeb3", + "hash_cont_tokens": "bcc22fd85dcc85e9" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hashes": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "b478c0c5b9db6649", + "hash_cont_tokens": "f77b74d946d7fc02" + }, + "truncated": 0, + "non_truncated": 265, + "padded": 1060, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_biology|5": { + "hashes": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "b1c627c6eb96a0c5", + "hash_cont_tokens": "1ba4b1a158d8bf3f" + }, + "truncated": 0, + "non_truncated": 144, + "padded": 576, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_chemistry|5": { + "hashes": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "9288d8baf6845601", + "hash_cont_tokens": "bcc22fd85dcc85e9" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_computer_science|5": { + "hashes": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "aed771600d6a99f9", + "hash_cont_tokens": "bcc22fd85dcc85e9" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_mathematics|5": { + "hashes": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "d894b8fd73f824bf", + "hash_cont_tokens": "bcc22fd85dcc85e9" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_medicine|5": { + "hashes": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "82c1d19db93b0d1a", + "hash_cont_tokens": "78a0ebf66d91c5cf" + }, + "truncated": 0, + "non_truncated": 173, + "padded": 692, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_physics|5": { + "hashes": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "8fc19b050e0980c9", + "hash_cont_tokens": "5a030c95824fdbe5" + }, + "truncated": 0, + "non_truncated": 102, + "padded": 408, + 
"non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-computer_security|5": { + "hashes": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "4bee692ef11ce74d", + "hash_cont_tokens": "bcc22fd85dcc85e9" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hashes": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "36ec55566216bf8c", + "hash_cont_tokens": "2326dc60d0bc41b6" + }, + "truncated": 0, + "non_truncated": 235, + "padded": 940, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-econometrics|5": { + "hashes": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "5f8a6ca26af4fa24", + "hash_cont_tokens": "be908364b6f14dd6" + }, + "truncated": 0, + "non_truncated": 114, + "padded": 456, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hashes": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "f27fa41a971012bc", + "hash_cont_tokens": "179280ef597fe1bf" + }, + "truncated": 0, + "non_truncated": 145, + "padded": 564, + "non_padded": 16, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hashes": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "a01362f218292970", + "hash_cont_tokens": "95cdcdaf1abd0bd2" + }, + "truncated": 0, + "non_truncated": 378, + "padded": 1512, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-formal_logic|5": { + "hashes": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "52298ae27ce37603", + "hash_cont_tokens": "6a4818f3c307c346" + }, + "truncated": 0, + "non_truncated": 126, + "padded": 504, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-global_facts|5": { + "hashes": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "8a8ad22aa325ceab", + "hash_cont_tokens": "bcc22fd85dcc85e9" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_biology|5": { + "hashes": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "dffa06152b2f8493", + "hash_cont_tokens": "36d0d84455f0bdba" + }, + "truncated": 0, + "non_truncated": 310, + "padded": 1240, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hashes": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "b2b45cc2ae4ceb25", + "hash_cont_tokens": "c678f794a9b8ee74" + }, + "truncated": 0, + "non_truncated": 203, + "padded": 812, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + 
"hashes": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "1e5e2a1ef482ec75", + "hash_cont_tokens": "bcc22fd85dcc85e9" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hashes": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "97fdb399c1648085", + "hash_cont_tokens": "e9c94304326d875c" + }, + "truncated": 0, + "non_truncated": 165, + "padded": 656, + "non_padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_geography|5": { + "hashes": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "6c6a6e35fc178837", + "hash_cont_tokens": "f937a1349eb483eb" + }, + "truncated": 0, + "non_truncated": 198, + "padded": 792, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hashes": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "2b67aa85a6bf6a30", + "hash_cont_tokens": "8b27dd3907d25b4e" + }, + "truncated": 0, + "non_truncated": 193, + "padded": 772, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hashes": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "d62be00061c87ef4", + "hash_cont_tokens": "3763cae29e2f938c" + }, + "truncated": 0, + "non_truncated": 390, + "padded": 1560, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hashes": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "5793fc23ddb8a65d", + "hash_cont_tokens": "fd7b555352d765a4" + }, + "truncated": 0, + "non_truncated": 270, + "padded": 1080, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hashes": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "f87090bf0f08ad42", + "hash_cont_tokens": "61f46d4a209b9aa2" + }, + "truncated": 0, + "non_truncated": 238, + "padded": 952, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_physics|5": { + "hashes": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "685c468b5589f463", + "hash_cont_tokens": "4e7053e7c19d680d" + }, + "truncated": 0, + "non_truncated": 151, + "padded": 604, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hashes": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "e77a448734066d49", + "hash_cont_tokens": "84d19ae8790476bb" + }, + "truncated": 0, + "non_truncated": 545, + "padded": 2180, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hashes": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": 
"4c5c8be5aafac432", + "hash_input_tokens": "66db7c250b356884", + "hash_cont_tokens": "b119c7b668213a4e" + }, + "truncated": 0, + "non_truncated": 216, + "padded": 864, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hashes": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "584802bdd1a87b58", + "hash_cont_tokens": "a3b126bc622d571f" + }, + "truncated": 0, + "non_truncated": 204, + "padded": 816, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hashes": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "94e130c89c7d75f6", + "hash_cont_tokens": "9abf19ceb76331ff" + }, + "truncated": 0, + "non_truncated": 237, + "padded": 948, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_aging|5": { + "hashes": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "c6d293e76bcfb74a", + "hash_cont_tokens": "0e2e725ae9a898da" + }, + "truncated": 0, + "non_truncated": 223, + "padded": 892, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_sexuality|5": { + "hashes": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "9a7d2880571d0567", + "hash_cont_tokens": "a94c1dea6d775249" + }, + "truncated": 0, + "non_truncated": 131, + "padded": 524, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-international_law|5": { + "hashes": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "cd30ba63d38ab1f6", + "hash_cont_tokens": "3832f860859bb86b" + }, + "truncated": 0, + "non_truncated": 121, + "padded": 484, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-jurisprudence|5": { + "hashes": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "0e68d6fc5a38055b", + "hash_cont_tokens": "9fac5a0c364fca8a" + }, + "truncated": 0, + "non_truncated": 108, + "padded": 432, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hashes": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "29871cd209a1497d", + "hash_cont_tokens": "dc53ed31134ddf3a" + }, + "truncated": 0, + "non_truncated": 163, + "padded": 644, + "non_padded": 8, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-machine_learning|5": { + "hashes": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "0eaeff5a26043f24", + "hash_cont_tokens": "e272b5456d5552d6" + }, + "truncated": 0, + "non_truncated": 112, + "padded": 448, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-management|5": { + "hashes": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "ef74b35eddcc2485", + "hash_cont_tokens": "7119d4642957b1f0" + }, + "truncated": 0, + "non_truncated": 103, + 
"padded": 412, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-marketing|5": { + "hashes": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "9be1a8c16522eab9", + "hash_cont_tokens": "099d58c66ece3f11" + }, + "truncated": 0, + "non_truncated": 234, + "padded": 936, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-medical_genetics|5": { + "hashes": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "eb07817458c8a2e6", + "hash_cont_tokens": "bcc22fd85dcc85e9" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-miscellaneous|5": { + "hashes": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "d3e380b9452ad897", + "hash_cont_tokens": "bae342d4e82ba8f7" + }, + "truncated": 0, + "non_truncated": 783, + "padded": 3132, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_disputes|5": { + "hashes": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "08b0b14c698b0352", + "hash_cont_tokens": "578c64cbdbb1e0d4" + }, + "truncated": 0, + "non_truncated": 346, + "padded": 1384, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hashes": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "d86956f0d743889d", + "hash_cont_tokens": "79b25f42b3fce0f9" + }, + "truncated": 0, + "non_truncated": 895, + "padded": 3580, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-nutrition|5": { + "hashes": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "61e00d762a54dbc7", + "hash_cont_tokens": "9d1f3b976417156c" + }, + "truncated": 0, + "non_truncated": 306, + "padded": 1224, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-philosophy|5": { + "hashes": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "bd9ffbd919a90135", + "hash_cont_tokens": "88dab560e1e06d97" + }, + "truncated": 0, + "non_truncated": 311, + "padded": 1244, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-prehistory|5": { + "hashes": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "3e1813f766721f1e", + "hash_cont_tokens": "04ea847139fe9393" + }, + "truncated": 0, + "non_truncated": 324, + "padded": 1296, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_accounting|5": { + "hashes": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "7586f4ed3328692c", + "hash_cont_tokens": "0435ff692ad17e68" + }, + "truncated": 0, + "non_truncated": 282, + "padded": 1124, + "non_padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_law|5": { + "hashes": { + 
"hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "83ae4dc0f5d2d065", + "hash_cont_tokens": "b852c74e9f8801bd" + }, + "truncated": 0, + "non_truncated": 1534, + "padded": 6136, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_medicine|5": { + "hashes": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "0db2792632b8d50c", + "hash_cont_tokens": "5db0f6460652d063" + }, + "truncated": 0, + "non_truncated": 272, + "padded": 1088, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_psychology|5": { + "hashes": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "c3a7c2a8f5a3c728", + "hash_cont_tokens": "c960676ef7f3dbe5" + }, + "truncated": 0, + "non_truncated": 612, + "padded": 2448, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-public_relations|5": { + "hashes": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "152b0e047ae22138", + "hash_cont_tokens": "3320565f412c4b01" + }, + "truncated": 0, + "non_truncated": 110, + "padded": 440, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-security_studies|5": { + "hashes": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "6f999fb3e43c58fb", + "hash_cont_tokens": "218ed775ef60aab9" + }, + "truncated": 0, + "non_truncated": 245, + "padded": 980, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-sociology|5": { + "hashes": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "fc84e0b3f5d27284", + "hash_cont_tokens": "20babf5cc4cc7f3d" + }, + "truncated": 0, + "non_truncated": 201, + "padded": 804, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hashes": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "22fb144e223c4380", + "hash_cont_tokens": "bcc22fd85dcc85e9" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-virology|5": { + "hashes": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "f79f449601363c0f", + "hash_cont_tokens": "dc6d57296bea0882" + }, + "truncated": 0, + "non_truncated": 166, + "padded": 664, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-world_religions|5": { + "hashes": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "2e3848d7ccb89a7a", + "hash_cont_tokens": "37f53444db289ed3" + }, + "truncated": 0, + "non_truncated": 171, + "padded": 684, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|truthfulqa:mc|0": { + "hashes": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "b67103f10d8a2f4d", + "hash_cont_tokens": "71a67034827cd30e" + }, + 
"truncated": 0, + "non_truncated": 817, + "padded": 9996, + "non_padded": 0, + "effective_few_shots": 0.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "b062d01ca34fb467", + "hash_cont_tokens": "c93e9c22fa3077a0" + }, + "truncated": 0, + "non_truncated": 1267, + "padded": 2534, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "ef77dd2b59b34427", + "hash_cont_tokens": "cb2c27e07f5073f4" + }, + "truncated": 0, + "non_truncated": 1319, + "padded": 0, + "non_padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "3b7fa57a057f9415", + "hash_full_prompts": "63615fc50fc9417c", + "hash_input_tokens": "b6e3fb9b64a22343", + "hash_cont_tokens": "c2371868cc15f244" + }, + "truncated": 0, + "non_truncated": 28659, + "padded": 113445, + "non_padded": 1427, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/migtissera/Tess-XS-v1-3-yarn-128K/results_2023-12-04T11-54-49.331822.json b/eval-results/migtissera/Tess-XS-v1-3-yarn-128K/results_2023-12-04T11-54-49.331822.json new file mode 100644 index 0000000000000000000000000000000000000000..70502115add76eeb73518e651d18151a03c4190d --- /dev/null +++ b/eval-results/migtissera/Tess-XS-v1-3-yarn-128K/results_2023-12-04T11-54-49.331822.json @@ -0,0 +1,1409 @@ +{ + "config_general": { + "lighteval_sha": "0e4607eff593f6f842aeaa0e5fa6760f58b9d1e9", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "", + "start_time": 136378.873978453, + "end_time": 145765.990659802, + "total_evaluation_time_secondes": "9387.116681348998", + "model_name": "migtissera/Tess-XS-v1-3-yarn-128K", + "model_sha": "0f5977a5d2fa791359dc92eb1574b6112e709cad", + "model_dtype": "torch.float16", + "model_size": "13.99 GB" + }, + "results": { + "harness|arc:challenge|25": { + "acc": 0.5844709897610921, + "acc_stderr": 0.014401366641216384, + "acc_norm": 0.6109215017064846, + "acc_norm_stderr": 0.014247309976045607 + }, + "harness|hellaswag|10": { + "acc": 0.6498705437163912, + "acc_stderr": 0.004760354191370853, + "acc_norm": 0.8295160326628161, + "acc_norm_stderr": 0.003752888662249556 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.28, + "acc_stderr": 0.045126085985421296, + "acc_norm": 0.28, + "acc_norm_stderr": 0.045126085985421296 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.5925925925925926, + "acc_stderr": 0.04244633238353228, + "acc_norm": 0.5925925925925926, + "acc_norm_stderr": 0.04244633238353228 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.6381578947368421, + "acc_stderr": 0.03910525752849724, + "acc_norm": 0.6381578947368421, + "acc_norm_stderr": 0.03910525752849724 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.61, + "acc_stderr": 0.04902071300001975, + "acc_norm": 0.61, + "acc_norm_stderr": 0.04902071300001975 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.660377358490566, + "acc_stderr": 0.02914690474779833, + "acc_norm": 0.660377358490566, + "acc_norm_stderr": 0.02914690474779833 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.7083333333333334, + "acc_stderr": 0.03800968060554859, + "acc_norm": 
0.7083333333333334, + "acc_norm_stderr": 0.03800968060554859 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.45, + "acc_stderr": 0.05, + "acc_norm": 0.45, + "acc_norm_stderr": 0.05 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.49, + "acc_stderr": 0.05024183937956911, + "acc_norm": 0.49, + "acc_norm_stderr": 0.05024183937956911 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.36, + "acc_stderr": 0.048241815132442176, + "acc_norm": 0.36, + "acc_norm_stderr": 0.048241815132442176 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.5953757225433526, + "acc_stderr": 0.03742461193887248, + "acc_norm": 0.5953757225433526, + "acc_norm_stderr": 0.03742461193887248 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.39215686274509803, + "acc_stderr": 0.048580835742663454, + "acc_norm": 0.39215686274509803, + "acc_norm_stderr": 0.048580835742663454 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.78, + "acc_stderr": 0.04163331998932261, + "acc_norm": 0.78, + "acc_norm_stderr": 0.04163331998932261 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.5617021276595745, + "acc_stderr": 0.032436186361081004, + "acc_norm": 0.5617021276595745, + "acc_norm_stderr": 0.032436186361081004 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.43859649122807015, + "acc_stderr": 0.04668000738510455, + "acc_norm": 0.43859649122807015, + "acc_norm_stderr": 0.04668000738510455 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.5172413793103449, + "acc_stderr": 0.04164188720169375, + "acc_norm": 0.5172413793103449, + "acc_norm_stderr": 0.04164188720169375 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.41534391534391535, + "acc_stderr": 0.025379524910778405, + "acc_norm": 0.41534391534391535, + "acc_norm_stderr": 0.025379524910778405 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.38095238095238093, + "acc_stderr": 0.043435254289490965, + "acc_norm": 0.38095238095238093, + "acc_norm_stderr": 0.043435254289490965 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.36, + "acc_stderr": 0.04824181513244218, + "acc_norm": 0.36, + "acc_norm_stderr": 0.04824181513244218 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.7387096774193549, + "acc_stderr": 0.024993053397764826, + "acc_norm": 0.7387096774193549, + "acc_norm_stderr": 0.024993053397764826 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.46798029556650245, + "acc_stderr": 0.03510766597959217, + "acc_norm": 0.46798029556650245, + "acc_norm_stderr": 0.03510766597959217 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.67, + "acc_stderr": 0.047258156262526066, + "acc_norm": 0.67, + "acc_norm_stderr": 0.047258156262526066 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.7454545454545455, + "acc_stderr": 0.03401506715249039, + "acc_norm": 0.7454545454545455, + "acc_norm_stderr": 0.03401506715249039 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.7525252525252525, + "acc_stderr": 0.030746300742124484, + "acc_norm": 0.7525252525252525, + "acc_norm_stderr": 0.030746300742124484 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.8808290155440415, + "acc_stderr": 0.02338193534812144, + "acc_norm": 0.8808290155440415, + "acc_norm_stderr": 0.02338193534812144 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.6102564102564103, + 
"acc_stderr": 0.024726967886647078, + "acc_norm": 0.6102564102564103, + "acc_norm_stderr": 0.024726967886647078 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.34444444444444444, + "acc_stderr": 0.02897264888484427, + "acc_norm": 0.34444444444444444, + "acc_norm_stderr": 0.02897264888484427 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.6302521008403361, + "acc_stderr": 0.03135709599613591, + "acc_norm": 0.6302521008403361, + "acc_norm_stderr": 0.03135709599613591 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.33774834437086093, + "acc_stderr": 0.038615575462551684, + "acc_norm": 0.33774834437086093, + "acc_norm_stderr": 0.038615575462551684 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.8055045871559633, + "acc_stderr": 0.016970289090458033, + "acc_norm": 0.8055045871559633, + "acc_norm_stderr": 0.016970289090458033 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.4351851851851852, + "acc_stderr": 0.03381200005643525, + "acc_norm": 0.4351851851851852, + "acc_norm_stderr": 0.03381200005643525 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.7941176470588235, + "acc_stderr": 0.028379449451588667, + "acc_norm": 0.7941176470588235, + "acc_norm_stderr": 0.028379449451588667 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.7974683544303798, + "acc_stderr": 0.026160568246601446, + "acc_norm": 0.7974683544303798, + "acc_norm_stderr": 0.026160568246601446 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.7040358744394619, + "acc_stderr": 0.030636591348699796, + "acc_norm": 0.7040358744394619, + "acc_norm_stderr": 0.030636591348699796 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.7633587786259542, + "acc_stderr": 0.03727673575596914, + "acc_norm": 0.7633587786259542, + "acc_norm_stderr": 0.03727673575596914 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.7520661157024794, + "acc_stderr": 0.039418975265163025, + "acc_norm": 0.7520661157024794, + "acc_norm_stderr": 0.039418975265163025 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.7685185185185185, + "acc_stderr": 0.04077494709252627, + "acc_norm": 0.7685185185185185, + "acc_norm_stderr": 0.04077494709252627 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.7239263803680982, + "acc_stderr": 0.035123852837050475, + "acc_norm": 0.7239263803680982, + "acc_norm_stderr": 0.035123852837050475 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.5, + "acc_stderr": 0.04745789978762494, + "acc_norm": 0.5, + "acc_norm_stderr": 0.04745789978762494 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.8058252427184466, + "acc_stderr": 0.03916667762822585, + "acc_norm": 0.8058252427184466, + "acc_norm_stderr": 0.03916667762822585 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.8162393162393162, + "acc_stderr": 0.02537213967172293, + "acc_norm": 0.8162393162393162, + "acc_norm_stderr": 0.02537213967172293 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.73, + "acc_stderr": 0.0446196043338474, + "acc_norm": 0.73, + "acc_norm_stderr": 0.0446196043338474 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.8148148148148148, + "acc_stderr": 0.013890862162876164, + "acc_norm": 0.8148148148148148, + "acc_norm_stderr": 0.013890862162876164 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.6907514450867052, + "acc_stderr": 0.024883140570071762, + "acc_norm": 0.6907514450867052, + 
"acc_norm_stderr": 0.024883140570071762 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.4212290502793296, + "acc_stderr": 0.016513676031179602, + "acc_norm": 0.4212290502793296, + "acc_norm_stderr": 0.016513676031179602 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.7549019607843137, + "acc_stderr": 0.02463004897982478, + "acc_norm": 0.7549019607843137, + "acc_norm_stderr": 0.02463004897982478 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.6784565916398714, + "acc_stderr": 0.026527724079528872, + "acc_norm": 0.6784565916398714, + "acc_norm_stderr": 0.026527724079528872 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.691358024691358, + "acc_stderr": 0.025702640260603742, + "acc_norm": 0.691358024691358, + "acc_norm_stderr": 0.025702640260603742 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.475177304964539, + "acc_stderr": 0.029790719243829714, + "acc_norm": 0.475177304964539, + "acc_norm_stderr": 0.029790719243829714 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.4380704041720991, + "acc_stderr": 0.01267190278256765, + "acc_norm": 0.4380704041720991, + "acc_norm_stderr": 0.01267190278256765 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.6507352941176471, + "acc_stderr": 0.02895975519682487, + "acc_norm": 0.6507352941176471, + "acc_norm_stderr": 0.02895975519682487 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.6339869281045751, + "acc_stderr": 0.019488025745529675, + "acc_norm": 0.6339869281045751, + "acc_norm_stderr": 0.019488025745529675 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.6636363636363637, + "acc_stderr": 0.04525393596302506, + "acc_norm": 0.6636363636363637, + "acc_norm_stderr": 0.04525393596302506 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.7183673469387755, + "acc_stderr": 0.02879518557429129, + "acc_norm": 0.7183673469387755, + "acc_norm_stderr": 0.02879518557429129 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.7910447761194029, + "acc_stderr": 0.028748298931728655, + "acc_norm": 0.7910447761194029, + "acc_norm_stderr": 0.028748298931728655 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.84, + "acc_stderr": 0.03684529491774709, + "acc_norm": 0.84, + "acc_norm_stderr": 0.03684529491774709 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.5662650602409639, + "acc_stderr": 0.03858158940685515, + "acc_norm": 0.5662650602409639, + "acc_norm_stderr": 0.03858158940685515 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.7894736842105263, + "acc_stderr": 0.031267817146631786, + "acc_norm": 0.7894736842105263, + "acc_norm_stderr": 0.031267817146631786 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.35006119951040393, + "mc1_stderr": 0.01669794942015103, + "mc2": 0.5013108457590786, + "mc2_stderr": 0.01575386133585655 + }, + "harness|winogrande|5": { + "acc": 0.744277821625888, + "acc_stderr": 0.012261253845440473 + }, + "harness|gsm8k|5": { + "acc": 0.4518574677786202, + "acc_stderr": 0.013708494995677651 + }, + "all": { + "acc": 0.62055773800793, + "acc_stderr": 0.0327675140337674, + "acc_norm": 0.6248132666096785, + "acc_norm_stderr": 0.03341842516990511, + "mc1": 0.35006119951040393, + "mc1_stderr": 0.01669794942015103, + "mc2": 0.5013108457590786, + "mc2_stderr": 0.01575386133585655 + } + }, + "versions": { + "all": 0, + "harness|arc:challenge|25": 0, + "harness|gsm8k|5": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + 
"harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "harness|winogrande|5": 0 + }, + "config_tasks": { + "harness|arc:challenge": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + 
"harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|arc:challenge|25": { + "hashes": { + "hash_examples": "17b0cae357c0259e", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "9bcd0d1d37471713", + "hash_cont_tokens": "289aa98c400841d8" + }, + "truncated": 0, + "non_truncated": 1172, + "padded": 4670, 
+ "non_padded": 17, + "effective_few_shots": 25.0, + "num_truncated_few_shots": 0 + }, + "harness|hellaswag|10": { + "hashes": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "80b8c6d79740318e", + "hash_cont_tokens": "ac460260c3e6efc9" + }, + "truncated": 0, + "non_truncated": 10042, + "padded": 40101, + "non_padded": 67, + "effective_few_shots": 10.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hashes": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "b813d36287c6556c", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-anatomy|5": { + "hashes": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "09dc2380497f7a47", + "hash_cont_tokens": "a52a4f60d98cbe5c" + }, + "truncated": 0, + "non_truncated": 135, + "padded": 540, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-astronomy|5": { + "hashes": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "68ca3220b0fdd1f3", + "hash_cont_tokens": "10f7d8eeba97841d" + }, + "truncated": 0, + "non_truncated": 152, + "padded": 608, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-business_ethics|5": { + "hashes": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "bd14ef1320de241e", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hashes": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "d96186ab98017c43", + "hash_cont_tokens": "edef9975ba9165b5" + }, + "truncated": 0, + "non_truncated": 265, + "padded": 1060, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_biology|5": { + "hashes": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "424136b34e95b200", + "hash_cont_tokens": "0aa103ec6602280b" + }, + "truncated": 0, + "non_truncated": 144, + "padded": 576, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_chemistry|5": { + "hashes": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "8dd8b80e336bbe54", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_computer_science|5": { + "hashes": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "145d4cef8ca2261d", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_mathematics|5": { + "hashes": { + "hash_examples": 
"4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "561995d32d2b25c4", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_medicine|5": { + "hashes": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "6a258a9d4418599c", + "hash_cont_tokens": "1979021dbc698754" + }, + "truncated": 0, + "non_truncated": 173, + "padded": 692, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_physics|5": { + "hashes": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "fa5e0d5b5f97b66a", + "hash_cont_tokens": "7cf7fe2bab00acbd" + }, + "truncated": 0, + "non_truncated": 102, + "padded": 408, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-computer_security|5": { + "hashes": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "07d27397edfae492", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hashes": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "da5e6c3c8eb17da6", + "hash_cont_tokens": "903f64eed2b0d217" + }, + "truncated": 0, + "non_truncated": 235, + "padded": 940, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-econometrics|5": { + "hashes": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "f6ba8e358bdb523e", + "hash_cont_tokens": "721ae6c5302c4bf2" + }, + "truncated": 0, + "non_truncated": 114, + "padded": 456, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hashes": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "b2459da4c5ca8590", + "hash_cont_tokens": "15a738960ed3e587" + }, + "truncated": 0, + "non_truncated": 145, + "padded": 575, + "non_padded": 5, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hashes": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "0b969d9ad706a13a", + "hash_cont_tokens": "c96470462fc71683" + }, + "truncated": 0, + "non_truncated": 378, + "padded": 1512, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-formal_logic|5": { + "hashes": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "02bc3eb5f90da86e", + "hash_cont_tokens": "0e1ce025c9d6ee7e" + }, + "truncated": 0, + "non_truncated": 126, + "padded": 504, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-global_facts|5": { + "hashes": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "3d5106918bcbeb43", + "hash_cont_tokens": "17b868b63507f9a3" + }, + 
"truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_biology|5": { + "hashes": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "7b089392db2dabbd", + "hash_cont_tokens": "e34d57f7d3c4ca16" + }, + "truncated": 0, + "non_truncated": 310, + "padded": 1240, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hashes": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "ba90b2ffed1c067d", + "hash_cont_tokens": "e8482d44df4b3740" + }, + "truncated": 0, + "non_truncated": 203, + "padded": 812, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hashes": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "60eeec309ef0717f", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hashes": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "5e5e8bf3808e0ead", + "hash_cont_tokens": "d63e679a49418339" + }, + "truncated": 0, + "non_truncated": 165, + "padded": 656, + "non_padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_geography|5": { + "hashes": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "4da9b741d4e7ea78", + "hash_cont_tokens": "d78483e286d06f1a" + }, + "truncated": 0, + "non_truncated": 198, + "padded": 792, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hashes": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "acb4bc872ac86ed7", + "hash_cont_tokens": "691cdff71ff5fe57" + }, + "truncated": 0, + "non_truncated": 193, + "padded": 772, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hashes": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "840fc6403eb69ab0", + "hash_cont_tokens": "d5ad4c5bdca967ad" + }, + "truncated": 0, + "non_truncated": 390, + "padded": 1560, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hashes": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "3629a7f2cd17faeb", + "hash_cont_tokens": "8f631ca5687dd0d4" + }, + "truncated": 0, + "non_truncated": 270, + "padded": 1080, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hashes": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "6846f684260e3997", + "hash_cont_tokens": "7321048a28451473" + }, + "truncated": 0, + "non_truncated": 238, + "padded": 952, + "non_padded": 0, 
+ "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_physics|5": { + "hashes": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "85aee25d6bdad94a", + "hash_cont_tokens": "bb137581f269861c" + }, + "truncated": 0, + "non_truncated": 151, + "padded": 604, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hashes": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "290b66d6d666a35f", + "hash_cont_tokens": "b455cab2675bd863" + }, + "truncated": 0, + "non_truncated": 545, + "padded": 2180, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hashes": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "a77a7668b437bc82", + "hash_cont_tokens": "1b3196fec7e58037" + }, + "truncated": 0, + "non_truncated": 216, + "padded": 864, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hashes": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "63548c7fa9ba7a78", + "hash_cont_tokens": "a331dedc2aa01b3e" + }, + "truncated": 0, + "non_truncated": 204, + "padded": 816, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hashes": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "83c5da18bfa50812", + "hash_cont_tokens": "d0fbe030b8c8c2bf" + }, + "truncated": 0, + "non_truncated": 237, + "padded": 948, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_aging|5": { + "hashes": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "bebbd11f22006685", + "hash_cont_tokens": "1dd29c3755494850" + }, + "truncated": 0, + "non_truncated": 223, + "padded": 892, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_sexuality|5": { + "hashes": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "7b85ee9b8ee54f4f", + "hash_cont_tokens": "c85573f663c10691" + }, + "truncated": 0, + "non_truncated": 131, + "padded": 524, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-international_law|5": { + "hashes": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "7bfc55ab7065943e", + "hash_cont_tokens": "d263804ba918154f" + }, + "truncated": 0, + "non_truncated": 121, + "padded": 484, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-jurisprudence|5": { + "hashes": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "69573f1675e053c6", + "hash_cont_tokens": "581986691a84ece8" + }, + "truncated": 0, + "non_truncated": 108, + "padded": 432, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hashes": { + 
"hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "552324ef20094bdc", + "hash_cont_tokens": "55a858b28bbda458" + }, + "truncated": 0, + "non_truncated": 163, + "padded": 652, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-machine_learning|5": { + "hashes": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "96449357a7318905", + "hash_cont_tokens": "e99d3d3efd4ac7a3" + }, + "truncated": 0, + "non_truncated": 112, + "padded": 448, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-management|5": { + "hashes": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "3b849249168e3b88", + "hash_cont_tokens": "13d9dc56bca34726" + }, + "truncated": 0, + "non_truncated": 103, + "padded": 412, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-marketing|5": { + "hashes": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "af0e186f2756b70d", + "hash_cont_tokens": "2700ea26933916a2" + }, + "truncated": 0, + "non_truncated": 234, + "padded": 936, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-medical_genetics|5": { + "hashes": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "9f6a6de16509b6d9", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-miscellaneous|5": { + "hashes": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "9194406d589f7c10", + "hash_cont_tokens": "7bf4341c79587250" + }, + "truncated": 0, + "non_truncated": 783, + "padded": 3132, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_disputes|5": { + "hashes": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "769486efc74d9f8e", + "hash_cont_tokens": "38a48e9de6976f00" + }, + "truncated": 0, + "non_truncated": 346, + "padded": 1384, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hashes": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "a90fd4dd90959dad", + "hash_cont_tokens": "761c4dc187689d89" + }, + "truncated": 0, + "non_truncated": 895, + "padded": 3580, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-nutrition|5": { + "hashes": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "1a3b843e66efd29b", + "hash_cont_tokens": "65005bd7d6f6012a" + }, + "truncated": 0, + "non_truncated": 306, + "padded": 1224, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-philosophy|5": { + "hashes": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "09820001a3d00013", + "hash_cont_tokens": "0b47934fb6314dec" + }, + "truncated": 0, + 
"non_truncated": 311, + "padded": 1244, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-prehistory|5": { + "hashes": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "7c4ec364ce2768c7", + "hash_cont_tokens": "3f20acd855ee0a29" + }, + "truncated": 0, + "non_truncated": 324, + "padded": 1296, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_accounting|5": { + "hashes": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "ced0534574d0ae3f", + "hash_cont_tokens": "8f122ba881355d4b" + }, + "truncated": 0, + "non_truncated": 282, + "padded": 1128, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_law|5": { + "hashes": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "bcbdbbde22ec73e3", + "hash_cont_tokens": "90d5df417c4d3fd3" + }, + "truncated": 0, + "non_truncated": 1534, + "padded": 6136, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_medicine|5": { + "hashes": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "c54d753563114d45", + "hash_cont_tokens": "4a2d2988884f7f70" + }, + "truncated": 0, + "non_truncated": 272, + "padded": 1088, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_psychology|5": { + "hashes": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "b75dc55c0e32fa52", + "hash_cont_tokens": "e0a952cb8a9c81de" + }, + "truncated": 0, + "non_truncated": 612, + "padded": 2448, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-public_relations|5": { + "hashes": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "5ccdc8ec8db99622", + "hash_cont_tokens": "1fa77a8dff3922b8" + }, + "truncated": 0, + "non_truncated": 110, + "padded": 440, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-security_studies|5": { + "hashes": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "ca8497342e5b1d57", + "hash_cont_tokens": "81fc9cb3cbdd52db" + }, + "truncated": 0, + "non_truncated": 245, + "padded": 980, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-sociology|5": { + "hashes": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "069c76424fbd3dab", + "hash_cont_tokens": "2a0493252ed2cf43" + }, + "truncated": 0, + "non_truncated": 201, + "padded": 804, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hashes": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "a7e393a626169576", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + 
"harness|hendrycksTest-virology|5": { + "hashes": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "bf99dc973e3a650d", + "hash_cont_tokens": "5ab892d003b00c98" + }, + "truncated": 0, + "non_truncated": 166, + "padded": 664, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-world_religions|5": { + "hashes": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "1761cfaf21797065", + "hash_cont_tokens": "15a5e5dbdfbb8568" + }, + "truncated": 0, + "non_truncated": 171, + "padded": 684, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|truthfulqa:mc|0": { + "hashes": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "298b43914bbdf4ca", + "hash_cont_tokens": "5a8d4bb398b1c3c0" + }, + "truncated": 0, + "non_truncated": 817, + "padded": 9996, + "non_padded": 0, + "effective_few_shots": 0.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "31aa3477d959f771", + "hash_cont_tokens": "618558fb93c0f288" + }, + "truncated": 0, + "non_truncated": 1267, + "padded": 2534, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "6af0ae8cfe684f50", + "hash_cont_tokens": "55ddc917f301fd64" + }, + "truncated": 0, + "non_truncated": 1319, + "padded": 0, + "non_padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "3b7fa57a057f9415", + "hash_full_prompts": "63615fc50fc9417c", + "hash_input_tokens": "9c04e828ae29cacc", + "hash_cont_tokens": "a09a0fdd71f66ca4" + }, + "truncated": 0, + "non_truncated": 28659, + "padded": 113460, + "non_padded": 1412, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/migtissera/Tess-XS-v1-3-yarn-128K/results_2023-12-04T20-29-13.778100.json b/eval-results/migtissera/Tess-XS-v1-3-yarn-128K/results_2023-12-04T20-29-13.778100.json new file mode 100644 index 0000000000000000000000000000000000000000..f7910715418b292ca39ff71965fada79fc329fd4 --- /dev/null +++ b/eval-results/migtissera/Tess-XS-v1-3-yarn-128K/results_2023-12-04T20-29-13.778100.json @@ -0,0 +1,1409 @@ +{ + "config_general": { + "lighteval_sha": "0e4607eff593f6f842aeaa0e5fa6760f58b9d1e9", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "", + "start_time": 166772.039562683, + "end_time": 176618.677155861, + "total_evaluation_time_secondes": "9846.637593177991", + "model_name": "migtissera/Tess-XS-v1-3-yarn-128K", + "model_sha": "72d393d13f1bd26442e59993c57840b91ff6f6fc", + "model_dtype": "torch.bfloat16", + "model_size": "13.99 GB" + }, + "results": { + "harness|arc:challenge|25": { + "acc": 0.5861774744027304, + "acc_stderr": 0.014392730009221007, + "acc_norm": 0.6160409556313993, + "acc_norm_stderr": 0.01421244498065189 + }, + "harness|hellaswag|10": { + "acc": 0.6504680342561243, + "acc_stderr": 0.004758476684324042, + "acc_norm": 0.8296156144194383, + "acc_norm_stderr": 0.003752017639083751 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.29, + 
"acc_stderr": 0.045604802157206845, + "acc_norm": 0.29, + "acc_norm_stderr": 0.045604802157206845 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.5851851851851851, + "acc_stderr": 0.04256193767901408, + "acc_norm": 0.5851851851851851, + "acc_norm_stderr": 0.04256193767901408 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.6513157894736842, + "acc_stderr": 0.038781398887976104, + "acc_norm": 0.6513157894736842, + "acc_norm_stderr": 0.038781398887976104 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.61, + "acc_stderr": 0.04902071300001975, + "acc_norm": 0.61, + "acc_norm_stderr": 0.04902071300001975 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.660377358490566, + "acc_stderr": 0.02914690474779833, + "acc_norm": 0.660377358490566, + "acc_norm_stderr": 0.02914690474779833 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.7152777777777778, + "acc_stderr": 0.037738099906869334, + "acc_norm": 0.7152777777777778, + "acc_norm_stderr": 0.037738099906869334 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.46, + "acc_stderr": 0.05009082659620333, + "acc_norm": 0.46, + "acc_norm_stderr": 0.05009082659620333 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.48, + "acc_stderr": 0.050211673156867795, + "acc_norm": 0.48, + "acc_norm_stderr": 0.050211673156867795 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.37, + "acc_stderr": 0.04852365870939099, + "acc_norm": 0.37, + "acc_norm_stderr": 0.04852365870939099 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.6011560693641619, + "acc_stderr": 0.037336266553835096, + "acc_norm": 0.6011560693641619, + "acc_norm_stderr": 0.037336266553835096 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.3431372549019608, + "acc_stderr": 0.04724007352383887, + "acc_norm": 0.3431372549019608, + "acc_norm_stderr": 0.04724007352383887 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.77, + "acc_stderr": 0.042295258468165065, + "acc_norm": 0.77, + "acc_norm_stderr": 0.042295258468165065 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.574468085106383, + "acc_stderr": 0.032321469162244695, + "acc_norm": 0.574468085106383, + "acc_norm_stderr": 0.032321469162244695 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.4473684210526316, + "acc_stderr": 0.04677473004491199, + "acc_norm": 0.4473684210526316, + "acc_norm_stderr": 0.04677473004491199 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.5172413793103449, + "acc_stderr": 0.04164188720169375, + "acc_norm": 0.5172413793103449, + "acc_norm_stderr": 0.04164188720169375 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.4126984126984127, + "acc_stderr": 0.025355741263055263, + "acc_norm": 0.4126984126984127, + "acc_norm_stderr": 0.025355741263055263 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.373015873015873, + "acc_stderr": 0.04325506042017086, + "acc_norm": 0.373015873015873, + "acc_norm_stderr": 0.04325506042017086 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.37, + "acc_stderr": 0.04852365870939099, + "acc_norm": 0.37, + "acc_norm_stderr": 0.04852365870939099 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.7483870967741936, + "acc_stderr": 0.024685979286239952, + "acc_norm": 0.7483870967741936, + "acc_norm_stderr": 0.024685979286239952 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.46798029556650245, + "acc_stderr": 
0.03510766597959217, + "acc_norm": 0.46798029556650245, + "acc_norm_stderr": 0.03510766597959217 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.67, + "acc_stderr": 0.047258156262526066, + "acc_norm": 0.67, + "acc_norm_stderr": 0.047258156262526066 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.7454545454545455, + "acc_stderr": 0.03401506715249039, + "acc_norm": 0.7454545454545455, + "acc_norm_stderr": 0.03401506715249039 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.7575757575757576, + "acc_stderr": 0.03053289223393202, + "acc_norm": 0.7575757575757576, + "acc_norm_stderr": 0.03053289223393202 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.8808290155440415, + "acc_stderr": 0.023381935348121437, + "acc_norm": 0.8808290155440415, + "acc_norm_stderr": 0.023381935348121437 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.6076923076923076, + "acc_stderr": 0.024756000382130956, + "acc_norm": 0.6076923076923076, + "acc_norm_stderr": 0.024756000382130956 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.35555555555555557, + "acc_stderr": 0.029185714949857403, + "acc_norm": 0.35555555555555557, + "acc_norm_stderr": 0.029185714949857403 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.634453781512605, + "acc_stderr": 0.031282177063684614, + "acc_norm": 0.634453781512605, + "acc_norm_stderr": 0.031282177063684614 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.33112582781456956, + "acc_stderr": 0.038425817186598696, + "acc_norm": 0.33112582781456956, + "acc_norm_stderr": 0.038425817186598696 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.8055045871559633, + "acc_stderr": 0.01697028909045803, + "acc_norm": 0.8055045871559633, + "acc_norm_stderr": 0.01697028909045803 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.4398148148148148, + "acc_stderr": 0.03385177976044811, + "acc_norm": 0.4398148148148148, + "acc_norm_stderr": 0.03385177976044811 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.7941176470588235, + "acc_stderr": 0.028379449451588667, + "acc_norm": 0.7941176470588235, + "acc_norm_stderr": 0.028379449451588667 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.7974683544303798, + "acc_stderr": 0.026160568246601446, + "acc_norm": 0.7974683544303798, + "acc_norm_stderr": 0.026160568246601446 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.6995515695067265, + "acc_stderr": 0.03076935200822915, + "acc_norm": 0.6995515695067265, + "acc_norm_stderr": 0.03076935200822915 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.7557251908396947, + "acc_stderr": 0.03768335959728744, + "acc_norm": 0.7557251908396947, + "acc_norm_stderr": 0.03768335959728744 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.7520661157024794, + "acc_stderr": 0.03941897526516302, + "acc_norm": 0.7520661157024794, + "acc_norm_stderr": 0.03941897526516302 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.7685185185185185, + "acc_stderr": 0.04077494709252627, + "acc_norm": 0.7685185185185185, + "acc_norm_stderr": 0.04077494709252627 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.7361963190184049, + "acc_stderr": 0.03462419931615624, + "acc_norm": 0.7361963190184049, + "acc_norm_stderr": 0.03462419931615624 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 
0.49107142857142855, + "acc_stderr": 0.04745033255489123, + "acc_norm": 0.49107142857142855, + "acc_norm_stderr": 0.04745033255489123 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.7864077669902912, + "acc_stderr": 0.040580420156460344, + "acc_norm": 0.7864077669902912, + "acc_norm_stderr": 0.040580420156460344 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.8247863247863247, + "acc_stderr": 0.02490443909891823, + "acc_norm": 0.8247863247863247, + "acc_norm_stderr": 0.02490443909891823 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.72, + "acc_stderr": 0.04512608598542128, + "acc_norm": 0.72, + "acc_norm_stderr": 0.04512608598542128 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.80970625798212, + "acc_stderr": 0.014036945850381394, + "acc_norm": 0.80970625798212, + "acc_norm_stderr": 0.014036945850381394 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.6936416184971098, + "acc_stderr": 0.024818350129436593, + "acc_norm": 0.6936416184971098, + "acc_norm_stderr": 0.024818350129436593 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.4201117318435754, + "acc_stderr": 0.016507671073256402, + "acc_norm": 0.4201117318435754, + "acc_norm_stderr": 0.016507671073256402 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.7483660130718954, + "acc_stderr": 0.024848018263875206, + "acc_norm": 0.7483660130718954, + "acc_norm_stderr": 0.024848018263875206 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.6784565916398714, + "acc_stderr": 0.026527724079528872, + "acc_norm": 0.6784565916398714, + "acc_norm_stderr": 0.026527724079528872 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.6851851851851852, + "acc_stderr": 0.025842248700902168, + "acc_norm": 0.6851851851851852, + "acc_norm_stderr": 0.025842248700902168 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.4787234042553192, + "acc_stderr": 0.029800481645628693, + "acc_norm": 0.4787234042553192, + "acc_norm_stderr": 0.029800481645628693 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.439374185136897, + "acc_stderr": 0.012676014778580214, + "acc_norm": 0.439374185136897, + "acc_norm_stderr": 0.012676014778580214 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.6360294117647058, + "acc_stderr": 0.029227192460032025, + "acc_norm": 0.6360294117647058, + "acc_norm_stderr": 0.029227192460032025 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.6421568627450981, + "acc_stderr": 0.019393058402355442, + "acc_norm": 0.6421568627450981, + "acc_norm_stderr": 0.019393058402355442 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.6545454545454545, + "acc_stderr": 0.04554619617541054, + "acc_norm": 0.6545454545454545, + "acc_norm_stderr": 0.04554619617541054 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.7142857142857143, + "acc_stderr": 0.028920583220675596, + "acc_norm": 0.7142857142857143, + "acc_norm_stderr": 0.028920583220675596 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.7910447761194029, + "acc_stderr": 0.028748298931728655, + "acc_norm": 0.7910447761194029, + "acc_norm_stderr": 0.028748298931728655 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.84, + "acc_stderr": 0.03684529491774709, + "acc_norm": 0.84, + "acc_norm_stderr": 0.03684529491774709 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.5662650602409639, + "acc_stderr": 0.03858158940685516, + "acc_norm": 0.5662650602409639, + "acc_norm_stderr": 
0.03858158940685516 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.7953216374269005, + "acc_stderr": 0.03094445977853321, + "acc_norm": 0.7953216374269005, + "acc_norm_stderr": 0.03094445977853321 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.3488372093023256, + "mc1_stderr": 0.016684419859986893, + "mc2": 0.5019574783346394, + "mc2_stderr": 0.015769313109990427 + }, + "harness|winogrande|5": { + "acc": 0.7474348855564326, + "acc_stderr": 0.012211148449394105 + }, + "harness|gsm8k|5": { + "acc": 0.4336618650492798, + "acc_stderr": 0.013650728047064692 + }, + "all": { + "acc": 0.6198767310043558, + "acc_stderr": 0.03278732745349028, + "acc_norm": 0.6244134729161351, + "acc_norm_stderr": 0.03344031108631591, + "mc1": 0.3488372093023256, + "mc1_stderr": 0.016684419859986893, + "mc2": 0.5019574783346394, + "mc2_stderr": 0.015769313109990427 + } + }, + "versions": { + "all": 0, + "harness|arc:challenge|25": 0, + "harness|gsm8k|5": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + 
"harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "harness|winogrande|5": 0 + }, + "config_tasks": { + "harness|arc:challenge": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": 
"LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|arc:challenge|25": { + "hashes": { + "hash_examples": "17b0cae357c0259e", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "9bcd0d1d37471713", + "hash_cont_tokens": "289aa98c400841d8" + }, + "truncated": 0, + "non_truncated": 1172, + "padded": 4670, + "non_padded": 17, + "effective_few_shots": 25.0, + "num_truncated_few_shots": 0 + }, + "harness|hellaswag|10": { + "hashes": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "80b8c6d79740318e", + "hash_cont_tokens": "ac460260c3e6efc9" + }, + "truncated": 0, + "non_truncated": 10042, + "padded": 40101, + "non_padded": 67, + "effective_few_shots": 10.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hashes": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "b813d36287c6556c", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-anatomy|5": { + "hashes": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "09dc2380497f7a47", + "hash_cont_tokens": "a52a4f60d98cbe5c" + }, + "truncated": 0, + "non_truncated": 135, + "padded": 540, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-astronomy|5": { + "hashes": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "68ca3220b0fdd1f3", + "hash_cont_tokens": "10f7d8eeba97841d" + }, + "truncated": 0, + "non_truncated": 152, + "padded": 608, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-business_ethics|5": { + "hashes": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "bd14ef1320de241e", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hashes": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "d96186ab98017c43", + "hash_cont_tokens": "edef9975ba9165b5" + }, + "truncated": 0, + "non_truncated": 265, + "padded": 1060, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_biology|5": { + "hashes": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "424136b34e95b200", + 
"hash_cont_tokens": "0aa103ec6602280b" + }, + "truncated": 0, + "non_truncated": 144, + "padded": 576, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_chemistry|5": { + "hashes": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "8dd8b80e336bbe54", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_computer_science|5": { + "hashes": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "145d4cef8ca2261d", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_mathematics|5": { + "hashes": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "561995d32d2b25c4", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_medicine|5": { + "hashes": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "6a258a9d4418599c", + "hash_cont_tokens": "1979021dbc698754" + }, + "truncated": 0, + "non_truncated": 173, + "padded": 692, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_physics|5": { + "hashes": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "fa5e0d5b5f97b66a", + "hash_cont_tokens": "7cf7fe2bab00acbd" + }, + "truncated": 0, + "non_truncated": 102, + "padded": 408, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-computer_security|5": { + "hashes": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "07d27397edfae492", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hashes": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "da5e6c3c8eb17da6", + "hash_cont_tokens": "903f64eed2b0d217" + }, + "truncated": 0, + "non_truncated": 235, + "padded": 940, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-econometrics|5": { + "hashes": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "f6ba8e358bdb523e", + "hash_cont_tokens": "721ae6c5302c4bf2" + }, + "truncated": 0, + "non_truncated": 114, + "padded": 456, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hashes": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "b2459da4c5ca8590", + "hash_cont_tokens": "15a738960ed3e587" + }, + "truncated": 0, + "non_truncated": 145, + "padded": 575, + "non_padded": 5, + 
"effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hashes": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "0b969d9ad706a13a", + "hash_cont_tokens": "c96470462fc71683" + }, + "truncated": 0, + "non_truncated": 378, + "padded": 1512, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-formal_logic|5": { + "hashes": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "02bc3eb5f90da86e", + "hash_cont_tokens": "0e1ce025c9d6ee7e" + }, + "truncated": 0, + "non_truncated": 126, + "padded": 504, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-global_facts|5": { + "hashes": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "3d5106918bcbeb43", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_biology|5": { + "hashes": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "7b089392db2dabbd", + "hash_cont_tokens": "e34d57f7d3c4ca16" + }, + "truncated": 0, + "non_truncated": 310, + "padded": 1240, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hashes": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "ba90b2ffed1c067d", + "hash_cont_tokens": "e8482d44df4b3740" + }, + "truncated": 0, + "non_truncated": 203, + "padded": 812, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hashes": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "60eeec309ef0717f", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hashes": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "5e5e8bf3808e0ead", + "hash_cont_tokens": "d63e679a49418339" + }, + "truncated": 0, + "non_truncated": 165, + "padded": 656, + "non_padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_geography|5": { + "hashes": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "4da9b741d4e7ea78", + "hash_cont_tokens": "d78483e286d06f1a" + }, + "truncated": 0, + "non_truncated": 198, + "padded": 792, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hashes": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "acb4bc872ac86ed7", + "hash_cont_tokens": "691cdff71ff5fe57" + }, + "truncated": 0, + "non_truncated": 193, + "padded": 772, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + 
"harness|hendrycksTest-high_school_macroeconomics|5": { + "hashes": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "840fc6403eb69ab0", + "hash_cont_tokens": "d5ad4c5bdca967ad" + }, + "truncated": 0, + "non_truncated": 390, + "padded": 1560, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hashes": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "3629a7f2cd17faeb", + "hash_cont_tokens": "8f631ca5687dd0d4" + }, + "truncated": 0, + "non_truncated": 270, + "padded": 1080, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hashes": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "6846f684260e3997", + "hash_cont_tokens": "7321048a28451473" + }, + "truncated": 0, + "non_truncated": 238, + "padded": 952, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_physics|5": { + "hashes": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "85aee25d6bdad94a", + "hash_cont_tokens": "bb137581f269861c" + }, + "truncated": 0, + "non_truncated": 151, + "padded": 604, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hashes": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "290b66d6d666a35f", + "hash_cont_tokens": "b455cab2675bd863" + }, + "truncated": 0, + "non_truncated": 545, + "padded": 2180, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hashes": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "a77a7668b437bc82", + "hash_cont_tokens": "1b3196fec7e58037" + }, + "truncated": 0, + "non_truncated": 216, + "padded": 864, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hashes": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "63548c7fa9ba7a78", + "hash_cont_tokens": "a331dedc2aa01b3e" + }, + "truncated": 0, + "non_truncated": 204, + "padded": 816, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hashes": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "83c5da18bfa50812", + "hash_cont_tokens": "d0fbe030b8c8c2bf" + }, + "truncated": 0, + "non_truncated": 237, + "padded": 948, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_aging|5": { + "hashes": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "bebbd11f22006685", + "hash_cont_tokens": "1dd29c3755494850" + }, + "truncated": 0, + "non_truncated": 223, + "padded": 892, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_sexuality|5": { + "hashes": { + "hash_examples": "7acb8fdad97f88a6", + 
"hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "7b85ee9b8ee54f4f", + "hash_cont_tokens": "c85573f663c10691" + }, + "truncated": 0, + "non_truncated": 131, + "padded": 524, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-international_law|5": { + "hashes": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "7bfc55ab7065943e", + "hash_cont_tokens": "d263804ba918154f" + }, + "truncated": 0, + "non_truncated": 121, + "padded": 484, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-jurisprudence|5": { + "hashes": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "69573f1675e053c6", + "hash_cont_tokens": "581986691a84ece8" + }, + "truncated": 0, + "non_truncated": 108, + "padded": 432, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hashes": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "552324ef20094bdc", + "hash_cont_tokens": "55a858b28bbda458" + }, + "truncated": 0, + "non_truncated": 163, + "padded": 652, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-machine_learning|5": { + "hashes": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "96449357a7318905", + "hash_cont_tokens": "e99d3d3efd4ac7a3" + }, + "truncated": 0, + "non_truncated": 112, + "padded": 448, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-management|5": { + "hashes": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "3b849249168e3b88", + "hash_cont_tokens": "13d9dc56bca34726" + }, + "truncated": 0, + "non_truncated": 103, + "padded": 412, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-marketing|5": { + "hashes": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "af0e186f2756b70d", + "hash_cont_tokens": "2700ea26933916a2" + }, + "truncated": 0, + "non_truncated": 234, + "padded": 936, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-medical_genetics|5": { + "hashes": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "9f6a6de16509b6d9", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-miscellaneous|5": { + "hashes": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "9194406d589f7c10", + "hash_cont_tokens": "7bf4341c79587250" + }, + "truncated": 0, + "non_truncated": 783, + "padded": 3132, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_disputes|5": { + "hashes": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "769486efc74d9f8e", + "hash_cont_tokens": "38a48e9de6976f00" + }, + "truncated": 0, + "non_truncated": 346, + 
"padded": 1384, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hashes": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "a90fd4dd90959dad", + "hash_cont_tokens": "761c4dc187689d89" + }, + "truncated": 0, + "non_truncated": 895, + "padded": 3580, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-nutrition|5": { + "hashes": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "1a3b843e66efd29b", + "hash_cont_tokens": "65005bd7d6f6012a" + }, + "truncated": 0, + "non_truncated": 306, + "padded": 1224, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-philosophy|5": { + "hashes": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "09820001a3d00013", + "hash_cont_tokens": "0b47934fb6314dec" + }, + "truncated": 0, + "non_truncated": 311, + "padded": 1244, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-prehistory|5": { + "hashes": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "7c4ec364ce2768c7", + "hash_cont_tokens": "3f20acd855ee0a29" + }, + "truncated": 0, + "non_truncated": 324, + "padded": 1296, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_accounting|5": { + "hashes": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "ced0534574d0ae3f", + "hash_cont_tokens": "8f122ba881355d4b" + }, + "truncated": 0, + "non_truncated": 282, + "padded": 1128, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_law|5": { + "hashes": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "bcbdbbde22ec73e3", + "hash_cont_tokens": "90d5df417c4d3fd3" + }, + "truncated": 0, + "non_truncated": 1534, + "padded": 6136, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_medicine|5": { + "hashes": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "c54d753563114d45", + "hash_cont_tokens": "4a2d2988884f7f70" + }, + "truncated": 0, + "non_truncated": 272, + "padded": 1088, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_psychology|5": { + "hashes": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "b75dc55c0e32fa52", + "hash_cont_tokens": "e0a952cb8a9c81de" + }, + "truncated": 0, + "non_truncated": 612, + "padded": 2448, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-public_relations|5": { + "hashes": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "5ccdc8ec8db99622", + "hash_cont_tokens": "1fa77a8dff3922b8" + }, + "truncated": 0, + "non_truncated": 110, + "padded": 440, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-security_studies|5": { + 
"hashes": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "ca8497342e5b1d57", + "hash_cont_tokens": "81fc9cb3cbdd52db" + }, + "truncated": 0, + "non_truncated": 245, + "padded": 980, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-sociology|5": { + "hashes": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "069c76424fbd3dab", + "hash_cont_tokens": "2a0493252ed2cf43" + }, + "truncated": 0, + "non_truncated": 201, + "padded": 804, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hashes": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "a7e393a626169576", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-virology|5": { + "hashes": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "bf99dc973e3a650d", + "hash_cont_tokens": "5ab892d003b00c98" + }, + "truncated": 0, + "non_truncated": 166, + "padded": 664, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-world_religions|5": { + "hashes": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "1761cfaf21797065", + "hash_cont_tokens": "15a5e5dbdfbb8568" + }, + "truncated": 0, + "non_truncated": 171, + "padded": 684, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|truthfulqa:mc|0": { + "hashes": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "298b43914bbdf4ca", + "hash_cont_tokens": "5a8d4bb398b1c3c0" + }, + "truncated": 0, + "non_truncated": 817, + "padded": 9996, + "non_padded": 0, + "effective_few_shots": 0.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "31aa3477d959f771", + "hash_cont_tokens": "618558fb93c0f288" + }, + "truncated": 0, + "non_truncated": 1267, + "padded": 2534, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "6af0ae8cfe684f50", + "hash_cont_tokens": "bd76dbf4ef423b7f" + }, + "truncated": 0, + "non_truncated": 1319, + "padded": 0, + "non_padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "3b7fa57a057f9415", + "hash_full_prompts": "63615fc50fc9417c", + "hash_input_tokens": "9c04e828ae29cacc", + "hash_cont_tokens": "145d4046df10e746" + }, + "truncated": 0, + "non_truncated": 28659, + "padded": 113460, + "non_padded": 1412, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/migtissera/Tess-XS-v1.0/results_2023-11-18T21-55-23.260774.json b/eval-results/migtissera/Tess-XS-v1.0/results_2023-11-18T21-55-23.260774.json new file mode 100644 index 0000000000000000000000000000000000000000..d744868ab368494a7fc1666cf3ab6f0da3a9a62c --- /dev/null +++ 
b/eval-results/migtissera/Tess-XS-v1.0/results_2023-11-18T21-55-23.260774.json @@ -0,0 +1,1435 @@ +{ + "config_general": { + "lighteval_sha": "9ffc410f6c40b8cfefe7167cb47aefe69ced61e1", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "", + "start_time": 184455.629103816, + "end_time": 198625.314861794, + "total_evaluation_time_secondes": "14169.685757978004", + "model_name": "migtissera/Tess-XS-v1.0", + "model_sha": "a581ab1793366ff2d5f3c966ff0e7b8b1149d775", + "model_dtype": "torch.float16", + "model_size": "13.99 GB" + }, + "results": { + "harness|arc:challenge|25": { + "acc": 0.5750853242320819, + "acc_stderr": 0.014445698968520769, + "acc_norm": 0.6143344709897611, + "acc_norm_stderr": 0.014224250973257182 + }, + "harness|hellaswag|10": { + "acc": 0.6381198964349731, + "acc_stderr": 0.00479562275732714, + "acc_norm": 0.8381796454889464, + "acc_norm_stderr": 0.003675332590681066 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.34, + "acc_stderr": 0.047609522856952365, + "acc_norm": 0.34, + "acc_norm_stderr": 0.047609522856952365 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.6370370370370371, + "acc_stderr": 0.04153948404742398, + "acc_norm": 0.6370370370370371, + "acc_norm_stderr": 0.04153948404742398 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.6644736842105263, + "acc_stderr": 0.03842498559395268, + "acc_norm": 0.6644736842105263, + "acc_norm_stderr": 0.03842498559395268 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.59, + "acc_stderr": 0.04943110704237102, + "acc_norm": 0.59, + "acc_norm_stderr": 0.04943110704237102 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.6981132075471698, + "acc_stderr": 0.028254200344438662, + "acc_norm": 0.6981132075471698, + "acc_norm_stderr": 0.028254200344438662 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.7013888888888888, + "acc_stderr": 0.03827052357950756, + "acc_norm": 0.7013888888888888, + "acc_norm_stderr": 0.03827052357950756 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.48, + "acc_stderr": 0.050211673156867795, + "acc_norm": 0.48, + "acc_norm_stderr": 0.050211673156867795 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.53, + "acc_stderr": 0.05016135580465919, + "acc_norm": 0.53, + "acc_norm_stderr": 0.05016135580465919 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.37, + "acc_stderr": 0.048523658709391, + "acc_norm": 0.37, + "acc_norm_stderr": 0.048523658709391 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.6358381502890174, + "acc_stderr": 0.03669072477416906, + "acc_norm": 0.6358381502890174, + "acc_norm_stderr": 0.03669072477416906 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.4215686274509804, + "acc_stderr": 0.04913595201274498, + "acc_norm": 0.4215686274509804, + "acc_norm_stderr": 0.04913595201274498 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.79, + "acc_stderr": 0.040936018074033256, + "acc_norm": 0.79, + "acc_norm_stderr": 0.040936018074033256 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.5829787234042553, + "acc_stderr": 0.03223276266711712, + "acc_norm": 0.5829787234042553, + "acc_norm_stderr": 0.03223276266711712 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.47368421052631576, + "acc_stderr": 0.04697085136647863, + "acc_norm": 0.47368421052631576, + "acc_norm_stderr": 0.04697085136647863 + }, + 
"harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.5448275862068965, + "acc_stderr": 0.04149886942192117, + "acc_norm": 0.5448275862068965, + "acc_norm_stderr": 0.04149886942192117 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.3994708994708995, + "acc_stderr": 0.02522545028406788, + "acc_norm": 0.3994708994708995, + "acc_norm_stderr": 0.02522545028406788 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.3968253968253968, + "acc_stderr": 0.043758884927270605, + "acc_norm": 0.3968253968253968, + "acc_norm_stderr": 0.043758884927270605 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.38, + "acc_stderr": 0.048783173121456316, + "acc_norm": 0.38, + "acc_norm_stderr": 0.048783173121456316 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.7677419354838709, + "acc_stderr": 0.024022256130308235, + "acc_norm": 0.7677419354838709, + "acc_norm_stderr": 0.024022256130308235 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.5024630541871922, + "acc_stderr": 0.035179450386910616, + "acc_norm": 0.5024630541871922, + "acc_norm_stderr": 0.035179450386910616 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.69, + "acc_stderr": 0.04648231987117316, + "acc_norm": 0.69, + "acc_norm_stderr": 0.04648231987117316 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.7515151515151515, + "acc_stderr": 0.03374402644139403, + "acc_norm": 0.7515151515151515, + "acc_norm_stderr": 0.03374402644139403 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.7727272727272727, + "acc_stderr": 0.029857515673386414, + "acc_norm": 0.7727272727272727, + "acc_norm_stderr": 0.029857515673386414 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.8860103626943006, + "acc_stderr": 0.022935144053919443, + "acc_norm": 0.8860103626943006, + "acc_norm_stderr": 0.022935144053919443 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.6564102564102564, + "acc_stderr": 0.024078696580635477, + "acc_norm": 0.6564102564102564, + "acc_norm_stderr": 0.024078696580635477 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.34814814814814815, + "acc_stderr": 0.029045600290616258, + "acc_norm": 0.34814814814814815, + "acc_norm_stderr": 0.029045600290616258 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.6764705882352942, + "acc_stderr": 0.030388353551886793, + "acc_norm": 0.6764705882352942, + "acc_norm_stderr": 0.030388353551886793 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.33112582781456956, + "acc_stderr": 0.038425817186598696, + "acc_norm": 0.33112582781456956, + "acc_norm_stderr": 0.038425817186598696 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.8275229357798165, + "acc_stderr": 0.01619780795684805, + "acc_norm": 0.8275229357798165, + "acc_norm_stderr": 0.01619780795684805 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.5324074074074074, + "acc_stderr": 0.03402801581358966, + "acc_norm": 0.5324074074074074, + "acc_norm_stderr": 0.03402801581358966 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.8137254901960784, + "acc_stderr": 0.027325470966716312, + "acc_norm": 0.8137254901960784, + "acc_norm_stderr": 0.027325470966716312 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.7721518987341772, + "acc_stderr": 0.027303484599069425, + "acc_norm": 0.7721518987341772, + "acc_norm_stderr": 
0.027303484599069425 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.695067264573991, + "acc_stderr": 0.030898610882477515, + "acc_norm": 0.695067264573991, + "acc_norm_stderr": 0.030898610882477515 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.7862595419847328, + "acc_stderr": 0.0359546161177469, + "acc_norm": 0.7862595419847328, + "acc_norm_stderr": 0.0359546161177469 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.7933884297520661, + "acc_stderr": 0.03695980128098824, + "acc_norm": 0.7933884297520661, + "acc_norm_stderr": 0.03695980128098824 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.7777777777777778, + "acc_stderr": 0.040191074725573483, + "acc_norm": 0.7777777777777778, + "acc_norm_stderr": 0.040191074725573483 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.8098159509202454, + "acc_stderr": 0.030833491146281235, + "acc_norm": 0.8098159509202454, + "acc_norm_stderr": 0.030833491146281235 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.5178571428571429, + "acc_stderr": 0.047427623612430116, + "acc_norm": 0.5178571428571429, + "acc_norm_stderr": 0.047427623612430116 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.8058252427184466, + "acc_stderr": 0.03916667762822585, + "acc_norm": 0.8058252427184466, + "acc_norm_stderr": 0.03916667762822585 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.8547008547008547, + "acc_stderr": 0.023086635086841407, + "acc_norm": 0.8547008547008547, + "acc_norm_stderr": 0.023086635086841407 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.73, + "acc_stderr": 0.04461960433384739, + "acc_norm": 0.73, + "acc_norm_stderr": 0.04461960433384739 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.8173690932311622, + "acc_stderr": 0.013816335389973136, + "acc_norm": 0.8173690932311622, + "acc_norm_stderr": 0.013816335389973136 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.7167630057803468, + "acc_stderr": 0.024257901705323374, + "acc_norm": 0.7167630057803468, + "acc_norm_stderr": 0.024257901705323374 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.33519553072625696, + "acc_stderr": 0.015788007190185884, + "acc_norm": 0.33519553072625696, + "acc_norm_stderr": 0.015788007190185884 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.738562091503268, + "acc_stderr": 0.025160998214292456, + "acc_norm": 0.738562091503268, + "acc_norm_stderr": 0.025160998214292456 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.6945337620578779, + "acc_stderr": 0.026160584450140453, + "acc_norm": 0.6945337620578779, + "acc_norm_stderr": 0.026160584450140453 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.7314814814814815, + "acc_stderr": 0.02465968518596728, + "acc_norm": 0.7314814814814815, + "acc_norm_stderr": 0.02465968518596728 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.5212765957446809, + "acc_stderr": 0.029800481645628693, + "acc_norm": 0.5212765957446809, + "acc_norm_stderr": 0.029800481645628693 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.4485006518904824, + "acc_stderr": 0.012702317490559807, + "acc_norm": 0.4485006518904824, + "acc_norm_stderr": 0.012702317490559807 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.6801470588235294, + "acc_stderr": 0.028332959514031215, + "acc_norm": 0.6801470588235294, + "acc_norm_stderr": 0.028332959514031215 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 
0.6781045751633987, + "acc_stderr": 0.018901015322093092, + "acc_norm": 0.6781045751633987, + "acc_norm_stderr": 0.018901015322093092 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.6545454545454545, + "acc_stderr": 0.04554619617541054, + "acc_norm": 0.6545454545454545, + "acc_norm_stderr": 0.04554619617541054 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.746938775510204, + "acc_stderr": 0.02783302387139968, + "acc_norm": 0.746938775510204, + "acc_norm_stderr": 0.02783302387139968 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.845771144278607, + "acc_stderr": 0.02553843336857833, + "acc_norm": 0.845771144278607, + "acc_norm_stderr": 0.02553843336857833 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.84, + "acc_stderr": 0.03684529491774709, + "acc_norm": 0.84, + "acc_norm_stderr": 0.03684529491774709 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.5301204819277109, + "acc_stderr": 0.03885425420866767, + "acc_norm": 0.5301204819277109, + "acc_norm_stderr": 0.03885425420866767 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.8245614035087719, + "acc_stderr": 0.02917088550072767, + "acc_norm": 0.8245614035087719, + "acc_norm_stderr": 0.02917088550072767 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.3157894736842105, + "mc1_stderr": 0.016272287957916916, + "mc2": 0.4712323822712203, + "mc2_stderr": 0.014554223298121486 + }, + "harness|winogrande|5": { + "acc": 0.7892659826361483, + "acc_stderr": 0.011462046419710676 + }, + "harness|drop|3": { + "em": 0.0018875838926174498, + "em_stderr": 0.0004445109990558992, + "f1": 0.061799496644295286, + "f1_stderr": 0.0013795660027086077 + }, + "harness|gsm8k|5": { + "acc": 0.18271417740712662, + "acc_stderr": 0.010644258206326236 + }, + "all": { + "acc": 0.6348258267763893, + "acc_stderr": 0.03230372610827704, + "acc_norm": 0.6439271893072561, + "acc_norm_stderr": 0.03300134321723649, + "mc1": 0.3157894736842105, + "mc1_stderr": 0.016272287957916916, + "mc2": 0.4712323822712203, + "mc2_stderr": 0.014554223298121486, + "em": 0.0018875838926174498, + "em_stderr": 0.0004445109990558992, + "f1": 0.061799496644295286, + "f1_stderr": 0.0013795660027086077 + } + }, + "versions": { + "all": 0, + "harness|arc:challenge|25": 0, + "harness|drop|3": 1, + "harness|gsm8k|5": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + 
"harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "harness|winogrande|5": 0 + }, + "config_tasks": { + "harness|arc:challenge": "LM Harness task", + "harness|drop": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + 
"harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|arc:challenge|25": { + "hashes": { + "hash_examples": "17b0cae357c0259e", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "9bcd0d1d37471713", + "hash_cont_tokens": "289aa98c400841d8" + }, + "truncated": 0, + "non_truncated": 1172, + "padded": 4670, + "non_padded": 17, + "effective_few_shots": 25.0, + "num_truncated_few_shots": 0 + }, + "harness|hellaswag|10": { + "hashes": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "80b8c6d79740318e", + "hash_cont_tokens": "ac460260c3e6efc9" + }, + "truncated": 0, + "non_truncated": 10042, + "padded": 40101, + "non_padded": 67, + "effective_few_shots": 10.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hashes": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "b813d36287c6556c", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-anatomy|5": { + "hashes": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "09dc2380497f7a47", + "hash_cont_tokens": "a52a4f60d98cbe5c" + }, + "truncated": 0, 
+ "non_truncated": 135, + "padded": 540, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-astronomy|5": { + "hashes": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "68ca3220b0fdd1f3", + "hash_cont_tokens": "10f7d8eeba97841d" + }, + "truncated": 0, + "non_truncated": 152, + "padded": 608, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-business_ethics|5": { + "hashes": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "bd14ef1320de241e", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hashes": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "d96186ab98017c43", + "hash_cont_tokens": "edef9975ba9165b5" + }, + "truncated": 0, + "non_truncated": 265, + "padded": 1060, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_biology|5": { + "hashes": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "424136b34e95b200", + "hash_cont_tokens": "0aa103ec6602280b" + }, + "truncated": 0, + "non_truncated": 144, + "padded": 576, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_chemistry|5": { + "hashes": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "8dd8b80e336bbe54", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_computer_science|5": { + "hashes": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "145d4cef8ca2261d", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_mathematics|5": { + "hashes": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "561995d32d2b25c4", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_medicine|5": { + "hashes": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "6a258a9d4418599c", + "hash_cont_tokens": "1979021dbc698754" + }, + "truncated": 0, + "non_truncated": 173, + "padded": 692, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_physics|5": { + "hashes": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "fa5e0d5b5f97b66a", + "hash_cont_tokens": "7cf7fe2bab00acbd" + }, + "truncated": 0, + "non_truncated": 102, + "padded": 408, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + 
"harness|hendrycksTest-computer_security|5": { + "hashes": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "07d27397edfae492", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hashes": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "da5e6c3c8eb17da6", + "hash_cont_tokens": "903f64eed2b0d217" + }, + "truncated": 0, + "non_truncated": 235, + "padded": 940, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-econometrics|5": { + "hashes": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "f6ba8e358bdb523e", + "hash_cont_tokens": "721ae6c5302c4bf2" + }, + "truncated": 0, + "non_truncated": 114, + "padded": 456, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hashes": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "b2459da4c5ca8590", + "hash_cont_tokens": "15a738960ed3e587" + }, + "truncated": 0, + "non_truncated": 145, + "padded": 575, + "non_padded": 5, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hashes": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "0b969d9ad706a13a", + "hash_cont_tokens": "c96470462fc71683" + }, + "truncated": 0, + "non_truncated": 378, + "padded": 1512, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-formal_logic|5": { + "hashes": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "02bc3eb5f90da86e", + "hash_cont_tokens": "0e1ce025c9d6ee7e" + }, + "truncated": 0, + "non_truncated": 126, + "padded": 504, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-global_facts|5": { + "hashes": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "3d5106918bcbeb43", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_biology|5": { + "hashes": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "7b089392db2dabbd", + "hash_cont_tokens": "e34d57f7d3c4ca16" + }, + "truncated": 0, + "non_truncated": 310, + "padded": 1240, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hashes": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "ba90b2ffed1c067d", + "hash_cont_tokens": "e8482d44df4b3740" + }, + "truncated": 0, + "non_truncated": 203, + "padded": 812, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hashes": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": 
"c00487e67c1813cc", + "hash_input_tokens": "60eeec309ef0717f", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hashes": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "5e5e8bf3808e0ead", + "hash_cont_tokens": "d63e679a49418339" + }, + "truncated": 0, + "non_truncated": 165, + "padded": 656, + "non_padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_geography|5": { + "hashes": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "4da9b741d4e7ea78", + "hash_cont_tokens": "d78483e286d06f1a" + }, + "truncated": 0, + "non_truncated": 198, + "padded": 792, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hashes": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "acb4bc872ac86ed7", + "hash_cont_tokens": "691cdff71ff5fe57" + }, + "truncated": 0, + "non_truncated": 193, + "padded": 772, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hashes": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "840fc6403eb69ab0", + "hash_cont_tokens": "d5ad4c5bdca967ad" + }, + "truncated": 0, + "non_truncated": 390, + "padded": 1560, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hashes": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "3629a7f2cd17faeb", + "hash_cont_tokens": "8f631ca5687dd0d4" + }, + "truncated": 0, + "non_truncated": 270, + "padded": 1080, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hashes": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "6846f684260e3997", + "hash_cont_tokens": "7321048a28451473" + }, + "truncated": 0, + "non_truncated": 238, + "padded": 952, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_physics|5": { + "hashes": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "85aee25d6bdad94a", + "hash_cont_tokens": "bb137581f269861c" + }, + "truncated": 0, + "non_truncated": 151, + "padded": 604, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hashes": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "290b66d6d666a35f", + "hash_cont_tokens": "b455cab2675bd863" + }, + "truncated": 0, + "non_truncated": 545, + "padded": 2180, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hashes": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "a77a7668b437bc82", + 
"hash_cont_tokens": "1b3196fec7e58037" + }, + "truncated": 0, + "non_truncated": 216, + "padded": 864, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hashes": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "63548c7fa9ba7a78", + "hash_cont_tokens": "a331dedc2aa01b3e" + }, + "truncated": 0, + "non_truncated": 204, + "padded": 816, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hashes": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "83c5da18bfa50812", + "hash_cont_tokens": "d0fbe030b8c8c2bf" + }, + "truncated": 0, + "non_truncated": 237, + "padded": 948, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_aging|5": { + "hashes": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "bebbd11f22006685", + "hash_cont_tokens": "1dd29c3755494850" + }, + "truncated": 0, + "non_truncated": 223, + "padded": 892, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_sexuality|5": { + "hashes": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "7b85ee9b8ee54f4f", + "hash_cont_tokens": "c85573f663c10691" + }, + "truncated": 0, + "non_truncated": 131, + "padded": 524, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-international_law|5": { + "hashes": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "7bfc55ab7065943e", + "hash_cont_tokens": "d263804ba918154f" + }, + "truncated": 0, + "non_truncated": 121, + "padded": 484, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-jurisprudence|5": { + "hashes": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "69573f1675e053c6", + "hash_cont_tokens": "581986691a84ece8" + }, + "truncated": 0, + "non_truncated": 108, + "padded": 432, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hashes": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "552324ef20094bdc", + "hash_cont_tokens": "55a858b28bbda458" + }, + "truncated": 0, + "non_truncated": 163, + "padded": 652, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-machine_learning|5": { + "hashes": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "96449357a7318905", + "hash_cont_tokens": "e99d3d3efd4ac7a3" + }, + "truncated": 0, + "non_truncated": 112, + "padded": 448, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-management|5": { + "hashes": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "3b849249168e3b88", + "hash_cont_tokens": "13d9dc56bca34726" + }, + "truncated": 0, + "non_truncated": 103, + "padded": 412, + "non_padded": 0, + "effective_few_shots": 5.0, + 
"num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-marketing|5": { + "hashes": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "af0e186f2756b70d", + "hash_cont_tokens": "2700ea26933916a2" + }, + "truncated": 0, + "non_truncated": 234, + "padded": 936, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-medical_genetics|5": { + "hashes": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "9f6a6de16509b6d9", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-miscellaneous|5": { + "hashes": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "9194406d589f7c10", + "hash_cont_tokens": "7bf4341c79587250" + }, + "truncated": 0, + "non_truncated": 783, + "padded": 3132, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_disputes|5": { + "hashes": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "769486efc74d9f8e", + "hash_cont_tokens": "38a48e9de6976f00" + }, + "truncated": 0, + "non_truncated": 346, + "padded": 1384, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hashes": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "a90fd4dd90959dad", + "hash_cont_tokens": "761c4dc187689d89" + }, + "truncated": 0, + "non_truncated": 895, + "padded": 3580, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-nutrition|5": { + "hashes": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "1a3b843e66efd29b", + "hash_cont_tokens": "65005bd7d6f6012a" + }, + "truncated": 0, + "non_truncated": 306, + "padded": 1224, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-philosophy|5": { + "hashes": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "09820001a3d00013", + "hash_cont_tokens": "0b47934fb6314dec" + }, + "truncated": 0, + "non_truncated": 311, + "padded": 1244, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-prehistory|5": { + "hashes": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "7c4ec364ce2768c7", + "hash_cont_tokens": "3f20acd855ee0a29" + }, + "truncated": 0, + "non_truncated": 324, + "padded": 1296, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_accounting|5": { + "hashes": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "ced0534574d0ae3f", + "hash_cont_tokens": "8f122ba881355d4b" + }, + "truncated": 0, + "non_truncated": 282, + "padded": 1128, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_law|5": { + "hashes": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + 
"hash_input_tokens": "bcbdbbde22ec73e3", + "hash_cont_tokens": "90d5df417c4d3fd3" + }, + "truncated": 0, + "non_truncated": 1534, + "padded": 6136, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_medicine|5": { + "hashes": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "c54d753563114d45", + "hash_cont_tokens": "4a2d2988884f7f70" + }, + "truncated": 0, + "non_truncated": 272, + "padded": 1088, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_psychology|5": { + "hashes": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "b75dc55c0e32fa52", + "hash_cont_tokens": "e0a952cb8a9c81de" + }, + "truncated": 0, + "non_truncated": 612, + "padded": 2448, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-public_relations|5": { + "hashes": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "5ccdc8ec8db99622", + "hash_cont_tokens": "1fa77a8dff3922b8" + }, + "truncated": 0, + "non_truncated": 110, + "padded": 440, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-security_studies|5": { + "hashes": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "ca8497342e5b1d57", + "hash_cont_tokens": "81fc9cb3cbdd52db" + }, + "truncated": 0, + "non_truncated": 245, + "padded": 980, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-sociology|5": { + "hashes": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "069c76424fbd3dab", + "hash_cont_tokens": "2a0493252ed2cf43" + }, + "truncated": 0, + "non_truncated": 201, + "padded": 804, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hashes": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "a7e393a626169576", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-virology|5": { + "hashes": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "bf99dc973e3a650d", + "hash_cont_tokens": "5ab892d003b00c98" + }, + "truncated": 0, + "non_truncated": 166, + "padded": 664, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-world_religions|5": { + "hashes": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "1761cfaf21797065", + "hash_cont_tokens": "15a5e5dbdfbb8568" + }, + "truncated": 0, + "non_truncated": 171, + "padded": 684, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|truthfulqa:mc|0": { + "hashes": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "298b43914bbdf4ca", + "hash_cont_tokens": "5a8d4bb398b1c3c0" + }, + "truncated": 0, + "non_truncated": 817, + "padded": 9996, + "non_padded": 0, + 
"effective_few_shots": 0.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "31aa3477d959f771", + "hash_cont_tokens": "618558fb93c0f288" + }, + "truncated": 0, + "non_truncated": 1267, + "padded": 2534, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|drop|3": { + "hashes": { + "hash_examples": "1d27416e8324e9a3", + "hash_full_prompts": "a5513ff9a741b385", + "hash_input_tokens": "a4fb946366902edf", + "hash_cont_tokens": "9a227908574d47e1" + }, + "truncated": 0, + "non_truncated": 9536, + "padded": 0, + "non_padded": 9536, + "effective_few_shots": 3.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "6af0ae8cfe684f50", + "hash_cont_tokens": "608fc32e545ec57c" + }, + "truncated": 0, + "non_truncated": 1319, + "padded": 0, + "non_padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "4eb459f19fc0f29d", + "hash_full_prompts": "21653ed56f202b4e", + "hash_input_tokens": "0ce409b3d436569d", + "hash_cont_tokens": "4ecac10462f9a805" + }, + "truncated": 0, + "non_truncated": 38195, + "padded": 113460, + "non_padded": 10948, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/migtissera/Tess-XS-v1.1/results_2023-11-23T08-35-10.663595.json b/eval-results/migtissera/Tess-XS-v1.1/results_2023-11-23T08-35-10.663595.json new file mode 100644 index 0000000000000000000000000000000000000000..325da0d96f35c7b502c105968d355809614e258c --- /dev/null +++ b/eval-results/migtissera/Tess-XS-v1.1/results_2023-11-23T08-35-10.663595.json @@ -0,0 +1,1435 @@ +{ + "config_general": { + "lighteval_sha": "9ffc410f6c40b8cfefe7167cb47aefe69ced61e1", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "", + "start_time": 549714.796404272, + "end_time": 562800.885971636, + "total_evaluation_time_secondes": "13086.089567364077", + "model_name": "migtissera/Tess-XS-v1.1", + "model_sha": "e8850e534a3a9f602f72201b09c7ef8f879c1c0b", + "model_dtype": "torch.float16", + "model_size": "13.99 GB" + }, + "results": { + "harness|arc:challenge|25": { + "acc": 0.5930034129692833, + "acc_stderr": 0.014356399418009126, + "acc_norm": 0.6390784982935154, + "acc_norm_stderr": 0.014034761386175452 + }, + "harness|hellaswag|10": { + "acc": 0.6512646883091018, + "acc_stderr": 0.004755960559929163, + "acc_norm": 0.8405696076478789, + "acc_norm_stderr": 0.003653288043555801 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.32, + "acc_stderr": 0.046882617226215034, + "acc_norm": 0.32, + "acc_norm_stderr": 0.046882617226215034 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.6148148148148148, + "acc_stderr": 0.04203921040156279, + "acc_norm": 0.6148148148148148, + "acc_norm_stderr": 0.04203921040156279 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.6776315789473685, + "acc_stderr": 0.03803510248351585, + "acc_norm": 0.6776315789473685, + "acc_norm_stderr": 0.03803510248351585 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.61, + "acc_stderr": 0.049020713000019756, + "acc_norm": 0.61, + "acc_norm_stderr": 0.049020713000019756 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.6867924528301886, + 
"acc_stderr": 0.028544793319055326, + "acc_norm": 0.6867924528301886, + "acc_norm_stderr": 0.028544793319055326 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.6944444444444444, + "acc_stderr": 0.03852084696008534, + "acc_norm": 0.6944444444444444, + "acc_norm_stderr": 0.03852084696008534 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.45, + "acc_stderr": 0.05, + "acc_norm": 0.45, + "acc_norm_stderr": 0.05 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.51, + "acc_stderr": 0.05024183937956912, + "acc_norm": 0.51, + "acc_norm_stderr": 0.05024183937956912 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.38, + "acc_stderr": 0.04878317312145633, + "acc_norm": 0.38, + "acc_norm_stderr": 0.04878317312145633 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.6242774566473989, + "acc_stderr": 0.036928207672648664, + "acc_norm": 0.6242774566473989, + "acc_norm_stderr": 0.036928207672648664 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.3627450980392157, + "acc_stderr": 0.047840607041056527, + "acc_norm": 0.3627450980392157, + "acc_norm_stderr": 0.047840607041056527 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.8, + "acc_stderr": 0.04020151261036846, + "acc_norm": 0.8, + "acc_norm_stderr": 0.04020151261036846 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.5404255319148936, + "acc_stderr": 0.03257901482099835, + "acc_norm": 0.5404255319148936, + "acc_norm_stderr": 0.03257901482099835 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.45614035087719296, + "acc_stderr": 0.046854730419077895, + "acc_norm": 0.45614035087719296, + "acc_norm_stderr": 0.046854730419077895 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.5379310344827586, + "acc_stderr": 0.04154659671707548, + "acc_norm": 0.5379310344827586, + "acc_norm_stderr": 0.04154659671707548 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.4126984126984127, + "acc_stderr": 0.02535574126305527, + "acc_norm": 0.4126984126984127, + "acc_norm_stderr": 0.02535574126305527 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.4523809523809524, + "acc_stderr": 0.04451807959055328, + "acc_norm": 0.4523809523809524, + "acc_norm_stderr": 0.04451807959055328 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.35, + "acc_stderr": 0.047937248544110196, + "acc_norm": 0.35, + "acc_norm_stderr": 0.047937248544110196 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.7516129032258064, + "acc_stderr": 0.024580028921481003, + "acc_norm": 0.7516129032258064, + "acc_norm_stderr": 0.024580028921481003 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.4975369458128079, + "acc_stderr": 0.03517945038691063, + "acc_norm": 0.4975369458128079, + "acc_norm_stderr": 0.03517945038691063 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.68, + "acc_stderr": 0.04688261722621505, + "acc_norm": 0.68, + "acc_norm_stderr": 0.04688261722621505 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.7515151515151515, + "acc_stderr": 0.033744026441394036, + "acc_norm": 0.7515151515151515, + "acc_norm_stderr": 0.033744026441394036 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.7878787878787878, + "acc_stderr": 0.029126522834586808, + "acc_norm": 0.7878787878787878, + "acc_norm_stderr": 0.029126522834586808 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 
0.8756476683937824, + "acc_stderr": 0.023814477086593542, + "acc_norm": 0.8756476683937824, + "acc_norm_stderr": 0.023814477086593542 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.6641025641025641, + "acc_stderr": 0.023946724741563976, + "acc_norm": 0.6641025641025641, + "acc_norm_stderr": 0.023946724741563976 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.34444444444444444, + "acc_stderr": 0.02897264888484427, + "acc_norm": 0.34444444444444444, + "acc_norm_stderr": 0.02897264888484427 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.6932773109243697, + "acc_stderr": 0.02995382389188704, + "acc_norm": 0.6932773109243697, + "acc_norm_stderr": 0.02995382389188704 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.3443708609271523, + "acc_stderr": 0.03879687024073327, + "acc_norm": 0.3443708609271523, + "acc_norm_stderr": 0.03879687024073327 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.8220183486238533, + "acc_stderr": 0.016399436366612917, + "acc_norm": 0.8220183486238533, + "acc_norm_stderr": 0.016399436366612917 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.5, + "acc_stderr": 0.034099716973523674, + "acc_norm": 0.5, + "acc_norm_stderr": 0.034099716973523674 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.8235294117647058, + "acc_stderr": 0.026756401538078962, + "acc_norm": 0.8235294117647058, + "acc_norm_stderr": 0.026756401538078962 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.7721518987341772, + "acc_stderr": 0.027303484599069425, + "acc_norm": 0.7721518987341772, + "acc_norm_stderr": 0.027303484599069425 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.6860986547085202, + "acc_stderr": 0.031146796482972465, + "acc_norm": 0.6860986547085202, + "acc_norm_stderr": 0.031146796482972465 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.7480916030534351, + "acc_stderr": 0.03807387116306086, + "acc_norm": 0.7480916030534351, + "acc_norm_stderr": 0.03807387116306086 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.768595041322314, + "acc_stderr": 0.03849856098794088, + "acc_norm": 0.768595041322314, + "acc_norm_stderr": 0.03849856098794088 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.7685185185185185, + "acc_stderr": 0.04077494709252627, + "acc_norm": 0.7685185185185185, + "acc_norm_stderr": 0.04077494709252627 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.754601226993865, + "acc_stderr": 0.03380939813943354, + "acc_norm": 0.754601226993865, + "acc_norm_stderr": 0.03380939813943354 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.45535714285714285, + "acc_stderr": 0.047268355537191, + "acc_norm": 0.45535714285714285, + "acc_norm_stderr": 0.047268355537191 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.7475728155339806, + "acc_stderr": 0.04301250399690878, + "acc_norm": 0.7475728155339806, + "acc_norm_stderr": 0.04301250399690878 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.8717948717948718, + "acc_stderr": 0.02190190511507332, + "acc_norm": 0.8717948717948718, + "acc_norm_stderr": 0.02190190511507332 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.72, + "acc_stderr": 0.04512608598542128, + "acc_norm": 0.72, + "acc_norm_stderr": 0.04512608598542128 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.8212005108556832, + "acc_stderr": 0.013702643715368985, + "acc_norm": 
0.8212005108556832, + "acc_norm_stderr": 0.013702643715368985 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.7138728323699421, + "acc_stderr": 0.02433214677913413, + "acc_norm": 0.7138728323699421, + "acc_norm_stderr": 0.02433214677913413 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.36312849162011174, + "acc_stderr": 0.016083749986853697, + "acc_norm": 0.36312849162011174, + "acc_norm_stderr": 0.016083749986853697 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.7320261437908496, + "acc_stderr": 0.025360603796242557, + "acc_norm": 0.7320261437908496, + "acc_norm_stderr": 0.025360603796242557 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.7202572347266881, + "acc_stderr": 0.025494259350694912, + "acc_norm": 0.7202572347266881, + "acc_norm_stderr": 0.025494259350694912 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.7283950617283951, + "acc_stderr": 0.024748624490537368, + "acc_norm": 0.7283950617283951, + "acc_norm_stderr": 0.024748624490537368 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.44680851063829785, + "acc_stderr": 0.029658235097666907, + "acc_norm": 0.44680851063829785, + "acc_norm_stderr": 0.029658235097666907 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.4706649282920469, + "acc_stderr": 0.012748238397365549, + "acc_norm": 0.4706649282920469, + "acc_norm_stderr": 0.012748238397365549 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.6617647058823529, + "acc_stderr": 0.028739328513983576, + "acc_norm": 0.6617647058823529, + "acc_norm_stderr": 0.028739328513983576 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.6633986928104575, + "acc_stderr": 0.019117213911495155, + "acc_norm": 0.6633986928104575, + "acc_norm_stderr": 0.019117213911495155 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.6363636363636364, + "acc_stderr": 0.04607582090719976, + "acc_norm": 0.6363636363636364, + "acc_norm_stderr": 0.04607582090719976 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.7061224489795919, + "acc_stderr": 0.02916273841024977, + "acc_norm": 0.7061224489795919, + "acc_norm_stderr": 0.02916273841024977 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.8407960199004975, + "acc_stderr": 0.02587064676616914, + "acc_norm": 0.8407960199004975, + "acc_norm_stderr": 0.02587064676616914 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.82, + "acc_stderr": 0.03861229196653694, + "acc_norm": 0.82, + "acc_norm_stderr": 0.03861229196653694 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.5240963855421686, + "acc_stderr": 0.03887971849597264, + "acc_norm": 0.5240963855421686, + "acc_norm_stderr": 0.03887971849597264 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.7894736842105263, + "acc_stderr": 0.031267817146631786, + "acc_norm": 0.7894736842105263, + "acc_norm_stderr": 0.031267817146631786 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.3463892288861689, + "mc1_stderr": 0.01665699710912514, + "mc2": 0.49923681207340576, + "mc2_stderr": 0.01551504317540587 + }, + "harness|winogrande|5": { + "acc": 0.7916337805840569, + "acc_stderr": 0.011414554399987726 + }, + "harness|drop|3": { + "em": 0.18278104026845637, + "em_stderr": 0.003957987703151033, + "f1": 0.27069211409396043, + "f1_stderr": 0.004030013722161818 + }, + "harness|gsm8k|5": { + "acc": 0.16224412433661864, + "acc_stderr": 0.010155130880393524 + }, + "all": { + "acc": 0.6253362884117736, + "acc_stderr": 0.03254975101958803, + 
"acc_norm": 0.6343561981840767, + "acc_norm_stderr": 0.0332634036672251, + "mc1": 0.3463892288861689, + "mc1_stderr": 0.01665699710912514, + "mc2": 0.49923681207340576, + "mc2_stderr": 0.01551504317540587, + "em": 0.18278104026845637, + "em_stderr": 0.003957987703151033, + "f1": 0.27069211409396043, + "f1_stderr": 0.004030013722161818 + } + }, + "versions": { + "all": 0, + "harness|arc:challenge|25": 0, + "harness|drop|3": 1, + "harness|gsm8k|5": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "harness|winogrande|5": 0 + }, + "config_tasks": { + "harness|arc:challenge": "LM Harness task", + "harness|drop": "LM Harness task", + "harness|gsm8k": "LM Harness task", + 
"harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + 
"harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|arc:challenge|25": { + "hashes": { + "hash_examples": "17b0cae357c0259e", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "9bcd0d1d37471713", + "hash_cont_tokens": "289aa98c400841d8" + }, + "truncated": 0, + "non_truncated": 1172, + "padded": 4670, + "non_padded": 17, + "effective_few_shots": 25.0, + "num_truncated_few_shots": 0 + }, + "harness|hellaswag|10": { + "hashes": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "80b8c6d79740318e", + "hash_cont_tokens": "ac460260c3e6efc9" + }, + "truncated": 0, + "non_truncated": 10042, + "padded": 40101, + "non_padded": 67, + "effective_few_shots": 10.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hashes": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "b813d36287c6556c", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-anatomy|5": { + "hashes": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "09dc2380497f7a47", + "hash_cont_tokens": "a52a4f60d98cbe5c" + }, + "truncated": 0, + "non_truncated": 135, + "padded": 540, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-astronomy|5": { + "hashes": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "68ca3220b0fdd1f3", + "hash_cont_tokens": "10f7d8eeba97841d" + }, + "truncated": 0, + "non_truncated": 152, + "padded": 608, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-business_ethics|5": { + "hashes": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "bd14ef1320de241e", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hashes": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "d96186ab98017c43", + "hash_cont_tokens": "edef9975ba9165b5" + }, + "truncated": 0, + "non_truncated": 265, + "padded": 1060, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_biology|5": { + "hashes": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "424136b34e95b200", + "hash_cont_tokens": "0aa103ec6602280b" + }, + "truncated": 0, + "non_truncated": 144, + "padded": 576, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_chemistry|5": { + "hashes": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "8dd8b80e336bbe54", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + 
"padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_computer_science|5": { + "hashes": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "145d4cef8ca2261d", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_mathematics|5": { + "hashes": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "561995d32d2b25c4", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_medicine|5": { + "hashes": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "6a258a9d4418599c", + "hash_cont_tokens": "1979021dbc698754" + }, + "truncated": 0, + "non_truncated": 173, + "padded": 692, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_physics|5": { + "hashes": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "fa5e0d5b5f97b66a", + "hash_cont_tokens": "7cf7fe2bab00acbd" + }, + "truncated": 0, + "non_truncated": 102, + "padded": 408, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-computer_security|5": { + "hashes": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "07d27397edfae492", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hashes": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "da5e6c3c8eb17da6", + "hash_cont_tokens": "903f64eed2b0d217" + }, + "truncated": 0, + "non_truncated": 235, + "padded": 940, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-econometrics|5": { + "hashes": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "f6ba8e358bdb523e", + "hash_cont_tokens": "721ae6c5302c4bf2" + }, + "truncated": 0, + "non_truncated": 114, + "padded": 456, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hashes": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "b2459da4c5ca8590", + "hash_cont_tokens": "15a738960ed3e587" + }, + "truncated": 0, + "non_truncated": 145, + "padded": 575, + "non_padded": 5, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hashes": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "0b969d9ad706a13a", + "hash_cont_tokens": "c96470462fc71683" + }, + "truncated": 0, + "non_truncated": 378, + "padded": 1512, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + 
"harness|hendrycksTest-formal_logic|5": { + "hashes": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "02bc3eb5f90da86e", + "hash_cont_tokens": "0e1ce025c9d6ee7e" + }, + "truncated": 0, + "non_truncated": 126, + "padded": 504, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-global_facts|5": { + "hashes": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "3d5106918bcbeb43", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_biology|5": { + "hashes": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "7b089392db2dabbd", + "hash_cont_tokens": "e34d57f7d3c4ca16" + }, + "truncated": 0, + "non_truncated": 310, + "padded": 1240, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hashes": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "ba90b2ffed1c067d", + "hash_cont_tokens": "e8482d44df4b3740" + }, + "truncated": 0, + "non_truncated": 203, + "padded": 812, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hashes": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "60eeec309ef0717f", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hashes": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "5e5e8bf3808e0ead", + "hash_cont_tokens": "d63e679a49418339" + }, + "truncated": 0, + "non_truncated": 165, + "padded": 656, + "non_padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_geography|5": { + "hashes": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "4da9b741d4e7ea78", + "hash_cont_tokens": "d78483e286d06f1a" + }, + "truncated": 0, + "non_truncated": 198, + "padded": 792, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hashes": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "acb4bc872ac86ed7", + "hash_cont_tokens": "691cdff71ff5fe57" + }, + "truncated": 0, + "non_truncated": 193, + "padded": 772, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hashes": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "840fc6403eb69ab0", + "hash_cont_tokens": "d5ad4c5bdca967ad" + }, + "truncated": 0, + "non_truncated": 390, + "padded": 1560, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hashes": { + "hash_examples": 
"1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "3629a7f2cd17faeb", + "hash_cont_tokens": "8f631ca5687dd0d4" + }, + "truncated": 0, + "non_truncated": 270, + "padded": 1080, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hashes": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "6846f684260e3997", + "hash_cont_tokens": "7321048a28451473" + }, + "truncated": 0, + "non_truncated": 238, + "padded": 952, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_physics|5": { + "hashes": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "85aee25d6bdad94a", + "hash_cont_tokens": "bb137581f269861c" + }, + "truncated": 0, + "non_truncated": 151, + "padded": 604, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hashes": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "290b66d6d666a35f", + "hash_cont_tokens": "b455cab2675bd863" + }, + "truncated": 0, + "non_truncated": 545, + "padded": 2180, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hashes": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "a77a7668b437bc82", + "hash_cont_tokens": "1b3196fec7e58037" + }, + "truncated": 0, + "non_truncated": 216, + "padded": 864, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hashes": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "63548c7fa9ba7a78", + "hash_cont_tokens": "a331dedc2aa01b3e" + }, + "truncated": 0, + "non_truncated": 204, + "padded": 816, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hashes": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "83c5da18bfa50812", + "hash_cont_tokens": "d0fbe030b8c8c2bf" + }, + "truncated": 0, + "non_truncated": 237, + "padded": 948, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_aging|5": { + "hashes": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "bebbd11f22006685", + "hash_cont_tokens": "1dd29c3755494850" + }, + "truncated": 0, + "non_truncated": 223, + "padded": 892, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_sexuality|5": { + "hashes": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "7b85ee9b8ee54f4f", + "hash_cont_tokens": "c85573f663c10691" + }, + "truncated": 0, + "non_truncated": 131, + "padded": 524, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-international_law|5": { + "hashes": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "7bfc55ab7065943e", + 
"hash_cont_tokens": "d263804ba918154f" + }, + "truncated": 0, + "non_truncated": 121, + "padded": 484, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-jurisprudence|5": { + "hashes": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "69573f1675e053c6", + "hash_cont_tokens": "581986691a84ece8" + }, + "truncated": 0, + "non_truncated": 108, + "padded": 432, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hashes": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "552324ef20094bdc", + "hash_cont_tokens": "55a858b28bbda458" + }, + "truncated": 0, + "non_truncated": 163, + "padded": 652, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-machine_learning|5": { + "hashes": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "96449357a7318905", + "hash_cont_tokens": "e99d3d3efd4ac7a3" + }, + "truncated": 0, + "non_truncated": 112, + "padded": 448, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-management|5": { + "hashes": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "3b849249168e3b88", + "hash_cont_tokens": "13d9dc56bca34726" + }, + "truncated": 0, + "non_truncated": 103, + "padded": 412, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-marketing|5": { + "hashes": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "af0e186f2756b70d", + "hash_cont_tokens": "2700ea26933916a2" + }, + "truncated": 0, + "non_truncated": 234, + "padded": 936, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-medical_genetics|5": { + "hashes": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "9f6a6de16509b6d9", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-miscellaneous|5": { + "hashes": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "9194406d589f7c10", + "hash_cont_tokens": "7bf4341c79587250" + }, + "truncated": 0, + "non_truncated": 783, + "padded": 3132, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_disputes|5": { + "hashes": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "769486efc74d9f8e", + "hash_cont_tokens": "38a48e9de6976f00" + }, + "truncated": 0, + "non_truncated": 346, + "padded": 1384, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hashes": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "a90fd4dd90959dad", + "hash_cont_tokens": "761c4dc187689d89" + }, + "truncated": 0, + "non_truncated": 895, + "padded": 3580, + "non_padded": 0, + "effective_few_shots": 5.0, + 
"num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-nutrition|5": { + "hashes": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "1a3b843e66efd29b", + "hash_cont_tokens": "65005bd7d6f6012a" + }, + "truncated": 0, + "non_truncated": 306, + "padded": 1224, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-philosophy|5": { + "hashes": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "09820001a3d00013", + "hash_cont_tokens": "0b47934fb6314dec" + }, + "truncated": 0, + "non_truncated": 311, + "padded": 1244, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-prehistory|5": { + "hashes": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "7c4ec364ce2768c7", + "hash_cont_tokens": "3f20acd855ee0a29" + }, + "truncated": 0, + "non_truncated": 324, + "padded": 1296, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_accounting|5": { + "hashes": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "ced0534574d0ae3f", + "hash_cont_tokens": "8f122ba881355d4b" + }, + "truncated": 0, + "non_truncated": 282, + "padded": 1128, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_law|5": { + "hashes": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "bcbdbbde22ec73e3", + "hash_cont_tokens": "90d5df417c4d3fd3" + }, + "truncated": 0, + "non_truncated": 1534, + "padded": 6136, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_medicine|5": { + "hashes": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "c54d753563114d45", + "hash_cont_tokens": "4a2d2988884f7f70" + }, + "truncated": 0, + "non_truncated": 272, + "padded": 1088, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_psychology|5": { + "hashes": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "b75dc55c0e32fa52", + "hash_cont_tokens": "e0a952cb8a9c81de" + }, + "truncated": 0, + "non_truncated": 612, + "padded": 2448, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-public_relations|5": { + "hashes": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "5ccdc8ec8db99622", + "hash_cont_tokens": "1fa77a8dff3922b8" + }, + "truncated": 0, + "non_truncated": 110, + "padded": 440, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-security_studies|5": { + "hashes": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "ca8497342e5b1d57", + "hash_cont_tokens": "81fc9cb3cbdd52db" + }, + "truncated": 0, + "non_truncated": 245, + "padded": 980, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-sociology|5": { + "hashes": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": 
"1a1fc00e17b3a52a", + "hash_input_tokens": "069c76424fbd3dab", + "hash_cont_tokens": "2a0493252ed2cf43" + }, + "truncated": 0, + "non_truncated": 201, + "padded": 804, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hashes": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "a7e393a626169576", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-virology|5": { + "hashes": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "bf99dc973e3a650d", + "hash_cont_tokens": "5ab892d003b00c98" + }, + "truncated": 0, + "non_truncated": 166, + "padded": 664, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-world_religions|5": { + "hashes": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "1761cfaf21797065", + "hash_cont_tokens": "15a5e5dbdfbb8568" + }, + "truncated": 0, + "non_truncated": 171, + "padded": 684, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|truthfulqa:mc|0": { + "hashes": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "298b43914bbdf4ca", + "hash_cont_tokens": "5a8d4bb398b1c3c0" + }, + "truncated": 0, + "non_truncated": 817, + "padded": 9996, + "non_padded": 0, + "effective_few_shots": 0.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "31aa3477d959f771", + "hash_cont_tokens": "618558fb93c0f288" + }, + "truncated": 0, + "non_truncated": 1267, + "padded": 2534, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|drop|3": { + "hashes": { + "hash_examples": "1d27416e8324e9a3", + "hash_full_prompts": "a5513ff9a741b385", + "hash_input_tokens": "a4fb946366902edf", + "hash_cont_tokens": "4c9e05e8b8652e73" + }, + "truncated": 0, + "non_truncated": 9536, + "padded": 0, + "non_padded": 9536, + "effective_few_shots": 3.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "6af0ae8cfe684f50", + "hash_cont_tokens": "2f62b9c23527d617" + }, + "truncated": 0, + "non_truncated": 1319, + "padded": 0, + "non_padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "4eb459f19fc0f29d", + "hash_full_prompts": "21653ed56f202b4e", + "hash_input_tokens": "0ce409b3d436569d", + "hash_cont_tokens": "168c7115ff5596b3" + }, + "truncated": 0, + "non_truncated": 38195, + "padded": 113460, + "non_padded": 10948, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/migtissera/Tess-XS-v1.1/results_2023-11-23T08-39-10.846213.json b/eval-results/migtissera/Tess-XS-v1.1/results_2023-11-23T08-39-10.846213.json new file mode 100644 index 0000000000000000000000000000000000000000..f140569e02b42c06aca7005509d50a9635a835ef --- /dev/null +++ b/eval-results/migtissera/Tess-XS-v1.1/results_2023-11-23T08-39-10.846213.json @@ -0,0 
+1,1435 @@ +{ + "config_general": { + "lighteval_sha": "9ffc410f6c40b8cfefe7167cb47aefe69ced61e1", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "", + "start_time": 564222.639814126, + "end_time": 577578.779256475, + "total_evaluation_time_secondes": "13356.139442349086", + "model_name": "migtissera/Tess-XS-v1.1", + "model_sha": "e8850e534a3a9f602f72201b09c7ef8f879c1c0b", + "model_dtype": "torch.float16", + "model_size": "13.99 GB" + }, + "results": { + "harness|arc:challenge|25": { + "acc": 0.5930034129692833, + "acc_stderr": 0.014356399418009126, + "acc_norm": 0.6390784982935154, + "acc_norm_stderr": 0.014034761386175452 + }, + "harness|hellaswag|10": { + "acc": 0.6512646883091018, + "acc_stderr": 0.004755960559929163, + "acc_norm": 0.8405696076478789, + "acc_norm_stderr": 0.003653288043555801 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.32, + "acc_stderr": 0.046882617226215034, + "acc_norm": 0.32, + "acc_norm_stderr": 0.046882617226215034 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.6148148148148148, + "acc_stderr": 0.04203921040156279, + "acc_norm": 0.6148148148148148, + "acc_norm_stderr": 0.04203921040156279 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.6776315789473685, + "acc_stderr": 0.03803510248351585, + "acc_norm": 0.6776315789473685, + "acc_norm_stderr": 0.03803510248351585 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.61, + "acc_stderr": 0.049020713000019756, + "acc_norm": 0.61, + "acc_norm_stderr": 0.049020713000019756 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.6867924528301886, + "acc_stderr": 0.028544793319055326, + "acc_norm": 0.6867924528301886, + "acc_norm_stderr": 0.028544793319055326 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.6944444444444444, + "acc_stderr": 0.03852084696008534, + "acc_norm": 0.6944444444444444, + "acc_norm_stderr": 0.03852084696008534 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.45, + "acc_stderr": 0.05, + "acc_norm": 0.45, + "acc_norm_stderr": 0.05 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.51, + "acc_stderr": 0.05024183937956912, + "acc_norm": 0.51, + "acc_norm_stderr": 0.05024183937956912 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.38, + "acc_stderr": 0.04878317312145633, + "acc_norm": 0.38, + "acc_norm_stderr": 0.04878317312145633 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.6242774566473989, + "acc_stderr": 0.036928207672648664, + "acc_norm": 0.6242774566473989, + "acc_norm_stderr": 0.036928207672648664 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.3627450980392157, + "acc_stderr": 0.047840607041056527, + "acc_norm": 0.3627450980392157, + "acc_norm_stderr": 0.047840607041056527 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.8, + "acc_stderr": 0.04020151261036846, + "acc_norm": 0.8, + "acc_norm_stderr": 0.04020151261036846 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.5404255319148936, + "acc_stderr": 0.03257901482099835, + "acc_norm": 0.5404255319148936, + "acc_norm_stderr": 0.03257901482099835 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.45614035087719296, + "acc_stderr": 0.046854730419077895, + "acc_norm": 0.45614035087719296, + "acc_norm_stderr": 0.046854730419077895 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.5379310344827586, + "acc_stderr": 0.04154659671707548, + "acc_norm": 
0.5379310344827586, + "acc_norm_stderr": 0.04154659671707548 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.4126984126984127, + "acc_stderr": 0.02535574126305527, + "acc_norm": 0.4126984126984127, + "acc_norm_stderr": 0.02535574126305527 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.4523809523809524, + "acc_stderr": 0.04451807959055328, + "acc_norm": 0.4523809523809524, + "acc_norm_stderr": 0.04451807959055328 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.35, + "acc_stderr": 0.047937248544110196, + "acc_norm": 0.35, + "acc_norm_stderr": 0.047937248544110196 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.7516129032258064, + "acc_stderr": 0.024580028921481003, + "acc_norm": 0.7516129032258064, + "acc_norm_stderr": 0.024580028921481003 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.4975369458128079, + "acc_stderr": 0.03517945038691063, + "acc_norm": 0.4975369458128079, + "acc_norm_stderr": 0.03517945038691063 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.68, + "acc_stderr": 0.04688261722621505, + "acc_norm": 0.68, + "acc_norm_stderr": 0.04688261722621505 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.7515151515151515, + "acc_stderr": 0.033744026441394036, + "acc_norm": 0.7515151515151515, + "acc_norm_stderr": 0.033744026441394036 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.7878787878787878, + "acc_stderr": 0.029126522834586808, + "acc_norm": 0.7878787878787878, + "acc_norm_stderr": 0.029126522834586808 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.8756476683937824, + "acc_stderr": 0.023814477086593542, + "acc_norm": 0.8756476683937824, + "acc_norm_stderr": 0.023814477086593542 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.6641025641025641, + "acc_stderr": 0.023946724741563976, + "acc_norm": 0.6641025641025641, + "acc_norm_stderr": 0.023946724741563976 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.34444444444444444, + "acc_stderr": 0.02897264888484427, + "acc_norm": 0.34444444444444444, + "acc_norm_stderr": 0.02897264888484427 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.6932773109243697, + "acc_stderr": 0.02995382389188704, + "acc_norm": 0.6932773109243697, + "acc_norm_stderr": 0.02995382389188704 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.3443708609271523, + "acc_stderr": 0.03879687024073327, + "acc_norm": 0.3443708609271523, + "acc_norm_stderr": 0.03879687024073327 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.8220183486238533, + "acc_stderr": 0.016399436366612917, + "acc_norm": 0.8220183486238533, + "acc_norm_stderr": 0.016399436366612917 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.5, + "acc_stderr": 0.034099716973523674, + "acc_norm": 0.5, + "acc_norm_stderr": 0.034099716973523674 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.8235294117647058, + "acc_stderr": 0.026756401538078962, + "acc_norm": 0.8235294117647058, + "acc_norm_stderr": 0.026756401538078962 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.7721518987341772, + "acc_stderr": 0.027303484599069425, + "acc_norm": 0.7721518987341772, + "acc_norm_stderr": 0.027303484599069425 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.6860986547085202, + "acc_stderr": 0.031146796482972465, + "acc_norm": 0.6860986547085202, 
+ "acc_norm_stderr": 0.031146796482972465 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.7480916030534351, + "acc_stderr": 0.03807387116306086, + "acc_norm": 0.7480916030534351, + "acc_norm_stderr": 0.03807387116306086 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.768595041322314, + "acc_stderr": 0.03849856098794088, + "acc_norm": 0.768595041322314, + "acc_norm_stderr": 0.03849856098794088 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.7685185185185185, + "acc_stderr": 0.04077494709252627, + "acc_norm": 0.7685185185185185, + "acc_norm_stderr": 0.04077494709252627 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.754601226993865, + "acc_stderr": 0.03380939813943354, + "acc_norm": 0.754601226993865, + "acc_norm_stderr": 0.03380939813943354 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.45535714285714285, + "acc_stderr": 0.047268355537191, + "acc_norm": 0.45535714285714285, + "acc_norm_stderr": 0.047268355537191 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.7475728155339806, + "acc_stderr": 0.04301250399690878, + "acc_norm": 0.7475728155339806, + "acc_norm_stderr": 0.04301250399690878 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.8717948717948718, + "acc_stderr": 0.02190190511507332, + "acc_norm": 0.8717948717948718, + "acc_norm_stderr": 0.02190190511507332 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.72, + "acc_stderr": 0.04512608598542128, + "acc_norm": 0.72, + "acc_norm_stderr": 0.04512608598542128 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.8212005108556832, + "acc_stderr": 0.013702643715368985, + "acc_norm": 0.8212005108556832, + "acc_norm_stderr": 0.013702643715368985 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.7138728323699421, + "acc_stderr": 0.02433214677913413, + "acc_norm": 0.7138728323699421, + "acc_norm_stderr": 0.02433214677913413 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.36312849162011174, + "acc_stderr": 0.016083749986853697, + "acc_norm": 0.36312849162011174, + "acc_norm_stderr": 0.016083749986853697 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.7320261437908496, + "acc_stderr": 0.025360603796242557, + "acc_norm": 0.7320261437908496, + "acc_norm_stderr": 0.025360603796242557 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.7202572347266881, + "acc_stderr": 0.025494259350694912, + "acc_norm": 0.7202572347266881, + "acc_norm_stderr": 0.025494259350694912 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.7283950617283951, + "acc_stderr": 0.024748624490537368, + "acc_norm": 0.7283950617283951, + "acc_norm_stderr": 0.024748624490537368 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.44680851063829785, + "acc_stderr": 0.029658235097666907, + "acc_norm": 0.44680851063829785, + "acc_norm_stderr": 0.029658235097666907 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.4706649282920469, + "acc_stderr": 0.012748238397365549, + "acc_norm": 0.4706649282920469, + "acc_norm_stderr": 0.012748238397365549 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.6617647058823529, + "acc_stderr": 0.028739328513983576, + "acc_norm": 0.6617647058823529, + "acc_norm_stderr": 0.028739328513983576 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.6633986928104575, + "acc_stderr": 0.019117213911495155, + "acc_norm": 0.6633986928104575, + "acc_norm_stderr": 0.019117213911495155 + }, + "harness|hendrycksTest-public_relations|5": { 
+ "acc": 0.6363636363636364, + "acc_stderr": 0.04607582090719976, + "acc_norm": 0.6363636363636364, + "acc_norm_stderr": 0.04607582090719976 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.7061224489795919, + "acc_stderr": 0.02916273841024977, + "acc_norm": 0.7061224489795919, + "acc_norm_stderr": 0.02916273841024977 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.8407960199004975, + "acc_stderr": 0.02587064676616914, + "acc_norm": 0.8407960199004975, + "acc_norm_stderr": 0.02587064676616914 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.82, + "acc_stderr": 0.03861229196653694, + "acc_norm": 0.82, + "acc_norm_stderr": 0.03861229196653694 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.5240963855421686, + "acc_stderr": 0.03887971849597264, + "acc_norm": 0.5240963855421686, + "acc_norm_stderr": 0.03887971849597264 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.7894736842105263, + "acc_stderr": 0.031267817146631786, + "acc_norm": 0.7894736842105263, + "acc_norm_stderr": 0.031267817146631786 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.3463892288861689, + "mc1_stderr": 0.01665699710912514, + "mc2": 0.49923681207340576, + "mc2_stderr": 0.01551504317540587 + }, + "harness|winogrande|5": { + "acc": 0.7916337805840569, + "acc_stderr": 0.011414554399987726 + }, + "harness|drop|3": { + "em": 0.18278104026845637, + "em_stderr": 0.003957987703151033, + "f1": 0.27069211409396043, + "f1_stderr": 0.004030013722161818 + }, + "harness|gsm8k|5": { + "acc": 0.16224412433661864, + "acc_stderr": 0.010155130880393524 + }, + "all": { + "acc": 0.6253362884117736, + "acc_stderr": 0.03254975101958803, + "acc_norm": 0.6343561981840767, + "acc_norm_stderr": 0.0332634036672251, + "mc1": 0.3463892288861689, + "mc1_stderr": 0.01665699710912514, + "mc2": 0.49923681207340576, + "mc2_stderr": 0.01551504317540587, + "em": 0.18278104026845637, + "em_stderr": 0.003957987703151033, + "f1": 0.27069211409396043, + "f1_stderr": 0.004030013722161818 + } + }, + "versions": { + "all": 0, + "harness|arc:challenge|25": 0, + "harness|drop|3": 1, + "harness|gsm8k|5": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + 
"harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "harness|winogrande|5": 0 + }, + "config_tasks": { + "harness|arc:challenge": "LM Harness task", + "harness|drop": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + 
"harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|arc:challenge|25": { + "hashes": { + "hash_examples": "17b0cae357c0259e", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "9bcd0d1d37471713", + "hash_cont_tokens": "289aa98c400841d8" + }, + "truncated": 0, + "non_truncated": 1172, + "padded": 4670, + "non_padded": 17, + "effective_few_shots": 25.0, + "num_truncated_few_shots": 0 + }, + "harness|hellaswag|10": { + "hashes": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "80b8c6d79740318e", + "hash_cont_tokens": "ac460260c3e6efc9" + }, + "truncated": 0, + "non_truncated": 10042, + "padded": 40101, + "non_padded": 67, + "effective_few_shots": 10.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hashes": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "b813d36287c6556c", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-anatomy|5": { + "hashes": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "09dc2380497f7a47", + "hash_cont_tokens": "a52a4f60d98cbe5c" + }, + "truncated": 0, + "non_truncated": 135, + "padded": 540, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-astronomy|5": { + "hashes": { + "hash_examples": 
"7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "68ca3220b0fdd1f3", + "hash_cont_tokens": "10f7d8eeba97841d" + }, + "truncated": 0, + "non_truncated": 152, + "padded": 608, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-business_ethics|5": { + "hashes": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "bd14ef1320de241e", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hashes": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "d96186ab98017c43", + "hash_cont_tokens": "edef9975ba9165b5" + }, + "truncated": 0, + "non_truncated": 265, + "padded": 1060, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_biology|5": { + "hashes": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "424136b34e95b200", + "hash_cont_tokens": "0aa103ec6602280b" + }, + "truncated": 0, + "non_truncated": 144, + "padded": 576, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_chemistry|5": { + "hashes": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "8dd8b80e336bbe54", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_computer_science|5": { + "hashes": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "145d4cef8ca2261d", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_mathematics|5": { + "hashes": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "561995d32d2b25c4", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_medicine|5": { + "hashes": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "6a258a9d4418599c", + "hash_cont_tokens": "1979021dbc698754" + }, + "truncated": 0, + "non_truncated": 173, + "padded": 692, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_physics|5": { + "hashes": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "fa5e0d5b5f97b66a", + "hash_cont_tokens": "7cf7fe2bab00acbd" + }, + "truncated": 0, + "non_truncated": 102, + "padded": 408, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-computer_security|5": { + "hashes": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "07d27397edfae492", + "hash_cont_tokens": "17b868b63507f9a3" + 
}, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hashes": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "da5e6c3c8eb17da6", + "hash_cont_tokens": "903f64eed2b0d217" + }, + "truncated": 0, + "non_truncated": 235, + "padded": 940, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-econometrics|5": { + "hashes": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "f6ba8e358bdb523e", + "hash_cont_tokens": "721ae6c5302c4bf2" + }, + "truncated": 0, + "non_truncated": 114, + "padded": 456, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hashes": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "b2459da4c5ca8590", + "hash_cont_tokens": "15a738960ed3e587" + }, + "truncated": 0, + "non_truncated": 145, + "padded": 575, + "non_padded": 5, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hashes": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "0b969d9ad706a13a", + "hash_cont_tokens": "c96470462fc71683" + }, + "truncated": 0, + "non_truncated": 378, + "padded": 1512, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-formal_logic|5": { + "hashes": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "02bc3eb5f90da86e", + "hash_cont_tokens": "0e1ce025c9d6ee7e" + }, + "truncated": 0, + "non_truncated": 126, + "padded": 504, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-global_facts|5": { + "hashes": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "3d5106918bcbeb43", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_biology|5": { + "hashes": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "7b089392db2dabbd", + "hash_cont_tokens": "e34d57f7d3c4ca16" + }, + "truncated": 0, + "non_truncated": 310, + "padded": 1240, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hashes": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "ba90b2ffed1c067d", + "hash_cont_tokens": "e8482d44df4b3740" + }, + "truncated": 0, + "non_truncated": 203, + "padded": 812, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hashes": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "60eeec309ef0717f", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + 
"num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hashes": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "5e5e8bf3808e0ead", + "hash_cont_tokens": "d63e679a49418339" + }, + "truncated": 0, + "non_truncated": 165, + "padded": 656, + "non_padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_geography|5": { + "hashes": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "4da9b741d4e7ea78", + "hash_cont_tokens": "d78483e286d06f1a" + }, + "truncated": 0, + "non_truncated": 198, + "padded": 792, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hashes": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "acb4bc872ac86ed7", + "hash_cont_tokens": "691cdff71ff5fe57" + }, + "truncated": 0, + "non_truncated": 193, + "padded": 772, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hashes": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "840fc6403eb69ab0", + "hash_cont_tokens": "d5ad4c5bdca967ad" + }, + "truncated": 0, + "non_truncated": 390, + "padded": 1560, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hashes": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "3629a7f2cd17faeb", + "hash_cont_tokens": "8f631ca5687dd0d4" + }, + "truncated": 0, + "non_truncated": 270, + "padded": 1080, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hashes": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "6846f684260e3997", + "hash_cont_tokens": "7321048a28451473" + }, + "truncated": 0, + "non_truncated": 238, + "padded": 952, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_physics|5": { + "hashes": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "85aee25d6bdad94a", + "hash_cont_tokens": "bb137581f269861c" + }, + "truncated": 0, + "non_truncated": 151, + "padded": 604, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hashes": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "290b66d6d666a35f", + "hash_cont_tokens": "b455cab2675bd863" + }, + "truncated": 0, + "non_truncated": 545, + "padded": 2180, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hashes": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "a77a7668b437bc82", + "hash_cont_tokens": "1b3196fec7e58037" + }, + "truncated": 0, + "non_truncated": 216, + "padded": 864, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + 
"harness|hendrycksTest-high_school_us_history|5": { + "hashes": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "63548c7fa9ba7a78", + "hash_cont_tokens": "a331dedc2aa01b3e" + }, + "truncated": 0, + "non_truncated": 204, + "padded": 816, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hashes": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "83c5da18bfa50812", + "hash_cont_tokens": "d0fbe030b8c8c2bf" + }, + "truncated": 0, + "non_truncated": 237, + "padded": 948, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_aging|5": { + "hashes": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "bebbd11f22006685", + "hash_cont_tokens": "1dd29c3755494850" + }, + "truncated": 0, + "non_truncated": 223, + "padded": 892, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_sexuality|5": { + "hashes": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "7b85ee9b8ee54f4f", + "hash_cont_tokens": "c85573f663c10691" + }, + "truncated": 0, + "non_truncated": 131, + "padded": 524, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-international_law|5": { + "hashes": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "7bfc55ab7065943e", + "hash_cont_tokens": "d263804ba918154f" + }, + "truncated": 0, + "non_truncated": 121, + "padded": 484, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-jurisprudence|5": { + "hashes": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "69573f1675e053c6", + "hash_cont_tokens": "581986691a84ece8" + }, + "truncated": 0, + "non_truncated": 108, + "padded": 432, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hashes": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "552324ef20094bdc", + "hash_cont_tokens": "55a858b28bbda458" + }, + "truncated": 0, + "non_truncated": 163, + "padded": 652, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-machine_learning|5": { + "hashes": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "96449357a7318905", + "hash_cont_tokens": "e99d3d3efd4ac7a3" + }, + "truncated": 0, + "non_truncated": 112, + "padded": 448, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-management|5": { + "hashes": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "3b849249168e3b88", + "hash_cont_tokens": "13d9dc56bca34726" + }, + "truncated": 0, + "non_truncated": 103, + "padded": 412, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-marketing|5": { + "hashes": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": 
"af0e186f2756b70d", + "hash_cont_tokens": "2700ea26933916a2" + }, + "truncated": 0, + "non_truncated": 234, + "padded": 936, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-medical_genetics|5": { + "hashes": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "9f6a6de16509b6d9", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-miscellaneous|5": { + "hashes": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "9194406d589f7c10", + "hash_cont_tokens": "7bf4341c79587250" + }, + "truncated": 0, + "non_truncated": 783, + "padded": 3132, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_disputes|5": { + "hashes": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "769486efc74d9f8e", + "hash_cont_tokens": "38a48e9de6976f00" + }, + "truncated": 0, + "non_truncated": 346, + "padded": 1384, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hashes": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "a90fd4dd90959dad", + "hash_cont_tokens": "761c4dc187689d89" + }, + "truncated": 0, + "non_truncated": 895, + "padded": 3580, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-nutrition|5": { + "hashes": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "1a3b843e66efd29b", + "hash_cont_tokens": "65005bd7d6f6012a" + }, + "truncated": 0, + "non_truncated": 306, + "padded": 1224, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-philosophy|5": { + "hashes": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "09820001a3d00013", + "hash_cont_tokens": "0b47934fb6314dec" + }, + "truncated": 0, + "non_truncated": 311, + "padded": 1244, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-prehistory|5": { + "hashes": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "7c4ec364ce2768c7", + "hash_cont_tokens": "3f20acd855ee0a29" + }, + "truncated": 0, + "non_truncated": 324, + "padded": 1296, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_accounting|5": { + "hashes": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "ced0534574d0ae3f", + "hash_cont_tokens": "8f122ba881355d4b" + }, + "truncated": 0, + "non_truncated": 282, + "padded": 1128, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_law|5": { + "hashes": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "bcbdbbde22ec73e3", + "hash_cont_tokens": "90d5df417c4d3fd3" + }, + "truncated": 0, + "non_truncated": 1534, + "padded": 6136, + "non_padded": 0, + "effective_few_shots": 
5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_medicine|5": { + "hashes": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "c54d753563114d45", + "hash_cont_tokens": "4a2d2988884f7f70" + }, + "truncated": 0, + "non_truncated": 272, + "padded": 1088, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_psychology|5": { + "hashes": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "b75dc55c0e32fa52", + "hash_cont_tokens": "e0a952cb8a9c81de" + }, + "truncated": 0, + "non_truncated": 612, + "padded": 2448, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-public_relations|5": { + "hashes": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "5ccdc8ec8db99622", + "hash_cont_tokens": "1fa77a8dff3922b8" + }, + "truncated": 0, + "non_truncated": 110, + "padded": 440, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-security_studies|5": { + "hashes": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "ca8497342e5b1d57", + "hash_cont_tokens": "81fc9cb3cbdd52db" + }, + "truncated": 0, + "non_truncated": 245, + "padded": 980, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-sociology|5": { + "hashes": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "069c76424fbd3dab", + "hash_cont_tokens": "2a0493252ed2cf43" + }, + "truncated": 0, + "non_truncated": 201, + "padded": 804, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hashes": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "a7e393a626169576", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-virology|5": { + "hashes": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "bf99dc973e3a650d", + "hash_cont_tokens": "5ab892d003b00c98" + }, + "truncated": 0, + "non_truncated": 166, + "padded": 664, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-world_religions|5": { + "hashes": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "1761cfaf21797065", + "hash_cont_tokens": "15a5e5dbdfbb8568" + }, + "truncated": 0, + "non_truncated": 171, + "padded": 684, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|truthfulqa:mc|0": { + "hashes": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "298b43914bbdf4ca", + "hash_cont_tokens": "5a8d4bb398b1c3c0" + }, + "truncated": 0, + "non_truncated": 817, + "padded": 9996, + "non_padded": 0, + "effective_few_shots": 0.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + 
"hash_input_tokens": "31aa3477d959f771", + "hash_cont_tokens": "618558fb93c0f288" + }, + "truncated": 0, + "non_truncated": 1267, + "padded": 2534, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|drop|3": { + "hashes": { + "hash_examples": "1d27416e8324e9a3", + "hash_full_prompts": "a5513ff9a741b385", + "hash_input_tokens": "a4fb946366902edf", + "hash_cont_tokens": "4c9e05e8b8652e73" + }, + "truncated": 0, + "non_truncated": 9536, + "padded": 0, + "non_padded": 9536, + "effective_few_shots": 3.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "6af0ae8cfe684f50", + "hash_cont_tokens": "2f62b9c23527d617" + }, + "truncated": 0, + "non_truncated": 1319, + "padded": 0, + "non_padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "4eb459f19fc0f29d", + "hash_full_prompts": "21653ed56f202b4e", + "hash_input_tokens": "0ce409b3d436569d", + "hash_cont_tokens": "168c7115ff5596b3" + }, + "truncated": 0, + "non_truncated": 38195, + "padded": 113460, + "non_padded": 10948, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/mindy-labs/mindy-7b/results_2023-12-16T16-55-36.192402.json b/eval-results/mindy-labs/mindy-7b/results_2023-12-16T16-55-36.192402.json new file mode 100644 index 0000000000000000000000000000000000000000..0dbf9d47511953c95f5c21149d0f91a7dbdd46de --- /dev/null +++ b/eval-results/mindy-labs/mindy-7b/results_2023-12-16T16-55-36.192402.json @@ -0,0 +1,1409 @@ +{ + "config_general": { + "lighteval_sha": "0e4607eff593f6f842aeaa0e5fa6760f58b9d1e9", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "", + "start_time": 369283.01781278, + "end_time": 376703.109780842, + "total_evaluation_time_secondes": "7420.0919680619845", + "model_name": "mindy-labs/mindy-7b", + "model_sha": "ce0d461a6de81d5b8ec4d338fb0c6e7991d0b1ff", + "model_dtype": "torch.bfloat16", + "model_size": "13.99 GB" + }, + "results": { + "harness|arc:challenge|25": { + "acc": 0.658703071672355, + "acc_stderr": 0.013855831287497723, + "acc_norm": 0.6911262798634812, + "acc_norm_stderr": 0.013501770929344003 + }, + "harness|hellaswag|10": { + "acc": 0.6795459071898028, + "acc_stderr": 0.004656974162147996, + "acc_norm": 0.86566421031667, + "acc_norm_stderr": 0.003403158010309538 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.31, + "acc_stderr": 0.04648231987117316, + "acc_norm": 0.31, + "acc_norm_stderr": 0.04648231987117316 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.6444444444444445, + "acc_stderr": 0.04135176749720385, + "acc_norm": 0.6444444444444445, + "acc_norm_stderr": 0.04135176749720385 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.6907894736842105, + "acc_stderr": 0.037610708698674805, + "acc_norm": 0.6907894736842105, + "acc_norm_stderr": 0.037610708698674805 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.64, + "acc_stderr": 0.04824181513244218, + "acc_norm": 0.64, + "acc_norm_stderr": 0.04824181513244218 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.720754716981132, + "acc_stderr": 0.027611163402399715, + "acc_norm": 0.720754716981132, + "acc_norm_stderr": 0.027611163402399715 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.7638888888888888, + "acc_stderr": 
0.03551446610810826, + "acc_norm": 0.7638888888888888, + "acc_norm_stderr": 0.03551446610810826 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.47, + "acc_stderr": 0.05016135580465919, + "acc_norm": 0.47, + "acc_norm_stderr": 0.05016135580465919 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.53, + "acc_stderr": 0.050161355804659205, + "acc_norm": 0.53, + "acc_norm_stderr": 0.050161355804659205 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.36, + "acc_stderr": 0.048241815132442176, + "acc_norm": 0.36, + "acc_norm_stderr": 0.048241815132442176 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.6589595375722543, + "acc_stderr": 0.036146654241808254, + "acc_norm": 0.6589595375722543, + "acc_norm_stderr": 0.036146654241808254 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.4117647058823529, + "acc_stderr": 0.04897104952726366, + "acc_norm": 0.4117647058823529, + "acc_norm_stderr": 0.04897104952726366 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.77, + "acc_stderr": 0.04229525846816508, + "acc_norm": 0.77, + "acc_norm_stderr": 0.04229525846816508 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.5914893617021276, + "acc_stderr": 0.032134180267015755, + "acc_norm": 0.5914893617021276, + "acc_norm_stderr": 0.032134180267015755 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.5, + "acc_stderr": 0.047036043419179864, + "acc_norm": 0.5, + "acc_norm_stderr": 0.047036043419179864 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.5310344827586206, + "acc_stderr": 0.04158632762097828, + "acc_norm": 0.5310344827586206, + "acc_norm_stderr": 0.04158632762097828 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.4365079365079365, + "acc_stderr": 0.0255428468174005, + "acc_norm": 0.4365079365079365, + "acc_norm_stderr": 0.0255428468174005 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.4365079365079365, + "acc_stderr": 0.04435932892851466, + "acc_norm": 0.4365079365079365, + "acc_norm_stderr": 0.04435932892851466 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.36, + "acc_stderr": 0.048241815132442176, + "acc_norm": 0.36, + "acc_norm_stderr": 0.048241815132442176 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.7709677419354839, + "acc_stderr": 0.02390491431178265, + "acc_norm": 0.7709677419354839, + "acc_norm_stderr": 0.02390491431178265 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.4975369458128079, + "acc_stderr": 0.03517945038691063, + "acc_norm": 0.4975369458128079, + "acc_norm_stderr": 0.03517945038691063 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.72, + "acc_stderr": 0.04512608598542127, + "acc_norm": 0.72, + "acc_norm_stderr": 0.04512608598542127 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.7757575757575758, + "acc_stderr": 0.03256866661681102, + "acc_norm": 0.7757575757575758, + "acc_norm_stderr": 0.03256866661681102 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.7929292929292929, + "acc_stderr": 0.02886977846026705, + "acc_norm": 0.7929292929292929, + "acc_norm_stderr": 0.02886977846026705 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.8860103626943006, + "acc_stderr": 0.022935144053919436, + "acc_norm": 0.8860103626943006, + "acc_norm_stderr": 0.022935144053919436 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 
0.6666666666666666, + "acc_stderr": 0.023901157979402534, + "acc_norm": 0.6666666666666666, + "acc_norm_stderr": 0.023901157979402534 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.3592592592592593, + "acc_stderr": 0.029252905927251972, + "acc_norm": 0.3592592592592593, + "acc_norm_stderr": 0.029252905927251972 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.6848739495798319, + "acc_stderr": 0.030176808288974337, + "acc_norm": 0.6848739495798319, + "acc_norm_stderr": 0.030176808288974337 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.31125827814569534, + "acc_stderr": 0.03780445850526732, + "acc_norm": 0.31125827814569534, + "acc_norm_stderr": 0.03780445850526732 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.8568807339449541, + "acc_stderr": 0.015014462497168589, + "acc_norm": 0.8568807339449541, + "acc_norm_stderr": 0.015014462497168589 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.5138888888888888, + "acc_stderr": 0.03408655867977749, + "acc_norm": 0.5138888888888888, + "acc_norm_stderr": 0.03408655867977749 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.8235294117647058, + "acc_stderr": 0.026756401538078966, + "acc_norm": 0.8235294117647058, + "acc_norm_stderr": 0.026756401538078966 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.7932489451476793, + "acc_stderr": 0.0263616516683891, + "acc_norm": 0.7932489451476793, + "acc_norm_stderr": 0.0263616516683891 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.695067264573991, + "acc_stderr": 0.030898610882477515, + "acc_norm": 0.695067264573991, + "acc_norm_stderr": 0.030898610882477515 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.7786259541984732, + "acc_stderr": 0.03641297081313729, + "acc_norm": 0.7786259541984732, + "acc_norm_stderr": 0.03641297081313729 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.7851239669421488, + "acc_stderr": 0.037494924487096966, + "acc_norm": 0.7851239669421488, + "acc_norm_stderr": 0.037494924487096966 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.7777777777777778, + "acc_stderr": 0.040191074725573483, + "acc_norm": 0.7777777777777778, + "acc_norm_stderr": 0.040191074725573483 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.7668711656441718, + "acc_stderr": 0.0332201579577674, + "acc_norm": 0.7668711656441718, + "acc_norm_stderr": 0.0332201579577674 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.45535714285714285, + "acc_stderr": 0.047268355537191, + "acc_norm": 0.45535714285714285, + "acc_norm_stderr": 0.047268355537191 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.7766990291262136, + "acc_stderr": 0.04123553189891431, + "acc_norm": 0.7766990291262136, + "acc_norm_stderr": 0.04123553189891431 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.8717948717948718, + "acc_stderr": 0.02190190511507333, + "acc_norm": 0.8717948717948718, + "acc_norm_stderr": 0.02190190511507333 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.73, + "acc_stderr": 0.044619604333847394, + "acc_norm": 0.73, + "acc_norm_stderr": 0.044619604333847394 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.8288633461047255, + "acc_stderr": 0.013468201614066304, + "acc_norm": 0.8288633461047255, + "acc_norm_stderr": 0.013468201614066304 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.7630057803468208, + "acc_stderr": 
0.02289408248992599, + "acc_norm": 0.7630057803468208, + "acc_norm_stderr": 0.02289408248992599 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.4134078212290503, + "acc_stderr": 0.016469814928406167, + "acc_norm": 0.4134078212290503, + "acc_norm_stderr": 0.016469814928406167 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.7254901960784313, + "acc_stderr": 0.025553169991826524, + "acc_norm": 0.7254901960784313, + "acc_norm_stderr": 0.025553169991826524 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.7106109324758842, + "acc_stderr": 0.025755865922632945, + "acc_norm": 0.7106109324758842, + "acc_norm_stderr": 0.025755865922632945 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.7530864197530864, + "acc_stderr": 0.02399350170904211, + "acc_norm": 0.7530864197530864, + "acc_norm_stderr": 0.02399350170904211 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.4787234042553192, + "acc_stderr": 0.029800481645628693, + "acc_norm": 0.4787234042553192, + "acc_norm_stderr": 0.029800481645628693 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.4589308996088657, + "acc_stderr": 0.012727084826799798, + "acc_norm": 0.4589308996088657, + "acc_norm_stderr": 0.012727084826799798 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.6764705882352942, + "acc_stderr": 0.02841820861940676, + "acc_norm": 0.6764705882352942, + "acc_norm_stderr": 0.02841820861940676 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.6813725490196079, + "acc_stderr": 0.01885008469646872, + "acc_norm": 0.6813725490196079, + "acc_norm_stderr": 0.01885008469646872 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.6818181818181818, + "acc_stderr": 0.04461272175910509, + "acc_norm": 0.6818181818181818, + "acc_norm_stderr": 0.04461272175910509 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.7346938775510204, + "acc_stderr": 0.0282638899437846, + "acc_norm": 0.7346938775510204, + "acc_norm_stderr": 0.0282638899437846 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.8557213930348259, + "acc_stderr": 0.02484575321230604, + "acc_norm": 0.8557213930348259, + "acc_norm_stderr": 0.02484575321230604 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.83, + "acc_stderr": 0.0377525168068637, + "acc_norm": 0.83, + "acc_norm_stderr": 0.0377525168068637 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.5421686746987951, + "acc_stderr": 0.0387862677100236, + "acc_norm": 0.5421686746987951, + "acc_norm_stderr": 0.0387862677100236 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.8245614035087719, + "acc_stderr": 0.029170885500727665, + "acc_norm": 0.8245614035087719, + "acc_norm_stderr": 0.029170885500727665 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.4565483476132191, + "mc1_stderr": 0.01743728095318369, + "mc2": 0.6089064094277691, + "mc2_stderr": 0.015171237128105348 + }, + "harness|winogrande|5": { + "acc": 0.8105761641673244, + "acc_stderr": 0.011012790432989247 + }, + "harness|gsm8k|5": { + "acc": 0.7172100075815011, + "acc_stderr": 0.012405020417873615 + }, + "all": { + "acc": 0.6514955299787132, + "acc_stderr": 0.03213385246233697, + "acc_norm": 0.6513895367503504, + "acc_norm_stderr": 0.03279897140409662, + "mc1": 0.4565483476132191, + "mc1_stderr": 0.01743728095318369, + "mc2": 0.6089064094277691, + "mc2_stderr": 0.015171237128105348 + } + }, + "versions": { + "all": 0, + "harness|arc:challenge|25": 0, + "harness|gsm8k|5": 0, + "harness|hellaswag|10": 0, + 
"harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "harness|winogrande|5": 0 + }, + "config_tasks": { + "harness|arc:challenge": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM 
Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|arc:challenge|25": { + "hashes": { + "hash_examples": "17b0cae357c0259e", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "9bcd0d1d37471713", + "hash_cont_tokens": "289aa98c400841d8" + }, + "truncated": 0, + "non_truncated": 1172, + 
"padded": 4670, + "non_padded": 17, + "effective_few_shots": 25.0, + "num_truncated_few_shots": 0 + }, + "harness|hellaswag|10": { + "hashes": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "80b8c6d79740318e", + "hash_cont_tokens": "ac460260c3e6efc9" + }, + "truncated": 0, + "non_truncated": 10042, + "padded": 40101, + "non_padded": 67, + "effective_few_shots": 10.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hashes": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "b813d36287c6556c", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-anatomy|5": { + "hashes": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "09dc2380497f7a47", + "hash_cont_tokens": "a52a4f60d98cbe5c" + }, + "truncated": 0, + "non_truncated": 135, + "padded": 540, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-astronomy|5": { + "hashes": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "68ca3220b0fdd1f3", + "hash_cont_tokens": "10f7d8eeba97841d" + }, + "truncated": 0, + "non_truncated": 152, + "padded": 608, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-business_ethics|5": { + "hashes": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "bd14ef1320de241e", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hashes": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "d96186ab98017c43", + "hash_cont_tokens": "edef9975ba9165b5" + }, + "truncated": 0, + "non_truncated": 265, + "padded": 1060, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_biology|5": { + "hashes": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "424136b34e95b200", + "hash_cont_tokens": "0aa103ec6602280b" + }, + "truncated": 0, + "non_truncated": 144, + "padded": 576, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_chemistry|5": { + "hashes": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "8dd8b80e336bbe54", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_computer_science|5": { + "hashes": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "145d4cef8ca2261d", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_mathematics|5": { + "hashes": { + 
"hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "561995d32d2b25c4", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_medicine|5": { + "hashes": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "6a258a9d4418599c", + "hash_cont_tokens": "1979021dbc698754" + }, + "truncated": 0, + "non_truncated": 173, + "padded": 692, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_physics|5": { + "hashes": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "fa5e0d5b5f97b66a", + "hash_cont_tokens": "7cf7fe2bab00acbd" + }, + "truncated": 0, + "non_truncated": 102, + "padded": 408, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-computer_security|5": { + "hashes": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "07d27397edfae492", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hashes": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "da5e6c3c8eb17da6", + "hash_cont_tokens": "903f64eed2b0d217" + }, + "truncated": 0, + "non_truncated": 235, + "padded": 940, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-econometrics|5": { + "hashes": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "f6ba8e358bdb523e", + "hash_cont_tokens": "721ae6c5302c4bf2" + }, + "truncated": 0, + "non_truncated": 114, + "padded": 456, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hashes": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "b2459da4c5ca8590", + "hash_cont_tokens": "15a738960ed3e587" + }, + "truncated": 0, + "non_truncated": 145, + "padded": 575, + "non_padded": 5, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hashes": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "0b969d9ad706a13a", + "hash_cont_tokens": "c96470462fc71683" + }, + "truncated": 0, + "non_truncated": 378, + "padded": 1512, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-formal_logic|5": { + "hashes": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "02bc3eb5f90da86e", + "hash_cont_tokens": "0e1ce025c9d6ee7e" + }, + "truncated": 0, + "non_truncated": 126, + "padded": 504, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-global_facts|5": { + "hashes": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "3d5106918bcbeb43", + "hash_cont_tokens": 
"17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_biology|5": { + "hashes": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "7b089392db2dabbd", + "hash_cont_tokens": "e34d57f7d3c4ca16" + }, + "truncated": 0, + "non_truncated": 310, + "padded": 1240, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hashes": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "ba90b2ffed1c067d", + "hash_cont_tokens": "e8482d44df4b3740" + }, + "truncated": 0, + "non_truncated": 203, + "padded": 812, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hashes": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "60eeec309ef0717f", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hashes": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "5e5e8bf3808e0ead", + "hash_cont_tokens": "d63e679a49418339" + }, + "truncated": 0, + "non_truncated": 165, + "padded": 656, + "non_padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_geography|5": { + "hashes": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "4da9b741d4e7ea78", + "hash_cont_tokens": "d78483e286d06f1a" + }, + "truncated": 0, + "non_truncated": 198, + "padded": 792, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hashes": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "acb4bc872ac86ed7", + "hash_cont_tokens": "691cdff71ff5fe57" + }, + "truncated": 0, + "non_truncated": 193, + "padded": 772, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hashes": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "840fc6403eb69ab0", + "hash_cont_tokens": "d5ad4c5bdca967ad" + }, + "truncated": 0, + "non_truncated": 390, + "padded": 1560, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hashes": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "3629a7f2cd17faeb", + "hash_cont_tokens": "8f631ca5687dd0d4" + }, + "truncated": 0, + "non_truncated": 270, + "padded": 1080, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hashes": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "6846f684260e3997", + "hash_cont_tokens": "7321048a28451473" + }, + "truncated": 0, + "non_truncated": 238, + 
"padded": 952, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_physics|5": { + "hashes": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "85aee25d6bdad94a", + "hash_cont_tokens": "bb137581f269861c" + }, + "truncated": 0, + "non_truncated": 151, + "padded": 604, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hashes": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "290b66d6d666a35f", + "hash_cont_tokens": "b455cab2675bd863" + }, + "truncated": 0, + "non_truncated": 545, + "padded": 2180, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hashes": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "a77a7668b437bc82", + "hash_cont_tokens": "1b3196fec7e58037" + }, + "truncated": 0, + "non_truncated": 216, + "padded": 864, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hashes": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "63548c7fa9ba7a78", + "hash_cont_tokens": "a331dedc2aa01b3e" + }, + "truncated": 0, + "non_truncated": 204, + "padded": 816, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hashes": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "83c5da18bfa50812", + "hash_cont_tokens": "d0fbe030b8c8c2bf" + }, + "truncated": 0, + "non_truncated": 237, + "padded": 948, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_aging|5": { + "hashes": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "bebbd11f22006685", + "hash_cont_tokens": "1dd29c3755494850" + }, + "truncated": 0, + "non_truncated": 223, + "padded": 892, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_sexuality|5": { + "hashes": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "7b85ee9b8ee54f4f", + "hash_cont_tokens": "c85573f663c10691" + }, + "truncated": 0, + "non_truncated": 131, + "padded": 524, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-international_law|5": { + "hashes": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "7bfc55ab7065943e", + "hash_cont_tokens": "d263804ba918154f" + }, + "truncated": 0, + "non_truncated": 121, + "padded": 484, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-jurisprudence|5": { + "hashes": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "69573f1675e053c6", + "hash_cont_tokens": "581986691a84ece8" + }, + "truncated": 0, + "non_truncated": 108, + "padded": 432, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + 
"harness|hendrycksTest-logical_fallacies|5": { + "hashes": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "552324ef20094bdc", + "hash_cont_tokens": "55a858b28bbda458" + }, + "truncated": 0, + "non_truncated": 163, + "padded": 652, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-machine_learning|5": { + "hashes": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "96449357a7318905", + "hash_cont_tokens": "e99d3d3efd4ac7a3" + }, + "truncated": 0, + "non_truncated": 112, + "padded": 448, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-management|5": { + "hashes": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "3b849249168e3b88", + "hash_cont_tokens": "13d9dc56bca34726" + }, + "truncated": 0, + "non_truncated": 103, + "padded": 412, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-marketing|5": { + "hashes": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "af0e186f2756b70d", + "hash_cont_tokens": "2700ea26933916a2" + }, + "truncated": 0, + "non_truncated": 234, + "padded": 936, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-medical_genetics|5": { + "hashes": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "9f6a6de16509b6d9", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-miscellaneous|5": { + "hashes": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "9194406d589f7c10", + "hash_cont_tokens": "7bf4341c79587250" + }, + "truncated": 0, + "non_truncated": 783, + "padded": 3132, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_disputes|5": { + "hashes": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "769486efc74d9f8e", + "hash_cont_tokens": "38a48e9de6976f00" + }, + "truncated": 0, + "non_truncated": 346, + "padded": 1384, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hashes": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "a90fd4dd90959dad", + "hash_cont_tokens": "761c4dc187689d89" + }, + "truncated": 0, + "non_truncated": 895, + "padded": 3580, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-nutrition|5": { + "hashes": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "1a3b843e66efd29b", + "hash_cont_tokens": "65005bd7d6f6012a" + }, + "truncated": 0, + "non_truncated": 306, + "padded": 1224, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-philosophy|5": { + "hashes": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "09820001a3d00013", + 
"hash_cont_tokens": "0b47934fb6314dec" + }, + "truncated": 0, + "non_truncated": 311, + "padded": 1244, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-prehistory|5": { + "hashes": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "7c4ec364ce2768c7", + "hash_cont_tokens": "3f20acd855ee0a29" + }, + "truncated": 0, + "non_truncated": 324, + "padded": 1296, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_accounting|5": { + "hashes": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "ced0534574d0ae3f", + "hash_cont_tokens": "8f122ba881355d4b" + }, + "truncated": 0, + "non_truncated": 282, + "padded": 1128, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_law|5": { + "hashes": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "bcbdbbde22ec73e3", + "hash_cont_tokens": "90d5df417c4d3fd3" + }, + "truncated": 0, + "non_truncated": 1534, + "padded": 6136, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_medicine|5": { + "hashes": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "c54d753563114d45", + "hash_cont_tokens": "4a2d2988884f7f70" + }, + "truncated": 0, + "non_truncated": 272, + "padded": 1088, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_psychology|5": { + "hashes": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "b75dc55c0e32fa52", + "hash_cont_tokens": "e0a952cb8a9c81de" + }, + "truncated": 0, + "non_truncated": 612, + "padded": 2448, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-public_relations|5": { + "hashes": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "5ccdc8ec8db99622", + "hash_cont_tokens": "1fa77a8dff3922b8" + }, + "truncated": 0, + "non_truncated": 110, + "padded": 440, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-security_studies|5": { + "hashes": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "ca8497342e5b1d57", + "hash_cont_tokens": "81fc9cb3cbdd52db" + }, + "truncated": 0, + "non_truncated": 245, + "padded": 980, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-sociology|5": { + "hashes": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "069c76424fbd3dab", + "hash_cont_tokens": "2a0493252ed2cf43" + }, + "truncated": 0, + "non_truncated": 201, + "padded": 804, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hashes": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "a7e393a626169576", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 
5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-virology|5": { + "hashes": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "bf99dc973e3a650d", + "hash_cont_tokens": "5ab892d003b00c98" + }, + "truncated": 0, + "non_truncated": 166, + "padded": 664, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-world_religions|5": { + "hashes": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "1761cfaf21797065", + "hash_cont_tokens": "15a5e5dbdfbb8568" + }, + "truncated": 0, + "non_truncated": 171, + "padded": 684, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|truthfulqa:mc|0": { + "hashes": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "298b43914bbdf4ca", + "hash_cont_tokens": "5a8d4bb398b1c3c0" + }, + "truncated": 0, + "non_truncated": 817, + "padded": 9996, + "non_padded": 0, + "effective_few_shots": 0.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "31aa3477d959f771", + "hash_cont_tokens": "618558fb93c0f288" + }, + "truncated": 0, + "non_truncated": 1267, + "padded": 2534, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "6af0ae8cfe684f50", + "hash_cont_tokens": "e6d4c941baa2d717" + }, + "truncated": 0, + "non_truncated": 1319, + "padded": 0, + "non_padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "3b7fa57a057f9415", + "hash_full_prompts": "63615fc50fc9417c", + "hash_input_tokens": "9c04e828ae29cacc", + "hash_cont_tokens": "222572afe6c14007" + }, + "truncated": 0, + "non_truncated": 28659, + "padded": 113460, + "non_padded": 1412, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/mistralai/Mistral-7B-Instruct-v0.1/results_2023-10-10T06-38-48.353025.json b/eval-results/mistralai/Mistral-7B-Instruct-v0.1/results_2023-10-10T06-38-48.353025.json new file mode 100644 index 0000000000000000000000000000000000000000..9ed4a187dd0146c15b69d1430b0ff76c1a74abd7 --- /dev/null +++ b/eval-results/mistralai/Mistral-7B-Instruct-v0.1/results_2023-10-10T06-38-48.353025.json @@ -0,0 +1,1367 @@ +{ + "config_general": { + "model_name": "mistralai/Mistral-7B-Instruct-v0.1", + "model_sha": "7961f5aa9b736bf8e364b2e6f201190f97a27931", + "model_size": "13.99 GB", + "model_dtype": "torch.float16", + "lighteval_sha": "0f318ecf002208468154899217b3ba7c6ae09374", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|arc:challenge|25": { + "acc": 0.523037542662116, + "acc_stderr": 0.014595873205358269, + "acc_norm": 0.5452218430034129, + "acc_norm_stderr": 0.014551507060836357 + }, + "harness|hellaswag|10": { + "acc": 0.5694084843656642, + "acc_stderr": 0.004941470620074867, + "acc_norm": 0.7563234415455089, + "acc_norm_stderr": 0.0042842240337755385 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.34, + "acc_stderr": 0.047609522856952365, + "acc_norm": 0.34, + "acc_norm_stderr": 
0.047609522856952365 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.4222222222222222, + "acc_stderr": 0.04266763404099582, + "acc_norm": 0.4222222222222222, + "acc_norm_stderr": 0.04266763404099582 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.5657894736842105, + "acc_stderr": 0.040335656678483205, + "acc_norm": 0.5657894736842105, + "acc_norm_stderr": 0.040335656678483205 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.52, + "acc_stderr": 0.05021167315686779, + "acc_norm": 0.52, + "acc_norm_stderr": 0.05021167315686779 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.5849056603773585, + "acc_stderr": 0.030325945789286105, + "acc_norm": 0.5849056603773585, + "acc_norm_stderr": 0.030325945789286105 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.5902777777777778, + "acc_stderr": 0.04112490974670787, + "acc_norm": 0.5902777777777778, + "acc_norm_stderr": 0.04112490974670787 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.37, + "acc_stderr": 0.048523658709391, + "acc_norm": 0.37, + "acc_norm_stderr": 0.048523658709391 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.53, + "acc_stderr": 0.050161355804659205, + "acc_norm": 0.53, + "acc_norm_stderr": 0.050161355804659205 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.36, + "acc_stderr": 0.048241815132442176, + "acc_norm": 0.36, + "acc_norm_stderr": 0.048241815132442176 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.5202312138728323, + "acc_stderr": 0.03809342081273956, + "acc_norm": 0.5202312138728323, + "acc_norm_stderr": 0.03809342081273956 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.30392156862745096, + "acc_stderr": 0.045766654032077636, + "acc_norm": 0.30392156862745096, + "acc_norm_stderr": 0.045766654032077636 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.67, + "acc_stderr": 0.047258156262526094, + "acc_norm": 0.67, + "acc_norm_stderr": 0.047258156262526094 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.4808510638297872, + "acc_stderr": 0.032662042990646775, + "acc_norm": 0.4808510638297872, + "acc_norm_stderr": 0.032662042990646775 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.37719298245614036, + "acc_stderr": 0.04559522141958216, + "acc_norm": 0.37719298245614036, + "acc_norm_stderr": 0.04559522141958216 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.5517241379310345, + "acc_stderr": 0.04144311810878151, + "acc_norm": 0.5517241379310345, + "acc_norm_stderr": 0.04144311810878151 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.36507936507936506, + "acc_stderr": 0.024796060602699947, + "acc_norm": 0.36507936507936506, + "acc_norm_stderr": 0.024796060602699947 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.38095238095238093, + "acc_stderr": 0.043435254289490965, + "acc_norm": 0.38095238095238093, + "acc_norm_stderr": 0.043435254289490965 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.33, + "acc_stderr": 0.047258156262526045, + "acc_norm": 0.33, + "acc_norm_stderr": 0.047258156262526045 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.6419354838709678, + "acc_stderr": 0.027273890594300642, + "acc_norm": 0.6419354838709678, + "acc_norm_stderr": 0.027273890594300642 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.41379310344827586, + "acc_stderr": 0.03465304488406795, + "acc_norm": 0.41379310344827586, + "acc_norm_stderr": 
0.03465304488406795 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.52, + "acc_stderr": 0.050211673156867795, + "acc_norm": 0.52, + "acc_norm_stderr": 0.050211673156867795 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.6666666666666666, + "acc_stderr": 0.036810508691615486, + "acc_norm": 0.6666666666666666, + "acc_norm_stderr": 0.036810508691615486 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.7222222222222222, + "acc_stderr": 0.031911782267135466, + "acc_norm": 0.7222222222222222, + "acc_norm_stderr": 0.031911782267135466 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.7253886010362695, + "acc_stderr": 0.03221024508041154, + "acc_norm": 0.7253886010362695, + "acc_norm_stderr": 0.03221024508041154 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.5205128205128206, + "acc_stderr": 0.02532966316348994, + "acc_norm": 0.5205128205128206, + "acc_norm_stderr": 0.02532966316348994 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.3, + "acc_stderr": 0.027940457136228416, + "acc_norm": 0.3, + "acc_norm_stderr": 0.027940457136228416 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.5462184873949579, + "acc_stderr": 0.032339434681820885, + "acc_norm": 0.5462184873949579, + "acc_norm_stderr": 0.032339434681820885 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.32450331125827814, + "acc_stderr": 0.03822746937658753, + "acc_norm": 0.32450331125827814, + "acc_norm_stderr": 0.03822746937658753 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.710091743119266, + "acc_stderr": 0.019453066609201597, + "acc_norm": 0.710091743119266, + "acc_norm_stderr": 0.019453066609201597 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.4537037037037037, + "acc_stderr": 0.03395322726375797, + "acc_norm": 0.4537037037037037, + "acc_norm_stderr": 0.03395322726375797 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.7058823529411765, + "acc_stderr": 0.03198001660115072, + "acc_norm": 0.7058823529411765, + "acc_norm_stderr": 0.03198001660115072 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.6962025316455697, + "acc_stderr": 0.0299366963871386, + "acc_norm": 0.6962025316455697, + "acc_norm_stderr": 0.0299366963871386 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.6547085201793722, + "acc_stderr": 0.03191100192835794, + "acc_norm": 0.6547085201793722, + "acc_norm_stderr": 0.03191100192835794 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.6717557251908397, + "acc_stderr": 0.04118438565806299, + "acc_norm": 0.6717557251908397, + "acc_norm_stderr": 0.04118438565806299 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.6776859504132231, + "acc_stderr": 0.042664163633521685, + "acc_norm": 0.6776859504132231, + "acc_norm_stderr": 0.042664163633521685 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.6944444444444444, + "acc_stderr": 0.044531975073749834, + "acc_norm": 0.6944444444444444, + "acc_norm_stderr": 0.044531975073749834 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.6503067484662577, + "acc_stderr": 0.03746668325470021, + "acc_norm": 0.6503067484662577, + "acc_norm_stderr": 0.03746668325470021 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.4732142857142857, + "acc_stderr": 0.047389751192741546, + "acc_norm": 0.4732142857142857, + "acc_norm_stderr": 
0.047389751192741546 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.6893203883495146, + "acc_stderr": 0.04582124160161551, + "acc_norm": 0.6893203883495146, + "acc_norm_stderr": 0.04582124160161551 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.8418803418803419, + "acc_stderr": 0.023902325549560392, + "acc_norm": 0.8418803418803419, + "acc_norm_stderr": 0.023902325549560392 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.67, + "acc_stderr": 0.047258156262526094, + "acc_norm": 0.67, + "acc_norm_stderr": 0.047258156262526094 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.7432950191570882, + "acc_stderr": 0.015620480263064533, + "acc_norm": 0.7432950191570882, + "acc_norm_stderr": 0.015620480263064533 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.5895953757225434, + "acc_stderr": 0.026483392042098174, + "acc_norm": 0.5895953757225434, + "acc_norm_stderr": 0.026483392042098174 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.2446927374301676, + "acc_stderr": 0.014378169884098417, + "acc_norm": 0.2446927374301676, + "acc_norm_stderr": 0.014378169884098417 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.6143790849673203, + "acc_stderr": 0.02787074527829027, + "acc_norm": 0.6143790849673203, + "acc_norm_stderr": 0.02787074527829027 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.6077170418006431, + "acc_stderr": 0.027731258647012, + "acc_norm": 0.6077170418006431, + "acc_norm_stderr": 0.027731258647012 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.5771604938271605, + "acc_stderr": 0.027487472980871595, + "acc_norm": 0.5771604938271605, + "acc_norm_stderr": 0.027487472980871595 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.3829787234042553, + "acc_stderr": 0.028999080904806185, + "acc_norm": 0.3829787234042553, + "acc_norm_stderr": 0.028999080904806185 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.40091264667535853, + "acc_stderr": 0.012516960350640824, + "acc_norm": 0.40091264667535853, + "acc_norm_stderr": 0.012516960350640824 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.5808823529411765, + "acc_stderr": 0.02997280717046462, + "acc_norm": 0.5808823529411765, + "acc_norm_stderr": 0.02997280717046462 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.5163398692810458, + "acc_stderr": 0.02021703065318646, + "acc_norm": 0.5163398692810458, + "acc_norm_stderr": 0.02021703065318646 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.6636363636363637, + "acc_stderr": 0.04525393596302505, + "acc_norm": 0.6636363636363637, + "acc_norm_stderr": 0.04525393596302505 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.6775510204081633, + "acc_stderr": 0.02992310056368391, + "acc_norm": 0.6775510204081633, + "acc_norm_stderr": 0.02992310056368391 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.746268656716418, + "acc_stderr": 0.03076944496729602, + "acc_norm": 0.746268656716418, + "acc_norm_stderr": 0.03076944496729602 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.79, + "acc_stderr": 0.040936018074033256, + "acc_norm": 0.79, + "acc_norm_stderr": 0.040936018074033256 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.46987951807228917, + "acc_stderr": 0.03885425420866766, + "acc_norm": 0.46987951807228917, + "acc_norm_stderr": 0.03885425420866766 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.7251461988304093, + "acc_stderr": 
0.03424042924691584, + "acc_norm": 0.7251461988304093, + "acc_norm_stderr": 0.03424042924691584 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.3953488372093023, + "mc1_stderr": 0.017115815632418194, + "mc2": 0.5628382292113293, + "mc2_stderr": 0.015351892312006444 + }, + "all": { + "acc": 0.5534994306638509, + "acc_stderr": 0.03475700070795008, + "acc_norm": 0.5570434858760736, + "acc_norm_stderr": 0.03474510896674971, + "mc1": 0.3953488372093023, + "mc1_stderr": 0.017115815632418194, + "mc2": 0.5628382292113293, + "mc2_stderr": 0.015351892312006444 + } + }, + "versions": { + "harness|arc:challenge|25": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "all": 0 + }, 
+ "config_tasks": { + "harness|arc:challenge": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + 
"harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task" + }, + "summary_tasks": { + "harness|arc:challenge|25": { + "hashes": { + "hash_examples": "17b0cae357c0259e", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "e43adcaa871b1364", + "hash_cont_tokens": "289aa98c400841d8" + }, + "truncated": 0, + "non-truncated": 4687, + "padded": 4684, + "non-padded": 3, + "effective_few_shots": 25.0, + "num_truncated_few_shots": 0 + }, + "harness|hellaswag|10": { + "hashes": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "08da6b3d0798f3e5", + "hash_cont_tokens": "ac460260c3e6efc9" + }, + "truncated": 0, + "non-truncated": 40168, + "padded": 40039, + "non-padded": 129, + "effective_few_shots": 10.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hashes": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "5e2b26eb9b4d08bf", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-anatomy|5": { + "hashes": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "d33cda9df28030eb", + "hash_cont_tokens": "a52a4f60d98cbe5c" + }, + "truncated": 0, + "non-truncated": 540, + "padded": 540, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-astronomy|5": { + "hashes": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "0dd50c500d64c57d", + "hash_cont_tokens": "10f7d8eeba97841d" + }, + "truncated": 0, + "non-truncated": 608, + "padded": 608, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-business_ethics|5": { + "hashes": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "40b524d0df3defc2", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hashes": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "1f87d12d677e0dfd", + "hash_cont_tokens": "edef9975ba9165b5" + }, + "truncated": 0, + "non-truncated": 1060, + "padded": 1056, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_biology|5": { + "hashes": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "dd6d69d8b13afbeb", + "hash_cont_tokens": "0aa103ec6602280b" + }, + "truncated": 0, + "non-truncated": 576, + "padded": 572, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_chemistry|5": { + "hashes": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "d45f3c401a00e97e", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non-truncated": 
400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_computer_science|5": { + "hashes": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "c04f21d954ae67b2", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_mathematics|5": { + "hashes": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "e7de03b4e1a407d8", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_medicine|5": { + "hashes": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "9ce9516475f0b09c", + "hash_cont_tokens": "1979021dbc698754" + }, + "truncated": 0, + "non-truncated": 692, + "padded": 684, + "non-padded": 8, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_physics|5": { + "hashes": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "f749592a0d6c967d", + "hash_cont_tokens": "7cf7fe2bab00acbd" + }, + "truncated": 0, + "non-truncated": 408, + "padded": 404, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-computer_security|5": { + "hashes": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "1a6dccf2066f3598", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hashes": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "6ce98c8aec8e7514", + "hash_cont_tokens": "903f64eed2b0d217" + }, + "truncated": 0, + "non-truncated": 940, + "padded": 940, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-econometrics|5": { + "hashes": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "7794b03bf6b9bb11", + "hash_cont_tokens": "721ae6c5302c4bf2" + }, + "truncated": 0, + "non-truncated": 456, + "padded": 456, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hashes": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "e47ff85e05850517", + "hash_cont_tokens": "15a738960ed3e587" + }, + "truncated": 0, + "non-truncated": 580, + "padded": 580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hashes": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "2ce6901704311790", + "hash_cont_tokens": "c96470462fc71683" + }, + "truncated": 0, + "non-truncated": 1512, + "padded": 1512, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + 
"harness|hendrycksTest-formal_logic|5": { + "hashes": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "fa49c3faa72a3955", + "hash_cont_tokens": "0e1ce025c9d6ee7e" + }, + "truncated": 0, + "non-truncated": 504, + "padded": 504, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-global_facts|5": { + "hashes": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "38992a391c7040d5", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 396, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_biology|5": { + "hashes": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "4944fad6e0578120", + "hash_cont_tokens": "e34d57f7d3c4ca16" + }, + "truncated": 0, + "non-truncated": 1240, + "padded": 1240, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hashes": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "bec955dfccee0331", + "hash_cont_tokens": "e8482d44df4b3740" + }, + "truncated": 0, + "non-truncated": 812, + "padded": 796, + "non-padded": 16, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hashes": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "2ccfe020e0a8e824", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hashes": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "5e5e8bf3808e0ead", + "hash_cont_tokens": "d63e679a49418339" + }, + "truncated": 0, + "non-truncated": 660, + "padded": 656, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_geography|5": { + "hashes": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "6a624d76e1b40f9d", + "hash_cont_tokens": "d78483e286d06f1a" + }, + "truncated": 0, + "non-truncated": 792, + "padded": 792, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hashes": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "8340aed0285230f4", + "hash_cont_tokens": "691cdff71ff5fe57" + }, + "truncated": 0, + "non-truncated": 772, + "padded": 772, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hashes": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "ca47137b1f3a769c", + "hash_cont_tokens": "d5ad4c5bdca967ad" + }, + "truncated": 0, + "non-truncated": 1560, + "padded": 1560, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hashes": { + "hash_examples": 
"1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "c9d341ab62890f30", + "hash_cont_tokens": "8f631ca5687dd0d4" + }, + "truncated": 0, + "non-truncated": 1080, + "padded": 1080, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hashes": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "62573d06618ae7df", + "hash_cont_tokens": "7321048a28451473" + }, + "truncated": 0, + "non-truncated": 952, + "padded": 952, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_physics|5": { + "hashes": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "ddddcaae96263221", + "hash_cont_tokens": "bb137581f269861c" + }, + "truncated": 0, + "non-truncated": 604, + "padded": 604, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hashes": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "ef9c1ae343139fdd", + "hash_cont_tokens": "b455cab2675bd863" + }, + "truncated": 0, + "non-truncated": 2180, + "padded": 2161, + "non-padded": 19, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hashes": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "eb4abd87b0e863cc", + "hash_cont_tokens": "1b3196fec7e58037" + }, + "truncated": 0, + "non-truncated": 864, + "padded": 864, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hashes": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "63548c7fa9ba7a78", + "hash_cont_tokens": "a331dedc2aa01b3e" + }, + "truncated": 0, + "non-truncated": 816, + "padded": 816, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hashes": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "83c5da18bfa50812", + "hash_cont_tokens": "d0fbe030b8c8c2bf" + }, + "truncated": 0, + "non-truncated": 948, + "padded": 948, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_aging|5": { + "hashes": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "c93c778cb8c58a32", + "hash_cont_tokens": "1dd29c3755494850" + }, + "truncated": 0, + "non-truncated": 892, + "padded": 892, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_sexuality|5": { + "hashes": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "1daed91f54b42f7d", + "hash_cont_tokens": "c85573f663c10691" + }, + "truncated": 0, + "non-truncated": 524, + "padded": 524, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-international_law|5": { + "hashes": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "cfdae69f75ee8670", + 
"hash_cont_tokens": "d263804ba918154f" + }, + "truncated": 0, + "non-truncated": 484, + "padded": 484, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-jurisprudence|5": { + "hashes": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "173979adbb5ab44e", + "hash_cont_tokens": "581986691a84ece8" + }, + "truncated": 0, + "non-truncated": 432, + "padded": 432, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hashes": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "7b7d06271aff55ff", + "hash_cont_tokens": "55a858b28bbda458" + }, + "truncated": 0, + "non-truncated": 652, + "padded": 652, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-machine_learning|5": { + "hashes": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "ca062cfd7c7fddcb", + "hash_cont_tokens": "e99d3d3efd4ac7a3" + }, + "truncated": 0, + "non-truncated": 448, + "padded": 445, + "non-padded": 3, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-management|5": { + "hashes": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "fc47171ffb714da3", + "hash_cont_tokens": "13d9dc56bca34726" + }, + "truncated": 0, + "non-truncated": 412, + "padded": 412, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-marketing|5": { + "hashes": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "aa29e9d883670c8f", + "hash_cont_tokens": "2700ea26933916a2" + }, + "truncated": 0, + "non-truncated": 936, + "padded": 936, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-medical_genetics|5": { + "hashes": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "88ad044b653ecaa5", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-miscellaneous|5": { + "hashes": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "f9e7e01573277484", + "hash_cont_tokens": "7bf4341c79587250" + }, + "truncated": 0, + "non-truncated": 3132, + "padded": 3132, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_disputes|5": { + "hashes": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "03728b9e48594c28", + "hash_cont_tokens": "38a48e9de6976f00" + }, + "truncated": 0, + "non-truncated": 1384, + "padded": 1360, + "non-padded": 24, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hashes": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "04a903966514d177", + "hash_cont_tokens": "761c4dc187689d89" + }, + "truncated": 0, + "non-truncated": 3580, + "padded": 3580, + "non-padded": 0, + "effective_few_shots": 5.0, + 
"num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-nutrition|5": { + "hashes": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "a2176d3ac6f01cf0", + "hash_cont_tokens": "65005bd7d6f6012a" + }, + "truncated": 0, + "non-truncated": 1224, + "padded": 1224, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-philosophy|5": { + "hashes": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "a96dc872948245a8", + "hash_cont_tokens": "0b47934fb6314dec" + }, + "truncated": 0, + "non-truncated": 1244, + "padded": 1244, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-prehistory|5": { + "hashes": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "e0b03637947e9efa", + "hash_cont_tokens": "3f20acd855ee0a29" + }, + "truncated": 0, + "non-truncated": 1296, + "padded": 1296, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_accounting|5": { + "hashes": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "0b4c6d0e49c47ab4", + "hash_cont_tokens": "8f122ba881355d4b" + }, + "truncated": 0, + "non-truncated": 1128, + "padded": 1128, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_law|5": { + "hashes": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "bcbdbbde22ec73e3", + "hash_cont_tokens": "90d5df417c4d3fd3" + }, + "truncated": 0, + "non-truncated": 6136, + "padded": 6136, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_medicine|5": { + "hashes": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "c54d753563114d45", + "hash_cont_tokens": "4a2d2988884f7f70" + }, + "truncated": 0, + "non-truncated": 1088, + "padded": 1088, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_psychology|5": { + "hashes": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "9e6e34f48034edc0", + "hash_cont_tokens": "e0a952cb8a9c81de" + }, + "truncated": 0, + "non-truncated": 2448, + "padded": 2448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-public_relations|5": { + "hashes": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "634feb3f97d1064d", + "hash_cont_tokens": "1fa77a8dff3922b8" + }, + "truncated": 0, + "non-truncated": 440, + "padded": 440, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-security_studies|5": { + "hashes": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "ca8497342e5b1d57", + "hash_cont_tokens": "81fc9cb3cbdd52db" + }, + "truncated": 0, + "non-truncated": 980, + "padded": 980, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-sociology|5": { + "hashes": { + "hash_examples": "e7959df87dea8672", + 
"hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "ae361375c940a0fb", + "hash_cont_tokens": "2a0493252ed2cf43" + }, + "truncated": 0, + "non-truncated": 804, + "padded": 800, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hashes": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "e8bdf33cf82d89f5", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-virology|5": { + "hashes": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "32ce831e0ba2d2e2", + "hash_cont_tokens": "5ab892d003b00c98" + }, + "truncated": 0, + "non-truncated": 664, + "padded": 664, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-world_religions|5": { + "hashes": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "4ed9b68c5694211b", + "hash_cont_tokens": "15a5e5dbdfbb8568" + }, + "truncated": 0, + "non-truncated": 684, + "padded": 684, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|truthfulqa:mc|0": { + "hashes": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "a30fbd9af05d717a", + "hash_cont_tokens": "5a8d4bb398b1c3c0" + }, + "truncated": 0, + "non-truncated": 9996, + "padded": 9996, + "non-padded": 0, + "effective_few_shots": 0.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "d84d18e9a963753d", + "hash_full_prompts": "12b540783521a8e6", + "hash_input_tokens": "3d86ffeb7677bd9d", + "hash_cont_tokens": "35527140510ee91a" + }, + "total_evaluation_time_secondes": "4118.244748592377", + "truncated": 0, + "non-truncated": 111019, + "padded": 110793, + "non-padded": 226, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/mistralai/Mistral-7B-Instruct-v0.1/results_2023-10-24T09-43-48.997990.json b/eval-results/mistralai/Mistral-7B-Instruct-v0.1/results_2023-10-24T09-43-48.997990.json new file mode 100644 index 0000000000000000000000000000000000000000..628050d7e2bc08df0227d8733a4b4ed0dee3a2a5 --- /dev/null +++ b/eval-results/mistralai/Mistral-7B-Instruct-v0.1/results_2023-10-24T09-43-48.997990.json @@ -0,0 +1,107 @@ +{ + "config_general": { + "model_name": "mistralai/Mistral-7B-Instruct-v0.1", + "model_sha": "7ad5799710574ba1c1d953eba3077af582f3a773", + "model_size": "13.99 GB", + "model_dtype": "torch.float16", + "lighteval_sha": "0f318ecf002208468154899217b3ba7c6ae09374", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|drop|3": { + "em": 0.37038590604026844, + "em_stderr": 0.00494543044549648, + "f1": 0.43100566275167973, + "f1_stderr": 0.00478990485809286 + }, + "harness|gsm8k|5": { + "acc": 0.1425322213798332, + "acc_stderr": 0.009629588445673814 + }, + "harness|winogrande|5": { + "acc": 0.7371744277821626, + "acc_stderr": 0.012370922527262006 + }, + "all": { + "em": 0.37038590604026844, + "em_stderr": 0.00494543044549648, + "f1": 0.43100566275167973, + "f1_stderr": 0.00478990485809286, + "acc": 0.4398533245809979, + "acc_stderr": 
0.01100025548646791 + } + }, + "versions": { + "harness|drop|3": 1, + "harness|gsm8k|5": 0, + "harness|winogrande|5": 0, + "all": 0 + }, + "config_tasks": { + "harness|drop": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|drop|3": { + "hashes": { + "hash_examples": "1d27416e8324e9a3", + "hash_full_prompts": "a5513ff9a741b385", + "hash_input_tokens": "a4fb946366902edf", + "hash_cont_tokens": "8abb410b1c8dbaab" + }, + "truncated": 0, + "non-truncated": 9536, + "padded": 0, + "non-padded": 9536, + "effective_few_shots": 3.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "6af0ae8cfe684f50", + "hash_cont_tokens": "037813bce24df84c" + }, + "truncated": 0, + "non-truncated": 1319, + "padded": 0, + "non-padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "6bf335f26fed6442", + "hash_cont_tokens": "618558fb93c0f288" + }, + "truncated": 0, + "non-truncated": 2534, + "padded": 2397, + "non-padded": 137, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "9b4d8993161e637d", + "hash_full_prompts": "08215e527b7e60a5", + "hash_input_tokens": "7c6dc027ca91c1a8", + "hash_cont_tokens": "6c4bb735f4f49625" + }, + "total_evaluation_time_secondes": "8170.495888233185", + "truncated": 0, + "non-truncated": 13389, + "padded": 2397, + "non-padded": 10992, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/mistralai/Mistral-7B-Instruct-v0.2/results_2023-12-12T03-37-50.599841.json b/eval-results/mistralai/Mistral-7B-Instruct-v0.2/results_2023-12-12T03-37-50.599841.json new file mode 100644 index 0000000000000000000000000000000000000000..508e09681b21bfbf366af857c5b1fd927e4aa812 --- /dev/null +++ b/eval-results/mistralai/Mistral-7B-Instruct-v0.2/results_2023-12-12T03-37-50.599841.json @@ -0,0 +1,1409 @@ +{ + "config_general": { + "lighteval_sha": "0e4607eff593f6f842aeaa0e5fa6760f58b9d1e9", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "", + "start_time": 799683.843728635, + "end_time": 807141.350763106, + "total_evaluation_time_secondes": "7457.507034471026", + "model_name": "mistralai/Mistral-7B-Instruct-v0.2", + "model_sha": "c72e5d1908b1e2929ec8fc4c8820e9706af1f80f", + "model_dtype": "torch.bfloat16", + "model_size": "13.99 GB" + }, + "results": { + "harness|arc:challenge|25": { + "acc": 0.5895904436860068, + "acc_stderr": 0.014374922192642664, + "acc_norm": 0.6313993174061433, + "acc_norm_stderr": 0.014097810678042203 + }, + "harness|hellaswag|10": { + "acc": 0.6677952599083847, + "acc_stderr": 0.0047004138249425636, + "acc_norm": 0.8488348934475204, + "acc_norm_stderr": 0.003574776594108505 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.32, + "acc_stderr": 0.046882617226215034, + "acc_norm": 0.32, + "acc_norm_stderr": 0.046882617226215034 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.5703703703703704, + "acc_stderr": 0.042763494943765995, + "acc_norm": 0.5703703703703704, + "acc_norm_stderr": 0.042763494943765995 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.625, + "acc_stderr": 0.039397364351956274, + 
"acc_norm": 0.625, + "acc_norm_stderr": 0.039397364351956274 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.6, + "acc_stderr": 0.04923659639173309, + "acc_norm": 0.6, + "acc_norm_stderr": 0.04923659639173309 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.6716981132075471, + "acc_stderr": 0.02890159361241178, + "acc_norm": 0.6716981132075471, + "acc_norm_stderr": 0.02890159361241178 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.6944444444444444, + "acc_stderr": 0.03852084696008534, + "acc_norm": 0.6944444444444444, + "acc_norm_stderr": 0.03852084696008534 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.4, + "acc_stderr": 0.04923659639173309, + "acc_norm": 0.4, + "acc_norm_stderr": 0.04923659639173309 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.53, + "acc_stderr": 0.050161355804659205, + "acc_norm": 0.53, + "acc_norm_stderr": 0.050161355804659205 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.39, + "acc_stderr": 0.04902071300001974, + "acc_norm": 0.39, + "acc_norm_stderr": 0.04902071300001974 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.5895953757225434, + "acc_stderr": 0.037507570448955356, + "acc_norm": 0.5895953757225434, + "acc_norm_stderr": 0.037507570448955356 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.4215686274509804, + "acc_stderr": 0.04913595201274498, + "acc_norm": 0.4215686274509804, + "acc_norm_stderr": 0.04913595201274498 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.69, + "acc_stderr": 0.04648231987117316, + "acc_norm": 0.69, + "acc_norm_stderr": 0.04648231987117316 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.5319148936170213, + "acc_stderr": 0.03261936918467382, + "acc_norm": 0.5319148936170213, + "acc_norm_stderr": 0.03261936918467382 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.40350877192982454, + "acc_stderr": 0.04615186962583703, + "acc_norm": 0.40350877192982454, + "acc_norm_stderr": 0.04615186962583703 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.6137931034482759, + "acc_stderr": 0.04057324734419035, + "acc_norm": 0.6137931034482759, + "acc_norm_stderr": 0.04057324734419035 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.36772486772486773, + "acc_stderr": 0.024833839825562413, + "acc_norm": 0.36772486772486773, + "acc_norm_stderr": 0.024833839825562413 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.42063492063492064, + "acc_stderr": 0.04415438226743744, + "acc_norm": 0.42063492063492064, + "acc_norm_stderr": 0.04415438226743744 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.35, + "acc_stderr": 0.0479372485441102, + "acc_norm": 0.35, + "acc_norm_stderr": 0.0479372485441102 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.632258064516129, + "acc_stderr": 0.02743086657997347, + "acc_norm": 0.632258064516129, + "acc_norm_stderr": 0.02743086657997347 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.5123152709359606, + "acc_stderr": 0.035169204442208966, + "acc_norm": 0.5123152709359606, + "acc_norm_stderr": 0.035169204442208966 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.65, + "acc_stderr": 0.047937248544110196, + "acc_norm": 0.65, + "acc_norm_stderr": 0.047937248544110196 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.7393939393939394, + "acc_stderr": 0.034277431758165236, + "acc_norm": 
0.7393939393939394, + "acc_norm_stderr": 0.034277431758165236 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.7626262626262627, + "acc_stderr": 0.030313710538198896, + "acc_norm": 0.7626262626262627, + "acc_norm_stderr": 0.030313710538198896 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.8549222797927462, + "acc_stderr": 0.025416343096306443, + "acc_norm": 0.8549222797927462, + "acc_norm_stderr": 0.025416343096306443 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.5564102564102564, + "acc_stderr": 0.025189149894764205, + "acc_norm": 0.5564102564102564, + "acc_norm_stderr": 0.025189149894764205 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.3, + "acc_stderr": 0.027940457136228395, + "acc_norm": 0.3, + "acc_norm_stderr": 0.027940457136228395 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.6554621848739496, + "acc_stderr": 0.030868682604121626, + "acc_norm": 0.6554621848739496, + "acc_norm_stderr": 0.030868682604121626 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.3576158940397351, + "acc_stderr": 0.03913453431177258, + "acc_norm": 0.3576158940397351, + "acc_norm_stderr": 0.03913453431177258 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.7908256880733945, + "acc_stderr": 0.017437937173343233, + "acc_norm": 0.7908256880733945, + "acc_norm_stderr": 0.017437937173343233 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.4444444444444444, + "acc_stderr": 0.03388857118502326, + "acc_norm": 0.4444444444444444, + "acc_norm_stderr": 0.03388857118502326 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.7647058823529411, + "acc_stderr": 0.029771775228145624, + "acc_norm": 0.7647058823529411, + "acc_norm_stderr": 0.029771775228145624 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.7552742616033755, + "acc_stderr": 0.027985699387036423, + "acc_norm": 0.7552742616033755, + "acc_norm_stderr": 0.027985699387036423 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.6188340807174888, + "acc_stderr": 0.03259625118416827, + "acc_norm": 0.6188340807174888, + "acc_norm_stderr": 0.03259625118416827 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.7404580152671756, + "acc_stderr": 0.03844876139785271, + "acc_norm": 0.7404580152671756, + "acc_norm_stderr": 0.03844876139785271 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.8099173553719008, + "acc_stderr": 0.03581796951709282, + "acc_norm": 0.8099173553719008, + "acc_norm_stderr": 0.03581796951709282 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.7407407407407407, + "acc_stderr": 0.042365112580946336, + "acc_norm": 0.7407407407407407, + "acc_norm_stderr": 0.042365112580946336 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.7300613496932515, + "acc_stderr": 0.034878251684978906, + "acc_norm": 0.7300613496932515, + "acc_norm_stderr": 0.034878251684978906 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.44642857142857145, + "acc_stderr": 0.047184714852195886, + "acc_norm": 0.44642857142857145, + "acc_norm_stderr": 0.047184714852195886 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.7572815533980582, + "acc_stderr": 0.04245022486384495, + "acc_norm": 0.7572815533980582, + "acc_norm_stderr": 0.04245022486384495 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.8632478632478633, + "acc_stderr": 0.022509033937077785, + "acc_norm": 
0.8632478632478633, + "acc_norm_stderr": 0.022509033937077785 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.67, + "acc_stderr": 0.04725815626252609, + "acc_norm": 0.67, + "acc_norm_stderr": 0.04725815626252609 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.7803320561941252, + "acc_stderr": 0.014805384478371153, + "acc_norm": 0.7803320561941252, + "acc_norm_stderr": 0.014805384478371153 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.6965317919075145, + "acc_stderr": 0.024752411960917205, + "acc_norm": 0.6965317919075145, + "acc_norm_stderr": 0.024752411960917205 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.311731843575419, + "acc_stderr": 0.015491756531894637, + "acc_norm": 0.311731843575419, + "acc_norm_stderr": 0.015491756531894637 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.6862745098039216, + "acc_stderr": 0.026568921015457138, + "acc_norm": 0.6862745098039216, + "acc_norm_stderr": 0.026568921015457138 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.7041800643086816, + "acc_stderr": 0.025922371788818777, + "acc_norm": 0.7041800643086816, + "acc_norm_stderr": 0.025922371788818777 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.7006172839506173, + "acc_stderr": 0.02548311560119546, + "acc_norm": 0.7006172839506173, + "acc_norm_stderr": 0.02548311560119546 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.450354609929078, + "acc_stderr": 0.029680105565029036, + "acc_norm": 0.450354609929078, + "acc_norm_stderr": 0.029680105565029036 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.4348109517601043, + "acc_stderr": 0.012661233805616302, + "acc_norm": 0.4348109517601043, + "acc_norm_stderr": 0.012661233805616302 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.6176470588235294, + "acc_stderr": 0.02952009569768776, + "acc_norm": 0.6176470588235294, + "acc_norm_stderr": 0.02952009569768776 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.630718954248366, + "acc_stderr": 0.019524316744866353, + "acc_norm": 0.630718954248366, + "acc_norm_stderr": 0.019524316744866353 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.7090909090909091, + "acc_stderr": 0.04350271442923243, + "acc_norm": 0.7090909090909091, + "acc_norm_stderr": 0.04350271442923243 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.7061224489795919, + "acc_stderr": 0.02916273841024977, + "acc_norm": 0.7061224489795919, + "acc_norm_stderr": 0.02916273841024977 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.7313432835820896, + "acc_stderr": 0.03134328358208954, + "acc_norm": 0.7313432835820896, + "acc_norm_stderr": 0.03134328358208954 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.81, + "acc_stderr": 0.03942772444036625, + "acc_norm": 0.81, + "acc_norm_stderr": 0.03942772444036625 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.4939759036144578, + "acc_stderr": 0.03892212195333047, + "acc_norm": 0.4939759036144578, + "acc_norm_stderr": 0.03892212195333047 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.8362573099415205, + "acc_stderr": 0.028380919596145866, + "acc_norm": 0.8362573099415205, + "acc_norm_stderr": 0.028380919596145866 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.5275397796817626, + "mc1_stderr": 0.01747693019071219, + "mc2": 0.6825629969752945, + "mc2_stderr": 0.015176655501749976 + }, + "harness|winogrande|5": { + "acc": 0.7719021310181531, + "acc_stderr": 
0.011793015817663597 + }, + "harness|gsm8k|5": { + "acc": 0.400303260045489, + "acc_stderr": 0.013495926436566438 + }, + "all": { + "acc": 0.6077550413417533, + "acc_stderr": 0.03310328786623656, + "acc_norm": 0.6122661125091963, + "acc_norm_stderr": 0.03377303167526721, + "mc1": 0.5275397796817626, + "mc1_stderr": 0.01747693019071219, + "mc2": 0.6825629969752945, + "mc2_stderr": 0.015176655501749976 + } + }, + "versions": { + "all": 0, + "harness|arc:challenge|25": 0, + "harness|gsm8k|5": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "harness|winogrande|5": 0 + }, + "config_tasks": { + "harness|arc:challenge": "LM Harness task", + "harness|gsm8k": "LM Harness 
task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + 
"harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|arc:challenge|25": { + "hashes": { + "hash_examples": "17b0cae357c0259e", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "9bcd0d1d37471713", + "hash_cont_tokens": "289aa98c400841d8" + }, + "truncated": 0, + "non_truncated": 1172, + "padded": 4670, + "non_padded": 17, + "effective_few_shots": 25.0, + "num_truncated_few_shots": 0 + }, + "harness|hellaswag|10": { + "hashes": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "80b8c6d79740318e", + "hash_cont_tokens": "ac460260c3e6efc9" + }, + "truncated": 0, + "non_truncated": 10042, + "padded": 40101, + "non_padded": 67, + "effective_few_shots": 10.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hashes": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "b813d36287c6556c", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-anatomy|5": { + "hashes": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "09dc2380497f7a47", + "hash_cont_tokens": "a52a4f60d98cbe5c" + }, + "truncated": 0, + "non_truncated": 135, + "padded": 540, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-astronomy|5": { + "hashes": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "68ca3220b0fdd1f3", + "hash_cont_tokens": "10f7d8eeba97841d" + }, + "truncated": 0, + "non_truncated": 152, + "padded": 608, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-business_ethics|5": { + "hashes": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "bd14ef1320de241e", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hashes": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "d96186ab98017c43", + "hash_cont_tokens": "edef9975ba9165b5" + }, + "truncated": 0, + "non_truncated": 265, + "padded": 1060, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_biology|5": { + "hashes": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "424136b34e95b200", + "hash_cont_tokens": "0aa103ec6602280b" + }, + "truncated": 0, + "non_truncated": 144, + "padded": 576, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_chemistry|5": { + "hashes": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "8dd8b80e336bbe54", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + 
"padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_computer_science|5": { + "hashes": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "145d4cef8ca2261d", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_mathematics|5": { + "hashes": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "561995d32d2b25c4", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_medicine|5": { + "hashes": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "6a258a9d4418599c", + "hash_cont_tokens": "1979021dbc698754" + }, + "truncated": 0, + "non_truncated": 173, + "padded": 692, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_physics|5": { + "hashes": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "fa5e0d5b5f97b66a", + "hash_cont_tokens": "7cf7fe2bab00acbd" + }, + "truncated": 0, + "non_truncated": 102, + "padded": 408, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-computer_security|5": { + "hashes": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "07d27397edfae492", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hashes": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "da5e6c3c8eb17da6", + "hash_cont_tokens": "903f64eed2b0d217" + }, + "truncated": 0, + "non_truncated": 235, + "padded": 940, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-econometrics|5": { + "hashes": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "f6ba8e358bdb523e", + "hash_cont_tokens": "721ae6c5302c4bf2" + }, + "truncated": 0, + "non_truncated": 114, + "padded": 456, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hashes": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "b2459da4c5ca8590", + "hash_cont_tokens": "15a738960ed3e587" + }, + "truncated": 0, + "non_truncated": 145, + "padded": 575, + "non_padded": 5, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hashes": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "0b969d9ad706a13a", + "hash_cont_tokens": "c96470462fc71683" + }, + "truncated": 0, + "non_truncated": 378, + "padded": 1512, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + 
"harness|hendrycksTest-formal_logic|5": { + "hashes": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "02bc3eb5f90da86e", + "hash_cont_tokens": "0e1ce025c9d6ee7e" + }, + "truncated": 0, + "non_truncated": 126, + "padded": 504, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-global_facts|5": { + "hashes": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "3d5106918bcbeb43", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_biology|5": { + "hashes": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "7b089392db2dabbd", + "hash_cont_tokens": "e34d57f7d3c4ca16" + }, + "truncated": 0, + "non_truncated": 310, + "padded": 1240, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hashes": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "ba90b2ffed1c067d", + "hash_cont_tokens": "e8482d44df4b3740" + }, + "truncated": 0, + "non_truncated": 203, + "padded": 812, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hashes": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "60eeec309ef0717f", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hashes": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "5e5e8bf3808e0ead", + "hash_cont_tokens": "d63e679a49418339" + }, + "truncated": 0, + "non_truncated": 165, + "padded": 656, + "non_padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_geography|5": { + "hashes": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "4da9b741d4e7ea78", + "hash_cont_tokens": "d78483e286d06f1a" + }, + "truncated": 0, + "non_truncated": 198, + "padded": 792, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hashes": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "acb4bc872ac86ed7", + "hash_cont_tokens": "691cdff71ff5fe57" + }, + "truncated": 0, + "non_truncated": 193, + "padded": 772, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hashes": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "840fc6403eb69ab0", + "hash_cont_tokens": "d5ad4c5bdca967ad" + }, + "truncated": 0, + "non_truncated": 390, + "padded": 1560, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hashes": { + "hash_examples": 
"1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "3629a7f2cd17faeb", + "hash_cont_tokens": "8f631ca5687dd0d4" + }, + "truncated": 0, + "non_truncated": 270, + "padded": 1080, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hashes": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "6846f684260e3997", + "hash_cont_tokens": "7321048a28451473" + }, + "truncated": 0, + "non_truncated": 238, + "padded": 952, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_physics|5": { + "hashes": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "85aee25d6bdad94a", + "hash_cont_tokens": "bb137581f269861c" + }, + "truncated": 0, + "non_truncated": 151, + "padded": 604, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hashes": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "290b66d6d666a35f", + "hash_cont_tokens": "b455cab2675bd863" + }, + "truncated": 0, + "non_truncated": 545, + "padded": 2180, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hashes": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "a77a7668b437bc82", + "hash_cont_tokens": "1b3196fec7e58037" + }, + "truncated": 0, + "non_truncated": 216, + "padded": 864, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hashes": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "63548c7fa9ba7a78", + "hash_cont_tokens": "a331dedc2aa01b3e" + }, + "truncated": 0, + "non_truncated": 204, + "padded": 816, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hashes": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "83c5da18bfa50812", + "hash_cont_tokens": "d0fbe030b8c8c2bf" + }, + "truncated": 0, + "non_truncated": 237, + "padded": 948, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_aging|5": { + "hashes": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "bebbd11f22006685", + "hash_cont_tokens": "1dd29c3755494850" + }, + "truncated": 0, + "non_truncated": 223, + "padded": 892, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_sexuality|5": { + "hashes": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "7b85ee9b8ee54f4f", + "hash_cont_tokens": "c85573f663c10691" + }, + "truncated": 0, + "non_truncated": 131, + "padded": 524, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-international_law|5": { + "hashes": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "7bfc55ab7065943e", + 
"hash_cont_tokens": "d263804ba918154f" + }, + "truncated": 0, + "non_truncated": 121, + "padded": 484, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-jurisprudence|5": { + "hashes": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "69573f1675e053c6", + "hash_cont_tokens": "581986691a84ece8" + }, + "truncated": 0, + "non_truncated": 108, + "padded": 432, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hashes": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "552324ef20094bdc", + "hash_cont_tokens": "55a858b28bbda458" + }, + "truncated": 0, + "non_truncated": 163, + "padded": 652, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-machine_learning|5": { + "hashes": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "96449357a7318905", + "hash_cont_tokens": "e99d3d3efd4ac7a3" + }, + "truncated": 0, + "non_truncated": 112, + "padded": 448, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-management|5": { + "hashes": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "3b849249168e3b88", + "hash_cont_tokens": "13d9dc56bca34726" + }, + "truncated": 0, + "non_truncated": 103, + "padded": 412, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-marketing|5": { + "hashes": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "af0e186f2756b70d", + "hash_cont_tokens": "2700ea26933916a2" + }, + "truncated": 0, + "non_truncated": 234, + "padded": 936, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-medical_genetics|5": { + "hashes": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "9f6a6de16509b6d9", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-miscellaneous|5": { + "hashes": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "9194406d589f7c10", + "hash_cont_tokens": "7bf4341c79587250" + }, + "truncated": 0, + "non_truncated": 783, + "padded": 3132, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_disputes|5": { + "hashes": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "769486efc74d9f8e", + "hash_cont_tokens": "38a48e9de6976f00" + }, + "truncated": 0, + "non_truncated": 346, + "padded": 1384, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hashes": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "a90fd4dd90959dad", + "hash_cont_tokens": "761c4dc187689d89" + }, + "truncated": 0, + "non_truncated": 895, + "padded": 3580, + "non_padded": 0, + "effective_few_shots": 5.0, + 
"num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-nutrition|5": { + "hashes": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "1a3b843e66efd29b", + "hash_cont_tokens": "65005bd7d6f6012a" + }, + "truncated": 0, + "non_truncated": 306, + "padded": 1224, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-philosophy|5": { + "hashes": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "09820001a3d00013", + "hash_cont_tokens": "0b47934fb6314dec" + }, + "truncated": 0, + "non_truncated": 311, + "padded": 1244, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-prehistory|5": { + "hashes": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "7c4ec364ce2768c7", + "hash_cont_tokens": "3f20acd855ee0a29" + }, + "truncated": 0, + "non_truncated": 324, + "padded": 1296, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_accounting|5": { + "hashes": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "ced0534574d0ae3f", + "hash_cont_tokens": "8f122ba881355d4b" + }, + "truncated": 0, + "non_truncated": 282, + "padded": 1128, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_law|5": { + "hashes": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "bcbdbbde22ec73e3", + "hash_cont_tokens": "90d5df417c4d3fd3" + }, + "truncated": 0, + "non_truncated": 1534, + "padded": 6136, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_medicine|5": { + "hashes": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "c54d753563114d45", + "hash_cont_tokens": "4a2d2988884f7f70" + }, + "truncated": 0, + "non_truncated": 272, + "padded": 1088, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_psychology|5": { + "hashes": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "b75dc55c0e32fa52", + "hash_cont_tokens": "e0a952cb8a9c81de" + }, + "truncated": 0, + "non_truncated": 612, + "padded": 2448, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-public_relations|5": { + "hashes": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "5ccdc8ec8db99622", + "hash_cont_tokens": "1fa77a8dff3922b8" + }, + "truncated": 0, + "non_truncated": 110, + "padded": 440, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-security_studies|5": { + "hashes": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "ca8497342e5b1d57", + "hash_cont_tokens": "81fc9cb3cbdd52db" + }, + "truncated": 0, + "non_truncated": 245, + "padded": 980, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-sociology|5": { + "hashes": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": 
"1a1fc00e17b3a52a", + "hash_input_tokens": "069c76424fbd3dab", + "hash_cont_tokens": "2a0493252ed2cf43" + }, + "truncated": 0, + "non_truncated": 201, + "padded": 804, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hashes": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "a7e393a626169576", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-virology|5": { + "hashes": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "bf99dc973e3a650d", + "hash_cont_tokens": "5ab892d003b00c98" + }, + "truncated": 0, + "non_truncated": 166, + "padded": 664, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-world_religions|5": { + "hashes": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "1761cfaf21797065", + "hash_cont_tokens": "15a5e5dbdfbb8568" + }, + "truncated": 0, + "non_truncated": 171, + "padded": 684, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|truthfulqa:mc|0": { + "hashes": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "298b43914bbdf4ca", + "hash_cont_tokens": "5a8d4bb398b1c3c0" + }, + "truncated": 0, + "non_truncated": 817, + "padded": 9996, + "non_padded": 0, + "effective_few_shots": 0.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "31aa3477d959f771", + "hash_cont_tokens": "618558fb93c0f288" + }, + "truncated": 0, + "non_truncated": 1267, + "padded": 2534, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "6af0ae8cfe684f50", + "hash_cont_tokens": "290f45eedc561b39" + }, + "truncated": 0, + "non_truncated": 1319, + "padded": 0, + "non_padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "3b7fa57a057f9415", + "hash_full_prompts": "63615fc50fc9417c", + "hash_input_tokens": "9c04e828ae29cacc", + "hash_cont_tokens": "612f615f0269bb92" + }, + "truncated": 0, + "non_truncated": 28659, + "padded": 113460, + "non_padded": 1412, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/mistralai/Mistral-7B-v0.1/results_2023-09-27T15-30-59.039834.json b/eval-results/mistralai/Mistral-7B-v0.1/results_2023-09-27T15-30-59.039834.json new file mode 100644 index 0000000000000000000000000000000000000000..6b870d1f68317a1b8abcd7fc0e71c4a2eee1019a --- /dev/null +++ b/eval-results/mistralai/Mistral-7B-v0.1/results_2023-09-27T15-30-59.039834.json @@ -0,0 +1,1367 @@ +{ + "config_general": { + "model_name": "mistralai/Mistral-7B-v0.1", + "model_sha": "e836d8f71b5812f9fee65618453dc537c66bd82a", + "model_size": "13.99 GB", + "model_dtype": "torch.float16", + "lighteval_sha": "0f318ecf002208468154899217b3ba7c6ae09374", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + 
"max_samples": null, + "job_id": "" + }, + "results": { + "harness|arc:challenge|25": { + "acc": 0.568259385665529, + "acc_stderr": 0.014474591427196202, + "acc_norm": 0.5998293515358362, + "acc_norm_stderr": 0.014317197787809172 + }, + "harness|hellaswag|10": { + "acc": 0.6294562836088429, + "acc_stderr": 0.00481963366883254, + "acc_norm": 0.8331009759012149, + "acc_norm_stderr": 0.0037212361965025162 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.29, + "acc_stderr": 0.045604802157206845, + "acc_norm": 0.29, + "acc_norm_stderr": 0.045604802157206845 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.6296296296296297, + "acc_stderr": 0.041716541613545426, + "acc_norm": 0.6296296296296297, + "acc_norm_stderr": 0.041716541613545426 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.6578947368421053, + "acc_stderr": 0.03860731599316091, + "acc_norm": 0.6578947368421053, + "acc_norm_stderr": 0.03860731599316091 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.57, + "acc_stderr": 0.049756985195624284, + "acc_norm": 0.57, + "acc_norm_stderr": 0.049756985195624284 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.6943396226415094, + "acc_stderr": 0.028353298073322663, + "acc_norm": 0.6943396226415094, + "acc_norm_stderr": 0.028353298073322663 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.7291666666666666, + "acc_stderr": 0.03716177437566017, + "acc_norm": 0.7291666666666666, + "acc_norm_stderr": 0.03716177437566017 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.52, + "acc_stderr": 0.050211673156867795, + "acc_norm": 0.52, + "acc_norm_stderr": 0.050211673156867795 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.52, + "acc_stderr": 0.050211673156867795, + "acc_norm": 0.52, + "acc_norm_stderr": 0.050211673156867795 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.4, + "acc_stderr": 0.04923659639173309, + "acc_norm": 0.4, + "acc_norm_stderr": 0.04923659639173309 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.6647398843930635, + "acc_stderr": 0.03599586301247077, + "acc_norm": 0.6647398843930635, + "acc_norm_stderr": 0.03599586301247077 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.39215686274509803, + "acc_stderr": 0.04858083574266346, + "acc_norm": 0.39215686274509803, + "acc_norm_stderr": 0.04858083574266346 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.77, + "acc_stderr": 0.042295258468165065, + "acc_norm": 0.77, + "acc_norm_stderr": 0.042295258468165065 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.574468085106383, + "acc_stderr": 0.03232146916224468, + "acc_norm": 0.574468085106383, + "acc_norm_stderr": 0.03232146916224468 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.5, + "acc_stderr": 0.047036043419179864, + "acc_norm": 0.5, + "acc_norm_stderr": 0.047036043419179864 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.5724137931034483, + "acc_stderr": 0.04122737111370332, + "acc_norm": 0.5724137931034483, + "acc_norm_stderr": 0.04122737111370332 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.373015873015873, + "acc_stderr": 0.02490699045899257, + "acc_norm": 0.373015873015873, + "acc_norm_stderr": 0.02490699045899257 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.4126984126984127, + "acc_stderr": 0.04403438954768177, + "acc_norm": 0.4126984126984127, + "acc_norm_stderr": 0.04403438954768177 + }, + 
"harness|hendrycksTest-global_facts|5": { + "acc": 0.37, + "acc_stderr": 0.04852365870939099, + "acc_norm": 0.37, + "acc_norm_stderr": 0.04852365870939099 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.7709677419354839, + "acc_stderr": 0.023904914311782648, + "acc_norm": 0.7709677419354839, + "acc_norm_stderr": 0.023904914311782648 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.5270935960591133, + "acc_stderr": 0.03512819077876106, + "acc_norm": 0.5270935960591133, + "acc_norm_stderr": 0.03512819077876106 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.68, + "acc_stderr": 0.04688261722621504, + "acc_norm": 0.68, + "acc_norm_stderr": 0.04688261722621504 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.7818181818181819, + "acc_stderr": 0.032250781083062896, + "acc_norm": 0.7818181818181819, + "acc_norm_stderr": 0.032250781083062896 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.7727272727272727, + "acc_stderr": 0.029857515673386417, + "acc_norm": 0.7727272727272727, + "acc_norm_stderr": 0.029857515673386417 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.8652849740932642, + "acc_stderr": 0.02463978909770944, + "acc_norm": 0.8652849740932642, + "acc_norm_stderr": 0.02463978909770944 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.6666666666666666, + "acc_stderr": 0.023901157979402534, + "acc_norm": 0.6666666666666666, + "acc_norm_stderr": 0.023901157979402534 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.337037037037037, + "acc_stderr": 0.028820884666253255, + "acc_norm": 0.337037037037037, + "acc_norm_stderr": 0.028820884666253255 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.6596638655462185, + "acc_stderr": 0.030778057422931673, + "acc_norm": 0.6596638655462185, + "acc_norm_stderr": 0.030778057422931673 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.32450331125827814, + "acc_stderr": 0.038227469376587525, + "acc_norm": 0.32450331125827814, + "acc_norm_stderr": 0.038227469376587525 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.8238532110091743, + "acc_stderr": 0.016332882393431385, + "acc_norm": 0.8238532110091743, + "acc_norm_stderr": 0.016332882393431385 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.5740740740740741, + "acc_stderr": 0.03372343271653062, + "acc_norm": 0.5740740740740741, + "acc_norm_stderr": 0.03372343271653062 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.7990196078431373, + "acc_stderr": 0.028125972265654373, + "acc_norm": 0.7990196078431373, + "acc_norm_stderr": 0.028125972265654373 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.7721518987341772, + "acc_stderr": 0.027303484599069436, + "acc_norm": 0.7721518987341772, + "acc_norm_stderr": 0.027303484599069436 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.7040358744394619, + "acc_stderr": 0.030636591348699803, + "acc_norm": 0.7040358744394619, + "acc_norm_stderr": 0.030636591348699803 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.7938931297709924, + "acc_stderr": 0.03547771004159463, + "acc_norm": 0.7938931297709924, + "acc_norm_stderr": 0.03547771004159463 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.7768595041322314, + "acc_stderr": 0.03800754475228732, + "acc_norm": 0.7768595041322314, + "acc_norm_stderr": 0.03800754475228732 
+ }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.7777777777777778, + "acc_stderr": 0.040191074725573483, + "acc_norm": 0.7777777777777778, + "acc_norm_stderr": 0.040191074725573483 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.7914110429447853, + "acc_stderr": 0.031921934489347235, + "acc_norm": 0.7914110429447853, + "acc_norm_stderr": 0.031921934489347235 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.48214285714285715, + "acc_stderr": 0.047427623612430116, + "acc_norm": 0.48214285714285715, + "acc_norm_stderr": 0.047427623612430116 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.8155339805825242, + "acc_stderr": 0.03840423627288276, + "acc_norm": 0.8155339805825242, + "acc_norm_stderr": 0.03840423627288276 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.8717948717948718, + "acc_stderr": 0.02190190511507333, + "acc_norm": 0.8717948717948718, + "acc_norm_stderr": 0.02190190511507333 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.74, + "acc_stderr": 0.04408440022768078, + "acc_norm": 0.74, + "acc_norm_stderr": 0.04408440022768078 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.8173690932311622, + "acc_stderr": 0.013816335389973136, + "acc_norm": 0.8173690932311622, + "acc_norm_stderr": 0.013816335389973136 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.7109826589595376, + "acc_stderr": 0.02440517393578323, + "acc_norm": 0.7109826589595376, + "acc_norm_stderr": 0.02440517393578323 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.32513966480446926, + "acc_stderr": 0.01566654278505355, + "acc_norm": 0.32513966480446926, + "acc_norm_stderr": 0.01566654278505355 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.7581699346405228, + "acc_stderr": 0.024518195641879334, + "acc_norm": 0.7581699346405228, + "acc_norm_stderr": 0.024518195641879334 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.6977491961414791, + "acc_stderr": 0.026082700695399665, + "acc_norm": 0.6977491961414791, + "acc_norm_stderr": 0.026082700695399665 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.7345679012345679, + "acc_stderr": 0.024569223600460845, + "acc_norm": 0.7345679012345679, + "acc_norm_stderr": 0.024569223600460845 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.4858156028368794, + "acc_stderr": 0.02981549448368206, + "acc_norm": 0.4858156028368794, + "acc_norm_stderr": 0.02981549448368206 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.44784876140808344, + "acc_stderr": 0.01270058240476822, + "acc_norm": 0.44784876140808344, + "acc_norm_stderr": 0.01270058240476822 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.6911764705882353, + "acc_stderr": 0.02806499816704009, + "acc_norm": 0.6911764705882353, + "acc_norm_stderr": 0.02806499816704009 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.6813725490196079, + "acc_stderr": 0.01885008469646872, + "acc_norm": 0.6813725490196079, + "acc_norm_stderr": 0.01885008469646872 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.6727272727272727, + "acc_stderr": 0.0449429086625209, + "acc_norm": 0.6727272727272727, + "acc_norm_stderr": 0.0449429086625209 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.726530612244898, + "acc_stderr": 0.028535560337128448, + "acc_norm": 0.726530612244898, + "acc_norm_stderr": 0.028535560337128448 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.8308457711442786, + "acc_stderr": 
0.026508590656233264, + "acc_norm": 0.8308457711442786, + "acc_norm_stderr": 0.026508590656233264 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.86, + "acc_stderr": 0.034873508801977704, + "acc_norm": 0.86, + "acc_norm_stderr": 0.034873508801977704 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.5542168674698795, + "acc_stderr": 0.03869543323472101, + "acc_norm": 0.5542168674698795, + "acc_norm_stderr": 0.03869543323472101 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.8304093567251462, + "acc_stderr": 0.02878210810540171, + "acc_norm": 0.8304093567251462, + "acc_norm_stderr": 0.02878210810540171 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.2802937576499388, + "mc1_stderr": 0.015723139524608763, + "mc2": 0.4215317106968115, + "mc2_stderr": 0.014138129483133954 + }, + "all": { + "acc": 0.6401944309606807, + "acc_stderr": 0.03294634536642965, + "acc_norm": 0.6441811200820821, + "acc_norm_stderr": 0.0329250607713158, + "mc1": 0.2802937576499388, + "mc1_stderr": 0.015723139524608763, + "mc2": 0.4215317106968115, + "mc2_stderr": 0.014138129483133954 + } + }, + "versions": { + "harness|arc:challenge|25": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + 
"harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "all": 0 + }, + "config_tasks": { + "harness|arc:challenge": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + 
"harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task" + }, + "summary_tasks": { + "harness|arc:challenge|25": { + "hashes": { + "hash_examples": "17b0cae357c0259e", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "e43adcaa871b1364", + "hash_cont_tokens": "289aa98c400841d8" + }, + "truncated": 0, + "non-truncated": 4687, + "padded": 4684, + "non-padded": 3, + "effective_few_shots": 25.0, + "num_truncated_few_shots": 0 + }, + "harness|hellaswag|10": { + "hashes": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "08da6b3d0798f3e5", + "hash_cont_tokens": "ac460260c3e6efc9" + }, + "truncated": 0, + "non-truncated": 40168, + "padded": 40039, + "non-padded": 129, + "effective_few_shots": 10.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hashes": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "5e2b26eb9b4d08bf", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-anatomy|5": { + "hashes": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "d33cda9df28030eb", + "hash_cont_tokens": "a52a4f60d98cbe5c" + }, + "truncated": 0, + "non-truncated": 540, + "padded": 540, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-astronomy|5": { + "hashes": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "0dd50c500d64c57d", + "hash_cont_tokens": "10f7d8eeba97841d" + }, + "truncated": 0, + "non-truncated": 608, + "padded": 608, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-business_ethics|5": { + "hashes": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "40b524d0df3defc2", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hashes": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "1f87d12d677e0dfd", + "hash_cont_tokens": "edef9975ba9165b5" + }, + "truncated": 0, + "non-truncated": 1060, + "padded": 1056, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_biology|5": { + "hashes": { + "hash_examples": 
"ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "dd6d69d8b13afbeb", + "hash_cont_tokens": "0aa103ec6602280b" + }, + "truncated": 0, + "non-truncated": 576, + "padded": 572, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_chemistry|5": { + "hashes": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "d45f3c401a00e97e", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_computer_science|5": { + "hashes": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "c04f21d954ae67b2", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_mathematics|5": { + "hashes": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "e7de03b4e1a407d8", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_medicine|5": { + "hashes": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "9ce9516475f0b09c", + "hash_cont_tokens": "1979021dbc698754" + }, + "truncated": 0, + "non-truncated": 692, + "padded": 684, + "non-padded": 8, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_physics|5": { + "hashes": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "f749592a0d6c967d", + "hash_cont_tokens": "7cf7fe2bab00acbd" + }, + "truncated": 0, + "non-truncated": 408, + "padded": 404, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-computer_security|5": { + "hashes": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "1a6dccf2066f3598", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hashes": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "6ce98c8aec8e7514", + "hash_cont_tokens": "903f64eed2b0d217" + }, + "truncated": 0, + "non-truncated": 940, + "padded": 940, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-econometrics|5": { + "hashes": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "7794b03bf6b9bb11", + "hash_cont_tokens": "721ae6c5302c4bf2" + }, + "truncated": 0, + "non-truncated": 456, + "padded": 456, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hashes": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "e47ff85e05850517", + "hash_cont_tokens": 
"15a738960ed3e587" + }, + "truncated": 0, + "non-truncated": 580, + "padded": 580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hashes": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "2ce6901704311790", + "hash_cont_tokens": "c96470462fc71683" + }, + "truncated": 0, + "non-truncated": 1512, + "padded": 1512, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-formal_logic|5": { + "hashes": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "fa49c3faa72a3955", + "hash_cont_tokens": "0e1ce025c9d6ee7e" + }, + "truncated": 0, + "non-truncated": 504, + "padded": 504, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-global_facts|5": { + "hashes": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "38992a391c7040d5", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 396, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_biology|5": { + "hashes": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "4944fad6e0578120", + "hash_cont_tokens": "e34d57f7d3c4ca16" + }, + "truncated": 0, + "non-truncated": 1240, + "padded": 1240, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hashes": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "bec955dfccee0331", + "hash_cont_tokens": "e8482d44df4b3740" + }, + "truncated": 0, + "non-truncated": 812, + "padded": 796, + "non-padded": 16, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hashes": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "2ccfe020e0a8e824", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hashes": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "5e5e8bf3808e0ead", + "hash_cont_tokens": "d63e679a49418339" + }, + "truncated": 0, + "non-truncated": 660, + "padded": 656, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_geography|5": { + "hashes": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "6a624d76e1b40f9d", + "hash_cont_tokens": "d78483e286d06f1a" + }, + "truncated": 0, + "non-truncated": 792, + "padded": 792, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hashes": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "8340aed0285230f4", + "hash_cont_tokens": "691cdff71ff5fe57" + }, + "truncated": 0, + "non-truncated": 772, + "padded": 772, + "non-padded": 0, 
+ "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hashes": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "ca47137b1f3a769c", + "hash_cont_tokens": "d5ad4c5bdca967ad" + }, + "truncated": 0, + "non-truncated": 1560, + "padded": 1560, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hashes": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "c9d341ab62890f30", + "hash_cont_tokens": "8f631ca5687dd0d4" + }, + "truncated": 0, + "non-truncated": 1080, + "padded": 1080, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hashes": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "62573d06618ae7df", + "hash_cont_tokens": "7321048a28451473" + }, + "truncated": 0, + "non-truncated": 952, + "padded": 952, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_physics|5": { + "hashes": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "ddddcaae96263221", + "hash_cont_tokens": "bb137581f269861c" + }, + "truncated": 0, + "non-truncated": 604, + "padded": 604, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hashes": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "ef9c1ae343139fdd", + "hash_cont_tokens": "b455cab2675bd863" + }, + "truncated": 0, + "non-truncated": 2180, + "padded": 2161, + "non-padded": 19, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hashes": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "eb4abd87b0e863cc", + "hash_cont_tokens": "1b3196fec7e58037" + }, + "truncated": 0, + "non-truncated": 864, + "padded": 864, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hashes": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "63548c7fa9ba7a78", + "hash_cont_tokens": "a331dedc2aa01b3e" + }, + "truncated": 0, + "non-truncated": 816, + "padded": 816, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hashes": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "83c5da18bfa50812", + "hash_cont_tokens": "d0fbe030b8c8c2bf" + }, + "truncated": 0, + "non-truncated": 948, + "padded": 948, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_aging|5": { + "hashes": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "c93c778cb8c58a32", + "hash_cont_tokens": "1dd29c3755494850" + }, + "truncated": 0, + "non-truncated": 892, + "padded": 892, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + 
"harness|hendrycksTest-human_sexuality|5": { + "hashes": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "1daed91f54b42f7d", + "hash_cont_tokens": "c85573f663c10691" + }, + "truncated": 0, + "non-truncated": 524, + "padded": 524, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-international_law|5": { + "hashes": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "cfdae69f75ee8670", + "hash_cont_tokens": "d263804ba918154f" + }, + "truncated": 0, + "non-truncated": 484, + "padded": 484, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-jurisprudence|5": { + "hashes": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "173979adbb5ab44e", + "hash_cont_tokens": "581986691a84ece8" + }, + "truncated": 0, + "non-truncated": 432, + "padded": 432, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hashes": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "7b7d06271aff55ff", + "hash_cont_tokens": "55a858b28bbda458" + }, + "truncated": 0, + "non-truncated": 652, + "padded": 652, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-machine_learning|5": { + "hashes": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "ca062cfd7c7fddcb", + "hash_cont_tokens": "e99d3d3efd4ac7a3" + }, + "truncated": 0, + "non-truncated": 448, + "padded": 445, + "non-padded": 3, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-management|5": { + "hashes": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "fc47171ffb714da3", + "hash_cont_tokens": "13d9dc56bca34726" + }, + "truncated": 0, + "non-truncated": 412, + "padded": 412, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-marketing|5": { + "hashes": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "aa29e9d883670c8f", + "hash_cont_tokens": "2700ea26933916a2" + }, + "truncated": 0, + "non-truncated": 936, + "padded": 936, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-medical_genetics|5": { + "hashes": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "88ad044b653ecaa5", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-miscellaneous|5": { + "hashes": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "f9e7e01573277484", + "hash_cont_tokens": "7bf4341c79587250" + }, + "truncated": 0, + "non-truncated": 3132, + "padded": 3132, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_disputes|5": { + "hashes": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": 
"03728b9e48594c28", + "hash_cont_tokens": "38a48e9de6976f00" + }, + "truncated": 0, + "non-truncated": 1384, + "padded": 1360, + "non-padded": 24, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hashes": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "04a903966514d177", + "hash_cont_tokens": "761c4dc187689d89" + }, + "truncated": 0, + "non-truncated": 3580, + "padded": 3580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-nutrition|5": { + "hashes": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "a2176d3ac6f01cf0", + "hash_cont_tokens": "65005bd7d6f6012a" + }, + "truncated": 0, + "non-truncated": 1224, + "padded": 1224, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-philosophy|5": { + "hashes": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "a96dc872948245a8", + "hash_cont_tokens": "0b47934fb6314dec" + }, + "truncated": 0, + "non-truncated": 1244, + "padded": 1244, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-prehistory|5": { + "hashes": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "e0b03637947e9efa", + "hash_cont_tokens": "3f20acd855ee0a29" + }, + "truncated": 0, + "non-truncated": 1296, + "padded": 1296, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_accounting|5": { + "hashes": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "0b4c6d0e49c47ab4", + "hash_cont_tokens": "8f122ba881355d4b" + }, + "truncated": 0, + "non-truncated": 1128, + "padded": 1128, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_law|5": { + "hashes": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "bcbdbbde22ec73e3", + "hash_cont_tokens": "90d5df417c4d3fd3" + }, + "truncated": 0, + "non-truncated": 6136, + "padded": 6136, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_medicine|5": { + "hashes": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "c54d753563114d45", + "hash_cont_tokens": "4a2d2988884f7f70" + }, + "truncated": 0, + "non-truncated": 1088, + "padded": 1088, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_psychology|5": { + "hashes": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "9e6e34f48034edc0", + "hash_cont_tokens": "e0a952cb8a9c81de" + }, + "truncated": 0, + "non-truncated": 2448, + "padded": 2448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-public_relations|5": { + "hashes": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "634feb3f97d1064d", + "hash_cont_tokens": "1fa77a8dff3922b8" + }, + "truncated": 0, + "non-truncated": 440, + "padded": 440, + "non-padded": 0, 
+ "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-security_studies|5": { + "hashes": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "ca8497342e5b1d57", + "hash_cont_tokens": "81fc9cb3cbdd52db" + }, + "truncated": 0, + "non-truncated": 980, + "padded": 980, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-sociology|5": { + "hashes": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "ae361375c940a0fb", + "hash_cont_tokens": "2a0493252ed2cf43" + }, + "truncated": 0, + "non-truncated": 804, + "padded": 800, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hashes": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "e8bdf33cf82d89f5", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-virology|5": { + "hashes": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "32ce831e0ba2d2e2", + "hash_cont_tokens": "5ab892d003b00c98" + }, + "truncated": 0, + "non-truncated": 664, + "padded": 664, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-world_religions|5": { + "hashes": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "4ed9b68c5694211b", + "hash_cont_tokens": "15a5e5dbdfbb8568" + }, + "truncated": 0, + "non-truncated": 684, + "padded": 684, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|truthfulqa:mc|0": { + "hashes": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "a30fbd9af05d717a", + "hash_cont_tokens": "5a8d4bb398b1c3c0" + }, + "truncated": 0, + "non-truncated": 9996, + "padded": 9996, + "non-padded": 0, + "effective_few_shots": 0.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "d84d18e9a963753d", + "hash_full_prompts": "12b540783521a8e6", + "hash_input_tokens": "3d86ffeb7677bd9d", + "hash_cont_tokens": "35527140510ee91a" + }, + "total_evaluation_time_secondes": "4100.285229921341", + "truncated": 0, + "non-truncated": 111019, + "padded": 110793, + "non-padded": 226, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/mistralai/Mistral-7B-v0.1/results_2023-10-25T23-48-21.884715.json b/eval-results/mistralai/Mistral-7B-v0.1/results_2023-10-25T23-48-21.884715.json new file mode 100644 index 0000000000000000000000000000000000000000..1b175cb2aad548796d00752a73507e3d144559b5 --- /dev/null +++ b/eval-results/mistralai/Mistral-7B-v0.1/results_2023-10-25T23-48-21.884715.json @@ -0,0 +1,107 @@ +{ + "config_general": { + "model_name": "mistralai/Mistral-7B-v0.1", + "model_sha": "5e9c98b96d071dce59368012254c55b0ec6f8658", + "model_size": "13.99 GB", + "model_dtype": "torch.float16", + "lighteval_sha": "0f318ecf002208468154899217b3ba7c6ae09374", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|drop|3": { + "em": 
0.0014681208053691276, + "em_stderr": 0.00039210421902984423, + "f1": 0.061364303691275233, + "f1_stderr": 0.0013684794735811887 + }, + "harness|gsm8k|5": { + "acc": 0.18119787717968158, + "acc_stderr": 0.010609827611527364 + }, + "harness|winogrande|5": { + "acc": 0.7837411207576953, + "acc_stderr": 0.01157061486140935 + }, + "all": { + "em": 0.0014681208053691276, + "em_stderr": 0.00039210421902984423, + "f1": 0.061364303691275233, + "f1_stderr": 0.0013684794735811887, + "acc": 0.48246949896868846, + "acc_stderr": 0.011090221236468356 + } + }, + "versions": { + "harness|drop|3": 1, + "harness|gsm8k|5": 0, + "harness|winogrande|5": 0, + "all": 0 + }, + "config_tasks": { + "harness|drop": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|drop|3": { + "hashes": { + "hash_examples": "1d27416e8324e9a3", + "hash_full_prompts": "a5513ff9a741b385", + "hash_input_tokens": "a4fb946366902edf", + "hash_cont_tokens": "0a6965e8ea12133f" + }, + "truncated": 0, + "non-truncated": 9536, + "padded": 0, + "non-padded": 9536, + "effective_few_shots": 3.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "6af0ae8cfe684f50", + "hash_cont_tokens": "b2ad38ab14b90b20" + }, + "truncated": 0, + "non-truncated": 1319, + "padded": 0, + "non-padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "6bf335f26fed6442", + "hash_cont_tokens": "618558fb93c0f288" + }, + "truncated": 0, + "non-truncated": 2534, + "padded": 2397, + "non-padded": 137, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "9b4d8993161e637d", + "hash_full_prompts": "08215e527b7e60a5", + "hash_input_tokens": "7c6dc027ca91c1a8", + "hash_cont_tokens": "750aedd5a6e50f36" + }, + "total_evaluation_time_secondes": "10093.810881137848", + "truncated": 0, + "non-truncated": 13389, + "padded": 2397, + "non-padded": 10992, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/mistralai/Mistral-7B-v0.1/results_2023-10-26T01-29-53.089924.json b/eval-results/mistralai/Mistral-7B-v0.1/results_2023-10-26T01-29-53.089924.json new file mode 100644 index 0000000000000000000000000000000000000000..cbd586e4ac1ae5665e3de095d6e08e8d3981388f --- /dev/null +++ b/eval-results/mistralai/Mistral-7B-v0.1/results_2023-10-26T01-29-53.089924.json @@ -0,0 +1,107 @@ +{ + "config_general": { + "model_name": "mistralai/Mistral-7B-v0.1", + "model_sha": "5e9c98b96d071dce59368012254c55b0ec6f8658", + "model_size": "13.99 GB", + "model_dtype": "torch.bfloat16", + "lighteval_sha": "0f318ecf002208468154899217b3ba7c6ae09374", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|drop|3": { + "em": 0.001572986577181208, + "em_stderr": 0.00040584511324177333, + "f1": 0.06143666107382555, + "f1_stderr": 0.0013713061256604275 + }, + "harness|gsm8k|5": { + "acc": 0.17968157695223655, + "acc_stderr": 0.010575119964242255 + }, + "harness|winogrande|5": { + "acc": 0.7861089187056038, + "acc_stderr": 0.011524466954090254 + }, + "all": { + "em": 0.001572986577181208, + "em_stderr": 0.00040584511324177333, + "f1": 
0.06143666107382555, + "f1_stderr": 0.0013713061256604275, + "acc": 0.4828952478289202, + "acc_stderr": 0.011049793459166254 + } + }, + "versions": { + "harness|drop|3": 1, + "harness|gsm8k|5": 0, + "harness|winogrande|5": 0, + "all": 0 + }, + "config_tasks": { + "harness|drop": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|drop|3": { + "hashes": { + "hash_examples": "1d27416e8324e9a3", + "hash_full_prompts": "a5513ff9a741b385", + "hash_input_tokens": "a4fb946366902edf", + "hash_cont_tokens": "098b6fc5f60912d5" + }, + "truncated": 0, + "non-truncated": 9536, + "padded": 0, + "non-padded": 9536, + "effective_few_shots": 3.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "6af0ae8cfe684f50", + "hash_cont_tokens": "62ba956675a59044" + }, + "truncated": 0, + "non-truncated": 1319, + "padded": 0, + "non-padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "6bf335f26fed6442", + "hash_cont_tokens": "618558fb93c0f288" + }, + "truncated": 0, + "non-truncated": 2534, + "padded": 2397, + "non-padded": 137, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "9b4d8993161e637d", + "hash_full_prompts": "08215e527b7e60a5", + "hash_input_tokens": "7c6dc027ca91c1a8", + "hash_cont_tokens": "e9b4e60c640503f6" + }, + "total_evaluation_time_secondes": "10262.63999247551", + "truncated": 0, + "non-truncated": 13389, + "padded": 2397, + "non-padded": 10992, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/mistralai/Mistral-7B-v0.1/results_2023-12-01T11-13-53.246042.json b/eval-results/mistralai/Mistral-7B-v0.1/results_2023-12-01T11-13-53.246042.json new file mode 100644 index 0000000000000000000000000000000000000000..2cbfa4320eddf8424d103d2074db9861aa75fe4c --- /dev/null +++ b/eval-results/mistralai/Mistral-7B-v0.1/results_2023-12-01T11-13-53.246042.json @@ -0,0 +1,63 @@ +{ + "config_general": { + "lighteval_sha": "b35d4d84573be82d91c07ea46260f262f72cf69d", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "", + "start_time": 1267831.561133012, + "end_time": 1270736.073971998, + "total_evaluation_time_secondes": "2904.5128389860038", + "model_name": "mistralai/Mistral-7B-v0.1", + "model_sha": "5e9c98b96d071dce59368012254c55b0ec6f8658", + "model_dtype": "torch.float16", + "model_size": "13.99 GB" + }, + "results": { + "harness|gsm8k|5": { + "acc": 0.378316906747536, + "acc_stderr": 0.013358407831777117 + }, + "all": { + "acc": 0.378316906747536, + "acc_stderr": 0.013358407831777117 + } + }, + "versions": { + "all": 0, + "harness|gsm8k|5": 0 + }, + "config_tasks": { + "harness|gsm8k": "LM Harness task" + }, + "summary_tasks": { + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "6af0ae8cfe684f50", + "hash_cont_tokens": "b2ad38ab14b90b20" + }, + "truncated": 0, + "non_truncated": 1319, + "padded": 0, + "non_padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": 
"18b756b7813d1bdf", + "hash_full_prompts": "deb3b1dff10b95aa", + "hash_input_tokens": "f17391d49d33b9c0", + "hash_cont_tokens": "9f4ecba884e8a450" + }, + "truncated": 0, + "non_truncated": 1319, + "padded": 0, + "non_padded": 1319, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/mistralai/Mistral-7B-v0.1/results_2023-12-02T13-01-55.687268.json b/eval-results/mistralai/Mistral-7B-v0.1/results_2023-12-02T13-01-55.687268.json new file mode 100644 index 0000000000000000000000000000000000000000..39cff977cba1ba9e1550bd164959307745868a05 --- /dev/null +++ b/eval-results/mistralai/Mistral-7B-v0.1/results_2023-12-02T13-01-55.687268.json @@ -0,0 +1,63 @@ +{ + "config_general": { + "lighteval_sha": "b35d4d84573be82d91c07ea46260f262f72cf69d", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "", + "start_time": 1399093.463810019, + "end_time": 1402016.750649734, + "total_evaluation_time_secondes": "2923.2868397149723", + "model_name": "mistralai/Mistral-7B-v0.1", + "model_sha": "5e9c98b96d071dce59368012254c55b0ec6f8658", + "model_dtype": "torch.float16", + "model_size": "13.99 GB" + }, + "results": { + "harness|gsm8k|5": { + "acc": 0.378316906747536, + "acc_stderr": 0.013358407831777117 + }, + "all": { + "acc": 0.378316906747536, + "acc_stderr": 0.013358407831777117 + } + }, + "versions": { + "all": 0, + "harness|gsm8k|5": 0 + }, + "config_tasks": { + "harness|gsm8k": "LM Harness task" + }, + "summary_tasks": { + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "6af0ae8cfe684f50", + "hash_cont_tokens": "b2ad38ab14b90b20" + }, + "truncated": 0, + "non_truncated": 1319, + "padded": 0, + "non_padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "18b756b7813d1bdf", + "hash_full_prompts": "deb3b1dff10b95aa", + "hash_input_tokens": "f17391d49d33b9c0", + "hash_cont_tokens": "9f4ecba884e8a450" + }, + "truncated": 0, + "non_truncated": 1319, + "padded": 0, + "non_padded": 1319, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/mistralai/Mistral-7B-v0.1/results_2023-12-02T13-02-14.153054.json b/eval-results/mistralai/Mistral-7B-v0.1/results_2023-12-02T13-02-14.153054.json new file mode 100644 index 0000000000000000000000000000000000000000..0ece058c43ffb81b56d845748b42e64cc23e7b15 --- /dev/null +++ b/eval-results/mistralai/Mistral-7B-v0.1/results_2023-12-02T13-02-14.153054.json @@ -0,0 +1,63 @@ +{ + "config_general": { + "lighteval_sha": "b35d4d84573be82d91c07ea46260f262f72cf69d", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "", + "start_time": 1380551.878295495, + "end_time": 1383496.880915024, + "total_evaluation_time_secondes": "2945.002619529143", + "model_name": "mistralai/Mistral-7B-v0.1", + "model_sha": "5e9c98b96d071dce59368012254c55b0ec6f8658", + "model_dtype": "torch.bfloat16", + "model_size": "13.99 GB" + }, + "results": { + "harness|gsm8k|5": { + "acc": 0.3707354056103108, + "acc_stderr": 0.013304267705458433 + }, + "all": { + "acc": 0.3707354056103108, + "acc_stderr": 0.013304267705458433 + } + }, + "versions": { + "all": 0, + "harness|gsm8k|5": 0 + }, + "config_tasks": { + "harness|gsm8k": "LM Harness task" + }, + "summary_tasks": { + "harness|gsm8k|5": { + "hashes": { + "hash_examples": 
"4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "6af0ae8cfe684f50", + "hash_cont_tokens": "62ba956675a59044" + }, + "truncated": 0, + "non_truncated": 1319, + "padded": 0, + "non_padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "18b756b7813d1bdf", + "hash_full_prompts": "deb3b1dff10b95aa", + "hash_input_tokens": "f17391d49d33b9c0", + "hash_cont_tokens": "8177662553497f90" + }, + "truncated": 0, + "non_truncated": 1319, + "padded": 0, + "non_padded": 1319, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/mistralai/Mixtral-8x7B-Instruct-v0.1/results_2023-12-12T00-47-55.246032.json b/eval-results/mistralai/Mixtral-8x7B-Instruct-v0.1/results_2023-12-12T00-47-55.246032.json new file mode 100644 index 0000000000000000000000000000000000000000..9897881a88e60779dded6ee9e3908b26d041fd03 --- /dev/null +++ b/eval-results/mistralai/Mixtral-8x7B-Instruct-v0.1/results_2023-12-12T00-47-55.246032.json @@ -0,0 +1,1409 @@ +{ + "config_general": { + "lighteval_sha": "0e4607eff593f6f842aeaa0e5fa6760f58b9d1e9", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "", + "start_time": 775608.175886996, + "end_time": 796946.119069259, + "total_evaluation_time_secondes": "21337.943182263058", + "model_name": "mistralai/Mixtral-8x7B-Instruct-v0.1", + "model_sha": "3de0408ae8b591d9ac516a2384925dd98ebc66f4", + "model_dtype": "torch.bfloat16", + "model_size": "87.49 GB" + }, + "results": { + "harness|arc:challenge|25": { + "acc": 0.6689419795221843, + "acc_stderr": 0.013752062419817829, + "acc_norm": 0.7022184300341296, + "acc_norm_stderr": 0.013363080107244482 + }, + "harness|hellaswag|10": { + "acc": 0.6845249950209121, + "acc_stderr": 0.004637550478007367, + "acc_norm": 0.8763194582752439, + "acc_norm_stderr": 0.0032854391911219046 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.41, + "acc_stderr": 0.04943110704237103, + "acc_norm": 0.41, + "acc_norm_stderr": 0.04943110704237103 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.6814814814814815, + "acc_stderr": 0.04024778401977108, + "acc_norm": 0.6814814814814815, + "acc_norm_stderr": 0.04024778401977108 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.7894736842105263, + "acc_stderr": 0.033176727875331574, + "acc_norm": 0.7894736842105263, + "acc_norm_stderr": 0.033176727875331574 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.73, + "acc_stderr": 0.0446196043338474, + "acc_norm": 0.73, + "acc_norm_stderr": 0.0446196043338474 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.7811320754716982, + "acc_stderr": 0.02544786382510861, + "acc_norm": 0.7811320754716982, + "acc_norm_stderr": 0.02544786382510861 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.8125, + "acc_stderr": 0.032639560491693344, + "acc_norm": 0.8125, + "acc_norm_stderr": 0.032639560491693344 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.51, + "acc_stderr": 0.05024183937956912, + "acc_norm": 0.51, + "acc_norm_stderr": 0.05024183937956912 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.66, + "acc_stderr": 0.04760952285695237, + "acc_norm": 0.66, + "acc_norm_stderr": 0.04760952285695237 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.46, + "acc_stderr": 0.05009082659620332, + "acc_norm": 0.46, + "acc_norm_stderr": 0.05009082659620332 + 
}, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.7572254335260116, + "acc_stderr": 0.0326926380614177, + "acc_norm": 0.7572254335260116, + "acc_norm_stderr": 0.0326926380614177 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.4117647058823529, + "acc_stderr": 0.048971049527263666, + "acc_norm": 0.4117647058823529, + "acc_norm_stderr": 0.048971049527263666 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.81, + "acc_stderr": 0.039427724440366234, + "acc_norm": 0.81, + "acc_norm_stderr": 0.039427724440366234 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.6638297872340425, + "acc_stderr": 0.030881618520676942, + "acc_norm": 0.6638297872340425, + "acc_norm_stderr": 0.030881618520676942 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.6052631578947368, + "acc_stderr": 0.04598188057816542, + "acc_norm": 0.6052631578947368, + "acc_norm_stderr": 0.04598188057816542 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.6344827586206897, + "acc_stderr": 0.04013124195424385, + "acc_norm": 0.6344827586206897, + "acc_norm_stderr": 0.04013124195424385 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.48677248677248675, + "acc_stderr": 0.025742297289575142, + "acc_norm": 0.48677248677248675, + "acc_norm_stderr": 0.025742297289575142 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.5158730158730159, + "acc_stderr": 0.044698818540726076, + "acc_norm": 0.5158730158730159, + "acc_norm_stderr": 0.044698818540726076 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.42, + "acc_stderr": 0.049604496374885836, + "acc_norm": 0.42, + "acc_norm_stderr": 0.049604496374885836 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.8516129032258064, + "acc_stderr": 0.020222737554330378, + "acc_norm": 0.8516129032258064, + "acc_norm_stderr": 0.020222737554330378 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.6108374384236454, + "acc_stderr": 0.03430462416103872, + "acc_norm": 0.6108374384236454, + "acc_norm_stderr": 0.03430462416103872 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.78, + "acc_stderr": 0.04163331998932262, + "acc_norm": 0.78, + "acc_norm_stderr": 0.04163331998932262 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.793939393939394, + "acc_stderr": 0.0315841532404771, + "acc_norm": 0.793939393939394, + "acc_norm_stderr": 0.0315841532404771 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.8585858585858586, + "acc_stderr": 0.024825909793343343, + "acc_norm": 0.8585858585858586, + "acc_norm_stderr": 0.024825909793343343 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.9585492227979274, + "acc_stderr": 0.01438543285747646, + "acc_norm": 0.9585492227979274, + "acc_norm_stderr": 0.01438543285747646 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.6948717948717948, + "acc_stderr": 0.023346335293325884, + "acc_norm": 0.6948717948717948, + "acc_norm_stderr": 0.023346335293325884 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.3888888888888889, + "acc_stderr": 0.029723278961476664, + "acc_norm": 0.3888888888888889, + "acc_norm_stderr": 0.029723278961476664 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.8025210084033614, + "acc_stderr": 0.02585916412205145, + "acc_norm": 0.8025210084033614, + "acc_norm_stderr": 0.02585916412205145 + }, + 
"harness|hendrycksTest-high_school_physics|5": { + "acc": 0.46357615894039733, + "acc_stderr": 0.04071636065944215, + "acc_norm": 0.46357615894039733, + "acc_norm_stderr": 0.04071636065944215 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.8807339449541285, + "acc_stderr": 0.01389572929258896, + "acc_norm": 0.8807339449541285, + "acc_norm_stderr": 0.01389572929258896 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.5833333333333334, + "acc_stderr": 0.03362277436608043, + "acc_norm": 0.5833333333333334, + "acc_norm_stderr": 0.03362277436608043 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.8578431372549019, + "acc_stderr": 0.024509803921568617, + "acc_norm": 0.8578431372549019, + "acc_norm_stderr": 0.024509803921568617 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.8523206751054853, + "acc_stderr": 0.023094329582595694, + "acc_norm": 0.8523206751054853, + "acc_norm_stderr": 0.023094329582595694 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.757847533632287, + "acc_stderr": 0.028751392398694755, + "acc_norm": 0.757847533632287, + "acc_norm_stderr": 0.028751392398694755 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.8091603053435115, + "acc_stderr": 0.034465133507525975, + "acc_norm": 0.8091603053435115, + "acc_norm_stderr": 0.034465133507525975 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.8760330578512396, + "acc_stderr": 0.030083098716035202, + "acc_norm": 0.8760330578512396, + "acc_norm_stderr": 0.030083098716035202 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.8240740740740741, + "acc_stderr": 0.036809181416738807, + "acc_norm": 0.8240740740740741, + "acc_norm_stderr": 0.036809181416738807 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.803680981595092, + "acc_stderr": 0.031207970394709218, + "acc_norm": 0.803680981595092, + "acc_norm_stderr": 0.031207970394709218 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.5803571428571429, + "acc_stderr": 0.04684099321077106, + "acc_norm": 0.5803571428571429, + "acc_norm_stderr": 0.04684099321077106 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.8349514563106796, + "acc_stderr": 0.036756688322331886, + "acc_norm": 0.8349514563106796, + "acc_norm_stderr": 0.036756688322331886 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.9273504273504274, + "acc_stderr": 0.017004368568132366, + "acc_norm": 0.9273504273504274, + "acc_norm_stderr": 0.017004368568132366 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.78, + "acc_stderr": 0.04163331998932263, + "acc_norm": 0.78, + "acc_norm_stderr": 0.04163331998932263 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.879948914431673, + "acc_stderr": 0.011622736692041283, + "acc_norm": 0.879948914431673, + "acc_norm_stderr": 0.011622736692041283 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.7774566473988439, + "acc_stderr": 0.02239421566194282, + "acc_norm": 0.7774566473988439, + "acc_norm_stderr": 0.02239421566194282 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.4592178770949721, + "acc_stderr": 0.016666783616525776, + "acc_norm": 0.4592178770949721, + "acc_norm_stderr": 0.016666783616525776 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.8235294117647058, + "acc_stderr": 0.021828596053108395, + "acc_norm": 0.8235294117647058, + "acc_norm_stderr": 0.021828596053108395 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.7909967845659164, + 
"acc_stderr": 0.023093140398374224, + "acc_norm": 0.7909967845659164, + "acc_norm_stderr": 0.023093140398374224 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.8271604938271605, + "acc_stderr": 0.021038517770157365, + "acc_norm": 0.8271604938271605, + "acc_norm_stderr": 0.021038517770157365 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.5425531914893617, + "acc_stderr": 0.029719281272236837, + "acc_norm": 0.5425531914893617, + "acc_norm_stderr": 0.029719281272236837 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.5449804432855281, + "acc_stderr": 0.012718456618701787, + "acc_norm": 0.5449804432855281, + "acc_norm_stderr": 0.012718456618701787 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.7904411764705882, + "acc_stderr": 0.02472311040767708, + "acc_norm": 0.7904411764705882, + "acc_norm_stderr": 0.02472311040767708 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.761437908496732, + "acc_stderr": 0.017242385828779593, + "acc_norm": 0.761437908496732, + "acc_norm_stderr": 0.017242385828779593 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.7090909090909091, + "acc_stderr": 0.04350271442923243, + "acc_norm": 0.7090909090909091, + "acc_norm_stderr": 0.04350271442923243 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.7591836734693878, + "acc_stderr": 0.02737294220178816, + "acc_norm": 0.7591836734693878, + "acc_norm_stderr": 0.02737294220178816 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.8905472636815921, + "acc_stderr": 0.02207632610182466, + "acc_norm": 0.8905472636815921, + "acc_norm_stderr": 0.02207632610182466 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.9, + "acc_stderr": 0.030151134457776334, + "acc_norm": 0.9, + "acc_norm_stderr": 0.030151134457776334 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.5120481927710844, + "acc_stderr": 0.03891364495835816, + "acc_norm": 0.5120481927710844, + "acc_norm_stderr": 0.03891364495835816 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.8830409356725146, + "acc_stderr": 0.02464806896136615, + "acc_norm": 0.8830409356725146, + "acc_norm_stderr": 0.02464806896136615 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.4969400244798042, + "mc1_stderr": 0.01750317326096062, + "mc2": 0.645771015224494, + "mc2_stderr": 0.015124943582646601 + }, + "harness|winogrande|5": { + "acc": 0.813733228097869, + "acc_stderr": 0.010941877955676207 + }, + "harness|gsm8k|5": { + "acc": 0.6072782410917361, + "acc_stderr": 0.013451745349586574 + }, + "all": { + "acc": 0.7104422883926242, + "acc_stderr": 0.03028537694525455, + "acc_norm": 0.7142548989241821, + "acc_norm_stderr": 0.03086903858908146, + "mc1": 0.4969400244798042, + "mc1_stderr": 0.01750317326096062, + "mc2": 0.645771015224494, + "mc2_stderr": 0.015124943582646601 + } + }, + "versions": { + "all": 0, + "harness|arc:challenge|25": 0, + "harness|gsm8k|5": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + 
"harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "harness|winogrande|5": 0 + }, + "config_tasks": { + "harness|arc:challenge": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + 
"harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|arc:challenge|25": { + "hashes": { + "hash_examples": "17b0cae357c0259e", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "9bcd0d1d37471713", + "hash_cont_tokens": "289aa98c400841d8" + }, + "truncated": 0, + "non_truncated": 1172, + "padded": 4670, + "non_padded": 17, + "effective_few_shots": 25.0, + "num_truncated_few_shots": 0 + }, + "harness|hellaswag|10": { + "hashes": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "80b8c6d79740318e", + "hash_cont_tokens": "ac460260c3e6efc9" + }, + "truncated": 0, + "non_truncated": 10042, + "padded": 40101, + "non_padded": 67, + "effective_few_shots": 10.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hashes": { + 
"hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "b813d36287c6556c", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-anatomy|5": { + "hashes": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "09dc2380497f7a47", + "hash_cont_tokens": "a52a4f60d98cbe5c" + }, + "truncated": 0, + "non_truncated": 135, + "padded": 540, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-astronomy|5": { + "hashes": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "68ca3220b0fdd1f3", + "hash_cont_tokens": "10f7d8eeba97841d" + }, + "truncated": 0, + "non_truncated": 152, + "padded": 608, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-business_ethics|5": { + "hashes": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "bd14ef1320de241e", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hashes": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "d96186ab98017c43", + "hash_cont_tokens": "edef9975ba9165b5" + }, + "truncated": 0, + "non_truncated": 265, + "padded": 1060, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_biology|5": { + "hashes": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "424136b34e95b200", + "hash_cont_tokens": "0aa103ec6602280b" + }, + "truncated": 0, + "non_truncated": 144, + "padded": 576, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_chemistry|5": { + "hashes": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "8dd8b80e336bbe54", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_computer_science|5": { + "hashes": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "145d4cef8ca2261d", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_mathematics|5": { + "hashes": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "561995d32d2b25c4", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_medicine|5": { + "hashes": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "6a258a9d4418599c", + "hash_cont_tokens": "1979021dbc698754" 
+ }, + "truncated": 0, + "non_truncated": 173, + "padded": 692, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_physics|5": { + "hashes": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "fa5e0d5b5f97b66a", + "hash_cont_tokens": "7cf7fe2bab00acbd" + }, + "truncated": 0, + "non_truncated": 102, + "padded": 408, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-computer_security|5": { + "hashes": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "07d27397edfae492", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hashes": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "da5e6c3c8eb17da6", + "hash_cont_tokens": "903f64eed2b0d217" + }, + "truncated": 0, + "non_truncated": 235, + "padded": 940, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-econometrics|5": { + "hashes": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "f6ba8e358bdb523e", + "hash_cont_tokens": "721ae6c5302c4bf2" + }, + "truncated": 0, + "non_truncated": 114, + "padded": 456, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hashes": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "b2459da4c5ca8590", + "hash_cont_tokens": "15a738960ed3e587" + }, + "truncated": 0, + "non_truncated": 145, + "padded": 575, + "non_padded": 5, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hashes": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "0b969d9ad706a13a", + "hash_cont_tokens": "c96470462fc71683" + }, + "truncated": 0, + "non_truncated": 378, + "padded": 1512, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-formal_logic|5": { + "hashes": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "02bc3eb5f90da86e", + "hash_cont_tokens": "0e1ce025c9d6ee7e" + }, + "truncated": 0, + "non_truncated": 126, + "padded": 504, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-global_facts|5": { + "hashes": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "3d5106918bcbeb43", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_biology|5": { + "hashes": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "7b089392db2dabbd", + "hash_cont_tokens": "e34d57f7d3c4ca16" + }, + "truncated": 0, + "non_truncated": 310, + "padded": 1240, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + 
"harness|hendrycksTest-high_school_chemistry|5": { + "hashes": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "ba90b2ffed1c067d", + "hash_cont_tokens": "e8482d44df4b3740" + }, + "truncated": 0, + "non_truncated": 203, + "padded": 812, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hashes": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "60eeec309ef0717f", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hashes": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "5e5e8bf3808e0ead", + "hash_cont_tokens": "d63e679a49418339" + }, + "truncated": 0, + "non_truncated": 165, + "padded": 656, + "non_padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_geography|5": { + "hashes": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "4da9b741d4e7ea78", + "hash_cont_tokens": "d78483e286d06f1a" + }, + "truncated": 0, + "non_truncated": 198, + "padded": 792, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hashes": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "acb4bc872ac86ed7", + "hash_cont_tokens": "691cdff71ff5fe57" + }, + "truncated": 0, + "non_truncated": 193, + "padded": 772, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hashes": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "840fc6403eb69ab0", + "hash_cont_tokens": "d5ad4c5bdca967ad" + }, + "truncated": 0, + "non_truncated": 390, + "padded": 1560, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hashes": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "3629a7f2cd17faeb", + "hash_cont_tokens": "8f631ca5687dd0d4" + }, + "truncated": 0, + "non_truncated": 270, + "padded": 1080, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hashes": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "6846f684260e3997", + "hash_cont_tokens": "7321048a28451473" + }, + "truncated": 0, + "non_truncated": 238, + "padded": 952, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_physics|5": { + "hashes": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "85aee25d6bdad94a", + "hash_cont_tokens": "bb137581f269861c" + }, + "truncated": 0, + "non_truncated": 151, + "padded": 604, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hashes": { + 
"hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "290b66d6d666a35f", + "hash_cont_tokens": "b455cab2675bd863" + }, + "truncated": 0, + "non_truncated": 545, + "padded": 2180, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hashes": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "a77a7668b437bc82", + "hash_cont_tokens": "1b3196fec7e58037" + }, + "truncated": 0, + "non_truncated": 216, + "padded": 864, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hashes": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "63548c7fa9ba7a78", + "hash_cont_tokens": "a331dedc2aa01b3e" + }, + "truncated": 0, + "non_truncated": 204, + "padded": 816, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hashes": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "83c5da18bfa50812", + "hash_cont_tokens": "d0fbe030b8c8c2bf" + }, + "truncated": 0, + "non_truncated": 237, + "padded": 948, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_aging|5": { + "hashes": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "bebbd11f22006685", + "hash_cont_tokens": "1dd29c3755494850" + }, + "truncated": 0, + "non_truncated": 223, + "padded": 892, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_sexuality|5": { + "hashes": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "7b85ee9b8ee54f4f", + "hash_cont_tokens": "c85573f663c10691" + }, + "truncated": 0, + "non_truncated": 131, + "padded": 524, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-international_law|5": { + "hashes": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "7bfc55ab7065943e", + "hash_cont_tokens": "d263804ba918154f" + }, + "truncated": 0, + "non_truncated": 121, + "padded": 484, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-jurisprudence|5": { + "hashes": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "69573f1675e053c6", + "hash_cont_tokens": "581986691a84ece8" + }, + "truncated": 0, + "non_truncated": 108, + "padded": 432, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hashes": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "552324ef20094bdc", + "hash_cont_tokens": "55a858b28bbda458" + }, + "truncated": 0, + "non_truncated": 163, + "padded": 652, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-machine_learning|5": { + "hashes": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "96449357a7318905", + "hash_cont_tokens": 
"e99d3d3efd4ac7a3" + }, + "truncated": 0, + "non_truncated": 112, + "padded": 448, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-management|5": { + "hashes": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "3b849249168e3b88", + "hash_cont_tokens": "13d9dc56bca34726" + }, + "truncated": 0, + "non_truncated": 103, + "padded": 412, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-marketing|5": { + "hashes": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "af0e186f2756b70d", + "hash_cont_tokens": "2700ea26933916a2" + }, + "truncated": 0, + "non_truncated": 234, + "padded": 936, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-medical_genetics|5": { + "hashes": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "9f6a6de16509b6d9", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-miscellaneous|5": { + "hashes": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "9194406d589f7c10", + "hash_cont_tokens": "7bf4341c79587250" + }, + "truncated": 0, + "non_truncated": 783, + "padded": 3132, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_disputes|5": { + "hashes": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "769486efc74d9f8e", + "hash_cont_tokens": "38a48e9de6976f00" + }, + "truncated": 0, + "non_truncated": 346, + "padded": 1384, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hashes": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "a90fd4dd90959dad", + "hash_cont_tokens": "761c4dc187689d89" + }, + "truncated": 0, + "non_truncated": 895, + "padded": 3580, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-nutrition|5": { + "hashes": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "1a3b843e66efd29b", + "hash_cont_tokens": "65005bd7d6f6012a" + }, + "truncated": 0, + "non_truncated": 306, + "padded": 1224, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-philosophy|5": { + "hashes": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "09820001a3d00013", + "hash_cont_tokens": "0b47934fb6314dec" + }, + "truncated": 0, + "non_truncated": 311, + "padded": 1244, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-prehistory|5": { + "hashes": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "7c4ec364ce2768c7", + "hash_cont_tokens": "3f20acd855ee0a29" + }, + "truncated": 0, + "non_truncated": 324, + "padded": 1296, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + 
"harness|hendrycksTest-professional_accounting|5": { + "hashes": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "ced0534574d0ae3f", + "hash_cont_tokens": "8f122ba881355d4b" + }, + "truncated": 0, + "non_truncated": 282, + "padded": 1128, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_law|5": { + "hashes": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "bcbdbbde22ec73e3", + "hash_cont_tokens": "90d5df417c4d3fd3" + }, + "truncated": 0, + "non_truncated": 1534, + "padded": 6136, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_medicine|5": { + "hashes": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "c54d753563114d45", + "hash_cont_tokens": "4a2d2988884f7f70" + }, + "truncated": 0, + "non_truncated": 272, + "padded": 1088, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_psychology|5": { + "hashes": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "b75dc55c0e32fa52", + "hash_cont_tokens": "e0a952cb8a9c81de" + }, + "truncated": 0, + "non_truncated": 612, + "padded": 2448, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-public_relations|5": { + "hashes": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "5ccdc8ec8db99622", + "hash_cont_tokens": "1fa77a8dff3922b8" + }, + "truncated": 0, + "non_truncated": 110, + "padded": 440, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-security_studies|5": { + "hashes": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "ca8497342e5b1d57", + "hash_cont_tokens": "81fc9cb3cbdd52db" + }, + "truncated": 0, + "non_truncated": 245, + "padded": 980, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-sociology|5": { + "hashes": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "069c76424fbd3dab", + "hash_cont_tokens": "2a0493252ed2cf43" + }, + "truncated": 0, + "non_truncated": 201, + "padded": 804, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hashes": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "a7e393a626169576", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-virology|5": { + "hashes": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "bf99dc973e3a650d", + "hash_cont_tokens": "5ab892d003b00c98" + }, + "truncated": 0, + "non_truncated": 166, + "padded": 664, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-world_religions|5": { + "hashes": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + 
"hash_input_tokens": "1761cfaf21797065", + "hash_cont_tokens": "15a5e5dbdfbb8568" + }, + "truncated": 0, + "non_truncated": 171, + "padded": 684, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|truthfulqa:mc|0": { + "hashes": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "298b43914bbdf4ca", + "hash_cont_tokens": "5a8d4bb398b1c3c0" + }, + "truncated": 0, + "non_truncated": 817, + "padded": 9996, + "non_padded": 0, + "effective_few_shots": 0.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "31aa3477d959f771", + "hash_cont_tokens": "618558fb93c0f288" + }, + "truncated": 0, + "non_truncated": 1267, + "padded": 2534, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "6af0ae8cfe684f50", + "hash_cont_tokens": "82c39129f1e34f5b" + }, + "truncated": 0, + "non_truncated": 1319, + "padded": 0, + "non_padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "3b7fa57a057f9415", + "hash_full_prompts": "63615fc50fc9417c", + "hash_input_tokens": "9c04e828ae29cacc", + "hash_cont_tokens": "930adbc4ff0d5c0e" + }, + "truncated": 0, + "non_truncated": 28659, + "padded": 113460, + "non_padded": 1412, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/mistralai/Mixtral-8x7B-v0.1/results_2023-12-11T18-04-02.035270.json b/eval-results/mistralai/Mixtral-8x7B-v0.1/results_2023-12-11T18-04-02.035270.json new file mode 100644 index 0000000000000000000000000000000000000000..648188343a948179fbef71425c00bb3188387a6f --- /dev/null +++ b/eval-results/mistralai/Mixtral-8x7B-v0.1/results_2023-12-11T18-04-02.035270.json @@ -0,0 +1,1409 @@ +{ + "config_general": { + "lighteval_sha": "0e4607eff593f6f842aeaa0e5fa6760f58b9d1e9", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "", + "start_time": 751888.700743949, + "end_time": 772712.911735817, + "total_evaluation_time_secondes": "20824.210991867934", + "model_name": "mistralai/Mixtral-8x7B-v0.1", + "model_sha": "4dd4b0f2d577d7b74152732d5543a92201481fe2", + "model_dtype": "torch.bfloat16", + "model_size": "87.49 GB" + }, + "results": { + "harness|arc:challenge|25": { + "acc": 0.6313993174061433, + "acc_stderr": 0.014097810678042194, + "acc_norm": 0.6604095563139932, + "acc_norm_stderr": 0.01383903976282017 + }, + "harness|hellaswag|10": { + "acc": 0.6694881497709619, + "acc_stderr": 0.004694360968929404, + "acc_norm": 0.8648675562636925, + "acc_norm_stderr": 0.0034116630716511075 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.35, + "acc_stderr": 0.0479372485441102, + "acc_norm": 0.35, + "acc_norm_stderr": 0.0479372485441102 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.6814814814814815, + "acc_stderr": 0.04024778401977109, + "acc_norm": 0.6814814814814815, + "acc_norm_stderr": 0.04024778401977109 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.8355263157894737, + "acc_stderr": 0.03016753346863271, + "acc_norm": 0.8355263157894737, + "acc_norm_stderr": 0.03016753346863271 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.75, + 
"acc_stderr": 0.04351941398892446, + "acc_norm": 0.75, + "acc_norm_stderr": 0.04351941398892446 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.7849056603773585, + "acc_stderr": 0.02528839450289137, + "acc_norm": 0.7849056603773585, + "acc_norm_stderr": 0.02528839450289137 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.8402777777777778, + "acc_stderr": 0.030635578972093274, + "acc_norm": 0.8402777777777778, + "acc_norm_stderr": 0.030635578972093274 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.53, + "acc_stderr": 0.05016135580465919, + "acc_norm": 0.53, + "acc_norm_stderr": 0.05016135580465919 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.62, + "acc_stderr": 0.04878317312145632, + "acc_norm": 0.62, + "acc_norm_stderr": 0.04878317312145632 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.49, + "acc_stderr": 0.05024183937956912, + "acc_norm": 0.49, + "acc_norm_stderr": 0.05024183937956912 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.7109826589595376, + "acc_stderr": 0.03456425745086999, + "acc_norm": 0.7109826589595376, + "acc_norm_stderr": 0.03456425745086999 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.47058823529411764, + "acc_stderr": 0.049665709039785295, + "acc_norm": 0.47058823529411764, + "acc_norm_stderr": 0.049665709039785295 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.82, + "acc_stderr": 0.03861229196653695, + "acc_norm": 0.82, + "acc_norm_stderr": 0.03861229196653695 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.6723404255319149, + "acc_stderr": 0.030683020843231008, + "acc_norm": 0.6723404255319149, + "acc_norm_stderr": 0.030683020843231008 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.6578947368421053, + "acc_stderr": 0.044629175353369376, + "acc_norm": 0.6578947368421053, + "acc_norm_stderr": 0.044629175353369376 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.6689655172413793, + "acc_stderr": 0.03921545312467122, + "acc_norm": 0.6689655172413793, + "acc_norm_stderr": 0.03921545312467122 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.4708994708994709, + "acc_stderr": 0.025707658614154954, + "acc_norm": 0.4708994708994709, + "acc_norm_stderr": 0.025707658614154954 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.5634920634920635, + "acc_stderr": 0.04435932892851466, + "acc_norm": 0.5634920634920635, + "acc_norm_stderr": 0.04435932892851466 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.5, + "acc_stderr": 0.050251890762960605, + "acc_norm": 0.5, + "acc_norm_stderr": 0.050251890762960605 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.8354838709677419, + "acc_stderr": 0.021090847745939313, + "acc_norm": 0.8354838709677419, + "acc_norm_stderr": 0.021090847745939313 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.6354679802955665, + "acc_stderr": 0.0338640574606209, + "acc_norm": 0.6354679802955665, + "acc_norm_stderr": 0.0338640574606209 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.7, + "acc_stderr": 0.046056618647183814, + "acc_norm": 0.7, + "acc_norm_stderr": 0.046056618647183814 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.8242424242424242, + "acc_stderr": 0.02972094300622445, + "acc_norm": 0.8242424242424242, + "acc_norm_stderr": 0.02972094300622445 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 
0.8636363636363636, + "acc_stderr": 0.024450155973189835, + "acc_norm": 0.8636363636363636, + "acc_norm_stderr": 0.024450155973189835 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.9430051813471503, + "acc_stderr": 0.01673108529360755, + "acc_norm": 0.9430051813471503, + "acc_norm_stderr": 0.01673108529360755 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.7128205128205128, + "acc_stderr": 0.022939925418530613, + "acc_norm": 0.7128205128205128, + "acc_norm_stderr": 0.022939925418530613 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.3888888888888889, + "acc_stderr": 0.029723278961476664, + "acc_norm": 0.3888888888888889, + "acc_norm_stderr": 0.029723278961476664 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.7983193277310925, + "acc_stderr": 0.026064313406304527, + "acc_norm": 0.7983193277310925, + "acc_norm_stderr": 0.026064313406304527 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.5033112582781457, + "acc_stderr": 0.04082393379449654, + "acc_norm": 0.5033112582781457, + "acc_norm_stderr": 0.04082393379449654 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.8862385321100917, + "acc_stderr": 0.013613614800232812, + "acc_norm": 0.8862385321100917, + "acc_norm_stderr": 0.013613614800232812 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.6296296296296297, + "acc_stderr": 0.03293377139415191, + "acc_norm": 0.6296296296296297, + "acc_norm_stderr": 0.03293377139415191 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.8480392156862745, + "acc_stderr": 0.025195658428931792, + "acc_norm": 0.8480392156862745, + "acc_norm_stderr": 0.025195658428931792 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.8945147679324894, + "acc_stderr": 0.019995560723758545, + "acc_norm": 0.8945147679324894, + "acc_norm_stderr": 0.019995560723758545 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.7757847533632287, + "acc_stderr": 0.027991534258519517, + "acc_norm": 0.7757847533632287, + "acc_norm_stderr": 0.027991534258519517 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.816793893129771, + "acc_stderr": 0.03392770926494732, + "acc_norm": 0.816793893129771, + "acc_norm_stderr": 0.03392770926494732 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.8677685950413223, + "acc_stderr": 0.030922788320445784, + "acc_norm": 0.8677685950413223, + "acc_norm_stderr": 0.030922788320445784 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.8240740740740741, + "acc_stderr": 0.036809181416738807, + "acc_norm": 0.8240740740740741, + "acc_norm_stderr": 0.036809181416738807 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.7668711656441718, + "acc_stderr": 0.0332201579577674, + "acc_norm": 0.7668711656441718, + "acc_norm_stderr": 0.0332201579577674 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.5625, + "acc_stderr": 0.04708567521880525, + "acc_norm": 0.5625, + "acc_norm_stderr": 0.04708567521880525 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.883495145631068, + "acc_stderr": 0.03176683948640407, + "acc_norm": 0.883495145631068, + "acc_norm_stderr": 0.03176683948640407 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.9188034188034188, + "acc_stderr": 0.017893784904018533, + "acc_norm": 0.9188034188034188, + "acc_norm_stderr": 0.017893784904018533 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.78, + 
"acc_stderr": 0.041633319989322626, + "acc_norm": 0.78, + "acc_norm_stderr": 0.041633319989322626 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.8722860791826309, + "acc_stderr": 0.011935626313999878, + "acc_norm": 0.8722860791826309, + "acc_norm_stderr": 0.011935626313999878 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.7976878612716763, + "acc_stderr": 0.021628077380196124, + "acc_norm": 0.7976878612716763, + "acc_norm_stderr": 0.021628077380196124 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.39888268156424583, + "acc_stderr": 0.016376966142610073, + "acc_norm": 0.39888268156424583, + "acc_norm_stderr": 0.016376966142610073 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.8202614379084967, + "acc_stderr": 0.02198603218206415, + "acc_norm": 0.8202614379084967, + "acc_norm_stderr": 0.02198603218206415 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.7942122186495176, + "acc_stderr": 0.022961339906764244, + "acc_norm": 0.7942122186495176, + "acc_norm_stderr": 0.022961339906764244 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.8425925925925926, + "acc_stderr": 0.020263764996385714, + "acc_norm": 0.8425925925925926, + "acc_norm_stderr": 0.020263764996385714 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.5212765957446809, + "acc_stderr": 0.029800481645628693, + "acc_norm": 0.5212765957446809, + "acc_norm_stderr": 0.029800481645628693 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.5371577574967406, + "acc_stderr": 0.01273492357953206, + "acc_norm": 0.5371577574967406, + "acc_norm_stderr": 0.01273492357953206 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.8014705882352942, + "acc_stderr": 0.024231013370541083, + "acc_norm": 0.8014705882352942, + "acc_norm_stderr": 0.024231013370541083 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.7843137254901961, + "acc_stderr": 0.016639319350313264, + "acc_norm": 0.7843137254901961, + "acc_norm_stderr": 0.016639319350313264 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.7, + "acc_stderr": 0.04389311454644287, + "acc_norm": 0.7, + "acc_norm_stderr": 0.04389311454644287 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.7836734693877551, + "acc_stderr": 0.026358916334904014, + "acc_norm": 0.7836734693877551, + "acc_norm_stderr": 0.026358916334904014 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.8905472636815921, + "acc_stderr": 0.022076326101824657, + "acc_norm": 0.8905472636815921, + "acc_norm_stderr": 0.022076326101824657 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.93, + "acc_stderr": 0.0256432399976243, + "acc_norm": 0.93, + "acc_norm_stderr": 0.0256432399976243 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.5120481927710844, + "acc_stderr": 0.03891364495835816, + "acc_norm": 0.5120481927710844, + "acc_norm_stderr": 0.03891364495835816 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.8713450292397661, + "acc_stderr": 0.02567934272327692, + "acc_norm": 0.8713450292397661, + "acc_norm_stderr": 0.02567934272327692 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.3157894736842105, + "mc1_stderr": 0.016272287957916926, + "mc2": 0.46783056023586933, + "mc2_stderr": 0.014139800140253312 + }, + "harness|winogrande|5": { + "acc": 0.819258089976322, + "acc_stderr": 0.010814911009613975 + }, + "harness|gsm8k|5": { + "acc": 0.574677786201668, + "acc_stderr": 0.013618006363084792 + }, + "all": { + "acc": 0.715239707860844, + 
"acc_stderr": 0.03005693577554029, + "acc_norm": 0.7196623042158319, + "acc_norm_stderr": 0.03063557112072472, + "mc1": 0.3157894736842105, + "mc1_stderr": 0.016272287957916926, + "mc2": 0.46783056023586933, + "mc2_stderr": 0.014139800140253312 + } + }, + "versions": { + "all": 0, + "harness|arc:challenge|25": 0, + "harness|gsm8k|5": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "harness|winogrande|5": 0 + }, + "config_tasks": { + "harness|arc:challenge": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness 
task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + 
"harness|truthfulqa:mc": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|arc:challenge|25": { + "hashes": { + "hash_examples": "17b0cae357c0259e", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "9bcd0d1d37471713", + "hash_cont_tokens": "289aa98c400841d8" + }, + "truncated": 0, + "non_truncated": 1172, + "padded": 4670, + "non_padded": 17, + "effective_few_shots": 25.0, + "num_truncated_few_shots": 0 + }, + "harness|hellaswag|10": { + "hashes": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "80b8c6d79740318e", + "hash_cont_tokens": "ac460260c3e6efc9" + }, + "truncated": 0, + "non_truncated": 10042, + "padded": 40101, + "non_padded": 67, + "effective_few_shots": 10.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hashes": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "b813d36287c6556c", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-anatomy|5": { + "hashes": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "09dc2380497f7a47", + "hash_cont_tokens": "a52a4f60d98cbe5c" + }, + "truncated": 0, + "non_truncated": 135, + "padded": 540, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-astronomy|5": { + "hashes": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "68ca3220b0fdd1f3", + "hash_cont_tokens": "10f7d8eeba97841d" + }, + "truncated": 0, + "non_truncated": 152, + "padded": 608, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-business_ethics|5": { + "hashes": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "bd14ef1320de241e", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hashes": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "d96186ab98017c43", + "hash_cont_tokens": "edef9975ba9165b5" + }, + "truncated": 0, + "non_truncated": 265, + "padded": 1060, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_biology|5": { + "hashes": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "424136b34e95b200", + "hash_cont_tokens": "0aa103ec6602280b" + }, + "truncated": 0, + "non_truncated": 144, + "padded": 576, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_chemistry|5": { + "hashes": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "8dd8b80e336bbe54", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_computer_science|5": { + "hashes": { + "hash_examples": 
"30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "145d4cef8ca2261d", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_mathematics|5": { + "hashes": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "561995d32d2b25c4", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_medicine|5": { + "hashes": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "6a258a9d4418599c", + "hash_cont_tokens": "1979021dbc698754" + }, + "truncated": 0, + "non_truncated": 173, + "padded": 692, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_physics|5": { + "hashes": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "fa5e0d5b5f97b66a", + "hash_cont_tokens": "7cf7fe2bab00acbd" + }, + "truncated": 0, + "non_truncated": 102, + "padded": 408, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-computer_security|5": { + "hashes": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "07d27397edfae492", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hashes": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "da5e6c3c8eb17da6", + "hash_cont_tokens": "903f64eed2b0d217" + }, + "truncated": 0, + "non_truncated": 235, + "padded": 940, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-econometrics|5": { + "hashes": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "f6ba8e358bdb523e", + "hash_cont_tokens": "721ae6c5302c4bf2" + }, + "truncated": 0, + "non_truncated": 114, + "padded": 456, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hashes": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "b2459da4c5ca8590", + "hash_cont_tokens": "15a738960ed3e587" + }, + "truncated": 0, + "non_truncated": 145, + "padded": 575, + "non_padded": 5, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hashes": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "0b969d9ad706a13a", + "hash_cont_tokens": "c96470462fc71683" + }, + "truncated": 0, + "non_truncated": 378, + "padded": 1512, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-formal_logic|5": { + "hashes": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "02bc3eb5f90da86e", + "hash_cont_tokens": "0e1ce025c9d6ee7e" + }, 
+ "truncated": 0, + "non_truncated": 126, + "padded": 504, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-global_facts|5": { + "hashes": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "3d5106918bcbeb43", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_biology|5": { + "hashes": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "7b089392db2dabbd", + "hash_cont_tokens": "e34d57f7d3c4ca16" + }, + "truncated": 0, + "non_truncated": 310, + "padded": 1240, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hashes": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "ba90b2ffed1c067d", + "hash_cont_tokens": "e8482d44df4b3740" + }, + "truncated": 0, + "non_truncated": 203, + "padded": 812, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hashes": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "60eeec309ef0717f", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hashes": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "5e5e8bf3808e0ead", + "hash_cont_tokens": "d63e679a49418339" + }, + "truncated": 0, + "non_truncated": 165, + "padded": 656, + "non_padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_geography|5": { + "hashes": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "4da9b741d4e7ea78", + "hash_cont_tokens": "d78483e286d06f1a" + }, + "truncated": 0, + "non_truncated": 198, + "padded": 792, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hashes": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "acb4bc872ac86ed7", + "hash_cont_tokens": "691cdff71ff5fe57" + }, + "truncated": 0, + "non_truncated": 193, + "padded": 772, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hashes": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "840fc6403eb69ab0", + "hash_cont_tokens": "d5ad4c5bdca967ad" + }, + "truncated": 0, + "non_truncated": 390, + "padded": 1560, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hashes": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "3629a7f2cd17faeb", + "hash_cont_tokens": "8f631ca5687dd0d4" + }, + "truncated": 0, + "non_truncated": 270, + "padded": 1080, + "non_padded": 0, + 
"effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hashes": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "6846f684260e3997", + "hash_cont_tokens": "7321048a28451473" + }, + "truncated": 0, + "non_truncated": 238, + "padded": 952, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_physics|5": { + "hashes": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "85aee25d6bdad94a", + "hash_cont_tokens": "bb137581f269861c" + }, + "truncated": 0, + "non_truncated": 151, + "padded": 604, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hashes": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "290b66d6d666a35f", + "hash_cont_tokens": "b455cab2675bd863" + }, + "truncated": 0, + "non_truncated": 545, + "padded": 2180, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hashes": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "a77a7668b437bc82", + "hash_cont_tokens": "1b3196fec7e58037" + }, + "truncated": 0, + "non_truncated": 216, + "padded": 864, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hashes": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "63548c7fa9ba7a78", + "hash_cont_tokens": "a331dedc2aa01b3e" + }, + "truncated": 0, + "non_truncated": 204, + "padded": 816, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hashes": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "83c5da18bfa50812", + "hash_cont_tokens": "d0fbe030b8c8c2bf" + }, + "truncated": 0, + "non_truncated": 237, + "padded": 948, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_aging|5": { + "hashes": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "bebbd11f22006685", + "hash_cont_tokens": "1dd29c3755494850" + }, + "truncated": 0, + "non_truncated": 223, + "padded": 892, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_sexuality|5": { + "hashes": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "7b85ee9b8ee54f4f", + "hash_cont_tokens": "c85573f663c10691" + }, + "truncated": 0, + "non_truncated": 131, + "padded": 524, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-international_law|5": { + "hashes": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "7bfc55ab7065943e", + "hash_cont_tokens": "d263804ba918154f" + }, + "truncated": 0, + "non_truncated": 121, + "padded": 484, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-jurisprudence|5": { + "hashes": { + 
"hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "69573f1675e053c6", + "hash_cont_tokens": "581986691a84ece8" + }, + "truncated": 0, + "non_truncated": 108, + "padded": 432, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hashes": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "552324ef20094bdc", + "hash_cont_tokens": "55a858b28bbda458" + }, + "truncated": 0, + "non_truncated": 163, + "padded": 652, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-machine_learning|5": { + "hashes": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "96449357a7318905", + "hash_cont_tokens": "e99d3d3efd4ac7a3" + }, + "truncated": 0, + "non_truncated": 112, + "padded": 448, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-management|5": { + "hashes": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "3b849249168e3b88", + "hash_cont_tokens": "13d9dc56bca34726" + }, + "truncated": 0, + "non_truncated": 103, + "padded": 412, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-marketing|5": { + "hashes": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "af0e186f2756b70d", + "hash_cont_tokens": "2700ea26933916a2" + }, + "truncated": 0, + "non_truncated": 234, + "padded": 936, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-medical_genetics|5": { + "hashes": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "9f6a6de16509b6d9", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-miscellaneous|5": { + "hashes": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "9194406d589f7c10", + "hash_cont_tokens": "7bf4341c79587250" + }, + "truncated": 0, + "non_truncated": 783, + "padded": 3132, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_disputes|5": { + "hashes": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "769486efc74d9f8e", + "hash_cont_tokens": "38a48e9de6976f00" + }, + "truncated": 0, + "non_truncated": 346, + "padded": 1384, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hashes": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "a90fd4dd90959dad", + "hash_cont_tokens": "761c4dc187689d89" + }, + "truncated": 0, + "non_truncated": 895, + "padded": 3580, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-nutrition|5": { + "hashes": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "1a3b843e66efd29b", + "hash_cont_tokens": "65005bd7d6f6012a" + }, + "truncated": 
0, + "non_truncated": 306, + "padded": 1224, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-philosophy|5": { + "hashes": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "09820001a3d00013", + "hash_cont_tokens": "0b47934fb6314dec" + }, + "truncated": 0, + "non_truncated": 311, + "padded": 1244, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-prehistory|5": { + "hashes": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "7c4ec364ce2768c7", + "hash_cont_tokens": "3f20acd855ee0a29" + }, + "truncated": 0, + "non_truncated": 324, + "padded": 1296, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_accounting|5": { + "hashes": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "ced0534574d0ae3f", + "hash_cont_tokens": "8f122ba881355d4b" + }, + "truncated": 0, + "non_truncated": 282, + "padded": 1128, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_law|5": { + "hashes": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "bcbdbbde22ec73e3", + "hash_cont_tokens": "90d5df417c4d3fd3" + }, + "truncated": 0, + "non_truncated": 1534, + "padded": 6136, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_medicine|5": { + "hashes": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "c54d753563114d45", + "hash_cont_tokens": "4a2d2988884f7f70" + }, + "truncated": 0, + "non_truncated": 272, + "padded": 1088, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_psychology|5": { + "hashes": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "b75dc55c0e32fa52", + "hash_cont_tokens": "e0a952cb8a9c81de" + }, + "truncated": 0, + "non_truncated": 612, + "padded": 2448, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-public_relations|5": { + "hashes": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "5ccdc8ec8db99622", + "hash_cont_tokens": "1fa77a8dff3922b8" + }, + "truncated": 0, + "non_truncated": 110, + "padded": 440, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-security_studies|5": { + "hashes": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "ca8497342e5b1d57", + "hash_cont_tokens": "81fc9cb3cbdd52db" + }, + "truncated": 0, + "non_truncated": 245, + "padded": 980, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-sociology|5": { + "hashes": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "069c76424fbd3dab", + "hash_cont_tokens": "2a0493252ed2cf43" + }, + "truncated": 0, + "non_truncated": 201, + "padded": 804, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + 
"harness|hendrycksTest-us_foreign_policy|5": { + "hashes": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "a7e393a626169576", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-virology|5": { + "hashes": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "bf99dc973e3a650d", + "hash_cont_tokens": "5ab892d003b00c98" + }, + "truncated": 0, + "non_truncated": 166, + "padded": 664, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-world_religions|5": { + "hashes": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "1761cfaf21797065", + "hash_cont_tokens": "15a5e5dbdfbb8568" + }, + "truncated": 0, + "non_truncated": 171, + "padded": 684, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|truthfulqa:mc|0": { + "hashes": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "298b43914bbdf4ca", + "hash_cont_tokens": "5a8d4bb398b1c3c0" + }, + "truncated": 0, + "non_truncated": 817, + "padded": 9996, + "non_padded": 0, + "effective_few_shots": 0.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "31aa3477d959f771", + "hash_cont_tokens": "618558fb93c0f288" + }, + "truncated": 0, + "non_truncated": 1267, + "padded": 2534, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "6af0ae8cfe684f50", + "hash_cont_tokens": "4ca3b5ab308674e1" + }, + "truncated": 0, + "non_truncated": 1319, + "padded": 0, + "non_padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "3b7fa57a057f9415", + "hash_full_prompts": "63615fc50fc9417c", + "hash_input_tokens": "9c04e828ae29cacc", + "hash_cont_tokens": "f0cdc3a5e56132ae" + }, + "truncated": 0, + "non_truncated": 28659, + "padded": 113460, + "non_padded": 1412, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/mistralai/Mixtral-8x7B-v0.1/results_2023-12-15T14-35-04.630519.json b/eval-results/mistralai/Mixtral-8x7B-v0.1/results_2023-12-15T14-35-04.630519.json new file mode 100644 index 0000000000000000000000000000000000000000..2c54cecb9757ee06222394681e79115301319161 --- /dev/null +++ b/eval-results/mistralai/Mixtral-8x7B-v0.1/results_2023-12-15T14-35-04.630519.json @@ -0,0 +1,1409 @@ +{ + "config_general": { + "lighteval_sha": "0e4607eff593f6f842aeaa0e5fa6760f58b9d1e9", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "", + "start_time": 267640.500646981, + "end_time": 281861.689964623, + "total_evaluation_time_secondes": "14221.189317642013", + "model_name": "mistralai/Mixtral-8x7B-v0.1", + "model_sha": "c2b2ae2f1f9532c7c50045bc57d643f46acf8d30", + "model_dtype": "4bit", + "model_size": "22.61 GB" + }, + "results": { + "harness|arc:challenge|25": { + "acc": 0.6109215017064846, + "acc_stderr": 
0.014247309976045607, + "acc_norm": 0.643344709897611, + "acc_norm_stderr": 0.01399805690262019 + }, + "harness|hellaswag|10": { + "acc": 0.6548496315475005, + "acc_stderr": 0.004744456628455121, + "acc_norm": 0.8570005974905397, + "acc_norm_stderr": 0.00349356791409329 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.4, + "acc_stderr": 0.04923659639173309, + "acc_norm": 0.4, + "acc_norm_stderr": 0.04923659639173309 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.674074074074074, + "acc_stderr": 0.040491220417025055, + "acc_norm": 0.674074074074074, + "acc_norm_stderr": 0.040491220417025055 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.8026315789473685, + "acc_stderr": 0.03238981601699397, + "acc_norm": 0.8026315789473685, + "acc_norm_stderr": 0.03238981601699397 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.67, + "acc_stderr": 0.04725815626252609, + "acc_norm": 0.67, + "acc_norm_stderr": 0.04725815626252609 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.769811320754717, + "acc_stderr": 0.025907897122408173, + "acc_norm": 0.769811320754717, + "acc_norm_stderr": 0.025907897122408173 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.8472222222222222, + "acc_stderr": 0.030085743248565666, + "acc_norm": 0.8472222222222222, + "acc_norm_stderr": 0.030085743248565666 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.54, + "acc_stderr": 0.05009082659620332, + "acc_norm": 0.54, + "acc_norm_stderr": 0.05009082659620332 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.55, + "acc_stderr": 0.049999999999999996, + "acc_norm": 0.55, + "acc_norm_stderr": 0.049999999999999996 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.46, + "acc_stderr": 0.05009082659620332, + "acc_norm": 0.46, + "acc_norm_stderr": 0.05009082659620332 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.6878612716763006, + "acc_stderr": 0.03533133389323657, + "acc_norm": 0.6878612716763006, + "acc_norm_stderr": 0.03533133389323657 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.47058823529411764, + "acc_stderr": 0.049665709039785295, + "acc_norm": 0.47058823529411764, + "acc_norm_stderr": 0.049665709039785295 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.79, + "acc_stderr": 0.04093601807403326, + "acc_norm": 0.79, + "acc_norm_stderr": 0.04093601807403326 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.6680851063829787, + "acc_stderr": 0.030783736757745647, + "acc_norm": 0.6680851063829787, + "acc_norm_stderr": 0.030783736757745647 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.6228070175438597, + "acc_stderr": 0.04559522141958216, + "acc_norm": 0.6228070175438597, + "acc_norm_stderr": 0.04559522141958216 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.6413793103448275, + "acc_stderr": 0.039966295748767186, + "acc_norm": 0.6413793103448275, + "acc_norm_stderr": 0.039966295748767186 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.4417989417989418, + "acc_stderr": 0.025576257061253833, + "acc_norm": 0.4417989417989418, + "acc_norm_stderr": 0.025576257061253833 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.5079365079365079, + "acc_stderr": 0.044715725362943486, + "acc_norm": 0.5079365079365079, + "acc_norm_stderr": 0.044715725362943486 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.45, + "acc_stderr": 0.049999999999999996, + "acc_norm": 0.45, + 
"acc_norm_stderr": 0.049999999999999996 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.8290322580645161, + "acc_stderr": 0.021417242936321582, + "acc_norm": 0.8290322580645161, + "acc_norm_stderr": 0.021417242936321582 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.6157635467980296, + "acc_stderr": 0.034223985656575515, + "acc_norm": 0.6157635467980296, + "acc_norm_stderr": 0.034223985656575515 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.71, + "acc_stderr": 0.045604802157206845, + "acc_norm": 0.71, + "acc_norm_stderr": 0.045604802157206845 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.8121212121212121, + "acc_stderr": 0.03050193405942914, + "acc_norm": 0.8121212121212121, + "acc_norm_stderr": 0.03050193405942914 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.8484848484848485, + "acc_stderr": 0.025545650426603613, + "acc_norm": 0.8484848484848485, + "acc_norm_stderr": 0.025545650426603613 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.917098445595855, + "acc_stderr": 0.01989934131572178, + "acc_norm": 0.917098445595855, + "acc_norm_stderr": 0.01989934131572178 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.7051282051282052, + "acc_stderr": 0.023119362758232304, + "acc_norm": 0.7051282051282052, + "acc_norm_stderr": 0.023119362758232304 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.35185185185185186, + "acc_stderr": 0.02911661760608301, + "acc_norm": 0.35185185185185186, + "acc_norm_stderr": 0.02911661760608301 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.7605042016806722, + "acc_stderr": 0.027722065493361252, + "acc_norm": 0.7605042016806722, + "acc_norm_stderr": 0.027722065493361252 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.5165562913907285, + "acc_stderr": 0.04080244185628972, + "acc_norm": 0.5165562913907285, + "acc_norm_stderr": 0.04080244185628972 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.8678899082568807, + "acc_stderr": 0.014517801914598238, + "acc_norm": 0.8678899082568807, + "acc_norm_stderr": 0.014517801914598238 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.6203703703703703, + "acc_stderr": 0.03309682581119035, + "acc_norm": 0.6203703703703703, + "acc_norm_stderr": 0.03309682581119035 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.8578431372549019, + "acc_stderr": 0.024509803921568606, + "acc_norm": 0.8578431372549019, + "acc_norm_stderr": 0.024509803921568606 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.8734177215189873, + "acc_stderr": 0.021644195727955173, + "acc_norm": 0.8734177215189873, + "acc_norm_stderr": 0.021644195727955173 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.7443946188340808, + "acc_stderr": 0.029275891003969927, + "acc_norm": 0.7443946188340808, + "acc_norm_stderr": 0.029275891003969927 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.8244274809160306, + "acc_stderr": 0.03336820338476076, + "acc_norm": 0.8244274809160306, + "acc_norm_stderr": 0.03336820338476076 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.8760330578512396, + "acc_stderr": 0.030083098716035202, + "acc_norm": 0.8760330578512396, + "acc_norm_stderr": 0.030083098716035202 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.8240740740740741, + "acc_stderr": 
0.036809181416738807, + "acc_norm": 0.8240740740740741, + "acc_norm_stderr": 0.036809181416738807 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.7730061349693251, + "acc_stderr": 0.03291099578615769, + "acc_norm": 0.7730061349693251, + "acc_norm_stderr": 0.03291099578615769 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.5089285714285714, + "acc_stderr": 0.04745033255489123, + "acc_norm": 0.5089285714285714, + "acc_norm_stderr": 0.04745033255489123 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.8252427184466019, + "acc_stderr": 0.03760178006026622, + "acc_norm": 0.8252427184466019, + "acc_norm_stderr": 0.03760178006026622 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.9273504273504274, + "acc_stderr": 0.017004368568132366, + "acc_norm": 0.9273504273504274, + "acc_norm_stderr": 0.017004368568132366 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.78, + "acc_stderr": 0.04163331998932263, + "acc_norm": 0.78, + "acc_norm_stderr": 0.04163331998932263 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.8773946360153256, + "acc_stderr": 0.011728672144131563, + "acc_norm": 0.8773946360153256, + "acc_norm_stderr": 0.011728672144131563 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.791907514450867, + "acc_stderr": 0.021855255263421795, + "acc_norm": 0.791907514450867, + "acc_norm_stderr": 0.021855255263421795 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.37318435754189944, + "acc_stderr": 0.01617569201338196, + "acc_norm": 0.37318435754189944, + "acc_norm_stderr": 0.01617569201338196 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.7647058823529411, + "acc_stderr": 0.0242886194660461, + "acc_norm": 0.7647058823529411, + "acc_norm_stderr": 0.0242886194660461 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.7620578778135049, + "acc_stderr": 0.024185150647818707, + "acc_norm": 0.7620578778135049, + "acc_norm_stderr": 0.024185150647818707 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.8364197530864198, + "acc_stderr": 0.02058146613825715, + "acc_norm": 0.8364197530864198, + "acc_norm_stderr": 0.02058146613825715 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.549645390070922, + "acc_stderr": 0.02968010556502904, + "acc_norm": 0.549645390070922, + "acc_norm_stderr": 0.02968010556502904 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.5215123859191656, + "acc_stderr": 0.012758410941038925, + "acc_norm": 0.5215123859191656, + "acc_norm_stderr": 0.012758410941038925 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.7904411764705882, + "acc_stderr": 0.024723110407677055, + "acc_norm": 0.7904411764705882, + "acc_norm_stderr": 0.024723110407677055 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.761437908496732, + "acc_stderr": 0.017242385828779613, + "acc_norm": 0.761437908496732, + "acc_norm_stderr": 0.017242385828779613 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.7090909090909091, + "acc_stderr": 0.04350271442923243, + "acc_norm": 0.7090909090909091, + "acc_norm_stderr": 0.04350271442923243 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.7795918367346939, + "acc_stderr": 0.02653704531214529, + "acc_norm": 0.7795918367346939, + "acc_norm_stderr": 0.02653704531214529 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.8557213930348259, + "acc_stderr": 0.024845753212306046, + "acc_norm": 0.8557213930348259, + "acc_norm_stderr": 0.024845753212306046 + }, + 
"harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.89, + "acc_stderr": 0.03144660377352203, + "acc_norm": 0.89, + "acc_norm_stderr": 0.03144660377352203 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.5240963855421686, + "acc_stderr": 0.03887971849597264, + "acc_norm": 0.5240963855421686, + "acc_norm_stderr": 0.03887971849597264 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.847953216374269, + "acc_stderr": 0.027539122889061452, + "acc_norm": 0.847953216374269, + "acc_norm_stderr": 0.027539122889061452 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.3268053855569155, + "mc1_stderr": 0.01641987473113503, + "mc2": 0.47623983056821834, + "mc2_stderr": 0.01423216788461584 + }, + "harness|winogrande|5": { + "acc": 0.8018942383583267, + "acc_stderr": 0.011201862744487047 + }, + "harness|gsm8k|5": { + "acc": 0.46853677028051555, + "acc_stderr": 0.01374518994845042 + }, + "all": { + "acc": 0.6972963513811701, + "acc_stderr": 0.03075228309763406, + "acc_norm": 0.7033766203346898, + "acc_norm_stderr": 0.03134647583855853, + "mc1": 0.3268053855569155, + "mc1_stderr": 0.01641987473113503, + "mc2": 0.47623983056821834, + "mc2_stderr": 0.01423216788461584 + } + }, + "versions": { + "all": 0, + "harness|arc:challenge|25": 0, + "harness|gsm8k|5": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, 
+ "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "harness|winogrande|5": 0 + }, + "config_tasks": { + "harness|arc:challenge": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM 
Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|arc:challenge|25": { + "hashes": { + "hash_examples": "17b0cae357c0259e", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "9bcd0d1d37471713", + "hash_cont_tokens": "289aa98c400841d8" + }, + "truncated": 0, + "non_truncated": 1172, + "padded": 4670, + "non_padded": 17, + "effective_few_shots": 25.0, + "num_truncated_few_shots": 0 + }, + "harness|hellaswag|10": { + "hashes": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "80b8c6d79740318e", + "hash_cont_tokens": "ac460260c3e6efc9" + }, + "truncated": 0, + "non_truncated": 10042, + "padded": 40101, + "non_padded": 67, + "effective_few_shots": 10.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hashes": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "b813d36287c6556c", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-anatomy|5": { + "hashes": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "09dc2380497f7a47", + "hash_cont_tokens": "a52a4f60d98cbe5c" + }, + "truncated": 0, + "non_truncated": 135, + "padded": 540, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-astronomy|5": { + "hashes": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "68ca3220b0fdd1f3", + "hash_cont_tokens": "10f7d8eeba97841d" + }, + "truncated": 0, + "non_truncated": 152, + "padded": 608, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-business_ethics|5": { + "hashes": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "bd14ef1320de241e", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hashes": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "d96186ab98017c43", + "hash_cont_tokens": "edef9975ba9165b5" + }, + "truncated": 0, + "non_truncated": 265, + 
"padded": 1060, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_biology|5": { + "hashes": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "424136b34e95b200", + "hash_cont_tokens": "0aa103ec6602280b" + }, + "truncated": 0, + "non_truncated": 144, + "padded": 576, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_chemistry|5": { + "hashes": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "8dd8b80e336bbe54", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_computer_science|5": { + "hashes": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "145d4cef8ca2261d", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_mathematics|5": { + "hashes": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "561995d32d2b25c4", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_medicine|5": { + "hashes": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "6a258a9d4418599c", + "hash_cont_tokens": "1979021dbc698754" + }, + "truncated": 0, + "non_truncated": 173, + "padded": 692, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_physics|5": { + "hashes": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "fa5e0d5b5f97b66a", + "hash_cont_tokens": "7cf7fe2bab00acbd" + }, + "truncated": 0, + "non_truncated": 102, + "padded": 408, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-computer_security|5": { + "hashes": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "07d27397edfae492", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hashes": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "da5e6c3c8eb17da6", + "hash_cont_tokens": "903f64eed2b0d217" + }, + "truncated": 0, + "non_truncated": 235, + "padded": 940, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-econometrics|5": { + "hashes": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "f6ba8e358bdb523e", + "hash_cont_tokens": "721ae6c5302c4bf2" + }, + "truncated": 0, + "non_truncated": 114, + "padded": 456, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-electrical_engineering|5": 
{ + "hashes": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "b2459da4c5ca8590", + "hash_cont_tokens": "15a738960ed3e587" + }, + "truncated": 0, + "non_truncated": 145, + "padded": 575, + "non_padded": 5, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hashes": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "0b969d9ad706a13a", + "hash_cont_tokens": "c96470462fc71683" + }, + "truncated": 0, + "non_truncated": 378, + "padded": 1512, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-formal_logic|5": { + "hashes": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "02bc3eb5f90da86e", + "hash_cont_tokens": "0e1ce025c9d6ee7e" + }, + "truncated": 0, + "non_truncated": 126, + "padded": 504, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-global_facts|5": { + "hashes": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "3d5106918bcbeb43", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_biology|5": { + "hashes": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "7b089392db2dabbd", + "hash_cont_tokens": "e34d57f7d3c4ca16" + }, + "truncated": 0, + "non_truncated": 310, + "padded": 1240, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hashes": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "ba90b2ffed1c067d", + "hash_cont_tokens": "e8482d44df4b3740" + }, + "truncated": 0, + "non_truncated": 203, + "padded": 812, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hashes": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "60eeec309ef0717f", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hashes": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "5e5e8bf3808e0ead", + "hash_cont_tokens": "d63e679a49418339" + }, + "truncated": 0, + "non_truncated": 165, + "padded": 656, + "non_padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_geography|5": { + "hashes": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "4da9b741d4e7ea78", + "hash_cont_tokens": "d78483e286d06f1a" + }, + "truncated": 0, + "non_truncated": 198, + "padded": 792, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hashes": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + 
"hash_input_tokens": "acb4bc872ac86ed7", + "hash_cont_tokens": "691cdff71ff5fe57" + }, + "truncated": 0, + "non_truncated": 193, + "padded": 772, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hashes": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "840fc6403eb69ab0", + "hash_cont_tokens": "d5ad4c5bdca967ad" + }, + "truncated": 0, + "non_truncated": 390, + "padded": 1560, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hashes": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "3629a7f2cd17faeb", + "hash_cont_tokens": "8f631ca5687dd0d4" + }, + "truncated": 0, + "non_truncated": 270, + "padded": 1080, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hashes": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "6846f684260e3997", + "hash_cont_tokens": "7321048a28451473" + }, + "truncated": 0, + "non_truncated": 238, + "padded": 952, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_physics|5": { + "hashes": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "85aee25d6bdad94a", + "hash_cont_tokens": "bb137581f269861c" + }, + "truncated": 0, + "non_truncated": 151, + "padded": 604, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hashes": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "290b66d6d666a35f", + "hash_cont_tokens": "b455cab2675bd863" + }, + "truncated": 0, + "non_truncated": 545, + "padded": 2180, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hashes": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "a77a7668b437bc82", + "hash_cont_tokens": "1b3196fec7e58037" + }, + "truncated": 0, + "non_truncated": 216, + "padded": 864, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hashes": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "63548c7fa9ba7a78", + "hash_cont_tokens": "a331dedc2aa01b3e" + }, + "truncated": 0, + "non_truncated": 204, + "padded": 816, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hashes": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "83c5da18bfa50812", + "hash_cont_tokens": "d0fbe030b8c8c2bf" + }, + "truncated": 0, + "non_truncated": 237, + "padded": 948, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_aging|5": { + "hashes": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "bebbd11f22006685", + "hash_cont_tokens": "1dd29c3755494850" + }, + "truncated": 0, + 
"non_truncated": 223, + "padded": 892, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_sexuality|5": { + "hashes": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "7b85ee9b8ee54f4f", + "hash_cont_tokens": "c85573f663c10691" + }, + "truncated": 0, + "non_truncated": 131, + "padded": 524, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-international_law|5": { + "hashes": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "7bfc55ab7065943e", + "hash_cont_tokens": "d263804ba918154f" + }, + "truncated": 0, + "non_truncated": 121, + "padded": 484, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-jurisprudence|5": { + "hashes": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "69573f1675e053c6", + "hash_cont_tokens": "581986691a84ece8" + }, + "truncated": 0, + "non_truncated": 108, + "padded": 432, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hashes": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "552324ef20094bdc", + "hash_cont_tokens": "55a858b28bbda458" + }, + "truncated": 0, + "non_truncated": 163, + "padded": 652, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-machine_learning|5": { + "hashes": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "96449357a7318905", + "hash_cont_tokens": "e99d3d3efd4ac7a3" + }, + "truncated": 0, + "non_truncated": 112, + "padded": 448, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-management|5": { + "hashes": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "3b849249168e3b88", + "hash_cont_tokens": "13d9dc56bca34726" + }, + "truncated": 0, + "non_truncated": 103, + "padded": 412, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-marketing|5": { + "hashes": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "af0e186f2756b70d", + "hash_cont_tokens": "2700ea26933916a2" + }, + "truncated": 0, + "non_truncated": 234, + "padded": 936, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-medical_genetics|5": { + "hashes": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "9f6a6de16509b6d9", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-miscellaneous|5": { + "hashes": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "9194406d589f7c10", + "hash_cont_tokens": "7bf4341c79587250" + }, + "truncated": 0, + "non_truncated": 783, + "padded": 3132, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_disputes|5": { + 
"hashes": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "769486efc74d9f8e", + "hash_cont_tokens": "38a48e9de6976f00" + }, + "truncated": 0, + "non_truncated": 346, + "padded": 1384, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hashes": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "a90fd4dd90959dad", + "hash_cont_tokens": "761c4dc187689d89" + }, + "truncated": 0, + "non_truncated": 895, + "padded": 3580, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-nutrition|5": { + "hashes": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "1a3b843e66efd29b", + "hash_cont_tokens": "65005bd7d6f6012a" + }, + "truncated": 0, + "non_truncated": 306, + "padded": 1224, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-philosophy|5": { + "hashes": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "09820001a3d00013", + "hash_cont_tokens": "0b47934fb6314dec" + }, + "truncated": 0, + "non_truncated": 311, + "padded": 1244, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-prehistory|5": { + "hashes": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "7c4ec364ce2768c7", + "hash_cont_tokens": "3f20acd855ee0a29" + }, + "truncated": 0, + "non_truncated": 324, + "padded": 1296, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_accounting|5": { + "hashes": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "ced0534574d0ae3f", + "hash_cont_tokens": "8f122ba881355d4b" + }, + "truncated": 0, + "non_truncated": 282, + "padded": 1128, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_law|5": { + "hashes": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "bcbdbbde22ec73e3", + "hash_cont_tokens": "90d5df417c4d3fd3" + }, + "truncated": 0, + "non_truncated": 1534, + "padded": 6136, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_medicine|5": { + "hashes": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "c54d753563114d45", + "hash_cont_tokens": "4a2d2988884f7f70" + }, + "truncated": 0, + "non_truncated": 272, + "padded": 1088, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_psychology|5": { + "hashes": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "b75dc55c0e32fa52", + "hash_cont_tokens": "e0a952cb8a9c81de" + }, + "truncated": 0, + "non_truncated": 612, + "padded": 2448, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-public_relations|5": { + "hashes": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "5ccdc8ec8db99622", + 
"hash_cont_tokens": "1fa77a8dff3922b8" + }, + "truncated": 0, + "non_truncated": 110, + "padded": 440, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-security_studies|5": { + "hashes": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "ca8497342e5b1d57", + "hash_cont_tokens": "81fc9cb3cbdd52db" + }, + "truncated": 0, + "non_truncated": 245, + "padded": 980, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-sociology|5": { + "hashes": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "069c76424fbd3dab", + "hash_cont_tokens": "2a0493252ed2cf43" + }, + "truncated": 0, + "non_truncated": 201, + "padded": 804, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hashes": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "a7e393a626169576", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-virology|5": { + "hashes": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "bf99dc973e3a650d", + "hash_cont_tokens": "5ab892d003b00c98" + }, + "truncated": 0, + "non_truncated": 166, + "padded": 664, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-world_religions|5": { + "hashes": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "1761cfaf21797065", + "hash_cont_tokens": "15a5e5dbdfbb8568" + }, + "truncated": 0, + "non_truncated": 171, + "padded": 684, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|truthfulqa:mc|0": { + "hashes": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "298b43914bbdf4ca", + "hash_cont_tokens": "5a8d4bb398b1c3c0" + }, + "truncated": 0, + "non_truncated": 817, + "padded": 9996, + "non_padded": 0, + "effective_few_shots": 0.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "31aa3477d959f771", + "hash_cont_tokens": "618558fb93c0f288" + }, + "truncated": 0, + "non_truncated": 1267, + "padded": 2534, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "6af0ae8cfe684f50", + "hash_cont_tokens": "b15269181fbdf7dd" + }, + "truncated": 0, + "non_truncated": 1319, + "padded": 0, + "non_padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "3b7fa57a057f9415", + "hash_full_prompts": "63615fc50fc9417c", + "hash_input_tokens": "9c04e828ae29cacc", + "hash_cont_tokens": "82e74ab7a068f9b8" + }, + "truncated": 0, + "non_truncated": 28659, + "padded": 113460, + "non_padded": 1412, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git 
a/eval-results/mncai/Llama2-7B-guanaco-1k/results_2023-10-03T19-29-13.374969.json b/eval-results/mncai/Llama2-7B-guanaco-1k/results_2023-10-03T19-29-13.374969.json new file mode 100644 index 0000000000000000000000000000000000000000..f47e8bc4fd647da1ffd1da416cd600d848ccfe7b --- /dev/null +++ b/eval-results/mncai/Llama2-7B-guanaco-1k/results_2023-10-03T19-29-13.374969.json @@ -0,0 +1,1367 @@ +{ + "config_general": { + "model_name": "mncai/Llama2-7B-guanaco-1k", + "model_sha": "5f3194b779897bbc4c4218a9dddc44a9b5faea15", + "model_size": "12.58 GB", + "model_dtype": "torch.float16", + "lighteval_sha": "0f318ecf002208468154899217b3ba7c6ae09374", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|arc:challenge|25": { + "acc": 0.5204778156996587, + "acc_stderr": 0.014599131353035004, + "acc_norm": 0.5511945392491467, + "acc_norm_stderr": 0.014534599585097667 + }, + "harness|hellaswag|10": { + "acc": 0.6121290579565823, + "acc_stderr": 0.004862690594815707, + "acc_norm": 0.8053176658036247, + "acc_norm_stderr": 0.003951467386597723 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.3, + "acc_stderr": 0.046056618647183814, + "acc_norm": 0.3, + "acc_norm_stderr": 0.046056618647183814 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.48148148148148145, + "acc_stderr": 0.043163785995113245, + "acc_norm": 0.48148148148148145, + "acc_norm_stderr": 0.043163785995113245 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.4144736842105263, + "acc_stderr": 0.04008973785779206, + "acc_norm": 0.4144736842105263, + "acc_norm_stderr": 0.04008973785779206 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.52, + "acc_stderr": 0.050211673156867795, + "acc_norm": 0.52, + "acc_norm_stderr": 0.050211673156867795 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.5056603773584906, + "acc_stderr": 0.030770900763851316, + "acc_norm": 0.5056603773584906, + "acc_norm_stderr": 0.030770900763851316 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.4722222222222222, + "acc_stderr": 0.04174752578923185, + "acc_norm": 0.4722222222222222, + "acc_norm_stderr": 0.04174752578923185 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.38, + "acc_stderr": 0.048783173121456316, + "acc_norm": 0.38, + "acc_norm_stderr": 0.048783173121456316 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.41, + "acc_stderr": 0.049431107042371025, + "acc_norm": 0.41, + "acc_norm_stderr": 0.049431107042371025 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.32, + "acc_stderr": 0.04688261722621505, + "acc_norm": 0.32, + "acc_norm_stderr": 0.04688261722621505 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.4624277456647399, + "acc_stderr": 0.0380168510452446, + "acc_norm": 0.4624277456647399, + "acc_norm_stderr": 0.0380168510452446 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.17647058823529413, + "acc_stderr": 0.0379328118530781, + "acc_norm": 0.17647058823529413, + "acc_norm_stderr": 0.0379328118530781 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.55, + "acc_stderr": 0.04999999999999999, + "acc_norm": 0.55, + "acc_norm_stderr": 0.04999999999999999 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.43829787234042555, + "acc_stderr": 0.03243618636108101, + "acc_norm": 0.43829787234042555, + "acc_norm_stderr": 0.03243618636108101 + }, + "harness|hendrycksTest-econometrics|5": { + 
"acc": 0.2894736842105263, + "acc_stderr": 0.04266339443159393, + "acc_norm": 0.2894736842105263, + "acc_norm_stderr": 0.04266339443159393 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.5103448275862069, + "acc_stderr": 0.04165774775728763, + "acc_norm": 0.5103448275862069, + "acc_norm_stderr": 0.04165774775728763 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.29894179894179895, + "acc_stderr": 0.023577604791655802, + "acc_norm": 0.29894179894179895, + "acc_norm_stderr": 0.023577604791655802 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.29365079365079366, + "acc_stderr": 0.04073524322147125, + "acc_norm": 0.29365079365079366, + "acc_norm_stderr": 0.04073524322147125 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.36, + "acc_stderr": 0.04824181513244218, + "acc_norm": 0.36, + "acc_norm_stderr": 0.04824181513244218 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.5225806451612903, + "acc_stderr": 0.02841498501970786, + "acc_norm": 0.5225806451612903, + "acc_norm_stderr": 0.02841498501970786 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.3645320197044335, + "acc_stderr": 0.033864057460620905, + "acc_norm": 0.3645320197044335, + "acc_norm_stderr": 0.033864057460620905 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.42, + "acc_stderr": 0.049604496374885836, + "acc_norm": 0.42, + "acc_norm_stderr": 0.049604496374885836 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.6121212121212121, + "acc_stderr": 0.038049136539710114, + "acc_norm": 0.6121212121212121, + "acc_norm_stderr": 0.038049136539710114 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.6111111111111112, + "acc_stderr": 0.0347327959083696, + "acc_norm": 0.6111111111111112, + "acc_norm_stderr": 0.0347327959083696 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.6735751295336787, + "acc_stderr": 0.033840286211432945, + "acc_norm": 0.6735751295336787, + "acc_norm_stderr": 0.033840286211432945 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.46923076923076923, + "acc_stderr": 0.025302958890850154, + "acc_norm": 0.46923076923076923, + "acc_norm_stderr": 0.025302958890850154 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.2740740740740741, + "acc_stderr": 0.027195934804085626, + "acc_norm": 0.2740740740740741, + "acc_norm_stderr": 0.027195934804085626 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.44537815126050423, + "acc_stderr": 0.032284106267163895, + "acc_norm": 0.44537815126050423, + "acc_norm_stderr": 0.032284106267163895 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.31125827814569534, + "acc_stderr": 0.03780445850526733, + "acc_norm": 0.31125827814569534, + "acc_norm_stderr": 0.03780445850526733 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.6697247706422018, + "acc_stderr": 0.020164466336342977, + "acc_norm": 0.6697247706422018, + "acc_norm_stderr": 0.020164466336342977 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.3055555555555556, + "acc_stderr": 0.03141554629402546, + "acc_norm": 0.3055555555555556, + "acc_norm_stderr": 0.03141554629402546 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.5441176470588235, + "acc_stderr": 0.03495624522015476, + "acc_norm": 0.5441176470588235, + "acc_norm_stderr": 0.03495624522015476 + }, + 
"harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.6371308016877637, + "acc_stderr": 0.03129920825530213, + "acc_norm": 0.6371308016877637, + "acc_norm_stderr": 0.03129920825530213 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.5426008968609866, + "acc_stderr": 0.033435777055830646, + "acc_norm": 0.5426008968609866, + "acc_norm_stderr": 0.033435777055830646 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.6030534351145038, + "acc_stderr": 0.04291135671009224, + "acc_norm": 0.6030534351145038, + "acc_norm_stderr": 0.04291135671009224 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.628099173553719, + "acc_stderr": 0.044120158066245044, + "acc_norm": 0.628099173553719, + "acc_norm_stderr": 0.044120158066245044 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.5370370370370371, + "acc_stderr": 0.04820403072760627, + "acc_norm": 0.5370370370370371, + "acc_norm_stderr": 0.04820403072760627 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.5276073619631901, + "acc_stderr": 0.03922378290610991, + "acc_norm": 0.5276073619631901, + "acc_norm_stderr": 0.03922378290610991 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.375, + "acc_stderr": 0.04595091388086298, + "acc_norm": 0.375, + "acc_norm_stderr": 0.04595091388086298 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.5825242718446602, + "acc_stderr": 0.048828405482122375, + "acc_norm": 0.5825242718446602, + "acc_norm_stderr": 0.048828405482122375 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.7222222222222222, + "acc_stderr": 0.02934311479809446, + "acc_norm": 0.7222222222222222, + "acc_norm_stderr": 0.02934311479809446 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.53, + "acc_stderr": 0.05016135580465919, + "acc_norm": 0.53, + "acc_norm_stderr": 0.05016135580465919 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.6475095785440613, + "acc_stderr": 0.01708415024408138, + "acc_norm": 0.6475095785440613, + "acc_norm_stderr": 0.01708415024408138 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.5346820809248555, + "acc_stderr": 0.026854257928258872, + "acc_norm": 0.5346820809248555, + "acc_norm_stderr": 0.026854257928258872 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.29832402234636873, + "acc_stderr": 0.015301840045129278, + "acc_norm": 0.29832402234636873, + "acc_norm_stderr": 0.015301840045129278 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.5228758169934641, + "acc_stderr": 0.028599936776089782, + "acc_norm": 0.5228758169934641, + "acc_norm_stderr": 0.028599936776089782 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.5884244372990354, + "acc_stderr": 0.027950481494401262, + "acc_norm": 0.5884244372990354, + "acc_norm_stderr": 0.027950481494401262 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.4876543209876543, + "acc_stderr": 0.027812262269327228, + "acc_norm": 0.4876543209876543, + "acc_norm_stderr": 0.027812262269327228 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.36524822695035464, + "acc_stderr": 0.028723863853281278, + "acc_norm": 0.36524822695035464, + "acc_norm_stderr": 0.028723863853281278 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.36897001303780963, + "acc_stderr": 0.01232393665017486, + "acc_norm": 0.36897001303780963, + "acc_norm_stderr": 0.01232393665017486 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.5367647058823529, + "acc_stderr": 0.030290619180485694, + 
"acc_norm": 0.5367647058823529, + "acc_norm_stderr": 0.030290619180485694 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.45098039215686275, + "acc_stderr": 0.02013038831290453, + "acc_norm": 0.45098039215686275, + "acc_norm_stderr": 0.02013038831290453 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.5363636363636364, + "acc_stderr": 0.04776449162396197, + "acc_norm": 0.5363636363636364, + "acc_norm_stderr": 0.04776449162396197 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.4897959183673469, + "acc_stderr": 0.03200255347893782, + "acc_norm": 0.4897959183673469, + "acc_norm_stderr": 0.03200255347893782 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.6666666666666666, + "acc_stderr": 0.03333333333333335, + "acc_norm": 0.6666666666666666, + "acc_norm_stderr": 0.03333333333333335 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.61, + "acc_stderr": 0.04902071300001974, + "acc_norm": 0.61, + "acc_norm_stderr": 0.04902071300001974 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.41566265060240964, + "acc_stderr": 0.038367221765980515, + "acc_norm": 0.41566265060240964, + "acc_norm_stderr": 0.038367221765980515 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.7076023391812866, + "acc_stderr": 0.03488647713457923, + "acc_norm": 0.7076023391812866, + "acc_norm_stderr": 0.03488647713457923 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.31211750305997554, + "mc1_stderr": 0.016220756769520926, + "mc2": 0.4769485190285405, + "mc2_stderr": 0.015017841350265305 + }, + "all": { + "acc": 0.48223917491095497, + "acc_stderr": 0.03531211391105555, + "acc_norm": 0.4860341805278453, + "acc_norm_stderr": 0.03529557569112071, + "mc1": 0.31211750305997554, + "mc1_stderr": 0.016220756769520926, + "mc2": 0.4769485190285405, + "mc2_stderr": 0.015017841350265305 + } + }, + "versions": { + "harness|arc:challenge|25": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + 
"harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "all": 0 + }, + "config_tasks": { + "harness|arc:challenge": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + 
"harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task" + }, + "summary_tasks": { + "harness|arc:challenge|25": { + "hashes": { + "hash_examples": "17b0cae357c0259e", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "61571bf68d6d89aa", + "hash_cont_tokens": "e8abf848493b50f7" + }, + "truncated": 0, + "non-truncated": 4687, + "padded": 4687, + "non-padded": 0, + "effective_few_shots": 25.0, + "num_truncated_few_shots": 0 + }, + "harness|hellaswag|10": { + "hashes": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "29906669b1c7054a", + "hash_cont_tokens": "9fe0a5c42e1532db" + }, + "truncated": 0, + "non-truncated": 40168, + "padded": 40113, + "non-padded": 55, + "effective_few_shots": 10.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hashes": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "c54ff61ad0273dd7", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-anatomy|5": { + "hashes": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "be31a1e22aef5f90", + "hash_cont_tokens": "f11971a765cb609f" + }, + "truncated": 0, + "non-truncated": 540, + "padded": 540, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-astronomy|5": { + "hashes": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "277a7b1fad566940", + "hash_cont_tokens": "440a970fadecdc7b" + }, + "truncated": 0, + "non-truncated": 608, + "padded": 608, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + 
"harness|hendrycksTest-business_ethics|5": { + "hashes": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "ba552605bc116de5", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hashes": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "428c7563d0b98ab9", + "hash_cont_tokens": "7ecd60c25b9bfe5b" + }, + "truncated": 0, + "non-truncated": 1060, + "padded": 1060, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_biology|5": { + "hashes": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "da036601573942e2", + "hash_cont_tokens": "875cde3af7a0ee14" + }, + "truncated": 0, + "non-truncated": 576, + "padded": 576, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_chemistry|5": { + "hashes": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "94e0196d6aded13d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_computer_science|5": { + "hashes": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "6e4d0f4a8d36690b", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_mathematics|5": { + "hashes": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "614054d17109a25d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_medicine|5": { + "hashes": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "1d633b3cc0524ba8", + "hash_cont_tokens": "702fb6d82ff0d6ac" + }, + "truncated": 0, + "non-truncated": 692, + "padded": 692, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_physics|5": { + "hashes": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "5421d9a1af86cbd4", + "hash_cont_tokens": "f7b8097afc16a47c" + }, + "truncated": 0, + "non-truncated": 408, + "padded": 408, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-computer_security|5": { + "hashes": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "5e6b70ecb333cf18", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hashes": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + 
"hash_input_tokens": "c2ef11a87264ceed", + "hash_cont_tokens": "aa0e8bc655f2f641" + }, + "truncated": 0, + "non-truncated": 940, + "padded": 940, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-econometrics|5": { + "hashes": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "ecaccd912a4c3978", + "hash_cont_tokens": "b1cc6e7e9fcd3827" + }, + "truncated": 0, + "non-truncated": 456, + "padded": 456, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hashes": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "1590c84291399be8", + "hash_cont_tokens": "2425a3f084a591ef" + }, + "truncated": 0, + "non-truncated": 580, + "padded": 580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hashes": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "3269597f715b0da1", + "hash_cont_tokens": "bd87bf0c060fd925" + }, + "truncated": 0, + "non-truncated": 1512, + "padded": 1512, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-formal_logic|5": { + "hashes": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "a2800d20f3ab8d7c", + "hash_cont_tokens": "eb8932890e0605db" + }, + "truncated": 0, + "non-truncated": 504, + "padded": 504, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-global_facts|5": { + "hashes": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "94ed44b3772505ad", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_biology|5": { + "hashes": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "24423acb928db768", + "hash_cont_tokens": "1ddcb86d28cde266" + }, + "truncated": 0, + "non-truncated": 1240, + "padded": 1240, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hashes": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "831ff35c474e5cef", + "hash_cont_tokens": "176c8dcff38c5f8f" + }, + "truncated": 0, + "non-truncated": 812, + "padded": 812, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hashes": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "8c34e0f2bda77358", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hashes": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "f1f73dd687da18d7", + "hash_cont_tokens": "674fc454bdc5ac93" + }, + "truncated": 660, + "non-truncated": 
0, + "padded": 0, + "non-padded": 660, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_geography|5": { + "hashes": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "7c5547c7da5bc793", + "hash_cont_tokens": "03a5012b916274ea" + }, + "truncated": 0, + "non-truncated": 792, + "padded": 792, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hashes": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "f62991cb6a496b05", + "hash_cont_tokens": "873d2aab226ba1d8" + }, + "truncated": 0, + "non-truncated": 772, + "padded": 772, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hashes": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "4cef2aff6e3d59ed", + "hash_cont_tokens": "c583432ad27fcfe0" + }, + "truncated": 0, + "non-truncated": 1560, + "padded": 1560, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hashes": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "6e2577ea4082ed2b", + "hash_cont_tokens": "d7907b61bcb8c123" + }, + "truncated": 0, + "non-truncated": 1080, + "padded": 1080, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hashes": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "c5fc9aeb1079c8e4", + "hash_cont_tokens": "f47f041de50333b9" + }, + "truncated": 0, + "non-truncated": 952, + "padded": 952, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_physics|5": { + "hashes": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "555fc385cffa84ca", + "hash_cont_tokens": "0d56317b3e5eedb5" + }, + "truncated": 0, + "non-truncated": 604, + "padded": 604, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hashes": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "febd23cbf9973b7f", + "hash_cont_tokens": "09ba1243e7390c0f" + }, + "truncated": 0, + "non-truncated": 2180, + "padded": 2180, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hashes": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "424b02981230ee83", + "hash_cont_tokens": "9cc29889c3d3f77d" + }, + "truncated": 0, + "non-truncated": 864, + "padded": 864, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hashes": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "50c9ff438c85a69e", + "hash_cont_tokens": "cdd0b3dc06d933e5" + }, + "truncated": 816, + "non-truncated": 0, + "padded": 0, + "non-padded": 816, + "effective_few_shots": 5.0, + 
"num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hashes": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "054824cc474caef5", + "hash_cont_tokens": "e02816433ff28daf" + }, + "truncated": 8, + "non-truncated": 940, + "padded": 940, + "non-padded": 8, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_aging|5": { + "hashes": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "541a75f071dcf579", + "hash_cont_tokens": "142a4a8a1138a214" + }, + "truncated": 0, + "non-truncated": 892, + "padded": 892, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_sexuality|5": { + "hashes": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "04269e5c5a257dd9", + "hash_cont_tokens": "bc54813e809b796d" + }, + "truncated": 0, + "non-truncated": 524, + "padded": 524, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-international_law|5": { + "hashes": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "d93ba9d9d38e4397", + "hash_cont_tokens": "8ea8c5ff76a15bca" + }, + "truncated": 0, + "non-truncated": 484, + "padded": 484, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-jurisprudence|5": { + "hashes": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "9eeaccd2698b4f5a", + "hash_cont_tokens": "e3a8cd951b6e3469" + }, + "truncated": 0, + "non-truncated": 432, + "padded": 432, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hashes": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "b4f08f544f2b7576", + "hash_cont_tokens": "3e9e0bdc248fd88a" + }, + "truncated": 0, + "non-truncated": 652, + "padded": 648, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-machine_learning|5": { + "hashes": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "900c2a51f1174b9f", + "hash_cont_tokens": "55b12fb138c6a064" + }, + "truncated": 0, + "non-truncated": 448, + "padded": 448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-management|5": { + "hashes": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "6b36efb4689c6eca", + "hash_cont_tokens": "a01d6d39a83c4597" + }, + "truncated": 0, + "non-truncated": 412, + "padded": 412, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-marketing|5": { + "hashes": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "2aaac78a0cfed47a", + "hash_cont_tokens": "6aeaed4d823c98aa" + }, + "truncated": 0, + "non-truncated": 936, + "padded": 936, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-medical_genetics|5": { + "hashes": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": 
"202139046daa118f", + "hash_input_tokens": "886ca823b41c094a", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-miscellaneous|5": { + "hashes": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "72fd71de7675e7d0", + "hash_cont_tokens": "9b0ab02a64603081" + }, + "truncated": 0, + "non-truncated": 3132, + "padded": 3132, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_disputes|5": { + "hashes": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "f3ca0dd8e7a1eb09", + "hash_cont_tokens": "3b8bbe9108e55ce9" + }, + "truncated": 0, + "non-truncated": 1384, + "padded": 1354, + "non-padded": 30, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hashes": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "3e793631e951f23c", + "hash_cont_tokens": "3e9bfc0362e97330" + }, + "truncated": 0, + "non-truncated": 3580, + "padded": 3580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-nutrition|5": { + "hashes": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "59753c2144ea93af", + "hash_cont_tokens": "23b2dc6ee2da4cfc" + }, + "truncated": 0, + "non-truncated": 1224, + "padded": 1224, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-philosophy|5": { + "hashes": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "bd8d3dbed15a8c34", + "hash_cont_tokens": "9f6ff69d23a48783" + }, + "truncated": 0, + "non-truncated": 1244, + "padded": 1244, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-prehistory|5": { + "hashes": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "3573cd87facbb7c5", + "hash_cont_tokens": "d6458d743d875837" + }, + "truncated": 0, + "non-truncated": 1296, + "padded": 1296, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_accounting|5": { + "hashes": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "17e721bc1a7cbb47", + "hash_cont_tokens": "922a195f53a35662" + }, + "truncated": 0, + "non-truncated": 1128, + "padded": 1128, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_law|5": { + "hashes": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "9178e10bd0763ec4", + "hash_cont_tokens": "2e590029ef41fbcd" + }, + "truncated": 604, + "non-truncated": 5532, + "padded": 5524, + "non-padded": 612, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_medicine|5": { + "hashes": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "f5a22012a54f70ea", + "hash_cont_tokens": "7cfee54dbddd5a98" + }, + "truncated": 0, + "non-truncated": 1088, + 
"padded": 1088, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_psychology|5": { + "hashes": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "0dfb73a8eb3f692c", + "hash_cont_tokens": "a86677b2a45c20e1" + }, + "truncated": 0, + "non-truncated": 2448, + "padded": 2448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-public_relations|5": { + "hashes": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "1710c6ba4c9f3cbd", + "hash_cont_tokens": "0d756ccaae031757" + }, + "truncated": 0, + "non-truncated": 440, + "padded": 440, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-security_studies|5": { + "hashes": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "d49711415961ced7", + "hash_cont_tokens": "b2229bc2cfbf594b" + }, + "truncated": 0, + "non-truncated": 980, + "padded": 980, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-sociology|5": { + "hashes": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "828999f7624cbe7e", + "hash_cont_tokens": "c3a3bdfd177eed5b" + }, + "truncated": 0, + "non-truncated": 804, + "padded": 804, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hashes": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "42054621e718dbee", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-virology|5": { + "hashes": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "6c4f0aa4dc859c04", + "hash_cont_tokens": "af8b3658088cb37f" + }, + "truncated": 0, + "non-truncated": 664, + "padded": 664, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-world_religions|5": { + "hashes": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "6c75d44e092ff24f", + "hash_cont_tokens": "060118bef6de4e0a" + }, + "truncated": 0, + "non-truncated": 684, + "padded": 684, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|truthfulqa:mc|0": { + "hashes": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "2738d7ed7075faa7", + "hash_cont_tokens": "f5da56a132aab151" + }, + "truncated": 0, + "non-truncated": 9996, + "padded": 9996, + "non-padded": 0, + "effective_few_shots": 0.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "d84d18e9a963753d", + "hash_full_prompts": "12b540783521a8e6", + "hash_input_tokens": "6fecf578c508db6a", + "hash_cont_tokens": "71d56183130fecbd" + }, + "total_evaluation_time_secondes": "2768.4088854789734", + "truncated": 2088, + "non-truncated": 108931, + "padded": 108834, + "non-padded": 2185, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git 
a/eval-results/mncai/Llama2-7B-guanaco-1k/results_2023-10-24T22-26-12.007542.json b/eval-results/mncai/Llama2-7B-guanaco-1k/results_2023-10-24T22-26-12.007542.json new file mode 100644 index 0000000000000000000000000000000000000000..4893bfd709649fc2ba48a418c7c1a204417e3bfa --- /dev/null +++ b/eval-results/mncai/Llama2-7B-guanaco-1k/results_2023-10-24T22-26-12.007542.json @@ -0,0 +1,107 @@ +{ + "config_general": { + "model_name": "mncai/Llama2-7B-guanaco-1k", + "model_sha": "5f3194b779897bbc4c4218a9dddc44a9b5faea15", + "model_size": "12.58 GB", + "model_dtype": "torch.float16", + "lighteval_sha": "0f318ecf002208468154899217b3ba7c6ae09374", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|drop|3": { + "em": 0.001363255033557047, + "em_stderr": 0.00037786091964606556, + "f1": 0.05869022651006706, + "f1_stderr": 0.001351280630481856 + }, + "harness|gsm8k|5": { + "acc": 0.0758150113722517, + "acc_stderr": 0.007291205723162611 + }, + "harness|winogrande|5": { + "acc": 0.7482241515390686, + "acc_stderr": 0.012198489100259778 + }, + "all": { + "em": 0.001363255033557047, + "em_stderr": 0.00037786091964606556, + "f1": 0.05869022651006706, + "f1_stderr": 0.001351280630481856, + "acc": 0.41201958145566014, + "acc_stderr": 0.009744847411711194 + } + }, + "versions": { + "harness|drop|3": 1, + "harness|gsm8k|5": 0, + "harness|winogrande|5": 0, + "all": 0 + }, + "config_tasks": { + "harness|drop": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|drop|3": { + "hashes": { + "hash_examples": "1d27416e8324e9a3", + "hash_full_prompts": "a5513ff9a741b385", + "hash_input_tokens": "61b608e0b5ceed76", + "hash_cont_tokens": "712b144d91603dba" + }, + "truncated": 1263, + "non-truncated": 8273, + "padded": 0, + "non-padded": 9536, + "effective_few_shots": 3.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "bda342e47b5099b2", + "hash_cont_tokens": "4020fedd08a2922b" + }, + "truncated": 0, + "non-truncated": 1319, + "padded": 0, + "non-padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "c0bedf98cb040854", + "hash_cont_tokens": "f08975ad6f2d5864" + }, + "truncated": 0, + "non-truncated": 2534, + "padded": 2432, + "non-padded": 102, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "9b4d8993161e637d", + "hash_full_prompts": "08215e527b7e60a5", + "hash_input_tokens": "80afe720f936f8d2", + "hash_cont_tokens": "dd6ee2390f5cb1e9" + }, + "total_evaluation_time_secondes": "9414.698763847351", + "truncated": 1263, + "non-truncated": 12126, + "padded": 2432, + "non-padded": 10957, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/mncai/Llama2-7B-guanaco-dolphin-500/results_2023-10-03T19-36-50.573905.json b/eval-results/mncai/Llama2-7B-guanaco-dolphin-500/results_2023-10-03T19-36-50.573905.json new file mode 100644 index 0000000000000000000000000000000000000000..e7cded413dcdbb307df7d7c6fef43b29493acfa0 --- /dev/null +++ b/eval-results/mncai/Llama2-7B-guanaco-dolphin-500/results_2023-10-03T19-36-50.573905.json @@ -0,0 
+1,1367 @@ +{ + "config_general": { + "model_name": "mncai/Llama2-7B-guanaco-dolphin-500", + "model_sha": "afe00170f084f773e401ba7d738d692533cca6b4", + "model_size": "12.58 GB", + "model_dtype": "torch.float16", + "lighteval_sha": "0f318ecf002208468154899217b3ba7c6ae09374", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|arc:challenge|25": { + "acc": 0.5255972696245734, + "acc_stderr": 0.014592230885298964, + "acc_norm": 0.5674061433447098, + "acc_norm_stderr": 0.014478005694182526 + }, + "harness|hellaswag|10": { + "acc": 0.6216889065923122, + "acc_stderr": 0.0048397464915235135, + "acc_norm": 0.8162716590320653, + "acc_norm_stderr": 0.003864710367645059 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.31, + "acc_stderr": 0.04648231987117316, + "acc_norm": 0.31, + "acc_norm_stderr": 0.04648231987117316 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.4888888888888889, + "acc_stderr": 0.04318275491977976, + "acc_norm": 0.4888888888888889, + "acc_norm_stderr": 0.04318275491977976 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.4605263157894737, + "acc_stderr": 0.04056242252249033, + "acc_norm": 0.4605263157894737, + "acc_norm_stderr": 0.04056242252249033 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.51, + "acc_stderr": 0.05024183937956912, + "acc_norm": 0.51, + "acc_norm_stderr": 0.05024183937956912 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.47547169811320755, + "acc_stderr": 0.030735822206205615, + "acc_norm": 0.47547169811320755, + "acc_norm_stderr": 0.030735822206205615 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.4861111111111111, + "acc_stderr": 0.041795966175810016, + "acc_norm": 0.4861111111111111, + "acc_norm_stderr": 0.041795966175810016 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.41, + "acc_stderr": 0.049431107042371025, + "acc_norm": 0.41, + "acc_norm_stderr": 0.049431107042371025 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.4, + "acc_stderr": 0.049236596391733084, + "acc_norm": 0.4, + "acc_norm_stderr": 0.049236596391733084 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.36, + "acc_stderr": 0.048241815132442176, + "acc_norm": 0.36, + "acc_norm_stderr": 0.048241815132442176 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.4508670520231214, + "acc_stderr": 0.0379401267469703, + "acc_norm": 0.4508670520231214, + "acc_norm_stderr": 0.0379401267469703 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.18627450980392157, + "acc_stderr": 0.03873958714149352, + "acc_norm": 0.18627450980392157, + "acc_norm_stderr": 0.03873958714149352 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.56, + "acc_stderr": 0.04988876515698589, + "acc_norm": 0.56, + "acc_norm_stderr": 0.04988876515698589 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.4340425531914894, + "acc_stderr": 0.03240038086792747, + "acc_norm": 0.4340425531914894, + "acc_norm_stderr": 0.03240038086792747 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.2894736842105263, + "acc_stderr": 0.04266339443159393, + "acc_norm": 0.2894736842105263, + "acc_norm_stderr": 0.04266339443159393 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.5103448275862069, + "acc_stderr": 0.04165774775728763, + "acc_norm": 0.5103448275862069, + "acc_norm_stderr": 0.04165774775728763 + }, + 
"harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.30158730158730157, + "acc_stderr": 0.0236369759961018, + "acc_norm": 0.30158730158730157, + "acc_norm_stderr": 0.0236369759961018 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.3412698412698413, + "acc_stderr": 0.04240799327574925, + "acc_norm": 0.3412698412698413, + "acc_norm_stderr": 0.04240799327574925 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.36, + "acc_stderr": 0.04824181513244218, + "acc_norm": 0.36, + "acc_norm_stderr": 0.04824181513244218 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.5387096774193548, + "acc_stderr": 0.02835863485983694, + "acc_norm": 0.5387096774193548, + "acc_norm_stderr": 0.02835863485983694 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.3399014778325123, + "acc_stderr": 0.0333276906841079, + "acc_norm": 0.3399014778325123, + "acc_norm_stderr": 0.0333276906841079 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.48, + "acc_stderr": 0.050211673156867795, + "acc_norm": 0.48, + "acc_norm_stderr": 0.050211673156867795 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.6060606060606061, + "acc_stderr": 0.038154943086889305, + "acc_norm": 0.6060606060606061, + "acc_norm_stderr": 0.038154943086889305 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.5909090909090909, + "acc_stderr": 0.03502975799413007, + "acc_norm": 0.5909090909090909, + "acc_norm_stderr": 0.03502975799413007 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.6632124352331606, + "acc_stderr": 0.03410780251836184, + "acc_norm": 0.6632124352331606, + "acc_norm_stderr": 0.03410780251836184 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.45384615384615384, + "acc_stderr": 0.025242770987126177, + "acc_norm": 0.45384615384615384, + "acc_norm_stderr": 0.025242770987126177 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.3074074074074074, + "acc_stderr": 0.02813325257881563, + "acc_norm": 0.3074074074074074, + "acc_norm_stderr": 0.02813325257881563 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.46218487394957986, + "acc_stderr": 0.032385469487589795, + "acc_norm": 0.46218487394957986, + "acc_norm_stderr": 0.032385469487589795 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.31788079470198677, + "acc_stderr": 0.038020397601079024, + "acc_norm": 0.31788079470198677, + "acc_norm_stderr": 0.038020397601079024 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.671559633027523, + "acc_stderr": 0.020135902797298405, + "acc_norm": 0.671559633027523, + "acc_norm_stderr": 0.020135902797298405 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.3101851851851852, + "acc_stderr": 0.03154696285656629, + "acc_norm": 0.3101851851851852, + "acc_norm_stderr": 0.03154696285656629 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.5490196078431373, + "acc_stderr": 0.03492406104163613, + "acc_norm": 0.5490196078431373, + "acc_norm_stderr": 0.03492406104163613 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.6371308016877637, + "acc_stderr": 0.031299208255302136, + "acc_norm": 0.6371308016877637, + "acc_norm_stderr": 0.031299208255302136 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.5650224215246636, + "acc_stderr": 0.033272833702713445, + "acc_norm": 0.5650224215246636, + "acc_norm_stderr": 0.033272833702713445 + 
}, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.5954198473282443, + "acc_stderr": 0.043046937953806645, + "acc_norm": 0.5954198473282443, + "acc_norm_stderr": 0.043046937953806645 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.6446280991735537, + "acc_stderr": 0.0436923632657398, + "acc_norm": 0.6446280991735537, + "acc_norm_stderr": 0.0436923632657398 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.5462962962962963, + "acc_stderr": 0.04812917324536823, + "acc_norm": 0.5462962962962963, + "acc_norm_stderr": 0.04812917324536823 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.5214723926380368, + "acc_stderr": 0.03924746876751129, + "acc_norm": 0.5214723926380368, + "acc_norm_stderr": 0.03924746876751129 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.375, + "acc_stderr": 0.04595091388086298, + "acc_norm": 0.375, + "acc_norm_stderr": 0.04595091388086298 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.6019417475728155, + "acc_stderr": 0.048467482539772386, + "acc_norm": 0.6019417475728155, + "acc_norm_stderr": 0.048467482539772386 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.7393162393162394, + "acc_stderr": 0.028760348956523414, + "acc_norm": 0.7393162393162394, + "acc_norm_stderr": 0.028760348956523414 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.54, + "acc_stderr": 0.05009082659620332, + "acc_norm": 0.54, + "acc_norm_stderr": 0.05009082659620332 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.6513409961685823, + "acc_stderr": 0.01704124314349097, + "acc_norm": 0.6513409961685823, + "acc_norm_stderr": 0.01704124314349097 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.546242774566474, + "acc_stderr": 0.026803720583206174, + "acc_norm": 0.546242774566474, + "acc_norm_stderr": 0.026803720583206174 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.25251396648044694, + "acc_stderr": 0.014530330201468636, + "acc_norm": 0.25251396648044694, + "acc_norm_stderr": 0.014530330201468636 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.5424836601307189, + "acc_stderr": 0.02852638345214263, + "acc_norm": 0.5424836601307189, + "acc_norm_stderr": 0.02852638345214263 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.5884244372990354, + "acc_stderr": 0.027950481494401262, + "acc_norm": 0.5884244372990354, + "acc_norm_stderr": 0.027950481494401262 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.5092592592592593, + "acc_stderr": 0.027815973433878014, + "acc_norm": 0.5092592592592593, + "acc_norm_stderr": 0.027815973433878014 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.37943262411347517, + "acc_stderr": 0.0289473388516141, + "acc_norm": 0.37943262411347517, + "acc_norm_stderr": 0.0289473388516141 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.3663624511082138, + "acc_stderr": 0.012305658346838444, + "acc_norm": 0.3663624511082138, + "acc_norm_stderr": 0.012305658346838444 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.5625, + "acc_stderr": 0.030134614954403924, + "acc_norm": 0.5625, + "acc_norm_stderr": 0.030134614954403924 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.47058823529411764, + "acc_stderr": 0.02019280827143379, + "acc_norm": 0.47058823529411764, + "acc_norm_stderr": 0.02019280827143379 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.5545454545454546, + "acc_stderr": 0.047605488214603246, + "acc_norm": 
0.5545454545454546, + "acc_norm_stderr": 0.047605488214603246 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.4857142857142857, + "acc_stderr": 0.03199615232806287, + "acc_norm": 0.4857142857142857, + "acc_norm_stderr": 0.03199615232806287 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.6766169154228856, + "acc_stderr": 0.03307615947979033, + "acc_norm": 0.6766169154228856, + "acc_norm_stderr": 0.03307615947979033 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.64, + "acc_stderr": 0.048241815132442176, + "acc_norm": 0.64, + "acc_norm_stderr": 0.048241815132442176 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.41566265060240964, + "acc_stderr": 0.038367221765980515, + "acc_norm": 0.41566265060240964, + "acc_norm_stderr": 0.038367221765980515 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.7192982456140351, + "acc_stderr": 0.034462962170884265, + "acc_norm": 0.7192982456140351, + "acc_norm_stderr": 0.034462962170884265 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.31701346389228885, + "mc1_stderr": 0.016289203374403385, + "mc2": 0.46938056221953073, + "mc2_stderr": 0.015439179764216509 + }, + "all": { + "acc": 0.48983448652651995, + "acc_stderr": 0.035363637901079645, + "acc_norm": 0.49384112425804355, + "acc_norm_stderr": 0.03534517584489329, + "mc1": 0.31701346389228885, + "mc1_stderr": 0.016289203374403385, + "mc2": 0.46938056221953073, + "mc2_stderr": 0.015439179764216509 + } + }, + "versions": { + "harness|arc:challenge|25": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + 
"harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "all": 0 + }, + "config_tasks": { + "harness|arc:challenge": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + 
"harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task" + }, + "summary_tasks": { + "harness|arc:challenge|25": { + "hashes": { + "hash_examples": "17b0cae357c0259e", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "61571bf68d6d89aa", + "hash_cont_tokens": "e8abf848493b50f7" + }, + "truncated": 0, + "non-truncated": 4687, + "padded": 4687, + "non-padded": 0, + "effective_few_shots": 25.0, + "num_truncated_few_shots": 0 + }, + "harness|hellaswag|10": { + "hashes": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "29906669b1c7054a", + "hash_cont_tokens": "9fe0a5c42e1532db" + }, + "truncated": 0, + "non-truncated": 40168, + "padded": 40113, + "non-padded": 55, + "effective_few_shots": 10.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hashes": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "c54ff61ad0273dd7", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-anatomy|5": { + "hashes": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "be31a1e22aef5f90", + "hash_cont_tokens": "f11971a765cb609f" + }, + "truncated": 0, + "non-truncated": 540, + "padded": 540, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-astronomy|5": { + "hashes": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "277a7b1fad566940", + "hash_cont_tokens": "440a970fadecdc7b" + }, + "truncated": 0, + "non-truncated": 608, + "padded": 608, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-business_ethics|5": { + "hashes": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "ba552605bc116de5", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hashes": { + "hash_examples": "f3366dbe7eefffa4", + 
"hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "428c7563d0b98ab9", + "hash_cont_tokens": "7ecd60c25b9bfe5b" + }, + "truncated": 0, + "non-truncated": 1060, + "padded": 1060, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_biology|5": { + "hashes": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "da036601573942e2", + "hash_cont_tokens": "875cde3af7a0ee14" + }, + "truncated": 0, + "non-truncated": 576, + "padded": 576, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_chemistry|5": { + "hashes": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "94e0196d6aded13d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_computer_science|5": { + "hashes": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "6e4d0f4a8d36690b", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_mathematics|5": { + "hashes": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "614054d17109a25d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_medicine|5": { + "hashes": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "1d633b3cc0524ba8", + "hash_cont_tokens": "702fb6d82ff0d6ac" + }, + "truncated": 0, + "non-truncated": 692, + "padded": 692, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_physics|5": { + "hashes": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "5421d9a1af86cbd4", + "hash_cont_tokens": "f7b8097afc16a47c" + }, + "truncated": 0, + "non-truncated": 408, + "padded": 408, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-computer_security|5": { + "hashes": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "5e6b70ecb333cf18", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hashes": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "c2ef11a87264ceed", + "hash_cont_tokens": "aa0e8bc655f2f641" + }, + "truncated": 0, + "non-truncated": 940, + "padded": 940, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-econometrics|5": { + "hashes": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "ecaccd912a4c3978", + "hash_cont_tokens": "b1cc6e7e9fcd3827" + }, + "truncated": 0, + 
"non-truncated": 456, + "padded": 456, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hashes": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "1590c84291399be8", + "hash_cont_tokens": "2425a3f084a591ef" + }, + "truncated": 0, + "non-truncated": 580, + "padded": 580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hashes": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "3269597f715b0da1", + "hash_cont_tokens": "bd87bf0c060fd925" + }, + "truncated": 0, + "non-truncated": 1512, + "padded": 1512, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-formal_logic|5": { + "hashes": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "a2800d20f3ab8d7c", + "hash_cont_tokens": "eb8932890e0605db" + }, + "truncated": 0, + "non-truncated": 504, + "padded": 504, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-global_facts|5": { + "hashes": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "94ed44b3772505ad", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_biology|5": { + "hashes": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "24423acb928db768", + "hash_cont_tokens": "1ddcb86d28cde266" + }, + "truncated": 0, + "non-truncated": 1240, + "padded": 1240, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hashes": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "831ff35c474e5cef", + "hash_cont_tokens": "176c8dcff38c5f8f" + }, + "truncated": 0, + "non-truncated": 812, + "padded": 812, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hashes": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "8c34e0f2bda77358", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hashes": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "f1f73dd687da18d7", + "hash_cont_tokens": "674fc454bdc5ac93" + }, + "truncated": 660, + "non-truncated": 0, + "padded": 0, + "non-padded": 660, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_geography|5": { + "hashes": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "7c5547c7da5bc793", + "hash_cont_tokens": "03a5012b916274ea" + }, + "truncated": 0, + "non-truncated": 792, + "padded": 792, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 
0 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hashes": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "f62991cb6a496b05", + "hash_cont_tokens": "873d2aab226ba1d8" + }, + "truncated": 0, + "non-truncated": 772, + "padded": 772, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hashes": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "4cef2aff6e3d59ed", + "hash_cont_tokens": "c583432ad27fcfe0" + }, + "truncated": 0, + "non-truncated": 1560, + "padded": 1560, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hashes": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "6e2577ea4082ed2b", + "hash_cont_tokens": "d7907b61bcb8c123" + }, + "truncated": 0, + "non-truncated": 1080, + "padded": 1080, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hashes": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "c5fc9aeb1079c8e4", + "hash_cont_tokens": "f47f041de50333b9" + }, + "truncated": 0, + "non-truncated": 952, + "padded": 952, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_physics|5": { + "hashes": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "555fc385cffa84ca", + "hash_cont_tokens": "0d56317b3e5eedb5" + }, + "truncated": 0, + "non-truncated": 604, + "padded": 604, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hashes": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "febd23cbf9973b7f", + "hash_cont_tokens": "09ba1243e7390c0f" + }, + "truncated": 0, + "non-truncated": 2180, + "padded": 2180, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hashes": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "424b02981230ee83", + "hash_cont_tokens": "9cc29889c3d3f77d" + }, + "truncated": 0, + "non-truncated": 864, + "padded": 864, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hashes": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "50c9ff438c85a69e", + "hash_cont_tokens": "cdd0b3dc06d933e5" + }, + "truncated": 816, + "non-truncated": 0, + "padded": 0, + "non-padded": 816, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hashes": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "054824cc474caef5", + "hash_cont_tokens": "e02816433ff28daf" + }, + "truncated": 8, + "non-truncated": 940, + "padded": 940, + "non-padded": 8, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_aging|5": { + "hashes": { + 
"hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "541a75f071dcf579", + "hash_cont_tokens": "142a4a8a1138a214" + }, + "truncated": 0, + "non-truncated": 892, + "padded": 892, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_sexuality|5": { + "hashes": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "04269e5c5a257dd9", + "hash_cont_tokens": "bc54813e809b796d" + }, + "truncated": 0, + "non-truncated": 524, + "padded": 524, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-international_law|5": { + "hashes": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "d93ba9d9d38e4397", + "hash_cont_tokens": "8ea8c5ff76a15bca" + }, + "truncated": 0, + "non-truncated": 484, + "padded": 484, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-jurisprudence|5": { + "hashes": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "9eeaccd2698b4f5a", + "hash_cont_tokens": "e3a8cd951b6e3469" + }, + "truncated": 0, + "non-truncated": 432, + "padded": 432, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hashes": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "b4f08f544f2b7576", + "hash_cont_tokens": "3e9e0bdc248fd88a" + }, + "truncated": 0, + "non-truncated": 652, + "padded": 648, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-machine_learning|5": { + "hashes": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "900c2a51f1174b9f", + "hash_cont_tokens": "55b12fb138c6a064" + }, + "truncated": 0, + "non-truncated": 448, + "padded": 448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-management|5": { + "hashes": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "6b36efb4689c6eca", + "hash_cont_tokens": "a01d6d39a83c4597" + }, + "truncated": 0, + "non-truncated": 412, + "padded": 412, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-marketing|5": { + "hashes": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "2aaac78a0cfed47a", + "hash_cont_tokens": "6aeaed4d823c98aa" + }, + "truncated": 0, + "non-truncated": 936, + "padded": 936, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-medical_genetics|5": { + "hashes": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "886ca823b41c094a", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-miscellaneous|5": { + "hashes": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "72fd71de7675e7d0", + "hash_cont_tokens": "9b0ab02a64603081" + }, + 
"truncated": 0, + "non-truncated": 3132, + "padded": 3132, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_disputes|5": { + "hashes": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "f3ca0dd8e7a1eb09", + "hash_cont_tokens": "3b8bbe9108e55ce9" + }, + "truncated": 0, + "non-truncated": 1384, + "padded": 1354, + "non-padded": 30, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hashes": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "3e793631e951f23c", + "hash_cont_tokens": "3e9bfc0362e97330" + }, + "truncated": 0, + "non-truncated": 3580, + "padded": 3580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-nutrition|5": { + "hashes": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "59753c2144ea93af", + "hash_cont_tokens": "23b2dc6ee2da4cfc" + }, + "truncated": 0, + "non-truncated": 1224, + "padded": 1224, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-philosophy|5": { + "hashes": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "bd8d3dbed15a8c34", + "hash_cont_tokens": "9f6ff69d23a48783" + }, + "truncated": 0, + "non-truncated": 1244, + "padded": 1244, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-prehistory|5": { + "hashes": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "3573cd87facbb7c5", + "hash_cont_tokens": "d6458d743d875837" + }, + "truncated": 0, + "non-truncated": 1296, + "padded": 1296, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_accounting|5": { + "hashes": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "17e721bc1a7cbb47", + "hash_cont_tokens": "922a195f53a35662" + }, + "truncated": 0, + "non-truncated": 1128, + "padded": 1128, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_law|5": { + "hashes": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "9178e10bd0763ec4", + "hash_cont_tokens": "2e590029ef41fbcd" + }, + "truncated": 604, + "non-truncated": 5532, + "padded": 5524, + "non-padded": 612, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_medicine|5": { + "hashes": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "f5a22012a54f70ea", + "hash_cont_tokens": "7cfee54dbddd5a98" + }, + "truncated": 0, + "non-truncated": 1088, + "padded": 1088, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_psychology|5": { + "hashes": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "0dfb73a8eb3f692c", + "hash_cont_tokens": "a86677b2a45c20e1" + }, + "truncated": 0, + "non-truncated": 2448, + "padded": 2448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + 
}, + "harness|hendrycksTest-public_relations|5": { + "hashes": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "1710c6ba4c9f3cbd", + "hash_cont_tokens": "0d756ccaae031757" + }, + "truncated": 0, + "non-truncated": 440, + "padded": 440, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-security_studies|5": { + "hashes": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "d49711415961ced7", + "hash_cont_tokens": "b2229bc2cfbf594b" + }, + "truncated": 0, + "non-truncated": 980, + "padded": 980, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-sociology|5": { + "hashes": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "828999f7624cbe7e", + "hash_cont_tokens": "c3a3bdfd177eed5b" + }, + "truncated": 0, + "non-truncated": 804, + "padded": 804, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hashes": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "42054621e718dbee", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-virology|5": { + "hashes": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "6c4f0aa4dc859c04", + "hash_cont_tokens": "af8b3658088cb37f" + }, + "truncated": 0, + "non-truncated": 664, + "padded": 664, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-world_religions|5": { + "hashes": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "6c75d44e092ff24f", + "hash_cont_tokens": "060118bef6de4e0a" + }, + "truncated": 0, + "non-truncated": 684, + "padded": 684, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|truthfulqa:mc|0": { + "hashes": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "2738d7ed7075faa7", + "hash_cont_tokens": "f5da56a132aab151" + }, + "truncated": 0, + "non-truncated": 9996, + "padded": 9996, + "non-padded": 0, + "effective_few_shots": 0.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "d84d18e9a963753d", + "hash_full_prompts": "12b540783521a8e6", + "hash_input_tokens": "6fecf578c508db6a", + "hash_cont_tokens": "71d56183130fecbd" + }, + "total_evaluation_time_secondes": "2806.996089220047", + "truncated": 2088, + "non-truncated": 108931, + "padded": 108834, + "non-padded": 2185, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/mncai/Llama2-7B-guanaco-dolphin-500/results_2023-10-25T23-43-24.108245.json b/eval-results/mncai/Llama2-7B-guanaco-dolphin-500/results_2023-10-25T23-43-24.108245.json new file mode 100644 index 0000000000000000000000000000000000000000..6b3329a5fd31386df5e751764c7efd7f6d0c279a --- /dev/null +++ b/eval-results/mncai/Llama2-7B-guanaco-dolphin-500/results_2023-10-25T23-43-24.108245.json @@ -0,0 +1,107 @@ +{ + "config_general": { + "model_name": "mncai/Llama2-7B-guanaco-dolphin-500", + 
"model_sha": "afe00170f084f773e401ba7d738d692533cca6b4", + "model_size": "12.58 GB", + "model_dtype": "torch.float16", + "lighteval_sha": "0f318ecf002208468154899217b3ba7c6ae09374", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|drop|3": { + "em": 0.005243288590604027, + "em_stderr": 0.0007396052260777966, + "f1": 0.06350776006711419, + "f1_stderr": 0.0014962736738025012 + }, + "harness|gsm8k|5": { + "acc": 0.05989385898407885, + "acc_stderr": 0.006536148151288703 + }, + "harness|winogrande|5": { + "acc": 0.7426992896606156, + "acc_stderr": 0.012285989618865704 + }, + "all": { + "em": 0.005243288590604027, + "em_stderr": 0.0007396052260777966, + "f1": 0.06350776006711419, + "f1_stderr": 0.0014962736738025012, + "acc": 0.4012965743223472, + "acc_stderr": 0.009411068885077204 + } + }, + "versions": { + "harness|drop|3": 1, + "harness|gsm8k|5": 0, + "harness|winogrande|5": 0, + "all": 0 + }, + "config_tasks": { + "harness|drop": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|drop|3": { + "hashes": { + "hash_examples": "1d27416e8324e9a3", + "hash_full_prompts": "a5513ff9a741b385", + "hash_input_tokens": "61b608e0b5ceed76", + "hash_cont_tokens": "cba3678433314ba8" + }, + "truncated": 1263, + "non-truncated": 8273, + "padded": 0, + "non-padded": 9536, + "effective_few_shots": 3.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "bda342e47b5099b2", + "hash_cont_tokens": "552bd331af286b1b" + }, + "truncated": 0, + "non-truncated": 1319, + "padded": 0, + "non-padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "c0bedf98cb040854", + "hash_cont_tokens": "f08975ad6f2d5864" + }, + "truncated": 0, + "non-truncated": 2534, + "padded": 2432, + "non-padded": 102, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "9b4d8993161e637d", + "hash_full_prompts": "08215e527b7e60a5", + "hash_input_tokens": "80afe720f936f8d2", + "hash_cont_tokens": "52d908d145672853" + }, + "total_evaluation_time_secondes": "9260.790706157684", + "truncated": 1263, + "non-truncated": 12126, + "padded": 2432, + "non-padded": 10957, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/mncai/Mistral-7B-OpenOrca-1k/results_2023-10-10T11-19-13.410150.json b/eval-results/mncai/Mistral-7B-OpenOrca-1k/results_2023-10-10T11-19-13.410150.json new file mode 100644 index 0000000000000000000000000000000000000000..ddecd1ceed9b8e033985faa36586255540180b08 --- /dev/null +++ b/eval-results/mncai/Mistral-7B-OpenOrca-1k/results_2023-10-10T11-19-13.410150.json @@ -0,0 +1,1367 @@ +{ + "config_general": { + "model_name": "mncai/Mistral-7B-OpenOrca-1k", + "model_sha": "ae9e37811a54ffe45f41a572c7e68363aa11b062", + "model_size": "13.99 GB", + "model_dtype": "torch.bfloat16", + "lighteval_sha": "0f318ecf002208468154899217b3ba7c6ae09374", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|arc:challenge|25": { + "acc": 0.5930034129692833, + "acc_stderr": 
0.014356399418009126, + "acc_norm": 0.6296928327645052, + "acc_norm_stderr": 0.01411129875167495 + }, + "harness|hellaswag|10": { + "acc": 0.6537542322246565, + "acc_stderr": 0.004748003276466209, + "acc_norm": 0.8466440948018323, + "acc_norm_stderr": 0.003595938124166216 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.34, + "acc_stderr": 0.04760952285695236, + "acc_norm": 0.34, + "acc_norm_stderr": 0.04760952285695236 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.6074074074074074, + "acc_stderr": 0.0421850621536888, + "acc_norm": 0.6074074074074074, + "acc_norm_stderr": 0.0421850621536888 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.6513157894736842, + "acc_stderr": 0.038781398887976104, + "acc_norm": 0.6513157894736842, + "acc_norm_stderr": 0.038781398887976104 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.54, + "acc_stderr": 0.05009082659620332, + "acc_norm": 0.54, + "acc_norm_stderr": 0.05009082659620332 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.7169811320754716, + "acc_stderr": 0.027724236492700918, + "acc_norm": 0.7169811320754716, + "acc_norm_stderr": 0.027724236492700918 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.7222222222222222, + "acc_stderr": 0.037455547914624555, + "acc_norm": 0.7222222222222222, + "acc_norm_stderr": 0.037455547914624555 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.49, + "acc_stderr": 0.05024183937956912, + "acc_norm": 0.49, + "acc_norm_stderr": 0.05024183937956912 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.5, + "acc_stderr": 0.050251890762960605, + "acc_norm": 0.5, + "acc_norm_stderr": 0.050251890762960605 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.33, + "acc_stderr": 0.04725815626252604, + "acc_norm": 0.33, + "acc_norm_stderr": 0.04725815626252604 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.653179190751445, + "acc_stderr": 0.036291466701596636, + "acc_norm": 0.653179190751445, + "acc_norm_stderr": 0.036291466701596636 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.38235294117647056, + "acc_stderr": 0.04835503696107223, + "acc_norm": 0.38235294117647056, + "acc_norm_stderr": 0.04835503696107223 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.77, + "acc_stderr": 0.042295258468165065, + "acc_norm": 0.77, + "acc_norm_stderr": 0.042295258468165065 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.5702127659574469, + "acc_stderr": 0.03236214467715564, + "acc_norm": 0.5702127659574469, + "acc_norm_stderr": 0.03236214467715564 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.5087719298245614, + "acc_stderr": 0.04702880432049615, + "acc_norm": 0.5087719298245614, + "acc_norm_stderr": 0.04702880432049615 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.5379310344827586, + "acc_stderr": 0.04154659671707548, + "acc_norm": 0.5379310344827586, + "acc_norm_stderr": 0.04154659671707548 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.38095238095238093, + "acc_stderr": 0.025010749116137595, + "acc_norm": 0.38095238095238093, + "acc_norm_stderr": 0.025010749116137595 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.4126984126984127, + "acc_stderr": 0.04403438954768176, + "acc_norm": 0.4126984126984127, + "acc_norm_stderr": 0.04403438954768176 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.36, + "acc_stderr": 0.04824181513244218, + "acc_norm": 0.36, + 
"acc_norm_stderr": 0.04824181513244218 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.7451612903225806, + "acc_stderr": 0.024790118459332208, + "acc_norm": 0.7451612903225806, + "acc_norm_stderr": 0.024790118459332208 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.4876847290640394, + "acc_stderr": 0.035169204442208966, + "acc_norm": 0.4876847290640394, + "acc_norm_stderr": 0.035169204442208966 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.67, + "acc_stderr": 0.047258156262526066, + "acc_norm": 0.67, + "acc_norm_stderr": 0.047258156262526066 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.7272727272727273, + "acc_stderr": 0.0347769116216366, + "acc_norm": 0.7272727272727273, + "acc_norm_stderr": 0.0347769116216366 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.7929292929292929, + "acc_stderr": 0.02886977846026705, + "acc_norm": 0.7929292929292929, + "acc_norm_stderr": 0.02886977846026705 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.8652849740932642, + "acc_stderr": 0.02463978909770944, + "acc_norm": 0.8652849740932642, + "acc_norm_stderr": 0.02463978909770944 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.6564102564102564, + "acc_stderr": 0.024078696580635484, + "acc_norm": 0.6564102564102564, + "acc_norm_stderr": 0.024078696580635484 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.35555555555555557, + "acc_stderr": 0.02918571494985741, + "acc_norm": 0.35555555555555557, + "acc_norm_stderr": 0.02918571494985741 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.6680672268907563, + "acc_stderr": 0.03058869701378364, + "acc_norm": 0.6680672268907563, + "acc_norm_stderr": 0.03058869701378364 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.31125827814569534, + "acc_stderr": 0.03780445850526732, + "acc_norm": 0.31125827814569534, + "acc_norm_stderr": 0.03780445850526732 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.8256880733944955, + "acc_stderr": 0.01626567563201036, + "acc_norm": 0.8256880733944955, + "acc_norm_stderr": 0.01626567563201036 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.4444444444444444, + "acc_stderr": 0.03388857118502326, + "acc_norm": 0.4444444444444444, + "acc_norm_stderr": 0.03388857118502326 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.803921568627451, + "acc_stderr": 0.027865942286639318, + "acc_norm": 0.803921568627451, + "acc_norm_stderr": 0.027865942286639318 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.7848101265822784, + "acc_stderr": 0.026750826994676177, + "acc_norm": 0.7848101265822784, + "acc_norm_stderr": 0.026750826994676177 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.6681614349775785, + "acc_stderr": 0.03160295143776679, + "acc_norm": 0.6681614349775785, + "acc_norm_stderr": 0.03160295143776679 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.7557251908396947, + "acc_stderr": 0.03768335959728744, + "acc_norm": 0.7557251908396947, + "acc_norm_stderr": 0.03768335959728744 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.7520661157024794, + "acc_stderr": 0.03941897526516303, + "acc_norm": 0.7520661157024794, + "acc_norm_stderr": 0.03941897526516303 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.7870370370370371, + "acc_stderr": 0.039578354719809784, + 
"acc_norm": 0.7870370370370371, + "acc_norm_stderr": 0.039578354719809784 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.7730061349693251, + "acc_stderr": 0.03291099578615769, + "acc_norm": 0.7730061349693251, + "acc_norm_stderr": 0.03291099578615769 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.35714285714285715, + "acc_stderr": 0.04547960999764376, + "acc_norm": 0.35714285714285715, + "acc_norm_stderr": 0.04547960999764376 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.7378640776699029, + "acc_stderr": 0.04354631077260595, + "acc_norm": 0.7378640776699029, + "acc_norm_stderr": 0.04354631077260595 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.8717948717948718, + "acc_stderr": 0.021901905115073325, + "acc_norm": 0.8717948717948718, + "acc_norm_stderr": 0.021901905115073325 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.69, + "acc_stderr": 0.04648231987117316, + "acc_norm": 0.69, + "acc_norm_stderr": 0.04648231987117316 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.8109833971902938, + "acc_stderr": 0.014000791294407006, + "acc_norm": 0.8109833971902938, + "acc_norm_stderr": 0.014000791294407006 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.6647398843930635, + "acc_stderr": 0.02541600377316555, + "acc_norm": 0.6647398843930635, + "acc_norm_stderr": 0.02541600377316555 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.2849162011173184, + "acc_stderr": 0.015096222302469799, + "acc_norm": 0.2849162011173184, + "acc_norm_stderr": 0.015096222302469799 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.7254901960784313, + "acc_stderr": 0.025553169991826517, + "acc_norm": 0.7254901960784313, + "acc_norm_stderr": 0.025553169991826517 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.684887459807074, + "acc_stderr": 0.026385273703464492, + "acc_norm": 0.684887459807074, + "acc_norm_stderr": 0.026385273703464492 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.7253086419753086, + "acc_stderr": 0.024836057868294677, + "acc_norm": 0.7253086419753086, + "acc_norm_stderr": 0.024836057868294677 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.450354609929078, + "acc_stderr": 0.02968010556502904, + "acc_norm": 0.450354609929078, + "acc_norm_stderr": 0.02968010556502904 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.45045632333767927, + "acc_stderr": 0.012707390438502346, + "acc_norm": 0.45045632333767927, + "acc_norm_stderr": 0.012707390438502346 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.6911764705882353, + "acc_stderr": 0.02806499816704009, + "acc_norm": 0.6911764705882353, + "acc_norm_stderr": 0.02806499816704009 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.6388888888888888, + "acc_stderr": 0.01943177567703731, + "acc_norm": 0.6388888888888888, + "acc_norm_stderr": 0.01943177567703731 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.6272727272727273, + "acc_stderr": 0.04631381319425465, + "acc_norm": 0.6272727272727273, + "acc_norm_stderr": 0.04631381319425465 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.6857142857142857, + "acc_stderr": 0.02971932942241747, + "acc_norm": 0.6857142857142857, + "acc_norm_stderr": 0.02971932942241747 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.845771144278607, + "acc_stderr": 0.02553843336857833, + "acc_norm": 0.845771144278607, + "acc_norm_stderr": 0.02553843336857833 + }, + 
"harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.81, + "acc_stderr": 0.03942772444036623, + "acc_norm": 0.81, + "acc_norm_stderr": 0.03942772444036623 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.536144578313253, + "acc_stderr": 0.038823108508905954, + "acc_norm": 0.536144578313253, + "acc_norm_stderr": 0.038823108508905954 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.8187134502923976, + "acc_stderr": 0.029547741687640038, + "acc_norm": 0.8187134502923976, + "acc_norm_stderr": 0.029547741687640038 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.37821297429620565, + "mc1_stderr": 0.016976335907546866, + "mc2": 0.5296423544101955, + "mc2_stderr": 0.015339874902349726 + }, + "all": { + "acc": 0.6220828368688038, + "acc_stderr": 0.0333381086288331, + "acc_norm": 0.6259740111463021, + "acc_norm_stderr": 0.03331442785224609, + "mc1": 0.37821297429620565, + "mc1_stderr": 0.016976335907546866, + "mc2": 0.5296423544101955, + "mc2_stderr": 0.015339874902349726 + } + }, + "versions": { + "harness|arc:challenge|25": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + 
"harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "all": 0 + }, + "config_tasks": { + "harness|arc:challenge": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness 
task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task" + }, + "summary_tasks": { + "harness|arc:challenge|25": { + "hashes": { + "hash_examples": "17b0cae357c0259e", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "e43adcaa871b1364", + "hash_cont_tokens": "289aa98c400841d8" + }, + "truncated": 0, + "non-truncated": 4687, + "padded": 4684, + "non-padded": 3, + "effective_few_shots": 25.0, + "num_truncated_few_shots": 0 + }, + "harness|hellaswag|10": { + "hashes": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "08da6b3d0798f3e5", + "hash_cont_tokens": "ac460260c3e6efc9" + }, + "truncated": 0, + "non-truncated": 40168, + "padded": 40039, + "non-padded": 129, + "effective_few_shots": 10.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hashes": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "5e2b26eb9b4d08bf", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-anatomy|5": { + "hashes": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "d33cda9df28030eb", + "hash_cont_tokens": "a52a4f60d98cbe5c" + }, + "truncated": 0, + "non-truncated": 540, + "padded": 540, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-astronomy|5": { + "hashes": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "0dd50c500d64c57d", + "hash_cont_tokens": "10f7d8eeba97841d" + }, + "truncated": 0, + "non-truncated": 608, + "padded": 608, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-business_ethics|5": { + "hashes": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "40b524d0df3defc2", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hashes": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "1f87d12d677e0dfd", + "hash_cont_tokens": "edef9975ba9165b5" + }, + "truncated": 0, + "non-truncated": 1060, + "padded": 1056, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_biology|5": { + "hashes": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "dd6d69d8b13afbeb", 
+ "hash_cont_tokens": "0aa103ec6602280b" + }, + "truncated": 0, + "non-truncated": 576, + "padded": 572, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_chemistry|5": { + "hashes": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "d45f3c401a00e97e", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_computer_science|5": { + "hashes": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "c04f21d954ae67b2", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_mathematics|5": { + "hashes": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "e7de03b4e1a407d8", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_medicine|5": { + "hashes": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "9ce9516475f0b09c", + "hash_cont_tokens": "1979021dbc698754" + }, + "truncated": 0, + "non-truncated": 692, + "padded": 684, + "non-padded": 8, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_physics|5": { + "hashes": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "f749592a0d6c967d", + "hash_cont_tokens": "7cf7fe2bab00acbd" + }, + "truncated": 0, + "non-truncated": 408, + "padded": 404, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-computer_security|5": { + "hashes": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "1a6dccf2066f3598", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hashes": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "6ce98c8aec8e7514", + "hash_cont_tokens": "903f64eed2b0d217" + }, + "truncated": 0, + "non-truncated": 940, + "padded": 940, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-econometrics|5": { + "hashes": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "7794b03bf6b9bb11", + "hash_cont_tokens": "721ae6c5302c4bf2" + }, + "truncated": 0, + "non-truncated": 456, + "padded": 456, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hashes": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "e47ff85e05850517", + "hash_cont_tokens": "15a738960ed3e587" + }, + "truncated": 0, + "non-truncated": 580, + "padded": 580, + "non-padded": 0, + 
"effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hashes": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "2ce6901704311790", + "hash_cont_tokens": "c96470462fc71683" + }, + "truncated": 0, + "non-truncated": 1512, + "padded": 1512, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-formal_logic|5": { + "hashes": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "fa49c3faa72a3955", + "hash_cont_tokens": "0e1ce025c9d6ee7e" + }, + "truncated": 0, + "non-truncated": 504, + "padded": 504, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-global_facts|5": { + "hashes": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "38992a391c7040d5", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 396, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_biology|5": { + "hashes": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "4944fad6e0578120", + "hash_cont_tokens": "e34d57f7d3c4ca16" + }, + "truncated": 0, + "non-truncated": 1240, + "padded": 1240, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hashes": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "bec955dfccee0331", + "hash_cont_tokens": "e8482d44df4b3740" + }, + "truncated": 0, + "non-truncated": 812, + "padded": 796, + "non-padded": 16, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hashes": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "2ccfe020e0a8e824", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hashes": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "5e5e8bf3808e0ead", + "hash_cont_tokens": "d63e679a49418339" + }, + "truncated": 0, + "non-truncated": 660, + "padded": 656, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_geography|5": { + "hashes": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "6a624d76e1b40f9d", + "hash_cont_tokens": "d78483e286d06f1a" + }, + "truncated": 0, + "non-truncated": 792, + "padded": 792, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hashes": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "8340aed0285230f4", + "hash_cont_tokens": "691cdff71ff5fe57" + }, + "truncated": 0, + "non-truncated": 772, + "padded": 772, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + 
"harness|hendrycksTest-high_school_macroeconomics|5": { + "hashes": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "ca47137b1f3a769c", + "hash_cont_tokens": "d5ad4c5bdca967ad" + }, + "truncated": 0, + "non-truncated": 1560, + "padded": 1560, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hashes": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "c9d341ab62890f30", + "hash_cont_tokens": "8f631ca5687dd0d4" + }, + "truncated": 0, + "non-truncated": 1080, + "padded": 1080, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hashes": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "62573d06618ae7df", + "hash_cont_tokens": "7321048a28451473" + }, + "truncated": 0, + "non-truncated": 952, + "padded": 952, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_physics|5": { + "hashes": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "ddddcaae96263221", + "hash_cont_tokens": "bb137581f269861c" + }, + "truncated": 0, + "non-truncated": 604, + "padded": 604, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hashes": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "ef9c1ae343139fdd", + "hash_cont_tokens": "b455cab2675bd863" + }, + "truncated": 0, + "non-truncated": 2180, + "padded": 2161, + "non-padded": 19, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hashes": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "eb4abd87b0e863cc", + "hash_cont_tokens": "1b3196fec7e58037" + }, + "truncated": 0, + "non-truncated": 864, + "padded": 864, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hashes": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "63548c7fa9ba7a78", + "hash_cont_tokens": "a331dedc2aa01b3e" + }, + "truncated": 0, + "non-truncated": 816, + "padded": 816, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hashes": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "83c5da18bfa50812", + "hash_cont_tokens": "d0fbe030b8c8c2bf" + }, + "truncated": 0, + "non-truncated": 948, + "padded": 948, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_aging|5": { + "hashes": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "c93c778cb8c58a32", + "hash_cont_tokens": "1dd29c3755494850" + }, + "truncated": 0, + "non-truncated": 892, + "padded": 892, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_sexuality|5": { + "hashes": { + "hash_examples": "7acb8fdad97f88a6", + 
"hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "1daed91f54b42f7d", + "hash_cont_tokens": "c85573f663c10691" + }, + "truncated": 0, + "non-truncated": 524, + "padded": 524, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-international_law|5": { + "hashes": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "cfdae69f75ee8670", + "hash_cont_tokens": "d263804ba918154f" + }, + "truncated": 0, + "non-truncated": 484, + "padded": 484, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-jurisprudence|5": { + "hashes": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "173979adbb5ab44e", + "hash_cont_tokens": "581986691a84ece8" + }, + "truncated": 0, + "non-truncated": 432, + "padded": 432, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hashes": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "7b7d06271aff55ff", + "hash_cont_tokens": "55a858b28bbda458" + }, + "truncated": 0, + "non-truncated": 652, + "padded": 652, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-machine_learning|5": { + "hashes": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "ca062cfd7c7fddcb", + "hash_cont_tokens": "e99d3d3efd4ac7a3" + }, + "truncated": 0, + "non-truncated": 448, + "padded": 445, + "non-padded": 3, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-management|5": { + "hashes": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "fc47171ffb714da3", + "hash_cont_tokens": "13d9dc56bca34726" + }, + "truncated": 0, + "non-truncated": 412, + "padded": 412, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-marketing|5": { + "hashes": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "aa29e9d883670c8f", + "hash_cont_tokens": "2700ea26933916a2" + }, + "truncated": 0, + "non-truncated": 936, + "padded": 936, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-medical_genetics|5": { + "hashes": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "88ad044b653ecaa5", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-miscellaneous|5": { + "hashes": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "f9e7e01573277484", + "hash_cont_tokens": "7bf4341c79587250" + }, + "truncated": 0, + "non-truncated": 3132, + "padded": 3132, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_disputes|5": { + "hashes": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "03728b9e48594c28", + "hash_cont_tokens": "38a48e9de6976f00" + }, + "truncated": 0, + "non-truncated": 1384, + 
"padded": 1360, + "non-padded": 24, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hashes": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "04a903966514d177", + "hash_cont_tokens": "761c4dc187689d89" + }, + "truncated": 0, + "non-truncated": 3580, + "padded": 3580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-nutrition|5": { + "hashes": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "a2176d3ac6f01cf0", + "hash_cont_tokens": "65005bd7d6f6012a" + }, + "truncated": 0, + "non-truncated": 1224, + "padded": 1224, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-philosophy|5": { + "hashes": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "a96dc872948245a8", + "hash_cont_tokens": "0b47934fb6314dec" + }, + "truncated": 0, + "non-truncated": 1244, + "padded": 1244, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-prehistory|5": { + "hashes": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "e0b03637947e9efa", + "hash_cont_tokens": "3f20acd855ee0a29" + }, + "truncated": 0, + "non-truncated": 1296, + "padded": 1296, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_accounting|5": { + "hashes": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "0b4c6d0e49c47ab4", + "hash_cont_tokens": "8f122ba881355d4b" + }, + "truncated": 0, + "non-truncated": 1128, + "padded": 1128, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_law|5": { + "hashes": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "bcbdbbde22ec73e3", + "hash_cont_tokens": "90d5df417c4d3fd3" + }, + "truncated": 0, + "non-truncated": 6136, + "padded": 6136, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_medicine|5": { + "hashes": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "c54d753563114d45", + "hash_cont_tokens": "4a2d2988884f7f70" + }, + "truncated": 0, + "non-truncated": 1088, + "padded": 1088, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_psychology|5": { + "hashes": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "9e6e34f48034edc0", + "hash_cont_tokens": "e0a952cb8a9c81de" + }, + "truncated": 0, + "non-truncated": 2448, + "padded": 2448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-public_relations|5": { + "hashes": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "634feb3f97d1064d", + "hash_cont_tokens": "1fa77a8dff3922b8" + }, + "truncated": 0, + "non-truncated": 440, + "padded": 440, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + 
"harness|hendrycksTest-security_studies|5": { + "hashes": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "ca8497342e5b1d57", + "hash_cont_tokens": "81fc9cb3cbdd52db" + }, + "truncated": 0, + "non-truncated": 980, + "padded": 980, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-sociology|5": { + "hashes": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "ae361375c940a0fb", + "hash_cont_tokens": "2a0493252ed2cf43" + }, + "truncated": 0, + "non-truncated": 804, + "padded": 800, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hashes": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "e8bdf33cf82d89f5", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-virology|5": { + "hashes": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "32ce831e0ba2d2e2", + "hash_cont_tokens": "5ab892d003b00c98" + }, + "truncated": 0, + "non-truncated": 664, + "padded": 664, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-world_religions|5": { + "hashes": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "4ed9b68c5694211b", + "hash_cont_tokens": "15a5e5dbdfbb8568" + }, + "truncated": 0, + "non-truncated": 684, + "padded": 684, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|truthfulqa:mc|0": { + "hashes": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "a30fbd9af05d717a", + "hash_cont_tokens": "5a8d4bb398b1c3c0" + }, + "truncated": 0, + "non-truncated": 9996, + "padded": 9996, + "non-padded": 0, + "effective_few_shots": 0.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "d84d18e9a963753d", + "hash_full_prompts": "12b540783521a8e6", + "hash_input_tokens": "3d86ffeb7677bd9d", + "hash_cont_tokens": "35527140510ee91a" + }, + "total_evaluation_time_secondes": "4437.052177906036", + "truncated": 0, + "non-truncated": 111019, + "padded": 110793, + "non-padded": 226, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/mncai/Mistral-7B-OpenOrca-1k/results_2023-10-25T07-41-12.101153.json b/eval-results/mncai/Mistral-7B-OpenOrca-1k/results_2023-10-25T07-41-12.101153.json new file mode 100644 index 0000000000000000000000000000000000000000..dd632c7082fe920a4e4d731aba92530e4b7cfdb8 --- /dev/null +++ b/eval-results/mncai/Mistral-7B-OpenOrca-1k/results_2023-10-25T07-41-12.101153.json @@ -0,0 +1,107 @@ +{ + "config_general": { + "model_name": "mncai/Mistral-7B-OpenOrca-1k", + "model_sha": "ae9e37811a54ffe45f41a572c7e68363aa11b062", + "model_size": "13.99 GB", + "model_dtype": "torch.bfloat16", + "lighteval_sha": "0f318ecf002208468154899217b3ba7c6ae09374", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|drop|3": { + "em": 0.0053481543624161075, + "em_stderr": 0.0007469252903319289, + 
"f1": 0.09739828020134218, + "f1_stderr": 0.001857285751420582 + }, + "harness|gsm8k|5": { + "acc": 0.1197877179681577, + "acc_stderr": 0.008944213403553095 + }, + "harness|winogrande|5": { + "acc": 0.7861089187056038, + "acc_stderr": 0.011524466954090247 + }, + "all": { + "em": 0.0053481543624161075, + "em_stderr": 0.0007469252903319289, + "f1": 0.09739828020134218, + "f1_stderr": 0.001857285751420582, + "acc": 0.45294831833688076, + "acc_stderr": 0.01023434017882167 + } + }, + "versions": { + "harness|drop|3": 1, + "harness|gsm8k|5": 0, + "harness|winogrande|5": 0, + "all": 0 + }, + "config_tasks": { + "harness|drop": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|drop|3": { + "hashes": { + "hash_examples": "1d27416e8324e9a3", + "hash_full_prompts": "a5513ff9a741b385", + "hash_input_tokens": "a4fb946366902edf", + "hash_cont_tokens": "243692b5f5e53377" + }, + "truncated": 0, + "non-truncated": 9536, + "padded": 0, + "non-padded": 9536, + "effective_few_shots": 3.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "6af0ae8cfe684f50", + "hash_cont_tokens": "ad6903da4301d4f2" + }, + "truncated": 0, + "non-truncated": 1319, + "padded": 0, + "non-padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "6bf335f26fed6442", + "hash_cont_tokens": "618558fb93c0f288" + }, + "truncated": 0, + "non-truncated": 2534, + "padded": 2397, + "non-padded": 137, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "9b4d8993161e637d", + "hash_full_prompts": "08215e527b7e60a5", + "hash_input_tokens": "7c6dc027ca91c1a8", + "hash_cont_tokens": "e6e4928d9f0e2d5b" + }, + "total_evaluation_time_secondes": "10496.110456705093", + "truncated": 0, + "non-truncated": 13389, + "padded": 2397, + "non-padded": 10992, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/mncai/Mistral-7B-openplatypus-1k/results_2023-10-10T11-26-36.133476.json b/eval-results/mncai/Mistral-7B-openplatypus-1k/results_2023-10-10T11-26-36.133476.json new file mode 100644 index 0000000000000000000000000000000000000000..ff5b3f05c4821482abf5836c3cac92f29b1c6cb7 --- /dev/null +++ b/eval-results/mncai/Mistral-7B-openplatypus-1k/results_2023-10-10T11-26-36.133476.json @@ -0,0 +1,1367 @@ +{ + "config_general": { + "model_name": "mncai/Mistral-7B-openplatypus-1k", + "model_sha": "dad401175da3782475a122008720ddc3338e2632", + "model_size": "13.99 GB", + "model_dtype": "torch.bfloat16", + "lighteval_sha": "0f318ecf002208468154899217b3ba7c6ae09374", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|arc:challenge|25": { + "acc": 0.5674061433447098, + "acc_stderr": 0.014478005694182528, + "acc_norm": 0.6015358361774744, + "acc_norm_stderr": 0.014306946052735565 + }, + "harness|hellaswag|10": { + "acc": 0.6487751443935471, + "acc_stderr": 0.004763774981834674, + "acc_norm": 0.8424616610237005, + "acc_norm_stderr": 0.0036356303524759065 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.29, + "acc_stderr": 0.04560480215720684, + "acc_norm": 0.29, + 
"acc_norm_stderr": 0.04560480215720684 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.5185185185185185, + "acc_stderr": 0.043163785995113245, + "acc_norm": 0.5185185185185185, + "acc_norm_stderr": 0.043163785995113245 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.6578947368421053, + "acc_stderr": 0.03860731599316092, + "acc_norm": 0.6578947368421053, + "acc_norm_stderr": 0.03860731599316092 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.57, + "acc_stderr": 0.049756985195624284, + "acc_norm": 0.57, + "acc_norm_stderr": 0.049756985195624284 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.6415094339622641, + "acc_stderr": 0.02951470358398177, + "acc_norm": 0.6415094339622641, + "acc_norm_stderr": 0.02951470358398177 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.6736111111111112, + "acc_stderr": 0.03921067198982266, + "acc_norm": 0.6736111111111112, + "acc_norm_stderr": 0.03921067198982266 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.41, + "acc_stderr": 0.04943110704237102, + "acc_norm": 0.41, + "acc_norm_stderr": 0.04943110704237102 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.51, + "acc_stderr": 0.05024183937956912, + "acc_norm": 0.51, + "acc_norm_stderr": 0.05024183937956912 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.42, + "acc_stderr": 0.049604496374885836, + "acc_norm": 0.42, + "acc_norm_stderr": 0.049604496374885836 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.5838150289017341, + "acc_stderr": 0.03758517775404948, + "acc_norm": 0.5838150289017341, + "acc_norm_stderr": 0.03758517775404948 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.2549019607843137, + "acc_stderr": 0.043364327079931785, + "acc_norm": 0.2549019607843137, + "acc_norm_stderr": 0.043364327079931785 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.73, + "acc_stderr": 0.0446196043338474, + "acc_norm": 0.73, + "acc_norm_stderr": 0.0446196043338474 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.4978723404255319, + "acc_stderr": 0.03268572658667492, + "acc_norm": 0.4978723404255319, + "acc_norm_stderr": 0.03268572658667492 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.47368421052631576, + "acc_stderr": 0.04697085136647863, + "acc_norm": 0.47368421052631576, + "acc_norm_stderr": 0.04697085136647863 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.5310344827586206, + "acc_stderr": 0.04158632762097828, + "acc_norm": 0.5310344827586206, + "acc_norm_stderr": 0.04158632762097828 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.42063492063492064, + "acc_stderr": 0.025424835086923992, + "acc_norm": 0.42063492063492064, + "acc_norm_stderr": 0.025424835086923992 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.4523809523809524, + "acc_stderr": 0.044518079590553275, + "acc_norm": 0.4523809523809524, + "acc_norm_stderr": 0.044518079590553275 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.32, + "acc_stderr": 0.04688261722621505, + "acc_norm": 0.32, + "acc_norm_stderr": 0.04688261722621505 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.7161290322580646, + "acc_stderr": 0.025649381063029265, + "acc_norm": 0.7161290322580646, + "acc_norm_stderr": 0.025649381063029265 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.5024630541871922, + "acc_stderr": 0.03517945038691063, + "acc_norm": 0.5024630541871922, + 
"acc_norm_stderr": 0.03517945038691063 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.62, + "acc_stderr": 0.04878317312145632, + "acc_norm": 0.62, + "acc_norm_stderr": 0.04878317312145632 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.6848484848484848, + "acc_stderr": 0.0362773057502241, + "acc_norm": 0.6848484848484848, + "acc_norm_stderr": 0.0362773057502241 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.7121212121212122, + "acc_stderr": 0.03225883512300993, + "acc_norm": 0.7121212121212122, + "acc_norm_stderr": 0.03225883512300993 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.8082901554404145, + "acc_stderr": 0.02840895362624528, + "acc_norm": 0.8082901554404145, + "acc_norm_stderr": 0.02840895362624528 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.5846153846153846, + "acc_stderr": 0.02498535492310234, + "acc_norm": 0.5846153846153846, + "acc_norm_stderr": 0.02498535492310234 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.3074074074074074, + "acc_stderr": 0.02813325257881564, + "acc_norm": 0.3074074074074074, + "acc_norm_stderr": 0.02813325257881564 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.5966386554621849, + "acc_stderr": 0.03186608121408832, + "acc_norm": 0.5966386554621849, + "acc_norm_stderr": 0.03186608121408832 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.2980132450331126, + "acc_stderr": 0.037345356767871984, + "acc_norm": 0.2980132450331126, + "acc_norm_stderr": 0.037345356767871984 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.7761467889908257, + "acc_stderr": 0.01787121776779022, + "acc_norm": 0.7761467889908257, + "acc_norm_stderr": 0.01787121776779022 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.36574074074074076, + "acc_stderr": 0.03284738857647207, + "acc_norm": 0.36574074074074076, + "acc_norm_stderr": 0.03284738857647207 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.7843137254901961, + "acc_stderr": 0.028867431449849313, + "acc_norm": 0.7843137254901961, + "acc_norm_stderr": 0.028867431449849313 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.7426160337552743, + "acc_stderr": 0.0284588209914603, + "acc_norm": 0.7426160337552743, + "acc_norm_stderr": 0.0284588209914603 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.6457399103139013, + "acc_stderr": 0.032100621541349864, + "acc_norm": 0.6457399103139013, + "acc_norm_stderr": 0.032100621541349864 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.6946564885496184, + "acc_stderr": 0.040393149787245605, + "acc_norm": 0.6946564885496184, + "acc_norm_stderr": 0.040393149787245605 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.768595041322314, + "acc_stderr": 0.0384985609879409, + "acc_norm": 0.768595041322314, + "acc_norm_stderr": 0.0384985609879409 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.7129629629629629, + "acc_stderr": 0.043733130409147614, + "acc_norm": 0.7129629629629629, + "acc_norm_stderr": 0.043733130409147614 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.754601226993865, + "acc_stderr": 0.03380939813943354, + "acc_norm": 0.754601226993865, + "acc_norm_stderr": 0.03380939813943354 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.4375, + "acc_stderr": 0.04708567521880525, + "acc_norm": 0.4375, + 
"acc_norm_stderr": 0.04708567521880525 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.8349514563106796, + "acc_stderr": 0.03675668832233188, + "acc_norm": 0.8349514563106796, + "acc_norm_stderr": 0.03675668832233188 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.8333333333333334, + "acc_stderr": 0.024414947304543674, + "acc_norm": 0.8333333333333334, + "acc_norm_stderr": 0.024414947304543674 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.67, + "acc_stderr": 0.047258156262526094, + "acc_norm": 0.67, + "acc_norm_stderr": 0.047258156262526094 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.7790549169859514, + "acc_stderr": 0.014836205167333569, + "acc_norm": 0.7790549169859514, + "acc_norm_stderr": 0.014836205167333569 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.6734104046242775, + "acc_stderr": 0.025248264774242836, + "acc_norm": 0.6734104046242775, + "acc_norm_stderr": 0.025248264774242836 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.37094972067039106, + "acc_stderr": 0.016155910721341767, + "acc_norm": 0.37094972067039106, + "acc_norm_stderr": 0.016155910721341767 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.673202614379085, + "acc_stderr": 0.026857294663281406, + "acc_norm": 0.673202614379085, + "acc_norm_stderr": 0.026857294663281406 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.6913183279742765, + "acc_stderr": 0.026236965881153266, + "acc_norm": 0.6913183279742765, + "acc_norm_stderr": 0.026236965881153266 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.6697530864197531, + "acc_stderr": 0.026168298456732846, + "acc_norm": 0.6697530864197531, + "acc_norm_stderr": 0.026168298456732846 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.425531914893617, + "acc_stderr": 0.029494827600144373, + "acc_norm": 0.425531914893617, + "acc_norm_stderr": 0.029494827600144373 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.4556714471968709, + "acc_stderr": 0.012719949543032197, + "acc_norm": 0.4556714471968709, + "acc_norm_stderr": 0.012719949543032197 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.6580882352941176, + "acc_stderr": 0.02881472242225418, + "acc_norm": 0.6580882352941176, + "acc_norm_stderr": 0.02881472242225418 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.6160130718954249, + "acc_stderr": 0.019675808135281508, + "acc_norm": 0.6160130718954249, + "acc_norm_stderr": 0.019675808135281508 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.6454545454545455, + "acc_stderr": 0.045820048415054174, + "acc_norm": 0.6454545454545455, + "acc_norm_stderr": 0.045820048415054174 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.6612244897959184, + "acc_stderr": 0.030299506562154185, + "acc_norm": 0.6612244897959184, + "acc_norm_stderr": 0.030299506562154185 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.8159203980099502, + "acc_stderr": 0.027403859410786848, + "acc_norm": 0.8159203980099502, + "acc_norm_stderr": 0.027403859410786848 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.81, + "acc_stderr": 0.03942772444036625, + "acc_norm": 0.81, + "acc_norm_stderr": 0.03942772444036625 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.572289156626506, + "acc_stderr": 0.038515976837185335, + "acc_norm": 0.572289156626506, + "acc_norm_stderr": 0.038515976837185335 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.783625730994152, + 
"acc_stderr": 0.031581495393387324, + "acc_norm": 0.783625730994152, + "acc_norm_stderr": 0.031581495393387324 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.3378212974296206, + "mc1_stderr": 0.01655716732251688, + "mc2": 0.498569630806063, + "mc2_stderr": 0.015133442762891728 + }, + "all": { + "acc": 0.5987327353548585, + "acc_stderr": 0.033868715674081146, + "acc_norm": 0.6025940270406707, + "acc_norm_stderr": 0.033846695262711564, + "mc1": 0.3378212974296206, + "mc1_stderr": 0.01655716732251688, + "mc2": 0.498569630806063, + "mc2_stderr": 0.015133442762891728 + } + }, + "versions": { + "harness|arc:challenge|25": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + 
"all": 0 + }, + "config_tasks": { + "harness|arc:challenge": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + 
"harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task" + }, + "summary_tasks": { + "harness|arc:challenge|25": { + "hashes": { + "hash_examples": "17b0cae357c0259e", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "e43adcaa871b1364", + "hash_cont_tokens": "289aa98c400841d8" + }, + "truncated": 0, + "non-truncated": 4687, + "padded": 4684, + "non-padded": 3, + "effective_few_shots": 25.0, + "num_truncated_few_shots": 0 + }, + "harness|hellaswag|10": { + "hashes": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "08da6b3d0798f3e5", + "hash_cont_tokens": "ac460260c3e6efc9" + }, + "truncated": 0, + "non-truncated": 40168, + "padded": 40039, + "non-padded": 129, + "effective_few_shots": 10.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hashes": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "5e2b26eb9b4d08bf", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-anatomy|5": { + "hashes": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "d33cda9df28030eb", + "hash_cont_tokens": "a52a4f60d98cbe5c" + }, + "truncated": 0, + "non-truncated": 540, + "padded": 540, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-astronomy|5": { + "hashes": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "0dd50c500d64c57d", + "hash_cont_tokens": "10f7d8eeba97841d" + }, + "truncated": 0, + "non-truncated": 608, + "padded": 608, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-business_ethics|5": { + "hashes": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "40b524d0df3defc2", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hashes": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "1f87d12d677e0dfd", + "hash_cont_tokens": "edef9975ba9165b5" + }, + "truncated": 0, + "non-truncated": 1060, + "padded": 1056, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_biology|5": { + "hashes": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "dd6d69d8b13afbeb", + "hash_cont_tokens": "0aa103ec6602280b" + }, + "truncated": 0, + "non-truncated": 576, + "padded": 572, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_chemistry|5": { + "hashes": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "d45f3c401a00e97e", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non-truncated": 
400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_computer_science|5": { + "hashes": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "c04f21d954ae67b2", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_mathematics|5": { + "hashes": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "e7de03b4e1a407d8", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_medicine|5": { + "hashes": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "9ce9516475f0b09c", + "hash_cont_tokens": "1979021dbc698754" + }, + "truncated": 0, + "non-truncated": 692, + "padded": 684, + "non-padded": 8, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_physics|5": { + "hashes": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "f749592a0d6c967d", + "hash_cont_tokens": "7cf7fe2bab00acbd" + }, + "truncated": 0, + "non-truncated": 408, + "padded": 404, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-computer_security|5": { + "hashes": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "1a6dccf2066f3598", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hashes": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "6ce98c8aec8e7514", + "hash_cont_tokens": "903f64eed2b0d217" + }, + "truncated": 0, + "non-truncated": 940, + "padded": 940, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-econometrics|5": { + "hashes": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "7794b03bf6b9bb11", + "hash_cont_tokens": "721ae6c5302c4bf2" + }, + "truncated": 0, + "non-truncated": 456, + "padded": 456, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hashes": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "e47ff85e05850517", + "hash_cont_tokens": "15a738960ed3e587" + }, + "truncated": 0, + "non-truncated": 580, + "padded": 580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hashes": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "2ce6901704311790", + "hash_cont_tokens": "c96470462fc71683" + }, + "truncated": 0, + "non-truncated": 1512, + "padded": 1512, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + 
"harness|hendrycksTest-formal_logic|5": { + "hashes": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "fa49c3faa72a3955", + "hash_cont_tokens": "0e1ce025c9d6ee7e" + }, + "truncated": 0, + "non-truncated": 504, + "padded": 504, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-global_facts|5": { + "hashes": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "38992a391c7040d5", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 396, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_biology|5": { + "hashes": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "4944fad6e0578120", + "hash_cont_tokens": "e34d57f7d3c4ca16" + }, + "truncated": 0, + "non-truncated": 1240, + "padded": 1240, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hashes": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "bec955dfccee0331", + "hash_cont_tokens": "e8482d44df4b3740" + }, + "truncated": 0, + "non-truncated": 812, + "padded": 796, + "non-padded": 16, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hashes": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "2ccfe020e0a8e824", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hashes": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "5e5e8bf3808e0ead", + "hash_cont_tokens": "d63e679a49418339" + }, + "truncated": 0, + "non-truncated": 660, + "padded": 656, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_geography|5": { + "hashes": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "6a624d76e1b40f9d", + "hash_cont_tokens": "d78483e286d06f1a" + }, + "truncated": 0, + "non-truncated": 792, + "padded": 792, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hashes": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "8340aed0285230f4", + "hash_cont_tokens": "691cdff71ff5fe57" + }, + "truncated": 0, + "non-truncated": 772, + "padded": 772, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hashes": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "ca47137b1f3a769c", + "hash_cont_tokens": "d5ad4c5bdca967ad" + }, + "truncated": 0, + "non-truncated": 1560, + "padded": 1560, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hashes": { + "hash_examples": 
"1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "c9d341ab62890f30", + "hash_cont_tokens": "8f631ca5687dd0d4" + }, + "truncated": 0, + "non-truncated": 1080, + "padded": 1080, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hashes": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "62573d06618ae7df", + "hash_cont_tokens": "7321048a28451473" + }, + "truncated": 0, + "non-truncated": 952, + "padded": 952, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_physics|5": { + "hashes": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "ddddcaae96263221", + "hash_cont_tokens": "bb137581f269861c" + }, + "truncated": 0, + "non-truncated": 604, + "padded": 604, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hashes": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "ef9c1ae343139fdd", + "hash_cont_tokens": "b455cab2675bd863" + }, + "truncated": 0, + "non-truncated": 2180, + "padded": 2161, + "non-padded": 19, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hashes": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "eb4abd87b0e863cc", + "hash_cont_tokens": "1b3196fec7e58037" + }, + "truncated": 0, + "non-truncated": 864, + "padded": 864, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hashes": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "63548c7fa9ba7a78", + "hash_cont_tokens": "a331dedc2aa01b3e" + }, + "truncated": 0, + "non-truncated": 816, + "padded": 816, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hashes": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "83c5da18bfa50812", + "hash_cont_tokens": "d0fbe030b8c8c2bf" + }, + "truncated": 0, + "non-truncated": 948, + "padded": 948, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_aging|5": { + "hashes": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "c93c778cb8c58a32", + "hash_cont_tokens": "1dd29c3755494850" + }, + "truncated": 0, + "non-truncated": 892, + "padded": 892, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_sexuality|5": { + "hashes": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "1daed91f54b42f7d", + "hash_cont_tokens": "c85573f663c10691" + }, + "truncated": 0, + "non-truncated": 524, + "padded": 524, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-international_law|5": { + "hashes": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "cfdae69f75ee8670", + 
"hash_cont_tokens": "d263804ba918154f" + }, + "truncated": 0, + "non-truncated": 484, + "padded": 484, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-jurisprudence|5": { + "hashes": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "173979adbb5ab44e", + "hash_cont_tokens": "581986691a84ece8" + }, + "truncated": 0, + "non-truncated": 432, + "padded": 432, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hashes": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "7b7d06271aff55ff", + "hash_cont_tokens": "55a858b28bbda458" + }, + "truncated": 0, + "non-truncated": 652, + "padded": 652, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-machine_learning|5": { + "hashes": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "ca062cfd7c7fddcb", + "hash_cont_tokens": "e99d3d3efd4ac7a3" + }, + "truncated": 0, + "non-truncated": 448, + "padded": 445, + "non-padded": 3, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-management|5": { + "hashes": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "fc47171ffb714da3", + "hash_cont_tokens": "13d9dc56bca34726" + }, + "truncated": 0, + "non-truncated": 412, + "padded": 412, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-marketing|5": { + "hashes": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "aa29e9d883670c8f", + "hash_cont_tokens": "2700ea26933916a2" + }, + "truncated": 0, + "non-truncated": 936, + "padded": 936, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-medical_genetics|5": { + "hashes": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "88ad044b653ecaa5", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-miscellaneous|5": { + "hashes": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "f9e7e01573277484", + "hash_cont_tokens": "7bf4341c79587250" + }, + "truncated": 0, + "non-truncated": 3132, + "padded": 3132, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_disputes|5": { + "hashes": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "03728b9e48594c28", + "hash_cont_tokens": "38a48e9de6976f00" + }, + "truncated": 0, + "non-truncated": 1384, + "padded": 1360, + "non-padded": 24, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hashes": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "04a903966514d177", + "hash_cont_tokens": "761c4dc187689d89" + }, + "truncated": 0, + "non-truncated": 3580, + "padded": 3580, + "non-padded": 0, + "effective_few_shots": 5.0, + 
"num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-nutrition|5": { + "hashes": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "a2176d3ac6f01cf0", + "hash_cont_tokens": "65005bd7d6f6012a" + }, + "truncated": 0, + "non-truncated": 1224, + "padded": 1224, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-philosophy|5": { + "hashes": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "a96dc872948245a8", + "hash_cont_tokens": "0b47934fb6314dec" + }, + "truncated": 0, + "non-truncated": 1244, + "padded": 1244, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-prehistory|5": { + "hashes": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "e0b03637947e9efa", + "hash_cont_tokens": "3f20acd855ee0a29" + }, + "truncated": 0, + "non-truncated": 1296, + "padded": 1296, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_accounting|5": { + "hashes": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "0b4c6d0e49c47ab4", + "hash_cont_tokens": "8f122ba881355d4b" + }, + "truncated": 0, + "non-truncated": 1128, + "padded": 1128, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_law|5": { + "hashes": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "bcbdbbde22ec73e3", + "hash_cont_tokens": "90d5df417c4d3fd3" + }, + "truncated": 0, + "non-truncated": 6136, + "padded": 6136, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_medicine|5": { + "hashes": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "c54d753563114d45", + "hash_cont_tokens": "4a2d2988884f7f70" + }, + "truncated": 0, + "non-truncated": 1088, + "padded": 1088, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_psychology|5": { + "hashes": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "9e6e34f48034edc0", + "hash_cont_tokens": "e0a952cb8a9c81de" + }, + "truncated": 0, + "non-truncated": 2448, + "padded": 2448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-public_relations|5": { + "hashes": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "634feb3f97d1064d", + "hash_cont_tokens": "1fa77a8dff3922b8" + }, + "truncated": 0, + "non-truncated": 440, + "padded": 440, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-security_studies|5": { + "hashes": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "ca8497342e5b1d57", + "hash_cont_tokens": "81fc9cb3cbdd52db" + }, + "truncated": 0, + "non-truncated": 980, + "padded": 980, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-sociology|5": { + "hashes": { + "hash_examples": "e7959df87dea8672", + 
"hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "ae361375c940a0fb", + "hash_cont_tokens": "2a0493252ed2cf43" + }, + "truncated": 0, + "non-truncated": 804, + "padded": 800, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hashes": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "e8bdf33cf82d89f5", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-virology|5": { + "hashes": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "32ce831e0ba2d2e2", + "hash_cont_tokens": "5ab892d003b00c98" + }, + "truncated": 0, + "non-truncated": 664, + "padded": 664, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-world_religions|5": { + "hashes": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "4ed9b68c5694211b", + "hash_cont_tokens": "15a5e5dbdfbb8568" + }, + "truncated": 0, + "non-truncated": 684, + "padded": 684, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|truthfulqa:mc|0": { + "hashes": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "a30fbd9af05d717a", + "hash_cont_tokens": "5a8d4bb398b1c3c0" + }, + "truncated": 0, + "non-truncated": 9996, + "padded": 9996, + "non-padded": 0, + "effective_few_shots": 0.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "d84d18e9a963753d", + "hash_full_prompts": "12b540783521a8e6", + "hash_input_tokens": "3d86ffeb7677bd9d", + "hash_cont_tokens": "35527140510ee91a" + }, + "total_evaluation_time_secondes": "4476.204344987869", + "truncated": 0, + "non-truncated": 111019, + "padded": 110793, + "non-padded": 226, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/mncai/Mistral-7B-openplatypus-1k/results_2023-10-27T04-31-44.728538.json b/eval-results/mncai/Mistral-7B-openplatypus-1k/results_2023-10-27T04-31-44.728538.json new file mode 100644 index 0000000000000000000000000000000000000000..93e262f5c58fe8b05fb4e88c2e774536019cf6a0 --- /dev/null +++ b/eval-results/mncai/Mistral-7B-openplatypus-1k/results_2023-10-27T04-31-44.728538.json @@ -0,0 +1,107 @@ +{ + "config_general": { + "model_name": "mncai/Mistral-7B-openplatypus-1k", + "model_sha": "2e007e67cd5945948190210d31420190edb29669", + "model_size": "13.99 GB", + "model_dtype": "torch.bfloat16", + "lighteval_sha": "0f318ecf002208468154899217b3ba7c6ae09374", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|drop|3": { + "em": 0.0019924496644295304, + "em_stderr": 0.00045666764626669425, + "f1": 0.06536912751677865, + "f1_stderr": 0.001427220169024926 + }, + "harness|gsm8k|5": { + "acc": 0.17437452615617893, + "acc_stderr": 0.010451421361976233 + }, + "harness|winogrande|5": { + "acc": 0.7687450670876085, + "acc_stderr": 0.01185004012485051 + }, + "all": { + "em": 0.0019924496644295304, + "em_stderr": 0.00045666764626669425, + "f1": 0.06536912751677865, + "f1_stderr": 0.001427220169024926, + "acc": 0.47155979662189373, + "acc_stderr": 
0.01115073074341337 + } + }, + "versions": { + "harness|drop|3": 1, + "harness|gsm8k|5": 0, + "harness|winogrande|5": 0, + "all": 0 + }, + "config_tasks": { + "harness|drop": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|drop|3": { + "hashes": { + "hash_examples": "1d27416e8324e9a3", + "hash_full_prompts": "a5513ff9a741b385", + "hash_input_tokens": "a4fb946366902edf", + "hash_cont_tokens": "382dad662851d094" + }, + "truncated": 0, + "non-truncated": 9536, + "padded": 0, + "non-padded": 9536, + "effective_few_shots": 3.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "6af0ae8cfe684f50", + "hash_cont_tokens": "bf055d1c8f15157c" + }, + "truncated": 0, + "non-truncated": 1319, + "padded": 0, + "non-padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "6bf335f26fed6442", + "hash_cont_tokens": "618558fb93c0f288" + }, + "truncated": 0, + "non-truncated": 2534, + "padded": 2397, + "non-padded": 137, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "9b4d8993161e637d", + "hash_full_prompts": "08215e527b7e60a5", + "hash_input_tokens": "7c6dc027ca91c1a8", + "hash_cont_tokens": "7221aa91fe7d2679" + }, + "total_evaluation_time_secondes": "10220.359910964966", + "truncated": 0, + "non-truncated": 13389, + "padded": 2397, + "non-padded": 10992, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/mncai/SGPT-1.3B-insurance-epoch10/results_2023-08-21T17-20-25.133054.json b/eval-results/mncai/SGPT-1.3B-insurance-epoch10/results_2023-08-21T17-20-25.133054.json new file mode 100644 index 0000000000000000000000000000000000000000..49699c879509c9912af5f94c7d786107d6b7ff89 --- /dev/null +++ b/eval-results/mncai/SGPT-1.3B-insurance-epoch10/results_2023-08-21T17-20-25.133054.json @@ -0,0 +1,1365 @@ +{ + "results": { + "harness|arc:challenge|25": { + "acc": 0.20051194539249148, + "acc_stderr": 0.011700318050499375, + "acc_norm": 0.24573378839590443, + "acc_norm_stderr": 0.012581033453730102 + }, + "harness|hellaswag|10": { + "acc": 0.2522405895239992, + "acc_stderr": 0.004334110169012359, + "acc_norm": 0.2424815773750249, + "acc_norm_stderr": 0.004277081150258468 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.32, + "acc_stderr": 0.046882617226215034, + "acc_norm": 0.32, + "acc_norm_stderr": 0.046882617226215034 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.32592592592592595, + "acc_stderr": 0.040491220417025055, + "acc_norm": 0.32592592592592595, + "acc_norm_stderr": 0.040491220417025055 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.23684210526315788, + "acc_stderr": 0.034597776068105365, + "acc_norm": 0.23684210526315788, + "acc_norm_stderr": 0.034597776068105365 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.26, + "acc_stderr": 0.04408440022768079, + "acc_norm": 0.26, + "acc_norm_stderr": 0.04408440022768079 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.22641509433962265, + "acc_stderr": 0.02575755989310674, + "acc_norm": 0.22641509433962265, + "acc_norm_stderr": 0.02575755989310674 + }, + "harness|hendrycksTest-college_biology|5": { + 
"acc": 0.25, + "acc_stderr": 0.03621034121889507, + "acc_norm": 0.25, + "acc_norm_stderr": 0.03621034121889507 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.16, + "acc_stderr": 0.036845294917747115, + "acc_norm": 0.16, + "acc_norm_stderr": 0.036845294917747115 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.22, + "acc_stderr": 0.0416333199893227, + "acc_norm": 0.22, + "acc_norm_stderr": 0.0416333199893227 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.26, + "acc_stderr": 0.044084400227680794, + "acc_norm": 0.26, + "acc_norm_stderr": 0.044084400227680794 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.26011560693641617, + "acc_stderr": 0.033450369167889925, + "acc_norm": 0.26011560693641617, + "acc_norm_stderr": 0.033450369167889925 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.24509803921568626, + "acc_stderr": 0.042801058373643966, + "acc_norm": 0.24509803921568626, + "acc_norm_stderr": 0.042801058373643966 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.3, + "acc_stderr": 0.046056618647183814, + "acc_norm": 0.3, + "acc_norm_stderr": 0.046056618647183814 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.3021276595744681, + "acc_stderr": 0.030017554471880557, + "acc_norm": 0.3021276595744681, + "acc_norm_stderr": 0.030017554471880557 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.24561403508771928, + "acc_stderr": 0.04049339297748143, + "acc_norm": 0.24561403508771928, + "acc_norm_stderr": 0.04049339297748143 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.2689655172413793, + "acc_stderr": 0.036951833116502325, + "acc_norm": 0.2689655172413793, + "acc_norm_stderr": 0.036951833116502325 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.2751322751322751, + "acc_stderr": 0.023000086859068635, + "acc_norm": 0.2751322751322751, + "acc_norm_stderr": 0.023000086859068635 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.19047619047619047, + "acc_stderr": 0.035122074123020534, + "acc_norm": 0.19047619047619047, + "acc_norm_stderr": 0.035122074123020534 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.32, + "acc_stderr": 0.046882617226215034, + "acc_norm": 0.32, + "acc_norm_stderr": 0.046882617226215034 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.25161290322580643, + "acc_stderr": 0.024685979286239956, + "acc_norm": 0.25161290322580643, + "acc_norm_stderr": 0.024685979286239956 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.2660098522167488, + "acc_stderr": 0.03108982600293752, + "acc_norm": 0.2660098522167488, + "acc_norm_stderr": 0.03108982600293752 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.3, + "acc_stderr": 0.046056618647183814, + "acc_norm": 0.3, + "acc_norm_stderr": 0.046056618647183814 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.2545454545454545, + "acc_stderr": 0.0340150671524904, + "acc_norm": 0.2545454545454545, + "acc_norm_stderr": 0.0340150671524904 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.24242424242424243, + "acc_stderr": 0.030532892233932026, + "acc_norm": 0.24242424242424243, + "acc_norm_stderr": 0.030532892233932026 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.19170984455958548, + "acc_stderr": 0.02840895362624528, + "acc_norm": 0.19170984455958548, + "acc_norm_stderr": 0.02840895362624528 + }, + 
"harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.23846153846153847, + "acc_stderr": 0.021606294494647727, + "acc_norm": 0.23846153846153847, + "acc_norm_stderr": 0.021606294494647727 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.2111111111111111, + "acc_stderr": 0.0248821168576551, + "acc_norm": 0.2111111111111111, + "acc_norm_stderr": 0.0248821168576551 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.21008403361344538, + "acc_stderr": 0.026461398717471874, + "acc_norm": 0.21008403361344538, + "acc_norm_stderr": 0.026461398717471874 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.23178807947019867, + "acc_stderr": 0.03445406271987054, + "acc_norm": 0.23178807947019867, + "acc_norm_stderr": 0.03445406271987054 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.22385321100917432, + "acc_stderr": 0.01787121776779022, + "acc_norm": 0.22385321100917432, + "acc_norm_stderr": 0.01787121776779022 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.19444444444444445, + "acc_stderr": 0.02699145450203673, + "acc_norm": 0.19444444444444445, + "acc_norm_stderr": 0.02699145450203673 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.3137254901960784, + "acc_stderr": 0.032566854844603886, + "acc_norm": 0.3137254901960784, + "acc_norm_stderr": 0.032566854844603886 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.270042194092827, + "acc_stderr": 0.028900721906293426, + "acc_norm": 0.270042194092827, + "acc_norm_stderr": 0.028900721906293426 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.21524663677130046, + "acc_stderr": 0.027584066602208263, + "acc_norm": 0.21524663677130046, + "acc_norm_stderr": 0.027584066602208263 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.22900763358778625, + "acc_stderr": 0.036853466317118506, + "acc_norm": 0.22900763358778625, + "acc_norm_stderr": 0.036853466317118506 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.371900826446281, + "acc_stderr": 0.044120158066245044, + "acc_norm": 0.371900826446281, + "acc_norm_stderr": 0.044120158066245044 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.24074074074074073, + "acc_stderr": 0.041331194402438376, + "acc_norm": 0.24074074074074073, + "acc_norm_stderr": 0.041331194402438376 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.3067484662576687, + "acc_stderr": 0.036230899157241474, + "acc_norm": 0.3067484662576687, + "acc_norm_stderr": 0.036230899157241474 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.25, + "acc_stderr": 0.04109974682633932, + "acc_norm": 0.25, + "acc_norm_stderr": 0.04109974682633932 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.20388349514563106, + "acc_stderr": 0.039891398595317706, + "acc_norm": 0.20388349514563106, + "acc_norm_stderr": 0.039891398595317706 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.24786324786324787, + "acc_stderr": 0.028286324075564407, + "acc_norm": 0.24786324786324787, + "acc_norm_stderr": 0.028286324075564407 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.2, + "acc_stderr": 0.040201512610368445, + "acc_norm": 0.2, + "acc_norm_stderr": 0.040201512610368445 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.2541507024265645, + "acc_stderr": 0.015569254692045766, + "acc_norm": 0.2541507024265645, + "acc_norm_stderr": 0.015569254692045766 + }, + "harness|hendrycksTest-moral_disputes|5": { + 
"acc": 0.29190751445086704, + "acc_stderr": 0.024476994076247333, + "acc_norm": 0.29190751445086704, + "acc_norm_stderr": 0.024476994076247333 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.24692737430167597, + "acc_stderr": 0.014422292204808835, + "acc_norm": 0.24692737430167597, + "acc_norm_stderr": 0.014422292204808835 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.26143790849673204, + "acc_stderr": 0.025160998214292456, + "acc_norm": 0.26143790849673204, + "acc_norm_stderr": 0.025160998214292456 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.26366559485530544, + "acc_stderr": 0.02502553850053234, + "acc_norm": 0.26366559485530544, + "acc_norm_stderr": 0.02502553850053234 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.25925925925925924, + "acc_stderr": 0.02438366553103546, + "acc_norm": 0.25925925925925924, + "acc_norm_stderr": 0.02438366553103546 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.2695035460992908, + "acc_stderr": 0.026469036818590634, + "acc_norm": 0.2695035460992908, + "acc_norm_stderr": 0.026469036818590634 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.2757496740547588, + "acc_stderr": 0.011413813609160998, + "acc_norm": 0.2757496740547588, + "acc_norm_stderr": 0.011413813609160998 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.1801470588235294, + "acc_stderr": 0.02334516361654486, + "acc_norm": 0.1801470588235294, + "acc_norm_stderr": 0.02334516361654486 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.23202614379084968, + "acc_stderr": 0.017077373377857002, + "acc_norm": 0.23202614379084968, + "acc_norm_stderr": 0.017077373377857002 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.19090909090909092, + "acc_stderr": 0.03764425585984926, + "acc_norm": 0.19090909090909092, + "acc_norm_stderr": 0.03764425585984926 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.2530612244897959, + "acc_stderr": 0.02783302387139968, + "acc_norm": 0.2530612244897959, + "acc_norm_stderr": 0.02783302387139968 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.263681592039801, + "acc_stderr": 0.031157150869355547, + "acc_norm": 0.263681592039801, + "acc_norm_stderr": 0.031157150869355547 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.24, + "acc_stderr": 0.04292346959909283, + "acc_norm": 0.24, + "acc_norm_stderr": 0.04292346959909283 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.27710843373493976, + "acc_stderr": 0.034843315926805875, + "acc_norm": 0.27710843373493976, + "acc_norm_stderr": 0.034843315926805875 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.29239766081871343, + "acc_stderr": 0.034886477134579215, + "acc_norm": 0.29239766081871343, + "acc_norm_stderr": 0.034886477134579215 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.19706242350061198, + "mc1_stderr": 0.013925080734473747, + "mc2": 0.45237462811890433, + "mc2_stderr": 0.017068865808266467 + }, + "all": { + "acc": 0.25146781709904775, + "acc_stderr": 0.03156188157089177, + "acc_norm": 0.25206888202878397, + "acc_norm_stderr": 0.031575842357069346, + "mc1": 0.19706242350061198, + "mc1_stderr": 0.013925080734473747, + "mc2": 0.45237462811890433, + "mc2_stderr": 0.017068865808266467 + } + }, + "versions": { + "harness|arc:challenge|25": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + 
"harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "all": 0 + }, + "config_general": { + "model_name": "mncai/SGPT-1.3B-insurance-epoch10", + "model_sha": "df685c0bbf838f0627383c28f48e577ee901ba68", + "model_dtype": "torch.float16", + "lighteval_sha": "9a6ba3212080b87510982c30fdec55b87dcab0c7", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null + }, + "config_tasks": { + "harness|arc:challenge": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + 
"harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task" + }, + "summary_tasks": { + "harness|arc:challenge|25": { + "hashes": { + "hash_examples": "17b0cae357c0259e", + "hash_full_prompts": 
"045cbb916e5145c6", + "hash_input_tokens": "cb020b91924152a6", + "hash_cont_tokens": "ebdda2ac34d5a3a0" + }, + "truncated": 4399, + "non-truncated": 288, + "padded": 282, + "non-padded": 4405, + "effective_few_shots": 25.0, + "num_truncated_few_shots": 0 + }, + "harness|hellaswag|10": { + "hashes": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "e2bf5af641e9308d", + "hash_cont_tokens": "28f8fe1d24821ac4" + }, + "truncated": 10657, + "non-truncated": 29511, + "padded": 29419, + "non-padded": 10749, + "effective_few_shots": 10.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hashes": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "11f5319f8f254b62", + "hash_cont_tokens": "2d838316b5496f75" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-anatomy|5": { + "hashes": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "cfb4267923f6cfa5", + "hash_cont_tokens": "03b6e023b0027ffd" + }, + "truncated": 0, + "non-truncated": 540, + "padded": 540, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-astronomy|5": { + "hashes": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "7b2112b0ffbf5f73", + "hash_cont_tokens": "47a0ecd10d2dd1f4" + }, + "truncated": 0, + "non-truncated": 608, + "padded": 608, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-business_ethics|5": { + "hashes": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "67d27a7724d4bb72", + "hash_cont_tokens": "2b104c9f1da53463" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hashes": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "a29e3e94383e551e", + "hash_cont_tokens": "7b62c7a12791ee13" + }, + "truncated": 0, + "non-truncated": 1060, + "padded": 1060, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_biology|5": { + "hashes": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "31b99e27703230cf", + "hash_cont_tokens": "cf9317c10f469063" + }, + "truncated": 0, + "non-truncated": 576, + "padded": 576, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_chemistry|5": { + "hashes": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "4ac0de24bf4a8920", + "hash_cont_tokens": "e98de8e4786e8db4" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_computer_science|5": { + "hashes": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "b1a18b1a9b09fc0b", + "hash_cont_tokens": "14f0076a2f12deff" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 
400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_mathematics|5": { + "hashes": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "fd38f797de3746ac", + "hash_cont_tokens": "f36cda54151c51b1" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_medicine|5": { + "hashes": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "6d7477df6fad4b0f", + "hash_cont_tokens": "46173e103e560a08" + }, + "truncated": 20, + "non-truncated": 672, + "padded": 672, + "non-padded": 20, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_physics|5": { + "hashes": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "8c44ba656dfdebaa", + "hash_cont_tokens": "d27cc7e2ecf3a806" + }, + "truncated": 0, + "non-truncated": 408, + "padded": 408, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-computer_security|5": { + "hashes": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "aca017a88eb776d3", + "hash_cont_tokens": "2d838316b5496f75" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hashes": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "362665a75b191bf6", + "hash_cont_tokens": "1b6cd556c4cca959" + }, + "truncated": 0, + "non-truncated": 940, + "padded": 940, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-econometrics|5": { + "hashes": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "7efaeb09d42b23b0", + "hash_cont_tokens": "746f93fd04d89589" + }, + "truncated": 0, + "non-truncated": 456, + "padded": 456, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hashes": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "338d63d33c4a54d2", + "hash_cont_tokens": "ad974db5cadd6803" + }, + "truncated": 0, + "non-truncated": 580, + "padded": 580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hashes": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "8770a1217549d8ae", + "hash_cont_tokens": "b9bdc64e58f72d44" + }, + "truncated": 0, + "non-truncated": 1512, + "padded": 1512, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-formal_logic|5": { + "hashes": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "24de846399570a87", + "hash_cont_tokens": "8b4faab97299590a" + }, + "truncated": 0, + "non-truncated": 504, + "padded": 504, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-global_facts|5": { + "hashes": { 
+ "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "263704d8d1bcd18f", + "hash_cont_tokens": "2d838316b5496f75" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_biology|5": { + "hashes": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "392484e24b844e23", + "hash_cont_tokens": "e75cb579abbe95b6" + }, + "truncated": 0, + "non-truncated": 1240, + "padded": 1240, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hashes": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "d84d0a2d53b4d739", + "hash_cont_tokens": "bf8b3b24e663757f" + }, + "truncated": 0, + "non-truncated": 812, + "padded": 812, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hashes": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "79c4f29f8dc6bb44", + "hash_cont_tokens": "700be125c98c51d4" + }, + "truncated": 12, + "non-truncated": 388, + "padded": 376, + "non-padded": 24, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hashes": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "273207fb98b67dd3", + "hash_cont_tokens": "1817a3a2ad93ac9c" + }, + "truncated": 660, + "non-truncated": 0, + "padded": 0, + "non-padded": 660, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_geography|5": { + "hashes": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "e86fd69baea15490", + "hash_cont_tokens": "62348cc87544f9aa" + }, + "truncated": 0, + "non-truncated": 792, + "padded": 792, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hashes": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "7e5edf73e3872881", + "hash_cont_tokens": "9fff224b511a8983" + }, + "truncated": 0, + "non-truncated": 772, + "padded": 772, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hashes": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "6785f9b7a41a5bdf", + "hash_cont_tokens": "d0b62937e27fa9d9" + }, + "truncated": 0, + "non-truncated": 1560, + "padded": 1552, + "non-padded": 8, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hashes": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "e7c01aad86cb5018", + "hash_cont_tokens": "8ecc8bff07192f31" + }, + "truncated": 0, + "non-truncated": 1080, + "padded": 1077, + "non-padded": 3, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hashes": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": 
"bfc393381298609e", + "hash_input_tokens": "2b85a62473e8dad2", + "hash_cont_tokens": "29e040f09fd2f206" + }, + "truncated": 0, + "non-truncated": 952, + "padded": 952, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_physics|5": { + "hashes": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "01ecd15e718f3e5c", + "hash_cont_tokens": "74d0ef5eac8e43b5" + }, + "truncated": 0, + "non-truncated": 604, + "padded": 604, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hashes": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "63bce7aa7045e103", + "hash_cont_tokens": "a13b93642ded4742" + }, + "truncated": 0, + "non-truncated": 2180, + "padded": 2180, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hashes": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "b1594fbcad079b0c", + "hash_cont_tokens": "0e76fcb429e96333" + }, + "truncated": 4, + "non-truncated": 860, + "padded": 860, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hashes": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "d400fe57b0fa652a", + "hash_cont_tokens": "a705802328b941dd" + }, + "truncated": 816, + "non-truncated": 0, + "padded": 0, + "non-padded": 816, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hashes": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "a47591130b5b5192", + "hash_cont_tokens": "051962b81fe779d9" + }, + "truncated": 948, + "non-truncated": 0, + "padded": 0, + "non-padded": 948, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_aging|5": { + "hashes": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "6c60d65faedc7ac1", + "hash_cont_tokens": "37584cb800dd1e79" + }, + "truncated": 0, + "non-truncated": 892, + "padded": 892, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_sexuality|5": { + "hashes": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "75a1f988cacb1b6d", + "hash_cont_tokens": "b1b18c67b469c0f4" + }, + "truncated": 0, + "non-truncated": 524, + "padded": 524, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-international_law|5": { + "hashes": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "285a899f669098dc", + "hash_cont_tokens": "a72c1063849483c7" + }, + "truncated": 0, + "non-truncated": 484, + "padded": 484, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-jurisprudence|5": { + "hashes": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "ee2a1c654667bdd8", + "hash_cont_tokens": "fec1891e0b732f42" + }, + "truncated": 0, + 
"non-truncated": 432, + "padded": 432, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hashes": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "c2d81b3800a86326", + "hash_cont_tokens": "72d9ae0fbb9720f4" + }, + "truncated": 0, + "non-truncated": 652, + "padded": 652, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-machine_learning|5": { + "hashes": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "3a81d91902a95f19", + "hash_cont_tokens": "82e5b84cae8faab5" + }, + "truncated": 0, + "non-truncated": 448, + "padded": 448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-management|5": { + "hashes": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "a8f5e215171afe50", + "hash_cont_tokens": "d922956e1f7ed87b" + }, + "truncated": 0, + "non-truncated": 412, + "padded": 412, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-marketing|5": { + "hashes": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "e1103172ae76e027", + "hash_cont_tokens": "a30534152683f6b6" + }, + "truncated": 0, + "non-truncated": 936, + "padded": 936, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-medical_genetics|5": { + "hashes": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "57b5dc68c47c1408", + "hash_cont_tokens": "2d838316b5496f75" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-miscellaneous|5": { + "hashes": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "ec93172d1f7fca13", + "hash_cont_tokens": "c1faa00069a0b508" + }, + "truncated": 0, + "non-truncated": 3132, + "padded": 3132, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_disputes|5": { + "hashes": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "1766c3f82713467f", + "hash_cont_tokens": "29ca0e92206d9e95" + }, + "truncated": 0, + "non-truncated": 1384, + "padded": 1384, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hashes": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "36e6f818f913e89a", + "hash_cont_tokens": "dc2024b91a43ee8c" + }, + "truncated": 0, + "non-truncated": 3580, + "padded": 3580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-nutrition|5": { + "hashes": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "2ad18d43ce0ffea5", + "hash_cont_tokens": "e02835cc3a53de41" + }, + "truncated": 0, + "non-truncated": 1224, + "padded": 1224, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-philosophy|5": { + "hashes": 
{ + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "abd47a273c796e62", + "hash_cont_tokens": "c005cd0728161b45" + }, + "truncated": 0, + "non-truncated": 1244, + "padded": 1244, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-prehistory|5": { + "hashes": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "7ea43c35a5c57ac8", + "hash_cont_tokens": "a4bbb108ec8c7ba4" + }, + "truncated": 0, + "non-truncated": 1296, + "padded": 1296, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_accounting|5": { + "hashes": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "3ea7dd853c9b6a6a", + "hash_cont_tokens": "441db209caf6d11b" + }, + "truncated": 0, + "non-truncated": 1128, + "padded": 1128, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_law|5": { + "hashes": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "c024c0ed7d19feb7", + "hash_cont_tokens": "d74b838ca21b863e" + }, + "truncated": 6136, + "non-truncated": 0, + "padded": 0, + "non-padded": 6136, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_medicine|5": { + "hashes": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "8fd6791406cf69a4", + "hash_cont_tokens": "22c007b162c5aa1e" + }, + "truncated": 1088, + "non-truncated": 0, + "padded": 0, + "non-padded": 1088, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_psychology|5": { + "hashes": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "01f24520426676a0", + "hash_cont_tokens": "6bf3aadfdbd28d14" + }, + "truncated": 0, + "non-truncated": 2448, + "padded": 2448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-public_relations|5": { + "hashes": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "8cfc7b9736623624", + "hash_cont_tokens": "72b03db9b167a7c8" + }, + "truncated": 0, + "non-truncated": 440, + "padded": 440, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-security_studies|5": { + "hashes": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "c0429ed6708a82bc", + "hash_cont_tokens": "647429c0cae969cf" + }, + "truncated": 980, + "non-truncated": 0, + "padded": 0, + "non-padded": 980, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-sociology|5": { + "hashes": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "d48cccd603f1bc31", + "hash_cont_tokens": "7b11214bd7f77495" + }, + "truncated": 0, + "non-truncated": 804, + "padded": 804, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hashes": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "793009a98e6a2d2d", + 
"hash_cont_tokens": "2d838316b5496f75" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-virology|5": { + "hashes": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "7567624f9031768b", + "hash_cont_tokens": "aff5e8c86bf7848c" + }, + "truncated": 0, + "non-truncated": 664, + "padded": 664, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-world_religions|5": { + "hashes": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "81632d5a65d83664", + "hash_cont_tokens": "29fbcccf98fe46db" + }, + "truncated": 0, + "non-truncated": 684, + "padded": 684, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|truthfulqa:mc|0": { + "hashes": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "be7dd018f8781d5a", + "hash_cont_tokens": "791fb7c318cbb933" + }, + "truncated": 0, + "non-truncated": 9996, + "padded": 9996, + "non-padded": 0, + "effective_few_shots": 0.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "d84d18e9a963753d", + "hash_full_prompts": "12b540783521a8e6", + "hash_input_tokens": "581cd629b221096f", + "hash_cont_tokens": "7c089524581c10c5" + }, + "total_evaluation_time_secondes": "2170.707494735718", + "truncated": 25720, + "non-truncated": 85299, + "padded": 85178, + "non-padded": 25841, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/mncai/SGPT-1.3B-insurance-epoch10/results_2023-09-18T00-09-04.877490.json b/eval-results/mncai/SGPT-1.3B-insurance-epoch10/results_2023-09-18T00-09-04.877490.json new file mode 100644 index 0000000000000000000000000000000000000000..d285ac64dc2cef06c2fbe7286e8ca338007c5bb3 --- /dev/null +++ b/eval-results/mncai/SGPT-1.3B-insurance-epoch10/results_2023-09-18T00-09-04.877490.json @@ -0,0 +1,107 @@ +{ + "config_general": { + "model_name": "mncai/SGPT-1.3B-insurance-epoch10", + "model_sha": "df685c0bbf838f0627383c28f48e577ee901ba68", + "model_size": "2.49 GB", + "model_dtype": "torch.float16", + "lighteval_sha": "0f318ecf002208468154899217b3ba7c6ae09374", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|drop|3": { + "em": 0.0, + "em_stderr": 0.0, + "f1": 1.99244966442953e-05, + "f1_stderr": 5.6438034448796525e-06 + }, + "harness|gsm8k|5": { + "acc": 0.0, + "acc_stderr": 0.0 + }, + "harness|winogrande|5": { + "acc": 0.5090765588003157, + "acc_stderr": 0.014050170094497704 + }, + "all": { + "em": 0.0, + "em_stderr": 0.0, + "f1": 1.99244966442953e-05, + "f1_stderr": 5.6438034448796525e-06, + "acc": 0.25453827940015783, + "acc_stderr": 0.007025085047248852 + } + }, + "versions": { + "harness|drop|3": 1, + "harness|gsm8k|5": 0, + "harness|winogrande|5": 0, + "all": 0 + }, + "config_tasks": { + "harness|drop": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|drop|3": { + "hashes": { + "hash_examples": "1d27416e8324e9a3", + "hash_full_prompts": "a5513ff9a741b385", + "hash_input_tokens": "f71c324c94d253a2", + "hash_cont_tokens": "ab62fd5755d531d0" + }, + "truncated": 8744, + "non-truncated": 792, + 
"padded": 0, + "non-padded": 9536, + "effective_few_shots": 3.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "1dc88369329dde73", + "hash_cont_tokens": "b6dbe1ec9e176915" + }, + "truncated": 265, + "non-truncated": 1054, + "padded": 0, + "non-padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "85f2f85218c99daa", + "hash_cont_tokens": "9288492b2287791e" + }, + "truncated": 0, + "non-truncated": 2534, + "padded": 2460, + "non-padded": 74, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "9b4d8993161e637d", + "hash_full_prompts": "08215e527b7e60a5", + "hash_input_tokens": "54ee9ea5c29b8168", + "hash_cont_tokens": "0f6d55f1b0c8cf96" + }, + "total_evaluation_time_secondes": "18667.261464118958", + "truncated": 9009, + "non-truncated": 4380, + "padded": 2460, + "non-padded": 10929, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/mncai/agiin-11.1B-v0.0/results_2023-12-16T15-20-44.774696.json b/eval-results/mncai/agiin-11.1B-v0.0/results_2023-12-16T15-20-44.774696.json new file mode 100644 index 0000000000000000000000000000000000000000..d4ef613517bf8892b7a196f18279ca8cee14bc4f --- /dev/null +++ b/eval-results/mncai/agiin-11.1B-v0.0/results_2023-12-16T15-20-44.774696.json @@ -0,0 +1,1409 @@ +{ + "config_general": { + "lighteval_sha": "0e4607eff593f6f842aeaa0e5fa6760f58b9d1e9", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "", + "start_time": 360759.980349644, + "end_time": 371002.667952813, + "total_evaluation_time_secondes": "10242.687603168946", + "model_name": "mncai/agiin-11.1B-v0.0", + "model_sha": "0b086b46a672f450d7b2e8c307526e62d8d0cfdf", + "model_dtype": "torch.float16", + "model_size": "20.9 GB" + }, + "results": { + "harness|arc:challenge|25": { + "acc": 0.6245733788395904, + "acc_stderr": 0.014150631435111726, + "acc_norm": 0.6732081911262798, + "acc_norm_stderr": 0.013706665975587333 + }, + "harness|hellaswag|10": { + "acc": 0.6907986456881099, + "acc_stderr": 0.004612198061600092, + "acc_norm": 0.8634734116709819, + "acc_norm_stderr": 0.003426451744507847 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.34, + "acc_stderr": 0.047609522856952365, + "acc_norm": 0.34, + "acc_norm_stderr": 0.047609522856952365 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.6074074074074074, + "acc_stderr": 0.0421850621536888, + "acc_norm": 0.6074074074074074, + "acc_norm_stderr": 0.0421850621536888 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.7039473684210527, + "acc_stderr": 0.03715062154998904, + "acc_norm": 0.7039473684210527, + "acc_norm_stderr": 0.03715062154998904 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.64, + "acc_stderr": 0.04824181513244218, + "acc_norm": 0.64, + "acc_norm_stderr": 0.04824181513244218 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.7094339622641509, + "acc_stderr": 0.027943219989337142, + "acc_norm": 0.7094339622641509, + "acc_norm_stderr": 0.027943219989337142 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.7638888888888888, + "acc_stderr": 0.03551446610810826, + "acc_norm": 0.7638888888888888, + 
"acc_norm_stderr": 0.03551446610810826 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.48, + "acc_stderr": 0.050211673156867795, + "acc_norm": 0.48, + "acc_norm_stderr": 0.050211673156867795 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.54, + "acc_stderr": 0.05009082659620333, + "acc_norm": 0.54, + "acc_norm_stderr": 0.05009082659620333 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.31, + "acc_stderr": 0.04648231987117316, + "acc_norm": 0.31, + "acc_norm_stderr": 0.04648231987117316 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.6473988439306358, + "acc_stderr": 0.036430371689585475, + "acc_norm": 0.6473988439306358, + "acc_norm_stderr": 0.036430371689585475 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.43137254901960786, + "acc_stderr": 0.04928099597287534, + "acc_norm": 0.43137254901960786, + "acc_norm_stderr": 0.04928099597287534 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.76, + "acc_stderr": 0.042923469599092816, + "acc_norm": 0.76, + "acc_norm_stderr": 0.042923469599092816 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.5659574468085107, + "acc_stderr": 0.03240038086792747, + "acc_norm": 0.5659574468085107, + "acc_norm_stderr": 0.03240038086792747 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.5087719298245614, + "acc_stderr": 0.04702880432049615, + "acc_norm": 0.5087719298245614, + "acc_norm_stderr": 0.04702880432049615 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.6, + "acc_stderr": 0.040824829046386284, + "acc_norm": 0.6, + "acc_norm_stderr": 0.040824829046386284 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.43386243386243384, + "acc_stderr": 0.025525034382474894, + "acc_norm": 0.43386243386243384, + "acc_norm_stderr": 0.025525034382474894 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.4444444444444444, + "acc_stderr": 0.04444444444444449, + "acc_norm": 0.4444444444444444, + "acc_norm_stderr": 0.04444444444444449 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.38, + "acc_stderr": 0.04878317312145633, + "acc_norm": 0.38, + "acc_norm_stderr": 0.04878317312145633 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.7806451612903226, + "acc_stderr": 0.023540799358723295, + "acc_norm": 0.7806451612903226, + "acc_norm_stderr": 0.023540799358723295 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.5073891625615764, + "acc_stderr": 0.035176035403610105, + "acc_norm": 0.5073891625615764, + "acc_norm_stderr": 0.035176035403610105 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.7, + "acc_stderr": 0.046056618647183814, + "acc_norm": 0.7, + "acc_norm_stderr": 0.046056618647183814 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.7757575757575758, + "acc_stderr": 0.03256866661681102, + "acc_norm": 0.7757575757575758, + "acc_norm_stderr": 0.03256866661681102 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.803030303030303, + "acc_stderr": 0.028335609732463362, + "acc_norm": 0.803030303030303, + "acc_norm_stderr": 0.028335609732463362 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.8860103626943006, + "acc_stderr": 0.022935144053919443, + "acc_norm": 0.8860103626943006, + "acc_norm_stderr": 0.022935144053919443 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.676923076923077, + "acc_stderr": 0.02371088850197057, + 
"acc_norm": 0.676923076923077, + "acc_norm_stderr": 0.02371088850197057 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.34814814814814815, + "acc_stderr": 0.029045600290616255, + "acc_norm": 0.34814814814814815, + "acc_norm_stderr": 0.029045600290616255 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.7184873949579832, + "acc_stderr": 0.029213549414372177, + "acc_norm": 0.7184873949579832, + "acc_norm_stderr": 0.029213549414372177 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.3443708609271523, + "acc_stderr": 0.038796870240733264, + "acc_norm": 0.3443708609271523, + "acc_norm_stderr": 0.038796870240733264 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.8440366972477065, + "acc_stderr": 0.015555802713590172, + "acc_norm": 0.8440366972477065, + "acc_norm_stderr": 0.015555802713590172 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.5555555555555556, + "acc_stderr": 0.03388857118502325, + "acc_norm": 0.5555555555555556, + "acc_norm_stderr": 0.03388857118502325 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.8431372549019608, + "acc_stderr": 0.025524722324553346, + "acc_norm": 0.8431372549019608, + "acc_norm_stderr": 0.025524722324553346 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.8143459915611815, + "acc_stderr": 0.025310495376944853, + "acc_norm": 0.8143459915611815, + "acc_norm_stderr": 0.025310495376944853 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.6995515695067265, + "acc_stderr": 0.030769352008229143, + "acc_norm": 0.6995515695067265, + "acc_norm_stderr": 0.030769352008229143 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.7862595419847328, + "acc_stderr": 0.0359546161177469, + "acc_norm": 0.7862595419847328, + "acc_norm_stderr": 0.0359546161177469 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.7520661157024794, + "acc_stderr": 0.03941897526516302, + "acc_norm": 0.7520661157024794, + "acc_norm_stderr": 0.03941897526516302 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.8148148148148148, + "acc_stderr": 0.03755265865037181, + "acc_norm": 0.8148148148148148, + "acc_norm_stderr": 0.03755265865037181 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.7791411042944786, + "acc_stderr": 0.03259177392742178, + "acc_norm": 0.7791411042944786, + "acc_norm_stderr": 0.03259177392742178 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.5089285714285714, + "acc_stderr": 0.04745033255489123, + "acc_norm": 0.5089285714285714, + "acc_norm_stderr": 0.04745033255489123 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.7378640776699029, + "acc_stderr": 0.043546310772605956, + "acc_norm": 0.7378640776699029, + "acc_norm_stderr": 0.043546310772605956 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.8589743589743589, + "acc_stderr": 0.022801382534597528, + "acc_norm": 0.8589743589743589, + "acc_norm_stderr": 0.022801382534597528 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.72, + "acc_stderr": 0.04512608598542128, + "acc_norm": 0.72, + "acc_norm_stderr": 0.04512608598542128 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.8212005108556832, + "acc_stderr": 0.013702643715368985, + "acc_norm": 0.8212005108556832, + "acc_norm_stderr": 0.013702643715368985 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.7196531791907514, + "acc_stderr": 0.024182427496577615, + "acc_norm": 0.7196531791907514, + 
"acc_norm_stderr": 0.024182427496577615 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.4011173184357542, + "acc_stderr": 0.016392221899407082, + "acc_norm": 0.4011173184357542, + "acc_norm_stderr": 0.016392221899407082 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.7222222222222222, + "acc_stderr": 0.025646863097137897, + "acc_norm": 0.7222222222222222, + "acc_norm_stderr": 0.025646863097137897 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.7234726688102894, + "acc_stderr": 0.02540383297817961, + "acc_norm": 0.7234726688102894, + "acc_norm_stderr": 0.02540383297817961 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.7407407407407407, + "acc_stderr": 0.024383665531035457, + "acc_norm": 0.7407407407407407, + "acc_norm_stderr": 0.024383665531035457 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.4787234042553192, + "acc_stderr": 0.029800481645628693, + "acc_norm": 0.4787234042553192, + "acc_norm_stderr": 0.029800481645628693 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.4641460234680574, + "acc_stderr": 0.012737361318730583, + "acc_norm": 0.4641460234680574, + "acc_norm_stderr": 0.012737361318730583 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.6727941176470589, + "acc_stderr": 0.02850145286039656, + "acc_norm": 0.6727941176470589, + "acc_norm_stderr": 0.02850145286039656 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.6683006535947712, + "acc_stderr": 0.019047485239360378, + "acc_norm": 0.6683006535947712, + "acc_norm_stderr": 0.019047485239360378 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.6909090909090909, + "acc_stderr": 0.044262946482000985, + "acc_norm": 0.6909090909090909, + "acc_norm_stderr": 0.044262946482000985 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.7061224489795919, + "acc_stderr": 0.02916273841024977, + "acc_norm": 0.7061224489795919, + "acc_norm_stderr": 0.02916273841024977 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.8557213930348259, + "acc_stderr": 0.024845753212306046, + "acc_norm": 0.8557213930348259, + "acc_norm_stderr": 0.024845753212306046 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.89, + "acc_stderr": 0.03144660377352203, + "acc_norm": 0.89, + "acc_norm_stderr": 0.03144660377352203 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.5301204819277109, + "acc_stderr": 0.03885425420866767, + "acc_norm": 0.5301204819277109, + "acc_norm_stderr": 0.03885425420866767 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.8362573099415205, + "acc_stderr": 0.028380919596145866, + "acc_norm": 0.8362573099415205, + "acc_norm_stderr": 0.028380919596145866 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.5104039167686658, + "mc1_stderr": 0.017499711430249268, + "mc2": 0.6767350158422528, + "mc2_stderr": 0.015433642831645542 + }, + "harness|winogrande|5": { + "acc": 0.7884767166535123, + "acc_stderr": 0.01147774768422318 + }, + "harness|gsm8k|5": { + "acc": 0.43442001516300227, + "acc_stderr": 0.013653507211411406 + }, + "all": { + "acc": 0.6488802179534787, + "acc_stderr": 0.0321079939080906, + "acc_norm": 0.6539001040951737, + "acc_norm_stderr": 0.03274282477493687, + "mc1": 0.5104039167686658, + "mc1_stderr": 0.017499711430249268, + "mc2": 0.6767350158422528, + "mc2_stderr": 0.015433642831645542 + } + }, + "versions": { + "all": 0, + "harness|arc:challenge|25": 0, + "harness|gsm8k|5": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, 
+ "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "harness|winogrande|5": 0 + }, + "config_tasks": { + "harness|arc:challenge": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + 
"harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|arc:challenge|25": { + "hashes": { + "hash_examples": "17b0cae357c0259e", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "9bcd0d1d37471713", + "hash_cont_tokens": "289aa98c400841d8" + }, + "truncated": 0, + "non_truncated": 1172, + "padded": 4670, 
+ "non_padded": 17, + "effective_few_shots": 25.0, + "num_truncated_few_shots": 0 + }, + "harness|hellaswag|10": { + "hashes": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "80b8c6d79740318e", + "hash_cont_tokens": "ac460260c3e6efc9" + }, + "truncated": 0, + "non_truncated": 10042, + "padded": 40101, + "non_padded": 67, + "effective_few_shots": 10.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hashes": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "b813d36287c6556c", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-anatomy|5": { + "hashes": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "09dc2380497f7a47", + "hash_cont_tokens": "a52a4f60d98cbe5c" + }, + "truncated": 0, + "non_truncated": 135, + "padded": 540, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-astronomy|5": { + "hashes": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "68ca3220b0fdd1f3", + "hash_cont_tokens": "10f7d8eeba97841d" + }, + "truncated": 0, + "non_truncated": 152, + "padded": 608, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-business_ethics|5": { + "hashes": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "bd14ef1320de241e", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hashes": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "d96186ab98017c43", + "hash_cont_tokens": "edef9975ba9165b5" + }, + "truncated": 0, + "non_truncated": 265, + "padded": 1060, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_biology|5": { + "hashes": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "424136b34e95b200", + "hash_cont_tokens": "0aa103ec6602280b" + }, + "truncated": 0, + "non_truncated": 144, + "padded": 576, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_chemistry|5": { + "hashes": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "8dd8b80e336bbe54", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_computer_science|5": { + "hashes": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "145d4cef8ca2261d", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_mathematics|5": { + "hashes": { + "hash_examples": 
"4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "561995d32d2b25c4", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_medicine|5": { + "hashes": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "6a258a9d4418599c", + "hash_cont_tokens": "1979021dbc698754" + }, + "truncated": 0, + "non_truncated": 173, + "padded": 692, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_physics|5": { + "hashes": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "fa5e0d5b5f97b66a", + "hash_cont_tokens": "7cf7fe2bab00acbd" + }, + "truncated": 0, + "non_truncated": 102, + "padded": 408, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-computer_security|5": { + "hashes": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "07d27397edfae492", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hashes": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "da5e6c3c8eb17da6", + "hash_cont_tokens": "903f64eed2b0d217" + }, + "truncated": 0, + "non_truncated": 235, + "padded": 940, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-econometrics|5": { + "hashes": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "f6ba8e358bdb523e", + "hash_cont_tokens": "721ae6c5302c4bf2" + }, + "truncated": 0, + "non_truncated": 114, + "padded": 456, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hashes": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "b2459da4c5ca8590", + "hash_cont_tokens": "15a738960ed3e587" + }, + "truncated": 0, + "non_truncated": 145, + "padded": 575, + "non_padded": 5, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hashes": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "0b969d9ad706a13a", + "hash_cont_tokens": "c96470462fc71683" + }, + "truncated": 0, + "non_truncated": 378, + "padded": 1512, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-formal_logic|5": { + "hashes": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "02bc3eb5f90da86e", + "hash_cont_tokens": "0e1ce025c9d6ee7e" + }, + "truncated": 0, + "non_truncated": 126, + "padded": 504, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-global_facts|5": { + "hashes": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "3d5106918bcbeb43", + "hash_cont_tokens": "17b868b63507f9a3" + }, + 
"truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_biology|5": { + "hashes": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "7b089392db2dabbd", + "hash_cont_tokens": "e34d57f7d3c4ca16" + }, + "truncated": 0, + "non_truncated": 310, + "padded": 1240, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hashes": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "ba90b2ffed1c067d", + "hash_cont_tokens": "e8482d44df4b3740" + }, + "truncated": 0, + "non_truncated": 203, + "padded": 812, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hashes": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "60eeec309ef0717f", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hashes": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "5e5e8bf3808e0ead", + "hash_cont_tokens": "d63e679a49418339" + }, + "truncated": 0, + "non_truncated": 165, + "padded": 656, + "non_padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_geography|5": { + "hashes": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "4da9b741d4e7ea78", + "hash_cont_tokens": "d78483e286d06f1a" + }, + "truncated": 0, + "non_truncated": 198, + "padded": 792, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hashes": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "acb4bc872ac86ed7", + "hash_cont_tokens": "691cdff71ff5fe57" + }, + "truncated": 0, + "non_truncated": 193, + "padded": 772, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hashes": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "840fc6403eb69ab0", + "hash_cont_tokens": "d5ad4c5bdca967ad" + }, + "truncated": 0, + "non_truncated": 390, + "padded": 1560, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hashes": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "3629a7f2cd17faeb", + "hash_cont_tokens": "8f631ca5687dd0d4" + }, + "truncated": 0, + "non_truncated": 270, + "padded": 1080, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hashes": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "6846f684260e3997", + "hash_cont_tokens": "7321048a28451473" + }, + "truncated": 0, + "non_truncated": 238, + "padded": 952, + "non_padded": 0, 
+ "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_physics|5": { + "hashes": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "85aee25d6bdad94a", + "hash_cont_tokens": "bb137581f269861c" + }, + "truncated": 0, + "non_truncated": 151, + "padded": 604, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hashes": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "290b66d6d666a35f", + "hash_cont_tokens": "b455cab2675bd863" + }, + "truncated": 0, + "non_truncated": 545, + "padded": 2180, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hashes": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "a77a7668b437bc82", + "hash_cont_tokens": "1b3196fec7e58037" + }, + "truncated": 0, + "non_truncated": 216, + "padded": 864, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hashes": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "63548c7fa9ba7a78", + "hash_cont_tokens": "a331dedc2aa01b3e" + }, + "truncated": 0, + "non_truncated": 204, + "padded": 816, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hashes": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "83c5da18bfa50812", + "hash_cont_tokens": "d0fbe030b8c8c2bf" + }, + "truncated": 0, + "non_truncated": 237, + "padded": 948, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_aging|5": { + "hashes": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "bebbd11f22006685", + "hash_cont_tokens": "1dd29c3755494850" + }, + "truncated": 0, + "non_truncated": 223, + "padded": 892, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_sexuality|5": { + "hashes": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "7b85ee9b8ee54f4f", + "hash_cont_tokens": "c85573f663c10691" + }, + "truncated": 0, + "non_truncated": 131, + "padded": 524, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-international_law|5": { + "hashes": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "7bfc55ab7065943e", + "hash_cont_tokens": "d263804ba918154f" + }, + "truncated": 0, + "non_truncated": 121, + "padded": 484, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-jurisprudence|5": { + "hashes": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "69573f1675e053c6", + "hash_cont_tokens": "581986691a84ece8" + }, + "truncated": 0, + "non_truncated": 108, + "padded": 432, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hashes": { + 
"hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "552324ef20094bdc", + "hash_cont_tokens": "55a858b28bbda458" + }, + "truncated": 0, + "non_truncated": 163, + "padded": 652, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-machine_learning|5": { + "hashes": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "96449357a7318905", + "hash_cont_tokens": "e99d3d3efd4ac7a3" + }, + "truncated": 0, + "non_truncated": 112, + "padded": 448, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-management|5": { + "hashes": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "3b849249168e3b88", + "hash_cont_tokens": "13d9dc56bca34726" + }, + "truncated": 0, + "non_truncated": 103, + "padded": 412, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-marketing|5": { + "hashes": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "af0e186f2756b70d", + "hash_cont_tokens": "2700ea26933916a2" + }, + "truncated": 0, + "non_truncated": 234, + "padded": 936, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-medical_genetics|5": { + "hashes": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "9f6a6de16509b6d9", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-miscellaneous|5": { + "hashes": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "9194406d589f7c10", + "hash_cont_tokens": "7bf4341c79587250" + }, + "truncated": 0, + "non_truncated": 783, + "padded": 3132, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_disputes|5": { + "hashes": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "769486efc74d9f8e", + "hash_cont_tokens": "38a48e9de6976f00" + }, + "truncated": 0, + "non_truncated": 346, + "padded": 1384, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hashes": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "a90fd4dd90959dad", + "hash_cont_tokens": "761c4dc187689d89" + }, + "truncated": 0, + "non_truncated": 895, + "padded": 3580, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-nutrition|5": { + "hashes": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "1a3b843e66efd29b", + "hash_cont_tokens": "65005bd7d6f6012a" + }, + "truncated": 0, + "non_truncated": 306, + "padded": 1224, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-philosophy|5": { + "hashes": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "09820001a3d00013", + "hash_cont_tokens": "0b47934fb6314dec" + }, + "truncated": 0, + 
"non_truncated": 311, + "padded": 1244, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-prehistory|5": { + "hashes": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "7c4ec364ce2768c7", + "hash_cont_tokens": "3f20acd855ee0a29" + }, + "truncated": 0, + "non_truncated": 324, + "padded": 1296, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_accounting|5": { + "hashes": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "ced0534574d0ae3f", + "hash_cont_tokens": "8f122ba881355d4b" + }, + "truncated": 0, + "non_truncated": 282, + "padded": 1128, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_law|5": { + "hashes": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "bcbdbbde22ec73e3", + "hash_cont_tokens": "90d5df417c4d3fd3" + }, + "truncated": 0, + "non_truncated": 1534, + "padded": 6136, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_medicine|5": { + "hashes": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "c54d753563114d45", + "hash_cont_tokens": "4a2d2988884f7f70" + }, + "truncated": 0, + "non_truncated": 272, + "padded": 1088, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_psychology|5": { + "hashes": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "b75dc55c0e32fa52", + "hash_cont_tokens": "e0a952cb8a9c81de" + }, + "truncated": 0, + "non_truncated": 612, + "padded": 2448, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-public_relations|5": { + "hashes": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "5ccdc8ec8db99622", + "hash_cont_tokens": "1fa77a8dff3922b8" + }, + "truncated": 0, + "non_truncated": 110, + "padded": 440, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-security_studies|5": { + "hashes": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "ca8497342e5b1d57", + "hash_cont_tokens": "81fc9cb3cbdd52db" + }, + "truncated": 0, + "non_truncated": 245, + "padded": 980, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-sociology|5": { + "hashes": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "069c76424fbd3dab", + "hash_cont_tokens": "2a0493252ed2cf43" + }, + "truncated": 0, + "non_truncated": 201, + "padded": 804, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hashes": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "a7e393a626169576", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + 
"harness|hendrycksTest-virology|5": { + "hashes": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "bf99dc973e3a650d", + "hash_cont_tokens": "5ab892d003b00c98" + }, + "truncated": 0, + "non_truncated": 166, + "padded": 664, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-world_religions|5": { + "hashes": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "1761cfaf21797065", + "hash_cont_tokens": "15a5e5dbdfbb8568" + }, + "truncated": 0, + "non_truncated": 171, + "padded": 684, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|truthfulqa:mc|0": { + "hashes": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "298b43914bbdf4ca", + "hash_cont_tokens": "5a8d4bb398b1c3c0" + }, + "truncated": 0, + "non_truncated": 817, + "padded": 9996, + "non_padded": 0, + "effective_few_shots": 0.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "31aa3477d959f771", + "hash_cont_tokens": "618558fb93c0f288" + }, + "truncated": 0, + "non_truncated": 1267, + "padded": 2534, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "6af0ae8cfe684f50", + "hash_cont_tokens": "35c2fbdc7b506002" + }, + "truncated": 0, + "non_truncated": 1319, + "padded": 0, + "non_padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "3b7fa57a057f9415", + "hash_full_prompts": "63615fc50fc9417c", + "hash_input_tokens": "9c04e828ae29cacc", + "hash_cont_tokens": "cebd432086a7eecc" + }, + "truncated": 0, + "non_truncated": 28659, + "padded": 113460, + "non_padded": 1412, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/mncai/agiin-13.6B-v0.0/results_2023-12-16T15-55-21.950393.json b/eval-results/mncai/agiin-13.6B-v0.0/results_2023-12-16T15-55-21.950393.json new file mode 100644 index 0000000000000000000000000000000000000000..e4193d572be819f0fe7511f15081df041ab740b5 --- /dev/null +++ b/eval-results/mncai/agiin-13.6B-v0.0/results_2023-12-16T15-55-21.950393.json @@ -0,0 +1,1409 @@ +{ + "config_general": { + "lighteval_sha": "0e4607eff593f6f842aeaa0e5fa6760f58b9d1e9", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "", + "start_time": 360875.042247903, + "end_time": 373073.830232347, + "total_evaluation_time_secondes": "12198.787984444003", + "model_name": "mncai/agiin-13.6B-v0.0", + "model_sha": "631e80949b055193053c802437f3a31fe4e1390d", + "model_dtype": "torch.float16", + "model_size": "25.8 GB" + }, + "results": { + "harness|arc:challenge|25": { + "acc": 0.659556313993174, + "acc_stderr": 0.013847460518892973, + "acc_norm": 0.6945392491467577, + "acc_norm_stderr": 0.013460080478002508 + }, + "harness|hellaswag|10": { + "acc": 0.6858195578570006, + "acc_stderr": 0.0046323996774908106, + "acc_norm": 0.8658633738299144, + "acc_norm_stderr": 0.0034010255178737237 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.35, + "acc_stderr": 0.047937248544110196, + 
"acc_norm": 0.35, + "acc_norm_stderr": 0.047937248544110196 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.562962962962963, + "acc_stderr": 0.042849586397534015, + "acc_norm": 0.562962962962963, + "acc_norm_stderr": 0.042849586397534015 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.625, + "acc_stderr": 0.039397364351956274, + "acc_norm": 0.625, + "acc_norm_stderr": 0.039397364351956274 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.57, + "acc_stderr": 0.049756985195624284, + "acc_norm": 0.57, + "acc_norm_stderr": 0.049756985195624284 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.6339622641509434, + "acc_stderr": 0.029647813539365245, + "acc_norm": 0.6339622641509434, + "acc_norm_stderr": 0.029647813539365245 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.6944444444444444, + "acc_stderr": 0.03852084696008534, + "acc_norm": 0.6944444444444444, + "acc_norm_stderr": 0.03852084696008534 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.44, + "acc_stderr": 0.04988876515698589, + "acc_norm": 0.44, + "acc_norm_stderr": 0.04988876515698589 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.51, + "acc_stderr": 0.05024183937956911, + "acc_norm": 0.51, + "acc_norm_stderr": 0.05024183937956911 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.32, + "acc_stderr": 0.04688261722621504, + "acc_norm": 0.32, + "acc_norm_stderr": 0.04688261722621504 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.6184971098265896, + "acc_stderr": 0.037038511930995215, + "acc_norm": 0.6184971098265896, + "acc_norm_stderr": 0.037038511930995215 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.4117647058823529, + "acc_stderr": 0.048971049527263666, + "acc_norm": 0.4117647058823529, + "acc_norm_stderr": 0.048971049527263666 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.68, + "acc_stderr": 0.04688261722621505, + "acc_norm": 0.68, + "acc_norm_stderr": 0.04688261722621505 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.5531914893617021, + "acc_stderr": 0.032500536843658404, + "acc_norm": 0.5531914893617021, + "acc_norm_stderr": 0.032500536843658404 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.4473684210526316, + "acc_stderr": 0.04677473004491199, + "acc_norm": 0.4473684210526316, + "acc_norm_stderr": 0.04677473004491199 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.6068965517241379, + "acc_stderr": 0.040703290137070705, + "acc_norm": 0.6068965517241379, + "acc_norm_stderr": 0.040703290137070705 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.41798941798941797, + "acc_stderr": 0.025402555503260912, + "acc_norm": 0.41798941798941797, + "acc_norm_stderr": 0.025402555503260912 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.42063492063492064, + "acc_stderr": 0.04415438226743744, + "acc_norm": 0.42063492063492064, + "acc_norm_stderr": 0.04415438226743744 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.37, + "acc_stderr": 0.048523658709391, + "acc_norm": 0.37, + "acc_norm_stderr": 0.048523658709391 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.7354838709677419, + "acc_stderr": 0.02509189237885928, + "acc_norm": 0.7354838709677419, + "acc_norm_stderr": 0.02509189237885928 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.4876847290640394, + "acc_stderr": 0.035169204442208966, + "acc_norm": 0.4876847290640394, + "acc_norm_stderr": 
0.035169204442208966 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.64, + "acc_stderr": 0.04824181513244218, + "acc_norm": 0.64, + "acc_norm_stderr": 0.04824181513244218 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.7878787878787878, + "acc_stderr": 0.03192271569548301, + "acc_norm": 0.7878787878787878, + "acc_norm_stderr": 0.03192271569548301 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.7676767676767676, + "acc_stderr": 0.030088629490217487, + "acc_norm": 0.7676767676767676, + "acc_norm_stderr": 0.030088629490217487 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.8393782383419689, + "acc_stderr": 0.026499057701397443, + "acc_norm": 0.8393782383419689, + "acc_norm_stderr": 0.026499057701397443 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.6512820512820513, + "acc_stderr": 0.02416278028401772, + "acc_norm": 0.6512820512820513, + "acc_norm_stderr": 0.02416278028401772 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.35185185185185186, + "acc_stderr": 0.02911661760608301, + "acc_norm": 0.35185185185185186, + "acc_norm_stderr": 0.02911661760608301 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.6386554621848739, + "acc_stderr": 0.03120469122515001, + "acc_norm": 0.6386554621848739, + "acc_norm_stderr": 0.03120469122515001 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.3443708609271523, + "acc_stderr": 0.03879687024073327, + "acc_norm": 0.3443708609271523, + "acc_norm_stderr": 0.03879687024073327 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.8201834862385321, + "acc_stderr": 0.01646534546739152, + "acc_norm": 0.8201834862385321, + "acc_norm_stderr": 0.01646534546739152 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.5787037037037037, + "acc_stderr": 0.033674621388960775, + "acc_norm": 0.5787037037037037, + "acc_norm_stderr": 0.033674621388960775 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.8088235294117647, + "acc_stderr": 0.027599174300640766, + "acc_norm": 0.8088235294117647, + "acc_norm_stderr": 0.027599174300640766 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.7890295358649789, + "acc_stderr": 0.02655837250266192, + "acc_norm": 0.7890295358649789, + "acc_norm_stderr": 0.02655837250266192 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.6816143497757847, + "acc_stderr": 0.03126580522513713, + "acc_norm": 0.6816143497757847, + "acc_norm_stderr": 0.03126580522513713 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.7022900763358778, + "acc_stderr": 0.040103589424622034, + "acc_norm": 0.7022900763358778, + "acc_norm_stderr": 0.040103589424622034 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.8099173553719008, + "acc_stderr": 0.03581796951709282, + "acc_norm": 0.8099173553719008, + "acc_norm_stderr": 0.03581796951709282 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.75, + "acc_stderr": 0.04186091791394607, + "acc_norm": 0.75, + "acc_norm_stderr": 0.04186091791394607 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.7423312883435583, + "acc_stderr": 0.03436150827846917, + "acc_norm": 0.7423312883435583, + "acc_norm_stderr": 0.03436150827846917 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.5, + "acc_stderr": 0.04745789978762494, + "acc_norm": 0.5, + "acc_norm_stderr": 0.04745789978762494 + }, + 
"harness|hendrycksTest-management|5": { + "acc": 0.7572815533980582, + "acc_stderr": 0.04245022486384495, + "acc_norm": 0.7572815533980582, + "acc_norm_stderr": 0.04245022486384495 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.8418803418803419, + "acc_stderr": 0.023902325549560417, + "acc_norm": 0.8418803418803419, + "acc_norm_stderr": 0.023902325549560417 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.64, + "acc_stderr": 0.04824181513244218, + "acc_norm": 0.64, + "acc_norm_stderr": 0.04824181513244218 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.7484035759897829, + "acc_stderr": 0.015517322365529633, + "acc_norm": 0.7484035759897829, + "acc_norm_stderr": 0.015517322365529633 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.6878612716763006, + "acc_stderr": 0.024946792225272314, + "acc_norm": 0.6878612716763006, + "acc_norm_stderr": 0.024946792225272314 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.4692737430167598, + "acc_stderr": 0.01669089616194438, + "acc_norm": 0.4692737430167598, + "acc_norm_stderr": 0.01669089616194438 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.6666666666666666, + "acc_stderr": 0.02699254433929724, + "acc_norm": 0.6666666666666666, + "acc_norm_stderr": 0.02699254433929724 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.6784565916398714, + "acc_stderr": 0.026527724079528872, + "acc_norm": 0.6784565916398714, + "acc_norm_stderr": 0.026527724079528872 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.6882716049382716, + "acc_stderr": 0.02577311116963045, + "acc_norm": 0.6882716049382716, + "acc_norm_stderr": 0.02577311116963045 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.44680851063829785, + "acc_stderr": 0.029658235097666904, + "acc_norm": 0.44680851063829785, + "acc_norm_stderr": 0.029658235097666904 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.47392438070404175, + "acc_stderr": 0.012752858346533133, + "acc_norm": 0.47392438070404175, + "acc_norm_stderr": 0.012752858346533133 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.6397058823529411, + "acc_stderr": 0.029163128570670733, + "acc_norm": 0.6397058823529411, + "acc_norm_stderr": 0.029163128570670733 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.6323529411764706, + "acc_stderr": 0.019506291693954854, + "acc_norm": 0.6323529411764706, + "acc_norm_stderr": 0.019506291693954854 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.7, + "acc_stderr": 0.04389311454644287, + "acc_norm": 0.7, + "acc_norm_stderr": 0.04389311454644287 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.636734693877551, + "acc_stderr": 0.03078905113903081, + "acc_norm": 0.636734693877551, + "acc_norm_stderr": 0.03078905113903081 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.8308457711442786, + "acc_stderr": 0.026508590656233264, + "acc_norm": 0.8308457711442786, + "acc_norm_stderr": 0.026508590656233264 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.81, + "acc_stderr": 0.03942772444036623, + "acc_norm": 0.81, + "acc_norm_stderr": 0.03942772444036623 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.5120481927710844, + "acc_stderr": 0.03891364495835817, + "acc_norm": 0.5120481927710844, + "acc_norm_stderr": 0.03891364495835817 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.783625730994152, + "acc_stderr": 0.031581495393387324, + "acc_norm": 0.783625730994152, + 
"acc_norm_stderr": 0.031581495393387324 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.5165238678090576, + "mc1_stderr": 0.017493940190057723, + "mc2": 0.6740086972319943, + "mc2_stderr": 0.015471222805293889 + }, + "harness|winogrande|5": { + "acc": 0.7868981846882399, + "acc_stderr": 0.011508957690722743 + }, + "harness|gsm8k|5": { + "acc": 0.47687642153146326, + "acc_stderr": 0.013757748544245331 + }, + "all": { + "acc": 0.621527215806331, + "acc_stderr": 0.03309044810009566, + "acc_norm": 0.6248205476117454, + "acc_norm_stderr": 0.03375647243509085, + "mc1": 0.5165238678090576, + "mc1_stderr": 0.017493940190057723, + "mc2": 0.6740086972319943, + "mc2_stderr": 0.015471222805293889 + } + }, + "versions": { + "all": 0, + "harness|arc:challenge|25": 0, + "harness|gsm8k|5": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + 
"harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "harness|winogrande|5": 0 + }, + "config_tasks": { + "harness|arc:challenge": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + 
"harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|arc:challenge|25": { + "hashes": { + "hash_examples": "17b0cae357c0259e", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "9bcd0d1d37471713", + "hash_cont_tokens": "289aa98c400841d8" + }, + "truncated": 0, + "non_truncated": 1172, + "padded": 4670, + "non_padded": 17, + "effective_few_shots": 25.0, + "num_truncated_few_shots": 0 + }, + "harness|hellaswag|10": { + "hashes": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "80b8c6d79740318e", + "hash_cont_tokens": "ac460260c3e6efc9" + }, + "truncated": 0, + "non_truncated": 10042, + "padded": 40101, + "non_padded": 67, + "effective_few_shots": 10.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hashes": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "b813d36287c6556c", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-anatomy|5": { + "hashes": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "09dc2380497f7a47", + "hash_cont_tokens": "a52a4f60d98cbe5c" + }, + "truncated": 0, + "non_truncated": 135, + "padded": 540, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-astronomy|5": { + "hashes": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "68ca3220b0fdd1f3", + "hash_cont_tokens": "10f7d8eeba97841d" + }, + "truncated": 0, + "non_truncated": 152, + "padded": 608, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-business_ethics|5": { + "hashes": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "bd14ef1320de241e", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hashes": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "d96186ab98017c43", + "hash_cont_tokens": "edef9975ba9165b5" + }, + "truncated": 0, + "non_truncated": 265, + "padded": 1060, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_biology|5": { + "hashes": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "424136b34e95b200", + "hash_cont_tokens": "0aa103ec6602280b" + }, + "truncated": 0, + "non_truncated": 144, + "padded": 576, + "non_padded": 0, + "effective_few_shots": 5.0, + 
"num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_chemistry|5": { + "hashes": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "8dd8b80e336bbe54", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_computer_science|5": { + "hashes": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "145d4cef8ca2261d", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_mathematics|5": { + "hashes": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "561995d32d2b25c4", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_medicine|5": { + "hashes": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "6a258a9d4418599c", + "hash_cont_tokens": "1979021dbc698754" + }, + "truncated": 0, + "non_truncated": 173, + "padded": 692, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_physics|5": { + "hashes": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "fa5e0d5b5f97b66a", + "hash_cont_tokens": "7cf7fe2bab00acbd" + }, + "truncated": 0, + "non_truncated": 102, + "padded": 408, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-computer_security|5": { + "hashes": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "07d27397edfae492", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hashes": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "da5e6c3c8eb17da6", + "hash_cont_tokens": "903f64eed2b0d217" + }, + "truncated": 0, + "non_truncated": 235, + "padded": 940, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-econometrics|5": { + "hashes": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "f6ba8e358bdb523e", + "hash_cont_tokens": "721ae6c5302c4bf2" + }, + "truncated": 0, + "non_truncated": 114, + "padded": 456, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hashes": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "b2459da4c5ca8590", + "hash_cont_tokens": "15a738960ed3e587" + }, + "truncated": 0, + "non_truncated": 145, + "padded": 575, + "non_padded": 5, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hashes": { + "hash_examples": "fc48208a5ac1c0ce", + 
"hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "0b969d9ad706a13a", + "hash_cont_tokens": "c96470462fc71683" + }, + "truncated": 0, + "non_truncated": 378, + "padded": 1512, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-formal_logic|5": { + "hashes": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "02bc3eb5f90da86e", + "hash_cont_tokens": "0e1ce025c9d6ee7e" + }, + "truncated": 0, + "non_truncated": 126, + "padded": 504, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-global_facts|5": { + "hashes": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "3d5106918bcbeb43", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_biology|5": { + "hashes": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "7b089392db2dabbd", + "hash_cont_tokens": "e34d57f7d3c4ca16" + }, + "truncated": 0, + "non_truncated": 310, + "padded": 1240, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hashes": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "ba90b2ffed1c067d", + "hash_cont_tokens": "e8482d44df4b3740" + }, + "truncated": 0, + "non_truncated": 203, + "padded": 812, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hashes": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "60eeec309ef0717f", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hashes": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "5e5e8bf3808e0ead", + "hash_cont_tokens": "d63e679a49418339" + }, + "truncated": 0, + "non_truncated": 165, + "padded": 656, + "non_padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_geography|5": { + "hashes": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "4da9b741d4e7ea78", + "hash_cont_tokens": "d78483e286d06f1a" + }, + "truncated": 0, + "non_truncated": 198, + "padded": 792, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hashes": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "acb4bc872ac86ed7", + "hash_cont_tokens": "691cdff71ff5fe57" + }, + "truncated": 0, + "non_truncated": 193, + "padded": 772, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hashes": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "840fc6403eb69ab0", + 
"hash_cont_tokens": "d5ad4c5bdca967ad" + }, + "truncated": 0, + "non_truncated": 390, + "padded": 1560, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hashes": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "3629a7f2cd17faeb", + "hash_cont_tokens": "8f631ca5687dd0d4" + }, + "truncated": 0, + "non_truncated": 270, + "padded": 1080, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hashes": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "6846f684260e3997", + "hash_cont_tokens": "7321048a28451473" + }, + "truncated": 0, + "non_truncated": 238, + "padded": 952, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_physics|5": { + "hashes": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "85aee25d6bdad94a", + "hash_cont_tokens": "bb137581f269861c" + }, + "truncated": 0, + "non_truncated": 151, + "padded": 604, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hashes": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "290b66d6d666a35f", + "hash_cont_tokens": "b455cab2675bd863" + }, + "truncated": 0, + "non_truncated": 545, + "padded": 2180, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hashes": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "a77a7668b437bc82", + "hash_cont_tokens": "1b3196fec7e58037" + }, + "truncated": 0, + "non_truncated": 216, + "padded": 864, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hashes": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "63548c7fa9ba7a78", + "hash_cont_tokens": "a331dedc2aa01b3e" + }, + "truncated": 0, + "non_truncated": 204, + "padded": 816, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hashes": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "83c5da18bfa50812", + "hash_cont_tokens": "d0fbe030b8c8c2bf" + }, + "truncated": 0, + "non_truncated": 237, + "padded": 948, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_aging|5": { + "hashes": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "bebbd11f22006685", + "hash_cont_tokens": "1dd29c3755494850" + }, + "truncated": 0, + "non_truncated": 223, + "padded": 892, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_sexuality|5": { + "hashes": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "7b85ee9b8ee54f4f", + "hash_cont_tokens": "c85573f663c10691" + }, + "truncated": 0, + "non_truncated": 131, + "padded": 524, + 
"non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-international_law|5": { + "hashes": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "7bfc55ab7065943e", + "hash_cont_tokens": "d263804ba918154f" + }, + "truncated": 0, + "non_truncated": 121, + "padded": 484, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-jurisprudence|5": { + "hashes": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "69573f1675e053c6", + "hash_cont_tokens": "581986691a84ece8" + }, + "truncated": 0, + "non_truncated": 108, + "padded": 432, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hashes": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "552324ef20094bdc", + "hash_cont_tokens": "55a858b28bbda458" + }, + "truncated": 0, + "non_truncated": 163, + "padded": 652, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-machine_learning|5": { + "hashes": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "96449357a7318905", + "hash_cont_tokens": "e99d3d3efd4ac7a3" + }, + "truncated": 0, + "non_truncated": 112, + "padded": 448, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-management|5": { + "hashes": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "3b849249168e3b88", + "hash_cont_tokens": "13d9dc56bca34726" + }, + "truncated": 0, + "non_truncated": 103, + "padded": 412, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-marketing|5": { + "hashes": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "af0e186f2756b70d", + "hash_cont_tokens": "2700ea26933916a2" + }, + "truncated": 0, + "non_truncated": 234, + "padded": 936, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-medical_genetics|5": { + "hashes": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "9f6a6de16509b6d9", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-miscellaneous|5": { + "hashes": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "9194406d589f7c10", + "hash_cont_tokens": "7bf4341c79587250" + }, + "truncated": 0, + "non_truncated": 783, + "padded": 3132, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_disputes|5": { + "hashes": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "769486efc74d9f8e", + "hash_cont_tokens": "38a48e9de6976f00" + }, + "truncated": 0, + "non_truncated": 346, + "padded": 1384, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hashes": { + "hash_examples": 
"9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "a90fd4dd90959dad", + "hash_cont_tokens": "761c4dc187689d89" + }, + "truncated": 0, + "non_truncated": 895, + "padded": 3580, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-nutrition|5": { + "hashes": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "1a3b843e66efd29b", + "hash_cont_tokens": "65005bd7d6f6012a" + }, + "truncated": 0, + "non_truncated": 306, + "padded": 1224, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-philosophy|5": { + "hashes": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "09820001a3d00013", + "hash_cont_tokens": "0b47934fb6314dec" + }, + "truncated": 0, + "non_truncated": 311, + "padded": 1244, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-prehistory|5": { + "hashes": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "7c4ec364ce2768c7", + "hash_cont_tokens": "3f20acd855ee0a29" + }, + "truncated": 0, + "non_truncated": 324, + "padded": 1296, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_accounting|5": { + "hashes": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "ced0534574d0ae3f", + "hash_cont_tokens": "8f122ba881355d4b" + }, + "truncated": 0, + "non_truncated": 282, + "padded": 1128, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_law|5": { + "hashes": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "bcbdbbde22ec73e3", + "hash_cont_tokens": "90d5df417c4d3fd3" + }, + "truncated": 0, + "non_truncated": 1534, + "padded": 6136, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_medicine|5": { + "hashes": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "c54d753563114d45", + "hash_cont_tokens": "4a2d2988884f7f70" + }, + "truncated": 0, + "non_truncated": 272, + "padded": 1088, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_psychology|5": { + "hashes": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "b75dc55c0e32fa52", + "hash_cont_tokens": "e0a952cb8a9c81de" + }, + "truncated": 0, + "non_truncated": 612, + "padded": 2448, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-public_relations|5": { + "hashes": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "5ccdc8ec8db99622", + "hash_cont_tokens": "1fa77a8dff3922b8" + }, + "truncated": 0, + "non_truncated": 110, + "padded": 440, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-security_studies|5": { + "hashes": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "ca8497342e5b1d57", + "hash_cont_tokens": "81fc9cb3cbdd52db" + }, 
+ "truncated": 0, + "non_truncated": 245, + "padded": 980, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-sociology|5": { + "hashes": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "069c76424fbd3dab", + "hash_cont_tokens": "2a0493252ed2cf43" + }, + "truncated": 0, + "non_truncated": 201, + "padded": 804, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hashes": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "a7e393a626169576", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-virology|5": { + "hashes": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "bf99dc973e3a650d", + "hash_cont_tokens": "5ab892d003b00c98" + }, + "truncated": 0, + "non_truncated": 166, + "padded": 664, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-world_religions|5": { + "hashes": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "1761cfaf21797065", + "hash_cont_tokens": "15a5e5dbdfbb8568" + }, + "truncated": 0, + "non_truncated": 171, + "padded": 684, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|truthfulqa:mc|0": { + "hashes": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "298b43914bbdf4ca", + "hash_cont_tokens": "5a8d4bb398b1c3c0" + }, + "truncated": 0, + "non_truncated": 817, + "padded": 9996, + "non_padded": 0, + "effective_few_shots": 0.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "31aa3477d959f771", + "hash_cont_tokens": "618558fb93c0f288" + }, + "truncated": 0, + "non_truncated": 1267, + "padded": 2534, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "6af0ae8cfe684f50", + "hash_cont_tokens": "640fc1350c5bd69c" + }, + "truncated": 0, + "non_truncated": 1319, + "padded": 0, + "non_padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "3b7fa57a057f9415", + "hash_full_prompts": "63615fc50fc9417c", + "hash_input_tokens": "9c04e828ae29cacc", + "hash_cont_tokens": "49e3f8cbf5b61ee7" + }, + "truncated": 0, + "non_truncated": 28659, + "padded": 113460, + "non_padded": 1412, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/mncai/agiin-13.6B-v0.1/results_2023-12-16T16-35-40.891850.json b/eval-results/mncai/agiin-13.6B-v0.1/results_2023-12-16T16-35-40.891850.json new file mode 100644 index 0000000000000000000000000000000000000000..5aae4ff45ace12ecbf2e045075a34a1cbeea1bdf --- /dev/null +++ b/eval-results/mncai/agiin-13.6B-v0.1/results_2023-12-16T16-35-40.891850.json @@ -0,0 +1,1409 @@ +{ + "config_general": { + "lighteval_sha": "0e4607eff593f6f842aeaa0e5fa6760f58b9d1e9", 
+ "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "", + "start_time": 363239.476766451, + "end_time": 375506.887804492, + "total_evaluation_time_secondes": "12267.411038041057", + "model_name": "mncai/agiin-13.6B-v0.1", + "model_sha": "6c93ca1d60b09b9b91e15c57dc8525827d371798", + "model_dtype": "torch.float16", + "model_size": "25.8 GB" + }, + "results": { + "harness|arc:challenge|25": { + "acc": 0.6672354948805461, + "acc_stderr": 0.013769863046192302, + "acc_norm": 0.6945392491467577, + "acc_norm_stderr": 0.013460080478002508 + }, + "harness|hellaswag|10": { + "acc": 0.6861183031268672, + "acc_stderr": 0.004631205099684944, + "acc_norm": 0.8663612826130253, + "acc_norm_stderr": 0.0033956833380563364 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.32, + "acc_stderr": 0.046882617226215034, + "acc_norm": 0.32, + "acc_norm_stderr": 0.046882617226215034 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.5481481481481482, + "acc_stderr": 0.04299268905480864, + "acc_norm": 0.5481481481481482, + "acc_norm_stderr": 0.04299268905480864 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.5986842105263158, + "acc_stderr": 0.039889037033362836, + "acc_norm": 0.5986842105263158, + "acc_norm_stderr": 0.039889037033362836 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.58, + "acc_stderr": 0.049604496374885836, + "acc_norm": 0.58, + "acc_norm_stderr": 0.049604496374885836 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.6339622641509434, + "acc_stderr": 0.02964781353936525, + "acc_norm": 0.6339622641509434, + "acc_norm_stderr": 0.02964781353936525 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.6875, + "acc_stderr": 0.038760854559127644, + "acc_norm": 0.6875, + "acc_norm_stderr": 0.038760854559127644 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.42, + "acc_stderr": 0.049604496374885836, + "acc_norm": 0.42, + "acc_norm_stderr": 0.049604496374885836 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.5, + "acc_stderr": 0.050251890762960605, + "acc_norm": 0.5, + "acc_norm_stderr": 0.050251890762960605 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.35, + "acc_stderr": 0.047937248544110175, + "acc_norm": 0.35, + "acc_norm_stderr": 0.047937248544110175 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.6184971098265896, + "acc_stderr": 0.037038511930995215, + "acc_norm": 0.6184971098265896, + "acc_norm_stderr": 0.037038511930995215 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.4019607843137255, + "acc_stderr": 0.04878608714466996, + "acc_norm": 0.4019607843137255, + "acc_norm_stderr": 0.04878608714466996 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.65, + "acc_stderr": 0.0479372485441102, + "acc_norm": 0.65, + "acc_norm_stderr": 0.0479372485441102 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.5617021276595745, + "acc_stderr": 0.03243618636108102, + "acc_norm": 0.5617021276595745, + "acc_norm_stderr": 0.03243618636108102 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.39473684210526316, + "acc_stderr": 0.045981880578165414, + "acc_norm": 0.39473684210526316, + "acc_norm_stderr": 0.045981880578165414 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.593103448275862, + "acc_stderr": 0.04093793981266236, + "acc_norm": 0.593103448275862, + "acc_norm_stderr": 0.04093793981266236 + }, + 
"harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.42328042328042326, + "acc_stderr": 0.02544636563440678, + "acc_norm": 0.42328042328042326, + "acc_norm_stderr": 0.02544636563440678 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.42857142857142855, + "acc_stderr": 0.0442626668137991, + "acc_norm": 0.42857142857142855, + "acc_norm_stderr": 0.0442626668137991 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.37, + "acc_stderr": 0.048523658709391, + "acc_norm": 0.37, + "acc_norm_stderr": 0.048523658709391 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.7258064516129032, + "acc_stderr": 0.025378139970885203, + "acc_norm": 0.7258064516129032, + "acc_norm_stderr": 0.025378139970885203 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.4827586206896552, + "acc_stderr": 0.035158955511656986, + "acc_norm": 0.4827586206896552, + "acc_norm_stderr": 0.035158955511656986 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.63, + "acc_stderr": 0.04852365870939098, + "acc_norm": 0.63, + "acc_norm_stderr": 0.04852365870939098 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.7818181818181819, + "acc_stderr": 0.03225078108306289, + "acc_norm": 0.7818181818181819, + "acc_norm_stderr": 0.03225078108306289 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.7727272727272727, + "acc_stderr": 0.02985751567338642, + "acc_norm": 0.7727272727272727, + "acc_norm_stderr": 0.02985751567338642 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.8134715025906736, + "acc_stderr": 0.028112091210117467, + "acc_norm": 0.8134715025906736, + "acc_norm_stderr": 0.028112091210117467 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.6564102564102564, + "acc_stderr": 0.024078696580635474, + "acc_norm": 0.6564102564102564, + "acc_norm_stderr": 0.024078696580635474 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.34814814814814815, + "acc_stderr": 0.029045600290616255, + "acc_norm": 0.34814814814814815, + "acc_norm_stderr": 0.029045600290616255 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.6302521008403361, + "acc_stderr": 0.031357095996135904, + "acc_norm": 0.6302521008403361, + "acc_norm_stderr": 0.031357095996135904 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.33774834437086093, + "acc_stderr": 0.03861557546255169, + "acc_norm": 0.33774834437086093, + "acc_norm_stderr": 0.03861557546255169 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.8220183486238533, + "acc_stderr": 0.016399436366612896, + "acc_norm": 0.8220183486238533, + "acc_norm_stderr": 0.016399436366612896 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.5555555555555556, + "acc_stderr": 0.03388857118502325, + "acc_norm": 0.5555555555555556, + "acc_norm_stderr": 0.03388857118502325 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.7990196078431373, + "acc_stderr": 0.028125972265654366, + "acc_norm": 0.7990196078431373, + "acc_norm_stderr": 0.028125972265654366 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.7805907172995781, + "acc_stderr": 0.026939106581553945, + "acc_norm": 0.7805907172995781, + "acc_norm_stderr": 0.026939106581553945 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.6771300448430493, + "acc_stderr": 0.03138147637575499, + "acc_norm": 0.6771300448430493, + "acc_norm_stderr": 0.03138147637575499 
+ }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.7099236641221374, + "acc_stderr": 0.03980066246467766, + "acc_norm": 0.7099236641221374, + "acc_norm_stderr": 0.03980066246467766 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.8181818181818182, + "acc_stderr": 0.03520893951097654, + "acc_norm": 0.8181818181818182, + "acc_norm_stderr": 0.03520893951097654 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.7222222222222222, + "acc_stderr": 0.04330043749650741, + "acc_norm": 0.7222222222222222, + "acc_norm_stderr": 0.04330043749650741 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.7361963190184049, + "acc_stderr": 0.03462419931615624, + "acc_norm": 0.7361963190184049, + "acc_norm_stderr": 0.03462419931615624 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.4642857142857143, + "acc_stderr": 0.04733667890053756, + "acc_norm": 0.4642857142857143, + "acc_norm_stderr": 0.04733667890053756 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.7475728155339806, + "acc_stderr": 0.04301250399690878, + "acc_norm": 0.7475728155339806, + "acc_norm_stderr": 0.04301250399690878 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.8461538461538461, + "acc_stderr": 0.023636873317489267, + "acc_norm": 0.8461538461538461, + "acc_norm_stderr": 0.023636873317489267 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.62, + "acc_stderr": 0.04878317312145632, + "acc_norm": 0.62, + "acc_norm_stderr": 0.04878317312145632 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.7573435504469987, + "acc_stderr": 0.015329888940899867, + "acc_norm": 0.7573435504469987, + "acc_norm_stderr": 0.015329888940899867 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.6791907514450867, + "acc_stderr": 0.025131000233647886, + "acc_norm": 0.6791907514450867, + "acc_norm_stderr": 0.025131000233647886 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.46033519553072627, + "acc_stderr": 0.016669799592112025, + "acc_norm": 0.46033519553072627, + "acc_norm_stderr": 0.016669799592112025 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.6535947712418301, + "acc_stderr": 0.027245613047215355, + "acc_norm": 0.6535947712418301, + "acc_norm_stderr": 0.027245613047215355 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.6784565916398714, + "acc_stderr": 0.026527724079528872, + "acc_norm": 0.6784565916398714, + "acc_norm_stderr": 0.026527724079528872 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.6759259259259259, + "acc_stderr": 0.026041766202717163, + "acc_norm": 0.6759259259259259, + "acc_norm_stderr": 0.026041766202717163 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.4219858156028369, + "acc_stderr": 0.029462189233370593, + "acc_norm": 0.4219858156028369, + "acc_norm_stderr": 0.029462189233370593 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.47327249022164275, + "acc_stderr": 0.012751977967676008, + "acc_norm": 0.47327249022164275, + "acc_norm_stderr": 0.012751977967676008 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.6360294117647058, + "acc_stderr": 0.02922719246003203, + "acc_norm": 0.6360294117647058, + "acc_norm_stderr": 0.02922719246003203 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.6486928104575164, + "acc_stderr": 0.019312676065786558, + "acc_norm": 0.6486928104575164, + "acc_norm_stderr": 0.019312676065786558 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.6727272727272727, + 
"acc_stderr": 0.0449429086625209, + "acc_norm": 0.6727272727272727, + "acc_norm_stderr": 0.0449429086625209 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.6285714285714286, + "acc_stderr": 0.030932858792789845, + "acc_norm": 0.6285714285714286, + "acc_norm_stderr": 0.030932858792789845 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.8258706467661692, + "acc_stderr": 0.026814951200421603, + "acc_norm": 0.8258706467661692, + "acc_norm_stderr": 0.026814951200421603 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.8, + "acc_stderr": 0.04020151261036844, + "acc_norm": 0.8, + "acc_norm_stderr": 0.04020151261036844 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.5060240963855421, + "acc_stderr": 0.03892212195333045, + "acc_norm": 0.5060240963855421, + "acc_norm_stderr": 0.03892212195333045 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.7777777777777778, + "acc_stderr": 0.03188578017686398, + "acc_norm": 0.7777777777777778, + "acc_norm_stderr": 0.03188578017686398 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.5214198286413708, + "mc1_stderr": 0.01748743214471164, + "mc2": 0.6797310501619931, + "mc2_stderr": 0.015395432575157594 + }, + "harness|winogrande|5": { + "acc": 0.7868981846882399, + "acc_stderr": 0.011508957690722743 + }, + "harness|gsm8k|5": { + "acc": 0.46474601971190294, + "acc_stderr": 0.01373820799017732 + }, + "all": { + "acc": 0.6140808996502091, + "acc_stderr": 0.03322600041693132, + "acc_norm": 0.6172006340341523, + "acc_norm_stderr": 0.033898195854611735, + "mc1": 0.5214198286413708, + "mc1_stderr": 0.01748743214471164, + "mc2": 0.6797310501619931, + "mc2_stderr": 0.015395432575157594 + } + }, + "versions": { + "all": 0, + "harness|arc:challenge|25": 0, + "harness|gsm8k|5": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + 
"harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "harness|winogrande|5": 0 + }, + "config_tasks": { + "harness|arc:challenge": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + 
"harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|arc:challenge|25": { + "hashes": { + "hash_examples": "17b0cae357c0259e", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "9bcd0d1d37471713", + "hash_cont_tokens": "289aa98c400841d8" + }, + "truncated": 0, + "non_truncated": 1172, + "padded": 4670, + "non_padded": 17, + "effective_few_shots": 25.0, + "num_truncated_few_shots": 0 + }, + "harness|hellaswag|10": { + "hashes": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "80b8c6d79740318e", + "hash_cont_tokens": "ac460260c3e6efc9" + }, + "truncated": 0, + "non_truncated": 10042, + "padded": 40101, + "non_padded": 67, + "effective_few_shots": 10.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hashes": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "b813d36287c6556c", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-anatomy|5": { + "hashes": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "09dc2380497f7a47", + "hash_cont_tokens": "a52a4f60d98cbe5c" + }, + "truncated": 0, + "non_truncated": 135, + "padded": 540, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-astronomy|5": { + "hashes": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "68ca3220b0fdd1f3", + "hash_cont_tokens": "10f7d8eeba97841d" + }, + "truncated": 0, + "non_truncated": 152, + "padded": 608, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-business_ethics|5": { + "hashes": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": 
"db01c3ef8e1479d4", + "hash_input_tokens": "bd14ef1320de241e", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hashes": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "d96186ab98017c43", + "hash_cont_tokens": "edef9975ba9165b5" + }, + "truncated": 0, + "non_truncated": 265, + "padded": 1060, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_biology|5": { + "hashes": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "424136b34e95b200", + "hash_cont_tokens": "0aa103ec6602280b" + }, + "truncated": 0, + "non_truncated": 144, + "padded": 576, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_chemistry|5": { + "hashes": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "8dd8b80e336bbe54", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_computer_science|5": { + "hashes": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "145d4cef8ca2261d", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_mathematics|5": { + "hashes": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "561995d32d2b25c4", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_medicine|5": { + "hashes": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "6a258a9d4418599c", + "hash_cont_tokens": "1979021dbc698754" + }, + "truncated": 0, + "non_truncated": 173, + "padded": 692, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_physics|5": { + "hashes": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "fa5e0d5b5f97b66a", + "hash_cont_tokens": "7cf7fe2bab00acbd" + }, + "truncated": 0, + "non_truncated": 102, + "padded": 408, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-computer_security|5": { + "hashes": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "07d27397edfae492", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hashes": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "da5e6c3c8eb17da6", + "hash_cont_tokens": "903f64eed2b0d217" + }, + "truncated": 0, + "non_truncated": 
235, + "padded": 940, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-econometrics|5": { + "hashes": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "f6ba8e358bdb523e", + "hash_cont_tokens": "721ae6c5302c4bf2" + }, + "truncated": 0, + "non_truncated": 114, + "padded": 456, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hashes": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "b2459da4c5ca8590", + "hash_cont_tokens": "15a738960ed3e587" + }, + "truncated": 0, + "non_truncated": 145, + "padded": 575, + "non_padded": 5, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hashes": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "0b969d9ad706a13a", + "hash_cont_tokens": "c96470462fc71683" + }, + "truncated": 0, + "non_truncated": 378, + "padded": 1512, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-formal_logic|5": { + "hashes": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "02bc3eb5f90da86e", + "hash_cont_tokens": "0e1ce025c9d6ee7e" + }, + "truncated": 0, + "non_truncated": 126, + "padded": 504, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-global_facts|5": { + "hashes": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "3d5106918bcbeb43", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_biology|5": { + "hashes": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "7b089392db2dabbd", + "hash_cont_tokens": "e34d57f7d3c4ca16" + }, + "truncated": 0, + "non_truncated": 310, + "padded": 1240, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hashes": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "ba90b2ffed1c067d", + "hash_cont_tokens": "e8482d44df4b3740" + }, + "truncated": 0, + "non_truncated": 203, + "padded": 812, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hashes": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "60eeec309ef0717f", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hashes": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "5e5e8bf3808e0ead", + "hash_cont_tokens": "d63e679a49418339" + }, + "truncated": 0, + "non_truncated": 165, + "padded": 656, + "non_padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + 
"harness|hendrycksTest-high_school_geography|5": { + "hashes": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "4da9b741d4e7ea78", + "hash_cont_tokens": "d78483e286d06f1a" + }, + "truncated": 0, + "non_truncated": 198, + "padded": 792, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hashes": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "acb4bc872ac86ed7", + "hash_cont_tokens": "691cdff71ff5fe57" + }, + "truncated": 0, + "non_truncated": 193, + "padded": 772, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hashes": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "840fc6403eb69ab0", + "hash_cont_tokens": "d5ad4c5bdca967ad" + }, + "truncated": 0, + "non_truncated": 390, + "padded": 1560, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hashes": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "3629a7f2cd17faeb", + "hash_cont_tokens": "8f631ca5687dd0d4" + }, + "truncated": 0, + "non_truncated": 270, + "padded": 1080, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hashes": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "6846f684260e3997", + "hash_cont_tokens": "7321048a28451473" + }, + "truncated": 0, + "non_truncated": 238, + "padded": 952, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_physics|5": { + "hashes": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "85aee25d6bdad94a", + "hash_cont_tokens": "bb137581f269861c" + }, + "truncated": 0, + "non_truncated": 151, + "padded": 604, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hashes": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "290b66d6d666a35f", + "hash_cont_tokens": "b455cab2675bd863" + }, + "truncated": 0, + "non_truncated": 545, + "padded": 2180, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hashes": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "a77a7668b437bc82", + "hash_cont_tokens": "1b3196fec7e58037" + }, + "truncated": 0, + "non_truncated": 216, + "padded": 864, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hashes": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "63548c7fa9ba7a78", + "hash_cont_tokens": "a331dedc2aa01b3e" + }, + "truncated": 0, + "non_truncated": 204, + "padded": 816, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hashes": { + 
"hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "83c5da18bfa50812", + "hash_cont_tokens": "d0fbe030b8c8c2bf" + }, + "truncated": 0, + "non_truncated": 237, + "padded": 948, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_aging|5": { + "hashes": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "bebbd11f22006685", + "hash_cont_tokens": "1dd29c3755494850" + }, + "truncated": 0, + "non_truncated": 223, + "padded": 892, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_sexuality|5": { + "hashes": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "7b85ee9b8ee54f4f", + "hash_cont_tokens": "c85573f663c10691" + }, + "truncated": 0, + "non_truncated": 131, + "padded": 524, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-international_law|5": { + "hashes": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "7bfc55ab7065943e", + "hash_cont_tokens": "d263804ba918154f" + }, + "truncated": 0, + "non_truncated": 121, + "padded": 484, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-jurisprudence|5": { + "hashes": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "69573f1675e053c6", + "hash_cont_tokens": "581986691a84ece8" + }, + "truncated": 0, + "non_truncated": 108, + "padded": 432, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hashes": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "552324ef20094bdc", + "hash_cont_tokens": "55a858b28bbda458" + }, + "truncated": 0, + "non_truncated": 163, + "padded": 652, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-machine_learning|5": { + "hashes": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "96449357a7318905", + "hash_cont_tokens": "e99d3d3efd4ac7a3" + }, + "truncated": 0, + "non_truncated": 112, + "padded": 448, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-management|5": { + "hashes": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "3b849249168e3b88", + "hash_cont_tokens": "13d9dc56bca34726" + }, + "truncated": 0, + "non_truncated": 103, + "padded": 412, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-marketing|5": { + "hashes": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "af0e186f2756b70d", + "hash_cont_tokens": "2700ea26933916a2" + }, + "truncated": 0, + "non_truncated": 234, + "padded": 936, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-medical_genetics|5": { + "hashes": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "9f6a6de16509b6d9", + "hash_cont_tokens": "17b868b63507f9a3" + }, + 
"truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-miscellaneous|5": { + "hashes": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "9194406d589f7c10", + "hash_cont_tokens": "7bf4341c79587250" + }, + "truncated": 0, + "non_truncated": 783, + "padded": 3132, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_disputes|5": { + "hashes": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "769486efc74d9f8e", + "hash_cont_tokens": "38a48e9de6976f00" + }, + "truncated": 0, + "non_truncated": 346, + "padded": 1384, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hashes": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "a90fd4dd90959dad", + "hash_cont_tokens": "761c4dc187689d89" + }, + "truncated": 0, + "non_truncated": 895, + "padded": 3580, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-nutrition|5": { + "hashes": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "1a3b843e66efd29b", + "hash_cont_tokens": "65005bd7d6f6012a" + }, + "truncated": 0, + "non_truncated": 306, + "padded": 1224, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-philosophy|5": { + "hashes": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "09820001a3d00013", + "hash_cont_tokens": "0b47934fb6314dec" + }, + "truncated": 0, + "non_truncated": 311, + "padded": 1244, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-prehistory|5": { + "hashes": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "7c4ec364ce2768c7", + "hash_cont_tokens": "3f20acd855ee0a29" + }, + "truncated": 0, + "non_truncated": 324, + "padded": 1296, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_accounting|5": { + "hashes": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "ced0534574d0ae3f", + "hash_cont_tokens": "8f122ba881355d4b" + }, + "truncated": 0, + "non_truncated": 282, + "padded": 1128, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_law|5": { + "hashes": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "bcbdbbde22ec73e3", + "hash_cont_tokens": "90d5df417c4d3fd3" + }, + "truncated": 0, + "non_truncated": 1534, + "padded": 6136, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_medicine|5": { + "hashes": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "c54d753563114d45", + "hash_cont_tokens": "4a2d2988884f7f70" + }, + "truncated": 0, + "non_truncated": 272, + "padded": 1088, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + 
"harness|hendrycksTest-professional_psychology|5": { + "hashes": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "b75dc55c0e32fa52", + "hash_cont_tokens": "e0a952cb8a9c81de" + }, + "truncated": 0, + "non_truncated": 612, + "padded": 2448, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-public_relations|5": { + "hashes": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "5ccdc8ec8db99622", + "hash_cont_tokens": "1fa77a8dff3922b8" + }, + "truncated": 0, + "non_truncated": 110, + "padded": 440, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-security_studies|5": { + "hashes": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "ca8497342e5b1d57", + "hash_cont_tokens": "81fc9cb3cbdd52db" + }, + "truncated": 0, + "non_truncated": 245, + "padded": 980, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-sociology|5": { + "hashes": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "069c76424fbd3dab", + "hash_cont_tokens": "2a0493252ed2cf43" + }, + "truncated": 0, + "non_truncated": 201, + "padded": 804, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hashes": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "a7e393a626169576", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-virology|5": { + "hashes": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "bf99dc973e3a650d", + "hash_cont_tokens": "5ab892d003b00c98" + }, + "truncated": 0, + "non_truncated": 166, + "padded": 664, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-world_religions|5": { + "hashes": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "1761cfaf21797065", + "hash_cont_tokens": "15a5e5dbdfbb8568" + }, + "truncated": 0, + "non_truncated": 171, + "padded": 684, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|truthfulqa:mc|0": { + "hashes": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "298b43914bbdf4ca", + "hash_cont_tokens": "5a8d4bb398b1c3c0" + }, + "truncated": 0, + "non_truncated": 817, + "padded": 9996, + "non_padded": 0, + "effective_few_shots": 0.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "31aa3477d959f771", + "hash_cont_tokens": "618558fb93c0f288" + }, + "truncated": 0, + "non_truncated": 1267, + "padded": 2534, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "6af0ae8cfe684f50", + "hash_cont_tokens": "3db41976808da16f" + 
}, + "truncated": 0, + "non_truncated": 1319, + "padded": 0, + "non_padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "3b7fa57a057f9415", + "hash_full_prompts": "63615fc50fc9417c", + "hash_input_tokens": "9c04e828ae29cacc", + "hash_cont_tokens": "2546e72b7f706920" + }, + "truncated": 0, + "non_truncated": 28659, + "padded": 113460, + "non_padded": 1412, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/mncai/chatdoctor/results_2023-07-24T15-52-02.947837.json b/eval-results/mncai/chatdoctor/results_2023-07-24T15-52-02.947837.json new file mode 100644 index 0000000000000000000000000000000000000000..3547089d741b4381541e1cfc3e6e299927e3b37b --- /dev/null +++ b/eval-results/mncai/chatdoctor/results_2023-07-24T15-52-02.947837.json @@ -0,0 +1,1365 @@ +{ + "results": { + "harness|arc:challenge|25": { + "acc": 0.5136518771331058, + "acc_stderr": 0.014605943429860947, + "acc_norm": 0.537542662116041, + "acc_norm_stderr": 0.01457014449507558 + }, + "harness|hellaswag|10": { + "acc": 0.5989842660824537, + "acc_stderr": 0.004891025533633033, + "acc_norm": 0.7854013144791874, + "acc_norm_stderr": 0.004097046160548165 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.26, + "acc_stderr": 0.04408440022768081, + "acc_norm": 0.26, + "acc_norm_stderr": 0.04408440022768081 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.3851851851851852, + "acc_stderr": 0.04203921040156279, + "acc_norm": 0.3851851851851852, + "acc_norm_stderr": 0.04203921040156279 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.3355263157894737, + "acc_stderr": 0.038424985593952694, + "acc_norm": 0.3355263157894737, + "acc_norm_stderr": 0.038424985593952694 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.36, + "acc_stderr": 0.048241815132442176, + "acc_norm": 0.36, + "acc_norm_stderr": 0.048241815132442176 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.39245283018867927, + "acc_stderr": 0.030052580579557845, + "acc_norm": 0.39245283018867927, + "acc_norm_stderr": 0.030052580579557845 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.3611111111111111, + "acc_stderr": 0.04016660030451233, + "acc_norm": 0.3611111111111111, + "acc_norm_stderr": 0.04016660030451233 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.32, + "acc_stderr": 0.046882617226215034, + "acc_norm": 0.32, + "acc_norm_stderr": 0.046882617226215034 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.26, + "acc_stderr": 0.04408440022768077, + "acc_norm": 0.26, + "acc_norm_stderr": 0.04408440022768077 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.29, + "acc_stderr": 0.04560480215720683, + "acc_norm": 0.29, + "acc_norm_stderr": 0.04560480215720683 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.2832369942196532, + "acc_stderr": 0.034355680560478746, + "acc_norm": 0.2832369942196532, + "acc_norm_stderr": 0.034355680560478746 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.18627450980392157, + "acc_stderr": 0.03873958714149352, + "acc_norm": 0.18627450980392157, + "acc_norm_stderr": 0.03873958714149352 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.42, + "acc_stderr": 0.04960449637488584, + "acc_norm": 0.42, + "acc_norm_stderr": 0.04960449637488584 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.39148936170212767, + "acc_stderr": 0.03190701242326812, + 
"acc_norm": 0.39148936170212767, + "acc_norm_stderr": 0.03190701242326812 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.2631578947368421, + "acc_stderr": 0.04142439719489362, + "acc_norm": 0.2631578947368421, + "acc_norm_stderr": 0.04142439719489362 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.296551724137931, + "acc_stderr": 0.03806142687309994, + "acc_norm": 0.296551724137931, + "acc_norm_stderr": 0.03806142687309994 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.24867724867724866, + "acc_stderr": 0.02226181769240017, + "acc_norm": 0.24867724867724866, + "acc_norm_stderr": 0.02226181769240017 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.2619047619047619, + "acc_stderr": 0.0393253768039287, + "acc_norm": 0.2619047619047619, + "acc_norm_stderr": 0.0393253768039287 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.3, + "acc_stderr": 0.046056618647183814, + "acc_norm": 0.3, + "acc_norm_stderr": 0.046056618647183814 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.3709677419354839, + "acc_stderr": 0.027480541887953593, + "acc_norm": 0.3709677419354839, + "acc_norm_stderr": 0.027480541887953593 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.27586206896551724, + "acc_stderr": 0.0314471258167824, + "acc_norm": 0.27586206896551724, + "acc_norm_stderr": 0.0314471258167824 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.31, + "acc_stderr": 0.04648231987117316, + "acc_norm": 0.31, + "acc_norm_stderr": 0.04648231987117316 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.5696969696969697, + "acc_stderr": 0.038662259628790774, + "acc_norm": 0.5696969696969697, + "acc_norm_stderr": 0.038662259628790774 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.3686868686868687, + "acc_stderr": 0.03437305501980619, + "acc_norm": 0.3686868686868687, + "acc_norm_stderr": 0.03437305501980619 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.49740932642487046, + "acc_stderr": 0.03608390745384487, + "acc_norm": 0.49740932642487046, + "acc_norm_stderr": 0.03608390745384487 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.31794871794871793, + "acc_stderr": 0.023610884308927865, + "acc_norm": 0.31794871794871793, + "acc_norm_stderr": 0.023610884308927865 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.22592592592592592, + "acc_stderr": 0.025497532639609556, + "acc_norm": 0.22592592592592592, + "acc_norm_stderr": 0.025497532639609556 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.3319327731092437, + "acc_stderr": 0.03058869701378366, + "acc_norm": 0.3319327731092437, + "acc_norm_stderr": 0.03058869701378366 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.23178807947019867, + "acc_stderr": 0.034454062719870546, + "acc_norm": 0.23178807947019867, + "acc_norm_stderr": 0.034454062719870546 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.45871559633027525, + "acc_stderr": 0.0213641225338817, + "acc_norm": 0.45871559633027525, + "acc_norm_stderr": 0.0213641225338817 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.2638888888888889, + "acc_stderr": 0.030058202704309846, + "acc_norm": 0.2638888888888889, + "acc_norm_stderr": 0.030058202704309846 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.3872549019607843, + "acc_stderr": 
0.03418931233833344, + "acc_norm": 0.3872549019607843, + "acc_norm_stderr": 0.03418931233833344 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.4641350210970464, + "acc_stderr": 0.03246338898055659, + "acc_norm": 0.4641350210970464, + "acc_norm_stderr": 0.03246338898055659 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.47085201793721976, + "acc_stderr": 0.03350073248773404, + "acc_norm": 0.47085201793721976, + "acc_norm_stderr": 0.03350073248773404 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.35877862595419846, + "acc_stderr": 0.04206739313864908, + "acc_norm": 0.35877862595419846, + "acc_norm_stderr": 0.04206739313864908 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.4049586776859504, + "acc_stderr": 0.044811377559424694, + "acc_norm": 0.4049586776859504, + "acc_norm_stderr": 0.044811377559424694 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.42592592592592593, + "acc_stderr": 0.0478034362693679, + "acc_norm": 0.42592592592592593, + "acc_norm_stderr": 0.0478034362693679 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.37423312883435583, + "acc_stderr": 0.038020681028996146, + "acc_norm": 0.37423312883435583, + "acc_norm_stderr": 0.038020681028996146 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.36607142857142855, + "acc_stderr": 0.0457237235873743, + "acc_norm": 0.36607142857142855, + "acc_norm_stderr": 0.0457237235873743 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.4563106796116505, + "acc_stderr": 0.049318019942204146, + "acc_norm": 0.4563106796116505, + "acc_norm_stderr": 0.049318019942204146 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.5128205128205128, + "acc_stderr": 0.032745319388423504, + "acc_norm": 0.5128205128205128, + "acc_norm_stderr": 0.032745319388423504 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.42, + "acc_stderr": 0.04960449637488584, + "acc_norm": 0.42, + "acc_norm_stderr": 0.04960449637488584 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.49936143039591313, + "acc_stderr": 0.01787994891443168, + "acc_norm": 0.49936143039591313, + "acc_norm_stderr": 0.01787994891443168 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.3930635838150289, + "acc_stderr": 0.026296227915613674, + "acc_norm": 0.3930635838150289, + "acc_norm_stderr": 0.026296227915613674 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.2424581005586592, + "acc_stderr": 0.014333522059217889, + "acc_norm": 0.2424581005586592, + "acc_norm_stderr": 0.014333522059217889 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.3333333333333333, + "acc_stderr": 0.02699254433929723, + "acc_norm": 0.3333333333333333, + "acc_norm_stderr": 0.02699254433929723 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.40192926045016075, + "acc_stderr": 0.02784647600593047, + "acc_norm": 0.40192926045016075, + "acc_norm_stderr": 0.02784647600593047 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.4012345679012346, + "acc_stderr": 0.0272725828498398, + "acc_norm": 0.4012345679012346, + "acc_norm_stderr": 0.0272725828498398 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.29432624113475175, + "acc_stderr": 0.027187127011503793, + "acc_norm": 0.29432624113475175, + "acc_norm_stderr": 0.027187127011503793 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.2894393741851369, + "acc_stderr": 0.011582659702210252, + "acc_norm": 0.2894393741851369, + "acc_norm_stderr": 
0.011582659702210252 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.38235294117647056, + "acc_stderr": 0.029520095697687758, + "acc_norm": 0.38235294117647056, + "acc_norm_stderr": 0.029520095697687758 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.32679738562091504, + "acc_stderr": 0.01897542792050721, + "acc_norm": 0.32679738562091504, + "acc_norm_stderr": 0.01897542792050721 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.42727272727272725, + "acc_stderr": 0.04738198703545483, + "acc_norm": 0.42727272727272725, + "acc_norm_stderr": 0.04738198703545483 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.2693877551020408, + "acc_stderr": 0.02840125202902294, + "acc_norm": 0.2693877551020408, + "acc_norm_stderr": 0.02840125202902294 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.373134328358209, + "acc_stderr": 0.03419832608176007, + "acc_norm": 0.373134328358209, + "acc_norm_stderr": 0.03419832608176007 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.5, + "acc_stderr": 0.050251890762960605, + "acc_norm": 0.5, + "acc_norm_stderr": 0.050251890762960605 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.39156626506024095, + "acc_stderr": 0.03799857454479636, + "acc_norm": 0.39156626506024095, + "acc_norm_stderr": 0.03799857454479636 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.4853801169590643, + "acc_stderr": 0.038331852752130205, + "acc_norm": 0.4853801169590643, + "acc_norm_stderr": 0.038331852752130205 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.2827417380660955, + "mc1_stderr": 0.015764770836777305, + "mc2": 0.4355022713703972, + "mc2_stderr": 0.015354591426228493 + }, + "all": { + "acc": 0.36615941304709165, + "acc_stderr": 0.03450196411598282, + "acc_norm": 0.36972395259589963, + "acc_norm_stderr": 0.034487900076866376, + "mc1": 0.2827417380660955, + "mc1_stderr": 0.015764770836777305, + "mc2": 0.4355022713703972, + "mc2_stderr": 0.015354591426228493 + } + }, + "versions": { + "harness|arc:challenge|25": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 
1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "all": 0 + }, + "config_general": { + "model_name": "mncai/chatdoctor", + "model_sha": "8fdcfdda6877d7f21173dfac48b2c14499ba8264", + "model_dtype": "torch.float16", + "lighteval_sha": "03c2fad20ff7f5334c33cfee459024b8d7e4a109", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null + }, + "config_tasks": { + "harness|arc:challenge": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + 
"harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task" + }, + "summary_tasks": { + "harness|arc:challenge|25": { + "hashes": { + "hash_examples": "17b0cae357c0259e", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "2b0e07d4cdd3b0fe", + "hash_cont_tokens": "8210decc6ff6f7df" + }, + "truncated": 0, + "non-truncated": 4687, + "padded": 4687, + "non-padded": 0, + "effective_few_shots": 25.0, + "num_truncated_few_shots": 0 + }, + "harness|hellaswag|10": { + "hashes": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "578edd77107cb2c3", + "hash_cont_tokens": "b3b9e9017afa63af" + }, + "truncated": 0, + "non-truncated": 40168, + "padded": 40113, + "non-padded": 55, + "effective_few_shots": 10.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hashes": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "6a95a1511f8da075", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-anatomy|5": { + "hashes": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "24a78edc4d9a93aa", + "hash_cont_tokens": "f11971a765cb609f" + }, + "truncated": 0, + "non-truncated": 540, + "padded": 540, + "non-padded": 0, + "effective_few_shots": 5.0, + 
"num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-astronomy|5": { + "hashes": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "b11106668d6c0974", + "hash_cont_tokens": "bf30e5d3f48250cb" + }, + "truncated": 0, + "non-truncated": 608, + "padded": 608, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-business_ethics|5": { + "hashes": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "10180ba12a075cb0", + "hash_cont_tokens": "bc1dd9b2d995eb61" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hashes": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "73351ef4968750a2", + "hash_cont_tokens": "890a119624b3b935" + }, + "truncated": 0, + "non-truncated": 1060, + "padded": 1060, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_biology|5": { + "hashes": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "a539150af234c668", + "hash_cont_tokens": "875cde3af7a0ee14" + }, + "truncated": 0, + "non-truncated": 576, + "padded": 576, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_chemistry|5": { + "hashes": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "52e12e5a43bcee35", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_computer_science|5": { + "hashes": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "d1f3721a5659f7ee", + "hash_cont_tokens": "ffc0fe414cdc4a83" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_mathematics|5": { + "hashes": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "f2d78f546b5595c2", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_medicine|5": { + "hashes": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "c9cc19179f63d1d6", + "hash_cont_tokens": "1f88b00d41957d82" + }, + "truncated": 0, + "non-truncated": 692, + "padded": 692, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_physics|5": { + "hashes": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "5046144e67e992e8", + "hash_cont_tokens": "f7b8097afc16a47c" + }, + "truncated": 0, + "non-truncated": 408, + "padded": 408, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-computer_security|5": { + "hashes": { + "hash_examples": "006451eedc0ededb", + 
"hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "4b14581ba4fc06fc", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hashes": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "1ee52c413b5b4cc4", + "hash_cont_tokens": "aa0e8bc655f2f641" + }, + "truncated": 0, + "non-truncated": 940, + "padded": 940, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-econometrics|5": { + "hashes": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "2914077c4dd3090a", + "hash_cont_tokens": "bfb7e3c3c88313f1" + }, + "truncated": 0, + "non-truncated": 456, + "padded": 456, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hashes": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "0f88a874342378de", + "hash_cont_tokens": "2425a3f084a591ef" + }, + "truncated": 0, + "non-truncated": 580, + "padded": 580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hashes": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "9889933f1dd02a23", + "hash_cont_tokens": "f52691aef15a407b" + }, + "truncated": 0, + "non-truncated": 1512, + "padded": 1512, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-formal_logic|5": { + "hashes": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "dc309a94c4bfdd2f", + "hash_cont_tokens": "f515d598d9c21263" + }, + "truncated": 0, + "non-truncated": 504, + "padded": 504, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-global_facts|5": { + "hashes": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "0801a0aebec3ba8c", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_biology|5": { + "hashes": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "5bc4aca8831d9c05", + "hash_cont_tokens": "bd85a4156a3613ee" + }, + "truncated": 0, + "non-truncated": 1240, + "padded": 1240, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hashes": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "b92bd6b06fc3464c", + "hash_cont_tokens": "a95c97af1c14e068" + }, + "truncated": 0, + "non-truncated": 812, + "padded": 812, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hashes": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "a549346cde8165e9", + "hash_cont_tokens": "8abfedef914e33c9" + }, + 
"truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hashes": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "f1f73dd687da18d7", + "hash_cont_tokens": "674fc454bdc5ac93" + }, + "truncated": 660, + "non-truncated": 0, + "padded": 0, + "non-padded": 660, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_geography|5": { + "hashes": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "e7e9cf91f9d6a081", + "hash_cont_tokens": "03a5012b916274ea" + }, + "truncated": 0, + "non-truncated": 792, + "padded": 792, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hashes": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "a61a1670f854d9e1", + "hash_cont_tokens": "a83effb8f76b7d7c" + }, + "truncated": 0, + "non-truncated": 772, + "padded": 772, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hashes": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "8a77cb7763f28110", + "hash_cont_tokens": "c583432ad27fcfe0" + }, + "truncated": 0, + "non-truncated": 1560, + "padded": 1560, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hashes": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "fcfcfae391f8faa1", + "hash_cont_tokens": "24f5dc613660300b" + }, + "truncated": 0, + "non-truncated": 1080, + "padded": 1080, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hashes": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "a29454cc1feb23ef", + "hash_cont_tokens": "f47f041de50333b9" + }, + "truncated": 0, + "non-truncated": 952, + "padded": 952, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_physics|5": { + "hashes": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "b6734a25556d75dc", + "hash_cont_tokens": "ba2efcd283e938cc" + }, + "truncated": 0, + "non-truncated": 604, + "padded": 604, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hashes": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "5720438e29473426", + "hash_cont_tokens": "942069cd363844d9" + }, + "truncated": 0, + "non-truncated": 2180, + "padded": 2180, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hashes": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "486321d5858de240", + "hash_cont_tokens": "955ed42b6f7fa019" + }, + "truncated": 0, + "non-truncated": 864, + "padded": 864, + "non-padded": 0, + 
"effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hashes": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "50c9ff438c85a69e", + "hash_cont_tokens": "cdd0b3dc06d933e5" + }, + "truncated": 816, + "non-truncated": 0, + "padded": 0, + "non-padded": 816, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hashes": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "473919e64d1b8c80", + "hash_cont_tokens": "9a864184946033ac" + }, + "truncated": 8, + "non-truncated": 940, + "padded": 940, + "non-padded": 8, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_aging|5": { + "hashes": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "47a65c81fd7ed010", + "hash_cont_tokens": "142a4a8a1138a214" + }, + "truncated": 0, + "non-truncated": 892, + "padded": 892, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_sexuality|5": { + "hashes": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "aedfcd41cbd2fcc9", + "hash_cont_tokens": "bc54813e809b796d" + }, + "truncated": 0, + "non-truncated": 524, + "padded": 524, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-international_law|5": { + "hashes": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "ed5f2414144d7b72", + "hash_cont_tokens": "dc45b45fcda18e5d" + }, + "truncated": 0, + "non-truncated": 484, + "padded": 484, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-jurisprudence|5": { + "hashes": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "692eaacb5b747264", + "hash_cont_tokens": "e3a8cd951b6e3469" + }, + "truncated": 0, + "non-truncated": 432, + "padded": 432, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hashes": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "2cbce4edca937588", + "hash_cont_tokens": "1e80dbd30f6453d5" + }, + "truncated": 0, + "non-truncated": 652, + "padded": 648, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-machine_learning|5": { + "hashes": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "c2f38b19bab1aa2c", + "hash_cont_tokens": "9b37da7777378ca9" + }, + "truncated": 0, + "non-truncated": 448, + "padded": 448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-management|5": { + "hashes": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "fde277bc547bc3d8", + "hash_cont_tokens": "a01d6d39a83c4597" + }, + "truncated": 0, + "non-truncated": 412, + "padded": 412, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-marketing|5": { + "hashes": { + "hash_examples": "2668953431f91e96", + 
"hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "87b232bbebce39db", + "hash_cont_tokens": "6aeaed4d823c98aa" + }, + "truncated": 0, + "non-truncated": 936, + "padded": 936, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-medical_genetics|5": { + "hashes": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "58c21af9da3e126e", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-miscellaneous|5": { + "hashes": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "d1f5c770d368e9c6", + "hash_cont_tokens": "9b0ab02a64603081" + }, + "truncated": 0, + "non-truncated": 3132, + "padded": 3132, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_disputes|5": { + "hashes": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "98d6db15a50aaa8e", + "hash_cont_tokens": "8badf768f7b0467a" + }, + "truncated": 0, + "non-truncated": 1384, + "padded": 1354, + "non-padded": 30, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hashes": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "2aabd8c7337502f8", + "hash_cont_tokens": "32ae620376b2bbba" + }, + "truncated": 0, + "non-truncated": 3580, + "padded": 3580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-nutrition|5": { + "hashes": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "17f8c8f2d4a0a9b1", + "hash_cont_tokens": "3071def75bacc404" + }, + "truncated": 0, + "non-truncated": 1224, + "padded": 1224, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-philosophy|5": { + "hashes": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "dfc6df491d991966", + "hash_cont_tokens": "9f6ff69d23a48783" + }, + "truncated": 0, + "non-truncated": 1244, + "padded": 1244, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-prehistory|5": { + "hashes": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "cffe8139e00da9dd", + "hash_cont_tokens": "de469d2b981e32a3" + }, + "truncated": 0, + "non-truncated": 1296, + "padded": 1296, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_accounting|5": { + "hashes": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "4a69ed6ee55918fb", + "hash_cont_tokens": "c46f74d2dfc7b13b" + }, + "truncated": 0, + "non-truncated": 1128, + "padded": 1128, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_law|5": { + "hashes": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "6cc713f12b5890de", + "hash_cont_tokens": "2e590029ef41fbcd" + }, + "truncated": 604, + 
"non-truncated": 5532, + "padded": 5524, + "non-padded": 612, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_medicine|5": { + "hashes": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "b4044fc92756c377", + "hash_cont_tokens": "fe35cfa9c6ca802e" + }, + "truncated": 0, + "non-truncated": 1088, + "padded": 1088, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_psychology|5": { + "hashes": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "b019784da8db089a", + "hash_cont_tokens": "f020fbddf72c8652" + }, + "truncated": 0, + "non-truncated": 2448, + "padded": 2448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-public_relations|5": { + "hashes": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "f47f37c7c9bfc601", + "hash_cont_tokens": "568f585a259965c1" + }, + "truncated": 0, + "non-truncated": 440, + "padded": 440, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-security_studies|5": { + "hashes": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "4d282718d6142410", + "hash_cont_tokens": "cc6fd7cccd64cd5d" + }, + "truncated": 0, + "non-truncated": 980, + "padded": 980, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-sociology|5": { + "hashes": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "fbc6026e500537bc", + "hash_cont_tokens": "c3a3bdfd177eed5b" + }, + "truncated": 0, + "non-truncated": 804, + "padded": 804, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hashes": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "150dd1ff81ff642e", + "hash_cont_tokens": "2568d0e8e36fa959" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-virology|5": { + "hashes": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "fcbac3e735545969", + "hash_cont_tokens": "926cf60b0891f374" + }, + "truncated": 0, + "non-truncated": 664, + "padded": 664, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-world_religions|5": { + "hashes": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "ffc962a38441ef13", + "hash_cont_tokens": "c525a5de974c1ea3" + }, + "truncated": 0, + "non-truncated": 684, + "padded": 684, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|truthfulqa:mc|0": { + "hashes": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "9ffb65d225ae550f", + "hash_cont_tokens": "c014154380b74b9e" + }, + "truncated": 0, + "non-truncated": 9996, + "padded": 9996, + "non-padded": 0, + "effective_few_shots": 0.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + 
"hash_examples": "d84d18e9a963753d", + "hash_full_prompts": "12b540783521a8e6", + "hash_input_tokens": "1c61d6705b299f5c", + "hash_cont_tokens": "fb1646e2bdd5fc38" + }, + "total_evaluation_time_secondes": "2650.709557056427", + "truncated": 2088, + "non-truncated": 108931, + "padded": 108834, + "non-padded": 2185, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/mncai/chatdoctor/results_2023-09-17T01-48-31.701330.json b/eval-results/mncai/chatdoctor/results_2023-09-17T01-48-31.701330.json new file mode 100644 index 0000000000000000000000000000000000000000..2bc0112009f4b25075cde316242b750071ad5ace --- /dev/null +++ b/eval-results/mncai/chatdoctor/results_2023-09-17T01-48-31.701330.json @@ -0,0 +1,107 @@ +{ + "config_general": { + "model_name": "mncai/chatdoctor", + "model_sha": "8fdcfdda6877d7f21173dfac48b2c14499ba8264", + "model_size": "12.58 GB", + "model_dtype": "torch.float16", + "lighteval_sha": "0f318ecf002208468154899217b3ba7c6ae09374", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|drop|3": { + "em": 0.22640520134228187, + "em_stderr": 0.004285876197711522, + "f1": 0.3016862416107395, + "f1_stderr": 0.004314877276433696 + }, + "harness|gsm8k|5": { + "acc": 0.0, + "acc_stderr": 0.0 + }, + "harness|winogrande|5": { + "acc": 0.6992896606156275, + "acc_stderr": 0.01288801049470473 + }, + "all": { + "em": 0.22640520134228187, + "em_stderr": 0.004285876197711522, + "f1": 0.3016862416107395, + "f1_stderr": 0.004314877276433696, + "acc": 0.34964483030781374, + "acc_stderr": 0.006444005247352365 + } + }, + "versions": { + "harness|drop|3": 1, + "harness|gsm8k|5": 0, + "harness|winogrande|5": 0, + "all": 0 + }, + "config_tasks": { + "harness|drop": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|drop|3": { + "hashes": { + "hash_examples": "1d27416e8324e9a3", + "hash_full_prompts": "a5513ff9a741b385", + "hash_input_tokens": "f70227603c1b1bfe", + "hash_cont_tokens": "4dae7a162a31f761" + }, + "truncated": 1263, + "non-truncated": 8273, + "padded": 0, + "non-padded": 9536, + "effective_few_shots": 3.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "e3d5b3003c52b880", + "hash_cont_tokens": "db3280af0b7e49ea" + }, + "truncated": 0, + "non-truncated": 1319, + "padded": 0, + "non-padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "5be2b0947cee07a9", + "hash_cont_tokens": "f08975ad6f2d5864" + }, + "truncated": 0, + "non-truncated": 2534, + "padded": 2432, + "non-padded": 102, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "9b4d8993161e637d", + "hash_full_prompts": "08215e527b7e60a5", + "hash_input_tokens": "ce9af2df9f2847fa", + "hash_cont_tokens": "304c98b6059b003d" + }, + "total_evaluation_time_secondes": "7380.074978113174", + "truncated": 1263, + "non-truncated": 12126, + "padded": 2432, + "non-padded": 10957, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/mncai/mistral-7b-dpo-v5/results_2023-12-16T17-06-29.601004.json 
b/eval-results/mncai/mistral-7b-dpo-v5/results_2023-12-16T17-06-29.601004.json new file mode 100644 index 0000000000000000000000000000000000000000..5501019610aa25f9512b0fc19c5d3fdc140266bd --- /dev/null +++ b/eval-results/mncai/mistral-7b-dpo-v5/results_2023-12-16T17-06-29.601004.json @@ -0,0 +1,1409 @@ +{ + "config_general": { + "lighteval_sha": "0e4607eff593f6f842aeaa0e5fa6760f58b9d1e9", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "", + "start_time": 370332.754650717, + "end_time": 377343.66217463, + "total_evaluation_time_secondes": "7010.90752391296", + "model_name": "mncai/mistral-7b-dpo-v5", + "model_sha": "8108f313d878ce848ceceeaf55ce8b3ecaaee792", + "model_dtype": "torch.float16", + "model_size": "13.99 GB" + }, + "results": { + "harness|arc:challenge|25": { + "acc": 0.6911262798634812, + "acc_stderr": 0.013501770929344, + "acc_norm": 0.7201365187713311, + "acc_norm_stderr": 0.01311904089772592 + }, + "harness|hellaswag|10": { + "acc": 0.6978689504082852, + "acc_stderr": 0.004582433109636476, + "acc_norm": 0.8757219677355108, + "acc_norm_stderr": 0.003292242543637345 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.34, + "acc_stderr": 0.04760952285695235, + "acc_norm": 0.34, + "acc_norm_stderr": 0.04760952285695235 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.6370370370370371, + "acc_stderr": 0.04153948404742398, + "acc_norm": 0.6370370370370371, + "acc_norm_stderr": 0.04153948404742398 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.6907894736842105, + "acc_stderr": 0.037610708698674805, + "acc_norm": 0.6907894736842105, + "acc_norm_stderr": 0.037610708698674805 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.61, + "acc_stderr": 0.04902071300001975, + "acc_norm": 0.61, + "acc_norm_stderr": 0.04902071300001975 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.6943396226415094, + "acc_stderr": 0.028353298073322666, + "acc_norm": 0.6943396226415094, + "acc_norm_stderr": 0.028353298073322666 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.7361111111111112, + "acc_stderr": 0.03685651095897532, + "acc_norm": 0.7361111111111112, + "acc_norm_stderr": 0.03685651095897532 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.44, + "acc_stderr": 0.04988876515698589, + "acc_norm": 0.44, + "acc_norm_stderr": 0.04988876515698589 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.55, + "acc_stderr": 0.05, + "acc_norm": 0.55, + "acc_norm_stderr": 0.05 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.29, + "acc_stderr": 0.04560480215720684, + "acc_norm": 0.29, + "acc_norm_stderr": 0.04560480215720684 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.6705202312138728, + "acc_stderr": 0.03583901754736412, + "acc_norm": 0.6705202312138728, + "acc_norm_stderr": 0.03583901754736412 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.4117647058823529, + "acc_stderr": 0.048971049527263666, + "acc_norm": 0.4117647058823529, + "acc_norm_stderr": 0.048971049527263666 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.78, + "acc_stderr": 0.04163331998932263, + "acc_norm": 0.78, + "acc_norm_stderr": 0.04163331998932263 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.5531914893617021, + "acc_stderr": 0.0325005368436584, + "acc_norm": 0.5531914893617021, + "acc_norm_stderr": 0.0325005368436584 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 
0.47368421052631576, + "acc_stderr": 0.046970851366478626, + "acc_norm": 0.47368421052631576, + "acc_norm_stderr": 0.046970851366478626 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.5724137931034483, + "acc_stderr": 0.04122737111370332, + "acc_norm": 0.5724137931034483, + "acc_norm_stderr": 0.04122737111370332 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.42063492063492064, + "acc_stderr": 0.025424835086923996, + "acc_norm": 0.42063492063492064, + "acc_norm_stderr": 0.025424835086923996 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.4365079365079365, + "acc_stderr": 0.04435932892851466, + "acc_norm": 0.4365079365079365, + "acc_norm_stderr": 0.04435932892851466 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.33, + "acc_stderr": 0.04725815626252604, + "acc_norm": 0.33, + "acc_norm_stderr": 0.04725815626252604 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.7935483870967742, + "acc_stderr": 0.023025899617188716, + "acc_norm": 0.7935483870967742, + "acc_norm_stderr": 0.023025899617188716 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.49261083743842365, + "acc_stderr": 0.035176035403610084, + "acc_norm": 0.49261083743842365, + "acc_norm_stderr": 0.035176035403610084 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.7, + "acc_stderr": 0.046056618647183814, + "acc_norm": 0.7, + "acc_norm_stderr": 0.046056618647183814 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.7696969696969697, + "acc_stderr": 0.0328766675860349, + "acc_norm": 0.7696969696969697, + "acc_norm_stderr": 0.0328766675860349 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.7929292929292929, + "acc_stderr": 0.028869778460267042, + "acc_norm": 0.7929292929292929, + "acc_norm_stderr": 0.028869778460267042 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.8808290155440415, + "acc_stderr": 0.02338193534812143, + "acc_norm": 0.8808290155440415, + "acc_norm_stderr": 0.02338193534812143 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.6487179487179487, + "acc_stderr": 0.024203665177902803, + "acc_norm": 0.6487179487179487, + "acc_norm_stderr": 0.024203665177902803 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.3037037037037037, + "acc_stderr": 0.028037929969114993, + "acc_norm": 0.3037037037037037, + "acc_norm_stderr": 0.028037929969114993 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.634453781512605, + "acc_stderr": 0.031282177063684614, + "acc_norm": 0.634453781512605, + "acc_norm_stderr": 0.031282177063684614 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.31125827814569534, + "acc_stderr": 0.03780445850526732, + "acc_norm": 0.31125827814569534, + "acc_norm_stderr": 0.03780445850526732 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.8403669724770643, + "acc_stderr": 0.015703498348461766, + "acc_norm": 0.8403669724770643, + "acc_norm_stderr": 0.015703498348461766 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.49537037037037035, + "acc_stderr": 0.03409825519163572, + "acc_norm": 0.49537037037037035, + "acc_norm_stderr": 0.03409825519163572 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.8088235294117647, + "acc_stderr": 0.027599174300640766, + "acc_norm": 0.8088235294117647, + "acc_norm_stderr": 0.027599174300640766 + }, + 
"harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.7763713080168776, + "acc_stderr": 0.027123298205229962, + "acc_norm": 0.7763713080168776, + "acc_norm_stderr": 0.027123298205229962 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.695067264573991, + "acc_stderr": 0.030898610882477515, + "acc_norm": 0.695067264573991, + "acc_norm_stderr": 0.030898610882477515 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.7786259541984732, + "acc_stderr": 0.03641297081313729, + "acc_norm": 0.7786259541984732, + "acc_norm_stderr": 0.03641297081313729 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.768595041322314, + "acc_stderr": 0.038498560987940904, + "acc_norm": 0.768595041322314, + "acc_norm_stderr": 0.038498560987940904 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.7592592592592593, + "acc_stderr": 0.04133119440243839, + "acc_norm": 0.7592592592592593, + "acc_norm_stderr": 0.04133119440243839 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.7484662576687117, + "acc_stderr": 0.03408997886857529, + "acc_norm": 0.7484662576687117, + "acc_norm_stderr": 0.03408997886857529 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.4642857142857143, + "acc_stderr": 0.04733667890053756, + "acc_norm": 0.4642857142857143, + "acc_norm_stderr": 0.04733667890053756 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.7864077669902912, + "acc_stderr": 0.040580420156460344, + "acc_norm": 0.7864077669902912, + "acc_norm_stderr": 0.040580420156460344 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.8846153846153846, + "acc_stderr": 0.020930193185179326, + "acc_norm": 0.8846153846153846, + "acc_norm_stderr": 0.020930193185179326 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.68, + "acc_stderr": 0.046882617226215034, + "acc_norm": 0.68, + "acc_norm_stderr": 0.046882617226215034 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.8301404853128991, + "acc_stderr": 0.013428186370608311, + "acc_norm": 0.8301404853128991, + "acc_norm_stderr": 0.013428186370608311 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.7398843930635838, + "acc_stderr": 0.023618678310069356, + "acc_norm": 0.7398843930635838, + "acc_norm_stderr": 0.023618678310069356 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.4480446927374302, + "acc_stderr": 0.016631976628930595, + "acc_norm": 0.4480446927374302, + "acc_norm_stderr": 0.016631976628930595 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.7189542483660131, + "acc_stderr": 0.025738854797818737, + "acc_norm": 0.7189542483660131, + "acc_norm_stderr": 0.025738854797818737 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.7138263665594855, + "acc_stderr": 0.025670259242188933, + "acc_norm": 0.7138263665594855, + "acc_norm_stderr": 0.025670259242188933 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.75, + "acc_stderr": 0.02409347123262133, + "acc_norm": 0.75, + "acc_norm_stderr": 0.02409347123262133 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.4858156028368794, + "acc_stderr": 0.02981549448368206, + "acc_norm": 0.4858156028368794, + "acc_norm_stderr": 0.02981549448368206 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.46479791395045633, + "acc_stderr": 0.012738547371303957, + "acc_norm": 0.46479791395045633, + "acc_norm_stderr": 0.012738547371303957 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.6544117647058824, + "acc_stderr": 0.02888819310398863, + 
"acc_norm": 0.6544117647058824, + "acc_norm_stderr": 0.02888819310398863 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.6617647058823529, + "acc_stderr": 0.01913994374848704, + "acc_norm": 0.6617647058823529, + "acc_norm_stderr": 0.01913994374848704 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.6545454545454545, + "acc_stderr": 0.04554619617541054, + "acc_norm": 0.6545454545454545, + "acc_norm_stderr": 0.04554619617541054 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.7183673469387755, + "acc_stderr": 0.028795185574291293, + "acc_norm": 0.7183673469387755, + "acc_norm_stderr": 0.028795185574291293 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.8407960199004975, + "acc_stderr": 0.025870646766169146, + "acc_norm": 0.8407960199004975, + "acc_norm_stderr": 0.025870646766169146 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.89, + "acc_stderr": 0.03144660377352202, + "acc_norm": 0.89, + "acc_norm_stderr": 0.03144660377352202 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.5481927710843374, + "acc_stderr": 0.03874371556587953, + "acc_norm": 0.5481927710843374, + "acc_norm_stderr": 0.03874371556587953 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.8245614035087719, + "acc_stderr": 0.029170885500727665, + "acc_norm": 0.8245614035087719, + "acc_norm_stderr": 0.029170885500727665 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.5385556915544676, + "mc1_stderr": 0.017451384104637452, + "mc2": 0.6686090881936299, + "mc2_stderr": 0.015322918299770005 + }, + "harness|winogrande|5": { + "acc": 0.8224151539068666, + "acc_stderr": 0.010740676861359237 + }, + "harness|gsm8k|5": { + "acc": 0.7065959059893859, + "acc_stderr": 0.012541830815461487 + }, + "all": { + "acc": 0.6444771644137858, + "acc_stderr": 0.032189712118428256, + "acc_norm": 0.6439145631454195, + "acc_norm_stderr": 0.03285791543982518, + "mc1": 0.5385556915544676, + "mc1_stderr": 0.017451384104637452, + "mc2": 0.6686090881936299, + "mc2_stderr": 0.015322918299770005 + } + }, + "versions": { + "all": 0, + "harness|arc:challenge|25": 0, + "harness|gsm8k|5": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + 
"harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "harness|winogrande|5": 0 + }, + "config_tasks": { + "harness|arc:challenge": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + 
"harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|arc:challenge|25": { + "hashes": { + "hash_examples": "17b0cae357c0259e", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "9bcd0d1d37471713", + "hash_cont_tokens": "289aa98c400841d8" + }, + "truncated": 0, + "non_truncated": 1172, + "padded": 4670, + "non_padded": 17, + "effective_few_shots": 25.0, + "num_truncated_few_shots": 0 + }, + "harness|hellaswag|10": { + "hashes": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "80b8c6d79740318e", + "hash_cont_tokens": "ac460260c3e6efc9" + }, + "truncated": 0, + "non_truncated": 10042, + "padded": 40101, + "non_padded": 67, + "effective_few_shots": 10.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hashes": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "b813d36287c6556c", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-anatomy|5": { + "hashes": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "09dc2380497f7a47", + "hash_cont_tokens": "a52a4f60d98cbe5c" + }, + "truncated": 0, + "non_truncated": 135, + "padded": 540, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-astronomy|5": { + "hashes": { + "hash_examples": 
"7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "68ca3220b0fdd1f3", + "hash_cont_tokens": "10f7d8eeba97841d" + }, + "truncated": 0, + "non_truncated": 152, + "padded": 608, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-business_ethics|5": { + "hashes": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "bd14ef1320de241e", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hashes": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "d96186ab98017c43", + "hash_cont_tokens": "edef9975ba9165b5" + }, + "truncated": 0, + "non_truncated": 265, + "padded": 1060, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_biology|5": { + "hashes": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "424136b34e95b200", + "hash_cont_tokens": "0aa103ec6602280b" + }, + "truncated": 0, + "non_truncated": 144, + "padded": 576, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_chemistry|5": { + "hashes": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "8dd8b80e336bbe54", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_computer_science|5": { + "hashes": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "145d4cef8ca2261d", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_mathematics|5": { + "hashes": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "561995d32d2b25c4", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_medicine|5": { + "hashes": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "6a258a9d4418599c", + "hash_cont_tokens": "1979021dbc698754" + }, + "truncated": 0, + "non_truncated": 173, + "padded": 692, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_physics|5": { + "hashes": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "fa5e0d5b5f97b66a", + "hash_cont_tokens": "7cf7fe2bab00acbd" + }, + "truncated": 0, + "non_truncated": 102, + "padded": 408, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-computer_security|5": { + "hashes": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "07d27397edfae492", + "hash_cont_tokens": "17b868b63507f9a3" + 
}, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hashes": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "da5e6c3c8eb17da6", + "hash_cont_tokens": "903f64eed2b0d217" + }, + "truncated": 0, + "non_truncated": 235, + "padded": 940, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-econometrics|5": { + "hashes": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "f6ba8e358bdb523e", + "hash_cont_tokens": "721ae6c5302c4bf2" + }, + "truncated": 0, + "non_truncated": 114, + "padded": 456, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hashes": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "b2459da4c5ca8590", + "hash_cont_tokens": "15a738960ed3e587" + }, + "truncated": 0, + "non_truncated": 145, + "padded": 575, + "non_padded": 5, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hashes": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "0b969d9ad706a13a", + "hash_cont_tokens": "c96470462fc71683" + }, + "truncated": 0, + "non_truncated": 378, + "padded": 1512, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-formal_logic|5": { + "hashes": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "02bc3eb5f90da86e", + "hash_cont_tokens": "0e1ce025c9d6ee7e" + }, + "truncated": 0, + "non_truncated": 126, + "padded": 504, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-global_facts|5": { + "hashes": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "3d5106918bcbeb43", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_biology|5": { + "hashes": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "7b089392db2dabbd", + "hash_cont_tokens": "e34d57f7d3c4ca16" + }, + "truncated": 0, + "non_truncated": 310, + "padded": 1240, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hashes": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "ba90b2ffed1c067d", + "hash_cont_tokens": "e8482d44df4b3740" + }, + "truncated": 0, + "non_truncated": 203, + "padded": 812, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hashes": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "60eeec309ef0717f", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + 
"num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hashes": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "5e5e8bf3808e0ead", + "hash_cont_tokens": "d63e679a49418339" + }, + "truncated": 0, + "non_truncated": 165, + "padded": 656, + "non_padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_geography|5": { + "hashes": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "4da9b741d4e7ea78", + "hash_cont_tokens": "d78483e286d06f1a" + }, + "truncated": 0, + "non_truncated": 198, + "padded": 792, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hashes": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "acb4bc872ac86ed7", + "hash_cont_tokens": "691cdff71ff5fe57" + }, + "truncated": 0, + "non_truncated": 193, + "padded": 772, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hashes": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "840fc6403eb69ab0", + "hash_cont_tokens": "d5ad4c5bdca967ad" + }, + "truncated": 0, + "non_truncated": 390, + "padded": 1560, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hashes": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "3629a7f2cd17faeb", + "hash_cont_tokens": "8f631ca5687dd0d4" + }, + "truncated": 0, + "non_truncated": 270, + "padded": 1080, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hashes": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "6846f684260e3997", + "hash_cont_tokens": "7321048a28451473" + }, + "truncated": 0, + "non_truncated": 238, + "padded": 952, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_physics|5": { + "hashes": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "85aee25d6bdad94a", + "hash_cont_tokens": "bb137581f269861c" + }, + "truncated": 0, + "non_truncated": 151, + "padded": 604, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hashes": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "290b66d6d666a35f", + "hash_cont_tokens": "b455cab2675bd863" + }, + "truncated": 0, + "non_truncated": 545, + "padded": 2180, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hashes": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "a77a7668b437bc82", + "hash_cont_tokens": "1b3196fec7e58037" + }, + "truncated": 0, + "non_truncated": 216, + "padded": 864, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + 
"harness|hendrycksTest-high_school_us_history|5": { + "hashes": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "63548c7fa9ba7a78", + "hash_cont_tokens": "a331dedc2aa01b3e" + }, + "truncated": 0, + "non_truncated": 204, + "padded": 816, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hashes": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "83c5da18bfa50812", + "hash_cont_tokens": "d0fbe030b8c8c2bf" + }, + "truncated": 0, + "non_truncated": 237, + "padded": 948, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_aging|5": { + "hashes": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "bebbd11f22006685", + "hash_cont_tokens": "1dd29c3755494850" + }, + "truncated": 0, + "non_truncated": 223, + "padded": 892, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_sexuality|5": { + "hashes": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "7b85ee9b8ee54f4f", + "hash_cont_tokens": "c85573f663c10691" + }, + "truncated": 0, + "non_truncated": 131, + "padded": 524, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-international_law|5": { + "hashes": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "7bfc55ab7065943e", + "hash_cont_tokens": "d263804ba918154f" + }, + "truncated": 0, + "non_truncated": 121, + "padded": 484, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-jurisprudence|5": { + "hashes": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "69573f1675e053c6", + "hash_cont_tokens": "581986691a84ece8" + }, + "truncated": 0, + "non_truncated": 108, + "padded": 432, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hashes": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "552324ef20094bdc", + "hash_cont_tokens": "55a858b28bbda458" + }, + "truncated": 0, + "non_truncated": 163, + "padded": 652, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-machine_learning|5": { + "hashes": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "96449357a7318905", + "hash_cont_tokens": "e99d3d3efd4ac7a3" + }, + "truncated": 0, + "non_truncated": 112, + "padded": 448, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-management|5": { + "hashes": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "3b849249168e3b88", + "hash_cont_tokens": "13d9dc56bca34726" + }, + "truncated": 0, + "non_truncated": 103, + "padded": 412, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-marketing|5": { + "hashes": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": 
"af0e186f2756b70d", + "hash_cont_tokens": "2700ea26933916a2" + }, + "truncated": 0, + "non_truncated": 234, + "padded": 936, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-medical_genetics|5": { + "hashes": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "9f6a6de16509b6d9", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-miscellaneous|5": { + "hashes": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "9194406d589f7c10", + "hash_cont_tokens": "7bf4341c79587250" + }, + "truncated": 0, + "non_truncated": 783, + "padded": 3132, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_disputes|5": { + "hashes": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "769486efc74d9f8e", + "hash_cont_tokens": "38a48e9de6976f00" + }, + "truncated": 0, + "non_truncated": 346, + "padded": 1384, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hashes": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "a90fd4dd90959dad", + "hash_cont_tokens": "761c4dc187689d89" + }, + "truncated": 0, + "non_truncated": 895, + "padded": 3580, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-nutrition|5": { + "hashes": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "1a3b843e66efd29b", + "hash_cont_tokens": "65005bd7d6f6012a" + }, + "truncated": 0, + "non_truncated": 306, + "padded": 1224, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-philosophy|5": { + "hashes": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "09820001a3d00013", + "hash_cont_tokens": "0b47934fb6314dec" + }, + "truncated": 0, + "non_truncated": 311, + "padded": 1244, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-prehistory|5": { + "hashes": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "7c4ec364ce2768c7", + "hash_cont_tokens": "3f20acd855ee0a29" + }, + "truncated": 0, + "non_truncated": 324, + "padded": 1296, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_accounting|5": { + "hashes": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "ced0534574d0ae3f", + "hash_cont_tokens": "8f122ba881355d4b" + }, + "truncated": 0, + "non_truncated": 282, + "padded": 1128, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_law|5": { + "hashes": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "bcbdbbde22ec73e3", + "hash_cont_tokens": "90d5df417c4d3fd3" + }, + "truncated": 0, + "non_truncated": 1534, + "padded": 6136, + "non_padded": 0, + "effective_few_shots": 
5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_medicine|5": { + "hashes": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "c54d753563114d45", + "hash_cont_tokens": "4a2d2988884f7f70" + }, + "truncated": 0, + "non_truncated": 272, + "padded": 1088, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_psychology|5": { + "hashes": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "b75dc55c0e32fa52", + "hash_cont_tokens": "e0a952cb8a9c81de" + }, + "truncated": 0, + "non_truncated": 612, + "padded": 2448, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-public_relations|5": { + "hashes": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "5ccdc8ec8db99622", + "hash_cont_tokens": "1fa77a8dff3922b8" + }, + "truncated": 0, + "non_truncated": 110, + "padded": 440, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-security_studies|5": { + "hashes": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "ca8497342e5b1d57", + "hash_cont_tokens": "81fc9cb3cbdd52db" + }, + "truncated": 0, + "non_truncated": 245, + "padded": 980, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-sociology|5": { + "hashes": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "069c76424fbd3dab", + "hash_cont_tokens": "2a0493252ed2cf43" + }, + "truncated": 0, + "non_truncated": 201, + "padded": 804, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hashes": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "a7e393a626169576", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-virology|5": { + "hashes": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "bf99dc973e3a650d", + "hash_cont_tokens": "5ab892d003b00c98" + }, + "truncated": 0, + "non_truncated": 166, + "padded": 664, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-world_religions|5": { + "hashes": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "1761cfaf21797065", + "hash_cont_tokens": "15a5e5dbdfbb8568" + }, + "truncated": 0, + "non_truncated": 171, + "padded": 684, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|truthfulqa:mc|0": { + "hashes": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "298b43914bbdf4ca", + "hash_cont_tokens": "5a8d4bb398b1c3c0" + }, + "truncated": 0, + "non_truncated": 817, + "padded": 9996, + "non_padded": 0, + "effective_few_shots": 0.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + 
"hash_input_tokens": "31aa3477d959f771", + "hash_cont_tokens": "618558fb93c0f288" + }, + "truncated": 0, + "non_truncated": 1267, + "padded": 2534, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "6af0ae8cfe684f50", + "hash_cont_tokens": "006e282f3a8aa57f" + }, + "truncated": 0, + "non_truncated": 1319, + "padded": 0, + "non_padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "3b7fa57a057f9415", + "hash_full_prompts": "63615fc50fc9417c", + "hash_input_tokens": "9c04e828ae29cacc", + "hash_cont_tokens": "db441e26c93360d3" + }, + "truncated": 0, + "non_truncated": 28659, + "padded": 113460, + "non_padded": 1412, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/mncai/mistral-7b-dpo-v6/results_2023-12-16T20-12-40.545634.json b/eval-results/mncai/mistral-7b-dpo-v6/results_2023-12-16T20-12-40.545634.json new file mode 100644 index 0000000000000000000000000000000000000000..c45e1f15cf7255f099a88d5e29b96edc5cb87055 --- /dev/null +++ b/eval-results/mncai/mistral-7b-dpo-v6/results_2023-12-16T20-12-40.545634.json @@ -0,0 +1,1409 @@ +{ + "config_general": { + "lighteval_sha": "0e4607eff593f6f842aeaa0e5fa6760f58b9d1e9", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "", + "start_time": 381519.258111362, + "end_time": 388514.620079951, + "total_evaluation_time_secondes": "6995.361968589015", + "model_name": "mncai/mistral-7b-dpo-v6", + "model_sha": "206be3fd589dd62817343c53525ab7fb1b752faf", + "model_dtype": "torch.float16", + "model_size": "13.99 GB" + }, + "results": { + "harness|arc:challenge|25": { + "acc": 0.6919795221843004, + "acc_stderr": 0.013491429517292038, + "acc_norm": 0.7252559726962458, + "acc_norm_stderr": 0.013044617212771227 + }, + "harness|hellaswag|10": { + "acc": 0.7074287990440151, + "acc_stderr": 0.004540134005060321, + "acc_norm": 0.8809998008364868, + "acc_norm_stderr": 0.003231270127834668 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.33, + "acc_stderr": 0.04725815626252605, + "acc_norm": 0.33, + "acc_norm_stderr": 0.04725815626252605 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.6666666666666666, + "acc_stderr": 0.04072314811876837, + "acc_norm": 0.6666666666666666, + "acc_norm_stderr": 0.04072314811876837 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.6973684210526315, + "acc_stderr": 0.037385206761196686, + "acc_norm": 0.6973684210526315, + "acc_norm_stderr": 0.037385206761196686 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.6, + "acc_stderr": 0.04923659639173309, + "acc_norm": 0.6, + "acc_norm_stderr": 0.04923659639173309 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.7245283018867924, + "acc_stderr": 0.027495663683724053, + "acc_norm": 0.7245283018867924, + "acc_norm_stderr": 0.027495663683724053 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.7708333333333334, + "acc_stderr": 0.03514697467862388, + "acc_norm": 0.7708333333333334, + "acc_norm_stderr": 0.03514697467862388 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.45, + "acc_stderr": 0.05, + "acc_norm": 0.45, + "acc_norm_stderr": 0.05 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.55, + "acc_stderr": 0.05, + 
"acc_norm": 0.55, + "acc_norm_stderr": 0.05 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.3, + "acc_stderr": 0.046056618647183814, + "acc_norm": 0.3, + "acc_norm_stderr": 0.046056618647183814 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.6763005780346821, + "acc_stderr": 0.035676037996391706, + "acc_norm": 0.6763005780346821, + "acc_norm_stderr": 0.035676037996391706 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.4411764705882353, + "acc_stderr": 0.049406356306056595, + "acc_norm": 0.4411764705882353, + "acc_norm_stderr": 0.049406356306056595 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.78, + "acc_stderr": 0.04163331998932263, + "acc_norm": 0.78, + "acc_norm_stderr": 0.04163331998932263 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.5574468085106383, + "acc_stderr": 0.03246956919789958, + "acc_norm": 0.5574468085106383, + "acc_norm_stderr": 0.03246956919789958 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.5087719298245614, + "acc_stderr": 0.04702880432049615, + "acc_norm": 0.5087719298245614, + "acc_norm_stderr": 0.04702880432049615 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.5655172413793104, + "acc_stderr": 0.04130740879555498, + "acc_norm": 0.5655172413793104, + "acc_norm_stderr": 0.04130740879555498 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.42328042328042326, + "acc_stderr": 0.025446365634406783, + "acc_norm": 0.42328042328042326, + "acc_norm_stderr": 0.025446365634406783 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.4603174603174603, + "acc_stderr": 0.04458029125470973, + "acc_norm": 0.4603174603174603, + "acc_norm_stderr": 0.04458029125470973 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.37, + "acc_stderr": 0.048523658709391, + "acc_norm": 0.37, + "acc_norm_stderr": 0.048523658709391 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.7838709677419354, + "acc_stderr": 0.02341529343356853, + "acc_norm": 0.7838709677419354, + "acc_norm_stderr": 0.02341529343356853 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.5221674876847291, + "acc_stderr": 0.03514528562175007, + "acc_norm": 0.5221674876847291, + "acc_norm_stderr": 0.03514528562175007 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.71, + "acc_stderr": 0.045604802157206845, + "acc_norm": 0.71, + "acc_norm_stderr": 0.045604802157206845 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.7757575757575758, + "acc_stderr": 0.03256866661681102, + "acc_norm": 0.7757575757575758, + "acc_norm_stderr": 0.03256866661681102 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.7878787878787878, + "acc_stderr": 0.029126522834586818, + "acc_norm": 0.7878787878787878, + "acc_norm_stderr": 0.029126522834586818 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.8911917098445595, + "acc_stderr": 0.022473253332768766, + "acc_norm": 0.8911917098445595, + "acc_norm_stderr": 0.022473253332768766 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.6615384615384615, + "acc_stderr": 0.023991500500313036, + "acc_norm": 0.6615384615384615, + "acc_norm_stderr": 0.023991500500313036 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.31851851851851853, + "acc_stderr": 0.028406533090608456, + "acc_norm": 0.31851851851851853, + "acc_norm_stderr": 0.028406533090608456 + }, + 
"harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.6470588235294118, + "acc_stderr": 0.031041941304059278, + "acc_norm": 0.6470588235294118, + "acc_norm_stderr": 0.031041941304059278 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.31125827814569534, + "acc_stderr": 0.03780445850526732, + "acc_norm": 0.31125827814569534, + "acc_norm_stderr": 0.03780445850526732 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.8477064220183487, + "acc_stderr": 0.015405084393157074, + "acc_norm": 0.8477064220183487, + "acc_norm_stderr": 0.015405084393157074 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.5185185185185185, + "acc_stderr": 0.034076320938540516, + "acc_norm": 0.5185185185185185, + "acc_norm_stderr": 0.034076320938540516 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.8284313725490197, + "acc_stderr": 0.026460569561240644, + "acc_norm": 0.8284313725490197, + "acc_norm_stderr": 0.026460569561240644 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.7848101265822784, + "acc_stderr": 0.02675082699467617, + "acc_norm": 0.7848101265822784, + "acc_norm_stderr": 0.02675082699467617 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.6860986547085202, + "acc_stderr": 0.031146796482972465, + "acc_norm": 0.6860986547085202, + "acc_norm_stderr": 0.031146796482972465 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.816793893129771, + "acc_stderr": 0.03392770926494733, + "acc_norm": 0.816793893129771, + "acc_norm_stderr": 0.03392770926494733 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.8016528925619835, + "acc_stderr": 0.03640118271990947, + "acc_norm": 0.8016528925619835, + "acc_norm_stderr": 0.03640118271990947 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.7592592592592593, + "acc_stderr": 0.04133119440243839, + "acc_norm": 0.7592592592592593, + "acc_norm_stderr": 0.04133119440243839 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.7730061349693251, + "acc_stderr": 0.03291099578615769, + "acc_norm": 0.7730061349693251, + "acc_norm_stderr": 0.03291099578615769 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.41964285714285715, + "acc_stderr": 0.04684099321077106, + "acc_norm": 0.41964285714285715, + "acc_norm_stderr": 0.04684099321077106 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.7766990291262136, + "acc_stderr": 0.04123553189891431, + "acc_norm": 0.7766990291262136, + "acc_norm_stderr": 0.04123553189891431 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.8760683760683761, + "acc_stderr": 0.021586494001281372, + "acc_norm": 0.8760683760683761, + "acc_norm_stderr": 0.021586494001281372 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.69, + "acc_stderr": 0.04648231987117316, + "acc_norm": 0.69, + "acc_norm_stderr": 0.04648231987117316 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.8275862068965517, + "acc_stderr": 0.013507943909371802, + "acc_norm": 0.8275862068965517, + "acc_norm_stderr": 0.013507943909371802 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.7398843930635838, + "acc_stderr": 0.023618678310069356, + "acc_norm": 0.7398843930635838, + "acc_norm_stderr": 0.023618678310069356 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.4782122905027933, + "acc_stderr": 0.016706617522176132, + "acc_norm": 0.4782122905027933, + "acc_norm_stderr": 0.016706617522176132 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 
0.7222222222222222, + "acc_stderr": 0.025646863097137894, + "acc_norm": 0.7222222222222222, + "acc_norm_stderr": 0.025646863097137894 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.7170418006430869, + "acc_stderr": 0.02558306248998481, + "acc_norm": 0.7170418006430869, + "acc_norm_stderr": 0.02558306248998481 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.7407407407407407, + "acc_stderr": 0.02438366553103545, + "acc_norm": 0.7407407407407407, + "acc_norm_stderr": 0.02438366553103545 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.4716312056737589, + "acc_stderr": 0.02977945095730307, + "acc_norm": 0.4716312056737589, + "acc_norm_stderr": 0.02977945095730307 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.46870925684485004, + "acc_stderr": 0.012745204626083136, + "acc_norm": 0.46870925684485004, + "acc_norm_stderr": 0.012745204626083136 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.6875, + "acc_stderr": 0.02815637344037142, + "acc_norm": 0.6875, + "acc_norm_stderr": 0.02815637344037142 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.6683006535947712, + "acc_stderr": 0.01904748523936038, + "acc_norm": 0.6683006535947712, + "acc_norm_stderr": 0.01904748523936038 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.6818181818181818, + "acc_stderr": 0.044612721759105085, + "acc_norm": 0.6818181818181818, + "acc_norm_stderr": 0.044612721759105085 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.7387755102040816, + "acc_stderr": 0.028123429335142773, + "acc_norm": 0.7387755102040816, + "acc_norm_stderr": 0.028123429335142773 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.835820895522388, + "acc_stderr": 0.026193923544454115, + "acc_norm": 0.835820895522388, + "acc_norm_stderr": 0.026193923544454115 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.87, + "acc_stderr": 0.03379976689896309, + "acc_norm": 0.87, + "acc_norm_stderr": 0.03379976689896309 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.5421686746987951, + "acc_stderr": 0.038786267710023595, + "acc_norm": 0.5421686746987951, + "acc_norm_stderr": 0.038786267710023595 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.8187134502923976, + "acc_stderr": 0.029547741687640044, + "acc_norm": 0.8187134502923976, + "acc_norm_stderr": 0.029547741687640044 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.5483476132190942, + "mc1_stderr": 0.01742148030027764, + "mc2": 0.6823900041375971, + "mc2_stderr": 0.015243369336298 + }, + "harness|winogrande|5": { + "acc": 0.8255722178374112, + "acc_stderr": 0.010665187902498435 + }, + "harness|gsm8k|5": { + "acc": 0.7088703563305534, + "acc_stderr": 0.012513215297888463 + }, + "all": { + "acc": 0.6525100354141533, + "acc_stderr": 0.032048485515607635, + "acc_norm": 0.6521274074305052, + "acc_norm_stderr": 0.03271226334016835, + "mc1": 0.5483476132190942, + "mc1_stderr": 0.01742148030027764, + "mc2": 0.6823900041375971, + "mc2_stderr": 0.015243369336298 + } + }, + "versions": { + "all": 0, + "harness|arc:challenge|25": 0, + "harness|gsm8k|5": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + 
"harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "harness|winogrande|5": 0 + }, + "config_tasks": { + "harness|arc:challenge": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness 
task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|arc:challenge|25": { + "hashes": { + "hash_examples": "17b0cae357c0259e", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "9bcd0d1d37471713", + "hash_cont_tokens": "289aa98c400841d8" + }, + "truncated": 0, + "non_truncated": 1172, + "padded": 4670, + "non_padded": 17, + "effective_few_shots": 25.0, + "num_truncated_few_shots": 0 + }, + "harness|hellaswag|10": { + "hashes": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "80b8c6d79740318e", + "hash_cont_tokens": "ac460260c3e6efc9" + }, + 
"truncated": 0, + "non_truncated": 10042, + "padded": 40101, + "non_padded": 67, + "effective_few_shots": 10.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hashes": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "b813d36287c6556c", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-anatomy|5": { + "hashes": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "09dc2380497f7a47", + "hash_cont_tokens": "a52a4f60d98cbe5c" + }, + "truncated": 0, + "non_truncated": 135, + "padded": 540, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-astronomy|5": { + "hashes": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "68ca3220b0fdd1f3", + "hash_cont_tokens": "10f7d8eeba97841d" + }, + "truncated": 0, + "non_truncated": 152, + "padded": 608, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-business_ethics|5": { + "hashes": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "bd14ef1320de241e", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hashes": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "d96186ab98017c43", + "hash_cont_tokens": "edef9975ba9165b5" + }, + "truncated": 0, + "non_truncated": 265, + "padded": 1060, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_biology|5": { + "hashes": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "424136b34e95b200", + "hash_cont_tokens": "0aa103ec6602280b" + }, + "truncated": 0, + "non_truncated": 144, + "padded": 576, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_chemistry|5": { + "hashes": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "8dd8b80e336bbe54", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_computer_science|5": { + "hashes": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "145d4cef8ca2261d", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_mathematics|5": { + "hashes": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "561995d32d2b25c4", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + 
"harness|hendrycksTest-college_medicine|5": { + "hashes": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "6a258a9d4418599c", + "hash_cont_tokens": "1979021dbc698754" + }, + "truncated": 0, + "non_truncated": 173, + "padded": 692, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_physics|5": { + "hashes": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "fa5e0d5b5f97b66a", + "hash_cont_tokens": "7cf7fe2bab00acbd" + }, + "truncated": 0, + "non_truncated": 102, + "padded": 408, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-computer_security|5": { + "hashes": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "07d27397edfae492", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hashes": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "da5e6c3c8eb17da6", + "hash_cont_tokens": "903f64eed2b0d217" + }, + "truncated": 0, + "non_truncated": 235, + "padded": 940, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-econometrics|5": { + "hashes": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "f6ba8e358bdb523e", + "hash_cont_tokens": "721ae6c5302c4bf2" + }, + "truncated": 0, + "non_truncated": 114, + "padded": 456, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hashes": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "b2459da4c5ca8590", + "hash_cont_tokens": "15a738960ed3e587" + }, + "truncated": 0, + "non_truncated": 145, + "padded": 575, + "non_padded": 5, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hashes": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "0b969d9ad706a13a", + "hash_cont_tokens": "c96470462fc71683" + }, + "truncated": 0, + "non_truncated": 378, + "padded": 1512, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-formal_logic|5": { + "hashes": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "02bc3eb5f90da86e", + "hash_cont_tokens": "0e1ce025c9d6ee7e" + }, + "truncated": 0, + "non_truncated": 126, + "padded": 504, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-global_facts|5": { + "hashes": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "3d5106918bcbeb43", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_biology|5": { + "hashes": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + 
"hash_input_tokens": "7b089392db2dabbd", + "hash_cont_tokens": "e34d57f7d3c4ca16" + }, + "truncated": 0, + "non_truncated": 310, + "padded": 1240, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hashes": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "ba90b2ffed1c067d", + "hash_cont_tokens": "e8482d44df4b3740" + }, + "truncated": 0, + "non_truncated": 203, + "padded": 812, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hashes": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "60eeec309ef0717f", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hashes": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "5e5e8bf3808e0ead", + "hash_cont_tokens": "d63e679a49418339" + }, + "truncated": 0, + "non_truncated": 165, + "padded": 656, + "non_padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_geography|5": { + "hashes": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "4da9b741d4e7ea78", + "hash_cont_tokens": "d78483e286d06f1a" + }, + "truncated": 0, + "non_truncated": 198, + "padded": 792, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hashes": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "acb4bc872ac86ed7", + "hash_cont_tokens": "691cdff71ff5fe57" + }, + "truncated": 0, + "non_truncated": 193, + "padded": 772, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hashes": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "840fc6403eb69ab0", + "hash_cont_tokens": "d5ad4c5bdca967ad" + }, + "truncated": 0, + "non_truncated": 390, + "padded": 1560, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hashes": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "3629a7f2cd17faeb", + "hash_cont_tokens": "8f631ca5687dd0d4" + }, + "truncated": 0, + "non_truncated": 270, + "padded": 1080, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hashes": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "6846f684260e3997", + "hash_cont_tokens": "7321048a28451473" + }, + "truncated": 0, + "non_truncated": 238, + "padded": 952, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_physics|5": { + "hashes": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "85aee25d6bdad94a", + "hash_cont_tokens": 
"bb137581f269861c" + }, + "truncated": 0, + "non_truncated": 151, + "padded": 604, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hashes": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "290b66d6d666a35f", + "hash_cont_tokens": "b455cab2675bd863" + }, + "truncated": 0, + "non_truncated": 545, + "padded": 2180, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hashes": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "a77a7668b437bc82", + "hash_cont_tokens": "1b3196fec7e58037" + }, + "truncated": 0, + "non_truncated": 216, + "padded": 864, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hashes": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "63548c7fa9ba7a78", + "hash_cont_tokens": "a331dedc2aa01b3e" + }, + "truncated": 0, + "non_truncated": 204, + "padded": 816, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hashes": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "83c5da18bfa50812", + "hash_cont_tokens": "d0fbe030b8c8c2bf" + }, + "truncated": 0, + "non_truncated": 237, + "padded": 948, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_aging|5": { + "hashes": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "bebbd11f22006685", + "hash_cont_tokens": "1dd29c3755494850" + }, + "truncated": 0, + "non_truncated": 223, + "padded": 892, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_sexuality|5": { + "hashes": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "7b85ee9b8ee54f4f", + "hash_cont_tokens": "c85573f663c10691" + }, + "truncated": 0, + "non_truncated": 131, + "padded": 524, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-international_law|5": { + "hashes": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "7bfc55ab7065943e", + "hash_cont_tokens": "d263804ba918154f" + }, + "truncated": 0, + "non_truncated": 121, + "padded": 484, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-jurisprudence|5": { + "hashes": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "69573f1675e053c6", + "hash_cont_tokens": "581986691a84ece8" + }, + "truncated": 0, + "non_truncated": 108, + "padded": 432, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hashes": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "552324ef20094bdc", + "hash_cont_tokens": "55a858b28bbda458" + }, + "truncated": 0, + "non_truncated": 163, + "padded": 652, + "non_padded": 0, + "effective_few_shots": 5.0, + 
"num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-machine_learning|5": { + "hashes": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "96449357a7318905", + "hash_cont_tokens": "e99d3d3efd4ac7a3" + }, + "truncated": 0, + "non_truncated": 112, + "padded": 448, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-management|5": { + "hashes": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "3b849249168e3b88", + "hash_cont_tokens": "13d9dc56bca34726" + }, + "truncated": 0, + "non_truncated": 103, + "padded": 412, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-marketing|5": { + "hashes": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "af0e186f2756b70d", + "hash_cont_tokens": "2700ea26933916a2" + }, + "truncated": 0, + "non_truncated": 234, + "padded": 936, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-medical_genetics|5": { + "hashes": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "9f6a6de16509b6d9", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-miscellaneous|5": { + "hashes": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "9194406d589f7c10", + "hash_cont_tokens": "7bf4341c79587250" + }, + "truncated": 0, + "non_truncated": 783, + "padded": 3132, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_disputes|5": { + "hashes": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "769486efc74d9f8e", + "hash_cont_tokens": "38a48e9de6976f00" + }, + "truncated": 0, + "non_truncated": 346, + "padded": 1384, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hashes": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "a90fd4dd90959dad", + "hash_cont_tokens": "761c4dc187689d89" + }, + "truncated": 0, + "non_truncated": 895, + "padded": 3580, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-nutrition|5": { + "hashes": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "1a3b843e66efd29b", + "hash_cont_tokens": "65005bd7d6f6012a" + }, + "truncated": 0, + "non_truncated": 306, + "padded": 1224, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-philosophy|5": { + "hashes": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "09820001a3d00013", + "hash_cont_tokens": "0b47934fb6314dec" + }, + "truncated": 0, + "non_truncated": 311, + "padded": 1244, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-prehistory|5": { + "hashes": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + 
"hash_input_tokens": "7c4ec364ce2768c7", + "hash_cont_tokens": "3f20acd855ee0a29" + }, + "truncated": 0, + "non_truncated": 324, + "padded": 1296, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_accounting|5": { + "hashes": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "ced0534574d0ae3f", + "hash_cont_tokens": "8f122ba881355d4b" + }, + "truncated": 0, + "non_truncated": 282, + "padded": 1128, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_law|5": { + "hashes": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "bcbdbbde22ec73e3", + "hash_cont_tokens": "90d5df417c4d3fd3" + }, + "truncated": 0, + "non_truncated": 1534, + "padded": 6136, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_medicine|5": { + "hashes": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "c54d753563114d45", + "hash_cont_tokens": "4a2d2988884f7f70" + }, + "truncated": 0, + "non_truncated": 272, + "padded": 1088, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_psychology|5": { + "hashes": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "b75dc55c0e32fa52", + "hash_cont_tokens": "e0a952cb8a9c81de" + }, + "truncated": 0, + "non_truncated": 612, + "padded": 2448, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-public_relations|5": { + "hashes": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "5ccdc8ec8db99622", + "hash_cont_tokens": "1fa77a8dff3922b8" + }, + "truncated": 0, + "non_truncated": 110, + "padded": 440, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-security_studies|5": { + "hashes": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "ca8497342e5b1d57", + "hash_cont_tokens": "81fc9cb3cbdd52db" + }, + "truncated": 0, + "non_truncated": 245, + "padded": 980, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-sociology|5": { + "hashes": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "069c76424fbd3dab", + "hash_cont_tokens": "2a0493252ed2cf43" + }, + "truncated": 0, + "non_truncated": 201, + "padded": 804, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hashes": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "a7e393a626169576", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-virology|5": { + "hashes": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "bf99dc973e3a650d", + "hash_cont_tokens": "5ab892d003b00c98" + }, + "truncated": 0, + "non_truncated": 166, + "padded": 664, + 
"non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-world_religions|5": { + "hashes": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "1761cfaf21797065", + "hash_cont_tokens": "15a5e5dbdfbb8568" + }, + "truncated": 0, + "non_truncated": 171, + "padded": 684, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|truthfulqa:mc|0": { + "hashes": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "298b43914bbdf4ca", + "hash_cont_tokens": "5a8d4bb398b1c3c0" + }, + "truncated": 0, + "non_truncated": 817, + "padded": 9996, + "non_padded": 0, + "effective_few_shots": 0.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "31aa3477d959f771", + "hash_cont_tokens": "618558fb93c0f288" + }, + "truncated": 0, + "non_truncated": 1267, + "padded": 2534, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "6af0ae8cfe684f50", + "hash_cont_tokens": "c908a2de20b67525" + }, + "truncated": 0, + "non_truncated": 1319, + "padded": 0, + "non_padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "3b7fa57a057f9415", + "hash_full_prompts": "63615fc50fc9417c", + "hash_input_tokens": "9c04e828ae29cacc", + "hash_cont_tokens": "58ac4e2f22cebc57" + }, + "truncated": 0, + "non_truncated": 28659, + "padded": 113460, + "non_padded": 1412, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/mncai/yi-34B-v2/results_2023-12-10T05-59-23.635398.json b/eval-results/mncai/yi-34B-v2/results_2023-12-10T05-59-23.635398.json new file mode 100644 index 0000000000000000000000000000000000000000..ee7601d2b06b9a98b1e3dcdbd8c6ddb6b986293d --- /dev/null +++ b/eval-results/mncai/yi-34B-v2/results_2023-12-10T05-59-23.635398.json @@ -0,0 +1,1409 @@ +{ + "config_general": { + "lighteval_sha": "0e4607eff593f6f842aeaa0e5fa6760f58b9d1e9", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "", + "start_time": 584807.142281546, + "end_time": 642834.599588723, + "total_evaluation_time_secondes": "58027.457307177014", + "model_name": "mncai/yi-34B-v2", + "model_sha": "bf7696c10077e73d06752c564ea35cc7e5e336ca", + "model_dtype": "torch.float16", + "model_size": "64.29 GB" + }, + "results": { + "harness|arc:challenge|25": { + "acc": 0.6373720136518771, + "acc_stderr": 0.014049106564955007, + "acc_norm": 0.6612627986348123, + "acc_norm_stderr": 0.013830568927974332 + }, + "harness|hellaswag|10": { + "acc": 0.6523600876319459, + "acc_stderr": 0.004752476997887817, + "acc_norm": 0.8500298745269866, + "acc_norm_stderr": 0.0035631244274585126 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.35, + "acc_stderr": 0.047937248544110196, + "acc_norm": 0.35, + "acc_norm_stderr": 0.047937248544110196 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.6962962962962963, + "acc_stderr": 0.03972552884785137, + "acc_norm": 0.6962962962962963, + "acc_norm_stderr": 0.03972552884785137 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 
0.881578947368421, + "acc_stderr": 0.026293995855474935, + "acc_norm": 0.881578947368421, + "acc_norm_stderr": 0.026293995855474935 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.79, + "acc_stderr": 0.04093601807403326, + "acc_norm": 0.79, + "acc_norm_stderr": 0.04093601807403326 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.8150943396226416, + "acc_stderr": 0.02389335183446432, + "acc_norm": 0.8150943396226416, + "acc_norm_stderr": 0.02389335183446432 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.9097222222222222, + "acc_stderr": 0.023964965777906935, + "acc_norm": 0.9097222222222222, + "acc_norm_stderr": 0.023964965777906935 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.51, + "acc_stderr": 0.05024183937956912, + "acc_norm": 0.51, + "acc_norm_stderr": 0.05024183937956912 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.59, + "acc_stderr": 0.04943110704237102, + "acc_norm": 0.59, + "acc_norm_stderr": 0.04943110704237102 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.4, + "acc_stderr": 0.049236596391733084, + "acc_norm": 0.4, + "acc_norm_stderr": 0.049236596391733084 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.7225433526011561, + "acc_stderr": 0.03414014007044036, + "acc_norm": 0.7225433526011561, + "acc_norm_stderr": 0.03414014007044036 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.5294117647058824, + "acc_stderr": 0.049665709039785295, + "acc_norm": 0.5294117647058824, + "acc_norm_stderr": 0.049665709039785295 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.83, + "acc_stderr": 0.03775251680686371, + "acc_norm": 0.83, + "acc_norm_stderr": 0.03775251680686371 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.774468085106383, + "acc_stderr": 0.027321078417387533, + "acc_norm": 0.774468085106383, + "acc_norm_stderr": 0.027321078417387533 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.5614035087719298, + "acc_stderr": 0.04668000738510455, + "acc_norm": 0.5614035087719298, + "acc_norm_stderr": 0.04668000738510455 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.7241379310344828, + "acc_stderr": 0.03724563619774631, + "acc_norm": 0.7241379310344828, + "acc_norm_stderr": 0.03724563619774631 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.6878306878306878, + "acc_stderr": 0.023865206836972592, + "acc_norm": 0.6878306878306878, + "acc_norm_stderr": 0.023865206836972592 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.5476190476190477, + "acc_stderr": 0.044518079590553275, + "acc_norm": 0.5476190476190477, + "acc_norm_stderr": 0.044518079590553275 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.58, + "acc_stderr": 0.049604496374885836, + "acc_norm": 0.58, + "acc_norm_stderr": 0.049604496374885836 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.9032258064516129, + "acc_stderr": 0.016818943416345197, + "acc_norm": 0.9032258064516129, + "acc_norm_stderr": 0.016818943416345197 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.6748768472906403, + "acc_stderr": 0.032957975663112704, + "acc_norm": 0.6748768472906403, + "acc_norm_stderr": 0.032957975663112704 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.78, + "acc_stderr": 0.04163331998932261, + "acc_norm": 0.78, + "acc_norm_stderr": 0.04163331998932261 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 
0.8424242424242424, + "acc_stderr": 0.028450388805284343, + "acc_norm": 0.8424242424242424, + "acc_norm_stderr": 0.028450388805284343 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.9292929292929293, + "acc_stderr": 0.01826310542019949, + "acc_norm": 0.9292929292929293, + "acc_norm_stderr": 0.01826310542019949 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.9740932642487047, + "acc_stderr": 0.01146452335695318, + "acc_norm": 0.9740932642487047, + "acc_norm_stderr": 0.01146452335695318 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.8282051282051283, + "acc_stderr": 0.01912490360342356, + "acc_norm": 0.8282051282051283, + "acc_norm_stderr": 0.01912490360342356 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.3962962962962963, + "acc_stderr": 0.029822619458534, + "acc_norm": 0.3962962962962963, + "acc_norm_stderr": 0.029822619458534 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.8529411764705882, + "acc_stderr": 0.023005459446673964, + "acc_norm": 0.8529411764705882, + "acc_norm_stderr": 0.023005459446673964 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.4900662251655629, + "acc_stderr": 0.04081677107248436, + "acc_norm": 0.4900662251655629, + "acc_norm_stderr": 0.04081677107248436 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.926605504587156, + "acc_stderr": 0.011180976446357573, + "acc_norm": 0.926605504587156, + "acc_norm_stderr": 0.011180976446357573 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.6666666666666666, + "acc_stderr": 0.03214952147802749, + "acc_norm": 0.6666666666666666, + "acc_norm_stderr": 0.03214952147802749 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.9117647058823529, + "acc_stderr": 0.01990739979131695, + "acc_norm": 0.9117647058823529, + "acc_norm_stderr": 0.01990739979131695 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.9113924050632911, + "acc_stderr": 0.018498315206865384, + "acc_norm": 0.9113924050632911, + "acc_norm_stderr": 0.018498315206865384 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.8161434977578476, + "acc_stderr": 0.025998379092356517, + "acc_norm": 0.8161434977578476, + "acc_norm_stderr": 0.025998379092356517 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.8702290076335878, + "acc_stderr": 0.029473649496907065, + "acc_norm": 0.8702290076335878, + "acc_norm_stderr": 0.029473649496907065 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.9008264462809917, + "acc_stderr": 0.02728524631275896, + "acc_norm": 0.9008264462809917, + "acc_norm_stderr": 0.02728524631275896 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.8796296296296297, + "acc_stderr": 0.03145703854306251, + "acc_norm": 0.8796296296296297, + "acc_norm_stderr": 0.03145703854306251 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.852760736196319, + "acc_stderr": 0.027839915278339653, + "acc_norm": 0.852760736196319, + "acc_norm_stderr": 0.027839915278339653 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.5714285714285714, + "acc_stderr": 0.04697113923010213, + "acc_norm": 0.5714285714285714, + "acc_norm_stderr": 0.04697113923010213 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.8446601941747572, + "acc_stderr": 0.03586594738573974, + "acc_norm": 0.8446601941747572, + "acc_norm_stderr": 0.03586594738573974 + }, + "harness|hendrycksTest-marketing|5": { + 
"acc": 0.9358974358974359, + "acc_stderr": 0.016046261631673137, + "acc_norm": 0.9358974358974359, + "acc_norm_stderr": 0.016046261631673137 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.82, + "acc_stderr": 0.03861229196653694, + "acc_norm": 0.82, + "acc_norm_stderr": 0.03861229196653694 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.9016602809706258, + "acc_stderr": 0.010648356301876346, + "acc_norm": 0.9016602809706258, + "acc_norm_stderr": 0.010648356301876346 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.8063583815028902, + "acc_stderr": 0.021274230317515547, + "acc_norm": 0.8063583815028902, + "acc_norm_stderr": 0.021274230317515547 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.7150837988826816, + "acc_stderr": 0.015096222302469792, + "acc_norm": 0.7150837988826816, + "acc_norm_stderr": 0.015096222302469792 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.8366013071895425, + "acc_stderr": 0.021170623011213512, + "acc_norm": 0.8366013071895425, + "acc_norm_stderr": 0.021170623011213512 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.8135048231511254, + "acc_stderr": 0.022122439772480768, + "acc_norm": 0.8135048231511254, + "acc_norm_stderr": 0.022122439772480768 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.8611111111111112, + "acc_stderr": 0.019242526226544543, + "acc_norm": 0.8611111111111112, + "acc_norm_stderr": 0.019242526226544543 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.624113475177305, + "acc_stderr": 0.028893955412115875, + "acc_norm": 0.624113475177305, + "acc_norm_stderr": 0.028893955412115875 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.5984354628422425, + "acc_stderr": 0.01252031512014712, + "acc_norm": 0.5984354628422425, + "acc_norm_stderr": 0.01252031512014712 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.8382352941176471, + "acc_stderr": 0.02236867256288675, + "acc_norm": 0.8382352941176471, + "acc_norm_stderr": 0.02236867256288675 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.8169934640522876, + "acc_stderr": 0.015643069911273344, + "acc_norm": 0.8169934640522876, + "acc_norm_stderr": 0.015643069911273344 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.7090909090909091, + "acc_stderr": 0.04350271442923243, + "acc_norm": 0.7090909090909091, + "acc_norm_stderr": 0.04350271442923243 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.8489795918367347, + "acc_stderr": 0.022923004094736833, + "acc_norm": 0.8489795918367347, + "acc_norm_stderr": 0.022923004094736833 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.8855721393034826, + "acc_stderr": 0.022509345325101706, + "acc_norm": 0.8855721393034826, + "acc_norm_stderr": 0.022509345325101706 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.92, + "acc_stderr": 0.0272659924344291, + "acc_norm": 0.92, + "acc_norm_stderr": 0.0272659924344291 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.5903614457831325, + "acc_stderr": 0.038284011150790206, + "acc_norm": 0.5903614457831325, + "acc_norm_stderr": 0.038284011150790206 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.8713450292397661, + "acc_stderr": 0.025679342723276908, + "acc_norm": 0.8713450292397661, + "acc_norm_stderr": 0.025679342723276908 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.4173806609547124, + "mc1_stderr": 0.017262891063272175, + "mc2": 0.5733928094646895, + "mc2_stderr": 0.01509801265375318 + }, 
+ "harness|winogrande|5": { + "acc": 0.8366219415943172, + "acc_stderr": 0.010390695970273759 + }, + "harness|gsm8k|5": { + "acc": 0.6497346474601972, + "acc_stderr": 0.013140409455571286 + }, + "all": { + "acc": 0.7523453787674309, + "acc_stderr": 0.02848483810892476, + "acc_norm": 0.756411391315877, + "acc_norm_stderr": 0.029027731000189076, + "mc1": 0.4173806609547124, + "mc1_stderr": 0.017262891063272175, + "mc2": 0.5733928094646895, + "mc2_stderr": 0.01509801265375318 + } + }, + "versions": { + "all": 0, + "harness|arc:challenge|25": 0, + "harness|gsm8k|5": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "harness|winogrande|5": 0 + }, + "config_tasks": { + 
"harness|arc:challenge": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + 
"harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|arc:challenge|25": { + "hashes": { + "hash_examples": "17b0cae357c0259e", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "d03b65d70f5d2bd0", + "hash_cont_tokens": "e23c779c4c2dd1ec" + }, + "truncated": 0, + "non_truncated": 1172, + "padded": 4687, + "non_padded": 0, + "effective_few_shots": 25.0, + "num_truncated_few_shots": 0 + }, + "harness|hellaswag|10": { + "hashes": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "a4173b2c24cc5b73", + "hash_cont_tokens": "55da5ba61989a8fe" + }, + "truncated": 0, + "non_truncated": 10042, + "padded": 40095, + "non_padded": 73, + "effective_few_shots": 10.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hashes": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "77e04f2550caf863", + "hash_cont_tokens": "bcc22fd85dcc85e9" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-anatomy|5": { + "hashes": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "98a31a182db1b80d", + "hash_cont_tokens": "5cc800feae9fa1ad" + }, + "truncated": 0, + "non_truncated": 135, + "padded": 540, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-astronomy|5": { + "hashes": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "928088c2614ed583", + "hash_cont_tokens": "655dbb90034f484a" + }, + "truncated": 0, + "non_truncated": 152, + "padded": 608, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-business_ethics|5": { + "hashes": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "15416f2429dec97b", + "hash_cont_tokens": "bcc22fd85dcc85e9" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hashes": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "64b589888e3e7bc3", + "hash_cont_tokens": "f77b74d946d7fc02" + }, + "truncated": 0, + "non_truncated": 265, + "padded": 1060, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_biology|5": { + "hashes": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "51e51ff266545be3", + "hash_cont_tokens": "1ba4b1a158d8bf3f" + }, + "truncated": 0, + "non_truncated": 144, + "padded": 576, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_chemistry|5": { + "hashes": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "001fbe642356be74", + "hash_cont_tokens": "bcc22fd85dcc85e9" + 
}, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_computer_science|5": { + "hashes": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "dc5ce83315ad2bc2", + "hash_cont_tokens": "bcc22fd85dcc85e9" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_mathematics|5": { + "hashes": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "24306d0e032064a5", + "hash_cont_tokens": "bcc22fd85dcc85e9" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_medicine|5": { + "hashes": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "a9ef531ac38bad0b", + "hash_cont_tokens": "78a0ebf66d91c5cf" + }, + "truncated": 0, + "non_truncated": 173, + "padded": 692, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_physics|5": { + "hashes": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "77c9870dda6690f7", + "hash_cont_tokens": "5a030c95824fdbe5" + }, + "truncated": 0, + "non_truncated": 102, + "padded": 408, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-computer_security|5": { + "hashes": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "bd0c759308eb4459", + "hash_cont_tokens": "bcc22fd85dcc85e9" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hashes": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "a230a5a3bf23a34a", + "hash_cont_tokens": "2326dc60d0bc41b6" + }, + "truncated": 0, + "non_truncated": 235, + "padded": 940, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-econometrics|5": { + "hashes": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "8d999d8be34349fa", + "hash_cont_tokens": "be908364b6f14dd6" + }, + "truncated": 0, + "non_truncated": 114, + "padded": 456, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hashes": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "ee2a4e54a1b0e0dc", + "hash_cont_tokens": "179280ef597fe1bf" + }, + "truncated": 0, + "non_truncated": 145, + "padded": 564, + "non_padded": 16, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hashes": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "b594b9d44887498b", + "hash_cont_tokens": "95cdcdaf1abd0bd2" + }, + "truncated": 0, + "non_truncated": 378, + "padded": 1512, + "non_padded": 0, + "effective_few_shots": 5.0, + 
"num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-formal_logic|5": { + "hashes": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "dc3a958f698477dc", + "hash_cont_tokens": "6a4818f3c307c346" + }, + "truncated": 0, + "non_truncated": 126, + "padded": 504, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-global_facts|5": { + "hashes": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "69b04d2f179417b0", + "hash_cont_tokens": "bcc22fd85dcc85e9" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_biology|5": { + "hashes": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "89080ed6730d8e91", + "hash_cont_tokens": "36d0d84455f0bdba" + }, + "truncated": 0, + "non_truncated": 310, + "padded": 1240, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hashes": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "b193dd7663685d41", + "hash_cont_tokens": "c678f794a9b8ee74" + }, + "truncated": 0, + "non_truncated": 203, + "padded": 812, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hashes": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "8dc9c89e66558e0b", + "hash_cont_tokens": "bcc22fd85dcc85e9" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hashes": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "5812582ffb72f01a", + "hash_cont_tokens": "e9c94304326d875c" + }, + "truncated": 0, + "non_truncated": 165, + "padded": 656, + "non_padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_geography|5": { + "hashes": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "6870165439f8fb10", + "hash_cont_tokens": "f937a1349eb483eb" + }, + "truncated": 0, + "non_truncated": 198, + "padded": 792, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hashes": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "7420c2ddd5589762", + "hash_cont_tokens": "8b27dd3907d25b4e" + }, + "truncated": 0, + "non_truncated": 193, + "padded": 772, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hashes": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "c9f7fbfcc3530334", + "hash_cont_tokens": "3763cae29e2f938c" + }, + "truncated": 0, + "non_truncated": 390, + "padded": 1560, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hashes": 
{ + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "9b039211bb6ad435", + "hash_cont_tokens": "fd7b555352d765a4" + }, + "truncated": 0, + "non_truncated": 270, + "padded": 1080, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hashes": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "875233af78045033", + "hash_cont_tokens": "61f46d4a209b9aa2" + }, + "truncated": 0, + "non_truncated": 238, + "padded": 952, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_physics|5": { + "hashes": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "60fe531d1cd92bde", + "hash_cont_tokens": "4e7053e7c19d680d" + }, + "truncated": 0, + "non_truncated": 151, + "padded": 604, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hashes": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "6f8d63c226bb9e2e", + "hash_cont_tokens": "84d19ae8790476bb" + }, + "truncated": 0, + "non_truncated": 545, + "padded": 2180, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hashes": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "8cd4c1fa5fb87625", + "hash_cont_tokens": "b119c7b668213a4e" + }, + "truncated": 0, + "non_truncated": 216, + "padded": 864, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hashes": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "0ced9949b12a96dd", + "hash_cont_tokens": "a3b126bc622d571f" + }, + "truncated": 0, + "non_truncated": 204, + "padded": 816, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hashes": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "0e7db386f385e8f6", + "hash_cont_tokens": "9abf19ceb76331ff" + }, + "truncated": 0, + "non_truncated": 237, + "padded": 948, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_aging|5": { + "hashes": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "21f96cd4a7888e96", + "hash_cont_tokens": "0e2e725ae9a898da" + }, + "truncated": 0, + "non_truncated": 223, + "padded": 892, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_sexuality|5": { + "hashes": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "05609fcbade219c3", + "hash_cont_tokens": "a94c1dea6d775249" + }, + "truncated": 0, + "non_truncated": 131, + "padded": 524, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-international_law|5": { + "hashes": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": 
"5b28e2ea8de66c4b", + "hash_cont_tokens": "3832f860859bb86b" + }, + "truncated": 0, + "non_truncated": 121, + "padded": 484, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-jurisprudence|5": { + "hashes": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "f4053a5b1dec4a26", + "hash_cont_tokens": "9fac5a0c364fca8a" + }, + "truncated": 0, + "non_truncated": 108, + "padded": 432, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hashes": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "06739234c63944fd", + "hash_cont_tokens": "dc53ed31134ddf3a" + }, + "truncated": 0, + "non_truncated": 163, + "padded": 644, + "non_padded": 8, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-machine_learning|5": { + "hashes": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "b9c38c5f5c902879", + "hash_cont_tokens": "e272b5456d5552d6" + }, + "truncated": 0, + "non_truncated": 112, + "padded": 448, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-management|5": { + "hashes": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "76ad9bfb0a5eba90", + "hash_cont_tokens": "7119d4642957b1f0" + }, + "truncated": 0, + "non_truncated": 103, + "padded": 412, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-marketing|5": { + "hashes": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "b37c52ae0d5caf59", + "hash_cont_tokens": "099d58c66ece3f11" + }, + "truncated": 0, + "non_truncated": 234, + "padded": 936, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-medical_genetics|5": { + "hashes": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "00579ebece916954", + "hash_cont_tokens": "bcc22fd85dcc85e9" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-miscellaneous|5": { + "hashes": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "6a53996f93afda8b", + "hash_cont_tokens": "bae342d4e82ba8f7" + }, + "truncated": 0, + "non_truncated": 783, + "padded": 3132, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_disputes|5": { + "hashes": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "f26c133ee5ce21ff", + "hash_cont_tokens": "578c64cbdbb1e0d4" + }, + "truncated": 0, + "non_truncated": 346, + "padded": 1384, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hashes": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "82424ecb6f145bf5", + "hash_cont_tokens": "79b25f42b3fce0f9" + }, + "truncated": 0, + "non_truncated": 895, + "padded": 3580, + "non_padded": 0, + "effective_few_shots": 5.0, + 
"num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-nutrition|5": { + "hashes": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "301695fe42f541f0", + "hash_cont_tokens": "9d1f3b976417156c" + }, + "truncated": 0, + "non_truncated": 306, + "padded": 1224, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-philosophy|5": { + "hashes": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "9613317698fffbd3", + "hash_cont_tokens": "88dab560e1e06d97" + }, + "truncated": 0, + "non_truncated": 311, + "padded": 1244, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-prehistory|5": { + "hashes": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "a77c81e5f2ecee7d", + "hash_cont_tokens": "04ea847139fe9393" + }, + "truncated": 0, + "non_truncated": 324, + "padded": 1296, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_accounting|5": { + "hashes": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "ede14e81d9fbeda3", + "hash_cont_tokens": "0435ff692ad17e68" + }, + "truncated": 0, + "non_truncated": 282, + "padded": 1128, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_law|5": { + "hashes": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "559b4b12bc5ac37c", + "hash_cont_tokens": "b852c74e9f8801bd" + }, + "truncated": 0, + "non_truncated": 1534, + "padded": 6136, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_medicine|5": { + "hashes": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "766f55053f2bdf72", + "hash_cont_tokens": "5db0f6460652d063" + }, + "truncated": 0, + "non_truncated": 272, + "padded": 1088, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_psychology|5": { + "hashes": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "f8a2453499f47d41", + "hash_cont_tokens": "c960676ef7f3dbe5" + }, + "truncated": 0, + "non_truncated": 612, + "padded": 2448, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-public_relations|5": { + "hashes": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "c2a1a681983c46f4", + "hash_cont_tokens": "3320565f412c4b01" + }, + "truncated": 0, + "non_truncated": 110, + "padded": 440, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-security_studies|5": { + "hashes": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "f356c34720b56b6e", + "hash_cont_tokens": "218ed775ef60aab9" + }, + "truncated": 0, + "non_truncated": 245, + "padded": 980, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-sociology|5": { + "hashes": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": 
"1a1fc00e17b3a52a", + "hash_input_tokens": "6c20ab03b57ea617", + "hash_cont_tokens": "20babf5cc4cc7f3d" + }, + "truncated": 0, + "non_truncated": 201, + "padded": 804, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hashes": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "f66bfb80b77246bf", + "hash_cont_tokens": "bcc22fd85dcc85e9" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-virology|5": { + "hashes": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "45d4c067b9f9c37a", + "hash_cont_tokens": "dc6d57296bea0882" + }, + "truncated": 0, + "non_truncated": 166, + "padded": 664, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-world_religions|5": { + "hashes": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "6b2f9f955e197814", + "hash_cont_tokens": "37f53444db289ed3" + }, + "truncated": 0, + "non_truncated": 171, + "padded": 684, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|truthfulqa:mc|0": { + "hashes": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "1c491ea3183384db", + "hash_cont_tokens": "71a67034827cd30e" + }, + "truncated": 0, + "non_truncated": 817, + "padded": 9996, + "non_padded": 0, + "effective_few_shots": 0.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "0a16c9aa8804bcf9", + "hash_cont_tokens": "c93e9c22fa3077a0" + }, + "truncated": 0, + "non_truncated": 1267, + "padded": 2534, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "a0417d00e6822d02", + "hash_cont_tokens": "fade0cef142ca1b2" + }, + "truncated": 0, + "non_truncated": 1319, + "padded": 0, + "non_padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "3b7fa57a057f9415", + "hash_full_prompts": "63615fc50fc9417c", + "hash_input_tokens": "c9e1d352754e9d40", + "hash_cont_tokens": "1ca02f2da8712e7f" + }, + "truncated": 0, + "non_truncated": 28659, + "padded": 113452, + "non_padded": 1420, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/mncai/yi-34B-v3/results_2023-12-11T01-51-08.694143.json b/eval-results/mncai/yi-34B-v3/results_2023-12-11T01-51-08.694143.json new file mode 100644 index 0000000000000000000000000000000000000000..bb9f00ec92e3ae87cb4658a3260ca4ff64de41d7 --- /dev/null +++ b/eval-results/mncai/yi-34B-v3/results_2023-12-11T01-51-08.694143.json @@ -0,0 +1,1409 @@ +{ + "config_general": { + "lighteval_sha": "0e4607eff593f6f842aeaa0e5fa6760f58b9d1e9", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "", + "start_time": 656698.548686411, + "end_time": 714345.644573602, + "total_evaluation_time_secondes": "57647.09588719101", + "model_name": "mncai/yi-34B-v3", + 
"model_sha": "f7605af56f29b42e72f9c2cbbd4ad8e443a8dae0", + "model_dtype": "torch.float16", + "model_size": "64.29 GB" + }, + "results": { + "harness|arc:challenge|25": { + "acc": 0.6390784982935154, + "acc_stderr": 0.014034761386175452, + "acc_norm": 0.6706484641638225, + "acc_norm_stderr": 0.013734057652635476 + }, + "harness|hellaswag|10": { + "acc": 0.6487751443935471, + "acc_stderr": 0.004763774981834676, + "acc_norm": 0.8511252738498307, + "acc_norm_stderr": 0.0035523745313052004 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.37, + "acc_stderr": 0.04852365870939099, + "acc_norm": 0.37, + "acc_norm_stderr": 0.04852365870939099 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.6962962962962963, + "acc_stderr": 0.03972552884785137, + "acc_norm": 0.6962962962962963, + "acc_norm_stderr": 0.03972552884785137 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.881578947368421, + "acc_stderr": 0.026293995855474935, + "acc_norm": 0.881578947368421, + "acc_norm_stderr": 0.026293995855474935 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.79, + "acc_stderr": 0.04093601807403326, + "acc_norm": 0.79, + "acc_norm_stderr": 0.04093601807403326 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.8188679245283019, + "acc_stderr": 0.023702963526757798, + "acc_norm": 0.8188679245283019, + "acc_norm_stderr": 0.023702963526757798 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.9097222222222222, + "acc_stderr": 0.023964965777906935, + "acc_norm": 0.9097222222222222, + "acc_norm_stderr": 0.023964965777906935 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.5, + "acc_stderr": 0.050251890762960605, + "acc_norm": 0.5, + "acc_norm_stderr": 0.050251890762960605 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.58, + "acc_stderr": 0.049604496374885836, + "acc_norm": 0.58, + "acc_norm_stderr": 0.049604496374885836 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.39, + "acc_stderr": 0.04902071300001974, + "acc_norm": 0.39, + "acc_norm_stderr": 0.04902071300001974 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.7167630057803468, + "acc_stderr": 0.034355680560478746, + "acc_norm": 0.7167630057803468, + "acc_norm_stderr": 0.034355680560478746 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.5392156862745098, + "acc_stderr": 0.04959859966384181, + "acc_norm": 0.5392156862745098, + "acc_norm_stderr": 0.04959859966384181 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.83, + "acc_stderr": 0.03775251680686371, + "acc_norm": 0.83, + "acc_norm_stderr": 0.03775251680686371 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.7787234042553192, + "acc_stderr": 0.027136349602424056, + "acc_norm": 0.7787234042553192, + "acc_norm_stderr": 0.027136349602424056 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.5701754385964912, + "acc_stderr": 0.04657047260594963, + "acc_norm": 0.5701754385964912, + "acc_norm_stderr": 0.04657047260594963 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.7379310344827587, + "acc_stderr": 0.036646663372252565, + "acc_norm": 0.7379310344827587, + "acc_norm_stderr": 0.036646663372252565 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.6957671957671958, + "acc_stderr": 0.02369541500946309, + "acc_norm": 0.6957671957671958, + "acc_norm_stderr": 0.02369541500946309 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.5396825396825397, + "acc_stderr": 
0.04458029125470973, + "acc_norm": 0.5396825396825397, + "acc_norm_stderr": 0.04458029125470973 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.57, + "acc_stderr": 0.04975698519562427, + "acc_norm": 0.57, + "acc_norm_stderr": 0.04975698519562427 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.9, + "acc_stderr": 0.017066403719657255, + "acc_norm": 0.9, + "acc_norm_stderr": 0.017066403719657255 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.6699507389162561, + "acc_stderr": 0.033085304262282574, + "acc_norm": 0.6699507389162561, + "acc_norm_stderr": 0.033085304262282574 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.79, + "acc_stderr": 0.040936018074033256, + "acc_norm": 0.79, + "acc_norm_stderr": 0.040936018074033256 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.8545454545454545, + "acc_stderr": 0.027530196355066584, + "acc_norm": 0.8545454545454545, + "acc_norm_stderr": 0.027530196355066584 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.9343434343434344, + "acc_stderr": 0.01764652667723333, + "acc_norm": 0.9343434343434344, + "acc_norm_stderr": 0.01764652667723333 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.9740932642487047, + "acc_stderr": 0.01146452335695318, + "acc_norm": 0.9740932642487047, + "acc_norm_stderr": 0.01146452335695318 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.8205128205128205, + "acc_stderr": 0.019457390787681803, + "acc_norm": 0.8205128205128205, + "acc_norm_stderr": 0.019457390787681803 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.4111111111111111, + "acc_stderr": 0.02999992350870669, + "acc_norm": 0.4111111111111111, + "acc_norm_stderr": 0.02999992350870669 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.8529411764705882, + "acc_stderr": 0.023005459446673964, + "acc_norm": 0.8529411764705882, + "acc_norm_stderr": 0.023005459446673964 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.4966887417218543, + "acc_stderr": 0.04082393379449654, + "acc_norm": 0.4966887417218543, + "acc_norm_stderr": 0.04082393379449654 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.9284403669724771, + "acc_stderr": 0.01105125524781546, + "acc_norm": 0.9284403669724771, + "acc_norm_stderr": 0.01105125524781546 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.6620370370370371, + "acc_stderr": 0.03225941352631295, + "acc_norm": 0.6620370370370371, + "acc_norm_stderr": 0.03225941352631295 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.9068627450980392, + "acc_stderr": 0.020397853969426994, + "acc_norm": 0.9068627450980392, + "acc_norm_stderr": 0.020397853969426994 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.9156118143459916, + "acc_stderr": 0.01809424711647332, + "acc_norm": 0.9156118143459916, + "acc_norm_stderr": 0.01809424711647332 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.8026905829596412, + "acc_stderr": 0.02670985334496796, + "acc_norm": 0.8026905829596412, + "acc_norm_stderr": 0.02670985334496796 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.8702290076335878, + "acc_stderr": 0.029473649496907065, + "acc_norm": 0.8702290076335878, + "acc_norm_stderr": 0.029473649496907065 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.9008264462809917, + "acc_stderr": 0.02728524631275896, + 
"acc_norm": 0.9008264462809917, + "acc_norm_stderr": 0.02728524631275896 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.8981481481481481, + "acc_stderr": 0.029239272675632748, + "acc_norm": 0.8981481481481481, + "acc_norm_stderr": 0.029239272675632748 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.8650306748466258, + "acc_stderr": 0.026845765054553855, + "acc_norm": 0.8650306748466258, + "acc_norm_stderr": 0.026845765054553855 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.5714285714285714, + "acc_stderr": 0.04697113923010213, + "acc_norm": 0.5714285714285714, + "acc_norm_stderr": 0.04697113923010213 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.8446601941747572, + "acc_stderr": 0.03586594738573974, + "acc_norm": 0.8446601941747572, + "acc_norm_stderr": 0.03586594738573974 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.9401709401709402, + "acc_stderr": 0.015537514263253862, + "acc_norm": 0.9401709401709402, + "acc_norm_stderr": 0.015537514263253862 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.82, + "acc_stderr": 0.03861229196653694, + "acc_norm": 0.82, + "acc_norm_stderr": 0.03861229196653694 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.9029374201787995, + "acc_stderr": 0.010586474712018283, + "acc_norm": 0.9029374201787995, + "acc_norm_stderr": 0.010586474712018283 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.8121387283236994, + "acc_stderr": 0.02102926975242323, + "acc_norm": 0.8121387283236994, + "acc_norm_stderr": 0.02102926975242323 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.7039106145251397, + "acc_stderr": 0.015268677317602274, + "acc_norm": 0.7039106145251397, + "acc_norm_stderr": 0.015268677317602274 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.8398692810457516, + "acc_stderr": 0.020998740930362303, + "acc_norm": 0.8398692810457516, + "acc_norm_stderr": 0.020998740930362303 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.8167202572347267, + "acc_stderr": 0.021974198848265812, + "acc_norm": 0.8167202572347267, + "acc_norm_stderr": 0.021974198848265812 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.8641975308641975, + "acc_stderr": 0.0190615881815054, + "acc_norm": 0.8641975308641975, + "acc_norm_stderr": 0.0190615881815054 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.6276595744680851, + "acc_stderr": 0.02883892147125145, + "acc_norm": 0.6276595744680851, + "acc_norm_stderr": 0.02883892147125145 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.5971316818774446, + "acc_stderr": 0.01252695557711801, + "acc_norm": 0.5971316818774446, + "acc_norm_stderr": 0.01252695557711801 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.8345588235294118, + "acc_stderr": 0.02257177102549474, + "acc_norm": 0.8345588235294118, + "acc_norm_stderr": 0.02257177102549474 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.8169934640522876, + "acc_stderr": 0.015643069911273344, + "acc_norm": 0.8169934640522876, + "acc_norm_stderr": 0.015643069911273344 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.7181818181818181, + "acc_stderr": 0.04309118709946458, + "acc_norm": 0.7181818181818181, + "acc_norm_stderr": 0.04309118709946458 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.8489795918367347, + "acc_stderr": 0.022923004094736844, + "acc_norm": 0.8489795918367347, + "acc_norm_stderr": 0.022923004094736844 + }, + 
"harness|hendrycksTest-sociology|5": { + "acc": 0.8855721393034826, + "acc_stderr": 0.022509345325101706, + "acc_norm": 0.8855721393034826, + "acc_norm_stderr": 0.022509345325101706 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.92, + "acc_stderr": 0.0272659924344291, + "acc_norm": 0.92, + "acc_norm_stderr": 0.0272659924344291 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.5903614457831325, + "acc_stderr": 0.038284011150790206, + "acc_norm": 0.5903614457831325, + "acc_norm_stderr": 0.038284011150790206 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.8830409356725146, + "acc_stderr": 0.024648068961366152, + "acc_norm": 0.8830409356725146, + "acc_norm_stderr": 0.024648068961366152 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.4186046511627907, + "mc1_stderr": 0.017270015284476855, + "mc2": 0.5753679426280454, + "mc2_stderr": 0.014962842073717312 + }, + "harness|winogrande|5": { + "acc": 0.835043409629045, + "acc_stderr": 0.010430917468237419 + }, + "harness|gsm8k|5": { + "acc": 0.645185746777862, + "acc_stderr": 0.013179083387979214 + }, + "all": { + "acc": 0.7536948044621744, + "acc_stderr": 0.028378789321173548, + "acc_norm": 0.7581198984934292, + "acc_norm_stderr": 0.02891498378900509, + "mc1": 0.4186046511627907, + "mc1_stderr": 0.017270015284476855, + "mc2": 0.5753679426280454, + "mc2_stderr": 0.014962842073717312 + } + }, + "versions": { + "all": 0, + "harness|arc:challenge|25": 0, + "harness|gsm8k|5": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 
1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "harness|winogrande|5": 0 + }, + "config_tasks": { + "harness|arc:challenge": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + 
"harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|arc:challenge|25": { + "hashes": { + "hash_examples": "17b0cae357c0259e", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "d03b65d70f5d2bd0", + "hash_cont_tokens": "e23c779c4c2dd1ec" + }, + "truncated": 0, + "non_truncated": 1172, + "padded": 4687, + "non_padded": 0, + "effective_few_shots": 25.0, + "num_truncated_few_shots": 0 + }, + "harness|hellaswag|10": { + "hashes": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "a4173b2c24cc5b73", + "hash_cont_tokens": "55da5ba61989a8fe" + }, + "truncated": 0, + "non_truncated": 10042, + "padded": 40095, + "non_padded": 73, + "effective_few_shots": 10.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hashes": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "77e04f2550caf863", + "hash_cont_tokens": "bcc22fd85dcc85e9" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-anatomy|5": { + "hashes": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "98a31a182db1b80d", + "hash_cont_tokens": "5cc800feae9fa1ad" + }, + "truncated": 0, + "non_truncated": 135, + "padded": 540, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-astronomy|5": { + "hashes": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "928088c2614ed583", + "hash_cont_tokens": "655dbb90034f484a" + }, + "truncated": 0, + "non_truncated": 152, + "padded": 608, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-business_ethics|5": { + "hashes": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "15416f2429dec97b", + "hash_cont_tokens": "bcc22fd85dcc85e9" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hashes": 
{ + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "64b589888e3e7bc3", + "hash_cont_tokens": "f77b74d946d7fc02" + }, + "truncated": 0, + "non_truncated": 265, + "padded": 1060, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_biology|5": { + "hashes": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "51e51ff266545be3", + "hash_cont_tokens": "1ba4b1a158d8bf3f" + }, + "truncated": 0, + "non_truncated": 144, + "padded": 576, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_chemistry|5": { + "hashes": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "001fbe642356be74", + "hash_cont_tokens": "bcc22fd85dcc85e9" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_computer_science|5": { + "hashes": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "dc5ce83315ad2bc2", + "hash_cont_tokens": "bcc22fd85dcc85e9" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_mathematics|5": { + "hashes": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "24306d0e032064a5", + "hash_cont_tokens": "bcc22fd85dcc85e9" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_medicine|5": { + "hashes": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "a9ef531ac38bad0b", + "hash_cont_tokens": "78a0ebf66d91c5cf" + }, + "truncated": 0, + "non_truncated": 173, + "padded": 692, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_physics|5": { + "hashes": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "77c9870dda6690f7", + "hash_cont_tokens": "5a030c95824fdbe5" + }, + "truncated": 0, + "non_truncated": 102, + "padded": 408, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-computer_security|5": { + "hashes": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "bd0c759308eb4459", + "hash_cont_tokens": "bcc22fd85dcc85e9" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hashes": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "a230a5a3bf23a34a", + "hash_cont_tokens": "2326dc60d0bc41b6" + }, + "truncated": 0, + "non_truncated": 235, + "padded": 940, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-econometrics|5": { + "hashes": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "8d999d8be34349fa", + "hash_cont_tokens": 
"be908364b6f14dd6" + }, + "truncated": 0, + "non_truncated": 114, + "padded": 456, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hashes": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "ee2a4e54a1b0e0dc", + "hash_cont_tokens": "179280ef597fe1bf" + }, + "truncated": 0, + "non_truncated": 145, + "padded": 564, + "non_padded": 16, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hashes": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "b594b9d44887498b", + "hash_cont_tokens": "95cdcdaf1abd0bd2" + }, + "truncated": 0, + "non_truncated": 378, + "padded": 1512, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-formal_logic|5": { + "hashes": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "dc3a958f698477dc", + "hash_cont_tokens": "6a4818f3c307c346" + }, + "truncated": 0, + "non_truncated": 126, + "padded": 504, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-global_facts|5": { + "hashes": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "69b04d2f179417b0", + "hash_cont_tokens": "bcc22fd85dcc85e9" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_biology|5": { + "hashes": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "89080ed6730d8e91", + "hash_cont_tokens": "36d0d84455f0bdba" + }, + "truncated": 0, + "non_truncated": 310, + "padded": 1240, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hashes": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "b193dd7663685d41", + "hash_cont_tokens": "c678f794a9b8ee74" + }, + "truncated": 0, + "non_truncated": 203, + "padded": 812, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hashes": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "8dc9c89e66558e0b", + "hash_cont_tokens": "bcc22fd85dcc85e9" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hashes": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "5812582ffb72f01a", + "hash_cont_tokens": "e9c94304326d875c" + }, + "truncated": 0, + "non_truncated": 165, + "padded": 656, + "non_padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_geography|5": { + "hashes": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "6870165439f8fb10", + "hash_cont_tokens": "f937a1349eb483eb" + }, + "truncated": 0, + "non_truncated": 198, + "padded": 792, + "non_padded": 0, + 
"effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hashes": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "7420c2ddd5589762", + "hash_cont_tokens": "8b27dd3907d25b4e" + }, + "truncated": 0, + "non_truncated": 193, + "padded": 772, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hashes": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "c9f7fbfcc3530334", + "hash_cont_tokens": "3763cae29e2f938c" + }, + "truncated": 0, + "non_truncated": 390, + "padded": 1560, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hashes": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "9b039211bb6ad435", + "hash_cont_tokens": "fd7b555352d765a4" + }, + "truncated": 0, + "non_truncated": 270, + "padded": 1080, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hashes": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "875233af78045033", + "hash_cont_tokens": "61f46d4a209b9aa2" + }, + "truncated": 0, + "non_truncated": 238, + "padded": 952, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_physics|5": { + "hashes": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "60fe531d1cd92bde", + "hash_cont_tokens": "4e7053e7c19d680d" + }, + "truncated": 0, + "non_truncated": 151, + "padded": 604, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hashes": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "6f8d63c226bb9e2e", + "hash_cont_tokens": "84d19ae8790476bb" + }, + "truncated": 0, + "non_truncated": 545, + "padded": 2180, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hashes": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "8cd4c1fa5fb87625", + "hash_cont_tokens": "b119c7b668213a4e" + }, + "truncated": 0, + "non_truncated": 216, + "padded": 864, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hashes": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "0ced9949b12a96dd", + "hash_cont_tokens": "a3b126bc622d571f" + }, + "truncated": 0, + "non_truncated": 204, + "padded": 816, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hashes": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "0e7db386f385e8f6", + "hash_cont_tokens": "9abf19ceb76331ff" + }, + "truncated": 0, + "non_truncated": 237, + "padded": 948, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + 
"harness|hendrycksTest-human_aging|5": { + "hashes": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "21f96cd4a7888e96", + "hash_cont_tokens": "0e2e725ae9a898da" + }, + "truncated": 0, + "non_truncated": 223, + "padded": 892, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_sexuality|5": { + "hashes": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "05609fcbade219c3", + "hash_cont_tokens": "a94c1dea6d775249" + }, + "truncated": 0, + "non_truncated": 131, + "padded": 524, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-international_law|5": { + "hashes": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "5b28e2ea8de66c4b", + "hash_cont_tokens": "3832f860859bb86b" + }, + "truncated": 0, + "non_truncated": 121, + "padded": 484, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-jurisprudence|5": { + "hashes": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "f4053a5b1dec4a26", + "hash_cont_tokens": "9fac5a0c364fca8a" + }, + "truncated": 0, + "non_truncated": 108, + "padded": 432, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hashes": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "06739234c63944fd", + "hash_cont_tokens": "dc53ed31134ddf3a" + }, + "truncated": 0, + "non_truncated": 163, + "padded": 644, + "non_padded": 8, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-machine_learning|5": { + "hashes": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "b9c38c5f5c902879", + "hash_cont_tokens": "e272b5456d5552d6" + }, + "truncated": 0, + "non_truncated": 112, + "padded": 448, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-management|5": { + "hashes": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "76ad9bfb0a5eba90", + "hash_cont_tokens": "7119d4642957b1f0" + }, + "truncated": 0, + "non_truncated": 103, + "padded": 412, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-marketing|5": { + "hashes": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "b37c52ae0d5caf59", + "hash_cont_tokens": "099d58c66ece3f11" + }, + "truncated": 0, + "non_truncated": 234, + "padded": 936, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-medical_genetics|5": { + "hashes": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "00579ebece916954", + "hash_cont_tokens": "bcc22fd85dcc85e9" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-miscellaneous|5": { + "hashes": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "6a53996f93afda8b", 
+ "hash_cont_tokens": "bae342d4e82ba8f7" + }, + "truncated": 0, + "non_truncated": 783, + "padded": 3132, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_disputes|5": { + "hashes": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "f26c133ee5ce21ff", + "hash_cont_tokens": "578c64cbdbb1e0d4" + }, + "truncated": 0, + "non_truncated": 346, + "padded": 1384, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hashes": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "82424ecb6f145bf5", + "hash_cont_tokens": "79b25f42b3fce0f9" + }, + "truncated": 0, + "non_truncated": 895, + "padded": 3580, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-nutrition|5": { + "hashes": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "301695fe42f541f0", + "hash_cont_tokens": "9d1f3b976417156c" + }, + "truncated": 0, + "non_truncated": 306, + "padded": 1224, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-philosophy|5": { + "hashes": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "9613317698fffbd3", + "hash_cont_tokens": "88dab560e1e06d97" + }, + "truncated": 0, + "non_truncated": 311, + "padded": 1244, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-prehistory|5": { + "hashes": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "a77c81e5f2ecee7d", + "hash_cont_tokens": "04ea847139fe9393" + }, + "truncated": 0, + "non_truncated": 324, + "padded": 1296, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_accounting|5": { + "hashes": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "ede14e81d9fbeda3", + "hash_cont_tokens": "0435ff692ad17e68" + }, + "truncated": 0, + "non_truncated": 282, + "padded": 1128, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_law|5": { + "hashes": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "559b4b12bc5ac37c", + "hash_cont_tokens": "b852c74e9f8801bd" + }, + "truncated": 0, + "non_truncated": 1534, + "padded": 6136, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_medicine|5": { + "hashes": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "766f55053f2bdf72", + "hash_cont_tokens": "5db0f6460652d063" + }, + "truncated": 0, + "non_truncated": 272, + "padded": 1088, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_psychology|5": { + "hashes": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "f8a2453499f47d41", + "hash_cont_tokens": "c960676ef7f3dbe5" + }, + "truncated": 0, + "non_truncated": 612, + "padded": 2448, + "non_padded": 0, + "effective_few_shots": 5.0, 
+ "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-public_relations|5": { + "hashes": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "c2a1a681983c46f4", + "hash_cont_tokens": "3320565f412c4b01" + }, + "truncated": 0, + "non_truncated": 110, + "padded": 440, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-security_studies|5": { + "hashes": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "f356c34720b56b6e", + "hash_cont_tokens": "218ed775ef60aab9" + }, + "truncated": 0, + "non_truncated": 245, + "padded": 980, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-sociology|5": { + "hashes": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "6c20ab03b57ea617", + "hash_cont_tokens": "20babf5cc4cc7f3d" + }, + "truncated": 0, + "non_truncated": 201, + "padded": 804, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hashes": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "f66bfb80b77246bf", + "hash_cont_tokens": "bcc22fd85dcc85e9" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-virology|5": { + "hashes": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "45d4c067b9f9c37a", + "hash_cont_tokens": "dc6d57296bea0882" + }, + "truncated": 0, + "non_truncated": 166, + "padded": 664, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-world_religions|5": { + "hashes": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "6b2f9f955e197814", + "hash_cont_tokens": "37f53444db289ed3" + }, + "truncated": 0, + "non_truncated": 171, + "padded": 684, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|truthfulqa:mc|0": { + "hashes": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "1c491ea3183384db", + "hash_cont_tokens": "71a67034827cd30e" + }, + "truncated": 0, + "non_truncated": 817, + "padded": 9996, + "non_padded": 0, + "effective_few_shots": 0.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "0a16c9aa8804bcf9", + "hash_cont_tokens": "c93e9c22fa3077a0" + }, + "truncated": 0, + "non_truncated": 1267, + "padded": 2534, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "a0417d00e6822d02", + "hash_cont_tokens": "fd5f25d044c2f2a7" + }, + "truncated": 0, + "non_truncated": 1319, + "padded": 0, + "non_padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "3b7fa57a057f9415", + "hash_full_prompts": "63615fc50fc9417c", + "hash_input_tokens": "c9e1d352754e9d40", + "hash_cont_tokens": 
"494ad121927a0f6f" + }, + "truncated": 0, + "non_truncated": 28659, + "padded": 113452, + "non_padded": 1420, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/notstoic/PygmalionCoT-7b/results_2023-07-18T12-24-33.017908.json b/eval-results/notstoic/PygmalionCoT-7b/results_2023-07-18T12-24-33.017908.json new file mode 100644 index 0000000000000000000000000000000000000000..b5efdb36b3d0b4fa38e98ca7708c2895b0f6f734 --- /dev/null +++ b/eval-results/notstoic/PygmalionCoT-7b/results_2023-07-18T12-24-33.017908.json @@ -0,0 +1,871 @@ +{ + "results": { + "harness|arc:challenge|25": { + "acc": 0.4872013651877133, + "acc_stderr": 0.014606603181012541, + "acc_norm": 0.514505119453925, + "acc_norm_stderr": 0.014605241081370053 + }, + "harness|hellaswag|10": { + "acc": 0.5707030472017527, + "acc_stderr": 0.004939642460172579, + "acc_norm": 0.7691694881497709, + "acc_norm_stderr": 0.004205030476886523 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.28, + "acc_stderr": 0.045126085985421296, + "acc_norm": 0.28, + "acc_norm_stderr": 0.045126085985421296 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.3925925925925926, + "acc_stderr": 0.04218506215368879, + "acc_norm": 0.3925925925925926, + "acc_norm_stderr": 0.04218506215368879 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.3157894736842105, + "acc_stderr": 0.0378272898086547, + "acc_norm": 0.3157894736842105, + "acc_norm_stderr": 0.0378272898086547 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.41, + "acc_stderr": 0.049431107042371025, + "acc_norm": 0.41, + "acc_norm_stderr": 0.049431107042371025 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.35094339622641507, + "acc_stderr": 0.02937364625323469, + "acc_norm": 0.35094339622641507, + "acc_norm_stderr": 0.02937364625323469 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.3263888888888889, + "acc_stderr": 0.03921067198982266, + "acc_norm": 0.3263888888888889, + "acc_norm_stderr": 0.03921067198982266 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.24, + "acc_stderr": 0.04292346959909282, + "acc_norm": 0.24, + "acc_norm_stderr": 0.04292346959909282 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.32, + "acc_stderr": 0.046882617226215034, + "acc_norm": 0.32, + "acc_norm_stderr": 0.046882617226215034 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.36, + "acc_stderr": 0.04824181513244218, + "acc_norm": 0.36, + "acc_norm_stderr": 0.04824181513244218 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.28901734104046245, + "acc_stderr": 0.03456425745086999, + "acc_norm": 0.28901734104046245, + "acc_norm_stderr": 0.03456425745086999 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.21568627450980393, + "acc_stderr": 0.04092563958237656, + "acc_norm": 0.21568627450980393, + "acc_norm_stderr": 0.04092563958237656 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.45, + "acc_stderr": 0.05, + "acc_norm": 0.45, + "acc_norm_stderr": 0.05 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.3276595744680851, + "acc_stderr": 0.030683020843231008, + "acc_norm": 0.3276595744680851, + "acc_norm_stderr": 0.030683020843231008 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.23684210526315788, + "acc_stderr": 0.039994238792813344, + "acc_norm": 0.23684210526315788, + "acc_norm_stderr": 0.039994238792813344 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 
0.32413793103448274, + "acc_stderr": 0.03900432069185555, + "acc_norm": 0.32413793103448274, + "acc_norm_stderr": 0.03900432069185555 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.25396825396825395, + "acc_stderr": 0.02241804289111395, + "acc_norm": 0.25396825396825395, + "acc_norm_stderr": 0.02241804289111395 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.25396825396825395, + "acc_stderr": 0.03893259610604673, + "acc_norm": 0.25396825396825395, + "acc_norm_stderr": 0.03893259610604673 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.36, + "acc_stderr": 0.04824181513244218, + "acc_norm": 0.36, + "acc_norm_stderr": 0.04824181513244218 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.3161290322580645, + "acc_stderr": 0.02645087448904276, + "acc_norm": 0.3161290322580645, + "acc_norm_stderr": 0.02645087448904276 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.2315270935960591, + "acc_stderr": 0.029678333141444455, + "acc_norm": 0.2315270935960591, + "acc_norm_stderr": 0.029678333141444455 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.31, + "acc_stderr": 0.04648231987117316, + "acc_norm": 0.31, + "acc_norm_stderr": 0.04648231987117316 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.42424242424242425, + "acc_stderr": 0.03859268142070262, + "acc_norm": 0.42424242424242425, + "acc_norm_stderr": 0.03859268142070262 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.3181818181818182, + "acc_stderr": 0.03318477333845331, + "acc_norm": 0.3181818181818182, + "acc_norm_stderr": 0.03318477333845331 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.40414507772020725, + "acc_stderr": 0.03541508578884019, + "acc_norm": 0.40414507772020725, + "acc_norm_stderr": 0.03541508578884019 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.30512820512820515, + "acc_stderr": 0.023346335293325887, + "acc_norm": 0.30512820512820515, + "acc_norm_stderr": 0.023346335293325887 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.22962962962962963, + "acc_stderr": 0.025644108639267634, + "acc_norm": 0.22962962962962963, + "acc_norm_stderr": 0.025644108639267634 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.3487394957983193, + "acc_stderr": 0.030956636328566548, + "acc_norm": 0.3487394957983193, + "acc_norm_stderr": 0.030956636328566548 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.26490066225165565, + "acc_stderr": 0.03603038545360384, + "acc_norm": 0.26490066225165565, + "acc_norm_stderr": 0.03603038545360384 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.3706422018348624, + "acc_stderr": 0.020707458164352984, + "acc_norm": 0.3706422018348624, + "acc_norm_stderr": 0.020707458164352984 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.18055555555555555, + "acc_stderr": 0.02623287897149166, + "acc_norm": 0.18055555555555555, + "acc_norm_stderr": 0.02623287897149166 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.4019607843137255, + "acc_stderr": 0.03441190023482465, + "acc_norm": 0.4019607843137255, + "acc_norm_stderr": 0.03441190023482465 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.41350210970464135, + "acc_stderr": 0.03205649904851858, + "acc_norm": 0.41350210970464135, + "acc_norm_stderr": 0.03205649904851858 + }, + 
"harness|hendrycksTest-human_aging|5": { + "acc": 0.4125560538116592, + "acc_stderr": 0.03304062175449297, + "acc_norm": 0.4125560538116592, + "acc_norm_stderr": 0.03304062175449297 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.32061068702290074, + "acc_stderr": 0.04093329229834278, + "acc_norm": 0.32061068702290074, + "acc_norm_stderr": 0.04093329229834278 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.5371900826446281, + "acc_stderr": 0.04551711196104218, + "acc_norm": 0.5371900826446281, + "acc_norm_stderr": 0.04551711196104218 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.3888888888888889, + "acc_stderr": 0.047128212574267705, + "acc_norm": 0.3888888888888889, + "acc_norm_stderr": 0.047128212574267705 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.3496932515337423, + "acc_stderr": 0.03746668325470021, + "acc_norm": 0.3496932515337423, + "acc_norm_stderr": 0.03746668325470021 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.3125, + "acc_stderr": 0.043994650575715215, + "acc_norm": 0.3125, + "acc_norm_stderr": 0.043994650575715215 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.3106796116504854, + "acc_stderr": 0.0458212416016155, + "acc_norm": 0.3106796116504854, + "acc_norm_stderr": 0.0458212416016155 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.405982905982906, + "acc_stderr": 0.03217180182641086, + "acc_norm": 0.405982905982906, + "acc_norm_stderr": 0.03217180182641086 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.35, + "acc_stderr": 0.047937248544110196, + "acc_norm": 0.35, + "acc_norm_stderr": 0.047937248544110196 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.388250319284802, + "acc_stderr": 0.017427673295544337, + "acc_norm": 0.388250319284802, + "acc_norm_stderr": 0.017427673295544337 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.3554913294797688, + "acc_stderr": 0.025770292082977247, + "acc_norm": 0.3554913294797688, + "acc_norm_stderr": 0.025770292082977247 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.23798882681564246, + "acc_stderr": 0.014242630070574877, + "acc_norm": 0.23798882681564246, + "acc_norm_stderr": 0.014242630070574877 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.31699346405228757, + "acc_stderr": 0.026643278474508755, + "acc_norm": 0.31699346405228757, + "acc_norm_stderr": 0.026643278474508755 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.34726688102893893, + "acc_stderr": 0.027040745502307333, + "acc_norm": 0.34726688102893893, + "acc_norm_stderr": 0.027040745502307333 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.3333333333333333, + "acc_stderr": 0.026229649178821157, + "acc_norm": 0.3333333333333333, + "acc_norm_stderr": 0.026229649178821157 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.2624113475177305, + "acc_stderr": 0.026244920349843, + "acc_norm": 0.2624113475177305, + "acc_norm_stderr": 0.026244920349843 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.2926988265971317, + "acc_stderr": 0.011620949195849526, + "acc_norm": 0.2926988265971317, + "acc_norm_stderr": 0.011620949195849526 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.29411764705882354, + "acc_stderr": 0.02767846864214472, + "acc_norm": 0.29411764705882354, + "acc_norm_stderr": 0.02767846864214472 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.3660130718954248, + "acc_stderr": 0.019488025745529672, + 
"acc_norm": 0.3660130718954248, + "acc_norm_stderr": 0.019488025745529672 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.4, + "acc_stderr": 0.0469237132203465, + "acc_norm": 0.4, + "acc_norm_stderr": 0.0469237132203465 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.30612244897959184, + "acc_stderr": 0.02950489645459595, + "acc_norm": 0.30612244897959184, + "acc_norm_stderr": 0.02950489645459595 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.2885572139303483, + "acc_stderr": 0.03203841040213322, + "acc_norm": 0.2885572139303483, + "acc_norm_stderr": 0.03203841040213322 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.43, + "acc_stderr": 0.049756985195624284, + "acc_norm": 0.43, + "acc_norm_stderr": 0.049756985195624284 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.3132530120481928, + "acc_stderr": 0.036108050180310235, + "acc_norm": 0.3132530120481928, + "acc_norm_stderr": 0.036108050180310235 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.4619883040935672, + "acc_stderr": 0.03823727092882307, + "acc_norm": 0.4619883040935672, + "acc_norm_stderr": 0.03823727092882307 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.3317013463892289, + "mc1_stderr": 0.016482148810241473, + "mc2": 0.48134145706514697, + "mc2_stderr": 0.015248540942050783 + }, + "all": { + "acc": 0.34011474738813613, + "acc_stderr": 0.03412998365791724, + "acc_norm": 0.34394136086634347, + "acc_norm_stderr": 0.034117509520918445, + "mc1": 0.3317013463892289, + "mc1_stderr": 0.016482148810241473, + "mc2": 0.48134145706514697, + "mc2_stderr": 0.015248540942050783 + } + }, + "versions": { + "harness|arc:challenge|25": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + 
"harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "all": 0 + }, + "config": { + "model_name": "notstoic/PygmalionCoT-7b", + "model_sha": "c03ac527360663d17bb142405251028eec843ed9", + "model_dtype": "torch.float16", + "lighteval_sha": "43cff840721bd0214adb4e29236a5e2ca1813937", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null + }, + "task_config": { + "harness|arc:challenge": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": 
"LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task" + }, + "hashes": { + "harness|arc:challenge|25": { + "hash_examples": "fb8c51b1872daeda", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "61571bf68d6d89aa", + "hash_cont_tokens": "8210decc6ff6f7df" + }, + "harness|hellaswag|10": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "29906669b1c7054a", + "hash_cont_tokens": "b3b9e9017afa63af" + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "c54ff61ad0273dd7", + "hash_cont_tokens": "50421e30bef398f9" + }, + "harness|hendrycksTest-anatomy|5": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "be31a1e22aef5f90", + "hash_cont_tokens": "f11971a765cb609f" + }, + "harness|hendrycksTest-astronomy|5": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "277a7b1fad566940", + "hash_cont_tokens": "bf30e5d3f48250cb" + }, + "harness|hendrycksTest-business_ethics|5": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "ba552605bc116de5", + "hash_cont_tokens": "bc1dd9b2d995eb61" + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "428c7563d0b98ab9", + "hash_cont_tokens": "890a119624b3b935" + }, + "harness|hendrycksTest-college_biology|5": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "da036601573942e2", + "hash_cont_tokens": "875cde3af7a0ee14" + }, + "harness|hendrycksTest-college_chemistry|5": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": 
"94e0196d6aded13d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "harness|hendrycksTest-college_computer_science|5": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "6e4d0f4a8d36690b", + "hash_cont_tokens": "ffc0fe414cdc4a83" + }, + "harness|hendrycksTest-college_mathematics|5": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "614054d17109a25d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "harness|hendrycksTest-college_medicine|5": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "1d633b3cc0524ba8", + "hash_cont_tokens": "1f88b00d41957d82" + }, + "harness|hendrycksTest-college_physics|5": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "5421d9a1af86cbd4", + "hash_cont_tokens": "f7b8097afc16a47c" + }, + "harness|hendrycksTest-computer_security|5": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "5e6b70ecb333cf18", + "hash_cont_tokens": "50421e30bef398f9" + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "c2ef11a87264ceed", + "hash_cont_tokens": "aa0e8bc655f2f641" + }, + "harness|hendrycksTest-econometrics|5": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "ecaccd912a4c3978", + "hash_cont_tokens": "bfb7e3c3c88313f1" + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "1590c84291399be8", + "hash_cont_tokens": "2425a3f084a591ef" + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "3269597f715b0da1", + "hash_cont_tokens": "f52691aef15a407b" + }, + "harness|hendrycksTest-formal_logic|5": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "a2800d20f3ab8d7c", + "hash_cont_tokens": "f515d598d9c21263" + }, + "harness|hendrycksTest-global_facts|5": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "94ed44b3772505ad", + "hash_cont_tokens": "50421e30bef398f9" + }, + "harness|hendrycksTest-high_school_biology|5": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "24423acb928db768", + "hash_cont_tokens": "bd85a4156a3613ee" + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "831ff35c474e5cef", + "hash_cont_tokens": "a95c97af1c14e068" + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "8c34e0f2bda77358", + "hash_cont_tokens": "8abfedef914e33c9" + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "f1f73dd687da18d7", + "hash_cont_tokens": "674fc454bdc5ac93" + }, + "harness|hendrycksTest-high_school_geography|5": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": 
"7c5547c7da5bc793", + "hash_cont_tokens": "03a5012b916274ea" + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "f62991cb6a496b05", + "hash_cont_tokens": "a83effb8f76b7d7c" + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "4cef2aff6e3d59ed", + "hash_cont_tokens": "c583432ad27fcfe0" + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "6e2577ea4082ed2b", + "hash_cont_tokens": "24f5dc613660300b" + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "c5fc9aeb1079c8e4", + "hash_cont_tokens": "f47f041de50333b9" + }, + "harness|hendrycksTest-high_school_physics|5": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "555fc385cffa84ca", + "hash_cont_tokens": "ba2efcd283e938cc" + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "febd23cbf9973b7f", + "hash_cont_tokens": "942069cd363844d9" + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "424b02981230ee83", + "hash_cont_tokens": "955ed42b6f7fa019" + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "50c9ff438c85a69e", + "hash_cont_tokens": "cdd0b3dc06d933e5" + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "054824cc474caef5", + "hash_cont_tokens": "9a864184946033ac" + }, + "harness|hendrycksTest-human_aging|5": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "541a75f071dcf579", + "hash_cont_tokens": "142a4a8a1138a214" + }, + "harness|hendrycksTest-human_sexuality|5": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "04269e5c5a257dd9", + "hash_cont_tokens": "bc54813e809b796d" + }, + "harness|hendrycksTest-international_law|5": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "d93ba9d9d38e4397", + "hash_cont_tokens": "dc45b45fcda18e5d" + }, + "harness|hendrycksTest-jurisprudence|5": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "9eeaccd2698b4f5a", + "hash_cont_tokens": "e3a8cd951b6e3469" + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "b4f08f544f2b7576", + "hash_cont_tokens": "1e80dbd30f6453d5" + }, + "harness|hendrycksTest-machine_learning|5": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "900c2a51f1174b9f", + "hash_cont_tokens": "9b37da7777378ca9" + }, + "harness|hendrycksTest-management|5": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + 
"hash_input_tokens": "6b36efb4689c6eca", + "hash_cont_tokens": "a01d6d39a83c4597" + }, + "harness|hendrycksTest-marketing|5": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "2aaac78a0cfed47a", + "hash_cont_tokens": "6aeaed4d823c98aa" + }, + "harness|hendrycksTest-medical_genetics|5": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "886ca823b41c094a", + "hash_cont_tokens": "50421e30bef398f9" + }, + "harness|hendrycksTest-miscellaneous|5": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "72fd71de7675e7d0", + "hash_cont_tokens": "9b0ab02a64603081" + }, + "harness|hendrycksTest-moral_disputes|5": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "f3ca0dd8e7a1eb09", + "hash_cont_tokens": "8badf768f7b0467a" + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "3e793631e951f23c", + "hash_cont_tokens": "32ae620376b2bbba" + }, + "harness|hendrycksTest-nutrition|5": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "59753c2144ea93af", + "hash_cont_tokens": "3071def75bacc404" + }, + "harness|hendrycksTest-philosophy|5": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "bd8d3dbed15a8c34", + "hash_cont_tokens": "9f6ff69d23a48783" + }, + "harness|hendrycksTest-prehistory|5": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "3573cd87facbb7c5", + "hash_cont_tokens": "de469d2b981e32a3" + }, + "harness|hendrycksTest-professional_accounting|5": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "17e721bc1a7cbb47", + "hash_cont_tokens": "c46f74d2dfc7b13b" + }, + "harness|hendrycksTest-professional_law|5": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "9178e10bd0763ec4", + "hash_cont_tokens": "2e590029ef41fbcd" + }, + "harness|hendrycksTest-professional_medicine|5": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "f5a22012a54f70ea", + "hash_cont_tokens": "fe35cfa9c6ca802e" + }, + "harness|hendrycksTest-professional_psychology|5": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "0dfb73a8eb3f692c", + "hash_cont_tokens": "f020fbddf72c8652" + }, + "harness|hendrycksTest-public_relations|5": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "1710c6ba4c9f3cbd", + "hash_cont_tokens": "568f585a259965c1" + }, + "harness|hendrycksTest-security_studies|5": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "d49711415961ced7", + "hash_cont_tokens": "cc6fd7cccd64cd5d" + }, + "harness|hendrycksTest-sociology|5": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "828999f7624cbe7e", + "hash_cont_tokens": "c3a3bdfd177eed5b" + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "42054621e718dbee", + "hash_cont_tokens": 
"2568d0e8e36fa959" + }, + "harness|hendrycksTest-virology|5": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "6c4f0aa4dc859c04", + "hash_cont_tokens": "926cf60b0891f374" + }, + "harness|hendrycksTest-world_religions|5": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "6c75d44e092ff24f", + "hash_cont_tokens": "c525a5de974c1ea3" + }, + "harness|truthfulqa:mc|0": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "2738d7ed7075faa7", + "hash_cont_tokens": "c014154380b74b9e" + } + } +} \ No newline at end of file diff --git a/eval-results/notstoic/PygmalionCoT-7b/results_2023-09-22T15-06-38.792335.json b/eval-results/notstoic/PygmalionCoT-7b/results_2023-09-22T15-06-38.792335.json new file mode 100644 index 0000000000000000000000000000000000000000..e5f5dc3ea68f5cef6bde4ab5509f2e4826e8eab9 --- /dev/null +++ b/eval-results/notstoic/PygmalionCoT-7b/results_2023-09-22T15-06-38.792335.json @@ -0,0 +1,107 @@ +{ + "config_general": { + "model_name": "notstoic/PygmalionCoT-7b", + "model_sha": "c03ac527360663d17bb142405251028eec843ed9", + "model_size": "12.58 GB", + "model_dtype": "torch.float16", + "lighteval_sha": "0f318ecf002208468154899217b3ba7c6ae09374", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|drop|3": { + "em": 0.12111996644295302, + "em_stderr": 0.0033412757702121106, + "f1": 0.17514471476510068, + "f1_stderr": 0.0034689450739406216 + }, + "harness|gsm8k|5": { + "acc": 0.032600454890068235, + "acc_stderr": 0.004891669021939579 + }, + "harness|winogrande|5": { + "acc": 0.6890292028413575, + "acc_stderr": 0.01300953473628606 + }, + "all": { + "em": 0.12111996644295302, + "em_stderr": 0.0033412757702121106, + "f1": 0.17514471476510068, + "f1_stderr": 0.0034689450739406216, + "acc": 0.36081482886571287, + "acc_stderr": 0.00895060187911282 + } + }, + "versions": { + "harness|drop|3": 1, + "harness|gsm8k|5": 0, + "harness|winogrande|5": 0, + "all": 0 + }, + "config_tasks": { + "harness|drop": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|drop|3": { + "hashes": { + "hash_examples": "1d27416e8324e9a3", + "hash_full_prompts": "a5513ff9a741b385", + "hash_input_tokens": "61b608e0b5ceed76", + "hash_cont_tokens": "fdb99f739898b91d" + }, + "truncated": 1263, + "non-truncated": 8273, + "padded": 0, + "non-padded": 9536, + "effective_few_shots": 3.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "bda342e47b5099b2", + "hash_cont_tokens": "af04e0574792e183" + }, + "truncated": 0, + "non-truncated": 1319, + "padded": 0, + "non-padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "c0bedf98cb040854", + "hash_cont_tokens": "f08975ad6f2d5864" + }, + "truncated": 0, + "non-truncated": 2534, + "padded": 2432, + "non-padded": 102, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "9b4d8993161e637d", + "hash_full_prompts": "08215e527b7e60a5", + "hash_input_tokens": 
"80afe720f936f8d2", + "hash_cont_tokens": "fde2f58fbd8994ef" + }, + "total_evaluation_time_secondes": "10943.194388389587", + "truncated": 1263, + "non-truncated": 12126, + "padded": 2432, + "non-padded": 10957, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/oh-yeontaek/llama-2-13B-LoRA-assemble/results_2023-09-13T23-30-08.066135.json b/eval-results/oh-yeontaek/llama-2-13B-LoRA-assemble/results_2023-09-13T23-30-08.066135.json new file mode 100644 index 0000000000000000000000000000000000000000..b3d880a3f43c571b1a49c27c5b5b1ee45954778d --- /dev/null +++ b/eval-results/oh-yeontaek/llama-2-13B-LoRA-assemble/results_2023-09-13T23-30-08.066135.json @@ -0,0 +1,1367 @@ +{ + "config_general": { + "model_name": "oh-yeontaek/llama-2-13B-LoRA-assemble", + "model_sha": "85bb49d333dba4a08b051418663d16853ce30cee", + "model_size": "24.32 GB", + "model_dtype": "torch.float16", + "lighteval_sha": "ff467795ccc45b291b69333c263d5f16abd1fcd9", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|arc:challenge|25": { + "acc": 0.6006825938566553, + "acc_stderr": 0.014312094557946702, + "acc_norm": 0.6356655290102389, + "acc_norm_stderr": 0.014063260279882417 + }, + "harness|hellaswag|10": { + "acc": 0.6361282613025294, + "acc_stderr": 0.004801290954387088, + "acc_norm": 0.8350926110336586, + "acc_norm_stderr": 0.0037033852685121747 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.3, + "acc_stderr": 0.046056618647183814, + "acc_norm": 0.3, + "acc_norm_stderr": 0.046056618647183814 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.5259259259259259, + "acc_stderr": 0.04313531696750575, + "acc_norm": 0.5259259259259259, + "acc_norm_stderr": 0.04313531696750575 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.618421052631579, + "acc_stderr": 0.03953173377749194, + "acc_norm": 0.618421052631579, + "acc_norm_stderr": 0.03953173377749194 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.56, + "acc_stderr": 0.04988876515698589, + "acc_norm": 0.56, + "acc_norm_stderr": 0.04988876515698589 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.6226415094339622, + "acc_stderr": 0.029832808114796005, + "acc_norm": 0.6226415094339622, + "acc_norm_stderr": 0.029832808114796005 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.6597222222222222, + "acc_stderr": 0.039621355734862175, + "acc_norm": 0.6597222222222222, + "acc_norm_stderr": 0.039621355734862175 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.42, + "acc_stderr": 0.049604496374885836, + "acc_norm": 0.42, + "acc_norm_stderr": 0.049604496374885836 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.54, + "acc_stderr": 0.05009082659620333, + "acc_norm": 0.54, + "acc_norm_stderr": 0.05009082659620333 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.35, + "acc_stderr": 0.047937248544110196, + "acc_norm": 0.35, + "acc_norm_stderr": 0.047937248544110196 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.5953757225433526, + "acc_stderr": 0.03742461193887248, + "acc_norm": 0.5953757225433526, + "acc_norm_stderr": 0.03742461193887248 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.3333333333333333, + "acc_stderr": 0.04690650298201942, + "acc_norm": 0.3333333333333333, + "acc_norm_stderr": 0.04690650298201942 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.69, + "acc_stderr": 
0.04648231987117316, + "acc_norm": 0.69, + "acc_norm_stderr": 0.04648231987117316 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.48936170212765956, + "acc_stderr": 0.03267862331014063, + "acc_norm": 0.48936170212765956, + "acc_norm_stderr": 0.03267862331014063 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.35964912280701755, + "acc_stderr": 0.04514496132873634, + "acc_norm": 0.35964912280701755, + "acc_norm_stderr": 0.04514496132873634 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.5655172413793104, + "acc_stderr": 0.04130740879555498, + "acc_norm": 0.5655172413793104, + "acc_norm_stderr": 0.04130740879555498 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.34656084656084657, + "acc_stderr": 0.024508777521028424, + "acc_norm": 0.34656084656084657, + "acc_norm_stderr": 0.024508777521028424 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.35714285714285715, + "acc_stderr": 0.04285714285714281, + "acc_norm": 0.35714285714285715, + "acc_norm_stderr": 0.04285714285714281 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.39, + "acc_stderr": 0.04902071300001975, + "acc_norm": 0.39, + "acc_norm_stderr": 0.04902071300001975 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.6709677419354839, + "acc_stderr": 0.026729499068349958, + "acc_norm": 0.6709677419354839, + "acc_norm_stderr": 0.026729499068349958 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.4876847290640394, + "acc_stderr": 0.035169204442208966, + "acc_norm": 0.4876847290640394, + "acc_norm_stderr": 0.035169204442208966 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.62, + "acc_stderr": 0.04878317312145632, + "acc_norm": 0.62, + "acc_norm_stderr": 0.04878317312145632 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.7212121212121212, + "acc_stderr": 0.035014387062967806, + "acc_norm": 0.7212121212121212, + "acc_norm_stderr": 0.035014387062967806 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.7777777777777778, + "acc_stderr": 0.029620227874790482, + "acc_norm": 0.7777777777777778, + "acc_norm_stderr": 0.029620227874790482 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.8808290155440415, + "acc_stderr": 0.02338193534812143, + "acc_norm": 0.8808290155440415, + "acc_norm_stderr": 0.02338193534812143 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.6205128205128205, + "acc_stderr": 0.02460362692409742, + "acc_norm": 0.6205128205128205, + "acc_norm_stderr": 0.02460362692409742 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.32592592592592595, + "acc_stderr": 0.02857834836547307, + "acc_norm": 0.32592592592592595, + "acc_norm_stderr": 0.02857834836547307 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.6008403361344538, + "acc_stderr": 0.03181110032413926, + "acc_norm": 0.6008403361344538, + "acc_norm_stderr": 0.03181110032413926 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.31788079470198677, + "acc_stderr": 0.038020397601079024, + "acc_norm": 0.31788079470198677, + "acc_norm_stderr": 0.038020397601079024 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.7926605504587156, + "acc_stderr": 0.01738141556360868, + "acc_norm": 0.7926605504587156, + "acc_norm_stderr": 0.01738141556360868 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.42592592592592593, + "acc_stderr": 
0.033723432716530645, + "acc_norm": 0.42592592592592593, + "acc_norm_stderr": 0.033723432716530645 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.8333333333333334, + "acc_stderr": 0.02615686752393104, + "acc_norm": 0.8333333333333334, + "acc_norm_stderr": 0.02615686752393104 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.7848101265822784, + "acc_stderr": 0.02675082699467617, + "acc_norm": 0.7848101265822784, + "acc_norm_stderr": 0.02675082699467617 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.6860986547085202, + "acc_stderr": 0.03114679648297246, + "acc_norm": 0.6860986547085202, + "acc_norm_stderr": 0.03114679648297246 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.6793893129770993, + "acc_stderr": 0.04093329229834278, + "acc_norm": 0.6793893129770993, + "acc_norm_stderr": 0.04093329229834278 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.7272727272727273, + "acc_stderr": 0.04065578140908706, + "acc_norm": 0.7272727272727273, + "acc_norm_stderr": 0.04065578140908706 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.7685185185185185, + "acc_stderr": 0.04077494709252627, + "acc_norm": 0.7685185185185185, + "acc_norm_stderr": 0.04077494709252627 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.6993865030674846, + "acc_stderr": 0.03602511318806771, + "acc_norm": 0.6993865030674846, + "acc_norm_stderr": 0.03602511318806771 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.39285714285714285, + "acc_stderr": 0.04635550135609976, + "acc_norm": 0.39285714285714285, + "acc_norm_stderr": 0.04635550135609976 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.7572815533980582, + "acc_stderr": 0.04245022486384495, + "acc_norm": 0.7572815533980582, + "acc_norm_stderr": 0.04245022486384495 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.8461538461538461, + "acc_stderr": 0.023636873317489294, + "acc_norm": 0.8461538461538461, + "acc_norm_stderr": 0.023636873317489294 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.58, + "acc_stderr": 0.049604496374885836, + "acc_norm": 0.58, + "acc_norm_stderr": 0.049604496374885836 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.7956577266922095, + "acc_stderr": 0.0144191239809319, + "acc_norm": 0.7956577266922095, + "acc_norm_stderr": 0.0144191239809319 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.661849710982659, + "acc_stderr": 0.025469770149400172, + "acc_norm": 0.661849710982659, + "acc_norm_stderr": 0.025469770149400172 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.4849162011173184, + "acc_stderr": 0.016714890379996062, + "acc_norm": 0.4849162011173184, + "acc_norm_stderr": 0.016714890379996062 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.673202614379085, + "acc_stderr": 0.026857294663281416, + "acc_norm": 0.673202614379085, + "acc_norm_stderr": 0.026857294663281416 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.6881028938906752, + "acc_stderr": 0.026311858071854155, + "acc_norm": 0.6881028938906752, + "acc_norm_stderr": 0.026311858071854155 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.7160493827160493, + "acc_stderr": 0.025089478523765137, + "acc_norm": 0.7160493827160493, + "acc_norm_stderr": 0.025089478523765137 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.4716312056737589, + "acc_stderr": 0.029779450957303062, + "acc_norm": 0.4716312056737589, + "acc_norm_stderr": 0.029779450957303062 + 
}, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.4595827900912647, + "acc_stderr": 0.012728446067669956, + "acc_norm": 0.4595827900912647, + "acc_norm_stderr": 0.012728446067669956 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.6102941176470589, + "acc_stderr": 0.0296246635811597, + "acc_norm": 0.6102941176470589, + "acc_norm_stderr": 0.0296246635811597 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.5882352941176471, + "acc_stderr": 0.019910377463105932, + "acc_norm": 0.5882352941176471, + "acc_norm_stderr": 0.019910377463105932 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.6636363636363637, + "acc_stderr": 0.04525393596302505, + "acc_norm": 0.6636363636363637, + "acc_norm_stderr": 0.04525393596302505 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.673469387755102, + "acc_stderr": 0.03002105623844031, + "acc_norm": 0.673469387755102, + "acc_norm_stderr": 0.03002105623844031 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.7661691542288557, + "acc_stderr": 0.029929415408348384, + "acc_norm": 0.7661691542288557, + "acc_norm_stderr": 0.029929415408348384 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.83, + "acc_stderr": 0.0377525168068637, + "acc_norm": 0.83, + "acc_norm_stderr": 0.0377525168068637 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.4939759036144578, + "acc_stderr": 0.03892212195333045, + "acc_norm": 0.4939759036144578, + "acc_norm_stderr": 0.03892212195333045 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.8011695906432749, + "acc_stderr": 0.030611116557432528, + "acc_norm": 0.8011695906432749, + "acc_norm_stderr": 0.030611116557432528 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.4039167686658507, + "mc1_stderr": 0.017177276822584284, + "mc2": 0.5595996097364623, + "mc2_stderr": 0.015690304235652236 + }, + "all": { + "acc": 0.5989105794324632, + "acc_stderr": 0.03386180683240171, + "acc_norm": 0.6028757876508483, + "acc_norm_stderr": 0.033838980731318, + "mc1": 0.4039167686658507, + "mc1_stderr": 0.017177276822584284, + "mc2": 0.5595996097364623, + "mc2_stderr": 0.015690304235652236 + } + }, + "versions": { + "harness|arc:challenge|25": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + 
"harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "all": 0 + }, + "config_tasks": { + "harness|arc:challenge": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + 
"harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task" + }, + "summary_tasks": { + "harness|arc:challenge|25": { + "hashes": { + "hash_examples": "17b0cae357c0259e", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "3722289b79076c44", + "hash_cont_tokens": "e8abf848493b50f7" + }, + "truncated": 0, + "non-truncated": 4687, + "padded": 4687, + "non-padded": 0, + "effective_few_shots": 25.0, + "num_truncated_few_shots": 0 + }, + "harness|hellaswag|10": { + "hashes": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "ececd684171f1ef2", + "hash_cont_tokens": "9fe0a5c42e1532db" + }, + "truncated": 0, + "non-truncated": 40168, + "padded": 40113, + "non-padded": 55, + "effective_few_shots": 10.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hashes": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "c54ff61ad0273dd7", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-anatomy|5": { + "hashes": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "be31a1e22aef5f90", + "hash_cont_tokens": "f11971a765cb609f" + }, + "truncated": 0, + "non-truncated": 540, + "padded": 540, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-astronomy|5": { + "hashes": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": 
"faf4e80f65de93ca", + "hash_input_tokens": "277a7b1fad566940", + "hash_cont_tokens": "440a970fadecdc7b" + }, + "truncated": 0, + "non-truncated": 608, + "padded": 608, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-business_ethics|5": { + "hashes": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "ba552605bc116de5", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hashes": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "428c7563d0b98ab9", + "hash_cont_tokens": "7ecd60c25b9bfe5b" + }, + "truncated": 0, + "non-truncated": 1060, + "padded": 1060, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_biology|5": { + "hashes": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "da036601573942e2", + "hash_cont_tokens": "875cde3af7a0ee14" + }, + "truncated": 0, + "non-truncated": 576, + "padded": 576, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_chemistry|5": { + "hashes": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "94e0196d6aded13d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_computer_science|5": { + "hashes": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "6e4d0f4a8d36690b", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_mathematics|5": { + "hashes": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "614054d17109a25d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_medicine|5": { + "hashes": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "081bb2b524defd1c", + "hash_cont_tokens": "702fb6d82ff0d6ac" + }, + "truncated": 0, + "non-truncated": 692, + "padded": 692, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_physics|5": { + "hashes": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "5421d9a1af86cbd4", + "hash_cont_tokens": "f7b8097afc16a47c" + }, + "truncated": 0, + "non-truncated": 408, + "padded": 408, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-computer_security|5": { + "hashes": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "5e6b70ecb333cf18", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 
400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hashes": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "c2ef11a87264ceed", + "hash_cont_tokens": "aa0e8bc655f2f641" + }, + "truncated": 0, + "non-truncated": 940, + "padded": 940, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-econometrics|5": { + "hashes": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "ecaccd912a4c3978", + "hash_cont_tokens": "b1cc6e7e9fcd3827" + }, + "truncated": 0, + "non-truncated": 456, + "padded": 456, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hashes": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "1590c84291399be8", + "hash_cont_tokens": "2425a3f084a591ef" + }, + "truncated": 0, + "non-truncated": 580, + "padded": 580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hashes": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "3269597f715b0da1", + "hash_cont_tokens": "bd87bf0c060fd925" + }, + "truncated": 0, + "non-truncated": 1512, + "padded": 1512, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-formal_logic|5": { + "hashes": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "a2800d20f3ab8d7c", + "hash_cont_tokens": "eb8932890e0605db" + }, + "truncated": 0, + "non-truncated": 504, + "padded": 504, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-global_facts|5": { + "hashes": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "94ed44b3772505ad", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_biology|5": { + "hashes": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "24423acb928db768", + "hash_cont_tokens": "1ddcb86d28cde266" + }, + "truncated": 0, + "non-truncated": 1240, + "padded": 1240, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hashes": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "831ff35c474e5cef", + "hash_cont_tokens": "176c8dcff38c5f8f" + }, + "truncated": 0, + "non-truncated": 812, + "padded": 812, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hashes": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "a20a96b44dcc5b30", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + 
"harness|hendrycksTest-high_school_european_history|5": { + "hashes": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "5002f4ac8b1562ca", + "hash_cont_tokens": "674fc454bdc5ac93" + }, + "truncated": 0, + "non-truncated": 660, + "padded": 656, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_geography|5": { + "hashes": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "7c5547c7da5bc793", + "hash_cont_tokens": "03a5012b916274ea" + }, + "truncated": 0, + "non-truncated": 792, + "padded": 792, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hashes": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "f62991cb6a496b05", + "hash_cont_tokens": "873d2aab226ba1d8" + }, + "truncated": 0, + "non-truncated": 772, + "padded": 772, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hashes": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "4cef2aff6e3d59ed", + "hash_cont_tokens": "c583432ad27fcfe0" + }, + "truncated": 0, + "non-truncated": 1560, + "padded": 1560, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hashes": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "6e2577ea4082ed2b", + "hash_cont_tokens": "d7907b61bcb8c123" + }, + "truncated": 0, + "non-truncated": 1080, + "padded": 1080, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hashes": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "c5fc9aeb1079c8e4", + "hash_cont_tokens": "f47f041de50333b9" + }, + "truncated": 0, + "non-truncated": 952, + "padded": 952, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_physics|5": { + "hashes": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "555fc385cffa84ca", + "hash_cont_tokens": "0d56317b3e5eedb5" + }, + "truncated": 0, + "non-truncated": 604, + "padded": 604, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hashes": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "febd23cbf9973b7f", + "hash_cont_tokens": "09ba1243e7390c0f" + }, + "truncated": 0, + "non-truncated": 2180, + "padded": 2180, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hashes": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "400e55b56ee6fbd7", + "hash_cont_tokens": "9cc29889c3d3f77d" + }, + "truncated": 0, + "non-truncated": 864, + "padded": 864, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hashes": { + 
"hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "c639cce12a46ebad", + "hash_cont_tokens": "cdd0b3dc06d933e5" + }, + "truncated": 0, + "non-truncated": 816, + "padded": 816, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hashes": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "b9762065cce6f3a6", + "hash_cont_tokens": "e02816433ff28daf" + }, + "truncated": 0, + "non-truncated": 948, + "padded": 948, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_aging|5": { + "hashes": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "541a75f071dcf579", + "hash_cont_tokens": "142a4a8a1138a214" + }, + "truncated": 0, + "non-truncated": 892, + "padded": 892, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_sexuality|5": { + "hashes": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "04269e5c5a257dd9", + "hash_cont_tokens": "bc54813e809b796d" + }, + "truncated": 0, + "non-truncated": 524, + "padded": 524, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-international_law|5": { + "hashes": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "d93ba9d9d38e4397", + "hash_cont_tokens": "8ea8c5ff76a15bca" + }, + "truncated": 0, + "non-truncated": 484, + "padded": 484, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-jurisprudence|5": { + "hashes": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "9eeaccd2698b4f5a", + "hash_cont_tokens": "e3a8cd951b6e3469" + }, + "truncated": 0, + "non-truncated": 432, + "padded": 432, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hashes": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "b4f08f544f2b7576", + "hash_cont_tokens": "3e9e0bdc248fd88a" + }, + "truncated": 0, + "non-truncated": 652, + "padded": 648, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-machine_learning|5": { + "hashes": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "900c2a51f1174b9f", + "hash_cont_tokens": "55b12fb138c6a064" + }, + "truncated": 0, + "non-truncated": 448, + "padded": 448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-management|5": { + "hashes": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "6b36efb4689c6eca", + "hash_cont_tokens": "a01d6d39a83c4597" + }, + "truncated": 0, + "non-truncated": 412, + "padded": 412, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-marketing|5": { + "hashes": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "2aaac78a0cfed47a", + "hash_cont_tokens": "6aeaed4d823c98aa" + }, + 
"truncated": 0, + "non-truncated": 936, + "padded": 936, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-medical_genetics|5": { + "hashes": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "886ca823b41c094a", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-miscellaneous|5": { + "hashes": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "72fd71de7675e7d0", + "hash_cont_tokens": "9b0ab02a64603081" + }, + "truncated": 0, + "non-truncated": 3132, + "padded": 3132, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_disputes|5": { + "hashes": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "f3ca0dd8e7a1eb09", + "hash_cont_tokens": "3b8bbe9108e55ce9" + }, + "truncated": 0, + "non-truncated": 1384, + "padded": 1354, + "non-padded": 30, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hashes": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "3e793631e951f23c", + "hash_cont_tokens": "3e9bfc0362e97330" + }, + "truncated": 0, + "non-truncated": 3580, + "padded": 3580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-nutrition|5": { + "hashes": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "59753c2144ea93af", + "hash_cont_tokens": "23b2dc6ee2da4cfc" + }, + "truncated": 0, + "non-truncated": 1224, + "padded": 1224, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-philosophy|5": { + "hashes": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "bd8d3dbed15a8c34", + "hash_cont_tokens": "9f6ff69d23a48783" + }, + "truncated": 0, + "non-truncated": 1244, + "padded": 1244, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-prehistory|5": { + "hashes": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "3573cd87facbb7c5", + "hash_cont_tokens": "d6458d743d875837" + }, + "truncated": 0, + "non-truncated": 1296, + "padded": 1296, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_accounting|5": { + "hashes": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "17e721bc1a7cbb47", + "hash_cont_tokens": "922a195f53a35662" + }, + "truncated": 0, + "non-truncated": 1128, + "padded": 1128, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_law|5": { + "hashes": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "c9f7583fff66d361", + "hash_cont_tokens": "2e590029ef41fbcd" + }, + "truncated": 0, + "non-truncated": 6136, + "padded": 6136, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + 
"harness|hendrycksTest-professional_medicine|5": { + "hashes": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "40a933f829116f8d", + "hash_cont_tokens": "7cfee54dbddd5a98" + }, + "truncated": 0, + "non-truncated": 1088, + "padded": 1088, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_psychology|5": { + "hashes": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "0dfb73a8eb3f692c", + "hash_cont_tokens": "a86677b2a45c20e1" + }, + "truncated": 0, + "non-truncated": 2448, + "padded": 2448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-public_relations|5": { + "hashes": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "1710c6ba4c9f3cbd", + "hash_cont_tokens": "0d756ccaae031757" + }, + "truncated": 0, + "non-truncated": 440, + "padded": 440, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-security_studies|5": { + "hashes": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "32a03f1f22a6e103", + "hash_cont_tokens": "b2229bc2cfbf594b" + }, + "truncated": 0, + "non-truncated": 980, + "padded": 980, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-sociology|5": { + "hashes": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "828999f7624cbe7e", + "hash_cont_tokens": "c3a3bdfd177eed5b" + }, + "truncated": 0, + "non-truncated": 804, + "padded": 804, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hashes": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "42054621e718dbee", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-virology|5": { + "hashes": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "6c4f0aa4dc859c04", + "hash_cont_tokens": "af8b3658088cb37f" + }, + "truncated": 0, + "non-truncated": 664, + "padded": 664, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-world_religions|5": { + "hashes": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "6c75d44e092ff24f", + "hash_cont_tokens": "060118bef6de4e0a" + }, + "truncated": 0, + "non-truncated": 684, + "padded": 684, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|truthfulqa:mc|0": { + "hashes": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "2738d7ed7075faa7", + "hash_cont_tokens": "f5da56a132aab151" + }, + "truncated": 0, + "non-truncated": 9996, + "padded": 9996, + "non-padded": 0, + "effective_few_shots": 0.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "d84d18e9a963753d", + "hash_full_prompts": "12b540783521a8e6", + "hash_input_tokens": "5c73a7dce6ccf737", + 
"hash_cont_tokens": "71d56183130fecbd" + }, + "total_evaluation_time_secondes": "6366.7631549835205", + "truncated": 0, + "non-truncated": 111019, + "padded": 110926, + "non-padded": 93, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/oh-yeontaek/llama-2-13B-LoRA-assemble/results_2023-10-28T12-38-31.031518.json b/eval-results/oh-yeontaek/llama-2-13B-LoRA-assemble/results_2023-10-28T12-38-31.031518.json new file mode 100644 index 0000000000000000000000000000000000000000..340c54ac839d43dc87dba4e3d23085858f8c4874 --- /dev/null +++ b/eval-results/oh-yeontaek/llama-2-13B-LoRA-assemble/results_2023-10-28T12-38-31.031518.json @@ -0,0 +1,107 @@ +{ + "config_general": { + "model_name": "oh-yeontaek/llama-2-13B-LoRA-assemble", + "model_sha": "85bb49d333dba4a08b051418663d16853ce30cee", + "model_size": "24.32 GB", + "model_dtype": "torch.float16", + "lighteval_sha": "0f318ecf002208468154899217b3ba7c6ae09374", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|drop|3": { + "em": 0.018246644295302015, + "em_stderr": 0.0013706682452812897, + "f1": 0.12087667785234917, + "f1_stderr": 0.002262552570535497 + }, + "harness|gsm8k|5": { + "acc": 0.0841546626231994, + "acc_stderr": 0.0076470240466032045 + }, + "harness|winogrande|5": { + "acc": 0.7616416732438832, + "acc_stderr": 0.011974948667702302 + }, + "all": { + "em": 0.018246644295302015, + "em_stderr": 0.0013706682452812897, + "f1": 0.12087667785234917, + "f1_stderr": 0.002262552570535497, + "acc": 0.4228981679335413, + "acc_stderr": 0.009810986357152753 + } + }, + "versions": { + "harness|drop|3": 1, + "harness|gsm8k|5": 0, + "harness|winogrande|5": 0, + "all": 0 + }, + "config_tasks": { + "harness|drop": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|drop|3": { + "hashes": { + "hash_examples": "1d27416e8324e9a3", + "hash_full_prompts": "a5513ff9a741b385", + "hash_input_tokens": "42076f0efbb50aa6", + "hash_cont_tokens": "9ff5913580f0b287" + }, + "truncated": 3, + "non-truncated": 9533, + "padded": 0, + "non-padded": 9536, + "effective_few_shots": 3.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "bda342e47b5099b2", + "hash_cont_tokens": "2956cb1e0dc8dba0" + }, + "truncated": 0, + "non-truncated": 1319, + "padded": 0, + "non-padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "c0bedf98cb040854", + "hash_cont_tokens": "f08975ad6f2d5864" + }, + "truncated": 0, + "non-truncated": 2534, + "padded": 2432, + "non-padded": 102, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "9b4d8993161e637d", + "hash_full_prompts": "08215e527b7e60a5", + "hash_input_tokens": "a12f3e3c934bd78b", + "hash_cont_tokens": "435c82431df70ae7" + }, + "total_evaluation_time_secondes": "11780.806861639023", + "truncated": 3, + "non-truncated": 13386, + "padded": 2432, + "non-padded": 10957, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/oh-yeontaek/llama-2-70B-LoRA-assemble-v2/results_2023-09-15T16-06-18.387785.json 
b/eval-results/oh-yeontaek/llama-2-70B-LoRA-assemble-v2/results_2023-09-15T16-06-18.387785.json new file mode 100644 index 0000000000000000000000000000000000000000..e4e80ff70d831fbdd69e88a5b6ea62dcb8b83717 --- /dev/null +++ b/eval-results/oh-yeontaek/llama-2-70B-LoRA-assemble-v2/results_2023-09-15T16-06-18.387785.json @@ -0,0 +1,1367 @@ +{ + "config_general": { + "model_name": "oh-yeontaek/llama-2-70B-LoRA-assemble-v2", + "model_sha": "7feeb5b665ab1ecdfd9cc4fe45fadb86b7b91b5b", + "model_size": "128.64 GB", + "model_dtype": "torch.float16", + "lighteval_sha": "ff467795ccc45b291b69333c263d5f16abd1fcd9", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|arc:challenge|25": { + "acc": 0.6834470989761092, + "acc_stderr": 0.013592431519068079, + "acc_norm": 0.7184300341296929, + "acc_norm_stderr": 0.013143376735009022 + }, + "harness|hellaswag|10": { + "acc": 0.6721768571997611, + "acc_stderr": 0.0046846063106423304, + "acc_norm": 0.8688508265285799, + "acc_norm_stderr": 0.00336873543416138 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.43, + "acc_stderr": 0.049756985195624284, + "acc_norm": 0.43, + "acc_norm_stderr": 0.049756985195624284 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.6148148148148148, + "acc_stderr": 0.042039210401562783, + "acc_norm": 0.6148148148148148, + "acc_norm_stderr": 0.042039210401562783 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.7828947368421053, + "acc_stderr": 0.03355045304882924, + "acc_norm": 0.7828947368421053, + "acc_norm_stderr": 0.03355045304882924 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.75, + "acc_stderr": 0.04351941398892446, + "acc_norm": 0.75, + "acc_norm_stderr": 0.04351941398892446 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.7433962264150943, + "acc_stderr": 0.02688064788905199, + "acc_norm": 0.7433962264150943, + "acc_norm_stderr": 0.02688064788905199 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.8194444444444444, + "acc_stderr": 0.03216600808802267, + "acc_norm": 0.8194444444444444, + "acc_norm_stderr": 0.03216600808802267 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.48, + "acc_stderr": 0.050211673156867795, + "acc_norm": 0.48, + "acc_norm_stderr": 0.050211673156867795 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.61, + "acc_stderr": 0.04902071300001975, + "acc_norm": 0.61, + "acc_norm_stderr": 0.04902071300001975 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.36, + "acc_stderr": 0.048241815132442176, + "acc_norm": 0.36, + "acc_norm_stderr": 0.048241815132442176 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.6589595375722543, + "acc_stderr": 0.036146654241808254, + "acc_norm": 0.6589595375722543, + "acc_norm_stderr": 0.036146654241808254 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.3627450980392157, + "acc_stderr": 0.04784060704105654, + "acc_norm": 0.3627450980392157, + "acc_norm_stderr": 0.04784060704105654 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.74, + "acc_stderr": 0.04408440022768078, + "acc_norm": 0.74, + "acc_norm_stderr": 0.04408440022768078 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.6553191489361702, + "acc_stderr": 0.031068985963122145, + "acc_norm": 0.6553191489361702, + "acc_norm_stderr": 0.031068985963122145 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.41228070175438597, + 
"acc_stderr": 0.04630653203366595, + "acc_norm": 0.41228070175438597, + "acc_norm_stderr": 0.04630653203366595 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.6344827586206897, + "acc_stderr": 0.04013124195424386, + "acc_norm": 0.6344827586206897, + "acc_norm_stderr": 0.04013124195424386 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.4656084656084656, + "acc_stderr": 0.02569032176249384, + "acc_norm": 0.4656084656084656, + "acc_norm_stderr": 0.02569032176249384 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.4603174603174603, + "acc_stderr": 0.04458029125470973, + "acc_norm": 0.4603174603174603, + "acc_norm_stderr": 0.04458029125470973 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.47, + "acc_stderr": 0.05016135580465919, + "acc_norm": 0.47, + "acc_norm_stderr": 0.05016135580465919 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.832258064516129, + "acc_stderr": 0.021255464065371314, + "acc_norm": 0.832258064516129, + "acc_norm_stderr": 0.021255464065371314 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.5467980295566502, + "acc_stderr": 0.03502544650845872, + "acc_norm": 0.5467980295566502, + "acc_norm_stderr": 0.03502544650845872 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.78, + "acc_stderr": 0.04163331998932262, + "acc_norm": 0.78, + "acc_norm_stderr": 0.04163331998932262 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.8303030303030303, + "acc_stderr": 0.02931118867498312, + "acc_norm": 0.8303030303030303, + "acc_norm_stderr": 0.02931118867498312 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.9040404040404041, + "acc_stderr": 0.020984808610047933, + "acc_norm": 0.9040404040404041, + "acc_norm_stderr": 0.020984808610047933 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.927461139896373, + "acc_stderr": 0.018718998520678178, + "acc_norm": 0.927461139896373, + "acc_norm_stderr": 0.018718998520678178 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.6974358974358974, + "acc_stderr": 0.02329088805377272, + "acc_norm": 0.6974358974358974, + "acc_norm_stderr": 0.02329088805377272 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.32592592592592595, + "acc_stderr": 0.028578348365473072, + "acc_norm": 0.32592592592592595, + "acc_norm_stderr": 0.028578348365473072 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.7815126050420168, + "acc_stderr": 0.026841514322958934, + "acc_norm": 0.7815126050420168, + "acc_norm_stderr": 0.026841514322958934 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.47019867549668876, + "acc_stderr": 0.040752249922169775, + "acc_norm": 0.47019867549668876, + "acc_norm_stderr": 0.040752249922169775 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.8844036697247707, + "acc_stderr": 0.013708749534172636, + "acc_norm": 0.8844036697247707, + "acc_norm_stderr": 0.013708749534172636 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.5648148148148148, + "acc_stderr": 0.033812000056435254, + "acc_norm": 0.5648148148148148, + "acc_norm_stderr": 0.033812000056435254 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.8970588235294118, + "acc_stderr": 0.021328337570804365, + "acc_norm": 0.8970588235294118, + "acc_norm_stderr": 0.021328337570804365 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 
0.8776371308016878, + "acc_stderr": 0.02133174182974679, + "acc_norm": 0.8776371308016878, + "acc_norm_stderr": 0.02133174182974679 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.7668161434977578, + "acc_stderr": 0.028380391147094706, + "acc_norm": 0.7668161434977578, + "acc_norm_stderr": 0.028380391147094706 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.8244274809160306, + "acc_stderr": 0.03336820338476073, + "acc_norm": 0.8244274809160306, + "acc_norm_stderr": 0.03336820338476073 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.8512396694214877, + "acc_stderr": 0.03248470083807194, + "acc_norm": 0.8512396694214877, + "acc_norm_stderr": 0.03248470083807194 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.8333333333333334, + "acc_stderr": 0.03602814176392645, + "acc_norm": 0.8333333333333334, + "acc_norm_stderr": 0.03602814176392645 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.8220858895705522, + "acc_stderr": 0.03004735765580663, + "acc_norm": 0.8220858895705522, + "acc_norm_stderr": 0.03004735765580663 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.49107142857142855, + "acc_stderr": 0.04745033255489123, + "acc_norm": 0.49107142857142855, + "acc_norm_stderr": 0.04745033255489123 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.8252427184466019, + "acc_stderr": 0.0376017800602662, + "acc_norm": 0.8252427184466019, + "acc_norm_stderr": 0.0376017800602662 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.8888888888888888, + "acc_stderr": 0.02058849131609238, + "acc_norm": 0.8888888888888888, + "acc_norm_stderr": 0.02058849131609238 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.71, + "acc_stderr": 0.04560480215720684, + "acc_norm": 0.71, + "acc_norm_stderr": 0.04560480215720684 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.8659003831417624, + "acc_stderr": 0.012185528166499983, + "acc_norm": 0.8659003831417624, + "acc_norm_stderr": 0.012185528166499983 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.7774566473988439, + "acc_stderr": 0.02239421566194282, + "acc_norm": 0.7774566473988439, + "acc_norm_stderr": 0.02239421566194282 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.5597765363128492, + "acc_stderr": 0.01660256461504993, + "acc_norm": 0.5597765363128492, + "acc_norm_stderr": 0.01660256461504993 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.7352941176470589, + "acc_stderr": 0.02526169121972949, + "acc_norm": 0.7352941176470589, + "acc_norm_stderr": 0.02526169121972949 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.7588424437299035, + "acc_stderr": 0.024296594034763426, + "acc_norm": 0.7588424437299035, + "acc_norm_stderr": 0.024296594034763426 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.7962962962962963, + "acc_stderr": 0.022409674547304168, + "acc_norm": 0.7962962962962963, + "acc_norm_stderr": 0.022409674547304168 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.5531914893617021, + "acc_stderr": 0.02965823509766691, + "acc_norm": 0.5531914893617021, + "acc_norm_stderr": 0.02965823509766691 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.5586701434159062, + "acc_stderr": 0.012682016335646678, + "acc_norm": 0.5586701434159062, + "acc_norm_stderr": 0.012682016335646678 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.7352941176470589, + "acc_stderr": 0.026799562024887653, + "acc_norm": 0.7352941176470589, + "acc_norm_stderr": 
0.026799562024887653 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.7434640522875817, + "acc_stderr": 0.017667841612379005, + "acc_norm": 0.7434640522875817, + "acc_norm_stderr": 0.017667841612379005 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.7454545454545455, + "acc_stderr": 0.041723430387053825, + "acc_norm": 0.7454545454545455, + "acc_norm_stderr": 0.041723430387053825 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.7795918367346939, + "acc_stderr": 0.026537045312145298, + "acc_norm": 0.7795918367346939, + "acc_norm_stderr": 0.026537045312145298 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.8855721393034826, + "acc_stderr": 0.022509345325101706, + "acc_norm": 0.8855721393034826, + "acc_norm_stderr": 0.022509345325101706 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.88, + "acc_stderr": 0.03265986323710906, + "acc_norm": 0.88, + "acc_norm_stderr": 0.03265986323710906 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.5240963855421686, + "acc_stderr": 0.03887971849597264, + "acc_norm": 0.5240963855421686, + "acc_norm_stderr": 0.03887971849597264 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.8596491228070176, + "acc_stderr": 0.0266405825391332, + "acc_norm": 0.8596491228070176, + "acc_norm_stderr": 0.0266405825391332 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.4663402692778458, + "mc1_stderr": 0.01746379386716811, + "mc2": 0.6478807414957388, + "mc2_stderr": 0.014914964973799093 + }, + "all": { + "acc": 0.6931761249212157, + "acc_stderr": 0.031300161246260914, + "acc_norm": 0.6971025131327819, + "acc_norm_stderr": 0.03127024725201448, + "mc1": 0.4663402692778458, + "mc1_stderr": 0.01746379386716811, + "mc2": 0.6478807414957388, + "mc2_stderr": 0.014914964973799093 + } + }, + "versions": { + "harness|arc:challenge|25": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + 
"harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "all": 0 + }, + "config_tasks": { + "harness|arc:challenge": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + 
"harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task" + }, + "summary_tasks": { + "harness|arc:challenge|25": { + "hashes": { + "hash_examples": "17b0cae357c0259e", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "3722289b79076c44", + "hash_cont_tokens": "e8abf848493b50f7" + }, + "truncated": 0, + "non-truncated": 4687, + "padded": 4687, + "non-padded": 0, + "effective_few_shots": 25.0, + "num_truncated_few_shots": 0 + }, + "harness|hellaswag|10": { + "hashes": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "ececd684171f1ef2", + "hash_cont_tokens": "9fe0a5c42e1532db" + }, + "truncated": 0, + "non-truncated": 40168, + "padded": 40113, + "non-padded": 55, + "effective_few_shots": 10.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hashes": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "c54ff61ad0273dd7", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-anatomy|5": { + "hashes": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "be31a1e22aef5f90", + "hash_cont_tokens": "f11971a765cb609f" + }, + "truncated": 0, + "non-truncated": 540, + "padded": 540, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-astronomy|5": { + "hashes": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "277a7b1fad566940", + "hash_cont_tokens": "440a970fadecdc7b" + }, + "truncated": 0, + "non-truncated": 608, + "padded": 608, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-business_ethics|5": { + "hashes": { + "hash_examples": "33e51740670de686", + 
"hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "ba552605bc116de5", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hashes": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "428c7563d0b98ab9", + "hash_cont_tokens": "7ecd60c25b9bfe5b" + }, + "truncated": 0, + "non-truncated": 1060, + "padded": 1060, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_biology|5": { + "hashes": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "da036601573942e2", + "hash_cont_tokens": "875cde3af7a0ee14" + }, + "truncated": 0, + "non-truncated": 576, + "padded": 576, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_chemistry|5": { + "hashes": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "94e0196d6aded13d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_computer_science|5": { + "hashes": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "6e4d0f4a8d36690b", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_mathematics|5": { + "hashes": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "614054d17109a25d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_medicine|5": { + "hashes": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "081bb2b524defd1c", + "hash_cont_tokens": "702fb6d82ff0d6ac" + }, + "truncated": 0, + "non-truncated": 692, + "padded": 692, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_physics|5": { + "hashes": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "5421d9a1af86cbd4", + "hash_cont_tokens": "f7b8097afc16a47c" + }, + "truncated": 0, + "non-truncated": 408, + "padded": 408, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-computer_security|5": { + "hashes": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "5e6b70ecb333cf18", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hashes": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "c2ef11a87264ceed", + "hash_cont_tokens": "aa0e8bc655f2f641" + }, + "truncated": 
0, + "non-truncated": 940, + "padded": 940, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-econometrics|5": { + "hashes": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "ecaccd912a4c3978", + "hash_cont_tokens": "b1cc6e7e9fcd3827" + }, + "truncated": 0, + "non-truncated": 456, + "padded": 456, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hashes": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "1590c84291399be8", + "hash_cont_tokens": "2425a3f084a591ef" + }, + "truncated": 0, + "non-truncated": 580, + "padded": 580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hashes": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "3269597f715b0da1", + "hash_cont_tokens": "bd87bf0c060fd925" + }, + "truncated": 0, + "non-truncated": 1512, + "padded": 1512, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-formal_logic|5": { + "hashes": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "a2800d20f3ab8d7c", + "hash_cont_tokens": "eb8932890e0605db" + }, + "truncated": 0, + "non-truncated": 504, + "padded": 504, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-global_facts|5": { + "hashes": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "94ed44b3772505ad", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_biology|5": { + "hashes": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "24423acb928db768", + "hash_cont_tokens": "1ddcb86d28cde266" + }, + "truncated": 0, + "non-truncated": 1240, + "padded": 1240, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hashes": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "831ff35c474e5cef", + "hash_cont_tokens": "176c8dcff38c5f8f" + }, + "truncated": 0, + "non-truncated": 812, + "padded": 812, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hashes": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "a20a96b44dcc5b30", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hashes": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "5002f4ac8b1562ca", + "hash_cont_tokens": "674fc454bdc5ac93" + }, + "truncated": 0, + "non-truncated": 660, + "padded": 656, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + 
}, + "harness|hendrycksTest-high_school_geography|5": { + "hashes": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "7c5547c7da5bc793", + "hash_cont_tokens": "03a5012b916274ea" + }, + "truncated": 0, + "non-truncated": 792, + "padded": 792, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hashes": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "f62991cb6a496b05", + "hash_cont_tokens": "873d2aab226ba1d8" + }, + "truncated": 0, + "non-truncated": 772, + "padded": 772, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hashes": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "4cef2aff6e3d59ed", + "hash_cont_tokens": "c583432ad27fcfe0" + }, + "truncated": 0, + "non-truncated": 1560, + "padded": 1560, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hashes": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "6e2577ea4082ed2b", + "hash_cont_tokens": "d7907b61bcb8c123" + }, + "truncated": 0, + "non-truncated": 1080, + "padded": 1080, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hashes": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "c5fc9aeb1079c8e4", + "hash_cont_tokens": "f47f041de50333b9" + }, + "truncated": 0, + "non-truncated": 952, + "padded": 952, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_physics|5": { + "hashes": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "555fc385cffa84ca", + "hash_cont_tokens": "0d56317b3e5eedb5" + }, + "truncated": 0, + "non-truncated": 604, + "padded": 604, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hashes": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "febd23cbf9973b7f", + "hash_cont_tokens": "09ba1243e7390c0f" + }, + "truncated": 0, + "non-truncated": 2180, + "padded": 2180, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hashes": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "400e55b56ee6fbd7", + "hash_cont_tokens": "9cc29889c3d3f77d" + }, + "truncated": 0, + "non-truncated": 864, + "padded": 864, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hashes": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "c639cce12a46ebad", + "hash_cont_tokens": "cdd0b3dc06d933e5" + }, + "truncated": 0, + "non-truncated": 816, + "padded": 816, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hashes": { + 
"hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "b9762065cce6f3a6", + "hash_cont_tokens": "e02816433ff28daf" + }, + "truncated": 0, + "non-truncated": 948, + "padded": 948, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_aging|5": { + "hashes": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "541a75f071dcf579", + "hash_cont_tokens": "142a4a8a1138a214" + }, + "truncated": 0, + "non-truncated": 892, + "padded": 892, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_sexuality|5": { + "hashes": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "04269e5c5a257dd9", + "hash_cont_tokens": "bc54813e809b796d" + }, + "truncated": 0, + "non-truncated": 524, + "padded": 524, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-international_law|5": { + "hashes": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "d93ba9d9d38e4397", + "hash_cont_tokens": "8ea8c5ff76a15bca" + }, + "truncated": 0, + "non-truncated": 484, + "padded": 484, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-jurisprudence|5": { + "hashes": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "9eeaccd2698b4f5a", + "hash_cont_tokens": "e3a8cd951b6e3469" + }, + "truncated": 0, + "non-truncated": 432, + "padded": 432, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hashes": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "b4f08f544f2b7576", + "hash_cont_tokens": "3e9e0bdc248fd88a" + }, + "truncated": 0, + "non-truncated": 652, + "padded": 648, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-machine_learning|5": { + "hashes": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "900c2a51f1174b9f", + "hash_cont_tokens": "55b12fb138c6a064" + }, + "truncated": 0, + "non-truncated": 448, + "padded": 448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-management|5": { + "hashes": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "6b36efb4689c6eca", + "hash_cont_tokens": "a01d6d39a83c4597" + }, + "truncated": 0, + "non-truncated": 412, + "padded": 412, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-marketing|5": { + "hashes": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "2aaac78a0cfed47a", + "hash_cont_tokens": "6aeaed4d823c98aa" + }, + "truncated": 0, + "non-truncated": 936, + "padded": 936, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-medical_genetics|5": { + "hashes": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "886ca823b41c094a", + "hash_cont_tokens": "50421e30bef398f9" + }, + 
"truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-miscellaneous|5": { + "hashes": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "72fd71de7675e7d0", + "hash_cont_tokens": "9b0ab02a64603081" + }, + "truncated": 0, + "non-truncated": 3132, + "padded": 3132, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_disputes|5": { + "hashes": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "f3ca0dd8e7a1eb09", + "hash_cont_tokens": "3b8bbe9108e55ce9" + }, + "truncated": 0, + "non-truncated": 1384, + "padded": 1354, + "non-padded": 30, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hashes": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "3e793631e951f23c", + "hash_cont_tokens": "3e9bfc0362e97330" + }, + "truncated": 0, + "non-truncated": 3580, + "padded": 3580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-nutrition|5": { + "hashes": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "59753c2144ea93af", + "hash_cont_tokens": "23b2dc6ee2da4cfc" + }, + "truncated": 0, + "non-truncated": 1224, + "padded": 1224, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-philosophy|5": { + "hashes": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "bd8d3dbed15a8c34", + "hash_cont_tokens": "9f6ff69d23a48783" + }, + "truncated": 0, + "non-truncated": 1244, + "padded": 1244, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-prehistory|5": { + "hashes": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "3573cd87facbb7c5", + "hash_cont_tokens": "d6458d743d875837" + }, + "truncated": 0, + "non-truncated": 1296, + "padded": 1296, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_accounting|5": { + "hashes": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "17e721bc1a7cbb47", + "hash_cont_tokens": "922a195f53a35662" + }, + "truncated": 0, + "non-truncated": 1128, + "padded": 1128, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_law|5": { + "hashes": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "c9f7583fff66d361", + "hash_cont_tokens": "2e590029ef41fbcd" + }, + "truncated": 0, + "non-truncated": 6136, + "padded": 6136, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_medicine|5": { + "hashes": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "40a933f829116f8d", + "hash_cont_tokens": "7cfee54dbddd5a98" + }, + "truncated": 0, + "non-truncated": 1088, + "padded": 1088, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + 
"harness|hendrycksTest-professional_psychology|5": { + "hashes": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "0dfb73a8eb3f692c", + "hash_cont_tokens": "a86677b2a45c20e1" + }, + "truncated": 0, + "non-truncated": 2448, + "padded": 2448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-public_relations|5": { + "hashes": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "1710c6ba4c9f3cbd", + "hash_cont_tokens": "0d756ccaae031757" + }, + "truncated": 0, + "non-truncated": 440, + "padded": 440, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-security_studies|5": { + "hashes": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "32a03f1f22a6e103", + "hash_cont_tokens": "b2229bc2cfbf594b" + }, + "truncated": 0, + "non-truncated": 980, + "padded": 980, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-sociology|5": { + "hashes": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "828999f7624cbe7e", + "hash_cont_tokens": "c3a3bdfd177eed5b" + }, + "truncated": 0, + "non-truncated": 804, + "padded": 804, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hashes": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "42054621e718dbee", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-virology|5": { + "hashes": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "6c4f0aa4dc859c04", + "hash_cont_tokens": "af8b3658088cb37f" + }, + "truncated": 0, + "non-truncated": 664, + "padded": 664, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-world_religions|5": { + "hashes": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "6c75d44e092ff24f", + "hash_cont_tokens": "060118bef6de4e0a" + }, + "truncated": 0, + "non-truncated": 684, + "padded": 684, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|truthfulqa:mc|0": { + "hashes": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "2738d7ed7075faa7", + "hash_cont_tokens": "f5da56a132aab151" + }, + "truncated": 0, + "non-truncated": 9996, + "padded": 9996, + "non-padded": 0, + "effective_few_shots": 0.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "d84d18e9a963753d", + "hash_full_prompts": "12b540783521a8e6", + "hash_input_tokens": "5c73a7dce6ccf737", + "hash_cont_tokens": "71d56183130fecbd" + }, + "total_evaluation_time_secondes": "43892.29086637497", + "truncated": 0, + "non-truncated": 111019, + "padded": 110926, + "non-padded": 93, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/oh-yeontaek/llama-2-70B-LoRA-assemble-v2/results_2023-11-09T09-15-37.324583.json 
b/eval-results/oh-yeontaek/llama-2-70B-LoRA-assemble-v2/results_2023-11-09T09-15-37.324583.json new file mode 100644 index 0000000000000000000000000000000000000000..8633d04a7d4f24b632f362060f70cba2fced08e3 --- /dev/null +++ b/eval-results/oh-yeontaek/llama-2-70B-LoRA-assemble-v2/results_2023-11-09T09-15-37.324583.json @@ -0,0 +1,107 @@ +{ + "config_general": { + "lighteval_sha": "167773f1d5d1647c60dadc31c9e731ab7dbcbbad", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "", + "model_name": "oh-yeontaek/llama-2-70B-LoRA-assemble-v2", + "model_sha": "7feeb5b665ab1ecdfd9cc4fe45fadb86b7b91b5b", + "model_dtype": "torch.float16", + "model_size": "128.64 GB" + }, + "results": { + "harness|drop|3": { + "em": 0.16096895973154363, + "em_stderr": 0.0037635677120072403, + "f1": 0.3114240771812082, + "f1_stderr": 0.0037408737089822184 + }, + "harness|gsm8k|5": { + "acc": 0.1425322213798332, + "acc_stderr": 0.009629588445673814 + }, + "harness|winogrande|5": { + "acc": 0.8121546961325967, + "acc_stderr": 0.010977481103435093 + }, + "all": { + "em": 0.16096895973154363, + "em_stderr": 0.0037635677120072403, + "f1": 0.3114240771812082, + "f1_stderr": 0.0037408737089822184, + "acc": 0.477343458756215, + "acc_stderr": 0.010303534774554453 + } + }, + "versions": { + "all": 0, + "harness|drop|3": 1, + "harness|gsm8k|5": 0, + "harness|winogrande|5": 0 + }, + "config_tasks": { + "harness|drop": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|drop|3": { + "hashes": { + "hash_examples": "1d27416e8324e9a3", + "hash_full_prompts": "a5513ff9a741b385", + "hash_input_tokens": "42076f0efbb50aa6", + "hash_cont_tokens": "8f1e7343011a2aed" + }, + "truncated": 3, + "non_truncated": 9533, + "padded": 0, + "non_padded": 9536, + "effective_few_shots": 3.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "bda342e47b5099b2", + "hash_cont_tokens": "6517618ce1a695f7" + }, + "truncated": 0, + "non_truncated": 1319, + "padded": 0, + "non_padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "c0bedf98cb040854", + "hash_cont_tokens": "f08975ad6f2d5864" + }, + "truncated": 0, + "non_truncated": 1267, + "padded": 2432, + "non_padded": 102, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "9b4d8993161e637d", + "hash_full_prompts": "08215e527b7e60a5", + "hash_input_tokens": "a12f3e3c934bd78b", + "hash_cont_tokens": "5c7f7bdb97ae11ba" + }, + "truncated": 3, + "non_truncated": 12119, + "padded": 2432, + "non_padded": 10957, + "num_truncated_few_shots": 0, + "total_evaluation_time_secondes": 0 + } +} \ No newline at end of file diff --git a/eval-results/oh-yeontaek/llama-2-70B-LoRA-assemble/results_2023-09-14T11-41-03.022396.json b/eval-results/oh-yeontaek/llama-2-70B-LoRA-assemble/results_2023-09-14T11-41-03.022396.json new file mode 100644 index 0000000000000000000000000000000000000000..1f78df97930326cf29066ac57b8b92e13c2e706e --- /dev/null +++ b/eval-results/oh-yeontaek/llama-2-70B-LoRA-assemble/results_2023-09-14T11-41-03.022396.json @@ -0,0 +1,1367 @@ +{ + "config_general": { + "model_name": 
"oh-yeontaek/llama-2-70B-LoRA-assemble", + "model_sha": "91caffe08852dcbbdedd64786bd3b4ac0dcb2e96", + "model_size": "128.64 GB", + "model_dtype": "torch.float16", + "lighteval_sha": "ff467795ccc45b291b69333c263d5f16abd1fcd9", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|arc:challenge|25": { + "acc": 0.6851535836177475, + "acc_stderr": 0.01357265770308495, + "acc_norm": 0.7184300341296929, + "acc_norm_stderr": 0.013143376735009022 + }, + "harness|hellaswag|10": { + "acc": 0.6707827126070504, + "acc_stderr": 0.00468968597815517, + "acc_norm": 0.867755427205736, + "acc_norm_stderr": 0.0033806414709899157 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.44, + "acc_stderr": 0.04988876515698589, + "acc_norm": 0.44, + "acc_norm_stderr": 0.04988876515698589 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.6222222222222222, + "acc_stderr": 0.04188307537595852, + "acc_norm": 0.6222222222222222, + "acc_norm_stderr": 0.04188307537595852 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.7763157894736842, + "acc_stderr": 0.03391160934343603, + "acc_norm": 0.7763157894736842, + "acc_norm_stderr": 0.03391160934343603 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.73, + "acc_stderr": 0.04461960433384741, + "acc_norm": 0.73, + "acc_norm_stderr": 0.04461960433384741 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.7358490566037735, + "acc_stderr": 0.027134291628741702, + "acc_norm": 0.7358490566037735, + "acc_norm_stderr": 0.027134291628741702 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.8125, + "acc_stderr": 0.032639560491693344, + "acc_norm": 0.8125, + "acc_norm_stderr": 0.032639560491693344 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.5, + "acc_stderr": 0.050251890762960605, + "acc_norm": 0.5, + "acc_norm_stderr": 0.050251890762960605 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.62, + "acc_stderr": 0.04878317312145632, + "acc_norm": 0.62, + "acc_norm_stderr": 0.04878317312145632 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.39, + "acc_stderr": 0.04902071300001974, + "acc_norm": 0.39, + "acc_norm_stderr": 0.04902071300001974 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.6705202312138728, + "acc_stderr": 0.03583901754736412, + "acc_norm": 0.6705202312138728, + "acc_norm_stderr": 0.03583901754736412 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.3333333333333333, + "acc_stderr": 0.04690650298201943, + "acc_norm": 0.3333333333333333, + "acc_norm_stderr": 0.04690650298201943 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.73, + "acc_stderr": 0.04461960433384739, + "acc_norm": 0.73, + "acc_norm_stderr": 0.04461960433384739 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.6638297872340425, + "acc_stderr": 0.030881618520676942, + "acc_norm": 0.6638297872340425, + "acc_norm_stderr": 0.030881618520676942 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.42105263157894735, + "acc_stderr": 0.046446020912223177, + "acc_norm": 0.42105263157894735, + "acc_norm_stderr": 0.046446020912223177 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.6275862068965518, + "acc_stderr": 0.040287315329475576, + "acc_norm": 0.6275862068965518, + "acc_norm_stderr": 0.040287315329475576 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.4656084656084656, + "acc_stderr": 
0.025690321762493844, + "acc_norm": 0.4656084656084656, + "acc_norm_stderr": 0.025690321762493844 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.4523809523809524, + "acc_stderr": 0.044518079590553275, + "acc_norm": 0.4523809523809524, + "acc_norm_stderr": 0.044518079590553275 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.46, + "acc_stderr": 0.05009082659620332, + "acc_norm": 0.46, + "acc_norm_stderr": 0.05009082659620332 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.8290322580645161, + "acc_stderr": 0.021417242936321582, + "acc_norm": 0.8290322580645161, + "acc_norm_stderr": 0.021417242936321582 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.5320197044334976, + "acc_stderr": 0.035107665979592154, + "acc_norm": 0.5320197044334976, + "acc_norm_stderr": 0.035107665979592154 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.77, + "acc_stderr": 0.04229525846816506, + "acc_norm": 0.77, + "acc_norm_stderr": 0.04229525846816506 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.8242424242424242, + "acc_stderr": 0.02972094300622445, + "acc_norm": 0.8242424242424242, + "acc_norm_stderr": 0.02972094300622445 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.8939393939393939, + "acc_stderr": 0.021938047738853113, + "acc_norm": 0.8939393939393939, + "acc_norm_stderr": 0.021938047738853113 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.927461139896373, + "acc_stderr": 0.018718998520678178, + "acc_norm": 0.927461139896373, + "acc_norm_stderr": 0.018718998520678178 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.6948717948717948, + "acc_stderr": 0.023346335293325887, + "acc_norm": 0.6948717948717948, + "acc_norm_stderr": 0.023346335293325887 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.31851851851851853, + "acc_stderr": 0.02840653309060846, + "acc_norm": 0.31851851851851853, + "acc_norm_stderr": 0.02840653309060846 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.773109243697479, + "acc_stderr": 0.02720537153827947, + "acc_norm": 0.773109243697479, + "acc_norm_stderr": 0.02720537153827947 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.4900662251655629, + "acc_stderr": 0.04081677107248436, + "acc_norm": 0.4900662251655629, + "acc_norm_stderr": 0.04081677107248436 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.8862385321100917, + "acc_stderr": 0.013613614800232805, + "acc_norm": 0.8862385321100917, + "acc_norm_stderr": 0.013613614800232805 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.5740740740740741, + "acc_stderr": 0.033723432716530624, + "acc_norm": 0.5740740740740741, + "acc_norm_stderr": 0.033723432716530624 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.8970588235294118, + "acc_stderr": 0.021328337570804365, + "acc_norm": 0.8970588235294118, + "acc_norm_stderr": 0.021328337570804365 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.8734177215189873, + "acc_stderr": 0.021644195727955173, + "acc_norm": 0.8734177215189873, + "acc_norm_stderr": 0.021644195727955173 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.7757847533632287, + "acc_stderr": 0.02799153425851952, + "acc_norm": 0.7757847533632287, + "acc_norm_stderr": 0.02799153425851952 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.8396946564885496, + "acc_stderr": 
0.03217829420744633, + "acc_norm": 0.8396946564885496, + "acc_norm_stderr": 0.03217829420744633 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.8512396694214877, + "acc_stderr": 0.03248470083807194, + "acc_norm": 0.8512396694214877, + "acc_norm_stderr": 0.03248470083807194 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.8240740740740741, + "acc_stderr": 0.036809181416738807, + "acc_norm": 0.8240740740740741, + "acc_norm_stderr": 0.036809181416738807 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.8220858895705522, + "acc_stderr": 0.03004735765580663, + "acc_norm": 0.8220858895705522, + "acc_norm_stderr": 0.03004735765580663 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.4732142857142857, + "acc_stderr": 0.047389751192741546, + "acc_norm": 0.4732142857142857, + "acc_norm_stderr": 0.047389751192741546 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.8252427184466019, + "acc_stderr": 0.03760178006026621, + "acc_norm": 0.8252427184466019, + "acc_norm_stderr": 0.03760178006026621 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.8888888888888888, + "acc_stderr": 0.020588491316092375, + "acc_norm": 0.8888888888888888, + "acc_norm_stderr": 0.020588491316092375 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.7, + "acc_stderr": 0.046056618647183814, + "acc_norm": 0.7, + "acc_norm_stderr": 0.046056618647183814 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.8633461047254151, + "acc_stderr": 0.012282876868629234, + "acc_norm": 0.8633461047254151, + "acc_norm_stderr": 0.012282876868629234 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.7716763005780347, + "acc_stderr": 0.022598703804321635, + "acc_norm": 0.7716763005780347, + "acc_norm_stderr": 0.022598703804321635 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.5743016759776536, + "acc_stderr": 0.01653682964899712, + "acc_norm": 0.5743016759776536, + "acc_norm_stderr": 0.01653682964899712 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.738562091503268, + "acc_stderr": 0.025160998214292456, + "acc_norm": 0.738562091503268, + "acc_norm_stderr": 0.025160998214292456 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.7556270096463023, + "acc_stderr": 0.024406162094668907, + "acc_norm": 0.7556270096463023, + "acc_norm_stderr": 0.024406162094668907 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.7993827160493827, + "acc_stderr": 0.02228231394977488, + "acc_norm": 0.7993827160493827, + "acc_norm_stderr": 0.02228231394977488 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.5567375886524822, + "acc_stderr": 0.029634838473766006, + "acc_norm": 0.5567375886524822, + "acc_norm_stderr": 0.029634838473766006 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.5645371577574967, + "acc_stderr": 0.012663412101248345, + "acc_norm": 0.5645371577574967, + "acc_norm_stderr": 0.012663412101248345 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.7426470588235294, + "acc_stderr": 0.0265565194700415, + "acc_norm": 0.7426470588235294, + "acc_norm_stderr": 0.0265565194700415 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.7418300653594772, + "acc_stderr": 0.017704531653250078, + "acc_norm": 0.7418300653594772, + "acc_norm_stderr": 0.017704531653250078 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.7545454545454545, + "acc_stderr": 0.041220665028782855, + "acc_norm": 0.7545454545454545, + "acc_norm_stderr": 
0.041220665028782855 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.7877551020408163, + "acc_stderr": 0.026176967197866764, + "acc_norm": 0.7877551020408163, + "acc_norm_stderr": 0.026176967197866764 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.8805970149253731, + "acc_stderr": 0.02292879327721974, + "acc_norm": 0.8805970149253731, + "acc_norm_stderr": 0.02292879327721974 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.9, + "acc_stderr": 0.030151134457776334, + "acc_norm": 0.9, + "acc_norm_stderr": 0.030151134457776334 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.5240963855421686, + "acc_stderr": 0.03887971849597264, + "acc_norm": 0.5240963855421686, + "acc_norm_stderr": 0.03887971849597264 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.8654970760233918, + "acc_stderr": 0.026168221344662297, + "acc_norm": 0.8654970760233918, + "acc_norm_stderr": 0.026168221344662297 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.46511627906976744, + "mc1_stderr": 0.01746084997587397, + "mc2": 0.6479539766332348, + "mc2_stderr": 0.014916593992436448 + }, + "all": { + "acc": 0.6934330265245879, + "acc_stderr": 0.031312838620430335, + "acc_norm": 0.697335554746802, + "acc_norm_stderr": 0.03128337547678218, + "mc1": 0.46511627906976744, + "mc1_stderr": 0.01746084997587397, + "mc2": 0.6479539766332348, + "mc2_stderr": 0.014916593992436448 + } + }, + "versions": { + "harness|arc:challenge|25": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + 
"harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "all": 0 + }, + "config_tasks": { + "harness|arc:challenge": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + 
"harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task" + }, + "summary_tasks": { + "harness|arc:challenge|25": { + "hashes": { + "hash_examples": "17b0cae357c0259e", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "3722289b79076c44", + "hash_cont_tokens": "e8abf848493b50f7" + }, + "truncated": 0, + "non-truncated": 4687, + "padded": 4687, + "non-padded": 0, + "effective_few_shots": 25.0, + "num_truncated_few_shots": 0 + }, + "harness|hellaswag|10": { + "hashes": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "ececd684171f1ef2", + "hash_cont_tokens": "9fe0a5c42e1532db" + }, + "truncated": 0, + "non-truncated": 40168, + "padded": 40113, + "non-padded": 55, + "effective_few_shots": 10.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hashes": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "c54ff61ad0273dd7", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-anatomy|5": { + "hashes": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "be31a1e22aef5f90", + "hash_cont_tokens": "f11971a765cb609f" + }, + "truncated": 0, + "non-truncated": 540, + "padded": 540, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-astronomy|5": { + "hashes": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "277a7b1fad566940", + "hash_cont_tokens": "440a970fadecdc7b" + }, + "truncated": 0, + "non-truncated": 608, + "padded": 608, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-business_ethics|5": { + "hashes": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "ba552605bc116de5", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hashes": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + 
"hash_input_tokens": "428c7563d0b98ab9", + "hash_cont_tokens": "7ecd60c25b9bfe5b" + }, + "truncated": 0, + "non-truncated": 1060, + "padded": 1060, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_biology|5": { + "hashes": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "da036601573942e2", + "hash_cont_tokens": "875cde3af7a0ee14" + }, + "truncated": 0, + "non-truncated": 576, + "padded": 576, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_chemistry|5": { + "hashes": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "94e0196d6aded13d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_computer_science|5": { + "hashes": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "6e4d0f4a8d36690b", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_mathematics|5": { + "hashes": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "614054d17109a25d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_medicine|5": { + "hashes": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "081bb2b524defd1c", + "hash_cont_tokens": "702fb6d82ff0d6ac" + }, + "truncated": 0, + "non-truncated": 692, + "padded": 692, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_physics|5": { + "hashes": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "5421d9a1af86cbd4", + "hash_cont_tokens": "f7b8097afc16a47c" + }, + "truncated": 0, + "non-truncated": 408, + "padded": 408, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-computer_security|5": { + "hashes": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "5e6b70ecb333cf18", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hashes": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "c2ef11a87264ceed", + "hash_cont_tokens": "aa0e8bc655f2f641" + }, + "truncated": 0, + "non-truncated": 940, + "padded": 940, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-econometrics|5": { + "hashes": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "ecaccd912a4c3978", + "hash_cont_tokens": "b1cc6e7e9fcd3827" + }, + "truncated": 0, + "non-truncated": 456, + "padded": 456, + 
"non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hashes": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "1590c84291399be8", + "hash_cont_tokens": "2425a3f084a591ef" + }, + "truncated": 0, + "non-truncated": 580, + "padded": 580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hashes": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "3269597f715b0da1", + "hash_cont_tokens": "bd87bf0c060fd925" + }, + "truncated": 0, + "non-truncated": 1512, + "padded": 1512, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-formal_logic|5": { + "hashes": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "a2800d20f3ab8d7c", + "hash_cont_tokens": "eb8932890e0605db" + }, + "truncated": 0, + "non-truncated": 504, + "padded": 504, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-global_facts|5": { + "hashes": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "94ed44b3772505ad", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_biology|5": { + "hashes": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "24423acb928db768", + "hash_cont_tokens": "1ddcb86d28cde266" + }, + "truncated": 0, + "non-truncated": 1240, + "padded": 1240, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hashes": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "831ff35c474e5cef", + "hash_cont_tokens": "176c8dcff38c5f8f" + }, + "truncated": 0, + "non-truncated": 812, + "padded": 812, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hashes": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "a20a96b44dcc5b30", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hashes": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "5002f4ac8b1562ca", + "hash_cont_tokens": "674fc454bdc5ac93" + }, + "truncated": 0, + "non-truncated": 660, + "padded": 656, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_geography|5": { + "hashes": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "7c5547c7da5bc793", + "hash_cont_tokens": "03a5012b916274ea" + }, + "truncated": 0, + "non-truncated": 792, + "padded": 792, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + 
"harness|hendrycksTest-high_school_government_and_politics|5": { + "hashes": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "f62991cb6a496b05", + "hash_cont_tokens": "873d2aab226ba1d8" + }, + "truncated": 0, + "non-truncated": 772, + "padded": 772, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hashes": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "4cef2aff6e3d59ed", + "hash_cont_tokens": "c583432ad27fcfe0" + }, + "truncated": 0, + "non-truncated": 1560, + "padded": 1560, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hashes": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "6e2577ea4082ed2b", + "hash_cont_tokens": "d7907b61bcb8c123" + }, + "truncated": 0, + "non-truncated": 1080, + "padded": 1080, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hashes": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "c5fc9aeb1079c8e4", + "hash_cont_tokens": "f47f041de50333b9" + }, + "truncated": 0, + "non-truncated": 952, + "padded": 952, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_physics|5": { + "hashes": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "555fc385cffa84ca", + "hash_cont_tokens": "0d56317b3e5eedb5" + }, + "truncated": 0, + "non-truncated": 604, + "padded": 604, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hashes": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "febd23cbf9973b7f", + "hash_cont_tokens": "09ba1243e7390c0f" + }, + "truncated": 0, + "non-truncated": 2180, + "padded": 2180, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hashes": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "400e55b56ee6fbd7", + "hash_cont_tokens": "9cc29889c3d3f77d" + }, + "truncated": 0, + "non-truncated": 864, + "padded": 864, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hashes": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "c639cce12a46ebad", + "hash_cont_tokens": "cdd0b3dc06d933e5" + }, + "truncated": 0, + "non-truncated": 816, + "padded": 816, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hashes": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "b9762065cce6f3a6", + "hash_cont_tokens": "e02816433ff28daf" + }, + "truncated": 0, + "non-truncated": 948, + "padded": 948, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_aging|5": { + "hashes": { + "hash_examples": 
"1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "541a75f071dcf579", + "hash_cont_tokens": "142a4a8a1138a214" + }, + "truncated": 0, + "non-truncated": 892, + "padded": 892, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_sexuality|5": { + "hashes": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "04269e5c5a257dd9", + "hash_cont_tokens": "bc54813e809b796d" + }, + "truncated": 0, + "non-truncated": 524, + "padded": 524, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-international_law|5": { + "hashes": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "d93ba9d9d38e4397", + "hash_cont_tokens": "8ea8c5ff76a15bca" + }, + "truncated": 0, + "non-truncated": 484, + "padded": 484, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-jurisprudence|5": { + "hashes": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "9eeaccd2698b4f5a", + "hash_cont_tokens": "e3a8cd951b6e3469" + }, + "truncated": 0, + "non-truncated": 432, + "padded": 432, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hashes": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "b4f08f544f2b7576", + "hash_cont_tokens": "3e9e0bdc248fd88a" + }, + "truncated": 0, + "non-truncated": 652, + "padded": 648, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-machine_learning|5": { + "hashes": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "900c2a51f1174b9f", + "hash_cont_tokens": "55b12fb138c6a064" + }, + "truncated": 0, + "non-truncated": 448, + "padded": 448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-management|5": { + "hashes": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "6b36efb4689c6eca", + "hash_cont_tokens": "a01d6d39a83c4597" + }, + "truncated": 0, + "non-truncated": 412, + "padded": 412, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-marketing|5": { + "hashes": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "2aaac78a0cfed47a", + "hash_cont_tokens": "6aeaed4d823c98aa" + }, + "truncated": 0, + "non-truncated": 936, + "padded": 936, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-medical_genetics|5": { + "hashes": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "886ca823b41c094a", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-miscellaneous|5": { + "hashes": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "72fd71de7675e7d0", + "hash_cont_tokens": "9b0ab02a64603081" + }, + "truncated": 0, + 
"non-truncated": 3132, + "padded": 3132, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_disputes|5": { + "hashes": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "f3ca0dd8e7a1eb09", + "hash_cont_tokens": "3b8bbe9108e55ce9" + }, + "truncated": 0, + "non-truncated": 1384, + "padded": 1354, + "non-padded": 30, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hashes": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "3e793631e951f23c", + "hash_cont_tokens": "3e9bfc0362e97330" + }, + "truncated": 0, + "non-truncated": 3580, + "padded": 3580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-nutrition|5": { + "hashes": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "59753c2144ea93af", + "hash_cont_tokens": "23b2dc6ee2da4cfc" + }, + "truncated": 0, + "non-truncated": 1224, + "padded": 1224, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-philosophy|5": { + "hashes": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "bd8d3dbed15a8c34", + "hash_cont_tokens": "9f6ff69d23a48783" + }, + "truncated": 0, + "non-truncated": 1244, + "padded": 1244, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-prehistory|5": { + "hashes": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "3573cd87facbb7c5", + "hash_cont_tokens": "d6458d743d875837" + }, + "truncated": 0, + "non-truncated": 1296, + "padded": 1296, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_accounting|5": { + "hashes": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "17e721bc1a7cbb47", + "hash_cont_tokens": "922a195f53a35662" + }, + "truncated": 0, + "non-truncated": 1128, + "padded": 1128, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_law|5": { + "hashes": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "c9f7583fff66d361", + "hash_cont_tokens": "2e590029ef41fbcd" + }, + "truncated": 0, + "non-truncated": 6136, + "padded": 6136, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_medicine|5": { + "hashes": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "40a933f829116f8d", + "hash_cont_tokens": "7cfee54dbddd5a98" + }, + "truncated": 0, + "non-truncated": 1088, + "padded": 1088, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_psychology|5": { + "hashes": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "0dfb73a8eb3f692c", + "hash_cont_tokens": "a86677b2a45c20e1" + }, + "truncated": 0, + "non-truncated": 2448, + "padded": 2448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + 
"harness|hendrycksTest-public_relations|5": { + "hashes": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "1710c6ba4c9f3cbd", + "hash_cont_tokens": "0d756ccaae031757" + }, + "truncated": 0, + "non-truncated": 440, + "padded": 440, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-security_studies|5": { + "hashes": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "32a03f1f22a6e103", + "hash_cont_tokens": "b2229bc2cfbf594b" + }, + "truncated": 0, + "non-truncated": 980, + "padded": 980, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-sociology|5": { + "hashes": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "828999f7624cbe7e", + "hash_cont_tokens": "c3a3bdfd177eed5b" + }, + "truncated": 0, + "non-truncated": 804, + "padded": 804, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hashes": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "42054621e718dbee", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-virology|5": { + "hashes": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "6c4f0aa4dc859c04", + "hash_cont_tokens": "af8b3658088cb37f" + }, + "truncated": 0, + "non-truncated": 664, + "padded": 664, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-world_religions|5": { + "hashes": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "6c75d44e092ff24f", + "hash_cont_tokens": "060118bef6de4e0a" + }, + "truncated": 0, + "non-truncated": 684, + "padded": 684, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|truthfulqa:mc|0": { + "hashes": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "2738d7ed7075faa7", + "hash_cont_tokens": "f5da56a132aab151" + }, + "truncated": 0, + "non-truncated": 9996, + "padded": 9996, + "non-padded": 0, + "effective_few_shots": 0.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "d84d18e9a963753d", + "hash_full_prompts": "12b540783521a8e6", + "hash_input_tokens": "5c73a7dce6ccf737", + "hash_cont_tokens": "71d56183130fecbd" + }, + "total_evaluation_time_secondes": "43836.30269193649", + "truncated": 0, + "non-truncated": 111019, + "padded": 110926, + "non-padded": 93, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/oh-yeontaek/llama-2-7B-LoRA-assemble/results_2023-09-13T17-57-16.083940.json b/eval-results/oh-yeontaek/llama-2-7B-LoRA-assemble/results_2023-09-13T17-57-16.083940.json new file mode 100644 index 0000000000000000000000000000000000000000..b06bd20aa98b31054ec03c7c12fdc43373596ae9 --- /dev/null +++ b/eval-results/oh-yeontaek/llama-2-7B-LoRA-assemble/results_2023-09-13T17-57-16.083940.json @@ -0,0 +1,1367 @@ +{ + "config_general": { + "model_name": "oh-yeontaek/llama-2-7B-LoRA-assemble", + 
"model_sha": "72e866a96a2e9afc6527c8d757c69088c3a069c8", + "model_size": "12.61 GB", + "model_dtype": "torch.float16", + "lighteval_sha": "ff467795ccc45b291b69333c263d5f16abd1fcd9", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|arc:challenge|25": { + "acc": 0.5477815699658704, + "acc_stderr": 0.014544519880633832, + "acc_norm": 0.5733788395904437, + "acc_norm_stderr": 0.014453185592920295 + }, + "harness|hellaswag|10": { + "acc": 0.5986855208125871, + "acc_stderr": 0.004891626718097023, + "acc_norm": 0.7880900219079865, + "acc_norm_stderr": 0.00407826210759555 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.33, + "acc_stderr": 0.047258156262526045, + "acc_norm": 0.33, + "acc_norm_stderr": 0.047258156262526045 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.45925925925925926, + "acc_stderr": 0.04304979692464243, + "acc_norm": 0.45925925925925926, + "acc_norm_stderr": 0.04304979692464243 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.46710526315789475, + "acc_stderr": 0.04060127035236395, + "acc_norm": 0.46710526315789475, + "acc_norm_stderr": 0.04060127035236395 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.52, + "acc_stderr": 0.050211673156867795, + "acc_norm": 0.52, + "acc_norm_stderr": 0.050211673156867795 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.5471698113207547, + "acc_stderr": 0.03063562795796182, + "acc_norm": 0.5471698113207547, + "acc_norm_stderr": 0.03063562795796182 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.5625, + "acc_stderr": 0.04148415739394154, + "acc_norm": 0.5625, + "acc_norm_stderr": 0.04148415739394154 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.36, + "acc_stderr": 0.04824181513244218, + "acc_norm": 0.36, + "acc_norm_stderr": 0.04824181513244218 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.41, + "acc_stderr": 0.049431107042371025, + "acc_norm": 0.41, + "acc_norm_stderr": 0.049431107042371025 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.32, + "acc_stderr": 0.04688261722621504, + "acc_norm": 0.32, + "acc_norm_stderr": 0.04688261722621504 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.4046242774566474, + "acc_stderr": 0.03742461193887249, + "acc_norm": 0.4046242774566474, + "acc_norm_stderr": 0.03742461193887249 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.24509803921568626, + "acc_stderr": 0.042801058373643966, + "acc_norm": 0.24509803921568626, + "acc_norm_stderr": 0.042801058373643966 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.65, + "acc_stderr": 0.0479372485441102, + "acc_norm": 0.65, + "acc_norm_stderr": 0.0479372485441102 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.44680851063829785, + "acc_stderr": 0.032500536843658404, + "acc_norm": 0.44680851063829785, + "acc_norm_stderr": 0.032500536843658404 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.2807017543859649, + "acc_stderr": 0.042270544512322004, + "acc_norm": 0.2807017543859649, + "acc_norm_stderr": 0.042270544512322004 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.496551724137931, + "acc_stderr": 0.04166567577101579, + "acc_norm": 0.496551724137931, + "acc_norm_stderr": 0.04166567577101579 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.291005291005291, + "acc_stderr": 0.02339382650048487, + "acc_norm": 
0.291005291005291, + "acc_norm_stderr": 0.02339382650048487 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.2777777777777778, + "acc_stderr": 0.04006168083848878, + "acc_norm": 0.2777777777777778, + "acc_norm_stderr": 0.04006168083848878 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.36, + "acc_stderr": 0.048241815132442176, + "acc_norm": 0.36, + "acc_norm_stderr": 0.048241815132442176 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.5645161290322581, + "acc_stderr": 0.028206225591502744, + "acc_norm": 0.5645161290322581, + "acc_norm_stderr": 0.028206225591502744 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.33497536945812806, + "acc_stderr": 0.033208527423483104, + "acc_norm": 0.33497536945812806, + "acc_norm_stderr": 0.033208527423483104 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.43, + "acc_stderr": 0.049756985195624284, + "acc_norm": 0.43, + "acc_norm_stderr": 0.049756985195624284 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.703030303030303, + "acc_stderr": 0.03567969772268049, + "acc_norm": 0.703030303030303, + "acc_norm_stderr": 0.03567969772268049 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.6515151515151515, + "acc_stderr": 0.03394853965156402, + "acc_norm": 0.6515151515151515, + "acc_norm_stderr": 0.03394853965156402 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.7564766839378239, + "acc_stderr": 0.030975436386845443, + "acc_norm": 0.7564766839378239, + "acc_norm_stderr": 0.030975436386845443 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.49743589743589745, + "acc_stderr": 0.025350672979412202, + "acc_norm": 0.49743589743589745, + "acc_norm_stderr": 0.025350672979412202 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.24074074074074073, + "acc_stderr": 0.026067159222275805, + "acc_norm": 0.24074074074074073, + "acc_norm_stderr": 0.026067159222275805 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.4957983193277311, + "acc_stderr": 0.03247734334448111, + "acc_norm": 0.4957983193277311, + "acc_norm_stderr": 0.03247734334448111 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.31125827814569534, + "acc_stderr": 0.03780445850526733, + "acc_norm": 0.31125827814569534, + "acc_norm_stderr": 0.03780445850526733 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.7137614678899082, + "acc_stderr": 0.019379436628919982, + "acc_norm": 0.7137614678899082, + "acc_norm_stderr": 0.019379436628919982 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.39814814814814814, + "acc_stderr": 0.033384734032074016, + "acc_norm": 0.39814814814814814, + "acc_norm_stderr": 0.033384734032074016 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.6862745098039216, + "acc_stderr": 0.032566854844603886, + "acc_norm": 0.6862745098039216, + "acc_norm_stderr": 0.032566854844603886 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.7341772151898734, + "acc_stderr": 0.02875679962965834, + "acc_norm": 0.7341772151898734, + "acc_norm_stderr": 0.02875679962965834 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.5695067264573991, + "acc_stderr": 0.0332319730294294, + "acc_norm": 0.5695067264573991, + "acc_norm_stderr": 0.0332319730294294 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.5801526717557252, + "acc_stderr": 0.04328577215262972, + 
"acc_norm": 0.5801526717557252, + "acc_norm_stderr": 0.04328577215262972 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.6528925619834711, + "acc_stderr": 0.043457245702925335, + "acc_norm": 0.6528925619834711, + "acc_norm_stderr": 0.043457245702925335 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.6481481481481481, + "acc_stderr": 0.04616631111801714, + "acc_norm": 0.6481481481481481, + "acc_norm_stderr": 0.04616631111801714 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.5460122699386503, + "acc_stderr": 0.0391170190467718, + "acc_norm": 0.5460122699386503, + "acc_norm_stderr": 0.0391170190467718 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.3482142857142857, + "acc_stderr": 0.045218299028335865, + "acc_norm": 0.3482142857142857, + "acc_norm_stderr": 0.045218299028335865 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.6990291262135923, + "acc_stderr": 0.04541609446503948, + "acc_norm": 0.6990291262135923, + "acc_norm_stderr": 0.04541609446503948 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.7478632478632479, + "acc_stderr": 0.02844796547623102, + "acc_norm": 0.7478632478632479, + "acc_norm_stderr": 0.02844796547623102 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.65, + "acc_stderr": 0.0479372485441102, + "acc_norm": 0.65, + "acc_norm_stderr": 0.0479372485441102 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.7113665389527458, + "acc_stderr": 0.016203792703197783, + "acc_norm": 0.7113665389527458, + "acc_norm_stderr": 0.016203792703197783 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.5895953757225434, + "acc_stderr": 0.026483392042098177, + "acc_norm": 0.5895953757225434, + "acc_norm_stderr": 0.026483392042098177 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.23128491620111732, + "acc_stderr": 0.014102223623152579, + "acc_norm": 0.23128491620111732, + "acc_norm_stderr": 0.014102223623152579 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.5228758169934641, + "acc_stderr": 0.028599936776089782, + "acc_norm": 0.5228758169934641, + "acc_norm_stderr": 0.028599936776089782 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.5755627009646302, + "acc_stderr": 0.028071928247946205, + "acc_norm": 0.5755627009646302, + "acc_norm_stderr": 0.028071928247946205 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.5524691358024691, + "acc_stderr": 0.02766713856942271, + "acc_norm": 0.5524691358024691, + "acc_norm_stderr": 0.02766713856942271 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.375886524822695, + "acc_stderr": 0.028893955412115882, + "acc_norm": 0.375886524822695, + "acc_norm_stderr": 0.028893955412115882 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.3820078226857888, + "acc_stderr": 0.01240956447023556, + "acc_norm": 0.3820078226857888, + "acc_norm_stderr": 0.01240956447023556 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.47794117647058826, + "acc_stderr": 0.030343264224213528, + "acc_norm": 0.47794117647058826, + "acc_norm_stderr": 0.030343264224213528 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.5, + "acc_stderr": 0.020227834851568375, + "acc_norm": 0.5, + "acc_norm_stderr": 0.020227834851568375 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.5909090909090909, + "acc_stderr": 0.04709306978661896, + "acc_norm": 0.5909090909090909, + "acc_norm_stderr": 0.04709306978661896 + }, + 
"harness|hendrycksTest-security_studies|5": { + "acc": 0.6122448979591837, + "acc_stderr": 0.03119223072679566, + "acc_norm": 0.6122448979591837, + "acc_norm_stderr": 0.03119223072679566 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.6218905472636815, + "acc_stderr": 0.034288678487786564, + "acc_norm": 0.6218905472636815, + "acc_norm_stderr": 0.034288678487786564 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.71, + "acc_stderr": 0.045604802157206845, + "acc_norm": 0.71, + "acc_norm_stderr": 0.045604802157206845 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.39156626506024095, + "acc_stderr": 0.037998574544796375, + "acc_norm": 0.39156626506024095, + "acc_norm_stderr": 0.037998574544796375 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.695906432748538, + "acc_stderr": 0.03528211258245229, + "acc_norm": 0.695906432748538, + "acc_norm_stderr": 0.03528211258245229 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.379436964504284, + "mc1_stderr": 0.01698703926614298, + "mc2": 0.5317577400569955, + "mc2_stderr": 0.01563126356990073 + }, + "all": { + "acc": 0.5097712422613702, + "acc_stderr": 0.0348785197530624, + "acc_norm": 0.5134153400701832, + "acc_norm_stderr": 0.03486318587343164, + "mc1": 0.379436964504284, + "mc1_stderr": 0.01698703926614298, + "mc2": 0.5317577400569955, + "mc2_stderr": 0.01563126356990073 + } + }, + "versions": { + "harness|arc:challenge|25": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + 
"harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "all": 0 + }, + "config_tasks": { + "harness|arc:challenge": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + 
"harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task" + }, + "summary_tasks": { + "harness|arc:challenge|25": { + "hashes": { + "hash_examples": "17b0cae357c0259e", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "3722289b79076c44", + "hash_cont_tokens": "e8abf848493b50f7" + }, + "truncated": 0, + "non-truncated": 4687, + "padded": 4687, + "non-padded": 0, + "effective_few_shots": 25.0, + "num_truncated_few_shots": 0 + }, + "harness|hellaswag|10": { + "hashes": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "ececd684171f1ef2", + "hash_cont_tokens": "9fe0a5c42e1532db" + }, + "truncated": 0, + "non-truncated": 40168, + "padded": 40113, + "non-padded": 55, + "effective_few_shots": 10.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hashes": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "c54ff61ad0273dd7", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-anatomy|5": { + "hashes": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "be31a1e22aef5f90", + "hash_cont_tokens": "f11971a765cb609f" + }, + "truncated": 0, + "non-truncated": 540, + "padded": 540, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-astronomy|5": { + "hashes": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "277a7b1fad566940", + "hash_cont_tokens": "440a970fadecdc7b" + }, + "truncated": 0, + "non-truncated": 608, + "padded": 608, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-business_ethics|5": { + "hashes": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "ba552605bc116de5", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hashes": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + 
"hash_input_tokens": "428c7563d0b98ab9", + "hash_cont_tokens": "7ecd60c25b9bfe5b" + }, + "truncated": 0, + "non-truncated": 1060, + "padded": 1060, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_biology|5": { + "hashes": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "da036601573942e2", + "hash_cont_tokens": "875cde3af7a0ee14" + }, + "truncated": 0, + "non-truncated": 576, + "padded": 576, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_chemistry|5": { + "hashes": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "94e0196d6aded13d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_computer_science|5": { + "hashes": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "6e4d0f4a8d36690b", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_mathematics|5": { + "hashes": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "614054d17109a25d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_medicine|5": { + "hashes": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "081bb2b524defd1c", + "hash_cont_tokens": "702fb6d82ff0d6ac" + }, + "truncated": 0, + "non-truncated": 692, + "padded": 692, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_physics|5": { + "hashes": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "5421d9a1af86cbd4", + "hash_cont_tokens": "f7b8097afc16a47c" + }, + "truncated": 0, + "non-truncated": 408, + "padded": 408, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-computer_security|5": { + "hashes": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "5e6b70ecb333cf18", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hashes": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "c2ef11a87264ceed", + "hash_cont_tokens": "aa0e8bc655f2f641" + }, + "truncated": 0, + "non-truncated": 940, + "padded": 940, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-econometrics|5": { + "hashes": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "ecaccd912a4c3978", + "hash_cont_tokens": "b1cc6e7e9fcd3827" + }, + "truncated": 0, + "non-truncated": 456, + "padded": 456, + 
"non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hashes": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "1590c84291399be8", + "hash_cont_tokens": "2425a3f084a591ef" + }, + "truncated": 0, + "non-truncated": 580, + "padded": 580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hashes": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "3269597f715b0da1", + "hash_cont_tokens": "bd87bf0c060fd925" + }, + "truncated": 0, + "non-truncated": 1512, + "padded": 1512, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-formal_logic|5": { + "hashes": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "a2800d20f3ab8d7c", + "hash_cont_tokens": "eb8932890e0605db" + }, + "truncated": 0, + "non-truncated": 504, + "padded": 504, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-global_facts|5": { + "hashes": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "94ed44b3772505ad", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_biology|5": { + "hashes": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "24423acb928db768", + "hash_cont_tokens": "1ddcb86d28cde266" + }, + "truncated": 0, + "non-truncated": 1240, + "padded": 1240, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hashes": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "831ff35c474e5cef", + "hash_cont_tokens": "176c8dcff38c5f8f" + }, + "truncated": 0, + "non-truncated": 812, + "padded": 812, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hashes": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "a20a96b44dcc5b30", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hashes": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "5002f4ac8b1562ca", + "hash_cont_tokens": "674fc454bdc5ac93" + }, + "truncated": 0, + "non-truncated": 660, + "padded": 656, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_geography|5": { + "hashes": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "7c5547c7da5bc793", + "hash_cont_tokens": "03a5012b916274ea" + }, + "truncated": 0, + "non-truncated": 792, + "padded": 792, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + 
"harness|hendrycksTest-high_school_government_and_politics|5": { + "hashes": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "f62991cb6a496b05", + "hash_cont_tokens": "873d2aab226ba1d8" + }, + "truncated": 0, + "non-truncated": 772, + "padded": 772, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hashes": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "4cef2aff6e3d59ed", + "hash_cont_tokens": "c583432ad27fcfe0" + }, + "truncated": 0, + "non-truncated": 1560, + "padded": 1560, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hashes": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "6e2577ea4082ed2b", + "hash_cont_tokens": "d7907b61bcb8c123" + }, + "truncated": 0, + "non-truncated": 1080, + "padded": 1080, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hashes": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "c5fc9aeb1079c8e4", + "hash_cont_tokens": "f47f041de50333b9" + }, + "truncated": 0, + "non-truncated": 952, + "padded": 952, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_physics|5": { + "hashes": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "555fc385cffa84ca", + "hash_cont_tokens": "0d56317b3e5eedb5" + }, + "truncated": 0, + "non-truncated": 604, + "padded": 604, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hashes": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "febd23cbf9973b7f", + "hash_cont_tokens": "09ba1243e7390c0f" + }, + "truncated": 0, + "non-truncated": 2180, + "padded": 2180, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hashes": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "400e55b56ee6fbd7", + "hash_cont_tokens": "9cc29889c3d3f77d" + }, + "truncated": 0, + "non-truncated": 864, + "padded": 864, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hashes": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "c639cce12a46ebad", + "hash_cont_tokens": "cdd0b3dc06d933e5" + }, + "truncated": 0, + "non-truncated": 816, + "padded": 816, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hashes": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "b9762065cce6f3a6", + "hash_cont_tokens": "e02816433ff28daf" + }, + "truncated": 0, + "non-truncated": 948, + "padded": 948, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_aging|5": { + "hashes": { + "hash_examples": 
"1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "541a75f071dcf579", + "hash_cont_tokens": "142a4a8a1138a214" + }, + "truncated": 0, + "non-truncated": 892, + "padded": 892, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_sexuality|5": { + "hashes": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "04269e5c5a257dd9", + "hash_cont_tokens": "bc54813e809b796d" + }, + "truncated": 0, + "non-truncated": 524, + "padded": 524, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-international_law|5": { + "hashes": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "d93ba9d9d38e4397", + "hash_cont_tokens": "8ea8c5ff76a15bca" + }, + "truncated": 0, + "non-truncated": 484, + "padded": 484, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-jurisprudence|5": { + "hashes": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "9eeaccd2698b4f5a", + "hash_cont_tokens": "e3a8cd951b6e3469" + }, + "truncated": 0, + "non-truncated": 432, + "padded": 432, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hashes": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "b4f08f544f2b7576", + "hash_cont_tokens": "3e9e0bdc248fd88a" + }, + "truncated": 0, + "non-truncated": 652, + "padded": 648, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-machine_learning|5": { + "hashes": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "900c2a51f1174b9f", + "hash_cont_tokens": "55b12fb138c6a064" + }, + "truncated": 0, + "non-truncated": 448, + "padded": 448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-management|5": { + "hashes": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "6b36efb4689c6eca", + "hash_cont_tokens": "a01d6d39a83c4597" + }, + "truncated": 0, + "non-truncated": 412, + "padded": 412, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-marketing|5": { + "hashes": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "2aaac78a0cfed47a", + "hash_cont_tokens": "6aeaed4d823c98aa" + }, + "truncated": 0, + "non-truncated": 936, + "padded": 936, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-medical_genetics|5": { + "hashes": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "886ca823b41c094a", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-miscellaneous|5": { + "hashes": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "72fd71de7675e7d0", + "hash_cont_tokens": "9b0ab02a64603081" + }, + "truncated": 0, + 
"non-truncated": 3132, + "padded": 3132, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_disputes|5": { + "hashes": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "f3ca0dd8e7a1eb09", + "hash_cont_tokens": "3b8bbe9108e55ce9" + }, + "truncated": 0, + "non-truncated": 1384, + "padded": 1354, + "non-padded": 30, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hashes": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "3e793631e951f23c", + "hash_cont_tokens": "3e9bfc0362e97330" + }, + "truncated": 0, + "non-truncated": 3580, + "padded": 3580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-nutrition|5": { + "hashes": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "59753c2144ea93af", + "hash_cont_tokens": "23b2dc6ee2da4cfc" + }, + "truncated": 0, + "non-truncated": 1224, + "padded": 1224, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-philosophy|5": { + "hashes": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "bd8d3dbed15a8c34", + "hash_cont_tokens": "9f6ff69d23a48783" + }, + "truncated": 0, + "non-truncated": 1244, + "padded": 1244, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-prehistory|5": { + "hashes": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "3573cd87facbb7c5", + "hash_cont_tokens": "d6458d743d875837" + }, + "truncated": 0, + "non-truncated": 1296, + "padded": 1296, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_accounting|5": { + "hashes": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "17e721bc1a7cbb47", + "hash_cont_tokens": "922a195f53a35662" + }, + "truncated": 0, + "non-truncated": 1128, + "padded": 1128, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_law|5": { + "hashes": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "c9f7583fff66d361", + "hash_cont_tokens": "2e590029ef41fbcd" + }, + "truncated": 0, + "non-truncated": 6136, + "padded": 6136, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_medicine|5": { + "hashes": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "40a933f829116f8d", + "hash_cont_tokens": "7cfee54dbddd5a98" + }, + "truncated": 0, + "non-truncated": 1088, + "padded": 1088, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_psychology|5": { + "hashes": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "0dfb73a8eb3f692c", + "hash_cont_tokens": "a86677b2a45c20e1" + }, + "truncated": 0, + "non-truncated": 2448, + "padded": 2448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + 
"harness|hendrycksTest-public_relations|5": { + "hashes": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "1710c6ba4c9f3cbd", + "hash_cont_tokens": "0d756ccaae031757" + }, + "truncated": 0, + "non-truncated": 440, + "padded": 440, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-security_studies|5": { + "hashes": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "32a03f1f22a6e103", + "hash_cont_tokens": "b2229bc2cfbf594b" + }, + "truncated": 0, + "non-truncated": 980, + "padded": 980, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-sociology|5": { + "hashes": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "828999f7624cbe7e", + "hash_cont_tokens": "c3a3bdfd177eed5b" + }, + "truncated": 0, + "non-truncated": 804, + "padded": 804, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hashes": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "42054621e718dbee", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-virology|5": { + "hashes": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "6c4f0aa4dc859c04", + "hash_cont_tokens": "af8b3658088cb37f" + }, + "truncated": 0, + "non-truncated": 664, + "padded": 664, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-world_religions|5": { + "hashes": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "6c75d44e092ff24f", + "hash_cont_tokens": "060118bef6de4e0a" + }, + "truncated": 0, + "non-truncated": 684, + "padded": 684, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|truthfulqa:mc|0": { + "hashes": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "2738d7ed7075faa7", + "hash_cont_tokens": "f5da56a132aab151" + }, + "truncated": 0, + "non-truncated": 9996, + "padded": 9996, + "non-padded": 0, + "effective_few_shots": 0.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "d84d18e9a963753d", + "hash_full_prompts": "12b540783521a8e6", + "hash_input_tokens": "5c73a7dce6ccf737", + "hash_cont_tokens": "71d56183130fecbd" + }, + "total_evaluation_time_secondes": "4209.850060462952", + "truncated": 0, + "non-truncated": 111019, + "padded": 110926, + "non-padded": 93, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/oh-yeontaek/llama-2-7B-LoRA-assemble/results_2023-10-24T23-43-13.966405.json b/eval-results/oh-yeontaek/llama-2-7B-LoRA-assemble/results_2023-10-24T23-43-13.966405.json new file mode 100644 index 0000000000000000000000000000000000000000..5f4769945b6e5bd83673d40b0e63e1cc80f32feb --- /dev/null +++ b/eval-results/oh-yeontaek/llama-2-7B-LoRA-assemble/results_2023-10-24T23-43-13.966405.json @@ -0,0 +1,107 @@ +{ + "config_general": { + "model_name": "oh-yeontaek/llama-2-7B-LoRA-assemble", + 
"model_sha": "72e866a96a2e9afc6527c8d757c69088c3a069c8", + "model_size": "12.61 GB", + "model_dtype": "torch.float16", + "lighteval_sha": "0f318ecf002208468154899217b3ba7c6ae09374", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|drop|3": { + "em": 0.31596057046979864, + "em_stderr": 0.004760983364669265, + "f1": 0.39136640100671266, + "f1_stderr": 0.004644890166719777 + }, + "harness|gsm8k|5": { + "acc": 0.0, + "acc_stderr": 0.0 + }, + "harness|winogrande|5": { + "acc": 0.7348066298342542, + "acc_stderr": 0.012406549466192858 + }, + "all": { + "em": 0.31596057046979864, + "em_stderr": 0.004760983364669265, + "f1": 0.39136640100671266, + "f1_stderr": 0.004644890166719777, + "acc": 0.3674033149171271, + "acc_stderr": 0.006203274733096429 + } + }, + "versions": { + "harness|drop|3": 1, + "harness|gsm8k|5": 0, + "harness|winogrande|5": 0, + "all": 0 + }, + "config_tasks": { + "harness|drop": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|drop|3": { + "hashes": { + "hash_examples": "1d27416e8324e9a3", + "hash_full_prompts": "a5513ff9a741b385", + "hash_input_tokens": "42076f0efbb50aa6", + "hash_cont_tokens": "52100d2e6c0d7638" + }, + "truncated": 3, + "non-truncated": 9533, + "padded": 0, + "non-padded": 9536, + "effective_few_shots": 3.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "bda342e47b5099b2", + "hash_cont_tokens": "5803d8c5597ba481" + }, + "truncated": 0, + "non-truncated": 1319, + "padded": 0, + "non-padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "c0bedf98cb040854", + "hash_cont_tokens": "f08975ad6f2d5864" + }, + "truncated": 0, + "non-truncated": 2534, + "padded": 2432, + "non-padded": 102, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "9b4d8993161e637d", + "hash_full_prompts": "08215e527b7e60a5", + "hash_input_tokens": "a12f3e3c934bd78b", + "hash_cont_tokens": "031dbb65d7818232" + }, + "total_evaluation_time_secondes": "5836.765391588211", + "truncated": 3, + "non-truncated": 13386, + "padded": 2432, + "non-padded": 10957, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/one-man-army/una-neural-chat-v3-3-P1-OMA/results_2023-12-12T11-36-35.468608.json b/eval-results/one-man-army/una-neural-chat-v3-3-P1-OMA/results_2023-12-12T11-36-35.468608.json new file mode 100644 index 0000000000000000000000000000000000000000..b067dd2635e0f927d072ea91a28411c0d4763503 --- /dev/null +++ b/eval-results/one-man-army/una-neural-chat-v3-3-P1-OMA/results_2023-12-12T11-36-35.468608.json @@ -0,0 +1,1409 @@ +{ + "config_general": { + "lighteval_sha": "0e4607eff593f6f842aeaa0e5fa6760f58b9d1e9", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "", + "start_time": 4397.934572222, + "end_time": 11951.492583681, + "total_evaluation_time_secondes": "7553.558011459", + "model_name": "one-man-army/una-neural-chat-v3-3-P1-OMA", + "model_sha": "014600373086ea46c7cdc4754c984a804b28a070", + "model_dtype": "torch.bfloat16", + 
"model_size": "13.99 GB" + }, + "results": { + "harness|arc:challenge|25": { + "acc": 0.6390784982935154, + "acc_stderr": 0.014034761386175456, + "acc_norm": 0.6680887372013652, + "acc_norm_stderr": 0.013760988200880543 + }, + "harness|hellaswag|10": { + "acc": 0.6734714200358495, + "acc_stderr": 0.004679847503411345, + "acc_norm": 0.8591913961362279, + "acc_norm_stderr": 0.0034711315448920418 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.32, + "acc_stderr": 0.04688261722621505, + "acc_norm": 0.32, + "acc_norm_stderr": 0.04688261722621505 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.6148148148148148, + "acc_stderr": 0.04203921040156279, + "acc_norm": 0.6148148148148148, + "acc_norm_stderr": 0.04203921040156279 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.6513157894736842, + "acc_stderr": 0.0387813988879761, + "acc_norm": 0.6513157894736842, + "acc_norm_stderr": 0.0387813988879761 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.59, + "acc_stderr": 0.049431107042371025, + "acc_norm": 0.59, + "acc_norm_stderr": 0.049431107042371025 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.6792452830188679, + "acc_stderr": 0.028727502957880267, + "acc_norm": 0.6792452830188679, + "acc_norm_stderr": 0.028727502957880267 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.7291666666666666, + "acc_stderr": 0.03716177437566017, + "acc_norm": 0.7291666666666666, + "acc_norm_stderr": 0.03716177437566017 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.47, + "acc_stderr": 0.05016135580465919, + "acc_norm": 0.47, + "acc_norm_stderr": 0.05016135580465919 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.56, + "acc_stderr": 0.049888765156985884, + "acc_norm": 0.56, + "acc_norm_stderr": 0.049888765156985884 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.35, + "acc_stderr": 0.047937248544110196, + "acc_norm": 0.35, + "acc_norm_stderr": 0.047937248544110196 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.6416184971098265, + "acc_stderr": 0.036563436533531585, + "acc_norm": 0.6416184971098265, + "acc_norm_stderr": 0.036563436533531585 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.45098039215686275, + "acc_stderr": 0.049512182523962625, + "acc_norm": 0.45098039215686275, + "acc_norm_stderr": 0.049512182523962625 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.73, + "acc_stderr": 0.044619604333847394, + "acc_norm": 0.73, + "acc_norm_stderr": 0.044619604333847394 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.5574468085106383, + "acc_stderr": 0.03246956919789958, + "acc_norm": 0.5574468085106383, + "acc_norm_stderr": 0.03246956919789958 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.43859649122807015, + "acc_stderr": 0.04668000738510455, + "acc_norm": 0.43859649122807015, + "acc_norm_stderr": 0.04668000738510455 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.5517241379310345, + "acc_stderr": 0.04144311810878152, + "acc_norm": 0.5517241379310345, + "acc_norm_stderr": 0.04144311810878152 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.3888888888888889, + "acc_stderr": 0.02510742548113728, + "acc_norm": 0.3888888888888889, + "acc_norm_stderr": 0.02510742548113728 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.4444444444444444, + "acc_stderr": 0.044444444444444495, + "acc_norm": 0.4444444444444444, + "acc_norm_stderr": 0.044444444444444495 + }, + 
"harness|hendrycksTest-global_facts|5": { + "acc": 0.38, + "acc_stderr": 0.048783173121456316, + "acc_norm": 0.38, + "acc_norm_stderr": 0.048783173121456316 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.7451612903225806, + "acc_stderr": 0.024790118459332208, + "acc_norm": 0.7451612903225806, + "acc_norm_stderr": 0.024790118459332208 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.4630541871921182, + "acc_stderr": 0.035083705204426656, + "acc_norm": 0.4630541871921182, + "acc_norm_stderr": 0.035083705204426656 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.72, + "acc_stderr": 0.04512608598542128, + "acc_norm": 0.72, + "acc_norm_stderr": 0.04512608598542128 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.7636363636363637, + "acc_stderr": 0.033175059300091826, + "acc_norm": 0.7636363636363637, + "acc_norm_stderr": 0.033175059300091826 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.797979797979798, + "acc_stderr": 0.028606204289229872, + "acc_norm": 0.797979797979798, + "acc_norm_stderr": 0.028606204289229872 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.8756476683937824, + "acc_stderr": 0.023814477086593552, + "acc_norm": 0.8756476683937824, + "acc_norm_stderr": 0.023814477086593552 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.6333333333333333, + "acc_stderr": 0.02443301646605246, + "acc_norm": 0.6333333333333333, + "acc_norm_stderr": 0.02443301646605246 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.35185185185185186, + "acc_stderr": 0.029116617606083015, + "acc_norm": 0.35185185185185186, + "acc_norm_stderr": 0.029116617606083015 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.6596638655462185, + "acc_stderr": 0.030778057422931673, + "acc_norm": 0.6596638655462185, + "acc_norm_stderr": 0.030778057422931673 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.33112582781456956, + "acc_stderr": 0.038425817186598696, + "acc_norm": 0.33112582781456956, + "acc_norm_stderr": 0.038425817186598696 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.8366972477064221, + "acc_stderr": 0.015848255806501538, + "acc_norm": 0.8366972477064221, + "acc_norm_stderr": 0.015848255806501538 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.5324074074074074, + "acc_stderr": 0.03402801581358966, + "acc_norm": 0.5324074074074074, + "acc_norm_stderr": 0.03402801581358966 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.7843137254901961, + "acc_stderr": 0.028867431449849313, + "acc_norm": 0.7843137254901961, + "acc_norm_stderr": 0.028867431449849313 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.7805907172995781, + "acc_stderr": 0.026939106581553945, + "acc_norm": 0.7805907172995781, + "acc_norm_stderr": 0.026939106581553945 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.6771300448430493, + "acc_stderr": 0.031381476375754995, + "acc_norm": 0.6771300448430493, + "acc_norm_stderr": 0.031381476375754995 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.7633587786259542, + "acc_stderr": 0.03727673575596914, + "acc_norm": 0.7633587786259542, + "acc_norm_stderr": 0.03727673575596914 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.7933884297520661, + "acc_stderr": 0.03695980128098824, + "acc_norm": 0.7933884297520661, + "acc_norm_stderr": 
0.03695980128098824 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.7407407407407407, + "acc_stderr": 0.042365112580946315, + "acc_norm": 0.7407407407407407, + "acc_norm_stderr": 0.042365112580946315 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.754601226993865, + "acc_stderr": 0.03380939813943354, + "acc_norm": 0.754601226993865, + "acc_norm_stderr": 0.03380939813943354 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.5089285714285714, + "acc_stderr": 0.04745033255489123, + "acc_norm": 0.5089285714285714, + "acc_norm_stderr": 0.04745033255489123 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.8058252427184466, + "acc_stderr": 0.039166677628225836, + "acc_norm": 0.8058252427184466, + "acc_norm_stderr": 0.039166677628225836 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.8632478632478633, + "acc_stderr": 0.022509033937077816, + "acc_norm": 0.8632478632478633, + "acc_norm_stderr": 0.022509033937077816 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.74, + "acc_stderr": 0.044084400227680794, + "acc_norm": 0.74, + "acc_norm_stderr": 0.044084400227680794 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.8301404853128991, + "acc_stderr": 0.013428186370608311, + "acc_norm": 0.8301404853128991, + "acc_norm_stderr": 0.013428186370608311 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.7052023121387283, + "acc_stderr": 0.024547617794803828, + "acc_norm": 0.7052023121387283, + "acc_norm_stderr": 0.024547617794803828 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.39664804469273746, + "acc_stderr": 0.016361354769822468, + "acc_norm": 0.39664804469273746, + "acc_norm_stderr": 0.016361354769822468 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.7026143790849673, + "acc_stderr": 0.02617390850671858, + "acc_norm": 0.7026143790849673, + "acc_norm_stderr": 0.02617390850671858 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.6752411575562701, + "acc_stderr": 0.026596782287697043, + "acc_norm": 0.6752411575562701, + "acc_norm_stderr": 0.026596782287697043 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.7160493827160493, + "acc_stderr": 0.025089478523765137, + "acc_norm": 0.7160493827160493, + "acc_norm_stderr": 0.025089478523765137 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.45390070921985815, + "acc_stderr": 0.029700453247291456, + "acc_norm": 0.45390070921985815, + "acc_norm_stderr": 0.029700453247291456 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.43415906127770537, + "acc_stderr": 0.012659033237067248, + "acc_norm": 0.43415906127770537, + "acc_norm_stderr": 0.012659033237067248 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.6801470588235294, + "acc_stderr": 0.028332959514031215, + "acc_norm": 0.6801470588235294, + "acc_norm_stderr": 0.028332959514031215 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.6601307189542484, + "acc_stderr": 0.019162418588623567, + "acc_norm": 0.6601307189542484, + "acc_norm_stderr": 0.019162418588623567 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.6636363636363637, + "acc_stderr": 0.04525393596302506, + "acc_norm": 0.6636363636363637, + "acc_norm_stderr": 0.04525393596302506 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.710204081632653, + "acc_stderr": 0.029043088683304328, + "acc_norm": 0.710204081632653, + "acc_norm_stderr": 0.029043088683304328 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 
0.8258706467661692, + "acc_stderr": 0.026814951200421603, + "acc_norm": 0.8258706467661692, + "acc_norm_stderr": 0.026814951200421603 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.84, + "acc_stderr": 0.03684529491774709, + "acc_norm": 0.84, + "acc_norm_stderr": 0.03684529491774709 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.536144578313253, + "acc_stderr": 0.038823108508905954, + "acc_norm": 0.536144578313253, + "acc_norm_stderr": 0.038823108508905954 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.8187134502923976, + "acc_stderr": 0.029547741687640044, + "acc_norm": 0.8187134502923976, + "acc_norm_stderr": 0.029547741687640044 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.4847001223990208, + "mc1_stderr": 0.017495304473187902, + "mc2": 0.6434691270555646, + "mc2_stderr": 0.015146860071018828 + }, + "harness|winogrande|5": { + "acc": 0.7963693764798737, + "acc_stderr": 0.011317798781626913 + }, + "harness|gsm8k|5": { + "acc": 0.6186504927975739, + "acc_stderr": 0.013379089877400715 + }, + "all": { + "acc": 0.636840960894971, + "acc_stderr": 0.0325649293445393, + "acc_norm": 0.6380848976326104, + "acc_norm_stderr": 0.0332251069866789, + "mc1": 0.4847001223990208, + "mc1_stderr": 0.017495304473187902, + "mc2": 0.6434691270555646, + "mc2_stderr": 0.015146860071018828 + } + }, + "versions": { + "all": 0, + "harness|arc:challenge|25": 0, + "harness|gsm8k|5": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, 
+ "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "harness|winogrande|5": 0 + }, + "config_tasks": { + "harness|arc:challenge": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + 
"harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|arc:challenge|25": { + "hashes": { + "hash_examples": "17b0cae357c0259e", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "9bcd0d1d37471713", + "hash_cont_tokens": "289aa98c400841d8" + }, + "truncated": 0, + "non_truncated": 1172, + "padded": 4670, + "non_padded": 17, + "effective_few_shots": 25.0, + "num_truncated_few_shots": 0 + }, + "harness|hellaswag|10": { + "hashes": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "80b8c6d79740318e", + "hash_cont_tokens": "ac460260c3e6efc9" + }, + "truncated": 0, + "non_truncated": 10042, + "padded": 40101, + "non_padded": 67, + "effective_few_shots": 10.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hashes": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "b813d36287c6556c", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-anatomy|5": { + "hashes": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "09dc2380497f7a47", + "hash_cont_tokens": "a52a4f60d98cbe5c" + }, + "truncated": 0, + "non_truncated": 135, + "padded": 540, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-astronomy|5": { + "hashes": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "68ca3220b0fdd1f3", + "hash_cont_tokens": "10f7d8eeba97841d" + }, + "truncated": 0, + "non_truncated": 152, + "padded": 608, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-business_ethics|5": { + "hashes": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "bd14ef1320de241e", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hashes": { + "hash_examples": "f3366dbe7eefffa4", + 
"hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "d96186ab98017c43", + "hash_cont_tokens": "edef9975ba9165b5" + }, + "truncated": 0, + "non_truncated": 265, + "padded": 1060, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_biology|5": { + "hashes": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "424136b34e95b200", + "hash_cont_tokens": "0aa103ec6602280b" + }, + "truncated": 0, + "non_truncated": 144, + "padded": 576, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_chemistry|5": { + "hashes": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "8dd8b80e336bbe54", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_computer_science|5": { + "hashes": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "145d4cef8ca2261d", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_mathematics|5": { + "hashes": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "561995d32d2b25c4", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_medicine|5": { + "hashes": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "6a258a9d4418599c", + "hash_cont_tokens": "1979021dbc698754" + }, + "truncated": 0, + "non_truncated": 173, + "padded": 692, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_physics|5": { + "hashes": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "fa5e0d5b5f97b66a", + "hash_cont_tokens": "7cf7fe2bab00acbd" + }, + "truncated": 0, + "non_truncated": 102, + "padded": 408, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-computer_security|5": { + "hashes": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "07d27397edfae492", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hashes": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "da5e6c3c8eb17da6", + "hash_cont_tokens": "903f64eed2b0d217" + }, + "truncated": 0, + "non_truncated": 235, + "padded": 940, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-econometrics|5": { + "hashes": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "f6ba8e358bdb523e", + "hash_cont_tokens": "721ae6c5302c4bf2" + }, + "truncated": 0, + 
"non_truncated": 114, + "padded": 456, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hashes": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "b2459da4c5ca8590", + "hash_cont_tokens": "15a738960ed3e587" + }, + "truncated": 0, + "non_truncated": 145, + "padded": 575, + "non_padded": 5, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hashes": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "0b969d9ad706a13a", + "hash_cont_tokens": "c96470462fc71683" + }, + "truncated": 0, + "non_truncated": 378, + "padded": 1512, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-formal_logic|5": { + "hashes": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "02bc3eb5f90da86e", + "hash_cont_tokens": "0e1ce025c9d6ee7e" + }, + "truncated": 0, + "non_truncated": 126, + "padded": 504, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-global_facts|5": { + "hashes": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "3d5106918bcbeb43", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_biology|5": { + "hashes": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "7b089392db2dabbd", + "hash_cont_tokens": "e34d57f7d3c4ca16" + }, + "truncated": 0, + "non_truncated": 310, + "padded": 1240, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hashes": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "ba90b2ffed1c067d", + "hash_cont_tokens": "e8482d44df4b3740" + }, + "truncated": 0, + "non_truncated": 203, + "padded": 812, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hashes": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "60eeec309ef0717f", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hashes": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "5e5e8bf3808e0ead", + "hash_cont_tokens": "d63e679a49418339" + }, + "truncated": 0, + "non_truncated": 165, + "padded": 656, + "non_padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_geography|5": { + "hashes": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "4da9b741d4e7ea78", + "hash_cont_tokens": "d78483e286d06f1a" + }, + "truncated": 0, + "non_truncated": 198, + "padded": 792, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 
+ }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hashes": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "acb4bc872ac86ed7", + "hash_cont_tokens": "691cdff71ff5fe57" + }, + "truncated": 0, + "non_truncated": 193, + "padded": 772, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hashes": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "840fc6403eb69ab0", + "hash_cont_tokens": "d5ad4c5bdca967ad" + }, + "truncated": 0, + "non_truncated": 390, + "padded": 1560, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hashes": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "3629a7f2cd17faeb", + "hash_cont_tokens": "8f631ca5687dd0d4" + }, + "truncated": 0, + "non_truncated": 270, + "padded": 1080, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hashes": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "6846f684260e3997", + "hash_cont_tokens": "7321048a28451473" + }, + "truncated": 0, + "non_truncated": 238, + "padded": 952, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_physics|5": { + "hashes": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "85aee25d6bdad94a", + "hash_cont_tokens": "bb137581f269861c" + }, + "truncated": 0, + "non_truncated": 151, + "padded": 604, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hashes": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "290b66d6d666a35f", + "hash_cont_tokens": "b455cab2675bd863" + }, + "truncated": 0, + "non_truncated": 545, + "padded": 2180, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hashes": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "a77a7668b437bc82", + "hash_cont_tokens": "1b3196fec7e58037" + }, + "truncated": 0, + "non_truncated": 216, + "padded": 864, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hashes": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "63548c7fa9ba7a78", + "hash_cont_tokens": "a331dedc2aa01b3e" + }, + "truncated": 0, + "non_truncated": 204, + "padded": 816, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hashes": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "83c5da18bfa50812", + "hash_cont_tokens": "d0fbe030b8c8c2bf" + }, + "truncated": 0, + "non_truncated": 237, + "padded": 948, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_aging|5": { + "hashes": { + "hash_examples": 
"1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "bebbd11f22006685", + "hash_cont_tokens": "1dd29c3755494850" + }, + "truncated": 0, + "non_truncated": 223, + "padded": 892, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_sexuality|5": { + "hashes": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "7b85ee9b8ee54f4f", + "hash_cont_tokens": "c85573f663c10691" + }, + "truncated": 0, + "non_truncated": 131, + "padded": 524, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-international_law|5": { + "hashes": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "7bfc55ab7065943e", + "hash_cont_tokens": "d263804ba918154f" + }, + "truncated": 0, + "non_truncated": 121, + "padded": 484, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-jurisprudence|5": { + "hashes": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "69573f1675e053c6", + "hash_cont_tokens": "581986691a84ece8" + }, + "truncated": 0, + "non_truncated": 108, + "padded": 432, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hashes": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "552324ef20094bdc", + "hash_cont_tokens": "55a858b28bbda458" + }, + "truncated": 0, + "non_truncated": 163, + "padded": 652, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-machine_learning|5": { + "hashes": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "96449357a7318905", + "hash_cont_tokens": "e99d3d3efd4ac7a3" + }, + "truncated": 0, + "non_truncated": 112, + "padded": 448, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-management|5": { + "hashes": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "3b849249168e3b88", + "hash_cont_tokens": "13d9dc56bca34726" + }, + "truncated": 0, + "non_truncated": 103, + "padded": 412, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-marketing|5": { + "hashes": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "af0e186f2756b70d", + "hash_cont_tokens": "2700ea26933916a2" + }, + "truncated": 0, + "non_truncated": 234, + "padded": 936, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-medical_genetics|5": { + "hashes": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "9f6a6de16509b6d9", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-miscellaneous|5": { + "hashes": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "9194406d589f7c10", + "hash_cont_tokens": "7bf4341c79587250" + }, + "truncated": 0, + 
"non_truncated": 783, + "padded": 3132, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_disputes|5": { + "hashes": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "769486efc74d9f8e", + "hash_cont_tokens": "38a48e9de6976f00" + }, + "truncated": 0, + "non_truncated": 346, + "padded": 1384, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hashes": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "a90fd4dd90959dad", + "hash_cont_tokens": "761c4dc187689d89" + }, + "truncated": 0, + "non_truncated": 895, + "padded": 3580, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-nutrition|5": { + "hashes": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "1a3b843e66efd29b", + "hash_cont_tokens": "65005bd7d6f6012a" + }, + "truncated": 0, + "non_truncated": 306, + "padded": 1224, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-philosophy|5": { + "hashes": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "09820001a3d00013", + "hash_cont_tokens": "0b47934fb6314dec" + }, + "truncated": 0, + "non_truncated": 311, + "padded": 1244, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-prehistory|5": { + "hashes": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "7c4ec364ce2768c7", + "hash_cont_tokens": "3f20acd855ee0a29" + }, + "truncated": 0, + "non_truncated": 324, + "padded": 1296, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_accounting|5": { + "hashes": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "ced0534574d0ae3f", + "hash_cont_tokens": "8f122ba881355d4b" + }, + "truncated": 0, + "non_truncated": 282, + "padded": 1128, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_law|5": { + "hashes": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "bcbdbbde22ec73e3", + "hash_cont_tokens": "90d5df417c4d3fd3" + }, + "truncated": 0, + "non_truncated": 1534, + "padded": 6136, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_medicine|5": { + "hashes": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "c54d753563114d45", + "hash_cont_tokens": "4a2d2988884f7f70" + }, + "truncated": 0, + "non_truncated": 272, + "padded": 1088, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_psychology|5": { + "hashes": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "b75dc55c0e32fa52", + "hash_cont_tokens": "e0a952cb8a9c81de" + }, + "truncated": 0, + "non_truncated": 612, + "padded": 2448, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + 
"harness|hendrycksTest-public_relations|5": { + "hashes": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "5ccdc8ec8db99622", + "hash_cont_tokens": "1fa77a8dff3922b8" + }, + "truncated": 0, + "non_truncated": 110, + "padded": 440, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-security_studies|5": { + "hashes": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "ca8497342e5b1d57", + "hash_cont_tokens": "81fc9cb3cbdd52db" + }, + "truncated": 0, + "non_truncated": 245, + "padded": 980, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-sociology|5": { + "hashes": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "069c76424fbd3dab", + "hash_cont_tokens": "2a0493252ed2cf43" + }, + "truncated": 0, + "non_truncated": 201, + "padded": 804, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hashes": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "a7e393a626169576", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-virology|5": { + "hashes": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "bf99dc973e3a650d", + "hash_cont_tokens": "5ab892d003b00c98" + }, + "truncated": 0, + "non_truncated": 166, + "padded": 664, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-world_religions|5": { + "hashes": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "1761cfaf21797065", + "hash_cont_tokens": "15a5e5dbdfbb8568" + }, + "truncated": 0, + "non_truncated": 171, + "padded": 684, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|truthfulqa:mc|0": { + "hashes": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "298b43914bbdf4ca", + "hash_cont_tokens": "5a8d4bb398b1c3c0" + }, + "truncated": 0, + "non_truncated": 817, + "padded": 9996, + "non_padded": 0, + "effective_few_shots": 0.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "31aa3477d959f771", + "hash_cont_tokens": "618558fb93c0f288" + }, + "truncated": 0, + "non_truncated": 1267, + "padded": 2534, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "6af0ae8cfe684f50", + "hash_cont_tokens": "32fc8d1b35003b6b" + }, + "truncated": 0, + "non_truncated": 1319, + "padded": 0, + "non_padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "3b7fa57a057f9415", + "hash_full_prompts": "63615fc50fc9417c", + "hash_input_tokens": "9c04e828ae29cacc", + "hash_cont_tokens": "5e8acac7553bd926" + }, + "truncated": 0, + 
"non_truncated": 28659, + "padded": 113460, + "non_padded": 1412, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/one-man-army/una-neural-chat-v3-3-P2-OMA/results_2023-12-13T14-22-46.443158.json b/eval-results/one-man-army/una-neural-chat-v3-3-P2-OMA/results_2023-12-13T14-22-46.443158.json new file mode 100644 index 0000000000000000000000000000000000000000..ffba1c3521990b939aacf6fc72c40f0f89cc8626 --- /dev/null +++ b/eval-results/one-man-army/una-neural-chat-v3-3-P2-OMA/results_2023-12-13T14-22-46.443158.json @@ -0,0 +1,1409 @@ +{ + "config_general": { + "lighteval_sha": "0e4607eff593f6f842aeaa0e5fa6760f58b9d1e9", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "", + "start_time": 101100.954514016, + "end_time": 108311.539697493, + "total_evaluation_time_secondes": "7210.585183477", + "model_name": "one-man-army/una-neural-chat-v3-3-P2-OMA", + "model_sha": "7bab67e479c192927c4a781efdf5be27eaa315a8", + "model_dtype": "torch.float16", + "model_size": "13.99 GB" + }, + "results": { + "harness|arc:challenge|25": { + "acc": 0.6390784982935154, + "acc_stderr": 0.014034761386175456, + "acc_norm": 0.6723549488054608, + "acc_norm_stderr": 0.01371584794071934 + }, + "harness|hellaswag|10": { + "acc": 0.6787492531368253, + "acc_stderr": 0.004660025270817023, + "acc_norm": 0.8633738299143597, + "acc_norm_stderr": 0.003427503475567809 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.3, + "acc_stderr": 0.046056618647183814, + "acc_norm": 0.3, + "acc_norm_stderr": 0.046056618647183814 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.6222222222222222, + "acc_stderr": 0.04188307537595852, + "acc_norm": 0.6222222222222222, + "acc_norm_stderr": 0.04188307537595852 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.6710526315789473, + "acc_stderr": 0.03823428969926605, + "acc_norm": 0.6710526315789473, + "acc_norm_stderr": 0.03823428969926605 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.6, + "acc_stderr": 0.04923659639173309, + "acc_norm": 0.6, + "acc_norm_stderr": 0.04923659639173309 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.6792452830188679, + "acc_stderr": 0.028727502957880267, + "acc_norm": 0.6792452830188679, + "acc_norm_stderr": 0.028727502957880267 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.7083333333333334, + "acc_stderr": 0.038009680605548594, + "acc_norm": 0.7083333333333334, + "acc_norm_stderr": 0.038009680605548594 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.51, + "acc_stderr": 0.05024183937956912, + "acc_norm": 0.51, + "acc_norm_stderr": 0.05024183937956912 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.56, + "acc_stderr": 0.049888765156985884, + "acc_norm": 0.56, + "acc_norm_stderr": 0.049888765156985884 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.34, + "acc_stderr": 0.04760952285695236, + "acc_norm": 0.34, + "acc_norm_stderr": 0.04760952285695236 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.6358381502890174, + "acc_stderr": 0.03669072477416907, + "acc_norm": 0.6358381502890174, + "acc_norm_stderr": 0.03669072477416907 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.43137254901960786, + "acc_stderr": 0.04928099597287534, + "acc_norm": 0.43137254901960786, + "acc_norm_stderr": 0.04928099597287534 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.72, + "acc_stderr": 
0.045126085985421276, + "acc_norm": 0.72, + "acc_norm_stderr": 0.045126085985421276 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.5702127659574469, + "acc_stderr": 0.03236214467715564, + "acc_norm": 0.5702127659574469, + "acc_norm_stderr": 0.03236214467715564 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.4649122807017544, + "acc_stderr": 0.04692008381368909, + "acc_norm": 0.4649122807017544, + "acc_norm_stderr": 0.04692008381368909 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.5310344827586206, + "acc_stderr": 0.04158632762097828, + "acc_norm": 0.5310344827586206, + "acc_norm_stderr": 0.04158632762097828 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.3968253968253968, + "acc_stderr": 0.025197101074246483, + "acc_norm": 0.3968253968253968, + "acc_norm_stderr": 0.025197101074246483 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.42063492063492064, + "acc_stderr": 0.04415438226743744, + "acc_norm": 0.42063492063492064, + "acc_norm_stderr": 0.04415438226743744 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.39, + "acc_stderr": 0.04902071300001975, + "acc_norm": 0.39, + "acc_norm_stderr": 0.04902071300001975 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.7483870967741936, + "acc_stderr": 0.024685979286239966, + "acc_norm": 0.7483870967741936, + "acc_norm_stderr": 0.024685979286239966 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.45320197044334976, + "acc_stderr": 0.035025446508458714, + "acc_norm": 0.45320197044334976, + "acc_norm_stderr": 0.035025446508458714 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.72, + "acc_stderr": 0.04512608598542128, + "acc_norm": 0.72, + "acc_norm_stderr": 0.04512608598542128 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.7757575757575758, + "acc_stderr": 0.03256866661681102, + "acc_norm": 0.7757575757575758, + "acc_norm_stderr": 0.03256866661681102 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.797979797979798, + "acc_stderr": 0.028606204289229872, + "acc_norm": 0.797979797979798, + "acc_norm_stderr": 0.028606204289229872 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.8808290155440415, + "acc_stderr": 0.023381935348121427, + "acc_norm": 0.8808290155440415, + "acc_norm_stderr": 0.023381935348121427 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.6307692307692307, + "acc_stderr": 0.02446861524147892, + "acc_norm": 0.6307692307692307, + "acc_norm_stderr": 0.02446861524147892 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.35555555555555557, + "acc_stderr": 0.02918571494985741, + "acc_norm": 0.35555555555555557, + "acc_norm_stderr": 0.02918571494985741 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.6512605042016807, + "acc_stderr": 0.030956636328566548, + "acc_norm": 0.6512605042016807, + "acc_norm_stderr": 0.030956636328566548 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.304635761589404, + "acc_stderr": 0.03757949922943343, + "acc_norm": 0.304635761589404, + "acc_norm_stderr": 0.03757949922943343 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.8330275229357799, + "acc_stderr": 0.015990154885073393, + "acc_norm": 0.8330275229357799, + "acc_norm_stderr": 0.015990154885073393 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.5324074074074074, + "acc_stderr": 
0.03402801581358966, + "acc_norm": 0.5324074074074074, + "acc_norm_stderr": 0.03402801581358966 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.7843137254901961, + "acc_stderr": 0.028867431449849313, + "acc_norm": 0.7843137254901961, + "acc_norm_stderr": 0.028867431449849313 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.7805907172995781, + "acc_stderr": 0.026939106581553945, + "acc_norm": 0.7805907172995781, + "acc_norm_stderr": 0.026939106581553945 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.6816143497757847, + "acc_stderr": 0.03126580522513713, + "acc_norm": 0.6816143497757847, + "acc_norm_stderr": 0.03126580522513713 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.7557251908396947, + "acc_stderr": 0.03768335959728743, + "acc_norm": 0.7557251908396947, + "acc_norm_stderr": 0.03768335959728743 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.7933884297520661, + "acc_stderr": 0.03695980128098824, + "acc_norm": 0.7933884297520661, + "acc_norm_stderr": 0.03695980128098824 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.7407407407407407, + "acc_stderr": 0.042365112580946315, + "acc_norm": 0.7407407407407407, + "acc_norm_stderr": 0.042365112580946315 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.754601226993865, + "acc_stderr": 0.03380939813943354, + "acc_norm": 0.754601226993865, + "acc_norm_stderr": 0.03380939813943354 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.4732142857142857, + "acc_stderr": 0.047389751192741546, + "acc_norm": 0.4732142857142857, + "acc_norm_stderr": 0.047389751192741546 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.8155339805825242, + "acc_stderr": 0.03840423627288276, + "acc_norm": 0.8155339805825242, + "acc_norm_stderr": 0.03840423627288276 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.8547008547008547, + "acc_stderr": 0.023086635086841407, + "acc_norm": 0.8547008547008547, + "acc_norm_stderr": 0.023086635086841407 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.72, + "acc_stderr": 0.045126085985421276, + "acc_norm": 0.72, + "acc_norm_stderr": 0.045126085985421276 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.8288633461047255, + "acc_stderr": 0.013468201614066297, + "acc_norm": 0.8288633461047255, + "acc_norm_stderr": 0.013468201614066297 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.7023121387283237, + "acc_stderr": 0.024617055388677003, + "acc_norm": 0.7023121387283237, + "acc_norm_stderr": 0.024617055388677003 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.39106145251396646, + "acc_stderr": 0.016320763763808383, + "acc_norm": 0.39106145251396646, + "acc_norm_stderr": 0.016320763763808383 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.6928104575163399, + "acc_stderr": 0.02641560191438899, + "acc_norm": 0.6928104575163399, + "acc_norm_stderr": 0.02641560191438899 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.684887459807074, + "acc_stderr": 0.026385273703464496, + "acc_norm": 0.684887459807074, + "acc_norm_stderr": 0.026385273703464496 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.7129629629629629, + "acc_stderr": 0.025171041915309684, + "acc_norm": 0.7129629629629629, + "acc_norm_stderr": 0.025171041915309684 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.450354609929078, + "acc_stderr": 0.02968010556502904, + "acc_norm": 0.450354609929078, + "acc_norm_stderr": 0.02968010556502904 
+ }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.42959582790091266, + "acc_stderr": 0.012643004623790206, + "acc_norm": 0.42959582790091266, + "acc_norm_stderr": 0.012643004623790206 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.6875, + "acc_stderr": 0.02815637344037142, + "acc_norm": 0.6875, + "acc_norm_stderr": 0.02815637344037142 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.6601307189542484, + "acc_stderr": 0.01916241858862357, + "acc_norm": 0.6601307189542484, + "acc_norm_stderr": 0.01916241858862357 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.6636363636363637, + "acc_stderr": 0.04525393596302506, + "acc_norm": 0.6636363636363637, + "acc_norm_stderr": 0.04525393596302506 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.710204081632653, + "acc_stderr": 0.02904308868330433, + "acc_norm": 0.710204081632653, + "acc_norm_stderr": 0.02904308868330433 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.8308457711442786, + "acc_stderr": 0.026508590656233278, + "acc_norm": 0.8308457711442786, + "acc_norm_stderr": 0.026508590656233278 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.84, + "acc_stderr": 0.03684529491774709, + "acc_norm": 0.84, + "acc_norm_stderr": 0.03684529491774709 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.5301204819277109, + "acc_stderr": 0.03885425420866767, + "acc_norm": 0.5301204819277109, + "acc_norm_stderr": 0.03885425420866767 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.8070175438596491, + "acc_stderr": 0.030267457554898458, + "acc_norm": 0.8070175438596491, + "acc_norm_stderr": 0.030267457554898458 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.4883720930232558, + "mc1_stderr": 0.017498767175740088, + "mc2": 0.6548011216801304, + "mc2_stderr": 0.015091037719781332 + }, + "harness|winogrande|5": { + "acc": 0.7963693764798737, + "acc_stderr": 0.011317798781626915 + }, + "harness|gsm8k|5": { + "acc": 0.6141015921152388, + "acc_stderr": 0.013409077471319164 + }, + "all": { + "acc": 0.6350905064562539, + "acc_stderr": 0.03255639759907832, + "acc_norm": 0.6364059483478958, + "acc_norm_stderr": 0.03321460918729028, + "mc1": 0.4883720930232558, + "mc1_stderr": 0.017498767175740088, + "mc2": 0.6548011216801304, + "mc2_stderr": 0.015091037719781332 + } + }, + "versions": { + "all": 0, + "harness|arc:challenge|25": 0, + "harness|gsm8k|5": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + 
"harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "harness|winogrande|5": 0 + }, + "config_tasks": { + "harness|arc:challenge": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + 
"harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|arc:challenge|25": { + "hashes": { + "hash_examples": "17b0cae357c0259e", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "9bcd0d1d37471713", + "hash_cont_tokens": "289aa98c400841d8" + }, + "truncated": 0, + "non_truncated": 1172, + "padded": 4670, + "non_padded": 17, + "effective_few_shots": 25.0, + "num_truncated_few_shots": 0 + }, + "harness|hellaswag|10": { + "hashes": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "80b8c6d79740318e", + "hash_cont_tokens": "ac460260c3e6efc9" + }, + "truncated": 0, + "non_truncated": 10042, + "padded": 40101, + "non_padded": 67, + "effective_few_shots": 10.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hashes": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "b813d36287c6556c", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-anatomy|5": { + "hashes": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": 
"09dc2380497f7a47", + "hash_cont_tokens": "a52a4f60d98cbe5c" + }, + "truncated": 0, + "non_truncated": 135, + "padded": 540, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-astronomy|5": { + "hashes": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "68ca3220b0fdd1f3", + "hash_cont_tokens": "10f7d8eeba97841d" + }, + "truncated": 0, + "non_truncated": 152, + "padded": 608, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-business_ethics|5": { + "hashes": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "bd14ef1320de241e", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hashes": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "d96186ab98017c43", + "hash_cont_tokens": "edef9975ba9165b5" + }, + "truncated": 0, + "non_truncated": 265, + "padded": 1060, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_biology|5": { + "hashes": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "424136b34e95b200", + "hash_cont_tokens": "0aa103ec6602280b" + }, + "truncated": 0, + "non_truncated": 144, + "padded": 576, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_chemistry|5": { + "hashes": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "8dd8b80e336bbe54", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_computer_science|5": { + "hashes": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "145d4cef8ca2261d", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_mathematics|5": { + "hashes": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "561995d32d2b25c4", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_medicine|5": { + "hashes": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "6a258a9d4418599c", + "hash_cont_tokens": "1979021dbc698754" + }, + "truncated": 0, + "non_truncated": 173, + "padded": 692, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_physics|5": { + "hashes": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "fa5e0d5b5f97b66a", + "hash_cont_tokens": "7cf7fe2bab00acbd" + }, + "truncated": 0, + "non_truncated": 102, + "padded": 408, + "non_padded": 0, + 
"effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-computer_security|5": { + "hashes": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "07d27397edfae492", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hashes": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "da5e6c3c8eb17da6", + "hash_cont_tokens": "903f64eed2b0d217" + }, + "truncated": 0, + "non_truncated": 235, + "padded": 940, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-econometrics|5": { + "hashes": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "f6ba8e358bdb523e", + "hash_cont_tokens": "721ae6c5302c4bf2" + }, + "truncated": 0, + "non_truncated": 114, + "padded": 456, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hashes": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "b2459da4c5ca8590", + "hash_cont_tokens": "15a738960ed3e587" + }, + "truncated": 0, + "non_truncated": 145, + "padded": 575, + "non_padded": 5, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hashes": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "0b969d9ad706a13a", + "hash_cont_tokens": "c96470462fc71683" + }, + "truncated": 0, + "non_truncated": 378, + "padded": 1512, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-formal_logic|5": { + "hashes": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "02bc3eb5f90da86e", + "hash_cont_tokens": "0e1ce025c9d6ee7e" + }, + "truncated": 0, + "non_truncated": 126, + "padded": 504, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-global_facts|5": { + "hashes": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "3d5106918bcbeb43", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_biology|5": { + "hashes": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "7b089392db2dabbd", + "hash_cont_tokens": "e34d57f7d3c4ca16" + }, + "truncated": 0, + "non_truncated": 310, + "padded": 1240, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hashes": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "ba90b2ffed1c067d", + "hash_cont_tokens": "e8482d44df4b3740" + }, + "truncated": 0, + "non_truncated": 203, + "padded": 812, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hashes": { + 
"hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "60eeec309ef0717f", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hashes": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "5e5e8bf3808e0ead", + "hash_cont_tokens": "d63e679a49418339" + }, + "truncated": 0, + "non_truncated": 165, + "padded": 656, + "non_padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_geography|5": { + "hashes": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "4da9b741d4e7ea78", + "hash_cont_tokens": "d78483e286d06f1a" + }, + "truncated": 0, + "non_truncated": 198, + "padded": 792, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hashes": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "acb4bc872ac86ed7", + "hash_cont_tokens": "691cdff71ff5fe57" + }, + "truncated": 0, + "non_truncated": 193, + "padded": 772, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hashes": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "840fc6403eb69ab0", + "hash_cont_tokens": "d5ad4c5bdca967ad" + }, + "truncated": 0, + "non_truncated": 390, + "padded": 1560, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hashes": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "3629a7f2cd17faeb", + "hash_cont_tokens": "8f631ca5687dd0d4" + }, + "truncated": 0, + "non_truncated": 270, + "padded": 1080, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hashes": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "6846f684260e3997", + "hash_cont_tokens": "7321048a28451473" + }, + "truncated": 0, + "non_truncated": 238, + "padded": 952, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_physics|5": { + "hashes": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "85aee25d6bdad94a", + "hash_cont_tokens": "bb137581f269861c" + }, + "truncated": 0, + "non_truncated": 151, + "padded": 604, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hashes": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "290b66d6d666a35f", + "hash_cont_tokens": "b455cab2675bd863" + }, + "truncated": 0, + "non_truncated": 545, + "padded": 2180, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hashes": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", 
+ "hash_input_tokens": "a77a7668b437bc82", + "hash_cont_tokens": "1b3196fec7e58037" + }, + "truncated": 0, + "non_truncated": 216, + "padded": 864, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hashes": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "63548c7fa9ba7a78", + "hash_cont_tokens": "a331dedc2aa01b3e" + }, + "truncated": 0, + "non_truncated": 204, + "padded": 816, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hashes": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "83c5da18bfa50812", + "hash_cont_tokens": "d0fbe030b8c8c2bf" + }, + "truncated": 0, + "non_truncated": 237, + "padded": 948, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_aging|5": { + "hashes": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "bebbd11f22006685", + "hash_cont_tokens": "1dd29c3755494850" + }, + "truncated": 0, + "non_truncated": 223, + "padded": 892, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_sexuality|5": { + "hashes": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "7b85ee9b8ee54f4f", + "hash_cont_tokens": "c85573f663c10691" + }, + "truncated": 0, + "non_truncated": 131, + "padded": 524, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-international_law|5": { + "hashes": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "7bfc55ab7065943e", + "hash_cont_tokens": "d263804ba918154f" + }, + "truncated": 0, + "non_truncated": 121, + "padded": 484, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-jurisprudence|5": { + "hashes": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "69573f1675e053c6", + "hash_cont_tokens": "581986691a84ece8" + }, + "truncated": 0, + "non_truncated": 108, + "padded": 432, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hashes": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "552324ef20094bdc", + "hash_cont_tokens": "55a858b28bbda458" + }, + "truncated": 0, + "non_truncated": 163, + "padded": 652, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-machine_learning|5": { + "hashes": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "96449357a7318905", + "hash_cont_tokens": "e99d3d3efd4ac7a3" + }, + "truncated": 0, + "non_truncated": 112, + "padded": 448, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-management|5": { + "hashes": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "3b849249168e3b88", + "hash_cont_tokens": "13d9dc56bca34726" + }, + "truncated": 0, + "non_truncated": 103, + "padded": 412, + 
"non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-marketing|5": { + "hashes": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "af0e186f2756b70d", + "hash_cont_tokens": "2700ea26933916a2" + }, + "truncated": 0, + "non_truncated": 234, + "padded": 936, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-medical_genetics|5": { + "hashes": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "9f6a6de16509b6d9", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-miscellaneous|5": { + "hashes": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "9194406d589f7c10", + "hash_cont_tokens": "7bf4341c79587250" + }, + "truncated": 0, + "non_truncated": 783, + "padded": 3132, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_disputes|5": { + "hashes": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "769486efc74d9f8e", + "hash_cont_tokens": "38a48e9de6976f00" + }, + "truncated": 0, + "non_truncated": 346, + "padded": 1384, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hashes": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "a90fd4dd90959dad", + "hash_cont_tokens": "761c4dc187689d89" + }, + "truncated": 0, + "non_truncated": 895, + "padded": 3580, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-nutrition|5": { + "hashes": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "1a3b843e66efd29b", + "hash_cont_tokens": "65005bd7d6f6012a" + }, + "truncated": 0, + "non_truncated": 306, + "padded": 1224, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-philosophy|5": { + "hashes": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "09820001a3d00013", + "hash_cont_tokens": "0b47934fb6314dec" + }, + "truncated": 0, + "non_truncated": 311, + "padded": 1244, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-prehistory|5": { + "hashes": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "7c4ec364ce2768c7", + "hash_cont_tokens": "3f20acd855ee0a29" + }, + "truncated": 0, + "non_truncated": 324, + "padded": 1296, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_accounting|5": { + "hashes": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "ced0534574d0ae3f", + "hash_cont_tokens": "8f122ba881355d4b" + }, + "truncated": 0, + "non_truncated": 282, + "padded": 1128, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_law|5": { + "hashes": { + "hash_examples": 
"cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "bcbdbbde22ec73e3", + "hash_cont_tokens": "90d5df417c4d3fd3" + }, + "truncated": 0, + "non_truncated": 1534, + "padded": 6136, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_medicine|5": { + "hashes": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "c54d753563114d45", + "hash_cont_tokens": "4a2d2988884f7f70" + }, + "truncated": 0, + "non_truncated": 272, + "padded": 1088, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_psychology|5": { + "hashes": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "b75dc55c0e32fa52", + "hash_cont_tokens": "e0a952cb8a9c81de" + }, + "truncated": 0, + "non_truncated": 612, + "padded": 2448, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-public_relations|5": { + "hashes": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "5ccdc8ec8db99622", + "hash_cont_tokens": "1fa77a8dff3922b8" + }, + "truncated": 0, + "non_truncated": 110, + "padded": 440, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-security_studies|5": { + "hashes": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "ca8497342e5b1d57", + "hash_cont_tokens": "81fc9cb3cbdd52db" + }, + "truncated": 0, + "non_truncated": 245, + "padded": 980, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-sociology|5": { + "hashes": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "069c76424fbd3dab", + "hash_cont_tokens": "2a0493252ed2cf43" + }, + "truncated": 0, + "non_truncated": 201, + "padded": 804, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hashes": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "a7e393a626169576", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-virology|5": { + "hashes": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "bf99dc973e3a650d", + "hash_cont_tokens": "5ab892d003b00c98" + }, + "truncated": 0, + "non_truncated": 166, + "padded": 664, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-world_religions|5": { + "hashes": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "1761cfaf21797065", + "hash_cont_tokens": "15a5e5dbdfbb8568" + }, + "truncated": 0, + "non_truncated": 171, + "padded": 684, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|truthfulqa:mc|0": { + "hashes": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "298b43914bbdf4ca", + "hash_cont_tokens": "5a8d4bb398b1c3c0" + }, + "truncated": 0, + 
"non_truncated": 817, + "padded": 9996, + "non_padded": 0, + "effective_few_shots": 0.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "31aa3477d959f771", + "hash_cont_tokens": "618558fb93c0f288" + }, + "truncated": 0, + "non_truncated": 1267, + "padded": 2534, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "6af0ae8cfe684f50", + "hash_cont_tokens": "582222bf137b1c48" + }, + "truncated": 0, + "non_truncated": 1319, + "padded": 0, + "non_padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "3b7fa57a057f9415", + "hash_full_prompts": "63615fc50fc9417c", + "hash_input_tokens": "9c04e828ae29cacc", + "hash_cont_tokens": "b9974d3535431dfb" + }, + "truncated": 0, + "non_truncated": 28659, + "padded": 113460, + "non_padded": 1412, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/one-man-army/una-neural-chat-v3-3-P2-OMA/results_2023-12-13T14-25-29.170115.json b/eval-results/one-man-army/una-neural-chat-v3-3-P2-OMA/results_2023-12-13T14-25-29.170115.json new file mode 100644 index 0000000000000000000000000000000000000000..5c7e7a49c37e2546d51800b90186b15600ac7ddf --- /dev/null +++ b/eval-results/one-man-army/una-neural-chat-v3-3-P2-OMA/results_2023-12-13T14-25-29.170115.json @@ -0,0 +1,1409 @@ +{ + "config_general": { + "lighteval_sha": "0e4607eff593f6f842aeaa0e5fa6760f58b9d1e9", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "", + "start_time": 101006.673758811, + "end_time": 108487.152989466, + "total_evaluation_time_secondes": "7480.479230655008", + "model_name": "one-man-army/una-neural-chat-v3-3-P2-OMA", + "model_sha": "7bab67e479c192927c4a781efdf5be27eaa315a8", + "model_dtype": "torch.bfloat16", + "model_size": "13.99 GB" + }, + "results": { + "harness|arc:challenge|25": { + "acc": 0.6390784982935154, + "acc_stderr": 0.014034761386175456, + "acc_norm": 0.6732081911262798, + "acc_norm_stderr": 0.013706665975587331 + }, + "harness|hellaswag|10": { + "acc": 0.6783509261103365, + "acc_stderr": 0.004661544991583035, + "acc_norm": 0.8632742481577375, + "acc_norm_stderr": 0.00342855459595022 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.31, + "acc_stderr": 0.04648231987117316, + "acc_norm": 0.31, + "acc_norm_stderr": 0.04648231987117316 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.6222222222222222, + "acc_stderr": 0.04188307537595852, + "acc_norm": 0.6222222222222222, + "acc_norm_stderr": 0.04188307537595852 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.6578947368421053, + "acc_stderr": 0.03860731599316092, + "acc_norm": 0.6578947368421053, + "acc_norm_stderr": 0.03860731599316092 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.59, + "acc_stderr": 0.049431107042371025, + "acc_norm": 0.59, + "acc_norm_stderr": 0.049431107042371025 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.6792452830188679, + "acc_stderr": 0.028727502957880267, + "acc_norm": 0.6792452830188679, + "acc_norm_stderr": 0.028727502957880267 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.7013888888888888, + "acc_stderr": 0.03827052357950756, + 
"acc_norm": 0.7013888888888888, + "acc_norm_stderr": 0.03827052357950756 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.51, + "acc_stderr": 0.05024183937956912, + "acc_norm": 0.51, + "acc_norm_stderr": 0.05024183937956912 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.57, + "acc_stderr": 0.04975698519562428, + "acc_norm": 0.57, + "acc_norm_stderr": 0.04975698519562428 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.33, + "acc_stderr": 0.047258156262526045, + "acc_norm": 0.33, + "acc_norm_stderr": 0.047258156262526045 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.6358381502890174, + "acc_stderr": 0.03669072477416907, + "acc_norm": 0.6358381502890174, + "acc_norm_stderr": 0.03669072477416907 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.43137254901960786, + "acc_stderr": 0.04928099597287534, + "acc_norm": 0.43137254901960786, + "acc_norm_stderr": 0.04928099597287534 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.72, + "acc_stderr": 0.045126085985421276, + "acc_norm": 0.72, + "acc_norm_stderr": 0.045126085985421276 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.5531914893617021, + "acc_stderr": 0.032500536843658404, + "acc_norm": 0.5531914893617021, + "acc_norm_stderr": 0.032500536843658404 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.4649122807017544, + "acc_stderr": 0.04692008381368909, + "acc_norm": 0.4649122807017544, + "acc_norm_stderr": 0.04692008381368909 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.5379310344827586, + "acc_stderr": 0.04154659671707548, + "acc_norm": 0.5379310344827586, + "acc_norm_stderr": 0.04154659671707548 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.3862433862433862, + "acc_stderr": 0.025075981767601688, + "acc_norm": 0.3862433862433862, + "acc_norm_stderr": 0.025075981767601688 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.42063492063492064, + "acc_stderr": 0.04415438226743744, + "acc_norm": 0.42063492063492064, + "acc_norm_stderr": 0.04415438226743744 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.38, + "acc_stderr": 0.048783173121456316, + "acc_norm": 0.38, + "acc_norm_stderr": 0.048783173121456316 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.7548387096774194, + "acc_stderr": 0.024472243840895514, + "acc_norm": 0.7548387096774194, + "acc_norm_stderr": 0.024472243840895514 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.45320197044334976, + "acc_stderr": 0.035025446508458714, + "acc_norm": 0.45320197044334976, + "acc_norm_stderr": 0.035025446508458714 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.72, + "acc_stderr": 0.04512608598542128, + "acc_norm": 0.72, + "acc_norm_stderr": 0.04512608598542128 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.7757575757575758, + "acc_stderr": 0.03256866661681102, + "acc_norm": 0.7757575757575758, + "acc_norm_stderr": 0.03256866661681102 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.797979797979798, + "acc_stderr": 0.028606204289229872, + "acc_norm": 0.797979797979798, + "acc_norm_stderr": 0.028606204289229872 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.8808290155440415, + "acc_stderr": 0.023381935348121427, + "acc_norm": 0.8808290155440415, + "acc_norm_stderr": 0.023381935348121427 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + 
"acc": 0.6358974358974359, + "acc_stderr": 0.02439667298509477, + "acc_norm": 0.6358974358974359, + "acc_norm_stderr": 0.02439667298509477 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.35185185185185186, + "acc_stderr": 0.029116617606083015, + "acc_norm": 0.35185185185185186, + "acc_norm_stderr": 0.029116617606083015 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.6512605042016807, + "acc_stderr": 0.030956636328566548, + "acc_norm": 0.6512605042016807, + "acc_norm_stderr": 0.030956636328566548 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.31125827814569534, + "acc_stderr": 0.03780445850526732, + "acc_norm": 0.31125827814569534, + "acc_norm_stderr": 0.03780445850526732 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.8330275229357799, + "acc_stderr": 0.015990154885073393, + "acc_norm": 0.8330275229357799, + "acc_norm_stderr": 0.015990154885073393 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.5277777777777778, + "acc_stderr": 0.0340470532865388, + "acc_norm": 0.5277777777777778, + "acc_norm_stderr": 0.0340470532865388 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.7843137254901961, + "acc_stderr": 0.028867431449849313, + "acc_norm": 0.7843137254901961, + "acc_norm_stderr": 0.028867431449849313 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.7805907172995781, + "acc_stderr": 0.026939106581553945, + "acc_norm": 0.7805907172995781, + "acc_norm_stderr": 0.026939106581553945 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.6816143497757847, + "acc_stderr": 0.03126580522513713, + "acc_norm": 0.6816143497757847, + "acc_norm_stderr": 0.03126580522513713 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.7557251908396947, + "acc_stderr": 0.03768335959728743, + "acc_norm": 0.7557251908396947, + "acc_norm_stderr": 0.03768335959728743 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.7933884297520661, + "acc_stderr": 0.03695980128098824, + "acc_norm": 0.7933884297520661, + "acc_norm_stderr": 0.03695980128098824 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.7407407407407407, + "acc_stderr": 0.042365112580946315, + "acc_norm": 0.7407407407407407, + "acc_norm_stderr": 0.042365112580946315 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.7607361963190185, + "acc_stderr": 0.03351953879521271, + "acc_norm": 0.7607361963190185, + "acc_norm_stderr": 0.03351953879521271 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.4732142857142857, + "acc_stderr": 0.047389751192741546, + "acc_norm": 0.4732142857142857, + "acc_norm_stderr": 0.047389751192741546 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.7961165048543689, + "acc_stderr": 0.039891398595317706, + "acc_norm": 0.7961165048543689, + "acc_norm_stderr": 0.039891398595317706 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.8717948717948718, + "acc_stderr": 0.02190190511507333, + "acc_norm": 0.8717948717948718, + "acc_norm_stderr": 0.02190190511507333 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.73, + "acc_stderr": 0.0446196043338474, + "acc_norm": 0.73, + "acc_norm_stderr": 0.0446196043338474 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.8301404853128991, + "acc_stderr": 0.013428186370608311, + "acc_norm": 0.8301404853128991, + "acc_norm_stderr": 0.013428186370608311 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.7052023121387283, + "acc_stderr": 
0.024547617794803828, + "acc_norm": 0.7052023121387283, + "acc_norm_stderr": 0.024547617794803828 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.38324022346368714, + "acc_stderr": 0.016260159604429125, + "acc_norm": 0.38324022346368714, + "acc_norm_stderr": 0.016260159604429125 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.6895424836601307, + "acc_stderr": 0.026493033225145898, + "acc_norm": 0.6895424836601307, + "acc_norm_stderr": 0.026493033225145898 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.684887459807074, + "acc_stderr": 0.026385273703464496, + "acc_norm": 0.684887459807074, + "acc_norm_stderr": 0.026385273703464496 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.7098765432098766, + "acc_stderr": 0.025251173936495026, + "acc_norm": 0.7098765432098766, + "acc_norm_stderr": 0.025251173936495026 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.45390070921985815, + "acc_stderr": 0.029700453247291456, + "acc_norm": 0.45390070921985815, + "acc_norm_stderr": 0.029700453247291456 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.43415906127770537, + "acc_stderr": 0.012659033237067248, + "acc_norm": 0.43415906127770537, + "acc_norm_stderr": 0.012659033237067248 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.6911764705882353, + "acc_stderr": 0.028064998167040094, + "acc_norm": 0.6911764705882353, + "acc_norm_stderr": 0.028064998167040094 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.6584967320261438, + "acc_stderr": 0.019184639328092487, + "acc_norm": 0.6584967320261438, + "acc_norm_stderr": 0.019184639328092487 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.6636363636363637, + "acc_stderr": 0.04525393596302506, + "acc_norm": 0.6636363636363637, + "acc_norm_stderr": 0.04525393596302506 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.7061224489795919, + "acc_stderr": 0.029162738410249772, + "acc_norm": 0.7061224489795919, + "acc_norm_stderr": 0.029162738410249772 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.8258706467661692, + "acc_stderr": 0.026814951200421606, + "acc_norm": 0.8258706467661692, + "acc_norm_stderr": 0.026814951200421606 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.84, + "acc_stderr": 0.03684529491774709, + "acc_norm": 0.84, + "acc_norm_stderr": 0.03684529491774709 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.5301204819277109, + "acc_stderr": 0.03885425420866767, + "acc_norm": 0.5301204819277109, + "acc_norm_stderr": 0.03885425420866767 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.8187134502923976, + "acc_stderr": 0.029547741687640044, + "acc_norm": 0.8187134502923976, + "acc_norm_stderr": 0.029547741687640044 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.4883720930232558, + "mc1_stderr": 0.017498767175740088, + "mc2": 0.6548761858136044, + "mc2_stderr": 0.01508528563797577 + }, + "harness|winogrande|5": { + "acc": 0.797947908445146, + "acc_stderr": 0.011285013754047444 + }, + "harness|gsm8k|5": { + "acc": 0.6224412433661866, + "acc_stderr": 0.013353150666358546 + }, + "all": { + "acc": 0.6348474564429509, + "acc_stderr": 0.032547874322213524, + "acc_norm": 0.6360060797642175, + "acc_norm_stderr": 0.03320713700726099, + "mc1": 0.4883720930232558, + "mc1_stderr": 0.017498767175740088, + "mc2": 0.6548761858136044, + "mc2_stderr": 0.01508528563797577 + } + }, + "versions": { + "all": 0, + "harness|arc:challenge|25": 0, + "harness|gsm8k|5": 0, + 
"harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "harness|winogrande|5": 0 + }, + "config_tasks": { + "harness|arc:challenge": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + 
"harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|arc:challenge|25": { + "hashes": { + "hash_examples": "17b0cae357c0259e", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "9bcd0d1d37471713", + "hash_cont_tokens": 
"289aa98c400841d8" + }, + "truncated": 0, + "non_truncated": 1172, + "padded": 4670, + "non_padded": 17, + "effective_few_shots": 25.0, + "num_truncated_few_shots": 0 + }, + "harness|hellaswag|10": { + "hashes": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "80b8c6d79740318e", + "hash_cont_tokens": "ac460260c3e6efc9" + }, + "truncated": 0, + "non_truncated": 10042, + "padded": 40101, + "non_padded": 67, + "effective_few_shots": 10.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hashes": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "b813d36287c6556c", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-anatomy|5": { + "hashes": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "09dc2380497f7a47", + "hash_cont_tokens": "a52a4f60d98cbe5c" + }, + "truncated": 0, + "non_truncated": 135, + "padded": 540, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-astronomy|5": { + "hashes": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "68ca3220b0fdd1f3", + "hash_cont_tokens": "10f7d8eeba97841d" + }, + "truncated": 0, + "non_truncated": 152, + "padded": 608, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-business_ethics|5": { + "hashes": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "bd14ef1320de241e", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hashes": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "d96186ab98017c43", + "hash_cont_tokens": "edef9975ba9165b5" + }, + "truncated": 0, + "non_truncated": 265, + "padded": 1060, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_biology|5": { + "hashes": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "424136b34e95b200", + "hash_cont_tokens": "0aa103ec6602280b" + }, + "truncated": 0, + "non_truncated": 144, + "padded": 576, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_chemistry|5": { + "hashes": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "8dd8b80e336bbe54", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_computer_science|5": { + "hashes": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "145d4cef8ca2261d", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + 
"harness|hendrycksTest-college_mathematics|5": { + "hashes": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "561995d32d2b25c4", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_medicine|5": { + "hashes": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "6a258a9d4418599c", + "hash_cont_tokens": "1979021dbc698754" + }, + "truncated": 0, + "non_truncated": 173, + "padded": 692, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_physics|5": { + "hashes": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "fa5e0d5b5f97b66a", + "hash_cont_tokens": "7cf7fe2bab00acbd" + }, + "truncated": 0, + "non_truncated": 102, + "padded": 408, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-computer_security|5": { + "hashes": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "07d27397edfae492", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hashes": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "da5e6c3c8eb17da6", + "hash_cont_tokens": "903f64eed2b0d217" + }, + "truncated": 0, + "non_truncated": 235, + "padded": 940, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-econometrics|5": { + "hashes": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "f6ba8e358bdb523e", + "hash_cont_tokens": "721ae6c5302c4bf2" + }, + "truncated": 0, + "non_truncated": 114, + "padded": 456, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hashes": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "b2459da4c5ca8590", + "hash_cont_tokens": "15a738960ed3e587" + }, + "truncated": 0, + "non_truncated": 145, + "padded": 575, + "non_padded": 5, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hashes": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "0b969d9ad706a13a", + "hash_cont_tokens": "c96470462fc71683" + }, + "truncated": 0, + "non_truncated": 378, + "padded": 1512, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-formal_logic|5": { + "hashes": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "02bc3eb5f90da86e", + "hash_cont_tokens": "0e1ce025c9d6ee7e" + }, + "truncated": 0, + "non_truncated": 126, + "padded": 504, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-global_facts|5": { + "hashes": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + 
"hash_input_tokens": "3d5106918bcbeb43", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_biology|5": { + "hashes": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "7b089392db2dabbd", + "hash_cont_tokens": "e34d57f7d3c4ca16" + }, + "truncated": 0, + "non_truncated": 310, + "padded": 1240, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hashes": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "ba90b2ffed1c067d", + "hash_cont_tokens": "e8482d44df4b3740" + }, + "truncated": 0, + "non_truncated": 203, + "padded": 812, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hashes": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "60eeec309ef0717f", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hashes": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "5e5e8bf3808e0ead", + "hash_cont_tokens": "d63e679a49418339" + }, + "truncated": 0, + "non_truncated": 165, + "padded": 656, + "non_padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_geography|5": { + "hashes": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "4da9b741d4e7ea78", + "hash_cont_tokens": "d78483e286d06f1a" + }, + "truncated": 0, + "non_truncated": 198, + "padded": 792, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hashes": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "acb4bc872ac86ed7", + "hash_cont_tokens": "691cdff71ff5fe57" + }, + "truncated": 0, + "non_truncated": 193, + "padded": 772, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hashes": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "840fc6403eb69ab0", + "hash_cont_tokens": "d5ad4c5bdca967ad" + }, + "truncated": 0, + "non_truncated": 390, + "padded": 1560, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hashes": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "3629a7f2cd17faeb", + "hash_cont_tokens": "8f631ca5687dd0d4" + }, + "truncated": 0, + "non_truncated": 270, + "padded": 1080, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hashes": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "6846f684260e3997", + "hash_cont_tokens": 
"7321048a28451473" + }, + "truncated": 0, + "non_truncated": 238, + "padded": 952, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_physics|5": { + "hashes": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "85aee25d6bdad94a", + "hash_cont_tokens": "bb137581f269861c" + }, + "truncated": 0, + "non_truncated": 151, + "padded": 604, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hashes": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "290b66d6d666a35f", + "hash_cont_tokens": "b455cab2675bd863" + }, + "truncated": 0, + "non_truncated": 545, + "padded": 2180, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hashes": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "a77a7668b437bc82", + "hash_cont_tokens": "1b3196fec7e58037" + }, + "truncated": 0, + "non_truncated": 216, + "padded": 864, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hashes": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "63548c7fa9ba7a78", + "hash_cont_tokens": "a331dedc2aa01b3e" + }, + "truncated": 0, + "non_truncated": 204, + "padded": 816, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hashes": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "83c5da18bfa50812", + "hash_cont_tokens": "d0fbe030b8c8c2bf" + }, + "truncated": 0, + "non_truncated": 237, + "padded": 948, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_aging|5": { + "hashes": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "bebbd11f22006685", + "hash_cont_tokens": "1dd29c3755494850" + }, + "truncated": 0, + "non_truncated": 223, + "padded": 892, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_sexuality|5": { + "hashes": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "7b85ee9b8ee54f4f", + "hash_cont_tokens": "c85573f663c10691" + }, + "truncated": 0, + "non_truncated": 131, + "padded": 524, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-international_law|5": { + "hashes": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "7bfc55ab7065943e", + "hash_cont_tokens": "d263804ba918154f" + }, + "truncated": 0, + "non_truncated": 121, + "padded": 484, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-jurisprudence|5": { + "hashes": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "69573f1675e053c6", + "hash_cont_tokens": "581986691a84ece8" + }, + "truncated": 0, + "non_truncated": 108, + "padded": 432, + "non_padded": 0, + "effective_few_shots": 5.0, + 
"num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hashes": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "552324ef20094bdc", + "hash_cont_tokens": "55a858b28bbda458" + }, + "truncated": 0, + "non_truncated": 163, + "padded": 652, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-machine_learning|5": { + "hashes": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "96449357a7318905", + "hash_cont_tokens": "e99d3d3efd4ac7a3" + }, + "truncated": 0, + "non_truncated": 112, + "padded": 448, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-management|5": { + "hashes": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "3b849249168e3b88", + "hash_cont_tokens": "13d9dc56bca34726" + }, + "truncated": 0, + "non_truncated": 103, + "padded": 412, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-marketing|5": { + "hashes": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "af0e186f2756b70d", + "hash_cont_tokens": "2700ea26933916a2" + }, + "truncated": 0, + "non_truncated": 234, + "padded": 936, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-medical_genetics|5": { + "hashes": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "9f6a6de16509b6d9", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-miscellaneous|5": { + "hashes": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "9194406d589f7c10", + "hash_cont_tokens": "7bf4341c79587250" + }, + "truncated": 0, + "non_truncated": 783, + "padded": 3132, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_disputes|5": { + "hashes": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "769486efc74d9f8e", + "hash_cont_tokens": "38a48e9de6976f00" + }, + "truncated": 0, + "non_truncated": 346, + "padded": 1384, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hashes": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "a90fd4dd90959dad", + "hash_cont_tokens": "761c4dc187689d89" + }, + "truncated": 0, + "non_truncated": 895, + "padded": 3580, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-nutrition|5": { + "hashes": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "1a3b843e66efd29b", + "hash_cont_tokens": "65005bd7d6f6012a" + }, + "truncated": 0, + "non_truncated": 306, + "padded": 1224, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-philosophy|5": { + "hashes": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + 
"hash_input_tokens": "09820001a3d00013", + "hash_cont_tokens": "0b47934fb6314dec" + }, + "truncated": 0, + "non_truncated": 311, + "padded": 1244, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-prehistory|5": { + "hashes": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "7c4ec364ce2768c7", + "hash_cont_tokens": "3f20acd855ee0a29" + }, + "truncated": 0, + "non_truncated": 324, + "padded": 1296, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_accounting|5": { + "hashes": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "ced0534574d0ae3f", + "hash_cont_tokens": "8f122ba881355d4b" + }, + "truncated": 0, + "non_truncated": 282, + "padded": 1128, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_law|5": { + "hashes": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "bcbdbbde22ec73e3", + "hash_cont_tokens": "90d5df417c4d3fd3" + }, + "truncated": 0, + "non_truncated": 1534, + "padded": 6136, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_medicine|5": { + "hashes": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "c54d753563114d45", + "hash_cont_tokens": "4a2d2988884f7f70" + }, + "truncated": 0, + "non_truncated": 272, + "padded": 1088, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_psychology|5": { + "hashes": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "b75dc55c0e32fa52", + "hash_cont_tokens": "e0a952cb8a9c81de" + }, + "truncated": 0, + "non_truncated": 612, + "padded": 2448, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-public_relations|5": { + "hashes": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "5ccdc8ec8db99622", + "hash_cont_tokens": "1fa77a8dff3922b8" + }, + "truncated": 0, + "non_truncated": 110, + "padded": 440, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-security_studies|5": { + "hashes": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "ca8497342e5b1d57", + "hash_cont_tokens": "81fc9cb3cbdd52db" + }, + "truncated": 0, + "non_truncated": 245, + "padded": 980, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-sociology|5": { + "hashes": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "069c76424fbd3dab", + "hash_cont_tokens": "2a0493252ed2cf43" + }, + "truncated": 0, + "non_truncated": 201, + "padded": 804, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hashes": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "a7e393a626169576", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + 
"non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-virology|5": { + "hashes": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "bf99dc973e3a650d", + "hash_cont_tokens": "5ab892d003b00c98" + }, + "truncated": 0, + "non_truncated": 166, + "padded": 664, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-world_religions|5": { + "hashes": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "1761cfaf21797065", + "hash_cont_tokens": "15a5e5dbdfbb8568" + }, + "truncated": 0, + "non_truncated": 171, + "padded": 684, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|truthfulqa:mc|0": { + "hashes": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "298b43914bbdf4ca", + "hash_cont_tokens": "5a8d4bb398b1c3c0" + }, + "truncated": 0, + "non_truncated": 817, + "padded": 9996, + "non_padded": 0, + "effective_few_shots": 0.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "31aa3477d959f771", + "hash_cont_tokens": "618558fb93c0f288" + }, + "truncated": 0, + "non_truncated": 1267, + "padded": 2534, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "6af0ae8cfe684f50", + "hash_cont_tokens": "dfcb5d832d5a52f8" + }, + "truncated": 0, + "non_truncated": 1319, + "padded": 0, + "non_padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "3b7fa57a057f9415", + "hash_full_prompts": "63615fc50fc9417c", + "hash_input_tokens": "9c04e828ae29cacc", + "hash_cont_tokens": "8b5e72b150099064" + }, + "truncated": 0, + "non_truncated": 28659, + "padded": 113460, + "non_padded": 1412, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/openbmb/UltraLM-13b-v2.0/results_2023-10-09T08-34-12.309014.json b/eval-results/openbmb/UltraLM-13b-v2.0/results_2023-10-09T08-34-12.309014.json new file mode 100644 index 0000000000000000000000000000000000000000..01bca3672c8fe230b5acc7d1dc80102ab3b2f2d1 --- /dev/null +++ b/eval-results/openbmb/UltraLM-13b-v2.0/results_2023-10-09T08-34-12.309014.json @@ -0,0 +1,1367 @@ +{ + "config_general": { + "model_name": "openbmb/UltraLM-13b-v2.0", + "model_sha": "a452045c96ae62379a98ef0d85666616a66e78a6", + "model_size": "24.32 GB", + "model_dtype": "torch.float16", + "lighteval_sha": "0f318ecf002208468154899217b3ba7c6ae09374", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|arc:challenge|25": { + "acc": 0.5870307167235495, + "acc_stderr": 0.014388344935398326, + "acc_norm": 0.6262798634812287, + "acc_norm_stderr": 0.01413770860175909 + }, + "harness|hellaswag|10": { + "acc": 0.6172077275443139, + "acc_stderr": 0.00485074868785994, + "acc_norm": 0.8148775144393547, + "acc_norm_stderr": 0.0038760312505449856 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.27, + "acc_stderr": 0.044619604333847394, + "acc_norm": 0.27, + "acc_norm_stderr": 
0.044619604333847394 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.4740740740740741, + "acc_stderr": 0.04313531696750574, + "acc_norm": 0.4740740740740741, + "acc_norm_stderr": 0.04313531696750574 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.5657894736842105, + "acc_stderr": 0.040335656678483205, + "acc_norm": 0.5657894736842105, + "acc_norm_stderr": 0.040335656678483205 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.54, + "acc_stderr": 0.05009082659620332, + "acc_norm": 0.54, + "acc_norm_stderr": 0.05009082659620332 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.6113207547169811, + "acc_stderr": 0.030000485448675986, + "acc_norm": 0.6113207547169811, + "acc_norm_stderr": 0.030000485448675986 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.5763888888888888, + "acc_stderr": 0.04132125019723369, + "acc_norm": 0.5763888888888888, + "acc_norm_stderr": 0.04132125019723369 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.35, + "acc_stderr": 0.047937248544110196, + "acc_norm": 0.35, + "acc_norm_stderr": 0.047937248544110196 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.47, + "acc_stderr": 0.05016135580465919, + "acc_norm": 0.47, + "acc_norm_stderr": 0.05016135580465919 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.37, + "acc_stderr": 0.048523658709391, + "acc_norm": 0.37, + "acc_norm_stderr": 0.048523658709391 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.5491329479768786, + "acc_stderr": 0.03794012674697031, + "acc_norm": 0.5491329479768786, + "acc_norm_stderr": 0.03794012674697031 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.3235294117647059, + "acc_stderr": 0.04655010411319616, + "acc_norm": 0.3235294117647059, + "acc_norm_stderr": 0.04655010411319616 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.73, + "acc_stderr": 0.04461960433384739, + "acc_norm": 0.73, + "acc_norm_stderr": 0.04461960433384739 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.42127659574468085, + "acc_stderr": 0.03227834510146268, + "acc_norm": 0.42127659574468085, + "acc_norm_stderr": 0.03227834510146268 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.2543859649122807, + "acc_stderr": 0.04096985139843671, + "acc_norm": 0.2543859649122807, + "acc_norm_stderr": 0.04096985139843671 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.4827586206896552, + "acc_stderr": 0.04164188720169377, + "acc_norm": 0.4827586206896552, + "acc_norm_stderr": 0.04164188720169377 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.3306878306878307, + "acc_stderr": 0.02422996529842507, + "acc_norm": 0.3306878306878307, + "acc_norm_stderr": 0.02422996529842507 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.36507936507936506, + "acc_stderr": 0.04306241259127153, + "acc_norm": 0.36507936507936506, + "acc_norm_stderr": 0.04306241259127153 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.38, + "acc_stderr": 0.04878317312145632, + "acc_norm": 0.38, + "acc_norm_stderr": 0.04878317312145632 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.6741935483870968, + "acc_stderr": 0.0266620105785671, + "acc_norm": 0.6741935483870968, + "acc_norm_stderr": 0.0266620105785671 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.4187192118226601, + "acc_stderr": 0.03471192860518468, + "acc_norm": 0.4187192118226601, + "acc_norm_stderr": 0.03471192860518468 + }, 
+ "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.6, + "acc_stderr": 0.049236596391733084, + "acc_norm": 0.6, + "acc_norm_stderr": 0.049236596391733084 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.6545454545454545, + "acc_stderr": 0.03713158067481913, + "acc_norm": 0.6545454545454545, + "acc_norm_stderr": 0.03713158067481913 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.702020202020202, + "acc_stderr": 0.03258630383836556, + "acc_norm": 0.702020202020202, + "acc_norm_stderr": 0.03258630383836556 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.7772020725388601, + "acc_stderr": 0.030031147977641538, + "acc_norm": 0.7772020725388601, + "acc_norm_stderr": 0.030031147977641538 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.5051282051282051, + "acc_stderr": 0.02534967290683866, + "acc_norm": 0.5051282051282051, + "acc_norm_stderr": 0.02534967290683866 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.3037037037037037, + "acc_stderr": 0.028037929969114986, + "acc_norm": 0.3037037037037037, + "acc_norm_stderr": 0.028037929969114986 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.542016806722689, + "acc_stderr": 0.03236361111951941, + "acc_norm": 0.542016806722689, + "acc_norm_stderr": 0.03236361111951941 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.2913907284768212, + "acc_stderr": 0.03710185726119995, + "acc_norm": 0.2913907284768212, + "acc_norm_stderr": 0.03710185726119995 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.7522935779816514, + "acc_stderr": 0.01850814360254783, + "acc_norm": 0.7522935779816514, + "acc_norm_stderr": 0.01850814360254783 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.46296296296296297, + "acc_stderr": 0.03400603625538271, + "acc_norm": 0.46296296296296297, + "acc_norm_stderr": 0.03400603625538271 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.7696078431372549, + "acc_stderr": 0.029554292605695063, + "acc_norm": 0.7696078431372549, + "acc_norm_stderr": 0.029554292605695063 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.7341772151898734, + "acc_stderr": 0.02875679962965834, + "acc_norm": 0.7341772151898734, + "acc_norm_stderr": 0.02875679962965834 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.6457399103139013, + "acc_stderr": 0.03210062154134986, + "acc_norm": 0.6457399103139013, + "acc_norm_stderr": 0.03210062154134986 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.6793893129770993, + "acc_stderr": 0.04093329229834278, + "acc_norm": 0.6793893129770993, + "acc_norm_stderr": 0.04093329229834278 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.7768595041322314, + "acc_stderr": 0.03800754475228732, + "acc_norm": 0.7768595041322314, + "acc_norm_stderr": 0.03800754475228732 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.7407407407407407, + "acc_stderr": 0.04236511258094632, + "acc_norm": 0.7407407407407407, + "acc_norm_stderr": 0.04236511258094632 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.6932515337423313, + "acc_stderr": 0.03623089915724147, + "acc_norm": 0.6932515337423313, + "acc_norm_stderr": 0.03623089915724147 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.3482142857142857, + "acc_stderr": 0.04521829902833585, + "acc_norm": 0.3482142857142857, + "acc_norm_stderr": 
0.04521829902833585 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.6990291262135923, + "acc_stderr": 0.04541609446503948, + "acc_norm": 0.6990291262135923, + "acc_norm_stderr": 0.04541609446503948 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.811965811965812, + "acc_stderr": 0.025598193686652244, + "acc_norm": 0.811965811965812, + "acc_norm_stderr": 0.025598193686652244 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.55, + "acc_stderr": 0.049999999999999996, + "acc_norm": 0.55, + "acc_norm_stderr": 0.049999999999999996 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.7484035759897829, + "acc_stderr": 0.015517322365529641, + "acc_norm": 0.7484035759897829, + "acc_norm_stderr": 0.015517322365529641 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.638728323699422, + "acc_stderr": 0.0258622018522779, + "acc_norm": 0.638728323699422, + "acc_norm_stderr": 0.0258622018522779 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.4011173184357542, + "acc_stderr": 0.01639222189940707, + "acc_norm": 0.4011173184357542, + "acc_norm_stderr": 0.01639222189940707 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.6568627450980392, + "acc_stderr": 0.02718449890994162, + "acc_norm": 0.6568627450980392, + "acc_norm_stderr": 0.02718449890994162 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.6495176848874598, + "acc_stderr": 0.02709865262130175, + "acc_norm": 0.6495176848874598, + "acc_norm_stderr": 0.02709865262130175 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.6481481481481481, + "acc_stderr": 0.026571483480719964, + "acc_norm": 0.6481481481481481, + "acc_norm_stderr": 0.026571483480719964 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.40425531914893614, + "acc_stderr": 0.029275532159704725, + "acc_norm": 0.40425531914893614, + "acc_norm_stderr": 0.029275532159704725 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.42242503259452413, + "acc_stderr": 0.012615600475734921, + "acc_norm": 0.42242503259452413, + "acc_norm_stderr": 0.012615600475734921 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.5735294117647058, + "acc_stderr": 0.030042615832714857, + "acc_norm": 0.5735294117647058, + "acc_norm_stderr": 0.030042615832714857 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.5866013071895425, + "acc_stderr": 0.01992211568278669, + "acc_norm": 0.5866013071895425, + "acc_norm_stderr": 0.01992211568278669 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.6545454545454545, + "acc_stderr": 0.04554619617541054, + "acc_norm": 0.6545454545454545, + "acc_norm_stderr": 0.04554619617541054 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.6448979591836734, + "acc_stderr": 0.030635655150387638, + "acc_norm": 0.6448979591836734, + "acc_norm_stderr": 0.030635655150387638 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.7562189054726368, + "acc_stderr": 0.030360490154014645, + "acc_norm": 0.7562189054726368, + "acc_norm_stderr": 0.030360490154014645 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.82, + "acc_stderr": 0.038612291966536934, + "acc_norm": 0.82, + "acc_norm_stderr": 0.038612291966536934 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.4759036144578313, + "acc_stderr": 0.03887971849597264, + "acc_norm": 0.4759036144578313, + "acc_norm_stderr": 0.03887971849597264 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.7368421052631579, + "acc_stderr": 
0.03377310252209205, + "acc_norm": 0.7368421052631579, + "acc_norm_stderr": 0.03377310252209205 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.3598531211750306, + "mc1_stderr": 0.016801860466677154, + "mc2": 0.4948303207858797, + "mc2_stderr": 0.015361047603025122 + }, + "all": { + "acc": 0.5630476446912529, + "acc_stderr": 0.034298942974256355, + "acc_norm": 0.5670632198379092, + "acc_norm_stderr": 0.03427817426627408, + "mc1": 0.3598531211750306, + "mc1_stderr": 0.016801860466677154, + "mc2": 0.4948303207858797, + "mc2_stderr": 0.015361047603025122 + } + }, + "versions": { + "harness|arc:challenge|25": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "all": 0 + }, 
+ "config_tasks": { + "harness|arc:challenge": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + 
"harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task" + }, + "summary_tasks": { + "harness|arc:challenge|25": { + "hashes": { + "hash_examples": "17b0cae357c0259e", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "3722289b79076c44", + "hash_cont_tokens": "e8abf848493b50f7" + }, + "truncated": 0, + "non-truncated": 4687, + "padded": 4687, + "non-padded": 0, + "effective_few_shots": 25.0, + "num_truncated_few_shots": 0 + }, + "harness|hellaswag|10": { + "hashes": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "ececd684171f1ef2", + "hash_cont_tokens": "9fe0a5c42e1532db" + }, + "truncated": 0, + "non-truncated": 40168, + "padded": 40113, + "non-padded": 55, + "effective_few_shots": 10.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hashes": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "c54ff61ad0273dd7", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-anatomy|5": { + "hashes": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "be31a1e22aef5f90", + "hash_cont_tokens": "f11971a765cb609f" + }, + "truncated": 0, + "non-truncated": 540, + "padded": 540, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-astronomy|5": { + "hashes": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "277a7b1fad566940", + "hash_cont_tokens": "440a970fadecdc7b" + }, + "truncated": 0, + "non-truncated": 608, + "padded": 608, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-business_ethics|5": { + "hashes": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "ba552605bc116de5", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hashes": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "428c7563d0b98ab9", + "hash_cont_tokens": "7ecd60c25b9bfe5b" + }, + "truncated": 0, + "non-truncated": 1060, + "padded": 1060, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_biology|5": { + "hashes": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "da036601573942e2", + "hash_cont_tokens": "875cde3af7a0ee14" + }, + "truncated": 0, + "non-truncated": 576, + "padded": 576, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_chemistry|5": { + "hashes": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "94e0196d6aded13d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 
400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_computer_science|5": { + "hashes": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "6e4d0f4a8d36690b", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_mathematics|5": { + "hashes": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "614054d17109a25d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_medicine|5": { + "hashes": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "081bb2b524defd1c", + "hash_cont_tokens": "702fb6d82ff0d6ac" + }, + "truncated": 0, + "non-truncated": 692, + "padded": 692, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_physics|5": { + "hashes": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "5421d9a1af86cbd4", + "hash_cont_tokens": "f7b8097afc16a47c" + }, + "truncated": 0, + "non-truncated": 408, + "padded": 408, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-computer_security|5": { + "hashes": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "5e6b70ecb333cf18", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hashes": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "c2ef11a87264ceed", + "hash_cont_tokens": "aa0e8bc655f2f641" + }, + "truncated": 0, + "non-truncated": 940, + "padded": 940, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-econometrics|5": { + "hashes": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "ecaccd912a4c3978", + "hash_cont_tokens": "b1cc6e7e9fcd3827" + }, + "truncated": 0, + "non-truncated": 456, + "padded": 456, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hashes": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "1590c84291399be8", + "hash_cont_tokens": "2425a3f084a591ef" + }, + "truncated": 0, + "non-truncated": 580, + "padded": 580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hashes": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "3269597f715b0da1", + "hash_cont_tokens": "bd87bf0c060fd925" + }, + "truncated": 0, + "non-truncated": 1512, + "padded": 1512, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + 
"harness|hendrycksTest-formal_logic|5": { + "hashes": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "a2800d20f3ab8d7c", + "hash_cont_tokens": "eb8932890e0605db" + }, + "truncated": 0, + "non-truncated": 504, + "padded": 504, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-global_facts|5": { + "hashes": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "94ed44b3772505ad", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_biology|5": { + "hashes": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "24423acb928db768", + "hash_cont_tokens": "1ddcb86d28cde266" + }, + "truncated": 0, + "non-truncated": 1240, + "padded": 1240, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hashes": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "831ff35c474e5cef", + "hash_cont_tokens": "176c8dcff38c5f8f" + }, + "truncated": 0, + "non-truncated": 812, + "padded": 812, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hashes": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "a20a96b44dcc5b30", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hashes": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "5002f4ac8b1562ca", + "hash_cont_tokens": "674fc454bdc5ac93" + }, + "truncated": 0, + "non-truncated": 660, + "padded": 656, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_geography|5": { + "hashes": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "7c5547c7da5bc793", + "hash_cont_tokens": "03a5012b916274ea" + }, + "truncated": 0, + "non-truncated": 792, + "padded": 792, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hashes": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "f62991cb6a496b05", + "hash_cont_tokens": "873d2aab226ba1d8" + }, + "truncated": 0, + "non-truncated": 772, + "padded": 772, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hashes": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "4cef2aff6e3d59ed", + "hash_cont_tokens": "c583432ad27fcfe0" + }, + "truncated": 0, + "non-truncated": 1560, + "padded": 1560, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hashes": { + "hash_examples": 
"1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "6e2577ea4082ed2b", + "hash_cont_tokens": "d7907b61bcb8c123" + }, + "truncated": 0, + "non-truncated": 1080, + "padded": 1080, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hashes": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "c5fc9aeb1079c8e4", + "hash_cont_tokens": "f47f041de50333b9" + }, + "truncated": 0, + "non-truncated": 952, + "padded": 952, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_physics|5": { + "hashes": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "555fc385cffa84ca", + "hash_cont_tokens": "0d56317b3e5eedb5" + }, + "truncated": 0, + "non-truncated": 604, + "padded": 604, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hashes": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "febd23cbf9973b7f", + "hash_cont_tokens": "09ba1243e7390c0f" + }, + "truncated": 0, + "non-truncated": 2180, + "padded": 2180, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hashes": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "400e55b56ee6fbd7", + "hash_cont_tokens": "9cc29889c3d3f77d" + }, + "truncated": 0, + "non-truncated": 864, + "padded": 864, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hashes": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "c639cce12a46ebad", + "hash_cont_tokens": "cdd0b3dc06d933e5" + }, + "truncated": 0, + "non-truncated": 816, + "padded": 816, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hashes": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "b9762065cce6f3a6", + "hash_cont_tokens": "e02816433ff28daf" + }, + "truncated": 0, + "non-truncated": 948, + "padded": 948, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_aging|5": { + "hashes": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "541a75f071dcf579", + "hash_cont_tokens": "142a4a8a1138a214" + }, + "truncated": 0, + "non-truncated": 892, + "padded": 892, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_sexuality|5": { + "hashes": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "04269e5c5a257dd9", + "hash_cont_tokens": "bc54813e809b796d" + }, + "truncated": 0, + "non-truncated": 524, + "padded": 524, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-international_law|5": { + "hashes": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "d93ba9d9d38e4397", + 
"hash_cont_tokens": "8ea8c5ff76a15bca" + }, + "truncated": 0, + "non-truncated": 484, + "padded": 484, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-jurisprudence|5": { + "hashes": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "9eeaccd2698b4f5a", + "hash_cont_tokens": "e3a8cd951b6e3469" + }, + "truncated": 0, + "non-truncated": 432, + "padded": 432, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hashes": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "b4f08f544f2b7576", + "hash_cont_tokens": "3e9e0bdc248fd88a" + }, + "truncated": 0, + "non-truncated": 652, + "padded": 648, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-machine_learning|5": { + "hashes": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "900c2a51f1174b9f", + "hash_cont_tokens": "55b12fb138c6a064" + }, + "truncated": 0, + "non-truncated": 448, + "padded": 448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-management|5": { + "hashes": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "6b36efb4689c6eca", + "hash_cont_tokens": "a01d6d39a83c4597" + }, + "truncated": 0, + "non-truncated": 412, + "padded": 412, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-marketing|5": { + "hashes": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "2aaac78a0cfed47a", + "hash_cont_tokens": "6aeaed4d823c98aa" + }, + "truncated": 0, + "non-truncated": 936, + "padded": 936, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-medical_genetics|5": { + "hashes": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "886ca823b41c094a", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-miscellaneous|5": { + "hashes": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "72fd71de7675e7d0", + "hash_cont_tokens": "9b0ab02a64603081" + }, + "truncated": 0, + "non-truncated": 3132, + "padded": 3132, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_disputes|5": { + "hashes": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "f3ca0dd8e7a1eb09", + "hash_cont_tokens": "3b8bbe9108e55ce9" + }, + "truncated": 0, + "non-truncated": 1384, + "padded": 1354, + "non-padded": 30, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hashes": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "3e793631e951f23c", + "hash_cont_tokens": "3e9bfc0362e97330" + }, + "truncated": 0, + "non-truncated": 3580, + "padded": 3580, + "non-padded": 0, + "effective_few_shots": 5.0, + 
"num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-nutrition|5": { + "hashes": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "59753c2144ea93af", + "hash_cont_tokens": "23b2dc6ee2da4cfc" + }, + "truncated": 0, + "non-truncated": 1224, + "padded": 1224, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-philosophy|5": { + "hashes": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "bd8d3dbed15a8c34", + "hash_cont_tokens": "9f6ff69d23a48783" + }, + "truncated": 0, + "non-truncated": 1244, + "padded": 1244, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-prehistory|5": { + "hashes": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "3573cd87facbb7c5", + "hash_cont_tokens": "d6458d743d875837" + }, + "truncated": 0, + "non-truncated": 1296, + "padded": 1296, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_accounting|5": { + "hashes": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "17e721bc1a7cbb47", + "hash_cont_tokens": "922a195f53a35662" + }, + "truncated": 0, + "non-truncated": 1128, + "padded": 1128, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_law|5": { + "hashes": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "c9f7583fff66d361", + "hash_cont_tokens": "2e590029ef41fbcd" + }, + "truncated": 0, + "non-truncated": 6136, + "padded": 6136, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_medicine|5": { + "hashes": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "40a933f829116f8d", + "hash_cont_tokens": "7cfee54dbddd5a98" + }, + "truncated": 0, + "non-truncated": 1088, + "padded": 1088, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_psychology|5": { + "hashes": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "0dfb73a8eb3f692c", + "hash_cont_tokens": "a86677b2a45c20e1" + }, + "truncated": 0, + "non-truncated": 2448, + "padded": 2448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-public_relations|5": { + "hashes": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "1710c6ba4c9f3cbd", + "hash_cont_tokens": "0d756ccaae031757" + }, + "truncated": 0, + "non-truncated": 440, + "padded": 440, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-security_studies|5": { + "hashes": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "32a03f1f22a6e103", + "hash_cont_tokens": "b2229bc2cfbf594b" + }, + "truncated": 0, + "non-truncated": 980, + "padded": 980, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-sociology|5": { + "hashes": { + "hash_examples": "e7959df87dea8672", + 
"hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "828999f7624cbe7e", + "hash_cont_tokens": "c3a3bdfd177eed5b" + }, + "truncated": 0, + "non-truncated": 804, + "padded": 804, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hashes": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "42054621e718dbee", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-virology|5": { + "hashes": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "6c4f0aa4dc859c04", + "hash_cont_tokens": "af8b3658088cb37f" + }, + "truncated": 0, + "non-truncated": 664, + "padded": 664, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-world_religions|5": { + "hashes": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "6c75d44e092ff24f", + "hash_cont_tokens": "060118bef6de4e0a" + }, + "truncated": 0, + "non-truncated": 684, + "padded": 684, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|truthfulqa:mc|0": { + "hashes": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "2738d7ed7075faa7", + "hash_cont_tokens": "f5da56a132aab151" + }, + "truncated": 0, + "non-truncated": 9996, + "padded": 9996, + "non-padded": 0, + "effective_few_shots": 0.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "d84d18e9a963753d", + "hash_full_prompts": "12b540783521a8e6", + "hash_input_tokens": "5c73a7dce6ccf737", + "hash_cont_tokens": "71d56183130fecbd" + }, + "total_evaluation_time_secondes": "6353.044916629791", + "truncated": 0, + "non-truncated": 111019, + "padded": 110926, + "non-padded": 93, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/openbmb/UltraLM-13b-v2.0/results_2023-10-25T05-11-16.252341.json b/eval-results/openbmb/UltraLM-13b-v2.0/results_2023-10-25T05-11-16.252341.json new file mode 100644 index 0000000000000000000000000000000000000000..4d8da52c8b9db36be474d907b2b0fe5e36520793 --- /dev/null +++ b/eval-results/openbmb/UltraLM-13b-v2.0/results_2023-10-25T05-11-16.252341.json @@ -0,0 +1,107 @@ +{ + "config_general": { + "model_name": "openbmb/UltraLM-13b-v2.0", + "model_sha": "a452045c96ae62379a98ef0d85666616a66e78a6", + "model_size": "24.32 GB", + "model_dtype": "torch.float16", + "lighteval_sha": "0f318ecf002208468154899217b3ba7c6ae09374", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|drop|3": { + "em": 0.24842701342281878, + "em_stderr": 0.004425115813837483, + "f1": 0.3269431627516796, + "f1_stderr": 0.004386855622561775 + }, + "harness|gsm8k|5": { + "acc": 0.10993176648976498, + "acc_stderr": 0.008616195587865406 + }, + "harness|winogrande|5": { + "acc": 0.7647987371744278, + "acc_stderr": 0.011920008163650886 + }, + "all": { + "em": 0.24842701342281878, + "em_stderr": 0.004425115813837483, + "f1": 0.3269431627516796, + "f1_stderr": 0.004386855622561775, + "acc": 0.4373652518320964, + "acc_stderr": 0.010268101875758145 + } + }, + "versions": { 
+ "harness|drop|3": 1, + "harness|gsm8k|5": 0, + "harness|winogrande|5": 0, + "all": 0 + }, + "config_tasks": { + "harness|drop": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|drop|3": { + "hashes": { + "hash_examples": "1d27416e8324e9a3", + "hash_full_prompts": "a5513ff9a741b385", + "hash_input_tokens": "42076f0efbb50aa6", + "hash_cont_tokens": "b3a0b209a0f640bd" + }, + "truncated": 3, + "non-truncated": 9533, + "padded": 0, + "non-padded": 9536, + "effective_few_shots": 3.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "bda342e47b5099b2", + "hash_cont_tokens": "eecbb1c897294f8f" + }, + "truncated": 0, + "non-truncated": 1319, + "padded": 0, + "non-padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "c0bedf98cb040854", + "hash_cont_tokens": "f08975ad6f2d5864" + }, + "truncated": 0, + "non-truncated": 2534, + "padded": 2432, + "non-padded": 102, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "9b4d8993161e637d", + "hash_full_prompts": "08215e527b7e60a5", + "hash_input_tokens": "a12f3e3c934bd78b", + "hash_cont_tokens": "75773ec4703926fd" + }, + "total_evaluation_time_secondes": "12030.502338886261", + "truncated": 3, + "non-truncated": 13386, + "padded": 2432, + "non-padded": 10957, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/openbmb/UltraLM-13b-v2.0/results_2023-12-02T13-14-33.191759.json b/eval-results/openbmb/UltraLM-13b-v2.0/results_2023-12-02T13-14-33.191759.json new file mode 100644 index 0000000000000000000000000000000000000000..6cdf90a3582df7be6c987d10cc87b82bb6a9e8b3 --- /dev/null +++ b/eval-results/openbmb/UltraLM-13b-v2.0/results_2023-12-02T13-14-33.191759.json @@ -0,0 +1,63 @@ +{ + "config_general": { + "lighteval_sha": "b35d4d84573be82d91c07ea46260f262f72cf69d", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "", + "start_time": 1384121.516856826, + "end_time": 1387775.067992094, + "total_evaluation_time_secondes": "3653.5511352680624", + "model_name": "openbmb/UltraLM-13b-v2.0", + "model_sha": "a452045c96ae62379a98ef0d85666616a66e78a6", + "model_dtype": "torch.float16", + "model_size": "24.32 GB" + }, + "results": { + "harness|gsm8k|5": { + "acc": 0.2608036391205459, + "acc_stderr": 0.012094252417332734 + }, + "all": { + "acc": 0.2608036391205459, + "acc_stderr": 0.012094252417332734 + } + }, + "versions": { + "all": 0, + "harness|gsm8k|5": 0 + }, + "config_tasks": { + "harness|gsm8k": "LM Harness task" + }, + "summary_tasks": { + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "bda342e47b5099b2", + "hash_cont_tokens": "eecbb1c897294f8f" + }, + "truncated": 0, + "non_truncated": 1319, + "padded": 0, + "non_padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "18b756b7813d1bdf", + "hash_full_prompts": "deb3b1dff10b95aa", + "hash_input_tokens": "42036645de5ac59d", + "hash_cont_tokens": "b52c05c6d14d9ee1" + }, + 
"truncated": 0, + "non_truncated": 1319, + "padded": 0, + "non_padded": 1319, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/openbmb/UltraLM-13b/results_2023-10-04T00-32-52.750601.json b/eval-results/openbmb/UltraLM-13b/results_2023-10-04T00-32-52.750601.json new file mode 100644 index 0000000000000000000000000000000000000000..5af503a34580b6e53529a2811e63b1aa1dc5548e --- /dev/null +++ b/eval-results/openbmb/UltraLM-13b/results_2023-10-04T00-32-52.750601.json @@ -0,0 +1,1367 @@ +{ + "config_general": { + "model_name": "openbmb/UltraLM-13b", + "model_sha": "2c732c2899fc329036d97e5c6f0a61eaff19d97d", + "model_size": "24.28 GB", + "model_dtype": "torch.float16", + "lighteval_sha": "0f318ecf002208468154899217b3ba7c6ae09374", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|arc:challenge|25": { + "acc": 0.23976109215017063, + "acc_stderr": 0.012476304127453947, + "acc_norm": 0.29436860068259385, + "acc_norm_stderr": 0.013318528460539426 + }, + "harness|hellaswag|10": { + "acc": 0.2566221868153754, + "acc_stderr": 0.004358764596401033, + "acc_norm": 0.2599083847839076, + "acc_norm_stderr": 0.004376877619234126 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.22, + "acc_stderr": 0.04163331998932268, + "acc_norm": 0.22, + "acc_norm_stderr": 0.04163331998932268 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.18518518518518517, + "acc_stderr": 0.03355677216313142, + "acc_norm": 0.18518518518518517, + "acc_norm_stderr": 0.03355677216313142 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.17763157894736842, + "acc_stderr": 0.031103182383123398, + "acc_norm": 0.17763157894736842, + "acc_norm_stderr": 0.031103182383123398 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.3, + "acc_stderr": 0.046056618647183814, + "acc_norm": 0.3, + "acc_norm_stderr": 0.046056618647183814 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.21509433962264152, + "acc_stderr": 0.02528839450289137, + "acc_norm": 0.21509433962264152, + "acc_norm_stderr": 0.02528839450289137 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.2569444444444444, + "acc_stderr": 0.03653946969442099, + "acc_norm": 0.2569444444444444, + "acc_norm_stderr": 0.03653946969442099 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.2, + "acc_stderr": 0.04020151261036845, + "acc_norm": 0.2, + "acc_norm_stderr": 0.04020151261036845 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.26, + "acc_stderr": 0.0440844002276808, + "acc_norm": 0.26, + "acc_norm_stderr": 0.0440844002276808 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.21, + "acc_stderr": 0.040936018074033256, + "acc_norm": 0.21, + "acc_norm_stderr": 0.040936018074033256 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.20809248554913296, + "acc_stderr": 0.030952890217749874, + "acc_norm": 0.20809248554913296, + "acc_norm_stderr": 0.030952890217749874 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.21568627450980393, + "acc_stderr": 0.04092563958237654, + "acc_norm": 0.21568627450980393, + "acc_norm_stderr": 0.04092563958237654 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.28, + "acc_stderr": 0.045126085985421276, + "acc_norm": 0.28, + "acc_norm_stderr": 0.045126085985421276 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.26382978723404255, + "acc_stderr": 
0.028809989854102973, + "acc_norm": 0.26382978723404255, + "acc_norm_stderr": 0.028809989854102973 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.23684210526315788, + "acc_stderr": 0.039994238792813365, + "acc_norm": 0.23684210526315788, + "acc_norm_stderr": 0.039994238792813365 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.2413793103448276, + "acc_stderr": 0.03565998174135302, + "acc_norm": 0.2413793103448276, + "acc_norm_stderr": 0.03565998174135302 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.20899470899470898, + "acc_stderr": 0.02094048156533486, + "acc_norm": 0.20899470899470898, + "acc_norm_stderr": 0.02094048156533486 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.2857142857142857, + "acc_stderr": 0.04040610178208841, + "acc_norm": 0.2857142857142857, + "acc_norm_stderr": 0.04040610178208841 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.18, + "acc_stderr": 0.038612291966536934, + "acc_norm": 0.18, + "acc_norm_stderr": 0.038612291966536934 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.1774193548387097, + "acc_stderr": 0.02173254068932927, + "acc_norm": 0.1774193548387097, + "acc_norm_stderr": 0.02173254068932927 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.15270935960591134, + "acc_stderr": 0.02530890453938063, + "acc_norm": 0.15270935960591134, + "acc_norm_stderr": 0.02530890453938063 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.25, + "acc_stderr": 0.04351941398892446, + "acc_norm": 0.25, + "acc_norm_stderr": 0.04351941398892446 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.21818181818181817, + "acc_stderr": 0.03225078108306289, + "acc_norm": 0.21818181818181817, + "acc_norm_stderr": 0.03225078108306289 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.17676767676767677, + "acc_stderr": 0.027178752639044915, + "acc_norm": 0.17676767676767677, + "acc_norm_stderr": 0.027178752639044915 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.19689119170984457, + "acc_stderr": 0.028697873971860664, + "acc_norm": 0.19689119170984457, + "acc_norm_stderr": 0.028697873971860664 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.20256410256410257, + "acc_stderr": 0.020377660970371372, + "acc_norm": 0.20256410256410257, + "acc_norm_stderr": 0.020377660970371372 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.2111111111111111, + "acc_stderr": 0.024882116857655075, + "acc_norm": 0.2111111111111111, + "acc_norm_stderr": 0.024882116857655075 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.21008403361344538, + "acc_stderr": 0.026461398717471874, + "acc_norm": 0.21008403361344538, + "acc_norm_stderr": 0.026461398717471874 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.1986754966887417, + "acc_stderr": 0.03257847384436776, + "acc_norm": 0.1986754966887417, + "acc_norm_stderr": 0.03257847384436776 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.1926605504587156, + "acc_stderr": 0.016909276884936094, + "acc_norm": 0.1926605504587156, + "acc_norm_stderr": 0.016909276884936094 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.1527777777777778, + "acc_stderr": 0.024536326026134224, + "acc_norm": 0.1527777777777778, + "acc_norm_stderr": 0.024536326026134224 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.25, + 
"acc_stderr": 0.03039153369274154, + "acc_norm": 0.25, + "acc_norm_stderr": 0.03039153369274154 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.270042194092827, + "acc_stderr": 0.028900721906293426, + "acc_norm": 0.270042194092827, + "acc_norm_stderr": 0.028900721906293426 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.31390134529147984, + "acc_stderr": 0.031146796482972465, + "acc_norm": 0.31390134529147984, + "acc_norm_stderr": 0.031146796482972465 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.2595419847328244, + "acc_stderr": 0.03844876139785271, + "acc_norm": 0.2595419847328244, + "acc_norm_stderr": 0.03844876139785271 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.2396694214876033, + "acc_stderr": 0.03896878985070417, + "acc_norm": 0.2396694214876033, + "acc_norm_stderr": 0.03896878985070417 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.25925925925925924, + "acc_stderr": 0.042365112580946336, + "acc_norm": 0.25925925925925924, + "acc_norm_stderr": 0.042365112580946336 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.22085889570552147, + "acc_stderr": 0.032591773927421776, + "acc_norm": 0.22085889570552147, + "acc_norm_stderr": 0.032591773927421776 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.3125, + "acc_stderr": 0.043994650575715215, + "acc_norm": 0.3125, + "acc_norm_stderr": 0.043994650575715215 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.17475728155339806, + "acc_stderr": 0.037601780060266224, + "acc_norm": 0.17475728155339806, + "acc_norm_stderr": 0.037601780060266224 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.2905982905982906, + "acc_stderr": 0.02974504857267404, + "acc_norm": 0.2905982905982906, + "acc_norm_stderr": 0.02974504857267404 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.3, + "acc_stderr": 0.046056618647183814, + "acc_norm": 0.3, + "acc_norm_stderr": 0.046056618647183814 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.23754789272030652, + "acc_stderr": 0.015218733046150193, + "acc_norm": 0.23754789272030652, + "acc_norm_stderr": 0.015218733046150193 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.24855491329479767, + "acc_stderr": 0.023267528432100174, + "acc_norm": 0.24855491329479767, + "acc_norm_stderr": 0.023267528432100174 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.23798882681564246, + "acc_stderr": 0.014242630070574915, + "acc_norm": 0.23798882681564246, + "acc_norm_stderr": 0.014242630070574915 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.22549019607843138, + "acc_stderr": 0.023929155517351284, + "acc_norm": 0.22549019607843138, + "acc_norm_stderr": 0.023929155517351284 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.1864951768488746, + "acc_stderr": 0.02212243977248077, + "acc_norm": 0.1864951768488746, + "acc_norm_stderr": 0.02212243977248077 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.21604938271604937, + "acc_stderr": 0.022899162918445806, + "acc_norm": 0.21604938271604937, + "acc_norm_stderr": 0.022899162918445806 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.23404255319148937, + "acc_stderr": 0.025257861359432417, + "acc_norm": 0.23404255319148937, + "acc_norm_stderr": 0.025257861359432417 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.2457627118644068, + "acc_stderr": 0.010996156635142692, + "acc_norm": 0.2457627118644068, + "acc_norm_stderr": 0.010996156635142692 + 
}, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.18382352941176472, + "acc_stderr": 0.023529242185193106, + "acc_norm": 0.18382352941176472, + "acc_norm_stderr": 0.023529242185193106 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.25, + "acc_stderr": 0.01751781884501444, + "acc_norm": 0.25, + "acc_norm_stderr": 0.01751781884501444 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.21818181818181817, + "acc_stderr": 0.03955932861795833, + "acc_norm": 0.21818181818181817, + "acc_norm_stderr": 0.03955932861795833 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.18775510204081633, + "acc_stderr": 0.02500025603954621, + "acc_norm": 0.18775510204081633, + "acc_norm_stderr": 0.02500025603954621 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.24378109452736318, + "acc_stderr": 0.03036049015401465, + "acc_norm": 0.24378109452736318, + "acc_norm_stderr": 0.03036049015401465 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.28, + "acc_stderr": 0.04512608598542128, + "acc_norm": 0.28, + "acc_norm_stderr": 0.04512608598542128 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.28313253012048195, + "acc_stderr": 0.03507295431370518, + "acc_norm": 0.28313253012048195, + "acc_norm_stderr": 0.03507295431370518 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.3216374269005848, + "acc_stderr": 0.03582529442573122, + "acc_norm": 0.3216374269005848, + "acc_norm_stderr": 0.03582529442573122 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.2423500611995104, + "mc1_stderr": 0.015000674373570345, + "mc2": 0.4861387255603705, + "mc2_stderr": 0.01574665894684377 + }, + "all": { + "acc": 0.2317456285682548, + "acc_stderr": 0.03071580855473494, + "acc_norm": 0.23272687783098286, + "acc_norm_stderr": 0.030730390543818306, + "mc1": 0.2423500611995104, + "mc1_stderr": 0.015000674373570345, + "mc2": 0.4861387255603705, + "mc2_stderr": 0.01574665894684377 + } + }, + "versions": { + "harness|arc:challenge|25": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + 
"harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "all": 0 + }, + "config_tasks": { + "harness|arc:challenge": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + 
"harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task" + }, + "summary_tasks": { + "harness|arc:challenge|25": { + "hashes": { + "hash_examples": "17b0cae357c0259e", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "61571bf68d6d89aa", + "hash_cont_tokens": "e8abf848493b50f7" + }, + "truncated": 0, + "non-truncated": 4687, + "padded": 4687, + "non-padded": 0, + "effective_few_shots": 25.0, + "num_truncated_few_shots": 0 + }, + "harness|hellaswag|10": { + "hashes": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "29906669b1c7054a", + "hash_cont_tokens": "9fe0a5c42e1532db" + }, + "truncated": 0, + "non-truncated": 40168, + "padded": 40113, + "non-padded": 55, + "effective_few_shots": 10.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hashes": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "c54ff61ad0273dd7", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-anatomy|5": { + "hashes": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "be31a1e22aef5f90", + "hash_cont_tokens": "f11971a765cb609f" + }, + "truncated": 0, + "non-truncated": 540, + "padded": 540, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-astronomy|5": { + "hashes": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "277a7b1fad566940", + "hash_cont_tokens": "440a970fadecdc7b" + }, + "truncated": 0, + "non-truncated": 608, + "padded": 608, + "non-padded": 0, + "effective_few_shots": 5.0, + 
"num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-business_ethics|5": { + "hashes": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "ba552605bc116de5", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hashes": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "428c7563d0b98ab9", + "hash_cont_tokens": "7ecd60c25b9bfe5b" + }, + "truncated": 0, + "non-truncated": 1060, + "padded": 1060, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_biology|5": { + "hashes": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "da036601573942e2", + "hash_cont_tokens": "875cde3af7a0ee14" + }, + "truncated": 0, + "non-truncated": 576, + "padded": 576, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_chemistry|5": { + "hashes": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "94e0196d6aded13d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_computer_science|5": { + "hashes": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "6e4d0f4a8d36690b", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_mathematics|5": { + "hashes": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "614054d17109a25d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_medicine|5": { + "hashes": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "1d633b3cc0524ba8", + "hash_cont_tokens": "702fb6d82ff0d6ac" + }, + "truncated": 0, + "non-truncated": 692, + "padded": 692, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_physics|5": { + "hashes": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "5421d9a1af86cbd4", + "hash_cont_tokens": "f7b8097afc16a47c" + }, + "truncated": 0, + "non-truncated": 408, + "padded": 408, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-computer_security|5": { + "hashes": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "5e6b70ecb333cf18", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hashes": { + "hash_examples": "8874ece872d2ca4c", + 
"hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "c2ef11a87264ceed", + "hash_cont_tokens": "aa0e8bc655f2f641" + }, + "truncated": 0, + "non-truncated": 940, + "padded": 940, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-econometrics|5": { + "hashes": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "ecaccd912a4c3978", + "hash_cont_tokens": "b1cc6e7e9fcd3827" + }, + "truncated": 0, + "non-truncated": 456, + "padded": 456, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hashes": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "1590c84291399be8", + "hash_cont_tokens": "2425a3f084a591ef" + }, + "truncated": 0, + "non-truncated": 580, + "padded": 580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hashes": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "3269597f715b0da1", + "hash_cont_tokens": "bd87bf0c060fd925" + }, + "truncated": 0, + "non-truncated": 1512, + "padded": 1512, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-formal_logic|5": { + "hashes": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "a2800d20f3ab8d7c", + "hash_cont_tokens": "eb8932890e0605db" + }, + "truncated": 0, + "non-truncated": 504, + "padded": 504, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-global_facts|5": { + "hashes": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "94ed44b3772505ad", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_biology|5": { + "hashes": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "24423acb928db768", + "hash_cont_tokens": "1ddcb86d28cde266" + }, + "truncated": 0, + "non-truncated": 1240, + "padded": 1240, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hashes": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "831ff35c474e5cef", + "hash_cont_tokens": "176c8dcff38c5f8f" + }, + "truncated": 0, + "non-truncated": 812, + "padded": 812, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hashes": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "8c34e0f2bda77358", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hashes": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "f1f73dd687da18d7", + "hash_cont_tokens": "674fc454bdc5ac93" 
+ }, + "truncated": 660, + "non-truncated": 0, + "padded": 0, + "non-padded": 660, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_geography|5": { + "hashes": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "7c5547c7da5bc793", + "hash_cont_tokens": "03a5012b916274ea" + }, + "truncated": 0, + "non-truncated": 792, + "padded": 792, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hashes": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "f62991cb6a496b05", + "hash_cont_tokens": "873d2aab226ba1d8" + }, + "truncated": 0, + "non-truncated": 772, + "padded": 772, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hashes": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "4cef2aff6e3d59ed", + "hash_cont_tokens": "c583432ad27fcfe0" + }, + "truncated": 0, + "non-truncated": 1560, + "padded": 1560, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hashes": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "6e2577ea4082ed2b", + "hash_cont_tokens": "d7907b61bcb8c123" + }, + "truncated": 0, + "non-truncated": 1080, + "padded": 1080, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hashes": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "c5fc9aeb1079c8e4", + "hash_cont_tokens": "f47f041de50333b9" + }, + "truncated": 0, + "non-truncated": 952, + "padded": 952, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_physics|5": { + "hashes": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "555fc385cffa84ca", + "hash_cont_tokens": "0d56317b3e5eedb5" + }, + "truncated": 0, + "non-truncated": 604, + "padded": 604, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hashes": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "febd23cbf9973b7f", + "hash_cont_tokens": "09ba1243e7390c0f" + }, + "truncated": 0, + "non-truncated": 2180, + "padded": 2180, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hashes": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "424b02981230ee83", + "hash_cont_tokens": "9cc29889c3d3f77d" + }, + "truncated": 0, + "non-truncated": 864, + "padded": 864, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hashes": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "50c9ff438c85a69e", + "hash_cont_tokens": "cdd0b3dc06d933e5" + }, + "truncated": 816, + "non-truncated": 0, + "padded": 0, + "non-padded": 816, 
+ "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hashes": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "054824cc474caef5", + "hash_cont_tokens": "e02816433ff28daf" + }, + "truncated": 8, + "non-truncated": 940, + "padded": 940, + "non-padded": 8, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_aging|5": { + "hashes": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "541a75f071dcf579", + "hash_cont_tokens": "142a4a8a1138a214" + }, + "truncated": 0, + "non-truncated": 892, + "padded": 892, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_sexuality|5": { + "hashes": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "04269e5c5a257dd9", + "hash_cont_tokens": "bc54813e809b796d" + }, + "truncated": 0, + "non-truncated": 524, + "padded": 524, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-international_law|5": { + "hashes": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "d93ba9d9d38e4397", + "hash_cont_tokens": "8ea8c5ff76a15bca" + }, + "truncated": 0, + "non-truncated": 484, + "padded": 484, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-jurisprudence|5": { + "hashes": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "9eeaccd2698b4f5a", + "hash_cont_tokens": "e3a8cd951b6e3469" + }, + "truncated": 0, + "non-truncated": 432, + "padded": 432, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hashes": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "b4f08f544f2b7576", + "hash_cont_tokens": "3e9e0bdc248fd88a" + }, + "truncated": 0, + "non-truncated": 652, + "padded": 648, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-machine_learning|5": { + "hashes": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "900c2a51f1174b9f", + "hash_cont_tokens": "55b12fb138c6a064" + }, + "truncated": 0, + "non-truncated": 448, + "padded": 448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-management|5": { + "hashes": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "6b36efb4689c6eca", + "hash_cont_tokens": "a01d6d39a83c4597" + }, + "truncated": 0, + "non-truncated": 412, + "padded": 412, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-marketing|5": { + "hashes": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "2aaac78a0cfed47a", + "hash_cont_tokens": "6aeaed4d823c98aa" + }, + "truncated": 0, + "non-truncated": 936, + "padded": 936, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-medical_genetics|5": { + "hashes": { + "hash_examples": "9c2dda34a2ea4fd2", + 
"hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "886ca823b41c094a", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-miscellaneous|5": { + "hashes": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "72fd71de7675e7d0", + "hash_cont_tokens": "9b0ab02a64603081" + }, + "truncated": 0, + "non-truncated": 3132, + "padded": 3132, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_disputes|5": { + "hashes": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "f3ca0dd8e7a1eb09", + "hash_cont_tokens": "3b8bbe9108e55ce9" + }, + "truncated": 0, + "non-truncated": 1384, + "padded": 1354, + "non-padded": 30, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hashes": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "3e793631e951f23c", + "hash_cont_tokens": "3e9bfc0362e97330" + }, + "truncated": 0, + "non-truncated": 3580, + "padded": 3580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-nutrition|5": { + "hashes": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "59753c2144ea93af", + "hash_cont_tokens": "23b2dc6ee2da4cfc" + }, + "truncated": 0, + "non-truncated": 1224, + "padded": 1224, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-philosophy|5": { + "hashes": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "bd8d3dbed15a8c34", + "hash_cont_tokens": "9f6ff69d23a48783" + }, + "truncated": 0, + "non-truncated": 1244, + "padded": 1244, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-prehistory|5": { + "hashes": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "3573cd87facbb7c5", + "hash_cont_tokens": "d6458d743d875837" + }, + "truncated": 0, + "non-truncated": 1296, + "padded": 1296, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_accounting|5": { + "hashes": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "17e721bc1a7cbb47", + "hash_cont_tokens": "922a195f53a35662" + }, + "truncated": 0, + "non-truncated": 1128, + "padded": 1128, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_law|5": { + "hashes": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "9178e10bd0763ec4", + "hash_cont_tokens": "2e590029ef41fbcd" + }, + "truncated": 604, + "non-truncated": 5532, + "padded": 5524, + "non-padded": 612, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_medicine|5": { + "hashes": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "f5a22012a54f70ea", + "hash_cont_tokens": "7cfee54dbddd5a98" + }, + "truncated": 0, + 
"non-truncated": 1088, + "padded": 1088, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_psychology|5": { + "hashes": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "0dfb73a8eb3f692c", + "hash_cont_tokens": "a86677b2a45c20e1" + }, + "truncated": 0, + "non-truncated": 2448, + "padded": 2448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-public_relations|5": { + "hashes": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "1710c6ba4c9f3cbd", + "hash_cont_tokens": "0d756ccaae031757" + }, + "truncated": 0, + "non-truncated": 440, + "padded": 440, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-security_studies|5": { + "hashes": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "d49711415961ced7", + "hash_cont_tokens": "b2229bc2cfbf594b" + }, + "truncated": 0, + "non-truncated": 980, + "padded": 980, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-sociology|5": { + "hashes": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "828999f7624cbe7e", + "hash_cont_tokens": "c3a3bdfd177eed5b" + }, + "truncated": 0, + "non-truncated": 804, + "padded": 804, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hashes": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "42054621e718dbee", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-virology|5": { + "hashes": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "6c4f0aa4dc859c04", + "hash_cont_tokens": "af8b3658088cb37f" + }, + "truncated": 0, + "non-truncated": 664, + "padded": 664, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-world_religions|5": { + "hashes": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "6c75d44e092ff24f", + "hash_cont_tokens": "060118bef6de4e0a" + }, + "truncated": 0, + "non-truncated": 684, + "padded": 684, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|truthfulqa:mc|0": { + "hashes": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "2738d7ed7075faa7", + "hash_cont_tokens": "f5da56a132aab151" + }, + "truncated": 0, + "non-truncated": 9996, + "padded": 9996, + "non-padded": 0, + "effective_few_shots": 0.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "d84d18e9a963753d", + "hash_full_prompts": "12b540783521a8e6", + "hash_input_tokens": "6fecf578c508db6a", + "hash_cont_tokens": "71d56183130fecbd" + }, + "total_evaluation_time_secondes": "4105.90815782547", + "truncated": 2088, + "non-truncated": 108931, + "padded": 108834, + "non-padded": 2185, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff 
--git a/eval-results/openbmb/UltraLM-13b/results_2023-10-28T22-40-25.196177.json b/eval-results/openbmb/UltraLM-13b/results_2023-10-28T22-40-25.196177.json new file mode 100644 index 0000000000000000000000000000000000000000..7cc0276779f0c356ca106dbbb0e3192a35001242 --- /dev/null +++ b/eval-results/openbmb/UltraLM-13b/results_2023-10-28T22-40-25.196177.json @@ -0,0 +1,107 @@ +{ + "config_general": { + "model_name": "openbmb/UltraLM-13b", + "model_sha": "2c732c2899fc329036d97e5c6f0a61eaff19d97d", + "model_size": "24.28 GB", + "model_dtype": "torch.float16", + "lighteval_sha": "0f318ecf002208468154899217b3ba7c6ae09374", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|drop|3": { + "em": 0.0, + "em_stderr": 0.0, + "f1": 0.0, + "f1_stderr": 0.0 + }, + "harness|gsm8k|5": { + "acc": 0.0, + "acc_stderr": 0.0 + }, + "harness|winogrande|5": { + "acc": 0.4956590370955012, + "acc_stderr": 0.014051956064076906 + }, + "all": { + "em": 0.0, + "em_stderr": 0.0, + "f1": 0.0, + "f1_stderr": 0.0, + "acc": 0.2478295185477506, + "acc_stderr": 0.007025978032038453 + } + }, + "versions": { + "harness|drop|3": 1, + "harness|gsm8k|5": 0, + "harness|winogrande|5": 0, + "all": 0 + }, + "config_tasks": { + "harness|drop": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|drop|3": { + "hashes": { + "hash_examples": "1d27416e8324e9a3", + "hash_full_prompts": "a5513ff9a741b385", + "hash_input_tokens": "61b608e0b5ceed76", + "hash_cont_tokens": "e0c7c22ff34edf19" + }, + "truncated": 1263, + "non-truncated": 8273, + "padded": 0, + "non-padded": 9536, + "effective_few_shots": 3.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "bda342e47b5099b2", + "hash_cont_tokens": "90e3833ea519d325" + }, + "truncated": 0, + "non-truncated": 1319, + "padded": 0, + "non-padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "c0bedf98cb040854", + "hash_cont_tokens": "f08975ad6f2d5864" + }, + "truncated": 0, + "non-truncated": 2534, + "padded": 2432, + "non-padded": 102, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "9b4d8993161e637d", + "hash_full_prompts": "08215e527b7e60a5", + "hash_input_tokens": "80afe720f936f8d2", + "hash_cont_tokens": "24aa94ac3eceb25d" + }, + "total_evaluation_time_secondes": "31372.552024126053", + "truncated": 1263, + "non-truncated": 12126, + "padded": 2432, + "non-padded": 10957, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/openbmb/UltraLM-13b/results_2023-12-02T13-31-34.076061.json b/eval-results/openbmb/UltraLM-13b/results_2023-12-02T13-31-34.076061.json new file mode 100644 index 0000000000000000000000000000000000000000..4de42759d25fde31f624da6bfa294a85165db266 --- /dev/null +++ b/eval-results/openbmb/UltraLM-13b/results_2023-12-02T13-31-34.076061.json @@ -0,0 +1,63 @@ +{ + "config_general": { + "lighteval_sha": "b35d4d84573be82d91c07ea46260f262f72cf69d", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "", + "start_time": 
1382909.358700067, + "end_time": 1387586.197199712, + "total_evaluation_time_secondes": "4676.838499645004", + "model_name": "openbmb/UltraLM-13b", + "model_sha": "2c732c2899fc329036d97e5c6f0a61eaff19d97d", + "model_dtype": "torch.float16", + "model_size": "24.28 GB" + }, + "results": { + "harness|gsm8k|5": { + "acc": 0.0, + "acc_stderr": 0.0 + }, + "all": { + "acc": 0.0, + "acc_stderr": 0.0 + } + }, + "versions": { + "all": 0, + "harness|gsm8k|5": 0 + }, + "config_tasks": { + "harness|gsm8k": "LM Harness task" + }, + "summary_tasks": { + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "bda342e47b5099b2", + "hash_cont_tokens": "90e3833ea519d325" + }, + "truncated": 0, + "non_truncated": 1319, + "padded": 0, + "non_padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "18b756b7813d1bdf", + "hash_full_prompts": "deb3b1dff10b95aa", + "hash_input_tokens": "42036645de5ac59d", + "hash_cont_tokens": "bc8e4fe79ec75347" + }, + "truncated": 0, + "non_truncated": 1319, + "padded": 0, + "non_padded": 1319, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/openbmb/UltraLM-65b/results_2023-08-04T17-10-39.498393.json b/eval-results/openbmb/UltraLM-65b/results_2023-08-04T17-10-39.498393.json new file mode 100644 index 0000000000000000000000000000000000000000..d17278b51dea56ee69671e85c833537d0ca5a1aa --- /dev/null +++ b/eval-results/openbmb/UltraLM-65b/results_2023-08-04T17-10-39.498393.json @@ -0,0 +1,1365 @@ +{ + "results": { + "harness|arc:challenge|25": { + "acc": 0.6313993174061433, + "acc_stderr": 0.014097810678042194, + "acc_norm": 0.6706484641638225, + "acc_norm_stderr": 0.013734057652635474 + }, + "harness|hellaswag|10": { + "acc": 0.6514638518223461, + "acc_stderr": 0.004755329243976672, + "acc_norm": 0.8498307110137423, + "acc_norm_stderr": 0.0035650718701954478 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.32, + "acc_stderr": 0.046882617226215034, + "acc_norm": 0.32, + "acc_norm_stderr": 0.046882617226215034 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.5777777777777777, + "acc_stderr": 0.04266763404099582, + "acc_norm": 0.5777777777777777, + "acc_norm_stderr": 0.04266763404099582 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.743421052631579, + "acc_stderr": 0.0355418036802569, + "acc_norm": 0.743421052631579, + "acc_norm_stderr": 0.0355418036802569 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.58, + "acc_stderr": 0.049604496374885836, + "acc_norm": 0.58, + "acc_norm_stderr": 0.049604496374885836 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.6566037735849056, + "acc_stderr": 0.02922452646912479, + "acc_norm": 0.6566037735849056, + "acc_norm_stderr": 0.02922452646912479 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.7013888888888888, + "acc_stderr": 0.03827052357950756, + "acc_norm": 0.7013888888888888, + "acc_norm_stderr": 0.03827052357950756 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.51, + "acc_stderr": 0.05024183937956912, + "acc_norm": 0.51, + "acc_norm_stderr": 0.05024183937956912 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.56, + "acc_stderr": 0.04988876515698589, + "acc_norm": 0.56, + "acc_norm_stderr": 0.04988876515698589 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.37, + "acc_stderr": 0.048523658709391, + 
"acc_norm": 0.37, + "acc_norm_stderr": 0.048523658709391 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.5549132947976878, + "acc_stderr": 0.03789401760283648, + "acc_norm": 0.5549132947976878, + "acc_norm_stderr": 0.03789401760283648 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.3627450980392157, + "acc_stderr": 0.047840607041056527, + "acc_norm": 0.3627450980392157, + "acc_norm_stderr": 0.047840607041056527 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.74, + "acc_stderr": 0.04408440022768078, + "acc_norm": 0.74, + "acc_norm_stderr": 0.04408440022768078 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.5914893617021276, + "acc_stderr": 0.032134180267015755, + "acc_norm": 0.5914893617021276, + "acc_norm_stderr": 0.032134180267015755 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.35964912280701755, + "acc_stderr": 0.045144961328736334, + "acc_norm": 0.35964912280701755, + "acc_norm_stderr": 0.045144961328736334 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.5379310344827586, + "acc_stderr": 0.04154659671707548, + "acc_norm": 0.5379310344827586, + "acc_norm_stderr": 0.04154659671707548 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.3915343915343915, + "acc_stderr": 0.025138091388851105, + "acc_norm": 0.3915343915343915, + "acc_norm_stderr": 0.025138091388851105 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.42857142857142855, + "acc_stderr": 0.0442626668137991, + "acc_norm": 0.42857142857142855, + "acc_norm_stderr": 0.0442626668137991 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.38, + "acc_stderr": 0.04878317312145632, + "acc_norm": 0.38, + "acc_norm_stderr": 0.04878317312145632 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.7451612903225806, + "acc_stderr": 0.024790118459332208, + "acc_norm": 0.7451612903225806, + "acc_norm_stderr": 0.024790118459332208 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.3891625615763547, + "acc_stderr": 0.034304624161038716, + "acc_norm": 0.3891625615763547, + "acc_norm_stderr": 0.034304624161038716 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.67, + "acc_stderr": 0.04725815626252607, + "acc_norm": 0.67, + "acc_norm_stderr": 0.04725815626252607 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.7818181818181819, + "acc_stderr": 0.032250781083062896, + "acc_norm": 0.7818181818181819, + "acc_norm_stderr": 0.032250781083062896 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.8080808080808081, + "acc_stderr": 0.02805779167298902, + "acc_norm": 0.8080808080808081, + "acc_norm_stderr": 0.02805779167298902 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.8963730569948186, + "acc_stderr": 0.02199531196364424, + "acc_norm": 0.8963730569948186, + "acc_norm_stderr": 0.02199531196364424 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.6282051282051282, + "acc_stderr": 0.024503472557110936, + "acc_norm": 0.6282051282051282, + "acc_norm_stderr": 0.024503472557110936 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.3, + "acc_stderr": 0.027940457136228416, + "acc_norm": 0.3, + "acc_norm_stderr": 0.027940457136228416 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.6890756302521008, + "acc_stderr": 0.03006676158297792, + "acc_norm": 0.6890756302521008, + "acc_norm_stderr": 0.03006676158297792 + }, + 
"harness|hendrycksTest-high_school_physics|5": { + "acc": 0.4105960264900662, + "acc_stderr": 0.04016689594849926, + "acc_norm": 0.4105960264900662, + "acc_norm_stderr": 0.04016689594849926 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.8201834862385321, + "acc_stderr": 0.016465345467391524, + "acc_norm": 0.8201834862385321, + "acc_norm_stderr": 0.016465345467391524 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.5185185185185185, + "acc_stderr": 0.0340763209385405, + "acc_norm": 0.5185185185185185, + "acc_norm_stderr": 0.0340763209385405 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.8088235294117647, + "acc_stderr": 0.02759917430064077, + "acc_norm": 0.8088235294117647, + "acc_norm_stderr": 0.02759917430064077 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.8270042194092827, + "acc_stderr": 0.024621562866768417, + "acc_norm": 0.8270042194092827, + "acc_norm_stderr": 0.024621562866768417 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.672645739910314, + "acc_stderr": 0.03149384670994131, + "acc_norm": 0.672645739910314, + "acc_norm_stderr": 0.03149384670994131 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.7633587786259542, + "acc_stderr": 0.03727673575596913, + "acc_norm": 0.7633587786259542, + "acc_norm_stderr": 0.03727673575596913 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.7933884297520661, + "acc_stderr": 0.03695980128098824, + "acc_norm": 0.7933884297520661, + "acc_norm_stderr": 0.03695980128098824 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.7314814814814815, + "acc_stderr": 0.042844679680521934, + "acc_norm": 0.7314814814814815, + "acc_norm_stderr": 0.042844679680521934 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.7668711656441718, + "acc_stderr": 0.0332201579577674, + "acc_norm": 0.7668711656441718, + "acc_norm_stderr": 0.0332201579577674 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.4732142857142857, + "acc_stderr": 0.047389751192741546, + "acc_norm": 0.4732142857142857, + "acc_norm_stderr": 0.047389751192741546 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.8058252427184466, + "acc_stderr": 0.03916667762822584, + "acc_norm": 0.8058252427184466, + "acc_norm_stderr": 0.03916667762822584 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.8717948717948718, + "acc_stderr": 0.021901905115073325, + "acc_norm": 0.8717948717948718, + "acc_norm_stderr": 0.021901905115073325 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.68, + "acc_stderr": 0.04688261722621504, + "acc_norm": 0.68, + "acc_norm_stderr": 0.04688261722621504 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.8084291187739464, + "acc_stderr": 0.014072859310451949, + "acc_norm": 0.8084291187739464, + "acc_norm_stderr": 0.014072859310451949 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.7167630057803468, + "acc_stderr": 0.024257901705323378, + "acc_norm": 0.7167630057803468, + "acc_norm_stderr": 0.024257901705323378 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.4793296089385475, + "acc_stderr": 0.016708205559996137, + "acc_norm": 0.4793296089385475, + "acc_norm_stderr": 0.016708205559996137 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.6862745098039216, + "acc_stderr": 0.02656892101545715, + "acc_norm": 0.6862745098039216, + "acc_norm_stderr": 0.02656892101545715 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.7427652733118971, + 
"acc_stderr": 0.02482617128925089, + "acc_norm": 0.7427652733118971, + "acc_norm_stderr": 0.02482617128925089 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.7469135802469136, + "acc_stderr": 0.024191808600713002, + "acc_norm": 0.7469135802469136, + "acc_norm_stderr": 0.024191808600713002 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.49645390070921985, + "acc_stderr": 0.02982674915328092, + "acc_norm": 0.49645390070921985, + "acc_norm_stderr": 0.02982674915328092 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.4830508474576271, + "acc_stderr": 0.01276289688921086, + "acc_norm": 0.4830508474576271, + "acc_norm_stderr": 0.01276289688921086 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.625, + "acc_stderr": 0.029408372932278746, + "acc_norm": 0.625, + "acc_norm_stderr": 0.029408372932278746 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.6666666666666666, + "acc_stderr": 0.0190709855896875, + "acc_norm": 0.6666666666666666, + "acc_norm_stderr": 0.0190709855896875 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.7181818181818181, + "acc_stderr": 0.043091187099464585, + "acc_norm": 0.7181818181818181, + "acc_norm_stderr": 0.043091187099464585 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.7020408163265306, + "acc_stderr": 0.029279567411065677, + "acc_norm": 0.7020408163265306, + "acc_norm_stderr": 0.029279567411065677 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.8557213930348259, + "acc_stderr": 0.024845753212306053, + "acc_norm": 0.8557213930348259, + "acc_norm_stderr": 0.024845753212306053 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.86, + "acc_stderr": 0.03487350880197769, + "acc_norm": 0.86, + "acc_norm_stderr": 0.03487350880197769 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.536144578313253, + "acc_stderr": 0.038823108508905954, + "acc_norm": 0.536144578313253, + "acc_norm_stderr": 0.038823108508905954 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.8421052631578947, + "acc_stderr": 0.02796678585916089, + "acc_norm": 0.8421052631578947, + "acc_norm_stderr": 0.02796678585916089 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.379436964504284, + "mc1_stderr": 0.01698703926614298, + "mc2": 0.5350863764391227, + "mc2_stderr": 0.014861057577356304 + }, + "all": { + "acc": 0.6350222238594815, + "acc_stderr": 0.033022634388714986, + "acc_norm": 0.6390496137908216, + "acc_norm_stderr": 0.03299629522940671, + "mc1": 0.379436964504284, + "mc1_stderr": 0.01698703926614298, + "mc2": 0.5350863764391227, + "mc2_stderr": 0.014861057577356304 + } + }, + "versions": { + "harness|arc:challenge|25": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + 
"harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "all": 0 + }, + "config_general": { + "model_name": "openbmb/UltraLM-65b", + "model_sha": "", + "model_dtype": "torch.float16", + "lighteval_sha": "562505b4dff29b1764115bbdea7f8ef9120fa0cb", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null + }, + "config_tasks": { + "harness|arc:challenge": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + 
"harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task" + }, + "summary_tasks": { + "harness|arc:challenge|25": { + "hashes": { + "hash_examples": "17b0cae357c0259e", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "61571bf68d6d89aa", + "hash_cont_tokens": "ede2b335438f08e9" + }, + "truncated": 0, + "non-truncated": 4687, + "padded": 4687, + "non-padded": 0, + "effective_few_shots": 25.0, + "num_truncated_few_shots": 0 + }, + "harness|hellaswag|10": { + "hashes": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "29906669b1c7054a", + "hash_cont_tokens": "b41cf1ad182d68d5" + }, + "truncated": 0, + "non-truncated": 40168, + "padded": 40113, + "non-padded": 55, + "effective_few_shots": 10.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hashes": { + "hash_examples": "280f9f325b40559a", + 
"hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "c54ff61ad0273dd7", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-anatomy|5": { + "hashes": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "be31a1e22aef5f90", + "hash_cont_tokens": "f11971a765cb609f" + }, + "truncated": 0, + "non-truncated": 540, + "padded": 540, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-astronomy|5": { + "hashes": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "277a7b1fad566940", + "hash_cont_tokens": "238bd86950544b29" + }, + "truncated": 0, + "non-truncated": 608, + "padded": 608, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-business_ethics|5": { + "hashes": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "ba552605bc116de5", + "hash_cont_tokens": "f9d6d2a7d7e9a041" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hashes": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "428c7563d0b98ab9", + "hash_cont_tokens": "6af58623d0d5fbcd" + }, + "truncated": 0, + "non-truncated": 1060, + "padded": 1060, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_biology|5": { + "hashes": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "da036601573942e2", + "hash_cont_tokens": "875cde3af7a0ee14" + }, + "truncated": 0, + "non-truncated": 576, + "padded": 576, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_chemistry|5": { + "hashes": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "94e0196d6aded13d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_computer_science|5": { + "hashes": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "6e4d0f4a8d36690b", + "hash_cont_tokens": "1ba0c71186b1505e" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_mathematics|5": { + "hashes": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "614054d17109a25d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_medicine|5": { + "hashes": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "1d633b3cc0524ba8", + "hash_cont_tokens": "702fb6d82ff0d6ac" + }, + "truncated": 0, + 
"non-truncated": 692, + "padded": 692, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_physics|5": { + "hashes": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "5421d9a1af86cbd4", + "hash_cont_tokens": "f7b8097afc16a47c" + }, + "truncated": 0, + "non-truncated": 408, + "padded": 408, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-computer_security|5": { + "hashes": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "5e6b70ecb333cf18", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hashes": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "c2ef11a87264ceed", + "hash_cont_tokens": "aa0e8bc655f2f641" + }, + "truncated": 0, + "non-truncated": 940, + "padded": 940, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-econometrics|5": { + "hashes": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "ecaccd912a4c3978", + "hash_cont_tokens": "a9b1f761089f6acc" + }, + "truncated": 0, + "non-truncated": 456, + "padded": 456, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hashes": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "1590c84291399be8", + "hash_cont_tokens": "2425a3f084a591ef" + }, + "truncated": 0, + "non-truncated": 580, + "padded": 580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hashes": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "3269597f715b0da1", + "hash_cont_tokens": "eb2d5002052b5bc5" + }, + "truncated": 0, + "non-truncated": 1512, + "padded": 1512, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-formal_logic|5": { + "hashes": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "a2800d20f3ab8d7c", + "hash_cont_tokens": "9b30dc19c9b62f60" + }, + "truncated": 0, + "non-truncated": 504, + "padded": 504, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-global_facts|5": { + "hashes": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "94ed44b3772505ad", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_biology|5": { + "hashes": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "24423acb928db768", + "hash_cont_tokens": "74217a4e2868536f" + }, + "truncated": 0, + "non-truncated": 1240, + "padded": 1240, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + 
"harness|hendrycksTest-high_school_chemistry|5": { + "hashes": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "831ff35c474e5cef", + "hash_cont_tokens": "bf39544be0ebf000" + }, + "truncated": 0, + "non-truncated": 812, + "padded": 812, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hashes": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "8c34e0f2bda77358", + "hash_cont_tokens": "43570b3948564b64" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hashes": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "f1f73dd687da18d7", + "hash_cont_tokens": "674fc454bdc5ac93" + }, + "truncated": 660, + "non-truncated": 0, + "padded": 0, + "non-padded": 660, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_geography|5": { + "hashes": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "7c5547c7da5bc793", + "hash_cont_tokens": "03a5012b916274ea" + }, + "truncated": 0, + "non-truncated": 792, + "padded": 792, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hashes": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "f62991cb6a496b05", + "hash_cont_tokens": "50ab225c2f535210" + }, + "truncated": 0, + "non-truncated": 772, + "padded": 772, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hashes": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "4cef2aff6e3d59ed", + "hash_cont_tokens": "c583432ad27fcfe0" + }, + "truncated": 0, + "non-truncated": 1560, + "padded": 1560, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hashes": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "6e2577ea4082ed2b", + "hash_cont_tokens": "1194078d4e38c984" + }, + "truncated": 0, + "non-truncated": 1080, + "padded": 1080, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hashes": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "c5fc9aeb1079c8e4", + "hash_cont_tokens": "f47f041de50333b9" + }, + "truncated": 0, + "non-truncated": 952, + "padded": 952, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_physics|5": { + "hashes": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "555fc385cffa84ca", + "hash_cont_tokens": "6296151cf7fee15c" + }, + "truncated": 0, + "non-truncated": 604, + "padded": 604, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hashes": { + 
"hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "febd23cbf9973b7f", + "hash_cont_tokens": "a490d3db0ea5935a" + }, + "truncated": 0, + "non-truncated": 2180, + "padded": 2180, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hashes": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "424b02981230ee83", + "hash_cont_tokens": "6830ef7d0325d7ef" + }, + "truncated": 0, + "non-truncated": 864, + "padded": 864, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hashes": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "50c9ff438c85a69e", + "hash_cont_tokens": "cdd0b3dc06d933e5" + }, + "truncated": 816, + "non-truncated": 0, + "padded": 0, + "non-padded": 816, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hashes": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "054824cc474caef5", + "hash_cont_tokens": "e0203e3fc1bb0500" + }, + "truncated": 8, + "non-truncated": 940, + "padded": 940, + "non-padded": 8, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_aging|5": { + "hashes": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "541a75f071dcf579", + "hash_cont_tokens": "142a4a8a1138a214" + }, + "truncated": 0, + "non-truncated": 892, + "padded": 892, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_sexuality|5": { + "hashes": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "04269e5c5a257dd9", + "hash_cont_tokens": "bc54813e809b796d" + }, + "truncated": 0, + "non-truncated": 524, + "padded": 524, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-international_law|5": { + "hashes": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "d93ba9d9d38e4397", + "hash_cont_tokens": "63435df622d5437b" + }, + "truncated": 0, + "non-truncated": 484, + "padded": 484, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-jurisprudence|5": { + "hashes": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "9eeaccd2698b4f5a", + "hash_cont_tokens": "e3a8cd951b6e3469" + }, + "truncated": 0, + "non-truncated": 432, + "padded": 432, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hashes": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "b4f08f544f2b7576", + "hash_cont_tokens": "5e6ee2ff0404f23c" + }, + "truncated": 0, + "non-truncated": 652, + "padded": 648, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-machine_learning|5": { + "hashes": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "900c2a51f1174b9f", + "hash_cont_tokens": 
"c81919424db3b267" + }, + "truncated": 0, + "non-truncated": 448, + "padded": 448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-management|5": { + "hashes": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "6b36efb4689c6eca", + "hash_cont_tokens": "a01d6d39a83c4597" + }, + "truncated": 0, + "non-truncated": 412, + "padded": 412, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-marketing|5": { + "hashes": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "2aaac78a0cfed47a", + "hash_cont_tokens": "6aeaed4d823c98aa" + }, + "truncated": 0, + "non-truncated": 936, + "padded": 936, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-medical_genetics|5": { + "hashes": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "886ca823b41c094a", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-miscellaneous|5": { + "hashes": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "72fd71de7675e7d0", + "hash_cont_tokens": "9b0ab02a64603081" + }, + "truncated": 0, + "non-truncated": 3132, + "padded": 3132, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_disputes|5": { + "hashes": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "f3ca0dd8e7a1eb09", + "hash_cont_tokens": "3b8bbe9108e55ce9" + }, + "truncated": 0, + "non-truncated": 1384, + "padded": 1354, + "non-padded": 30, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hashes": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "3e793631e951f23c", + "hash_cont_tokens": "2eae753a177d5460" + }, + "truncated": 0, + "non-truncated": 3580, + "padded": 3580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-nutrition|5": { + "hashes": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "59753c2144ea93af", + "hash_cont_tokens": "29771089bd3c65c6" + }, + "truncated": 0, + "non-truncated": 1224, + "padded": 1224, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-philosophy|5": { + "hashes": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "bd8d3dbed15a8c34", + "hash_cont_tokens": "9f6ff69d23a48783" + }, + "truncated": 0, + "non-truncated": 1244, + "padded": 1244, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-prehistory|5": { + "hashes": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "3573cd87facbb7c5", + "hash_cont_tokens": "a789a13af22308bf" + }, + "truncated": 0, + "non-truncated": 1296, + "padded": 1296, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + 
"harness|hendrycksTest-professional_accounting|5": { + "hashes": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "17e721bc1a7cbb47", + "hash_cont_tokens": "5129a9cfb30c5239" + }, + "truncated": 0, + "non-truncated": 1128, + "padded": 1128, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_law|5": { + "hashes": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "9178e10bd0763ec4", + "hash_cont_tokens": "2e590029ef41fbcd" + }, + "truncated": 604, + "non-truncated": 5532, + "padded": 5524, + "non-padded": 612, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_medicine|5": { + "hashes": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "f5a22012a54f70ea", + "hash_cont_tokens": "cd82e108370cece8" + }, + "truncated": 0, + "non-truncated": 1088, + "padded": 1088, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_psychology|5": { + "hashes": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "0dfb73a8eb3f692c", + "hash_cont_tokens": "61ef0c8a87f9c92d" + }, + "truncated": 0, + "non-truncated": 2448, + "padded": 2448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-public_relations|5": { + "hashes": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "1710c6ba4c9f3cbd", + "hash_cont_tokens": "568f585a259965c1" + }, + "truncated": 0, + "non-truncated": 440, + "padded": 440, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-security_studies|5": { + "hashes": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "d49711415961ced7", + "hash_cont_tokens": "d70cfe096d4fb7bd" + }, + "truncated": 0, + "non-truncated": 980, + "padded": 980, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-sociology|5": { + "hashes": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "828999f7624cbe7e", + "hash_cont_tokens": "c3a3bdfd177eed5b" + }, + "truncated": 0, + "non-truncated": 804, + "padded": 804, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hashes": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "42054621e718dbee", + "hash_cont_tokens": "2568d0e8e36fa959" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-virology|5": { + "hashes": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "6c4f0aa4dc859c04", + "hash_cont_tokens": "c178cccd753d9bc5" + }, + "truncated": 0, + "non-truncated": 664, + "padded": 664, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-world_religions|5": { + "hashes": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + 
"hash_input_tokens": "6c75d44e092ff24f", + "hash_cont_tokens": "0a3a3ea5ef49d19c" + }, + "truncated": 0, + "non-truncated": 684, + "padded": 684, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|truthfulqa:mc|0": { + "hashes": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "2738d7ed7075faa7", + "hash_cont_tokens": "6d1691881e252df0" + }, + "truncated": 0, + "non-truncated": 9996, + "padded": 9996, + "non-padded": 0, + "effective_few_shots": 0.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "d84d18e9a963753d", + "hash_full_prompts": "12b540783521a8e6", + "hash_input_tokens": "6fecf578c508db6a", + "hash_cont_tokens": "f4b7b7f3a2788768" + }, + "total_evaluation_time_secondes": "26711.215146303177", + "truncated": 2088, + "non-truncated": 108931, + "padded": 108834, + "non-padded": 2185, + "num_truncated_few_shots": 0 + } +} diff --git a/eval-results/openbmb/UltraLM-65b/results_2023-08-04T22-09-07.792369.json b/eval-results/openbmb/UltraLM-65b/results_2023-08-04T22-09-07.792369.json new file mode 100644 index 0000000000000000000000000000000000000000..5fce418f8c6884ac9a7515c2cda650acacf212e5 --- /dev/null +++ b/eval-results/openbmb/UltraLM-65b/results_2023-08-04T22-09-07.792369.json @@ -0,0 +1,1365 @@ +{ + "results": { + "harness|arc:challenge|25": { + "acc": 0.6313993174061433, + "acc_stderr": 0.014097810678042194, + "acc_norm": 0.6706484641638225, + "acc_norm_stderr": 0.013734057652635474 + }, + "harness|hellaswag|10": { + "acc": 0.6514638518223461, + "acc_stderr": 0.004755329243976672, + "acc_norm": 0.8498307110137423, + "acc_norm_stderr": 0.0035650718701954478 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.32, + "acc_stderr": 0.046882617226215034, + "acc_norm": 0.32, + "acc_norm_stderr": 0.046882617226215034 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.5777777777777777, + "acc_stderr": 0.04266763404099582, + "acc_norm": 0.5777777777777777, + "acc_norm_stderr": 0.04266763404099582 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.743421052631579, + "acc_stderr": 0.0355418036802569, + "acc_norm": 0.743421052631579, + "acc_norm_stderr": 0.0355418036802569 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.58, + "acc_stderr": 0.049604496374885836, + "acc_norm": 0.58, + "acc_norm_stderr": 0.049604496374885836 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.6566037735849056, + "acc_stderr": 0.02922452646912479, + "acc_norm": 0.6566037735849056, + "acc_norm_stderr": 0.02922452646912479 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.7013888888888888, + "acc_stderr": 0.03827052357950756, + "acc_norm": 0.7013888888888888, + "acc_norm_stderr": 0.03827052357950756 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.51, + "acc_stderr": 0.05024183937956912, + "acc_norm": 0.51, + "acc_norm_stderr": 0.05024183937956912 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.56, + "acc_stderr": 0.04988876515698589, + "acc_norm": 0.56, + "acc_norm_stderr": 0.04988876515698589 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.37, + "acc_stderr": 0.048523658709391, + "acc_norm": 0.37, + "acc_norm_stderr": 0.048523658709391 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.5549132947976878, + "acc_stderr": 0.03789401760283648, + "acc_norm": 0.5549132947976878, + "acc_norm_stderr": 0.03789401760283648 + }, + 
"harness|hendrycksTest-college_physics|5": { + "acc": 0.3627450980392157, + "acc_stderr": 0.047840607041056527, + "acc_norm": 0.3627450980392157, + "acc_norm_stderr": 0.047840607041056527 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.74, + "acc_stderr": 0.04408440022768078, + "acc_norm": 0.74, + "acc_norm_stderr": 0.04408440022768078 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.5914893617021276, + "acc_stderr": 0.032134180267015755, + "acc_norm": 0.5914893617021276, + "acc_norm_stderr": 0.032134180267015755 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.35964912280701755, + "acc_stderr": 0.045144961328736334, + "acc_norm": 0.35964912280701755, + "acc_norm_stderr": 0.045144961328736334 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.5379310344827586, + "acc_stderr": 0.04154659671707548, + "acc_norm": 0.5379310344827586, + "acc_norm_stderr": 0.04154659671707548 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.3915343915343915, + "acc_stderr": 0.025138091388851105, + "acc_norm": 0.3915343915343915, + "acc_norm_stderr": 0.025138091388851105 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.42857142857142855, + "acc_stderr": 0.0442626668137991, + "acc_norm": 0.42857142857142855, + "acc_norm_stderr": 0.0442626668137991 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.38, + "acc_stderr": 0.04878317312145632, + "acc_norm": 0.38, + "acc_norm_stderr": 0.04878317312145632 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.7451612903225806, + "acc_stderr": 0.024790118459332208, + "acc_norm": 0.7451612903225806, + "acc_norm_stderr": 0.024790118459332208 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.3891625615763547, + "acc_stderr": 0.034304624161038716, + "acc_norm": 0.3891625615763547, + "acc_norm_stderr": 0.034304624161038716 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.67, + "acc_stderr": 0.04725815626252607, + "acc_norm": 0.67, + "acc_norm_stderr": 0.04725815626252607 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.7818181818181819, + "acc_stderr": 0.032250781083062896, + "acc_norm": 0.7818181818181819, + "acc_norm_stderr": 0.032250781083062896 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.8080808080808081, + "acc_stderr": 0.02805779167298902, + "acc_norm": 0.8080808080808081, + "acc_norm_stderr": 0.02805779167298902 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.8963730569948186, + "acc_stderr": 0.02199531196364424, + "acc_norm": 0.8963730569948186, + "acc_norm_stderr": 0.02199531196364424 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.6282051282051282, + "acc_stderr": 0.024503472557110936, + "acc_norm": 0.6282051282051282, + "acc_norm_stderr": 0.024503472557110936 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.3, + "acc_stderr": 0.027940457136228416, + "acc_norm": 0.3, + "acc_norm_stderr": 0.027940457136228416 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.6890756302521008, + "acc_stderr": 0.03006676158297792, + "acc_norm": 0.6890756302521008, + "acc_norm_stderr": 0.03006676158297792 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.4105960264900662, + "acc_stderr": 0.04016689594849926, + "acc_norm": 0.4105960264900662, + "acc_norm_stderr": 0.04016689594849926 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 
0.8201834862385321, + "acc_stderr": 0.016465345467391524, + "acc_norm": 0.8201834862385321, + "acc_norm_stderr": 0.016465345467391524 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.5185185185185185, + "acc_stderr": 0.0340763209385405, + "acc_norm": 0.5185185185185185, + "acc_norm_stderr": 0.0340763209385405 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.8088235294117647, + "acc_stderr": 0.02759917430064077, + "acc_norm": 0.8088235294117647, + "acc_norm_stderr": 0.02759917430064077 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.8270042194092827, + "acc_stderr": 0.024621562866768417, + "acc_norm": 0.8270042194092827, + "acc_norm_stderr": 0.024621562866768417 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.672645739910314, + "acc_stderr": 0.03149384670994131, + "acc_norm": 0.672645739910314, + "acc_norm_stderr": 0.03149384670994131 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.7633587786259542, + "acc_stderr": 0.03727673575596913, + "acc_norm": 0.7633587786259542, + "acc_norm_stderr": 0.03727673575596913 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.7933884297520661, + "acc_stderr": 0.03695980128098824, + "acc_norm": 0.7933884297520661, + "acc_norm_stderr": 0.03695980128098824 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.7314814814814815, + "acc_stderr": 0.042844679680521934, + "acc_norm": 0.7314814814814815, + "acc_norm_stderr": 0.042844679680521934 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.7668711656441718, + "acc_stderr": 0.0332201579577674, + "acc_norm": 0.7668711656441718, + "acc_norm_stderr": 0.0332201579577674 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.4732142857142857, + "acc_stderr": 0.047389751192741546, + "acc_norm": 0.4732142857142857, + "acc_norm_stderr": 0.047389751192741546 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.8058252427184466, + "acc_stderr": 0.03916667762822584, + "acc_norm": 0.8058252427184466, + "acc_norm_stderr": 0.03916667762822584 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.8717948717948718, + "acc_stderr": 0.021901905115073325, + "acc_norm": 0.8717948717948718, + "acc_norm_stderr": 0.021901905115073325 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.68, + "acc_stderr": 0.04688261722621504, + "acc_norm": 0.68, + "acc_norm_stderr": 0.04688261722621504 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.8084291187739464, + "acc_stderr": 0.014072859310451949, + "acc_norm": 0.8084291187739464, + "acc_norm_stderr": 0.014072859310451949 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.7167630057803468, + "acc_stderr": 0.024257901705323378, + "acc_norm": 0.7167630057803468, + "acc_norm_stderr": 0.024257901705323378 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.4793296089385475, + "acc_stderr": 0.016708205559996137, + "acc_norm": 0.4793296089385475, + "acc_norm_stderr": 0.016708205559996137 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.6862745098039216, + "acc_stderr": 0.02656892101545715, + "acc_norm": 0.6862745098039216, + "acc_norm_stderr": 0.02656892101545715 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.7427652733118971, + "acc_stderr": 0.02482617128925089, + "acc_norm": 0.7427652733118971, + "acc_norm_stderr": 0.02482617128925089 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.7469135802469136, + "acc_stderr": 0.024191808600713002, + "acc_norm": 0.7469135802469136, + 
"acc_norm_stderr": 0.024191808600713002 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.49645390070921985, + "acc_stderr": 0.02982674915328092, + "acc_norm": 0.49645390070921985, + "acc_norm_stderr": 0.02982674915328092 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.4830508474576271, + "acc_stderr": 0.01276289688921086, + "acc_norm": 0.4830508474576271, + "acc_norm_stderr": 0.01276289688921086 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.625, + "acc_stderr": 0.029408372932278746, + "acc_norm": 0.625, + "acc_norm_stderr": 0.029408372932278746 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.6666666666666666, + "acc_stderr": 0.0190709855896875, + "acc_norm": 0.6666666666666666, + "acc_norm_stderr": 0.0190709855896875 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.7181818181818181, + "acc_stderr": 0.043091187099464585, + "acc_norm": 0.7181818181818181, + "acc_norm_stderr": 0.043091187099464585 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.7020408163265306, + "acc_stderr": 0.029279567411065677, + "acc_norm": 0.7020408163265306, + "acc_norm_stderr": 0.029279567411065677 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.8557213930348259, + "acc_stderr": 0.024845753212306053, + "acc_norm": 0.8557213930348259, + "acc_norm_stderr": 0.024845753212306053 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.86, + "acc_stderr": 0.03487350880197769, + "acc_norm": 0.86, + "acc_norm_stderr": 0.03487350880197769 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.536144578313253, + "acc_stderr": 0.038823108508905954, + "acc_norm": 0.536144578313253, + "acc_norm_stderr": 0.038823108508905954 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.8421052631578947, + "acc_stderr": 0.02796678585916089, + "acc_norm": 0.8421052631578947, + "acc_norm_stderr": 0.02796678585916089 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.379436964504284, + "mc1_stderr": 0.01698703926614298, + "mc2": 0.5350863764391227, + "mc2_stderr": 0.014861057577356304 + }, + "all": { + "acc": 0.6350222238594815, + "acc_stderr": 0.033022634388714986, + "acc_norm": 0.6390496137908216, + "acc_norm_stderr": 0.03299629522940671, + "mc1": 0.379436964504284, + "mc1_stderr": 0.01698703926614298, + "mc2": 0.5350863764391227, + "mc2_stderr": 0.014861057577356304 + } + }, + "versions": { + "harness|arc:challenge|25": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + 
"harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "all": 0 + }, + "config_general": { + "model_name": "openbmb/UltraLM-65b", + "model_sha": "38c0a06d99d4e2b32acffa79b011094b27118935", + "model_dtype": "torch.float16", + "lighteval_sha": "562505b4dff29b1764115bbdea7f8ef9120fa0cb", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null + }, + "config_tasks": { + "harness|arc:challenge": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM 
Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task" + }, + "summary_tasks": { + "harness|arc:challenge|25": { + "hashes": { + "hash_examples": "17b0cae357c0259e", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "61571bf68d6d89aa", + "hash_cont_tokens": "ede2b335438f08e9" + }, + "truncated": 0, + "non-truncated": 4687, + "padded": 4687, + "non-padded": 0, + "effective_few_shots": 25.0, + "num_truncated_few_shots": 0 + }, + "harness|hellaswag|10": { + "hashes": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "29906669b1c7054a", + "hash_cont_tokens": "b41cf1ad182d68d5" + }, + "truncated": 0, + "non-truncated": 40168, + "padded": 40113, + "non-padded": 55, + "effective_few_shots": 10.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hashes": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "c54ff61ad0273dd7", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + 
"num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-anatomy|5": { + "hashes": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "be31a1e22aef5f90", + "hash_cont_tokens": "f11971a765cb609f" + }, + "truncated": 0, + "non-truncated": 540, + "padded": 540, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-astronomy|5": { + "hashes": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "277a7b1fad566940", + "hash_cont_tokens": "238bd86950544b29" + }, + "truncated": 0, + "non-truncated": 608, + "padded": 608, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-business_ethics|5": { + "hashes": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "ba552605bc116de5", + "hash_cont_tokens": "f9d6d2a7d7e9a041" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hashes": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "428c7563d0b98ab9", + "hash_cont_tokens": "6af58623d0d5fbcd" + }, + "truncated": 0, + "non-truncated": 1060, + "padded": 1060, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_biology|5": { + "hashes": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "da036601573942e2", + "hash_cont_tokens": "875cde3af7a0ee14" + }, + "truncated": 0, + "non-truncated": 576, + "padded": 576, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_chemistry|5": { + "hashes": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "94e0196d6aded13d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_computer_science|5": { + "hashes": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "6e4d0f4a8d36690b", + "hash_cont_tokens": "1ba0c71186b1505e" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_mathematics|5": { + "hashes": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "614054d17109a25d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_medicine|5": { + "hashes": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "1d633b3cc0524ba8", + "hash_cont_tokens": "702fb6d82ff0d6ac" + }, + "truncated": 0, + "non-truncated": 692, + "padded": 692, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_physics|5": { + "hashes": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": 
"833a0d7b55aed500", + "hash_input_tokens": "5421d9a1af86cbd4", + "hash_cont_tokens": "f7b8097afc16a47c" + }, + "truncated": 0, + "non-truncated": 408, + "padded": 408, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-computer_security|5": { + "hashes": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "5e6b70ecb333cf18", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hashes": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "c2ef11a87264ceed", + "hash_cont_tokens": "aa0e8bc655f2f641" + }, + "truncated": 0, + "non-truncated": 940, + "padded": 940, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-econometrics|5": { + "hashes": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "ecaccd912a4c3978", + "hash_cont_tokens": "a9b1f761089f6acc" + }, + "truncated": 0, + "non-truncated": 456, + "padded": 456, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hashes": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "1590c84291399be8", + "hash_cont_tokens": "2425a3f084a591ef" + }, + "truncated": 0, + "non-truncated": 580, + "padded": 580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hashes": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "3269597f715b0da1", + "hash_cont_tokens": "eb2d5002052b5bc5" + }, + "truncated": 0, + "non-truncated": 1512, + "padded": 1512, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-formal_logic|5": { + "hashes": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "a2800d20f3ab8d7c", + "hash_cont_tokens": "9b30dc19c9b62f60" + }, + "truncated": 0, + "non-truncated": 504, + "padded": 504, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-global_facts|5": { + "hashes": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "94ed44b3772505ad", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_biology|5": { + "hashes": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "24423acb928db768", + "hash_cont_tokens": "74217a4e2868536f" + }, + "truncated": 0, + "non-truncated": 1240, + "padded": 1240, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hashes": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "831ff35c474e5cef", + "hash_cont_tokens": "bf39544be0ebf000" + }, + "truncated": 0, + "non-truncated": 
812, + "padded": 812, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hashes": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "8c34e0f2bda77358", + "hash_cont_tokens": "43570b3948564b64" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hashes": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "f1f73dd687da18d7", + "hash_cont_tokens": "674fc454bdc5ac93" + }, + "truncated": 660, + "non-truncated": 0, + "padded": 0, + "non-padded": 660, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_geography|5": { + "hashes": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "7c5547c7da5bc793", + "hash_cont_tokens": "03a5012b916274ea" + }, + "truncated": 0, + "non-truncated": 792, + "padded": 792, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hashes": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "f62991cb6a496b05", + "hash_cont_tokens": "50ab225c2f535210" + }, + "truncated": 0, + "non-truncated": 772, + "padded": 772, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hashes": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "4cef2aff6e3d59ed", + "hash_cont_tokens": "c583432ad27fcfe0" + }, + "truncated": 0, + "non-truncated": 1560, + "padded": 1560, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hashes": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "6e2577ea4082ed2b", + "hash_cont_tokens": "1194078d4e38c984" + }, + "truncated": 0, + "non-truncated": 1080, + "padded": 1080, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hashes": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "c5fc9aeb1079c8e4", + "hash_cont_tokens": "f47f041de50333b9" + }, + "truncated": 0, + "non-truncated": 952, + "padded": 952, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_physics|5": { + "hashes": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "555fc385cffa84ca", + "hash_cont_tokens": "6296151cf7fee15c" + }, + "truncated": 0, + "non-truncated": 604, + "padded": 604, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hashes": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "febd23cbf9973b7f", + "hash_cont_tokens": "a490d3db0ea5935a" + }, + "truncated": 0, + "non-truncated": 2180, + "padded": 2180, + "non-padded": 0, + "effective_few_shots": 5.0, 
+ "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hashes": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "424b02981230ee83", + "hash_cont_tokens": "6830ef7d0325d7ef" + }, + "truncated": 0, + "non-truncated": 864, + "padded": 864, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hashes": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "50c9ff438c85a69e", + "hash_cont_tokens": "cdd0b3dc06d933e5" + }, + "truncated": 816, + "non-truncated": 0, + "padded": 0, + "non-padded": 816, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hashes": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "054824cc474caef5", + "hash_cont_tokens": "e0203e3fc1bb0500" + }, + "truncated": 8, + "non-truncated": 940, + "padded": 940, + "non-padded": 8, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_aging|5": { + "hashes": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "541a75f071dcf579", + "hash_cont_tokens": "142a4a8a1138a214" + }, + "truncated": 0, + "non-truncated": 892, + "padded": 892, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_sexuality|5": { + "hashes": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "04269e5c5a257dd9", + "hash_cont_tokens": "bc54813e809b796d" + }, + "truncated": 0, + "non-truncated": 524, + "padded": 524, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-international_law|5": { + "hashes": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "d93ba9d9d38e4397", + "hash_cont_tokens": "63435df622d5437b" + }, + "truncated": 0, + "non-truncated": 484, + "padded": 484, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-jurisprudence|5": { + "hashes": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "9eeaccd2698b4f5a", + "hash_cont_tokens": "e3a8cd951b6e3469" + }, + "truncated": 0, + "non-truncated": 432, + "padded": 432, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hashes": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "b4f08f544f2b7576", + "hash_cont_tokens": "5e6ee2ff0404f23c" + }, + "truncated": 0, + "non-truncated": 652, + "padded": 648, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-machine_learning|5": { + "hashes": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "900c2a51f1174b9f", + "hash_cont_tokens": "c81919424db3b267" + }, + "truncated": 0, + "non-truncated": 448, + "padded": 448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-management|5": { + "hashes": { + "hash_examples": "8c8a1e07a2151dca", + 
"hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "6b36efb4689c6eca", + "hash_cont_tokens": "a01d6d39a83c4597" + }, + "truncated": 0, + "non-truncated": 412, + "padded": 412, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-marketing|5": { + "hashes": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "2aaac78a0cfed47a", + "hash_cont_tokens": "6aeaed4d823c98aa" + }, + "truncated": 0, + "non-truncated": 936, + "padded": 936, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-medical_genetics|5": { + "hashes": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "886ca823b41c094a", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-miscellaneous|5": { + "hashes": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "72fd71de7675e7d0", + "hash_cont_tokens": "9b0ab02a64603081" + }, + "truncated": 0, + "non-truncated": 3132, + "padded": 3132, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_disputes|5": { + "hashes": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "f3ca0dd8e7a1eb09", + "hash_cont_tokens": "3b8bbe9108e55ce9" + }, + "truncated": 0, + "non-truncated": 1384, + "padded": 1354, + "non-padded": 30, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hashes": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "3e793631e951f23c", + "hash_cont_tokens": "2eae753a177d5460" + }, + "truncated": 0, + "non-truncated": 3580, + "padded": 3580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-nutrition|5": { + "hashes": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "59753c2144ea93af", + "hash_cont_tokens": "29771089bd3c65c6" + }, + "truncated": 0, + "non-truncated": 1224, + "padded": 1224, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-philosophy|5": { + "hashes": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "bd8d3dbed15a8c34", + "hash_cont_tokens": "9f6ff69d23a48783" + }, + "truncated": 0, + "non-truncated": 1244, + "padded": 1244, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-prehistory|5": { + "hashes": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "3573cd87facbb7c5", + "hash_cont_tokens": "a789a13af22308bf" + }, + "truncated": 0, + "non-truncated": 1296, + "padded": 1296, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_accounting|5": { + "hashes": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "17e721bc1a7cbb47", + "hash_cont_tokens": "5129a9cfb30c5239" + }, + "truncated": 0, + "non-truncated": 1128, 
+ "padded": 1128, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_law|5": { + "hashes": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "9178e10bd0763ec4", + "hash_cont_tokens": "2e590029ef41fbcd" + }, + "truncated": 604, + "non-truncated": 5532, + "padded": 5524, + "non-padded": 612, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_medicine|5": { + "hashes": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "f5a22012a54f70ea", + "hash_cont_tokens": "cd82e108370cece8" + }, + "truncated": 0, + "non-truncated": 1088, + "padded": 1088, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_psychology|5": { + "hashes": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "0dfb73a8eb3f692c", + "hash_cont_tokens": "61ef0c8a87f9c92d" + }, + "truncated": 0, + "non-truncated": 2448, + "padded": 2448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-public_relations|5": { + "hashes": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "1710c6ba4c9f3cbd", + "hash_cont_tokens": "568f585a259965c1" + }, + "truncated": 0, + "non-truncated": 440, + "padded": 440, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-security_studies|5": { + "hashes": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "d49711415961ced7", + "hash_cont_tokens": "d70cfe096d4fb7bd" + }, + "truncated": 0, + "non-truncated": 980, + "padded": 980, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-sociology|5": { + "hashes": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "828999f7624cbe7e", + "hash_cont_tokens": "c3a3bdfd177eed5b" + }, + "truncated": 0, + "non-truncated": 804, + "padded": 804, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hashes": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "42054621e718dbee", + "hash_cont_tokens": "2568d0e8e36fa959" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-virology|5": { + "hashes": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "6c4f0aa4dc859c04", + "hash_cont_tokens": "c178cccd753d9bc5" + }, + "truncated": 0, + "non-truncated": 664, + "padded": 664, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-world_religions|5": { + "hashes": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "6c75d44e092ff24f", + "hash_cont_tokens": "0a3a3ea5ef49d19c" + }, + "truncated": 0, + "non-truncated": 684, + "padded": 684, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|truthfulqa:mc|0": { + "hashes": { + 
"hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "2738d7ed7075faa7", + "hash_cont_tokens": "6d1691881e252df0" + }, + "truncated": 0, + "non-truncated": 9996, + "padded": 9996, + "non-padded": 0, + "effective_few_shots": 0.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "d84d18e9a963753d", + "hash_full_prompts": "12b540783521a8e6", + "hash_input_tokens": "6fecf578c508db6a", + "hash_cont_tokens": "f4b7b7f3a2788768" + }, + "total_evaluation_time_secondes": "26181.887605667114", + "truncated": 2088, + "non-truncated": 108931, + "padded": 108834, + "non-padded": 2185, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/openbmb/UltraLM-65b/results_2023-09-18T23-27-44.207127.json b/eval-results/openbmb/UltraLM-65b/results_2023-09-18T23-27-44.207127.json new file mode 100644 index 0000000000000000000000000000000000000000..0bc94d6e18483c7e3454b927f8c540ddc387128e --- /dev/null +++ b/eval-results/openbmb/UltraLM-65b/results_2023-09-18T23-27-44.207127.json @@ -0,0 +1,107 @@ +{ + "config_general": { + "model_name": "openbmb/UltraLM-65b", + "model_sha": "c0f13e08a9d784355ccf843d4d6f2f617ceba7f7", + "model_size": "121.68 GB", + "model_dtype": "torch.float16", + "lighteval_sha": "0f318ecf002208468154899217b3ba7c6ae09374", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|drop|3": { + "em": 0.0, + "em_stderr": 0.0, + "f1": 0.0, + "f1_stderr": 0.0 + }, + "harness|gsm8k|5": { + "acc": 0.0, + "acc_stderr": 0.0 + }, + "harness|winogrande|5": { + "acc": 0.4956590370955012, + "acc_stderr": 0.014051956064076911 + }, + "all": { + "em": 0.0, + "em_stderr": 0.0, + "f1": 0.0, + "f1_stderr": 0.0, + "acc": 0.2478295185477506, + "acc_stderr": 0.007025978032038456 + } + }, + "versions": { + "harness|drop|3": 1, + "harness|gsm8k|5": 0, + "harness|winogrande|5": 0, + "all": 0 + }, + "config_tasks": { + "harness|drop": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|drop|3": { + "hashes": { + "hash_examples": "1d27416e8324e9a3", + "hash_full_prompts": "a5513ff9a741b385", + "hash_input_tokens": "61b608e0b5ceed76", + "hash_cont_tokens": "4188a613ea19125e" + }, + "truncated": 1263, + "non-truncated": 8273, + "padded": 0, + "non-padded": 9536, + "effective_few_shots": 3.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "bda342e47b5099b2", + "hash_cont_tokens": "69fa6f37b46b746b" + }, + "truncated": 0, + "non-truncated": 1319, + "padded": 0, + "non-padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "c0bedf98cb040854", + "hash_cont_tokens": "f08975ad6f2d5864" + }, + "truncated": 0, + "non-truncated": 2534, + "padded": 2432, + "non-padded": 102, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "9b4d8993161e637d", + "hash_full_prompts": "08215e527b7e60a5", + "hash_input_tokens": "80afe720f936f8d2", + "hash_cont_tokens": "07d4baa45880dc39" + }, + "total_evaluation_time_secondes": "139112.5233502388", + "truncated": 1263, + 
"non-truncated": 12126, + "padded": 2432, + "non-padded": 10957, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/openbmb/UltraLM-65b/results_2023-09-23T05-14-21.286059.json b/eval-results/openbmb/UltraLM-65b/results_2023-09-23T05-14-21.286059.json new file mode 100644 index 0000000000000000000000000000000000000000..4e817c3684260bf48fe4a62a1ed59a2f9f0517dd --- /dev/null +++ b/eval-results/openbmb/UltraLM-65b/results_2023-09-23T05-14-21.286059.json @@ -0,0 +1,107 @@ +{ + "config_general": { + "model_name": "openbmb/UltraLM-65b", + "model_sha": "c0f13e08a9d784355ccf843d4d6f2f617ceba7f7", + "model_size": "121.68 GB", + "model_dtype": "torch.float16", + "lighteval_sha": "0f318ecf002208468154899217b3ba7c6ae09374", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|drop|3": { + "em": 0.23804530201342283, + "em_stderr": 0.004361481495925771, + "f1": 0.2999853187919465, + "f1_stderr": 0.004304795126990332 + }, + "harness|gsm8k|5": { + "acc": 0.32752084912812734, + "acc_stderr": 0.012927102210426474 + }, + "harness|winogrande|5": { + "acc": 0.8113654301499605, + "acc_stderr": 0.010995172318019811 + }, + "all": { + "em": 0.23804530201342283, + "em_stderr": 0.004361481495925771, + "f1": 0.2999853187919465, + "f1_stderr": 0.004304795126990332, + "acc": 0.5694431396390439, + "acc_stderr": 0.011961137264223144 + } + }, + "versions": { + "harness|drop|3": 1, + "harness|gsm8k|5": 0, + "harness|winogrande|5": 0, + "all": 0 + }, + "config_tasks": { + "harness|drop": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|drop|3": { + "hashes": { + "hash_examples": "1d27416e8324e9a3", + "hash_full_prompts": "a5513ff9a741b385", + "hash_input_tokens": "61b608e0b5ceed76", + "hash_cont_tokens": "e3b00f371c931c95" + }, + "truncated": 1263, + "non-truncated": 8273, + "padded": 0, + "non-padded": 9536, + "effective_few_shots": 3.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "bda342e47b5099b2", + "hash_cont_tokens": "05a74375b8b772b0" + }, + "truncated": 0, + "non-truncated": 1319, + "padded": 0, + "non-padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "c0bedf98cb040854", + "hash_cont_tokens": "f08975ad6f2d5864" + }, + "truncated": 0, + "non-truncated": 2534, + "padded": 2432, + "non-padded": 102, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "9b4d8993161e637d", + "hash_full_prompts": "08215e527b7e60a5", + "hash_input_tokens": "80afe720f936f8d2", + "hash_cont_tokens": "f61fee552a8a2c51" + }, + "total_evaluation_time_secondes": "39725.53334212303", + "truncated": 1263, + "non-truncated": 12126, + "padded": 2432, + "non-padded": 10957, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/openbmb/UltraRM-13b/results_2023-10-08T20-45-47.827028.json b/eval-results/openbmb/UltraRM-13b/results_2023-10-08T20-45-47.827028.json new file mode 100644 index 0000000000000000000000000000000000000000..e5e4dbc5219c6f0f412acc664e4b387833b4b352 --- /dev/null +++ 
b/eval-results/openbmb/UltraRM-13b/results_2023-10-08T20-45-47.827028.json @@ -0,0 +1,1367 @@ +{ + "config_general": { + "model_name": "openbmb/UltraRM-13b", + "model_sha": "4b231ae58c15244e6e15f0d2f4e26ec37b846229", + "model_size": "24.32 GB", + "model_dtype": "torch.float16", + "lighteval_sha": "0f318ecf002208468154899217b3ba7c6ae09374", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|arc:challenge|25": { + "acc": 0.22781569965870307, + "acc_stderr": 0.01225670860232692, + "acc_norm": 0.2815699658703072, + "acc_norm_stderr": 0.013143376735009014 + }, + "harness|hellaswag|10": { + "acc": 0.25542720573590916, + "acc_stderr": 0.004352098082984431, + "acc_norm": 0.2613025293766182, + "acc_norm_stderr": 0.004384465219070759 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.31, + "acc_stderr": 0.04648231987117316, + "acc_norm": 0.31, + "acc_norm_stderr": 0.04648231987117316 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.3333333333333333, + "acc_stderr": 0.04072314811876837, + "acc_norm": 0.3333333333333333, + "acc_norm_stderr": 0.04072314811876837 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.2631578947368421, + "acc_stderr": 0.03583496176361064, + "acc_norm": 0.2631578947368421, + "acc_norm_stderr": 0.03583496176361064 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.22, + "acc_stderr": 0.04163331998932269, + "acc_norm": 0.22, + "acc_norm_stderr": 0.04163331998932269 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.2188679245283019, + "acc_stderr": 0.02544786382510861, + "acc_norm": 0.2188679245283019, + "acc_norm_stderr": 0.02544786382510861 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.2986111111111111, + "acc_stderr": 0.038270523579507554, + "acc_norm": 0.2986111111111111, + "acc_norm_stderr": 0.038270523579507554 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.28, + "acc_stderr": 0.045126085985421276, + "acc_norm": 0.28, + "acc_norm_stderr": 0.045126085985421276 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.29, + "acc_stderr": 0.045604802157206845, + "acc_norm": 0.29, + "acc_norm_stderr": 0.045604802157206845 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.32, + "acc_stderr": 0.04688261722621503, + "acc_norm": 0.32, + "acc_norm_stderr": 0.04688261722621503 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.20809248554913296, + "acc_stderr": 0.030952890217749888, + "acc_norm": 0.20809248554913296, + "acc_norm_stderr": 0.030952890217749888 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.23529411764705882, + "acc_stderr": 0.04220773659171452, + "acc_norm": 0.23529411764705882, + "acc_norm_stderr": 0.04220773659171452 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.28, + "acc_stderr": 0.045126085985421276, + "acc_norm": 0.28, + "acc_norm_stderr": 0.045126085985421276 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.20425531914893616, + "acc_stderr": 0.026355158413349424, + "acc_norm": 0.20425531914893616, + "acc_norm_stderr": 0.026355158413349424 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.2631578947368421, + "acc_stderr": 0.0414243971948936, + "acc_norm": 0.2631578947368421, + "acc_norm_stderr": 0.0414243971948936 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.296551724137931, + "acc_stderr": 0.03806142687309993, + "acc_norm": 0.296551724137931, + 
"acc_norm_stderr": 0.03806142687309993 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.21428571428571427, + "acc_stderr": 0.021132859182754427, + "acc_norm": 0.21428571428571427, + "acc_norm_stderr": 0.021132859182754427 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.25396825396825395, + "acc_stderr": 0.03893259610604673, + "acc_norm": 0.25396825396825395, + "acc_norm_stderr": 0.03893259610604673 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.33, + "acc_stderr": 0.04725815626252604, + "acc_norm": 0.33, + "acc_norm_stderr": 0.04725815626252604 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.24838709677419354, + "acc_stderr": 0.024580028921481003, + "acc_norm": 0.24838709677419354, + "acc_norm_stderr": 0.024580028921481003 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.30049261083743845, + "acc_stderr": 0.03225799476233484, + "acc_norm": 0.30049261083743845, + "acc_norm_stderr": 0.03225799476233484 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.26, + "acc_stderr": 0.04408440022768078, + "acc_norm": 0.26, + "acc_norm_stderr": 0.04408440022768078 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.21818181818181817, + "acc_stderr": 0.03225078108306289, + "acc_norm": 0.21818181818181817, + "acc_norm_stderr": 0.03225078108306289 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.25757575757575757, + "acc_stderr": 0.03115626951964683, + "acc_norm": 0.25757575757575757, + "acc_norm_stderr": 0.03115626951964683 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.20725388601036268, + "acc_stderr": 0.029252823291803638, + "acc_norm": 0.20725388601036268, + "acc_norm_stderr": 0.029252823291803638 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.2128205128205128, + "acc_stderr": 0.020752423722128013, + "acc_norm": 0.2128205128205128, + "acc_norm_stderr": 0.020752423722128013 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.27037037037037037, + "acc_stderr": 0.027080372815145665, + "acc_norm": 0.27037037037037037, + "acc_norm_stderr": 0.027080372815145665 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.21008403361344538, + "acc_stderr": 0.026461398717471874, + "acc_norm": 0.21008403361344538, + "acc_norm_stderr": 0.026461398717471874 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.31788079470198677, + "acc_stderr": 0.03802039760107903, + "acc_norm": 0.31788079470198677, + "acc_norm_stderr": 0.03802039760107903 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.26972477064220185, + "acc_stderr": 0.01902848671111544, + "acc_norm": 0.26972477064220185, + "acc_norm_stderr": 0.01902848671111544 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.3194444444444444, + "acc_stderr": 0.03179876342176852, + "acc_norm": 0.3194444444444444, + "acc_norm_stderr": 0.03179876342176852 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.2647058823529412, + "acc_stderr": 0.0309645179269234, + "acc_norm": 0.2647058823529412, + "acc_norm_stderr": 0.0309645179269234 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.32489451476793246, + "acc_stderr": 0.030486039389105303, + "acc_norm": 0.32489451476793246, + "acc_norm_stderr": 0.030486039389105303 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.20179372197309417, + "acc_stderr": 0.026936111912802273, + 
"acc_norm": 0.20179372197309417, + "acc_norm_stderr": 0.026936111912802273 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.22900763358778625, + "acc_stderr": 0.036853466317118506, + "acc_norm": 0.22900763358778625, + "acc_norm_stderr": 0.036853466317118506 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.2727272727272727, + "acc_stderr": 0.04065578140908705, + "acc_norm": 0.2727272727272727, + "acc_norm_stderr": 0.04065578140908705 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.2222222222222222, + "acc_stderr": 0.040191074725573483, + "acc_norm": 0.2222222222222222, + "acc_norm_stderr": 0.040191074725573483 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.294478527607362, + "acc_stderr": 0.03581165790474082, + "acc_norm": 0.294478527607362, + "acc_norm_stderr": 0.03581165790474082 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.22321428571428573, + "acc_stderr": 0.039523019677025116, + "acc_norm": 0.22321428571428573, + "acc_norm_stderr": 0.039523019677025116 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.1941747572815534, + "acc_stderr": 0.039166677628225836, + "acc_norm": 0.1941747572815534, + "acc_norm_stderr": 0.039166677628225836 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.2564102564102564, + "acc_stderr": 0.02860595370200425, + "acc_norm": 0.2564102564102564, + "acc_norm_stderr": 0.02860595370200425 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.2, + "acc_stderr": 0.040201512610368445, + "acc_norm": 0.2, + "acc_norm_stderr": 0.040201512610368445 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.26947637292464877, + "acc_stderr": 0.01586624307321506, + "acc_norm": 0.26947637292464877, + "acc_norm_stderr": 0.01586624307321506 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.28901734104046245, + "acc_stderr": 0.024405173935783238, + "acc_norm": 0.28901734104046245, + "acc_norm_stderr": 0.024405173935783238 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.27039106145251396, + "acc_stderr": 0.014854993938010088, + "acc_norm": 0.27039106145251396, + "acc_norm_stderr": 0.014854993938010088 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.30392156862745096, + "acc_stderr": 0.026336613469046633, + "acc_norm": 0.30392156862745096, + "acc_norm_stderr": 0.026336613469046633 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.2958199356913183, + "acc_stderr": 0.025922371788818798, + "acc_norm": 0.2958199356913183, + "acc_norm_stderr": 0.025922371788818798 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.2777777777777778, + "acc_stderr": 0.02492200116888633, + "acc_norm": 0.2777777777777778, + "acc_norm_stderr": 0.02492200116888633 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.24468085106382978, + "acc_stderr": 0.025645553622266726, + "acc_norm": 0.24468085106382978, + "acc_norm_stderr": 0.025645553622266726 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.2392438070404172, + "acc_stderr": 0.01089612365267665, + "acc_norm": 0.2392438070404172, + "acc_norm_stderr": 0.01089612365267665 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.24632352941176472, + "acc_stderr": 0.02617343857052, + "acc_norm": 0.24632352941176472, + "acc_norm_stderr": 0.02617343857052 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.2369281045751634, + "acc_stderr": 0.017201662169789782, + "acc_norm": 0.2369281045751634, + "acc_norm_stderr": 0.017201662169789782 + }, + 
"harness|hendrycksTest-public_relations|5": { + "acc": 0.2, + "acc_stderr": 0.03831305140884603, + "acc_norm": 0.2, + "acc_norm_stderr": 0.03831305140884603 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.3183673469387755, + "acc_stderr": 0.029822533793982055, + "acc_norm": 0.3183673469387755, + "acc_norm_stderr": 0.029822533793982055 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.2537313432835821, + "acc_stderr": 0.03076944496729601, + "acc_norm": 0.2537313432835821, + "acc_norm_stderr": 0.03076944496729601 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.26, + "acc_stderr": 0.04408440022768078, + "acc_norm": 0.26, + "acc_norm_stderr": 0.04408440022768078 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.20481927710843373, + "acc_stderr": 0.03141784291663926, + "acc_norm": 0.20481927710843373, + "acc_norm_stderr": 0.03141784291663926 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.29239766081871343, + "acc_stderr": 0.034886477134579215, + "acc_norm": 0.29239766081871343, + "acc_norm_stderr": 0.034886477134579215 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.23133414932680538, + "mc1_stderr": 0.014761945174862658, + "mc2": 0.4790548760214852, + "mc2_stderr": 0.016309927278874654 + }, + "all": { + "acc": 0.2590094885923756, + "acc_stderr": 0.03189328196381238, + "acc_norm": 0.26002015960682157, + "acc_norm_stderr": 0.031908858832774556, + "mc1": 0.23133414932680538, + "mc1_stderr": 0.014761945174862658, + "mc2": 0.4790548760214852, + "mc2_stderr": 0.016309927278874654 + } + }, + "versions": { + "harness|arc:challenge|25": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + 
"harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "all": 0 + }, + "config_tasks": { + "harness|arc:challenge": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + 
"harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task" + }, + "summary_tasks": { + "harness|arc:challenge|25": { + "hashes": { + "hash_examples": "17b0cae357c0259e", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "3722289b79076c44", + "hash_cont_tokens": "e8abf848493b50f7" + }, + "truncated": 0, + "non-truncated": 4687, + "padded": 4687, + "non-padded": 0, + "effective_few_shots": 25.0, + "num_truncated_few_shots": 0 + }, + "harness|hellaswag|10": { + "hashes": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "ececd684171f1ef2", + "hash_cont_tokens": "9fe0a5c42e1532db" + }, + "truncated": 0, + "non-truncated": 40168, + "padded": 40113, + "non-padded": 55, + "effective_few_shots": 10.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hashes": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "c54ff61ad0273dd7", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-anatomy|5": { + "hashes": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "be31a1e22aef5f90", + "hash_cont_tokens": "f11971a765cb609f" + }, + "truncated": 0, + "non-truncated": 540, + "padded": 540, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-astronomy|5": { + "hashes": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "277a7b1fad566940", + "hash_cont_tokens": "440a970fadecdc7b" + }, + "truncated": 0, + "non-truncated": 608, + "padded": 608, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-business_ethics|5": { + "hashes": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "ba552605bc116de5", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + 
"num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hashes": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "428c7563d0b98ab9", + "hash_cont_tokens": "7ecd60c25b9bfe5b" + }, + "truncated": 0, + "non-truncated": 1060, + "padded": 1060, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_biology|5": { + "hashes": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "da036601573942e2", + "hash_cont_tokens": "875cde3af7a0ee14" + }, + "truncated": 0, + "non-truncated": 576, + "padded": 576, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_chemistry|5": { + "hashes": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "94e0196d6aded13d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_computer_science|5": { + "hashes": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "6e4d0f4a8d36690b", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_mathematics|5": { + "hashes": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "614054d17109a25d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_medicine|5": { + "hashes": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "081bb2b524defd1c", + "hash_cont_tokens": "702fb6d82ff0d6ac" + }, + "truncated": 0, + "non-truncated": 692, + "padded": 692, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_physics|5": { + "hashes": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "5421d9a1af86cbd4", + "hash_cont_tokens": "f7b8097afc16a47c" + }, + "truncated": 0, + "non-truncated": 408, + "padded": 408, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-computer_security|5": { + "hashes": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "5e6b70ecb333cf18", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hashes": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "c2ef11a87264ceed", + "hash_cont_tokens": "aa0e8bc655f2f641" + }, + "truncated": 0, + "non-truncated": 940, + "padded": 940, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-econometrics|5": { + "hashes": { + "hash_examples": "64d3623b0bfaa43f", + 
"hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "ecaccd912a4c3978", + "hash_cont_tokens": "b1cc6e7e9fcd3827" + }, + "truncated": 0, + "non-truncated": 456, + "padded": 456, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hashes": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "1590c84291399be8", + "hash_cont_tokens": "2425a3f084a591ef" + }, + "truncated": 0, + "non-truncated": 580, + "padded": 580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hashes": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "3269597f715b0da1", + "hash_cont_tokens": "bd87bf0c060fd925" + }, + "truncated": 0, + "non-truncated": 1512, + "padded": 1512, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-formal_logic|5": { + "hashes": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "a2800d20f3ab8d7c", + "hash_cont_tokens": "eb8932890e0605db" + }, + "truncated": 0, + "non-truncated": 504, + "padded": 504, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-global_facts|5": { + "hashes": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "94ed44b3772505ad", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_biology|5": { + "hashes": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "24423acb928db768", + "hash_cont_tokens": "1ddcb86d28cde266" + }, + "truncated": 0, + "non-truncated": 1240, + "padded": 1240, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hashes": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "831ff35c474e5cef", + "hash_cont_tokens": "176c8dcff38c5f8f" + }, + "truncated": 0, + "non-truncated": 812, + "padded": 812, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hashes": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "a20a96b44dcc5b30", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hashes": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "5002f4ac8b1562ca", + "hash_cont_tokens": "674fc454bdc5ac93" + }, + "truncated": 0, + "non-truncated": 660, + "padded": 656, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_geography|5": { + "hashes": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "7c5547c7da5bc793", + "hash_cont_tokens": 
"03a5012b916274ea" + }, + "truncated": 0, + "non-truncated": 792, + "padded": 792, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hashes": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "f62991cb6a496b05", + "hash_cont_tokens": "873d2aab226ba1d8" + }, + "truncated": 0, + "non-truncated": 772, + "padded": 772, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hashes": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "4cef2aff6e3d59ed", + "hash_cont_tokens": "c583432ad27fcfe0" + }, + "truncated": 0, + "non-truncated": 1560, + "padded": 1560, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hashes": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "6e2577ea4082ed2b", + "hash_cont_tokens": "d7907b61bcb8c123" + }, + "truncated": 0, + "non-truncated": 1080, + "padded": 1080, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hashes": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "c5fc9aeb1079c8e4", + "hash_cont_tokens": "f47f041de50333b9" + }, + "truncated": 0, + "non-truncated": 952, + "padded": 952, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_physics|5": { + "hashes": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "555fc385cffa84ca", + "hash_cont_tokens": "0d56317b3e5eedb5" + }, + "truncated": 0, + "non-truncated": 604, + "padded": 604, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hashes": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "febd23cbf9973b7f", + "hash_cont_tokens": "09ba1243e7390c0f" + }, + "truncated": 0, + "non-truncated": 2180, + "padded": 2180, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hashes": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "400e55b56ee6fbd7", + "hash_cont_tokens": "9cc29889c3d3f77d" + }, + "truncated": 0, + "non-truncated": 864, + "padded": 864, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hashes": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "c639cce12a46ebad", + "hash_cont_tokens": "cdd0b3dc06d933e5" + }, + "truncated": 0, + "non-truncated": 816, + "padded": 816, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hashes": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "b9762065cce6f3a6", + "hash_cont_tokens": "e02816433ff28daf" + }, + "truncated": 0, + "non-truncated": 948, + "padded": 
948, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_aging|5": { + "hashes": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "541a75f071dcf579", + "hash_cont_tokens": "142a4a8a1138a214" + }, + "truncated": 0, + "non-truncated": 892, + "padded": 892, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_sexuality|5": { + "hashes": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "04269e5c5a257dd9", + "hash_cont_tokens": "bc54813e809b796d" + }, + "truncated": 0, + "non-truncated": 524, + "padded": 524, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-international_law|5": { + "hashes": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "d93ba9d9d38e4397", + "hash_cont_tokens": "8ea8c5ff76a15bca" + }, + "truncated": 0, + "non-truncated": 484, + "padded": 484, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-jurisprudence|5": { + "hashes": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "9eeaccd2698b4f5a", + "hash_cont_tokens": "e3a8cd951b6e3469" + }, + "truncated": 0, + "non-truncated": 432, + "padded": 432, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hashes": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "b4f08f544f2b7576", + "hash_cont_tokens": "3e9e0bdc248fd88a" + }, + "truncated": 0, + "non-truncated": 652, + "padded": 648, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-machine_learning|5": { + "hashes": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "900c2a51f1174b9f", + "hash_cont_tokens": "55b12fb138c6a064" + }, + "truncated": 0, + "non-truncated": 448, + "padded": 448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-management|5": { + "hashes": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "6b36efb4689c6eca", + "hash_cont_tokens": "a01d6d39a83c4597" + }, + "truncated": 0, + "non-truncated": 412, + "padded": 412, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-marketing|5": { + "hashes": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "2aaac78a0cfed47a", + "hash_cont_tokens": "6aeaed4d823c98aa" + }, + "truncated": 0, + "non-truncated": 936, + "padded": 936, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-medical_genetics|5": { + "hashes": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "886ca823b41c094a", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-miscellaneous|5": { + "hashes": { + "hash_examples": 
"41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "72fd71de7675e7d0", + "hash_cont_tokens": "9b0ab02a64603081" + }, + "truncated": 0, + "non-truncated": 3132, + "padded": 3132, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_disputes|5": { + "hashes": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "f3ca0dd8e7a1eb09", + "hash_cont_tokens": "3b8bbe9108e55ce9" + }, + "truncated": 0, + "non-truncated": 1384, + "padded": 1354, + "non-padded": 30, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hashes": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "3e793631e951f23c", + "hash_cont_tokens": "3e9bfc0362e97330" + }, + "truncated": 0, + "non-truncated": 3580, + "padded": 3580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-nutrition|5": { + "hashes": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "59753c2144ea93af", + "hash_cont_tokens": "23b2dc6ee2da4cfc" + }, + "truncated": 0, + "non-truncated": 1224, + "padded": 1224, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-philosophy|5": { + "hashes": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "bd8d3dbed15a8c34", + "hash_cont_tokens": "9f6ff69d23a48783" + }, + "truncated": 0, + "non-truncated": 1244, + "padded": 1244, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-prehistory|5": { + "hashes": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "3573cd87facbb7c5", + "hash_cont_tokens": "d6458d743d875837" + }, + "truncated": 0, + "non-truncated": 1296, + "padded": 1296, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_accounting|5": { + "hashes": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "17e721bc1a7cbb47", + "hash_cont_tokens": "922a195f53a35662" + }, + "truncated": 0, + "non-truncated": 1128, + "padded": 1128, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_law|5": { + "hashes": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "c9f7583fff66d361", + "hash_cont_tokens": "2e590029ef41fbcd" + }, + "truncated": 0, + "non-truncated": 6136, + "padded": 6136, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_medicine|5": { + "hashes": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "40a933f829116f8d", + "hash_cont_tokens": "7cfee54dbddd5a98" + }, + "truncated": 0, + "non-truncated": 1088, + "padded": 1088, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_psychology|5": { + "hashes": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "0dfb73a8eb3f692c", + "hash_cont_tokens": 
"a86677b2a45c20e1" + }, + "truncated": 0, + "non-truncated": 2448, + "padded": 2448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-public_relations|5": { + "hashes": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "1710c6ba4c9f3cbd", + "hash_cont_tokens": "0d756ccaae031757" + }, + "truncated": 0, + "non-truncated": 440, + "padded": 440, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-security_studies|5": { + "hashes": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "32a03f1f22a6e103", + "hash_cont_tokens": "b2229bc2cfbf594b" + }, + "truncated": 0, + "non-truncated": 980, + "padded": 980, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-sociology|5": { + "hashes": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "828999f7624cbe7e", + "hash_cont_tokens": "c3a3bdfd177eed5b" + }, + "truncated": 0, + "non-truncated": 804, + "padded": 804, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hashes": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "42054621e718dbee", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-virology|5": { + "hashes": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "6c4f0aa4dc859c04", + "hash_cont_tokens": "af8b3658088cb37f" + }, + "truncated": 0, + "non-truncated": 664, + "padded": 664, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-world_religions|5": { + "hashes": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "6c75d44e092ff24f", + "hash_cont_tokens": "060118bef6de4e0a" + }, + "truncated": 0, + "non-truncated": 684, + "padded": 684, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|truthfulqa:mc|0": { + "hashes": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "2738d7ed7075faa7", + "hash_cont_tokens": "f5da56a132aab151" + }, + "truncated": 0, + "non-truncated": 9996, + "padded": 9996, + "non-padded": 0, + "effective_few_shots": 0.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "d84d18e9a963753d", + "hash_full_prompts": "12b540783521a8e6", + "hash_input_tokens": "5c73a7dce6ccf737", + "hash_cont_tokens": "71d56183130fecbd" + }, + "total_evaluation_time_secondes": "6363.167981147766", + "truncated": 0, + "non-truncated": 111019, + "padded": 110926, + "non-padded": 93, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/openbmb/UltraRM-13b/results_2023-10-24T08-13-56.124311.json b/eval-results/openbmb/UltraRM-13b/results_2023-10-24T08-13-56.124311.json new file mode 100644 index 0000000000000000000000000000000000000000..78310ff1e67fe71289911eb868c10ccc27c23490 --- /dev/null +++ 
b/eval-results/openbmb/UltraRM-13b/results_2023-10-24T08-13-56.124311.json @@ -0,0 +1,107 @@ +{ + "config_general": { + "model_name": "openbmb/UltraRM-13b", + "model_sha": "a5b63d2f9531c25b65040981b4f2d8044e5fa378", + "model_size": "24.32 GB", + "model_dtype": "torch.float16", + "lighteval_sha": "0f318ecf002208468154899217b3ba7c6ae09374", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|drop|3": { + "em": 0.0, + "em_stderr": 0.0, + "f1": 0.0, + "f1_stderr": 0.0 + }, + "harness|gsm8k|5": { + "acc": 0.0, + "acc_stderr": 0.0 + }, + "harness|winogrande|5": { + "acc": 0.49329123914759276, + "acc_stderr": 0.014051220692330349 + }, + "all": { + "em": 0.0, + "em_stderr": 0.0, + "f1": 0.0, + "f1_stderr": 0.0, + "acc": 0.24664561957379638, + "acc_stderr": 0.0070256103461651745 + } + }, + "versions": { + "harness|drop|3": 1, + "harness|gsm8k|5": 0, + "harness|winogrande|5": 0, + "all": 0 + }, + "config_tasks": { + "harness|drop": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|drop|3": { + "hashes": { + "hash_examples": "1d27416e8324e9a3", + "hash_full_prompts": "a5513ff9a741b385", + "hash_input_tokens": "42076f0efbb50aa6", + "hash_cont_tokens": "04d93fc920cc7160" + }, + "truncated": 3, + "non-truncated": 9533, + "padded": 0, + "non-padded": 9536, + "effective_few_shots": 3.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "bda342e47b5099b2", + "hash_cont_tokens": "81f5a62c3b3dd723" + }, + "truncated": 0, + "non-truncated": 1319, + "padded": 0, + "non-padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "c0bedf98cb040854", + "hash_cont_tokens": "f08975ad6f2d5864" + }, + "truncated": 0, + "non-truncated": 2534, + "padded": 2432, + "non-padded": 102, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "9b4d8993161e637d", + "hash_full_prompts": "08215e527b7e60a5", + "hash_input_tokens": "a12f3e3c934bd78b", + "hash_cont_tokens": "22176751052b8d55" + }, + "total_evaluation_time_secondes": "33946.37185168266", + "truncated": 3, + "non-truncated": 13386, + "padded": 2432, + "non-padded": 10957, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/openbmb/UltraRM-13b/results_2023-12-02T13-26-56.823138.json b/eval-results/openbmb/UltraRM-13b/results_2023-12-02T13-26-56.823138.json new file mode 100644 index 0000000000000000000000000000000000000000..86d0728af2fc22ec85722bfc4557eb9dc0ccb443 --- /dev/null +++ b/eval-results/openbmb/UltraRM-13b/results_2023-12-02T13-26-56.823138.json @@ -0,0 +1,63 @@ +{ + "config_general": { + "lighteval_sha": "b35d4d84573be82d91c07ea46260f262f72cf69d", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "", + "start_time": 1382322.507106528, + "end_time": 1386719.703191172, + "total_evaluation_time_secondes": "4397.19608464418", + "model_name": "openbmb/UltraRM-13b", + "model_sha": "a5b63d2f9531c25b65040981b4f2d8044e5fa378", + "model_dtype": "torch.float16", + "model_size": "24.32 GB" + }, + "results": { + 
"harness|gsm8k|5": { + "acc": 0.0, + "acc_stderr": 0.0 + }, + "all": { + "acc": 0.0, + "acc_stderr": 0.0 + } + }, + "versions": { + "all": 0, + "harness|gsm8k|5": 0 + }, + "config_tasks": { + "harness|gsm8k": "LM Harness task" + }, + "summary_tasks": { + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "bda342e47b5099b2", + "hash_cont_tokens": "eb2bd0c4c3d69eab" + }, + "truncated": 0, + "non_truncated": 1319, + "padded": 0, + "non_padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "18b756b7813d1bdf", + "hash_full_prompts": "deb3b1dff10b95aa", + "hash_input_tokens": "42036645de5ac59d", + "hash_cont_tokens": "ced6e4f5433e1c36" + }, + "truncated": 0, + "non_truncated": 1319, + "padded": 0, + "non_padded": 1319, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/openthaigpt/openthaigpt-1.0.0-alpha-7b-chat-ckpt-hf/results_2023-08-18T12-43-45.904593.json b/eval-results/openthaigpt/openthaigpt-1.0.0-alpha-7b-chat-ckpt-hf/results_2023-08-18T12-43-45.904593.json new file mode 100644 index 0000000000000000000000000000000000000000..6cc8f07231744f6bc6e294c403dce3646cb6a365 --- /dev/null +++ b/eval-results/openthaigpt/openthaigpt-1.0.0-alpha-7b-chat-ckpt-hf/results_2023-08-18T12-43-45.904593.json @@ -0,0 +1,1365 @@ +{ + "results": { + "harness|arc:challenge|25": { + "acc": 0.48293515358361777, + "acc_stderr": 0.014602878388536598, + "acc_norm": 0.5085324232081911, + "acc_norm_stderr": 0.014609263165632179 + }, + "harness|hellaswag|10": { + "acc": 0.5540728938458475, + "acc_stderr": 0.004960516570284905, + "acc_norm": 0.7488548097988449, + "acc_norm_stderr": 0.0043278555884664165 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.34, + "acc_stderr": 0.047609522856952365, + "acc_norm": 0.34, + "acc_norm_stderr": 0.047609522856952365 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.43703703703703706, + "acc_stderr": 0.04284958639753399, + "acc_norm": 0.43703703703703706, + "acc_norm_stderr": 0.04284958639753399 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.45394736842105265, + "acc_stderr": 0.040516463428741434, + "acc_norm": 0.45394736842105265, + "acc_norm_stderr": 0.040516463428741434 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.41, + "acc_stderr": 0.049431107042371025, + "acc_norm": 0.41, + "acc_norm_stderr": 0.049431107042371025 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.46037735849056605, + "acc_stderr": 0.030676096599389188, + "acc_norm": 0.46037735849056605, + "acc_norm_stderr": 0.030676096599389188 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.4305555555555556, + "acc_stderr": 0.04140685639111502, + "acc_norm": 0.4305555555555556, + "acc_norm_stderr": 0.04140685639111502 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.3, + "acc_stderr": 0.04605661864718381, + "acc_norm": 0.3, + "acc_norm_stderr": 0.04605661864718381 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.33, + "acc_stderr": 0.04725815626252605, + "acc_norm": 0.33, + "acc_norm_stderr": 0.04725815626252605 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.27, + "acc_stderr": 0.044619604333847394, + "acc_norm": 0.27, + "acc_norm_stderr": 0.044619604333847394 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.3468208092485549, + "acc_stderr": 0.036291466701596636, + 
"acc_norm": 0.3468208092485549, + "acc_norm_stderr": 0.036291466701596636 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.20588235294117646, + "acc_stderr": 0.04023382273617747, + "acc_norm": 0.20588235294117646, + "acc_norm_stderr": 0.04023382273617747 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.47, + "acc_stderr": 0.05016135580465919, + "acc_norm": 0.47, + "acc_norm_stderr": 0.05016135580465919 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.4085106382978723, + "acc_stderr": 0.03213418026701576, + "acc_norm": 0.4085106382978723, + "acc_norm_stderr": 0.03213418026701576 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.3157894736842105, + "acc_stderr": 0.043727482902780064, + "acc_norm": 0.3157894736842105, + "acc_norm_stderr": 0.043727482902780064 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.35172413793103446, + "acc_stderr": 0.0397923663749741, + "acc_norm": 0.35172413793103446, + "acc_norm_stderr": 0.0397923663749741 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.30158730158730157, + "acc_stderr": 0.023636975996101796, + "acc_norm": 0.30158730158730157, + "acc_norm_stderr": 0.023636975996101796 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.2222222222222222, + "acc_stderr": 0.037184890068181146, + "acc_norm": 0.2222222222222222, + "acc_norm_stderr": 0.037184890068181146 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.35, + "acc_stderr": 0.047937248544110196, + "acc_norm": 0.35, + "acc_norm_stderr": 0.047937248544110196 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.4096774193548387, + "acc_stderr": 0.027976054915347368, + "acc_norm": 0.4096774193548387, + "acc_norm_stderr": 0.027976054915347368 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.3448275862068966, + "acc_stderr": 0.03344283744280458, + "acc_norm": 0.3448275862068966, + "acc_norm_stderr": 0.03344283744280458 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.35, + "acc_stderr": 0.047937248544110196, + "acc_norm": 0.35, + "acc_norm_stderr": 0.047937248544110196 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.46060606060606063, + "acc_stderr": 0.03892207016552013, + "acc_norm": 0.46060606060606063, + "acc_norm_stderr": 0.03892207016552013 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.5959595959595959, + "acc_stderr": 0.03496130972056129, + "acc_norm": 0.5959595959595959, + "acc_norm_stderr": 0.03496130972056129 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.538860103626943, + "acc_stderr": 0.035975244117345775, + "acc_norm": 0.538860103626943, + "acc_norm_stderr": 0.035975244117345775 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.32564102564102565, + "acc_stderr": 0.02375966576741229, + "acc_norm": 0.32564102564102565, + "acc_norm_stderr": 0.02375966576741229 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.26666666666666666, + "acc_stderr": 0.02696242432507383, + "acc_norm": 0.26666666666666666, + "acc_norm_stderr": 0.02696242432507383 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.3277310924369748, + "acc_stderr": 0.030489911417673227, + "acc_norm": 0.3277310924369748, + "acc_norm_stderr": 0.030489911417673227 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.2185430463576159, + "acc_stderr": 0.03374235550425694, + "acc_norm": 0.2185430463576159, + 
"acc_norm_stderr": 0.03374235550425694 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.5174311926605505, + "acc_stderr": 0.02142429187185315, + "acc_norm": 0.5174311926605505, + "acc_norm_stderr": 0.02142429187185315 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.21296296296296297, + "acc_stderr": 0.027920963147993666, + "acc_norm": 0.21296296296296297, + "acc_norm_stderr": 0.027920963147993666 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.4117647058823529, + "acc_stderr": 0.0345423658538061, + "acc_norm": 0.4117647058823529, + "acc_norm_stderr": 0.0345423658538061 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.459915611814346, + "acc_stderr": 0.03244246810187913, + "acc_norm": 0.459915611814346, + "acc_norm_stderr": 0.03244246810187913 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.4618834080717489, + "acc_stderr": 0.033460150119732274, + "acc_norm": 0.4618834080717489, + "acc_norm_stderr": 0.033460150119732274 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.40458015267175573, + "acc_stderr": 0.043046937953806645, + "acc_norm": 0.40458015267175573, + "acc_norm_stderr": 0.043046937953806645 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.5619834710743802, + "acc_stderr": 0.04529146804435792, + "acc_norm": 0.5619834710743802, + "acc_norm_stderr": 0.04529146804435792 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.4351851851851852, + "acc_stderr": 0.04792898170907062, + "acc_norm": 0.4351851851851852, + "acc_norm_stderr": 0.04792898170907062 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.4233128834355828, + "acc_stderr": 0.03881891213334383, + "acc_norm": 0.4233128834355828, + "acc_norm_stderr": 0.03881891213334383 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.2767857142857143, + "acc_stderr": 0.04246624336697625, + "acc_norm": 0.2767857142857143, + "acc_norm_stderr": 0.04246624336697625 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.49514563106796117, + "acc_stderr": 0.04950504382128921, + "acc_norm": 0.49514563106796117, + "acc_norm_stderr": 0.04950504382128921 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.6410256410256411, + "acc_stderr": 0.03142616993791924, + "acc_norm": 0.6410256410256411, + "acc_norm_stderr": 0.03142616993791924 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.47, + "acc_stderr": 0.050161355804659205, + "acc_norm": 0.47, + "acc_norm_stderr": 0.050161355804659205 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.565772669220945, + "acc_stderr": 0.017724589389677785, + "acc_norm": 0.565772669220945, + "acc_norm_stderr": 0.017724589389677785 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.45375722543352603, + "acc_stderr": 0.026803720583206184, + "acc_norm": 0.45375722543352603, + "acc_norm_stderr": 0.026803720583206184 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.25139664804469275, + "acc_stderr": 0.014508979453553988, + "acc_norm": 0.25139664804469275, + "acc_norm_stderr": 0.014508979453553988 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.4477124183006536, + "acc_stderr": 0.02847293847803353, + "acc_norm": 0.4477124183006536, + "acc_norm_stderr": 0.02847293847803353 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.5209003215434084, + "acc_stderr": 0.028373270961069414, + "acc_norm": 0.5209003215434084, + "acc_norm_stderr": 0.028373270961069414 + }, + 
"harness|hendrycksTest-prehistory|5": { + "acc": 0.4567901234567901, + "acc_stderr": 0.027716661650194045, + "acc_norm": 0.4567901234567901, + "acc_norm_stderr": 0.027716661650194045 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.3191489361702128, + "acc_stderr": 0.0278079901413202, + "acc_norm": 0.3191489361702128, + "acc_norm_stderr": 0.0278079901413202 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.2803129074315515, + "acc_stderr": 0.01147155594495862, + "acc_norm": 0.2803129074315515, + "acc_norm_stderr": 0.01147155594495862 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.27205882352941174, + "acc_stderr": 0.027033041151681456, + "acc_norm": 0.27205882352941174, + "acc_norm_stderr": 0.027033041151681456 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.3839869281045752, + "acc_stderr": 0.01967580813528152, + "acc_norm": 0.3839869281045752, + "acc_norm_stderr": 0.01967580813528152 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.4636363636363636, + "acc_stderr": 0.047764491623961985, + "acc_norm": 0.4636363636363636, + "acc_norm_stderr": 0.047764491623961985 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.2816326530612245, + "acc_stderr": 0.02879518557429129, + "acc_norm": 0.2816326530612245, + "acc_norm_stderr": 0.02879518557429129 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.43781094527363185, + "acc_stderr": 0.035080801121998406, + "acc_norm": 0.43781094527363185, + "acc_norm_stderr": 0.035080801121998406 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.66, + "acc_stderr": 0.04760952285695237, + "acc_norm": 0.66, + "acc_norm_stderr": 0.04760952285695237 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.39156626506024095, + "acc_stderr": 0.03799857454479636, + "acc_norm": 0.39156626506024095, + "acc_norm_stderr": 0.03799857454479636 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.6081871345029239, + "acc_stderr": 0.037439798259263996, + "acc_norm": 0.6081871345029239, + "acc_norm_stderr": 0.037439798259263996 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.3023255813953488, + "mc1_stderr": 0.016077509266133026, + "mc2": 0.47234141710510064, + "mc2_stderr": 0.015662611567973887 + }, + "all": { + "acc": 0.40418003089171195, + "acc_stderr": 0.034880823728323934, + "acc_norm": 0.4079152713252301, + "acc_norm_stderr": 0.034870208877396426, + "mc1": 0.3023255813953488, + "mc1_stderr": 0.016077509266133026, + "mc2": 0.47234141710510064, + "mc2_stderr": 0.015662611567973887 + } + }, + "versions": { + "harness|arc:challenge|25": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + 
"harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "all": 0 + }, + "config_general": { + "model_name": "openthaigpt/openthaigpt-1.0.0-alpha-7b-chat-ckpt-hf", + "model_sha": "cdffb3488c5cb1a9aa5039a6b3bc72af24827db0", + "model_dtype": "torch.float16", + "lighteval_sha": "8bab069fee0c6e75ffa4c1ef8a9591c28ee0e049", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null + }, + "config_tasks": { + "harness|arc:challenge": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM 
Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task" + }, + "summary_tasks": { + "harness|arc:challenge|25": { + "hashes": { + "hash_examples": "17b0cae357c0259e", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "61571bf68d6d89aa", + "hash_cont_tokens": "8210decc6ff6f7df" + }, + "truncated": 0, + "non-truncated": 4687, + "padded": 4687, + "non-padded": 0, + "effective_few_shots": 25.0, + "num_truncated_few_shots": 0 + }, + "harness|hellaswag|10": { + "hashes": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "29906669b1c7054a", + "hash_cont_tokens": "b3b9e9017afa63af" + }, + "truncated": 0, + "non-truncated": 40168, + "padded": 40113, + "non-padded": 55, + "effective_few_shots": 10.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hashes": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + 
"hash_input_tokens": "c54ff61ad0273dd7", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-anatomy|5": { + "hashes": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "be31a1e22aef5f90", + "hash_cont_tokens": "f11971a765cb609f" + }, + "truncated": 0, + "non-truncated": 540, + "padded": 540, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-astronomy|5": { + "hashes": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "277a7b1fad566940", + "hash_cont_tokens": "bf30e5d3f48250cb" + }, + "truncated": 0, + "non-truncated": 608, + "padded": 608, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-business_ethics|5": { + "hashes": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "ba552605bc116de5", + "hash_cont_tokens": "bc1dd9b2d995eb61" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hashes": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "428c7563d0b98ab9", + "hash_cont_tokens": "890a119624b3b935" + }, + "truncated": 0, + "non-truncated": 1060, + "padded": 1060, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_biology|5": { + "hashes": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "da036601573942e2", + "hash_cont_tokens": "875cde3af7a0ee14" + }, + "truncated": 0, + "non-truncated": 576, + "padded": 576, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_chemistry|5": { + "hashes": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "94e0196d6aded13d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_computer_science|5": { + "hashes": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "6e4d0f4a8d36690b", + "hash_cont_tokens": "ffc0fe414cdc4a83" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_mathematics|5": { + "hashes": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "614054d17109a25d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_medicine|5": { + "hashes": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "1d633b3cc0524ba8", + "hash_cont_tokens": "1f88b00d41957d82" + }, + "truncated": 0, + "non-truncated": 692, + "padded": 692, + "non-padded": 
0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_physics|5": { + "hashes": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "5421d9a1af86cbd4", + "hash_cont_tokens": "f7b8097afc16a47c" + }, + "truncated": 0, + "non-truncated": 408, + "padded": 408, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-computer_security|5": { + "hashes": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "5e6b70ecb333cf18", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hashes": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "c2ef11a87264ceed", + "hash_cont_tokens": "aa0e8bc655f2f641" + }, + "truncated": 0, + "non-truncated": 940, + "padded": 940, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-econometrics|5": { + "hashes": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "ecaccd912a4c3978", + "hash_cont_tokens": "bfb7e3c3c88313f1" + }, + "truncated": 0, + "non-truncated": 456, + "padded": 456, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hashes": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "1590c84291399be8", + "hash_cont_tokens": "2425a3f084a591ef" + }, + "truncated": 0, + "non-truncated": 580, + "padded": 580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hashes": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "3269597f715b0da1", + "hash_cont_tokens": "f52691aef15a407b" + }, + "truncated": 0, + "non-truncated": 1512, + "padded": 1512, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-formal_logic|5": { + "hashes": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "a2800d20f3ab8d7c", + "hash_cont_tokens": "f515d598d9c21263" + }, + "truncated": 0, + "non-truncated": 504, + "padded": 504, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-global_facts|5": { + "hashes": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "94ed44b3772505ad", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_biology|5": { + "hashes": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "24423acb928db768", + "hash_cont_tokens": "bd85a4156a3613ee" + }, + "truncated": 0, + "non-truncated": 1240, + "padded": 1240, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hashes": { + 
"hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "831ff35c474e5cef", + "hash_cont_tokens": "a95c97af1c14e068" + }, + "truncated": 0, + "non-truncated": 812, + "padded": 812, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hashes": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "8c34e0f2bda77358", + "hash_cont_tokens": "8abfedef914e33c9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hashes": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "f1f73dd687da18d7", + "hash_cont_tokens": "674fc454bdc5ac93" + }, + "truncated": 660, + "non-truncated": 0, + "padded": 0, + "non-padded": 660, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_geography|5": { + "hashes": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "7c5547c7da5bc793", + "hash_cont_tokens": "03a5012b916274ea" + }, + "truncated": 0, + "non-truncated": 792, + "padded": 792, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hashes": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "f62991cb6a496b05", + "hash_cont_tokens": "a83effb8f76b7d7c" + }, + "truncated": 0, + "non-truncated": 772, + "padded": 772, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hashes": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "4cef2aff6e3d59ed", + "hash_cont_tokens": "c583432ad27fcfe0" + }, + "truncated": 0, + "non-truncated": 1560, + "padded": 1560, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hashes": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "6e2577ea4082ed2b", + "hash_cont_tokens": "24f5dc613660300b" + }, + "truncated": 0, + "non-truncated": 1080, + "padded": 1080, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hashes": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "c5fc9aeb1079c8e4", + "hash_cont_tokens": "f47f041de50333b9" + }, + "truncated": 0, + "non-truncated": 952, + "padded": 952, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_physics|5": { + "hashes": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "555fc385cffa84ca", + "hash_cont_tokens": "ba2efcd283e938cc" + }, + "truncated": 0, + "non-truncated": 604, + "padded": 604, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hashes": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": 
"d5c76aa40b9dbc43", + "hash_input_tokens": "febd23cbf9973b7f", + "hash_cont_tokens": "942069cd363844d9" + }, + "truncated": 0, + "non-truncated": 2180, + "padded": 2180, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hashes": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "424b02981230ee83", + "hash_cont_tokens": "955ed42b6f7fa019" + }, + "truncated": 0, + "non-truncated": 864, + "padded": 864, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hashes": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "50c9ff438c85a69e", + "hash_cont_tokens": "cdd0b3dc06d933e5" + }, + "truncated": 816, + "non-truncated": 0, + "padded": 0, + "non-padded": 816, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hashes": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "054824cc474caef5", + "hash_cont_tokens": "9a864184946033ac" + }, + "truncated": 8, + "non-truncated": 940, + "padded": 940, + "non-padded": 8, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_aging|5": { + "hashes": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "541a75f071dcf579", + "hash_cont_tokens": "142a4a8a1138a214" + }, + "truncated": 0, + "non-truncated": 892, + "padded": 892, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_sexuality|5": { + "hashes": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "04269e5c5a257dd9", + "hash_cont_tokens": "bc54813e809b796d" + }, + "truncated": 0, + "non-truncated": 524, + "padded": 524, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-international_law|5": { + "hashes": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "d93ba9d9d38e4397", + "hash_cont_tokens": "dc45b45fcda18e5d" + }, + "truncated": 0, + "non-truncated": 484, + "padded": 484, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-jurisprudence|5": { + "hashes": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "9eeaccd2698b4f5a", + "hash_cont_tokens": "e3a8cd951b6e3469" + }, + "truncated": 0, + "non-truncated": 432, + "padded": 432, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hashes": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "b4f08f544f2b7576", + "hash_cont_tokens": "1e80dbd30f6453d5" + }, + "truncated": 0, + "non-truncated": 652, + "padded": 648, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-machine_learning|5": { + "hashes": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "900c2a51f1174b9f", + "hash_cont_tokens": "9b37da7777378ca9" + }, + "truncated": 0, + "non-truncated": 
448, + "padded": 448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-management|5": { + "hashes": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "6b36efb4689c6eca", + "hash_cont_tokens": "a01d6d39a83c4597" + }, + "truncated": 0, + "non-truncated": 412, + "padded": 412, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-marketing|5": { + "hashes": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "2aaac78a0cfed47a", + "hash_cont_tokens": "6aeaed4d823c98aa" + }, + "truncated": 0, + "non-truncated": 936, + "padded": 936, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-medical_genetics|5": { + "hashes": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "886ca823b41c094a", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-miscellaneous|5": { + "hashes": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "72fd71de7675e7d0", + "hash_cont_tokens": "9b0ab02a64603081" + }, + "truncated": 0, + "non-truncated": 3132, + "padded": 3132, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_disputes|5": { + "hashes": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "f3ca0dd8e7a1eb09", + "hash_cont_tokens": "8badf768f7b0467a" + }, + "truncated": 0, + "non-truncated": 1384, + "padded": 1354, + "non-padded": 30, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hashes": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "3e793631e951f23c", + "hash_cont_tokens": "32ae620376b2bbba" + }, + "truncated": 0, + "non-truncated": 3580, + "padded": 3580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-nutrition|5": { + "hashes": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "59753c2144ea93af", + "hash_cont_tokens": "3071def75bacc404" + }, + "truncated": 0, + "non-truncated": 1224, + "padded": 1224, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-philosophy|5": { + "hashes": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "bd8d3dbed15a8c34", + "hash_cont_tokens": "9f6ff69d23a48783" + }, + "truncated": 0, + "non-truncated": 1244, + "padded": 1244, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-prehistory|5": { + "hashes": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "3573cd87facbb7c5", + "hash_cont_tokens": "de469d2b981e32a3" + }, + "truncated": 0, + "non-truncated": 1296, + "padded": 1296, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_accounting|5": { + "hashes": { + 
"hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "17e721bc1a7cbb47", + "hash_cont_tokens": "c46f74d2dfc7b13b" + }, + "truncated": 0, + "non-truncated": 1128, + "padded": 1128, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_law|5": { + "hashes": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "9178e10bd0763ec4", + "hash_cont_tokens": "2e590029ef41fbcd" + }, + "truncated": 604, + "non-truncated": 5532, + "padded": 5524, + "non-padded": 612, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_medicine|5": { + "hashes": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "f5a22012a54f70ea", + "hash_cont_tokens": "fe35cfa9c6ca802e" + }, + "truncated": 0, + "non-truncated": 1088, + "padded": 1088, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_psychology|5": { + "hashes": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "0dfb73a8eb3f692c", + "hash_cont_tokens": "f020fbddf72c8652" + }, + "truncated": 0, + "non-truncated": 2448, + "padded": 2448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-public_relations|5": { + "hashes": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "1710c6ba4c9f3cbd", + "hash_cont_tokens": "568f585a259965c1" + }, + "truncated": 0, + "non-truncated": 440, + "padded": 440, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-security_studies|5": { + "hashes": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "d49711415961ced7", + "hash_cont_tokens": "cc6fd7cccd64cd5d" + }, + "truncated": 0, + "non-truncated": 980, + "padded": 980, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-sociology|5": { + "hashes": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "828999f7624cbe7e", + "hash_cont_tokens": "c3a3bdfd177eed5b" + }, + "truncated": 0, + "non-truncated": 804, + "padded": 804, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hashes": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "42054621e718dbee", + "hash_cont_tokens": "2568d0e8e36fa959" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-virology|5": { + "hashes": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "6c4f0aa4dc859c04", + "hash_cont_tokens": "926cf60b0891f374" + }, + "truncated": 0, + "non-truncated": 664, + "padded": 664, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-world_religions|5": { + "hashes": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "6c75d44e092ff24f", + "hash_cont_tokens": 
"c525a5de974c1ea3" + }, + "truncated": 0, + "non-truncated": 684, + "padded": 684, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|truthfulqa:mc|0": { + "hashes": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "2738d7ed7075faa7", + "hash_cont_tokens": "c014154380b74b9e" + }, + "truncated": 0, + "non-truncated": 9996, + "padded": 9996, + "non-padded": 0, + "effective_few_shots": 0.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "d84d18e9a963753d", + "hash_full_prompts": "12b540783521a8e6", + "hash_input_tokens": "6fecf578c508db6a", + "hash_cont_tokens": "fb1646e2bdd5fc38" + }, + "total_evaluation_time_secondes": "2561.3123438358307", + "truncated": 2088, + "non-truncated": 108931, + "padded": 108834, + "non-padded": 2185, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/openthaigpt/openthaigpt-1.0.0-alpha-7b-chat-ckpt-hf/results_2023-09-22T23-15-18.463104.json b/eval-results/openthaigpt/openthaigpt-1.0.0-alpha-7b-chat-ckpt-hf/results_2023-09-22T23-15-18.463104.json new file mode 100644 index 0000000000000000000000000000000000000000..b6f93e2e1c6bcb6b3dcb084d5e5426fbcb1f077c --- /dev/null +++ b/eval-results/openthaigpt/openthaigpt-1.0.0-alpha-7b-chat-ckpt-hf/results_2023-09-22T23-15-18.463104.json @@ -0,0 +1,107 @@ +{ + "config_general": { + "model_name": "openthaigpt/openthaigpt-1.0.0-alpha-7b-chat-ckpt-hf", + "model_sha": "cdffb3488c5cb1a9aa5039a6b3bc72af24827db0", + "model_size": "12.58 GB", + "model_dtype": "torch.float16", + "lighteval_sha": "0f318ecf002208468154899217b3ba7c6ae09374", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|drop|3": { + "em": 0.031774328859060404, + "em_stderr": 0.0017962473521312393, + "f1": 0.08420092281879202, + "f1_stderr": 0.0021474530604162255 + }, + "harness|gsm8k|5": { + "acc": 0.03866565579984837, + "acc_stderr": 0.005310583162098024 + }, + "harness|winogrande|5": { + "acc": 0.6906077348066298, + "acc_stderr": 0.012991329330822995 + }, + "all": { + "em": 0.031774328859060404, + "em_stderr": 0.0017962473521312393, + "f1": 0.08420092281879202, + "f1_stderr": 0.0021474530604162255, + "acc": 0.3646366953032391, + "acc_stderr": 0.00915095624646051 + } + }, + "versions": { + "harness|drop|3": 1, + "harness|gsm8k|5": 0, + "harness|winogrande|5": 0, + "all": 0 + }, + "config_tasks": { + "harness|drop": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|drop|3": { + "hashes": { + "hash_examples": "1d27416e8324e9a3", + "hash_full_prompts": "a5513ff9a741b385", + "hash_input_tokens": "61b608e0b5ceed76", + "hash_cont_tokens": "b96c73729fbfded9" + }, + "truncated": 1263, + "non-truncated": 8273, + "padded": 0, + "non-padded": 9536, + "effective_few_shots": 3.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "bda342e47b5099b2", + "hash_cont_tokens": "54ff40f1f75624cf" + }, + "truncated": 0, + "non-truncated": 1319, + "padded": 0, + "non-padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + 
"hash_input_tokens": "c0bedf98cb040854", + "hash_cont_tokens": "f08975ad6f2d5864" + }, + "truncated": 0, + "non-truncated": 2534, + "padded": 2432, + "non-padded": 102, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "9b4d8993161e637d", + "hash_full_prompts": "08215e527b7e60a5", + "hash_input_tokens": "80afe720f936f8d2", + "hash_cont_tokens": "5cb38a42788d95e9" + }, + "total_evaluation_time_secondes": "8837.381135702133", + "truncated": 1263, + "non-truncated": 12126, + "padded": 2432, + "non-padded": 10957, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/prithivida/Asimov-7B-v1/results_2023-11-19T10-40-19.617701.json b/eval-results/prithivida/Asimov-7B-v1/results_2023-11-19T10-40-19.617701.json new file mode 100644 index 0000000000000000000000000000000000000000..4cfd8dafc8e1f73315f5d44f7420f2d684ce1856 --- /dev/null +++ b/eval-results/prithivida/Asimov-7B-v1/results_2023-11-19T10-40-19.617701.json @@ -0,0 +1,1435 @@ +{ + "config_general": { + "lighteval_sha": "9ffc410f6c40b8cfefe7167cb47aefe69ced61e1", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "", + "start_time": 155238.222681924, + "end_time": 168319.830538866, + "total_evaluation_time_secondes": "13081.607856941991", + "model_name": "prithivida/Asimov-7B-v1", + "model_sha": "0b33ad0a6dde60156ee6008ff47f7cfa6cd27937", + "model_dtype": "4bit", + "model_size": "4.24 GB" + }, + "results": { + "harness|arc:challenge|25": { + "acc": 0.5460750853242321, + "acc_stderr": 0.014549221105171864, + "acc_norm": 0.590443686006826, + "acc_norm_stderr": 0.01437035863247244 + }, + "harness|hellaswag|10": { + "acc": 0.6097390957976498, + "acc_stderr": 0.004868117598481945, + "acc_norm": 0.8004381597291377, + "acc_norm_stderr": 0.00398854190214743 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.35, + "acc_stderr": 0.047937248544110196, + "acc_norm": 0.35, + "acc_norm_stderr": 0.047937248544110196 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.48148148148148145, + "acc_stderr": 0.043163785995113245, + "acc_norm": 0.48148148148148145, + "acc_norm_stderr": 0.043163785995113245 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.5526315789473685, + "acc_stderr": 0.04046336883978251, + "acc_norm": 0.5526315789473685, + "acc_norm_stderr": 0.04046336883978251 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.53, + "acc_stderr": 0.05016135580465919, + "acc_norm": 0.53, + "acc_norm_stderr": 0.05016135580465919 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.6188679245283019, + "acc_stderr": 0.029890609686286634, + "acc_norm": 0.6188679245283019, + "acc_norm_stderr": 0.029890609686286634 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.5833333333333334, + "acc_stderr": 0.04122728707651282, + "acc_norm": 0.5833333333333334, + "acc_norm_stderr": 0.04122728707651282 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.47, + "acc_stderr": 0.050161355804659205, + "acc_norm": 0.47, + "acc_norm_stderr": 0.050161355804659205 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.48, + "acc_stderr": 0.050211673156867795, + "acc_norm": 0.48, + "acc_norm_stderr": 0.050211673156867795 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.41, + "acc_stderr": 0.04943110704237102, + "acc_norm": 0.41, + "acc_norm_stderr": 0.04943110704237102 + }, + 
"harness|hendrycksTest-college_medicine|5": { + "acc": 0.5664739884393064, + "acc_stderr": 0.03778621079092055, + "acc_norm": 0.5664739884393064, + "acc_norm_stderr": 0.03778621079092055 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.37254901960784315, + "acc_stderr": 0.04810840148082635, + "acc_norm": 0.37254901960784315, + "acc_norm_stderr": 0.04810840148082635 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.71, + "acc_stderr": 0.04560480215720684, + "acc_norm": 0.71, + "acc_norm_stderr": 0.04560480215720684 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.4595744680851064, + "acc_stderr": 0.032579014820998356, + "acc_norm": 0.4595744680851064, + "acc_norm_stderr": 0.032579014820998356 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.3333333333333333, + "acc_stderr": 0.044346007015849245, + "acc_norm": 0.3333333333333333, + "acc_norm_stderr": 0.044346007015849245 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.503448275862069, + "acc_stderr": 0.04166567577101579, + "acc_norm": 0.503448275862069, + "acc_norm_stderr": 0.04166567577101579 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.3862433862433862, + "acc_stderr": 0.02507598176760168, + "acc_norm": 0.3862433862433862, + "acc_norm_stderr": 0.02507598176760168 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.4126984126984127, + "acc_stderr": 0.04403438954768177, + "acc_norm": 0.4126984126984127, + "acc_norm_stderr": 0.04403438954768177 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.28, + "acc_stderr": 0.04512608598542128, + "acc_norm": 0.28, + "acc_norm_stderr": 0.04512608598542128 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.6741935483870968, + "acc_stderr": 0.026662010578567107, + "acc_norm": 0.6741935483870968, + "acc_norm_stderr": 0.026662010578567107 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.4088669950738916, + "acc_stderr": 0.034590588158832314, + "acc_norm": 0.4088669950738916, + "acc_norm_stderr": 0.034590588158832314 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.49, + "acc_stderr": 0.05024183937956912, + "acc_norm": 0.49, + "acc_norm_stderr": 0.05024183937956912 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.7333333333333333, + "acc_stderr": 0.03453131801885417, + "acc_norm": 0.7333333333333333, + "acc_norm_stderr": 0.03453131801885417 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.7323232323232324, + "acc_stderr": 0.03154449888270285, + "acc_norm": 0.7323232323232324, + "acc_norm_stderr": 0.03154449888270285 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.8290155440414507, + "acc_stderr": 0.027171213683164528, + "acc_norm": 0.8290155440414507, + "acc_norm_stderr": 0.027171213683164528 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.5487179487179488, + "acc_stderr": 0.025230381238934837, + "acc_norm": 0.5487179487179488, + "acc_norm_stderr": 0.025230381238934837 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.2962962962962963, + "acc_stderr": 0.027840811495871927, + "acc_norm": 0.2962962962962963, + "acc_norm_stderr": 0.027840811495871927 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.6050420168067226, + "acc_stderr": 0.03175367846096626, + "acc_norm": 0.6050420168067226, + "acc_norm_stderr": 0.03175367846096626 + }, + "harness|hendrycksTest-high_school_physics|5": 
{ + "acc": 0.32450331125827814, + "acc_stderr": 0.03822746937658753, + "acc_norm": 0.32450331125827814, + "acc_norm_stderr": 0.03822746937658753 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.7431192660550459, + "acc_stderr": 0.018732492928342472, + "acc_norm": 0.7431192660550459, + "acc_norm_stderr": 0.018732492928342472 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.49074074074074076, + "acc_stderr": 0.034093869469927006, + "acc_norm": 0.49074074074074076, + "acc_norm_stderr": 0.034093869469927006 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.7058823529411765, + "acc_stderr": 0.03198001660115071, + "acc_norm": 0.7058823529411765, + "acc_norm_stderr": 0.03198001660115071 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.7383966244725738, + "acc_stderr": 0.028609516716994934, + "acc_norm": 0.7383966244725738, + "acc_norm_stderr": 0.028609516716994934 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.6143497757847534, + "acc_stderr": 0.03266842214289201, + "acc_norm": 0.6143497757847534, + "acc_norm_stderr": 0.03266842214289201 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.7480916030534351, + "acc_stderr": 0.03807387116306086, + "acc_norm": 0.7480916030534351, + "acc_norm_stderr": 0.03807387116306086 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.7603305785123967, + "acc_stderr": 0.03896878985070416, + "acc_norm": 0.7603305785123967, + "acc_norm_stderr": 0.03896878985070416 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.6574074074074074, + "acc_stderr": 0.045879047413018105, + "acc_norm": 0.6574074074074074, + "acc_norm_stderr": 0.045879047413018105 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.6748466257668712, + "acc_stderr": 0.036803503712864595, + "acc_norm": 0.6748466257668712, + "acc_norm_stderr": 0.036803503712864595 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.45535714285714285, + "acc_stderr": 0.047268355537191, + "acc_norm": 0.45535714285714285, + "acc_norm_stderr": 0.047268355537191 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.6990291262135923, + "acc_stderr": 0.045416094465039476, + "acc_norm": 0.6990291262135923, + "acc_norm_stderr": 0.045416094465039476 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.811965811965812, + "acc_stderr": 0.02559819368665226, + "acc_norm": 0.811965811965812, + "acc_norm_stderr": 0.02559819368665226 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.65, + "acc_stderr": 0.0479372485441102, + "acc_norm": 0.65, + "acc_norm_stderr": 0.0479372485441102 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.7075351213282248, + "acc_stderr": 0.016267000684598645, + "acc_norm": 0.7075351213282248, + "acc_norm_stderr": 0.016267000684598645 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.5924855491329479, + "acc_stderr": 0.0264545781469315, + "acc_norm": 0.5924855491329479, + "acc_norm_stderr": 0.0264545781469315 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.24692737430167597, + "acc_stderr": 0.014422292204808848, + "acc_norm": 0.24692737430167597, + "acc_norm_stderr": 0.014422292204808848 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.6274509803921569, + "acc_stderr": 0.027684181883302895, + "acc_norm": 0.6274509803921569, + "acc_norm_stderr": 0.027684181883302895 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.6270096463022508, + "acc_stderr": 0.027466610213140105, + "acc_norm": 
0.6270096463022508, + "acc_norm_stderr": 0.027466610213140105 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.5833333333333334, + "acc_stderr": 0.027431623722415005, + "acc_norm": 0.5833333333333334, + "acc_norm_stderr": 0.027431623722415005 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.40425531914893614, + "acc_stderr": 0.02927553215970473, + "acc_norm": 0.40425531914893614, + "acc_norm_stderr": 0.02927553215970473 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.3852672750977836, + "acc_stderr": 0.012429485434955194, + "acc_norm": 0.3852672750977836, + "acc_norm_stderr": 0.012429485434955194 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.5698529411764706, + "acc_stderr": 0.030074971917302875, + "acc_norm": 0.5698529411764706, + "acc_norm_stderr": 0.030074971917302875 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.5686274509803921, + "acc_stderr": 0.020036393768352638, + "acc_norm": 0.5686274509803921, + "acc_norm_stderr": 0.020036393768352638 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.5363636363636364, + "acc_stderr": 0.04776449162396197, + "acc_norm": 0.5363636363636364, + "acc_norm_stderr": 0.04776449162396197 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.6571428571428571, + "acc_stderr": 0.030387262919547728, + "acc_norm": 0.6571428571428571, + "acc_norm_stderr": 0.030387262919547728 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.7711442786069652, + "acc_stderr": 0.029705284056772432, + "acc_norm": 0.7711442786069652, + "acc_norm_stderr": 0.029705284056772432 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.76, + "acc_stderr": 0.04292346959909282, + "acc_norm": 0.76, + "acc_norm_stderr": 0.04292346959909282 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.45180722891566266, + "acc_stderr": 0.03874371556587953, + "acc_norm": 0.45180722891566266, + "acc_norm_stderr": 0.03874371556587953 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.7368421052631579, + "acc_stderr": 0.03377310252209205, + "acc_norm": 0.7368421052631579, + "acc_norm_stderr": 0.03377310252209205 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.3463892288861689, + "mc1_stderr": 0.01665699710912514, + "mc2": 0.5114755425083032, + "mc2_stderr": 0.015500857240755488 + }, + "harness|winogrande|5": { + "acc": 0.739542225730071, + "acc_stderr": 0.012334833671998297 + }, + "harness|drop|3": { + "em": 0.004928691275167785, + "em_stderr": 0.0007171872517059772, + "f1": 0.06691170302013415, + "f1_stderr": 0.0015363127511980274 + }, + "harness|gsm8k|5": { + "acc": 0.0932524639878696, + "acc_stderr": 0.00800968883832857 + }, + "all": { + "acc": 0.5591327615228756, + "acc_stderr": 0.033793433613618404, + "acc_norm": 0.5679554479286705, + "acc_norm_stderr": 0.034576211690701054, + "mc1": 0.3463892288861689, + "mc1_stderr": 0.01665699710912514, + "mc2": 0.5114755425083032, + "mc2_stderr": 0.015500857240755488, + "em": 0.004928691275167785, + "em_stderr": 0.0007171872517059772, + "f1": 0.06691170302013415, + "f1_stderr": 0.0015363127511980274 + } + }, + "versions": { + "all": 0, + "harness|arc:challenge|25": 0, + "harness|drop|3": 1, + "harness|gsm8k|5": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + 
"harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "harness|winogrande|5": 0 + }, + "config_tasks": { + "harness|arc:challenge": "LM Harness task", + "harness|drop": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + 
"harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|arc:challenge|25": { + "hashes": { + "hash_examples": "17b0cae357c0259e", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "9bcd0d1d37471713", + "hash_cont_tokens": "289aa98c400841d8" + }, + "truncated": 0, + "non_truncated": 1172, + "padded": 4670, + "non_padded": 17, + "effective_few_shots": 25.0, + "num_truncated_few_shots": 0 + }, + "harness|hellaswag|10": { + "hashes": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": 
"0b4c16983130f84f", + "hash_input_tokens": "80b8c6d79740318e", + "hash_cont_tokens": "ac460260c3e6efc9" + }, + "truncated": 0, + "non_truncated": 10042, + "padded": 40101, + "non_padded": 67, + "effective_few_shots": 10.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hashes": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "b813d36287c6556c", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-anatomy|5": { + "hashes": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "09dc2380497f7a47", + "hash_cont_tokens": "a52a4f60d98cbe5c" + }, + "truncated": 0, + "non_truncated": 135, + "padded": 540, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-astronomy|5": { + "hashes": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "68ca3220b0fdd1f3", + "hash_cont_tokens": "10f7d8eeba97841d" + }, + "truncated": 0, + "non_truncated": 152, + "padded": 608, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-business_ethics|5": { + "hashes": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "bd14ef1320de241e", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hashes": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "d96186ab98017c43", + "hash_cont_tokens": "edef9975ba9165b5" + }, + "truncated": 0, + "non_truncated": 265, + "padded": 1060, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_biology|5": { + "hashes": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "424136b34e95b200", + "hash_cont_tokens": "0aa103ec6602280b" + }, + "truncated": 0, + "non_truncated": 144, + "padded": 576, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_chemistry|5": { + "hashes": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "8dd8b80e336bbe54", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_computer_science|5": { + "hashes": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "145d4cef8ca2261d", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_mathematics|5": { + "hashes": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "561995d32d2b25c4", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + 
"padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_medicine|5": { + "hashes": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "6a258a9d4418599c", + "hash_cont_tokens": "1979021dbc698754" + }, + "truncated": 0, + "non_truncated": 173, + "padded": 692, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_physics|5": { + "hashes": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "fa5e0d5b5f97b66a", + "hash_cont_tokens": "7cf7fe2bab00acbd" + }, + "truncated": 0, + "non_truncated": 102, + "padded": 408, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-computer_security|5": { + "hashes": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "07d27397edfae492", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hashes": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "da5e6c3c8eb17da6", + "hash_cont_tokens": "903f64eed2b0d217" + }, + "truncated": 0, + "non_truncated": 235, + "padded": 940, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-econometrics|5": { + "hashes": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "f6ba8e358bdb523e", + "hash_cont_tokens": "721ae6c5302c4bf2" + }, + "truncated": 0, + "non_truncated": 114, + "padded": 456, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hashes": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "b2459da4c5ca8590", + "hash_cont_tokens": "15a738960ed3e587" + }, + "truncated": 0, + "non_truncated": 145, + "padded": 575, + "non_padded": 5, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hashes": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "0b969d9ad706a13a", + "hash_cont_tokens": "c96470462fc71683" + }, + "truncated": 0, + "non_truncated": 378, + "padded": 1512, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-formal_logic|5": { + "hashes": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "02bc3eb5f90da86e", + "hash_cont_tokens": "0e1ce025c9d6ee7e" + }, + "truncated": 0, + "non_truncated": 126, + "padded": 504, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-global_facts|5": { + "hashes": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "3d5106918bcbeb43", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_biology|5": { + 
"hashes": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "7b089392db2dabbd", + "hash_cont_tokens": "e34d57f7d3c4ca16" + }, + "truncated": 0, + "non_truncated": 310, + "padded": 1240, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hashes": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "ba90b2ffed1c067d", + "hash_cont_tokens": "e8482d44df4b3740" + }, + "truncated": 0, + "non_truncated": 203, + "padded": 812, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hashes": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "60eeec309ef0717f", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hashes": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "5e5e8bf3808e0ead", + "hash_cont_tokens": "d63e679a49418339" + }, + "truncated": 0, + "non_truncated": 165, + "padded": 656, + "non_padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_geography|5": { + "hashes": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "4da9b741d4e7ea78", + "hash_cont_tokens": "d78483e286d06f1a" + }, + "truncated": 0, + "non_truncated": 198, + "padded": 792, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hashes": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "acb4bc872ac86ed7", + "hash_cont_tokens": "691cdff71ff5fe57" + }, + "truncated": 0, + "non_truncated": 193, + "padded": 772, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hashes": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "840fc6403eb69ab0", + "hash_cont_tokens": "d5ad4c5bdca967ad" + }, + "truncated": 0, + "non_truncated": 390, + "padded": 1560, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hashes": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "3629a7f2cd17faeb", + "hash_cont_tokens": "8f631ca5687dd0d4" + }, + "truncated": 0, + "non_truncated": 270, + "padded": 1080, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hashes": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "6846f684260e3997", + "hash_cont_tokens": "7321048a28451473" + }, + "truncated": 0, + "non_truncated": 238, + "padded": 952, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_physics|5": { + "hashes": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": 
"fc78b4997e436734", + "hash_input_tokens": "85aee25d6bdad94a", + "hash_cont_tokens": "bb137581f269861c" + }, + "truncated": 0, + "non_truncated": 151, + "padded": 604, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hashes": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "290b66d6d666a35f", + "hash_cont_tokens": "b455cab2675bd863" + }, + "truncated": 0, + "non_truncated": 545, + "padded": 2180, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hashes": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "a77a7668b437bc82", + "hash_cont_tokens": "1b3196fec7e58037" + }, + "truncated": 0, + "non_truncated": 216, + "padded": 864, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hashes": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "63548c7fa9ba7a78", + "hash_cont_tokens": "a331dedc2aa01b3e" + }, + "truncated": 0, + "non_truncated": 204, + "padded": 816, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hashes": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "83c5da18bfa50812", + "hash_cont_tokens": "d0fbe030b8c8c2bf" + }, + "truncated": 0, + "non_truncated": 237, + "padded": 948, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_aging|5": { + "hashes": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "bebbd11f22006685", + "hash_cont_tokens": "1dd29c3755494850" + }, + "truncated": 0, + "non_truncated": 223, + "padded": 892, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_sexuality|5": { + "hashes": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "7b85ee9b8ee54f4f", + "hash_cont_tokens": "c85573f663c10691" + }, + "truncated": 0, + "non_truncated": 131, + "padded": 524, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-international_law|5": { + "hashes": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "7bfc55ab7065943e", + "hash_cont_tokens": "d263804ba918154f" + }, + "truncated": 0, + "non_truncated": 121, + "padded": 484, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-jurisprudence|5": { + "hashes": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "69573f1675e053c6", + "hash_cont_tokens": "581986691a84ece8" + }, + "truncated": 0, + "non_truncated": 108, + "padded": 432, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hashes": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "552324ef20094bdc", + "hash_cont_tokens": "55a858b28bbda458" + }, + "truncated": 0, + 
"non_truncated": 163, + "padded": 652, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-machine_learning|5": { + "hashes": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "96449357a7318905", + "hash_cont_tokens": "e99d3d3efd4ac7a3" + }, + "truncated": 0, + "non_truncated": 112, + "padded": 448, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-management|5": { + "hashes": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "3b849249168e3b88", + "hash_cont_tokens": "13d9dc56bca34726" + }, + "truncated": 0, + "non_truncated": 103, + "padded": 412, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-marketing|5": { + "hashes": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "af0e186f2756b70d", + "hash_cont_tokens": "2700ea26933916a2" + }, + "truncated": 0, + "non_truncated": 234, + "padded": 936, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-medical_genetics|5": { + "hashes": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "9f6a6de16509b6d9", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-miscellaneous|5": { + "hashes": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "9194406d589f7c10", + "hash_cont_tokens": "7bf4341c79587250" + }, + "truncated": 0, + "non_truncated": 783, + "padded": 3132, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_disputes|5": { + "hashes": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "769486efc74d9f8e", + "hash_cont_tokens": "38a48e9de6976f00" + }, + "truncated": 0, + "non_truncated": 346, + "padded": 1384, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hashes": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "a90fd4dd90959dad", + "hash_cont_tokens": "761c4dc187689d89" + }, + "truncated": 0, + "non_truncated": 895, + "padded": 3580, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-nutrition|5": { + "hashes": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "1a3b843e66efd29b", + "hash_cont_tokens": "65005bd7d6f6012a" + }, + "truncated": 0, + "non_truncated": 306, + "padded": 1224, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-philosophy|5": { + "hashes": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "09820001a3d00013", + "hash_cont_tokens": "0b47934fb6314dec" + }, + "truncated": 0, + "non_truncated": 311, + "padded": 1244, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-prehistory|5": { + "hashes": { + 
"hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "7c4ec364ce2768c7", + "hash_cont_tokens": "3f20acd855ee0a29" + }, + "truncated": 0, + "non_truncated": 324, + "padded": 1296, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_accounting|5": { + "hashes": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "ced0534574d0ae3f", + "hash_cont_tokens": "8f122ba881355d4b" + }, + "truncated": 0, + "non_truncated": 282, + "padded": 1128, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_law|5": { + "hashes": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "bcbdbbde22ec73e3", + "hash_cont_tokens": "90d5df417c4d3fd3" + }, + "truncated": 0, + "non_truncated": 1534, + "padded": 6136, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_medicine|5": { + "hashes": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "c54d753563114d45", + "hash_cont_tokens": "4a2d2988884f7f70" + }, + "truncated": 0, + "non_truncated": 272, + "padded": 1088, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_psychology|5": { + "hashes": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "b75dc55c0e32fa52", + "hash_cont_tokens": "e0a952cb8a9c81de" + }, + "truncated": 0, + "non_truncated": 612, + "padded": 2448, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-public_relations|5": { + "hashes": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "5ccdc8ec8db99622", + "hash_cont_tokens": "1fa77a8dff3922b8" + }, + "truncated": 0, + "non_truncated": 110, + "padded": 440, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-security_studies|5": { + "hashes": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "ca8497342e5b1d57", + "hash_cont_tokens": "81fc9cb3cbdd52db" + }, + "truncated": 0, + "non_truncated": 245, + "padded": 980, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-sociology|5": { + "hashes": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "069c76424fbd3dab", + "hash_cont_tokens": "2a0493252ed2cf43" + }, + "truncated": 0, + "non_truncated": 201, + "padded": 804, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hashes": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "a7e393a626169576", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-virology|5": { + "hashes": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "bf99dc973e3a650d", + "hash_cont_tokens": 
"5ab892d003b00c98" + }, + "truncated": 0, + "non_truncated": 166, + "padded": 664, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-world_religions|5": { + "hashes": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "1761cfaf21797065", + "hash_cont_tokens": "15a5e5dbdfbb8568" + }, + "truncated": 0, + "non_truncated": 171, + "padded": 684, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|truthfulqa:mc|0": { + "hashes": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "298b43914bbdf4ca", + "hash_cont_tokens": "5a8d4bb398b1c3c0" + }, + "truncated": 0, + "non_truncated": 817, + "padded": 9996, + "non_padded": 0, + "effective_few_shots": 0.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "31aa3477d959f771", + "hash_cont_tokens": "618558fb93c0f288" + }, + "truncated": 0, + "non_truncated": 1267, + "padded": 2534, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|drop|3": { + "hashes": { + "hash_examples": "1d27416e8324e9a3", + "hash_full_prompts": "a5513ff9a741b385", + "hash_input_tokens": "a4fb946366902edf", + "hash_cont_tokens": "4b4293f4648cec4d" + }, + "truncated": 0, + "non_truncated": 9536, + "padded": 0, + "non_padded": 9536, + "effective_few_shots": 3.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "6af0ae8cfe684f50", + "hash_cont_tokens": "5470d5f61cf6fb40" + }, + "truncated": 0, + "non_truncated": 1319, + "padded": 0, + "non_padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "4eb459f19fc0f29d", + "hash_full_prompts": "21653ed56f202b4e", + "hash_input_tokens": "0ce409b3d436569d", + "hash_cont_tokens": "1a17a2da6a074496" + }, + "truncated": 0, + "non_truncated": 38195, + "padded": 113460, + "non_padded": 10948, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/prithivida/Asimov-7B-v2/results_2023-11-28T19-29-47.574027.json b/eval-results/prithivida/Asimov-7B-v2/results_2023-11-28T19-29-47.574027.json new file mode 100644 index 0000000000000000000000000000000000000000..5e4d89ae1a91143a8e7c4a65a6e170dedbd20fb8 --- /dev/null +++ b/eval-results/prithivida/Asimov-7B-v2/results_2023-11-28T19-29-47.574027.json @@ -0,0 +1,1435 @@ +{ + "config_general": { + "lighteval_sha": "9ffc410f6c40b8cfefe7167cb47aefe69ced61e1", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "", + "start_time": 1037599.718079506, + "end_time": 1052694.784414338, + "total_evaluation_time_secondes": "15095.066334831994", + "model_name": "prithivida/Asimov-7B-v2", + "model_sha": "0aeea2284ac78cac081bee88e5a98a19bb987227", + "model_dtype": "4bit", + "model_size": "4.24 GB" + }, + "results": { + "harness|arc:challenge|25": { + "acc": 0.4974402730375427, + "acc_stderr": 0.014611199329843784, + "acc_norm": 0.5426621160409556, + "acc_norm_stderr": 0.014558106543924065 + }, + "harness|hellaswag|10": { + "acc": 0.5972913762198765, + "acc_stderr": 0.0048944072572158085, + "acc_norm": 0.7871937860983867, + 
"acc_norm_stderr": 0.004084552641903654 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.36, + "acc_stderr": 0.04824181513244218, + "acc_norm": 0.36, + "acc_norm_stderr": 0.04824181513244218 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.4222222222222222, + "acc_stderr": 0.042667634040995814, + "acc_norm": 0.4222222222222222, + "acc_norm_stderr": 0.042667634040995814 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.5723684210526315, + "acc_stderr": 0.04026097083296564, + "acc_norm": 0.5723684210526315, + "acc_norm_stderr": 0.04026097083296564 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.41, + "acc_stderr": 0.049431107042371025, + "acc_norm": 0.41, + "acc_norm_stderr": 0.049431107042371025 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.5584905660377358, + "acc_stderr": 0.030561590426731837, + "acc_norm": 0.5584905660377358, + "acc_norm_stderr": 0.030561590426731837 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.5902777777777778, + "acc_stderr": 0.04112490974670787, + "acc_norm": 0.5902777777777778, + "acc_norm_stderr": 0.04112490974670787 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.42, + "acc_stderr": 0.049604496374885836, + "acc_norm": 0.42, + "acc_norm_stderr": 0.049604496374885836 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.46, + "acc_stderr": 0.05009082659620332, + "acc_norm": 0.46, + "acc_norm_stderr": 0.05009082659620332 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.38, + "acc_stderr": 0.048783173121456316, + "acc_norm": 0.38, + "acc_norm_stderr": 0.048783173121456316 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.4682080924855491, + "acc_stderr": 0.03804749744364763, + "acc_norm": 0.4682080924855491, + "acc_norm_stderr": 0.03804749744364763 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.24509803921568626, + "acc_stderr": 0.042801058373643966, + "acc_norm": 0.24509803921568626, + "acc_norm_stderr": 0.042801058373643966 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.7, + "acc_stderr": 0.046056618647183814, + "acc_norm": 0.7, + "acc_norm_stderr": 0.046056618647183814 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.4595744680851064, + "acc_stderr": 0.032579014820998356, + "acc_norm": 0.4595744680851064, + "acc_norm_stderr": 0.032579014820998356 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.3684210526315789, + "acc_stderr": 0.04537815354939392, + "acc_norm": 0.3684210526315789, + "acc_norm_stderr": 0.04537815354939392 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.4413793103448276, + "acc_stderr": 0.04137931034482758, + "acc_norm": 0.4413793103448276, + "acc_norm_stderr": 0.04137931034482758 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.35185185185185186, + "acc_stderr": 0.02459497512892094, + "acc_norm": 0.35185185185185186, + "acc_norm_stderr": 0.02459497512892094 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.30952380952380953, + "acc_stderr": 0.041349130183033156, + "acc_norm": 0.30952380952380953, + "acc_norm_stderr": 0.041349130183033156 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.35, + "acc_stderr": 0.0479372485441102, + "acc_norm": 0.35, + "acc_norm_stderr": 0.0479372485441102 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.632258064516129, + "acc_stderr": 0.027430866579973463, + "acc_norm": 0.632258064516129, + "acc_norm_stderr": 0.027430866579973463 + 
}, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.4187192118226601, + "acc_stderr": 0.03471192860518468, + "acc_norm": 0.4187192118226601, + "acc_norm_stderr": 0.03471192860518468 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.56, + "acc_stderr": 0.04988876515698589, + "acc_norm": 0.56, + "acc_norm_stderr": 0.04988876515698589 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.6484848484848484, + "acc_stderr": 0.0372820699868265, + "acc_norm": 0.6484848484848484, + "acc_norm_stderr": 0.0372820699868265 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.6565656565656566, + "acc_stderr": 0.03383201223244442, + "acc_norm": 0.6565656565656566, + "acc_norm_stderr": 0.03383201223244442 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.7098445595854922, + "acc_stderr": 0.03275264467791516, + "acc_norm": 0.7098445595854922, + "acc_norm_stderr": 0.03275264467791516 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.5051282051282051, + "acc_stderr": 0.025349672906838653, + "acc_norm": 0.5051282051282051, + "acc_norm_stderr": 0.025349672906838653 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.3037037037037037, + "acc_stderr": 0.028037929969114986, + "acc_norm": 0.3037037037037037, + "acc_norm_stderr": 0.028037929969114986 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.49159663865546216, + "acc_stderr": 0.03247390276569669, + "acc_norm": 0.49159663865546216, + "acc_norm_stderr": 0.03247390276569669 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.33774834437086093, + "acc_stderr": 0.038615575462551684, + "acc_norm": 0.33774834437086093, + "acc_norm_stderr": 0.038615575462551684 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.6788990825688074, + "acc_stderr": 0.020018149772733747, + "acc_norm": 0.6788990825688074, + "acc_norm_stderr": 0.020018149772733747 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.4305555555555556, + "acc_stderr": 0.03376922151252336, + "acc_norm": 0.4305555555555556, + "acc_norm_stderr": 0.03376922151252336 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.6764705882352942, + "acc_stderr": 0.03283472056108561, + "acc_norm": 0.6764705882352942, + "acc_norm_stderr": 0.03283472056108561 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.6582278481012658, + "acc_stderr": 0.030874537537553617, + "acc_norm": 0.6582278481012658, + "acc_norm_stderr": 0.030874537537553617 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.5964125560538116, + "acc_stderr": 0.03292802819330314, + "acc_norm": 0.5964125560538116, + "acc_norm_stderr": 0.03292802819330314 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.648854961832061, + "acc_stderr": 0.04186445163013751, + "acc_norm": 0.648854961832061, + "acc_norm_stderr": 0.04186445163013751 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.6776859504132231, + "acc_stderr": 0.04266416363352168, + "acc_norm": 0.6776859504132231, + "acc_norm_stderr": 0.04266416363352168 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.6203703703703703, + "acc_stderr": 0.04691521224077742, + "acc_norm": 0.6203703703703703, + "acc_norm_stderr": 0.04691521224077742 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.6012269938650306, + "acc_stderr": 0.038470214204560246, + "acc_norm": 0.6012269938650306, + 
"acc_norm_stderr": 0.038470214204560246 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.44642857142857145, + "acc_stderr": 0.04718471485219588, + "acc_norm": 0.44642857142857145, + "acc_norm_stderr": 0.04718471485219588 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.7087378640776699, + "acc_stderr": 0.04498676320572921, + "acc_norm": 0.7087378640776699, + "acc_norm_stderr": 0.04498676320572921 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.7606837606837606, + "acc_stderr": 0.027951826808924333, + "acc_norm": 0.7606837606837606, + "acc_norm_stderr": 0.027951826808924333 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.58, + "acc_stderr": 0.04960449637488583, + "acc_norm": 0.58, + "acc_norm_stderr": 0.04960449637488583 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.7062579821200511, + "acc_stderr": 0.016287759388491672, + "acc_norm": 0.7062579821200511, + "acc_norm_stderr": 0.016287759388491672 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.5346820809248555, + "acc_stderr": 0.026854257928258896, + "acc_norm": 0.5346820809248555, + "acc_norm_stderr": 0.026854257928258896 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.25139664804469275, + "acc_stderr": 0.014508979453553979, + "acc_norm": 0.25139664804469275, + "acc_norm_stderr": 0.014508979453553979 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.5522875816993464, + "acc_stderr": 0.02847293847803353, + "acc_norm": 0.5522875816993464, + "acc_norm_stderr": 0.02847293847803353 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.6045016077170418, + "acc_stderr": 0.027770918531427838, + "acc_norm": 0.6045016077170418, + "acc_norm_stderr": 0.027770918531427838 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.5493827160493827, + "acc_stderr": 0.027684721415656196, + "acc_norm": 0.5493827160493827, + "acc_norm_stderr": 0.027684721415656196 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.4148936170212766, + "acc_stderr": 0.0293922365846125, + "acc_norm": 0.4148936170212766, + "acc_norm_stderr": 0.0293922365846125 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.35984354628422427, + "acc_stderr": 0.012258260483689802, + "acc_norm": 0.35984354628422427, + "acc_norm_stderr": 0.012258260483689802 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.5514705882352942, + "acc_stderr": 0.030211479609121596, + "acc_norm": 0.5514705882352942, + "acc_norm_stderr": 0.030211479609121596 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.5098039215686274, + "acc_stderr": 0.020223946005074295, + "acc_norm": 0.5098039215686274, + "acc_norm_stderr": 0.020223946005074295 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.5363636363636364, + "acc_stderr": 0.04776449162396197, + "acc_norm": 0.5363636363636364, + "acc_norm_stderr": 0.04776449162396197 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.6122448979591837, + "acc_stderr": 0.031192230726795656, + "acc_norm": 0.6122448979591837, + "acc_norm_stderr": 0.031192230726795656 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.746268656716418, + "acc_stderr": 0.030769444967296018, + "acc_norm": 0.746268656716418, + "acc_norm_stderr": 0.030769444967296018 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.75, + "acc_stderr": 0.04351941398892446, + "acc_norm": 0.75, + "acc_norm_stderr": 0.04351941398892446 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.39156626506024095, + 
"acc_stderr": 0.037998574544796354, + "acc_norm": 0.39156626506024095, + "acc_norm_stderr": 0.037998574544796354 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.6900584795321637, + "acc_stderr": 0.035469769593931624, + "acc_norm": 0.6900584795321637, + "acc_norm_stderr": 0.035469769593931624 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.3072215422276622, + "mc1_stderr": 0.01615020132132302, + "mc2": 0.4544484235399292, + "mc2_stderr": 0.01561416491835134 + }, + "harness|winogrande|5": { + "acc": 0.7182320441988951, + "acc_stderr": 0.012643326011852944 + }, + "harness|drop|3": { + "em": 0.0030411073825503355, + "em_stderr": 0.0005638896908753139, + "f1": 0.06545931208053697, + "f1_stderr": 0.0014724949241401988 + }, + "harness|gsm8k|5": { + "acc": 0.0667172100075815, + "acc_stderr": 0.006873340544455128 + }, + "all": { + "acc": 0.5222413455083205, + "acc_stderr": 0.03427431304408158, + "acc_norm": 0.5306253741471694, + "acc_norm_stderr": 0.03509073697849893, + "mc1": 0.3072215422276622, + "mc1_stderr": 0.01615020132132302, + "mc2": 0.4544484235399292, + "mc2_stderr": 0.01561416491835134, + "em": 0.0030411073825503355, + "em_stderr": 0.0005638896908753139, + "f1": 0.06545931208053697, + "f1_stderr": 0.0014724949241401988 + } + }, + "versions": { + "all": 0, + "harness|arc:challenge|25": 0, + "harness|drop|3": 1, + "harness|gsm8k|5": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + 
"harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "harness|winogrande|5": 0 + }, + "config_tasks": { + "harness|arc:challenge": "LM Harness task", + "harness|drop": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": 
"LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|arc:challenge|25": { + "hashes": { + "hash_examples": "17b0cae357c0259e", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "9bcd0d1d37471713", + "hash_cont_tokens": "289aa98c400841d8" + }, + "truncated": 0, + "non_truncated": 1172, + "padded": 4670, + "non_padded": 17, + "effective_few_shots": 25.0, + "num_truncated_few_shots": 0 + }, + "harness|hellaswag|10": { + "hashes": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "80b8c6d79740318e", + "hash_cont_tokens": "ac460260c3e6efc9" + }, + "truncated": 0, + "non_truncated": 10042, + "padded": 40101, + "non_padded": 67, + "effective_few_shots": 10.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hashes": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "b813d36287c6556c", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-anatomy|5": { + "hashes": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "09dc2380497f7a47", + "hash_cont_tokens": "a52a4f60d98cbe5c" + }, + "truncated": 0, + "non_truncated": 135, + "padded": 540, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-astronomy|5": { + "hashes": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "68ca3220b0fdd1f3", + "hash_cont_tokens": "10f7d8eeba97841d" + }, + "truncated": 0, + "non_truncated": 152, + "padded": 608, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-business_ethics|5": { + "hashes": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "bd14ef1320de241e", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hashes": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + 
"hash_input_tokens": "d96186ab98017c43", + "hash_cont_tokens": "edef9975ba9165b5" + }, + "truncated": 0, + "non_truncated": 265, + "padded": 1060, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_biology|5": { + "hashes": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "424136b34e95b200", + "hash_cont_tokens": "0aa103ec6602280b" + }, + "truncated": 0, + "non_truncated": 144, + "padded": 576, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_chemistry|5": { + "hashes": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "8dd8b80e336bbe54", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_computer_science|5": { + "hashes": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "145d4cef8ca2261d", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_mathematics|5": { + "hashes": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "561995d32d2b25c4", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_medicine|5": { + "hashes": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "6a258a9d4418599c", + "hash_cont_tokens": "1979021dbc698754" + }, + "truncated": 0, + "non_truncated": 173, + "padded": 692, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_physics|5": { + "hashes": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "fa5e0d5b5f97b66a", + "hash_cont_tokens": "7cf7fe2bab00acbd" + }, + "truncated": 0, + "non_truncated": 102, + "padded": 408, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-computer_security|5": { + "hashes": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "07d27397edfae492", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hashes": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "da5e6c3c8eb17da6", + "hash_cont_tokens": "903f64eed2b0d217" + }, + "truncated": 0, + "non_truncated": 235, + "padded": 940, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-econometrics|5": { + "hashes": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "f6ba8e358bdb523e", + "hash_cont_tokens": "721ae6c5302c4bf2" + }, + "truncated": 0, + "non_truncated": 114, + "padded": 456, + 
"non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hashes": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "b2459da4c5ca8590", + "hash_cont_tokens": "15a738960ed3e587" + }, + "truncated": 0, + "non_truncated": 145, + "padded": 575, + "non_padded": 5, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hashes": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "0b969d9ad706a13a", + "hash_cont_tokens": "c96470462fc71683" + }, + "truncated": 0, + "non_truncated": 378, + "padded": 1512, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-formal_logic|5": { + "hashes": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "02bc3eb5f90da86e", + "hash_cont_tokens": "0e1ce025c9d6ee7e" + }, + "truncated": 0, + "non_truncated": 126, + "padded": 504, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-global_facts|5": { + "hashes": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "3d5106918bcbeb43", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_biology|5": { + "hashes": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "7b089392db2dabbd", + "hash_cont_tokens": "e34d57f7d3c4ca16" + }, + "truncated": 0, + "non_truncated": 310, + "padded": 1240, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hashes": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "ba90b2ffed1c067d", + "hash_cont_tokens": "e8482d44df4b3740" + }, + "truncated": 0, + "non_truncated": 203, + "padded": 812, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hashes": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "60eeec309ef0717f", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hashes": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "5e5e8bf3808e0ead", + "hash_cont_tokens": "d63e679a49418339" + }, + "truncated": 0, + "non_truncated": 165, + "padded": 656, + "non_padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_geography|5": { + "hashes": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "4da9b741d4e7ea78", + "hash_cont_tokens": "d78483e286d06f1a" + }, + "truncated": 0, + "non_truncated": 198, + "padded": 792, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + 
"harness|hendrycksTest-high_school_government_and_politics|5": { + "hashes": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "acb4bc872ac86ed7", + "hash_cont_tokens": "691cdff71ff5fe57" + }, + "truncated": 0, + "non_truncated": 193, + "padded": 772, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hashes": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "840fc6403eb69ab0", + "hash_cont_tokens": "d5ad4c5bdca967ad" + }, + "truncated": 0, + "non_truncated": 390, + "padded": 1560, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hashes": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "3629a7f2cd17faeb", + "hash_cont_tokens": "8f631ca5687dd0d4" + }, + "truncated": 0, + "non_truncated": 270, + "padded": 1080, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hashes": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "6846f684260e3997", + "hash_cont_tokens": "7321048a28451473" + }, + "truncated": 0, + "non_truncated": 238, + "padded": 952, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_physics|5": { + "hashes": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "85aee25d6bdad94a", + "hash_cont_tokens": "bb137581f269861c" + }, + "truncated": 0, + "non_truncated": 151, + "padded": 604, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hashes": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "290b66d6d666a35f", + "hash_cont_tokens": "b455cab2675bd863" + }, + "truncated": 0, + "non_truncated": 545, + "padded": 2180, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hashes": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "a77a7668b437bc82", + "hash_cont_tokens": "1b3196fec7e58037" + }, + "truncated": 0, + "non_truncated": 216, + "padded": 864, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hashes": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "63548c7fa9ba7a78", + "hash_cont_tokens": "a331dedc2aa01b3e" + }, + "truncated": 0, + "non_truncated": 204, + "padded": 816, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hashes": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "83c5da18bfa50812", + "hash_cont_tokens": "d0fbe030b8c8c2bf" + }, + "truncated": 0, + "non_truncated": 237, + "padded": 948, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_aging|5": { + "hashes": { + "hash_examples": 
"1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "bebbd11f22006685", + "hash_cont_tokens": "1dd29c3755494850" + }, + "truncated": 0, + "non_truncated": 223, + "padded": 892, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_sexuality|5": { + "hashes": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "7b85ee9b8ee54f4f", + "hash_cont_tokens": "c85573f663c10691" + }, + "truncated": 0, + "non_truncated": 131, + "padded": 524, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-international_law|5": { + "hashes": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "7bfc55ab7065943e", + "hash_cont_tokens": "d263804ba918154f" + }, + "truncated": 0, + "non_truncated": 121, + "padded": 484, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-jurisprudence|5": { + "hashes": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "69573f1675e053c6", + "hash_cont_tokens": "581986691a84ece8" + }, + "truncated": 0, + "non_truncated": 108, + "padded": 432, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hashes": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "552324ef20094bdc", + "hash_cont_tokens": "55a858b28bbda458" + }, + "truncated": 0, + "non_truncated": 163, + "padded": 652, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-machine_learning|5": { + "hashes": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "96449357a7318905", + "hash_cont_tokens": "e99d3d3efd4ac7a3" + }, + "truncated": 0, + "non_truncated": 112, + "padded": 448, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-management|5": { + "hashes": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "3b849249168e3b88", + "hash_cont_tokens": "13d9dc56bca34726" + }, + "truncated": 0, + "non_truncated": 103, + "padded": 412, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-marketing|5": { + "hashes": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "af0e186f2756b70d", + "hash_cont_tokens": "2700ea26933916a2" + }, + "truncated": 0, + "non_truncated": 234, + "padded": 936, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-medical_genetics|5": { + "hashes": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "9f6a6de16509b6d9", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-miscellaneous|5": { + "hashes": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "9194406d589f7c10", + "hash_cont_tokens": "7bf4341c79587250" + }, + "truncated": 0, + 
"non_truncated": 783, + "padded": 3132, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_disputes|5": { + "hashes": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "769486efc74d9f8e", + "hash_cont_tokens": "38a48e9de6976f00" + }, + "truncated": 0, + "non_truncated": 346, + "padded": 1384, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hashes": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "a90fd4dd90959dad", + "hash_cont_tokens": "761c4dc187689d89" + }, + "truncated": 0, + "non_truncated": 895, + "padded": 3580, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-nutrition|5": { + "hashes": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "1a3b843e66efd29b", + "hash_cont_tokens": "65005bd7d6f6012a" + }, + "truncated": 0, + "non_truncated": 306, + "padded": 1224, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-philosophy|5": { + "hashes": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "09820001a3d00013", + "hash_cont_tokens": "0b47934fb6314dec" + }, + "truncated": 0, + "non_truncated": 311, + "padded": 1244, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-prehistory|5": { + "hashes": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "7c4ec364ce2768c7", + "hash_cont_tokens": "3f20acd855ee0a29" + }, + "truncated": 0, + "non_truncated": 324, + "padded": 1296, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_accounting|5": { + "hashes": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "ced0534574d0ae3f", + "hash_cont_tokens": "8f122ba881355d4b" + }, + "truncated": 0, + "non_truncated": 282, + "padded": 1128, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_law|5": { + "hashes": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "bcbdbbde22ec73e3", + "hash_cont_tokens": "90d5df417c4d3fd3" + }, + "truncated": 0, + "non_truncated": 1534, + "padded": 6136, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_medicine|5": { + "hashes": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "c54d753563114d45", + "hash_cont_tokens": "4a2d2988884f7f70" + }, + "truncated": 0, + "non_truncated": 272, + "padded": 1088, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_psychology|5": { + "hashes": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "b75dc55c0e32fa52", + "hash_cont_tokens": "e0a952cb8a9c81de" + }, + "truncated": 0, + "non_truncated": 612, + "padded": 2448, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + 
"harness|hendrycksTest-public_relations|5": { + "hashes": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "5ccdc8ec8db99622", + "hash_cont_tokens": "1fa77a8dff3922b8" + }, + "truncated": 0, + "non_truncated": 110, + "padded": 440, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-security_studies|5": { + "hashes": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "ca8497342e5b1d57", + "hash_cont_tokens": "81fc9cb3cbdd52db" + }, + "truncated": 0, + "non_truncated": 245, + "padded": 980, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-sociology|5": { + "hashes": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "069c76424fbd3dab", + "hash_cont_tokens": "2a0493252ed2cf43" + }, + "truncated": 0, + "non_truncated": 201, + "padded": 804, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hashes": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "a7e393a626169576", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-virology|5": { + "hashes": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "bf99dc973e3a650d", + "hash_cont_tokens": "5ab892d003b00c98" + }, + "truncated": 0, + "non_truncated": 166, + "padded": 664, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-world_religions|5": { + "hashes": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "1761cfaf21797065", + "hash_cont_tokens": "15a5e5dbdfbb8568" + }, + "truncated": 0, + "non_truncated": 171, + "padded": 684, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|truthfulqa:mc|0": { + "hashes": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "298b43914bbdf4ca", + "hash_cont_tokens": "5a8d4bb398b1c3c0" + }, + "truncated": 0, + "non_truncated": 817, + "padded": 9996, + "non_padded": 0, + "effective_few_shots": 0.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "31aa3477d959f771", + "hash_cont_tokens": "618558fb93c0f288" + }, + "truncated": 0, + "non_truncated": 1267, + "padded": 2534, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|drop|3": { + "hashes": { + "hash_examples": "1d27416e8324e9a3", + "hash_full_prompts": "a5513ff9a741b385", + "hash_input_tokens": "a4fb946366902edf", + "hash_cont_tokens": "96286e9793e4e1a4" + }, + "truncated": 0, + "non_truncated": 9536, + "padded": 0, + "non_padded": 9536, + "effective_few_shots": 3.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "6af0ae8cfe684f50", + "hash_cont_tokens": "a48145172cfc43ff" + }, + "truncated": 0, + 
"non_truncated": 1319, + "padded": 0, + "non_padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "4eb459f19fc0f29d", + "hash_full_prompts": "21653ed56f202b4e", + "hash_input_tokens": "0ce409b3d436569d", + "hash_cont_tokens": "35588b1bd2b52ee1" + }, + "truncated": 0, + "non_truncated": 38195, + "padded": 113460, + "non_padded": 10948, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/prithivida/Asimov-7B-v2/results_2023-12-03T19-02-27.334666.json b/eval-results/prithivida/Asimov-7B-v2/results_2023-12-03T19-02-27.334666.json new file mode 100644 index 0000000000000000000000000000000000000000..1f3d72261d9947fea0df016fa6f2b29727db9cb7 --- /dev/null +++ b/eval-results/prithivida/Asimov-7B-v2/results_2023-12-03T19-02-27.334666.json @@ -0,0 +1,63 @@ +{ + "config_general": { + "lighteval_sha": "b35d4d84573be82d91c07ea46260f262f72cf69d", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "", + "start_time": 82176.309001891, + "end_time": 85015.069307528, + "total_evaluation_time_secondes": "2838.760305636999", + "model_name": "prithivida/Asimov-7B-v2", + "model_sha": "0aeea2284ac78cac081bee88e5a98a19bb987227", + "model_dtype": "4bit", + "model_size": "4.24 GB" + }, + "results": { + "harness|gsm8k|5": { + "acc": 0.10917361637604246, + "acc_stderr": 0.008590089300511155 + }, + "all": { + "acc": 0.10917361637604246, + "acc_stderr": 0.008590089300511155 + } + }, + "versions": { + "all": 0, + "harness|gsm8k|5": 0 + }, + "config_tasks": { + "harness|gsm8k": "LM Harness task" + }, + "summary_tasks": { + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "6af0ae8cfe684f50", + "hash_cont_tokens": "a48145172cfc43ff" + }, + "truncated": 0, + "non_truncated": 1319, + "padded": 0, + "non_padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "18b756b7813d1bdf", + "hash_full_prompts": "deb3b1dff10b95aa", + "hash_input_tokens": "f17391d49d33b9c0", + "hash_cont_tokens": "3da40463318e6272" + }, + "truncated": 0, + "non_truncated": 1319, + "padded": 0, + "non_padded": 1319, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/s1ghhh/medllama-2-70b-qlora-1.1/results_2023-10-09T01-34-27.623935.json b/eval-results/s1ghhh/medllama-2-70b-qlora-1.1/results_2023-10-09T01-34-27.623935.json new file mode 100644 index 0000000000000000000000000000000000000000..2e8549d65326c1a4cee7e1e02600a5593b0a9cea --- /dev/null +++ b/eval-results/s1ghhh/medllama-2-70b-qlora-1.1/results_2023-10-09T01-34-27.623935.json @@ -0,0 +1,1367 @@ +{ + "config_general": { + "model_name": "s1ghhh/medllama-2-70b-qlora-1.1", + "model_sha": "d55e05e9d67418c639933c85a5b9d17c6f531a92", + "model_size": "128.64 GB", + "model_dtype": "torch.float16", + "lighteval_sha": "0f318ecf002208468154899217b3ba7c6ae09374", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|arc:challenge|25": { + "acc": 0.643344709897611, + "acc_stderr": 0.013998056902620197, + "acc_norm": 0.6902730375426621, + "acc_norm_stderr": 0.013512058415238361 + }, + "harness|hellaswag|10": { + "acc": 0.6767576180043816, + "acc_stderr": 0.004667585072717502, + "acc_norm": 0.8717386974706234, + 
"acc_norm_stderr": 0.003336971535131172 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.36, + "acc_stderr": 0.04824181513244218, + "acc_norm": 0.36, + "acc_norm_stderr": 0.04824181513244218 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.6370370370370371, + "acc_stderr": 0.04153948404742399, + "acc_norm": 0.6370370370370371, + "acc_norm_stderr": 0.04153948404742399 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.7894736842105263, + "acc_stderr": 0.03317672787533157, + "acc_norm": 0.7894736842105263, + "acc_norm_stderr": 0.03317672787533157 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.76, + "acc_stderr": 0.04292346959909283, + "acc_norm": 0.76, + "acc_norm_stderr": 0.04292346959909283 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.7622641509433963, + "acc_stderr": 0.026199808807561918, + "acc_norm": 0.7622641509433963, + "acc_norm_stderr": 0.026199808807561918 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.8125, + "acc_stderr": 0.032639560491693344, + "acc_norm": 0.8125, + "acc_norm_stderr": 0.032639560491693344 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.48, + "acc_stderr": 0.050211673156867795, + "acc_norm": 0.48, + "acc_norm_stderr": 0.050211673156867795 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.55, + "acc_stderr": 0.049999999999999996, + "acc_norm": 0.55, + "acc_norm_stderr": 0.049999999999999996 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.44, + "acc_stderr": 0.0498887651569859, + "acc_norm": 0.44, + "acc_norm_stderr": 0.0498887651569859 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.7052023121387283, + "acc_stderr": 0.03476599607516477, + "acc_norm": 0.7052023121387283, + "acc_norm_stderr": 0.03476599607516477 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.3431372549019608, + "acc_stderr": 0.04724007352383888, + "acc_norm": 0.3431372549019608, + "acc_norm_stderr": 0.04724007352383888 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.74, + "acc_stderr": 0.044084400227680794, + "acc_norm": 0.74, + "acc_norm_stderr": 0.044084400227680794 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.676595744680851, + "acc_stderr": 0.030579442773610334, + "acc_norm": 0.676595744680851, + "acc_norm_stderr": 0.030579442773610334 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.45614035087719296, + "acc_stderr": 0.046854730419077895, + "acc_norm": 0.45614035087719296, + "acc_norm_stderr": 0.046854730419077895 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.6413793103448275, + "acc_stderr": 0.03996629574876719, + "acc_norm": 0.6413793103448275, + "acc_norm_stderr": 0.03996629574876719 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.4444444444444444, + "acc_stderr": 0.025591857761382182, + "acc_norm": 0.4444444444444444, + "acc_norm_stderr": 0.025591857761382182 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.48412698412698413, + "acc_stderr": 0.04469881854072606, + "acc_norm": 0.48412698412698413, + "acc_norm_stderr": 0.04469881854072606 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.51, + "acc_stderr": 0.05024183937956911, + "acc_norm": 0.51, + "acc_norm_stderr": 0.05024183937956911 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.8258064516129032, + "acc_stderr": 0.021576248184514587, + "acc_norm": 0.8258064516129032, + "acc_norm_stderr": 0.021576248184514587 + }, + 
"harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.5369458128078818, + "acc_stderr": 0.035083705204426656, + "acc_norm": 0.5369458128078818, + "acc_norm_stderr": 0.035083705204426656 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.75, + "acc_stderr": 0.04351941398892446, + "acc_norm": 0.75, + "acc_norm_stderr": 0.04351941398892446 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.8484848484848485, + "acc_stderr": 0.027998073798781678, + "acc_norm": 0.8484848484848485, + "acc_norm_stderr": 0.027998073798781678 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.898989898989899, + "acc_stderr": 0.021469735576055346, + "acc_norm": 0.898989898989899, + "acc_norm_stderr": 0.021469735576055346 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.9430051813471503, + "acc_stderr": 0.016731085293607548, + "acc_norm": 0.9430051813471503, + "acc_norm_stderr": 0.016731085293607548 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.7307692307692307, + "acc_stderr": 0.022489389793654817, + "acc_norm": 0.7307692307692307, + "acc_norm_stderr": 0.022489389793654817 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.37037037037037035, + "acc_stderr": 0.02944316932303154, + "acc_norm": 0.37037037037037035, + "acc_norm_stderr": 0.02944316932303154 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.8025210084033614, + "acc_stderr": 0.02585916412205146, + "acc_norm": 0.8025210084033614, + "acc_norm_stderr": 0.02585916412205146 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.4370860927152318, + "acc_stderr": 0.04050035722230636, + "acc_norm": 0.4370860927152318, + "acc_norm_stderr": 0.04050035722230636 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.9045871559633027, + "acc_stderr": 0.012595899282335793, + "acc_norm": 0.9045871559633027, + "acc_norm_stderr": 0.012595899282335793 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.6712962962962963, + "acc_stderr": 0.032036140846700596, + "acc_norm": 0.6712962962962963, + "acc_norm_stderr": 0.032036140846700596 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.9264705882352942, + "acc_stderr": 0.01831885585008968, + "acc_norm": 0.9264705882352942, + "acc_norm_stderr": 0.01831885585008968 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.8987341772151899, + "acc_stderr": 0.019637720526065494, + "acc_norm": 0.8987341772151899, + "acc_norm_stderr": 0.019637720526065494 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.7892376681614349, + "acc_stderr": 0.02737309550054019, + "acc_norm": 0.7892376681614349, + "acc_norm_stderr": 0.02737309550054019 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.8854961832061069, + "acc_stderr": 0.027927473753597446, + "acc_norm": 0.8854961832061069, + "acc_norm_stderr": 0.027927473753597446 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.8760330578512396, + "acc_stderr": 0.030083098716035216, + "acc_norm": 0.8760330578512396, + "acc_norm_stderr": 0.030083098716035216 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.8148148148148148, + "acc_stderr": 0.03755265865037182, + "acc_norm": 0.8148148148148148, + "acc_norm_stderr": 0.03755265865037182 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.8098159509202454, + "acc_stderr": 0.03083349114628124, + "acc_norm": 0.8098159509202454, + 
"acc_norm_stderr": 0.03083349114628124 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.5446428571428571, + "acc_stderr": 0.04726835553719097, + "acc_norm": 0.5446428571428571, + "acc_norm_stderr": 0.04726835553719097 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.8252427184466019, + "acc_stderr": 0.0376017800602662, + "acc_norm": 0.8252427184466019, + "acc_norm_stderr": 0.0376017800602662 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.9358974358974359, + "acc_stderr": 0.01604626163167314, + "acc_norm": 0.9358974358974359, + "acc_norm_stderr": 0.01604626163167314 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.77, + "acc_stderr": 0.042295258468165065, + "acc_norm": 0.77, + "acc_norm_stderr": 0.042295258468165065 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.8697318007662835, + "acc_stderr": 0.012036729568216055, + "acc_norm": 0.8697318007662835, + "acc_norm_stderr": 0.012036729568216055 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.8179190751445087, + "acc_stderr": 0.020776761102512965, + "acc_norm": 0.8179190751445087, + "acc_norm_stderr": 0.020776761102512965 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.4960893854748603, + "acc_stderr": 0.016721990073156657, + "acc_norm": 0.4960893854748603, + "acc_norm_stderr": 0.016721990073156657 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.7712418300653595, + "acc_stderr": 0.024051029739912255, + "acc_norm": 0.7712418300653595, + "acc_norm_stderr": 0.024051029739912255 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.8006430868167203, + "acc_stderr": 0.022691033780549656, + "acc_norm": 0.8006430868167203, + "acc_norm_stderr": 0.022691033780549656 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.8611111111111112, + "acc_stderr": 0.019242526226544543, + "acc_norm": 0.8611111111111112, + "acc_norm_stderr": 0.019242526226544543 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.5780141843971631, + "acc_stderr": 0.029462189233370593, + "acc_norm": 0.5780141843971631, + "acc_norm_stderr": 0.029462189233370593 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.5730117340286832, + "acc_stderr": 0.012633353557534414, + "acc_norm": 0.5730117340286832, + "acc_norm_stderr": 0.012633353557534414 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.8014705882352942, + "acc_stderr": 0.0242310133705411, + "acc_norm": 0.8014705882352942, + "acc_norm_stderr": 0.0242310133705411 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.7712418300653595, + "acc_stderr": 0.016992723465466215, + "acc_norm": 0.7712418300653595, + "acc_norm_stderr": 0.016992723465466215 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.7363636363636363, + "acc_stderr": 0.04220224692971987, + "acc_norm": 0.7363636363636363, + "acc_norm_stderr": 0.04220224692971987 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.7918367346938775, + "acc_stderr": 0.025991117672813296, + "acc_norm": 0.7918367346938775, + "acc_norm_stderr": 0.025991117672813296 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.8955223880597015, + "acc_stderr": 0.021628920516700643, + "acc_norm": 0.8955223880597015, + "acc_norm_stderr": 0.021628920516700643 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.92, + "acc_stderr": 0.0272659924344291, + "acc_norm": 0.92, + "acc_norm_stderr": 0.0272659924344291 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.5602409638554217, + 
"acc_stderr": 0.03864139923699122, + "acc_norm": 0.5602409638554217, + "acc_norm_stderr": 0.03864139923699122 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.8596491228070176, + "acc_stderr": 0.0266405825391332, + "acc_norm": 0.8596491228070176, + "acc_norm_stderr": 0.0266405825391332 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.3635250917992656, + "mc1_stderr": 0.016838862883965838, + "mc2": 0.5240600477973963, + "mc2_stderr": 0.014676584418259612 + }, + "all": { + "acc": 0.708690489375333, + "acc_stderr": 0.030558651569776513, + "acc_norm": 0.712790648817897, + "acc_norm_stderr": 0.030527861535455017, + "mc1": 0.3635250917992656, + "mc1_stderr": 0.016838862883965838, + "mc2": 0.5240600477973963, + "mc2_stderr": 0.014676584418259612 + } + }, + "versions": { + "harness|arc:challenge|25": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + 
"harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "all": 0 + }, + "config_tasks": { + "harness|arc:challenge": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM 
Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task" + }, + "summary_tasks": { + "harness|arc:challenge|25": { + "hashes": { + "hash_examples": "17b0cae357c0259e", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "3722289b79076c44", + "hash_cont_tokens": "e8abf848493b50f7" + }, + "truncated": 0, + "non-truncated": 4687, + "padded": 4687, + "non-padded": 0, + "effective_few_shots": 25.0, + "num_truncated_few_shots": 0 + }, + "harness|hellaswag|10": { + "hashes": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "ececd684171f1ef2", + "hash_cont_tokens": "9fe0a5c42e1532db" + }, + "truncated": 0, + "non-truncated": 40168, + "padded": 40113, + "non-padded": 55, + "effective_few_shots": 10.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hashes": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "c54ff61ad0273dd7", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-anatomy|5": { + "hashes": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "be31a1e22aef5f90", + "hash_cont_tokens": "f11971a765cb609f" + }, + "truncated": 0, + "non-truncated": 540, + "padded": 540, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-astronomy|5": { + "hashes": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "277a7b1fad566940", + "hash_cont_tokens": "440a970fadecdc7b" + }, + "truncated": 0, + "non-truncated": 608, + "padded": 608, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-business_ethics|5": { + "hashes": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "ba552605bc116de5", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hashes": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "428c7563d0b98ab9", + "hash_cont_tokens": "7ecd60c25b9bfe5b" + }, + "truncated": 0, + "non-truncated": 1060, + "padded": 1060, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_biology|5": { + "hashes": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "da036601573942e2", + "hash_cont_tokens": "875cde3af7a0ee14" + }, + "truncated": 0, + "non-truncated": 576, + "padded": 576, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_chemistry|5": { + "hashes": 
{ + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "94e0196d6aded13d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_computer_science|5": { + "hashes": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "6e4d0f4a8d36690b", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_mathematics|5": { + "hashes": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "614054d17109a25d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_medicine|5": { + "hashes": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "081bb2b524defd1c", + "hash_cont_tokens": "702fb6d82ff0d6ac" + }, + "truncated": 0, + "non-truncated": 692, + "padded": 692, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_physics|5": { + "hashes": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "5421d9a1af86cbd4", + "hash_cont_tokens": "f7b8097afc16a47c" + }, + "truncated": 0, + "non-truncated": 408, + "padded": 408, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-computer_security|5": { + "hashes": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "5e6b70ecb333cf18", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hashes": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "c2ef11a87264ceed", + "hash_cont_tokens": "aa0e8bc655f2f641" + }, + "truncated": 0, + "non-truncated": 940, + "padded": 940, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-econometrics|5": { + "hashes": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "ecaccd912a4c3978", + "hash_cont_tokens": "b1cc6e7e9fcd3827" + }, + "truncated": 0, + "non-truncated": 456, + "padded": 456, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hashes": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "1590c84291399be8", + "hash_cont_tokens": "2425a3f084a591ef" + }, + "truncated": 0, + "non-truncated": 580, + "padded": 580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hashes": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "3269597f715b0da1", + 
"hash_cont_tokens": "bd87bf0c060fd925" + }, + "truncated": 0, + "non-truncated": 1512, + "padded": 1512, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-formal_logic|5": { + "hashes": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "a2800d20f3ab8d7c", + "hash_cont_tokens": "eb8932890e0605db" + }, + "truncated": 0, + "non-truncated": 504, + "padded": 504, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-global_facts|5": { + "hashes": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "94ed44b3772505ad", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_biology|5": { + "hashes": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "24423acb928db768", + "hash_cont_tokens": "1ddcb86d28cde266" + }, + "truncated": 0, + "non-truncated": 1240, + "padded": 1240, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hashes": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "831ff35c474e5cef", + "hash_cont_tokens": "176c8dcff38c5f8f" + }, + "truncated": 0, + "non-truncated": 812, + "padded": 812, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hashes": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "a20a96b44dcc5b30", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hashes": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "5002f4ac8b1562ca", + "hash_cont_tokens": "674fc454bdc5ac93" + }, + "truncated": 0, + "non-truncated": 660, + "padded": 656, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_geography|5": { + "hashes": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "7c5547c7da5bc793", + "hash_cont_tokens": "03a5012b916274ea" + }, + "truncated": 0, + "non-truncated": 792, + "padded": 792, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hashes": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "f62991cb6a496b05", + "hash_cont_tokens": "873d2aab226ba1d8" + }, + "truncated": 0, + "non-truncated": 772, + "padded": 772, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hashes": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "4cef2aff6e3d59ed", + "hash_cont_tokens": "c583432ad27fcfe0" + }, + "truncated": 0, + "non-truncated": 1560, + "padded": 
1560, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hashes": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "6e2577ea4082ed2b", + "hash_cont_tokens": "d7907b61bcb8c123" + }, + "truncated": 0, + "non-truncated": 1080, + "padded": 1080, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hashes": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "c5fc9aeb1079c8e4", + "hash_cont_tokens": "f47f041de50333b9" + }, + "truncated": 0, + "non-truncated": 952, + "padded": 952, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_physics|5": { + "hashes": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "555fc385cffa84ca", + "hash_cont_tokens": "0d56317b3e5eedb5" + }, + "truncated": 0, + "non-truncated": 604, + "padded": 604, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hashes": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "febd23cbf9973b7f", + "hash_cont_tokens": "09ba1243e7390c0f" + }, + "truncated": 0, + "non-truncated": 2180, + "padded": 2180, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hashes": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "400e55b56ee6fbd7", + "hash_cont_tokens": "9cc29889c3d3f77d" + }, + "truncated": 0, + "non-truncated": 864, + "padded": 864, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hashes": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "c639cce12a46ebad", + "hash_cont_tokens": "cdd0b3dc06d933e5" + }, + "truncated": 0, + "non-truncated": 816, + "padded": 816, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hashes": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "b9762065cce6f3a6", + "hash_cont_tokens": "e02816433ff28daf" + }, + "truncated": 0, + "non-truncated": 948, + "padded": 948, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_aging|5": { + "hashes": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "541a75f071dcf579", + "hash_cont_tokens": "142a4a8a1138a214" + }, + "truncated": 0, + "non-truncated": 892, + "padded": 892, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_sexuality|5": { + "hashes": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "04269e5c5a257dd9", + "hash_cont_tokens": "bc54813e809b796d" + }, + "truncated": 0, + "non-truncated": 524, + "padded": 524, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + 
"harness|hendrycksTest-international_law|5": { + "hashes": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "d93ba9d9d38e4397", + "hash_cont_tokens": "8ea8c5ff76a15bca" + }, + "truncated": 0, + "non-truncated": 484, + "padded": 484, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-jurisprudence|5": { + "hashes": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "9eeaccd2698b4f5a", + "hash_cont_tokens": "e3a8cd951b6e3469" + }, + "truncated": 0, + "non-truncated": 432, + "padded": 432, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hashes": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "b4f08f544f2b7576", + "hash_cont_tokens": "3e9e0bdc248fd88a" + }, + "truncated": 0, + "non-truncated": 652, + "padded": 648, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-machine_learning|5": { + "hashes": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "900c2a51f1174b9f", + "hash_cont_tokens": "55b12fb138c6a064" + }, + "truncated": 0, + "non-truncated": 448, + "padded": 448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-management|5": { + "hashes": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "6b36efb4689c6eca", + "hash_cont_tokens": "a01d6d39a83c4597" + }, + "truncated": 0, + "non-truncated": 412, + "padded": 412, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-marketing|5": { + "hashes": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "2aaac78a0cfed47a", + "hash_cont_tokens": "6aeaed4d823c98aa" + }, + "truncated": 0, + "non-truncated": 936, + "padded": 936, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-medical_genetics|5": { + "hashes": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "886ca823b41c094a", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-miscellaneous|5": { + "hashes": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "72fd71de7675e7d0", + "hash_cont_tokens": "9b0ab02a64603081" + }, + "truncated": 0, + "non-truncated": 3132, + "padded": 3132, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_disputes|5": { + "hashes": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "f3ca0dd8e7a1eb09", + "hash_cont_tokens": "3b8bbe9108e55ce9" + }, + "truncated": 0, + "non-truncated": 1384, + "padded": 1354, + "non-padded": 30, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hashes": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": 
"3e793631e951f23c", + "hash_cont_tokens": "3e9bfc0362e97330" + }, + "truncated": 0, + "non-truncated": 3580, + "padded": 3580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-nutrition|5": { + "hashes": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "59753c2144ea93af", + "hash_cont_tokens": "23b2dc6ee2da4cfc" + }, + "truncated": 0, + "non-truncated": 1224, + "padded": 1224, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-philosophy|5": { + "hashes": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "bd8d3dbed15a8c34", + "hash_cont_tokens": "9f6ff69d23a48783" + }, + "truncated": 0, + "non-truncated": 1244, + "padded": 1244, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-prehistory|5": { + "hashes": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "3573cd87facbb7c5", + "hash_cont_tokens": "d6458d743d875837" + }, + "truncated": 0, + "non-truncated": 1296, + "padded": 1296, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_accounting|5": { + "hashes": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "17e721bc1a7cbb47", + "hash_cont_tokens": "922a195f53a35662" + }, + "truncated": 0, + "non-truncated": 1128, + "padded": 1128, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_law|5": { + "hashes": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "c9f7583fff66d361", + "hash_cont_tokens": "2e590029ef41fbcd" + }, + "truncated": 0, + "non-truncated": 6136, + "padded": 6136, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_medicine|5": { + "hashes": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "40a933f829116f8d", + "hash_cont_tokens": "7cfee54dbddd5a98" + }, + "truncated": 0, + "non-truncated": 1088, + "padded": 1088, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_psychology|5": { + "hashes": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "0dfb73a8eb3f692c", + "hash_cont_tokens": "a86677b2a45c20e1" + }, + "truncated": 0, + "non-truncated": 2448, + "padded": 2448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-public_relations|5": { + "hashes": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "1710c6ba4c9f3cbd", + "hash_cont_tokens": "0d756ccaae031757" + }, + "truncated": 0, + "non-truncated": 440, + "padded": 440, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-security_studies|5": { + "hashes": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "32a03f1f22a6e103", + "hash_cont_tokens": "b2229bc2cfbf594b" + }, + "truncated": 0, + "non-truncated": 980, + "padded": 980, + "non-padded": 0, + 
"effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-sociology|5": { + "hashes": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "828999f7624cbe7e", + "hash_cont_tokens": "c3a3bdfd177eed5b" + }, + "truncated": 0, + "non-truncated": 804, + "padded": 804, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hashes": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "42054621e718dbee", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-virology|5": { + "hashes": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "6c4f0aa4dc859c04", + "hash_cont_tokens": "af8b3658088cb37f" + }, + "truncated": 0, + "non-truncated": 664, + "padded": 664, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-world_religions|5": { + "hashes": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "6c75d44e092ff24f", + "hash_cont_tokens": "060118bef6de4e0a" + }, + "truncated": 0, + "non-truncated": 684, + "padded": 684, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|truthfulqa:mc|0": { + "hashes": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "2738d7ed7075faa7", + "hash_cont_tokens": "f5da56a132aab151" + }, + "truncated": 0, + "non-truncated": 9996, + "padded": 9996, + "non-padded": 0, + "effective_few_shots": 0.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "d84d18e9a963753d", + "hash_full_prompts": "12b540783521a8e6", + "hash_input_tokens": "5c73a7dce6ccf737", + "hash_cont_tokens": "71d56183130fecbd" + }, + "total_evaluation_time_secondes": "46057.53097200394", + "truncated": 0, + "non-truncated": 111019, + "padded": 110926, + "non-padded": 93, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/s1ghhh/medllama-2-70b-qlora-1.1/results_2023-10-28T23-37-36.261412.json b/eval-results/s1ghhh/medllama-2-70b-qlora-1.1/results_2023-10-28T23-37-36.261412.json new file mode 100644 index 0000000000000000000000000000000000000000..3a6666bacdb76c5cf8dc8d9558be50d4f4105b21 --- /dev/null +++ b/eval-results/s1ghhh/medllama-2-70b-qlora-1.1/results_2023-10-28T23-37-36.261412.json @@ -0,0 +1,107 @@ +{ + "config_general": { + "model_name": "s1ghhh/medllama-2-70b-qlora-1.1", + "model_sha": "d55e05e9d67418c639933c85a5b9d17c6f531a92", + "model_size": "128.64 GB", + "model_dtype": "torch.float16", + "lighteval_sha": "0f318ecf002208468154899217b3ba7c6ae09374", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|drop|3": { + "em": 0.4476719798657718, + "em_stderr": 0.005092348829658167, + "f1": 0.49099203020134397, + "f1_stderr": 0.004914477006067904 + }, + "harness|gsm8k|5": { + "acc": 0.3206974981046247, + "acc_stderr": 0.012856468433722304 + }, + "harness|winogrande|5": { + "acc": 0.8421468034727704, + "acc_stderr": 0.010247165248719763 + }, + "all": { + "em": 0.4476719798657718, + 
"em_stderr": 0.005092348829658167, + "f1": 0.49099203020134397, + "f1_stderr": 0.004914477006067904, + "acc": 0.5814221507886975, + "acc_stderr": 0.011551816841221033 + } + }, + "versions": { + "harness|drop|3": 1, + "harness|gsm8k|5": 0, + "harness|winogrande|5": 0, + "all": 0 + }, + "config_tasks": { + "harness|drop": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|drop|3": { + "hashes": { + "hash_examples": "1d27416e8324e9a3", + "hash_full_prompts": "a5513ff9a741b385", + "hash_input_tokens": "42076f0efbb50aa6", + "hash_cont_tokens": "1e52b3a7ac1341ad" + }, + "truncated": 3, + "non-truncated": 9533, + "padded": 0, + "non-padded": 9536, + "effective_few_shots": 3.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "bda342e47b5099b2", + "hash_cont_tokens": "784d28e670cfd098" + }, + "truncated": 0, + "non-truncated": 1319, + "padded": 0, + "non-padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "c0bedf98cb040854", + "hash_cont_tokens": "f08975ad6f2d5864" + }, + "truncated": 0, + "non-truncated": 2534, + "padded": 2432, + "non-padded": 102, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "9b4d8993161e637d", + "hash_full_prompts": "08215e527b7e60a5", + "hash_input_tokens": "a12f3e3c934bd78b", + "hash_cont_tokens": "d9c96549d50638f4" + }, + "total_evaluation_time_secondes": "34982.29192328453", + "truncated": 3, + "non-truncated": 13386, + "padded": 2432, + "non-padded": 10957, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/sequelbox/DaringFortitude/results_2023-11-15T00-35-47.431209.json b/eval-results/sequelbox/DaringFortitude/results_2023-11-15T00-35-47.431209.json new file mode 100644 index 0000000000000000000000000000000000000000..5ea2504a042659d291b21031b39af70a7e3520b5 --- /dev/null +++ b/eval-results/sequelbox/DaringFortitude/results_2023-11-15T00-35-47.431209.json @@ -0,0 +1,1435 @@ +{ + "config_general": { + "lighteval_sha": "9ffc410f6c40b8cfefe7167cb47aefe69ced61e1", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "", + "start_time": 2545527.35802311, + "end_time": 2563496.18360011, + "total_evaluation_time_secondes": "17968.825577000156", + "model_name": "sequelbox/DaringFortitude", + "model_sha": "0c463888cd83b7acebd7b6fb961562e11402e47d", + "model_dtype": "torch.float16", + "model_size": "24.32 GB" + }, + "results": { + "harness|arc:challenge|25": { + "acc": 0.6032423208191127, + "acc_stderr": 0.01429651302018063, + "acc_norm": 0.6348122866894198, + "acc_norm_stderr": 0.014070265519268802 + }, + "harness|hellaswag|10": { + "acc": 0.6360286795459071, + "acc_stderr": 0.004801572028920796, + "acc_norm": 0.8355905198167696, + "acc_norm_stderr": 0.003698892388380099 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.29, + "acc_stderr": 0.045604802157206845, + "acc_norm": 0.29, + "acc_norm_stderr": 0.045604802157206845 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.5259259259259259, + "acc_stderr": 0.04313531696750575, + "acc_norm": 0.5259259259259259, + "acc_norm_stderr": 
0.04313531696750575 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.618421052631579, + "acc_stderr": 0.03953173377749194, + "acc_norm": 0.618421052631579, + "acc_norm_stderr": 0.03953173377749194 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.57, + "acc_stderr": 0.049756985195624284, + "acc_norm": 0.57, + "acc_norm_stderr": 0.049756985195624284 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.6226415094339622, + "acc_stderr": 0.029832808114796005, + "acc_norm": 0.6226415094339622, + "acc_norm_stderr": 0.029832808114796005 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.6597222222222222, + "acc_stderr": 0.039621355734862175, + "acc_norm": 0.6597222222222222, + "acc_norm_stderr": 0.039621355734862175 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.41, + "acc_stderr": 0.049431107042371025, + "acc_norm": 0.41, + "acc_norm_stderr": 0.049431107042371025 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.55, + "acc_stderr": 0.05, + "acc_norm": 0.55, + "acc_norm_stderr": 0.05 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.35, + "acc_stderr": 0.047937248544110196, + "acc_norm": 0.35, + "acc_norm_stderr": 0.047937248544110196 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.5953757225433526, + "acc_stderr": 0.03742461193887248, + "acc_norm": 0.5953757225433526, + "acc_norm_stderr": 0.03742461193887248 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.3235294117647059, + "acc_stderr": 0.04655010411319616, + "acc_norm": 0.3235294117647059, + "acc_norm_stderr": 0.04655010411319616 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.69, + "acc_stderr": 0.04648231987117316, + "acc_norm": 0.69, + "acc_norm_stderr": 0.04648231987117316 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.49361702127659574, + "acc_stderr": 0.03268335899936336, + "acc_norm": 0.49361702127659574, + "acc_norm_stderr": 0.03268335899936336 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.35964912280701755, + "acc_stderr": 0.04514496132873634, + "acc_norm": 0.35964912280701755, + "acc_norm_stderr": 0.04514496132873634 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.5724137931034483, + "acc_stderr": 0.04122737111370333, + "acc_norm": 0.5724137931034483, + "acc_norm_stderr": 0.04122737111370333 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.3412698412698413, + "acc_stderr": 0.024419234966819067, + "acc_norm": 0.3412698412698413, + "acc_norm_stderr": 0.024419234966819067 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.35714285714285715, + "acc_stderr": 0.04285714285714281, + "acc_norm": 0.35714285714285715, + "acc_norm_stderr": 0.04285714285714281 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.39, + "acc_stderr": 0.04902071300001975, + "acc_norm": 0.39, + "acc_norm_stderr": 0.04902071300001975 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.6709677419354839, + "acc_stderr": 0.026729499068349958, + "acc_norm": 0.6709677419354839, + "acc_norm_stderr": 0.026729499068349958 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.4827586206896552, + "acc_stderr": 0.035158955511656986, + "acc_norm": 0.4827586206896552, + "acc_norm_stderr": 0.035158955511656986 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.62, + "acc_stderr": 0.04878317312145632, + "acc_norm": 0.62, + "acc_norm_stderr": 0.04878317312145632 + }, + 
"harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.7212121212121212, + "acc_stderr": 0.035014387062967806, + "acc_norm": 0.7212121212121212, + "acc_norm_stderr": 0.035014387062967806 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.7777777777777778, + "acc_stderr": 0.029620227874790482, + "acc_norm": 0.7777777777777778, + "acc_norm_stderr": 0.029620227874790482 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.8808290155440415, + "acc_stderr": 0.02338193534812143, + "acc_norm": 0.8808290155440415, + "acc_norm_stderr": 0.02338193534812143 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.6153846153846154, + "acc_stderr": 0.024666744915187222, + "acc_norm": 0.6153846153846154, + "acc_norm_stderr": 0.024666744915187222 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.32222222222222224, + "acc_stderr": 0.028493465091028597, + "acc_norm": 0.32222222222222224, + "acc_norm_stderr": 0.028493465091028597 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.5966386554621849, + "acc_stderr": 0.031866081214088314, + "acc_norm": 0.5966386554621849, + "acc_norm_stderr": 0.031866081214088314 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.32450331125827814, + "acc_stderr": 0.038227469376587525, + "acc_norm": 0.32450331125827814, + "acc_norm_stderr": 0.038227469376587525 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.7889908256880734, + "acc_stderr": 0.017493922404112648, + "acc_norm": 0.7889908256880734, + "acc_norm_stderr": 0.017493922404112648 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.4305555555555556, + "acc_stderr": 0.03376922151252336, + "acc_norm": 0.4305555555555556, + "acc_norm_stderr": 0.03376922151252336 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.8333333333333334, + "acc_stderr": 0.02615686752393104, + "acc_norm": 0.8333333333333334, + "acc_norm_stderr": 0.02615686752393104 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.7848101265822784, + "acc_stderr": 0.02675082699467617, + "acc_norm": 0.7848101265822784, + "acc_norm_stderr": 0.02675082699467617 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.6860986547085202, + "acc_stderr": 0.03114679648297246, + "acc_norm": 0.6860986547085202, + "acc_norm_stderr": 0.03114679648297246 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.6870229007633588, + "acc_stderr": 0.04066962905677698, + "acc_norm": 0.6870229007633588, + "acc_norm_stderr": 0.04066962905677698 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.7272727272727273, + "acc_stderr": 0.04065578140908706, + "acc_norm": 0.7272727272727273, + "acc_norm_stderr": 0.04065578140908706 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.7685185185185185, + "acc_stderr": 0.04077494709252627, + "acc_norm": 0.7685185185185185, + "acc_norm_stderr": 0.04077494709252627 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.6932515337423313, + "acc_stderr": 0.03623089915724146, + "acc_norm": 0.6932515337423313, + "acc_norm_stderr": 0.03623089915724146 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.39285714285714285, + "acc_stderr": 0.04635550135609976, + "acc_norm": 0.39285714285714285, + "acc_norm_stderr": 0.04635550135609976 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.7572815533980582, + "acc_stderr": 0.04245022486384495, + "acc_norm": 0.7572815533980582, + 
"acc_norm_stderr": 0.04245022486384495 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.8504273504273504, + "acc_stderr": 0.02336505149175372, + "acc_norm": 0.8504273504273504, + "acc_norm_stderr": 0.02336505149175372 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.6, + "acc_stderr": 0.04923659639173309, + "acc_norm": 0.6, + "acc_norm_stderr": 0.04923659639173309 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.7956577266922095, + "acc_stderr": 0.0144191239809319, + "acc_norm": 0.7956577266922095, + "acc_norm_stderr": 0.0144191239809319 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.6560693641618497, + "acc_stderr": 0.02557412378654667, + "acc_norm": 0.6560693641618497, + "acc_norm_stderr": 0.02557412378654667 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.48268156424581005, + "acc_stderr": 0.01671246744170252, + "acc_norm": 0.48268156424581005, + "acc_norm_stderr": 0.01671246744170252 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.6699346405228758, + "acc_stderr": 0.026925654653615693, + "acc_norm": 0.6699346405228758, + "acc_norm_stderr": 0.026925654653615693 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.684887459807074, + "acc_stderr": 0.026385273703464492, + "acc_norm": 0.684887459807074, + "acc_norm_stderr": 0.026385273703464492 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.7191358024691358, + "acc_stderr": 0.025006469755799208, + "acc_norm": 0.7191358024691358, + "acc_norm_stderr": 0.025006469755799208 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.475177304964539, + "acc_stderr": 0.02979071924382972, + "acc_norm": 0.475177304964539, + "acc_norm_stderr": 0.02979071924382972 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.45827900912646674, + "acc_stderr": 0.012725701656953642, + "acc_norm": 0.45827900912646674, + "acc_norm_stderr": 0.012725701656953642 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.6102941176470589, + "acc_stderr": 0.0296246635811597, + "acc_norm": 0.6102941176470589, + "acc_norm_stderr": 0.0296246635811597 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.5915032679738562, + "acc_stderr": 0.01988622103750187, + "acc_norm": 0.5915032679738562, + "acc_norm_stderr": 0.01988622103750187 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.6636363636363637, + "acc_stderr": 0.04525393596302505, + "acc_norm": 0.6636363636363637, + "acc_norm_stderr": 0.04525393596302505 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.673469387755102, + "acc_stderr": 0.03002105623844031, + "acc_norm": 0.673469387755102, + "acc_norm_stderr": 0.03002105623844031 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.7562189054726368, + "acc_stderr": 0.030360490154014645, + "acc_norm": 0.7562189054726368, + "acc_norm_stderr": 0.030360490154014645 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.83, + "acc_stderr": 0.0377525168068637, + "acc_norm": 0.83, + "acc_norm_stderr": 0.0377525168068637 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.4939759036144578, + "acc_stderr": 0.03892212195333045, + "acc_norm": 0.4939759036144578, + "acc_norm_stderr": 0.03892212195333045 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.8011695906432749, + "acc_stderr": 0.030611116557432528, + "acc_norm": 0.8011695906432749, + "acc_norm_stderr": 0.030611116557432528 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.40269277845777235, + "mc1_stderr": 0.017168830935187215, + 
"mc2": 0.559561930249219, + "mc2_stderr": 0.015693079433704838 + }, + "harness|winogrande|5": { + "acc": 0.7647987371744278, + "acc_stderr": 0.011920008163650865 + }, + "harness|drop|3": { + "em": 0.01950503355704698, + "em_stderr": 0.0014162361849700607, + "f1": 0.12218750000000013, + "f1_stderr": 0.002284380268622334 + }, + "harness|gsm8k|5": { + "acc": 0.08794541319181198, + "acc_stderr": 0.007801162197487721 + }, + "all": { + "acc": 0.5932217761298214, + "acc_stderr": 0.03305656216343214, + "acc_norm": 0.6027951864354921, + "acc_norm_stderr": 0.03382034227909779, + "mc1": 0.40269277845777235, + "mc1_stderr": 0.017168830935187215, + "mc2": 0.559561930249219, + "mc2_stderr": 0.015693079433704838, + "em": 0.01950503355704698, + "em_stderr": 0.0014162361849700607, + "f1": 0.12218750000000013, + "f1_stderr": 0.002284380268622334 + } + }, + "versions": { + "all": 0, + "harness|arc:challenge|25": 0, + "harness|drop|3": 1, + "harness|gsm8k|5": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + 
"harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "harness|winogrande|5": 0 + }, + "config_tasks": { + "harness|arc:challenge": "LM Harness task", + "harness|drop": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + 
"harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|arc:challenge|25": { + "hashes": { + "hash_examples": "17b0cae357c0259e", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "ca48d52265c0051f", + "hash_cont_tokens": "e8abf848493b50f7" + }, + "truncated": 0, + "non_truncated": 1172, + "padded": 4687, + "non_padded": 0, + "effective_few_shots": 25.0, + "num_truncated_few_shots": 0 + }, + "harness|hellaswag|10": { + "hashes": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "4975ded0ed31f702", + "hash_cont_tokens": "9fe0a5c42e1532db" + }, + "truncated": 0, + "non_truncated": 10042, + "padded": 40019, + "non_padded": 149, + "effective_few_shots": 10.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hashes": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "8ff523ec326d5d55", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-anatomy|5": { + "hashes": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "742bd6a389a8ef40", + "hash_cont_tokens": "f11971a765cb609f" + }, + "truncated": 0, + "non_truncated": 135, + "padded": 540, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-astronomy|5": { + "hashes": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "aa9743839c83bd9f", + "hash_cont_tokens": "440a970fadecdc7b" + }, + "truncated": 0, + "non_truncated": 152, + "padded": 608, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-business_ethics|5": { + "hashes": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "60f6ed52e2a2987a", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hashes": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "6080d9f3c5930be0", + "hash_cont_tokens": "7ecd60c25b9bfe5b" + }, + "truncated": 0, + "non_truncated": 265, + "padded": 1060, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_biology|5": { + "hashes": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + 
"hash_input_tokens": "873319724ad65589", + "hash_cont_tokens": "875cde3af7a0ee14" + }, + "truncated": 0, + "non_truncated": 144, + "padded": 564, + "non_padded": 12, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_chemistry|5": { + "hashes": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "8366d04d12b154a7", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_computer_science|5": { + "hashes": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "1724a282fb269fd7", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_mathematics|5": { + "hashes": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "b7aa815781eae172", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_medicine|5": { + "hashes": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "0003d13e86bc8c1a", + "hash_cont_tokens": "702fb6d82ff0d6ac" + }, + "truncated": 0, + "non_truncated": 173, + "padded": 692, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_physics|5": { + "hashes": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "32b28762dd077c78", + "hash_cont_tokens": "f7b8097afc16a47c" + }, + "truncated": 0, + "non_truncated": 102, + "padded": 404, + "non_padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-computer_security|5": { + "hashes": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "19dd0e1895125d49", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hashes": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "761c7ce187b3338a", + "hash_cont_tokens": "aa0e8bc655f2f641" + }, + "truncated": 0, + "non_truncated": 235, + "padded": 940, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-econometrics|5": { + "hashes": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "dae74024ebc12b2b", + "hash_cont_tokens": "b1cc6e7e9fcd3827" + }, + "truncated": 0, + "non_truncated": 114, + "padded": 456, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hashes": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "5fa8050688a246ed", + "hash_cont_tokens": "2425a3f084a591ef" + }, + "truncated": 0, + "non_truncated": 145, + "padded": 
580, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hashes": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "2da3f8d7d1515cc6", + "hash_cont_tokens": "bd87bf0c060fd925" + }, + "truncated": 0, + "non_truncated": 378, + "padded": 1512, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-formal_logic|5": { + "hashes": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "907de61bbe46dada", + "hash_cont_tokens": "eb8932890e0605db" + }, + "truncated": 0, + "non_truncated": 126, + "padded": 504, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-global_facts|5": { + "hashes": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "d7549fe9ac133643", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_biology|5": { + "hashes": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "b449ae8cd622fb96", + "hash_cont_tokens": "1ddcb86d28cde266" + }, + "truncated": 0, + "non_truncated": 310, + "padded": 1240, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hashes": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "a447bd1574b5e26c", + "hash_cont_tokens": "176c8dcff38c5f8f" + }, + "truncated": 0, + "non_truncated": 203, + "padded": 812, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hashes": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "56312a0c3d85ae90", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hashes": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "5002f4ac8b1562ca", + "hash_cont_tokens": "674fc454bdc5ac93" + }, + "truncated": 0, + "non_truncated": 165, + "padded": 656, + "non_padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_geography|5": { + "hashes": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "b4f9efd054b0149d", + "hash_cont_tokens": "03a5012b916274ea" + }, + "truncated": 0, + "non_truncated": 198, + "padded": 792, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hashes": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "6e010d01707b5a01", + "hash_cont_tokens": "873d2aab226ba1d8" + }, + "truncated": 0, + "non_truncated": 193, + "padded": 772, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + 
"harness|hendrycksTest-high_school_macroeconomics|5": { + "hashes": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "fc1f6e824ba386d7", + "hash_cont_tokens": "c583432ad27fcfe0" + }, + "truncated": 0, + "non_truncated": 390, + "padded": 1560, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hashes": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "3a485a40c8432ece", + "hash_cont_tokens": "d7907b61bcb8c123" + }, + "truncated": 0, + "non_truncated": 270, + "padded": 1080, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hashes": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "a7dd9ca4bbda3752", + "hash_cont_tokens": "f47f041de50333b9" + }, + "truncated": 0, + "non_truncated": 238, + "padded": 952, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_physics|5": { + "hashes": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "d7ea631399a73865", + "hash_cont_tokens": "0d56317b3e5eedb5" + }, + "truncated": 0, + "non_truncated": 151, + "padded": 604, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hashes": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "d12816cf88146011", + "hash_cont_tokens": "09ba1243e7390c0f" + }, + "truncated": 0, + "non_truncated": 545, + "padded": 2180, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hashes": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "9763ecaef4814c21", + "hash_cont_tokens": "9cc29889c3d3f77d" + }, + "truncated": 0, + "non_truncated": 216, + "padded": 864, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hashes": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "c639cce12a46ebad", + "hash_cont_tokens": "cdd0b3dc06d933e5" + }, + "truncated": 0, + "non_truncated": 204, + "padded": 816, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hashes": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "b9762065cce6f3a6", + "hash_cont_tokens": "e02816433ff28daf" + }, + "truncated": 0, + "non_truncated": 237, + "padded": 948, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_aging|5": { + "hashes": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "84157fee0b6d0f3c", + "hash_cont_tokens": "142a4a8a1138a214" + }, + "truncated": 0, + "non_truncated": 223, + "padded": 892, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_sexuality|5": { + "hashes": { + "hash_examples": "7acb8fdad97f88a6", + 
"hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "ade303e1ae3c016f", + "hash_cont_tokens": "bc54813e809b796d" + }, + "truncated": 0, + "non_truncated": 131, + "padded": 524, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-international_law|5": { + "hashes": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "e5482e1c23c23d35", + "hash_cont_tokens": "8ea8c5ff76a15bca" + }, + "truncated": 0, + "non_truncated": 121, + "padded": 484, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-jurisprudence|5": { + "hashes": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "4415eeb9bad0507b", + "hash_cont_tokens": "e3a8cd951b6e3469" + }, + "truncated": 0, + "non_truncated": 108, + "padded": 432, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hashes": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "e6b5271422ecbaa8", + "hash_cont_tokens": "3e9e0bdc248fd88a" + }, + "truncated": 0, + "non_truncated": 163, + "padded": 644, + "non_padded": 8, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-machine_learning|5": { + "hashes": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "e719cb83196977d8", + "hash_cont_tokens": "55b12fb138c6a064" + }, + "truncated": 0, + "non_truncated": 112, + "padded": 448, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-management|5": { + "hashes": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "155da0e62b39e804", + "hash_cont_tokens": "a01d6d39a83c4597" + }, + "truncated": 0, + "non_truncated": 103, + "padded": 412, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-marketing|5": { + "hashes": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "38466c242259e6d3", + "hash_cont_tokens": "6aeaed4d823c98aa" + }, + "truncated": 0, + "non_truncated": 234, + "padded": 932, + "non_padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-medical_genetics|5": { + "hashes": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "0dd129e92538a7f6", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-miscellaneous|5": { + "hashes": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "d108a883fc3e022f", + "hash_cont_tokens": "9b0ab02a64603081" + }, + "truncated": 0, + "non_truncated": 783, + "padded": 3132, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_disputes|5": { + "hashes": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "0e7b7df82884a2d5", + "hash_cont_tokens": "3b8bbe9108e55ce9" + }, + "truncated": 0, + "non_truncated": 346, + 
"padded": 1364, + "non_padded": 20, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hashes": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "7c220f5613cd8426", + "hash_cont_tokens": "3e9bfc0362e97330" + }, + "truncated": 0, + "non_truncated": 895, + "padded": 3580, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-nutrition|5": { + "hashes": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "35de1609a9a763a9", + "hash_cont_tokens": "23b2dc6ee2da4cfc" + }, + "truncated": 0, + "non_truncated": 306, + "padded": 1224, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-philosophy|5": { + "hashes": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "a1dcfa9c80490d06", + "hash_cont_tokens": "9f6ff69d23a48783" + }, + "truncated": 0, + "non_truncated": 311, + "padded": 1244, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-prehistory|5": { + "hashes": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "a091cf645d2415e0", + "hash_cont_tokens": "d6458d743d875837" + }, + "truncated": 0, + "non_truncated": 324, + "padded": 1296, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_accounting|5": { + "hashes": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "e9df32a33f85290c", + "hash_cont_tokens": "922a195f53a35662" + }, + "truncated": 0, + "non_truncated": 282, + "padded": 1128, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_law|5": { + "hashes": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "c9f7583fff66d361", + "hash_cont_tokens": "2e590029ef41fbcd" + }, + "truncated": 0, + "non_truncated": 1534, + "padded": 6136, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_medicine|5": { + "hashes": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "40a933f829116f8d", + "hash_cont_tokens": "7cfee54dbddd5a98" + }, + "truncated": 0, + "non_truncated": 272, + "padded": 1088, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_psychology|5": { + "hashes": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "0f6a92c3a2062b48", + "hash_cont_tokens": "a86677b2a45c20e1" + }, + "truncated": 0, + "non_truncated": 612, + "padded": 2448, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-public_relations|5": { + "hashes": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "29a08e9bfbe9b2f0", + "hash_cont_tokens": "0d756ccaae031757" + }, + "truncated": 0, + "non_truncated": 110, + "padded": 440, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-security_studies|5": { + 
"hashes": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "32a03f1f22a6e103", + "hash_cont_tokens": "b2229bc2cfbf594b" + }, + "truncated": 0, + "non_truncated": 245, + "padded": 980, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-sociology|5": { + "hashes": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "1de5c52d2b2831d7", + "hash_cont_tokens": "c3a3bdfd177eed5b" + }, + "truncated": 0, + "non_truncated": 201, + "padded": 800, + "non_padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hashes": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "add924961f7f4146", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-virology|5": { + "hashes": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "e0653601c466b1bc", + "hash_cont_tokens": "af8b3658088cb37f" + }, + "truncated": 0, + "non_truncated": 166, + "padded": 664, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-world_religions|5": { + "hashes": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "ac600d612445156d", + "hash_cont_tokens": "060118bef6de4e0a" + }, + "truncated": 0, + "non_truncated": 171, + "padded": 684, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|truthfulqa:mc|0": { + "hashes": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "a03ce28b7fd06aa7", + "hash_cont_tokens": "f5da56a132aab151" + }, + "truncated": 0, + "non_truncated": 817, + "padded": 9996, + "non_padded": 0, + "effective_few_shots": 0.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "72067255e368e24e", + "hash_cont_tokens": "f08975ad6f2d5864" + }, + "truncated": 0, + "non_truncated": 1267, + "padded": 2534, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|drop|3": { + "hashes": { + "hash_examples": "1d27416e8324e9a3", + "hash_full_prompts": "a5513ff9a741b385", + "hash_input_tokens": "42076f0efbb50aa6", + "hash_cont_tokens": "6df49c1ce7cc6369" + }, + "truncated": 3, + "non_truncated": 9533, + "padded": 0, + "non_padded": 9536, + "effective_few_shots": 3.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "bda342e47b5099b2", + "hash_cont_tokens": "91a223356f11be5d" + }, + "truncated": 0, + "non_truncated": 1319, + "padded": 0, + "non_padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "4eb459f19fc0f29d", + "hash_full_prompts": "21653ed56f202b4e", + "hash_input_tokens": "379266f3a5365f9d", + "hash_cont_tokens": "31748112f1ee37e0" + }, + "truncated": 3, + "non_truncated": 38192, + "padded": 113348, + "non_padded": 11060, + 
"num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/sequelbox/SharpBalance/results_2023-10-09T05-49-47.525988.json b/eval-results/sequelbox/SharpBalance/results_2023-10-09T05-49-47.525988.json new file mode 100644 index 0000000000000000000000000000000000000000..9ae1df8f8ec95ed87f159e14d6b13968ff77a2c1 --- /dev/null +++ b/eval-results/sequelbox/SharpBalance/results_2023-10-09T05-49-47.525988.json @@ -0,0 +1,1367 @@ +{ + "config_general": { + "model_name": "sequelbox/SharpBalance", + "model_sha": "a87cb1756d7b7389cc5a6d4647cf53377e962aea", + "model_size": "128.64 GB", + "model_dtype": "torch.float16", + "lighteval_sha": "0f318ecf002208468154899217b3ba7c6ae09374", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|arc:challenge|25": { + "acc": 0.6527303754266212, + "acc_stderr": 0.013913034529620446, + "acc_norm": 0.6928327645051194, + "acc_norm_stderr": 0.013481034054980941 + }, + "harness|hellaswag|10": { + "acc": 0.6878111929894444, + "acc_stderr": 0.004624393690966902, + "acc_norm": 0.8759211312487553, + "acc_norm_stderr": 0.0032899775233939097 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.34, + "acc_stderr": 0.04760952285695235, + "acc_norm": 0.34, + "acc_norm_stderr": 0.04760952285695235 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.6444444444444445, + "acc_stderr": 0.04135176749720385, + "acc_norm": 0.6444444444444445, + "acc_norm_stderr": 0.04135176749720385 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.8092105263157895, + "acc_stderr": 0.031975658210325, + "acc_norm": 0.8092105263157895, + "acc_norm_stderr": 0.031975658210325 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.75, + "acc_stderr": 0.04351941398892446, + "acc_norm": 0.75, + "acc_norm_stderr": 0.04351941398892446 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.720754716981132, + "acc_stderr": 0.027611163402399715, + "acc_norm": 0.720754716981132, + "acc_norm_stderr": 0.027611163402399715 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.8055555555555556, + "acc_stderr": 0.03309615177059006, + "acc_norm": 0.8055555555555556, + "acc_norm_stderr": 0.03309615177059006 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.49, + "acc_stderr": 0.05024183937956912, + "acc_norm": 0.49, + "acc_norm_stderr": 0.05024183937956912 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.55, + "acc_stderr": 0.049999999999999996, + "acc_norm": 0.55, + "acc_norm_stderr": 0.049999999999999996 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.41, + "acc_stderr": 0.049431107042371025, + "acc_norm": 0.41, + "acc_norm_stderr": 0.049431107042371025 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.6647398843930635, + "acc_stderr": 0.035995863012470763, + "acc_norm": 0.6647398843930635, + "acc_norm_stderr": 0.035995863012470763 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.39215686274509803, + "acc_stderr": 0.04858083574266345, + "acc_norm": 0.39215686274509803, + "acc_norm_stderr": 0.04858083574266345 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.77, + "acc_stderr": 0.04229525846816506, + "acc_norm": 0.77, + "acc_norm_stderr": 0.04229525846816506 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.6723404255319149, + "acc_stderr": 0.030683020843231004, + "acc_norm": 0.6723404255319149, + "acc_norm_stderr": 
0.030683020843231004 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.41228070175438597, + "acc_stderr": 0.04630653203366595, + "acc_norm": 0.41228070175438597, + "acc_norm_stderr": 0.04630653203366595 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.6, + "acc_stderr": 0.040824829046386284, + "acc_norm": 0.6, + "acc_norm_stderr": 0.040824829046386284 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.43915343915343913, + "acc_stderr": 0.02555992055053101, + "acc_norm": 0.43915343915343913, + "acc_norm_stderr": 0.02555992055053101 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.5238095238095238, + "acc_stderr": 0.04467062628403273, + "acc_norm": 0.5238095238095238, + "acc_norm_stderr": 0.04467062628403273 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.48, + "acc_stderr": 0.050211673156867795, + "acc_norm": 0.48, + "acc_norm_stderr": 0.050211673156867795 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.8161290322580645, + "acc_stderr": 0.022037217340267826, + "acc_norm": 0.8161290322580645, + "acc_norm_stderr": 0.022037217340267826 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.5369458128078818, + "acc_stderr": 0.035083705204426656, + "acc_norm": 0.5369458128078818, + "acc_norm_stderr": 0.035083705204426656 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.75, + "acc_stderr": 0.04351941398892446, + "acc_norm": 0.75, + "acc_norm_stderr": 0.04351941398892446 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.8303030303030303, + "acc_stderr": 0.029311188674983134, + "acc_norm": 0.8303030303030303, + "acc_norm_stderr": 0.029311188674983134 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.8838383838383839, + "acc_stderr": 0.02282888177524938, + "acc_norm": 0.8838383838383839, + "acc_norm_stderr": 0.02282888177524938 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.927461139896373, + "acc_stderr": 0.018718998520678178, + "acc_norm": 0.927461139896373, + "acc_norm_stderr": 0.018718998520678178 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.7076923076923077, + "acc_stderr": 0.02306043838085774, + "acc_norm": 0.7076923076923077, + "acc_norm_stderr": 0.02306043838085774 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.2962962962962963, + "acc_stderr": 0.027840811495871934, + "acc_norm": 0.2962962962962963, + "acc_norm_stderr": 0.027840811495871934 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.7521008403361344, + "acc_stderr": 0.028047967224176892, + "acc_norm": 0.7521008403361344, + "acc_norm_stderr": 0.028047967224176892 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.4768211920529801, + "acc_stderr": 0.04078093859163083, + "acc_norm": 0.4768211920529801, + "acc_norm_stderr": 0.04078093859163083 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.8880733944954129, + "acc_stderr": 0.013517352714958792, + "acc_norm": 0.8880733944954129, + "acc_norm_stderr": 0.013517352714958792 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.5462962962962963, + "acc_stderr": 0.033953227263757976, + "acc_norm": 0.5462962962962963, + "acc_norm_stderr": 0.033953227263757976 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.9166666666666666, + "acc_stderr": 0.019398452135813905, + "acc_norm": 0.9166666666666666, + "acc_norm_stderr": 0.019398452135813905 
+ }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.8776371308016878, + "acc_stderr": 0.021331741829746786, + "acc_norm": 0.8776371308016878, + "acc_norm_stderr": 0.021331741829746786 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.7982062780269058, + "acc_stderr": 0.02693611191280227, + "acc_norm": 0.7982062780269058, + "acc_norm_stderr": 0.02693611191280227 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.8473282442748091, + "acc_stderr": 0.031545216720054725, + "acc_norm": 0.8473282442748091, + "acc_norm_stderr": 0.031545216720054725 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.8760330578512396, + "acc_stderr": 0.03008309871603521, + "acc_norm": 0.8760330578512396, + "acc_norm_stderr": 0.03008309871603521 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.8148148148148148, + "acc_stderr": 0.03755265865037181, + "acc_norm": 0.8148148148148148, + "acc_norm_stderr": 0.03755265865037181 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.8098159509202454, + "acc_stderr": 0.030833491146281235, + "acc_norm": 0.8098159509202454, + "acc_norm_stderr": 0.030833491146281235 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.5, + "acc_stderr": 0.04745789978762494, + "acc_norm": 0.5, + "acc_norm_stderr": 0.04745789978762494 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.8252427184466019, + "acc_stderr": 0.0376017800602662, + "acc_norm": 0.8252427184466019, + "acc_norm_stderr": 0.0376017800602662 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.9188034188034188, + "acc_stderr": 0.017893784904018533, + "acc_norm": 0.9188034188034188, + "acc_norm_stderr": 0.017893784904018533 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.72, + "acc_stderr": 0.04512608598542127, + "acc_norm": 0.72, + "acc_norm_stderr": 0.04512608598542127 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.8735632183908046, + "acc_stderr": 0.01188448890589555, + "acc_norm": 0.8735632183908046, + "acc_norm_stderr": 0.01188448890589555 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.7976878612716763, + "acc_stderr": 0.02162807738019612, + "acc_norm": 0.7976878612716763, + "acc_norm_stderr": 0.02162807738019612 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.529608938547486, + "acc_stderr": 0.01669315492738355, + "acc_norm": 0.529608938547486, + "acc_norm_stderr": 0.01669315492738355 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.7352941176470589, + "acc_stderr": 0.02526169121972949, + "acc_norm": 0.7352941176470589, + "acc_norm_stderr": 0.02526169121972949 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.7909967845659164, + "acc_stderr": 0.02309314039837422, + "acc_norm": 0.7909967845659164, + "acc_norm_stderr": 0.02309314039837422 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.8148148148148148, + "acc_stderr": 0.0216138093952248, + "acc_norm": 0.8148148148148148, + "acc_norm_stderr": 0.0216138093952248 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.5425531914893617, + "acc_stderr": 0.029719281272236834, + "acc_norm": 0.5425531914893617, + "acc_norm_stderr": 0.029719281272236834 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.546284224250326, + "acc_stderr": 0.012715404841277752, + "acc_norm": 0.546284224250326, + "acc_norm_stderr": 0.012715404841277752 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.7169117647058824, + "acc_stderr": 0.02736586113151381, + "acc_norm": 
0.7169117647058824, + "acc_norm_stderr": 0.02736586113151381 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.75, + "acc_stderr": 0.01751781884501444, + "acc_norm": 0.75, + "acc_norm_stderr": 0.01751781884501444 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.7272727272727273, + "acc_stderr": 0.04265792110940589, + "acc_norm": 0.7272727272727273, + "acc_norm_stderr": 0.04265792110940589 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.8, + "acc_stderr": 0.02560737598657916, + "acc_norm": 0.8, + "acc_norm_stderr": 0.02560737598657916 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.8756218905472637, + "acc_stderr": 0.023335401790166327, + "acc_norm": 0.8756218905472637, + "acc_norm_stderr": 0.023335401790166327 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.92, + "acc_stderr": 0.0272659924344291, + "acc_norm": 0.92, + "acc_norm_stderr": 0.0272659924344291 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.5481927710843374, + "acc_stderr": 0.03874371556587953, + "acc_norm": 0.5481927710843374, + "acc_norm_stderr": 0.03874371556587953 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.8596491228070176, + "acc_stderr": 0.0266405825391332, + "acc_norm": 0.8596491228070176, + "acc_norm_stderr": 0.0266405825391332 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.4259485924112607, + "mc1_stderr": 0.01731047190407654, + "mc2": 0.5904880959366052, + "mc2_stderr": 0.014903036806895207 + }, + "all": { + "acc": 0.6942363573453332, + "acc_stderr": 0.031096725292772836, + "acc_norm": 0.6981043628934316, + "acc_norm_stderr": 0.03106678602765059, + "mc1": 0.4259485924112607, + "mc1_stderr": 0.01731047190407654, + "mc2": 0.5904880959366052, + "mc2_stderr": 0.014903036806895207 + } + }, + "versions": { + "harness|arc:challenge|25": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + 
"harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "all": 0 + }, + "config_tasks": { + "harness|arc:challenge": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + 
"harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task" + }, + "summary_tasks": { + "harness|arc:challenge|25": { + "hashes": { + "hash_examples": "17b0cae357c0259e", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "3722289b79076c44", + "hash_cont_tokens": "e8abf848493b50f7" + }, + "truncated": 0, + "non-truncated": 4687, + "padded": 4687, + "non-padded": 0, + "effective_few_shots": 25.0, + "num_truncated_few_shots": 0 + }, + "harness|hellaswag|10": { + "hashes": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "ececd684171f1ef2", + "hash_cont_tokens": "9fe0a5c42e1532db" + }, + "truncated": 0, + "non-truncated": 40168, + "padded": 40113, + "non-padded": 55, + "effective_few_shots": 10.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hashes": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "c54ff61ad0273dd7", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-anatomy|5": { + "hashes": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "be31a1e22aef5f90", + "hash_cont_tokens": "f11971a765cb609f" + }, + "truncated": 0, + "non-truncated": 540, + "padded": 540, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-astronomy|5": { + "hashes": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "277a7b1fad566940", + "hash_cont_tokens": "440a970fadecdc7b" + }, + "truncated": 0, + "non-truncated": 608, + "padded": 608, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-business_ethics|5": { + "hashes": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": 
"ba552605bc116de5", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hashes": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "428c7563d0b98ab9", + "hash_cont_tokens": "7ecd60c25b9bfe5b" + }, + "truncated": 0, + "non-truncated": 1060, + "padded": 1060, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_biology|5": { + "hashes": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "da036601573942e2", + "hash_cont_tokens": "875cde3af7a0ee14" + }, + "truncated": 0, + "non-truncated": 576, + "padded": 576, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_chemistry|5": { + "hashes": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "94e0196d6aded13d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_computer_science|5": { + "hashes": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "6e4d0f4a8d36690b", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_mathematics|5": { + "hashes": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "614054d17109a25d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_medicine|5": { + "hashes": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "081bb2b524defd1c", + "hash_cont_tokens": "702fb6d82ff0d6ac" + }, + "truncated": 0, + "non-truncated": 692, + "padded": 692, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_physics|5": { + "hashes": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "5421d9a1af86cbd4", + "hash_cont_tokens": "f7b8097afc16a47c" + }, + "truncated": 0, + "non-truncated": 408, + "padded": 408, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-computer_security|5": { + "hashes": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "5e6b70ecb333cf18", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hashes": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "c2ef11a87264ceed", + "hash_cont_tokens": "aa0e8bc655f2f641" + }, + "truncated": 0, + "non-truncated": 940, + "padded": 940, + "non-padded": 0, 
+ "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-econometrics|5": { + "hashes": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "ecaccd912a4c3978", + "hash_cont_tokens": "b1cc6e7e9fcd3827" + }, + "truncated": 0, + "non-truncated": 456, + "padded": 456, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hashes": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "1590c84291399be8", + "hash_cont_tokens": "2425a3f084a591ef" + }, + "truncated": 0, + "non-truncated": 580, + "padded": 580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hashes": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "3269597f715b0da1", + "hash_cont_tokens": "bd87bf0c060fd925" + }, + "truncated": 0, + "non-truncated": 1512, + "padded": 1512, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-formal_logic|5": { + "hashes": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "a2800d20f3ab8d7c", + "hash_cont_tokens": "eb8932890e0605db" + }, + "truncated": 0, + "non-truncated": 504, + "padded": 504, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-global_facts|5": { + "hashes": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "94ed44b3772505ad", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_biology|5": { + "hashes": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "24423acb928db768", + "hash_cont_tokens": "1ddcb86d28cde266" + }, + "truncated": 0, + "non-truncated": 1240, + "padded": 1240, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hashes": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "831ff35c474e5cef", + "hash_cont_tokens": "176c8dcff38c5f8f" + }, + "truncated": 0, + "non-truncated": 812, + "padded": 812, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hashes": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "a20a96b44dcc5b30", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hashes": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "5002f4ac8b1562ca", + "hash_cont_tokens": "674fc454bdc5ac93" + }, + "truncated": 0, + "non-truncated": 660, + "padded": 656, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_geography|5": { + 
"hashes": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "7c5547c7da5bc793", + "hash_cont_tokens": "03a5012b916274ea" + }, + "truncated": 0, + "non-truncated": 792, + "padded": 792, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hashes": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "f62991cb6a496b05", + "hash_cont_tokens": "873d2aab226ba1d8" + }, + "truncated": 0, + "non-truncated": 772, + "padded": 772, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hashes": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "4cef2aff6e3d59ed", + "hash_cont_tokens": "c583432ad27fcfe0" + }, + "truncated": 0, + "non-truncated": 1560, + "padded": 1560, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hashes": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "6e2577ea4082ed2b", + "hash_cont_tokens": "d7907b61bcb8c123" + }, + "truncated": 0, + "non-truncated": 1080, + "padded": 1080, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hashes": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "c5fc9aeb1079c8e4", + "hash_cont_tokens": "f47f041de50333b9" + }, + "truncated": 0, + "non-truncated": 952, + "padded": 952, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_physics|5": { + "hashes": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "555fc385cffa84ca", + "hash_cont_tokens": "0d56317b3e5eedb5" + }, + "truncated": 0, + "non-truncated": 604, + "padded": 604, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hashes": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "febd23cbf9973b7f", + "hash_cont_tokens": "09ba1243e7390c0f" + }, + "truncated": 0, + "non-truncated": 2180, + "padded": 2180, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hashes": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "400e55b56ee6fbd7", + "hash_cont_tokens": "9cc29889c3d3f77d" + }, + "truncated": 0, + "non-truncated": 864, + "padded": 864, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hashes": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "c639cce12a46ebad", + "hash_cont_tokens": "cdd0b3dc06d933e5" + }, + "truncated": 0, + "non-truncated": 816, + "padded": 816, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hashes": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": 
"11845057459afd72", + "hash_input_tokens": "b9762065cce6f3a6", + "hash_cont_tokens": "e02816433ff28daf" + }, + "truncated": 0, + "non-truncated": 948, + "padded": 948, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_aging|5": { + "hashes": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "541a75f071dcf579", + "hash_cont_tokens": "142a4a8a1138a214" + }, + "truncated": 0, + "non-truncated": 892, + "padded": 892, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_sexuality|5": { + "hashes": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "04269e5c5a257dd9", + "hash_cont_tokens": "bc54813e809b796d" + }, + "truncated": 0, + "non-truncated": 524, + "padded": 524, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-international_law|5": { + "hashes": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "d93ba9d9d38e4397", + "hash_cont_tokens": "8ea8c5ff76a15bca" + }, + "truncated": 0, + "non-truncated": 484, + "padded": 484, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-jurisprudence|5": { + "hashes": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "9eeaccd2698b4f5a", + "hash_cont_tokens": "e3a8cd951b6e3469" + }, + "truncated": 0, + "non-truncated": 432, + "padded": 432, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hashes": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "b4f08f544f2b7576", + "hash_cont_tokens": "3e9e0bdc248fd88a" + }, + "truncated": 0, + "non-truncated": 652, + "padded": 648, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-machine_learning|5": { + "hashes": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "900c2a51f1174b9f", + "hash_cont_tokens": "55b12fb138c6a064" + }, + "truncated": 0, + "non-truncated": 448, + "padded": 448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-management|5": { + "hashes": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "6b36efb4689c6eca", + "hash_cont_tokens": "a01d6d39a83c4597" + }, + "truncated": 0, + "non-truncated": 412, + "padded": 412, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-marketing|5": { + "hashes": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "2aaac78a0cfed47a", + "hash_cont_tokens": "6aeaed4d823c98aa" + }, + "truncated": 0, + "non-truncated": 936, + "padded": 936, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-medical_genetics|5": { + "hashes": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "886ca823b41c094a", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + 
"non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-miscellaneous|5": { + "hashes": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "72fd71de7675e7d0", + "hash_cont_tokens": "9b0ab02a64603081" + }, + "truncated": 0, + "non-truncated": 3132, + "padded": 3132, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_disputes|5": { + "hashes": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "f3ca0dd8e7a1eb09", + "hash_cont_tokens": "3b8bbe9108e55ce9" + }, + "truncated": 0, + "non-truncated": 1384, + "padded": 1354, + "non-padded": 30, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hashes": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "3e793631e951f23c", + "hash_cont_tokens": "3e9bfc0362e97330" + }, + "truncated": 0, + "non-truncated": 3580, + "padded": 3580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-nutrition|5": { + "hashes": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "59753c2144ea93af", + "hash_cont_tokens": "23b2dc6ee2da4cfc" + }, + "truncated": 0, + "non-truncated": 1224, + "padded": 1224, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-philosophy|5": { + "hashes": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "bd8d3dbed15a8c34", + "hash_cont_tokens": "9f6ff69d23a48783" + }, + "truncated": 0, + "non-truncated": 1244, + "padded": 1244, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-prehistory|5": { + "hashes": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "3573cd87facbb7c5", + "hash_cont_tokens": "d6458d743d875837" + }, + "truncated": 0, + "non-truncated": 1296, + "padded": 1296, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_accounting|5": { + "hashes": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "17e721bc1a7cbb47", + "hash_cont_tokens": "922a195f53a35662" + }, + "truncated": 0, + "non-truncated": 1128, + "padded": 1128, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_law|5": { + "hashes": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "c9f7583fff66d361", + "hash_cont_tokens": "2e590029ef41fbcd" + }, + "truncated": 0, + "non-truncated": 6136, + "padded": 6136, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_medicine|5": { + "hashes": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "40a933f829116f8d", + "hash_cont_tokens": "7cfee54dbddd5a98" + }, + "truncated": 0, + "non-truncated": 1088, + "padded": 1088, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_psychology|5": { + "hashes": { + 
"hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "0dfb73a8eb3f692c", + "hash_cont_tokens": "a86677b2a45c20e1" + }, + "truncated": 0, + "non-truncated": 2448, + "padded": 2448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-public_relations|5": { + "hashes": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "1710c6ba4c9f3cbd", + "hash_cont_tokens": "0d756ccaae031757" + }, + "truncated": 0, + "non-truncated": 440, + "padded": 440, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-security_studies|5": { + "hashes": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "32a03f1f22a6e103", + "hash_cont_tokens": "b2229bc2cfbf594b" + }, + "truncated": 0, + "non-truncated": 980, + "padded": 980, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-sociology|5": { + "hashes": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "828999f7624cbe7e", + "hash_cont_tokens": "c3a3bdfd177eed5b" + }, + "truncated": 0, + "non-truncated": 804, + "padded": 804, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hashes": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "42054621e718dbee", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-virology|5": { + "hashes": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "6c4f0aa4dc859c04", + "hash_cont_tokens": "af8b3658088cb37f" + }, + "truncated": 0, + "non-truncated": 664, + "padded": 664, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-world_religions|5": { + "hashes": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "6c75d44e092ff24f", + "hash_cont_tokens": "060118bef6de4e0a" + }, + "truncated": 0, + "non-truncated": 684, + "padded": 684, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|truthfulqa:mc|0": { + "hashes": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "2738d7ed7075faa7", + "hash_cont_tokens": "f5da56a132aab151" + }, + "truncated": 0, + "non-truncated": 9996, + "padded": 9996, + "non-padded": 0, + "effective_few_shots": 0.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "d84d18e9a963753d", + "hash_full_prompts": "12b540783521a8e6", + "hash_input_tokens": "5c73a7dce6ccf737", + "hash_cont_tokens": "71d56183130fecbd" + }, + "total_evaluation_time_secondes": "44777.57130050659", + "truncated": 0, + "non-truncated": 111019, + "padded": 110926, + "non-padded": 93, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/sequelbox/SharpBalance/results_2023-10-23T18-53-09.205615.json b/eval-results/sequelbox/SharpBalance/results_2023-10-23T18-53-09.205615.json new file mode 100644 index 
0000000000000000000000000000000000000000..c13d92b8e5f1dc40925278d8e39705aa4412dc62 --- /dev/null +++ b/eval-results/sequelbox/SharpBalance/results_2023-10-23T18-53-09.205615.json @@ -0,0 +1,107 @@ +{ + "config_general": { + "model_name": "sequelbox/SharpBalance", + "model_sha": "e0a90e62155b92659189ee46a3e77de2a40c46f8", + "model_size": "128.64 GB", + "model_dtype": "torch.float16", + "lighteval_sha": "0f318ecf002208468154899217b3ba7c6ae09374", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|drop|3": { + "em": 0.30861996644295303, + "em_stderr": 0.00473053301508219, + "f1": 0.3692638422818801, + "f1_stderr": 0.004628079358040571 + }, + "harness|gsm8k|5": { + "acc": 0.3464746019711903, + "acc_stderr": 0.013107179054313396 + }, + "harness|winogrande|5": { + "acc": 0.840568271507498, + "acc_stderr": 0.010288617479454764 + }, + "all": { + "em": 0.30861996644295303, + "em_stderr": 0.00473053301508219, + "f1": 0.3692638422818801, + "f1_stderr": 0.004628079358040571, + "acc": 0.5935214367393442, + "acc_stderr": 0.011697898266884079 + } + }, + "versions": { + "harness|drop|3": 1, + "harness|gsm8k|5": 0, + "harness|winogrande|5": 0, + "all": 0 + }, + "config_tasks": { + "harness|drop": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|drop|3": { + "hashes": { + "hash_examples": "1d27416e8324e9a3", + "hash_full_prompts": "a5513ff9a741b385", + "hash_input_tokens": "42076f0efbb50aa6", + "hash_cont_tokens": "ef3b7663f96640f0" + }, + "truncated": 3, + "non-truncated": 9533, + "padded": 0, + "non-padded": 9536, + "effective_few_shots": 3.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "bda342e47b5099b2", + "hash_cont_tokens": "e79dacefd2333c5b" + }, + "truncated": 0, + "non-truncated": 1319, + "padded": 0, + "non-padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "c0bedf98cb040854", + "hash_cont_tokens": "f08975ad6f2d5864" + }, + "truncated": 0, + "non-truncated": 2534, + "padded": 2432, + "non-padded": 102, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "9b4d8993161e637d", + "hash_full_prompts": "08215e527b7e60a5", + "hash_input_tokens": "a12f3e3c934bd78b", + "hash_cont_tokens": "a042ba6d1a1b4fe9" + }, + "total_evaluation_time_secondes": "42300.232100486755", + "truncated": 3, + "non-truncated": 13386, + "padded": 2432, + "non-padded": 10957, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/sequelbox/StellarBright/results_2023-10-11T03-35-00.957425.json b/eval-results/sequelbox/StellarBright/results_2023-10-11T03-35-00.957425.json new file mode 100644 index 0000000000000000000000000000000000000000..4302eb14573e5f8ae733730035f99e2f2b2ce7e5 --- /dev/null +++ b/eval-results/sequelbox/StellarBright/results_2023-10-11T03-35-00.957425.json @@ -0,0 +1,1367 @@ +{ + "config_general": { + "model_name": "sequelbox/StellarBright", + "model_sha": "43efad8bfdb47139934e810906c1e59c25b5e269", + "model_size": "128.64 GB", + "model_dtype": "torch.float16", + "lighteval_sha": 
"0f318ecf002208468154899217b3ba7c6ae09374", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|arc:challenge|25": { + "acc": 0.6885665529010239, + "acc_stderr": 0.013532472099850945, + "acc_norm": 0.7295221843003413, + "acc_norm_stderr": 0.012980954547659556 + }, + "harness|hellaswag|10": { + "acc": 0.690300736904999, + "acc_stderr": 0.004614246282055375, + "acc_norm": 0.8782115116510655, + "acc_norm_stderr": 0.0032637298176987762 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.39, + "acc_stderr": 0.04902071300001974, + "acc_norm": 0.39, + "acc_norm_stderr": 0.04902071300001974 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.6370370370370371, + "acc_stderr": 0.041539484047424, + "acc_norm": 0.6370370370370371, + "acc_norm_stderr": 0.041539484047424 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.8092105263157895, + "acc_stderr": 0.03197565821032499, + "acc_norm": 0.8092105263157895, + "acc_norm_stderr": 0.03197565821032499 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.78, + "acc_stderr": 0.041633319989322605, + "acc_norm": 0.78, + "acc_norm_stderr": 0.041633319989322605 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.7433962264150943, + "acc_stderr": 0.026880647889051985, + "acc_norm": 0.7433962264150943, + "acc_norm_stderr": 0.026880647889051985 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.8263888888888888, + "acc_stderr": 0.03167473383795717, + "acc_norm": 0.8263888888888888, + "acc_norm_stderr": 0.03167473383795717 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.49, + "acc_stderr": 0.05024183937956912, + "acc_norm": 0.49, + "acc_norm_stderr": 0.05024183937956912 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.59, + "acc_stderr": 0.04943110704237102, + "acc_norm": 0.59, + "acc_norm_stderr": 0.04943110704237102 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.4, + "acc_stderr": 0.049236596391733084, + "acc_norm": 0.4, + "acc_norm_stderr": 0.049236596391733084 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.6936416184971098, + "acc_stderr": 0.03514942551267439, + "acc_norm": 0.6936416184971098, + "acc_norm_stderr": 0.03514942551267439 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.3627450980392157, + "acc_stderr": 0.04784060704105654, + "acc_norm": 0.3627450980392157, + "acc_norm_stderr": 0.04784060704105654 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.77, + "acc_stderr": 0.042295258468165065, + "acc_norm": 0.77, + "acc_norm_stderr": 0.042295258468165065 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.7106382978723405, + "acc_stderr": 0.02964400657700962, + "acc_norm": 0.7106382978723405, + "acc_norm_stderr": 0.02964400657700962 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.47368421052631576, + "acc_stderr": 0.04697085136647863, + "acc_norm": 0.47368421052631576, + "acc_norm_stderr": 0.04697085136647863 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.6413793103448275, + "acc_stderr": 0.039966295748767186, + "acc_norm": 0.6413793103448275, + "acc_norm_stderr": 0.039966295748767186 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.47883597883597884, + "acc_stderr": 0.025728230952130723, + "acc_norm": 0.47883597883597884, + "acc_norm_stderr": 0.025728230952130723 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 
0.5158730158730159, + "acc_stderr": 0.044698818540726076, + "acc_norm": 0.5158730158730159, + "acc_norm_stderr": 0.044698818540726076 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.47, + "acc_stderr": 0.05016135580465919, + "acc_norm": 0.47, + "acc_norm_stderr": 0.05016135580465919 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.8161290322580645, + "acc_stderr": 0.022037217340267826, + "acc_norm": 0.8161290322580645, + "acc_norm_stderr": 0.022037217340267826 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.5517241379310345, + "acc_stderr": 0.03499113137676744, + "acc_norm": 0.5517241379310345, + "acc_norm_stderr": 0.03499113137676744 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.78, + "acc_stderr": 0.04163331998932262, + "acc_norm": 0.78, + "acc_norm_stderr": 0.04163331998932262 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.8484848484848485, + "acc_stderr": 0.027998073798781678, + "acc_norm": 0.8484848484848485, + "acc_norm_stderr": 0.027998073798781678 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.8888888888888888, + "acc_stderr": 0.022390787638216763, + "acc_norm": 0.8888888888888888, + "acc_norm_stderr": 0.022390787638216763 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.9430051813471503, + "acc_stderr": 0.01673108529360755, + "acc_norm": 0.9430051813471503, + "acc_norm_stderr": 0.01673108529360755 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.7102564102564103, + "acc_stderr": 0.023000628243687968, + "acc_norm": 0.7102564102564103, + "acc_norm_stderr": 0.023000628243687968 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.337037037037037, + "acc_stderr": 0.028820884666253252, + "acc_norm": 0.337037037037037, + "acc_norm_stderr": 0.028820884666253252 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.7773109243697479, + "acc_stderr": 0.027025433498882392, + "acc_norm": 0.7773109243697479, + "acc_norm_stderr": 0.027025433498882392 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.4900662251655629, + "acc_stderr": 0.04081677107248436, + "acc_norm": 0.4900662251655629, + "acc_norm_stderr": 0.04081677107248436 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.9119266055045872, + "acc_stderr": 0.01215074371948166, + "acc_norm": 0.9119266055045872, + "acc_norm_stderr": 0.01215074371948166 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.625, + "acc_stderr": 0.033016908987210894, + "acc_norm": 0.625, + "acc_norm_stderr": 0.033016908987210894 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.9215686274509803, + "acc_stderr": 0.018869514646658925, + "acc_norm": 0.9215686274509803, + "acc_norm_stderr": 0.018869514646658925 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.8945147679324894, + "acc_stderr": 0.01999556072375854, + "acc_norm": 0.8945147679324894, + "acc_norm_stderr": 0.01999556072375854 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.7847533632286996, + "acc_stderr": 0.027584066602208274, + "acc_norm": 0.7847533632286996, + "acc_norm_stderr": 0.027584066602208274 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.8473282442748091, + "acc_stderr": 0.031545216720054725, + "acc_norm": 0.8473282442748091, + "acc_norm_stderr": 0.031545216720054725 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.8760330578512396, 
+ "acc_stderr": 0.030083098716035206, + "acc_norm": 0.8760330578512396, + "acc_norm_stderr": 0.030083098716035206 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.8333333333333334, + "acc_stderr": 0.03602814176392645, + "acc_norm": 0.8333333333333334, + "acc_norm_stderr": 0.03602814176392645 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.8220858895705522, + "acc_stderr": 0.03004735765580662, + "acc_norm": 0.8220858895705522, + "acc_norm_stderr": 0.03004735765580662 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.5714285714285714, + "acc_stderr": 0.04697113923010213, + "acc_norm": 0.5714285714285714, + "acc_norm_stderr": 0.04697113923010213 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.8543689320388349, + "acc_stderr": 0.034926064766237906, + "acc_norm": 0.8543689320388349, + "acc_norm_stderr": 0.034926064766237906 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.9145299145299145, + "acc_stderr": 0.018315891685625845, + "acc_norm": 0.9145299145299145, + "acc_norm_stderr": 0.018315891685625845 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.72, + "acc_stderr": 0.04512608598542126, + "acc_norm": 0.72, + "acc_norm_stderr": 0.04512608598542126 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.8710089399744572, + "acc_stderr": 0.011986371548086867, + "acc_norm": 0.8710089399744572, + "acc_norm_stderr": 0.011986371548086867 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.7832369942196532, + "acc_stderr": 0.022183477668412856, + "acc_norm": 0.7832369942196532, + "acc_norm_stderr": 0.022183477668412856 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.6245810055865921, + "acc_stderr": 0.01619510424846353, + "acc_norm": 0.6245810055865921, + "acc_norm_stderr": 0.01619510424846353 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.7581699346405228, + "acc_stderr": 0.024518195641879334, + "acc_norm": 0.7581699346405228, + "acc_norm_stderr": 0.024518195641879334 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.7877813504823151, + "acc_stderr": 0.023222756797435115, + "acc_norm": 0.7877813504823151, + "acc_norm_stderr": 0.023222756797435115 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.8333333333333334, + "acc_stderr": 0.020736358408060002, + "acc_norm": 0.8333333333333334, + "acc_norm_stderr": 0.020736358408060002 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.5957446808510638, + "acc_stderr": 0.02927553215970472, + "acc_norm": 0.5957446808510638, + "acc_norm_stderr": 0.02927553215970472 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.5827900912646675, + "acc_stderr": 0.012593959992906427, + "acc_norm": 0.5827900912646675, + "acc_norm_stderr": 0.012593959992906427 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.75, + "acc_stderr": 0.026303648393696036, + "acc_norm": 0.75, + "acc_norm_stderr": 0.026303648393696036 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.7728758169934641, + "acc_stderr": 0.016949853279212373, + "acc_norm": 0.7728758169934641, + "acc_norm_stderr": 0.016949853279212373 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.7454545454545455, + "acc_stderr": 0.041723430387053825, + "acc_norm": 0.7454545454545455, + "acc_norm_stderr": 0.041723430387053825 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.7918367346938775, + "acc_stderr": 0.0259911176728133, + "acc_norm": 0.7918367346938775, + "acc_norm_stderr": 0.0259911176728133 + }, 
+ "harness|hendrycksTest-sociology|5": { + "acc": 0.8805970149253731, + "acc_stderr": 0.02292879327721974, + "acc_norm": 0.8805970149253731, + "acc_norm_stderr": 0.02292879327721974 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.88, + "acc_stderr": 0.03265986323710906, + "acc_norm": 0.88, + "acc_norm_stderr": 0.03265986323710906 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.5301204819277109, + "acc_stderr": 0.03885425420866767, + "acc_norm": 0.5301204819277109, + "acc_norm_stderr": 0.03885425420866767 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.8771929824561403, + "acc_stderr": 0.02517298435015575, + "acc_norm": 0.8771929824561403, + "acc_norm_stderr": 0.02517298435015575 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.46511627906976744, + "mc1_stderr": 0.017460849975873965, + "mc2": 0.6446460697306154, + "mc2_stderr": 0.014753033588623255 + }, + "all": { + "acc": 0.7109524643752221, + "acc_stderr": 0.030739601585983465, + "acc_norm": 0.7148315560048047, + "acc_norm_stderr": 0.030707363721296215, + "mc1": 0.46511627906976744, + "mc1_stderr": 0.017460849975873965, + "mc2": 0.6446460697306154, + "mc2_stderr": 0.014753033588623255 + } + }, + "versions": { + "harness|arc:challenge|25": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + 
"harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "all": 0 + }, + "config_tasks": { + "harness|arc:challenge": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": 
"LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task" + }, + "summary_tasks": { + "harness|arc:challenge|25": { + "hashes": { + "hash_examples": "17b0cae357c0259e", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "3722289b79076c44", + "hash_cont_tokens": "e8abf848493b50f7" + }, + "truncated": 0, + "non-truncated": 4687, + "padded": 4687, + "non-padded": 0, + "effective_few_shots": 25.0, + "num_truncated_few_shots": 0 + }, + "harness|hellaswag|10": { + "hashes": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "ececd684171f1ef2", + "hash_cont_tokens": "9fe0a5c42e1532db" + }, + "truncated": 0, + "non-truncated": 40168, + "padded": 40113, + "non-padded": 55, + "effective_few_shots": 10.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hashes": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "c54ff61ad0273dd7", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-anatomy|5": { + "hashes": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "be31a1e22aef5f90", + "hash_cont_tokens": "f11971a765cb609f" + }, + "truncated": 0, + "non-truncated": 540, + "padded": 540, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-astronomy|5": { + "hashes": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "277a7b1fad566940", + "hash_cont_tokens": "440a970fadecdc7b" + }, + "truncated": 0, + "non-truncated": 608, + "padded": 608, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-business_ethics|5": { + "hashes": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "ba552605bc116de5", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hashes": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "428c7563d0b98ab9", + "hash_cont_tokens": "7ecd60c25b9bfe5b" + }, + "truncated": 0, + "non-truncated": 1060, + "padded": 1060, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, 
+ "harness|hendrycksTest-college_biology|5": { + "hashes": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "da036601573942e2", + "hash_cont_tokens": "875cde3af7a0ee14" + }, + "truncated": 0, + "non-truncated": 576, + "padded": 576, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_chemistry|5": { + "hashes": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "94e0196d6aded13d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_computer_science|5": { + "hashes": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "6e4d0f4a8d36690b", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_mathematics|5": { + "hashes": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "614054d17109a25d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_medicine|5": { + "hashes": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "081bb2b524defd1c", + "hash_cont_tokens": "702fb6d82ff0d6ac" + }, + "truncated": 0, + "non-truncated": 692, + "padded": 692, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_physics|5": { + "hashes": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "5421d9a1af86cbd4", + "hash_cont_tokens": "f7b8097afc16a47c" + }, + "truncated": 0, + "non-truncated": 408, + "padded": 408, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-computer_security|5": { + "hashes": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "5e6b70ecb333cf18", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hashes": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "c2ef11a87264ceed", + "hash_cont_tokens": "aa0e8bc655f2f641" + }, + "truncated": 0, + "non-truncated": 940, + "padded": 940, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-econometrics|5": { + "hashes": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "ecaccd912a4c3978", + "hash_cont_tokens": "b1cc6e7e9fcd3827" + }, + "truncated": 0, + "non-truncated": 456, + "padded": 456, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hashes": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + 
"hash_input_tokens": "1590c84291399be8", + "hash_cont_tokens": "2425a3f084a591ef" + }, + "truncated": 0, + "non-truncated": 580, + "padded": 580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hashes": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "3269597f715b0da1", + "hash_cont_tokens": "bd87bf0c060fd925" + }, + "truncated": 0, + "non-truncated": 1512, + "padded": 1512, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-formal_logic|5": { + "hashes": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "a2800d20f3ab8d7c", + "hash_cont_tokens": "eb8932890e0605db" + }, + "truncated": 0, + "non-truncated": 504, + "padded": 504, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-global_facts|5": { + "hashes": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "94ed44b3772505ad", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_biology|5": { + "hashes": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "24423acb928db768", + "hash_cont_tokens": "1ddcb86d28cde266" + }, + "truncated": 0, + "non-truncated": 1240, + "padded": 1240, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hashes": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "831ff35c474e5cef", + "hash_cont_tokens": "176c8dcff38c5f8f" + }, + "truncated": 0, + "non-truncated": 812, + "padded": 812, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hashes": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "a20a96b44dcc5b30", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hashes": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "5002f4ac8b1562ca", + "hash_cont_tokens": "674fc454bdc5ac93" + }, + "truncated": 0, + "non-truncated": 660, + "padded": 656, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_geography|5": { + "hashes": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "7c5547c7da5bc793", + "hash_cont_tokens": "03a5012b916274ea" + }, + "truncated": 0, + "non-truncated": 792, + "padded": 792, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hashes": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "f62991cb6a496b05", + "hash_cont_tokens": "873d2aab226ba1d8" + }, + "truncated": 
0, + "non-truncated": 772, + "padded": 772, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hashes": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "4cef2aff6e3d59ed", + "hash_cont_tokens": "c583432ad27fcfe0" + }, + "truncated": 0, + "non-truncated": 1560, + "padded": 1560, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hashes": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "6e2577ea4082ed2b", + "hash_cont_tokens": "d7907b61bcb8c123" + }, + "truncated": 0, + "non-truncated": 1080, + "padded": 1080, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hashes": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "c5fc9aeb1079c8e4", + "hash_cont_tokens": "f47f041de50333b9" + }, + "truncated": 0, + "non-truncated": 952, + "padded": 952, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_physics|5": { + "hashes": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "555fc385cffa84ca", + "hash_cont_tokens": "0d56317b3e5eedb5" + }, + "truncated": 0, + "non-truncated": 604, + "padded": 604, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hashes": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "febd23cbf9973b7f", + "hash_cont_tokens": "09ba1243e7390c0f" + }, + "truncated": 0, + "non-truncated": 2180, + "padded": 2180, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hashes": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "400e55b56ee6fbd7", + "hash_cont_tokens": "9cc29889c3d3f77d" + }, + "truncated": 0, + "non-truncated": 864, + "padded": 864, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hashes": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "c639cce12a46ebad", + "hash_cont_tokens": "cdd0b3dc06d933e5" + }, + "truncated": 0, + "non-truncated": 816, + "padded": 816, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hashes": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "b9762065cce6f3a6", + "hash_cont_tokens": "e02816433ff28daf" + }, + "truncated": 0, + "non-truncated": 948, + "padded": 948, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_aging|5": { + "hashes": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "541a75f071dcf579", + "hash_cont_tokens": "142a4a8a1138a214" + }, + "truncated": 0, + "non-truncated": 892, + "padded": 892, + "non-padded": 0, + "effective_few_shots": 5.0, + 
"num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_sexuality|5": { + "hashes": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "04269e5c5a257dd9", + "hash_cont_tokens": "bc54813e809b796d" + }, + "truncated": 0, + "non-truncated": 524, + "padded": 524, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-international_law|5": { + "hashes": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "d93ba9d9d38e4397", + "hash_cont_tokens": "8ea8c5ff76a15bca" + }, + "truncated": 0, + "non-truncated": 484, + "padded": 484, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-jurisprudence|5": { + "hashes": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "9eeaccd2698b4f5a", + "hash_cont_tokens": "e3a8cd951b6e3469" + }, + "truncated": 0, + "non-truncated": 432, + "padded": 432, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hashes": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "b4f08f544f2b7576", + "hash_cont_tokens": "3e9e0bdc248fd88a" + }, + "truncated": 0, + "non-truncated": 652, + "padded": 648, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-machine_learning|5": { + "hashes": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "900c2a51f1174b9f", + "hash_cont_tokens": "55b12fb138c6a064" + }, + "truncated": 0, + "non-truncated": 448, + "padded": 448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-management|5": { + "hashes": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "6b36efb4689c6eca", + "hash_cont_tokens": "a01d6d39a83c4597" + }, + "truncated": 0, + "non-truncated": 412, + "padded": 412, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-marketing|5": { + "hashes": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "2aaac78a0cfed47a", + "hash_cont_tokens": "6aeaed4d823c98aa" + }, + "truncated": 0, + "non-truncated": 936, + "padded": 936, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-medical_genetics|5": { + "hashes": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "886ca823b41c094a", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-miscellaneous|5": { + "hashes": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "72fd71de7675e7d0", + "hash_cont_tokens": "9b0ab02a64603081" + }, + "truncated": 0, + "non-truncated": 3132, + "padded": 3132, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_disputes|5": { + "hashes": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + 
"hash_input_tokens": "f3ca0dd8e7a1eb09", + "hash_cont_tokens": "3b8bbe9108e55ce9" + }, + "truncated": 0, + "non-truncated": 1384, + "padded": 1354, + "non-padded": 30, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hashes": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "3e793631e951f23c", + "hash_cont_tokens": "3e9bfc0362e97330" + }, + "truncated": 0, + "non-truncated": 3580, + "padded": 3580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-nutrition|5": { + "hashes": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "59753c2144ea93af", + "hash_cont_tokens": "23b2dc6ee2da4cfc" + }, + "truncated": 0, + "non-truncated": 1224, + "padded": 1224, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-philosophy|5": { + "hashes": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "bd8d3dbed15a8c34", + "hash_cont_tokens": "9f6ff69d23a48783" + }, + "truncated": 0, + "non-truncated": 1244, + "padded": 1244, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-prehistory|5": { + "hashes": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "3573cd87facbb7c5", + "hash_cont_tokens": "d6458d743d875837" + }, + "truncated": 0, + "non-truncated": 1296, + "padded": 1296, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_accounting|5": { + "hashes": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "17e721bc1a7cbb47", + "hash_cont_tokens": "922a195f53a35662" + }, + "truncated": 0, + "non-truncated": 1128, + "padded": 1128, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_law|5": { + "hashes": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "c9f7583fff66d361", + "hash_cont_tokens": "2e590029ef41fbcd" + }, + "truncated": 0, + "non-truncated": 6136, + "padded": 6136, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_medicine|5": { + "hashes": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "40a933f829116f8d", + "hash_cont_tokens": "7cfee54dbddd5a98" + }, + "truncated": 0, + "non-truncated": 1088, + "padded": 1088, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_psychology|5": { + "hashes": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "0dfb73a8eb3f692c", + "hash_cont_tokens": "a86677b2a45c20e1" + }, + "truncated": 0, + "non-truncated": 2448, + "padded": 2448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-public_relations|5": { + "hashes": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "1710c6ba4c9f3cbd", + "hash_cont_tokens": "0d756ccaae031757" + }, + "truncated": 0, + "non-truncated": 440, + "padded": 
440, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-security_studies|5": { + "hashes": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "32a03f1f22a6e103", + "hash_cont_tokens": "b2229bc2cfbf594b" + }, + "truncated": 0, + "non-truncated": 980, + "padded": 980, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-sociology|5": { + "hashes": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "828999f7624cbe7e", + "hash_cont_tokens": "c3a3bdfd177eed5b" + }, + "truncated": 0, + "non-truncated": 804, + "padded": 804, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hashes": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "42054621e718dbee", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-virology|5": { + "hashes": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "6c4f0aa4dc859c04", + "hash_cont_tokens": "af8b3658088cb37f" + }, + "truncated": 0, + "non-truncated": 664, + "padded": 664, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-world_religions|5": { + "hashes": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "6c75d44e092ff24f", + "hash_cont_tokens": "060118bef6de4e0a" + }, + "truncated": 0, + "non-truncated": 684, + "padded": 684, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|truthfulqa:mc|0": { + "hashes": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "2738d7ed7075faa7", + "hash_cont_tokens": "f5da56a132aab151" + }, + "truncated": 0, + "non-truncated": 9996, + "padded": 9996, + "non-padded": 0, + "effective_few_shots": 0.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "d84d18e9a963753d", + "hash_full_prompts": "12b540783521a8e6", + "hash_input_tokens": "5c73a7dce6ccf737", + "hash_cont_tokens": "71d56183130fecbd" + }, + "total_evaluation_time_secondes": "45101.909573078156", + "truncated": 0, + "non-truncated": 111019, + "padded": 110926, + "non-padded": 93, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/sequelbox/StellarBright/results_2023-11-08T22-55-36.010619.json b/eval-results/sequelbox/StellarBright/results_2023-11-08T22-55-36.010619.json new file mode 100644 index 0000000000000000000000000000000000000000..04227446f25ecfd686928e756705460c064afb27 --- /dev/null +++ b/eval-results/sequelbox/StellarBright/results_2023-11-08T22-55-36.010619.json @@ -0,0 +1,107 @@ +{ + "config_general": { + "lighteval_sha": "167773f1d5d1647c60dadc31c9e731ab7dbcbbad", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "", + "model_name": "sequelbox/StellarBright", + "model_sha": "7568e1ca8829780df41bdaa7fbb9b4c061e3b569", + "model_dtype": "torch.float16", + "model_size": "128.64 GB" + }, + "results": { + "harness|drop|3": { + 
"em": 0.34458892617449666, + "em_stderr": 0.004866841438021566, + "f1": 0.4966107382550379, + "f1_stderr": 0.004389897684698882 + }, + "harness|gsm8k|5": { + "acc": 0.3949962092494314, + "acc_stderr": 0.01346535496997321 + }, + "harness|winogrande|5": { + "acc": 0.8326756116811366, + "acc_stderr": 0.010490608806828082 + }, + "all": { + "em": 0.34458892617449666, + "em_stderr": 0.004866841438021566, + "f1": 0.4966107382550379, + "f1_stderr": 0.004389897684698882, + "acc": 0.613835910465284, + "acc_stderr": 0.011977981888400647 + } + }, + "versions": { + "all": 0, + "harness|drop|3": 1, + "harness|gsm8k|5": 0, + "harness|winogrande|5": 0 + }, + "config_tasks": { + "harness|drop": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|drop|3": { + "hashes": { + "hash_examples": "1d27416e8324e9a3", + "hash_full_prompts": "a5513ff9a741b385", + "hash_input_tokens": "42076f0efbb50aa6", + "hash_cont_tokens": "252930058e8f4349" + }, + "truncated": 3, + "non_truncated": 9533, + "padded": 0, + "non_padded": 9536, + "effective_few_shots": 3.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "bda342e47b5099b2", + "hash_cont_tokens": "0f88aa500330dd11" + }, + "truncated": 0, + "non_truncated": 1319, + "padded": 0, + "non_padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "c0bedf98cb040854", + "hash_cont_tokens": "f08975ad6f2d5864" + }, + "truncated": 0, + "non_truncated": 1267, + "padded": 2432, + "non_padded": 102, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "9b4d8993161e637d", + "hash_full_prompts": "08215e527b7e60a5", + "hash_input_tokens": "a12f3e3c934bd78b", + "hash_cont_tokens": "7ad991036fb8d822" + }, + "truncated": 3, + "non_truncated": 12119, + "padded": 2432, + "non_padded": 10957, + "num_truncated_few_shots": 0, + "total_evaluation_time_secondes": 0 + } +} \ No newline at end of file diff --git a/eval-results/sequelbox/SunsetBoulevard/results_2023-12-10T03-02-57.544409.json b/eval-results/sequelbox/SunsetBoulevard/results_2023-12-10T03-02-57.544409.json new file mode 100644 index 0000000000000000000000000000000000000000..3285516110a65f3bb9f2f5d17d3130b38ee42614 --- /dev/null +++ b/eval-results/sequelbox/SunsetBoulevard/results_2023-12-10T03-02-57.544409.json @@ -0,0 +1,1409 @@ +{ + "config_general": { + "lighteval_sha": "0e4607eff593f6f842aeaa0e5fa6760f58b9d1e9", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "", + "start_time": 527865.046653532, + "end_time": 632244.563305641, + "total_evaluation_time_secondes": "104379.51665210898", + "model_name": "sequelbox/SunsetBoulevard", + "model_sha": "b6070e47699fa55aac2002f579b05e6b4268cebb", + "model_dtype": "torch.float16", + "model_size": "128.64 GB" + }, + "results": { + "harness|arc:challenge|25": { + "acc": 0.6552901023890785, + "acc_stderr": 0.01388881628678211, + "acc_norm": 0.7133105802047781, + "acc_norm_stderr": 0.013214986329274776 + }, + "harness|hellaswag|10": { + "acc": 0.7438757219677355, + "acc_stderr": 0.004355992090031012, + "acc_norm": 0.9095797649870544, + "acc_norm_stderr": 0.0028619676953189122 
+ }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.37, + "acc_stderr": 0.04852365870939099, + "acc_norm": 0.37, + "acc_norm_stderr": 0.04852365870939099 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.6666666666666666, + "acc_stderr": 0.04072314811876837, + "acc_norm": 0.6666666666666666, + "acc_norm_stderr": 0.04072314811876837 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.8157894736842105, + "acc_stderr": 0.0315469804508223, + "acc_norm": 0.8157894736842105, + "acc_norm_stderr": 0.0315469804508223 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.78, + "acc_stderr": 0.041633319989322605, + "acc_norm": 0.78, + "acc_norm_stderr": 0.041633319989322605 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.7358490566037735, + "acc_stderr": 0.0271342916287417, + "acc_norm": 0.7358490566037735, + "acc_norm_stderr": 0.0271342916287417 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.8125, + "acc_stderr": 0.032639560491693344, + "acc_norm": 0.8125, + "acc_norm_stderr": 0.032639560491693344 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.49, + "acc_stderr": 0.05024183937956913, + "acc_norm": 0.49, + "acc_norm_stderr": 0.05024183937956913 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.6, + "acc_stderr": 0.04923659639173309, + "acc_norm": 0.6, + "acc_norm_stderr": 0.04923659639173309 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.39, + "acc_stderr": 0.04902071300001974, + "acc_norm": 0.39, + "acc_norm_stderr": 0.04902071300001974 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.6820809248554913, + "acc_stderr": 0.035506839891655796, + "acc_norm": 0.6820809248554913, + "acc_norm_stderr": 0.035506839891655796 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.3627450980392157, + "acc_stderr": 0.047840607041056527, + "acc_norm": 0.3627450980392157, + "acc_norm_stderr": 0.047840607041056527 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.77, + "acc_stderr": 0.042295258468165065, + "acc_norm": 0.77, + "acc_norm_stderr": 0.042295258468165065 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.6893617021276596, + "acc_stderr": 0.03025123757921317, + "acc_norm": 0.6893617021276596, + "acc_norm_stderr": 0.03025123757921317 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.45614035087719296, + "acc_stderr": 0.04685473041907789, + "acc_norm": 0.45614035087719296, + "acc_norm_stderr": 0.04685473041907789 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.6482758620689655, + "acc_stderr": 0.0397923663749741, + "acc_norm": 0.6482758620689655, + "acc_norm_stderr": 0.0397923663749741 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.4656084656084656, + "acc_stderr": 0.025690321762493848, + "acc_norm": 0.4656084656084656, + "acc_norm_stderr": 0.025690321762493848 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.48412698412698413, + "acc_stderr": 0.04469881854072606, + "acc_norm": 0.48412698412698413, + "acc_norm_stderr": 0.04469881854072606 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.55, + "acc_stderr": 0.049999999999999996, + "acc_norm": 0.55, + "acc_norm_stderr": 0.049999999999999996 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.8258064516129032, + "acc_stderr": 0.02157624818451459, + "acc_norm": 0.8258064516129032, + "acc_norm_stderr": 0.02157624818451459 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 
0.5320197044334976, + "acc_stderr": 0.035107665979592154, + "acc_norm": 0.5320197044334976, + "acc_norm_stderr": 0.035107665979592154 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.81, + "acc_stderr": 0.03942772444036624, + "acc_norm": 0.81, + "acc_norm_stderr": 0.03942772444036624 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.8303030303030303, + "acc_stderr": 0.029311188674983127, + "acc_norm": 0.8303030303030303, + "acc_norm_stderr": 0.029311188674983127 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.8787878787878788, + "acc_stderr": 0.023253157951942084, + "acc_norm": 0.8787878787878788, + "acc_norm_stderr": 0.023253157951942084 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.9378238341968912, + "acc_stderr": 0.017426974154240528, + "acc_norm": 0.9378238341968912, + "acc_norm_stderr": 0.017426974154240528 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.7128205128205128, + "acc_stderr": 0.022939925418530616, + "acc_norm": 0.7128205128205128, + "acc_norm_stderr": 0.022939925418530616 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.337037037037037, + "acc_stderr": 0.028820884666253255, + "acc_norm": 0.337037037037037, + "acc_norm_stderr": 0.028820884666253255 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.7773109243697479, + "acc_stderr": 0.02702543349888238, + "acc_norm": 0.7773109243697479, + "acc_norm_stderr": 0.02702543349888238 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.5165562913907285, + "acc_stderr": 0.04080244185628972, + "acc_norm": 0.5165562913907285, + "acc_norm_stderr": 0.04080244185628972 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.9100917431192661, + "acc_stderr": 0.012264304540230444, + "acc_norm": 0.9100917431192661, + "acc_norm_stderr": 0.012264304540230444 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.6342592592592593, + "acc_stderr": 0.032847388576472056, + "acc_norm": 0.6342592592592593, + "acc_norm_stderr": 0.032847388576472056 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.9166666666666666, + "acc_stderr": 0.019398452135813905, + "acc_norm": 0.9166666666666666, + "acc_norm_stderr": 0.019398452135813905 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.890295358649789, + "acc_stderr": 0.02034340073486884, + "acc_norm": 0.890295358649789, + "acc_norm_stderr": 0.02034340073486884 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.8295964125560538, + "acc_stderr": 0.025234593447136175, + "acc_norm": 0.8295964125560538, + "acc_norm_stderr": 0.025234593447136175 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.8473282442748091, + "acc_stderr": 0.031545216720054725, + "acc_norm": 0.8473282442748091, + "acc_norm_stderr": 0.031545216720054725 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.8512396694214877, + "acc_stderr": 0.03248470083807194, + "acc_norm": 0.8512396694214877, + "acc_norm_stderr": 0.03248470083807194 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.8333333333333334, + "acc_stderr": 0.03602814176392645, + "acc_norm": 0.8333333333333334, + "acc_norm_stderr": 0.03602814176392645 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.8404907975460123, + "acc_stderr": 0.028767481725983854, + "acc_norm": 0.8404907975460123, + "acc_norm_stderr": 0.028767481725983854 + }, + 
"harness|hendrycksTest-machine_learning|5": { + "acc": 0.5267857142857143, + "acc_stderr": 0.047389751192741546, + "acc_norm": 0.5267857142857143, + "acc_norm_stderr": 0.047389751192741546 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.8349514563106796, + "acc_stderr": 0.036756688322331886, + "acc_norm": 0.8349514563106796, + "acc_norm_stderr": 0.036756688322331886 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.9273504273504274, + "acc_stderr": 0.017004368568132346, + "acc_norm": 0.9273504273504274, + "acc_norm_stderr": 0.017004368568132346 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.74, + "acc_stderr": 0.04408440022768078, + "acc_norm": 0.74, + "acc_norm_stderr": 0.04408440022768078 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.8722860791826309, + "acc_stderr": 0.011935626313999876, + "acc_norm": 0.8722860791826309, + "acc_norm_stderr": 0.011935626313999876 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.7976878612716763, + "acc_stderr": 0.02162807738019612, + "acc_norm": 0.7976878612716763, + "acc_norm_stderr": 0.02162807738019612 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.6122905027932961, + "acc_stderr": 0.016295332328155807, + "acc_norm": 0.6122905027932961, + "acc_norm_stderr": 0.016295332328155807 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.7712418300653595, + "acc_stderr": 0.024051029739912258, + "acc_norm": 0.7712418300653595, + "acc_norm_stderr": 0.024051029739912258 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.77491961414791, + "acc_stderr": 0.023720088516179027, + "acc_norm": 0.77491961414791, + "acc_norm_stderr": 0.023720088516179027 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.8333333333333334, + "acc_stderr": 0.02073635840806, + "acc_norm": 0.8333333333333334, + "acc_norm_stderr": 0.02073635840806 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.599290780141844, + "acc_stderr": 0.029233465745573096, + "acc_norm": 0.599290780141844, + "acc_norm_stderr": 0.029233465745573096 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.5684485006518905, + "acc_stderr": 0.012650007999463909, + "acc_norm": 0.5684485006518905, + "acc_norm_stderr": 0.012650007999463909 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.7352941176470589, + "acc_stderr": 0.026799562024887667, + "acc_norm": 0.7352941176470589, + "acc_norm_stderr": 0.026799562024887667 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.7679738562091504, + "acc_stderr": 0.01707737337785693, + "acc_norm": 0.7679738562091504, + "acc_norm_stderr": 0.01707737337785693 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.7272727272727273, + "acc_stderr": 0.04265792110940588, + "acc_norm": 0.7272727272727273, + "acc_norm_stderr": 0.04265792110940588 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.8122448979591836, + "acc_stderr": 0.02500025603954619, + "acc_norm": 0.8122448979591836, + "acc_norm_stderr": 0.02500025603954619 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.8706467661691543, + "acc_stderr": 0.02372983088101853, + "acc_norm": 0.8706467661691543, + "acc_norm_stderr": 0.02372983088101853 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.91, + "acc_stderr": 0.02876234912646612, + "acc_norm": 0.91, + "acc_norm_stderr": 0.02876234912646612 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.5481927710843374, + "acc_stderr": 0.03874371556587953, + "acc_norm": 
0.5481927710843374, + "acc_norm_stderr": 0.03874371556587953 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.8771929824561403, + "acc_stderr": 0.02517298435015575, + "acc_norm": 0.8771929824561403, + "acc_norm_stderr": 0.02517298435015575 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.5569155446756426, + "mc1_stderr": 0.01738973034687711, + "mc2": 0.7029226076594556, + "mc2_stderr": 0.013335950631417065 + }, + "harness|winogrande|5": { + "acc": 0.8421468034727704, + "acc_stderr": 0.010247165248719763 + }, + "harness|gsm8k|5": { + "acc": 0.5466262319939348, + "acc_stderr": 0.013712471049515446 + }, + "all": { + "acc": 0.7110861444687467, + "acc_stderr": 0.030063430253086363, + "acc_norm": 0.7154441745417264, + "acc_norm_stderr": 0.030639690759115483, + "mc1": 0.5569155446756426, + "mc1_stderr": 0.01738973034687711, + "mc2": 0.7029226076594556, + "mc2_stderr": 0.013335950631417065 + } + }, + "versions": { + "all": 0, + "harness|arc:challenge|25": 0, + "harness|gsm8k|5": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 
1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "harness|winogrande|5": 0 + }, + "config_tasks": { + "harness|arc:challenge": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + 
"harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|arc:challenge|25": { + "hashes": { + "hash_examples": "17b0cae357c0259e", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "ca48d52265c0051f", + "hash_cont_tokens": "e8abf848493b50f7" + }, + "truncated": 0, + "non_truncated": 1172, + "padded": 4687, + "non_padded": 0, + "effective_few_shots": 25.0, + "num_truncated_few_shots": 0 + }, + "harness|hellaswag|10": { + "hashes": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "4975ded0ed31f702", + "hash_cont_tokens": "9fe0a5c42e1532db" + }, + "truncated": 0, + "non_truncated": 10042, + "padded": 40019, + "non_padded": 149, + "effective_few_shots": 10.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hashes": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "8ff523ec326d5d55", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-anatomy|5": { + "hashes": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "742bd6a389a8ef40", + "hash_cont_tokens": "f11971a765cb609f" + }, + "truncated": 0, + "non_truncated": 135, + "padded": 540, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-astronomy|5": { + "hashes": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "aa9743839c83bd9f", + "hash_cont_tokens": "440a970fadecdc7b" + }, + "truncated": 0, + "non_truncated": 152, + "padded": 608, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-business_ethics|5": { + "hashes": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "60f6ed52e2a2987a", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hashes": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "6080d9f3c5930be0", + "hash_cont_tokens": "7ecd60c25b9bfe5b" + }, + "truncated": 0, + "non_truncated": 265, + "padded": 1060, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_biology|5": { + "hashes": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + 
"hash_input_tokens": "873319724ad65589", + "hash_cont_tokens": "875cde3af7a0ee14" + }, + "truncated": 0, + "non_truncated": 144, + "padded": 564, + "non_padded": 12, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_chemistry|5": { + "hashes": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "8366d04d12b154a7", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_computer_science|5": { + "hashes": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "1724a282fb269fd7", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_mathematics|5": { + "hashes": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "b7aa815781eae172", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_medicine|5": { + "hashes": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "0003d13e86bc8c1a", + "hash_cont_tokens": "702fb6d82ff0d6ac" + }, + "truncated": 0, + "non_truncated": 173, + "padded": 692, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_physics|5": { + "hashes": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "32b28762dd077c78", + "hash_cont_tokens": "f7b8097afc16a47c" + }, + "truncated": 0, + "non_truncated": 102, + "padded": 404, + "non_padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-computer_security|5": { + "hashes": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "19dd0e1895125d49", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hashes": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "761c7ce187b3338a", + "hash_cont_tokens": "aa0e8bc655f2f641" + }, + "truncated": 0, + "non_truncated": 235, + "padded": 940, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-econometrics|5": { + "hashes": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "dae74024ebc12b2b", + "hash_cont_tokens": "b1cc6e7e9fcd3827" + }, + "truncated": 0, + "non_truncated": 114, + "padded": 456, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hashes": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "5fa8050688a246ed", + "hash_cont_tokens": "2425a3f084a591ef" + }, + "truncated": 0, + "non_truncated": 145, + "padded": 
580, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hashes": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "2da3f8d7d1515cc6", + "hash_cont_tokens": "bd87bf0c060fd925" + }, + "truncated": 0, + "non_truncated": 378, + "padded": 1512, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-formal_logic|5": { + "hashes": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "907de61bbe46dada", + "hash_cont_tokens": "eb8932890e0605db" + }, + "truncated": 0, + "non_truncated": 126, + "padded": 504, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-global_facts|5": { + "hashes": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "d7549fe9ac133643", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_biology|5": { + "hashes": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "b449ae8cd622fb96", + "hash_cont_tokens": "1ddcb86d28cde266" + }, + "truncated": 0, + "non_truncated": 310, + "padded": 1240, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hashes": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "a447bd1574b5e26c", + "hash_cont_tokens": "176c8dcff38c5f8f" + }, + "truncated": 0, + "non_truncated": 203, + "padded": 812, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hashes": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "56312a0c3d85ae90", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hashes": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "5002f4ac8b1562ca", + "hash_cont_tokens": "674fc454bdc5ac93" + }, + "truncated": 0, + "non_truncated": 165, + "padded": 656, + "non_padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_geography|5": { + "hashes": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "b4f9efd054b0149d", + "hash_cont_tokens": "03a5012b916274ea" + }, + "truncated": 0, + "non_truncated": 198, + "padded": 792, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hashes": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "6e010d01707b5a01", + "hash_cont_tokens": "873d2aab226ba1d8" + }, + "truncated": 0, + "non_truncated": 193, + "padded": 772, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + 
"harness|hendrycksTest-high_school_macroeconomics|5": { + "hashes": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "fc1f6e824ba386d7", + "hash_cont_tokens": "c583432ad27fcfe0" + }, + "truncated": 0, + "non_truncated": 390, + "padded": 1560, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hashes": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "3a485a40c8432ece", + "hash_cont_tokens": "d7907b61bcb8c123" + }, + "truncated": 0, + "non_truncated": 270, + "padded": 1080, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hashes": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "a7dd9ca4bbda3752", + "hash_cont_tokens": "f47f041de50333b9" + }, + "truncated": 0, + "non_truncated": 238, + "padded": 952, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_physics|5": { + "hashes": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "d7ea631399a73865", + "hash_cont_tokens": "0d56317b3e5eedb5" + }, + "truncated": 0, + "non_truncated": 151, + "padded": 604, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hashes": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "d12816cf88146011", + "hash_cont_tokens": "09ba1243e7390c0f" + }, + "truncated": 0, + "non_truncated": 545, + "padded": 2180, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hashes": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "9763ecaef4814c21", + "hash_cont_tokens": "9cc29889c3d3f77d" + }, + "truncated": 0, + "non_truncated": 216, + "padded": 864, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hashes": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "c639cce12a46ebad", + "hash_cont_tokens": "cdd0b3dc06d933e5" + }, + "truncated": 0, + "non_truncated": 204, + "padded": 816, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hashes": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "b9762065cce6f3a6", + "hash_cont_tokens": "e02816433ff28daf" + }, + "truncated": 0, + "non_truncated": 237, + "padded": 948, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_aging|5": { + "hashes": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "84157fee0b6d0f3c", + "hash_cont_tokens": "142a4a8a1138a214" + }, + "truncated": 0, + "non_truncated": 223, + "padded": 892, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_sexuality|5": { + "hashes": { + "hash_examples": "7acb8fdad97f88a6", + 
"hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "ade303e1ae3c016f", + "hash_cont_tokens": "bc54813e809b796d" + }, + "truncated": 0, + "non_truncated": 131, + "padded": 524, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-international_law|5": { + "hashes": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "e5482e1c23c23d35", + "hash_cont_tokens": "8ea8c5ff76a15bca" + }, + "truncated": 0, + "non_truncated": 121, + "padded": 484, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-jurisprudence|5": { + "hashes": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "4415eeb9bad0507b", + "hash_cont_tokens": "e3a8cd951b6e3469" + }, + "truncated": 0, + "non_truncated": 108, + "padded": 432, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hashes": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "e6b5271422ecbaa8", + "hash_cont_tokens": "3e9e0bdc248fd88a" + }, + "truncated": 0, + "non_truncated": 163, + "padded": 644, + "non_padded": 8, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-machine_learning|5": { + "hashes": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "e719cb83196977d8", + "hash_cont_tokens": "55b12fb138c6a064" + }, + "truncated": 0, + "non_truncated": 112, + "padded": 448, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-management|5": { + "hashes": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "155da0e62b39e804", + "hash_cont_tokens": "a01d6d39a83c4597" + }, + "truncated": 0, + "non_truncated": 103, + "padded": 412, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-marketing|5": { + "hashes": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "38466c242259e6d3", + "hash_cont_tokens": "6aeaed4d823c98aa" + }, + "truncated": 0, + "non_truncated": 234, + "padded": 932, + "non_padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-medical_genetics|5": { + "hashes": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "0dd129e92538a7f6", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-miscellaneous|5": { + "hashes": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "d108a883fc3e022f", + "hash_cont_tokens": "9b0ab02a64603081" + }, + "truncated": 0, + "non_truncated": 783, + "padded": 3132, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_disputes|5": { + "hashes": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "0e7b7df82884a2d5", + "hash_cont_tokens": "3b8bbe9108e55ce9" + }, + "truncated": 0, + "non_truncated": 346, + 
"padded": 1364, + "non_padded": 20, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hashes": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "7c220f5613cd8426", + "hash_cont_tokens": "3e9bfc0362e97330" + }, + "truncated": 0, + "non_truncated": 895, + "padded": 3580, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-nutrition|5": { + "hashes": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "35de1609a9a763a9", + "hash_cont_tokens": "23b2dc6ee2da4cfc" + }, + "truncated": 0, + "non_truncated": 306, + "padded": 1224, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-philosophy|5": { + "hashes": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "a1dcfa9c80490d06", + "hash_cont_tokens": "9f6ff69d23a48783" + }, + "truncated": 0, + "non_truncated": 311, + "padded": 1244, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-prehistory|5": { + "hashes": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "a091cf645d2415e0", + "hash_cont_tokens": "d6458d743d875837" + }, + "truncated": 0, + "non_truncated": 324, + "padded": 1296, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_accounting|5": { + "hashes": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "e9df32a33f85290c", + "hash_cont_tokens": "922a195f53a35662" + }, + "truncated": 0, + "non_truncated": 282, + "padded": 1128, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_law|5": { + "hashes": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "c9f7583fff66d361", + "hash_cont_tokens": "2e590029ef41fbcd" + }, + "truncated": 0, + "non_truncated": 1534, + "padded": 6136, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_medicine|5": { + "hashes": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "40a933f829116f8d", + "hash_cont_tokens": "7cfee54dbddd5a98" + }, + "truncated": 0, + "non_truncated": 272, + "padded": 1088, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_psychology|5": { + "hashes": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "0f6a92c3a2062b48", + "hash_cont_tokens": "a86677b2a45c20e1" + }, + "truncated": 0, + "non_truncated": 612, + "padded": 2448, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-public_relations|5": { + "hashes": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "29a08e9bfbe9b2f0", + "hash_cont_tokens": "0d756ccaae031757" + }, + "truncated": 0, + "non_truncated": 110, + "padded": 440, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-security_studies|5": { + 
"hashes": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "32a03f1f22a6e103", + "hash_cont_tokens": "b2229bc2cfbf594b" + }, + "truncated": 0, + "non_truncated": 245, + "padded": 980, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-sociology|5": { + "hashes": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "1de5c52d2b2831d7", + "hash_cont_tokens": "c3a3bdfd177eed5b" + }, + "truncated": 0, + "non_truncated": 201, + "padded": 800, + "non_padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hashes": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "add924961f7f4146", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-virology|5": { + "hashes": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "e0653601c466b1bc", + "hash_cont_tokens": "af8b3658088cb37f" + }, + "truncated": 0, + "non_truncated": 166, + "padded": 664, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-world_religions|5": { + "hashes": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "ac600d612445156d", + "hash_cont_tokens": "060118bef6de4e0a" + }, + "truncated": 0, + "non_truncated": 171, + "padded": 684, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|truthfulqa:mc|0": { + "hashes": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "a03ce28b7fd06aa7", + "hash_cont_tokens": "f5da56a132aab151" + }, + "truncated": 0, + "non_truncated": 817, + "padded": 9996, + "non_padded": 0, + "effective_few_shots": 0.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "72067255e368e24e", + "hash_cont_tokens": "f08975ad6f2d5864" + }, + "truncated": 0, + "non_truncated": 1267, + "padded": 2534, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "bda342e47b5099b2", + "hash_cont_tokens": "df848fc47559c759" + }, + "truncated": 0, + "non_truncated": 1319, + "padded": 0, + "non_padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "3b7fa57a057f9415", + "hash_full_prompts": "63615fc50fc9417c", + "hash_input_tokens": "a8fa53915153e1db", + "hash_cont_tokens": "99f280219fad78e3" + }, + "truncated": 0, + "non_truncated": 28659, + "padded": 113348, + "non_padded": 1524, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/teknium/CollectiveCognition-v1-Mistral-7B/results_2023-10-12T08-39-18.628472.json b/eval-results/teknium/CollectiveCognition-v1-Mistral-7B/results_2023-10-12T08-39-18.628472.json new file mode 100644 index 
0000000000000000000000000000000000000000..d46bc88bb5845eb0aa88e5350fcb79eb4931f640 --- /dev/null +++ b/eval-results/teknium/CollectiveCognition-v1-Mistral-7B/results_2023-10-12T08-39-18.628472.json @@ -0,0 +1,1367 @@ +{ + "config_general": { + "model_name": "teknium/CollectiveCognition-v1-Mistral-7B", + "model_sha": "58777f0563610fa770c4fa252c0350de71d4ab9d", + "model_size": "13.99 GB", + "model_dtype": "torch.bfloat16", + "lighteval_sha": "0f318ecf002208468154899217b3ba7c6ae09374", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|arc:challenge|25": { + "acc": 0.5981228668941979, + "acc_stderr": 0.014327268614578276, + "acc_norm": 0.6237201365187713, + "acc_norm_stderr": 0.014157022555407154 + }, + "harness|hellaswag|10": { + "acc": 0.6682931686914957, + "acc_stderr": 0.004698640688271199, + "acc_norm": 0.855008962358096, + "acc_norm_stderr": 0.003513722251954675 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.27, + "acc_stderr": 0.044619604333847394, + "acc_norm": 0.27, + "acc_norm_stderr": 0.044619604333847394 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.6074074074074074, + "acc_stderr": 0.0421850621536888, + "acc_norm": 0.6074074074074074, + "acc_norm_stderr": 0.0421850621536888 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.6578947368421053, + "acc_stderr": 0.03860731599316092, + "acc_norm": 0.6578947368421053, + "acc_norm_stderr": 0.03860731599316092 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.59, + "acc_stderr": 0.04943110704237102, + "acc_norm": 0.59, + "acc_norm_stderr": 0.04943110704237102 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.6943396226415094, + "acc_stderr": 0.028353298073322663, + "acc_norm": 0.6943396226415094, + "acc_norm_stderr": 0.028353298073322663 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.7361111111111112, + "acc_stderr": 0.03685651095897532, + "acc_norm": 0.7361111111111112, + "acc_norm_stderr": 0.03685651095897532 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.44, + "acc_stderr": 0.04988876515698589, + "acc_norm": 0.44, + "acc_norm_stderr": 0.04988876515698589 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.54, + "acc_stderr": 0.05009082659620333, + "acc_norm": 0.54, + "acc_norm_stderr": 0.05009082659620333 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.35, + "acc_stderr": 0.047937248544110196, + "acc_norm": 0.35, + "acc_norm_stderr": 0.047937248544110196 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.6358381502890174, + "acc_stderr": 0.03669072477416906, + "acc_norm": 0.6358381502890174, + "acc_norm_stderr": 0.03669072477416906 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.35294117647058826, + "acc_stderr": 0.04755129616062946, + "acc_norm": 0.35294117647058826, + "acc_norm_stderr": 0.04755129616062946 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.78, + "acc_stderr": 0.04163331998932261, + "acc_norm": 0.78, + "acc_norm_stderr": 0.04163331998932261 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.5659574468085107, + "acc_stderr": 0.03240038086792747, + "acc_norm": 0.5659574468085107, + "acc_norm_stderr": 0.03240038086792747 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.45614035087719296, + "acc_stderr": 0.046854730419077895, + "acc_norm": 0.45614035087719296, + "acc_norm_stderr": 0.046854730419077895 + }, + 
"harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.5862068965517241, + "acc_stderr": 0.04104269211806232, + "acc_norm": 0.5862068965517241, + "acc_norm_stderr": 0.04104269211806232 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.3835978835978836, + "acc_stderr": 0.025043757318520196, + "acc_norm": 0.3835978835978836, + "acc_norm_stderr": 0.025043757318520196 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.36507936507936506, + "acc_stderr": 0.043062412591271526, + "acc_norm": 0.36507936507936506, + "acc_norm_stderr": 0.043062412591271526 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.37, + "acc_stderr": 0.04852365870939099, + "acc_norm": 0.37, + "acc_norm_stderr": 0.04852365870939099 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.7548387096774194, + "acc_stderr": 0.024472243840895525, + "acc_norm": 0.7548387096774194, + "acc_norm_stderr": 0.024472243840895525 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.5024630541871922, + "acc_stderr": 0.03517945038691063, + "acc_norm": 0.5024630541871922, + "acc_norm_stderr": 0.03517945038691063 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.66, + "acc_stderr": 0.04760952285695237, + "acc_norm": 0.66, + "acc_norm_stderr": 0.04760952285695237 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.7515151515151515, + "acc_stderr": 0.03374402644139402, + "acc_norm": 0.7515151515151515, + "acc_norm_stderr": 0.03374402644139402 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.7929292929292929, + "acc_stderr": 0.02886977846026705, + "acc_norm": 0.7929292929292929, + "acc_norm_stderr": 0.02886977846026705 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.8756476683937824, + "acc_stderr": 0.02381447708659355, + "acc_norm": 0.8756476683937824, + "acc_norm_stderr": 0.02381447708659355 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.6487179487179487, + "acc_stderr": 0.024203665177902803, + "acc_norm": 0.6487179487179487, + "acc_norm_stderr": 0.024203665177902803 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.34444444444444444, + "acc_stderr": 0.02897264888484427, + "acc_norm": 0.34444444444444444, + "acc_norm_stderr": 0.02897264888484427 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.6176470588235294, + "acc_stderr": 0.031566630992154156, + "acc_norm": 0.6176470588235294, + "acc_norm_stderr": 0.031566630992154156 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.36423841059602646, + "acc_stderr": 0.03929111781242742, + "acc_norm": 0.36423841059602646, + "acc_norm_stderr": 0.03929111781242742 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.8091743119266055, + "acc_stderr": 0.016847676400091098, + "acc_norm": 0.8091743119266055, + "acc_norm_stderr": 0.016847676400091098 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.44907407407407407, + "acc_stderr": 0.03392238405321616, + "acc_norm": 0.44907407407407407, + "acc_norm_stderr": 0.03392238405321616 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.7696078431372549, + "acc_stderr": 0.02955429260569507, + "acc_norm": 0.7696078431372549, + "acc_norm_stderr": 0.02955429260569507 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.7890295358649789, + "acc_stderr": 0.02655837250266192, + "acc_norm": 0.7890295358649789, + "acc_norm_stderr": 
0.02655837250266192 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.6636771300448431, + "acc_stderr": 0.031708824268455, + "acc_norm": 0.6636771300448431, + "acc_norm_stderr": 0.031708824268455 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.7480916030534351, + "acc_stderr": 0.03807387116306086, + "acc_norm": 0.7480916030534351, + "acc_norm_stderr": 0.03807387116306086 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.7851239669421488, + "acc_stderr": 0.037494924487096966, + "acc_norm": 0.7851239669421488, + "acc_norm_stderr": 0.037494924487096966 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.7777777777777778, + "acc_stderr": 0.040191074725573483, + "acc_norm": 0.7777777777777778, + "acc_norm_stderr": 0.040191074725573483 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.7607361963190185, + "acc_stderr": 0.0335195387952127, + "acc_norm": 0.7607361963190185, + "acc_norm_stderr": 0.0335195387952127 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.48214285714285715, + "acc_stderr": 0.047427623612430116, + "acc_norm": 0.48214285714285715, + "acc_norm_stderr": 0.047427623612430116 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.8058252427184466, + "acc_stderr": 0.03916667762822585, + "acc_norm": 0.8058252427184466, + "acc_norm_stderr": 0.03916667762822585 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.8717948717948718, + "acc_stderr": 0.02190190511507333, + "acc_norm": 0.8717948717948718, + "acc_norm_stderr": 0.02190190511507333 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.74, + "acc_stderr": 0.04408440022768078, + "acc_norm": 0.74, + "acc_norm_stderr": 0.04408440022768078 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.8135376756066411, + "acc_stderr": 0.013927751372001506, + "acc_norm": 0.8135376756066411, + "acc_norm_stderr": 0.013927751372001506 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.7138728323699421, + "acc_stderr": 0.02433214677913413, + "acc_norm": 0.7138728323699421, + "acc_norm_stderr": 0.02433214677913413 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.26927374301675977, + "acc_stderr": 0.014835616582882611, + "acc_norm": 0.26927374301675977, + "acc_norm_stderr": 0.014835616582882611 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.7352941176470589, + "acc_stderr": 0.025261691219729487, + "acc_norm": 0.7352941176470589, + "acc_norm_stderr": 0.025261691219729487 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.6752411575562701, + "acc_stderr": 0.026596782287697043, + "acc_norm": 0.6752411575562701, + "acc_norm_stderr": 0.026596782287697043 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.7191358024691358, + "acc_stderr": 0.02500646975579921, + "acc_norm": 0.7191358024691358, + "acc_norm_stderr": 0.02500646975579921 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.4858156028368794, + "acc_stderr": 0.02981549448368206, + "acc_norm": 0.4858156028368794, + "acc_norm_stderr": 0.02981549448368206 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.4426336375488918, + "acc_stderr": 0.012685906538206245, + "acc_norm": 0.4426336375488918, + "acc_norm_stderr": 0.012685906538206245 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.6727941176470589, + "acc_stderr": 0.028501452860396553, + "acc_norm": 0.6727941176470589, + "acc_norm_stderr": 0.028501452860396553 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 
0.6584967320261438, + "acc_stderr": 0.019184639328092487, + "acc_norm": 0.6584967320261438, + "acc_norm_stderr": 0.019184639328092487 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.6545454545454545, + "acc_stderr": 0.04554619617541054, + "acc_norm": 0.6545454545454545, + "acc_norm_stderr": 0.04554619617541054 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.726530612244898, + "acc_stderr": 0.028535560337128438, + "acc_norm": 0.726530612244898, + "acc_norm_stderr": 0.028535560337128438 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.8507462686567164, + "acc_stderr": 0.02519692987482706, + "acc_norm": 0.8507462686567164, + "acc_norm_stderr": 0.02519692987482706 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.84, + "acc_stderr": 0.03684529491774709, + "acc_norm": 0.84, + "acc_norm_stderr": 0.03684529491774709 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.5180722891566265, + "acc_stderr": 0.03889951252827216, + "acc_norm": 0.5180722891566265, + "acc_norm_stderr": 0.03889951252827216 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.8538011695906432, + "acc_stderr": 0.02709729011807082, + "acc_norm": 0.8538011695906432, + "acc_norm_stderr": 0.02709729011807082 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.3880048959608323, + "mc1_stderr": 0.017058761501347972, + "mc2": 0.5448189714738055, + "mc2_stderr": 0.01574989708463125 + }, + "all": { + "acc": 0.6278342126485141, + "acc_stderr": 0.03305495735233525, + "acc_norm": 0.6314327391449748, + "acc_norm_stderr": 0.033031988462581215, + "mc1": 0.3880048959608323, + "mc1_stderr": 0.017058761501347972, + "mc2": 0.5448189714738055, + "mc2_stderr": 0.01574989708463125 + } + }, + "versions": { + "harness|arc:challenge|25": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + 
"harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "all": 0 + }, + "config_tasks": { + "harness|arc:challenge": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + 
"harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task" + }, + "summary_tasks": { + "harness|arc:challenge|25": { + "hashes": { + "hash_examples": "17b0cae357c0259e", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "e43adcaa871b1364", + "hash_cont_tokens": "289aa98c400841d8" + }, + "truncated": 0, + "non-truncated": 4687, + "padded": 4684, + "non-padded": 3, + "effective_few_shots": 25.0, + "num_truncated_few_shots": 0 + }, + "harness|hellaswag|10": { + "hashes": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "08da6b3d0798f3e5", + "hash_cont_tokens": "ac460260c3e6efc9" + }, + "truncated": 0, + "non-truncated": 40168, + "padded": 40039, + "non-padded": 129, + "effective_few_shots": 10.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hashes": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "5e2b26eb9b4d08bf", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-anatomy|5": { + "hashes": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "d33cda9df28030eb", + "hash_cont_tokens": "a52a4f60d98cbe5c" + }, + "truncated": 0, + "non-truncated": 540, + "padded": 540, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-astronomy|5": { + "hashes": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "0dd50c500d64c57d", + "hash_cont_tokens": "10f7d8eeba97841d" + }, + "truncated": 0, + "non-truncated": 608, + "padded": 608, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-business_ethics|5": { + "hashes": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "40b524d0df3defc2", + "hash_cont_tokens": "17b868b63507f9a3" + 
}, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hashes": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "1f87d12d677e0dfd", + "hash_cont_tokens": "edef9975ba9165b5" + }, + "truncated": 0, + "non-truncated": 1060, + "padded": 1056, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_biology|5": { + "hashes": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "dd6d69d8b13afbeb", + "hash_cont_tokens": "0aa103ec6602280b" + }, + "truncated": 0, + "non-truncated": 576, + "padded": 572, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_chemistry|5": { + "hashes": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "d45f3c401a00e97e", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_computer_science|5": { + "hashes": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "c04f21d954ae67b2", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_mathematics|5": { + "hashes": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "e7de03b4e1a407d8", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_medicine|5": { + "hashes": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "9ce9516475f0b09c", + "hash_cont_tokens": "1979021dbc698754" + }, + "truncated": 0, + "non-truncated": 692, + "padded": 684, + "non-padded": 8, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_physics|5": { + "hashes": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "f749592a0d6c967d", + "hash_cont_tokens": "7cf7fe2bab00acbd" + }, + "truncated": 0, + "non-truncated": 408, + "padded": 404, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-computer_security|5": { + "hashes": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "1a6dccf2066f3598", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hashes": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "6ce98c8aec8e7514", + "hash_cont_tokens": "903f64eed2b0d217" + }, + "truncated": 0, + "non-truncated": 940, + "padded": 940, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + 
}, + "harness|hendrycksTest-econometrics|5": { + "hashes": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "7794b03bf6b9bb11", + "hash_cont_tokens": "721ae6c5302c4bf2" + }, + "truncated": 0, + "non-truncated": 456, + "padded": 456, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hashes": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "e47ff85e05850517", + "hash_cont_tokens": "15a738960ed3e587" + }, + "truncated": 0, + "non-truncated": 580, + "padded": 580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hashes": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "2ce6901704311790", + "hash_cont_tokens": "c96470462fc71683" + }, + "truncated": 0, + "non-truncated": 1512, + "padded": 1512, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-formal_logic|5": { + "hashes": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "fa49c3faa72a3955", + "hash_cont_tokens": "0e1ce025c9d6ee7e" + }, + "truncated": 0, + "non-truncated": 504, + "padded": 504, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-global_facts|5": { + "hashes": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "38992a391c7040d5", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 396, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_biology|5": { + "hashes": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "4944fad6e0578120", + "hash_cont_tokens": "e34d57f7d3c4ca16" + }, + "truncated": 0, + "non-truncated": 1240, + "padded": 1240, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hashes": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "bec955dfccee0331", + "hash_cont_tokens": "e8482d44df4b3740" + }, + "truncated": 0, + "non-truncated": 812, + "padded": 796, + "non-padded": 16, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hashes": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "2ccfe020e0a8e824", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hashes": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "5e5e8bf3808e0ead", + "hash_cont_tokens": "d63e679a49418339" + }, + "truncated": 0, + "non-truncated": 660, + "padded": 656, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_geography|5": { + "hashes": { + "hash_examples": "b60019b9e80b642f", + 
"hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "6a624d76e1b40f9d", + "hash_cont_tokens": "d78483e286d06f1a" + }, + "truncated": 0, + "non-truncated": 792, + "padded": 792, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hashes": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "8340aed0285230f4", + "hash_cont_tokens": "691cdff71ff5fe57" + }, + "truncated": 0, + "non-truncated": 772, + "padded": 772, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hashes": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "ca47137b1f3a769c", + "hash_cont_tokens": "d5ad4c5bdca967ad" + }, + "truncated": 0, + "non-truncated": 1560, + "padded": 1560, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hashes": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "c9d341ab62890f30", + "hash_cont_tokens": "8f631ca5687dd0d4" + }, + "truncated": 0, + "non-truncated": 1080, + "padded": 1080, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hashes": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "62573d06618ae7df", + "hash_cont_tokens": "7321048a28451473" + }, + "truncated": 0, + "non-truncated": 952, + "padded": 952, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_physics|5": { + "hashes": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "ddddcaae96263221", + "hash_cont_tokens": "bb137581f269861c" + }, + "truncated": 0, + "non-truncated": 604, + "padded": 604, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hashes": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "ef9c1ae343139fdd", + "hash_cont_tokens": "b455cab2675bd863" + }, + "truncated": 0, + "non-truncated": 2180, + "padded": 2161, + "non-padded": 19, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hashes": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "eb4abd87b0e863cc", + "hash_cont_tokens": "1b3196fec7e58037" + }, + "truncated": 0, + "non-truncated": 864, + "padded": 864, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hashes": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "63548c7fa9ba7a78", + "hash_cont_tokens": "a331dedc2aa01b3e" + }, + "truncated": 0, + "non-truncated": 816, + "padded": 816, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hashes": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": 
"83c5da18bfa50812", + "hash_cont_tokens": "d0fbe030b8c8c2bf" + }, + "truncated": 0, + "non-truncated": 948, + "padded": 948, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_aging|5": { + "hashes": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "c93c778cb8c58a32", + "hash_cont_tokens": "1dd29c3755494850" + }, + "truncated": 0, + "non-truncated": 892, + "padded": 892, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_sexuality|5": { + "hashes": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "1daed91f54b42f7d", + "hash_cont_tokens": "c85573f663c10691" + }, + "truncated": 0, + "non-truncated": 524, + "padded": 524, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-international_law|5": { + "hashes": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "cfdae69f75ee8670", + "hash_cont_tokens": "d263804ba918154f" + }, + "truncated": 0, + "non-truncated": 484, + "padded": 484, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-jurisprudence|5": { + "hashes": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "173979adbb5ab44e", + "hash_cont_tokens": "581986691a84ece8" + }, + "truncated": 0, + "non-truncated": 432, + "padded": 432, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hashes": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "7b7d06271aff55ff", + "hash_cont_tokens": "55a858b28bbda458" + }, + "truncated": 0, + "non-truncated": 652, + "padded": 652, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-machine_learning|5": { + "hashes": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "ca062cfd7c7fddcb", + "hash_cont_tokens": "e99d3d3efd4ac7a3" + }, + "truncated": 0, + "non-truncated": 448, + "padded": 445, + "non-padded": 3, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-management|5": { + "hashes": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "fc47171ffb714da3", + "hash_cont_tokens": "13d9dc56bca34726" + }, + "truncated": 0, + "non-truncated": 412, + "padded": 412, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-marketing|5": { + "hashes": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "aa29e9d883670c8f", + "hash_cont_tokens": "2700ea26933916a2" + }, + "truncated": 0, + "non-truncated": 936, + "padded": 936, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-medical_genetics|5": { + "hashes": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "88ad044b653ecaa5", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + 
"num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-miscellaneous|5": { + "hashes": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "f9e7e01573277484", + "hash_cont_tokens": "7bf4341c79587250" + }, + "truncated": 0, + "non-truncated": 3132, + "padded": 3132, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_disputes|5": { + "hashes": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "03728b9e48594c28", + "hash_cont_tokens": "38a48e9de6976f00" + }, + "truncated": 0, + "non-truncated": 1384, + "padded": 1360, + "non-padded": 24, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hashes": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "04a903966514d177", + "hash_cont_tokens": "761c4dc187689d89" + }, + "truncated": 0, + "non-truncated": 3580, + "padded": 3580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-nutrition|5": { + "hashes": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "a2176d3ac6f01cf0", + "hash_cont_tokens": "65005bd7d6f6012a" + }, + "truncated": 0, + "non-truncated": 1224, + "padded": 1224, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-philosophy|5": { + "hashes": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "a96dc872948245a8", + "hash_cont_tokens": "0b47934fb6314dec" + }, + "truncated": 0, + "non-truncated": 1244, + "padded": 1244, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-prehistory|5": { + "hashes": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "e0b03637947e9efa", + "hash_cont_tokens": "3f20acd855ee0a29" + }, + "truncated": 0, + "non-truncated": 1296, + "padded": 1296, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_accounting|5": { + "hashes": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "0b4c6d0e49c47ab4", + "hash_cont_tokens": "8f122ba881355d4b" + }, + "truncated": 0, + "non-truncated": 1128, + "padded": 1128, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_law|5": { + "hashes": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "bcbdbbde22ec73e3", + "hash_cont_tokens": "90d5df417c4d3fd3" + }, + "truncated": 0, + "non-truncated": 6136, + "padded": 6136, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_medicine|5": { + "hashes": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "c54d753563114d45", + "hash_cont_tokens": "4a2d2988884f7f70" + }, + "truncated": 0, + "non-truncated": 1088, + "padded": 1088, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_psychology|5": { + "hashes": { + "hash_examples": "d45b73b22f9cc039", + 
"hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "9e6e34f48034edc0", + "hash_cont_tokens": "e0a952cb8a9c81de" + }, + "truncated": 0, + "non-truncated": 2448, + "padded": 2448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-public_relations|5": { + "hashes": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "634feb3f97d1064d", + "hash_cont_tokens": "1fa77a8dff3922b8" + }, + "truncated": 0, + "non-truncated": 440, + "padded": 440, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-security_studies|5": { + "hashes": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "ca8497342e5b1d57", + "hash_cont_tokens": "81fc9cb3cbdd52db" + }, + "truncated": 0, + "non-truncated": 980, + "padded": 980, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-sociology|5": { + "hashes": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "ae361375c940a0fb", + "hash_cont_tokens": "2a0493252ed2cf43" + }, + "truncated": 0, + "non-truncated": 804, + "padded": 800, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hashes": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "e8bdf33cf82d89f5", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-virology|5": { + "hashes": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "32ce831e0ba2d2e2", + "hash_cont_tokens": "5ab892d003b00c98" + }, + "truncated": 0, + "non-truncated": 664, + "padded": 664, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-world_religions|5": { + "hashes": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "4ed9b68c5694211b", + "hash_cont_tokens": "15a5e5dbdfbb8568" + }, + "truncated": 0, + "non-truncated": 684, + "padded": 684, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|truthfulqa:mc|0": { + "hashes": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "a30fbd9af05d717a", + "hash_cont_tokens": "5a8d4bb398b1c3c0" + }, + "truncated": 0, + "non-truncated": 9996, + "padded": 9996, + "non-padded": 0, + "effective_few_shots": 0.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "d84d18e9a963753d", + "hash_full_prompts": "12b540783521a8e6", + "hash_input_tokens": "3d86ffeb7677bd9d", + "hash_cont_tokens": "35527140510ee91a" + }, + "total_evaluation_time_secondes": "4383.472042798996", + "truncated": 0, + "non-truncated": 111019, + "padded": 110793, + "non-padded": 226, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/teknium/CollectiveCognition-v1-Mistral-7B/results_2023-10-29T01-40-21.634950.json b/eval-results/teknium/CollectiveCognition-v1-Mistral-7B/results_2023-10-29T01-40-21.634950.json new file mode 100644 index 
0000000000000000000000000000000000000000..3b03dfc4e1a8c827981fd1e3583e02d29aa0ddab --- /dev/null +++ b/eval-results/teknium/CollectiveCognition-v1-Mistral-7B/results_2023-10-29T01-40-21.634950.json @@ -0,0 +1,107 @@ +{ + "config_general": { + "model_name": "teknium/CollectiveCognition-v1-Mistral-7B", + "model_sha": "58777f0563610fa770c4fa252c0350de71d4ab9d", + "model_size": "13.99 GB", + "model_dtype": "torch.bfloat16", + "lighteval_sha": "0f318ecf002208468154899217b3ba7c6ae09374", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|drop|3": { + "em": 0.014786073825503355, + "em_stderr": 0.0012360366760473097, + "f1": 0.07218645134228192, + "f1_stderr": 0.0017555798787673934 + }, + "harness|gsm8k|5": { + "acc": 0.17892342683851403, + "acc_stderr": 0.010557661392901294 + }, + "harness|winogrande|5": { + "acc": 0.7758484609313339, + "acc_stderr": 0.011720400740774099 + }, + "all": { + "em": 0.014786073825503355, + "em_stderr": 0.0012360366760473097, + "f1": 0.07218645134228192, + "f1_stderr": 0.0017555798787673934, + "acc": 0.47738594388492395, + "acc_stderr": 0.011139031066837696 + } + }, + "versions": { + "harness|drop|3": 1, + "harness|gsm8k|5": 0, + "harness|winogrande|5": 0, + "all": 0 + }, + "config_tasks": { + "harness|drop": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|drop|3": { + "hashes": { + "hash_examples": "1d27416e8324e9a3", + "hash_full_prompts": "a5513ff9a741b385", + "hash_input_tokens": "a4fb946366902edf", + "hash_cont_tokens": "cdddbbbeed19db0c" + }, + "truncated": 0, + "non-truncated": 9536, + "padded": 0, + "non-padded": 9536, + "effective_few_shots": 3.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "6af0ae8cfe684f50", + "hash_cont_tokens": "441b3da10953eb3d" + }, + "truncated": 0, + "non-truncated": 1319, + "padded": 0, + "non-padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "6bf335f26fed6442", + "hash_cont_tokens": "618558fb93c0f288" + }, + "truncated": 0, + "non-truncated": 2534, + "padded": 2397, + "non-padded": 137, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "9b4d8993161e637d", + "hash_full_prompts": "08215e527b7e60a5", + "hash_input_tokens": "7c6dc027ca91c1a8", + "hash_cont_tokens": "cfdcf5ac5a9495ed" + }, + "total_evaluation_time_secondes": "25887.062338352203", + "truncated": 0, + "non-truncated": 13389, + "padded": 2397, + "non-padded": 10992, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/teknium/CollectiveCognition-v1.1-Mistral-7B/results_2023-10-12T08-33-23.557832.json b/eval-results/teknium/CollectiveCognition-v1.1-Mistral-7B/results_2023-10-12T08-33-23.557832.json new file mode 100644 index 0000000000000000000000000000000000000000..5a68f270791159ec21d005ea9b6217b78c832d49 --- /dev/null +++ b/eval-results/teknium/CollectiveCognition-v1.1-Mistral-7B/results_2023-10-12T08-33-23.557832.json @@ -0,0 +1,1367 @@ +{ + "config_general": { + "model_name": "teknium/CollectiveCognition-v1.1-Mistral-7B", + "model_sha": 
"5f57f70ec99450c70da2540e94dd7fd67be4b23c", + "model_size": "13.99 GB", + "model_dtype": "torch.bfloat16", + "lighteval_sha": "0f318ecf002208468154899217b3ba7c6ae09374", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|arc:challenge|25": { + "acc": 0.5887372013651877, + "acc_stderr": 0.014379441068522084, + "acc_norm": 0.621160409556314, + "acc_norm_stderr": 0.014175915490000328 + }, + "harness|hellaswag|10": { + "acc": 0.6500697072296355, + "acc_stderr": 0.004759729267943188, + "acc_norm": 0.841665006970723, + "acc_norm_stderr": 0.0036430875292137238 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.34, + "acc_stderr": 0.04760952285695236, + "acc_norm": 0.34, + "acc_norm_stderr": 0.04760952285695236 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.6148148148148148, + "acc_stderr": 0.04203921040156279, + "acc_norm": 0.6148148148148148, + "acc_norm_stderr": 0.04203921040156279 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.6381578947368421, + "acc_stderr": 0.03910525752849724, + "acc_norm": 0.6381578947368421, + "acc_norm_stderr": 0.03910525752849724 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.58, + "acc_stderr": 0.049604496374885836, + "acc_norm": 0.58, + "acc_norm_stderr": 0.049604496374885836 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.6830188679245283, + "acc_stderr": 0.02863723563980089, + "acc_norm": 0.6830188679245283, + "acc_norm_stderr": 0.02863723563980089 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.6875, + "acc_stderr": 0.038760854559127644, + "acc_norm": 0.6875, + "acc_norm_stderr": 0.038760854559127644 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.46, + "acc_stderr": 0.05009082659620332, + "acc_norm": 0.46, + "acc_norm_stderr": 0.05009082659620332 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.51, + "acc_stderr": 0.05024183937956911, + "acc_norm": 0.51, + "acc_norm_stderr": 0.05024183937956911 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.37, + "acc_stderr": 0.048523658709391, + "acc_norm": 0.37, + "acc_norm_stderr": 0.048523658709391 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.630057803468208, + "acc_stderr": 0.0368122963339432, + "acc_norm": 0.630057803468208, + "acc_norm_stderr": 0.0368122963339432 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.4117647058823529, + "acc_stderr": 0.048971049527263666, + "acc_norm": 0.4117647058823529, + "acc_norm_stderr": 0.048971049527263666 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.77, + "acc_stderr": 0.04229525846816506, + "acc_norm": 0.77, + "acc_norm_stderr": 0.04229525846816506 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.5659574468085107, + "acc_stderr": 0.03240038086792747, + "acc_norm": 0.5659574468085107, + "acc_norm_stderr": 0.03240038086792747 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.5087719298245614, + "acc_stderr": 0.04702880432049615, + "acc_norm": 0.5087719298245614, + "acc_norm_stderr": 0.04702880432049615 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.593103448275862, + "acc_stderr": 0.04093793981266236, + "acc_norm": 0.593103448275862, + "acc_norm_stderr": 0.04093793981266236 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.36772486772486773, + "acc_stderr": 0.024833839825562413, + "acc_norm": 0.36772486772486773, + 
"acc_norm_stderr": 0.024833839825562413 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.38095238095238093, + "acc_stderr": 0.043435254289490965, + "acc_norm": 0.38095238095238093, + "acc_norm_stderr": 0.043435254289490965 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.36, + "acc_stderr": 0.04824181513244218, + "acc_norm": 0.36, + "acc_norm_stderr": 0.04824181513244218 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.7161290322580646, + "acc_stderr": 0.02564938106302927, + "acc_norm": 0.7161290322580646, + "acc_norm_stderr": 0.02564938106302927 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.5172413793103449, + "acc_stderr": 0.03515895551165698, + "acc_norm": 0.5172413793103449, + "acc_norm_stderr": 0.03515895551165698 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.68, + "acc_stderr": 0.04688261722621504, + "acc_norm": 0.68, + "acc_norm_stderr": 0.04688261722621504 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.7636363636363637, + "acc_stderr": 0.03317505930009181, + "acc_norm": 0.7636363636363637, + "acc_norm_stderr": 0.03317505930009181 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.8131313131313131, + "acc_stderr": 0.027772533334218974, + "acc_norm": 0.8131313131313131, + "acc_norm_stderr": 0.027772533334218974 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.8652849740932642, + "acc_stderr": 0.02463978909770944, + "acc_norm": 0.8652849740932642, + "acc_norm_stderr": 0.02463978909770944 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.6102564102564103, + "acc_stderr": 0.024726967886647078, + "acc_norm": 0.6102564102564103, + "acc_norm_stderr": 0.024726967886647078 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.3296296296296296, + "acc_stderr": 0.02866120111652458, + "acc_norm": 0.3296296296296296, + "acc_norm_stderr": 0.02866120111652458 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.6302521008403361, + "acc_stderr": 0.03135709599613591, + "acc_norm": 0.6302521008403361, + "acc_norm_stderr": 0.03135709599613591 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.2847682119205298, + "acc_stderr": 0.03684881521389023, + "acc_norm": 0.2847682119205298, + "acc_norm_stderr": 0.03684881521389023 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.818348623853211, + "acc_stderr": 0.016530617409266875, + "acc_norm": 0.818348623853211, + "acc_norm_stderr": 0.016530617409266875 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.4398148148148148, + "acc_stderr": 0.03385177976044812, + "acc_norm": 0.4398148148148148, + "acc_norm_stderr": 0.03385177976044812 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.7598039215686274, + "acc_stderr": 0.02998373305591362, + "acc_norm": 0.7598039215686274, + "acc_norm_stderr": 0.02998373305591362 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.7721518987341772, + "acc_stderr": 0.027303484599069432, + "acc_norm": 0.7721518987341772, + "acc_norm_stderr": 0.027303484599069432 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.6681614349775785, + "acc_stderr": 0.03160295143776679, + "acc_norm": 0.6681614349775785, + "acc_norm_stderr": 0.03160295143776679 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.7480916030534351, + "acc_stderr": 0.03807387116306086, + "acc_norm": 0.7480916030534351, + 
"acc_norm_stderr": 0.03807387116306086 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.7933884297520661, + "acc_stderr": 0.03695980128098824, + "acc_norm": 0.7933884297520661, + "acc_norm_stderr": 0.03695980128098824 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.7407407407407407, + "acc_stderr": 0.042365112580946315, + "acc_norm": 0.7407407407407407, + "acc_norm_stderr": 0.042365112580946315 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.7300613496932515, + "acc_stderr": 0.034878251684978906, + "acc_norm": 0.7300613496932515, + "acc_norm_stderr": 0.034878251684978906 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.5, + "acc_stderr": 0.04745789978762494, + "acc_norm": 0.5, + "acc_norm_stderr": 0.04745789978762494 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.7766990291262136, + "acc_stderr": 0.04123553189891431, + "acc_norm": 0.7766990291262136, + "acc_norm_stderr": 0.04123553189891431 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.8333333333333334, + "acc_stderr": 0.024414947304543674, + "acc_norm": 0.8333333333333334, + "acc_norm_stderr": 0.024414947304543674 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.7, + "acc_stderr": 0.046056618647183814, + "acc_norm": 0.7, + "acc_norm_stderr": 0.046056618647183814 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.7943805874840357, + "acc_stderr": 0.01445250045678583, + "acc_norm": 0.7943805874840357, + "acc_norm_stderr": 0.01445250045678583 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.7023121387283237, + "acc_stderr": 0.024617055388677003, + "acc_norm": 0.7023121387283237, + "acc_norm_stderr": 0.024617055388677003 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.3229050279329609, + "acc_stderr": 0.015638440380241484, + "acc_norm": 0.3229050279329609, + "acc_norm_stderr": 0.015638440380241484 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.7287581699346405, + "acc_stderr": 0.025457756696667878, + "acc_norm": 0.7287581699346405, + "acc_norm_stderr": 0.025457756696667878 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.6881028938906752, + "acc_stderr": 0.026311858071854155, + "acc_norm": 0.6881028938906752, + "acc_norm_stderr": 0.026311858071854155 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.7067901234567902, + "acc_stderr": 0.025329888171900922, + "acc_norm": 0.7067901234567902, + "acc_norm_stderr": 0.025329888171900922 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.4858156028368794, + "acc_stderr": 0.02981549448368206, + "acc_norm": 0.4858156028368794, + "acc_norm_stderr": 0.02981549448368206 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.4576271186440678, + "acc_stderr": 0.012724296550980188, + "acc_norm": 0.4576271186440678, + "acc_norm_stderr": 0.012724296550980188 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.6397058823529411, + "acc_stderr": 0.029163128570670733, + "acc_norm": 0.6397058823529411, + "acc_norm_stderr": 0.029163128570670733 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.6470588235294118, + "acc_stderr": 0.019333142020797164, + "acc_norm": 0.6470588235294118, + "acc_norm_stderr": 0.019333142020797164 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.6727272727272727, + "acc_stderr": 0.0449429086625209, + "acc_norm": 0.6727272727272727, + "acc_norm_stderr": 0.0449429086625209 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.7183673469387755, 
+ "acc_stderr": 0.028795185574291282, + "acc_norm": 0.7183673469387755, + "acc_norm_stderr": 0.028795185574291282 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.835820895522388, + "acc_stderr": 0.026193923544454115, + "acc_norm": 0.835820895522388, + "acc_norm_stderr": 0.026193923544454115 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.84, + "acc_stderr": 0.03684529491774709, + "acc_norm": 0.84, + "acc_norm_stderr": 0.03684529491774709 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.5, + "acc_stderr": 0.03892494720807614, + "acc_norm": 0.5, + "acc_norm_stderr": 0.03892494720807614 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.8362573099415205, + "acc_stderr": 0.028380919596145866, + "acc_norm": 0.8362573099415205, + "acc_norm_stderr": 0.028380919596145866 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.408812729498164, + "mc1_stderr": 0.017209952151641728, + "mc2": 0.5762413893058237, + "mc2_stderr": 0.015711876502347427 + }, + "all": { + "acc": 0.6233590653839346, + "acc_stderr": 0.0334105502985208, + "acc_norm": 0.6271559892471925, + "acc_norm_stderr": 0.033388174581279254, + "mc1": 0.408812729498164, + "mc1_stderr": 0.017209952151641728, + "mc2": 0.5762413893058237, + "mc2_stderr": 0.015711876502347427 + } + }, + "versions": { + "harness|arc:challenge|25": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 
1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "all": 0 + }, + "config_tasks": { + "harness|arc:challenge": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM 
Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task" + }, + "summary_tasks": { + "harness|arc:challenge|25": { + "hashes": { + "hash_examples": "17b0cae357c0259e", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "e43adcaa871b1364", + "hash_cont_tokens": "289aa98c400841d8" + }, + "truncated": 0, + "non-truncated": 4687, + "padded": 4684, + "non-padded": 3, + "effective_few_shots": 25.0, + "num_truncated_few_shots": 0 + }, + "harness|hellaswag|10": { + "hashes": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "08da6b3d0798f3e5", + "hash_cont_tokens": "ac460260c3e6efc9" + }, + "truncated": 0, + "non-truncated": 40168, + "padded": 40039, + "non-padded": 129, + "effective_few_shots": 10.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hashes": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "5e2b26eb9b4d08bf", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-anatomy|5": { + "hashes": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "d33cda9df28030eb", + "hash_cont_tokens": "a52a4f60d98cbe5c" + }, + "truncated": 0, + "non-truncated": 540, + "padded": 540, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-astronomy|5": { + "hashes": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "0dd50c500d64c57d", + "hash_cont_tokens": "10f7d8eeba97841d" + }, + "truncated": 0, + "non-truncated": 608, + "padded": 608, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-business_ethics|5": { + "hashes": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "40b524d0df3defc2", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hashes": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "1f87d12d677e0dfd", + "hash_cont_tokens": "edef9975ba9165b5" + }, + "truncated": 0, + "non-truncated": 1060, + "padded": 1056, + "non-padded": 4, + 
"effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_biology|5": { + "hashes": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "dd6d69d8b13afbeb", + "hash_cont_tokens": "0aa103ec6602280b" + }, + "truncated": 0, + "non-truncated": 576, + "padded": 572, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_chemistry|5": { + "hashes": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "d45f3c401a00e97e", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_computer_science|5": { + "hashes": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "c04f21d954ae67b2", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_mathematics|5": { + "hashes": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "e7de03b4e1a407d8", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_medicine|5": { + "hashes": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "9ce9516475f0b09c", + "hash_cont_tokens": "1979021dbc698754" + }, + "truncated": 0, + "non-truncated": 692, + "padded": 684, + "non-padded": 8, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_physics|5": { + "hashes": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "f749592a0d6c967d", + "hash_cont_tokens": "7cf7fe2bab00acbd" + }, + "truncated": 0, + "non-truncated": 408, + "padded": 404, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-computer_security|5": { + "hashes": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "1a6dccf2066f3598", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hashes": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "6ce98c8aec8e7514", + "hash_cont_tokens": "903f64eed2b0d217" + }, + "truncated": 0, + "non-truncated": 940, + "padded": 940, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-econometrics|5": { + "hashes": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "7794b03bf6b9bb11", + "hash_cont_tokens": "721ae6c5302c4bf2" + }, + "truncated": 0, + "non-truncated": 456, + "padded": 456, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hashes": { + "hash_examples": 
"e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "e47ff85e05850517", + "hash_cont_tokens": "15a738960ed3e587" + }, + "truncated": 0, + "non-truncated": 580, + "padded": 580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hashes": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "2ce6901704311790", + "hash_cont_tokens": "c96470462fc71683" + }, + "truncated": 0, + "non-truncated": 1512, + "padded": 1512, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-formal_logic|5": { + "hashes": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "fa49c3faa72a3955", + "hash_cont_tokens": "0e1ce025c9d6ee7e" + }, + "truncated": 0, + "non-truncated": 504, + "padded": 504, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-global_facts|5": { + "hashes": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "38992a391c7040d5", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 396, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_biology|5": { + "hashes": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "4944fad6e0578120", + "hash_cont_tokens": "e34d57f7d3c4ca16" + }, + "truncated": 0, + "non-truncated": 1240, + "padded": 1240, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hashes": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "bec955dfccee0331", + "hash_cont_tokens": "e8482d44df4b3740" + }, + "truncated": 0, + "non-truncated": 812, + "padded": 796, + "non-padded": 16, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hashes": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "2ccfe020e0a8e824", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hashes": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "5e5e8bf3808e0ead", + "hash_cont_tokens": "d63e679a49418339" + }, + "truncated": 0, + "non-truncated": 660, + "padded": 656, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_geography|5": { + "hashes": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "6a624d76e1b40f9d", + "hash_cont_tokens": "d78483e286d06f1a" + }, + "truncated": 0, + "non-truncated": 792, + "padded": 792, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hashes": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": 
"8340aed0285230f4", + "hash_cont_tokens": "691cdff71ff5fe57" + }, + "truncated": 0, + "non-truncated": 772, + "padded": 772, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hashes": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "ca47137b1f3a769c", + "hash_cont_tokens": "d5ad4c5bdca967ad" + }, + "truncated": 0, + "non-truncated": 1560, + "padded": 1560, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hashes": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "c9d341ab62890f30", + "hash_cont_tokens": "8f631ca5687dd0d4" + }, + "truncated": 0, + "non-truncated": 1080, + "padded": 1080, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hashes": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "62573d06618ae7df", + "hash_cont_tokens": "7321048a28451473" + }, + "truncated": 0, + "non-truncated": 952, + "padded": 952, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_physics|5": { + "hashes": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "ddddcaae96263221", + "hash_cont_tokens": "bb137581f269861c" + }, + "truncated": 0, + "non-truncated": 604, + "padded": 604, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hashes": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "ef9c1ae343139fdd", + "hash_cont_tokens": "b455cab2675bd863" + }, + "truncated": 0, + "non-truncated": 2180, + "padded": 2161, + "non-padded": 19, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hashes": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "eb4abd87b0e863cc", + "hash_cont_tokens": "1b3196fec7e58037" + }, + "truncated": 0, + "non-truncated": 864, + "padded": 864, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hashes": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "63548c7fa9ba7a78", + "hash_cont_tokens": "a331dedc2aa01b3e" + }, + "truncated": 0, + "non-truncated": 816, + "padded": 816, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hashes": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "83c5da18bfa50812", + "hash_cont_tokens": "d0fbe030b8c8c2bf" + }, + "truncated": 0, + "non-truncated": 948, + "padded": 948, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_aging|5": { + "hashes": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "c93c778cb8c58a32", + "hash_cont_tokens": "1dd29c3755494850" + }, + "truncated": 0, + "non-truncated": 
892, + "padded": 892, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_sexuality|5": { + "hashes": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "1daed91f54b42f7d", + "hash_cont_tokens": "c85573f663c10691" + }, + "truncated": 0, + "non-truncated": 524, + "padded": 524, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-international_law|5": { + "hashes": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "cfdae69f75ee8670", + "hash_cont_tokens": "d263804ba918154f" + }, + "truncated": 0, + "non-truncated": 484, + "padded": 484, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-jurisprudence|5": { + "hashes": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "173979adbb5ab44e", + "hash_cont_tokens": "581986691a84ece8" + }, + "truncated": 0, + "non-truncated": 432, + "padded": 432, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hashes": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "7b7d06271aff55ff", + "hash_cont_tokens": "55a858b28bbda458" + }, + "truncated": 0, + "non-truncated": 652, + "padded": 652, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-machine_learning|5": { + "hashes": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "ca062cfd7c7fddcb", + "hash_cont_tokens": "e99d3d3efd4ac7a3" + }, + "truncated": 0, + "non-truncated": 448, + "padded": 445, + "non-padded": 3, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-management|5": { + "hashes": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "fc47171ffb714da3", + "hash_cont_tokens": "13d9dc56bca34726" + }, + "truncated": 0, + "non-truncated": 412, + "padded": 412, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-marketing|5": { + "hashes": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "aa29e9d883670c8f", + "hash_cont_tokens": "2700ea26933916a2" + }, + "truncated": 0, + "non-truncated": 936, + "padded": 936, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-medical_genetics|5": { + "hashes": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "88ad044b653ecaa5", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-miscellaneous|5": { + "hashes": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "f9e7e01573277484", + "hash_cont_tokens": "7bf4341c79587250" + }, + "truncated": 0, + "non-truncated": 3132, + "padded": 3132, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_disputes|5": { + "hashes": { + 
"hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "03728b9e48594c28", + "hash_cont_tokens": "38a48e9de6976f00" + }, + "truncated": 0, + "non-truncated": 1384, + "padded": 1360, + "non-padded": 24, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hashes": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "04a903966514d177", + "hash_cont_tokens": "761c4dc187689d89" + }, + "truncated": 0, + "non-truncated": 3580, + "padded": 3580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-nutrition|5": { + "hashes": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "a2176d3ac6f01cf0", + "hash_cont_tokens": "65005bd7d6f6012a" + }, + "truncated": 0, + "non-truncated": 1224, + "padded": 1224, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-philosophy|5": { + "hashes": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "a96dc872948245a8", + "hash_cont_tokens": "0b47934fb6314dec" + }, + "truncated": 0, + "non-truncated": 1244, + "padded": 1244, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-prehistory|5": { + "hashes": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "e0b03637947e9efa", + "hash_cont_tokens": "3f20acd855ee0a29" + }, + "truncated": 0, + "non-truncated": 1296, + "padded": 1296, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_accounting|5": { + "hashes": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "0b4c6d0e49c47ab4", + "hash_cont_tokens": "8f122ba881355d4b" + }, + "truncated": 0, + "non-truncated": 1128, + "padded": 1128, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_law|5": { + "hashes": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "bcbdbbde22ec73e3", + "hash_cont_tokens": "90d5df417c4d3fd3" + }, + "truncated": 0, + "non-truncated": 6136, + "padded": 6136, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_medicine|5": { + "hashes": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "c54d753563114d45", + "hash_cont_tokens": "4a2d2988884f7f70" + }, + "truncated": 0, + "non-truncated": 1088, + "padded": 1088, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_psychology|5": { + "hashes": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "9e6e34f48034edc0", + "hash_cont_tokens": "e0a952cb8a9c81de" + }, + "truncated": 0, + "non-truncated": 2448, + "padded": 2448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-public_relations|5": { + "hashes": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "634feb3f97d1064d", + 
"hash_cont_tokens": "1fa77a8dff3922b8" + }, + "truncated": 0, + "non-truncated": 440, + "padded": 440, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-security_studies|5": { + "hashes": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "ca8497342e5b1d57", + "hash_cont_tokens": "81fc9cb3cbdd52db" + }, + "truncated": 0, + "non-truncated": 980, + "padded": 980, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-sociology|5": { + "hashes": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "ae361375c940a0fb", + "hash_cont_tokens": "2a0493252ed2cf43" + }, + "truncated": 0, + "non-truncated": 804, + "padded": 800, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hashes": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "e8bdf33cf82d89f5", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-virology|5": { + "hashes": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "32ce831e0ba2d2e2", + "hash_cont_tokens": "5ab892d003b00c98" + }, + "truncated": 0, + "non-truncated": 664, + "padded": 664, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-world_religions|5": { + "hashes": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "4ed9b68c5694211b", + "hash_cont_tokens": "15a5e5dbdfbb8568" + }, + "truncated": 0, + "non-truncated": 684, + "padded": 684, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|truthfulqa:mc|0": { + "hashes": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "a30fbd9af05d717a", + "hash_cont_tokens": "5a8d4bb398b1c3c0" + }, + "truncated": 0, + "non-truncated": 9996, + "padded": 9996, + "non-padded": 0, + "effective_few_shots": 0.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "d84d18e9a963753d", + "hash_full_prompts": "12b540783521a8e6", + "hash_input_tokens": "3d86ffeb7677bd9d", + "hash_cont_tokens": "35527140510ee91a" + }, + "total_evaluation_time_secondes": "4393.418792009354", + "truncated": 0, + "non-truncated": 111019, + "padded": 110793, + "non-padded": 226, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/teknium/CollectiveCognition-v1.1-Mistral-7B/results_2023-10-24T18-24-08.168024.json b/eval-results/teknium/CollectiveCognition-v1.1-Mistral-7B/results_2023-10-24T18-24-08.168024.json new file mode 100644 index 0000000000000000000000000000000000000000..38a303bb09db77c105e843e901c401c25f969b4d --- /dev/null +++ b/eval-results/teknium/CollectiveCognition-v1.1-Mistral-7B/results_2023-10-24T18-24-08.168024.json @@ -0,0 +1,107 @@ +{ + "config_general": { + "model_name": "teknium/CollectiveCognition-v1.1-Mistral-7B", + "model_sha": "5f57f70ec99450c70da2540e94dd7fd67be4b23c", + "model_size": "13.99 GB", + "model_dtype": "torch.bfloat16", + "lighteval_sha": 
"0f318ecf002208468154899217b3ba7c6ae09374", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|drop|3": { + "em": 0.14481963087248323, + "em_stderr": 0.003603978827087507, + "f1": 0.19846161912751598, + "f1_stderr": 0.0036570269650408635 + }, + "harness|gsm8k|5": { + "acc": 0.1561789234268385, + "acc_stderr": 0.00999950936975745 + }, + "harness|winogrande|5": { + "acc": 0.7537490134175217, + "acc_stderr": 0.012108365307437523 + }, + "all": { + "em": 0.14481963087248323, + "em_stderr": 0.003603978827087507, + "f1": 0.19846161912751598, + "f1_stderr": 0.0036570269650408635, + "acc": 0.45496396842218007, + "acc_stderr": 0.011053937338597487 + } + }, + "versions": { + "harness|drop|3": 1, + "harness|gsm8k|5": 0, + "harness|winogrande|5": 0, + "all": 0 + }, + "config_tasks": { + "harness|drop": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|drop|3": { + "hashes": { + "hash_examples": "1d27416e8324e9a3", + "hash_full_prompts": "a5513ff9a741b385", + "hash_input_tokens": "a4fb946366902edf", + "hash_cont_tokens": "367ff4874e8edef6" + }, + "truncated": 0, + "non-truncated": 9536, + "padded": 0, + "non-padded": 9536, + "effective_few_shots": 3.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "6af0ae8cfe684f50", + "hash_cont_tokens": "914819a44c2472a8" + }, + "truncated": 0, + "non-truncated": 1319, + "padded": 0, + "non-padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "6bf335f26fed6442", + "hash_cont_tokens": "618558fb93c0f288" + }, + "truncated": 0, + "non-truncated": 2534, + "padded": 2397, + "non-padded": 137, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "9b4d8993161e637d", + "hash_full_prompts": "08215e527b7e60a5", + "hash_input_tokens": "7c6dc027ca91c1a8", + "hash_cont_tokens": "a1db66e7e7e7abad" + }, + "total_evaluation_time_secondes": "23173.049478292465", + "truncated": 0, + "non-truncated": 13389, + "padded": 2397, + "non-padded": 10992, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/teknium/CollectiveCognition-v1.1-Mistral-7B/results_2023-11-08T13-48-47.550072.json b/eval-results/teknium/CollectiveCognition-v1.1-Mistral-7B/results_2023-11-08T13-48-47.550072.json new file mode 100644 index 0000000000000000000000000000000000000000..d2d083057ae5a371b78c54fc1aeb2020e9383133 --- /dev/null +++ b/eval-results/teknium/CollectiveCognition-v1.1-Mistral-7B/results_2023-11-08T13-48-47.550072.json @@ -0,0 +1,1367 @@ +{ + "config_general": { + "lighteval_sha": "167773f1d5d1647c60dadc31c9e731ab7dbcbbad", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "", + "model_name": "teknium/CollectiveCognition-v1.1-Mistral-7B", + "model_sha": "5f57f70ec99450c70da2540e94dd7fd67be4b23c", + "model_dtype": "torch.float16", + "model_size": "13.99 GB" + }, + "results": { + "harness|arc:challenge|25": { + "acc": 0.5895904436860068, + "acc_stderr": 0.014374922192642666, + "acc_norm": 0.6254266211604096, + "acc_norm_stderr": 
0.014144193471893452 + }, + "harness|hellaswag|10": { + "acc": 0.6495717984465246, + "acc_stderr": 0.004761289867046067, + "acc_norm": 0.8412666799442342, + "acc_norm_stderr": 0.0036468038997703447 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.32, + "acc_stderr": 0.046882617226215034, + "acc_norm": 0.32, + "acc_norm_stderr": 0.046882617226215034 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.6074074074074074, + "acc_stderr": 0.04218506215368879, + "acc_norm": 0.6074074074074074, + "acc_norm_stderr": 0.04218506215368879 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.6381578947368421, + "acc_stderr": 0.03910525752849724, + "acc_norm": 0.6381578947368421, + "acc_norm_stderr": 0.03910525752849724 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.57, + "acc_stderr": 0.04975698519562428, + "acc_norm": 0.57, + "acc_norm_stderr": 0.04975698519562428 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.6754716981132075, + "acc_stderr": 0.028815615713432115, + "acc_norm": 0.6754716981132075, + "acc_norm_stderr": 0.028815615713432115 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.7013888888888888, + "acc_stderr": 0.03827052357950756, + "acc_norm": 0.7013888888888888, + "acc_norm_stderr": 0.03827052357950756 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.46, + "acc_stderr": 0.05009082659620332, + "acc_norm": 0.46, + "acc_norm_stderr": 0.05009082659620332 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.53, + "acc_stderr": 0.05016135580465919, + "acc_norm": 0.53, + "acc_norm_stderr": 0.05016135580465919 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.38, + "acc_stderr": 0.04878317312145634, + "acc_norm": 0.38, + "acc_norm_stderr": 0.04878317312145634 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.6473988439306358, + "acc_stderr": 0.036430371689585475, + "acc_norm": 0.6473988439306358, + "acc_norm_stderr": 0.036430371689585475 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.43137254901960786, + "acc_stderr": 0.04928099597287534, + "acc_norm": 0.43137254901960786, + "acc_norm_stderr": 0.04928099597287534 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.77, + "acc_stderr": 0.04229525846816506, + "acc_norm": 0.77, + "acc_norm_stderr": 0.04229525846816506 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.5659574468085107, + "acc_stderr": 0.03240038086792747, + "acc_norm": 0.5659574468085107, + "acc_norm_stderr": 0.03240038086792747 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.5263157894736842, + "acc_stderr": 0.046970851366478626, + "acc_norm": 0.5263157894736842, + "acc_norm_stderr": 0.046970851366478626 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.5862068965517241, + "acc_stderr": 0.04104269211806232, + "acc_norm": 0.5862068965517241, + "acc_norm_stderr": 0.04104269211806232 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.373015873015873, + "acc_stderr": 0.02490699045899257, + "acc_norm": 0.373015873015873, + "acc_norm_stderr": 0.02490699045899257 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.3968253968253968, + "acc_stderr": 0.04375888492727061, + "acc_norm": 0.3968253968253968, + "acc_norm_stderr": 0.04375888492727061 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.36, + "acc_stderr": 0.04824181513244218, + "acc_norm": 0.36, + "acc_norm_stderr": 0.04824181513244218 + }, + "harness|hendrycksTest-high_school_biology|5": 
{ + "acc": 0.7161290322580646, + "acc_stderr": 0.02564938106302927, + "acc_norm": 0.7161290322580646, + "acc_norm_stderr": 0.02564938106302927 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.5172413793103449, + "acc_stderr": 0.035158955511656986, + "acc_norm": 0.5172413793103449, + "acc_norm_stderr": 0.035158955511656986 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.68, + "acc_stderr": 0.04688261722621504, + "acc_norm": 0.68, + "acc_norm_stderr": 0.04688261722621504 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.7696969696969697, + "acc_stderr": 0.0328766675860349, + "acc_norm": 0.7696969696969697, + "acc_norm_stderr": 0.0328766675860349 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.8131313131313131, + "acc_stderr": 0.027772533334218974, + "acc_norm": 0.8131313131313131, + "acc_norm_stderr": 0.027772533334218974 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.8704663212435233, + "acc_stderr": 0.024233532297758733, + "acc_norm": 0.8704663212435233, + "acc_norm_stderr": 0.024233532297758733 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.6128205128205129, + "acc_stderr": 0.024697216930878934, + "acc_norm": 0.6128205128205129, + "acc_norm_stderr": 0.024697216930878934 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.3296296296296296, + "acc_stderr": 0.02866120111652458, + "acc_norm": 0.3296296296296296, + "acc_norm_stderr": 0.02866120111652458 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.634453781512605, + "acc_stderr": 0.031282177063684614, + "acc_norm": 0.634453781512605, + "acc_norm_stderr": 0.031282177063684614 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.2847682119205298, + "acc_stderr": 0.03684881521389023, + "acc_norm": 0.2847682119205298, + "acc_norm_stderr": 0.03684881521389023 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.8201834862385321, + "acc_stderr": 0.016465345467391552, + "acc_norm": 0.8201834862385321, + "acc_norm_stderr": 0.016465345467391552 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.46296296296296297, + "acc_stderr": 0.03400603625538271, + "acc_norm": 0.46296296296296297, + "acc_norm_stderr": 0.03400603625538271 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.7647058823529411, + "acc_stderr": 0.02977177522814562, + "acc_norm": 0.7647058823529411, + "acc_norm_stderr": 0.02977177522814562 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.7721518987341772, + "acc_stderr": 0.027303484599069432, + "acc_norm": 0.7721518987341772, + "acc_norm_stderr": 0.027303484599069432 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.6591928251121076, + "acc_stderr": 0.03181149747055359, + "acc_norm": 0.6591928251121076, + "acc_norm_stderr": 0.03181149747055359 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.7480916030534351, + "acc_stderr": 0.03807387116306086, + "acc_norm": 0.7480916030534351, + "acc_norm_stderr": 0.03807387116306086 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.7933884297520661, + "acc_stderr": 0.03695980128098824, + "acc_norm": 0.7933884297520661, + "acc_norm_stderr": 0.03695980128098824 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.7407407407407407, + "acc_stderr": 0.042365112580946315, + "acc_norm": 0.7407407407407407, + "acc_norm_stderr": 0.042365112580946315 + }, + 
"harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.7361963190184049, + "acc_stderr": 0.03462419931615624, + "acc_norm": 0.7361963190184049, + "acc_norm_stderr": 0.03462419931615624 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.5089285714285714, + "acc_stderr": 0.04745033255489123, + "acc_norm": 0.5089285714285714, + "acc_norm_stderr": 0.04745033255489123 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.7766990291262136, + "acc_stderr": 0.04123553189891431, + "acc_norm": 0.7766990291262136, + "acc_norm_stderr": 0.04123553189891431 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.8376068376068376, + "acc_stderr": 0.02416161812798774, + "acc_norm": 0.8376068376068376, + "acc_norm_stderr": 0.02416161812798774 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.71, + "acc_stderr": 0.045604802157206845, + "acc_norm": 0.71, + "acc_norm_stderr": 0.045604802157206845 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.7956577266922095, + "acc_stderr": 0.014419123980931894, + "acc_norm": 0.7956577266922095, + "acc_norm_stderr": 0.014419123980931894 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.6994219653179191, + "acc_stderr": 0.0246853168672578, + "acc_norm": 0.6994219653179191, + "acc_norm_stderr": 0.0246853168672578 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.3329608938547486, + "acc_stderr": 0.015761716178397563, + "acc_norm": 0.3329608938547486, + "acc_norm_stderr": 0.015761716178397563 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.7287581699346405, + "acc_stderr": 0.025457756696667878, + "acc_norm": 0.7287581699346405, + "acc_norm_stderr": 0.025457756696667878 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.684887459807074, + "acc_stderr": 0.026385273703464482, + "acc_norm": 0.684887459807074, + "acc_norm_stderr": 0.026385273703464482 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.7006172839506173, + "acc_stderr": 0.02548311560119545, + "acc_norm": 0.7006172839506173, + "acc_norm_stderr": 0.02548311560119545 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.48226950354609927, + "acc_stderr": 0.02980873964223777, + "acc_norm": 0.48226950354609927, + "acc_norm_stderr": 0.02980873964223777 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.4589308996088657, + "acc_stderr": 0.012727084826799804, + "acc_norm": 0.4589308996088657, + "acc_norm_stderr": 0.012727084826799804 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.6360294117647058, + "acc_stderr": 0.02922719246003203, + "acc_norm": 0.6360294117647058, + "acc_norm_stderr": 0.02922719246003203 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.6454248366013072, + "acc_stderr": 0.0193533605475537, + "acc_norm": 0.6454248366013072, + "acc_norm_stderr": 0.0193533605475537 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.6909090909090909, + "acc_stderr": 0.044262946482000985, + "acc_norm": 0.6909090909090909, + "acc_norm_stderr": 0.044262946482000985 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.7224489795918367, + "acc_stderr": 0.02866685779027465, + "acc_norm": 0.7224489795918367, + "acc_norm_stderr": 0.02866685779027465 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.8258706467661692, + "acc_stderr": 0.026814951200421603, + "acc_norm": 0.8258706467661692, + "acc_norm_stderr": 0.026814951200421603 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.84, + "acc_stderr": 0.03684529491774709, + 
"acc_norm": 0.84, + "acc_norm_stderr": 0.03684529491774709 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.5, + "acc_stderr": 0.03892494720807614, + "acc_norm": 0.5, + "acc_norm_stderr": 0.03892494720807614 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.8245614035087719, + "acc_stderr": 0.029170885500727665, + "acc_norm": 0.8245614035087719, + "acc_norm_stderr": 0.029170885500727665 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.40514075887392903, + "mc1_stderr": 0.01718561172775337, + "mc2": 0.5761479349192792, + "mc2_stderr": 0.015694985778154737 + }, + "all": { + "acc": 0.6255358463798112, + "acc_stderr": 0.03339672778015502, + "acc_norm": 0.6293923050064572, + "acc_norm_stderr": 0.03337392753120544, + "mc1": 0.40514075887392903, + "mc1_stderr": 0.01718561172775337, + "mc2": 0.5761479349192792, + "mc2_stderr": 0.015694985778154737 + } + }, + "versions": { + "all": 0, + "harness|arc:challenge|25": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + 
"harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1 + }, + "config_tasks": { + "harness|arc:challenge": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + 
"harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task" + }, + "summary_tasks": { + "harness|arc:challenge|25": { + "hashes": { + "hash_examples": "17b0cae357c0259e", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "e43adcaa871b1364", + "hash_cont_tokens": "289aa98c400841d8" + }, + "truncated": 0, + "non_truncated": 1172, + "padded": 4684, + "non_padded": 3, + "effective_few_shots": 25.0, + "num_truncated_few_shots": 0 + }, + "harness|hellaswag|10": { + "hashes": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "08da6b3d0798f3e5", + "hash_cont_tokens": "ac460260c3e6efc9" + }, + "truncated": 0, + "non_truncated": 10042, + "padded": 40039, + "non_padded": 129, + "effective_few_shots": 10.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hashes": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "5e2b26eb9b4d08bf", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-anatomy|5": { + "hashes": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "d33cda9df28030eb", + "hash_cont_tokens": "a52a4f60d98cbe5c" + }, + "truncated": 0, + "non_truncated": 135, + "padded": 540, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-astronomy|5": { + "hashes": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "0dd50c500d64c57d", + "hash_cont_tokens": "10f7d8eeba97841d" + }, + "truncated": 0, + "non_truncated": 152, + "padded": 608, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-business_ethics|5": { + "hashes": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "40b524d0df3defc2", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hashes": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "1f87d12d677e0dfd", + "hash_cont_tokens": "edef9975ba9165b5" + }, + "truncated": 0, + "non_truncated": 265, + "padded": 1056, + "non_padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_biology|5": { + "hashes": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "dd6d69d8b13afbeb", + "hash_cont_tokens": "0aa103ec6602280b" + }, + "truncated": 0, + "non_truncated": 144, + "padded": 572, + "non_padded": 4, + 
"effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_chemistry|5": { + "hashes": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "d45f3c401a00e97e", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_computer_science|5": { + "hashes": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "c04f21d954ae67b2", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_mathematics|5": { + "hashes": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "e7de03b4e1a407d8", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_medicine|5": { + "hashes": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "9ce9516475f0b09c", + "hash_cont_tokens": "1979021dbc698754" + }, + "truncated": 0, + "non_truncated": 173, + "padded": 684, + "non_padded": 8, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_physics|5": { + "hashes": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "f749592a0d6c967d", + "hash_cont_tokens": "7cf7fe2bab00acbd" + }, + "truncated": 0, + "non_truncated": 102, + "padded": 404, + "non_padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-computer_security|5": { + "hashes": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "1a6dccf2066f3598", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hashes": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "6ce98c8aec8e7514", + "hash_cont_tokens": "903f64eed2b0d217" + }, + "truncated": 0, + "non_truncated": 235, + "padded": 940, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-econometrics|5": { + "hashes": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "7794b03bf6b9bb11", + "hash_cont_tokens": "721ae6c5302c4bf2" + }, + "truncated": 0, + "non_truncated": 114, + "padded": 456, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hashes": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "e47ff85e05850517", + "hash_cont_tokens": "15a738960ed3e587" + }, + "truncated": 0, + "non_truncated": 145, + "padded": 580, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hashes": { + 
"hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "2ce6901704311790", + "hash_cont_tokens": "c96470462fc71683" + }, + "truncated": 0, + "non_truncated": 378, + "padded": 1512, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-formal_logic|5": { + "hashes": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "fa49c3faa72a3955", + "hash_cont_tokens": "0e1ce025c9d6ee7e" + }, + "truncated": 0, + "non_truncated": 126, + "padded": 504, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-global_facts|5": { + "hashes": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "38992a391c7040d5", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 396, + "non_padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_biology|5": { + "hashes": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "4944fad6e0578120", + "hash_cont_tokens": "e34d57f7d3c4ca16" + }, + "truncated": 0, + "non_truncated": 310, + "padded": 1240, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hashes": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "bec955dfccee0331", + "hash_cont_tokens": "e8482d44df4b3740" + }, + "truncated": 0, + "non_truncated": 203, + "padded": 796, + "non_padded": 16, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hashes": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "2ccfe020e0a8e824", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hashes": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "5e5e8bf3808e0ead", + "hash_cont_tokens": "d63e679a49418339" + }, + "truncated": 0, + "non_truncated": 165, + "padded": 656, + "non_padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_geography|5": { + "hashes": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "6a624d76e1b40f9d", + "hash_cont_tokens": "d78483e286d06f1a" + }, + "truncated": 0, + "non_truncated": 198, + "padded": 792, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hashes": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "8340aed0285230f4", + "hash_cont_tokens": "691cdff71ff5fe57" + }, + "truncated": 0, + "non_truncated": 193, + "padded": 772, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hashes": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + 
"hash_input_tokens": "ca47137b1f3a769c", + "hash_cont_tokens": "d5ad4c5bdca967ad" + }, + "truncated": 0, + "non_truncated": 390, + "padded": 1560, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hashes": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "c9d341ab62890f30", + "hash_cont_tokens": "8f631ca5687dd0d4" + }, + "truncated": 0, + "non_truncated": 270, + "padded": 1080, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hashes": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "62573d06618ae7df", + "hash_cont_tokens": "7321048a28451473" + }, + "truncated": 0, + "non_truncated": 238, + "padded": 952, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_physics|5": { + "hashes": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "ddddcaae96263221", + "hash_cont_tokens": "bb137581f269861c" + }, + "truncated": 0, + "non_truncated": 151, + "padded": 604, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hashes": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "ef9c1ae343139fdd", + "hash_cont_tokens": "b455cab2675bd863" + }, + "truncated": 0, + "non_truncated": 545, + "padded": 2161, + "non_padded": 19, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hashes": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "eb4abd87b0e863cc", + "hash_cont_tokens": "1b3196fec7e58037" + }, + "truncated": 0, + "non_truncated": 216, + "padded": 864, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hashes": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "63548c7fa9ba7a78", + "hash_cont_tokens": "a331dedc2aa01b3e" + }, + "truncated": 0, + "non_truncated": 204, + "padded": 816, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hashes": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "83c5da18bfa50812", + "hash_cont_tokens": "d0fbe030b8c8c2bf" + }, + "truncated": 0, + "non_truncated": 237, + "padded": 948, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_aging|5": { + "hashes": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "c93c778cb8c58a32", + "hash_cont_tokens": "1dd29c3755494850" + }, + "truncated": 0, + "non_truncated": 223, + "padded": 892, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_sexuality|5": { + "hashes": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "1daed91f54b42f7d", + "hash_cont_tokens": "c85573f663c10691" + }, + "truncated": 0, + 
"non_truncated": 131, + "padded": 524, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-international_law|5": { + "hashes": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "cfdae69f75ee8670", + "hash_cont_tokens": "d263804ba918154f" + }, + "truncated": 0, + "non_truncated": 121, + "padded": 484, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-jurisprudence|5": { + "hashes": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "173979adbb5ab44e", + "hash_cont_tokens": "581986691a84ece8" + }, + "truncated": 0, + "non_truncated": 108, + "padded": 432, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hashes": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "7b7d06271aff55ff", + "hash_cont_tokens": "55a858b28bbda458" + }, + "truncated": 0, + "non_truncated": 163, + "padded": 652, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-machine_learning|5": { + "hashes": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "ca062cfd7c7fddcb", + "hash_cont_tokens": "e99d3d3efd4ac7a3" + }, + "truncated": 0, + "non_truncated": 112, + "padded": 445, + "non_padded": 3, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-management|5": { + "hashes": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "fc47171ffb714da3", + "hash_cont_tokens": "13d9dc56bca34726" + }, + "truncated": 0, + "non_truncated": 103, + "padded": 412, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-marketing|5": { + "hashes": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "aa29e9d883670c8f", + "hash_cont_tokens": "2700ea26933916a2" + }, + "truncated": 0, + "non_truncated": 234, + "padded": 936, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-medical_genetics|5": { + "hashes": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "88ad044b653ecaa5", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-miscellaneous|5": { + "hashes": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "f9e7e01573277484", + "hash_cont_tokens": "7bf4341c79587250" + }, + "truncated": 0, + "non_truncated": 783, + "padded": 3132, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_disputes|5": { + "hashes": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "03728b9e48594c28", + "hash_cont_tokens": "38a48e9de6976f00" + }, + "truncated": 0, + "non_truncated": 346, + "padded": 1360, + "non_padded": 24, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_scenarios|5": { + 
"hashes": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "04a903966514d177", + "hash_cont_tokens": "761c4dc187689d89" + }, + "truncated": 0, + "non_truncated": 895, + "padded": 3580, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-nutrition|5": { + "hashes": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "a2176d3ac6f01cf0", + "hash_cont_tokens": "65005bd7d6f6012a" + }, + "truncated": 0, + "non_truncated": 306, + "padded": 1224, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-philosophy|5": { + "hashes": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "a96dc872948245a8", + "hash_cont_tokens": "0b47934fb6314dec" + }, + "truncated": 0, + "non_truncated": 311, + "padded": 1244, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-prehistory|5": { + "hashes": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "e0b03637947e9efa", + "hash_cont_tokens": "3f20acd855ee0a29" + }, + "truncated": 0, + "non_truncated": 324, + "padded": 1296, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_accounting|5": { + "hashes": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "0b4c6d0e49c47ab4", + "hash_cont_tokens": "8f122ba881355d4b" + }, + "truncated": 0, + "non_truncated": 282, + "padded": 1128, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_law|5": { + "hashes": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "bcbdbbde22ec73e3", + "hash_cont_tokens": "90d5df417c4d3fd3" + }, + "truncated": 0, + "non_truncated": 1534, + "padded": 6136, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_medicine|5": { + "hashes": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "c54d753563114d45", + "hash_cont_tokens": "4a2d2988884f7f70" + }, + "truncated": 0, + "non_truncated": 272, + "padded": 1088, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_psychology|5": { + "hashes": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "9e6e34f48034edc0", + "hash_cont_tokens": "e0a952cb8a9c81de" + }, + "truncated": 0, + "non_truncated": 612, + "padded": 2448, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-public_relations|5": { + "hashes": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "634feb3f97d1064d", + "hash_cont_tokens": "1fa77a8dff3922b8" + }, + "truncated": 0, + "non_truncated": 110, + "padded": 440, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-security_studies|5": { + "hashes": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "ca8497342e5b1d57", + 
"hash_cont_tokens": "81fc9cb3cbdd52db" + }, + "truncated": 0, + "non_truncated": 245, + "padded": 980, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-sociology|5": { + "hashes": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "ae361375c940a0fb", + "hash_cont_tokens": "2a0493252ed2cf43" + }, + "truncated": 0, + "non_truncated": 201, + "padded": 800, + "non_padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hashes": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "e8bdf33cf82d89f5", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-virology|5": { + "hashes": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "32ce831e0ba2d2e2", + "hash_cont_tokens": "5ab892d003b00c98" + }, + "truncated": 0, + "non_truncated": 166, + "padded": 664, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-world_religions|5": { + "hashes": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "4ed9b68c5694211b", + "hash_cont_tokens": "15a5e5dbdfbb8568" + }, + "truncated": 0, + "non_truncated": 171, + "padded": 684, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|truthfulqa:mc|0": { + "hashes": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "a30fbd9af05d717a", + "hash_cont_tokens": "5a8d4bb398b1c3c0" + }, + "truncated": 0, + "non_truncated": 817, + "padded": 9996, + "non_padded": 0, + "effective_few_shots": 0.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "d84d18e9a963753d", + "hash_full_prompts": "12b540783521a8e6", + "hash_input_tokens": "3d86ffeb7677bd9d", + "hash_cont_tokens": "35527140510ee91a" + }, + "truncated": 0, + "non_truncated": 26073, + "padded": 110793, + "non_padded": 226, + "num_truncated_few_shots": 0, + "total_evaluation_time_secondes": 0 + } +} \ No newline at end of file diff --git a/eval-results/teknium/CollectiveCognition-v1.1-Mistral-7B/results_2023-12-03T17-43-05.326590.json b/eval-results/teknium/CollectiveCognition-v1.1-Mistral-7B/results_2023-12-03T17-43-05.326590.json new file mode 100644 index 0000000000000000000000000000000000000000..14ca81d3797c23914a80ad96c7fc925a2ec6f947 --- /dev/null +++ b/eval-results/teknium/CollectiveCognition-v1.1-Mistral-7B/results_2023-12-03T17-43-05.326590.json @@ -0,0 +1,63 @@ +{ + "config_general": { + "lighteval_sha": "b35d4d84573be82d91c07ea46260f262f72cf69d", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "", + "start_time": 74976.421274166, + "end_time": 80256.176678411, + "total_evaluation_time_secondes": "5279.755404244992", + "model_name": "teknium/CollectiveCognition-v1.1-Mistral-7B", + "model_sha": "5f57f70ec99450c70da2540e94dd7fd67be4b23c", + "model_dtype": "torch.float16", + "model_size": "13.99 GB" + }, + "results": { + "harness|gsm8k|5": { + "acc": 0.3525398028809704, + "acc_stderr": 0.013159909755930328 + }, + "all": { + "acc": 
0.3525398028809704, + "acc_stderr": 0.013159909755930328 + } + }, + "versions": { + "all": 0, + "harness|gsm8k|5": 0 + }, + "config_tasks": { + "harness|gsm8k": "LM Harness task" + }, + "summary_tasks": { + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "6af0ae8cfe684f50", + "hash_cont_tokens": "793cf072fa6bed86" + }, + "truncated": 0, + "non_truncated": 1319, + "padded": 0, + "non_padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "18b756b7813d1bdf", + "hash_full_prompts": "deb3b1dff10b95aa", + "hash_input_tokens": "f17391d49d33b9c0", + "hash_cont_tokens": "52a2ad19e378803d" + }, + "truncated": 0, + "non_truncated": 1319, + "padded": 0, + "non_padded": 1319, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/teknium/CollectiveCognition-v1.1-Mistral-7B/results_2023-12-03T17-47-55.890655.json b/eval-results/teknium/CollectiveCognition-v1.1-Mistral-7B/results_2023-12-03T17-47-55.890655.json new file mode 100644 index 0000000000000000000000000000000000000000..643bbd881047d140b37bc805c49704b3ba4e30e5 --- /dev/null +++ b/eval-results/teknium/CollectiveCognition-v1.1-Mistral-7B/results_2023-12-03T17-47-55.890655.json @@ -0,0 +1,63 @@ +{ + "config_general": { + "lighteval_sha": "b35d4d84573be82d91c07ea46260f262f72cf69d", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "", + "start_time": 74975.534778895, + "end_time": 80548.57045554, + "total_evaluation_time_secondes": "5573.035676644999", + "model_name": "teknium/CollectiveCognition-v1.1-Mistral-7B", + "model_sha": "5f57f70ec99450c70da2540e94dd7fd67be4b23c", + "model_dtype": "torch.bfloat16", + "model_size": "13.99 GB" + }, + "results": { + "harness|gsm8k|5": { + "acc": 0.35860500379075055, + "acc_stderr": 0.01321031736413403 + }, + "all": { + "acc": 0.35860500379075055, + "acc_stderr": 0.01321031736413403 + } + }, + "versions": { + "all": 0, + "harness|gsm8k|5": 0 + }, + "config_tasks": { + "harness|gsm8k": "LM Harness task" + }, + "summary_tasks": { + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "6af0ae8cfe684f50", + "hash_cont_tokens": "914819a44c2472a8" + }, + "truncated": 0, + "non_truncated": 1319, + "padded": 0, + "non_padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "18b756b7813d1bdf", + "hash_full_prompts": "deb3b1dff10b95aa", + "hash_input_tokens": "f17391d49d33b9c0", + "hash_cont_tokens": "45fe6727de72293c" + }, + "truncated": 0, + "non_truncated": 1319, + "padded": 0, + "non_padded": 1319, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/teknium/Mistral-Trismegistus-7B/results_2023-10-12T08-45-24.509522.json b/eval-results/teknium/Mistral-Trismegistus-7B/results_2023-10-12T08-45-24.509522.json new file mode 100644 index 0000000000000000000000000000000000000000..b2f1a3d8dcb311c0de07d5393233de01ccd171a8 --- /dev/null +++ b/eval-results/teknium/Mistral-Trismegistus-7B/results_2023-10-12T08-45-24.509522.json @@ -0,0 +1,1367 @@ +{ + "config_general": { + "model_name": "teknium/Mistral-Trismegistus-7B", + "model_sha": "0a5752d096ebab21759dbe203f6b7c7f6092faf2", + "model_size": "13.99 GB", + "model_dtype": "torch.bfloat16", + 
"lighteval_sha": "0f318ecf002208468154899217b3ba7c6ae09374", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|arc:challenge|25": { + "acc": 0.5051194539249146, + "acc_stderr": 0.014610624890309157, + "acc_norm": 0.5409556313993175, + "acc_norm_stderr": 0.014562291073601229 + }, + "harness|hellaswag|10": { + "acc": 0.5929097789285003, + "acc_stderr": 0.004902878806733037, + "acc_norm": 0.7791276638119896, + "acc_norm_stderr": 0.004139867975116299 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.21, + "acc_stderr": 0.040936018074033256, + "acc_norm": 0.21, + "acc_norm_stderr": 0.040936018074033256 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.48148148148148145, + "acc_stderr": 0.043163785995113245, + "acc_norm": 0.48148148148148145, + "acc_norm_stderr": 0.043163785995113245 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.5263157894736842, + "acc_stderr": 0.04063302731486671, + "acc_norm": 0.5263157894736842, + "acc_norm_stderr": 0.04063302731486671 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.53, + "acc_stderr": 0.05016135580465919, + "acc_norm": 0.53, + "acc_norm_stderr": 0.05016135580465919 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.6, + "acc_stderr": 0.03015113445777629, + "acc_norm": 0.6, + "acc_norm_stderr": 0.03015113445777629 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.6111111111111112, + "acc_stderr": 0.04076663253918567, + "acc_norm": 0.6111111111111112, + "acc_norm_stderr": 0.04076663253918567 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.35, + "acc_stderr": 0.047937248544110196, + "acc_norm": 0.35, + "acc_norm_stderr": 0.047937248544110196 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.47, + "acc_stderr": 0.050161355804659205, + "acc_norm": 0.47, + "acc_norm_stderr": 0.050161355804659205 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.31, + "acc_stderr": 0.04648231987117316, + "acc_norm": 0.31, + "acc_norm_stderr": 0.04648231987117316 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.5722543352601156, + "acc_stderr": 0.03772446857518027, + "acc_norm": 0.5722543352601156, + "acc_norm_stderr": 0.03772446857518027 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.30392156862745096, + "acc_stderr": 0.04576665403207762, + "acc_norm": 0.30392156862745096, + "acc_norm_stderr": 0.04576665403207762 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.68, + "acc_stderr": 0.04688261722621505, + "acc_norm": 0.68, + "acc_norm_stderr": 0.04688261722621505 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.4595744680851064, + "acc_stderr": 0.03257901482099835, + "acc_norm": 0.4595744680851064, + "acc_norm_stderr": 0.03257901482099835 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.42105263157894735, + "acc_stderr": 0.04644602091222318, + "acc_norm": 0.42105263157894735, + "acc_norm_stderr": 0.04644602091222318 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.5517241379310345, + "acc_stderr": 0.04144311810878152, + "acc_norm": 0.5517241379310345, + "acc_norm_stderr": 0.04144311810878152 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.36243386243386244, + "acc_stderr": 0.02475747390275206, + "acc_norm": 0.36243386243386244, + "acc_norm_stderr": 0.02475747390275206 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 
0.29365079365079366, + "acc_stderr": 0.04073524322147125, + "acc_norm": 0.29365079365079366, + "acc_norm_stderr": 0.04073524322147125 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.35, + "acc_stderr": 0.0479372485441102, + "acc_norm": 0.35, + "acc_norm_stderr": 0.0479372485441102 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.6967741935483871, + "acc_stderr": 0.026148685930671746, + "acc_norm": 0.6967741935483871, + "acc_norm_stderr": 0.026148685930671746 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.458128078817734, + "acc_stderr": 0.03505630140785741, + "acc_norm": 0.458128078817734, + "acc_norm_stderr": 0.03505630140785741 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.62, + "acc_stderr": 0.04878317312145632, + "acc_norm": 0.62, + "acc_norm_stderr": 0.04878317312145632 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.6909090909090909, + "acc_stderr": 0.036085410115739666, + "acc_norm": 0.6909090909090909, + "acc_norm_stderr": 0.036085410115739666 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.7171717171717171, + "acc_stderr": 0.032087795587867514, + "acc_norm": 0.7171717171717171, + "acc_norm_stderr": 0.032087795587867514 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.7409326424870466, + "acc_stderr": 0.03161877917935413, + "acc_norm": 0.7409326424870466, + "acc_norm_stderr": 0.03161877917935413 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.5076923076923077, + "acc_stderr": 0.025348006031534757, + "acc_norm": 0.5076923076923077, + "acc_norm_stderr": 0.025348006031534757 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.2962962962962963, + "acc_stderr": 0.027840811495871934, + "acc_norm": 0.2962962962962963, + "acc_norm_stderr": 0.027840811495871934 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.5294117647058824, + "acc_stderr": 0.03242225027115006, + "acc_norm": 0.5294117647058824, + "acc_norm_stderr": 0.03242225027115006 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.3576158940397351, + "acc_stderr": 0.03913453431177258, + "acc_norm": 0.3576158940397351, + "acc_norm_stderr": 0.03913453431177258 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.6935779816513762, + "acc_stderr": 0.01976551722045852, + "acc_norm": 0.6935779816513762, + "acc_norm_stderr": 0.01976551722045852 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.41203703703703703, + "acc_stderr": 0.03356787758160835, + "acc_norm": 0.41203703703703703, + "acc_norm_stderr": 0.03356787758160835 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.6715686274509803, + "acc_stderr": 0.03296245110172228, + "acc_norm": 0.6715686274509803, + "acc_norm_stderr": 0.03296245110172228 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.7215189873417721, + "acc_stderr": 0.029178682304842534, + "acc_norm": 0.7215189873417721, + "acc_norm_stderr": 0.029178682304842534 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.600896860986547, + "acc_stderr": 0.03286745312567961, + "acc_norm": 0.600896860986547, + "acc_norm_stderr": 0.03286745312567961 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.6106870229007634, + "acc_stderr": 0.04276486542814591, + "acc_norm": 0.6106870229007634, + "acc_norm_stderr": 0.04276486542814591 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 
0.71900826446281, + "acc_stderr": 0.04103203830514512, + "acc_norm": 0.71900826446281, + "acc_norm_stderr": 0.04103203830514512 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.6018518518518519, + "acc_stderr": 0.04732332615978813, + "acc_norm": 0.6018518518518519, + "acc_norm_stderr": 0.04732332615978813 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.6134969325153374, + "acc_stderr": 0.038258255488486076, + "acc_norm": 0.6134969325153374, + "acc_norm_stderr": 0.038258255488486076 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.4642857142857143, + "acc_stderr": 0.04733667890053756, + "acc_norm": 0.4642857142857143, + "acc_norm_stderr": 0.04733667890053756 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.7087378640776699, + "acc_stderr": 0.044986763205729224, + "acc_norm": 0.7087378640776699, + "acc_norm_stderr": 0.044986763205729224 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.8034188034188035, + "acc_stderr": 0.02603538609895129, + "acc_norm": 0.8034188034188035, + "acc_norm_stderr": 0.02603538609895129 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.6, + "acc_stderr": 0.049236596391733084, + "acc_norm": 0.6, + "acc_norm_stderr": 0.049236596391733084 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.7203065134099617, + "acc_stderr": 0.01605079214803653, + "acc_norm": 0.7203065134099617, + "acc_norm_stderr": 0.01605079214803653 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.6242774566473989, + "acc_stderr": 0.02607431485165708, + "acc_norm": 0.6242774566473989, + "acc_norm_stderr": 0.02607431485165708 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.25921787709497207, + "acc_stderr": 0.01465578083749772, + "acc_norm": 0.25921787709497207, + "acc_norm_stderr": 0.01465578083749772 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.5947712418300654, + "acc_stderr": 0.028110928492809065, + "acc_norm": 0.5947712418300654, + "acc_norm_stderr": 0.028110928492809065 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.6109324758842444, + "acc_stderr": 0.027690337536485372, + "acc_norm": 0.6109324758842444, + "acc_norm_stderr": 0.027690337536485372 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.5864197530864198, + "acc_stderr": 0.02740204204026996, + "acc_norm": 0.5864197530864198, + "acc_norm_stderr": 0.02740204204026996 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.4078014184397163, + "acc_stderr": 0.02931601177634356, + "acc_norm": 0.4078014184397163, + "acc_norm_stderr": 0.02931601177634356 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.40091264667535853, + "acc_stderr": 0.01251696035064082, + "acc_norm": 0.40091264667535853, + "acc_norm_stderr": 0.01251696035064082 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.5441176470588235, + "acc_stderr": 0.030254372573976715, + "acc_norm": 0.5441176470588235, + "acc_norm_stderr": 0.030254372573976715 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.5179738562091504, + "acc_stderr": 0.020214761037872408, + "acc_norm": 0.5179738562091504, + "acc_norm_stderr": 0.020214761037872408 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.5454545454545454, + "acc_stderr": 0.04769300568972743, + "acc_norm": 0.5454545454545454, + "acc_norm_stderr": 0.04769300568972743 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.5755102040816327, + "acc_stderr": 0.031642094879429414, + "acc_norm": 0.5755102040816327, + 
"acc_norm_stderr": 0.031642094879429414 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.7313432835820896, + "acc_stderr": 0.03134328358208954, + "acc_norm": 0.7313432835820896, + "acc_norm_stderr": 0.03134328358208954 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.82, + "acc_stderr": 0.03861229196653694, + "acc_norm": 0.82, + "acc_norm_stderr": 0.03861229196653694 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.4457831325301205, + "acc_stderr": 0.03869543323472101, + "acc_norm": 0.4457831325301205, + "acc_norm_stderr": 0.03869543323472101 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.7543859649122807, + "acc_stderr": 0.0330140594698725, + "acc_norm": 0.7543859649122807, + "acc_norm_stderr": 0.0330140594698725 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.33047735618115054, + "mc1_stderr": 0.0164667696136983, + "mc2": 0.49358570687694137, + "mc2_stderr": 0.016027365716640467 + }, + "all": { + "acc": 0.5450300908988583, + "acc_stderr": 0.03470010763865252, + "acc_norm": 0.5487937190744159, + "acc_norm_stderr": 0.03468635603444363, + "mc1": 0.33047735618115054, + "mc1_stderr": 0.0164667696136983, + "mc2": 0.49358570687694137, + "mc2_stderr": 0.016027365716640467 + } + }, + "versions": { + "harness|arc:challenge|25": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + 
"harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "all": 0 + }, + "config_tasks": { + "harness|arc:challenge": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness 
task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task" + }, + "summary_tasks": { + "harness|arc:challenge|25": { + "hashes": { + "hash_examples": "17b0cae357c0259e", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "e43adcaa871b1364", + "hash_cont_tokens": "289aa98c400841d8" + }, + "truncated": 0, + "non-truncated": 4687, + "padded": 4684, + "non-padded": 3, + "effective_few_shots": 25.0, + "num_truncated_few_shots": 0 + }, + "harness|hellaswag|10": { + "hashes": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "08da6b3d0798f3e5", + "hash_cont_tokens": "ac460260c3e6efc9" + }, + "truncated": 0, + "non-truncated": 40168, + "padded": 40039, + "non-padded": 129, + "effective_few_shots": 10.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hashes": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "5e2b26eb9b4d08bf", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-anatomy|5": { + "hashes": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "d33cda9df28030eb", + "hash_cont_tokens": "a52a4f60d98cbe5c" + }, + "truncated": 0, + "non-truncated": 540, + "padded": 540, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-astronomy|5": { + "hashes": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "0dd50c500d64c57d", + "hash_cont_tokens": "10f7d8eeba97841d" + }, + "truncated": 0, + "non-truncated": 608, + "padded": 608, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-business_ethics|5": { + "hashes": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "40b524d0df3defc2", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hashes": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "1f87d12d677e0dfd", + "hash_cont_tokens": "edef9975ba9165b5" + }, + "truncated": 0, + "non-truncated": 1060, + "padded": 1056, + "non-padded": 4, + 
"effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_biology|5": { + "hashes": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "dd6d69d8b13afbeb", + "hash_cont_tokens": "0aa103ec6602280b" + }, + "truncated": 0, + "non-truncated": 576, + "padded": 572, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_chemistry|5": { + "hashes": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "d45f3c401a00e97e", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_computer_science|5": { + "hashes": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "c04f21d954ae67b2", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_mathematics|5": { + "hashes": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "e7de03b4e1a407d8", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_medicine|5": { + "hashes": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "9ce9516475f0b09c", + "hash_cont_tokens": "1979021dbc698754" + }, + "truncated": 0, + "non-truncated": 692, + "padded": 684, + "non-padded": 8, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_physics|5": { + "hashes": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "f749592a0d6c967d", + "hash_cont_tokens": "7cf7fe2bab00acbd" + }, + "truncated": 0, + "non-truncated": 408, + "padded": 404, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-computer_security|5": { + "hashes": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "1a6dccf2066f3598", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hashes": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "6ce98c8aec8e7514", + "hash_cont_tokens": "903f64eed2b0d217" + }, + "truncated": 0, + "non-truncated": 940, + "padded": 940, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-econometrics|5": { + "hashes": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "7794b03bf6b9bb11", + "hash_cont_tokens": "721ae6c5302c4bf2" + }, + "truncated": 0, + "non-truncated": 456, + "padded": 456, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hashes": { + "hash_examples": 
"e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "e47ff85e05850517", + "hash_cont_tokens": "15a738960ed3e587" + }, + "truncated": 0, + "non-truncated": 580, + "padded": 580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hashes": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "2ce6901704311790", + "hash_cont_tokens": "c96470462fc71683" + }, + "truncated": 0, + "non-truncated": 1512, + "padded": 1512, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-formal_logic|5": { + "hashes": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "fa49c3faa72a3955", + "hash_cont_tokens": "0e1ce025c9d6ee7e" + }, + "truncated": 0, + "non-truncated": 504, + "padded": 504, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-global_facts|5": { + "hashes": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "38992a391c7040d5", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 396, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_biology|5": { + "hashes": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "4944fad6e0578120", + "hash_cont_tokens": "e34d57f7d3c4ca16" + }, + "truncated": 0, + "non-truncated": 1240, + "padded": 1240, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hashes": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "bec955dfccee0331", + "hash_cont_tokens": "e8482d44df4b3740" + }, + "truncated": 0, + "non-truncated": 812, + "padded": 796, + "non-padded": 16, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hashes": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "2ccfe020e0a8e824", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hashes": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "5e5e8bf3808e0ead", + "hash_cont_tokens": "d63e679a49418339" + }, + "truncated": 0, + "non-truncated": 660, + "padded": 656, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_geography|5": { + "hashes": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "6a624d76e1b40f9d", + "hash_cont_tokens": "d78483e286d06f1a" + }, + "truncated": 0, + "non-truncated": 792, + "padded": 792, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hashes": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": 
"8340aed0285230f4", + "hash_cont_tokens": "691cdff71ff5fe57" + }, + "truncated": 0, + "non-truncated": 772, + "padded": 772, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hashes": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "ca47137b1f3a769c", + "hash_cont_tokens": "d5ad4c5bdca967ad" + }, + "truncated": 0, + "non-truncated": 1560, + "padded": 1560, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hashes": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "c9d341ab62890f30", + "hash_cont_tokens": "8f631ca5687dd0d4" + }, + "truncated": 0, + "non-truncated": 1080, + "padded": 1080, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hashes": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "62573d06618ae7df", + "hash_cont_tokens": "7321048a28451473" + }, + "truncated": 0, + "non-truncated": 952, + "padded": 952, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_physics|5": { + "hashes": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "ddddcaae96263221", + "hash_cont_tokens": "bb137581f269861c" + }, + "truncated": 0, + "non-truncated": 604, + "padded": 604, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hashes": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "ef9c1ae343139fdd", + "hash_cont_tokens": "b455cab2675bd863" + }, + "truncated": 0, + "non-truncated": 2180, + "padded": 2161, + "non-padded": 19, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hashes": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "eb4abd87b0e863cc", + "hash_cont_tokens": "1b3196fec7e58037" + }, + "truncated": 0, + "non-truncated": 864, + "padded": 864, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hashes": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "63548c7fa9ba7a78", + "hash_cont_tokens": "a331dedc2aa01b3e" + }, + "truncated": 0, + "non-truncated": 816, + "padded": 816, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hashes": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "83c5da18bfa50812", + "hash_cont_tokens": "d0fbe030b8c8c2bf" + }, + "truncated": 0, + "non-truncated": 948, + "padded": 948, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_aging|5": { + "hashes": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "c93c778cb8c58a32", + "hash_cont_tokens": "1dd29c3755494850" + }, + "truncated": 0, + "non-truncated": 
892, + "padded": 892, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_sexuality|5": { + "hashes": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "1daed91f54b42f7d", + "hash_cont_tokens": "c85573f663c10691" + }, + "truncated": 0, + "non-truncated": 524, + "padded": 524, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-international_law|5": { + "hashes": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "cfdae69f75ee8670", + "hash_cont_tokens": "d263804ba918154f" + }, + "truncated": 0, + "non-truncated": 484, + "padded": 484, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-jurisprudence|5": { + "hashes": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "173979adbb5ab44e", + "hash_cont_tokens": "581986691a84ece8" + }, + "truncated": 0, + "non-truncated": 432, + "padded": 432, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hashes": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "7b7d06271aff55ff", + "hash_cont_tokens": "55a858b28bbda458" + }, + "truncated": 0, + "non-truncated": 652, + "padded": 652, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-machine_learning|5": { + "hashes": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "ca062cfd7c7fddcb", + "hash_cont_tokens": "e99d3d3efd4ac7a3" + }, + "truncated": 0, + "non-truncated": 448, + "padded": 445, + "non-padded": 3, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-management|5": { + "hashes": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "fc47171ffb714da3", + "hash_cont_tokens": "13d9dc56bca34726" + }, + "truncated": 0, + "non-truncated": 412, + "padded": 412, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-marketing|5": { + "hashes": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "aa29e9d883670c8f", + "hash_cont_tokens": "2700ea26933916a2" + }, + "truncated": 0, + "non-truncated": 936, + "padded": 936, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-medical_genetics|5": { + "hashes": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "88ad044b653ecaa5", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-miscellaneous|5": { + "hashes": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "f9e7e01573277484", + "hash_cont_tokens": "7bf4341c79587250" + }, + "truncated": 0, + "non-truncated": 3132, + "padded": 3132, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_disputes|5": { + "hashes": { + 
"hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "03728b9e48594c28", + "hash_cont_tokens": "38a48e9de6976f00" + }, + "truncated": 0, + "non-truncated": 1384, + "padded": 1360, + "non-padded": 24, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hashes": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "04a903966514d177", + "hash_cont_tokens": "761c4dc187689d89" + }, + "truncated": 0, + "non-truncated": 3580, + "padded": 3580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-nutrition|5": { + "hashes": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "a2176d3ac6f01cf0", + "hash_cont_tokens": "65005bd7d6f6012a" + }, + "truncated": 0, + "non-truncated": 1224, + "padded": 1224, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-philosophy|5": { + "hashes": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "a96dc872948245a8", + "hash_cont_tokens": "0b47934fb6314dec" + }, + "truncated": 0, + "non-truncated": 1244, + "padded": 1244, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-prehistory|5": { + "hashes": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "e0b03637947e9efa", + "hash_cont_tokens": "3f20acd855ee0a29" + }, + "truncated": 0, + "non-truncated": 1296, + "padded": 1296, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_accounting|5": { + "hashes": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "0b4c6d0e49c47ab4", + "hash_cont_tokens": "8f122ba881355d4b" + }, + "truncated": 0, + "non-truncated": 1128, + "padded": 1128, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_law|5": { + "hashes": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "bcbdbbde22ec73e3", + "hash_cont_tokens": "90d5df417c4d3fd3" + }, + "truncated": 0, + "non-truncated": 6136, + "padded": 6136, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_medicine|5": { + "hashes": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "c54d753563114d45", + "hash_cont_tokens": "4a2d2988884f7f70" + }, + "truncated": 0, + "non-truncated": 1088, + "padded": 1088, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_psychology|5": { + "hashes": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "9e6e34f48034edc0", + "hash_cont_tokens": "e0a952cb8a9c81de" + }, + "truncated": 0, + "non-truncated": 2448, + "padded": 2448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-public_relations|5": { + "hashes": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "634feb3f97d1064d", + 
"hash_cont_tokens": "1fa77a8dff3922b8" + }, + "truncated": 0, + "non-truncated": 440, + "padded": 440, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-security_studies|5": { + "hashes": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "ca8497342e5b1d57", + "hash_cont_tokens": "81fc9cb3cbdd52db" + }, + "truncated": 0, + "non-truncated": 980, + "padded": 980, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-sociology|5": { + "hashes": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "ae361375c940a0fb", + "hash_cont_tokens": "2a0493252ed2cf43" + }, + "truncated": 0, + "non-truncated": 804, + "padded": 800, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hashes": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "e8bdf33cf82d89f5", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-virology|5": { + "hashes": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "32ce831e0ba2d2e2", + "hash_cont_tokens": "5ab892d003b00c98" + }, + "truncated": 0, + "non-truncated": 664, + "padded": 664, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-world_religions|5": { + "hashes": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "4ed9b68c5694211b", + "hash_cont_tokens": "15a5e5dbdfbb8568" + }, + "truncated": 0, + "non-truncated": 684, + "padded": 684, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|truthfulqa:mc|0": { + "hashes": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "a30fbd9af05d717a", + "hash_cont_tokens": "5a8d4bb398b1c3c0" + }, + "truncated": 0, + "non-truncated": 9996, + "padded": 9996, + "non-padded": 0, + "effective_few_shots": 0.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "d84d18e9a963753d", + "hash_full_prompts": "12b540783521a8e6", + "hash_input_tokens": "3d86ffeb7677bd9d", + "hash_cont_tokens": "35527140510ee91a" + }, + "total_evaluation_time_secondes": "4392.902026414871", + "truncated": 0, + "non-truncated": 111019, + "padded": 110793, + "non-padded": 226, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/teknium/Mistral-Trismegistus-7B/results_2023-10-25T09-46-08.723071.json b/eval-results/teknium/Mistral-Trismegistus-7B/results_2023-10-25T09-46-08.723071.json new file mode 100644 index 0000000000000000000000000000000000000000..44b7e8d079f4f77259eecbc4c5ca308685969340 --- /dev/null +++ b/eval-results/teknium/Mistral-Trismegistus-7B/results_2023-10-25T09-46-08.723071.json @@ -0,0 +1,107 @@ +{ + "config_general": { + "model_name": "teknium/Mistral-Trismegistus-7B", + "model_sha": "897e8ac41711254f7ba172a1dc403e41a8317c58", + "model_size": "13.99 GB", + "model_dtype": "torch.bfloat16", + "lighteval_sha": "0f318ecf002208468154899217b3ba7c6ae09374", + "num_few_shot_default": 0, + 
"num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|drop|3": { + "em": 0.010591442953020135, + "em_stderr": 0.0010483469790502314, + "f1": 0.07238674496644287, + "f1_stderr": 0.001675223530701393 + }, + "harness|gsm8k|5": { + "acc": 0.09931766489764973, + "acc_stderr": 0.008238371412683985 + }, + "harness|winogrande|5": { + "acc": 0.7016574585635359, + "acc_stderr": 0.012858885010030421 + }, + "all": { + "em": 0.010591442953020135, + "em_stderr": 0.0010483469790502314, + "f1": 0.07238674496644287, + "f1_stderr": 0.001675223530701393, + "acc": 0.4004875617305928, + "acc_stderr": 0.010548628211357203 + } + }, + "versions": { + "harness|drop|3": 1, + "harness|gsm8k|5": 0, + "harness|winogrande|5": 0, + "all": 0 + }, + "config_tasks": { + "harness|drop": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|drop|3": { + "hashes": { + "hash_examples": "1d27416e8324e9a3", + "hash_full_prompts": "a5513ff9a741b385", + "hash_input_tokens": "a4fb946366902edf", + "hash_cont_tokens": "2e0b499b10bf2038" + }, + "truncated": 0, + "non-truncated": 9536, + "padded": 0, + "non-padded": 9536, + "effective_few_shots": 3.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "6af0ae8cfe684f50", + "hash_cont_tokens": "e835f27f99d5c8ff" + }, + "truncated": 0, + "non-truncated": 1319, + "padded": 0, + "non-padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "6bf335f26fed6442", + "hash_cont_tokens": "618558fb93c0f288" + }, + "truncated": 0, + "non-truncated": 2534, + "padded": 2397, + "non-padded": 137, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "9b4d8993161e637d", + "hash_full_prompts": "08215e527b7e60a5", + "hash_input_tokens": "7c6dc027ca91c1a8", + "hash_cont_tokens": "ddda2f7f7842799c" + }, + "total_evaluation_time_secondes": "21569.481523752213", + "truncated": 0, + "non-truncated": 13389, + "padded": 2397, + "non-padded": 10992, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/teknium/OpenHermes-13B/results_2023-09-13T01-56-57.835904.json b/eval-results/teknium/OpenHermes-13B/results_2023-09-13T01-56-57.835904.json new file mode 100644 index 0000000000000000000000000000000000000000..05e587b29bf65eab873e7b07e28c8b83235f00d7 --- /dev/null +++ b/eval-results/teknium/OpenHermes-13B/results_2023-09-13T01-56-57.835904.json @@ -0,0 +1,1367 @@ +{ + "config_general": { + "model_name": "teknium/OpenHermes-13B", + "model_sha": "f09d0fe655ad57cce9179b7b40ea6f81e07db18c", + "model_size": "24.32 GB", + "model_dtype": "torch.float16", + "lighteval_sha": "ff467795ccc45b291b69333c263d5f16abd1fcd9", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|arc:challenge|25": { + "acc": 0.5648464163822525, + "acc_stderr": 0.014487986197186045, + "acc_norm": 0.5981228668941979, + "acc_norm_stderr": 0.014327268614578274 + }, + "harness|hellaswag|10": { + "acc": 0.6246763592909779, + "acc_stderr": 0.004832167854501644, + "acc_norm": 0.8224457279426409, + 
"acc_norm_stderr": 0.0038135610571503414 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.31, + "acc_stderr": 0.046482319871173156, + "acc_norm": 0.31, + "acc_norm_stderr": 0.046482319871173156 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.48148148148148145, + "acc_stderr": 0.043163785995113245, + "acc_norm": 0.48148148148148145, + "acc_norm_stderr": 0.043163785995113245 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.5723684210526315, + "acc_stderr": 0.04026097083296564, + "acc_norm": 0.5723684210526315, + "acc_norm_stderr": 0.04026097083296564 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.54, + "acc_stderr": 0.05009082659620332, + "acc_norm": 0.54, + "acc_norm_stderr": 0.05009082659620332 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.5735849056603773, + "acc_stderr": 0.03043779434298305, + "acc_norm": 0.5735849056603773, + "acc_norm_stderr": 0.03043779434298305 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.5972222222222222, + "acc_stderr": 0.04101405519842426, + "acc_norm": 0.5972222222222222, + "acc_norm_stderr": 0.04101405519842426 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.39, + "acc_stderr": 0.04902071300001975, + "acc_norm": 0.39, + "acc_norm_stderr": 0.04902071300001975 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.45, + "acc_stderr": 0.049999999999999996, + "acc_norm": 0.45, + "acc_norm_stderr": 0.049999999999999996 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.43, + "acc_stderr": 0.049756985195624284, + "acc_norm": 0.43, + "acc_norm_stderr": 0.049756985195624284 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.5375722543352601, + "acc_stderr": 0.0380168510452446, + "acc_norm": 0.5375722543352601, + "acc_norm_stderr": 0.0380168510452446 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.23529411764705882, + "acc_stderr": 0.042207736591714506, + "acc_norm": 0.23529411764705882, + "acc_norm_stderr": 0.042207736591714506 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.68, + "acc_stderr": 0.04688261722621505, + "acc_norm": 0.68, + "acc_norm_stderr": 0.04688261722621505 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.4765957446808511, + "acc_stderr": 0.03265019475033583, + "acc_norm": 0.4765957446808511, + "acc_norm_stderr": 0.03265019475033583 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.2807017543859649, + "acc_stderr": 0.042270544512322, + "acc_norm": 0.2807017543859649, + "acc_norm_stderr": 0.042270544512322 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.47586206896551725, + "acc_stderr": 0.041618085035015295, + "acc_norm": 0.47586206896551725, + "acc_norm_stderr": 0.041618085035015295 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.35978835978835977, + "acc_stderr": 0.024718075944129277, + "acc_norm": 0.35978835978835977, + "acc_norm_stderr": 0.024718075944129277 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.35714285714285715, + "acc_stderr": 0.042857142857142816, + "acc_norm": 0.35714285714285715, + "acc_norm_stderr": 0.042857142857142816 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.37, + "acc_stderr": 0.048523658709391, + "acc_norm": 0.37, + "acc_norm_stderr": 0.048523658709391 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.6387096774193548, + "acc_stderr": 0.027327548447957536, + "acc_norm": 0.6387096774193548, + "acc_norm_stderr": 0.027327548447957536 + }, 
+ "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.4827586206896552, + "acc_stderr": 0.035158955511656986, + "acc_norm": 0.4827586206896552, + "acc_norm_stderr": 0.035158955511656986 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.53, + "acc_stderr": 0.05016135580465919, + "acc_norm": 0.53, + "acc_norm_stderr": 0.05016135580465919 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.6424242424242425, + "acc_stderr": 0.03742597043806587, + "acc_norm": 0.6424242424242425, + "acc_norm_stderr": 0.03742597043806587 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.6818181818181818, + "acc_stderr": 0.0331847733384533, + "acc_norm": 0.6818181818181818, + "acc_norm_stderr": 0.0331847733384533 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.7875647668393783, + "acc_stderr": 0.029519282616817234, + "acc_norm": 0.7875647668393783, + "acc_norm_stderr": 0.029519282616817234 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.541025641025641, + "acc_stderr": 0.025265525491284295, + "acc_norm": 0.541025641025641, + "acc_norm_stderr": 0.025265525491284295 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.3074074074074074, + "acc_stderr": 0.028133252578815642, + "acc_norm": 0.3074074074074074, + "acc_norm_stderr": 0.028133252578815642 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.592436974789916, + "acc_stderr": 0.031918633744784645, + "acc_norm": 0.592436974789916, + "acc_norm_stderr": 0.031918633744784645 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.3576158940397351, + "acc_stderr": 0.03913453431177258, + "acc_norm": 0.3576158940397351, + "acc_norm_stderr": 0.03913453431177258 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.7357798165137615, + "acc_stderr": 0.018904164171510186, + "acc_norm": 0.7357798165137615, + "acc_norm_stderr": 0.018904164171510186 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.5, + "acc_stderr": 0.034099716973523674, + "acc_norm": 0.5, + "acc_norm_stderr": 0.034099716973523674 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.7696078431372549, + "acc_stderr": 0.029554292605695063, + "acc_norm": 0.7696078431372549, + "acc_norm_stderr": 0.029554292605695063 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.7341772151898734, + "acc_stderr": 0.02875679962965834, + "acc_norm": 0.7341772151898734, + "acc_norm_stderr": 0.02875679962965834 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.6636771300448431, + "acc_stderr": 0.031708824268455, + "acc_norm": 0.6636771300448431, + "acc_norm_stderr": 0.031708824268455 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.6412213740458015, + "acc_stderr": 0.04206739313864908, + "acc_norm": 0.6412213740458015, + "acc_norm_stderr": 0.04206739313864908 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.7603305785123967, + "acc_stderr": 0.03896878985070417, + "acc_norm": 0.7603305785123967, + "acc_norm_stderr": 0.03896878985070417 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.7222222222222222, + "acc_stderr": 0.043300437496507416, + "acc_norm": 0.7222222222222222, + "acc_norm_stderr": 0.043300437496507416 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.6748466257668712, + "acc_stderr": 0.03680350371286461, + "acc_norm": 0.6748466257668712, + "acc_norm_stderr": 0.03680350371286461 + }, + 
"harness|hendrycksTest-machine_learning|5": { + "acc": 0.3392857142857143, + "acc_stderr": 0.04493949068613538, + "acc_norm": 0.3392857142857143, + "acc_norm_stderr": 0.04493949068613538 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.7087378640776699, + "acc_stderr": 0.04498676320572924, + "acc_norm": 0.7087378640776699, + "acc_norm_stderr": 0.04498676320572924 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.8247863247863247, + "acc_stderr": 0.024904439098918225, + "acc_norm": 0.8247863247863247, + "acc_norm_stderr": 0.024904439098918225 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.6, + "acc_stderr": 0.04923659639173309, + "acc_norm": 0.6, + "acc_norm_stderr": 0.04923659639173309 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.7496807151979565, + "acc_stderr": 0.015491088951494567, + "acc_norm": 0.7496807151979565, + "acc_norm_stderr": 0.015491088951494567 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.6473988439306358, + "acc_stderr": 0.025722802200895813, + "acc_norm": 0.6473988439306358, + "acc_norm_stderr": 0.025722802200895813 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.45139664804469276, + "acc_stderr": 0.01664330737231587, + "acc_norm": 0.45139664804469276, + "acc_norm_stderr": 0.01664330737231587 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.6274509803921569, + "acc_stderr": 0.027684181883302898, + "acc_norm": 0.6274509803921569, + "acc_norm_stderr": 0.027684181883302898 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.6077170418006431, + "acc_stderr": 0.027731258647012005, + "acc_norm": 0.6077170418006431, + "acc_norm_stderr": 0.027731258647012005 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.6419753086419753, + "acc_stderr": 0.026675611926037082, + "acc_norm": 0.6419753086419753, + "acc_norm_stderr": 0.026675611926037082 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.45390070921985815, + "acc_stderr": 0.02970045324729147, + "acc_norm": 0.45390070921985815, + "acc_norm_stderr": 0.02970045324729147 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.4322033898305085, + "acc_stderr": 0.012652297777114968, + "acc_norm": 0.4322033898305085, + "acc_norm_stderr": 0.012652297777114968 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.5330882352941176, + "acc_stderr": 0.03030625772246831, + "acc_norm": 0.5330882352941176, + "acc_norm_stderr": 0.03030625772246831 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.5473856209150327, + "acc_stderr": 0.020136790918492523, + "acc_norm": 0.5473856209150327, + "acc_norm_stderr": 0.020136790918492523 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.6545454545454545, + "acc_stderr": 0.04554619617541054, + "acc_norm": 0.6545454545454545, + "acc_norm_stderr": 0.04554619617541054 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.6489795918367347, + "acc_stderr": 0.030555316755573637, + "acc_norm": 0.6489795918367347, + "acc_norm_stderr": 0.030555316755573637 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.7263681592039801, + "acc_stderr": 0.03152439186555401, + "acc_norm": 0.7263681592039801, + "acc_norm_stderr": 0.03152439186555401 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.8, + "acc_stderr": 0.04020151261036847, + "acc_norm": 0.8, + "acc_norm_stderr": 0.04020151261036847 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.46987951807228917, + "acc_stderr": 0.03885425420866767, + "acc_norm": 
0.46987951807228917, + "acc_norm_stderr": 0.03885425420866767 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.7719298245614035, + "acc_stderr": 0.03218093795602357, + "acc_norm": 0.7719298245614035, + "acc_norm_stderr": 0.03218093795602357 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.31946144430844553, + "mc1_stderr": 0.016322644182960498, + "mc2": 0.46011282176154084, + "mc2_stderr": 0.015289160619930926 + }, + "all": { + "acc": 0.5645334088060932, + "acc_stderr": 0.03451457992352676, + "acc_norm": 0.5684494396393746, + "acc_norm_stderr": 0.0344945913747139, + "mc1": 0.31946144430844553, + "mc1_stderr": 0.016322644182960498, + "mc2": 0.46011282176154084, + "mc2_stderr": 0.015289160619930926 + } + }, + "versions": { + "harness|arc:challenge|25": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + 
"harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "all": 0 + }, + "config_tasks": { + "harness|arc:challenge": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + 
"harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task" + }, + "summary_tasks": { + "harness|arc:challenge|25": { + "hashes": { + "hash_examples": "17b0cae357c0259e", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "3722289b79076c44", + "hash_cont_tokens": "e8abf848493b50f7" + }, + "truncated": 0, + "non-truncated": 4687, + "padded": 4687, + "non-padded": 0, + "effective_few_shots": 25.0, + "num_truncated_few_shots": 0 + }, + "harness|hellaswag|10": { + "hashes": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "ececd684171f1ef2", + "hash_cont_tokens": "9fe0a5c42e1532db" + }, + "truncated": 0, + "non-truncated": 40168, + "padded": 40113, + "non-padded": 55, + "effective_few_shots": 10.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hashes": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "c54ff61ad0273dd7", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-anatomy|5": { + "hashes": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "be31a1e22aef5f90", + "hash_cont_tokens": "f11971a765cb609f" + }, + "truncated": 0, + "non-truncated": 540, + "padded": 540, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-astronomy|5": { + "hashes": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "277a7b1fad566940", + "hash_cont_tokens": "440a970fadecdc7b" + }, + "truncated": 0, + "non-truncated": 608, + "padded": 608, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-business_ethics|5": { + "hashes": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "ba552605bc116de5", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hashes": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "428c7563d0b98ab9", + "hash_cont_tokens": "7ecd60c25b9bfe5b" + }, + "truncated": 0, + "non-truncated": 1060, + "padded": 1060, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_biology|5": { + "hashes": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "da036601573942e2", + "hash_cont_tokens": "875cde3af7a0ee14" + }, + "truncated": 0, + "non-truncated": 576, + "padded": 576, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_chemistry|5": { + "hashes": { + 
"hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "94e0196d6aded13d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_computer_science|5": { + "hashes": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "6e4d0f4a8d36690b", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_mathematics|5": { + "hashes": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "614054d17109a25d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_medicine|5": { + "hashes": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "081bb2b524defd1c", + "hash_cont_tokens": "702fb6d82ff0d6ac" + }, + "truncated": 0, + "non-truncated": 692, + "padded": 692, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_physics|5": { + "hashes": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "5421d9a1af86cbd4", + "hash_cont_tokens": "f7b8097afc16a47c" + }, + "truncated": 0, + "non-truncated": 408, + "padded": 408, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-computer_security|5": { + "hashes": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "5e6b70ecb333cf18", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hashes": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "c2ef11a87264ceed", + "hash_cont_tokens": "aa0e8bc655f2f641" + }, + "truncated": 0, + "non-truncated": 940, + "padded": 940, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-econometrics|5": { + "hashes": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "ecaccd912a4c3978", + "hash_cont_tokens": "b1cc6e7e9fcd3827" + }, + "truncated": 0, + "non-truncated": 456, + "padded": 456, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hashes": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "1590c84291399be8", + "hash_cont_tokens": "2425a3f084a591ef" + }, + "truncated": 0, + "non-truncated": 580, + "padded": 580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hashes": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "3269597f715b0da1", + 
"hash_cont_tokens": "bd87bf0c060fd925" + }, + "truncated": 0, + "non-truncated": 1512, + "padded": 1512, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-formal_logic|5": { + "hashes": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "a2800d20f3ab8d7c", + "hash_cont_tokens": "eb8932890e0605db" + }, + "truncated": 0, + "non-truncated": 504, + "padded": 504, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-global_facts|5": { + "hashes": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "94ed44b3772505ad", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_biology|5": { + "hashes": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "24423acb928db768", + "hash_cont_tokens": "1ddcb86d28cde266" + }, + "truncated": 0, + "non-truncated": 1240, + "padded": 1240, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hashes": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "831ff35c474e5cef", + "hash_cont_tokens": "176c8dcff38c5f8f" + }, + "truncated": 0, + "non-truncated": 812, + "padded": 812, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hashes": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "a20a96b44dcc5b30", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hashes": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "5002f4ac8b1562ca", + "hash_cont_tokens": "674fc454bdc5ac93" + }, + "truncated": 0, + "non-truncated": 660, + "padded": 656, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_geography|5": { + "hashes": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "7c5547c7da5bc793", + "hash_cont_tokens": "03a5012b916274ea" + }, + "truncated": 0, + "non-truncated": 792, + "padded": 792, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hashes": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "f62991cb6a496b05", + "hash_cont_tokens": "873d2aab226ba1d8" + }, + "truncated": 0, + "non-truncated": 772, + "padded": 772, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hashes": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "4cef2aff6e3d59ed", + "hash_cont_tokens": "c583432ad27fcfe0" + }, + "truncated": 0, + "non-truncated": 1560, + "padded": 
1560, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hashes": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "6e2577ea4082ed2b", + "hash_cont_tokens": "d7907b61bcb8c123" + }, + "truncated": 0, + "non-truncated": 1080, + "padded": 1080, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hashes": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "c5fc9aeb1079c8e4", + "hash_cont_tokens": "f47f041de50333b9" + }, + "truncated": 0, + "non-truncated": 952, + "padded": 952, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_physics|5": { + "hashes": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "555fc385cffa84ca", + "hash_cont_tokens": "0d56317b3e5eedb5" + }, + "truncated": 0, + "non-truncated": 604, + "padded": 604, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hashes": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "febd23cbf9973b7f", + "hash_cont_tokens": "09ba1243e7390c0f" + }, + "truncated": 0, + "non-truncated": 2180, + "padded": 2180, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hashes": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "400e55b56ee6fbd7", + "hash_cont_tokens": "9cc29889c3d3f77d" + }, + "truncated": 0, + "non-truncated": 864, + "padded": 864, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hashes": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "c639cce12a46ebad", + "hash_cont_tokens": "cdd0b3dc06d933e5" + }, + "truncated": 0, + "non-truncated": 816, + "padded": 816, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hashes": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "b9762065cce6f3a6", + "hash_cont_tokens": "e02816433ff28daf" + }, + "truncated": 0, + "non-truncated": 948, + "padded": 948, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_aging|5": { + "hashes": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "541a75f071dcf579", + "hash_cont_tokens": "142a4a8a1138a214" + }, + "truncated": 0, + "non-truncated": 892, + "padded": 892, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_sexuality|5": { + "hashes": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "04269e5c5a257dd9", + "hash_cont_tokens": "bc54813e809b796d" + }, + "truncated": 0, + "non-truncated": 524, + "padded": 524, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + 
"harness|hendrycksTest-international_law|5": { + "hashes": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "d93ba9d9d38e4397", + "hash_cont_tokens": "8ea8c5ff76a15bca" + }, + "truncated": 0, + "non-truncated": 484, + "padded": 484, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-jurisprudence|5": { + "hashes": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "9eeaccd2698b4f5a", + "hash_cont_tokens": "e3a8cd951b6e3469" + }, + "truncated": 0, + "non-truncated": 432, + "padded": 432, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hashes": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "b4f08f544f2b7576", + "hash_cont_tokens": "3e9e0bdc248fd88a" + }, + "truncated": 0, + "non-truncated": 652, + "padded": 648, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-machine_learning|5": { + "hashes": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "900c2a51f1174b9f", + "hash_cont_tokens": "55b12fb138c6a064" + }, + "truncated": 0, + "non-truncated": 448, + "padded": 448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-management|5": { + "hashes": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "6b36efb4689c6eca", + "hash_cont_tokens": "a01d6d39a83c4597" + }, + "truncated": 0, + "non-truncated": 412, + "padded": 412, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-marketing|5": { + "hashes": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "2aaac78a0cfed47a", + "hash_cont_tokens": "6aeaed4d823c98aa" + }, + "truncated": 0, + "non-truncated": 936, + "padded": 936, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-medical_genetics|5": { + "hashes": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "886ca823b41c094a", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-miscellaneous|5": { + "hashes": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "72fd71de7675e7d0", + "hash_cont_tokens": "9b0ab02a64603081" + }, + "truncated": 0, + "non-truncated": 3132, + "padded": 3132, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_disputes|5": { + "hashes": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "f3ca0dd8e7a1eb09", + "hash_cont_tokens": "3b8bbe9108e55ce9" + }, + "truncated": 0, + "non-truncated": 1384, + "padded": 1354, + "non-padded": 30, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hashes": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": 
"3e793631e951f23c", + "hash_cont_tokens": "3e9bfc0362e97330" + }, + "truncated": 0, + "non-truncated": 3580, + "padded": 3580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-nutrition|5": { + "hashes": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "59753c2144ea93af", + "hash_cont_tokens": "23b2dc6ee2da4cfc" + }, + "truncated": 0, + "non-truncated": 1224, + "padded": 1224, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-philosophy|5": { + "hashes": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "bd8d3dbed15a8c34", + "hash_cont_tokens": "9f6ff69d23a48783" + }, + "truncated": 0, + "non-truncated": 1244, + "padded": 1244, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-prehistory|5": { + "hashes": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "3573cd87facbb7c5", + "hash_cont_tokens": "d6458d743d875837" + }, + "truncated": 0, + "non-truncated": 1296, + "padded": 1296, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_accounting|5": { + "hashes": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "17e721bc1a7cbb47", + "hash_cont_tokens": "922a195f53a35662" + }, + "truncated": 0, + "non-truncated": 1128, + "padded": 1128, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_law|5": { + "hashes": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "c9f7583fff66d361", + "hash_cont_tokens": "2e590029ef41fbcd" + }, + "truncated": 0, + "non-truncated": 6136, + "padded": 6136, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_medicine|5": { + "hashes": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "40a933f829116f8d", + "hash_cont_tokens": "7cfee54dbddd5a98" + }, + "truncated": 0, + "non-truncated": 1088, + "padded": 1088, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_psychology|5": { + "hashes": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "0dfb73a8eb3f692c", + "hash_cont_tokens": "a86677b2a45c20e1" + }, + "truncated": 0, + "non-truncated": 2448, + "padded": 2448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-public_relations|5": { + "hashes": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "1710c6ba4c9f3cbd", + "hash_cont_tokens": "0d756ccaae031757" + }, + "truncated": 0, + "non-truncated": 440, + "padded": 440, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-security_studies|5": { + "hashes": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "32a03f1f22a6e103", + "hash_cont_tokens": "b2229bc2cfbf594b" + }, + "truncated": 0, + "non-truncated": 980, + "padded": 980, + "non-padded": 0, + 
"effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-sociology|5": { + "hashes": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "828999f7624cbe7e", + "hash_cont_tokens": "c3a3bdfd177eed5b" + }, + "truncated": 0, + "non-truncated": 804, + "padded": 804, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hashes": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "42054621e718dbee", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-virology|5": { + "hashes": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "6c4f0aa4dc859c04", + "hash_cont_tokens": "af8b3658088cb37f" + }, + "truncated": 0, + "non-truncated": 664, + "padded": 664, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-world_religions|5": { + "hashes": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "6c75d44e092ff24f", + "hash_cont_tokens": "060118bef6de4e0a" + }, + "truncated": 0, + "non-truncated": 684, + "padded": 684, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|truthfulqa:mc|0": { + "hashes": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "2738d7ed7075faa7", + "hash_cont_tokens": "f5da56a132aab151" + }, + "truncated": 0, + "non-truncated": 9996, + "padded": 9996, + "non-padded": 0, + "effective_few_shots": 0.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "d84d18e9a963753d", + "hash_full_prompts": "12b540783521a8e6", + "hash_input_tokens": "5c73a7dce6ccf737", + "hash_cont_tokens": "71d56183130fecbd" + }, + "total_evaluation_time_secondes": "6374.828125953674", + "truncated": 0, + "non-truncated": 111019, + "padded": 110926, + "non-padded": 93, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/teknium/OpenHermes-13B/results_2023-09-13T02-06-09.559271.json b/eval-results/teknium/OpenHermes-13B/results_2023-09-13T02-06-09.559271.json new file mode 100644 index 0000000000000000000000000000000000000000..d76c95c3df31a0b059436cc43689f66e577d9e16 --- /dev/null +++ b/eval-results/teknium/OpenHermes-13B/results_2023-09-13T02-06-09.559271.json @@ -0,0 +1,1367 @@ +{ + "config_general": { + "model_name": "teknium/OpenHermes-13B", + "model_sha": "f09d0fe655ad57cce9179b7b40ea6f81e07db18c", + "model_size": "24.32 GB", + "model_dtype": "torch.bfloat16", + "lighteval_sha": "ff467795ccc45b291b69333c263d5f16abd1fcd9", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|arc:challenge|25": { + "acc": 0.5639931740614335, + "acc_stderr": 0.014491225699230916, + "acc_norm": 0.6015358361774744, + "acc_norm_stderr": 0.014306946052735567 + }, + "harness|hellaswag|10": { + "acc": 0.6249751045608445, + "acc_stderr": 0.0048313992185002345, + "acc_norm": 0.8218482374029078, + "acc_norm_stderr": 0.003818584384635532 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.35, + 
"acc_stderr": 0.04793724854411022, + "acc_norm": 0.35, + "acc_norm_stderr": 0.04793724854411022 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.4666666666666667, + "acc_stderr": 0.043097329010363554, + "acc_norm": 0.4666666666666667, + "acc_norm_stderr": 0.043097329010363554 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.5657894736842105, + "acc_stderr": 0.04033565667848319, + "acc_norm": 0.5657894736842105, + "acc_norm_stderr": 0.04033565667848319 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.54, + "acc_stderr": 0.05009082659620332, + "acc_norm": 0.54, + "acc_norm_stderr": 0.05009082659620332 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.5660377358490566, + "acc_stderr": 0.030503292013342596, + "acc_norm": 0.5660377358490566, + "acc_norm_stderr": 0.030503292013342596 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.5902777777777778, + "acc_stderr": 0.04112490974670787, + "acc_norm": 0.5902777777777778, + "acc_norm_stderr": 0.04112490974670787 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.39, + "acc_stderr": 0.04902071300001975, + "acc_norm": 0.39, + "acc_norm_stderr": 0.04902071300001975 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.44, + "acc_stderr": 0.04988876515698589, + "acc_norm": 0.44, + "acc_norm_stderr": 0.04988876515698589 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.41, + "acc_stderr": 0.049431107042371025, + "acc_norm": 0.41, + "acc_norm_stderr": 0.049431107042371025 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.5433526011560693, + "acc_stderr": 0.03798106566014498, + "acc_norm": 0.5433526011560693, + "acc_norm_stderr": 0.03798106566014498 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.22549019607843138, + "acc_stderr": 0.041583075330832865, + "acc_norm": 0.22549019607843138, + "acc_norm_stderr": 0.041583075330832865 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.67, + "acc_stderr": 0.047258156262526094, + "acc_norm": 0.67, + "acc_norm_stderr": 0.047258156262526094 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.4765957446808511, + "acc_stderr": 0.03265019475033583, + "acc_norm": 0.4765957446808511, + "acc_norm_stderr": 0.03265019475033583 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.2894736842105263, + "acc_stderr": 0.04266339443159394, + "acc_norm": 0.2894736842105263, + "acc_norm_stderr": 0.04266339443159394 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.4827586206896552, + "acc_stderr": 0.04164188720169377, + "acc_norm": 0.4827586206896552, + "acc_norm_stderr": 0.04164188720169377 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.36243386243386244, + "acc_stderr": 0.02475747390275206, + "acc_norm": 0.36243386243386244, + "acc_norm_stderr": 0.02475747390275206 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.3412698412698413, + "acc_stderr": 0.04240799327574924, + "acc_norm": 0.3412698412698413, + "acc_norm_stderr": 0.04240799327574924 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.37, + "acc_stderr": 0.048523658709391, + "acc_norm": 0.37, + "acc_norm_stderr": 0.048523658709391 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.6419354838709678, + "acc_stderr": 0.027273890594300642, + "acc_norm": 0.6419354838709678, + "acc_norm_stderr": 0.027273890594300642 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.47783251231527096, + "acc_stderr": 
0.035145285621750094, + "acc_norm": 0.47783251231527096, + "acc_norm_stderr": 0.035145285621750094 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.54, + "acc_stderr": 0.05009082659620332, + "acc_norm": 0.54, + "acc_norm_stderr": 0.05009082659620332 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.6484848484848484, + "acc_stderr": 0.037282069986826503, + "acc_norm": 0.6484848484848484, + "acc_norm_stderr": 0.037282069986826503 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.6767676767676768, + "acc_stderr": 0.033322999210706444, + "acc_norm": 0.6767676767676768, + "acc_norm_stderr": 0.033322999210706444 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.7823834196891192, + "acc_stderr": 0.029778663037752954, + "acc_norm": 0.7823834196891192, + "acc_norm_stderr": 0.029778663037752954 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.5256410256410257, + "acc_stderr": 0.02531764972644866, + "acc_norm": 0.5256410256410257, + "acc_norm_stderr": 0.02531764972644866 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.3111111111111111, + "acc_stderr": 0.028226446749683522, + "acc_norm": 0.3111111111111111, + "acc_norm_stderr": 0.028226446749683522 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.5756302521008403, + "acc_stderr": 0.032104790510157764, + "acc_norm": 0.5756302521008403, + "acc_norm_stderr": 0.032104790510157764 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.3509933774834437, + "acc_stderr": 0.03896981964257374, + "acc_norm": 0.3509933774834437, + "acc_norm_stderr": 0.03896981964257374 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.7302752293577982, + "acc_stderr": 0.01902848671111544, + "acc_norm": 0.7302752293577982, + "acc_norm_stderr": 0.01902848671111544 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.48148148148148145, + "acc_stderr": 0.03407632093854053, + "acc_norm": 0.48148148148148145, + "acc_norm_stderr": 0.03407632093854053 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.7745098039215687, + "acc_stderr": 0.029331162294251735, + "acc_norm": 0.7745098039215687, + "acc_norm_stderr": 0.029331162294251735 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.7426160337552743, + "acc_stderr": 0.028458820991460295, + "acc_norm": 0.7426160337552743, + "acc_norm_stderr": 0.028458820991460295 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.6457399103139013, + "acc_stderr": 0.032100621541349864, + "acc_norm": 0.6457399103139013, + "acc_norm_stderr": 0.032100621541349864 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.648854961832061, + "acc_stderr": 0.04186445163013751, + "acc_norm": 0.648854961832061, + "acc_norm_stderr": 0.04186445163013751 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.7520661157024794, + "acc_stderr": 0.039418975265163025, + "acc_norm": 0.7520661157024794, + "acc_norm_stderr": 0.039418975265163025 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.7314814814814815, + "acc_stderr": 0.042844679680521934, + "acc_norm": 0.7314814814814815, + "acc_norm_stderr": 0.042844679680521934 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.6809815950920245, + "acc_stderr": 0.03661997551073836, + "acc_norm": 0.6809815950920245, + "acc_norm_stderr": 0.03661997551073836 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 
0.32142857142857145, + "acc_stderr": 0.04432804055291519, + "acc_norm": 0.32142857142857145, + "acc_norm_stderr": 0.04432804055291519 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.7087378640776699, + "acc_stderr": 0.04498676320572924, + "acc_norm": 0.7087378640776699, + "acc_norm_stderr": 0.04498676320572924 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.8290598290598291, + "acc_stderr": 0.024662496845209807, + "acc_norm": 0.8290598290598291, + "acc_norm_stderr": 0.024662496845209807 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.62, + "acc_stderr": 0.048783173121456316, + "acc_norm": 0.62, + "acc_norm_stderr": 0.048783173121456316 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.7509578544061303, + "acc_stderr": 0.015464676163395951, + "acc_norm": 0.7509578544061303, + "acc_norm_stderr": 0.015464676163395951 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.6473988439306358, + "acc_stderr": 0.02572280220089581, + "acc_norm": 0.6473988439306358, + "acc_norm_stderr": 0.02572280220089581 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.42793296089385474, + "acc_stderr": 0.016547887997416105, + "acc_norm": 0.42793296089385474, + "acc_norm_stderr": 0.016547887997416105 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.6241830065359477, + "acc_stderr": 0.02773283435336394, + "acc_norm": 0.6241830065359477, + "acc_norm_stderr": 0.02773283435336394 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.6141479099678456, + "acc_stderr": 0.027648149599751464, + "acc_norm": 0.6141479099678456, + "acc_norm_stderr": 0.027648149599751464 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.6327160493827161, + "acc_stderr": 0.026822801759507898, + "acc_norm": 0.6327160493827161, + "acc_norm_stderr": 0.026822801759507898 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.4432624113475177, + "acc_stderr": 0.029634838473766006, + "acc_norm": 0.4432624113475177, + "acc_norm_stderr": 0.029634838473766006 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.4315514993481095, + "acc_stderr": 0.01265000799946388, + "acc_norm": 0.4315514993481095, + "acc_norm_stderr": 0.01265000799946388 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.5477941176470589, + "acc_stderr": 0.03023375855159645, + "acc_norm": 0.5477941176470589, + "acc_norm_stderr": 0.03023375855159645 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.545751633986928, + "acc_stderr": 0.020142974553795198, + "acc_norm": 0.545751633986928, + "acc_norm_stderr": 0.020142974553795198 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.6545454545454545, + "acc_stderr": 0.04554619617541054, + "acc_norm": 0.6545454545454545, + "acc_norm_stderr": 0.04554619617541054 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.6530612244897959, + "acc_stderr": 0.030472526026726492, + "acc_norm": 0.6530612244897959, + "acc_norm_stderr": 0.030472526026726492 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.7313432835820896, + "acc_stderr": 0.03134328358208955, + "acc_norm": 0.7313432835820896, + "acc_norm_stderr": 0.03134328358208955 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.8, + "acc_stderr": 0.04020151261036847, + "acc_norm": 0.8, + "acc_norm_stderr": 0.04020151261036847 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.46987951807228917, + "acc_stderr": 0.03885425420866767, + "acc_norm": 0.46987951807228917, + "acc_norm_stderr": 0.03885425420866767 
+ }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.7777777777777778, + "acc_stderr": 0.031885780176863984, + "acc_norm": 0.7777777777777778, + "acc_norm_stderr": 0.031885780176863984 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.31946144430844553, + "mc1_stderr": 0.016322644182960498, + "mc2": 0.459815379294228, + "mc2_stderr": 0.015281682974346678 + }, + "all": { + "acc": 0.5630411968474885, + "acc_stderr": 0.034493899434396784, + "acc_norm": 0.567014345914575, + "acc_norm_stderr": 0.03447360969744153, + "mc1": 0.31946144430844553, + "mc1_stderr": 0.016322644182960498, + "mc2": 0.459815379294228, + "mc2_stderr": 0.015281682974346678 + } + }, + "versions": { + "harness|arc:challenge|25": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 
1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "all": 0 + }, + "config_tasks": { + "harness|arc:challenge": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM 
Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task" + }, + "summary_tasks": { + "harness|arc:challenge|25": { + "hashes": { + "hash_examples": "17b0cae357c0259e", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "3722289b79076c44", + "hash_cont_tokens": "e8abf848493b50f7" + }, + "truncated": 0, + "non-truncated": 4687, + "padded": 4687, + "non-padded": 0, + "effective_few_shots": 25.0, + "num_truncated_few_shots": 0 + }, + "harness|hellaswag|10": { + "hashes": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "ececd684171f1ef2", + "hash_cont_tokens": "9fe0a5c42e1532db" + }, + "truncated": 0, + "non-truncated": 40168, + "padded": 40113, + "non-padded": 55, + "effective_few_shots": 10.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hashes": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "c54ff61ad0273dd7", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-anatomy|5": { + "hashes": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "be31a1e22aef5f90", + "hash_cont_tokens": "f11971a765cb609f" + }, + "truncated": 0, + "non-truncated": 540, + "padded": 540, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-astronomy|5": { + "hashes": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "277a7b1fad566940", + "hash_cont_tokens": "440a970fadecdc7b" + }, + "truncated": 0, + "non-truncated": 608, + "padded": 608, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-business_ethics|5": { + "hashes": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "ba552605bc116de5", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hashes": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "428c7563d0b98ab9", + "hash_cont_tokens": "7ecd60c25b9bfe5b" + }, + "truncated": 0, + "non-truncated": 1060, + "padded": 1060, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_biology|5": { + "hashes": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "da036601573942e2", + "hash_cont_tokens": "875cde3af7a0ee14" + }, + "truncated": 0, + "non-truncated": 576, + "padded": 576, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_chemistry|5": { + "hashes": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "94e0196d6aded13d", + 
"hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_computer_science|5": { + "hashes": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "6e4d0f4a8d36690b", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_mathematics|5": { + "hashes": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "614054d17109a25d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_medicine|5": { + "hashes": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "081bb2b524defd1c", + "hash_cont_tokens": "702fb6d82ff0d6ac" + }, + "truncated": 0, + "non-truncated": 692, + "padded": 692, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_physics|5": { + "hashes": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "5421d9a1af86cbd4", + "hash_cont_tokens": "f7b8097afc16a47c" + }, + "truncated": 0, + "non-truncated": 408, + "padded": 408, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-computer_security|5": { + "hashes": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "5e6b70ecb333cf18", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hashes": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "c2ef11a87264ceed", + "hash_cont_tokens": "aa0e8bc655f2f641" + }, + "truncated": 0, + "non-truncated": 940, + "padded": 940, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-econometrics|5": { + "hashes": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "ecaccd912a4c3978", + "hash_cont_tokens": "b1cc6e7e9fcd3827" + }, + "truncated": 0, + "non-truncated": 456, + "padded": 456, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hashes": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "1590c84291399be8", + "hash_cont_tokens": "2425a3f084a591ef" + }, + "truncated": 0, + "non-truncated": 580, + "padded": 580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hashes": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "3269597f715b0da1", + "hash_cont_tokens": "bd87bf0c060fd925" + }, + "truncated": 0, + "non-truncated": 1512, + "padded": 1512, + "non-padded": 0, + 
"effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-formal_logic|5": { + "hashes": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "a2800d20f3ab8d7c", + "hash_cont_tokens": "eb8932890e0605db" + }, + "truncated": 0, + "non-truncated": 504, + "padded": 504, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-global_facts|5": { + "hashes": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "94ed44b3772505ad", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_biology|5": { + "hashes": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "24423acb928db768", + "hash_cont_tokens": "1ddcb86d28cde266" + }, + "truncated": 0, + "non-truncated": 1240, + "padded": 1240, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hashes": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "831ff35c474e5cef", + "hash_cont_tokens": "176c8dcff38c5f8f" + }, + "truncated": 0, + "non-truncated": 812, + "padded": 812, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hashes": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "a20a96b44dcc5b30", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hashes": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "5002f4ac8b1562ca", + "hash_cont_tokens": "674fc454bdc5ac93" + }, + "truncated": 0, + "non-truncated": 660, + "padded": 656, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_geography|5": { + "hashes": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "7c5547c7da5bc793", + "hash_cont_tokens": "03a5012b916274ea" + }, + "truncated": 0, + "non-truncated": 792, + "padded": 792, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hashes": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "f62991cb6a496b05", + "hash_cont_tokens": "873d2aab226ba1d8" + }, + "truncated": 0, + "non-truncated": 772, + "padded": 772, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hashes": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "4cef2aff6e3d59ed", + "hash_cont_tokens": "c583432ad27fcfe0" + }, + "truncated": 0, + "non-truncated": 1560, + "padded": 1560, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + 
"harness|hendrycksTest-high_school_mathematics|5": { + "hashes": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "6e2577ea4082ed2b", + "hash_cont_tokens": "d7907b61bcb8c123" + }, + "truncated": 0, + "non-truncated": 1080, + "padded": 1080, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hashes": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "c5fc9aeb1079c8e4", + "hash_cont_tokens": "f47f041de50333b9" + }, + "truncated": 0, + "non-truncated": 952, + "padded": 952, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_physics|5": { + "hashes": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "555fc385cffa84ca", + "hash_cont_tokens": "0d56317b3e5eedb5" + }, + "truncated": 0, + "non-truncated": 604, + "padded": 604, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hashes": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "febd23cbf9973b7f", + "hash_cont_tokens": "09ba1243e7390c0f" + }, + "truncated": 0, + "non-truncated": 2180, + "padded": 2180, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hashes": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "400e55b56ee6fbd7", + "hash_cont_tokens": "9cc29889c3d3f77d" + }, + "truncated": 0, + "non-truncated": 864, + "padded": 864, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hashes": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "c639cce12a46ebad", + "hash_cont_tokens": "cdd0b3dc06d933e5" + }, + "truncated": 0, + "non-truncated": 816, + "padded": 816, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hashes": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "b9762065cce6f3a6", + "hash_cont_tokens": "e02816433ff28daf" + }, + "truncated": 0, + "non-truncated": 948, + "padded": 948, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_aging|5": { + "hashes": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "541a75f071dcf579", + "hash_cont_tokens": "142a4a8a1138a214" + }, + "truncated": 0, + "non-truncated": 892, + "padded": 892, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_sexuality|5": { + "hashes": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "04269e5c5a257dd9", + "hash_cont_tokens": "bc54813e809b796d" + }, + "truncated": 0, + "non-truncated": 524, + "padded": 524, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-international_law|5": { + "hashes": { + "hash_examples": "1300bfd0dfc59114", + 
"hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "d93ba9d9d38e4397", + "hash_cont_tokens": "8ea8c5ff76a15bca" + }, + "truncated": 0, + "non-truncated": 484, + "padded": 484, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-jurisprudence|5": { + "hashes": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "9eeaccd2698b4f5a", + "hash_cont_tokens": "e3a8cd951b6e3469" + }, + "truncated": 0, + "non-truncated": 432, + "padded": 432, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hashes": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "b4f08f544f2b7576", + "hash_cont_tokens": "3e9e0bdc248fd88a" + }, + "truncated": 0, + "non-truncated": 652, + "padded": 648, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-machine_learning|5": { + "hashes": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "900c2a51f1174b9f", + "hash_cont_tokens": "55b12fb138c6a064" + }, + "truncated": 0, + "non-truncated": 448, + "padded": 448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-management|5": { + "hashes": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "6b36efb4689c6eca", + "hash_cont_tokens": "a01d6d39a83c4597" + }, + "truncated": 0, + "non-truncated": 412, + "padded": 412, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-marketing|5": { + "hashes": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "2aaac78a0cfed47a", + "hash_cont_tokens": "6aeaed4d823c98aa" + }, + "truncated": 0, + "non-truncated": 936, + "padded": 936, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-medical_genetics|5": { + "hashes": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "886ca823b41c094a", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-miscellaneous|5": { + "hashes": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "72fd71de7675e7d0", + "hash_cont_tokens": "9b0ab02a64603081" + }, + "truncated": 0, + "non-truncated": 3132, + "padded": 3132, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_disputes|5": { + "hashes": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "f3ca0dd8e7a1eb09", + "hash_cont_tokens": "3b8bbe9108e55ce9" + }, + "truncated": 0, + "non-truncated": 1384, + "padded": 1354, + "non-padded": 30, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hashes": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "3e793631e951f23c", + "hash_cont_tokens": "3e9bfc0362e97330" + }, + "truncated": 0, + "non-truncated": 3580, + 
"padded": 3580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-nutrition|5": { + "hashes": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "59753c2144ea93af", + "hash_cont_tokens": "23b2dc6ee2da4cfc" + }, + "truncated": 0, + "non-truncated": 1224, + "padded": 1224, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-philosophy|5": { + "hashes": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "bd8d3dbed15a8c34", + "hash_cont_tokens": "9f6ff69d23a48783" + }, + "truncated": 0, + "non-truncated": 1244, + "padded": 1244, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-prehistory|5": { + "hashes": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "3573cd87facbb7c5", + "hash_cont_tokens": "d6458d743d875837" + }, + "truncated": 0, + "non-truncated": 1296, + "padded": 1296, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_accounting|5": { + "hashes": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "17e721bc1a7cbb47", + "hash_cont_tokens": "922a195f53a35662" + }, + "truncated": 0, + "non-truncated": 1128, + "padded": 1128, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_law|5": { + "hashes": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "c9f7583fff66d361", + "hash_cont_tokens": "2e590029ef41fbcd" + }, + "truncated": 0, + "non-truncated": 6136, + "padded": 6136, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_medicine|5": { + "hashes": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "40a933f829116f8d", + "hash_cont_tokens": "7cfee54dbddd5a98" + }, + "truncated": 0, + "non-truncated": 1088, + "padded": 1088, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_psychology|5": { + "hashes": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "0dfb73a8eb3f692c", + "hash_cont_tokens": "a86677b2a45c20e1" + }, + "truncated": 0, + "non-truncated": 2448, + "padded": 2448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-public_relations|5": { + "hashes": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "1710c6ba4c9f3cbd", + "hash_cont_tokens": "0d756ccaae031757" + }, + "truncated": 0, + "non-truncated": 440, + "padded": 440, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-security_studies|5": { + "hashes": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "32a03f1f22a6e103", + "hash_cont_tokens": "b2229bc2cfbf594b" + }, + "truncated": 0, + "non-truncated": 980, + "padded": 980, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-sociology|5": { + 
"hashes": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "828999f7624cbe7e", + "hash_cont_tokens": "c3a3bdfd177eed5b" + }, + "truncated": 0, + "non-truncated": 804, + "padded": 804, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hashes": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "42054621e718dbee", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-virology|5": { + "hashes": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "6c4f0aa4dc859c04", + "hash_cont_tokens": "af8b3658088cb37f" + }, + "truncated": 0, + "non-truncated": 664, + "padded": 664, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-world_religions|5": { + "hashes": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "6c75d44e092ff24f", + "hash_cont_tokens": "060118bef6de4e0a" + }, + "truncated": 0, + "non-truncated": 684, + "padded": 684, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|truthfulqa:mc|0": { + "hashes": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "2738d7ed7075faa7", + "hash_cont_tokens": "f5da56a132aab151" + }, + "truncated": 0, + "non-truncated": 9996, + "padded": 9996, + "non-padded": 0, + "effective_few_shots": 0.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "d84d18e9a963753d", + "hash_full_prompts": "12b540783521a8e6", + "hash_input_tokens": "5c73a7dce6ccf737", + "hash_cont_tokens": "71d56183130fecbd" + }, + "total_evaluation_time_secondes": "6709.942146778107", + "truncated": 0, + "non-truncated": 111019, + "padded": 110926, + "non-padded": 93, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/teknium/OpenHermes-13B/results_2023-10-24T20-23-56.851767.json b/eval-results/teknium/OpenHermes-13B/results_2023-10-24T20-23-56.851767.json new file mode 100644 index 0000000000000000000000000000000000000000..2f952f87bb708c0958772770c63f76e0dba362cb --- /dev/null +++ b/eval-results/teknium/OpenHermes-13B/results_2023-10-24T20-23-56.851767.json @@ -0,0 +1,107 @@ +{ + "config_general": { + "model_name": "teknium/OpenHermes-13B", + "model_sha": "bcad6fff9f8591e091d2d57356a3f102197e8c5f", + "model_size": "24.32 GB", + "model_dtype": "torch.float16", + "lighteval_sha": "0f318ecf002208468154899217b3ba7c6ae09374", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|drop|3": { + "em": 0.003984899328859061, + "em_stderr": 0.0006451805848102473, + "f1": 0.06597944630872499, + "f1_stderr": 0.0014689416324005639 + }, + "harness|gsm8k|5": { + "acc": 0.11599696739954511, + "acc_stderr": 0.008820485491442487 + }, + "harness|winogrande|5": { + "acc": 0.7545382794001578, + "acc_stderr": 0.012095272937183644 + }, + "all": { + "em": 0.003984899328859061, + "em_stderr": 0.0006451805848102473, + "f1": 0.06597944630872499, + "f1_stderr": 0.0014689416324005639, + "acc": 0.4352676233998515, + 
"acc_stderr": 0.010457879214313065 + } + }, + "versions": { + "harness|drop|3": 1, + "harness|gsm8k|5": 0, + "harness|winogrande|5": 0, + "all": 0 + }, + "config_tasks": { + "harness|drop": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|drop|3": { + "hashes": { + "hash_examples": "1d27416e8324e9a3", + "hash_full_prompts": "a5513ff9a741b385", + "hash_input_tokens": "42076f0efbb50aa6", + "hash_cont_tokens": "6516ad72234505ad" + }, + "truncated": 3, + "non-truncated": 9533, + "padded": 0, + "non-padded": 9536, + "effective_few_shots": 3.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "bda342e47b5099b2", + "hash_cont_tokens": "88bba31bff0c60c8" + }, + "truncated": 0, + "non-truncated": 1319, + "padded": 0, + "non-padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "c0bedf98cb040854", + "hash_cont_tokens": "f08975ad6f2d5864" + }, + "truncated": 0, + "non-truncated": 2534, + "padded": 2432, + "non-padded": 102, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "9b4d8993161e637d", + "hash_full_prompts": "08215e527b7e60a5", + "hash_input_tokens": "a12f3e3c934bd78b", + "hash_cont_tokens": "a537fb441b4ecfcc" + }, + "total_evaluation_time_secondes": "39340.12885594368", + "truncated": 3, + "non-truncated": 13386, + "padded": 2432, + "non-padded": 10957, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/teknium/OpenHermes-2-Mistral-7B/results_2023-10-17T08-19-50.329623.json b/eval-results/teknium/OpenHermes-2-Mistral-7B/results_2023-10-17T08-19-50.329623.json new file mode 100644 index 0000000000000000000000000000000000000000..3c755f1b89880f2e4cf3ecf4065fbaae17c83e44 --- /dev/null +++ b/eval-results/teknium/OpenHermes-2-Mistral-7B/results_2023-10-17T08-19-50.329623.json @@ -0,0 +1,1367 @@ +{ + "config_general": { + "lighteval_sha": "3df803626354cb9132d5a3a96960d19c76252f61", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "", + "model_name": "teknium/OpenHermes-2-Mistral-7B", + "model_sha": "843a9bb94fac7d7bfc1b7c9f201efba295b6f5d6", + "model_dtype": "torch.bfloat16", + "model_size": "13.99 GB" + }, + "results": { + "harness|arc:challenge|25": { + "acc": 0.6006825938566553, + "acc_stderr": 0.014312094557946716, + "acc_norm": 0.6305460750853242, + "acc_norm_stderr": 0.014104578366491887 + }, + "harness|hellaswag|10": { + "acc": 0.6379207329217288, + "acc_stderr": 0.004796193584930074, + "acc_norm": 0.8380800637323242, + "acc_norm_stderr": 0.0036762448867232607 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.3, + "acc_stderr": 0.046056618647183814, + "acc_norm": 0.3, + "acc_norm_stderr": 0.046056618647183814 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.6074074074074074, + "acc_stderr": 0.04218506215368879, + "acc_norm": 0.6074074074074074, + "acc_norm_stderr": 0.04218506215368879 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.7171052631578947, + "acc_stderr": 0.03665349695640767, + "acc_norm": 0.7171052631578947, + "acc_norm_stderr": 0.03665349695640767 + }, + 
"harness|hendrycksTest-business_ethics|5": { + "acc": 0.6, + "acc_stderr": 0.04923659639173309, + "acc_norm": 0.6, + "acc_norm_stderr": 0.04923659639173309 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.6754716981132075, + "acc_stderr": 0.028815615713432115, + "acc_norm": 0.6754716981132075, + "acc_norm_stderr": 0.028815615713432115 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.7569444444444444, + "acc_stderr": 0.0358687928008034, + "acc_norm": 0.7569444444444444, + "acc_norm_stderr": 0.0358687928008034 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.45, + "acc_stderr": 0.05, + "acc_norm": 0.45, + "acc_norm_stderr": 0.05 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.5, + "acc_stderr": 0.050251890762960605, + "acc_norm": 0.5, + "acc_norm_stderr": 0.050251890762960605 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.33, + "acc_stderr": 0.047258156262526045, + "acc_norm": 0.33, + "acc_norm_stderr": 0.047258156262526045 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.6011560693641619, + "acc_stderr": 0.037336266553835096, + "acc_norm": 0.6011560693641619, + "acc_norm_stderr": 0.037336266553835096 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.38235294117647056, + "acc_stderr": 0.04835503696107223, + "acc_norm": 0.38235294117647056, + "acc_norm_stderr": 0.04835503696107223 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.76, + "acc_stderr": 0.042923469599092816, + "acc_norm": 0.76, + "acc_norm_stderr": 0.042923469599092816 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.5617021276595745, + "acc_stderr": 0.03243618636108101, + "acc_norm": 0.5617021276595745, + "acc_norm_stderr": 0.03243618636108101 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.47368421052631576, + "acc_stderr": 0.04697085136647863, + "acc_norm": 0.47368421052631576, + "acc_norm_stderr": 0.04697085136647863 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.5310344827586206, + "acc_stderr": 0.04158632762097828, + "acc_norm": 0.5310344827586206, + "acc_norm_stderr": 0.04158632762097828 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.4126984126984127, + "acc_stderr": 0.025355741263055266, + "acc_norm": 0.4126984126984127, + "acc_norm_stderr": 0.025355741263055266 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.42857142857142855, + "acc_stderr": 0.0442626668137991, + "acc_norm": 0.42857142857142855, + "acc_norm_stderr": 0.0442626668137991 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.38, + "acc_stderr": 0.04878317312145632, + "acc_norm": 0.38, + "acc_norm_stderr": 0.04878317312145632 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.7516129032258064, + "acc_stderr": 0.02458002892148101, + "acc_norm": 0.7516129032258064, + "acc_norm_stderr": 0.02458002892148101 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.5024630541871922, + "acc_stderr": 0.035179450386910616, + "acc_norm": 0.5024630541871922, + "acc_norm_stderr": 0.035179450386910616 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.67, + "acc_stderr": 0.04725815626252607, + "acc_norm": 0.67, + "acc_norm_stderr": 0.04725815626252607 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.7818181818181819, + "acc_stderr": 0.03225078108306289, + "acc_norm": 0.7818181818181819, + "acc_norm_stderr": 0.03225078108306289 + }, + 
"harness|hendrycksTest-high_school_geography|5": { + "acc": 0.7929292929292929, + "acc_stderr": 0.028869778460267042, + "acc_norm": 0.7929292929292929, + "acc_norm_stderr": 0.028869778460267042 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.8756476683937824, + "acc_stderr": 0.02381447708659355, + "acc_norm": 0.8756476683937824, + "acc_norm_stderr": 0.02381447708659355 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.5974358974358974, + "acc_stderr": 0.02486499515976775, + "acc_norm": 0.5974358974358974, + "acc_norm_stderr": 0.02486499515976775 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.3148148148148148, + "acc_stderr": 0.02831753349606648, + "acc_norm": 0.3148148148148148, + "acc_norm_stderr": 0.02831753349606648 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.6218487394957983, + "acc_stderr": 0.03149930577784906, + "acc_norm": 0.6218487394957983, + "acc_norm_stderr": 0.03149930577784906 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.31788079470198677, + "acc_stderr": 0.038020397601079024, + "acc_norm": 0.31788079470198677, + "acc_norm_stderr": 0.038020397601079024 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.8348623853211009, + "acc_stderr": 0.01591955782997604, + "acc_norm": 0.8348623853211009, + "acc_norm_stderr": 0.01591955782997604 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.4722222222222222, + "acc_stderr": 0.0340470532865388, + "acc_norm": 0.4722222222222222, + "acc_norm_stderr": 0.0340470532865388 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.8235294117647058, + "acc_stderr": 0.026756401538078962, + "acc_norm": 0.8235294117647058, + "acc_norm_stderr": 0.026756401538078962 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.810126582278481, + "acc_stderr": 0.02553010046023349, + "acc_norm": 0.810126582278481, + "acc_norm_stderr": 0.02553010046023349 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.6995515695067265, + "acc_stderr": 0.030769352008229146, + "acc_norm": 0.6995515695067265, + "acc_norm_stderr": 0.030769352008229146 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.7633587786259542, + "acc_stderr": 0.03727673575596914, + "acc_norm": 0.7633587786259542, + "acc_norm_stderr": 0.03727673575596914 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.7933884297520661, + "acc_stderr": 0.03695980128098824, + "acc_norm": 0.7933884297520661, + "acc_norm_stderr": 0.03695980128098824 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.7870370370370371, + "acc_stderr": 0.0395783547198098, + "acc_norm": 0.7870370370370371, + "acc_norm_stderr": 0.0395783547198098 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.7914110429447853, + "acc_stderr": 0.031921934489347235, + "acc_norm": 0.7914110429447853, + "acc_norm_stderr": 0.031921934489347235 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.5178571428571429, + "acc_stderr": 0.047427623612430116, + "acc_norm": 0.5178571428571429, + "acc_norm_stderr": 0.047427623612430116 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.7766990291262136, + "acc_stderr": 0.04123553189891431, + "acc_norm": 0.7766990291262136, + "acc_norm_stderr": 0.04123553189891431 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.8846153846153846, + "acc_stderr": 0.020930193185179333, + "acc_norm": 0.8846153846153846, + "acc_norm_stderr": 
0.020930193185179333 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.7, + "acc_stderr": 0.046056618647183814, + "acc_norm": 0.7, + "acc_norm_stderr": 0.046056618647183814 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.8326947637292464, + "acc_stderr": 0.013347327202920332, + "acc_norm": 0.8326947637292464, + "acc_norm_stderr": 0.013347327202920332 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.7341040462427746, + "acc_stderr": 0.023786203255508297, + "acc_norm": 0.7341040462427746, + "acc_norm_stderr": 0.023786203255508297 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.3642458100558659, + "acc_stderr": 0.016094338768474596, + "acc_norm": 0.3642458100558659, + "acc_norm_stderr": 0.016094338768474596 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.7418300653594772, + "acc_stderr": 0.02505850331695814, + "acc_norm": 0.7418300653594772, + "acc_norm_stderr": 0.02505850331695814 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.7009646302250804, + "acc_stderr": 0.02600330111788514, + "acc_norm": 0.7009646302250804, + "acc_norm_stderr": 0.02600330111788514 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.7191358024691358, + "acc_stderr": 0.025006469755799208, + "acc_norm": 0.7191358024691358, + "acc_norm_stderr": 0.025006469755799208 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.5, + "acc_stderr": 0.029827499313594685, + "acc_norm": 0.5, + "acc_norm_stderr": 0.029827499313594685 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.46870925684485004, + "acc_stderr": 0.01274520462608314, + "acc_norm": 0.46870925684485004, + "acc_norm_stderr": 0.01274520462608314 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.6470588235294118, + "acc_stderr": 0.029029422815681393, + "acc_norm": 0.6470588235294118, + "acc_norm_stderr": 0.029029422815681393 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.6617647058823529, + "acc_stderr": 0.01913994374848703, + "acc_norm": 0.6617647058823529, + "acc_norm_stderr": 0.01913994374848703 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.6363636363636364, + "acc_stderr": 0.04607582090719976, + "acc_norm": 0.6363636363636364, + "acc_norm_stderr": 0.04607582090719976 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.7306122448979592, + "acc_stderr": 0.02840125202902294, + "acc_norm": 0.7306122448979592, + "acc_norm_stderr": 0.02840125202902294 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.845771144278607, + "acc_stderr": 0.025538433368578337, + "acc_norm": 0.845771144278607, + "acc_norm_stderr": 0.025538433368578337 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.88, + "acc_stderr": 0.03265986323710906, + "acc_norm": 0.88, + "acc_norm_stderr": 0.03265986323710906 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.5240963855421686, + "acc_stderr": 0.03887971849597264, + "acc_norm": 0.5240963855421686, + "acc_norm_stderr": 0.03887971849597264 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.8362573099415205, + "acc_stderr": 0.028380919596145866, + "acc_norm": 0.8362573099415205, + "acc_norm_stderr": 0.028380919596145866 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.3329253365973072, + "mc1_stderr": 0.016497402382012052, + "mc2": 0.5024236235238323, + "mc2_stderr": 0.015034918880371569 + }, + "all": { + "acc": 0.6340923864588642, + "acc_stderr": 0.03292343427112481, + "acc_norm": 0.6379910781883433, + "acc_norm_stderr": 0.03290093486621529, 
+ "mc1": 0.3329253365973072, + "mc1_stderr": 0.016497402382012052, + "mc2": 0.5024236235238323, + "mc2_stderr": 0.015034918880371569 + } + }, + "versions": { + "all": 0, + "harness|arc:challenge|25": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1 + }, + "config_tasks": { + "harness|arc:challenge": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + 
"harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task" + }, + "summary_tasks": { + "harness|arc:challenge|25": { + "hashes": { + "hash_examples": "17b0cae357c0259e", + "hash_full_prompts": 
"045cbb916e5145c6", + "hash_input_tokens": "e43adcaa871b1364", + "hash_cont_tokens": "289aa98c400841d8" + }, + "truncated": 0, + "non_truncated": 1172, + "padded": 4684, + "non_padded": 3, + "effective_few_shots": 25.0, + "num_truncated_few_shots": 0 + }, + "harness|hellaswag|10": { + "hashes": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "08da6b3d0798f3e5", + "hash_cont_tokens": "ac460260c3e6efc9" + }, + "truncated": 0, + "non_truncated": 10042, + "padded": 40039, + "non_padded": 129, + "effective_few_shots": 10.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hashes": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "5e2b26eb9b4d08bf", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-anatomy|5": { + "hashes": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "d33cda9df28030eb", + "hash_cont_tokens": "a52a4f60d98cbe5c" + }, + "truncated": 0, + "non_truncated": 135, + "padded": 540, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-astronomy|5": { + "hashes": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "0dd50c500d64c57d", + "hash_cont_tokens": "10f7d8eeba97841d" + }, + "truncated": 0, + "non_truncated": 152, + "padded": 608, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-business_ethics|5": { + "hashes": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "40b524d0df3defc2", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hashes": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "1f87d12d677e0dfd", + "hash_cont_tokens": "edef9975ba9165b5" + }, + "truncated": 0, + "non_truncated": 265, + "padded": 1056, + "non_padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_biology|5": { + "hashes": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "dd6d69d8b13afbeb", + "hash_cont_tokens": "0aa103ec6602280b" + }, + "truncated": 0, + "non_truncated": 144, + "padded": 572, + "non_padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_chemistry|5": { + "hashes": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "d45f3c401a00e97e", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_computer_science|5": { + "hashes": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "c04f21d954ae67b2", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + 
"non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_mathematics|5": { + "hashes": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "e7de03b4e1a407d8", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_medicine|5": { + "hashes": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "9ce9516475f0b09c", + "hash_cont_tokens": "1979021dbc698754" + }, + "truncated": 0, + "non_truncated": 173, + "padded": 684, + "non_padded": 8, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_physics|5": { + "hashes": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "f749592a0d6c967d", + "hash_cont_tokens": "7cf7fe2bab00acbd" + }, + "truncated": 0, + "non_truncated": 102, + "padded": 404, + "non_padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-computer_security|5": { + "hashes": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "1a6dccf2066f3598", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hashes": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "6ce98c8aec8e7514", + "hash_cont_tokens": "903f64eed2b0d217" + }, + "truncated": 0, + "non_truncated": 235, + "padded": 940, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-econometrics|5": { + "hashes": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "7794b03bf6b9bb11", + "hash_cont_tokens": "721ae6c5302c4bf2" + }, + "truncated": 0, + "non_truncated": 114, + "padded": 456, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hashes": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "e47ff85e05850517", + "hash_cont_tokens": "15a738960ed3e587" + }, + "truncated": 0, + "non_truncated": 145, + "padded": 580, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hashes": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "2ce6901704311790", + "hash_cont_tokens": "c96470462fc71683" + }, + "truncated": 0, + "non_truncated": 378, + "padded": 1512, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-formal_logic|5": { + "hashes": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "fa49c3faa72a3955", + "hash_cont_tokens": "0e1ce025c9d6ee7e" + }, + "truncated": 0, + "non_truncated": 126, + "padded": 504, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-global_facts|5": { + "hashes": { + 
"hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "38992a391c7040d5", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 396, + "non_padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_biology|5": { + "hashes": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "4944fad6e0578120", + "hash_cont_tokens": "e34d57f7d3c4ca16" + }, + "truncated": 0, + "non_truncated": 310, + "padded": 1240, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hashes": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "bec955dfccee0331", + "hash_cont_tokens": "e8482d44df4b3740" + }, + "truncated": 0, + "non_truncated": 203, + "padded": 796, + "non_padded": 16, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hashes": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "2ccfe020e0a8e824", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hashes": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "5e5e8bf3808e0ead", + "hash_cont_tokens": "d63e679a49418339" + }, + "truncated": 0, + "non_truncated": 165, + "padded": 656, + "non_padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_geography|5": { + "hashes": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "6a624d76e1b40f9d", + "hash_cont_tokens": "d78483e286d06f1a" + }, + "truncated": 0, + "non_truncated": 198, + "padded": 792, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hashes": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "8340aed0285230f4", + "hash_cont_tokens": "691cdff71ff5fe57" + }, + "truncated": 0, + "non_truncated": 193, + "padded": 772, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hashes": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "ca47137b1f3a769c", + "hash_cont_tokens": "d5ad4c5bdca967ad" + }, + "truncated": 0, + "non_truncated": 390, + "padded": 1560, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hashes": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "c9d341ab62890f30", + "hash_cont_tokens": "8f631ca5687dd0d4" + }, + "truncated": 0, + "non_truncated": 270, + "padded": 1080, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hashes": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": 
"bfc393381298609e", + "hash_input_tokens": "62573d06618ae7df", + "hash_cont_tokens": "7321048a28451473" + }, + "truncated": 0, + "non_truncated": 238, + "padded": 952, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_physics|5": { + "hashes": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "ddddcaae96263221", + "hash_cont_tokens": "bb137581f269861c" + }, + "truncated": 0, + "non_truncated": 151, + "padded": 604, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hashes": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "ef9c1ae343139fdd", + "hash_cont_tokens": "b455cab2675bd863" + }, + "truncated": 0, + "non_truncated": 545, + "padded": 2161, + "non_padded": 19, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hashes": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "eb4abd87b0e863cc", + "hash_cont_tokens": "1b3196fec7e58037" + }, + "truncated": 0, + "non_truncated": 216, + "padded": 864, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hashes": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "63548c7fa9ba7a78", + "hash_cont_tokens": "a331dedc2aa01b3e" + }, + "truncated": 0, + "non_truncated": 204, + "padded": 816, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hashes": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "83c5da18bfa50812", + "hash_cont_tokens": "d0fbe030b8c8c2bf" + }, + "truncated": 0, + "non_truncated": 237, + "padded": 948, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_aging|5": { + "hashes": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "c93c778cb8c58a32", + "hash_cont_tokens": "1dd29c3755494850" + }, + "truncated": 0, + "non_truncated": 223, + "padded": 892, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_sexuality|5": { + "hashes": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "1daed91f54b42f7d", + "hash_cont_tokens": "c85573f663c10691" + }, + "truncated": 0, + "non_truncated": 131, + "padded": 524, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-international_law|5": { + "hashes": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "cfdae69f75ee8670", + "hash_cont_tokens": "d263804ba918154f" + }, + "truncated": 0, + "non_truncated": 121, + "padded": 484, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-jurisprudence|5": { + "hashes": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "173979adbb5ab44e", + "hash_cont_tokens": "581986691a84ece8" + }, + "truncated": 0, + 
"non_truncated": 108, + "padded": 432, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hashes": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "7b7d06271aff55ff", + "hash_cont_tokens": "55a858b28bbda458" + }, + "truncated": 0, + "non_truncated": 163, + "padded": 652, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-machine_learning|5": { + "hashes": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "ca062cfd7c7fddcb", + "hash_cont_tokens": "e99d3d3efd4ac7a3" + }, + "truncated": 0, + "non_truncated": 112, + "padded": 445, + "non_padded": 3, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-management|5": { + "hashes": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "fc47171ffb714da3", + "hash_cont_tokens": "13d9dc56bca34726" + }, + "truncated": 0, + "non_truncated": 103, + "padded": 412, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-marketing|5": { + "hashes": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "aa29e9d883670c8f", + "hash_cont_tokens": "2700ea26933916a2" + }, + "truncated": 0, + "non_truncated": 234, + "padded": 936, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-medical_genetics|5": { + "hashes": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "88ad044b653ecaa5", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-miscellaneous|5": { + "hashes": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "f9e7e01573277484", + "hash_cont_tokens": "7bf4341c79587250" + }, + "truncated": 0, + "non_truncated": 783, + "padded": 3132, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_disputes|5": { + "hashes": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "03728b9e48594c28", + "hash_cont_tokens": "38a48e9de6976f00" + }, + "truncated": 0, + "non_truncated": 346, + "padded": 1360, + "non_padded": 24, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hashes": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "04a903966514d177", + "hash_cont_tokens": "761c4dc187689d89" + }, + "truncated": 0, + "non_truncated": 895, + "padded": 3580, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-nutrition|5": { + "hashes": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "a2176d3ac6f01cf0", + "hash_cont_tokens": "65005bd7d6f6012a" + }, + "truncated": 0, + "non_truncated": 306, + "padded": 1224, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-philosophy|5": { + "hashes": { + 
"hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "a96dc872948245a8", + "hash_cont_tokens": "0b47934fb6314dec" + }, + "truncated": 0, + "non_truncated": 311, + "padded": 1244, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-prehistory|5": { + "hashes": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "e0b03637947e9efa", + "hash_cont_tokens": "3f20acd855ee0a29" + }, + "truncated": 0, + "non_truncated": 324, + "padded": 1296, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_accounting|5": { + "hashes": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "0b4c6d0e49c47ab4", + "hash_cont_tokens": "8f122ba881355d4b" + }, + "truncated": 0, + "non_truncated": 282, + "padded": 1128, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_law|5": { + "hashes": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "bcbdbbde22ec73e3", + "hash_cont_tokens": "90d5df417c4d3fd3" + }, + "truncated": 0, + "non_truncated": 1534, + "padded": 6136, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_medicine|5": { + "hashes": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "c54d753563114d45", + "hash_cont_tokens": "4a2d2988884f7f70" + }, + "truncated": 0, + "non_truncated": 272, + "padded": 1088, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_psychology|5": { + "hashes": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "9e6e34f48034edc0", + "hash_cont_tokens": "e0a952cb8a9c81de" + }, + "truncated": 0, + "non_truncated": 612, + "padded": 2448, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-public_relations|5": { + "hashes": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "634feb3f97d1064d", + "hash_cont_tokens": "1fa77a8dff3922b8" + }, + "truncated": 0, + "non_truncated": 110, + "padded": 440, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-security_studies|5": { + "hashes": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "ca8497342e5b1d57", + "hash_cont_tokens": "81fc9cb3cbdd52db" + }, + "truncated": 0, + "non_truncated": 245, + "padded": 980, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-sociology|5": { + "hashes": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "ae361375c940a0fb", + "hash_cont_tokens": "2a0493252ed2cf43" + }, + "truncated": 0, + "non_truncated": 201, + "padded": 800, + "non_padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hashes": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "e8bdf33cf82d89f5", + "hash_cont_tokens": 
"17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-virology|5": { + "hashes": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "32ce831e0ba2d2e2", + "hash_cont_tokens": "5ab892d003b00c98" + }, + "truncated": 0, + "non_truncated": 166, + "padded": 664, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-world_religions|5": { + "hashes": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "4ed9b68c5694211b", + "hash_cont_tokens": "15a5e5dbdfbb8568" + }, + "truncated": 0, + "non_truncated": 171, + "padded": 684, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|truthfulqa:mc|0": { + "hashes": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "a30fbd9af05d717a", + "hash_cont_tokens": "5a8d4bb398b1c3c0" + }, + "truncated": 0, + "non_truncated": 817, + "padded": 9996, + "non_padded": 0, + "effective_few_shots": 0.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "d84d18e9a963753d", + "hash_full_prompts": "12b540783521a8e6", + "hash_input_tokens": "3d86ffeb7677bd9d", + "hash_cont_tokens": "35527140510ee91a" + }, + "truncated": 0, + "non_truncated": 26073, + "padded": 110793, + "non_padded": 226, + "num_truncated_few_shots": 0, + "total_evaluation_time_secondes": 0 + } +} \ No newline at end of file diff --git a/eval-results/teknium/OpenHermes-2.5-Mistral-7B/results_2023-11-14T22-44-46.514057.json b/eval-results/teknium/OpenHermes-2.5-Mistral-7B/results_2023-11-14T22-44-46.514057.json new file mode 100644 index 0000000000000000000000000000000000000000..b928251388258c34fda874f6ef930af1334d33c5 --- /dev/null +++ b/eval-results/teknium/OpenHermes-2.5-Mistral-7B/results_2023-11-14T22-44-46.514057.json @@ -0,0 +1,1435 @@ +{ + "config_general": { + "lighteval_sha": "9ffc410f6c40b8cfefe7167cb47aefe69ced61e1", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "", + "start_time": 1989338.667915871, + "end_time": 2016016.438031999, + "total_evaluation_time_secondes": "26677.770116128027", + "model_name": "teknium/OpenHermes-2.5-Mistral-7B", + "model_sha": "2a54cad766bc90828354db5c4199795aecfd0df1", + "model_dtype": "torch.float16", + "model_size": "13.99 GB" + }, + "results": { + "harness|arc:challenge|25": { + "acc": 0.6126279863481229, + "acc_stderr": 0.014235872487909869, + "acc_norm": 0.6493174061433447, + "acc_norm_stderr": 0.013944635930726099 + }, + "harness|hellaswag|10": { + "acc": 0.6519617606054571, + "acc_stderr": 0.004753746951620152, + "acc_norm": 0.8429595698068114, + "acc_norm_stderr": 0.003630952999843739 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.34, + "acc_stderr": 0.04760952285695236, + "acc_norm": 0.34, + "acc_norm_stderr": 0.04760952285695236 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.6074074074074074, + "acc_stderr": 0.0421850621536888, + "acc_norm": 0.6074074074074074, + "acc_norm_stderr": 0.0421850621536888 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.6973684210526315, + "acc_stderr": 0.037385206761196686, + "acc_norm": 0.6973684210526315, + "acc_norm_stderr": 0.037385206761196686 + }, + 
"harness|hendrycksTest-business_ethics|5": { + "acc": 0.57, + "acc_stderr": 0.049756985195624284, + "acc_norm": 0.57, + "acc_norm_stderr": 0.049756985195624284 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.6867924528301886, + "acc_stderr": 0.02854479331905533, + "acc_norm": 0.6867924528301886, + "acc_norm_stderr": 0.02854479331905533 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.7569444444444444, + "acc_stderr": 0.03586879280080341, + "acc_norm": 0.7569444444444444, + "acc_norm_stderr": 0.03586879280080341 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.48, + "acc_stderr": 0.050211673156867795, + "acc_norm": 0.48, + "acc_norm_stderr": 0.050211673156867795 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.46, + "acc_stderr": 0.05009082659620333, + "acc_norm": 0.46, + "acc_norm_stderr": 0.05009082659620333 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.29, + "acc_stderr": 0.045604802157206845, + "acc_norm": 0.29, + "acc_norm_stderr": 0.045604802157206845 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.6242774566473989, + "acc_stderr": 0.036928207672648664, + "acc_norm": 0.6242774566473989, + "acc_norm_stderr": 0.036928207672648664 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.38235294117647056, + "acc_stderr": 0.04835503696107223, + "acc_norm": 0.38235294117647056, + "acc_norm_stderr": 0.04835503696107223 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.76, + "acc_stderr": 0.042923469599092816, + "acc_norm": 0.76, + "acc_norm_stderr": 0.042923469599092816 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.5659574468085107, + "acc_stderr": 0.03240038086792747, + "acc_norm": 0.5659574468085107, + "acc_norm_stderr": 0.03240038086792747 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.49122807017543857, + "acc_stderr": 0.04702880432049615, + "acc_norm": 0.49122807017543857, + "acc_norm_stderr": 0.04702880432049615 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.5241379310344828, + "acc_stderr": 0.0416180850350153, + "acc_norm": 0.5241379310344828, + "acc_norm_stderr": 0.0416180850350153 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.42592592592592593, + "acc_stderr": 0.02546714904546955, + "acc_norm": 0.42592592592592593, + "acc_norm_stderr": 0.02546714904546955 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.46825396825396826, + "acc_stderr": 0.04463112720677172, + "acc_norm": 0.46825396825396826, + "acc_norm_stderr": 0.04463112720677172 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.38, + "acc_stderr": 0.048783173121456316, + "acc_norm": 0.38, + "acc_norm_stderr": 0.048783173121456316 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.7935483870967742, + "acc_stderr": 0.02302589961718871, + "acc_norm": 0.7935483870967742, + "acc_norm_stderr": 0.02302589961718871 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.5172413793103449, + "acc_stderr": 0.035158955511656986, + "acc_norm": 0.5172413793103449, + "acc_norm_stderr": 0.035158955511656986 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.68, + "acc_stderr": 0.04688261722621505, + "acc_norm": 0.68, + "acc_norm_stderr": 0.04688261722621505 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.7818181818181819, + "acc_stderr": 0.032250781083062896, + "acc_norm": 0.7818181818181819, + "acc_norm_stderr": 0.032250781083062896 + 
}, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.8080808080808081, + "acc_stderr": 0.028057791672989017, + "acc_norm": 0.8080808080808081, + "acc_norm_stderr": 0.028057791672989017 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.8911917098445595, + "acc_stderr": 0.022473253332768776, + "acc_norm": 0.8911917098445595, + "acc_norm_stderr": 0.022473253332768776 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.6128205128205129, + "acc_stderr": 0.024697216930878937, + "acc_norm": 0.6128205128205129, + "acc_norm_stderr": 0.024697216930878937 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.3037037037037037, + "acc_stderr": 0.02803792996911499, + "acc_norm": 0.3037037037037037, + "acc_norm_stderr": 0.02803792996911499 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.680672268907563, + "acc_stderr": 0.030283995525884396, + "acc_norm": 0.680672268907563, + "acc_norm_stderr": 0.030283995525884396 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.31788079470198677, + "acc_stderr": 0.038020397601079024, + "acc_norm": 0.31788079470198677, + "acc_norm_stderr": 0.038020397601079024 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.8330275229357799, + "acc_stderr": 0.01599015488507338, + "acc_norm": 0.8330275229357799, + "acc_norm_stderr": 0.01599015488507338 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.5092592592592593, + "acc_stderr": 0.034093869469927006, + "acc_norm": 0.5092592592592593, + "acc_norm_stderr": 0.034093869469927006 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.7990196078431373, + "acc_stderr": 0.02812597226565437, + "acc_norm": 0.7990196078431373, + "acc_norm_stderr": 0.02812597226565437 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.8143459915611815, + "acc_stderr": 0.025310495376944856, + "acc_norm": 0.8143459915611815, + "acc_norm_stderr": 0.025310495376944856 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.7040358744394619, + "acc_stderr": 0.030636591348699803, + "acc_norm": 0.7040358744394619, + "acc_norm_stderr": 0.030636591348699803 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.7938931297709924, + "acc_stderr": 0.035477710041594654, + "acc_norm": 0.7938931297709924, + "acc_norm_stderr": 0.035477710041594654 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.7603305785123967, + "acc_stderr": 0.03896878985070416, + "acc_norm": 0.7603305785123967, + "acc_norm_stderr": 0.03896878985070416 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.7870370370370371, + "acc_stderr": 0.039578354719809805, + "acc_norm": 0.7870370370370371, + "acc_norm_stderr": 0.039578354719809805 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.7852760736196319, + "acc_stderr": 0.032262193772867744, + "acc_norm": 0.7852760736196319, + "acc_norm_stderr": 0.032262193772867744 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.5089285714285714, + "acc_stderr": 0.04745033255489123, + "acc_norm": 0.5089285714285714, + "acc_norm_stderr": 0.04745033255489123 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.7766990291262136, + "acc_stderr": 0.04123553189891431, + "acc_norm": 0.7766990291262136, + "acc_norm_stderr": 0.04123553189891431 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.8589743589743589, + "acc_stderr": 0.022801382534597528, + "acc_norm": 0.8589743589743589, + "acc_norm_stderr": 
0.022801382534597528 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.71, + "acc_stderr": 0.045604802157206845, + "acc_norm": 0.71, + "acc_norm_stderr": 0.045604802157206845 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.8301404853128991, + "acc_stderr": 0.013428186370608306, + "acc_norm": 0.8301404853128991, + "acc_norm_stderr": 0.013428186370608306 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.7167630057803468, + "acc_stderr": 0.02425790170532338, + "acc_norm": 0.7167630057803468, + "acc_norm_stderr": 0.02425790170532338 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.30837988826815643, + "acc_stderr": 0.01544571691099888, + "acc_norm": 0.30837988826815643, + "acc_norm_stderr": 0.01544571691099888 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.7549019607843137, + "acc_stderr": 0.024630048979824782, + "acc_norm": 0.7549019607843137, + "acc_norm_stderr": 0.024630048979824782 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.684887459807074, + "acc_stderr": 0.026385273703464485, + "acc_norm": 0.684887459807074, + "acc_norm_stderr": 0.026385273703464485 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.7530864197530864, + "acc_stderr": 0.02399350170904211, + "acc_norm": 0.7530864197530864, + "acc_norm_stderr": 0.02399350170904211 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.5070921985815603, + "acc_stderr": 0.02982449855912901, + "acc_norm": 0.5070921985815603, + "acc_norm_stderr": 0.02982449855912901 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.46936114732724904, + "acc_stderr": 0.012746237711716634, + "acc_norm": 0.46936114732724904, + "acc_norm_stderr": 0.012746237711716634 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.6764705882352942, + "acc_stderr": 0.028418208619406762, + "acc_norm": 0.6764705882352942, + "acc_norm_stderr": 0.028418208619406762 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.673202614379085, + "acc_stderr": 0.018975427920507215, + "acc_norm": 0.673202614379085, + "acc_norm_stderr": 0.018975427920507215 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.6545454545454545, + "acc_stderr": 0.04554619617541054, + "acc_norm": 0.6545454545454545, + "acc_norm_stderr": 0.04554619617541054 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.7346938775510204, + "acc_stderr": 0.028263889943784596, + "acc_norm": 0.7346938775510204, + "acc_norm_stderr": 0.028263889943784596 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.8159203980099502, + "acc_stderr": 0.027403859410786845, + "acc_norm": 0.8159203980099502, + "acc_norm_stderr": 0.027403859410786845 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.87, + "acc_stderr": 0.033799766898963086, + "acc_norm": 0.87, + "acc_norm_stderr": 0.033799766898963086 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.5542168674698795, + "acc_stderr": 0.03869543323472101, + "acc_norm": 0.5542168674698795, + "acc_norm_stderr": 0.03869543323472101 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.8362573099415205, + "acc_stderr": 0.028380919596145866, + "acc_norm": 0.8362573099415205, + "acc_norm_stderr": 0.028380919596145866 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.3598531211750306, + "mc1_stderr": 0.01680186046667716, + "mc2": 0.5230564118100686, + "mc2_stderr": 0.015250230546286025 + }, + "harness|winogrande|5": { + "acc": 0.7790055248618785, + "acc_stderr": 0.011661223637643412 + }, + 
"harness|drop|3": { + "em": 0.30830536912751677, + "em_stderr": 0.004729196914949925, + "f1": 0.35989723154362524, + "f1_stderr": 0.004629324720589026 + }, + "harness|gsm8k|5": { + "acc": 0.25473843821076575, + "acc_stderr": 0.012001731232879136 + }, + "all": { + "acc": 0.6340440103659418, + "acc_stderr": 0.03220763540877311, + "acc_norm": 0.6418750491228201, + "acc_norm_stderr": 0.032874386009418256, + "mc1": 0.3598531211750306, + "mc1_stderr": 0.01680186046667716, + "mc2": 0.5230564118100686, + "mc2_stderr": 0.015250230546286025, + "em": 0.30830536912751677, + "em_stderr": 0.004729196914949925, + "f1": 0.35989723154362524, + "f1_stderr": 0.004629324720589026 + } + }, + "versions": { + "all": 0, + "harness|arc:challenge|25": 0, + "harness|drop|3": 1, + "harness|gsm8k|5": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + 
"harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "harness|winogrande|5": 0 + }, + "config_tasks": { + "harness|arc:challenge": "LM Harness task", + "harness|drop": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + 
"harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|arc:challenge|25": { + "hashes": { + "hash_examples": "17b0cae357c0259e", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "9bcd0d1d37471713", + "hash_cont_tokens": "289aa98c400841d8" + }, + "truncated": 0, + "non_truncated": 1172, + "padded": 4670, + "non_padded": 17, + "effective_few_shots": 25.0, + "num_truncated_few_shots": 0 + }, + "harness|hellaswag|10": { + "hashes": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "80b8c6d79740318e", + "hash_cont_tokens": "ac460260c3e6efc9" + }, + "truncated": 0, + "non_truncated": 10042, + "padded": 40101, + "non_padded": 67, + "effective_few_shots": 10.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hashes": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "b813d36287c6556c", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-anatomy|5": { + "hashes": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "09dc2380497f7a47", + "hash_cont_tokens": "a52a4f60d98cbe5c" + }, + "truncated": 0, + "non_truncated": 135, + "padded": 540, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-astronomy|5": { + "hashes": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "68ca3220b0fdd1f3", + "hash_cont_tokens": "10f7d8eeba97841d" + }, + "truncated": 0, + "non_truncated": 152, + "padded": 608, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-business_ethics|5": { + "hashes": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "bd14ef1320de241e", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hashes": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "d96186ab98017c43", + "hash_cont_tokens": "edef9975ba9165b5" + }, + "truncated": 0, + "non_truncated": 265, + "padded": 1060, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_biology|5": { + "hashes": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "424136b34e95b200", + "hash_cont_tokens": "0aa103ec6602280b" + }, + "truncated": 0, + "non_truncated": 144, + "padded": 576, + "non_padded": 0, + "effective_few_shots": 5.0, + 
"num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_chemistry|5": { + "hashes": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "8dd8b80e336bbe54", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_computer_science|5": { + "hashes": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "145d4cef8ca2261d", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_mathematics|5": { + "hashes": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "561995d32d2b25c4", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_medicine|5": { + "hashes": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "6a258a9d4418599c", + "hash_cont_tokens": "1979021dbc698754" + }, + "truncated": 0, + "non_truncated": 173, + "padded": 692, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_physics|5": { + "hashes": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "fa5e0d5b5f97b66a", + "hash_cont_tokens": "7cf7fe2bab00acbd" + }, + "truncated": 0, + "non_truncated": 102, + "padded": 408, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-computer_security|5": { + "hashes": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "07d27397edfae492", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hashes": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "da5e6c3c8eb17da6", + "hash_cont_tokens": "903f64eed2b0d217" + }, + "truncated": 0, + "non_truncated": 235, + "padded": 940, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-econometrics|5": { + "hashes": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "f6ba8e358bdb523e", + "hash_cont_tokens": "721ae6c5302c4bf2" + }, + "truncated": 0, + "non_truncated": 114, + "padded": 456, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hashes": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "b2459da4c5ca8590", + "hash_cont_tokens": "15a738960ed3e587" + }, + "truncated": 0, + "non_truncated": 145, + "padded": 575, + "non_padded": 5, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hashes": { + "hash_examples": "fc48208a5ac1c0ce", + 
"hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "0b969d9ad706a13a", + "hash_cont_tokens": "c96470462fc71683" + }, + "truncated": 0, + "non_truncated": 378, + "padded": 1512, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-formal_logic|5": { + "hashes": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "02bc3eb5f90da86e", + "hash_cont_tokens": "0e1ce025c9d6ee7e" + }, + "truncated": 0, + "non_truncated": 126, + "padded": 504, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-global_facts|5": { + "hashes": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "3d5106918bcbeb43", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_biology|5": { + "hashes": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "7b089392db2dabbd", + "hash_cont_tokens": "e34d57f7d3c4ca16" + }, + "truncated": 0, + "non_truncated": 310, + "padded": 1240, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hashes": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "ba90b2ffed1c067d", + "hash_cont_tokens": "e8482d44df4b3740" + }, + "truncated": 0, + "non_truncated": 203, + "padded": 812, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hashes": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "60eeec309ef0717f", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hashes": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "5e5e8bf3808e0ead", + "hash_cont_tokens": "d63e679a49418339" + }, + "truncated": 0, + "non_truncated": 165, + "padded": 656, + "non_padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_geography|5": { + "hashes": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "4da9b741d4e7ea78", + "hash_cont_tokens": "d78483e286d06f1a" + }, + "truncated": 0, + "non_truncated": 198, + "padded": 792, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hashes": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "acb4bc872ac86ed7", + "hash_cont_tokens": "691cdff71ff5fe57" + }, + "truncated": 0, + "non_truncated": 193, + "padded": 772, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hashes": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "840fc6403eb69ab0", + 
"hash_cont_tokens": "d5ad4c5bdca967ad" + }, + "truncated": 0, + "non_truncated": 390, + "padded": 1560, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hashes": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "3629a7f2cd17faeb", + "hash_cont_tokens": "8f631ca5687dd0d4" + }, + "truncated": 0, + "non_truncated": 270, + "padded": 1080, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hashes": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "6846f684260e3997", + "hash_cont_tokens": "7321048a28451473" + }, + "truncated": 0, + "non_truncated": 238, + "padded": 952, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_physics|5": { + "hashes": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "85aee25d6bdad94a", + "hash_cont_tokens": "bb137581f269861c" + }, + "truncated": 0, + "non_truncated": 151, + "padded": 604, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hashes": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "290b66d6d666a35f", + "hash_cont_tokens": "b455cab2675bd863" + }, + "truncated": 0, + "non_truncated": 545, + "padded": 2180, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hashes": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "a77a7668b437bc82", + "hash_cont_tokens": "1b3196fec7e58037" + }, + "truncated": 0, + "non_truncated": 216, + "padded": 864, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hashes": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "63548c7fa9ba7a78", + "hash_cont_tokens": "a331dedc2aa01b3e" + }, + "truncated": 0, + "non_truncated": 204, + "padded": 816, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hashes": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "83c5da18bfa50812", + "hash_cont_tokens": "d0fbe030b8c8c2bf" + }, + "truncated": 0, + "non_truncated": 237, + "padded": 948, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_aging|5": { + "hashes": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "bebbd11f22006685", + "hash_cont_tokens": "1dd29c3755494850" + }, + "truncated": 0, + "non_truncated": 223, + "padded": 892, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_sexuality|5": { + "hashes": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "7b85ee9b8ee54f4f", + "hash_cont_tokens": "c85573f663c10691" + }, + "truncated": 0, + "non_truncated": 131, + "padded": 524, + 
"non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-international_law|5": { + "hashes": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "7bfc55ab7065943e", + "hash_cont_tokens": "d263804ba918154f" + }, + "truncated": 0, + "non_truncated": 121, + "padded": 484, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-jurisprudence|5": { + "hashes": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "69573f1675e053c6", + "hash_cont_tokens": "581986691a84ece8" + }, + "truncated": 0, + "non_truncated": 108, + "padded": 432, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hashes": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "552324ef20094bdc", + "hash_cont_tokens": "55a858b28bbda458" + }, + "truncated": 0, + "non_truncated": 163, + "padded": 652, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-machine_learning|5": { + "hashes": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "96449357a7318905", + "hash_cont_tokens": "e99d3d3efd4ac7a3" + }, + "truncated": 0, + "non_truncated": 112, + "padded": 448, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-management|5": { + "hashes": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "3b849249168e3b88", + "hash_cont_tokens": "13d9dc56bca34726" + }, + "truncated": 0, + "non_truncated": 103, + "padded": 412, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-marketing|5": { + "hashes": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "af0e186f2756b70d", + "hash_cont_tokens": "2700ea26933916a2" + }, + "truncated": 0, + "non_truncated": 234, + "padded": 936, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-medical_genetics|5": { + "hashes": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "9f6a6de16509b6d9", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-miscellaneous|5": { + "hashes": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "9194406d589f7c10", + "hash_cont_tokens": "7bf4341c79587250" + }, + "truncated": 0, + "non_truncated": 783, + "padded": 3132, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_disputes|5": { + "hashes": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "769486efc74d9f8e", + "hash_cont_tokens": "38a48e9de6976f00" + }, + "truncated": 0, + "non_truncated": 346, + "padded": 1384, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hashes": { + "hash_examples": 
"9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "a90fd4dd90959dad", + "hash_cont_tokens": "761c4dc187689d89" + }, + "truncated": 0, + "non_truncated": 895, + "padded": 3580, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-nutrition|5": { + "hashes": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "1a3b843e66efd29b", + "hash_cont_tokens": "65005bd7d6f6012a" + }, + "truncated": 0, + "non_truncated": 306, + "padded": 1224, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-philosophy|5": { + "hashes": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "09820001a3d00013", + "hash_cont_tokens": "0b47934fb6314dec" + }, + "truncated": 0, + "non_truncated": 311, + "padded": 1244, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-prehistory|5": { + "hashes": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "7c4ec364ce2768c7", + "hash_cont_tokens": "3f20acd855ee0a29" + }, + "truncated": 0, + "non_truncated": 324, + "padded": 1296, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_accounting|5": { + "hashes": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "ced0534574d0ae3f", + "hash_cont_tokens": "8f122ba881355d4b" + }, + "truncated": 0, + "non_truncated": 282, + "padded": 1128, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_law|5": { + "hashes": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "bcbdbbde22ec73e3", + "hash_cont_tokens": "90d5df417c4d3fd3" + }, + "truncated": 0, + "non_truncated": 1534, + "padded": 6136, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_medicine|5": { + "hashes": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "c54d753563114d45", + "hash_cont_tokens": "4a2d2988884f7f70" + }, + "truncated": 0, + "non_truncated": 272, + "padded": 1088, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_psychology|5": { + "hashes": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "b75dc55c0e32fa52", + "hash_cont_tokens": "e0a952cb8a9c81de" + }, + "truncated": 0, + "non_truncated": 612, + "padded": 2448, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-public_relations|5": { + "hashes": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "5ccdc8ec8db99622", + "hash_cont_tokens": "1fa77a8dff3922b8" + }, + "truncated": 0, + "non_truncated": 110, + "padded": 440, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-security_studies|5": { + "hashes": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "ca8497342e5b1d57", + "hash_cont_tokens": "81fc9cb3cbdd52db" + }, 
+ "truncated": 0, + "non_truncated": 245, + "padded": 980, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-sociology|5": { + "hashes": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "069c76424fbd3dab", + "hash_cont_tokens": "2a0493252ed2cf43" + }, + "truncated": 0, + "non_truncated": 201, + "padded": 804, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hashes": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "a7e393a626169576", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-virology|5": { + "hashes": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "bf99dc973e3a650d", + "hash_cont_tokens": "5ab892d003b00c98" + }, + "truncated": 0, + "non_truncated": 166, + "padded": 664, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-world_religions|5": { + "hashes": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "1761cfaf21797065", + "hash_cont_tokens": "15a5e5dbdfbb8568" + }, + "truncated": 0, + "non_truncated": 171, + "padded": 684, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|truthfulqa:mc|0": { + "hashes": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "298b43914bbdf4ca", + "hash_cont_tokens": "5a8d4bb398b1c3c0" + }, + "truncated": 0, + "non_truncated": 817, + "padded": 9996, + "non_padded": 0, + "effective_few_shots": 0.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "31aa3477d959f771", + "hash_cont_tokens": "618558fb93c0f288" + }, + "truncated": 0, + "non_truncated": 1267, + "padded": 2534, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|drop|3": { + "hashes": { + "hash_examples": "1d27416e8324e9a3", + "hash_full_prompts": "a5513ff9a741b385", + "hash_input_tokens": "a4fb946366902edf", + "hash_cont_tokens": "2eed147abcdae354" + }, + "truncated": 0, + "non_truncated": 9536, + "padded": 0, + "non_padded": 9536, + "effective_few_shots": 3.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "6af0ae8cfe684f50", + "hash_cont_tokens": "b03fc7042934bc77" + }, + "truncated": 0, + "non_truncated": 1319, + "padded": 0, + "non_padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "4eb459f19fc0f29d", + "hash_full_prompts": "21653ed56f202b4e", + "hash_input_tokens": "0ce409b3d436569d", + "hash_cont_tokens": "e498f03653cabebb" + }, + "truncated": 0, + "non_truncated": 38195, + "padded": 113460, + "non_padded": 10948, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/teknium/OpenHermes-2.5-Mistral-7B/results_2023-11-20T08-19-51.425757.json 
b/eval-results/teknium/OpenHermes-2.5-Mistral-7B/results_2023-11-20T08-19-51.425757.json new file mode 100644 index 0000000000000000000000000000000000000000..7af80a563fab76ccbb038baa16f0795995502ae3 --- /dev/null +++ b/eval-results/teknium/OpenHermes-2.5-Mistral-7B/results_2023-11-20T08-19-51.425757.json @@ -0,0 +1,1435 @@ +{ + "config_general": { + "lighteval_sha": "9ffc410f6c40b8cfefe7167cb47aefe69ced61e1", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "", + "start_time": 299089.900934758, + "end_time": 327361.46716007, + "total_evaluation_time_secondes": "28271.56622531201", + "model_name": "teknium/OpenHermes-2.5-Mistral-7B", + "model_sha": "2a54cad766bc90828354db5c4199795aecfd0df1", + "model_dtype": "torch.bfloat16", + "model_size": "13.99 GB" + }, + "results": { + "harness|arc:challenge|25": { + "acc": 0.6126279863481229, + "acc_stderr": 0.014235872487909869, + "acc_norm": 0.6493174061433447, + "acc_norm_stderr": 0.013944635930726099 + }, + "harness|hellaswag|10": { + "acc": 0.6522605058753237, + "acc_stderr": 0.004752794829825045, + "acc_norm": 0.8417645887273452, + "acc_norm_stderr": 0.0036421571661623443 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.33, + "acc_stderr": 0.04725815626252606, + "acc_norm": 0.33, + "acc_norm_stderr": 0.04725815626252606 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.6, + "acc_stderr": 0.04232073695151589, + "acc_norm": 0.6, + "acc_norm_stderr": 0.04232073695151589 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.6973684210526315, + "acc_stderr": 0.037385206761196686, + "acc_norm": 0.6973684210526315, + "acc_norm_stderr": 0.037385206761196686 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.57, + "acc_stderr": 0.049756985195624284, + "acc_norm": 0.57, + "acc_norm_stderr": 0.049756985195624284 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.6792452830188679, + "acc_stderr": 0.028727502957880267, + "acc_norm": 0.6792452830188679, + "acc_norm_stderr": 0.028727502957880267 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.75, + "acc_stderr": 0.03621034121889507, + "acc_norm": 0.75, + "acc_norm_stderr": 0.03621034121889507 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.45, + "acc_stderr": 0.05, + "acc_norm": 0.45, + "acc_norm_stderr": 0.05 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.46, + "acc_stderr": 0.05009082659620333, + "acc_norm": 0.46, + "acc_norm_stderr": 0.05009082659620333 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.29, + "acc_stderr": 0.045604802157206845, + "acc_norm": 0.29, + "acc_norm_stderr": 0.045604802157206845 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.6127167630057804, + "acc_stderr": 0.03714325906302065, + "acc_norm": 0.6127167630057804, + "acc_norm_stderr": 0.03714325906302065 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.38235294117647056, + "acc_stderr": 0.04835503696107223, + "acc_norm": 0.38235294117647056, + "acc_norm_stderr": 0.04835503696107223 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.76, + "acc_stderr": 0.042923469599092816, + "acc_norm": 0.76, + "acc_norm_stderr": 0.042923469599092816 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.5574468085106383, + "acc_stderr": 0.032469569197899575, + "acc_norm": 0.5574468085106383, + "acc_norm_stderr": 0.032469569197899575 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.5, + 
"acc_stderr": 0.047036043419179864, + "acc_norm": 0.5, + "acc_norm_stderr": 0.047036043419179864 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.5310344827586206, + "acc_stderr": 0.04158632762097828, + "acc_norm": 0.5310344827586206, + "acc_norm_stderr": 0.04158632762097828 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.4365079365079365, + "acc_stderr": 0.0255428468174005, + "acc_norm": 0.4365079365079365, + "acc_norm_stderr": 0.0255428468174005 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.4603174603174603, + "acc_stderr": 0.04458029125470973, + "acc_norm": 0.4603174603174603, + "acc_norm_stderr": 0.04458029125470973 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.36, + "acc_stderr": 0.04824181513244218, + "acc_norm": 0.36, + "acc_norm_stderr": 0.04824181513244218 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.7935483870967742, + "acc_stderr": 0.02302589961718871, + "acc_norm": 0.7935483870967742, + "acc_norm_stderr": 0.02302589961718871 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.5073891625615764, + "acc_stderr": 0.035176035403610105, + "acc_norm": 0.5073891625615764, + "acc_norm_stderr": 0.035176035403610105 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.68, + "acc_stderr": 0.04688261722621505, + "acc_norm": 0.68, + "acc_norm_stderr": 0.04688261722621505 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.7878787878787878, + "acc_stderr": 0.031922715695483016, + "acc_norm": 0.7878787878787878, + "acc_norm_stderr": 0.031922715695483016 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.803030303030303, + "acc_stderr": 0.02833560973246336, + "acc_norm": 0.803030303030303, + "acc_norm_stderr": 0.02833560973246336 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.8911917098445595, + "acc_stderr": 0.022473253332768776, + "acc_norm": 0.8911917098445595, + "acc_norm_stderr": 0.022473253332768776 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.6076923076923076, + "acc_stderr": 0.024756000382130956, + "acc_norm": 0.6076923076923076, + "acc_norm_stderr": 0.024756000382130956 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.2962962962962963, + "acc_stderr": 0.02784081149587193, + "acc_norm": 0.2962962962962963, + "acc_norm_stderr": 0.02784081149587193 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.6764705882352942, + "acc_stderr": 0.030388353551886797, + "acc_norm": 0.6764705882352942, + "acc_norm_stderr": 0.030388353551886797 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.33774834437086093, + "acc_stderr": 0.03861557546255169, + "acc_norm": 0.33774834437086093, + "acc_norm_stderr": 0.03861557546255169 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.8311926605504587, + "acc_stderr": 0.016060056268530343, + "acc_norm": 0.8311926605504587, + "acc_norm_stderr": 0.016060056268530343 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.5046296296296297, + "acc_stderr": 0.03409825519163572, + "acc_norm": 0.5046296296296297, + "acc_norm_stderr": 0.03409825519163572 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.7990196078431373, + "acc_stderr": 0.02812597226565437, + "acc_norm": 0.7990196078431373, + "acc_norm_stderr": 0.02812597226565437 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.8143459915611815, + 
"acc_stderr": 0.025310495376944856, + "acc_norm": 0.8143459915611815, + "acc_norm_stderr": 0.025310495376944856 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.7040358744394619, + "acc_stderr": 0.030636591348699803, + "acc_norm": 0.7040358744394619, + "acc_norm_stderr": 0.030636591348699803 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.7786259541984732, + "acc_stderr": 0.03641297081313728, + "acc_norm": 0.7786259541984732, + "acc_norm_stderr": 0.03641297081313728 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.7603305785123967, + "acc_stderr": 0.03896878985070416, + "acc_norm": 0.7603305785123967, + "acc_norm_stderr": 0.03896878985070416 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.7870370370370371, + "acc_stderr": 0.039578354719809805, + "acc_norm": 0.7870370370370371, + "acc_norm_stderr": 0.039578354719809805 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.7730061349693251, + "acc_stderr": 0.03291099578615769, + "acc_norm": 0.7730061349693251, + "acc_norm_stderr": 0.03291099578615769 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.5178571428571429, + "acc_stderr": 0.04742762361243011, + "acc_norm": 0.5178571428571429, + "acc_norm_stderr": 0.04742762361243011 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.7766990291262136, + "acc_stderr": 0.04123553189891431, + "acc_norm": 0.7766990291262136, + "acc_norm_stderr": 0.04123553189891431 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.8717948717948718, + "acc_stderr": 0.021901905115073325, + "acc_norm": 0.8717948717948718, + "acc_norm_stderr": 0.021901905115073325 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.7, + "acc_stderr": 0.046056618647183814, + "acc_norm": 0.7, + "acc_norm_stderr": 0.046056618647183814 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.822477650063857, + "acc_stderr": 0.013664230995834832, + "acc_norm": 0.822477650063857, + "acc_norm_stderr": 0.013664230995834832 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.7283236994219653, + "acc_stderr": 0.023948512905468355, + "acc_norm": 0.7283236994219653, + "acc_norm_stderr": 0.023948512905468355 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.3106145251396648, + "acc_stderr": 0.015476515438005566, + "acc_norm": 0.3106145251396648, + "acc_norm_stderr": 0.015476515438005566 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.7581699346405228, + "acc_stderr": 0.024518195641879334, + "acc_norm": 0.7581699346405228, + "acc_norm_stderr": 0.024518195641879334 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.6945337620578779, + "acc_stderr": 0.026160584450140446, + "acc_norm": 0.6945337620578779, + "acc_norm_stderr": 0.026160584450140446 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.75, + "acc_stderr": 0.02409347123262133, + "acc_norm": 0.75, + "acc_norm_stderr": 0.02409347123262133 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.49645390070921985, + "acc_stderr": 0.02982674915328092, + "acc_norm": 0.49645390070921985, + "acc_norm_stderr": 0.02982674915328092 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.46936114732724904, + "acc_stderr": 0.012746237711716634, + "acc_norm": 0.46936114732724904, + "acc_norm_stderr": 0.012746237711716634 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.6801470588235294, + "acc_stderr": 0.028332959514031208, + "acc_norm": 0.6801470588235294, + "acc_norm_stderr": 0.028332959514031208 + }, + 
"harness|hendrycksTest-professional_psychology|5": { + "acc": 0.673202614379085, + "acc_stderr": 0.018975427920507215, + "acc_norm": 0.673202614379085, + "acc_norm_stderr": 0.018975427920507215 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.6545454545454545, + "acc_stderr": 0.04554619617541054, + "acc_norm": 0.6545454545454545, + "acc_norm_stderr": 0.04554619617541054 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.7428571428571429, + "acc_stderr": 0.027979823538744543, + "acc_norm": 0.7428571428571429, + "acc_norm_stderr": 0.027979823538744543 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.8109452736318408, + "acc_stderr": 0.027686913588013024, + "acc_norm": 0.8109452736318408, + "acc_norm_stderr": 0.027686913588013024 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.87, + "acc_stderr": 0.033799766898963086, + "acc_norm": 0.87, + "acc_norm_stderr": 0.033799766898963086 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.5542168674698795, + "acc_stderr": 0.03869543323472101, + "acc_norm": 0.5542168674698795, + "acc_norm_stderr": 0.03869543323472101 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.8304093567251462, + "acc_stderr": 0.02878210810540171, + "acc_norm": 0.8304093567251462, + "acc_norm_stderr": 0.02878210810540171 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.36107711138310894, + "mc1_stderr": 0.016814312844836882, + "mc2": 0.5223798200841158, + "mc2_stderr": 0.015238875045496005 + }, + "harness|winogrande|5": { + "acc": 0.7805840568271507, + "acc_stderr": 0.01163126836060778 + }, + "harness|drop|3": { + "em": 0.30683724832214765, + "em_stderr": 0.004722927724288475, + "f1": 0.35794463087248424, + "f1_stderr": 0.004623146117214951 + }, + "harness|gsm8k|5": { + "acc": 0.2608036391205459, + "acc_stderr": 0.012094252417332745 + }, + "all": { + "acc": 0.632448279866198, + "acc_stderr": 0.032234779746923035, + "acc_norm": 0.6400703538735191, + "acc_norm_stderr": 0.03290159609429692, + "mc1": 0.36107711138310894, + "mc1_stderr": 0.016814312844836882, + "mc2": 0.5223798200841158, + "mc2_stderr": 0.015238875045496005, + "em": 0.30683724832214765, + "em_stderr": 0.004722927724288475, + "f1": 0.35794463087248424, + "f1_stderr": 0.004623146117214951 + } + }, + "versions": { + "all": 0, + "harness|arc:challenge|25": 0, + "harness|drop|3": 1, + "harness|gsm8k|5": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + 
"harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "harness|winogrande|5": 0 + }, + "config_tasks": { + "harness|arc:challenge": "LM Harness task", + "harness|drop": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM 
Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|arc:challenge|25": { + "hashes": { + "hash_examples": "17b0cae357c0259e", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "9bcd0d1d37471713", + "hash_cont_tokens": "289aa98c400841d8" + }, + "truncated": 0, + "non_truncated": 1172, + "padded": 4670, + "non_padded": 17, + "effective_few_shots": 25.0, + "num_truncated_few_shots": 0 + }, + "harness|hellaswag|10": { + "hashes": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "80b8c6d79740318e", + "hash_cont_tokens": "ac460260c3e6efc9" + }, + "truncated": 0, + "non_truncated": 10042, + "padded": 40101, + "non_padded": 67, + "effective_few_shots": 10.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hashes": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "b813d36287c6556c", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-anatomy|5": { + "hashes": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "09dc2380497f7a47", + "hash_cont_tokens": "a52a4f60d98cbe5c" + }, + 
"truncated": 0, + "non_truncated": 135, + "padded": 540, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-astronomy|5": { + "hashes": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "68ca3220b0fdd1f3", + "hash_cont_tokens": "10f7d8eeba97841d" + }, + "truncated": 0, + "non_truncated": 152, + "padded": 608, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-business_ethics|5": { + "hashes": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "bd14ef1320de241e", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hashes": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "d96186ab98017c43", + "hash_cont_tokens": "edef9975ba9165b5" + }, + "truncated": 0, + "non_truncated": 265, + "padded": 1060, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_biology|5": { + "hashes": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "424136b34e95b200", + "hash_cont_tokens": "0aa103ec6602280b" + }, + "truncated": 0, + "non_truncated": 144, + "padded": 576, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_chemistry|5": { + "hashes": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "8dd8b80e336bbe54", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_computer_science|5": { + "hashes": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "145d4cef8ca2261d", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_mathematics|5": { + "hashes": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "561995d32d2b25c4", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_medicine|5": { + "hashes": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "6a258a9d4418599c", + "hash_cont_tokens": "1979021dbc698754" + }, + "truncated": 0, + "non_truncated": 173, + "padded": 692, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_physics|5": { + "hashes": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "fa5e0d5b5f97b66a", + "hash_cont_tokens": "7cf7fe2bab00acbd" + }, + "truncated": 0, + "non_truncated": 102, + "padded": 408, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + 
"harness|hendrycksTest-computer_security|5": { + "hashes": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "07d27397edfae492", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hashes": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "da5e6c3c8eb17da6", + "hash_cont_tokens": "903f64eed2b0d217" + }, + "truncated": 0, + "non_truncated": 235, + "padded": 940, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-econometrics|5": { + "hashes": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "f6ba8e358bdb523e", + "hash_cont_tokens": "721ae6c5302c4bf2" + }, + "truncated": 0, + "non_truncated": 114, + "padded": 456, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hashes": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "b2459da4c5ca8590", + "hash_cont_tokens": "15a738960ed3e587" + }, + "truncated": 0, + "non_truncated": 145, + "padded": 575, + "non_padded": 5, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hashes": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "0b969d9ad706a13a", + "hash_cont_tokens": "c96470462fc71683" + }, + "truncated": 0, + "non_truncated": 378, + "padded": 1512, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-formal_logic|5": { + "hashes": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "02bc3eb5f90da86e", + "hash_cont_tokens": "0e1ce025c9d6ee7e" + }, + "truncated": 0, + "non_truncated": 126, + "padded": 504, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-global_facts|5": { + "hashes": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "3d5106918bcbeb43", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_biology|5": { + "hashes": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "7b089392db2dabbd", + "hash_cont_tokens": "e34d57f7d3c4ca16" + }, + "truncated": 0, + "non_truncated": 310, + "padded": 1240, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hashes": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "ba90b2ffed1c067d", + "hash_cont_tokens": "e8482d44df4b3740" + }, + "truncated": 0, + "non_truncated": 203, + "padded": 812, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hashes": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": 
"c00487e67c1813cc", + "hash_input_tokens": "60eeec309ef0717f", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hashes": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "5e5e8bf3808e0ead", + "hash_cont_tokens": "d63e679a49418339" + }, + "truncated": 0, + "non_truncated": 165, + "padded": 656, + "non_padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_geography|5": { + "hashes": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "4da9b741d4e7ea78", + "hash_cont_tokens": "d78483e286d06f1a" + }, + "truncated": 0, + "non_truncated": 198, + "padded": 792, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hashes": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "acb4bc872ac86ed7", + "hash_cont_tokens": "691cdff71ff5fe57" + }, + "truncated": 0, + "non_truncated": 193, + "padded": 772, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hashes": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "840fc6403eb69ab0", + "hash_cont_tokens": "d5ad4c5bdca967ad" + }, + "truncated": 0, + "non_truncated": 390, + "padded": 1560, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hashes": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "3629a7f2cd17faeb", + "hash_cont_tokens": "8f631ca5687dd0d4" + }, + "truncated": 0, + "non_truncated": 270, + "padded": 1080, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hashes": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "6846f684260e3997", + "hash_cont_tokens": "7321048a28451473" + }, + "truncated": 0, + "non_truncated": 238, + "padded": 952, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_physics|5": { + "hashes": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "85aee25d6bdad94a", + "hash_cont_tokens": "bb137581f269861c" + }, + "truncated": 0, + "non_truncated": 151, + "padded": 604, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hashes": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "290b66d6d666a35f", + "hash_cont_tokens": "b455cab2675bd863" + }, + "truncated": 0, + "non_truncated": 545, + "padded": 2180, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hashes": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "a77a7668b437bc82", + 
"hash_cont_tokens": "1b3196fec7e58037" + }, + "truncated": 0, + "non_truncated": 216, + "padded": 864, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hashes": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "63548c7fa9ba7a78", + "hash_cont_tokens": "a331dedc2aa01b3e" + }, + "truncated": 0, + "non_truncated": 204, + "padded": 816, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hashes": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "83c5da18bfa50812", + "hash_cont_tokens": "d0fbe030b8c8c2bf" + }, + "truncated": 0, + "non_truncated": 237, + "padded": 948, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_aging|5": { + "hashes": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "bebbd11f22006685", + "hash_cont_tokens": "1dd29c3755494850" + }, + "truncated": 0, + "non_truncated": 223, + "padded": 892, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_sexuality|5": { + "hashes": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "7b85ee9b8ee54f4f", + "hash_cont_tokens": "c85573f663c10691" + }, + "truncated": 0, + "non_truncated": 131, + "padded": 524, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-international_law|5": { + "hashes": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "7bfc55ab7065943e", + "hash_cont_tokens": "d263804ba918154f" + }, + "truncated": 0, + "non_truncated": 121, + "padded": 484, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-jurisprudence|5": { + "hashes": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "69573f1675e053c6", + "hash_cont_tokens": "581986691a84ece8" + }, + "truncated": 0, + "non_truncated": 108, + "padded": 432, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hashes": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "552324ef20094bdc", + "hash_cont_tokens": "55a858b28bbda458" + }, + "truncated": 0, + "non_truncated": 163, + "padded": 652, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-machine_learning|5": { + "hashes": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "96449357a7318905", + "hash_cont_tokens": "e99d3d3efd4ac7a3" + }, + "truncated": 0, + "non_truncated": 112, + "padded": 448, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-management|5": { + "hashes": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "3b849249168e3b88", + "hash_cont_tokens": "13d9dc56bca34726" + }, + "truncated": 0, + "non_truncated": 103, + "padded": 412, + "non_padded": 0, + "effective_few_shots": 5.0, + 
"num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-marketing|5": { + "hashes": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "af0e186f2756b70d", + "hash_cont_tokens": "2700ea26933916a2" + }, + "truncated": 0, + "non_truncated": 234, + "padded": 936, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-medical_genetics|5": { + "hashes": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "9f6a6de16509b6d9", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-miscellaneous|5": { + "hashes": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "9194406d589f7c10", + "hash_cont_tokens": "7bf4341c79587250" + }, + "truncated": 0, + "non_truncated": 783, + "padded": 3132, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_disputes|5": { + "hashes": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "769486efc74d9f8e", + "hash_cont_tokens": "38a48e9de6976f00" + }, + "truncated": 0, + "non_truncated": 346, + "padded": 1384, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hashes": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "a90fd4dd90959dad", + "hash_cont_tokens": "761c4dc187689d89" + }, + "truncated": 0, + "non_truncated": 895, + "padded": 3580, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-nutrition|5": { + "hashes": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "1a3b843e66efd29b", + "hash_cont_tokens": "65005bd7d6f6012a" + }, + "truncated": 0, + "non_truncated": 306, + "padded": 1224, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-philosophy|5": { + "hashes": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "09820001a3d00013", + "hash_cont_tokens": "0b47934fb6314dec" + }, + "truncated": 0, + "non_truncated": 311, + "padded": 1244, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-prehistory|5": { + "hashes": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "7c4ec364ce2768c7", + "hash_cont_tokens": "3f20acd855ee0a29" + }, + "truncated": 0, + "non_truncated": 324, + "padded": 1296, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_accounting|5": { + "hashes": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "ced0534574d0ae3f", + "hash_cont_tokens": "8f122ba881355d4b" + }, + "truncated": 0, + "non_truncated": 282, + "padded": 1128, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_law|5": { + "hashes": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + 
"hash_input_tokens": "bcbdbbde22ec73e3", + "hash_cont_tokens": "90d5df417c4d3fd3" + }, + "truncated": 0, + "non_truncated": 1534, + "padded": 6136, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_medicine|5": { + "hashes": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "c54d753563114d45", + "hash_cont_tokens": "4a2d2988884f7f70" + }, + "truncated": 0, + "non_truncated": 272, + "padded": 1088, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_psychology|5": { + "hashes": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "b75dc55c0e32fa52", + "hash_cont_tokens": "e0a952cb8a9c81de" + }, + "truncated": 0, + "non_truncated": 612, + "padded": 2448, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-public_relations|5": { + "hashes": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "5ccdc8ec8db99622", + "hash_cont_tokens": "1fa77a8dff3922b8" + }, + "truncated": 0, + "non_truncated": 110, + "padded": 440, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-security_studies|5": { + "hashes": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "ca8497342e5b1d57", + "hash_cont_tokens": "81fc9cb3cbdd52db" + }, + "truncated": 0, + "non_truncated": 245, + "padded": 980, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-sociology|5": { + "hashes": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "069c76424fbd3dab", + "hash_cont_tokens": "2a0493252ed2cf43" + }, + "truncated": 0, + "non_truncated": 201, + "padded": 804, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hashes": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "a7e393a626169576", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-virology|5": { + "hashes": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "bf99dc973e3a650d", + "hash_cont_tokens": "5ab892d003b00c98" + }, + "truncated": 0, + "non_truncated": 166, + "padded": 664, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-world_religions|5": { + "hashes": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "1761cfaf21797065", + "hash_cont_tokens": "15a5e5dbdfbb8568" + }, + "truncated": 0, + "non_truncated": 171, + "padded": 684, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|truthfulqa:mc|0": { + "hashes": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "298b43914bbdf4ca", + "hash_cont_tokens": "5a8d4bb398b1c3c0" + }, + "truncated": 0, + "non_truncated": 817, + "padded": 9996, + "non_padded": 0, + 
"effective_few_shots": 0.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "31aa3477d959f771", + "hash_cont_tokens": "618558fb93c0f288" + }, + "truncated": 0, + "non_truncated": 1267, + "padded": 2534, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|drop|3": { + "hashes": { + "hash_examples": "1d27416e8324e9a3", + "hash_full_prompts": "a5513ff9a741b385", + "hash_input_tokens": "a4fb946366902edf", + "hash_cont_tokens": "e443285bfb62f985" + }, + "truncated": 0, + "non_truncated": 9536, + "padded": 0, + "non_padded": 9536, + "effective_few_shots": 3.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "6af0ae8cfe684f50", + "hash_cont_tokens": "cf7e9ce3895d916f" + }, + "truncated": 0, + "non_truncated": 1319, + "padded": 0, + "non_padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "4eb459f19fc0f29d", + "hash_full_prompts": "21653ed56f202b4e", + "hash_input_tokens": "0ce409b3d436569d", + "hash_cont_tokens": "a9e58c52ee8cbb40" + }, + "truncated": 0, + "non_truncated": 38195, + "padded": 113460, + "non_padded": 10948, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/teknium/OpenHermes-7B/results_2023-09-18T14-09-00.502210.json b/eval-results/teknium/OpenHermes-7B/results_2023-09-18T14-09-00.502210.json new file mode 100644 index 0000000000000000000000000000000000000000..b9bdb6da0e6c0ab12ec5a81ac1de2c70a90e0e30 --- /dev/null +++ b/eval-results/teknium/OpenHermes-7B/results_2023-09-18T14-09-00.502210.json @@ -0,0 +1,1367 @@ +{ + "config_general": { + "model_name": "teknium/OpenHermes-7B", + "model_sha": "74edb1ad58d3d517ef46c4e2a31081084ecbc473", + "model_size": "12.61 GB", + "model_dtype": "torch.bfloat16", + "lighteval_sha": "0f318ecf002208468154899217b3ba7c6ae09374", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|arc:challenge|25": { + "acc": 0.5281569965870307, + "acc_stderr": 0.014588204105102203, + "acc_norm": 0.5614334470989761, + "acc_norm_stderr": 0.014500682618212865 + }, + "harness|hellaswag|10": { + "acc": 0.59061939852619, + "acc_stderr": 0.00490714622934755, + "acc_norm": 0.7832105158334993, + "acc_norm_stderr": 0.004112158798877644 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.32, + "acc_stderr": 0.046882617226215034, + "acc_norm": 0.32, + "acc_norm_stderr": 0.046882617226215034 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.4962962962962963, + "acc_stderr": 0.04319223625811331, + "acc_norm": 0.4962962962962963, + "acc_norm_stderr": 0.04319223625811331 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.4144736842105263, + "acc_stderr": 0.04008973785779206, + "acc_norm": 0.4144736842105263, + "acc_norm_stderr": 0.04008973785779206 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.54, + "acc_stderr": 0.05009082659620332, + "acc_norm": 0.54, + "acc_norm_stderr": 0.05009082659620332 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.5169811320754717, + "acc_stderr": 0.030755120364119905, + "acc_norm": 0.5169811320754717, + "acc_norm_stderr": 0.030755120364119905 + }, + 
"harness|hendrycksTest-college_biology|5": { + "acc": 0.4791666666666667, + "acc_stderr": 0.041775789507399935, + "acc_norm": 0.4791666666666667, + "acc_norm_stderr": 0.041775789507399935 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.31, + "acc_stderr": 0.04648231987117316, + "acc_norm": 0.31, + "acc_norm_stderr": 0.04648231987117316 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.41, + "acc_stderr": 0.049431107042371025, + "acc_norm": 0.41, + "acc_norm_stderr": 0.049431107042371025 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.36, + "acc_stderr": 0.04824181513244218, + "acc_norm": 0.36, + "acc_norm_stderr": 0.04824181513244218 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.4393063583815029, + "acc_stderr": 0.037842719328874674, + "acc_norm": 0.4393063583815029, + "acc_norm_stderr": 0.037842719328874674 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.17647058823529413, + "acc_stderr": 0.0379328118530781, + "acc_norm": 0.17647058823529413, + "acc_norm_stderr": 0.0379328118530781 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.59, + "acc_stderr": 0.049431107042371025, + "acc_norm": 0.59, + "acc_norm_stderr": 0.049431107042371025 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.4340425531914894, + "acc_stderr": 0.03240038086792747, + "acc_norm": 0.4340425531914894, + "acc_norm_stderr": 0.03240038086792747 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.2543859649122807, + "acc_stderr": 0.040969851398436716, + "acc_norm": 0.2543859649122807, + "acc_norm_stderr": 0.040969851398436716 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.42758620689655175, + "acc_stderr": 0.04122737111370332, + "acc_norm": 0.42758620689655175, + "acc_norm_stderr": 0.04122737111370332 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.29894179894179895, + "acc_stderr": 0.023577604791655823, + "acc_norm": 0.29894179894179895, + "acc_norm_stderr": 0.023577604791655823 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.30952380952380953, + "acc_stderr": 0.04134913018303316, + "acc_norm": 0.30952380952380953, + "acc_norm_stderr": 0.04134913018303316 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.32, + "acc_stderr": 0.04688261722621504, + "acc_norm": 0.32, + "acc_norm_stderr": 0.04688261722621504 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.5419354838709678, + "acc_stderr": 0.028343787250540618, + "acc_norm": 0.5419354838709678, + "acc_norm_stderr": 0.028343787250540618 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.35467980295566504, + "acc_stderr": 0.0336612448905145, + "acc_norm": 0.35467980295566504, + "acc_norm_stderr": 0.0336612448905145 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.46, + "acc_stderr": 0.05009082659620332, + "acc_norm": 0.46, + "acc_norm_stderr": 0.05009082659620332 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.6363636363636364, + "acc_stderr": 0.03756335775187897, + "acc_norm": 0.6363636363636364, + "acc_norm_stderr": 0.03756335775187897 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.6111111111111112, + "acc_stderr": 0.0347327959083696, + "acc_norm": 0.6111111111111112, + "acc_norm_stderr": 0.0347327959083696 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.7098445595854922, + "acc_stderr": 0.03275264467791516, + "acc_norm": 
0.7098445595854922, + "acc_norm_stderr": 0.03275264467791516 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.44358974358974357, + "acc_stderr": 0.025189149894764198, + "acc_norm": 0.44358974358974357, + "acc_norm_stderr": 0.025189149894764198 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.24444444444444444, + "acc_stderr": 0.02620276653465215, + "acc_norm": 0.24444444444444444, + "acc_norm_stderr": 0.02620276653465215 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.4411764705882353, + "acc_stderr": 0.0322529423239964, + "acc_norm": 0.4411764705882353, + "acc_norm_stderr": 0.0322529423239964 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.2781456953642384, + "acc_stderr": 0.03658603262763743, + "acc_norm": 0.2781456953642384, + "acc_norm_stderr": 0.03658603262763743 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.6678899082568808, + "acc_stderr": 0.020192682985423337, + "acc_norm": 0.6678899082568808, + "acc_norm_stderr": 0.020192682985423337 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.3194444444444444, + "acc_stderr": 0.0317987634217685, + "acc_norm": 0.3194444444444444, + "acc_norm_stderr": 0.0317987634217685 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.6470588235294118, + "acc_stderr": 0.03354092437591519, + "acc_norm": 0.6470588235294118, + "acc_norm_stderr": 0.03354092437591519 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.6835443037974683, + "acc_stderr": 0.030274974880218977, + "acc_norm": 0.6835443037974683, + "acc_norm_stderr": 0.030274974880218977 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.5919282511210763, + "acc_stderr": 0.03298574607842822, + "acc_norm": 0.5919282511210763, + "acc_norm_stderr": 0.03298574607842822 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.6106870229007634, + "acc_stderr": 0.04276486542814591, + "acc_norm": 0.6106870229007634, + "acc_norm_stderr": 0.04276486542814591 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.6446280991735537, + "acc_stderr": 0.0436923632657398, + "acc_norm": 0.6446280991735537, + "acc_norm_stderr": 0.0436923632657398 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.5462962962962963, + "acc_stderr": 0.04812917324536823, + "acc_norm": 0.5462962962962963, + "acc_norm_stderr": 0.04812917324536823 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.48466257668711654, + "acc_stderr": 0.039265223787088424, + "acc_norm": 0.48466257668711654, + "acc_norm_stderr": 0.039265223787088424 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.375, + "acc_stderr": 0.04595091388086298, + "acc_norm": 0.375, + "acc_norm_stderr": 0.04595091388086298 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.6504854368932039, + "acc_stderr": 0.04721188506097172, + "acc_norm": 0.6504854368932039, + "acc_norm_stderr": 0.04721188506097172 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.7264957264957265, + "acc_stderr": 0.02920254015343118, + "acc_norm": 0.7264957264957265, + "acc_norm_stderr": 0.02920254015343118 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.58, + "acc_stderr": 0.04960449637488583, + "acc_norm": 0.58, + "acc_norm_stderr": 0.04960449637488583 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.6679438058748404, + "acc_stderr": 0.016841174655295724, + "acc_norm": 0.6679438058748404, + "acc_norm_stderr": 0.016841174655295724 + }, + 
"harness|hendrycksTest-moral_disputes|5": { + "acc": 0.5317919075144508, + "acc_stderr": 0.026864624366756646, + "acc_norm": 0.5317919075144508, + "acc_norm_stderr": 0.026864624366756646 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.264804469273743, + "acc_stderr": 0.014756906483260657, + "acc_norm": 0.264804469273743, + "acc_norm_stderr": 0.014756906483260657 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.5196078431372549, + "acc_stderr": 0.028607893699576066, + "acc_norm": 0.5196078431372549, + "acc_norm_stderr": 0.028607893699576066 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.5819935691318328, + "acc_stderr": 0.028013651891995076, + "acc_norm": 0.5819935691318328, + "acc_norm_stderr": 0.028013651891995076 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.5432098765432098, + "acc_stderr": 0.027716661650194038, + "acc_norm": 0.5432098765432098, + "acc_norm_stderr": 0.027716661650194038 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.3829787234042553, + "acc_stderr": 0.028999080904806178, + "acc_norm": 0.3829787234042553, + "acc_norm_stderr": 0.028999080904806178 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.3683181225554107, + "acc_stderr": 0.012319403369564639, + "acc_norm": 0.3683181225554107, + "acc_norm_stderr": 0.012319403369564639 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.5183823529411765, + "acc_stderr": 0.030352303395351964, + "acc_norm": 0.5183823529411765, + "acc_norm_stderr": 0.030352303395351964 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.45588235294117646, + "acc_stderr": 0.020148939420415738, + "acc_norm": 0.45588235294117646, + "acc_norm_stderr": 0.020148939420415738 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.5545454545454546, + "acc_stderr": 0.047605488214603246, + "acc_norm": 0.5545454545454546, + "acc_norm_stderr": 0.047605488214603246 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.5510204081632653, + "acc_stderr": 0.03184213866687579, + "acc_norm": 0.5510204081632653, + "acc_norm_stderr": 0.03184213866687579 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.6417910447761194, + "acc_stderr": 0.03390393042268813, + "acc_norm": 0.6417910447761194, + "acc_norm_stderr": 0.03390393042268813 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.66, + "acc_stderr": 0.04760952285695237, + "acc_norm": 0.66, + "acc_norm_stderr": 0.04760952285695237 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.41566265060240964, + "acc_stderr": 0.038367221765980515, + "acc_norm": 0.41566265060240964, + "acc_norm_stderr": 0.038367221765980515 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.7076023391812866, + "acc_stderr": 0.03488647713457922, + "acc_norm": 0.7076023391812866, + "acc_norm_stderr": 0.03488647713457922 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.29498164014687883, + "mc1_stderr": 0.015964400965589657, + "mc2": 0.44995354312872166, + "mc2_stderr": 0.014767124906788017 + }, + "all": { + "acc": 0.4886592917372928, + "acc_stderr": 0.03506569549642699, + "acc_norm": 0.49248755559863605, + "acc_norm_stderr": 0.035050737718166664, + "mc1": 0.29498164014687883, + "mc1_stderr": 0.015964400965589657, + "mc2": 0.44995354312872166, + "mc2_stderr": 0.014767124906788017 + } + }, + "versions": { + "harness|arc:challenge|25": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + 
"harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "all": 0 + }, + "config_tasks": { + "harness|arc:challenge": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", 
+ "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task" + }, + "summary_tasks": { + "harness|arc:challenge|25": { + "hashes": { + "hash_examples": "17b0cae357c0259e", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "3722289b79076c44", + "hash_cont_tokens": "e8abf848493b50f7" + }, + "truncated": 0, + "non-truncated": 4687, + "padded": 4687, + "non-padded": 0, + "effective_few_shots": 25.0, + "num_truncated_few_shots": 0 + }, + "harness|hellaswag|10": { + "hashes": { + "hash_examples": "e1768ecb99d7ecf0", + 
"hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "ececd684171f1ef2", + "hash_cont_tokens": "9fe0a5c42e1532db" + }, + "truncated": 0, + "non-truncated": 40168, + "padded": 40113, + "non-padded": 55, + "effective_few_shots": 10.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hashes": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "c54ff61ad0273dd7", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-anatomy|5": { + "hashes": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "be31a1e22aef5f90", + "hash_cont_tokens": "f11971a765cb609f" + }, + "truncated": 0, + "non-truncated": 540, + "padded": 540, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-astronomy|5": { + "hashes": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "277a7b1fad566940", + "hash_cont_tokens": "440a970fadecdc7b" + }, + "truncated": 0, + "non-truncated": 608, + "padded": 608, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-business_ethics|5": { + "hashes": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "ba552605bc116de5", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hashes": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "428c7563d0b98ab9", + "hash_cont_tokens": "7ecd60c25b9bfe5b" + }, + "truncated": 0, + "non-truncated": 1060, + "padded": 1060, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_biology|5": { + "hashes": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "da036601573942e2", + "hash_cont_tokens": "875cde3af7a0ee14" + }, + "truncated": 0, + "non-truncated": 576, + "padded": 576, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_chemistry|5": { + "hashes": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "94e0196d6aded13d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_computer_science|5": { + "hashes": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "6e4d0f4a8d36690b", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_mathematics|5": { + "hashes": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "614054d17109a25d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + 
"non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_medicine|5": { + "hashes": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "081bb2b524defd1c", + "hash_cont_tokens": "702fb6d82ff0d6ac" + }, + "truncated": 0, + "non-truncated": 692, + "padded": 692, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_physics|5": { + "hashes": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "5421d9a1af86cbd4", + "hash_cont_tokens": "f7b8097afc16a47c" + }, + "truncated": 0, + "non-truncated": 408, + "padded": 408, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-computer_security|5": { + "hashes": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "5e6b70ecb333cf18", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hashes": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "c2ef11a87264ceed", + "hash_cont_tokens": "aa0e8bc655f2f641" + }, + "truncated": 0, + "non-truncated": 940, + "padded": 940, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-econometrics|5": { + "hashes": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "ecaccd912a4c3978", + "hash_cont_tokens": "b1cc6e7e9fcd3827" + }, + "truncated": 0, + "non-truncated": 456, + "padded": 456, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hashes": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "1590c84291399be8", + "hash_cont_tokens": "2425a3f084a591ef" + }, + "truncated": 0, + "non-truncated": 580, + "padded": 580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hashes": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "3269597f715b0da1", + "hash_cont_tokens": "bd87bf0c060fd925" + }, + "truncated": 0, + "non-truncated": 1512, + "padded": 1512, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-formal_logic|5": { + "hashes": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "a2800d20f3ab8d7c", + "hash_cont_tokens": "eb8932890e0605db" + }, + "truncated": 0, + "non-truncated": 504, + "padded": 504, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-global_facts|5": { + "hashes": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "94ed44b3772505ad", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + 
"harness|hendrycksTest-high_school_biology|5": { + "hashes": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "24423acb928db768", + "hash_cont_tokens": "1ddcb86d28cde266" + }, + "truncated": 0, + "non-truncated": 1240, + "padded": 1240, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hashes": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "831ff35c474e5cef", + "hash_cont_tokens": "176c8dcff38c5f8f" + }, + "truncated": 0, + "non-truncated": 812, + "padded": 812, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hashes": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "a20a96b44dcc5b30", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hashes": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "5002f4ac8b1562ca", + "hash_cont_tokens": "674fc454bdc5ac93" + }, + "truncated": 0, + "non-truncated": 660, + "padded": 656, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_geography|5": { + "hashes": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "7c5547c7da5bc793", + "hash_cont_tokens": "03a5012b916274ea" + }, + "truncated": 0, + "non-truncated": 792, + "padded": 792, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hashes": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "f62991cb6a496b05", + "hash_cont_tokens": "873d2aab226ba1d8" + }, + "truncated": 0, + "non-truncated": 772, + "padded": 772, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hashes": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "4cef2aff6e3d59ed", + "hash_cont_tokens": "c583432ad27fcfe0" + }, + "truncated": 0, + "non-truncated": 1560, + "padded": 1560, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hashes": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "6e2577ea4082ed2b", + "hash_cont_tokens": "d7907b61bcb8c123" + }, + "truncated": 0, + "non-truncated": 1080, + "padded": 1080, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hashes": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "c5fc9aeb1079c8e4", + "hash_cont_tokens": "f47f041de50333b9" + }, + "truncated": 0, + "non-truncated": 952, + "padded": 952, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_physics|5": { + "hashes": { + 
"hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "555fc385cffa84ca", + "hash_cont_tokens": "0d56317b3e5eedb5" + }, + "truncated": 0, + "non-truncated": 604, + "padded": 604, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hashes": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "febd23cbf9973b7f", + "hash_cont_tokens": "09ba1243e7390c0f" + }, + "truncated": 0, + "non-truncated": 2180, + "padded": 2180, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hashes": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "400e55b56ee6fbd7", + "hash_cont_tokens": "9cc29889c3d3f77d" + }, + "truncated": 0, + "non-truncated": 864, + "padded": 864, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hashes": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "c639cce12a46ebad", + "hash_cont_tokens": "cdd0b3dc06d933e5" + }, + "truncated": 0, + "non-truncated": 816, + "padded": 816, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hashes": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "b9762065cce6f3a6", + "hash_cont_tokens": "e02816433ff28daf" + }, + "truncated": 0, + "non-truncated": 948, + "padded": 948, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_aging|5": { + "hashes": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "541a75f071dcf579", + "hash_cont_tokens": "142a4a8a1138a214" + }, + "truncated": 0, + "non-truncated": 892, + "padded": 892, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_sexuality|5": { + "hashes": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "04269e5c5a257dd9", + "hash_cont_tokens": "bc54813e809b796d" + }, + "truncated": 0, + "non-truncated": 524, + "padded": 524, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-international_law|5": { + "hashes": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "d93ba9d9d38e4397", + "hash_cont_tokens": "8ea8c5ff76a15bca" + }, + "truncated": 0, + "non-truncated": 484, + "padded": 484, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-jurisprudence|5": { + "hashes": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "9eeaccd2698b4f5a", + "hash_cont_tokens": "e3a8cd951b6e3469" + }, + "truncated": 0, + "non-truncated": 432, + "padded": 432, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hashes": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "b4f08f544f2b7576", + 
"hash_cont_tokens": "3e9e0bdc248fd88a" + }, + "truncated": 0, + "non-truncated": 652, + "padded": 648, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-machine_learning|5": { + "hashes": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "900c2a51f1174b9f", + "hash_cont_tokens": "55b12fb138c6a064" + }, + "truncated": 0, + "non-truncated": 448, + "padded": 448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-management|5": { + "hashes": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "6b36efb4689c6eca", + "hash_cont_tokens": "a01d6d39a83c4597" + }, + "truncated": 0, + "non-truncated": 412, + "padded": 412, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-marketing|5": { + "hashes": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "2aaac78a0cfed47a", + "hash_cont_tokens": "6aeaed4d823c98aa" + }, + "truncated": 0, + "non-truncated": 936, + "padded": 936, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-medical_genetics|5": { + "hashes": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "886ca823b41c094a", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-miscellaneous|5": { + "hashes": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "72fd71de7675e7d0", + "hash_cont_tokens": "9b0ab02a64603081" + }, + "truncated": 0, + "non-truncated": 3132, + "padded": 3132, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_disputes|5": { + "hashes": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "f3ca0dd8e7a1eb09", + "hash_cont_tokens": "3b8bbe9108e55ce9" + }, + "truncated": 0, + "non-truncated": 1384, + "padded": 1354, + "non-padded": 30, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hashes": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "3e793631e951f23c", + "hash_cont_tokens": "3e9bfc0362e97330" + }, + "truncated": 0, + "non-truncated": 3580, + "padded": 3580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-nutrition|5": { + "hashes": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "59753c2144ea93af", + "hash_cont_tokens": "23b2dc6ee2da4cfc" + }, + "truncated": 0, + "non-truncated": 1224, + "padded": 1224, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-philosophy|5": { + "hashes": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "bd8d3dbed15a8c34", + "hash_cont_tokens": "9f6ff69d23a48783" + }, + "truncated": 0, + "non-truncated": 1244, + "padded": 1244, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 
+ }, + "harness|hendrycksTest-prehistory|5": { + "hashes": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "3573cd87facbb7c5", + "hash_cont_tokens": "d6458d743d875837" + }, + "truncated": 0, + "non-truncated": 1296, + "padded": 1296, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_accounting|5": { + "hashes": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "17e721bc1a7cbb47", + "hash_cont_tokens": "922a195f53a35662" + }, + "truncated": 0, + "non-truncated": 1128, + "padded": 1128, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_law|5": { + "hashes": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "c9f7583fff66d361", + "hash_cont_tokens": "2e590029ef41fbcd" + }, + "truncated": 0, + "non-truncated": 6136, + "padded": 6136, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_medicine|5": { + "hashes": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "40a933f829116f8d", + "hash_cont_tokens": "7cfee54dbddd5a98" + }, + "truncated": 0, + "non-truncated": 1088, + "padded": 1088, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_psychology|5": { + "hashes": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "0dfb73a8eb3f692c", + "hash_cont_tokens": "a86677b2a45c20e1" + }, + "truncated": 0, + "non-truncated": 2448, + "padded": 2448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-public_relations|5": { + "hashes": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "1710c6ba4c9f3cbd", + "hash_cont_tokens": "0d756ccaae031757" + }, + "truncated": 0, + "non-truncated": 440, + "padded": 440, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-security_studies|5": { + "hashes": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "32a03f1f22a6e103", + "hash_cont_tokens": "b2229bc2cfbf594b" + }, + "truncated": 0, + "non-truncated": 980, + "padded": 980, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-sociology|5": { + "hashes": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "828999f7624cbe7e", + "hash_cont_tokens": "c3a3bdfd177eed5b" + }, + "truncated": 0, + "non-truncated": 804, + "padded": 804, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hashes": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "42054621e718dbee", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-virology|5": { + "hashes": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + 
"hash_input_tokens": "6c4f0aa4dc859c04", + "hash_cont_tokens": "af8b3658088cb37f" + }, + "truncated": 0, + "non-truncated": 664, + "padded": 664, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-world_religions|5": { + "hashes": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "6c75d44e092ff24f", + "hash_cont_tokens": "060118bef6de4e0a" + }, + "truncated": 0, + "non-truncated": 684, + "padded": 684, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|truthfulqa:mc|0": { + "hashes": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "2738d7ed7075faa7", + "hash_cont_tokens": "f5da56a132aab151" + }, + "truncated": 0, + "non-truncated": 9996, + "padded": 9996, + "non-padded": 0, + "effective_few_shots": 0.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "d84d18e9a963753d", + "hash_full_prompts": "12b540783521a8e6", + "hash_input_tokens": "5c73a7dce6ccf737", + "hash_cont_tokens": "71d56183130fecbd" + }, + "total_evaluation_time_secondes": "4470.825008869171", + "truncated": 0, + "non-truncated": 111019, + "padded": 110926, + "non-padded": 93, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/teknium/OpenHermes-7B/results_2023-10-26T05-03-25.636029.json b/eval-results/teknium/OpenHermes-7B/results_2023-10-26T05-03-25.636029.json new file mode 100644 index 0000000000000000000000000000000000000000..813fadd172df42cefefbb8247e78d872f6fd2401 --- /dev/null +++ b/eval-results/teknium/OpenHermes-7B/results_2023-10-26T05-03-25.636029.json @@ -0,0 +1,107 @@ +{ + "config_general": { + "model_name": "teknium/OpenHermes-7B", + "model_sha": "9f55d6eb15f1edd52ee1fd863a220aa682e78a00", + "model_size": "12.61 GB", + "model_dtype": "torch.bfloat16", + "lighteval_sha": "0f318ecf002208468154899217b3ba7c6ae09374", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|drop|3": { + "em": 0.2645763422818792, + "em_stderr": 0.004517352215857921, + "f1": 0.33702810402684713, + "f1_stderr": 0.004480224621998652 + }, + "harness|gsm8k|5": { + "acc": 0.050037907505686124, + "acc_stderr": 0.006005442354577731 + }, + "harness|winogrande|5": { + "acc": 0.745067087608524, + "acc_stderr": 0.012248806969376422 + }, + "all": { + "em": 0.2645763422818792, + "em_stderr": 0.004517352215857921, + "f1": 0.33702810402684713, + "f1_stderr": 0.004480224621998652, + "acc": 0.3975524975571051, + "acc_stderr": 0.009127124661977076 + } + }, + "versions": { + "harness|drop|3": 1, + "harness|gsm8k|5": 0, + "harness|winogrande|5": 0, + "all": 0 + }, + "config_tasks": { + "harness|drop": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|drop|3": { + "hashes": { + "hash_examples": "1d27416e8324e9a3", + "hash_full_prompts": "a5513ff9a741b385", + "hash_input_tokens": "42076f0efbb50aa6", + "hash_cont_tokens": "22ca8cfd3c4f1856" + }, + "truncated": 3, + "non-truncated": 9533, + "padded": 0, + "non-padded": 9536, + "effective_few_shots": 3.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "bda342e47b5099b2", + "hash_cont_tokens": 
"6596ec15ff4fea62" + }, + "truncated": 0, + "non-truncated": 1319, + "padded": 0, + "non-padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "c0bedf98cb040854", + "hash_cont_tokens": "f08975ad6f2d5864" + }, + "truncated": 0, + "non-truncated": 2534, + "padded": 2432, + "non-padded": 102, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "9b4d8993161e637d", + "hash_full_prompts": "08215e527b7e60a5", + "hash_input_tokens": "a12f3e3c934bd78b", + "hash_cont_tokens": "6c6ccd3a078ad85e" + }, + "total_evaluation_time_secondes": "17711.14144730568", + "truncated": 3, + "non-truncated": 13386, + "padded": 2432, + "non-padded": 10957, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/uberkie/metharme-1.3b-finetuned/results_2023-08-12T17-46-30.309807.json b/eval-results/uberkie/metharme-1.3b-finetuned/results_2023-08-12T17-46-30.309807.json new file mode 100644 index 0000000000000000000000000000000000000000..36e374473e1a960e600c7570626e68c683b9588e --- /dev/null +++ b/eval-results/uberkie/metharme-1.3b-finetuned/results_2023-08-12T17-46-30.309807.json @@ -0,0 +1,1365 @@ +{ + "results": { + "harness|arc:challenge|25": { + "acc": 0.1766211604095563, + "acc_stderr": 0.011144042769316501, + "acc_norm": 0.20563139931740615, + "acc_norm_stderr": 0.01181074526074258 + }, + "harness|hellaswag|10": { + "acc": 0.27106154152559253, + "acc_stderr": 0.004435993492583849, + "acc_norm": 0.2802230631348337, + "acc_norm_stderr": 0.004481902637505662 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.17, + "acc_stderr": 0.03775251680686371, + "acc_norm": 0.17, + "acc_norm_stderr": 0.03775251680686371 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.3037037037037037, + "acc_stderr": 0.039725528847851375, + "acc_norm": 0.3037037037037037, + "acc_norm_stderr": 0.039725528847851375 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.2631578947368421, + "acc_stderr": 0.03583496176361065, + "acc_norm": 0.2631578947368421, + "acc_norm_stderr": 0.03583496176361065 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.26, + "acc_stderr": 0.0440844002276808, + "acc_norm": 0.26, + "acc_norm_stderr": 0.0440844002276808 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.2037735849056604, + "acc_stderr": 0.0247907845017754, + "acc_norm": 0.2037735849056604, + "acc_norm_stderr": 0.0247907845017754 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.25, + "acc_stderr": 0.03621034121889507, + "acc_norm": 0.25, + "acc_norm_stderr": 0.03621034121889507 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.34, + "acc_stderr": 0.04760952285695236, + "acc_norm": 0.34, + "acc_norm_stderr": 0.04760952285695236 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.27, + "acc_stderr": 0.04461960433384739, + "acc_norm": 0.27, + "acc_norm_stderr": 0.04461960433384739 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.22, + "acc_stderr": 0.041633319989322695, + "acc_norm": 0.22, + "acc_norm_stderr": 0.041633319989322695 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.3468208092485549, + "acc_stderr": 0.03629146670159663, + "acc_norm": 0.3468208092485549, + "acc_norm_stderr": 0.03629146670159663 + }, + 
"harness|hendrycksTest-college_physics|5": { + "acc": 0.22549019607843138, + "acc_stderr": 0.041583075330832865, + "acc_norm": 0.22549019607843138, + "acc_norm_stderr": 0.041583075330832865 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.23, + "acc_stderr": 0.04229525846816505, + "acc_norm": 0.23, + "acc_norm_stderr": 0.04229525846816505 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.2680851063829787, + "acc_stderr": 0.028957342788342347, + "acc_norm": 0.2680851063829787, + "acc_norm_stderr": 0.028957342788342347 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.21052631578947367, + "acc_stderr": 0.03835153954399421, + "acc_norm": 0.21052631578947367, + "acc_norm_stderr": 0.03835153954399421 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.2413793103448276, + "acc_stderr": 0.03565998174135302, + "acc_norm": 0.2413793103448276, + "acc_norm_stderr": 0.03565998174135302 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.2566137566137566, + "acc_stderr": 0.022494510767503154, + "acc_norm": 0.2566137566137566, + "acc_norm_stderr": 0.022494510767503154 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.15079365079365079, + "acc_stderr": 0.03200686497287392, + "acc_norm": 0.15079365079365079, + "acc_norm_stderr": 0.03200686497287392 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.2, + "acc_stderr": 0.04020151261036846, + "acc_norm": 0.2, + "acc_norm_stderr": 0.04020151261036846 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.3064516129032258, + "acc_stderr": 0.026226485652553873, + "acc_norm": 0.3064516129032258, + "acc_norm_stderr": 0.026226485652553873 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.29064039408866993, + "acc_stderr": 0.0319474007226554, + "acc_norm": 0.29064039408866993, + "acc_norm_stderr": 0.0319474007226554 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.26, + "acc_stderr": 0.04408440022768077, + "acc_norm": 0.26, + "acc_norm_stderr": 0.04408440022768077 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.24242424242424243, + "acc_stderr": 0.03346409881055953, + "acc_norm": 0.24242424242424243, + "acc_norm_stderr": 0.03346409881055953 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.2878787878787879, + "acc_stderr": 0.03225883512300993, + "acc_norm": 0.2878787878787879, + "acc_norm_stderr": 0.03225883512300993 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.26424870466321243, + "acc_stderr": 0.031821550509166484, + "acc_norm": 0.26424870466321243, + "acc_norm_stderr": 0.031821550509166484 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.2205128205128205, + "acc_stderr": 0.021020672680827912, + "acc_norm": 0.2205128205128205, + "acc_norm_stderr": 0.021020672680827912 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.2851851851851852, + "acc_stderr": 0.027528599210340492, + "acc_norm": 0.2851851851851852, + "acc_norm_stderr": 0.027528599210340492 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.31092436974789917, + "acc_stderr": 0.030066761582977934, + "acc_norm": 0.31092436974789917, + "acc_norm_stderr": 0.030066761582977934 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.23178807947019867, + "acc_stderr": 0.03445406271987054, + "acc_norm": 0.23178807947019867, + "acc_norm_stderr": 0.03445406271987054 + }, + 
"harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.24770642201834864, + "acc_stderr": 0.018508143602547822, + "acc_norm": 0.24770642201834864, + "acc_norm_stderr": 0.018508143602547822 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.4722222222222222, + "acc_stderr": 0.0340470532865388, + "acc_norm": 0.4722222222222222, + "acc_norm_stderr": 0.0340470532865388 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.25980392156862747, + "acc_stderr": 0.030778554678693254, + "acc_norm": 0.25980392156862747, + "acc_norm_stderr": 0.030778554678693254 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.22784810126582278, + "acc_stderr": 0.027303484599069422, + "acc_norm": 0.22784810126582278, + "acc_norm_stderr": 0.027303484599069422 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.2645739910313901, + "acc_stderr": 0.029605103217038315, + "acc_norm": 0.2645739910313901, + "acc_norm_stderr": 0.029605103217038315 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.22900763358778625, + "acc_stderr": 0.036853466317118506, + "acc_norm": 0.22900763358778625, + "acc_norm_stderr": 0.036853466317118506 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.256198347107438, + "acc_stderr": 0.03984979653302871, + "acc_norm": 0.256198347107438, + "acc_norm_stderr": 0.03984979653302871 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.2222222222222222, + "acc_stderr": 0.040191074725573483, + "acc_norm": 0.2222222222222222, + "acc_norm_stderr": 0.040191074725573483 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.25766871165644173, + "acc_stderr": 0.03436150827846917, + "acc_norm": 0.25766871165644173, + "acc_norm_stderr": 0.03436150827846917 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.21428571428571427, + "acc_stderr": 0.038946411200447915, + "acc_norm": 0.21428571428571427, + "acc_norm_stderr": 0.038946411200447915 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.1941747572815534, + "acc_stderr": 0.03916667762822583, + "acc_norm": 0.1941747572815534, + "acc_norm_stderr": 0.03916667762822583 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.19658119658119658, + "acc_stderr": 0.02603538609895129, + "acc_norm": 0.19658119658119658, + "acc_norm_stderr": 0.02603538609895129 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.29, + "acc_stderr": 0.045604802157206845, + "acc_norm": 0.29, + "acc_norm_stderr": 0.045604802157206845 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.22094508301404853, + "acc_stderr": 0.01483620516733357, + "acc_norm": 0.22094508301404853, + "acc_norm_stderr": 0.01483620516733357 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.2514450867052023, + "acc_stderr": 0.02335736578587404, + "acc_norm": 0.2514450867052023, + "acc_norm_stderr": 0.02335736578587404 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.2324022346368715, + "acc_stderr": 0.014125968754673384, + "acc_norm": 0.2324022346368715, + "acc_norm_stderr": 0.014125968754673384 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.28431372549019607, + "acc_stderr": 0.02582916327275747, + "acc_norm": 0.28431372549019607, + "acc_norm_stderr": 0.02582916327275747 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.2604501607717042, + "acc_stderr": 0.02492672322484556, + "acc_norm": 0.2604501607717042, + "acc_norm_stderr": 0.02492672322484556 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.24074074074074073, 
+ "acc_stderr": 0.02378858355165854, + "acc_norm": 0.24074074074074073, + "acc_norm_stderr": 0.02378858355165854 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.20921985815602837, + "acc_stderr": 0.024264769439988478, + "acc_norm": 0.20921985815602837, + "acc_norm_stderr": 0.024264769439988478 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.2438070404172099, + "acc_stderr": 0.010966507972178475, + "acc_norm": 0.2438070404172099, + "acc_norm_stderr": 0.010966507972178475 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.3860294117647059, + "acc_stderr": 0.029573269134411127, + "acc_norm": 0.3860294117647059, + "acc_norm_stderr": 0.029573269134411127 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.2238562091503268, + "acc_stderr": 0.016863008585416613, + "acc_norm": 0.2238562091503268, + "acc_norm_stderr": 0.016863008585416613 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.2636363636363636, + "acc_stderr": 0.04220224692971987, + "acc_norm": 0.2636363636363636, + "acc_norm_stderr": 0.04220224692971987 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.19591836734693877, + "acc_stderr": 0.025409301953225678, + "acc_norm": 0.19591836734693877, + "acc_norm_stderr": 0.025409301953225678 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.24875621890547264, + "acc_stderr": 0.030567675938916707, + "acc_norm": 0.24875621890547264, + "acc_norm_stderr": 0.030567675938916707 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.23, + "acc_stderr": 0.04229525846816505, + "acc_norm": 0.23, + "acc_norm_stderr": 0.04229525846816505 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.26506024096385544, + "acc_stderr": 0.03436024037944967, + "acc_norm": 0.26506024096385544, + "acc_norm_stderr": 0.03436024037944967 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.19883040935672514, + "acc_stderr": 0.03061111655743253, + "acc_norm": 0.19883040935672514, + "acc_norm_stderr": 0.03061111655743253 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.22643818849449204, + "mc1_stderr": 0.014651337324602585, + "mc2": 0.4479962997290185, + "mc2_stderr": 0.015735133357613243 + }, + "all": { + "acc": 0.2516234850727529, + "acc_stderr": 0.03142007295614116, + "acc_norm": 0.252270464064568, + "acc_norm_stderr": 0.031432151119469086, + "mc1": 0.22643818849449204, + "mc1_stderr": 0.014651337324602585, + "mc2": 0.4479962997290185, + "mc2_stderr": 0.015735133357613243 + } + }, + "versions": { + "harness|arc:challenge|25": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + 
"harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "all": 0 + }, + "config_general": { + "model_name": "uberkie/metharme-1.3b-finetuned", + "model_sha": "7335669475711806eb04f8850e4eef91a9d2677d", + "model_dtype": "torch.float16", + "lighteval_sha": "efe93333f9f25e7d48cc67a6bf362e6d576f727b", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null + }, + "config_tasks": { + "harness|arc:challenge": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness 
task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task" + }, + "summary_tasks": { + "harness|arc:challenge|25": { + "hashes": { + "hash_examples": "17b0cae357c0259e", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "573b1b078b6e9deb", + "hash_cont_tokens": "22424bcffb42ecdf" + }, + "truncated": 0, + "non-truncated": 4687, + "padded": 4687, + "non-padded": 0, + "effective_few_shots": 25.0, + "num_truncated_few_shots": 0 + }, + "harness|hellaswag|10": { + "hashes": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "f0fd0caf4d4c1110", + "hash_cont_tokens": "62a15ef112ea07d6" + }, + "truncated": 0, + "non-truncated": 40168, + "padded": 40123, + "non-padded": 45, + "effective_few_shots": 10.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hashes": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "f076ac6b177ca28c", + "hash_cont_tokens": 
"74c639e56bb475af" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-anatomy|5": { + "hashes": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "059827606e6b0780", + "hash_cont_tokens": "ec7e2288ab5f1ce9" + }, + "truncated": 0, + "non-truncated": 540, + "padded": 540, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-astronomy|5": { + "hashes": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "1dd0dab88aa9e4b2", + "hash_cont_tokens": "d7e922da5bc6d1bf" + }, + "truncated": 0, + "non-truncated": 608, + "padded": 608, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-business_ethics|5": { + "hashes": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "d51eb5246cbe2173", + "hash_cont_tokens": "08933598b321179c" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hashes": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "2337a7f17800c6ec", + "hash_cont_tokens": "bc82b3cc5072f164" + }, + "truncated": 0, + "non-truncated": 1060, + "padded": 1060, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_biology|5": { + "hashes": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "e394ebbb8ceace76", + "hash_cont_tokens": "3bc45e0c4b6d612d" + }, + "truncated": 0, + "non-truncated": 576, + "padded": 576, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_chemistry|5": { + "hashes": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "9221fbdf710a6f67", + "hash_cont_tokens": "74c639e56bb475af" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_computer_science|5": { + "hashes": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "ebe2748d21b2ba41", + "hash_cont_tokens": "d839b8186e0f3d94" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_mathematics|5": { + "hashes": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "bfecefb08ffb7faa", + "hash_cont_tokens": "3c16f9c45a7a7272" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_medicine|5": { + "hashes": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "2ac8aec9025dc58b", + "hash_cont_tokens": "16f654508cdc19c4" + }, + "truncated": 0, + "non-truncated": 692, + "padded": 680, + "non-padded": 12, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 
+ }, + "harness|hendrycksTest-college_physics|5": { + "hashes": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "faf44c77f43368ef", + "hash_cont_tokens": "a3a24586c7218684" + }, + "truncated": 0, + "non-truncated": 408, + "padded": 408, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-computer_security|5": { + "hashes": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "280c7f12abde10a5", + "hash_cont_tokens": "74c639e56bb475af" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hashes": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "217a841c86d2d992", + "hash_cont_tokens": "43818b3dc0c7496f" + }, + "truncated": 0, + "non-truncated": 940, + "padded": 940, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-econometrics|5": { + "hashes": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "354267c0f98aad3b", + "hash_cont_tokens": "4f0a3e41169314a8" + }, + "truncated": 0, + "non-truncated": 456, + "padded": 456, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hashes": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "4f5e8d051d04dde0", + "hash_cont_tokens": "7e14ccd1e2688bb8" + }, + "truncated": 0, + "non-truncated": 580, + "padded": 580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hashes": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "cd12bec1d5448dda", + "hash_cont_tokens": "317e29ee6bba387d" + }, + "truncated": 0, + "non-truncated": 1512, + "padded": 1488, + "non-padded": 24, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-formal_logic|5": { + "hashes": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "c549e395850984fe", + "hash_cont_tokens": "c01a9b75f55e32e0" + }, + "truncated": 0, + "non-truncated": 504, + "padded": 504, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-global_facts|5": { + "hashes": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "81b06f5caa221f97", + "hash_cont_tokens": "74c639e56bb475af" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_biology|5": { + "hashes": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "ad626d781102fe51", + "hash_cont_tokens": "edb2063e955bd5ca" + }, + "truncated": 0, + "non-truncated": 1240, + "padded": 1240, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hashes": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": 
"0e3e5f5d9246482a", + "hash_input_tokens": "2c0d3f2eacc6bbd5", + "hash_cont_tokens": "8000de09bc1dc113" + }, + "truncated": 0, + "non-truncated": 812, + "padded": 812, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hashes": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "aada51d0571db37b", + "hash_cont_tokens": "dcd6a0ada4ab8e0b" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hashes": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "6e47d696116edd01", + "hash_cont_tokens": "47a5e5973f50fe17" + }, + "truncated": 660, + "non-truncated": 0, + "padded": 0, + "non-padded": 660, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_geography|5": { + "hashes": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "0e8ee6c9e572e3c4", + "hash_cont_tokens": "812f79117b9593de" + }, + "truncated": 0, + "non-truncated": 792, + "padded": 792, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hashes": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "8fa2bf90de3b07e7", + "hash_cont_tokens": "b4c405890ebd3ee1" + }, + "truncated": 0, + "non-truncated": 772, + "padded": 772, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hashes": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "fabb8f176276af2f", + "hash_cont_tokens": "8d468d84a686647d" + }, + "truncated": 0, + "non-truncated": 1560, + "padded": 1560, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hashes": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "3e86d13ef021476a", + "hash_cont_tokens": "e5d02f8f1c5dcf31" + }, + "truncated": 0, + "non-truncated": 1080, + "padded": 1069, + "non-padded": 11, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hashes": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "a132b5e9c9531b36", + "hash_cont_tokens": "4c32e38c066727bc" + }, + "truncated": 0, + "non-truncated": 952, + "padded": 952, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_physics|5": { + "hashes": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "f8f6fe5143776cb4", + "hash_cont_tokens": "9416ad85fd6f4a2c" + }, + "truncated": 0, + "non-truncated": 604, + "padded": 604, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hashes": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "e28121967b27a315", + 
"hash_cont_tokens": "57cc212706ddcdf4" + }, + "truncated": 0, + "non-truncated": 2180, + "padded": 2180, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hashes": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "bdbe90efb4a1c4ce", + "hash_cont_tokens": "8c5c954092a64343" + }, + "truncated": 0, + "non-truncated": 864, + "padded": 864, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hashes": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "b8f58f05dc082011", + "hash_cont_tokens": "e5ab34a54e3f5b7c" + }, + "truncated": 816, + "non-truncated": 0, + "padded": 0, + "non-padded": 816, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hashes": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "3af911bf93093a85", + "hash_cont_tokens": "f3276c80ce1b205b" + }, + "truncated": 0, + "non-truncated": 948, + "padded": 948, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_aging|5": { + "hashes": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "1dd2240eb90b9a70", + "hash_cont_tokens": "7982edf99219e1b0" + }, + "truncated": 0, + "non-truncated": 892, + "padded": 892, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_sexuality|5": { + "hashes": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "f3de2f8181824a79", + "hash_cont_tokens": "ed73d516c5552dd0" + }, + "truncated": 0, + "non-truncated": 524, + "padded": 524, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-international_law|5": { + "hashes": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "0c2a1dd63cc74137", + "hash_cont_tokens": "549d9b32b8a90e4e" + }, + "truncated": 0, + "non-truncated": 484, + "padded": 484, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-jurisprudence|5": { + "hashes": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "08e3527985f33aab", + "hash_cont_tokens": "ddf5241e450210d6" + }, + "truncated": 0, + "non-truncated": 432, + "padded": 432, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hashes": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "bf7216a648529f68", + "hash_cont_tokens": "eb791fcbee9e0682" + }, + "truncated": 0, + "non-truncated": 652, + "padded": 648, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-machine_learning|5": { + "hashes": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "28f5891c956afd65", + "hash_cont_tokens": "c66b1f3b46001b09" + }, + "truncated": 0, + "non-truncated": 448, + "padded": 448, + "non-padded": 0, + 
"effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-management|5": { + "hashes": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "6de88b824d4f64c3", + "hash_cont_tokens": "27795e9c98bdeda8" + }, + "truncated": 0, + "non-truncated": 412, + "padded": 412, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-marketing|5": { + "hashes": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "5ef855d01044fd83", + "hash_cont_tokens": "874c5b0b496cbe8a" + }, + "truncated": 0, + "non-truncated": 936, + "padded": 936, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-medical_genetics|5": { + "hashes": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "1840e0b96d7e619e", + "hash_cont_tokens": "74c639e56bb475af" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-miscellaneous|5": { + "hashes": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "02483f6b53dc13ac", + "hash_cont_tokens": "313ee361fbdbab3c" + }, + "truncated": 0, + "non-truncated": 3132, + "padded": 3132, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_disputes|5": { + "hashes": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "93202e79d594dde4", + "hash_cont_tokens": "fe7747dc69c4909e" + }, + "truncated": 0, + "non-truncated": 1384, + "padded": 1356, + "non-padded": 28, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hashes": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "41c03f41d2ba9fe7", + "hash_cont_tokens": "e0d0ad58a3f1ff22" + }, + "truncated": 0, + "non-truncated": 3580, + "padded": 3580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-nutrition|5": { + "hashes": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "d83bcb6dd08809ac", + "hash_cont_tokens": "c55a10a018de0228" + }, + "truncated": 0, + "non-truncated": 1224, + "padded": 1224, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-philosophy|5": { + "hashes": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "65c70474c8a5d205", + "hash_cont_tokens": "7916d26928435f1a" + }, + "truncated": 0, + "non-truncated": 1244, + "padded": 1244, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-prehistory|5": { + "hashes": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "4d4126ac9a91ac47", + "hash_cont_tokens": "81836c52a10e6ffd" + }, + "truncated": 0, + "non-truncated": 1296, + "padded": 1296, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_accounting|5": { + "hashes": { + "hash_examples": "8d377597916cd07e", + 
"hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "592f80ad364d686a", + "hash_cont_tokens": "f5d669014a273483" + }, + "truncated": 0, + "non-truncated": 1128, + "padded": 1128, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_law|5": { + "hashes": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "7f837322b1b62ac1", + "hash_cont_tokens": "6b31cf265df9b81b" + }, + "truncated": 16, + "non-truncated": 6120, + "padded": 6120, + "non-padded": 16, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_medicine|5": { + "hashes": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "05a8ef0dd10b4bba", + "hash_cont_tokens": "4b3ac60441ad14ec" + }, + "truncated": 0, + "non-truncated": 1088, + "padded": 1088, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_psychology|5": { + "hashes": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "3c7944f0b2c49f64", + "hash_cont_tokens": "f139af481f2a9e74" + }, + "truncated": 0, + "non-truncated": 2448, + "padded": 2448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-public_relations|5": { + "hashes": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "637e934bb716d5ec", + "hash_cont_tokens": "ca79966b90cda0ea" + }, + "truncated": 0, + "non-truncated": 440, + "padded": 440, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-security_studies|5": { + "hashes": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "3bad229573ed6a9c", + "hash_cont_tokens": "952a2e479fc3a83e" + }, + "truncated": 0, + "non-truncated": 980, + "padded": 980, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-sociology|5": { + "hashes": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "70a479e96d02d5d8", + "hash_cont_tokens": "f49476cf49b37d7c" + }, + "truncated": 0, + "non-truncated": 804, + "padded": 804, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hashes": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "0d690fc0db462440", + "hash_cont_tokens": "74c639e56bb475af" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-virology|5": { + "hashes": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "4b0fdf8e692dd640", + "hash_cont_tokens": "0065c4bbe6134c1c" + }, + "truncated": 0, + "non-truncated": 664, + "padded": 664, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-world_religions|5": { + "hashes": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "cfd7092dc8aacd96", + "hash_cont_tokens": "9a178e9ec050bf3e" + }, + "truncated": 0, + 
"non-truncated": 684, + "padded": 684, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|truthfulqa:mc|0": { + "hashes": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "e820abadeb7ebfb3", + "hash_cont_tokens": "7f48ddfffa64eb41" + }, + "truncated": 0, + "non-truncated": 9996, + "padded": 9996, + "non-padded": 0, + "effective_few_shots": 0.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "d84d18e9a963753d", + "hash_full_prompts": "12b540783521a8e6", + "hash_input_tokens": "c86f5765cd1e9dab", + "hash_cont_tokens": "e41a73fcf362857f" + }, + "total_evaluation_time_secondes": "904.7156593799591", + "truncated": 1492, + "non-truncated": 109527, + "padded": 109403, + "non-padded": 1616, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/uukuguy/CollectiveCognition-v1.1-Mistral-7B-dare-0.85/results_2023-11-23T19-19-22.420919.json b/eval-results/uukuguy/CollectiveCognition-v1.1-Mistral-7B-dare-0.85/results_2023-11-23T19-19-22.420919.json new file mode 100644 index 0000000000000000000000000000000000000000..7f43d1b5ccce04697383a66711ccd5293f94cf0b --- /dev/null +++ b/eval-results/uukuguy/CollectiveCognition-v1.1-Mistral-7B-dare-0.85/results_2023-11-23T19-19-22.420919.json @@ -0,0 +1,1435 @@ +{ + "config_general": { + "lighteval_sha": "9ffc410f6c40b8cfefe7167cb47aefe69ced61e1", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "", + "start_time": 630559.23747284, + "end_time": 645262.086728073, + "total_evaluation_time_secondes": "14702.849255232955", + "model_name": "uukuguy/CollectiveCognition-v1.1-Mistral-7B-dare-0.85", + "model_sha": "7ecfa4c5b100565bf8cfdfa7442e9772d28a9a23", + "model_dtype": "torch.float16", + "model_size": "13.99 GB" + }, + "results": { + "harness|arc:challenge|25": { + "acc": 0.5802047781569966, + "acc_stderr": 0.014422181226303026, + "acc_norm": 0.6100682593856656, + "acc_norm_stderr": 0.014252959848892893 + }, + "harness|hellaswag|10": { + "acc": 0.6451902011551484, + "acc_stderr": 0.004774778180345194, + "acc_norm": 0.8430591515634336, + "acc_norm_stderr": 0.0036300159898963996 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.29, + "acc_stderr": 0.045604802157206845, + "acc_norm": 0.29, + "acc_norm_stderr": 0.045604802157206845 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.6370370370370371, + "acc_stderr": 0.04153948404742398, + "acc_norm": 0.6370370370370371, + "acc_norm_stderr": 0.04153948404742398 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.6513157894736842, + "acc_stderr": 0.038781398887976104, + "acc_norm": 0.6513157894736842, + "acc_norm_stderr": 0.038781398887976104 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.58, + "acc_stderr": 0.049604496374885836, + "acc_norm": 0.58, + "acc_norm_stderr": 0.049604496374885836 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.690566037735849, + "acc_stderr": 0.028450154794118637, + "acc_norm": 0.690566037735849, + "acc_norm_stderr": 0.028450154794118637 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.7222222222222222, + "acc_stderr": 0.037455547914624555, + "acc_norm": 0.7222222222222222, + "acc_norm_stderr": 0.037455547914624555 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.52, + "acc_stderr": 0.050211673156867795, + "acc_norm": 0.52, + "acc_norm_stderr": 
0.050211673156867795 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.55, + "acc_stderr": 0.05, + "acc_norm": 0.55, + "acc_norm_stderr": 0.05 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.39, + "acc_stderr": 0.04902071300001975, + "acc_norm": 0.39, + "acc_norm_stderr": 0.04902071300001975 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.6473988439306358, + "acc_stderr": 0.036430371689585475, + "acc_norm": 0.6473988439306358, + "acc_norm_stderr": 0.036430371689585475 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.38235294117647056, + "acc_stderr": 0.04835503696107223, + "acc_norm": 0.38235294117647056, + "acc_norm_stderr": 0.04835503696107223 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.79, + "acc_stderr": 0.04093601807403326, + "acc_norm": 0.79, + "acc_norm_stderr": 0.04093601807403326 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.6, + "acc_stderr": 0.03202563076101735, + "acc_norm": 0.6, + "acc_norm_stderr": 0.03202563076101735 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.5263157894736842, + "acc_stderr": 0.046970851366478626, + "acc_norm": 0.5263157894736842, + "acc_norm_stderr": 0.046970851366478626 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.5793103448275863, + "acc_stderr": 0.0411391498118926, + "acc_norm": 0.5793103448275863, + "acc_norm_stderr": 0.0411391498118926 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.3888888888888889, + "acc_stderr": 0.025107425481137282, + "acc_norm": 0.3888888888888889, + "acc_norm_stderr": 0.025107425481137282 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.3968253968253968, + "acc_stderr": 0.043758884927270605, + "acc_norm": 0.3968253968253968, + "acc_norm_stderr": 0.043758884927270605 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.37, + "acc_stderr": 0.04852365870939099, + "acc_norm": 0.37, + "acc_norm_stderr": 0.04852365870939099 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.7741935483870968, + "acc_stderr": 0.023785577884181012, + "acc_norm": 0.7741935483870968, + "acc_norm_stderr": 0.023785577884181012 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.5320197044334976, + "acc_stderr": 0.035107665979592154, + "acc_norm": 0.5320197044334976, + "acc_norm_stderr": 0.035107665979592154 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.69, + "acc_stderr": 0.04648231987117316, + "acc_norm": 0.69, + "acc_norm_stderr": 0.04648231987117316 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.7818181818181819, + "acc_stderr": 0.032250781083062896, + "acc_norm": 0.7818181818181819, + "acc_norm_stderr": 0.032250781083062896 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.7777777777777778, + "acc_stderr": 0.029620227874790486, + "acc_norm": 0.7777777777777778, + "acc_norm_stderr": 0.029620227874790486 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.8756476683937824, + "acc_stderr": 0.02381447708659355, + "acc_norm": 0.8756476683937824, + "acc_norm_stderr": 0.02381447708659355 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.6641025641025641, + "acc_stderr": 0.023946724741563976, + "acc_norm": 0.6641025641025641, + "acc_norm_stderr": 0.023946724741563976 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.34074074074074073, + "acc_stderr": 0.028897748741131143, + "acc_norm": 
0.34074074074074073, + "acc_norm_stderr": 0.028897748741131143 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.6680672268907563, + "acc_stderr": 0.03058869701378364, + "acc_norm": 0.6680672268907563, + "acc_norm_stderr": 0.03058869701378364 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.33774834437086093, + "acc_stderr": 0.038615575462551684, + "acc_norm": 0.33774834437086093, + "acc_norm_stderr": 0.038615575462551684 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.8220183486238533, + "acc_stderr": 0.016399436366612927, + "acc_norm": 0.8220183486238533, + "acc_norm_stderr": 0.016399436366612927 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.5509259259259259, + "acc_stderr": 0.033922384053216174, + "acc_norm": 0.5509259259259259, + "acc_norm_stderr": 0.033922384053216174 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.7892156862745098, + "acc_stderr": 0.028626547912437406, + "acc_norm": 0.7892156862745098, + "acc_norm_stderr": 0.028626547912437406 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.7679324894514767, + "acc_stderr": 0.027479744550808514, + "acc_norm": 0.7679324894514767, + "acc_norm_stderr": 0.027479744550808514 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.6905829596412556, + "acc_stderr": 0.031024411740572213, + "acc_norm": 0.6905829596412556, + "acc_norm_stderr": 0.031024411740572213 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.7862595419847328, + "acc_stderr": 0.0359546161177469, + "acc_norm": 0.7862595419847328, + "acc_norm_stderr": 0.0359546161177469 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.7933884297520661, + "acc_stderr": 0.03695980128098824, + "acc_norm": 0.7933884297520661, + "acc_norm_stderr": 0.03695980128098824 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.7685185185185185, + "acc_stderr": 0.04077494709252627, + "acc_norm": 0.7685185185185185, + "acc_norm_stderr": 0.04077494709252627 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.7914110429447853, + "acc_stderr": 0.03192193448934724, + "acc_norm": 0.7914110429447853, + "acc_norm_stderr": 0.03192193448934724 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.5, + "acc_stderr": 0.04745789978762494, + "acc_norm": 0.5, + "acc_norm_stderr": 0.04745789978762494 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.8252427184466019, + "acc_stderr": 0.03760178006026621, + "acc_norm": 0.8252427184466019, + "acc_norm_stderr": 0.03760178006026621 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.8717948717948718, + "acc_stderr": 0.02190190511507333, + "acc_norm": 0.8717948717948718, + "acc_norm_stderr": 0.02190190511507333 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.76, + "acc_stderr": 0.042923469599092816, + "acc_norm": 0.76, + "acc_norm_stderr": 0.042923469599092816 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.8173690932311622, + "acc_stderr": 0.013816335389973133, + "acc_norm": 0.8173690932311622, + "acc_norm_stderr": 0.013816335389973133 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.7109826589595376, + "acc_stderr": 0.02440517393578323, + "acc_norm": 0.7109826589595376, + "acc_norm_stderr": 0.02440517393578323 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.32737430167597764, + "acc_stderr": 0.015694238967737383, + "acc_norm": 0.32737430167597764, + "acc_norm_stderr": 0.015694238967737383 + }, + 
"harness|hendrycksTest-nutrition|5": { + "acc": 0.7549019607843137, + "acc_stderr": 0.024630048979824775, + "acc_norm": 0.7549019607843137, + "acc_norm_stderr": 0.024630048979824775 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.7170418006430869, + "acc_stderr": 0.025583062489984824, + "acc_norm": 0.7170418006430869, + "acc_norm_stderr": 0.025583062489984824 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.7345679012345679, + "acc_stderr": 0.024569223600460845, + "acc_norm": 0.7345679012345679, + "acc_norm_stderr": 0.024569223600460845 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.4858156028368794, + "acc_stderr": 0.02981549448368206, + "acc_norm": 0.4858156028368794, + "acc_norm_stderr": 0.02981549448368206 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.45371577574967403, + "acc_stderr": 0.012715404841277738, + "acc_norm": 0.45371577574967403, + "acc_norm_stderr": 0.012715404841277738 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.6617647058823529, + "acc_stderr": 0.028739328513983572, + "acc_norm": 0.6617647058823529, + "acc_norm_stderr": 0.028739328513983572 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.6764705882352942, + "acc_stderr": 0.018926082916083383, + "acc_norm": 0.6764705882352942, + "acc_norm_stderr": 0.018926082916083383 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.6727272727272727, + "acc_stderr": 0.0449429086625209, + "acc_norm": 0.6727272727272727, + "acc_norm_stderr": 0.0449429086625209 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.7346938775510204, + "acc_stderr": 0.028263889943784593, + "acc_norm": 0.7346938775510204, + "acc_norm_stderr": 0.028263889943784593 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.835820895522388, + "acc_stderr": 0.026193923544454125, + "acc_norm": 0.835820895522388, + "acc_norm_stderr": 0.026193923544454125 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.83, + "acc_stderr": 0.0377525168068637, + "acc_norm": 0.83, + "acc_norm_stderr": 0.0377525168068637 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.5481927710843374, + "acc_stderr": 0.03874371556587953, + "acc_norm": 0.5481927710843374, + "acc_norm_stderr": 0.03874371556587953 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.8421052631578947, + "acc_stderr": 0.027966785859160896, + "acc_norm": 0.8421052631578947, + "acc_norm_stderr": 0.027966785859160896 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.3023255813953488, + "mc1_stderr": 0.016077509266133026, + "mc2": 0.44867041308885225, + "mc2_stderr": 0.014511741253113358 + }, + "harness|winogrande|5": { + "acc": 0.7884767166535123, + "acc_stderr": 0.011477747684223194 + }, + "harness|drop|3": { + "em": 0.001572986577181208, + "em_stderr": 0.00040584511324177333, + "f1": 0.06318477348993282, + "f1_stderr": 0.0013946687452644612 + }, + "harness|gsm8k|5": { + "acc": 0.18953752843062927, + "acc_stderr": 0.010795837931896386 + }, + "all": { + "acc": 0.6373539881235634, + "acc_stderr": 0.032200043467933794, + "acc_norm": 0.6462425671540708, + "acc_norm_stderr": 0.032891781056948864, + "mc1": 0.3023255813953488, + "mc1_stderr": 0.016077509266133026, + "mc2": 0.44867041308885225, + "mc2_stderr": 0.014511741253113358, + "em": 0.001572986577181208, + "em_stderr": 0.00040584511324177333, + "f1": 0.06318477348993282, + "f1_stderr": 0.0013946687452644612 + } + }, + "versions": { + "all": 0, + "harness|arc:challenge|25": 0, + "harness|drop|3": 1, + 
"harness|gsm8k|5": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "harness|winogrande|5": 0 + }, + "config_tasks": { + "harness|arc:challenge": "LM Harness task", + "harness|drop": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + 
"harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|arc:challenge|25": { + "hashes": { + "hash_examples": "17b0cae357c0259e", + "hash_full_prompts": "045cbb916e5145c6", + 
"hash_input_tokens": "9bcd0d1d37471713", + "hash_cont_tokens": "289aa98c400841d8" + }, + "truncated": 0, + "non_truncated": 1172, + "padded": 4670, + "non_padded": 17, + "effective_few_shots": 25.0, + "num_truncated_few_shots": 0 + }, + "harness|hellaswag|10": { + "hashes": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "80b8c6d79740318e", + "hash_cont_tokens": "ac460260c3e6efc9" + }, + "truncated": 0, + "non_truncated": 10042, + "padded": 40101, + "non_padded": 67, + "effective_few_shots": 10.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hashes": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "b813d36287c6556c", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-anatomy|5": { + "hashes": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "09dc2380497f7a47", + "hash_cont_tokens": "a52a4f60d98cbe5c" + }, + "truncated": 0, + "non_truncated": 135, + "padded": 540, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-astronomy|5": { + "hashes": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "68ca3220b0fdd1f3", + "hash_cont_tokens": "10f7d8eeba97841d" + }, + "truncated": 0, + "non_truncated": 152, + "padded": 608, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-business_ethics|5": { + "hashes": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "bd14ef1320de241e", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hashes": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "d96186ab98017c43", + "hash_cont_tokens": "edef9975ba9165b5" + }, + "truncated": 0, + "non_truncated": 265, + "padded": 1060, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_biology|5": { + "hashes": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "424136b34e95b200", + "hash_cont_tokens": "0aa103ec6602280b" + }, + "truncated": 0, + "non_truncated": 144, + "padded": 576, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_chemistry|5": { + "hashes": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "8dd8b80e336bbe54", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_computer_science|5": { + "hashes": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "145d4cef8ca2261d", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + 
"effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_mathematics|5": { + "hashes": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "561995d32d2b25c4", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_medicine|5": { + "hashes": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "6a258a9d4418599c", + "hash_cont_tokens": "1979021dbc698754" + }, + "truncated": 0, + "non_truncated": 173, + "padded": 692, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_physics|5": { + "hashes": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "fa5e0d5b5f97b66a", + "hash_cont_tokens": "7cf7fe2bab00acbd" + }, + "truncated": 0, + "non_truncated": 102, + "padded": 408, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-computer_security|5": { + "hashes": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "07d27397edfae492", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hashes": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "da5e6c3c8eb17da6", + "hash_cont_tokens": "903f64eed2b0d217" + }, + "truncated": 0, + "non_truncated": 235, + "padded": 940, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-econometrics|5": { + "hashes": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "f6ba8e358bdb523e", + "hash_cont_tokens": "721ae6c5302c4bf2" + }, + "truncated": 0, + "non_truncated": 114, + "padded": 456, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hashes": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "b2459da4c5ca8590", + "hash_cont_tokens": "15a738960ed3e587" + }, + "truncated": 0, + "non_truncated": 145, + "padded": 575, + "non_padded": 5, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hashes": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "0b969d9ad706a13a", + "hash_cont_tokens": "c96470462fc71683" + }, + "truncated": 0, + "non_truncated": 378, + "padded": 1512, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-formal_logic|5": { + "hashes": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "02bc3eb5f90da86e", + "hash_cont_tokens": "0e1ce025c9d6ee7e" + }, + "truncated": 0, + "non_truncated": 126, + "padded": 504, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-global_facts|5": { + "hashes": { + "hash_examples": 
"371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "3d5106918bcbeb43", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_biology|5": { + "hashes": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "7b089392db2dabbd", + "hash_cont_tokens": "e34d57f7d3c4ca16" + }, + "truncated": 0, + "non_truncated": 310, + "padded": 1240, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hashes": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "ba90b2ffed1c067d", + "hash_cont_tokens": "e8482d44df4b3740" + }, + "truncated": 0, + "non_truncated": 203, + "padded": 812, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hashes": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "60eeec309ef0717f", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hashes": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "5e5e8bf3808e0ead", + "hash_cont_tokens": "d63e679a49418339" + }, + "truncated": 0, + "non_truncated": 165, + "padded": 656, + "non_padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_geography|5": { + "hashes": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "4da9b741d4e7ea78", + "hash_cont_tokens": "d78483e286d06f1a" + }, + "truncated": 0, + "non_truncated": 198, + "padded": 792, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hashes": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "acb4bc872ac86ed7", + "hash_cont_tokens": "691cdff71ff5fe57" + }, + "truncated": 0, + "non_truncated": 193, + "padded": 772, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hashes": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "840fc6403eb69ab0", + "hash_cont_tokens": "d5ad4c5bdca967ad" + }, + "truncated": 0, + "non_truncated": 390, + "padded": 1560, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hashes": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "3629a7f2cd17faeb", + "hash_cont_tokens": "8f631ca5687dd0d4" + }, + "truncated": 0, + "non_truncated": 270, + "padded": 1080, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hashes": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + 
"hash_input_tokens": "6846f684260e3997", + "hash_cont_tokens": "7321048a28451473" + }, + "truncated": 0, + "non_truncated": 238, + "padded": 952, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_physics|5": { + "hashes": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "85aee25d6bdad94a", + "hash_cont_tokens": "bb137581f269861c" + }, + "truncated": 0, + "non_truncated": 151, + "padded": 604, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hashes": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "290b66d6d666a35f", + "hash_cont_tokens": "b455cab2675bd863" + }, + "truncated": 0, + "non_truncated": 545, + "padded": 2180, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hashes": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "a77a7668b437bc82", + "hash_cont_tokens": "1b3196fec7e58037" + }, + "truncated": 0, + "non_truncated": 216, + "padded": 864, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hashes": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "63548c7fa9ba7a78", + "hash_cont_tokens": "a331dedc2aa01b3e" + }, + "truncated": 0, + "non_truncated": 204, + "padded": 816, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hashes": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "83c5da18bfa50812", + "hash_cont_tokens": "d0fbe030b8c8c2bf" + }, + "truncated": 0, + "non_truncated": 237, + "padded": 948, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_aging|5": { + "hashes": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "bebbd11f22006685", + "hash_cont_tokens": "1dd29c3755494850" + }, + "truncated": 0, + "non_truncated": 223, + "padded": 892, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_sexuality|5": { + "hashes": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "7b85ee9b8ee54f4f", + "hash_cont_tokens": "c85573f663c10691" + }, + "truncated": 0, + "non_truncated": 131, + "padded": 524, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-international_law|5": { + "hashes": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "7bfc55ab7065943e", + "hash_cont_tokens": "d263804ba918154f" + }, + "truncated": 0, + "non_truncated": 121, + "padded": 484, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-jurisprudence|5": { + "hashes": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "69573f1675e053c6", + "hash_cont_tokens": "581986691a84ece8" + }, + "truncated": 0, + "non_truncated": 108, + 
"padded": 432, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hashes": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "552324ef20094bdc", + "hash_cont_tokens": "55a858b28bbda458" + }, + "truncated": 0, + "non_truncated": 163, + "padded": 652, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-machine_learning|5": { + "hashes": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "96449357a7318905", + "hash_cont_tokens": "e99d3d3efd4ac7a3" + }, + "truncated": 0, + "non_truncated": 112, + "padded": 448, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-management|5": { + "hashes": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "3b849249168e3b88", + "hash_cont_tokens": "13d9dc56bca34726" + }, + "truncated": 0, + "non_truncated": 103, + "padded": 412, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-marketing|5": { + "hashes": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "af0e186f2756b70d", + "hash_cont_tokens": "2700ea26933916a2" + }, + "truncated": 0, + "non_truncated": 234, + "padded": 936, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-medical_genetics|5": { + "hashes": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "9f6a6de16509b6d9", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-miscellaneous|5": { + "hashes": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "9194406d589f7c10", + "hash_cont_tokens": "7bf4341c79587250" + }, + "truncated": 0, + "non_truncated": 783, + "padded": 3132, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_disputes|5": { + "hashes": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "769486efc74d9f8e", + "hash_cont_tokens": "38a48e9de6976f00" + }, + "truncated": 0, + "non_truncated": 346, + "padded": 1384, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hashes": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "a90fd4dd90959dad", + "hash_cont_tokens": "761c4dc187689d89" + }, + "truncated": 0, + "non_truncated": 895, + "padded": 3580, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-nutrition|5": { + "hashes": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "1a3b843e66efd29b", + "hash_cont_tokens": "65005bd7d6f6012a" + }, + "truncated": 0, + "non_truncated": 306, + "padded": 1224, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-philosophy|5": { + "hashes": { + "hash_examples": 
"9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "09820001a3d00013", + "hash_cont_tokens": "0b47934fb6314dec" + }, + "truncated": 0, + "non_truncated": 311, + "padded": 1244, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-prehistory|5": { + "hashes": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "7c4ec364ce2768c7", + "hash_cont_tokens": "3f20acd855ee0a29" + }, + "truncated": 0, + "non_truncated": 324, + "padded": 1296, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_accounting|5": { + "hashes": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "ced0534574d0ae3f", + "hash_cont_tokens": "8f122ba881355d4b" + }, + "truncated": 0, + "non_truncated": 282, + "padded": 1128, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_law|5": { + "hashes": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "bcbdbbde22ec73e3", + "hash_cont_tokens": "90d5df417c4d3fd3" + }, + "truncated": 0, + "non_truncated": 1534, + "padded": 6136, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_medicine|5": { + "hashes": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "c54d753563114d45", + "hash_cont_tokens": "4a2d2988884f7f70" + }, + "truncated": 0, + "non_truncated": 272, + "padded": 1088, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_psychology|5": { + "hashes": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "b75dc55c0e32fa52", + "hash_cont_tokens": "e0a952cb8a9c81de" + }, + "truncated": 0, + "non_truncated": 612, + "padded": 2448, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-public_relations|5": { + "hashes": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "5ccdc8ec8db99622", + "hash_cont_tokens": "1fa77a8dff3922b8" + }, + "truncated": 0, + "non_truncated": 110, + "padded": 440, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-security_studies|5": { + "hashes": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "ca8497342e5b1d57", + "hash_cont_tokens": "81fc9cb3cbdd52db" + }, + "truncated": 0, + "non_truncated": 245, + "padded": 980, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-sociology|5": { + "hashes": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "069c76424fbd3dab", + "hash_cont_tokens": "2a0493252ed2cf43" + }, + "truncated": 0, + "non_truncated": 201, + "padded": 804, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hashes": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "a7e393a626169576", + "hash_cont_tokens": "17b868b63507f9a3" 
+ }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-virology|5": { + "hashes": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "bf99dc973e3a650d", + "hash_cont_tokens": "5ab892d003b00c98" + }, + "truncated": 0, + "non_truncated": 166, + "padded": 664, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-world_religions|5": { + "hashes": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "1761cfaf21797065", + "hash_cont_tokens": "15a5e5dbdfbb8568" + }, + "truncated": 0, + "non_truncated": 171, + "padded": 684, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|truthfulqa:mc|0": { + "hashes": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "298b43914bbdf4ca", + "hash_cont_tokens": "5a8d4bb398b1c3c0" + }, + "truncated": 0, + "non_truncated": 817, + "padded": 9996, + "non_padded": 0, + "effective_few_shots": 0.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "31aa3477d959f771", + "hash_cont_tokens": "618558fb93c0f288" + }, + "truncated": 0, + "non_truncated": 1267, + "padded": 2534, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|drop|3": { + "hashes": { + "hash_examples": "1d27416e8324e9a3", + "hash_full_prompts": "a5513ff9a741b385", + "hash_input_tokens": "a4fb946366902edf", + "hash_cont_tokens": "bc1ff0975c932cfb" + }, + "truncated": 0, + "non_truncated": 9536, + "padded": 0, + "non_padded": 9536, + "effective_few_shots": 3.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "6af0ae8cfe684f50", + "hash_cont_tokens": "217a622522e97037" + }, + "truncated": 0, + "non_truncated": 1319, + "padded": 0, + "non_padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "4eb459f19fc0f29d", + "hash_full_prompts": "21653ed56f202b4e", + "hash_input_tokens": "0ce409b3d436569d", + "hash_cont_tokens": "70a5c758fab6f4c0" + }, + "truncated": 0, + "non_truncated": 38195, + "padded": 113460, + "non_padded": 10948, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/uukuguy/Mistral-7B-OpenOrca-lora/results_2023-11-13T15-44-18.785582.json b/eval-results/uukuguy/Mistral-7B-OpenOrca-lora/results_2023-11-13T15-44-18.785582.json new file mode 100644 index 0000000000000000000000000000000000000000..8e67fc9b79bfbe05f99e5938bfb6b02a4dc3550d --- /dev/null +++ b/eval-results/uukuguy/Mistral-7B-OpenOrca-lora/results_2023-11-13T15-44-18.785582.json @@ -0,0 +1,1435 @@ +{ + "config_general": { + "lighteval_sha": "9ffc410f6c40b8cfefe7167cb47aefe69ced61e1", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "", + "start_time": 472854.4414539, + "end_time": 487761.83440561, + "total_evaluation_time_secondes": "14907.392951709975", + "model_name": "uukuguy/Mistral-7B-OpenOrca-lora", + "model_sha": "605dc043063cb9589c06883d839122920ed1eca5", + 
"model_dtype": "torch.bfloat16", + "model_size": "13.99 GB" + }, + "results": { + "harness|arc:challenge|25": { + "acc": 0.5742320819112628, + "acc_stderr": 0.014449464278868807, + "acc_norm": 0.6194539249146758, + "acc_norm_stderr": 0.014188277712349814 + }, + "harness|hellaswag|10": { + "acc": 0.6357299342760406, + "acc_stderr": 0.004802413919932666, + "acc_norm": 0.8361880103565027, + "acc_norm_stderr": 0.003693484894179416 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.29, + "acc_stderr": 0.045604802157206845, + "acc_norm": 0.29, + "acc_norm_stderr": 0.045604802157206845 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.6444444444444445, + "acc_stderr": 0.04135176749720385, + "acc_norm": 0.6444444444444445, + "acc_norm_stderr": 0.04135176749720385 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.6644736842105263, + "acc_stderr": 0.03842498559395268, + "acc_norm": 0.6644736842105263, + "acc_norm_stderr": 0.03842498559395268 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.58, + "acc_stderr": 0.049604496374885836, + "acc_norm": 0.58, + "acc_norm_stderr": 0.049604496374885836 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.6792452830188679, + "acc_stderr": 0.028727502957880267, + "acc_norm": 0.6792452830188679, + "acc_norm_stderr": 0.028727502957880267 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.7361111111111112, + "acc_stderr": 0.03685651095897532, + "acc_norm": 0.7361111111111112, + "acc_norm_stderr": 0.03685651095897532 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.49, + "acc_stderr": 0.05024183937956911, + "acc_norm": 0.49, + "acc_norm_stderr": 0.05024183937956911 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.52, + "acc_stderr": 0.050211673156867795, + "acc_norm": 0.52, + "acc_norm_stderr": 0.050211673156867795 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.35, + "acc_stderr": 0.047937248544110196, + "acc_norm": 0.35, + "acc_norm_stderr": 0.047937248544110196 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.6358381502890174, + "acc_stderr": 0.03669072477416907, + "acc_norm": 0.6358381502890174, + "acc_norm_stderr": 0.03669072477416907 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.38235294117647056, + "acc_stderr": 0.04835503696107223, + "acc_norm": 0.38235294117647056, + "acc_norm_stderr": 0.04835503696107223 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.79, + "acc_stderr": 0.04093601807403326, + "acc_norm": 0.79, + "acc_norm_stderr": 0.04093601807403326 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.5787234042553191, + "acc_stderr": 0.03227834510146268, + "acc_norm": 0.5787234042553191, + "acc_norm_stderr": 0.03227834510146268 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.5087719298245614, + "acc_stderr": 0.04702880432049615, + "acc_norm": 0.5087719298245614, + "acc_norm_stderr": 0.04702880432049615 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.5793103448275863, + "acc_stderr": 0.0411391498118926, + "acc_norm": 0.5793103448275863, + "acc_norm_stderr": 0.0411391498118926 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.373015873015873, + "acc_stderr": 0.02490699045899257, + "acc_norm": 0.373015873015873, + "acc_norm_stderr": 0.02490699045899257 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.40476190476190477, + "acc_stderr": 0.04390259265377562, + "acc_norm": 0.40476190476190477, + "acc_norm_stderr": 
0.04390259265377562 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.36, + "acc_stderr": 0.04824181513244218, + "acc_norm": 0.36, + "acc_norm_stderr": 0.04824181513244218 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.7580645161290323, + "acc_stderr": 0.024362599693031096, + "acc_norm": 0.7580645161290323, + "acc_norm_stderr": 0.024362599693031096 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.5270935960591133, + "acc_stderr": 0.03512819077876106, + "acc_norm": 0.5270935960591133, + "acc_norm_stderr": 0.03512819077876106 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.7, + "acc_stderr": 0.046056618647183814, + "acc_norm": 0.7, + "acc_norm_stderr": 0.046056618647183814 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.7757575757575758, + "acc_stderr": 0.03256866661681102, + "acc_norm": 0.7757575757575758, + "acc_norm_stderr": 0.03256866661681102 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.7626262626262627, + "acc_stderr": 0.0303137105381989, + "acc_norm": 0.7626262626262627, + "acc_norm_stderr": 0.0303137105381989 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.8860103626943006, + "acc_stderr": 0.022935144053919443, + "acc_norm": 0.8860103626943006, + "acc_norm_stderr": 0.022935144053919443 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.6666666666666666, + "acc_stderr": 0.023901157979402534, + "acc_norm": 0.6666666666666666, + "acc_norm_stderr": 0.023901157979402534 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.34814814814814815, + "acc_stderr": 0.029045600290616258, + "acc_norm": 0.34814814814814815, + "acc_norm_stderr": 0.029045600290616258 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.6638655462184874, + "acc_stderr": 0.03068473711513536, + "acc_norm": 0.6638655462184874, + "acc_norm_stderr": 0.03068473711513536 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.3443708609271523, + "acc_stderr": 0.038796870240733264, + "acc_norm": 0.3443708609271523, + "acc_norm_stderr": 0.038796870240733264 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.8220183486238533, + "acc_stderr": 0.016399436366612927, + "acc_norm": 0.8220183486238533, + "acc_norm_stderr": 0.016399436366612927 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.5509259259259259, + "acc_stderr": 0.033922384053216174, + "acc_norm": 0.5509259259259259, + "acc_norm_stderr": 0.033922384053216174 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.803921568627451, + "acc_stderr": 0.027865942286639318, + "acc_norm": 0.803921568627451, + "acc_norm_stderr": 0.027865942286639318 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.7637130801687764, + "acc_stderr": 0.027652153144159263, + "acc_norm": 0.7637130801687764, + "acc_norm_stderr": 0.027652153144159263 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.695067264573991, + "acc_stderr": 0.030898610882477515, + "acc_norm": 0.695067264573991, + "acc_norm_stderr": 0.030898610882477515 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.7862595419847328, + "acc_stderr": 0.0359546161177469, + "acc_norm": 0.7862595419847328, + "acc_norm_stderr": 0.0359546161177469 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.7933884297520661, + "acc_stderr": 0.03695980128098824, + "acc_norm": 0.7933884297520661, + "acc_norm_stderr": 
0.03695980128098824 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.7777777777777778, + "acc_stderr": 0.040191074725573483, + "acc_norm": 0.7777777777777778, + "acc_norm_stderr": 0.040191074725573483 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.7975460122699386, + "acc_stderr": 0.03157065078911901, + "acc_norm": 0.7975460122699386, + "acc_norm_stderr": 0.03157065078911901 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.49107142857142855, + "acc_stderr": 0.04745033255489123, + "acc_norm": 0.49107142857142855, + "acc_norm_stderr": 0.04745033255489123 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.8058252427184466, + "acc_stderr": 0.03916667762822585, + "acc_norm": 0.8058252427184466, + "acc_norm_stderr": 0.03916667762822585 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.8888888888888888, + "acc_stderr": 0.020588491316092375, + "acc_norm": 0.8888888888888888, + "acc_norm_stderr": 0.020588491316092375 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.76, + "acc_stderr": 0.042923469599092816, + "acc_norm": 0.76, + "acc_norm_stderr": 0.042923469599092816 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.8109833971902938, + "acc_stderr": 0.014000791294407003, + "acc_norm": 0.8109833971902938, + "acc_norm_stderr": 0.014000791294407003 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.7196531791907514, + "acc_stderr": 0.02418242749657761, + "acc_norm": 0.7196531791907514, + "acc_norm_stderr": 0.02418242749657761 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.3318435754189944, + "acc_stderr": 0.015748421208187306, + "acc_norm": 0.3318435754189944, + "acc_norm_stderr": 0.015748421208187306 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.7516339869281046, + "acc_stderr": 0.02473998135511359, + "acc_norm": 0.7516339869281046, + "acc_norm_stderr": 0.02473998135511359 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.707395498392283, + "acc_stderr": 0.025839898334877983, + "acc_norm": 0.707395498392283, + "acc_norm_stderr": 0.025839898334877983 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.7283950617283951, + "acc_stderr": 0.024748624490537375, + "acc_norm": 0.7283950617283951, + "acc_norm_stderr": 0.024748624490537375 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.4858156028368794, + "acc_stderr": 0.02981549448368206, + "acc_norm": 0.4858156028368794, + "acc_norm_stderr": 0.02981549448368206 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.45241199478487615, + "acc_stderr": 0.012712265105889133, + "acc_norm": 0.45241199478487615, + "acc_norm_stderr": 0.012712265105889133 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.6764705882352942, + "acc_stderr": 0.028418208619406755, + "acc_norm": 0.6764705882352942, + "acc_norm_stderr": 0.028418208619406755 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.6781045751633987, + "acc_stderr": 0.018901015322093085, + "acc_norm": 0.6781045751633987, + "acc_norm_stderr": 0.018901015322093085 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.6636363636363637, + "acc_stderr": 0.04525393596302506, + "acc_norm": 0.6636363636363637, + "acc_norm_stderr": 0.04525393596302506 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.7346938775510204, + "acc_stderr": 0.028263889943784593, + "acc_norm": 0.7346938775510204, + "acc_norm_stderr": 0.028263889943784593 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 
0.8407960199004975, + "acc_stderr": 0.02587064676616913, + "acc_norm": 0.8407960199004975, + "acc_norm_stderr": 0.02587064676616913 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.87, + "acc_stderr": 0.033799766898963086, + "acc_norm": 0.87, + "acc_norm_stderr": 0.033799766898963086 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.5542168674698795, + "acc_stderr": 0.038695433234721015, + "acc_norm": 0.5542168674698795, + "acc_norm_stderr": 0.038695433234721015 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.8304093567251462, + "acc_stderr": 0.02878210810540171, + "acc_norm": 0.8304093567251462, + "acc_norm_stderr": 0.02878210810540171 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.2839657282741738, + "mc1_stderr": 0.015785370858396725, + "mc2": 0.4274271734982197, + "mc2_stderr": 0.014247308828610854 + }, + "harness|winogrande|5": { + "acc": 0.7908445146014207, + "acc_stderr": 0.011430450045881575 + }, + "harness|drop|3": { + "em": 0.0019924496644295304, + "em_stderr": 0.00045666764626669387, + "f1": 0.06191694630872485, + "f1_stderr": 0.0013823026381279647 + }, + "harness|gsm8k|5": { + "acc": 0.1728582259287339, + "acc_stderr": 0.010415432246200585 + }, + "all": { + "acc": 0.6351832920969729, + "acc_stderr": 0.03210898212657927, + "acc_norm": 0.6445450507876114, + "acc_norm_stderr": 0.03280393070910138, + "mc1": 0.2839657282741738, + "mc1_stderr": 0.015785370858396725, + "mc2": 0.4274271734982197, + "mc2_stderr": 0.014247308828610854, + "em": 0.0019924496644295304, + "em_stderr": 0.00045666764626669387, + "f1": 0.06191694630872485, + "f1_stderr": 0.0013823026381279647 + } + }, + "versions": { + "all": 0, + "harness|arc:challenge|25": 0, + "harness|drop|3": 1, + "harness|gsm8k|5": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + 
"harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "harness|winogrande|5": 0 + }, + "config_tasks": { + "harness|arc:challenge": "LM Harness task", + "harness|drop": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + 
"harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|arc:challenge|25": { + "hashes": { + "hash_examples": "17b0cae357c0259e", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "9bcd0d1d37471713", + "hash_cont_tokens": "289aa98c400841d8" + }, + "truncated": 0, + "non_truncated": 1172, + "padded": 4670, + "non_padded": 17, + "effective_few_shots": 25.0, + "num_truncated_few_shots": 0 + }, + "harness|hellaswag|10": { + "hashes": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "80b8c6d79740318e", + "hash_cont_tokens": "ac460260c3e6efc9" + }, + "truncated": 0, + "non_truncated": 10042, + "padded": 40101, + "non_padded": 67, + "effective_few_shots": 10.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hashes": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "b813d36287c6556c", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-anatomy|5": { + "hashes": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "09dc2380497f7a47", + "hash_cont_tokens": "a52a4f60d98cbe5c" + }, + "truncated": 0, + "non_truncated": 135, + "padded": 540, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-astronomy|5": { + "hashes": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "68ca3220b0fdd1f3", + "hash_cont_tokens": "10f7d8eeba97841d" + }, + "truncated": 0, + "non_truncated": 152, + "padded": 608, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-business_ethics|5": { + "hashes": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": 
"db01c3ef8e1479d4", + "hash_input_tokens": "bd14ef1320de241e", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hashes": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "d96186ab98017c43", + "hash_cont_tokens": "edef9975ba9165b5" + }, + "truncated": 0, + "non_truncated": 265, + "padded": 1060, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_biology|5": { + "hashes": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "424136b34e95b200", + "hash_cont_tokens": "0aa103ec6602280b" + }, + "truncated": 0, + "non_truncated": 144, + "padded": 576, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_chemistry|5": { + "hashes": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "8dd8b80e336bbe54", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_computer_science|5": { + "hashes": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "145d4cef8ca2261d", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_mathematics|5": { + "hashes": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "561995d32d2b25c4", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_medicine|5": { + "hashes": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "6a258a9d4418599c", + "hash_cont_tokens": "1979021dbc698754" + }, + "truncated": 0, + "non_truncated": 173, + "padded": 692, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_physics|5": { + "hashes": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "fa5e0d5b5f97b66a", + "hash_cont_tokens": "7cf7fe2bab00acbd" + }, + "truncated": 0, + "non_truncated": 102, + "padded": 408, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-computer_security|5": { + "hashes": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "07d27397edfae492", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hashes": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "da5e6c3c8eb17da6", + "hash_cont_tokens": "903f64eed2b0d217" + }, + "truncated": 0, + "non_truncated": 
235, + "padded": 940, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-econometrics|5": { + "hashes": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "f6ba8e358bdb523e", + "hash_cont_tokens": "721ae6c5302c4bf2" + }, + "truncated": 0, + "non_truncated": 114, + "padded": 456, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hashes": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "b2459da4c5ca8590", + "hash_cont_tokens": "15a738960ed3e587" + }, + "truncated": 0, + "non_truncated": 145, + "padded": 575, + "non_padded": 5, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hashes": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "0b969d9ad706a13a", + "hash_cont_tokens": "c96470462fc71683" + }, + "truncated": 0, + "non_truncated": 378, + "padded": 1512, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-formal_logic|5": { + "hashes": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "02bc3eb5f90da86e", + "hash_cont_tokens": "0e1ce025c9d6ee7e" + }, + "truncated": 0, + "non_truncated": 126, + "padded": 504, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-global_facts|5": { + "hashes": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "3d5106918bcbeb43", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_biology|5": { + "hashes": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "7b089392db2dabbd", + "hash_cont_tokens": "e34d57f7d3c4ca16" + }, + "truncated": 0, + "non_truncated": 310, + "padded": 1240, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hashes": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "ba90b2ffed1c067d", + "hash_cont_tokens": "e8482d44df4b3740" + }, + "truncated": 0, + "non_truncated": 203, + "padded": 812, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hashes": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "60eeec309ef0717f", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hashes": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "5e5e8bf3808e0ead", + "hash_cont_tokens": "d63e679a49418339" + }, + "truncated": 0, + "non_truncated": 165, + "padded": 656, + "non_padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + 
"harness|hendrycksTest-high_school_geography|5": { + "hashes": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "4da9b741d4e7ea78", + "hash_cont_tokens": "d78483e286d06f1a" + }, + "truncated": 0, + "non_truncated": 198, + "padded": 792, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hashes": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "acb4bc872ac86ed7", + "hash_cont_tokens": "691cdff71ff5fe57" + }, + "truncated": 0, + "non_truncated": 193, + "padded": 772, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hashes": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "840fc6403eb69ab0", + "hash_cont_tokens": "d5ad4c5bdca967ad" + }, + "truncated": 0, + "non_truncated": 390, + "padded": 1560, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hashes": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "3629a7f2cd17faeb", + "hash_cont_tokens": "8f631ca5687dd0d4" + }, + "truncated": 0, + "non_truncated": 270, + "padded": 1080, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hashes": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "6846f684260e3997", + "hash_cont_tokens": "7321048a28451473" + }, + "truncated": 0, + "non_truncated": 238, + "padded": 952, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_physics|5": { + "hashes": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "85aee25d6bdad94a", + "hash_cont_tokens": "bb137581f269861c" + }, + "truncated": 0, + "non_truncated": 151, + "padded": 604, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hashes": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "290b66d6d666a35f", + "hash_cont_tokens": "b455cab2675bd863" + }, + "truncated": 0, + "non_truncated": 545, + "padded": 2180, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hashes": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "a77a7668b437bc82", + "hash_cont_tokens": "1b3196fec7e58037" + }, + "truncated": 0, + "non_truncated": 216, + "padded": 864, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hashes": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "63548c7fa9ba7a78", + "hash_cont_tokens": "a331dedc2aa01b3e" + }, + "truncated": 0, + "non_truncated": 204, + "padded": 816, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hashes": { + 
"hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "83c5da18bfa50812", + "hash_cont_tokens": "d0fbe030b8c8c2bf" + }, + "truncated": 0, + "non_truncated": 237, + "padded": 948, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_aging|5": { + "hashes": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "bebbd11f22006685", + "hash_cont_tokens": "1dd29c3755494850" + }, + "truncated": 0, + "non_truncated": 223, + "padded": 892, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_sexuality|5": { + "hashes": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "7b85ee9b8ee54f4f", + "hash_cont_tokens": "c85573f663c10691" + }, + "truncated": 0, + "non_truncated": 131, + "padded": 524, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-international_law|5": { + "hashes": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "7bfc55ab7065943e", + "hash_cont_tokens": "d263804ba918154f" + }, + "truncated": 0, + "non_truncated": 121, + "padded": 484, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-jurisprudence|5": { + "hashes": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "69573f1675e053c6", + "hash_cont_tokens": "581986691a84ece8" + }, + "truncated": 0, + "non_truncated": 108, + "padded": 432, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hashes": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "552324ef20094bdc", + "hash_cont_tokens": "55a858b28bbda458" + }, + "truncated": 0, + "non_truncated": 163, + "padded": 652, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-machine_learning|5": { + "hashes": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "96449357a7318905", + "hash_cont_tokens": "e99d3d3efd4ac7a3" + }, + "truncated": 0, + "non_truncated": 112, + "padded": 448, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-management|5": { + "hashes": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "3b849249168e3b88", + "hash_cont_tokens": "13d9dc56bca34726" + }, + "truncated": 0, + "non_truncated": 103, + "padded": 412, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-marketing|5": { + "hashes": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "af0e186f2756b70d", + "hash_cont_tokens": "2700ea26933916a2" + }, + "truncated": 0, + "non_truncated": 234, + "padded": 936, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-medical_genetics|5": { + "hashes": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "9f6a6de16509b6d9", + "hash_cont_tokens": "17b868b63507f9a3" + }, + 
"truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-miscellaneous|5": { + "hashes": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "9194406d589f7c10", + "hash_cont_tokens": "7bf4341c79587250" + }, + "truncated": 0, + "non_truncated": 783, + "padded": 3132, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_disputes|5": { + "hashes": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "769486efc74d9f8e", + "hash_cont_tokens": "38a48e9de6976f00" + }, + "truncated": 0, + "non_truncated": 346, + "padded": 1384, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hashes": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "a90fd4dd90959dad", + "hash_cont_tokens": "761c4dc187689d89" + }, + "truncated": 0, + "non_truncated": 895, + "padded": 3580, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-nutrition|5": { + "hashes": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "1a3b843e66efd29b", + "hash_cont_tokens": "65005bd7d6f6012a" + }, + "truncated": 0, + "non_truncated": 306, + "padded": 1224, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-philosophy|5": { + "hashes": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "09820001a3d00013", + "hash_cont_tokens": "0b47934fb6314dec" + }, + "truncated": 0, + "non_truncated": 311, + "padded": 1244, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-prehistory|5": { + "hashes": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "7c4ec364ce2768c7", + "hash_cont_tokens": "3f20acd855ee0a29" + }, + "truncated": 0, + "non_truncated": 324, + "padded": 1296, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_accounting|5": { + "hashes": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "ced0534574d0ae3f", + "hash_cont_tokens": "8f122ba881355d4b" + }, + "truncated": 0, + "non_truncated": 282, + "padded": 1128, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_law|5": { + "hashes": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "bcbdbbde22ec73e3", + "hash_cont_tokens": "90d5df417c4d3fd3" + }, + "truncated": 0, + "non_truncated": 1534, + "padded": 6136, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_medicine|5": { + "hashes": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "c54d753563114d45", + "hash_cont_tokens": "4a2d2988884f7f70" + }, + "truncated": 0, + "non_truncated": 272, + "padded": 1088, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + 
"harness|hendrycksTest-professional_psychology|5": { + "hashes": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "b75dc55c0e32fa52", + "hash_cont_tokens": "e0a952cb8a9c81de" + }, + "truncated": 0, + "non_truncated": 612, + "padded": 2448, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-public_relations|5": { + "hashes": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "5ccdc8ec8db99622", + "hash_cont_tokens": "1fa77a8dff3922b8" + }, + "truncated": 0, + "non_truncated": 110, + "padded": 440, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-security_studies|5": { + "hashes": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "ca8497342e5b1d57", + "hash_cont_tokens": "81fc9cb3cbdd52db" + }, + "truncated": 0, + "non_truncated": 245, + "padded": 980, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-sociology|5": { + "hashes": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "069c76424fbd3dab", + "hash_cont_tokens": "2a0493252ed2cf43" + }, + "truncated": 0, + "non_truncated": 201, + "padded": 804, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hashes": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "a7e393a626169576", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-virology|5": { + "hashes": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "bf99dc973e3a650d", + "hash_cont_tokens": "5ab892d003b00c98" + }, + "truncated": 0, + "non_truncated": 166, + "padded": 664, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-world_religions|5": { + "hashes": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "1761cfaf21797065", + "hash_cont_tokens": "15a5e5dbdfbb8568" + }, + "truncated": 0, + "non_truncated": 171, + "padded": 684, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|truthfulqa:mc|0": { + "hashes": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "298b43914bbdf4ca", + "hash_cont_tokens": "5a8d4bb398b1c3c0" + }, + "truncated": 0, + "non_truncated": 817, + "padded": 9996, + "non_padded": 0, + "effective_few_shots": 0.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "31aa3477d959f771", + "hash_cont_tokens": "618558fb93c0f288" + }, + "truncated": 0, + "non_truncated": 1267, + "padded": 2534, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|drop|3": { + "hashes": { + "hash_examples": "1d27416e8324e9a3", + "hash_full_prompts": "a5513ff9a741b385", + "hash_input_tokens": "a4fb946366902edf", + "hash_cont_tokens": "bbee90be9916952c" + 
}, + "truncated": 0, + "non_truncated": 9536, + "padded": 0, + "non_padded": 9536, + "effective_few_shots": 3.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "6af0ae8cfe684f50", + "hash_cont_tokens": "9d73f67ca9e73658" + }, + "truncated": 0, + "non_truncated": 1319, + "padded": 0, + "non_padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "4eb459f19fc0f29d", + "hash_full_prompts": "21653ed56f202b4e", + "hash_input_tokens": "0ce409b3d436569d", + "hash_cont_tokens": "325fced41990b120" + }, + "truncated": 0, + "non_truncated": 38195, + "padded": 113460, + "non_padded": 10948, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/uukuguy/Orca-2-13b-f16/results_2023-12-04T16-43-12.398370.json b/eval-results/uukuguy/Orca-2-13b-f16/results_2023-12-04T16-43-12.398370.json new file mode 100644 index 0000000000000000000000000000000000000000..2dd2c048e94a99ef1c74be9a9f6dc2a2bb9ca101 --- /dev/null +++ b/eval-results/uukuguy/Orca-2-13b-f16/results_2023-12-04T16-43-12.398370.json @@ -0,0 +1,1409 @@ +{ + "config_general": { + "lighteval_sha": "0e4607eff593f6f842aeaa0e5fa6760f58b9d1e9", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "", + "start_time": 153262.371508829, + "end_time": 163069.337780726, + "total_evaluation_time_secondes": "9806.966271896992", + "model_name": "uukuguy/Orca-2-13b-f16", + "model_sha": "b29c52ea0757c460e83592e55ea89e016cef3549", + "model_dtype": "torch.float16", + "model_size": "24.32 GB" + }, + "results": { + "harness|arc:challenge|25": { + "acc": 0.5733788395904437, + "acc_stderr": 0.014453185592920293, + "acc_norm": 0.606655290102389, + "acc_norm_stderr": 0.014275101465693024 + }, + "harness|hellaswag|10": { + "acc": 0.6115315674168492, + "acc_stderr": 0.004864058877626273, + "acc_norm": 0.7981477793268273, + "acc_norm_stderr": 0.004005621755121483 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.34, + "acc_stderr": 0.04760952285695236, + "acc_norm": 0.34, + "acc_norm_stderr": 0.04760952285695236 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.6074074074074074, + "acc_stderr": 0.04218506215368879, + "acc_norm": 0.6074074074074074, + "acc_norm_stderr": 0.04218506215368879 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.743421052631579, + "acc_stderr": 0.03554180368025689, + "acc_norm": 0.743421052631579, + "acc_norm_stderr": 0.03554180368025689 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.71, + "acc_stderr": 0.045604802157206845, + "acc_norm": 0.71, + "acc_norm_stderr": 0.045604802157206845 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.6264150943396226, + "acc_stderr": 0.029773082713319875, + "acc_norm": 0.6264150943396226, + "acc_norm_stderr": 0.029773082713319875 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.6875, + "acc_stderr": 0.038760854559127644, + "acc_norm": 0.6875, + "acc_norm_stderr": 0.038760854559127644 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.44, + "acc_stderr": 0.04988876515698589, + "acc_norm": 0.44, + "acc_norm_stderr": 0.04988876515698589 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.47, + "acc_stderr": 0.050161355804659205, + "acc_norm": 0.47, + "acc_norm_stderr": 0.050161355804659205 + }, + 
"harness|hendrycksTest-college_mathematics|5": { + "acc": 0.37, + "acc_stderr": 0.04852365870939099, + "acc_norm": 0.37, + "acc_norm_stderr": 0.04852365870939099 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.5549132947976878, + "acc_stderr": 0.03789401760283648, + "acc_norm": 0.5549132947976878, + "acc_norm_stderr": 0.03789401760283648 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.3333333333333333, + "acc_stderr": 0.04690650298201943, + "acc_norm": 0.3333333333333333, + "acc_norm_stderr": 0.04690650298201943 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.69, + "acc_stderr": 0.04648231987117316, + "acc_norm": 0.69, + "acc_norm_stderr": 0.04648231987117316 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.5617021276595745, + "acc_stderr": 0.032436186361081004, + "acc_norm": 0.5617021276595745, + "acc_norm_stderr": 0.032436186361081004 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.2807017543859649, + "acc_stderr": 0.042270544512322, + "acc_norm": 0.2807017543859649, + "acc_norm_stderr": 0.042270544512322 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.5517241379310345, + "acc_stderr": 0.04144311810878152, + "acc_norm": 0.5517241379310345, + "acc_norm_stderr": 0.04144311810878152 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.37566137566137564, + "acc_stderr": 0.02494236893115979, + "acc_norm": 0.37566137566137564, + "acc_norm_stderr": 0.02494236893115979 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.36507936507936506, + "acc_stderr": 0.04306241259127153, + "acc_norm": 0.36507936507936506, + "acc_norm_stderr": 0.04306241259127153 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.39, + "acc_stderr": 0.04902071300001975, + "acc_norm": 0.39, + "acc_norm_stderr": 0.04902071300001975 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.7387096774193549, + "acc_stderr": 0.02499305339776481, + "acc_norm": 0.7387096774193549, + "acc_norm_stderr": 0.02499305339776481 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.4729064039408867, + "acc_stderr": 0.03512819077876106, + "acc_norm": 0.4729064039408867, + "acc_norm_stderr": 0.03512819077876106 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.64, + "acc_stderr": 0.048241815132442176, + "acc_norm": 0.64, + "acc_norm_stderr": 0.048241815132442176 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.7272727272727273, + "acc_stderr": 0.03477691162163659, + "acc_norm": 0.7272727272727273, + "acc_norm_stderr": 0.03477691162163659 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.7373737373737373, + "acc_stderr": 0.03135305009533086, + "acc_norm": 0.7373737373737373, + "acc_norm_stderr": 0.03135305009533086 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.8341968911917098, + "acc_stderr": 0.026839845022314415, + "acc_norm": 0.8341968911917098, + "acc_norm_stderr": 0.026839845022314415 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.5974358974358974, + "acc_stderr": 0.024864995159767762, + "acc_norm": 0.5974358974358974, + "acc_norm_stderr": 0.024864995159767762 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.3111111111111111, + "acc_stderr": 0.028226446749683515, + "acc_norm": 0.3111111111111111, + "acc_norm_stderr": 0.028226446749683515 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.634453781512605, + 
"acc_stderr": 0.031282177063684614, + "acc_norm": 0.634453781512605, + "acc_norm_stderr": 0.031282177063684614 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.33774834437086093, + "acc_stderr": 0.038615575462551684, + "acc_norm": 0.33774834437086093, + "acc_norm_stderr": 0.038615575462551684 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.8128440366972477, + "acc_stderr": 0.016722684526200144, + "acc_norm": 0.8128440366972477, + "acc_norm_stderr": 0.016722684526200144 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.4722222222222222, + "acc_stderr": 0.0340470532865388, + "acc_norm": 0.4722222222222222, + "acc_norm_stderr": 0.0340470532865388 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.803921568627451, + "acc_stderr": 0.027865942286639325, + "acc_norm": 0.803921568627451, + "acc_norm_stderr": 0.027865942286639325 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.8143459915611815, + "acc_stderr": 0.025310495376944856, + "acc_norm": 0.8143459915611815, + "acc_norm_stderr": 0.025310495376944856 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.6681614349775785, + "acc_stderr": 0.031602951437766785, + "acc_norm": 0.6681614349775785, + "acc_norm_stderr": 0.031602951437766785 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.7175572519083969, + "acc_stderr": 0.03948406125768361, + "acc_norm": 0.7175572519083969, + "acc_norm_stderr": 0.03948406125768361 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.7851239669421488, + "acc_stderr": 0.03749492448709697, + "acc_norm": 0.7851239669421488, + "acc_norm_stderr": 0.03749492448709697 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.7962962962962963, + "acc_stderr": 0.03893542518824847, + "acc_norm": 0.7962962962962963, + "acc_norm_stderr": 0.03893542518824847 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.7239263803680982, + "acc_stderr": 0.035123852837050475, + "acc_norm": 0.7239263803680982, + "acc_norm_stderr": 0.035123852837050475 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.3482142857142857, + "acc_stderr": 0.045218299028335865, + "acc_norm": 0.3482142857142857, + "acc_norm_stderr": 0.045218299028335865 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.7572815533980582, + "acc_stderr": 0.04245022486384493, + "acc_norm": 0.7572815533980582, + "acc_norm_stderr": 0.04245022486384493 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.8632478632478633, + "acc_stderr": 0.022509033937077802, + "acc_norm": 0.8632478632478633, + "acc_norm_stderr": 0.022509033937077802 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.61, + "acc_stderr": 0.04902071300001975, + "acc_norm": 0.61, + "acc_norm_stderr": 0.04902071300001975 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.776500638569604, + "acc_stderr": 0.01489723522945071, + "acc_norm": 0.776500638569604, + "acc_norm_stderr": 0.01489723522945071 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.6791907514450867, + "acc_stderr": 0.025131000233647897, + "acc_norm": 0.6791907514450867, + "acc_norm_stderr": 0.025131000233647897 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.3039106145251397, + "acc_stderr": 0.015382845587584518, + "acc_norm": 0.3039106145251397, + "acc_norm_stderr": 0.015382845587584518 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.6633986928104575, + "acc_stderr": 0.027057974624494382, + "acc_norm": 0.6633986928104575, + 
"acc_norm_stderr": 0.027057974624494382 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.6881028938906752, + "acc_stderr": 0.02631185807185416, + "acc_norm": 0.6881028938906752, + "acc_norm_stderr": 0.02631185807185416 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.7067901234567902, + "acc_stderr": 0.025329888171900933, + "acc_norm": 0.7067901234567902, + "acc_norm_stderr": 0.025329888171900933 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.45390070921985815, + "acc_stderr": 0.029700453247291484, + "acc_norm": 0.45390070921985815, + "acc_norm_stderr": 0.029700453247291484 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.4361147327249022, + "acc_stderr": 0.012665568135455335, + "acc_norm": 0.4361147327249022, + "acc_norm_stderr": 0.012665568135455335 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.5772058823529411, + "acc_stderr": 0.030008562845003476, + "acc_norm": 0.5772058823529411, + "acc_norm_stderr": 0.030008562845003476 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.6045751633986928, + "acc_stderr": 0.019780465954777508, + "acc_norm": 0.6045751633986928, + "acc_norm_stderr": 0.019780465954777508 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.6363636363636364, + "acc_stderr": 0.04607582090719976, + "acc_norm": 0.6363636363636364, + "acc_norm_stderr": 0.04607582090719976 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.7224489795918367, + "acc_stderr": 0.02866685779027465, + "acc_norm": 0.7224489795918367, + "acc_norm_stderr": 0.02866685779027465 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.7313432835820896, + "acc_stderr": 0.03134328358208954, + "acc_norm": 0.7313432835820896, + "acc_norm_stderr": 0.03134328358208954 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.81, + "acc_stderr": 0.03942772444036624, + "acc_norm": 0.81, + "acc_norm_stderr": 0.03942772444036624 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.5180722891566265, + "acc_stderr": 0.03889951252827216, + "acc_norm": 0.5180722891566265, + "acc_norm_stderr": 0.03889951252827216 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.8011695906432749, + "acc_stderr": 0.030611116557432528, + "acc_norm": 0.8011695906432749, + "acc_norm_stderr": 0.030611116557432528 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.401468788249694, + "mc1_stderr": 0.017160273901693654, + "mc2": 0.5641081747684346, + "mc2_stderr": 0.015927666604862285 + }, + "harness|winogrande|5": { + "acc": 0.7663772691397001, + "acc_stderr": 0.011892194477183525 + }, + "harness|gsm8k|5": { + "acc": 0.38968915845337376, + "acc_stderr": 0.013433123236110692 + }, + "all": { + "acc": 0.6024963144778468, + "acc_stderr": 0.03292700891541927, + "acc_norm": 0.6070525664063983, + "acc_norm_stderr": 0.03359636787928049, + "mc1": 0.401468788249694, + "mc1_stderr": 0.017160273901693654, + "mc2": 0.5641081747684346, + "mc2_stderr": 0.015927666604862285 + } + }, + "versions": { + "all": 0, + "harness|arc:challenge|25": 0, + "harness|gsm8k|5": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 
1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "harness|winogrande|5": 0 + }, + "config_tasks": { + "harness|arc:challenge": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + 
"harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|arc:challenge|25": { + "hashes": { + "hash_examples": "17b0cae357c0259e", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "ca48d52265c0051f", + "hash_cont_tokens": "e8abf848493b50f7" + }, + "truncated": 0, + "non_truncated": 1172, + "padded": 4687, + "non_padded": 0, + "effective_few_shots": 25.0, + "num_truncated_few_shots": 0 + }, + "harness|hellaswag|10": { + "hashes": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "4975ded0ed31f702", + "hash_cont_tokens": "9fe0a5c42e1532db" + }, + "truncated": 0, + "non_truncated": 10042, + "padded": 40019, + "non_padded": 
149, + "effective_few_shots": 10.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hashes": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "8ff523ec326d5d55", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-anatomy|5": { + "hashes": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "742bd6a389a8ef40", + "hash_cont_tokens": "f11971a765cb609f" + }, + "truncated": 0, + "non_truncated": 135, + "padded": 540, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-astronomy|5": { + "hashes": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "aa9743839c83bd9f", + "hash_cont_tokens": "440a970fadecdc7b" + }, + "truncated": 0, + "non_truncated": 152, + "padded": 608, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-business_ethics|5": { + "hashes": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "60f6ed52e2a2987a", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hashes": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "6080d9f3c5930be0", + "hash_cont_tokens": "7ecd60c25b9bfe5b" + }, + "truncated": 0, + "non_truncated": 265, + "padded": 1060, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_biology|5": { + "hashes": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "873319724ad65589", + "hash_cont_tokens": "875cde3af7a0ee14" + }, + "truncated": 0, + "non_truncated": 144, + "padded": 564, + "non_padded": 12, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_chemistry|5": { + "hashes": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "8366d04d12b154a7", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_computer_science|5": { + "hashes": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "1724a282fb269fd7", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_mathematics|5": { + "hashes": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "b7aa815781eae172", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_medicine|5": { + "hashes": { + "hash_examples": 
"dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "0003d13e86bc8c1a", + "hash_cont_tokens": "702fb6d82ff0d6ac" + }, + "truncated": 0, + "non_truncated": 173, + "padded": 692, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_physics|5": { + "hashes": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "32b28762dd077c78", + "hash_cont_tokens": "f7b8097afc16a47c" + }, + "truncated": 0, + "non_truncated": 102, + "padded": 404, + "non_padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-computer_security|5": { + "hashes": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "19dd0e1895125d49", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hashes": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "761c7ce187b3338a", + "hash_cont_tokens": "aa0e8bc655f2f641" + }, + "truncated": 0, + "non_truncated": 235, + "padded": 940, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-econometrics|5": { + "hashes": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "dae74024ebc12b2b", + "hash_cont_tokens": "b1cc6e7e9fcd3827" + }, + "truncated": 0, + "non_truncated": 114, + "padded": 456, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hashes": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "5fa8050688a246ed", + "hash_cont_tokens": "2425a3f084a591ef" + }, + "truncated": 0, + "non_truncated": 145, + "padded": 580, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hashes": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "2da3f8d7d1515cc6", + "hash_cont_tokens": "bd87bf0c060fd925" + }, + "truncated": 0, + "non_truncated": 378, + "padded": 1512, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-formal_logic|5": { + "hashes": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "907de61bbe46dada", + "hash_cont_tokens": "eb8932890e0605db" + }, + "truncated": 0, + "non_truncated": 126, + "padded": 504, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-global_facts|5": { + "hashes": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "d7549fe9ac133643", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_biology|5": { + "hashes": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "b449ae8cd622fb96", + "hash_cont_tokens": "1ddcb86d28cde266" + }, + 
"truncated": 0, + "non_truncated": 310, + "padded": 1240, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hashes": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "a447bd1574b5e26c", + "hash_cont_tokens": "176c8dcff38c5f8f" + }, + "truncated": 0, + "non_truncated": 203, + "padded": 812, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hashes": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "56312a0c3d85ae90", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hashes": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "5002f4ac8b1562ca", + "hash_cont_tokens": "674fc454bdc5ac93" + }, + "truncated": 0, + "non_truncated": 165, + "padded": 656, + "non_padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_geography|5": { + "hashes": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "b4f9efd054b0149d", + "hash_cont_tokens": "03a5012b916274ea" + }, + "truncated": 0, + "non_truncated": 198, + "padded": 792, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hashes": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "6e010d01707b5a01", + "hash_cont_tokens": "873d2aab226ba1d8" + }, + "truncated": 0, + "non_truncated": 193, + "padded": 772, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hashes": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "fc1f6e824ba386d7", + "hash_cont_tokens": "c583432ad27fcfe0" + }, + "truncated": 0, + "non_truncated": 390, + "padded": 1560, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hashes": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "3a485a40c8432ece", + "hash_cont_tokens": "d7907b61bcb8c123" + }, + "truncated": 0, + "non_truncated": 270, + "padded": 1080, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hashes": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "a7dd9ca4bbda3752", + "hash_cont_tokens": "f47f041de50333b9" + }, + "truncated": 0, + "non_truncated": 238, + "padded": 952, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_physics|5": { + "hashes": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "d7ea631399a73865", + "hash_cont_tokens": "0d56317b3e5eedb5" + }, + "truncated": 0, + "non_truncated": 151, + "padded": 604, + "non_padded": 0, 
+ "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hashes": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "d12816cf88146011", + "hash_cont_tokens": "09ba1243e7390c0f" + }, + "truncated": 0, + "non_truncated": 545, + "padded": 2180, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hashes": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "9763ecaef4814c21", + "hash_cont_tokens": "9cc29889c3d3f77d" + }, + "truncated": 0, + "non_truncated": 216, + "padded": 864, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hashes": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "c639cce12a46ebad", + "hash_cont_tokens": "cdd0b3dc06d933e5" + }, + "truncated": 0, + "non_truncated": 204, + "padded": 816, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hashes": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "b9762065cce6f3a6", + "hash_cont_tokens": "e02816433ff28daf" + }, + "truncated": 0, + "non_truncated": 237, + "padded": 948, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_aging|5": { + "hashes": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "84157fee0b6d0f3c", + "hash_cont_tokens": "142a4a8a1138a214" + }, + "truncated": 0, + "non_truncated": 223, + "padded": 892, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_sexuality|5": { + "hashes": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "ade303e1ae3c016f", + "hash_cont_tokens": "bc54813e809b796d" + }, + "truncated": 0, + "non_truncated": 131, + "padded": 524, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-international_law|5": { + "hashes": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "e5482e1c23c23d35", + "hash_cont_tokens": "8ea8c5ff76a15bca" + }, + "truncated": 0, + "non_truncated": 121, + "padded": 484, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-jurisprudence|5": { + "hashes": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "4415eeb9bad0507b", + "hash_cont_tokens": "e3a8cd951b6e3469" + }, + "truncated": 0, + "non_truncated": 108, + "padded": 432, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hashes": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "e6b5271422ecbaa8", + "hash_cont_tokens": "3e9e0bdc248fd88a" + }, + "truncated": 0, + "non_truncated": 163, + "padded": 644, + "non_padded": 8, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-machine_learning|5": { + "hashes": { + 
"hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "e719cb83196977d8", + "hash_cont_tokens": "55b12fb138c6a064" + }, + "truncated": 0, + "non_truncated": 112, + "padded": 448, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-management|5": { + "hashes": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "155da0e62b39e804", + "hash_cont_tokens": "a01d6d39a83c4597" + }, + "truncated": 0, + "non_truncated": 103, + "padded": 412, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-marketing|5": { + "hashes": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "38466c242259e6d3", + "hash_cont_tokens": "6aeaed4d823c98aa" + }, + "truncated": 0, + "non_truncated": 234, + "padded": 932, + "non_padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-medical_genetics|5": { + "hashes": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "0dd129e92538a7f6", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-miscellaneous|5": { + "hashes": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "d108a883fc3e022f", + "hash_cont_tokens": "9b0ab02a64603081" + }, + "truncated": 0, + "non_truncated": 783, + "padded": 3132, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_disputes|5": { + "hashes": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "0e7b7df82884a2d5", + "hash_cont_tokens": "3b8bbe9108e55ce9" + }, + "truncated": 0, + "non_truncated": 346, + "padded": 1364, + "non_padded": 20, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hashes": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "7c220f5613cd8426", + "hash_cont_tokens": "3e9bfc0362e97330" + }, + "truncated": 0, + "non_truncated": 895, + "padded": 3580, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-nutrition|5": { + "hashes": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "35de1609a9a763a9", + "hash_cont_tokens": "23b2dc6ee2da4cfc" + }, + "truncated": 0, + "non_truncated": 306, + "padded": 1224, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-philosophy|5": { + "hashes": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "a1dcfa9c80490d06", + "hash_cont_tokens": "9f6ff69d23a48783" + }, + "truncated": 0, + "non_truncated": 311, + "padded": 1244, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-prehistory|5": { + "hashes": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "a091cf645d2415e0", + "hash_cont_tokens": "d6458d743d875837" + }, + "truncated": 0, + 
"non_truncated": 324, + "padded": 1296, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_accounting|5": { + "hashes": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "e9df32a33f85290c", + "hash_cont_tokens": "922a195f53a35662" + }, + "truncated": 0, + "non_truncated": 282, + "padded": 1128, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_law|5": { + "hashes": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "c9f7583fff66d361", + "hash_cont_tokens": "2e590029ef41fbcd" + }, + "truncated": 0, + "non_truncated": 1534, + "padded": 6136, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_medicine|5": { + "hashes": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "40a933f829116f8d", + "hash_cont_tokens": "7cfee54dbddd5a98" + }, + "truncated": 0, + "non_truncated": 272, + "padded": 1088, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_psychology|5": { + "hashes": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "0f6a92c3a2062b48", + "hash_cont_tokens": "a86677b2a45c20e1" + }, + "truncated": 0, + "non_truncated": 612, + "padded": 2448, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-public_relations|5": { + "hashes": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "29a08e9bfbe9b2f0", + "hash_cont_tokens": "0d756ccaae031757" + }, + "truncated": 0, + "non_truncated": 110, + "padded": 440, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-security_studies|5": { + "hashes": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "32a03f1f22a6e103", + "hash_cont_tokens": "b2229bc2cfbf594b" + }, + "truncated": 0, + "non_truncated": 245, + "padded": 980, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-sociology|5": { + "hashes": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "1de5c52d2b2831d7", + "hash_cont_tokens": "c3a3bdfd177eed5b" + }, + "truncated": 0, + "non_truncated": 201, + "padded": 800, + "non_padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hashes": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "add924961f7f4146", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-virology|5": { + "hashes": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "e0653601c466b1bc", + "hash_cont_tokens": "af8b3658088cb37f" + }, + "truncated": 0, + "non_truncated": 166, + "padded": 664, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + 
"harness|hendrycksTest-world_religions|5": { + "hashes": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "ac600d612445156d", + "hash_cont_tokens": "060118bef6de4e0a" + }, + "truncated": 0, + "non_truncated": 171, + "padded": 684, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|truthfulqa:mc|0": { + "hashes": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "a03ce28b7fd06aa7", + "hash_cont_tokens": "f5da56a132aab151" + }, + "truncated": 0, + "non_truncated": 817, + "padded": 9996, + "non_padded": 0, + "effective_few_shots": 0.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "72067255e368e24e", + "hash_cont_tokens": "f08975ad6f2d5864" + }, + "truncated": 0, + "non_truncated": 1267, + "padded": 2534, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "bda342e47b5099b2", + "hash_cont_tokens": "d16f923b558be7f4" + }, + "truncated": 0, + "non_truncated": 1319, + "padded": 0, + "non_padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "3b7fa57a057f9415", + "hash_full_prompts": "63615fc50fc9417c", + "hash_input_tokens": "a8fa53915153e1db", + "hash_cont_tokens": "43c6ee79c463469b" + }, + "truncated": 0, + "non_truncated": 28659, + "padded": 113348, + "non_padded": 1524, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/uukuguy/Orca-2-7b-f16/results_2023-11-25T05-57-22.285671.json b/eval-results/uukuguy/Orca-2-7b-f16/results_2023-11-25T05-57-22.285671.json new file mode 100644 index 0000000000000000000000000000000000000000..796c7e5f4cd3ec288ed835a73f2028f85a3b7fa3 --- /dev/null +++ b/eval-results/uukuguy/Orca-2-7b-f16/results_2023-11-25T05-57-22.285671.json @@ -0,0 +1,1435 @@ +{ + "config_general": { + "lighteval_sha": "9ffc410f6c40b8cfefe7167cb47aefe69ced61e1", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "", + "start_time": 739778.889502726, + "end_time": 769331.043769742, + "total_evaluation_time_secondes": "29552.154267015983", + "model_name": "uukuguy/Orca-2-7b-f16", + "model_sha": "f6b2f717467dc12b2b19cad90ed4362153863ad9", + "model_dtype": "torch.float16", + "model_size": "12.61 GB" + }, + "results": { + "harness|arc:challenge|25": { + "acc": 0.23378839590443687, + "acc_stderr": 0.01236822537850714, + "acc_norm": 0.2960750853242321, + "acc_norm_stderr": 0.013340916085246263 + }, + "harness|hellaswag|10": { + "acc": 0.2548297151961761, + "acc_stderr": 0.0043487487305299355, + "acc_norm": 0.2562238597888867, + "acc_norm_stderr": 0.004356547185847041 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.24, + "acc_stderr": 0.04292346959909283, + "acc_norm": 0.24, + "acc_norm_stderr": 0.04292346959909283 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.2222222222222222, + "acc_stderr": 0.035914440841969694, + "acc_norm": 0.2222222222222222, + "acc_norm_stderr": 0.035914440841969694 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.3026315789473684, + "acc_stderr": 0.03738520676119669, + "acc_norm": 
0.3026315789473684, + "acc_norm_stderr": 0.03738520676119669 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.31, + "acc_stderr": 0.04648231987117316, + "acc_norm": 0.31, + "acc_norm_stderr": 0.04648231987117316 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.30566037735849055, + "acc_stderr": 0.028353298073322666, + "acc_norm": 0.30566037735849055, + "acc_norm_stderr": 0.028353298073322666 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.2777777777777778, + "acc_stderr": 0.03745554791462457, + "acc_norm": 0.2777777777777778, + "acc_norm_stderr": 0.03745554791462457 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.36, + "acc_stderr": 0.04824181513244218, + "acc_norm": 0.36, + "acc_norm_stderr": 0.04824181513244218 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.35, + "acc_stderr": 0.0479372485441102, + "acc_norm": 0.35, + "acc_norm_stderr": 0.0479372485441102 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.3, + "acc_stderr": 0.046056618647183814, + "acc_norm": 0.3, + "acc_norm_stderr": 0.046056618647183814 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.27167630057803466, + "acc_stderr": 0.03391750322321659, + "acc_norm": 0.27167630057803466, + "acc_norm_stderr": 0.03391750322321659 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.24509803921568626, + "acc_stderr": 0.04280105837364395, + "acc_norm": 0.24509803921568626, + "acc_norm_stderr": 0.04280105837364395 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.26, + "acc_stderr": 0.04408440022768077, + "acc_norm": 0.26, + "acc_norm_stderr": 0.04408440022768077 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.225531914893617, + "acc_stderr": 0.02732107841738754, + "acc_norm": 0.225531914893617, + "acc_norm_stderr": 0.02732107841738754 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.21052631578947367, + "acc_stderr": 0.038351539543994194, + "acc_norm": 0.21052631578947367, + "acc_norm_stderr": 0.038351539543994194 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.2896551724137931, + "acc_stderr": 0.037800192304380156, + "acc_norm": 0.2896551724137931, + "acc_norm_stderr": 0.037800192304380156 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.22486772486772486, + "acc_stderr": 0.021502096078229147, + "acc_norm": 0.22486772486772486, + "acc_norm_stderr": 0.021502096078229147 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.38095238095238093, + "acc_stderr": 0.04343525428949098, + "acc_norm": 0.38095238095238093, + "acc_norm_stderr": 0.04343525428949098 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.29, + "acc_stderr": 0.045604802157206845, + "acc_norm": 0.29, + "acc_norm_stderr": 0.045604802157206845 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.3161290322580645, + "acc_stderr": 0.02645087448904277, + "acc_norm": 0.3161290322580645, + "acc_norm_stderr": 0.02645087448904277 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.22167487684729065, + "acc_stderr": 0.029225575892489614, + "acc_norm": 0.22167487684729065, + "acc_norm_stderr": 0.029225575892489614 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.26, + "acc_stderr": 0.044084400227680794, + "acc_norm": 0.26, + "acc_norm_stderr": 0.044084400227680794 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.2, + "acc_stderr": 0.031234752377721175, + "acc_norm": 0.2, + 
"acc_norm_stderr": 0.031234752377721175 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.3888888888888889, + "acc_stderr": 0.0347327959083696, + "acc_norm": 0.3888888888888889, + "acc_norm_stderr": 0.0347327959083696 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.24870466321243523, + "acc_stderr": 0.031195840877700293, + "acc_norm": 0.24870466321243523, + "acc_norm_stderr": 0.031195840877700293 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.2948717948717949, + "acc_stderr": 0.023119362758232273, + "acc_norm": 0.2948717948717949, + "acc_norm_stderr": 0.023119362758232273 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.23703703703703705, + "acc_stderr": 0.02592887613276611, + "acc_norm": 0.23703703703703705, + "acc_norm_stderr": 0.02592887613276611 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.29831932773109243, + "acc_stderr": 0.029719142876342846, + "acc_norm": 0.29831932773109243, + "acc_norm_stderr": 0.029719142876342846 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.23178807947019867, + "acc_stderr": 0.034454062719870546, + "acc_norm": 0.23178807947019867, + "acc_norm_stderr": 0.034454062719870546 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.30091743119266057, + "acc_stderr": 0.019664751366802114, + "acc_norm": 0.30091743119266057, + "acc_norm_stderr": 0.019664751366802114 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.2777777777777778, + "acc_stderr": 0.030546745264953178, + "acc_norm": 0.2777777777777778, + "acc_norm_stderr": 0.030546745264953178 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.22058823529411764, + "acc_stderr": 0.02910225438967407, + "acc_norm": 0.22058823529411764, + "acc_norm_stderr": 0.02910225438967407 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.2320675105485232, + "acc_stderr": 0.02747974455080852, + "acc_norm": 0.2320675105485232, + "acc_norm_stderr": 0.02747974455080852 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.15246636771300448, + "acc_stderr": 0.024126204813252863, + "acc_norm": 0.15246636771300448, + "acc_norm_stderr": 0.024126204813252863 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.2824427480916031, + "acc_stderr": 0.03948406125768361, + "acc_norm": 0.2824427480916031, + "acc_norm_stderr": 0.03948406125768361 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.2809917355371901, + "acc_stderr": 0.04103203830514511, + "acc_norm": 0.2809917355371901, + "acc_norm_stderr": 0.04103203830514511 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.19444444444444445, + "acc_stderr": 0.038260763248848646, + "acc_norm": 0.19444444444444445, + "acc_norm_stderr": 0.038260763248848646 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.25766871165644173, + "acc_stderr": 0.03436150827846917, + "acc_norm": 0.25766871165644173, + "acc_norm_stderr": 0.03436150827846917 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.25892857142857145, + "acc_stderr": 0.04157751539865629, + "acc_norm": 0.25892857142857145, + "acc_norm_stderr": 0.04157751539865629 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.39805825242718446, + "acc_stderr": 0.04846748253977239, + "acc_norm": 0.39805825242718446, + "acc_norm_stderr": 0.04846748253977239 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.24786324786324787, + "acc_stderr": 0.0282863240755644, 
+ "acc_norm": 0.24786324786324787, + "acc_norm_stderr": 0.0282863240755644 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.23, + "acc_stderr": 0.04229525846816507, + "acc_norm": 0.23, + "acc_norm_stderr": 0.04229525846816507 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.22988505747126436, + "acc_stderr": 0.01504630184669182, + "acc_norm": 0.22988505747126436, + "acc_norm_stderr": 0.01504630184669182 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.23121387283236994, + "acc_stderr": 0.022698657167855713, + "acc_norm": 0.23121387283236994, + "acc_norm_stderr": 0.022698657167855713 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.22793296089385476, + "acc_stderr": 0.014030149950805097, + "acc_norm": 0.22793296089385476, + "acc_norm_stderr": 0.014030149950805097 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.2908496732026144, + "acc_stderr": 0.02600480036395211, + "acc_norm": 0.2908496732026144, + "acc_norm_stderr": 0.02600480036395211 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.24115755627009647, + "acc_stderr": 0.024296594034763426, + "acc_norm": 0.24115755627009647, + "acc_norm_stderr": 0.024296594034763426 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.25617283950617287, + "acc_stderr": 0.024288533637726095, + "acc_norm": 0.25617283950617287, + "acc_norm_stderr": 0.024288533637726095 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.2553191489361702, + "acc_stderr": 0.026011992930902006, + "acc_norm": 0.2553191489361702, + "acc_norm_stderr": 0.026011992930902006 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.2438070404172099, + "acc_stderr": 0.01096650797217848, + "acc_norm": 0.2438070404172099, + "acc_norm_stderr": 0.01096650797217848 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.27941176470588236, + "acc_stderr": 0.02725720260611494, + "acc_norm": 0.27941176470588236, + "acc_norm_stderr": 0.02725720260611494 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.24673202614379086, + "acc_stderr": 0.0174408203674025, + "acc_norm": 0.24673202614379086, + "acc_norm_stderr": 0.0174408203674025 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.21818181818181817, + "acc_stderr": 0.03955932861795833, + "acc_norm": 0.21818181818181817, + "acc_norm_stderr": 0.03955932861795833 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.32653061224489793, + "acc_stderr": 0.030021056238440317, + "acc_norm": 0.32653061224489793, + "acc_norm_stderr": 0.030021056238440317 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.27860696517412936, + "acc_stderr": 0.031700561834973086, + "acc_norm": 0.27860696517412936, + "acc_norm_stderr": 0.031700561834973086 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.26, + "acc_stderr": 0.04408440022768078, + "acc_norm": 0.26, + "acc_norm_stderr": 0.04408440022768078 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.24096385542168675, + "acc_stderr": 0.03329394119073528, + "acc_norm": 0.24096385542168675, + "acc_norm_stderr": 0.03329394119073528 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.29239766081871343, + "acc_stderr": 0.034886477134579215, + "acc_norm": 0.29239766081871343, + "acc_norm_stderr": 0.034886477134579215 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.2350061199510404, + "mc1_stderr": 0.0148430615077316, + "mc2": 0.4836424685770379, + "mc2_stderr": 0.017011052216455772 + }, + "harness|winogrande|5": { + "acc": 
0.5059194948697711, + "acc_stderr": 0.014051500838485807 + }, + "harness|drop|3": { + "em": 0.0, + "em_stderr": 0.0, + "f1": 4.718959731543626e-05, + "f1_stderr": 1.3131442946208309e-05 + }, + "harness|gsm8k|5": { + "acc": 0.0, + "acc_stderr": 0.0 + }, + "all": { + "acc": 0.2657693278278556, + "acc_stderr": 0.03135662339817443, + "acc_norm": 0.2672828870617276, + "acc_norm_stderr": 0.032198017213766285, + "mc1": 0.2350061199510404, + "mc1_stderr": 0.0148430615077316, + "mc2": 0.4836424685770379, + "mc2_stderr": 0.017011052216455772, + "em": 0.0, + "em_stderr": 0.0, + "f1": 4.718959731543626e-05, + "f1_stderr": 1.3131442946208309e-05 + } + }, + "versions": { + "all": 0, + "harness|arc:challenge|25": 0, + "harness|drop|3": 1, + "harness|gsm8k|5": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + 
"harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "harness|winogrande|5": 0 + }, + "config_tasks": { + "harness|arc:challenge": "LM Harness task", + "harness|drop": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + 
"harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|arc:challenge|25": { + "hashes": { + "hash_examples": "17b0cae357c0259e", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "ca48d52265c0051f", + "hash_cont_tokens": "e8abf848493b50f7" + }, + "truncated": 0, + "non_truncated": 1172, + "padded": 4687, + "non_padded": 0, + "effective_few_shots": 25.0, + "num_truncated_few_shots": 0 + }, + "harness|hellaswag|10": { + "hashes": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "4975ded0ed31f702", + "hash_cont_tokens": "9fe0a5c42e1532db" + }, + "truncated": 0, + "non_truncated": 10042, + "padded": 40019, + "non_padded": 149, + "effective_few_shots": 10.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hashes": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "8ff523ec326d5d55", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-anatomy|5": { + "hashes": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "742bd6a389a8ef40", + "hash_cont_tokens": "f11971a765cb609f" + }, + "truncated": 0, + "non_truncated": 135, + "padded": 540, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-astronomy|5": { + "hashes": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "aa9743839c83bd9f", + "hash_cont_tokens": "440a970fadecdc7b" + }, + "truncated": 0, + "non_truncated": 152, + "padded": 608, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-business_ethics|5": { + "hashes": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "60f6ed52e2a2987a", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hashes": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "6080d9f3c5930be0", + "hash_cont_tokens": "7ecd60c25b9bfe5b" + }, + "truncated": 0, + "non_truncated": 265, + "padded": 1060, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_biology|5": { + "hashes": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "873319724ad65589", + "hash_cont_tokens": "875cde3af7a0ee14" + }, + "truncated": 0, + "non_truncated": 144, + "padded": 564, + "non_padded": 12, + "effective_few_shots": 5.0, + 
"num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_chemistry|5": { + "hashes": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "8366d04d12b154a7", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_computer_science|5": { + "hashes": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "1724a282fb269fd7", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_mathematics|5": { + "hashes": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "b7aa815781eae172", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_medicine|5": { + "hashes": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "0003d13e86bc8c1a", + "hash_cont_tokens": "702fb6d82ff0d6ac" + }, + "truncated": 0, + "non_truncated": 173, + "padded": 692, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_physics|5": { + "hashes": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "32b28762dd077c78", + "hash_cont_tokens": "f7b8097afc16a47c" + }, + "truncated": 0, + "non_truncated": 102, + "padded": 404, + "non_padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-computer_security|5": { + "hashes": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "19dd0e1895125d49", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hashes": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "761c7ce187b3338a", + "hash_cont_tokens": "aa0e8bc655f2f641" + }, + "truncated": 0, + "non_truncated": 235, + "padded": 940, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-econometrics|5": { + "hashes": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "dae74024ebc12b2b", + "hash_cont_tokens": "b1cc6e7e9fcd3827" + }, + "truncated": 0, + "non_truncated": 114, + "padded": 456, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hashes": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "5fa8050688a246ed", + "hash_cont_tokens": "2425a3f084a591ef" + }, + "truncated": 0, + "non_truncated": 145, + "padded": 580, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hashes": { + "hash_examples": "fc48208a5ac1c0ce", + 
"hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "2da3f8d7d1515cc6", + "hash_cont_tokens": "bd87bf0c060fd925" + }, + "truncated": 0, + "non_truncated": 378, + "padded": 1512, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-formal_logic|5": { + "hashes": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "907de61bbe46dada", + "hash_cont_tokens": "eb8932890e0605db" + }, + "truncated": 0, + "non_truncated": 126, + "padded": 504, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-global_facts|5": { + "hashes": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "d7549fe9ac133643", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_biology|5": { + "hashes": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "b449ae8cd622fb96", + "hash_cont_tokens": "1ddcb86d28cde266" + }, + "truncated": 0, + "non_truncated": 310, + "padded": 1240, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hashes": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "a447bd1574b5e26c", + "hash_cont_tokens": "176c8dcff38c5f8f" + }, + "truncated": 0, + "non_truncated": 203, + "padded": 812, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hashes": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "56312a0c3d85ae90", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hashes": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "5002f4ac8b1562ca", + "hash_cont_tokens": "674fc454bdc5ac93" + }, + "truncated": 0, + "non_truncated": 165, + "padded": 656, + "non_padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_geography|5": { + "hashes": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "b4f9efd054b0149d", + "hash_cont_tokens": "03a5012b916274ea" + }, + "truncated": 0, + "non_truncated": 198, + "padded": 792, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hashes": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "6e010d01707b5a01", + "hash_cont_tokens": "873d2aab226ba1d8" + }, + "truncated": 0, + "non_truncated": 193, + "padded": 772, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hashes": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "fc1f6e824ba386d7", + 
"hash_cont_tokens": "c583432ad27fcfe0" + }, + "truncated": 0, + "non_truncated": 390, + "padded": 1560, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hashes": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "3a485a40c8432ece", + "hash_cont_tokens": "d7907b61bcb8c123" + }, + "truncated": 0, + "non_truncated": 270, + "padded": 1080, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hashes": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "a7dd9ca4bbda3752", + "hash_cont_tokens": "f47f041de50333b9" + }, + "truncated": 0, + "non_truncated": 238, + "padded": 952, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_physics|5": { + "hashes": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "d7ea631399a73865", + "hash_cont_tokens": "0d56317b3e5eedb5" + }, + "truncated": 0, + "non_truncated": 151, + "padded": 604, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hashes": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "d12816cf88146011", + "hash_cont_tokens": "09ba1243e7390c0f" + }, + "truncated": 0, + "non_truncated": 545, + "padded": 2180, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hashes": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "9763ecaef4814c21", + "hash_cont_tokens": "9cc29889c3d3f77d" + }, + "truncated": 0, + "non_truncated": 216, + "padded": 864, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hashes": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "c639cce12a46ebad", + "hash_cont_tokens": "cdd0b3dc06d933e5" + }, + "truncated": 0, + "non_truncated": 204, + "padded": 816, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hashes": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "b9762065cce6f3a6", + "hash_cont_tokens": "e02816433ff28daf" + }, + "truncated": 0, + "non_truncated": 237, + "padded": 948, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_aging|5": { + "hashes": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "84157fee0b6d0f3c", + "hash_cont_tokens": "142a4a8a1138a214" + }, + "truncated": 0, + "non_truncated": 223, + "padded": 892, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_sexuality|5": { + "hashes": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "ade303e1ae3c016f", + "hash_cont_tokens": "bc54813e809b796d" + }, + "truncated": 0, + "non_truncated": 131, + "padded": 524, + 
"non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-international_law|5": { + "hashes": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "e5482e1c23c23d35", + "hash_cont_tokens": "8ea8c5ff76a15bca" + }, + "truncated": 0, + "non_truncated": 121, + "padded": 484, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-jurisprudence|5": { + "hashes": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "4415eeb9bad0507b", + "hash_cont_tokens": "e3a8cd951b6e3469" + }, + "truncated": 0, + "non_truncated": 108, + "padded": 432, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hashes": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "e6b5271422ecbaa8", + "hash_cont_tokens": "3e9e0bdc248fd88a" + }, + "truncated": 0, + "non_truncated": 163, + "padded": 644, + "non_padded": 8, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-machine_learning|5": { + "hashes": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "e719cb83196977d8", + "hash_cont_tokens": "55b12fb138c6a064" + }, + "truncated": 0, + "non_truncated": 112, + "padded": 448, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-management|5": { + "hashes": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "155da0e62b39e804", + "hash_cont_tokens": "a01d6d39a83c4597" + }, + "truncated": 0, + "non_truncated": 103, + "padded": 412, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-marketing|5": { + "hashes": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "38466c242259e6d3", + "hash_cont_tokens": "6aeaed4d823c98aa" + }, + "truncated": 0, + "non_truncated": 234, + "padded": 932, + "non_padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-medical_genetics|5": { + "hashes": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "0dd129e92538a7f6", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-miscellaneous|5": { + "hashes": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "d108a883fc3e022f", + "hash_cont_tokens": "9b0ab02a64603081" + }, + "truncated": 0, + "non_truncated": 783, + "padded": 3132, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_disputes|5": { + "hashes": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "0e7b7df82884a2d5", + "hash_cont_tokens": "3b8bbe9108e55ce9" + }, + "truncated": 0, + "non_truncated": 346, + "padded": 1364, + "non_padded": 20, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hashes": { + "hash_examples": 
"9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "7c220f5613cd8426", + "hash_cont_tokens": "3e9bfc0362e97330" + }, + "truncated": 0, + "non_truncated": 895, + "padded": 3580, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-nutrition|5": { + "hashes": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "35de1609a9a763a9", + "hash_cont_tokens": "23b2dc6ee2da4cfc" + }, + "truncated": 0, + "non_truncated": 306, + "padded": 1224, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-philosophy|5": { + "hashes": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "a1dcfa9c80490d06", + "hash_cont_tokens": "9f6ff69d23a48783" + }, + "truncated": 0, + "non_truncated": 311, + "padded": 1244, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-prehistory|5": { + "hashes": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "a091cf645d2415e0", + "hash_cont_tokens": "d6458d743d875837" + }, + "truncated": 0, + "non_truncated": 324, + "padded": 1296, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_accounting|5": { + "hashes": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "e9df32a33f85290c", + "hash_cont_tokens": "922a195f53a35662" + }, + "truncated": 0, + "non_truncated": 282, + "padded": 1128, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_law|5": { + "hashes": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "c9f7583fff66d361", + "hash_cont_tokens": "2e590029ef41fbcd" + }, + "truncated": 0, + "non_truncated": 1534, + "padded": 6136, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_medicine|5": { + "hashes": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "40a933f829116f8d", + "hash_cont_tokens": "7cfee54dbddd5a98" + }, + "truncated": 0, + "non_truncated": 272, + "padded": 1088, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_psychology|5": { + "hashes": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "0f6a92c3a2062b48", + "hash_cont_tokens": "a86677b2a45c20e1" + }, + "truncated": 0, + "non_truncated": 612, + "padded": 2448, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-public_relations|5": { + "hashes": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "29a08e9bfbe9b2f0", + "hash_cont_tokens": "0d756ccaae031757" + }, + "truncated": 0, + "non_truncated": 110, + "padded": 440, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-security_studies|5": { + "hashes": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "32a03f1f22a6e103", + "hash_cont_tokens": "b2229bc2cfbf594b" + }, 
+ "truncated": 0, + "non_truncated": 245, + "padded": 980, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-sociology|5": { + "hashes": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "1de5c52d2b2831d7", + "hash_cont_tokens": "c3a3bdfd177eed5b" + }, + "truncated": 0, + "non_truncated": 201, + "padded": 800, + "non_padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hashes": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "add924961f7f4146", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-virology|5": { + "hashes": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "e0653601c466b1bc", + "hash_cont_tokens": "af8b3658088cb37f" + }, + "truncated": 0, + "non_truncated": 166, + "padded": 664, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-world_religions|5": { + "hashes": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "ac600d612445156d", + "hash_cont_tokens": "060118bef6de4e0a" + }, + "truncated": 0, + "non_truncated": 171, + "padded": 684, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|truthfulqa:mc|0": { + "hashes": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "a03ce28b7fd06aa7", + "hash_cont_tokens": "f5da56a132aab151" + }, + "truncated": 0, + "non_truncated": 817, + "padded": 9996, + "non_padded": 0, + "effective_few_shots": 0.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "72067255e368e24e", + "hash_cont_tokens": "f08975ad6f2d5864" + }, + "truncated": 0, + "non_truncated": 1267, + "padded": 2534, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|drop|3": { + "hashes": { + "hash_examples": "1d27416e8324e9a3", + "hash_full_prompts": "a5513ff9a741b385", + "hash_input_tokens": "42076f0efbb50aa6", + "hash_cont_tokens": "e1aa577e358e43cd" + }, + "truncated": 3, + "non_truncated": 9533, + "padded": 0, + "non_padded": 9536, + "effective_few_shots": 3.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "bda342e47b5099b2", + "hash_cont_tokens": "dc72d2ccd9a7bd31" + }, + "truncated": 0, + "non_truncated": 1319, + "padded": 0, + "non_padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "4eb459f19fc0f29d", + "hash_full_prompts": "21653ed56f202b4e", + "hash_input_tokens": "379266f3a5365f9d", + "hash_cont_tokens": "f25e5101ca995c06" + }, + "truncated": 3, + "non_truncated": 38192, + "padded": 113348, + "non_padded": 11060, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/uukuguy/SynthIA-7B-v1.3-dare-0.85/results_2023-11-23T18-58-41.159609.json 
b/eval-results/uukuguy/SynthIA-7B-v1.3-dare-0.85/results_2023-11-23T18-58-41.159609.json new file mode 100644 index 0000000000000000000000000000000000000000..02bc6d8006da03866c1e0cf06e447a6ccfdfafcc --- /dev/null +++ b/eval-results/uukuguy/SynthIA-7B-v1.3-dare-0.85/results_2023-11-23T18-58-41.159609.json @@ -0,0 +1,1435 @@ +{ + "config_general": { + "lighteval_sha": "9ffc410f6c40b8cfefe7167cb47aefe69ced61e1", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "", + "start_time": 231483.306669362, + "end_time": 245626.345597411, + "total_evaluation_time_secondes": "14143.038928049005", + "model_name": "uukuguy/SynthIA-7B-v1.3-dare-0.85", + "model_sha": "91381d0ac625dcde542428ed6cb35177b4260923", + "model_dtype": "torch.float16", + "model_size": "13.99 GB" + }, + "results": { + "harness|arc:challenge|25": { + "acc": 0.5750853242320819, + "acc_stderr": 0.014445698968520769, + "acc_norm": 0.6100682593856656, + "acc_norm_stderr": 0.014252959848892893 + }, + "harness|hellaswag|10": { + "acc": 0.6336387173869747, + "acc_stderr": 0.004808251269682433, + "acc_norm": 0.8349930292770364, + "acc_norm_stderr": 0.00370428239078172 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.3, + "acc_stderr": 0.046056618647183814, + "acc_norm": 0.3, + "acc_norm_stderr": 0.046056618647183814 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.6370370370370371, + "acc_stderr": 0.04153948404742398, + "acc_norm": 0.6370370370370371, + "acc_norm_stderr": 0.04153948404742398 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.6578947368421053, + "acc_stderr": 0.03860731599316091, + "acc_norm": 0.6578947368421053, + "acc_norm_stderr": 0.03860731599316091 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.57, + "acc_stderr": 0.049756985195624284, + "acc_norm": 0.57, + "acc_norm_stderr": 0.049756985195624284 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.7094339622641509, + "acc_stderr": 0.027943219989337135, + "acc_norm": 0.7094339622641509, + "acc_norm_stderr": 0.027943219989337135 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.7361111111111112, + "acc_stderr": 0.03685651095897532, + "acc_norm": 0.7361111111111112, + "acc_norm_stderr": 0.03685651095897532 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.48, + "acc_stderr": 0.050211673156867795, + "acc_norm": 0.48, + "acc_norm_stderr": 0.050211673156867795 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.54, + "acc_stderr": 0.05009082659620332, + "acc_norm": 0.54, + "acc_norm_stderr": 0.05009082659620332 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.37, + "acc_stderr": 0.048523658709391, + "acc_norm": 0.37, + "acc_norm_stderr": 0.048523658709391 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.6358381502890174, + "acc_stderr": 0.03669072477416907, + "acc_norm": 0.6358381502890174, + "acc_norm_stderr": 0.03669072477416907 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.3431372549019608, + "acc_stderr": 0.04724007352383887, + "acc_norm": 0.3431372549019608, + "acc_norm_stderr": 0.04724007352383887 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.79, + "acc_stderr": 0.04093601807403326, + "acc_norm": 0.79, + "acc_norm_stderr": 0.04093601807403326 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.5787234042553191, + "acc_stderr": 0.03227834510146268, + "acc_norm": 0.5787234042553191, + "acc_norm_stderr": 
0.03227834510146268 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.49122807017543857, + "acc_stderr": 0.04702880432049615, + "acc_norm": 0.49122807017543857, + "acc_norm_stderr": 0.04702880432049615 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.5724137931034483, + "acc_stderr": 0.041227371113703316, + "acc_norm": 0.5724137931034483, + "acc_norm_stderr": 0.041227371113703316 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.40476190476190477, + "acc_stderr": 0.0252798503974049, + "acc_norm": 0.40476190476190477, + "acc_norm_stderr": 0.0252798503974049 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.4126984126984127, + "acc_stderr": 0.04403438954768177, + "acc_norm": 0.4126984126984127, + "acc_norm_stderr": 0.04403438954768177 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.4, + "acc_stderr": 0.049236596391733084, + "acc_norm": 0.4, + "acc_norm_stderr": 0.049236596391733084 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.7709677419354839, + "acc_stderr": 0.02390491431178265, + "acc_norm": 0.7709677419354839, + "acc_norm_stderr": 0.02390491431178265 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.5221674876847291, + "acc_stderr": 0.03514528562175008, + "acc_norm": 0.5221674876847291, + "acc_norm_stderr": 0.03514528562175008 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.69, + "acc_stderr": 0.04648231987117316, + "acc_norm": 0.69, + "acc_norm_stderr": 0.04648231987117316 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.7696969696969697, + "acc_stderr": 0.032876667586034906, + "acc_norm": 0.7696969696969697, + "acc_norm_stderr": 0.032876667586034906 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.7929292929292929, + "acc_stderr": 0.028869778460267042, + "acc_norm": 0.7929292929292929, + "acc_norm_stderr": 0.028869778460267042 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.8756476683937824, + "acc_stderr": 0.02381447708659355, + "acc_norm": 0.8756476683937824, + "acc_norm_stderr": 0.02381447708659355 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.6641025641025641, + "acc_stderr": 0.023946724741563976, + "acc_norm": 0.6641025641025641, + "acc_norm_stderr": 0.023946724741563976 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.34074074074074073, + "acc_stderr": 0.028897748741131147, + "acc_norm": 0.34074074074074073, + "acc_norm_stderr": 0.028897748741131147 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.6512605042016807, + "acc_stderr": 0.030956636328566548, + "acc_norm": 0.6512605042016807, + "acc_norm_stderr": 0.030956636328566548 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.3443708609271523, + "acc_stderr": 0.038796870240733264, + "acc_norm": 0.3443708609271523, + "acc_norm_stderr": 0.038796870240733264 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.8146788990825689, + "acc_stderr": 0.01665927970029584, + "acc_norm": 0.8146788990825689, + "acc_norm_stderr": 0.01665927970029584 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.5509259259259259, + "acc_stderr": 0.033922384053216174, + "acc_norm": 0.5509259259259259, + "acc_norm_stderr": 0.033922384053216174 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.7990196078431373, + "acc_stderr": 0.028125972265654373, + "acc_norm": 0.7990196078431373, + 
"acc_norm_stderr": 0.028125972265654373 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.7890295358649789, + "acc_stderr": 0.02655837250266192, + "acc_norm": 0.7890295358649789, + "acc_norm_stderr": 0.02655837250266192 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.7040358744394619, + "acc_stderr": 0.0306365913486998, + "acc_norm": 0.7040358744394619, + "acc_norm_stderr": 0.0306365913486998 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.7938931297709924, + "acc_stderr": 0.03547771004159463, + "acc_norm": 0.7938931297709924, + "acc_norm_stderr": 0.03547771004159463 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.8016528925619835, + "acc_stderr": 0.03640118271990947, + "acc_norm": 0.8016528925619835, + "acc_norm_stderr": 0.03640118271990947 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.7870370370370371, + "acc_stderr": 0.0395783547198098, + "acc_norm": 0.7870370370370371, + "acc_norm_stderr": 0.0395783547198098 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.803680981595092, + "acc_stderr": 0.031207970394709218, + "acc_norm": 0.803680981595092, + "acc_norm_stderr": 0.031207970394709218 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.5089285714285714, + "acc_stderr": 0.04745033255489123, + "acc_norm": 0.5089285714285714, + "acc_norm_stderr": 0.04745033255489123 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.8155339805825242, + "acc_stderr": 0.03840423627288276, + "acc_norm": 0.8155339805825242, + "acc_norm_stderr": 0.03840423627288276 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.8803418803418803, + "acc_stderr": 0.021262719400406953, + "acc_norm": 0.8803418803418803, + "acc_norm_stderr": 0.021262719400406953 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.75, + "acc_stderr": 0.04351941398892446, + "acc_norm": 0.75, + "acc_norm_stderr": 0.04351941398892446 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.8109833971902938, + "acc_stderr": 0.014000791294407006, + "acc_norm": 0.8109833971902938, + "acc_norm_stderr": 0.014000791294407006 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.7225433526011561, + "acc_stderr": 0.02410571260775431, + "acc_norm": 0.7225433526011561, + "acc_norm_stderr": 0.02410571260775431 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.34972067039106147, + "acc_stderr": 0.015949308790233645, + "acc_norm": 0.34972067039106147, + "acc_norm_stderr": 0.015949308790233645 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.7581699346405228, + "acc_stderr": 0.024518195641879334, + "acc_norm": 0.7581699346405228, + "acc_norm_stderr": 0.024518195641879334 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.7009646302250804, + "acc_stderr": 0.02600330111788514, + "acc_norm": 0.7009646302250804, + "acc_norm_stderr": 0.02600330111788514 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.7469135802469136, + "acc_stderr": 0.024191808600713, + "acc_norm": 0.7469135802469136, + "acc_norm_stderr": 0.024191808600713 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.5141843971631206, + "acc_stderr": 0.02981549448368206, + "acc_norm": 0.5141843971631206, + "acc_norm_stderr": 0.02981549448368206 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.4485006518904824, + "acc_stderr": 0.012702317490559802, + "acc_norm": 0.4485006518904824, + "acc_norm_stderr": 0.012702317490559802 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 
0.6727941176470589, + "acc_stderr": 0.028501452860396556, + "acc_norm": 0.6727941176470589, + "acc_norm_stderr": 0.028501452860396556 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.6830065359477124, + "acc_stderr": 0.018824219512706207, + "acc_norm": 0.6830065359477124, + "acc_norm_stderr": 0.018824219512706207 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.6636363636363637, + "acc_stderr": 0.04525393596302506, + "acc_norm": 0.6636363636363637, + "acc_norm_stderr": 0.04525393596302506 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.7387755102040816, + "acc_stderr": 0.028123429335142783, + "acc_norm": 0.7387755102040816, + "acc_norm_stderr": 0.028123429335142783 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.8507462686567164, + "acc_stderr": 0.025196929874827072, + "acc_norm": 0.8507462686567164, + "acc_norm_stderr": 0.025196929874827072 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.87, + "acc_stderr": 0.033799766898963086, + "acc_norm": 0.87, + "acc_norm_stderr": 0.033799766898963086 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.5421686746987951, + "acc_stderr": 0.03878626771002361, + "acc_norm": 0.5421686746987951, + "acc_norm_stderr": 0.03878626771002361 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.8362573099415205, + "acc_stderr": 0.028380919596145866, + "acc_norm": 0.8362573099415205, + "acc_norm_stderr": 0.028380919596145866 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.2974296205630355, + "mc1_stderr": 0.016002651487361, + "mc2": 0.4377418572010016, + "mc2_stderr": 0.014257418960086683 + }, + "harness|winogrande|5": { + "acc": 0.7892659826361483, + "acc_stderr": 0.011462046419710686 + }, + "harness|drop|3": { + "em": 0.0018875838926174498, + "em_stderr": 0.0004445109990558977, + "f1": 0.06350356543624144, + "f1_stderr": 0.0013999691906909637 + }, + "harness|gsm8k|5": { + "acc": 0.18574677786201668, + "acc_stderr": 0.010712298902729095 + }, + "all": { + "acc": 0.6384101997004026, + "acc_stderr": 0.0320658451939497, + "acc_norm": 0.6475312994622042, + "acc_norm_stderr": 0.032755008534067175, + "mc1": 0.2974296205630355, + "mc1_stderr": 0.016002651487361, + "mc2": 0.4377418572010016, + "mc2_stderr": 0.014257418960086683, + "em": 0.0018875838926174498, + "em_stderr": 0.0004445109990558977, + "f1": 0.06350356543624144, + "f1_stderr": 0.0013999691906909637 + } + }, + "versions": { + "all": 0, + "harness|arc:challenge|25": 0, + "harness|drop|3": 1, + "harness|gsm8k|5": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + 
"harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "harness|winogrande|5": 0 + }, + "config_tasks": { + "harness|arc:challenge": "LM Harness task", + "harness|drop": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + 
"harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|arc:challenge|25": { + "hashes": { + "hash_examples": "17b0cae357c0259e", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "9bcd0d1d37471713", + "hash_cont_tokens": "289aa98c400841d8" + }, + "truncated": 0, + "non_truncated": 1172, + "padded": 4670, + "non_padded": 17, + "effective_few_shots": 25.0, + "num_truncated_few_shots": 0 + }, + "harness|hellaswag|10": { + "hashes": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "80b8c6d79740318e", + "hash_cont_tokens": "ac460260c3e6efc9" + }, + "truncated": 0, + "non_truncated": 10042, + "padded": 40101, + "non_padded": 67, + "effective_few_shots": 10.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hashes": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "b813d36287c6556c", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-anatomy|5": { + "hashes": { + "hash_examples": "2f83a4f1cab4ba18", 
+ "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "09dc2380497f7a47", + "hash_cont_tokens": "a52a4f60d98cbe5c" + }, + "truncated": 0, + "non_truncated": 135, + "padded": 540, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-astronomy|5": { + "hashes": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "68ca3220b0fdd1f3", + "hash_cont_tokens": "10f7d8eeba97841d" + }, + "truncated": 0, + "non_truncated": 152, + "padded": 608, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-business_ethics|5": { + "hashes": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "bd14ef1320de241e", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hashes": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "d96186ab98017c43", + "hash_cont_tokens": "edef9975ba9165b5" + }, + "truncated": 0, + "non_truncated": 265, + "padded": 1060, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_biology|5": { + "hashes": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "424136b34e95b200", + "hash_cont_tokens": "0aa103ec6602280b" + }, + "truncated": 0, + "non_truncated": 144, + "padded": 576, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_chemistry|5": { + "hashes": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "8dd8b80e336bbe54", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_computer_science|5": { + "hashes": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "145d4cef8ca2261d", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_mathematics|5": { + "hashes": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "561995d32d2b25c4", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_medicine|5": { + "hashes": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "6a258a9d4418599c", + "hash_cont_tokens": "1979021dbc698754" + }, + "truncated": 0, + "non_truncated": 173, + "padded": 692, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_physics|5": { + "hashes": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "fa5e0d5b5f97b66a", + "hash_cont_tokens": "7cf7fe2bab00acbd" + }, + "truncated": 0, + 
"non_truncated": 102, + "padded": 408, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-computer_security|5": { + "hashes": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "07d27397edfae492", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hashes": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "da5e6c3c8eb17da6", + "hash_cont_tokens": "903f64eed2b0d217" + }, + "truncated": 0, + "non_truncated": 235, + "padded": 940, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-econometrics|5": { + "hashes": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "f6ba8e358bdb523e", + "hash_cont_tokens": "721ae6c5302c4bf2" + }, + "truncated": 0, + "non_truncated": 114, + "padded": 456, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hashes": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "b2459da4c5ca8590", + "hash_cont_tokens": "15a738960ed3e587" + }, + "truncated": 0, + "non_truncated": 145, + "padded": 575, + "non_padded": 5, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hashes": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "0b969d9ad706a13a", + "hash_cont_tokens": "c96470462fc71683" + }, + "truncated": 0, + "non_truncated": 378, + "padded": 1512, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-formal_logic|5": { + "hashes": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "02bc3eb5f90da86e", + "hash_cont_tokens": "0e1ce025c9d6ee7e" + }, + "truncated": 0, + "non_truncated": 126, + "padded": 504, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-global_facts|5": { + "hashes": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "3d5106918bcbeb43", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_biology|5": { + "hashes": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "7b089392db2dabbd", + "hash_cont_tokens": "e34d57f7d3c4ca16" + }, + "truncated": 0, + "non_truncated": 310, + "padded": 1240, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hashes": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "ba90b2ffed1c067d", + "hash_cont_tokens": "e8482d44df4b3740" + }, + "truncated": 0, + "non_truncated": 203, + "padded": 812, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + 
"harness|hendrycksTest-high_school_computer_science|5": { + "hashes": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "60eeec309ef0717f", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hashes": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "5e5e8bf3808e0ead", + "hash_cont_tokens": "d63e679a49418339" + }, + "truncated": 0, + "non_truncated": 165, + "padded": 656, + "non_padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_geography|5": { + "hashes": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "4da9b741d4e7ea78", + "hash_cont_tokens": "d78483e286d06f1a" + }, + "truncated": 0, + "non_truncated": 198, + "padded": 792, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hashes": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "acb4bc872ac86ed7", + "hash_cont_tokens": "691cdff71ff5fe57" + }, + "truncated": 0, + "non_truncated": 193, + "padded": 772, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hashes": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "840fc6403eb69ab0", + "hash_cont_tokens": "d5ad4c5bdca967ad" + }, + "truncated": 0, + "non_truncated": 390, + "padded": 1560, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hashes": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "3629a7f2cd17faeb", + "hash_cont_tokens": "8f631ca5687dd0d4" + }, + "truncated": 0, + "non_truncated": 270, + "padded": 1080, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hashes": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "6846f684260e3997", + "hash_cont_tokens": "7321048a28451473" + }, + "truncated": 0, + "non_truncated": 238, + "padded": 952, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_physics|5": { + "hashes": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "85aee25d6bdad94a", + "hash_cont_tokens": "bb137581f269861c" + }, + "truncated": 0, + "non_truncated": 151, + "padded": 604, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hashes": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "290b66d6d666a35f", + "hash_cont_tokens": "b455cab2675bd863" + }, + "truncated": 0, + "non_truncated": 545, + "padded": 2180, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hashes": { + 
"hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "a77a7668b437bc82", + "hash_cont_tokens": "1b3196fec7e58037" + }, + "truncated": 0, + "non_truncated": 216, + "padded": 864, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hashes": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "63548c7fa9ba7a78", + "hash_cont_tokens": "a331dedc2aa01b3e" + }, + "truncated": 0, + "non_truncated": 204, + "padded": 816, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hashes": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "83c5da18bfa50812", + "hash_cont_tokens": "d0fbe030b8c8c2bf" + }, + "truncated": 0, + "non_truncated": 237, + "padded": 948, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_aging|5": { + "hashes": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "bebbd11f22006685", + "hash_cont_tokens": "1dd29c3755494850" + }, + "truncated": 0, + "non_truncated": 223, + "padded": 892, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_sexuality|5": { + "hashes": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "7b85ee9b8ee54f4f", + "hash_cont_tokens": "c85573f663c10691" + }, + "truncated": 0, + "non_truncated": 131, + "padded": 524, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-international_law|5": { + "hashes": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "7bfc55ab7065943e", + "hash_cont_tokens": "d263804ba918154f" + }, + "truncated": 0, + "non_truncated": 121, + "padded": 484, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-jurisprudence|5": { + "hashes": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "69573f1675e053c6", + "hash_cont_tokens": "581986691a84ece8" + }, + "truncated": 0, + "non_truncated": 108, + "padded": 432, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hashes": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "552324ef20094bdc", + "hash_cont_tokens": "55a858b28bbda458" + }, + "truncated": 0, + "non_truncated": 163, + "padded": 652, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-machine_learning|5": { + "hashes": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "96449357a7318905", + "hash_cont_tokens": "e99d3d3efd4ac7a3" + }, + "truncated": 0, + "non_truncated": 112, + "padded": 448, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-management|5": { + "hashes": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "3b849249168e3b88", + "hash_cont_tokens": 
"13d9dc56bca34726" + }, + "truncated": 0, + "non_truncated": 103, + "padded": 412, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-marketing|5": { + "hashes": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "af0e186f2756b70d", + "hash_cont_tokens": "2700ea26933916a2" + }, + "truncated": 0, + "non_truncated": 234, + "padded": 936, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-medical_genetics|5": { + "hashes": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "9f6a6de16509b6d9", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-miscellaneous|5": { + "hashes": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "9194406d589f7c10", + "hash_cont_tokens": "7bf4341c79587250" + }, + "truncated": 0, + "non_truncated": 783, + "padded": 3132, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_disputes|5": { + "hashes": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "769486efc74d9f8e", + "hash_cont_tokens": "38a48e9de6976f00" + }, + "truncated": 0, + "non_truncated": 346, + "padded": 1384, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hashes": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "a90fd4dd90959dad", + "hash_cont_tokens": "761c4dc187689d89" + }, + "truncated": 0, + "non_truncated": 895, + "padded": 3580, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-nutrition|5": { + "hashes": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "1a3b843e66efd29b", + "hash_cont_tokens": "65005bd7d6f6012a" + }, + "truncated": 0, + "non_truncated": 306, + "padded": 1224, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-philosophy|5": { + "hashes": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "09820001a3d00013", + "hash_cont_tokens": "0b47934fb6314dec" + }, + "truncated": 0, + "non_truncated": 311, + "padded": 1244, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-prehistory|5": { + "hashes": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "7c4ec364ce2768c7", + "hash_cont_tokens": "3f20acd855ee0a29" + }, + "truncated": 0, + "non_truncated": 324, + "padded": 1296, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_accounting|5": { + "hashes": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "ced0534574d0ae3f", + "hash_cont_tokens": "8f122ba881355d4b" + }, + "truncated": 0, + "non_truncated": 282, + "padded": 1128, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + 
"harness|hendrycksTest-professional_law|5": { + "hashes": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "bcbdbbde22ec73e3", + "hash_cont_tokens": "90d5df417c4d3fd3" + }, + "truncated": 0, + "non_truncated": 1534, + "padded": 6136, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_medicine|5": { + "hashes": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "c54d753563114d45", + "hash_cont_tokens": "4a2d2988884f7f70" + }, + "truncated": 0, + "non_truncated": 272, + "padded": 1088, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_psychology|5": { + "hashes": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "b75dc55c0e32fa52", + "hash_cont_tokens": "e0a952cb8a9c81de" + }, + "truncated": 0, + "non_truncated": 612, + "padded": 2448, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-public_relations|5": { + "hashes": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "5ccdc8ec8db99622", + "hash_cont_tokens": "1fa77a8dff3922b8" + }, + "truncated": 0, + "non_truncated": 110, + "padded": 440, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-security_studies|5": { + "hashes": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "ca8497342e5b1d57", + "hash_cont_tokens": "81fc9cb3cbdd52db" + }, + "truncated": 0, + "non_truncated": 245, + "padded": 980, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-sociology|5": { + "hashes": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "069c76424fbd3dab", + "hash_cont_tokens": "2a0493252ed2cf43" + }, + "truncated": 0, + "non_truncated": 201, + "padded": 804, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hashes": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "a7e393a626169576", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-virology|5": { + "hashes": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "bf99dc973e3a650d", + "hash_cont_tokens": "5ab892d003b00c98" + }, + "truncated": 0, + "non_truncated": 166, + "padded": 664, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-world_religions|5": { + "hashes": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "1761cfaf21797065", + "hash_cont_tokens": "15a5e5dbdfbb8568" + }, + "truncated": 0, + "non_truncated": 171, + "padded": 684, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|truthfulqa:mc|0": { + "hashes": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": 
"298b43914bbdf4ca", + "hash_cont_tokens": "5a8d4bb398b1c3c0" + }, + "truncated": 0, + "non_truncated": 817, + "padded": 9996, + "non_padded": 0, + "effective_few_shots": 0.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "31aa3477d959f771", + "hash_cont_tokens": "618558fb93c0f288" + }, + "truncated": 0, + "non_truncated": 1267, + "padded": 2534, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|drop|3": { + "hashes": { + "hash_examples": "1d27416e8324e9a3", + "hash_full_prompts": "a5513ff9a741b385", + "hash_input_tokens": "a4fb946366902edf", + "hash_cont_tokens": "2abb00306dcb3cb0" + }, + "truncated": 0, + "non_truncated": 9536, + "padded": 0, + "non_padded": 9536, + "effective_few_shots": 3.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "6af0ae8cfe684f50", + "hash_cont_tokens": "8c11605ef29a2575" + }, + "truncated": 0, + "non_truncated": 1319, + "padded": 0, + "non_padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "4eb459f19fc0f29d", + "hash_full_prompts": "21653ed56f202b4e", + "hash_input_tokens": "0ce409b3d436569d", + "hash_cont_tokens": "e2f9f5669a993584" + }, + "truncated": 0, + "non_truncated": 38195, + "padded": 113460, + "non_padded": 10948, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/uukuguy/SynthIA-7B-v1.3-dare-0.85/results_2023-11-23T22-59-57.395887.json b/eval-results/uukuguy/SynthIA-7B-v1.3-dare-0.85/results_2023-11-23T22-59-57.395887.json new file mode 100644 index 0000000000000000000000000000000000000000..c47170c24e504dcbab46a1cd2bcfadd77abc8aa6 --- /dev/null +++ b/eval-results/uukuguy/SynthIA-7B-v1.3-dare-0.85/results_2023-11-23T22-59-57.395887.json @@ -0,0 +1,1435 @@ +{ + "config_general": { + "lighteval_sha": "9ffc410f6c40b8cfefe7167cb47aefe69ced61e1", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "", + "start_time": 621481.353721072, + "end_time": 635699.417862762, + "total_evaluation_time_secondes": "14218.064141690033", + "model_name": "uukuguy/SynthIA-7B-v1.3-dare-0.85", + "model_sha": "91381d0ac625dcde542428ed6cb35177b4260923", + "model_dtype": "torch.float16", + "model_size": "13.99 GB" + }, + "results": { + "harness|arc:challenge|25": { + "acc": 0.5750853242320819, + "acc_stderr": 0.014445698968520769, + "acc_norm": 0.6100682593856656, + "acc_norm_stderr": 0.014252959848892893 + }, + "harness|hellaswag|10": { + "acc": 0.6336387173869747, + "acc_stderr": 0.004808251269682433, + "acc_norm": 0.8349930292770364, + "acc_norm_stderr": 0.00370428239078172 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.3, + "acc_stderr": 0.046056618647183814, + "acc_norm": 0.3, + "acc_norm_stderr": 0.046056618647183814 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.6370370370370371, + "acc_stderr": 0.04153948404742398, + "acc_norm": 0.6370370370370371, + "acc_norm_stderr": 0.04153948404742398 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.6578947368421053, + "acc_stderr": 0.03860731599316091, + "acc_norm": 0.6578947368421053, + "acc_norm_stderr": 0.03860731599316091 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.57, + 
"acc_stderr": 0.049756985195624284, + "acc_norm": 0.57, + "acc_norm_stderr": 0.049756985195624284 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.7094339622641509, + "acc_stderr": 0.027943219989337135, + "acc_norm": 0.7094339622641509, + "acc_norm_stderr": 0.027943219989337135 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.7361111111111112, + "acc_stderr": 0.03685651095897532, + "acc_norm": 0.7361111111111112, + "acc_norm_stderr": 0.03685651095897532 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.48, + "acc_stderr": 0.050211673156867795, + "acc_norm": 0.48, + "acc_norm_stderr": 0.050211673156867795 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.54, + "acc_stderr": 0.05009082659620332, + "acc_norm": 0.54, + "acc_norm_stderr": 0.05009082659620332 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.37, + "acc_stderr": 0.048523658709391, + "acc_norm": 0.37, + "acc_norm_stderr": 0.048523658709391 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.6358381502890174, + "acc_stderr": 0.03669072477416907, + "acc_norm": 0.6358381502890174, + "acc_norm_stderr": 0.03669072477416907 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.3431372549019608, + "acc_stderr": 0.04724007352383887, + "acc_norm": 0.3431372549019608, + "acc_norm_stderr": 0.04724007352383887 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.79, + "acc_stderr": 0.04093601807403326, + "acc_norm": 0.79, + "acc_norm_stderr": 0.04093601807403326 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.5787234042553191, + "acc_stderr": 0.03227834510146268, + "acc_norm": 0.5787234042553191, + "acc_norm_stderr": 0.03227834510146268 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.49122807017543857, + "acc_stderr": 0.04702880432049615, + "acc_norm": 0.49122807017543857, + "acc_norm_stderr": 0.04702880432049615 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.5724137931034483, + "acc_stderr": 0.041227371113703316, + "acc_norm": 0.5724137931034483, + "acc_norm_stderr": 0.041227371113703316 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.40476190476190477, + "acc_stderr": 0.0252798503974049, + "acc_norm": 0.40476190476190477, + "acc_norm_stderr": 0.0252798503974049 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.4126984126984127, + "acc_stderr": 0.04403438954768177, + "acc_norm": 0.4126984126984127, + "acc_norm_stderr": 0.04403438954768177 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.4, + "acc_stderr": 0.049236596391733084, + "acc_norm": 0.4, + "acc_norm_stderr": 0.049236596391733084 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.7709677419354839, + "acc_stderr": 0.02390491431178265, + "acc_norm": 0.7709677419354839, + "acc_norm_stderr": 0.02390491431178265 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.5221674876847291, + "acc_stderr": 0.03514528562175008, + "acc_norm": 0.5221674876847291, + "acc_norm_stderr": 0.03514528562175008 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.69, + "acc_stderr": 0.04648231987117316, + "acc_norm": 0.69, + "acc_norm_stderr": 0.04648231987117316 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.7696969696969697, + "acc_stderr": 0.032876667586034906, + "acc_norm": 0.7696969696969697, + "acc_norm_stderr": 0.032876667586034906 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 
0.7929292929292929, + "acc_stderr": 0.028869778460267042, + "acc_norm": 0.7929292929292929, + "acc_norm_stderr": 0.028869778460267042 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.8756476683937824, + "acc_stderr": 0.02381447708659355, + "acc_norm": 0.8756476683937824, + "acc_norm_stderr": 0.02381447708659355 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.6641025641025641, + "acc_stderr": 0.023946724741563976, + "acc_norm": 0.6641025641025641, + "acc_norm_stderr": 0.023946724741563976 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.34074074074074073, + "acc_stderr": 0.028897748741131147, + "acc_norm": 0.34074074074074073, + "acc_norm_stderr": 0.028897748741131147 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.6512605042016807, + "acc_stderr": 0.030956636328566548, + "acc_norm": 0.6512605042016807, + "acc_norm_stderr": 0.030956636328566548 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.3443708609271523, + "acc_stderr": 0.038796870240733264, + "acc_norm": 0.3443708609271523, + "acc_norm_stderr": 0.038796870240733264 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.8146788990825689, + "acc_stderr": 0.01665927970029584, + "acc_norm": 0.8146788990825689, + "acc_norm_stderr": 0.01665927970029584 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.5509259259259259, + "acc_stderr": 0.033922384053216174, + "acc_norm": 0.5509259259259259, + "acc_norm_stderr": 0.033922384053216174 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.7990196078431373, + "acc_stderr": 0.028125972265654373, + "acc_norm": 0.7990196078431373, + "acc_norm_stderr": 0.028125972265654373 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.7890295358649789, + "acc_stderr": 0.02655837250266192, + "acc_norm": 0.7890295358649789, + "acc_norm_stderr": 0.02655837250266192 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.7040358744394619, + "acc_stderr": 0.0306365913486998, + "acc_norm": 0.7040358744394619, + "acc_norm_stderr": 0.0306365913486998 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.7938931297709924, + "acc_stderr": 0.03547771004159463, + "acc_norm": 0.7938931297709924, + "acc_norm_stderr": 0.03547771004159463 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.8016528925619835, + "acc_stderr": 0.03640118271990947, + "acc_norm": 0.8016528925619835, + "acc_norm_stderr": 0.03640118271990947 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.7870370370370371, + "acc_stderr": 0.0395783547198098, + "acc_norm": 0.7870370370370371, + "acc_norm_stderr": 0.0395783547198098 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.803680981595092, + "acc_stderr": 0.031207970394709218, + "acc_norm": 0.803680981595092, + "acc_norm_stderr": 0.031207970394709218 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.5089285714285714, + "acc_stderr": 0.04745033255489123, + "acc_norm": 0.5089285714285714, + "acc_norm_stderr": 0.04745033255489123 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.8155339805825242, + "acc_stderr": 0.03840423627288276, + "acc_norm": 0.8155339805825242, + "acc_norm_stderr": 0.03840423627288276 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.8803418803418803, + "acc_stderr": 0.021262719400406953, + "acc_norm": 0.8803418803418803, + "acc_norm_stderr": 0.021262719400406953 + }, + "harness|hendrycksTest-medical_genetics|5": { + 
"acc": 0.75, + "acc_stderr": 0.04351941398892446, + "acc_norm": 0.75, + "acc_norm_stderr": 0.04351941398892446 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.8109833971902938, + "acc_stderr": 0.014000791294407006, + "acc_norm": 0.8109833971902938, + "acc_norm_stderr": 0.014000791294407006 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.7225433526011561, + "acc_stderr": 0.02410571260775431, + "acc_norm": 0.7225433526011561, + "acc_norm_stderr": 0.02410571260775431 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.34972067039106147, + "acc_stderr": 0.015949308790233645, + "acc_norm": 0.34972067039106147, + "acc_norm_stderr": 0.015949308790233645 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.7581699346405228, + "acc_stderr": 0.024518195641879334, + "acc_norm": 0.7581699346405228, + "acc_norm_stderr": 0.024518195641879334 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.7009646302250804, + "acc_stderr": 0.02600330111788514, + "acc_norm": 0.7009646302250804, + "acc_norm_stderr": 0.02600330111788514 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.7469135802469136, + "acc_stderr": 0.024191808600713, + "acc_norm": 0.7469135802469136, + "acc_norm_stderr": 0.024191808600713 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.5141843971631206, + "acc_stderr": 0.02981549448368206, + "acc_norm": 0.5141843971631206, + "acc_norm_stderr": 0.02981549448368206 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.4485006518904824, + "acc_stderr": 0.012702317490559802, + "acc_norm": 0.4485006518904824, + "acc_norm_stderr": 0.012702317490559802 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.6727941176470589, + "acc_stderr": 0.028501452860396556, + "acc_norm": 0.6727941176470589, + "acc_norm_stderr": 0.028501452860396556 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.6830065359477124, + "acc_stderr": 0.018824219512706207, + "acc_norm": 0.6830065359477124, + "acc_norm_stderr": 0.018824219512706207 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.6636363636363637, + "acc_stderr": 0.04525393596302506, + "acc_norm": 0.6636363636363637, + "acc_norm_stderr": 0.04525393596302506 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.7387755102040816, + "acc_stderr": 0.028123429335142783, + "acc_norm": 0.7387755102040816, + "acc_norm_stderr": 0.028123429335142783 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.8507462686567164, + "acc_stderr": 0.025196929874827072, + "acc_norm": 0.8507462686567164, + "acc_norm_stderr": 0.025196929874827072 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.87, + "acc_stderr": 0.033799766898963086, + "acc_norm": 0.87, + "acc_norm_stderr": 0.033799766898963086 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.5421686746987951, + "acc_stderr": 0.03878626771002361, + "acc_norm": 0.5421686746987951, + "acc_norm_stderr": 0.03878626771002361 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.8362573099415205, + "acc_stderr": 0.028380919596145866, + "acc_norm": 0.8362573099415205, + "acc_norm_stderr": 0.028380919596145866 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.2974296205630355, + "mc1_stderr": 0.016002651487361, + "mc2": 0.4377418572010016, + "mc2_stderr": 0.014257418960086683 + }, + "harness|winogrande|5": { + "acc": 0.7892659826361483, + "acc_stderr": 0.011462046419710686 + }, + "harness|drop|3": { + "em": 0.0018875838926174498, + "em_stderr": 0.0004445109990558977, + "f1": 
0.06350356543624144, + "f1_stderr": 0.0013999691906909637 + }, + "harness|gsm8k|5": { + "acc": 0.18574677786201668, + "acc_stderr": 0.010712298902729095 + }, + "all": { + "acc": 0.6384101997004026, + "acc_stderr": 0.0320658451939497, + "acc_norm": 0.6475312994622042, + "acc_norm_stderr": 0.032755008534067175, + "mc1": 0.2974296205630355, + "mc1_stderr": 0.016002651487361, + "mc2": 0.4377418572010016, + "mc2_stderr": 0.014257418960086683, + "em": 0.0018875838926174498, + "em_stderr": 0.0004445109990558977, + "f1": 0.06350356543624144, + "f1_stderr": 0.0013999691906909637 + } + }, + "versions": { + "all": 0, + "harness|arc:challenge|25": 0, + "harness|drop|3": 1, + "harness|gsm8k|5": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + 
"harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "harness|winogrande|5": 0 + }, + "config_tasks": { + "harness|arc:challenge": "LM Harness task", + "harness|drop": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + 
"harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|arc:challenge|25": { + "hashes": { + "hash_examples": "17b0cae357c0259e", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "9bcd0d1d37471713", + "hash_cont_tokens": "289aa98c400841d8" + }, + "truncated": 0, + "non_truncated": 1172, + "padded": 4670, + "non_padded": 17, + "effective_few_shots": 25.0, + "num_truncated_few_shots": 0 + }, + "harness|hellaswag|10": { + "hashes": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "80b8c6d79740318e", + "hash_cont_tokens": "ac460260c3e6efc9" + }, + "truncated": 0, + "non_truncated": 10042, + "padded": 40101, + "non_padded": 67, + "effective_few_shots": 10.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hashes": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "b813d36287c6556c", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-anatomy|5": { + "hashes": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "09dc2380497f7a47", + "hash_cont_tokens": "a52a4f60d98cbe5c" + }, + "truncated": 0, + "non_truncated": 135, + "padded": 540, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-astronomy|5": { + "hashes": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "68ca3220b0fdd1f3", + "hash_cont_tokens": "10f7d8eeba97841d" + }, + "truncated": 0, + "non_truncated": 152, + "padded": 608, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-business_ethics|5": { + "hashes": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "bd14ef1320de241e", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hashes": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "d96186ab98017c43", + "hash_cont_tokens": "edef9975ba9165b5" + }, + "truncated": 0, + "non_truncated": 265, + "padded": 1060, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_biology|5": { + "hashes": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "424136b34e95b200", + "hash_cont_tokens": "0aa103ec6602280b" + }, + "truncated": 0, + "non_truncated": 144, + "padded": 576, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + 
"harness|hendrycksTest-college_chemistry|5": { + "hashes": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "8dd8b80e336bbe54", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_computer_science|5": { + "hashes": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "145d4cef8ca2261d", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_mathematics|5": { + "hashes": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "561995d32d2b25c4", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_medicine|5": { + "hashes": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "6a258a9d4418599c", + "hash_cont_tokens": "1979021dbc698754" + }, + "truncated": 0, + "non_truncated": 173, + "padded": 692, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_physics|5": { + "hashes": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "fa5e0d5b5f97b66a", + "hash_cont_tokens": "7cf7fe2bab00acbd" + }, + "truncated": 0, + "non_truncated": 102, + "padded": 408, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-computer_security|5": { + "hashes": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "07d27397edfae492", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hashes": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "da5e6c3c8eb17da6", + "hash_cont_tokens": "903f64eed2b0d217" + }, + "truncated": 0, + "non_truncated": 235, + "padded": 940, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-econometrics|5": { + "hashes": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "f6ba8e358bdb523e", + "hash_cont_tokens": "721ae6c5302c4bf2" + }, + "truncated": 0, + "non_truncated": 114, + "padded": 456, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hashes": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "b2459da4c5ca8590", + "hash_cont_tokens": "15a738960ed3e587" + }, + "truncated": 0, + "non_truncated": 145, + "padded": 575, + "non_padded": 5, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hashes": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": 
"5ec274c6c82aca23", + "hash_input_tokens": "0b969d9ad706a13a", + "hash_cont_tokens": "c96470462fc71683" + }, + "truncated": 0, + "non_truncated": 378, + "padded": 1512, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-formal_logic|5": { + "hashes": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "02bc3eb5f90da86e", + "hash_cont_tokens": "0e1ce025c9d6ee7e" + }, + "truncated": 0, + "non_truncated": 126, + "padded": 504, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-global_facts|5": { + "hashes": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "3d5106918bcbeb43", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_biology|5": { + "hashes": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "7b089392db2dabbd", + "hash_cont_tokens": "e34d57f7d3c4ca16" + }, + "truncated": 0, + "non_truncated": 310, + "padded": 1240, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hashes": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "ba90b2ffed1c067d", + "hash_cont_tokens": "e8482d44df4b3740" + }, + "truncated": 0, + "non_truncated": 203, + "padded": 812, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hashes": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "60eeec309ef0717f", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hashes": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "5e5e8bf3808e0ead", + "hash_cont_tokens": "d63e679a49418339" + }, + "truncated": 0, + "non_truncated": 165, + "padded": 656, + "non_padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_geography|5": { + "hashes": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "4da9b741d4e7ea78", + "hash_cont_tokens": "d78483e286d06f1a" + }, + "truncated": 0, + "non_truncated": 198, + "padded": 792, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hashes": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "acb4bc872ac86ed7", + "hash_cont_tokens": "691cdff71ff5fe57" + }, + "truncated": 0, + "non_truncated": 193, + "padded": 772, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hashes": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "840fc6403eb69ab0", + "hash_cont_tokens": 
"d5ad4c5bdca967ad" + }, + "truncated": 0, + "non_truncated": 390, + "padded": 1560, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hashes": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "3629a7f2cd17faeb", + "hash_cont_tokens": "8f631ca5687dd0d4" + }, + "truncated": 0, + "non_truncated": 270, + "padded": 1080, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hashes": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "6846f684260e3997", + "hash_cont_tokens": "7321048a28451473" + }, + "truncated": 0, + "non_truncated": 238, + "padded": 952, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_physics|5": { + "hashes": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "85aee25d6bdad94a", + "hash_cont_tokens": "bb137581f269861c" + }, + "truncated": 0, + "non_truncated": 151, + "padded": 604, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hashes": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "290b66d6d666a35f", + "hash_cont_tokens": "b455cab2675bd863" + }, + "truncated": 0, + "non_truncated": 545, + "padded": 2180, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hashes": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "a77a7668b437bc82", + "hash_cont_tokens": "1b3196fec7e58037" + }, + "truncated": 0, + "non_truncated": 216, + "padded": 864, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hashes": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "63548c7fa9ba7a78", + "hash_cont_tokens": "a331dedc2aa01b3e" + }, + "truncated": 0, + "non_truncated": 204, + "padded": 816, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hashes": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "83c5da18bfa50812", + "hash_cont_tokens": "d0fbe030b8c8c2bf" + }, + "truncated": 0, + "non_truncated": 237, + "padded": 948, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_aging|5": { + "hashes": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "bebbd11f22006685", + "hash_cont_tokens": "1dd29c3755494850" + }, + "truncated": 0, + "non_truncated": 223, + "padded": 892, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_sexuality|5": { + "hashes": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "7b85ee9b8ee54f4f", + "hash_cont_tokens": "c85573f663c10691" + }, + "truncated": 0, + "non_truncated": 131, + "padded": 524, + "non_padded": 0, + 
"effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-international_law|5": { + "hashes": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "7bfc55ab7065943e", + "hash_cont_tokens": "d263804ba918154f" + }, + "truncated": 0, + "non_truncated": 121, + "padded": 484, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-jurisprudence|5": { + "hashes": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "69573f1675e053c6", + "hash_cont_tokens": "581986691a84ece8" + }, + "truncated": 0, + "non_truncated": 108, + "padded": 432, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hashes": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "552324ef20094bdc", + "hash_cont_tokens": "55a858b28bbda458" + }, + "truncated": 0, + "non_truncated": 163, + "padded": 652, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-machine_learning|5": { + "hashes": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "96449357a7318905", + "hash_cont_tokens": "e99d3d3efd4ac7a3" + }, + "truncated": 0, + "non_truncated": 112, + "padded": 448, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-management|5": { + "hashes": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "3b849249168e3b88", + "hash_cont_tokens": "13d9dc56bca34726" + }, + "truncated": 0, + "non_truncated": 103, + "padded": 412, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-marketing|5": { + "hashes": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "af0e186f2756b70d", + "hash_cont_tokens": "2700ea26933916a2" + }, + "truncated": 0, + "non_truncated": 234, + "padded": 936, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-medical_genetics|5": { + "hashes": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "9f6a6de16509b6d9", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-miscellaneous|5": { + "hashes": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "9194406d589f7c10", + "hash_cont_tokens": "7bf4341c79587250" + }, + "truncated": 0, + "non_truncated": 783, + "padded": 3132, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_disputes|5": { + "hashes": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "769486efc74d9f8e", + "hash_cont_tokens": "38a48e9de6976f00" + }, + "truncated": 0, + "non_truncated": 346, + "padded": 1384, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hashes": { + "hash_examples": "9873e077e83e0546", + 
"hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "a90fd4dd90959dad", + "hash_cont_tokens": "761c4dc187689d89" + }, + "truncated": 0, + "non_truncated": 895, + "padded": 3580, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-nutrition|5": { + "hashes": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "1a3b843e66efd29b", + "hash_cont_tokens": "65005bd7d6f6012a" + }, + "truncated": 0, + "non_truncated": 306, + "padded": 1224, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-philosophy|5": { + "hashes": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "09820001a3d00013", + "hash_cont_tokens": "0b47934fb6314dec" + }, + "truncated": 0, + "non_truncated": 311, + "padded": 1244, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-prehistory|5": { + "hashes": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "7c4ec364ce2768c7", + "hash_cont_tokens": "3f20acd855ee0a29" + }, + "truncated": 0, + "non_truncated": 324, + "padded": 1296, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_accounting|5": { + "hashes": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "ced0534574d0ae3f", + "hash_cont_tokens": "8f122ba881355d4b" + }, + "truncated": 0, + "non_truncated": 282, + "padded": 1128, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_law|5": { + "hashes": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "bcbdbbde22ec73e3", + "hash_cont_tokens": "90d5df417c4d3fd3" + }, + "truncated": 0, + "non_truncated": 1534, + "padded": 6136, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_medicine|5": { + "hashes": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "c54d753563114d45", + "hash_cont_tokens": "4a2d2988884f7f70" + }, + "truncated": 0, + "non_truncated": 272, + "padded": 1088, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_psychology|5": { + "hashes": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "b75dc55c0e32fa52", + "hash_cont_tokens": "e0a952cb8a9c81de" + }, + "truncated": 0, + "non_truncated": 612, + "padded": 2448, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-public_relations|5": { + "hashes": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "5ccdc8ec8db99622", + "hash_cont_tokens": "1fa77a8dff3922b8" + }, + "truncated": 0, + "non_truncated": 110, + "padded": 440, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-security_studies|5": { + "hashes": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "ca8497342e5b1d57", + "hash_cont_tokens": "81fc9cb3cbdd52db" + }, + "truncated": 0, + 
"non_truncated": 245, + "padded": 980, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-sociology|5": { + "hashes": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "069c76424fbd3dab", + "hash_cont_tokens": "2a0493252ed2cf43" + }, + "truncated": 0, + "non_truncated": 201, + "padded": 804, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hashes": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "a7e393a626169576", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-virology|5": { + "hashes": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "bf99dc973e3a650d", + "hash_cont_tokens": "5ab892d003b00c98" + }, + "truncated": 0, + "non_truncated": 166, + "padded": 664, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-world_religions|5": { + "hashes": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "1761cfaf21797065", + "hash_cont_tokens": "15a5e5dbdfbb8568" + }, + "truncated": 0, + "non_truncated": 171, + "padded": 684, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|truthfulqa:mc|0": { + "hashes": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "298b43914bbdf4ca", + "hash_cont_tokens": "5a8d4bb398b1c3c0" + }, + "truncated": 0, + "non_truncated": 817, + "padded": 9996, + "non_padded": 0, + "effective_few_shots": 0.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "31aa3477d959f771", + "hash_cont_tokens": "618558fb93c0f288" + }, + "truncated": 0, + "non_truncated": 1267, + "padded": 2534, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|drop|3": { + "hashes": { + "hash_examples": "1d27416e8324e9a3", + "hash_full_prompts": "a5513ff9a741b385", + "hash_input_tokens": "a4fb946366902edf", + "hash_cont_tokens": "2abb00306dcb3cb0" + }, + "truncated": 0, + "non_truncated": 9536, + "padded": 0, + "non_padded": 9536, + "effective_few_shots": 3.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "6af0ae8cfe684f50", + "hash_cont_tokens": "8c11605ef29a2575" + }, + "truncated": 0, + "non_truncated": 1319, + "padded": 0, + "non_padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "4eb459f19fc0f29d", + "hash_full_prompts": "21653ed56f202b4e", + "hash_input_tokens": "0ce409b3d436569d", + "hash_cont_tokens": "e2f9f5669a993584" + }, + "truncated": 0, + "non_truncated": 38195, + "padded": 113460, + "non_padded": 10948, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/uukuguy/airoboros-m-7b-3.1.2-dare-0.85/results_2023-11-23T23-04-08.316762.json 
b/eval-results/uukuguy/airoboros-m-7b-3.1.2-dare-0.85/results_2023-11-23T23-04-08.316762.json new file mode 100644 index 0000000000000000000000000000000000000000..af621f0408acf756140050b0119d5e43056e1fd6 --- /dev/null +++ b/eval-results/uukuguy/airoboros-m-7b-3.1.2-dare-0.85/results_2023-11-23T23-04-08.316762.json @@ -0,0 +1,1435 @@ +{ + "config_general": { + "lighteval_sha": "9ffc410f6c40b8cfefe7167cb47aefe69ced61e1", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "", + "start_time": 544080.112345947, + "end_time": 558549.418312155, + "total_evaluation_time_secondes": "14469.30596620799", + "model_name": "uukuguy/airoboros-m-7b-3.1.2-dare-0.85", + "model_sha": "b5bc02f4e1008bd3a72046a93ac2f4dd4bef02da", + "model_dtype": "torch.float16", + "model_size": "13.99 GB" + }, + "results": { + "harness|arc:challenge|25": { + "acc": 0.575938566552901, + "acc_stderr": 0.0144418896274644, + "acc_norm": 0.6109215017064846, + "acc_norm_stderr": 0.014247309976045607 + }, + "harness|hellaswag|10": { + "acc": 0.6330412268472416, + "acc_stderr": 0.00480990115123484, + "acc_norm": 0.8356901015733917, + "acc_norm_stderr": 0.003697992356124477 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.33, + "acc_stderr": 0.04725815626252606, + "acc_norm": 0.33, + "acc_norm_stderr": 0.04725815626252606 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.6222222222222222, + "acc_stderr": 0.04188307537595852, + "acc_norm": 0.6222222222222222, + "acc_norm_stderr": 0.04188307537595852 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.6644736842105263, + "acc_stderr": 0.03842498559395268, + "acc_norm": 0.6644736842105263, + "acc_norm_stderr": 0.03842498559395268 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.6, + "acc_stderr": 0.04923659639173309, + "acc_norm": 0.6, + "acc_norm_stderr": 0.04923659639173309 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.6830188679245283, + "acc_stderr": 0.02863723563980089, + "acc_norm": 0.6830188679245283, + "acc_norm_stderr": 0.02863723563980089 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.7291666666666666, + "acc_stderr": 0.03716177437566017, + "acc_norm": 0.7291666666666666, + "acc_norm_stderr": 0.03716177437566017 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.51, + "acc_stderr": 0.05024183937956911, + "acc_norm": 0.51, + "acc_norm_stderr": 0.05024183937956911 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.52, + "acc_stderr": 0.050211673156867795, + "acc_norm": 0.52, + "acc_norm_stderr": 0.050211673156867795 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.38, + "acc_stderr": 0.04878317312145633, + "acc_norm": 0.38, + "acc_norm_stderr": 0.04878317312145633 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.6647398843930635, + "acc_stderr": 0.03599586301247077, + "acc_norm": 0.6647398843930635, + "acc_norm_stderr": 0.03599586301247077 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.4117647058823529, + "acc_stderr": 0.04897104952726366, + "acc_norm": 0.4117647058823529, + "acc_norm_stderr": 0.04897104952726366 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.8, + "acc_stderr": 0.04020151261036845, + "acc_norm": 0.8, + "acc_norm_stderr": 0.04020151261036845 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.5702127659574469, + "acc_stderr": 0.03236214467715564, + "acc_norm": 0.5702127659574469, + "acc_norm_stderr": 
0.03236214467715564 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.49122807017543857, + "acc_stderr": 0.04702880432049615, + "acc_norm": 0.49122807017543857, + "acc_norm_stderr": 0.04702880432049615 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.5655172413793104, + "acc_stderr": 0.04130740879555497, + "acc_norm": 0.5655172413793104, + "acc_norm_stderr": 0.04130740879555497 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.3862433862433862, + "acc_stderr": 0.02507598176760168, + "acc_norm": 0.3862433862433862, + "acc_norm_stderr": 0.02507598176760168 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.3888888888888889, + "acc_stderr": 0.0436031486007746, + "acc_norm": 0.3888888888888889, + "acc_norm_stderr": 0.0436031486007746 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.39, + "acc_stderr": 0.04902071300001974, + "acc_norm": 0.39, + "acc_norm_stderr": 0.04902071300001974 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.7677419354838709, + "acc_stderr": 0.024022256130308235, + "acc_norm": 0.7677419354838709, + "acc_norm_stderr": 0.024022256130308235 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.5221674876847291, + "acc_stderr": 0.03514528562175008, + "acc_norm": 0.5221674876847291, + "acc_norm_stderr": 0.03514528562175008 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.68, + "acc_stderr": 0.04688261722621504, + "acc_norm": 0.68, + "acc_norm_stderr": 0.04688261722621504 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.7636363636363637, + "acc_stderr": 0.03317505930009182, + "acc_norm": 0.7636363636363637, + "acc_norm_stderr": 0.03317505930009182 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.7626262626262627, + "acc_stderr": 0.0303137105381989, + "acc_norm": 0.7626262626262627, + "acc_norm_stderr": 0.0303137105381989 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.8756476683937824, + "acc_stderr": 0.02381447708659355, + "acc_norm": 0.8756476683937824, + "acc_norm_stderr": 0.02381447708659355 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.658974358974359, + "acc_stderr": 0.02403548967633508, + "acc_norm": 0.658974358974359, + "acc_norm_stderr": 0.02403548967633508 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.34074074074074073, + "acc_stderr": 0.028897748741131143, + "acc_norm": 0.34074074074074073, + "acc_norm_stderr": 0.028897748741131143 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.6680672268907563, + "acc_stderr": 0.03058869701378364, + "acc_norm": 0.6680672268907563, + "acc_norm_stderr": 0.03058869701378364 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.3443708609271523, + "acc_stderr": 0.038796870240733264, + "acc_norm": 0.3443708609271523, + "acc_norm_stderr": 0.038796870240733264 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.8201834862385321, + "acc_stderr": 0.016465345467391545, + "acc_norm": 0.8201834862385321, + "acc_norm_stderr": 0.016465345467391545 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.5509259259259259, + "acc_stderr": 0.03392238405321617, + "acc_norm": 0.5509259259259259, + "acc_norm_stderr": 0.03392238405321617 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.7941176470588235, + "acc_stderr": 0.028379449451588667, + "acc_norm": 0.7941176470588235, + "acc_norm_stderr": 
0.028379449451588667 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.7763713080168776, + "acc_stderr": 0.027123298205229966, + "acc_norm": 0.7763713080168776, + "acc_norm_stderr": 0.027123298205229966 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.6771300448430493, + "acc_stderr": 0.03138147637575499, + "acc_norm": 0.6771300448430493, + "acc_norm_stderr": 0.03138147637575499 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.7786259541984732, + "acc_stderr": 0.0364129708131373, + "acc_norm": 0.7786259541984732, + "acc_norm_stderr": 0.0364129708131373 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.7851239669421488, + "acc_stderr": 0.037494924487096966, + "acc_norm": 0.7851239669421488, + "acc_norm_stderr": 0.037494924487096966 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.7777777777777778, + "acc_stderr": 0.040191074725573483, + "acc_norm": 0.7777777777777778, + "acc_norm_stderr": 0.040191074725573483 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.7791411042944786, + "acc_stderr": 0.03259177392742178, + "acc_norm": 0.7791411042944786, + "acc_norm_stderr": 0.03259177392742178 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.48214285714285715, + "acc_stderr": 0.047427623612430116, + "acc_norm": 0.48214285714285715, + "acc_norm_stderr": 0.047427623612430116 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.8252427184466019, + "acc_stderr": 0.03760178006026621, + "acc_norm": 0.8252427184466019, + "acc_norm_stderr": 0.03760178006026621 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.8846153846153846, + "acc_stderr": 0.020930193185179333, + "acc_norm": 0.8846153846153846, + "acc_norm_stderr": 0.020930193185179333 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.74, + "acc_stderr": 0.04408440022768078, + "acc_norm": 0.74, + "acc_norm_stderr": 0.04408440022768078 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.8135376756066411, + "acc_stderr": 0.013927751372001505, + "acc_norm": 0.8135376756066411, + "acc_norm_stderr": 0.013927751372001505 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.7138728323699421, + "acc_stderr": 0.02433214677913413, + "acc_norm": 0.7138728323699421, + "acc_norm_stderr": 0.02433214677913413 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.3027932960893855, + "acc_stderr": 0.01536686038639711, + "acc_norm": 0.3027932960893855, + "acc_norm_stderr": 0.01536686038639711 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.7679738562091504, + "acc_stderr": 0.024170840879340863, + "acc_norm": 0.7679738562091504, + "acc_norm_stderr": 0.024170840879340863 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.7041800643086816, + "acc_stderr": 0.02592237178881877, + "acc_norm": 0.7041800643086816, + "acc_norm_stderr": 0.02592237178881877 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.7376543209876543, + "acc_stderr": 0.024477222856135114, + "acc_norm": 0.7376543209876543, + "acc_norm_stderr": 0.024477222856135114 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.475177304964539, + "acc_stderr": 0.02979071924382972, + "acc_norm": 0.475177304964539, + "acc_norm_stderr": 0.02979071924382972 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.4471968709256845, + "acc_stderr": 0.012698825252435108, + "acc_norm": 0.4471968709256845, + "acc_norm_stderr": 0.012698825252435108 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 
0.6764705882352942, + "acc_stderr": 0.028418208619406752, + "acc_norm": 0.6764705882352942, + "acc_norm_stderr": 0.028418208619406752 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.6715686274509803, + "acc_stderr": 0.018999707383162673, + "acc_norm": 0.6715686274509803, + "acc_norm_stderr": 0.018999707383162673 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.6909090909090909, + "acc_stderr": 0.044262946482000985, + "acc_norm": 0.6909090909090909, + "acc_norm_stderr": 0.044262946482000985 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.7346938775510204, + "acc_stderr": 0.028263889943784593, + "acc_norm": 0.7346938775510204, + "acc_norm_stderr": 0.028263889943784593 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.8159203980099502, + "acc_stderr": 0.02740385941078685, + "acc_norm": 0.8159203980099502, + "acc_norm_stderr": 0.02740385941078685 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.84, + "acc_stderr": 0.036845294917747115, + "acc_norm": 0.84, + "acc_norm_stderr": 0.036845294917747115 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.5301204819277109, + "acc_stderr": 0.03885425420866767, + "acc_norm": 0.5301204819277109, + "acc_norm_stderr": 0.03885425420866767 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.8245614035087719, + "acc_stderr": 0.02917088550072767, + "acc_norm": 0.8245614035087719, + "acc_norm_stderr": 0.02917088550072767 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.2937576499388005, + "mc1_stderr": 0.015945068581236614, + "mc2": 0.43638896018594414, + "mc2_stderr": 0.01419131146424957 + }, + "harness|winogrande|5": { + "acc": 0.7837411207576953, + "acc_stderr": 0.01157061486140935 + }, + "harness|drop|3": { + "em": 0.0016778523489932886, + "em_stderr": 0.00041913301788268467, + "f1": 0.06136954697986581, + "f1_stderr": 0.0013699074965009578 + }, + "harness|gsm8k|5": { + "acc": 0.17437452615617893, + "acc_stderr": 0.010451421361976233 + }, + "all": { + "acc": 0.634042190068382, + "acc_stderr": 0.032283010718078695, + "acc_norm": 0.6433235552057146, + "acc_norm_stderr": 0.03298195134123534, + "mc1": 0.2937576499388005, + "mc1_stderr": 0.015945068581236614, + "mc2": 0.43638896018594414, + "mc2_stderr": 0.01419131146424957, + "em": 0.0016778523489932886, + "em_stderr": 0.00041913301788268467, + "f1": 0.06136954697986581, + "f1_stderr": 0.0013699074965009578 + } + }, + "versions": { + "all": 0, + "harness|arc:challenge|25": 0, + "harness|drop|3": 1, + "harness|gsm8k|5": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + 
"harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "harness|winogrande|5": 0 + }, + "config_tasks": { + "harness|arc:challenge": "LM Harness task", + "harness|drop": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + 
"harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|arc:challenge|25": { + "hashes": { + "hash_examples": "17b0cae357c0259e", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "9bcd0d1d37471713", + "hash_cont_tokens": "289aa98c400841d8" + }, + "truncated": 0, + "non_truncated": 1172, + "padded": 4670, + "non_padded": 17, + "effective_few_shots": 25.0, + "num_truncated_few_shots": 0 + }, + "harness|hellaswag|10": { + "hashes": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "80b8c6d79740318e", + "hash_cont_tokens": "ac460260c3e6efc9" + }, + "truncated": 0, + "non_truncated": 10042, + "padded": 40101, + "non_padded": 67, + "effective_few_shots": 10.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hashes": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "b813d36287c6556c", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-anatomy|5": { + "hashes": { + "hash_examples": "2f83a4f1cab4ba18", 
+ "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "09dc2380497f7a47", + "hash_cont_tokens": "a52a4f60d98cbe5c" + }, + "truncated": 0, + "non_truncated": 135, + "padded": 540, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-astronomy|5": { + "hashes": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "68ca3220b0fdd1f3", + "hash_cont_tokens": "10f7d8eeba97841d" + }, + "truncated": 0, + "non_truncated": 152, + "padded": 608, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-business_ethics|5": { + "hashes": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "bd14ef1320de241e", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hashes": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "d96186ab98017c43", + "hash_cont_tokens": "edef9975ba9165b5" + }, + "truncated": 0, + "non_truncated": 265, + "padded": 1060, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_biology|5": { + "hashes": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "424136b34e95b200", + "hash_cont_tokens": "0aa103ec6602280b" + }, + "truncated": 0, + "non_truncated": 144, + "padded": 576, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_chemistry|5": { + "hashes": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "8dd8b80e336bbe54", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_computer_science|5": { + "hashes": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "145d4cef8ca2261d", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_mathematics|5": { + "hashes": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "561995d32d2b25c4", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_medicine|5": { + "hashes": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "6a258a9d4418599c", + "hash_cont_tokens": "1979021dbc698754" + }, + "truncated": 0, + "non_truncated": 173, + "padded": 692, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_physics|5": { + "hashes": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "fa5e0d5b5f97b66a", + "hash_cont_tokens": "7cf7fe2bab00acbd" + }, + "truncated": 0, + 
"non_truncated": 102, + "padded": 408, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-computer_security|5": { + "hashes": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "07d27397edfae492", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hashes": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "da5e6c3c8eb17da6", + "hash_cont_tokens": "903f64eed2b0d217" + }, + "truncated": 0, + "non_truncated": 235, + "padded": 940, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-econometrics|5": { + "hashes": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "f6ba8e358bdb523e", + "hash_cont_tokens": "721ae6c5302c4bf2" + }, + "truncated": 0, + "non_truncated": 114, + "padded": 456, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hashes": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "b2459da4c5ca8590", + "hash_cont_tokens": "15a738960ed3e587" + }, + "truncated": 0, + "non_truncated": 145, + "padded": 575, + "non_padded": 5, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hashes": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "0b969d9ad706a13a", + "hash_cont_tokens": "c96470462fc71683" + }, + "truncated": 0, + "non_truncated": 378, + "padded": 1512, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-formal_logic|5": { + "hashes": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "02bc3eb5f90da86e", + "hash_cont_tokens": "0e1ce025c9d6ee7e" + }, + "truncated": 0, + "non_truncated": 126, + "padded": 504, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-global_facts|5": { + "hashes": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "3d5106918bcbeb43", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_biology|5": { + "hashes": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "7b089392db2dabbd", + "hash_cont_tokens": "e34d57f7d3c4ca16" + }, + "truncated": 0, + "non_truncated": 310, + "padded": 1240, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hashes": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "ba90b2ffed1c067d", + "hash_cont_tokens": "e8482d44df4b3740" + }, + "truncated": 0, + "non_truncated": 203, + "padded": 812, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + 
"harness|hendrycksTest-high_school_computer_science|5": { + "hashes": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "60eeec309ef0717f", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hashes": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "5e5e8bf3808e0ead", + "hash_cont_tokens": "d63e679a49418339" + }, + "truncated": 0, + "non_truncated": 165, + "padded": 656, + "non_padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_geography|5": { + "hashes": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "4da9b741d4e7ea78", + "hash_cont_tokens": "d78483e286d06f1a" + }, + "truncated": 0, + "non_truncated": 198, + "padded": 792, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hashes": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "acb4bc872ac86ed7", + "hash_cont_tokens": "691cdff71ff5fe57" + }, + "truncated": 0, + "non_truncated": 193, + "padded": 772, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hashes": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "840fc6403eb69ab0", + "hash_cont_tokens": "d5ad4c5bdca967ad" + }, + "truncated": 0, + "non_truncated": 390, + "padded": 1560, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hashes": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "3629a7f2cd17faeb", + "hash_cont_tokens": "8f631ca5687dd0d4" + }, + "truncated": 0, + "non_truncated": 270, + "padded": 1080, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hashes": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "6846f684260e3997", + "hash_cont_tokens": "7321048a28451473" + }, + "truncated": 0, + "non_truncated": 238, + "padded": 952, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_physics|5": { + "hashes": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "85aee25d6bdad94a", + "hash_cont_tokens": "bb137581f269861c" + }, + "truncated": 0, + "non_truncated": 151, + "padded": 604, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hashes": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "290b66d6d666a35f", + "hash_cont_tokens": "b455cab2675bd863" + }, + "truncated": 0, + "non_truncated": 545, + "padded": 2180, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hashes": { + 
"hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "a77a7668b437bc82", + "hash_cont_tokens": "1b3196fec7e58037" + }, + "truncated": 0, + "non_truncated": 216, + "padded": 864, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hashes": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "63548c7fa9ba7a78", + "hash_cont_tokens": "a331dedc2aa01b3e" + }, + "truncated": 0, + "non_truncated": 204, + "padded": 816, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hashes": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "83c5da18bfa50812", + "hash_cont_tokens": "d0fbe030b8c8c2bf" + }, + "truncated": 0, + "non_truncated": 237, + "padded": 948, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_aging|5": { + "hashes": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "bebbd11f22006685", + "hash_cont_tokens": "1dd29c3755494850" + }, + "truncated": 0, + "non_truncated": 223, + "padded": 892, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_sexuality|5": { + "hashes": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "7b85ee9b8ee54f4f", + "hash_cont_tokens": "c85573f663c10691" + }, + "truncated": 0, + "non_truncated": 131, + "padded": 524, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-international_law|5": { + "hashes": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "7bfc55ab7065943e", + "hash_cont_tokens": "d263804ba918154f" + }, + "truncated": 0, + "non_truncated": 121, + "padded": 484, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-jurisprudence|5": { + "hashes": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "69573f1675e053c6", + "hash_cont_tokens": "581986691a84ece8" + }, + "truncated": 0, + "non_truncated": 108, + "padded": 432, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hashes": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "552324ef20094bdc", + "hash_cont_tokens": "55a858b28bbda458" + }, + "truncated": 0, + "non_truncated": 163, + "padded": 652, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-machine_learning|5": { + "hashes": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "96449357a7318905", + "hash_cont_tokens": "e99d3d3efd4ac7a3" + }, + "truncated": 0, + "non_truncated": 112, + "padded": 448, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-management|5": { + "hashes": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "3b849249168e3b88", + "hash_cont_tokens": 
"13d9dc56bca34726" + }, + "truncated": 0, + "non_truncated": 103, + "padded": 412, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-marketing|5": { + "hashes": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "af0e186f2756b70d", + "hash_cont_tokens": "2700ea26933916a2" + }, + "truncated": 0, + "non_truncated": 234, + "padded": 936, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-medical_genetics|5": { + "hashes": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "9f6a6de16509b6d9", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-miscellaneous|5": { + "hashes": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "9194406d589f7c10", + "hash_cont_tokens": "7bf4341c79587250" + }, + "truncated": 0, + "non_truncated": 783, + "padded": 3132, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_disputes|5": { + "hashes": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "769486efc74d9f8e", + "hash_cont_tokens": "38a48e9de6976f00" + }, + "truncated": 0, + "non_truncated": 346, + "padded": 1384, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hashes": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "a90fd4dd90959dad", + "hash_cont_tokens": "761c4dc187689d89" + }, + "truncated": 0, + "non_truncated": 895, + "padded": 3580, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-nutrition|5": { + "hashes": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "1a3b843e66efd29b", + "hash_cont_tokens": "65005bd7d6f6012a" + }, + "truncated": 0, + "non_truncated": 306, + "padded": 1224, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-philosophy|5": { + "hashes": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "09820001a3d00013", + "hash_cont_tokens": "0b47934fb6314dec" + }, + "truncated": 0, + "non_truncated": 311, + "padded": 1244, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-prehistory|5": { + "hashes": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "7c4ec364ce2768c7", + "hash_cont_tokens": "3f20acd855ee0a29" + }, + "truncated": 0, + "non_truncated": 324, + "padded": 1296, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_accounting|5": { + "hashes": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "ced0534574d0ae3f", + "hash_cont_tokens": "8f122ba881355d4b" + }, + "truncated": 0, + "non_truncated": 282, + "padded": 1128, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + 
"harness|hendrycksTest-professional_law|5": { + "hashes": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "bcbdbbde22ec73e3", + "hash_cont_tokens": "90d5df417c4d3fd3" + }, + "truncated": 0, + "non_truncated": 1534, + "padded": 6136, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_medicine|5": { + "hashes": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "c54d753563114d45", + "hash_cont_tokens": "4a2d2988884f7f70" + }, + "truncated": 0, + "non_truncated": 272, + "padded": 1088, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_psychology|5": { + "hashes": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "b75dc55c0e32fa52", + "hash_cont_tokens": "e0a952cb8a9c81de" + }, + "truncated": 0, + "non_truncated": 612, + "padded": 2448, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-public_relations|5": { + "hashes": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "5ccdc8ec8db99622", + "hash_cont_tokens": "1fa77a8dff3922b8" + }, + "truncated": 0, + "non_truncated": 110, + "padded": 440, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-security_studies|5": { + "hashes": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "ca8497342e5b1d57", + "hash_cont_tokens": "81fc9cb3cbdd52db" + }, + "truncated": 0, + "non_truncated": 245, + "padded": 980, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-sociology|5": { + "hashes": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "069c76424fbd3dab", + "hash_cont_tokens": "2a0493252ed2cf43" + }, + "truncated": 0, + "non_truncated": 201, + "padded": 804, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hashes": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "a7e393a626169576", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-virology|5": { + "hashes": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "bf99dc973e3a650d", + "hash_cont_tokens": "5ab892d003b00c98" + }, + "truncated": 0, + "non_truncated": 166, + "padded": 664, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-world_religions|5": { + "hashes": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "1761cfaf21797065", + "hash_cont_tokens": "15a5e5dbdfbb8568" + }, + "truncated": 0, + "non_truncated": 171, + "padded": 684, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|truthfulqa:mc|0": { + "hashes": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": 
"298b43914bbdf4ca", + "hash_cont_tokens": "5a8d4bb398b1c3c0" + }, + "truncated": 0, + "non_truncated": 817, + "padded": 9996, + "non_padded": 0, + "effective_few_shots": 0.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "31aa3477d959f771", + "hash_cont_tokens": "618558fb93c0f288" + }, + "truncated": 0, + "non_truncated": 1267, + "padded": 2534, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|drop|3": { + "hashes": { + "hash_examples": "1d27416e8324e9a3", + "hash_full_prompts": "a5513ff9a741b385", + "hash_input_tokens": "a4fb946366902edf", + "hash_cont_tokens": "87ed727b517e9594" + }, + "truncated": 0, + "non_truncated": 9536, + "padded": 0, + "non_padded": 9536, + "effective_few_shots": 3.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "6af0ae8cfe684f50", + "hash_cont_tokens": "a044232ed4357d47" + }, + "truncated": 0, + "non_truncated": 1319, + "padded": 0, + "non_padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "4eb459f19fc0f29d", + "hash_full_prompts": "21653ed56f202b4e", + "hash_input_tokens": "0ce409b3d436569d", + "hash_cont_tokens": "45b97eea4f263eda" + }, + "truncated": 0, + "non_truncated": 38195, + "padded": 113460, + "non_padded": 10948, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/uukuguy/neural-chat-7b-v3-1-dare-0.85/results_2023-12-07T21-41-35.710987.json b/eval-results/uukuguy/neural-chat-7b-v3-1-dare-0.85/results_2023-12-07T21-41-35.710987.json new file mode 100644 index 0000000000000000000000000000000000000000..ef9a8c50d2c40d762ec7e59a216e106eddca6fa4 --- /dev/null +++ b/eval-results/uukuguy/neural-chat-7b-v3-1-dare-0.85/results_2023-12-07T21-41-35.710987.json @@ -0,0 +1,1409 @@ +{ + "config_general": { + "lighteval_sha": "0e4607eff593f6f842aeaa0e5fa6760f58b9d1e9", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "", + "start_time": 432809.207414669, + "end_time": 440172.651820051, + "total_evaluation_time_secondes": "7363.444405382033", + "model_name": "uukuguy/neural-chat-7b-v3-1-dare-0.85", + "model_sha": "3c15d3e2a7790e45501e105daed5eb88b665ceef", + "model_dtype": "torch.bfloat16", + "model_size": "13.99 GB" + }, + "results": { + "harness|arc:challenge|25": { + "acc": 0.5776450511945392, + "acc_stderr": 0.01443413871337998, + "acc_norm": 0.6194539249146758, + "acc_norm_stderr": 0.014188277712349812 + }, + "harness|hellaswag|10": { + "acc": 0.6362278430591516, + "acc_stderr": 0.0048010096576904405, + "acc_norm": 0.8383788090021908, + "acc_norm_stderr": 0.0036735065123709503 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.32, + "acc_stderr": 0.04688261722621503, + "acc_norm": 0.32, + "acc_norm_stderr": 0.04688261722621503 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.6444444444444445, + "acc_stderr": 0.04135176749720385, + "acc_norm": 0.6444444444444445, + "acc_norm_stderr": 0.04135176749720385 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.6710526315789473, + "acc_stderr": 0.03823428969926604, + "acc_norm": 0.6710526315789473, + "acc_norm_stderr": 0.03823428969926604 + }, + "harness|hendrycksTest-business_ethics|5": { + 
"acc": 0.57, + "acc_stderr": 0.049756985195624284, + "acc_norm": 0.57, + "acc_norm_stderr": 0.049756985195624284 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.6981132075471698, + "acc_stderr": 0.02825420034443866, + "acc_norm": 0.6981132075471698, + "acc_norm_stderr": 0.02825420034443866 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.7222222222222222, + "acc_stderr": 0.03745554791462456, + "acc_norm": 0.7222222222222222, + "acc_norm_stderr": 0.03745554791462456 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.5, + "acc_stderr": 0.050251890762960605, + "acc_norm": 0.5, + "acc_norm_stderr": 0.050251890762960605 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.54, + "acc_stderr": 0.05009082659620332, + "acc_norm": 0.54, + "acc_norm_stderr": 0.05009082659620332 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.37, + "acc_stderr": 0.04852365870939099, + "acc_norm": 0.37, + "acc_norm_stderr": 0.04852365870939099 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.6647398843930635, + "acc_stderr": 0.03599586301247077, + "acc_norm": 0.6647398843930635, + "acc_norm_stderr": 0.03599586301247077 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.37254901960784315, + "acc_stderr": 0.04810840148082637, + "acc_norm": 0.37254901960784315, + "acc_norm_stderr": 0.04810840148082637 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.81, + "acc_stderr": 0.03942772444036624, + "acc_norm": 0.81, + "acc_norm_stderr": 0.03942772444036624 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.5659574468085107, + "acc_stderr": 0.03240038086792747, + "acc_norm": 0.5659574468085107, + "acc_norm_stderr": 0.03240038086792747 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.4824561403508772, + "acc_stderr": 0.04700708033551038, + "acc_norm": 0.4824561403508772, + "acc_norm_stderr": 0.04700708033551038 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.5655172413793104, + "acc_stderr": 0.04130740879555498, + "acc_norm": 0.5655172413793104, + "acc_norm_stderr": 0.04130740879555498 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.4021164021164021, + "acc_stderr": 0.02525303255499769, + "acc_norm": 0.4021164021164021, + "acc_norm_stderr": 0.02525303255499769 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.4126984126984127, + "acc_stderr": 0.04403438954768177, + "acc_norm": 0.4126984126984127, + "acc_norm_stderr": 0.04403438954768177 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.41, + "acc_stderr": 0.04943110704237102, + "acc_norm": 0.41, + "acc_norm_stderr": 0.04943110704237102 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.7774193548387097, + "acc_stderr": 0.023664216671642518, + "acc_norm": 0.7774193548387097, + "acc_norm_stderr": 0.023664216671642518 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.5221674876847291, + "acc_stderr": 0.03514528562175008, + "acc_norm": 0.5221674876847291, + "acc_norm_stderr": 0.03514528562175008 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.69, + "acc_stderr": 0.04648231987117316, + "acc_norm": 0.69, + "acc_norm_stderr": 0.04648231987117316 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.7818181818181819, + "acc_stderr": 0.032250781083062896, + "acc_norm": 0.7818181818181819, + "acc_norm_stderr": 0.032250781083062896 + }, + "harness|hendrycksTest-high_school_geography|5": { + 
"acc": 0.797979797979798, + "acc_stderr": 0.028606204289229872, + "acc_norm": 0.797979797979798, + "acc_norm_stderr": 0.028606204289229872 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.8756476683937824, + "acc_stderr": 0.02381447708659355, + "acc_norm": 0.8756476683937824, + "acc_norm_stderr": 0.02381447708659355 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.6666666666666666, + "acc_stderr": 0.023901157979402534, + "acc_norm": 0.6666666666666666, + "acc_norm_stderr": 0.023901157979402534 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.3592592592592593, + "acc_stderr": 0.029252905927251976, + "acc_norm": 0.3592592592592593, + "acc_norm_stderr": 0.029252905927251976 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.6386554621848739, + "acc_stderr": 0.03120469122515002, + "acc_norm": 0.6386554621848739, + "acc_norm_stderr": 0.03120469122515002 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.304635761589404, + "acc_stderr": 0.03757949922943343, + "acc_norm": 0.304635761589404, + "acc_norm_stderr": 0.03757949922943343 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.8256880733944955, + "acc_stderr": 0.016265675632010354, + "acc_norm": 0.8256880733944955, + "acc_norm_stderr": 0.016265675632010354 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.5324074074074074, + "acc_stderr": 0.03402801581358966, + "acc_norm": 0.5324074074074074, + "acc_norm_stderr": 0.03402801581358966 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.8088235294117647, + "acc_stderr": 0.027599174300640766, + "acc_norm": 0.8088235294117647, + "acc_norm_stderr": 0.027599174300640766 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.7679324894514767, + "acc_stderr": 0.02747974455080851, + "acc_norm": 0.7679324894514767, + "acc_norm_stderr": 0.02747974455080851 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.7040358744394619, + "acc_stderr": 0.0306365913486998, + "acc_norm": 0.7040358744394619, + "acc_norm_stderr": 0.0306365913486998 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.7938931297709924, + "acc_stderr": 0.03547771004159463, + "acc_norm": 0.7938931297709924, + "acc_norm_stderr": 0.03547771004159463 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.8099173553719008, + "acc_stderr": 0.03581796951709282, + "acc_norm": 0.8099173553719008, + "acc_norm_stderr": 0.03581796951709282 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.7870370370370371, + "acc_stderr": 0.0395783547198098, + "acc_norm": 0.7870370370370371, + "acc_norm_stderr": 0.0395783547198098 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.803680981595092, + "acc_stderr": 0.031207970394709218, + "acc_norm": 0.803680981595092, + "acc_norm_stderr": 0.031207970394709218 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.5178571428571429, + "acc_stderr": 0.047427623612430116, + "acc_norm": 0.5178571428571429, + "acc_norm_stderr": 0.047427623612430116 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.8058252427184466, + "acc_stderr": 0.03916667762822585, + "acc_norm": 0.8058252427184466, + "acc_norm_stderr": 0.03916667762822585 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.8717948717948718, + "acc_stderr": 0.02190190511507333, + "acc_norm": 0.8717948717948718, + "acc_norm_stderr": 0.02190190511507333 + }, + "harness|hendrycksTest-medical_genetics|5": { + 
"acc": 0.74, + "acc_stderr": 0.04408440022768078, + "acc_norm": 0.74, + "acc_norm_stderr": 0.04408440022768078 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.8135376756066411, + "acc_stderr": 0.013927751372001505, + "acc_norm": 0.8135376756066411, + "acc_norm_stderr": 0.013927751372001505 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.7254335260115607, + "acc_stderr": 0.02402774515526502, + "acc_norm": 0.7254335260115607, + "acc_norm_stderr": 0.02402774515526502 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.2916201117318436, + "acc_stderr": 0.015201032512520429, + "acc_norm": 0.2916201117318436, + "acc_norm_stderr": 0.015201032512520429 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.7712418300653595, + "acc_stderr": 0.024051029739912258, + "acc_norm": 0.7712418300653595, + "acc_norm_stderr": 0.024051029739912258 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.6977491961414791, + "acc_stderr": 0.02608270069539966, + "acc_norm": 0.6977491961414791, + "acc_norm_stderr": 0.02608270069539966 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.7222222222222222, + "acc_stderr": 0.024922001168886335, + "acc_norm": 0.7222222222222222, + "acc_norm_stderr": 0.024922001168886335 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.5035460992907801, + "acc_stderr": 0.02982674915328092, + "acc_norm": 0.5035460992907801, + "acc_norm_stderr": 0.02982674915328092 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.4452411994784876, + "acc_stderr": 0.012693421303973294, + "acc_norm": 0.4452411994784876, + "acc_norm_stderr": 0.012693421303973294 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.6764705882352942, + "acc_stderr": 0.02841820861940676, + "acc_norm": 0.6764705882352942, + "acc_norm_stderr": 0.02841820861940676 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.6781045751633987, + "acc_stderr": 0.01890101532209309, + "acc_norm": 0.6781045751633987, + "acc_norm_stderr": 0.01890101532209309 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.6454545454545455, + "acc_stderr": 0.045820048415054174, + "acc_norm": 0.6454545454545455, + "acc_norm_stderr": 0.045820048415054174 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.7428571428571429, + "acc_stderr": 0.02797982353874455, + "acc_norm": 0.7428571428571429, + "acc_norm_stderr": 0.02797982353874455 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.8557213930348259, + "acc_stderr": 0.024845753212306046, + "acc_norm": 0.8557213930348259, + "acc_norm_stderr": 0.024845753212306046 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.88, + "acc_stderr": 0.03265986323710906, + "acc_norm": 0.88, + "acc_norm_stderr": 0.03265986323710906 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.5481927710843374, + "acc_stderr": 0.03874371556587953, + "acc_norm": 0.5481927710843374, + "acc_norm_stderr": 0.03874371556587953 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.8245614035087719, + "acc_stderr": 0.029170885500727665, + "acc_norm": 0.8245614035087719, + "acc_norm_stderr": 0.029170885500727665 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.3047735618115055, + "mc1_stderr": 0.016114124156882455, + "mc2": 0.4490316447301984, + "mc2_stderr": 0.014392988561545562 + }, + "harness|winogrande|5": { + "acc": 0.7916337805840569, + "acc_stderr": 0.011414554399987727 + }, + "harness|gsm8k|5": { + "acc": 0.42153146322971946, + "acc_stderr": 0.013601824409483272 + }, + 
"all": { + "acc": 0.6418688892912601, + "acc_stderr": 0.032096127796783475, + "acc_norm": 0.6471999464850214, + "acc_norm_stderr": 0.03273684834996561, + "mc1": 0.3047735618115055, + "mc1_stderr": 0.016114124156882455, + "mc2": 0.4490316447301984, + "mc2_stderr": 0.014392988561545562 + } + }, + "versions": { + "all": 0, + "harness|arc:challenge|25": 0, + "harness|gsm8k|5": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "harness|winogrande|5": 0 + }, + "config_tasks": { + "harness|arc:challenge": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + 
"harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + 
"harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|arc:challenge|25": { + "hashes": { + "hash_examples": "17b0cae357c0259e", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "9bcd0d1d37471713", + "hash_cont_tokens": "289aa98c400841d8" + }, + "truncated": 0, + "non_truncated": 1172, + "padded": 4670, + "non_padded": 17, + "effective_few_shots": 25.0, + "num_truncated_few_shots": 0 + }, + "harness|hellaswag|10": { + "hashes": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "80b8c6d79740318e", + "hash_cont_tokens": "ac460260c3e6efc9" + }, + "truncated": 0, + "non_truncated": 10042, + "padded": 40101, + "non_padded": 67, + "effective_few_shots": 10.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hashes": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "b813d36287c6556c", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-anatomy|5": { + "hashes": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "09dc2380497f7a47", + "hash_cont_tokens": "a52a4f60d98cbe5c" + }, + "truncated": 0, + "non_truncated": 135, + "padded": 540, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-astronomy|5": { + "hashes": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "68ca3220b0fdd1f3", + "hash_cont_tokens": "10f7d8eeba97841d" + }, + "truncated": 0, + "non_truncated": 152, + "padded": 608, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-business_ethics|5": { + "hashes": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "bd14ef1320de241e", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hashes": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "d96186ab98017c43", + "hash_cont_tokens": "edef9975ba9165b5" + }, + "truncated": 0, + "non_truncated": 265, + "padded": 1060, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_biology|5": { + "hashes": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "424136b34e95b200", + "hash_cont_tokens": "0aa103ec6602280b" + }, + "truncated": 0, + "non_truncated": 144, + "padded": 576, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_chemistry|5": { + "hashes": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "8dd8b80e336bbe54", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + 
"harness|hendrycksTest-college_computer_science|5": { + "hashes": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "145d4cef8ca2261d", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_mathematics|5": { + "hashes": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "561995d32d2b25c4", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_medicine|5": { + "hashes": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "6a258a9d4418599c", + "hash_cont_tokens": "1979021dbc698754" + }, + "truncated": 0, + "non_truncated": 173, + "padded": 692, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_physics|5": { + "hashes": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "fa5e0d5b5f97b66a", + "hash_cont_tokens": "7cf7fe2bab00acbd" + }, + "truncated": 0, + "non_truncated": 102, + "padded": 408, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-computer_security|5": { + "hashes": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "07d27397edfae492", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hashes": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "da5e6c3c8eb17da6", + "hash_cont_tokens": "903f64eed2b0d217" + }, + "truncated": 0, + "non_truncated": 235, + "padded": 940, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-econometrics|5": { + "hashes": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "f6ba8e358bdb523e", + "hash_cont_tokens": "721ae6c5302c4bf2" + }, + "truncated": 0, + "non_truncated": 114, + "padded": 456, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hashes": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "b2459da4c5ca8590", + "hash_cont_tokens": "15a738960ed3e587" + }, + "truncated": 0, + "non_truncated": 145, + "padded": 575, + "non_padded": 5, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hashes": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "0b969d9ad706a13a", + "hash_cont_tokens": "c96470462fc71683" + }, + "truncated": 0, + "non_truncated": 378, + "padded": 1512, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-formal_logic|5": { + "hashes": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + 
"hash_input_tokens": "02bc3eb5f90da86e", + "hash_cont_tokens": "0e1ce025c9d6ee7e" + }, + "truncated": 0, + "non_truncated": 126, + "padded": 504, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-global_facts|5": { + "hashes": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "3d5106918bcbeb43", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_biology|5": { + "hashes": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "7b089392db2dabbd", + "hash_cont_tokens": "e34d57f7d3c4ca16" + }, + "truncated": 0, + "non_truncated": 310, + "padded": 1240, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hashes": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "ba90b2ffed1c067d", + "hash_cont_tokens": "e8482d44df4b3740" + }, + "truncated": 0, + "non_truncated": 203, + "padded": 812, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hashes": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "60eeec309ef0717f", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hashes": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "5e5e8bf3808e0ead", + "hash_cont_tokens": "d63e679a49418339" + }, + "truncated": 0, + "non_truncated": 165, + "padded": 656, + "non_padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_geography|5": { + "hashes": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "4da9b741d4e7ea78", + "hash_cont_tokens": "d78483e286d06f1a" + }, + "truncated": 0, + "non_truncated": 198, + "padded": 792, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hashes": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "acb4bc872ac86ed7", + "hash_cont_tokens": "691cdff71ff5fe57" + }, + "truncated": 0, + "non_truncated": 193, + "padded": 772, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hashes": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "840fc6403eb69ab0", + "hash_cont_tokens": "d5ad4c5bdca967ad" + }, + "truncated": 0, + "non_truncated": 390, + "padded": 1560, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hashes": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "3629a7f2cd17faeb", + "hash_cont_tokens": "8f631ca5687dd0d4" + }, + 
"truncated": 0, + "non_truncated": 270, + "padded": 1080, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hashes": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "6846f684260e3997", + "hash_cont_tokens": "7321048a28451473" + }, + "truncated": 0, + "non_truncated": 238, + "padded": 952, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_physics|5": { + "hashes": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "85aee25d6bdad94a", + "hash_cont_tokens": "bb137581f269861c" + }, + "truncated": 0, + "non_truncated": 151, + "padded": 604, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hashes": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "290b66d6d666a35f", + "hash_cont_tokens": "b455cab2675bd863" + }, + "truncated": 0, + "non_truncated": 545, + "padded": 2180, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hashes": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "a77a7668b437bc82", + "hash_cont_tokens": "1b3196fec7e58037" + }, + "truncated": 0, + "non_truncated": 216, + "padded": 864, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hashes": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "63548c7fa9ba7a78", + "hash_cont_tokens": "a331dedc2aa01b3e" + }, + "truncated": 0, + "non_truncated": 204, + "padded": 816, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hashes": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "83c5da18bfa50812", + "hash_cont_tokens": "d0fbe030b8c8c2bf" + }, + "truncated": 0, + "non_truncated": 237, + "padded": 948, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_aging|5": { + "hashes": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "bebbd11f22006685", + "hash_cont_tokens": "1dd29c3755494850" + }, + "truncated": 0, + "non_truncated": 223, + "padded": 892, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_sexuality|5": { + "hashes": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "7b85ee9b8ee54f4f", + "hash_cont_tokens": "c85573f663c10691" + }, + "truncated": 0, + "non_truncated": 131, + "padded": 524, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-international_law|5": { + "hashes": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "7bfc55ab7065943e", + "hash_cont_tokens": "d263804ba918154f" + }, + "truncated": 0, + "non_truncated": 121, + "padded": 484, + "non_padded": 0, + "effective_few_shots": 5.0, + 
"num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-jurisprudence|5": { + "hashes": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "69573f1675e053c6", + "hash_cont_tokens": "581986691a84ece8" + }, + "truncated": 0, + "non_truncated": 108, + "padded": 432, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hashes": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "552324ef20094bdc", + "hash_cont_tokens": "55a858b28bbda458" + }, + "truncated": 0, + "non_truncated": 163, + "padded": 652, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-machine_learning|5": { + "hashes": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "96449357a7318905", + "hash_cont_tokens": "e99d3d3efd4ac7a3" + }, + "truncated": 0, + "non_truncated": 112, + "padded": 448, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-management|5": { + "hashes": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "3b849249168e3b88", + "hash_cont_tokens": "13d9dc56bca34726" + }, + "truncated": 0, + "non_truncated": 103, + "padded": 412, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-marketing|5": { + "hashes": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "af0e186f2756b70d", + "hash_cont_tokens": "2700ea26933916a2" + }, + "truncated": 0, + "non_truncated": 234, + "padded": 936, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-medical_genetics|5": { + "hashes": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "9f6a6de16509b6d9", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-miscellaneous|5": { + "hashes": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "9194406d589f7c10", + "hash_cont_tokens": "7bf4341c79587250" + }, + "truncated": 0, + "non_truncated": 783, + "padded": 3132, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_disputes|5": { + "hashes": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "769486efc74d9f8e", + "hash_cont_tokens": "38a48e9de6976f00" + }, + "truncated": 0, + "non_truncated": 346, + "padded": 1384, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hashes": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "a90fd4dd90959dad", + "hash_cont_tokens": "761c4dc187689d89" + }, + "truncated": 0, + "non_truncated": 895, + "padded": 3580, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-nutrition|5": { + "hashes": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + 
"hash_input_tokens": "1a3b843e66efd29b", + "hash_cont_tokens": "65005bd7d6f6012a" + }, + "truncated": 0, + "non_truncated": 306, + "padded": 1224, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-philosophy|5": { + "hashes": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "09820001a3d00013", + "hash_cont_tokens": "0b47934fb6314dec" + }, + "truncated": 0, + "non_truncated": 311, + "padded": 1244, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-prehistory|5": { + "hashes": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "7c4ec364ce2768c7", + "hash_cont_tokens": "3f20acd855ee0a29" + }, + "truncated": 0, + "non_truncated": 324, + "padded": 1296, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_accounting|5": { + "hashes": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "ced0534574d0ae3f", + "hash_cont_tokens": "8f122ba881355d4b" + }, + "truncated": 0, + "non_truncated": 282, + "padded": 1128, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_law|5": { + "hashes": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "bcbdbbde22ec73e3", + "hash_cont_tokens": "90d5df417c4d3fd3" + }, + "truncated": 0, + "non_truncated": 1534, + "padded": 6136, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_medicine|5": { + "hashes": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "c54d753563114d45", + "hash_cont_tokens": "4a2d2988884f7f70" + }, + "truncated": 0, + "non_truncated": 272, + "padded": 1088, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_psychology|5": { + "hashes": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "b75dc55c0e32fa52", + "hash_cont_tokens": "e0a952cb8a9c81de" + }, + "truncated": 0, + "non_truncated": 612, + "padded": 2448, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-public_relations|5": { + "hashes": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "5ccdc8ec8db99622", + "hash_cont_tokens": "1fa77a8dff3922b8" + }, + "truncated": 0, + "non_truncated": 110, + "padded": 440, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-security_studies|5": { + "hashes": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "ca8497342e5b1d57", + "hash_cont_tokens": "81fc9cb3cbdd52db" + }, + "truncated": 0, + "non_truncated": 245, + "padded": 980, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-sociology|5": { + "hashes": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "069c76424fbd3dab", + "hash_cont_tokens": "2a0493252ed2cf43" + }, + "truncated": 0, + "non_truncated": 201, + "padded": 804, + 
"non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hashes": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "a7e393a626169576", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-virology|5": { + "hashes": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "bf99dc973e3a650d", + "hash_cont_tokens": "5ab892d003b00c98" + }, + "truncated": 0, + "non_truncated": 166, + "padded": 664, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-world_religions|5": { + "hashes": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "1761cfaf21797065", + "hash_cont_tokens": "15a5e5dbdfbb8568" + }, + "truncated": 0, + "non_truncated": 171, + "padded": 684, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|truthfulqa:mc|0": { + "hashes": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "298b43914bbdf4ca", + "hash_cont_tokens": "5a8d4bb398b1c3c0" + }, + "truncated": 0, + "non_truncated": 817, + "padded": 9996, + "non_padded": 0, + "effective_few_shots": 0.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "31aa3477d959f771", + "hash_cont_tokens": "618558fb93c0f288" + }, + "truncated": 0, + "non_truncated": 1267, + "padded": 2534, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "6af0ae8cfe684f50", + "hash_cont_tokens": "2ee42e3af4d9ec13" + }, + "truncated": 0, + "non_truncated": 1319, + "padded": 0, + "non_padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "3b7fa57a057f9415", + "hash_full_prompts": "63615fc50fc9417c", + "hash_input_tokens": "9c04e828ae29cacc", + "hash_cont_tokens": "a1325707f3703c05" + }, + "truncated": 0, + "non_truncated": 28659, + "padded": 113460, + "non_padded": 1412, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/uukuguy/speechless-code-mistral-7b-v1.0/results_2023-10-10T09-35-40.611521.json b/eval-results/uukuguy/speechless-code-mistral-7b-v1.0/results_2023-10-10T09-35-40.611521.json new file mode 100644 index 0000000000000000000000000000000000000000..09ca27fee7fba7de9e3da0c60ed497c6911a37d9 --- /dev/null +++ b/eval-results/uukuguy/speechless-code-mistral-7b-v1.0/results_2023-10-10T09-35-40.611521.json @@ -0,0 +1,1367 @@ +{ + "config_general": { + "model_name": "uukuguy/speechless-code-mistral-7b-v1.0", + "model_sha": "753852b8cb52dc5f0411568e98c0cb445a7835dc", + "model_size": "13.99 GB", + "model_dtype": "torch.bfloat16", + "lighteval_sha": "0f318ecf002208468154899217b3ba7c6ae09374", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|arc:challenge|25": { + "acc": 
0.5742320819112628, + "acc_stderr": 0.014449464278868805, + "acc_norm": 0.60580204778157, + "acc_norm_stderr": 0.014280522667467321 + }, + "harness|hellaswag|10": { + "acc": 0.6404102768372834, + "acc_stderr": 0.004788994060654275, + "acc_norm": 0.8374825731925911, + "acc_norm_stderr": 0.0036817082825814575 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.27, + "acc_stderr": 0.04461960433384739, + "acc_norm": 0.27, + "acc_norm_stderr": 0.04461960433384739 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.562962962962963, + "acc_stderr": 0.042849586397534015, + "acc_norm": 0.562962962962963, + "acc_norm_stderr": 0.042849586397534015 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.6776315789473685, + "acc_stderr": 0.03803510248351585, + "acc_norm": 0.6776315789473685, + "acc_norm_stderr": 0.03803510248351585 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.59, + "acc_stderr": 0.04943110704237102, + "acc_norm": 0.59, + "acc_norm_stderr": 0.04943110704237102 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.6792452830188679, + "acc_stderr": 0.028727502957880274, + "acc_norm": 0.6792452830188679, + "acc_norm_stderr": 0.028727502957880274 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.7361111111111112, + "acc_stderr": 0.03685651095897532, + "acc_norm": 0.7361111111111112, + "acc_norm_stderr": 0.03685651095897532 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.46, + "acc_stderr": 0.05009082659620332, + "acc_norm": 0.46, + "acc_norm_stderr": 0.05009082659620332 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.5, + "acc_stderr": 0.050251890762960605, + "acc_norm": 0.5, + "acc_norm_stderr": 0.050251890762960605 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.38, + "acc_stderr": 0.04878317312145633, + "acc_norm": 0.38, + "acc_norm_stderr": 0.04878317312145633 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.6127167630057804, + "acc_stderr": 0.037143259063020656, + "acc_norm": 0.6127167630057804, + "acc_norm_stderr": 0.037143259063020656 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.38235294117647056, + "acc_stderr": 0.04835503696107223, + "acc_norm": 0.38235294117647056, + "acc_norm_stderr": 0.04835503696107223 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.76, + "acc_stderr": 0.042923469599092816, + "acc_norm": 0.76, + "acc_norm_stderr": 0.042923469599092816 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.5446808510638298, + "acc_stderr": 0.032555253593403555, + "acc_norm": 0.5446808510638298, + "acc_norm_stderr": 0.032555253593403555 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.4473684210526316, + "acc_stderr": 0.04677473004491199, + "acc_norm": 0.4473684210526316, + "acc_norm_stderr": 0.04677473004491199 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.5586206896551724, + "acc_stderr": 0.04137931034482758, + "acc_norm": 0.5586206896551724, + "acc_norm_stderr": 0.04137931034482758 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.4074074074074074, + "acc_stderr": 0.02530590624159063, + "acc_norm": 0.4074074074074074, + "acc_norm_stderr": 0.02530590624159063 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.42063492063492064, + "acc_stderr": 0.04415438226743744, + "acc_norm": 0.42063492063492064, + "acc_norm_stderr": 0.04415438226743744 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.32, + "acc_stderr": 0.04688261722621504, 
+ "acc_norm": 0.32, + "acc_norm_stderr": 0.04688261722621504 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.7580645161290323, + "acc_stderr": 0.02436259969303108, + "acc_norm": 0.7580645161290323, + "acc_norm_stderr": 0.02436259969303108 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.5221674876847291, + "acc_stderr": 0.03514528562175007, + "acc_norm": 0.5221674876847291, + "acc_norm_stderr": 0.03514528562175007 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.69, + "acc_stderr": 0.04648231987117316, + "acc_norm": 0.69, + "acc_norm_stderr": 0.04648231987117316 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.7636363636363637, + "acc_stderr": 0.03317505930009182, + "acc_norm": 0.7636363636363637, + "acc_norm_stderr": 0.03317505930009182 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.7929292929292929, + "acc_stderr": 0.028869778460267045, + "acc_norm": 0.7929292929292929, + "acc_norm_stderr": 0.028869778460267045 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.8911917098445595, + "acc_stderr": 0.022473253332768766, + "acc_norm": 0.8911917098445595, + "acc_norm_stderr": 0.022473253332768766 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.6435897435897436, + "acc_stderr": 0.0242831405294673, + "acc_norm": 0.6435897435897436, + "acc_norm_stderr": 0.0242831405294673 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.29259259259259257, + "acc_stderr": 0.027738969632176088, + "acc_norm": 0.29259259259259257, + "acc_norm_stderr": 0.027738969632176088 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.680672268907563, + "acc_stderr": 0.0302839955258844, + "acc_norm": 0.680672268907563, + "acc_norm_stderr": 0.0302839955258844 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.32450331125827814, + "acc_stderr": 0.038227469376587525, + "acc_norm": 0.32450331125827814, + "acc_norm_stderr": 0.038227469376587525 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.8275229357798165, + "acc_stderr": 0.016197807956848036, + "acc_norm": 0.8275229357798165, + "acc_norm_stderr": 0.016197807956848036 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.5138888888888888, + "acc_stderr": 0.03408655867977749, + "acc_norm": 0.5138888888888888, + "acc_norm_stderr": 0.03408655867977749 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.7696078431372549, + "acc_stderr": 0.02955429260569507, + "acc_norm": 0.7696078431372549, + "acc_norm_stderr": 0.02955429260569507 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.7721518987341772, + "acc_stderr": 0.027303484599069432, + "acc_norm": 0.7721518987341772, + "acc_norm_stderr": 0.027303484599069432 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.7040358744394619, + "acc_stderr": 0.030636591348699803, + "acc_norm": 0.7040358744394619, + "acc_norm_stderr": 0.030636591348699803 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.7633587786259542, + "acc_stderr": 0.03727673575596913, + "acc_norm": 0.7633587786259542, + "acc_norm_stderr": 0.03727673575596913 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.8016528925619835, + "acc_stderr": 0.03640118271990946, + "acc_norm": 0.8016528925619835, + "acc_norm_stderr": 0.03640118271990946 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.7129629629629629, + "acc_stderr": 
0.043733130409147614, + "acc_norm": 0.7129629629629629, + "acc_norm_stderr": 0.043733130409147614 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.7361963190184049, + "acc_stderr": 0.034624199316156234, + "acc_norm": 0.7361963190184049, + "acc_norm_stderr": 0.034624199316156234 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.48214285714285715, + "acc_stderr": 0.047427623612430116, + "acc_norm": 0.48214285714285715, + "acc_norm_stderr": 0.047427623612430116 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.7572815533980582, + "acc_stderr": 0.04245022486384495, + "acc_norm": 0.7572815533980582, + "acc_norm_stderr": 0.04245022486384495 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.8418803418803419, + "acc_stderr": 0.02390232554956041, + "acc_norm": 0.8418803418803419, + "acc_norm_stderr": 0.02390232554956041 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.73, + "acc_stderr": 0.044619604333847394, + "acc_norm": 0.73, + "acc_norm_stderr": 0.044619604333847394 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.8160919540229885, + "acc_stderr": 0.013853724170922531, + "acc_norm": 0.8160919540229885, + "acc_norm_stderr": 0.013853724170922531 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.7196531791907514, + "acc_stderr": 0.024182427496577605, + "acc_norm": 0.7196531791907514, + "acc_norm_stderr": 0.024182427496577605 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.3318435754189944, + "acc_stderr": 0.015748421208187303, + "acc_norm": 0.3318435754189944, + "acc_norm_stderr": 0.015748421208187303 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.7352941176470589, + "acc_stderr": 0.025261691219729484, + "acc_norm": 0.7352941176470589, + "acc_norm_stderr": 0.025261691219729484 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.7170418006430869, + "acc_stderr": 0.025583062489984806, + "acc_norm": 0.7170418006430869, + "acc_norm_stderr": 0.025583062489984806 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.7469135802469136, + "acc_stderr": 0.024191808600713, + "acc_norm": 0.7469135802469136, + "acc_norm_stderr": 0.024191808600713 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.4787234042553192, + "acc_stderr": 0.029800481645628693, + "acc_norm": 0.4787234042553192, + "acc_norm_stderr": 0.029800481645628693 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.4954367666232073, + "acc_stderr": 0.012769704263117528, + "acc_norm": 0.4954367666232073, + "acc_norm_stderr": 0.012769704263117528 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.6727941176470589, + "acc_stderr": 0.02850145286039656, + "acc_norm": 0.6727941176470589, + "acc_norm_stderr": 0.02850145286039656 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.6699346405228758, + "acc_stderr": 0.019023726160724553, + "acc_norm": 0.6699346405228758, + "acc_norm_stderr": 0.019023726160724553 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.6636363636363637, + "acc_stderr": 0.04525393596302506, + "acc_norm": 0.6636363636363637, + "acc_norm_stderr": 0.04525393596302506 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.7224489795918367, + "acc_stderr": 0.02866685779027465, + "acc_norm": 0.7224489795918367, + "acc_norm_stderr": 0.02866685779027465 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.8208955223880597, + "acc_stderr": 0.027113286753111837, + "acc_norm": 0.8208955223880597, + "acc_norm_stderr": 
0.027113286753111837 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.85, + "acc_stderr": 0.0358870281282637, + "acc_norm": 0.85, + "acc_norm_stderr": 0.0358870281282637 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.5542168674698795, + "acc_stderr": 0.038695433234721015, + "acc_norm": 0.5542168674698795, + "acc_norm_stderr": 0.038695433234721015 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.8245614035087719, + "acc_stderr": 0.029170885500727665, + "acc_norm": 0.8245614035087719, + "acc_norm_stderr": 0.029170885500727665 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.33047735618115054, + "mc1_stderr": 0.0164667696136983, + "mc2": 0.4789607258136594, + "mc2_stderr": 0.014858060050825114 + }, + "all": { + "acc": 0.6290829151650942, + "acc_stderr": 0.033174869303662674, + "acc_norm": 0.6329582077451893, + "acc_norm_stderr": 0.03315323833095972, + "mc1": 0.33047735618115054, + "mc1_stderr": 0.0164667696136983, + "mc2": 0.4789607258136594, + "mc2_stderr": 0.014858060050825114 + } + }, + "versions": { + "harness|arc:challenge|25": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + 
"harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "all": 0 + }, + "config_tasks": { + "harness|arc:challenge": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness 
task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task" + }, + "summary_tasks": { + "harness|arc:challenge|25": { + "hashes": { + "hash_examples": "17b0cae357c0259e", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "e43adcaa871b1364", + "hash_cont_tokens": "289aa98c400841d8" + }, + "truncated": 0, + "non-truncated": 4687, + "padded": 4684, + "non-padded": 3, + "effective_few_shots": 25.0, + "num_truncated_few_shots": 0 + }, + "harness|hellaswag|10": { + "hashes": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "08da6b3d0798f3e5", + "hash_cont_tokens": "ac460260c3e6efc9" + }, + "truncated": 0, + "non-truncated": 40168, + "padded": 40039, + "non-padded": 129, + "effective_few_shots": 10.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hashes": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "5e2b26eb9b4d08bf", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-anatomy|5": { + "hashes": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "d33cda9df28030eb", + "hash_cont_tokens": "a52a4f60d98cbe5c" + }, + "truncated": 0, + "non-truncated": 540, + "padded": 540, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-astronomy|5": { + "hashes": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "0dd50c500d64c57d", + "hash_cont_tokens": "10f7d8eeba97841d" + }, + "truncated": 0, + "non-truncated": 608, + "padded": 608, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-business_ethics|5": { + "hashes": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "40b524d0df3defc2", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hashes": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "1f87d12d677e0dfd", + "hash_cont_tokens": "edef9975ba9165b5" + }, + "truncated": 0, + "non-truncated": 1060, + "padded": 1056, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_biology|5": { + "hashes": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "dd6d69d8b13afbeb", 
+ "hash_cont_tokens": "0aa103ec6602280b" + }, + "truncated": 0, + "non-truncated": 576, + "padded": 572, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_chemistry|5": { + "hashes": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "d45f3c401a00e97e", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_computer_science|5": { + "hashes": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "c04f21d954ae67b2", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_mathematics|5": { + "hashes": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "e7de03b4e1a407d8", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_medicine|5": { + "hashes": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "9ce9516475f0b09c", + "hash_cont_tokens": "1979021dbc698754" + }, + "truncated": 0, + "non-truncated": 692, + "padded": 684, + "non-padded": 8, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_physics|5": { + "hashes": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "f749592a0d6c967d", + "hash_cont_tokens": "7cf7fe2bab00acbd" + }, + "truncated": 0, + "non-truncated": 408, + "padded": 404, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-computer_security|5": { + "hashes": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "1a6dccf2066f3598", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hashes": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "6ce98c8aec8e7514", + "hash_cont_tokens": "903f64eed2b0d217" + }, + "truncated": 0, + "non-truncated": 940, + "padded": 940, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-econometrics|5": { + "hashes": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "7794b03bf6b9bb11", + "hash_cont_tokens": "721ae6c5302c4bf2" + }, + "truncated": 0, + "non-truncated": 456, + "padded": 456, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hashes": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "e47ff85e05850517", + "hash_cont_tokens": "15a738960ed3e587" + }, + "truncated": 0, + "non-truncated": 580, + "padded": 580, + "non-padded": 0, + 
"effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hashes": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "2ce6901704311790", + "hash_cont_tokens": "c96470462fc71683" + }, + "truncated": 0, + "non-truncated": 1512, + "padded": 1512, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-formal_logic|5": { + "hashes": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "fa49c3faa72a3955", + "hash_cont_tokens": "0e1ce025c9d6ee7e" + }, + "truncated": 0, + "non-truncated": 504, + "padded": 504, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-global_facts|5": { + "hashes": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "38992a391c7040d5", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 396, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_biology|5": { + "hashes": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "4944fad6e0578120", + "hash_cont_tokens": "e34d57f7d3c4ca16" + }, + "truncated": 0, + "non-truncated": 1240, + "padded": 1240, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hashes": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "bec955dfccee0331", + "hash_cont_tokens": "e8482d44df4b3740" + }, + "truncated": 0, + "non-truncated": 812, + "padded": 796, + "non-padded": 16, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hashes": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "2ccfe020e0a8e824", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hashes": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "5e5e8bf3808e0ead", + "hash_cont_tokens": "d63e679a49418339" + }, + "truncated": 0, + "non-truncated": 660, + "padded": 656, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_geography|5": { + "hashes": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "6a624d76e1b40f9d", + "hash_cont_tokens": "d78483e286d06f1a" + }, + "truncated": 0, + "non-truncated": 792, + "padded": 792, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hashes": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "8340aed0285230f4", + "hash_cont_tokens": "691cdff71ff5fe57" + }, + "truncated": 0, + "non-truncated": 772, + "padded": 772, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + 
"harness|hendrycksTest-high_school_macroeconomics|5": { + "hashes": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "ca47137b1f3a769c", + "hash_cont_tokens": "d5ad4c5bdca967ad" + }, + "truncated": 0, + "non-truncated": 1560, + "padded": 1560, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hashes": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "c9d341ab62890f30", + "hash_cont_tokens": "8f631ca5687dd0d4" + }, + "truncated": 0, + "non-truncated": 1080, + "padded": 1080, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hashes": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "62573d06618ae7df", + "hash_cont_tokens": "7321048a28451473" + }, + "truncated": 0, + "non-truncated": 952, + "padded": 952, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_physics|5": { + "hashes": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "ddddcaae96263221", + "hash_cont_tokens": "bb137581f269861c" + }, + "truncated": 0, + "non-truncated": 604, + "padded": 604, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hashes": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "ef9c1ae343139fdd", + "hash_cont_tokens": "b455cab2675bd863" + }, + "truncated": 0, + "non-truncated": 2180, + "padded": 2161, + "non-padded": 19, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hashes": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "eb4abd87b0e863cc", + "hash_cont_tokens": "1b3196fec7e58037" + }, + "truncated": 0, + "non-truncated": 864, + "padded": 864, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hashes": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "63548c7fa9ba7a78", + "hash_cont_tokens": "a331dedc2aa01b3e" + }, + "truncated": 0, + "non-truncated": 816, + "padded": 816, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hashes": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "83c5da18bfa50812", + "hash_cont_tokens": "d0fbe030b8c8c2bf" + }, + "truncated": 0, + "non-truncated": 948, + "padded": 948, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_aging|5": { + "hashes": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "c93c778cb8c58a32", + "hash_cont_tokens": "1dd29c3755494850" + }, + "truncated": 0, + "non-truncated": 892, + "padded": 892, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_sexuality|5": { + "hashes": { + "hash_examples": "7acb8fdad97f88a6", + 
"hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "1daed91f54b42f7d", + "hash_cont_tokens": "c85573f663c10691" + }, + "truncated": 0, + "non-truncated": 524, + "padded": 524, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-international_law|5": { + "hashes": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "cfdae69f75ee8670", + "hash_cont_tokens": "d263804ba918154f" + }, + "truncated": 0, + "non-truncated": 484, + "padded": 484, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-jurisprudence|5": { + "hashes": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "173979adbb5ab44e", + "hash_cont_tokens": "581986691a84ece8" + }, + "truncated": 0, + "non-truncated": 432, + "padded": 432, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hashes": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "7b7d06271aff55ff", + "hash_cont_tokens": "55a858b28bbda458" + }, + "truncated": 0, + "non-truncated": 652, + "padded": 652, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-machine_learning|5": { + "hashes": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "ca062cfd7c7fddcb", + "hash_cont_tokens": "e99d3d3efd4ac7a3" + }, + "truncated": 0, + "non-truncated": 448, + "padded": 445, + "non-padded": 3, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-management|5": { + "hashes": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "fc47171ffb714da3", + "hash_cont_tokens": "13d9dc56bca34726" + }, + "truncated": 0, + "non-truncated": 412, + "padded": 412, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-marketing|5": { + "hashes": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "aa29e9d883670c8f", + "hash_cont_tokens": "2700ea26933916a2" + }, + "truncated": 0, + "non-truncated": 936, + "padded": 936, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-medical_genetics|5": { + "hashes": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "88ad044b653ecaa5", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-miscellaneous|5": { + "hashes": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "f9e7e01573277484", + "hash_cont_tokens": "7bf4341c79587250" + }, + "truncated": 0, + "non-truncated": 3132, + "padded": 3132, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_disputes|5": { + "hashes": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "03728b9e48594c28", + "hash_cont_tokens": "38a48e9de6976f00" + }, + "truncated": 0, + "non-truncated": 1384, + 
"padded": 1360, + "non-padded": 24, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hashes": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "04a903966514d177", + "hash_cont_tokens": "761c4dc187689d89" + }, + "truncated": 0, + "non-truncated": 3580, + "padded": 3580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-nutrition|5": { + "hashes": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "a2176d3ac6f01cf0", + "hash_cont_tokens": "65005bd7d6f6012a" + }, + "truncated": 0, + "non-truncated": 1224, + "padded": 1224, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-philosophy|5": { + "hashes": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "a96dc872948245a8", + "hash_cont_tokens": "0b47934fb6314dec" + }, + "truncated": 0, + "non-truncated": 1244, + "padded": 1244, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-prehistory|5": { + "hashes": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "e0b03637947e9efa", + "hash_cont_tokens": "3f20acd855ee0a29" + }, + "truncated": 0, + "non-truncated": 1296, + "padded": 1296, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_accounting|5": { + "hashes": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "0b4c6d0e49c47ab4", + "hash_cont_tokens": "8f122ba881355d4b" + }, + "truncated": 0, + "non-truncated": 1128, + "padded": 1128, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_law|5": { + "hashes": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "bcbdbbde22ec73e3", + "hash_cont_tokens": "90d5df417c4d3fd3" + }, + "truncated": 0, + "non-truncated": 6136, + "padded": 6136, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_medicine|5": { + "hashes": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "c54d753563114d45", + "hash_cont_tokens": "4a2d2988884f7f70" + }, + "truncated": 0, + "non-truncated": 1088, + "padded": 1088, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_psychology|5": { + "hashes": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "9e6e34f48034edc0", + "hash_cont_tokens": "e0a952cb8a9c81de" + }, + "truncated": 0, + "non-truncated": 2448, + "padded": 2448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-public_relations|5": { + "hashes": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "634feb3f97d1064d", + "hash_cont_tokens": "1fa77a8dff3922b8" + }, + "truncated": 0, + "non-truncated": 440, + "padded": 440, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + 
"harness|hendrycksTest-security_studies|5": { + "hashes": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "ca8497342e5b1d57", + "hash_cont_tokens": "81fc9cb3cbdd52db" + }, + "truncated": 0, + "non-truncated": 980, + "padded": 980, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-sociology|5": { + "hashes": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "ae361375c940a0fb", + "hash_cont_tokens": "2a0493252ed2cf43" + }, + "truncated": 0, + "non-truncated": 804, + "padded": 800, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hashes": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "e8bdf33cf82d89f5", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-virology|5": { + "hashes": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "32ce831e0ba2d2e2", + "hash_cont_tokens": "5ab892d003b00c98" + }, + "truncated": 0, + "non-truncated": 664, + "padded": 664, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-world_religions|5": { + "hashes": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "4ed9b68c5694211b", + "hash_cont_tokens": "15a5e5dbdfbb8568" + }, + "truncated": 0, + "non-truncated": 684, + "padded": 684, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|truthfulqa:mc|0": { + "hashes": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "a30fbd9af05d717a", + "hash_cont_tokens": "5a8d4bb398b1c3c0" + }, + "truncated": 0, + "non-truncated": 9996, + "padded": 9996, + "non-padded": 0, + "effective_few_shots": 0.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "d84d18e9a963753d", + "hash_full_prompts": "12b540783521a8e6", + "hash_input_tokens": "3d86ffeb7677bd9d", + "hash_cont_tokens": "35527140510ee91a" + }, + "total_evaluation_time_secondes": "4426.557539224625", + "truncated": 0, + "non-truncated": 111019, + "padded": 110793, + "non-padded": 226, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/uukuguy/speechless-code-mistral-7b-v1.0/results_2023-10-28T16-31-08.459023.json b/eval-results/uukuguy/speechless-code-mistral-7b-v1.0/results_2023-10-28T16-31-08.459023.json new file mode 100644 index 0000000000000000000000000000000000000000..3b27f3a770ce40f0315a277a67a2c397e63ac227 --- /dev/null +++ b/eval-results/uukuguy/speechless-code-mistral-7b-v1.0/results_2023-10-28T16-31-08.459023.json @@ -0,0 +1,107 @@ +{ + "config_general": { + "model_name": "uukuguy/speechless-code-mistral-7b-v1.0", + "model_sha": "9828fac7846ab081d5c9659f1bd89f6c77d83e4e", + "model_size": "13.99 GB", + "model_dtype": "torch.bfloat16", + "lighteval_sha": "0f318ecf002208468154899217b3ba7c6ae09374", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|drop|3": { + "em": 0.1579278523489933, + 
"em_stderr": 0.003734596341987714, + "f1": 0.21190331375838886, + "f1_stderr": 0.0037546108265308093 + }, + "harness|gsm8k|5": { + "acc": 0.19181197877179681, + "acc_stderr": 0.01084516995529402 + }, + "harness|winogrande|5": { + "acc": 0.7868981846882399, + "acc_stderr": 0.01150895769072275 + }, + "all": { + "em": 0.1579278523489933, + "em_stderr": 0.003734596341987714, + "f1": 0.21190331375838886, + "f1_stderr": 0.0037546108265308093, + "acc": 0.48935508173001835, + "acc_stderr": 0.011177063823008385 + } + }, + "versions": { + "harness|drop|3": 1, + "harness|gsm8k|5": 0, + "harness|winogrande|5": 0, + "all": 0 + }, + "config_tasks": { + "harness|drop": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|drop|3": { + "hashes": { + "hash_examples": "1d27416e8324e9a3", + "hash_full_prompts": "a5513ff9a741b385", + "hash_input_tokens": "a4fb946366902edf", + "hash_cont_tokens": "735d587ee2789678" + }, + "truncated": 0, + "non-truncated": 9536, + "padded": 0, + "non-padded": 9536, + "effective_few_shots": 3.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "6af0ae8cfe684f50", + "hash_cont_tokens": "56c03253b083b316" + }, + "truncated": 0, + "non-truncated": 1319, + "padded": 0, + "non-padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "6bf335f26fed6442", + "hash_cont_tokens": "618558fb93c0f288" + }, + "truncated": 0, + "non-truncated": 2534, + "padded": 2397, + "non-padded": 137, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "9b4d8993161e637d", + "hash_full_prompts": "08215e527b7e60a5", + "hash_input_tokens": "7c6dc027ca91c1a8", + "hash_cont_tokens": "0f0c253c1c2a9c81" + }, + "total_evaluation_time_secondes": "9619.377189397812", + "truncated": 0, + "non-truncated": 13389, + "padded": 2397, + "non-padded": 10992, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/uukuguy/speechless-code-mistral-7b-v1.0/results_2023-12-08T00-30-08.150409.json b/eval-results/uukuguy/speechless-code-mistral-7b-v1.0/results_2023-12-08T00-30-08.150409.json new file mode 100644 index 0000000000000000000000000000000000000000..6e05b478be38150426455d65972a31a2f6d62b60 --- /dev/null +++ b/eval-results/uukuguy/speechless-code-mistral-7b-v1.0/results_2023-12-08T00-30-08.150409.json @@ -0,0 +1,1409 @@ +{ + "config_general": { + "lighteval_sha": "0e4607eff593f6f842aeaa0e5fa6760f58b9d1e9", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "", + "start_time": 443209.430668752, + "end_time": 450278.981199242, + "total_evaluation_time_secondes": "7069.55053049", + "model_name": "uukuguy/speechless-code-mistral-7b-v1.0", + "model_sha": "43dea8e97d05f2e4358415b9a95a1b327c1f5804", + "model_dtype": "torch.float16", + "model_size": "13.99 GB" + }, + "results": { + "harness|arc:challenge|25": { + "acc": 0.5708191126279863, + "acc_stderr": 0.014464085894870655, + "acc_norm": 0.6117747440273038, + "acc_norm_stderr": 0.014241614207414044 + }, + "harness|hellaswag|10": { + "acc": 0.6405098585939056, + "acc_stderr": 0.004788703173474743, + "acc_norm": 
0.8376817367058355, + "acc_norm_stderr": 0.003679889125399815 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.26, + "acc_stderr": 0.04408440022768079, + "acc_norm": 0.26, + "acc_norm_stderr": 0.04408440022768079 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.562962962962963, + "acc_stderr": 0.042849586397534015, + "acc_norm": 0.562962962962963, + "acc_norm_stderr": 0.042849586397534015 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.6842105263157895, + "acc_stderr": 0.03782728980865469, + "acc_norm": 0.6842105263157895, + "acc_norm_stderr": 0.03782728980865469 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.6, + "acc_stderr": 0.04923659639173309, + "acc_norm": 0.6, + "acc_norm_stderr": 0.04923659639173309 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.6792452830188679, + "acc_stderr": 0.028727502957880274, + "acc_norm": 0.6792452830188679, + "acc_norm_stderr": 0.028727502957880274 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.7291666666666666, + "acc_stderr": 0.03716177437566017, + "acc_norm": 0.7291666666666666, + "acc_norm_stderr": 0.03716177437566017 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.47, + "acc_stderr": 0.05016135580465919, + "acc_norm": 0.47, + "acc_norm_stderr": 0.05016135580465919 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.52, + "acc_stderr": 0.050211673156867795, + "acc_norm": 0.52, + "acc_norm_stderr": 0.050211673156867795 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.35, + "acc_stderr": 0.047937248544110196, + "acc_norm": 0.35, + "acc_norm_stderr": 0.047937248544110196 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.6184971098265896, + "acc_stderr": 0.03703851193099521, + "acc_norm": 0.6184971098265896, + "acc_norm_stderr": 0.03703851193099521 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.38235294117647056, + "acc_stderr": 0.04835503696107223, + "acc_norm": 0.38235294117647056, + "acc_norm_stderr": 0.04835503696107223 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.76, + "acc_stderr": 0.042923469599092816, + "acc_norm": 0.76, + "acc_norm_stderr": 0.042923469599092816 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.5659574468085107, + "acc_stderr": 0.03240038086792747, + "acc_norm": 0.5659574468085107, + "acc_norm_stderr": 0.03240038086792747 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.4473684210526316, + "acc_stderr": 0.04677473004491199, + "acc_norm": 0.4473684210526316, + "acc_norm_stderr": 0.04677473004491199 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.5655172413793104, + "acc_stderr": 0.04130740879555497, + "acc_norm": 0.5655172413793104, + "acc_norm_stderr": 0.04130740879555497 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.4074074074074074, + "acc_stderr": 0.02530590624159063, + "acc_norm": 0.4074074074074074, + "acc_norm_stderr": 0.02530590624159063 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.4523809523809524, + "acc_stderr": 0.044518079590553275, + "acc_norm": 0.4523809523809524, + "acc_norm_stderr": 0.044518079590553275 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.34, + "acc_stderr": 0.04760952285695235, + "acc_norm": 0.34, + "acc_norm_stderr": 0.04760952285695235 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.7580645161290323, + "acc_stderr": 0.02436259969303108, + "acc_norm": 0.7580645161290323, + "acc_norm_stderr": 
0.02436259969303108 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.5123152709359606, + "acc_stderr": 0.035169204442208966, + "acc_norm": 0.5123152709359606, + "acc_norm_stderr": 0.035169204442208966 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.69, + "acc_stderr": 0.04648231987117316, + "acc_norm": 0.69, + "acc_norm_stderr": 0.04648231987117316 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.7575757575757576, + "acc_stderr": 0.03346409881055953, + "acc_norm": 0.7575757575757576, + "acc_norm_stderr": 0.03346409881055953 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.797979797979798, + "acc_stderr": 0.028606204289229872, + "acc_norm": 0.797979797979798, + "acc_norm_stderr": 0.028606204289229872 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.8963730569948186, + "acc_stderr": 0.02199531196364424, + "acc_norm": 0.8963730569948186, + "acc_norm_stderr": 0.02199531196364424 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.6538461538461539, + "acc_stderr": 0.024121125416941183, + "acc_norm": 0.6538461538461539, + "acc_norm_stderr": 0.024121125416941183 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.31851851851851853, + "acc_stderr": 0.02840653309060846, + "acc_norm": 0.31851851851851853, + "acc_norm_stderr": 0.02840653309060846 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.7142857142857143, + "acc_stderr": 0.029344572500634335, + "acc_norm": 0.7142857142857143, + "acc_norm_stderr": 0.029344572500634335 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.31125827814569534, + "acc_stderr": 0.03780445850526733, + "acc_norm": 0.31125827814569534, + "acc_norm_stderr": 0.03780445850526733 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.8293577981651377, + "acc_stderr": 0.016129271025099857, + "acc_norm": 0.8293577981651377, + "acc_norm_stderr": 0.016129271025099857 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.5092592592592593, + "acc_stderr": 0.034093869469927006, + "acc_norm": 0.5092592592592593, + "acc_norm_stderr": 0.034093869469927006 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.7745098039215687, + "acc_stderr": 0.02933116229425174, + "acc_norm": 0.7745098039215687, + "acc_norm_stderr": 0.02933116229425174 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.7679324894514767, + "acc_stderr": 0.027479744550808514, + "acc_norm": 0.7679324894514767, + "acc_norm_stderr": 0.027479744550808514 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.695067264573991, + "acc_stderr": 0.030898610882477518, + "acc_norm": 0.695067264573991, + "acc_norm_stderr": 0.030898610882477518 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.7786259541984732, + "acc_stderr": 0.03641297081313729, + "acc_norm": 0.7786259541984732, + "acc_norm_stderr": 0.03641297081313729 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.8099173553719008, + "acc_stderr": 0.03581796951709282, + "acc_norm": 0.8099173553719008, + "acc_norm_stderr": 0.03581796951709282 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.7222222222222222, + "acc_stderr": 0.04330043749650741, + "acc_norm": 0.7222222222222222, + "acc_norm_stderr": 0.04330043749650741 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.7361963190184049, + "acc_stderr": 0.034624199316156234, + "acc_norm": 
0.7361963190184049, + "acc_norm_stderr": 0.034624199316156234 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.5, + "acc_stderr": 0.04745789978762494, + "acc_norm": 0.5, + "acc_norm_stderr": 0.04745789978762494 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.7572815533980582, + "acc_stderr": 0.04245022486384495, + "acc_norm": 0.7572815533980582, + "acc_norm_stderr": 0.04245022486384495 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.8547008547008547, + "acc_stderr": 0.023086635086841407, + "acc_norm": 0.8547008547008547, + "acc_norm_stderr": 0.023086635086841407 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.72, + "acc_stderr": 0.04512608598542128, + "acc_norm": 0.72, + "acc_norm_stderr": 0.04512608598542128 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.8173690932311622, + "acc_stderr": 0.013816335389973147, + "acc_norm": 0.8173690932311622, + "acc_norm_stderr": 0.013816335389973147 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.7225433526011561, + "acc_stderr": 0.024105712607754307, + "acc_norm": 0.7225433526011561, + "acc_norm_stderr": 0.024105712607754307 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.34301675977653634, + "acc_stderr": 0.015876912673057728, + "acc_norm": 0.34301675977653634, + "acc_norm_stderr": 0.015876912673057728 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.738562091503268, + "acc_stderr": 0.025160998214292456, + "acc_norm": 0.738562091503268, + "acc_norm_stderr": 0.025160998214292456 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.7234726688102894, + "acc_stderr": 0.02540383297817961, + "acc_norm": 0.7234726688102894, + "acc_norm_stderr": 0.02540383297817961 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.75, + "acc_stderr": 0.02409347123262133, + "acc_norm": 0.75, + "acc_norm_stderr": 0.02409347123262133 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.4929078014184397, + "acc_stderr": 0.02982449855912901, + "acc_norm": 0.4929078014184397, + "acc_norm_stderr": 0.02982449855912901 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.4954367666232073, + "acc_stderr": 0.012769704263117528, + "acc_norm": 0.4954367666232073, + "acc_norm_stderr": 0.012769704263117528 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.6838235294117647, + "acc_stderr": 0.028245687391462927, + "acc_norm": 0.6838235294117647, + "acc_norm_stderr": 0.028245687391462927 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.6748366013071896, + "acc_stderr": 0.01895088677080631, + "acc_norm": 0.6748366013071896, + "acc_norm_stderr": 0.01895088677080631 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.6636363636363637, + "acc_stderr": 0.04525393596302506, + "acc_norm": 0.6636363636363637, + "acc_norm_stderr": 0.04525393596302506 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.7224489795918367, + "acc_stderr": 0.02866685779027465, + "acc_norm": 0.7224489795918367, + "acc_norm_stderr": 0.02866685779027465 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.8208955223880597, + "acc_stderr": 0.027113286753111837, + "acc_norm": 0.8208955223880597, + "acc_norm_stderr": 0.027113286753111837 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.86, + "acc_stderr": 0.034873508801977704, + "acc_norm": 0.86, + "acc_norm_stderr": 0.034873508801977704 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.5481927710843374, + "acc_stderr": 0.03874371556587953, + 
"acc_norm": 0.5481927710843374, + "acc_norm_stderr": 0.03874371556587953 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.8187134502923976, + "acc_stderr": 0.029547741687640038, + "acc_norm": 0.8187134502923976, + "acc_norm_stderr": 0.029547741687640038 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.33414932680538556, + "mc1_stderr": 0.016512530677150538, + "mc2": 0.4789652694280945, + "mc2_stderr": 0.01485651327015513 + }, + "harness|winogrande|5": { + "acc": 0.7837411207576953, + "acc_stderr": 0.011570614861409348 + }, + "harness|gsm8k|5": { + "acc": 0.4700530705079606, + "acc_stderr": 0.013747759685444703 + }, + "all": { + "acc": 0.6328087510139756, + "acc_stderr": 0.03241662679481022, + "acc_norm": 0.6370452055948831, + "acc_norm_stderr": 0.033063806342390464, + "mc1": 0.33414932680538556, + "mc1_stderr": 0.016512530677150538, + "mc2": 0.4789652694280945, + "mc2_stderr": 0.01485651327015513 + } + }, + "versions": { + "all": 0, + "harness|arc:challenge|25": 0, + "harness|gsm8k|5": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + 
"harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "harness|winogrande|5": 0 + }, + "config_tasks": { + "harness|arc:challenge": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM 
Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|arc:challenge|25": { + "hashes": { + "hash_examples": "17b0cae357c0259e", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "9bcd0d1d37471713", + "hash_cont_tokens": "289aa98c400841d8" + }, + "truncated": 0, + "non_truncated": 1172, + "padded": 4670, + "non_padded": 17, + "effective_few_shots": 25.0, + "num_truncated_few_shots": 0 + }, + "harness|hellaswag|10": { + "hashes": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "80b8c6d79740318e", + "hash_cont_tokens": "ac460260c3e6efc9" + }, + "truncated": 0, + "non_truncated": 10042, + "padded": 40101, + "non_padded": 67, + "effective_few_shots": 10.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hashes": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "b813d36287c6556c", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-anatomy|5": { + "hashes": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "09dc2380497f7a47", + "hash_cont_tokens": "a52a4f60d98cbe5c" + }, + "truncated": 0, + "non_truncated": 135, + "padded": 540, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-astronomy|5": { + "hashes": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "68ca3220b0fdd1f3", + "hash_cont_tokens": "10f7d8eeba97841d" + }, + "truncated": 0, + "non_truncated": 152, + "padded": 608, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-business_ethics|5": { + "hashes": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "bd14ef1320de241e", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hashes": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "d96186ab98017c43", + "hash_cont_tokens": "edef9975ba9165b5" + }, + "truncated": 0, + "non_truncated": 265, + "padded": 1060, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_biology|5": { + "hashes": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": 
"2b460b75f1fdfefd", + "hash_input_tokens": "424136b34e95b200", + "hash_cont_tokens": "0aa103ec6602280b" + }, + "truncated": 0, + "non_truncated": 144, + "padded": 576, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_chemistry|5": { + "hashes": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "8dd8b80e336bbe54", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_computer_science|5": { + "hashes": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "145d4cef8ca2261d", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_mathematics|5": { + "hashes": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "561995d32d2b25c4", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_medicine|5": { + "hashes": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "6a258a9d4418599c", + "hash_cont_tokens": "1979021dbc698754" + }, + "truncated": 0, + "non_truncated": 173, + "padded": 692, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_physics|5": { + "hashes": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "fa5e0d5b5f97b66a", + "hash_cont_tokens": "7cf7fe2bab00acbd" + }, + "truncated": 0, + "non_truncated": 102, + "padded": 408, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-computer_security|5": { + "hashes": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "07d27397edfae492", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hashes": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "da5e6c3c8eb17da6", + "hash_cont_tokens": "903f64eed2b0d217" + }, + "truncated": 0, + "non_truncated": 235, + "padded": 940, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-econometrics|5": { + "hashes": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "f6ba8e358bdb523e", + "hash_cont_tokens": "721ae6c5302c4bf2" + }, + "truncated": 0, + "non_truncated": 114, + "padded": 456, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hashes": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "b2459da4c5ca8590", + "hash_cont_tokens": "15a738960ed3e587" + }, + "truncated": 0, + "non_truncated": 
145, + "padded": 575, + "non_padded": 5, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hashes": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "0b969d9ad706a13a", + "hash_cont_tokens": "c96470462fc71683" + }, + "truncated": 0, + "non_truncated": 378, + "padded": 1512, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-formal_logic|5": { + "hashes": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "02bc3eb5f90da86e", + "hash_cont_tokens": "0e1ce025c9d6ee7e" + }, + "truncated": 0, + "non_truncated": 126, + "padded": 504, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-global_facts|5": { + "hashes": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "3d5106918bcbeb43", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_biology|5": { + "hashes": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "7b089392db2dabbd", + "hash_cont_tokens": "e34d57f7d3c4ca16" + }, + "truncated": 0, + "non_truncated": 310, + "padded": 1240, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hashes": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "ba90b2ffed1c067d", + "hash_cont_tokens": "e8482d44df4b3740" + }, + "truncated": 0, + "non_truncated": 203, + "padded": 812, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hashes": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "60eeec309ef0717f", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hashes": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "5e5e8bf3808e0ead", + "hash_cont_tokens": "d63e679a49418339" + }, + "truncated": 0, + "non_truncated": 165, + "padded": 656, + "non_padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_geography|5": { + "hashes": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "4da9b741d4e7ea78", + "hash_cont_tokens": "d78483e286d06f1a" + }, + "truncated": 0, + "non_truncated": 198, + "padded": 792, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hashes": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "acb4bc872ac86ed7", + "hash_cont_tokens": "691cdff71ff5fe57" + }, + "truncated": 0, + "non_truncated": 193, + "padded": 772, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + 
}, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hashes": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "840fc6403eb69ab0", + "hash_cont_tokens": "d5ad4c5bdca967ad" + }, + "truncated": 0, + "non_truncated": 390, + "padded": 1560, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hashes": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "3629a7f2cd17faeb", + "hash_cont_tokens": "8f631ca5687dd0d4" + }, + "truncated": 0, + "non_truncated": 270, + "padded": 1080, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hashes": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "6846f684260e3997", + "hash_cont_tokens": "7321048a28451473" + }, + "truncated": 0, + "non_truncated": 238, + "padded": 952, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_physics|5": { + "hashes": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "85aee25d6bdad94a", + "hash_cont_tokens": "bb137581f269861c" + }, + "truncated": 0, + "non_truncated": 151, + "padded": 604, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hashes": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "290b66d6d666a35f", + "hash_cont_tokens": "b455cab2675bd863" + }, + "truncated": 0, + "non_truncated": 545, + "padded": 2180, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hashes": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "a77a7668b437bc82", + "hash_cont_tokens": "1b3196fec7e58037" + }, + "truncated": 0, + "non_truncated": 216, + "padded": 864, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hashes": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "63548c7fa9ba7a78", + "hash_cont_tokens": "a331dedc2aa01b3e" + }, + "truncated": 0, + "non_truncated": 204, + "padded": 816, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hashes": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "83c5da18bfa50812", + "hash_cont_tokens": "d0fbe030b8c8c2bf" + }, + "truncated": 0, + "non_truncated": 237, + "padded": 948, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_aging|5": { + "hashes": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "bebbd11f22006685", + "hash_cont_tokens": "1dd29c3755494850" + }, + "truncated": 0, + "non_truncated": 223, + "padded": 892, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_sexuality|5": { + "hashes": { + "hash_examples": "7acb8fdad97f88a6", + 
"hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "7b85ee9b8ee54f4f", + "hash_cont_tokens": "c85573f663c10691" + }, + "truncated": 0, + "non_truncated": 131, + "padded": 524, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-international_law|5": { + "hashes": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "7bfc55ab7065943e", + "hash_cont_tokens": "d263804ba918154f" + }, + "truncated": 0, + "non_truncated": 121, + "padded": 484, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-jurisprudence|5": { + "hashes": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "69573f1675e053c6", + "hash_cont_tokens": "581986691a84ece8" + }, + "truncated": 0, + "non_truncated": 108, + "padded": 432, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hashes": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "552324ef20094bdc", + "hash_cont_tokens": "55a858b28bbda458" + }, + "truncated": 0, + "non_truncated": 163, + "padded": 652, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-machine_learning|5": { + "hashes": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "96449357a7318905", + "hash_cont_tokens": "e99d3d3efd4ac7a3" + }, + "truncated": 0, + "non_truncated": 112, + "padded": 448, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-management|5": { + "hashes": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "3b849249168e3b88", + "hash_cont_tokens": "13d9dc56bca34726" + }, + "truncated": 0, + "non_truncated": 103, + "padded": 412, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-marketing|5": { + "hashes": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "af0e186f2756b70d", + "hash_cont_tokens": "2700ea26933916a2" + }, + "truncated": 0, + "non_truncated": 234, + "padded": 936, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-medical_genetics|5": { + "hashes": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "9f6a6de16509b6d9", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-miscellaneous|5": { + "hashes": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "9194406d589f7c10", + "hash_cont_tokens": "7bf4341c79587250" + }, + "truncated": 0, + "non_truncated": 783, + "padded": 3132, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_disputes|5": { + "hashes": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "769486efc74d9f8e", + "hash_cont_tokens": "38a48e9de6976f00" + }, + "truncated": 0, + "non_truncated": 346, + 
"padded": 1384, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hashes": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "a90fd4dd90959dad", + "hash_cont_tokens": "761c4dc187689d89" + }, + "truncated": 0, + "non_truncated": 895, + "padded": 3580, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-nutrition|5": { + "hashes": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "1a3b843e66efd29b", + "hash_cont_tokens": "65005bd7d6f6012a" + }, + "truncated": 0, + "non_truncated": 306, + "padded": 1224, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-philosophy|5": { + "hashes": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "09820001a3d00013", + "hash_cont_tokens": "0b47934fb6314dec" + }, + "truncated": 0, + "non_truncated": 311, + "padded": 1244, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-prehistory|5": { + "hashes": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "7c4ec364ce2768c7", + "hash_cont_tokens": "3f20acd855ee0a29" + }, + "truncated": 0, + "non_truncated": 324, + "padded": 1296, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_accounting|5": { + "hashes": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "ced0534574d0ae3f", + "hash_cont_tokens": "8f122ba881355d4b" + }, + "truncated": 0, + "non_truncated": 282, + "padded": 1128, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_law|5": { + "hashes": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "bcbdbbde22ec73e3", + "hash_cont_tokens": "90d5df417c4d3fd3" + }, + "truncated": 0, + "non_truncated": 1534, + "padded": 6136, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_medicine|5": { + "hashes": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "c54d753563114d45", + "hash_cont_tokens": "4a2d2988884f7f70" + }, + "truncated": 0, + "non_truncated": 272, + "padded": 1088, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_psychology|5": { + "hashes": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "b75dc55c0e32fa52", + "hash_cont_tokens": "e0a952cb8a9c81de" + }, + "truncated": 0, + "non_truncated": 612, + "padded": 2448, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-public_relations|5": { + "hashes": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "5ccdc8ec8db99622", + "hash_cont_tokens": "1fa77a8dff3922b8" + }, + "truncated": 0, + "non_truncated": 110, + "padded": 440, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-security_studies|5": { + 
"hashes": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "ca8497342e5b1d57", + "hash_cont_tokens": "81fc9cb3cbdd52db" + }, + "truncated": 0, + "non_truncated": 245, + "padded": 980, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-sociology|5": { + "hashes": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "069c76424fbd3dab", + "hash_cont_tokens": "2a0493252ed2cf43" + }, + "truncated": 0, + "non_truncated": 201, + "padded": 804, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hashes": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "a7e393a626169576", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-virology|5": { + "hashes": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "bf99dc973e3a650d", + "hash_cont_tokens": "5ab892d003b00c98" + }, + "truncated": 0, + "non_truncated": 166, + "padded": 664, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-world_religions|5": { + "hashes": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "1761cfaf21797065", + "hash_cont_tokens": "15a5e5dbdfbb8568" + }, + "truncated": 0, + "non_truncated": 171, + "padded": 684, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|truthfulqa:mc|0": { + "hashes": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "298b43914bbdf4ca", + "hash_cont_tokens": "5a8d4bb398b1c3c0" + }, + "truncated": 0, + "non_truncated": 817, + "padded": 9996, + "non_padded": 0, + "effective_few_shots": 0.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "31aa3477d959f771", + "hash_cont_tokens": "618558fb93c0f288" + }, + "truncated": 0, + "non_truncated": 1267, + "padded": 2534, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "6af0ae8cfe684f50", + "hash_cont_tokens": "b90550d1e46b1867" + }, + "truncated": 0, + "non_truncated": 1319, + "padded": 0, + "non_padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "3b7fa57a057f9415", + "hash_full_prompts": "63615fc50fc9417c", + "hash_input_tokens": "9c04e828ae29cacc", + "hash_cont_tokens": "912749bbb6c5f106" + }, + "truncated": 0, + "non_truncated": 28659, + "padded": 113460, + "non_padded": 1412, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/uukuguy/speechless-code-mistral-7b-v2.0/results_2023-12-09T20-08-24.695971.json b/eval-results/uukuguy/speechless-code-mistral-7b-v2.0/results_2023-12-09T20-08-24.695971.json new file mode 100644 index 
0000000000000000000000000000000000000000..642c61de400dfc8f9a94de73e94d385a7f72fddd --- /dev/null +++ b/eval-results/uukuguy/speechless-code-mistral-7b-v2.0/results_2023-12-09T20-08-24.695971.json @@ -0,0 +1,1409 @@ +{ + "config_general": { + "lighteval_sha": "0e4607eff593f6f842aeaa0e5fa6760f58b9d1e9", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "", + "start_time": 379778.223467046, + "end_time": 387413.84615408, + "total_evaluation_time_secondes": "7635.622687034018", + "model_name": "uukuguy/speechless-code-mistral-7b-v2.0", + "model_sha": "8371b49e786758da62de015daa006c0e58b7ce82", + "model_dtype": "torch.bfloat16", + "model_size": "13.99 GB" + }, + "results": { + "harness|arc:challenge|25": { + "acc": 0.4931740614334471, + "acc_stderr": 0.014610029151379813, + "acc_norm": 0.5247440273037542, + "acc_norm_stderr": 0.014593487694937738 + }, + "harness|hellaswag|10": { + "acc": 0.5685122485560645, + "acc_stderr": 0.004942716091996087, + "acc_norm": 0.7561242780322645, + "acc_norm_stderr": 0.004285410130466112 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.29, + "acc_stderr": 0.04560480215720684, + "acc_norm": 0.29, + "acc_norm_stderr": 0.04560480215720684 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.4888888888888889, + "acc_stderr": 0.04318275491977976, + "acc_norm": 0.4888888888888889, + "acc_norm_stderr": 0.04318275491977976 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.5394736842105263, + "acc_stderr": 0.04056242252249033, + "acc_norm": 0.5394736842105263, + "acc_norm_stderr": 0.04056242252249033 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.49, + "acc_stderr": 0.05024183937956911, + "acc_norm": 0.49, + "acc_norm_stderr": 0.05024183937956911 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.5433962264150943, + "acc_stderr": 0.030656748696739435, + "acc_norm": 0.5433962264150943, + "acc_norm_stderr": 0.030656748696739435 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.5486111111111112, + "acc_stderr": 0.04161402398403279, + "acc_norm": 0.5486111111111112, + "acc_norm_stderr": 0.04161402398403279 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.36, + "acc_stderr": 0.048241815132442176, + "acc_norm": 0.36, + "acc_norm_stderr": 0.048241815132442176 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.46, + "acc_stderr": 0.05009082659620332, + "acc_norm": 0.46, + "acc_norm_stderr": 0.05009082659620332 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.31, + "acc_stderr": 0.04648231987117316, + "acc_norm": 0.31, + "acc_norm_stderr": 0.04648231987117316 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.4913294797687861, + "acc_stderr": 0.03811890988940413, + "acc_norm": 0.4913294797687861, + "acc_norm_stderr": 0.03811890988940413 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.2549019607843137, + "acc_stderr": 0.043364327079931785, + "acc_norm": 0.2549019607843137, + "acc_norm_stderr": 0.043364327079931785 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.63, + "acc_stderr": 0.04852365870939099, + "acc_norm": 0.63, + "acc_norm_stderr": 0.04852365870939099 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.4553191489361702, + "acc_stderr": 0.03255525359340355, + "acc_norm": 0.4553191489361702, + "acc_norm_stderr": 0.03255525359340355 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.38596491228070173, + "acc_stderr": 
0.04579639422070434, + "acc_norm": 0.38596491228070173, + "acc_norm_stderr": 0.04579639422070434 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.5172413793103449, + "acc_stderr": 0.04164188720169375, + "acc_norm": 0.5172413793103449, + "acc_norm_stderr": 0.04164188720169375 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.31746031746031744, + "acc_stderr": 0.02397386199899207, + "acc_norm": 0.31746031746031744, + "acc_norm_stderr": 0.02397386199899207 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.3333333333333333, + "acc_stderr": 0.04216370213557835, + "acc_norm": 0.3333333333333333, + "acc_norm_stderr": 0.04216370213557835 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.27, + "acc_stderr": 0.04461960433384741, + "acc_norm": 0.27, + "acc_norm_stderr": 0.04461960433384741 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.5903225806451613, + "acc_stderr": 0.027976054915347357, + "acc_norm": 0.5903225806451613, + "acc_norm_stderr": 0.027976054915347357 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.3891625615763547, + "acc_stderr": 0.034304624161038716, + "acc_norm": 0.3891625615763547, + "acc_norm_stderr": 0.034304624161038716 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.51, + "acc_stderr": 0.05024183937956911, + "acc_norm": 0.51, + "acc_norm_stderr": 0.05024183937956911 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.6424242424242425, + "acc_stderr": 0.03742597043806586, + "acc_norm": 0.6424242424242425, + "acc_norm_stderr": 0.03742597043806586 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.6616161616161617, + "acc_stderr": 0.03371124142626302, + "acc_norm": 0.6616161616161617, + "acc_norm_stderr": 0.03371124142626302 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.7046632124352331, + "acc_stderr": 0.0329229663915514, + "acc_norm": 0.7046632124352331, + "acc_norm_stderr": 0.0329229663915514 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.44871794871794873, + "acc_stderr": 0.025217315184846475, + "acc_norm": 0.44871794871794873, + "acc_norm_stderr": 0.025217315184846475 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.2777777777777778, + "acc_stderr": 0.027309140588230182, + "acc_norm": 0.2777777777777778, + "acc_norm_stderr": 0.027309140588230182 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.5126050420168067, + "acc_stderr": 0.03246816765752174, + "acc_norm": 0.5126050420168067, + "acc_norm_stderr": 0.03246816765752174 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.31125827814569534, + "acc_stderr": 0.03780445850526733, + "acc_norm": 0.31125827814569534, + "acc_norm_stderr": 0.03780445850526733 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.655045871559633, + "acc_stderr": 0.02038060540506695, + "acc_norm": 0.655045871559633, + "acc_norm_stderr": 0.02038060540506695 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.36574074074074076, + "acc_stderr": 0.03284738857647207, + "acc_norm": 0.36574074074074076, + "acc_norm_stderr": 0.03284738857647207 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.6519607843137255, + "acc_stderr": 0.03343311240488418, + "acc_norm": 0.6519607843137255, + "acc_norm_stderr": 0.03343311240488418 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.6582278481012658, + 
"acc_stderr": 0.03087453753755362, + "acc_norm": 0.6582278481012658, + "acc_norm_stderr": 0.03087453753755362 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.6053811659192825, + "acc_stderr": 0.03280400504755291, + "acc_norm": 0.6053811659192825, + "acc_norm_stderr": 0.03280400504755291 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.5954198473282443, + "acc_stderr": 0.043046937953806645, + "acc_norm": 0.5954198473282443, + "acc_norm_stderr": 0.043046937953806645 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.6611570247933884, + "acc_stderr": 0.04320767807536671, + "acc_norm": 0.6611570247933884, + "acc_norm_stderr": 0.04320767807536671 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.6851851851851852, + "acc_stderr": 0.04489931073591312, + "acc_norm": 0.6851851851851852, + "acc_norm_stderr": 0.04489931073591312 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.6073619631901841, + "acc_stderr": 0.03836740907831028, + "acc_norm": 0.6073619631901841, + "acc_norm_stderr": 0.03836740907831028 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.4642857142857143, + "acc_stderr": 0.04733667890053756, + "acc_norm": 0.4642857142857143, + "acc_norm_stderr": 0.04733667890053756 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.6699029126213593, + "acc_stderr": 0.046561471100123514, + "acc_norm": 0.6699029126213593, + "acc_norm_stderr": 0.046561471100123514 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.7991452991452992, + "acc_stderr": 0.026246772946890488, + "acc_norm": 0.7991452991452992, + "acc_norm_stderr": 0.026246772946890488 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.57, + "acc_stderr": 0.04975698519562429, + "acc_norm": 0.57, + "acc_norm_stderr": 0.04975698519562429 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.6641123882503193, + "acc_stderr": 0.016889407235171686, + "acc_norm": 0.6641123882503193, + "acc_norm_stderr": 0.016889407235171686 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.5549132947976878, + "acc_stderr": 0.02675625512966377, + "acc_norm": 0.5549132947976878, + "acc_norm_stderr": 0.02675625512966377 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.27932960893854747, + "acc_stderr": 0.015005762446786171, + "acc_norm": 0.27932960893854747, + "acc_norm_stderr": 0.015005762446786171 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.5196078431372549, + "acc_stderr": 0.028607893699576063, + "acc_norm": 0.5196078431372549, + "acc_norm_stderr": 0.028607893699576063 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.5787781350482315, + "acc_stderr": 0.028043399858210628, + "acc_norm": 0.5787781350482315, + "acc_norm_stderr": 0.028043399858210628 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.5555555555555556, + "acc_stderr": 0.027648477877413324, + "acc_norm": 0.5555555555555556, + "acc_norm_stderr": 0.027648477877413324 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.3404255319148936, + "acc_stderr": 0.028267657482650147, + "acc_norm": 0.3404255319148936, + "acc_norm_stderr": 0.028267657482650147 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.3924380704041721, + "acc_stderr": 0.012471243669229111, + "acc_norm": 0.3924380704041721, + "acc_norm_stderr": 0.012471243669229111 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.39705882352941174, + "acc_stderr": 0.029722152099280065, + "acc_norm": 0.39705882352941174, + "acc_norm_stderr": 
0.029722152099280065 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.5032679738562091, + "acc_stderr": 0.02022740279443487, + "acc_norm": 0.5032679738562091, + "acc_norm_stderr": 0.02022740279443487 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.5909090909090909, + "acc_stderr": 0.04709306978661895, + "acc_norm": 0.5909090909090909, + "acc_norm_stderr": 0.04709306978661895 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.5387755102040817, + "acc_stderr": 0.03191282052669278, + "acc_norm": 0.5387755102040817, + "acc_norm_stderr": 0.03191282052669278 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.746268656716418, + "acc_stderr": 0.03076944496729602, + "acc_norm": 0.746268656716418, + "acc_norm_stderr": 0.03076944496729602 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.77, + "acc_stderr": 0.04229525846816506, + "acc_norm": 0.77, + "acc_norm_stderr": 0.04229525846816506 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.4397590361445783, + "acc_stderr": 0.03864139923699121, + "acc_norm": 0.4397590361445783, + "acc_norm_stderr": 0.03864139923699121 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.6549707602339181, + "acc_stderr": 0.036459813773888065, + "acc_norm": 0.6549707602339181, + "acc_norm_stderr": 0.036459813773888065 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.35128518971848227, + "mc1_stderr": 0.0167113581635444, + "mc2": 0.520514533655488, + "mc2_stderr": 0.01545419025071319 + }, + "harness|winogrande|5": { + "acc": 0.7142857142857143, + "acc_stderr": 0.01269653187003862 + }, + "harness|gsm8k|5": { + "acc": 0.3593631539044731, + "acc_stderr": 0.01321645630985153 + }, + "all": { + "acc": 0.5145047223419519, + "acc_stderr": 0.034309131418093795, + "acc_norm": 0.51746307101721, + "acc_norm_stderr": 0.035021528489929815, + "mc1": 0.35128518971848227, + "mc1_stderr": 0.0167113581635444, + "mc2": 0.520514533655488, + "mc2_stderr": 0.01545419025071319 + } + }, + "versions": { + "all": 0, + "harness|arc:challenge|25": 0, + "harness|gsm8k|5": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + 
"harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "harness|winogrande|5": 0 + }, + "config_tasks": { + "harness|arc:challenge": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + 
"harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|arc:challenge|25": { + "hashes": { + "hash_examples": "17b0cae357c0259e", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "9bcd0d1d37471713", + "hash_cont_tokens": "289aa98c400841d8" + }, + "truncated": 0, + "non_truncated": 1172, + "padded": 4670, + "non_padded": 17, + "effective_few_shots": 25.0, + "num_truncated_few_shots": 0 + }, + "harness|hellaswag|10": { + "hashes": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "80b8c6d79740318e", + "hash_cont_tokens": "ac460260c3e6efc9" + }, + "truncated": 0, + "non_truncated": 10042, + "padded": 40101, + "non_padded": 67, + "effective_few_shots": 10.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hashes": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "b813d36287c6556c", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-anatomy|5": { + "hashes": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "09dc2380497f7a47", + "hash_cont_tokens": "a52a4f60d98cbe5c" + }, + "truncated": 0, + "non_truncated": 135, + "padded": 540, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-astronomy|5": { + "hashes": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + 
"hash_input_tokens": "68ca3220b0fdd1f3", + "hash_cont_tokens": "10f7d8eeba97841d" + }, + "truncated": 0, + "non_truncated": 152, + "padded": 608, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-business_ethics|5": { + "hashes": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "bd14ef1320de241e", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hashes": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "d96186ab98017c43", + "hash_cont_tokens": "edef9975ba9165b5" + }, + "truncated": 0, + "non_truncated": 265, + "padded": 1060, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_biology|5": { + "hashes": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "424136b34e95b200", + "hash_cont_tokens": "0aa103ec6602280b" + }, + "truncated": 0, + "non_truncated": 144, + "padded": 576, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_chemistry|5": { + "hashes": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "8dd8b80e336bbe54", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_computer_science|5": { + "hashes": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "145d4cef8ca2261d", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_mathematics|5": { + "hashes": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "561995d32d2b25c4", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_medicine|5": { + "hashes": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "6a258a9d4418599c", + "hash_cont_tokens": "1979021dbc698754" + }, + "truncated": 0, + "non_truncated": 173, + "padded": 692, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_physics|5": { + "hashes": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "fa5e0d5b5f97b66a", + "hash_cont_tokens": "7cf7fe2bab00acbd" + }, + "truncated": 0, + "non_truncated": 102, + "padded": 408, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-computer_security|5": { + "hashes": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "07d27397edfae492", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + 
"non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hashes": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "da5e6c3c8eb17da6", + "hash_cont_tokens": "903f64eed2b0d217" + }, + "truncated": 0, + "non_truncated": 235, + "padded": 940, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-econometrics|5": { + "hashes": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "f6ba8e358bdb523e", + "hash_cont_tokens": "721ae6c5302c4bf2" + }, + "truncated": 0, + "non_truncated": 114, + "padded": 456, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hashes": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "b2459da4c5ca8590", + "hash_cont_tokens": "15a738960ed3e587" + }, + "truncated": 0, + "non_truncated": 145, + "padded": 575, + "non_padded": 5, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hashes": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "0b969d9ad706a13a", + "hash_cont_tokens": "c96470462fc71683" + }, + "truncated": 0, + "non_truncated": 378, + "padded": 1512, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-formal_logic|5": { + "hashes": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "02bc3eb5f90da86e", + "hash_cont_tokens": "0e1ce025c9d6ee7e" + }, + "truncated": 0, + "non_truncated": 126, + "padded": 504, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-global_facts|5": { + "hashes": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "3d5106918bcbeb43", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_biology|5": { + "hashes": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "7b089392db2dabbd", + "hash_cont_tokens": "e34d57f7d3c4ca16" + }, + "truncated": 0, + "non_truncated": 310, + "padded": 1240, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hashes": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "ba90b2ffed1c067d", + "hash_cont_tokens": "e8482d44df4b3740" + }, + "truncated": 0, + "non_truncated": 203, + "padded": 812, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hashes": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "60eeec309ef0717f", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + 
"harness|hendrycksTest-high_school_european_history|5": { + "hashes": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "5e5e8bf3808e0ead", + "hash_cont_tokens": "d63e679a49418339" + }, + "truncated": 0, + "non_truncated": 165, + "padded": 656, + "non_padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_geography|5": { + "hashes": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "4da9b741d4e7ea78", + "hash_cont_tokens": "d78483e286d06f1a" + }, + "truncated": 0, + "non_truncated": 198, + "padded": 792, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hashes": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "acb4bc872ac86ed7", + "hash_cont_tokens": "691cdff71ff5fe57" + }, + "truncated": 0, + "non_truncated": 193, + "padded": 772, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hashes": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "840fc6403eb69ab0", + "hash_cont_tokens": "d5ad4c5bdca967ad" + }, + "truncated": 0, + "non_truncated": 390, + "padded": 1560, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hashes": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "3629a7f2cd17faeb", + "hash_cont_tokens": "8f631ca5687dd0d4" + }, + "truncated": 0, + "non_truncated": 270, + "padded": 1080, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hashes": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "6846f684260e3997", + "hash_cont_tokens": "7321048a28451473" + }, + "truncated": 0, + "non_truncated": 238, + "padded": 952, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_physics|5": { + "hashes": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "85aee25d6bdad94a", + "hash_cont_tokens": "bb137581f269861c" + }, + "truncated": 0, + "non_truncated": 151, + "padded": 604, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hashes": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "290b66d6d666a35f", + "hash_cont_tokens": "b455cab2675bd863" + }, + "truncated": 0, + "non_truncated": 545, + "padded": 2180, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hashes": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "a77a7668b437bc82", + "hash_cont_tokens": "1b3196fec7e58037" + }, + "truncated": 0, + "non_truncated": 216, + "padded": 864, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hashes": { + 
"hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "63548c7fa9ba7a78", + "hash_cont_tokens": "a331dedc2aa01b3e" + }, + "truncated": 0, + "non_truncated": 204, + "padded": 816, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hashes": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "83c5da18bfa50812", + "hash_cont_tokens": "d0fbe030b8c8c2bf" + }, + "truncated": 0, + "non_truncated": 237, + "padded": 948, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_aging|5": { + "hashes": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "bebbd11f22006685", + "hash_cont_tokens": "1dd29c3755494850" + }, + "truncated": 0, + "non_truncated": 223, + "padded": 892, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_sexuality|5": { + "hashes": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "7b85ee9b8ee54f4f", + "hash_cont_tokens": "c85573f663c10691" + }, + "truncated": 0, + "non_truncated": 131, + "padded": 524, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-international_law|5": { + "hashes": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "7bfc55ab7065943e", + "hash_cont_tokens": "d263804ba918154f" + }, + "truncated": 0, + "non_truncated": 121, + "padded": 484, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-jurisprudence|5": { + "hashes": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "69573f1675e053c6", + "hash_cont_tokens": "581986691a84ece8" + }, + "truncated": 0, + "non_truncated": 108, + "padded": 432, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hashes": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "552324ef20094bdc", + "hash_cont_tokens": "55a858b28bbda458" + }, + "truncated": 0, + "non_truncated": 163, + "padded": 652, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-machine_learning|5": { + "hashes": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "96449357a7318905", + "hash_cont_tokens": "e99d3d3efd4ac7a3" + }, + "truncated": 0, + "non_truncated": 112, + "padded": 448, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-management|5": { + "hashes": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "3b849249168e3b88", + "hash_cont_tokens": "13d9dc56bca34726" + }, + "truncated": 0, + "non_truncated": 103, + "padded": 412, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-marketing|5": { + "hashes": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "af0e186f2756b70d", + "hash_cont_tokens": "2700ea26933916a2" + }, + 
"truncated": 0, + "non_truncated": 234, + "padded": 936, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-medical_genetics|5": { + "hashes": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "9f6a6de16509b6d9", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-miscellaneous|5": { + "hashes": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "9194406d589f7c10", + "hash_cont_tokens": "7bf4341c79587250" + }, + "truncated": 0, + "non_truncated": 783, + "padded": 3132, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_disputes|5": { + "hashes": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "769486efc74d9f8e", + "hash_cont_tokens": "38a48e9de6976f00" + }, + "truncated": 0, + "non_truncated": 346, + "padded": 1384, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hashes": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "a90fd4dd90959dad", + "hash_cont_tokens": "761c4dc187689d89" + }, + "truncated": 0, + "non_truncated": 895, + "padded": 3580, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-nutrition|5": { + "hashes": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "1a3b843e66efd29b", + "hash_cont_tokens": "65005bd7d6f6012a" + }, + "truncated": 0, + "non_truncated": 306, + "padded": 1224, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-philosophy|5": { + "hashes": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "09820001a3d00013", + "hash_cont_tokens": "0b47934fb6314dec" + }, + "truncated": 0, + "non_truncated": 311, + "padded": 1244, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-prehistory|5": { + "hashes": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "7c4ec364ce2768c7", + "hash_cont_tokens": "3f20acd855ee0a29" + }, + "truncated": 0, + "non_truncated": 324, + "padded": 1296, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_accounting|5": { + "hashes": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "ced0534574d0ae3f", + "hash_cont_tokens": "8f122ba881355d4b" + }, + "truncated": 0, + "non_truncated": 282, + "padded": 1128, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_law|5": { + "hashes": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "bcbdbbde22ec73e3", + "hash_cont_tokens": "90d5df417c4d3fd3" + }, + "truncated": 0, + "non_truncated": 1534, + "padded": 6136, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + 
"harness|hendrycksTest-professional_medicine|5": { + "hashes": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "c54d753563114d45", + "hash_cont_tokens": "4a2d2988884f7f70" + }, + "truncated": 0, + "non_truncated": 272, + "padded": 1088, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_psychology|5": { + "hashes": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "b75dc55c0e32fa52", + "hash_cont_tokens": "e0a952cb8a9c81de" + }, + "truncated": 0, + "non_truncated": 612, + "padded": 2448, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-public_relations|5": { + "hashes": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "5ccdc8ec8db99622", + "hash_cont_tokens": "1fa77a8dff3922b8" + }, + "truncated": 0, + "non_truncated": 110, + "padded": 440, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-security_studies|5": { + "hashes": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "ca8497342e5b1d57", + "hash_cont_tokens": "81fc9cb3cbdd52db" + }, + "truncated": 0, + "non_truncated": 245, + "padded": 980, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-sociology|5": { + "hashes": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "069c76424fbd3dab", + "hash_cont_tokens": "2a0493252ed2cf43" + }, + "truncated": 0, + "non_truncated": 201, + "padded": 804, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hashes": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "a7e393a626169576", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-virology|5": { + "hashes": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "bf99dc973e3a650d", + "hash_cont_tokens": "5ab892d003b00c98" + }, + "truncated": 0, + "non_truncated": 166, + "padded": 664, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-world_religions|5": { + "hashes": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "1761cfaf21797065", + "hash_cont_tokens": "15a5e5dbdfbb8568" + }, + "truncated": 0, + "non_truncated": 171, + "padded": 684, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|truthfulqa:mc|0": { + "hashes": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "298b43914bbdf4ca", + "hash_cont_tokens": "5a8d4bb398b1c3c0" + }, + "truncated": 0, + "non_truncated": 817, + "padded": 9996, + "non_padded": 0, + "effective_few_shots": 0.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "31aa3477d959f771", + 
"hash_cont_tokens": "618558fb93c0f288" + }, + "truncated": 0, + "non_truncated": 1267, + "padded": 2534, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "6af0ae8cfe684f50", + "hash_cont_tokens": "f15aa021499015d7" + }, + "truncated": 0, + "non_truncated": 1319, + "padded": 0, + "non_padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "3b7fa57a057f9415", + "hash_full_prompts": "63615fc50fc9417c", + "hash_input_tokens": "9c04e828ae29cacc", + "hash_cont_tokens": "d12acd7df7622099" + }, + "truncated": 0, + "non_truncated": 28659, + "padded": 113460, + "non_padded": 1412, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/uukuguy/speechless-code-mistral-orca-7b-v1.0/results_2023-10-11T06-17-39.611971.json b/eval-results/uukuguy/speechless-code-mistral-orca-7b-v1.0/results_2023-10-11T06-17-39.611971.json new file mode 100644 index 0000000000000000000000000000000000000000..b30585e2520af2bbbad51ba3f926a0035c26acd5 --- /dev/null +++ b/eval-results/uukuguy/speechless-code-mistral-orca-7b-v1.0/results_2023-10-11T06-17-39.611971.json @@ -0,0 +1,1367 @@ +{ + "config_general": { + "model_name": "uukuguy/speechless-code-mistral-orca-7b-v1.0", + "model_sha": "f7db67fe6c82657b35d0ffcf8b7ff1568d979482", + "model_size": "13.99 GB", + "model_dtype": "torch.float16", + "lighteval_sha": "0f318ecf002208468154899217b3ba7c6ae09374", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|arc:challenge|25": { + "acc": 0.5597269624573379, + "acc_stderr": 0.014506769524804241, + "acc_norm": 0.5964163822525598, + "acc_norm_stderr": 0.01433715891426845 + }, + "harness|hellaswag|10": { + "acc": 0.6258713403704441, + "acc_stderr": 0.004829081532826503, + "acc_norm": 0.8225453096992631, + "acc_norm_stderr": 0.0038127222858557745 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.21, + "acc_stderr": 0.040936018074033256, + "acc_norm": 0.21, + "acc_norm_stderr": 0.040936018074033256 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.6, + "acc_stderr": 0.04232073695151589, + "acc_norm": 0.6, + "acc_norm_stderr": 0.04232073695151589 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.6776315789473685, + "acc_stderr": 0.03803510248351585, + "acc_norm": 0.6776315789473685, + "acc_norm_stderr": 0.03803510248351585 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.57, + "acc_stderr": 0.049756985195624284, + "acc_norm": 0.57, + "acc_norm_stderr": 0.049756985195624284 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.6452830188679245, + "acc_stderr": 0.02944517532819959, + "acc_norm": 0.6452830188679245, + "acc_norm_stderr": 0.02944517532819959 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.7569444444444444, + "acc_stderr": 0.0358687928008034, + "acc_norm": 0.7569444444444444, + "acc_norm_stderr": 0.0358687928008034 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.47, + "acc_stderr": 0.050161355804659205, + "acc_norm": 0.47, + "acc_norm_stderr": 0.050161355804659205 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.48, + "acc_stderr": 0.050211673156867795, + "acc_norm": 0.48, + "acc_norm_stderr": 0.050211673156867795 + }, + 
"harness|hendrycksTest-college_mathematics|5": { + "acc": 0.28, + "acc_stderr": 0.04512608598542127, + "acc_norm": 0.28, + "acc_norm_stderr": 0.04512608598542127 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.6242774566473989, + "acc_stderr": 0.036928207672648664, + "acc_norm": 0.6242774566473989, + "acc_norm_stderr": 0.036928207672648664 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.3431372549019608, + "acc_stderr": 0.04724007352383888, + "acc_norm": 0.3431372549019608, + "acc_norm_stderr": 0.04724007352383888 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.78, + "acc_stderr": 0.041633319989322605, + "acc_norm": 0.78, + "acc_norm_stderr": 0.041633319989322605 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.5361702127659574, + "acc_stderr": 0.032600385118357715, + "acc_norm": 0.5361702127659574, + "acc_norm_stderr": 0.032600385118357715 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.4649122807017544, + "acc_stderr": 0.046920083813689104, + "acc_norm": 0.4649122807017544, + "acc_norm_stderr": 0.046920083813689104 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.5586206896551724, + "acc_stderr": 0.04137931034482758, + "acc_norm": 0.5586206896551724, + "acc_norm_stderr": 0.04137931034482758 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.42857142857142855, + "acc_stderr": 0.02548718714785938, + "acc_norm": 0.42857142857142855, + "acc_norm_stderr": 0.02548718714785938 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.42063492063492064, + "acc_stderr": 0.04415438226743744, + "acc_norm": 0.42063492063492064, + "acc_norm_stderr": 0.04415438226743744 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.25, + "acc_stderr": 0.04351941398892446, + "acc_norm": 0.25, + "acc_norm_stderr": 0.04351941398892446 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.7354838709677419, + "acc_stderr": 0.02509189237885928, + "acc_norm": 0.7354838709677419, + "acc_norm_stderr": 0.02509189237885928 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.4482758620689655, + "acc_stderr": 0.034991131376767445, + "acc_norm": 0.4482758620689655, + "acc_norm_stderr": 0.034991131376767445 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.68, + "acc_stderr": 0.04688261722621504, + "acc_norm": 0.68, + "acc_norm_stderr": 0.04688261722621504 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.7515151515151515, + "acc_stderr": 0.033744026441394036, + "acc_norm": 0.7515151515151515, + "acc_norm_stderr": 0.033744026441394036 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.7828282828282829, + "acc_stderr": 0.029376616484945633, + "acc_norm": 0.7828282828282829, + "acc_norm_stderr": 0.029376616484945633 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.8704663212435233, + "acc_stderr": 0.024233532297758723, + "acc_norm": 0.8704663212435233, + "acc_norm_stderr": 0.024233532297758723 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.6307692307692307, + "acc_stderr": 0.02446861524147892, + "acc_norm": 0.6307692307692307, + "acc_norm_stderr": 0.02446861524147892 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.2962962962962963, + "acc_stderr": 0.027840811495871934, + "acc_norm": 0.2962962962962963, + "acc_norm_stderr": 0.027840811495871934 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 
0.6218487394957983, + "acc_stderr": 0.031499305777849054, + "acc_norm": 0.6218487394957983, + "acc_norm_stderr": 0.031499305777849054 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.3509933774834437, + "acc_stderr": 0.03896981964257375, + "acc_norm": 0.3509933774834437, + "acc_norm_stderr": 0.03896981964257375 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.8165137614678899, + "acc_stderr": 0.016595259710399317, + "acc_norm": 0.8165137614678899, + "acc_norm_stderr": 0.016595259710399317 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.5231481481481481, + "acc_stderr": 0.034063153607115086, + "acc_norm": 0.5231481481481481, + "acc_norm_stderr": 0.034063153607115086 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.7892156862745098, + "acc_stderr": 0.02862654791243741, + "acc_norm": 0.7892156862745098, + "acc_norm_stderr": 0.02862654791243741 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.7805907172995781, + "acc_stderr": 0.026939106581553945, + "acc_norm": 0.7805907172995781, + "acc_norm_stderr": 0.026939106581553945 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.695067264573991, + "acc_stderr": 0.030898610882477515, + "acc_norm": 0.695067264573991, + "acc_norm_stderr": 0.030898610882477515 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.732824427480916, + "acc_stderr": 0.038808483010823944, + "acc_norm": 0.732824427480916, + "acc_norm_stderr": 0.038808483010823944 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.7768595041322314, + "acc_stderr": 0.03800754475228732, + "acc_norm": 0.7768595041322314, + "acc_norm_stderr": 0.03800754475228732 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.7222222222222222, + "acc_stderr": 0.0433004374965074, + "acc_norm": 0.7222222222222222, + "acc_norm_stderr": 0.0433004374965074 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.7361963190184049, + "acc_stderr": 0.03462419931615623, + "acc_norm": 0.7361963190184049, + "acc_norm_stderr": 0.03462419931615623 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.5, + "acc_stderr": 0.04745789978762494, + "acc_norm": 0.5, + "acc_norm_stderr": 0.04745789978762494 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.8155339805825242, + "acc_stderr": 0.03840423627288276, + "acc_norm": 0.8155339805825242, + "acc_norm_stderr": 0.03840423627288276 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.8547008547008547, + "acc_stderr": 0.023086635086841403, + "acc_norm": 0.8547008547008547, + "acc_norm_stderr": 0.023086635086841403 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.67, + "acc_stderr": 0.047258156262526094, + "acc_norm": 0.67, + "acc_norm_stderr": 0.047258156262526094 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.7943805874840357, + "acc_stderr": 0.01445250045678583, + "acc_norm": 0.7943805874840357, + "acc_norm_stderr": 0.01445250045678583 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.6820809248554913, + "acc_stderr": 0.025070713719153176, + "acc_norm": 0.6820809248554913, + "acc_norm_stderr": 0.025070713719153176 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.30726256983240224, + "acc_stderr": 0.015430158846469613, + "acc_norm": 0.30726256983240224, + "acc_norm_stderr": 0.015430158846469613 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.6503267973856209, + "acc_stderr": 0.0273053080762747, + "acc_norm": 0.6503267973856209, + 
"acc_norm_stderr": 0.0273053080762747 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.6688102893890675, + "acc_stderr": 0.026730620728004903, + "acc_norm": 0.6688102893890675, + "acc_norm_stderr": 0.026730620728004903 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.7160493827160493, + "acc_stderr": 0.025089478523765137, + "acc_norm": 0.7160493827160493, + "acc_norm_stderr": 0.025089478523765137 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.450354609929078, + "acc_stderr": 0.029680105565029036, + "acc_norm": 0.450354609929078, + "acc_norm_stderr": 0.029680105565029036 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.47327249022164275, + "acc_stderr": 0.012751977967676006, + "acc_norm": 0.47327249022164275, + "acc_norm_stderr": 0.012751977967676006 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.6176470588235294, + "acc_stderr": 0.029520095697687765, + "acc_norm": 0.6176470588235294, + "acc_norm_stderr": 0.029520095697687765 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.6617647058823529, + "acc_stderr": 0.01913994374848704, + "acc_norm": 0.6617647058823529, + "acc_norm_stderr": 0.01913994374848704 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.6454545454545455, + "acc_stderr": 0.045820048415054174, + "acc_norm": 0.6454545454545455, + "acc_norm_stderr": 0.045820048415054174 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.6448979591836734, + "acc_stderr": 0.030635655150387638, + "acc_norm": 0.6448979591836734, + "acc_norm_stderr": 0.030635655150387638 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.8208955223880597, + "acc_stderr": 0.027113286753111837, + "acc_norm": 0.8208955223880597, + "acc_norm_stderr": 0.027113286753111837 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.81, + "acc_stderr": 0.03942772444036624, + "acc_norm": 0.81, + "acc_norm_stderr": 0.03942772444036624 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.4939759036144578, + "acc_stderr": 0.03892212195333045, + "acc_norm": 0.4939759036144578, + "acc_norm_stderr": 0.03892212195333045 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.8421052631578947, + "acc_stderr": 0.027966785859160893, + "acc_norm": 0.8421052631578947, + "acc_norm_stderr": 0.027966785859160893 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.32558139534883723, + "mc1_stderr": 0.016403989469907825, + "mc2": 0.4845050397609622, + "mc2_stderr": 0.015055762233491225 + }, + "all": { + "acc": 0.6125832240314191, + "acc_stderr": 0.033242123824563874, + "acc_norm": 0.6165385357114876, + "acc_norm_stderr": 0.03322202264053834, + "mc1": 0.32558139534883723, + "mc1_stderr": 0.016403989469907825, + "mc2": 0.4845050397609622, + "mc2_stderr": 0.015055762233491225 + } + }, + "versions": { + "harness|arc:challenge|25": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + 
"harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "all": 0 + }, + "config_tasks": { + "harness|arc:challenge": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + 
"harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task" + }, + "summary_tasks": { + "harness|arc:challenge|25": { + "hashes": { + "hash_examples": "17b0cae357c0259e", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "e43adcaa871b1364", + "hash_cont_tokens": "289aa98c400841d8" + }, + "truncated": 0, + "non-truncated": 4687, + "padded": 4684, + "non-padded": 3, + "effective_few_shots": 25.0, + "num_truncated_few_shots": 0 + }, + "harness|hellaswag|10": { + "hashes": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "08da6b3d0798f3e5", + "hash_cont_tokens": "ac460260c3e6efc9" + }, + "truncated": 0, + "non-truncated": 40168, + "padded": 40039, + "non-padded": 129, + "effective_few_shots": 10.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hashes": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "5e2b26eb9b4d08bf", + "hash_cont_tokens": "17b868b63507f9a3" 
+ }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-anatomy|5": { + "hashes": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "d33cda9df28030eb", + "hash_cont_tokens": "a52a4f60d98cbe5c" + }, + "truncated": 0, + "non-truncated": 540, + "padded": 540, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-astronomy|5": { + "hashes": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "0dd50c500d64c57d", + "hash_cont_tokens": "10f7d8eeba97841d" + }, + "truncated": 0, + "non-truncated": 608, + "padded": 608, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-business_ethics|5": { + "hashes": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "40b524d0df3defc2", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hashes": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "1f87d12d677e0dfd", + "hash_cont_tokens": "edef9975ba9165b5" + }, + "truncated": 0, + "non-truncated": 1060, + "padded": 1056, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_biology|5": { + "hashes": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "dd6d69d8b13afbeb", + "hash_cont_tokens": "0aa103ec6602280b" + }, + "truncated": 0, + "non-truncated": 576, + "padded": 572, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_chemistry|5": { + "hashes": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "d45f3c401a00e97e", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_computer_science|5": { + "hashes": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "c04f21d954ae67b2", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_mathematics|5": { + "hashes": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "e7de03b4e1a407d8", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_medicine|5": { + "hashes": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "9ce9516475f0b09c", + "hash_cont_tokens": "1979021dbc698754" + }, + "truncated": 0, + "non-truncated": 692, + "padded": 684, + "non-padded": 8, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + 
"harness|hendrycksTest-college_physics|5": { + "hashes": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "f749592a0d6c967d", + "hash_cont_tokens": "7cf7fe2bab00acbd" + }, + "truncated": 0, + "non-truncated": 408, + "padded": 404, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-computer_security|5": { + "hashes": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "1a6dccf2066f3598", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hashes": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "6ce98c8aec8e7514", + "hash_cont_tokens": "903f64eed2b0d217" + }, + "truncated": 0, + "non-truncated": 940, + "padded": 940, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-econometrics|5": { + "hashes": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "7794b03bf6b9bb11", + "hash_cont_tokens": "721ae6c5302c4bf2" + }, + "truncated": 0, + "non-truncated": 456, + "padded": 456, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hashes": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "e47ff85e05850517", + "hash_cont_tokens": "15a738960ed3e587" + }, + "truncated": 0, + "non-truncated": 580, + "padded": 580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hashes": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "2ce6901704311790", + "hash_cont_tokens": "c96470462fc71683" + }, + "truncated": 0, + "non-truncated": 1512, + "padded": 1512, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-formal_logic|5": { + "hashes": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "fa49c3faa72a3955", + "hash_cont_tokens": "0e1ce025c9d6ee7e" + }, + "truncated": 0, + "non-truncated": 504, + "padded": 504, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-global_facts|5": { + "hashes": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "38992a391c7040d5", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 396, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_biology|5": { + "hashes": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "4944fad6e0578120", + "hash_cont_tokens": "e34d57f7d3c4ca16" + }, + "truncated": 0, + "non-truncated": 1240, + "padded": 1240, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hashes": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + 
"hash_input_tokens": "bec955dfccee0331", + "hash_cont_tokens": "e8482d44df4b3740" + }, + "truncated": 0, + "non-truncated": 812, + "padded": 796, + "non-padded": 16, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hashes": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "2ccfe020e0a8e824", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hashes": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "5e5e8bf3808e0ead", + "hash_cont_tokens": "d63e679a49418339" + }, + "truncated": 0, + "non-truncated": 660, + "padded": 656, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_geography|5": { + "hashes": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "6a624d76e1b40f9d", + "hash_cont_tokens": "d78483e286d06f1a" + }, + "truncated": 0, + "non-truncated": 792, + "padded": 792, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hashes": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "8340aed0285230f4", + "hash_cont_tokens": "691cdff71ff5fe57" + }, + "truncated": 0, + "non-truncated": 772, + "padded": 772, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hashes": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "ca47137b1f3a769c", + "hash_cont_tokens": "d5ad4c5bdca967ad" + }, + "truncated": 0, + "non-truncated": 1560, + "padded": 1560, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hashes": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "c9d341ab62890f30", + "hash_cont_tokens": "8f631ca5687dd0d4" + }, + "truncated": 0, + "non-truncated": 1080, + "padded": 1080, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hashes": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "62573d06618ae7df", + "hash_cont_tokens": "7321048a28451473" + }, + "truncated": 0, + "non-truncated": 952, + "padded": 952, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_physics|5": { + "hashes": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "ddddcaae96263221", + "hash_cont_tokens": "bb137581f269861c" + }, + "truncated": 0, + "non-truncated": 604, + "padded": 604, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hashes": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "ef9c1ae343139fdd", + "hash_cont_tokens": 
"b455cab2675bd863" + }, + "truncated": 0, + "non-truncated": 2180, + "padded": 2161, + "non-padded": 19, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hashes": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "eb4abd87b0e863cc", + "hash_cont_tokens": "1b3196fec7e58037" + }, + "truncated": 0, + "non-truncated": 864, + "padded": 864, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hashes": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "63548c7fa9ba7a78", + "hash_cont_tokens": "a331dedc2aa01b3e" + }, + "truncated": 0, + "non-truncated": 816, + "padded": 816, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hashes": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "83c5da18bfa50812", + "hash_cont_tokens": "d0fbe030b8c8c2bf" + }, + "truncated": 0, + "non-truncated": 948, + "padded": 948, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_aging|5": { + "hashes": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "c93c778cb8c58a32", + "hash_cont_tokens": "1dd29c3755494850" + }, + "truncated": 0, + "non-truncated": 892, + "padded": 892, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_sexuality|5": { + "hashes": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "1daed91f54b42f7d", + "hash_cont_tokens": "c85573f663c10691" + }, + "truncated": 0, + "non-truncated": 524, + "padded": 524, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-international_law|5": { + "hashes": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "cfdae69f75ee8670", + "hash_cont_tokens": "d263804ba918154f" + }, + "truncated": 0, + "non-truncated": 484, + "padded": 484, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-jurisprudence|5": { + "hashes": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "173979adbb5ab44e", + "hash_cont_tokens": "581986691a84ece8" + }, + "truncated": 0, + "non-truncated": 432, + "padded": 432, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hashes": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "7b7d06271aff55ff", + "hash_cont_tokens": "55a858b28bbda458" + }, + "truncated": 0, + "non-truncated": 652, + "padded": 652, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-machine_learning|5": { + "hashes": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "ca062cfd7c7fddcb", + "hash_cont_tokens": "e99d3d3efd4ac7a3" + }, + "truncated": 0, + "non-truncated": 448, + "padded": 445, + "non-padded": 3, + "effective_few_shots": 5.0, + 
"num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-management|5": { + "hashes": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "fc47171ffb714da3", + "hash_cont_tokens": "13d9dc56bca34726" + }, + "truncated": 0, + "non-truncated": 412, + "padded": 412, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-marketing|5": { + "hashes": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "aa29e9d883670c8f", + "hash_cont_tokens": "2700ea26933916a2" + }, + "truncated": 0, + "non-truncated": 936, + "padded": 936, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-medical_genetics|5": { + "hashes": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "88ad044b653ecaa5", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-miscellaneous|5": { + "hashes": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "f9e7e01573277484", + "hash_cont_tokens": "7bf4341c79587250" + }, + "truncated": 0, + "non-truncated": 3132, + "padded": 3132, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_disputes|5": { + "hashes": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "03728b9e48594c28", + "hash_cont_tokens": "38a48e9de6976f00" + }, + "truncated": 0, + "non-truncated": 1384, + "padded": 1360, + "non-padded": 24, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hashes": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "04a903966514d177", + "hash_cont_tokens": "761c4dc187689d89" + }, + "truncated": 0, + "non-truncated": 3580, + "padded": 3580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-nutrition|5": { + "hashes": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "a2176d3ac6f01cf0", + "hash_cont_tokens": "65005bd7d6f6012a" + }, + "truncated": 0, + "non-truncated": 1224, + "padded": 1224, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-philosophy|5": { + "hashes": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "a96dc872948245a8", + "hash_cont_tokens": "0b47934fb6314dec" + }, + "truncated": 0, + "non-truncated": 1244, + "padded": 1244, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-prehistory|5": { + "hashes": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "e0b03637947e9efa", + "hash_cont_tokens": "3f20acd855ee0a29" + }, + "truncated": 0, + "non-truncated": 1296, + "padded": 1296, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_accounting|5": { + "hashes": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + 
"hash_input_tokens": "0b4c6d0e49c47ab4", + "hash_cont_tokens": "8f122ba881355d4b" + }, + "truncated": 0, + "non-truncated": 1128, + "padded": 1128, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_law|5": { + "hashes": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "bcbdbbde22ec73e3", + "hash_cont_tokens": "90d5df417c4d3fd3" + }, + "truncated": 0, + "non-truncated": 6136, + "padded": 6136, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_medicine|5": { + "hashes": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "c54d753563114d45", + "hash_cont_tokens": "4a2d2988884f7f70" + }, + "truncated": 0, + "non-truncated": 1088, + "padded": 1088, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_psychology|5": { + "hashes": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "9e6e34f48034edc0", + "hash_cont_tokens": "e0a952cb8a9c81de" + }, + "truncated": 0, + "non-truncated": 2448, + "padded": 2448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-public_relations|5": { + "hashes": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "634feb3f97d1064d", + "hash_cont_tokens": "1fa77a8dff3922b8" + }, + "truncated": 0, + "non-truncated": 440, + "padded": 440, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-security_studies|5": { + "hashes": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "ca8497342e5b1d57", + "hash_cont_tokens": "81fc9cb3cbdd52db" + }, + "truncated": 0, + "non-truncated": 980, + "padded": 980, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-sociology|5": { + "hashes": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "ae361375c940a0fb", + "hash_cont_tokens": "2a0493252ed2cf43" + }, + "truncated": 0, + "non-truncated": 804, + "padded": 800, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hashes": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "e8bdf33cf82d89f5", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-virology|5": { + "hashes": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "32ce831e0ba2d2e2", + "hash_cont_tokens": "5ab892d003b00c98" + }, + "truncated": 0, + "non-truncated": 664, + "padded": 664, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-world_religions|5": { + "hashes": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "4ed9b68c5694211b", + "hash_cont_tokens": "15a5e5dbdfbb8568" + }, + "truncated": 0, + "non-truncated": 684, + "padded": 684, + 
"non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|truthfulqa:mc|0": { + "hashes": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "a30fbd9af05d717a", + "hash_cont_tokens": "5a8d4bb398b1c3c0" + }, + "truncated": 0, + "non-truncated": 9996, + "padded": 9996, + "non-padded": 0, + "effective_few_shots": 0.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "d84d18e9a963753d", + "hash_full_prompts": "12b540783521a8e6", + "hash_input_tokens": "3d86ffeb7677bd9d", + "hash_cont_tokens": "35527140510ee91a" + }, + "total_evaluation_time_secondes": "4148.854902267456", + "truncated": 0, + "non-truncated": 111019, + "padded": 110793, + "non-padded": 226, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/uukuguy/speechless-code-mistral-orca-7b-v1.0/results_2023-10-24T15-07-12.352820.json b/eval-results/uukuguy/speechless-code-mistral-orca-7b-v1.0/results_2023-10-24T15-07-12.352820.json new file mode 100644 index 0000000000000000000000000000000000000000..133fc184cd4e1f2570efa2b242684ff320151473 --- /dev/null +++ b/eval-results/uukuguy/speechless-code-mistral-orca-7b-v1.0/results_2023-10-24T15-07-12.352820.json @@ -0,0 +1,107 @@ +{ + "config_general": { + "model_name": "uukuguy/speechless-code-mistral-orca-7b-v1.0", + "model_sha": "8e1ba0da80219f8d1ed1858cd5e5ffa1d2b36875", + "model_size": "13.99 GB", + "model_dtype": "torch.float16", + "lighteval_sha": "0f318ecf002208468154899217b3ba7c6ae09374", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|drop|3": { + "em": 0.4526006711409396, + "em_stderr": 0.005097407791242309, + "f1": 0.4989010067114103, + "f1_stderr": 0.004905672332696013 + }, + "harness|gsm8k|5": { + "acc": 0.08263836239575435, + "acc_stderr": 0.0075840892201481476 + }, + "harness|winogrande|5": { + "acc": 0.7750591949486977, + "acc_stderr": 0.01173504356412673 + }, + "all": { + "em": 0.4526006711409396, + "em_stderr": 0.005097407791242309, + "f1": 0.4989010067114103, + "f1_stderr": 0.004905672332696013, + "acc": 0.42884877867222604, + "acc_stderr": 0.009659566392137438 + } + }, + "versions": { + "harness|drop|3": 1, + "harness|gsm8k|5": 0, + "harness|winogrande|5": 0, + "all": 0 + }, + "config_tasks": { + "harness|drop": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|drop|3": { + "hashes": { + "hash_examples": "1d27416e8324e9a3", + "hash_full_prompts": "a5513ff9a741b385", + "hash_input_tokens": "a4fb946366902edf", + "hash_cont_tokens": "06217d317eeb244d" + }, + "truncated": 0, + "non-truncated": 9536, + "padded": 0, + "non-padded": 9536, + "effective_few_shots": 3.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "6af0ae8cfe684f50", + "hash_cont_tokens": "b0099f6dd8ca1210" + }, + "truncated": 0, + "non-truncated": 1319, + "padded": 0, + "non-padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "6bf335f26fed6442", + "hash_cont_tokens": "618558fb93c0f288" + }, + "truncated": 0, + "non-truncated": 2534, + "padded": 
2397, + "non-padded": 137, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "9b4d8993161e637d", + "hash_full_prompts": "08215e527b7e60a5", + "hash_input_tokens": "7c6dc027ca91c1a8", + "hash_cont_tokens": "927d8f8030fbe0fa" + }, + "total_evaluation_time_secondes": "7523.266224861145", + "truncated": 0, + "non-truncated": 13389, + "padded": 2397, + "non-padded": 10992, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/uukuguy/speechless-codellama-34b-v1.9/results_2023-10-08T20-44-59.061253.json b/eval-results/uukuguy/speechless-codellama-34b-v1.9/results_2023-10-08T20-44-59.061253.json new file mode 100644 index 0000000000000000000000000000000000000000..924bb87d72ca8c96125f34c773139983e237ee33 --- /dev/null +++ b/eval-results/uukuguy/speechless-codellama-34b-v1.9/results_2023-10-08T20-44-59.061253.json @@ -0,0 +1,1367 @@ +{ + "config_general": { + "model_name": "uukuguy/speechless-codellama-34b-v1.9", + "model_sha": "68aad9f8452b2abf7d5415d48c09bd55d5b7ca05", + "model_size": "63.23 GB", + "model_dtype": "torch.bfloat16", + "lighteval_sha": "0f318ecf002208468154899217b3ba7c6ae09374", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|arc:challenge|25": { + "acc": 0.5017064846416383, + "acc_stderr": 0.01461130570505698, + "acc_norm": 0.5426621160409556, + "acc_norm_stderr": 0.01455810654392406 + }, + "harness|hellaswag|10": { + "acc": 0.5581557458673571, + "acc_stderr": 0.004955914693717968, + "acc_norm": 0.7520414260107549, + "acc_norm_stderr": 0.004309451164956192 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.35, + "acc_stderr": 0.04793724854411021, + "acc_norm": 0.35, + "acc_norm_stderr": 0.04793724854411021 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.4222222222222222, + "acc_stderr": 0.042667634040995814, + "acc_norm": 0.4222222222222222, + "acc_norm_stderr": 0.042667634040995814 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.6052631578947368, + "acc_stderr": 0.039777499346220734, + "acc_norm": 0.6052631578947368, + "acc_norm_stderr": 0.039777499346220734 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.57, + "acc_stderr": 0.049756985195624284, + "acc_norm": 0.57, + "acc_norm_stderr": 0.049756985195624284 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.5132075471698113, + "acc_stderr": 0.030762134874500476, + "acc_norm": 0.5132075471698113, + "acc_norm_stderr": 0.030762134874500476 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.5347222222222222, + "acc_stderr": 0.04171115858181618, + "acc_norm": 0.5347222222222222, + "acc_norm_stderr": 0.04171115858181618 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.49, + "acc_stderr": 0.05024183937956911, + "acc_norm": 0.49, + "acc_norm_stderr": 0.05024183937956911 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.51, + "acc_stderr": 0.05024183937956911, + "acc_norm": 0.51, + "acc_norm_stderr": 0.05024183937956911 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.38, + "acc_stderr": 0.048783173121456316, + "acc_norm": 0.38, + "acc_norm_stderr": 0.048783173121456316 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.4624277456647399, + "acc_stderr": 0.0380168510452446, + "acc_norm": 0.4624277456647399, + "acc_norm_stderr": 0.0380168510452446 + }, + 
"harness|hendrycksTest-college_physics|5": { + "acc": 0.35294117647058826, + "acc_stderr": 0.047551296160629475, + "acc_norm": 0.35294117647058826, + "acc_norm_stderr": 0.047551296160629475 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.68, + "acc_stderr": 0.04688261722621504, + "acc_norm": 0.68, + "acc_norm_stderr": 0.04688261722621504 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.4851063829787234, + "acc_stderr": 0.032671518489247764, + "acc_norm": 0.4851063829787234, + "acc_norm_stderr": 0.032671518489247764 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.4298245614035088, + "acc_stderr": 0.046570472605949625, + "acc_norm": 0.4298245614035088, + "acc_norm_stderr": 0.046570472605949625 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.5379310344827586, + "acc_stderr": 0.04154659671707548, + "acc_norm": 0.5379310344827586, + "acc_norm_stderr": 0.04154659671707548 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.41534391534391535, + "acc_stderr": 0.02537952491077839, + "acc_norm": 0.41534391534391535, + "acc_norm_stderr": 0.02537952491077839 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.4365079365079365, + "acc_stderr": 0.04435932892851466, + "acc_norm": 0.4365079365079365, + "acc_norm_stderr": 0.04435932892851466 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.4, + "acc_stderr": 0.04923659639173309, + "acc_norm": 0.4, + "acc_norm_stderr": 0.04923659639173309 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.6451612903225806, + "acc_stderr": 0.027218889773308767, + "acc_norm": 0.6451612903225806, + "acc_norm_stderr": 0.027218889773308767 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.46798029556650245, + "acc_stderr": 0.03510766597959217, + "acc_norm": 0.46798029556650245, + "acc_norm_stderr": 0.03510766597959217 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.73, + "acc_stderr": 0.04461960433384739, + "acc_norm": 0.73, + "acc_norm_stderr": 0.04461960433384739 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.7090909090909091, + "acc_stderr": 0.03546563019624336, + "acc_norm": 0.7090909090909091, + "acc_norm_stderr": 0.03546563019624336 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.6666666666666666, + "acc_stderr": 0.033586181457325205, + "acc_norm": 0.6666666666666666, + "acc_norm_stderr": 0.033586181457325205 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.7305699481865285, + "acc_stderr": 0.032018671228777947, + "acc_norm": 0.7305699481865285, + "acc_norm_stderr": 0.032018671228777947 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.5205128205128206, + "acc_stderr": 0.02532966316348994, + "acc_norm": 0.5205128205128206, + "acc_norm_stderr": 0.02532966316348994 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.37407407407407406, + "acc_stderr": 0.029502861128955293, + "acc_norm": 0.37407407407407406, + "acc_norm_stderr": 0.029502861128955293 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.5462184873949579, + "acc_stderr": 0.03233943468182088, + "acc_norm": 0.5462184873949579, + "acc_norm_stderr": 0.03233943468182088 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.2913907284768212, + "acc_stderr": 0.03710185726119995, + "acc_norm": 0.2913907284768212, + "acc_norm_stderr": 0.03710185726119995 + }, + 
"harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.7394495412844037, + "acc_stderr": 0.01881918203485007, + "acc_norm": 0.7394495412844037, + "acc_norm_stderr": 0.01881918203485007 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.42592592592592593, + "acc_stderr": 0.03372343271653064, + "acc_norm": 0.42592592592592593, + "acc_norm_stderr": 0.03372343271653064 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.7254901960784313, + "acc_stderr": 0.0313217980308329, + "acc_norm": 0.7254901960784313, + "acc_norm_stderr": 0.0313217980308329 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.7426160337552743, + "acc_stderr": 0.028458820991460302, + "acc_norm": 0.7426160337552743, + "acc_norm_stderr": 0.028458820991460302 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.5874439461883408, + "acc_stderr": 0.03304062175449297, + "acc_norm": 0.5874439461883408, + "acc_norm_stderr": 0.03304062175449297 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.5648854961832062, + "acc_stderr": 0.04348208051644858, + "acc_norm": 0.5648854961832062, + "acc_norm_stderr": 0.04348208051644858 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.7024793388429752, + "acc_stderr": 0.04173349148083499, + "acc_norm": 0.7024793388429752, + "acc_norm_stderr": 0.04173349148083499 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.7037037037037037, + "acc_stderr": 0.044143436668549335, + "acc_norm": 0.7037037037037037, + "acc_norm_stderr": 0.044143436668549335 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.6809815950920245, + "acc_stderr": 0.03661997551073836, + "acc_norm": 0.6809815950920245, + "acc_norm_stderr": 0.03661997551073836 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.4375, + "acc_stderr": 0.04708567521880525, + "acc_norm": 0.4375, + "acc_norm_stderr": 0.04708567521880525 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.6893203883495146, + "acc_stderr": 0.0458212416016155, + "acc_norm": 0.6893203883495146, + "acc_norm_stderr": 0.0458212416016155 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.7991452991452992, + "acc_stderr": 0.02624677294689048, + "acc_norm": 0.7991452991452992, + "acc_norm_stderr": 0.02624677294689048 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.57, + "acc_stderr": 0.049756985195624284, + "acc_norm": 0.57, + "acc_norm_stderr": 0.049756985195624284 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.7177522349936143, + "acc_stderr": 0.016095302969878544, + "acc_norm": 0.7177522349936143, + "acc_norm_stderr": 0.016095302969878544 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.6242774566473989, + "acc_stderr": 0.026074314851657083, + "acc_norm": 0.6242774566473989, + "acc_norm_stderr": 0.026074314851657083 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.4011173184357542, + "acc_stderr": 0.01639222189940708, + "acc_norm": 0.4011173184357542, + "acc_norm_stderr": 0.01639222189940708 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.5555555555555556, + "acc_stderr": 0.02845263998508801, + "acc_norm": 0.5555555555555556, + "acc_norm_stderr": 0.02845263998508801 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.6430868167202572, + "acc_stderr": 0.027210420375934012, + "acc_norm": 0.6430868167202572, + "acc_norm_stderr": 0.027210420375934012 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.6049382716049383, + "acc_stderr": 0.027201117666925657, + 
"acc_norm": 0.6049382716049383, + "acc_norm_stderr": 0.027201117666925657 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.4148936170212766, + "acc_stderr": 0.0293922365846125, + "acc_norm": 0.4148936170212766, + "acc_norm_stderr": 0.0293922365846125 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.4367666232073012, + "acc_stderr": 0.01266770191960366, + "acc_norm": 0.4367666232073012, + "acc_norm_stderr": 0.01266770191960366 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.4485294117647059, + "acc_stderr": 0.030211479609121603, + "acc_norm": 0.4485294117647059, + "acc_norm_stderr": 0.030211479609121603 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.5277777777777778, + "acc_stderr": 0.020196594933541197, + "acc_norm": 0.5277777777777778, + "acc_norm_stderr": 0.020196594933541197 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.6545454545454545, + "acc_stderr": 0.04554619617541054, + "acc_norm": 0.6545454545454545, + "acc_norm_stderr": 0.04554619617541054 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.6938775510204082, + "acc_stderr": 0.029504896454595957, + "acc_norm": 0.6938775510204082, + "acc_norm_stderr": 0.029504896454595957 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.7412935323383084, + "acc_stderr": 0.030965903123573044, + "acc_norm": 0.7412935323383084, + "acc_norm_stderr": 0.030965903123573044 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.76, + "acc_stderr": 0.04292346959909283, + "acc_norm": 0.76, + "acc_norm_stderr": 0.04292346959909283 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.40963855421686746, + "acc_stderr": 0.03828401115079021, + "acc_norm": 0.40963855421686746, + "acc_norm_stderr": 0.03828401115079021 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.7309941520467836, + "acc_stderr": 0.0340105262010409, + "acc_norm": 0.7309941520467836, + "acc_norm_stderr": 0.0340105262010409 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.2974296205630355, + "mc1_stderr": 0.016002651487361002, + "mc2": 0.43920006507019715, + "mc2_stderr": 0.014754612740891303 + }, + "all": { + "acc": 0.5601866295865682, + "acc_stderr": 0.03493787360526325, + "acc_norm": 0.5641669907991567, + "acc_norm_stderr": 0.03492601491560402, + "mc1": 0.2974296205630355, + "mc1_stderr": 0.016002651487361002, + "mc2": 0.43920006507019715, + "mc2_stderr": 0.014754612740891303 + } + }, + "versions": { + "harness|arc:challenge|25": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + 
"harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "all": 0 + }, + "config_tasks": { + "harness|arc:challenge": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + 
"harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task" + }, + "summary_tasks": { + "harness|arc:challenge|25": { + "hashes": { + "hash_examples": "17b0cae357c0259e", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "3722289b79076c44", + "hash_cont_tokens": "e8abf848493b50f7" + }, + "truncated": 0, + "non-truncated": 4687, + "padded": 4687, + "non-padded": 0, + "effective_few_shots": 25.0, + "num_truncated_few_shots": 0 + }, + "harness|hellaswag|10": { + "hashes": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "ececd684171f1ef2", + "hash_cont_tokens": "9fe0a5c42e1532db" + }, + "truncated": 0, + "non-truncated": 40168, + "padded": 40113, + "non-padded": 55, + "effective_few_shots": 10.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hashes": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "c54ff61ad0273dd7", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-anatomy|5": { + "hashes": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "be31a1e22aef5f90", + "hash_cont_tokens": 
"f11971a765cb609f" + }, + "truncated": 0, + "non-truncated": 540, + "padded": 540, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-astronomy|5": { + "hashes": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "277a7b1fad566940", + "hash_cont_tokens": "440a970fadecdc7b" + }, + "truncated": 0, + "non-truncated": 608, + "padded": 608, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-business_ethics|5": { + "hashes": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "ba552605bc116de5", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hashes": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "428c7563d0b98ab9", + "hash_cont_tokens": "7ecd60c25b9bfe5b" + }, + "truncated": 0, + "non-truncated": 1060, + "padded": 1060, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_biology|5": { + "hashes": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "da036601573942e2", + "hash_cont_tokens": "875cde3af7a0ee14" + }, + "truncated": 0, + "non-truncated": 576, + "padded": 576, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_chemistry|5": { + "hashes": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "94e0196d6aded13d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_computer_science|5": { + "hashes": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "6e4d0f4a8d36690b", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_mathematics|5": { + "hashes": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "614054d17109a25d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_medicine|5": { + "hashes": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "081bb2b524defd1c", + "hash_cont_tokens": "702fb6d82ff0d6ac" + }, + "truncated": 0, + "non-truncated": 692, + "padded": 692, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_physics|5": { + "hashes": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "5421d9a1af86cbd4", + "hash_cont_tokens": "f7b8097afc16a47c" + }, + "truncated": 0, + "non-truncated": 408, + "padded": 408, + "non-padded": 0, + "effective_few_shots": 5.0, + 
"num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-computer_security|5": { + "hashes": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "5e6b70ecb333cf18", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hashes": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "c2ef11a87264ceed", + "hash_cont_tokens": "aa0e8bc655f2f641" + }, + "truncated": 0, + "non-truncated": 940, + "padded": 940, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-econometrics|5": { + "hashes": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "ecaccd912a4c3978", + "hash_cont_tokens": "b1cc6e7e9fcd3827" + }, + "truncated": 0, + "non-truncated": 456, + "padded": 456, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hashes": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "1590c84291399be8", + "hash_cont_tokens": "2425a3f084a591ef" + }, + "truncated": 0, + "non-truncated": 580, + "padded": 580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hashes": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "3269597f715b0da1", + "hash_cont_tokens": "bd87bf0c060fd925" + }, + "truncated": 0, + "non-truncated": 1512, + "padded": 1512, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-formal_logic|5": { + "hashes": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "a2800d20f3ab8d7c", + "hash_cont_tokens": "eb8932890e0605db" + }, + "truncated": 0, + "non-truncated": 504, + "padded": 504, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-global_facts|5": { + "hashes": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "94ed44b3772505ad", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_biology|5": { + "hashes": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "24423acb928db768", + "hash_cont_tokens": "1ddcb86d28cde266" + }, + "truncated": 0, + "non-truncated": 1240, + "padded": 1240, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hashes": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "831ff35c474e5cef", + "hash_cont_tokens": "176c8dcff38c5f8f" + }, + "truncated": 0, + "non-truncated": 812, + "padded": 812, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hashes": { + "hash_examples": "8b8cdb1084f24169", 
+ "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "a20a96b44dcc5b30", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hashes": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "5002f4ac8b1562ca", + "hash_cont_tokens": "674fc454bdc5ac93" + }, + "truncated": 0, + "non-truncated": 660, + "padded": 656, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_geography|5": { + "hashes": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "7c5547c7da5bc793", + "hash_cont_tokens": "03a5012b916274ea" + }, + "truncated": 0, + "non-truncated": 792, + "padded": 792, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hashes": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "f62991cb6a496b05", + "hash_cont_tokens": "873d2aab226ba1d8" + }, + "truncated": 0, + "non-truncated": 772, + "padded": 772, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hashes": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "4cef2aff6e3d59ed", + "hash_cont_tokens": "c583432ad27fcfe0" + }, + "truncated": 0, + "non-truncated": 1560, + "padded": 1560, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hashes": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "6e2577ea4082ed2b", + "hash_cont_tokens": "d7907b61bcb8c123" + }, + "truncated": 0, + "non-truncated": 1080, + "padded": 1080, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hashes": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "c5fc9aeb1079c8e4", + "hash_cont_tokens": "f47f041de50333b9" + }, + "truncated": 0, + "non-truncated": 952, + "padded": 952, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_physics|5": { + "hashes": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "555fc385cffa84ca", + "hash_cont_tokens": "0d56317b3e5eedb5" + }, + "truncated": 0, + "non-truncated": 604, + "padded": 604, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hashes": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "febd23cbf9973b7f", + "hash_cont_tokens": "09ba1243e7390c0f" + }, + "truncated": 0, + "non-truncated": 2180, + "padded": 2180, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hashes": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": 
"400e55b56ee6fbd7", + "hash_cont_tokens": "9cc29889c3d3f77d" + }, + "truncated": 0, + "non-truncated": 864, + "padded": 864, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hashes": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "c639cce12a46ebad", + "hash_cont_tokens": "cdd0b3dc06d933e5" + }, + "truncated": 0, + "non-truncated": 816, + "padded": 816, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hashes": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "b9762065cce6f3a6", + "hash_cont_tokens": "e02816433ff28daf" + }, + "truncated": 0, + "non-truncated": 948, + "padded": 948, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_aging|5": { + "hashes": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "541a75f071dcf579", + "hash_cont_tokens": "142a4a8a1138a214" + }, + "truncated": 0, + "non-truncated": 892, + "padded": 892, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_sexuality|5": { + "hashes": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "04269e5c5a257dd9", + "hash_cont_tokens": "bc54813e809b796d" + }, + "truncated": 0, + "non-truncated": 524, + "padded": 524, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-international_law|5": { + "hashes": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "d93ba9d9d38e4397", + "hash_cont_tokens": "8ea8c5ff76a15bca" + }, + "truncated": 0, + "non-truncated": 484, + "padded": 484, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-jurisprudence|5": { + "hashes": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "9eeaccd2698b4f5a", + "hash_cont_tokens": "e3a8cd951b6e3469" + }, + "truncated": 0, + "non-truncated": 432, + "padded": 432, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hashes": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "b4f08f544f2b7576", + "hash_cont_tokens": "3e9e0bdc248fd88a" + }, + "truncated": 0, + "non-truncated": 652, + "padded": 648, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-machine_learning|5": { + "hashes": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "900c2a51f1174b9f", + "hash_cont_tokens": "55b12fb138c6a064" + }, + "truncated": 0, + "non-truncated": 448, + "padded": 448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-management|5": { + "hashes": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "6b36efb4689c6eca", + "hash_cont_tokens": "a01d6d39a83c4597" + }, + "truncated": 0, + "non-truncated": 412, + "padded": 412, + "non-padded": 0, + 
"effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-marketing|5": { + "hashes": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "2aaac78a0cfed47a", + "hash_cont_tokens": "6aeaed4d823c98aa" + }, + "truncated": 0, + "non-truncated": 936, + "padded": 936, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-medical_genetics|5": { + "hashes": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "886ca823b41c094a", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-miscellaneous|5": { + "hashes": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "72fd71de7675e7d0", + "hash_cont_tokens": "9b0ab02a64603081" + }, + "truncated": 0, + "non-truncated": 3132, + "padded": 3132, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_disputes|5": { + "hashes": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "f3ca0dd8e7a1eb09", + "hash_cont_tokens": "3b8bbe9108e55ce9" + }, + "truncated": 0, + "non-truncated": 1384, + "padded": 1354, + "non-padded": 30, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hashes": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "3e793631e951f23c", + "hash_cont_tokens": "3e9bfc0362e97330" + }, + "truncated": 0, + "non-truncated": 3580, + "padded": 3580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-nutrition|5": { + "hashes": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "59753c2144ea93af", + "hash_cont_tokens": "23b2dc6ee2da4cfc" + }, + "truncated": 0, + "non-truncated": 1224, + "padded": 1224, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-philosophy|5": { + "hashes": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "bd8d3dbed15a8c34", + "hash_cont_tokens": "9f6ff69d23a48783" + }, + "truncated": 0, + "non-truncated": 1244, + "padded": 1244, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-prehistory|5": { + "hashes": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "3573cd87facbb7c5", + "hash_cont_tokens": "d6458d743d875837" + }, + "truncated": 0, + "non-truncated": 1296, + "padded": 1296, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_accounting|5": { + "hashes": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "17e721bc1a7cbb47", + "hash_cont_tokens": "922a195f53a35662" + }, + "truncated": 0, + "non-truncated": 1128, + "padded": 1128, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_law|5": { + "hashes": { + "hash_examples": "cd9dbc52b3c932d6", + 
"hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "c9f7583fff66d361", + "hash_cont_tokens": "2e590029ef41fbcd" + }, + "truncated": 0, + "non-truncated": 6136, + "padded": 6136, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_medicine|5": { + "hashes": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "40a933f829116f8d", + "hash_cont_tokens": "7cfee54dbddd5a98" + }, + "truncated": 0, + "non-truncated": 1088, + "padded": 1088, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_psychology|5": { + "hashes": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "0dfb73a8eb3f692c", + "hash_cont_tokens": "a86677b2a45c20e1" + }, + "truncated": 0, + "non-truncated": 2448, + "padded": 2448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-public_relations|5": { + "hashes": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "1710c6ba4c9f3cbd", + "hash_cont_tokens": "0d756ccaae031757" + }, + "truncated": 0, + "non-truncated": 440, + "padded": 440, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-security_studies|5": { + "hashes": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "32a03f1f22a6e103", + "hash_cont_tokens": "b2229bc2cfbf594b" + }, + "truncated": 0, + "non-truncated": 980, + "padded": 980, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-sociology|5": { + "hashes": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "828999f7624cbe7e", + "hash_cont_tokens": "c3a3bdfd177eed5b" + }, + "truncated": 0, + "non-truncated": 804, + "padded": 804, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hashes": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "42054621e718dbee", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-virology|5": { + "hashes": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "6c4f0aa4dc859c04", + "hash_cont_tokens": "af8b3658088cb37f" + }, + "truncated": 0, + "non-truncated": 664, + "padded": 664, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-world_religions|5": { + "hashes": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "6c75d44e092ff24f", + "hash_cont_tokens": "060118bef6de4e0a" + }, + "truncated": 0, + "non-truncated": 684, + "padded": 684, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|truthfulqa:mc|0": { + "hashes": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "2738d7ed7075faa7", + "hash_cont_tokens": "f5da56a132aab151" + }, + "truncated": 0, + "non-truncated": 9996, + 
"padded": 9996, + "non-padded": 0, + "effective_few_shots": 0.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "d84d18e9a963753d", + "hash_full_prompts": "12b540783521a8e6", + "hash_input_tokens": "5c73a7dce6ccf737", + "hash_cont_tokens": "71d56183130fecbd" + }, + "total_evaluation_time_secondes": "25581.405127763748", + "truncated": 0, + "non-truncated": 111019, + "padded": 110926, + "non-padded": 93, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/uukuguy/speechless-codellama-34b-v1.9/results_2023-10-28T13-29-15.296218.json b/eval-results/uukuguy/speechless-codellama-34b-v1.9/results_2023-10-28T13-29-15.296218.json new file mode 100644 index 0000000000000000000000000000000000000000..6b729d501197e13bda497b1abf4c0d160e50f408 --- /dev/null +++ b/eval-results/uukuguy/speechless-codellama-34b-v1.9/results_2023-10-28T13-29-15.296218.json @@ -0,0 +1,107 @@ +{ + "config_general": { + "model_name": "uukuguy/speechless-codellama-34b-v1.9", + "model_sha": "62919cbe41182df3515c341e0a53da008b44d7b5", + "model_size": "63.23 GB", + "model_dtype": "torch.bfloat16", + "lighteval_sha": "0f318ecf002208468154899217b3ba7c6ae09374", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|drop|3": { + "em": 0.29771392617449666, + "em_stderr": 0.004682699129958643, + "f1": 0.3473626258389263, + "f1_stderr": 0.004601090689469596 + }, + "harness|gsm8k|5": { + "acc": 0.24791508718726307, + "acc_stderr": 0.01189398021482617 + }, + "harness|winogrande|5": { + "acc": 0.7355958958168903, + "acc_stderr": 0.012394724896983799 + }, + "all": { + "em": 0.29771392617449666, + "em_stderr": 0.004682699129958643, + "f1": 0.3473626258389263, + "f1_stderr": 0.004601090689469596, + "acc": 0.4917554915020767, + "acc_stderr": 0.012144352555904984 + } + }, + "versions": { + "harness|drop|3": 1, + "harness|gsm8k|5": 0, + "harness|winogrande|5": 0, + "all": 0 + }, + "config_tasks": { + "harness|drop": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|drop|3": { + "hashes": { + "hash_examples": "1d27416e8324e9a3", + "hash_full_prompts": "a5513ff9a741b385", + "hash_input_tokens": "fbe785c62d941897", + "hash_cont_tokens": "feb652ac4cc20e2a" + }, + "truncated": 0, + "non-truncated": 9536, + "padded": 0, + "non-padded": 9536, + "effective_few_shots": 3.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "bda342e47b5099b2", + "hash_cont_tokens": "70d0ec57b7e43521" + }, + "truncated": 0, + "non-truncated": 1319, + "padded": 0, + "non-padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "c0bedf98cb040854", + "hash_cont_tokens": "f08975ad6f2d5864" + }, + "truncated": 0, + "non-truncated": 2534, + "padded": 2432, + "non-padded": 102, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "9b4d8993161e637d", + "hash_full_prompts": "08215e527b7e60a5", + "hash_input_tokens": "9f72a6bc6743d18f", + "hash_cont_tokens": "3b94c09dba7666ff" + }, + "total_evaluation_time_secondes": "21773.727779865265", + 
"truncated": 0, + "non-truncated": 13389, + "padded": 2432, + "non-padded": 10957, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/uukuguy/speechless-codellama-34b-v2.0/results_2023-10-08T21-55-38.209151.json b/eval-results/uukuguy/speechless-codellama-34b-v2.0/results_2023-10-08T21-55-38.209151.json new file mode 100644 index 0000000000000000000000000000000000000000..42b24ffcf5b4c139d5d939b88aaabafcd9d856fc --- /dev/null +++ b/eval-results/uukuguy/speechless-codellama-34b-v2.0/results_2023-10-08T21-55-38.209151.json @@ -0,0 +1,1367 @@ +{ + "config_general": { + "model_name": "uukuguy/speechless-codellama-34b-v2.0", + "model_sha": "cb81174d72dbe06f8db1c406ef97981532de6f09", + "model_size": "63.23 GB", + "model_dtype": "torch.bfloat16", + "lighteval_sha": "0f318ecf002208468154899217b3ba7c6ae09374", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|arc:challenge|25": { + "acc": 0.5119453924914675, + "acc_stderr": 0.014607220340597171, + "acc_norm": 0.5435153583617748, + "acc_norm_stderr": 0.01455594976049644 + }, + "harness|hellaswag|10": { + "acc": 0.5631348336984664, + "acc_stderr": 0.004949842967331427, + "acc_norm": 0.7565226050587532, + "acc_norm_stderr": 0.004283036686282379 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.3, + "acc_stderr": 0.046056618647183814, + "acc_norm": 0.3, + "acc_norm_stderr": 0.046056618647183814 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.4148148148148148, + "acc_stderr": 0.042561937679014075, + "acc_norm": 0.4148148148148148, + "acc_norm_stderr": 0.042561937679014075 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.5723684210526315, + "acc_stderr": 0.04026097083296562, + "acc_norm": 0.5723684210526315, + "acc_norm_stderr": 0.04026097083296562 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.54, + "acc_stderr": 0.05009082659620332, + "acc_norm": 0.54, + "acc_norm_stderr": 0.05009082659620332 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.49056603773584906, + "acc_stderr": 0.0307673947078081, + "acc_norm": 0.49056603773584906, + "acc_norm_stderr": 0.0307673947078081 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.5625, + "acc_stderr": 0.04148415739394154, + "acc_norm": 0.5625, + "acc_norm_stderr": 0.04148415739394154 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.43, + "acc_stderr": 0.04975698519562428, + "acc_norm": 0.43, + "acc_norm_stderr": 0.04975698519562428 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.51, + "acc_stderr": 0.05024183937956912, + "acc_norm": 0.51, + "acc_norm_stderr": 0.05024183937956912 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.35, + "acc_stderr": 0.04793724854411019, + "acc_norm": 0.35, + "acc_norm_stderr": 0.04793724854411019 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.4393063583815029, + "acc_stderr": 0.037842719328874674, + "acc_norm": 0.4393063583815029, + "acc_norm_stderr": 0.037842719328874674 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.27450980392156865, + "acc_stderr": 0.044405219061793275, + "acc_norm": 0.27450980392156865, + "acc_norm_stderr": 0.044405219061793275 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.68, + "acc_stderr": 0.04688261722621504, + "acc_norm": 0.68, + "acc_norm_stderr": 0.04688261722621504 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 
0.48936170212765956, + "acc_stderr": 0.03267862331014063, + "acc_norm": 0.48936170212765956, + "acc_norm_stderr": 0.03267862331014063 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.38596491228070173, + "acc_stderr": 0.04579639422070434, + "acc_norm": 0.38596491228070173, + "acc_norm_stderr": 0.04579639422070434 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.5103448275862069, + "acc_stderr": 0.04165774775728763, + "acc_norm": 0.5103448275862069, + "acc_norm_stderr": 0.04165774775728763 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.42063492063492064, + "acc_stderr": 0.025424835086923992, + "acc_norm": 0.42063492063492064, + "acc_norm_stderr": 0.025424835086923992 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.42063492063492064, + "acc_stderr": 0.04415438226743743, + "acc_norm": 0.42063492063492064, + "acc_norm_stderr": 0.04415438226743743 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.34, + "acc_stderr": 0.047609522856952365, + "acc_norm": 0.34, + "acc_norm_stderr": 0.047609522856952365 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.6419354838709678, + "acc_stderr": 0.02727389059430064, + "acc_norm": 0.6419354838709678, + "acc_norm_stderr": 0.02727389059430064 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.4088669950738916, + "acc_stderr": 0.034590588158832314, + "acc_norm": 0.4088669950738916, + "acc_norm_stderr": 0.034590588158832314 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.66, + "acc_stderr": 0.04760952285695237, + "acc_norm": 0.66, + "acc_norm_stderr": 0.04760952285695237 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.7151515151515152, + "acc_stderr": 0.03524390844511781, + "acc_norm": 0.7151515151515152, + "acc_norm_stderr": 0.03524390844511781 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.696969696969697, + "acc_stderr": 0.032742879140268674, + "acc_norm": 0.696969696969697, + "acc_norm_stderr": 0.032742879140268674 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.7357512953367875, + "acc_stderr": 0.031821550509166456, + "acc_norm": 0.7357512953367875, + "acc_norm_stderr": 0.031821550509166456 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.4948717948717949, + "acc_stderr": 0.025349672906838653, + "acc_norm": 0.4948717948717949, + "acc_norm_stderr": 0.025349672906838653 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.2814814814814815, + "acc_stderr": 0.02742001935094527, + "acc_norm": 0.2814814814814815, + "acc_norm_stderr": 0.02742001935094527 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.5504201680672269, + "acc_stderr": 0.03231293497137707, + "acc_norm": 0.5504201680672269, + "acc_norm_stderr": 0.03231293497137707 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.33112582781456956, + "acc_stderr": 0.038425817186598696, + "acc_norm": 0.33112582781456956, + "acc_norm_stderr": 0.038425817186598696 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.7339449541284404, + "acc_stderr": 0.018946022322225604, + "acc_norm": 0.7339449541284404, + "acc_norm_stderr": 0.018946022322225604 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.44907407407407407, + "acc_stderr": 0.03392238405321616, + "acc_norm": 0.44907407407407407, + "acc_norm_stderr": 0.03392238405321616 + }, + "harness|hendrycksTest-high_school_us_history|5": 
{ + "acc": 0.7401960784313726, + "acc_stderr": 0.03077855467869326, + "acc_norm": 0.7401960784313726, + "acc_norm_stderr": 0.03077855467869326 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.759493670886076, + "acc_stderr": 0.02782078198114969, + "acc_norm": 0.759493670886076, + "acc_norm_stderr": 0.02782078198114969 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.5695067264573991, + "acc_stderr": 0.033231973029429394, + "acc_norm": 0.5695067264573991, + "acc_norm_stderr": 0.033231973029429394 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.4961832061068702, + "acc_stderr": 0.043851623256015534, + "acc_norm": 0.4961832061068702, + "acc_norm_stderr": 0.043851623256015534 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.7107438016528925, + "acc_stderr": 0.04139112727635463, + "acc_norm": 0.7107438016528925, + "acc_norm_stderr": 0.04139112727635463 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.6481481481481481, + "acc_stderr": 0.04616631111801713, + "acc_norm": 0.6481481481481481, + "acc_norm_stderr": 0.04616631111801713 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.6932515337423313, + "acc_stderr": 0.036230899157241474, + "acc_norm": 0.6932515337423313, + "acc_norm_stderr": 0.036230899157241474 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.41964285714285715, + "acc_stderr": 0.04684099321077106, + "acc_norm": 0.41964285714285715, + "acc_norm_stderr": 0.04684099321077106 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.7087378640776699, + "acc_stderr": 0.04498676320572924, + "acc_norm": 0.7087378640776699, + "acc_norm_stderr": 0.04498676320572924 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.7905982905982906, + "acc_stderr": 0.026655699653922744, + "acc_norm": 0.7905982905982906, + "acc_norm_stderr": 0.026655699653922744 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.54, + "acc_stderr": 0.05009082659620332, + "acc_norm": 0.54, + "acc_norm_stderr": 0.05009082659620332 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.7126436781609196, + "acc_stderr": 0.0161824107306827, + "acc_norm": 0.7126436781609196, + "acc_norm_stderr": 0.0161824107306827 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.5924855491329479, + "acc_stderr": 0.026454578146931505, + "acc_norm": 0.5924855491329479, + "acc_norm_stderr": 0.026454578146931505 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.38324022346368714, + "acc_stderr": 0.016260159604429128, + "acc_norm": 0.38324022346368714, + "acc_norm_stderr": 0.016260159604429128 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.5588235294117647, + "acc_stderr": 0.028431095444176643, + "acc_norm": 0.5588235294117647, + "acc_norm_stderr": 0.028431095444176643 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.6302250803858521, + "acc_stderr": 0.027417996705630998, + "acc_norm": 0.6302250803858521, + "acc_norm_stderr": 0.027417996705630998 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.6080246913580247, + "acc_stderr": 0.027163686038271146, + "acc_norm": 0.6080246913580247, + "acc_norm_stderr": 0.027163686038271146 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.3971631205673759, + "acc_stderr": 0.029189805673587102, + "acc_norm": 0.3971631205673759, + "acc_norm_stderr": 0.029189805673587102 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.41395045632333766, + "acc_stderr": 0.012579699631289265, + "acc_norm": 
0.41395045632333766, + "acc_norm_stderr": 0.012579699631289265 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.4742647058823529, + "acc_stderr": 0.03033257809455504, + "acc_norm": 0.4742647058823529, + "acc_norm_stderr": 0.03033257809455504 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.5, + "acc_stderr": 0.020227834851568375, + "acc_norm": 0.5, + "acc_norm_stderr": 0.020227834851568375 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.6727272727272727, + "acc_stderr": 0.0449429086625209, + "acc_norm": 0.6727272727272727, + "acc_norm_stderr": 0.0449429086625209 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.6693877551020408, + "acc_stderr": 0.030116426296540603, + "acc_norm": 0.6693877551020408, + "acc_norm_stderr": 0.030116426296540603 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.7562189054726368, + "acc_stderr": 0.030360490154014635, + "acc_norm": 0.7562189054726368, + "acc_norm_stderr": 0.030360490154014635 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.74, + "acc_stderr": 0.04408440022768079, + "acc_norm": 0.74, + "acc_norm_stderr": 0.04408440022768079 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.42771084337349397, + "acc_stderr": 0.038515976837185335, + "acc_norm": 0.42771084337349397, + "acc_norm_stderr": 0.038515976837185335 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.7192982456140351, + "acc_stderr": 0.034462962170884265, + "acc_norm": 0.7192982456140351, + "acc_norm_stderr": 0.034462962170884265 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.30354957160342716, + "mc1_stderr": 0.016095884155386854, + "mc2": 0.45211730267295, + "mc2_stderr": 0.014761798027932871 + }, + "all": { + "acc": 0.5464126519732339, + "acc_stderr": 0.03484059078525412, + "acc_norm": 0.5502254949771422, + "acc_norm_stderr": 0.034828419990997346, + "mc1": 0.30354957160342716, + "mc1_stderr": 0.016095884155386854, + "mc2": 0.45211730267295, + "mc2_stderr": 0.014761798027932871 + } + }, + "versions": { + "harness|arc:challenge|25": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + 
"harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "all": 0 + }, + "config_tasks": { + "harness|arc:challenge": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + 
"harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task" + }, + "summary_tasks": { + "harness|arc:challenge|25": { + "hashes": { + "hash_examples": "17b0cae357c0259e", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "3722289b79076c44", + "hash_cont_tokens": "e8abf848493b50f7" + }, + "truncated": 0, + "non-truncated": 4687, + "padded": 4687, + "non-padded": 0, + "effective_few_shots": 25.0, + "num_truncated_few_shots": 0 + }, + "harness|hellaswag|10": { + "hashes": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "ececd684171f1ef2", + "hash_cont_tokens": "9fe0a5c42e1532db" + }, + "truncated": 0, + "non-truncated": 40168, + "padded": 40113, + "non-padded": 55, + "effective_few_shots": 10.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hashes": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "c54ff61ad0273dd7", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-anatomy|5": { + "hashes": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "be31a1e22aef5f90", + "hash_cont_tokens": "f11971a765cb609f" + }, + "truncated": 0, + "non-truncated": 540, + "padded": 540, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-astronomy|5": { + "hashes": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "277a7b1fad566940", + "hash_cont_tokens": "440a970fadecdc7b" + }, + "truncated": 0, + "non-truncated": 608, + 
"padded": 608, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-business_ethics|5": { + "hashes": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "ba552605bc116de5", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hashes": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "428c7563d0b98ab9", + "hash_cont_tokens": "7ecd60c25b9bfe5b" + }, + "truncated": 0, + "non-truncated": 1060, + "padded": 1060, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_biology|5": { + "hashes": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "da036601573942e2", + "hash_cont_tokens": "875cde3af7a0ee14" + }, + "truncated": 0, + "non-truncated": 576, + "padded": 576, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_chemistry|5": { + "hashes": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "94e0196d6aded13d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_computer_science|5": { + "hashes": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "6e4d0f4a8d36690b", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_mathematics|5": { + "hashes": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "614054d17109a25d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_medicine|5": { + "hashes": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "081bb2b524defd1c", + "hash_cont_tokens": "702fb6d82ff0d6ac" + }, + "truncated": 0, + "non-truncated": 692, + "padded": 692, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_physics|5": { + "hashes": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "5421d9a1af86cbd4", + "hash_cont_tokens": "f7b8097afc16a47c" + }, + "truncated": 0, + "non-truncated": 408, + "padded": 408, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-computer_security|5": { + "hashes": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "5e6b70ecb333cf18", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-conceptual_physics|5": 
{ + "hashes": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "c2ef11a87264ceed", + "hash_cont_tokens": "aa0e8bc655f2f641" + }, + "truncated": 0, + "non-truncated": 940, + "padded": 940, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-econometrics|5": { + "hashes": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "ecaccd912a4c3978", + "hash_cont_tokens": "b1cc6e7e9fcd3827" + }, + "truncated": 0, + "non-truncated": 456, + "padded": 456, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hashes": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "1590c84291399be8", + "hash_cont_tokens": "2425a3f084a591ef" + }, + "truncated": 0, + "non-truncated": 580, + "padded": 580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hashes": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "3269597f715b0da1", + "hash_cont_tokens": "bd87bf0c060fd925" + }, + "truncated": 0, + "non-truncated": 1512, + "padded": 1512, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-formal_logic|5": { + "hashes": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "a2800d20f3ab8d7c", + "hash_cont_tokens": "eb8932890e0605db" + }, + "truncated": 0, + "non-truncated": 504, + "padded": 504, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-global_facts|5": { + "hashes": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "94ed44b3772505ad", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_biology|5": { + "hashes": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "24423acb928db768", + "hash_cont_tokens": "1ddcb86d28cde266" + }, + "truncated": 0, + "non-truncated": 1240, + "padded": 1240, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hashes": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "831ff35c474e5cef", + "hash_cont_tokens": "176c8dcff38c5f8f" + }, + "truncated": 0, + "non-truncated": 812, + "padded": 812, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hashes": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "a20a96b44dcc5b30", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hashes": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": 
"5002f4ac8b1562ca", + "hash_cont_tokens": "674fc454bdc5ac93" + }, + "truncated": 0, + "non-truncated": 660, + "padded": 656, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_geography|5": { + "hashes": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "7c5547c7da5bc793", + "hash_cont_tokens": "03a5012b916274ea" + }, + "truncated": 0, + "non-truncated": 792, + "padded": 792, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hashes": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "f62991cb6a496b05", + "hash_cont_tokens": "873d2aab226ba1d8" + }, + "truncated": 0, + "non-truncated": 772, + "padded": 772, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hashes": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "4cef2aff6e3d59ed", + "hash_cont_tokens": "c583432ad27fcfe0" + }, + "truncated": 0, + "non-truncated": 1560, + "padded": 1560, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hashes": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "6e2577ea4082ed2b", + "hash_cont_tokens": "d7907b61bcb8c123" + }, + "truncated": 0, + "non-truncated": 1080, + "padded": 1080, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hashes": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "c5fc9aeb1079c8e4", + "hash_cont_tokens": "f47f041de50333b9" + }, + "truncated": 0, + "non-truncated": 952, + "padded": 952, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_physics|5": { + "hashes": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "555fc385cffa84ca", + "hash_cont_tokens": "0d56317b3e5eedb5" + }, + "truncated": 0, + "non-truncated": 604, + "padded": 604, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hashes": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "febd23cbf9973b7f", + "hash_cont_tokens": "09ba1243e7390c0f" + }, + "truncated": 0, + "non-truncated": 2180, + "padded": 2180, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hashes": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "400e55b56ee6fbd7", + "hash_cont_tokens": "9cc29889c3d3f77d" + }, + "truncated": 0, + "non-truncated": 864, + "padded": 864, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hashes": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "c639cce12a46ebad", + "hash_cont_tokens": "cdd0b3dc06d933e5" + }, + "truncated": 0, 
+ "non-truncated": 816, + "padded": 816, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hashes": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "b9762065cce6f3a6", + "hash_cont_tokens": "e02816433ff28daf" + }, + "truncated": 0, + "non-truncated": 948, + "padded": 948, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_aging|5": { + "hashes": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "541a75f071dcf579", + "hash_cont_tokens": "142a4a8a1138a214" + }, + "truncated": 0, + "non-truncated": 892, + "padded": 892, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_sexuality|5": { + "hashes": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "04269e5c5a257dd9", + "hash_cont_tokens": "bc54813e809b796d" + }, + "truncated": 0, + "non-truncated": 524, + "padded": 524, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-international_law|5": { + "hashes": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "d93ba9d9d38e4397", + "hash_cont_tokens": "8ea8c5ff76a15bca" + }, + "truncated": 0, + "non-truncated": 484, + "padded": 484, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-jurisprudence|5": { + "hashes": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "9eeaccd2698b4f5a", + "hash_cont_tokens": "e3a8cd951b6e3469" + }, + "truncated": 0, + "non-truncated": 432, + "padded": 432, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hashes": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "b4f08f544f2b7576", + "hash_cont_tokens": "3e9e0bdc248fd88a" + }, + "truncated": 0, + "non-truncated": 652, + "padded": 648, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-machine_learning|5": { + "hashes": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "900c2a51f1174b9f", + "hash_cont_tokens": "55b12fb138c6a064" + }, + "truncated": 0, + "non-truncated": 448, + "padded": 448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-management|5": { + "hashes": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "6b36efb4689c6eca", + "hash_cont_tokens": "a01d6d39a83c4597" + }, + "truncated": 0, + "non-truncated": 412, + "padded": 412, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-marketing|5": { + "hashes": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "2aaac78a0cfed47a", + "hash_cont_tokens": "6aeaed4d823c98aa" + }, + "truncated": 0, + "non-truncated": 936, + "padded": 936, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-medical_genetics|5": 
{ + "hashes": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "886ca823b41c094a", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-miscellaneous|5": { + "hashes": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "72fd71de7675e7d0", + "hash_cont_tokens": "9b0ab02a64603081" + }, + "truncated": 0, + "non-truncated": 3132, + "padded": 3132, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_disputes|5": { + "hashes": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "f3ca0dd8e7a1eb09", + "hash_cont_tokens": "3b8bbe9108e55ce9" + }, + "truncated": 0, + "non-truncated": 1384, + "padded": 1354, + "non-padded": 30, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hashes": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "3e793631e951f23c", + "hash_cont_tokens": "3e9bfc0362e97330" + }, + "truncated": 0, + "non-truncated": 3580, + "padded": 3580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-nutrition|5": { + "hashes": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "59753c2144ea93af", + "hash_cont_tokens": "23b2dc6ee2da4cfc" + }, + "truncated": 0, + "non-truncated": 1224, + "padded": 1224, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-philosophy|5": { + "hashes": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "bd8d3dbed15a8c34", + "hash_cont_tokens": "9f6ff69d23a48783" + }, + "truncated": 0, + "non-truncated": 1244, + "padded": 1244, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-prehistory|5": { + "hashes": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "3573cd87facbb7c5", + "hash_cont_tokens": "d6458d743d875837" + }, + "truncated": 0, + "non-truncated": 1296, + "padded": 1296, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_accounting|5": { + "hashes": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "17e721bc1a7cbb47", + "hash_cont_tokens": "922a195f53a35662" + }, + "truncated": 0, + "non-truncated": 1128, + "padded": 1128, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_law|5": { + "hashes": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "c9f7583fff66d361", + "hash_cont_tokens": "2e590029ef41fbcd" + }, + "truncated": 0, + "non-truncated": 6136, + "padded": 6136, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_medicine|5": { + "hashes": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "40a933f829116f8d", + 
"hash_cont_tokens": "7cfee54dbddd5a98" + }, + "truncated": 0, + "non-truncated": 1088, + "padded": 1088, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_psychology|5": { + "hashes": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "0dfb73a8eb3f692c", + "hash_cont_tokens": "a86677b2a45c20e1" + }, + "truncated": 0, + "non-truncated": 2448, + "padded": 2448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-public_relations|5": { + "hashes": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "1710c6ba4c9f3cbd", + "hash_cont_tokens": "0d756ccaae031757" + }, + "truncated": 0, + "non-truncated": 440, + "padded": 440, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-security_studies|5": { + "hashes": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "32a03f1f22a6e103", + "hash_cont_tokens": "b2229bc2cfbf594b" + }, + "truncated": 0, + "non-truncated": 980, + "padded": 980, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-sociology|5": { + "hashes": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "828999f7624cbe7e", + "hash_cont_tokens": "c3a3bdfd177eed5b" + }, + "truncated": 0, + "non-truncated": 804, + "padded": 804, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hashes": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "42054621e718dbee", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-virology|5": { + "hashes": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "6c4f0aa4dc859c04", + "hash_cont_tokens": "af8b3658088cb37f" + }, + "truncated": 0, + "non-truncated": 664, + "padded": 664, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-world_religions|5": { + "hashes": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "6c75d44e092ff24f", + "hash_cont_tokens": "060118bef6de4e0a" + }, + "truncated": 0, + "non-truncated": 684, + "padded": 684, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|truthfulqa:mc|0": { + "hashes": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "2738d7ed7075faa7", + "hash_cont_tokens": "f5da56a132aab151" + }, + "truncated": 0, + "non-truncated": 9996, + "padded": 9996, + "non-padded": 0, + "effective_few_shots": 0.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "d84d18e9a963753d", + "hash_full_prompts": "12b540783521a8e6", + "hash_input_tokens": "5c73a7dce6ccf737", + "hash_cont_tokens": "71d56183130fecbd" + }, + "total_evaluation_time_secondes": "25598.604048252106", + "truncated": 0, + "non-truncated": 111019, + "padded": 110926, + "non-padded": 93, + 
"num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/uukuguy/speechless-codellama-34b-v2.0/results_2023-10-23T15-35-47.826162.json b/eval-results/uukuguy/speechless-codellama-34b-v2.0/results_2023-10-23T15-35-47.826162.json new file mode 100644 index 0000000000000000000000000000000000000000..acd5bfa7f8af8fde378885240ed79dddc2433416 --- /dev/null +++ b/eval-results/uukuguy/speechless-codellama-34b-v2.0/results_2023-10-23T15-35-47.826162.json @@ -0,0 +1,107 @@ +{ + "config_general": { + "model_name": "uukuguy/speechless-codellama-34b-v2.0", + "model_sha": "e55b493220980988e18940bc71b5cfeded917a07", + "model_size": "63.23 GB", + "model_dtype": "torch.bfloat16", + "lighteval_sha": "0f318ecf002208468154899217b3ba7c6ae09374", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|drop|3": { + "em": 0.3704907718120805, + "em_stderr": 0.004945718565106882, + "f1": 0.4170574664429539, + "f1_stderr": 0.004815998685057963 + }, + "harness|gsm8k|5": { + "acc": 0.11599696739954511, + "acc_stderr": 0.008820485491442485 + }, + "harness|winogrande|5": { + "acc": 0.7355958958168903, + "acc_stderr": 0.012394724896983799 + }, + "all": { + "em": 0.3704907718120805, + "em_stderr": 0.004945718565106882, + "f1": 0.4170574664429539, + "f1_stderr": 0.004815998685057963, + "acc": 0.42579643160821773, + "acc_stderr": 0.010607605194213141 + } + }, + "versions": { + "harness|drop|3": 1, + "harness|gsm8k|5": 0, + "harness|winogrande|5": 0, + "all": 0 + }, + "config_tasks": { + "harness|drop": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|drop|3": { + "hashes": { + "hash_examples": "1d27416e8324e9a3", + "hash_full_prompts": "a5513ff9a741b385", + "hash_input_tokens": "fbe785c62d941897", + "hash_cont_tokens": "b5049dede51b8487" + }, + "truncated": 0, + "non-truncated": 9536, + "padded": 0, + "non-padded": 9536, + "effective_few_shots": 3.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "bda342e47b5099b2", + "hash_cont_tokens": "037b2abc6e04e9cf" + }, + "truncated": 0, + "non-truncated": 1319, + "padded": 0, + "non-padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "c0bedf98cb040854", + "hash_cont_tokens": "f08975ad6f2d5864" + }, + "truncated": 0, + "non-truncated": 2534, + "padded": 2432, + "non-padded": 102, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "9b4d8993161e637d", + "hash_full_prompts": "08215e527b7e60a5", + "hash_input_tokens": "9f72a6bc6743d18f", + "hash_cont_tokens": "db9079078bf27f46" + }, + "total_evaluation_time_secondes": "18064.06504011154", + "truncated": 0, + "non-truncated": 13389, + "padded": 2432, + "non-padded": 10957, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/uukuguy/speechless-codellama-dolphin-orca-platypus-13b/results_2023-09-11T11-46-04.714895.json b/eval-results/uukuguy/speechless-codellama-dolphin-orca-platypus-13b/results_2023-09-11T11-46-04.714895.json new file mode 100644 index 
0000000000000000000000000000000000000000..e94c902561a5a15bc4fa4d45ddf3fa56b1e7d313 --- /dev/null +++ b/eval-results/uukuguy/speechless-codellama-dolphin-orca-platypus-13b/results_2023-09-11T11-46-04.714895.json @@ -0,0 +1,1367 @@ +{ + "config_general": { + "model_name": "uukuguy/speechless-codellama-dolphin-orca-platypus-13b", + "model_sha": "0c41023f8f665946a2c46c3823afee431408bcbd", + "model_size": "24.56 GB", + "model_dtype": "torch.bfloat16", + "lighteval_sha": "ff467795ccc45b291b69333c263d5f16abd1fcd9", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|arc:challenge|25": { + "acc": 0.4129692832764505, + "acc_stderr": 0.014388344935398324, + "acc_norm": 0.44795221843003413, + "acc_norm_stderr": 0.014532011498211669 + }, + "harness|hellaswag|10": { + "acc": 0.5068711412069309, + "acc_stderr": 0.0049893102282761136, + "acc_norm": 0.686018721370245, + "acc_norm_stderr": 0.00463160353975195 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.25, + "acc_stderr": 0.04351941398892446, + "acc_norm": 0.25, + "acc_norm_stderr": 0.04351941398892446 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.362962962962963, + "acc_stderr": 0.04153948404742398, + "acc_norm": 0.362962962962963, + "acc_norm_stderr": 0.04153948404742398 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.4407894736842105, + "acc_stderr": 0.04040311062490436, + "acc_norm": 0.4407894736842105, + "acc_norm_stderr": 0.04040311062490436 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.44, + "acc_stderr": 0.04988876515698589, + "acc_norm": 0.44, + "acc_norm_stderr": 0.04988876515698589 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.3886792452830189, + "acc_stderr": 0.030000485448675986, + "acc_norm": 0.3886792452830189, + "acc_norm_stderr": 0.030000485448675986 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.4513888888888889, + "acc_stderr": 0.04161402398403279, + "acc_norm": 0.4513888888888889, + "acc_norm_stderr": 0.04161402398403279 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.31, + "acc_stderr": 0.04648231987117316, + "acc_norm": 0.31, + "acc_norm_stderr": 0.04648231987117316 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.43, + "acc_stderr": 0.04975698519562428, + "acc_norm": 0.43, + "acc_norm_stderr": 0.04975698519562428 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.41, + "acc_stderr": 0.04943110704237102, + "acc_norm": 0.41, + "acc_norm_stderr": 0.04943110704237102 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.37572254335260113, + "acc_stderr": 0.03692820767264867, + "acc_norm": 0.37572254335260113, + "acc_norm_stderr": 0.03692820767264867 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.23529411764705882, + "acc_stderr": 0.04220773659171453, + "acc_norm": 0.23529411764705882, + "acc_norm_stderr": 0.04220773659171453 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.62, + "acc_stderr": 0.048783173121456316, + "acc_norm": 0.62, + "acc_norm_stderr": 0.048783173121456316 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.39148936170212767, + "acc_stderr": 0.031907012423268113, + "acc_norm": 0.39148936170212767, + "acc_norm_stderr": 0.031907012423268113 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.32456140350877194, + "acc_stderr": 0.04404556157374767, + "acc_norm": 0.32456140350877194, + "acc_norm_stderr": 
0.04404556157374767 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.3931034482758621, + "acc_stderr": 0.0407032901370707, + "acc_norm": 0.3931034482758621, + "acc_norm_stderr": 0.0407032901370707 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.2751322751322751, + "acc_stderr": 0.02300008685906864, + "acc_norm": 0.2751322751322751, + "acc_norm_stderr": 0.02300008685906864 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.42063492063492064, + "acc_stderr": 0.04415438226743744, + "acc_norm": 0.42063492063492064, + "acc_norm_stderr": 0.04415438226743744 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.33, + "acc_stderr": 0.04725815626252604, + "acc_norm": 0.33, + "acc_norm_stderr": 0.04725815626252604 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.4290322580645161, + "acc_stderr": 0.028156036538233217, + "acc_norm": 0.4290322580645161, + "acc_norm_stderr": 0.028156036538233217 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.2660098522167488, + "acc_stderr": 0.031089826002937523, + "acc_norm": 0.2660098522167488, + "acc_norm_stderr": 0.031089826002937523 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.48, + "acc_stderr": 0.050211673156867795, + "acc_norm": 0.48, + "acc_norm_stderr": 0.050211673156867795 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.6424242424242425, + "acc_stderr": 0.03742597043806587, + "acc_norm": 0.6424242424242425, + "acc_norm_stderr": 0.03742597043806587 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.5353535353535354, + "acc_stderr": 0.03553436368828063, + "acc_norm": 0.5353535353535354, + "acc_norm_stderr": 0.03553436368828063 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.5544041450777202, + "acc_stderr": 0.03587014986075659, + "acc_norm": 0.5544041450777202, + "acc_norm_stderr": 0.03587014986075659 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.3230769230769231, + "acc_stderr": 0.023710888501970555, + "acc_norm": 0.3230769230769231, + "acc_norm_stderr": 0.023710888501970555 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.25555555555555554, + "acc_stderr": 0.026593939101844086, + "acc_norm": 0.25555555555555554, + "acc_norm_stderr": 0.026593939101844086 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.42016806722689076, + "acc_stderr": 0.032061837832361516, + "acc_norm": 0.42016806722689076, + "acc_norm_stderr": 0.032061837832361516 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.2913907284768212, + "acc_stderr": 0.03710185726119994, + "acc_norm": 0.2913907284768212, + "acc_norm_stderr": 0.03710185726119994 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.5339449541284403, + "acc_stderr": 0.021387863350353982, + "acc_norm": 0.5339449541284403, + "acc_norm_stderr": 0.021387863350353982 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.2962962962962963, + "acc_stderr": 0.03114144782353602, + "acc_norm": 0.2962962962962963, + "acc_norm_stderr": 0.03114144782353602 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.6323529411764706, + "acc_stderr": 0.03384132045674118, + "acc_norm": 0.6323529411764706, + "acc_norm_stderr": 0.03384132045674118 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.6708860759493671, + "acc_stderr": 0.03058732629470236, + "acc_norm": 0.6708860759493671, + 
"acc_norm_stderr": 0.03058732629470236 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.5022421524663677, + "acc_stderr": 0.033557465352232634, + "acc_norm": 0.5022421524663677, + "acc_norm_stderr": 0.033557465352232634 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.4351145038167939, + "acc_stderr": 0.043482080516448585, + "acc_norm": 0.4351145038167939, + "acc_norm_stderr": 0.043482080516448585 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.6115702479338843, + "acc_stderr": 0.044492703500683836, + "acc_norm": 0.6115702479338843, + "acc_norm_stderr": 0.044492703500683836 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.5, + "acc_stderr": 0.04833682445228318, + "acc_norm": 0.5, + "acc_norm_stderr": 0.04833682445228318 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.49693251533742333, + "acc_stderr": 0.03928297078179663, + "acc_norm": 0.49693251533742333, + "acc_norm_stderr": 0.03928297078179663 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.3125, + "acc_stderr": 0.043994650575715215, + "acc_norm": 0.3125, + "acc_norm_stderr": 0.043994650575715215 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.6407766990291263, + "acc_stderr": 0.047504583990416946, + "acc_norm": 0.6407766990291263, + "acc_norm_stderr": 0.047504583990416946 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.6752136752136753, + "acc_stderr": 0.03067902276549883, + "acc_norm": 0.6752136752136753, + "acc_norm_stderr": 0.03067902276549883 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.44, + "acc_stderr": 0.04988876515698589, + "acc_norm": 0.44, + "acc_norm_stderr": 0.04988876515698589 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.5402298850574713, + "acc_stderr": 0.01782199409693354, + "acc_norm": 0.5402298850574713, + "acc_norm_stderr": 0.01782199409693354 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.430635838150289, + "acc_stderr": 0.02665880027367238, + "acc_norm": 0.430635838150289, + "acc_norm_stderr": 0.02665880027367238 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.3474860335195531, + "acc_stderr": 0.01592556406020815, + "acc_norm": 0.3474860335195531, + "acc_norm_stderr": 0.01592556406020815 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.4117647058823529, + "acc_stderr": 0.028180596328259283, + "acc_norm": 0.4117647058823529, + "acc_norm_stderr": 0.028180596328259283 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.48231511254019294, + "acc_stderr": 0.02838032284907713, + "acc_norm": 0.48231511254019294, + "acc_norm_stderr": 0.02838032284907713 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.4783950617283951, + "acc_stderr": 0.02779476010500874, + "acc_norm": 0.4783950617283951, + "acc_norm_stderr": 0.02779476010500874 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.3546099290780142, + "acc_stderr": 0.02853865002887864, + "acc_norm": 0.3546099290780142, + "acc_norm_stderr": 0.02853865002887864 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.35528031290743156, + "acc_stderr": 0.012223623364044041, + "acc_norm": 0.35528031290743156, + "acc_norm_stderr": 0.012223623364044041 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.3014705882352941, + "acc_stderr": 0.027875982114273168, + "acc_norm": 0.3014705882352941, + "acc_norm_stderr": 0.027875982114273168 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.4019607843137255, + "acc_stderr": 
0.019835176484375373, + "acc_norm": 0.4019607843137255, + "acc_norm_stderr": 0.019835176484375373 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.5636363636363636, + "acc_stderr": 0.04750185058907297, + "acc_norm": 0.5636363636363636, + "acc_norm_stderr": 0.04750185058907297 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.5714285714285714, + "acc_stderr": 0.031680911612338825, + "acc_norm": 0.5714285714285714, + "acc_norm_stderr": 0.031680911612338825 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.5522388059701493, + "acc_stderr": 0.03516184772952167, + "acc_norm": 0.5522388059701493, + "acc_norm_stderr": 0.03516184772952167 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.62, + "acc_stderr": 0.04878317312145633, + "acc_norm": 0.62, + "acc_norm_stderr": 0.04878317312145633 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.3855421686746988, + "acc_stderr": 0.037891344246115496, + "acc_norm": 0.3855421686746988, + "acc_norm_stderr": 0.037891344246115496 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.5029239766081871, + "acc_stderr": 0.03834759370936839, + "acc_norm": 0.5029239766081871, + "acc_norm_stderr": 0.03834759370936839 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.2962056303549572, + "mc1_stderr": 0.015983595101811392, + "mc2": 0.4627912956571528, + "mc2_stderr": 0.01466090570906347 + }, + "all": { + "acc": 0.4409281791882639, + "acc_stderr": 0.035246385009241446, + "acc_norm": 0.44455750995634685, + "acc_norm_stderr": 0.03524275721050058, + "mc1": 0.2962056303549572, + "mc1_stderr": 0.015983595101811392, + "mc2": 0.4627912956571528, + "mc2_stderr": 0.01466090570906347 + } + }, + "versions": { + "harness|arc:challenge|25": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + 
"harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "all": 0 + }, + "config_tasks": { + "harness|arc:challenge": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness 
task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task" + }, + "summary_tasks": { + "harness|arc:challenge|25": { + "hashes": { + "hash_examples": "17b0cae357c0259e", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "3722289b79076c44", + "hash_cont_tokens": "e8abf848493b50f7" + }, + "truncated": 0, + "non-truncated": 4687, + "padded": 4687, + "non-padded": 0, + "effective_few_shots": 25.0, + "num_truncated_few_shots": 0 + }, + "harness|hellaswag|10": { + "hashes": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "ececd684171f1ef2", + "hash_cont_tokens": "9fe0a5c42e1532db" + }, + "truncated": 0, + "non-truncated": 40168, + "padded": 40113, + "non-padded": 55, + "effective_few_shots": 10.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hashes": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "c54ff61ad0273dd7", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-anatomy|5": { + "hashes": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "be31a1e22aef5f90", + "hash_cont_tokens": "f11971a765cb609f" + }, + "truncated": 0, + "non-truncated": 540, + "padded": 540, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-astronomy|5": { + "hashes": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "277a7b1fad566940", + "hash_cont_tokens": "440a970fadecdc7b" + }, + "truncated": 0, + "non-truncated": 608, + "padded": 608, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-business_ethics|5": { + "hashes": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "ba552605bc116de5", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + 
"padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hashes": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "428c7563d0b98ab9", + "hash_cont_tokens": "7ecd60c25b9bfe5b" + }, + "truncated": 0, + "non-truncated": 1060, + "padded": 1060, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_biology|5": { + "hashes": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "da036601573942e2", + "hash_cont_tokens": "875cde3af7a0ee14" + }, + "truncated": 0, + "non-truncated": 576, + "padded": 576, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_chemistry|5": { + "hashes": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "94e0196d6aded13d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_computer_science|5": { + "hashes": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "6e4d0f4a8d36690b", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_mathematics|5": { + "hashes": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "614054d17109a25d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_medicine|5": { + "hashes": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "081bb2b524defd1c", + "hash_cont_tokens": "702fb6d82ff0d6ac" + }, + "truncated": 0, + "non-truncated": 692, + "padded": 692, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_physics|5": { + "hashes": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "5421d9a1af86cbd4", + "hash_cont_tokens": "f7b8097afc16a47c" + }, + "truncated": 0, + "non-truncated": 408, + "padded": 408, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-computer_security|5": { + "hashes": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "5e6b70ecb333cf18", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hashes": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "c2ef11a87264ceed", + "hash_cont_tokens": "aa0e8bc655f2f641" + }, + "truncated": 0, + "non-truncated": 940, + "padded": 940, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-econometrics|5": { 
+ "hashes": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "ecaccd912a4c3978", + "hash_cont_tokens": "b1cc6e7e9fcd3827" + }, + "truncated": 0, + "non-truncated": 456, + "padded": 456, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hashes": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "1590c84291399be8", + "hash_cont_tokens": "2425a3f084a591ef" + }, + "truncated": 0, + "non-truncated": 580, + "padded": 580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hashes": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "3269597f715b0da1", + "hash_cont_tokens": "bd87bf0c060fd925" + }, + "truncated": 0, + "non-truncated": 1512, + "padded": 1512, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-formal_logic|5": { + "hashes": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "a2800d20f3ab8d7c", + "hash_cont_tokens": "eb8932890e0605db" + }, + "truncated": 0, + "non-truncated": 504, + "padded": 504, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-global_facts|5": { + "hashes": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "94ed44b3772505ad", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_biology|5": { + "hashes": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "24423acb928db768", + "hash_cont_tokens": "1ddcb86d28cde266" + }, + "truncated": 0, + "non-truncated": 1240, + "padded": 1240, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hashes": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "831ff35c474e5cef", + "hash_cont_tokens": "176c8dcff38c5f8f" + }, + "truncated": 0, + "non-truncated": 812, + "padded": 812, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hashes": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "a20a96b44dcc5b30", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hashes": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "5002f4ac8b1562ca", + "hash_cont_tokens": "674fc454bdc5ac93" + }, + "truncated": 0, + "non-truncated": 660, + "padded": 656, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_geography|5": { + "hashes": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + 
"hash_input_tokens": "7c5547c7da5bc793", + "hash_cont_tokens": "03a5012b916274ea" + }, + "truncated": 0, + "non-truncated": 792, + "padded": 792, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hashes": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "f62991cb6a496b05", + "hash_cont_tokens": "873d2aab226ba1d8" + }, + "truncated": 0, + "non-truncated": 772, + "padded": 772, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hashes": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "4cef2aff6e3d59ed", + "hash_cont_tokens": "c583432ad27fcfe0" + }, + "truncated": 0, + "non-truncated": 1560, + "padded": 1560, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hashes": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "6e2577ea4082ed2b", + "hash_cont_tokens": "d7907b61bcb8c123" + }, + "truncated": 0, + "non-truncated": 1080, + "padded": 1080, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hashes": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "c5fc9aeb1079c8e4", + "hash_cont_tokens": "f47f041de50333b9" + }, + "truncated": 0, + "non-truncated": 952, + "padded": 952, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_physics|5": { + "hashes": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "555fc385cffa84ca", + "hash_cont_tokens": "0d56317b3e5eedb5" + }, + "truncated": 0, + "non-truncated": 604, + "padded": 604, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hashes": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "febd23cbf9973b7f", + "hash_cont_tokens": "09ba1243e7390c0f" + }, + "truncated": 0, + "non-truncated": 2180, + "padded": 2180, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hashes": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "400e55b56ee6fbd7", + "hash_cont_tokens": "9cc29889c3d3f77d" + }, + "truncated": 0, + "non-truncated": 864, + "padded": 864, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hashes": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "c639cce12a46ebad", + "hash_cont_tokens": "cdd0b3dc06d933e5" + }, + "truncated": 0, + "non-truncated": 816, + "padded": 816, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hashes": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "b9762065cce6f3a6", + "hash_cont_tokens": 
"e02816433ff28daf" + }, + "truncated": 0, + "non-truncated": 948, + "padded": 948, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_aging|5": { + "hashes": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "541a75f071dcf579", + "hash_cont_tokens": "142a4a8a1138a214" + }, + "truncated": 0, + "non-truncated": 892, + "padded": 892, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_sexuality|5": { + "hashes": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "04269e5c5a257dd9", + "hash_cont_tokens": "bc54813e809b796d" + }, + "truncated": 0, + "non-truncated": 524, + "padded": 524, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-international_law|5": { + "hashes": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "d93ba9d9d38e4397", + "hash_cont_tokens": "8ea8c5ff76a15bca" + }, + "truncated": 0, + "non-truncated": 484, + "padded": 484, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-jurisprudence|5": { + "hashes": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "9eeaccd2698b4f5a", + "hash_cont_tokens": "e3a8cd951b6e3469" + }, + "truncated": 0, + "non-truncated": 432, + "padded": 432, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hashes": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "b4f08f544f2b7576", + "hash_cont_tokens": "3e9e0bdc248fd88a" + }, + "truncated": 0, + "non-truncated": 652, + "padded": 648, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-machine_learning|5": { + "hashes": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "900c2a51f1174b9f", + "hash_cont_tokens": "55b12fb138c6a064" + }, + "truncated": 0, + "non-truncated": 448, + "padded": 448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-management|5": { + "hashes": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "6b36efb4689c6eca", + "hash_cont_tokens": "a01d6d39a83c4597" + }, + "truncated": 0, + "non-truncated": 412, + "padded": 412, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-marketing|5": { + "hashes": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "2aaac78a0cfed47a", + "hash_cont_tokens": "6aeaed4d823c98aa" + }, + "truncated": 0, + "non-truncated": 936, + "padded": 936, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-medical_genetics|5": { + "hashes": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "886ca823b41c094a", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + 
"harness|hendrycksTest-miscellaneous|5": { + "hashes": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "72fd71de7675e7d0", + "hash_cont_tokens": "9b0ab02a64603081" + }, + "truncated": 0, + "non-truncated": 3132, + "padded": 3132, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_disputes|5": { + "hashes": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "f3ca0dd8e7a1eb09", + "hash_cont_tokens": "3b8bbe9108e55ce9" + }, + "truncated": 0, + "non-truncated": 1384, + "padded": 1354, + "non-padded": 30, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hashes": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "3e793631e951f23c", + "hash_cont_tokens": "3e9bfc0362e97330" + }, + "truncated": 0, + "non-truncated": 3580, + "padded": 3580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-nutrition|5": { + "hashes": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "59753c2144ea93af", + "hash_cont_tokens": "23b2dc6ee2da4cfc" + }, + "truncated": 0, + "non-truncated": 1224, + "padded": 1224, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-philosophy|5": { + "hashes": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "bd8d3dbed15a8c34", + "hash_cont_tokens": "9f6ff69d23a48783" + }, + "truncated": 0, + "non-truncated": 1244, + "padded": 1244, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-prehistory|5": { + "hashes": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "3573cd87facbb7c5", + "hash_cont_tokens": "d6458d743d875837" + }, + "truncated": 0, + "non-truncated": 1296, + "padded": 1296, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_accounting|5": { + "hashes": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "17e721bc1a7cbb47", + "hash_cont_tokens": "922a195f53a35662" + }, + "truncated": 0, + "non-truncated": 1128, + "padded": 1128, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_law|5": { + "hashes": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "c9f7583fff66d361", + "hash_cont_tokens": "2e590029ef41fbcd" + }, + "truncated": 0, + "non-truncated": 6136, + "padded": 6136, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_medicine|5": { + "hashes": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "40a933f829116f8d", + "hash_cont_tokens": "7cfee54dbddd5a98" + }, + "truncated": 0, + "non-truncated": 1088, + "padded": 1088, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_psychology|5": { + "hashes": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + 
"hash_input_tokens": "0dfb73a8eb3f692c", + "hash_cont_tokens": "a86677b2a45c20e1" + }, + "truncated": 0, + "non-truncated": 2448, + "padded": 2448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-public_relations|5": { + "hashes": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "1710c6ba4c9f3cbd", + "hash_cont_tokens": "0d756ccaae031757" + }, + "truncated": 0, + "non-truncated": 440, + "padded": 440, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-security_studies|5": { + "hashes": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "32a03f1f22a6e103", + "hash_cont_tokens": "b2229bc2cfbf594b" + }, + "truncated": 0, + "non-truncated": 980, + "padded": 980, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-sociology|5": { + "hashes": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "828999f7624cbe7e", + "hash_cont_tokens": "c3a3bdfd177eed5b" + }, + "truncated": 0, + "non-truncated": 804, + "padded": 804, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hashes": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "42054621e718dbee", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-virology|5": { + "hashes": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "6c4f0aa4dc859c04", + "hash_cont_tokens": "af8b3658088cb37f" + }, + "truncated": 0, + "non-truncated": 664, + "padded": 664, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-world_religions|5": { + "hashes": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "6c75d44e092ff24f", + "hash_cont_tokens": "060118bef6de4e0a" + }, + "truncated": 0, + "non-truncated": 684, + "padded": 684, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|truthfulqa:mc|0": { + "hashes": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "2738d7ed7075faa7", + "hash_cont_tokens": "f5da56a132aab151" + }, + "truncated": 0, + "non-truncated": 9996, + "padded": 9996, + "non-padded": 0, + "effective_few_shots": 0.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "d84d18e9a963753d", + "hash_full_prompts": "12b540783521a8e6", + "hash_input_tokens": "5c73a7dce6ccf737", + "hash_cont_tokens": "71d56183130fecbd" + }, + "total_evaluation_time_secondes": "6757.642242908478", + "truncated": 0, + "non-truncated": 111019, + "padded": 110926, + "non-padded": 93, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/uukuguy/speechless-codellama-dolphin-orca-platypus-13b/results_2023-10-23T11-16-46.322538.json b/eval-results/uukuguy/speechless-codellama-dolphin-orca-platypus-13b/results_2023-10-23T11-16-46.322538.json new file mode 100644 index 
0000000000000000000000000000000000000000..33116fd2a72425e8e622296c707505cb23946b6c --- /dev/null +++ b/eval-results/uukuguy/speechless-codellama-dolphin-orca-platypus-13b/results_2023-10-23T11-16-46.322538.json @@ -0,0 +1,107 @@ +{ + "config_general": { + "model_name": "uukuguy/speechless-codellama-dolphin-orca-platypus-13b", + "model_sha": "8544049ab4b7d5327e446e2a9b36637106c8ae53", + "model_size": "24.56 GB", + "model_dtype": "torch.bfloat16", + "lighteval_sha": "0f318ecf002208468154899217b3ba7c6ae09374", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|drop|3": { + "em": 0.2627936241610738, + "em_stderr": 0.004507560917898856, + "f1": 0.30675125838926204, + "f1_stderr": 0.004485653251386068 + }, + "harness|gsm8k|5": { + "acc": 0.09552691432903715, + "acc_stderr": 0.008096605771155735 + }, + "harness|winogrande|5": { + "acc": 0.6692975532754538, + "acc_stderr": 0.0132224358870027 + }, + "all": { + "em": 0.2627936241610738, + "em_stderr": 0.004507560917898856, + "f1": 0.30675125838926204, + "f1_stderr": 0.004485653251386068, + "acc": 0.3824122338022455, + "acc_stderr": 0.010659520829079217 + } + }, + "versions": { + "harness|drop|3": 1, + "harness|gsm8k|5": 0, + "harness|winogrande|5": 0, + "all": 0 + }, + "config_tasks": { + "harness|drop": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|drop|3": { + "hashes": { + "hash_examples": "1d27416e8324e9a3", + "hash_full_prompts": "a5513ff9a741b385", + "hash_input_tokens": "fbe785c62d941897", + "hash_cont_tokens": "72d9a6a105e486b7" + }, + "truncated": 0, + "non-truncated": 9536, + "padded": 0, + "non-padded": 9536, + "effective_few_shots": 3.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "bda342e47b5099b2", + "hash_cont_tokens": "da12cbf59286996b" + }, + "truncated": 0, + "non-truncated": 1319, + "padded": 0, + "non-padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "c0bedf98cb040854", + "hash_cont_tokens": "f08975ad6f2d5864" + }, + "truncated": 0, + "non-truncated": 2534, + "padded": 2432, + "non-padded": 102, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "9b4d8993161e637d", + "hash_full_prompts": "08215e527b7e60a5", + "hash_input_tokens": "9f72a6bc6743d18f", + "hash_cont_tokens": "03eb301109b555b5" + }, + "total_evaluation_time_secondes": "10075.417939186096", + "truncated": 0, + "non-truncated": 13389, + "padded": 2432, + "non-padded": 10957, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/uukuguy/speechless-codellama-dolphin-orca-platypus-34b/results_2023-10-04T00-22-19.968928.json b/eval-results/uukuguy/speechless-codellama-dolphin-orca-platypus-34b/results_2023-10-04T00-22-19.968928.json new file mode 100644 index 0000000000000000000000000000000000000000..a41eb0dace454449a1315c1e43328eae88b3679c --- /dev/null +++ b/eval-results/uukuguy/speechless-codellama-dolphin-orca-platypus-34b/results_2023-10-04T00-22-19.968928.json @@ -0,0 +1,1367 @@ +{ + "config_general": { + "model_name": 
"uukuguy/speechless-codellama-dolphin-orca-platypus-34b", + "model_sha": "57e18e617b4fd7ab61bd7da8ee9516513ad76842", + "model_size": "63.23 GB", + "model_dtype": "torch.float16", + "lighteval_sha": "0f318ecf002208468154899217b3ba7c6ae09374", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|arc:challenge|25": { + "acc": 0.49573378839590443, + "acc_stderr": 0.014610858923956945, + "acc_norm": 0.5247440273037542, + "acc_norm_stderr": 0.014593487694937738 + }, + "harness|hellaswag|10": { + "acc": 0.547998406691894, + "acc_stderr": 0.004966736811010487, + "acc_norm": 0.7412865962955587, + "acc_norm_stderr": 0.004370328224831782 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.31, + "acc_stderr": 0.04648231987117316, + "acc_norm": 0.31, + "acc_norm_stderr": 0.04648231987117316 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.43703703703703706, + "acc_stderr": 0.04284958639753399, + "acc_norm": 0.43703703703703706, + "acc_norm_stderr": 0.04284958639753399 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.5328947368421053, + "acc_stderr": 0.04060127035236395, + "acc_norm": 0.5328947368421053, + "acc_norm_stderr": 0.04060127035236395 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.52, + "acc_stderr": 0.050211673156867795, + "acc_norm": 0.52, + "acc_norm_stderr": 0.050211673156867795 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.4830188679245283, + "acc_stderr": 0.030755120364119905, + "acc_norm": 0.4830188679245283, + "acc_norm_stderr": 0.030755120364119905 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.5555555555555556, + "acc_stderr": 0.04155319955593146, + "acc_norm": 0.5555555555555556, + "acc_norm_stderr": 0.04155319955593146 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.43, + "acc_stderr": 0.04975698519562428, + "acc_norm": 0.43, + "acc_norm_stderr": 0.04975698519562428 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.46, + "acc_stderr": 0.05009082659620333, + "acc_norm": 0.46, + "acc_norm_stderr": 0.05009082659620333 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.28, + "acc_stderr": 0.04512608598542127, + "acc_norm": 0.28, + "acc_norm_stderr": 0.04512608598542127 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.43352601156069365, + "acc_stderr": 0.03778621079092055, + "acc_norm": 0.43352601156069365, + "acc_norm_stderr": 0.03778621079092055 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.3333333333333333, + "acc_stderr": 0.04690650298201942, + "acc_norm": 0.3333333333333333, + "acc_norm_stderr": 0.04690650298201942 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.71, + "acc_stderr": 0.045604802157206845, + "acc_norm": 0.71, + "acc_norm_stderr": 0.045604802157206845 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.46808510638297873, + "acc_stderr": 0.03261936918467381, + "acc_norm": 0.46808510638297873, + "acc_norm_stderr": 0.03261936918467381 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.4298245614035088, + "acc_stderr": 0.046570472605949625, + "acc_norm": 0.4298245614035088, + "acc_norm_stderr": 0.046570472605949625 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.496551724137931, + "acc_stderr": 0.04166567577101579, + "acc_norm": 0.496551724137931, + "acc_norm_stderr": 0.04166567577101579 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + 
"acc": 0.3835978835978836, + "acc_stderr": 0.025043757318520203, + "acc_norm": 0.3835978835978836, + "acc_norm_stderr": 0.025043757318520203 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.49206349206349204, + "acc_stderr": 0.044715725362943486, + "acc_norm": 0.49206349206349204, + "acc_norm_stderr": 0.044715725362943486 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.37, + "acc_stderr": 0.048523658709391, + "acc_norm": 0.37, + "acc_norm_stderr": 0.048523658709391 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.5806451612903226, + "acc_stderr": 0.02807158890109185, + "acc_norm": 0.5806451612903226, + "acc_norm_stderr": 0.02807158890109185 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.35467980295566504, + "acc_stderr": 0.0336612448905145, + "acc_norm": 0.35467980295566504, + "acc_norm_stderr": 0.0336612448905145 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.63, + "acc_stderr": 0.04852365870939099, + "acc_norm": 0.63, + "acc_norm_stderr": 0.04852365870939099 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.7151515151515152, + "acc_stderr": 0.03524390844511781, + "acc_norm": 0.7151515151515152, + "acc_norm_stderr": 0.03524390844511781 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.6717171717171717, + "acc_stderr": 0.033456784227567746, + "acc_norm": 0.6717171717171717, + "acc_norm_stderr": 0.033456784227567746 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.7461139896373057, + "acc_stderr": 0.03141024780565317, + "acc_norm": 0.7461139896373057, + "acc_norm_stderr": 0.03141024780565317 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.48717948717948717, + "acc_stderr": 0.02534267129380725, + "acc_norm": 0.48717948717948717, + "acc_norm_stderr": 0.02534267129380725 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.27037037037037037, + "acc_stderr": 0.02708037281514566, + "acc_norm": 0.27037037037037037, + "acc_norm_stderr": 0.02708037281514566 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.5294117647058824, + "acc_stderr": 0.03242225027115006, + "acc_norm": 0.5294117647058824, + "acc_norm_stderr": 0.03242225027115006 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.32450331125827814, + "acc_stderr": 0.038227469376587525, + "acc_norm": 0.32450331125827814, + "acc_norm_stderr": 0.038227469376587525 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.7211009174311926, + "acc_stderr": 0.0192274688764635, + "acc_norm": 0.7211009174311926, + "acc_norm_stderr": 0.0192274688764635 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.4027777777777778, + "acc_stderr": 0.033448873829978666, + "acc_norm": 0.4027777777777778, + "acc_norm_stderr": 0.033448873829978666 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.75, + "acc_stderr": 0.03039153369274154, + "acc_norm": 0.75, + "acc_norm_stderr": 0.03039153369274154 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.7552742616033755, + "acc_stderr": 0.027985699387036427, + "acc_norm": 0.7552742616033755, + "acc_norm_stderr": 0.027985699387036427 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.6053811659192825, + "acc_stderr": 0.03280400504755291, + "acc_norm": 0.6053811659192825, + "acc_norm_stderr": 0.03280400504755291 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.5190839694656488, + 
"acc_stderr": 0.04382094705550988, + "acc_norm": 0.5190839694656488, + "acc_norm_stderr": 0.04382094705550988 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.6694214876033058, + "acc_stderr": 0.04294340845212094, + "acc_norm": 0.6694214876033058, + "acc_norm_stderr": 0.04294340845212094 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.6759259259259259, + "acc_stderr": 0.045245960070300476, + "acc_norm": 0.6759259259259259, + "acc_norm_stderr": 0.045245960070300476 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.7239263803680982, + "acc_stderr": 0.035123852837050475, + "acc_norm": 0.7239263803680982, + "acc_norm_stderr": 0.035123852837050475 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.4375, + "acc_stderr": 0.04708567521880525, + "acc_norm": 0.4375, + "acc_norm_stderr": 0.04708567521880525 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.6796116504854369, + "acc_stderr": 0.04620284082280041, + "acc_norm": 0.6796116504854369, + "acc_norm_stderr": 0.04620284082280041 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.7863247863247863, + "acc_stderr": 0.026853450377009154, + "acc_norm": 0.7863247863247863, + "acc_norm_stderr": 0.026853450377009154 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.48, + "acc_stderr": 0.050211673156867795, + "acc_norm": 0.48, + "acc_norm_stderr": 0.050211673156867795 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.6666666666666666, + "acc_stderr": 0.016857391247472552, + "acc_norm": 0.6666666666666666, + "acc_norm_stderr": 0.016857391247472552 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.5578034682080925, + "acc_stderr": 0.0267386036438074, + "acc_norm": 0.5578034682080925, + "acc_norm_stderr": 0.0267386036438074 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.27150837988826815, + "acc_stderr": 0.014874252168095277, + "acc_norm": 0.27150837988826815, + "acc_norm_stderr": 0.014874252168095277 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.545751633986928, + "acc_stderr": 0.02850980780262659, + "acc_norm": 0.545751633986928, + "acc_norm_stderr": 0.02850980780262659 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.5916398713826366, + "acc_stderr": 0.027917050748484617, + "acc_norm": 0.5916398713826366, + "acc_norm_stderr": 0.027917050748484617 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.5987654320987654, + "acc_stderr": 0.0272725828498398, + "acc_norm": 0.5987654320987654, + "acc_norm_stderr": 0.0272725828498398 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.3900709219858156, + "acc_stderr": 0.02909767559946393, + "acc_norm": 0.3900709219858156, + "acc_norm_stderr": 0.02909767559946393 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.4198174706649283, + "acc_stderr": 0.012604960816087371, + "acc_norm": 0.4198174706649283, + "acc_norm_stderr": 0.012604960816087371 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.45588235294117646, + "acc_stderr": 0.030254372573976694, + "acc_norm": 0.45588235294117646, + "acc_norm_stderr": 0.030254372573976694 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.5310457516339869, + "acc_stderr": 0.020188804456361887, + "acc_norm": 0.5310457516339869, + "acc_norm_stderr": 0.020188804456361887 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.6181818181818182, + "acc_stderr": 0.046534298079135075, + "acc_norm": 0.6181818181818182, + "acc_norm_stderr": 0.046534298079135075 + }, 
+ "harness|hendrycksTest-security_studies|5": { + "acc": 0.6612244897959184, + "acc_stderr": 0.030299506562154185, + "acc_norm": 0.6612244897959184, + "acc_norm_stderr": 0.030299506562154185 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.7512437810945274, + "acc_stderr": 0.030567675938916718, + "acc_norm": 0.7512437810945274, + "acc_norm_stderr": 0.030567675938916718 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.74, + "acc_stderr": 0.044084400227680794, + "acc_norm": 0.74, + "acc_norm_stderr": 0.044084400227680794 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.39759036144578314, + "acc_stderr": 0.038099730845402184, + "acc_norm": 0.39759036144578314, + "acc_norm_stderr": 0.038099730845402184 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.6374269005847953, + "acc_stderr": 0.0368713061556206, + "acc_norm": 0.6374269005847953, + "acc_norm_stderr": 0.0368713061556206 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.30354957160342716, + "mc1_stderr": 0.016095884155386854, + "mc2": 0.47135907975593017, + "mc2_stderr": 0.014951001296424498 + }, + "all": { + "acc": 0.5342362830958945, + "acc_stderr": 0.034949132938444705, + "acc_norm": 0.538004053070666, + "acc_norm_stderr": 0.03493872989072948, + "mc1": 0.30354957160342716, + "mc1_stderr": 0.016095884155386854, + "mc2": 0.47135907975593017, + "mc2_stderr": 0.014951001296424498 + } + }, + "versions": { + "harness|arc:challenge|25": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + 
"harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "all": 0 + }, + "config_tasks": { + "harness|arc:challenge": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + 
"harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task" + }, + "summary_tasks": { + "harness|arc:challenge|25": { + "hashes": { + "hash_examples": "17b0cae357c0259e", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "3722289b79076c44", + "hash_cont_tokens": "e8abf848493b50f7" + }, + "truncated": 0, + "non-truncated": 4687, + "padded": 4687, + "non-padded": 0, + "effective_few_shots": 25.0, + "num_truncated_few_shots": 0 + }, + "harness|hellaswag|10": { + "hashes": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "ececd684171f1ef2", + "hash_cont_tokens": "9fe0a5c42e1532db" + }, + "truncated": 0, + "non-truncated": 40168, + "padded": 40113, + "non-padded": 55, + "effective_few_shots": 10.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hashes": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "c54ff61ad0273dd7", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-anatomy|5": { + "hashes": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "be31a1e22aef5f90", + "hash_cont_tokens": "f11971a765cb609f" + }, + "truncated": 0, + "non-truncated": 540, + "padded": 540, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-astronomy|5": { + "hashes": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "277a7b1fad566940", + "hash_cont_tokens": "440a970fadecdc7b" + }, + "truncated": 0, + "non-truncated": 608, + "padded": 608, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-business_ethics|5": { + "hashes": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "ba552605bc116de5", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hashes": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + 
"hash_input_tokens": "428c7563d0b98ab9", + "hash_cont_tokens": "7ecd60c25b9bfe5b" + }, + "truncated": 0, + "non-truncated": 1060, + "padded": 1060, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_biology|5": { + "hashes": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "da036601573942e2", + "hash_cont_tokens": "875cde3af7a0ee14" + }, + "truncated": 0, + "non-truncated": 576, + "padded": 576, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_chemistry|5": { + "hashes": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "94e0196d6aded13d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_computer_science|5": { + "hashes": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "6e4d0f4a8d36690b", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_mathematics|5": { + "hashes": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "614054d17109a25d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_medicine|5": { + "hashes": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "081bb2b524defd1c", + "hash_cont_tokens": "702fb6d82ff0d6ac" + }, + "truncated": 0, + "non-truncated": 692, + "padded": 692, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_physics|5": { + "hashes": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "5421d9a1af86cbd4", + "hash_cont_tokens": "f7b8097afc16a47c" + }, + "truncated": 0, + "non-truncated": 408, + "padded": 408, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-computer_security|5": { + "hashes": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "5e6b70ecb333cf18", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hashes": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "c2ef11a87264ceed", + "hash_cont_tokens": "aa0e8bc655f2f641" + }, + "truncated": 0, + "non-truncated": 940, + "padded": 940, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-econometrics|5": { + "hashes": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "ecaccd912a4c3978", + "hash_cont_tokens": "b1cc6e7e9fcd3827" + }, + "truncated": 0, + "non-truncated": 456, + "padded": 456, + 
"non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hashes": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "1590c84291399be8", + "hash_cont_tokens": "2425a3f084a591ef" + }, + "truncated": 0, + "non-truncated": 580, + "padded": 580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hashes": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "3269597f715b0da1", + "hash_cont_tokens": "bd87bf0c060fd925" + }, + "truncated": 0, + "non-truncated": 1512, + "padded": 1512, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-formal_logic|5": { + "hashes": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "a2800d20f3ab8d7c", + "hash_cont_tokens": "eb8932890e0605db" + }, + "truncated": 0, + "non-truncated": 504, + "padded": 504, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-global_facts|5": { + "hashes": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "94ed44b3772505ad", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_biology|5": { + "hashes": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "24423acb928db768", + "hash_cont_tokens": "1ddcb86d28cde266" + }, + "truncated": 0, + "non-truncated": 1240, + "padded": 1240, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hashes": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "831ff35c474e5cef", + "hash_cont_tokens": "176c8dcff38c5f8f" + }, + "truncated": 0, + "non-truncated": 812, + "padded": 812, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hashes": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "a20a96b44dcc5b30", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hashes": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "5002f4ac8b1562ca", + "hash_cont_tokens": "674fc454bdc5ac93" + }, + "truncated": 0, + "non-truncated": 660, + "padded": 656, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_geography|5": { + "hashes": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "7c5547c7da5bc793", + "hash_cont_tokens": "03a5012b916274ea" + }, + "truncated": 0, + "non-truncated": 792, + "padded": 792, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + 
"harness|hendrycksTest-high_school_government_and_politics|5": { + "hashes": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "f62991cb6a496b05", + "hash_cont_tokens": "873d2aab226ba1d8" + }, + "truncated": 0, + "non-truncated": 772, + "padded": 772, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hashes": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "4cef2aff6e3d59ed", + "hash_cont_tokens": "c583432ad27fcfe0" + }, + "truncated": 0, + "non-truncated": 1560, + "padded": 1560, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hashes": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "6e2577ea4082ed2b", + "hash_cont_tokens": "d7907b61bcb8c123" + }, + "truncated": 0, + "non-truncated": 1080, + "padded": 1080, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hashes": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "c5fc9aeb1079c8e4", + "hash_cont_tokens": "f47f041de50333b9" + }, + "truncated": 0, + "non-truncated": 952, + "padded": 952, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_physics|5": { + "hashes": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "555fc385cffa84ca", + "hash_cont_tokens": "0d56317b3e5eedb5" + }, + "truncated": 0, + "non-truncated": 604, + "padded": 604, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hashes": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "febd23cbf9973b7f", + "hash_cont_tokens": "09ba1243e7390c0f" + }, + "truncated": 0, + "non-truncated": 2180, + "padded": 2180, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hashes": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "400e55b56ee6fbd7", + "hash_cont_tokens": "9cc29889c3d3f77d" + }, + "truncated": 0, + "non-truncated": 864, + "padded": 864, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hashes": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "c639cce12a46ebad", + "hash_cont_tokens": "cdd0b3dc06d933e5" + }, + "truncated": 0, + "non-truncated": 816, + "padded": 816, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hashes": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "b9762065cce6f3a6", + "hash_cont_tokens": "e02816433ff28daf" + }, + "truncated": 0, + "non-truncated": 948, + "padded": 948, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_aging|5": { + "hashes": { + "hash_examples": 
"1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "541a75f071dcf579", + "hash_cont_tokens": "142a4a8a1138a214" + }, + "truncated": 0, + "non-truncated": 892, + "padded": 892, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_sexuality|5": { + "hashes": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "04269e5c5a257dd9", + "hash_cont_tokens": "bc54813e809b796d" + }, + "truncated": 0, + "non-truncated": 524, + "padded": 524, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-international_law|5": { + "hashes": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "d93ba9d9d38e4397", + "hash_cont_tokens": "8ea8c5ff76a15bca" + }, + "truncated": 0, + "non-truncated": 484, + "padded": 484, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-jurisprudence|5": { + "hashes": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "9eeaccd2698b4f5a", + "hash_cont_tokens": "e3a8cd951b6e3469" + }, + "truncated": 0, + "non-truncated": 432, + "padded": 432, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hashes": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "b4f08f544f2b7576", + "hash_cont_tokens": "3e9e0bdc248fd88a" + }, + "truncated": 0, + "non-truncated": 652, + "padded": 648, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-machine_learning|5": { + "hashes": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "900c2a51f1174b9f", + "hash_cont_tokens": "55b12fb138c6a064" + }, + "truncated": 0, + "non-truncated": 448, + "padded": 448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-management|5": { + "hashes": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "6b36efb4689c6eca", + "hash_cont_tokens": "a01d6d39a83c4597" + }, + "truncated": 0, + "non-truncated": 412, + "padded": 412, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-marketing|5": { + "hashes": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "2aaac78a0cfed47a", + "hash_cont_tokens": "6aeaed4d823c98aa" + }, + "truncated": 0, + "non-truncated": 936, + "padded": 936, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-medical_genetics|5": { + "hashes": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "886ca823b41c094a", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-miscellaneous|5": { + "hashes": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "72fd71de7675e7d0", + "hash_cont_tokens": "9b0ab02a64603081" + }, + "truncated": 0, + 
"non-truncated": 3132, + "padded": 3132, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_disputes|5": { + "hashes": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "f3ca0dd8e7a1eb09", + "hash_cont_tokens": "3b8bbe9108e55ce9" + }, + "truncated": 0, + "non-truncated": 1384, + "padded": 1354, + "non-padded": 30, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hashes": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "3e793631e951f23c", + "hash_cont_tokens": "3e9bfc0362e97330" + }, + "truncated": 0, + "non-truncated": 3580, + "padded": 3580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-nutrition|5": { + "hashes": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "59753c2144ea93af", + "hash_cont_tokens": "23b2dc6ee2da4cfc" + }, + "truncated": 0, + "non-truncated": 1224, + "padded": 1224, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-philosophy|5": { + "hashes": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "bd8d3dbed15a8c34", + "hash_cont_tokens": "9f6ff69d23a48783" + }, + "truncated": 0, + "non-truncated": 1244, + "padded": 1244, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-prehistory|5": { + "hashes": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "3573cd87facbb7c5", + "hash_cont_tokens": "d6458d743d875837" + }, + "truncated": 0, + "non-truncated": 1296, + "padded": 1296, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_accounting|5": { + "hashes": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "17e721bc1a7cbb47", + "hash_cont_tokens": "922a195f53a35662" + }, + "truncated": 0, + "non-truncated": 1128, + "padded": 1128, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_law|5": { + "hashes": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "c9f7583fff66d361", + "hash_cont_tokens": "2e590029ef41fbcd" + }, + "truncated": 0, + "non-truncated": 6136, + "padded": 6136, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_medicine|5": { + "hashes": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "40a933f829116f8d", + "hash_cont_tokens": "7cfee54dbddd5a98" + }, + "truncated": 0, + "non-truncated": 1088, + "padded": 1088, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_psychology|5": { + "hashes": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "0dfb73a8eb3f692c", + "hash_cont_tokens": "a86677b2a45c20e1" + }, + "truncated": 0, + "non-truncated": 2448, + "padded": 2448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + 
"harness|hendrycksTest-public_relations|5": { + "hashes": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "1710c6ba4c9f3cbd", + "hash_cont_tokens": "0d756ccaae031757" + }, + "truncated": 0, + "non-truncated": 440, + "padded": 440, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-security_studies|5": { + "hashes": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "32a03f1f22a6e103", + "hash_cont_tokens": "b2229bc2cfbf594b" + }, + "truncated": 0, + "non-truncated": 980, + "padded": 980, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-sociology|5": { + "hashes": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "828999f7624cbe7e", + "hash_cont_tokens": "c3a3bdfd177eed5b" + }, + "truncated": 0, + "non-truncated": 804, + "padded": 804, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hashes": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "42054621e718dbee", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-virology|5": { + "hashes": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "6c4f0aa4dc859c04", + "hash_cont_tokens": "af8b3658088cb37f" + }, + "truncated": 0, + "non-truncated": 664, + "padded": 664, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-world_religions|5": { + "hashes": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "6c75d44e092ff24f", + "hash_cont_tokens": "060118bef6de4e0a" + }, + "truncated": 0, + "non-truncated": 684, + "padded": 684, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|truthfulqa:mc|0": { + "hashes": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "2738d7ed7075faa7", + "hash_cont_tokens": "f5da56a132aab151" + }, + "truncated": 0, + "non-truncated": 9996, + "padded": 9996, + "non-padded": 0, + "effective_few_shots": 0.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "d84d18e9a963753d", + "hash_full_prompts": "12b540783521a8e6", + "hash_input_tokens": "5c73a7dce6ccf737", + "hash_cont_tokens": "71d56183130fecbd" + }, + "total_evaluation_time_secondes": "23941.410168886185", + "truncated": 0, + "non-truncated": 111019, + "padded": 110926, + "non-padded": 93, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/uukuguy/speechless-codellama-dolphin-orca-platypus-34b/results_2023-10-29T00-32-33.472586.json b/eval-results/uukuguy/speechless-codellama-dolphin-orca-platypus-34b/results_2023-10-29T00-32-33.472586.json new file mode 100644 index 0000000000000000000000000000000000000000..0524d29ff2f6c8a956f8e2a807b234f5245ce042 --- /dev/null +++ b/eval-results/uukuguy/speechless-codellama-dolphin-orca-platypus-34b/results_2023-10-29T00-32-33.472586.json @@ -0,0 +1,107 @@ +{ + "config_general": { + 
"model_name": "uukuguy/speechless-codellama-dolphin-orca-platypus-34b", + "model_sha": "e5b0493bfeb4353951034b9075d4b059aca7deb2", + "model_size": "63.23 GB", + "model_dtype": "torch.float16", + "lighteval_sha": "0f318ecf002208468154899217b3ba7c6ae09374", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|drop|3": { + "em": 0.37080536912751677, + "em_stderr": 0.004946581424326503, + "f1": 0.42342072147651116, + "f1_stderr": 0.004815729646559334 + }, + "harness|gsm8k|5": { + "acc": 0.1470811220621683, + "acc_stderr": 0.0097560636603599 + }, + "harness|winogrande|5": { + "acc": 0.7324388318863457, + "acc_stderr": 0.012441718456893009 + }, + "all": { + "em": 0.37080536912751677, + "em_stderr": 0.004946581424326503, + "f1": 0.42342072147651116, + "f1_stderr": 0.004815729646559334, + "acc": 0.439759976974257, + "acc_stderr": 0.011098891058626454 + } + }, + "versions": { + "harness|drop|3": 1, + "harness|gsm8k|5": 0, + "harness|winogrande|5": 0, + "all": 0 + }, + "config_tasks": { + "harness|drop": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|drop|3": { + "hashes": { + "hash_examples": "1d27416e8324e9a3", + "hash_full_prompts": "a5513ff9a741b385", + "hash_input_tokens": "fbe785c62d941897", + "hash_cont_tokens": "d57a37b8ad8ae81e" + }, + "truncated": 0, + "non-truncated": 9536, + "padded": 0, + "non-padded": 9536, + "effective_few_shots": 3.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "bda342e47b5099b2", + "hash_cont_tokens": "f4106e83caffb355" + }, + "truncated": 0, + "non-truncated": 1319, + "padded": 0, + "non-padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "c0bedf98cb040854", + "hash_cont_tokens": "f08975ad6f2d5864" + }, + "truncated": 0, + "non-truncated": 2534, + "padded": 2432, + "non-padded": 102, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "9b4d8993161e637d", + "hash_full_prompts": "08215e527b7e60a5", + "hash_input_tokens": "9f72a6bc6743d18f", + "hash_cont_tokens": "3d6d54bb73d7aeff" + }, + "total_evaluation_time_secondes": "14808.589596033096", + "truncated": 0, + "non-truncated": 13389, + "padded": 2432, + "non-padded": 10957, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/uukuguy/speechless-codellama-orca-13b/results_2023-09-04T06-25-23.128128.json b/eval-results/uukuguy/speechless-codellama-orca-13b/results_2023-09-04T06-25-23.128128.json new file mode 100644 index 0000000000000000000000000000000000000000..5bda5c003057c2d81c7589d3b5a6523ab44210bc --- /dev/null +++ b/eval-results/uukuguy/speechless-codellama-orca-13b/results_2023-09-04T06-25-23.128128.json @@ -0,0 +1,1366 @@ +{ + "config_general": { + "model_name": "uukuguy/speechless-codellama-orca-13b", + "model_sha": "a82467de3cb9438aa8f9e0ea8ea692f16a5724b2", + "model_dtype": "torch.bfloat16", + "lighteval_sha": "4b9a38c2102259daeb17d1d0294fc2b4ce7f4e63", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + 
"harness|arc:challenge|25": { + "acc": 0.4283276450511945, + "acc_stderr": 0.014460496367599017, + "acc_norm": 0.46331058020477817, + "acc_norm_stderr": 0.014572000527757001 + }, + "harness|hellaswag|10": { + "acc": 0.5055765783708425, + "acc_stderr": 0.004989471055090957, + "acc_norm": 0.6770563632742481, + "acc_norm_stderr": 0.004666457279979415 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.28, + "acc_stderr": 0.04512608598542128, + "acc_norm": 0.28, + "acc_norm_stderr": 0.04512608598542128 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.3851851851851852, + "acc_stderr": 0.042039210401562783, + "acc_norm": 0.3851851851851852, + "acc_norm_stderr": 0.042039210401562783 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.4342105263157895, + "acc_stderr": 0.040335656678483184, + "acc_norm": 0.4342105263157895, + "acc_norm_stderr": 0.040335656678483184 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.55, + "acc_stderr": 0.049999999999999996, + "acc_norm": 0.55, + "acc_norm_stderr": 0.049999999999999996 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.4528301886792453, + "acc_stderr": 0.030635627957961823, + "acc_norm": 0.4528301886792453, + "acc_norm_stderr": 0.030635627957961823 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.4305555555555556, + "acc_stderr": 0.04140685639111503, + "acc_norm": 0.4305555555555556, + "acc_norm_stderr": 0.04140685639111503 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.33, + "acc_stderr": 0.04725815626252606, + "acc_norm": 0.33, + "acc_norm_stderr": 0.04725815626252606 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.47, + "acc_stderr": 0.050161355804659205, + "acc_norm": 0.47, + "acc_norm_stderr": 0.050161355804659205 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.34, + "acc_stderr": 0.04760952285695235, + "acc_norm": 0.34, + "acc_norm_stderr": 0.04760952285695235 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.41040462427745666, + "acc_stderr": 0.03750757044895537, + "acc_norm": 0.41040462427745666, + "acc_norm_stderr": 0.03750757044895537 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.20588235294117646, + "acc_stderr": 0.04023382273617747, + "acc_norm": 0.20588235294117646, + "acc_norm_stderr": 0.04023382273617747 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.63, + "acc_stderr": 0.04852365870939099, + "acc_norm": 0.63, + "acc_norm_stderr": 0.04852365870939099 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.3829787234042553, + "acc_stderr": 0.031778212502369216, + "acc_norm": 0.3829787234042553, + "acc_norm_stderr": 0.031778212502369216 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.2543859649122807, + "acc_stderr": 0.040969851398436716, + "acc_norm": 0.2543859649122807, + "acc_norm_stderr": 0.040969851398436716 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.4827586206896552, + "acc_stderr": 0.04164188720169377, + "acc_norm": 0.4827586206896552, + "acc_norm_stderr": 0.04164188720169377 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.31746031746031744, + "acc_stderr": 0.023973861998992072, + "acc_norm": 0.31746031746031744, + "acc_norm_stderr": 0.023973861998992072 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.3333333333333333, + "acc_stderr": 0.04216370213557835, + "acc_norm": 0.3333333333333333, + "acc_norm_stderr": 0.04216370213557835 + }, + "harness|hendrycksTest-global_facts|5": { + 
"acc": 0.34, + "acc_stderr": 0.04760952285695236, + "acc_norm": 0.34, + "acc_norm_stderr": 0.04760952285695236 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.5, + "acc_stderr": 0.028444006199428714, + "acc_norm": 0.5, + "acc_norm_stderr": 0.028444006199428714 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.32019704433497537, + "acc_stderr": 0.032826493853041504, + "acc_norm": 0.32019704433497537, + "acc_norm_stderr": 0.032826493853041504 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.65, + "acc_stderr": 0.047937248544110196, + "acc_norm": 0.65, + "acc_norm_stderr": 0.047937248544110196 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.6363636363636364, + "acc_stderr": 0.03756335775187896, + "acc_norm": 0.6363636363636364, + "acc_norm_stderr": 0.03756335775187896 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.5858585858585859, + "acc_stderr": 0.03509438348879629, + "acc_norm": 0.5858585858585859, + "acc_norm_stderr": 0.03509438348879629 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.6113989637305699, + "acc_stderr": 0.03517739796373131, + "acc_norm": 0.6113989637305699, + "acc_norm_stderr": 0.03517739796373131 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.441025641025641, + "acc_stderr": 0.02517404838400076, + "acc_norm": 0.441025641025641, + "acc_norm_stderr": 0.02517404838400076 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.3, + "acc_stderr": 0.027940457136228402, + "acc_norm": 0.3, + "acc_norm_stderr": 0.027940457136228402 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.44537815126050423, + "acc_stderr": 0.0322841062671639, + "acc_norm": 0.44537815126050423, + "acc_norm_stderr": 0.0322841062671639 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.36423841059602646, + "acc_stderr": 0.03929111781242741, + "acc_norm": 0.36423841059602646, + "acc_norm_stderr": 0.03929111781242741 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.6018348623853211, + "acc_stderr": 0.02098798942265427, + "acc_norm": 0.6018348623853211, + "acc_norm_stderr": 0.02098798942265427 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.38425925925925924, + "acc_stderr": 0.03317354514310742, + "acc_norm": 0.38425925925925924, + "acc_norm_stderr": 0.03317354514310742 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.5931372549019608, + "acc_stderr": 0.03447891136353382, + "acc_norm": 0.5931372549019608, + "acc_norm_stderr": 0.03447891136353382 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.6497890295358649, + "acc_stderr": 0.031052391937584346, + "acc_norm": 0.6497890295358649, + "acc_norm_stderr": 0.031052391937584346 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.5426008968609866, + "acc_stderr": 0.03343577705583066, + "acc_norm": 0.5426008968609866, + "acc_norm_stderr": 0.03343577705583066 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.45038167938931295, + "acc_stderr": 0.04363643698524779, + "acc_norm": 0.45038167938931295, + "acc_norm_stderr": 0.04363643698524779 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.5867768595041323, + "acc_stderr": 0.04495087843548408, + "acc_norm": 0.5867768595041323, + "acc_norm_stderr": 0.04495087843548408 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.5277777777777778, + "acc_stderr": 
0.04826217294139894, + "acc_norm": 0.5277777777777778, + "acc_norm_stderr": 0.04826217294139894 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.6134969325153374, + "acc_stderr": 0.038258255488486076, + "acc_norm": 0.6134969325153374, + "acc_norm_stderr": 0.038258255488486076 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.35714285714285715, + "acc_stderr": 0.04547960999764376, + "acc_norm": 0.35714285714285715, + "acc_norm_stderr": 0.04547960999764376 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.6116504854368932, + "acc_stderr": 0.0482572933735639, + "acc_norm": 0.6116504854368932, + "acc_norm_stderr": 0.0482572933735639 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.7564102564102564, + "acc_stderr": 0.028120966503914394, + "acc_norm": 0.7564102564102564, + "acc_norm_stderr": 0.028120966503914394 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.55, + "acc_stderr": 0.05, + "acc_norm": 0.55, + "acc_norm_stderr": 0.05 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.5721583652618135, + "acc_stderr": 0.017692787927803728, + "acc_norm": 0.5721583652618135, + "acc_norm_stderr": 0.017692787927803728 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.5, + "acc_stderr": 0.026919095102908273, + "acc_norm": 0.5, + "acc_norm_stderr": 0.026919095102908273 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.30502793296089387, + "acc_stderr": 0.015398723510916716, + "acc_norm": 0.30502793296089387, + "acc_norm_stderr": 0.015398723510916716 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.43790849673202614, + "acc_stderr": 0.028408302020332694, + "acc_norm": 0.43790849673202614, + "acc_norm_stderr": 0.028408302020332694 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.5273311897106109, + "acc_stderr": 0.028355633568328167, + "acc_norm": 0.5273311897106109, + "acc_norm_stderr": 0.028355633568328167 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.4691358024691358, + "acc_stderr": 0.027767689606833932, + "acc_norm": 0.4691358024691358, + "acc_norm_stderr": 0.027767689606833932 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.3971631205673759, + "acc_stderr": 0.0291898056735871, + "acc_norm": 0.3971631205673759, + "acc_norm_stderr": 0.0291898056735871 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.34028683181225555, + "acc_stderr": 0.012101217610223787, + "acc_norm": 0.34028683181225555, + "acc_norm_stderr": 0.012101217610223787 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.3382352941176471, + "acc_stderr": 0.02873932851398358, + "acc_norm": 0.3382352941176471, + "acc_norm_stderr": 0.02873932851398358 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.40522875816993464, + "acc_stderr": 0.019861155193829163, + "acc_norm": 0.40522875816993464, + "acc_norm_stderr": 0.019861155193829163 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.5909090909090909, + "acc_stderr": 0.04709306978661896, + "acc_norm": 0.5909090909090909, + "acc_norm_stderr": 0.04709306978661896 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.5510204081632653, + "acc_stderr": 0.03184213866687579, + "acc_norm": 0.5510204081632653, + "acc_norm_stderr": 0.03184213866687579 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.6368159203980099, + "acc_stderr": 0.034005985055990146, + "acc_norm": 0.6368159203980099, + "acc_norm_stderr": 0.034005985055990146 + }, + "harness|hendrycksTest-us_foreign_policy|5": 
{ + "acc": 0.7, + "acc_stderr": 0.046056618647183814, + "acc_norm": 0.7, + "acc_norm_stderr": 0.046056618647183814 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.42168674698795183, + "acc_stderr": 0.03844453181770917, + "acc_norm": 0.42168674698795183, + "acc_norm_stderr": 0.03844453181770917 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.5964912280701754, + "acc_stderr": 0.03762738699917057, + "acc_norm": 0.5964912280701754, + "acc_norm_stderr": 0.03762738699917057 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.3157894736842105, + "mc1_stderr": 0.016272287957916923, + "mc2": 0.46661209491833394, + "mc2_stderr": 0.015159322019066763 + }, + "all": { + "acc": 0.471745887911803, + "acc_stderr": 0.035250082127143575, + "acc_norm": 0.47524525604836215, + "acc_norm_stderr": 0.03524649721841555, + "mc1": 0.3157894736842105, + "mc1_stderr": 0.016272287957916923, + "mc2": 0.46661209491833394, + "mc2_stderr": 0.015159322019066763 + } + }, + "versions": { + "harness|arc:challenge|25": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + 
"harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "all": 0 + }, + "config_tasks": { + "harness|arc:challenge": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM 
Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task" + }, + "summary_tasks": { + "harness|arc:challenge|25": { + "hashes": { + "hash_examples": "17b0cae357c0259e", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "3722289b79076c44", + "hash_cont_tokens": "8210decc6ff6f7df" + }, + "truncated": 0, + "non-truncated": 4687, + "padded": 4687, + "non-padded": 0, + "effective_few_shots": 25.0, + "num_truncated_few_shots": 0 + }, + "harness|hellaswag|10": { + "hashes": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "ececd684171f1ef2", + "hash_cont_tokens": "b3b9e9017afa63af" + }, + "truncated": 0, + "non-truncated": 40168, + "padded": 40113, + "non-padded": 55, + "effective_few_shots": 10.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hashes": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "c54ff61ad0273dd7", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-anatomy|5": { + "hashes": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "be31a1e22aef5f90", + "hash_cont_tokens": "f11971a765cb609f" + }, + "truncated": 0, + "non-truncated": 540, + "padded": 540, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-astronomy|5": { + "hashes": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "277a7b1fad566940", + "hash_cont_tokens": "bf30e5d3f48250cb" + }, + "truncated": 0, + "non-truncated": 608, + "padded": 608, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-business_ethics|5": { + "hashes": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "ba552605bc116de5", + "hash_cont_tokens": "bc1dd9b2d995eb61" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hashes": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "428c7563d0b98ab9", + "hash_cont_tokens": "890a119624b3b935" + }, + "truncated": 0, + "non-truncated": 1060, + "padded": 1060, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_biology|5": { + "hashes": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "da036601573942e2", + "hash_cont_tokens": "875cde3af7a0ee14" + }, + 
"truncated": 0, + "non-truncated": 576, + "padded": 576, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_chemistry|5": { + "hashes": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "94e0196d6aded13d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_computer_science|5": { + "hashes": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "6e4d0f4a8d36690b", + "hash_cont_tokens": "ffc0fe414cdc4a83" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_mathematics|5": { + "hashes": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "614054d17109a25d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_medicine|5": { + "hashes": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "081bb2b524defd1c", + "hash_cont_tokens": "1f88b00d41957d82" + }, + "truncated": 0, + "non-truncated": 692, + "padded": 692, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_physics|5": { + "hashes": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "5421d9a1af86cbd4", + "hash_cont_tokens": "f7b8097afc16a47c" + }, + "truncated": 0, + "non-truncated": 408, + "padded": 408, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-computer_security|5": { + "hashes": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "5e6b70ecb333cf18", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hashes": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "c2ef11a87264ceed", + "hash_cont_tokens": "aa0e8bc655f2f641" + }, + "truncated": 0, + "non-truncated": 940, + "padded": 940, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-econometrics|5": { + "hashes": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "ecaccd912a4c3978", + "hash_cont_tokens": "bfb7e3c3c88313f1" + }, + "truncated": 0, + "non-truncated": 456, + "padded": 456, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hashes": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "1590c84291399be8", + "hash_cont_tokens": "2425a3f084a591ef" + }, + "truncated": 0, + "non-truncated": 580, + "padded": 580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + 
"harness|hendrycksTest-elementary_mathematics|5": { + "hashes": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "3269597f715b0da1", + "hash_cont_tokens": "f52691aef15a407b" + }, + "truncated": 0, + "non-truncated": 1512, + "padded": 1512, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-formal_logic|5": { + "hashes": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "a2800d20f3ab8d7c", + "hash_cont_tokens": "f515d598d9c21263" + }, + "truncated": 0, + "non-truncated": 504, + "padded": 504, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-global_facts|5": { + "hashes": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "94ed44b3772505ad", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_biology|5": { + "hashes": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "24423acb928db768", + "hash_cont_tokens": "bd85a4156a3613ee" + }, + "truncated": 0, + "non-truncated": 1240, + "padded": 1240, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hashes": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "831ff35c474e5cef", + "hash_cont_tokens": "a95c97af1c14e068" + }, + "truncated": 0, + "non-truncated": 812, + "padded": 812, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hashes": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "a20a96b44dcc5b30", + "hash_cont_tokens": "8abfedef914e33c9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hashes": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "5002f4ac8b1562ca", + "hash_cont_tokens": "674fc454bdc5ac93" + }, + "truncated": 0, + "non-truncated": 660, + "padded": 656, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_geography|5": { + "hashes": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "7c5547c7da5bc793", + "hash_cont_tokens": "03a5012b916274ea" + }, + "truncated": 0, + "non-truncated": 792, + "padded": 792, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hashes": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "f62991cb6a496b05", + "hash_cont_tokens": "a83effb8f76b7d7c" + }, + "truncated": 0, + "non-truncated": 772, + "padded": 772, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hashes": { + "hash_examples": 
"59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "4cef2aff6e3d59ed", + "hash_cont_tokens": "c583432ad27fcfe0" + }, + "truncated": 0, + "non-truncated": 1560, + "padded": 1560, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hashes": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "6e2577ea4082ed2b", + "hash_cont_tokens": "24f5dc613660300b" + }, + "truncated": 0, + "non-truncated": 1080, + "padded": 1080, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hashes": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "c5fc9aeb1079c8e4", + "hash_cont_tokens": "f47f041de50333b9" + }, + "truncated": 0, + "non-truncated": 952, + "padded": 952, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_physics|5": { + "hashes": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "555fc385cffa84ca", + "hash_cont_tokens": "ba2efcd283e938cc" + }, + "truncated": 0, + "non-truncated": 604, + "padded": 604, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hashes": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "febd23cbf9973b7f", + "hash_cont_tokens": "942069cd363844d9" + }, + "truncated": 0, + "non-truncated": 2180, + "padded": 2180, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hashes": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "400e55b56ee6fbd7", + "hash_cont_tokens": "955ed42b6f7fa019" + }, + "truncated": 0, + "non-truncated": 864, + "padded": 864, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hashes": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "c639cce12a46ebad", + "hash_cont_tokens": "cdd0b3dc06d933e5" + }, + "truncated": 0, + "non-truncated": 816, + "padded": 816, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hashes": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "b9762065cce6f3a6", + "hash_cont_tokens": "9a864184946033ac" + }, + "truncated": 0, + "non-truncated": 948, + "padded": 948, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_aging|5": { + "hashes": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "541a75f071dcf579", + "hash_cont_tokens": "142a4a8a1138a214" + }, + "truncated": 0, + "non-truncated": 892, + "padded": 892, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_sexuality|5": { + "hashes": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "04269e5c5a257dd9", + 
"hash_cont_tokens": "bc54813e809b796d" + }, + "truncated": 0, + "non-truncated": 524, + "padded": 524, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-international_law|5": { + "hashes": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "d93ba9d9d38e4397", + "hash_cont_tokens": "dc45b45fcda18e5d" + }, + "truncated": 0, + "non-truncated": 484, + "padded": 484, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-jurisprudence|5": { + "hashes": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "9eeaccd2698b4f5a", + "hash_cont_tokens": "e3a8cd951b6e3469" + }, + "truncated": 0, + "non-truncated": 432, + "padded": 432, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hashes": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "b4f08f544f2b7576", + "hash_cont_tokens": "1e80dbd30f6453d5" + }, + "truncated": 0, + "non-truncated": 652, + "padded": 648, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-machine_learning|5": { + "hashes": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "900c2a51f1174b9f", + "hash_cont_tokens": "9b37da7777378ca9" + }, + "truncated": 0, + "non-truncated": 448, + "padded": 448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-management|5": { + "hashes": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "6b36efb4689c6eca", + "hash_cont_tokens": "a01d6d39a83c4597" + }, + "truncated": 0, + "non-truncated": 412, + "padded": 412, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-marketing|5": { + "hashes": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "2aaac78a0cfed47a", + "hash_cont_tokens": "6aeaed4d823c98aa" + }, + "truncated": 0, + "non-truncated": 936, + "padded": 936, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-medical_genetics|5": { + "hashes": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "886ca823b41c094a", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-miscellaneous|5": { + "hashes": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "72fd71de7675e7d0", + "hash_cont_tokens": "9b0ab02a64603081" + }, + "truncated": 0, + "non-truncated": 3132, + "padded": 3132, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_disputes|5": { + "hashes": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "f3ca0dd8e7a1eb09", + "hash_cont_tokens": "8badf768f7b0467a" + }, + "truncated": 0, + "non-truncated": 1384, + "padded": 1354, + "non-padded": 30, + "effective_few_shots": 5.0, + 
"num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hashes": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "3e793631e951f23c", + "hash_cont_tokens": "32ae620376b2bbba" + }, + "truncated": 0, + "non-truncated": 3580, + "padded": 3580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-nutrition|5": { + "hashes": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "59753c2144ea93af", + "hash_cont_tokens": "3071def75bacc404" + }, + "truncated": 0, + "non-truncated": 1224, + "padded": 1224, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-philosophy|5": { + "hashes": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "bd8d3dbed15a8c34", + "hash_cont_tokens": "9f6ff69d23a48783" + }, + "truncated": 0, + "non-truncated": 1244, + "padded": 1244, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-prehistory|5": { + "hashes": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "3573cd87facbb7c5", + "hash_cont_tokens": "de469d2b981e32a3" + }, + "truncated": 0, + "non-truncated": 1296, + "padded": 1296, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_accounting|5": { + "hashes": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "17e721bc1a7cbb47", + "hash_cont_tokens": "c46f74d2dfc7b13b" + }, + "truncated": 0, + "non-truncated": 1128, + "padded": 1128, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_law|5": { + "hashes": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "c9f7583fff66d361", + "hash_cont_tokens": "2e590029ef41fbcd" + }, + "truncated": 0, + "non-truncated": 6136, + "padded": 6136, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_medicine|5": { + "hashes": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "40a933f829116f8d", + "hash_cont_tokens": "fe35cfa9c6ca802e" + }, + "truncated": 0, + "non-truncated": 1088, + "padded": 1088, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_psychology|5": { + "hashes": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "0dfb73a8eb3f692c", + "hash_cont_tokens": "f020fbddf72c8652" + }, + "truncated": 0, + "non-truncated": 2448, + "padded": 2448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-public_relations|5": { + "hashes": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "1710c6ba4c9f3cbd", + "hash_cont_tokens": "568f585a259965c1" + }, + "truncated": 0, + "non-truncated": 440, + "padded": 440, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-security_studies|5": { + "hashes": { + "hash_examples": "62bb8197e63d60d4", + 
"hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "32a03f1f22a6e103", + "hash_cont_tokens": "cc6fd7cccd64cd5d" + }, + "truncated": 0, + "non-truncated": 980, + "padded": 980, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-sociology|5": { + "hashes": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "828999f7624cbe7e", + "hash_cont_tokens": "c3a3bdfd177eed5b" + }, + "truncated": 0, + "non-truncated": 804, + "padded": 804, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hashes": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "42054621e718dbee", + "hash_cont_tokens": "2568d0e8e36fa959" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-virology|5": { + "hashes": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "6c4f0aa4dc859c04", + "hash_cont_tokens": "926cf60b0891f374" + }, + "truncated": 0, + "non-truncated": 664, + "padded": 664, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-world_religions|5": { + "hashes": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "6c75d44e092ff24f", + "hash_cont_tokens": "c525a5de974c1ea3" + }, + "truncated": 0, + "non-truncated": 684, + "padded": 684, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|truthfulqa:mc|0": { + "hashes": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "2738d7ed7075faa7", + "hash_cont_tokens": "c014154380b74b9e" + }, + "truncated": 0, + "non-truncated": 9996, + "padded": 9996, + "non-padded": 0, + "effective_few_shots": 0.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "d84d18e9a963753d", + "hash_full_prompts": "12b540783521a8e6", + "hash_input_tokens": "5c73a7dce6ccf737", + "hash_cont_tokens": "fb1646e2bdd5fc38" + }, + "total_evaluation_time_secondes": "6803.140939474106", + "truncated": 0, + "non-truncated": 111019, + "padded": 110926, + "non-padded": 93, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/uukuguy/speechless-codellama-orca-13b/results_2023-09-12T14-20-48.062177.json b/eval-results/uukuguy/speechless-codellama-orca-13b/results_2023-09-12T14-20-48.062177.json new file mode 100644 index 0000000000000000000000000000000000000000..e543fb36ff0734f869a77ff823419f4b844eb62c --- /dev/null +++ b/eval-results/uukuguy/speechless-codellama-orca-13b/results_2023-09-12T14-20-48.062177.json @@ -0,0 +1,1367 @@ +{ + "config_general": { + "model_name": "uukuguy/speechless-codellama-orca-13b", + "model_sha": "6fdfeabe817235df3d560a6e6465c3722bc3a4ba", + "model_size": "24.56 GB", + "model_dtype": "torch.float16", + "lighteval_sha": "ff467795ccc45b291b69333c263d5f16abd1fcd9", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|arc:challenge|25": { + "acc": 0.4104095563139932, + "acc_stderr": 0.014374922192642666, + "acc_norm": 0.44368600682593856, + "acc_norm_stderr": 
0.01451842182567045 + }, + "harness|hellaswag|10": { + "acc": 0.4801832304321848, + "acc_stderr": 0.0049858608534276315, + "acc_norm": 0.6519617606054571, + "acc_norm_stderr": 0.004753746951620158 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.28, + "acc_stderr": 0.045126085985421276, + "acc_norm": 0.28, + "acc_norm_stderr": 0.045126085985421276 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.362962962962963, + "acc_stderr": 0.041539484047424, + "acc_norm": 0.362962962962963, + "acc_norm_stderr": 0.041539484047424 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.40131578947368424, + "acc_stderr": 0.039889037033362836, + "acc_norm": 0.40131578947368424, + "acc_norm_stderr": 0.039889037033362836 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.45, + "acc_stderr": 0.05, + "acc_norm": 0.45, + "acc_norm_stderr": 0.05 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.4, + "acc_stderr": 0.03015113445777629, + "acc_norm": 0.4, + "acc_norm_stderr": 0.03015113445777629 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.4166666666666667, + "acc_stderr": 0.04122728707651282, + "acc_norm": 0.4166666666666667, + "acc_norm_stderr": 0.04122728707651282 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.27, + "acc_stderr": 0.04461960433384741, + "acc_norm": 0.27, + "acc_norm_stderr": 0.04461960433384741 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.43, + "acc_stderr": 0.04975698519562428, + "acc_norm": 0.43, + "acc_norm_stderr": 0.04975698519562428 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.33, + "acc_stderr": 0.047258156262526045, + "acc_norm": 0.33, + "acc_norm_stderr": 0.047258156262526045 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.35260115606936415, + "acc_stderr": 0.03643037168958548, + "acc_norm": 0.35260115606936415, + "acc_norm_stderr": 0.03643037168958548 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.22549019607843138, + "acc_stderr": 0.04158307533083286, + "acc_norm": 0.22549019607843138, + "acc_norm_stderr": 0.04158307533083286 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.57, + "acc_stderr": 0.04975698519562428, + "acc_norm": 0.57, + "acc_norm_stderr": 0.04975698519562428 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.3872340425531915, + "acc_stderr": 0.03184389265339526, + "acc_norm": 0.3872340425531915, + "acc_norm_stderr": 0.03184389265339526 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.2982456140350877, + "acc_stderr": 0.04303684033537313, + "acc_norm": 0.2982456140350877, + "acc_norm_stderr": 0.04303684033537313 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.42758620689655175, + "acc_stderr": 0.04122737111370332, + "acc_norm": 0.42758620689655175, + "acc_norm_stderr": 0.04122737111370332 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.2830687830687831, + "acc_stderr": 0.023201392938194978, + "acc_norm": 0.2830687830687831, + "acc_norm_stderr": 0.023201392938194978 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.38095238095238093, + "acc_stderr": 0.04343525428949098, + "acc_norm": 0.38095238095238093, + "acc_norm_stderr": 0.04343525428949098 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.32, + "acc_stderr": 0.046882617226215034, + "acc_norm": 0.32, + "acc_norm_stderr": 0.046882617226215034 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.4032258064516129, + "acc_stderr": 
0.027906150826041143, + "acc_norm": 0.4032258064516129, + "acc_norm_stderr": 0.027906150826041143 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.30049261083743845, + "acc_stderr": 0.03225799476233483, + "acc_norm": 0.30049261083743845, + "acc_norm_stderr": 0.03225799476233483 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.59, + "acc_stderr": 0.04943110704237102, + "acc_norm": 0.59, + "acc_norm_stderr": 0.04943110704237102 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.5818181818181818, + "acc_stderr": 0.03851716319398393, + "acc_norm": 0.5818181818181818, + "acc_norm_stderr": 0.03851716319398393 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.5454545454545454, + "acc_stderr": 0.03547601494006937, + "acc_norm": 0.5454545454545454, + "acc_norm_stderr": 0.03547601494006937 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.5544041450777202, + "acc_stderr": 0.03587014986075659, + "acc_norm": 0.5544041450777202, + "acc_norm_stderr": 0.03587014986075659 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.3230769230769231, + "acc_stderr": 0.023710888501970562, + "acc_norm": 0.3230769230769231, + "acc_norm_stderr": 0.023710888501970562 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.25555555555555554, + "acc_stderr": 0.026593939101844082, + "acc_norm": 0.25555555555555554, + "acc_norm_stderr": 0.026593939101844082 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.40336134453781514, + "acc_stderr": 0.031866081214088314, + "acc_norm": 0.40336134453781514, + "acc_norm_stderr": 0.031866081214088314 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.31125827814569534, + "acc_stderr": 0.03780445850526733, + "acc_norm": 0.31125827814569534, + "acc_norm_stderr": 0.03780445850526733 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.5045871559633027, + "acc_stderr": 0.02143642095552942, + "acc_norm": 0.5045871559633027, + "acc_norm_stderr": 0.02143642095552942 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.33796296296296297, + "acc_stderr": 0.03225941352631295, + "acc_norm": 0.33796296296296297, + "acc_norm_stderr": 0.03225941352631295 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.6029411764705882, + "acc_stderr": 0.03434131164719129, + "acc_norm": 0.6029411764705882, + "acc_norm_stderr": 0.03434131164719129 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.6118143459915611, + "acc_stderr": 0.03172295004332328, + "acc_norm": 0.6118143459915611, + "acc_norm_stderr": 0.03172295004332328 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.5201793721973094, + "acc_stderr": 0.033530461674123, + "acc_norm": 0.5201793721973094, + "acc_norm_stderr": 0.033530461674123 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.45038167938931295, + "acc_stderr": 0.04363643698524779, + "acc_norm": 0.45038167938931295, + "acc_norm_stderr": 0.04363643698524779 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.6115702479338843, + "acc_stderr": 0.044492703500683836, + "acc_norm": 0.6115702479338843, + "acc_norm_stderr": 0.044492703500683836 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.5370370370370371, + "acc_stderr": 0.04820403072760627, + "acc_norm": 0.5370370370370371, + "acc_norm_stderr": 0.04820403072760627 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 
0.4723926380368098, + "acc_stderr": 0.039223782906109894, + "acc_norm": 0.4723926380368098, + "acc_norm_stderr": 0.039223782906109894 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.39285714285714285, + "acc_stderr": 0.04635550135609976, + "acc_norm": 0.39285714285714285, + "acc_norm_stderr": 0.04635550135609976 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.5825242718446602, + "acc_stderr": 0.048828405482122375, + "acc_norm": 0.5825242718446602, + "acc_norm_stderr": 0.048828405482122375 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.7264957264957265, + "acc_stderr": 0.029202540153431183, + "acc_norm": 0.7264957264957265, + "acc_norm_stderr": 0.029202540153431183 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.49, + "acc_stderr": 0.050241839379569095, + "acc_norm": 0.49, + "acc_norm_stderr": 0.050241839379569095 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.5006385696040868, + "acc_stderr": 0.017879948914431676, + "acc_norm": 0.5006385696040868, + "acc_norm_stderr": 0.017879948914431676 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.47398843930635837, + "acc_stderr": 0.02688264343402289, + "acc_norm": 0.47398843930635837, + "acc_norm_stderr": 0.02688264343402289 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.30837988826815643, + "acc_stderr": 0.01544571691099888, + "acc_norm": 0.30837988826815643, + "acc_norm_stderr": 0.01544571691099888 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.35294117647058826, + "acc_stderr": 0.027363593284684937, + "acc_norm": 0.35294117647058826, + "acc_norm_stderr": 0.027363593284684937 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.5048231511254019, + "acc_stderr": 0.028396770444111298, + "acc_norm": 0.5048231511254019, + "acc_norm_stderr": 0.028396770444111298 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.43209876543209874, + "acc_stderr": 0.02756301097160668, + "acc_norm": 0.43209876543209874, + "acc_norm_stderr": 0.02756301097160668 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.3404255319148936, + "acc_stderr": 0.02826765748265014, + "acc_norm": 0.3404255319148936, + "acc_norm_stderr": 0.02826765748265014 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.3272490221642764, + "acc_stderr": 0.011983819806464742, + "acc_norm": 0.3272490221642764, + "acc_norm_stderr": 0.011983819806464742 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.2867647058823529, + "acc_stderr": 0.02747227447323382, + "acc_norm": 0.2867647058823529, + "acc_norm_stderr": 0.02747227447323382 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.3937908496732026, + "acc_stderr": 0.01976621199107307, + "acc_norm": 0.3937908496732026, + "acc_norm_stderr": 0.01976621199107307 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.5909090909090909, + "acc_stderr": 0.04709306978661896, + "acc_norm": 0.5909090909090909, + "acc_norm_stderr": 0.04709306978661896 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.5591836734693878, + "acc_stderr": 0.03178419114175363, + "acc_norm": 0.5591836734693878, + "acc_norm_stderr": 0.03178419114175363 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.5422885572139303, + "acc_stderr": 0.035228658640995975, + "acc_norm": 0.5422885572139303, + "acc_norm_stderr": 0.035228658640995975 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.59, + "acc_stderr": 0.04943110704237101, + "acc_norm": 0.59, + "acc_norm_stderr": 
0.04943110704237101 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.3855421686746988, + "acc_stderr": 0.03789134424611551, + "acc_norm": 0.3855421686746988, + "acc_norm_stderr": 0.03789134424611551 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.4853801169590643, + "acc_stderr": 0.038331852752130205, + "acc_norm": 0.4853801169590643, + "acc_norm_stderr": 0.038331852752130205 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.2937576499388005, + "mc1_stderr": 0.015945068581236614, + "mc2": 0.4593961495535578, + "mc2_stderr": 0.015067081123713475 + }, + "all": { + "acc": 0.4349239227238415, + "acc_stderr": 0.0352545419630563, + "acc_norm": 0.43839943087104855, + "acc_norm_stderr": 0.03525304002629732, + "mc1": 0.2937576499388005, + "mc1_stderr": 0.015945068581236614, + "mc2": 0.4593961495535578, + "mc2_stderr": 0.015067081123713475 + } + }, + "versions": { + "harness|arc:challenge|25": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + 
"harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "all": 0 + }, + "config_tasks": { + "harness|arc:challenge": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + 
"harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task" + }, + "summary_tasks": { + "harness|arc:challenge|25": { + "hashes": { + "hash_examples": "17b0cae357c0259e", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "3722289b79076c44", + "hash_cont_tokens": "e8abf848493b50f7" + }, + "truncated": 0, + "non-truncated": 4687, + "padded": 4687, + "non-padded": 0, + "effective_few_shots": 25.0, + "num_truncated_few_shots": 0 + }, + "harness|hellaswag|10": { + "hashes": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "ececd684171f1ef2", + "hash_cont_tokens": "9fe0a5c42e1532db" + }, + "truncated": 0, + "non-truncated": 40168, + "padded": 40113, + "non-padded": 55, + "effective_few_shots": 10.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hashes": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "c54ff61ad0273dd7", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-anatomy|5": { + "hashes": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "be31a1e22aef5f90", + "hash_cont_tokens": "f11971a765cb609f" + }, + "truncated": 0, + "non-truncated": 540, + "padded": 540, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-astronomy|5": { + "hashes": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "277a7b1fad566940", + "hash_cont_tokens": "440a970fadecdc7b" + }, + "truncated": 0, + "non-truncated": 608, + "padded": 608, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-business_ethics|5": { + "hashes": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "ba552605bc116de5", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hashes": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "428c7563d0b98ab9", + "hash_cont_tokens": "7ecd60c25b9bfe5b" + }, + "truncated": 0, + "non-truncated": 1060, + "padded": 1060, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_biology|5": { + "hashes": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "da036601573942e2", + "hash_cont_tokens": "875cde3af7a0ee14" + }, + "truncated": 0, + "non-truncated": 576, + "padded": 576, + "non-padded": 0, + 
"effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_chemistry|5": { + "hashes": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "94e0196d6aded13d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_computer_science|5": { + "hashes": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "6e4d0f4a8d36690b", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_mathematics|5": { + "hashes": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "614054d17109a25d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_medicine|5": { + "hashes": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "081bb2b524defd1c", + "hash_cont_tokens": "702fb6d82ff0d6ac" + }, + "truncated": 0, + "non-truncated": 692, + "padded": 692, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_physics|5": { + "hashes": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "5421d9a1af86cbd4", + "hash_cont_tokens": "f7b8097afc16a47c" + }, + "truncated": 0, + "non-truncated": 408, + "padded": 408, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-computer_security|5": { + "hashes": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "5e6b70ecb333cf18", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hashes": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "c2ef11a87264ceed", + "hash_cont_tokens": "aa0e8bc655f2f641" + }, + "truncated": 0, + "non-truncated": 940, + "padded": 940, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-econometrics|5": { + "hashes": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "ecaccd912a4c3978", + "hash_cont_tokens": "b1cc6e7e9fcd3827" + }, + "truncated": 0, + "non-truncated": 456, + "padded": 456, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hashes": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "1590c84291399be8", + "hash_cont_tokens": "2425a3f084a591ef" + }, + "truncated": 0, + "non-truncated": 580, + "padded": 580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hashes": { + 
"hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "3269597f715b0da1", + "hash_cont_tokens": "bd87bf0c060fd925" + }, + "truncated": 0, + "non-truncated": 1512, + "padded": 1512, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-formal_logic|5": { + "hashes": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "a2800d20f3ab8d7c", + "hash_cont_tokens": "eb8932890e0605db" + }, + "truncated": 0, + "non-truncated": 504, + "padded": 504, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-global_facts|5": { + "hashes": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "94ed44b3772505ad", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_biology|5": { + "hashes": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "24423acb928db768", + "hash_cont_tokens": "1ddcb86d28cde266" + }, + "truncated": 0, + "non-truncated": 1240, + "padded": 1240, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hashes": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "831ff35c474e5cef", + "hash_cont_tokens": "176c8dcff38c5f8f" + }, + "truncated": 0, + "non-truncated": 812, + "padded": 812, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hashes": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "a20a96b44dcc5b30", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hashes": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "5002f4ac8b1562ca", + "hash_cont_tokens": "674fc454bdc5ac93" + }, + "truncated": 0, + "non-truncated": 660, + "padded": 656, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_geography|5": { + "hashes": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "7c5547c7da5bc793", + "hash_cont_tokens": "03a5012b916274ea" + }, + "truncated": 0, + "non-truncated": 792, + "padded": 792, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hashes": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "f62991cb6a496b05", + "hash_cont_tokens": "873d2aab226ba1d8" + }, + "truncated": 0, + "non-truncated": 772, + "padded": 772, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hashes": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + 
"hash_input_tokens": "4cef2aff6e3d59ed", + "hash_cont_tokens": "c583432ad27fcfe0" + }, + "truncated": 0, + "non-truncated": 1560, + "padded": 1560, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hashes": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "6e2577ea4082ed2b", + "hash_cont_tokens": "d7907b61bcb8c123" + }, + "truncated": 0, + "non-truncated": 1080, + "padded": 1080, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hashes": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "c5fc9aeb1079c8e4", + "hash_cont_tokens": "f47f041de50333b9" + }, + "truncated": 0, + "non-truncated": 952, + "padded": 952, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_physics|5": { + "hashes": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "555fc385cffa84ca", + "hash_cont_tokens": "0d56317b3e5eedb5" + }, + "truncated": 0, + "non-truncated": 604, + "padded": 604, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hashes": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "febd23cbf9973b7f", + "hash_cont_tokens": "09ba1243e7390c0f" + }, + "truncated": 0, + "non-truncated": 2180, + "padded": 2180, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hashes": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "400e55b56ee6fbd7", + "hash_cont_tokens": "9cc29889c3d3f77d" + }, + "truncated": 0, + "non-truncated": 864, + "padded": 864, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hashes": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "c639cce12a46ebad", + "hash_cont_tokens": "cdd0b3dc06d933e5" + }, + "truncated": 0, + "non-truncated": 816, + "padded": 816, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hashes": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "b9762065cce6f3a6", + "hash_cont_tokens": "e02816433ff28daf" + }, + "truncated": 0, + "non-truncated": 948, + "padded": 948, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_aging|5": { + "hashes": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "541a75f071dcf579", + "hash_cont_tokens": "142a4a8a1138a214" + }, + "truncated": 0, + "non-truncated": 892, + "padded": 892, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_sexuality|5": { + "hashes": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "04269e5c5a257dd9", + "hash_cont_tokens": "bc54813e809b796d" + }, + "truncated": 0, + 
"non-truncated": 524, + "padded": 524, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-international_law|5": { + "hashes": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "d93ba9d9d38e4397", + "hash_cont_tokens": "8ea8c5ff76a15bca" + }, + "truncated": 0, + "non-truncated": 484, + "padded": 484, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-jurisprudence|5": { + "hashes": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "9eeaccd2698b4f5a", + "hash_cont_tokens": "e3a8cd951b6e3469" + }, + "truncated": 0, + "non-truncated": 432, + "padded": 432, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hashes": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "b4f08f544f2b7576", + "hash_cont_tokens": "3e9e0bdc248fd88a" + }, + "truncated": 0, + "non-truncated": 652, + "padded": 648, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-machine_learning|5": { + "hashes": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "900c2a51f1174b9f", + "hash_cont_tokens": "55b12fb138c6a064" + }, + "truncated": 0, + "non-truncated": 448, + "padded": 448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-management|5": { + "hashes": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "6b36efb4689c6eca", + "hash_cont_tokens": "a01d6d39a83c4597" + }, + "truncated": 0, + "non-truncated": 412, + "padded": 412, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-marketing|5": { + "hashes": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "2aaac78a0cfed47a", + "hash_cont_tokens": "6aeaed4d823c98aa" + }, + "truncated": 0, + "non-truncated": 936, + "padded": 936, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-medical_genetics|5": { + "hashes": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "886ca823b41c094a", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-miscellaneous|5": { + "hashes": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "72fd71de7675e7d0", + "hash_cont_tokens": "9b0ab02a64603081" + }, + "truncated": 0, + "non-truncated": 3132, + "padded": 3132, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_disputes|5": { + "hashes": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "f3ca0dd8e7a1eb09", + "hash_cont_tokens": "3b8bbe9108e55ce9" + }, + "truncated": 0, + "non-truncated": 1384, + "padded": 1354, + "non-padded": 30, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_scenarios|5": { + 
"hashes": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "3e793631e951f23c", + "hash_cont_tokens": "3e9bfc0362e97330" + }, + "truncated": 0, + "non-truncated": 3580, + "padded": 3580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-nutrition|5": { + "hashes": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "59753c2144ea93af", + "hash_cont_tokens": "23b2dc6ee2da4cfc" + }, + "truncated": 0, + "non-truncated": 1224, + "padded": 1224, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-philosophy|5": { + "hashes": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "bd8d3dbed15a8c34", + "hash_cont_tokens": "9f6ff69d23a48783" + }, + "truncated": 0, + "non-truncated": 1244, + "padded": 1244, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-prehistory|5": { + "hashes": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "3573cd87facbb7c5", + "hash_cont_tokens": "d6458d743d875837" + }, + "truncated": 0, + "non-truncated": 1296, + "padded": 1296, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_accounting|5": { + "hashes": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "17e721bc1a7cbb47", + "hash_cont_tokens": "922a195f53a35662" + }, + "truncated": 0, + "non-truncated": 1128, + "padded": 1128, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_law|5": { + "hashes": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "c9f7583fff66d361", + "hash_cont_tokens": "2e590029ef41fbcd" + }, + "truncated": 0, + "non-truncated": 6136, + "padded": 6136, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_medicine|5": { + "hashes": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "40a933f829116f8d", + "hash_cont_tokens": "7cfee54dbddd5a98" + }, + "truncated": 0, + "non-truncated": 1088, + "padded": 1088, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_psychology|5": { + "hashes": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "0dfb73a8eb3f692c", + "hash_cont_tokens": "a86677b2a45c20e1" + }, + "truncated": 0, + "non-truncated": 2448, + "padded": 2448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-public_relations|5": { + "hashes": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "1710c6ba4c9f3cbd", + "hash_cont_tokens": "0d756ccaae031757" + }, + "truncated": 0, + "non-truncated": 440, + "padded": 440, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-security_studies|5": { + "hashes": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "32a03f1f22a6e103", + 
"hash_cont_tokens": "b2229bc2cfbf594b" + }, + "truncated": 0, + "non-truncated": 980, + "padded": 980, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-sociology|5": { + "hashes": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "828999f7624cbe7e", + "hash_cont_tokens": "c3a3bdfd177eed5b" + }, + "truncated": 0, + "non-truncated": 804, + "padded": 804, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hashes": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "42054621e718dbee", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-virology|5": { + "hashes": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "6c4f0aa4dc859c04", + "hash_cont_tokens": "af8b3658088cb37f" + }, + "truncated": 0, + "non-truncated": 664, + "padded": 664, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-world_religions|5": { + "hashes": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "6c75d44e092ff24f", + "hash_cont_tokens": "060118bef6de4e0a" + }, + "truncated": 0, + "non-truncated": 684, + "padded": 684, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|truthfulqa:mc|0": { + "hashes": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "2738d7ed7075faa7", + "hash_cont_tokens": "f5da56a132aab151" + }, + "truncated": 0, + "non-truncated": 9996, + "padded": 9996, + "non-padded": 0, + "effective_few_shots": 0.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "d84d18e9a963753d", + "hash_full_prompts": "12b540783521a8e6", + "hash_input_tokens": "5c73a7dce6ccf737", + "hash_cont_tokens": "71d56183130fecbd" + }, + "total_evaluation_time_secondes": "6353.465031385422", + "truncated": 0, + "non-truncated": 111019, + "padded": 110926, + "non-padded": 93, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/uukuguy/speechless-codellama-orca-13b/results_2023-10-17T18-24-08.012097.json b/eval-results/uukuguy/speechless-codellama-orca-13b/results_2023-10-17T18-24-08.012097.json new file mode 100644 index 0000000000000000000000000000000000000000..2086b99e6280c3475e3f48ce4f750def2125dddf --- /dev/null +++ b/eval-results/uukuguy/speechless-codellama-orca-13b/results_2023-10-17T18-24-08.012097.json @@ -0,0 +1,107 @@ +{ + "config_general": { + "model_name": "uukuguy/speechless-codellama-orca-13b", + "model_sha": "a53c8d4f55bbf7b3ec1eb0a8c37527fde3676d98", + "model_size": "24.56 GB", + "model_dtype": "torch.bfloat16", + "lighteval_sha": "0f318ecf002208468154899217b3ba7c6ae09374", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|drop|3": { + "em": 0.2717072147651007, + "em_stderr": 0.004555575414025805, + "f1": 0.3334070889261759, + "f1_stderr": 0.004535946388945885 + }, + "harness|gsm8k|5": { + "acc": 0.05989385898407885, + "acc_stderr": 0.006536148151288703 + 
}, + "harness|winogrande|5": { + "acc": 0.6377269139700079, + "acc_stderr": 0.013508855476252517 + }, + "all": { + "em": 0.2717072147651007, + "em_stderr": 0.004555575414025805, + "f1": 0.3334070889261759, + "f1_stderr": 0.004535946388945885, + "acc": 0.3488103864770434, + "acc_stderr": 0.01002250181377061 + } + }, + "versions": { + "harness|drop|3": 1, + "harness|gsm8k|5": 0, + "harness|winogrande|5": 0, + "all": 0 + }, + "config_tasks": { + "harness|drop": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|drop|3": { + "hashes": { + "hash_examples": "1d27416e8324e9a3", + "hash_full_prompts": "a5513ff9a741b385", + "hash_input_tokens": "fbe785c62d941897", + "hash_cont_tokens": "22db96dc61901cba" + }, + "truncated": 0, + "non-truncated": 9536, + "padded": 0, + "non-padded": 9536, + "effective_few_shots": 3.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "bda342e47b5099b2", + "hash_cont_tokens": "e8f69221f14a5433" + }, + "truncated": 0, + "non-truncated": 1319, + "padded": 0, + "non-padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "c0bedf98cb040854", + "hash_cont_tokens": "f08975ad6f2d5864" + }, + "truncated": 0, + "non-truncated": 2534, + "padded": 2432, + "non-padded": 102, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "9b4d8993161e637d", + "hash_full_prompts": "08215e527b7e60a5", + "hash_input_tokens": "9f72a6bc6743d18f", + "hash_cont_tokens": "be6b04ac28e0d0d3" + }, + "total_evaluation_time_secondes": "7539.503427505493", + "truncated": 0, + "non-truncated": 13389, + "padded": 2432, + "non-padded": 10957, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/uukuguy/speechless-codellama-orca-13b/results_2023-10-23T18-58-19.504304.json b/eval-results/uukuguy/speechless-codellama-orca-13b/results_2023-10-23T18-58-19.504304.json new file mode 100644 index 0000000000000000000000000000000000000000..9e21796e7a9f4374f03c6b7b238ccd12b2d7fdea --- /dev/null +++ b/eval-results/uukuguy/speechless-codellama-orca-13b/results_2023-10-23T18-58-19.504304.json @@ -0,0 +1,107 @@ +{ + "config_general": { + "model_name": "uukuguy/speechless-codellama-orca-13b", + "model_sha": "a53c8d4f55bbf7b3ec1eb0a8c37527fde3676d98", + "model_size": "24.56 GB", + "model_dtype": "torch.float16", + "lighteval_sha": "0f318ecf002208468154899217b3ba7c6ae09374", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|drop|3": { + "em": 0.2686661073825503, + "em_stderr": 0.004539457381903774, + "f1": 0.3305505453020149, + "f1_stderr": 0.00452265523617686 + }, + "harness|gsm8k|5": { + "acc": 0.05989385898407885, + "acc_stderr": 0.006536148151288716 + }, + "harness|winogrande|5": { + "acc": 0.6400947119179163, + "acc_stderr": 0.013489609590266799 + }, + "all": { + "em": 0.2686661073825503, + "em_stderr": 0.004539457381903774, + "f1": 0.3305505453020149, + "f1_stderr": 0.00452265523617686, + "acc": 0.3499942854509976, + "acc_stderr": 0.010012878870777758 + } + }, + "versions": { + "harness|drop|3": 1, + "harness|gsm8k|5": 0, + 
"harness|winogrande|5": 0, + "all": 0 + }, + "config_tasks": { + "harness|drop": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|drop|3": { + "hashes": { + "hash_examples": "1d27416e8324e9a3", + "hash_full_prompts": "a5513ff9a741b385", + "hash_input_tokens": "fbe785c62d941897", + "hash_cont_tokens": "1f60c6bcf34804af" + }, + "truncated": 0, + "non-truncated": 9536, + "padded": 0, + "non-padded": 9536, + "effective_few_shots": 3.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "bda342e47b5099b2", + "hash_cont_tokens": "4af52b38f7e0ed87" + }, + "truncated": 0, + "non-truncated": 1319, + "padded": 0, + "non-padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "c0bedf98cb040854", + "hash_cont_tokens": "f08975ad6f2d5864" + }, + "truncated": 0, + "non-truncated": 2534, + "padded": 2432, + "non-padded": 102, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "9b4d8993161e637d", + "hash_full_prompts": "08215e527b7e60a5", + "hash_input_tokens": "9f72a6bc6743d18f", + "hash_cont_tokens": "db733440caf2fba2" + }, + "total_evaluation_time_secondes": "7567.254971027374", + "truncated": 0, + "non-truncated": 13389, + "padded": 2432, + "non-padded": 10957, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/uukuguy/speechless-codellama-orca-airoboros-13b-0.10e/results_2023-09-05T03-40-07.595318.json b/eval-results/uukuguy/speechless-codellama-orca-airoboros-13b-0.10e/results_2023-09-05T03-40-07.595318.json new file mode 100644 index 0000000000000000000000000000000000000000..42b50e1b075d369bccc293f360f968cb5620158e --- /dev/null +++ b/eval-results/uukuguy/speechless-codellama-orca-airoboros-13b-0.10e/results_2023-09-05T03-40-07.595318.json @@ -0,0 +1,1366 @@ +{ + "config_general": { + "model_name": "uukuguy/speechless-codellama-orca-airoboros-13b-0.10e", + "model_sha": "dbd1d1f7ad7b6b359f8246141650b25ca0bb8cbb", + "model_dtype": "torch.bfloat16", + "lighteval_sha": "9f7699e1a44b5b4d7bd4f326b57a34db83b67c3f", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|arc:challenge|25": { + "acc": 0.24573378839590443, + "acc_stderr": 0.012581033453730111, + "acc_norm": 0.29266211604095566, + "acc_norm_stderr": 0.013295916103619411 + }, + "harness|hellaswag|10": { + "acc": 0.2577175861382195, + "acc_stderr": 0.004364838000335621, + "acc_norm": 0.2574188408683529, + "acc_norm_stderr": 0.0043631851720471754 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.16, + "acc_stderr": 0.0368452949177471, + "acc_norm": 0.16, + "acc_norm_stderr": 0.0368452949177471 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.2518518518518518, + "acc_stderr": 0.037498507091740206, + "acc_norm": 0.2518518518518518, + "acc_norm_stderr": 0.037498507091740206 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.18421052631578946, + "acc_stderr": 0.0315469804508223, + "acc_norm": 0.18421052631578946, + "acc_norm_stderr": 0.0315469804508223 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.21, + "acc_stderr": 
0.04093601807403325, + "acc_norm": 0.21, + "acc_norm_stderr": 0.04093601807403325 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.2679245283018868, + "acc_stderr": 0.027257260322494845, + "acc_norm": 0.2679245283018868, + "acc_norm_stderr": 0.027257260322494845 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.2222222222222222, + "acc_stderr": 0.03476590104304134, + "acc_norm": 0.2222222222222222, + "acc_norm_stderr": 0.03476590104304134 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.21, + "acc_stderr": 0.040936018074033256, + "acc_norm": 0.21, + "acc_norm_stderr": 0.040936018074033256 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.27, + "acc_stderr": 0.044619604333847394, + "acc_norm": 0.27, + "acc_norm_stderr": 0.044619604333847394 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.23, + "acc_stderr": 0.04229525846816506, + "acc_norm": 0.23, + "acc_norm_stderr": 0.04229525846816506 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.2138728323699422, + "acc_stderr": 0.03126511206173042, + "acc_norm": 0.2138728323699422, + "acc_norm_stderr": 0.03126511206173042 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.19607843137254902, + "acc_stderr": 0.03950581861179961, + "acc_norm": 0.19607843137254902, + "acc_norm_stderr": 0.03950581861179961 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.24, + "acc_stderr": 0.042923469599092816, + "acc_norm": 0.24, + "acc_norm_stderr": 0.042923469599092816 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.32340425531914896, + "acc_stderr": 0.030579442773610334, + "acc_norm": 0.32340425531914896, + "acc_norm_stderr": 0.030579442773610334 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.2807017543859649, + "acc_stderr": 0.04227054451232199, + "acc_norm": 0.2807017543859649, + "acc_norm_stderr": 0.04227054451232199 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.2413793103448276, + "acc_stderr": 0.03565998174135302, + "acc_norm": 0.2413793103448276, + "acc_norm_stderr": 0.03565998174135302 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.2566137566137566, + "acc_stderr": 0.022494510767503154, + "acc_norm": 0.2566137566137566, + "acc_norm_stderr": 0.022494510767503154 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.31746031746031744, + "acc_stderr": 0.04163453031302859, + "acc_norm": 0.31746031746031744, + "acc_norm_stderr": 0.04163453031302859 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.33, + "acc_stderr": 0.04725815626252604, + "acc_norm": 0.33, + "acc_norm_stderr": 0.04725815626252604 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.25483870967741934, + "acc_stderr": 0.024790118459332208, + "acc_norm": 0.25483870967741934, + "acc_norm_stderr": 0.024790118459332208 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.31527093596059114, + "acc_stderr": 0.032690808719701876, + "acc_norm": 0.31527093596059114, + "acc_norm_stderr": 0.032690808719701876 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.19, + "acc_stderr": 0.039427724440366234, + "acc_norm": 0.19, + "acc_norm_stderr": 0.039427724440366234 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.2545454545454545, + "acc_stderr": 0.03401506715249039, + "acc_norm": 0.2545454545454545, + "acc_norm_stderr": 0.03401506715249039 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 
0.3484848484848485, + "acc_stderr": 0.033948539651564025, + "acc_norm": 0.3484848484848485, + "acc_norm_stderr": 0.033948539651564025 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.36787564766839376, + "acc_stderr": 0.03480175668466036, + "acc_norm": 0.36787564766839376, + "acc_norm_stderr": 0.03480175668466036 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.22564102564102564, + "acc_stderr": 0.021193632525148526, + "acc_norm": 0.22564102564102564, + "acc_norm_stderr": 0.021193632525148526 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.26296296296296295, + "acc_stderr": 0.026842057873833706, + "acc_norm": 0.26296296296296295, + "acc_norm_stderr": 0.026842057873833706 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.3487394957983193, + "acc_stderr": 0.03095663632856655, + "acc_norm": 0.3487394957983193, + "acc_norm_stderr": 0.03095663632856655 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.2913907284768212, + "acc_stderr": 0.03710185726119995, + "acc_norm": 0.2913907284768212, + "acc_norm_stderr": 0.03710185726119995 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.23669724770642203, + "acc_stderr": 0.01822407811729908, + "acc_norm": 0.23669724770642203, + "acc_norm_stderr": 0.01822407811729908 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.19444444444444445, + "acc_stderr": 0.026991454502036726, + "acc_norm": 0.19444444444444445, + "acc_norm_stderr": 0.026991454502036726 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.2549019607843137, + "acc_stderr": 0.030587591351604246, + "acc_norm": 0.2549019607843137, + "acc_norm_stderr": 0.030587591351604246 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.20253164556962025, + "acc_stderr": 0.026160568246601457, + "acc_norm": 0.20253164556962025, + "acc_norm_stderr": 0.026160568246601457 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.10762331838565023, + "acc_stderr": 0.020799400082879997, + "acc_norm": 0.10762331838565023, + "acc_norm_stderr": 0.020799400082879997 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.2824427480916031, + "acc_stderr": 0.03948406125768361, + "acc_norm": 0.2824427480916031, + "acc_norm_stderr": 0.03948406125768361 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.14049586776859505, + "acc_stderr": 0.03172233426002161, + "acc_norm": 0.14049586776859505, + "acc_norm_stderr": 0.03172233426002161 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.23148148148148148, + "acc_stderr": 0.04077494709252628, + "acc_norm": 0.23148148148148148, + "acc_norm_stderr": 0.04077494709252628 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.2392638036809816, + "acc_stderr": 0.033519538795212696, + "acc_norm": 0.2392638036809816, + "acc_norm_stderr": 0.033519538795212696 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.19642857142857142, + "acc_stderr": 0.03770970049347019, + "acc_norm": 0.19642857142857142, + "acc_norm_stderr": 0.03770970049347019 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.2524271844660194, + "acc_stderr": 0.04301250399690877, + "acc_norm": 0.2524271844660194, + "acc_norm_stderr": 0.04301250399690877 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.2564102564102564, + "acc_stderr": 0.028605953702004253, + "acc_norm": 0.2564102564102564, + "acc_norm_stderr": 0.028605953702004253 + }, + 
"harness|hendrycksTest-medical_genetics|5": { + "acc": 0.26, + "acc_stderr": 0.0440844002276808, + "acc_norm": 0.26, + "acc_norm_stderr": 0.0440844002276808 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.2720306513409962, + "acc_stderr": 0.015913367447500524, + "acc_norm": 0.2720306513409962, + "acc_norm_stderr": 0.015913367447500524 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.24855491329479767, + "acc_stderr": 0.023267528432100174, + "acc_norm": 0.24855491329479767, + "acc_norm_stderr": 0.023267528432100174 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.26927374301675977, + "acc_stderr": 0.014835616582882585, + "acc_norm": 0.26927374301675977, + "acc_norm_stderr": 0.014835616582882585 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.22875816993464052, + "acc_stderr": 0.024051029739912258, + "acc_norm": 0.22875816993464052, + "acc_norm_stderr": 0.024051029739912258 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.2733118971061093, + "acc_stderr": 0.02531176597542612, + "acc_norm": 0.2733118971061093, + "acc_norm_stderr": 0.02531176597542612 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.2654320987654321, + "acc_stderr": 0.024569223600460845, + "acc_norm": 0.2654320987654321, + "acc_norm_stderr": 0.024569223600460845 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.2730496453900709, + "acc_stderr": 0.02657786094330785, + "acc_norm": 0.2730496453900709, + "acc_norm_stderr": 0.02657786094330785 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.24445893089960888, + "acc_stderr": 0.010976425013113886, + "acc_norm": 0.24445893089960888, + "acc_norm_stderr": 0.010976425013113886 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.4485294117647059, + "acc_stderr": 0.030211479609121593, + "acc_norm": 0.4485294117647059, + "acc_norm_stderr": 0.030211479609121593 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.2679738562091503, + "acc_stderr": 0.017917974069594722, + "acc_norm": 0.2679738562091503, + "acc_norm_stderr": 0.017917974069594722 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.35454545454545455, + "acc_stderr": 0.04582004841505417, + "acc_norm": 0.35454545454545455, + "acc_norm_stderr": 0.04582004841505417 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.4, + "acc_stderr": 0.031362502409358936, + "acc_norm": 0.4, + "acc_norm_stderr": 0.031362502409358936 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.2835820895522388, + "acc_stderr": 0.031871875379197986, + "acc_norm": 0.2835820895522388, + "acc_norm_stderr": 0.031871875379197986 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.22, + "acc_stderr": 0.041633319989322674, + "acc_norm": 0.22, + "acc_norm_stderr": 0.041633319989322674 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.26506024096385544, + "acc_stderr": 0.03436024037944967, + "acc_norm": 0.26506024096385544, + "acc_norm_stderr": 0.03436024037944967 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.21052631578947367, + "acc_stderr": 0.0312678171466318, + "acc_norm": 0.21052631578947367, + "acc_norm_stderr": 0.0312678171466318 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.24112607099143207, + "mc1_stderr": 0.014974827279752344, + "mc2": 0.49613219705316436, + "mc2_stderr": 0.016501500834701685 + }, + "all": { + "acc": 0.2567653715101606, + "acc_stderr": 0.03143442520725739, + "acc_norm": 0.2575557034148247, + "acc_norm_stderr": 0.03144651384830147, + "mc1": 
0.24112607099143207, + "mc1_stderr": 0.014974827279752344, + "mc2": 0.49613219705316436, + "mc2_stderr": 0.016501500834701685 + } + }, + "versions": { + "harness|arc:challenge|25": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "all": 0 + }, + "config_tasks": { + "harness|arc:challenge": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + 
"harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task" + }, + "summary_tasks": { + "harness|arc:challenge|25": { + "hashes": { + "hash_examples": "17b0cae357c0259e", + "hash_full_prompts": 
"045cbb916e5145c6", + "hash_input_tokens": "3722289b79076c44", + "hash_cont_tokens": "8210decc6ff6f7df" + }, + "truncated": 0, + "non-truncated": 4687, + "padded": 4687, + "non-padded": 0, + "effective_few_shots": 25.0, + "num_truncated_few_shots": 0 + }, + "harness|hellaswag|10": { + "hashes": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "ececd684171f1ef2", + "hash_cont_tokens": "b3b9e9017afa63af" + }, + "truncated": 0, + "non-truncated": 40168, + "padded": 40113, + "non-padded": 55, + "effective_few_shots": 10.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hashes": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "c54ff61ad0273dd7", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-anatomy|5": { + "hashes": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "be31a1e22aef5f90", + "hash_cont_tokens": "f11971a765cb609f" + }, + "truncated": 0, + "non-truncated": 540, + "padded": 540, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-astronomy|5": { + "hashes": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "277a7b1fad566940", + "hash_cont_tokens": "bf30e5d3f48250cb" + }, + "truncated": 0, + "non-truncated": 608, + "padded": 608, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-business_ethics|5": { + "hashes": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "ba552605bc116de5", + "hash_cont_tokens": "bc1dd9b2d995eb61" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hashes": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "428c7563d0b98ab9", + "hash_cont_tokens": "890a119624b3b935" + }, + "truncated": 0, + "non-truncated": 1060, + "padded": 1060, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_biology|5": { + "hashes": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "da036601573942e2", + "hash_cont_tokens": "875cde3af7a0ee14" + }, + "truncated": 0, + "non-truncated": 576, + "padded": 576, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_chemistry|5": { + "hashes": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "94e0196d6aded13d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_computer_science|5": { + "hashes": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "6e4d0f4a8d36690b", + "hash_cont_tokens": "ffc0fe414cdc4a83" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + 
"non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_mathematics|5": { + "hashes": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "614054d17109a25d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_medicine|5": { + "hashes": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "081bb2b524defd1c", + "hash_cont_tokens": "1f88b00d41957d82" + }, + "truncated": 0, + "non-truncated": 692, + "padded": 692, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_physics|5": { + "hashes": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "5421d9a1af86cbd4", + "hash_cont_tokens": "f7b8097afc16a47c" + }, + "truncated": 0, + "non-truncated": 408, + "padded": 408, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-computer_security|5": { + "hashes": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "5e6b70ecb333cf18", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hashes": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "c2ef11a87264ceed", + "hash_cont_tokens": "aa0e8bc655f2f641" + }, + "truncated": 0, + "non-truncated": 940, + "padded": 940, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-econometrics|5": { + "hashes": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "ecaccd912a4c3978", + "hash_cont_tokens": "bfb7e3c3c88313f1" + }, + "truncated": 0, + "non-truncated": 456, + "padded": 456, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hashes": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "1590c84291399be8", + "hash_cont_tokens": "2425a3f084a591ef" + }, + "truncated": 0, + "non-truncated": 580, + "padded": 580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hashes": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "3269597f715b0da1", + "hash_cont_tokens": "f52691aef15a407b" + }, + "truncated": 0, + "non-truncated": 1512, + "padded": 1512, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-formal_logic|5": { + "hashes": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "a2800d20f3ab8d7c", + "hash_cont_tokens": "f515d598d9c21263" + }, + "truncated": 0, + "non-truncated": 504, + "padded": 504, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-global_facts|5": { + "hashes": { + 
"hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "94ed44b3772505ad", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_biology|5": { + "hashes": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "24423acb928db768", + "hash_cont_tokens": "bd85a4156a3613ee" + }, + "truncated": 0, + "non-truncated": 1240, + "padded": 1240, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hashes": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "831ff35c474e5cef", + "hash_cont_tokens": "a95c97af1c14e068" + }, + "truncated": 0, + "non-truncated": 812, + "padded": 812, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hashes": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "a20a96b44dcc5b30", + "hash_cont_tokens": "8abfedef914e33c9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hashes": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "5002f4ac8b1562ca", + "hash_cont_tokens": "674fc454bdc5ac93" + }, + "truncated": 0, + "non-truncated": 660, + "padded": 656, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_geography|5": { + "hashes": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "7c5547c7da5bc793", + "hash_cont_tokens": "03a5012b916274ea" + }, + "truncated": 0, + "non-truncated": 792, + "padded": 792, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hashes": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "f62991cb6a496b05", + "hash_cont_tokens": "a83effb8f76b7d7c" + }, + "truncated": 0, + "non-truncated": 772, + "padded": 772, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hashes": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "4cef2aff6e3d59ed", + "hash_cont_tokens": "c583432ad27fcfe0" + }, + "truncated": 0, + "non-truncated": 1560, + "padded": 1560, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hashes": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "6e2577ea4082ed2b", + "hash_cont_tokens": "24f5dc613660300b" + }, + "truncated": 0, + "non-truncated": 1080, + "padded": 1080, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hashes": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": 
"bfc393381298609e", + "hash_input_tokens": "c5fc9aeb1079c8e4", + "hash_cont_tokens": "f47f041de50333b9" + }, + "truncated": 0, + "non-truncated": 952, + "padded": 952, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_physics|5": { + "hashes": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "555fc385cffa84ca", + "hash_cont_tokens": "ba2efcd283e938cc" + }, + "truncated": 0, + "non-truncated": 604, + "padded": 604, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hashes": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "febd23cbf9973b7f", + "hash_cont_tokens": "942069cd363844d9" + }, + "truncated": 0, + "non-truncated": 2180, + "padded": 2180, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hashes": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "400e55b56ee6fbd7", + "hash_cont_tokens": "955ed42b6f7fa019" + }, + "truncated": 0, + "non-truncated": 864, + "padded": 864, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hashes": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "c639cce12a46ebad", + "hash_cont_tokens": "cdd0b3dc06d933e5" + }, + "truncated": 0, + "non-truncated": 816, + "padded": 816, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hashes": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "b9762065cce6f3a6", + "hash_cont_tokens": "9a864184946033ac" + }, + "truncated": 0, + "non-truncated": 948, + "padded": 948, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_aging|5": { + "hashes": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "541a75f071dcf579", + "hash_cont_tokens": "142a4a8a1138a214" + }, + "truncated": 0, + "non-truncated": 892, + "padded": 892, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_sexuality|5": { + "hashes": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "04269e5c5a257dd9", + "hash_cont_tokens": "bc54813e809b796d" + }, + "truncated": 0, + "non-truncated": 524, + "padded": 524, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-international_law|5": { + "hashes": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "d93ba9d9d38e4397", + "hash_cont_tokens": "dc45b45fcda18e5d" + }, + "truncated": 0, + "non-truncated": 484, + "padded": 484, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-jurisprudence|5": { + "hashes": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "9eeaccd2698b4f5a", + "hash_cont_tokens": "e3a8cd951b6e3469" + }, + "truncated": 0, + 
"non-truncated": 432, + "padded": 432, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hashes": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "b4f08f544f2b7576", + "hash_cont_tokens": "1e80dbd30f6453d5" + }, + "truncated": 0, + "non-truncated": 652, + "padded": 648, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-machine_learning|5": { + "hashes": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "900c2a51f1174b9f", + "hash_cont_tokens": "9b37da7777378ca9" + }, + "truncated": 0, + "non-truncated": 448, + "padded": 448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-management|5": { + "hashes": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "6b36efb4689c6eca", + "hash_cont_tokens": "a01d6d39a83c4597" + }, + "truncated": 0, + "non-truncated": 412, + "padded": 412, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-marketing|5": { + "hashes": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "2aaac78a0cfed47a", + "hash_cont_tokens": "6aeaed4d823c98aa" + }, + "truncated": 0, + "non-truncated": 936, + "padded": 936, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-medical_genetics|5": { + "hashes": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "886ca823b41c094a", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-miscellaneous|5": { + "hashes": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "72fd71de7675e7d0", + "hash_cont_tokens": "9b0ab02a64603081" + }, + "truncated": 0, + "non-truncated": 3132, + "padded": 3132, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_disputes|5": { + "hashes": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "f3ca0dd8e7a1eb09", + "hash_cont_tokens": "8badf768f7b0467a" + }, + "truncated": 0, + "non-truncated": 1384, + "padded": 1354, + "non-padded": 30, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hashes": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "3e793631e951f23c", + "hash_cont_tokens": "32ae620376b2bbba" + }, + "truncated": 0, + "non-truncated": 3580, + "padded": 3580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-nutrition|5": { + "hashes": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "59753c2144ea93af", + "hash_cont_tokens": "3071def75bacc404" + }, + "truncated": 0, + "non-truncated": 1224, + "padded": 1224, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-philosophy|5": { + "hashes": 
{ + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "bd8d3dbed15a8c34", + "hash_cont_tokens": "9f6ff69d23a48783" + }, + "truncated": 0, + "non-truncated": 1244, + "padded": 1244, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-prehistory|5": { + "hashes": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "3573cd87facbb7c5", + "hash_cont_tokens": "de469d2b981e32a3" + }, + "truncated": 0, + "non-truncated": 1296, + "padded": 1296, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_accounting|5": { + "hashes": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "17e721bc1a7cbb47", + "hash_cont_tokens": "c46f74d2dfc7b13b" + }, + "truncated": 0, + "non-truncated": 1128, + "padded": 1128, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_law|5": { + "hashes": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "c9f7583fff66d361", + "hash_cont_tokens": "2e590029ef41fbcd" + }, + "truncated": 0, + "non-truncated": 6136, + "padded": 6136, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_medicine|5": { + "hashes": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "40a933f829116f8d", + "hash_cont_tokens": "fe35cfa9c6ca802e" + }, + "truncated": 0, + "non-truncated": 1088, + "padded": 1088, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_psychology|5": { + "hashes": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "0dfb73a8eb3f692c", + "hash_cont_tokens": "f020fbddf72c8652" + }, + "truncated": 0, + "non-truncated": 2448, + "padded": 2448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-public_relations|5": { + "hashes": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "1710c6ba4c9f3cbd", + "hash_cont_tokens": "568f585a259965c1" + }, + "truncated": 0, + "non-truncated": 440, + "padded": 440, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-security_studies|5": { + "hashes": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "32a03f1f22a6e103", + "hash_cont_tokens": "cc6fd7cccd64cd5d" + }, + "truncated": 0, + "non-truncated": 980, + "padded": 980, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-sociology|5": { + "hashes": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "828999f7624cbe7e", + "hash_cont_tokens": "c3a3bdfd177eed5b" + }, + "truncated": 0, + "non-truncated": 804, + "padded": 804, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hashes": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "42054621e718dbee", + 
"hash_cont_tokens": "2568d0e8e36fa959" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-virology|5": { + "hashes": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "6c4f0aa4dc859c04", + "hash_cont_tokens": "926cf60b0891f374" + }, + "truncated": 0, + "non-truncated": 664, + "padded": 664, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-world_religions|5": { + "hashes": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "6c75d44e092ff24f", + "hash_cont_tokens": "c525a5de974c1ea3" + }, + "truncated": 0, + "non-truncated": 684, + "padded": 684, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|truthfulqa:mc|0": { + "hashes": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "2738d7ed7075faa7", + "hash_cont_tokens": "c014154380b74b9e" + }, + "truncated": 0, + "non-truncated": 9996, + "padded": 9996, + "non-padded": 0, + "effective_few_shots": 0.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "d84d18e9a963753d", + "hash_full_prompts": "12b540783521a8e6", + "hash_input_tokens": "5c73a7dce6ccf737", + "hash_cont_tokens": "fb1646e2bdd5fc38" + }, + "total_evaluation_time_secondes": "6767.538799524307", + "truncated": 0, + "non-truncated": 111019, + "padded": 110926, + "non-padded": 93, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/uukuguy/speechless-codellama-orca-airoboros-13b-0.10e/results_2023-09-12T14-42-21.510480.json b/eval-results/uukuguy/speechless-codellama-orca-airoboros-13b-0.10e/results_2023-09-12T14-42-21.510480.json new file mode 100644 index 0000000000000000000000000000000000000000..95c9eeb2a02ccb401d8234c462304c873ce62840 --- /dev/null +++ b/eval-results/uukuguy/speechless-codellama-orca-airoboros-13b-0.10e/results_2023-09-12T14-42-21.510480.json @@ -0,0 +1,1367 @@ +{ + "config_general": { + "model_name": "uukuguy/speechless-codellama-orca-airoboros-13b-0.10e", + "model_sha": "dbd1d1f7ad7b6b359f8246141650b25ca0bb8cbb", + "model_size": "24.32 GB", + "model_dtype": "torch.float16", + "lighteval_sha": "ff467795ccc45b291b69333c263d5f16abd1fcd9", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|arc:challenge|25": { + "acc": 0.24744027303754265, + "acc_stderr": 0.012610352663292673, + "acc_norm": 0.29436860068259385, + "acc_norm_stderr": 0.013318528460539427 + }, + "harness|hellaswag|10": { + "acc": 0.25781716789484166, + "acc_stderr": 0.004365388351563103, + "acc_norm": 0.25712009559848636, + "acc_norm_stderr": 0.004361529679492747 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.18, + "acc_stderr": 0.03861229196653696, + "acc_norm": 0.18, + "acc_norm_stderr": 0.03861229196653696 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.2518518518518518, + "acc_stderr": 0.037498507091740206, + "acc_norm": 0.2518518518518518, + "acc_norm_stderr": 0.037498507091740206 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.18421052631578946, + "acc_stderr": 0.0315469804508223, + "acc_norm": 0.18421052631578946, + "acc_norm_stderr": 0.0315469804508223 + }, + 
"harness|hendrycksTest-business_ethics|5": { + "acc": 0.23, + "acc_stderr": 0.04229525846816506, + "acc_norm": 0.23, + "acc_norm_stderr": 0.04229525846816506 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.2679245283018868, + "acc_stderr": 0.027257260322494845, + "acc_norm": 0.2679245283018868, + "acc_norm_stderr": 0.027257260322494845 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.2152777777777778, + "acc_stderr": 0.034370793441061344, + "acc_norm": 0.2152777777777778, + "acc_norm_stderr": 0.034370793441061344 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.21, + "acc_stderr": 0.040936018074033256, + "acc_norm": 0.21, + "acc_norm_stderr": 0.040936018074033256 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.27, + "acc_stderr": 0.044619604333847394, + "acc_norm": 0.27, + "acc_norm_stderr": 0.044619604333847394 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.23, + "acc_stderr": 0.04229525846816506, + "acc_norm": 0.23, + "acc_norm_stderr": 0.04229525846816506 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.2138728323699422, + "acc_stderr": 0.03126511206173042, + "acc_norm": 0.2138728323699422, + "acc_norm_stderr": 0.03126511206173042 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.19607843137254902, + "acc_stderr": 0.03950581861179961, + "acc_norm": 0.19607843137254902, + "acc_norm_stderr": 0.03950581861179961 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.24, + "acc_stderr": 0.042923469599092816, + "acc_norm": 0.24, + "acc_norm_stderr": 0.042923469599092816 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.32340425531914896, + "acc_stderr": 0.030579442773610334, + "acc_norm": 0.32340425531914896, + "acc_norm_stderr": 0.030579442773610334 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.2807017543859649, + "acc_stderr": 0.04227054451232199, + "acc_norm": 0.2807017543859649, + "acc_norm_stderr": 0.04227054451232199 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.22758620689655173, + "acc_stderr": 0.03493950380131184, + "acc_norm": 0.22758620689655173, + "acc_norm_stderr": 0.03493950380131184 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.2566137566137566, + "acc_stderr": 0.022494510767503154, + "acc_norm": 0.2566137566137566, + "acc_norm_stderr": 0.022494510767503154 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.3253968253968254, + "acc_stderr": 0.041905964388711366, + "acc_norm": 0.3253968253968254, + "acc_norm_stderr": 0.041905964388711366 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.33, + "acc_stderr": 0.04725815626252604, + "acc_norm": 0.33, + "acc_norm_stderr": 0.04725815626252604 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.25483870967741934, + "acc_stderr": 0.024790118459332208, + "acc_norm": 0.25483870967741934, + "acc_norm_stderr": 0.024790118459332208 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.2955665024630542, + "acc_stderr": 0.032104944337514575, + "acc_norm": 0.2955665024630542, + "acc_norm_stderr": 0.032104944337514575 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.19, + "acc_stderr": 0.039427724440366234, + "acc_norm": 0.19, + "acc_norm_stderr": 0.039427724440366234 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.2545454545454545, + "acc_stderr": 0.03401506715249039, + "acc_norm": 0.2545454545454545, + "acc_norm_stderr": 
0.03401506715249039 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.31313131313131315, + "acc_stderr": 0.033042050878136525, + "acc_norm": 0.31313131313131315, + "acc_norm_stderr": 0.033042050878136525 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.36787564766839376, + "acc_stderr": 0.03480175668466036, + "acc_norm": 0.36787564766839376, + "acc_norm_stderr": 0.03480175668466036 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.2282051282051282, + "acc_stderr": 0.02127839386358628, + "acc_norm": 0.2282051282051282, + "acc_norm_stderr": 0.02127839386358628 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.26296296296296295, + "acc_stderr": 0.026842057873833706, + "acc_norm": 0.26296296296296295, + "acc_norm_stderr": 0.026842057873833706 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.3487394957983193, + "acc_stderr": 0.03095663632856655, + "acc_norm": 0.3487394957983193, + "acc_norm_stderr": 0.03095663632856655 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.271523178807947, + "acc_stderr": 0.03631329803969653, + "acc_norm": 0.271523178807947, + "acc_norm_stderr": 0.03631329803969653 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.23669724770642203, + "acc_stderr": 0.01822407811729908, + "acc_norm": 0.23669724770642203, + "acc_norm_stderr": 0.01822407811729908 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.16666666666666666, + "acc_stderr": 0.025416428388767478, + "acc_norm": 0.16666666666666666, + "acc_norm_stderr": 0.025416428388767478 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.2549019607843137, + "acc_stderr": 0.030587591351604246, + "acc_norm": 0.2549019607843137, + "acc_norm_stderr": 0.030587591351604246 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.20253164556962025, + "acc_stderr": 0.026160568246601457, + "acc_norm": 0.20253164556962025, + "acc_norm_stderr": 0.026160568246601457 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.10762331838565023, + "acc_stderr": 0.020799400082879997, + "acc_norm": 0.10762331838565023, + "acc_norm_stderr": 0.020799400082879997 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.2824427480916031, + "acc_stderr": 0.03948406125768361, + "acc_norm": 0.2824427480916031, + "acc_norm_stderr": 0.03948406125768361 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.14049586776859505, + "acc_stderr": 0.03172233426002161, + "acc_norm": 0.14049586776859505, + "acc_norm_stderr": 0.03172233426002161 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.23148148148148148, + "acc_stderr": 0.040774947092526284, + "acc_norm": 0.23148148148148148, + "acc_norm_stderr": 0.040774947092526284 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.22699386503067484, + "acc_stderr": 0.032910995786157686, + "acc_norm": 0.22699386503067484, + "acc_norm_stderr": 0.032910995786157686 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.21428571428571427, + "acc_stderr": 0.038946411200447915, + "acc_norm": 0.21428571428571427, + "acc_norm_stderr": 0.038946411200447915 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.23300970873786409, + "acc_stderr": 0.04185832598928315, + "acc_norm": 0.23300970873786409, + "acc_norm_stderr": 0.04185832598928315 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.2564102564102564, + "acc_stderr": 0.028605953702004253, + "acc_norm": 
0.2564102564102564, + "acc_norm_stderr": 0.028605953702004253 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.26, + "acc_stderr": 0.0440844002276808, + "acc_norm": 0.26, + "acc_norm_stderr": 0.0440844002276808 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.25287356321839083, + "acc_stderr": 0.015543377313719678, + "acc_norm": 0.25287356321839083, + "acc_norm_stderr": 0.015543377313719678 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.24566473988439305, + "acc_stderr": 0.02317629820399201, + "acc_norm": 0.24566473988439305, + "acc_norm_stderr": 0.02317629820399201 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.2681564245810056, + "acc_stderr": 0.014816119635317005, + "acc_norm": 0.2681564245810056, + "acc_norm_stderr": 0.014816119635317005 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.22875816993464052, + "acc_stderr": 0.024051029739912258, + "acc_norm": 0.22875816993464052, + "acc_norm_stderr": 0.024051029739912258 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.2733118971061093, + "acc_stderr": 0.02531176597542612, + "acc_norm": 0.2733118971061093, + "acc_norm_stderr": 0.02531176597542612 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.2654320987654321, + "acc_stderr": 0.024569223600460845, + "acc_norm": 0.2654320987654321, + "acc_norm_stderr": 0.024569223600460845 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.2765957446808511, + "acc_stderr": 0.026684564340460997, + "acc_norm": 0.2765957446808511, + "acc_norm_stderr": 0.026684564340460997 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.24445893089960888, + "acc_stderr": 0.010976425013113886, + "acc_norm": 0.24445893089960888, + "acc_norm_stderr": 0.010976425013113886 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.4485294117647059, + "acc_stderr": 0.030211479609121593, + "acc_norm": 0.4485294117647059, + "acc_norm_stderr": 0.030211479609121593 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.2679738562091503, + "acc_stderr": 0.017917974069594722, + "acc_norm": 0.2679738562091503, + "acc_norm_stderr": 0.017917974069594722 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.34545454545454546, + "acc_stderr": 0.04554619617541054, + "acc_norm": 0.34545454545454546, + "acc_norm_stderr": 0.04554619617541054 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.4, + "acc_stderr": 0.031362502409358936, + "acc_norm": 0.4, + "acc_norm_stderr": 0.031362502409358936 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.27860696517412936, + "acc_stderr": 0.03170056183497308, + "acc_norm": 0.27860696517412936, + "acc_norm_stderr": 0.03170056183497308 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.2, + "acc_stderr": 0.040201512610368466, + "acc_norm": 0.2, + "acc_norm_stderr": 0.040201512610368466 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.25301204819277107, + "acc_stderr": 0.03384429155233134, + "acc_norm": 0.25301204819277107, + "acc_norm_stderr": 0.03384429155233134 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.21052631578947367, + "acc_stderr": 0.0312678171466318, + "acc_norm": 0.21052631578947367, + "acc_norm_stderr": 0.0312678171466318 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.2423500611995104, + "mc1_stderr": 0.015000674373570342, + "mc2": 0.4963684639744407, + "mc2_stderr": 0.016496718744347754 + }, + "all": { + "acc": 0.2542111825994659, + "acc_stderr": 0.03135387998985871, + "acc_norm": 
0.2549947631985964, + "acc_norm_stderr": 0.03136581756825153, + "mc1": 0.2423500611995104, + "mc1_stderr": 0.015000674373570342, + "mc2": 0.4963684639744407, + "mc2_stderr": 0.016496718744347754 + } + }, + "versions": { + "harness|arc:challenge|25": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "all": 0 + }, + "config_tasks": { + "harness|arc:challenge": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + 
"harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task" + }, + "summary_tasks": { + "harness|arc:challenge|25": { + "hashes": { + 
"hash_examples": "17b0cae357c0259e", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "3722289b79076c44", + "hash_cont_tokens": "e8abf848493b50f7" + }, + "truncated": 0, + "non-truncated": 4687, + "padded": 4687, + "non-padded": 0, + "effective_few_shots": 25.0, + "num_truncated_few_shots": 0 + }, + "harness|hellaswag|10": { + "hashes": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "ececd684171f1ef2", + "hash_cont_tokens": "9fe0a5c42e1532db" + }, + "truncated": 0, + "non-truncated": 40168, + "padded": 40113, + "non-padded": 55, + "effective_few_shots": 10.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hashes": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "c54ff61ad0273dd7", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-anatomy|5": { + "hashes": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "be31a1e22aef5f90", + "hash_cont_tokens": "f11971a765cb609f" + }, + "truncated": 0, + "non-truncated": 540, + "padded": 540, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-astronomy|5": { + "hashes": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "277a7b1fad566940", + "hash_cont_tokens": "440a970fadecdc7b" + }, + "truncated": 0, + "non-truncated": 608, + "padded": 608, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-business_ethics|5": { + "hashes": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "ba552605bc116de5", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hashes": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "428c7563d0b98ab9", + "hash_cont_tokens": "7ecd60c25b9bfe5b" + }, + "truncated": 0, + "non-truncated": 1060, + "padded": 1060, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_biology|5": { + "hashes": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "da036601573942e2", + "hash_cont_tokens": "875cde3af7a0ee14" + }, + "truncated": 0, + "non-truncated": 576, + "padded": 576, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_chemistry|5": { + "hashes": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "94e0196d6aded13d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_computer_science|5": { + "hashes": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "6e4d0f4a8d36690b", + "hash_cont_tokens": "50421e30bef398f9" + }, + 
"truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_mathematics|5": { + "hashes": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "614054d17109a25d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_medicine|5": { + "hashes": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "081bb2b524defd1c", + "hash_cont_tokens": "702fb6d82ff0d6ac" + }, + "truncated": 0, + "non-truncated": 692, + "padded": 692, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_physics|5": { + "hashes": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "5421d9a1af86cbd4", + "hash_cont_tokens": "f7b8097afc16a47c" + }, + "truncated": 0, + "non-truncated": 408, + "padded": 408, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-computer_security|5": { + "hashes": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "5e6b70ecb333cf18", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hashes": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "c2ef11a87264ceed", + "hash_cont_tokens": "aa0e8bc655f2f641" + }, + "truncated": 0, + "non-truncated": 940, + "padded": 940, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-econometrics|5": { + "hashes": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "ecaccd912a4c3978", + "hash_cont_tokens": "b1cc6e7e9fcd3827" + }, + "truncated": 0, + "non-truncated": 456, + "padded": 456, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hashes": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "1590c84291399be8", + "hash_cont_tokens": "2425a3f084a591ef" + }, + "truncated": 0, + "non-truncated": 580, + "padded": 580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hashes": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "3269597f715b0da1", + "hash_cont_tokens": "bd87bf0c060fd925" + }, + "truncated": 0, + "non-truncated": 1512, + "padded": 1512, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-formal_logic|5": { + "hashes": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "a2800d20f3ab8d7c", + "hash_cont_tokens": "eb8932890e0605db" + }, + "truncated": 0, + "non-truncated": 504, + "padded": 504, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + 
"harness|hendrycksTest-global_facts|5": { + "hashes": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "94ed44b3772505ad", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_biology|5": { + "hashes": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "24423acb928db768", + "hash_cont_tokens": "1ddcb86d28cde266" + }, + "truncated": 0, + "non-truncated": 1240, + "padded": 1240, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hashes": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "831ff35c474e5cef", + "hash_cont_tokens": "176c8dcff38c5f8f" + }, + "truncated": 0, + "non-truncated": 812, + "padded": 812, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hashes": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "a20a96b44dcc5b30", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hashes": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "5002f4ac8b1562ca", + "hash_cont_tokens": "674fc454bdc5ac93" + }, + "truncated": 0, + "non-truncated": 660, + "padded": 656, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_geography|5": { + "hashes": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "7c5547c7da5bc793", + "hash_cont_tokens": "03a5012b916274ea" + }, + "truncated": 0, + "non-truncated": 792, + "padded": 792, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hashes": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "f62991cb6a496b05", + "hash_cont_tokens": "873d2aab226ba1d8" + }, + "truncated": 0, + "non-truncated": 772, + "padded": 772, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hashes": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "4cef2aff6e3d59ed", + "hash_cont_tokens": "c583432ad27fcfe0" + }, + "truncated": 0, + "non-truncated": 1560, + "padded": 1560, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hashes": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "6e2577ea4082ed2b", + "hash_cont_tokens": "d7907b61bcb8c123" + }, + "truncated": 0, + "non-truncated": 1080, + "padded": 1080, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hashes": { + 
"hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "c5fc9aeb1079c8e4", + "hash_cont_tokens": "f47f041de50333b9" + }, + "truncated": 0, + "non-truncated": 952, + "padded": 952, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_physics|5": { + "hashes": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "555fc385cffa84ca", + "hash_cont_tokens": "0d56317b3e5eedb5" + }, + "truncated": 0, + "non-truncated": 604, + "padded": 604, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hashes": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "febd23cbf9973b7f", + "hash_cont_tokens": "09ba1243e7390c0f" + }, + "truncated": 0, + "non-truncated": 2180, + "padded": 2180, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hashes": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "400e55b56ee6fbd7", + "hash_cont_tokens": "9cc29889c3d3f77d" + }, + "truncated": 0, + "non-truncated": 864, + "padded": 864, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hashes": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "c639cce12a46ebad", + "hash_cont_tokens": "cdd0b3dc06d933e5" + }, + "truncated": 0, + "non-truncated": 816, + "padded": 816, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hashes": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "b9762065cce6f3a6", + "hash_cont_tokens": "e02816433ff28daf" + }, + "truncated": 0, + "non-truncated": 948, + "padded": 948, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_aging|5": { + "hashes": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "541a75f071dcf579", + "hash_cont_tokens": "142a4a8a1138a214" + }, + "truncated": 0, + "non-truncated": 892, + "padded": 892, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_sexuality|5": { + "hashes": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "04269e5c5a257dd9", + "hash_cont_tokens": "bc54813e809b796d" + }, + "truncated": 0, + "non-truncated": 524, + "padded": 524, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-international_law|5": { + "hashes": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "d93ba9d9d38e4397", + "hash_cont_tokens": "8ea8c5ff76a15bca" + }, + "truncated": 0, + "non-truncated": 484, + "padded": 484, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-jurisprudence|5": { + "hashes": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "9eeaccd2698b4f5a", + 
"hash_cont_tokens": "e3a8cd951b6e3469" + }, + "truncated": 0, + "non-truncated": 432, + "padded": 432, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hashes": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "b4f08f544f2b7576", + "hash_cont_tokens": "3e9e0bdc248fd88a" + }, + "truncated": 0, + "non-truncated": 652, + "padded": 648, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-machine_learning|5": { + "hashes": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "900c2a51f1174b9f", + "hash_cont_tokens": "55b12fb138c6a064" + }, + "truncated": 0, + "non-truncated": 448, + "padded": 448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-management|5": { + "hashes": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "6b36efb4689c6eca", + "hash_cont_tokens": "a01d6d39a83c4597" + }, + "truncated": 0, + "non-truncated": 412, + "padded": 412, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-marketing|5": { + "hashes": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "2aaac78a0cfed47a", + "hash_cont_tokens": "6aeaed4d823c98aa" + }, + "truncated": 0, + "non-truncated": 936, + "padded": 936, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-medical_genetics|5": { + "hashes": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "886ca823b41c094a", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-miscellaneous|5": { + "hashes": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "72fd71de7675e7d0", + "hash_cont_tokens": "9b0ab02a64603081" + }, + "truncated": 0, + "non-truncated": 3132, + "padded": 3132, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_disputes|5": { + "hashes": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "f3ca0dd8e7a1eb09", + "hash_cont_tokens": "3b8bbe9108e55ce9" + }, + "truncated": 0, + "non-truncated": 1384, + "padded": 1354, + "non-padded": 30, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hashes": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "3e793631e951f23c", + "hash_cont_tokens": "3e9bfc0362e97330" + }, + "truncated": 0, + "non-truncated": 3580, + "padded": 3580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-nutrition|5": { + "hashes": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "59753c2144ea93af", + "hash_cont_tokens": "23b2dc6ee2da4cfc" + }, + "truncated": 0, + "non-truncated": 1224, + "padded": 1224, + "non-padded": 0, + "effective_few_shots": 5.0, + 
"num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-philosophy|5": { + "hashes": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "bd8d3dbed15a8c34", + "hash_cont_tokens": "9f6ff69d23a48783" + }, + "truncated": 0, + "non-truncated": 1244, + "padded": 1244, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-prehistory|5": { + "hashes": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "3573cd87facbb7c5", + "hash_cont_tokens": "d6458d743d875837" + }, + "truncated": 0, + "non-truncated": 1296, + "padded": 1296, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_accounting|5": { + "hashes": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "17e721bc1a7cbb47", + "hash_cont_tokens": "922a195f53a35662" + }, + "truncated": 0, + "non-truncated": 1128, + "padded": 1128, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_law|5": { + "hashes": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "c9f7583fff66d361", + "hash_cont_tokens": "2e590029ef41fbcd" + }, + "truncated": 0, + "non-truncated": 6136, + "padded": 6136, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_medicine|5": { + "hashes": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "40a933f829116f8d", + "hash_cont_tokens": "7cfee54dbddd5a98" + }, + "truncated": 0, + "non-truncated": 1088, + "padded": 1088, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_psychology|5": { + "hashes": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "0dfb73a8eb3f692c", + "hash_cont_tokens": "a86677b2a45c20e1" + }, + "truncated": 0, + "non-truncated": 2448, + "padded": 2448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-public_relations|5": { + "hashes": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "1710c6ba4c9f3cbd", + "hash_cont_tokens": "0d756ccaae031757" + }, + "truncated": 0, + "non-truncated": 440, + "padded": 440, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-security_studies|5": { + "hashes": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "32a03f1f22a6e103", + "hash_cont_tokens": "b2229bc2cfbf594b" + }, + "truncated": 0, + "non-truncated": 980, + "padded": 980, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-sociology|5": { + "hashes": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "828999f7624cbe7e", + "hash_cont_tokens": "c3a3bdfd177eed5b" + }, + "truncated": 0, + "non-truncated": 804, + "padded": 804, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hashes": { + "hash_examples": "4a56a01ddca44dca", + 
"hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "42054621e718dbee", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-virology|5": { + "hashes": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "6c4f0aa4dc859c04", + "hash_cont_tokens": "af8b3658088cb37f" + }, + "truncated": 0, + "non-truncated": 664, + "padded": 664, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-world_religions|5": { + "hashes": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "6c75d44e092ff24f", + "hash_cont_tokens": "060118bef6de4e0a" + }, + "truncated": 0, + "non-truncated": 684, + "padded": 684, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|truthfulqa:mc|0": { + "hashes": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "2738d7ed7075faa7", + "hash_cont_tokens": "f5da56a132aab151" + }, + "truncated": 0, + "non-truncated": 9996, + "padded": 9996, + "non-padded": 0, + "effective_few_shots": 0.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "d84d18e9a963753d", + "hash_full_prompts": "12b540783521a8e6", + "hash_input_tokens": "5c73a7dce6ccf737", + "hash_cont_tokens": "71d56183130fecbd" + }, + "total_evaluation_time_secondes": "6339.539946317673", + "truncated": 0, + "non-truncated": 111019, + "padded": 110926, + "non-padded": 93, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/uukuguy/speechless-codellama-orca-airoboros-13b-0.10e/results_2023-10-18T02-12-43.735016.json b/eval-results/uukuguy/speechless-codellama-orca-airoboros-13b-0.10e/results_2023-10-18T02-12-43.735016.json new file mode 100644 index 0000000000000000000000000000000000000000..05cb04950a2d289b889d80824043ba708a1c050d --- /dev/null +++ b/eval-results/uukuguy/speechless-codellama-orca-airoboros-13b-0.10e/results_2023-10-18T02-12-43.735016.json @@ -0,0 +1,107 @@ +{ + "config_general": { + "model_name": "uukuguy/speechless-codellama-orca-airoboros-13b-0.10e", + "model_sha": "dbd1d1f7ad7b6b359f8246141650b25ca0bb8cbb", + "model_size": "24.32 GB", + "model_dtype": "torch.bfloat16", + "lighteval_sha": "0f318ecf002208468154899217b3ba7c6ae09374", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|drop|3": { + "em": 0.0, + "em_stderr": 0.0, + "f1": 0.0, + "f1_stderr": 0.0 + }, + "harness|gsm8k|5": { + "acc": 0.0, + "acc_stderr": 0.0 + }, + "harness|winogrande|5": { + "acc": 0.5098658247829518, + "acc_stderr": 0.014049749833367592 + }, + "all": { + "em": 0.0, + "em_stderr": 0.0, + "f1": 0.0, + "f1_stderr": 0.0, + "acc": 0.2549329123914759, + "acc_stderr": 0.007024874916683796 + } + }, + "versions": { + "harness|drop|3": 1, + "harness|gsm8k|5": 0, + "harness|winogrande|5": 0, + "all": 0 + }, + "config_tasks": { + "harness|drop": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|drop|3": { + "hashes": { + "hash_examples": "1d27416e8324e9a3", + "hash_full_prompts": "a5513ff9a741b385", + "hash_input_tokens": "42076f0efbb50aa6", + 
"hash_cont_tokens": "40c358f29b5417ff" + }, + "truncated": 3, + "non-truncated": 9533, + "padded": 0, + "non-padded": 9536, + "effective_few_shots": 3.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "bda342e47b5099b2", + "hash_cont_tokens": "eafafecdc7c603ba" + }, + "truncated": 0, + "non-truncated": 1319, + "padded": 0, + "non-padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "c0bedf98cb040854", + "hash_cont_tokens": "f08975ad6f2d5864" + }, + "truncated": 0, + "non-truncated": 2534, + "padded": 2432, + "non-padded": 102, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "9b4d8993161e637d", + "hash_full_prompts": "08215e527b7e60a5", + "hash_input_tokens": "a12f3e3c934bd78b", + "hash_cont_tokens": "3ebe635f3d0eeb8a" + }, + "total_evaluation_time_secondes": "31045.29495692253", + "truncated": 3, + "non-truncated": 13386, + "padded": 2432, + "non-padded": 10957, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/uukuguy/speechless-codellama-orca-airoboros-13b-0.10e/results_2023-10-28T09-11-54.446220.json b/eval-results/uukuguy/speechless-codellama-orca-airoboros-13b-0.10e/results_2023-10-28T09-11-54.446220.json new file mode 100644 index 0000000000000000000000000000000000000000..e8becaeff3f74a4903f834f6ee92a0cdfbb15abf --- /dev/null +++ b/eval-results/uukuguy/speechless-codellama-orca-airoboros-13b-0.10e/results_2023-10-28T09-11-54.446220.json @@ -0,0 +1,107 @@ +{ + "config_general": { + "model_name": "uukuguy/speechless-codellama-orca-airoboros-13b-0.10e", + "model_sha": "dbd1d1f7ad7b6b359f8246141650b25ca0bb8cbb", + "model_size": "24.32 GB", + "model_dtype": "torch.float16", + "lighteval_sha": "0f318ecf002208468154899217b3ba7c6ae09374", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|drop|3": { + "em": 0.0, + "em_stderr": 0.0, + "f1": 0.0, + "f1_stderr": 0.0 + }, + "harness|gsm8k|5": { + "acc": 0.0, + "acc_stderr": 0.0 + }, + "harness|winogrande|5": { + "acc": 0.5193370165745856, + "acc_stderr": 0.014041972733712977 + }, + "all": { + "em": 0.0, + "em_stderr": 0.0, + "f1": 0.0, + "f1_stderr": 0.0, + "acc": 0.2596685082872928, + "acc_stderr": 0.007020986366856489 + } + }, + "versions": { + "harness|drop|3": 1, + "harness|gsm8k|5": 0, + "harness|winogrande|5": 0, + "all": 0 + }, + "config_tasks": { + "harness|drop": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|drop|3": { + "hashes": { + "hash_examples": "1d27416e8324e9a3", + "hash_full_prompts": "a5513ff9a741b385", + "hash_input_tokens": "42076f0efbb50aa6", + "hash_cont_tokens": "8ab2c888b1ac932b" + }, + "truncated": 3, + "non-truncated": 9533, + "padded": 0, + "non-padded": 9536, + "effective_few_shots": 3.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "bda342e47b5099b2", + "hash_cont_tokens": "2267774afa446f49" + }, + "truncated": 0, + "non-truncated": 1319, + "padded": 0, + "non-padded": 1319, + 
"effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "c0bedf98cb040854", + "hash_cont_tokens": "f08975ad6f2d5864" + }, + "truncated": 0, + "non-truncated": 2534, + "padded": 2432, + "non-padded": 102, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "9b4d8993161e637d", + "hash_full_prompts": "08215e527b7e60a5", + "hash_input_tokens": "a12f3e3c934bd78b", + "hash_cont_tokens": "14f25fca71c52387" + }, + "total_evaluation_time_secondes": "33516.29539060593", + "truncated": 3, + "non-truncated": 13386, + "padded": 2432, + "non-padded": 10957, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/uukuguy/speechless-codellama-orca-airoboros-13b-0.10e/results_2023-12-03T19-08-06.034957.json b/eval-results/uukuguy/speechless-codellama-orca-airoboros-13b-0.10e/results_2023-12-03T19-08-06.034957.json new file mode 100644 index 0000000000000000000000000000000000000000..59732f3a910518fea5d5c6270f5a1317d6ee422e --- /dev/null +++ b/eval-results/uukuguy/speechless-codellama-orca-airoboros-13b-0.10e/results_2023-12-03T19-08-06.034957.json @@ -0,0 +1,63 @@ +{ + "config_general": { + "lighteval_sha": "b35d4d84573be82d91c07ea46260f262f72cf69d", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "", + "start_time": 80976.797194727, + "end_time": 85356.884026307, + "total_evaluation_time_secondes": "4380.0868315800035", + "model_name": "uukuguy/speechless-codellama-orca-airoboros-13b-0.10e", + "model_sha": "dbd1d1f7ad7b6b359f8246141650b25ca0bb8cbb", + "model_dtype": "torch.bfloat16", + "model_size": "24.32 GB" + }, + "results": { + "harness|gsm8k|5": { + "acc": 0.0, + "acc_stderr": 0.0 + }, + "all": { + "acc": 0.0, + "acc_stderr": 0.0 + } + }, + "versions": { + "all": 0, + "harness|gsm8k|5": 0 + }, + "config_tasks": { + "harness|gsm8k": "LM Harness task" + }, + "summary_tasks": { + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "bda342e47b5099b2", + "hash_cont_tokens": "eafafecdc7c603ba" + }, + "truncated": 0, + "non_truncated": 1319, + "padded": 0, + "non_padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "18b756b7813d1bdf", + "hash_full_prompts": "deb3b1dff10b95aa", + "hash_input_tokens": "42036645de5ac59d", + "hash_cont_tokens": "b9e630f3ba8326e3" + }, + "truncated": 0, + "non_truncated": 1319, + "padded": 0, + "non_padded": 1319, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/uukuguy/speechless-codellama-orca-airoboros-13b-0.10e/results_2023-12-03T19-08-12.373310.json b/eval-results/uukuguy/speechless-codellama-orca-airoboros-13b-0.10e/results_2023-12-03T19-08-12.373310.json new file mode 100644 index 0000000000000000000000000000000000000000..0f1bdefb464bf907a8ef5c94ec4e886e058553fd --- /dev/null +++ b/eval-results/uukuguy/speechless-codellama-orca-airoboros-13b-0.10e/results_2023-12-03T19-08-12.373310.json @@ -0,0 +1,63 @@ +{ + "config_general": { + "lighteval_sha": "b35d4d84573be82d91c07ea46260f262f72cf69d", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "", + "start_time": 
80976.837509063, + "end_time": 85365.053940341, + "total_evaluation_time_secondes": "4388.216431277993", + "model_name": "uukuguy/speechless-codellama-orca-airoboros-13b-0.10e", + "model_sha": "dbd1d1f7ad7b6b359f8246141650b25ca0bb8cbb", + "model_dtype": "torch.float16", + "model_size": "24.32 GB" + }, + "results": { + "harness|gsm8k|5": { + "acc": 0.0, + "acc_stderr": 0.0 + }, + "all": { + "acc": 0.0, + "acc_stderr": 0.0 + } + }, + "versions": { + "all": 0, + "harness|gsm8k|5": 0 + }, + "config_tasks": { + "harness|gsm8k": "LM Harness task" + }, + "summary_tasks": { + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "bda342e47b5099b2", + "hash_cont_tokens": "2267774afa446f49" + }, + "truncated": 0, + "non_truncated": 1319, + "padded": 0, + "non_padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "18b756b7813d1bdf", + "hash_full_prompts": "deb3b1dff10b95aa", + "hash_input_tokens": "42036645de5ac59d", + "hash_cont_tokens": "2a4412b8689ca230" + }, + "truncated": 0, + "non_truncated": 1319, + "padded": 0, + "non_padded": 1319, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/uukuguy/speechless-codellama-orca-airoboros-13b-0.10e/results_2023-12-03T19-30-19.333310.json b/eval-results/uukuguy/speechless-codellama-orca-airoboros-13b-0.10e/results_2023-12-03T19-30-19.333310.json new file mode 100644 index 0000000000000000000000000000000000000000..6472f8fa74d770d26cfbdfa96bca23252f5b9964 --- /dev/null +++ b/eval-results/uukuguy/speechless-codellama-orca-airoboros-13b-0.10e/results_2023-12-03T19-30-19.333310.json @@ -0,0 +1,63 @@ +{ + "config_general": { + "lighteval_sha": "b35d4d84573be82d91c07ea46260f262f72cf69d", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "", + "start_time": 82186.060252434, + "end_time": 86696.374309201, + "total_evaluation_time_secondes": "4510.314056766991", + "model_name": "uukuguy/speechless-codellama-orca-airoboros-13b-0.10e", + "model_sha": "dbd1d1f7ad7b6b359f8246141650b25ca0bb8cbb", + "model_dtype": "torch.float16", + "model_size": "24.32 GB" + }, + "results": { + "harness|gsm8k|5": { + "acc": 0.0, + "acc_stderr": 0.0 + }, + "all": { + "acc": 0.0, + "acc_stderr": 0.0 + } + }, + "versions": { + "all": 0, + "harness|gsm8k|5": 0 + }, + "config_tasks": { + "harness|gsm8k": "LM Harness task" + }, + "summary_tasks": { + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "bda342e47b5099b2", + "hash_cont_tokens": "2267774afa446f49" + }, + "truncated": 0, + "non_truncated": 1319, + "padded": 0, + "non_padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "18b756b7813d1bdf", + "hash_full_prompts": "deb3b1dff10b95aa", + "hash_input_tokens": "42036645de5ac59d", + "hash_cont_tokens": "2a4412b8689ca230" + }, + "truncated": 0, + "non_truncated": 1319, + "padded": 0, + "non_padded": 1319, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/uukuguy/speechless-codellama-orca-airoboros-13b-0.10e/results_2023-12-03T19-30-46.049775.json b/eval-results/uukuguy/speechless-codellama-orca-airoboros-13b-0.10e/results_2023-12-03T19-30-46.049775.json new file mode 100644 index 
0000000000000000000000000000000000000000..e0b37fc1902e860f9fbec7d27a99d158c042c9d4 --- /dev/null +++ b/eval-results/uukuguy/speechless-codellama-orca-airoboros-13b-0.10e/results_2023-12-03T19-30-46.049775.json @@ -0,0 +1,63 @@ +{ + "config_general": { + "lighteval_sha": "b35d4d84573be82d91c07ea46260f262f72cf69d", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "", + "start_time": 82179.017000941, + "end_time": 86716.175169107, + "total_evaluation_time_secondes": "4537.158168165988", + "model_name": "uukuguy/speechless-codellama-orca-airoboros-13b-0.10e", + "model_sha": "dbd1d1f7ad7b6b359f8246141650b25ca0bb8cbb", + "model_dtype": "torch.bfloat16", + "model_size": "24.32 GB" + }, + "results": { + "harness|gsm8k|5": { + "acc": 0.0, + "acc_stderr": 0.0 + }, + "all": { + "acc": 0.0, + "acc_stderr": 0.0 + } + }, + "versions": { + "all": 0, + "harness|gsm8k|5": 0 + }, + "config_tasks": { + "harness|gsm8k": "LM Harness task" + }, + "summary_tasks": { + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "bda342e47b5099b2", + "hash_cont_tokens": "eafafecdc7c603ba" + }, + "truncated": 0, + "non_truncated": 1319, + "padded": 0, + "non_padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "18b756b7813d1bdf", + "hash_full_prompts": "deb3b1dff10b95aa", + "hash_input_tokens": "42036645de5ac59d", + "hash_cont_tokens": "b9e630f3ba8326e3" + }, + "truncated": 0, + "non_truncated": 1319, + "padded": 0, + "non_padded": 1319, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/uukuguy/speechless-codellama-orca-platypus-13b-0.10e/results_2023-09-04T08-11-13.966337.json b/eval-results/uukuguy/speechless-codellama-orca-platypus-13b-0.10e/results_2023-09-04T08-11-13.966337.json new file mode 100644 index 0000000000000000000000000000000000000000..5ac5e1f828f81b37757f8d0659dcd812d94e231b --- /dev/null +++ b/eval-results/uukuguy/speechless-codellama-orca-platypus-13b-0.10e/results_2023-09-04T08-11-13.966337.json @@ -0,0 +1,1366 @@ +{ + "config_general": { + "model_name": "uukuguy/speechless-codellama-orca-platypus-13b-0.10e", + "model_sha": "119abfc73f9ce541a40779f167fe21e95faed4e8", + "model_dtype": "torch.bfloat16", + "lighteval_sha": "4b9a38c2102259daeb17d1d0294fc2b4ce7f4e63", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|arc:challenge|25": { + "acc": 0.20392491467576793, + "acc_stderr": 0.011774262478702247, + "acc_norm": 0.28924914675767915, + "acc_norm_stderr": 0.013250012579393443 + }, + "harness|hellaswag|10": { + "acc": 0.25582553276239794, + "acc_stderr": 0.004354325017137537, + "acc_norm": 0.25761800438159727, + "acc_norm_stderr": 0.004364287353415454 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.33, + "acc_stderr": 0.047258156262526045, + "acc_norm": 0.33, + "acc_norm_stderr": 0.047258156262526045 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.2222222222222222, + "acc_stderr": 0.035914440841969694, + "acc_norm": 0.2222222222222222, + "acc_norm_stderr": 0.035914440841969694 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.28289473684210525, + "acc_stderr": 0.03665349695640767, + "acc_norm": 0.28289473684210525, + "acc_norm_stderr": 0.03665349695640767 + }, + 
"harness|hendrycksTest-business_ethics|5": { + "acc": 0.32, + "acc_stderr": 0.046882617226215034, + "acc_norm": 0.32, + "acc_norm_stderr": 0.046882617226215034 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.22264150943396227, + "acc_stderr": 0.025604233470899095, + "acc_norm": 0.22264150943396227, + "acc_norm_stderr": 0.025604233470899095 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.25, + "acc_stderr": 0.03621034121889507, + "acc_norm": 0.25, + "acc_norm_stderr": 0.03621034121889507 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.24, + "acc_stderr": 0.042923469599092816, + "acc_norm": 0.24, + "acc_norm_stderr": 0.042923469599092816 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.22, + "acc_stderr": 0.041633319989322695, + "acc_norm": 0.22, + "acc_norm_stderr": 0.041633319989322695 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.3, + "acc_stderr": 0.046056618647183814, + "acc_norm": 0.3, + "acc_norm_stderr": 0.046056618647183814 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.30057803468208094, + "acc_stderr": 0.034961014811911786, + "acc_norm": 0.30057803468208094, + "acc_norm_stderr": 0.034961014811911786 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.37254901960784315, + "acc_stderr": 0.04810840148082633, + "acc_norm": 0.37254901960784315, + "acc_norm_stderr": 0.04810840148082633 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.28, + "acc_stderr": 0.04512608598542127, + "acc_norm": 0.28, + "acc_norm_stderr": 0.04512608598542127 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.20425531914893616, + "acc_stderr": 0.026355158413349424, + "acc_norm": 0.20425531914893616, + "acc_norm_stderr": 0.026355158413349424 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.24561403508771928, + "acc_stderr": 0.04049339297748141, + "acc_norm": 0.24561403508771928, + "acc_norm_stderr": 0.04049339297748141 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.2689655172413793, + "acc_stderr": 0.03695183311650232, + "acc_norm": 0.2689655172413793, + "acc_norm_stderr": 0.03695183311650232 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.24338624338624337, + "acc_stderr": 0.022101128787415415, + "acc_norm": 0.24338624338624337, + "acc_norm_stderr": 0.022101128787415415 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.36507936507936506, + "acc_stderr": 0.04306241259127153, + "acc_norm": 0.36507936507936506, + "acc_norm_stderr": 0.04306241259127153 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.19, + "acc_stderr": 0.03942772444036624, + "acc_norm": 0.19, + "acc_norm_stderr": 0.03942772444036624 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.24516129032258063, + "acc_stderr": 0.02447224384089553, + "acc_norm": 0.24516129032258063, + "acc_norm_stderr": 0.02447224384089553 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.22660098522167488, + "acc_stderr": 0.029454863835293003, + "acc_norm": 0.22660098522167488, + "acc_norm_stderr": 0.029454863835293003 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.26, + "acc_stderr": 0.044084400227680794, + "acc_norm": 0.26, + "acc_norm_stderr": 0.044084400227680794 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.20606060606060606, + "acc_stderr": 0.031584153240477086, + "acc_norm": 0.20606060606060606, + "acc_norm_stderr": 0.031584153240477086 + }, + 
"harness|hendrycksTest-high_school_geography|5": { + "acc": 0.2727272727272727, + "acc_stderr": 0.03173071239071724, + "acc_norm": 0.2727272727272727, + "acc_norm_stderr": 0.03173071239071724 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.3626943005181347, + "acc_stderr": 0.03469713791704371, + "acc_norm": 0.3626943005181347, + "acc_norm_stderr": 0.03469713791704371 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.23333333333333334, + "acc_stderr": 0.021444547301560493, + "acc_norm": 0.23333333333333334, + "acc_norm_stderr": 0.021444547301560493 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.24814814814814815, + "acc_stderr": 0.0263357394040558, + "acc_norm": 0.24814814814814815, + "acc_norm_stderr": 0.0263357394040558 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.31512605042016806, + "acc_stderr": 0.030176808288974337, + "acc_norm": 0.31512605042016806, + "acc_norm_stderr": 0.030176808288974337 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.13245033112582782, + "acc_stderr": 0.02767757707489175, + "acc_norm": 0.13245033112582782, + "acc_norm_stderr": 0.02767757707489175 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.21100917431192662, + "acc_stderr": 0.017493922404112648, + "acc_norm": 0.21100917431192662, + "acc_norm_stderr": 0.017493922404112648 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.16203703703703703, + "acc_stderr": 0.02513045365226846, + "acc_norm": 0.16203703703703703, + "acc_norm_stderr": 0.02513045365226846 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.20588235294117646, + "acc_stderr": 0.028379449451588667, + "acc_norm": 0.20588235294117646, + "acc_norm_stderr": 0.028379449451588667 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.23628691983122363, + "acc_stderr": 0.02765215314415928, + "acc_norm": 0.23628691983122363, + "acc_norm_stderr": 0.02765215314415928 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.3094170403587444, + "acc_stderr": 0.031024411740572206, + "acc_norm": 0.3094170403587444, + "acc_norm_stderr": 0.031024411740572206 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.22900763358778625, + "acc_stderr": 0.036853466317118506, + "acc_norm": 0.22900763358778625, + "acc_norm_stderr": 0.036853466317118506 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.3140495867768595, + "acc_stderr": 0.04236964753041018, + "acc_norm": 0.3140495867768595, + "acc_norm_stderr": 0.04236964753041018 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.21296296296296297, + "acc_stderr": 0.0395783547198098, + "acc_norm": 0.21296296296296297, + "acc_norm_stderr": 0.0395783547198098 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.24539877300613497, + "acc_stderr": 0.03380939813943354, + "acc_norm": 0.24539877300613497, + "acc_norm_stderr": 0.03380939813943354 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.3392857142857143, + "acc_stderr": 0.04493949068613539, + "acc_norm": 0.3392857142857143, + "acc_norm_stderr": 0.04493949068613539 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.22330097087378642, + "acc_stderr": 0.04123553189891431, + "acc_norm": 0.22330097087378642, + "acc_norm_stderr": 0.04123553189891431 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.27350427350427353, + "acc_stderr": 0.029202540153431187, + "acc_norm": 0.27350427350427353, + 
"acc_norm_stderr": 0.029202540153431187 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.22, + "acc_stderr": 0.041633319989322695, + "acc_norm": 0.22, + "acc_norm_stderr": 0.041633319989322695 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.2707535121328225, + "acc_stderr": 0.015889888362560486, + "acc_norm": 0.2707535121328225, + "acc_norm_stderr": 0.015889888362560486 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.2138728323699422, + "acc_stderr": 0.022075709251757183, + "acc_norm": 0.2138728323699422, + "acc_norm_stderr": 0.022075709251757183 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.2424581005586592, + "acc_stderr": 0.014333522059217889, + "acc_norm": 0.2424581005586592, + "acc_norm_stderr": 0.014333522059217889 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.23202614379084968, + "acc_stderr": 0.024170840879341016, + "acc_norm": 0.23202614379084968, + "acc_norm_stderr": 0.024170840879341016 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.2990353697749196, + "acc_stderr": 0.026003301117885135, + "acc_norm": 0.2990353697749196, + "acc_norm_stderr": 0.026003301117885135 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.22530864197530864, + "acc_stderr": 0.023246202647819746, + "acc_norm": 0.22530864197530864, + "acc_norm_stderr": 0.023246202647819746 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.26595744680851063, + "acc_stderr": 0.026358065698880596, + "acc_norm": 0.26595744680851063, + "acc_norm_stderr": 0.026358065698880596 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.24511082138200782, + "acc_stderr": 0.010986307870045514, + "acc_norm": 0.24511082138200782, + "acc_norm_stderr": 0.010986307870045514 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.17647058823529413, + "acc_stderr": 0.02315746830855935, + "acc_norm": 0.17647058823529413, + "acc_norm_stderr": 0.02315746830855935 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.2565359477124183, + "acc_stderr": 0.017667841612378984, + "acc_norm": 0.2565359477124183, + "acc_norm_stderr": 0.017667841612378984 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.22727272727272727, + "acc_stderr": 0.04013964554072774, + "acc_norm": 0.22727272727272727, + "acc_norm_stderr": 0.04013964554072774 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.16326530612244897, + "acc_stderr": 0.023661699177098622, + "acc_norm": 0.16326530612244897, + "acc_norm_stderr": 0.023661699177098622 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.2736318407960199, + "acc_stderr": 0.03152439186555402, + "acc_norm": 0.2736318407960199, + "acc_norm_stderr": 0.03152439186555402 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.26, + "acc_stderr": 0.04408440022768078, + "acc_norm": 0.26, + "acc_norm_stderr": 0.04408440022768078 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.22289156626506024, + "acc_stderr": 0.03240004825594688, + "acc_norm": 0.22289156626506024, + "acc_norm_stderr": 0.03240004825594688 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.29239766081871343, + "acc_stderr": 0.034886477134579215, + "acc_norm": 0.29239766081871343, + "acc_norm_stderr": 0.034886477134579215 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.24357405140758873, + "mc1_stderr": 0.01502635482491078, + "mc2": 0.4922408617550152, + "mc2_stderr": 0.016579665032793813 + }, + "all": { + "acc": 0.2520232154837474, + "acc_stderr": 0.03158302657070843, 
+ "acc_norm": 0.2534997697837662, + "acc_norm_stderr": 0.03160820813743672, + "mc1": 0.24357405140758873, + "mc1_stderr": 0.01502635482491078, + "mc2": 0.4922408617550152, + "mc2_stderr": 0.016579665032793813 + } + }, + "versions": { + "harness|arc:challenge|25": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "all": 0 + }, + "config_tasks": { + "harness|arc:challenge": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + 
"harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task" + }, + "summary_tasks": { + "harness|arc:challenge|25": { + "hashes": { + 
"hash_examples": "17b0cae357c0259e", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "3722289b79076c44", + "hash_cont_tokens": "8210decc6ff6f7df" + }, + "truncated": 0, + "non-truncated": 4687, + "padded": 4687, + "non-padded": 0, + "effective_few_shots": 25.0, + "num_truncated_few_shots": 0 + }, + "harness|hellaswag|10": { + "hashes": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "ececd684171f1ef2", + "hash_cont_tokens": "b3b9e9017afa63af" + }, + "truncated": 0, + "non-truncated": 40168, + "padded": 40113, + "non-padded": 55, + "effective_few_shots": 10.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hashes": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "c54ff61ad0273dd7", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-anatomy|5": { + "hashes": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "be31a1e22aef5f90", + "hash_cont_tokens": "f11971a765cb609f" + }, + "truncated": 0, + "non-truncated": 540, + "padded": 540, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-astronomy|5": { + "hashes": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "277a7b1fad566940", + "hash_cont_tokens": "bf30e5d3f48250cb" + }, + "truncated": 0, + "non-truncated": 608, + "padded": 608, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-business_ethics|5": { + "hashes": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "ba552605bc116de5", + "hash_cont_tokens": "bc1dd9b2d995eb61" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hashes": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "428c7563d0b98ab9", + "hash_cont_tokens": "890a119624b3b935" + }, + "truncated": 0, + "non-truncated": 1060, + "padded": 1060, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_biology|5": { + "hashes": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "da036601573942e2", + "hash_cont_tokens": "875cde3af7a0ee14" + }, + "truncated": 0, + "non-truncated": 576, + "padded": 576, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_chemistry|5": { + "hashes": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "94e0196d6aded13d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_computer_science|5": { + "hashes": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "6e4d0f4a8d36690b", + "hash_cont_tokens": "ffc0fe414cdc4a83" + }, + 
"truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_mathematics|5": { + "hashes": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "614054d17109a25d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_medicine|5": { + "hashes": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "081bb2b524defd1c", + "hash_cont_tokens": "1f88b00d41957d82" + }, + "truncated": 0, + "non-truncated": 692, + "padded": 692, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_physics|5": { + "hashes": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "5421d9a1af86cbd4", + "hash_cont_tokens": "f7b8097afc16a47c" + }, + "truncated": 0, + "non-truncated": 408, + "padded": 408, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-computer_security|5": { + "hashes": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "5e6b70ecb333cf18", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hashes": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "c2ef11a87264ceed", + "hash_cont_tokens": "aa0e8bc655f2f641" + }, + "truncated": 0, + "non-truncated": 940, + "padded": 940, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-econometrics|5": { + "hashes": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "ecaccd912a4c3978", + "hash_cont_tokens": "bfb7e3c3c88313f1" + }, + "truncated": 0, + "non-truncated": 456, + "padded": 456, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hashes": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "1590c84291399be8", + "hash_cont_tokens": "2425a3f084a591ef" + }, + "truncated": 0, + "non-truncated": 580, + "padded": 580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hashes": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "3269597f715b0da1", + "hash_cont_tokens": "f52691aef15a407b" + }, + "truncated": 0, + "non-truncated": 1512, + "padded": 1512, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-formal_logic|5": { + "hashes": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "a2800d20f3ab8d7c", + "hash_cont_tokens": "f515d598d9c21263" + }, + "truncated": 0, + "non-truncated": 504, + "padded": 504, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + 
"harness|hendrycksTest-global_facts|5": { + "hashes": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "94ed44b3772505ad", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_biology|5": { + "hashes": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "24423acb928db768", + "hash_cont_tokens": "bd85a4156a3613ee" + }, + "truncated": 0, + "non-truncated": 1240, + "padded": 1240, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hashes": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "831ff35c474e5cef", + "hash_cont_tokens": "a95c97af1c14e068" + }, + "truncated": 0, + "non-truncated": 812, + "padded": 812, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hashes": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "a20a96b44dcc5b30", + "hash_cont_tokens": "8abfedef914e33c9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hashes": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "5002f4ac8b1562ca", + "hash_cont_tokens": "674fc454bdc5ac93" + }, + "truncated": 0, + "non-truncated": 660, + "padded": 656, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_geography|5": { + "hashes": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "7c5547c7da5bc793", + "hash_cont_tokens": "03a5012b916274ea" + }, + "truncated": 0, + "non-truncated": 792, + "padded": 792, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hashes": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "f62991cb6a496b05", + "hash_cont_tokens": "a83effb8f76b7d7c" + }, + "truncated": 0, + "non-truncated": 772, + "padded": 772, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hashes": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "4cef2aff6e3d59ed", + "hash_cont_tokens": "c583432ad27fcfe0" + }, + "truncated": 0, + "non-truncated": 1560, + "padded": 1560, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hashes": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "6e2577ea4082ed2b", + "hash_cont_tokens": "24f5dc613660300b" + }, + "truncated": 0, + "non-truncated": 1080, + "padded": 1080, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hashes": { + 
"hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "c5fc9aeb1079c8e4", + "hash_cont_tokens": "f47f041de50333b9" + }, + "truncated": 0, + "non-truncated": 952, + "padded": 952, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_physics|5": { + "hashes": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "555fc385cffa84ca", + "hash_cont_tokens": "ba2efcd283e938cc" + }, + "truncated": 0, + "non-truncated": 604, + "padded": 604, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hashes": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "febd23cbf9973b7f", + "hash_cont_tokens": "942069cd363844d9" + }, + "truncated": 0, + "non-truncated": 2180, + "padded": 2180, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hashes": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "400e55b56ee6fbd7", + "hash_cont_tokens": "955ed42b6f7fa019" + }, + "truncated": 0, + "non-truncated": 864, + "padded": 864, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hashes": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "c639cce12a46ebad", + "hash_cont_tokens": "cdd0b3dc06d933e5" + }, + "truncated": 0, + "non-truncated": 816, + "padded": 816, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hashes": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "b9762065cce6f3a6", + "hash_cont_tokens": "9a864184946033ac" + }, + "truncated": 0, + "non-truncated": 948, + "padded": 948, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_aging|5": { + "hashes": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "541a75f071dcf579", + "hash_cont_tokens": "142a4a8a1138a214" + }, + "truncated": 0, + "non-truncated": 892, + "padded": 892, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_sexuality|5": { + "hashes": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "04269e5c5a257dd9", + "hash_cont_tokens": "bc54813e809b796d" + }, + "truncated": 0, + "non-truncated": 524, + "padded": 524, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-international_law|5": { + "hashes": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "d93ba9d9d38e4397", + "hash_cont_tokens": "dc45b45fcda18e5d" + }, + "truncated": 0, + "non-truncated": 484, + "padded": 484, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-jurisprudence|5": { + "hashes": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "9eeaccd2698b4f5a", + 
"hash_cont_tokens": "e3a8cd951b6e3469" + }, + "truncated": 0, + "non-truncated": 432, + "padded": 432, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hashes": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "b4f08f544f2b7576", + "hash_cont_tokens": "1e80dbd30f6453d5" + }, + "truncated": 0, + "non-truncated": 652, + "padded": 648, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-machine_learning|5": { + "hashes": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "900c2a51f1174b9f", + "hash_cont_tokens": "9b37da7777378ca9" + }, + "truncated": 0, + "non-truncated": 448, + "padded": 448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-management|5": { + "hashes": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "6b36efb4689c6eca", + "hash_cont_tokens": "a01d6d39a83c4597" + }, + "truncated": 0, + "non-truncated": 412, + "padded": 412, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-marketing|5": { + "hashes": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "2aaac78a0cfed47a", + "hash_cont_tokens": "6aeaed4d823c98aa" + }, + "truncated": 0, + "non-truncated": 936, + "padded": 936, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-medical_genetics|5": { + "hashes": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "886ca823b41c094a", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-miscellaneous|5": { + "hashes": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "72fd71de7675e7d0", + "hash_cont_tokens": "9b0ab02a64603081" + }, + "truncated": 0, + "non-truncated": 3132, + "padded": 3132, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_disputes|5": { + "hashes": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "f3ca0dd8e7a1eb09", + "hash_cont_tokens": "8badf768f7b0467a" + }, + "truncated": 0, + "non-truncated": 1384, + "padded": 1354, + "non-padded": 30, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hashes": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "3e793631e951f23c", + "hash_cont_tokens": "32ae620376b2bbba" + }, + "truncated": 0, + "non-truncated": 3580, + "padded": 3580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-nutrition|5": { + "hashes": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "59753c2144ea93af", + "hash_cont_tokens": "3071def75bacc404" + }, + "truncated": 0, + "non-truncated": 1224, + "padded": 1224, + "non-padded": 0, + "effective_few_shots": 5.0, + 
"num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-philosophy|5": { + "hashes": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "bd8d3dbed15a8c34", + "hash_cont_tokens": "9f6ff69d23a48783" + }, + "truncated": 0, + "non-truncated": 1244, + "padded": 1244, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-prehistory|5": { + "hashes": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "3573cd87facbb7c5", + "hash_cont_tokens": "de469d2b981e32a3" + }, + "truncated": 0, + "non-truncated": 1296, + "padded": 1296, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_accounting|5": { + "hashes": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "17e721bc1a7cbb47", + "hash_cont_tokens": "c46f74d2dfc7b13b" + }, + "truncated": 0, + "non-truncated": 1128, + "padded": 1128, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_law|5": { + "hashes": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "c9f7583fff66d361", + "hash_cont_tokens": "2e590029ef41fbcd" + }, + "truncated": 0, + "non-truncated": 6136, + "padded": 6136, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_medicine|5": { + "hashes": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "40a933f829116f8d", + "hash_cont_tokens": "fe35cfa9c6ca802e" + }, + "truncated": 0, + "non-truncated": 1088, + "padded": 1088, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_psychology|5": { + "hashes": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "0dfb73a8eb3f692c", + "hash_cont_tokens": "f020fbddf72c8652" + }, + "truncated": 0, + "non-truncated": 2448, + "padded": 2448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-public_relations|5": { + "hashes": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "1710c6ba4c9f3cbd", + "hash_cont_tokens": "568f585a259965c1" + }, + "truncated": 0, + "non-truncated": 440, + "padded": 440, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-security_studies|5": { + "hashes": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "32a03f1f22a6e103", + "hash_cont_tokens": "cc6fd7cccd64cd5d" + }, + "truncated": 0, + "non-truncated": 980, + "padded": 980, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-sociology|5": { + "hashes": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "828999f7624cbe7e", + "hash_cont_tokens": "c3a3bdfd177eed5b" + }, + "truncated": 0, + "non-truncated": 804, + "padded": 804, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hashes": { + "hash_examples": "4a56a01ddca44dca", + 
"hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "42054621e718dbee", + "hash_cont_tokens": "2568d0e8e36fa959" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-virology|5": { + "hashes": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "6c4f0aa4dc859c04", + "hash_cont_tokens": "926cf60b0891f374" + }, + "truncated": 0, + "non-truncated": 664, + "padded": 664, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-world_religions|5": { + "hashes": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "6c75d44e092ff24f", + "hash_cont_tokens": "c525a5de974c1ea3" + }, + "truncated": 0, + "non-truncated": 684, + "padded": 684, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|truthfulqa:mc|0": { + "hashes": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "2738d7ed7075faa7", + "hash_cont_tokens": "c014154380b74b9e" + }, + "truncated": 0, + "non-truncated": 9996, + "padded": 9996, + "non-padded": 0, + "effective_few_shots": 0.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "d84d18e9a963753d", + "hash_full_prompts": "12b540783521a8e6", + "hash_input_tokens": "5c73a7dce6ccf737", + "hash_cont_tokens": "fb1646e2bdd5fc38" + }, + "total_evaluation_time_secondes": "6770.291565656662", + "truncated": 0, + "non-truncated": 111019, + "padded": 110926, + "non-padded": 93, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/uukuguy/speechless-codellama-orca-platypus-13b-0.10e/results_2023-09-12T14-48-20.175227.json b/eval-results/uukuguy/speechless-codellama-orca-platypus-13b-0.10e/results_2023-09-12T14-48-20.175227.json new file mode 100644 index 0000000000000000000000000000000000000000..4908d7dfdccdbd168513a1720449cfdc3bbe27c4 --- /dev/null +++ b/eval-results/uukuguy/speechless-codellama-orca-platypus-13b-0.10e/results_2023-09-12T14-48-20.175227.json @@ -0,0 +1,1367 @@ +{ + "config_general": { + "model_name": "uukuguy/speechless-codellama-orca-platypus-13b-0.10e", + "model_sha": "119abfc73f9ce541a40779f167fe21e95faed4e8", + "model_size": "24.32 GB", + "model_dtype": "torch.float16", + "lighteval_sha": "ff467795ccc45b291b69333c263d5f16abd1fcd9", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|arc:challenge|25": { + "acc": 0.2030716723549488, + "acc_stderr": 0.011755899303705582, + "acc_norm": 0.28754266211604096, + "acc_norm_stderr": 0.013226719056266134 + }, + "harness|hellaswag|10": { + "acc": 0.25582553276239794, + "acc_stderr": 0.004354325017137537, + "acc_norm": 0.2588129854610635, + "acc_norm_stderr": 0.004370875625258996 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.31, + "acc_stderr": 0.04648231987117316, + "acc_norm": 0.31, + "acc_norm_stderr": 0.04648231987117316 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.22962962962962963, + "acc_stderr": 0.03633384414073462, + "acc_norm": 0.22962962962962963, + "acc_norm_stderr": 0.03633384414073462 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.26973684210526316, + "acc_stderr": 0.03611780560284898, + "acc_norm": 
0.26973684210526316, + "acc_norm_stderr": 0.03611780560284898 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.34, + "acc_stderr": 0.04760952285695236, + "acc_norm": 0.34, + "acc_norm_stderr": 0.04760952285695236 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.23773584905660378, + "acc_stderr": 0.026199808807561925, + "acc_norm": 0.23773584905660378, + "acc_norm_stderr": 0.026199808807561925 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.2638888888888889, + "acc_stderr": 0.03685651095897532, + "acc_norm": 0.2638888888888889, + "acc_norm_stderr": 0.03685651095897532 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.28, + "acc_stderr": 0.04512608598542127, + "acc_norm": 0.28, + "acc_norm_stderr": 0.04512608598542127 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.21, + "acc_stderr": 0.040936018074033256, + "acc_norm": 0.21, + "acc_norm_stderr": 0.040936018074033256 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.31, + "acc_stderr": 0.04648231987117316, + "acc_norm": 0.31, + "acc_norm_stderr": 0.04648231987117316 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.3063583815028902, + "acc_stderr": 0.035149425512674394, + "acc_norm": 0.3063583815028902, + "acc_norm_stderr": 0.035149425512674394 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.37254901960784315, + "acc_stderr": 0.04810840148082633, + "acc_norm": 0.37254901960784315, + "acc_norm_stderr": 0.04810840148082633 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.25, + "acc_stderr": 0.04351941398892446, + "acc_norm": 0.25, + "acc_norm_stderr": 0.04351941398892446 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.20425531914893616, + "acc_stderr": 0.026355158413349424, + "acc_norm": 0.20425531914893616, + "acc_norm_stderr": 0.026355158413349424 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.24561403508771928, + "acc_stderr": 0.04049339297748141, + "acc_norm": 0.24561403508771928, + "acc_norm_stderr": 0.04049339297748141 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.27586206896551724, + "acc_stderr": 0.037245636197746325, + "acc_norm": 0.27586206896551724, + "acc_norm_stderr": 0.037245636197746325 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.24338624338624337, + "acc_stderr": 0.022101128787415422, + "acc_norm": 0.24338624338624337, + "acc_norm_stderr": 0.022101128787415422 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.36507936507936506, + "acc_stderr": 0.04306241259127153, + "acc_norm": 0.36507936507936506, + "acc_norm_stderr": 0.04306241259127153 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.19, + "acc_stderr": 0.03942772444036624, + "acc_norm": 0.19, + "acc_norm_stderr": 0.03942772444036624 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.24516129032258063, + "acc_stderr": 0.02447224384089553, + "acc_norm": 0.24516129032258063, + "acc_norm_stderr": 0.02447224384089553 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.2315270935960591, + "acc_stderr": 0.02967833314144444, + "acc_norm": 0.2315270935960591, + "acc_norm_stderr": 0.02967833314144444 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.25, + "acc_stderr": 0.04351941398892446, + "acc_norm": 0.25, + "acc_norm_stderr": 0.04351941398892446 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.22424242424242424, + "acc_stderr": 0.03256866661681102, + 
"acc_norm": 0.22424242424242424, + "acc_norm_stderr": 0.03256866661681102 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.29292929292929293, + "acc_stderr": 0.032424979581788166, + "acc_norm": 0.29292929292929293, + "acc_norm_stderr": 0.032424979581788166 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.3626943005181347, + "acc_stderr": 0.03469713791704371, + "acc_norm": 0.3626943005181347, + "acc_norm_stderr": 0.03469713791704371 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.2282051282051282, + "acc_stderr": 0.02127839386358628, + "acc_norm": 0.2282051282051282, + "acc_norm_stderr": 0.02127839386358628 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.24444444444444444, + "acc_stderr": 0.02620276653465215, + "acc_norm": 0.24444444444444444, + "acc_norm_stderr": 0.02620276653465215 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.31932773109243695, + "acc_stderr": 0.0302839955258844, + "acc_norm": 0.31932773109243695, + "acc_norm_stderr": 0.0302839955258844 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.13245033112582782, + "acc_stderr": 0.02767757707489175, + "acc_norm": 0.13245033112582782, + "acc_norm_stderr": 0.02767757707489175 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.21467889908256882, + "acc_stderr": 0.01760430414925649, + "acc_norm": 0.21467889908256882, + "acc_norm_stderr": 0.01760430414925649 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.12962962962962962, + "acc_stderr": 0.022907883151288597, + "acc_norm": 0.12962962962962962, + "acc_norm_stderr": 0.022907883151288597 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.20588235294117646, + "acc_stderr": 0.028379449451588674, + "acc_norm": 0.20588235294117646, + "acc_norm_stderr": 0.028379449451588674 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.23628691983122363, + "acc_stderr": 0.02765215314415927, + "acc_norm": 0.23628691983122363, + "acc_norm_stderr": 0.02765215314415927 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.31390134529147984, + "acc_stderr": 0.031146796482972465, + "acc_norm": 0.31390134529147984, + "acc_norm_stderr": 0.031146796482972465 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.24427480916030533, + "acc_stderr": 0.037683359597287434, + "acc_norm": 0.24427480916030533, + "acc_norm_stderr": 0.037683359597287434 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.30578512396694213, + "acc_stderr": 0.04205953933884124, + "acc_norm": 0.30578512396694213, + "acc_norm_stderr": 0.04205953933884124 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.19444444444444445, + "acc_stderr": 0.038260763248848646, + "acc_norm": 0.19444444444444445, + "acc_norm_stderr": 0.038260763248848646 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.27607361963190186, + "acc_stderr": 0.0351238528370505, + "acc_norm": 0.27607361963190186, + "acc_norm_stderr": 0.0351238528370505 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.36607142857142855, + "acc_stderr": 0.0457237235873743, + "acc_norm": 0.36607142857142855, + "acc_norm_stderr": 0.0457237235873743 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.21359223300970873, + "acc_stderr": 0.04058042015646034, + "acc_norm": 0.21359223300970873, + "acc_norm_stderr": 0.04058042015646034 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.25213675213675213, + 
"acc_stderr": 0.02844796547623102, + "acc_norm": 0.25213675213675213, + "acc_norm_stderr": 0.02844796547623102 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.19, + "acc_stderr": 0.03942772444036623, + "acc_norm": 0.19, + "acc_norm_stderr": 0.03942772444036623 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.2707535121328225, + "acc_stderr": 0.015889888362560486, + "acc_norm": 0.2707535121328225, + "acc_norm_stderr": 0.015889888362560486 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.2138728323699422, + "acc_stderr": 0.022075709251757183, + "acc_norm": 0.2138728323699422, + "acc_norm_stderr": 0.022075709251757183 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.2424581005586592, + "acc_stderr": 0.014333522059217889, + "acc_norm": 0.2424581005586592, + "acc_norm_stderr": 0.014333522059217889 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.22875816993464052, + "acc_stderr": 0.024051029739912255, + "acc_norm": 0.22875816993464052, + "acc_norm_stderr": 0.024051029739912255 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.2990353697749196, + "acc_stderr": 0.026003301117885135, + "acc_norm": 0.2990353697749196, + "acc_norm_stderr": 0.026003301117885135 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.22530864197530864, + "acc_stderr": 0.023246202647819746, + "acc_norm": 0.22530864197530864, + "acc_norm_stderr": 0.023246202647819746 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.2624113475177305, + "acc_stderr": 0.026244920349843007, + "acc_norm": 0.2624113475177305, + "acc_norm_stderr": 0.026244920349843007 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.24445893089960888, + "acc_stderr": 0.010976425013113902, + "acc_norm": 0.24445893089960888, + "acc_norm_stderr": 0.010976425013113902 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.18382352941176472, + "acc_stderr": 0.02352924218519311, + "acc_norm": 0.18382352941176472, + "acc_norm_stderr": 0.02352924218519311 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.25980392156862747, + "acc_stderr": 0.017740899509177788, + "acc_norm": 0.25980392156862747, + "acc_norm_stderr": 0.017740899509177788 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.22727272727272727, + "acc_stderr": 0.04013964554072774, + "acc_norm": 0.22727272727272727, + "acc_norm_stderr": 0.04013964554072774 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.16326530612244897, + "acc_stderr": 0.023661699177098622, + "acc_norm": 0.16326530612244897, + "acc_norm_stderr": 0.023661699177098622 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.26865671641791045, + "acc_stderr": 0.03134328358208954, + "acc_norm": 0.26865671641791045, + "acc_norm_stderr": 0.03134328358208954 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.27, + "acc_stderr": 0.044619604333847394, + "acc_norm": 0.27, + "acc_norm_stderr": 0.044619604333847394 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.22289156626506024, + "acc_stderr": 0.03240004825594689, + "acc_norm": 0.22289156626506024, + "acc_norm_stderr": 0.03240004825594689 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.29239766081871343, + "acc_stderr": 0.034886477134579215, + "acc_norm": 0.29239766081871343, + "acc_norm_stderr": 0.034886477134579215 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.2423500611995104, + "mc1_stderr": 0.01500067437357034, + "mc2": 0.4927276728925703, + "mc2_stderr": 0.016570107801526537 + }, + 
"all": { + "acc": 0.25274408092866135, + "acc_stderr": 0.031570516886240504, + "acc_norm": 0.25422642741103013, + "acc_norm_stderr": 0.03159572655337071, + "mc1": 0.2423500611995104, + "mc1_stderr": 0.01500067437357034, + "mc2": 0.4927276728925703, + "mc2_stderr": 0.016570107801526537 + } + }, + "versions": { + "harness|arc:challenge|25": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "all": 0 + }, + "config_tasks": { + "harness|arc:challenge": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM 
Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task" + }, + 
"summary_tasks": { + "harness|arc:challenge|25": { + "hashes": { + "hash_examples": "17b0cae357c0259e", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "3722289b79076c44", + "hash_cont_tokens": "e8abf848493b50f7" + }, + "truncated": 0, + "non-truncated": 4687, + "padded": 4687, + "non-padded": 0, + "effective_few_shots": 25.0, + "num_truncated_few_shots": 0 + }, + "harness|hellaswag|10": { + "hashes": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "ececd684171f1ef2", + "hash_cont_tokens": "9fe0a5c42e1532db" + }, + "truncated": 0, + "non-truncated": 40168, + "padded": 40113, + "non-padded": 55, + "effective_few_shots": 10.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hashes": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "c54ff61ad0273dd7", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-anatomy|5": { + "hashes": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "be31a1e22aef5f90", + "hash_cont_tokens": "f11971a765cb609f" + }, + "truncated": 0, + "non-truncated": 540, + "padded": 540, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-astronomy|5": { + "hashes": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "277a7b1fad566940", + "hash_cont_tokens": "440a970fadecdc7b" + }, + "truncated": 0, + "non-truncated": 608, + "padded": 608, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-business_ethics|5": { + "hashes": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "ba552605bc116de5", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hashes": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "428c7563d0b98ab9", + "hash_cont_tokens": "7ecd60c25b9bfe5b" + }, + "truncated": 0, + "non-truncated": 1060, + "padded": 1060, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_biology|5": { + "hashes": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "da036601573942e2", + "hash_cont_tokens": "875cde3af7a0ee14" + }, + "truncated": 0, + "non-truncated": 576, + "padded": 576, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_chemistry|5": { + "hashes": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "94e0196d6aded13d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_computer_science|5": { + "hashes": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": 
"6e4d0f4a8d36690b", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_mathematics|5": { + "hashes": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "614054d17109a25d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_medicine|5": { + "hashes": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "081bb2b524defd1c", + "hash_cont_tokens": "702fb6d82ff0d6ac" + }, + "truncated": 0, + "non-truncated": 692, + "padded": 692, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_physics|5": { + "hashes": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "5421d9a1af86cbd4", + "hash_cont_tokens": "f7b8097afc16a47c" + }, + "truncated": 0, + "non-truncated": 408, + "padded": 408, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-computer_security|5": { + "hashes": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "5e6b70ecb333cf18", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hashes": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "c2ef11a87264ceed", + "hash_cont_tokens": "aa0e8bc655f2f641" + }, + "truncated": 0, + "non-truncated": 940, + "padded": 940, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-econometrics|5": { + "hashes": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "ecaccd912a4c3978", + "hash_cont_tokens": "b1cc6e7e9fcd3827" + }, + "truncated": 0, + "non-truncated": 456, + "padded": 456, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hashes": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "1590c84291399be8", + "hash_cont_tokens": "2425a3f084a591ef" + }, + "truncated": 0, + "non-truncated": 580, + "padded": 580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hashes": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "3269597f715b0da1", + "hash_cont_tokens": "bd87bf0c060fd925" + }, + "truncated": 0, + "non-truncated": 1512, + "padded": 1512, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-formal_logic|5": { + "hashes": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "a2800d20f3ab8d7c", + "hash_cont_tokens": "eb8932890e0605db" + }, + "truncated": 0, + "non-truncated": 504, + "padded": 504, + "non-padded": 0, + 
"effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-global_facts|5": { + "hashes": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "94ed44b3772505ad", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_biology|5": { + "hashes": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "24423acb928db768", + "hash_cont_tokens": "1ddcb86d28cde266" + }, + "truncated": 0, + "non-truncated": 1240, + "padded": 1240, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hashes": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "831ff35c474e5cef", + "hash_cont_tokens": "176c8dcff38c5f8f" + }, + "truncated": 0, + "non-truncated": 812, + "padded": 812, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hashes": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "a20a96b44dcc5b30", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hashes": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "5002f4ac8b1562ca", + "hash_cont_tokens": "674fc454bdc5ac93" + }, + "truncated": 0, + "non-truncated": 660, + "padded": 656, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_geography|5": { + "hashes": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "7c5547c7da5bc793", + "hash_cont_tokens": "03a5012b916274ea" + }, + "truncated": 0, + "non-truncated": 792, + "padded": 792, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hashes": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "f62991cb6a496b05", + "hash_cont_tokens": "873d2aab226ba1d8" + }, + "truncated": 0, + "non-truncated": 772, + "padded": 772, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hashes": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "4cef2aff6e3d59ed", + "hash_cont_tokens": "c583432ad27fcfe0" + }, + "truncated": 0, + "non-truncated": 1560, + "padded": 1560, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hashes": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "6e2577ea4082ed2b", + "hash_cont_tokens": "d7907b61bcb8c123" + }, + "truncated": 0, + "non-truncated": 1080, + "padded": 1080, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + 
"harness|hendrycksTest-high_school_microeconomics|5": { + "hashes": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "c5fc9aeb1079c8e4", + "hash_cont_tokens": "f47f041de50333b9" + }, + "truncated": 0, + "non-truncated": 952, + "padded": 952, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_physics|5": { + "hashes": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "555fc385cffa84ca", + "hash_cont_tokens": "0d56317b3e5eedb5" + }, + "truncated": 0, + "non-truncated": 604, + "padded": 604, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hashes": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "febd23cbf9973b7f", + "hash_cont_tokens": "09ba1243e7390c0f" + }, + "truncated": 0, + "non-truncated": 2180, + "padded": 2180, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hashes": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "400e55b56ee6fbd7", + "hash_cont_tokens": "9cc29889c3d3f77d" + }, + "truncated": 0, + "non-truncated": 864, + "padded": 864, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hashes": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "c639cce12a46ebad", + "hash_cont_tokens": "cdd0b3dc06d933e5" + }, + "truncated": 0, + "non-truncated": 816, + "padded": 816, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hashes": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "b9762065cce6f3a6", + "hash_cont_tokens": "e02816433ff28daf" + }, + "truncated": 0, + "non-truncated": 948, + "padded": 948, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_aging|5": { + "hashes": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "541a75f071dcf579", + "hash_cont_tokens": "142a4a8a1138a214" + }, + "truncated": 0, + "non-truncated": 892, + "padded": 892, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_sexuality|5": { + "hashes": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "04269e5c5a257dd9", + "hash_cont_tokens": "bc54813e809b796d" + }, + "truncated": 0, + "non-truncated": 524, + "padded": 524, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-international_law|5": { + "hashes": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "d93ba9d9d38e4397", + "hash_cont_tokens": "8ea8c5ff76a15bca" + }, + "truncated": 0, + "non-truncated": 484, + "padded": 484, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-jurisprudence|5": { + "hashes": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": 
"0f89ee3fe03d6a21", + "hash_input_tokens": "9eeaccd2698b4f5a", + "hash_cont_tokens": "e3a8cd951b6e3469" + }, + "truncated": 0, + "non-truncated": 432, + "padded": 432, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hashes": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "b4f08f544f2b7576", + "hash_cont_tokens": "3e9e0bdc248fd88a" + }, + "truncated": 0, + "non-truncated": 652, + "padded": 648, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-machine_learning|5": { + "hashes": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "900c2a51f1174b9f", + "hash_cont_tokens": "55b12fb138c6a064" + }, + "truncated": 0, + "non-truncated": 448, + "padded": 448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-management|5": { + "hashes": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "6b36efb4689c6eca", + "hash_cont_tokens": "a01d6d39a83c4597" + }, + "truncated": 0, + "non-truncated": 412, + "padded": 412, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-marketing|5": { + "hashes": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "2aaac78a0cfed47a", + "hash_cont_tokens": "6aeaed4d823c98aa" + }, + "truncated": 0, + "non-truncated": 936, + "padded": 936, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-medical_genetics|5": { + "hashes": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "886ca823b41c094a", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-miscellaneous|5": { + "hashes": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "72fd71de7675e7d0", + "hash_cont_tokens": "9b0ab02a64603081" + }, + "truncated": 0, + "non-truncated": 3132, + "padded": 3132, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_disputes|5": { + "hashes": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "f3ca0dd8e7a1eb09", + "hash_cont_tokens": "3b8bbe9108e55ce9" + }, + "truncated": 0, + "non-truncated": 1384, + "padded": 1354, + "non-padded": 30, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hashes": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "3e793631e951f23c", + "hash_cont_tokens": "3e9bfc0362e97330" + }, + "truncated": 0, + "non-truncated": 3580, + "padded": 3580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-nutrition|5": { + "hashes": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "59753c2144ea93af", + "hash_cont_tokens": "23b2dc6ee2da4cfc" + }, + "truncated": 0, + "non-truncated": 1224, + "padded": 1224, + 
"non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-philosophy|5": { + "hashes": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "bd8d3dbed15a8c34", + "hash_cont_tokens": "9f6ff69d23a48783" + }, + "truncated": 0, + "non-truncated": 1244, + "padded": 1244, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-prehistory|5": { + "hashes": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "3573cd87facbb7c5", + "hash_cont_tokens": "d6458d743d875837" + }, + "truncated": 0, + "non-truncated": 1296, + "padded": 1296, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_accounting|5": { + "hashes": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "17e721bc1a7cbb47", + "hash_cont_tokens": "922a195f53a35662" + }, + "truncated": 0, + "non-truncated": 1128, + "padded": 1128, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_law|5": { + "hashes": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "c9f7583fff66d361", + "hash_cont_tokens": "2e590029ef41fbcd" + }, + "truncated": 0, + "non-truncated": 6136, + "padded": 6136, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_medicine|5": { + "hashes": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "40a933f829116f8d", + "hash_cont_tokens": "7cfee54dbddd5a98" + }, + "truncated": 0, + "non-truncated": 1088, + "padded": 1088, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_psychology|5": { + "hashes": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "0dfb73a8eb3f692c", + "hash_cont_tokens": "a86677b2a45c20e1" + }, + "truncated": 0, + "non-truncated": 2448, + "padded": 2448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-public_relations|5": { + "hashes": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "1710c6ba4c9f3cbd", + "hash_cont_tokens": "0d756ccaae031757" + }, + "truncated": 0, + "non-truncated": 440, + "padded": 440, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-security_studies|5": { + "hashes": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "32a03f1f22a6e103", + "hash_cont_tokens": "b2229bc2cfbf594b" + }, + "truncated": 0, + "non-truncated": 980, + "padded": 980, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-sociology|5": { + "hashes": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "828999f7624cbe7e", + "hash_cont_tokens": "c3a3bdfd177eed5b" + }, + "truncated": 0, + "non-truncated": 804, + "padded": 804, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hashes": { + 
"hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "42054621e718dbee", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-virology|5": { + "hashes": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "6c4f0aa4dc859c04", + "hash_cont_tokens": "af8b3658088cb37f" + }, + "truncated": 0, + "non-truncated": 664, + "padded": 664, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-world_religions|5": { + "hashes": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "6c75d44e092ff24f", + "hash_cont_tokens": "060118bef6de4e0a" + }, + "truncated": 0, + "non-truncated": 684, + "padded": 684, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|truthfulqa:mc|0": { + "hashes": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "2738d7ed7075faa7", + "hash_cont_tokens": "f5da56a132aab151" + }, + "truncated": 0, + "non-truncated": 9996, + "padded": 9996, + "non-padded": 0, + "effective_few_shots": 0.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "d84d18e9a963753d", + "hash_full_prompts": "12b540783521a8e6", + "hash_input_tokens": "5c73a7dce6ccf737", + "hash_cont_tokens": "71d56183130fecbd" + }, + "total_evaluation_time_secondes": "6349.435353755951", + "truncated": 0, + "non-truncated": 111019, + "padded": 110926, + "non-padded": 93, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/uukuguy/speechless-codellama-orca-platypus-13b-0.10e/results_2023-10-17T17-09-41.931905.json b/eval-results/uukuguy/speechless-codellama-orca-platypus-13b-0.10e/results_2023-10-17T17-09-41.931905.json new file mode 100644 index 0000000000000000000000000000000000000000..3a42c4828d9bb42d221e13ec27ddfdb3814cc374 --- /dev/null +++ b/eval-results/uukuguy/speechless-codellama-orca-platypus-13b-0.10e/results_2023-10-17T17-09-41.931905.json @@ -0,0 +1,107 @@ +{ + "config_general": { + "model_name": "uukuguy/speechless-codellama-orca-platypus-13b-0.10e", + "model_sha": "119abfc73f9ce541a40779f167fe21e95faed4e8", + "model_size": "24.32 GB", + "model_dtype": "torch.bfloat16", + "lighteval_sha": "0f318ecf002208468154899217b3ba7c6ae09374", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|drop|3": { + "em": 0.0, + "em_stderr": 0.0, + "f1": 9.962248322147652e-05, + "f1_stderr": 3.468772577255907e-05 + }, + "harness|gsm8k|5": { + "acc": 0.0, + "acc_stderr": 0.0 + }, + "harness|winogrande|5": { + "acc": 0.5059194948697711, + "acc_stderr": 0.014051500838485807 + }, + "all": { + "em": 0.0, + "em_stderr": 0.0, + "f1": 9.962248322147652e-05, + "f1_stderr": 3.468772577255907e-05, + "acc": 0.25295974743488553, + "acc_stderr": 0.007025750419242903 + } + }, + "versions": { + "harness|drop|3": 1, + "harness|gsm8k|5": 0, + "harness|winogrande|5": 0, + "all": 0 + }, + "config_tasks": { + "harness|drop": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|drop|3": { + "hashes": { + "hash_examples": 
"1d27416e8324e9a3", + "hash_full_prompts": "a5513ff9a741b385", + "hash_input_tokens": "42076f0efbb50aa6", + "hash_cont_tokens": "61d217e9717c9846" + }, + "truncated": 3, + "non-truncated": 9533, + "padded": 0, + "non-padded": 9536, + "effective_few_shots": 3.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "bda342e47b5099b2", + "hash_cont_tokens": "f59577a3c3ba7413" + }, + "truncated": 0, + "non-truncated": 1319, + "padded": 0, + "non-padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "c0bedf98cb040854", + "hash_cont_tokens": "f08975ad6f2d5864" + }, + "truncated": 0, + "non-truncated": 2534, + "padded": 2432, + "non-padded": 102, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "9b4d8993161e637d", + "hash_full_prompts": "08215e527b7e60a5", + "hash_input_tokens": "a12f3e3c934bd78b", + "hash_cont_tokens": "6c7293270e2793e7" + }, + "total_evaluation_time_secondes": "31867.728604078293", + "truncated": 3, + "non-truncated": 13386, + "padded": 2432, + "non-padded": 10957, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/uukuguy/speechless-codellama-orca-platypus-13b-0.10e/results_2023-10-24T14-54-29.987056.json b/eval-results/uukuguy/speechless-codellama-orca-platypus-13b-0.10e/results_2023-10-24T14-54-29.987056.json new file mode 100644 index 0000000000000000000000000000000000000000..45cf0e5092cb336a2c8f677a67c1865f9ec62c65 --- /dev/null +++ b/eval-results/uukuguy/speechless-codellama-orca-platypus-13b-0.10e/results_2023-10-24T14-54-29.987056.json @@ -0,0 +1,107 @@ +{ + "config_general": { + "model_name": "uukuguy/speechless-codellama-orca-platypus-13b-0.10e", + "model_sha": "119abfc73f9ce541a40779f167fe21e95faed4e8", + "model_size": "24.32 GB", + "model_dtype": "torch.float16", + "lighteval_sha": "0f318ecf002208468154899217b3ba7c6ae09374", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|drop|3": { + "em": 0.0, + "em_stderr": 0.0, + "f1": 8.913590604026845e-05, + "f1_stderr": 2.996167513080367e-05 + }, + "harness|gsm8k|5": { + "acc": 0.0, + "acc_stderr": 0.0 + }, + "harness|winogrande|5": { + "acc": 0.4972375690607735, + "acc_stderr": 0.014052271211616441 + }, + "all": { + "em": 0.0, + "em_stderr": 0.0, + "f1": 8.913590604026845e-05, + "f1_stderr": 2.996167513080367e-05, + "acc": 0.24861878453038674, + "acc_stderr": 0.007026135605808221 + } + }, + "versions": { + "harness|drop|3": 1, + "harness|gsm8k|5": 0, + "harness|winogrande|5": 0, + "all": 0 + }, + "config_tasks": { + "harness|drop": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|drop|3": { + "hashes": { + "hash_examples": "1d27416e8324e9a3", + "hash_full_prompts": "a5513ff9a741b385", + "hash_input_tokens": "42076f0efbb50aa6", + "hash_cont_tokens": "58e715ebac5c7282" + }, + "truncated": 3, + "non-truncated": 9533, + "padded": 0, + "non-padded": 9536, + "effective_few_shots": 3.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": 
"41d55e83abc0e02d", + "hash_input_tokens": "bda342e47b5099b2", + "hash_cont_tokens": "82f3437abbe2a22c" + }, + "truncated": 0, + "non-truncated": 1319, + "padded": 0, + "non-padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "c0bedf98cb040854", + "hash_cont_tokens": "f08975ad6f2d5864" + }, + "truncated": 0, + "non-truncated": 2534, + "padded": 2432, + "non-padded": 102, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "9b4d8993161e637d", + "hash_full_prompts": "08215e527b7e60a5", + "hash_input_tokens": "a12f3e3c934bd78b", + "hash_cont_tokens": "8091bdef61584b21" + }, + "total_evaluation_time_secondes": "31731.846982955933", + "truncated": 3, + "non-truncated": 13386, + "padded": 2432, + "non-padded": 10957, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/uukuguy/speechless-codellama-platypus-13b/results_2023-08-31T15-51-18.379129.json b/eval-results/uukuguy/speechless-codellama-platypus-13b/results_2023-08-31T15-51-18.379129.json new file mode 100644 index 0000000000000000000000000000000000000000..6ed012b745ff4050ced022ec8fe3677fea1e99d1 --- /dev/null +++ b/eval-results/uukuguy/speechless-codellama-platypus-13b/results_2023-08-31T15-51-18.379129.json @@ -0,0 +1,1366 @@ +{ + "config_general": { + "model_name": "uukuguy/speechless-codellama-platypus-13b", + "model_sha": "7a771bd8899b9ef4ba9680e96f84dc85810a67d6", + "model_dtype": "torch.bfloat16", + "lighteval_sha": "4b9a38c2102259daeb17d1d0294fc2b4ce7f4e63", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|arc:challenge|25": { + "acc": 0.4249146757679181, + "acc_stderr": 0.01444569896852077, + "acc_norm": 0.4616040955631399, + "acc_norm_stderr": 0.014568245550296361 + }, + "harness|hellaswag|10": { + "acc": 0.5159330810595499, + "acc_stderr": 0.004987247325495631, + "acc_norm": 0.6888070105556662, + "acc_norm_stderr": 0.004620353433075615 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.26, + "acc_stderr": 0.0440844002276808, + "acc_norm": 0.26, + "acc_norm_stderr": 0.0440844002276808 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.35555555555555557, + "acc_stderr": 0.04135176749720386, + "acc_norm": 0.35555555555555557, + "acc_norm_stderr": 0.04135176749720386 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.4407894736842105, + "acc_stderr": 0.04040311062490436, + "acc_norm": 0.4407894736842105, + "acc_norm_stderr": 0.04040311062490436 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.36, + "acc_stderr": 0.048241815132442176, + "acc_norm": 0.36, + "acc_norm_stderr": 0.048241815132442176 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.4075471698113208, + "acc_stderr": 0.030242233800854498, + "acc_norm": 0.4075471698113208, + "acc_norm_stderr": 0.030242233800854498 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.3819444444444444, + "acc_stderr": 0.040629907841466674, + "acc_norm": 0.3819444444444444, + "acc_norm_stderr": 0.040629907841466674 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.34, + "acc_stderr": 0.047609522856952365, + "acc_norm": 0.34, + "acc_norm_stderr": 0.047609522856952365 + }, + "harness|hendrycksTest-college_computer_science|5": { + 
"acc": 0.4, + "acc_stderr": 0.049236596391733084, + "acc_norm": 0.4, + "acc_norm_stderr": 0.049236596391733084 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.35, + "acc_stderr": 0.047937248544110196, + "acc_norm": 0.35, + "acc_norm_stderr": 0.047937248544110196 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.4046242774566474, + "acc_stderr": 0.03742461193887248, + "acc_norm": 0.4046242774566474, + "acc_norm_stderr": 0.03742461193887248 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.27450980392156865, + "acc_stderr": 0.04440521906179327, + "acc_norm": 0.27450980392156865, + "acc_norm_stderr": 0.04440521906179327 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.59, + "acc_stderr": 0.049431107042371025, + "acc_norm": 0.59, + "acc_norm_stderr": 0.049431107042371025 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.37446808510638296, + "acc_stderr": 0.03163910665367291, + "acc_norm": 0.37446808510638296, + "acc_norm_stderr": 0.03163910665367291 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.3157894736842105, + "acc_stderr": 0.043727482902780064, + "acc_norm": 0.3157894736842105, + "acc_norm_stderr": 0.043727482902780064 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.43448275862068964, + "acc_stderr": 0.04130740879555497, + "acc_norm": 0.43448275862068964, + "acc_norm_stderr": 0.04130740879555497 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.2830687830687831, + "acc_stderr": 0.02320139293819498, + "acc_norm": 0.2830687830687831, + "acc_norm_stderr": 0.02320139293819498 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.3333333333333333, + "acc_stderr": 0.042163702135578345, + "acc_norm": 0.3333333333333333, + "acc_norm_stderr": 0.042163702135578345 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.33, + "acc_stderr": 0.047258156262526045, + "acc_norm": 0.33, + "acc_norm_stderr": 0.047258156262526045 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.4612903225806452, + "acc_stderr": 0.028358634859836935, + "acc_norm": 0.4612903225806452, + "acc_norm_stderr": 0.028358634859836935 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.3054187192118227, + "acc_stderr": 0.03240661565868408, + "acc_norm": 0.3054187192118227, + "acc_norm_stderr": 0.03240661565868408 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.54, + "acc_stderr": 0.05009082659620333, + "acc_norm": 0.54, + "acc_norm_stderr": 0.05009082659620333 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.5454545454545454, + "acc_stderr": 0.03888176921674101, + "acc_norm": 0.5454545454545454, + "acc_norm_stderr": 0.03888176921674101 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.4696969696969697, + "acc_stderr": 0.03555804051763929, + "acc_norm": 0.4696969696969697, + "acc_norm_stderr": 0.03555804051763929 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.5647668393782384, + "acc_stderr": 0.03578038165008585, + "acc_norm": 0.5647668393782384, + "acc_norm_stderr": 0.03578038165008585 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.4, + "acc_stderr": 0.024838811988033165, + "acc_norm": 0.4, + "acc_norm_stderr": 0.024838811988033165 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.2814814814814815, + "acc_stderr": 0.02742001935094528, + "acc_norm": 0.2814814814814815, + "acc_norm_stderr": 
0.02742001935094528 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.37815126050420167, + "acc_stderr": 0.03149930577784906, + "acc_norm": 0.37815126050420167, + "acc_norm_stderr": 0.03149930577784906 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.2582781456953642, + "acc_stderr": 0.035737053147634576, + "acc_norm": 0.2582781456953642, + "acc_norm_stderr": 0.035737053147634576 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.5908256880733945, + "acc_stderr": 0.021080670264433728, + "acc_norm": 0.5908256880733945, + "acc_norm_stderr": 0.021080670264433728 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.3611111111111111, + "acc_stderr": 0.03275773486100999, + "acc_norm": 0.3611111111111111, + "acc_norm_stderr": 0.03275773486100999 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.6029411764705882, + "acc_stderr": 0.03434131164719128, + "acc_norm": 0.6029411764705882, + "acc_norm_stderr": 0.03434131164719128 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.6708860759493671, + "acc_stderr": 0.03058732629470236, + "acc_norm": 0.6708860759493671, + "acc_norm_stderr": 0.03058732629470236 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.4977578475336323, + "acc_stderr": 0.03355746535223263, + "acc_norm": 0.4977578475336323, + "acc_norm_stderr": 0.03355746535223263 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.4198473282442748, + "acc_stderr": 0.04328577215262972, + "acc_norm": 0.4198473282442748, + "acc_norm_stderr": 0.04328577215262972 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.6033057851239669, + "acc_stderr": 0.044658697805310094, + "acc_norm": 0.6033057851239669, + "acc_norm_stderr": 0.044658697805310094 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.5462962962962963, + "acc_stderr": 0.04812917324536823, + "acc_norm": 0.5462962962962963, + "acc_norm_stderr": 0.04812917324536823 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.5337423312883436, + "acc_stderr": 0.039194155450484096, + "acc_norm": 0.5337423312883436, + "acc_norm_stderr": 0.039194155450484096 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.3125, + "acc_stderr": 0.043994650575715215, + "acc_norm": 0.3125, + "acc_norm_stderr": 0.043994650575715215 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.6019417475728155, + "acc_stderr": 0.04846748253977239, + "acc_norm": 0.6019417475728155, + "acc_norm_stderr": 0.04846748253977239 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.7094017094017094, + "acc_stderr": 0.029745048572674054, + "acc_norm": 0.7094017094017094, + "acc_norm_stderr": 0.029745048572674054 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.42, + "acc_stderr": 0.04960449637488584, + "acc_norm": 0.42, + "acc_norm_stderr": 0.04960449637488584 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.5887611749680716, + "acc_stderr": 0.017595971908056573, + "acc_norm": 0.5887611749680716, + "acc_norm_stderr": 0.017595971908056573 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.45375722543352603, + "acc_stderr": 0.026803720583206188, + "acc_norm": 0.45375722543352603, + "acc_norm_stderr": 0.026803720583206188 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.3329608938547486, + "acc_stderr": 0.015761716178397563, + "acc_norm": 0.3329608938547486, + "acc_norm_stderr": 0.015761716178397563 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 
0.43137254901960786, + "acc_stderr": 0.02835895631342354, + "acc_norm": 0.43137254901960786, + "acc_norm_stderr": 0.02835895631342354 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.5144694533762058, + "acc_stderr": 0.02838619808417768, + "acc_norm": 0.5144694533762058, + "acc_norm_stderr": 0.02838619808417768 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.4845679012345679, + "acc_stderr": 0.0278074900442762, + "acc_norm": 0.4845679012345679, + "acc_norm_stderr": 0.0278074900442762 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.3475177304964539, + "acc_stderr": 0.02840662780959095, + "acc_norm": 0.3475177304964539, + "acc_norm_stderr": 0.02840662780959095 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.35723598435462844, + "acc_stderr": 0.012238615750316508, + "acc_norm": 0.35723598435462844, + "acc_norm_stderr": 0.012238615750316508 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.3382352941176471, + "acc_stderr": 0.028739328513983576, + "acc_norm": 0.3382352941176471, + "acc_norm_stderr": 0.028739328513983576 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.44607843137254904, + "acc_stderr": 0.020109864547181364, + "acc_norm": 0.44607843137254904, + "acc_norm_stderr": 0.020109864547181364 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.5727272727272728, + "acc_stderr": 0.047381987035454834, + "acc_norm": 0.5727272727272728, + "acc_norm_stderr": 0.047381987035454834 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.5469387755102041, + "acc_stderr": 0.03186785930004128, + "acc_norm": 0.5469387755102041, + "acc_norm_stderr": 0.03186785930004128 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.6119402985074627, + "acc_stderr": 0.03445789964362749, + "acc_norm": 0.6119402985074627, + "acc_norm_stderr": 0.03445789964362749 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.7, + "acc_stderr": 0.046056618647183814, + "acc_norm": 0.7, + "acc_norm_stderr": 0.046056618647183814 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.39156626506024095, + "acc_stderr": 0.03799857454479637, + "acc_norm": 0.39156626506024095, + "acc_norm_stderr": 0.03799857454479637 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.5614035087719298, + "acc_stderr": 0.038057975055904594, + "acc_norm": 0.5614035087719298, + "acc_norm_stderr": 0.038057975055904594 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.2802937576499388, + "mc1_stderr": 0.01572313952460876, + "mc2": 0.4497718048355691, + "mc2_stderr": 0.014936196335906366 + }, + "all": { + "acc": 0.4462981195828559, + "acc_stderr": 0.03524970482715913, + "acc_norm": 0.44985004075728546, + "acc_norm_stderr": 0.035245563347317696, + "mc1": 0.2802937576499388, + "mc1_stderr": 0.01572313952460876, + "mc2": 0.4497718048355691, + "mc2_stderr": 0.014936196335906366 + } + }, + "versions": { + "harness|arc:challenge|25": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + 
"harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "all": 0 + }, + "config_tasks": { + "harness|arc:challenge": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness 
task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task" + }, + "summary_tasks": { + "harness|arc:challenge|25": { + "hashes": { + "hash_examples": "17b0cae357c0259e", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "3722289b79076c44", + "hash_cont_tokens": "8210decc6ff6f7df" + }, + "truncated": 0, + "non-truncated": 4687, + "padded": 4687, + "non-padded": 0, + "effective_few_shots": 25.0, + "num_truncated_few_shots": 0 + }, + "harness|hellaswag|10": { + "hashes": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "ececd684171f1ef2", + "hash_cont_tokens": "b3b9e9017afa63af" + }, + "truncated": 0, + "non-truncated": 40168, + "padded": 40113, + "non-padded": 55, + "effective_few_shots": 10.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hashes": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + 
"hash_input_tokens": "c54ff61ad0273dd7", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-anatomy|5": { + "hashes": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "be31a1e22aef5f90", + "hash_cont_tokens": "f11971a765cb609f" + }, + "truncated": 0, + "non-truncated": 540, + "padded": 540, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-astronomy|5": { + "hashes": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "277a7b1fad566940", + "hash_cont_tokens": "bf30e5d3f48250cb" + }, + "truncated": 0, + "non-truncated": 608, + "padded": 608, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-business_ethics|5": { + "hashes": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "ba552605bc116de5", + "hash_cont_tokens": "bc1dd9b2d995eb61" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hashes": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "428c7563d0b98ab9", + "hash_cont_tokens": "890a119624b3b935" + }, + "truncated": 0, + "non-truncated": 1060, + "padded": 1060, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_biology|5": { + "hashes": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "da036601573942e2", + "hash_cont_tokens": "875cde3af7a0ee14" + }, + "truncated": 0, + "non-truncated": 576, + "padded": 576, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_chemistry|5": { + "hashes": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "94e0196d6aded13d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_computer_science|5": { + "hashes": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "6e4d0f4a8d36690b", + "hash_cont_tokens": "ffc0fe414cdc4a83" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_mathematics|5": { + "hashes": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "614054d17109a25d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_medicine|5": { + "hashes": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "081bb2b524defd1c", + "hash_cont_tokens": "1f88b00d41957d82" + }, + "truncated": 0, + "non-truncated": 692, + "padded": 692, + "non-padded": 
0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_physics|5": { + "hashes": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "5421d9a1af86cbd4", + "hash_cont_tokens": "f7b8097afc16a47c" + }, + "truncated": 0, + "non-truncated": 408, + "padded": 408, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-computer_security|5": { + "hashes": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "5e6b70ecb333cf18", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hashes": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "c2ef11a87264ceed", + "hash_cont_tokens": "aa0e8bc655f2f641" + }, + "truncated": 0, + "non-truncated": 940, + "padded": 940, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-econometrics|5": { + "hashes": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "ecaccd912a4c3978", + "hash_cont_tokens": "bfb7e3c3c88313f1" + }, + "truncated": 0, + "non-truncated": 456, + "padded": 456, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hashes": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "1590c84291399be8", + "hash_cont_tokens": "2425a3f084a591ef" + }, + "truncated": 0, + "non-truncated": 580, + "padded": 580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hashes": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "3269597f715b0da1", + "hash_cont_tokens": "f52691aef15a407b" + }, + "truncated": 0, + "non-truncated": 1512, + "padded": 1512, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-formal_logic|5": { + "hashes": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "a2800d20f3ab8d7c", + "hash_cont_tokens": "f515d598d9c21263" + }, + "truncated": 0, + "non-truncated": 504, + "padded": 504, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-global_facts|5": { + "hashes": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "94ed44b3772505ad", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_biology|5": { + "hashes": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "24423acb928db768", + "hash_cont_tokens": "bd85a4156a3613ee" + }, + "truncated": 0, + "non-truncated": 1240, + "padded": 1240, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hashes": { + 
"hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "831ff35c474e5cef", + "hash_cont_tokens": "a95c97af1c14e068" + }, + "truncated": 0, + "non-truncated": 812, + "padded": 812, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hashes": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "a20a96b44dcc5b30", + "hash_cont_tokens": "8abfedef914e33c9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hashes": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "5002f4ac8b1562ca", + "hash_cont_tokens": "674fc454bdc5ac93" + }, + "truncated": 0, + "non-truncated": 660, + "padded": 656, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_geography|5": { + "hashes": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "7c5547c7da5bc793", + "hash_cont_tokens": "03a5012b916274ea" + }, + "truncated": 0, + "non-truncated": 792, + "padded": 792, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hashes": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "f62991cb6a496b05", + "hash_cont_tokens": "a83effb8f76b7d7c" + }, + "truncated": 0, + "non-truncated": 772, + "padded": 772, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hashes": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "4cef2aff6e3d59ed", + "hash_cont_tokens": "c583432ad27fcfe0" + }, + "truncated": 0, + "non-truncated": 1560, + "padded": 1560, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hashes": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "6e2577ea4082ed2b", + "hash_cont_tokens": "24f5dc613660300b" + }, + "truncated": 0, + "non-truncated": 1080, + "padded": 1080, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hashes": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "c5fc9aeb1079c8e4", + "hash_cont_tokens": "f47f041de50333b9" + }, + "truncated": 0, + "non-truncated": 952, + "padded": 952, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_physics|5": { + "hashes": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "555fc385cffa84ca", + "hash_cont_tokens": "ba2efcd283e938cc" + }, + "truncated": 0, + "non-truncated": 604, + "padded": 604, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hashes": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": 
"d5c76aa40b9dbc43", + "hash_input_tokens": "febd23cbf9973b7f", + "hash_cont_tokens": "942069cd363844d9" + }, + "truncated": 0, + "non-truncated": 2180, + "padded": 2180, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hashes": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "400e55b56ee6fbd7", + "hash_cont_tokens": "955ed42b6f7fa019" + }, + "truncated": 0, + "non-truncated": 864, + "padded": 864, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hashes": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "c639cce12a46ebad", + "hash_cont_tokens": "cdd0b3dc06d933e5" + }, + "truncated": 0, + "non-truncated": 816, + "padded": 816, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hashes": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "b9762065cce6f3a6", + "hash_cont_tokens": "9a864184946033ac" + }, + "truncated": 0, + "non-truncated": 948, + "padded": 948, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_aging|5": { + "hashes": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "541a75f071dcf579", + "hash_cont_tokens": "142a4a8a1138a214" + }, + "truncated": 0, + "non-truncated": 892, + "padded": 892, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_sexuality|5": { + "hashes": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "04269e5c5a257dd9", + "hash_cont_tokens": "bc54813e809b796d" + }, + "truncated": 0, + "non-truncated": 524, + "padded": 524, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-international_law|5": { + "hashes": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "d93ba9d9d38e4397", + "hash_cont_tokens": "dc45b45fcda18e5d" + }, + "truncated": 0, + "non-truncated": 484, + "padded": 484, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-jurisprudence|5": { + "hashes": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "9eeaccd2698b4f5a", + "hash_cont_tokens": "e3a8cd951b6e3469" + }, + "truncated": 0, + "non-truncated": 432, + "padded": 432, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hashes": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "b4f08f544f2b7576", + "hash_cont_tokens": "1e80dbd30f6453d5" + }, + "truncated": 0, + "non-truncated": 652, + "padded": 648, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-machine_learning|5": { + "hashes": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "900c2a51f1174b9f", + "hash_cont_tokens": "9b37da7777378ca9" + }, + "truncated": 0, + "non-truncated": 
448, + "padded": 448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-management|5": { + "hashes": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "6b36efb4689c6eca", + "hash_cont_tokens": "a01d6d39a83c4597" + }, + "truncated": 0, + "non-truncated": 412, + "padded": 412, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-marketing|5": { + "hashes": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "2aaac78a0cfed47a", + "hash_cont_tokens": "6aeaed4d823c98aa" + }, + "truncated": 0, + "non-truncated": 936, + "padded": 936, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-medical_genetics|5": { + "hashes": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "886ca823b41c094a", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-miscellaneous|5": { + "hashes": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "72fd71de7675e7d0", + "hash_cont_tokens": "9b0ab02a64603081" + }, + "truncated": 0, + "non-truncated": 3132, + "padded": 3132, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_disputes|5": { + "hashes": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "f3ca0dd8e7a1eb09", + "hash_cont_tokens": "8badf768f7b0467a" + }, + "truncated": 0, + "non-truncated": 1384, + "padded": 1354, + "non-padded": 30, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hashes": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "3e793631e951f23c", + "hash_cont_tokens": "32ae620376b2bbba" + }, + "truncated": 0, + "non-truncated": 3580, + "padded": 3580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-nutrition|5": { + "hashes": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "59753c2144ea93af", + "hash_cont_tokens": "3071def75bacc404" + }, + "truncated": 0, + "non-truncated": 1224, + "padded": 1224, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-philosophy|5": { + "hashes": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "bd8d3dbed15a8c34", + "hash_cont_tokens": "9f6ff69d23a48783" + }, + "truncated": 0, + "non-truncated": 1244, + "padded": 1244, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-prehistory|5": { + "hashes": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "3573cd87facbb7c5", + "hash_cont_tokens": "de469d2b981e32a3" + }, + "truncated": 0, + "non-truncated": 1296, + "padded": 1296, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_accounting|5": { + "hashes": { + 
"hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "17e721bc1a7cbb47", + "hash_cont_tokens": "c46f74d2dfc7b13b" + }, + "truncated": 0, + "non-truncated": 1128, + "padded": 1128, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_law|5": { + "hashes": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "c9f7583fff66d361", + "hash_cont_tokens": "2e590029ef41fbcd" + }, + "truncated": 0, + "non-truncated": 6136, + "padded": 6136, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_medicine|5": { + "hashes": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "40a933f829116f8d", + "hash_cont_tokens": "fe35cfa9c6ca802e" + }, + "truncated": 0, + "non-truncated": 1088, + "padded": 1088, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_psychology|5": { + "hashes": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "0dfb73a8eb3f692c", + "hash_cont_tokens": "f020fbddf72c8652" + }, + "truncated": 0, + "non-truncated": 2448, + "padded": 2448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-public_relations|5": { + "hashes": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "1710c6ba4c9f3cbd", + "hash_cont_tokens": "568f585a259965c1" + }, + "truncated": 0, + "non-truncated": 440, + "padded": 440, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-security_studies|5": { + "hashes": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "32a03f1f22a6e103", + "hash_cont_tokens": "cc6fd7cccd64cd5d" + }, + "truncated": 0, + "non-truncated": 980, + "padded": 980, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-sociology|5": { + "hashes": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "828999f7624cbe7e", + "hash_cont_tokens": "c3a3bdfd177eed5b" + }, + "truncated": 0, + "non-truncated": 804, + "padded": 804, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hashes": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "42054621e718dbee", + "hash_cont_tokens": "2568d0e8e36fa959" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-virology|5": { + "hashes": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "6c4f0aa4dc859c04", + "hash_cont_tokens": "926cf60b0891f374" + }, + "truncated": 0, + "non-truncated": 664, + "padded": 664, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-world_religions|5": { + "hashes": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "6c75d44e092ff24f", + "hash_cont_tokens": 
"c525a5de974c1ea3" + }, + "truncated": 0, + "non-truncated": 684, + "padded": 684, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|truthfulqa:mc|0": { + "hashes": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "2738d7ed7075faa7", + "hash_cont_tokens": "c014154380b74b9e" + }, + "truncated": 0, + "non-truncated": 9996, + "padded": 9996, + "non-padded": 0, + "effective_few_shots": 0.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "d84d18e9a963753d", + "hash_full_prompts": "12b540783521a8e6", + "hash_input_tokens": "5c73a7dce6ccf737", + "hash_cont_tokens": "fb1646e2bdd5fc38" + }, + "total_evaluation_time_secondes": "6781.611564159393", + "truncated": 0, + "non-truncated": 111019, + "padded": 110926, + "non-padded": 93, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/uukuguy/speechless-codellama-platypus-13b/results_2023-09-12T15-51-14.957387.json b/eval-results/uukuguy/speechless-codellama-platypus-13b/results_2023-09-12T15-51-14.957387.json new file mode 100644 index 0000000000000000000000000000000000000000..47f42cb6b457507ad8e101352df790a049b3069e --- /dev/null +++ b/eval-results/uukuguy/speechless-codellama-platypus-13b/results_2023-09-12T15-51-14.957387.json @@ -0,0 +1,1367 @@ +{ + "config_general": { + "model_name": "uukuguy/speechless-codellama-platypus-13b", + "model_sha": "81cb1bca46ce646b8339501537837e02116de1b8", + "model_size": "24.56 GB", + "model_dtype": "torch.float16", + "lighteval_sha": "ff467795ccc45b291b69333c263d5f16abd1fcd9", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|arc:challenge|25": { + "acc": 0.41467576791808874, + "acc_stderr": 0.014397070564409172, + "acc_norm": 0.45307167235494883, + "acc_norm_stderr": 0.01454689205200563 + }, + "harness|hellaswag|10": { + "acc": 0.50318661621191, + "acc_stderr": 0.004989680072717476, + "acc_norm": 0.6863174666401115, + "acc_norm_stderr": 0.0046304074768351985 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.23, + "acc_stderr": 0.04229525846816505, + "acc_norm": 0.23, + "acc_norm_stderr": 0.04229525846816505 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.34814814814814815, + "acc_stderr": 0.041153246103369526, + "acc_norm": 0.34814814814814815, + "acc_norm_stderr": 0.041153246103369526 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.375, + "acc_stderr": 0.039397364351956274, + "acc_norm": 0.375, + "acc_norm_stderr": 0.039397364351956274 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.46, + "acc_stderr": 0.05009082659620333, + "acc_norm": 0.46, + "acc_norm_stderr": 0.05009082659620333 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.37735849056603776, + "acc_stderr": 0.02983280811479601, + "acc_norm": 0.37735849056603776, + "acc_norm_stderr": 0.02983280811479601 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.4236111111111111, + "acc_stderr": 0.041321250197233685, + "acc_norm": 0.4236111111111111, + "acc_norm_stderr": 0.041321250197233685 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.25, + "acc_stderr": 0.04351941398892446, + "acc_norm": 0.25, + "acc_norm_stderr": 0.04351941398892446 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.42, + "acc_stderr": 0.04960449637488584, + "acc_norm": 0.42, + 
"acc_norm_stderr": 0.04960449637488584 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.35, + "acc_stderr": 0.047937248544110196, + "acc_norm": 0.35, + "acc_norm_stderr": 0.047937248544110196 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.3583815028901734, + "acc_stderr": 0.036563436533531585, + "acc_norm": 0.3583815028901734, + "acc_norm_stderr": 0.036563436533531585 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.27450980392156865, + "acc_stderr": 0.044405219061793275, + "acc_norm": 0.27450980392156865, + "acc_norm_stderr": 0.044405219061793275 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.66, + "acc_stderr": 0.04760952285695237, + "acc_norm": 0.66, + "acc_norm_stderr": 0.04760952285695237 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.37872340425531914, + "acc_stderr": 0.031709956060406545, + "acc_norm": 0.37872340425531914, + "acc_norm_stderr": 0.031709956060406545 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.32456140350877194, + "acc_stderr": 0.04404556157374767, + "acc_norm": 0.32456140350877194, + "acc_norm_stderr": 0.04404556157374767 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.3724137931034483, + "acc_stderr": 0.040287315329475576, + "acc_norm": 0.3724137931034483, + "acc_norm_stderr": 0.040287315329475576 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.2724867724867725, + "acc_stderr": 0.02293097307163335, + "acc_norm": 0.2724867724867725, + "acc_norm_stderr": 0.02293097307163335 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.3333333333333333, + "acc_stderr": 0.04216370213557835, + "acc_norm": 0.3333333333333333, + "acc_norm_stderr": 0.04216370213557835 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.39, + "acc_stderr": 0.04902071300001974, + "acc_norm": 0.39, + "acc_norm_stderr": 0.04902071300001974 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.43548387096774194, + "acc_stderr": 0.028206225591502744, + "acc_norm": 0.43548387096774194, + "acc_norm_stderr": 0.028206225591502744 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.2857142857142857, + "acc_stderr": 0.03178529710642751, + "acc_norm": 0.2857142857142857, + "acc_norm_stderr": 0.03178529710642751 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.52, + "acc_stderr": 0.050211673156867795, + "acc_norm": 0.52, + "acc_norm_stderr": 0.050211673156867795 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.593939393939394, + "acc_stderr": 0.03834816355401181, + "acc_norm": 0.593939393939394, + "acc_norm_stderr": 0.03834816355401181 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.45454545454545453, + "acc_stderr": 0.03547601494006936, + "acc_norm": 0.45454545454545453, + "acc_norm_stderr": 0.03547601494006936 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.5647668393782384, + "acc_stderr": 0.035780381650085846, + "acc_norm": 0.5647668393782384, + "acc_norm_stderr": 0.035780381650085846 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.33589743589743587, + "acc_stderr": 0.02394672474156397, + "acc_norm": 0.33589743589743587, + "acc_norm_stderr": 0.02394672474156397 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.25555555555555554, + "acc_stderr": 0.026593939101844086, + "acc_norm": 0.25555555555555554, + "acc_norm_stderr": 0.026593939101844086 + }, + 
"harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.36134453781512604, + "acc_stderr": 0.03120469122515002, + "acc_norm": 0.36134453781512604, + "acc_norm_stderr": 0.03120469122515002 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.2913907284768212, + "acc_stderr": 0.037101857261199946, + "acc_norm": 0.2913907284768212, + "acc_norm_stderr": 0.037101857261199946 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.4917431192660551, + "acc_stderr": 0.021434399918214338, + "acc_norm": 0.4917431192660551, + "acc_norm_stderr": 0.021434399918214338 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.28703703703703703, + "acc_stderr": 0.030851992993257017, + "acc_norm": 0.28703703703703703, + "acc_norm_stderr": 0.030851992993257017 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.6274509803921569, + "acc_stderr": 0.033933885849584046, + "acc_norm": 0.6274509803921569, + "acc_norm_stderr": 0.033933885849584046 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.6540084388185654, + "acc_stderr": 0.030964810588786713, + "acc_norm": 0.6540084388185654, + "acc_norm_stderr": 0.030964810588786713 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.47533632286995514, + "acc_stderr": 0.033516951676526276, + "acc_norm": 0.47533632286995514, + "acc_norm_stderr": 0.033516951676526276 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.4580152671755725, + "acc_stderr": 0.04369802690578757, + "acc_norm": 0.4580152671755725, + "acc_norm_stderr": 0.04369802690578757 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.6446280991735537, + "acc_stderr": 0.04369236326573981, + "acc_norm": 0.6446280991735537, + "acc_norm_stderr": 0.04369236326573981 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.5277777777777778, + "acc_stderr": 0.048262172941398944, + "acc_norm": 0.5277777777777778, + "acc_norm_stderr": 0.048262172941398944 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.4539877300613497, + "acc_stderr": 0.0391170190467718, + "acc_norm": 0.4539877300613497, + "acc_norm_stderr": 0.0391170190467718 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.3482142857142857, + "acc_stderr": 0.04521829902833585, + "acc_norm": 0.3482142857142857, + "acc_norm_stderr": 0.04521829902833585 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.6019417475728155, + "acc_stderr": 0.04846748253977239, + "acc_norm": 0.6019417475728155, + "acc_norm_stderr": 0.04846748253977239 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.6581196581196581, + "acc_stderr": 0.031075028526507748, + "acc_norm": 0.6581196581196581, + "acc_norm_stderr": 0.031075028526507748 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.4, + "acc_stderr": 0.04923659639173309, + "acc_norm": 0.4, + "acc_norm_stderr": 0.04923659639173309 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.558109833971903, + "acc_stderr": 0.017758800534214414, + "acc_norm": 0.558109833971903, + "acc_norm_stderr": 0.017758800534214414 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.4797687861271676, + "acc_stderr": 0.026897049996382868, + "acc_norm": 0.4797687861271676, + "acc_norm_stderr": 0.026897049996382868 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.30614525139664805, + "acc_stderr": 0.015414494487903219, + "acc_norm": 0.30614525139664805, + "acc_norm_stderr": 0.015414494487903219 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 
0.3758169934640523, + "acc_stderr": 0.027732834353363944, + "acc_norm": 0.3758169934640523, + "acc_norm_stderr": 0.027732834353363944 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.5112540192926045, + "acc_stderr": 0.028390897396863537, + "acc_norm": 0.5112540192926045, + "acc_norm_stderr": 0.028390897396863537 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.4537037037037037, + "acc_stderr": 0.027701228468542602, + "acc_norm": 0.4537037037037037, + "acc_norm_stderr": 0.027701228468542602 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.3191489361702128, + "acc_stderr": 0.027807990141320203, + "acc_norm": 0.3191489361702128, + "acc_norm_stderr": 0.027807990141320203 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.3533246414602347, + "acc_stderr": 0.012208408211082426, + "acc_norm": 0.3533246414602347, + "acc_norm_stderr": 0.012208408211082426 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.2757352941176471, + "acc_stderr": 0.027146271936625166, + "acc_norm": 0.2757352941176471, + "acc_norm_stderr": 0.027146271936625166 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.4035947712418301, + "acc_stderr": 0.01984828016840117, + "acc_norm": 0.4035947712418301, + "acc_norm_stderr": 0.01984828016840117 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.5454545454545454, + "acc_stderr": 0.04769300568972745, + "acc_norm": 0.5454545454545454, + "acc_norm_stderr": 0.04769300568972745 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.4897959183673469, + "acc_stderr": 0.03200255347893782, + "acc_norm": 0.4897959183673469, + "acc_norm_stderr": 0.03200255347893782 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.5223880597014925, + "acc_stderr": 0.03531987930208731, + "acc_norm": 0.5223880597014925, + "acc_norm_stderr": 0.03531987930208731 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.61, + "acc_stderr": 0.04902071300001975, + "acc_norm": 0.61, + "acc_norm_stderr": 0.04902071300001975 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.35542168674698793, + "acc_stderr": 0.03726214354322415, + "acc_norm": 0.35542168674698793, + "acc_norm_stderr": 0.03726214354322415 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.52046783625731, + "acc_stderr": 0.038316105328219316, + "acc_norm": 0.52046783625731, + "acc_norm_stderr": 0.038316105328219316 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.2607099143206854, + "mc1_stderr": 0.015368841620766368, + "mc2": 0.42379634005703287, + "mc2_stderr": 0.014924927935144282 + }, + "all": { + "acc": 0.42921049137622375, + "acc_stderr": 0.03511728333962651, + "acc_norm": 0.4329651821366485, + "acc_norm_stderr": 0.03511373332084201, + "mc1": 0.2607099143206854, + "mc1_stderr": 0.015368841620766368, + "mc2": 0.42379634005703287, + "mc2_stderr": 0.014924927935144282 + } + }, + "versions": { + "harness|arc:challenge|25": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + 
"harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "all": 0 + }, + "config_tasks": { + "harness|arc:challenge": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness 
task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task" + }, + "summary_tasks": { + "harness|arc:challenge|25": { + "hashes": { + "hash_examples": "17b0cae357c0259e", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "3722289b79076c44", + "hash_cont_tokens": "e8abf848493b50f7" + }, + "truncated": 0, + "non-truncated": 4687, + "padded": 4687, + "non-padded": 0, + "effective_few_shots": 25.0, + "num_truncated_few_shots": 0 + }, + "harness|hellaswag|10": { + "hashes": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "ececd684171f1ef2", + "hash_cont_tokens": "9fe0a5c42e1532db" + }, + "truncated": 0, + "non-truncated": 40168, + "padded": 40113, + "non-padded": 55, + "effective_few_shots": 10.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hashes": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + 
"hash_input_tokens": "c54ff61ad0273dd7", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-anatomy|5": { + "hashes": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "be31a1e22aef5f90", + "hash_cont_tokens": "f11971a765cb609f" + }, + "truncated": 0, + "non-truncated": 540, + "padded": 540, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-astronomy|5": { + "hashes": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "277a7b1fad566940", + "hash_cont_tokens": "440a970fadecdc7b" + }, + "truncated": 0, + "non-truncated": 608, + "padded": 608, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-business_ethics|5": { + "hashes": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "ba552605bc116de5", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hashes": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "428c7563d0b98ab9", + "hash_cont_tokens": "7ecd60c25b9bfe5b" + }, + "truncated": 0, + "non-truncated": 1060, + "padded": 1060, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_biology|5": { + "hashes": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "da036601573942e2", + "hash_cont_tokens": "875cde3af7a0ee14" + }, + "truncated": 0, + "non-truncated": 576, + "padded": 576, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_chemistry|5": { + "hashes": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "94e0196d6aded13d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_computer_science|5": { + "hashes": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "6e4d0f4a8d36690b", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_mathematics|5": { + "hashes": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "614054d17109a25d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_medicine|5": { + "hashes": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "081bb2b524defd1c", + "hash_cont_tokens": "702fb6d82ff0d6ac" + }, + "truncated": 0, + "non-truncated": 692, + "padded": 692, + "non-padded": 
0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_physics|5": { + "hashes": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "5421d9a1af86cbd4", + "hash_cont_tokens": "f7b8097afc16a47c" + }, + "truncated": 0, + "non-truncated": 408, + "padded": 408, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-computer_security|5": { + "hashes": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "5e6b70ecb333cf18", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hashes": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "c2ef11a87264ceed", + "hash_cont_tokens": "aa0e8bc655f2f641" + }, + "truncated": 0, + "non-truncated": 940, + "padded": 940, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-econometrics|5": { + "hashes": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "ecaccd912a4c3978", + "hash_cont_tokens": "b1cc6e7e9fcd3827" + }, + "truncated": 0, + "non-truncated": 456, + "padded": 456, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hashes": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "1590c84291399be8", + "hash_cont_tokens": "2425a3f084a591ef" + }, + "truncated": 0, + "non-truncated": 580, + "padded": 580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hashes": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "3269597f715b0da1", + "hash_cont_tokens": "bd87bf0c060fd925" + }, + "truncated": 0, + "non-truncated": 1512, + "padded": 1512, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-formal_logic|5": { + "hashes": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "a2800d20f3ab8d7c", + "hash_cont_tokens": "eb8932890e0605db" + }, + "truncated": 0, + "non-truncated": 504, + "padded": 504, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-global_facts|5": { + "hashes": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "94ed44b3772505ad", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_biology|5": { + "hashes": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "24423acb928db768", + "hash_cont_tokens": "1ddcb86d28cde266" + }, + "truncated": 0, + "non-truncated": 1240, + "padded": 1240, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hashes": { + 
"hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "831ff35c474e5cef", + "hash_cont_tokens": "176c8dcff38c5f8f" + }, + "truncated": 0, + "non-truncated": 812, + "padded": 812, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hashes": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "a20a96b44dcc5b30", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hashes": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "5002f4ac8b1562ca", + "hash_cont_tokens": "674fc454bdc5ac93" + }, + "truncated": 0, + "non-truncated": 660, + "padded": 656, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_geography|5": { + "hashes": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "7c5547c7da5bc793", + "hash_cont_tokens": "03a5012b916274ea" + }, + "truncated": 0, + "non-truncated": 792, + "padded": 792, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hashes": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "f62991cb6a496b05", + "hash_cont_tokens": "873d2aab226ba1d8" + }, + "truncated": 0, + "non-truncated": 772, + "padded": 772, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hashes": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "4cef2aff6e3d59ed", + "hash_cont_tokens": "c583432ad27fcfe0" + }, + "truncated": 0, + "non-truncated": 1560, + "padded": 1560, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hashes": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "6e2577ea4082ed2b", + "hash_cont_tokens": "d7907b61bcb8c123" + }, + "truncated": 0, + "non-truncated": 1080, + "padded": 1080, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hashes": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "c5fc9aeb1079c8e4", + "hash_cont_tokens": "f47f041de50333b9" + }, + "truncated": 0, + "non-truncated": 952, + "padded": 952, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_physics|5": { + "hashes": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "555fc385cffa84ca", + "hash_cont_tokens": "0d56317b3e5eedb5" + }, + "truncated": 0, + "non-truncated": 604, + "padded": 604, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hashes": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": 
"d5c76aa40b9dbc43", + "hash_input_tokens": "febd23cbf9973b7f", + "hash_cont_tokens": "09ba1243e7390c0f" + }, + "truncated": 0, + "non-truncated": 2180, + "padded": 2180, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hashes": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "400e55b56ee6fbd7", + "hash_cont_tokens": "9cc29889c3d3f77d" + }, + "truncated": 0, + "non-truncated": 864, + "padded": 864, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hashes": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "c639cce12a46ebad", + "hash_cont_tokens": "cdd0b3dc06d933e5" + }, + "truncated": 0, + "non-truncated": 816, + "padded": 816, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hashes": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "b9762065cce6f3a6", + "hash_cont_tokens": "e02816433ff28daf" + }, + "truncated": 0, + "non-truncated": 948, + "padded": 948, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_aging|5": { + "hashes": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "541a75f071dcf579", + "hash_cont_tokens": "142a4a8a1138a214" + }, + "truncated": 0, + "non-truncated": 892, + "padded": 892, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_sexuality|5": { + "hashes": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "04269e5c5a257dd9", + "hash_cont_tokens": "bc54813e809b796d" + }, + "truncated": 0, + "non-truncated": 524, + "padded": 524, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-international_law|5": { + "hashes": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "d93ba9d9d38e4397", + "hash_cont_tokens": "8ea8c5ff76a15bca" + }, + "truncated": 0, + "non-truncated": 484, + "padded": 484, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-jurisprudence|5": { + "hashes": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "9eeaccd2698b4f5a", + "hash_cont_tokens": "e3a8cd951b6e3469" + }, + "truncated": 0, + "non-truncated": 432, + "padded": 432, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hashes": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "b4f08f544f2b7576", + "hash_cont_tokens": "3e9e0bdc248fd88a" + }, + "truncated": 0, + "non-truncated": 652, + "padded": 648, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-machine_learning|5": { + "hashes": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "900c2a51f1174b9f", + "hash_cont_tokens": "55b12fb138c6a064" + }, + "truncated": 0, + "non-truncated": 
448, + "padded": 448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-management|5": { + "hashes": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "6b36efb4689c6eca", + "hash_cont_tokens": "a01d6d39a83c4597" + }, + "truncated": 0, + "non-truncated": 412, + "padded": 412, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-marketing|5": { + "hashes": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "2aaac78a0cfed47a", + "hash_cont_tokens": "6aeaed4d823c98aa" + }, + "truncated": 0, + "non-truncated": 936, + "padded": 936, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-medical_genetics|5": { + "hashes": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "886ca823b41c094a", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-miscellaneous|5": { + "hashes": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "72fd71de7675e7d0", + "hash_cont_tokens": "9b0ab02a64603081" + }, + "truncated": 0, + "non-truncated": 3132, + "padded": 3132, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_disputes|5": { + "hashes": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "f3ca0dd8e7a1eb09", + "hash_cont_tokens": "3b8bbe9108e55ce9" + }, + "truncated": 0, + "non-truncated": 1384, + "padded": 1354, + "non-padded": 30, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hashes": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "3e793631e951f23c", + "hash_cont_tokens": "3e9bfc0362e97330" + }, + "truncated": 0, + "non-truncated": 3580, + "padded": 3580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-nutrition|5": { + "hashes": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "59753c2144ea93af", + "hash_cont_tokens": "23b2dc6ee2da4cfc" + }, + "truncated": 0, + "non-truncated": 1224, + "padded": 1224, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-philosophy|5": { + "hashes": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "bd8d3dbed15a8c34", + "hash_cont_tokens": "9f6ff69d23a48783" + }, + "truncated": 0, + "non-truncated": 1244, + "padded": 1244, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-prehistory|5": { + "hashes": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "3573cd87facbb7c5", + "hash_cont_tokens": "d6458d743d875837" + }, + "truncated": 0, + "non-truncated": 1296, + "padded": 1296, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_accounting|5": { + "hashes": { + 
"hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "17e721bc1a7cbb47", + "hash_cont_tokens": "922a195f53a35662" + }, + "truncated": 0, + "non-truncated": 1128, + "padded": 1128, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_law|5": { + "hashes": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "c9f7583fff66d361", + "hash_cont_tokens": "2e590029ef41fbcd" + }, + "truncated": 0, + "non-truncated": 6136, + "padded": 6136, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_medicine|5": { + "hashes": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "40a933f829116f8d", + "hash_cont_tokens": "7cfee54dbddd5a98" + }, + "truncated": 0, + "non-truncated": 1088, + "padded": 1088, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_psychology|5": { + "hashes": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "0dfb73a8eb3f692c", + "hash_cont_tokens": "a86677b2a45c20e1" + }, + "truncated": 0, + "non-truncated": 2448, + "padded": 2448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-public_relations|5": { + "hashes": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "1710c6ba4c9f3cbd", + "hash_cont_tokens": "0d756ccaae031757" + }, + "truncated": 0, + "non-truncated": 440, + "padded": 440, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-security_studies|5": { + "hashes": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "32a03f1f22a6e103", + "hash_cont_tokens": "b2229bc2cfbf594b" + }, + "truncated": 0, + "non-truncated": 980, + "padded": 980, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-sociology|5": { + "hashes": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "828999f7624cbe7e", + "hash_cont_tokens": "c3a3bdfd177eed5b" + }, + "truncated": 0, + "non-truncated": 804, + "padded": 804, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hashes": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "42054621e718dbee", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-virology|5": { + "hashes": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "6c4f0aa4dc859c04", + "hash_cont_tokens": "af8b3658088cb37f" + }, + "truncated": 0, + "non-truncated": 664, + "padded": 664, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-world_religions|5": { + "hashes": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "6c75d44e092ff24f", + "hash_cont_tokens": 
"060118bef6de4e0a" + }, + "truncated": 0, + "non-truncated": 684, + "padded": 684, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|truthfulqa:mc|0": { + "hashes": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "2738d7ed7075faa7", + "hash_cont_tokens": "f5da56a132aab151" + }, + "truncated": 0, + "non-truncated": 9996, + "padded": 9996, + "non-padded": 0, + "effective_few_shots": 0.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "d84d18e9a963753d", + "hash_full_prompts": "12b540783521a8e6", + "hash_input_tokens": "5c73a7dce6ccf737", + "hash_cont_tokens": "71d56183130fecbd" + }, + "total_evaluation_time_secondes": "6357.056966781616", + "truncated": 0, + "non-truncated": 111019, + "padded": 110926, + "non-padded": 93, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/uukuguy/speechless-codellama-platypus-13b/results_2023-10-18T18-50-42.836793.json b/eval-results/uukuguy/speechless-codellama-platypus-13b/results_2023-10-18T18-50-42.836793.json new file mode 100644 index 0000000000000000000000000000000000000000..a660797a6738cd2bca4e21ade877e6bba92da415 --- /dev/null +++ b/eval-results/uukuguy/speechless-codellama-platypus-13b/results_2023-10-18T18-50-42.836793.json @@ -0,0 +1,107 @@ +{ + "config_general": { + "model_name": "uukuguy/speechless-codellama-platypus-13b", + "model_sha": "f0598c3d6796889d9be969768048dc74820b7ca9", + "model_size": "24.56 GB", + "model_dtype": "torch.bfloat16", + "lighteval_sha": "0f318ecf002208468154899217b3ba7c6ae09374", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|drop|3": { + "em": 0.004718959731543624, + "em_stderr": 0.0007018360183131142, + "f1": 0.05542051174496665, + "f1_stderr": 0.001376052199396106 + }, + "harness|gsm8k|5": { + "acc": 0.09401061410159212, + "acc_stderr": 0.00803881981887249 + }, + "harness|winogrande|5": { + "acc": 0.6614048934490924, + "acc_stderr": 0.013300169865842416 + }, + "all": { + "em": 0.004718959731543624, + "em_stderr": 0.0007018360183131142, + "f1": 0.05542051174496665, + "f1_stderr": 0.001376052199396106, + "acc": 0.37770775377534227, + "acc_stderr": 0.010669494842357453 + } + }, + "versions": { + "harness|drop|3": 1, + "harness|gsm8k|5": 0, + "harness|winogrande|5": 0, + "all": 0 + }, + "config_tasks": { + "harness|drop": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|drop|3": { + "hashes": { + "hash_examples": "1d27416e8324e9a3", + "hash_full_prompts": "a5513ff9a741b385", + "hash_input_tokens": "fbe785c62d941897", + "hash_cont_tokens": "d8bfd62cd0020247" + }, + "truncated": 0, + "non-truncated": 9536, + "padded": 0, + "non-padded": 9536, + "effective_few_shots": 3.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "bda342e47b5099b2", + "hash_cont_tokens": "ce97f447d8eadbfd" + }, + "truncated": 0, + "non-truncated": 1319, + "padded": 0, + "non-padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "c0bedf98cb040854", + "hash_cont_tokens": 
"f08975ad6f2d5864" + }, + "truncated": 0, + "non-truncated": 2534, + "padded": 2432, + "non-padded": 102, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "9b4d8993161e637d", + "hash_full_prompts": "08215e527b7e60a5", + "hash_input_tokens": "9f72a6bc6743d18f", + "hash_cont_tokens": "fa8c80e1cb7968ca" + }, + "total_evaluation_time_secondes": "12243.999771595001", + "truncated": 0, + "non-truncated": 13389, + "padded": 2432, + "non-padded": 10957, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/uukuguy/speechless-codellama-platypus-13b/results_2023-10-25T04-45-04.706301.json b/eval-results/uukuguy/speechless-codellama-platypus-13b/results_2023-10-25T04-45-04.706301.json new file mode 100644 index 0000000000000000000000000000000000000000..af887d2445a813a1bb2aa0fc8a59be616a8eaacf --- /dev/null +++ b/eval-results/uukuguy/speechless-codellama-platypus-13b/results_2023-10-25T04-45-04.706301.json @@ -0,0 +1,107 @@ +{ + "config_general": { + "model_name": "uukuguy/speechless-codellama-platypus-13b", + "model_sha": "f0598c3d6796889d9be969768048dc74820b7ca9", + "model_size": "24.56 GB", + "model_dtype": "torch.float16", + "lighteval_sha": "0f318ecf002208468154899217b3ba7c6ae09374", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|drop|3": { + "em": 0.008494127516778523, + "em_stderr": 0.0009398243325411525, + "f1": 0.05910234899328872, + "f1_stderr": 0.001500499797469734 + }, + "harness|gsm8k|5": { + "acc": 0.09097801364670205, + "acc_stderr": 0.007921322844013643 + }, + "harness|winogrande|5": { + "acc": 0.6558800315706393, + "acc_stderr": 0.013352121905005935 + }, + "all": { + "em": 0.008494127516778523, + "em_stderr": 0.0009398243325411525, + "f1": 0.05910234899328872, + "f1_stderr": 0.001500499797469734, + "acc": 0.3734290226086707, + "acc_stderr": 0.010636722374509789 + } + }, + "versions": { + "harness|drop|3": 1, + "harness|gsm8k|5": 0, + "harness|winogrande|5": 0, + "all": 0 + }, + "config_tasks": { + "harness|drop": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|drop|3": { + "hashes": { + "hash_examples": "1d27416e8324e9a3", + "hash_full_prompts": "a5513ff9a741b385", + "hash_input_tokens": "fbe785c62d941897", + "hash_cont_tokens": "7d640a5f442da36d" + }, + "truncated": 0, + "non-truncated": 9536, + "padded": 0, + "non-padded": 9536, + "effective_few_shots": 3.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "bda342e47b5099b2", + "hash_cont_tokens": "a1bb9dcfe13ea38c" + }, + "truncated": 0, + "non-truncated": 1319, + "padded": 0, + "non-padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "c0bedf98cb040854", + "hash_cont_tokens": "f08975ad6f2d5864" + }, + "truncated": 0, + "non-truncated": 2534, + "padded": 2432, + "non-padded": 102, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "9b4d8993161e637d", + "hash_full_prompts": "08215e527b7e60a5", + "hash_input_tokens": "9f72a6bc6743d18f", + "hash_cont_tokens": 
"0bb3ce6809963e63" + }, + "total_evaluation_time_secondes": "12464.980425357819", + "truncated": 0, + "non-truncated": 13389, + "padded": 2432, + "non-padded": 10957, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/uukuguy/speechless-coding-7b-16k-tora/results_2023-12-09T15-50-40.789199.json b/eval-results/uukuguy/speechless-coding-7b-16k-tora/results_2023-12-09T15-50-40.789199.json new file mode 100644 index 0000000000000000000000000000000000000000..1029316c2817d112cf763b0ca029d6524b0072b4 --- /dev/null +++ b/eval-results/uukuguy/speechless-coding-7b-16k-tora/results_2023-12-09T15-50-40.789199.json @@ -0,0 +1,1409 @@ +{ + "config_general": { + "lighteval_sha": "0e4607eff593f6f842aeaa0e5fa6760f58b9d1e9", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "", + "start_time": 584901.040810675, + "end_time": 591913.689073263, + "total_evaluation_time_secondes": "7012.648262587958", + "model_name": "uukuguy/speechless-coding-7b-16k-tora", + "model_sha": "d56b5c4f649d8e722efb927d16d7589967a67fbe", + "model_dtype": "torch.float16", + "model_size": "12.8 GB" + }, + "results": { + "harness|arc:challenge|25": { + "acc": 0.37457337883959047, + "acc_stderr": 0.014144193471893446, + "acc_norm": 0.4121160409556314, + "acc_norm_stderr": 0.0143839153022254 + }, + "harness|hellaswag|10": { + "acc": 0.4838677554272057, + "acc_stderr": 0.004987183560792758, + "acc_norm": 0.6444931288587931, + "acc_norm_stderr": 0.004776883632722618 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.28, + "acc_stderr": 0.04512608598542129, + "acc_norm": 0.28, + "acc_norm_stderr": 0.04512608598542129 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.35555555555555557, + "acc_stderr": 0.04135176749720386, + "acc_norm": 0.35555555555555557, + "acc_norm_stderr": 0.04135176749720386 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.32894736842105265, + "acc_stderr": 0.03823428969926604, + "acc_norm": 0.32894736842105265, + "acc_norm_stderr": 0.03823428969926604 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.46, + "acc_stderr": 0.05009082659620333, + "acc_norm": 0.46, + "acc_norm_stderr": 0.05009082659620333 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.39622641509433965, + "acc_stderr": 0.030102793781791194, + "acc_norm": 0.39622641509433965, + "acc_norm_stderr": 0.030102793781791194 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.3472222222222222, + "acc_stderr": 0.039812405437178615, + "acc_norm": 0.3472222222222222, + "acc_norm_stderr": 0.039812405437178615 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.27, + "acc_stderr": 0.044619604333847394, + "acc_norm": 0.27, + "acc_norm_stderr": 0.044619604333847394 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.31, + "acc_stderr": 0.04648231987117316, + "acc_norm": 0.31, + "acc_norm_stderr": 0.04648231987117316 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.27, + "acc_stderr": 0.04461960433384739, + "acc_norm": 0.27, + "acc_norm_stderr": 0.04461960433384739 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.3352601156069364, + "acc_stderr": 0.03599586301247077, + "acc_norm": 0.3352601156069364, + "acc_norm_stderr": 0.03599586301247077 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.2549019607843137, + "acc_stderr": 0.043364327079931785, + "acc_norm": 0.2549019607843137, + "acc_norm_stderr": 0.043364327079931785 + 
}, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.59, + "acc_stderr": 0.04943110704237101, + "acc_norm": 0.59, + "acc_norm_stderr": 0.04943110704237101 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.2936170212765957, + "acc_stderr": 0.029771642712491227, + "acc_norm": 0.2936170212765957, + "acc_norm_stderr": 0.029771642712491227 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.2719298245614035, + "acc_stderr": 0.04185774424022056, + "acc_norm": 0.2719298245614035, + "acc_norm_stderr": 0.04185774424022056 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.3448275862068966, + "acc_stderr": 0.03960933549451208, + "acc_norm": 0.3448275862068966, + "acc_norm_stderr": 0.03960933549451208 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.2751322751322751, + "acc_stderr": 0.023000086859068656, + "acc_norm": 0.2751322751322751, + "acc_norm_stderr": 0.023000086859068656 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.2619047619047619, + "acc_stderr": 0.03932537680392871, + "acc_norm": 0.2619047619047619, + "acc_norm_stderr": 0.03932537680392871 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.3, + "acc_stderr": 0.046056618647183814, + "acc_norm": 0.3, + "acc_norm_stderr": 0.046056618647183814 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.3903225806451613, + "acc_stderr": 0.027751256636969576, + "acc_norm": 0.3903225806451613, + "acc_norm_stderr": 0.027751256636969576 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.3497536945812808, + "acc_stderr": 0.03355400904969566, + "acc_norm": 0.3497536945812808, + "acc_norm_stderr": 0.03355400904969566 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.46, + "acc_stderr": 0.05009082659620332, + "acc_norm": 0.46, + "acc_norm_stderr": 0.05009082659620332 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.509090909090909, + "acc_stderr": 0.03903698647748441, + "acc_norm": 0.509090909090909, + "acc_norm_stderr": 0.03903698647748441 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.47474747474747475, + "acc_stderr": 0.03557806245087314, + "acc_norm": 0.47474747474747475, + "acc_norm_stderr": 0.03557806245087314 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.46113989637305697, + "acc_stderr": 0.03597524411734578, + "acc_norm": 0.46113989637305697, + "acc_norm_stderr": 0.03597524411734578 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.3717948717948718, + "acc_stderr": 0.024503472557110946, + "acc_norm": 0.3717948717948718, + "acc_norm_stderr": 0.024503472557110946 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.2777777777777778, + "acc_stderr": 0.027309140588230186, + "acc_norm": 0.2777777777777778, + "acc_norm_stderr": 0.027309140588230186 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.3949579831932773, + "acc_stderr": 0.03175367846096624, + "acc_norm": 0.3949579831932773, + "acc_norm_stderr": 0.03175367846096624 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.23178807947019867, + "acc_stderr": 0.03445406271987054, + "acc_norm": 0.23178807947019867, + "acc_norm_stderr": 0.03445406271987054 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.42935779816513764, + "acc_stderr": 0.021222286397236508, + "acc_norm": 0.42935779816513764, + "acc_norm_stderr": 0.021222286397236508 + }, + 
"harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.27314814814814814, + "acc_stderr": 0.03038805130167812, + "acc_norm": 0.27314814814814814, + "acc_norm_stderr": 0.03038805130167812 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.4411764705882353, + "acc_stderr": 0.03484941514429231, + "acc_norm": 0.4411764705882353, + "acc_norm_stderr": 0.03484941514429231 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.5569620253164557, + "acc_stderr": 0.032335327775334835, + "acc_norm": 0.5569620253164557, + "acc_norm_stderr": 0.032335327775334835 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.4304932735426009, + "acc_stderr": 0.033231973029429394, + "acc_norm": 0.4304932735426009, + "acc_norm_stderr": 0.033231973029429394 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.4732824427480916, + "acc_stderr": 0.04379024936553894, + "acc_norm": 0.4732824427480916, + "acc_norm_stderr": 0.04379024936553894 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.5537190082644629, + "acc_stderr": 0.04537935177947879, + "acc_norm": 0.5537190082644629, + "acc_norm_stderr": 0.04537935177947879 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.49074074074074076, + "acc_stderr": 0.04832853553437055, + "acc_norm": 0.49074074074074076, + "acc_norm_stderr": 0.04832853553437055 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.39263803680981596, + "acc_stderr": 0.03836740907831029, + "acc_norm": 0.39263803680981596, + "acc_norm_stderr": 0.03836740907831029 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.29464285714285715, + "acc_stderr": 0.043270409325787296, + "acc_norm": 0.29464285714285715, + "acc_norm_stderr": 0.043270409325787296 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.5048543689320388, + "acc_stderr": 0.049505043821289195, + "acc_norm": 0.5048543689320388, + "acc_norm_stderr": 0.049505043821289195 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.594017094017094, + "acc_stderr": 0.03217180182641087, + "acc_norm": 0.594017094017094, + "acc_norm_stderr": 0.03217180182641087 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.4, + "acc_stderr": 0.049236596391733084, + "acc_norm": 0.4, + "acc_norm_stderr": 0.049236596391733084 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.4840357598978289, + "acc_stderr": 0.01787084750608173, + "acc_norm": 0.4840357598978289, + "acc_norm_stderr": 0.01787084750608173 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.4653179190751445, + "acc_stderr": 0.02685425792825889, + "acc_norm": 0.4653179190751445, + "acc_norm_stderr": 0.02685425792825889 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.25139664804469275, + "acc_stderr": 0.014508979453553984, + "acc_norm": 0.25139664804469275, + "acc_norm_stderr": 0.014508979453553984 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.3954248366013072, + "acc_stderr": 0.027996723180631455, + "acc_norm": 0.3954248366013072, + "acc_norm_stderr": 0.027996723180631455 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.44694533762057875, + "acc_stderr": 0.028237769422085324, + "acc_norm": 0.44694533762057875, + "acc_norm_stderr": 0.028237769422085324 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.42592592592592593, + "acc_stderr": 0.027513747284379428, + "acc_norm": 0.42592592592592593, + "acc_norm_stderr": 0.027513747284379428 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.3191489361702128, + 
"acc_stderr": 0.027807990141320193, + "acc_norm": 0.3191489361702128, + "acc_norm_stderr": 0.027807990141320193 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.3122555410691004, + "acc_stderr": 0.011835798135683182, + "acc_norm": 0.3122555410691004, + "acc_norm_stderr": 0.011835798135683182 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.3125, + "acc_stderr": 0.02815637344037142, + "acc_norm": 0.3125, + "acc_norm_stderr": 0.02815637344037142 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.3627450980392157, + "acc_stderr": 0.019450768432505514, + "acc_norm": 0.3627450980392157, + "acc_norm_stderr": 0.019450768432505514 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.5363636363636364, + "acc_stderr": 0.04776449162396197, + "acc_norm": 0.5363636363636364, + "acc_norm_stderr": 0.04776449162396197 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.46938775510204084, + "acc_stderr": 0.031949171367580624, + "acc_norm": 0.46938775510204084, + "acc_norm_stderr": 0.031949171367580624 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.47761194029850745, + "acc_stderr": 0.035319879302087305, + "acc_norm": 0.47761194029850745, + "acc_norm_stderr": 0.035319879302087305 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.58, + "acc_stderr": 0.049604496374885836, + "acc_norm": 0.58, + "acc_norm_stderr": 0.049604496374885836 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.3795180722891566, + "acc_stderr": 0.037777988227480165, + "acc_norm": 0.3795180722891566, + "acc_norm_stderr": 0.037777988227480165 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.391812865497076, + "acc_stderr": 0.03743979825926401, + "acc_norm": 0.391812865497076, + "acc_norm_stderr": 0.03743979825926401 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.29008567931456547, + "mc1_stderr": 0.01588623687420952, + "mc2": 0.4490702414317695, + "mc2_stderr": 0.01493086789491207 + }, + "harness|winogrande|5": { + "acc": 0.6361483820047356, + "acc_stderr": 0.013521488896883415 + }, + "harness|gsm8k|5": { + "acc": 0.1728582259287339, + "acc_stderr": 0.010415432246200566 + }, + "all": { + "acc": 0.3931109615254218, + "acc_stderr": 0.03416544865753528, + "acc_norm": 0.3960835606892354, + "acc_norm_stderr": 0.03491838760794626, + "mc1": 0.29008567931456547, + "mc1_stderr": 0.01588623687420952, + "mc2": 0.4490702414317695, + "mc2_stderr": 0.01493086789491207 + } + }, + "versions": { + "all": 0, + "harness|arc:challenge|25": 0, + "harness|gsm8k|5": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + 
"harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "harness|winogrande|5": 0 + }, + "config_tasks": { + "harness|arc:challenge": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness 
task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|arc:challenge|25": { + "hashes": { + "hash_examples": "17b0cae357c0259e", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "ca48d52265c0051f", + "hash_cont_tokens": "e8abf848493b50f7" + }, + "truncated": 0, + "non_truncated": 1172, + "padded": 4687, + "non_padded": 0, + "effective_few_shots": 25.0, + "num_truncated_few_shots": 0 + }, + "harness|hellaswag|10": { + "hashes": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "4975ded0ed31f702", + "hash_cont_tokens": "9fe0a5c42e1532db" + }, + "truncated": 0, + "non_truncated": 10042, + "padded": 40019, + "non_padded": 149, + "effective_few_shots": 10.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hashes": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "8ff523ec326d5d55", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-anatomy|5": { + "hashes": { + "hash_examples": 
"2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "742bd6a389a8ef40", + "hash_cont_tokens": "f11971a765cb609f" + }, + "truncated": 0, + "non_truncated": 135, + "padded": 540, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-astronomy|5": { + "hashes": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "aa9743839c83bd9f", + "hash_cont_tokens": "440a970fadecdc7b" + }, + "truncated": 0, + "non_truncated": 152, + "padded": 608, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-business_ethics|5": { + "hashes": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "60f6ed52e2a2987a", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hashes": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "6080d9f3c5930be0", + "hash_cont_tokens": "7ecd60c25b9bfe5b" + }, + "truncated": 0, + "non_truncated": 265, + "padded": 1060, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_biology|5": { + "hashes": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "873319724ad65589", + "hash_cont_tokens": "875cde3af7a0ee14" + }, + "truncated": 0, + "non_truncated": 144, + "padded": 564, + "non_padded": 12, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_chemistry|5": { + "hashes": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "8366d04d12b154a7", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_computer_science|5": { + "hashes": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "1724a282fb269fd7", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_mathematics|5": { + "hashes": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "b7aa815781eae172", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_medicine|5": { + "hashes": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "0003d13e86bc8c1a", + "hash_cont_tokens": "702fb6d82ff0d6ac" + }, + "truncated": 0, + "non_truncated": 173, + "padded": 692, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_physics|5": { + "hashes": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "32b28762dd077c78", + "hash_cont_tokens": "f7b8097afc16a47c" + }, + 
"truncated": 0, + "non_truncated": 102, + "padded": 404, + "non_padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-computer_security|5": { + "hashes": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "19dd0e1895125d49", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hashes": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "761c7ce187b3338a", + "hash_cont_tokens": "aa0e8bc655f2f641" + }, + "truncated": 0, + "non_truncated": 235, + "padded": 940, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-econometrics|5": { + "hashes": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "dae74024ebc12b2b", + "hash_cont_tokens": "b1cc6e7e9fcd3827" + }, + "truncated": 0, + "non_truncated": 114, + "padded": 456, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hashes": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "5fa8050688a246ed", + "hash_cont_tokens": "2425a3f084a591ef" + }, + "truncated": 0, + "non_truncated": 145, + "padded": 580, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hashes": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "2da3f8d7d1515cc6", + "hash_cont_tokens": "bd87bf0c060fd925" + }, + "truncated": 0, + "non_truncated": 378, + "padded": 1512, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-formal_logic|5": { + "hashes": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "907de61bbe46dada", + "hash_cont_tokens": "eb8932890e0605db" + }, + "truncated": 0, + "non_truncated": 126, + "padded": 504, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-global_facts|5": { + "hashes": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "d7549fe9ac133643", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_biology|5": { + "hashes": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "b449ae8cd622fb96", + "hash_cont_tokens": "1ddcb86d28cde266" + }, + "truncated": 0, + "non_truncated": 310, + "padded": 1240, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hashes": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "a447bd1574b5e26c", + "hash_cont_tokens": "176c8dcff38c5f8f" + }, + "truncated": 0, + "non_truncated": 203, + "padded": 812, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + 
"harness|hendrycksTest-high_school_computer_science|5": { + "hashes": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "56312a0c3d85ae90", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hashes": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "5002f4ac8b1562ca", + "hash_cont_tokens": "674fc454bdc5ac93" + }, + "truncated": 0, + "non_truncated": 165, + "padded": 656, + "non_padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_geography|5": { + "hashes": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "b4f9efd054b0149d", + "hash_cont_tokens": "03a5012b916274ea" + }, + "truncated": 0, + "non_truncated": 198, + "padded": 792, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hashes": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "6e010d01707b5a01", + "hash_cont_tokens": "873d2aab226ba1d8" + }, + "truncated": 0, + "non_truncated": 193, + "padded": 772, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hashes": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "fc1f6e824ba386d7", + "hash_cont_tokens": "c583432ad27fcfe0" + }, + "truncated": 0, + "non_truncated": 390, + "padded": 1560, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hashes": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "3a485a40c8432ece", + "hash_cont_tokens": "d7907b61bcb8c123" + }, + "truncated": 0, + "non_truncated": 270, + "padded": 1080, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hashes": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "a7dd9ca4bbda3752", + "hash_cont_tokens": "f47f041de50333b9" + }, + "truncated": 0, + "non_truncated": 238, + "padded": 952, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_physics|5": { + "hashes": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "d7ea631399a73865", + "hash_cont_tokens": "0d56317b3e5eedb5" + }, + "truncated": 0, + "non_truncated": 151, + "padded": 604, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hashes": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "d12816cf88146011", + "hash_cont_tokens": "09ba1243e7390c0f" + }, + "truncated": 0, + "non_truncated": 545, + "padded": 2180, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hashes": { + 
"hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "9763ecaef4814c21", + "hash_cont_tokens": "9cc29889c3d3f77d" + }, + "truncated": 0, + "non_truncated": 216, + "padded": 864, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hashes": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "c639cce12a46ebad", + "hash_cont_tokens": "cdd0b3dc06d933e5" + }, + "truncated": 0, + "non_truncated": 204, + "padded": 816, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hashes": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "b9762065cce6f3a6", + "hash_cont_tokens": "e02816433ff28daf" + }, + "truncated": 0, + "non_truncated": 237, + "padded": 948, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_aging|5": { + "hashes": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "84157fee0b6d0f3c", + "hash_cont_tokens": "142a4a8a1138a214" + }, + "truncated": 0, + "non_truncated": 223, + "padded": 892, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_sexuality|5": { + "hashes": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "ade303e1ae3c016f", + "hash_cont_tokens": "bc54813e809b796d" + }, + "truncated": 0, + "non_truncated": 131, + "padded": 524, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-international_law|5": { + "hashes": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "e5482e1c23c23d35", + "hash_cont_tokens": "8ea8c5ff76a15bca" + }, + "truncated": 0, + "non_truncated": 121, + "padded": 484, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-jurisprudence|5": { + "hashes": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "4415eeb9bad0507b", + "hash_cont_tokens": "e3a8cd951b6e3469" + }, + "truncated": 0, + "non_truncated": 108, + "padded": 432, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hashes": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "e6b5271422ecbaa8", + "hash_cont_tokens": "3e9e0bdc248fd88a" + }, + "truncated": 0, + "non_truncated": 163, + "padded": 644, + "non_padded": 8, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-machine_learning|5": { + "hashes": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "e719cb83196977d8", + "hash_cont_tokens": "55b12fb138c6a064" + }, + "truncated": 0, + "non_truncated": 112, + "padded": 448, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-management|5": { + "hashes": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "155da0e62b39e804", + "hash_cont_tokens": 
"a01d6d39a83c4597" + }, + "truncated": 0, + "non_truncated": 103, + "padded": 412, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-marketing|5": { + "hashes": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "38466c242259e6d3", + "hash_cont_tokens": "6aeaed4d823c98aa" + }, + "truncated": 0, + "non_truncated": 234, + "padded": 932, + "non_padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-medical_genetics|5": { + "hashes": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "0dd129e92538a7f6", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-miscellaneous|5": { + "hashes": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "d108a883fc3e022f", + "hash_cont_tokens": "9b0ab02a64603081" + }, + "truncated": 0, + "non_truncated": 783, + "padded": 3132, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_disputes|5": { + "hashes": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "0e7b7df82884a2d5", + "hash_cont_tokens": "3b8bbe9108e55ce9" + }, + "truncated": 0, + "non_truncated": 346, + "padded": 1364, + "non_padded": 20, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hashes": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "7c220f5613cd8426", + "hash_cont_tokens": "3e9bfc0362e97330" + }, + "truncated": 0, + "non_truncated": 895, + "padded": 3580, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-nutrition|5": { + "hashes": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "35de1609a9a763a9", + "hash_cont_tokens": "23b2dc6ee2da4cfc" + }, + "truncated": 0, + "non_truncated": 306, + "padded": 1224, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-philosophy|5": { + "hashes": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "a1dcfa9c80490d06", + "hash_cont_tokens": "9f6ff69d23a48783" + }, + "truncated": 0, + "non_truncated": 311, + "padded": 1244, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-prehistory|5": { + "hashes": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "a091cf645d2415e0", + "hash_cont_tokens": "d6458d743d875837" + }, + "truncated": 0, + "non_truncated": 324, + "padded": 1296, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_accounting|5": { + "hashes": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "e9df32a33f85290c", + "hash_cont_tokens": "922a195f53a35662" + }, + "truncated": 0, + "non_truncated": 282, + "padded": 1128, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + 
"harness|hendrycksTest-professional_law|5": { + "hashes": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "c9f7583fff66d361", + "hash_cont_tokens": "2e590029ef41fbcd" + }, + "truncated": 0, + "non_truncated": 1534, + "padded": 6136, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_medicine|5": { + "hashes": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "40a933f829116f8d", + "hash_cont_tokens": "7cfee54dbddd5a98" + }, + "truncated": 0, + "non_truncated": 272, + "padded": 1088, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_psychology|5": { + "hashes": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "0f6a92c3a2062b48", + "hash_cont_tokens": "a86677b2a45c20e1" + }, + "truncated": 0, + "non_truncated": 612, + "padded": 2448, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-public_relations|5": { + "hashes": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "29a08e9bfbe9b2f0", + "hash_cont_tokens": "0d756ccaae031757" + }, + "truncated": 0, + "non_truncated": 110, + "padded": 440, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-security_studies|5": { + "hashes": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "32a03f1f22a6e103", + "hash_cont_tokens": "b2229bc2cfbf594b" + }, + "truncated": 0, + "non_truncated": 245, + "padded": 980, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-sociology|5": { + "hashes": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "1de5c52d2b2831d7", + "hash_cont_tokens": "c3a3bdfd177eed5b" + }, + "truncated": 0, + "non_truncated": 201, + "padded": 800, + "non_padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hashes": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "add924961f7f4146", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-virology|5": { + "hashes": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "e0653601c466b1bc", + "hash_cont_tokens": "af8b3658088cb37f" + }, + "truncated": 0, + "non_truncated": 166, + "padded": 664, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-world_religions|5": { + "hashes": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "ac600d612445156d", + "hash_cont_tokens": "060118bef6de4e0a" + }, + "truncated": 0, + "non_truncated": 171, + "padded": 684, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|truthfulqa:mc|0": { + "hashes": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": 
"a03ce28b7fd06aa7", + "hash_cont_tokens": "f5da56a132aab151" + }, + "truncated": 0, + "non_truncated": 817, + "padded": 9996, + "non_padded": 0, + "effective_few_shots": 0.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "72067255e368e24e", + "hash_cont_tokens": "f08975ad6f2d5864" + }, + "truncated": 0, + "non_truncated": 1267, + "padded": 2534, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "bda342e47b5099b2", + "hash_cont_tokens": "c78e1d116bc0ec18" + }, + "truncated": 0, + "non_truncated": 1319, + "padded": 0, + "non_padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "3b7fa57a057f9415", + "hash_full_prompts": "63615fc50fc9417c", + "hash_input_tokens": "a8fa53915153e1db", + "hash_cont_tokens": "2823dbbe667e3d66" + }, + "truncated": 0, + "non_truncated": 28659, + "padded": 113348, + "non_padded": 1524, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/uukuguy/speechless-hermes-coig-lite-13b/results_2023-08-21T17-40-30.743693.json b/eval-results/uukuguy/speechless-hermes-coig-lite-13b/results_2023-08-21T17-40-30.743693.json new file mode 100644 index 0000000000000000000000000000000000000000..7878691b9328664184989a15b361b997545b9231 --- /dev/null +++ b/eval-results/uukuguy/speechless-hermes-coig-lite-13b/results_2023-08-21T17-40-30.743693.json @@ -0,0 +1,1365 @@ +{ + "results": { + "harness|arc:challenge|25": { + "acc": 0.5520477815699659, + "acc_stderr": 0.014532011498211674, + "acc_norm": 0.5947098976109215, + "acc_norm_stderr": 0.01434686906022933 + }, + "harness|hellaswag|10": { + "acc": 0.6113324039036049, + "acc_stderr": 0.0048645132621943105, + "acc_norm": 0.8228440549691296, + "acc_norm_stderr": 0.003810203308901091 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.32, + "acc_stderr": 0.046882617226215034, + "acc_norm": 0.32, + "acc_norm_stderr": 0.046882617226215034 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.48148148148148145, + "acc_stderr": 0.043163785995113245, + "acc_norm": 0.48148148148148145, + "acc_norm_stderr": 0.043163785995113245 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.5526315789473685, + "acc_stderr": 0.0404633688397825, + "acc_norm": 0.5526315789473685, + "acc_norm_stderr": 0.0404633688397825 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.48, + "acc_stderr": 0.050211673156867795, + "acc_norm": 0.48, + "acc_norm_stderr": 0.050211673156867795 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.6037735849056604, + "acc_stderr": 0.030102793781791197, + "acc_norm": 0.6037735849056604, + "acc_norm_stderr": 0.030102793781791197 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.5763888888888888, + "acc_stderr": 0.04132125019723369, + "acc_norm": 0.5763888888888888, + "acc_norm_stderr": 0.04132125019723369 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.4, + "acc_stderr": 0.049236596391733084, + "acc_norm": 0.4, + "acc_norm_stderr": 0.049236596391733084 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.44, + "acc_stderr": 0.04988876515698589, + "acc_norm": 0.44, + "acc_norm_stderr": 0.04988876515698589 + }, + 
"harness|hendrycksTest-college_mathematics|5": { + "acc": 0.39, + "acc_stderr": 0.04902071300001975, + "acc_norm": 0.39, + "acc_norm_stderr": 0.04902071300001975 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.5491329479768786, + "acc_stderr": 0.0379401267469703, + "acc_norm": 0.5491329479768786, + "acc_norm_stderr": 0.0379401267469703 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.27450980392156865, + "acc_stderr": 0.04440521906179328, + "acc_norm": 0.27450980392156865, + "acc_norm_stderr": 0.04440521906179328 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.72, + "acc_stderr": 0.045126085985421276, + "acc_norm": 0.72, + "acc_norm_stderr": 0.045126085985421276 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.43829787234042555, + "acc_stderr": 0.03243618636108102, + "acc_norm": 0.43829787234042555, + "acc_norm_stderr": 0.03243618636108102 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.2982456140350877, + "acc_stderr": 0.04303684033537315, + "acc_norm": 0.2982456140350877, + "acc_norm_stderr": 0.04303684033537315 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.496551724137931, + "acc_stderr": 0.041665675771015785, + "acc_norm": 0.496551724137931, + "acc_norm_stderr": 0.041665675771015785 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.3544973544973545, + "acc_stderr": 0.024636830602842, + "acc_norm": 0.3544973544973545, + "acc_norm_stderr": 0.024636830602842 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.36507936507936506, + "acc_stderr": 0.04306241259127153, + "acc_norm": 0.36507936507936506, + "acc_norm_stderr": 0.04306241259127153 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.39, + "acc_stderr": 0.04902071300001975, + "acc_norm": 0.39, + "acc_norm_stderr": 0.04902071300001975 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.6064516129032258, + "acc_stderr": 0.02779187875313226, + "acc_norm": 0.6064516129032258, + "acc_norm_stderr": 0.02779187875313226 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.4236453201970443, + "acc_stderr": 0.034767257476490385, + "acc_norm": 0.4236453201970443, + "acc_norm_stderr": 0.034767257476490385 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.5, + "acc_stderr": 0.050251890762960605, + "acc_norm": 0.5, + "acc_norm_stderr": 0.050251890762960605 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.6787878787878788, + "acc_stderr": 0.036462049632538115, + "acc_norm": 0.6787878787878788, + "acc_norm_stderr": 0.036462049632538115 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.702020202020202, + "acc_stderr": 0.03258630383836556, + "acc_norm": 0.702020202020202, + "acc_norm_stderr": 0.03258630383836556 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.7927461139896373, + "acc_stderr": 0.029252823291803638, + "acc_norm": 0.7927461139896373, + "acc_norm_stderr": 0.029252823291803638 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.5384615384615384, + "acc_stderr": 0.02527589207024064, + "acc_norm": 0.5384615384615384, + "acc_norm_stderr": 0.02527589207024064 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.3592592592592593, + "acc_stderr": 0.02925290592725198, + "acc_norm": 0.3592592592592593, + "acc_norm_stderr": 0.02925290592725198 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.5588235294117647, + 
"acc_stderr": 0.032252942323996406, + "acc_norm": 0.5588235294117647, + "acc_norm_stderr": 0.032252942323996406 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.3509933774834437, + "acc_stderr": 0.03896981964257375, + "acc_norm": 0.3509933774834437, + "acc_norm_stderr": 0.03896981964257375 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.7321100917431193, + "acc_stderr": 0.018987462257978652, + "acc_norm": 0.7321100917431193, + "acc_norm_stderr": 0.018987462257978652 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.46296296296296297, + "acc_stderr": 0.03400603625538271, + "acc_norm": 0.46296296296296297, + "acc_norm_stderr": 0.03400603625538271 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.7352941176470589, + "acc_stderr": 0.030964517926923403, + "acc_norm": 0.7352941176470589, + "acc_norm_stderr": 0.030964517926923403 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.7341772151898734, + "acc_stderr": 0.02875679962965834, + "acc_norm": 0.7341772151898734, + "acc_norm_stderr": 0.02875679962965834 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.6322869955156951, + "acc_stderr": 0.03236198350928275, + "acc_norm": 0.6322869955156951, + "acc_norm_stderr": 0.03236198350928275 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.5343511450381679, + "acc_stderr": 0.043749285605997376, + "acc_norm": 0.5343511450381679, + "acc_norm_stderr": 0.043749285605997376 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.768595041322314, + "acc_stderr": 0.03849856098794088, + "acc_norm": 0.768595041322314, + "acc_norm_stderr": 0.03849856098794088 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.6759259259259259, + "acc_stderr": 0.045245960070300476, + "acc_norm": 0.6759259259259259, + "acc_norm_stderr": 0.045245960070300476 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.6748466257668712, + "acc_stderr": 0.036803503712864616, + "acc_norm": 0.6748466257668712, + "acc_norm_stderr": 0.036803503712864616 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.375, + "acc_stderr": 0.04595091388086298, + "acc_norm": 0.375, + "acc_norm_stderr": 0.04595091388086298 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.6990291262135923, + "acc_stderr": 0.045416094465039476, + "acc_norm": 0.6990291262135923, + "acc_norm_stderr": 0.045416094465039476 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.7735042735042735, + "acc_stderr": 0.027421007295392912, + "acc_norm": 0.7735042735042735, + "acc_norm_stderr": 0.027421007295392912 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.53, + "acc_stderr": 0.05016135580465919, + "acc_norm": 0.53, + "acc_norm_stderr": 0.05016135580465919 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.7586206896551724, + "acc_stderr": 0.015302380123542108, + "acc_norm": 0.7586206896551724, + "acc_norm_stderr": 0.015302380123542108 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.6242774566473989, + "acc_stderr": 0.02607431485165708, + "acc_norm": 0.6242774566473989, + "acc_norm_stderr": 0.02607431485165708 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.31731843575418994, + "acc_stderr": 0.01556639263005703, + "acc_norm": 0.31731843575418994, + "acc_norm_stderr": 0.01556639263005703 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.6111111111111112, + "acc_stderr": 0.027914055510467998, + "acc_norm": 0.6111111111111112, + "acc_norm_stderr": 
0.027914055510467998 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.6237942122186495, + "acc_stderr": 0.027513925683549434, + "acc_norm": 0.6237942122186495, + "acc_norm_stderr": 0.027513925683549434 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.6512345679012346, + "acc_stderr": 0.02651759772446501, + "acc_norm": 0.6512345679012346, + "acc_norm_stderr": 0.02651759772446501 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.4219858156028369, + "acc_stderr": 0.029462189233370597, + "acc_norm": 0.4219858156028369, + "acc_norm_stderr": 0.029462189233370597 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.41003911342894395, + "acc_stderr": 0.01256183762196204, + "acc_norm": 0.41003911342894395, + "acc_norm_stderr": 0.01256183762196204 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.5588235294117647, + "acc_stderr": 0.030161911930767105, + "acc_norm": 0.5588235294117647, + "acc_norm_stderr": 0.030161911930767105 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.5359477124183006, + "acc_stderr": 0.020175488765484036, + "acc_norm": 0.5359477124183006, + "acc_norm_stderr": 0.020175488765484036 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.6363636363636364, + "acc_stderr": 0.046075820907199756, + "acc_norm": 0.6363636363636364, + "acc_norm_stderr": 0.046075820907199756 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.5714285714285714, + "acc_stderr": 0.031680911612338825, + "acc_norm": 0.5714285714285714, + "acc_norm_stderr": 0.031680911612338825 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.7064676616915423, + "acc_stderr": 0.03220024104534205, + "acc_norm": 0.7064676616915423, + "acc_norm_stderr": 0.03220024104534205 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.82, + "acc_stderr": 0.038612291966536934, + "acc_norm": 0.82, + "acc_norm_stderr": 0.038612291966536934 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.4819277108433735, + "acc_stderr": 0.038899512528272166, + "acc_norm": 0.4819277108433735, + "acc_norm_stderr": 0.038899512528272166 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.7543859649122807, + "acc_stderr": 0.0330140594698725, + "acc_norm": 0.7543859649122807, + "acc_norm_stderr": 0.0330140594698725 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.3390452876376989, + "mc1_stderr": 0.016571797910626608, + "mc2": 0.4760084073774732, + "mc2_stderr": 0.01516279117118438 + }, + "all": { + "acc": 0.552829982108787, + "acc_stderr": 0.03471081958793997, + "acc_norm": 0.5571380120597443, + "acc_norm_stderr": 0.03468981192029124, + "mc1": 0.3390452876376989, + "mc1_stderr": 0.016571797910626608, + "mc2": 0.4760084073774732, + "mc2_stderr": 0.01516279117118438 + } + }, + "versions": { + "harness|arc:challenge|25": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + 
"harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "all": 0 + }, + "config_general": { + "model_name": "uukuguy/speechless-hermes-coig-lite-13b", + "model_sha": "2ee11d9c7acaefb723796227e2ad099b165f0dd9", + "model_dtype": "torch.float16", + "lighteval_sha": "9a6ba3212080b87510982c30fdec55b87dcab0c7", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null + }, + "config_tasks": { + "harness|arc:challenge": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness 
task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task" + }, + "summary_tasks": { + "harness|arc:challenge|25": { + "hashes": { + "hash_examples": "17b0cae357c0259e", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "3722289b79076c44", + "hash_cont_tokens": "8210decc6ff6f7df" + }, + "truncated": 0, + "non-truncated": 4687, + "padded": 4687, + "non-padded": 0, + "effective_few_shots": 25.0, + "num_truncated_few_shots": 0 + }, + "harness|hellaswag|10": { + "hashes": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "ececd684171f1ef2", + "hash_cont_tokens": "b3b9e9017afa63af" + }, + "truncated": 0, + "non-truncated": 40168, + "padded": 40113, + "non-padded": 55, + "effective_few_shots": 10.0, 
+ "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hashes": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "c54ff61ad0273dd7", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-anatomy|5": { + "hashes": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "be31a1e22aef5f90", + "hash_cont_tokens": "f11971a765cb609f" + }, + "truncated": 0, + "non-truncated": 540, + "padded": 540, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-astronomy|5": { + "hashes": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "277a7b1fad566940", + "hash_cont_tokens": "bf30e5d3f48250cb" + }, + "truncated": 0, + "non-truncated": 608, + "padded": 608, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-business_ethics|5": { + "hashes": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "ba552605bc116de5", + "hash_cont_tokens": "bc1dd9b2d995eb61" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hashes": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "428c7563d0b98ab9", + "hash_cont_tokens": "890a119624b3b935" + }, + "truncated": 0, + "non-truncated": 1060, + "padded": 1060, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_biology|5": { + "hashes": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "da036601573942e2", + "hash_cont_tokens": "875cde3af7a0ee14" + }, + "truncated": 0, + "non-truncated": 576, + "padded": 576, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_chemistry|5": { + "hashes": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "94e0196d6aded13d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_computer_science|5": { + "hashes": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "6e4d0f4a8d36690b", + "hash_cont_tokens": "ffc0fe414cdc4a83" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_mathematics|5": { + "hashes": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "614054d17109a25d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_medicine|5": { + "hashes": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": 
"ad2a53e5250ab46e", + "hash_input_tokens": "081bb2b524defd1c", + "hash_cont_tokens": "1f88b00d41957d82" + }, + "truncated": 0, + "non-truncated": 692, + "padded": 692, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_physics|5": { + "hashes": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "5421d9a1af86cbd4", + "hash_cont_tokens": "f7b8097afc16a47c" + }, + "truncated": 0, + "non-truncated": 408, + "padded": 408, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-computer_security|5": { + "hashes": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "5e6b70ecb333cf18", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hashes": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "c2ef11a87264ceed", + "hash_cont_tokens": "aa0e8bc655f2f641" + }, + "truncated": 0, + "non-truncated": 940, + "padded": 940, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-econometrics|5": { + "hashes": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "ecaccd912a4c3978", + "hash_cont_tokens": "bfb7e3c3c88313f1" + }, + "truncated": 0, + "non-truncated": 456, + "padded": 456, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hashes": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "1590c84291399be8", + "hash_cont_tokens": "2425a3f084a591ef" + }, + "truncated": 0, + "non-truncated": 580, + "padded": 580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hashes": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "3269597f715b0da1", + "hash_cont_tokens": "f52691aef15a407b" + }, + "truncated": 0, + "non-truncated": 1512, + "padded": 1512, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-formal_logic|5": { + "hashes": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "a2800d20f3ab8d7c", + "hash_cont_tokens": "f515d598d9c21263" + }, + "truncated": 0, + "non-truncated": 504, + "padded": 504, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-global_facts|5": { + "hashes": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "94ed44b3772505ad", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_biology|5": { + "hashes": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "24423acb928db768", + "hash_cont_tokens": "bd85a4156a3613ee" + }, + "truncated": 0, + "non-truncated": 1240, + 
"padded": 1240, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hashes": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "831ff35c474e5cef", + "hash_cont_tokens": "a95c97af1c14e068" + }, + "truncated": 0, + "non-truncated": 812, + "padded": 812, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hashes": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "a20a96b44dcc5b30", + "hash_cont_tokens": "8abfedef914e33c9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hashes": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "5002f4ac8b1562ca", + "hash_cont_tokens": "674fc454bdc5ac93" + }, + "truncated": 0, + "non-truncated": 660, + "padded": 656, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_geography|5": { + "hashes": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "7c5547c7da5bc793", + "hash_cont_tokens": "03a5012b916274ea" + }, + "truncated": 0, + "non-truncated": 792, + "padded": 792, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hashes": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "f62991cb6a496b05", + "hash_cont_tokens": "a83effb8f76b7d7c" + }, + "truncated": 0, + "non-truncated": 772, + "padded": 772, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hashes": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "4cef2aff6e3d59ed", + "hash_cont_tokens": "c583432ad27fcfe0" + }, + "truncated": 0, + "non-truncated": 1560, + "padded": 1560, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hashes": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "6e2577ea4082ed2b", + "hash_cont_tokens": "24f5dc613660300b" + }, + "truncated": 0, + "non-truncated": 1080, + "padded": 1080, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hashes": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "c5fc9aeb1079c8e4", + "hash_cont_tokens": "f47f041de50333b9" + }, + "truncated": 0, + "non-truncated": 952, + "padded": 952, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_physics|5": { + "hashes": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "555fc385cffa84ca", + "hash_cont_tokens": "ba2efcd283e938cc" + }, + "truncated": 0, + "non-truncated": 604, + "padded": 604, + "non-padded": 0, + "effective_few_shots": 5.0, + 
"num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hashes": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "febd23cbf9973b7f", + "hash_cont_tokens": "942069cd363844d9" + }, + "truncated": 0, + "non-truncated": 2180, + "padded": 2180, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hashes": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "400e55b56ee6fbd7", + "hash_cont_tokens": "955ed42b6f7fa019" + }, + "truncated": 0, + "non-truncated": 864, + "padded": 864, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hashes": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "c639cce12a46ebad", + "hash_cont_tokens": "cdd0b3dc06d933e5" + }, + "truncated": 0, + "non-truncated": 816, + "padded": 816, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hashes": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "b9762065cce6f3a6", + "hash_cont_tokens": "9a864184946033ac" + }, + "truncated": 0, + "non-truncated": 948, + "padded": 948, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_aging|5": { + "hashes": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "541a75f071dcf579", + "hash_cont_tokens": "142a4a8a1138a214" + }, + "truncated": 0, + "non-truncated": 892, + "padded": 892, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_sexuality|5": { + "hashes": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "04269e5c5a257dd9", + "hash_cont_tokens": "bc54813e809b796d" + }, + "truncated": 0, + "non-truncated": 524, + "padded": 524, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-international_law|5": { + "hashes": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "d93ba9d9d38e4397", + "hash_cont_tokens": "dc45b45fcda18e5d" + }, + "truncated": 0, + "non-truncated": 484, + "padded": 484, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-jurisprudence|5": { + "hashes": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "9eeaccd2698b4f5a", + "hash_cont_tokens": "e3a8cd951b6e3469" + }, + "truncated": 0, + "non-truncated": 432, + "padded": 432, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hashes": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "b4f08f544f2b7576", + "hash_cont_tokens": "1e80dbd30f6453d5" + }, + "truncated": 0, + "non-truncated": 652, + "padded": 648, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-machine_learning|5": { + "hashes": { + "hash_examples": "88f22a636029ae47", + 
"hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "900c2a51f1174b9f", + "hash_cont_tokens": "9b37da7777378ca9" + }, + "truncated": 0, + "non-truncated": 448, + "padded": 448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-management|5": { + "hashes": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "6b36efb4689c6eca", + "hash_cont_tokens": "a01d6d39a83c4597" + }, + "truncated": 0, + "non-truncated": 412, + "padded": 412, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-marketing|5": { + "hashes": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "2aaac78a0cfed47a", + "hash_cont_tokens": "6aeaed4d823c98aa" + }, + "truncated": 0, + "non-truncated": 936, + "padded": 936, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-medical_genetics|5": { + "hashes": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "886ca823b41c094a", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-miscellaneous|5": { + "hashes": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "72fd71de7675e7d0", + "hash_cont_tokens": "9b0ab02a64603081" + }, + "truncated": 0, + "non-truncated": 3132, + "padded": 3132, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_disputes|5": { + "hashes": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "f3ca0dd8e7a1eb09", + "hash_cont_tokens": "8badf768f7b0467a" + }, + "truncated": 0, + "non-truncated": 1384, + "padded": 1354, + "non-padded": 30, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hashes": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "3e793631e951f23c", + "hash_cont_tokens": "32ae620376b2bbba" + }, + "truncated": 0, + "non-truncated": 3580, + "padded": 3580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-nutrition|5": { + "hashes": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "59753c2144ea93af", + "hash_cont_tokens": "3071def75bacc404" + }, + "truncated": 0, + "non-truncated": 1224, + "padded": 1224, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-philosophy|5": { + "hashes": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "bd8d3dbed15a8c34", + "hash_cont_tokens": "9f6ff69d23a48783" + }, + "truncated": 0, + "non-truncated": 1244, + "padded": 1244, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-prehistory|5": { + "hashes": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "3573cd87facbb7c5", + "hash_cont_tokens": "de469d2b981e32a3" + }, + "truncated": 0, + "non-truncated": 1296, + "padded": 
1296, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_accounting|5": { + "hashes": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "17e721bc1a7cbb47", + "hash_cont_tokens": "c46f74d2dfc7b13b" + }, + "truncated": 0, + "non-truncated": 1128, + "padded": 1128, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_law|5": { + "hashes": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "c9f7583fff66d361", + "hash_cont_tokens": "2e590029ef41fbcd" + }, + "truncated": 0, + "non-truncated": 6136, + "padded": 6136, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_medicine|5": { + "hashes": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "40a933f829116f8d", + "hash_cont_tokens": "fe35cfa9c6ca802e" + }, + "truncated": 0, + "non-truncated": 1088, + "padded": 1088, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_psychology|5": { + "hashes": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "0dfb73a8eb3f692c", + "hash_cont_tokens": "f020fbddf72c8652" + }, + "truncated": 0, + "non-truncated": 2448, + "padded": 2448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-public_relations|5": { + "hashes": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "1710c6ba4c9f3cbd", + "hash_cont_tokens": "568f585a259965c1" + }, + "truncated": 0, + "non-truncated": 440, + "padded": 440, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-security_studies|5": { + "hashes": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "32a03f1f22a6e103", + "hash_cont_tokens": "cc6fd7cccd64cd5d" + }, + "truncated": 0, + "non-truncated": 980, + "padded": 980, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-sociology|5": { + "hashes": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "828999f7624cbe7e", + "hash_cont_tokens": "c3a3bdfd177eed5b" + }, + "truncated": 0, + "non-truncated": 804, + "padded": 804, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hashes": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "42054621e718dbee", + "hash_cont_tokens": "2568d0e8e36fa959" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-virology|5": { + "hashes": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "6c4f0aa4dc859c04", + "hash_cont_tokens": "926cf60b0891f374" + }, + "truncated": 0, + "non-truncated": 664, + "padded": 664, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-world_religions|5": { + 
"hashes": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "6c75d44e092ff24f", + "hash_cont_tokens": "c525a5de974c1ea3" + }, + "truncated": 0, + "non-truncated": 684, + "padded": 684, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|truthfulqa:mc|0": { + "hashes": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "2738d7ed7075faa7", + "hash_cont_tokens": "c014154380b74b9e" + }, + "truncated": 0, + "non-truncated": 9996, + "padded": 9996, + "non-padded": 0, + "effective_few_shots": 0.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "d84d18e9a963753d", + "hash_full_prompts": "12b540783521a8e6", + "hash_input_tokens": "5c73a7dce6ccf737", + "hash_cont_tokens": "fb1646e2bdd5fc38" + }, + "total_evaluation_time_secondes": "6357.519212245941", + "truncated": 0, + "non-truncated": 111019, + "padded": 110926, + "non-padded": 93, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/uukuguy/speechless-hermes-coig-lite-13b/results_2023-08-21T22-44-56.088825.json b/eval-results/uukuguy/speechless-hermes-coig-lite-13b/results_2023-08-21T22-44-56.088825.json new file mode 100644 index 0000000000000000000000000000000000000000..46f5242a7a4950881fe897e2b309ed8775b6dc72 --- /dev/null +++ b/eval-results/uukuguy/speechless-hermes-coig-lite-13b/results_2023-08-21T22-44-56.088825.json @@ -0,0 +1,1365 @@ +{ + "results": { + "harness|arc:challenge|25": { + "acc": 0.552901023890785, + "acc_stderr": 0.014529380160526843, + "acc_norm": 0.5955631399317406, + "acc_norm_stderr": 0.014342036483436175 + }, + "harness|hellaswag|10": { + "acc": 0.6107349133638718, + "acc_stderr": 0.004865871290143341, + "acc_norm": 0.8226448914558853, + "acc_norm_stderr": 0.0038118830709112698 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.36, + "acc_stderr": 0.04824181513244218, + "acc_norm": 0.36, + "acc_norm_stderr": 0.04824181513244218 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.4888888888888889, + "acc_stderr": 0.04318275491977976, + "acc_norm": 0.4888888888888889, + "acc_norm_stderr": 0.04318275491977976 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.5526315789473685, + "acc_stderr": 0.04046336883978249, + "acc_norm": 0.5526315789473685, + "acc_norm_stderr": 0.04046336883978249 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.48, + "acc_stderr": 0.050211673156867795, + "acc_norm": 0.48, + "acc_norm_stderr": 0.050211673156867795 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.6150943396226415, + "acc_stderr": 0.02994649856769995, + "acc_norm": 0.6150943396226415, + "acc_norm_stderr": 0.02994649856769995 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.5694444444444444, + "acc_stderr": 0.04140685639111503, + "acc_norm": 0.5694444444444444, + "acc_norm_stderr": 0.04140685639111503 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.41, + "acc_stderr": 0.049431107042371025, + "acc_norm": 0.41, + "acc_norm_stderr": 0.049431107042371025 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.47, + "acc_stderr": 0.05016135580465919, + "acc_norm": 0.47, + "acc_norm_stderr": 0.05016135580465919 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.39, + "acc_stderr": 0.04902071300001974, + "acc_norm": 0.39, + "acc_norm_stderr": 0.04902071300001974 + }, + 
"harness|hendrycksTest-college_medicine|5": { + "acc": 0.5375722543352601, + "acc_stderr": 0.0380168510452446, + "acc_norm": 0.5375722543352601, + "acc_norm_stderr": 0.0380168510452446 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.27450980392156865, + "acc_stderr": 0.04440521906179328, + "acc_norm": 0.27450980392156865, + "acc_norm_stderr": 0.04440521906179328 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.74, + "acc_stderr": 0.044084400227680794, + "acc_norm": 0.74, + "acc_norm_stderr": 0.044084400227680794 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.4425531914893617, + "acc_stderr": 0.03246956919789958, + "acc_norm": 0.4425531914893617, + "acc_norm_stderr": 0.03246956919789958 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.30701754385964913, + "acc_stderr": 0.04339138322579861, + "acc_norm": 0.30701754385964913, + "acc_norm_stderr": 0.04339138322579861 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.496551724137931, + "acc_stderr": 0.041665675771015785, + "acc_norm": 0.496551724137931, + "acc_norm_stderr": 0.041665675771015785 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.35978835978835977, + "acc_stderr": 0.024718075944129277, + "acc_norm": 0.35978835978835977, + "acc_norm_stderr": 0.024718075944129277 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.36507936507936506, + "acc_stderr": 0.04306241259127153, + "acc_norm": 0.36507936507936506, + "acc_norm_stderr": 0.04306241259127153 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.38, + "acc_stderr": 0.04878317312145633, + "acc_norm": 0.38, + "acc_norm_stderr": 0.04878317312145633 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.6064516129032258, + "acc_stderr": 0.02779187875313226, + "acc_norm": 0.6064516129032258, + "acc_norm_stderr": 0.02779187875313226 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.4088669950738916, + "acc_stderr": 0.03459058815883231, + "acc_norm": 0.4088669950738916, + "acc_norm_stderr": 0.03459058815883231 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.51, + "acc_stderr": 0.05024183937956912, + "acc_norm": 0.51, + "acc_norm_stderr": 0.05024183937956912 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.6787878787878788, + "acc_stderr": 0.036462049632538115, + "acc_norm": 0.6787878787878788, + "acc_norm_stderr": 0.036462049632538115 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.7070707070707071, + "acc_stderr": 0.03242497958178815, + "acc_norm": 0.7070707070707071, + "acc_norm_stderr": 0.03242497958178815 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.7927461139896373, + "acc_stderr": 0.029252823291803638, + "acc_norm": 0.7927461139896373, + "acc_norm_stderr": 0.029252823291803638 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.541025641025641, + "acc_stderr": 0.025265525491284295, + "acc_norm": 0.541025641025641, + "acc_norm_stderr": 0.025265525491284295 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.35185185185185186, + "acc_stderr": 0.02911661760608302, + "acc_norm": 0.35185185185185186, + "acc_norm_stderr": 0.02911661760608302 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.5714285714285714, + "acc_stderr": 0.032145368597886394, + "acc_norm": 0.5714285714285714, + "acc_norm_stderr": 0.032145368597886394 + }, + 
"harness|hendrycksTest-high_school_physics|5": { + "acc": 0.36423841059602646, + "acc_stderr": 0.03929111781242742, + "acc_norm": 0.36423841059602646, + "acc_norm_stderr": 0.03929111781242742 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.7321100917431193, + "acc_stderr": 0.018987462257978652, + "acc_norm": 0.7321100917431193, + "acc_norm_stderr": 0.018987462257978652 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.4398148148148148, + "acc_stderr": 0.03385177976044811, + "acc_norm": 0.4398148148148148, + "acc_norm_stderr": 0.03385177976044811 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.7352941176470589, + "acc_stderr": 0.0309645179269234, + "acc_norm": 0.7352941176470589, + "acc_norm_stderr": 0.0309645179269234 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.7257383966244726, + "acc_stderr": 0.02904133351059804, + "acc_norm": 0.7257383966244726, + "acc_norm_stderr": 0.02904133351059804 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.6367713004484304, + "acc_stderr": 0.032277904428505, + "acc_norm": 0.6367713004484304, + "acc_norm_stderr": 0.032277904428505 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.549618320610687, + "acc_stderr": 0.04363643698524779, + "acc_norm": 0.549618320610687, + "acc_norm_stderr": 0.04363643698524779 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.768595041322314, + "acc_stderr": 0.03849856098794088, + "acc_norm": 0.768595041322314, + "acc_norm_stderr": 0.03849856098794088 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.6666666666666666, + "acc_stderr": 0.04557239513497752, + "acc_norm": 0.6666666666666666, + "acc_norm_stderr": 0.04557239513497752 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.6748466257668712, + "acc_stderr": 0.036803503712864616, + "acc_norm": 0.6748466257668712, + "acc_norm_stderr": 0.036803503712864616 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.375, + "acc_stderr": 0.04595091388086298, + "acc_norm": 0.375, + "acc_norm_stderr": 0.04595091388086298 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.6990291262135923, + "acc_stderr": 0.045416094465039476, + "acc_norm": 0.6990291262135923, + "acc_norm_stderr": 0.045416094465039476 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.7863247863247863, + "acc_stderr": 0.02685345037700916, + "acc_norm": 0.7863247863247863, + "acc_norm_stderr": 0.02685345037700916 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.51, + "acc_stderr": 0.05024183937956913, + "acc_norm": 0.51, + "acc_norm_stderr": 0.05024183937956913 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.7547892720306514, + "acc_stderr": 0.015384352284543941, + "acc_norm": 0.7547892720306514, + "acc_norm_stderr": 0.015384352284543941 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.6271676300578035, + "acc_stderr": 0.02603389061357628, + "acc_norm": 0.6271676300578035, + "acc_norm_stderr": 0.02603389061357628 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.30837988826815643, + "acc_stderr": 0.01544571691099888, + "acc_norm": 0.30837988826815643, + "acc_norm_stderr": 0.01544571691099888 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.6241830065359477, + "acc_stderr": 0.027732834353363947, + "acc_norm": 0.6241830065359477, + "acc_norm_stderr": 0.027732834353363947 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.6141479099678456, + "acc_stderr": 0.027648149599751464, + 
"acc_norm": 0.6141479099678456, + "acc_norm_stderr": 0.027648149599751464 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.6481481481481481, + "acc_stderr": 0.02657148348071997, + "acc_norm": 0.6481481481481481, + "acc_norm_stderr": 0.02657148348071997 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.4148936170212766, + "acc_stderr": 0.029392236584612503, + "acc_norm": 0.4148936170212766, + "acc_norm_stderr": 0.029392236584612503 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.4074315514993481, + "acc_stderr": 0.012549473714212224, + "acc_norm": 0.4074315514993481, + "acc_norm_stderr": 0.012549473714212224 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.5477941176470589, + "acc_stderr": 0.030233758551596452, + "acc_norm": 0.5477941176470589, + "acc_norm_stderr": 0.030233758551596452 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.5408496732026143, + "acc_stderr": 0.020160213617222516, + "acc_norm": 0.5408496732026143, + "acc_norm_stderr": 0.020160213617222516 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.6454545454545455, + "acc_stderr": 0.045820048415054174, + "acc_norm": 0.6454545454545455, + "acc_norm_stderr": 0.045820048415054174 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.5673469387755102, + "acc_stderr": 0.031717528240626645, + "acc_norm": 0.5673469387755102, + "acc_norm_stderr": 0.031717528240626645 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.7014925373134329, + "acc_stderr": 0.03235743789355042, + "acc_norm": 0.7014925373134329, + "acc_norm_stderr": 0.03235743789355042 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.82, + "acc_stderr": 0.038612291966536934, + "acc_norm": 0.82, + "acc_norm_stderr": 0.038612291966536934 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.463855421686747, + "acc_stderr": 0.03882310850890593, + "acc_norm": 0.463855421686747, + "acc_norm_stderr": 0.03882310850890593 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.7660818713450293, + "acc_stderr": 0.03246721765117826, + "acc_norm": 0.7660818713450293, + "acc_norm_stderr": 0.03246721765117826 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.3378212974296206, + "mc1_stderr": 0.016557167322516875, + "mc2": 0.47563564471422687, + "mc2_stderr": 0.015157119827092011 + }, + "all": { + "acc": 0.5540179768073534, + "acc_stderr": 0.034706557254092496, + "acc_norm": 0.5583327580638444, + "acc_norm_stderr": 0.03468551773042601, + "mc1": 0.3378212974296206, + "mc1_stderr": 0.016557167322516875, + "mc2": 0.47563564471422687, + "mc2_stderr": 0.015157119827092011 + } + }, + "versions": { + "harness|arc:challenge|25": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + 
"harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "all": 0 + }, + "config_general": { + "model_name": "uukuguy/speechless-hermes-coig-lite-13b", + "model_sha": "2ee11d9c7acaefb723796227e2ad099b165f0dd9", + "model_dtype": "torch.bfloat16", + "lighteval_sha": "9a6ba3212080b87510982c30fdec55b87dcab0c7", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null + }, + "config_tasks": { + "harness|arc:challenge": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + 
"harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task" + }, + "summary_tasks": { + "harness|arc:challenge|25": { + "hashes": { + "hash_examples": "17b0cae357c0259e", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "3722289b79076c44", + "hash_cont_tokens": "8210decc6ff6f7df" + }, + "truncated": 0, + "non-truncated": 4687, + "padded": 4687, + "non-padded": 0, + "effective_few_shots": 25.0, + "num_truncated_few_shots": 0 + }, + "harness|hellaswag|10": { + "hashes": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "ececd684171f1ef2", + "hash_cont_tokens": "b3b9e9017afa63af" + }, + "truncated": 0, + "non-truncated": 40168, + "padded": 40113, + "non-padded": 55, + "effective_few_shots": 10.0, + "num_truncated_few_shots": 0 + }, + 
"harness|hendrycksTest-abstract_algebra|5": { + "hashes": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "c54ff61ad0273dd7", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-anatomy|5": { + "hashes": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "be31a1e22aef5f90", + "hash_cont_tokens": "f11971a765cb609f" + }, + "truncated": 0, + "non-truncated": 540, + "padded": 540, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-astronomy|5": { + "hashes": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "277a7b1fad566940", + "hash_cont_tokens": "bf30e5d3f48250cb" + }, + "truncated": 0, + "non-truncated": 608, + "padded": 608, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-business_ethics|5": { + "hashes": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "ba552605bc116de5", + "hash_cont_tokens": "bc1dd9b2d995eb61" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hashes": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "428c7563d0b98ab9", + "hash_cont_tokens": "890a119624b3b935" + }, + "truncated": 0, + "non-truncated": 1060, + "padded": 1060, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_biology|5": { + "hashes": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "da036601573942e2", + "hash_cont_tokens": "875cde3af7a0ee14" + }, + "truncated": 0, + "non-truncated": 576, + "padded": 576, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_chemistry|5": { + "hashes": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "94e0196d6aded13d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_computer_science|5": { + "hashes": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "6e4d0f4a8d36690b", + "hash_cont_tokens": "ffc0fe414cdc4a83" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_mathematics|5": { + "hashes": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "614054d17109a25d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_medicine|5": { + "hashes": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + 
"hash_input_tokens": "081bb2b524defd1c", + "hash_cont_tokens": "1f88b00d41957d82" + }, + "truncated": 0, + "non-truncated": 692, + "padded": 692, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_physics|5": { + "hashes": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "5421d9a1af86cbd4", + "hash_cont_tokens": "f7b8097afc16a47c" + }, + "truncated": 0, + "non-truncated": 408, + "padded": 408, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-computer_security|5": { + "hashes": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "5e6b70ecb333cf18", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hashes": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "c2ef11a87264ceed", + "hash_cont_tokens": "aa0e8bc655f2f641" + }, + "truncated": 0, + "non-truncated": 940, + "padded": 940, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-econometrics|5": { + "hashes": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "ecaccd912a4c3978", + "hash_cont_tokens": "bfb7e3c3c88313f1" + }, + "truncated": 0, + "non-truncated": 456, + "padded": 456, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hashes": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "1590c84291399be8", + "hash_cont_tokens": "2425a3f084a591ef" + }, + "truncated": 0, + "non-truncated": 580, + "padded": 580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hashes": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "3269597f715b0da1", + "hash_cont_tokens": "f52691aef15a407b" + }, + "truncated": 0, + "non-truncated": 1512, + "padded": 1512, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-formal_logic|5": { + "hashes": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "a2800d20f3ab8d7c", + "hash_cont_tokens": "f515d598d9c21263" + }, + "truncated": 0, + "non-truncated": 504, + "padded": 504, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-global_facts|5": { + "hashes": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "94ed44b3772505ad", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_biology|5": { + "hashes": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "24423acb928db768", + "hash_cont_tokens": "bd85a4156a3613ee" + }, + "truncated": 0, + "non-truncated": 1240, + "padded": 1240, + 
"non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hashes": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "831ff35c474e5cef", + "hash_cont_tokens": "a95c97af1c14e068" + }, + "truncated": 0, + "non-truncated": 812, + "padded": 812, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hashes": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "a20a96b44dcc5b30", + "hash_cont_tokens": "8abfedef914e33c9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hashes": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "5002f4ac8b1562ca", + "hash_cont_tokens": "674fc454bdc5ac93" + }, + "truncated": 0, + "non-truncated": 660, + "padded": 656, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_geography|5": { + "hashes": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "7c5547c7da5bc793", + "hash_cont_tokens": "03a5012b916274ea" + }, + "truncated": 0, + "non-truncated": 792, + "padded": 792, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hashes": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "f62991cb6a496b05", + "hash_cont_tokens": "a83effb8f76b7d7c" + }, + "truncated": 0, + "non-truncated": 772, + "padded": 772, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hashes": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "4cef2aff6e3d59ed", + "hash_cont_tokens": "c583432ad27fcfe0" + }, + "truncated": 0, + "non-truncated": 1560, + "padded": 1560, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hashes": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "6e2577ea4082ed2b", + "hash_cont_tokens": "24f5dc613660300b" + }, + "truncated": 0, + "non-truncated": 1080, + "padded": 1080, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hashes": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "c5fc9aeb1079c8e4", + "hash_cont_tokens": "f47f041de50333b9" + }, + "truncated": 0, + "non-truncated": 952, + "padded": 952, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_physics|5": { + "hashes": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "555fc385cffa84ca", + "hash_cont_tokens": "ba2efcd283e938cc" + }, + "truncated": 0, + "non-truncated": 604, + "padded": 604, + "non-padded": 0, + "effective_few_shots": 5.0, + 
"num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hashes": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "febd23cbf9973b7f", + "hash_cont_tokens": "942069cd363844d9" + }, + "truncated": 0, + "non-truncated": 2180, + "padded": 2180, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hashes": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "400e55b56ee6fbd7", + "hash_cont_tokens": "955ed42b6f7fa019" + }, + "truncated": 0, + "non-truncated": 864, + "padded": 864, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hashes": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "c639cce12a46ebad", + "hash_cont_tokens": "cdd0b3dc06d933e5" + }, + "truncated": 0, + "non-truncated": 816, + "padded": 816, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hashes": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "b9762065cce6f3a6", + "hash_cont_tokens": "9a864184946033ac" + }, + "truncated": 0, + "non-truncated": 948, + "padded": 948, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_aging|5": { + "hashes": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "541a75f071dcf579", + "hash_cont_tokens": "142a4a8a1138a214" + }, + "truncated": 0, + "non-truncated": 892, + "padded": 892, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_sexuality|5": { + "hashes": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "04269e5c5a257dd9", + "hash_cont_tokens": "bc54813e809b796d" + }, + "truncated": 0, + "non-truncated": 524, + "padded": 524, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-international_law|5": { + "hashes": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "d93ba9d9d38e4397", + "hash_cont_tokens": "dc45b45fcda18e5d" + }, + "truncated": 0, + "non-truncated": 484, + "padded": 484, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-jurisprudence|5": { + "hashes": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "9eeaccd2698b4f5a", + "hash_cont_tokens": "e3a8cd951b6e3469" + }, + "truncated": 0, + "non-truncated": 432, + "padded": 432, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hashes": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "b4f08f544f2b7576", + "hash_cont_tokens": "1e80dbd30f6453d5" + }, + "truncated": 0, + "non-truncated": 652, + "padded": 648, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-machine_learning|5": { + "hashes": { + "hash_examples": "88f22a636029ae47", + 
"hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "900c2a51f1174b9f", + "hash_cont_tokens": "9b37da7777378ca9" + }, + "truncated": 0, + "non-truncated": 448, + "padded": 448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-management|5": { + "hashes": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "6b36efb4689c6eca", + "hash_cont_tokens": "a01d6d39a83c4597" + }, + "truncated": 0, + "non-truncated": 412, + "padded": 412, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-marketing|5": { + "hashes": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "2aaac78a0cfed47a", + "hash_cont_tokens": "6aeaed4d823c98aa" + }, + "truncated": 0, + "non-truncated": 936, + "padded": 936, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-medical_genetics|5": { + "hashes": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "886ca823b41c094a", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-miscellaneous|5": { + "hashes": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "72fd71de7675e7d0", + "hash_cont_tokens": "9b0ab02a64603081" + }, + "truncated": 0, + "non-truncated": 3132, + "padded": 3132, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_disputes|5": { + "hashes": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "f3ca0dd8e7a1eb09", + "hash_cont_tokens": "8badf768f7b0467a" + }, + "truncated": 0, + "non-truncated": 1384, + "padded": 1354, + "non-padded": 30, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hashes": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "3e793631e951f23c", + "hash_cont_tokens": "32ae620376b2bbba" + }, + "truncated": 0, + "non-truncated": 3580, + "padded": 3580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-nutrition|5": { + "hashes": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "59753c2144ea93af", + "hash_cont_tokens": "3071def75bacc404" + }, + "truncated": 0, + "non-truncated": 1224, + "padded": 1224, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-philosophy|5": { + "hashes": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "bd8d3dbed15a8c34", + "hash_cont_tokens": "9f6ff69d23a48783" + }, + "truncated": 0, + "non-truncated": 1244, + "padded": 1244, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-prehistory|5": { + "hashes": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "3573cd87facbb7c5", + "hash_cont_tokens": "de469d2b981e32a3" + }, + "truncated": 0, + "non-truncated": 1296, + "padded": 
1296, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_accounting|5": { + "hashes": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "17e721bc1a7cbb47", + "hash_cont_tokens": "c46f74d2dfc7b13b" + }, + "truncated": 0, + "non-truncated": 1128, + "padded": 1128, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_law|5": { + "hashes": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "c9f7583fff66d361", + "hash_cont_tokens": "2e590029ef41fbcd" + }, + "truncated": 0, + "non-truncated": 6136, + "padded": 6136, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_medicine|5": { + "hashes": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "40a933f829116f8d", + "hash_cont_tokens": "fe35cfa9c6ca802e" + }, + "truncated": 0, + "non-truncated": 1088, + "padded": 1088, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_psychology|5": { + "hashes": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "0dfb73a8eb3f692c", + "hash_cont_tokens": "f020fbddf72c8652" + }, + "truncated": 0, + "non-truncated": 2448, + "padded": 2448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-public_relations|5": { + "hashes": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "1710c6ba4c9f3cbd", + "hash_cont_tokens": "568f585a259965c1" + }, + "truncated": 0, + "non-truncated": 440, + "padded": 440, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-security_studies|5": { + "hashes": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "32a03f1f22a6e103", + "hash_cont_tokens": "cc6fd7cccd64cd5d" + }, + "truncated": 0, + "non-truncated": 980, + "padded": 980, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-sociology|5": { + "hashes": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "828999f7624cbe7e", + "hash_cont_tokens": "c3a3bdfd177eed5b" + }, + "truncated": 0, + "non-truncated": 804, + "padded": 804, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hashes": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "42054621e718dbee", + "hash_cont_tokens": "2568d0e8e36fa959" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-virology|5": { + "hashes": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "6c4f0aa4dc859c04", + "hash_cont_tokens": "926cf60b0891f374" + }, + "truncated": 0, + "non-truncated": 664, + "padded": 664, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-world_religions|5": { + 
"hashes": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "6c75d44e092ff24f", + "hash_cont_tokens": "c525a5de974c1ea3" + }, + "truncated": 0, + "non-truncated": 684, + "padded": 684, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|truthfulqa:mc|0": { + "hashes": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "2738d7ed7075faa7", + "hash_cont_tokens": "c014154380b74b9e" + }, + "truncated": 0, + "non-truncated": 9996, + "padded": 9996, + "non-padded": 0, + "effective_few_shots": 0.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "d84d18e9a963753d", + "hash_full_prompts": "12b540783521a8e6", + "hash_input_tokens": "5c73a7dce6ccf737", + "hash_cont_tokens": "fb1646e2bdd5fc38" + }, + "total_evaluation_time_secondes": "6762.326350450516", + "truncated": 0, + "non-truncated": 111019, + "padded": 110926, + "non-padded": 93, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/uukuguy/speechless-hermes-coig-lite-13b/results_2023-10-17T08-26-01.591650.json b/eval-results/uukuguy/speechless-hermes-coig-lite-13b/results_2023-10-17T08-26-01.591650.json new file mode 100644 index 0000000000000000000000000000000000000000..f4f6ed5c31c13b361b6c83fed09f5b05642f7f2c --- /dev/null +++ b/eval-results/uukuguy/speechless-hermes-coig-lite-13b/results_2023-10-17T08-26-01.591650.json @@ -0,0 +1,107 @@ +{ + "config_general": { + "model_name": "uukuguy/speechless-hermes-coig-lite-13b", + "model_sha": "ca518001456ece5040bc01e9f18131cc17b7f487", + "model_size": "24.32 GB", + "model_dtype": "torch.float16", + "lighteval_sha": "0f318ecf002208468154899217b3ba7c6ae09374", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|drop|3": { + "em": 0.34616191275167785, + "em_stderr": 0.00487207987824367, + "f1": 0.3925209731543626, + "f1_stderr": 0.004756667202146712 + }, + "harness|gsm8k|5": { + "acc": 0.10765731614859743, + "acc_stderr": 0.00853748400302334 + }, + "harness|winogrande|5": { + "acc": 0.7861089187056038, + "acc_stderr": 0.011524466954090252 + }, + "all": { + "em": 0.34616191275167785, + "em_stderr": 0.00487207987824367, + "f1": 0.3925209731543626, + "f1_stderr": 0.004756667202146712, + "acc": 0.44688311742710063, + "acc_stderr": 0.010030975478556796 + } + }, + "versions": { + "harness|drop|3": 1, + "harness|gsm8k|5": 0, + "harness|winogrande|5": 0, + "all": 0 + }, + "config_tasks": { + "harness|drop": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|drop|3": { + "hashes": { + "hash_examples": "1d27416e8324e9a3", + "hash_full_prompts": "a5513ff9a741b385", + "hash_input_tokens": "42076f0efbb50aa6", + "hash_cont_tokens": "fdb1e20350f49acb" + }, + "truncated": 3, + "non-truncated": 9533, + "padded": 0, + "non-padded": 9536, + "effective_few_shots": 3.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "bda342e47b5099b2", + "hash_cont_tokens": "b2df57dbcc6d3153" + }, + "truncated": 0, + "non-truncated": 1319, + "padded": 0, + "non-padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + 
"hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "c0bedf98cb040854", + "hash_cont_tokens": "f08975ad6f2d5864" + }, + "truncated": 0, + "non-truncated": 2534, + "padded": 2432, + "non-padded": 102, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "9b4d8993161e637d", + "hash_full_prompts": "08215e527b7e60a5", + "hash_input_tokens": "a12f3e3c934bd78b", + "hash_cont_tokens": "8c2ee370f6bf06c1" + }, + "total_evaluation_time_secondes": "8900.48944735527", + "truncated": 3, + "non-truncated": 13386, + "padded": 2432, + "non-padded": 10957, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/uukuguy/speechless-hermes-coig-lite-13b/results_2023-10-18T15-01-47.854586.json b/eval-results/uukuguy/speechless-hermes-coig-lite-13b/results_2023-10-18T15-01-47.854586.json new file mode 100644 index 0000000000000000000000000000000000000000..08238c774f34c01f078cb1db2fa21b0a84c65530 --- /dev/null +++ b/eval-results/uukuguy/speechless-hermes-coig-lite-13b/results_2023-10-18T15-01-47.854586.json @@ -0,0 +1,107 @@ +{ + "config_general": { + "model_name": "uukuguy/speechless-hermes-coig-lite-13b", + "model_sha": "ca518001456ece5040bc01e9f18131cc17b7f487", + "model_size": "24.32 GB", + "model_dtype": "torch.bfloat16", + "lighteval_sha": "0f318ecf002208468154899217b3ba7c6ae09374", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|drop|3": { + "em": 0.3490981543624161, + "em_stderr": 0.004881701038810246, + "f1": 0.39497588087248336, + "f1_stderr": 0.004768097534076323 + }, + "harness|gsm8k|5": { + "acc": 0.09855951478392722, + "acc_stderr": 0.008210320350946338 + }, + "harness|winogrande|5": { + "acc": 0.7853196527229677, + "acc_stderr": 0.011539912734345398 + }, + "all": { + "em": 0.3490981543624161, + "em_stderr": 0.004881701038810246, + "f1": 0.39497588087248336, + "f1_stderr": 0.004768097534076323, + "acc": 0.44193958375344744, + "acc_stderr": 0.009875116542645869 + } + }, + "versions": { + "harness|drop|3": 1, + "harness|gsm8k|5": 0, + "harness|winogrande|5": 0, + "all": 0 + }, + "config_tasks": { + "harness|drop": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|drop|3": { + "hashes": { + "hash_examples": "1d27416e8324e9a3", + "hash_full_prompts": "a5513ff9a741b385", + "hash_input_tokens": "42076f0efbb50aa6", + "hash_cont_tokens": "2423e4f4efb3fc0d" + }, + "truncated": 3, + "non-truncated": 9533, + "padded": 0, + "non-padded": 9536, + "effective_few_shots": 3.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "bda342e47b5099b2", + "hash_cont_tokens": "f9bc2a7dfd81cfde" + }, + "truncated": 0, + "non-truncated": 1319, + "padded": 0, + "non-padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "c0bedf98cb040854", + "hash_cont_tokens": "f08975ad6f2d5864" + }, + "truncated": 0, + "non-truncated": 2534, + "padded": 2432, + "non-padded": 102, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + 
"hash_examples": "9b4d8993161e637d", + "hash_full_prompts": "08215e527b7e60a5", + "hash_input_tokens": "a12f3e3c934bd78b", + "hash_cont_tokens": "d92ef9333fdb96fa" + }, + "total_evaluation_time_secondes": "9383.500284433365", + "truncated": 3, + "non-truncated": 13386, + "padded": 2432, + "non-padded": 10957, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/uukuguy/speechless-llama2-13b/results_2023-09-02T03-45-23.206143.json b/eval-results/uukuguy/speechless-llama2-13b/results_2023-09-02T03-45-23.206143.json new file mode 100644 index 0000000000000000000000000000000000000000..b1aa163c6fb667b1aad4ecdd46cb8cfd71fa7986 --- /dev/null +++ b/eval-results/uukuguy/speechless-llama2-13b/results_2023-09-02T03-45-23.206143.json @@ -0,0 +1,1366 @@ +{ + "config_general": { + "model_name": "uukuguy/speechless-llama2-13b", + "model_sha": "c6362c4fc0dc03420e3c08454b2e7689e4e32d3a", + "model_dtype": "torch.bfloat16", + "lighteval_sha": "4b9a38c2102259daeb17d1d0294fc2b4ce7f4e63", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|arc:challenge|25": { + "acc": 0.5767918088737202, + "acc_stderr": 0.014438036220848027, + "acc_norm": 0.606655290102389, + "acc_norm_stderr": 0.014275101465693028 + }, + "harness|hellaswag|10": { + "acc": 0.6236805417247561, + "acc_stderr": 0.004834715814208116, + "acc_norm": 0.8226448914558853, + "acc_norm_stderr": 0.003811883070911272 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.35, + "acc_stderr": 0.0479372485441102, + "acc_norm": 0.35, + "acc_norm_stderr": 0.0479372485441102 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.5481481481481482, + "acc_stderr": 0.042992689054808644, + "acc_norm": 0.5481481481481482, + "acc_norm_stderr": 0.042992689054808644 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.5723684210526315, + "acc_stderr": 0.04026097083296563, + "acc_norm": 0.5723684210526315, + "acc_norm_stderr": 0.04026097083296563 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.58, + "acc_stderr": 0.04960449637488583, + "acc_norm": 0.58, + "acc_norm_stderr": 0.04960449637488583 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.6264150943396226, + "acc_stderr": 0.02977308271331987, + "acc_norm": 0.6264150943396226, + "acc_norm_stderr": 0.02977308271331987 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.6736111111111112, + "acc_stderr": 0.03921067198982266, + "acc_norm": 0.6736111111111112, + "acc_norm_stderr": 0.03921067198982266 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.37, + "acc_stderr": 0.04852365870939099, + "acc_norm": 0.37, + "acc_norm_stderr": 0.04852365870939099 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.5, + "acc_stderr": 0.050251890762960605, + "acc_norm": 0.5, + "acc_norm_stderr": 0.050251890762960605 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.34, + "acc_stderr": 0.04760952285695236, + "acc_norm": 0.34, + "acc_norm_stderr": 0.04760952285695236 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.5375722543352601, + "acc_stderr": 0.0380168510452446, + "acc_norm": 0.5375722543352601, + "acc_norm_stderr": 0.0380168510452446 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.3137254901960784, + "acc_stderr": 0.04617034827006717, + "acc_norm": 0.3137254901960784, + "acc_norm_stderr": 0.04617034827006717 + }, + "harness|hendrycksTest-computer_security|5": { + 
"acc": 0.71, + "acc_stderr": 0.04560480215720685, + "acc_norm": 0.71, + "acc_norm_stderr": 0.04560480215720685 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.5063829787234042, + "acc_stderr": 0.032683358999363366, + "acc_norm": 0.5063829787234042, + "acc_norm_stderr": 0.032683358999363366 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.2631578947368421, + "acc_stderr": 0.041424397194893624, + "acc_norm": 0.2631578947368421, + "acc_norm_stderr": 0.041424397194893624 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.5310344827586206, + "acc_stderr": 0.04158632762097828, + "acc_norm": 0.5310344827586206, + "acc_norm_stderr": 0.04158632762097828 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.36507936507936506, + "acc_stderr": 0.024796060602699947, + "acc_norm": 0.36507936507936506, + "acc_norm_stderr": 0.024796060602699947 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.3968253968253968, + "acc_stderr": 0.043758884927270605, + "acc_norm": 0.3968253968253968, + "acc_norm_stderr": 0.043758884927270605 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.34, + "acc_stderr": 0.04760952285695235, + "acc_norm": 0.34, + "acc_norm_stderr": 0.04760952285695235 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.6806451612903226, + "acc_stderr": 0.026522709674667775, + "acc_norm": 0.6806451612903226, + "acc_norm_stderr": 0.026522709674667775 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.4729064039408867, + "acc_stderr": 0.03512819077876106, + "acc_norm": 0.4729064039408867, + "acc_norm_stderr": 0.03512819077876106 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.6, + "acc_stderr": 0.049236596391733084, + "acc_norm": 0.6, + "acc_norm_stderr": 0.049236596391733084 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.703030303030303, + "acc_stderr": 0.0356796977226805, + "acc_norm": 0.703030303030303, + "acc_norm_stderr": 0.0356796977226805 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.7474747474747475, + "acc_stderr": 0.030954055470365897, + "acc_norm": 0.7474747474747475, + "acc_norm_stderr": 0.030954055470365897 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.8393782383419689, + "acc_stderr": 0.02649905770139744, + "acc_norm": 0.8393782383419689, + "acc_norm_stderr": 0.02649905770139744 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.5948717948717949, + "acc_stderr": 0.024890471769938145, + "acc_norm": 0.5948717948717949, + "acc_norm_stderr": 0.024890471769938145 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.32222222222222224, + "acc_stderr": 0.0284934650910286, + "acc_norm": 0.32222222222222224, + "acc_norm_stderr": 0.0284934650910286 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.5882352941176471, + "acc_stderr": 0.03196876989195778, + "acc_norm": 0.5882352941176471, + "acc_norm_stderr": 0.03196876989195778 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.32450331125827814, + "acc_stderr": 0.038227469376587525, + "acc_norm": 0.32450331125827814, + "acc_norm_stderr": 0.038227469376587525 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.7963302752293578, + "acc_stderr": 0.017266742087630797, + "acc_norm": 0.7963302752293578, + "acc_norm_stderr": 0.017266742087630797 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.39814814814814814, 
+ "acc_stderr": 0.033384734032074016, + "acc_norm": 0.39814814814814814, + "acc_norm_stderr": 0.033384734032074016 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.7941176470588235, + "acc_stderr": 0.028379449451588667, + "acc_norm": 0.7941176470588235, + "acc_norm_stderr": 0.028379449451588667 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.7679324894514767, + "acc_stderr": 0.027479744550808514, + "acc_norm": 0.7679324894514767, + "acc_norm_stderr": 0.027479744550808514 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.6860986547085202, + "acc_stderr": 0.031146796482972465, + "acc_norm": 0.6860986547085202, + "acc_norm_stderr": 0.031146796482972465 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.6335877862595419, + "acc_stderr": 0.04225875451969638, + "acc_norm": 0.6335877862595419, + "acc_norm_stderr": 0.04225875451969638 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.7272727272727273, + "acc_stderr": 0.04065578140908705, + "acc_norm": 0.7272727272727273, + "acc_norm_stderr": 0.04065578140908705 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.7777777777777778, + "acc_stderr": 0.0401910747255735, + "acc_norm": 0.7777777777777778, + "acc_norm_stderr": 0.0401910747255735 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.6932515337423313, + "acc_stderr": 0.03623089915724147, + "acc_norm": 0.6932515337423313, + "acc_norm_stderr": 0.03623089915724147 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.4017857142857143, + "acc_stderr": 0.04653333146973646, + "acc_norm": 0.4017857142857143, + "acc_norm_stderr": 0.04653333146973646 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.7378640776699029, + "acc_stderr": 0.043546310772605956, + "acc_norm": 0.7378640776699029, + "acc_norm_stderr": 0.043546310772605956 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.8290598290598291, + "acc_stderr": 0.02466249684520982, + "acc_norm": 0.8290598290598291, + "acc_norm_stderr": 0.02466249684520982 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.61, + "acc_stderr": 0.04902071300001974, + "acc_norm": 0.61, + "acc_norm_stderr": 0.04902071300001974 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.7675606641123882, + "acc_stderr": 0.015104550008905706, + "acc_norm": 0.7675606641123882, + "acc_norm_stderr": 0.015104550008905706 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.6473988439306358, + "acc_stderr": 0.025722802200895806, + "acc_norm": 0.6473988439306358, + "acc_norm_stderr": 0.025722802200895806 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.41564245810055866, + "acc_stderr": 0.016482782187500662, + "acc_norm": 0.41564245810055866, + "acc_norm_stderr": 0.016482782187500662 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.6633986928104575, + "acc_stderr": 0.02705797462449438, + "acc_norm": 0.6633986928104575, + "acc_norm_stderr": 0.02705797462449438 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.6495176848874598, + "acc_stderr": 0.027098652621301757, + "acc_norm": 0.6495176848874598, + "acc_norm_stderr": 0.027098652621301757 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.654320987654321, + "acc_stderr": 0.02646248777700187, + "acc_norm": 0.654320987654321, + "acc_norm_stderr": 0.02646248777700187 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.45390070921985815, + "acc_stderr": 0.029700453247291474, + "acc_norm": 0.45390070921985815, + "acc_norm_stderr": 
0.029700453247291474 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.4361147327249022, + "acc_stderr": 0.012665568135455324, + "acc_norm": 0.4361147327249022, + "acc_norm_stderr": 0.012665568135455324 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.5955882352941176, + "acc_stderr": 0.029812630701569743, + "acc_norm": 0.5955882352941176, + "acc_norm_stderr": 0.029812630701569743 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.5686274509803921, + "acc_stderr": 0.02003639376835263, + "acc_norm": 0.5686274509803921, + "acc_norm_stderr": 0.02003639376835263 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.6909090909090909, + "acc_stderr": 0.044262946482000985, + "acc_norm": 0.6909090909090909, + "acc_norm_stderr": 0.044262946482000985 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.6653061224489796, + "acc_stderr": 0.030209235226242307, + "acc_norm": 0.6653061224489796, + "acc_norm_stderr": 0.030209235226242307 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.7611940298507462, + "acc_stderr": 0.03014777593540922, + "acc_norm": 0.7611940298507462, + "acc_norm_stderr": 0.03014777593540922 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.82, + "acc_stderr": 0.038612291966536934, + "acc_norm": 0.82, + "acc_norm_stderr": 0.038612291966536934 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.4819277108433735, + "acc_stderr": 0.038899512528272166, + "acc_norm": 0.4819277108433735, + "acc_norm_stderr": 0.038899512528272166 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.7543859649122807, + "acc_stderr": 0.03301405946987251, + "acc_norm": 0.7543859649122807, + "acc_norm_stderr": 0.03301405946987251 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.39167686658506734, + "mc1_stderr": 0.01708779588176963, + "mc2": 0.5665451524014927, + "mc2_stderr": 0.015598552282279922 + }, + "all": { + "acc": 0.5843572966745227, + "acc_stderr": 0.03408025367463418, + "acc_norm": 0.5882357344874007, + "acc_norm_stderr": 0.034060155920423135, + "mc1": 0.39167686658506734, + "mc1_stderr": 0.01708779588176963, + "mc2": 0.5665451524014927, + "mc2_stderr": 0.015598552282279922 + } + }, + "versions": { + "harness|arc:challenge|25": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + 
"harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "all": 0 + }, + "config_tasks": { + "harness|arc:challenge": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM 
Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task" + }, + "summary_tasks": { + "harness|arc:challenge|25": { + "hashes": { + "hash_examples": "17b0cae357c0259e", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "3722289b79076c44", + "hash_cont_tokens": "8210decc6ff6f7df" + }, + "truncated": 0, + "non-truncated": 4687, + "padded": 4687, + "non-padded": 0, + "effective_few_shots": 25.0, + "num_truncated_few_shots": 0 + }, + "harness|hellaswag|10": { + "hashes": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "ececd684171f1ef2", + "hash_cont_tokens": "b3b9e9017afa63af" + }, + "truncated": 0, + "non-truncated": 40168, + "padded": 40113, + "non-padded": 55, + "effective_few_shots": 10.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hashes": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "c54ff61ad0273dd7", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-anatomy|5": { + "hashes": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "be31a1e22aef5f90", + "hash_cont_tokens": "f11971a765cb609f" + }, + "truncated": 0, + "non-truncated": 540, + "padded": 540, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-astronomy|5": { + "hashes": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": 
"faf4e80f65de93ca", + "hash_input_tokens": "277a7b1fad566940", + "hash_cont_tokens": "bf30e5d3f48250cb" + }, + "truncated": 0, + "non-truncated": 608, + "padded": 608, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-business_ethics|5": { + "hashes": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "ba552605bc116de5", + "hash_cont_tokens": "bc1dd9b2d995eb61" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hashes": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "428c7563d0b98ab9", + "hash_cont_tokens": "890a119624b3b935" + }, + "truncated": 0, + "non-truncated": 1060, + "padded": 1060, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_biology|5": { + "hashes": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "da036601573942e2", + "hash_cont_tokens": "875cde3af7a0ee14" + }, + "truncated": 0, + "non-truncated": 576, + "padded": 576, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_chemistry|5": { + "hashes": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "94e0196d6aded13d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_computer_science|5": { + "hashes": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "6e4d0f4a8d36690b", + "hash_cont_tokens": "ffc0fe414cdc4a83" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_mathematics|5": { + "hashes": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "614054d17109a25d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_medicine|5": { + "hashes": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "081bb2b524defd1c", + "hash_cont_tokens": "1f88b00d41957d82" + }, + "truncated": 0, + "non-truncated": 692, + "padded": 692, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_physics|5": { + "hashes": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "5421d9a1af86cbd4", + "hash_cont_tokens": "f7b8097afc16a47c" + }, + "truncated": 0, + "non-truncated": 408, + "padded": 408, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-computer_security|5": { + "hashes": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "5e6b70ecb333cf18", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 
400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hashes": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "c2ef11a87264ceed", + "hash_cont_tokens": "aa0e8bc655f2f641" + }, + "truncated": 0, + "non-truncated": 940, + "padded": 940, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-econometrics|5": { + "hashes": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "ecaccd912a4c3978", + "hash_cont_tokens": "bfb7e3c3c88313f1" + }, + "truncated": 0, + "non-truncated": 456, + "padded": 456, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hashes": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "1590c84291399be8", + "hash_cont_tokens": "2425a3f084a591ef" + }, + "truncated": 0, + "non-truncated": 580, + "padded": 580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hashes": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "3269597f715b0da1", + "hash_cont_tokens": "f52691aef15a407b" + }, + "truncated": 0, + "non-truncated": 1512, + "padded": 1512, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-formal_logic|5": { + "hashes": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "a2800d20f3ab8d7c", + "hash_cont_tokens": "f515d598d9c21263" + }, + "truncated": 0, + "non-truncated": 504, + "padded": 504, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-global_facts|5": { + "hashes": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "94ed44b3772505ad", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_biology|5": { + "hashes": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "24423acb928db768", + "hash_cont_tokens": "bd85a4156a3613ee" + }, + "truncated": 0, + "non-truncated": 1240, + "padded": 1240, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hashes": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "831ff35c474e5cef", + "hash_cont_tokens": "a95c97af1c14e068" + }, + "truncated": 0, + "non-truncated": 812, + "padded": 812, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hashes": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "a20a96b44dcc5b30", + "hash_cont_tokens": "8abfedef914e33c9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + 
"harness|hendrycksTest-high_school_european_history|5": { + "hashes": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "5002f4ac8b1562ca", + "hash_cont_tokens": "674fc454bdc5ac93" + }, + "truncated": 0, + "non-truncated": 660, + "padded": 656, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_geography|5": { + "hashes": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "7c5547c7da5bc793", + "hash_cont_tokens": "03a5012b916274ea" + }, + "truncated": 0, + "non-truncated": 792, + "padded": 792, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hashes": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "f62991cb6a496b05", + "hash_cont_tokens": "a83effb8f76b7d7c" + }, + "truncated": 0, + "non-truncated": 772, + "padded": 772, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hashes": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "4cef2aff6e3d59ed", + "hash_cont_tokens": "c583432ad27fcfe0" + }, + "truncated": 0, + "non-truncated": 1560, + "padded": 1560, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hashes": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "6e2577ea4082ed2b", + "hash_cont_tokens": "24f5dc613660300b" + }, + "truncated": 0, + "non-truncated": 1080, + "padded": 1080, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hashes": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "c5fc9aeb1079c8e4", + "hash_cont_tokens": "f47f041de50333b9" + }, + "truncated": 0, + "non-truncated": 952, + "padded": 952, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_physics|5": { + "hashes": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "555fc385cffa84ca", + "hash_cont_tokens": "ba2efcd283e938cc" + }, + "truncated": 0, + "non-truncated": 604, + "padded": 604, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hashes": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "febd23cbf9973b7f", + "hash_cont_tokens": "942069cd363844d9" + }, + "truncated": 0, + "non-truncated": 2180, + "padded": 2180, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hashes": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "400e55b56ee6fbd7", + "hash_cont_tokens": "955ed42b6f7fa019" + }, + "truncated": 0, + "non-truncated": 864, + "padded": 864, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hashes": { + 
"hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "c639cce12a46ebad", + "hash_cont_tokens": "cdd0b3dc06d933e5" + }, + "truncated": 0, + "non-truncated": 816, + "padded": 816, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hashes": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "b9762065cce6f3a6", + "hash_cont_tokens": "9a864184946033ac" + }, + "truncated": 0, + "non-truncated": 948, + "padded": 948, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_aging|5": { + "hashes": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "541a75f071dcf579", + "hash_cont_tokens": "142a4a8a1138a214" + }, + "truncated": 0, + "non-truncated": 892, + "padded": 892, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_sexuality|5": { + "hashes": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "04269e5c5a257dd9", + "hash_cont_tokens": "bc54813e809b796d" + }, + "truncated": 0, + "non-truncated": 524, + "padded": 524, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-international_law|5": { + "hashes": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "d93ba9d9d38e4397", + "hash_cont_tokens": "dc45b45fcda18e5d" + }, + "truncated": 0, + "non-truncated": 484, + "padded": 484, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-jurisprudence|5": { + "hashes": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "9eeaccd2698b4f5a", + "hash_cont_tokens": "e3a8cd951b6e3469" + }, + "truncated": 0, + "non-truncated": 432, + "padded": 432, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hashes": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "b4f08f544f2b7576", + "hash_cont_tokens": "1e80dbd30f6453d5" + }, + "truncated": 0, + "non-truncated": 652, + "padded": 648, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-machine_learning|5": { + "hashes": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "900c2a51f1174b9f", + "hash_cont_tokens": "9b37da7777378ca9" + }, + "truncated": 0, + "non-truncated": 448, + "padded": 448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-management|5": { + "hashes": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "6b36efb4689c6eca", + "hash_cont_tokens": "a01d6d39a83c4597" + }, + "truncated": 0, + "non-truncated": 412, + "padded": 412, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-marketing|5": { + "hashes": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "2aaac78a0cfed47a", + "hash_cont_tokens": "6aeaed4d823c98aa" + }, + 
"truncated": 0, + "non-truncated": 936, + "padded": 936, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-medical_genetics|5": { + "hashes": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "886ca823b41c094a", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-miscellaneous|5": { + "hashes": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "72fd71de7675e7d0", + "hash_cont_tokens": "9b0ab02a64603081" + }, + "truncated": 0, + "non-truncated": 3132, + "padded": 3132, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_disputes|5": { + "hashes": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "f3ca0dd8e7a1eb09", + "hash_cont_tokens": "8badf768f7b0467a" + }, + "truncated": 0, + "non-truncated": 1384, + "padded": 1354, + "non-padded": 30, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hashes": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "3e793631e951f23c", + "hash_cont_tokens": "32ae620376b2bbba" + }, + "truncated": 0, + "non-truncated": 3580, + "padded": 3580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-nutrition|5": { + "hashes": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "59753c2144ea93af", + "hash_cont_tokens": "3071def75bacc404" + }, + "truncated": 0, + "non-truncated": 1224, + "padded": 1224, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-philosophy|5": { + "hashes": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "bd8d3dbed15a8c34", + "hash_cont_tokens": "9f6ff69d23a48783" + }, + "truncated": 0, + "non-truncated": 1244, + "padded": 1244, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-prehistory|5": { + "hashes": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "3573cd87facbb7c5", + "hash_cont_tokens": "de469d2b981e32a3" + }, + "truncated": 0, + "non-truncated": 1296, + "padded": 1296, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_accounting|5": { + "hashes": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "17e721bc1a7cbb47", + "hash_cont_tokens": "c46f74d2dfc7b13b" + }, + "truncated": 0, + "non-truncated": 1128, + "padded": 1128, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_law|5": { + "hashes": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "c9f7583fff66d361", + "hash_cont_tokens": "2e590029ef41fbcd" + }, + "truncated": 0, + "non-truncated": 6136, + "padded": 6136, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + 
"harness|hendrycksTest-professional_medicine|5": { + "hashes": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "40a933f829116f8d", + "hash_cont_tokens": "fe35cfa9c6ca802e" + }, + "truncated": 0, + "non-truncated": 1088, + "padded": 1088, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_psychology|5": { + "hashes": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "0dfb73a8eb3f692c", + "hash_cont_tokens": "f020fbddf72c8652" + }, + "truncated": 0, + "non-truncated": 2448, + "padded": 2448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-public_relations|5": { + "hashes": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "1710c6ba4c9f3cbd", + "hash_cont_tokens": "568f585a259965c1" + }, + "truncated": 0, + "non-truncated": 440, + "padded": 440, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-security_studies|5": { + "hashes": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "32a03f1f22a6e103", + "hash_cont_tokens": "cc6fd7cccd64cd5d" + }, + "truncated": 0, + "non-truncated": 980, + "padded": 980, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-sociology|5": { + "hashes": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "828999f7624cbe7e", + "hash_cont_tokens": "c3a3bdfd177eed5b" + }, + "truncated": 0, + "non-truncated": 804, + "padded": 804, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hashes": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "42054621e718dbee", + "hash_cont_tokens": "2568d0e8e36fa959" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-virology|5": { + "hashes": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "6c4f0aa4dc859c04", + "hash_cont_tokens": "926cf60b0891f374" + }, + "truncated": 0, + "non-truncated": 664, + "padded": 664, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-world_religions|5": { + "hashes": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "6c75d44e092ff24f", + "hash_cont_tokens": "c525a5de974c1ea3" + }, + "truncated": 0, + "non-truncated": 684, + "padded": 684, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|truthfulqa:mc|0": { + "hashes": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "2738d7ed7075faa7", + "hash_cont_tokens": "c014154380b74b9e" + }, + "truncated": 0, + "non-truncated": 9996, + "padded": 9996, + "non-padded": 0, + "effective_few_shots": 0.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "d84d18e9a963753d", + "hash_full_prompts": "12b540783521a8e6", + "hash_input_tokens": "5c73a7dce6ccf737", + 
"hash_cont_tokens": "fb1646e2bdd5fc38" + }, + "total_evaluation_time_secondes": "6752.605266332626", + "truncated": 0, + "non-truncated": 111019, + "padded": 110926, + "non-padded": 93, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/uukuguy/speechless-llama2-13b/results_2023-09-02T15-58-18.299905.json b/eval-results/uukuguy/speechless-llama2-13b/results_2023-09-02T15-58-18.299905.json new file mode 100644 index 0000000000000000000000000000000000000000..5a788179f36b09730ecb41a6f911256c0c4bd2c7 --- /dev/null +++ b/eval-results/uukuguy/speechless-llama2-13b/results_2023-09-02T15-58-18.299905.json @@ -0,0 +1,1366 @@ +{ + "config_general": { + "model_name": "uukuguy/speechless-llama2-13b", + "model_sha": "752a5591e0e7b8e00599e080a75059783dcc9b8f", + "model_dtype": "torch.bfloat16", + "lighteval_sha": "4b9a38c2102259daeb17d1d0294fc2b4ce7f4e63", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|arc:challenge|25": { + "acc": 0.5784982935153583, + "acc_stderr": 0.014430197069326023, + "acc_norm": 0.6203071672354948, + "acc_norm_stderr": 0.014182119866974872 + }, + "harness|hellaswag|10": { + "acc": 0.6214897430790679, + "acc_stderr": 0.004840244782805304, + "acc_norm": 0.8184624576777534, + "acc_norm_stderr": 0.0038467514306295384 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.33, + "acc_stderr": 0.04725815626252605, + "acc_norm": 0.33, + "acc_norm_stderr": 0.04725815626252605 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.5481481481481482, + "acc_stderr": 0.04299268905480864, + "acc_norm": 0.5481481481481482, + "acc_norm_stderr": 0.04299268905480864 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.5657894736842105, + "acc_stderr": 0.0403356566784832, + "acc_norm": 0.5657894736842105, + "acc_norm_stderr": 0.0403356566784832 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.57, + "acc_stderr": 0.049756985195624284, + "acc_norm": 0.57, + "acc_norm_stderr": 0.049756985195624284 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.630188679245283, + "acc_stderr": 0.02971142188010793, + "acc_norm": 0.630188679245283, + "acc_norm_stderr": 0.02971142188010793 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.625, + "acc_stderr": 0.04048439222695598, + "acc_norm": 0.625, + "acc_norm_stderr": 0.04048439222695598 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.42, + "acc_stderr": 0.049604496374885836, + "acc_norm": 0.42, + "acc_norm_stderr": 0.049604496374885836 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.5, + "acc_stderr": 0.050251890762960605, + "acc_norm": 0.5, + "acc_norm_stderr": 0.050251890762960605 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.33, + "acc_stderr": 0.047258156262526045, + "acc_norm": 0.33, + "acc_norm_stderr": 0.047258156262526045 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.5491329479768786, + "acc_stderr": 0.037940126746970296, + "acc_norm": 0.5491329479768786, + "acc_norm_stderr": 0.037940126746970296 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.3333333333333333, + "acc_stderr": 0.04690650298201942, + "acc_norm": 0.3333333333333333, + "acc_norm_stderr": 0.04690650298201942 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.7, + "acc_stderr": 0.046056618647183814, + "acc_norm": 0.7, + "acc_norm_stderr": 0.046056618647183814 + }, + 
"harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.49361702127659574, + "acc_stderr": 0.032683358999363366, + "acc_norm": 0.49361702127659574, + "acc_norm_stderr": 0.032683358999363366 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.2894736842105263, + "acc_stderr": 0.04266339443159394, + "acc_norm": 0.2894736842105263, + "acc_norm_stderr": 0.04266339443159394 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.5448275862068965, + "acc_stderr": 0.04149886942192118, + "acc_norm": 0.5448275862068965, + "acc_norm_stderr": 0.04149886942192118 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.36507936507936506, + "acc_stderr": 0.02479606060269994, + "acc_norm": 0.36507936507936506, + "acc_norm_stderr": 0.02479606060269994 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.35714285714285715, + "acc_stderr": 0.042857142857142816, + "acc_norm": 0.35714285714285715, + "acc_norm_stderr": 0.042857142857142816 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.33, + "acc_stderr": 0.04725815626252604, + "acc_norm": 0.33, + "acc_norm_stderr": 0.04725815626252604 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.6870967741935484, + "acc_stderr": 0.026377567028645854, + "acc_norm": 0.6870967741935484, + "acc_norm_stderr": 0.026377567028645854 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.49261083743842365, + "acc_stderr": 0.03517603540361008, + "acc_norm": 0.49261083743842365, + "acc_norm_stderr": 0.03517603540361008 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.63, + "acc_stderr": 0.04852365870939099, + "acc_norm": 0.63, + "acc_norm_stderr": 0.04852365870939099 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.703030303030303, + "acc_stderr": 0.03567969772268049, + "acc_norm": 0.703030303030303, + "acc_norm_stderr": 0.03567969772268049 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.7626262626262627, + "acc_stderr": 0.0303137105381989, + "acc_norm": 0.7626262626262627, + "acc_norm_stderr": 0.0303137105381989 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.844559585492228, + "acc_stderr": 0.02614848346915331, + "acc_norm": 0.844559585492228, + "acc_norm_stderr": 0.02614848346915331 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.6256410256410256, + "acc_stderr": 0.0245375915728305, + "acc_norm": 0.6256410256410256, + "acc_norm_stderr": 0.0245375915728305 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.3111111111111111, + "acc_stderr": 0.02822644674968352, + "acc_norm": 0.3111111111111111, + "acc_norm_stderr": 0.02822644674968352 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.6008403361344538, + "acc_stderr": 0.03181110032413926, + "acc_norm": 0.6008403361344538, + "acc_norm_stderr": 0.03181110032413926 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.31125827814569534, + "acc_stderr": 0.03780445850526733, + "acc_norm": 0.31125827814569534, + "acc_norm_stderr": 0.03780445850526733 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.7908256880733945, + "acc_stderr": 0.01743793717334323, + "acc_norm": 0.7908256880733945, + "acc_norm_stderr": 0.01743793717334323 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.4027777777777778, + "acc_stderr": 0.033448873829978666, + "acc_norm": 0.4027777777777778, + "acc_norm_stderr": 0.033448873829978666 + }, + 
"harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.7990196078431373, + "acc_stderr": 0.02812597226565438, + "acc_norm": 0.7990196078431373, + "acc_norm_stderr": 0.02812597226565438 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.7763713080168776, + "acc_stderr": 0.027123298205229966, + "acc_norm": 0.7763713080168776, + "acc_norm_stderr": 0.027123298205229966 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.6905829596412556, + "acc_stderr": 0.03102441174057221, + "acc_norm": 0.6905829596412556, + "acc_norm_stderr": 0.03102441174057221 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.6412213740458015, + "acc_stderr": 0.04206739313864908, + "acc_norm": 0.6412213740458015, + "acc_norm_stderr": 0.04206739313864908 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.7272727272727273, + "acc_stderr": 0.04065578140908706, + "acc_norm": 0.7272727272727273, + "acc_norm_stderr": 0.04065578140908706 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.7685185185185185, + "acc_stderr": 0.04077494709252627, + "acc_norm": 0.7685185185185185, + "acc_norm_stderr": 0.04077494709252627 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.6809815950920245, + "acc_stderr": 0.03661997551073836, + "acc_norm": 0.6809815950920245, + "acc_norm_stderr": 0.03661997551073836 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.39285714285714285, + "acc_stderr": 0.04635550135609976, + "acc_norm": 0.39285714285714285, + "acc_norm_stderr": 0.04635550135609976 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.7572815533980582, + "acc_stderr": 0.04245022486384495, + "acc_norm": 0.7572815533980582, + "acc_norm_stderr": 0.04245022486384495 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.8376068376068376, + "acc_stderr": 0.02416161812798774, + "acc_norm": 0.8376068376068376, + "acc_norm_stderr": 0.02416161812798774 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.59, + "acc_stderr": 0.04943110704237102, + "acc_norm": 0.59, + "acc_norm_stderr": 0.04943110704237102 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.7637292464878672, + "acc_stderr": 0.015190473717037495, + "acc_norm": 0.7637292464878672, + "acc_norm_stderr": 0.015190473717037495 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.6473988439306358, + "acc_stderr": 0.025722802200895803, + "acc_norm": 0.6473988439306358, + "acc_norm_stderr": 0.025722802200895803 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.41564245810055866, + "acc_stderr": 0.016482782187500662, + "acc_norm": 0.41564245810055866, + "acc_norm_stderr": 0.016482782187500662 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.6568627450980392, + "acc_stderr": 0.027184498909941616, + "acc_norm": 0.6568627450980392, + "acc_norm_stderr": 0.027184498909941616 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.6527331189710611, + "acc_stderr": 0.027040745502307336, + "acc_norm": 0.6527331189710611, + "acc_norm_stderr": 0.027040745502307336 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.6450617283950617, + "acc_stderr": 0.02662415247884585, + "acc_norm": 0.6450617283950617, + "acc_norm_stderr": 0.02662415247884585 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.45390070921985815, + "acc_stderr": 0.029700453247291474, + "acc_norm": 0.45390070921985815, + "acc_norm_stderr": 0.029700453247291474 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.4367666232073012, + "acc_stderr": 
0.012667701919603654, + "acc_norm": 0.4367666232073012, + "acc_norm_stderr": 0.012667701919603654 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.5808823529411765, + "acc_stderr": 0.029972807170464622, + "acc_norm": 0.5808823529411765, + "acc_norm_stderr": 0.029972807170464622 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.5669934640522876, + "acc_stderr": 0.02004544247332422, + "acc_norm": 0.5669934640522876, + "acc_norm_stderr": 0.02004544247332422 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.7, + "acc_stderr": 0.04389311454644287, + "acc_norm": 0.7, + "acc_norm_stderr": 0.04389311454644287 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.6775510204081633, + "acc_stderr": 0.029923100563683906, + "acc_norm": 0.6775510204081633, + "acc_norm_stderr": 0.029923100563683906 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.7562189054726368, + "acc_stderr": 0.030360490154014645, + "acc_norm": 0.7562189054726368, + "acc_norm_stderr": 0.030360490154014645 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.83, + "acc_stderr": 0.0377525168068637, + "acc_norm": 0.83, + "acc_norm_stderr": 0.0377525168068637 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.4879518072289157, + "acc_stderr": 0.03891364495835821, + "acc_norm": 0.4879518072289157, + "acc_norm_stderr": 0.03891364495835821 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.7777777777777778, + "acc_stderr": 0.031885780176863984, + "acc_norm": 0.7777777777777778, + "acc_norm_stderr": 0.031885780176863984 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.38922888616891066, + "mc1_stderr": 0.01706855268069033, + "mc2": 0.5570343878702176, + "mc2_stderr": 0.015440009877017729 + }, + "all": { + "acc": 0.585682771832991, + "acc_stderr": 0.03402069180161956, + "acc_norm": 0.5897299173977169, + "acc_norm_stderr": 0.03399964823289876, + "mc1": 0.38922888616891066, + "mc1_stderr": 0.01706855268069033, + "mc2": 0.5570343878702176, + "mc2_stderr": 0.015440009877017729 + } + }, + "versions": { + "harness|arc:challenge|25": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + 
"harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "all": 0 + }, + "config_tasks": { + "harness|arc:challenge": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + 
"harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task" + }, + "summary_tasks": { + "harness|arc:challenge|25": { + "hashes": { + "hash_examples": "17b0cae357c0259e", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "3722289b79076c44", + "hash_cont_tokens": "8210decc6ff6f7df" + }, + "truncated": 0, + "non-truncated": 4687, + "padded": 4687, + "non-padded": 0, + "effective_few_shots": 25.0, + "num_truncated_few_shots": 0 + }, + "harness|hellaswag|10": { + "hashes": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "ececd684171f1ef2", + "hash_cont_tokens": "b3b9e9017afa63af" + }, + "truncated": 0, + "non-truncated": 40168, + "padded": 40113, + "non-padded": 55, + "effective_few_shots": 10.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hashes": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "c54ff61ad0273dd7", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-anatomy|5": { + "hashes": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "be31a1e22aef5f90", + "hash_cont_tokens": "f11971a765cb609f" + }, + "truncated": 0, + "non-truncated": 540, + "padded": 540, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-astronomy|5": { + "hashes": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "277a7b1fad566940", + "hash_cont_tokens": "bf30e5d3f48250cb" + }, + "truncated": 0, + "non-truncated": 608, + 
"padded": 608, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-business_ethics|5": { + "hashes": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "ba552605bc116de5", + "hash_cont_tokens": "bc1dd9b2d995eb61" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hashes": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "428c7563d0b98ab9", + "hash_cont_tokens": "890a119624b3b935" + }, + "truncated": 0, + "non-truncated": 1060, + "padded": 1060, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_biology|5": { + "hashes": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "da036601573942e2", + "hash_cont_tokens": "875cde3af7a0ee14" + }, + "truncated": 0, + "non-truncated": 576, + "padded": 576, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_chemistry|5": { + "hashes": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "94e0196d6aded13d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_computer_science|5": { + "hashes": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "6e4d0f4a8d36690b", + "hash_cont_tokens": "ffc0fe414cdc4a83" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_mathematics|5": { + "hashes": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "614054d17109a25d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_medicine|5": { + "hashes": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "081bb2b524defd1c", + "hash_cont_tokens": "1f88b00d41957d82" + }, + "truncated": 0, + "non-truncated": 692, + "padded": 692, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_physics|5": { + "hashes": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "5421d9a1af86cbd4", + "hash_cont_tokens": "f7b8097afc16a47c" + }, + "truncated": 0, + "non-truncated": 408, + "padded": 408, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-computer_security|5": { + "hashes": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "5e6b70ecb333cf18", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-conceptual_physics|5": 
{ + "hashes": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "c2ef11a87264ceed", + "hash_cont_tokens": "aa0e8bc655f2f641" + }, + "truncated": 0, + "non-truncated": 940, + "padded": 940, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-econometrics|5": { + "hashes": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "ecaccd912a4c3978", + "hash_cont_tokens": "bfb7e3c3c88313f1" + }, + "truncated": 0, + "non-truncated": 456, + "padded": 456, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hashes": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "1590c84291399be8", + "hash_cont_tokens": "2425a3f084a591ef" + }, + "truncated": 0, + "non-truncated": 580, + "padded": 580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hashes": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "3269597f715b0da1", + "hash_cont_tokens": "f52691aef15a407b" + }, + "truncated": 0, + "non-truncated": 1512, + "padded": 1512, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-formal_logic|5": { + "hashes": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "a2800d20f3ab8d7c", + "hash_cont_tokens": "f515d598d9c21263" + }, + "truncated": 0, + "non-truncated": 504, + "padded": 504, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-global_facts|5": { + "hashes": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "94ed44b3772505ad", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_biology|5": { + "hashes": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "24423acb928db768", + "hash_cont_tokens": "bd85a4156a3613ee" + }, + "truncated": 0, + "non-truncated": 1240, + "padded": 1240, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hashes": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "831ff35c474e5cef", + "hash_cont_tokens": "a95c97af1c14e068" + }, + "truncated": 0, + "non-truncated": 812, + "padded": 812, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hashes": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "a20a96b44dcc5b30", + "hash_cont_tokens": "8abfedef914e33c9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hashes": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": 
"5002f4ac8b1562ca", + "hash_cont_tokens": "674fc454bdc5ac93" + }, + "truncated": 0, + "non-truncated": 660, + "padded": 656, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_geography|5": { + "hashes": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "7c5547c7da5bc793", + "hash_cont_tokens": "03a5012b916274ea" + }, + "truncated": 0, + "non-truncated": 792, + "padded": 792, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hashes": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "f62991cb6a496b05", + "hash_cont_tokens": "a83effb8f76b7d7c" + }, + "truncated": 0, + "non-truncated": 772, + "padded": 772, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hashes": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "4cef2aff6e3d59ed", + "hash_cont_tokens": "c583432ad27fcfe0" + }, + "truncated": 0, + "non-truncated": 1560, + "padded": 1560, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hashes": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "6e2577ea4082ed2b", + "hash_cont_tokens": "24f5dc613660300b" + }, + "truncated": 0, + "non-truncated": 1080, + "padded": 1080, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hashes": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "c5fc9aeb1079c8e4", + "hash_cont_tokens": "f47f041de50333b9" + }, + "truncated": 0, + "non-truncated": 952, + "padded": 952, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_physics|5": { + "hashes": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "555fc385cffa84ca", + "hash_cont_tokens": "ba2efcd283e938cc" + }, + "truncated": 0, + "non-truncated": 604, + "padded": 604, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hashes": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "febd23cbf9973b7f", + "hash_cont_tokens": "942069cd363844d9" + }, + "truncated": 0, + "non-truncated": 2180, + "padded": 2180, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hashes": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "400e55b56ee6fbd7", + "hash_cont_tokens": "955ed42b6f7fa019" + }, + "truncated": 0, + "non-truncated": 864, + "padded": 864, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hashes": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "c639cce12a46ebad", + "hash_cont_tokens": "cdd0b3dc06d933e5" + }, + "truncated": 0, 
+ "non-truncated": 816, + "padded": 816, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hashes": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "b9762065cce6f3a6", + "hash_cont_tokens": "9a864184946033ac" + }, + "truncated": 0, + "non-truncated": 948, + "padded": 948, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_aging|5": { + "hashes": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "541a75f071dcf579", + "hash_cont_tokens": "142a4a8a1138a214" + }, + "truncated": 0, + "non-truncated": 892, + "padded": 892, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_sexuality|5": { + "hashes": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "04269e5c5a257dd9", + "hash_cont_tokens": "bc54813e809b796d" + }, + "truncated": 0, + "non-truncated": 524, + "padded": 524, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-international_law|5": { + "hashes": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "d93ba9d9d38e4397", + "hash_cont_tokens": "dc45b45fcda18e5d" + }, + "truncated": 0, + "non-truncated": 484, + "padded": 484, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-jurisprudence|5": { + "hashes": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "9eeaccd2698b4f5a", + "hash_cont_tokens": "e3a8cd951b6e3469" + }, + "truncated": 0, + "non-truncated": 432, + "padded": 432, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hashes": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "b4f08f544f2b7576", + "hash_cont_tokens": "1e80dbd30f6453d5" + }, + "truncated": 0, + "non-truncated": 652, + "padded": 648, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-machine_learning|5": { + "hashes": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "900c2a51f1174b9f", + "hash_cont_tokens": "9b37da7777378ca9" + }, + "truncated": 0, + "non-truncated": 448, + "padded": 448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-management|5": { + "hashes": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "6b36efb4689c6eca", + "hash_cont_tokens": "a01d6d39a83c4597" + }, + "truncated": 0, + "non-truncated": 412, + "padded": 412, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-marketing|5": { + "hashes": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "2aaac78a0cfed47a", + "hash_cont_tokens": "6aeaed4d823c98aa" + }, + "truncated": 0, + "non-truncated": 936, + "padded": 936, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-medical_genetics|5": 
{ + "hashes": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "886ca823b41c094a", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-miscellaneous|5": { + "hashes": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "72fd71de7675e7d0", + "hash_cont_tokens": "9b0ab02a64603081" + }, + "truncated": 0, + "non-truncated": 3132, + "padded": 3132, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_disputes|5": { + "hashes": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "f3ca0dd8e7a1eb09", + "hash_cont_tokens": "8badf768f7b0467a" + }, + "truncated": 0, + "non-truncated": 1384, + "padded": 1354, + "non-padded": 30, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hashes": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "3e793631e951f23c", + "hash_cont_tokens": "32ae620376b2bbba" + }, + "truncated": 0, + "non-truncated": 3580, + "padded": 3580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-nutrition|5": { + "hashes": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "59753c2144ea93af", + "hash_cont_tokens": "3071def75bacc404" + }, + "truncated": 0, + "non-truncated": 1224, + "padded": 1224, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-philosophy|5": { + "hashes": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "bd8d3dbed15a8c34", + "hash_cont_tokens": "9f6ff69d23a48783" + }, + "truncated": 0, + "non-truncated": 1244, + "padded": 1244, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-prehistory|5": { + "hashes": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "3573cd87facbb7c5", + "hash_cont_tokens": "de469d2b981e32a3" + }, + "truncated": 0, + "non-truncated": 1296, + "padded": 1296, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_accounting|5": { + "hashes": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "17e721bc1a7cbb47", + "hash_cont_tokens": "c46f74d2dfc7b13b" + }, + "truncated": 0, + "non-truncated": 1128, + "padded": 1128, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_law|5": { + "hashes": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "c9f7583fff66d361", + "hash_cont_tokens": "2e590029ef41fbcd" + }, + "truncated": 0, + "non-truncated": 6136, + "padded": 6136, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_medicine|5": { + "hashes": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "40a933f829116f8d", + 
"hash_cont_tokens": "fe35cfa9c6ca802e" + }, + "truncated": 0, + "non-truncated": 1088, + "padded": 1088, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_psychology|5": { + "hashes": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "0dfb73a8eb3f692c", + "hash_cont_tokens": "f020fbddf72c8652" + }, + "truncated": 0, + "non-truncated": 2448, + "padded": 2448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-public_relations|5": { + "hashes": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "1710c6ba4c9f3cbd", + "hash_cont_tokens": "568f585a259965c1" + }, + "truncated": 0, + "non-truncated": 440, + "padded": 440, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-security_studies|5": { + "hashes": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "32a03f1f22a6e103", + "hash_cont_tokens": "cc6fd7cccd64cd5d" + }, + "truncated": 0, + "non-truncated": 980, + "padded": 980, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-sociology|5": { + "hashes": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "828999f7624cbe7e", + "hash_cont_tokens": "c3a3bdfd177eed5b" + }, + "truncated": 0, + "non-truncated": 804, + "padded": 804, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hashes": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "42054621e718dbee", + "hash_cont_tokens": "2568d0e8e36fa959" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-virology|5": { + "hashes": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "6c4f0aa4dc859c04", + "hash_cont_tokens": "926cf60b0891f374" + }, + "truncated": 0, + "non-truncated": 664, + "padded": 664, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-world_religions|5": { + "hashes": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "6c75d44e092ff24f", + "hash_cont_tokens": "c525a5de974c1ea3" + }, + "truncated": 0, + "non-truncated": 684, + "padded": 684, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|truthfulqa:mc|0": { + "hashes": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "2738d7ed7075faa7", + "hash_cont_tokens": "c014154380b74b9e" + }, + "truncated": 0, + "non-truncated": 9996, + "padded": 9996, + "non-padded": 0, + "effective_few_shots": 0.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "d84d18e9a963753d", + "hash_full_prompts": "12b540783521a8e6", + "hash_input_tokens": "5c73a7dce6ccf737", + "hash_cont_tokens": "fb1646e2bdd5fc38" + }, + "total_evaluation_time_secondes": "6748.280304431915", + "truncated": 0, + "non-truncated": 111019, + "padded": 110926, + "non-padded": 93, + 
"num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/uukuguy/speechless-llama2-13b/results_2023-09-12T13-57-07.476950.json b/eval-results/uukuguy/speechless-llama2-13b/results_2023-09-12T13-57-07.476950.json new file mode 100644 index 0000000000000000000000000000000000000000..a9db623eb7ce40acdb6ea4b4973b1ae8f4382f77 --- /dev/null +++ b/eval-results/uukuguy/speechless-llama2-13b/results_2023-09-12T13-57-07.476950.json @@ -0,0 +1,1367 @@ +{ + "config_general": { + "model_name": "uukuguy/speechless-llama2-13b", + "model_sha": "5341819accf229a625b163b5611aa973cf9f9718", + "model_size": "24.32 GB", + "model_dtype": "torch.float16", + "lighteval_sha": "ff467795ccc45b291b69333c263d5f16abd1fcd9", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|arc:challenge|25": { + "acc": 0.5776450511945392, + "acc_stderr": 0.014434138713379983, + "acc_norm": 0.6220136518771331, + "acc_norm_stderr": 0.0141696645203031 + }, + "harness|hellaswag|10": { + "acc": 0.6210914160525791, + "acc_stderr": 0.004841238763529372, + "acc_norm": 0.81876120294762, + "acc_norm_stderr": 0.003844286350624635 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.33, + "acc_stderr": 0.04725815626252605, + "acc_norm": 0.33, + "acc_norm_stderr": 0.04725815626252605 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.562962962962963, + "acc_stderr": 0.04284958639753401, + "acc_norm": 0.562962962962963, + "acc_norm_stderr": 0.04284958639753401 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.5723684210526315, + "acc_stderr": 0.04026097083296563, + "acc_norm": 0.5723684210526315, + "acc_norm_stderr": 0.04026097083296563 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.57, + "acc_stderr": 0.049756985195624284, + "acc_norm": 0.57, + "acc_norm_stderr": 0.049756985195624284 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.630188679245283, + "acc_stderr": 0.02971142188010793, + "acc_norm": 0.630188679245283, + "acc_norm_stderr": 0.02971142188010793 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.6458333333333334, + "acc_stderr": 0.039994111357535424, + "acc_norm": 0.6458333333333334, + "acc_norm_stderr": 0.039994111357535424 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.44, + "acc_stderr": 0.04988876515698589, + "acc_norm": 0.44, + "acc_norm_stderr": 0.04988876515698589 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.52, + "acc_stderr": 0.050211673156867795, + "acc_norm": 0.52, + "acc_norm_stderr": 0.050211673156867795 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.33, + "acc_stderr": 0.047258156262526045, + "acc_norm": 0.33, + "acc_norm_stderr": 0.047258156262526045 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.5549132947976878, + "acc_stderr": 0.03789401760283647, + "acc_norm": 0.5549132947976878, + "acc_norm_stderr": 0.03789401760283647 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.3235294117647059, + "acc_stderr": 0.04655010411319616, + "acc_norm": 0.3235294117647059, + "acc_norm_stderr": 0.04655010411319616 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.7, + "acc_stderr": 0.046056618647183814, + "acc_norm": 0.7, + "acc_norm_stderr": 0.046056618647183814 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.4978723404255319, + "acc_stderr": 0.03268572658667492, + "acc_norm": 0.4978723404255319, + 
"acc_norm_stderr": 0.03268572658667492 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.2719298245614035, + "acc_stderr": 0.04185774424022056, + "acc_norm": 0.2719298245614035, + "acc_norm_stderr": 0.04185774424022056 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.5448275862068965, + "acc_stderr": 0.04149886942192118, + "acc_norm": 0.5448275862068965, + "acc_norm_stderr": 0.04149886942192118 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.373015873015873, + "acc_stderr": 0.02490699045899257, + "acc_norm": 0.373015873015873, + "acc_norm_stderr": 0.02490699045899257 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.36507936507936506, + "acc_stderr": 0.04306241259127152, + "acc_norm": 0.36507936507936506, + "acc_norm_stderr": 0.04306241259127152 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.34, + "acc_stderr": 0.04760952285695236, + "acc_norm": 0.34, + "acc_norm_stderr": 0.04760952285695236 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.6903225806451613, + "acc_stderr": 0.026302774983517414, + "acc_norm": 0.6903225806451613, + "acc_norm_stderr": 0.026302774983517414 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.5024630541871922, + "acc_stderr": 0.03517945038691063, + "acc_norm": 0.5024630541871922, + "acc_norm_stderr": 0.03517945038691063 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.63, + "acc_stderr": 0.04852365870939099, + "acc_norm": 0.63, + "acc_norm_stderr": 0.04852365870939099 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.703030303030303, + "acc_stderr": 0.03567969772268049, + "acc_norm": 0.703030303030303, + "acc_norm_stderr": 0.03567969772268049 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.7575757575757576, + "acc_stderr": 0.030532892233932026, + "acc_norm": 0.7575757575757576, + "acc_norm_stderr": 0.030532892233932026 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.844559585492228, + "acc_stderr": 0.02614848346915331, + "acc_norm": 0.844559585492228, + "acc_norm_stderr": 0.02614848346915331 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.6256410256410256, + "acc_stderr": 0.0245375915728305, + "acc_norm": 0.6256410256410256, + "acc_norm_stderr": 0.0245375915728305 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.3148148148148148, + "acc_stderr": 0.02831753349606648, + "acc_norm": 0.3148148148148148, + "acc_norm_stderr": 0.02831753349606648 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.6008403361344538, + "acc_stderr": 0.03181110032413926, + "acc_norm": 0.6008403361344538, + "acc_norm_stderr": 0.03181110032413926 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.304635761589404, + "acc_stderr": 0.037579499229433426, + "acc_norm": 0.304635761589404, + "acc_norm_stderr": 0.037579499229433426 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.7944954128440367, + "acc_stderr": 0.017324352325016012, + "acc_norm": 0.7944954128440367, + "acc_norm_stderr": 0.017324352325016012 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.41203703703703703, + "acc_stderr": 0.03356787758160835, + "acc_norm": 0.41203703703703703, + "acc_norm_stderr": 0.03356787758160835 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.7990196078431373, + "acc_stderr": 0.02812597226565438, + "acc_norm": 0.7990196078431373, + 
"acc_norm_stderr": 0.02812597226565438 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.759493670886076, + "acc_stderr": 0.027820781981149685, + "acc_norm": 0.759493670886076, + "acc_norm_stderr": 0.027820781981149685 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.6860986547085202, + "acc_stderr": 0.03114679648297246, + "acc_norm": 0.6860986547085202, + "acc_norm_stderr": 0.03114679648297246 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.648854961832061, + "acc_stderr": 0.04186445163013751, + "acc_norm": 0.648854961832061, + "acc_norm_stderr": 0.04186445163013751 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.7107438016528925, + "acc_stderr": 0.04139112727635463, + "acc_norm": 0.7107438016528925, + "acc_norm_stderr": 0.04139112727635463 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.7870370370370371, + "acc_stderr": 0.0395783547198098, + "acc_norm": 0.7870370370370371, + "acc_norm_stderr": 0.0395783547198098 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.6809815950920245, + "acc_stderr": 0.03661997551073836, + "acc_norm": 0.6809815950920245, + "acc_norm_stderr": 0.03661997551073836 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.38392857142857145, + "acc_stderr": 0.04616143075028547, + "acc_norm": 0.38392857142857145, + "acc_norm_stderr": 0.04616143075028547 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.7475728155339806, + "acc_stderr": 0.04301250399690878, + "acc_norm": 0.7475728155339806, + "acc_norm_stderr": 0.04301250399690878 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.8247863247863247, + "acc_stderr": 0.02490443909891823, + "acc_norm": 0.8247863247863247, + "acc_norm_stderr": 0.02490443909891823 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.59, + "acc_stderr": 0.04943110704237102, + "acc_norm": 0.59, + "acc_norm_stderr": 0.04943110704237102 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.7637292464878672, + "acc_stderr": 0.015190473717037495, + "acc_norm": 0.7637292464878672, + "acc_norm_stderr": 0.015190473717037495 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.6473988439306358, + "acc_stderr": 0.025722802200895803, + "acc_norm": 0.6473988439306358, + "acc_norm_stderr": 0.025722802200895803 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.423463687150838, + "acc_stderr": 0.016525425898773514, + "acc_norm": 0.423463687150838, + "acc_norm_stderr": 0.016525425898773514 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.6601307189542484, + "acc_stderr": 0.027121956071388856, + "acc_norm": 0.6601307189542484, + "acc_norm_stderr": 0.027121956071388856 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.6559485530546624, + "acc_stderr": 0.026981478043648043, + "acc_norm": 0.6559485530546624, + "acc_norm_stderr": 0.026981478043648043 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.6512345679012346, + "acc_stderr": 0.026517597724465013, + "acc_norm": 0.6512345679012346, + "acc_norm_stderr": 0.026517597724465013 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.4645390070921986, + "acc_stderr": 0.029752389657427047, + "acc_norm": 0.4645390070921986, + "acc_norm_stderr": 0.029752389657427047 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.438722294654498, + "acc_stderr": 0.012673969883493272, + "acc_norm": 0.438722294654498, + "acc_norm_stderr": 0.012673969883493272 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 
0.5992647058823529, + "acc_stderr": 0.029768263528933105, + "acc_norm": 0.5992647058823529, + "acc_norm_stderr": 0.029768263528933105 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.565359477124183, + "acc_stderr": 0.020054269200726463, + "acc_norm": 0.565359477124183, + "acc_norm_stderr": 0.020054269200726463 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.6818181818181818, + "acc_stderr": 0.04461272175910509, + "acc_norm": 0.6818181818181818, + "acc_norm_stderr": 0.04461272175910509 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.673469387755102, + "acc_stderr": 0.03002105623844031, + "acc_norm": 0.673469387755102, + "acc_norm_stderr": 0.03002105623844031 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.7562189054726368, + "acc_stderr": 0.030360490154014645, + "acc_norm": 0.7562189054726368, + "acc_norm_stderr": 0.030360490154014645 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.83, + "acc_stderr": 0.0377525168068637, + "acc_norm": 0.83, + "acc_norm_stderr": 0.0377525168068637 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.4759036144578313, + "acc_stderr": 0.03887971849597264, + "acc_norm": 0.4759036144578313, + "acc_norm_stderr": 0.03887971849597264 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.7719298245614035, + "acc_stderr": 0.03218093795602357, + "acc_norm": 0.7719298245614035, + "acc_norm_stderr": 0.03218093795602357 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.3880048959608323, + "mc1_stderr": 0.017058761501347972, + "mc2": 0.5562491990096062, + "mc2_stderr": 0.01544713306521873 + }, + "all": { + "acc": 0.5869376489115714, + "acc_stderr": 0.0340383920445088, + "acc_norm": 0.5910399944637348, + "acc_norm_stderr": 0.03401701193254301, + "mc1": 0.3880048959608323, + "mc1_stderr": 0.017058761501347972, + "mc2": 0.5562491990096062, + "mc2_stderr": 0.01544713306521873 + } + }, + "versions": { + "harness|arc:challenge|25": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + 
"harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "all": 0 + }, + "config_tasks": { + "harness|arc:challenge": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness 
task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task" + }, + "summary_tasks": { + "harness|arc:challenge|25": { + "hashes": { + "hash_examples": "17b0cae357c0259e", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "3722289b79076c44", + "hash_cont_tokens": "e8abf848493b50f7" + }, + "truncated": 0, + "non-truncated": 4687, + "padded": 4687, + "non-padded": 0, + "effective_few_shots": 25.0, + "num_truncated_few_shots": 0 + }, + "harness|hellaswag|10": { + "hashes": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "ececd684171f1ef2", + "hash_cont_tokens": "9fe0a5c42e1532db" + }, + "truncated": 0, + "non-truncated": 40168, + "padded": 40113, + "non-padded": 55, + "effective_few_shots": 10.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hashes": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "c54ff61ad0273dd7", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-anatomy|5": { + "hashes": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "be31a1e22aef5f90", + "hash_cont_tokens": "f11971a765cb609f" + }, + "truncated": 0, + "non-truncated": 540, + "padded": 540, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-astronomy|5": { + "hashes": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "277a7b1fad566940", + "hash_cont_tokens": "440a970fadecdc7b" + }, + "truncated": 0, + "non-truncated": 608, + "padded": 608, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + 
"harness|hendrycksTest-business_ethics|5": { + "hashes": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "ba552605bc116de5", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hashes": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "428c7563d0b98ab9", + "hash_cont_tokens": "7ecd60c25b9bfe5b" + }, + "truncated": 0, + "non-truncated": 1060, + "padded": 1060, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_biology|5": { + "hashes": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "da036601573942e2", + "hash_cont_tokens": "875cde3af7a0ee14" + }, + "truncated": 0, + "non-truncated": 576, + "padded": 576, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_chemistry|5": { + "hashes": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "94e0196d6aded13d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_computer_science|5": { + "hashes": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "6e4d0f4a8d36690b", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_mathematics|5": { + "hashes": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "614054d17109a25d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_medicine|5": { + "hashes": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "081bb2b524defd1c", + "hash_cont_tokens": "702fb6d82ff0d6ac" + }, + "truncated": 0, + "non-truncated": 692, + "padded": 692, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_physics|5": { + "hashes": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "5421d9a1af86cbd4", + "hash_cont_tokens": "f7b8097afc16a47c" + }, + "truncated": 0, + "non-truncated": 408, + "padded": 408, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-computer_security|5": { + "hashes": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "5e6b70ecb333cf18", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hashes": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + 
"hash_input_tokens": "c2ef11a87264ceed", + "hash_cont_tokens": "aa0e8bc655f2f641" + }, + "truncated": 0, + "non-truncated": 940, + "padded": 940, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-econometrics|5": { + "hashes": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "ecaccd912a4c3978", + "hash_cont_tokens": "b1cc6e7e9fcd3827" + }, + "truncated": 0, + "non-truncated": 456, + "padded": 456, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hashes": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "1590c84291399be8", + "hash_cont_tokens": "2425a3f084a591ef" + }, + "truncated": 0, + "non-truncated": 580, + "padded": 580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hashes": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "3269597f715b0da1", + "hash_cont_tokens": "bd87bf0c060fd925" + }, + "truncated": 0, + "non-truncated": 1512, + "padded": 1512, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-formal_logic|5": { + "hashes": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "a2800d20f3ab8d7c", + "hash_cont_tokens": "eb8932890e0605db" + }, + "truncated": 0, + "non-truncated": 504, + "padded": 504, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-global_facts|5": { + "hashes": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "94ed44b3772505ad", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_biology|5": { + "hashes": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "24423acb928db768", + "hash_cont_tokens": "1ddcb86d28cde266" + }, + "truncated": 0, + "non-truncated": 1240, + "padded": 1240, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hashes": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "831ff35c474e5cef", + "hash_cont_tokens": "176c8dcff38c5f8f" + }, + "truncated": 0, + "non-truncated": 812, + "padded": 812, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hashes": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "a20a96b44dcc5b30", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hashes": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "5002f4ac8b1562ca", + "hash_cont_tokens": "674fc454bdc5ac93" + }, + "truncated": 0, + "non-truncated": 
660, + "padded": 656, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_geography|5": { + "hashes": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "7c5547c7da5bc793", + "hash_cont_tokens": "03a5012b916274ea" + }, + "truncated": 0, + "non-truncated": 792, + "padded": 792, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hashes": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "f62991cb6a496b05", + "hash_cont_tokens": "873d2aab226ba1d8" + }, + "truncated": 0, + "non-truncated": 772, + "padded": 772, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hashes": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "4cef2aff6e3d59ed", + "hash_cont_tokens": "c583432ad27fcfe0" + }, + "truncated": 0, + "non-truncated": 1560, + "padded": 1560, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hashes": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "6e2577ea4082ed2b", + "hash_cont_tokens": "d7907b61bcb8c123" + }, + "truncated": 0, + "non-truncated": 1080, + "padded": 1080, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hashes": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "c5fc9aeb1079c8e4", + "hash_cont_tokens": "f47f041de50333b9" + }, + "truncated": 0, + "non-truncated": 952, + "padded": 952, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_physics|5": { + "hashes": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "555fc385cffa84ca", + "hash_cont_tokens": "0d56317b3e5eedb5" + }, + "truncated": 0, + "non-truncated": 604, + "padded": 604, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hashes": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "febd23cbf9973b7f", + "hash_cont_tokens": "09ba1243e7390c0f" + }, + "truncated": 0, + "non-truncated": 2180, + "padded": 2180, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hashes": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "400e55b56ee6fbd7", + "hash_cont_tokens": "9cc29889c3d3f77d" + }, + "truncated": 0, + "non-truncated": 864, + "padded": 864, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hashes": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "c639cce12a46ebad", + "hash_cont_tokens": "cdd0b3dc06d933e5" + }, + "truncated": 0, + "non-truncated": 816, + "padded": 816, + "non-padded": 0, + "effective_few_shots": 5.0, + 
"num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hashes": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "b9762065cce6f3a6", + "hash_cont_tokens": "e02816433ff28daf" + }, + "truncated": 0, + "non-truncated": 948, + "padded": 948, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_aging|5": { + "hashes": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "541a75f071dcf579", + "hash_cont_tokens": "142a4a8a1138a214" + }, + "truncated": 0, + "non-truncated": 892, + "padded": 892, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_sexuality|5": { + "hashes": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "04269e5c5a257dd9", + "hash_cont_tokens": "bc54813e809b796d" + }, + "truncated": 0, + "non-truncated": 524, + "padded": 524, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-international_law|5": { + "hashes": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "d93ba9d9d38e4397", + "hash_cont_tokens": "8ea8c5ff76a15bca" + }, + "truncated": 0, + "non-truncated": 484, + "padded": 484, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-jurisprudence|5": { + "hashes": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "9eeaccd2698b4f5a", + "hash_cont_tokens": "e3a8cd951b6e3469" + }, + "truncated": 0, + "non-truncated": 432, + "padded": 432, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hashes": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "b4f08f544f2b7576", + "hash_cont_tokens": "3e9e0bdc248fd88a" + }, + "truncated": 0, + "non-truncated": 652, + "padded": 648, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-machine_learning|5": { + "hashes": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "900c2a51f1174b9f", + "hash_cont_tokens": "55b12fb138c6a064" + }, + "truncated": 0, + "non-truncated": 448, + "padded": 448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-management|5": { + "hashes": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "6b36efb4689c6eca", + "hash_cont_tokens": "a01d6d39a83c4597" + }, + "truncated": 0, + "non-truncated": 412, + "padded": 412, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-marketing|5": { + "hashes": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "2aaac78a0cfed47a", + "hash_cont_tokens": "6aeaed4d823c98aa" + }, + "truncated": 0, + "non-truncated": 936, + "padded": 936, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-medical_genetics|5": { + "hashes": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": 
"202139046daa118f", + "hash_input_tokens": "886ca823b41c094a", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-miscellaneous|5": { + "hashes": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "72fd71de7675e7d0", + "hash_cont_tokens": "9b0ab02a64603081" + }, + "truncated": 0, + "non-truncated": 3132, + "padded": 3132, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_disputes|5": { + "hashes": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "f3ca0dd8e7a1eb09", + "hash_cont_tokens": "3b8bbe9108e55ce9" + }, + "truncated": 0, + "non-truncated": 1384, + "padded": 1354, + "non-padded": 30, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hashes": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "3e793631e951f23c", + "hash_cont_tokens": "3e9bfc0362e97330" + }, + "truncated": 0, + "non-truncated": 3580, + "padded": 3580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-nutrition|5": { + "hashes": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "59753c2144ea93af", + "hash_cont_tokens": "23b2dc6ee2da4cfc" + }, + "truncated": 0, + "non-truncated": 1224, + "padded": 1224, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-philosophy|5": { + "hashes": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "bd8d3dbed15a8c34", + "hash_cont_tokens": "9f6ff69d23a48783" + }, + "truncated": 0, + "non-truncated": 1244, + "padded": 1244, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-prehistory|5": { + "hashes": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "3573cd87facbb7c5", + "hash_cont_tokens": "d6458d743d875837" + }, + "truncated": 0, + "non-truncated": 1296, + "padded": 1296, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_accounting|5": { + "hashes": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "17e721bc1a7cbb47", + "hash_cont_tokens": "922a195f53a35662" + }, + "truncated": 0, + "non-truncated": 1128, + "padded": 1128, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_law|5": { + "hashes": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "c9f7583fff66d361", + "hash_cont_tokens": "2e590029ef41fbcd" + }, + "truncated": 0, + "non-truncated": 6136, + "padded": 6136, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_medicine|5": { + "hashes": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "40a933f829116f8d", + "hash_cont_tokens": "7cfee54dbddd5a98" + }, + "truncated": 0, + "non-truncated": 1088, + 
"padded": 1088, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_psychology|5": { + "hashes": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "0dfb73a8eb3f692c", + "hash_cont_tokens": "a86677b2a45c20e1" + }, + "truncated": 0, + "non-truncated": 2448, + "padded": 2448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-public_relations|5": { + "hashes": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "1710c6ba4c9f3cbd", + "hash_cont_tokens": "0d756ccaae031757" + }, + "truncated": 0, + "non-truncated": 440, + "padded": 440, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-security_studies|5": { + "hashes": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "32a03f1f22a6e103", + "hash_cont_tokens": "b2229bc2cfbf594b" + }, + "truncated": 0, + "non-truncated": 980, + "padded": 980, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-sociology|5": { + "hashes": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "828999f7624cbe7e", + "hash_cont_tokens": "c3a3bdfd177eed5b" + }, + "truncated": 0, + "non-truncated": 804, + "padded": 804, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hashes": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "42054621e718dbee", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-virology|5": { + "hashes": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "6c4f0aa4dc859c04", + "hash_cont_tokens": "af8b3658088cb37f" + }, + "truncated": 0, + "non-truncated": 664, + "padded": 664, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-world_religions|5": { + "hashes": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "6c75d44e092ff24f", + "hash_cont_tokens": "060118bef6de4e0a" + }, + "truncated": 0, + "non-truncated": 684, + "padded": 684, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|truthfulqa:mc|0": { + "hashes": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "2738d7ed7075faa7", + "hash_cont_tokens": "f5da56a132aab151" + }, + "truncated": 0, + "non-truncated": 9996, + "padded": 9996, + "non-padded": 0, + "effective_few_shots": 0.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "d84d18e9a963753d", + "hash_full_prompts": "12b540783521a8e6", + "hash_input_tokens": "5c73a7dce6ccf737", + "hash_cont_tokens": "71d56183130fecbd" + }, + "total_evaluation_time_secondes": "6345.579582929611", + "truncated": 0, + "non-truncated": 111019, + "padded": 110926, + "non-padded": 93, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git 
a/eval-results/uukuguy/speechless-llama2-13b/results_2023-10-15T22-09-28.481990.json b/eval-results/uukuguy/speechless-llama2-13b/results_2023-10-15T22-09-28.481990.json new file mode 100644 index 0000000000000000000000000000000000000000..bd467b931294e86b3fa6a15ebe5de91de39dc02a --- /dev/null +++ b/eval-results/uukuguy/speechless-llama2-13b/results_2023-10-15T22-09-28.481990.json @@ -0,0 +1,107 @@ +{ + "config_general": { + "model_name": "uukuguy/speechless-llama2-13b", + "model_sha": "1f15d63c10dd60f9ccdd9cc8b0488ef292648ac2", + "model_size": "24.32 GB", + "model_dtype": "torch.bfloat16", + "lighteval_sha": "0f318ecf002208468154899217b3ba7c6ae09374", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|drop|3": { + "em": 0.021078020134228187, + "em_stderr": 0.001471053944115298, + "f1": 0.13116610738255008, + "f1_stderr": 0.002358318760137305 + }, + "harness|gsm8k|5": { + "acc": 0.13949962092494314, + "acc_stderr": 0.009543426687191308 + }, + "harness|winogrande|5": { + "acc": 0.7655880031570639, + "acc_stderr": 0.011906130106237986 + }, + "all": { + "em": 0.021078020134228187, + "em_stderr": 0.001471053944115298, + "f1": 0.13116610738255008, + "f1_stderr": 0.002358318760137305, + "acc": 0.4525438120410035, + "acc_stderr": 0.010724778396714648 + } + }, + "versions": { + "harness|drop|3": 1, + "harness|gsm8k|5": 0, + "harness|winogrande|5": 0, + "all": 0 + }, + "config_tasks": { + "harness|drop": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|drop|3": { + "hashes": { + "hash_examples": "1d27416e8324e9a3", + "hash_full_prompts": "a5513ff9a741b385", + "hash_input_tokens": "42076f0efbb50aa6", + "hash_cont_tokens": "39721375bc35899f" + }, + "truncated": 3, + "non-truncated": 9533, + "padded": 0, + "non-padded": 9536, + "effective_few_shots": 3.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "bda342e47b5099b2", + "hash_cont_tokens": "c3fa0060c3a4dba7" + }, + "truncated": 0, + "non-truncated": 1319, + "padded": 0, + "non-padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "c0bedf98cb040854", + "hash_cont_tokens": "f08975ad6f2d5864" + }, + "truncated": 0, + "non-truncated": 2534, + "padded": 2432, + "non-padded": 102, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "9b4d8993161e637d", + "hash_full_prompts": "08215e527b7e60a5", + "hash_input_tokens": "a12f3e3c934bd78b", + "hash_cont_tokens": "00270fd9baf396f4" + }, + "total_evaluation_time_secondes": "11900.790353536606", + "truncated": 3, + "non-truncated": 13386, + "padded": 2432, + "non-padded": 10957, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/uukuguy/speechless-llama2-13b/results_2023-12-09T16-49-00.911665.json b/eval-results/uukuguy/speechless-llama2-13b/results_2023-12-09T16-49-00.911665.json new file mode 100644 index 0000000000000000000000000000000000000000..7c7ee93705ff70fe6a9a2fc2a5e751e05cfa6653 --- /dev/null +++ b/eval-results/uukuguy/speechless-llama2-13b/results_2023-12-09T16-49-00.911665.json @@ -0,0 +1,1409 @@ +{ + 
"config_general": { + "lighteval_sha": "0e4607eff593f6f842aeaa0e5fa6760f58b9d1e9", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "", + "start_time": 584988.71656279, + "end_time": 595411.587853759, + "total_evaluation_time_secondes": "10422.871290968964", + "model_name": "uukuguy/speechless-llama2-13b", + "model_sha": "4cfc06d1608e577a4c8228863f5e92ef0f0d7b7b", + "model_dtype": "torch.bfloat16", + "model_size": "24.32 GB" + }, + "results": { + "harness|arc:challenge|25": { + "acc": 0.5784982935153583, + "acc_stderr": 0.014430197069326023, + "acc_norm": 0.6203071672354948, + "acc_norm_stderr": 0.014182119866974872 + }, + "harness|hellaswag|10": { + "acc": 0.6212905795658236, + "acc_stderr": 0.004840742206718088, + "acc_norm": 0.8181637124078869, + "acc_norm_stderr": 0.0038492126228151643 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.34, + "acc_stderr": 0.04760952285695235, + "acc_norm": 0.34, + "acc_norm_stderr": 0.04760952285695235 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.5481481481481482, + "acc_stderr": 0.042992689054808644, + "acc_norm": 0.5481481481481482, + "acc_norm_stderr": 0.042992689054808644 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.5657894736842105, + "acc_stderr": 0.0403356566784832, + "acc_norm": 0.5657894736842105, + "acc_norm_stderr": 0.0403356566784832 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.58, + "acc_stderr": 0.04960449637488583, + "acc_norm": 0.58, + "acc_norm_stderr": 0.04960449637488583 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.630188679245283, + "acc_stderr": 0.02971142188010793, + "acc_norm": 0.630188679245283, + "acc_norm_stderr": 0.02971142188010793 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.6458333333333334, + "acc_stderr": 0.039994111357535424, + "acc_norm": 0.6458333333333334, + "acc_norm_stderr": 0.039994111357535424 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.43, + "acc_stderr": 0.04975698519562428, + "acc_norm": 0.43, + "acc_norm_stderr": 0.04975698519562428 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.5, + "acc_stderr": 0.050251890762960605, + "acc_norm": 0.5, + "acc_norm_stderr": 0.050251890762960605 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.33, + "acc_stderr": 0.047258156262526045, + "acc_norm": 0.33, + "acc_norm_stderr": 0.047258156262526045 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.5491329479768786, + "acc_stderr": 0.037940126746970296, + "acc_norm": 0.5491329479768786, + "acc_norm_stderr": 0.037940126746970296 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.3235294117647059, + "acc_stderr": 0.04655010411319616, + "acc_norm": 0.3235294117647059, + "acc_norm_stderr": 0.04655010411319616 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.7, + "acc_stderr": 0.046056618647183814, + "acc_norm": 0.7, + "acc_norm_stderr": 0.046056618647183814 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.49361702127659574, + "acc_stderr": 0.032683358999363366, + "acc_norm": 0.49361702127659574, + "acc_norm_stderr": 0.032683358999363366 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.2807017543859649, + "acc_stderr": 0.042270544512322, + "acc_norm": 0.2807017543859649, + "acc_norm_stderr": 0.042270544512322 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.5448275862068965, + "acc_stderr": 0.04149886942192118, + "acc_norm": 
0.5448275862068965, + "acc_norm_stderr": 0.04149886942192118 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.37566137566137564, + "acc_stderr": 0.024942368931159798, + "acc_norm": 0.37566137566137564, + "acc_norm_stderr": 0.024942368931159798 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.373015873015873, + "acc_stderr": 0.04325506042017087, + "acc_norm": 0.373015873015873, + "acc_norm_stderr": 0.04325506042017087 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.33, + "acc_stderr": 0.04725815626252604, + "acc_norm": 0.33, + "acc_norm_stderr": 0.04725815626252604 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.6838709677419355, + "acc_stderr": 0.026450874489042764, + "acc_norm": 0.6838709677419355, + "acc_norm_stderr": 0.026450874489042764 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.4975369458128079, + "acc_stderr": 0.03517945038691063, + "acc_norm": 0.4975369458128079, + "acc_norm_stderr": 0.03517945038691063 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.63, + "acc_stderr": 0.04852365870939099, + "acc_norm": 0.63, + "acc_norm_stderr": 0.04852365870939099 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.703030303030303, + "acc_stderr": 0.03567969772268049, + "acc_norm": 0.703030303030303, + "acc_norm_stderr": 0.03567969772268049 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.7575757575757576, + "acc_stderr": 0.030532892233932026, + "acc_norm": 0.7575757575757576, + "acc_norm_stderr": 0.030532892233932026 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.8393782383419689, + "acc_stderr": 0.02649905770139744, + "acc_norm": 0.8393782383419689, + "acc_norm_stderr": 0.02649905770139744 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.6230769230769231, + "acc_stderr": 0.024570975364225995, + "acc_norm": 0.6230769230769231, + "acc_norm_stderr": 0.024570975364225995 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.3148148148148148, + "acc_stderr": 0.02831753349606648, + "acc_norm": 0.3148148148148148, + "acc_norm_stderr": 0.02831753349606648 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.5966386554621849, + "acc_stderr": 0.031866081214088314, + "acc_norm": 0.5966386554621849, + "acc_norm_stderr": 0.031866081214088314 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.31125827814569534, + "acc_stderr": 0.03780445850526733, + "acc_norm": 0.31125827814569534, + "acc_norm_stderr": 0.03780445850526733 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.7908256880733945, + "acc_stderr": 0.01743793717334323, + "acc_norm": 0.7908256880733945, + "acc_norm_stderr": 0.01743793717334323 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.39814814814814814, + "acc_stderr": 0.033384734032074016, + "acc_norm": 0.39814814814814814, + "acc_norm_stderr": 0.033384734032074016 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.7990196078431373, + "acc_stderr": 0.02812597226565438, + "acc_norm": 0.7990196078431373, + "acc_norm_stderr": 0.02812597226565438 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.7763713080168776, + "acc_stderr": 0.027123298205229966, + "acc_norm": 0.7763713080168776, + "acc_norm_stderr": 0.027123298205229966 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.6905829596412556, + "acc_stderr": 0.031024411740572206, + 
"acc_norm": 0.6905829596412556, + "acc_norm_stderr": 0.031024411740572206 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.648854961832061, + "acc_stderr": 0.04186445163013751, + "acc_norm": 0.648854961832061, + "acc_norm_stderr": 0.04186445163013751 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.7272727272727273, + "acc_stderr": 0.04065578140908706, + "acc_norm": 0.7272727272727273, + "acc_norm_stderr": 0.04065578140908706 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.7777777777777778, + "acc_stderr": 0.040191074725573483, + "acc_norm": 0.7777777777777778, + "acc_norm_stderr": 0.040191074725573483 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.6809815950920245, + "acc_stderr": 0.03661997551073836, + "acc_norm": 0.6809815950920245, + "acc_norm_stderr": 0.03661997551073836 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.4017857142857143, + "acc_stderr": 0.04653333146973646, + "acc_norm": 0.4017857142857143, + "acc_norm_stderr": 0.04653333146973646 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.7475728155339806, + "acc_stderr": 0.04301250399690878, + "acc_norm": 0.7475728155339806, + "acc_norm_stderr": 0.04301250399690878 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.8333333333333334, + "acc_stderr": 0.02441494730454368, + "acc_norm": 0.8333333333333334, + "acc_norm_stderr": 0.02441494730454368 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.59, + "acc_stderr": 0.04943110704237102, + "acc_norm": 0.59, + "acc_norm_stderr": 0.04943110704237102 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.7624521072796935, + "acc_stderr": 0.015218733046150191, + "acc_norm": 0.7624521072796935, + "acc_norm_stderr": 0.015218733046150191 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.6473988439306358, + "acc_stderr": 0.025722802200895803, + "acc_norm": 0.6473988439306358, + "acc_norm_stderr": 0.025722802200895803 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.41899441340782123, + "acc_stderr": 0.01650157930686168, + "acc_norm": 0.41899441340782123, + "acc_norm_stderr": 0.01650157930686168 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.6568627450980392, + "acc_stderr": 0.027184498909941616, + "acc_norm": 0.6568627450980392, + "acc_norm_stderr": 0.027184498909941616 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.6591639871382636, + "acc_stderr": 0.026920841260776165, + "acc_norm": 0.6591639871382636, + "acc_norm_stderr": 0.026920841260776165 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.654320987654321, + "acc_stderr": 0.026462487777001872, + "acc_norm": 0.654320987654321, + "acc_norm_stderr": 0.026462487777001872 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.46808510638297873, + "acc_stderr": 0.029766675075873866, + "acc_norm": 0.46808510638297873, + "acc_norm_stderr": 0.029766675075873866 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.4367666232073012, + "acc_stderr": 0.012667701919603654, + "acc_norm": 0.4367666232073012, + "acc_norm_stderr": 0.012667701919603654 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.5808823529411765, + "acc_stderr": 0.029972807170464622, + "acc_norm": 0.5808823529411765, + "acc_norm_stderr": 0.029972807170464622 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.5669934640522876, + "acc_stderr": 0.02004544247332422, + "acc_norm": 0.5669934640522876, + "acc_norm_stderr": 0.02004544247332422 + }, + 
"harness|hendrycksTest-public_relations|5": { + "acc": 0.7090909090909091, + "acc_stderr": 0.04350271442923243, + "acc_norm": 0.7090909090909091, + "acc_norm_stderr": 0.04350271442923243 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.6775510204081633, + "acc_stderr": 0.029923100563683906, + "acc_norm": 0.6775510204081633, + "acc_norm_stderr": 0.029923100563683906 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.7562189054726368, + "acc_stderr": 0.030360490154014645, + "acc_norm": 0.7562189054726368, + "acc_norm_stderr": 0.030360490154014645 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.83, + "acc_stderr": 0.0377525168068637, + "acc_norm": 0.83, + "acc_norm_stderr": 0.0377525168068637 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.4879518072289157, + "acc_stderr": 0.03891364495835821, + "acc_norm": 0.4879518072289157, + "acc_norm_stderr": 0.03891364495835821 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.7777777777777778, + "acc_stderr": 0.031885780176863984, + "acc_norm": 0.7777777777777778, + "acc_norm_stderr": 0.031885780176863984 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.3880048959608323, + "mc1_stderr": 0.017058761501347972, + "mc2": 0.5565985023189125, + "mc2_stderr": 0.015435738665954496 + }, + "harness|winogrande|5": { + "acc": 0.7600631412786109, + "acc_stderr": 0.012002078629485739 + }, + "harness|gsm8k|5": { + "acc": 0.33965125094768767, + "acc_stderr": 0.013045045067665269 + }, + "all": { + "acc": 0.5860683187201721, + "acc_stderr": 0.033316954491979946, + "acc_norm": 0.5913412721401082, + "acc_norm_stderr": 0.0340008049750402, + "mc1": 0.3880048959608323, + "mc1_stderr": 0.017058761501347972, + "mc2": 0.5565985023189125, + "mc2_stderr": 0.015435738665954496 + } + }, + "versions": { + "all": 0, + "harness|arc:challenge|25": 0, + "harness|gsm8k|5": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + 
"harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "harness|winogrande|5": 0 + }, + "config_tasks": { + "harness|arc:challenge": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + 
"harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|arc:challenge|25": { + "hashes": { + "hash_examples": "17b0cae357c0259e", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "ca48d52265c0051f", + "hash_cont_tokens": "e8abf848493b50f7" + }, + "truncated": 0, + "non_truncated": 1172, + "padded": 4687, + "non_padded": 0, + "effective_few_shots": 25.0, + "num_truncated_few_shots": 0 + }, + "harness|hellaswag|10": { + "hashes": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "4975ded0ed31f702", + "hash_cont_tokens": "9fe0a5c42e1532db" + }, + "truncated": 0, + "non_truncated": 10042, + "padded": 40019, + "non_padded": 149, + "effective_few_shots": 10.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hashes": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "8ff523ec326d5d55", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-anatomy|5": { + "hashes": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "742bd6a389a8ef40", + "hash_cont_tokens": "f11971a765cb609f" + }, + "truncated": 0, + "non_truncated": 135, + "padded": 540, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-astronomy|5": { + "hashes": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "aa9743839c83bd9f", + "hash_cont_tokens": "440a970fadecdc7b" + }, + "truncated": 0, + "non_truncated": 152, + "padded": 608, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 
0 + }, + "harness|hendrycksTest-business_ethics|5": { + "hashes": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "60f6ed52e2a2987a", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hashes": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "6080d9f3c5930be0", + "hash_cont_tokens": "7ecd60c25b9bfe5b" + }, + "truncated": 0, + "non_truncated": 265, + "padded": 1060, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_biology|5": { + "hashes": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "873319724ad65589", + "hash_cont_tokens": "875cde3af7a0ee14" + }, + "truncated": 0, + "non_truncated": 144, + "padded": 564, + "non_padded": 12, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_chemistry|5": { + "hashes": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "8366d04d12b154a7", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_computer_science|5": { + "hashes": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "1724a282fb269fd7", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_mathematics|5": { + "hashes": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "b7aa815781eae172", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_medicine|5": { + "hashes": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "0003d13e86bc8c1a", + "hash_cont_tokens": "702fb6d82ff0d6ac" + }, + "truncated": 0, + "non_truncated": 173, + "padded": 692, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_physics|5": { + "hashes": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "32b28762dd077c78", + "hash_cont_tokens": "f7b8097afc16a47c" + }, + "truncated": 0, + "non_truncated": 102, + "padded": 404, + "non_padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-computer_security|5": { + "hashes": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "19dd0e1895125d49", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hashes": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": 
"e40d15a34640d6fa", + "hash_input_tokens": "761c7ce187b3338a", + "hash_cont_tokens": "aa0e8bc655f2f641" + }, + "truncated": 0, + "non_truncated": 235, + "padded": 940, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-econometrics|5": { + "hashes": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "dae74024ebc12b2b", + "hash_cont_tokens": "b1cc6e7e9fcd3827" + }, + "truncated": 0, + "non_truncated": 114, + "padded": 456, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hashes": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "5fa8050688a246ed", + "hash_cont_tokens": "2425a3f084a591ef" + }, + "truncated": 0, + "non_truncated": 145, + "padded": 580, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hashes": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "2da3f8d7d1515cc6", + "hash_cont_tokens": "bd87bf0c060fd925" + }, + "truncated": 0, + "non_truncated": 378, + "padded": 1512, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-formal_logic|5": { + "hashes": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "907de61bbe46dada", + "hash_cont_tokens": "eb8932890e0605db" + }, + "truncated": 0, + "non_truncated": 126, + "padded": 504, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-global_facts|5": { + "hashes": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "d7549fe9ac133643", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_biology|5": { + "hashes": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "b449ae8cd622fb96", + "hash_cont_tokens": "1ddcb86d28cde266" + }, + "truncated": 0, + "non_truncated": 310, + "padded": 1240, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hashes": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "a447bd1574b5e26c", + "hash_cont_tokens": "176c8dcff38c5f8f" + }, + "truncated": 0, + "non_truncated": 203, + "padded": 812, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hashes": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "56312a0c3d85ae90", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hashes": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "5002f4ac8b1562ca", + "hash_cont_tokens": "674fc454bdc5ac93" + }, + "truncated": 0, 
+ "non_truncated": 165, + "padded": 656, + "non_padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_geography|5": { + "hashes": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "b4f9efd054b0149d", + "hash_cont_tokens": "03a5012b916274ea" + }, + "truncated": 0, + "non_truncated": 198, + "padded": 792, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hashes": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "6e010d01707b5a01", + "hash_cont_tokens": "873d2aab226ba1d8" + }, + "truncated": 0, + "non_truncated": 193, + "padded": 772, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hashes": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "fc1f6e824ba386d7", + "hash_cont_tokens": "c583432ad27fcfe0" + }, + "truncated": 0, + "non_truncated": 390, + "padded": 1560, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hashes": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "3a485a40c8432ece", + "hash_cont_tokens": "d7907b61bcb8c123" + }, + "truncated": 0, + "non_truncated": 270, + "padded": 1080, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hashes": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "a7dd9ca4bbda3752", + "hash_cont_tokens": "f47f041de50333b9" + }, + "truncated": 0, + "non_truncated": 238, + "padded": 952, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_physics|5": { + "hashes": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "d7ea631399a73865", + "hash_cont_tokens": "0d56317b3e5eedb5" + }, + "truncated": 0, + "non_truncated": 151, + "padded": 604, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hashes": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "d12816cf88146011", + "hash_cont_tokens": "09ba1243e7390c0f" + }, + "truncated": 0, + "non_truncated": 545, + "padded": 2180, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hashes": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "9763ecaef4814c21", + "hash_cont_tokens": "9cc29889c3d3f77d" + }, + "truncated": 0, + "non_truncated": 216, + "padded": 864, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hashes": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "c639cce12a46ebad", + "hash_cont_tokens": "cdd0b3dc06d933e5" + }, + "truncated": 0, + "non_truncated": 204, + "padded": 816, + "non_padded": 0, + "effective_few_shots": 
5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hashes": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "b9762065cce6f3a6", + "hash_cont_tokens": "e02816433ff28daf" + }, + "truncated": 0, + "non_truncated": 237, + "padded": 948, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_aging|5": { + "hashes": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "84157fee0b6d0f3c", + "hash_cont_tokens": "142a4a8a1138a214" + }, + "truncated": 0, + "non_truncated": 223, + "padded": 892, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_sexuality|5": { + "hashes": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "ade303e1ae3c016f", + "hash_cont_tokens": "bc54813e809b796d" + }, + "truncated": 0, + "non_truncated": 131, + "padded": 524, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-international_law|5": { + "hashes": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "e5482e1c23c23d35", + "hash_cont_tokens": "8ea8c5ff76a15bca" + }, + "truncated": 0, + "non_truncated": 121, + "padded": 484, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-jurisprudence|5": { + "hashes": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "4415eeb9bad0507b", + "hash_cont_tokens": "e3a8cd951b6e3469" + }, + "truncated": 0, + "non_truncated": 108, + "padded": 432, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hashes": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "e6b5271422ecbaa8", + "hash_cont_tokens": "3e9e0bdc248fd88a" + }, + "truncated": 0, + "non_truncated": 163, + "padded": 644, + "non_padded": 8, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-machine_learning|5": { + "hashes": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "e719cb83196977d8", + "hash_cont_tokens": "55b12fb138c6a064" + }, + "truncated": 0, + "non_truncated": 112, + "padded": 448, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-management|5": { + "hashes": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "155da0e62b39e804", + "hash_cont_tokens": "a01d6d39a83c4597" + }, + "truncated": 0, + "non_truncated": 103, + "padded": 412, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-marketing|5": { + "hashes": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "38466c242259e6d3", + "hash_cont_tokens": "6aeaed4d823c98aa" + }, + "truncated": 0, + "non_truncated": 234, + "padded": 932, + "non_padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-medical_genetics|5": { + "hashes": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": 
"202139046daa118f", + "hash_input_tokens": "0dd129e92538a7f6", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-miscellaneous|5": { + "hashes": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "d108a883fc3e022f", + "hash_cont_tokens": "9b0ab02a64603081" + }, + "truncated": 0, + "non_truncated": 783, + "padded": 3132, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_disputes|5": { + "hashes": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "0e7b7df82884a2d5", + "hash_cont_tokens": "3b8bbe9108e55ce9" + }, + "truncated": 0, + "non_truncated": 346, + "padded": 1364, + "non_padded": 20, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hashes": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "7c220f5613cd8426", + "hash_cont_tokens": "3e9bfc0362e97330" + }, + "truncated": 0, + "non_truncated": 895, + "padded": 3580, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-nutrition|5": { + "hashes": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "35de1609a9a763a9", + "hash_cont_tokens": "23b2dc6ee2da4cfc" + }, + "truncated": 0, + "non_truncated": 306, + "padded": 1224, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-philosophy|5": { + "hashes": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "a1dcfa9c80490d06", + "hash_cont_tokens": "9f6ff69d23a48783" + }, + "truncated": 0, + "non_truncated": 311, + "padded": 1244, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-prehistory|5": { + "hashes": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "a091cf645d2415e0", + "hash_cont_tokens": "d6458d743d875837" + }, + "truncated": 0, + "non_truncated": 324, + "padded": 1296, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_accounting|5": { + "hashes": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "e9df32a33f85290c", + "hash_cont_tokens": "922a195f53a35662" + }, + "truncated": 0, + "non_truncated": 282, + "padded": 1128, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_law|5": { + "hashes": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "c9f7583fff66d361", + "hash_cont_tokens": "2e590029ef41fbcd" + }, + "truncated": 0, + "non_truncated": 1534, + "padded": 6136, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_medicine|5": { + "hashes": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "40a933f829116f8d", + "hash_cont_tokens": "7cfee54dbddd5a98" + }, + "truncated": 0, + "non_truncated": 272, + "padded": 
1088, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_psychology|5": { + "hashes": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "0f6a92c3a2062b48", + "hash_cont_tokens": "a86677b2a45c20e1" + }, + "truncated": 0, + "non_truncated": 612, + "padded": 2448, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-public_relations|5": { + "hashes": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "29a08e9bfbe9b2f0", + "hash_cont_tokens": "0d756ccaae031757" + }, + "truncated": 0, + "non_truncated": 110, + "padded": 440, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-security_studies|5": { + "hashes": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "32a03f1f22a6e103", + "hash_cont_tokens": "b2229bc2cfbf594b" + }, + "truncated": 0, + "non_truncated": 245, + "padded": 980, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-sociology|5": { + "hashes": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "1de5c52d2b2831d7", + "hash_cont_tokens": "c3a3bdfd177eed5b" + }, + "truncated": 0, + "non_truncated": 201, + "padded": 800, + "non_padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hashes": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "add924961f7f4146", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-virology|5": { + "hashes": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "e0653601c466b1bc", + "hash_cont_tokens": "af8b3658088cb37f" + }, + "truncated": 0, + "non_truncated": 166, + "padded": 664, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-world_religions|5": { + "hashes": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "ac600d612445156d", + "hash_cont_tokens": "060118bef6de4e0a" + }, + "truncated": 0, + "non_truncated": 171, + "padded": 684, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|truthfulqa:mc|0": { + "hashes": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "a03ce28b7fd06aa7", + "hash_cont_tokens": "f5da56a132aab151" + }, + "truncated": 0, + "non_truncated": 817, + "padded": 9996, + "non_padded": 0, + "effective_few_shots": 0.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "72067255e368e24e", + "hash_cont_tokens": "f08975ad6f2d5864" + }, + "truncated": 0, + "non_truncated": 1267, + "padded": 2534, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": 
"41d55e83abc0e02d", + "hash_input_tokens": "bda342e47b5099b2", + "hash_cont_tokens": "c3fa0060c3a4dba7" + }, + "truncated": 0, + "non_truncated": 1319, + "padded": 0, + "non_padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "3b7fa57a057f9415", + "hash_full_prompts": "63615fc50fc9417c", + "hash_input_tokens": "a8fa53915153e1db", + "hash_cont_tokens": "d8262952aeadc141" + }, + "truncated": 0, + "non_truncated": 28659, + "padded": 113348, + "non_padded": 1524, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/uukuguy/speechless-llama2-hermes-orca-platypus-13b/results_2023-09-01T20-32-11.554116.json b/eval-results/uukuguy/speechless-llama2-hermes-orca-platypus-13b/results_2023-09-01T20-32-11.554116.json new file mode 100644 index 0000000000000000000000000000000000000000..9a3173f2eb0a60a3619c9a2cfc4d2bddc14c3beb --- /dev/null +++ b/eval-results/uukuguy/speechless-llama2-hermes-orca-platypus-13b/results_2023-09-01T20-32-11.554116.json @@ -0,0 +1,1366 @@ +{ + "config_general": { + "model_name": "uukuguy/speechless-llama2-hermes-orca-platypus-13b", + "model_sha": "f227ad33b16726b099e35e5dc47f4db1f22665a7", + "model_dtype": "torch.bfloat16", + "lighteval_sha": "4b9a38c2102259daeb17d1d0294fc2b4ce7f4e63", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|arc:challenge|25": { + "acc": 0.5802047781569966, + "acc_stderr": 0.014422181226303026, + "acc_norm": 0.6092150170648464, + "acc_norm_stderr": 0.014258563880513778 + }, + "harness|hellaswag|10": { + "acc": 0.6351324437363075, + "acc_stderr": 0.0048040917088125485, + "acc_norm": 0.8349930292770364, + "acc_norm_stderr": 0.003704282390781705 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.33, + "acc_stderr": 0.04725815626252605, + "acc_norm": 0.33, + "acc_norm_stderr": 0.04725815626252605 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.5111111111111111, + "acc_stderr": 0.04318275491977976, + "acc_norm": 0.5111111111111111, + "acc_norm_stderr": 0.04318275491977976 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.625, + "acc_stderr": 0.039397364351956274, + "acc_norm": 0.625, + "acc_norm_stderr": 0.039397364351956274 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.59, + "acc_stderr": 0.04943110704237102, + "acc_norm": 0.59, + "acc_norm_stderr": 0.04943110704237102 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.6150943396226415, + "acc_stderr": 0.02994649856769995, + "acc_norm": 0.6150943396226415, + "acc_norm_stderr": 0.02994649856769995 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.6944444444444444, + "acc_stderr": 0.03852084696008534, + "acc_norm": 0.6944444444444444, + "acc_norm_stderr": 0.03852084696008534 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.44, + "acc_stderr": 0.049888765156985884, + "acc_norm": 0.44, + "acc_norm_stderr": 0.049888765156985884 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.5, + "acc_stderr": 0.050251890762960605, + "acc_norm": 0.5, + "acc_norm_stderr": 0.050251890762960605 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.42, + "acc_stderr": 0.049604496374885836, + "acc_norm": 0.42, + "acc_norm_stderr": 0.049604496374885836 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.5433526011560693, + "acc_stderr": 0.03798106566014498, + "acc_norm": 
0.5433526011560693, + "acc_norm_stderr": 0.03798106566014498 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.3137254901960784, + "acc_stderr": 0.04617034827006718, + "acc_norm": 0.3137254901960784, + "acc_norm_stderr": 0.04617034827006718 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.66, + "acc_stderr": 0.04760952285695237, + "acc_norm": 0.66, + "acc_norm_stderr": 0.04760952285695237 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.4765957446808511, + "acc_stderr": 0.032650194750335815, + "acc_norm": 0.4765957446808511, + "acc_norm_stderr": 0.032650194750335815 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.3157894736842105, + "acc_stderr": 0.043727482902780064, + "acc_norm": 0.3157894736842105, + "acc_norm_stderr": 0.043727482902780064 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.5310344827586206, + "acc_stderr": 0.04158632762097828, + "acc_norm": 0.5310344827586206, + "acc_norm_stderr": 0.04158632762097828 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.37566137566137564, + "acc_stderr": 0.02494236893115979, + "acc_norm": 0.37566137566137564, + "acc_norm_stderr": 0.02494236893115979 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.42857142857142855, + "acc_stderr": 0.04426266681379909, + "acc_norm": 0.42857142857142855, + "acc_norm_stderr": 0.04426266681379909 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.36, + "acc_stderr": 0.04824181513244218, + "acc_norm": 0.36, + "acc_norm_stderr": 0.04824181513244218 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.6548387096774193, + "acc_stderr": 0.027045746573534327, + "acc_norm": 0.6548387096774193, + "acc_norm_stderr": 0.027045746573534327 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.43842364532019706, + "acc_stderr": 0.03491207857486519, + "acc_norm": 0.43842364532019706, + "acc_norm_stderr": 0.03491207857486519 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.59, + "acc_stderr": 0.04943110704237101, + "acc_norm": 0.59, + "acc_norm_stderr": 0.04943110704237101 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.7272727272727273, + "acc_stderr": 0.0347769116216366, + "acc_norm": 0.7272727272727273, + "acc_norm_stderr": 0.0347769116216366 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.7676767676767676, + "acc_stderr": 0.030088629490217487, + "acc_norm": 0.7676767676767676, + "acc_norm_stderr": 0.030088629490217487 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.8601036269430051, + "acc_stderr": 0.025033870583015178, + "acc_norm": 0.8601036269430051, + "acc_norm_stderr": 0.025033870583015178 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.6051282051282051, + "acc_stderr": 0.024784316942156395, + "acc_norm": 0.6051282051282051, + "acc_norm_stderr": 0.024784316942156395 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.31851851851851853, + "acc_stderr": 0.02840653309060846, + "acc_norm": 0.31851851851851853, + "acc_norm_stderr": 0.02840653309060846 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.6218487394957983, + "acc_stderr": 0.031499305777849054, + "acc_norm": 0.6218487394957983, + "acc_norm_stderr": 0.031499305777849054 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.3443708609271523, + "acc_stderr": 0.038796870240733264, + "acc_norm": 0.3443708609271523, + 
"acc_norm_stderr": 0.038796870240733264 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.8018348623853211, + "acc_stderr": 0.017090573804217905, + "acc_norm": 0.8018348623853211, + "acc_norm_stderr": 0.017090573804217905 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.44907407407407407, + "acc_stderr": 0.03392238405321616, + "acc_norm": 0.44907407407407407, + "acc_norm_stderr": 0.03392238405321616 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.8382352941176471, + "acc_stderr": 0.025845017986926917, + "acc_norm": 0.8382352941176471, + "acc_norm_stderr": 0.025845017986926917 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.7679324894514767, + "acc_stderr": 0.027479744550808517, + "acc_norm": 0.7679324894514767, + "acc_norm_stderr": 0.027479744550808517 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.7040358744394619, + "acc_stderr": 0.030636591348699803, + "acc_norm": 0.7040358744394619, + "acc_norm_stderr": 0.030636591348699803 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.6412213740458015, + "acc_stderr": 0.04206739313864908, + "acc_norm": 0.6412213740458015, + "acc_norm_stderr": 0.04206739313864908 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.7272727272727273, + "acc_stderr": 0.04065578140908705, + "acc_norm": 0.7272727272727273, + "acc_norm_stderr": 0.04065578140908705 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.7685185185185185, + "acc_stderr": 0.04077494709252626, + "acc_norm": 0.7685185185185185, + "acc_norm_stderr": 0.04077494709252626 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.7300613496932515, + "acc_stderr": 0.034878251684978906, + "acc_norm": 0.7300613496932515, + "acc_norm_stderr": 0.034878251684978906 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.4017857142857143, + "acc_stderr": 0.04653333146973646, + "acc_norm": 0.4017857142857143, + "acc_norm_stderr": 0.04653333146973646 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.7281553398058253, + "acc_stderr": 0.044052680241409216, + "acc_norm": 0.7281553398058253, + "acc_norm_stderr": 0.044052680241409216 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.8205128205128205, + "acc_stderr": 0.025140935950335445, + "acc_norm": 0.8205128205128205, + "acc_norm_stderr": 0.025140935950335445 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.58, + "acc_stderr": 0.04960449637488583, + "acc_norm": 0.58, + "acc_norm_stderr": 0.04960449637488583 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.7726692209450831, + "acc_stderr": 0.01498727064094601, + "acc_norm": 0.7726692209450831, + "acc_norm_stderr": 0.01498727064094601 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.6734104046242775, + "acc_stderr": 0.025248264774242832, + "acc_norm": 0.6734104046242775, + "acc_norm_stderr": 0.025248264774242832 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.48268156424581005, + "acc_stderr": 0.01671246744170252, + "acc_norm": 0.48268156424581005, + "acc_norm_stderr": 0.01671246744170252 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.6143790849673203, + "acc_stderr": 0.027870745278290282, + "acc_norm": 0.6143790849673203, + "acc_norm_stderr": 0.027870745278290282 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.6720257234726688, + "acc_stderr": 0.02666441088693762, + "acc_norm": 0.6720257234726688, + "acc_norm_stderr": 0.02666441088693762 + }, + 
"harness|hendrycksTest-prehistory|5": { + "acc": 0.7006172839506173, + "acc_stderr": 0.025483115601195455, + "acc_norm": 0.7006172839506173, + "acc_norm_stderr": 0.025483115601195455 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.4645390070921986, + "acc_stderr": 0.02975238965742705, + "acc_norm": 0.4645390070921986, + "acc_norm_stderr": 0.02975238965742705 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.4706649282920469, + "acc_stderr": 0.012748238397365549, + "acc_norm": 0.4706649282920469, + "acc_norm_stderr": 0.012748238397365549 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.6066176470588235, + "acc_stderr": 0.029674288281311155, + "acc_norm": 0.6066176470588235, + "acc_norm_stderr": 0.029674288281311155 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.5947712418300654, + "acc_stderr": 0.019861155193829156, + "acc_norm": 0.5947712418300654, + "acc_norm_stderr": 0.019861155193829156 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.6545454545454545, + "acc_stderr": 0.04554619617541054, + "acc_norm": 0.6545454545454545, + "acc_norm_stderr": 0.04554619617541054 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.6612244897959184, + "acc_stderr": 0.030299506562154185, + "acc_norm": 0.6612244897959184, + "acc_norm_stderr": 0.030299506562154185 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.7263681592039801, + "acc_stderr": 0.03152439186555401, + "acc_norm": 0.7263681592039801, + "acc_norm_stderr": 0.03152439186555401 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.82, + "acc_stderr": 0.038612291966536934, + "acc_norm": 0.82, + "acc_norm_stderr": 0.038612291966536934 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.4939759036144578, + "acc_stderr": 0.03892212195333045, + "acc_norm": 0.4939759036144578, + "acc_norm_stderr": 0.03892212195333045 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.8245614035087719, + "acc_stderr": 0.02917088550072767, + "acc_norm": 0.8245614035087719, + "acc_norm_stderr": 0.02917088550072767 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.37454100367197063, + "mc1_stderr": 0.016943535128405324, + "mc2": 0.5428671891462921, + "mc2_stderr": 0.01582271764892174 + }, + "all": { + "acc": 0.5943320588218653, + "acc_stderr": 0.03407483365241444, + "acc_norm": 0.5982112253379429, + "acc_norm_stderr": 0.034053419641163256, + "mc1": 0.37454100367197063, + "mc1_stderr": 0.016943535128405324, + "mc2": 0.5428671891462921, + "mc2_stderr": 0.01582271764892174 + } + }, + "versions": { + "harness|arc:challenge|25": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + 
"harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "all": 0 + }, + "config_tasks": { + "harness|arc:challenge": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + 
"harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task" + }, + "summary_tasks": { + "harness|arc:challenge|25": { + "hashes": { + "hash_examples": "17b0cae357c0259e", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "3722289b79076c44", + "hash_cont_tokens": "8210decc6ff6f7df" + }, + "truncated": 0, + "non-truncated": 4687, + "padded": 4687, + "non-padded": 0, + "effective_few_shots": 25.0, + "num_truncated_few_shots": 0 + }, + "harness|hellaswag|10": { + "hashes": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "ececd684171f1ef2", + "hash_cont_tokens": "b3b9e9017afa63af" + }, + "truncated": 0, + "non-truncated": 40168, + "padded": 40113, + "non-padded": 55, + "effective_few_shots": 10.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hashes": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "c54ff61ad0273dd7", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-anatomy|5": { + "hashes": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + 
"hash_input_tokens": "be31a1e22aef5f90", + "hash_cont_tokens": "f11971a765cb609f" + }, + "truncated": 0, + "non-truncated": 540, + "padded": 540, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-astronomy|5": { + "hashes": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "277a7b1fad566940", + "hash_cont_tokens": "bf30e5d3f48250cb" + }, + "truncated": 0, + "non-truncated": 608, + "padded": 608, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-business_ethics|5": { + "hashes": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "ba552605bc116de5", + "hash_cont_tokens": "bc1dd9b2d995eb61" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hashes": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "428c7563d0b98ab9", + "hash_cont_tokens": "890a119624b3b935" + }, + "truncated": 0, + "non-truncated": 1060, + "padded": 1060, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_biology|5": { + "hashes": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "da036601573942e2", + "hash_cont_tokens": "875cde3af7a0ee14" + }, + "truncated": 0, + "non-truncated": 576, + "padded": 576, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_chemistry|5": { + "hashes": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "94e0196d6aded13d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_computer_science|5": { + "hashes": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "6e4d0f4a8d36690b", + "hash_cont_tokens": "ffc0fe414cdc4a83" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_mathematics|5": { + "hashes": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "614054d17109a25d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_medicine|5": { + "hashes": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "081bb2b524defd1c", + "hash_cont_tokens": "1f88b00d41957d82" + }, + "truncated": 0, + "non-truncated": 692, + "padded": 692, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_physics|5": { + "hashes": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "5421d9a1af86cbd4", + "hash_cont_tokens": "f7b8097afc16a47c" + }, + "truncated": 0, + "non-truncated": 408, + "padded": 408, + 
"non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-computer_security|5": { + "hashes": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "5e6b70ecb333cf18", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hashes": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "c2ef11a87264ceed", + "hash_cont_tokens": "aa0e8bc655f2f641" + }, + "truncated": 0, + "non-truncated": 940, + "padded": 940, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-econometrics|5": { + "hashes": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "ecaccd912a4c3978", + "hash_cont_tokens": "bfb7e3c3c88313f1" + }, + "truncated": 0, + "non-truncated": 456, + "padded": 456, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hashes": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "1590c84291399be8", + "hash_cont_tokens": "2425a3f084a591ef" + }, + "truncated": 0, + "non-truncated": 580, + "padded": 580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hashes": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "3269597f715b0da1", + "hash_cont_tokens": "f52691aef15a407b" + }, + "truncated": 0, + "non-truncated": 1512, + "padded": 1512, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-formal_logic|5": { + "hashes": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "a2800d20f3ab8d7c", + "hash_cont_tokens": "f515d598d9c21263" + }, + "truncated": 0, + "non-truncated": 504, + "padded": 504, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-global_facts|5": { + "hashes": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "94ed44b3772505ad", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_biology|5": { + "hashes": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "24423acb928db768", + "hash_cont_tokens": "bd85a4156a3613ee" + }, + "truncated": 0, + "non-truncated": 1240, + "padded": 1240, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hashes": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "831ff35c474e5cef", + "hash_cont_tokens": "a95c97af1c14e068" + }, + "truncated": 0, + "non-truncated": 812, + "padded": 812, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + 
"hashes": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "a20a96b44dcc5b30", + "hash_cont_tokens": "8abfedef914e33c9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hashes": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "5002f4ac8b1562ca", + "hash_cont_tokens": "674fc454bdc5ac93" + }, + "truncated": 0, + "non-truncated": 660, + "padded": 656, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_geography|5": { + "hashes": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "7c5547c7da5bc793", + "hash_cont_tokens": "03a5012b916274ea" + }, + "truncated": 0, + "non-truncated": 792, + "padded": 792, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hashes": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "f62991cb6a496b05", + "hash_cont_tokens": "a83effb8f76b7d7c" + }, + "truncated": 0, + "non-truncated": 772, + "padded": 772, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hashes": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "4cef2aff6e3d59ed", + "hash_cont_tokens": "c583432ad27fcfe0" + }, + "truncated": 0, + "non-truncated": 1560, + "padded": 1560, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hashes": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "6e2577ea4082ed2b", + "hash_cont_tokens": "24f5dc613660300b" + }, + "truncated": 0, + "non-truncated": 1080, + "padded": 1080, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hashes": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "c5fc9aeb1079c8e4", + "hash_cont_tokens": "f47f041de50333b9" + }, + "truncated": 0, + "non-truncated": 952, + "padded": 952, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_physics|5": { + "hashes": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "555fc385cffa84ca", + "hash_cont_tokens": "ba2efcd283e938cc" + }, + "truncated": 0, + "non-truncated": 604, + "padded": 604, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hashes": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "febd23cbf9973b7f", + "hash_cont_tokens": "942069cd363844d9" + }, + "truncated": 0, + "non-truncated": 2180, + "padded": 2180, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hashes": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": 
"4c5c8be5aafac432", + "hash_input_tokens": "400e55b56ee6fbd7", + "hash_cont_tokens": "955ed42b6f7fa019" + }, + "truncated": 0, + "non-truncated": 864, + "padded": 864, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hashes": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "c639cce12a46ebad", + "hash_cont_tokens": "cdd0b3dc06d933e5" + }, + "truncated": 0, + "non-truncated": 816, + "padded": 816, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hashes": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "b9762065cce6f3a6", + "hash_cont_tokens": "9a864184946033ac" + }, + "truncated": 0, + "non-truncated": 948, + "padded": 948, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_aging|5": { + "hashes": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "541a75f071dcf579", + "hash_cont_tokens": "142a4a8a1138a214" + }, + "truncated": 0, + "non-truncated": 892, + "padded": 892, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_sexuality|5": { + "hashes": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "04269e5c5a257dd9", + "hash_cont_tokens": "bc54813e809b796d" + }, + "truncated": 0, + "non-truncated": 524, + "padded": 524, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-international_law|5": { + "hashes": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "d93ba9d9d38e4397", + "hash_cont_tokens": "dc45b45fcda18e5d" + }, + "truncated": 0, + "non-truncated": 484, + "padded": 484, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-jurisprudence|5": { + "hashes": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "9eeaccd2698b4f5a", + "hash_cont_tokens": "e3a8cd951b6e3469" + }, + "truncated": 0, + "non-truncated": 432, + "padded": 432, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hashes": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "b4f08f544f2b7576", + "hash_cont_tokens": "1e80dbd30f6453d5" + }, + "truncated": 0, + "non-truncated": 652, + "padded": 648, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-machine_learning|5": { + "hashes": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "900c2a51f1174b9f", + "hash_cont_tokens": "9b37da7777378ca9" + }, + "truncated": 0, + "non-truncated": 448, + "padded": 448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-management|5": { + "hashes": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "6b36efb4689c6eca", + "hash_cont_tokens": "a01d6d39a83c4597" + }, + "truncated": 0, + "non-truncated": 412, + 
"padded": 412, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-marketing|5": { + "hashes": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "2aaac78a0cfed47a", + "hash_cont_tokens": "6aeaed4d823c98aa" + }, + "truncated": 0, + "non-truncated": 936, + "padded": 936, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-medical_genetics|5": { + "hashes": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "886ca823b41c094a", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-miscellaneous|5": { + "hashes": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "72fd71de7675e7d0", + "hash_cont_tokens": "9b0ab02a64603081" + }, + "truncated": 0, + "non-truncated": 3132, + "padded": 3132, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_disputes|5": { + "hashes": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "f3ca0dd8e7a1eb09", + "hash_cont_tokens": "8badf768f7b0467a" + }, + "truncated": 0, + "non-truncated": 1384, + "padded": 1354, + "non-padded": 30, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hashes": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "3e793631e951f23c", + "hash_cont_tokens": "32ae620376b2bbba" + }, + "truncated": 0, + "non-truncated": 3580, + "padded": 3580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-nutrition|5": { + "hashes": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "59753c2144ea93af", + "hash_cont_tokens": "3071def75bacc404" + }, + "truncated": 0, + "non-truncated": 1224, + "padded": 1224, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-philosophy|5": { + "hashes": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "bd8d3dbed15a8c34", + "hash_cont_tokens": "9f6ff69d23a48783" + }, + "truncated": 0, + "non-truncated": 1244, + "padded": 1244, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-prehistory|5": { + "hashes": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "3573cd87facbb7c5", + "hash_cont_tokens": "de469d2b981e32a3" + }, + "truncated": 0, + "non-truncated": 1296, + "padded": 1296, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_accounting|5": { + "hashes": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "17e721bc1a7cbb47", + "hash_cont_tokens": "c46f74d2dfc7b13b" + }, + "truncated": 0, + "non-truncated": 1128, + "padded": 1128, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_law|5": { + "hashes": { + 
"hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "c9f7583fff66d361", + "hash_cont_tokens": "2e590029ef41fbcd" + }, + "truncated": 0, + "non-truncated": 6136, + "padded": 6136, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_medicine|5": { + "hashes": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "40a933f829116f8d", + "hash_cont_tokens": "fe35cfa9c6ca802e" + }, + "truncated": 0, + "non-truncated": 1088, + "padded": 1088, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_psychology|5": { + "hashes": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "0dfb73a8eb3f692c", + "hash_cont_tokens": "f020fbddf72c8652" + }, + "truncated": 0, + "non-truncated": 2448, + "padded": 2448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-public_relations|5": { + "hashes": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "1710c6ba4c9f3cbd", + "hash_cont_tokens": "568f585a259965c1" + }, + "truncated": 0, + "non-truncated": 440, + "padded": 440, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-security_studies|5": { + "hashes": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "32a03f1f22a6e103", + "hash_cont_tokens": "cc6fd7cccd64cd5d" + }, + "truncated": 0, + "non-truncated": 980, + "padded": 980, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-sociology|5": { + "hashes": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "828999f7624cbe7e", + "hash_cont_tokens": "c3a3bdfd177eed5b" + }, + "truncated": 0, + "non-truncated": 804, + "padded": 804, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hashes": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "42054621e718dbee", + "hash_cont_tokens": "2568d0e8e36fa959" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-virology|5": { + "hashes": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "6c4f0aa4dc859c04", + "hash_cont_tokens": "926cf60b0891f374" + }, + "truncated": 0, + "non-truncated": 664, + "padded": 664, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-world_religions|5": { + "hashes": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "6c75d44e092ff24f", + "hash_cont_tokens": "c525a5de974c1ea3" + }, + "truncated": 0, + "non-truncated": 684, + "padded": 684, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|truthfulqa:mc|0": { + "hashes": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "2738d7ed7075faa7", + "hash_cont_tokens": "c014154380b74b9e" + }, + 
"truncated": 0, + "non-truncated": 9996, + "padded": 9996, + "non-padded": 0, + "effective_few_shots": 0.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "d84d18e9a963753d", + "hash_full_prompts": "12b540783521a8e6", + "hash_input_tokens": "5c73a7dce6ccf737", + "hash_cont_tokens": "fb1646e2bdd5fc38" + }, + "total_evaluation_time_secondes": "6773.803229808807", + "truncated": 0, + "non-truncated": 111019, + "padded": 110926, + "non-padded": 93, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/uukuguy/speechless-llama2-hermes-orca-platypus-13b/results_2023-10-18T05-34-26.777818.json b/eval-results/uukuguy/speechless-llama2-hermes-orca-platypus-13b/results_2023-10-18T05-34-26.777818.json new file mode 100644 index 0000000000000000000000000000000000000000..70a4dc382588e47f33c0ddaa970b5c218774021a --- /dev/null +++ b/eval-results/uukuguy/speechless-llama2-hermes-orca-platypus-13b/results_2023-10-18T05-34-26.777818.json @@ -0,0 +1,107 @@ +{ + "config_general": { + "model_name": "uukuguy/speechless-llama2-hermes-orca-platypus-13b", + "model_sha": "ebae4c06bbb90ac52eadbb27253ef9090292f4b9", + "model_size": "24.32 GB", + "model_dtype": "torch.bfloat16", + "lighteval_sha": "0f318ecf002208468154899217b3ba7c6ae09374", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|drop|3": { + "em": 0.020763422818791948, + "em_stderr": 0.0014602692459797255, + "f1": 0.1284448406040266, + "f1_stderr": 0.002326999769158526 + }, + "harness|gsm8k|5": { + "acc": 0.09704321455648218, + "acc_stderr": 0.00815376827455472 + }, + "harness|winogrande|5": { + "acc": 0.7521704814522494, + "acc_stderr": 0.012134386019865348 + }, + "all": { + "em": 0.020763422818791948, + "em_stderr": 0.0014602692459797255, + "f1": 0.1284448406040266, + "f1_stderr": 0.002326999769158526, + "acc": 0.42460684800436577, + "acc_stderr": 0.010144077147210034 + } + }, + "versions": { + "harness|drop|3": 1, + "harness|gsm8k|5": 0, + "harness|winogrande|5": 0, + "all": 0 + }, + "config_tasks": { + "harness|drop": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|drop|3": { + "hashes": { + "hash_examples": "1d27416e8324e9a3", + "hash_full_prompts": "a5513ff9a741b385", + "hash_input_tokens": "42076f0efbb50aa6", + "hash_cont_tokens": "ef529a6d709c0e5d" + }, + "truncated": 3, + "non-truncated": 9533, + "padded": 0, + "non-padded": 9536, + "effective_few_shots": 3.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "bda342e47b5099b2", + "hash_cont_tokens": "b46a251072331030" + }, + "truncated": 0, + "non-truncated": 1319, + "padded": 0, + "non-padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "c0bedf98cb040854", + "hash_cont_tokens": "f08975ad6f2d5864" + }, + "truncated": 0, + "non-truncated": 2534, + "padded": 2432, + "non-padded": 102, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "9b4d8993161e637d", + "hash_full_prompts": "08215e527b7e60a5", + "hash_input_tokens": "a12f3e3c934bd78b", + 
"hash_cont_tokens": "07083fb5bbea61fa" + }, + "total_evaluation_time_secondes": "11574.156665086746", + "truncated": 3, + "non-truncated": 13386, + "padded": 2432, + "non-padded": 10957, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/uukuguy/speechless-llama2-hermes-orca-platypus-wizardlm-13b/results_2023-09-02T00-07-11.850382.json b/eval-results/uukuguy/speechless-llama2-hermes-orca-platypus-wizardlm-13b/results_2023-09-02T00-07-11.850382.json new file mode 100644 index 0000000000000000000000000000000000000000..fc8d5c9a28227838af5adddd68cb99f2a09ea927 --- /dev/null +++ b/eval-results/uukuguy/speechless-llama2-hermes-orca-platypus-wizardlm-13b/results_2023-09-02T00-07-11.850382.json @@ -0,0 +1,1366 @@ +{ + "config_general": { + "model_name": "uukuguy/speechless-llama2-hermes-orca-platypus-wizardlm-13b", + "model_sha": "4410d8a20871927e9fe981c01bc8314b451b2fcd", + "model_dtype": "torch.bfloat16", + "lighteval_sha": "4b9a38c2102259daeb17d1d0294fc2b4ce7f4e63", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|arc:challenge|25": { + "acc": 0.5656996587030717, + "acc_stderr": 0.014484703048857357, + "acc_norm": 0.5964163822525598, + "acc_norm_stderr": 0.014337158914268448 + }, + "harness|hellaswag|10": { + "acc": 0.6240788687512447, + "acc_stderr": 0.004833699243292347, + "acc_norm": 0.8270264887472615, + "acc_norm_stderr": 0.003774513882615949 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.35, + "acc_stderr": 0.0479372485441102, + "acc_norm": 0.35, + "acc_norm_stderr": 0.0479372485441102 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.5407407407407407, + "acc_stderr": 0.04304979692464242, + "acc_norm": 0.5407407407407407, + "acc_norm_stderr": 0.04304979692464242 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.5855263157894737, + "acc_stderr": 0.04008973785779206, + "acc_norm": 0.5855263157894737, + "acc_norm_stderr": 0.04008973785779206 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.61, + "acc_stderr": 0.04902071300001975, + "acc_norm": 0.61, + "acc_norm_stderr": 0.04902071300001975 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.6264150943396226, + "acc_stderr": 0.02977308271331987, + "acc_norm": 0.6264150943396226, + "acc_norm_stderr": 0.02977308271331987 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.6597222222222222, + "acc_stderr": 0.039621355734862175, + "acc_norm": 0.6597222222222222, + "acc_norm_stderr": 0.039621355734862175 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.38, + "acc_stderr": 0.04878317312145633, + "acc_norm": 0.38, + "acc_norm_stderr": 0.04878317312145633 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.51, + "acc_stderr": 0.05024183937956911, + "acc_norm": 0.51, + "acc_norm_stderr": 0.05024183937956911 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.34, + "acc_stderr": 0.04760952285695236, + "acc_norm": 0.34, + "acc_norm_stderr": 0.04760952285695236 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.5491329479768786, + "acc_stderr": 0.037940126746970296, + "acc_norm": 0.5491329479768786, + "acc_norm_stderr": 0.037940126746970296 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.27450980392156865, + "acc_stderr": 0.04440521906179328, + "acc_norm": 0.27450980392156865, + "acc_norm_stderr": 0.04440521906179328 + }, + "harness|hendrycksTest-computer_security|5": 
{ + "acc": 0.7, + "acc_stderr": 0.046056618647183814, + "acc_norm": 0.7, + "acc_norm_stderr": 0.046056618647183814 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.502127659574468, + "acc_stderr": 0.03268572658667492, + "acc_norm": 0.502127659574468, + "acc_norm_stderr": 0.03268572658667492 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.2807017543859649, + "acc_stderr": 0.042270544512322, + "acc_norm": 0.2807017543859649, + "acc_norm_stderr": 0.042270544512322 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.5172413793103449, + "acc_stderr": 0.04164188720169375, + "acc_norm": 0.5172413793103449, + "acc_norm_stderr": 0.04164188720169375 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.35185185185185186, + "acc_stderr": 0.024594975128920938, + "acc_norm": 0.35185185185185186, + "acc_norm_stderr": 0.024594975128920938 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.3968253968253968, + "acc_stderr": 0.04375888492727061, + "acc_norm": 0.3968253968253968, + "acc_norm_stderr": 0.04375888492727061 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.38, + "acc_stderr": 0.048783173121456316, + "acc_norm": 0.38, + "acc_norm_stderr": 0.048783173121456316 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.667741935483871, + "acc_stderr": 0.026795560848122794, + "acc_norm": 0.667741935483871, + "acc_norm_stderr": 0.026795560848122794 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.5024630541871922, + "acc_stderr": 0.035179450386910616, + "acc_norm": 0.5024630541871922, + "acc_norm_stderr": 0.035179450386910616 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.59, + "acc_stderr": 0.04943110704237102, + "acc_norm": 0.59, + "acc_norm_stderr": 0.04943110704237102 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.7090909090909091, + "acc_stderr": 0.03546563019624336, + "acc_norm": 0.7090909090909091, + "acc_norm_stderr": 0.03546563019624336 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.7474747474747475, + "acc_stderr": 0.030954055470365897, + "acc_norm": 0.7474747474747475, + "acc_norm_stderr": 0.030954055470365897 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.8549222797927462, + "acc_stderr": 0.025416343096306433, + "acc_norm": 0.8549222797927462, + "acc_norm_stderr": 0.025416343096306433 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.5871794871794872, + "acc_stderr": 0.024962683564331806, + "acc_norm": 0.5871794871794872, + "acc_norm_stderr": 0.024962683564331806 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.32592592592592595, + "acc_stderr": 0.028578348365473075, + "acc_norm": 0.32592592592592595, + "acc_norm_stderr": 0.028578348365473075 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.6134453781512605, + "acc_stderr": 0.03163145807552379, + "acc_norm": 0.6134453781512605, + "acc_norm_stderr": 0.03163145807552379 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.304635761589404, + "acc_stderr": 0.03757949922943343, + "acc_norm": 0.304635761589404, + "acc_norm_stderr": 0.03757949922943343 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.7889908256880734, + "acc_stderr": 0.01749392240411265, + "acc_norm": 0.7889908256880734, + "acc_norm_stderr": 0.01749392240411265 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.4074074074074074, + 
"acc_stderr": 0.03350991604696043, + "acc_norm": 0.4074074074074074, + "acc_norm_stderr": 0.03350991604696043 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.8137254901960784, + "acc_stderr": 0.027325470966716312, + "acc_norm": 0.8137254901960784, + "acc_norm_stderr": 0.027325470966716312 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.7679324894514767, + "acc_stderr": 0.027479744550808514, + "acc_norm": 0.7679324894514767, + "acc_norm_stderr": 0.027479744550808514 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.6771300448430493, + "acc_stderr": 0.03138147637575498, + "acc_norm": 0.6771300448430493, + "acc_norm_stderr": 0.03138147637575498 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.6259541984732825, + "acc_stderr": 0.042438692422305246, + "acc_norm": 0.6259541984732825, + "acc_norm_stderr": 0.042438692422305246 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.7024793388429752, + "acc_stderr": 0.04173349148083499, + "acc_norm": 0.7024793388429752, + "acc_norm_stderr": 0.04173349148083499 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.7592592592592593, + "acc_stderr": 0.04133119440243839, + "acc_norm": 0.7592592592592593, + "acc_norm_stderr": 0.04133119440243839 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.6809815950920245, + "acc_stderr": 0.03661997551073836, + "acc_norm": 0.6809815950920245, + "acc_norm_stderr": 0.03661997551073836 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.375, + "acc_stderr": 0.04595091388086298, + "acc_norm": 0.375, + "acc_norm_stderr": 0.04595091388086298 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.7475728155339806, + "acc_stderr": 0.04301250399690878, + "acc_norm": 0.7475728155339806, + "acc_norm_stderr": 0.04301250399690878 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.8205128205128205, + "acc_stderr": 0.025140935950335445, + "acc_norm": 0.8205128205128205, + "acc_norm_stderr": 0.025140935950335445 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.61, + "acc_stderr": 0.04902071300001974, + "acc_norm": 0.61, + "acc_norm_stderr": 0.04902071300001974 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.7650063856960408, + "acc_stderr": 0.015162024152278446, + "acc_norm": 0.7650063856960408, + "acc_norm_stderr": 0.015162024152278446 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.6647398843930635, + "acc_stderr": 0.02541600377316555, + "acc_norm": 0.6647398843930635, + "acc_norm_stderr": 0.02541600377316555 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.41787709497206704, + "acc_stderr": 0.01649540063582008, + "acc_norm": 0.41787709497206704, + "acc_norm_stderr": 0.01649540063582008 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.6470588235294118, + "acc_stderr": 0.02736359328468497, + "acc_norm": 0.6470588235294118, + "acc_norm_stderr": 0.02736359328468497 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.6430868167202572, + "acc_stderr": 0.027210420375934023, + "acc_norm": 0.6430868167202572, + "acc_norm_stderr": 0.027210420375934023 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.6481481481481481, + "acc_stderr": 0.026571483480719964, + "acc_norm": 0.6481481481481481, + "acc_norm_stderr": 0.026571483480719964 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.4574468085106383, + "acc_stderr": 0.02971928127223684, + "acc_norm": 0.4574468085106383, + "acc_norm_stderr": 0.02971928127223684 + }, + 
"harness|hendrycksTest-professional_law|5": { + "acc": 0.4406779661016949, + "acc_stderr": 0.012680037994097067, + "acc_norm": 0.4406779661016949, + "acc_norm_stderr": 0.012680037994097067 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.5882352941176471, + "acc_stderr": 0.02989616303312547, + "acc_norm": 0.5882352941176471, + "acc_norm_stderr": 0.02989616303312547 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.5800653594771242, + "acc_stderr": 0.019966811178256483, + "acc_norm": 0.5800653594771242, + "acc_norm_stderr": 0.019966811178256483 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.6909090909090909, + "acc_stderr": 0.044262946482000985, + "acc_norm": 0.6909090909090909, + "acc_norm_stderr": 0.044262946482000985 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.6489795918367347, + "acc_stderr": 0.03055531675557364, + "acc_norm": 0.6489795918367347, + "acc_norm_stderr": 0.03055531675557364 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.7611940298507462, + "acc_stderr": 0.03014777593540922, + "acc_norm": 0.7611940298507462, + "acc_norm_stderr": 0.03014777593540922 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.8, + "acc_stderr": 0.04020151261036846, + "acc_norm": 0.8, + "acc_norm_stderr": 0.04020151261036846 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.4819277108433735, + "acc_stderr": 0.038899512528272166, + "acc_norm": 0.4819277108433735, + "acc_norm_stderr": 0.038899512528272166 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.7602339181286549, + "acc_stderr": 0.03274485211946956, + "acc_norm": 0.7602339181286549, + "acc_norm_stderr": 0.03274485211946956 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.3880048959608323, + "mc1_stderr": 0.017058761501347972, + "mc2": 0.5599582588635479, + "mc2_stderr": 0.015546326272126147 + }, + "all": { + "acc": 0.5833895996915851, + "acc_stderr": 0.03412386866499071, + "acc_norm": 0.5873500122940513, + "acc_norm_stderr": 0.03410341562269808, + "mc1": 0.3880048959608323, + "mc1_stderr": 0.017058761501347972, + "mc2": 0.5599582588635479, + "mc2_stderr": 0.015546326272126147 + } + }, + "versions": { + "harness|arc:challenge|25": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + 
"harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "all": 0 + }, + "config_tasks": { + "harness|arc:challenge": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + 
"harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task" + }, + "summary_tasks": { + "harness|arc:challenge|25": { + "hashes": { + "hash_examples": "17b0cae357c0259e", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "3722289b79076c44", + "hash_cont_tokens": "8210decc6ff6f7df" + }, + "truncated": 0, + "non-truncated": 4687, + "padded": 4687, + "non-padded": 0, + "effective_few_shots": 25.0, + "num_truncated_few_shots": 0 + }, + "harness|hellaswag|10": { + "hashes": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "ececd684171f1ef2", + "hash_cont_tokens": "b3b9e9017afa63af" + }, + "truncated": 0, + "non-truncated": 40168, + "padded": 40113, + "non-padded": 55, + "effective_few_shots": 10.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hashes": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "c54ff61ad0273dd7", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-anatomy|5": { + "hashes": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "be31a1e22aef5f90", + "hash_cont_tokens": "f11971a765cb609f" + }, + "truncated": 0, + "non-truncated": 540, + "padded": 540, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-astronomy|5": { + "hashes": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": 
"faf4e80f65de93ca", + "hash_input_tokens": "277a7b1fad566940", + "hash_cont_tokens": "bf30e5d3f48250cb" + }, + "truncated": 0, + "non-truncated": 608, + "padded": 608, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-business_ethics|5": { + "hashes": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "ba552605bc116de5", + "hash_cont_tokens": "bc1dd9b2d995eb61" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hashes": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "428c7563d0b98ab9", + "hash_cont_tokens": "890a119624b3b935" + }, + "truncated": 0, + "non-truncated": 1060, + "padded": 1060, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_biology|5": { + "hashes": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "da036601573942e2", + "hash_cont_tokens": "875cde3af7a0ee14" + }, + "truncated": 0, + "non-truncated": 576, + "padded": 576, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_chemistry|5": { + "hashes": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "94e0196d6aded13d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_computer_science|5": { + "hashes": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "6e4d0f4a8d36690b", + "hash_cont_tokens": "ffc0fe414cdc4a83" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_mathematics|5": { + "hashes": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "614054d17109a25d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_medicine|5": { + "hashes": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "081bb2b524defd1c", + "hash_cont_tokens": "1f88b00d41957d82" + }, + "truncated": 0, + "non-truncated": 692, + "padded": 692, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_physics|5": { + "hashes": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "5421d9a1af86cbd4", + "hash_cont_tokens": "f7b8097afc16a47c" + }, + "truncated": 0, + "non-truncated": 408, + "padded": 408, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-computer_security|5": { + "hashes": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "5e6b70ecb333cf18", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 
400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hashes": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "c2ef11a87264ceed", + "hash_cont_tokens": "aa0e8bc655f2f641" + }, + "truncated": 0, + "non-truncated": 940, + "padded": 940, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-econometrics|5": { + "hashes": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "ecaccd912a4c3978", + "hash_cont_tokens": "bfb7e3c3c88313f1" + }, + "truncated": 0, + "non-truncated": 456, + "padded": 456, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hashes": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "1590c84291399be8", + "hash_cont_tokens": "2425a3f084a591ef" + }, + "truncated": 0, + "non-truncated": 580, + "padded": 580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hashes": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "3269597f715b0da1", + "hash_cont_tokens": "f52691aef15a407b" + }, + "truncated": 0, + "non-truncated": 1512, + "padded": 1512, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-formal_logic|5": { + "hashes": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "a2800d20f3ab8d7c", + "hash_cont_tokens": "f515d598d9c21263" + }, + "truncated": 0, + "non-truncated": 504, + "padded": 504, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-global_facts|5": { + "hashes": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "94ed44b3772505ad", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_biology|5": { + "hashes": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "24423acb928db768", + "hash_cont_tokens": "bd85a4156a3613ee" + }, + "truncated": 0, + "non-truncated": 1240, + "padded": 1240, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hashes": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "831ff35c474e5cef", + "hash_cont_tokens": "a95c97af1c14e068" + }, + "truncated": 0, + "non-truncated": 812, + "padded": 812, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hashes": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "a20a96b44dcc5b30", + "hash_cont_tokens": "8abfedef914e33c9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + 
"harness|hendrycksTest-high_school_european_history|5": { + "hashes": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "5002f4ac8b1562ca", + "hash_cont_tokens": "674fc454bdc5ac93" + }, + "truncated": 0, + "non-truncated": 660, + "padded": 656, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_geography|5": { + "hashes": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "7c5547c7da5bc793", + "hash_cont_tokens": "03a5012b916274ea" + }, + "truncated": 0, + "non-truncated": 792, + "padded": 792, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hashes": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "f62991cb6a496b05", + "hash_cont_tokens": "a83effb8f76b7d7c" + }, + "truncated": 0, + "non-truncated": 772, + "padded": 772, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hashes": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "4cef2aff6e3d59ed", + "hash_cont_tokens": "c583432ad27fcfe0" + }, + "truncated": 0, + "non-truncated": 1560, + "padded": 1560, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hashes": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "6e2577ea4082ed2b", + "hash_cont_tokens": "24f5dc613660300b" + }, + "truncated": 0, + "non-truncated": 1080, + "padded": 1080, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hashes": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "c5fc9aeb1079c8e4", + "hash_cont_tokens": "f47f041de50333b9" + }, + "truncated": 0, + "non-truncated": 952, + "padded": 952, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_physics|5": { + "hashes": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "555fc385cffa84ca", + "hash_cont_tokens": "ba2efcd283e938cc" + }, + "truncated": 0, + "non-truncated": 604, + "padded": 604, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hashes": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "febd23cbf9973b7f", + "hash_cont_tokens": "942069cd363844d9" + }, + "truncated": 0, + "non-truncated": 2180, + "padded": 2180, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hashes": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "400e55b56ee6fbd7", + "hash_cont_tokens": "955ed42b6f7fa019" + }, + "truncated": 0, + "non-truncated": 864, + "padded": 864, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hashes": { + 
"hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "c639cce12a46ebad", + "hash_cont_tokens": "cdd0b3dc06d933e5" + }, + "truncated": 0, + "non-truncated": 816, + "padded": 816, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hashes": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "b9762065cce6f3a6", + "hash_cont_tokens": "9a864184946033ac" + }, + "truncated": 0, + "non-truncated": 948, + "padded": 948, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_aging|5": { + "hashes": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "541a75f071dcf579", + "hash_cont_tokens": "142a4a8a1138a214" + }, + "truncated": 0, + "non-truncated": 892, + "padded": 892, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_sexuality|5": { + "hashes": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "04269e5c5a257dd9", + "hash_cont_tokens": "bc54813e809b796d" + }, + "truncated": 0, + "non-truncated": 524, + "padded": 524, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-international_law|5": { + "hashes": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "d93ba9d9d38e4397", + "hash_cont_tokens": "dc45b45fcda18e5d" + }, + "truncated": 0, + "non-truncated": 484, + "padded": 484, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-jurisprudence|5": { + "hashes": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "9eeaccd2698b4f5a", + "hash_cont_tokens": "e3a8cd951b6e3469" + }, + "truncated": 0, + "non-truncated": 432, + "padded": 432, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hashes": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "b4f08f544f2b7576", + "hash_cont_tokens": "1e80dbd30f6453d5" + }, + "truncated": 0, + "non-truncated": 652, + "padded": 648, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-machine_learning|5": { + "hashes": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "900c2a51f1174b9f", + "hash_cont_tokens": "9b37da7777378ca9" + }, + "truncated": 0, + "non-truncated": 448, + "padded": 448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-management|5": { + "hashes": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "6b36efb4689c6eca", + "hash_cont_tokens": "a01d6d39a83c4597" + }, + "truncated": 0, + "non-truncated": 412, + "padded": 412, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-marketing|5": { + "hashes": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "2aaac78a0cfed47a", + "hash_cont_tokens": "6aeaed4d823c98aa" + }, + 
"truncated": 0, + "non-truncated": 936, + "padded": 936, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-medical_genetics|5": { + "hashes": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "886ca823b41c094a", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-miscellaneous|5": { + "hashes": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "72fd71de7675e7d0", + "hash_cont_tokens": "9b0ab02a64603081" + }, + "truncated": 0, + "non-truncated": 3132, + "padded": 3132, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_disputes|5": { + "hashes": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "f3ca0dd8e7a1eb09", + "hash_cont_tokens": "8badf768f7b0467a" + }, + "truncated": 0, + "non-truncated": 1384, + "padded": 1354, + "non-padded": 30, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hashes": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "3e793631e951f23c", + "hash_cont_tokens": "32ae620376b2bbba" + }, + "truncated": 0, + "non-truncated": 3580, + "padded": 3580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-nutrition|5": { + "hashes": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "59753c2144ea93af", + "hash_cont_tokens": "3071def75bacc404" + }, + "truncated": 0, + "non-truncated": 1224, + "padded": 1224, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-philosophy|5": { + "hashes": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "bd8d3dbed15a8c34", + "hash_cont_tokens": "9f6ff69d23a48783" + }, + "truncated": 0, + "non-truncated": 1244, + "padded": 1244, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-prehistory|5": { + "hashes": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "3573cd87facbb7c5", + "hash_cont_tokens": "de469d2b981e32a3" + }, + "truncated": 0, + "non-truncated": 1296, + "padded": 1296, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_accounting|5": { + "hashes": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "17e721bc1a7cbb47", + "hash_cont_tokens": "c46f74d2dfc7b13b" + }, + "truncated": 0, + "non-truncated": 1128, + "padded": 1128, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_law|5": { + "hashes": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "c9f7583fff66d361", + "hash_cont_tokens": "2e590029ef41fbcd" + }, + "truncated": 0, + "non-truncated": 6136, + "padded": 6136, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + 
"harness|hendrycksTest-professional_medicine|5": { + "hashes": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "40a933f829116f8d", + "hash_cont_tokens": "fe35cfa9c6ca802e" + }, + "truncated": 0, + "non-truncated": 1088, + "padded": 1088, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_psychology|5": { + "hashes": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "0dfb73a8eb3f692c", + "hash_cont_tokens": "f020fbddf72c8652" + }, + "truncated": 0, + "non-truncated": 2448, + "padded": 2448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-public_relations|5": { + "hashes": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "1710c6ba4c9f3cbd", + "hash_cont_tokens": "568f585a259965c1" + }, + "truncated": 0, + "non-truncated": 440, + "padded": 440, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-security_studies|5": { + "hashes": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "32a03f1f22a6e103", + "hash_cont_tokens": "cc6fd7cccd64cd5d" + }, + "truncated": 0, + "non-truncated": 980, + "padded": 980, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-sociology|5": { + "hashes": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "828999f7624cbe7e", + "hash_cont_tokens": "c3a3bdfd177eed5b" + }, + "truncated": 0, + "non-truncated": 804, + "padded": 804, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hashes": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "42054621e718dbee", + "hash_cont_tokens": "2568d0e8e36fa959" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-virology|5": { + "hashes": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "6c4f0aa4dc859c04", + "hash_cont_tokens": "926cf60b0891f374" + }, + "truncated": 0, + "non-truncated": 664, + "padded": 664, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-world_religions|5": { + "hashes": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "6c75d44e092ff24f", + "hash_cont_tokens": "c525a5de974c1ea3" + }, + "truncated": 0, + "non-truncated": 684, + "padded": 684, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|truthfulqa:mc|0": { + "hashes": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "2738d7ed7075faa7", + "hash_cont_tokens": "c014154380b74b9e" + }, + "truncated": 0, + "non-truncated": 9996, + "padded": 9996, + "non-padded": 0, + "effective_few_shots": 0.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "d84d18e9a963753d", + "hash_full_prompts": "12b540783521a8e6", + "hash_input_tokens": "5c73a7dce6ccf737", + 
"hash_cont_tokens": "fb1646e2bdd5fc38" + }, + "total_evaluation_time_secondes": "6767.007708787918", + "truncated": 0, + "non-truncated": 111019, + "padded": 110926, + "non-padded": 93, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/uukuguy/speechless-llama2-hermes-orca-platypus-wizardlm-13b/results_2023-09-12T15-48-02.156025.json b/eval-results/uukuguy/speechless-llama2-hermes-orca-platypus-wizardlm-13b/results_2023-09-12T15-48-02.156025.json new file mode 100644 index 0000000000000000000000000000000000000000..3519bde6ab5ff1583f6f717af403f6739dc3d0e5 --- /dev/null +++ b/eval-results/uukuguy/speechless-llama2-hermes-orca-platypus-wizardlm-13b/results_2023-09-12T15-48-02.156025.json @@ -0,0 +1,1367 @@ +{ + "config_general": { + "model_name": "uukuguy/speechless-llama2-hermes-orca-platypus-wizardlm-13b", + "model_sha": "4410d8a20871927e9fe981c01bc8314b451b2fcd", + "model_size": "24.32 GB", + "model_dtype": "torch.float16", + "lighteval_sha": "ff467795ccc45b291b69333c263d5f16abd1fcd9", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|arc:challenge|25": { + "acc": 0.5691126279863481, + "acc_stderr": 0.01447113339264247, + "acc_norm": 0.5955631399317406, + "acc_norm_stderr": 0.014342036483436175 + }, + "harness|hellaswag|10": { + "acc": 0.6244771957777335, + "acc_stderr": 0.00483267918878879, + "acc_norm": 0.8260306711810397, + "acc_norm_stderr": 0.0037830836739860606 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.36, + "acc_stderr": 0.04824181513244218, + "acc_norm": 0.36, + "acc_norm_stderr": 0.04824181513244218 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.5481481481481482, + "acc_stderr": 0.04299268905480864, + "acc_norm": 0.5481481481481482, + "acc_norm_stderr": 0.04299268905480864 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.5986842105263158, + "acc_stderr": 0.03988903703336284, + "acc_norm": 0.5986842105263158, + "acc_norm_stderr": 0.03988903703336284 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.61, + "acc_stderr": 0.04902071300001975, + "acc_norm": 0.61, + "acc_norm_stderr": 0.04902071300001975 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.6264150943396226, + "acc_stderr": 0.02977308271331987, + "acc_norm": 0.6264150943396226, + "acc_norm_stderr": 0.02977308271331987 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.6666666666666666, + "acc_stderr": 0.03942082639927213, + "acc_norm": 0.6666666666666666, + "acc_norm_stderr": 0.03942082639927213 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.38, + "acc_stderr": 0.04878317312145633, + "acc_norm": 0.38, + "acc_norm_stderr": 0.04878317312145633 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.52, + "acc_stderr": 0.050211673156867795, + "acc_norm": 0.52, + "acc_norm_stderr": 0.050211673156867795 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.35, + "acc_stderr": 0.0479372485441102, + "acc_norm": 0.35, + "acc_norm_stderr": 0.0479372485441102 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.5491329479768786, + "acc_stderr": 0.037940126746970296, + "acc_norm": 0.5491329479768786, + "acc_norm_stderr": 0.037940126746970296 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.27450980392156865, + "acc_stderr": 0.04440521906179328, + "acc_norm": 0.27450980392156865, + "acc_norm_stderr": 0.04440521906179328 + }, + 
"harness|hendrycksTest-computer_security|5": { + "acc": 0.68, + "acc_stderr": 0.04688261722621505, + "acc_norm": 0.68, + "acc_norm_stderr": 0.04688261722621505 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.5063829787234042, + "acc_stderr": 0.03268335899936336, + "acc_norm": 0.5063829787234042, + "acc_norm_stderr": 0.03268335899936336 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.2807017543859649, + "acc_stderr": 0.042270544512322, + "acc_norm": 0.2807017543859649, + "acc_norm_stderr": 0.042270544512322 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.5172413793103449, + "acc_stderr": 0.04164188720169375, + "acc_norm": 0.5172413793103449, + "acc_norm_stderr": 0.04164188720169375 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.36243386243386244, + "acc_stderr": 0.024757473902752056, + "acc_norm": 0.36243386243386244, + "acc_norm_stderr": 0.024757473902752056 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.40476190476190477, + "acc_stderr": 0.04390259265377562, + "acc_norm": 0.40476190476190477, + "acc_norm_stderr": 0.04390259265377562 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.38, + "acc_stderr": 0.048783173121456316, + "acc_norm": 0.38, + "acc_norm_stderr": 0.048783173121456316 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.6709677419354839, + "acc_stderr": 0.02672949906834996, + "acc_norm": 0.6709677419354839, + "acc_norm_stderr": 0.02672949906834996 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.4827586206896552, + "acc_stderr": 0.035158955511656986, + "acc_norm": 0.4827586206896552, + "acc_norm_stderr": 0.035158955511656986 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.58, + "acc_stderr": 0.049604496374885836, + "acc_norm": 0.58, + "acc_norm_stderr": 0.049604496374885836 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.7151515151515152, + "acc_stderr": 0.03524390844511781, + "acc_norm": 0.7151515151515152, + "acc_norm_stderr": 0.03524390844511781 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.7474747474747475, + "acc_stderr": 0.030954055470365897, + "acc_norm": 0.7474747474747475, + "acc_norm_stderr": 0.030954055470365897 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.8549222797927462, + "acc_stderr": 0.025416343096306433, + "acc_norm": 0.8549222797927462, + "acc_norm_stderr": 0.025416343096306433 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.5897435897435898, + "acc_stderr": 0.024939313906940788, + "acc_norm": 0.5897435897435898, + "acc_norm_stderr": 0.024939313906940788 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.3333333333333333, + "acc_stderr": 0.028742040903948492, + "acc_norm": 0.3333333333333333, + "acc_norm_stderr": 0.028742040903948492 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.6092436974789915, + "acc_stderr": 0.03169380235712996, + "acc_norm": 0.6092436974789915, + "acc_norm_stderr": 0.03169380235712996 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.2980132450331126, + "acc_stderr": 0.037345356767871984, + "acc_norm": 0.2980132450331126, + "acc_norm_stderr": 0.037345356767871984 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.7944954128440367, + "acc_stderr": 0.017324352325016015, + "acc_norm": 0.7944954128440367, + "acc_norm_stderr": 0.017324352325016015 + }, + 
"harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.4074074074074074, + "acc_stderr": 0.03350991604696042, + "acc_norm": 0.4074074074074074, + "acc_norm_stderr": 0.03350991604696042 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.8088235294117647, + "acc_stderr": 0.02759917430064077, + "acc_norm": 0.8088235294117647, + "acc_norm_stderr": 0.02759917430064077 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.7679324894514767, + "acc_stderr": 0.027479744550808517, + "acc_norm": 0.7679324894514767, + "acc_norm_stderr": 0.027479744550808517 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.6771300448430493, + "acc_stderr": 0.03138147637575498, + "acc_norm": 0.6771300448430493, + "acc_norm_stderr": 0.03138147637575498 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.6259541984732825, + "acc_stderr": 0.042438692422305246, + "acc_norm": 0.6259541984732825, + "acc_norm_stderr": 0.042438692422305246 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.6942148760330579, + "acc_stderr": 0.042059539338841226, + "acc_norm": 0.6942148760330579, + "acc_norm_stderr": 0.042059539338841226 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.7592592592592593, + "acc_stderr": 0.04133119440243839, + "acc_norm": 0.7592592592592593, + "acc_norm_stderr": 0.04133119440243839 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.6748466257668712, + "acc_stderr": 0.036803503712864616, + "acc_norm": 0.6748466257668712, + "acc_norm_stderr": 0.036803503712864616 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.36607142857142855, + "acc_stderr": 0.045723723587374296, + "acc_norm": 0.36607142857142855, + "acc_norm_stderr": 0.045723723587374296 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.7475728155339806, + "acc_stderr": 0.04301250399690878, + "acc_norm": 0.7475728155339806, + "acc_norm_stderr": 0.04301250399690878 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.8205128205128205, + "acc_stderr": 0.025140935950335445, + "acc_norm": 0.8205128205128205, + "acc_norm_stderr": 0.025140935950335445 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.6, + "acc_stderr": 0.049236596391733084, + "acc_norm": 0.6, + "acc_norm_stderr": 0.049236596391733084 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.7675606641123882, + "acc_stderr": 0.015104550008905709, + "acc_norm": 0.7675606641123882, + "acc_norm_stderr": 0.015104550008905709 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.6647398843930635, + "acc_stderr": 0.02541600377316555, + "acc_norm": 0.6647398843930635, + "acc_norm_stderr": 0.02541600377316555 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.4134078212290503, + "acc_stderr": 0.016469814928406178, + "acc_norm": 0.4134078212290503, + "acc_norm_stderr": 0.016469814928406178 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.6470588235294118, + "acc_stderr": 0.027363593284684972, + "acc_norm": 0.6470588235294118, + "acc_norm_stderr": 0.027363593284684972 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.6430868167202572, + "acc_stderr": 0.027210420375934023, + "acc_norm": 0.6430868167202572, + "acc_norm_stderr": 0.027210420375934023 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.654320987654321, + "acc_stderr": 0.02646248777700187, + "acc_norm": 0.654320987654321, + "acc_norm_stderr": 0.02646248777700187 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.46099290780141844, + 
"acc_stderr": 0.029736592526424434, + "acc_norm": 0.46099290780141844, + "acc_norm_stderr": 0.029736592526424434 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.44198174706649285, + "acc_stderr": 0.012683972513598813, + "acc_norm": 0.44198174706649285, + "acc_norm_stderr": 0.012683972513598813 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.5845588235294118, + "acc_stderr": 0.029935342707877753, + "acc_norm": 0.5845588235294118, + "acc_norm_stderr": 0.029935342707877753 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.5784313725490197, + "acc_stderr": 0.019977422600227477, + "acc_norm": 0.5784313725490197, + "acc_norm_stderr": 0.019977422600227477 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.7, + "acc_stderr": 0.04389311454644287, + "acc_norm": 0.7, + "acc_norm_stderr": 0.04389311454644287 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.6530612244897959, + "acc_stderr": 0.030472526026726492, + "acc_norm": 0.6530612244897959, + "acc_norm_stderr": 0.030472526026726492 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.7611940298507462, + "acc_stderr": 0.03014777593540922, + "acc_norm": 0.7611940298507462, + "acc_norm_stderr": 0.03014777593540922 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.81, + "acc_stderr": 0.03942772444036625, + "acc_norm": 0.81, + "acc_norm_stderr": 0.03942772444036625 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.4819277108433735, + "acc_stderr": 0.038899512528272166, + "acc_norm": 0.4819277108433735, + "acc_norm_stderr": 0.038899512528272166 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.7543859649122807, + "acc_stderr": 0.0330140594698725, + "acc_norm": 0.7543859649122807, + "acc_norm_stderr": 0.0330140594698725 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.38922888616891066, + "mc1_stderr": 0.01706855268069033, + "mc2": 0.5602490758700006, + "mc2_stderr": 0.015553402773010839 + }, + "all": { + "acc": 0.583918763260702, + "acc_stderr": 0.0341343237781577, + "acc_norm": 0.5877832376225443, + "acc_norm_stderr": 0.034114345940462636, + "mc1": 0.38922888616891066, + "mc1_stderr": 0.01706855268069033, + "mc2": 0.5602490758700006, + "mc2_stderr": 0.015553402773010839 + } + }, + "versions": { + "harness|arc:challenge|25": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + 
"harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "all": 0 + }, + "config_tasks": { + "harness|arc:challenge": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM 
Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task" + }, + "summary_tasks": { + "harness|arc:challenge|25": { + "hashes": { + "hash_examples": "17b0cae357c0259e", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "3722289b79076c44", + "hash_cont_tokens": "e8abf848493b50f7" + }, + "truncated": 0, + "non-truncated": 4687, + "padded": 4687, + "non-padded": 0, + "effective_few_shots": 25.0, + "num_truncated_few_shots": 0 + }, + "harness|hellaswag|10": { + "hashes": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "ececd684171f1ef2", + "hash_cont_tokens": "9fe0a5c42e1532db" + }, + "truncated": 0, + "non-truncated": 40168, + "padded": 40113, + "non-padded": 55, + "effective_few_shots": 10.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hashes": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "c54ff61ad0273dd7", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-anatomy|5": { + "hashes": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "be31a1e22aef5f90", + "hash_cont_tokens": "f11971a765cb609f" + }, + "truncated": 0, + "non-truncated": 540, + "padded": 540, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-astronomy|5": { + 
"hashes": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "277a7b1fad566940", + "hash_cont_tokens": "440a970fadecdc7b" + }, + "truncated": 0, + "non-truncated": 608, + "padded": 608, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-business_ethics|5": { + "hashes": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "ba552605bc116de5", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hashes": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "428c7563d0b98ab9", + "hash_cont_tokens": "7ecd60c25b9bfe5b" + }, + "truncated": 0, + "non-truncated": 1060, + "padded": 1060, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_biology|5": { + "hashes": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "da036601573942e2", + "hash_cont_tokens": "875cde3af7a0ee14" + }, + "truncated": 0, + "non-truncated": 576, + "padded": 576, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_chemistry|5": { + "hashes": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "94e0196d6aded13d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_computer_science|5": { + "hashes": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "6e4d0f4a8d36690b", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_mathematics|5": { + "hashes": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "614054d17109a25d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_medicine|5": { + "hashes": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "081bb2b524defd1c", + "hash_cont_tokens": "702fb6d82ff0d6ac" + }, + "truncated": 0, + "non-truncated": 692, + "padded": 692, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_physics|5": { + "hashes": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "5421d9a1af86cbd4", + "hash_cont_tokens": "f7b8097afc16a47c" + }, + "truncated": 0, + "non-truncated": 408, + "padded": 408, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-computer_security|5": { + "hashes": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "5e6b70ecb333cf18", + 
"hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hashes": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "c2ef11a87264ceed", + "hash_cont_tokens": "aa0e8bc655f2f641" + }, + "truncated": 0, + "non-truncated": 940, + "padded": 940, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-econometrics|5": { + "hashes": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "ecaccd912a4c3978", + "hash_cont_tokens": "b1cc6e7e9fcd3827" + }, + "truncated": 0, + "non-truncated": 456, + "padded": 456, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hashes": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "1590c84291399be8", + "hash_cont_tokens": "2425a3f084a591ef" + }, + "truncated": 0, + "non-truncated": 580, + "padded": 580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hashes": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "3269597f715b0da1", + "hash_cont_tokens": "bd87bf0c060fd925" + }, + "truncated": 0, + "non-truncated": 1512, + "padded": 1512, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-formal_logic|5": { + "hashes": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "a2800d20f3ab8d7c", + "hash_cont_tokens": "eb8932890e0605db" + }, + "truncated": 0, + "non-truncated": 504, + "padded": 504, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-global_facts|5": { + "hashes": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "94ed44b3772505ad", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_biology|5": { + "hashes": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "24423acb928db768", + "hash_cont_tokens": "1ddcb86d28cde266" + }, + "truncated": 0, + "non-truncated": 1240, + "padded": 1240, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hashes": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "831ff35c474e5cef", + "hash_cont_tokens": "176c8dcff38c5f8f" + }, + "truncated": 0, + "non-truncated": 812, + "padded": 812, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hashes": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "a20a96b44dcc5b30", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + 
"effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hashes": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "5002f4ac8b1562ca", + "hash_cont_tokens": "674fc454bdc5ac93" + }, + "truncated": 0, + "non-truncated": 660, + "padded": 656, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_geography|5": { + "hashes": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "7c5547c7da5bc793", + "hash_cont_tokens": "03a5012b916274ea" + }, + "truncated": 0, + "non-truncated": 792, + "padded": 792, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hashes": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "f62991cb6a496b05", + "hash_cont_tokens": "873d2aab226ba1d8" + }, + "truncated": 0, + "non-truncated": 772, + "padded": 772, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hashes": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "4cef2aff6e3d59ed", + "hash_cont_tokens": "c583432ad27fcfe0" + }, + "truncated": 0, + "non-truncated": 1560, + "padded": 1560, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hashes": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "6e2577ea4082ed2b", + "hash_cont_tokens": "d7907b61bcb8c123" + }, + "truncated": 0, + "non-truncated": 1080, + "padded": 1080, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hashes": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "c5fc9aeb1079c8e4", + "hash_cont_tokens": "f47f041de50333b9" + }, + "truncated": 0, + "non-truncated": 952, + "padded": 952, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_physics|5": { + "hashes": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "555fc385cffa84ca", + "hash_cont_tokens": "0d56317b3e5eedb5" + }, + "truncated": 0, + "non-truncated": 604, + "padded": 604, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hashes": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "febd23cbf9973b7f", + "hash_cont_tokens": "09ba1243e7390c0f" + }, + "truncated": 0, + "non-truncated": 2180, + "padded": 2180, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hashes": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "400e55b56ee6fbd7", + "hash_cont_tokens": "9cc29889c3d3f77d" + }, + "truncated": 0, + "non-truncated": 864, + "padded": 864, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + 
"harness|hendrycksTest-high_school_us_history|5": { + "hashes": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "c639cce12a46ebad", + "hash_cont_tokens": "cdd0b3dc06d933e5" + }, + "truncated": 0, + "non-truncated": 816, + "padded": 816, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hashes": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "b9762065cce6f3a6", + "hash_cont_tokens": "e02816433ff28daf" + }, + "truncated": 0, + "non-truncated": 948, + "padded": 948, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_aging|5": { + "hashes": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "541a75f071dcf579", + "hash_cont_tokens": "142a4a8a1138a214" + }, + "truncated": 0, + "non-truncated": 892, + "padded": 892, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_sexuality|5": { + "hashes": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "04269e5c5a257dd9", + "hash_cont_tokens": "bc54813e809b796d" + }, + "truncated": 0, + "non-truncated": 524, + "padded": 524, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-international_law|5": { + "hashes": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "d93ba9d9d38e4397", + "hash_cont_tokens": "8ea8c5ff76a15bca" + }, + "truncated": 0, + "non-truncated": 484, + "padded": 484, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-jurisprudence|5": { + "hashes": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "9eeaccd2698b4f5a", + "hash_cont_tokens": "e3a8cd951b6e3469" + }, + "truncated": 0, + "non-truncated": 432, + "padded": 432, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hashes": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "b4f08f544f2b7576", + "hash_cont_tokens": "3e9e0bdc248fd88a" + }, + "truncated": 0, + "non-truncated": 652, + "padded": 648, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-machine_learning|5": { + "hashes": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "900c2a51f1174b9f", + "hash_cont_tokens": "55b12fb138c6a064" + }, + "truncated": 0, + "non-truncated": 448, + "padded": 448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-management|5": { + "hashes": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "6b36efb4689c6eca", + "hash_cont_tokens": "a01d6d39a83c4597" + }, + "truncated": 0, + "non-truncated": 412, + "padded": 412, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-marketing|5": { + "hashes": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": 
"2aaac78a0cfed47a", + "hash_cont_tokens": "6aeaed4d823c98aa" + }, + "truncated": 0, + "non-truncated": 936, + "padded": 936, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-medical_genetics|5": { + "hashes": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "886ca823b41c094a", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-miscellaneous|5": { + "hashes": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "72fd71de7675e7d0", + "hash_cont_tokens": "9b0ab02a64603081" + }, + "truncated": 0, + "non-truncated": 3132, + "padded": 3132, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_disputes|5": { + "hashes": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "f3ca0dd8e7a1eb09", + "hash_cont_tokens": "3b8bbe9108e55ce9" + }, + "truncated": 0, + "non-truncated": 1384, + "padded": 1354, + "non-padded": 30, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hashes": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "3e793631e951f23c", + "hash_cont_tokens": "3e9bfc0362e97330" + }, + "truncated": 0, + "non-truncated": 3580, + "padded": 3580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-nutrition|5": { + "hashes": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "59753c2144ea93af", + "hash_cont_tokens": "23b2dc6ee2da4cfc" + }, + "truncated": 0, + "non-truncated": 1224, + "padded": 1224, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-philosophy|5": { + "hashes": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "bd8d3dbed15a8c34", + "hash_cont_tokens": "9f6ff69d23a48783" + }, + "truncated": 0, + "non-truncated": 1244, + "padded": 1244, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-prehistory|5": { + "hashes": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "3573cd87facbb7c5", + "hash_cont_tokens": "d6458d743d875837" + }, + "truncated": 0, + "non-truncated": 1296, + "padded": 1296, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_accounting|5": { + "hashes": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "17e721bc1a7cbb47", + "hash_cont_tokens": "922a195f53a35662" + }, + "truncated": 0, + "non-truncated": 1128, + "padded": 1128, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_law|5": { + "hashes": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "c9f7583fff66d361", + "hash_cont_tokens": "2e590029ef41fbcd" + }, + "truncated": 0, + "non-truncated": 6136, + "padded": 6136, + "non-padded": 0, + 
"effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_medicine|5": { + "hashes": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "40a933f829116f8d", + "hash_cont_tokens": "7cfee54dbddd5a98" + }, + "truncated": 0, + "non-truncated": 1088, + "padded": 1088, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_psychology|5": { + "hashes": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "0dfb73a8eb3f692c", + "hash_cont_tokens": "a86677b2a45c20e1" + }, + "truncated": 0, + "non-truncated": 2448, + "padded": 2448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-public_relations|5": { + "hashes": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "1710c6ba4c9f3cbd", + "hash_cont_tokens": "0d756ccaae031757" + }, + "truncated": 0, + "non-truncated": 440, + "padded": 440, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-security_studies|5": { + "hashes": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "32a03f1f22a6e103", + "hash_cont_tokens": "b2229bc2cfbf594b" + }, + "truncated": 0, + "non-truncated": 980, + "padded": 980, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-sociology|5": { + "hashes": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "828999f7624cbe7e", + "hash_cont_tokens": "c3a3bdfd177eed5b" + }, + "truncated": 0, + "non-truncated": 804, + "padded": 804, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hashes": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "42054621e718dbee", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-virology|5": { + "hashes": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "6c4f0aa4dc859c04", + "hash_cont_tokens": "af8b3658088cb37f" + }, + "truncated": 0, + "non-truncated": 664, + "padded": 664, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-world_religions|5": { + "hashes": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "6c75d44e092ff24f", + "hash_cont_tokens": "060118bef6de4e0a" + }, + "truncated": 0, + "non-truncated": 684, + "padded": 684, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|truthfulqa:mc|0": { + "hashes": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "2738d7ed7075faa7", + "hash_cont_tokens": "f5da56a132aab151" + }, + "truncated": 0, + "non-truncated": 9996, + "padded": 9996, + "non-padded": 0, + "effective_few_shots": 0.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "d84d18e9a963753d", + "hash_full_prompts": 
"12b540783521a8e6", + "hash_input_tokens": "5c73a7dce6ccf737", + "hash_cont_tokens": "71d56183130fecbd" + }, + "total_evaluation_time_secondes": "6360.987108945847", + "truncated": 0, + "non-truncated": 111019, + "padded": 110926, + "non-padded": 93, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/uukuguy/speechless-llama2-hermes-orca-platypus-wizardlm-13b/results_2023-10-15T13-11-43.680043.json b/eval-results/uukuguy/speechless-llama2-hermes-orca-platypus-wizardlm-13b/results_2023-10-15T13-11-43.680043.json new file mode 100644 index 0000000000000000000000000000000000000000..1d3b07af160720871dab6bb74266bbce19143cc8 --- /dev/null +++ b/eval-results/uukuguy/speechless-llama2-hermes-orca-platypus-wizardlm-13b/results_2023-10-15T13-11-43.680043.json @@ -0,0 +1,107 @@ +{ + "config_general": { + "model_name": "uukuguy/speechless-llama2-hermes-orca-platypus-wizardlm-13b", + "model_sha": "3cd8fe05b53db21d3f1c07cfc04061f14c323b31", + "model_size": "24.32 GB", + "model_dtype": "torch.bfloat16", + "lighteval_sha": "0f318ecf002208468154899217b3ba7c6ae09374", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|drop|3": { + "em": 0.057466442953020135, + "em_stderr": 0.0023833905882384896, + "f1": 0.17808829697986514, + "f1_stderr": 0.002972308703760267 + }, + "harness|gsm8k|5": { + "acc": 0.13115996967399546, + "acc_stderr": 0.009298499235587858 + }, + "harness|winogrande|5": { + "acc": 0.7537490134175217, + "acc_stderr": 0.012108365307437531 + }, + "all": { + "em": 0.057466442953020135, + "em_stderr": 0.0023833905882384896, + "f1": 0.17808829697986514, + "f1_stderr": 0.002972308703760267, + "acc": 0.44245449154575855, + "acc_stderr": 0.010703432271512695 + } + }, + "versions": { + "harness|drop|3": 1, + "harness|gsm8k|5": 0, + "harness|winogrande|5": 0, + "all": 0 + }, + "config_tasks": { + "harness|drop": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|drop|3": { + "hashes": { + "hash_examples": "1d27416e8324e9a3", + "hash_full_prompts": "a5513ff9a741b385", + "hash_input_tokens": "42076f0efbb50aa6", + "hash_cont_tokens": "ab06ac95c741c2d3" + }, + "truncated": 3, + "non-truncated": 9533, + "padded": 0, + "non-padded": 9536, + "effective_few_shots": 3.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "bda342e47b5099b2", + "hash_cont_tokens": "85ecc16feafb73ba" + }, + "truncated": 0, + "non-truncated": 1319, + "padded": 0, + "non-padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "c0bedf98cb040854", + "hash_cont_tokens": "f08975ad6f2d5864" + }, + "truncated": 0, + "non-truncated": 2534, + "padded": 2432, + "non-padded": 102, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "9b4d8993161e637d", + "hash_full_prompts": "08215e527b7e60a5", + "hash_input_tokens": "a12f3e3c934bd78b", + "hash_cont_tokens": "c2fb9c87681606c6" + }, + "total_evaluation_time_secondes": "11715.848466873169", + "truncated": 3, + "non-truncated": 13386, + "padded": 2432, + "non-padded": 10957, + "num_truncated_few_shots": 0 + } 
+} \ No newline at end of file diff --git a/eval-results/uukuguy/speechless-llama2-luban-orca-platypus-13b/results_2023-09-01T05-54-43.169153.json b/eval-results/uukuguy/speechless-llama2-luban-orca-platypus-13b/results_2023-09-01T05-54-43.169153.json new file mode 100644 index 0000000000000000000000000000000000000000..c67cc5a60459375ad67bfa036322e4695750f660 --- /dev/null +++ b/eval-results/uukuguy/speechless-llama2-luban-orca-platypus-13b/results_2023-09-01T05-54-43.169153.json @@ -0,0 +1,1366 @@ +{ + "config_general": { + "model_name": "uukuguy/speechless-llama2-luban-orca-platypus-13b", + "model_sha": "908cfb670611875b52045c4bab81cff53f0279a7", + "model_dtype": "torch.bfloat16", + "lighteval_sha": "4b9a38c2102259daeb17d1d0294fc2b4ce7f4e63", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|arc:challenge|25": { + "acc": 0.5947098976109215, + "acc_stderr": 0.014346869060229328, + "acc_norm": 0.6254266211604096, + "acc_norm_stderr": 0.014144193471893446 + }, + "harness|hellaswag|10": { + "acc": 0.6261700856403107, + "acc_stderr": 0.0048283050419044024, + "acc_norm": 0.8276239792869946, + "acc_norm_stderr": 0.003769350079195899 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.27, + "acc_stderr": 0.044619604333847394, + "acc_norm": 0.27, + "acc_norm_stderr": 0.044619604333847394 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.5333333333333333, + "acc_stderr": 0.043097329010363554, + "acc_norm": 0.5333333333333333, + "acc_norm_stderr": 0.043097329010363554 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.6118421052631579, + "acc_stderr": 0.03965842097512744, + "acc_norm": 0.6118421052631579, + "acc_norm_stderr": 0.03965842097512744 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.57, + "acc_stderr": 0.049756985195624284, + "acc_norm": 0.57, + "acc_norm_stderr": 0.049756985195624284 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.6037735849056604, + "acc_stderr": 0.030102793781791197, + "acc_norm": 0.6037735849056604, + "acc_norm_stderr": 0.030102793781791197 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.6458333333333334, + "acc_stderr": 0.039994111357535424, + "acc_norm": 0.6458333333333334, + "acc_norm_stderr": 0.039994111357535424 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.43, + "acc_stderr": 0.04975698519562428, + "acc_norm": 0.43, + "acc_norm_stderr": 0.04975698519562428 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.52, + "acc_stderr": 0.050211673156867795, + "acc_norm": 0.52, + "acc_norm_stderr": 0.050211673156867795 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.34, + "acc_stderr": 0.04760952285695235, + "acc_norm": 0.34, + "acc_norm_stderr": 0.04760952285695235 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.5953757225433526, + "acc_stderr": 0.03742461193887248, + "acc_norm": 0.5953757225433526, + "acc_norm_stderr": 0.03742461193887248 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.30392156862745096, + "acc_stderr": 0.045766654032077615, + "acc_norm": 0.30392156862745096, + "acc_norm_stderr": 0.045766654032077615 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.69, + "acc_stderr": 0.04648231987117316, + "acc_norm": 0.69, + "acc_norm_stderr": 0.04648231987117316 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.5063829787234042, + "acc_stderr": 0.03268335899936337, + "acc_norm": 
0.5063829787234042, + "acc_norm_stderr": 0.03268335899936337 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.3333333333333333, + "acc_stderr": 0.044346007015849245, + "acc_norm": 0.3333333333333333, + "acc_norm_stderr": 0.044346007015849245 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.5517241379310345, + "acc_stderr": 0.04144311810878152, + "acc_norm": 0.5517241379310345, + "acc_norm_stderr": 0.04144311810878152 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.3412698412698413, + "acc_stderr": 0.024419234966819064, + "acc_norm": 0.3412698412698413, + "acc_norm_stderr": 0.024419234966819064 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.373015873015873, + "acc_stderr": 0.04325506042017087, + "acc_norm": 0.373015873015873, + "acc_norm_stderr": 0.04325506042017087 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.36, + "acc_stderr": 0.048241815132442176, + "acc_norm": 0.36, + "acc_norm_stderr": 0.048241815132442176 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.6741935483870968, + "acc_stderr": 0.026662010578567104, + "acc_norm": 0.6741935483870968, + "acc_norm_stderr": 0.026662010578567104 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.47783251231527096, + "acc_stderr": 0.03514528562175008, + "acc_norm": 0.47783251231527096, + "acc_norm_stderr": 0.03514528562175008 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.58, + "acc_stderr": 0.049604496374885836, + "acc_norm": 0.58, + "acc_norm_stderr": 0.049604496374885836 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.7454545454545455, + "acc_stderr": 0.03401506715249039, + "acc_norm": 0.7454545454545455, + "acc_norm_stderr": 0.03401506715249039 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.7575757575757576, + "acc_stderr": 0.030532892233932026, + "acc_norm": 0.7575757575757576, + "acc_norm_stderr": 0.030532892233932026 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.8652849740932642, + "acc_stderr": 0.024639789097709443, + "acc_norm": 0.8652849740932642, + "acc_norm_stderr": 0.024639789097709443 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.6205128205128205, + "acc_stderr": 0.024603626924097417, + "acc_norm": 0.6205128205128205, + "acc_norm_stderr": 0.024603626924097417 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.32592592592592595, + "acc_stderr": 0.028578348365473065, + "acc_norm": 0.32592592592592595, + "acc_norm_stderr": 0.028578348365473065 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.6218487394957983, + "acc_stderr": 0.031499305777849054, + "acc_norm": 0.6218487394957983, + "acc_norm_stderr": 0.031499305777849054 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.304635761589404, + "acc_stderr": 0.03757949922943342, + "acc_norm": 0.304635761589404, + "acc_norm_stderr": 0.03757949922943342 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.8, + "acc_stderr": 0.017149858514250965, + "acc_norm": 0.8, + "acc_norm_stderr": 0.017149858514250965 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.4166666666666667, + "acc_stderr": 0.03362277436608044, + "acc_norm": 0.4166666666666667, + "acc_norm_stderr": 0.03362277436608044 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.8480392156862745, + "acc_stderr": 0.025195658428931792, + "acc_norm": 0.8480392156862745, + 
"acc_norm_stderr": 0.025195658428931792 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.7848101265822784, + "acc_stderr": 0.02675082699467617, + "acc_norm": 0.7848101265822784, + "acc_norm_stderr": 0.02675082699467617 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.6681614349775785, + "acc_stderr": 0.03160295143776678, + "acc_norm": 0.6681614349775785, + "acc_norm_stderr": 0.03160295143776678 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.7022900763358778, + "acc_stderr": 0.04010358942462203, + "acc_norm": 0.7022900763358778, + "acc_norm_stderr": 0.04010358942462203 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.7355371900826446, + "acc_stderr": 0.04026187527591206, + "acc_norm": 0.7355371900826446, + "acc_norm_stderr": 0.04026187527591206 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.7685185185185185, + "acc_stderr": 0.04077494709252627, + "acc_norm": 0.7685185185185185, + "acc_norm_stderr": 0.04077494709252627 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.6871165644171779, + "acc_stderr": 0.03642914578292406, + "acc_norm": 0.6871165644171779, + "acc_norm_stderr": 0.03642914578292406 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.4107142857142857, + "acc_stderr": 0.04669510663875191, + "acc_norm": 0.4107142857142857, + "acc_norm_stderr": 0.04669510663875191 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.7281553398058253, + "acc_stderr": 0.044052680241409216, + "acc_norm": 0.7281553398058253, + "acc_norm_stderr": 0.044052680241409216 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.8290598290598291, + "acc_stderr": 0.024662496845209825, + "acc_norm": 0.8290598290598291, + "acc_norm_stderr": 0.024662496845209825 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.63, + "acc_stderr": 0.048523658709391, + "acc_norm": 0.63, + "acc_norm_stderr": 0.048523658709391 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.7905491698595147, + "acc_stderr": 0.014551310568143705, + "acc_norm": 0.7905491698595147, + "acc_norm_stderr": 0.014551310568143705 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.6445086705202312, + "acc_stderr": 0.025770292082977254, + "acc_norm": 0.6445086705202312, + "acc_norm_stderr": 0.025770292082977254 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.43910614525139663, + "acc_stderr": 0.016598022120580425, + "acc_norm": 0.43910614525139663, + "acc_norm_stderr": 0.016598022120580425 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.673202614379085, + "acc_stderr": 0.026857294663281413, + "acc_norm": 0.673202614379085, + "acc_norm_stderr": 0.026857294663281413 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.6816720257234726, + "acc_stderr": 0.02645722506781103, + "acc_norm": 0.6816720257234726, + "acc_norm_stderr": 0.02645722506781103 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.691358024691358, + "acc_stderr": 0.025702640260603753, + "acc_norm": 0.691358024691358, + "acc_norm_stderr": 0.025702640260603753 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.4787234042553192, + "acc_stderr": 0.029800481645628693, + "acc_norm": 0.4787234042553192, + "acc_norm_stderr": 0.029800481645628693 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.4602346805736636, + "acc_stderr": 0.01272978538659856, + "acc_norm": 0.4602346805736636, + "acc_norm_stderr": 0.01272978538659856 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 
0.6029411764705882, + "acc_stderr": 0.02972215209928007, + "acc_norm": 0.6029411764705882, + "acc_norm_stderr": 0.02972215209928007 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.5735294117647058, + "acc_stderr": 0.020007912739359365, + "acc_norm": 0.5735294117647058, + "acc_norm_stderr": 0.020007912739359365 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.6545454545454545, + "acc_stderr": 0.04554619617541054, + "acc_norm": 0.6545454545454545, + "acc_norm_stderr": 0.04554619617541054 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.6653061224489796, + "acc_stderr": 0.030209235226242307, + "acc_norm": 0.6653061224489796, + "acc_norm_stderr": 0.030209235226242307 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.7711442786069652, + "acc_stderr": 0.029705284056772436, + "acc_norm": 0.7711442786069652, + "acc_norm_stderr": 0.029705284056772436 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.79, + "acc_stderr": 0.040936018074033256, + "acc_norm": 0.79, + "acc_norm_stderr": 0.040936018074033256 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.4879518072289157, + "acc_stderr": 0.03891364495835821, + "acc_norm": 0.4879518072289157, + "acc_norm_stderr": 0.03891364495835821 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.7953216374269005, + "acc_stderr": 0.03094445977853321, + "acc_norm": 0.7953216374269005, + "acc_norm_stderr": 0.03094445977853321 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.3953488372093023, + "mc1_stderr": 0.017115815632418197, + "mc2": 0.5466373720355463, + "mc2_stderr": 0.015666286200823117 + }, + "all": { + "acc": 0.5929341398724475, + "acc_stderr": 0.03391834960107917, + "acc_norm": 0.5968692350791623, + "acc_norm_stderr": 0.0338969660324174, + "mc1": 0.3953488372093023, + "mc1_stderr": 0.017115815632418197, + "mc2": 0.5466373720355463, + "mc2_stderr": 0.015666286200823117 + } + }, + "versions": { + "harness|arc:challenge|25": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + 
"harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "all": 0 + }, + "config_tasks": { + "harness|arc:challenge": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness 
task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task" + }, + "summary_tasks": { + "harness|arc:challenge|25": { + "hashes": { + "hash_examples": "17b0cae357c0259e", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "3722289b79076c44", + "hash_cont_tokens": "8210decc6ff6f7df" + }, + "truncated": 0, + "non-truncated": 4687, + "padded": 4687, + "non-padded": 0, + "effective_few_shots": 25.0, + "num_truncated_few_shots": 0 + }, + "harness|hellaswag|10": { + "hashes": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "ececd684171f1ef2", + "hash_cont_tokens": "b3b9e9017afa63af" + }, + "truncated": 0, + "non-truncated": 40168, + "padded": 40113, + "non-padded": 55, + "effective_few_shots": 10.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hashes": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "c54ff61ad0273dd7", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-anatomy|5": { + "hashes": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "be31a1e22aef5f90", + "hash_cont_tokens": "f11971a765cb609f" + }, + "truncated": 0, + "non-truncated": 540, + "padded": 540, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-astronomy|5": { + "hashes": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "277a7b1fad566940", + "hash_cont_tokens": "bf30e5d3f48250cb" + }, + "truncated": 0, + "non-truncated": 608, + "padded": 608, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + 
"harness|hendrycksTest-business_ethics|5": { + "hashes": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "ba552605bc116de5", + "hash_cont_tokens": "bc1dd9b2d995eb61" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hashes": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "428c7563d0b98ab9", + "hash_cont_tokens": "890a119624b3b935" + }, + "truncated": 0, + "non-truncated": 1060, + "padded": 1060, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_biology|5": { + "hashes": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "da036601573942e2", + "hash_cont_tokens": "875cde3af7a0ee14" + }, + "truncated": 0, + "non-truncated": 576, + "padded": 576, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_chemistry|5": { + "hashes": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "94e0196d6aded13d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_computer_science|5": { + "hashes": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "6e4d0f4a8d36690b", + "hash_cont_tokens": "ffc0fe414cdc4a83" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_mathematics|5": { + "hashes": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "614054d17109a25d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_medicine|5": { + "hashes": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "081bb2b524defd1c", + "hash_cont_tokens": "1f88b00d41957d82" + }, + "truncated": 0, + "non-truncated": 692, + "padded": 692, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_physics|5": { + "hashes": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "5421d9a1af86cbd4", + "hash_cont_tokens": "f7b8097afc16a47c" + }, + "truncated": 0, + "non-truncated": 408, + "padded": 408, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-computer_security|5": { + "hashes": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "5e6b70ecb333cf18", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hashes": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + 
"hash_input_tokens": "c2ef11a87264ceed", + "hash_cont_tokens": "aa0e8bc655f2f641" + }, + "truncated": 0, + "non-truncated": 940, + "padded": 940, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-econometrics|5": { + "hashes": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "ecaccd912a4c3978", + "hash_cont_tokens": "bfb7e3c3c88313f1" + }, + "truncated": 0, + "non-truncated": 456, + "padded": 456, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hashes": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "1590c84291399be8", + "hash_cont_tokens": "2425a3f084a591ef" + }, + "truncated": 0, + "non-truncated": 580, + "padded": 580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hashes": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "3269597f715b0da1", + "hash_cont_tokens": "f52691aef15a407b" + }, + "truncated": 0, + "non-truncated": 1512, + "padded": 1512, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-formal_logic|5": { + "hashes": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "a2800d20f3ab8d7c", + "hash_cont_tokens": "f515d598d9c21263" + }, + "truncated": 0, + "non-truncated": 504, + "padded": 504, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-global_facts|5": { + "hashes": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "94ed44b3772505ad", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_biology|5": { + "hashes": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "24423acb928db768", + "hash_cont_tokens": "bd85a4156a3613ee" + }, + "truncated": 0, + "non-truncated": 1240, + "padded": 1240, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hashes": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "831ff35c474e5cef", + "hash_cont_tokens": "a95c97af1c14e068" + }, + "truncated": 0, + "non-truncated": 812, + "padded": 812, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hashes": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "a20a96b44dcc5b30", + "hash_cont_tokens": "8abfedef914e33c9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hashes": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "5002f4ac8b1562ca", + "hash_cont_tokens": "674fc454bdc5ac93" + }, + "truncated": 0, + "non-truncated": 
660, + "padded": 656, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_geography|5": { + "hashes": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "7c5547c7da5bc793", + "hash_cont_tokens": "03a5012b916274ea" + }, + "truncated": 0, + "non-truncated": 792, + "padded": 792, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hashes": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "f62991cb6a496b05", + "hash_cont_tokens": "a83effb8f76b7d7c" + }, + "truncated": 0, + "non-truncated": 772, + "padded": 772, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hashes": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "4cef2aff6e3d59ed", + "hash_cont_tokens": "c583432ad27fcfe0" + }, + "truncated": 0, + "non-truncated": 1560, + "padded": 1560, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hashes": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "6e2577ea4082ed2b", + "hash_cont_tokens": "24f5dc613660300b" + }, + "truncated": 0, + "non-truncated": 1080, + "padded": 1080, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hashes": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "c5fc9aeb1079c8e4", + "hash_cont_tokens": "f47f041de50333b9" + }, + "truncated": 0, + "non-truncated": 952, + "padded": 952, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_physics|5": { + "hashes": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "555fc385cffa84ca", + "hash_cont_tokens": "ba2efcd283e938cc" + }, + "truncated": 0, + "non-truncated": 604, + "padded": 604, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hashes": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "febd23cbf9973b7f", + "hash_cont_tokens": "942069cd363844d9" + }, + "truncated": 0, + "non-truncated": 2180, + "padded": 2180, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hashes": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "400e55b56ee6fbd7", + "hash_cont_tokens": "955ed42b6f7fa019" + }, + "truncated": 0, + "non-truncated": 864, + "padded": 864, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hashes": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "c639cce12a46ebad", + "hash_cont_tokens": "cdd0b3dc06d933e5" + }, + "truncated": 0, + "non-truncated": 816, + "padded": 816, + "non-padded": 0, + "effective_few_shots": 5.0, + 
"num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hashes": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "b9762065cce6f3a6", + "hash_cont_tokens": "9a864184946033ac" + }, + "truncated": 0, + "non-truncated": 948, + "padded": 948, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_aging|5": { + "hashes": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "541a75f071dcf579", + "hash_cont_tokens": "142a4a8a1138a214" + }, + "truncated": 0, + "non-truncated": 892, + "padded": 892, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_sexuality|5": { + "hashes": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "04269e5c5a257dd9", + "hash_cont_tokens": "bc54813e809b796d" + }, + "truncated": 0, + "non-truncated": 524, + "padded": 524, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-international_law|5": { + "hashes": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "d93ba9d9d38e4397", + "hash_cont_tokens": "dc45b45fcda18e5d" + }, + "truncated": 0, + "non-truncated": 484, + "padded": 484, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-jurisprudence|5": { + "hashes": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "9eeaccd2698b4f5a", + "hash_cont_tokens": "e3a8cd951b6e3469" + }, + "truncated": 0, + "non-truncated": 432, + "padded": 432, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hashes": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "b4f08f544f2b7576", + "hash_cont_tokens": "1e80dbd30f6453d5" + }, + "truncated": 0, + "non-truncated": 652, + "padded": 648, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-machine_learning|5": { + "hashes": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "900c2a51f1174b9f", + "hash_cont_tokens": "9b37da7777378ca9" + }, + "truncated": 0, + "non-truncated": 448, + "padded": 448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-management|5": { + "hashes": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "6b36efb4689c6eca", + "hash_cont_tokens": "a01d6d39a83c4597" + }, + "truncated": 0, + "non-truncated": 412, + "padded": 412, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-marketing|5": { + "hashes": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "2aaac78a0cfed47a", + "hash_cont_tokens": "6aeaed4d823c98aa" + }, + "truncated": 0, + "non-truncated": 936, + "padded": 936, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-medical_genetics|5": { + "hashes": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": 
"202139046daa118f", + "hash_input_tokens": "886ca823b41c094a", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-miscellaneous|5": { + "hashes": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "72fd71de7675e7d0", + "hash_cont_tokens": "9b0ab02a64603081" + }, + "truncated": 0, + "non-truncated": 3132, + "padded": 3132, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_disputes|5": { + "hashes": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "f3ca0dd8e7a1eb09", + "hash_cont_tokens": "8badf768f7b0467a" + }, + "truncated": 0, + "non-truncated": 1384, + "padded": 1354, + "non-padded": 30, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hashes": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "3e793631e951f23c", + "hash_cont_tokens": "32ae620376b2bbba" + }, + "truncated": 0, + "non-truncated": 3580, + "padded": 3580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-nutrition|5": { + "hashes": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "59753c2144ea93af", + "hash_cont_tokens": "3071def75bacc404" + }, + "truncated": 0, + "non-truncated": 1224, + "padded": 1224, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-philosophy|5": { + "hashes": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "bd8d3dbed15a8c34", + "hash_cont_tokens": "9f6ff69d23a48783" + }, + "truncated": 0, + "non-truncated": 1244, + "padded": 1244, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-prehistory|5": { + "hashes": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "3573cd87facbb7c5", + "hash_cont_tokens": "de469d2b981e32a3" + }, + "truncated": 0, + "non-truncated": 1296, + "padded": 1296, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_accounting|5": { + "hashes": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "17e721bc1a7cbb47", + "hash_cont_tokens": "c46f74d2dfc7b13b" + }, + "truncated": 0, + "non-truncated": 1128, + "padded": 1128, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_law|5": { + "hashes": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "c9f7583fff66d361", + "hash_cont_tokens": "2e590029ef41fbcd" + }, + "truncated": 0, + "non-truncated": 6136, + "padded": 6136, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_medicine|5": { + "hashes": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "40a933f829116f8d", + "hash_cont_tokens": "fe35cfa9c6ca802e" + }, + "truncated": 0, + "non-truncated": 1088, + 
"padded": 1088, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_psychology|5": { + "hashes": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "0dfb73a8eb3f692c", + "hash_cont_tokens": "f020fbddf72c8652" + }, + "truncated": 0, + "non-truncated": 2448, + "padded": 2448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-public_relations|5": { + "hashes": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "1710c6ba4c9f3cbd", + "hash_cont_tokens": "568f585a259965c1" + }, + "truncated": 0, + "non-truncated": 440, + "padded": 440, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-security_studies|5": { + "hashes": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "32a03f1f22a6e103", + "hash_cont_tokens": "cc6fd7cccd64cd5d" + }, + "truncated": 0, + "non-truncated": 980, + "padded": 980, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-sociology|5": { + "hashes": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "828999f7624cbe7e", + "hash_cont_tokens": "c3a3bdfd177eed5b" + }, + "truncated": 0, + "non-truncated": 804, + "padded": 804, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hashes": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "42054621e718dbee", + "hash_cont_tokens": "2568d0e8e36fa959" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-virology|5": { + "hashes": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "6c4f0aa4dc859c04", + "hash_cont_tokens": "926cf60b0891f374" + }, + "truncated": 0, + "non-truncated": 664, + "padded": 664, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-world_religions|5": { + "hashes": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "6c75d44e092ff24f", + "hash_cont_tokens": "c525a5de974c1ea3" + }, + "truncated": 0, + "non-truncated": 684, + "padded": 684, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|truthfulqa:mc|0": { + "hashes": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "2738d7ed7075faa7", + "hash_cont_tokens": "c014154380b74b9e" + }, + "truncated": 0, + "non-truncated": 9996, + "padded": 9996, + "non-padded": 0, + "effective_few_shots": 0.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "d84d18e9a963753d", + "hash_full_prompts": "12b540783521a8e6", + "hash_input_tokens": "5c73a7dce6ccf737", + "hash_cont_tokens": "fb1646e2bdd5fc38" + }, + "total_evaluation_time_secondes": "6827.424413204193", + "truncated": 0, + "non-truncated": 111019, + "padded": 110926, + "non-padded": 93, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git 
a/eval-results/uukuguy/speechless-llama2-luban-orca-platypus-13b/results_2023-10-16T17-51-55.747438.json b/eval-results/uukuguy/speechless-llama2-luban-orca-platypus-13b/results_2023-10-16T17-51-55.747438.json new file mode 100644 index 0000000000000000000000000000000000000000..4eca0f0e9437d549d77dcbefc825e276cceeb379 --- /dev/null +++ b/eval-results/uukuguy/speechless-llama2-luban-orca-platypus-13b/results_2023-10-16T17-51-55.747438.json @@ -0,0 +1,107 @@ +{ + "config_general": { + "model_name": "uukuguy/speechless-llama2-luban-orca-platypus-13b", + "model_sha": "f043fe517c31f029a90755d259b545e9c675ebb7", + "model_size": "24.32 GB", + "model_dtype": "torch.bfloat16", + "lighteval_sha": "0f318ecf002208468154899217b3ba7c6ae09374", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|drop|3": { + "em": 0.006921140939597316, + "em_stderr": 0.0008490247804930292, + "f1": 0.11193687080536992, + "f1_stderr": 0.0020523308364626394 + }, + "harness|gsm8k|5": { + "acc": 0.08188021228203184, + "acc_stderr": 0.007552338527716947 + }, + "harness|winogrande|5": { + "acc": 0.771112865035517, + "acc_stderr": 0.011807360224025388 + }, + "all": { + "em": 0.006921140939597316, + "em_stderr": 0.0008490247804930292, + "f1": 0.11193687080536992, + "f1_stderr": 0.0020523308364626394, + "acc": 0.4264965386587744, + "acc_stderr": 0.009679849375871168 + } + }, + "versions": { + "harness|drop|3": 1, + "harness|gsm8k|5": 0, + "harness|winogrande|5": 0, + "all": 0 + }, + "config_tasks": { + "harness|drop": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|drop|3": { + "hashes": { + "hash_examples": "1d27416e8324e9a3", + "hash_full_prompts": "a5513ff9a741b385", + "hash_input_tokens": "42076f0efbb50aa6", + "hash_cont_tokens": "2442c141b93837bb" + }, + "truncated": 3, + "non-truncated": 9533, + "padded": 0, + "non-padded": 9536, + "effective_few_shots": 3.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "bda342e47b5099b2", + "hash_cont_tokens": "f036ae8176a47a84" + }, + "truncated": 0, + "non-truncated": 1319, + "padded": 0, + "non-padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "c0bedf98cb040854", + "hash_cont_tokens": "f08975ad6f2d5864" + }, + "truncated": 0, + "non-truncated": 2534, + "padded": 2432, + "non-padded": 102, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "9b4d8993161e637d", + "hash_full_prompts": "08215e527b7e60a5", + "hash_input_tokens": "a12f3e3c934bd78b", + "hash_cont_tokens": "50832a331cfd7698" + }, + "total_evaluation_time_secondes": "11842.130831480026", + "truncated": 3, + "non-truncated": 13386, + "padded": 2432, + "non-padded": 10957, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/uukuguy/speechless-mistral-7b-dare-0.85/results_2023-11-28T06-35-43.607271.json b/eval-results/uukuguy/speechless-mistral-7b-dare-0.85/results_2023-11-28T06-35-43.607271.json new file mode 100644 index 0000000000000000000000000000000000000000..3b08cc5eb4b1f83eb677f0a6a7366d3ffddf4758 --- /dev/null +++ 
b/eval-results/uukuguy/speechless-mistral-7b-dare-0.85/results_2023-11-28T06-35-43.607271.json @@ -0,0 +1,1435 @@ +{ + "config_general": { + "lighteval_sha": "9ffc410f6c40b8cfefe7167cb47aefe69ced61e1", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "", + "start_time": 977216.77365809, + "end_time": 992439.426280606, + "total_evaluation_time_secondes": "15222.652622515918", + "model_name": "uukuguy/speechless-mistral-7b-dare-0.85", + "model_sha": "b19e60f64b3be7f41658958658658bc12038c68f", + "model_dtype": "torch.bfloat16", + "model_size": "13.99 GB" + }, + "results": { + "harness|arc:challenge|25": { + "acc": 0.6040955631399317, + "acc_stderr": 0.014291228393536588, + "acc_norm": 0.6356655290102389, + "acc_norm_stderr": 0.014063260279882417 + }, + "harness|hellaswag|10": { + "acc": 0.6522605058753237, + "acc_stderr": 0.004752794829825043, + "acc_norm": 0.8482374029077873, + "acc_norm_stderr": 0.003580573563373656 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.32, + "acc_stderr": 0.046882617226215034, + "acc_norm": 0.32, + "acc_norm_stderr": 0.046882617226215034 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.6148148148148148, + "acc_stderr": 0.04203921040156279, + "acc_norm": 0.6148148148148148, + "acc_norm_stderr": 0.04203921040156279 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.6842105263157895, + "acc_stderr": 0.03782728980865469, + "acc_norm": 0.6842105263157895, + "acc_norm_stderr": 0.03782728980865469 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.59, + "acc_stderr": 0.049431107042371025, + "acc_norm": 0.59, + "acc_norm_stderr": 0.049431107042371025 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.6943396226415094, + "acc_stderr": 0.028353298073322666, + "acc_norm": 0.6943396226415094, + "acc_norm_stderr": 0.028353298073322666 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.7430555555555556, + "acc_stderr": 0.03653946969442099, + "acc_norm": 0.7430555555555556, + "acc_norm_stderr": 0.03653946969442099 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.47, + "acc_stderr": 0.05016135580465919, + "acc_norm": 0.47, + "acc_norm_stderr": 0.05016135580465919 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.55, + "acc_stderr": 0.05, + "acc_norm": 0.55, + "acc_norm_stderr": 0.05 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.35, + "acc_stderr": 0.047937248544110196, + "acc_norm": 0.35, + "acc_norm_stderr": 0.047937248544110196 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.653179190751445, + "acc_stderr": 0.036291466701596636, + "acc_norm": 0.653179190751445, + "acc_norm_stderr": 0.036291466701596636 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.39215686274509803, + "acc_stderr": 0.048580835742663454, + "acc_norm": 0.39215686274509803, + "acc_norm_stderr": 0.048580835742663454 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.83, + "acc_stderr": 0.03775251680686371, + "acc_norm": 0.83, + "acc_norm_stderr": 0.03775251680686371 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.5617021276595745, + "acc_stderr": 0.03243618636108101, + "acc_norm": 0.5617021276595745, + "acc_norm_stderr": 0.03243618636108101 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.4473684210526316, + "acc_stderr": 0.04677473004491199, + "acc_norm": 0.4473684210526316, + "acc_norm_stderr": 0.04677473004491199 + }, + 
"harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.5586206896551724, + "acc_stderr": 0.04137931034482758, + "acc_norm": 0.5586206896551724, + "acc_norm_stderr": 0.04137931034482758 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.4021164021164021, + "acc_stderr": 0.02525303255499769, + "acc_norm": 0.4021164021164021, + "acc_norm_stderr": 0.02525303255499769 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.4444444444444444, + "acc_stderr": 0.04444444444444449, + "acc_norm": 0.4444444444444444, + "acc_norm_stderr": 0.04444444444444449 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.4, + "acc_stderr": 0.049236596391733084, + "acc_norm": 0.4, + "acc_norm_stderr": 0.049236596391733084 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.7741935483870968, + "acc_stderr": 0.023785577884181015, + "acc_norm": 0.7741935483870968, + "acc_norm_stderr": 0.023785577884181015 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.5172413793103449, + "acc_stderr": 0.035158955511656986, + "acc_norm": 0.5172413793103449, + "acc_norm_stderr": 0.035158955511656986 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.71, + "acc_stderr": 0.045604802157206845, + "acc_norm": 0.71, + "acc_norm_stderr": 0.045604802157206845 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.7636363636363637, + "acc_stderr": 0.03317505930009182, + "acc_norm": 0.7636363636363637, + "acc_norm_stderr": 0.03317505930009182 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.7929292929292929, + "acc_stderr": 0.02886977846026705, + "acc_norm": 0.7929292929292929, + "acc_norm_stderr": 0.02886977846026705 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.9015544041450777, + "acc_stderr": 0.02150024957603346, + "acc_norm": 0.9015544041450777, + "acc_norm_stderr": 0.02150024957603346 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.6564102564102564, + "acc_stderr": 0.024078696580635477, + "acc_norm": 0.6564102564102564, + "acc_norm_stderr": 0.024078696580635477 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.35185185185185186, + "acc_stderr": 0.02911661760608301, + "acc_norm": 0.35185185185185186, + "acc_norm_stderr": 0.02911661760608301 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.6680672268907563, + "acc_stderr": 0.03058869701378364, + "acc_norm": 0.6680672268907563, + "acc_norm_stderr": 0.03058869701378364 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.33774834437086093, + "acc_stderr": 0.03861557546255169, + "acc_norm": 0.33774834437086093, + "acc_norm_stderr": 0.03861557546255169 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.818348623853211, + "acc_stderr": 0.01653061740926688, + "acc_norm": 0.818348623853211, + "acc_norm_stderr": 0.01653061740926688 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.5092592592592593, + "acc_stderr": 0.034093869469927006, + "acc_norm": 0.5092592592592593, + "acc_norm_stderr": 0.034093869469927006 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.7990196078431373, + "acc_stderr": 0.028125972265654373, + "acc_norm": 0.7990196078431373, + "acc_norm_stderr": 0.028125972265654373 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.7848101265822784, + "acc_stderr": 0.02675082699467617, + "acc_norm": 0.7848101265822784, + "acc_norm_stderr": 
0.02675082699467617 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.6995515695067265, + "acc_stderr": 0.030769352008229146, + "acc_norm": 0.6995515695067265, + "acc_norm_stderr": 0.030769352008229146 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.7862595419847328, + "acc_stderr": 0.0359546161177469, + "acc_norm": 0.7862595419847328, + "acc_norm_stderr": 0.0359546161177469 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.8099173553719008, + "acc_stderr": 0.03581796951709282, + "acc_norm": 0.8099173553719008, + "acc_norm_stderr": 0.03581796951709282 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.7777777777777778, + "acc_stderr": 0.0401910747255735, + "acc_norm": 0.7777777777777778, + "acc_norm_stderr": 0.0401910747255735 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.7852760736196319, + "acc_stderr": 0.03226219377286775, + "acc_norm": 0.7852760736196319, + "acc_norm_stderr": 0.03226219377286775 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.4732142857142857, + "acc_stderr": 0.047389751192741546, + "acc_norm": 0.4732142857142857, + "acc_norm_stderr": 0.047389751192741546 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.7766990291262136, + "acc_stderr": 0.04123553189891431, + "acc_norm": 0.7766990291262136, + "acc_norm_stderr": 0.04123553189891431 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.8888888888888888, + "acc_stderr": 0.020588491316092375, + "acc_norm": 0.8888888888888888, + "acc_norm_stderr": 0.020588491316092375 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.75, + "acc_stderr": 0.04351941398892446, + "acc_norm": 0.75, + "acc_norm_stderr": 0.04351941398892446 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.8186462324393359, + "acc_stderr": 0.013778693778464076, + "acc_norm": 0.8186462324393359, + "acc_norm_stderr": 0.013778693778464076 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.7312138728323699, + "acc_stderr": 0.023868003262500107, + "acc_norm": 0.7312138728323699, + "acc_norm_stderr": 0.023868003262500107 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.3541899441340782, + "acc_stderr": 0.015995644947299235, + "acc_norm": 0.3541899441340782, + "acc_norm_stderr": 0.015995644947299235 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.7581699346405228, + "acc_stderr": 0.024518195641879334, + "acc_norm": 0.7581699346405228, + "acc_norm_stderr": 0.024518195641879334 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.6977491961414791, + "acc_stderr": 0.02608270069539966, + "acc_norm": 0.6977491961414791, + "acc_norm_stderr": 0.02608270069539966 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.7469135802469136, + "acc_stderr": 0.024191808600713, + "acc_norm": 0.7469135802469136, + "acc_norm_stderr": 0.024191808600713 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.4858156028368794, + "acc_stderr": 0.02981549448368206, + "acc_norm": 0.4858156028368794, + "acc_norm_stderr": 0.02981549448368206 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.45436766623207303, + "acc_stderr": 0.012716941720734804, + "acc_norm": 0.45436766623207303, + "acc_norm_stderr": 0.012716941720734804 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.6617647058823529, + "acc_stderr": 0.02873932851398357, + "acc_norm": 0.6617647058823529, + "acc_norm_stderr": 0.02873932851398357 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.6633986928104575, + 
"acc_stderr": 0.01911721391149515, + "acc_norm": 0.6633986928104575, + "acc_norm_stderr": 0.01911721391149515 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.6727272727272727, + "acc_stderr": 0.04494290866252089, + "acc_norm": 0.6727272727272727, + "acc_norm_stderr": 0.04494290866252089 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.7510204081632653, + "acc_stderr": 0.027682979522960234, + "acc_norm": 0.7510204081632653, + "acc_norm_stderr": 0.027682979522960234 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.845771144278607, + "acc_stderr": 0.025538433368578337, + "acc_norm": 0.845771144278607, + "acc_norm_stderr": 0.025538433368578337 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.84, + "acc_stderr": 0.03684529491774709, + "acc_norm": 0.84, + "acc_norm_stderr": 0.03684529491774709 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.4879518072289157, + "acc_stderr": 0.03891364495835821, + "acc_norm": 0.4879518072289157, + "acc_norm_stderr": 0.03891364495835821 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.8362573099415205, + "acc_stderr": 0.028380919596145866, + "acc_norm": 0.8362573099415205, + "acc_norm_stderr": 0.028380919596145866 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.35006119951040393, + "mc1_stderr": 0.01669794942015103, + "mc2": 0.506642206921476, + "mc2_stderr": 0.01507360276939749 + }, + "harness|winogrande|5": { + "acc": 0.7924230465666929, + "acc_stderr": 0.011398593419386772 + }, + "harness|drop|3": { + "em": 0.04173657718120805, + "em_stderr": 0.0020480498431639763, + "f1": 0.10687709731543625, + "f1_stderr": 0.002393001545443343 + }, + "harness|gsm8k|5": { + "acc": 0.19711902956785443, + "acc_stderr": 0.010958021630300631 + }, + "all": { + "acc": 0.637549491490135, + "acc_stderr": 0.03209136477170731, + "acc_norm": 0.6462461655536689, + "acc_norm_stderr": 0.03277654994312462, + "mc1": 0.35006119951040393, + "mc1_stderr": 0.01669794942015103, + "mc2": 0.506642206921476, + "mc2_stderr": 0.01507360276939749, + "em": 0.04173657718120805, + "em_stderr": 0.0020480498431639763, + "f1": 0.10687709731543625, + "f1_stderr": 0.002393001545443343 + } + }, + "versions": { + "all": 0, + "harness|arc:challenge|25": 0, + "harness|drop|3": 1, + "harness|gsm8k|5": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + 
"harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "harness|winogrande|5": 0 + }, + "config_tasks": { + "harness|arc:challenge": "LM Harness task", + "harness|drop": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + 
"harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|arc:challenge|25": { + "hashes": { + "hash_examples": "17b0cae357c0259e", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "9bcd0d1d37471713", + "hash_cont_tokens": "289aa98c400841d8" + }, + "truncated": 0, + "non_truncated": 1172, + "padded": 4670, + "non_padded": 17, + "effective_few_shots": 25.0, + "num_truncated_few_shots": 0 + }, + "harness|hellaswag|10": { + "hashes": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "80b8c6d79740318e", + "hash_cont_tokens": "ac460260c3e6efc9" + }, + "truncated": 0, + "non_truncated": 10042, + "padded": 40101, + "non_padded": 67, + "effective_few_shots": 10.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hashes": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "b813d36287c6556c", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-anatomy|5": { + "hashes": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "09dc2380497f7a47", + "hash_cont_tokens": "a52a4f60d98cbe5c" + }, + "truncated": 0, + "non_truncated": 135, + "padded": 540, + "non_padded": 0, + 
"effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-astronomy|5": { + "hashes": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "68ca3220b0fdd1f3", + "hash_cont_tokens": "10f7d8eeba97841d" + }, + "truncated": 0, + "non_truncated": 152, + "padded": 608, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-business_ethics|5": { + "hashes": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "bd14ef1320de241e", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hashes": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "d96186ab98017c43", + "hash_cont_tokens": "edef9975ba9165b5" + }, + "truncated": 0, + "non_truncated": 265, + "padded": 1060, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_biology|5": { + "hashes": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "424136b34e95b200", + "hash_cont_tokens": "0aa103ec6602280b" + }, + "truncated": 0, + "non_truncated": 144, + "padded": 576, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_chemistry|5": { + "hashes": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "8dd8b80e336bbe54", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_computer_science|5": { + "hashes": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "145d4cef8ca2261d", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_mathematics|5": { + "hashes": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "561995d32d2b25c4", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_medicine|5": { + "hashes": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "6a258a9d4418599c", + "hash_cont_tokens": "1979021dbc698754" + }, + "truncated": 0, + "non_truncated": 173, + "padded": 692, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_physics|5": { + "hashes": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "fa5e0d5b5f97b66a", + "hash_cont_tokens": "7cf7fe2bab00acbd" + }, + "truncated": 0, + "non_truncated": 102, + "padded": 408, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-computer_security|5": { + "hashes": { + "hash_examples": 
"006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "07d27397edfae492", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hashes": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "da5e6c3c8eb17da6", + "hash_cont_tokens": "903f64eed2b0d217" + }, + "truncated": 0, + "non_truncated": 235, + "padded": 940, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-econometrics|5": { + "hashes": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "f6ba8e358bdb523e", + "hash_cont_tokens": "721ae6c5302c4bf2" + }, + "truncated": 0, + "non_truncated": 114, + "padded": 456, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hashes": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "b2459da4c5ca8590", + "hash_cont_tokens": "15a738960ed3e587" + }, + "truncated": 0, + "non_truncated": 145, + "padded": 575, + "non_padded": 5, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hashes": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "0b969d9ad706a13a", + "hash_cont_tokens": "c96470462fc71683" + }, + "truncated": 0, + "non_truncated": 378, + "padded": 1512, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-formal_logic|5": { + "hashes": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "02bc3eb5f90da86e", + "hash_cont_tokens": "0e1ce025c9d6ee7e" + }, + "truncated": 0, + "non_truncated": 126, + "padded": 504, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-global_facts|5": { + "hashes": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "3d5106918bcbeb43", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_biology|5": { + "hashes": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "7b089392db2dabbd", + "hash_cont_tokens": "e34d57f7d3c4ca16" + }, + "truncated": 0, + "non_truncated": 310, + "padded": 1240, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hashes": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "ba90b2ffed1c067d", + "hash_cont_tokens": "e8482d44df4b3740" + }, + "truncated": 0, + "non_truncated": 203, + "padded": 812, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hashes": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "60eeec309ef0717f", + "hash_cont_tokens": 
"17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hashes": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "5e5e8bf3808e0ead", + "hash_cont_tokens": "d63e679a49418339" + }, + "truncated": 0, + "non_truncated": 165, + "padded": 656, + "non_padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_geography|5": { + "hashes": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "4da9b741d4e7ea78", + "hash_cont_tokens": "d78483e286d06f1a" + }, + "truncated": 0, + "non_truncated": 198, + "padded": 792, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hashes": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "acb4bc872ac86ed7", + "hash_cont_tokens": "691cdff71ff5fe57" + }, + "truncated": 0, + "non_truncated": 193, + "padded": 772, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hashes": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "840fc6403eb69ab0", + "hash_cont_tokens": "d5ad4c5bdca967ad" + }, + "truncated": 0, + "non_truncated": 390, + "padded": 1560, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hashes": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "3629a7f2cd17faeb", + "hash_cont_tokens": "8f631ca5687dd0d4" + }, + "truncated": 0, + "non_truncated": 270, + "padded": 1080, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hashes": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "6846f684260e3997", + "hash_cont_tokens": "7321048a28451473" + }, + "truncated": 0, + "non_truncated": 238, + "padded": 952, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_physics|5": { + "hashes": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "85aee25d6bdad94a", + "hash_cont_tokens": "bb137581f269861c" + }, + "truncated": 0, + "non_truncated": 151, + "padded": 604, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hashes": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "290b66d6d666a35f", + "hash_cont_tokens": "b455cab2675bd863" + }, + "truncated": 0, + "non_truncated": 545, + "padded": 2180, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hashes": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "a77a7668b437bc82", + "hash_cont_tokens": "1b3196fec7e58037" + }, + "truncated": 0, + "non_truncated": 216, + "padded": 
864, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hashes": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "63548c7fa9ba7a78", + "hash_cont_tokens": "a331dedc2aa01b3e" + }, + "truncated": 0, + "non_truncated": 204, + "padded": 816, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hashes": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "83c5da18bfa50812", + "hash_cont_tokens": "d0fbe030b8c8c2bf" + }, + "truncated": 0, + "non_truncated": 237, + "padded": 948, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_aging|5": { + "hashes": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "bebbd11f22006685", + "hash_cont_tokens": "1dd29c3755494850" + }, + "truncated": 0, + "non_truncated": 223, + "padded": 892, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_sexuality|5": { + "hashes": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "7b85ee9b8ee54f4f", + "hash_cont_tokens": "c85573f663c10691" + }, + "truncated": 0, + "non_truncated": 131, + "padded": 524, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-international_law|5": { + "hashes": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "7bfc55ab7065943e", + "hash_cont_tokens": "d263804ba918154f" + }, + "truncated": 0, + "non_truncated": 121, + "padded": 484, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-jurisprudence|5": { + "hashes": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "69573f1675e053c6", + "hash_cont_tokens": "581986691a84ece8" + }, + "truncated": 0, + "non_truncated": 108, + "padded": 432, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hashes": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "552324ef20094bdc", + "hash_cont_tokens": "55a858b28bbda458" + }, + "truncated": 0, + "non_truncated": 163, + "padded": 652, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-machine_learning|5": { + "hashes": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "96449357a7318905", + "hash_cont_tokens": "e99d3d3efd4ac7a3" + }, + "truncated": 0, + "non_truncated": 112, + "padded": 448, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-management|5": { + "hashes": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "3b849249168e3b88", + "hash_cont_tokens": "13d9dc56bca34726" + }, + "truncated": 0, + "non_truncated": 103, + "padded": 412, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-marketing|5": { + "hashes": { + 
"hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "af0e186f2756b70d", + "hash_cont_tokens": "2700ea26933916a2" + }, + "truncated": 0, + "non_truncated": 234, + "padded": 936, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-medical_genetics|5": { + "hashes": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "9f6a6de16509b6d9", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-miscellaneous|5": { + "hashes": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "9194406d589f7c10", + "hash_cont_tokens": "7bf4341c79587250" + }, + "truncated": 0, + "non_truncated": 783, + "padded": 3132, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_disputes|5": { + "hashes": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "769486efc74d9f8e", + "hash_cont_tokens": "38a48e9de6976f00" + }, + "truncated": 0, + "non_truncated": 346, + "padded": 1384, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hashes": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "a90fd4dd90959dad", + "hash_cont_tokens": "761c4dc187689d89" + }, + "truncated": 0, + "non_truncated": 895, + "padded": 3580, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-nutrition|5": { + "hashes": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "1a3b843e66efd29b", + "hash_cont_tokens": "65005bd7d6f6012a" + }, + "truncated": 0, + "non_truncated": 306, + "padded": 1224, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-philosophy|5": { + "hashes": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "09820001a3d00013", + "hash_cont_tokens": "0b47934fb6314dec" + }, + "truncated": 0, + "non_truncated": 311, + "padded": 1244, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-prehistory|5": { + "hashes": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "7c4ec364ce2768c7", + "hash_cont_tokens": "3f20acd855ee0a29" + }, + "truncated": 0, + "non_truncated": 324, + "padded": 1296, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_accounting|5": { + "hashes": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "ced0534574d0ae3f", + "hash_cont_tokens": "8f122ba881355d4b" + }, + "truncated": 0, + "non_truncated": 282, + "padded": 1128, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_law|5": { + "hashes": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "bcbdbbde22ec73e3", + "hash_cont_tokens": "90d5df417c4d3fd3" + }, + 
"truncated": 0, + "non_truncated": 1534, + "padded": 6136, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_medicine|5": { + "hashes": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "c54d753563114d45", + "hash_cont_tokens": "4a2d2988884f7f70" + }, + "truncated": 0, + "non_truncated": 272, + "padded": 1088, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_psychology|5": { + "hashes": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "b75dc55c0e32fa52", + "hash_cont_tokens": "e0a952cb8a9c81de" + }, + "truncated": 0, + "non_truncated": 612, + "padded": 2448, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-public_relations|5": { + "hashes": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "5ccdc8ec8db99622", + "hash_cont_tokens": "1fa77a8dff3922b8" + }, + "truncated": 0, + "non_truncated": 110, + "padded": 440, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-security_studies|5": { + "hashes": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "ca8497342e5b1d57", + "hash_cont_tokens": "81fc9cb3cbdd52db" + }, + "truncated": 0, + "non_truncated": 245, + "padded": 980, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-sociology|5": { + "hashes": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "069c76424fbd3dab", + "hash_cont_tokens": "2a0493252ed2cf43" + }, + "truncated": 0, + "non_truncated": 201, + "padded": 804, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hashes": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "a7e393a626169576", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-virology|5": { + "hashes": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "bf99dc973e3a650d", + "hash_cont_tokens": "5ab892d003b00c98" + }, + "truncated": 0, + "non_truncated": 166, + "padded": 664, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-world_religions|5": { + "hashes": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "1761cfaf21797065", + "hash_cont_tokens": "15a5e5dbdfbb8568" + }, + "truncated": 0, + "non_truncated": 171, + "padded": 684, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|truthfulqa:mc|0": { + "hashes": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "298b43914bbdf4ca", + "hash_cont_tokens": "5a8d4bb398b1c3c0" + }, + "truncated": 0, + "non_truncated": 817, + "padded": 9996, + "non_padded": 0, + "effective_few_shots": 0.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + 
"hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "31aa3477d959f771", + "hash_cont_tokens": "618558fb93c0f288" + }, + "truncated": 0, + "non_truncated": 1267, + "padded": 2534, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|drop|3": { + "hashes": { + "hash_examples": "1d27416e8324e9a3", + "hash_full_prompts": "a5513ff9a741b385", + "hash_input_tokens": "a4fb946366902edf", + "hash_cont_tokens": "c7483c4e0fed1f1f" + }, + "truncated": 0, + "non_truncated": 9536, + "padded": 0, + "non_padded": 9536, + "effective_few_shots": 3.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "6af0ae8cfe684f50", + "hash_cont_tokens": "8b9cdb0b413d1a5e" + }, + "truncated": 0, + "non_truncated": 1319, + "padded": 0, + "non_padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "4eb459f19fc0f29d", + "hash_full_prompts": "21653ed56f202b4e", + "hash_input_tokens": "0ce409b3d436569d", + "hash_cont_tokens": "f2c30d571b615dba" + }, + "truncated": 0, + "non_truncated": 38195, + "padded": 113460, + "non_padded": 10948, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/uukuguy/speechless-mistral-7b-dare-0.85/results_2023-12-03T18-43-10.266119.json b/eval-results/uukuguy/speechless-mistral-7b-dare-0.85/results_2023-12-03T18-43-10.266119.json new file mode 100644 index 0000000000000000000000000000000000000000..021d81231186229b67cbb4e0125a30143f1d7cc2 --- /dev/null +++ b/eval-results/uukuguy/speechless-mistral-7b-dare-0.85/results_2023-12-03T18-43-10.266119.json @@ -0,0 +1,63 @@ +{ + "config_general": { + "lighteval_sha": "b35d4d84573be82d91c07ea46260f262f72cf69d", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "", + "start_time": 80976.203483026, + "end_time": 83861.312668194, + "total_evaluation_time_secondes": "2885.1091851680103", + "model_name": "uukuguy/speechless-mistral-7b-dare-0.85", + "model_sha": "b19e60f64b3be7f41658958658658bc12038c68f", + "model_dtype": "torch.bfloat16", + "model_size": "13.99 GB" + }, + "results": { + "harness|gsm8k|5": { + "acc": 0.45564821834723274, + "acc_stderr": 0.013718194542485601 + }, + "all": { + "acc": 0.45564821834723274, + "acc_stderr": 0.013718194542485601 + } + }, + "versions": { + "all": 0, + "harness|gsm8k|5": 0 + }, + "config_tasks": { + "harness|gsm8k": "LM Harness task" + }, + "summary_tasks": { + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "6af0ae8cfe684f50", + "hash_cont_tokens": "8b9cdb0b413d1a5e" + }, + "truncated": 0, + "non_truncated": 1319, + "padded": 0, + "non_padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "18b756b7813d1bdf", + "hash_full_prompts": "deb3b1dff10b95aa", + "hash_input_tokens": "f17391d49d33b9c0", + "hash_cont_tokens": "a06c02eb7fcd2ab5" + }, + "truncated": 0, + "non_truncated": 1319, + "padded": 0, + "non_padded": 1319, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/uukuguy/speechless-mistral-dolphin-orca-platypus-samantha-7b-dare-0.85/results_2023-11-28T06-27-52.070093.json 
b/eval-results/uukuguy/speechless-mistral-dolphin-orca-platypus-samantha-7b-dare-0.85/results_2023-11-28T06-27-52.070093.json new file mode 100644 index 0000000000000000000000000000000000000000..1e71d91cdb1abcf228c0e10717c0d4788fe07559 --- /dev/null +++ b/eval-results/uukuguy/speechless-mistral-dolphin-orca-platypus-samantha-7b-dare-0.85/results_2023-11-28T06-27-52.070093.json @@ -0,0 +1,1435 @@ +{ + "config_general": { + "lighteval_sha": "9ffc410f6c40b8cfefe7167cb47aefe69ced61e1", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "", + "start_time": 1051048.540620069, + "end_time": 1065889.794992604, + "total_evaluation_time_secondes": "14841.25437253504", + "model_name": "uukuguy/speechless-mistral-dolphin-orca-platypus-samantha-7b-dare-0.85", + "model_sha": "7a3def1c382793d2b12741896302c31a471b6d1d", + "model_dtype": "torch.float16", + "model_size": "13.99 GB" + }, + "results": { + "harness|arc:challenge|25": { + "acc": 0.5691126279863481, + "acc_stderr": 0.014471133392642471, + "acc_norm": 0.6168941979522184, + "acc_norm_stderr": 0.014206472661672876 + }, + "harness|hellaswag|10": { + "acc": 0.6338378809002191, + "acc_stderr": 0.004807699539973411, + "acc_norm": 0.838478390758813, + "acc_norm_stderr": 0.0036725927293636334 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.32, + "acc_stderr": 0.04688261722621503, + "acc_norm": 0.32, + "acc_norm_stderr": 0.04688261722621503 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.6296296296296297, + "acc_stderr": 0.041716541613545426, + "acc_norm": 0.6296296296296297, + "acc_norm_stderr": 0.041716541613545426 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.6644736842105263, + "acc_stderr": 0.03842498559395268, + "acc_norm": 0.6644736842105263, + "acc_norm_stderr": 0.03842498559395268 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.58, + "acc_stderr": 0.049604496374885836, + "acc_norm": 0.58, + "acc_norm_stderr": 0.049604496374885836 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.7094339622641509, + "acc_stderr": 0.02794321998933714, + "acc_norm": 0.7094339622641509, + "acc_norm_stderr": 0.02794321998933714 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.7361111111111112, + "acc_stderr": 0.03685651095897532, + "acc_norm": 0.7361111111111112, + "acc_norm_stderr": 0.03685651095897532 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.51, + "acc_stderr": 0.05024183937956912, + "acc_norm": 0.51, + "acc_norm_stderr": 0.05024183937956912 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.55, + "acc_stderr": 0.049999999999999996, + "acc_norm": 0.55, + "acc_norm_stderr": 0.049999999999999996 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.39, + "acc_stderr": 0.04902071300001975, + "acc_norm": 0.39, + "acc_norm_stderr": 0.04902071300001975 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.6647398843930635, + "acc_stderr": 0.03599586301247077, + "acc_norm": 0.6647398843930635, + "acc_norm_stderr": 0.03599586301247077 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.37254901960784315, + "acc_stderr": 0.04810840148082636, + "acc_norm": 0.37254901960784315, + "acc_norm_stderr": 0.04810840148082636 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.77, + "acc_stderr": 0.042295258468165065, + "acc_norm": 0.77, + "acc_norm_stderr": 0.042295258468165065 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 
0.5659574468085107, + "acc_stderr": 0.03240038086792747, + "acc_norm": 0.5659574468085107, + "acc_norm_stderr": 0.03240038086792747 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.4824561403508772, + "acc_stderr": 0.04700708033551038, + "acc_norm": 0.4824561403508772, + "acc_norm_stderr": 0.04700708033551038 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.5724137931034483, + "acc_stderr": 0.04122737111370332, + "acc_norm": 0.5724137931034483, + "acc_norm_stderr": 0.04122737111370332 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.3862433862433862, + "acc_stderr": 0.025075981767601688, + "acc_norm": 0.3862433862433862, + "acc_norm_stderr": 0.025075981767601688 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.42857142857142855, + "acc_stderr": 0.04426266681379909, + "acc_norm": 0.42857142857142855, + "acc_norm_stderr": 0.04426266681379909 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.37, + "acc_stderr": 0.04852365870939099, + "acc_norm": 0.37, + "acc_norm_stderr": 0.04852365870939099 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.7580645161290323, + "acc_stderr": 0.024362599693031096, + "acc_norm": 0.7580645161290323, + "acc_norm_stderr": 0.024362599693031096 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.5073891625615764, + "acc_stderr": 0.0351760354036101, + "acc_norm": 0.5073891625615764, + "acc_norm_stderr": 0.0351760354036101 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.69, + "acc_stderr": 0.04648231987117316, + "acc_norm": 0.69, + "acc_norm_stderr": 0.04648231987117316 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.7818181818181819, + "acc_stderr": 0.03225078108306289, + "acc_norm": 0.7818181818181819, + "acc_norm_stderr": 0.03225078108306289 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.7828282828282829, + "acc_stderr": 0.02937661648494563, + "acc_norm": 0.7828282828282829, + "acc_norm_stderr": 0.02937661648494563 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.8756476683937824, + "acc_stderr": 0.023814477086593542, + "acc_norm": 0.8756476683937824, + "acc_norm_stderr": 0.023814477086593542 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.6564102564102564, + "acc_stderr": 0.024078696580635477, + "acc_norm": 0.6564102564102564, + "acc_norm_stderr": 0.024078696580635477 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.35555555555555557, + "acc_stderr": 0.02918571494985741, + "acc_norm": 0.35555555555555557, + "acc_norm_stderr": 0.02918571494985741 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.6680672268907563, + "acc_stderr": 0.03058869701378364, + "acc_norm": 0.6680672268907563, + "acc_norm_stderr": 0.03058869701378364 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.31788079470198677, + "acc_stderr": 0.038020397601079024, + "acc_norm": 0.31788079470198677, + "acc_norm_stderr": 0.038020397601079024 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.818348623853211, + "acc_stderr": 0.016530617409266878, + "acc_norm": 0.818348623853211, + "acc_norm_stderr": 0.016530617409266878 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.5601851851851852, + "acc_stderr": 0.0338517797604481, + "acc_norm": 0.5601851851851852, + "acc_norm_stderr": 0.0338517797604481 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 
0.803921568627451, + "acc_stderr": 0.027865942286639318, + "acc_norm": 0.803921568627451, + "acc_norm_stderr": 0.027865942286639318 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.7763713080168776, + "acc_stderr": 0.027123298205229966, + "acc_norm": 0.7763713080168776, + "acc_norm_stderr": 0.027123298205229966 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.6905829596412556, + "acc_stderr": 0.03102441174057222, + "acc_norm": 0.6905829596412556, + "acc_norm_stderr": 0.03102441174057222 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.7938931297709924, + "acc_stderr": 0.03547771004159463, + "acc_norm": 0.7938931297709924, + "acc_norm_stderr": 0.03547771004159463 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.7933884297520661, + "acc_stderr": 0.03695980128098825, + "acc_norm": 0.7933884297520661, + "acc_norm_stderr": 0.03695980128098825 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.7962962962962963, + "acc_stderr": 0.03893542518824847, + "acc_norm": 0.7962962962962963, + "acc_norm_stderr": 0.03893542518824847 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.7730061349693251, + "acc_stderr": 0.03291099578615769, + "acc_norm": 0.7730061349693251, + "acc_norm_stderr": 0.03291099578615769 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.5, + "acc_stderr": 0.04745789978762494, + "acc_norm": 0.5, + "acc_norm_stderr": 0.04745789978762494 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.8155339805825242, + "acc_stderr": 0.03840423627288276, + "acc_norm": 0.8155339805825242, + "acc_norm_stderr": 0.03840423627288276 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.8717948717948718, + "acc_stderr": 0.02190190511507333, + "acc_norm": 0.8717948717948718, + "acc_norm_stderr": 0.02190190511507333 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.75, + "acc_stderr": 0.04351941398892446, + "acc_norm": 0.75, + "acc_norm_stderr": 0.04351941398892446 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.8173690932311622, + "acc_stderr": 0.013816335389973136, + "acc_norm": 0.8173690932311622, + "acc_norm_stderr": 0.013816335389973136 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.7138728323699421, + "acc_stderr": 0.02433214677913413, + "acc_norm": 0.7138728323699421, + "acc_norm_stderr": 0.02433214677913413 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.35977653631284917, + "acc_stderr": 0.016051419760310263, + "acc_norm": 0.35977653631284917, + "acc_norm_stderr": 0.016051419760310263 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.7516339869281046, + "acc_stderr": 0.02473998135511359, + "acc_norm": 0.7516339869281046, + "acc_norm_stderr": 0.02473998135511359 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.7106109324758842, + "acc_stderr": 0.025755865922632945, + "acc_norm": 0.7106109324758842, + "acc_norm_stderr": 0.025755865922632945 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.7283950617283951, + "acc_stderr": 0.02474862449053737, + "acc_norm": 0.7283950617283951, + "acc_norm_stderr": 0.02474862449053737 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.49645390070921985, + "acc_stderr": 0.02982674915328092, + "acc_norm": 0.49645390070921985, + "acc_norm_stderr": 0.02982674915328092 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.4426336375488918, + "acc_stderr": 0.012685906538206247, + "acc_norm": 0.4426336375488918, + "acc_norm_stderr": 0.012685906538206247 + 
}, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.6985294117647058, + "acc_stderr": 0.027875982114273168, + "acc_norm": 0.6985294117647058, + "acc_norm_stderr": 0.027875982114273168 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.6781045751633987, + "acc_stderr": 0.018901015322093085, + "acc_norm": 0.6781045751633987, + "acc_norm_stderr": 0.018901015322093085 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.6636363636363637, + "acc_stderr": 0.04525393596302506, + "acc_norm": 0.6636363636363637, + "acc_norm_stderr": 0.04525393596302506 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.7428571428571429, + "acc_stderr": 0.02797982353874455, + "acc_norm": 0.7428571428571429, + "acc_norm_stderr": 0.02797982353874455 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.8308457711442786, + "acc_stderr": 0.026508590656233268, + "acc_norm": 0.8308457711442786, + "acc_norm_stderr": 0.026508590656233268 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.88, + "acc_stderr": 0.03265986323710906, + "acc_norm": 0.88, + "acc_norm_stderr": 0.03265986323710906 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.536144578313253, + "acc_stderr": 0.038823108508905954, + "acc_norm": 0.536144578313253, + "acc_norm_stderr": 0.038823108508905954 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.8362573099415205, + "acc_stderr": 0.028380919596145866, + "acc_norm": 0.8362573099415205, + "acc_norm_stderr": 0.028380919596145866 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.2876376988984088, + "mc1_stderr": 0.01584631510139481, + "mc2": 0.4313430300768237, + "mc2_stderr": 0.014237388533726152 + }, + "harness|winogrande|5": { + "acc": 0.7892659826361483, + "acc_stderr": 0.011462046419710676 + }, + "harness|drop|3": { + "em": 0.0016778523489932886, + "em_stderr": 0.00041913301788268467, + "f1": 0.06324035234899322, + "f1_stderr": 0.001392729457467942 + }, + "harness|gsm8k|5": { + "acc": 0.18347232752084913, + "acc_stderr": 0.010661370448699657 + }, + "all": { + "acc": 0.6377438290695429, + "acc_stderr": 0.032141473401050225, + "acc_norm": 0.6471535142866032, + "acc_norm_stderr": 0.032832316831424985, + "mc1": 0.2876376988984088, + "mc1_stderr": 0.01584631510139481, + "mc2": 0.4313430300768237, + "mc2_stderr": 0.014237388533726152, + "em": 0.0016778523489932886, + "em_stderr": 0.00041913301788268467, + "f1": 0.06324035234899322, + "f1_stderr": 0.001392729457467942 + } + }, + "versions": { + "all": 0, + "harness|arc:challenge|25": 0, + "harness|drop|3": 1, + "harness|gsm8k|5": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 
1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "harness|winogrande|5": 0 + }, + "config_tasks": { + "harness|arc:challenge": "LM Harness task", + "harness|drop": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + 
"harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|arc:challenge|25": { + "hashes": { + "hash_examples": "17b0cae357c0259e", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "9bcd0d1d37471713", + "hash_cont_tokens": "289aa98c400841d8" + }, + "truncated": 0, + "non_truncated": 1172, + "padded": 4670, + "non_padded": 17, + "effective_few_shots": 25.0, + "num_truncated_few_shots": 0 + }, + "harness|hellaswag|10": { + "hashes": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "80b8c6d79740318e", + "hash_cont_tokens": "ac460260c3e6efc9" + }, + "truncated": 0, + "non_truncated": 10042, + "padded": 40101, + "non_padded": 67, + "effective_few_shots": 10.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hashes": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "b813d36287c6556c", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + 
"harness|hendrycksTest-anatomy|5": { + "hashes": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "09dc2380497f7a47", + "hash_cont_tokens": "a52a4f60d98cbe5c" + }, + "truncated": 0, + "non_truncated": 135, + "padded": 540, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-astronomy|5": { + "hashes": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "68ca3220b0fdd1f3", + "hash_cont_tokens": "10f7d8eeba97841d" + }, + "truncated": 0, + "non_truncated": 152, + "padded": 608, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-business_ethics|5": { + "hashes": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "bd14ef1320de241e", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hashes": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "d96186ab98017c43", + "hash_cont_tokens": "edef9975ba9165b5" + }, + "truncated": 0, + "non_truncated": 265, + "padded": 1060, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_biology|5": { + "hashes": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "424136b34e95b200", + "hash_cont_tokens": "0aa103ec6602280b" + }, + "truncated": 0, + "non_truncated": 144, + "padded": 576, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_chemistry|5": { + "hashes": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "8dd8b80e336bbe54", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_computer_science|5": { + "hashes": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "145d4cef8ca2261d", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_mathematics|5": { + "hashes": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "561995d32d2b25c4", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_medicine|5": { + "hashes": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "6a258a9d4418599c", + "hash_cont_tokens": "1979021dbc698754" + }, + "truncated": 0, + "non_truncated": 173, + "padded": 692, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_physics|5": { + "hashes": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": 
"fa5e0d5b5f97b66a", + "hash_cont_tokens": "7cf7fe2bab00acbd" + }, + "truncated": 0, + "non_truncated": 102, + "padded": 408, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-computer_security|5": { + "hashes": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "07d27397edfae492", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hashes": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "da5e6c3c8eb17da6", + "hash_cont_tokens": "903f64eed2b0d217" + }, + "truncated": 0, + "non_truncated": 235, + "padded": 940, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-econometrics|5": { + "hashes": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "f6ba8e358bdb523e", + "hash_cont_tokens": "721ae6c5302c4bf2" + }, + "truncated": 0, + "non_truncated": 114, + "padded": 456, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hashes": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "b2459da4c5ca8590", + "hash_cont_tokens": "15a738960ed3e587" + }, + "truncated": 0, + "non_truncated": 145, + "padded": 575, + "non_padded": 5, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hashes": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "0b969d9ad706a13a", + "hash_cont_tokens": "c96470462fc71683" + }, + "truncated": 0, + "non_truncated": 378, + "padded": 1512, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-formal_logic|5": { + "hashes": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "02bc3eb5f90da86e", + "hash_cont_tokens": "0e1ce025c9d6ee7e" + }, + "truncated": 0, + "non_truncated": 126, + "padded": 504, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-global_facts|5": { + "hashes": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "3d5106918bcbeb43", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_biology|5": { + "hashes": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "7b089392db2dabbd", + "hash_cont_tokens": "e34d57f7d3c4ca16" + }, + "truncated": 0, + "non_truncated": 310, + "padded": 1240, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hashes": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "ba90b2ffed1c067d", + "hash_cont_tokens": "e8482d44df4b3740" + }, + "truncated": 0, + "non_truncated": 203, + "padded": 812, + "non_padded": 0, + 
"effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hashes": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "60eeec309ef0717f", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hashes": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "5e5e8bf3808e0ead", + "hash_cont_tokens": "d63e679a49418339" + }, + "truncated": 0, + "non_truncated": 165, + "padded": 656, + "non_padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_geography|5": { + "hashes": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "4da9b741d4e7ea78", + "hash_cont_tokens": "d78483e286d06f1a" + }, + "truncated": 0, + "non_truncated": 198, + "padded": 792, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hashes": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "acb4bc872ac86ed7", + "hash_cont_tokens": "691cdff71ff5fe57" + }, + "truncated": 0, + "non_truncated": 193, + "padded": 772, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hashes": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "840fc6403eb69ab0", + "hash_cont_tokens": "d5ad4c5bdca967ad" + }, + "truncated": 0, + "non_truncated": 390, + "padded": 1560, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hashes": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "3629a7f2cd17faeb", + "hash_cont_tokens": "8f631ca5687dd0d4" + }, + "truncated": 0, + "non_truncated": 270, + "padded": 1080, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hashes": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "6846f684260e3997", + "hash_cont_tokens": "7321048a28451473" + }, + "truncated": 0, + "non_truncated": 238, + "padded": 952, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_physics|5": { + "hashes": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "85aee25d6bdad94a", + "hash_cont_tokens": "bb137581f269861c" + }, + "truncated": 0, + "non_truncated": 151, + "padded": 604, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hashes": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "290b66d6d666a35f", + "hash_cont_tokens": "b455cab2675bd863" + }, + "truncated": 0, + "non_truncated": 545, + "padded": 2180, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + 
"harness|hendrycksTest-high_school_statistics|5": { + "hashes": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "a77a7668b437bc82", + "hash_cont_tokens": "1b3196fec7e58037" + }, + "truncated": 0, + "non_truncated": 216, + "padded": 864, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hashes": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "63548c7fa9ba7a78", + "hash_cont_tokens": "a331dedc2aa01b3e" + }, + "truncated": 0, + "non_truncated": 204, + "padded": 816, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hashes": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "83c5da18bfa50812", + "hash_cont_tokens": "d0fbe030b8c8c2bf" + }, + "truncated": 0, + "non_truncated": 237, + "padded": 948, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_aging|5": { + "hashes": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "bebbd11f22006685", + "hash_cont_tokens": "1dd29c3755494850" + }, + "truncated": 0, + "non_truncated": 223, + "padded": 892, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_sexuality|5": { + "hashes": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "7b85ee9b8ee54f4f", + "hash_cont_tokens": "c85573f663c10691" + }, + "truncated": 0, + "non_truncated": 131, + "padded": 524, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-international_law|5": { + "hashes": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "7bfc55ab7065943e", + "hash_cont_tokens": "d263804ba918154f" + }, + "truncated": 0, + "non_truncated": 121, + "padded": 484, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-jurisprudence|5": { + "hashes": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "69573f1675e053c6", + "hash_cont_tokens": "581986691a84ece8" + }, + "truncated": 0, + "non_truncated": 108, + "padded": 432, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hashes": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "552324ef20094bdc", + "hash_cont_tokens": "55a858b28bbda458" + }, + "truncated": 0, + "non_truncated": 163, + "padded": 652, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-machine_learning|5": { + "hashes": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "96449357a7318905", + "hash_cont_tokens": "e99d3d3efd4ac7a3" + }, + "truncated": 0, + "non_truncated": 112, + "padded": 448, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-management|5": { + "hashes": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + 
"hash_input_tokens": "3b849249168e3b88", + "hash_cont_tokens": "13d9dc56bca34726" + }, + "truncated": 0, + "non_truncated": 103, + "padded": 412, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-marketing|5": { + "hashes": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "af0e186f2756b70d", + "hash_cont_tokens": "2700ea26933916a2" + }, + "truncated": 0, + "non_truncated": 234, + "padded": 936, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-medical_genetics|5": { + "hashes": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "9f6a6de16509b6d9", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-miscellaneous|5": { + "hashes": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "9194406d589f7c10", + "hash_cont_tokens": "7bf4341c79587250" + }, + "truncated": 0, + "non_truncated": 783, + "padded": 3132, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_disputes|5": { + "hashes": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "769486efc74d9f8e", + "hash_cont_tokens": "38a48e9de6976f00" + }, + "truncated": 0, + "non_truncated": 346, + "padded": 1384, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hashes": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "a90fd4dd90959dad", + "hash_cont_tokens": "761c4dc187689d89" + }, + "truncated": 0, + "non_truncated": 895, + "padded": 3580, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-nutrition|5": { + "hashes": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "1a3b843e66efd29b", + "hash_cont_tokens": "65005bd7d6f6012a" + }, + "truncated": 0, + "non_truncated": 306, + "padded": 1224, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-philosophy|5": { + "hashes": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "09820001a3d00013", + "hash_cont_tokens": "0b47934fb6314dec" + }, + "truncated": 0, + "non_truncated": 311, + "padded": 1244, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-prehistory|5": { + "hashes": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "7c4ec364ce2768c7", + "hash_cont_tokens": "3f20acd855ee0a29" + }, + "truncated": 0, + "non_truncated": 324, + "padded": 1296, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_accounting|5": { + "hashes": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "ced0534574d0ae3f", + "hash_cont_tokens": "8f122ba881355d4b" + }, + "truncated": 0, + "non_truncated": 282, + "padded": 1128, + "non_padded": 0, + 
"effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_law|5": { + "hashes": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "bcbdbbde22ec73e3", + "hash_cont_tokens": "90d5df417c4d3fd3" + }, + "truncated": 0, + "non_truncated": 1534, + "padded": 6136, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_medicine|5": { + "hashes": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "c54d753563114d45", + "hash_cont_tokens": "4a2d2988884f7f70" + }, + "truncated": 0, + "non_truncated": 272, + "padded": 1088, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_psychology|5": { + "hashes": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "b75dc55c0e32fa52", + "hash_cont_tokens": "e0a952cb8a9c81de" + }, + "truncated": 0, + "non_truncated": 612, + "padded": 2448, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-public_relations|5": { + "hashes": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "5ccdc8ec8db99622", + "hash_cont_tokens": "1fa77a8dff3922b8" + }, + "truncated": 0, + "non_truncated": 110, + "padded": 440, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-security_studies|5": { + "hashes": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "ca8497342e5b1d57", + "hash_cont_tokens": "81fc9cb3cbdd52db" + }, + "truncated": 0, + "non_truncated": 245, + "padded": 980, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-sociology|5": { + "hashes": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "069c76424fbd3dab", + "hash_cont_tokens": "2a0493252ed2cf43" + }, + "truncated": 0, + "non_truncated": 201, + "padded": 804, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hashes": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "a7e393a626169576", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-virology|5": { + "hashes": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "bf99dc973e3a650d", + "hash_cont_tokens": "5ab892d003b00c98" + }, + "truncated": 0, + "non_truncated": 166, + "padded": 664, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-world_religions|5": { + "hashes": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "1761cfaf21797065", + "hash_cont_tokens": "15a5e5dbdfbb8568" + }, + "truncated": 0, + "non_truncated": 171, + "padded": 684, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|truthfulqa:mc|0": { + "hashes": { + "hash_examples": "23176c0531c7b867", + 
"hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "298b43914bbdf4ca", + "hash_cont_tokens": "5a8d4bb398b1c3c0" + }, + "truncated": 0, + "non_truncated": 817, + "padded": 9996, + "non_padded": 0, + "effective_few_shots": 0.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "31aa3477d959f771", + "hash_cont_tokens": "618558fb93c0f288" + }, + "truncated": 0, + "non_truncated": 1267, + "padded": 2534, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|drop|3": { + "hashes": { + "hash_examples": "1d27416e8324e9a3", + "hash_full_prompts": "a5513ff9a741b385", + "hash_input_tokens": "a4fb946366902edf", + "hash_cont_tokens": "8773f93e0b5559cb" + }, + "truncated": 0, + "non_truncated": 9536, + "padded": 0, + "non_padded": 9536, + "effective_few_shots": 3.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "6af0ae8cfe684f50", + "hash_cont_tokens": "e49ce31107ab0d3c" + }, + "truncated": 0, + "non_truncated": 1319, + "padded": 0, + "non_padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "4eb459f19fc0f29d", + "hash_full_prompts": "21653ed56f202b4e", + "hash_input_tokens": "0ce409b3d436569d", + "hash_cont_tokens": "d34b87a5aad48b7b" + }, + "truncated": 0, + "non_truncated": 38195, + "padded": 113460, + "non_padded": 10948, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/uukuguy/speechless-mistral-dolphin-orca-platypus-samantha-7b-dare-0.85/results_2023-12-03T19-04-28.043244.json b/eval-results/uukuguy/speechless-mistral-dolphin-orca-platypus-samantha-7b-dare-0.85/results_2023-12-03T19-04-28.043244.json new file mode 100644 index 0000000000000000000000000000000000000000..c18d250dbe4bb1f5147f81e94a8d449da4f4ce36 --- /dev/null +++ b/eval-results/uukuguy/speechless-mistral-dolphin-orca-platypus-samantha-7b-dare-0.85/results_2023-12-03T19-04-28.043244.json @@ -0,0 +1,63 @@ +{ + "config_general": { + "lighteval_sha": "b35d4d84573be82d91c07ea46260f262f72cf69d", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "", + "start_time": 82178.817431969, + "end_time": 85142.897812076, + "total_evaluation_time_secondes": "2964.0803801069997", + "model_name": "uukuguy/speechless-mistral-dolphin-orca-platypus-samantha-7b-dare-0.85", + "model_sha": "7a3def1c382793d2b12741896302c31a471b6d1d", + "model_dtype": "torch.float16", + "model_size": "13.99 GB" + }, + "results": { + "harness|gsm8k|5": { + "acc": 0.40333586050037906, + "acc_stderr": 0.013512654781814695 + }, + "all": { + "acc": 0.40333586050037906, + "acc_stderr": 0.013512654781814695 + } + }, + "versions": { + "all": 0, + "harness|gsm8k|5": 0 + }, + "config_tasks": { + "harness|gsm8k": "LM Harness task" + }, + "summary_tasks": { + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "6af0ae8cfe684f50", + "hash_cont_tokens": "e49ce31107ab0d3c" + }, + "truncated": 0, + "non_truncated": 1319, + "padded": 0, + "non_padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": 
"18b756b7813d1bdf", + "hash_full_prompts": "deb3b1dff10b95aa", + "hash_input_tokens": "f17391d49d33b9c0", + "hash_cont_tokens": "c1094a700c288f1a" + }, + "truncated": 0, + "non_truncated": 1319, + "padded": 0, + "non_padded": 1319, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/uukuguy/speechless-mistral-dolphin-orca-platypus-samantha-7b/results_2023-11-09T14-37-01.184556.json b/eval-results/uukuguy/speechless-mistral-dolphin-orca-platypus-samantha-7b/results_2023-11-09T14-37-01.184556.json new file mode 100644 index 0000000000000000000000000000000000000000..c269dbd65c6f8603a73346326563507ce1720bb2 --- /dev/null +++ b/eval-results/uukuguy/speechless-mistral-dolphin-orca-platypus-samantha-7b/results_2023-11-09T14-37-01.184556.json @@ -0,0 +1,1433 @@ +{ + "config_general": { + "lighteval_sha": "167773f1d5d1647c60dadc31c9e731ab7dbcbbad", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "", + "model_name": "uukuguy/speechless-mistral-dolphin-orca-platypus-samantha-7b", + "model_sha": "d4039b40e842df7f6b8de50532444c8944ea5791", + "model_dtype": "torch.bfloat16", + "model_size": "13.99 GB" + }, + "results": { + "harness|arc:challenge|25": { + "acc": 0.6083617747440273, + "acc_stderr": 0.014264122124938211, + "acc_norm": 0.643344709897611, + "acc_norm_stderr": 0.013998056902620194 + }, + "harness|hellaswag|10": { + "acc": 0.6489743079067914, + "acc_stderr": 0.004763155068744876, + "acc_norm": 0.8439553873730332, + "acc_norm_stderr": 0.003621559719378182 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.29, + "acc_stderr": 0.045604802157206845, + "acc_norm": 0.29, + "acc_norm_stderr": 0.045604802157206845 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.6074074074074074, + "acc_stderr": 0.0421850621536888, + "acc_norm": 0.6074074074074074, + "acc_norm_stderr": 0.0421850621536888 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.6973684210526315, + "acc_stderr": 0.03738520676119668, + "acc_norm": 0.6973684210526315, + "acc_norm_stderr": 0.03738520676119668 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.63, + "acc_stderr": 0.04852365870939099, + "acc_norm": 0.63, + "acc_norm_stderr": 0.04852365870939099 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.690566037735849, + "acc_stderr": 0.028450154794118637, + "acc_norm": 0.690566037735849, + "acc_norm_stderr": 0.028450154794118637 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.7291666666666666, + "acc_stderr": 0.037161774375660185, + "acc_norm": 0.7291666666666666, + "acc_norm_stderr": 0.037161774375660185 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.47, + "acc_stderr": 0.05016135580465919, + "acc_norm": 0.47, + "acc_norm_stderr": 0.05016135580465919 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.52, + "acc_stderr": 0.050211673156867795, + "acc_norm": 0.52, + "acc_norm_stderr": 0.050211673156867795 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.41, + "acc_stderr": 0.04943110704237102, + "acc_norm": 0.41, + "acc_norm_stderr": 0.04943110704237102 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.630057803468208, + "acc_stderr": 0.036812296333943194, + "acc_norm": 0.630057803468208, + "acc_norm_stderr": 0.036812296333943194 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.35294117647058826, + "acc_stderr": 0.04755129616062947, + "acc_norm": 0.35294117647058826, + 
"acc_norm_stderr": 0.04755129616062947 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.79, + "acc_stderr": 0.04093601807403326, + "acc_norm": 0.79, + "acc_norm_stderr": 0.04093601807403326 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.5574468085106383, + "acc_stderr": 0.03246956919789958, + "acc_norm": 0.5574468085106383, + "acc_norm_stderr": 0.03246956919789958 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.45614035087719296, + "acc_stderr": 0.04685473041907789, + "acc_norm": 0.45614035087719296, + "acc_norm_stderr": 0.04685473041907789 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.5862068965517241, + "acc_stderr": 0.04104269211806232, + "acc_norm": 0.5862068965517241, + "acc_norm_stderr": 0.04104269211806232 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.4021164021164021, + "acc_stderr": 0.025253032554997692, + "acc_norm": 0.4021164021164021, + "acc_norm_stderr": 0.025253032554997692 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.42857142857142855, + "acc_stderr": 0.04426266681379909, + "acc_norm": 0.42857142857142855, + "acc_norm_stderr": 0.04426266681379909 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.33, + "acc_stderr": 0.047258156262526045, + "acc_norm": 0.33, + "acc_norm_stderr": 0.047258156262526045 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.7774193548387097, + "acc_stderr": 0.023664216671642514, + "acc_norm": 0.7774193548387097, + "acc_norm_stderr": 0.023664216671642514 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.5123152709359606, + "acc_stderr": 0.035169204442208966, + "acc_norm": 0.5123152709359606, + "acc_norm_stderr": 0.035169204442208966 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.72, + "acc_stderr": 0.04512608598542129, + "acc_norm": 0.72, + "acc_norm_stderr": 0.04512608598542129 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.7515151515151515, + "acc_stderr": 0.033744026441394036, + "acc_norm": 0.7515151515151515, + "acc_norm_stderr": 0.033744026441394036 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.7676767676767676, + "acc_stderr": 0.030088629490217487, + "acc_norm": 0.7676767676767676, + "acc_norm_stderr": 0.030088629490217487 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.8601036269430051, + "acc_stderr": 0.025033870583015178, + "acc_norm": 0.8601036269430051, + "acc_norm_stderr": 0.025033870583015178 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.6461538461538462, + "acc_stderr": 0.02424378399406216, + "acc_norm": 0.6461538461538462, + "acc_norm_stderr": 0.02424378399406216 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.337037037037037, + "acc_stderr": 0.02882088466625326, + "acc_norm": 0.337037037037037, + "acc_norm_stderr": 0.02882088466625326 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.6638655462184874, + "acc_stderr": 0.030684737115135363, + "acc_norm": 0.6638655462184874, + "acc_norm_stderr": 0.030684737115135363 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.3443708609271523, + "acc_stderr": 0.038796870240733264, + "acc_norm": 0.3443708609271523, + "acc_norm_stderr": 0.038796870240733264 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.8275229357798165, + "acc_stderr": 0.016197807956848054, + "acc_norm": 0.8275229357798165, + "acc_norm_stderr": 
0.016197807956848054 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.5, + "acc_stderr": 0.034099716973523674, + "acc_norm": 0.5, + "acc_norm_stderr": 0.034099716973523674 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.7990196078431373, + "acc_stderr": 0.028125972265654373, + "acc_norm": 0.7990196078431373, + "acc_norm_stderr": 0.028125972265654373 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.8016877637130801, + "acc_stderr": 0.02595502084162113, + "acc_norm": 0.8016877637130801, + "acc_norm_stderr": 0.02595502084162113 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.6905829596412556, + "acc_stderr": 0.031024411740572213, + "acc_norm": 0.6905829596412556, + "acc_norm_stderr": 0.031024411740572213 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.7786259541984732, + "acc_stderr": 0.0364129708131373, + "acc_norm": 0.7786259541984732, + "acc_norm_stderr": 0.0364129708131373 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.8099173553719008, + "acc_stderr": 0.03581796951709282, + "acc_norm": 0.8099173553719008, + "acc_norm_stderr": 0.03581796951709282 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.7870370370370371, + "acc_stderr": 0.0395783547198098, + "acc_norm": 0.7870370370370371, + "acc_norm_stderr": 0.0395783547198098 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.7484662576687117, + "acc_stderr": 0.034089978868575295, + "acc_norm": 0.7484662576687117, + "acc_norm_stderr": 0.034089978868575295 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.4642857142857143, + "acc_stderr": 0.04733667890053756, + "acc_norm": 0.4642857142857143, + "acc_norm_stderr": 0.04733667890053756 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.8058252427184466, + "acc_stderr": 0.039166677628225836, + "acc_norm": 0.8058252427184466, + "acc_norm_stderr": 0.039166677628225836 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.8675213675213675, + "acc_stderr": 0.022209309073165612, + "acc_norm": 0.8675213675213675, + "acc_norm_stderr": 0.022209309073165612 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.75, + "acc_stderr": 0.04351941398892446, + "acc_norm": 0.75, + "acc_norm_stderr": 0.04351941398892446 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.8237547892720306, + "acc_stderr": 0.013625556907993445, + "acc_norm": 0.8237547892720306, + "acc_norm_stderr": 0.013625556907993445 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.708092485549133, + "acc_stderr": 0.024476994076247326, + "acc_norm": 0.708092485549133, + "acc_norm_stderr": 0.024476994076247326 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.3687150837988827, + "acc_stderr": 0.01613575901503012, + "acc_norm": 0.3687150837988827, + "acc_norm_stderr": 0.01613575901503012 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.6862745098039216, + "acc_stderr": 0.02656892101545714, + "acc_norm": 0.6862745098039216, + "acc_norm_stderr": 0.02656892101545714 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.6945337620578779, + "acc_stderr": 0.02616058445014045, + "acc_norm": 0.6945337620578779, + "acc_norm_stderr": 0.02616058445014045 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.7222222222222222, + "acc_stderr": 0.02492200116888633, + "acc_norm": 0.7222222222222222, + "acc_norm_stderr": 0.02492200116888633 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.4645390070921986, + "acc_stderr": 
0.029752389657427047, + "acc_norm": 0.4645390070921986, + "acc_norm_stderr": 0.029752389657427047 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.46740547588005216, + "acc_stderr": 0.01274307294265336, + "acc_norm": 0.46740547588005216, + "acc_norm_stderr": 0.01274307294265336 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.6617647058823529, + "acc_stderr": 0.028739328513983572, + "acc_norm": 0.6617647058823529, + "acc_norm_stderr": 0.028739328513983572 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.6486928104575164, + "acc_stderr": 0.01931267606578656, + "acc_norm": 0.6486928104575164, + "acc_norm_stderr": 0.01931267606578656 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.6727272727272727, + "acc_stderr": 0.04494290866252089, + "acc_norm": 0.6727272727272727, + "acc_norm_stderr": 0.04494290866252089 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.7306122448979592, + "acc_stderr": 0.02840125202902294, + "acc_norm": 0.7306122448979592, + "acc_norm_stderr": 0.02840125202902294 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.835820895522388, + "acc_stderr": 0.026193923544454132, + "acc_norm": 0.835820895522388, + "acc_norm_stderr": 0.026193923544454132 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.87, + "acc_stderr": 0.033799766898963086, + "acc_norm": 0.87, + "acc_norm_stderr": 0.033799766898963086 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.5421686746987951, + "acc_stderr": 0.0387862677100236, + "acc_norm": 0.5421686746987951, + "acc_norm_stderr": 0.0387862677100236 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.8362573099415205, + "acc_stderr": 0.028380919596145866, + "acc_norm": 0.8362573099415205, + "acc_norm_stderr": 0.028380919596145866 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.3525091799265606, + "mc1_stderr": 0.016724646380756547, + "mc2": 0.5252044347181257, + "mc2_stderr": 0.015164118244947575 + }, + "harness|winogrande|5": { + "acc": 0.7837411207576953, + "acc_stderr": 0.01157061486140935 + }, + "harness|drop|3": { + "em": 0.0032508389261744967, + "em_stderr": 0.0005829486708558965, + "f1": 0.08664324664429508, + "f1_stderr": 0.0017394064480495393 + }, + "harness|gsm8k|5": { + "acc": 0.2137983320697498, + "acc_stderr": 0.011293054698635055 + }, + "all": { + "acc": 0.6324163252907573, + "acc_stderr": 0.032217952701907616, + "acc_norm": 0.6408444136360775, + "acc_norm_stderr": 0.03289870821499382, + "mc1": 0.3525091799265606, + "mc1_stderr": 0.016724646380756547, + "mc2": 0.5252044347181257, + "mc2_stderr": 0.015164118244947575, + "em": 0.0032508389261744967, + "em_stderr": 0.0005829486708558965, + "f1": 0.08664324664429508, + "f1_stderr": 0.0017394064480495393 + } + }, + "versions": { + "all": 0, + "harness|arc:challenge|25": 0, + "harness|drop|3": 1, + "harness|gsm8k|5": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + 
"harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "harness|winogrande|5": 0 + }, + "config_tasks": { + "harness|arc:challenge": "LM Harness task", + "harness|drop": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + 
"harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|arc:challenge|25": { + "hashes": { + "hash_examples": "17b0cae357c0259e", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "9bcd0d1d37471713", + "hash_cont_tokens": "289aa98c400841d8" + }, + "truncated": 0, + "non_truncated": 1172, + "padded": 4670, + "non_padded": 17, + "effective_few_shots": 25.0, + "num_truncated_few_shots": 0 + }, + "harness|hellaswag|10": { + "hashes": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "80b8c6d79740318e", + "hash_cont_tokens": "ac460260c3e6efc9" + }, + "truncated": 0, + "non_truncated": 10042, + "padded": 40101, + "non_padded": 67, + "effective_few_shots": 10.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hashes": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": 
"2f776a367d23aea2", + "hash_input_tokens": "b813d36287c6556c", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-anatomy|5": { + "hashes": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "09dc2380497f7a47", + "hash_cont_tokens": "a52a4f60d98cbe5c" + }, + "truncated": 0, + "non_truncated": 135, + "padded": 540, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-astronomy|5": { + "hashes": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "68ca3220b0fdd1f3", + "hash_cont_tokens": "10f7d8eeba97841d" + }, + "truncated": 0, + "non_truncated": 152, + "padded": 608, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-business_ethics|5": { + "hashes": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "bd14ef1320de241e", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hashes": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "d96186ab98017c43", + "hash_cont_tokens": "edef9975ba9165b5" + }, + "truncated": 0, + "non_truncated": 265, + "padded": 1060, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_biology|5": { + "hashes": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "424136b34e95b200", + "hash_cont_tokens": "0aa103ec6602280b" + }, + "truncated": 0, + "non_truncated": 144, + "padded": 576, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_chemistry|5": { + "hashes": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "8dd8b80e336bbe54", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_computer_science|5": { + "hashes": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "145d4cef8ca2261d", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_mathematics|5": { + "hashes": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "561995d32d2b25c4", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_medicine|5": { + "hashes": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "6a258a9d4418599c", + "hash_cont_tokens": "1979021dbc698754" + }, + "truncated": 0, + "non_truncated": 173, + "padded": 
692, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_physics|5": { + "hashes": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "fa5e0d5b5f97b66a", + "hash_cont_tokens": "7cf7fe2bab00acbd" + }, + "truncated": 0, + "non_truncated": 102, + "padded": 408, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-computer_security|5": { + "hashes": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "07d27397edfae492", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hashes": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "da5e6c3c8eb17da6", + "hash_cont_tokens": "903f64eed2b0d217" + }, + "truncated": 0, + "non_truncated": 235, + "padded": 940, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-econometrics|5": { + "hashes": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "f6ba8e358bdb523e", + "hash_cont_tokens": "721ae6c5302c4bf2" + }, + "truncated": 0, + "non_truncated": 114, + "padded": 456, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hashes": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "b2459da4c5ca8590", + "hash_cont_tokens": "15a738960ed3e587" + }, + "truncated": 0, + "non_truncated": 145, + "padded": 575, + "non_padded": 5, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hashes": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "0b969d9ad706a13a", + "hash_cont_tokens": "c96470462fc71683" + }, + "truncated": 0, + "non_truncated": 378, + "padded": 1512, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-formal_logic|5": { + "hashes": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "02bc3eb5f90da86e", + "hash_cont_tokens": "0e1ce025c9d6ee7e" + }, + "truncated": 0, + "non_truncated": 126, + "padded": 504, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-global_facts|5": { + "hashes": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "3d5106918bcbeb43", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_biology|5": { + "hashes": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "7b089392db2dabbd", + "hash_cont_tokens": "e34d57f7d3c4ca16" + }, + "truncated": 0, + "non_truncated": 310, + "padded": 1240, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hashes": 
{ + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "ba90b2ffed1c067d", + "hash_cont_tokens": "e8482d44df4b3740" + }, + "truncated": 0, + "non_truncated": 203, + "padded": 812, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hashes": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "60eeec309ef0717f", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hashes": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "5e5e8bf3808e0ead", + "hash_cont_tokens": "d63e679a49418339" + }, + "truncated": 0, + "non_truncated": 165, + "padded": 656, + "non_padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_geography|5": { + "hashes": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "4da9b741d4e7ea78", + "hash_cont_tokens": "d78483e286d06f1a" + }, + "truncated": 0, + "non_truncated": 198, + "padded": 792, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hashes": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "acb4bc872ac86ed7", + "hash_cont_tokens": "691cdff71ff5fe57" + }, + "truncated": 0, + "non_truncated": 193, + "padded": 772, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hashes": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "840fc6403eb69ab0", + "hash_cont_tokens": "d5ad4c5bdca967ad" + }, + "truncated": 0, + "non_truncated": 390, + "padded": 1560, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hashes": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "3629a7f2cd17faeb", + "hash_cont_tokens": "8f631ca5687dd0d4" + }, + "truncated": 0, + "non_truncated": 270, + "padded": 1080, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hashes": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "6846f684260e3997", + "hash_cont_tokens": "7321048a28451473" + }, + "truncated": 0, + "non_truncated": 238, + "padded": 952, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_physics|5": { + "hashes": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "85aee25d6bdad94a", + "hash_cont_tokens": "bb137581f269861c" + }, + "truncated": 0, + "non_truncated": 151, + "padded": 604, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hashes": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": 
"d5c76aa40b9dbc43", + "hash_input_tokens": "290b66d6d666a35f", + "hash_cont_tokens": "b455cab2675bd863" + }, + "truncated": 0, + "non_truncated": 545, + "padded": 2180, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hashes": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "a77a7668b437bc82", + "hash_cont_tokens": "1b3196fec7e58037" + }, + "truncated": 0, + "non_truncated": 216, + "padded": 864, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hashes": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "63548c7fa9ba7a78", + "hash_cont_tokens": "a331dedc2aa01b3e" + }, + "truncated": 0, + "non_truncated": 204, + "padded": 816, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hashes": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "83c5da18bfa50812", + "hash_cont_tokens": "d0fbe030b8c8c2bf" + }, + "truncated": 0, + "non_truncated": 237, + "padded": 948, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_aging|5": { + "hashes": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "bebbd11f22006685", + "hash_cont_tokens": "1dd29c3755494850" + }, + "truncated": 0, + "non_truncated": 223, + "padded": 892, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_sexuality|5": { + "hashes": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "7b85ee9b8ee54f4f", + "hash_cont_tokens": "c85573f663c10691" + }, + "truncated": 0, + "non_truncated": 131, + "padded": 524, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-international_law|5": { + "hashes": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "7bfc55ab7065943e", + "hash_cont_tokens": "d263804ba918154f" + }, + "truncated": 0, + "non_truncated": 121, + "padded": 484, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-jurisprudence|5": { + "hashes": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "69573f1675e053c6", + "hash_cont_tokens": "581986691a84ece8" + }, + "truncated": 0, + "non_truncated": 108, + "padded": 432, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hashes": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "552324ef20094bdc", + "hash_cont_tokens": "55a858b28bbda458" + }, + "truncated": 0, + "non_truncated": 163, + "padded": 652, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-machine_learning|5": { + "hashes": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "96449357a7318905", + "hash_cont_tokens": "e99d3d3efd4ac7a3" + }, + "truncated": 0, + "non_truncated": 
112, + "padded": 448, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-management|5": { + "hashes": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "3b849249168e3b88", + "hash_cont_tokens": "13d9dc56bca34726" + }, + "truncated": 0, + "non_truncated": 103, + "padded": 412, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-marketing|5": { + "hashes": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "af0e186f2756b70d", + "hash_cont_tokens": "2700ea26933916a2" + }, + "truncated": 0, + "non_truncated": 234, + "padded": 936, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-medical_genetics|5": { + "hashes": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "9f6a6de16509b6d9", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-miscellaneous|5": { + "hashes": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "9194406d589f7c10", + "hash_cont_tokens": "7bf4341c79587250" + }, + "truncated": 0, + "non_truncated": 783, + "padded": 3132, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_disputes|5": { + "hashes": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "769486efc74d9f8e", + "hash_cont_tokens": "38a48e9de6976f00" + }, + "truncated": 0, + "non_truncated": 346, + "padded": 1384, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hashes": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "a90fd4dd90959dad", + "hash_cont_tokens": "761c4dc187689d89" + }, + "truncated": 0, + "non_truncated": 895, + "padded": 3580, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-nutrition|5": { + "hashes": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "1a3b843e66efd29b", + "hash_cont_tokens": "65005bd7d6f6012a" + }, + "truncated": 0, + "non_truncated": 306, + "padded": 1224, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-philosophy|5": { + "hashes": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "09820001a3d00013", + "hash_cont_tokens": "0b47934fb6314dec" + }, + "truncated": 0, + "non_truncated": 311, + "padded": 1244, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-prehistory|5": { + "hashes": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "7c4ec364ce2768c7", + "hash_cont_tokens": "3f20acd855ee0a29" + }, + "truncated": 0, + "non_truncated": 324, + "padded": 1296, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_accounting|5": { + "hashes": { + 
"hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "ced0534574d0ae3f", + "hash_cont_tokens": "8f122ba881355d4b" + }, + "truncated": 0, + "non_truncated": 282, + "padded": 1128, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_law|5": { + "hashes": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "bcbdbbde22ec73e3", + "hash_cont_tokens": "90d5df417c4d3fd3" + }, + "truncated": 0, + "non_truncated": 1534, + "padded": 6136, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_medicine|5": { + "hashes": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "c54d753563114d45", + "hash_cont_tokens": "4a2d2988884f7f70" + }, + "truncated": 0, + "non_truncated": 272, + "padded": 1088, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_psychology|5": { + "hashes": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "b75dc55c0e32fa52", + "hash_cont_tokens": "e0a952cb8a9c81de" + }, + "truncated": 0, + "non_truncated": 612, + "padded": 2448, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-public_relations|5": { + "hashes": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "5ccdc8ec8db99622", + "hash_cont_tokens": "1fa77a8dff3922b8" + }, + "truncated": 0, + "non_truncated": 110, + "padded": 440, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-security_studies|5": { + "hashes": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "ca8497342e5b1d57", + "hash_cont_tokens": "81fc9cb3cbdd52db" + }, + "truncated": 0, + "non_truncated": 245, + "padded": 980, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-sociology|5": { + "hashes": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "069c76424fbd3dab", + "hash_cont_tokens": "2a0493252ed2cf43" + }, + "truncated": 0, + "non_truncated": 201, + "padded": 804, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hashes": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "a7e393a626169576", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-virology|5": { + "hashes": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "bf99dc973e3a650d", + "hash_cont_tokens": "5ab892d003b00c98" + }, + "truncated": 0, + "non_truncated": 166, + "padded": 664, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-world_religions|5": { + "hashes": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "1761cfaf21797065", + "hash_cont_tokens": 
"15a5e5dbdfbb8568" + }, + "truncated": 0, + "non_truncated": 171, + "padded": 684, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|truthfulqa:mc|0": { + "hashes": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "298b43914bbdf4ca", + "hash_cont_tokens": "5a8d4bb398b1c3c0" + }, + "truncated": 0, + "non_truncated": 817, + "padded": 9996, + "non_padded": 0, + "effective_few_shots": 0.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "31aa3477d959f771", + "hash_cont_tokens": "618558fb93c0f288" + }, + "truncated": 0, + "non_truncated": 1267, + "padded": 2534, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|drop|3": { + "hashes": { + "hash_examples": "1d27416e8324e9a3", + "hash_full_prompts": "a5513ff9a741b385", + "hash_input_tokens": "a4fb946366902edf", + "hash_cont_tokens": "139ed647e6ff3014" + }, + "truncated": 0, + "non_truncated": 9536, + "padded": 0, + "non_padded": 9536, + "effective_few_shots": 3.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "6af0ae8cfe684f50", + "hash_cont_tokens": "cfe2ceac11f76f44" + }, + "truncated": 0, + "non_truncated": 1319, + "padded": 0, + "non_padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "4eb459f19fc0f29d", + "hash_full_prompts": "21653ed56f202b4e", + "hash_input_tokens": "0ce409b3d436569d", + "hash_cont_tokens": "67001694952329e6" + }, + "truncated": 0, + "non_truncated": 38195, + "padded": 113460, + "non_padded": 10948, + "num_truncated_few_shots": 0, + "total_evaluation_time_secondes": 0 + } +} \ No newline at end of file diff --git a/eval-results/uukuguy/speechless-mistral-six-in-one-7b-orth-1.0/results_2023-12-13T11-13-22.485134.json b/eval-results/uukuguy/speechless-mistral-six-in-one-7b-orth-1.0/results_2023-12-13T11-13-22.485134.json new file mode 100644 index 0000000000000000000000000000000000000000..68ccd27fd2c36b4c64c698e3137154b8d5b1f900 --- /dev/null +++ b/eval-results/uukuguy/speechless-mistral-six-in-one-7b-orth-1.0/results_2023-12-13T11-13-22.485134.json @@ -0,0 +1,1409 @@ +{ + "config_general": { + "lighteval_sha": "0e4607eff593f6f842aeaa0e5fa6760f58b9d1e9", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "", + "start_time": 89050.007819371, + "end_time": 96959.54893161, + "total_evaluation_time_secondes": "7909.541112238992", + "model_name": "uukuguy/speechless-mistral-six-in-one-7b-orth-1.0", + "model_sha": "e500285ba420cb3865d72aa0cc3b1fb9cc0bfee8", + "model_dtype": "torch.bfloat16", + "model_size": "13.99 GB" + }, + "results": { + "harness|arc:challenge|25": { + "acc": 0.22696245733788395, + "acc_stderr": 0.012240491536132861, + "acc_norm": 0.22696245733788395, + "acc_norm_stderr": 0.012240491536132861 + }, + "harness|hellaswag|10": { + "acc": 0.2504481179047998, + "acc_stderr": 0.004323856300539177, + "acc_norm": 0.2504481179047998, + "acc_norm_stderr": 0.004323856300539177 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.22, + "acc_stderr": 0.04163331998932268, + "acc_norm": 0.22, + "acc_norm_stderr": 0.04163331998932268 + }, + 
"harness|hendrycksTest-anatomy|5": { + "acc": 0.18518518518518517, + "acc_stderr": 0.03355677216313142, + "acc_norm": 0.18518518518518517, + "acc_norm_stderr": 0.03355677216313142 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.17763157894736842, + "acc_stderr": 0.031103182383123398, + "acc_norm": 0.17763157894736842, + "acc_norm_stderr": 0.031103182383123398 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.3, + "acc_stderr": 0.046056618647183814, + "acc_norm": 0.3, + "acc_norm_stderr": 0.046056618647183814 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.21509433962264152, + "acc_stderr": 0.02528839450289137, + "acc_norm": 0.21509433962264152, + "acc_norm_stderr": 0.02528839450289137 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.2569444444444444, + "acc_stderr": 0.03653946969442099, + "acc_norm": 0.2569444444444444, + "acc_norm_stderr": 0.03653946969442099 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.2, + "acc_stderr": 0.04020151261036845, + "acc_norm": 0.2, + "acc_norm_stderr": 0.04020151261036845 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.26, + "acc_stderr": 0.0440844002276808, + "acc_norm": 0.26, + "acc_norm_stderr": 0.0440844002276808 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.21, + "acc_stderr": 0.040936018074033256, + "acc_norm": 0.21, + "acc_norm_stderr": 0.040936018074033256 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.20809248554913296, + "acc_stderr": 0.030952890217749874, + "acc_norm": 0.20809248554913296, + "acc_norm_stderr": 0.030952890217749874 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.21568627450980393, + "acc_stderr": 0.04092563958237654, + "acc_norm": 0.21568627450980393, + "acc_norm_stderr": 0.04092563958237654 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.28, + "acc_stderr": 0.045126085985421276, + "acc_norm": 0.28, + "acc_norm_stderr": 0.045126085985421276 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.26382978723404255, + "acc_stderr": 0.028809989854102973, + "acc_norm": 0.26382978723404255, + "acc_norm_stderr": 0.028809989854102973 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.23684210526315788, + "acc_stderr": 0.039994238792813365, + "acc_norm": 0.23684210526315788, + "acc_norm_stderr": 0.039994238792813365 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.2413793103448276, + "acc_stderr": 0.03565998174135302, + "acc_norm": 0.2413793103448276, + "acc_norm_stderr": 0.03565998174135302 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.20899470899470898, + "acc_stderr": 0.02094048156533486, + "acc_norm": 0.20899470899470898, + "acc_norm_stderr": 0.02094048156533486 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.2857142857142857, + "acc_stderr": 0.04040610178208841, + "acc_norm": 0.2857142857142857, + "acc_norm_stderr": 0.04040610178208841 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.18, + "acc_stderr": 0.038612291966536934, + "acc_norm": 0.18, + "acc_norm_stderr": 0.038612291966536934 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.1774193548387097, + "acc_stderr": 0.02173254068932927, + "acc_norm": 0.1774193548387097, + "acc_norm_stderr": 0.02173254068932927 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.15270935960591134, + "acc_stderr": 0.02530890453938063, + "acc_norm": 0.15270935960591134, + "acc_norm_stderr": 0.02530890453938063 + }, + 
"harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.25, + "acc_stderr": 0.04351941398892446, + "acc_norm": 0.25, + "acc_norm_stderr": 0.04351941398892446 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.21818181818181817, + "acc_stderr": 0.03225078108306289, + "acc_norm": 0.21818181818181817, + "acc_norm_stderr": 0.03225078108306289 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.17676767676767677, + "acc_stderr": 0.027178752639044915, + "acc_norm": 0.17676767676767677, + "acc_norm_stderr": 0.027178752639044915 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.19689119170984457, + "acc_stderr": 0.028697873971860664, + "acc_norm": 0.19689119170984457, + "acc_norm_stderr": 0.028697873971860664 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.20256410256410257, + "acc_stderr": 0.020377660970371372, + "acc_norm": 0.20256410256410257, + "acc_norm_stderr": 0.020377660970371372 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.2111111111111111, + "acc_stderr": 0.024882116857655075, + "acc_norm": 0.2111111111111111, + "acc_norm_stderr": 0.024882116857655075 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.21008403361344538, + "acc_stderr": 0.026461398717471874, + "acc_norm": 0.21008403361344538, + "acc_norm_stderr": 0.026461398717471874 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.1986754966887417, + "acc_stderr": 0.03257847384436776, + "acc_norm": 0.1986754966887417, + "acc_norm_stderr": 0.03257847384436776 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.1926605504587156, + "acc_stderr": 0.016909276884936094, + "acc_norm": 0.1926605504587156, + "acc_norm_stderr": 0.016909276884936094 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.1527777777777778, + "acc_stderr": 0.024536326026134224, + "acc_norm": 0.1527777777777778, + "acc_norm_stderr": 0.024536326026134224 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.25, + "acc_stderr": 0.03039153369274154, + "acc_norm": 0.25, + "acc_norm_stderr": 0.03039153369274154 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.270042194092827, + "acc_stderr": 0.028900721906293426, + "acc_norm": 0.270042194092827, + "acc_norm_stderr": 0.028900721906293426 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.31390134529147984, + "acc_stderr": 0.031146796482972465, + "acc_norm": 0.31390134529147984, + "acc_norm_stderr": 0.031146796482972465 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.2595419847328244, + "acc_stderr": 0.03844876139785271, + "acc_norm": 0.2595419847328244, + "acc_norm_stderr": 0.03844876139785271 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.2396694214876033, + "acc_stderr": 0.03896878985070417, + "acc_norm": 0.2396694214876033, + "acc_norm_stderr": 0.03896878985070417 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.25925925925925924, + "acc_stderr": 0.042365112580946336, + "acc_norm": 0.25925925925925924, + "acc_norm_stderr": 0.042365112580946336 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.22085889570552147, + "acc_stderr": 0.032591773927421776, + "acc_norm": 0.22085889570552147, + "acc_norm_stderr": 0.032591773927421776 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.3125, + "acc_stderr": 0.043994650575715215, + "acc_norm": 0.3125, + "acc_norm_stderr": 0.043994650575715215 + }, + 
"harness|hendrycksTest-management|5": { + "acc": 0.17475728155339806, + "acc_stderr": 0.037601780060266224, + "acc_norm": 0.17475728155339806, + "acc_norm_stderr": 0.037601780060266224 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.2905982905982906, + "acc_stderr": 0.02974504857267404, + "acc_norm": 0.2905982905982906, + "acc_norm_stderr": 0.02974504857267404 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.3, + "acc_stderr": 0.046056618647183814, + "acc_norm": 0.3, + "acc_norm_stderr": 0.046056618647183814 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.23754789272030652, + "acc_stderr": 0.015218733046150193, + "acc_norm": 0.23754789272030652, + "acc_norm_stderr": 0.015218733046150193 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.24855491329479767, + "acc_stderr": 0.023267528432100174, + "acc_norm": 0.24855491329479767, + "acc_norm_stderr": 0.023267528432100174 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.23798882681564246, + "acc_stderr": 0.014242630070574915, + "acc_norm": 0.23798882681564246, + "acc_norm_stderr": 0.014242630070574915 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.22549019607843138, + "acc_stderr": 0.023929155517351284, + "acc_norm": 0.22549019607843138, + "acc_norm_stderr": 0.023929155517351284 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.1864951768488746, + "acc_stderr": 0.02212243977248077, + "acc_norm": 0.1864951768488746, + "acc_norm_stderr": 0.02212243977248077 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.21604938271604937, + "acc_stderr": 0.022899162918445806, + "acc_norm": 0.21604938271604937, + "acc_norm_stderr": 0.022899162918445806 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.23404255319148937, + "acc_stderr": 0.025257861359432417, + "acc_norm": 0.23404255319148937, + "acc_norm_stderr": 0.025257861359432417 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.2457627118644068, + "acc_stderr": 0.010996156635142692, + "acc_norm": 0.2457627118644068, + "acc_norm_stderr": 0.010996156635142692 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.18382352941176472, + "acc_stderr": 0.023529242185193106, + "acc_norm": 0.18382352941176472, + "acc_norm_stderr": 0.023529242185193106 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.25, + "acc_stderr": 0.01751781884501444, + "acc_norm": 0.25, + "acc_norm_stderr": 0.01751781884501444 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.21818181818181817, + "acc_stderr": 0.03955932861795833, + "acc_norm": 0.21818181818181817, + "acc_norm_stderr": 0.03955932861795833 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.18775510204081633, + "acc_stderr": 0.02500025603954621, + "acc_norm": 0.18775510204081633, + "acc_norm_stderr": 0.02500025603954621 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.24378109452736318, + "acc_stderr": 0.03036049015401465, + "acc_norm": 0.24378109452736318, + "acc_norm_stderr": 0.03036049015401465 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.28, + "acc_stderr": 0.04512608598542128, + "acc_norm": 0.28, + "acc_norm_stderr": 0.04512608598542128 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.28313253012048195, + "acc_stderr": 0.03507295431370518, + "acc_norm": 0.28313253012048195, + "acc_norm_stderr": 0.03507295431370518 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.3216374269005848, + "acc_stderr": 0.03582529442573122, + "acc_norm": 
0.3216374269005848, + "acc_norm_stderr": 0.03582529442573122 + }, + "harness|truthfulqa:mc|0": { + "mc1": 1.0, + "mc1_stderr": 0.0, + "mc2": NaN, + "mc2_stderr": NaN + }, + "harness|winogrande|5": { + "acc": 0.4956590370955012, + "acc_stderr": 0.014051956064076911 + }, + "harness|gsm8k|5": { + "acc": 0.0, + "acc_stderr": 0.0 + }, + "all": { + "acc": 0.23196194129343728, + "acc_stderr": 0.029934654752561563, + "acc_norm": 0.2314240573187148, + "acc_norm_stderr": 0.03071122006512167, + "mc1": 1.0, + "mc1_stderr": 0.0, + "mc2": NaN, + "mc2_stderr": NaN + } + }, + "versions": { + "all": 0, + "harness|arc:challenge|25": 0, + "harness|gsm8k|5": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + 
"harness|truthfulqa:mc|0": 1, + "harness|winogrande|5": 0 + }, + "config_tasks": { + "harness|arc:challenge": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM 
Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|arc:challenge|25": { + "hashes": { + "hash_examples": "17b0cae357c0259e", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "9bcd0d1d37471713", + "hash_cont_tokens": "289aa98c400841d8" + }, + "truncated": 0, + "non_truncated": 1172, + "padded": 4670, + "non_padded": 17, + "effective_few_shots": 25.0, + "num_truncated_few_shots": 0 + }, + "harness|hellaswag|10": { + "hashes": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "80b8c6d79740318e", + "hash_cont_tokens": "ac460260c3e6efc9" + }, + "truncated": 0, + "non_truncated": 10042, + "padded": 40101, + "non_padded": 67, + "effective_few_shots": 10.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hashes": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "b813d36287c6556c", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-anatomy|5": { + "hashes": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "09dc2380497f7a47", + "hash_cont_tokens": "a52a4f60d98cbe5c" + }, + "truncated": 0, + "non_truncated": 135, + "padded": 540, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-astronomy|5": { + "hashes": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "68ca3220b0fdd1f3", + "hash_cont_tokens": "10f7d8eeba97841d" + }, + "truncated": 0, + "non_truncated": 152, + "padded": 608, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-business_ethics|5": { + "hashes": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "bd14ef1320de241e", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hashes": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "d96186ab98017c43", + "hash_cont_tokens": "edef9975ba9165b5" + }, + "truncated": 0, + "non_truncated": 265, + "padded": 1060, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_biology|5": { + "hashes": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "424136b34e95b200", + "hash_cont_tokens": "0aa103ec6602280b" + }, + "truncated": 0, + "non_truncated": 144, + "padded": 576, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_chemistry|5": { + "hashes": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + 
"hash_input_tokens": "8dd8b80e336bbe54", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_computer_science|5": { + "hashes": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "145d4cef8ca2261d", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_mathematics|5": { + "hashes": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "561995d32d2b25c4", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_medicine|5": { + "hashes": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "6a258a9d4418599c", + "hash_cont_tokens": "1979021dbc698754" + }, + "truncated": 0, + "non_truncated": 173, + "padded": 692, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_physics|5": { + "hashes": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "fa5e0d5b5f97b66a", + "hash_cont_tokens": "7cf7fe2bab00acbd" + }, + "truncated": 0, + "non_truncated": 102, + "padded": 408, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-computer_security|5": { + "hashes": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "07d27397edfae492", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hashes": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "da5e6c3c8eb17da6", + "hash_cont_tokens": "903f64eed2b0d217" + }, + "truncated": 0, + "non_truncated": 235, + "padded": 940, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-econometrics|5": { + "hashes": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "f6ba8e358bdb523e", + "hash_cont_tokens": "721ae6c5302c4bf2" + }, + "truncated": 0, + "non_truncated": 114, + "padded": 456, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hashes": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "b2459da4c5ca8590", + "hash_cont_tokens": "15a738960ed3e587" + }, + "truncated": 0, + "non_truncated": 145, + "padded": 575, + "non_padded": 5, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hashes": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "0b969d9ad706a13a", + "hash_cont_tokens": "c96470462fc71683" + }, + "truncated": 0, + "non_truncated": 378, + "padded": 
1512, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-formal_logic|5": { + "hashes": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "02bc3eb5f90da86e", + "hash_cont_tokens": "0e1ce025c9d6ee7e" + }, + "truncated": 0, + "non_truncated": 126, + "padded": 504, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-global_facts|5": { + "hashes": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "3d5106918bcbeb43", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_biology|5": { + "hashes": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "7b089392db2dabbd", + "hash_cont_tokens": "e34d57f7d3c4ca16" + }, + "truncated": 0, + "non_truncated": 310, + "padded": 1240, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hashes": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "ba90b2ffed1c067d", + "hash_cont_tokens": "e8482d44df4b3740" + }, + "truncated": 0, + "non_truncated": 203, + "padded": 812, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hashes": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "60eeec309ef0717f", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hashes": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "5e5e8bf3808e0ead", + "hash_cont_tokens": "d63e679a49418339" + }, + "truncated": 0, + "non_truncated": 165, + "padded": 656, + "non_padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_geography|5": { + "hashes": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "4da9b741d4e7ea78", + "hash_cont_tokens": "d78483e286d06f1a" + }, + "truncated": 0, + "non_truncated": 198, + "padded": 792, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hashes": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "acb4bc872ac86ed7", + "hash_cont_tokens": "691cdff71ff5fe57" + }, + "truncated": 0, + "non_truncated": 193, + "padded": 772, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hashes": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "840fc6403eb69ab0", + "hash_cont_tokens": "d5ad4c5bdca967ad" + }, + "truncated": 0, + "non_truncated": 390, + "padded": 1560, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + 
"harness|hendrycksTest-high_school_mathematics|5": { + "hashes": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "3629a7f2cd17faeb", + "hash_cont_tokens": "8f631ca5687dd0d4" + }, + "truncated": 0, + "non_truncated": 270, + "padded": 1080, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hashes": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "6846f684260e3997", + "hash_cont_tokens": "7321048a28451473" + }, + "truncated": 0, + "non_truncated": 238, + "padded": 952, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_physics|5": { + "hashes": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "85aee25d6bdad94a", + "hash_cont_tokens": "bb137581f269861c" + }, + "truncated": 0, + "non_truncated": 151, + "padded": 604, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hashes": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "290b66d6d666a35f", + "hash_cont_tokens": "b455cab2675bd863" + }, + "truncated": 0, + "non_truncated": 545, + "padded": 2180, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hashes": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "a77a7668b437bc82", + "hash_cont_tokens": "1b3196fec7e58037" + }, + "truncated": 0, + "non_truncated": 216, + "padded": 864, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hashes": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "63548c7fa9ba7a78", + "hash_cont_tokens": "a331dedc2aa01b3e" + }, + "truncated": 0, + "non_truncated": 204, + "padded": 816, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hashes": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "83c5da18bfa50812", + "hash_cont_tokens": "d0fbe030b8c8c2bf" + }, + "truncated": 0, + "non_truncated": 237, + "padded": 948, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_aging|5": { + "hashes": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "bebbd11f22006685", + "hash_cont_tokens": "1dd29c3755494850" + }, + "truncated": 0, + "non_truncated": 223, + "padded": 892, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_sexuality|5": { + "hashes": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "7b85ee9b8ee54f4f", + "hash_cont_tokens": "c85573f663c10691" + }, + "truncated": 0, + "non_truncated": 131, + "padded": 524, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-international_law|5": { + "hashes": { + "hash_examples": "1300bfd0dfc59114", + 
"hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "7bfc55ab7065943e", + "hash_cont_tokens": "d263804ba918154f" + }, + "truncated": 0, + "non_truncated": 121, + "padded": 484, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-jurisprudence|5": { + "hashes": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "69573f1675e053c6", + "hash_cont_tokens": "581986691a84ece8" + }, + "truncated": 0, + "non_truncated": 108, + "padded": 432, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hashes": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "552324ef20094bdc", + "hash_cont_tokens": "55a858b28bbda458" + }, + "truncated": 0, + "non_truncated": 163, + "padded": 652, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-machine_learning|5": { + "hashes": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "96449357a7318905", + "hash_cont_tokens": "e99d3d3efd4ac7a3" + }, + "truncated": 0, + "non_truncated": 112, + "padded": 448, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-management|5": { + "hashes": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "3b849249168e3b88", + "hash_cont_tokens": "13d9dc56bca34726" + }, + "truncated": 0, + "non_truncated": 103, + "padded": 412, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-marketing|5": { + "hashes": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "af0e186f2756b70d", + "hash_cont_tokens": "2700ea26933916a2" + }, + "truncated": 0, + "non_truncated": 234, + "padded": 936, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-medical_genetics|5": { + "hashes": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "9f6a6de16509b6d9", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-miscellaneous|5": { + "hashes": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "9194406d589f7c10", + "hash_cont_tokens": "7bf4341c79587250" + }, + "truncated": 0, + "non_truncated": 783, + "padded": 3132, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_disputes|5": { + "hashes": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "769486efc74d9f8e", + "hash_cont_tokens": "38a48e9de6976f00" + }, + "truncated": 0, + "non_truncated": 346, + "padded": 1384, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hashes": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "a90fd4dd90959dad", + "hash_cont_tokens": "761c4dc187689d89" + }, + "truncated": 0, + "non_truncated": 895, + 
"padded": 3580, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-nutrition|5": { + "hashes": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "1a3b843e66efd29b", + "hash_cont_tokens": "65005bd7d6f6012a" + }, + "truncated": 0, + "non_truncated": 306, + "padded": 1224, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-philosophy|5": { + "hashes": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "09820001a3d00013", + "hash_cont_tokens": "0b47934fb6314dec" + }, + "truncated": 0, + "non_truncated": 311, + "padded": 1244, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-prehistory|5": { + "hashes": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "7c4ec364ce2768c7", + "hash_cont_tokens": "3f20acd855ee0a29" + }, + "truncated": 0, + "non_truncated": 324, + "padded": 1296, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_accounting|5": { + "hashes": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "ced0534574d0ae3f", + "hash_cont_tokens": "8f122ba881355d4b" + }, + "truncated": 0, + "non_truncated": 282, + "padded": 1128, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_law|5": { + "hashes": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "bcbdbbde22ec73e3", + "hash_cont_tokens": "90d5df417c4d3fd3" + }, + "truncated": 0, + "non_truncated": 1534, + "padded": 6136, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_medicine|5": { + "hashes": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "c54d753563114d45", + "hash_cont_tokens": "4a2d2988884f7f70" + }, + "truncated": 0, + "non_truncated": 272, + "padded": 1088, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_psychology|5": { + "hashes": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "b75dc55c0e32fa52", + "hash_cont_tokens": "e0a952cb8a9c81de" + }, + "truncated": 0, + "non_truncated": 612, + "padded": 2448, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-public_relations|5": { + "hashes": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "5ccdc8ec8db99622", + "hash_cont_tokens": "1fa77a8dff3922b8" + }, + "truncated": 0, + "non_truncated": 110, + "padded": 440, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-security_studies|5": { + "hashes": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "ca8497342e5b1d57", + "hash_cont_tokens": "81fc9cb3cbdd52db" + }, + "truncated": 0, + "non_truncated": 245, + "padded": 980, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-sociology|5": { + "hashes": 
{ + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "069c76424fbd3dab", + "hash_cont_tokens": "2a0493252ed2cf43" + }, + "truncated": 0, + "non_truncated": 201, + "padded": 804, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hashes": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "a7e393a626169576", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-virology|5": { + "hashes": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "bf99dc973e3a650d", + "hash_cont_tokens": "5ab892d003b00c98" + }, + "truncated": 0, + "non_truncated": 166, + "padded": 664, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-world_religions|5": { + "hashes": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "1761cfaf21797065", + "hash_cont_tokens": "15a5e5dbdfbb8568" + }, + "truncated": 0, + "non_truncated": 171, + "padded": 684, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|truthfulqa:mc|0": { + "hashes": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "298b43914bbdf4ca", + "hash_cont_tokens": "5a8d4bb398b1c3c0" + }, + "truncated": 0, + "non_truncated": 817, + "padded": 9996, + "non_padded": 0, + "effective_few_shots": 0.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "31aa3477d959f771", + "hash_cont_tokens": "618558fb93c0f288" + }, + "truncated": 0, + "non_truncated": 1267, + "padded": 2534, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "6af0ae8cfe684f50", + "hash_cont_tokens": "8401e6188d643544" + }, + "truncated": 0, + "non_truncated": 1319, + "padded": 0, + "non_padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "3b7fa57a057f9415", + "hash_full_prompts": "63615fc50fc9417c", + "hash_input_tokens": "9c04e828ae29cacc", + "hash_cont_tokens": "dd2174ba254fe7c3" + }, + "truncated": 0, + "non_truncated": 28659, + "padded": 113460, + "non_padded": 1412, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/uukuguy/speechless-mistral-six-in-one-7b/results_2023-11-12T18-14-50.698039.json b/eval-results/uukuguy/speechless-mistral-six-in-one-7b/results_2023-11-12T18-14-50.698039.json new file mode 100644 index 0000000000000000000000000000000000000000..adca95a7396b1a555a17531ede56870b1bdd350f --- /dev/null +++ b/eval-results/uukuguy/speechless-mistral-six-in-one-7b/results_2023-11-12T18-14-50.698039.json @@ -0,0 +1,1435 @@ +{ + "config_general": { + "lighteval_sha": "167773f1d5d1647c60dadc31c9e731ab7dbcbbad", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "", + "start_time": 
243961.037941617, + "end_time": null, + "model_name": "uukuguy/speechless-mistral-six-in-one-7b", + "model_sha": "41e912e0f79094a80687f88ca5555f84aa9d307f", + "model_dtype": "torch.float16", + "model_size": "13.99 GB" + }, + "results": { + "harness|arc:challenge|25": { + "acc": 0.5981228668941979, + "acc_stderr": 0.014327268614578276, + "acc_norm": 0.6296928327645052, + "acc_norm_stderr": 0.01411129875167495 + }, + "harness|hellaswag|10": { + "acc": 0.652459669388568, + "acc_stderr": 0.004752158936871872, + "acc_norm": 0.8460466042620992, + "acc_norm_stderr": 0.00360166483871892 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.26, + "acc_stderr": 0.0440844002276808, + "acc_norm": 0.26, + "acc_norm_stderr": 0.0440844002276808 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.6222222222222222, + "acc_stderr": 0.04188307537595853, + "acc_norm": 0.6222222222222222, + "acc_norm_stderr": 0.04188307537595853 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.6578947368421053, + "acc_stderr": 0.03860731599316092, + "acc_norm": 0.6578947368421053, + "acc_norm_stderr": 0.03860731599316092 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.58, + "acc_stderr": 0.049604496374885836, + "acc_norm": 0.58, + "acc_norm_stderr": 0.049604496374885836 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.6792452830188679, + "acc_stderr": 0.02872750295788027, + "acc_norm": 0.6792452830188679, + "acc_norm_stderr": 0.02872750295788027 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.7361111111111112, + "acc_stderr": 0.03685651095897532, + "acc_norm": 0.7361111111111112, + "acc_norm_stderr": 0.03685651095897532 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.49, + "acc_stderr": 0.05024183937956911, + "acc_norm": 0.49, + "acc_norm_stderr": 0.05024183937956911 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.53, + "acc_stderr": 0.050161355804659205, + "acc_norm": 0.53, + "acc_norm_stderr": 0.050161355804659205 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.38, + "acc_stderr": 0.04878317312145633, + "acc_norm": 0.38, + "acc_norm_stderr": 0.04878317312145633 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.6184971098265896, + "acc_stderr": 0.03703851193099521, + "acc_norm": 0.6184971098265896, + "acc_norm_stderr": 0.03703851193099521 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.3627450980392157, + "acc_stderr": 0.04784060704105654, + "acc_norm": 0.3627450980392157, + "acc_norm_stderr": 0.04784060704105654 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.78, + "acc_stderr": 0.04163331998932261, + "acc_norm": 0.78, + "acc_norm_stderr": 0.04163331998932261 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.5319148936170213, + "acc_stderr": 0.03261936918467382, + "acc_norm": 0.5319148936170213, + "acc_norm_stderr": 0.03261936918467382 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.45614035087719296, + "acc_stderr": 0.04685473041907789, + "acc_norm": 0.45614035087719296, + "acc_norm_stderr": 0.04685473041907789 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.6, + "acc_stderr": 0.040824829046386284, + "acc_norm": 0.6, + "acc_norm_stderr": 0.040824829046386284 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.4074074074074074, + "acc_stderr": 0.025305906241590626, + "acc_norm": 0.4074074074074074, + "acc_norm_stderr": 0.025305906241590626 + }, + "harness|hendrycksTest-formal_logic|5": { 
+ "acc": 0.4523809523809524, + "acc_stderr": 0.044518079590553275, + "acc_norm": 0.4523809523809524, + "acc_norm_stderr": 0.044518079590553275 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.34, + "acc_stderr": 0.04760952285695235, + "acc_norm": 0.34, + "acc_norm_stderr": 0.04760952285695235 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.7677419354838709, + "acc_stderr": 0.024022256130308235, + "acc_norm": 0.7677419354838709, + "acc_norm_stderr": 0.024022256130308235 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.5270935960591133, + "acc_stderr": 0.03512819077876106, + "acc_norm": 0.5270935960591133, + "acc_norm_stderr": 0.03512819077876106 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.67, + "acc_stderr": 0.04725815626252607, + "acc_norm": 0.67, + "acc_norm_stderr": 0.04725815626252607 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.7575757575757576, + "acc_stderr": 0.03346409881055953, + "acc_norm": 0.7575757575757576, + "acc_norm_stderr": 0.03346409881055953 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.7828282828282829, + "acc_stderr": 0.02937661648494563, + "acc_norm": 0.7828282828282829, + "acc_norm_stderr": 0.02937661648494563 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.8704663212435233, + "acc_stderr": 0.024233532297758723, + "acc_norm": 0.8704663212435233, + "acc_norm_stderr": 0.024233532297758723 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.6410256410256411, + "acc_stderr": 0.024321738484602354, + "acc_norm": 0.6410256410256411, + "acc_norm_stderr": 0.024321738484602354 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.34814814814814815, + "acc_stderr": 0.029045600290616255, + "acc_norm": 0.34814814814814815, + "acc_norm_stderr": 0.029045600290616255 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.6764705882352942, + "acc_stderr": 0.030388353551886783, + "acc_norm": 0.6764705882352942, + "acc_norm_stderr": 0.030388353551886783 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.3509933774834437, + "acc_stderr": 0.03896981964257375, + "acc_norm": 0.3509933774834437, + "acc_norm_stderr": 0.03896981964257375 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.8385321100917431, + "acc_stderr": 0.015776239256163224, + "acc_norm": 0.8385321100917431, + "acc_norm_stderr": 0.015776239256163224 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.49537037037037035, + "acc_stderr": 0.03409825519163572, + "acc_norm": 0.49537037037037035, + "acc_norm_stderr": 0.03409825519163572 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.7990196078431373, + "acc_stderr": 0.028125972265654373, + "acc_norm": 0.7990196078431373, + "acc_norm_stderr": 0.028125972265654373 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.7637130801687764, + "acc_stderr": 0.02765215314415927, + "acc_norm": 0.7637130801687764, + "acc_norm_stderr": 0.02765215314415927 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.6905829596412556, + "acc_stderr": 0.03102441174057221, + "acc_norm": 0.6905829596412556, + "acc_norm_stderr": 0.03102441174057221 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.7557251908396947, + "acc_stderr": 0.03768335959728742, + "acc_norm": 0.7557251908396947, + "acc_norm_stderr": 0.03768335959728742 + }, + 
"harness|hendrycksTest-international_law|5": { + "acc": 0.7851239669421488, + "acc_stderr": 0.037494924487096966, + "acc_norm": 0.7851239669421488, + "acc_norm_stderr": 0.037494924487096966 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.7870370370370371, + "acc_stderr": 0.03957835471980979, + "acc_norm": 0.7870370370370371, + "acc_norm_stderr": 0.03957835471980979 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.754601226993865, + "acc_stderr": 0.03380939813943354, + "acc_norm": 0.754601226993865, + "acc_norm_stderr": 0.03380939813943354 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.4642857142857143, + "acc_stderr": 0.04733667890053756, + "acc_norm": 0.4642857142857143, + "acc_norm_stderr": 0.04733667890053756 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.7766990291262136, + "acc_stderr": 0.04123553189891431, + "acc_norm": 0.7766990291262136, + "acc_norm_stderr": 0.04123553189891431 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.8888888888888888, + "acc_stderr": 0.020588491316092375, + "acc_norm": 0.8888888888888888, + "acc_norm_stderr": 0.020588491316092375 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.71, + "acc_stderr": 0.045604802157206845, + "acc_norm": 0.71, + "acc_norm_stderr": 0.045604802157206845 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.8122605363984674, + "acc_stderr": 0.013964393769899136, + "acc_norm": 0.8122605363984674, + "acc_norm_stderr": 0.013964393769899136 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.708092485549133, + "acc_stderr": 0.024476994076247337, + "acc_norm": 0.708092485549133, + "acc_norm_stderr": 0.024476994076247337 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.41899441340782123, + "acc_stderr": 0.016501579306861677, + "acc_norm": 0.41899441340782123, + "acc_norm_stderr": 0.016501579306861677 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.7091503267973857, + "acc_stderr": 0.02600480036395213, + "acc_norm": 0.7091503267973857, + "acc_norm_stderr": 0.02600480036395213 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.6784565916398714, + "acc_stderr": 0.026527724079528872, + "acc_norm": 0.6784565916398714, + "acc_norm_stderr": 0.026527724079528872 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.7006172839506173, + "acc_stderr": 0.025483115601195455, + "acc_norm": 0.7006172839506173, + "acc_norm_stderr": 0.025483115601195455 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.4574468085106383, + "acc_stderr": 0.029719281272236855, + "acc_norm": 0.4574468085106383, + "acc_norm_stderr": 0.029719281272236855 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.455019556714472, + "acc_stderr": 0.012718456618701773, + "acc_norm": 0.455019556714472, + "acc_norm_stderr": 0.012718456618701773 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.6544117647058824, + "acc_stderr": 0.02888819310398863, + "acc_norm": 0.6544117647058824, + "acc_norm_stderr": 0.02888819310398863 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.6388888888888888, + "acc_stderr": 0.01943177567703731, + "acc_norm": 0.6388888888888888, + "acc_norm_stderr": 0.01943177567703731 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.6727272727272727, + "acc_stderr": 0.04494290866252089, + "acc_norm": 0.6727272727272727, + "acc_norm_stderr": 0.04494290866252089 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.7224489795918367, + "acc_stderr": 
0.028666857790274648, + "acc_norm": 0.7224489795918367, + "acc_norm_stderr": 0.028666857790274648 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.8507462686567164, + "acc_stderr": 0.02519692987482707, + "acc_norm": 0.8507462686567164, + "acc_norm_stderr": 0.02519692987482707 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.82, + "acc_stderr": 0.03861229196653694, + "acc_norm": 0.82, + "acc_norm_stderr": 0.03861229196653694 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.5421686746987951, + "acc_stderr": 0.0387862677100236, + "acc_norm": 0.5421686746987951, + "acc_norm_stderr": 0.0387862677100236 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.847953216374269, + "acc_stderr": 0.02753912288906145, + "acc_norm": 0.847953216374269, + "acc_norm_stderr": 0.02753912288906145 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.40514075887392903, + "mc1_stderr": 0.017185611727753368, + "mc2": 0.5776708582574724, + "mc2_stderr": 0.01544223129155929 + }, + "harness|winogrande|5": { + "acc": 0.7750591949486977, + "acc_stderr": 0.011735043564126735 + }, + "harness|drop|3": { + "em": 0.0041946308724832215, + "em_stderr": 0.0006618716168266571, + "f1": 0.09125943791946291, + "f1_stderr": 0.0018243790800558358 + }, + "harness|gsm8k|5": { + "acc": 0.18423047763457165, + "acc_stderr": 0.010678414428555008 + }, + "all": { + "acc": 0.6276350372644707, + "acc_stderr": 0.03243221410411415, + "acc_norm": 0.636467872903276, + "acc_norm_stderr": 0.03312856166774958, + "mc1": 0.40514075887392903, + "mc1_stderr": 0.017185611727753368, + "mc2": 0.5776708582574724, + "mc2_stderr": 0.01544223129155929, + "em": 0.0041946308724832215, + "em_stderr": 0.0006618716168266571, + "f1": 0.09125943791946291, + "f1_stderr": 0.0018243790800558358 + } + }, + "versions": { + "all": 0, + "harness|arc:challenge|25": 0, + "harness|drop|3": 1, + "harness|gsm8k|5": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + 
"harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "harness|winogrande|5": 0 + }, + "config_tasks": { + "harness|arc:challenge": "LM Harness task", + "harness|drop": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + 
"harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|arc:challenge|25": { + "hashes": { + "hash_examples": "17b0cae357c0259e", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "9bcd0d1d37471713", + "hash_cont_tokens": "289aa98c400841d8" + }, + "truncated": 0, + "non_truncated": 1172, + "padded": 4670, + "non_padded": 17, + "effective_few_shots": 25.0, + "num_truncated_few_shots": 0 + }, + "harness|hellaswag|10": { + "hashes": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "80b8c6d79740318e", + "hash_cont_tokens": "ac460260c3e6efc9" + }, + "truncated": 0, + "non_truncated": 10042, + "padded": 40101, + "non_padded": 67, + "effective_few_shots": 10.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hashes": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "b813d36287c6556c", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-anatomy|5": { + "hashes": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "09dc2380497f7a47", + "hash_cont_tokens": "a52a4f60d98cbe5c" + }, + "truncated": 0, + "non_truncated": 135, + "padded": 540, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-astronomy|5": { + "hashes": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "68ca3220b0fdd1f3", + "hash_cont_tokens": "10f7d8eeba97841d" + }, + "truncated": 0, + "non_truncated": 152, + "padded": 608, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 
0 + }, + "harness|hendrycksTest-business_ethics|5": { + "hashes": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "bd14ef1320de241e", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hashes": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "d96186ab98017c43", + "hash_cont_tokens": "edef9975ba9165b5" + }, + "truncated": 0, + "non_truncated": 265, + "padded": 1060, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_biology|5": { + "hashes": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "424136b34e95b200", + "hash_cont_tokens": "0aa103ec6602280b" + }, + "truncated": 0, + "non_truncated": 144, + "padded": 576, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_chemistry|5": { + "hashes": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "8dd8b80e336bbe54", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_computer_science|5": { + "hashes": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "145d4cef8ca2261d", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_mathematics|5": { + "hashes": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "561995d32d2b25c4", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_medicine|5": { + "hashes": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "6a258a9d4418599c", + "hash_cont_tokens": "1979021dbc698754" + }, + "truncated": 0, + "non_truncated": 173, + "padded": 692, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_physics|5": { + "hashes": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "fa5e0d5b5f97b66a", + "hash_cont_tokens": "7cf7fe2bab00acbd" + }, + "truncated": 0, + "non_truncated": 102, + "padded": 408, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-computer_security|5": { + "hashes": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "07d27397edfae492", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hashes": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": 
"e40d15a34640d6fa", + "hash_input_tokens": "da5e6c3c8eb17da6", + "hash_cont_tokens": "903f64eed2b0d217" + }, + "truncated": 0, + "non_truncated": 235, + "padded": 940, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-econometrics|5": { + "hashes": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "f6ba8e358bdb523e", + "hash_cont_tokens": "721ae6c5302c4bf2" + }, + "truncated": 0, + "non_truncated": 114, + "padded": 456, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hashes": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "b2459da4c5ca8590", + "hash_cont_tokens": "15a738960ed3e587" + }, + "truncated": 0, + "non_truncated": 145, + "padded": 575, + "non_padded": 5, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hashes": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "0b969d9ad706a13a", + "hash_cont_tokens": "c96470462fc71683" + }, + "truncated": 0, + "non_truncated": 378, + "padded": 1512, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-formal_logic|5": { + "hashes": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "02bc3eb5f90da86e", + "hash_cont_tokens": "0e1ce025c9d6ee7e" + }, + "truncated": 0, + "non_truncated": 126, + "padded": 504, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-global_facts|5": { + "hashes": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "3d5106918bcbeb43", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_biology|5": { + "hashes": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "7b089392db2dabbd", + "hash_cont_tokens": "e34d57f7d3c4ca16" + }, + "truncated": 0, + "non_truncated": 310, + "padded": 1240, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hashes": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "ba90b2ffed1c067d", + "hash_cont_tokens": "e8482d44df4b3740" + }, + "truncated": 0, + "non_truncated": 203, + "padded": 812, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hashes": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "60eeec309ef0717f", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hashes": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "5e5e8bf3808e0ead", + "hash_cont_tokens": "d63e679a49418339" + }, + "truncated": 0, 
+ "non_truncated": 165, + "padded": 656, + "non_padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_geography|5": { + "hashes": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "4da9b741d4e7ea78", + "hash_cont_tokens": "d78483e286d06f1a" + }, + "truncated": 0, + "non_truncated": 198, + "padded": 792, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hashes": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "acb4bc872ac86ed7", + "hash_cont_tokens": "691cdff71ff5fe57" + }, + "truncated": 0, + "non_truncated": 193, + "padded": 772, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hashes": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "840fc6403eb69ab0", + "hash_cont_tokens": "d5ad4c5bdca967ad" + }, + "truncated": 0, + "non_truncated": 390, + "padded": 1560, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hashes": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "3629a7f2cd17faeb", + "hash_cont_tokens": "8f631ca5687dd0d4" + }, + "truncated": 0, + "non_truncated": 270, + "padded": 1080, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hashes": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "6846f684260e3997", + "hash_cont_tokens": "7321048a28451473" + }, + "truncated": 0, + "non_truncated": 238, + "padded": 952, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_physics|5": { + "hashes": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "85aee25d6bdad94a", + "hash_cont_tokens": "bb137581f269861c" + }, + "truncated": 0, + "non_truncated": 151, + "padded": 604, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hashes": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "290b66d6d666a35f", + "hash_cont_tokens": "b455cab2675bd863" + }, + "truncated": 0, + "non_truncated": 545, + "padded": 2180, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hashes": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "a77a7668b437bc82", + "hash_cont_tokens": "1b3196fec7e58037" + }, + "truncated": 0, + "non_truncated": 216, + "padded": 864, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hashes": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "63548c7fa9ba7a78", + "hash_cont_tokens": "a331dedc2aa01b3e" + }, + "truncated": 0, + "non_truncated": 204, + "padded": 816, + "non_padded": 0, + "effective_few_shots": 
5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hashes": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "83c5da18bfa50812", + "hash_cont_tokens": "d0fbe030b8c8c2bf" + }, + "truncated": 0, + "non_truncated": 237, + "padded": 948, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_aging|5": { + "hashes": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "bebbd11f22006685", + "hash_cont_tokens": "1dd29c3755494850" + }, + "truncated": 0, + "non_truncated": 223, + "padded": 892, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_sexuality|5": { + "hashes": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "7b85ee9b8ee54f4f", + "hash_cont_tokens": "c85573f663c10691" + }, + "truncated": 0, + "non_truncated": 131, + "padded": 524, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-international_law|5": { + "hashes": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "7bfc55ab7065943e", + "hash_cont_tokens": "d263804ba918154f" + }, + "truncated": 0, + "non_truncated": 121, + "padded": 484, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-jurisprudence|5": { + "hashes": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "69573f1675e053c6", + "hash_cont_tokens": "581986691a84ece8" + }, + "truncated": 0, + "non_truncated": 108, + "padded": 432, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hashes": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "552324ef20094bdc", + "hash_cont_tokens": "55a858b28bbda458" + }, + "truncated": 0, + "non_truncated": 163, + "padded": 652, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-machine_learning|5": { + "hashes": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "96449357a7318905", + "hash_cont_tokens": "e99d3d3efd4ac7a3" + }, + "truncated": 0, + "non_truncated": 112, + "padded": 448, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-management|5": { + "hashes": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "3b849249168e3b88", + "hash_cont_tokens": "13d9dc56bca34726" + }, + "truncated": 0, + "non_truncated": 103, + "padded": 412, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-marketing|5": { + "hashes": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "af0e186f2756b70d", + "hash_cont_tokens": "2700ea26933916a2" + }, + "truncated": 0, + "non_truncated": 234, + "padded": 936, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-medical_genetics|5": { + "hashes": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": 
"202139046daa118f", + "hash_input_tokens": "9f6a6de16509b6d9", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-miscellaneous|5": { + "hashes": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "9194406d589f7c10", + "hash_cont_tokens": "7bf4341c79587250" + }, + "truncated": 0, + "non_truncated": 783, + "padded": 3132, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_disputes|5": { + "hashes": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "769486efc74d9f8e", + "hash_cont_tokens": "38a48e9de6976f00" + }, + "truncated": 0, + "non_truncated": 346, + "padded": 1384, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hashes": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "a90fd4dd90959dad", + "hash_cont_tokens": "761c4dc187689d89" + }, + "truncated": 0, + "non_truncated": 895, + "padded": 3580, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-nutrition|5": { + "hashes": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "1a3b843e66efd29b", + "hash_cont_tokens": "65005bd7d6f6012a" + }, + "truncated": 0, + "non_truncated": 306, + "padded": 1224, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-philosophy|5": { + "hashes": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "09820001a3d00013", + "hash_cont_tokens": "0b47934fb6314dec" + }, + "truncated": 0, + "non_truncated": 311, + "padded": 1244, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-prehistory|5": { + "hashes": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "7c4ec364ce2768c7", + "hash_cont_tokens": "3f20acd855ee0a29" + }, + "truncated": 0, + "non_truncated": 324, + "padded": 1296, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_accounting|5": { + "hashes": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "ced0534574d0ae3f", + "hash_cont_tokens": "8f122ba881355d4b" + }, + "truncated": 0, + "non_truncated": 282, + "padded": 1128, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_law|5": { + "hashes": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "bcbdbbde22ec73e3", + "hash_cont_tokens": "90d5df417c4d3fd3" + }, + "truncated": 0, + "non_truncated": 1534, + "padded": 6136, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_medicine|5": { + "hashes": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "c54d753563114d45", + "hash_cont_tokens": "4a2d2988884f7f70" + }, + "truncated": 0, + "non_truncated": 272, + "padded": 
1088, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_psychology|5": { + "hashes": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "b75dc55c0e32fa52", + "hash_cont_tokens": "e0a952cb8a9c81de" + }, + "truncated": 0, + "non_truncated": 612, + "padded": 2448, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-public_relations|5": { + "hashes": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "5ccdc8ec8db99622", + "hash_cont_tokens": "1fa77a8dff3922b8" + }, + "truncated": 0, + "non_truncated": 110, + "padded": 440, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-security_studies|5": { + "hashes": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "ca8497342e5b1d57", + "hash_cont_tokens": "81fc9cb3cbdd52db" + }, + "truncated": 0, + "non_truncated": 245, + "padded": 980, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-sociology|5": { + "hashes": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "069c76424fbd3dab", + "hash_cont_tokens": "2a0493252ed2cf43" + }, + "truncated": 0, + "non_truncated": 201, + "padded": 804, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hashes": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "a7e393a626169576", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-virology|5": { + "hashes": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "bf99dc973e3a650d", + "hash_cont_tokens": "5ab892d003b00c98" + }, + "truncated": 0, + "non_truncated": 166, + "padded": 664, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-world_religions|5": { + "hashes": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "1761cfaf21797065", + "hash_cont_tokens": "15a5e5dbdfbb8568" + }, + "truncated": 0, + "non_truncated": 171, + "padded": 684, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|truthfulqa:mc|0": { + "hashes": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "298b43914bbdf4ca", + "hash_cont_tokens": "5a8d4bb398b1c3c0" + }, + "truncated": 0, + "non_truncated": 817, + "padded": 9996, + "non_padded": 0, + "effective_few_shots": 0.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "31aa3477d959f771", + "hash_cont_tokens": "618558fb93c0f288" + }, + "truncated": 0, + "non_truncated": 1267, + "padded": 2534, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|drop|3": { + "hashes": { + "hash_examples": "1d27416e8324e9a3", + "hash_full_prompts": 
"a5513ff9a741b385", + "hash_input_tokens": "a4fb946366902edf", + "hash_cont_tokens": "b418ea81201cf359" + }, + "truncated": 0, + "non_truncated": 9536, + "padded": 0, + "non_padded": 9536, + "effective_few_shots": 3.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "6af0ae8cfe684f50", + "hash_cont_tokens": "75522357ce7a6f4f" + }, + "truncated": 0, + "non_truncated": 1319, + "padded": 0, + "non_padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "4eb459f19fc0f29d", + "hash_full_prompts": "21653ed56f202b4e", + "hash_input_tokens": "0ce409b3d436569d", + "hash_cont_tokens": "d4803d0718170830" + }, + "truncated": 0, + "non_truncated": 38195, + "padded": 113460, + "non_padded": 10948, + "num_truncated_few_shots": 0, + "total_evaluation_time_secondes": 0 + } +} \ No newline at end of file diff --git a/eval-results/uukuguy/speechless-orca-platypus-coig-lite-2k-0.6e-13b/results_2023-08-30T17-54-51.197545.json b/eval-results/uukuguy/speechless-orca-platypus-coig-lite-2k-0.6e-13b/results_2023-08-30T17-54-51.197545.json new file mode 100644 index 0000000000000000000000000000000000000000..9df05bed13927358f64c791fc40629aa031e05ee --- /dev/null +++ b/eval-results/uukuguy/speechless-orca-platypus-coig-lite-2k-0.6e-13b/results_2023-08-30T17-54-51.197545.json @@ -0,0 +1,1366 @@ +{ + "config_general": { + "model_name": "uukuguy/speechless-orca-platypus-coig-lite-2k-0.6e-13b", + "model_sha": "65214c9923d55795ecd6e7f9e0fcee5ba5f26929", + "model_dtype": "torch.bfloat16", + "lighteval_sha": "4b9a38c2102259daeb17d1d0294fc2b4ce7f4e63", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|arc:challenge|25": { + "acc": 0.5546075085324232, + "acc_stderr": 0.014523987638344078, + "acc_norm": 0.5989761092150171, + "acc_norm_stderr": 0.014322255790719867 + }, + "harness|hellaswag|10": { + "acc": 0.6051583349930293, + "acc_stderr": 0.004878176541703579, + "acc_norm": 0.8076080462059351, + "acc_norm_stderr": 0.003933736699983616 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.3, + "acc_stderr": 0.046056618647183814, + "acc_norm": 0.3, + "acc_norm_stderr": 0.046056618647183814 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.4740740740740741, + "acc_stderr": 0.04313531696750574, + "acc_norm": 0.4740740740740741, + "acc_norm_stderr": 0.04313531696750574 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.6381578947368421, + "acc_stderr": 0.03910525752849724, + "acc_norm": 0.6381578947368421, + "acc_norm_stderr": 0.03910525752849724 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.55, + "acc_stderr": 0.049999999999999996, + "acc_norm": 0.55, + "acc_norm_stderr": 0.049999999999999996 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.6264150943396226, + "acc_stderr": 0.029773082713319875, + "acc_norm": 0.6264150943396226, + "acc_norm_stderr": 0.029773082713319875 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.6458333333333334, + "acc_stderr": 0.03999411135753543, + "acc_norm": 0.6458333333333334, + "acc_norm_stderr": 0.03999411135753543 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.44, + "acc_stderr": 0.04988876515698589, + "acc_norm": 0.44, + "acc_norm_stderr": 0.04988876515698589 + }, + 
"harness|hendrycksTest-college_computer_science|5": { + "acc": 0.5, + "acc_stderr": 0.050251890762960605, + "acc_norm": 0.5, + "acc_norm_stderr": 0.050251890762960605 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.33, + "acc_stderr": 0.047258156262526045, + "acc_norm": 0.33, + "acc_norm_stderr": 0.047258156262526045 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.5317919075144508, + "acc_stderr": 0.03804749744364764, + "acc_norm": 0.5317919075144508, + "acc_norm_stderr": 0.03804749744364764 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.3431372549019608, + "acc_stderr": 0.04724007352383887, + "acc_norm": 0.3431372549019608, + "acc_norm_stderr": 0.04724007352383887 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.72, + "acc_stderr": 0.04512608598542129, + "acc_norm": 0.72, + "acc_norm_stderr": 0.04512608598542129 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.46382978723404256, + "acc_stderr": 0.032600385118357715, + "acc_norm": 0.46382978723404256, + "acc_norm_stderr": 0.032600385118357715 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.30701754385964913, + "acc_stderr": 0.0433913832257986, + "acc_norm": 0.30701754385964913, + "acc_norm_stderr": 0.0433913832257986 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.5517241379310345, + "acc_stderr": 0.041443118108781526, + "acc_norm": 0.5517241379310345, + "acc_norm_stderr": 0.041443118108781526 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.35978835978835977, + "acc_stderr": 0.024718075944129288, + "acc_norm": 0.35978835978835977, + "acc_norm_stderr": 0.024718075944129288 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.40476190476190477, + "acc_stderr": 0.04390259265377561, + "acc_norm": 0.40476190476190477, + "acc_norm_stderr": 0.04390259265377561 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.3, + "acc_stderr": 0.046056618647183814, + "acc_norm": 0.3, + "acc_norm_stderr": 0.046056618647183814 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.6580645161290323, + "acc_stderr": 0.02698528957655274, + "acc_norm": 0.6580645161290323, + "acc_norm_stderr": 0.02698528957655274 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.458128078817734, + "acc_stderr": 0.03505630140785741, + "acc_norm": 0.458128078817734, + "acc_norm_stderr": 0.03505630140785741 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.63, + "acc_stderr": 0.04852365870939099, + "acc_norm": 0.63, + "acc_norm_stderr": 0.04852365870939099 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.7393939393939394, + "acc_stderr": 0.034277431758165236, + "acc_norm": 0.7393939393939394, + "acc_norm_stderr": 0.034277431758165236 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.7575757575757576, + "acc_stderr": 0.030532892233932026, + "acc_norm": 0.7575757575757576, + "acc_norm_stderr": 0.030532892233932026 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.8290155440414507, + "acc_stderr": 0.027171213683164528, + "acc_norm": 0.8290155440414507, + "acc_norm_stderr": 0.027171213683164528 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.6102564102564103, + "acc_stderr": 0.024726967886647078, + "acc_norm": 0.6102564102564103, + "acc_norm_stderr": 0.024726967886647078 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.3148148148148148, + "acc_stderr": 
0.02831753349606647, + "acc_norm": 0.3148148148148148, + "acc_norm_stderr": 0.02831753349606647 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.5882352941176471, + "acc_stderr": 0.03196876989195778, + "acc_norm": 0.5882352941176471, + "acc_norm_stderr": 0.03196876989195778 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.33774834437086093, + "acc_stderr": 0.03861557546255169, + "acc_norm": 0.33774834437086093, + "acc_norm_stderr": 0.03861557546255169 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.7889908256880734, + "acc_stderr": 0.01749392240411265, + "acc_norm": 0.7889908256880734, + "acc_norm_stderr": 0.01749392240411265 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.49074074074074076, + "acc_stderr": 0.03409386946992699, + "acc_norm": 0.49074074074074076, + "acc_norm_stderr": 0.03409386946992699 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.8235294117647058, + "acc_stderr": 0.026756401538078962, + "acc_norm": 0.8235294117647058, + "acc_norm_stderr": 0.026756401538078962 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.7721518987341772, + "acc_stderr": 0.027303484599069432, + "acc_norm": 0.7721518987341772, + "acc_norm_stderr": 0.027303484599069432 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.6591928251121076, + "acc_stderr": 0.0318114974705536, + "acc_norm": 0.6591928251121076, + "acc_norm_stderr": 0.0318114974705536 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.6259541984732825, + "acc_stderr": 0.042438692422305246, + "acc_norm": 0.6259541984732825, + "acc_norm_stderr": 0.042438692422305246 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.7024793388429752, + "acc_stderr": 0.04173349148083499, + "acc_norm": 0.7024793388429752, + "acc_norm_stderr": 0.04173349148083499 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.7592592592592593, + "acc_stderr": 0.04133119440243839, + "acc_norm": 0.7592592592592593, + "acc_norm_stderr": 0.04133119440243839 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.6625766871165644, + "acc_stderr": 0.03714908409935574, + "acc_norm": 0.6625766871165644, + "acc_norm_stderr": 0.03714908409935574 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.4107142857142857, + "acc_stderr": 0.04669510663875191, + "acc_norm": 0.4107142857142857, + "acc_norm_stderr": 0.04669510663875191 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.7281553398058253, + "acc_stderr": 0.044052680241409216, + "acc_norm": 0.7281553398058253, + "acc_norm_stderr": 0.044052680241409216 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.8034188034188035, + "acc_stderr": 0.02603538609895129, + "acc_norm": 0.8034188034188035, + "acc_norm_stderr": 0.02603538609895129 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.56, + "acc_stderr": 0.04988876515698589, + "acc_norm": 0.56, + "acc_norm_stderr": 0.04988876515698589 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.7777777777777778, + "acc_stderr": 0.01486682166470959, + "acc_norm": 0.7777777777777778, + "acc_norm_stderr": 0.01486682166470959 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.6705202312138728, + "acc_stderr": 0.025305258131879695, + "acc_norm": 0.6705202312138728, + "acc_norm_stderr": 0.025305258131879695 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.4491620111731844, + "acc_stderr": 0.01663583834163192, + "acc_norm": 0.4491620111731844, + 
"acc_norm_stderr": 0.01663583834163192 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.6209150326797386, + "acc_stderr": 0.027780141207023344, + "acc_norm": 0.6209150326797386, + "acc_norm_stderr": 0.027780141207023344 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.6881028938906752, + "acc_stderr": 0.026311858071854155, + "acc_norm": 0.6881028938906752, + "acc_norm_stderr": 0.026311858071854155 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.6944444444444444, + "acc_stderr": 0.025630824975621344, + "acc_norm": 0.6944444444444444, + "acc_norm_stderr": 0.025630824975621344 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.450354609929078, + "acc_stderr": 0.029680105565029036, + "acc_norm": 0.450354609929078, + "acc_norm_stderr": 0.029680105565029036 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.45632333767926986, + "acc_stderr": 0.012721420501462547, + "acc_norm": 0.45632333767926986, + "acc_norm_stderr": 0.012721420501462547 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.5367647058823529, + "acc_stderr": 0.030290619180485694, + "acc_norm": 0.5367647058823529, + "acc_norm_stderr": 0.030290619180485694 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.5898692810457516, + "acc_stderr": 0.019898412717635906, + "acc_norm": 0.5898692810457516, + "acc_norm_stderr": 0.019898412717635906 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.6272727272727273, + "acc_stderr": 0.04631381319425465, + "acc_norm": 0.6272727272727273, + "acc_norm_stderr": 0.04631381319425465 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.6816326530612244, + "acc_stderr": 0.029822533793982062, + "acc_norm": 0.6816326530612244, + "acc_norm_stderr": 0.029822533793982062 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.7611940298507462, + "acc_stderr": 0.03014777593540922, + "acc_norm": 0.7611940298507462, + "acc_norm_stderr": 0.03014777593540922 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.83, + "acc_stderr": 0.0377525168068637, + "acc_norm": 0.83, + "acc_norm_stderr": 0.0377525168068637 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.4578313253012048, + "acc_stderr": 0.038786267710023595, + "acc_norm": 0.4578313253012048, + "acc_norm_stderr": 0.038786267710023595 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.7660818713450293, + "acc_stderr": 0.03246721765117826, + "acc_norm": 0.7660818713450293, + "acc_norm_stderr": 0.03246721765117826 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.35006119951040393, + "mc1_stderr": 0.01669794942015103, + "mc2": 0.47973543490554316, + "mc2_stderr": 0.01515523774313743 + }, + "all": { + "acc": 0.5833040742836654, + "acc_stderr": 0.034148505589925465, + "acc_norm": 0.587487435502233, + "acc_norm_stderr": 0.03412907895112302, + "mc1": 0.35006119951040393, + "mc1_stderr": 0.01669794942015103, + "mc2": 0.47973543490554316, + "mc2_stderr": 0.01515523774313743 + } + }, + "versions": { + "harness|arc:challenge|25": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + 
"harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "all": 0 + }, + "config_tasks": { + "harness|arc:challenge": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + 
"harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task" + }, + "summary_tasks": { + "harness|arc:challenge|25": { + "hashes": { + "hash_examples": "17b0cae357c0259e", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "3722289b79076c44", + "hash_cont_tokens": "3ced177a9740ab72" + }, + "truncated": 0, + "non-truncated": 4687, + "padded": 4687, + "non-padded": 0, + "effective_few_shots": 25.0, + "num_truncated_few_shots": 0 + }, + "harness|hellaswag|10": { + "hashes": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "ececd684171f1ef2", + "hash_cont_tokens": "736cbacfc627c9ce" + }, + "truncated": 0, + "non-truncated": 40168, + "padded": 40113, + "non-padded": 55, + "effective_few_shots": 10.0, + "num_truncated_few_shots": 0 + }, + 
"harness|hendrycksTest-abstract_algebra|5": { + "hashes": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "c54ff61ad0273dd7", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-anatomy|5": { + "hashes": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "be31a1e22aef5f90", + "hash_cont_tokens": "f11971a765cb609f" + }, + "truncated": 0, + "non-truncated": 540, + "padded": 540, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-astronomy|5": { + "hashes": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "277a7b1fad566940", + "hash_cont_tokens": "d2d9cf5534b74b0b" + }, + "truncated": 0, + "non-truncated": 608, + "padded": 608, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-business_ethics|5": { + "hashes": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "ba552605bc116de5", + "hash_cont_tokens": "9d8617775e7afb7e" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hashes": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "428c7563d0b98ab9", + "hash_cont_tokens": "8a729845cf844415" + }, + "truncated": 0, + "non-truncated": 1060, + "padded": 1060, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_biology|5": { + "hashes": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "da036601573942e2", + "hash_cont_tokens": "875cde3af7a0ee14" + }, + "truncated": 0, + "non-truncated": 576, + "padded": 576, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_chemistry|5": { + "hashes": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "94e0196d6aded13d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_computer_science|5": { + "hashes": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "6e4d0f4a8d36690b", + "hash_cont_tokens": "258d18b5a76e9d51" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_mathematics|5": { + "hashes": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "614054d17109a25d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_medicine|5": { + "hashes": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + 
"hash_input_tokens": "081bb2b524defd1c", + "hash_cont_tokens": "41f6ee2445154160" + }, + "truncated": 0, + "non-truncated": 692, + "padded": 692, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_physics|5": { + "hashes": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "5421d9a1af86cbd4", + "hash_cont_tokens": "f7b8097afc16a47c" + }, + "truncated": 0, + "non-truncated": 408, + "padded": 408, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-computer_security|5": { + "hashes": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "5e6b70ecb333cf18", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hashes": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "c2ef11a87264ceed", + "hash_cont_tokens": "aa0e8bc655f2f641" + }, + "truncated": 0, + "non-truncated": 940, + "padded": 940, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-econometrics|5": { + "hashes": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "ecaccd912a4c3978", + "hash_cont_tokens": "69114fe474fd53fa" + }, + "truncated": 0, + "non-truncated": 456, + "padded": 456, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hashes": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "1590c84291399be8", + "hash_cont_tokens": "2425a3f084a591ef" + }, + "truncated": 0, + "non-truncated": 580, + "padded": 580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hashes": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "3269597f715b0da1", + "hash_cont_tokens": "76b2fa379520c907" + }, + "truncated": 0, + "non-truncated": 1512, + "padded": 1512, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-formal_logic|5": { + "hashes": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "a2800d20f3ab8d7c", + "hash_cont_tokens": "b515d408b1bdf6f5" + }, + "truncated": 0, + "non-truncated": 504, + "padded": 504, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-global_facts|5": { + "hashes": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "94ed44b3772505ad", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_biology|5": { + "hashes": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "24423acb928db768", + "hash_cont_tokens": "935dc99247031e33" + }, + "truncated": 0, + "non-truncated": 1240, + "padded": 1240, + 
"non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hashes": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "831ff35c474e5cef", + "hash_cont_tokens": "85f7f7d7ac099657" + }, + "truncated": 0, + "non-truncated": 812, + "padded": 812, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hashes": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "a20a96b44dcc5b30", + "hash_cont_tokens": "d41d04de2e5e5d4b" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hashes": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "5002f4ac8b1562ca", + "hash_cont_tokens": "674fc454bdc5ac93" + }, + "truncated": 0, + "non-truncated": 660, + "padded": 656, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_geography|5": { + "hashes": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "7c5547c7da5bc793", + "hash_cont_tokens": "03a5012b916274ea" + }, + "truncated": 0, + "non-truncated": 792, + "padded": 792, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hashes": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "f62991cb6a496b05", + "hash_cont_tokens": "587dad76855b6265" + }, + "truncated": 0, + "non-truncated": 772, + "padded": 772, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hashes": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "4cef2aff6e3d59ed", + "hash_cont_tokens": "c583432ad27fcfe0" + }, + "truncated": 0, + "non-truncated": 1560, + "padded": 1560, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hashes": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "6e2577ea4082ed2b", + "hash_cont_tokens": "84745da13334a4b5" + }, + "truncated": 0, + "non-truncated": 1080, + "padded": 1080, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hashes": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "c5fc9aeb1079c8e4", + "hash_cont_tokens": "f47f041de50333b9" + }, + "truncated": 0, + "non-truncated": 952, + "padded": 952, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_physics|5": { + "hashes": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "555fc385cffa84ca", + "hash_cont_tokens": "05f39a5a580500e1" + }, + "truncated": 0, + "non-truncated": 604, + "padded": 604, + "non-padded": 0, + "effective_few_shots": 5.0, + 
"num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hashes": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "febd23cbf9973b7f", + "hash_cont_tokens": "8181ae2e48363b69" + }, + "truncated": 0, + "non-truncated": 2180, + "padded": 2180, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hashes": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "400e55b56ee6fbd7", + "hash_cont_tokens": "6d11e1c9a9d46862" + }, + "truncated": 0, + "non-truncated": 864, + "padded": 864, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hashes": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "c639cce12a46ebad", + "hash_cont_tokens": "cdd0b3dc06d933e5" + }, + "truncated": 0, + "non-truncated": 816, + "padded": 816, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hashes": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "b9762065cce6f3a6", + "hash_cont_tokens": "8e94e84c0b1d140d" + }, + "truncated": 0, + "non-truncated": 948, + "padded": 948, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_aging|5": { + "hashes": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "541a75f071dcf579", + "hash_cont_tokens": "142a4a8a1138a214" + }, + "truncated": 0, + "non-truncated": 892, + "padded": 892, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_sexuality|5": { + "hashes": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "04269e5c5a257dd9", + "hash_cont_tokens": "bc54813e809b796d" + }, + "truncated": 0, + "non-truncated": 524, + "padded": 524, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-international_law|5": { + "hashes": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "d93ba9d9d38e4397", + "hash_cont_tokens": "79e75724ab447f67" + }, + "truncated": 0, + "non-truncated": 484, + "padded": 484, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-jurisprudence|5": { + "hashes": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "9eeaccd2698b4f5a", + "hash_cont_tokens": "e3a8cd951b6e3469" + }, + "truncated": 0, + "non-truncated": 432, + "padded": 432, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hashes": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "b4f08f544f2b7576", + "hash_cont_tokens": "ec2a22eed7584a34" + }, + "truncated": 0, + "non-truncated": 652, + "padded": 648, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-machine_learning|5": { + "hashes": { + "hash_examples": "88f22a636029ae47", + 
"hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "900c2a51f1174b9f", + "hash_cont_tokens": "2ed2183b9bdf6b00" + }, + "truncated": 0, + "non-truncated": 448, + "padded": 448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-management|5": { + "hashes": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "6b36efb4689c6eca", + "hash_cont_tokens": "a01d6d39a83c4597" + }, + "truncated": 0, + "non-truncated": 412, + "padded": 412, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-marketing|5": { + "hashes": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "2aaac78a0cfed47a", + "hash_cont_tokens": "6aeaed4d823c98aa" + }, + "truncated": 0, + "non-truncated": 936, + "padded": 936, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-medical_genetics|5": { + "hashes": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "886ca823b41c094a", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-miscellaneous|5": { + "hashes": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "72fd71de7675e7d0", + "hash_cont_tokens": "9b0ab02a64603081" + }, + "truncated": 0, + "non-truncated": 3132, + "padded": 3132, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_disputes|5": { + "hashes": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "f3ca0dd8e7a1eb09", + "hash_cont_tokens": "91fb99cbc39ad638" + }, + "truncated": 0, + "non-truncated": 1384, + "padded": 1354, + "non-padded": 30, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hashes": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "3e793631e951f23c", + "hash_cont_tokens": "fdfb0c61160424af" + }, + "truncated": 0, + "non-truncated": 3580, + "padded": 3580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-nutrition|5": { + "hashes": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "59753c2144ea93af", + "hash_cont_tokens": "793bad98a4990ca2" + }, + "truncated": 0, + "non-truncated": 1224, + "padded": 1224, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-philosophy|5": { + "hashes": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "bd8d3dbed15a8c34", + "hash_cont_tokens": "9f6ff69d23a48783" + }, + "truncated": 0, + "non-truncated": 1244, + "padded": 1244, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-prehistory|5": { + "hashes": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "3573cd87facbb7c5", + "hash_cont_tokens": "af786994f8c0cec8" + }, + "truncated": 0, + "non-truncated": 1296, + "padded": 
1296, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_accounting|5": { + "hashes": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "17e721bc1a7cbb47", + "hash_cont_tokens": "37734a01ffbfc9c8" + }, + "truncated": 0, + "non-truncated": 1128, + "padded": 1128, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_law|5": { + "hashes": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "c9f7583fff66d361", + "hash_cont_tokens": "2e590029ef41fbcd" + }, + "truncated": 0, + "non-truncated": 6136, + "padded": 6136, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_medicine|5": { + "hashes": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "40a933f829116f8d", + "hash_cont_tokens": "faf445de2faeb578" + }, + "truncated": 0, + "non-truncated": 1088, + "padded": 1088, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_psychology|5": { + "hashes": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "0dfb73a8eb3f692c", + "hash_cont_tokens": "640c8dab253ca811" + }, + "truncated": 0, + "non-truncated": 2448, + "padded": 2448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-public_relations|5": { + "hashes": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "1710c6ba4c9f3cbd", + "hash_cont_tokens": "b51d8363b9d664e5" + }, + "truncated": 0, + "non-truncated": 440, + "padded": 440, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-security_studies|5": { + "hashes": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "32a03f1f22a6e103", + "hash_cont_tokens": "12f3db94ad7a571a" + }, + "truncated": 0, + "non-truncated": 980, + "padded": 980, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-sociology|5": { + "hashes": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "828999f7624cbe7e", + "hash_cont_tokens": "c3a3bdfd177eed5b" + }, + "truncated": 0, + "non-truncated": 804, + "padded": 804, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hashes": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "42054621e718dbee", + "hash_cont_tokens": "e93f00105a26e30c" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-virology|5": { + "hashes": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "6c4f0aa4dc859c04", + "hash_cont_tokens": "42d667fb2f670b76" + }, + "truncated": 0, + "non-truncated": 664, + "padded": 664, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-world_religions|5": { + 
"hashes": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "6c75d44e092ff24f", + "hash_cont_tokens": "fcea00b906601945" + }, + "truncated": 0, + "non-truncated": 684, + "padded": 684, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|truthfulqa:mc|0": { + "hashes": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "2738d7ed7075faa7", + "hash_cont_tokens": "d83e6d4f7eacf9cd" + }, + "truncated": 0, + "non-truncated": 9996, + "padded": 9996, + "non-padded": 0, + "effective_few_shots": 0.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "d84d18e9a963753d", + "hash_full_prompts": "12b540783521a8e6", + "hash_input_tokens": "5c73a7dce6ccf737", + "hash_cont_tokens": "ff40ec7eb62e0c4a" + }, + "total_evaluation_time_secondes": "6866.360788345337", + "truncated": 0, + "non-truncated": 111019, + "padded": 110926, + "non-padded": 93, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/uukuguy/speechless-orca-platypus-coig-lite-2k-0.6e-13b/results_2023-10-19T07-05-10.941569.json b/eval-results/uukuguy/speechless-orca-platypus-coig-lite-2k-0.6e-13b/results_2023-10-19T07-05-10.941569.json new file mode 100644 index 0000000000000000000000000000000000000000..4b6a2146f56f8d784288a797a3af0baed0b7b22b --- /dev/null +++ b/eval-results/uukuguy/speechless-orca-platypus-coig-lite-2k-0.6e-13b/results_2023-10-19T07-05-10.941569.json @@ -0,0 +1,107 @@ +{ + "config_general": { + "model_name": "uukuguy/speechless-orca-platypus-coig-lite-2k-0.6e-13b", + "model_sha": "65214c9923d55795ecd6e7f9e0fcee5ba5f26929", + "model_size": "24.32 GB", + "model_dtype": "torch.bfloat16", + "lighteval_sha": "0f318ecf002208468154899217b3ba7c6ae09374", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|drop|3": { + "em": 0.40740352348993286, + "em_stderr": 0.005031895046041088, + "f1": 0.46848573825503476, + "f1_stderr": 0.004828956071080464 + }, + "harness|gsm8k|5": { + "acc": 0.07505686125852919, + "acc_stderr": 0.007257633145486643 + }, + "harness|winogrande|5": { + "acc": 0.7790055248618785, + "acc_stderr": 0.011661223637643416 + }, + "all": { + "em": 0.40740352348993286, + "em_stderr": 0.005031895046041088, + "f1": 0.46848573825503476, + "f1_stderr": 0.004828956071080464, + "acc": 0.4270311930602038, + "acc_stderr": 0.00945942839156503 + } + }, + "versions": { + "harness|drop|3": 1, + "harness|gsm8k|5": 0, + "harness|winogrande|5": 0, + "all": 0 + }, + "config_tasks": { + "harness|drop": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|drop|3": { + "hashes": { + "hash_examples": "1d27416e8324e9a3", + "hash_full_prompts": "a5513ff9a741b385", + "hash_input_tokens": "42076f0efbb50aa6", + "hash_cont_tokens": "67410f4ed1fe22db" + }, + "truncated": 3, + "non-truncated": 9533, + "padded": 0, + "non-padded": 9536, + "effective_few_shots": 3.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "bda342e47b5099b2", + "hash_cont_tokens": "8592f2fa7e3a75c1" + }, + "truncated": 0, + "non-truncated": 1319, + "padded": 0, + "non-padded": 1319, + "effective_few_shots": 5.0, + 
"num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "c0bedf98cb040854", + "hash_cont_tokens": "f08975ad6f2d5864" + }, + "truncated": 0, + "non-truncated": 2534, + "padded": 2432, + "non-padded": 102, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "9b4d8993161e637d", + "hash_full_prompts": "08215e527b7e60a5", + "hash_input_tokens": "a12f3e3c934bd78b", + "hash_cont_tokens": "96375e41f9939659" + }, + "total_evaluation_time_secondes": "30718.529972314835", + "truncated": 3, + "non-truncated": 13386, + "padded": 2432, + "non-padded": 10957, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/uukuguy/speechless-orca-platypus-coig-lite-4k-0.5e-13b/results_2023-08-30T23-37-31.114358.json b/eval-results/uukuguy/speechless-orca-platypus-coig-lite-4k-0.5e-13b/results_2023-08-30T23-37-31.114358.json new file mode 100644 index 0000000000000000000000000000000000000000..03415dc7912f7aaa14775501efa512e64e01a75a --- /dev/null +++ b/eval-results/uukuguy/speechless-orca-platypus-coig-lite-4k-0.5e-13b/results_2023-08-30T23-37-31.114358.json @@ -0,0 +1,1366 @@ +{ + "config_general": { + "model_name": "uukuguy/speechless-orca-platypus-coig-lite-4k-0.5e-13b", + "model_sha": "081d1da5cfa2f6ad43abdf4fb5e41f8ec5846224", + "model_dtype": "torch.bfloat16", + "lighteval_sha": "4b9a38c2102259daeb17d1d0294fc2b4ce7f4e63", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|arc:challenge|25": { + "acc": 0.5366894197952219, + "acc_stderr": 0.01457200052775699, + "acc_norm": 0.5802047781569966, + "acc_norm_stderr": 0.014422181226303026 + }, + "harness|hellaswag|10": { + "acc": 0.5990838478390759, + "acc_stderr": 0.004890824718530301, + "acc_norm": 0.8015335590519816, + "acc_norm_stderr": 0.00398030097024142 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.34, + "acc_stderr": 0.04760952285695236, + "acc_norm": 0.34, + "acc_norm_stderr": 0.04760952285695236 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.4666666666666667, + "acc_stderr": 0.043097329010363554, + "acc_norm": 0.4666666666666667, + "acc_norm_stderr": 0.043097329010363554 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.631578947368421, + "acc_stderr": 0.03925523381052932, + "acc_norm": 0.631578947368421, + "acc_norm_stderr": 0.03925523381052932 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.54, + "acc_stderr": 0.05009082659620332, + "acc_norm": 0.54, + "acc_norm_stderr": 0.05009082659620332 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.6150943396226415, + "acc_stderr": 0.029946498567699948, + "acc_norm": 0.6150943396226415, + "acc_norm_stderr": 0.029946498567699948 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.6597222222222222, + "acc_stderr": 0.039621355734862175, + "acc_norm": 0.6597222222222222, + "acc_norm_stderr": 0.039621355734862175 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.39, + "acc_stderr": 0.04902071300001975, + "acc_norm": 0.39, + "acc_norm_stderr": 0.04902071300001975 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.42, + "acc_stderr": 0.04960449637488584, + "acc_norm": 0.42, + "acc_norm_stderr": 0.04960449637488584 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 
0.36, + "acc_stderr": 0.04824181513244218, + "acc_norm": 0.36, + "acc_norm_stderr": 0.04824181513244218 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.4913294797687861, + "acc_stderr": 0.038118909889404126, + "acc_norm": 0.4913294797687861, + "acc_norm_stderr": 0.038118909889404126 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.3333333333333333, + "acc_stderr": 0.04690650298201942, + "acc_norm": 0.3333333333333333, + "acc_norm_stderr": 0.04690650298201942 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.66, + "acc_stderr": 0.04760952285695237, + "acc_norm": 0.66, + "acc_norm_stderr": 0.04760952285695237 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.4723404255319149, + "acc_stderr": 0.03263597118409769, + "acc_norm": 0.4723404255319149, + "acc_norm_stderr": 0.03263597118409769 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.3333333333333333, + "acc_stderr": 0.044346007015849245, + "acc_norm": 0.3333333333333333, + "acc_norm_stderr": 0.044346007015849245 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.46206896551724136, + "acc_stderr": 0.041546596717075474, + "acc_norm": 0.46206896551724136, + "acc_norm_stderr": 0.041546596717075474 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.34656084656084657, + "acc_stderr": 0.024508777521028424, + "acc_norm": 0.34656084656084657, + "acc_norm_stderr": 0.024508777521028424 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.3968253968253968, + "acc_stderr": 0.043758884927270605, + "acc_norm": 0.3968253968253968, + "acc_norm_stderr": 0.043758884927270605 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.34, + "acc_stderr": 0.04760952285695235, + "acc_norm": 0.34, + "acc_norm_stderr": 0.04760952285695235 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.6645161290322581, + "acc_stderr": 0.026860206444724356, + "acc_norm": 0.6645161290322581, + "acc_norm_stderr": 0.026860206444724356 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.43349753694581283, + "acc_stderr": 0.03486731727419872, + "acc_norm": 0.43349753694581283, + "acc_norm_stderr": 0.03486731727419872 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.62, + "acc_stderr": 0.048783173121456316, + "acc_norm": 0.62, + "acc_norm_stderr": 0.048783173121456316 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.7212121212121212, + "acc_stderr": 0.035014387062967806, + "acc_norm": 0.7212121212121212, + "acc_norm_stderr": 0.035014387062967806 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.7373737373737373, + "acc_stderr": 0.031353050095330855, + "acc_norm": 0.7373737373737373, + "acc_norm_stderr": 0.031353050095330855 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.8341968911917098, + "acc_stderr": 0.026839845022314415, + "acc_norm": 0.8341968911917098, + "acc_norm_stderr": 0.026839845022314415 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.5871794871794872, + "acc_stderr": 0.024962683564331806, + "acc_norm": 0.5871794871794872, + "acc_norm_stderr": 0.024962683564331806 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.337037037037037, + "acc_stderr": 0.028820884666253255, + "acc_norm": 0.337037037037037, + "acc_norm_stderr": 0.028820884666253255 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.6050420168067226, + "acc_stderr": 0.03175367846096626, + 
"acc_norm": 0.6050420168067226, + "acc_norm_stderr": 0.03175367846096626 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.304635761589404, + "acc_stderr": 0.03757949922943342, + "acc_norm": 0.304635761589404, + "acc_norm_stderr": 0.03757949922943342 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.7339449541284404, + "acc_stderr": 0.018946022322225604, + "acc_norm": 0.7339449541284404, + "acc_norm_stderr": 0.018946022322225604 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.49537037037037035, + "acc_stderr": 0.03409825519163572, + "acc_norm": 0.49537037037037035, + "acc_norm_stderr": 0.03409825519163572 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.7843137254901961, + "acc_stderr": 0.028867431449849313, + "acc_norm": 0.7843137254901961, + "acc_norm_stderr": 0.028867431449849313 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.7763713080168776, + "acc_stderr": 0.027123298205229966, + "acc_norm": 0.7763713080168776, + "acc_norm_stderr": 0.027123298205229966 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.6636771300448431, + "acc_stderr": 0.031708824268455, + "acc_norm": 0.6636771300448431, + "acc_norm_stderr": 0.031708824268455 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.6793893129770993, + "acc_stderr": 0.04093329229834278, + "acc_norm": 0.6793893129770993, + "acc_norm_stderr": 0.04093329229834278 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.6942148760330579, + "acc_stderr": 0.04205953933884123, + "acc_norm": 0.6942148760330579, + "acc_norm_stderr": 0.04205953933884123 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.7685185185185185, + "acc_stderr": 0.04077494709252626, + "acc_norm": 0.7685185185185185, + "acc_norm_stderr": 0.04077494709252626 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.7177914110429447, + "acc_stderr": 0.03536117886664742, + "acc_norm": 0.7177914110429447, + "acc_norm_stderr": 0.03536117886664742 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.32142857142857145, + "acc_stderr": 0.04432804055291518, + "acc_norm": 0.32142857142857145, + "acc_norm_stderr": 0.04432804055291518 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.7184466019417476, + "acc_stderr": 0.04453254836326468, + "acc_norm": 0.7184466019417476, + "acc_norm_stderr": 0.04453254836326468 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.7863247863247863, + "acc_stderr": 0.026853450377009164, + "acc_norm": 0.7863247863247863, + "acc_norm_stderr": 0.026853450377009164 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.61, + "acc_stderr": 0.04902071300001975, + "acc_norm": 0.61, + "acc_norm_stderr": 0.04902071300001975 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.7343550446998723, + "acc_stderr": 0.015794302487888726, + "acc_norm": 0.7343550446998723, + "acc_norm_stderr": 0.015794302487888726 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.6502890173410405, + "acc_stderr": 0.025674281456531018, + "acc_norm": 0.6502890173410405, + "acc_norm_stderr": 0.025674281456531018 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.46368715083798884, + "acc_stderr": 0.01667834189453317, + "acc_norm": 0.46368715083798884, + "acc_norm_stderr": 0.01667834189453317 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.6111111111111112, + "acc_stderr": 0.027914055510468008, + "acc_norm": 0.6111111111111112, + "acc_norm_stderr": 0.027914055510468008 + }, + 
"harness|hendrycksTest-philosophy|5": { + "acc": 0.6463022508038585, + "acc_stderr": 0.02715520810320087, + "acc_norm": 0.6463022508038585, + "acc_norm_stderr": 0.02715520810320087 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.6419753086419753, + "acc_stderr": 0.026675611926037096, + "acc_norm": 0.6419753086419753, + "acc_norm_stderr": 0.026675611926037096 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.45390070921985815, + "acc_stderr": 0.02970045324729148, + "acc_norm": 0.45390070921985815, + "acc_norm_stderr": 0.02970045324729148 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.4517601043024772, + "acc_stderr": 0.012710662233660247, + "acc_norm": 0.4517601043024772, + "acc_norm_stderr": 0.012710662233660247 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.5110294117647058, + "acc_stderr": 0.030365446477275675, + "acc_norm": 0.5110294117647058, + "acc_norm_stderr": 0.030365446477275675 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.5833333333333334, + "acc_stderr": 0.019944914136873583, + "acc_norm": 0.5833333333333334, + "acc_norm_stderr": 0.019944914136873583 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.6272727272727273, + "acc_stderr": 0.04631381319425465, + "acc_norm": 0.6272727272727273, + "acc_norm_stderr": 0.04631381319425465 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.6530612244897959, + "acc_stderr": 0.030472526026726492, + "acc_norm": 0.6530612244897959, + "acc_norm_stderr": 0.030472526026726492 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.7114427860696517, + "acc_stderr": 0.03203841040213322, + "acc_norm": 0.7114427860696517, + "acc_norm_stderr": 0.03203841040213322 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.8, + "acc_stderr": 0.04020151261036846, + "acc_norm": 0.8, + "acc_norm_stderr": 0.04020151261036846 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.4819277108433735, + "acc_stderr": 0.038899512528272166, + "acc_norm": 0.4819277108433735, + "acc_norm_stderr": 0.038899512528272166 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.7660818713450293, + "acc_stderr": 0.03246721765117826, + "acc_norm": 0.7660818713450293, + "acc_norm_stderr": 0.03246721765117826 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.3390452876376989, + "mc1_stderr": 0.016571797910626608, + "mc2": 0.4803942259941586, + "mc2_stderr": 0.015171048096844537 + }, + "all": { + "acc": 0.5724854113753837, + "acc_stderr": 0.03436338726163606, + "acc_norm": 0.5766543108597004, + "acc_norm_stderr": 0.03434541534553873, + "mc1": 0.3390452876376989, + "mc1_stderr": 0.016571797910626608, + "mc2": 0.4803942259941586, + "mc2_stderr": 0.015171048096844537 + } + }, + "versions": { + "harness|arc:challenge|25": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + 
"harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "all": 0 + }, + "config_tasks": { + "harness|arc:challenge": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + 
"harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task" + }, + "summary_tasks": { + "harness|arc:challenge|25": { + "hashes": { + "hash_examples": "17b0cae357c0259e", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "3722289b79076c44", + "hash_cont_tokens": "3ced177a9740ab72" + }, + "truncated": 0, + "non-truncated": 4687, + "padded": 4687, + "non-padded": 0, + "effective_few_shots": 25.0, + "num_truncated_few_shots": 0 + }, + "harness|hellaswag|10": { + "hashes": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "ececd684171f1ef2", + "hash_cont_tokens": "736cbacfc627c9ce" + }, + "truncated": 0, + "non-truncated": 40168, + "padded": 40113, + "non-padded": 55, + "effective_few_shots": 10.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hashes": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "c54ff61ad0273dd7", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + 
"non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-anatomy|5": { + "hashes": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "be31a1e22aef5f90", + "hash_cont_tokens": "f11971a765cb609f" + }, + "truncated": 0, + "non-truncated": 540, + "padded": 540, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-astronomy|5": { + "hashes": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "277a7b1fad566940", + "hash_cont_tokens": "d2d9cf5534b74b0b" + }, + "truncated": 0, + "non-truncated": 608, + "padded": 608, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-business_ethics|5": { + "hashes": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "ba552605bc116de5", + "hash_cont_tokens": "9d8617775e7afb7e" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hashes": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "428c7563d0b98ab9", + "hash_cont_tokens": "8a729845cf844415" + }, + "truncated": 0, + "non-truncated": 1060, + "padded": 1060, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_biology|5": { + "hashes": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "da036601573942e2", + "hash_cont_tokens": "875cde3af7a0ee14" + }, + "truncated": 0, + "non-truncated": 576, + "padded": 576, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_chemistry|5": { + "hashes": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "94e0196d6aded13d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_computer_science|5": { + "hashes": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "6e4d0f4a8d36690b", + "hash_cont_tokens": "258d18b5a76e9d51" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_mathematics|5": { + "hashes": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "614054d17109a25d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_medicine|5": { + "hashes": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "081bb2b524defd1c", + "hash_cont_tokens": "41f6ee2445154160" + }, + "truncated": 0, + "non-truncated": 692, + "padded": 692, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_physics|5": { + "hashes": { + "hash_examples": 
"875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "5421d9a1af86cbd4", + "hash_cont_tokens": "f7b8097afc16a47c" + }, + "truncated": 0, + "non-truncated": 408, + "padded": 408, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-computer_security|5": { + "hashes": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "5e6b70ecb333cf18", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hashes": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "c2ef11a87264ceed", + "hash_cont_tokens": "aa0e8bc655f2f641" + }, + "truncated": 0, + "non-truncated": 940, + "padded": 940, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-econometrics|5": { + "hashes": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "ecaccd912a4c3978", + "hash_cont_tokens": "69114fe474fd53fa" + }, + "truncated": 0, + "non-truncated": 456, + "padded": 456, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hashes": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "1590c84291399be8", + "hash_cont_tokens": "2425a3f084a591ef" + }, + "truncated": 0, + "non-truncated": 580, + "padded": 580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hashes": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "3269597f715b0da1", + "hash_cont_tokens": "76b2fa379520c907" + }, + "truncated": 0, + "non-truncated": 1512, + "padded": 1512, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-formal_logic|5": { + "hashes": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "a2800d20f3ab8d7c", + "hash_cont_tokens": "b515d408b1bdf6f5" + }, + "truncated": 0, + "non-truncated": 504, + "padded": 504, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-global_facts|5": { + "hashes": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "94ed44b3772505ad", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_biology|5": { + "hashes": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "24423acb928db768", + "hash_cont_tokens": "935dc99247031e33" + }, + "truncated": 0, + "non-truncated": 1240, + "padded": 1240, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hashes": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "831ff35c474e5cef", + "hash_cont_tokens": "85f7f7d7ac099657" 
+ }, + "truncated": 0, + "non-truncated": 812, + "padded": 812, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hashes": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "a20a96b44dcc5b30", + "hash_cont_tokens": "d41d04de2e5e5d4b" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hashes": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "5002f4ac8b1562ca", + "hash_cont_tokens": "674fc454bdc5ac93" + }, + "truncated": 0, + "non-truncated": 660, + "padded": 656, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_geography|5": { + "hashes": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "7c5547c7da5bc793", + "hash_cont_tokens": "03a5012b916274ea" + }, + "truncated": 0, + "non-truncated": 792, + "padded": 792, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hashes": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "f62991cb6a496b05", + "hash_cont_tokens": "587dad76855b6265" + }, + "truncated": 0, + "non-truncated": 772, + "padded": 772, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hashes": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "4cef2aff6e3d59ed", + "hash_cont_tokens": "c583432ad27fcfe0" + }, + "truncated": 0, + "non-truncated": 1560, + "padded": 1560, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hashes": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "6e2577ea4082ed2b", + "hash_cont_tokens": "84745da13334a4b5" + }, + "truncated": 0, + "non-truncated": 1080, + "padded": 1080, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hashes": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "c5fc9aeb1079c8e4", + "hash_cont_tokens": "f47f041de50333b9" + }, + "truncated": 0, + "non-truncated": 952, + "padded": 952, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_physics|5": { + "hashes": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "555fc385cffa84ca", + "hash_cont_tokens": "05f39a5a580500e1" + }, + "truncated": 0, + "non-truncated": 604, + "padded": 604, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hashes": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "febd23cbf9973b7f", + "hash_cont_tokens": "8181ae2e48363b69" + }, + "truncated": 0, + "non-truncated": 2180, + "padded": 2180, + 
"non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hashes": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "400e55b56ee6fbd7", + "hash_cont_tokens": "6d11e1c9a9d46862" + }, + "truncated": 0, + "non-truncated": 864, + "padded": 864, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hashes": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "c639cce12a46ebad", + "hash_cont_tokens": "cdd0b3dc06d933e5" + }, + "truncated": 0, + "non-truncated": 816, + "padded": 816, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hashes": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "b9762065cce6f3a6", + "hash_cont_tokens": "8e94e84c0b1d140d" + }, + "truncated": 0, + "non-truncated": 948, + "padded": 948, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_aging|5": { + "hashes": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "541a75f071dcf579", + "hash_cont_tokens": "142a4a8a1138a214" + }, + "truncated": 0, + "non-truncated": 892, + "padded": 892, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_sexuality|5": { + "hashes": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "04269e5c5a257dd9", + "hash_cont_tokens": "bc54813e809b796d" + }, + "truncated": 0, + "non-truncated": 524, + "padded": 524, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-international_law|5": { + "hashes": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "d93ba9d9d38e4397", + "hash_cont_tokens": "79e75724ab447f67" + }, + "truncated": 0, + "non-truncated": 484, + "padded": 484, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-jurisprudence|5": { + "hashes": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "9eeaccd2698b4f5a", + "hash_cont_tokens": "e3a8cd951b6e3469" + }, + "truncated": 0, + "non-truncated": 432, + "padded": 432, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hashes": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "b4f08f544f2b7576", + "hash_cont_tokens": "ec2a22eed7584a34" + }, + "truncated": 0, + "non-truncated": 652, + "padded": 648, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-machine_learning|5": { + "hashes": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "900c2a51f1174b9f", + "hash_cont_tokens": "2ed2183b9bdf6b00" + }, + "truncated": 0, + "non-truncated": 448, + "padded": 448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-management|5": { + "hashes": { + 
"hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "6b36efb4689c6eca", + "hash_cont_tokens": "a01d6d39a83c4597" + }, + "truncated": 0, + "non-truncated": 412, + "padded": 412, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-marketing|5": { + "hashes": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "2aaac78a0cfed47a", + "hash_cont_tokens": "6aeaed4d823c98aa" + }, + "truncated": 0, + "non-truncated": 936, + "padded": 936, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-medical_genetics|5": { + "hashes": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "886ca823b41c094a", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-miscellaneous|5": { + "hashes": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "72fd71de7675e7d0", + "hash_cont_tokens": "9b0ab02a64603081" + }, + "truncated": 0, + "non-truncated": 3132, + "padded": 3132, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_disputes|5": { + "hashes": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "f3ca0dd8e7a1eb09", + "hash_cont_tokens": "91fb99cbc39ad638" + }, + "truncated": 0, + "non-truncated": 1384, + "padded": 1354, + "non-padded": 30, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hashes": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "3e793631e951f23c", + "hash_cont_tokens": "fdfb0c61160424af" + }, + "truncated": 0, + "non-truncated": 3580, + "padded": 3580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-nutrition|5": { + "hashes": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "59753c2144ea93af", + "hash_cont_tokens": "793bad98a4990ca2" + }, + "truncated": 0, + "non-truncated": 1224, + "padded": 1224, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-philosophy|5": { + "hashes": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "bd8d3dbed15a8c34", + "hash_cont_tokens": "9f6ff69d23a48783" + }, + "truncated": 0, + "non-truncated": 1244, + "padded": 1244, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-prehistory|5": { + "hashes": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "3573cd87facbb7c5", + "hash_cont_tokens": "af786994f8c0cec8" + }, + "truncated": 0, + "non-truncated": 1296, + "padded": 1296, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_accounting|5": { + "hashes": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "17e721bc1a7cbb47", + "hash_cont_tokens": "37734a01ffbfc9c8" + }, + 
"truncated": 0, + "non-truncated": 1128, + "padded": 1128, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_law|5": { + "hashes": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "c9f7583fff66d361", + "hash_cont_tokens": "2e590029ef41fbcd" + }, + "truncated": 0, + "non-truncated": 6136, + "padded": 6136, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_medicine|5": { + "hashes": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "40a933f829116f8d", + "hash_cont_tokens": "faf445de2faeb578" + }, + "truncated": 0, + "non-truncated": 1088, + "padded": 1088, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_psychology|5": { + "hashes": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "0dfb73a8eb3f692c", + "hash_cont_tokens": "640c8dab253ca811" + }, + "truncated": 0, + "non-truncated": 2448, + "padded": 2448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-public_relations|5": { + "hashes": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "1710c6ba4c9f3cbd", + "hash_cont_tokens": "b51d8363b9d664e5" + }, + "truncated": 0, + "non-truncated": 440, + "padded": 440, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-security_studies|5": { + "hashes": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "32a03f1f22a6e103", + "hash_cont_tokens": "12f3db94ad7a571a" + }, + "truncated": 0, + "non-truncated": 980, + "padded": 980, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-sociology|5": { + "hashes": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "828999f7624cbe7e", + "hash_cont_tokens": "c3a3bdfd177eed5b" + }, + "truncated": 0, + "non-truncated": 804, + "padded": 804, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hashes": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "42054621e718dbee", + "hash_cont_tokens": "e93f00105a26e30c" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-virology|5": { + "hashes": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "6c4f0aa4dc859c04", + "hash_cont_tokens": "42d667fb2f670b76" + }, + "truncated": 0, + "non-truncated": 664, + "padded": 664, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-world_religions|5": { + "hashes": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "6c75d44e092ff24f", + "hash_cont_tokens": "fcea00b906601945" + }, + "truncated": 0, + "non-truncated": 684, + "padded": 684, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + 
"harness|truthfulqa:mc|0": { + "hashes": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "2738d7ed7075faa7", + "hash_cont_tokens": "d83e6d4f7eacf9cd" + }, + "truncated": 0, + "non-truncated": 9996, + "padded": 9996, + "non-padded": 0, + "effective_few_shots": 0.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "d84d18e9a963753d", + "hash_full_prompts": "12b540783521a8e6", + "hash_input_tokens": "5c73a7dce6ccf737", + "hash_cont_tokens": "ff40ec7eb62e0c4a" + }, + "total_evaluation_time_secondes": "6831.3353135585785", + "truncated": 0, + "non-truncated": 111019, + "padded": 110926, + "non-padded": 93, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/uukuguy/speechless-orca-platypus-coig-lite-4k-0.5e-13b/results_2023-10-18T05-18-23.703135.json b/eval-results/uukuguy/speechless-orca-platypus-coig-lite-4k-0.5e-13b/results_2023-10-18T05-18-23.703135.json new file mode 100644 index 0000000000000000000000000000000000000000..2669b0addcbb253781a536457a8776833c832e3e --- /dev/null +++ b/eval-results/uukuguy/speechless-orca-platypus-coig-lite-4k-0.5e-13b/results_2023-10-18T05-18-23.703135.json @@ -0,0 +1,107 @@ +{ + "config_general": { + "model_name": "uukuguy/speechless-orca-platypus-coig-lite-4k-0.5e-13b", + "model_sha": "081d1da5cfa2f6ad43abdf4fb5e41f8ec5846224", + "model_size": "24.32 GB", + "model_dtype": "torch.bfloat16", + "lighteval_sha": "0f318ecf002208468154899217b3ba7c6ae09374", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|drop|3": { + "em": 0.33598993288590606, + "em_stderr": 0.004837156244460491, + "f1": 0.39884333053691334, + "f1_stderr": 0.00469778128078984 + }, + "harness|gsm8k|5": { + "acc": 0.058377558756633814, + "acc_stderr": 0.006458083557832456 + }, + "harness|winogrande|5": { + "acc": 0.7545382794001578, + "acc_stderr": 0.012095272937183644 + }, + "all": { + "em": 0.33598993288590606, + "em_stderr": 0.004837156244460491, + "f1": 0.39884333053691334, + "f1_stderr": 0.00469778128078984, + "acc": 0.40645791907839585, + "acc_stderr": 0.00927667824750805 + } + }, + "versions": { + "harness|drop|3": 1, + "harness|gsm8k|5": 0, + "harness|winogrande|5": 0, + "all": 0 + }, + "config_tasks": { + "harness|drop": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|drop|3": { + "hashes": { + "hash_examples": "1d27416e8324e9a3", + "hash_full_prompts": "a5513ff9a741b385", + "hash_input_tokens": "42076f0efbb50aa6", + "hash_cont_tokens": "42e24da9f5691931" + }, + "truncated": 3, + "non-truncated": 9533, + "padded": 0, + "non-padded": 9536, + "effective_few_shots": 3.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "bda342e47b5099b2", + "hash_cont_tokens": "565ff9665a64e4b4" + }, + "truncated": 0, + "non-truncated": 1319, + "padded": 0, + "non-padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "c0bedf98cb040854", + "hash_cont_tokens": "f08975ad6f2d5864" + }, + "truncated": 0, + "non-truncated": 2534, + "padded": 2432, + "non-padded": 102, + 
"effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "9b4d8993161e637d", + "hash_full_prompts": "08215e527b7e60a5", + "hash_input_tokens": "a12f3e3c934bd78b", + "hash_cont_tokens": "5ae2cdba67189f2f" + }, + "total_evaluation_time_secondes": "35367.78791117668", + "truncated": 3, + "non-truncated": 13386, + "padded": 2432, + "non-padded": 10957, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/uukuguy/speechless-orca-platypus-coig-lite-4k-0.6e-13b/results_2023-08-31T13-16-07.085332.json b/eval-results/uukuguy/speechless-orca-platypus-coig-lite-4k-0.6e-13b/results_2023-08-31T13-16-07.085332.json new file mode 100644 index 0000000000000000000000000000000000000000..3b3544d4a1d921bbe3f68aa8bdacf9879f605c85 --- /dev/null +++ b/eval-results/uukuguy/speechless-orca-platypus-coig-lite-4k-0.6e-13b/results_2023-08-31T13-16-07.085332.json @@ -0,0 +1,1366 @@ +{ + "config_general": { + "model_name": "uukuguy/speechless-orca-platypus-coig-lite-4k-0.6e-13b", + "model_sha": "6bf4cf6211489bdbea70585a4a5c0f39deefb4e5", + "model_dtype": "torch.bfloat16", + "lighteval_sha": "4b9a38c2102259daeb17d1d0294fc2b4ce7f4e63", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|arc:challenge|25": { + "acc": 0.5341296928327645, + "acc_stderr": 0.014577311315231099, + "acc_norm": 0.5878839590443686, + "acc_norm_stderr": 0.014383915302225405 + }, + "harness|hellaswag|10": { + "acc": 0.596494722166899, + "acc_stderr": 0.004895977676625234, + "acc_norm": 0.7993427604062936, + "acc_norm_stderr": 0.0039967359428195685 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.3, + "acc_stderr": 0.046056618647183814, + "acc_norm": 0.3, + "acc_norm_stderr": 0.046056618647183814 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.48148148148148145, + "acc_stderr": 0.043163785995113245, + "acc_norm": 0.48148148148148145, + "acc_norm_stderr": 0.043163785995113245 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.6052631578947368, + "acc_stderr": 0.039777499346220734, + "acc_norm": 0.6052631578947368, + "acc_norm_stderr": 0.039777499346220734 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.56, + "acc_stderr": 0.04988876515698589, + "acc_norm": 0.56, + "acc_norm_stderr": 0.04988876515698589 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.6339622641509434, + "acc_stderr": 0.029647813539365245, + "acc_norm": 0.6339622641509434, + "acc_norm_stderr": 0.029647813539365245 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.6458333333333334, + "acc_stderr": 0.039994111357535424, + "acc_norm": 0.6458333333333334, + "acc_norm_stderr": 0.039994111357535424 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.52, + "acc_stderr": 0.050211673156867795, + "acc_norm": 0.52, + "acc_norm_stderr": 0.050211673156867795 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.44, + "acc_stderr": 0.04988876515698589, + "acc_norm": 0.44, + "acc_norm_stderr": 0.04988876515698589 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.32, + "acc_stderr": 0.04688261722621504, + "acc_norm": 0.32, + "acc_norm_stderr": 0.04688261722621504 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.5664739884393064, + "acc_stderr": 0.03778621079092056, + "acc_norm": 0.5664739884393064, + "acc_norm_stderr": 0.03778621079092056 + }, + 
"harness|hendrycksTest-college_physics|5": { + "acc": 0.4117647058823529, + "acc_stderr": 0.048971049527263666, + "acc_norm": 0.4117647058823529, + "acc_norm_stderr": 0.048971049527263666 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.67, + "acc_stderr": 0.04725815626252609, + "acc_norm": 0.67, + "acc_norm_stderr": 0.04725815626252609 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.46808510638297873, + "acc_stderr": 0.03261936918467381, + "acc_norm": 0.46808510638297873, + "acc_norm_stderr": 0.03261936918467381 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.2982456140350877, + "acc_stderr": 0.04303684033537314, + "acc_norm": 0.2982456140350877, + "acc_norm_stderr": 0.04303684033537314 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.46206896551724136, + "acc_stderr": 0.041546596717075474, + "acc_norm": 0.46206896551724136, + "acc_norm_stderr": 0.041546596717075474 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.3253968253968254, + "acc_stderr": 0.02413015829976262, + "acc_norm": 0.3253968253968254, + "acc_norm_stderr": 0.02413015829976262 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.38095238095238093, + "acc_stderr": 0.04343525428949097, + "acc_norm": 0.38095238095238093, + "acc_norm_stderr": 0.04343525428949097 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.31, + "acc_stderr": 0.04648231987117316, + "acc_norm": 0.31, + "acc_norm_stderr": 0.04648231987117316 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.6483870967741936, + "acc_stderr": 0.027162537826948458, + "acc_norm": 0.6483870967741936, + "acc_norm_stderr": 0.027162537826948458 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.4729064039408867, + "acc_stderr": 0.03512819077876106, + "acc_norm": 0.4729064039408867, + "acc_norm_stderr": 0.03512819077876106 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.56, + "acc_stderr": 0.04988876515698589, + "acc_norm": 0.56, + "acc_norm_stderr": 0.04988876515698589 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.7090909090909091, + "acc_stderr": 0.03546563019624336, + "acc_norm": 0.7090909090909091, + "acc_norm_stderr": 0.03546563019624336 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.7171717171717171, + "acc_stderr": 0.03208779558786752, + "acc_norm": 0.7171717171717171, + "acc_norm_stderr": 0.03208779558786752 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.8134715025906736, + "acc_stderr": 0.02811209121011746, + "acc_norm": 0.8134715025906736, + "acc_norm_stderr": 0.02811209121011746 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.6076923076923076, + "acc_stderr": 0.02475600038213095, + "acc_norm": 0.6076923076923076, + "acc_norm_stderr": 0.02475600038213095 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.3074074074074074, + "acc_stderr": 0.028133252578815635, + "acc_norm": 0.3074074074074074, + "acc_norm_stderr": 0.028133252578815635 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.6050420168067226, + "acc_stderr": 0.03175367846096626, + "acc_norm": 0.6050420168067226, + "acc_norm_stderr": 0.03175367846096626 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.33774834437086093, + "acc_stderr": 0.038615575462551684, + "acc_norm": 0.33774834437086093, + "acc_norm_stderr": 0.038615575462551684 + }, + 
"harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.7559633027522936, + "acc_stderr": 0.018415286351416402, + "acc_norm": 0.7559633027522936, + "acc_norm_stderr": 0.018415286351416402 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.5277777777777778, + "acc_stderr": 0.0340470532865388, + "acc_norm": 0.5277777777777778, + "acc_norm_stderr": 0.0340470532865388 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.8137254901960784, + "acc_stderr": 0.027325470966716312, + "acc_norm": 0.8137254901960784, + "acc_norm_stderr": 0.027325470966716312 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.7848101265822784, + "acc_stderr": 0.026750826994676173, + "acc_norm": 0.7848101265822784, + "acc_norm_stderr": 0.026750826994676173 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.6322869955156951, + "acc_stderr": 0.03236198350928275, + "acc_norm": 0.6322869955156951, + "acc_norm_stderr": 0.03236198350928275 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.648854961832061, + "acc_stderr": 0.04186445163013751, + "acc_norm": 0.648854961832061, + "acc_norm_stderr": 0.04186445163013751 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.6776859504132231, + "acc_stderr": 0.04266416363352168, + "acc_norm": 0.6776859504132231, + "acc_norm_stderr": 0.04266416363352168 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.7222222222222222, + "acc_stderr": 0.04330043749650742, + "acc_norm": 0.7222222222222222, + "acc_norm_stderr": 0.04330043749650742 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.656441717791411, + "acc_stderr": 0.037311335196738925, + "acc_norm": 0.656441717791411, + "acc_norm_stderr": 0.037311335196738925 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.2767857142857143, + "acc_stderr": 0.04246624336697625, + "acc_norm": 0.2767857142857143, + "acc_norm_stderr": 0.04246624336697625 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.7087378640776699, + "acc_stderr": 0.04498676320572922, + "acc_norm": 0.7087378640776699, + "acc_norm_stderr": 0.04498676320572922 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.7606837606837606, + "acc_stderr": 0.027951826808924336, + "acc_norm": 0.7606837606837606, + "acc_norm_stderr": 0.027951826808924336 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.52, + "acc_stderr": 0.050211673156867795, + "acc_norm": 0.52, + "acc_norm_stderr": 0.050211673156867795 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.7279693486590039, + "acc_stderr": 0.015913367447500517, + "acc_norm": 0.7279693486590039, + "acc_norm_stderr": 0.015913367447500517 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.6473988439306358, + "acc_stderr": 0.025722802200895817, + "acc_norm": 0.6473988439306358, + "acc_norm_stderr": 0.025722802200895817 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.49162011173184356, + "acc_stderr": 0.01672015279467255, + "acc_norm": 0.49162011173184356, + "acc_norm_stderr": 0.01672015279467255 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.5849673202614379, + "acc_stderr": 0.028213504177824093, + "acc_norm": 0.5849673202614379, + "acc_norm_stderr": 0.028213504177824093 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.617363344051447, + "acc_stderr": 0.027604689028581993, + "acc_norm": 0.617363344051447, + "acc_norm_stderr": 0.027604689028581993 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.6388888888888888, + "acc_stderr": 
0.026725868809100793, + "acc_norm": 0.6388888888888888, + "acc_norm_stderr": 0.026725868809100793 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.4219858156028369, + "acc_stderr": 0.0294621892333706, + "acc_norm": 0.4219858156028369, + "acc_norm_stderr": 0.0294621892333706 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.44328552803129073, + "acc_stderr": 0.012687818419599924, + "acc_norm": 0.44328552803129073, + "acc_norm_stderr": 0.012687818419599924 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.5698529411764706, + "acc_stderr": 0.030074971917302875, + "acc_norm": 0.5698529411764706, + "acc_norm_stderr": 0.030074971917302875 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.5473856209150327, + "acc_stderr": 0.020136790918492523, + "acc_norm": 0.5473856209150327, + "acc_norm_stderr": 0.020136790918492523 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.5909090909090909, + "acc_stderr": 0.04709306978661895, + "acc_norm": 0.5909090909090909, + "acc_norm_stderr": 0.04709306978661895 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.6693877551020408, + "acc_stderr": 0.030116426296540603, + "acc_norm": 0.6693877551020408, + "acc_norm_stderr": 0.030116426296540603 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.736318407960199, + "acc_stderr": 0.031157150869355554, + "acc_norm": 0.736318407960199, + "acc_norm_stderr": 0.031157150869355554 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.82, + "acc_stderr": 0.038612291966536934, + "acc_norm": 0.82, + "acc_norm_stderr": 0.038612291966536934 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.4578313253012048, + "acc_stderr": 0.038786267710023595, + "acc_norm": 0.4578313253012048, + "acc_norm_stderr": 0.038786267710023595 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.7309941520467836, + "acc_stderr": 0.0340105262010409, + "acc_norm": 0.7309941520467836, + "acc_norm_stderr": 0.0340105262010409 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.33414932680538556, + "mc1_stderr": 0.016512530677150538, + "mc2": 0.48289518787925, + "mc2_stderr": 0.015130306362544773 + }, + "all": { + "acc": 0.5676475308984814, + "acc_stderr": 0.03447542059110964, + "acc_norm": 0.5719967224993457, + "acc_norm_stderr": 0.034456901307265385, + "mc1": 0.33414932680538556, + "mc1_stderr": 0.016512530677150538, + "mc2": 0.48289518787925, + "mc2_stderr": 0.015130306362544773 + } + }, + "versions": { + "harness|arc:challenge|25": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 
1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "all": 0 + }, + "config_tasks": { + "harness|arc:challenge": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + 
"harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task" + }, + "summary_tasks": { + "harness|arc:challenge|25": { + "hashes": { + "hash_examples": "17b0cae357c0259e", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "3722289b79076c44", + "hash_cont_tokens": "3ced177a9740ab72" + }, + "truncated": 0, + "non-truncated": 4687, + "padded": 4687, + "non-padded": 0, + "effective_few_shots": 25.0, + "num_truncated_few_shots": 0 + }, + "harness|hellaswag|10": { + "hashes": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "ececd684171f1ef2", + "hash_cont_tokens": "736cbacfc627c9ce" + }, + "truncated": 0, + "non-truncated": 40168, + "padded": 40113, + "non-padded": 55, + "effective_few_shots": 10.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hashes": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "c54ff61ad0273dd7", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-anatomy|5": { + "hashes": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "be31a1e22aef5f90", + "hash_cont_tokens": 
"f11971a765cb609f" + }, + "truncated": 0, + "non-truncated": 540, + "padded": 540, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-astronomy|5": { + "hashes": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "277a7b1fad566940", + "hash_cont_tokens": "d2d9cf5534b74b0b" + }, + "truncated": 0, + "non-truncated": 608, + "padded": 608, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-business_ethics|5": { + "hashes": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "ba552605bc116de5", + "hash_cont_tokens": "9d8617775e7afb7e" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hashes": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "428c7563d0b98ab9", + "hash_cont_tokens": "8a729845cf844415" + }, + "truncated": 0, + "non-truncated": 1060, + "padded": 1060, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_biology|5": { + "hashes": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "da036601573942e2", + "hash_cont_tokens": "875cde3af7a0ee14" + }, + "truncated": 0, + "non-truncated": 576, + "padded": 576, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_chemistry|5": { + "hashes": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "94e0196d6aded13d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_computer_science|5": { + "hashes": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "6e4d0f4a8d36690b", + "hash_cont_tokens": "258d18b5a76e9d51" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_mathematics|5": { + "hashes": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "614054d17109a25d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_medicine|5": { + "hashes": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "081bb2b524defd1c", + "hash_cont_tokens": "41f6ee2445154160" + }, + "truncated": 0, + "non-truncated": 692, + "padded": 692, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_physics|5": { + "hashes": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "5421d9a1af86cbd4", + "hash_cont_tokens": "f7b8097afc16a47c" + }, + "truncated": 0, + "non-truncated": 408, + "padded": 408, + "non-padded": 0, + "effective_few_shots": 5.0, + 
"num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-computer_security|5": { + "hashes": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "5e6b70ecb333cf18", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hashes": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "c2ef11a87264ceed", + "hash_cont_tokens": "aa0e8bc655f2f641" + }, + "truncated": 0, + "non-truncated": 940, + "padded": 940, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-econometrics|5": { + "hashes": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "ecaccd912a4c3978", + "hash_cont_tokens": "69114fe474fd53fa" + }, + "truncated": 0, + "non-truncated": 456, + "padded": 456, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hashes": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "1590c84291399be8", + "hash_cont_tokens": "2425a3f084a591ef" + }, + "truncated": 0, + "non-truncated": 580, + "padded": 580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hashes": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "3269597f715b0da1", + "hash_cont_tokens": "76b2fa379520c907" + }, + "truncated": 0, + "non-truncated": 1512, + "padded": 1512, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-formal_logic|5": { + "hashes": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "a2800d20f3ab8d7c", + "hash_cont_tokens": "b515d408b1bdf6f5" + }, + "truncated": 0, + "non-truncated": 504, + "padded": 504, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-global_facts|5": { + "hashes": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "94ed44b3772505ad", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_biology|5": { + "hashes": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "24423acb928db768", + "hash_cont_tokens": "935dc99247031e33" + }, + "truncated": 0, + "non-truncated": 1240, + "padded": 1240, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hashes": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "831ff35c474e5cef", + "hash_cont_tokens": "85f7f7d7ac099657" + }, + "truncated": 0, + "non-truncated": 812, + "padded": 812, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hashes": { + "hash_examples": "8b8cdb1084f24169", 
+ "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "a20a96b44dcc5b30", + "hash_cont_tokens": "d41d04de2e5e5d4b" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hashes": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "5002f4ac8b1562ca", + "hash_cont_tokens": "674fc454bdc5ac93" + }, + "truncated": 0, + "non-truncated": 660, + "padded": 656, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_geography|5": { + "hashes": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "7c5547c7da5bc793", + "hash_cont_tokens": "03a5012b916274ea" + }, + "truncated": 0, + "non-truncated": 792, + "padded": 792, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hashes": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "f62991cb6a496b05", + "hash_cont_tokens": "587dad76855b6265" + }, + "truncated": 0, + "non-truncated": 772, + "padded": 772, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hashes": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "4cef2aff6e3d59ed", + "hash_cont_tokens": "c583432ad27fcfe0" + }, + "truncated": 0, + "non-truncated": 1560, + "padded": 1560, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hashes": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "6e2577ea4082ed2b", + "hash_cont_tokens": "84745da13334a4b5" + }, + "truncated": 0, + "non-truncated": 1080, + "padded": 1080, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hashes": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "c5fc9aeb1079c8e4", + "hash_cont_tokens": "f47f041de50333b9" + }, + "truncated": 0, + "non-truncated": 952, + "padded": 952, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_physics|5": { + "hashes": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "555fc385cffa84ca", + "hash_cont_tokens": "05f39a5a580500e1" + }, + "truncated": 0, + "non-truncated": 604, + "padded": 604, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hashes": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "febd23cbf9973b7f", + "hash_cont_tokens": "8181ae2e48363b69" + }, + "truncated": 0, + "non-truncated": 2180, + "padded": 2180, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hashes": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": 
"400e55b56ee6fbd7", + "hash_cont_tokens": "6d11e1c9a9d46862" + }, + "truncated": 0, + "non-truncated": 864, + "padded": 864, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hashes": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "c639cce12a46ebad", + "hash_cont_tokens": "cdd0b3dc06d933e5" + }, + "truncated": 0, + "non-truncated": 816, + "padded": 816, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hashes": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "b9762065cce6f3a6", + "hash_cont_tokens": "8e94e84c0b1d140d" + }, + "truncated": 0, + "non-truncated": 948, + "padded": 948, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_aging|5": { + "hashes": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "541a75f071dcf579", + "hash_cont_tokens": "142a4a8a1138a214" + }, + "truncated": 0, + "non-truncated": 892, + "padded": 892, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_sexuality|5": { + "hashes": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "04269e5c5a257dd9", + "hash_cont_tokens": "bc54813e809b796d" + }, + "truncated": 0, + "non-truncated": 524, + "padded": 524, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-international_law|5": { + "hashes": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "d93ba9d9d38e4397", + "hash_cont_tokens": "79e75724ab447f67" + }, + "truncated": 0, + "non-truncated": 484, + "padded": 484, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-jurisprudence|5": { + "hashes": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "9eeaccd2698b4f5a", + "hash_cont_tokens": "e3a8cd951b6e3469" + }, + "truncated": 0, + "non-truncated": 432, + "padded": 432, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hashes": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "b4f08f544f2b7576", + "hash_cont_tokens": "ec2a22eed7584a34" + }, + "truncated": 0, + "non-truncated": 652, + "padded": 648, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-machine_learning|5": { + "hashes": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "900c2a51f1174b9f", + "hash_cont_tokens": "2ed2183b9bdf6b00" + }, + "truncated": 0, + "non-truncated": 448, + "padded": 448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-management|5": { + "hashes": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "6b36efb4689c6eca", + "hash_cont_tokens": "a01d6d39a83c4597" + }, + "truncated": 0, + "non-truncated": 412, + "padded": 412, + "non-padded": 0, + 
"effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-marketing|5": { + "hashes": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "2aaac78a0cfed47a", + "hash_cont_tokens": "6aeaed4d823c98aa" + }, + "truncated": 0, + "non-truncated": 936, + "padded": 936, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-medical_genetics|5": { + "hashes": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "886ca823b41c094a", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-miscellaneous|5": { + "hashes": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "72fd71de7675e7d0", + "hash_cont_tokens": "9b0ab02a64603081" + }, + "truncated": 0, + "non-truncated": 3132, + "padded": 3132, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_disputes|5": { + "hashes": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "f3ca0dd8e7a1eb09", + "hash_cont_tokens": "91fb99cbc39ad638" + }, + "truncated": 0, + "non-truncated": 1384, + "padded": 1354, + "non-padded": 30, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hashes": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "3e793631e951f23c", + "hash_cont_tokens": "fdfb0c61160424af" + }, + "truncated": 0, + "non-truncated": 3580, + "padded": 3580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-nutrition|5": { + "hashes": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "59753c2144ea93af", + "hash_cont_tokens": "793bad98a4990ca2" + }, + "truncated": 0, + "non-truncated": 1224, + "padded": 1224, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-philosophy|5": { + "hashes": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "bd8d3dbed15a8c34", + "hash_cont_tokens": "9f6ff69d23a48783" + }, + "truncated": 0, + "non-truncated": 1244, + "padded": 1244, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-prehistory|5": { + "hashes": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "3573cd87facbb7c5", + "hash_cont_tokens": "af786994f8c0cec8" + }, + "truncated": 0, + "non-truncated": 1296, + "padded": 1296, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_accounting|5": { + "hashes": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "17e721bc1a7cbb47", + "hash_cont_tokens": "37734a01ffbfc9c8" + }, + "truncated": 0, + "non-truncated": 1128, + "padded": 1128, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_law|5": { + "hashes": { + "hash_examples": "cd9dbc52b3c932d6", + 
"hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "c9f7583fff66d361", + "hash_cont_tokens": "2e590029ef41fbcd" + }, + "truncated": 0, + "non-truncated": 6136, + "padded": 6136, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_medicine|5": { + "hashes": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "40a933f829116f8d", + "hash_cont_tokens": "faf445de2faeb578" + }, + "truncated": 0, + "non-truncated": 1088, + "padded": 1088, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_psychology|5": { + "hashes": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "0dfb73a8eb3f692c", + "hash_cont_tokens": "640c8dab253ca811" + }, + "truncated": 0, + "non-truncated": 2448, + "padded": 2448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-public_relations|5": { + "hashes": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "1710c6ba4c9f3cbd", + "hash_cont_tokens": "b51d8363b9d664e5" + }, + "truncated": 0, + "non-truncated": 440, + "padded": 440, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-security_studies|5": { + "hashes": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "32a03f1f22a6e103", + "hash_cont_tokens": "12f3db94ad7a571a" + }, + "truncated": 0, + "non-truncated": 980, + "padded": 980, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-sociology|5": { + "hashes": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "828999f7624cbe7e", + "hash_cont_tokens": "c3a3bdfd177eed5b" + }, + "truncated": 0, + "non-truncated": 804, + "padded": 804, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hashes": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "42054621e718dbee", + "hash_cont_tokens": "e93f00105a26e30c" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-virology|5": { + "hashes": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "6c4f0aa4dc859c04", + "hash_cont_tokens": "42d667fb2f670b76" + }, + "truncated": 0, + "non-truncated": 664, + "padded": 664, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-world_religions|5": { + "hashes": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "6c75d44e092ff24f", + "hash_cont_tokens": "fcea00b906601945" + }, + "truncated": 0, + "non-truncated": 684, + "padded": 684, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|truthfulqa:mc|0": { + "hashes": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "2738d7ed7075faa7", + "hash_cont_tokens": "d83e6d4f7eacf9cd" + }, + "truncated": 0, + "non-truncated": 9996, + 
"padded": 9996, + "non-padded": 0, + "effective_few_shots": 0.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "d84d18e9a963753d", + "hash_full_prompts": "12b540783521a8e6", + "hash_input_tokens": "5c73a7dce6ccf737", + "hash_cont_tokens": "ff40ec7eb62e0c4a" + }, + "total_evaluation_time_secondes": "6840.1147537231445", + "truncated": 0, + "non-truncated": 111019, + "padded": 110926, + "non-padded": 93, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/uukuguy/speechless-orca-platypus-coig-lite-4k-0.6e-13b/results_2023-08-31T13-45-32.435027.json b/eval-results/uukuguy/speechless-orca-platypus-coig-lite-4k-0.6e-13b/results_2023-08-31T13-45-32.435027.json new file mode 100644 index 0000000000000000000000000000000000000000..eef1bfe2ea70f7d63a39c5e78e3c7181e02269d9 --- /dev/null +++ b/eval-results/uukuguy/speechless-orca-platypus-coig-lite-4k-0.6e-13b/results_2023-08-31T13-45-32.435027.json @@ -0,0 +1,1366 @@ +{ + "config_general": { + "model_name": "uukuguy/speechless-orca-platypus-coig-lite-4k-0.6e-13b", + "model_sha": "6bf4cf6211489bdbea70585a4a5c0f39deefb4e5", + "model_dtype": "torch.bfloat16", + "lighteval_sha": "4b9a38c2102259daeb17d1d0294fc2b4ce7f4e63", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|arc:challenge|25": { + "acc": 0.5341296928327645, + "acc_stderr": 0.014577311315231099, + "acc_norm": 0.5878839590443686, + "acc_norm_stderr": 0.014383915302225405 + }, + "harness|hellaswag|10": { + "acc": 0.596494722166899, + "acc_stderr": 0.004895977676625234, + "acc_norm": 0.7993427604062936, + "acc_norm_stderr": 0.0039967359428195685 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.3, + "acc_stderr": 0.046056618647183814, + "acc_norm": 0.3, + "acc_norm_stderr": 0.046056618647183814 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.48148148148148145, + "acc_stderr": 0.043163785995113245, + "acc_norm": 0.48148148148148145, + "acc_norm_stderr": 0.043163785995113245 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.6052631578947368, + "acc_stderr": 0.039777499346220734, + "acc_norm": 0.6052631578947368, + "acc_norm_stderr": 0.039777499346220734 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.56, + "acc_stderr": 0.04988876515698589, + "acc_norm": 0.56, + "acc_norm_stderr": 0.04988876515698589 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.6339622641509434, + "acc_stderr": 0.029647813539365245, + "acc_norm": 0.6339622641509434, + "acc_norm_stderr": 0.029647813539365245 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.6458333333333334, + "acc_stderr": 0.039994111357535424, + "acc_norm": 0.6458333333333334, + "acc_norm_stderr": 0.039994111357535424 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.52, + "acc_stderr": 0.050211673156867795, + "acc_norm": 0.52, + "acc_norm_stderr": 0.050211673156867795 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.44, + "acc_stderr": 0.04988876515698589, + "acc_norm": 0.44, + "acc_norm_stderr": 0.04988876515698589 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.32, + "acc_stderr": 0.04688261722621504, + "acc_norm": 0.32, + "acc_norm_stderr": 0.04688261722621504 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.5664739884393064, + "acc_stderr": 0.03778621079092056, + "acc_norm": 0.5664739884393064, + "acc_norm_stderr": 
0.03778621079092056 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.4117647058823529, + "acc_stderr": 0.048971049527263666, + "acc_norm": 0.4117647058823529, + "acc_norm_stderr": 0.048971049527263666 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.67, + "acc_stderr": 0.04725815626252609, + "acc_norm": 0.67, + "acc_norm_stderr": 0.04725815626252609 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.46808510638297873, + "acc_stderr": 0.03261936918467381, + "acc_norm": 0.46808510638297873, + "acc_norm_stderr": 0.03261936918467381 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.2982456140350877, + "acc_stderr": 0.04303684033537314, + "acc_norm": 0.2982456140350877, + "acc_norm_stderr": 0.04303684033537314 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.46206896551724136, + "acc_stderr": 0.041546596717075474, + "acc_norm": 0.46206896551724136, + "acc_norm_stderr": 0.041546596717075474 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.3253968253968254, + "acc_stderr": 0.02413015829976262, + "acc_norm": 0.3253968253968254, + "acc_norm_stderr": 0.02413015829976262 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.38095238095238093, + "acc_stderr": 0.04343525428949097, + "acc_norm": 0.38095238095238093, + "acc_norm_stderr": 0.04343525428949097 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.31, + "acc_stderr": 0.04648231987117316, + "acc_norm": 0.31, + "acc_norm_stderr": 0.04648231987117316 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.6483870967741936, + "acc_stderr": 0.027162537826948458, + "acc_norm": 0.6483870967741936, + "acc_norm_stderr": 0.027162537826948458 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.4729064039408867, + "acc_stderr": 0.03512819077876106, + "acc_norm": 0.4729064039408867, + "acc_norm_stderr": 0.03512819077876106 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.56, + "acc_stderr": 0.04988876515698589, + "acc_norm": 0.56, + "acc_norm_stderr": 0.04988876515698589 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.7090909090909091, + "acc_stderr": 0.03546563019624336, + "acc_norm": 0.7090909090909091, + "acc_norm_stderr": 0.03546563019624336 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.7171717171717171, + "acc_stderr": 0.03208779558786752, + "acc_norm": 0.7171717171717171, + "acc_norm_stderr": 0.03208779558786752 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.8134715025906736, + "acc_stderr": 0.02811209121011746, + "acc_norm": 0.8134715025906736, + "acc_norm_stderr": 0.02811209121011746 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.6076923076923076, + "acc_stderr": 0.02475600038213095, + "acc_norm": 0.6076923076923076, + "acc_norm_stderr": 0.02475600038213095 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.3074074074074074, + "acc_stderr": 0.028133252578815635, + "acc_norm": 0.3074074074074074, + "acc_norm_stderr": 0.028133252578815635 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.6050420168067226, + "acc_stderr": 0.03175367846096626, + "acc_norm": 0.6050420168067226, + "acc_norm_stderr": 0.03175367846096626 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.33774834437086093, + "acc_stderr": 0.038615575462551684, + "acc_norm": 0.33774834437086093, + "acc_norm_stderr": 0.038615575462551684 + }, + 
"harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.7559633027522936, + "acc_stderr": 0.018415286351416402, + "acc_norm": 0.7559633027522936, + "acc_norm_stderr": 0.018415286351416402 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.5277777777777778, + "acc_stderr": 0.0340470532865388, + "acc_norm": 0.5277777777777778, + "acc_norm_stderr": 0.0340470532865388 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.8137254901960784, + "acc_stderr": 0.027325470966716312, + "acc_norm": 0.8137254901960784, + "acc_norm_stderr": 0.027325470966716312 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.7848101265822784, + "acc_stderr": 0.026750826994676173, + "acc_norm": 0.7848101265822784, + "acc_norm_stderr": 0.026750826994676173 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.6322869955156951, + "acc_stderr": 0.03236198350928275, + "acc_norm": 0.6322869955156951, + "acc_norm_stderr": 0.03236198350928275 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.648854961832061, + "acc_stderr": 0.04186445163013751, + "acc_norm": 0.648854961832061, + "acc_norm_stderr": 0.04186445163013751 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.6776859504132231, + "acc_stderr": 0.04266416363352168, + "acc_norm": 0.6776859504132231, + "acc_norm_stderr": 0.04266416363352168 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.7222222222222222, + "acc_stderr": 0.04330043749650742, + "acc_norm": 0.7222222222222222, + "acc_norm_stderr": 0.04330043749650742 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.656441717791411, + "acc_stderr": 0.037311335196738925, + "acc_norm": 0.656441717791411, + "acc_norm_stderr": 0.037311335196738925 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.2767857142857143, + "acc_stderr": 0.04246624336697625, + "acc_norm": 0.2767857142857143, + "acc_norm_stderr": 0.04246624336697625 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.7087378640776699, + "acc_stderr": 0.04498676320572922, + "acc_norm": 0.7087378640776699, + "acc_norm_stderr": 0.04498676320572922 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.7606837606837606, + "acc_stderr": 0.027951826808924336, + "acc_norm": 0.7606837606837606, + "acc_norm_stderr": 0.027951826808924336 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.52, + "acc_stderr": 0.050211673156867795, + "acc_norm": 0.52, + "acc_norm_stderr": 0.050211673156867795 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.7279693486590039, + "acc_stderr": 0.015913367447500517, + "acc_norm": 0.7279693486590039, + "acc_norm_stderr": 0.015913367447500517 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.6473988439306358, + "acc_stderr": 0.025722802200895817, + "acc_norm": 0.6473988439306358, + "acc_norm_stderr": 0.025722802200895817 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.49162011173184356, + "acc_stderr": 0.01672015279467255, + "acc_norm": 0.49162011173184356, + "acc_norm_stderr": 0.01672015279467255 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.5849673202614379, + "acc_stderr": 0.028213504177824093, + "acc_norm": 0.5849673202614379, + "acc_norm_stderr": 0.028213504177824093 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.617363344051447, + "acc_stderr": 0.027604689028581993, + "acc_norm": 0.617363344051447, + "acc_norm_stderr": 0.027604689028581993 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.6388888888888888, + "acc_stderr": 
0.026725868809100793, + "acc_norm": 0.6388888888888888, + "acc_norm_stderr": 0.026725868809100793 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.4219858156028369, + "acc_stderr": 0.0294621892333706, + "acc_norm": 0.4219858156028369, + "acc_norm_stderr": 0.0294621892333706 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.44328552803129073, + "acc_stderr": 0.012687818419599924, + "acc_norm": 0.44328552803129073, + "acc_norm_stderr": 0.012687818419599924 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.5698529411764706, + "acc_stderr": 0.030074971917302875, + "acc_norm": 0.5698529411764706, + "acc_norm_stderr": 0.030074971917302875 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.5473856209150327, + "acc_stderr": 0.020136790918492523, + "acc_norm": 0.5473856209150327, + "acc_norm_stderr": 0.020136790918492523 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.5909090909090909, + "acc_stderr": 0.04709306978661895, + "acc_norm": 0.5909090909090909, + "acc_norm_stderr": 0.04709306978661895 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.6693877551020408, + "acc_stderr": 0.030116426296540603, + "acc_norm": 0.6693877551020408, + "acc_norm_stderr": 0.030116426296540603 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.736318407960199, + "acc_stderr": 0.031157150869355554, + "acc_norm": 0.736318407960199, + "acc_norm_stderr": 0.031157150869355554 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.82, + "acc_stderr": 0.038612291966536934, + "acc_norm": 0.82, + "acc_norm_stderr": 0.038612291966536934 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.4578313253012048, + "acc_stderr": 0.038786267710023595, + "acc_norm": 0.4578313253012048, + "acc_norm_stderr": 0.038786267710023595 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.7309941520467836, + "acc_stderr": 0.0340105262010409, + "acc_norm": 0.7309941520467836, + "acc_norm_stderr": 0.0340105262010409 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.33414932680538556, + "mc1_stderr": 0.016512530677150538, + "mc2": 0.48289518787925, + "mc2_stderr": 0.015130306362544773 + }, + "all": { + "acc": 0.5676475308984814, + "acc_stderr": 0.03447542059110964, + "acc_norm": 0.5719967224993457, + "acc_norm_stderr": 0.034456901307265385, + "mc1": 0.33414932680538556, + "mc1_stderr": 0.016512530677150538, + "mc2": 0.48289518787925, + "mc2_stderr": 0.015130306362544773 + } + }, + "versions": { + "harness|arc:challenge|25": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 
1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "all": 0 + }, + "config_tasks": { + "harness|arc:challenge": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + 
"harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task" + }, + "summary_tasks": { + "harness|arc:challenge|25": { + "hashes": { + "hash_examples": "17b0cae357c0259e", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "3722289b79076c44", + "hash_cont_tokens": "3ced177a9740ab72" + }, + "truncated": 0, + "non-truncated": 4687, + "padded": 4687, + "non-padded": 0, + "effective_few_shots": 25.0, + "num_truncated_few_shots": 0 + }, + "harness|hellaswag|10": { + "hashes": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "ececd684171f1ef2", + "hash_cont_tokens": "736cbacfc627c9ce" + }, + "truncated": 0, + "non-truncated": 40168, + "padded": 40113, + "non-padded": 55, + "effective_few_shots": 10.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hashes": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "c54ff61ad0273dd7", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-anatomy|5": { + "hashes": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "be31a1e22aef5f90", + "hash_cont_tokens": 
"f11971a765cb609f" + }, + "truncated": 0, + "non-truncated": 540, + "padded": 540, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-astronomy|5": { + "hashes": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "277a7b1fad566940", + "hash_cont_tokens": "d2d9cf5534b74b0b" + }, + "truncated": 0, + "non-truncated": 608, + "padded": 608, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-business_ethics|5": { + "hashes": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "ba552605bc116de5", + "hash_cont_tokens": "9d8617775e7afb7e" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hashes": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "428c7563d0b98ab9", + "hash_cont_tokens": "8a729845cf844415" + }, + "truncated": 0, + "non-truncated": 1060, + "padded": 1060, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_biology|5": { + "hashes": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "da036601573942e2", + "hash_cont_tokens": "875cde3af7a0ee14" + }, + "truncated": 0, + "non-truncated": 576, + "padded": 576, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_chemistry|5": { + "hashes": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "94e0196d6aded13d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_computer_science|5": { + "hashes": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "6e4d0f4a8d36690b", + "hash_cont_tokens": "258d18b5a76e9d51" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_mathematics|5": { + "hashes": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "614054d17109a25d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_medicine|5": { + "hashes": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "081bb2b524defd1c", + "hash_cont_tokens": "41f6ee2445154160" + }, + "truncated": 0, + "non-truncated": 692, + "padded": 692, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_physics|5": { + "hashes": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "5421d9a1af86cbd4", + "hash_cont_tokens": "f7b8097afc16a47c" + }, + "truncated": 0, + "non-truncated": 408, + "padded": 408, + "non-padded": 0, + "effective_few_shots": 5.0, + 
"num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-computer_security|5": { + "hashes": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "5e6b70ecb333cf18", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hashes": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "c2ef11a87264ceed", + "hash_cont_tokens": "aa0e8bc655f2f641" + }, + "truncated": 0, + "non-truncated": 940, + "padded": 940, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-econometrics|5": { + "hashes": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "ecaccd912a4c3978", + "hash_cont_tokens": "69114fe474fd53fa" + }, + "truncated": 0, + "non-truncated": 456, + "padded": 456, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hashes": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "1590c84291399be8", + "hash_cont_tokens": "2425a3f084a591ef" + }, + "truncated": 0, + "non-truncated": 580, + "padded": 580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hashes": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "3269597f715b0da1", + "hash_cont_tokens": "76b2fa379520c907" + }, + "truncated": 0, + "non-truncated": 1512, + "padded": 1512, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-formal_logic|5": { + "hashes": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "a2800d20f3ab8d7c", + "hash_cont_tokens": "b515d408b1bdf6f5" + }, + "truncated": 0, + "non-truncated": 504, + "padded": 504, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-global_facts|5": { + "hashes": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "94ed44b3772505ad", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_biology|5": { + "hashes": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "24423acb928db768", + "hash_cont_tokens": "935dc99247031e33" + }, + "truncated": 0, + "non-truncated": 1240, + "padded": 1240, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hashes": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "831ff35c474e5cef", + "hash_cont_tokens": "85f7f7d7ac099657" + }, + "truncated": 0, + "non-truncated": 812, + "padded": 812, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hashes": { + "hash_examples": "8b8cdb1084f24169", 
+ "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "a20a96b44dcc5b30", + "hash_cont_tokens": "d41d04de2e5e5d4b" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hashes": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "5002f4ac8b1562ca", + "hash_cont_tokens": "674fc454bdc5ac93" + }, + "truncated": 0, + "non-truncated": 660, + "padded": 656, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_geography|5": { + "hashes": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "7c5547c7da5bc793", + "hash_cont_tokens": "03a5012b916274ea" + }, + "truncated": 0, + "non-truncated": 792, + "padded": 792, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hashes": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "f62991cb6a496b05", + "hash_cont_tokens": "587dad76855b6265" + }, + "truncated": 0, + "non-truncated": 772, + "padded": 772, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hashes": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "4cef2aff6e3d59ed", + "hash_cont_tokens": "c583432ad27fcfe0" + }, + "truncated": 0, + "non-truncated": 1560, + "padded": 1560, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hashes": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "6e2577ea4082ed2b", + "hash_cont_tokens": "84745da13334a4b5" + }, + "truncated": 0, + "non-truncated": 1080, + "padded": 1080, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hashes": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "c5fc9aeb1079c8e4", + "hash_cont_tokens": "f47f041de50333b9" + }, + "truncated": 0, + "non-truncated": 952, + "padded": 952, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_physics|5": { + "hashes": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "555fc385cffa84ca", + "hash_cont_tokens": "05f39a5a580500e1" + }, + "truncated": 0, + "non-truncated": 604, + "padded": 604, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hashes": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "febd23cbf9973b7f", + "hash_cont_tokens": "8181ae2e48363b69" + }, + "truncated": 0, + "non-truncated": 2180, + "padded": 2180, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hashes": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": 
"400e55b56ee6fbd7", + "hash_cont_tokens": "6d11e1c9a9d46862" + }, + "truncated": 0, + "non-truncated": 864, + "padded": 864, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hashes": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "c639cce12a46ebad", + "hash_cont_tokens": "cdd0b3dc06d933e5" + }, + "truncated": 0, + "non-truncated": 816, + "padded": 816, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hashes": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "b9762065cce6f3a6", + "hash_cont_tokens": "8e94e84c0b1d140d" + }, + "truncated": 0, + "non-truncated": 948, + "padded": 948, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_aging|5": { + "hashes": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "541a75f071dcf579", + "hash_cont_tokens": "142a4a8a1138a214" + }, + "truncated": 0, + "non-truncated": 892, + "padded": 892, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_sexuality|5": { + "hashes": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "04269e5c5a257dd9", + "hash_cont_tokens": "bc54813e809b796d" + }, + "truncated": 0, + "non-truncated": 524, + "padded": 524, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-international_law|5": { + "hashes": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "d93ba9d9d38e4397", + "hash_cont_tokens": "79e75724ab447f67" + }, + "truncated": 0, + "non-truncated": 484, + "padded": 484, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-jurisprudence|5": { + "hashes": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "9eeaccd2698b4f5a", + "hash_cont_tokens": "e3a8cd951b6e3469" + }, + "truncated": 0, + "non-truncated": 432, + "padded": 432, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hashes": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "b4f08f544f2b7576", + "hash_cont_tokens": "ec2a22eed7584a34" + }, + "truncated": 0, + "non-truncated": 652, + "padded": 648, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-machine_learning|5": { + "hashes": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "900c2a51f1174b9f", + "hash_cont_tokens": "2ed2183b9bdf6b00" + }, + "truncated": 0, + "non-truncated": 448, + "padded": 448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-management|5": { + "hashes": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "6b36efb4689c6eca", + "hash_cont_tokens": "a01d6d39a83c4597" + }, + "truncated": 0, + "non-truncated": 412, + "padded": 412, + "non-padded": 0, + 
"effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-marketing|5": { + "hashes": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "2aaac78a0cfed47a", + "hash_cont_tokens": "6aeaed4d823c98aa" + }, + "truncated": 0, + "non-truncated": 936, + "padded": 936, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-medical_genetics|5": { + "hashes": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "886ca823b41c094a", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-miscellaneous|5": { + "hashes": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "72fd71de7675e7d0", + "hash_cont_tokens": "9b0ab02a64603081" + }, + "truncated": 0, + "non-truncated": 3132, + "padded": 3132, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_disputes|5": { + "hashes": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "f3ca0dd8e7a1eb09", + "hash_cont_tokens": "91fb99cbc39ad638" + }, + "truncated": 0, + "non-truncated": 1384, + "padded": 1354, + "non-padded": 30, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hashes": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "3e793631e951f23c", + "hash_cont_tokens": "fdfb0c61160424af" + }, + "truncated": 0, + "non-truncated": 3580, + "padded": 3580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-nutrition|5": { + "hashes": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "59753c2144ea93af", + "hash_cont_tokens": "793bad98a4990ca2" + }, + "truncated": 0, + "non-truncated": 1224, + "padded": 1224, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-philosophy|5": { + "hashes": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "bd8d3dbed15a8c34", + "hash_cont_tokens": "9f6ff69d23a48783" + }, + "truncated": 0, + "non-truncated": 1244, + "padded": 1244, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-prehistory|5": { + "hashes": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "3573cd87facbb7c5", + "hash_cont_tokens": "af786994f8c0cec8" + }, + "truncated": 0, + "non-truncated": 1296, + "padded": 1296, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_accounting|5": { + "hashes": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "17e721bc1a7cbb47", + "hash_cont_tokens": "37734a01ffbfc9c8" + }, + "truncated": 0, + "non-truncated": 1128, + "padded": 1128, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_law|5": { + "hashes": { + "hash_examples": "cd9dbc52b3c932d6", + 
"hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "c9f7583fff66d361", + "hash_cont_tokens": "2e590029ef41fbcd" + }, + "truncated": 0, + "non-truncated": 6136, + "padded": 6136, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_medicine|5": { + "hashes": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "40a933f829116f8d", + "hash_cont_tokens": "faf445de2faeb578" + }, + "truncated": 0, + "non-truncated": 1088, + "padded": 1088, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_psychology|5": { + "hashes": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "0dfb73a8eb3f692c", + "hash_cont_tokens": "640c8dab253ca811" + }, + "truncated": 0, + "non-truncated": 2448, + "padded": 2448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-public_relations|5": { + "hashes": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "1710c6ba4c9f3cbd", + "hash_cont_tokens": "b51d8363b9d664e5" + }, + "truncated": 0, + "non-truncated": 440, + "padded": 440, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-security_studies|5": { + "hashes": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "32a03f1f22a6e103", + "hash_cont_tokens": "12f3db94ad7a571a" + }, + "truncated": 0, + "non-truncated": 980, + "padded": 980, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-sociology|5": { + "hashes": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "828999f7624cbe7e", + "hash_cont_tokens": "c3a3bdfd177eed5b" + }, + "truncated": 0, + "non-truncated": 804, + "padded": 804, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hashes": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "42054621e718dbee", + "hash_cont_tokens": "e93f00105a26e30c" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-virology|5": { + "hashes": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "6c4f0aa4dc859c04", + "hash_cont_tokens": "42d667fb2f670b76" + }, + "truncated": 0, + "non-truncated": 664, + "padded": 664, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-world_religions|5": { + "hashes": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "6c75d44e092ff24f", + "hash_cont_tokens": "fcea00b906601945" + }, + "truncated": 0, + "non-truncated": 684, + "padded": 684, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|truthfulqa:mc|0": { + "hashes": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "2738d7ed7075faa7", + "hash_cont_tokens": "d83e6d4f7eacf9cd" + }, + "truncated": 0, + "non-truncated": 9996, + 
"padded": 9996, + "non-padded": 0, + "effective_few_shots": 0.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "d84d18e9a963753d", + "hash_full_prompts": "12b540783521a8e6", + "hash_input_tokens": "5c73a7dce6ccf737", + "hash_cont_tokens": "ff40ec7eb62e0c4a" + }, + "total_evaluation_time_secondes": "6759.704301595688", + "truncated": 0, + "non-truncated": 111019, + "padded": 110926, + "non-padded": 93, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/uukuguy/speechless-orca-platypus-coig-lite-4k-0.6e-13b/results_2023-10-29T12-34-15.259983.json b/eval-results/uukuguy/speechless-orca-platypus-coig-lite-4k-0.6e-13b/results_2023-10-29T12-34-15.259983.json new file mode 100644 index 0000000000000000000000000000000000000000..21e8116fd35af2d1c73084f06cf2302edfd2f519 --- /dev/null +++ b/eval-results/uukuguy/speechless-orca-platypus-coig-lite-4k-0.6e-13b/results_2023-10-29T12-34-15.259983.json @@ -0,0 +1,107 @@ +{ + "config_general": { + "model_name": "uukuguy/speechless-orca-platypus-coig-lite-4k-0.6e-13b", + "model_sha": "6bf4cf6211489bdbea70585a4a5c0f39deefb4e5", + "model_size": "24.32 GB", + "model_dtype": "torch.bfloat16", + "lighteval_sha": "0f318ecf002208468154899217b3ba7c6ae09374", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|drop|3": { + "em": 0.3847525167785235, + "em_stderr": 0.004982591799399597, + "f1": 0.4459479865771823, + "f1_stderr": 0.004798591730535464 + }, + "harness|gsm8k|5": { + "acc": 0.04245640636846096, + "acc_stderr": 0.005553837749990046 + }, + "harness|winogrande|5": { + "acc": 0.7592738752959748, + "acc_stderr": 0.012015559212224174 + }, + "all": { + "em": 0.3847525167785235, + "em_stderr": 0.004982591799399597, + "f1": 0.4459479865771823, + "f1_stderr": 0.004798591730535464, + "acc": 0.40086514083221786, + "acc_stderr": 0.00878469848110711 + } + }, + "versions": { + "harness|drop|3": 1, + "harness|gsm8k|5": 0, + "harness|winogrande|5": 0, + "all": 0 + }, + "config_tasks": { + "harness|drop": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|drop|3": { + "hashes": { + "hash_examples": "1d27416e8324e9a3", + "hash_full_prompts": "a5513ff9a741b385", + "hash_input_tokens": "42076f0efbb50aa6", + "hash_cont_tokens": "d382e2b6a19e9f46" + }, + "truncated": 3, + "non-truncated": 9533, + "padded": 0, + "non-padded": 9536, + "effective_few_shots": 3.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "bda342e47b5099b2", + "hash_cont_tokens": "a21af631baeb356d" + }, + "truncated": 0, + "non-truncated": 1319, + "padded": 0, + "non-padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "c0bedf98cb040854", + "hash_cont_tokens": "f08975ad6f2d5864" + }, + "truncated": 0, + "non-truncated": 2534, + "padded": 2432, + "non-padded": 102, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "9b4d8993161e637d", + "hash_full_prompts": "08215e527b7e60a5", + "hash_input_tokens": "a12f3e3c934bd78b", + "hash_cont_tokens": "edcdbf32518d3ca0" + }, + 
"total_evaluation_time_secondes": "33337.53098344803", + "truncated": 3, + "non-truncated": 13386, + "padded": 2432, + "non-padded": 10957, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/uukuguy/speechless-tools-7b/results_2023-12-04T15-59-01.119688.json b/eval-results/uukuguy/speechless-tools-7b/results_2023-12-04T15-59-01.119688.json new file mode 100644 index 0000000000000000000000000000000000000000..a4e7c329239b28fed7f9d8666410df73e074700d --- /dev/null +++ b/eval-results/uukuguy/speechless-tools-7b/results_2023-12-04T15-59-01.119688.json @@ -0,0 +1,1409 @@ +{ + "config_general": { + "lighteval_sha": "0e4607eff593f6f842aeaa0e5fa6760f58b9d1e9", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "", + "start_time": 153352.761853825, + "end_time": 160417.112357441, + "total_evaluation_time_secondes": "7064.350503615977", + "model_name": "uukuguy/speechless-tools-7b", + "model_sha": "81aefc8983d1192378c2c803f0e0d14d48561117", + "model_dtype": "torch.bfloat16", + "model_size": "12.8 GB" + }, + "results": { + "harness|arc:challenge|25": { + "acc": 0.3412969283276451, + "acc_stderr": 0.013855831287497717, + "acc_norm": 0.3890784982935154, + "acc_norm_stderr": 0.014247309976045607 + }, + "harness|hellaswag|10": { + "acc": 0.44901414060944034, + "acc_stderr": 0.004963771168672087, + "acc_norm": 0.5768771161123282, + "acc_norm_stderr": 0.004930448527146669 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.3, + "acc_stderr": 0.04605661864718381, + "acc_norm": 0.3, + "acc_norm_stderr": 0.04605661864718381 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.31851851851851853, + "acc_stderr": 0.040247784019771096, + "acc_norm": 0.31851851851851853, + "acc_norm_stderr": 0.040247784019771096 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.28289473684210525, + "acc_stderr": 0.03665349695640767, + "acc_norm": 0.28289473684210525, + "acc_norm_stderr": 0.03665349695640767 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.37, + "acc_stderr": 0.04852365870939099, + "acc_norm": 0.37, + "acc_norm_stderr": 0.04852365870939099 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.36981132075471695, + "acc_stderr": 0.029711421880107922, + "acc_norm": 0.36981132075471695, + "acc_norm_stderr": 0.029711421880107922 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.2708333333333333, + "acc_stderr": 0.037161774375660164, + "acc_norm": 0.2708333333333333, + "acc_norm_stderr": 0.037161774375660164 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.17, + "acc_stderr": 0.0377525168068637, + "acc_norm": 0.17, + "acc_norm_stderr": 0.0377525168068637 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.31, + "acc_stderr": 0.04648231987117316, + "acc_norm": 0.31, + "acc_norm_stderr": 0.04648231987117316 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.25, + "acc_stderr": 0.04351941398892446, + "acc_norm": 0.25, + "acc_norm_stderr": 0.04351941398892446 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.2947976878612717, + "acc_stderr": 0.034765996075164785, + "acc_norm": 0.2947976878612717, + "acc_norm_stderr": 0.034765996075164785 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.22549019607843138, + "acc_stderr": 0.041583075330832865, + "acc_norm": 0.22549019607843138, + "acc_norm_stderr": 0.041583075330832865 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 
0.41, + "acc_stderr": 0.04943110704237102, + "acc_norm": 0.41, + "acc_norm_stderr": 0.04943110704237102 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.2936170212765957, + "acc_stderr": 0.029771642712491234, + "acc_norm": 0.2936170212765957, + "acc_norm_stderr": 0.029771642712491234 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.24561403508771928, + "acc_stderr": 0.04049339297748142, + "acc_norm": 0.24561403508771928, + "acc_norm_stderr": 0.04049339297748142 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.3586206896551724, + "acc_stderr": 0.03996629574876719, + "acc_norm": 0.3586206896551724, + "acc_norm_stderr": 0.03996629574876719 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.26455026455026454, + "acc_stderr": 0.022717467897708617, + "acc_norm": 0.26455026455026454, + "acc_norm_stderr": 0.022717467897708617 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.23015873015873015, + "acc_stderr": 0.03764950879790606, + "acc_norm": 0.23015873015873015, + "acc_norm_stderr": 0.03764950879790606 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.31, + "acc_stderr": 0.04648231987117316, + "acc_norm": 0.31, + "acc_norm_stderr": 0.04648231987117316 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.3225806451612903, + "acc_stderr": 0.026593084516572284, + "acc_norm": 0.3225806451612903, + "acc_norm_stderr": 0.026593084516572284 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.28078817733990147, + "acc_stderr": 0.031618563353586086, + "acc_norm": 0.28078817733990147, + "acc_norm_stderr": 0.031618563353586086 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.38, + "acc_stderr": 0.04878317312145633, + "acc_norm": 0.38, + "acc_norm_stderr": 0.04878317312145633 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.4121212121212121, + "acc_stderr": 0.03843566993588718, + "acc_norm": 0.4121212121212121, + "acc_norm_stderr": 0.03843566993588718 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.3939393939393939, + "acc_stderr": 0.03481285338232963, + "acc_norm": 0.3939393939393939, + "acc_norm_stderr": 0.03481285338232963 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.34196891191709844, + "acc_stderr": 0.034234651001042844, + "acc_norm": 0.34196891191709844, + "acc_norm_stderr": 0.034234651001042844 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.31794871794871793, + "acc_stderr": 0.023610884308927858, + "acc_norm": 0.31794871794871793, + "acc_norm_stderr": 0.023610884308927858 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.23333333333333334, + "acc_stderr": 0.025787874220959323, + "acc_norm": 0.23333333333333334, + "acc_norm_stderr": 0.025787874220959323 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.29831932773109243, + "acc_stderr": 0.029719142876342853, + "acc_norm": 0.29831932773109243, + "acc_norm_stderr": 0.029719142876342853 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.1986754966887417, + "acc_stderr": 0.03257847384436776, + "acc_norm": 0.1986754966887417, + "acc_norm_stderr": 0.03257847384436776 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.3798165137614679, + "acc_stderr": 0.020808825617866244, + "acc_norm": 0.3798165137614679, + "acc_norm_stderr": 0.020808825617866244 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 
0.21296296296296297, + "acc_stderr": 0.027920963147993662, + "acc_norm": 0.21296296296296297, + "acc_norm_stderr": 0.027920963147993662 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.36764705882352944, + "acc_stderr": 0.03384132045674119, + "acc_norm": 0.36764705882352944, + "acc_norm_stderr": 0.03384132045674119 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.4430379746835443, + "acc_stderr": 0.03233532777533485, + "acc_norm": 0.4430379746835443, + "acc_norm_stderr": 0.03233532777533485 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.3452914798206278, + "acc_stderr": 0.03191100192835794, + "acc_norm": 0.3452914798206278, + "acc_norm_stderr": 0.03191100192835794 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.40458015267175573, + "acc_stderr": 0.043046937953806645, + "acc_norm": 0.40458015267175573, + "acc_norm_stderr": 0.043046937953806645 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.4628099173553719, + "acc_stderr": 0.045517111961042175, + "acc_norm": 0.4628099173553719, + "acc_norm_stderr": 0.045517111961042175 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.48148148148148145, + "acc_stderr": 0.04830366024635331, + "acc_norm": 0.48148148148148145, + "acc_norm_stderr": 0.04830366024635331 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.39263803680981596, + "acc_stderr": 0.03836740907831028, + "acc_norm": 0.39263803680981596, + "acc_norm_stderr": 0.03836740907831028 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.25892857142857145, + "acc_stderr": 0.041577515398656284, + "acc_norm": 0.25892857142857145, + "acc_norm_stderr": 0.041577515398656284 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.4077669902912621, + "acc_stderr": 0.048657775704107696, + "acc_norm": 0.4077669902912621, + "acc_norm_stderr": 0.048657775704107696 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.5, + "acc_stderr": 0.03275608910402091, + "acc_norm": 0.5, + "acc_norm_stderr": 0.03275608910402091 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.3, + "acc_stderr": 0.046056618647183814, + "acc_norm": 0.3, + "acc_norm_stderr": 0.046056618647183814 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.4086845466155811, + "acc_stderr": 0.017579250148153393, + "acc_norm": 0.4086845466155811, + "acc_norm_stderr": 0.017579250148153393 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.38439306358381503, + "acc_stderr": 0.026189666966272028, + "acc_norm": 0.38439306358381503, + "acc_norm_stderr": 0.026189666966272028 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.2670391061452514, + "acc_stderr": 0.014796502622562567, + "acc_norm": 0.2670391061452514, + "acc_norm_stderr": 0.014796502622562567 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.32679738562091504, + "acc_stderr": 0.026857294663281416, + "acc_norm": 0.32679738562091504, + "acc_norm_stderr": 0.026857294663281416 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.3665594855305466, + "acc_stderr": 0.027368078243971614, + "acc_norm": 0.3665594855305466, + "acc_norm_stderr": 0.027368078243971614 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.37037037037037035, + "acc_stderr": 0.02686949074481525, + "acc_norm": 0.37037037037037035, + "acc_norm_stderr": 0.02686949074481525 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.30141843971631205, + "acc_stderr": 0.027374128882631146, + "acc_norm": 0.30141843971631205, + 
"acc_norm_stderr": 0.027374128882631146 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.2920469361147327, + "acc_stderr": 0.011613349136271808, + "acc_norm": 0.2920469361147327, + "acc_norm_stderr": 0.011613349136271808 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.2426470588235294, + "acc_stderr": 0.026040662474201268, + "acc_norm": 0.2426470588235294, + "acc_norm_stderr": 0.026040662474201268 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.31862745098039214, + "acc_stderr": 0.018850084696468712, + "acc_norm": 0.31862745098039214, + "acc_norm_stderr": 0.018850084696468712 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.4, + "acc_stderr": 0.0469237132203465, + "acc_norm": 0.4, + "acc_norm_stderr": 0.0469237132203465 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.3673469387755102, + "acc_stderr": 0.030862144921087558, + "acc_norm": 0.3673469387755102, + "acc_norm_stderr": 0.030862144921087558 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.4228855721393035, + "acc_stderr": 0.034932317774212816, + "acc_norm": 0.4228855721393035, + "acc_norm_stderr": 0.034932317774212816 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.42, + "acc_stderr": 0.049604496374885836, + "acc_norm": 0.42, + "acc_norm_stderr": 0.049604496374885836 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.3132530120481928, + "acc_stderr": 0.03610805018031024, + "acc_norm": 0.3132530120481928, + "acc_norm_stderr": 0.03610805018031024 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.3333333333333333, + "acc_stderr": 0.03615507630310935, + "acc_norm": 0.3333333333333333, + "acc_norm_stderr": 0.03615507630310935 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.2741738066095471, + "mc1_stderr": 0.015616518497219371, + "mc2": 0.4408018939045002, + "mc2_stderr": 0.015451244968527669 + }, + "harness|winogrande|5": { + "acc": 0.585635359116022, + "acc_stderr": 0.013844846232268558 + }, + "harness|gsm8k|5": { + "acc": 0.07505686125852919, + "acc_stderr": 0.007257633145486642 + }, + "all": { + "acc": 0.33439260458192543, + "acc_stderr": 0.03318510046683825, + "acc_norm": 0.3375068000778248, + "acc_norm_stderr": 0.03395842042621018, + "mc1": 0.2741738066095471, + "mc1_stderr": 0.015616518497219371, + "mc2": 0.4408018939045002, + "mc2_stderr": 0.015451244968527669 + } + }, + "versions": { + "all": 0, + "harness|arc:challenge|25": 0, + "harness|gsm8k|5": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + 
"harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "harness|winogrande|5": 0 + }, + "config_tasks": { + "harness|arc:challenge": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM 
Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|arc:challenge|25": { + "hashes": { + "hash_examples": "17b0cae357c0259e", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "ca48d52265c0051f", + "hash_cont_tokens": "e8abf848493b50f7" + }, + "truncated": 0, + "non_truncated": 1172, + "padded": 4687, + "non_padded": 0, + "effective_few_shots": 25.0, + "num_truncated_few_shots": 0 + }, + "harness|hellaswag|10": { + "hashes": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "4975ded0ed31f702", + "hash_cont_tokens": "9fe0a5c42e1532db" + }, + "truncated": 0, + "non_truncated": 10042, + "padded": 40019, + "non_padded": 149, + "effective_few_shots": 10.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hashes": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "8ff523ec326d5d55", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-anatomy|5": { + "hashes": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + 
"hash_input_tokens": "742bd6a389a8ef40", + "hash_cont_tokens": "f11971a765cb609f" + }, + "truncated": 0, + "non_truncated": 135, + "padded": 540, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-astronomy|5": { + "hashes": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "aa9743839c83bd9f", + "hash_cont_tokens": "440a970fadecdc7b" + }, + "truncated": 0, + "non_truncated": 152, + "padded": 608, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-business_ethics|5": { + "hashes": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "60f6ed52e2a2987a", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hashes": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "6080d9f3c5930be0", + "hash_cont_tokens": "7ecd60c25b9bfe5b" + }, + "truncated": 0, + "non_truncated": 265, + "padded": 1060, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_biology|5": { + "hashes": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "873319724ad65589", + "hash_cont_tokens": "875cde3af7a0ee14" + }, + "truncated": 0, + "non_truncated": 144, + "padded": 564, + "non_padded": 12, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_chemistry|5": { + "hashes": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "8366d04d12b154a7", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_computer_science|5": { + "hashes": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "1724a282fb269fd7", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_mathematics|5": { + "hashes": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "b7aa815781eae172", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_medicine|5": { + "hashes": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "0003d13e86bc8c1a", + "hash_cont_tokens": "702fb6d82ff0d6ac" + }, + "truncated": 0, + "non_truncated": 173, + "padded": 692, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_physics|5": { + "hashes": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "32b28762dd077c78", + "hash_cont_tokens": "f7b8097afc16a47c" + }, + "truncated": 0, + "non_truncated": 102, + "padded": 404, + 
"non_padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-computer_security|5": { + "hashes": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "19dd0e1895125d49", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hashes": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "761c7ce187b3338a", + "hash_cont_tokens": "aa0e8bc655f2f641" + }, + "truncated": 0, + "non_truncated": 235, + "padded": 940, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-econometrics|5": { + "hashes": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "dae74024ebc12b2b", + "hash_cont_tokens": "b1cc6e7e9fcd3827" + }, + "truncated": 0, + "non_truncated": 114, + "padded": 456, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hashes": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "5fa8050688a246ed", + "hash_cont_tokens": "2425a3f084a591ef" + }, + "truncated": 0, + "non_truncated": 145, + "padded": 580, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hashes": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "2da3f8d7d1515cc6", + "hash_cont_tokens": "bd87bf0c060fd925" + }, + "truncated": 0, + "non_truncated": 378, + "padded": 1512, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-formal_logic|5": { + "hashes": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "907de61bbe46dada", + "hash_cont_tokens": "eb8932890e0605db" + }, + "truncated": 0, + "non_truncated": 126, + "padded": 504, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-global_facts|5": { + "hashes": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "d7549fe9ac133643", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_biology|5": { + "hashes": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "b449ae8cd622fb96", + "hash_cont_tokens": "1ddcb86d28cde266" + }, + "truncated": 0, + "non_truncated": 310, + "padded": 1240, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hashes": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "a447bd1574b5e26c", + "hash_cont_tokens": "176c8dcff38c5f8f" + }, + "truncated": 0, + "non_truncated": 203, + "padded": 812, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + 
"hashes": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "56312a0c3d85ae90", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hashes": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "5002f4ac8b1562ca", + "hash_cont_tokens": "674fc454bdc5ac93" + }, + "truncated": 0, + "non_truncated": 165, + "padded": 656, + "non_padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_geography|5": { + "hashes": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "b4f9efd054b0149d", + "hash_cont_tokens": "03a5012b916274ea" + }, + "truncated": 0, + "non_truncated": 198, + "padded": 792, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hashes": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "6e010d01707b5a01", + "hash_cont_tokens": "873d2aab226ba1d8" + }, + "truncated": 0, + "non_truncated": 193, + "padded": 772, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hashes": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "fc1f6e824ba386d7", + "hash_cont_tokens": "c583432ad27fcfe0" + }, + "truncated": 0, + "non_truncated": 390, + "padded": 1560, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hashes": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "3a485a40c8432ece", + "hash_cont_tokens": "d7907b61bcb8c123" + }, + "truncated": 0, + "non_truncated": 270, + "padded": 1080, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hashes": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "a7dd9ca4bbda3752", + "hash_cont_tokens": "f47f041de50333b9" + }, + "truncated": 0, + "non_truncated": 238, + "padded": 952, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_physics|5": { + "hashes": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "d7ea631399a73865", + "hash_cont_tokens": "0d56317b3e5eedb5" + }, + "truncated": 0, + "non_truncated": 151, + "padded": 604, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hashes": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "d12816cf88146011", + "hash_cont_tokens": "09ba1243e7390c0f" + }, + "truncated": 0, + "non_truncated": 545, + "padded": 2180, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hashes": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": 
"4c5c8be5aafac432", + "hash_input_tokens": "9763ecaef4814c21", + "hash_cont_tokens": "9cc29889c3d3f77d" + }, + "truncated": 0, + "non_truncated": 216, + "padded": 864, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hashes": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "c639cce12a46ebad", + "hash_cont_tokens": "cdd0b3dc06d933e5" + }, + "truncated": 0, + "non_truncated": 204, + "padded": 816, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hashes": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "b9762065cce6f3a6", + "hash_cont_tokens": "e02816433ff28daf" + }, + "truncated": 0, + "non_truncated": 237, + "padded": 948, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_aging|5": { + "hashes": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "84157fee0b6d0f3c", + "hash_cont_tokens": "142a4a8a1138a214" + }, + "truncated": 0, + "non_truncated": 223, + "padded": 892, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_sexuality|5": { + "hashes": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "ade303e1ae3c016f", + "hash_cont_tokens": "bc54813e809b796d" + }, + "truncated": 0, + "non_truncated": 131, + "padded": 524, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-international_law|5": { + "hashes": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "e5482e1c23c23d35", + "hash_cont_tokens": "8ea8c5ff76a15bca" + }, + "truncated": 0, + "non_truncated": 121, + "padded": 484, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-jurisprudence|5": { + "hashes": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "4415eeb9bad0507b", + "hash_cont_tokens": "e3a8cd951b6e3469" + }, + "truncated": 0, + "non_truncated": 108, + "padded": 432, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hashes": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "e6b5271422ecbaa8", + "hash_cont_tokens": "3e9e0bdc248fd88a" + }, + "truncated": 0, + "non_truncated": 163, + "padded": 644, + "non_padded": 8, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-machine_learning|5": { + "hashes": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "e719cb83196977d8", + "hash_cont_tokens": "55b12fb138c6a064" + }, + "truncated": 0, + "non_truncated": 112, + "padded": 448, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-management|5": { + "hashes": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "155da0e62b39e804", + "hash_cont_tokens": "a01d6d39a83c4597" + }, + "truncated": 0, + "non_truncated": 103, + 
"padded": 412, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-marketing|5": { + "hashes": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "38466c242259e6d3", + "hash_cont_tokens": "6aeaed4d823c98aa" + }, + "truncated": 0, + "non_truncated": 234, + "padded": 932, + "non_padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-medical_genetics|5": { + "hashes": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "0dd129e92538a7f6", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-miscellaneous|5": { + "hashes": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "d108a883fc3e022f", + "hash_cont_tokens": "9b0ab02a64603081" + }, + "truncated": 0, + "non_truncated": 783, + "padded": 3132, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_disputes|5": { + "hashes": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "0e7b7df82884a2d5", + "hash_cont_tokens": "3b8bbe9108e55ce9" + }, + "truncated": 0, + "non_truncated": 346, + "padded": 1364, + "non_padded": 20, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hashes": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "7c220f5613cd8426", + "hash_cont_tokens": "3e9bfc0362e97330" + }, + "truncated": 0, + "non_truncated": 895, + "padded": 3580, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-nutrition|5": { + "hashes": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "35de1609a9a763a9", + "hash_cont_tokens": "23b2dc6ee2da4cfc" + }, + "truncated": 0, + "non_truncated": 306, + "padded": 1224, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-philosophy|5": { + "hashes": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "a1dcfa9c80490d06", + "hash_cont_tokens": "9f6ff69d23a48783" + }, + "truncated": 0, + "non_truncated": 311, + "padded": 1244, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-prehistory|5": { + "hashes": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "a091cf645d2415e0", + "hash_cont_tokens": "d6458d743d875837" + }, + "truncated": 0, + "non_truncated": 324, + "padded": 1296, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_accounting|5": { + "hashes": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "e9df32a33f85290c", + "hash_cont_tokens": "922a195f53a35662" + }, + "truncated": 0, + "non_truncated": 282, + "padded": 1128, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_law|5": { + "hashes": { + 
"hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "c9f7583fff66d361", + "hash_cont_tokens": "2e590029ef41fbcd" + }, + "truncated": 0, + "non_truncated": 1534, + "padded": 6136, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_medicine|5": { + "hashes": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "40a933f829116f8d", + "hash_cont_tokens": "7cfee54dbddd5a98" + }, + "truncated": 0, + "non_truncated": 272, + "padded": 1088, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_psychology|5": { + "hashes": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "0f6a92c3a2062b48", + "hash_cont_tokens": "a86677b2a45c20e1" + }, + "truncated": 0, + "non_truncated": 612, + "padded": 2448, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-public_relations|5": { + "hashes": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "29a08e9bfbe9b2f0", + "hash_cont_tokens": "0d756ccaae031757" + }, + "truncated": 0, + "non_truncated": 110, + "padded": 440, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-security_studies|5": { + "hashes": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "32a03f1f22a6e103", + "hash_cont_tokens": "b2229bc2cfbf594b" + }, + "truncated": 0, + "non_truncated": 245, + "padded": 980, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-sociology|5": { + "hashes": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "1de5c52d2b2831d7", + "hash_cont_tokens": "c3a3bdfd177eed5b" + }, + "truncated": 0, + "non_truncated": 201, + "padded": 800, + "non_padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hashes": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "add924961f7f4146", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-virology|5": { + "hashes": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "e0653601c466b1bc", + "hash_cont_tokens": "af8b3658088cb37f" + }, + "truncated": 0, + "non_truncated": 166, + "padded": 664, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-world_religions|5": { + "hashes": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "ac600d612445156d", + "hash_cont_tokens": "060118bef6de4e0a" + }, + "truncated": 0, + "non_truncated": 171, + "padded": 684, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|truthfulqa:mc|0": { + "hashes": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "a03ce28b7fd06aa7", + "hash_cont_tokens": "f5da56a132aab151" + }, + 
"truncated": 0, + "non_truncated": 817, + "padded": 9996, + "non_padded": 0, + "effective_few_shots": 0.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "72067255e368e24e", + "hash_cont_tokens": "f08975ad6f2d5864" + }, + "truncated": 0, + "non_truncated": 1267, + "padded": 2534, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "bda342e47b5099b2", + "hash_cont_tokens": "9fb6baaf1bb8cac5" + }, + "truncated": 0, + "non_truncated": 1319, + "padded": 0, + "non_padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "3b7fa57a057f9415", + "hash_full_prompts": "63615fc50fc9417c", + "hash_input_tokens": "a8fa53915153e1db", + "hash_cont_tokens": "894715e9c12b081d" + }, + "truncated": 0, + "non_truncated": 28659, + "padded": 113348, + "non_padded": 1524, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/uukuguy/speechless-tora-code-7b-v1.0/results_2023-10-10T14-11-59.032357.json b/eval-results/uukuguy/speechless-tora-code-7b-v1.0/results_2023-10-10T14-11-59.032357.json new file mode 100644 index 0000000000000000000000000000000000000000..fab1b6434c2d94040c7b302b44b87a818fa93d0e --- /dev/null +++ b/eval-results/uukuguy/speechless-tora-code-7b-v1.0/results_2023-10-10T14-11-59.032357.json @@ -0,0 +1,1367 @@ +{ + "config_general": { + "model_name": "uukuguy/speechless-tora-code-7b-v1.0", + "model_sha": "f7b1f87a096045f1bba8f68c62e062102218717b", + "model_size": "12.8 GB", + "model_dtype": "torch.bfloat16", + "lighteval_sha": "0f318ecf002208468154899217b3ba7c6ae09374", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|arc:challenge|25": { + "acc": 0.3822525597269625, + "acc_stderr": 0.014200454049979284, + "acc_norm": 0.42662116040955633, + "acc_norm_stderr": 0.014453185592920293 + }, + "harness|hellaswag|10": { + "acc": 0.48904600677155946, + "acc_stderr": 0.0049885838203099185, + "acc_norm": 0.6515634335789683, + "acc_norm_stderr": 0.0047550132430221265 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.25, + "acc_stderr": 0.04351941398892446, + "acc_norm": 0.25, + "acc_norm_stderr": 0.04351941398892446 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.3111111111111111, + "acc_stderr": 0.03999262876617721, + "acc_norm": 0.3111111111111111, + "acc_norm_stderr": 0.03999262876617721 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.3684210526315789, + "acc_stderr": 0.03925523381052932, + "acc_norm": 0.3684210526315789, + "acc_norm_stderr": 0.03925523381052932 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.46, + "acc_stderr": 0.05009082659620332, + "acc_norm": 0.46, + "acc_norm_stderr": 0.05009082659620332 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.38113207547169814, + "acc_stderr": 0.029890609686286637, + "acc_norm": 0.38113207547169814, + "acc_norm_stderr": 0.029890609686286637 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.3680555555555556, + "acc_stderr": 0.040329990539607175, + "acc_norm": 0.3680555555555556, + "acc_norm_stderr": 0.040329990539607175 + }, + 
"harness|hendrycksTest-college_chemistry|5": { + "acc": 0.24, + "acc_stderr": 0.042923469599092816, + "acc_norm": 0.24, + "acc_norm_stderr": 0.042923469599092816 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.34, + "acc_stderr": 0.04760952285695236, + "acc_norm": 0.34, + "acc_norm_stderr": 0.04760952285695236 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.26, + "acc_stderr": 0.044084400227680794, + "acc_norm": 0.26, + "acc_norm_stderr": 0.044084400227680794 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.3236994219653179, + "acc_stderr": 0.0356760379963917, + "acc_norm": 0.3236994219653179, + "acc_norm_stderr": 0.0356760379963917 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.2647058823529412, + "acc_stderr": 0.043898699568087785, + "acc_norm": 0.2647058823529412, + "acc_norm_stderr": 0.043898699568087785 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.57, + "acc_stderr": 0.04975698519562428, + "acc_norm": 0.57, + "acc_norm_stderr": 0.04975698519562428 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.33191489361702126, + "acc_stderr": 0.03078373675774565, + "acc_norm": 0.33191489361702126, + "acc_norm_stderr": 0.03078373675774565 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.2982456140350877, + "acc_stderr": 0.04303684033537315, + "acc_norm": 0.2982456140350877, + "acc_norm_stderr": 0.04303684033537315 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.3586206896551724, + "acc_stderr": 0.03996629574876719, + "acc_norm": 0.3586206896551724, + "acc_norm_stderr": 0.03996629574876719 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.2724867724867725, + "acc_stderr": 0.022930973071633342, + "acc_norm": 0.2724867724867725, + "acc_norm_stderr": 0.022930973071633342 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.25396825396825395, + "acc_stderr": 0.03893259610604674, + "acc_norm": 0.25396825396825395, + "acc_norm_stderr": 0.03893259610604674 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.31, + "acc_stderr": 0.04648231987117316, + "acc_norm": 0.31, + "acc_norm_stderr": 0.04648231987117316 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.38387096774193546, + "acc_stderr": 0.02766618207553964, + "acc_norm": 0.38387096774193546, + "acc_norm_stderr": 0.02766618207553964 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.3103448275862069, + "acc_stderr": 0.03255086769970103, + "acc_norm": 0.3103448275862069, + "acc_norm_stderr": 0.03255086769970103 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.44, + "acc_stderr": 0.04988876515698589, + "acc_norm": 0.44, + "acc_norm_stderr": 0.04988876515698589 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.4909090909090909, + "acc_stderr": 0.03903698647748441, + "acc_norm": 0.4909090909090909, + "acc_norm_stderr": 0.03903698647748441 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.48484848484848486, + "acc_stderr": 0.0356071651653106, + "acc_norm": 0.48484848484848486, + "acc_norm_stderr": 0.0356071651653106 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.47150259067357514, + "acc_stderr": 0.03602573571288441, + "acc_norm": 0.47150259067357514, + "acc_norm_stderr": 0.03602573571288441 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.36153846153846153, + "acc_stderr": 0.024359581465396987, + "acc_norm": 
0.36153846153846153, + "acc_norm_stderr": 0.024359581465396987 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.2777777777777778, + "acc_stderr": 0.0273091405882302, + "acc_norm": 0.2777777777777778, + "acc_norm_stderr": 0.0273091405882302 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.3907563025210084, + "acc_stderr": 0.03169380235712997, + "acc_norm": 0.3907563025210084, + "acc_norm_stderr": 0.03169380235712997 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.25165562913907286, + "acc_stderr": 0.035433042343899844, + "acc_norm": 0.25165562913907286, + "acc_norm_stderr": 0.035433042343899844 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.45688073394495415, + "acc_stderr": 0.021357458785226213, + "acc_norm": 0.45688073394495415, + "acc_norm_stderr": 0.021357458785226213 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.2962962962962963, + "acc_stderr": 0.03114144782353602, + "acc_norm": 0.2962962962962963, + "acc_norm_stderr": 0.03114144782353602 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.4019607843137255, + "acc_stderr": 0.034411900234824655, + "acc_norm": 0.4019607843137255, + "acc_norm_stderr": 0.034411900234824655 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.5147679324894515, + "acc_stderr": 0.032533028078777386, + "acc_norm": 0.5147679324894515, + "acc_norm_stderr": 0.032533028078777386 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.4618834080717489, + "acc_stderr": 0.03346015011973228, + "acc_norm": 0.4618834080717489, + "acc_norm_stderr": 0.03346015011973228 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.3969465648854962, + "acc_stderr": 0.04291135671009223, + "acc_norm": 0.3969465648854962, + "acc_norm_stderr": 0.04291135671009223 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.5454545454545454, + "acc_stderr": 0.04545454545454548, + "acc_norm": 0.5454545454545454, + "acc_norm_stderr": 0.04545454545454548 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.4351851851851852, + "acc_stderr": 0.04792898170907062, + "acc_norm": 0.4351851851851852, + "acc_norm_stderr": 0.04792898170907062 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.4294478527607362, + "acc_stderr": 0.03889066619112722, + "acc_norm": 0.4294478527607362, + "acc_norm_stderr": 0.03889066619112722 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.32142857142857145, + "acc_stderr": 0.044328040552915185, + "acc_norm": 0.32142857142857145, + "acc_norm_stderr": 0.044328040552915185 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.5825242718446602, + "acc_stderr": 0.048828405482122375, + "acc_norm": 0.5825242718446602, + "acc_norm_stderr": 0.048828405482122375 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.6239316239316239, + "acc_stderr": 0.03173393632969482, + "acc_norm": 0.6239316239316239, + "acc_norm_stderr": 0.03173393632969482 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.41, + "acc_stderr": 0.04943110704237102, + "acc_norm": 0.41, + "acc_norm_stderr": 0.04943110704237102 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.4891443167305236, + "acc_stderr": 0.017875748840242407, + "acc_norm": 0.4891443167305236, + "acc_norm_stderr": 0.017875748840242407 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.4046242774566474, + "acc_stderr": 0.026424816594009852, + "acc_norm": 0.4046242774566474, + "acc_norm_stderr": 
0.026424816594009852 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.26145251396648045, + "acc_stderr": 0.014696599650364557, + "acc_norm": 0.26145251396648045, + "acc_norm_stderr": 0.014696599650364557 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.35947712418300654, + "acc_stderr": 0.027475969910660952, + "acc_norm": 0.35947712418300654, + "acc_norm_stderr": 0.027475969910660952 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.4212218649517685, + "acc_stderr": 0.028043399858210635, + "acc_norm": 0.4212218649517685, + "acc_norm_stderr": 0.028043399858210635 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.39814814814814814, + "acc_stderr": 0.027237415094592477, + "acc_norm": 0.39814814814814814, + "acc_norm_stderr": 0.027237415094592477 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.30851063829787234, + "acc_stderr": 0.027553366165101362, + "acc_norm": 0.30851063829787234, + "acc_norm_stderr": 0.027553366165101362 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.31421121251629724, + "acc_stderr": 0.011855911587048228, + "acc_norm": 0.31421121251629724, + "acc_norm_stderr": 0.011855911587048228 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.29044117647058826, + "acc_stderr": 0.027576468622740515, + "acc_norm": 0.29044117647058826, + "acc_norm_stderr": 0.027576468622740515 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.3480392156862745, + "acc_stderr": 0.019270998708223977, + "acc_norm": 0.3480392156862745, + "acc_norm_stderr": 0.019270998708223977 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.44545454545454544, + "acc_stderr": 0.047605488214603246, + "acc_norm": 0.44545454545454544, + "acc_norm_stderr": 0.047605488214603246 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.39183673469387753, + "acc_stderr": 0.031251275910891656, + "acc_norm": 0.39183673469387753, + "acc_norm_stderr": 0.031251275910891656 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.47761194029850745, + "acc_stderr": 0.035319879302087305, + "acc_norm": 0.47761194029850745, + "acc_norm_stderr": 0.035319879302087305 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.55, + "acc_stderr": 0.04999999999999999, + "acc_norm": 0.55, + "acc_norm_stderr": 0.04999999999999999 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.3674698795180723, + "acc_stderr": 0.03753267402120575, + "acc_norm": 0.3674698795180723, + "acc_norm_stderr": 0.03753267402120575 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.4502923976608187, + "acc_stderr": 0.038158273659132366, + "acc_norm": 0.4502923976608187, + "acc_norm_stderr": 0.038158273659132366 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.27539779681762544, + "mc1_stderr": 0.01563813566777552, + "mc2": 0.4205675471010907, + "mc2_stderr": 0.014623112128590065 + }, + "all": { + "acc": 0.3873136911648318, + "acc_stderr": 0.03488491861505594, + "acc_norm": 0.39082023400364535, + "acc_norm_stderr": 0.034885243377185654, + "mc1": 0.27539779681762544, + "mc1_stderr": 0.01563813566777552, + "mc2": 0.4205675471010907, + "mc2_stderr": 0.014623112128590065 + } + }, + "versions": { + "harness|arc:challenge|25": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + 
"harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "all": 0 + }, + "config_tasks": { + "harness|arc:challenge": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + 
"harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task" + }, + "summary_tasks": { + "harness|arc:challenge|25": { + "hashes": { + "hash_examples": "17b0cae357c0259e", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "3722289b79076c44", + "hash_cont_tokens": "e8abf848493b50f7" + }, + "truncated": 0, + "non-truncated": 4687, + "padded": 4687, + "non-padded": 0, + "effective_few_shots": 25.0, + "num_truncated_few_shots": 0 + }, + "harness|hellaswag|10": { + "hashes": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "ececd684171f1ef2", + "hash_cont_tokens": "9fe0a5c42e1532db" + }, 
+ "truncated": 0, + "non-truncated": 40168, + "padded": 40113, + "non-padded": 55, + "effective_few_shots": 10.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hashes": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "c54ff61ad0273dd7", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-anatomy|5": { + "hashes": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "be31a1e22aef5f90", + "hash_cont_tokens": "f11971a765cb609f" + }, + "truncated": 0, + "non-truncated": 540, + "padded": 540, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-astronomy|5": { + "hashes": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "277a7b1fad566940", + "hash_cont_tokens": "440a970fadecdc7b" + }, + "truncated": 0, + "non-truncated": 608, + "padded": 608, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-business_ethics|5": { + "hashes": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "ba552605bc116de5", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hashes": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "428c7563d0b98ab9", + "hash_cont_tokens": "7ecd60c25b9bfe5b" + }, + "truncated": 0, + "non-truncated": 1060, + "padded": 1060, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_biology|5": { + "hashes": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "da036601573942e2", + "hash_cont_tokens": "875cde3af7a0ee14" + }, + "truncated": 0, + "non-truncated": 576, + "padded": 576, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_chemistry|5": { + "hashes": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "94e0196d6aded13d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_computer_science|5": { + "hashes": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "6e4d0f4a8d36690b", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_mathematics|5": { + "hashes": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "614054d17109a25d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + 
"harness|hendrycksTest-college_medicine|5": { + "hashes": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "081bb2b524defd1c", + "hash_cont_tokens": "702fb6d82ff0d6ac" + }, + "truncated": 0, + "non-truncated": 692, + "padded": 692, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_physics|5": { + "hashes": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "5421d9a1af86cbd4", + "hash_cont_tokens": "f7b8097afc16a47c" + }, + "truncated": 0, + "non-truncated": 408, + "padded": 408, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-computer_security|5": { + "hashes": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "5e6b70ecb333cf18", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hashes": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "c2ef11a87264ceed", + "hash_cont_tokens": "aa0e8bc655f2f641" + }, + "truncated": 0, + "non-truncated": 940, + "padded": 940, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-econometrics|5": { + "hashes": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "ecaccd912a4c3978", + "hash_cont_tokens": "b1cc6e7e9fcd3827" + }, + "truncated": 0, + "non-truncated": 456, + "padded": 456, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hashes": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "1590c84291399be8", + "hash_cont_tokens": "2425a3f084a591ef" + }, + "truncated": 0, + "non-truncated": 580, + "padded": 580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hashes": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "3269597f715b0da1", + "hash_cont_tokens": "bd87bf0c060fd925" + }, + "truncated": 0, + "non-truncated": 1512, + "padded": 1512, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-formal_logic|5": { + "hashes": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "a2800d20f3ab8d7c", + "hash_cont_tokens": "eb8932890e0605db" + }, + "truncated": 0, + "non-truncated": 504, + "padded": 504, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-global_facts|5": { + "hashes": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "94ed44b3772505ad", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_biology|5": { + "hashes": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + 
"hash_input_tokens": "24423acb928db768", + "hash_cont_tokens": "1ddcb86d28cde266" + }, + "truncated": 0, + "non-truncated": 1240, + "padded": 1240, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hashes": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "831ff35c474e5cef", + "hash_cont_tokens": "176c8dcff38c5f8f" + }, + "truncated": 0, + "non-truncated": 812, + "padded": 812, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hashes": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "a20a96b44dcc5b30", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hashes": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "5002f4ac8b1562ca", + "hash_cont_tokens": "674fc454bdc5ac93" + }, + "truncated": 0, + "non-truncated": 660, + "padded": 656, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_geography|5": { + "hashes": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "7c5547c7da5bc793", + "hash_cont_tokens": "03a5012b916274ea" + }, + "truncated": 0, + "non-truncated": 792, + "padded": 792, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hashes": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "f62991cb6a496b05", + "hash_cont_tokens": "873d2aab226ba1d8" + }, + "truncated": 0, + "non-truncated": 772, + "padded": 772, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hashes": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "4cef2aff6e3d59ed", + "hash_cont_tokens": "c583432ad27fcfe0" + }, + "truncated": 0, + "non-truncated": 1560, + "padded": 1560, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hashes": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "6e2577ea4082ed2b", + "hash_cont_tokens": "d7907b61bcb8c123" + }, + "truncated": 0, + "non-truncated": 1080, + "padded": 1080, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hashes": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "c5fc9aeb1079c8e4", + "hash_cont_tokens": "f47f041de50333b9" + }, + "truncated": 0, + "non-truncated": 952, + "padded": 952, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_physics|5": { + "hashes": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "555fc385cffa84ca", + "hash_cont_tokens": 
"0d56317b3e5eedb5" + }, + "truncated": 0, + "non-truncated": 604, + "padded": 604, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hashes": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "febd23cbf9973b7f", + "hash_cont_tokens": "09ba1243e7390c0f" + }, + "truncated": 0, + "non-truncated": 2180, + "padded": 2180, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hashes": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "400e55b56ee6fbd7", + "hash_cont_tokens": "9cc29889c3d3f77d" + }, + "truncated": 0, + "non-truncated": 864, + "padded": 864, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hashes": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "c639cce12a46ebad", + "hash_cont_tokens": "cdd0b3dc06d933e5" + }, + "truncated": 0, + "non-truncated": 816, + "padded": 816, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hashes": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "b9762065cce6f3a6", + "hash_cont_tokens": "e02816433ff28daf" + }, + "truncated": 0, + "non-truncated": 948, + "padded": 948, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_aging|5": { + "hashes": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "541a75f071dcf579", + "hash_cont_tokens": "142a4a8a1138a214" + }, + "truncated": 0, + "non-truncated": 892, + "padded": 892, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_sexuality|5": { + "hashes": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "04269e5c5a257dd9", + "hash_cont_tokens": "bc54813e809b796d" + }, + "truncated": 0, + "non-truncated": 524, + "padded": 524, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-international_law|5": { + "hashes": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "d93ba9d9d38e4397", + "hash_cont_tokens": "8ea8c5ff76a15bca" + }, + "truncated": 0, + "non-truncated": 484, + "padded": 484, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-jurisprudence|5": { + "hashes": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "9eeaccd2698b4f5a", + "hash_cont_tokens": "e3a8cd951b6e3469" + }, + "truncated": 0, + "non-truncated": 432, + "padded": 432, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hashes": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "b4f08f544f2b7576", + "hash_cont_tokens": "3e9e0bdc248fd88a" + }, + "truncated": 0, + "non-truncated": 652, + "padded": 648, + "non-padded": 4, + "effective_few_shots": 5.0, + 
"num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-machine_learning|5": { + "hashes": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "900c2a51f1174b9f", + "hash_cont_tokens": "55b12fb138c6a064" + }, + "truncated": 0, + "non-truncated": 448, + "padded": 448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-management|5": { + "hashes": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "6b36efb4689c6eca", + "hash_cont_tokens": "a01d6d39a83c4597" + }, + "truncated": 0, + "non-truncated": 412, + "padded": 412, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-marketing|5": { + "hashes": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "2aaac78a0cfed47a", + "hash_cont_tokens": "6aeaed4d823c98aa" + }, + "truncated": 0, + "non-truncated": 936, + "padded": 936, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-medical_genetics|5": { + "hashes": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "886ca823b41c094a", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-miscellaneous|5": { + "hashes": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "72fd71de7675e7d0", + "hash_cont_tokens": "9b0ab02a64603081" + }, + "truncated": 0, + "non-truncated": 3132, + "padded": 3132, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_disputes|5": { + "hashes": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "f3ca0dd8e7a1eb09", + "hash_cont_tokens": "3b8bbe9108e55ce9" + }, + "truncated": 0, + "non-truncated": 1384, + "padded": 1354, + "non-padded": 30, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hashes": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "3e793631e951f23c", + "hash_cont_tokens": "3e9bfc0362e97330" + }, + "truncated": 0, + "non-truncated": 3580, + "padded": 3580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-nutrition|5": { + "hashes": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "59753c2144ea93af", + "hash_cont_tokens": "23b2dc6ee2da4cfc" + }, + "truncated": 0, + "non-truncated": 1224, + "padded": 1224, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-philosophy|5": { + "hashes": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "bd8d3dbed15a8c34", + "hash_cont_tokens": "9f6ff69d23a48783" + }, + "truncated": 0, + "non-truncated": 1244, + "padded": 1244, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-prehistory|5": { + "hashes": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + 
"hash_input_tokens": "3573cd87facbb7c5", + "hash_cont_tokens": "d6458d743d875837" + }, + "truncated": 0, + "non-truncated": 1296, + "padded": 1296, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_accounting|5": { + "hashes": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "17e721bc1a7cbb47", + "hash_cont_tokens": "922a195f53a35662" + }, + "truncated": 0, + "non-truncated": 1128, + "padded": 1128, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_law|5": { + "hashes": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "c9f7583fff66d361", + "hash_cont_tokens": "2e590029ef41fbcd" + }, + "truncated": 0, + "non-truncated": 6136, + "padded": 6136, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_medicine|5": { + "hashes": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "40a933f829116f8d", + "hash_cont_tokens": "7cfee54dbddd5a98" + }, + "truncated": 0, + "non-truncated": 1088, + "padded": 1088, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_psychology|5": { + "hashes": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "0dfb73a8eb3f692c", + "hash_cont_tokens": "a86677b2a45c20e1" + }, + "truncated": 0, + "non-truncated": 2448, + "padded": 2448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-public_relations|5": { + "hashes": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "1710c6ba4c9f3cbd", + "hash_cont_tokens": "0d756ccaae031757" + }, + "truncated": 0, + "non-truncated": 440, + "padded": 440, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-security_studies|5": { + "hashes": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "32a03f1f22a6e103", + "hash_cont_tokens": "b2229bc2cfbf594b" + }, + "truncated": 0, + "non-truncated": 980, + "padded": 980, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-sociology|5": { + "hashes": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "828999f7624cbe7e", + "hash_cont_tokens": "c3a3bdfd177eed5b" + }, + "truncated": 0, + "non-truncated": 804, + "padded": 804, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hashes": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "42054621e718dbee", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-virology|5": { + "hashes": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "6c4f0aa4dc859c04", + "hash_cont_tokens": "af8b3658088cb37f" + }, + "truncated": 0, + "non-truncated": 664, + "padded": 664, 
+ "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-world_religions|5": { + "hashes": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "6c75d44e092ff24f", + "hash_cont_tokens": "060118bef6de4e0a" + }, + "truncated": 0, + "non-truncated": 684, + "padded": 684, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|truthfulqa:mc|0": { + "hashes": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "2738d7ed7075faa7", + "hash_cont_tokens": "f5da56a132aab151" + }, + "truncated": 0, + "non-truncated": 9996, + "padded": 9996, + "non-padded": 0, + "effective_few_shots": 0.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "d84d18e9a963753d", + "hash_full_prompts": "12b540783521a8e6", + "hash_input_tokens": "5c73a7dce6ccf737", + "hash_cont_tokens": "71d56183130fecbd" + }, + "total_evaluation_time_secondes": "4547.296867847443", + "truncated": 0, + "non-truncated": 111019, + "padded": 110926, + "non-padded": 93, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/uukuguy/speechless-tora-code-7b-v1.0/results_2023-10-29T00-51-17.507006.json b/eval-results/uukuguy/speechless-tora-code-7b-v1.0/results_2023-10-29T00-51-17.507006.json new file mode 100644 index 0000000000000000000000000000000000000000..4280b1080d504c5e4be3e977cefdc827e9d40571 --- /dev/null +++ b/eval-results/uukuguy/speechless-tora-code-7b-v1.0/results_2023-10-29T00-51-17.507006.json @@ -0,0 +1,107 @@ +{ + "config_general": { + "model_name": "uukuguy/speechless-tora-code-7b-v1.0", + "model_sha": "4b4fac38530d4e63b599b2953e67408f58cf4bda", + "model_size": "12.8 GB", + "model_dtype": "torch.bfloat16", + "lighteval_sha": "0f318ecf002208468154899217b3ba7c6ae09374", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|drop|3": { + "em": 0.23468959731543623, + "em_stderr": 0.004340156396807698, + "f1": 0.2847546140939602, + "f1_stderr": 0.004356308687759715 + }, + "harness|gsm8k|5": { + "acc": 0.009097801364670205, + "acc_stderr": 0.0026153265107756725 + }, + "harness|winogrande|5": { + "acc": 0.6290449881610103, + "acc_stderr": 0.013576399902231568 + }, + "all": { + "em": 0.23468959731543623, + "em_stderr": 0.004340156396807698, + "f1": 0.2847546140939602, + "f1_stderr": 0.004356308687759715, + "acc": 0.31907139476284024, + "acc_stderr": 0.00809586320650362 + } + }, + "versions": { + "harness|drop|3": 1, + "harness|gsm8k|5": 0, + "harness|winogrande|5": 0, + "all": 0 + }, + "config_tasks": { + "harness|drop": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|drop|3": { + "hashes": { + "hash_examples": "1d27416e8324e9a3", + "hash_full_prompts": "a5513ff9a741b385", + "hash_input_tokens": "fbe785c62d941897", + "hash_cont_tokens": "1d29649196706ef9" + }, + "truncated": 0, + "non-truncated": 9536, + "padded": 0, + "non-padded": 9536, + "effective_few_shots": 3.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "bda342e47b5099b2", + "hash_cont_tokens": "7ddaa018eaa64583" + }, + "truncated": 0, + "non-truncated": 1319, + "padded": 0, + 
"non-padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "c0bedf98cb040854", + "hash_cont_tokens": "f08975ad6f2d5864" + }, + "truncated": 0, + "non-truncated": 2534, + "padded": 2432, + "non-padded": 102, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "9b4d8993161e637d", + "hash_full_prompts": "08215e527b7e60a5", + "hash_input_tokens": "9f72a6bc6743d18f", + "hash_cont_tokens": "61afeb5d72e8a863" + }, + "total_evaluation_time_secondes": "7262.791357517242", + "truncated": 0, + "non-truncated": 13389, + "padded": 2432, + "non-padded": 10957, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/uukuguy/zephyr-7b-alpha-dare-0.85/results_2023-12-04T16-03-30.985884.json b/eval-results/uukuguy/zephyr-7b-alpha-dare-0.85/results_2023-12-04T16-03-30.985884.json new file mode 100644 index 0000000000000000000000000000000000000000..b8bc70bada95c540f0068ef2f56aa93e794162b7 --- /dev/null +++ b/eval-results/uukuguy/zephyr-7b-alpha-dare-0.85/results_2023-12-04T16-03-30.985884.json @@ -0,0 +1,1409 @@ +{ + "config_general": { + "lighteval_sha": "0e4607eff593f6f842aeaa0e5fa6760f58b9d1e9", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "", + "start_time": 153420.219658431, + "end_time": 160679.946138533, + "total_evaluation_time_secondes": "7259.726480102021", + "model_name": "uukuguy/zephyr-7b-alpha-dare-0.85", + "model_sha": "afe35301593b4ce2e7b5d1696066724ef1f802eb", + "model_dtype": "torch.float16", + "model_size": "13.99 GB" + }, + "results": { + "harness|arc:challenge|25": { + "acc": 0.5767918088737202, + "acc_stderr": 0.01443803622084803, + "acc_norm": 0.6117747440273038, + "acc_norm_stderr": 0.01424161420741405 + }, + "harness|hellaswag|10": { + "acc": 0.6387173869747063, + "acc_stderr": 0.004793904922401889, + "acc_norm": 0.8366859191396137, + "acc_norm_stderr": 0.0036889652317335197 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.35, + "acc_stderr": 0.04793724854411021, + "acc_norm": 0.35, + "acc_norm_stderr": 0.04793724854411021 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.6370370370370371, + "acc_stderr": 0.04153948404742398, + "acc_norm": 0.6370370370370371, + "acc_norm_stderr": 0.04153948404742398 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.6644736842105263, + "acc_stderr": 0.03842498559395268, + "acc_norm": 0.6644736842105263, + "acc_norm_stderr": 0.03842498559395268 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.57, + "acc_stderr": 0.049756985195624284, + "acc_norm": 0.57, + "acc_norm_stderr": 0.049756985195624284 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.6981132075471698, + "acc_stderr": 0.02825420034443866, + "acc_norm": 0.6981132075471698, + "acc_norm_stderr": 0.02825420034443866 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.7152777777777778, + "acc_stderr": 0.037738099906869334, + "acc_norm": 0.7152777777777778, + "acc_norm_stderr": 0.037738099906869334 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.49, + "acc_stderr": 0.05024183937956912, + "acc_norm": 0.49, + "acc_norm_stderr": 0.05024183937956912 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.51, + "acc_stderr": 0.05024183937956912, + 
"acc_norm": 0.51, + "acc_norm_stderr": 0.05024183937956912 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.4, + "acc_stderr": 0.04923659639173309, + "acc_norm": 0.4, + "acc_norm_stderr": 0.04923659639173309 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.6589595375722543, + "acc_stderr": 0.03614665424180826, + "acc_norm": 0.6589595375722543, + "acc_norm_stderr": 0.03614665424180826 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.4215686274509804, + "acc_stderr": 0.04913595201274498, + "acc_norm": 0.4215686274509804, + "acc_norm_stderr": 0.04913595201274498 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.79, + "acc_stderr": 0.04093601807403326, + "acc_norm": 0.79, + "acc_norm_stderr": 0.04093601807403326 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.5829787234042553, + "acc_stderr": 0.03223276266711712, + "acc_norm": 0.5829787234042553, + "acc_norm_stderr": 0.03223276266711712 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.45614035087719296, + "acc_stderr": 0.04685473041907789, + "acc_norm": 0.45614035087719296, + "acc_norm_stderr": 0.04685473041907789 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.5793103448275863, + "acc_stderr": 0.0411391498118926, + "acc_norm": 0.5793103448275863, + "acc_norm_stderr": 0.0411391498118926 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.3941798941798942, + "acc_stderr": 0.02516798233389414, + "acc_norm": 0.3941798941798942, + "acc_norm_stderr": 0.02516798233389414 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.42063492063492064, + "acc_stderr": 0.04415438226743744, + "acc_norm": 0.42063492063492064, + "acc_norm_stderr": 0.04415438226743744 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.36, + "acc_stderr": 0.04824181513244218, + "acc_norm": 0.36, + "acc_norm_stderr": 0.04824181513244218 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.7741935483870968, + "acc_stderr": 0.023785577884181015, + "acc_norm": 0.7741935483870968, + "acc_norm_stderr": 0.023785577884181015 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.5270935960591133, + "acc_stderr": 0.03512819077876106, + "acc_norm": 0.5270935960591133, + "acc_norm_stderr": 0.03512819077876106 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.66, + "acc_stderr": 0.04760952285695237, + "acc_norm": 0.66, + "acc_norm_stderr": 0.04760952285695237 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.7515151515151515, + "acc_stderr": 0.033744026441394036, + "acc_norm": 0.7515151515151515, + "acc_norm_stderr": 0.033744026441394036 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.7878787878787878, + "acc_stderr": 0.029126522834586808, + "acc_norm": 0.7878787878787878, + "acc_norm_stderr": 0.029126522834586808 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.8860103626943006, + "acc_stderr": 0.022935144053919443, + "acc_norm": 0.8860103626943006, + "acc_norm_stderr": 0.022935144053919443 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.6564102564102564, + "acc_stderr": 0.024078696580635477, + "acc_norm": 0.6564102564102564, + "acc_norm_stderr": 0.024078696580635477 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.3592592592592593, + "acc_stderr": 0.029252905927251976, + "acc_norm": 0.3592592592592593, + "acc_norm_stderr": 0.029252905927251976 + }, + 
"harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.6638655462184874, + "acc_stderr": 0.030684737115135356, + "acc_norm": 0.6638655462184874, + "acc_norm_stderr": 0.030684737115135356 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.33774834437086093, + "acc_stderr": 0.038615575462551684, + "acc_norm": 0.33774834437086093, + "acc_norm_stderr": 0.038615575462551684 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.818348623853211, + "acc_stderr": 0.016530617409266875, + "acc_norm": 0.818348623853211, + "acc_norm_stderr": 0.016530617409266875 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.5787037037037037, + "acc_stderr": 0.033674621388960775, + "acc_norm": 0.5787037037037037, + "acc_norm_stderr": 0.033674621388960775 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.803921568627451, + "acc_stderr": 0.027865942286639318, + "acc_norm": 0.803921568627451, + "acc_norm_stderr": 0.027865942286639318 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.7763713080168776, + "acc_stderr": 0.027123298205229966, + "acc_norm": 0.7763713080168776, + "acc_norm_stderr": 0.027123298205229966 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.6905829596412556, + "acc_stderr": 0.03102441174057222, + "acc_norm": 0.6905829596412556, + "acc_norm_stderr": 0.03102441174057222 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.7786259541984732, + "acc_stderr": 0.036412970813137296, + "acc_norm": 0.7786259541984732, + "acc_norm_stderr": 0.036412970813137296 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.7851239669421488, + "acc_stderr": 0.037494924487096966, + "acc_norm": 0.7851239669421488, + "acc_norm_stderr": 0.037494924487096966 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.7685185185185185, + "acc_stderr": 0.04077494709252627, + "acc_norm": 0.7685185185185185, + "acc_norm_stderr": 0.04077494709252627 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.7791411042944786, + "acc_stderr": 0.03259177392742178, + "acc_norm": 0.7791411042944786, + "acc_norm_stderr": 0.03259177392742178 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.48214285714285715, + "acc_stderr": 0.047427623612430116, + "acc_norm": 0.48214285714285715, + "acc_norm_stderr": 0.047427623612430116 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.8252427184466019, + "acc_stderr": 0.03760178006026621, + "acc_norm": 0.8252427184466019, + "acc_norm_stderr": 0.03760178006026621 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.8803418803418803, + "acc_stderr": 0.021262719400406943, + "acc_norm": 0.8803418803418803, + "acc_norm_stderr": 0.021262719400406943 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.76, + "acc_stderr": 0.042923469599092816, + "acc_norm": 0.76, + "acc_norm_stderr": 0.042923469599092816 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.8173690932311622, + "acc_stderr": 0.013816335389973136, + "acc_norm": 0.8173690932311622, + "acc_norm_stderr": 0.013816335389973136 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.7196531791907514, + "acc_stderr": 0.024182427496577615, + "acc_norm": 0.7196531791907514, + "acc_norm_stderr": 0.024182427496577615 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.3139664804469274, + "acc_stderr": 0.015521923933523642, + "acc_norm": 0.3139664804469274, + "acc_norm_stderr": 0.015521923933523642 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 
0.7581699346405228, + "acc_stderr": 0.024518195641879334, + "acc_norm": 0.7581699346405228, + "acc_norm_stderr": 0.024518195641879334 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.7009646302250804, + "acc_stderr": 0.02600330111788514, + "acc_norm": 0.7009646302250804, + "acc_norm_stderr": 0.02600330111788514 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.7283950617283951, + "acc_stderr": 0.02474862449053737, + "acc_norm": 0.7283950617283951, + "acc_norm_stderr": 0.02474862449053737 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.48226950354609927, + "acc_stderr": 0.02980873964223777, + "acc_norm": 0.48226950354609927, + "acc_norm_stderr": 0.02980873964223777 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.4426336375488918, + "acc_stderr": 0.012685906538206242, + "acc_norm": 0.4426336375488918, + "acc_norm_stderr": 0.012685906538206242 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.6801470588235294, + "acc_stderr": 0.02833295951403121, + "acc_norm": 0.6801470588235294, + "acc_norm_stderr": 0.02833295951403121 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.6683006535947712, + "acc_stderr": 0.019047485239360378, + "acc_norm": 0.6683006535947712, + "acc_norm_stderr": 0.019047485239360378 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.6818181818181818, + "acc_stderr": 0.04461272175910509, + "acc_norm": 0.6818181818181818, + "acc_norm_stderr": 0.04461272175910509 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.7224489795918367, + "acc_stderr": 0.028666857790274648, + "acc_norm": 0.7224489795918367, + "acc_norm_stderr": 0.028666857790274648 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.835820895522388, + "acc_stderr": 0.026193923544454125, + "acc_norm": 0.835820895522388, + "acc_norm_stderr": 0.026193923544454125 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.89, + "acc_stderr": 0.03144660377352203, + "acc_norm": 0.89, + "acc_norm_stderr": 0.03144660377352203 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.5542168674698795, + "acc_stderr": 0.03869543323472101, + "acc_norm": 0.5542168674698795, + "acc_norm_stderr": 0.03869543323472101 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.8245614035087719, + "acc_stderr": 0.029170885500727668, + "acc_norm": 0.8245614035087719, + "acc_norm_stderr": 0.029170885500727668 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.29498164014687883, + "mc1_stderr": 0.015964400965589657, + "mc2": 0.4441404853042373, + "mc2_stderr": 0.014450558004670922 + }, + "harness|winogrande|5": { + "acc": 0.7845303867403315, + "acc_stderr": 0.011555295286059282 + }, + "harness|gsm8k|5": { + "acc": 0.42077331311599697, + "acc_stderr": 0.013598489497182837 + }, + "all": { + "acc": 0.6405125012890543, + "acc_stderr": 0.0322440782989453, + "acc_norm": 0.6457442431541438, + "acc_norm_stderr": 0.032888705588954556, + "mc1": 0.29498164014687883, + "mc1_stderr": 0.015964400965589657, + "mc2": 0.4441404853042373, + "mc2_stderr": 0.014450558004670922 + } + }, + "versions": { + "all": 0, + "harness|arc:challenge|25": 0, + "harness|gsm8k|5": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + 
"harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "harness|winogrande|5": 0 + }, + "config_tasks": { + "harness|arc:challenge": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness 
task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|arc:challenge|25": { + "hashes": { + "hash_examples": "17b0cae357c0259e", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "9bcd0d1d37471713", + "hash_cont_tokens": "289aa98c400841d8" + }, + "truncated": 0, + "non_truncated": 1172, + "padded": 4670, + "non_padded": 17, + "effective_few_shots": 25.0, + "num_truncated_few_shots": 0 + }, + "harness|hellaswag|10": { + "hashes": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "80b8c6d79740318e", + "hash_cont_tokens": "ac460260c3e6efc9" + }, + 
"truncated": 0, + "non_truncated": 10042, + "padded": 40101, + "non_padded": 67, + "effective_few_shots": 10.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hashes": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "b813d36287c6556c", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-anatomy|5": { + "hashes": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "09dc2380497f7a47", + "hash_cont_tokens": "a52a4f60d98cbe5c" + }, + "truncated": 0, + "non_truncated": 135, + "padded": 540, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-astronomy|5": { + "hashes": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "68ca3220b0fdd1f3", + "hash_cont_tokens": "10f7d8eeba97841d" + }, + "truncated": 0, + "non_truncated": 152, + "padded": 608, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-business_ethics|5": { + "hashes": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "bd14ef1320de241e", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hashes": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "d96186ab98017c43", + "hash_cont_tokens": "edef9975ba9165b5" + }, + "truncated": 0, + "non_truncated": 265, + "padded": 1060, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_biology|5": { + "hashes": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "424136b34e95b200", + "hash_cont_tokens": "0aa103ec6602280b" + }, + "truncated": 0, + "non_truncated": 144, + "padded": 576, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_chemistry|5": { + "hashes": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "8dd8b80e336bbe54", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_computer_science|5": { + "hashes": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "145d4cef8ca2261d", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_mathematics|5": { + "hashes": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "561995d32d2b25c4", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + 
"harness|hendrycksTest-college_medicine|5": { + "hashes": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "6a258a9d4418599c", + "hash_cont_tokens": "1979021dbc698754" + }, + "truncated": 0, + "non_truncated": 173, + "padded": 692, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_physics|5": { + "hashes": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "fa5e0d5b5f97b66a", + "hash_cont_tokens": "7cf7fe2bab00acbd" + }, + "truncated": 0, + "non_truncated": 102, + "padded": 408, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-computer_security|5": { + "hashes": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "07d27397edfae492", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hashes": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "da5e6c3c8eb17da6", + "hash_cont_tokens": "903f64eed2b0d217" + }, + "truncated": 0, + "non_truncated": 235, + "padded": 940, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-econometrics|5": { + "hashes": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "f6ba8e358bdb523e", + "hash_cont_tokens": "721ae6c5302c4bf2" + }, + "truncated": 0, + "non_truncated": 114, + "padded": 456, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hashes": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "b2459da4c5ca8590", + "hash_cont_tokens": "15a738960ed3e587" + }, + "truncated": 0, + "non_truncated": 145, + "padded": 575, + "non_padded": 5, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hashes": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "0b969d9ad706a13a", + "hash_cont_tokens": "c96470462fc71683" + }, + "truncated": 0, + "non_truncated": 378, + "padded": 1512, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-formal_logic|5": { + "hashes": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "02bc3eb5f90da86e", + "hash_cont_tokens": "0e1ce025c9d6ee7e" + }, + "truncated": 0, + "non_truncated": 126, + "padded": 504, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-global_facts|5": { + "hashes": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "3d5106918bcbeb43", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_biology|5": { + "hashes": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + 
"hash_input_tokens": "7b089392db2dabbd", + "hash_cont_tokens": "e34d57f7d3c4ca16" + }, + "truncated": 0, + "non_truncated": 310, + "padded": 1240, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hashes": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "ba90b2ffed1c067d", + "hash_cont_tokens": "e8482d44df4b3740" + }, + "truncated": 0, + "non_truncated": 203, + "padded": 812, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hashes": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "60eeec309ef0717f", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hashes": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "5e5e8bf3808e0ead", + "hash_cont_tokens": "d63e679a49418339" + }, + "truncated": 0, + "non_truncated": 165, + "padded": 656, + "non_padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_geography|5": { + "hashes": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "4da9b741d4e7ea78", + "hash_cont_tokens": "d78483e286d06f1a" + }, + "truncated": 0, + "non_truncated": 198, + "padded": 792, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hashes": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "acb4bc872ac86ed7", + "hash_cont_tokens": "691cdff71ff5fe57" + }, + "truncated": 0, + "non_truncated": 193, + "padded": 772, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hashes": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "840fc6403eb69ab0", + "hash_cont_tokens": "d5ad4c5bdca967ad" + }, + "truncated": 0, + "non_truncated": 390, + "padded": 1560, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hashes": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "3629a7f2cd17faeb", + "hash_cont_tokens": "8f631ca5687dd0d4" + }, + "truncated": 0, + "non_truncated": 270, + "padded": 1080, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hashes": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "6846f684260e3997", + "hash_cont_tokens": "7321048a28451473" + }, + "truncated": 0, + "non_truncated": 238, + "padded": 952, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_physics|5": { + "hashes": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "85aee25d6bdad94a", + "hash_cont_tokens": 
"bb137581f269861c" + }, + "truncated": 0, + "non_truncated": 151, + "padded": 604, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hashes": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "290b66d6d666a35f", + "hash_cont_tokens": "b455cab2675bd863" + }, + "truncated": 0, + "non_truncated": 545, + "padded": 2180, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hashes": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "a77a7668b437bc82", + "hash_cont_tokens": "1b3196fec7e58037" + }, + "truncated": 0, + "non_truncated": 216, + "padded": 864, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hashes": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "63548c7fa9ba7a78", + "hash_cont_tokens": "a331dedc2aa01b3e" + }, + "truncated": 0, + "non_truncated": 204, + "padded": 816, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hashes": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "83c5da18bfa50812", + "hash_cont_tokens": "d0fbe030b8c8c2bf" + }, + "truncated": 0, + "non_truncated": 237, + "padded": 948, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_aging|5": { + "hashes": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "bebbd11f22006685", + "hash_cont_tokens": "1dd29c3755494850" + }, + "truncated": 0, + "non_truncated": 223, + "padded": 892, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_sexuality|5": { + "hashes": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "7b85ee9b8ee54f4f", + "hash_cont_tokens": "c85573f663c10691" + }, + "truncated": 0, + "non_truncated": 131, + "padded": 524, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-international_law|5": { + "hashes": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "7bfc55ab7065943e", + "hash_cont_tokens": "d263804ba918154f" + }, + "truncated": 0, + "non_truncated": 121, + "padded": 484, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-jurisprudence|5": { + "hashes": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "69573f1675e053c6", + "hash_cont_tokens": "581986691a84ece8" + }, + "truncated": 0, + "non_truncated": 108, + "padded": 432, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hashes": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "552324ef20094bdc", + "hash_cont_tokens": "55a858b28bbda458" + }, + "truncated": 0, + "non_truncated": 163, + "padded": 652, + "non_padded": 0, + "effective_few_shots": 5.0, + 
"num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-machine_learning|5": { + "hashes": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "96449357a7318905", + "hash_cont_tokens": "e99d3d3efd4ac7a3" + }, + "truncated": 0, + "non_truncated": 112, + "padded": 448, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-management|5": { + "hashes": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "3b849249168e3b88", + "hash_cont_tokens": "13d9dc56bca34726" + }, + "truncated": 0, + "non_truncated": 103, + "padded": 412, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-marketing|5": { + "hashes": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "af0e186f2756b70d", + "hash_cont_tokens": "2700ea26933916a2" + }, + "truncated": 0, + "non_truncated": 234, + "padded": 936, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-medical_genetics|5": { + "hashes": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "9f6a6de16509b6d9", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-miscellaneous|5": { + "hashes": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "9194406d589f7c10", + "hash_cont_tokens": "7bf4341c79587250" + }, + "truncated": 0, + "non_truncated": 783, + "padded": 3132, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_disputes|5": { + "hashes": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "769486efc74d9f8e", + "hash_cont_tokens": "38a48e9de6976f00" + }, + "truncated": 0, + "non_truncated": 346, + "padded": 1384, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hashes": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "a90fd4dd90959dad", + "hash_cont_tokens": "761c4dc187689d89" + }, + "truncated": 0, + "non_truncated": 895, + "padded": 3580, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-nutrition|5": { + "hashes": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "1a3b843e66efd29b", + "hash_cont_tokens": "65005bd7d6f6012a" + }, + "truncated": 0, + "non_truncated": 306, + "padded": 1224, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-philosophy|5": { + "hashes": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "09820001a3d00013", + "hash_cont_tokens": "0b47934fb6314dec" + }, + "truncated": 0, + "non_truncated": 311, + "padded": 1244, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-prehistory|5": { + "hashes": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + 
"hash_input_tokens": "7c4ec364ce2768c7", + "hash_cont_tokens": "3f20acd855ee0a29" + }, + "truncated": 0, + "non_truncated": 324, + "padded": 1296, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_accounting|5": { + "hashes": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "ced0534574d0ae3f", + "hash_cont_tokens": "8f122ba881355d4b" + }, + "truncated": 0, + "non_truncated": 282, + "padded": 1128, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_law|5": { + "hashes": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "bcbdbbde22ec73e3", + "hash_cont_tokens": "90d5df417c4d3fd3" + }, + "truncated": 0, + "non_truncated": 1534, + "padded": 6136, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_medicine|5": { + "hashes": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "c54d753563114d45", + "hash_cont_tokens": "4a2d2988884f7f70" + }, + "truncated": 0, + "non_truncated": 272, + "padded": 1088, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_psychology|5": { + "hashes": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "b75dc55c0e32fa52", + "hash_cont_tokens": "e0a952cb8a9c81de" + }, + "truncated": 0, + "non_truncated": 612, + "padded": 2448, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-public_relations|5": { + "hashes": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "5ccdc8ec8db99622", + "hash_cont_tokens": "1fa77a8dff3922b8" + }, + "truncated": 0, + "non_truncated": 110, + "padded": 440, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-security_studies|5": { + "hashes": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "ca8497342e5b1d57", + "hash_cont_tokens": "81fc9cb3cbdd52db" + }, + "truncated": 0, + "non_truncated": 245, + "padded": 980, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-sociology|5": { + "hashes": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "069c76424fbd3dab", + "hash_cont_tokens": "2a0493252ed2cf43" + }, + "truncated": 0, + "non_truncated": 201, + "padded": 804, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hashes": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "a7e393a626169576", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-virology|5": { + "hashes": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "bf99dc973e3a650d", + "hash_cont_tokens": "5ab892d003b00c98" + }, + "truncated": 0, + "non_truncated": 166, + "padded": 664, + 
"non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-world_religions|5": { + "hashes": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "1761cfaf21797065", + "hash_cont_tokens": "15a5e5dbdfbb8568" + }, + "truncated": 0, + "non_truncated": 171, + "padded": 684, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|truthfulqa:mc|0": { + "hashes": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "298b43914bbdf4ca", + "hash_cont_tokens": "5a8d4bb398b1c3c0" + }, + "truncated": 0, + "non_truncated": 817, + "padded": 9996, + "non_padded": 0, + "effective_few_shots": 0.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "31aa3477d959f771", + "hash_cont_tokens": "618558fb93c0f288" + }, + "truncated": 0, + "non_truncated": 1267, + "padded": 2534, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "6af0ae8cfe684f50", + "hash_cont_tokens": "11ab19dbc40104f1" + }, + "truncated": 0, + "non_truncated": 1319, + "padded": 0, + "non_padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "3b7fa57a057f9415", + "hash_full_prompts": "63615fc50fc9417c", + "hash_input_tokens": "9c04e828ae29cacc", + "hash_cont_tokens": "5e3cca0936659bef" + }, + "truncated": 0, + "non_truncated": 28659, + "padded": 113460, + "non_padded": 1412, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/vihangd/dopeyplats-1.1b-2T-v1/results_2023-11-28T03-52-00.248474.json b/eval-results/vihangd/dopeyplats-1.1b-2T-v1/results_2023-11-28T03-52-00.248474.json new file mode 100644 index 0000000000000000000000000000000000000000..59c7ffa736089ee56592205f10e60fb29fb67797 --- /dev/null +++ b/eval-results/vihangd/dopeyplats-1.1b-2T-v1/results_2023-11-28T03-52-00.248474.json @@ -0,0 +1,1435 @@ +{ + "config_general": { + "lighteval_sha": "9ffc410f6c40b8cfefe7167cb47aefe69ced61e1", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "", + "start_time": 993942.160607091, + "end_time": 1003687.240007278, + "total_evaluation_time_secondes": "9745.079400187009", + "model_name": "vihangd/dopeyplats-1.1b-2T-v1", + "model_sha": "4ca47b470296de0e7bf3261e377aabaff9ad5c06", + "model_dtype": "torch.float16", + "model_size": "2.06 GB" + }, + "results": { + "harness|arc:challenge|25": { + "acc": 0.29436860068259385, + "acc_stderr": 0.013318528460539426, + "acc_norm": 0.3310580204778157, + "acc_norm_stderr": 0.01375206241981783 + }, + "harness|hellaswag|10": { + "acc": 0.41216889065923124, + "acc_stderr": 0.004912192800263313, + "acc_norm": 0.5431189006174069, + "acc_norm_stderr": 0.004971192387202447 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.3, + "acc_stderr": 0.046056618647183814, + "acc_norm": 0.3, + "acc_norm_stderr": 0.046056618647183814 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.2, + "acc_stderr": 0.034554737023254366, + "acc_norm": 0.2, + "acc_norm_stderr": 0.034554737023254366 + }, + "harness|hendrycksTest-astronomy|5": { + 
"acc": 0.17763157894736842, + "acc_stderr": 0.031103182383123398, + "acc_norm": 0.17763157894736842, + "acc_norm_stderr": 0.031103182383123398 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.31, + "acc_stderr": 0.04648231987117316, + "acc_norm": 0.31, + "acc_norm_stderr": 0.04648231987117316 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.23773584905660378, + "acc_stderr": 0.026199808807561925, + "acc_norm": 0.23773584905660378, + "acc_norm_stderr": 0.026199808807561925 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.2361111111111111, + "acc_stderr": 0.03551446610810826, + "acc_norm": 0.2361111111111111, + "acc_norm_stderr": 0.03551446610810826 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.2, + "acc_stderr": 0.04020151261036845, + "acc_norm": 0.2, + "acc_norm_stderr": 0.04020151261036845 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.23, + "acc_stderr": 0.04229525846816505, + "acc_norm": 0.23, + "acc_norm_stderr": 0.04229525846816505 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.24, + "acc_stderr": 0.04292346959909283, + "acc_norm": 0.24, + "acc_norm_stderr": 0.04292346959909283 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.26011560693641617, + "acc_stderr": 0.03345036916788992, + "acc_norm": 0.26011560693641617, + "acc_norm_stderr": 0.03345036916788992 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.21568627450980393, + "acc_stderr": 0.04092563958237654, + "acc_norm": 0.21568627450980393, + "acc_norm_stderr": 0.04092563958237654 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.31, + "acc_stderr": 0.04648231987117316, + "acc_norm": 0.31, + "acc_norm_stderr": 0.04648231987117316 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.251063829787234, + "acc_stderr": 0.02834696377716245, + "acc_norm": 0.251063829787234, + "acc_norm_stderr": 0.02834696377716245 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.2631578947368421, + "acc_stderr": 0.04142439719489361, + "acc_norm": 0.2631578947368421, + "acc_norm_stderr": 0.04142439719489361 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.23448275862068965, + "acc_stderr": 0.035306258743465914, + "acc_norm": 0.23448275862068965, + "acc_norm_stderr": 0.035306258743465914 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.19047619047619047, + "acc_stderr": 0.02022388031792385, + "acc_norm": 0.19047619047619047, + "acc_norm_stderr": 0.02022388031792385 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.18253968253968253, + "acc_stderr": 0.03455071019102149, + "acc_norm": 0.18253968253968253, + "acc_norm_stderr": 0.03455071019102149 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.18, + "acc_stderr": 0.038612291966536934, + "acc_norm": 0.18, + "acc_norm_stderr": 0.038612291966536934 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.2161290322580645, + "acc_stderr": 0.023415293433568532, + "acc_norm": 0.2161290322580645, + "acc_norm_stderr": 0.023415293433568532 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.18719211822660098, + "acc_stderr": 0.027444924966882618, + "acc_norm": 0.18719211822660098, + "acc_norm_stderr": 0.027444924966882618 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.23, + "acc_stderr": 0.04229525846816505, + "acc_norm": 0.23, + "acc_norm_stderr": 0.04229525846816505 + }, + 
"harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.19393939393939394, + "acc_stderr": 0.030874145136562097, + "acc_norm": 0.19393939393939394, + "acc_norm_stderr": 0.030874145136562097 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.16666666666666666, + "acc_stderr": 0.026552207828215293, + "acc_norm": 0.16666666666666666, + "acc_norm_stderr": 0.026552207828215293 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.25906735751295334, + "acc_stderr": 0.03161877917935411, + "acc_norm": 0.25906735751295334, + "acc_norm_stderr": 0.03161877917935411 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.30512820512820515, + "acc_stderr": 0.023346335293325884, + "acc_norm": 0.30512820512820515, + "acc_norm_stderr": 0.023346335293325884 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.24814814814814815, + "acc_stderr": 0.0263357394040558, + "acc_norm": 0.24814814814814815, + "acc_norm_stderr": 0.0263357394040558 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.2184873949579832, + "acc_stderr": 0.02684151432295893, + "acc_norm": 0.2184873949579832, + "acc_norm_stderr": 0.02684151432295893 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.23841059602649006, + "acc_stderr": 0.0347918557259966, + "acc_norm": 0.23841059602649006, + "acc_norm_stderr": 0.0347918557259966 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.21284403669724772, + "acc_stderr": 0.017549376389313694, + "acc_norm": 0.21284403669724772, + "acc_norm_stderr": 0.017549376389313694 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.3333333333333333, + "acc_stderr": 0.03214952147802749, + "acc_norm": 0.3333333333333333, + "acc_norm_stderr": 0.03214952147802749 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.2107843137254902, + "acc_stderr": 0.028626547912437378, + "acc_norm": 0.2107843137254902, + "acc_norm_stderr": 0.028626547912437378 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.270042194092827, + "acc_stderr": 0.028900721906293426, + "acc_norm": 0.270042194092827, + "acc_norm_stderr": 0.028900721906293426 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.3094170403587444, + "acc_stderr": 0.031024411740572206, + "acc_norm": 0.3094170403587444, + "acc_norm_stderr": 0.031024411740572206 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.2824427480916031, + "acc_stderr": 0.03948406125768361, + "acc_norm": 0.2824427480916031, + "acc_norm_stderr": 0.03948406125768361 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.2231404958677686, + "acc_stderr": 0.03800754475228733, + "acc_norm": 0.2231404958677686, + "acc_norm_stderr": 0.03800754475228733 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.26851851851851855, + "acc_stderr": 0.04284467968052192, + "acc_norm": 0.26851851851851855, + "acc_norm_stderr": 0.04284467968052192 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.3006134969325153, + "acc_stderr": 0.03602511318806771, + "acc_norm": 0.3006134969325153, + "acc_norm_stderr": 0.03602511318806771 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.35714285714285715, + "acc_stderr": 0.04547960999764376, + "acc_norm": 0.35714285714285715, + "acc_norm_stderr": 0.04547960999764376 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.18446601941747573, + "acc_stderr": 0.03840423627288276, + "acc_norm": 0.18446601941747573, + 
"acc_norm_stderr": 0.03840423627288276 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.27350427350427353, + "acc_stderr": 0.029202540153431177, + "acc_norm": 0.27350427350427353, + "acc_norm_stderr": 0.029202540153431177 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.3, + "acc_stderr": 0.046056618647183814, + "acc_norm": 0.3, + "acc_norm_stderr": 0.046056618647183814 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.2413793103448276, + "acc_stderr": 0.015302380123542089, + "acc_norm": 0.2413793103448276, + "acc_norm_stderr": 0.015302380123542089 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.24277456647398843, + "acc_stderr": 0.023083658586984204, + "acc_norm": 0.24277456647398843, + "acc_norm_stderr": 0.023083658586984204 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.23798882681564246, + "acc_stderr": 0.014242630070574915, + "acc_norm": 0.23798882681564246, + "acc_norm_stderr": 0.014242630070574915 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.2222222222222222, + "acc_stderr": 0.023805186524888146, + "acc_norm": 0.2222222222222222, + "acc_norm_stderr": 0.023805186524888146 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.21543408360128619, + "acc_stderr": 0.023350225475471418, + "acc_norm": 0.21543408360128619, + "acc_norm_stderr": 0.023350225475471418 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.21296296296296297, + "acc_stderr": 0.0227797190887334, + "acc_norm": 0.21296296296296297, + "acc_norm_stderr": 0.0227797190887334 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.23049645390070922, + "acc_stderr": 0.025123739226872402, + "acc_norm": 0.23049645390070922, + "acc_norm_stderr": 0.025123739226872402 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.242503259452412, + "acc_stderr": 0.010946570966348773, + "acc_norm": 0.242503259452412, + "acc_norm_stderr": 0.010946570966348773 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.3602941176470588, + "acc_stderr": 0.029163128570670733, + "acc_norm": 0.3602941176470588, + "acc_norm_stderr": 0.029163128570670733 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.25980392156862747, + "acc_stderr": 0.017740899509177788, + "acc_norm": 0.25980392156862747, + "acc_norm_stderr": 0.017740899509177788 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.2, + "acc_stderr": 0.03831305140884601, + "acc_norm": 0.2, + "acc_norm_stderr": 0.03831305140884601 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.24897959183673468, + "acc_stderr": 0.027682979522960234, + "acc_norm": 0.24897959183673468, + "acc_norm_stderr": 0.027682979522960234 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.24875621890547264, + "acc_stderr": 0.030567675938916707, + "acc_norm": 0.24875621890547264, + "acc_norm_stderr": 0.030567675938916707 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.27, + "acc_stderr": 0.0446196043338474, + "acc_norm": 0.27, + "acc_norm_stderr": 0.0446196043338474 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.23493975903614459, + "acc_stderr": 0.03300533186128922, + "acc_norm": 0.23493975903614459, + "acc_norm_stderr": 0.03300533186128922 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.3157894736842105, + "acc_stderr": 0.03565079670708311, + "acc_norm": 0.3157894736842105, + "acc_norm_stderr": 0.03565079670708311 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.20930232558139536, + "mc1_stderr": 
0.01424121943478583, + "mc2": 0.39259427469965563, + "mc2_stderr": 0.014650271626814744 + }, + "harness|winogrande|5": { + "acc": 0.5880031570639306, + "acc_stderr": 0.013833112857645928 + }, + "harness|drop|3": { + "em": 0.0008389261744966443, + "em_stderr": 0.0002964962989801258, + "f1": 0.049648699664429606, + "f1_stderr": 0.0012748159733943205 + }, + "harness|gsm8k|5": { + "acc": 0.0075815011372251705, + "acc_stderr": 0.0023892815120772374 + }, + "all": { + "acc": 0.25073550713548176, + "acc_stderr": 0.030520589098567856, + "acc_norm": 0.2519817068951971, + "acc_norm_stderr": 0.03128857752862938, + "mc1": 0.20930232558139536, + "mc1_stderr": 0.01424121943478583, + "mc2": 0.39259427469965563, + "mc2_stderr": 0.014650271626814744, + "em": 0.0008389261744966443, + "em_stderr": 0.0002964962989801258, + "f1": 0.049648699664429606, + "f1_stderr": 0.0012748159733943205 + } + }, + "versions": { + "all": 0, + "harness|arc:challenge|25": 0, + "harness|drop|3": 1, + "harness|gsm8k|5": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + 
"harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "harness|winogrande|5": 0 + }, + "config_tasks": { + "harness|arc:challenge": "LM Harness task", + "harness|drop": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + 
"harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|arc:challenge|25": { + "hashes": { + "hash_examples": "17b0cae357c0259e", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "c2d55d68c4441c39", + "hash_cont_tokens": "e8abf848493b50f7" + }, + "truncated": 0, + "non_truncated": 1172, + "padded": 4687, + "non_padded": 0, + "effective_few_shots": 25.0, + "num_truncated_few_shots": 0 + }, + "harness|hellaswag|10": { + "hashes": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "38dc8458e001ab84", + "hash_cont_tokens": "9fe0a5c42e1532db" + }, + "truncated": 0, + "non_truncated": 10042, + "padded": 40019, + "non_padded": 149, + "effective_few_shots": 10.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hashes": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "8ff523ec326d5d55", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-anatomy|5": { + "hashes": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "742bd6a389a8ef40", + "hash_cont_tokens": "f11971a765cb609f" + }, + "truncated": 0, + "non_truncated": 135, + "padded": 540, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-astronomy|5": { + "hashes": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "aa9743839c83bd9f", + "hash_cont_tokens": "440a970fadecdc7b" + }, + "truncated": 0, + "non_truncated": 152, + "padded": 608, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-business_ethics|5": { + "hashes": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "60f6ed52e2a2987a", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hashes": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "6080d9f3c5930be0", + "hash_cont_tokens": "7ecd60c25b9bfe5b" + }, + "truncated": 0, + "non_truncated": 265, + "padded": 1060, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_biology|5": { + "hashes": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + 
"hash_input_tokens": "873319724ad65589", + "hash_cont_tokens": "875cde3af7a0ee14" + }, + "truncated": 0, + "non_truncated": 144, + "padded": 564, + "non_padded": 12, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_chemistry|5": { + "hashes": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "8366d04d12b154a7", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_computer_science|5": { + "hashes": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "1724a282fb269fd7", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_mathematics|5": { + "hashes": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "b7aa815781eae172", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_medicine|5": { + "hashes": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "5e69bf9422c979cd", + "hash_cont_tokens": "702fb6d82ff0d6ac" + }, + "truncated": 0, + "non_truncated": 173, + "padded": 692, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_physics|5": { + "hashes": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "32b28762dd077c78", + "hash_cont_tokens": "f7b8097afc16a47c" + }, + "truncated": 0, + "non_truncated": 102, + "padded": 404, + "non_padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-computer_security|5": { + "hashes": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "19dd0e1895125d49", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hashes": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "761c7ce187b3338a", + "hash_cont_tokens": "aa0e8bc655f2f641" + }, + "truncated": 0, + "non_truncated": 235, + "padded": 940, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-econometrics|5": { + "hashes": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "dae74024ebc12b2b", + "hash_cont_tokens": "b1cc6e7e9fcd3827" + }, + "truncated": 0, + "non_truncated": 114, + "padded": 456, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hashes": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "5fa8050688a246ed", + "hash_cont_tokens": "2425a3f084a591ef" + }, + "truncated": 0, + "non_truncated": 145, + "padded": 
580, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hashes": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "2da3f8d7d1515cc6", + "hash_cont_tokens": "bd87bf0c060fd925" + }, + "truncated": 0, + "non_truncated": 378, + "padded": 1512, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-formal_logic|5": { + "hashes": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "907de61bbe46dada", + "hash_cont_tokens": "eb8932890e0605db" + }, + "truncated": 0, + "non_truncated": 126, + "padded": 504, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-global_facts|5": { + "hashes": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "d7549fe9ac133643", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_biology|5": { + "hashes": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "b449ae8cd622fb96", + "hash_cont_tokens": "1ddcb86d28cde266" + }, + "truncated": 0, + "non_truncated": 310, + "padded": 1240, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hashes": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "a447bd1574b5e26c", + "hash_cont_tokens": "176c8dcff38c5f8f" + }, + "truncated": 0, + "non_truncated": 203, + "padded": 812, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hashes": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "55065fe953492209", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hashes": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "f1f73dd687da18d7", + "hash_cont_tokens": "674fc454bdc5ac93" + }, + "truncated": 660, + "non_truncated": -495, + "padded": 0, + "non_padded": 660, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_geography|5": { + "hashes": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "b4f9efd054b0149d", + "hash_cont_tokens": "03a5012b916274ea" + }, + "truncated": 0, + "non_truncated": 198, + "padded": 792, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hashes": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "6e010d01707b5a01", + "hash_cont_tokens": "873d2aab226ba1d8" + }, + "truncated": 0, + "non_truncated": 193, + "padded": 772, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + 
"harness|hendrycksTest-high_school_macroeconomics|5": { + "hashes": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "fc1f6e824ba386d7", + "hash_cont_tokens": "c583432ad27fcfe0" + }, + "truncated": 0, + "non_truncated": 390, + "padded": 1560, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hashes": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "3a485a40c8432ece", + "hash_cont_tokens": "d7907b61bcb8c123" + }, + "truncated": 0, + "non_truncated": 270, + "padded": 1080, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hashes": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "a7dd9ca4bbda3752", + "hash_cont_tokens": "f47f041de50333b9" + }, + "truncated": 0, + "non_truncated": 238, + "padded": 952, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_physics|5": { + "hashes": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "d7ea631399a73865", + "hash_cont_tokens": "0d56317b3e5eedb5" + }, + "truncated": 0, + "non_truncated": 151, + "padded": 604, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hashes": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "d12816cf88146011", + "hash_cont_tokens": "09ba1243e7390c0f" + }, + "truncated": 0, + "non_truncated": 545, + "padded": 2180, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hashes": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "0903f3aba4ea094f", + "hash_cont_tokens": "9cc29889c3d3f77d" + }, + "truncated": 0, + "non_truncated": 216, + "padded": 864, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hashes": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "50c9ff438c85a69e", + "hash_cont_tokens": "cdd0b3dc06d933e5" + }, + "truncated": 816, + "non_truncated": -612, + "padded": 0, + "non_padded": 816, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hashes": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "054824cc474caef5", + "hash_cont_tokens": "e02816433ff28daf" + }, + "truncated": 8, + "non_truncated": 229, + "padded": 940, + "non_padded": 8, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_aging|5": { + "hashes": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "84157fee0b6d0f3c", + "hash_cont_tokens": "142a4a8a1138a214" + }, + "truncated": 0, + "non_truncated": 223, + "padded": 892, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_sexuality|5": { + "hashes": { + "hash_examples": "7acb8fdad97f88a6", + 
"hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "ade303e1ae3c016f", + "hash_cont_tokens": "bc54813e809b796d" + }, + "truncated": 0, + "non_truncated": 131, + "padded": 524, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-international_law|5": { + "hashes": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "e5482e1c23c23d35", + "hash_cont_tokens": "8ea8c5ff76a15bca" + }, + "truncated": 0, + "non_truncated": 121, + "padded": 484, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-jurisprudence|5": { + "hashes": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "4415eeb9bad0507b", + "hash_cont_tokens": "e3a8cd951b6e3469" + }, + "truncated": 0, + "non_truncated": 108, + "padded": 432, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hashes": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "e6b5271422ecbaa8", + "hash_cont_tokens": "3e9e0bdc248fd88a" + }, + "truncated": 0, + "non_truncated": 163, + "padded": 644, + "non_padded": 8, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-machine_learning|5": { + "hashes": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "e719cb83196977d8", + "hash_cont_tokens": "55b12fb138c6a064" + }, + "truncated": 0, + "non_truncated": 112, + "padded": 448, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-management|5": { + "hashes": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "155da0e62b39e804", + "hash_cont_tokens": "a01d6d39a83c4597" + }, + "truncated": 0, + "non_truncated": 103, + "padded": 412, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-marketing|5": { + "hashes": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "38466c242259e6d3", + "hash_cont_tokens": "6aeaed4d823c98aa" + }, + "truncated": 0, + "non_truncated": 234, + "padded": 932, + "non_padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-medical_genetics|5": { + "hashes": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "0dd129e92538a7f6", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-miscellaneous|5": { + "hashes": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "d108a883fc3e022f", + "hash_cont_tokens": "9b0ab02a64603081" + }, + "truncated": 0, + "non_truncated": 783, + "padded": 3132, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_disputes|5": { + "hashes": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "0e7b7df82884a2d5", + "hash_cont_tokens": "3b8bbe9108e55ce9" + }, + "truncated": 0, + "non_truncated": 346, + 
"padded": 1364, + "non_padded": 20, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hashes": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "7c220f5613cd8426", + "hash_cont_tokens": "3e9bfc0362e97330" + }, + "truncated": 0, + "non_truncated": 895, + "padded": 3580, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-nutrition|5": { + "hashes": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "35de1609a9a763a9", + "hash_cont_tokens": "23b2dc6ee2da4cfc" + }, + "truncated": 0, + "non_truncated": 306, + "padded": 1224, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-philosophy|5": { + "hashes": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "a1dcfa9c80490d06", + "hash_cont_tokens": "9f6ff69d23a48783" + }, + "truncated": 0, + "non_truncated": 311, + "padded": 1244, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-prehistory|5": { + "hashes": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "a091cf645d2415e0", + "hash_cont_tokens": "d6458d743d875837" + }, + "truncated": 0, + "non_truncated": 324, + "padded": 1296, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_accounting|5": { + "hashes": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "e9df32a33f85290c", + "hash_cont_tokens": "922a195f53a35662" + }, + "truncated": 0, + "non_truncated": 282, + "padded": 1128, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_law|5": { + "hashes": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "9178e10bd0763ec4", + "hash_cont_tokens": "2e590029ef41fbcd" + }, + "truncated": 604, + "non_truncated": 930, + "padded": 5524, + "non_padded": 612, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_medicine|5": { + "hashes": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "f5a22012a54f70ea", + "hash_cont_tokens": "7cfee54dbddd5a98" + }, + "truncated": 0, + "non_truncated": 272, + "padded": 1088, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_psychology|5": { + "hashes": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "0f6a92c3a2062b48", + "hash_cont_tokens": "a86677b2a45c20e1" + }, + "truncated": 0, + "non_truncated": 612, + "padded": 2448, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-public_relations|5": { + "hashes": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "29a08e9bfbe9b2f0", + "hash_cont_tokens": "0d756ccaae031757" + }, + "truncated": 0, + "non_truncated": 110, + "padded": 440, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-security_studies|5": { 
+ "hashes": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "d49711415961ced7", + "hash_cont_tokens": "b2229bc2cfbf594b" + }, + "truncated": 0, + "non_truncated": 245, + "padded": 980, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-sociology|5": { + "hashes": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "1de5c52d2b2831d7", + "hash_cont_tokens": "c3a3bdfd177eed5b" + }, + "truncated": 0, + "non_truncated": 201, + "padded": 800, + "non_padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hashes": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "add924961f7f4146", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-virology|5": { + "hashes": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "e0653601c466b1bc", + "hash_cont_tokens": "af8b3658088cb37f" + }, + "truncated": 0, + "non_truncated": 166, + "padded": 664, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-world_religions|5": { + "hashes": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "ac600d612445156d", + "hash_cont_tokens": "060118bef6de4e0a" + }, + "truncated": 0, + "non_truncated": 171, + "padded": 684, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|truthfulqa:mc|0": { + "hashes": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "a03ce28b7fd06aa7", + "hash_cont_tokens": "f5da56a132aab151" + }, + "truncated": 0, + "non_truncated": 817, + "padded": 9996, + "non_padded": 0, + "effective_few_shots": 0.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "72067255e368e24e", + "hash_cont_tokens": "f08975ad6f2d5864" + }, + "truncated": 0, + "non_truncated": 1267, + "padded": 2534, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|drop|3": { + "hashes": { + "hash_examples": "1d27416e8324e9a3", + "hash_full_prompts": "a5513ff9a741b385", + "hash_input_tokens": "61b608e0b5ceed76", + "hash_cont_tokens": "756a6fea3904e4fe" + }, + "truncated": 1263, + "non_truncated": 8273, + "padded": 0, + "non_padded": 9536, + "effective_few_shots": 3.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "bda342e47b5099b2", + "hash_cont_tokens": "82d393161142a1bb" + }, + "truncated": 0, + "non_truncated": 1319, + "padded": 0, + "non_padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "4eb459f19fc0f29d", + "hash_full_prompts": "21653ed56f202b4e", + "hash_input_tokens": "6c2529964ad5cacf", + "hash_cont_tokens": "d18d600684739daa" + }, + "truncated": 3351, + "non_truncated": 34844, + "padded": 111256, + "non_padded": 13152, 
+ "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/vihangd/dopeyplats-1.1b-2T-v1/results_2023-12-02T14-28-36.585578.json b/eval-results/vihangd/dopeyplats-1.1b-2T-v1/results_2023-12-02T14-28-36.585578.json new file mode 100644 index 0000000000000000000000000000000000000000..5c50442806cd17ddadf63d440cf6c7462d7152a8 --- /dev/null +++ b/eval-results/vihangd/dopeyplats-1.1b-2T-v1/results_2023-12-02T14-28-36.585578.json @@ -0,0 +1,63 @@ +{ + "config_general": { + "lighteval_sha": "b35d4d84573be82d91c07ea46260f262f72cf69d", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "", + "start_time": 1413526.283569938, + "end_time": 1415619.743220035, + "total_evaluation_time_secondes": "2093.459650096949", + "model_name": "vihangd/dopeyplats-1.1b-2T-v1", + "model_sha": "4ca47b470296de0e7bf3261e377aabaff9ad5c06", + "model_dtype": "torch.float16", + "model_size": "2.06 GB" + }, + "results": { + "harness|gsm8k|5": { + "acc": 0.016679302501895376, + "acc_stderr": 0.0035275958887224295 + }, + "all": { + "acc": 0.016679302501895376, + "acc_stderr": 0.0035275958887224295 + } + }, + "versions": { + "all": 0, + "harness|gsm8k|5": 0 + }, + "config_tasks": { + "harness|gsm8k": "LM Harness task" + }, + "summary_tasks": { + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "bda342e47b5099b2", + "hash_cont_tokens": "82d393161142a1bb" + }, + "truncated": 0, + "non_truncated": 1319, + "padded": 0, + "non_padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "18b756b7813d1bdf", + "hash_full_prompts": "deb3b1dff10b95aa", + "hash_input_tokens": "42036645de5ac59d", + "hash_cont_tokens": "45484ad61494203f" + }, + "truncated": 0, + "non_truncated": 1319, + "padded": 0, + "non_padded": 1319, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/vihangd/dopeyshearedplats-1.3b-v1/results_2023-12-13T13-37-34.130815.json b/eval-results/vihangd/dopeyshearedplats-1.3b-v1/results_2023-12-13T13-37-34.130815.json new file mode 100644 index 0000000000000000000000000000000000000000..1e40e8f8173eca045baf1cb9bc5a7803087eb512 --- /dev/null +++ b/eval-results/vihangd/dopeyshearedplats-1.3b-v1/results_2023-12-13T13-37-34.130815.json @@ -0,0 +1,1409 @@ +{ + "config_general": { + "lighteval_sha": "0e4607eff593f6f842aeaa0e5fa6760f58b9d1e9", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "", + "start_time": 100874.996851918, + "end_time": 105614.133829549, + "total_evaluation_time_secondes": "4739.136977631002", + "model_name": "vihangd/dopeyshearedplats-1.3b-v1", + "model_sha": "45aa5d406bb6975deb801e5fffa27ca23e5724a5", + "model_dtype": "torch.float16", + "model_size": "2.55 GB" + }, + "results": { + "harness|arc:challenge|25": { + "acc": 0.3225255972696246, + "acc_stderr": 0.013659980894277368, + "acc_norm": 0.3438566552901024, + "acc_norm_stderr": 0.013880644570156215 + }, + "harness|hellaswag|10": { + "acc": 0.4848635729934276, + "acc_stderr": 0.004987494455523719, + "acc_norm": 0.6430989842660825, + "acc_norm_stderr": 0.004781061390873926 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.32, + "acc_stderr": 0.046882617226215034, + "acc_norm": 0.32, + "acc_norm_stderr": 0.046882617226215034 + }, + "harness|hendrycksTest-anatomy|5": { + 
"acc": 0.2, + "acc_stderr": 0.034554737023254394, + "acc_norm": 0.2, + "acc_norm_stderr": 0.034554737023254394 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.3223684210526316, + "acc_stderr": 0.03803510248351585, + "acc_norm": 0.3223684210526316, + "acc_norm_stderr": 0.03803510248351585 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.27, + "acc_stderr": 0.0446196043338474, + "acc_norm": 0.27, + "acc_norm_stderr": 0.0446196043338474 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.25660377358490566, + "acc_stderr": 0.026880647889051958, + "acc_norm": 0.25660377358490566, + "acc_norm_stderr": 0.026880647889051958 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.2777777777777778, + "acc_stderr": 0.037455547914624576, + "acc_norm": 0.2777777777777778, + "acc_norm_stderr": 0.037455547914624576 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.23, + "acc_stderr": 0.042295258468165044, + "acc_norm": 0.23, + "acc_norm_stderr": 0.042295258468165044 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.23, + "acc_stderr": 0.04229525846816507, + "acc_norm": 0.23, + "acc_norm_stderr": 0.04229525846816507 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.25, + "acc_stderr": 0.04351941398892446, + "acc_norm": 0.25, + "acc_norm_stderr": 0.04351941398892446 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.23699421965317918, + "acc_stderr": 0.03242414757483099, + "acc_norm": 0.23699421965317918, + "acc_norm_stderr": 0.03242414757483099 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.2549019607843137, + "acc_stderr": 0.04336432707993177, + "acc_norm": 0.2549019607843137, + "acc_norm_stderr": 0.04336432707993177 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.23, + "acc_stderr": 0.04229525846816508, + "acc_norm": 0.23, + "acc_norm_stderr": 0.04229525846816508 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.3404255319148936, + "acc_stderr": 0.03097669299853443, + "acc_norm": 0.3404255319148936, + "acc_norm_stderr": 0.03097669299853443 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.22807017543859648, + "acc_stderr": 0.03947152782669415, + "acc_norm": 0.22807017543859648, + "acc_norm_stderr": 0.03947152782669415 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.21379310344827587, + "acc_stderr": 0.03416520447747549, + "acc_norm": 0.21379310344827587, + "acc_norm_stderr": 0.03416520447747549 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.25132275132275134, + "acc_stderr": 0.022340482339643898, + "acc_norm": 0.25132275132275134, + "acc_norm_stderr": 0.022340482339643898 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.20634920634920634, + "acc_stderr": 0.036196045241242515, + "acc_norm": 0.20634920634920634, + "acc_norm_stderr": 0.036196045241242515 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.33, + "acc_stderr": 0.047258156262526045, + "acc_norm": 0.33, + "acc_norm_stderr": 0.047258156262526045 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.27419354838709675, + "acc_stderr": 0.025378139970885196, + "acc_norm": 0.27419354838709675, + "acc_norm_stderr": 0.025378139970885196 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.26108374384236455, + "acc_stderr": 0.030903796952114475, + "acc_norm": 0.26108374384236455, + "acc_norm_stderr": 0.030903796952114475 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 
0.24, + "acc_stderr": 0.042923469599092816, + "acc_norm": 0.24, + "acc_norm_stderr": 0.042923469599092816 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.28484848484848485, + "acc_stderr": 0.03524390844511784, + "acc_norm": 0.28484848484848485, + "acc_norm_stderr": 0.03524390844511784 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.2777777777777778, + "acc_stderr": 0.031911782267135466, + "acc_norm": 0.2777777777777778, + "acc_norm_stderr": 0.031911782267135466 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.25906735751295334, + "acc_stderr": 0.03161877917935409, + "acc_norm": 0.25906735751295334, + "acc_norm_stderr": 0.03161877917935409 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.3128205128205128, + "acc_stderr": 0.023507579020645333, + "acc_norm": 0.3128205128205128, + "acc_norm_stderr": 0.023507579020645333 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.25555555555555554, + "acc_stderr": 0.026593939101844082, + "acc_norm": 0.25555555555555554, + "acc_norm_stderr": 0.026593939101844082 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.226890756302521, + "acc_stderr": 0.027205371538279483, + "acc_norm": 0.226890756302521, + "acc_norm_stderr": 0.027205371538279483 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.2847682119205298, + "acc_stderr": 0.03684881521389024, + "acc_norm": 0.2847682119205298, + "acc_norm_stderr": 0.03684881521389024 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.25871559633027524, + "acc_stderr": 0.01877605231961962, + "acc_norm": 0.25871559633027524, + "acc_norm_stderr": 0.01877605231961962 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.39351851851851855, + "acc_stderr": 0.03331747876370312, + "acc_norm": 0.39351851851851855, + "acc_norm_stderr": 0.03331747876370312 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.29411764705882354, + "acc_stderr": 0.03198001660115071, + "acc_norm": 0.29411764705882354, + "acc_norm_stderr": 0.03198001660115071 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.21518987341772153, + "acc_stderr": 0.026750826994676166, + "acc_norm": 0.21518987341772153, + "acc_norm_stderr": 0.026750826994676166 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.34080717488789236, + "acc_stderr": 0.0318114974705536, + "acc_norm": 0.34080717488789236, + "acc_norm_stderr": 0.0318114974705536 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.2748091603053435, + "acc_stderr": 0.03915345408847834, + "acc_norm": 0.2748091603053435, + "acc_norm_stderr": 0.03915345408847834 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.2809917355371901, + "acc_stderr": 0.04103203830514512, + "acc_norm": 0.2809917355371901, + "acc_norm_stderr": 0.04103203830514512 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.2222222222222222, + "acc_stderr": 0.040191074725573483, + "acc_norm": 0.2222222222222222, + "acc_norm_stderr": 0.040191074725573483 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.22085889570552147, + "acc_stderr": 0.03259177392742178, + "acc_norm": 0.22085889570552147, + "acc_norm_stderr": 0.03259177392742178 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.22321428571428573, + "acc_stderr": 0.039523019677025116, + "acc_norm": 0.22321428571428573, + "acc_norm_stderr": 0.039523019677025116 + }, + 
"harness|hendrycksTest-management|5": { + "acc": 0.18446601941747573, + "acc_stderr": 0.03840423627288276, + "acc_norm": 0.18446601941747573, + "acc_norm_stderr": 0.03840423627288276 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.2564102564102564, + "acc_stderr": 0.028605953702004257, + "acc_norm": 0.2564102564102564, + "acc_norm_stderr": 0.028605953702004257 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.18, + "acc_stderr": 0.03861229196653696, + "acc_norm": 0.18, + "acc_norm_stderr": 0.03861229196653696 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.2669220945083014, + "acc_stderr": 0.015818450894777573, + "acc_norm": 0.2669220945083014, + "acc_norm_stderr": 0.015818450894777573 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.21676300578034682, + "acc_stderr": 0.022183477668412856, + "acc_norm": 0.21676300578034682, + "acc_norm_stderr": 0.022183477668412856 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.2446927374301676, + "acc_stderr": 0.014378169884098407, + "acc_norm": 0.2446927374301676, + "acc_norm_stderr": 0.014378169884098407 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.20261437908496732, + "acc_stderr": 0.023015446877985672, + "acc_norm": 0.20261437908496732, + "acc_norm_stderr": 0.023015446877985672 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.27009646302250806, + "acc_stderr": 0.025218040373410612, + "acc_norm": 0.27009646302250806, + "acc_norm_stderr": 0.025218040373410612 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.26851851851851855, + "acc_stderr": 0.024659685185967287, + "acc_norm": 0.26851851851851855, + "acc_norm_stderr": 0.024659685185967287 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.28368794326241137, + "acc_stderr": 0.02689170942834396, + "acc_norm": 0.28368794326241137, + "acc_norm_stderr": 0.02689170942834396 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.24967405475880053, + "acc_stderr": 0.011054538377832327, + "acc_norm": 0.24967405475880053, + "acc_norm_stderr": 0.011054538377832327 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.16176470588235295, + "acc_stderr": 0.022368672562886754, + "acc_norm": 0.16176470588235295, + "acc_norm_stderr": 0.022368672562886754 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.25163398692810457, + "acc_stderr": 0.017555818091322284, + "acc_norm": 0.25163398692810457, + "acc_norm_stderr": 0.017555818091322284 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.2, + "acc_stderr": 0.03831305140884601, + "acc_norm": 0.2, + "acc_norm_stderr": 0.03831305140884601 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.2693877551020408, + "acc_stderr": 0.02840125202902294, + "acc_norm": 0.2693877551020408, + "acc_norm_stderr": 0.02840125202902294 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.23880597014925373, + "acc_stderr": 0.03014777593540922, + "acc_norm": 0.23880597014925373, + "acc_norm_stderr": 0.03014777593540922 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.25, + "acc_stderr": 0.04351941398892446, + "acc_norm": 0.25, + "acc_norm_stderr": 0.04351941398892446 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.21686746987951808, + "acc_stderr": 0.03208284450356365, + "acc_norm": 0.21686746987951808, + "acc_norm_stderr": 0.03208284450356365 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.21637426900584794, + "acc_stderr": 0.03158149539338735, + "acc_norm": 
0.21637426900584794, + "acc_norm_stderr": 0.03158149539338735 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.2460220318237454, + "mc1_stderr": 0.01507721920066259, + "mc2": 0.3821066604136214, + "mc2_stderr": 0.015269097668070952 + }, + "harness|winogrande|5": { + "acc": 0.5737963693764798, + "acc_stderr": 0.013898585965412338 + }, + "harness|gsm8k|5": { + "acc": 0.0075815011372251705, + "acc_stderr": 0.002389281512077212 + }, + "all": { + "acc": 0.26012302704770085, + "acc_stderr": 0.030820336255728206, + "acc_norm": 0.2621303940455793, + "acc_norm_stderr": 0.031589269063273896, + "mc1": 0.2460220318237454, + "mc1_stderr": 0.01507721920066259, + "mc2": 0.3821066604136214, + "mc2_stderr": 0.015269097668070952 + } + }, + "versions": { + "all": 0, + "harness|arc:challenge|25": 0, + "harness|gsm8k|5": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + 
"harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "harness|winogrande|5": 0 + }, + "config_tasks": { + "harness|arc:challenge": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + 
"harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|arc:challenge|25": { + "hashes": { + "hash_examples": "17b0cae357c0259e", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "ca48d52265c0051f", + "hash_cont_tokens": "e8abf848493b50f7" + }, + "truncated": 0, + "non_truncated": 1172, + "padded": 4687, + "non_padded": 0, + "effective_few_shots": 25.0, + "num_truncated_few_shots": 0 + }, + "harness|hellaswag|10": { + "hashes": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "4975ded0ed31f702", + "hash_cont_tokens": "9fe0a5c42e1532db" + }, + "truncated": 0, + "non_truncated": 10042, + "padded": 40019, + "non_padded": 149, + "effective_few_shots": 10.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hashes": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "8ff523ec326d5d55", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-anatomy|5": { + "hashes": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "742bd6a389a8ef40", + "hash_cont_tokens": "f11971a765cb609f" + }, + "truncated": 0, + "non_truncated": 135, + "padded": 540, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-astronomy|5": { + "hashes": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "aa9743839c83bd9f", + "hash_cont_tokens": "440a970fadecdc7b" + }, + "truncated": 0, + "non_truncated": 152, + "padded": 608, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-business_ethics|5": { + "hashes": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "60f6ed52e2a2987a", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hashes": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "6080d9f3c5930be0", + "hash_cont_tokens": "7ecd60c25b9bfe5b" + }, + "truncated": 0, + "non_truncated": 265, + "padded": 1060, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_biology|5": { + "hashes": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "873319724ad65589", + "hash_cont_tokens": "875cde3af7a0ee14" + }, + "truncated": 0, + "non_truncated": 144, + "padded": 564, + "non_padded": 12, + "effective_few_shots": 5.0, + 
"num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_chemistry|5": { + "hashes": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "8366d04d12b154a7", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_computer_science|5": { + "hashes": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "1724a282fb269fd7", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_mathematics|5": { + "hashes": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "b7aa815781eae172", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_medicine|5": { + "hashes": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "0003d13e86bc8c1a", + "hash_cont_tokens": "702fb6d82ff0d6ac" + }, + "truncated": 0, + "non_truncated": 173, + "padded": 692, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_physics|5": { + "hashes": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "32b28762dd077c78", + "hash_cont_tokens": "f7b8097afc16a47c" + }, + "truncated": 0, + "non_truncated": 102, + "padded": 404, + "non_padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-computer_security|5": { + "hashes": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "19dd0e1895125d49", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hashes": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "761c7ce187b3338a", + "hash_cont_tokens": "aa0e8bc655f2f641" + }, + "truncated": 0, + "non_truncated": 235, + "padded": 940, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-econometrics|5": { + "hashes": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "dae74024ebc12b2b", + "hash_cont_tokens": "b1cc6e7e9fcd3827" + }, + "truncated": 0, + "non_truncated": 114, + "padded": 456, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hashes": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "5fa8050688a246ed", + "hash_cont_tokens": "2425a3f084a591ef" + }, + "truncated": 0, + "non_truncated": 145, + "padded": 580, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hashes": { + "hash_examples": "fc48208a5ac1c0ce", + 
"hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "2da3f8d7d1515cc6", + "hash_cont_tokens": "bd87bf0c060fd925" + }, + "truncated": 0, + "non_truncated": 378, + "padded": 1512, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-formal_logic|5": { + "hashes": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "907de61bbe46dada", + "hash_cont_tokens": "eb8932890e0605db" + }, + "truncated": 0, + "non_truncated": 126, + "padded": 504, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-global_facts|5": { + "hashes": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "d7549fe9ac133643", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_biology|5": { + "hashes": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "b449ae8cd622fb96", + "hash_cont_tokens": "1ddcb86d28cde266" + }, + "truncated": 0, + "non_truncated": 310, + "padded": 1240, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hashes": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "a447bd1574b5e26c", + "hash_cont_tokens": "176c8dcff38c5f8f" + }, + "truncated": 0, + "non_truncated": 203, + "padded": 812, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hashes": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "56312a0c3d85ae90", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hashes": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "5002f4ac8b1562ca", + "hash_cont_tokens": "674fc454bdc5ac93" + }, + "truncated": 0, + "non_truncated": 165, + "padded": 656, + "non_padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_geography|5": { + "hashes": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "b4f9efd054b0149d", + "hash_cont_tokens": "03a5012b916274ea" + }, + "truncated": 0, + "non_truncated": 198, + "padded": 792, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hashes": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "6e010d01707b5a01", + "hash_cont_tokens": "873d2aab226ba1d8" + }, + "truncated": 0, + "non_truncated": 193, + "padded": 772, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hashes": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "fc1f6e824ba386d7", + 
"hash_cont_tokens": "c583432ad27fcfe0" + }, + "truncated": 0, + "non_truncated": 390, + "padded": 1560, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hashes": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "3a485a40c8432ece", + "hash_cont_tokens": "d7907b61bcb8c123" + }, + "truncated": 0, + "non_truncated": 270, + "padded": 1080, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hashes": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "a7dd9ca4bbda3752", + "hash_cont_tokens": "f47f041de50333b9" + }, + "truncated": 0, + "non_truncated": 238, + "padded": 952, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_physics|5": { + "hashes": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "d7ea631399a73865", + "hash_cont_tokens": "0d56317b3e5eedb5" + }, + "truncated": 0, + "non_truncated": 151, + "padded": 604, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hashes": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "d12816cf88146011", + "hash_cont_tokens": "09ba1243e7390c0f" + }, + "truncated": 0, + "non_truncated": 545, + "padded": 2180, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hashes": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "9763ecaef4814c21", + "hash_cont_tokens": "9cc29889c3d3f77d" + }, + "truncated": 0, + "non_truncated": 216, + "padded": 864, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hashes": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "c639cce12a46ebad", + "hash_cont_tokens": "cdd0b3dc06d933e5" + }, + "truncated": 0, + "non_truncated": 204, + "padded": 816, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hashes": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "b9762065cce6f3a6", + "hash_cont_tokens": "e02816433ff28daf" + }, + "truncated": 0, + "non_truncated": 237, + "padded": 948, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_aging|5": { + "hashes": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "84157fee0b6d0f3c", + "hash_cont_tokens": "142a4a8a1138a214" + }, + "truncated": 0, + "non_truncated": 223, + "padded": 892, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_sexuality|5": { + "hashes": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "ade303e1ae3c016f", + "hash_cont_tokens": "bc54813e809b796d" + }, + "truncated": 0, + "non_truncated": 131, + "padded": 524, + 
"non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-international_law|5": { + "hashes": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "e5482e1c23c23d35", + "hash_cont_tokens": "8ea8c5ff76a15bca" + }, + "truncated": 0, + "non_truncated": 121, + "padded": 484, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-jurisprudence|5": { + "hashes": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "4415eeb9bad0507b", + "hash_cont_tokens": "e3a8cd951b6e3469" + }, + "truncated": 0, + "non_truncated": 108, + "padded": 432, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hashes": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "e6b5271422ecbaa8", + "hash_cont_tokens": "3e9e0bdc248fd88a" + }, + "truncated": 0, + "non_truncated": 163, + "padded": 644, + "non_padded": 8, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-machine_learning|5": { + "hashes": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "e719cb83196977d8", + "hash_cont_tokens": "55b12fb138c6a064" + }, + "truncated": 0, + "non_truncated": 112, + "padded": 448, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-management|5": { + "hashes": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "155da0e62b39e804", + "hash_cont_tokens": "a01d6d39a83c4597" + }, + "truncated": 0, + "non_truncated": 103, + "padded": 412, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-marketing|5": { + "hashes": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "38466c242259e6d3", + "hash_cont_tokens": "6aeaed4d823c98aa" + }, + "truncated": 0, + "non_truncated": 234, + "padded": 932, + "non_padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-medical_genetics|5": { + "hashes": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "0dd129e92538a7f6", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-miscellaneous|5": { + "hashes": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "d108a883fc3e022f", + "hash_cont_tokens": "9b0ab02a64603081" + }, + "truncated": 0, + "non_truncated": 783, + "padded": 3132, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_disputes|5": { + "hashes": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "0e7b7df82884a2d5", + "hash_cont_tokens": "3b8bbe9108e55ce9" + }, + "truncated": 0, + "non_truncated": 346, + "padded": 1364, + "non_padded": 20, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hashes": { + "hash_examples": 
"9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "7c220f5613cd8426", + "hash_cont_tokens": "3e9bfc0362e97330" + }, + "truncated": 0, + "non_truncated": 895, + "padded": 3580, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-nutrition|5": { + "hashes": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "35de1609a9a763a9", + "hash_cont_tokens": "23b2dc6ee2da4cfc" + }, + "truncated": 0, + "non_truncated": 306, + "padded": 1224, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-philosophy|5": { + "hashes": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "a1dcfa9c80490d06", + "hash_cont_tokens": "9f6ff69d23a48783" + }, + "truncated": 0, + "non_truncated": 311, + "padded": 1244, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-prehistory|5": { + "hashes": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "a091cf645d2415e0", + "hash_cont_tokens": "d6458d743d875837" + }, + "truncated": 0, + "non_truncated": 324, + "padded": 1296, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_accounting|5": { + "hashes": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "e9df32a33f85290c", + "hash_cont_tokens": "922a195f53a35662" + }, + "truncated": 0, + "non_truncated": 282, + "padded": 1128, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_law|5": { + "hashes": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "c9f7583fff66d361", + "hash_cont_tokens": "2e590029ef41fbcd" + }, + "truncated": 0, + "non_truncated": 1534, + "padded": 6136, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_medicine|5": { + "hashes": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "40a933f829116f8d", + "hash_cont_tokens": "7cfee54dbddd5a98" + }, + "truncated": 0, + "non_truncated": 272, + "padded": 1088, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_psychology|5": { + "hashes": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "0f6a92c3a2062b48", + "hash_cont_tokens": "a86677b2a45c20e1" + }, + "truncated": 0, + "non_truncated": 612, + "padded": 2448, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-public_relations|5": { + "hashes": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "29a08e9bfbe9b2f0", + "hash_cont_tokens": "0d756ccaae031757" + }, + "truncated": 0, + "non_truncated": 110, + "padded": 440, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-security_studies|5": { + "hashes": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "32a03f1f22a6e103", + "hash_cont_tokens": "b2229bc2cfbf594b" + }, 
+ "truncated": 0, + "non_truncated": 245, + "padded": 980, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-sociology|5": { + "hashes": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "1de5c52d2b2831d7", + "hash_cont_tokens": "c3a3bdfd177eed5b" + }, + "truncated": 0, + "non_truncated": 201, + "padded": 800, + "non_padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hashes": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "add924961f7f4146", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-virology|5": { + "hashes": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "e0653601c466b1bc", + "hash_cont_tokens": "af8b3658088cb37f" + }, + "truncated": 0, + "non_truncated": 166, + "padded": 664, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-world_religions|5": { + "hashes": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "ac600d612445156d", + "hash_cont_tokens": "060118bef6de4e0a" + }, + "truncated": 0, + "non_truncated": 171, + "padded": 684, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|truthfulqa:mc|0": { + "hashes": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "a03ce28b7fd06aa7", + "hash_cont_tokens": "f5da56a132aab151" + }, + "truncated": 0, + "non_truncated": 817, + "padded": 9996, + "non_padded": 0, + "effective_few_shots": 0.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "72067255e368e24e", + "hash_cont_tokens": "f08975ad6f2d5864" + }, + "truncated": 0, + "non_truncated": 1267, + "padded": 2534, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "bda342e47b5099b2", + "hash_cont_tokens": "27bf66427144a8f9" + }, + "truncated": 0, + "non_truncated": 1319, + "padded": 0, + "non_padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "3b7fa57a057f9415", + "hash_full_prompts": "63615fc50fc9417c", + "hash_input_tokens": "a8fa53915153e1db", + "hash_cont_tokens": "f4a45e36fde6c1e5" + }, + "truncated": 0, + "non_truncated": 28659, + "padded": 113348, + "non_padded": 1524, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/vihangd/dopeyshearedplats-2.7b-v1/results_2023-12-16T17-10-33.730644.json b/eval-results/vihangd/dopeyshearedplats-2.7b-v1/results_2023-12-16T17-10-33.730644.json new file mode 100644 index 0000000000000000000000000000000000000000..01daa59a9c47521fb789e27f38961f46ca122fd3 --- /dev/null +++ b/eval-results/vihangd/dopeyshearedplats-2.7b-v1/results_2023-12-16T17-10-33.730644.json @@ -0,0 +1,1409 @@ +{ + "config_general": { + "lighteval_sha": 
"0e4607eff593f6f842aeaa0e5fa6760f58b9d1e9", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "", + "start_time": 371675.002612858, + "end_time": 377590.613753801, + "total_evaluation_time_secondes": "5915.611140942958", + "model_name": "vihangd/dopeyshearedplats-2.7b-v1", + "model_sha": "c125218041c01662dc4c59b3f344aaa4e53dfd18", + "model_dtype": "torch.float16", + "model_size": "5.09 GB" + }, + "results": { + "harness|arc:challenge|25": { + "acc": 0.4121160409556314, + "acc_stderr": 0.014383915302225396, + "acc_norm": 0.46075085324232085, + "acc_norm_stderr": 0.014566303676636588 + }, + "harness|hellaswag|10": { + "acc": 0.5739892451702848, + "acc_stderr": 0.0049348468098272, + "acc_norm": 0.7517426807408882, + "acc_norm_stderr": 0.00431118988223835 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.31, + "acc_stderr": 0.04648231987117316, + "acc_norm": 0.31, + "acc_norm_stderr": 0.04648231987117316 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.37037037037037035, + "acc_stderr": 0.04171654161354543, + "acc_norm": 0.37037037037037035, + "acc_norm_stderr": 0.04171654161354543 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.23684210526315788, + "acc_stderr": 0.034597776068105365, + "acc_norm": 0.23684210526315788, + "acc_norm_stderr": 0.034597776068105365 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.35, + "acc_stderr": 0.0479372485441102, + "acc_norm": 0.35, + "acc_norm_stderr": 0.0479372485441102 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.30566037735849055, + "acc_stderr": 0.028353298073322663, + "acc_norm": 0.30566037735849055, + "acc_norm_stderr": 0.028353298073322663 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.3402777777777778, + "acc_stderr": 0.03962135573486219, + "acc_norm": 0.3402777777777778, + "acc_norm_stderr": 0.03962135573486219 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.25, + "acc_stderr": 0.04351941398892446, + "acc_norm": 0.25, + "acc_norm_stderr": 0.04351941398892446 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.25, + "acc_stderr": 0.04351941398892446, + "acc_norm": 0.25, + "acc_norm_stderr": 0.04351941398892446 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.26, + "acc_stderr": 0.04408440022768078, + "acc_norm": 0.26, + "acc_norm_stderr": 0.04408440022768078 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.23121387283236994, + "acc_stderr": 0.032147373020294696, + "acc_norm": 0.23121387283236994, + "acc_norm_stderr": 0.032147373020294696 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.1568627450980392, + "acc_stderr": 0.036186648199362466, + "acc_norm": 0.1568627450980392, + "acc_norm_stderr": 0.036186648199362466 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.42, + "acc_stderr": 0.04960449637488584, + "acc_norm": 0.42, + "acc_norm_stderr": 0.04960449637488584 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.3148936170212766, + "acc_stderr": 0.03036358219723817, + "acc_norm": 0.3148936170212766, + "acc_norm_stderr": 0.03036358219723817 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.22807017543859648, + "acc_stderr": 0.03947152782669415, + "acc_norm": 0.22807017543859648, + "acc_norm_stderr": 0.03947152782669415 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.2896551724137931, + "acc_stderr": 0.037800192304380135, + "acc_norm": 0.2896551724137931, + 
"acc_norm_stderr": 0.037800192304380135 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.26455026455026454, + "acc_stderr": 0.022717467897708624, + "acc_norm": 0.26455026455026454, + "acc_norm_stderr": 0.022717467897708624 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.18253968253968253, + "acc_stderr": 0.03455071019102148, + "acc_norm": 0.18253968253968253, + "acc_norm_stderr": 0.03455071019102148 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.27, + "acc_stderr": 0.0446196043338474, + "acc_norm": 0.27, + "acc_norm_stderr": 0.0446196043338474 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.27741935483870966, + "acc_stderr": 0.025470196835900055, + "acc_norm": 0.27741935483870966, + "acc_norm_stderr": 0.025470196835900055 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.27586206896551724, + "acc_stderr": 0.031447125816782405, + "acc_norm": 0.27586206896551724, + "acc_norm_stderr": 0.031447125816782405 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.28, + "acc_stderr": 0.04512608598542127, + "acc_norm": 0.28, + "acc_norm_stderr": 0.04512608598542127 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.296969696969697, + "acc_stderr": 0.035679697722680474, + "acc_norm": 0.296969696969697, + "acc_norm_stderr": 0.035679697722680474 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.29292929292929293, + "acc_stderr": 0.032424979581788166, + "acc_norm": 0.29292929292929293, + "acc_norm_stderr": 0.032424979581788166 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.29533678756476683, + "acc_stderr": 0.03292296639155141, + "acc_norm": 0.29533678756476683, + "acc_norm_stderr": 0.03292296639155141 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.2641025641025641, + "acc_stderr": 0.02235219373745326, + "acc_norm": 0.2641025641025641, + "acc_norm_stderr": 0.02235219373745326 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.24444444444444444, + "acc_stderr": 0.02620276653465215, + "acc_norm": 0.24444444444444444, + "acc_norm_stderr": 0.02620276653465215 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.2689075630252101, + "acc_stderr": 0.028801392193631273, + "acc_norm": 0.2689075630252101, + "acc_norm_stderr": 0.028801392193631273 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.18543046357615894, + "acc_stderr": 0.031732843842942865, + "acc_norm": 0.18543046357615894, + "acc_norm_stderr": 0.031732843842942865 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.28440366972477066, + "acc_stderr": 0.019342036587702602, + "acc_norm": 0.28440366972477066, + "acc_norm_stderr": 0.019342036587702602 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.21296296296296297, + "acc_stderr": 0.02792096314799366, + "acc_norm": 0.21296296296296297, + "acc_norm_stderr": 0.02792096314799366 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.27450980392156865, + "acc_stderr": 0.031321798030832904, + "acc_norm": 0.27450980392156865, + "acc_norm_stderr": 0.031321798030832904 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.29535864978902954, + "acc_stderr": 0.029696338713422882, + "acc_norm": 0.29535864978902954, + "acc_norm_stderr": 0.029696338713422882 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.27802690582959644, + "acc_stderr": 0.030069584874494053, + 
"acc_norm": 0.27802690582959644, + "acc_norm_stderr": 0.030069584874494053 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.2824427480916031, + "acc_stderr": 0.03948406125768361, + "acc_norm": 0.2824427480916031, + "acc_norm_stderr": 0.03948406125768361 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.4793388429752066, + "acc_stderr": 0.04560456086387235, + "acc_norm": 0.4793388429752066, + "acc_norm_stderr": 0.04560456086387235 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.3333333333333333, + "acc_stderr": 0.04557239513497751, + "acc_norm": 0.3333333333333333, + "acc_norm_stderr": 0.04557239513497751 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.3558282208588957, + "acc_stderr": 0.03761521380046734, + "acc_norm": 0.3558282208588957, + "acc_norm_stderr": 0.03761521380046734 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.29464285714285715, + "acc_stderr": 0.0432704093257873, + "acc_norm": 0.29464285714285715, + "acc_norm_stderr": 0.0432704093257873 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.27184466019417475, + "acc_stderr": 0.044052680241409216, + "acc_norm": 0.27184466019417475, + "acc_norm_stderr": 0.044052680241409216 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.32905982905982906, + "acc_stderr": 0.030782321577688156, + "acc_norm": 0.32905982905982906, + "acc_norm_stderr": 0.030782321577688156 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.3, + "acc_stderr": 0.046056618647183814, + "acc_norm": 0.3, + "acc_norm_stderr": 0.046056618647183814 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.38697318007662834, + "acc_stderr": 0.017417138059440136, + "acc_norm": 0.38697318007662834, + "acc_norm_stderr": 0.017417138059440136 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.28901734104046245, + "acc_stderr": 0.02440517393578323, + "acc_norm": 0.28901734104046245, + "acc_norm_stderr": 0.02440517393578323 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.2547486033519553, + "acc_stderr": 0.014572650383409155, + "acc_norm": 0.2547486033519553, + "acc_norm_stderr": 0.014572650383409155 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.28104575163398693, + "acc_stderr": 0.02573885479781873, + "acc_norm": 0.28104575163398693, + "acc_norm_stderr": 0.02573885479781873 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.31189710610932475, + "acc_stderr": 0.02631185807185415, + "acc_norm": 0.31189710610932475, + "acc_norm_stderr": 0.02631185807185415 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.345679012345679, + "acc_stderr": 0.026462487777001886, + "acc_norm": 0.345679012345679, + "acc_norm_stderr": 0.026462487777001886 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.2978723404255319, + "acc_stderr": 0.027281608344469414, + "acc_norm": 0.2978723404255319, + "acc_norm_stderr": 0.027281608344469414 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.27835723598435463, + "acc_stderr": 0.011446990197380985, + "acc_norm": 0.27835723598435463, + "acc_norm_stderr": 0.011446990197380985 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.17647058823529413, + "acc_stderr": 0.02315746830855935, + "acc_norm": 0.17647058823529413, + "acc_norm_stderr": 0.02315746830855935 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.3104575163398693, + "acc_stderr": 0.018718067052623216, + "acc_norm": 0.3104575163398693, + "acc_norm_stderr": 0.018718067052623216 + }, + 
"harness|hendrycksTest-public_relations|5": { + "acc": 0.2909090909090909, + "acc_stderr": 0.04350271442923243, + "acc_norm": 0.2909090909090909, + "acc_norm_stderr": 0.04350271442923243 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.2571428571428571, + "acc_stderr": 0.02797982353874455, + "acc_norm": 0.2571428571428571, + "acc_norm_stderr": 0.02797982353874455 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.3333333333333333, + "acc_stderr": 0.033333333333333354, + "acc_norm": 0.3333333333333333, + "acc_norm_stderr": 0.033333333333333354 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.36, + "acc_stderr": 0.04824181513244218, + "acc_norm": 0.36, + "acc_norm_stderr": 0.04824181513244218 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.27710843373493976, + "acc_stderr": 0.03484331592680588, + "acc_norm": 0.27710843373493976, + "acc_norm_stderr": 0.03484331592680588 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.38011695906432746, + "acc_stderr": 0.03722965741385539, + "acc_norm": 0.38011695906432746, + "acc_norm_stderr": 0.03722965741385539 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.28151774785801714, + "mc1_stderr": 0.01574402724825605, + "mc2": 0.44123500676119165, + "mc2_stderr": 0.015794257230996155 + }, + "harness|winogrande|5": { + "acc": 0.6266771902131019, + "acc_stderr": 0.013594002763035518 + }, + "harness|gsm8k|5": { + "acc": 0.0037907505686125853, + "acc_stderr": 0.0016927007401501904 + }, + "all": { + "acc": 0.29757038455080786, + "acc_stderr": 0.032057229895416275, + "acc_norm": 0.30080870787584507, + "acc_norm_stderr": 0.03287733985701745, + "mc1": 0.28151774785801714, + "mc1_stderr": 0.01574402724825605, + "mc2": 0.44123500676119165, + "mc2_stderr": 0.015794257230996155 + } + }, + "versions": { + "all": 0, + "harness|arc:challenge|25": 0, + "harness|gsm8k|5": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + 
"harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "harness|winogrande|5": 0 + }, + "config_tasks": { + "harness|arc:challenge": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + 
"harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|arc:challenge|25": { + "hashes": { + "hash_examples": "17b0cae357c0259e", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "ca48d52265c0051f", + "hash_cont_tokens": "e8abf848493b50f7" + }, + "truncated": 0, + "non_truncated": 1172, + "padded": 4687, + "non_padded": 0, + "effective_few_shots": 25.0, + "num_truncated_few_shots": 0 + }, + "harness|hellaswag|10": { + "hashes": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "4975ded0ed31f702", + "hash_cont_tokens": "9fe0a5c42e1532db" + }, + "truncated": 0, + "non_truncated": 10042, + "padded": 40019, + "non_padded": 149, + "effective_few_shots": 10.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hashes": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "8ff523ec326d5d55", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-anatomy|5": { + "hashes": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "742bd6a389a8ef40", + "hash_cont_tokens": "f11971a765cb609f" + }, + "truncated": 0, + "non_truncated": 135, + "padded": 540, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-astronomy|5": { + "hashes": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "aa9743839c83bd9f", + "hash_cont_tokens": "440a970fadecdc7b" + }, + "truncated": 0, + "non_truncated": 152, + "padded": 608, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 
0 + }, + "harness|hendrycksTest-business_ethics|5": { + "hashes": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "60f6ed52e2a2987a", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hashes": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "6080d9f3c5930be0", + "hash_cont_tokens": "7ecd60c25b9bfe5b" + }, + "truncated": 0, + "non_truncated": 265, + "padded": 1060, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_biology|5": { + "hashes": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "873319724ad65589", + "hash_cont_tokens": "875cde3af7a0ee14" + }, + "truncated": 0, + "non_truncated": 144, + "padded": 564, + "non_padded": 12, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_chemistry|5": { + "hashes": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "8366d04d12b154a7", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_computer_science|5": { + "hashes": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "1724a282fb269fd7", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_mathematics|5": { + "hashes": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "b7aa815781eae172", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_medicine|5": { + "hashes": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "0003d13e86bc8c1a", + "hash_cont_tokens": "702fb6d82ff0d6ac" + }, + "truncated": 0, + "non_truncated": 173, + "padded": 692, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_physics|5": { + "hashes": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "32b28762dd077c78", + "hash_cont_tokens": "f7b8097afc16a47c" + }, + "truncated": 0, + "non_truncated": 102, + "padded": 404, + "non_padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-computer_security|5": { + "hashes": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "19dd0e1895125d49", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hashes": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": 
"e40d15a34640d6fa", + "hash_input_tokens": "761c7ce187b3338a", + "hash_cont_tokens": "aa0e8bc655f2f641" + }, + "truncated": 0, + "non_truncated": 235, + "padded": 940, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-econometrics|5": { + "hashes": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "dae74024ebc12b2b", + "hash_cont_tokens": "b1cc6e7e9fcd3827" + }, + "truncated": 0, + "non_truncated": 114, + "padded": 456, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hashes": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "5fa8050688a246ed", + "hash_cont_tokens": "2425a3f084a591ef" + }, + "truncated": 0, + "non_truncated": 145, + "padded": 580, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hashes": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "2da3f8d7d1515cc6", + "hash_cont_tokens": "bd87bf0c060fd925" + }, + "truncated": 0, + "non_truncated": 378, + "padded": 1512, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-formal_logic|5": { + "hashes": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "907de61bbe46dada", + "hash_cont_tokens": "eb8932890e0605db" + }, + "truncated": 0, + "non_truncated": 126, + "padded": 504, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-global_facts|5": { + "hashes": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "d7549fe9ac133643", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_biology|5": { + "hashes": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "b449ae8cd622fb96", + "hash_cont_tokens": "1ddcb86d28cde266" + }, + "truncated": 0, + "non_truncated": 310, + "padded": 1240, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hashes": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "a447bd1574b5e26c", + "hash_cont_tokens": "176c8dcff38c5f8f" + }, + "truncated": 0, + "non_truncated": 203, + "padded": 812, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hashes": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "56312a0c3d85ae90", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hashes": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "5002f4ac8b1562ca", + "hash_cont_tokens": "674fc454bdc5ac93" + }, + "truncated": 0, 
+ "non_truncated": 165, + "padded": 656, + "non_padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_geography|5": { + "hashes": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "b4f9efd054b0149d", + "hash_cont_tokens": "03a5012b916274ea" + }, + "truncated": 0, + "non_truncated": 198, + "padded": 792, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hashes": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "6e010d01707b5a01", + "hash_cont_tokens": "873d2aab226ba1d8" + }, + "truncated": 0, + "non_truncated": 193, + "padded": 772, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hashes": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "fc1f6e824ba386d7", + "hash_cont_tokens": "c583432ad27fcfe0" + }, + "truncated": 0, + "non_truncated": 390, + "padded": 1560, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hashes": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "3a485a40c8432ece", + "hash_cont_tokens": "d7907b61bcb8c123" + }, + "truncated": 0, + "non_truncated": 270, + "padded": 1080, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hashes": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "a7dd9ca4bbda3752", + "hash_cont_tokens": "f47f041de50333b9" + }, + "truncated": 0, + "non_truncated": 238, + "padded": 952, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_physics|5": { + "hashes": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "d7ea631399a73865", + "hash_cont_tokens": "0d56317b3e5eedb5" + }, + "truncated": 0, + "non_truncated": 151, + "padded": 604, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hashes": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "d12816cf88146011", + "hash_cont_tokens": "09ba1243e7390c0f" + }, + "truncated": 0, + "non_truncated": 545, + "padded": 2180, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hashes": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "9763ecaef4814c21", + "hash_cont_tokens": "9cc29889c3d3f77d" + }, + "truncated": 0, + "non_truncated": 216, + "padded": 864, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hashes": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "c639cce12a46ebad", + "hash_cont_tokens": "cdd0b3dc06d933e5" + }, + "truncated": 0, + "non_truncated": 204, + "padded": 816, + "non_padded": 0, + "effective_few_shots": 
5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hashes": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "b9762065cce6f3a6", + "hash_cont_tokens": "e02816433ff28daf" + }, + "truncated": 0, + "non_truncated": 237, + "padded": 948, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_aging|5": { + "hashes": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "84157fee0b6d0f3c", + "hash_cont_tokens": "142a4a8a1138a214" + }, + "truncated": 0, + "non_truncated": 223, + "padded": 892, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_sexuality|5": { + "hashes": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "ade303e1ae3c016f", + "hash_cont_tokens": "bc54813e809b796d" + }, + "truncated": 0, + "non_truncated": 131, + "padded": 524, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-international_law|5": { + "hashes": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "e5482e1c23c23d35", + "hash_cont_tokens": "8ea8c5ff76a15bca" + }, + "truncated": 0, + "non_truncated": 121, + "padded": 484, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-jurisprudence|5": { + "hashes": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "4415eeb9bad0507b", + "hash_cont_tokens": "e3a8cd951b6e3469" + }, + "truncated": 0, + "non_truncated": 108, + "padded": 432, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hashes": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "e6b5271422ecbaa8", + "hash_cont_tokens": "3e9e0bdc248fd88a" + }, + "truncated": 0, + "non_truncated": 163, + "padded": 644, + "non_padded": 8, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-machine_learning|5": { + "hashes": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "e719cb83196977d8", + "hash_cont_tokens": "55b12fb138c6a064" + }, + "truncated": 0, + "non_truncated": 112, + "padded": 448, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-management|5": { + "hashes": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "155da0e62b39e804", + "hash_cont_tokens": "a01d6d39a83c4597" + }, + "truncated": 0, + "non_truncated": 103, + "padded": 412, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-marketing|5": { + "hashes": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "38466c242259e6d3", + "hash_cont_tokens": "6aeaed4d823c98aa" + }, + "truncated": 0, + "non_truncated": 234, + "padded": 932, + "non_padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-medical_genetics|5": { + "hashes": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": 
"202139046daa118f", + "hash_input_tokens": "0dd129e92538a7f6", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-miscellaneous|5": { + "hashes": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "d108a883fc3e022f", + "hash_cont_tokens": "9b0ab02a64603081" + }, + "truncated": 0, + "non_truncated": 783, + "padded": 3132, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_disputes|5": { + "hashes": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "0e7b7df82884a2d5", + "hash_cont_tokens": "3b8bbe9108e55ce9" + }, + "truncated": 0, + "non_truncated": 346, + "padded": 1364, + "non_padded": 20, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hashes": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "7c220f5613cd8426", + "hash_cont_tokens": "3e9bfc0362e97330" + }, + "truncated": 0, + "non_truncated": 895, + "padded": 3580, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-nutrition|5": { + "hashes": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "35de1609a9a763a9", + "hash_cont_tokens": "23b2dc6ee2da4cfc" + }, + "truncated": 0, + "non_truncated": 306, + "padded": 1224, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-philosophy|5": { + "hashes": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "a1dcfa9c80490d06", + "hash_cont_tokens": "9f6ff69d23a48783" + }, + "truncated": 0, + "non_truncated": 311, + "padded": 1244, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-prehistory|5": { + "hashes": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "a091cf645d2415e0", + "hash_cont_tokens": "d6458d743d875837" + }, + "truncated": 0, + "non_truncated": 324, + "padded": 1296, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_accounting|5": { + "hashes": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "e9df32a33f85290c", + "hash_cont_tokens": "922a195f53a35662" + }, + "truncated": 0, + "non_truncated": 282, + "padded": 1128, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_law|5": { + "hashes": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "c9f7583fff66d361", + "hash_cont_tokens": "2e590029ef41fbcd" + }, + "truncated": 0, + "non_truncated": 1534, + "padded": 6136, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_medicine|5": { + "hashes": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "40a933f829116f8d", + "hash_cont_tokens": "7cfee54dbddd5a98" + }, + "truncated": 0, + "non_truncated": 272, + "padded": 
1088, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_psychology|5": { + "hashes": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "0f6a92c3a2062b48", + "hash_cont_tokens": "a86677b2a45c20e1" + }, + "truncated": 0, + "non_truncated": 612, + "padded": 2448, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-public_relations|5": { + "hashes": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "29a08e9bfbe9b2f0", + "hash_cont_tokens": "0d756ccaae031757" + }, + "truncated": 0, + "non_truncated": 110, + "padded": 440, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-security_studies|5": { + "hashes": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "32a03f1f22a6e103", + "hash_cont_tokens": "b2229bc2cfbf594b" + }, + "truncated": 0, + "non_truncated": 245, + "padded": 980, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-sociology|5": { + "hashes": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "1de5c52d2b2831d7", + "hash_cont_tokens": "c3a3bdfd177eed5b" + }, + "truncated": 0, + "non_truncated": 201, + "padded": 800, + "non_padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hashes": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "add924961f7f4146", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-virology|5": { + "hashes": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "e0653601c466b1bc", + "hash_cont_tokens": "af8b3658088cb37f" + }, + "truncated": 0, + "non_truncated": 166, + "padded": 664, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-world_religions|5": { + "hashes": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "ac600d612445156d", + "hash_cont_tokens": "060118bef6de4e0a" + }, + "truncated": 0, + "non_truncated": 171, + "padded": 684, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|truthfulqa:mc|0": { + "hashes": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "a03ce28b7fd06aa7", + "hash_cont_tokens": "f5da56a132aab151" + }, + "truncated": 0, + "non_truncated": 817, + "padded": 9996, + "non_padded": 0, + "effective_few_shots": 0.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "72067255e368e24e", + "hash_cont_tokens": "f08975ad6f2d5864" + }, + "truncated": 0, + "non_truncated": 1267, + "padded": 2534, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": 
"41d55e83abc0e02d", + "hash_input_tokens": "bda342e47b5099b2", + "hash_cont_tokens": "81a30b511fed6b70" + }, + "truncated": 0, + "non_truncated": 1319, + "padded": 0, + "non_padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "3b7fa57a057f9415", + "hash_full_prompts": "63615fc50fc9417c", + "hash_input_tokens": "a8fa53915153e1db", + "hash_cont_tokens": "75abda68f252c07c" + }, + "truncated": 0, + "non_truncated": 28659, + "padded": 113348, + "non_padded": 1524, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/vihangd/neuralfalcon-1b-v1/results_2023-12-17T03-31-54.267536.json b/eval-results/vihangd/neuralfalcon-1b-v1/results_2023-12-17T03-31-54.267536.json new file mode 100644 index 0000000000000000000000000000000000000000..b537b27c8e9e09e7c08122cf25da93a808ceb534 --- /dev/null +++ b/eval-results/vihangd/neuralfalcon-1b-v1/results_2023-12-17T03-31-54.267536.json @@ -0,0 +1,1409 @@ +{ + "config_general": { + "lighteval_sha": "0e4607eff593f6f842aeaa0e5fa6760f58b9d1e9", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "", + "start_time": 411976.229779346, + "end_time": 414871.667249396, + "total_evaluation_time_secondes": "2895.437470050005", + "model_name": "vihangd/neuralfalcon-1b-v1", + "model_sha": "f788af66f22a933ad60e732ebaede3dfb5679bd4", + "model_dtype": "torch.float16", + "model_size": "2.44 GB" + }, + "results": { + "harness|arc:challenge|25": { + "acc": 0.2226962457337884, + "acc_stderr": 0.012158314774829936, + "acc_norm": 0.26791808873720135, + "acc_norm_stderr": 0.012942030195136412 + }, + "harness|hellaswag|10": { + "acc": 0.26090420235012945, + "acc_stderr": 0.004382303181183642, + "acc_norm": 0.26558454491137223, + "acc_norm_stderr": 0.004407413723383401 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.21, + "acc_stderr": 0.040936018074033256, + "acc_norm": 0.21, + "acc_norm_stderr": 0.040936018074033256 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.22962962962962963, + "acc_stderr": 0.03633384414073462, + "acc_norm": 0.22962962962962963, + "acc_norm_stderr": 0.03633384414073462 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.3355263157894737, + "acc_stderr": 0.03842498559395269, + "acc_norm": 0.3355263157894737, + "acc_norm_stderr": 0.03842498559395269 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.26, + "acc_stderr": 0.0440844002276808, + "acc_norm": 0.26, + "acc_norm_stderr": 0.0440844002276808 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.2037735849056604, + "acc_stderr": 0.024790784501775402, + "acc_norm": 0.2037735849056604, + "acc_norm_stderr": 0.024790784501775402 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.2638888888888889, + "acc_stderr": 0.03685651095897532, + "acc_norm": 0.2638888888888889, + "acc_norm_stderr": 0.03685651095897532 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.41, + "acc_stderr": 0.049431107042371025, + "acc_norm": 0.41, + "acc_norm_stderr": 0.049431107042371025 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.34, + "acc_stderr": 0.04760952285695236, + "acc_norm": 0.34, + "acc_norm_stderr": 0.04760952285695236 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.3, + "acc_stderr": 0.046056618647183814, + "acc_norm": 0.3, + "acc_norm_stderr": 0.046056618647183814 + }, + "harness|hendrycksTest-college_medicine|5": 
{ + "acc": 0.24855491329479767, + "acc_stderr": 0.03295304696818318, + "acc_norm": 0.24855491329479767, + "acc_norm_stderr": 0.03295304696818318 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.3333333333333333, + "acc_stderr": 0.04690650298201942, + "acc_norm": 0.3333333333333333, + "acc_norm_stderr": 0.04690650298201942 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.23, + "acc_stderr": 0.04229525846816506, + "acc_norm": 0.23, + "acc_norm_stderr": 0.04229525846816506 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.20851063829787234, + "acc_stderr": 0.026556982117838728, + "acc_norm": 0.20851063829787234, + "acc_norm_stderr": 0.026556982117838728 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.22807017543859648, + "acc_stderr": 0.03947152782669415, + "acc_norm": 0.22807017543859648, + "acc_norm_stderr": 0.03947152782669415 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.23448275862068965, + "acc_stderr": 0.035306258743465914, + "acc_norm": 0.23448275862068965, + "acc_norm_stderr": 0.035306258743465914 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.2724867724867725, + "acc_stderr": 0.022930973071633328, + "acc_norm": 0.2724867724867725, + "acc_norm_stderr": 0.022930973071633328 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.35714285714285715, + "acc_stderr": 0.04285714285714281, + "acc_norm": 0.35714285714285715, + "acc_norm_stderr": 0.04285714285714281 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.2, + "acc_stderr": 0.04020151261036845, + "acc_norm": 0.2, + "acc_norm_stderr": 0.04020151261036845 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.3161290322580645, + "acc_stderr": 0.02645087448904277, + "acc_norm": 0.3161290322580645, + "acc_norm_stderr": 0.02645087448904277 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.2660098522167488, + "acc_stderr": 0.03108982600293752, + "acc_norm": 0.2660098522167488, + "acc_norm_stderr": 0.03108982600293752 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.12, + "acc_stderr": 0.03265986323710905, + "acc_norm": 0.12, + "acc_norm_stderr": 0.03265986323710905 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.2545454545454545, + "acc_stderr": 0.03401506715249039, + "acc_norm": 0.2545454545454545, + "acc_norm_stderr": 0.03401506715249039 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.3484848484848485, + "acc_stderr": 0.033948539651564025, + "acc_norm": 0.3484848484848485, + "acc_norm_stderr": 0.033948539651564025 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.30569948186528495, + "acc_stderr": 0.03324837939758159, + "acc_norm": 0.30569948186528495, + "acc_norm_stderr": 0.03324837939758159 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.3435897435897436, + "acc_stderr": 0.024078696580635467, + "acc_norm": 0.3435897435897436, + "acc_norm_stderr": 0.024078696580635467 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.26296296296296295, + "acc_stderr": 0.026842057873833706, + "acc_norm": 0.26296296296296295, + "acc_norm_stderr": 0.026842057873833706 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.31932773109243695, + "acc_stderr": 0.0302839955258844, + "acc_norm": 0.31932773109243695, + "acc_norm_stderr": 0.0302839955258844 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.304635761589404, + 
"acc_stderr": 0.03757949922943342, + "acc_norm": 0.304635761589404, + "acc_norm_stderr": 0.03757949922943342 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.3321100917431193, + "acc_stderr": 0.02019268298542334, + "acc_norm": 0.3321100917431193, + "acc_norm_stderr": 0.02019268298542334 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.46296296296296297, + "acc_stderr": 0.03400603625538272, + "acc_norm": 0.46296296296296297, + "acc_norm_stderr": 0.03400603625538272 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.2549019607843137, + "acc_stderr": 0.030587591351604246, + "acc_norm": 0.2549019607843137, + "acc_norm_stderr": 0.030587591351604246 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.24050632911392406, + "acc_stderr": 0.027820781981149675, + "acc_norm": 0.24050632911392406, + "acc_norm_stderr": 0.027820781981149675 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.10762331838565023, + "acc_stderr": 0.020799400082879997, + "acc_norm": 0.10762331838565023, + "acc_norm_stderr": 0.020799400082879997 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.3053435114503817, + "acc_stderr": 0.040393149787245605, + "acc_norm": 0.3053435114503817, + "acc_norm_stderr": 0.040393149787245605 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.14049586776859505, + "acc_stderr": 0.0317223342600216, + "acc_norm": 0.14049586776859505, + "acc_norm_stderr": 0.0317223342600216 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.2962962962962963, + "acc_stderr": 0.04414343666854933, + "acc_norm": 0.2962962962962963, + "acc_norm_stderr": 0.04414343666854933 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.22085889570552147, + "acc_stderr": 0.032591773927421776, + "acc_norm": 0.22085889570552147, + "acc_norm_stderr": 0.032591773927421776 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.20535714285714285, + "acc_stderr": 0.038342410214190735, + "acc_norm": 0.20535714285714285, + "acc_norm_stderr": 0.038342410214190735 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.3786407766990291, + "acc_stderr": 0.04802694698258972, + "acc_norm": 0.3786407766990291, + "acc_norm_stderr": 0.04802694698258972 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.24786324786324787, + "acc_stderr": 0.028286324075564407, + "acc_norm": 0.24786324786324787, + "acc_norm_stderr": 0.028286324075564407 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.26, + "acc_stderr": 0.04408440022768077, + "acc_norm": 0.26, + "acc_norm_stderr": 0.04408440022768077 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.20306513409961685, + "acc_stderr": 0.014385525076611581, + "acc_norm": 0.20306513409961685, + "acc_norm_stderr": 0.014385525076611581 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.25722543352601157, + "acc_stderr": 0.02353292543104429, + "acc_norm": 0.25722543352601157, + "acc_norm_stderr": 0.02353292543104429 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.27262569832402234, + "acc_stderr": 0.014893391735249588, + "acc_norm": 0.27262569832402234, + "acc_norm_stderr": 0.014893391735249588 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.2875816993464052, + "acc_stderr": 0.02591780611714716, + "acc_norm": 0.2875816993464052, + "acc_norm_stderr": 0.02591780611714716 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.24758842443729903, + "acc_stderr": 0.024513879973621967, + "acc_norm": 
0.24758842443729903, + "acc_norm_stderr": 0.024513879973621967 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.21604938271604937, + "acc_stderr": 0.0228991629184458, + "acc_norm": 0.21604938271604937, + "acc_norm_stderr": 0.0228991629184458 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.2375886524822695, + "acc_stderr": 0.025389512552729903, + "acc_norm": 0.2375886524822695, + "acc_norm_stderr": 0.025389512552729903 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.2379400260756193, + "acc_stderr": 0.010875700787694243, + "acc_norm": 0.2379400260756193, + "acc_norm_stderr": 0.010875700787694243 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.1875, + "acc_stderr": 0.023709788253811766, + "acc_norm": 0.1875, + "acc_norm_stderr": 0.023709788253811766 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.21895424836601307, + "acc_stderr": 0.016729937565537544, + "acc_norm": 0.21895424836601307, + "acc_norm_stderr": 0.016729937565537544 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.2, + "acc_stderr": 0.038313051408846034, + "acc_norm": 0.2, + "acc_norm_stderr": 0.038313051408846034 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.2163265306122449, + "acc_stderr": 0.026358916334904028, + "acc_norm": 0.2163265306122449, + "acc_norm_stderr": 0.026358916334904028 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.3034825870646766, + "acc_stderr": 0.0325100681645862, + "acc_norm": 0.3034825870646766, + "acc_norm_stderr": 0.0325100681645862 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.29, + "acc_stderr": 0.045604802157206845, + "acc_norm": 0.29, + "acc_norm_stderr": 0.045604802157206845 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.23493975903614459, + "acc_stderr": 0.03300533186128922, + "acc_norm": 0.23493975903614459, + "acc_norm_stderr": 0.03300533186128922 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.17543859649122806, + "acc_stderr": 0.029170885500727654, + "acc_norm": 0.17543859649122806, + "acc_norm_stderr": 0.029170885500727654 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.23378212974296206, + "mc1_stderr": 0.014816195991931598, + "mc2": 0.48930714365824785, + "mc2_stderr": 0.016679848749840336 + }, + "harness|winogrande|5": { + "acc": 0.4956590370955012, + "acc_stderr": 0.014051956064076911 + }, + "harness|gsm8k|5": { + "acc": 0.002274450341167551, + "acc_stderr": 0.0013121578148674025 + }, + "all": { + "acc": 0.26110795485511296, + "acc_stderr": 0.03090020464432457, + "acc_norm": 0.26236532108965893, + "acc_norm_stderr": 0.0317009694133451, + "mc1": 0.23378212974296206, + "mc1_stderr": 0.014816195991931598, + "mc2": 0.48930714365824785, + "mc2_stderr": 0.016679848749840336 + } + }, + "versions": { + "all": 0, + "harness|arc:challenge|25": 0, + "harness|gsm8k|5": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + 
"harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "harness|winogrande|5": 0 + }, + "config_tasks": { + "harness|arc:challenge": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM 
Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|arc:challenge|25": { + "hashes": { + "hash_examples": "17b0cae357c0259e", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "cf9c7646f3c8d4c0", + "hash_cont_tokens": "ed17e576dbafa5da" + }, + "truncated": 0, + "non_truncated": 1172, + "padded": 4687, + "non_padded": 0, + "effective_few_shots": 25.0, + "num_truncated_few_shots": 0 + }, + "harness|hellaswag|10": { + "hashes": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "dfa6446d4905130e", + "hash_cont_tokens": "0875c25c8fc0a94d" + }, + "truncated": 0, + "non_truncated": 10042, + "padded": 40144, + "non_padded": 24, + "effective_few_shots": 10.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hashes": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": 
"02b38e65730b4712", + "hash_cont_tokens": "844bd0bf669e8136" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-anatomy|5": { + "hashes": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "1fbed4b4bb27d865", + "hash_cont_tokens": "aa3ffb1a6e4356f5" + }, + "truncated": 0, + "non_truncated": 135, + "padded": 540, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-astronomy|5": { + "hashes": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "98497e888319b56e", + "hash_cont_tokens": "18cfffb76bc8f0d1" + }, + "truncated": 0, + "non_truncated": 152, + "padded": 608, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-business_ethics|5": { + "hashes": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "034541338d86a1f8", + "hash_cont_tokens": "844bd0bf669e8136" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hashes": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "e1e150bdc850c136", + "hash_cont_tokens": "cd61f7de0830a75a" + }, + "truncated": 0, + "non_truncated": 265, + "padded": 1052, + "non_padded": 8, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_biology|5": { + "hashes": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "20a799d5f9c9a1a9", + "hash_cont_tokens": "16b3626c8a5e3797" + }, + "truncated": 0, + "non_truncated": 144, + "padded": 576, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_chemistry|5": { + "hashes": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "41a4597e36c19ef0", + "hash_cont_tokens": "844bd0bf669e8136" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_computer_science|5": { + "hashes": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "105bbe033341ea0c", + "hash_cont_tokens": "844bd0bf669e8136" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_mathematics|5": { + "hashes": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "d41bd2267dc69a8e", + "hash_cont_tokens": "844bd0bf669e8136" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_medicine|5": { + "hashes": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "3ff1dd65a1f9c7e0", + "hash_cont_tokens": "62bb469d2a319d91" + }, + "truncated": 0, + "non_truncated": 173, + "padded": 684, + "non_padded": 8, + 
"effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_physics|5": { + "hashes": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "407265e46dfeaf24", + "hash_cont_tokens": "bf103c9a1f61ec12" + }, + "truncated": 0, + "non_truncated": 102, + "padded": 400, + "non_padded": 8, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-computer_security|5": { + "hashes": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "189af9a9e0c85513", + "hash_cont_tokens": "844bd0bf669e8136" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hashes": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "e23e27a5cb5fade6", + "hash_cont_tokens": "ff5ca3d84bb47a0b" + }, + "truncated": 0, + "non_truncated": 235, + "padded": 940, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-econometrics|5": { + "hashes": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "8b2cbba16cd354a4", + "hash_cont_tokens": "21f0989f5760198a" + }, + "truncated": 0, + "non_truncated": 114, + "padded": 456, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hashes": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "8007052787e63032", + "hash_cont_tokens": "35bf6c0c1a7ee403" + }, + "truncated": 0, + "non_truncated": 145, + "padded": 580, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hashes": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "2e17edbbe8c5aa19", + "hash_cont_tokens": "f7d801bfd913884d" + }, + "truncated": 0, + "non_truncated": 378, + "padded": 1512, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-formal_logic|5": { + "hashes": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "956704efed2d3de9", + "hash_cont_tokens": "23f9089575432d5a" + }, + "truncated": 0, + "non_truncated": 126, + "padded": 504, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-global_facts|5": { + "hashes": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "56e24a6936981317", + "hash_cont_tokens": "844bd0bf669e8136" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_biology|5": { + "hashes": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "9280d83ca94167a7", + "hash_cont_tokens": "04b8293f2ab7fbbf" + }, + "truncated": 0, + "non_truncated": 310, + "padded": 1240, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hashes": { + "hash_examples": 
"44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "a5c6dfe388cd8931", + "hash_cont_tokens": "c3deabee1deab3a3" + }, + "truncated": 0, + "non_truncated": 203, + "padded": 800, + "non_padded": 12, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hashes": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "559e2cb0d4788604", + "hash_cont_tokens": "844bd0bf669e8136" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hashes": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "3e24478a8854bd77", + "hash_cont_tokens": "c4f2565ca36881d5" + }, + "truncated": 660, + "non_truncated": -495, + "padded": 0, + "non_padded": 660, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_geography|5": { + "hashes": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "5d284ce4c7b0ca9a", + "hash_cont_tokens": "780e569058de22be" + }, + "truncated": 0, + "non_truncated": 198, + "padded": 788, + "non_padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hashes": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "2dd840e14eacd6bd", + "hash_cont_tokens": "7994d94bfa36d003" + }, + "truncated": 0, + "non_truncated": 193, + "padded": 772, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hashes": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "562915cf47265af9", + "hash_cont_tokens": "8f5c8baf02161f10" + }, + "truncated": 0, + "non_truncated": 390, + "padded": 1560, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hashes": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "563fd8cde62df13f", + "hash_cont_tokens": "a2c91752be5b1798" + }, + "truncated": 0, + "non_truncated": 270, + "padded": 1080, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hashes": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "0310fb471b15978e", + "hash_cont_tokens": "985403b262df21a4" + }, + "truncated": 0, + "non_truncated": 238, + "padded": 952, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_physics|5": { + "hashes": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "ccf86436451daecc", + "hash_cont_tokens": "db71da66ed82b921" + }, + "truncated": 0, + "non_truncated": 151, + "padded": 604, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hashes": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + 
"hash_input_tokens": "ec2f001bd307f9a5", + "hash_cont_tokens": "e81cf9738ad7e157" + }, + "truncated": 0, + "non_truncated": 545, + "padded": 2180, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hashes": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "9e7262228c2fbd53", + "hash_cont_tokens": "4a2d5f00cb00d9b7" + }, + "truncated": 0, + "non_truncated": 216, + "padded": 864, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hashes": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "4ab213491f557f31", + "hash_cont_tokens": "eab825cf8fbdd085" + }, + "truncated": 816, + "non_truncated": -612, + "padded": 0, + "non_padded": 816, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hashes": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "2a04fb615e6717ea", + "hash_cont_tokens": "e9bcfaa6beefb456" + }, + "truncated": 0, + "non_truncated": 237, + "padded": 948, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_aging|5": { + "hashes": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "e1878600f1df37c7", + "hash_cont_tokens": "38eafdb22e9fca11" + }, + "truncated": 0, + "non_truncated": 223, + "padded": 892, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_sexuality|5": { + "hashes": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "0fdde6eb0830bf5f", + "hash_cont_tokens": "11de075f88fc7cd2" + }, + "truncated": 0, + "non_truncated": 131, + "padded": 524, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-international_law|5": { + "hashes": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "6dc5ed9fa471d27d", + "hash_cont_tokens": "6f8215a3de7eebd1" + }, + "truncated": 0, + "non_truncated": 121, + "padded": 484, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-jurisprudence|5": { + "hashes": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "8a0d33cb57eadb93", + "hash_cont_tokens": "5c77c6f472688075" + }, + "truncated": 0, + "non_truncated": 108, + "padded": 428, + "non_padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hashes": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "33bc8cbaf4b148b6", + "hash_cont_tokens": "25a46284b3589e0d" + }, + "truncated": 0, + "non_truncated": 163, + "padded": 644, + "non_padded": 8, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-machine_learning|5": { + "hashes": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "a0e12130e19d9a02", + "hash_cont_tokens": "aacac708cd4c5a61" + }, + "truncated": 0, + "non_truncated": 112, + "padded": 
448, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-management|5": { + "hashes": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "e6b0b33a41fda02f", + "hash_cont_tokens": "d37808f586a9e9b5" + }, + "truncated": 0, + "non_truncated": 103, + "padded": 412, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-marketing|5": { + "hashes": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "c1d59b968d6d5787", + "hash_cont_tokens": "95faf210efa02f90" + }, + "truncated": 0, + "non_truncated": 234, + "padded": 932, + "non_padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-medical_genetics|5": { + "hashes": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "95a56c538b0a74ae", + "hash_cont_tokens": "844bd0bf669e8136" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-miscellaneous|5": { + "hashes": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "0734c11b6c0450c2", + "hash_cont_tokens": "ef1ae838a09a7521" + }, + "truncated": 0, + "non_truncated": 783, + "padded": 3132, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_disputes|5": { + "hashes": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "12b681baaab8e9c9", + "hash_cont_tokens": "16b6c6e390eb7cea" + }, + "truncated": 0, + "non_truncated": 346, + "padded": 1384, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hashes": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "d4f3662defa0365d", + "hash_cont_tokens": "4130880a19c4edb0" + }, + "truncated": 0, + "non_truncated": 895, + "padded": 3580, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-nutrition|5": { + "hashes": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "224661463bd8aae6", + "hash_cont_tokens": "96b81f570a84328b" + }, + "truncated": 0, + "non_truncated": 306, + "padded": 1224, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-philosophy|5": { + "hashes": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "ca40d870dd2c13f9", + "hash_cont_tokens": "dddff9925c9b675a" + }, + "truncated": 0, + "non_truncated": 311, + "padded": 1240, + "non_padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-prehistory|5": { + "hashes": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "06681ff31df5feac", + "hash_cont_tokens": "e3a7592f84b44888" + }, + "truncated": 0, + "non_truncated": 324, + "padded": 1296, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_accounting|5": { + "hashes": { + "hash_examples": 
"8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "b2c1589afc80dbdd", + "hash_cont_tokens": "f9edf462e8201551" + }, + "truncated": 0, + "non_truncated": 282, + "padded": 1128, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_law|5": { + "hashes": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "999e8c7cf55b590c", + "hash_cont_tokens": "a2de48df0afbaff7" + }, + "truncated": 16, + "non_truncated": 1518, + "padded": 6120, + "non_padded": 16, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_medicine|5": { + "hashes": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "cb68733b835e69f0", + "hash_cont_tokens": "ecf7754754c2bb76" + }, + "truncated": 0, + "non_truncated": 272, + "padded": 1088, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_psychology|5": { + "hashes": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "a428fe3d64b0ef43", + "hash_cont_tokens": "30b07e31cf9b5c6f" + }, + "truncated": 0, + "non_truncated": 612, + "padded": 2448, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-public_relations|5": { + "hashes": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "2c0e453c0a702736", + "hash_cont_tokens": "cf3600a50782c6c5" + }, + "truncated": 0, + "non_truncated": 110, + "padded": 436, + "non_padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-security_studies|5": { + "hashes": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "c2b75c24a925a416", + "hash_cont_tokens": "4d1dc7c4ad251829" + }, + "truncated": 0, + "non_truncated": 245, + "padded": 980, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-sociology|5": { + "hashes": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "52d02a4f41926abc", + "hash_cont_tokens": "d36b9d9f0f4424fe" + }, + "truncated": 0, + "non_truncated": 201, + "padded": 792, + "non_padded": 12, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hashes": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "00c4ee3a60217a8b", + "hash_cont_tokens": "844bd0bf669e8136" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-virology|5": { + "hashes": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "728002327bd9798a", + "hash_cont_tokens": "30d4fa4828c5468f" + }, + "truncated": 0, + "non_truncated": 166, + "padded": 664, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-world_religions|5": { + "hashes": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "3b8028edcd45c58b", + "hash_cont_tokens": "a0a7af55ac7ae037" + }, + 
"truncated": 0, + "non_truncated": 171, + "padded": 684, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|truthfulqa:mc|0": { + "hashes": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "70a938aa2b5afaa9", + "hash_cont_tokens": "84fd36aa004c8578" + }, + "truncated": 0, + "non_truncated": 817, + "padded": 9996, + "non_padded": 0, + "effective_few_shots": 0.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "0c6a4d96ca45d712", + "hash_cont_tokens": "64ca3ed9b5dacc6e" + }, + "truncated": 0, + "non_truncated": 1267, + "padded": 2534, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "3ab9b4c5105492a3", + "hash_cont_tokens": "75cf5fed2ecea0cf" + }, + "truncated": 0, + "non_truncated": 1319, + "padded": 0, + "non_padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "3b7fa57a057f9415", + "hash_full_prompts": "63615fc50fc9417c", + "hash_input_tokens": "6a624838bc7e8350", + "hash_cont_tokens": "415a9fb1ca46a8af" + }, + "truncated": 1492, + "non_truncated": 27167, + "padded": 111961, + "non_padded": 2911, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/vihangd/shearedplats-1.3b-v1/results_2023-11-18T21-27-03.574383.json b/eval-results/vihangd/shearedplats-1.3b-v1/results_2023-11-18T21-27-03.574383.json new file mode 100644 index 0000000000000000000000000000000000000000..580e0d4dd597fd4403ce9583e6e068e0cc6d7999 --- /dev/null +++ b/eval-results/vihangd/shearedplats-1.3b-v1/results_2023-11-18T21-27-03.574383.json @@ -0,0 +1,1435 @@ +{ + "config_general": { + "lighteval_sha": "9ffc410f6c40b8cfefe7167cb47aefe69ced61e1", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "", + "start_time": 243897.230128125, + "end_time": 255841.278717637, + "total_evaluation_time_secondes": "11944.048589512007", + "model_name": "vihangd/shearedplats-1.3b-v1", + "model_sha": "7ac93152e1807ec1d732500255a747e27922fb1a", + "model_dtype": "torch.float16", + "model_size": "2.55 GB" + }, + "results": { + "harness|arc:challenge|25": { + "acc": 0.3174061433447099, + "acc_stderr": 0.013602239088038173, + "acc_norm": 0.35409556313993173, + "acc_norm_stderr": 0.013975454122756557 + }, + "harness|hellaswag|10": { + "acc": 0.4705238000398327, + "acc_stderr": 0.0049811031579404495, + "acc_norm": 0.6274646484763992, + "acc_norm_stderr": 0.004824917516374187 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.26, + "acc_stderr": 0.04408440022768081, + "acc_norm": 0.26, + "acc_norm_stderr": 0.04408440022768081 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.2222222222222222, + "acc_stderr": 0.03591444084196969, + "acc_norm": 0.2222222222222222, + "acc_norm_stderr": 0.03591444084196969 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.3223684210526316, + "acc_stderr": 0.03803510248351585, + "acc_norm": 0.3223684210526316, + "acc_norm_stderr": 0.03803510248351585 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.28, + "acc_stderr": 0.04512608598542127, + "acc_norm": 0.28, + "acc_norm_stderr": 
0.04512608598542127 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.2528301886792453, + "acc_stderr": 0.026749899771241238, + "acc_norm": 0.2528301886792453, + "acc_norm_stderr": 0.026749899771241238 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.2222222222222222, + "acc_stderr": 0.03476590104304134, + "acc_norm": 0.2222222222222222, + "acc_norm_stderr": 0.03476590104304134 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.21, + "acc_stderr": 0.040936018074033256, + "acc_norm": 0.21, + "acc_norm_stderr": 0.040936018074033256 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.15, + "acc_stderr": 0.035887028128263714, + "acc_norm": 0.15, + "acc_norm_stderr": 0.035887028128263714 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.24, + "acc_stderr": 0.04292346959909281, + "acc_norm": 0.24, + "acc_norm_stderr": 0.04292346959909281 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.24855491329479767, + "acc_stderr": 0.03295304696818318, + "acc_norm": 0.24855491329479767, + "acc_norm_stderr": 0.03295304696818318 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.2647058823529412, + "acc_stderr": 0.04389869956808778, + "acc_norm": 0.2647058823529412, + "acc_norm_stderr": 0.04389869956808778 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.27, + "acc_stderr": 0.04461960433384739, + "acc_norm": 0.27, + "acc_norm_stderr": 0.04461960433384739 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.3276595744680851, + "acc_stderr": 0.030683020843231004, + "acc_norm": 0.3276595744680851, + "acc_norm_stderr": 0.030683020843231004 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.21929824561403508, + "acc_stderr": 0.03892431106518754, + "acc_norm": 0.21929824561403508, + "acc_norm_stderr": 0.03892431106518754 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.23448275862068965, + "acc_stderr": 0.035306258743465914, + "acc_norm": 0.23448275862068965, + "acc_norm_stderr": 0.035306258743465914 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.25925925925925924, + "acc_stderr": 0.022569897074918417, + "acc_norm": 0.25925925925925924, + "acc_norm_stderr": 0.022569897074918417 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.15873015873015872, + "acc_stderr": 0.03268454013011744, + "acc_norm": 0.15873015873015872, + "acc_norm_stderr": 0.03268454013011744 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.32, + "acc_stderr": 0.046882617226215034, + "acc_norm": 0.32, + "acc_norm_stderr": 0.046882617226215034 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.27419354838709675, + "acc_stderr": 0.025378139970885196, + "acc_norm": 0.27419354838709675, + "acc_norm_stderr": 0.025378139970885196 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.270935960591133, + "acc_stderr": 0.031270907132976984, + "acc_norm": 0.270935960591133, + "acc_norm_stderr": 0.031270907132976984 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.22, + "acc_stderr": 0.04163331998932269, + "acc_norm": 0.22, + "acc_norm_stderr": 0.04163331998932269 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.2606060606060606, + "acc_stderr": 0.034277431758165236, + "acc_norm": 0.2606060606060606, + "acc_norm_stderr": 0.034277431758165236 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.21717171717171718, + "acc_stderr": 0.029376616484945633, + 
"acc_norm": 0.21717171717171718, + "acc_norm_stderr": 0.029376616484945633 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.21243523316062177, + "acc_stderr": 0.029519282616817244, + "acc_norm": 0.21243523316062177, + "acc_norm_stderr": 0.029519282616817244 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.24615384615384617, + "acc_stderr": 0.021840866990423084, + "acc_norm": 0.24615384615384617, + "acc_norm_stderr": 0.021840866990423084 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.3037037037037037, + "acc_stderr": 0.028037929969114993, + "acc_norm": 0.3037037037037037, + "acc_norm_stderr": 0.028037929969114993 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.23109243697478993, + "acc_stderr": 0.027381406927868966, + "acc_norm": 0.23109243697478993, + "acc_norm_stderr": 0.027381406927868966 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.2251655629139073, + "acc_stderr": 0.03410435282008936, + "acc_norm": 0.2251655629139073, + "acc_norm_stderr": 0.03410435282008936 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.23669724770642203, + "acc_stderr": 0.01822407811729907, + "acc_norm": 0.23669724770642203, + "acc_norm_stderr": 0.01822407811729907 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.37037037037037035, + "acc_stderr": 0.03293377139415191, + "acc_norm": 0.37037037037037035, + "acc_norm_stderr": 0.03293377139415191 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.2107843137254902, + "acc_stderr": 0.028626547912437388, + "acc_norm": 0.2107843137254902, + "acc_norm_stderr": 0.028626547912437388 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.27848101265822783, + "acc_stderr": 0.029178682304842548, + "acc_norm": 0.27848101265822783, + "acc_norm_stderr": 0.029178682304842548 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.336322869955157, + "acc_stderr": 0.031708824268455, + "acc_norm": 0.336322869955157, + "acc_norm_stderr": 0.031708824268455 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.22900763358778625, + "acc_stderr": 0.036853466317118506, + "acc_norm": 0.22900763358778625, + "acc_norm_stderr": 0.036853466317118506 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.38016528925619836, + "acc_stderr": 0.04431324501968432, + "acc_norm": 0.38016528925619836, + "acc_norm_stderr": 0.04431324501968432 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.23148148148148148, + "acc_stderr": 0.04077494709252628, + "acc_norm": 0.23148148148148148, + "acc_norm_stderr": 0.04077494709252628 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.22699386503067484, + "acc_stderr": 0.032910995786157686, + "acc_norm": 0.22699386503067484, + "acc_norm_stderr": 0.032910995786157686 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.23214285714285715, + "acc_stderr": 0.04007341809755805, + "acc_norm": 0.23214285714285715, + "acc_norm_stderr": 0.04007341809755805 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.23300970873786409, + "acc_stderr": 0.041858325989283164, + "acc_norm": 0.23300970873786409, + "acc_norm_stderr": 0.041858325989283164 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.23076923076923078, + "acc_stderr": 0.027601921381417586, + "acc_norm": 0.23076923076923078, + "acc_norm_stderr": 0.027601921381417586 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.22, + "acc_stderr": 
0.0416333199893227, + "acc_norm": 0.22, + "acc_norm_stderr": 0.0416333199893227 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.26947637292464877, + "acc_stderr": 0.01586624307321505, + "acc_norm": 0.26947637292464877, + "acc_norm_stderr": 0.01586624307321505 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.21676300578034682, + "acc_stderr": 0.02218347766841286, + "acc_norm": 0.21676300578034682, + "acc_norm_stderr": 0.02218347766841286 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.25251396648044694, + "acc_stderr": 0.014530330201468641, + "acc_norm": 0.25251396648044694, + "acc_norm_stderr": 0.014530330201468641 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.22549019607843138, + "acc_stderr": 0.0239291555173513, + "acc_norm": 0.22549019607843138, + "acc_norm_stderr": 0.0239291555173513 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.24437299035369775, + "acc_stderr": 0.024406162094668893, + "acc_norm": 0.24437299035369775, + "acc_norm_stderr": 0.024406162094668893 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.2962962962962963, + "acc_stderr": 0.02540719779889016, + "acc_norm": 0.2962962962962963, + "acc_norm_stderr": 0.02540719779889016 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.2730496453900709, + "acc_stderr": 0.02657786094330785, + "acc_norm": 0.2730496453900709, + "acc_norm_stderr": 0.02657786094330785 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.24511082138200782, + "acc_stderr": 0.010986307870045514, + "acc_norm": 0.24511082138200782, + "acc_norm_stderr": 0.010986307870045514 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.1948529411764706, + "acc_stderr": 0.024060599423487424, + "acc_norm": 0.1948529411764706, + "acc_norm_stderr": 0.024060599423487424 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.24673202614379086, + "acc_stderr": 0.017440820367402507, + "acc_norm": 0.24673202614379086, + "acc_norm_stderr": 0.017440820367402507 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.23636363636363636, + "acc_stderr": 0.04069306319721378, + "acc_norm": 0.23636363636363636, + "acc_norm_stderr": 0.04069306319721378 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.20816326530612245, + "acc_stderr": 0.025991117672813292, + "acc_norm": 0.20816326530612245, + "acc_norm_stderr": 0.025991117672813292 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.22885572139303484, + "acc_stderr": 0.029705284056772432, + "acc_norm": 0.22885572139303484, + "acc_norm_stderr": 0.029705284056772432 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.19, + "acc_stderr": 0.03942772444036624, + "acc_norm": 0.19, + "acc_norm_stderr": 0.03942772444036624 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.19879518072289157, + "acc_stderr": 0.03106939026078942, + "acc_norm": 0.19879518072289157, + "acc_norm_stderr": 0.03106939026078942 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.23976608187134502, + "acc_stderr": 0.03274485211946956, + "acc_norm": 0.23976608187134502, + "acc_norm_stderr": 0.03274485211946956 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.204406364749082, + "mc1_stderr": 0.014117174337432618, + "mc2": 0.3392533208873607, + "mc2_stderr": 0.014078645743359227 + }, + "harness|winogrande|5": { + "acc": 0.584846093133386, + "acc_stderr": 0.013848684086658585 + }, + "harness|drop|3": { + "em": 0.003355704697986577, + "em_stderr": 0.0005922452850005238, + "f1": 
0.0555180369127516, + "f1_stderr": 0.0013765753121727882 + }, + "harness|gsm8k|5": { + "acc": 0.00530705079605762, + "acc_stderr": 0.0020013057209480375 + }, + "all": { + "acc": 0.253847590681609, + "acc_stderr": 0.030523099331108815, + "acc_norm": 0.25573186704882195, + "acc_norm_stderr": 0.031292815233613276, + "mc1": 0.204406364749082, + "mc1_stderr": 0.014117174337432618, + "mc2": 0.3392533208873607, + "mc2_stderr": 0.014078645743359227, + "em": 0.003355704697986577, + "em_stderr": 0.0005922452850005238, + "f1": 0.0555180369127516, + "f1_stderr": 0.0013765753121727882 + } + }, + "versions": { + "all": 0, + "harness|arc:challenge|25": 0, + "harness|drop|3": 1, + "harness|gsm8k|5": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + 
"harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "harness|winogrande|5": 0 + }, + "config_tasks": { + "harness|arc:challenge": "LM Harness task", + "harness|drop": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + 
"harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|arc:challenge|25": { + "hashes": { + "hash_examples": "17b0cae357c0259e", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "ca48d52265c0051f", + "hash_cont_tokens": "e8abf848493b50f7" + }, + "truncated": 0, + "non_truncated": 1172, + "padded": 4687, + "non_padded": 0, + "effective_few_shots": 25.0, + "num_truncated_few_shots": 0 + }, + "harness|hellaswag|10": { + "hashes": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "4975ded0ed31f702", + "hash_cont_tokens": "9fe0a5c42e1532db" + }, + "truncated": 0, + "non_truncated": 10042, + "padded": 40019, + "non_padded": 149, + "effective_few_shots": 10.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hashes": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "8ff523ec326d5d55", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-anatomy|5": { + "hashes": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "742bd6a389a8ef40", + "hash_cont_tokens": "f11971a765cb609f" + }, + "truncated": 0, + "non_truncated": 135, + "padded": 540, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-astronomy|5": { + "hashes": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "aa9743839c83bd9f", + "hash_cont_tokens": "440a970fadecdc7b" + }, + "truncated": 0, + "non_truncated": 152, + "padded": 608, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-business_ethics|5": { + "hashes": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "60f6ed52e2a2987a", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hashes": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "6080d9f3c5930be0", + "hash_cont_tokens": "7ecd60c25b9bfe5b" + }, + "truncated": 0, + "non_truncated": 265, + "padded": 1060, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_biology|5": { + "hashes": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "873319724ad65589", + "hash_cont_tokens": "875cde3af7a0ee14" + }, + "truncated": 0, + "non_truncated": 144, + "padded": 564, + "non_padded": 12, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + 
"harness|hendrycksTest-college_chemistry|5": { + "hashes": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "8366d04d12b154a7", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_computer_science|5": { + "hashes": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "1724a282fb269fd7", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_mathematics|5": { + "hashes": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "b7aa815781eae172", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_medicine|5": { + "hashes": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "0003d13e86bc8c1a", + "hash_cont_tokens": "702fb6d82ff0d6ac" + }, + "truncated": 0, + "non_truncated": 173, + "padded": 692, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_physics|5": { + "hashes": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "32b28762dd077c78", + "hash_cont_tokens": "f7b8097afc16a47c" + }, + "truncated": 0, + "non_truncated": 102, + "padded": 404, + "non_padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-computer_security|5": { + "hashes": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "19dd0e1895125d49", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hashes": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "761c7ce187b3338a", + "hash_cont_tokens": "aa0e8bc655f2f641" + }, + "truncated": 0, + "non_truncated": 235, + "padded": 940, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-econometrics|5": { + "hashes": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "dae74024ebc12b2b", + "hash_cont_tokens": "b1cc6e7e9fcd3827" + }, + "truncated": 0, + "non_truncated": 114, + "padded": 456, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hashes": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "5fa8050688a246ed", + "hash_cont_tokens": "2425a3f084a591ef" + }, + "truncated": 0, + "non_truncated": 145, + "padded": 580, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hashes": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": 
"5ec274c6c82aca23", + "hash_input_tokens": "2da3f8d7d1515cc6", + "hash_cont_tokens": "bd87bf0c060fd925" + }, + "truncated": 0, + "non_truncated": 378, + "padded": 1512, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-formal_logic|5": { + "hashes": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "907de61bbe46dada", + "hash_cont_tokens": "eb8932890e0605db" + }, + "truncated": 0, + "non_truncated": 126, + "padded": 504, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-global_facts|5": { + "hashes": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "d7549fe9ac133643", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_biology|5": { + "hashes": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "b449ae8cd622fb96", + "hash_cont_tokens": "1ddcb86d28cde266" + }, + "truncated": 0, + "non_truncated": 310, + "padded": 1240, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hashes": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "a447bd1574b5e26c", + "hash_cont_tokens": "176c8dcff38c5f8f" + }, + "truncated": 0, + "non_truncated": 203, + "padded": 812, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hashes": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "56312a0c3d85ae90", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hashes": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "5002f4ac8b1562ca", + "hash_cont_tokens": "674fc454bdc5ac93" + }, + "truncated": 0, + "non_truncated": 165, + "padded": 656, + "non_padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_geography|5": { + "hashes": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "b4f9efd054b0149d", + "hash_cont_tokens": "03a5012b916274ea" + }, + "truncated": 0, + "non_truncated": 198, + "padded": 792, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hashes": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "6e010d01707b5a01", + "hash_cont_tokens": "873d2aab226ba1d8" + }, + "truncated": 0, + "non_truncated": 193, + "padded": 772, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hashes": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "fc1f6e824ba386d7", + "hash_cont_tokens": 
"c583432ad27fcfe0" + }, + "truncated": 0, + "non_truncated": 390, + "padded": 1560, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hashes": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "3a485a40c8432ece", + "hash_cont_tokens": "d7907b61bcb8c123" + }, + "truncated": 0, + "non_truncated": 270, + "padded": 1080, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hashes": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "a7dd9ca4bbda3752", + "hash_cont_tokens": "f47f041de50333b9" + }, + "truncated": 0, + "non_truncated": 238, + "padded": 952, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_physics|5": { + "hashes": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "d7ea631399a73865", + "hash_cont_tokens": "0d56317b3e5eedb5" + }, + "truncated": 0, + "non_truncated": 151, + "padded": 604, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hashes": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "d12816cf88146011", + "hash_cont_tokens": "09ba1243e7390c0f" + }, + "truncated": 0, + "non_truncated": 545, + "padded": 2180, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hashes": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "9763ecaef4814c21", + "hash_cont_tokens": "9cc29889c3d3f77d" + }, + "truncated": 0, + "non_truncated": 216, + "padded": 864, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hashes": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "c639cce12a46ebad", + "hash_cont_tokens": "cdd0b3dc06d933e5" + }, + "truncated": 0, + "non_truncated": 204, + "padded": 816, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hashes": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "b9762065cce6f3a6", + "hash_cont_tokens": "e02816433ff28daf" + }, + "truncated": 0, + "non_truncated": 237, + "padded": 948, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_aging|5": { + "hashes": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "84157fee0b6d0f3c", + "hash_cont_tokens": "142a4a8a1138a214" + }, + "truncated": 0, + "non_truncated": 223, + "padded": 892, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_sexuality|5": { + "hashes": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "ade303e1ae3c016f", + "hash_cont_tokens": "bc54813e809b796d" + }, + "truncated": 0, + "non_truncated": 131, + "padded": 524, + "non_padded": 0, + 
"effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-international_law|5": { + "hashes": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "e5482e1c23c23d35", + "hash_cont_tokens": "8ea8c5ff76a15bca" + }, + "truncated": 0, + "non_truncated": 121, + "padded": 484, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-jurisprudence|5": { + "hashes": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "4415eeb9bad0507b", + "hash_cont_tokens": "e3a8cd951b6e3469" + }, + "truncated": 0, + "non_truncated": 108, + "padded": 432, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hashes": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "e6b5271422ecbaa8", + "hash_cont_tokens": "3e9e0bdc248fd88a" + }, + "truncated": 0, + "non_truncated": 163, + "padded": 644, + "non_padded": 8, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-machine_learning|5": { + "hashes": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "e719cb83196977d8", + "hash_cont_tokens": "55b12fb138c6a064" + }, + "truncated": 0, + "non_truncated": 112, + "padded": 448, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-management|5": { + "hashes": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "155da0e62b39e804", + "hash_cont_tokens": "a01d6d39a83c4597" + }, + "truncated": 0, + "non_truncated": 103, + "padded": 412, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-marketing|5": { + "hashes": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "38466c242259e6d3", + "hash_cont_tokens": "6aeaed4d823c98aa" + }, + "truncated": 0, + "non_truncated": 234, + "padded": 932, + "non_padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-medical_genetics|5": { + "hashes": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "0dd129e92538a7f6", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-miscellaneous|5": { + "hashes": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "d108a883fc3e022f", + "hash_cont_tokens": "9b0ab02a64603081" + }, + "truncated": 0, + "non_truncated": 783, + "padded": 3132, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_disputes|5": { + "hashes": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "0e7b7df82884a2d5", + "hash_cont_tokens": "3b8bbe9108e55ce9" + }, + "truncated": 0, + "non_truncated": 346, + "padded": 1364, + "non_padded": 20, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hashes": { + "hash_examples": "9873e077e83e0546", + 
"hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "7c220f5613cd8426", + "hash_cont_tokens": "3e9bfc0362e97330" + }, + "truncated": 0, + "non_truncated": 895, + "padded": 3580, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-nutrition|5": { + "hashes": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "35de1609a9a763a9", + "hash_cont_tokens": "23b2dc6ee2da4cfc" + }, + "truncated": 0, + "non_truncated": 306, + "padded": 1224, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-philosophy|5": { + "hashes": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "a1dcfa9c80490d06", + "hash_cont_tokens": "9f6ff69d23a48783" + }, + "truncated": 0, + "non_truncated": 311, + "padded": 1244, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-prehistory|5": { + "hashes": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "a091cf645d2415e0", + "hash_cont_tokens": "d6458d743d875837" + }, + "truncated": 0, + "non_truncated": 324, + "padded": 1296, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_accounting|5": { + "hashes": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "e9df32a33f85290c", + "hash_cont_tokens": "922a195f53a35662" + }, + "truncated": 0, + "non_truncated": 282, + "padded": 1128, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_law|5": { + "hashes": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "c9f7583fff66d361", + "hash_cont_tokens": "2e590029ef41fbcd" + }, + "truncated": 0, + "non_truncated": 1534, + "padded": 6136, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_medicine|5": { + "hashes": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "40a933f829116f8d", + "hash_cont_tokens": "7cfee54dbddd5a98" + }, + "truncated": 0, + "non_truncated": 272, + "padded": 1088, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_psychology|5": { + "hashes": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "0f6a92c3a2062b48", + "hash_cont_tokens": "a86677b2a45c20e1" + }, + "truncated": 0, + "non_truncated": 612, + "padded": 2448, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-public_relations|5": { + "hashes": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "29a08e9bfbe9b2f0", + "hash_cont_tokens": "0d756ccaae031757" + }, + "truncated": 0, + "non_truncated": 110, + "padded": 440, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-security_studies|5": { + "hashes": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "32a03f1f22a6e103", + "hash_cont_tokens": "b2229bc2cfbf594b" + }, + "truncated": 0, + 
"non_truncated": 245, + "padded": 980, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-sociology|5": { + "hashes": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "1de5c52d2b2831d7", + "hash_cont_tokens": "c3a3bdfd177eed5b" + }, + "truncated": 0, + "non_truncated": 201, + "padded": 800, + "non_padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hashes": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "add924961f7f4146", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-virology|5": { + "hashes": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "e0653601c466b1bc", + "hash_cont_tokens": "af8b3658088cb37f" + }, + "truncated": 0, + "non_truncated": 166, + "padded": 664, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-world_religions|5": { + "hashes": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "ac600d612445156d", + "hash_cont_tokens": "060118bef6de4e0a" + }, + "truncated": 0, + "non_truncated": 171, + "padded": 684, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|truthfulqa:mc|0": { + "hashes": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "a03ce28b7fd06aa7", + "hash_cont_tokens": "f5da56a132aab151" + }, + "truncated": 0, + "non_truncated": 817, + "padded": 9996, + "non_padded": 0, + "effective_few_shots": 0.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "72067255e368e24e", + "hash_cont_tokens": "f08975ad6f2d5864" + }, + "truncated": 0, + "non_truncated": 1267, + "padded": 2534, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|drop|3": { + "hashes": { + "hash_examples": "1d27416e8324e9a3", + "hash_full_prompts": "a5513ff9a741b385", + "hash_input_tokens": "42076f0efbb50aa6", + "hash_cont_tokens": "afb0fa590d8f301d" + }, + "truncated": 3, + "non_truncated": 9533, + "padded": 0, + "non_padded": 9536, + "effective_few_shots": 3.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "bda342e47b5099b2", + "hash_cont_tokens": "904568760f7555a7" + }, + "truncated": 0, + "non_truncated": 1319, + "padded": 0, + "non_padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "4eb459f19fc0f29d", + "hash_full_prompts": "21653ed56f202b4e", + "hash_input_tokens": "379266f3a5365f9d", + "hash_cont_tokens": "83d01c3a76dcea2f" + }, + "truncated": 3, + "non_truncated": 38192, + "padded": 113348, + "non_padded": 11060, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/vihangd/shearedplats-2.7b-v2/results_2023-11-19T15-14-51.109565.json 
b/eval-results/vihangd/shearedplats-2.7b-v2/results_2023-11-19T15-14-51.109565.json new file mode 100644 index 0000000000000000000000000000000000000000..de1f53ab6a45ccebfe92c46cceaa208365a354e1 --- /dev/null +++ b/eval-results/vihangd/shearedplats-2.7b-v2/results_2023-11-19T15-14-51.109565.json @@ -0,0 +1,1435 @@ +{ + "config_general": { + "lighteval_sha": "9ffc410f6c40b8cfefe7167cb47aefe69ced61e1", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "", + "start_time": 247888.51684059, + "end_time": 260993.166587484, + "total_evaluation_time_secondes": "13104.649746894022", + "model_name": "vihangd/shearedplats-2.7b-v2", + "model_sha": "2837296f28d6aa0fb6c1fe382f553e65c8e1e5f3", + "model_dtype": "torch.float16", + "model_size": "5.09 GB" + }, + "results": { + "harness|arc:challenge|25": { + "acc": 0.38993174061433444, + "acc_stderr": 0.014252959848892884, + "acc_norm": 0.42406143344709896, + "acc_norm_stderr": 0.014441889627464401 + }, + "harness|hellaswag|10": { + "acc": 0.5428201553475404, + "acc_stderr": 0.0049714495527871765, + "acc_norm": 0.7257518422624976, + "acc_norm_stderr": 0.004452228541043551 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.25, + "acc_stderr": 0.04351941398892446, + "acc_norm": 0.25, + "acc_norm_stderr": 0.04351941398892446 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.34074074074074073, + "acc_stderr": 0.04094376269996794, + "acc_norm": 0.34074074074074073, + "acc_norm_stderr": 0.04094376269996794 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.19736842105263158, + "acc_stderr": 0.03238981601699397, + "acc_norm": 0.19736842105263158, + "acc_norm_stderr": 0.03238981601699397 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.4, + "acc_stderr": 0.04923659639173309, + "acc_norm": 0.4, + "acc_norm_stderr": 0.04923659639173309 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.30943396226415093, + "acc_stderr": 0.028450154794118627, + "acc_norm": 0.30943396226415093, + "acc_norm_stderr": 0.028450154794118627 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.3055555555555556, + "acc_stderr": 0.03852084696008534, + "acc_norm": 0.3055555555555556, + "acc_norm_stderr": 0.03852084696008534 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.19, + "acc_stderr": 0.03942772444036625, + "acc_norm": 0.19, + "acc_norm_stderr": 0.03942772444036625 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.27, + "acc_stderr": 0.0446196043338474, + "acc_norm": 0.27, + "acc_norm_stderr": 0.0446196043338474 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.21, + "acc_stderr": 0.040936018074033256, + "acc_norm": 0.21, + "acc_norm_stderr": 0.040936018074033256 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.17341040462427745, + "acc_stderr": 0.02886810787497064, + "acc_norm": 0.17341040462427745, + "acc_norm_stderr": 0.02886810787497064 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.21568627450980393, + "acc_stderr": 0.04092563958237654, + "acc_norm": 0.21568627450980393, + "acc_norm_stderr": 0.04092563958237654 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.41, + "acc_stderr": 0.04943110704237101, + "acc_norm": 0.41, + "acc_norm_stderr": 0.04943110704237101 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.2851063829787234, + "acc_stderr": 0.029513196625539355, + "acc_norm": 0.2851063829787234, + "acc_norm_stderr": 0.029513196625539355 + 
}, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.22807017543859648, + "acc_stderr": 0.03947152782669415, + "acc_norm": 0.22807017543859648, + "acc_norm_stderr": 0.03947152782669415 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.2827586206896552, + "acc_stderr": 0.037528339580033376, + "acc_norm": 0.2827586206896552, + "acc_norm_stderr": 0.037528339580033376 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.1984126984126984, + "acc_stderr": 0.02053948126188688, + "acc_norm": 0.1984126984126984, + "acc_norm_stderr": 0.02053948126188688 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.2777777777777778, + "acc_stderr": 0.04006168083848877, + "acc_norm": 0.2777777777777778, + "acc_norm_stderr": 0.04006168083848877 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.26, + "acc_stderr": 0.04408440022768079, + "acc_norm": 0.26, + "acc_norm_stderr": 0.04408440022768079 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.2838709677419355, + "acc_stderr": 0.025649381063029254, + "acc_norm": 0.2838709677419355, + "acc_norm_stderr": 0.025649381063029254 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.23645320197044334, + "acc_stderr": 0.02989611429173355, + "acc_norm": 0.23645320197044334, + "acc_norm_stderr": 0.02989611429173355 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.25, + "acc_stderr": 0.04351941398892446, + "acc_norm": 0.25, + "acc_norm_stderr": 0.04351941398892446 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.3393939393939394, + "acc_stderr": 0.03697442205031595, + "acc_norm": 0.3393939393939394, + "acc_norm_stderr": 0.03697442205031595 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.25252525252525254, + "acc_stderr": 0.030954055470365897, + "acc_norm": 0.25252525252525254, + "acc_norm_stderr": 0.030954055470365897 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.2849740932642487, + "acc_stderr": 0.03257714077709662, + "acc_norm": 0.2849740932642487, + "acc_norm_stderr": 0.03257714077709662 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.24358974358974358, + "acc_stderr": 0.021763733684173923, + "acc_norm": 0.24358974358974358, + "acc_norm_stderr": 0.021763733684173923 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.2037037037037037, + "acc_stderr": 0.024556172219141272, + "acc_norm": 0.2037037037037037, + "acc_norm_stderr": 0.024556172219141272 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.2184873949579832, + "acc_stderr": 0.02684151432295896, + "acc_norm": 0.2184873949579832, + "acc_norm_stderr": 0.02684151432295896 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.24503311258278146, + "acc_stderr": 0.03511807571804725, + "acc_norm": 0.24503311258278146, + "acc_norm_stderr": 0.03511807571804725 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.29174311926605506, + "acc_stderr": 0.01948930096887653, + "acc_norm": 0.29174311926605506, + "acc_norm_stderr": 0.01948930096887653 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.2361111111111111, + "acc_stderr": 0.02896370257079102, + "acc_norm": 0.2361111111111111, + "acc_norm_stderr": 0.02896370257079102 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.2549019607843137, + "acc_stderr": 0.030587591351604243, + "acc_norm": 0.2549019607843137, + "acc_norm_stderr": 
0.030587591351604243 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.2742616033755274, + "acc_stderr": 0.02904133351059804, + "acc_norm": 0.2742616033755274, + "acc_norm_stderr": 0.02904133351059804 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.3721973094170404, + "acc_stderr": 0.03244305283008731, + "acc_norm": 0.3721973094170404, + "acc_norm_stderr": 0.03244305283008731 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.2595419847328244, + "acc_stderr": 0.03844876139785271, + "acc_norm": 0.2595419847328244, + "acc_norm_stderr": 0.03844876139785271 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.371900826446281, + "acc_stderr": 0.04412015806624504, + "acc_norm": 0.371900826446281, + "acc_norm_stderr": 0.04412015806624504 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.3333333333333333, + "acc_stderr": 0.04557239513497751, + "acc_norm": 0.3333333333333333, + "acc_norm_stderr": 0.04557239513497751 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.26993865030674846, + "acc_stderr": 0.034878251684978906, + "acc_norm": 0.26993865030674846, + "acc_norm_stderr": 0.034878251684978906 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.29464285714285715, + "acc_stderr": 0.04327040932578728, + "acc_norm": 0.29464285714285715, + "acc_norm_stderr": 0.04327040932578728 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.23300970873786409, + "acc_stderr": 0.041858325989283136, + "acc_norm": 0.23300970873786409, + "acc_norm_stderr": 0.041858325989283136 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.28205128205128205, + "acc_stderr": 0.02948036054954119, + "acc_norm": 0.28205128205128205, + "acc_norm_stderr": 0.02948036054954119 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.33, + "acc_stderr": 0.04725815626252604, + "acc_norm": 0.33, + "acc_norm_stderr": 0.04725815626252604 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.3448275862068966, + "acc_stderr": 0.016997123346113436, + "acc_norm": 0.3448275862068966, + "acc_norm_stderr": 0.016997123346113436 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.2774566473988439, + "acc_stderr": 0.024105712607754307, + "acc_norm": 0.2774566473988439, + "acc_norm_stderr": 0.024105712607754307 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.2435754189944134, + "acc_stderr": 0.014355911964767865, + "acc_norm": 0.2435754189944134, + "acc_norm_stderr": 0.014355911964767865 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.28431372549019607, + "acc_stderr": 0.025829163272757475, + "acc_norm": 0.28431372549019607, + "acc_norm_stderr": 0.025829163272757475 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.3022508038585209, + "acc_stderr": 0.02608270069539966, + "acc_norm": 0.3022508038585209, + "acc_norm_stderr": 0.02608270069539966 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.3117283950617284, + "acc_stderr": 0.025773111169630453, + "acc_norm": 0.3117283950617284, + "acc_norm_stderr": 0.025773111169630453 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.23049645390070922, + "acc_stderr": 0.025123739226872405, + "acc_norm": 0.23049645390070922, + "acc_norm_stderr": 0.025123739226872405 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.2653194263363755, + "acc_stderr": 0.011276198843958871, + "acc_norm": 0.2653194263363755, + "acc_norm_stderr": 0.011276198843958871 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 
0.1801470588235294, + "acc_stderr": 0.02334516361654485, + "acc_norm": 0.1801470588235294, + "acc_norm_stderr": 0.02334516361654485 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.27941176470588236, + "acc_stderr": 0.018152871051538816, + "acc_norm": 0.27941176470588236, + "acc_norm_stderr": 0.018152871051538816 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.2909090909090909, + "acc_stderr": 0.04350271442923243, + "acc_norm": 0.2909090909090909, + "acc_norm_stderr": 0.04350271442923243 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.19591836734693877, + "acc_stderr": 0.025409301953225678, + "acc_norm": 0.19591836734693877, + "acc_norm_stderr": 0.025409301953225678 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.25870646766169153, + "acc_stderr": 0.030965903123573037, + "acc_norm": 0.25870646766169153, + "acc_norm_stderr": 0.030965903123573037 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.36, + "acc_stderr": 0.04824181513244218, + "acc_norm": 0.36, + "acc_norm_stderr": 0.04824181513244218 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.3253012048192771, + "acc_stderr": 0.036471685236832266, + "acc_norm": 0.3253012048192771, + "acc_norm_stderr": 0.036471685236832266 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.3684210526315789, + "acc_stderr": 0.036996580176568775, + "acc_norm": 0.3684210526315789, + "acc_norm_stderr": 0.036996580176568775 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.2668298653610771, + "mc1_stderr": 0.015483691939237265, + "mc2": 0.3975561831004256, + "mc2_stderr": 0.01443999930404212 + }, + "harness|winogrande|5": { + "acc": 0.659037095501184, + "acc_stderr": 0.013322681435934786 + }, + "harness|drop|3": { + "em": 0.02097315436241611, + "em_stderr": 0.0014674686372139715, + "f1": 0.07344798657718132, + "f1_stderr": 0.0018673519634175401 + }, + "harness|gsm8k|5": { + "acc": 0.015163002274450341, + "acc_stderr": 0.003366022949726365 + }, + "all": { + "acc": 0.2834708463666077, + "acc_stderr": 0.0316438041216984, + "acc_norm": 0.28533191373449407, + "acc_norm_stderr": 0.03242801789499609, + "mc1": 0.2668298653610771, + "mc1_stderr": 0.015483691939237265, + "mc2": 0.3975561831004256, + "mc2_stderr": 0.01443999930404212, + "em": 0.02097315436241611, + "em_stderr": 0.0014674686372139715, + "f1": 0.07344798657718132, + "f1_stderr": 0.0018673519634175401 + } + }, + "versions": { + "all": 0, + "harness|arc:challenge|25": 0, + "harness|drop|3": 1, + "harness|gsm8k|5": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + 
"harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "harness|winogrande|5": 0 + }, + "config_tasks": { + "harness|arc:challenge": "LM Harness task", + "harness|drop": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + 
"harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|arc:challenge|25": { + "hashes": { + "hash_examples": "17b0cae357c0259e", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "ca48d52265c0051f", + "hash_cont_tokens": "e8abf848493b50f7" + }, + "truncated": 0, + "non_truncated": 1172, + "padded": 4687, + "non_padded": 0, + "effective_few_shots": 25.0, + "num_truncated_few_shots": 0 + }, + "harness|hellaswag|10": { + "hashes": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "4975ded0ed31f702", + "hash_cont_tokens": "9fe0a5c42e1532db" + }, + "truncated": 0, + "non_truncated": 10042, + "padded": 40019, + "non_padded": 149, + "effective_few_shots": 10.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hashes": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "8ff523ec326d5d55", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-anatomy|5": { + "hashes": { + "hash_examples": "2f83a4f1cab4ba18", 
+ "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "742bd6a389a8ef40", + "hash_cont_tokens": "f11971a765cb609f" + }, + "truncated": 0, + "non_truncated": 135, + "padded": 540, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-astronomy|5": { + "hashes": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "aa9743839c83bd9f", + "hash_cont_tokens": "440a970fadecdc7b" + }, + "truncated": 0, + "non_truncated": 152, + "padded": 608, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-business_ethics|5": { + "hashes": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "60f6ed52e2a2987a", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hashes": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "6080d9f3c5930be0", + "hash_cont_tokens": "7ecd60c25b9bfe5b" + }, + "truncated": 0, + "non_truncated": 265, + "padded": 1060, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_biology|5": { + "hashes": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "873319724ad65589", + "hash_cont_tokens": "875cde3af7a0ee14" + }, + "truncated": 0, + "non_truncated": 144, + "padded": 564, + "non_padded": 12, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_chemistry|5": { + "hashes": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "8366d04d12b154a7", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_computer_science|5": { + "hashes": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "1724a282fb269fd7", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_mathematics|5": { + "hashes": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "b7aa815781eae172", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_medicine|5": { + "hashes": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "0003d13e86bc8c1a", + "hash_cont_tokens": "702fb6d82ff0d6ac" + }, + "truncated": 0, + "non_truncated": 173, + "padded": 692, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_physics|5": { + "hashes": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "32b28762dd077c78", + "hash_cont_tokens": "f7b8097afc16a47c" + }, + "truncated": 0, + 
"non_truncated": 102, + "padded": 404, + "non_padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-computer_security|5": { + "hashes": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "19dd0e1895125d49", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hashes": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "761c7ce187b3338a", + "hash_cont_tokens": "aa0e8bc655f2f641" + }, + "truncated": 0, + "non_truncated": 235, + "padded": 940, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-econometrics|5": { + "hashes": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "dae74024ebc12b2b", + "hash_cont_tokens": "b1cc6e7e9fcd3827" + }, + "truncated": 0, + "non_truncated": 114, + "padded": 456, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hashes": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "5fa8050688a246ed", + "hash_cont_tokens": "2425a3f084a591ef" + }, + "truncated": 0, + "non_truncated": 145, + "padded": 580, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hashes": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "2da3f8d7d1515cc6", + "hash_cont_tokens": "bd87bf0c060fd925" + }, + "truncated": 0, + "non_truncated": 378, + "padded": 1512, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-formal_logic|5": { + "hashes": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "907de61bbe46dada", + "hash_cont_tokens": "eb8932890e0605db" + }, + "truncated": 0, + "non_truncated": 126, + "padded": 504, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-global_facts|5": { + "hashes": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "d7549fe9ac133643", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_biology|5": { + "hashes": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "b449ae8cd622fb96", + "hash_cont_tokens": "1ddcb86d28cde266" + }, + "truncated": 0, + "non_truncated": 310, + "padded": 1240, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hashes": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "a447bd1574b5e26c", + "hash_cont_tokens": "176c8dcff38c5f8f" + }, + "truncated": 0, + "non_truncated": 203, + "padded": 812, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + 
"harness|hendrycksTest-high_school_computer_science|5": { + "hashes": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "56312a0c3d85ae90", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hashes": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "5002f4ac8b1562ca", + "hash_cont_tokens": "674fc454bdc5ac93" + }, + "truncated": 0, + "non_truncated": 165, + "padded": 656, + "non_padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_geography|5": { + "hashes": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "b4f9efd054b0149d", + "hash_cont_tokens": "03a5012b916274ea" + }, + "truncated": 0, + "non_truncated": 198, + "padded": 792, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hashes": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "6e010d01707b5a01", + "hash_cont_tokens": "873d2aab226ba1d8" + }, + "truncated": 0, + "non_truncated": 193, + "padded": 772, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hashes": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "fc1f6e824ba386d7", + "hash_cont_tokens": "c583432ad27fcfe0" + }, + "truncated": 0, + "non_truncated": 390, + "padded": 1560, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hashes": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "3a485a40c8432ece", + "hash_cont_tokens": "d7907b61bcb8c123" + }, + "truncated": 0, + "non_truncated": 270, + "padded": 1080, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hashes": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "a7dd9ca4bbda3752", + "hash_cont_tokens": "f47f041de50333b9" + }, + "truncated": 0, + "non_truncated": 238, + "padded": 952, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_physics|5": { + "hashes": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "d7ea631399a73865", + "hash_cont_tokens": "0d56317b3e5eedb5" + }, + "truncated": 0, + "non_truncated": 151, + "padded": 604, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hashes": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "d12816cf88146011", + "hash_cont_tokens": "09ba1243e7390c0f" + }, + "truncated": 0, + "non_truncated": 545, + "padded": 2180, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hashes": { + 
"hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "9763ecaef4814c21", + "hash_cont_tokens": "9cc29889c3d3f77d" + }, + "truncated": 0, + "non_truncated": 216, + "padded": 864, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hashes": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "c639cce12a46ebad", + "hash_cont_tokens": "cdd0b3dc06d933e5" + }, + "truncated": 0, + "non_truncated": 204, + "padded": 816, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hashes": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "b9762065cce6f3a6", + "hash_cont_tokens": "e02816433ff28daf" + }, + "truncated": 0, + "non_truncated": 237, + "padded": 948, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_aging|5": { + "hashes": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "84157fee0b6d0f3c", + "hash_cont_tokens": "142a4a8a1138a214" + }, + "truncated": 0, + "non_truncated": 223, + "padded": 892, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_sexuality|5": { + "hashes": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "ade303e1ae3c016f", + "hash_cont_tokens": "bc54813e809b796d" + }, + "truncated": 0, + "non_truncated": 131, + "padded": 524, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-international_law|5": { + "hashes": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "e5482e1c23c23d35", + "hash_cont_tokens": "8ea8c5ff76a15bca" + }, + "truncated": 0, + "non_truncated": 121, + "padded": 484, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-jurisprudence|5": { + "hashes": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "4415eeb9bad0507b", + "hash_cont_tokens": "e3a8cd951b6e3469" + }, + "truncated": 0, + "non_truncated": 108, + "padded": 432, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hashes": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "e6b5271422ecbaa8", + "hash_cont_tokens": "3e9e0bdc248fd88a" + }, + "truncated": 0, + "non_truncated": 163, + "padded": 644, + "non_padded": 8, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-machine_learning|5": { + "hashes": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "e719cb83196977d8", + "hash_cont_tokens": "55b12fb138c6a064" + }, + "truncated": 0, + "non_truncated": 112, + "padded": 448, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-management|5": { + "hashes": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "155da0e62b39e804", + "hash_cont_tokens": 
"a01d6d39a83c4597" + }, + "truncated": 0, + "non_truncated": 103, + "padded": 412, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-marketing|5": { + "hashes": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "38466c242259e6d3", + "hash_cont_tokens": "6aeaed4d823c98aa" + }, + "truncated": 0, + "non_truncated": 234, + "padded": 932, + "non_padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-medical_genetics|5": { + "hashes": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "0dd129e92538a7f6", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-miscellaneous|5": { + "hashes": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "d108a883fc3e022f", + "hash_cont_tokens": "9b0ab02a64603081" + }, + "truncated": 0, + "non_truncated": 783, + "padded": 3132, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_disputes|5": { + "hashes": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "0e7b7df82884a2d5", + "hash_cont_tokens": "3b8bbe9108e55ce9" + }, + "truncated": 0, + "non_truncated": 346, + "padded": 1364, + "non_padded": 20, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hashes": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "7c220f5613cd8426", + "hash_cont_tokens": "3e9bfc0362e97330" + }, + "truncated": 0, + "non_truncated": 895, + "padded": 3580, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-nutrition|5": { + "hashes": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "35de1609a9a763a9", + "hash_cont_tokens": "23b2dc6ee2da4cfc" + }, + "truncated": 0, + "non_truncated": 306, + "padded": 1224, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-philosophy|5": { + "hashes": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "a1dcfa9c80490d06", + "hash_cont_tokens": "9f6ff69d23a48783" + }, + "truncated": 0, + "non_truncated": 311, + "padded": 1244, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-prehistory|5": { + "hashes": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "a091cf645d2415e0", + "hash_cont_tokens": "d6458d743d875837" + }, + "truncated": 0, + "non_truncated": 324, + "padded": 1296, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_accounting|5": { + "hashes": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "e9df32a33f85290c", + "hash_cont_tokens": "922a195f53a35662" + }, + "truncated": 0, + "non_truncated": 282, + "padded": 1128, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + 
"harness|hendrycksTest-professional_law|5": { + "hashes": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "c9f7583fff66d361", + "hash_cont_tokens": "2e590029ef41fbcd" + }, + "truncated": 0, + "non_truncated": 1534, + "padded": 6136, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_medicine|5": { + "hashes": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "40a933f829116f8d", + "hash_cont_tokens": "7cfee54dbddd5a98" + }, + "truncated": 0, + "non_truncated": 272, + "padded": 1088, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_psychology|5": { + "hashes": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "0f6a92c3a2062b48", + "hash_cont_tokens": "a86677b2a45c20e1" + }, + "truncated": 0, + "non_truncated": 612, + "padded": 2448, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-public_relations|5": { + "hashes": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "29a08e9bfbe9b2f0", + "hash_cont_tokens": "0d756ccaae031757" + }, + "truncated": 0, + "non_truncated": 110, + "padded": 440, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-security_studies|5": { + "hashes": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "32a03f1f22a6e103", + "hash_cont_tokens": "b2229bc2cfbf594b" + }, + "truncated": 0, + "non_truncated": 245, + "padded": 980, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-sociology|5": { + "hashes": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "1de5c52d2b2831d7", + "hash_cont_tokens": "c3a3bdfd177eed5b" + }, + "truncated": 0, + "non_truncated": 201, + "padded": 800, + "non_padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hashes": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "add924961f7f4146", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-virology|5": { + "hashes": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "e0653601c466b1bc", + "hash_cont_tokens": "af8b3658088cb37f" + }, + "truncated": 0, + "non_truncated": 166, + "padded": 664, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-world_religions|5": { + "hashes": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "ac600d612445156d", + "hash_cont_tokens": "060118bef6de4e0a" + }, + "truncated": 0, + "non_truncated": 171, + "padded": 684, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|truthfulqa:mc|0": { + "hashes": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": 
"a03ce28b7fd06aa7", + "hash_cont_tokens": "f5da56a132aab151" + }, + "truncated": 0, + "non_truncated": 817, + "padded": 9996, + "non_padded": 0, + "effective_few_shots": 0.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "72067255e368e24e", + "hash_cont_tokens": "f08975ad6f2d5864" + }, + "truncated": 0, + "non_truncated": 1267, + "padded": 2534, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|drop|3": { + "hashes": { + "hash_examples": "1d27416e8324e9a3", + "hash_full_prompts": "a5513ff9a741b385", + "hash_input_tokens": "42076f0efbb50aa6", + "hash_cont_tokens": "7fb710e5862d018e" + }, + "truncated": 3, + "non_truncated": 9533, + "padded": 0, + "non_padded": 9536, + "effective_few_shots": 3.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "bda342e47b5099b2", + "hash_cont_tokens": "615bb7b95330de62" + }, + "truncated": 0, + "non_truncated": 1319, + "padded": 0, + "non_padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "4eb459f19fc0f29d", + "hash_full_prompts": "21653ed56f202b4e", + "hash_input_tokens": "379266f3a5365f9d", + "hash_cont_tokens": "3c36803fb642abcb" + }, + "truncated": 3, + "non_truncated": 38192, + "padded": 113348, + "non_padded": 11060, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/vihangd/smartyplats-3b-v1/results_2023-09-13T04-45-46.348158.json b/eval-results/vihangd/smartyplats-3b-v1/results_2023-09-13T04-45-46.348158.json new file mode 100644 index 0000000000000000000000000000000000000000..2720f0849503c9191f7e3371546fe4eeb092f85e --- /dev/null +++ b/eval-results/vihangd/smartyplats-3b-v1/results_2023-09-13T04-45-46.348158.json @@ -0,0 +1,1367 @@ +{ + "config_general": { + "model_name": "vihangd/smartyplats-3b-v1", + "model_sha": "89272b9edb323f5ace09e097a6449554c0dcd4e7", + "model_size": "6.4 GB", + "model_dtype": "torch.float16", + "lighteval_sha": "ff467795ccc45b291b69333c263d5f16abd1fcd9", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|arc:challenge|25": { + "acc": 0.3779863481228669, + "acc_stderr": 0.014169664520303103, + "acc_norm": 0.4052901023890785, + "acc_norm_stderr": 0.014346869060229321 + }, + "harness|hellaswag|10": { + "acc": 0.5236008763194583, + "acc_stderr": 0.004984219681732655, + "acc_norm": 0.7085241983668592, + "acc_norm_stderr": 0.004535133886462033 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.23, + "acc_stderr": 0.04229525846816505, + "acc_norm": 0.23, + "acc_norm_stderr": 0.04229525846816505 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.17777777777777778, + "acc_stderr": 0.033027898599017176, + "acc_norm": 0.17777777777777778, + "acc_norm_stderr": 0.033027898599017176 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.25, + "acc_stderr": 0.03523807393012047, + "acc_norm": 0.25, + "acc_norm_stderr": 0.03523807393012047 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.31, + "acc_stderr": 0.04648231987117316, + "acc_norm": 0.31, + "acc_norm_stderr": 0.04648231987117316 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.2339622641509434, + 
"acc_stderr": 0.02605529690115292, + "acc_norm": 0.2339622641509434, + "acc_norm_stderr": 0.02605529690115292 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.24305555555555555, + "acc_stderr": 0.03586879280080339, + "acc_norm": 0.24305555555555555, + "acc_norm_stderr": 0.03586879280080339 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.16, + "acc_stderr": 0.03684529491774708, + "acc_norm": 0.16, + "acc_norm_stderr": 0.03684529491774708 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.29, + "acc_stderr": 0.04560480215720684, + "acc_norm": 0.29, + "acc_norm_stderr": 0.04560480215720684 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.28, + "acc_stderr": 0.045126085985421276, + "acc_norm": 0.28, + "acc_norm_stderr": 0.045126085985421276 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.23699421965317918, + "acc_stderr": 0.03242414757483098, + "acc_norm": 0.23699421965317918, + "acc_norm_stderr": 0.03242414757483098 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.23529411764705882, + "acc_stderr": 0.04220773659171452, + "acc_norm": 0.23529411764705882, + "acc_norm_stderr": 0.04220773659171452 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.33, + "acc_stderr": 0.047258156262526045, + "acc_norm": 0.33, + "acc_norm_stderr": 0.047258156262526045 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.32340425531914896, + "acc_stderr": 0.030579442773610334, + "acc_norm": 0.32340425531914896, + "acc_norm_stderr": 0.030579442773610334 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.3157894736842105, + "acc_stderr": 0.04372748290278008, + "acc_norm": 0.3157894736842105, + "acc_norm_stderr": 0.04372748290278008 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.25517241379310346, + "acc_stderr": 0.03632984052707842, + "acc_norm": 0.25517241379310346, + "acc_norm_stderr": 0.03632984052707842 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.21164021164021163, + "acc_stderr": 0.021037331505262883, + "acc_norm": 0.21164021164021163, + "acc_norm_stderr": 0.021037331505262883 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.2857142857142857, + "acc_stderr": 0.04040610178208841, + "acc_norm": 0.2857142857142857, + "acc_norm_stderr": 0.04040610178208841 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.33, + "acc_stderr": 0.047258156262526045, + "acc_norm": 0.33, + "acc_norm_stderr": 0.047258156262526045 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.2064516129032258, + "acc_stderr": 0.02302589961718872, + "acc_norm": 0.2064516129032258, + "acc_norm_stderr": 0.02302589961718872 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.17733990147783252, + "acc_stderr": 0.026874337276808345, + "acc_norm": 0.17733990147783252, + "acc_norm_stderr": 0.026874337276808345 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.26, + "acc_stderr": 0.04408440022768078, + "acc_norm": 0.26, + "acc_norm_stderr": 0.04408440022768078 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.26666666666666666, + "acc_stderr": 0.03453131801885416, + "acc_norm": 0.26666666666666666, + "acc_norm_stderr": 0.03453131801885416 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.1919191919191919, + "acc_stderr": 0.028057791672989017, + "acc_norm": 0.1919191919191919, + "acc_norm_stderr": 0.028057791672989017 + }, + 
"harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.20725388601036268, + "acc_stderr": 0.029252823291803613, + "acc_norm": 0.20725388601036268, + "acc_norm_stderr": 0.029252823291803613 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.2205128205128205, + "acc_stderr": 0.02102067268082791, + "acc_norm": 0.2205128205128205, + "acc_norm_stderr": 0.02102067268082791 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.2518518518518518, + "acc_stderr": 0.026466117538959912, + "acc_norm": 0.2518518518518518, + "acc_norm_stderr": 0.026466117538959912 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.21428571428571427, + "acc_stderr": 0.02665353159671548, + "acc_norm": 0.21428571428571427, + "acc_norm_stderr": 0.02665353159671548 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.23841059602649006, + "acc_stderr": 0.03479185572599659, + "acc_norm": 0.23841059602649006, + "acc_norm_stderr": 0.03479185572599659 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.23853211009174313, + "acc_stderr": 0.01827257581023187, + "acc_norm": 0.23853211009174313, + "acc_norm_stderr": 0.01827257581023187 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.18055555555555555, + "acc_stderr": 0.026232878971491652, + "acc_norm": 0.18055555555555555, + "acc_norm_stderr": 0.026232878971491652 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.24509803921568626, + "acc_stderr": 0.03019028245350195, + "acc_norm": 0.24509803921568626, + "acc_norm_stderr": 0.03019028245350195 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.26582278481012656, + "acc_stderr": 0.02875679962965834, + "acc_norm": 0.26582278481012656, + "acc_norm_stderr": 0.02875679962965834 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.336322869955157, + "acc_stderr": 0.031708824268455, + "acc_norm": 0.336322869955157, + "acc_norm_stderr": 0.031708824268455 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.2366412213740458, + "acc_stderr": 0.03727673575596919, + "acc_norm": 0.2366412213740458, + "acc_norm_stderr": 0.03727673575596919 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.2396694214876033, + "acc_stderr": 0.03896878985070417, + "acc_norm": 0.2396694214876033, + "acc_norm_stderr": 0.03896878985070417 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.25, + "acc_stderr": 0.04186091791394607, + "acc_norm": 0.25, + "acc_norm_stderr": 0.04186091791394607 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.2392638036809816, + "acc_stderr": 0.033519538795212696, + "acc_norm": 0.2392638036809816, + "acc_norm_stderr": 0.033519538795212696 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.24107142857142858, + "acc_stderr": 0.04059867246952687, + "acc_norm": 0.24107142857142858, + "acc_norm_stderr": 0.04059867246952687 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.17475728155339806, + "acc_stderr": 0.037601780060266224, + "acc_norm": 0.17475728155339806, + "acc_norm_stderr": 0.037601780060266224 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.3034188034188034, + "acc_stderr": 0.03011821010694266, + "acc_norm": 0.3034188034188034, + "acc_norm_stderr": 0.03011821010694266 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.3, + "acc_stderr": 0.04605661864718381, + "acc_norm": 0.3, + "acc_norm_stderr": 0.04605661864718381 + }, + "harness|hendrycksTest-miscellaneous|5": { 
+ "acc": 0.26436781609195403, + "acc_stderr": 0.015769984840690518, + "acc_norm": 0.26436781609195403, + "acc_norm_stderr": 0.015769984840690518 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.2514450867052023, + "acc_stderr": 0.02335736578587404, + "acc_norm": 0.2514450867052023, + "acc_norm_stderr": 0.02335736578587404 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.2547486033519553, + "acc_stderr": 0.014572650383409153, + "acc_norm": 0.2547486033519553, + "acc_norm_stderr": 0.014572650383409153 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.23202614379084968, + "acc_stderr": 0.024170840879341016, + "acc_norm": 0.23202614379084968, + "acc_norm_stderr": 0.024170840879341016 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.2861736334405145, + "acc_stderr": 0.02567025924218894, + "acc_norm": 0.2861736334405145, + "acc_norm_stderr": 0.02567025924218894 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.2993827160493827, + "acc_stderr": 0.02548311560119546, + "acc_norm": 0.2993827160493827, + "acc_norm_stderr": 0.02548311560119546 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.2375886524822695, + "acc_stderr": 0.025389512552729906, + "acc_norm": 0.2375886524822695, + "acc_norm_stderr": 0.025389512552729906 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.23989569752281617, + "acc_stderr": 0.010906282617981643, + "acc_norm": 0.23989569752281617, + "acc_norm_stderr": 0.010906282617981643 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.19852941176470587, + "acc_stderr": 0.0242310133705411, + "acc_norm": 0.19852941176470587, + "acc_norm_stderr": 0.0242310133705411 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.2647058823529412, + "acc_stderr": 0.017848089574913226, + "acc_norm": 0.2647058823529412, + "acc_norm_stderr": 0.017848089574913226 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.2818181818181818, + "acc_stderr": 0.043091187099464585, + "acc_norm": 0.2818181818181818, + "acc_norm_stderr": 0.043091187099464585 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.20408163265306123, + "acc_stderr": 0.025801283475090506, + "acc_norm": 0.20408163265306123, + "acc_norm_stderr": 0.025801283475090506 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.2935323383084577, + "acc_stderr": 0.03220024104534205, + "acc_norm": 0.2935323383084577, + "acc_norm_stderr": 0.03220024104534205 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.3, + "acc_stderr": 0.046056618647183814, + "acc_norm": 0.3, + "acc_norm_stderr": 0.046056618647183814 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.3192771084337349, + "acc_stderr": 0.0362933532994786, + "acc_norm": 0.3192771084337349, + "acc_norm_stderr": 0.0362933532994786 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.3157894736842105, + "acc_stderr": 0.035650796707083106, + "acc_norm": 0.3157894736842105, + "acc_norm_stderr": 0.035650796707083106 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.23255813953488372, + "mc1_stderr": 0.014789157531080508, + "mc2": 0.3652581798300609, + "mc2_stderr": 0.013914438833995325 + }, + "all": { + "acc": 0.25982372439316487, + "acc_stderr": 0.03168373002574038, + "acc_norm": 0.26342079348322606, + "acc_norm_stderr": 0.03167912186887014, + "mc1": 0.23255813953488372, + "mc1_stderr": 0.014789157531080508, + "mc2": 0.3652581798300609, + "mc2_stderr": 0.013914438833995325 + } + }, + "versions": { + "harness|arc:challenge|25": 0, + 
"harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "all": 0 + }, + "config_tasks": { + "harness|arc:challenge": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + 
"harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task" + }, + "summary_tasks": { + "harness|arc:challenge|25": { + "hashes": { + "hash_examples": "17b0cae357c0259e", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "7cefb32e2563a8e3", + "hash_cont_tokens": "2e8835aa03b9c2cf" + }, + "truncated": 0, + "non-truncated": 4687, + "padded": 4687, + "non-padded": 0, + "effective_few_shots": 
25.0, + "num_truncated_few_shots": 0 + }, + "harness|hellaswag|10": { + "hashes": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "e4a72fc2bbea66ff", + "hash_cont_tokens": "18a48de3edcef462" + }, + "truncated": 0, + "non-truncated": 40168, + "padded": 40144, + "non-padded": 24, + "effective_few_shots": 10.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hashes": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "1430bf2cb1d054e2", + "hash_cont_tokens": "ce26aac83e938006" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-anatomy|5": { + "hashes": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "c4f45f8ebf944893", + "hash_cont_tokens": "1d81fa80e3039a08" + }, + "truncated": 0, + "non-truncated": 540, + "padded": 540, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-astronomy|5": { + "hashes": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "7b6c0659a104d6af", + "hash_cont_tokens": "247dc44c6b578728" + }, + "truncated": 0, + "non-truncated": 608, + "padded": 608, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-business_ethics|5": { + "hashes": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "ca33ffee63980ac1", + "hash_cont_tokens": "ce26aac83e938006" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hashes": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "a6aba95384c46b37", + "hash_cont_tokens": "26e3b69d5fb27bb2" + }, + "truncated": 0, + "non-truncated": 1060, + "padded": 1060, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_biology|5": { + "hashes": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "95d92a1a2c158e2c", + "hash_cont_tokens": "bbda31842f3930d5" + }, + "truncated": 0, + "non-truncated": 576, + "padded": 576, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_chemistry|5": { + "hashes": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "70284e3c06933186", + "hash_cont_tokens": "ce26aac83e938006" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_computer_science|5": { + "hashes": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "028608b4301fcfd2", + "hash_cont_tokens": "ce26aac83e938006" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_mathematics|5": { + "hashes": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": 
"770bc4281c973190", + "hash_input_tokens": "02619f96ae20cf1e", + "hash_cont_tokens": "ce26aac83e938006" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_medicine|5": { + "hashes": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "0282a73e02cf4b34", + "hash_cont_tokens": "894854ed7bec57f7" + }, + "truncated": 0, + "non-truncated": 692, + "padded": 692, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_physics|5": { + "hashes": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "5d0425cf2abddd51", + "hash_cont_tokens": "13130ec6de384bbb" + }, + "truncated": 0, + "non-truncated": 408, + "padded": 408, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-computer_security|5": { + "hashes": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "560574f683641143", + "hash_cont_tokens": "ce26aac83e938006" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hashes": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "dc3987c35bc329e5", + "hash_cont_tokens": "29089b8b7020611e" + }, + "truncated": 0, + "non-truncated": 940, + "padded": 940, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-econometrics|5": { + "hashes": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "be83fdd674b48356", + "hash_cont_tokens": "efc596dfa1a1f073" + }, + "truncated": 0, + "non-truncated": 456, + "padded": 456, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hashes": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "00155bf1a1a1ebc7", + "hash_cont_tokens": "70817a7ac9f44af2" + }, + "truncated": 0, + "non-truncated": 580, + "padded": 580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hashes": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "ce05b52b00498cf6", + "hash_cont_tokens": "937cd53d06cc6e16" + }, + "truncated": 0, + "non-truncated": 1512, + "padded": 1512, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-formal_logic|5": { + "hashes": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "728bd41242158358", + "hash_cont_tokens": "eec972abe0fc0f5a" + }, + "truncated": 0, + "non-truncated": 504, + "padded": 504, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-global_facts|5": { + "hashes": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "190511206bf21530", + "hash_cont_tokens": "ce26aac83e938006" + }, + "truncated": 0, + "non-truncated": 400, + 
"padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_biology|5": { + "hashes": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "2bc219567947ac68", + "hash_cont_tokens": "94971ccfe8e59c25" + }, + "truncated": 0, + "non-truncated": 1240, + "padded": 1240, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hashes": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "8477b93b8643d23f", + "hash_cont_tokens": "a78e38b59778a04c" + }, + "truncated": 0, + "non-truncated": 812, + "padded": 812, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hashes": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "0e15ea7b43890b3c", + "hash_cont_tokens": "ce26aac83e938006" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hashes": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "142b719c7d7d4fe0", + "hash_cont_tokens": "91dc522e4e4e91c3" + }, + "truncated": 660, + "non-truncated": 0, + "padded": 0, + "non-padded": 660, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_geography|5": { + "hashes": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "4bf76efe7796945e", + "hash_cont_tokens": "f275c901b3d285f9" + }, + "truncated": 0, + "non-truncated": 792, + "padded": 792, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hashes": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "e3a453e5fb044f52", + "hash_cont_tokens": "85eb58f423437cce" + }, + "truncated": 0, + "non-truncated": 772, + "padded": 772, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hashes": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "f47a1c2b0c018aff", + "hash_cont_tokens": "39a93706184f896b" + }, + "truncated": 0, + "non-truncated": 1560, + "padded": 1560, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hashes": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "35bc9ee85a563c15", + "hash_cont_tokens": "d41065d20b689af3" + }, + "truncated": 0, + "non-truncated": 1080, + "padded": 1080, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hashes": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "62a083d4ceb83864", + "hash_cont_tokens": "28c1f7c11bf85409" + }, + "truncated": 0, + "non-truncated": 952, + "padded": 952, + "non-padded": 0, + "effective_few_shots": 5.0, + 
"num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_physics|5": { + "hashes": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "cd96d409604783e4", + "hash_cont_tokens": "78c510e6c5d316ac" + }, + "truncated": 0, + "non-truncated": 604, + "padded": 604, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hashes": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "3c716ffc27f83e15", + "hash_cont_tokens": "0ba4ecffc67603c5" + }, + "truncated": 0, + "non-truncated": 2180, + "padded": 2180, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hashes": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "fd8217f7edf722f8", + "hash_cont_tokens": "4a0339e9ad3efa6d" + }, + "truncated": 0, + "non-truncated": 864, + "padded": 864, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hashes": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "a54112084a848a44", + "hash_cont_tokens": "2529d55ec490f81f" + }, + "truncated": 816, + "non-truncated": 0, + "padded": 0, + "non-padded": 816, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hashes": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "89cf33fb840f27be", + "hash_cont_tokens": "21808b54f5df97b2" + }, + "truncated": 0, + "non-truncated": 948, + "padded": 948, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_aging|5": { + "hashes": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "0a2b6ab3ae0e3b7c", + "hash_cont_tokens": "92acdd467ed943e1" + }, + "truncated": 0, + "non-truncated": 892, + "padded": 892, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_sexuality|5": { + "hashes": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "f28777a6fdce1d2b", + "hash_cont_tokens": "a6034ed95a124315" + }, + "truncated": 0, + "non-truncated": 524, + "padded": 524, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-international_law|5": { + "hashes": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "8282921a7a07bd5a", + "hash_cont_tokens": "223fbf3fd106c04b" + }, + "truncated": 0, + "non-truncated": 484, + "padded": 484, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-jurisprudence|5": { + "hashes": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "3aa62568b80ee7ca", + "hash_cont_tokens": "7c8e30f486ff156a" + }, + "truncated": 0, + "non-truncated": 432, + "padded": 432, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hashes": { + "hash_examples": "709128f9926a634c", + 
"hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "731b1d04f2da3d9a", + "hash_cont_tokens": "b4cc4a8d31bbaa03" + }, + "truncated": 0, + "non-truncated": 652, + "padded": 652, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-machine_learning|5": { + "hashes": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "96e1af14c8358ac2", + "hash_cont_tokens": "7f0e1289ec188e82" + }, + "truncated": 0, + "non-truncated": 448, + "padded": 448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-management|5": { + "hashes": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "bc2e4bf4e7cf5c39", + "hash_cont_tokens": "66b726b356a02feb" + }, + "truncated": 0, + "non-truncated": 412, + "padded": 412, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-marketing|5": { + "hashes": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "abed130d5c3867a4", + "hash_cont_tokens": "f08457005b652d25" + }, + "truncated": 0, + "non-truncated": 936, + "padded": 936, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-medical_genetics|5": { + "hashes": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "83d7d50bc2ebab43", + "hash_cont_tokens": "ce26aac83e938006" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-miscellaneous|5": { + "hashes": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "57004a232a08258a", + "hash_cont_tokens": "647bcbd68f292558" + }, + "truncated": 0, + "non-truncated": 3132, + "padded": 3132, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_disputes|5": { + "hashes": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "bb9518d436087f70", + "hash_cont_tokens": "6849b7fe56c50dda" + }, + "truncated": 0, + "non-truncated": 1384, + "padded": 1365, + "non-padded": 19, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hashes": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "3edebd0b46a85682", + "hash_cont_tokens": "81585ec455b1e3e5" + }, + "truncated": 0, + "non-truncated": 3580, + "padded": 3580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-nutrition|5": { + "hashes": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "815607301732a13f", + "hash_cont_tokens": "471b68eb20e5d34b" + }, + "truncated": 0, + "non-truncated": 1224, + "padded": 1224, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-philosophy|5": { + "hashes": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "952254859587db3e", + "hash_cont_tokens": "6e39384b9c0a8cc2" + }, + "truncated": 0, + "non-truncated": 1244, + "padded": 
1244, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-prehistory|5": { + "hashes": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "1429d150f124f76e", + "hash_cont_tokens": "bfe513578190093f" + }, + "truncated": 0, + "non-truncated": 1296, + "padded": 1296, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_accounting|5": { + "hashes": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "9f8bfa3b87b58a38", + "hash_cont_tokens": "9ce431b67350b312" + }, + "truncated": 0, + "non-truncated": 1128, + "padded": 1128, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_law|5": { + "hashes": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "f638aace411a0bd9", + "hash_cont_tokens": "0ff990d9cc38024d" + }, + "truncated": 168, + "non-truncated": 5968, + "padded": 5968, + "non-padded": 168, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_medicine|5": { + "hashes": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "c0f160879d378d4d", + "hash_cont_tokens": "bc3c70e15bc7dce0" + }, + "truncated": 0, + "non-truncated": 1088, + "padded": 1088, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_psychology|5": { + "hashes": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "548450e483004f15", + "hash_cont_tokens": "58464ea26d81f908" + }, + "truncated": 0, + "non-truncated": 2448, + "padded": 2448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-public_relations|5": { + "hashes": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "47f43ebfaa773712", + "hash_cont_tokens": "eaf6a5d3ddd39a12" + }, + "truncated": 0, + "non-truncated": 440, + "padded": 440, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-security_studies|5": { + "hashes": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "0350ab02a3d50c5f", + "hash_cont_tokens": "618fd4f954253134" + }, + "truncated": 0, + "non-truncated": 980, + "padded": 980, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-sociology|5": { + "hashes": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "e010003b38f6d86a", + "hash_cont_tokens": "b4962d9e583b12c0" + }, + "truncated": 0, + "non-truncated": 804, + "padded": 804, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hashes": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "99959731e92e9eb1", + "hash_cont_tokens": "ce26aac83e938006" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-virology|5": { + 
"hashes": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "841a69043fcd7645", + "hash_cont_tokens": "397a75462a9735e3" + }, + "truncated": 0, + "non-truncated": 664, + "padded": 664, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-world_religions|5": { + "hashes": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "6faa0998b440e497", + "hash_cont_tokens": "de629d1414e01de8" + }, + "truncated": 0, + "non-truncated": 684, + "padded": 684, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|truthfulqa:mc|0": { + "hashes": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "fe347abbeff2a4c1", + "hash_cont_tokens": "df48bc66e06781f2" + }, + "truncated": 0, + "non-truncated": 9996, + "padded": 9996, + "non-padded": 0, + "effective_few_shots": 0.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "d84d18e9a963753d", + "hash_full_prompts": "12b540783521a8e6", + "hash_input_tokens": "3f79e8edf26f0efd", + "hash_cont_tokens": "4a4fb8e86dc2fb9d" + }, + "total_evaluation_time_secondes": "2145.3454196453094", + "truncated": 1644, + "non-truncated": 109375, + "padded": 109332, + "non-padded": 1687, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/vihangd/smartyplats-3b-v1/results_2023-10-23T05-25-12.646031.json b/eval-results/vihangd/smartyplats-3b-v1/results_2023-10-23T05-25-12.646031.json new file mode 100644 index 0000000000000000000000000000000000000000..f7f8cab829dda39e9f26450023bb62ef09fe04c5 --- /dev/null +++ b/eval-results/vihangd/smartyplats-3b-v1/results_2023-10-23T05-25-12.646031.json @@ -0,0 +1,107 @@ +{ + "config_general": { + "model_name": "vihangd/smartyplats-3b-v1", + "model_sha": "89272b9edb323f5ace09e097a6449554c0dcd4e7", + "model_size": "6.4 GB", + "model_dtype": "torch.float16", + "lighteval_sha": "0f318ecf002208468154899217b3ba7c6ae09374", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|drop|3": { + "em": 0.0017827181208053692, + "em_stderr": 0.0004320097346039121, + "f1": 0.054003775167785366, + "f1_stderr": 0.0013390559797939118 + }, + "harness|gsm8k|5": { + "acc": 0.01061410159211524, + "acc_stderr": 0.002822713322387704 + }, + "harness|winogrande|5": { + "acc": 0.6574585635359116, + "acc_stderr": 0.013337483579075925 + }, + "all": { + "em": 0.0017827181208053692, + "em_stderr": 0.0004320097346039121, + "f1": 0.054003775167785366, + "f1_stderr": 0.0013390559797939118, + "acc": 0.33403633256401344, + "acc_stderr": 0.008080098450731814 + } + }, + "versions": { + "harness|drop|3": 1, + "harness|gsm8k|5": 0, + "harness|winogrande|5": 0, + "all": 0 + }, + "config_tasks": { + "harness|drop": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|drop|3": { + "hashes": { + "hash_examples": "1d27416e8324e9a3", + "hash_full_prompts": "a5513ff9a741b385", + "hash_input_tokens": "a65c9eacad86ea52", + "hash_cont_tokens": "d57220c2500b4cd3" + }, + "truncated": 980, + "non-truncated": 8556, + "padded": 0, + "non-padded": 9536, + "effective_few_shots": 3.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": 
"4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "bf7d8c6b5e4f7948", + "hash_cont_tokens": "9b36b94c45c6b3f1" + }, + "truncated": 0, + "non-truncated": 1319, + "padded": 0, + "non-padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "647d8b2cafc100bc", + "hash_cont_tokens": "828897df1f4f08a1" + }, + "truncated": 0, + "non-truncated": 2534, + "padded": 2433, + "non-padded": 101, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "9b4d8993161e637d", + "hash_full_prompts": "08215e527b7e60a5", + "hash_input_tokens": "a65e1c92b9137d17", + "hash_cont_tokens": "49f3f154813b2613" + }, + "total_evaluation_time_secondes": "9198.972011089325", + "truncated": 980, + "non-truncated": 12409, + "padded": 2433, + "non-padded": 10956, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/vihangd/smartyplats-3b-v2/results_2023-09-14T07-53-11.714726.json b/eval-results/vihangd/smartyplats-3b-v2/results_2023-09-14T07-53-11.714726.json new file mode 100644 index 0000000000000000000000000000000000000000..77865de9f66db8fce082f5a0312f9651fa2e1939 --- /dev/null +++ b/eval-results/vihangd/smartyplats-3b-v2/results_2023-09-14T07-53-11.714726.json @@ -0,0 +1,1367 @@ +{ + "config_general": { + "model_name": "vihangd/smartyplats-3b-v2", + "model_sha": "920609897049f674bc4a9678579f6869f6cbed13", + "model_size": "6.4 GB", + "model_dtype": "torch.float16", + "lighteval_sha": "ff467795ccc45b291b69333c263d5f16abd1fcd9", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|arc:challenge|25": { + "acc": 0.36689419795221845, + "acc_stderr": 0.014084133118104289, + "acc_norm": 0.4104095563139932, + "acc_norm_stderr": 0.014374922192642662 + }, + "harness|hellaswag|10": { + "acc": 0.5288787094204341, + "acc_stderr": 0.004981451704451047, + "acc_norm": 0.7119099780920135, + "acc_norm_stderr": 0.004519476835646771 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.2, + "acc_stderr": 0.04020151261036843, + "acc_norm": 0.2, + "acc_norm_stderr": 0.04020151261036843 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.1925925925925926, + "acc_stderr": 0.03406542058502653, + "acc_norm": 0.1925925925925926, + "acc_norm_stderr": 0.03406542058502653 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.21710526315789475, + "acc_stderr": 0.03355045304882922, + "acc_norm": 0.21710526315789475, + "acc_norm_stderr": 0.03355045304882922 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.31, + "acc_stderr": 0.04648231987117316, + "acc_norm": 0.31, + "acc_norm_stderr": 0.04648231987117316 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.23018867924528302, + "acc_stderr": 0.02590789712240817, + "acc_norm": 0.23018867924528302, + "acc_norm_stderr": 0.02590789712240817 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.25, + "acc_stderr": 0.03621034121889507, + "acc_norm": 0.25, + "acc_norm_stderr": 0.03621034121889507 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.19, + "acc_stderr": 0.03942772444036623, + "acc_norm": 0.19, + "acc_norm_stderr": 0.03942772444036623 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.24, + "acc_stderr": 
0.042923469599092816, + "acc_norm": 0.24, + "acc_norm_stderr": 0.042923469599092816 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.26, + "acc_stderr": 0.044084400227680794, + "acc_norm": 0.26, + "acc_norm_stderr": 0.044084400227680794 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.19653179190751446, + "acc_stderr": 0.030299574664788137, + "acc_norm": 0.19653179190751446, + "acc_norm_stderr": 0.030299574664788137 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.2647058823529412, + "acc_stderr": 0.04389869956808778, + "acc_norm": 0.2647058823529412, + "acc_norm_stderr": 0.04389869956808778 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.3, + "acc_stderr": 0.046056618647183814, + "acc_norm": 0.3, + "acc_norm_stderr": 0.046056618647183814 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.3021276595744681, + "acc_stderr": 0.030017554471880557, + "acc_norm": 0.3021276595744681, + "acc_norm_stderr": 0.030017554471880557 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.21929824561403508, + "acc_stderr": 0.038924311065187525, + "acc_norm": 0.21929824561403508, + "acc_norm_stderr": 0.038924311065187525 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.2689655172413793, + "acc_stderr": 0.03695183311650232, + "acc_norm": 0.2689655172413793, + "acc_norm_stderr": 0.03695183311650232 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.20634920634920634, + "acc_stderr": 0.020842290930114676, + "acc_norm": 0.20634920634920634, + "acc_norm_stderr": 0.020842290930114676 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.29365079365079366, + "acc_stderr": 0.040735243221471276, + "acc_norm": 0.29365079365079366, + "acc_norm_stderr": 0.040735243221471276 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.26, + "acc_stderr": 0.04408440022768078, + "acc_norm": 0.26, + "acc_norm_stderr": 0.04408440022768078 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.17096774193548386, + "acc_stderr": 0.021417242936321575, + "acc_norm": 0.17096774193548386, + "acc_norm_stderr": 0.021417242936321575 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.1724137931034483, + "acc_stderr": 0.026577672183036583, + "acc_norm": 0.1724137931034483, + "acc_norm_stderr": 0.026577672183036583 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.26, + "acc_stderr": 0.04408440022768079, + "acc_norm": 0.26, + "acc_norm_stderr": 0.04408440022768079 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.22424242424242424, + "acc_stderr": 0.032568666616811015, + "acc_norm": 0.22424242424242424, + "acc_norm_stderr": 0.032568666616811015 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.15151515151515152, + "acc_stderr": 0.025545650426603592, + "acc_norm": 0.15151515151515152, + "acc_norm_stderr": 0.025545650426603592 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.19170984455958548, + "acc_stderr": 0.028408953626245296, + "acc_norm": 0.19170984455958548, + "acc_norm_stderr": 0.028408953626245296 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.2230769230769231, + "acc_stderr": 0.021107730127243998, + "acc_norm": 0.2230769230769231, + "acc_norm_stderr": 0.021107730127243998 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.22962962962962963, + "acc_stderr": 0.02564410863926763, + "acc_norm": 0.22962962962962963, + 
"acc_norm_stderr": 0.02564410863926763 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.226890756302521, + "acc_stderr": 0.02720537153827946, + "acc_norm": 0.226890756302521, + "acc_norm_stderr": 0.02720537153827946 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.2251655629139073, + "acc_stderr": 0.03410435282008937, + "acc_norm": 0.2251655629139073, + "acc_norm_stderr": 0.03410435282008937 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.22568807339449543, + "acc_stderr": 0.017923087667803046, + "acc_norm": 0.22568807339449543, + "acc_norm_stderr": 0.017923087667803046 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.18981481481481483, + "acc_stderr": 0.026744714834691916, + "acc_norm": 0.18981481481481483, + "acc_norm_stderr": 0.026744714834691916 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.25, + "acc_stderr": 0.03039153369274154, + "acc_norm": 0.25, + "acc_norm_stderr": 0.03039153369274154 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.270042194092827, + "acc_stderr": 0.028900721906293426, + "acc_norm": 0.270042194092827, + "acc_norm_stderr": 0.028900721906293426 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.33183856502242154, + "acc_stderr": 0.03160295143776679, + "acc_norm": 0.33183856502242154, + "acc_norm_stderr": 0.03160295143776679 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.29770992366412213, + "acc_stderr": 0.04010358942462203, + "acc_norm": 0.29770992366412213, + "acc_norm_stderr": 0.04010358942462203 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.2066115702479339, + "acc_stderr": 0.03695980128098824, + "acc_norm": 0.2066115702479339, + "acc_norm_stderr": 0.03695980128098824 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.25, + "acc_stderr": 0.04186091791394607, + "acc_norm": 0.25, + "acc_norm_stderr": 0.04186091791394607 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.22699386503067484, + "acc_stderr": 0.032910995786157686, + "acc_norm": 0.22699386503067484, + "acc_norm_stderr": 0.032910995786157686 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.33035714285714285, + "acc_stderr": 0.04464285714285713, + "acc_norm": 0.33035714285714285, + "acc_norm_stderr": 0.04464285714285713 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.1262135922330097, + "acc_stderr": 0.03288180278808628, + "acc_norm": 0.1262135922330097, + "acc_norm_stderr": 0.03288180278808628 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.32051282051282054, + "acc_stderr": 0.030572811310299607, + "acc_norm": 0.32051282051282054, + "acc_norm_stderr": 0.030572811310299607 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.35, + "acc_stderr": 0.047937248544110196, + "acc_norm": 0.35, + "acc_norm_stderr": 0.047937248544110196 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.24521072796934865, + "acc_stderr": 0.01538435228454394, + "acc_norm": 0.24521072796934865, + "acc_norm_stderr": 0.01538435228454394 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.24855491329479767, + "acc_stderr": 0.023267528432100174, + "acc_norm": 0.24855491329479767, + "acc_norm_stderr": 0.023267528432100174 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.24692737430167597, + "acc_stderr": 0.014422292204808835, + "acc_norm": 0.24692737430167597, + "acc_norm_stderr": 0.014422292204808835 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 
0.21568627450980393, + "acc_stderr": 0.02355083135199509, + "acc_norm": 0.21568627450980393, + "acc_norm_stderr": 0.02355083135199509 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.26688102893890675, + "acc_stderr": 0.02512263760881665, + "acc_norm": 0.26688102893890675, + "acc_norm_stderr": 0.02512263760881665 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.2623456790123457, + "acc_stderr": 0.02447722285613511, + "acc_norm": 0.2623456790123457, + "acc_norm_stderr": 0.02447722285613511 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.25177304964539005, + "acc_stderr": 0.025892151156709405, + "acc_norm": 0.25177304964539005, + "acc_norm_stderr": 0.025892151156709405 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.22816166883963493, + "acc_stderr": 0.010717992192047889, + "acc_norm": 0.22816166883963493, + "acc_norm_stderr": 0.010717992192047889 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.19117647058823528, + "acc_stderr": 0.02388688192244036, + "acc_norm": 0.19117647058823528, + "acc_norm_stderr": 0.02388688192244036 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.25163398692810457, + "acc_stderr": 0.017555818091322267, + "acc_norm": 0.25163398692810457, + "acc_norm_stderr": 0.017555818091322267 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.2727272727272727, + "acc_stderr": 0.04265792110940588, + "acc_norm": 0.2727272727272727, + "acc_norm_stderr": 0.04265792110940588 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.19183673469387755, + "acc_stderr": 0.025206963154225406, + "acc_norm": 0.19183673469387755, + "acc_norm_stderr": 0.025206963154225406 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.23880597014925373, + "acc_stderr": 0.03014777593540922, + "acc_norm": 0.23880597014925373, + "acc_norm_stderr": 0.03014777593540922 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.3, + "acc_stderr": 0.046056618647183814, + "acc_norm": 0.3, + "acc_norm_stderr": 0.046056618647183814 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.28313253012048195, + "acc_stderr": 0.03507295431370518, + "acc_norm": 0.28313253012048195, + "acc_norm_stderr": 0.03507295431370518 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.3157894736842105, + "acc_stderr": 0.035650796707083106, + "acc_norm": 0.3157894736842105, + "acc_norm_stderr": 0.035650796707083106 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.2252141982864137, + "mc1_stderr": 0.014623240768023493, + "mc2": 0.36661381284093036, + "mc2_stderr": 0.01373944353058763 + }, + "all": { + "acc": 0.25015806921375316, + "acc_stderr": 0.031235552038968953, + "acc_norm": 0.25399784255330154, + "acc_norm_stderr": 0.03123265058482885, + "mc1": 0.2252141982864137, + "mc1_stderr": 0.014623240768023493, + "mc2": 0.36661381284093036, + "mc2_stderr": 0.01373944353058763 + } + }, + "versions": { + "harness|arc:challenge|25": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + 
"harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "all": 0 + }, + "config_tasks": { + "harness|arc:challenge": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness 
task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task" + }, + "summary_tasks": { + "harness|arc:challenge|25": { + "hashes": { + "hash_examples": "17b0cae357c0259e", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "7cefb32e2563a8e3", + "hash_cont_tokens": "2e8835aa03b9c2cf" + }, + "truncated": 0, + "non-truncated": 4687, + "padded": 4687, + "non-padded": 0, + "effective_few_shots": 25.0, + "num_truncated_few_shots": 0 + }, + "harness|hellaswag|10": { + "hashes": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "e4a72fc2bbea66ff", + "hash_cont_tokens": "18a48de3edcef462" + }, + "truncated": 0, + "non-truncated": 40168, + "padded": 40144, + "non-padded": 24, + "effective_few_shots": 10.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hashes": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + 
"hash_input_tokens": "1430bf2cb1d054e2", + "hash_cont_tokens": "ce26aac83e938006" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-anatomy|5": { + "hashes": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "c4f45f8ebf944893", + "hash_cont_tokens": "1d81fa80e3039a08" + }, + "truncated": 0, + "non-truncated": 540, + "padded": 540, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-astronomy|5": { + "hashes": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "7b6c0659a104d6af", + "hash_cont_tokens": "247dc44c6b578728" + }, + "truncated": 0, + "non-truncated": 608, + "padded": 608, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-business_ethics|5": { + "hashes": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "ca33ffee63980ac1", + "hash_cont_tokens": "ce26aac83e938006" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hashes": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "a6aba95384c46b37", + "hash_cont_tokens": "26e3b69d5fb27bb2" + }, + "truncated": 0, + "non-truncated": 1060, + "padded": 1060, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_biology|5": { + "hashes": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "95d92a1a2c158e2c", + "hash_cont_tokens": "bbda31842f3930d5" + }, + "truncated": 0, + "non-truncated": 576, + "padded": 576, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_chemistry|5": { + "hashes": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "70284e3c06933186", + "hash_cont_tokens": "ce26aac83e938006" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_computer_science|5": { + "hashes": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "028608b4301fcfd2", + "hash_cont_tokens": "ce26aac83e938006" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_mathematics|5": { + "hashes": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "02619f96ae20cf1e", + "hash_cont_tokens": "ce26aac83e938006" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_medicine|5": { + "hashes": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "0282a73e02cf4b34", + "hash_cont_tokens": "894854ed7bec57f7" + }, + "truncated": 0, + "non-truncated": 692, + "padded": 692, + "non-padded": 
0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_physics|5": { + "hashes": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "5d0425cf2abddd51", + "hash_cont_tokens": "13130ec6de384bbb" + }, + "truncated": 0, + "non-truncated": 408, + "padded": 408, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-computer_security|5": { + "hashes": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "560574f683641143", + "hash_cont_tokens": "ce26aac83e938006" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hashes": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "dc3987c35bc329e5", + "hash_cont_tokens": "29089b8b7020611e" + }, + "truncated": 0, + "non-truncated": 940, + "padded": 940, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-econometrics|5": { + "hashes": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "be83fdd674b48356", + "hash_cont_tokens": "efc596dfa1a1f073" + }, + "truncated": 0, + "non-truncated": 456, + "padded": 456, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hashes": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "00155bf1a1a1ebc7", + "hash_cont_tokens": "70817a7ac9f44af2" + }, + "truncated": 0, + "non-truncated": 580, + "padded": 580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hashes": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "ce05b52b00498cf6", + "hash_cont_tokens": "937cd53d06cc6e16" + }, + "truncated": 0, + "non-truncated": 1512, + "padded": 1512, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-formal_logic|5": { + "hashes": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "728bd41242158358", + "hash_cont_tokens": "eec972abe0fc0f5a" + }, + "truncated": 0, + "non-truncated": 504, + "padded": 504, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-global_facts|5": { + "hashes": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "190511206bf21530", + "hash_cont_tokens": "ce26aac83e938006" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_biology|5": { + "hashes": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "2bc219567947ac68", + "hash_cont_tokens": "94971ccfe8e59c25" + }, + "truncated": 0, + "non-truncated": 1240, + "padded": 1240, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hashes": { + 
"hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "8477b93b8643d23f", + "hash_cont_tokens": "a78e38b59778a04c" + }, + "truncated": 0, + "non-truncated": 812, + "padded": 812, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hashes": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "0e15ea7b43890b3c", + "hash_cont_tokens": "ce26aac83e938006" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hashes": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "142b719c7d7d4fe0", + "hash_cont_tokens": "91dc522e4e4e91c3" + }, + "truncated": 660, + "non-truncated": 0, + "padded": 0, + "non-padded": 660, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_geography|5": { + "hashes": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "4bf76efe7796945e", + "hash_cont_tokens": "f275c901b3d285f9" + }, + "truncated": 0, + "non-truncated": 792, + "padded": 792, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hashes": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "e3a453e5fb044f52", + "hash_cont_tokens": "85eb58f423437cce" + }, + "truncated": 0, + "non-truncated": 772, + "padded": 772, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hashes": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "f47a1c2b0c018aff", + "hash_cont_tokens": "39a93706184f896b" + }, + "truncated": 0, + "non-truncated": 1560, + "padded": 1560, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hashes": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "35bc9ee85a563c15", + "hash_cont_tokens": "d41065d20b689af3" + }, + "truncated": 0, + "non-truncated": 1080, + "padded": 1080, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hashes": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "62a083d4ceb83864", + "hash_cont_tokens": "28c1f7c11bf85409" + }, + "truncated": 0, + "non-truncated": 952, + "padded": 952, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_physics|5": { + "hashes": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "cd96d409604783e4", + "hash_cont_tokens": "78c510e6c5d316ac" + }, + "truncated": 0, + "non-truncated": 604, + "padded": 604, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hashes": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": 
"d5c76aa40b9dbc43", + "hash_input_tokens": "3c716ffc27f83e15", + "hash_cont_tokens": "0ba4ecffc67603c5" + }, + "truncated": 0, + "non-truncated": 2180, + "padded": 2180, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hashes": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "fd8217f7edf722f8", + "hash_cont_tokens": "4a0339e9ad3efa6d" + }, + "truncated": 0, + "non-truncated": 864, + "padded": 864, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hashes": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "a54112084a848a44", + "hash_cont_tokens": "2529d55ec490f81f" + }, + "truncated": 816, + "non-truncated": 0, + "padded": 0, + "non-padded": 816, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hashes": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "89cf33fb840f27be", + "hash_cont_tokens": "21808b54f5df97b2" + }, + "truncated": 0, + "non-truncated": 948, + "padded": 948, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_aging|5": { + "hashes": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "0a2b6ab3ae0e3b7c", + "hash_cont_tokens": "92acdd467ed943e1" + }, + "truncated": 0, + "non-truncated": 892, + "padded": 892, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_sexuality|5": { + "hashes": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "f28777a6fdce1d2b", + "hash_cont_tokens": "a6034ed95a124315" + }, + "truncated": 0, + "non-truncated": 524, + "padded": 524, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-international_law|5": { + "hashes": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "8282921a7a07bd5a", + "hash_cont_tokens": "223fbf3fd106c04b" + }, + "truncated": 0, + "non-truncated": 484, + "padded": 484, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-jurisprudence|5": { + "hashes": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "3aa62568b80ee7ca", + "hash_cont_tokens": "7c8e30f486ff156a" + }, + "truncated": 0, + "non-truncated": 432, + "padded": 432, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hashes": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "731b1d04f2da3d9a", + "hash_cont_tokens": "b4cc4a8d31bbaa03" + }, + "truncated": 0, + "non-truncated": 652, + "padded": 652, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-machine_learning|5": { + "hashes": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "96e1af14c8358ac2", + "hash_cont_tokens": "7f0e1289ec188e82" + }, + "truncated": 0, + "non-truncated": 
448, + "padded": 448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-management|5": { + "hashes": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "bc2e4bf4e7cf5c39", + "hash_cont_tokens": "66b726b356a02feb" + }, + "truncated": 0, + "non-truncated": 412, + "padded": 412, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-marketing|5": { + "hashes": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "abed130d5c3867a4", + "hash_cont_tokens": "f08457005b652d25" + }, + "truncated": 0, + "non-truncated": 936, + "padded": 936, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-medical_genetics|5": { + "hashes": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "83d7d50bc2ebab43", + "hash_cont_tokens": "ce26aac83e938006" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-miscellaneous|5": { + "hashes": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "57004a232a08258a", + "hash_cont_tokens": "647bcbd68f292558" + }, + "truncated": 0, + "non-truncated": 3132, + "padded": 3132, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_disputes|5": { + "hashes": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "bb9518d436087f70", + "hash_cont_tokens": "6849b7fe56c50dda" + }, + "truncated": 0, + "non-truncated": 1384, + "padded": 1365, + "non-padded": 19, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hashes": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "3edebd0b46a85682", + "hash_cont_tokens": "81585ec455b1e3e5" + }, + "truncated": 0, + "non-truncated": 3580, + "padded": 3580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-nutrition|5": { + "hashes": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "815607301732a13f", + "hash_cont_tokens": "471b68eb20e5d34b" + }, + "truncated": 0, + "non-truncated": 1224, + "padded": 1224, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-philosophy|5": { + "hashes": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "952254859587db3e", + "hash_cont_tokens": "6e39384b9c0a8cc2" + }, + "truncated": 0, + "non-truncated": 1244, + "padded": 1244, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-prehistory|5": { + "hashes": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "1429d150f124f76e", + "hash_cont_tokens": "bfe513578190093f" + }, + "truncated": 0, + "non-truncated": 1296, + "padded": 1296, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_accounting|5": { + "hashes": { + 
"hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "9f8bfa3b87b58a38", + "hash_cont_tokens": "9ce431b67350b312" + }, + "truncated": 0, + "non-truncated": 1128, + "padded": 1128, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_law|5": { + "hashes": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "f638aace411a0bd9", + "hash_cont_tokens": "0ff990d9cc38024d" + }, + "truncated": 168, + "non-truncated": 5968, + "padded": 5968, + "non-padded": 168, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_medicine|5": { + "hashes": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "c0f160879d378d4d", + "hash_cont_tokens": "bc3c70e15bc7dce0" + }, + "truncated": 0, + "non-truncated": 1088, + "padded": 1088, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_psychology|5": { + "hashes": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "548450e483004f15", + "hash_cont_tokens": "58464ea26d81f908" + }, + "truncated": 0, + "non-truncated": 2448, + "padded": 2448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-public_relations|5": { + "hashes": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "47f43ebfaa773712", + "hash_cont_tokens": "eaf6a5d3ddd39a12" + }, + "truncated": 0, + "non-truncated": 440, + "padded": 440, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-security_studies|5": { + "hashes": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "0350ab02a3d50c5f", + "hash_cont_tokens": "618fd4f954253134" + }, + "truncated": 0, + "non-truncated": 980, + "padded": 980, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-sociology|5": { + "hashes": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "e010003b38f6d86a", + "hash_cont_tokens": "b4962d9e583b12c0" + }, + "truncated": 0, + "non-truncated": 804, + "padded": 804, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hashes": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "99959731e92e9eb1", + "hash_cont_tokens": "ce26aac83e938006" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-virology|5": { + "hashes": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "841a69043fcd7645", + "hash_cont_tokens": "397a75462a9735e3" + }, + "truncated": 0, + "non-truncated": 664, + "padded": 664, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-world_religions|5": { + "hashes": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "6faa0998b440e497", + "hash_cont_tokens": 
"de629d1414e01de8" + }, + "truncated": 0, + "non-truncated": 684, + "padded": 684, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|truthfulqa:mc|0": { + "hashes": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "fe347abbeff2a4c1", + "hash_cont_tokens": "df48bc66e06781f2" + }, + "truncated": 0, + "non-truncated": 9996, + "padded": 9996, + "non-padded": 0, + "effective_few_shots": 0.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "d84d18e9a963753d", + "hash_full_prompts": "12b540783521a8e6", + "hash_input_tokens": "3f79e8edf26f0efd", + "hash_cont_tokens": "4a4fb8e86dc2fb9d" + }, + "total_evaluation_time_secondes": "2114.7251467704773", + "truncated": 1644, + "non-truncated": 109375, + "padded": 109332, + "non-padded": 1687, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/vihangd/smartyplats-3b-v2/results_2023-10-28T18-23-12.485611.json b/eval-results/vihangd/smartyplats-3b-v2/results_2023-10-28T18-23-12.485611.json new file mode 100644 index 0000000000000000000000000000000000000000..cccfda000459005209350bf16c794e0a9052db62 --- /dev/null +++ b/eval-results/vihangd/smartyplats-3b-v2/results_2023-10-28T18-23-12.485611.json @@ -0,0 +1,107 @@ +{ + "config_general": { + "model_name": "vihangd/smartyplats-3b-v2", + "model_sha": "920609897049f674bc4a9678579f6869f6cbed13", + "model_size": "6.4 GB", + "model_dtype": "torch.float16", + "lighteval_sha": "0f318ecf002208468154899217b3ba7c6ae09374", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|drop|3": { + "em": 0.0016778523489932886, + "em_stderr": 0.00041913301788268375, + "f1": 0.05385906040268484, + "f1_stderr": 0.0013190145725969279 + }, + "harness|gsm8k|5": { + "acc": 0.01592115238817286, + "acc_stderr": 0.0034478192723889915 + }, + "harness|winogrande|5": { + "acc": 0.6692975532754538, + "acc_stderr": 0.013222435887002695 + }, + "all": { + "em": 0.0016778523489932886, + "em_stderr": 0.00041913301788268375, + "f1": 0.05385906040268484, + "f1_stderr": 0.0013190145725969279, + "acc": 0.3426093528318133, + "acc_stderr": 0.008335127579695843 + } + }, + "versions": { + "harness|drop|3": 1, + "harness|gsm8k|5": 0, + "harness|winogrande|5": 0, + "all": 0 + }, + "config_tasks": { + "harness|drop": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|drop|3": { + "hashes": { + "hash_examples": "1d27416e8324e9a3", + "hash_full_prompts": "a5513ff9a741b385", + "hash_input_tokens": "a65c9eacad86ea52", + "hash_cont_tokens": "8b77e5a828ba36f0" + }, + "truncated": 980, + "non-truncated": 8556, + "padded": 0, + "non-padded": 9536, + "effective_few_shots": 3.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "bf7d8c6b5e4f7948", + "hash_cont_tokens": "be707446cc83cbec" + }, + "truncated": 0, + "non-truncated": 1319, + "padded": 0, + "non-padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "647d8b2cafc100bc", + "hash_cont_tokens": "828897df1f4f08a1" + }, + "truncated": 0, + 
"non-truncated": 2534, + "padded": 2433, + "non-padded": 101, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "9b4d8993161e637d", + "hash_full_prompts": "08215e527b7e60a5", + "hash_input_tokens": "a65e1c92b9137d17", + "hash_cont_tokens": "6b60bd35b74c6fb9" + }, + "total_evaluation_time_secondes": "9426.900059700012", + "truncated": 980, + "non-truncated": 12409, + "padded": 2433, + "non-padded": 10956, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/vihangd/smartyplats-7b-v2/results_2023-11-26T16-52-00.808559.json b/eval-results/vihangd/smartyplats-7b-v2/results_2023-11-26T16-52-00.808559.json new file mode 100644 index 0000000000000000000000000000000000000000..497bfc928348284eccaa31f6acdcfe71121d545a --- /dev/null +++ b/eval-results/vihangd/smartyplats-7b-v2/results_2023-11-26T16-52-00.808559.json @@ -0,0 +1,1435 @@ +{ + "config_general": { + "lighteval_sha": "9ffc410f6c40b8cfefe7167cb47aefe69ced61e1", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "", + "start_time": 836837.462962842, + "end_time": 850623.869812916, + "total_evaluation_time_secondes": "13786.406850074069", + "model_name": "vihangd/smartyplats-7b-v2", + "model_sha": "99049eb184b9b3ef074043d6e626fe3db09f5a19", + "model_dtype": "torch.float16", + "model_size": "13.99 GB" + }, + "results": { + "harness|arc:challenge|25": { + "acc": 0.5358361774744027, + "acc_stderr": 0.014573813664735718, + "acc_norm": 0.5793515358361775, + "acc_norm_stderr": 0.014426211252508401 + }, + "harness|hellaswag|10": { + "acc": 0.6197968532164907, + "acc_stderr": 0.004844445265582654, + "acc_norm": 0.8076080462059351, + "acc_norm_stderr": 0.003933736699983617 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.35, + "acc_stderr": 0.0479372485441102, + "acc_norm": 0.35, + "acc_norm_stderr": 0.0479372485441102 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.4740740740740741, + "acc_stderr": 0.04313531696750574, + "acc_norm": 0.4740740740740741, + "acc_norm_stderr": 0.04313531696750574 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.6513157894736842, + "acc_stderr": 0.03878139888797611, + "acc_norm": 0.6513157894736842, + "acc_norm_stderr": 0.03878139888797611 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.55, + "acc_stderr": 0.049999999999999996, + "acc_norm": 0.55, + "acc_norm_stderr": 0.049999999999999996 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.6226415094339622, + "acc_stderr": 0.029832808114796005, + "acc_norm": 0.6226415094339622, + "acc_norm_stderr": 0.029832808114796005 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.6805555555555556, + "acc_stderr": 0.038990736873573344, + "acc_norm": 0.6805555555555556, + "acc_norm_stderr": 0.038990736873573344 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.51, + "acc_stderr": 0.05024183937956912, + "acc_norm": 0.51, + "acc_norm_stderr": 0.05024183937956912 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.44, + "acc_stderr": 0.04988876515698589, + "acc_norm": 0.44, + "acc_norm_stderr": 0.04988876515698589 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.28, + "acc_stderr": 0.04512608598542129, + "acc_norm": 0.28, + "acc_norm_stderr": 0.04512608598542129 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.6127167630057804, + "acc_stderr": 0.03714325906302064, + 
"acc_norm": 0.6127167630057804, + "acc_norm_stderr": 0.03714325906302064 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.3333333333333333, + "acc_stderr": 0.04690650298201943, + "acc_norm": 0.3333333333333333, + "acc_norm_stderr": 0.04690650298201943 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.69, + "acc_stderr": 0.04648231987117316, + "acc_norm": 0.69, + "acc_norm_stderr": 0.04648231987117316 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.5319148936170213, + "acc_stderr": 0.03261936918467381, + "acc_norm": 0.5319148936170213, + "acc_norm_stderr": 0.03261936918467381 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.38596491228070173, + "acc_stderr": 0.045796394220704334, + "acc_norm": 0.38596491228070173, + "acc_norm_stderr": 0.045796394220704334 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.5172413793103449, + "acc_stderr": 0.04164188720169375, + "acc_norm": 0.5172413793103449, + "acc_norm_stderr": 0.04164188720169375 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.3968253968253968, + "acc_stderr": 0.025197101074246483, + "acc_norm": 0.3968253968253968, + "acc_norm_stderr": 0.025197101074246483 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.46825396825396826, + "acc_stderr": 0.04463112720677173, + "acc_norm": 0.46825396825396826, + "acc_norm_stderr": 0.04463112720677173 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.44, + "acc_stderr": 0.049888765156985884, + "acc_norm": 0.44, + "acc_norm_stderr": 0.049888765156985884 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.6870967741935484, + "acc_stderr": 0.02637756702864586, + "acc_norm": 0.6870967741935484, + "acc_norm_stderr": 0.02637756702864586 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.458128078817734, + "acc_stderr": 0.03505630140785741, + "acc_norm": 0.458128078817734, + "acc_norm_stderr": 0.03505630140785741 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.55, + "acc_stderr": 0.04999999999999999, + "acc_norm": 0.55, + "acc_norm_stderr": 0.04999999999999999 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.7454545454545455, + "acc_stderr": 0.03401506715249039, + "acc_norm": 0.7454545454545455, + "acc_norm_stderr": 0.03401506715249039 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.7171717171717171, + "acc_stderr": 0.032087795587867514, + "acc_norm": 0.7171717171717171, + "acc_norm_stderr": 0.032087795587867514 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.7979274611398963, + "acc_stderr": 0.02897908979429673, + "acc_norm": 0.7979274611398963, + "acc_norm_stderr": 0.02897908979429673 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.5384615384615384, + "acc_stderr": 0.025275892070240644, + "acc_norm": 0.5384615384615384, + "acc_norm_stderr": 0.025275892070240644 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.28888888888888886, + "acc_stderr": 0.027634907264178544, + "acc_norm": 0.28888888888888886, + "acc_norm_stderr": 0.027634907264178544 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.5966386554621849, + "acc_stderr": 0.03186608121408832, + "acc_norm": 0.5966386554621849, + "acc_norm_stderr": 0.03186608121408832 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.2913907284768212, + "acc_stderr": 0.037101857261199946, + "acc_norm": 0.2913907284768212, + 
"acc_norm_stderr": 0.037101857261199946 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.781651376146789, + "acc_stderr": 0.017712600528722724, + "acc_norm": 0.781651376146789, + "acc_norm_stderr": 0.017712600528722724 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.4537037037037037, + "acc_stderr": 0.03395322726375797, + "acc_norm": 0.4537037037037037, + "acc_norm_stderr": 0.03395322726375797 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.7794117647058824, + "acc_stderr": 0.02910225438967408, + "acc_norm": 0.7794117647058824, + "acc_norm_stderr": 0.02910225438967408 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.810126582278481, + "acc_stderr": 0.025530100460233497, + "acc_norm": 0.810126582278481, + "acc_norm_stderr": 0.025530100460233497 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.6322869955156951, + "acc_stderr": 0.03236198350928275, + "acc_norm": 0.6322869955156951, + "acc_norm_stderr": 0.03236198350928275 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.6564885496183206, + "acc_stderr": 0.041649760719448786, + "acc_norm": 0.6564885496183206, + "acc_norm_stderr": 0.041649760719448786 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.7520661157024794, + "acc_stderr": 0.03941897526516302, + "acc_norm": 0.7520661157024794, + "acc_norm_stderr": 0.03941897526516302 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.6666666666666666, + "acc_stderr": 0.04557239513497752, + "acc_norm": 0.6666666666666666, + "acc_norm_stderr": 0.04557239513497752 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.6319018404907976, + "acc_stderr": 0.03789213935838396, + "acc_norm": 0.6319018404907976, + "acc_norm_stderr": 0.03789213935838396 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.4017857142857143, + "acc_stderr": 0.04653333146973646, + "acc_norm": 0.4017857142857143, + "acc_norm_stderr": 0.04653333146973646 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.7766990291262136, + "acc_stderr": 0.04123553189891431, + "acc_norm": 0.7766990291262136, + "acc_norm_stderr": 0.04123553189891431 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.8376068376068376, + "acc_stderr": 0.02416161812798774, + "acc_norm": 0.8376068376068376, + "acc_norm_stderr": 0.02416161812798774 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.63, + "acc_stderr": 0.04852365870939099, + "acc_norm": 0.63, + "acc_norm_stderr": 0.04852365870939099 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.7624521072796935, + "acc_stderr": 0.015218733046150193, + "acc_norm": 0.7624521072796935, + "acc_norm_stderr": 0.015218733046150193 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.6416184971098265, + "acc_stderr": 0.0258167567915842, + "acc_norm": 0.6416184971098265, + "acc_norm_stderr": 0.0258167567915842 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.2100558659217877, + "acc_stderr": 0.013623755371333509, + "acc_norm": 0.2100558659217877, + "acc_norm_stderr": 0.013623755371333509 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.6437908496732027, + "acc_stderr": 0.02742047766262924, + "acc_norm": 0.6437908496732027, + "acc_norm_stderr": 0.02742047766262924 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.6945337620578779, + "acc_stderr": 0.026160584450140453, + "acc_norm": 0.6945337620578779, + "acc_norm_stderr": 0.026160584450140453 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 
0.6234567901234568, + "acc_stderr": 0.02695934451874778, + "acc_norm": 0.6234567901234568, + "acc_norm_stderr": 0.02695934451874778 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.42907801418439717, + "acc_stderr": 0.029525914302558562, + "acc_norm": 0.42907801418439717, + "acc_norm_stderr": 0.029525914302558562 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.43285528031290743, + "acc_stderr": 0.01265456523462286, + "acc_norm": 0.43285528031290743, + "acc_norm_stderr": 0.01265456523462286 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.5919117647058824, + "acc_stderr": 0.029855261393483924, + "acc_norm": 0.5919117647058824, + "acc_norm_stderr": 0.029855261393483924 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.6209150326797386, + "acc_stderr": 0.019627444748412243, + "acc_norm": 0.6209150326797386, + "acc_norm_stderr": 0.019627444748412243 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.6272727272727273, + "acc_stderr": 0.046313813194254656, + "acc_norm": 0.6272727272727273, + "acc_norm_stderr": 0.046313813194254656 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.6408163265306123, + "acc_stderr": 0.030713560455108493, + "acc_norm": 0.6408163265306123, + "acc_norm_stderr": 0.030713560455108493 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.7512437810945274, + "acc_stderr": 0.030567675938916714, + "acc_norm": 0.7512437810945274, + "acc_norm_stderr": 0.030567675938916714 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.76, + "acc_stderr": 0.04292346959909284, + "acc_norm": 0.76, + "acc_norm_stderr": 0.04292346959909284 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.4879518072289157, + "acc_stderr": 0.03891364495835821, + "acc_norm": 0.4879518072289157, + "acc_norm_stderr": 0.03891364495835821 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.8187134502923976, + "acc_stderr": 0.029547741687640038, + "acc_norm": 0.8187134502923976, + "acc_norm_stderr": 0.029547741687640038 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.34394124847001223, + "mc1_stderr": 0.016629087514276775, + "mc2": 0.5025785124983252, + "mc2_stderr": 0.015312588710955062 + }, + "harness|winogrande|5": { + "acc": 0.755327545382794, + "acc_stderr": 0.012082125654159738 + }, + "harness|drop|3": { + "em": 0.12479026845637584, + "em_stderr": 0.003384434237508591, + "f1": 0.18749790268456293, + "f1_stderr": 0.0034999477434115495 + }, + "harness|gsm8k|5": { + "acc": 0.177407126611069, + "acc_stderr": 0.010522533016890776 + }, + "all": { + "acc": 0.5777282770201803, + "acc_stderr": 0.033510634794133916, + "acc_norm": 0.5854240131794636, + "acc_norm_stderr": 0.03424552123378461, + "mc1": 0.34394124847001223, + "mc1_stderr": 0.016629087514276775, + "mc2": 0.5025785124983252, + "mc2_stderr": 0.015312588710955062, + "em": 0.12479026845637584, + "em_stderr": 0.003384434237508591, + "f1": 0.18749790268456293, + "f1_stderr": 0.0034999477434115495 + } + }, + "versions": { + "all": 0, + "harness|arc:challenge|25": 0, + "harness|drop|3": 1, + "harness|gsm8k|5": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + 
"harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "harness|winogrande|5": 0 + }, + "config_tasks": { + "harness|arc:challenge": "LM Harness task", + "harness|drop": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + 
"harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|arc:challenge|25": { + "hashes": { + "hash_examples": "17b0cae357c0259e", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "9bcd0d1d37471713", + "hash_cont_tokens": "289aa98c400841d8" + }, + "truncated": 0, + "non_truncated": 1172, + "padded": 4670, + "non_padded": 17, + "effective_few_shots": 25.0, + "num_truncated_few_shots": 0 + }, + "harness|hellaswag|10": { + "hashes": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "80b8c6d79740318e", + "hash_cont_tokens": "ac460260c3e6efc9" + }, + "truncated": 0, + 
"non_truncated": 10042, + "padded": 40101, + "non_padded": 67, + "effective_few_shots": 10.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hashes": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "b813d36287c6556c", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-anatomy|5": { + "hashes": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "09dc2380497f7a47", + "hash_cont_tokens": "a52a4f60d98cbe5c" + }, + "truncated": 0, + "non_truncated": 135, + "padded": 540, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-astronomy|5": { + "hashes": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "68ca3220b0fdd1f3", + "hash_cont_tokens": "10f7d8eeba97841d" + }, + "truncated": 0, + "non_truncated": 152, + "padded": 608, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-business_ethics|5": { + "hashes": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "bd14ef1320de241e", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hashes": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "d96186ab98017c43", + "hash_cont_tokens": "edef9975ba9165b5" + }, + "truncated": 0, + "non_truncated": 265, + "padded": 1060, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_biology|5": { + "hashes": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "424136b34e95b200", + "hash_cont_tokens": "0aa103ec6602280b" + }, + "truncated": 0, + "non_truncated": 144, + "padded": 576, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_chemistry|5": { + "hashes": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "8dd8b80e336bbe54", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_computer_science|5": { + "hashes": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "145d4cef8ca2261d", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_mathematics|5": { + "hashes": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "561995d32d2b25c4", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + 
"harness|hendrycksTest-college_medicine|5": { + "hashes": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "6a258a9d4418599c", + "hash_cont_tokens": "1979021dbc698754" + }, + "truncated": 0, + "non_truncated": 173, + "padded": 692, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_physics|5": { + "hashes": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "fa5e0d5b5f97b66a", + "hash_cont_tokens": "7cf7fe2bab00acbd" + }, + "truncated": 0, + "non_truncated": 102, + "padded": 408, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-computer_security|5": { + "hashes": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "07d27397edfae492", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hashes": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "da5e6c3c8eb17da6", + "hash_cont_tokens": "903f64eed2b0d217" + }, + "truncated": 0, + "non_truncated": 235, + "padded": 940, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-econometrics|5": { + "hashes": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "f6ba8e358bdb523e", + "hash_cont_tokens": "721ae6c5302c4bf2" + }, + "truncated": 0, + "non_truncated": 114, + "padded": 456, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hashes": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "b2459da4c5ca8590", + "hash_cont_tokens": "15a738960ed3e587" + }, + "truncated": 0, + "non_truncated": 145, + "padded": 575, + "non_padded": 5, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hashes": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "0b969d9ad706a13a", + "hash_cont_tokens": "c96470462fc71683" + }, + "truncated": 0, + "non_truncated": 378, + "padded": 1512, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-formal_logic|5": { + "hashes": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "02bc3eb5f90da86e", + "hash_cont_tokens": "0e1ce025c9d6ee7e" + }, + "truncated": 0, + "non_truncated": 126, + "padded": 504, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-global_facts|5": { + "hashes": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "3d5106918bcbeb43", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_biology|5": { + "hashes": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + 
"hash_input_tokens": "7b089392db2dabbd", + "hash_cont_tokens": "e34d57f7d3c4ca16" + }, + "truncated": 0, + "non_truncated": 310, + "padded": 1240, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hashes": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "ba90b2ffed1c067d", + "hash_cont_tokens": "e8482d44df4b3740" + }, + "truncated": 0, + "non_truncated": 203, + "padded": 812, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hashes": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "60eeec309ef0717f", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hashes": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "5e5e8bf3808e0ead", + "hash_cont_tokens": "d63e679a49418339" + }, + "truncated": 0, + "non_truncated": 165, + "padded": 656, + "non_padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_geography|5": { + "hashes": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "4da9b741d4e7ea78", + "hash_cont_tokens": "d78483e286d06f1a" + }, + "truncated": 0, + "non_truncated": 198, + "padded": 792, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hashes": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "acb4bc872ac86ed7", + "hash_cont_tokens": "691cdff71ff5fe57" + }, + "truncated": 0, + "non_truncated": 193, + "padded": 772, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hashes": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "840fc6403eb69ab0", + "hash_cont_tokens": "d5ad4c5bdca967ad" + }, + "truncated": 0, + "non_truncated": 390, + "padded": 1560, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hashes": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "3629a7f2cd17faeb", + "hash_cont_tokens": "8f631ca5687dd0d4" + }, + "truncated": 0, + "non_truncated": 270, + "padded": 1080, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hashes": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "6846f684260e3997", + "hash_cont_tokens": "7321048a28451473" + }, + "truncated": 0, + "non_truncated": 238, + "padded": 952, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_physics|5": { + "hashes": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "85aee25d6bdad94a", + "hash_cont_tokens": 
"bb137581f269861c" + }, + "truncated": 0, + "non_truncated": 151, + "padded": 604, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hashes": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "290b66d6d666a35f", + "hash_cont_tokens": "b455cab2675bd863" + }, + "truncated": 0, + "non_truncated": 545, + "padded": 2180, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hashes": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "a77a7668b437bc82", + "hash_cont_tokens": "1b3196fec7e58037" + }, + "truncated": 0, + "non_truncated": 216, + "padded": 864, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hashes": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "63548c7fa9ba7a78", + "hash_cont_tokens": "a331dedc2aa01b3e" + }, + "truncated": 0, + "non_truncated": 204, + "padded": 816, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hashes": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "83c5da18bfa50812", + "hash_cont_tokens": "d0fbe030b8c8c2bf" + }, + "truncated": 0, + "non_truncated": 237, + "padded": 948, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_aging|5": { + "hashes": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "bebbd11f22006685", + "hash_cont_tokens": "1dd29c3755494850" + }, + "truncated": 0, + "non_truncated": 223, + "padded": 892, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_sexuality|5": { + "hashes": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "7b85ee9b8ee54f4f", + "hash_cont_tokens": "c85573f663c10691" + }, + "truncated": 0, + "non_truncated": 131, + "padded": 524, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-international_law|5": { + "hashes": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "7bfc55ab7065943e", + "hash_cont_tokens": "d263804ba918154f" + }, + "truncated": 0, + "non_truncated": 121, + "padded": 484, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-jurisprudence|5": { + "hashes": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "69573f1675e053c6", + "hash_cont_tokens": "581986691a84ece8" + }, + "truncated": 0, + "non_truncated": 108, + "padded": 432, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hashes": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "552324ef20094bdc", + "hash_cont_tokens": "55a858b28bbda458" + }, + "truncated": 0, + "non_truncated": 163, + "padded": 652, + "non_padded": 0, + "effective_few_shots": 5.0, + 
"num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-machine_learning|5": { + "hashes": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "96449357a7318905", + "hash_cont_tokens": "e99d3d3efd4ac7a3" + }, + "truncated": 0, + "non_truncated": 112, + "padded": 448, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-management|5": { + "hashes": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "3b849249168e3b88", + "hash_cont_tokens": "13d9dc56bca34726" + }, + "truncated": 0, + "non_truncated": 103, + "padded": 412, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-marketing|5": { + "hashes": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "af0e186f2756b70d", + "hash_cont_tokens": "2700ea26933916a2" + }, + "truncated": 0, + "non_truncated": 234, + "padded": 936, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-medical_genetics|5": { + "hashes": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "9f6a6de16509b6d9", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-miscellaneous|5": { + "hashes": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "9194406d589f7c10", + "hash_cont_tokens": "7bf4341c79587250" + }, + "truncated": 0, + "non_truncated": 783, + "padded": 3132, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_disputes|5": { + "hashes": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "769486efc74d9f8e", + "hash_cont_tokens": "38a48e9de6976f00" + }, + "truncated": 0, + "non_truncated": 346, + "padded": 1384, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hashes": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "a90fd4dd90959dad", + "hash_cont_tokens": "761c4dc187689d89" + }, + "truncated": 0, + "non_truncated": 895, + "padded": 3580, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-nutrition|5": { + "hashes": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "1a3b843e66efd29b", + "hash_cont_tokens": "65005bd7d6f6012a" + }, + "truncated": 0, + "non_truncated": 306, + "padded": 1224, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-philosophy|5": { + "hashes": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "09820001a3d00013", + "hash_cont_tokens": "0b47934fb6314dec" + }, + "truncated": 0, + "non_truncated": 311, + "padded": 1244, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-prehistory|5": { + "hashes": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + 
"hash_input_tokens": "7c4ec364ce2768c7", + "hash_cont_tokens": "3f20acd855ee0a29" + }, + "truncated": 0, + "non_truncated": 324, + "padded": 1296, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_accounting|5": { + "hashes": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "ced0534574d0ae3f", + "hash_cont_tokens": "8f122ba881355d4b" + }, + "truncated": 0, + "non_truncated": 282, + "padded": 1128, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_law|5": { + "hashes": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "bcbdbbde22ec73e3", + "hash_cont_tokens": "90d5df417c4d3fd3" + }, + "truncated": 0, + "non_truncated": 1534, + "padded": 6136, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_medicine|5": { + "hashes": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "c54d753563114d45", + "hash_cont_tokens": "4a2d2988884f7f70" + }, + "truncated": 0, + "non_truncated": 272, + "padded": 1088, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_psychology|5": { + "hashes": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "b75dc55c0e32fa52", + "hash_cont_tokens": "e0a952cb8a9c81de" + }, + "truncated": 0, + "non_truncated": 612, + "padded": 2448, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-public_relations|5": { + "hashes": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "5ccdc8ec8db99622", + "hash_cont_tokens": "1fa77a8dff3922b8" + }, + "truncated": 0, + "non_truncated": 110, + "padded": 440, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-security_studies|5": { + "hashes": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "ca8497342e5b1d57", + "hash_cont_tokens": "81fc9cb3cbdd52db" + }, + "truncated": 0, + "non_truncated": 245, + "padded": 980, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-sociology|5": { + "hashes": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "069c76424fbd3dab", + "hash_cont_tokens": "2a0493252ed2cf43" + }, + "truncated": 0, + "non_truncated": 201, + "padded": 804, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hashes": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "a7e393a626169576", + "hash_cont_tokens": "17b868b63507f9a3" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-virology|5": { + "hashes": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "bf99dc973e3a650d", + "hash_cont_tokens": "5ab892d003b00c98" + }, + "truncated": 0, + "non_truncated": 166, + "padded": 664, + 
"non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-world_religions|5": { + "hashes": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "1761cfaf21797065", + "hash_cont_tokens": "15a5e5dbdfbb8568" + }, + "truncated": 0, + "non_truncated": 171, + "padded": 684, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|truthfulqa:mc|0": { + "hashes": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "298b43914bbdf4ca", + "hash_cont_tokens": "5a8d4bb398b1c3c0" + }, + "truncated": 0, + "non_truncated": 817, + "padded": 9996, + "non_padded": 0, + "effective_few_shots": 0.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "31aa3477d959f771", + "hash_cont_tokens": "618558fb93c0f288" + }, + "truncated": 0, + "non_truncated": 1267, + "padded": 2534, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|drop|3": { + "hashes": { + "hash_examples": "1d27416e8324e9a3", + "hash_full_prompts": "a5513ff9a741b385", + "hash_input_tokens": "a4fb946366902edf", + "hash_cont_tokens": "e82bcbd8099d36b2" + }, + "truncated": 0, + "non_truncated": 9536, + "padded": 0, + "non_padded": 9536, + "effective_few_shots": 3.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "6af0ae8cfe684f50", + "hash_cont_tokens": "f77517015873fe24" + }, + "truncated": 0, + "non_truncated": 1319, + "padded": 0, + "non_padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "4eb459f19fc0f29d", + "hash_full_prompts": "21653ed56f202b4e", + "hash_input_tokens": "0ce409b3d436569d", + "hash_cont_tokens": "198188e1ea1ac557" + }, + "truncated": 0, + "non_truncated": 38195, + "padded": 113460, + "non_padded": 10948, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/vihangd/smartyplats-7b-v2/results_2023-12-02T14-37-57.896136.json b/eval-results/vihangd/smartyplats-7b-v2/results_2023-12-02T14-37-57.896136.json new file mode 100644 index 0000000000000000000000000000000000000000..070027e14cc1b9034fa6bd8f7abd26aa93da6284 --- /dev/null +++ b/eval-results/vihangd/smartyplats-7b-v2/results_2023-12-02T14-37-57.896136.json @@ -0,0 +1,63 @@ +{ + "config_general": { + "lighteval_sha": "b35d4d84573be82d91c07ea46260f262f72cf69d", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "", + "start_time": 320925.78776628, + "end_time": 323579.383685331, + "total_evaluation_time_secondes": "2653.5959190509748", + "model_name": "vihangd/smartyplats-7b-v2", + "model_sha": "99049eb184b9b3ef074043d6e626fe3db09f5a19", + "model_dtype": "torch.float16", + "model_size": "13.99 GB" + }, + "results": { + "harness|gsm8k|5": { + "acc": 0.38817285822592873, + "acc_stderr": 0.013423607564002743 + }, + "all": { + "acc": 0.38817285822592873, + "acc_stderr": 0.013423607564002743 + } + }, + "versions": { + "all": 0, + "harness|gsm8k|5": 0 + }, + "config_tasks": { + "harness|gsm8k": "LM Harness task" + }, + "summary_tasks": { + "harness|gsm8k|5": { + "hashes": { + "hash_examples": 
"4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "6af0ae8cfe684f50", + "hash_cont_tokens": "f77517015873fe24" + }, + "truncated": 0, + "non_truncated": 1319, + "padded": 0, + "non_padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "18b756b7813d1bdf", + "hash_full_prompts": "deb3b1dff10b95aa", + "hash_input_tokens": "f17391d49d33b9c0", + "hash_cont_tokens": "32343c0d13919c55" + }, + "truncated": 0, + "non_truncated": 1319, + "padded": 0, + "non_padded": 1319, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/w95/megachat/results_2023-11-13T15-59-20.049368.json b/eval-results/w95/megachat/results_2023-11-13T15-59-20.049368.json new file mode 100644 index 0000000000000000000000000000000000000000..d986f072ac7a14733aac8460262ab14635439341 --- /dev/null +++ b/eval-results/w95/megachat/results_2023-11-13T15-59-20.049368.json @@ -0,0 +1,1435 @@ +{ + "config_general": { + "lighteval_sha": "9ffc410f6c40b8cfefe7167cb47aefe69ced61e1", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "", + "start_time": 327246.852378531, + "end_time": 336443.72439835, + "total_evaluation_time_secondes": "9196.872019818984", + "model_name": "w95/megachat", + "model_sha": "789b259a18ca7b168ced4995138ad6195cd2e8e8", + "model_dtype": "torch.float16", + "model_size": "2.06 GB" + }, + "results": { + "harness|arc:challenge|25": { + "acc": 0.27047781569965873, + "acc_stderr": 0.012980954547659554, + "acc_norm": 0.30802047781569963, + "acc_norm_stderr": 0.01349142951729204 + }, + "harness|hellaswag|10": { + "acc": 0.4100776737701653, + "acc_stderr": 0.004908423147162023, + "acc_norm": 0.5435172276438957, + "acc_norm_stderr": 0.004970846697552307 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.33, + "acc_stderr": 0.047258156262526045, + "acc_norm": 0.33, + "acc_norm_stderr": 0.047258156262526045 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.3333333333333333, + "acc_stderr": 0.04072314811876837, + "acc_norm": 0.3333333333333333, + "acc_norm_stderr": 0.04072314811876837 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.3092105263157895, + "acc_stderr": 0.03761070869867479, + "acc_norm": 0.3092105263157895, + "acc_norm_stderr": 0.03761070869867479 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.23, + "acc_stderr": 0.04229525846816506, + "acc_norm": 0.23, + "acc_norm_stderr": 0.04229525846816506 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.21132075471698114, + "acc_stderr": 0.02512576648482784, + "acc_norm": 0.21132075471698114, + "acc_norm_stderr": 0.02512576648482784 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.2569444444444444, + "acc_stderr": 0.03653946969442099, + "acc_norm": 0.2569444444444444, + "acc_norm_stderr": 0.03653946969442099 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.17, + "acc_stderr": 0.03775251680686371, + "acc_norm": 0.17, + "acc_norm_stderr": 0.03775251680686371 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.24, + "acc_stderr": 0.04292346959909284, + "acc_norm": 0.24, + "acc_norm_stderr": 0.04292346959909284 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.25, + "acc_stderr": 0.04351941398892446, + "acc_norm": 0.25, + "acc_norm_stderr": 0.04351941398892446 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 
0.24855491329479767, + "acc_stderr": 0.03295304696818318, + "acc_norm": 0.24855491329479767, + "acc_norm_stderr": 0.03295304696818318 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.22549019607843138, + "acc_stderr": 0.041583075330832865, + "acc_norm": 0.22549019607843138, + "acc_norm_stderr": 0.041583075330832865 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.29, + "acc_stderr": 0.045604802157206845, + "acc_norm": 0.29, + "acc_norm_stderr": 0.045604802157206845 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.20425531914893616, + "acc_stderr": 0.026355158413349424, + "acc_norm": 0.20425531914893616, + "acc_norm_stderr": 0.026355158413349424 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.2631578947368421, + "acc_stderr": 0.04142439719489362, + "acc_norm": 0.2631578947368421, + "acc_norm_stderr": 0.04142439719489362 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.296551724137931, + "acc_stderr": 0.03806142687309993, + "acc_norm": 0.296551724137931, + "acc_norm_stderr": 0.03806142687309993 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.2698412698412698, + "acc_stderr": 0.022860838309232072, + "acc_norm": 0.2698412698412698, + "acc_norm_stderr": 0.022860838309232072 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.15079365079365079, + "acc_stderr": 0.03200686497287392, + "acc_norm": 0.15079365079365079, + "acc_norm_stderr": 0.03200686497287392 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.33, + "acc_stderr": 0.04725815626252604, + "acc_norm": 0.33, + "acc_norm_stderr": 0.04725815626252604 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.25161290322580643, + "acc_stderr": 0.024685979286239956, + "acc_norm": 0.25161290322580643, + "acc_norm_stderr": 0.024685979286239956 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.2660098522167488, + "acc_stderr": 0.031089826002937523, + "acc_norm": 0.2660098522167488, + "acc_norm_stderr": 0.031089826002937523 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.35, + "acc_stderr": 0.047937248544110196, + "acc_norm": 0.35, + "acc_norm_stderr": 0.047937248544110196 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.26666666666666666, + "acc_stderr": 0.03453131801885415, + "acc_norm": 0.26666666666666666, + "acc_norm_stderr": 0.03453131801885415 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.20202020202020202, + "acc_stderr": 0.02860620428922988, + "acc_norm": 0.20202020202020202, + "acc_norm_stderr": 0.02860620428922988 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.2538860103626943, + "acc_stderr": 0.03141024780565319, + "acc_norm": 0.2538860103626943, + "acc_norm_stderr": 0.03141024780565319 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.2230769230769231, + "acc_stderr": 0.021107730127243995, + "acc_norm": 0.2230769230769231, + "acc_norm_stderr": 0.021107730127243995 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.26666666666666666, + "acc_stderr": 0.02696242432507383, + "acc_norm": 0.26666666666666666, + "acc_norm_stderr": 0.02696242432507383 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.21008403361344538, + "acc_stderr": 0.026461398717471874, + "acc_norm": 0.21008403361344538, + "acc_norm_stderr": 0.026461398717471874 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.271523178807947, + 
"acc_stderr": 0.03631329803969653, + "acc_norm": 0.271523178807947, + "acc_norm_stderr": 0.03631329803969653 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.181651376146789, + "acc_stderr": 0.01653061740926686, + "acc_norm": 0.181651376146789, + "acc_norm_stderr": 0.01653061740926686 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.21296296296296297, + "acc_stderr": 0.02792096314799366, + "acc_norm": 0.21296296296296297, + "acc_norm_stderr": 0.02792096314799366 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.2549019607843137, + "acc_stderr": 0.030587591351604243, + "acc_norm": 0.2549019607843137, + "acc_norm_stderr": 0.030587591351604243 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.25738396624472576, + "acc_stderr": 0.028458820991460302, + "acc_norm": 0.25738396624472576, + "acc_norm_stderr": 0.028458820991460302 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.20179372197309417, + "acc_stderr": 0.026936111912802273, + "acc_norm": 0.20179372197309417, + "acc_norm_stderr": 0.026936111912802273 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.24427480916030533, + "acc_stderr": 0.03768335959728745, + "acc_norm": 0.24427480916030533, + "acc_norm_stderr": 0.03768335959728745 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.36363636363636365, + "acc_stderr": 0.04391326286724071, + "acc_norm": 0.36363636363636365, + "acc_norm_stderr": 0.04391326286724071 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.24074074074074073, + "acc_stderr": 0.041331194402438376, + "acc_norm": 0.24074074074074073, + "acc_norm_stderr": 0.041331194402438376 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.3006134969325153, + "acc_stderr": 0.03602511318806771, + "acc_norm": 0.3006134969325153, + "acc_norm_stderr": 0.03602511318806771 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.22321428571428573, + "acc_stderr": 0.039523019677025116, + "acc_norm": 0.22321428571428573, + "acc_norm_stderr": 0.039523019677025116 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.18446601941747573, + "acc_stderr": 0.03840423627288276, + "acc_norm": 0.18446601941747573, + "acc_norm_stderr": 0.03840423627288276 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.2606837606837607, + "acc_stderr": 0.028760348956523414, + "acc_norm": 0.2606837606837607, + "acc_norm_stderr": 0.028760348956523414 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.21, + "acc_stderr": 0.040936018074033256, + "acc_norm": 0.21, + "acc_norm_stderr": 0.040936018074033256 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.25287356321839083, + "acc_stderr": 0.015543377313719681, + "acc_norm": 0.25287356321839083, + "acc_norm_stderr": 0.015543377313719681 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.29190751445086704, + "acc_stderr": 0.02447699407624734, + "acc_norm": 0.29190751445086704, + "acc_norm_stderr": 0.02447699407624734 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.24692737430167597, + "acc_stderr": 0.014422292204808835, + "acc_norm": 0.24692737430167597, + "acc_norm_stderr": 0.014422292204808835 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.24836601307189543, + "acc_stderr": 0.02473998135511359, + "acc_norm": 0.24836601307189543, + "acc_norm_stderr": 0.02473998135511359 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.2990353697749196, + "acc_stderr": 0.026003301117885135, + "acc_norm": 
0.2990353697749196, + "acc_norm_stderr": 0.026003301117885135 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.31790123456790126, + "acc_stderr": 0.025910063528240875, + "acc_norm": 0.31790123456790126, + "acc_norm_stderr": 0.025910063528240875 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.26595744680851063, + "acc_stderr": 0.02635806569888059, + "acc_norm": 0.26595744680851063, + "acc_norm_stderr": 0.02635806569888059 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.27183833116036504, + "acc_stderr": 0.011363135278651411, + "acc_norm": 0.27183833116036504, + "acc_norm_stderr": 0.011363135278651411 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.17279411764705882, + "acc_stderr": 0.022966067585581756, + "acc_norm": 0.17279411764705882, + "acc_norm_stderr": 0.022966067585581756 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.2696078431372549, + "acc_stderr": 0.017952449196987866, + "acc_norm": 0.2696078431372549, + "acc_norm_stderr": 0.017952449196987866 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.22727272727272727, + "acc_stderr": 0.040139645540727735, + "acc_norm": 0.22727272727272727, + "acc_norm_stderr": 0.040139645540727735 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.24897959183673468, + "acc_stderr": 0.027682979522960227, + "acc_norm": 0.24897959183673468, + "acc_norm_stderr": 0.027682979522960227 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.24875621890547264, + "acc_stderr": 0.030567675938916707, + "acc_norm": 0.24875621890547264, + "acc_norm_stderr": 0.030567675938916707 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.27, + "acc_stderr": 0.0446196043338474, + "acc_norm": 0.27, + "acc_norm_stderr": 0.0446196043338474 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.2469879518072289, + "acc_stderr": 0.03357351982064536, + "acc_norm": 0.2469879518072289, + "acc_norm_stderr": 0.03357351982064536 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.3742690058479532, + "acc_stderr": 0.037116011853894806, + "acc_norm": 0.3742690058479532, + "acc_norm_stderr": 0.037116011853894806 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.24479804161566707, + "mc1_stderr": 0.015051869486715014, + "mc2": 0.39854544628414945, + "mc2_stderr": 0.014106781910887378 + }, + "harness|winogrande|5": { + "acc": 0.5698500394632992, + "acc_stderr": 0.013914685094716698 + }, + "harness|drop|3": { + "em": 0.0006291946308724832, + "em_stderr": 0.00025680027497239604, + "f1": 0.041603397651006783, + "f1_stderr": 0.0011146754682383132 + }, + "harness|gsm8k|5": { + "acc": 0.009855951478392721, + "acc_stderr": 0.0027210765770416586 + }, + "all": { + "acc": 0.25936163462487405, + "acc_stderr": 0.03091692313677521, + "acc_norm": 0.26122603283331186, + "acc_norm_stderr": 0.031692702511721224, + "mc1": 0.24479804161566707, + "mc1_stderr": 0.015051869486715014, + "mc2": 0.39854544628414945, + "mc2_stderr": 0.014106781910887378, + "em": 0.0006291946308724832, + "em_stderr": 0.00025680027497239604, + "f1": 0.041603397651006783, + "f1_stderr": 0.0011146754682383132 + } + }, + "versions": { + "all": 0, + "harness|arc:challenge|25": 0, + "harness|drop|3": 1, + "harness|gsm8k|5": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + 
"harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "harness|winogrande|5": 0 + }, + "config_tasks": { + "harness|arc:challenge": "LM Harness task", + "harness|drop": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM 
Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|arc:challenge|25": { + "hashes": { + "hash_examples": "17b0cae357c0259e", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "c2d55d68c4441c39", + "hash_cont_tokens": "e8abf848493b50f7" + }, + "truncated": 0, + "non_truncated": 1172, + "padded": 4687, + "non_padded": 0, + "effective_few_shots": 25.0, + "num_truncated_few_shots": 0 + }, + "harness|hellaswag|10": { + "hashes": { + "hash_examples": "e1768ecb99d7ecf0", + 
"hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "38dc8458e001ab84", + "hash_cont_tokens": "9fe0a5c42e1532db" + }, + "truncated": 0, + "non_truncated": 10042, + "padded": 40019, + "non_padded": 149, + "effective_few_shots": 10.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hashes": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "8ff523ec326d5d55", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-anatomy|5": { + "hashes": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "742bd6a389a8ef40", + "hash_cont_tokens": "f11971a765cb609f" + }, + "truncated": 0, + "non_truncated": 135, + "padded": 540, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-astronomy|5": { + "hashes": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "aa9743839c83bd9f", + "hash_cont_tokens": "440a970fadecdc7b" + }, + "truncated": 0, + "non_truncated": 152, + "padded": 608, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-business_ethics|5": { + "hashes": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "60f6ed52e2a2987a", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hashes": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "6080d9f3c5930be0", + "hash_cont_tokens": "7ecd60c25b9bfe5b" + }, + "truncated": 0, + "non_truncated": 265, + "padded": 1060, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_biology|5": { + "hashes": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "873319724ad65589", + "hash_cont_tokens": "875cde3af7a0ee14" + }, + "truncated": 0, + "non_truncated": 144, + "padded": 564, + "non_padded": 12, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_chemistry|5": { + "hashes": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "8366d04d12b154a7", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_computer_science|5": { + "hashes": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "1724a282fb269fd7", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_mathematics|5": { + "hashes": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "b7aa815781eae172", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + 
"non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_medicine|5": { + "hashes": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "5e69bf9422c979cd", + "hash_cont_tokens": "702fb6d82ff0d6ac" + }, + "truncated": 0, + "non_truncated": 173, + "padded": 692, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_physics|5": { + "hashes": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "32b28762dd077c78", + "hash_cont_tokens": "f7b8097afc16a47c" + }, + "truncated": 0, + "non_truncated": 102, + "padded": 404, + "non_padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-computer_security|5": { + "hashes": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "19dd0e1895125d49", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hashes": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "761c7ce187b3338a", + "hash_cont_tokens": "aa0e8bc655f2f641" + }, + "truncated": 0, + "non_truncated": 235, + "padded": 940, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-econometrics|5": { + "hashes": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "dae74024ebc12b2b", + "hash_cont_tokens": "b1cc6e7e9fcd3827" + }, + "truncated": 0, + "non_truncated": 114, + "padded": 456, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hashes": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "5fa8050688a246ed", + "hash_cont_tokens": "2425a3f084a591ef" + }, + "truncated": 0, + "non_truncated": 145, + "padded": 580, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hashes": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "2da3f8d7d1515cc6", + "hash_cont_tokens": "bd87bf0c060fd925" + }, + "truncated": 0, + "non_truncated": 378, + "padded": 1512, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-formal_logic|5": { + "hashes": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "907de61bbe46dada", + "hash_cont_tokens": "eb8932890e0605db" + }, + "truncated": 0, + "non_truncated": 126, + "padded": 504, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-global_facts|5": { + "hashes": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "d7549fe9ac133643", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + 
"harness|hendrycksTest-high_school_biology|5": { + "hashes": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "b449ae8cd622fb96", + "hash_cont_tokens": "1ddcb86d28cde266" + }, + "truncated": 0, + "non_truncated": 310, + "padded": 1240, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hashes": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "a447bd1574b5e26c", + "hash_cont_tokens": "176c8dcff38c5f8f" + }, + "truncated": 0, + "non_truncated": 203, + "padded": 812, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hashes": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "55065fe953492209", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hashes": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "f1f73dd687da18d7", + "hash_cont_tokens": "674fc454bdc5ac93" + }, + "truncated": 660, + "non_truncated": -495, + "padded": 0, + "non_padded": 660, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_geography|5": { + "hashes": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "b4f9efd054b0149d", + "hash_cont_tokens": "03a5012b916274ea" + }, + "truncated": 0, + "non_truncated": 198, + "padded": 792, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hashes": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "6e010d01707b5a01", + "hash_cont_tokens": "873d2aab226ba1d8" + }, + "truncated": 0, + "non_truncated": 193, + "padded": 772, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hashes": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "fc1f6e824ba386d7", + "hash_cont_tokens": "c583432ad27fcfe0" + }, + "truncated": 0, + "non_truncated": 390, + "padded": 1560, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hashes": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "3a485a40c8432ece", + "hash_cont_tokens": "d7907b61bcb8c123" + }, + "truncated": 0, + "non_truncated": 270, + "padded": 1080, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hashes": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "a7dd9ca4bbda3752", + "hash_cont_tokens": "f47f041de50333b9" + }, + "truncated": 0, + "non_truncated": 238, + "padded": 952, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_physics|5": { + "hashes": { + 
"hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "d7ea631399a73865", + "hash_cont_tokens": "0d56317b3e5eedb5" + }, + "truncated": 0, + "non_truncated": 151, + "padded": 604, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hashes": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "d12816cf88146011", + "hash_cont_tokens": "09ba1243e7390c0f" + }, + "truncated": 0, + "non_truncated": 545, + "padded": 2180, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hashes": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "0903f3aba4ea094f", + "hash_cont_tokens": "9cc29889c3d3f77d" + }, + "truncated": 0, + "non_truncated": 216, + "padded": 864, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hashes": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "50c9ff438c85a69e", + "hash_cont_tokens": "cdd0b3dc06d933e5" + }, + "truncated": 816, + "non_truncated": -612, + "padded": 0, + "non_padded": 816, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hashes": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "054824cc474caef5", + "hash_cont_tokens": "e02816433ff28daf" + }, + "truncated": 8, + "non_truncated": 229, + "padded": 940, + "non_padded": 8, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_aging|5": { + "hashes": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "84157fee0b6d0f3c", + "hash_cont_tokens": "142a4a8a1138a214" + }, + "truncated": 0, + "non_truncated": 223, + "padded": 892, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_sexuality|5": { + "hashes": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "ade303e1ae3c016f", + "hash_cont_tokens": "bc54813e809b796d" + }, + "truncated": 0, + "non_truncated": 131, + "padded": 524, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-international_law|5": { + "hashes": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "e5482e1c23c23d35", + "hash_cont_tokens": "8ea8c5ff76a15bca" + }, + "truncated": 0, + "non_truncated": 121, + "padded": 484, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-jurisprudence|5": { + "hashes": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "4415eeb9bad0507b", + "hash_cont_tokens": "e3a8cd951b6e3469" + }, + "truncated": 0, + "non_truncated": 108, + "padded": 432, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hashes": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "e6b5271422ecbaa8", + 
"hash_cont_tokens": "3e9e0bdc248fd88a" + }, + "truncated": 0, + "non_truncated": 163, + "padded": 644, + "non_padded": 8, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-machine_learning|5": { + "hashes": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "e719cb83196977d8", + "hash_cont_tokens": "55b12fb138c6a064" + }, + "truncated": 0, + "non_truncated": 112, + "padded": 448, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-management|5": { + "hashes": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "155da0e62b39e804", + "hash_cont_tokens": "a01d6d39a83c4597" + }, + "truncated": 0, + "non_truncated": 103, + "padded": 412, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-marketing|5": { + "hashes": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "38466c242259e6d3", + "hash_cont_tokens": "6aeaed4d823c98aa" + }, + "truncated": 0, + "non_truncated": 234, + "padded": 932, + "non_padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-medical_genetics|5": { + "hashes": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "0dd129e92538a7f6", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-miscellaneous|5": { + "hashes": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "d108a883fc3e022f", + "hash_cont_tokens": "9b0ab02a64603081" + }, + "truncated": 0, + "non_truncated": 783, + "padded": 3132, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_disputes|5": { + "hashes": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "0e7b7df82884a2d5", + "hash_cont_tokens": "3b8bbe9108e55ce9" + }, + "truncated": 0, + "non_truncated": 346, + "padded": 1364, + "non_padded": 20, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hashes": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "7c220f5613cd8426", + "hash_cont_tokens": "3e9bfc0362e97330" + }, + "truncated": 0, + "non_truncated": 895, + "padded": 3580, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-nutrition|5": { + "hashes": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "35de1609a9a763a9", + "hash_cont_tokens": "23b2dc6ee2da4cfc" + }, + "truncated": 0, + "non_truncated": 306, + "padded": 1224, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-philosophy|5": { + "hashes": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "a1dcfa9c80490d06", + "hash_cont_tokens": "9f6ff69d23a48783" + }, + "truncated": 0, + "non_truncated": 311, + "padded": 1244, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, 
+ "harness|hendrycksTest-prehistory|5": { + "hashes": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "a091cf645d2415e0", + "hash_cont_tokens": "d6458d743d875837" + }, + "truncated": 0, + "non_truncated": 324, + "padded": 1296, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_accounting|5": { + "hashes": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "e9df32a33f85290c", + "hash_cont_tokens": "922a195f53a35662" + }, + "truncated": 0, + "non_truncated": 282, + "padded": 1128, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_law|5": { + "hashes": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "9178e10bd0763ec4", + "hash_cont_tokens": "2e590029ef41fbcd" + }, + "truncated": 604, + "non_truncated": 930, + "padded": 5524, + "non_padded": 612, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_medicine|5": { + "hashes": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "f5a22012a54f70ea", + "hash_cont_tokens": "7cfee54dbddd5a98" + }, + "truncated": 0, + "non_truncated": 272, + "padded": 1088, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_psychology|5": { + "hashes": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "0f6a92c3a2062b48", + "hash_cont_tokens": "a86677b2a45c20e1" + }, + "truncated": 0, + "non_truncated": 612, + "padded": 2448, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-public_relations|5": { + "hashes": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "29a08e9bfbe9b2f0", + "hash_cont_tokens": "0d756ccaae031757" + }, + "truncated": 0, + "non_truncated": 110, + "padded": 440, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-security_studies|5": { + "hashes": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "d49711415961ced7", + "hash_cont_tokens": "b2229bc2cfbf594b" + }, + "truncated": 0, + "non_truncated": 245, + "padded": 980, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-sociology|5": { + "hashes": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "1de5c52d2b2831d7", + "hash_cont_tokens": "c3a3bdfd177eed5b" + }, + "truncated": 0, + "non_truncated": 201, + "padded": 800, + "non_padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hashes": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "add924961f7f4146", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-virology|5": { + "hashes": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + 
"hash_input_tokens": "e0653601c466b1bc", + "hash_cont_tokens": "af8b3658088cb37f" + }, + "truncated": 0, + "non_truncated": 166, + "padded": 664, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-world_religions|5": { + "hashes": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "ac600d612445156d", + "hash_cont_tokens": "060118bef6de4e0a" + }, + "truncated": 0, + "non_truncated": 171, + "padded": 684, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|truthfulqa:mc|0": { + "hashes": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "a03ce28b7fd06aa7", + "hash_cont_tokens": "f5da56a132aab151" + }, + "truncated": 0, + "non_truncated": 817, + "padded": 9996, + "non_padded": 0, + "effective_few_shots": 0.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "72067255e368e24e", + "hash_cont_tokens": "f08975ad6f2d5864" + }, + "truncated": 0, + "non_truncated": 1267, + "padded": 2534, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|drop|3": { + "hashes": { + "hash_examples": "1d27416e8324e9a3", + "hash_full_prompts": "a5513ff9a741b385", + "hash_input_tokens": "61b608e0b5ceed76", + "hash_cont_tokens": "3c88ea5dc519efa0" + }, + "truncated": 1263, + "non_truncated": 8273, + "padded": 0, + "non_padded": 9536, + "effective_few_shots": 3.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "bda342e47b5099b2", + "hash_cont_tokens": "4e3f906b6f8353cd" + }, + "truncated": 0, + "non_truncated": 1319, + "padded": 0, + "non_padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "4eb459f19fc0f29d", + "hash_full_prompts": "21653ed56f202b4e", + "hash_input_tokens": "6c2529964ad5cacf", + "hash_cont_tokens": "7310ce6097f96f50" + }, + "truncated": 3351, + "non_truncated": 34844, + "padded": 111256, + "non_padded": 13152, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/wei123602/FINETUNE3_TEST4/results_2023-09-15T03-51-45.225444.json b/eval-results/wei123602/FINETUNE3_TEST4/results_2023-09-15T03-51-45.225444.json new file mode 100644 index 0000000000000000000000000000000000000000..862555a0749806b117ee78acc8b5190697764ec7 --- /dev/null +++ b/eval-results/wei123602/FINETUNE3_TEST4/results_2023-09-15T03-51-45.225444.json @@ -0,0 +1,1367 @@ +{ + "config_general": { + "model_name": "wei123602/FINETUNE3_TEST4", + "model_sha": "5195e87bb34317c5aaf201faa476aae78ecc9f1b", + "model_size": "24.32 GB", + "model_dtype": "torch.float16", + "lighteval_sha": "ff467795ccc45b291b69333c263d5f16abd1fcd9", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|arc:challenge|25": { + "acc": 0.514505119453925, + "acc_stderr": 0.014605241081370053, + "acc_norm": 0.5563139931740614, + "acc_norm_stderr": 0.014518421825670454 + }, + "harness|hellaswag|10": { + "acc": 0.6047600079665405, + "acc_stderr": 0.004879030010598916, + "acc_norm": 0.8130850428201554, + "acc_norm_stderr": 0.003890465158271812 + 
}, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.32, + "acc_stderr": 0.04688261722621504, + "acc_norm": 0.32, + "acc_norm_stderr": 0.04688261722621504 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.4, + "acc_stderr": 0.04232073695151589, + "acc_norm": 0.4, + "acc_norm_stderr": 0.04232073695151589 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.5460526315789473, + "acc_stderr": 0.04051646342874143, + "acc_norm": 0.5460526315789473, + "acc_norm_stderr": 0.04051646342874143 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.49, + "acc_stderr": 0.05024183937956912, + "acc_norm": 0.49, + "acc_norm_stderr": 0.05024183937956912 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.5018867924528302, + "acc_stderr": 0.03077265364207565, + "acc_norm": 0.5018867924528302, + "acc_norm_stderr": 0.03077265364207565 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.5, + "acc_stderr": 0.04181210050035455, + "acc_norm": 0.5, + "acc_norm_stderr": 0.04181210050035455 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.47, + "acc_stderr": 0.050161355804659205, + "acc_norm": 0.47, + "acc_norm_stderr": 0.050161355804659205 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.41, + "acc_stderr": 0.04943110704237102, + "acc_norm": 0.41, + "acc_norm_stderr": 0.04943110704237102 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.31, + "acc_stderr": 0.04648231987117316, + "acc_norm": 0.31, + "acc_norm_stderr": 0.04648231987117316 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.41040462427745666, + "acc_stderr": 0.037507570448955356, + "acc_norm": 0.41040462427745666, + "acc_norm_stderr": 0.037507570448955356 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.29411764705882354, + "acc_stderr": 0.04533838195929776, + "acc_norm": 0.29411764705882354, + "acc_norm_stderr": 0.04533838195929776 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.67, + "acc_stderr": 0.04725815626252609, + "acc_norm": 0.67, + "acc_norm_stderr": 0.04725815626252609 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.425531914893617, + "acc_stderr": 0.03232146916224467, + "acc_norm": 0.425531914893617, + "acc_norm_stderr": 0.03232146916224467 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.2719298245614035, + "acc_stderr": 0.041857744240220554, + "acc_norm": 0.2719298245614035, + "acc_norm_stderr": 0.041857744240220554 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.46206896551724136, + "acc_stderr": 0.041546596717075474, + "acc_norm": 0.46206896551724136, + "acc_norm_stderr": 0.041546596717075474 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.30423280423280424, + "acc_stderr": 0.02369541500946309, + "acc_norm": 0.30423280423280424, + "acc_norm_stderr": 0.02369541500946309 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.40476190476190477, + "acc_stderr": 0.04390259265377562, + "acc_norm": 0.40476190476190477, + "acc_norm_stderr": 0.04390259265377562 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.37, + "acc_stderr": 0.04852365870939099, + "acc_norm": 0.37, + "acc_norm_stderr": 0.04852365870939099 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.5064516129032258, + "acc_stderr": 0.02844163823354051, + "acc_norm": 0.5064516129032258, + "acc_norm_stderr": 0.02844163823354051 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.39408866995073893, + "acc_stderr": 
0.03438157967036545, + "acc_norm": 0.39408866995073893, + "acc_norm_stderr": 0.03438157967036545 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.45, + "acc_stderr": 0.049999999999999996, + "acc_norm": 0.45, + "acc_norm_stderr": 0.049999999999999996 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.6848484848484848, + "acc_stderr": 0.0362773057502241, + "acc_norm": 0.6848484848484848, + "acc_norm_stderr": 0.0362773057502241 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.6414141414141414, + "acc_stderr": 0.034169036403915214, + "acc_norm": 0.6414141414141414, + "acc_norm_stderr": 0.034169036403915214 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.7564766839378239, + "acc_stderr": 0.030975436386845447, + "acc_norm": 0.7564766839378239, + "acc_norm_stderr": 0.030975436386845447 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.5282051282051282, + "acc_stderr": 0.02531063925493389, + "acc_norm": 0.5282051282051282, + "acc_norm_stderr": 0.02531063925493389 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.3, + "acc_stderr": 0.027940457136228412, + "acc_norm": 0.3, + "acc_norm_stderr": 0.027940457136228412 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.5336134453781513, + "acc_stderr": 0.03240501447690071, + "acc_norm": 0.5336134453781513, + "acc_norm_stderr": 0.03240501447690071 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.31788079470198677, + "acc_stderr": 0.03802039760107903, + "acc_norm": 0.31788079470198677, + "acc_norm_stderr": 0.03802039760107903 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.6990825688073394, + "acc_stderr": 0.019664751366802114, + "acc_norm": 0.6990825688073394, + "acc_norm_stderr": 0.019664751366802114 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.48148148148148145, + "acc_stderr": 0.03407632093854053, + "acc_norm": 0.48148148148148145, + "acc_norm_stderr": 0.03407632093854053 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.7598039215686274, + "acc_stderr": 0.02998373305591361, + "acc_norm": 0.7598039215686274, + "acc_norm_stderr": 0.02998373305591361 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.7172995780590717, + "acc_stderr": 0.029312814153955917, + "acc_norm": 0.7172995780590717, + "acc_norm_stderr": 0.029312814153955917 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.6053811659192825, + "acc_stderr": 0.03280400504755291, + "acc_norm": 0.6053811659192825, + "acc_norm_stderr": 0.03280400504755291 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.5954198473282443, + "acc_stderr": 0.043046937953806645, + "acc_norm": 0.5954198473282443, + "acc_norm_stderr": 0.043046937953806645 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.6694214876033058, + "acc_stderr": 0.04294340845212094, + "acc_norm": 0.6694214876033058, + "acc_norm_stderr": 0.04294340845212094 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.6574074074074074, + "acc_stderr": 0.04587904741301809, + "acc_norm": 0.6574074074074074, + "acc_norm_stderr": 0.04587904741301809 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.6012269938650306, + "acc_stderr": 0.03847021420456023, + "acc_norm": 0.6012269938650306, + "acc_norm_stderr": 0.03847021420456023 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.26785714285714285, + "acc_stderr": 
0.04203277291467762, + "acc_norm": 0.26785714285714285, + "acc_norm_stderr": 0.04203277291467762 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.7281553398058253, + "acc_stderr": 0.044052680241409216, + "acc_norm": 0.7281553398058253, + "acc_norm_stderr": 0.044052680241409216 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.7350427350427351, + "acc_stderr": 0.02891120880274948, + "acc_norm": 0.7350427350427351, + "acc_norm_stderr": 0.02891120880274948 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.46, + "acc_stderr": 0.05009082659620333, + "acc_norm": 0.46, + "acc_norm_stderr": 0.05009082659620333 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.722860791826309, + "acc_stderr": 0.016005636294122425, + "acc_norm": 0.722860791826309, + "acc_norm_stderr": 0.016005636294122425 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.5838150289017341, + "acc_stderr": 0.026538189104705477, + "acc_norm": 0.5838150289017341, + "acc_norm_stderr": 0.026538189104705477 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.29720670391061454, + "acc_stderr": 0.0152853133536416, + "acc_norm": 0.29720670391061454, + "acc_norm_stderr": 0.0152853133536416 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.5490196078431373, + "acc_stderr": 0.028491993586171563, + "acc_norm": 0.5490196078431373, + "acc_norm_stderr": 0.028491993586171563 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.6655948553054662, + "acc_stderr": 0.026795422327893934, + "acc_norm": 0.6655948553054662, + "acc_norm_stderr": 0.026795422327893934 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.595679012345679, + "acc_stderr": 0.027306625297327684, + "acc_norm": 0.595679012345679, + "acc_norm_stderr": 0.027306625297327684 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.4397163120567376, + "acc_stderr": 0.02960991207559411, + "acc_norm": 0.4397163120567376, + "acc_norm_stderr": 0.02960991207559411 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.42242503259452413, + "acc_stderr": 0.01261560047573492, + "acc_norm": 0.42242503259452413, + "acc_norm_stderr": 0.01261560047573492 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.5367647058823529, + "acc_stderr": 0.030290619180485694, + "acc_norm": 0.5367647058823529, + "acc_norm_stderr": 0.030290619180485694 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.5179738562091504, + "acc_stderr": 0.020214761037872404, + "acc_norm": 0.5179738562091504, + "acc_norm_stderr": 0.020214761037872404 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.6090909090909091, + "acc_stderr": 0.04673752333670239, + "acc_norm": 0.6090909090909091, + "acc_norm_stderr": 0.04673752333670239 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.5918367346938775, + "acc_stderr": 0.03146465712827424, + "acc_norm": 0.5918367346938775, + "acc_norm_stderr": 0.03146465712827424 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.7064676616915423, + "acc_stderr": 0.03220024104534205, + "acc_norm": 0.7064676616915423, + "acc_norm_stderr": 0.03220024104534205 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.69, + "acc_stderr": 0.04648231987117316, + "acc_norm": 0.69, + "acc_norm_stderr": 0.04648231987117316 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.4036144578313253, + "acc_stderr": 0.038194861407583984, + "acc_norm": 0.4036144578313253, + "acc_norm_stderr": 0.038194861407583984 + }, + 
"harness|hendrycksTest-world_religions|5": { + "acc": 0.7309941520467836, + "acc_stderr": 0.0340105262010409, + "acc_norm": 0.7309941520467836, + "acc_norm_stderr": 0.0340105262010409 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.27539779681762544, + "mc1_stderr": 0.01563813566777552, + "mc2": 0.41139981650563134, + "mc2_stderr": 0.014349376708937319 + }, + "all": { + "acc": 0.5226079777978104, + "acc_stderr": 0.03494053352340006, + "acc_norm": 0.5268475355702469, + "acc_norm_stderr": 0.03492230667411148, + "mc1": 0.27539779681762544, + "mc1_stderr": 0.01563813566777552, + "mc2": 0.41139981650563134, + "mc2_stderr": 0.014349376708937319 + } + }, + "versions": { + "harness|arc:challenge|25": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + 
"harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "all": 0 + }, + "config_tasks": { + "harness|arc:challenge": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM 
Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task" + }, + "summary_tasks": { + "harness|arc:challenge|25": { + "hashes": { + "hash_examples": "17b0cae357c0259e", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "3722289b79076c44", + "hash_cont_tokens": "e8abf848493b50f7" + }, + "truncated": 0, + "non-truncated": 4687, + "padded": 4687, + "non-padded": 0, + "effective_few_shots": 25.0, + "num_truncated_few_shots": 0 + }, + "harness|hellaswag|10": { + "hashes": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "ececd684171f1ef2", + "hash_cont_tokens": "9fe0a5c42e1532db" + }, + "truncated": 0, + "non-truncated": 40168, + "padded": 40113, + "non-padded": 55, + "effective_few_shots": 10.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hashes": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "c54ff61ad0273dd7", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-anatomy|5": { + "hashes": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "be31a1e22aef5f90", + "hash_cont_tokens": "f11971a765cb609f" + }, + "truncated": 0, + "non-truncated": 540, + "padded": 540, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-astronomy|5": { + "hashes": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "277a7b1fad566940", + "hash_cont_tokens": "440a970fadecdc7b" + }, + "truncated": 0, + "non-truncated": 608, + "padded": 608, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-business_ethics|5": { + "hashes": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "ba552605bc116de5", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hashes": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "428c7563d0b98ab9", + "hash_cont_tokens": "7ecd60c25b9bfe5b" + }, + "truncated": 0, + "non-truncated": 1060, + "padded": 1060, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_biology|5": { + "hashes": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "da036601573942e2", + "hash_cont_tokens": "875cde3af7a0ee14" + }, + "truncated": 0, + "non-truncated": 576, + "padded": 576, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_chemistry|5": { + "hashes": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "94e0196d6aded13d", + 
"hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_computer_science|5": { + "hashes": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "6e4d0f4a8d36690b", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_mathematics|5": { + "hashes": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "614054d17109a25d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_medicine|5": { + "hashes": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "081bb2b524defd1c", + "hash_cont_tokens": "702fb6d82ff0d6ac" + }, + "truncated": 0, + "non-truncated": 692, + "padded": 692, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_physics|5": { + "hashes": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "5421d9a1af86cbd4", + "hash_cont_tokens": "f7b8097afc16a47c" + }, + "truncated": 0, + "non-truncated": 408, + "padded": 408, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-computer_security|5": { + "hashes": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "5e6b70ecb333cf18", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hashes": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "c2ef11a87264ceed", + "hash_cont_tokens": "aa0e8bc655f2f641" + }, + "truncated": 0, + "non-truncated": 940, + "padded": 940, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-econometrics|5": { + "hashes": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "ecaccd912a4c3978", + "hash_cont_tokens": "b1cc6e7e9fcd3827" + }, + "truncated": 0, + "non-truncated": 456, + "padded": 456, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hashes": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "1590c84291399be8", + "hash_cont_tokens": "2425a3f084a591ef" + }, + "truncated": 0, + "non-truncated": 580, + "padded": 580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hashes": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "3269597f715b0da1", + "hash_cont_tokens": "bd87bf0c060fd925" + }, + "truncated": 0, + "non-truncated": 1512, + "padded": 1512, + "non-padded": 0, + 
"effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-formal_logic|5": { + "hashes": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "a2800d20f3ab8d7c", + "hash_cont_tokens": "eb8932890e0605db" + }, + "truncated": 0, + "non-truncated": 504, + "padded": 504, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-global_facts|5": { + "hashes": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "94ed44b3772505ad", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_biology|5": { + "hashes": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "24423acb928db768", + "hash_cont_tokens": "1ddcb86d28cde266" + }, + "truncated": 0, + "non-truncated": 1240, + "padded": 1240, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hashes": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "831ff35c474e5cef", + "hash_cont_tokens": "176c8dcff38c5f8f" + }, + "truncated": 0, + "non-truncated": 812, + "padded": 812, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hashes": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "a20a96b44dcc5b30", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hashes": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "5002f4ac8b1562ca", + "hash_cont_tokens": "674fc454bdc5ac93" + }, + "truncated": 0, + "non-truncated": 660, + "padded": 656, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_geography|5": { + "hashes": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "7c5547c7da5bc793", + "hash_cont_tokens": "03a5012b916274ea" + }, + "truncated": 0, + "non-truncated": 792, + "padded": 792, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hashes": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "f62991cb6a496b05", + "hash_cont_tokens": "873d2aab226ba1d8" + }, + "truncated": 0, + "non-truncated": 772, + "padded": 772, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hashes": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "4cef2aff6e3d59ed", + "hash_cont_tokens": "c583432ad27fcfe0" + }, + "truncated": 0, + "non-truncated": 1560, + "padded": 1560, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + 
"harness|hendrycksTest-high_school_mathematics|5": { + "hashes": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "6e2577ea4082ed2b", + "hash_cont_tokens": "d7907b61bcb8c123" + }, + "truncated": 0, + "non-truncated": 1080, + "padded": 1080, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hashes": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "c5fc9aeb1079c8e4", + "hash_cont_tokens": "f47f041de50333b9" + }, + "truncated": 0, + "non-truncated": 952, + "padded": 952, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_physics|5": { + "hashes": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "555fc385cffa84ca", + "hash_cont_tokens": "0d56317b3e5eedb5" + }, + "truncated": 0, + "non-truncated": 604, + "padded": 604, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hashes": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "febd23cbf9973b7f", + "hash_cont_tokens": "09ba1243e7390c0f" + }, + "truncated": 0, + "non-truncated": 2180, + "padded": 2180, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hashes": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "400e55b56ee6fbd7", + "hash_cont_tokens": "9cc29889c3d3f77d" + }, + "truncated": 0, + "non-truncated": 864, + "padded": 864, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hashes": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "c639cce12a46ebad", + "hash_cont_tokens": "cdd0b3dc06d933e5" + }, + "truncated": 0, + "non-truncated": 816, + "padded": 816, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hashes": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "b9762065cce6f3a6", + "hash_cont_tokens": "e02816433ff28daf" + }, + "truncated": 0, + "non-truncated": 948, + "padded": 948, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_aging|5": { + "hashes": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "541a75f071dcf579", + "hash_cont_tokens": "142a4a8a1138a214" + }, + "truncated": 0, + "non-truncated": 892, + "padded": 892, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_sexuality|5": { + "hashes": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "04269e5c5a257dd9", + "hash_cont_tokens": "bc54813e809b796d" + }, + "truncated": 0, + "non-truncated": 524, + "padded": 524, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-international_law|5": { + "hashes": { + "hash_examples": "1300bfd0dfc59114", + 
"hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "d93ba9d9d38e4397", + "hash_cont_tokens": "8ea8c5ff76a15bca" + }, + "truncated": 0, + "non-truncated": 484, + "padded": 484, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-jurisprudence|5": { + "hashes": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "9eeaccd2698b4f5a", + "hash_cont_tokens": "e3a8cd951b6e3469" + }, + "truncated": 0, + "non-truncated": 432, + "padded": 432, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hashes": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "b4f08f544f2b7576", + "hash_cont_tokens": "3e9e0bdc248fd88a" + }, + "truncated": 0, + "non-truncated": 652, + "padded": 648, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-machine_learning|5": { + "hashes": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "900c2a51f1174b9f", + "hash_cont_tokens": "55b12fb138c6a064" + }, + "truncated": 0, + "non-truncated": 448, + "padded": 448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-management|5": { + "hashes": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "6b36efb4689c6eca", + "hash_cont_tokens": "a01d6d39a83c4597" + }, + "truncated": 0, + "non-truncated": 412, + "padded": 412, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-marketing|5": { + "hashes": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "2aaac78a0cfed47a", + "hash_cont_tokens": "6aeaed4d823c98aa" + }, + "truncated": 0, + "non-truncated": 936, + "padded": 936, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-medical_genetics|5": { + "hashes": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "886ca823b41c094a", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-miscellaneous|5": { + "hashes": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "72fd71de7675e7d0", + "hash_cont_tokens": "9b0ab02a64603081" + }, + "truncated": 0, + "non-truncated": 3132, + "padded": 3132, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_disputes|5": { + "hashes": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "f3ca0dd8e7a1eb09", + "hash_cont_tokens": "3b8bbe9108e55ce9" + }, + "truncated": 0, + "non-truncated": 1384, + "padded": 1354, + "non-padded": 30, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hashes": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "3e793631e951f23c", + "hash_cont_tokens": "3e9bfc0362e97330" + }, + "truncated": 0, + "non-truncated": 3580, + 
"padded": 3580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-nutrition|5": { + "hashes": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "59753c2144ea93af", + "hash_cont_tokens": "23b2dc6ee2da4cfc" + }, + "truncated": 0, + "non-truncated": 1224, + "padded": 1224, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-philosophy|5": { + "hashes": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "bd8d3dbed15a8c34", + "hash_cont_tokens": "9f6ff69d23a48783" + }, + "truncated": 0, + "non-truncated": 1244, + "padded": 1244, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-prehistory|5": { + "hashes": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "3573cd87facbb7c5", + "hash_cont_tokens": "d6458d743d875837" + }, + "truncated": 0, + "non-truncated": 1296, + "padded": 1296, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_accounting|5": { + "hashes": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "17e721bc1a7cbb47", + "hash_cont_tokens": "922a195f53a35662" + }, + "truncated": 0, + "non-truncated": 1128, + "padded": 1128, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_law|5": { + "hashes": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "c9f7583fff66d361", + "hash_cont_tokens": "2e590029ef41fbcd" + }, + "truncated": 0, + "non-truncated": 6136, + "padded": 6136, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_medicine|5": { + "hashes": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "40a933f829116f8d", + "hash_cont_tokens": "7cfee54dbddd5a98" + }, + "truncated": 0, + "non-truncated": 1088, + "padded": 1088, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_psychology|5": { + "hashes": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "0dfb73a8eb3f692c", + "hash_cont_tokens": "a86677b2a45c20e1" + }, + "truncated": 0, + "non-truncated": 2448, + "padded": 2448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-public_relations|5": { + "hashes": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "1710c6ba4c9f3cbd", + "hash_cont_tokens": "0d756ccaae031757" + }, + "truncated": 0, + "non-truncated": 440, + "padded": 440, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-security_studies|5": { + "hashes": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "32a03f1f22a6e103", + "hash_cont_tokens": "b2229bc2cfbf594b" + }, + "truncated": 0, + "non-truncated": 980, + "padded": 980, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-sociology|5": { + 
"hashes": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "828999f7624cbe7e", + "hash_cont_tokens": "c3a3bdfd177eed5b" + }, + "truncated": 0, + "non-truncated": 804, + "padded": 804, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hashes": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "42054621e718dbee", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-virology|5": { + "hashes": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "6c4f0aa4dc859c04", + "hash_cont_tokens": "af8b3658088cb37f" + }, + "truncated": 0, + "non-truncated": 664, + "padded": 664, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-world_religions|5": { + "hashes": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "6c75d44e092ff24f", + "hash_cont_tokens": "060118bef6de4e0a" + }, + "truncated": 0, + "non-truncated": 684, + "padded": 684, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|truthfulqa:mc|0": { + "hashes": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "2738d7ed7075faa7", + "hash_cont_tokens": "f5da56a132aab151" + }, + "truncated": 0, + "non-truncated": 9996, + "padded": 9996, + "non-padded": 0, + "effective_few_shots": 0.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "d84d18e9a963753d", + "hash_full_prompts": "12b540783521a8e6", + "hash_input_tokens": "5c73a7dce6ccf737", + "hash_cont_tokens": "71d56183130fecbd" + }, + "total_evaluation_time_secondes": "11825.587738990784", + "truncated": 0, + "non-truncated": 111019, + "padded": 110926, + "non-padded": 93, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/wei123602/FINETUNE3_TEST4/results_2023-10-26T12-29-32.443633.json b/eval-results/wei123602/FINETUNE3_TEST4/results_2023-10-26T12-29-32.443633.json new file mode 100644 index 0000000000000000000000000000000000000000..4e3155471455558c7d97059b17b77e6d6670de7f --- /dev/null +++ b/eval-results/wei123602/FINETUNE3_TEST4/results_2023-10-26T12-29-32.443633.json @@ -0,0 +1,107 @@ +{ + "config_general": { + "model_name": "wei123602/FINETUNE3_TEST4", + "model_sha": "5195e87bb34317c5aaf201faa476aae78ecc9f1b", + "model_size": "24.32 GB", + "model_dtype": "torch.float16", + "lighteval_sha": "0f318ecf002208468154899217b3ba7c6ae09374", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|drop|3": { + "em": 0.13779362416107382, + "em_stderr": 0.0035298790747402492, + "f1": 0.18403313758389214, + "f1_stderr": 0.0035900811790154394 + }, + "harness|gsm8k|5": { + "acc": 0.11220621683093253, + "acc_stderr": 0.00869374313824237 + }, + "harness|winogrande|5": { + "acc": 0.7671665351223362, + "acc_stderr": 0.011878201073856544 + }, + "all": { + "em": 0.13779362416107382, + "em_stderr": 0.0035298790747402492, + "f1": 0.18403313758389214, + "f1_stderr": 0.0035900811790154394, + "acc": 0.4396863759766344, 
+ "acc_stderr": 0.010285972106049457 + } + }, + "versions": { + "harness|drop|3": 1, + "harness|gsm8k|5": 0, + "harness|winogrande|5": 0, + "all": 0 + }, + "config_tasks": { + "harness|drop": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|drop|3": { + "hashes": { + "hash_examples": "1d27416e8324e9a3", + "hash_full_prompts": "a5513ff9a741b385", + "hash_input_tokens": "42076f0efbb50aa6", + "hash_cont_tokens": "151c72912bbba041" + }, + "truncated": 3, + "non-truncated": 9533, + "padded": 0, + "non-padded": 9536, + "effective_few_shots": 3.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "bda342e47b5099b2", + "hash_cont_tokens": "10e462892036ed8a" + }, + "truncated": 0, + "non-truncated": 1319, + "padded": 0, + "non-padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "c0bedf98cb040854", + "hash_cont_tokens": "f08975ad6f2d5864" + }, + "truncated": 0, + "non-truncated": 2534, + "padded": 2432, + "non-padded": 102, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "9b4d8993161e637d", + "hash_full_prompts": "08215e527b7e60a5", + "hash_input_tokens": "a12f3e3c934bd78b", + "hash_cont_tokens": "eb17306cb2891c55" + }, + "total_evaluation_time_secondes": "12304.146438837051", + "truncated": 3, + "non-truncated": 13386, + "padded": 2432, + "non-padded": 10957, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/wei123602/Llama-2-13b-FINETUNE4/results_2023-09-18T13-14-12.416583.json b/eval-results/wei123602/Llama-2-13b-FINETUNE4/results_2023-09-18T13-14-12.416583.json new file mode 100644 index 0000000000000000000000000000000000000000..3526cd80667753ca31d2b09a66e4ba5a26c5b81e --- /dev/null +++ b/eval-results/wei123602/Llama-2-13b-FINETUNE4/results_2023-09-18T13-14-12.416583.json @@ -0,0 +1,1367 @@ +{ + "config_general": { + "model_name": "wei123602/Llama-2-13b-FINETUNE4", + "model_sha": "939d06081210fa943c60210a47583f43b60901ad", + "model_size": "24.32 GB", + "model_dtype": "torch.float16", + "lighteval_sha": "0f318ecf002208468154899217b3ba7c6ae09374", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|arc:challenge|25": { + "acc": 0.5401023890784983, + "acc_stderr": 0.01456431885692485, + "acc_norm": 0.5870307167235495, + "acc_norm_stderr": 0.014388344935398329 + }, + "harness|hellaswag|10": { + "acc": 0.609838677554272, + "acc_stderr": 0.004867893927258144, + "acc_norm": 0.819259111730731, + "acc_norm_stderr": 0.0038401692240122715 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.27, + "acc_stderr": 0.044619604333847394, + "acc_norm": 0.27, + "acc_norm_stderr": 0.044619604333847394 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.4666666666666667, + "acc_stderr": 0.043097329010363554, + "acc_norm": 0.4666666666666667, + "acc_norm_stderr": 0.043097329010363554 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.6052631578947368, + "acc_stderr": 0.039777499346220734, + "acc_norm": 0.6052631578947368, + "acc_norm_stderr": 0.039777499346220734 + }, + 
"harness|hendrycksTest-business_ethics|5": { + "acc": 0.59, + "acc_stderr": 0.049431107042371025, + "acc_norm": 0.59, + "acc_norm_stderr": 0.049431107042371025 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.6, + "acc_stderr": 0.03015113445777629, + "acc_norm": 0.6, + "acc_norm_stderr": 0.03015113445777629 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.5486111111111112, + "acc_stderr": 0.04161402398403279, + "acc_norm": 0.5486111111111112, + "acc_norm_stderr": 0.04161402398403279 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.47, + "acc_stderr": 0.05016135580465919, + "acc_norm": 0.47, + "acc_norm_stderr": 0.05016135580465919 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.41, + "acc_stderr": 0.049431107042371025, + "acc_norm": 0.41, + "acc_norm_stderr": 0.049431107042371025 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.3, + "acc_stderr": 0.046056618647183814, + "acc_norm": 0.3, + "acc_norm_stderr": 0.046056618647183814 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.5780346820809249, + "acc_stderr": 0.03765746693865151, + "acc_norm": 0.5780346820809249, + "acc_norm_stderr": 0.03765746693865151 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.4117647058823529, + "acc_stderr": 0.048971049527263666, + "acc_norm": 0.4117647058823529, + "acc_norm_stderr": 0.048971049527263666 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.68, + "acc_stderr": 0.04688261722621505, + "acc_norm": 0.68, + "acc_norm_stderr": 0.04688261722621505 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.425531914893617, + "acc_stderr": 0.03232146916224468, + "acc_norm": 0.425531914893617, + "acc_norm_stderr": 0.03232146916224468 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.3508771929824561, + "acc_stderr": 0.044895393502706986, + "acc_norm": 0.3508771929824561, + "acc_norm_stderr": 0.044895393502706986 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.4689655172413793, + "acc_stderr": 0.04158632762097828, + "acc_norm": 0.4689655172413793, + "acc_norm_stderr": 0.04158632762097828 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.335978835978836, + "acc_stderr": 0.024326310529149138, + "acc_norm": 0.335978835978836, + "acc_norm_stderr": 0.024326310529149138 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.3968253968253968, + "acc_stderr": 0.043758884927270605, + "acc_norm": 0.3968253968253968, + "acc_norm_stderr": 0.043758884927270605 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.33, + "acc_stderr": 0.04725815626252604, + "acc_norm": 0.33, + "acc_norm_stderr": 0.04725815626252604 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.632258064516129, + "acc_stderr": 0.02743086657997347, + "acc_norm": 0.632258064516129, + "acc_norm_stderr": 0.02743086657997347 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.43349753694581283, + "acc_stderr": 0.03486731727419872, + "acc_norm": 0.43349753694581283, + "acc_norm_stderr": 0.03486731727419872 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.61, + "acc_stderr": 0.04902071300001974, + "acc_norm": 0.61, + "acc_norm_stderr": 0.04902071300001974 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.7272727272727273, + "acc_stderr": 0.0347769116216366, + "acc_norm": 0.7272727272727273, + "acc_norm_stderr": 0.0347769116216366 + }, + 
"harness|hendrycksTest-high_school_geography|5": { + "acc": 0.7070707070707071, + "acc_stderr": 0.03242497958178816, + "acc_norm": 0.7070707070707071, + "acc_norm_stderr": 0.03242497958178816 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.7875647668393783, + "acc_stderr": 0.02951928261681723, + "acc_norm": 0.7875647668393783, + "acc_norm_stderr": 0.02951928261681723 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.6, + "acc_stderr": 0.02483881198803316, + "acc_norm": 0.6, + "acc_norm_stderr": 0.02483881198803316 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.35555555555555557, + "acc_stderr": 0.02918571494985741, + "acc_norm": 0.35555555555555557, + "acc_norm_stderr": 0.02918571494985741 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.6134453781512605, + "acc_stderr": 0.03163145807552379, + "acc_norm": 0.6134453781512605, + "acc_norm_stderr": 0.03163145807552379 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.3708609271523179, + "acc_stderr": 0.03943966699183629, + "acc_norm": 0.3708609271523179, + "acc_norm_stderr": 0.03943966699183629 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.7761467889908257, + "acc_stderr": 0.017871217767790215, + "acc_norm": 0.7761467889908257, + "acc_norm_stderr": 0.017871217767790215 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.46296296296296297, + "acc_stderr": 0.034006036255382704, + "acc_norm": 0.46296296296296297, + "acc_norm_stderr": 0.034006036255382704 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.7843137254901961, + "acc_stderr": 0.028867431449849313, + "acc_norm": 0.7843137254901961, + "acc_norm_stderr": 0.028867431449849313 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.7890295358649789, + "acc_stderr": 0.02655837250266192, + "acc_norm": 0.7890295358649789, + "acc_norm_stderr": 0.02655837250266192 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.6502242152466368, + "acc_stderr": 0.03200736719484503, + "acc_norm": 0.6502242152466368, + "acc_norm_stderr": 0.03200736719484503 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.5801526717557252, + "acc_stderr": 0.043285772152629715, + "acc_norm": 0.5801526717557252, + "acc_norm_stderr": 0.043285772152629715 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.768595041322314, + "acc_stderr": 0.03849856098794088, + "acc_norm": 0.768595041322314, + "acc_norm_stderr": 0.03849856098794088 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.7685185185185185, + "acc_stderr": 0.04077494709252626, + "acc_norm": 0.7685185185185185, + "acc_norm_stderr": 0.04077494709252626 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.7116564417177914, + "acc_stderr": 0.03559039531617342, + "acc_norm": 0.7116564417177914, + "acc_norm_stderr": 0.03559039531617342 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.3482142857142857, + "acc_stderr": 0.04521829902833585, + "acc_norm": 0.3482142857142857, + "acc_norm_stderr": 0.04521829902833585 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.7378640776699029, + "acc_stderr": 0.04354631077260595, + "acc_norm": 0.7378640776699029, + "acc_norm_stderr": 0.04354631077260595 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.8034188034188035, + "acc_stderr": 0.02603538609895129, + "acc_norm": 0.8034188034188035, + "acc_norm_stderr": 0.02603538609895129 + }, + 
"harness|hendrycksTest-medical_genetics|5": { + "acc": 0.63, + "acc_stderr": 0.048523658709391, + "acc_norm": 0.63, + "acc_norm_stderr": 0.048523658709391 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.776500638569604, + "acc_stderr": 0.01489723522945071, + "acc_norm": 0.776500638569604, + "acc_norm_stderr": 0.01489723522945071 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.5895953757225434, + "acc_stderr": 0.026483392042098177, + "acc_norm": 0.5895953757225434, + "acc_norm_stderr": 0.026483392042098177 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.4480446927374302, + "acc_stderr": 0.016631976628930595, + "acc_norm": 0.4480446927374302, + "acc_norm_stderr": 0.016631976628930595 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.5751633986928104, + "acc_stderr": 0.02830457667314111, + "acc_norm": 0.5751633986928104, + "acc_norm_stderr": 0.02830457667314111 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.6591639871382636, + "acc_stderr": 0.026920841260776165, + "acc_norm": 0.6591639871382636, + "acc_norm_stderr": 0.026920841260776165 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.6574074074074074, + "acc_stderr": 0.026406145973625672, + "acc_norm": 0.6574074074074074, + "acc_norm_stderr": 0.026406145973625672 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.49645390070921985, + "acc_stderr": 0.02982674915328092, + "acc_norm": 0.49645390070921985, + "acc_norm_stderr": 0.02982674915328092 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.44132985658409385, + "acc_stderr": 0.01268201633564667, + "acc_norm": 0.44132985658409385, + "acc_norm_stderr": 0.01268201633564667 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.5772058823529411, + "acc_stderr": 0.030008562845003476, + "acc_norm": 0.5772058823529411, + "acc_norm_stderr": 0.030008562845003476 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.5833333333333334, + "acc_stderr": 0.01994491413687358, + "acc_norm": 0.5833333333333334, + "acc_norm_stderr": 0.01994491413687358 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.6636363636363637, + "acc_stderr": 0.04525393596302505, + "acc_norm": 0.6636363636363637, + "acc_norm_stderr": 0.04525393596302505 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.5469387755102041, + "acc_stderr": 0.03186785930004128, + "acc_norm": 0.5469387755102041, + "acc_norm_stderr": 0.03186785930004128 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.7213930348258707, + "acc_stderr": 0.031700561834973086, + "acc_norm": 0.7213930348258707, + "acc_norm_stderr": 0.031700561834973086 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.78, + "acc_stderr": 0.04163331998932264, + "acc_norm": 0.78, + "acc_norm_stderr": 0.04163331998932264 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.40963855421686746, + "acc_stderr": 0.03828401115079023, + "acc_norm": 0.40963855421686746, + "acc_norm_stderr": 0.03828401115079023 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.7953216374269005, + "acc_stderr": 0.030944459778533193, + "acc_norm": 0.7953216374269005, + "acc_norm_stderr": 0.030944459778533193 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.2974296205630355, + "mc1_stderr": 0.016002651487361005, + "mc2": 0.432555793209622, + "mc2_stderr": 0.014584160007096517 + }, + "all": { + "acc": 0.5721730477835751, + "acc_stderr": 0.03435830007549653, + "acc_norm": 0.5765179420517365, + "acc_norm_stderr": 0.03433789840389022, 
+ "mc1": 0.2974296205630355, + "mc1_stderr": 0.016002651487361005, + "mc2": 0.432555793209622, + "mc2_stderr": 0.014584160007096517 + } + }, + "versions": { + "harness|arc:challenge|25": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "all": 0 + }, + "config_tasks": { + "harness|arc:challenge": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + 
"harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task" + }, + "summary_tasks": { + "harness|arc:challenge|25": { + "hashes": { + "hash_examples": "17b0cae357c0259e", + "hash_full_prompts": 
"045cbb916e5145c6", + "hash_input_tokens": "3722289b79076c44", + "hash_cont_tokens": "e8abf848493b50f7" + }, + "truncated": 0, + "non-truncated": 4687, + "padded": 4687, + "non-padded": 0, + "effective_few_shots": 25.0, + "num_truncated_few_shots": 0 + }, + "harness|hellaswag|10": { + "hashes": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "ececd684171f1ef2", + "hash_cont_tokens": "9fe0a5c42e1532db" + }, + "truncated": 0, + "non-truncated": 40168, + "padded": 40113, + "non-padded": 55, + "effective_few_shots": 10.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hashes": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "c54ff61ad0273dd7", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-anatomy|5": { + "hashes": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "be31a1e22aef5f90", + "hash_cont_tokens": "f11971a765cb609f" + }, + "truncated": 0, + "non-truncated": 540, + "padded": 540, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-astronomy|5": { + "hashes": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "277a7b1fad566940", + "hash_cont_tokens": "440a970fadecdc7b" + }, + "truncated": 0, + "non-truncated": 608, + "padded": 608, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-business_ethics|5": { + "hashes": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "ba552605bc116de5", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hashes": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "428c7563d0b98ab9", + "hash_cont_tokens": "7ecd60c25b9bfe5b" + }, + "truncated": 0, + "non-truncated": 1060, + "padded": 1060, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_biology|5": { + "hashes": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "da036601573942e2", + "hash_cont_tokens": "875cde3af7a0ee14" + }, + "truncated": 0, + "non-truncated": 576, + "padded": 576, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_chemistry|5": { + "hashes": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "94e0196d6aded13d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_computer_science|5": { + "hashes": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "6e4d0f4a8d36690b", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + 
"non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_mathematics|5": { + "hashes": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "614054d17109a25d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_medicine|5": { + "hashes": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "081bb2b524defd1c", + "hash_cont_tokens": "702fb6d82ff0d6ac" + }, + "truncated": 0, + "non-truncated": 692, + "padded": 692, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_physics|5": { + "hashes": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "5421d9a1af86cbd4", + "hash_cont_tokens": "f7b8097afc16a47c" + }, + "truncated": 0, + "non-truncated": 408, + "padded": 408, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-computer_security|5": { + "hashes": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "5e6b70ecb333cf18", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hashes": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "c2ef11a87264ceed", + "hash_cont_tokens": "aa0e8bc655f2f641" + }, + "truncated": 0, + "non-truncated": 940, + "padded": 940, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-econometrics|5": { + "hashes": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "ecaccd912a4c3978", + "hash_cont_tokens": "b1cc6e7e9fcd3827" + }, + "truncated": 0, + "non-truncated": 456, + "padded": 456, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hashes": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "1590c84291399be8", + "hash_cont_tokens": "2425a3f084a591ef" + }, + "truncated": 0, + "non-truncated": 580, + "padded": 580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hashes": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "3269597f715b0da1", + "hash_cont_tokens": "bd87bf0c060fd925" + }, + "truncated": 0, + "non-truncated": 1512, + "padded": 1512, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-formal_logic|5": { + "hashes": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "a2800d20f3ab8d7c", + "hash_cont_tokens": "eb8932890e0605db" + }, + "truncated": 0, + "non-truncated": 504, + "padded": 504, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-global_facts|5": { + "hashes": { + 
"hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "94ed44b3772505ad", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_biology|5": { + "hashes": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "24423acb928db768", + "hash_cont_tokens": "1ddcb86d28cde266" + }, + "truncated": 0, + "non-truncated": 1240, + "padded": 1240, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hashes": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "831ff35c474e5cef", + "hash_cont_tokens": "176c8dcff38c5f8f" + }, + "truncated": 0, + "non-truncated": 812, + "padded": 812, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hashes": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "a20a96b44dcc5b30", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hashes": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "5002f4ac8b1562ca", + "hash_cont_tokens": "674fc454bdc5ac93" + }, + "truncated": 0, + "non-truncated": 660, + "padded": 656, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_geography|5": { + "hashes": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "7c5547c7da5bc793", + "hash_cont_tokens": "03a5012b916274ea" + }, + "truncated": 0, + "non-truncated": 792, + "padded": 792, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hashes": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "f62991cb6a496b05", + "hash_cont_tokens": "873d2aab226ba1d8" + }, + "truncated": 0, + "non-truncated": 772, + "padded": 772, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hashes": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "4cef2aff6e3d59ed", + "hash_cont_tokens": "c583432ad27fcfe0" + }, + "truncated": 0, + "non-truncated": 1560, + "padded": 1560, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hashes": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "6e2577ea4082ed2b", + "hash_cont_tokens": "d7907b61bcb8c123" + }, + "truncated": 0, + "non-truncated": 1080, + "padded": 1080, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hashes": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": 
"bfc393381298609e", + "hash_input_tokens": "c5fc9aeb1079c8e4", + "hash_cont_tokens": "f47f041de50333b9" + }, + "truncated": 0, + "non-truncated": 952, + "padded": 952, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_physics|5": { + "hashes": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "555fc385cffa84ca", + "hash_cont_tokens": "0d56317b3e5eedb5" + }, + "truncated": 0, + "non-truncated": 604, + "padded": 604, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hashes": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "febd23cbf9973b7f", + "hash_cont_tokens": "09ba1243e7390c0f" + }, + "truncated": 0, + "non-truncated": 2180, + "padded": 2180, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hashes": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "400e55b56ee6fbd7", + "hash_cont_tokens": "9cc29889c3d3f77d" + }, + "truncated": 0, + "non-truncated": 864, + "padded": 864, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hashes": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "c639cce12a46ebad", + "hash_cont_tokens": "cdd0b3dc06d933e5" + }, + "truncated": 0, + "non-truncated": 816, + "padded": 816, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hashes": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "b9762065cce6f3a6", + "hash_cont_tokens": "e02816433ff28daf" + }, + "truncated": 0, + "non-truncated": 948, + "padded": 948, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_aging|5": { + "hashes": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "541a75f071dcf579", + "hash_cont_tokens": "142a4a8a1138a214" + }, + "truncated": 0, + "non-truncated": 892, + "padded": 892, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_sexuality|5": { + "hashes": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "04269e5c5a257dd9", + "hash_cont_tokens": "bc54813e809b796d" + }, + "truncated": 0, + "non-truncated": 524, + "padded": 524, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-international_law|5": { + "hashes": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "d93ba9d9d38e4397", + "hash_cont_tokens": "8ea8c5ff76a15bca" + }, + "truncated": 0, + "non-truncated": 484, + "padded": 484, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-jurisprudence|5": { + "hashes": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "9eeaccd2698b4f5a", + "hash_cont_tokens": "e3a8cd951b6e3469" + }, + "truncated": 0, + 
"non-truncated": 432, + "padded": 432, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hashes": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "b4f08f544f2b7576", + "hash_cont_tokens": "3e9e0bdc248fd88a" + }, + "truncated": 0, + "non-truncated": 652, + "padded": 648, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-machine_learning|5": { + "hashes": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "900c2a51f1174b9f", + "hash_cont_tokens": "55b12fb138c6a064" + }, + "truncated": 0, + "non-truncated": 448, + "padded": 448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-management|5": { + "hashes": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "6b36efb4689c6eca", + "hash_cont_tokens": "a01d6d39a83c4597" + }, + "truncated": 0, + "non-truncated": 412, + "padded": 412, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-marketing|5": { + "hashes": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "2aaac78a0cfed47a", + "hash_cont_tokens": "6aeaed4d823c98aa" + }, + "truncated": 0, + "non-truncated": 936, + "padded": 936, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-medical_genetics|5": { + "hashes": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "886ca823b41c094a", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-miscellaneous|5": { + "hashes": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "72fd71de7675e7d0", + "hash_cont_tokens": "9b0ab02a64603081" + }, + "truncated": 0, + "non-truncated": 3132, + "padded": 3132, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_disputes|5": { + "hashes": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "f3ca0dd8e7a1eb09", + "hash_cont_tokens": "3b8bbe9108e55ce9" + }, + "truncated": 0, + "non-truncated": 1384, + "padded": 1354, + "non-padded": 30, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hashes": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "3e793631e951f23c", + "hash_cont_tokens": "3e9bfc0362e97330" + }, + "truncated": 0, + "non-truncated": 3580, + "padded": 3580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-nutrition|5": { + "hashes": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "59753c2144ea93af", + "hash_cont_tokens": "23b2dc6ee2da4cfc" + }, + "truncated": 0, + "non-truncated": 1224, + "padded": 1224, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-philosophy|5": { + "hashes": 
{ + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "bd8d3dbed15a8c34", + "hash_cont_tokens": "9f6ff69d23a48783" + }, + "truncated": 0, + "non-truncated": 1244, + "padded": 1244, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-prehistory|5": { + "hashes": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "3573cd87facbb7c5", + "hash_cont_tokens": "d6458d743d875837" + }, + "truncated": 0, + "non-truncated": 1296, + "padded": 1296, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_accounting|5": { + "hashes": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "17e721bc1a7cbb47", + "hash_cont_tokens": "922a195f53a35662" + }, + "truncated": 0, + "non-truncated": 1128, + "padded": 1128, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_law|5": { + "hashes": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "c9f7583fff66d361", + "hash_cont_tokens": "2e590029ef41fbcd" + }, + "truncated": 0, + "non-truncated": 6136, + "padded": 6136, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_medicine|5": { + "hashes": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "40a933f829116f8d", + "hash_cont_tokens": "7cfee54dbddd5a98" + }, + "truncated": 0, + "non-truncated": 1088, + "padded": 1088, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_psychology|5": { + "hashes": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "0dfb73a8eb3f692c", + "hash_cont_tokens": "a86677b2a45c20e1" + }, + "truncated": 0, + "non-truncated": 2448, + "padded": 2448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-public_relations|5": { + "hashes": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "1710c6ba4c9f3cbd", + "hash_cont_tokens": "0d756ccaae031757" + }, + "truncated": 0, + "non-truncated": 440, + "padded": 440, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-security_studies|5": { + "hashes": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "32a03f1f22a6e103", + "hash_cont_tokens": "b2229bc2cfbf594b" + }, + "truncated": 0, + "non-truncated": 980, + "padded": 980, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-sociology|5": { + "hashes": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "828999f7624cbe7e", + "hash_cont_tokens": "c3a3bdfd177eed5b" + }, + "truncated": 0, + "non-truncated": 804, + "padded": 804, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hashes": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "42054621e718dbee", + 
"hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-virology|5": { + "hashes": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "6c4f0aa4dc859c04", + "hash_cont_tokens": "af8b3658088cb37f" + }, + "truncated": 0, + "non-truncated": 664, + "padded": 664, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-world_religions|5": { + "hashes": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "6c75d44e092ff24f", + "hash_cont_tokens": "060118bef6de4e0a" + }, + "truncated": 0, + "non-truncated": 684, + "padded": 684, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|truthfulqa:mc|0": { + "hashes": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "2738d7ed7075faa7", + "hash_cont_tokens": "f5da56a132aab151" + }, + "truncated": 0, + "non-truncated": 9996, + "padded": 9996, + "non-padded": 0, + "effective_few_shots": 0.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "d84d18e9a963753d", + "hash_full_prompts": "12b540783521a8e6", + "hash_input_tokens": "5c73a7dce6ccf737", + "hash_cont_tokens": "71d56183130fecbd" + }, + "total_evaluation_time_secondes": "6357.531043291092", + "truncated": 0, + "non-truncated": 111019, + "padded": 110926, + "non-padded": 93, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/wei123602/Llama-2-13b-FINETUNE4/results_2023-10-23T06-23-21.987505.json b/eval-results/wei123602/Llama-2-13b-FINETUNE4/results_2023-10-23T06-23-21.987505.json new file mode 100644 index 0000000000000000000000000000000000000000..8e2d3857f9fa98b366f5a0e4ca447c1f9c8b182d --- /dev/null +++ b/eval-results/wei123602/Llama-2-13b-FINETUNE4/results_2023-10-23T06-23-21.987505.json @@ -0,0 +1,107 @@ +{ + "config_general": { + "model_name": "wei123602/Llama-2-13b-FINETUNE4", + "model_sha": "939d06081210fa943c60210a47583f43b60901ad", + "model_size": "24.32 GB", + "model_dtype": "torch.float16", + "lighteval_sha": "0f318ecf002208468154899217b3ba7c6ae09374", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|drop|3": { + "em": 0.08525587248322147, + "em_stderr": 0.0028599050719363664, + "f1": 0.13560297818791875, + "f1_stderr": 0.0029877199841954003 + }, + "harness|gsm8k|5": { + "acc": 0.12509476876421532, + "acc_stderr": 0.009112601439849643 + }, + "harness|winogrande|5": { + "acc": 0.7695343330702447, + "acc_stderr": 0.011835872164836671 + }, + "all": { + "em": 0.08525587248322147, + "em_stderr": 0.0028599050719363664, + "f1": 0.13560297818791875, + "f1_stderr": 0.0029877199841954003, + "acc": 0.44731455091723, + "acc_stderr": 0.010474236802343157 + } + }, + "versions": { + "harness|drop|3": 1, + "harness|gsm8k|5": 0, + "harness|winogrande|5": 0, + "all": 0 + }, + "config_tasks": { + "harness|drop": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|drop|3": { + "hashes": { + "hash_examples": "1d27416e8324e9a3", + "hash_full_prompts": "a5513ff9a741b385", + "hash_input_tokens": "42076f0efbb50aa6", + 
"hash_cont_tokens": "a1bd072839020a23" + }, + "truncated": 3, + "non-truncated": 9533, + "padded": 0, + "non-padded": 9536, + "effective_few_shots": 3.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "bda342e47b5099b2", + "hash_cont_tokens": "2dc9b2478c5d66f1" + }, + "truncated": 0, + "non-truncated": 1319, + "padded": 0, + "non-padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "c0bedf98cb040854", + "hash_cont_tokens": "f08975ad6f2d5864" + }, + "truncated": 0, + "non-truncated": 2534, + "padded": 2432, + "non-padded": 102, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "9b4d8993161e637d", + "hash_full_prompts": "08215e527b7e60a5", + "hash_input_tokens": "a12f3e3c934bd78b", + "hash_cont_tokens": "6e174dfd5ae45ef2" + }, + "total_evaluation_time_secondes": "12144.591464996338", + "truncated": 3, + "non-truncated": 13386, + "padded": 2432, + "non-padded": 10957, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/wei123602/Llama-2-13b-FINETUNE4_TEST/results_2023-09-21T23-17-56.003321.json b/eval-results/wei123602/Llama-2-13b-FINETUNE4_TEST/results_2023-09-21T23-17-56.003321.json new file mode 100644 index 0000000000000000000000000000000000000000..490fb284bc00399ed9850fed5e55e5b655a0df47 --- /dev/null +++ b/eval-results/wei123602/Llama-2-13b-FINETUNE4_TEST/results_2023-09-21T23-17-56.003321.json @@ -0,0 +1,1367 @@ +{ + "config_general": { + "model_name": "wei123602/Llama-2-13b-FINETUNE4_TEST", + "model_sha": "0ed198a814192b06e60715112d2a4b6bfd630806", + "model_size": "24.32 GB", + "model_dtype": "torch.float16", + "lighteval_sha": "0f318ecf002208468154899217b3ba7c6ae09374", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|arc:challenge|25": { + "acc": 0.5, + "acc_stderr": 0.014611390804670088, + "acc_norm": 0.5477815699658704, + "acc_norm_stderr": 0.014544519880633827 + }, + "harness|hellaswag|10": { + "acc": 0.6085441147181836, + "acc_stderr": 0.0048707850367082925, + "acc_norm": 0.8151762597092213, + "acc_norm_stderr": 0.0038736123391606564 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.33, + "acc_stderr": 0.04725815626252606, + "acc_norm": 0.33, + "acc_norm_stderr": 0.04725815626252606 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.4666666666666667, + "acc_stderr": 0.043097329010363554, + "acc_norm": 0.4666666666666667, + "acc_norm_stderr": 0.043097329010363554 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.5460526315789473, + "acc_stderr": 0.04051646342874143, + "acc_norm": 0.5460526315789473, + "acc_norm_stderr": 0.04051646342874143 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.58, + "acc_stderr": 0.049604496374885836, + "acc_norm": 0.58, + "acc_norm_stderr": 0.049604496374885836 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.6, + "acc_stderr": 0.030151134457776292, + "acc_norm": 0.6, + "acc_norm_stderr": 0.030151134457776292 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.6180555555555556, + "acc_stderr": 0.040629907841466674, + "acc_norm": 0.6180555555555556, + "acc_norm_stderr": 
0.040629907841466674 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.45, + "acc_stderr": 0.049999999999999996, + "acc_norm": 0.45, + "acc_norm_stderr": 0.049999999999999996 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.41, + "acc_stderr": 0.049431107042371025, + "acc_norm": 0.41, + "acc_norm_stderr": 0.049431107042371025 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.36, + "acc_stderr": 0.04824181513244218, + "acc_norm": 0.36, + "acc_norm_stderr": 0.04824181513244218 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.5549132947976878, + "acc_stderr": 0.03789401760283647, + "acc_norm": 0.5549132947976878, + "acc_norm_stderr": 0.03789401760283647 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.3431372549019608, + "acc_stderr": 0.04724007352383888, + "acc_norm": 0.3431372549019608, + "acc_norm_stderr": 0.04724007352383888 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.66, + "acc_stderr": 0.04760952285695237, + "acc_norm": 0.66, + "acc_norm_stderr": 0.04760952285695237 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.46382978723404256, + "acc_stderr": 0.032600385118357715, + "acc_norm": 0.46382978723404256, + "acc_norm_stderr": 0.032600385118357715 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.3508771929824561, + "acc_stderr": 0.044895393502706986, + "acc_norm": 0.3508771929824561, + "acc_norm_stderr": 0.044895393502706986 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.4896551724137931, + "acc_stderr": 0.04165774775728763, + "acc_norm": 0.4896551724137931, + "acc_norm_stderr": 0.04165774775728763 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.3306878306878307, + "acc_stderr": 0.024229965298425075, + "acc_norm": 0.3306878306878307, + "acc_norm_stderr": 0.024229965298425075 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.36507936507936506, + "acc_stderr": 0.04306241259127153, + "acc_norm": 0.36507936507936506, + "acc_norm_stderr": 0.04306241259127153 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.36, + "acc_stderr": 0.04824181513244218, + "acc_norm": 0.36, + "acc_norm_stderr": 0.04824181513244218 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.6290322580645161, + "acc_stderr": 0.027480541887953593, + "acc_norm": 0.6290322580645161, + "acc_norm_stderr": 0.027480541887953593 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.4039408866995074, + "acc_stderr": 0.0345245390382204, + "acc_norm": 0.4039408866995074, + "acc_norm_stderr": 0.0345245390382204 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.61, + "acc_stderr": 0.04902071300001975, + "acc_norm": 0.61, + "acc_norm_stderr": 0.04902071300001975 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.703030303030303, + "acc_stderr": 0.03567969772268049, + "acc_norm": 0.703030303030303, + "acc_norm_stderr": 0.03567969772268049 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.696969696969697, + "acc_stderr": 0.03274287914026866, + "acc_norm": 0.696969696969697, + "acc_norm_stderr": 0.03274287914026866 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.8341968911917098, + "acc_stderr": 0.026839845022314415, + "acc_norm": 0.8341968911917098, + "acc_norm_stderr": 0.026839845022314415 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.558974358974359, + "acc_stderr": 
0.025174048384000756, + "acc_norm": 0.558974358974359, + "acc_norm_stderr": 0.025174048384000756 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.3296296296296296, + "acc_stderr": 0.02866120111652458, + "acc_norm": 0.3296296296296296, + "acc_norm_stderr": 0.02866120111652458 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.6092436974789915, + "acc_stderr": 0.03169380235712996, + "acc_norm": 0.6092436974789915, + "acc_norm_stderr": 0.03169380235712996 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.3509933774834437, + "acc_stderr": 0.03896981964257375, + "acc_norm": 0.3509933774834437, + "acc_norm_stderr": 0.03896981964257375 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.7761467889908257, + "acc_stderr": 0.017871217767790236, + "acc_norm": 0.7761467889908257, + "acc_norm_stderr": 0.017871217767790236 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.4074074074074074, + "acc_stderr": 0.03350991604696042, + "acc_norm": 0.4074074074074074, + "acc_norm_stderr": 0.03350991604696042 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.7990196078431373, + "acc_stderr": 0.028125972265654362, + "acc_norm": 0.7990196078431373, + "acc_norm_stderr": 0.028125972265654362 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.759493670886076, + "acc_stderr": 0.027820781981149685, + "acc_norm": 0.759493670886076, + "acc_norm_stderr": 0.027820781981149685 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.6412556053811659, + "acc_stderr": 0.03219079200419995, + "acc_norm": 0.6412556053811659, + "acc_norm_stderr": 0.03219079200419995 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.5343511450381679, + "acc_stderr": 0.04374928560599736, + "acc_norm": 0.5343511450381679, + "acc_norm_stderr": 0.04374928560599736 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.7603305785123967, + "acc_stderr": 0.03896878985070416, + "acc_norm": 0.7603305785123967, + "acc_norm_stderr": 0.03896878985070416 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.6851851851851852, + "acc_stderr": 0.04489931073591312, + "acc_norm": 0.6851851851851852, + "acc_norm_stderr": 0.04489931073591312 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.6380368098159509, + "acc_stderr": 0.037757007291414416, + "acc_norm": 0.6380368098159509, + "acc_norm_stderr": 0.037757007291414416 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.32142857142857145, + "acc_stderr": 0.044328040552915185, + "acc_norm": 0.32142857142857145, + "acc_norm_stderr": 0.044328040552915185 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.7378640776699029, + "acc_stderr": 0.043546310772605956, + "acc_norm": 0.7378640776699029, + "acc_norm_stderr": 0.043546310772605956 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.8162393162393162, + "acc_stderr": 0.02537213967172293, + "acc_norm": 0.8162393162393162, + "acc_norm_stderr": 0.02537213967172293 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.63, + "acc_stderr": 0.048523658709391, + "acc_norm": 0.63, + "acc_norm_stderr": 0.048523658709391 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.7701149425287356, + "acc_stderr": 0.0150463018466918, + "acc_norm": 0.7701149425287356, + "acc_norm_stderr": 0.0150463018466918 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.615606936416185, + "acc_stderr": 0.026189666966272035, + "acc_norm": 0.615606936416185, + 
"acc_norm_stderr": 0.026189666966272035 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.29497206703910617, + "acc_stderr": 0.015251931579208173, + "acc_norm": 0.29497206703910617, + "acc_norm_stderr": 0.015251931579208173 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.6013071895424836, + "acc_stderr": 0.02803609227389177, + "acc_norm": 0.6013071895424836, + "acc_norm_stderr": 0.02803609227389177 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.684887459807074, + "acc_stderr": 0.026385273703464492, + "acc_norm": 0.684887459807074, + "acc_norm_stderr": 0.026385273703464492 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.6172839506172839, + "acc_stderr": 0.027044538138402616, + "acc_norm": 0.6172839506172839, + "acc_norm_stderr": 0.027044538138402616 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.48936170212765956, + "acc_stderr": 0.02982074719142248, + "acc_norm": 0.48936170212765956, + "acc_norm_stderr": 0.02982074719142248 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.4361147327249022, + "acc_stderr": 0.012665568135455326, + "acc_norm": 0.4361147327249022, + "acc_norm_stderr": 0.012665568135455326 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.5625, + "acc_stderr": 0.030134614954403924, + "acc_norm": 0.5625, + "acc_norm_stderr": 0.030134614954403924 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.5686274509803921, + "acc_stderr": 0.020036393768352635, + "acc_norm": 0.5686274509803921, + "acc_norm_stderr": 0.020036393768352635 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.6545454545454545, + "acc_stderr": 0.04554619617541054, + "acc_norm": 0.6545454545454545, + "acc_norm_stderr": 0.04554619617541054 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.5346938775510204, + "acc_stderr": 0.03193207024425314, + "acc_norm": 0.5346938775510204, + "acc_norm_stderr": 0.03193207024425314 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.7263681592039801, + "acc_stderr": 0.03152439186555402, + "acc_norm": 0.7263681592039801, + "acc_norm_stderr": 0.03152439186555402 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.73, + "acc_stderr": 0.044619604333847394, + "acc_norm": 0.73, + "acc_norm_stderr": 0.044619604333847394 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.3855421686746988, + "acc_stderr": 0.03789134424611552, + "acc_norm": 0.3855421686746988, + "acc_norm_stderr": 0.03789134424611552 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.7543859649122807, + "acc_stderr": 0.03301405946987249, + "acc_norm": 0.7543859649122807, + "acc_norm_stderr": 0.03301405946987249 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.27050183598531213, + "mc1_stderr": 0.015550778332842895, + "mc2": 0.3913965666576138, + "mc2_stderr": 0.014096837740998912 + }, + "all": { + "acc": 0.5601115387662375, + "acc_stderr": 0.034549712275199894, + "acc_norm": 0.5644236356299138, + "acc_norm_stderr": 0.03453167763754593, + "mc1": 0.27050183598531213, + "mc1_stderr": 0.015550778332842895, + "mc2": 0.3913965666576138, + "mc2_stderr": 0.014096837740998912 + } + }, + "versions": { + "harness|arc:challenge|25": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + 
"harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "all": 0 + }, + "config_tasks": { + "harness|arc:challenge": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness 
task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task" + }, + "summary_tasks": { + "harness|arc:challenge|25": { + "hashes": { + "hash_examples": "17b0cae357c0259e", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "3722289b79076c44", + "hash_cont_tokens": "e8abf848493b50f7" + }, + "truncated": 0, + "non-truncated": 4687, + "padded": 4687, + "non-padded": 0, + "effective_few_shots": 25.0, + "num_truncated_few_shots": 0 + }, + "harness|hellaswag|10": { + "hashes": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "ececd684171f1ef2", + "hash_cont_tokens": "9fe0a5c42e1532db" + }, + "truncated": 0, + "non-truncated": 40168, + "padded": 
40113, + "non-padded": 55, + "effective_few_shots": 10.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hashes": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "c54ff61ad0273dd7", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-anatomy|5": { + "hashes": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "be31a1e22aef5f90", + "hash_cont_tokens": "f11971a765cb609f" + }, + "truncated": 0, + "non-truncated": 540, + "padded": 540, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-astronomy|5": { + "hashes": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "277a7b1fad566940", + "hash_cont_tokens": "440a970fadecdc7b" + }, + "truncated": 0, + "non-truncated": 608, + "padded": 608, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-business_ethics|5": { + "hashes": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "ba552605bc116de5", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hashes": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "428c7563d0b98ab9", + "hash_cont_tokens": "7ecd60c25b9bfe5b" + }, + "truncated": 0, + "non-truncated": 1060, + "padded": 1060, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_biology|5": { + "hashes": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "da036601573942e2", + "hash_cont_tokens": "875cde3af7a0ee14" + }, + "truncated": 0, + "non-truncated": 576, + "padded": 576, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_chemistry|5": { + "hashes": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "94e0196d6aded13d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_computer_science|5": { + "hashes": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "6e4d0f4a8d36690b", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_mathematics|5": { + "hashes": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "614054d17109a25d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_medicine|5": { + "hashes": { + 
"hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "081bb2b524defd1c", + "hash_cont_tokens": "702fb6d82ff0d6ac" + }, + "truncated": 0, + "non-truncated": 692, + "padded": 692, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_physics|5": { + "hashes": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "5421d9a1af86cbd4", + "hash_cont_tokens": "f7b8097afc16a47c" + }, + "truncated": 0, + "non-truncated": 408, + "padded": 408, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-computer_security|5": { + "hashes": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "5e6b70ecb333cf18", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hashes": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "c2ef11a87264ceed", + "hash_cont_tokens": "aa0e8bc655f2f641" + }, + "truncated": 0, + "non-truncated": 940, + "padded": 940, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-econometrics|5": { + "hashes": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "ecaccd912a4c3978", + "hash_cont_tokens": "b1cc6e7e9fcd3827" + }, + "truncated": 0, + "non-truncated": 456, + "padded": 456, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hashes": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "1590c84291399be8", + "hash_cont_tokens": "2425a3f084a591ef" + }, + "truncated": 0, + "non-truncated": 580, + "padded": 580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hashes": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "3269597f715b0da1", + "hash_cont_tokens": "bd87bf0c060fd925" + }, + "truncated": 0, + "non-truncated": 1512, + "padded": 1512, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-formal_logic|5": { + "hashes": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "a2800d20f3ab8d7c", + "hash_cont_tokens": "eb8932890e0605db" + }, + "truncated": 0, + "non-truncated": 504, + "padded": 504, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-global_facts|5": { + "hashes": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "94ed44b3772505ad", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_biology|5": { + "hashes": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "24423acb928db768", + "hash_cont_tokens": 
"1ddcb86d28cde266" + }, + "truncated": 0, + "non-truncated": 1240, + "padded": 1240, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hashes": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "831ff35c474e5cef", + "hash_cont_tokens": "176c8dcff38c5f8f" + }, + "truncated": 0, + "non-truncated": 812, + "padded": 812, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hashes": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "a20a96b44dcc5b30", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hashes": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "5002f4ac8b1562ca", + "hash_cont_tokens": "674fc454bdc5ac93" + }, + "truncated": 0, + "non-truncated": 660, + "padded": 656, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_geography|5": { + "hashes": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "7c5547c7da5bc793", + "hash_cont_tokens": "03a5012b916274ea" + }, + "truncated": 0, + "non-truncated": 792, + "padded": 792, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hashes": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "f62991cb6a496b05", + "hash_cont_tokens": "873d2aab226ba1d8" + }, + "truncated": 0, + "non-truncated": 772, + "padded": 772, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hashes": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "4cef2aff6e3d59ed", + "hash_cont_tokens": "c583432ad27fcfe0" + }, + "truncated": 0, + "non-truncated": 1560, + "padded": 1560, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hashes": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "6e2577ea4082ed2b", + "hash_cont_tokens": "d7907b61bcb8c123" + }, + "truncated": 0, + "non-truncated": 1080, + "padded": 1080, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hashes": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "c5fc9aeb1079c8e4", + "hash_cont_tokens": "f47f041de50333b9" + }, + "truncated": 0, + "non-truncated": 952, + "padded": 952, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_physics|5": { + "hashes": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "555fc385cffa84ca", + "hash_cont_tokens": "0d56317b3e5eedb5" + }, + "truncated": 0, + "non-truncated": 604, + 
"padded": 604, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hashes": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "febd23cbf9973b7f", + "hash_cont_tokens": "09ba1243e7390c0f" + }, + "truncated": 0, + "non-truncated": 2180, + "padded": 2180, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hashes": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "400e55b56ee6fbd7", + "hash_cont_tokens": "9cc29889c3d3f77d" + }, + "truncated": 0, + "non-truncated": 864, + "padded": 864, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hashes": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "c639cce12a46ebad", + "hash_cont_tokens": "cdd0b3dc06d933e5" + }, + "truncated": 0, + "non-truncated": 816, + "padded": 816, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hashes": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "b9762065cce6f3a6", + "hash_cont_tokens": "e02816433ff28daf" + }, + "truncated": 0, + "non-truncated": 948, + "padded": 948, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_aging|5": { + "hashes": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "541a75f071dcf579", + "hash_cont_tokens": "142a4a8a1138a214" + }, + "truncated": 0, + "non-truncated": 892, + "padded": 892, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_sexuality|5": { + "hashes": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "04269e5c5a257dd9", + "hash_cont_tokens": "bc54813e809b796d" + }, + "truncated": 0, + "non-truncated": 524, + "padded": 524, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-international_law|5": { + "hashes": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "d93ba9d9d38e4397", + "hash_cont_tokens": "8ea8c5ff76a15bca" + }, + "truncated": 0, + "non-truncated": 484, + "padded": 484, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-jurisprudence|5": { + "hashes": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "9eeaccd2698b4f5a", + "hash_cont_tokens": "e3a8cd951b6e3469" + }, + "truncated": 0, + "non-truncated": 432, + "padded": 432, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hashes": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "b4f08f544f2b7576", + "hash_cont_tokens": "3e9e0bdc248fd88a" + }, + "truncated": 0, + "non-truncated": 652, + "padded": 648, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + 
"harness|hendrycksTest-machine_learning|5": { + "hashes": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "900c2a51f1174b9f", + "hash_cont_tokens": "55b12fb138c6a064" + }, + "truncated": 0, + "non-truncated": 448, + "padded": 448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-management|5": { + "hashes": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "6b36efb4689c6eca", + "hash_cont_tokens": "a01d6d39a83c4597" + }, + "truncated": 0, + "non-truncated": 412, + "padded": 412, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-marketing|5": { + "hashes": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "2aaac78a0cfed47a", + "hash_cont_tokens": "6aeaed4d823c98aa" + }, + "truncated": 0, + "non-truncated": 936, + "padded": 936, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-medical_genetics|5": { + "hashes": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "886ca823b41c094a", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-miscellaneous|5": { + "hashes": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "72fd71de7675e7d0", + "hash_cont_tokens": "9b0ab02a64603081" + }, + "truncated": 0, + "non-truncated": 3132, + "padded": 3132, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_disputes|5": { + "hashes": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "f3ca0dd8e7a1eb09", + "hash_cont_tokens": "3b8bbe9108e55ce9" + }, + "truncated": 0, + "non-truncated": 1384, + "padded": 1354, + "non-padded": 30, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hashes": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "3e793631e951f23c", + "hash_cont_tokens": "3e9bfc0362e97330" + }, + "truncated": 0, + "non-truncated": 3580, + "padded": 3580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-nutrition|5": { + "hashes": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "59753c2144ea93af", + "hash_cont_tokens": "23b2dc6ee2da4cfc" + }, + "truncated": 0, + "non-truncated": 1224, + "padded": 1224, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-philosophy|5": { + "hashes": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "bd8d3dbed15a8c34", + "hash_cont_tokens": "9f6ff69d23a48783" + }, + "truncated": 0, + "non-truncated": 1244, + "padded": 1244, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-prehistory|5": { + "hashes": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "3573cd87facbb7c5", + 
"hash_cont_tokens": "d6458d743d875837" + }, + "truncated": 0, + "non-truncated": 1296, + "padded": 1296, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_accounting|5": { + "hashes": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "17e721bc1a7cbb47", + "hash_cont_tokens": "922a195f53a35662" + }, + "truncated": 0, + "non-truncated": 1128, + "padded": 1128, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_law|5": { + "hashes": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "c9f7583fff66d361", + "hash_cont_tokens": "2e590029ef41fbcd" + }, + "truncated": 0, + "non-truncated": 6136, + "padded": 6136, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_medicine|5": { + "hashes": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "40a933f829116f8d", + "hash_cont_tokens": "7cfee54dbddd5a98" + }, + "truncated": 0, + "non-truncated": 1088, + "padded": 1088, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_psychology|5": { + "hashes": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "0dfb73a8eb3f692c", + "hash_cont_tokens": "a86677b2a45c20e1" + }, + "truncated": 0, + "non-truncated": 2448, + "padded": 2448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-public_relations|5": { + "hashes": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "1710c6ba4c9f3cbd", + "hash_cont_tokens": "0d756ccaae031757" + }, + "truncated": 0, + "non-truncated": 440, + "padded": 440, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-security_studies|5": { + "hashes": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "32a03f1f22a6e103", + "hash_cont_tokens": "b2229bc2cfbf594b" + }, + "truncated": 0, + "non-truncated": 980, + "padded": 980, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-sociology|5": { + "hashes": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "828999f7624cbe7e", + "hash_cont_tokens": "c3a3bdfd177eed5b" + }, + "truncated": 0, + "non-truncated": 804, + "padded": 804, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hashes": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "42054621e718dbee", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-virology|5": { + "hashes": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "6c4f0aa4dc859c04", + "hash_cont_tokens": "af8b3658088cb37f" + }, + "truncated": 0, + "non-truncated": 664, + "padded": 664, + "non-padded": 0, + "effective_few_shots": 
5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-world_religions|5": { + "hashes": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "6c75d44e092ff24f", + "hash_cont_tokens": "060118bef6de4e0a" + }, + "truncated": 0, + "non-truncated": 684, + "padded": 684, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|truthfulqa:mc|0": { + "hashes": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "2738d7ed7075faa7", + "hash_cont_tokens": "f5da56a132aab151" + }, + "truncated": 0, + "non-truncated": 9996, + "padded": 9996, + "non-padded": 0, + "effective_few_shots": 0.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "d84d18e9a963753d", + "hash_full_prompts": "12b540783521a8e6", + "hash_input_tokens": "5c73a7dce6ccf737", + "hash_cont_tokens": "71d56183130fecbd" + }, + "total_evaluation_time_secondes": "6389.355361223221", + "truncated": 0, + "non-truncated": 111019, + "padded": 110926, + "non-padded": 93, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/wei123602/Llama-2-13b-FINETUNE4_TEST/results_2023-10-25T04-35-36.269188.json b/eval-results/wei123602/Llama-2-13b-FINETUNE4_TEST/results_2023-10-25T04-35-36.269188.json new file mode 100644 index 0000000000000000000000000000000000000000..fa8bebee8a6aa4cfae164b964050067a8b70f567 --- /dev/null +++ b/eval-results/wei123602/Llama-2-13b-FINETUNE4_TEST/results_2023-10-25T04-35-36.269188.json @@ -0,0 +1,107 @@ +{ + "config_general": { + "model_name": "wei123602/Llama-2-13b-FINETUNE4_TEST", + "model_sha": "0ed198a814192b06e60715112d2a4b6bfd630806", + "model_size": "24.32 GB", + "model_dtype": "torch.float16", + "lighteval_sha": "0f318ecf002208468154899217b3ba7c6ae09374", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|drop|3": { + "em": 0.2558724832214765, + "em_stderr": 0.004468637497676013, + "f1": 0.29727348993288566, + "f1_stderr": 0.0043971826108447475 + }, + "harness|gsm8k|5": { + "acc": 0.13191811978771797, + "acc_stderr": 0.009321265253857515 + }, + "harness|winogrande|5": { + "acc": 0.7703235990528808, + "acc_stderr": 0.011821645601838234 + }, + "all": { + "em": 0.2558724832214765, + "em_stderr": 0.004468637497676013, + "f1": 0.29727348993288566, + "f1_stderr": 0.0043971826108447475, + "acc": 0.4511208594202994, + "acc_stderr": 0.010571455427847876 + } + }, + "versions": { + "harness|drop|3": 1, + "harness|gsm8k|5": 0, + "harness|winogrande|5": 0, + "all": 0 + }, + "config_tasks": { + "harness|drop": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|drop|3": { + "hashes": { + "hash_examples": "1d27416e8324e9a3", + "hash_full_prompts": "a5513ff9a741b385", + "hash_input_tokens": "42076f0efbb50aa6", + "hash_cont_tokens": "5f6e7e0cb7f3a2c3" + }, + "truncated": 3, + "non-truncated": 9533, + "padded": 0, + "non-padded": 9536, + "effective_few_shots": 3.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "bda342e47b5099b2", + "hash_cont_tokens": "44f77bfdcf700bdf" + }, + "truncated": 0, + "non-truncated": 1319, + "padded": 0, + "non-padded": 1319, + "effective_few_shots": 5.0, + 
"num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "c0bedf98cb040854", + "hash_cont_tokens": "f08975ad6f2d5864" + }, + "truncated": 0, + "non-truncated": 2534, + "padded": 2432, + "non-padded": 102, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "9b4d8993161e637d", + "hash_full_prompts": "08215e527b7e60a5", + "hash_input_tokens": "a12f3e3c934bd78b", + "hash_cont_tokens": "75018c53eb3880f5" + }, + "total_evaluation_time_secondes": "11818.154337644577", + "truncated": 3, + "non-truncated": 13386, + "padded": 2432, + "non-padded": 10957, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/wei123602/Llama-2-13b-FINETUNE4_TEST2/results_2023-10-03T15-36-38.191985.json b/eval-results/wei123602/Llama-2-13b-FINETUNE4_TEST2/results_2023-10-03T15-36-38.191985.json new file mode 100644 index 0000000000000000000000000000000000000000..5be3297808f9ab486e442ae32ba8d4cf917a4d8b --- /dev/null +++ b/eval-results/wei123602/Llama-2-13b-FINETUNE4_TEST2/results_2023-10-03T15-36-38.191985.json @@ -0,0 +1,1367 @@ +{ + "config_general": { + "model_name": "wei123602/Llama-2-13b-FINETUNE4_TEST2", + "model_sha": "e312c4c59cab9d130c33288c92aad7c0cb5331d5", + "model_size": "24.32 GB", + "model_dtype": "torch.float16", + "lighteval_sha": "0f318ecf002208468154899217b3ba7c6ae09374", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|arc:challenge|25": { + "acc": 0.5401023890784983, + "acc_stderr": 0.014564318856924848, + "acc_norm": 0.5844709897610921, + "acc_norm_stderr": 0.014401366641216377 + }, + "harness|hellaswag|10": { + "acc": 0.6101374228241386, + "acc_stderr": 0.004867221634461272, + "acc_norm": 0.8169687313284206, + "acc_norm_stderr": 0.0038590186619619944 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.32, + "acc_stderr": 0.046882617226215034, + "acc_norm": 0.32, + "acc_norm_stderr": 0.046882617226215034 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.5037037037037037, + "acc_stderr": 0.04319223625811331, + "acc_norm": 0.5037037037037037, + "acc_norm_stderr": 0.04319223625811331 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.5789473684210527, + "acc_stderr": 0.040179012759817494, + "acc_norm": 0.5789473684210527, + "acc_norm_stderr": 0.040179012759817494 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.53, + "acc_stderr": 0.05016135580465919, + "acc_norm": 0.53, + "acc_norm_stderr": 0.05016135580465919 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.5962264150943396, + "acc_stderr": 0.03019761160019795, + "acc_norm": 0.5962264150943396, + "acc_norm_stderr": 0.03019761160019795 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.5902777777777778, + "acc_stderr": 0.04112490974670787, + "acc_norm": 0.5902777777777778, + "acc_norm_stderr": 0.04112490974670787 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.43, + "acc_stderr": 0.04975698519562428, + "acc_norm": 0.43, + "acc_norm_stderr": 0.04975698519562428 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.51, + "acc_stderr": 0.05024183937956912, + "acc_norm": 0.51, + "acc_norm_stderr": 0.05024183937956912 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.38, + "acc_stderr": 
0.04878317312145632, + "acc_norm": 0.38, + "acc_norm_stderr": 0.04878317312145632 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.48554913294797686, + "acc_stderr": 0.03810871630454764, + "acc_norm": 0.48554913294797686, + "acc_norm_stderr": 0.03810871630454764 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.39215686274509803, + "acc_stderr": 0.04858083574266345, + "acc_norm": 0.39215686274509803, + "acc_norm_stderr": 0.04858083574266345 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.66, + "acc_stderr": 0.04760952285695237, + "acc_norm": 0.66, + "acc_norm_stderr": 0.04760952285695237 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.4425531914893617, + "acc_stderr": 0.03246956919789958, + "acc_norm": 0.4425531914893617, + "acc_norm_stderr": 0.03246956919789958 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.2982456140350877, + "acc_stderr": 0.04303684033537314, + "acc_norm": 0.2982456140350877, + "acc_norm_stderr": 0.04303684033537314 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.5172413793103449, + "acc_stderr": 0.04164188720169375, + "acc_norm": 0.5172413793103449, + "acc_norm_stderr": 0.04164188720169375 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.34656084656084657, + "acc_stderr": 0.02450877752102841, + "acc_norm": 0.34656084656084657, + "acc_norm_stderr": 0.02450877752102841 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.31746031746031744, + "acc_stderr": 0.04163453031302859, + "acc_norm": 0.31746031746031744, + "acc_norm_stderr": 0.04163453031302859 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.33, + "acc_stderr": 0.047258156262526045, + "acc_norm": 0.33, + "acc_norm_stderr": 0.047258156262526045 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.6419354838709678, + "acc_stderr": 0.027273890594300642, + "acc_norm": 0.6419354838709678, + "acc_norm_stderr": 0.027273890594300642 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.4088669950738916, + "acc_stderr": 0.034590588158832314, + "acc_norm": 0.4088669950738916, + "acc_norm_stderr": 0.034590588158832314 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.6, + "acc_stderr": 0.04923659639173309, + "acc_norm": 0.6, + "acc_norm_stderr": 0.04923659639173309 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.7151515151515152, + "acc_stderr": 0.03524390844511781, + "acc_norm": 0.7151515151515152, + "acc_norm_stderr": 0.03524390844511781 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.7121212121212122, + "acc_stderr": 0.03225883512300992, + "acc_norm": 0.7121212121212122, + "acc_norm_stderr": 0.03225883512300992 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.844559585492228, + "acc_stderr": 0.026148483469153303, + "acc_norm": 0.844559585492228, + "acc_norm_stderr": 0.026148483469153303 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.5564102564102564, + "acc_stderr": 0.025189149894764205, + "acc_norm": 0.5564102564102564, + "acc_norm_stderr": 0.025189149894764205 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.34444444444444444, + "acc_stderr": 0.028972648884844267, + "acc_norm": 0.34444444444444444, + "acc_norm_stderr": 0.028972648884844267 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.6050420168067226, + "acc_stderr": 0.031753678460966245, + "acc_norm": 0.6050420168067226, + 
"acc_norm_stderr": 0.031753678460966245 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.33774834437086093, + "acc_stderr": 0.0386155754625517, + "acc_norm": 0.33774834437086093, + "acc_norm_stderr": 0.0386155754625517 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.781651376146789, + "acc_stderr": 0.017712600528722717, + "acc_norm": 0.781651376146789, + "acc_norm_stderr": 0.017712600528722717 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.46296296296296297, + "acc_stderr": 0.03400603625538271, + "acc_norm": 0.46296296296296297, + "acc_norm_stderr": 0.03400603625538271 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.7401960784313726, + "acc_stderr": 0.03077855467869327, + "acc_norm": 0.7401960784313726, + "acc_norm_stderr": 0.03077855467869327 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.7805907172995781, + "acc_stderr": 0.026939106581553945, + "acc_norm": 0.7805907172995781, + "acc_norm_stderr": 0.026939106581553945 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.6547085201793722, + "acc_stderr": 0.03191100192835794, + "acc_norm": 0.6547085201793722, + "acc_norm_stderr": 0.03191100192835794 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.6412213740458015, + "acc_stderr": 0.04206739313864908, + "acc_norm": 0.6412213740458015, + "acc_norm_stderr": 0.04206739313864908 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.743801652892562, + "acc_stderr": 0.03984979653302871, + "acc_norm": 0.743801652892562, + "acc_norm_stderr": 0.03984979653302871 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.7592592592592593, + "acc_stderr": 0.04133119440243839, + "acc_norm": 0.7592592592592593, + "acc_norm_stderr": 0.04133119440243839 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.6932515337423313, + "acc_stderr": 0.03623089915724147, + "acc_norm": 0.6932515337423313, + "acc_norm_stderr": 0.03623089915724147 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.3392857142857143, + "acc_stderr": 0.04493949068613539, + "acc_norm": 0.3392857142857143, + "acc_norm_stderr": 0.04493949068613539 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.7281553398058253, + "acc_stderr": 0.044052680241409216, + "acc_norm": 0.7281553398058253, + "acc_norm_stderr": 0.044052680241409216 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.7863247863247863, + "acc_stderr": 0.026853450377009168, + "acc_norm": 0.7863247863247863, + "acc_norm_stderr": 0.026853450377009168 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.66, + "acc_stderr": 0.04760952285695237, + "acc_norm": 0.66, + "acc_norm_stderr": 0.04760952285695237 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.7790549169859514, + "acc_stderr": 0.014836205167333564, + "acc_norm": 0.7790549169859514, + "acc_norm_stderr": 0.014836205167333564 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.6242774566473989, + "acc_stderr": 0.026074314851657083, + "acc_norm": 0.6242774566473989, + "acc_norm_stderr": 0.026074314851657083 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.3139664804469274, + "acc_stderr": 0.015521923933523649, + "acc_norm": 0.3139664804469274, + "acc_norm_stderr": 0.015521923933523649 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.5947712418300654, + "acc_stderr": 0.028110928492809075, + "acc_norm": 0.5947712418300654, + "acc_norm_stderr": 0.028110928492809075 + }, + 
"harness|hendrycksTest-philosophy|5": { + "acc": 0.6559485530546624, + "acc_stderr": 0.026981478043648043, + "acc_norm": 0.6559485530546624, + "acc_norm_stderr": 0.026981478043648043 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.6481481481481481, + "acc_stderr": 0.026571483480719967, + "acc_norm": 0.6481481481481481, + "acc_norm_stderr": 0.026571483480719967 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.4574468085106383, + "acc_stderr": 0.029719281272236848, + "acc_norm": 0.4574468085106383, + "acc_norm_stderr": 0.029719281272236848 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.44784876140808344, + "acc_stderr": 0.012700582404768217, + "acc_norm": 0.44784876140808344, + "acc_norm_stderr": 0.012700582404768217 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.5551470588235294, + "acc_stderr": 0.030187532060329383, + "acc_norm": 0.5551470588235294, + "acc_norm_stderr": 0.030187532060329383 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.5751633986928104, + "acc_stderr": 0.01999797303545833, + "acc_norm": 0.5751633986928104, + "acc_norm_stderr": 0.01999797303545833 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.6272727272727273, + "acc_stderr": 0.04631381319425465, + "acc_norm": 0.6272727272727273, + "acc_norm_stderr": 0.04631381319425465 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.5346938775510204, + "acc_stderr": 0.03193207024425314, + "acc_norm": 0.5346938775510204, + "acc_norm_stderr": 0.03193207024425314 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.736318407960199, + "acc_stderr": 0.031157150869355554, + "acc_norm": 0.736318407960199, + "acc_norm_stderr": 0.031157150869355554 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.77, + "acc_stderr": 0.04229525846816505, + "acc_norm": 0.77, + "acc_norm_stderr": 0.04229525846816505 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.42168674698795183, + "acc_stderr": 0.03844453181770917, + "acc_norm": 0.42168674698795183, + "acc_norm_stderr": 0.03844453181770917 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.7602339181286549, + "acc_stderr": 0.03274485211946956, + "acc_norm": 0.7602339181286549, + "acc_norm_stderr": 0.03274485211946956 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.27539779681762544, + "mc1_stderr": 0.01563813566777552, + "mc2": 0.40185287585334617, + "mc2_stderr": 0.014323074457881934 + }, + "all": { + "acc": 0.5664208490341656, + "acc_stderr": 0.03443990811908472, + "acc_norm": 0.5706784746136042, + "acc_norm_stderr": 0.034420058031149, + "mc1": 0.27539779681762544, + "mc1_stderr": 0.01563813566777552, + "mc2": 0.40185287585334617, + "mc2_stderr": 0.014323074457881934 + } + }, + "versions": { + "harness|arc:challenge|25": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + 
"harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "all": 0 + }, + "config_tasks": { + "harness|arc:challenge": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + 
"harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task" + }, + "summary_tasks": { + "harness|arc:challenge|25": { + "hashes": { + "hash_examples": "17b0cae357c0259e", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "3722289b79076c44", + "hash_cont_tokens": "e8abf848493b50f7" + }, + "truncated": 0, + "non-truncated": 4687, + "padded": 4687, + "non-padded": 0, + "effective_few_shots": 25.0, + "num_truncated_few_shots": 0 + }, + "harness|hellaswag|10": { + "hashes": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "ececd684171f1ef2", + "hash_cont_tokens": "9fe0a5c42e1532db" + }, + "truncated": 0, + "non-truncated": 40168, + "padded": 40113, + "non-padded": 55, + "effective_few_shots": 10.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hashes": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "c54ff61ad0273dd7", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + 
"non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-anatomy|5": { + "hashes": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "be31a1e22aef5f90", + "hash_cont_tokens": "f11971a765cb609f" + }, + "truncated": 0, + "non-truncated": 540, + "padded": 540, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-astronomy|5": { + "hashes": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "277a7b1fad566940", + "hash_cont_tokens": "440a970fadecdc7b" + }, + "truncated": 0, + "non-truncated": 608, + "padded": 608, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-business_ethics|5": { + "hashes": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "ba552605bc116de5", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hashes": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "428c7563d0b98ab9", + "hash_cont_tokens": "7ecd60c25b9bfe5b" + }, + "truncated": 0, + "non-truncated": 1060, + "padded": 1060, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_biology|5": { + "hashes": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "da036601573942e2", + "hash_cont_tokens": "875cde3af7a0ee14" + }, + "truncated": 0, + "non-truncated": 576, + "padded": 576, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_chemistry|5": { + "hashes": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "94e0196d6aded13d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_computer_science|5": { + "hashes": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "6e4d0f4a8d36690b", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_mathematics|5": { + "hashes": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "614054d17109a25d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_medicine|5": { + "hashes": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "081bb2b524defd1c", + "hash_cont_tokens": "702fb6d82ff0d6ac" + }, + "truncated": 0, + "non-truncated": 692, + "padded": 692, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_physics|5": { + "hashes": { + "hash_examples": 
"875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "5421d9a1af86cbd4", + "hash_cont_tokens": "f7b8097afc16a47c" + }, + "truncated": 0, + "non-truncated": 408, + "padded": 408, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-computer_security|5": { + "hashes": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "5e6b70ecb333cf18", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hashes": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "c2ef11a87264ceed", + "hash_cont_tokens": "aa0e8bc655f2f641" + }, + "truncated": 0, + "non-truncated": 940, + "padded": 940, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-econometrics|5": { + "hashes": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "ecaccd912a4c3978", + "hash_cont_tokens": "b1cc6e7e9fcd3827" + }, + "truncated": 0, + "non-truncated": 456, + "padded": 456, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hashes": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "1590c84291399be8", + "hash_cont_tokens": "2425a3f084a591ef" + }, + "truncated": 0, + "non-truncated": 580, + "padded": 580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hashes": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "3269597f715b0da1", + "hash_cont_tokens": "bd87bf0c060fd925" + }, + "truncated": 0, + "non-truncated": 1512, + "padded": 1512, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-formal_logic|5": { + "hashes": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "a2800d20f3ab8d7c", + "hash_cont_tokens": "eb8932890e0605db" + }, + "truncated": 0, + "non-truncated": 504, + "padded": 504, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-global_facts|5": { + "hashes": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "94ed44b3772505ad", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_biology|5": { + "hashes": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "24423acb928db768", + "hash_cont_tokens": "1ddcb86d28cde266" + }, + "truncated": 0, + "non-truncated": 1240, + "padded": 1240, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hashes": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "831ff35c474e5cef", + "hash_cont_tokens": "176c8dcff38c5f8f" 
+ }, + "truncated": 0, + "non-truncated": 812, + "padded": 812, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hashes": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "a20a96b44dcc5b30", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hashes": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "5002f4ac8b1562ca", + "hash_cont_tokens": "674fc454bdc5ac93" + }, + "truncated": 0, + "non-truncated": 660, + "padded": 656, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_geography|5": { + "hashes": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "7c5547c7da5bc793", + "hash_cont_tokens": "03a5012b916274ea" + }, + "truncated": 0, + "non-truncated": 792, + "padded": 792, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hashes": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "f62991cb6a496b05", + "hash_cont_tokens": "873d2aab226ba1d8" + }, + "truncated": 0, + "non-truncated": 772, + "padded": 772, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hashes": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "4cef2aff6e3d59ed", + "hash_cont_tokens": "c583432ad27fcfe0" + }, + "truncated": 0, + "non-truncated": 1560, + "padded": 1560, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hashes": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "6e2577ea4082ed2b", + "hash_cont_tokens": "d7907b61bcb8c123" + }, + "truncated": 0, + "non-truncated": 1080, + "padded": 1080, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hashes": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "c5fc9aeb1079c8e4", + "hash_cont_tokens": "f47f041de50333b9" + }, + "truncated": 0, + "non-truncated": 952, + "padded": 952, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_physics|5": { + "hashes": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "555fc385cffa84ca", + "hash_cont_tokens": "0d56317b3e5eedb5" + }, + "truncated": 0, + "non-truncated": 604, + "padded": 604, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hashes": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "febd23cbf9973b7f", + "hash_cont_tokens": "09ba1243e7390c0f" + }, + "truncated": 0, + "non-truncated": 2180, + "padded": 2180, + 
"non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hashes": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "400e55b56ee6fbd7", + "hash_cont_tokens": "9cc29889c3d3f77d" + }, + "truncated": 0, + "non-truncated": 864, + "padded": 864, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hashes": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "c639cce12a46ebad", + "hash_cont_tokens": "cdd0b3dc06d933e5" + }, + "truncated": 0, + "non-truncated": 816, + "padded": 816, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hashes": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "b9762065cce6f3a6", + "hash_cont_tokens": "e02816433ff28daf" + }, + "truncated": 0, + "non-truncated": 948, + "padded": 948, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_aging|5": { + "hashes": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "541a75f071dcf579", + "hash_cont_tokens": "142a4a8a1138a214" + }, + "truncated": 0, + "non-truncated": 892, + "padded": 892, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_sexuality|5": { + "hashes": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "04269e5c5a257dd9", + "hash_cont_tokens": "bc54813e809b796d" + }, + "truncated": 0, + "non-truncated": 524, + "padded": 524, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-international_law|5": { + "hashes": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "d93ba9d9d38e4397", + "hash_cont_tokens": "8ea8c5ff76a15bca" + }, + "truncated": 0, + "non-truncated": 484, + "padded": 484, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-jurisprudence|5": { + "hashes": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "9eeaccd2698b4f5a", + "hash_cont_tokens": "e3a8cd951b6e3469" + }, + "truncated": 0, + "non-truncated": 432, + "padded": 432, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hashes": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "b4f08f544f2b7576", + "hash_cont_tokens": "3e9e0bdc248fd88a" + }, + "truncated": 0, + "non-truncated": 652, + "padded": 648, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-machine_learning|5": { + "hashes": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "900c2a51f1174b9f", + "hash_cont_tokens": "55b12fb138c6a064" + }, + "truncated": 0, + "non-truncated": 448, + "padded": 448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-management|5": { + "hashes": { + 
"hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "6b36efb4689c6eca", + "hash_cont_tokens": "a01d6d39a83c4597" + }, + "truncated": 0, + "non-truncated": 412, + "padded": 412, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-marketing|5": { + "hashes": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "2aaac78a0cfed47a", + "hash_cont_tokens": "6aeaed4d823c98aa" + }, + "truncated": 0, + "non-truncated": 936, + "padded": 936, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-medical_genetics|5": { + "hashes": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "886ca823b41c094a", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-miscellaneous|5": { + "hashes": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "72fd71de7675e7d0", + "hash_cont_tokens": "9b0ab02a64603081" + }, + "truncated": 0, + "non-truncated": 3132, + "padded": 3132, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_disputes|5": { + "hashes": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "f3ca0dd8e7a1eb09", + "hash_cont_tokens": "3b8bbe9108e55ce9" + }, + "truncated": 0, + "non-truncated": 1384, + "padded": 1354, + "non-padded": 30, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hashes": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "3e793631e951f23c", + "hash_cont_tokens": "3e9bfc0362e97330" + }, + "truncated": 0, + "non-truncated": 3580, + "padded": 3580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-nutrition|5": { + "hashes": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "59753c2144ea93af", + "hash_cont_tokens": "23b2dc6ee2da4cfc" + }, + "truncated": 0, + "non-truncated": 1224, + "padded": 1224, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-philosophy|5": { + "hashes": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "bd8d3dbed15a8c34", + "hash_cont_tokens": "9f6ff69d23a48783" + }, + "truncated": 0, + "non-truncated": 1244, + "padded": 1244, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-prehistory|5": { + "hashes": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "3573cd87facbb7c5", + "hash_cont_tokens": "d6458d743d875837" + }, + "truncated": 0, + "non-truncated": 1296, + "padded": 1296, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_accounting|5": { + "hashes": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "17e721bc1a7cbb47", + "hash_cont_tokens": "922a195f53a35662" + }, + 
"truncated": 0, + "non-truncated": 1128, + "padded": 1128, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_law|5": { + "hashes": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "c9f7583fff66d361", + "hash_cont_tokens": "2e590029ef41fbcd" + }, + "truncated": 0, + "non-truncated": 6136, + "padded": 6136, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_medicine|5": { + "hashes": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "40a933f829116f8d", + "hash_cont_tokens": "7cfee54dbddd5a98" + }, + "truncated": 0, + "non-truncated": 1088, + "padded": 1088, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_psychology|5": { + "hashes": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "0dfb73a8eb3f692c", + "hash_cont_tokens": "a86677b2a45c20e1" + }, + "truncated": 0, + "non-truncated": 2448, + "padded": 2448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-public_relations|5": { + "hashes": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "1710c6ba4c9f3cbd", + "hash_cont_tokens": "0d756ccaae031757" + }, + "truncated": 0, + "non-truncated": 440, + "padded": 440, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-security_studies|5": { + "hashes": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "32a03f1f22a6e103", + "hash_cont_tokens": "b2229bc2cfbf594b" + }, + "truncated": 0, + "non-truncated": 980, + "padded": 980, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-sociology|5": { + "hashes": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "828999f7624cbe7e", + "hash_cont_tokens": "c3a3bdfd177eed5b" + }, + "truncated": 0, + "non-truncated": 804, + "padded": 804, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hashes": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "42054621e718dbee", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-virology|5": { + "hashes": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "6c4f0aa4dc859c04", + "hash_cont_tokens": "af8b3658088cb37f" + }, + "truncated": 0, + "non-truncated": 664, + "padded": 664, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-world_religions|5": { + "hashes": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "6c75d44e092ff24f", + "hash_cont_tokens": "060118bef6de4e0a" + }, + "truncated": 0, + "non-truncated": 684, + "padded": 684, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + 
"harness|truthfulqa:mc|0": { + "hashes": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "2738d7ed7075faa7", + "hash_cont_tokens": "f5da56a132aab151" + }, + "truncated": 0, + "non-truncated": 9996, + "padded": 9996, + "non-padded": 0, + "effective_few_shots": 0.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "d84d18e9a963753d", + "hash_full_prompts": "12b540783521a8e6", + "hash_input_tokens": "5c73a7dce6ccf737", + "hash_cont_tokens": "71d56183130fecbd" + }, + "total_evaluation_time_secondes": "6364.167732954025", + "truncated": 0, + "non-truncated": 111019, + "padded": 110926, + "non-padded": 93, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/wei123602/Llama-2-13b-FINETUNE4_TEST2/results_2023-10-25T21-06-32.496100.json b/eval-results/wei123602/Llama-2-13b-FINETUNE4_TEST2/results_2023-10-25T21-06-32.496100.json new file mode 100644 index 0000000000000000000000000000000000000000..2bb39def68b54c71dcbaf95de57669ad2675d5fc --- /dev/null +++ b/eval-results/wei123602/Llama-2-13b-FINETUNE4_TEST2/results_2023-10-25T21-06-32.496100.json @@ -0,0 +1,107 @@ +{ + "config_general": { + "model_name": "wei123602/Llama-2-13b-FINETUNE4_TEST2", + "model_sha": "e312c4c59cab9d130c33288c92aad7c0cb5331d5", + "model_size": "24.32 GB", + "model_dtype": "torch.float16", + "lighteval_sha": "0f318ecf002208468154899217b3ba7c6ae09374", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|drop|3": { + "em": 0.15016778523489932, + "em_stderr": 0.0036584290259430103, + "f1": 0.2005201342281873, + "f1_stderr": 0.0036902547918246254 + }, + "harness|gsm8k|5": { + "acc": 0.13191811978771797, + "acc_stderr": 0.009321265253857515 + }, + "harness|winogrande|5": { + "acc": 0.7663772691397001, + "acc_stderr": 0.011892194477183524 + }, + "all": { + "em": 0.15016778523489932, + "em_stderr": 0.0036584290259430103, + "f1": 0.2005201342281873, + "f1_stderr": 0.0036902547918246254, + "acc": 0.449147694463709, + "acc_stderr": 0.010606729865520519 + } + }, + "versions": { + "harness|drop|3": 1, + "harness|gsm8k|5": 0, + "harness|winogrande|5": 0, + "all": 0 + }, + "config_tasks": { + "harness|drop": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|drop|3": { + "hashes": { + "hash_examples": "1d27416e8324e9a3", + "hash_full_prompts": "a5513ff9a741b385", + "hash_input_tokens": "42076f0efbb50aa6", + "hash_cont_tokens": "37f28eb45daf7cc4" + }, + "truncated": 3, + "non-truncated": 9533, + "padded": 0, + "non-padded": 9536, + "effective_few_shots": 3.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "bda342e47b5099b2", + "hash_cont_tokens": "4265b7ca37c98bd5" + }, + "truncated": 0, + "non-truncated": 1319, + "padded": 0, + "non-padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "c0bedf98cb040854", + "hash_cont_tokens": "f08975ad6f2d5864" + }, + "truncated": 0, + "non-truncated": 2534, + "padded": 2432, + "non-padded": 102, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + 
"summary_general": { + "hashes": { + "hash_examples": "9b4d8993161e637d", + "hash_full_prompts": "08215e527b7e60a5", + "hash_input_tokens": "a12f3e3c934bd78b", + "hash_cont_tokens": "d92c93a0e25f0f1d" + }, + "total_evaluation_time_secondes": "11831.553218841553", + "truncated": 3, + "non-truncated": 13386, + "padded": 2432, + "non-padded": 10957, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/wei123602/Llama-2-13b-FINETUNE4_TEST3/results_2023-10-04T02-48-34.144397.json b/eval-results/wei123602/Llama-2-13b-FINETUNE4_TEST3/results_2023-10-04T02-48-34.144397.json new file mode 100644 index 0000000000000000000000000000000000000000..062a25c5dfad3a70007432f40aca0cf807934052 --- /dev/null +++ b/eval-results/wei123602/Llama-2-13b-FINETUNE4_TEST3/results_2023-10-04T02-48-34.144397.json @@ -0,0 +1,1367 @@ +{ + "config_general": { + "model_name": "wei123602/Llama-2-13b-FINETUNE4_TEST3", + "model_sha": "e81b5d4550224711929fdea4effdd990cc0c7404", + "model_size": "24.32 GB", + "model_dtype": "torch.float16", + "lighteval_sha": "0f318ecf002208468154899217b3ba7c6ae09374", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|arc:challenge|25": { + "acc": 0.5358361774744027, + "acc_stderr": 0.01457381366473572, + "acc_norm": 0.590443686006826, + "acc_norm_stderr": 0.014370358632472432 + }, + "harness|hellaswag|10": { + "acc": 0.6083449512049393, + "acc_stderr": 0.004871226629346401, + "acc_norm": 0.8164708225453097, + "acc_norm_stderr": 0.0038630862999845836 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.34, + "acc_stderr": 0.04760952285695235, + "acc_norm": 0.34, + "acc_norm_stderr": 0.04760952285695235 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.5185185185185185, + "acc_stderr": 0.043163785995113245, + "acc_norm": 0.5185185185185185, + "acc_norm_stderr": 0.043163785995113245 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.5986842105263158, + "acc_stderr": 0.039889037033362836, + "acc_norm": 0.5986842105263158, + "acc_norm_stderr": 0.039889037033362836 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.57, + "acc_stderr": 0.04975698519562428, + "acc_norm": 0.57, + "acc_norm_stderr": 0.04975698519562428 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.6452830188679245, + "acc_stderr": 0.02944517532819959, + "acc_norm": 0.6452830188679245, + "acc_norm_stderr": 0.02944517532819959 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.5694444444444444, + "acc_stderr": 0.04140685639111503, + "acc_norm": 0.5694444444444444, + "acc_norm_stderr": 0.04140685639111503 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.45, + "acc_stderr": 0.05, + "acc_norm": 0.45, + "acc_norm_stderr": 0.05 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.4, + "acc_stderr": 0.049236596391733084, + "acc_norm": 0.4, + "acc_norm_stderr": 0.049236596391733084 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.37, + "acc_stderr": 0.04852365870939099, + "acc_norm": 0.37, + "acc_norm_stderr": 0.04852365870939099 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.5549132947976878, + "acc_stderr": 0.03789401760283647, + "acc_norm": 0.5549132947976878, + "acc_norm_stderr": 0.03789401760283647 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.3431372549019608, + "acc_stderr": 0.047240073523838876, + "acc_norm": 0.3431372549019608, + "acc_norm_stderr": 
0.047240073523838876 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.64, + "acc_stderr": 0.04824181513244218, + "acc_norm": 0.64, + "acc_norm_stderr": 0.04824181513244218 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.41702127659574467, + "acc_stderr": 0.03223276266711712, + "acc_norm": 0.41702127659574467, + "acc_norm_stderr": 0.03223276266711712 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.37719298245614036, + "acc_stderr": 0.04559522141958216, + "acc_norm": 0.37719298245614036, + "acc_norm_stderr": 0.04559522141958216 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.43448275862068964, + "acc_stderr": 0.041307408795554966, + "acc_norm": 0.43448275862068964, + "acc_norm_stderr": 0.041307408795554966 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.3544973544973545, + "acc_stderr": 0.024636830602842, + "acc_norm": 0.3544973544973545, + "acc_norm_stderr": 0.024636830602842 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.4126984126984127, + "acc_stderr": 0.04403438954768176, + "acc_norm": 0.4126984126984127, + "acc_norm_stderr": 0.04403438954768176 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.35, + "acc_stderr": 0.047937248544110196, + "acc_norm": 0.35, + "acc_norm_stderr": 0.047937248544110196 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.5774193548387097, + "acc_stderr": 0.02810096472427264, + "acc_norm": 0.5774193548387097, + "acc_norm_stderr": 0.02810096472427264 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.4630541871921182, + "acc_stderr": 0.035083705204426656, + "acc_norm": 0.4630541871921182, + "acc_norm_stderr": 0.035083705204426656 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.62, + "acc_stderr": 0.048783173121456316, + "acc_norm": 0.62, + "acc_norm_stderr": 0.048783173121456316 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.7454545454545455, + "acc_stderr": 0.03401506715249039, + "acc_norm": 0.7454545454545455, + "acc_norm_stderr": 0.03401506715249039 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.7121212121212122, + "acc_stderr": 0.03225883512300992, + "acc_norm": 0.7121212121212122, + "acc_norm_stderr": 0.03225883512300992 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.8238341968911918, + "acc_stderr": 0.027493504244548057, + "acc_norm": 0.8238341968911918, + "acc_norm_stderr": 0.027493504244548057 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.5794871794871795, + "acc_stderr": 0.02502861027671086, + "acc_norm": 0.5794871794871795, + "acc_norm_stderr": 0.02502861027671086 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.31851851851851853, + "acc_stderr": 0.02840653309060846, + "acc_norm": 0.31851851851851853, + "acc_norm_stderr": 0.02840653309060846 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.6428571428571429, + "acc_stderr": 0.031124619309328177, + "acc_norm": 0.6428571428571429, + "acc_norm_stderr": 0.031124619309328177 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.26490066225165565, + "acc_stderr": 0.03603038545360383, + "acc_norm": 0.26490066225165565, + "acc_norm_stderr": 0.03603038545360383 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.7724770642201835, + "acc_stderr": 0.017974463578776502, + "acc_norm": 0.7724770642201835, + "acc_norm_stderr": 0.017974463578776502 + }, + 
"harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.5046296296296297, + "acc_stderr": 0.03409825519163572, + "acc_norm": 0.5046296296296297, + "acc_norm_stderr": 0.03409825519163572 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.75, + "acc_stderr": 0.03039153369274154, + "acc_norm": 0.75, + "acc_norm_stderr": 0.03039153369274154 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.7890295358649789, + "acc_stderr": 0.02655837250266192, + "acc_norm": 0.7890295358649789, + "acc_norm_stderr": 0.02655837250266192 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.6322869955156951, + "acc_stderr": 0.03236198350928276, + "acc_norm": 0.6322869955156951, + "acc_norm_stderr": 0.03236198350928276 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.6030534351145038, + "acc_stderr": 0.04291135671009224, + "acc_norm": 0.6030534351145038, + "acc_norm_stderr": 0.04291135671009224 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.6859504132231405, + "acc_stderr": 0.042369647530410184, + "acc_norm": 0.6859504132231405, + "acc_norm_stderr": 0.042369647530410184 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.7222222222222222, + "acc_stderr": 0.043300437496507416, + "acc_norm": 0.7222222222222222, + "acc_norm_stderr": 0.043300437496507416 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.6441717791411042, + "acc_stderr": 0.03761521380046735, + "acc_norm": 0.6441717791411042, + "acc_norm_stderr": 0.03761521380046735 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.3125, + "acc_stderr": 0.043994650575715215, + "acc_norm": 0.3125, + "acc_norm_stderr": 0.043994650575715215 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.7669902912621359, + "acc_stderr": 0.041858325989283136, + "acc_norm": 0.7669902912621359, + "acc_norm_stderr": 0.041858325989283136 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.7991452991452992, + "acc_stderr": 0.026246772946890477, + "acc_norm": 0.7991452991452992, + "acc_norm_stderr": 0.026246772946890477 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.58, + "acc_stderr": 0.049604496374885836, + "acc_norm": 0.58, + "acc_norm_stderr": 0.049604496374885836 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.7713920817369093, + "acc_stderr": 0.015016884698539883, + "acc_norm": 0.7713920817369093, + "acc_norm_stderr": 0.015016884698539883 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.6271676300578035, + "acc_stderr": 0.02603389061357628, + "acc_norm": 0.6271676300578035, + "acc_norm_stderr": 0.02603389061357628 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.31620111731843575, + "acc_stderr": 0.015551673652172547, + "acc_norm": 0.31620111731843575, + "acc_norm_stderr": 0.015551673652172547 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.6078431372549019, + "acc_stderr": 0.027956046165424513, + "acc_norm": 0.6078431372549019, + "acc_norm_stderr": 0.027956046165424513 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.6463022508038585, + "acc_stderr": 0.027155208103200865, + "acc_norm": 0.6463022508038585, + "acc_norm_stderr": 0.027155208103200865 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.6450617283950617, + "acc_stderr": 0.02662415247884585, + "acc_norm": 0.6450617283950617, + "acc_norm_stderr": 0.02662415247884585 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.5, + "acc_stderr": 0.029827499313594685, + "acc_norm": 0.5, + "acc_norm_stderr": 
0.029827499313594685 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.4621903520208605, + "acc_stderr": 0.012733671880342507, + "acc_norm": 0.4621903520208605, + "acc_norm_stderr": 0.012733671880342507 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.5661764705882353, + "acc_stderr": 0.030105636570016636, + "acc_norm": 0.5661764705882353, + "acc_norm_stderr": 0.030105636570016636 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.5947712418300654, + "acc_stderr": 0.019861155193829153, + "acc_norm": 0.5947712418300654, + "acc_norm_stderr": 0.019861155193829153 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.6272727272727273, + "acc_stderr": 0.04631381319425464, + "acc_norm": 0.6272727272727273, + "acc_norm_stderr": 0.04631381319425464 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.5959183673469388, + "acc_stderr": 0.031414708025865885, + "acc_norm": 0.5959183673469388, + "acc_norm_stderr": 0.031414708025865885 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.6069651741293532, + "acc_stderr": 0.0345368246603156, + "acc_norm": 0.6069651741293532, + "acc_norm_stderr": 0.0345368246603156 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.74, + "acc_stderr": 0.04408440022768079, + "acc_norm": 0.74, + "acc_norm_stderr": 0.04408440022768079 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.41566265060240964, + "acc_stderr": 0.038367221765980515, + "acc_norm": 0.41566265060240964, + "acc_norm_stderr": 0.038367221765980515 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.7543859649122807, + "acc_stderr": 0.03301405946987249, + "acc_norm": 0.7543859649122807, + "acc_norm_stderr": 0.03301405946987249 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.27050183598531213, + "mc1_stderr": 0.015550778332842893, + "mc2": 0.3997802061886997, + "mc2_stderr": 0.014380874604105908 + }, + "all": { + "acc": 0.5640249019461957, + "acc_stderr": 0.034606850502455905, + "acc_norm": 0.5684780100796328, + "acc_norm_stderr": 0.034586314987852085, + "mc1": 0.27050183598531213, + "mc1_stderr": 0.015550778332842893, + "mc2": 0.3997802061886997, + "mc2_stderr": 0.014380874604105908 + } + }, + "versions": { + "harness|arc:challenge|25": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + 
"harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "all": 0 + }, + "config_tasks": { + "harness|arc:challenge": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM 
Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task" + }, + "summary_tasks": { + "harness|arc:challenge|25": { + "hashes": { + "hash_examples": "17b0cae357c0259e", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "3722289b79076c44", + "hash_cont_tokens": "e8abf848493b50f7" + }, + "truncated": 0, + "non-truncated": 4687, + "padded": 4687, + "non-padded": 0, + "effective_few_shots": 25.0, + "num_truncated_few_shots": 0 + }, + "harness|hellaswag|10": { + "hashes": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "ececd684171f1ef2", + "hash_cont_tokens": "9fe0a5c42e1532db" + }, + "truncated": 0, + "non-truncated": 40168, + "padded": 40113, + "non-padded": 55, + "effective_few_shots": 10.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hashes": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "c54ff61ad0273dd7", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-anatomy|5": { + "hashes": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "be31a1e22aef5f90", + "hash_cont_tokens": "f11971a765cb609f" + }, + "truncated": 0, + "non-truncated": 540, + "padded": 540, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-astronomy|5": { + "hashes": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": 
"faf4e80f65de93ca", + "hash_input_tokens": "277a7b1fad566940", + "hash_cont_tokens": "440a970fadecdc7b" + }, + "truncated": 0, + "non-truncated": 608, + "padded": 608, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-business_ethics|5": { + "hashes": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "ba552605bc116de5", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hashes": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "428c7563d0b98ab9", + "hash_cont_tokens": "7ecd60c25b9bfe5b" + }, + "truncated": 0, + "non-truncated": 1060, + "padded": 1060, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_biology|5": { + "hashes": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "da036601573942e2", + "hash_cont_tokens": "875cde3af7a0ee14" + }, + "truncated": 0, + "non-truncated": 576, + "padded": 576, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_chemistry|5": { + "hashes": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "94e0196d6aded13d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_computer_science|5": { + "hashes": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "6e4d0f4a8d36690b", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_mathematics|5": { + "hashes": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "614054d17109a25d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_medicine|5": { + "hashes": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "081bb2b524defd1c", + "hash_cont_tokens": "702fb6d82ff0d6ac" + }, + "truncated": 0, + "non-truncated": 692, + "padded": 692, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_physics|5": { + "hashes": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "5421d9a1af86cbd4", + "hash_cont_tokens": "f7b8097afc16a47c" + }, + "truncated": 0, + "non-truncated": 408, + "padded": 408, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-computer_security|5": { + "hashes": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "5e6b70ecb333cf18", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 
400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hashes": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "c2ef11a87264ceed", + "hash_cont_tokens": "aa0e8bc655f2f641" + }, + "truncated": 0, + "non-truncated": 940, + "padded": 940, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-econometrics|5": { + "hashes": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "ecaccd912a4c3978", + "hash_cont_tokens": "b1cc6e7e9fcd3827" + }, + "truncated": 0, + "non-truncated": 456, + "padded": 456, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hashes": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "1590c84291399be8", + "hash_cont_tokens": "2425a3f084a591ef" + }, + "truncated": 0, + "non-truncated": 580, + "padded": 580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hashes": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "3269597f715b0da1", + "hash_cont_tokens": "bd87bf0c060fd925" + }, + "truncated": 0, + "non-truncated": 1512, + "padded": 1512, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-formal_logic|5": { + "hashes": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "a2800d20f3ab8d7c", + "hash_cont_tokens": "eb8932890e0605db" + }, + "truncated": 0, + "non-truncated": 504, + "padded": 504, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-global_facts|5": { + "hashes": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "94ed44b3772505ad", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_biology|5": { + "hashes": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "24423acb928db768", + "hash_cont_tokens": "1ddcb86d28cde266" + }, + "truncated": 0, + "non-truncated": 1240, + "padded": 1240, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hashes": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "831ff35c474e5cef", + "hash_cont_tokens": "176c8dcff38c5f8f" + }, + "truncated": 0, + "non-truncated": 812, + "padded": 812, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hashes": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "a20a96b44dcc5b30", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + 
"harness|hendrycksTest-high_school_european_history|5": { + "hashes": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "5002f4ac8b1562ca", + "hash_cont_tokens": "674fc454bdc5ac93" + }, + "truncated": 0, + "non-truncated": 660, + "padded": 656, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_geography|5": { + "hashes": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "7c5547c7da5bc793", + "hash_cont_tokens": "03a5012b916274ea" + }, + "truncated": 0, + "non-truncated": 792, + "padded": 792, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hashes": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "f62991cb6a496b05", + "hash_cont_tokens": "873d2aab226ba1d8" + }, + "truncated": 0, + "non-truncated": 772, + "padded": 772, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hashes": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "4cef2aff6e3d59ed", + "hash_cont_tokens": "c583432ad27fcfe0" + }, + "truncated": 0, + "non-truncated": 1560, + "padded": 1560, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hashes": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "6e2577ea4082ed2b", + "hash_cont_tokens": "d7907b61bcb8c123" + }, + "truncated": 0, + "non-truncated": 1080, + "padded": 1080, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hashes": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "c5fc9aeb1079c8e4", + "hash_cont_tokens": "f47f041de50333b9" + }, + "truncated": 0, + "non-truncated": 952, + "padded": 952, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_physics|5": { + "hashes": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "555fc385cffa84ca", + "hash_cont_tokens": "0d56317b3e5eedb5" + }, + "truncated": 0, + "non-truncated": 604, + "padded": 604, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hashes": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "febd23cbf9973b7f", + "hash_cont_tokens": "09ba1243e7390c0f" + }, + "truncated": 0, + "non-truncated": 2180, + "padded": 2180, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hashes": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "400e55b56ee6fbd7", + "hash_cont_tokens": "9cc29889c3d3f77d" + }, + "truncated": 0, + "non-truncated": 864, + "padded": 864, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hashes": { + 
"hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "c639cce12a46ebad", + "hash_cont_tokens": "cdd0b3dc06d933e5" + }, + "truncated": 0, + "non-truncated": 816, + "padded": 816, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hashes": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "b9762065cce6f3a6", + "hash_cont_tokens": "e02816433ff28daf" + }, + "truncated": 0, + "non-truncated": 948, + "padded": 948, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_aging|5": { + "hashes": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "541a75f071dcf579", + "hash_cont_tokens": "142a4a8a1138a214" + }, + "truncated": 0, + "non-truncated": 892, + "padded": 892, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_sexuality|5": { + "hashes": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "04269e5c5a257dd9", + "hash_cont_tokens": "bc54813e809b796d" + }, + "truncated": 0, + "non-truncated": 524, + "padded": 524, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-international_law|5": { + "hashes": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "d93ba9d9d38e4397", + "hash_cont_tokens": "8ea8c5ff76a15bca" + }, + "truncated": 0, + "non-truncated": 484, + "padded": 484, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-jurisprudence|5": { + "hashes": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "9eeaccd2698b4f5a", + "hash_cont_tokens": "e3a8cd951b6e3469" + }, + "truncated": 0, + "non-truncated": 432, + "padded": 432, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hashes": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "b4f08f544f2b7576", + "hash_cont_tokens": "3e9e0bdc248fd88a" + }, + "truncated": 0, + "non-truncated": 652, + "padded": 648, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-machine_learning|5": { + "hashes": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "900c2a51f1174b9f", + "hash_cont_tokens": "55b12fb138c6a064" + }, + "truncated": 0, + "non-truncated": 448, + "padded": 448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-management|5": { + "hashes": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "6b36efb4689c6eca", + "hash_cont_tokens": "a01d6d39a83c4597" + }, + "truncated": 0, + "non-truncated": 412, + "padded": 412, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-marketing|5": { + "hashes": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "2aaac78a0cfed47a", + "hash_cont_tokens": "6aeaed4d823c98aa" + }, + 
"truncated": 0, + "non-truncated": 936, + "padded": 936, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-medical_genetics|5": { + "hashes": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "886ca823b41c094a", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-miscellaneous|5": { + "hashes": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "72fd71de7675e7d0", + "hash_cont_tokens": "9b0ab02a64603081" + }, + "truncated": 0, + "non-truncated": 3132, + "padded": 3132, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_disputes|5": { + "hashes": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "f3ca0dd8e7a1eb09", + "hash_cont_tokens": "3b8bbe9108e55ce9" + }, + "truncated": 0, + "non-truncated": 1384, + "padded": 1354, + "non-padded": 30, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hashes": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "3e793631e951f23c", + "hash_cont_tokens": "3e9bfc0362e97330" + }, + "truncated": 0, + "non-truncated": 3580, + "padded": 3580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-nutrition|5": { + "hashes": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "59753c2144ea93af", + "hash_cont_tokens": "23b2dc6ee2da4cfc" + }, + "truncated": 0, + "non-truncated": 1224, + "padded": 1224, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-philosophy|5": { + "hashes": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "bd8d3dbed15a8c34", + "hash_cont_tokens": "9f6ff69d23a48783" + }, + "truncated": 0, + "non-truncated": 1244, + "padded": 1244, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-prehistory|5": { + "hashes": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "3573cd87facbb7c5", + "hash_cont_tokens": "d6458d743d875837" + }, + "truncated": 0, + "non-truncated": 1296, + "padded": 1296, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_accounting|5": { + "hashes": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "17e721bc1a7cbb47", + "hash_cont_tokens": "922a195f53a35662" + }, + "truncated": 0, + "non-truncated": 1128, + "padded": 1128, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_law|5": { + "hashes": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "c9f7583fff66d361", + "hash_cont_tokens": "2e590029ef41fbcd" + }, + "truncated": 0, + "non-truncated": 6136, + "padded": 6136, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + 
"harness|hendrycksTest-professional_medicine|5": { + "hashes": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "40a933f829116f8d", + "hash_cont_tokens": "7cfee54dbddd5a98" + }, + "truncated": 0, + "non-truncated": 1088, + "padded": 1088, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_psychology|5": { + "hashes": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "0dfb73a8eb3f692c", + "hash_cont_tokens": "a86677b2a45c20e1" + }, + "truncated": 0, + "non-truncated": 2448, + "padded": 2448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-public_relations|5": { + "hashes": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "1710c6ba4c9f3cbd", + "hash_cont_tokens": "0d756ccaae031757" + }, + "truncated": 0, + "non-truncated": 440, + "padded": 440, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-security_studies|5": { + "hashes": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "32a03f1f22a6e103", + "hash_cont_tokens": "b2229bc2cfbf594b" + }, + "truncated": 0, + "non-truncated": 980, + "padded": 980, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-sociology|5": { + "hashes": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "828999f7624cbe7e", + "hash_cont_tokens": "c3a3bdfd177eed5b" + }, + "truncated": 0, + "non-truncated": 804, + "padded": 804, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hashes": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "42054621e718dbee", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-virology|5": { + "hashes": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "6c4f0aa4dc859c04", + "hash_cont_tokens": "af8b3658088cb37f" + }, + "truncated": 0, + "non-truncated": 664, + "padded": 664, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-world_religions|5": { + "hashes": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "6c75d44e092ff24f", + "hash_cont_tokens": "060118bef6de4e0a" + }, + "truncated": 0, + "non-truncated": 684, + "padded": 684, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|truthfulqa:mc|0": { + "hashes": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "2738d7ed7075faa7", + "hash_cont_tokens": "f5da56a132aab151" + }, + "truncated": 0, + "non-truncated": 9996, + "padded": 9996, + "non-padded": 0, + "effective_few_shots": 0.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "d84d18e9a963753d", + "hash_full_prompts": "12b540783521a8e6", + "hash_input_tokens": "5c73a7dce6ccf737", + 
"hash_cont_tokens": "71d56183130fecbd" + }, + "total_evaluation_time_secondes": "6337.671374797821", + "truncated": 0, + "non-truncated": 111019, + "padded": 110926, + "non-padded": 93, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/wei123602/Llama-2-13b-FINETUNE4_TEST3/results_2023-10-26T13-48-49.333400.json b/eval-results/wei123602/Llama-2-13b-FINETUNE4_TEST3/results_2023-10-26T13-48-49.333400.json new file mode 100644 index 0000000000000000000000000000000000000000..c747393d32f0213bde68a1c2b09621d8ccd13bc9 --- /dev/null +++ b/eval-results/wei123602/Llama-2-13b-FINETUNE4_TEST3/results_2023-10-26T13-48-49.333400.json @@ -0,0 +1,107 @@ +{ + "config_general": { + "model_name": "wei123602/Llama-2-13b-FINETUNE4_TEST3", + "model_sha": "e81b5d4550224711929fdea4effdd990cc0c7404", + "model_size": "24.32 GB", + "model_dtype": "torch.float16", + "lighteval_sha": "0f318ecf002208468154899217b3ba7c6ae09374", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|drop|3": { + "em": 0.25010486577181207, + "em_stderr": 0.004435075216390866, + "f1": 0.29184563758389254, + "f1_stderr": 0.004391827411197271 + }, + "harness|gsm8k|5": { + "acc": 0.11220621683093253, + "acc_stderr": 0.008693743138242373 + }, + "harness|winogrande|5": { + "acc": 0.7545382794001578, + "acc_stderr": 0.012095272937183642 + }, + "all": { + "em": 0.25010486577181207, + "em_stderr": 0.004435075216390866, + "f1": 0.29184563758389254, + "f1_stderr": 0.004391827411197271, + "acc": 0.4333722481155452, + "acc_stderr": 0.010394508037713007 + } + }, + "versions": { + "harness|drop|3": 1, + "harness|gsm8k|5": 0, + "harness|winogrande|5": 0, + "all": 0 + }, + "config_tasks": { + "harness|drop": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|drop|3": { + "hashes": { + "hash_examples": "1d27416e8324e9a3", + "hash_full_prompts": "a5513ff9a741b385", + "hash_input_tokens": "42076f0efbb50aa6", + "hash_cont_tokens": "61313f09c56ce196" + }, + "truncated": 3, + "non-truncated": 9533, + "padded": 0, + "non-padded": 9536, + "effective_few_shots": 3.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "bda342e47b5099b2", + "hash_cont_tokens": "385baf0a674c0f57" + }, + "truncated": 0, + "non-truncated": 1319, + "padded": 0, + "non-padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "c0bedf98cb040854", + "hash_cont_tokens": "f08975ad6f2d5864" + }, + "truncated": 0, + "non-truncated": 2534, + "padded": 2432, + "non-padded": 102, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "9b4d8993161e637d", + "hash_full_prompts": "08215e527b7e60a5", + "hash_input_tokens": "a12f3e3c934bd78b", + "hash_cont_tokens": "cc30b52b5cab3300" + }, + "total_evaluation_time_secondes": "12148.807684659958", + "truncated": 3, + "non-truncated": 13386, + "padded": 2432, + "non-padded": 10957, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/wei123602/Llama-2-13b-FINETUNE4_compare8k2/results_2023-10-03T21-01-32.366658.json 
b/eval-results/wei123602/Llama-2-13b-FINETUNE4_compare8k2/results_2023-10-03T21-01-32.366658.json new file mode 100644 index 0000000000000000000000000000000000000000..ecf9f162ad77f4a705e849864a8b08ed66576cfc --- /dev/null +++ b/eval-results/wei123602/Llama-2-13b-FINETUNE4_compare8k2/results_2023-10-03T21-01-32.366658.json @@ -0,0 +1,1367 @@ +{ + "config_general": { + "model_name": "wei123602/Llama-2-13b-FINETUNE4_compare8k2", + "model_sha": "fe1b604097aad9408ce63fa7ffc9c320cdd06e4f", + "model_size": "24.32 GB", + "model_dtype": "torch.float16", + "lighteval_sha": "0f318ecf002208468154899217b3ba7c6ae09374", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|arc:challenge|25": { + "acc": 0.5392491467576792, + "acc_stderr": 0.014566303676636586, + "acc_norm": 0.5827645051194539, + "acc_norm_stderr": 0.014409825518403077 + }, + "harness|hellaswag|10": { + "acc": 0.6060545708026289, + "acc_stderr": 0.004876243842318609, + "acc_norm": 0.8138816968731328, + "acc_norm_stderr": 0.0038840668811314745 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.3, + "acc_stderr": 0.046056618647183814, + "acc_norm": 0.3, + "acc_norm_stderr": 0.046056618647183814 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.4444444444444444, + "acc_stderr": 0.04292596718256981, + "acc_norm": 0.4444444444444444, + "acc_norm_stderr": 0.04292596718256981 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.5789473684210527, + "acc_stderr": 0.04017901275981749, + "acc_norm": 0.5789473684210527, + "acc_norm_stderr": 0.04017901275981749 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.53, + "acc_stderr": 0.05016135580465919, + "acc_norm": 0.53, + "acc_norm_stderr": 0.05016135580465919 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.6150943396226415, + "acc_stderr": 0.02994649856769995, + "acc_norm": 0.6150943396226415, + "acc_norm_stderr": 0.02994649856769995 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.6458333333333334, + "acc_stderr": 0.039994111357535424, + "acc_norm": 0.6458333333333334, + "acc_norm_stderr": 0.039994111357535424 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.42, + "acc_stderr": 0.049604496374885836, + "acc_norm": 0.42, + "acc_norm_stderr": 0.049604496374885836 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.4, + "acc_stderr": 0.04923659639173309, + "acc_norm": 0.4, + "acc_norm_stderr": 0.04923659639173309 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.33, + "acc_stderr": 0.04725815626252605, + "acc_norm": 0.33, + "acc_norm_stderr": 0.04725815626252605 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.5260115606936416, + "acc_stderr": 0.038073017265045125, + "acc_norm": 0.5260115606936416, + "acc_norm_stderr": 0.038073017265045125 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.39215686274509803, + "acc_stderr": 0.04858083574266345, + "acc_norm": 0.39215686274509803, + "acc_norm_stderr": 0.04858083574266345 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.69, + "acc_stderr": 0.04648231987117316, + "acc_norm": 0.69, + "acc_norm_stderr": 0.04648231987117316 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.46382978723404256, + "acc_stderr": 0.03260038511835771, + "acc_norm": 0.46382978723404256, + "acc_norm_stderr": 0.03260038511835771 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.32456140350877194, + 
"acc_stderr": 0.044045561573747664, + "acc_norm": 0.32456140350877194, + "acc_norm_stderr": 0.044045561573747664 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.47586206896551725, + "acc_stderr": 0.041618085035015295, + "acc_norm": 0.47586206896551725, + "acc_norm_stderr": 0.041618085035015295 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.3148148148148148, + "acc_stderr": 0.023919984164047732, + "acc_norm": 0.3148148148148148, + "acc_norm_stderr": 0.023919984164047732 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.373015873015873, + "acc_stderr": 0.04325506042017086, + "acc_norm": 0.373015873015873, + "acc_norm_stderr": 0.04325506042017086 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.42, + "acc_stderr": 0.049604496374885836, + "acc_norm": 0.42, + "acc_norm_stderr": 0.049604496374885836 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.5903225806451613, + "acc_stderr": 0.027976054915347364, + "acc_norm": 0.5903225806451613, + "acc_norm_stderr": 0.027976054915347364 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.42857142857142855, + "acc_stderr": 0.03481904844438804, + "acc_norm": 0.42857142857142855, + "acc_norm_stderr": 0.03481904844438804 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.59, + "acc_stderr": 0.04943110704237101, + "acc_norm": 0.59, + "acc_norm_stderr": 0.04943110704237101 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.7212121212121212, + "acc_stderr": 0.03501438706296781, + "acc_norm": 0.7212121212121212, + "acc_norm_stderr": 0.03501438706296781 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.702020202020202, + "acc_stderr": 0.03258630383836556, + "acc_norm": 0.702020202020202, + "acc_norm_stderr": 0.03258630383836556 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.8082901554404145, + "acc_stderr": 0.02840895362624528, + "acc_norm": 0.8082901554404145, + "acc_norm_stderr": 0.02840895362624528 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.5974358974358974, + "acc_stderr": 0.024864995159767755, + "acc_norm": 0.5974358974358974, + "acc_norm_stderr": 0.024864995159767755 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.3111111111111111, + "acc_stderr": 0.028226446749683515, + "acc_norm": 0.3111111111111111, + "acc_norm_stderr": 0.028226446749683515 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.6134453781512605, + "acc_stderr": 0.03163145807552378, + "acc_norm": 0.6134453781512605, + "acc_norm_stderr": 0.03163145807552378 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.31788079470198677, + "acc_stderr": 0.03802039760107903, + "acc_norm": 0.31788079470198677, + "acc_norm_stderr": 0.03802039760107903 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.7963302752293578, + "acc_stderr": 0.017266742087630793, + "acc_norm": 0.7963302752293578, + "acc_norm_stderr": 0.017266742087630793 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.46296296296296297, + "acc_stderr": 0.03400603625538271, + "acc_norm": 0.46296296296296297, + "acc_norm_stderr": 0.03400603625538271 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.8137254901960784, + "acc_stderr": 0.027325470966716323, + "acc_norm": 0.8137254901960784, + "acc_norm_stderr": 0.027325470966716323 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 
0.7426160337552743, + "acc_stderr": 0.028458820991460302, + "acc_norm": 0.7426160337552743, + "acc_norm_stderr": 0.028458820991460302 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.6412556053811659, + "acc_stderr": 0.03219079200419995, + "acc_norm": 0.6412556053811659, + "acc_norm_stderr": 0.03219079200419995 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.6335877862595419, + "acc_stderr": 0.04225875451969637, + "acc_norm": 0.6335877862595419, + "acc_norm_stderr": 0.04225875451969637 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.6776859504132231, + "acc_stderr": 0.042664163633521685, + "acc_norm": 0.6776859504132231, + "acc_norm_stderr": 0.042664163633521685 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.6944444444444444, + "acc_stderr": 0.044531975073749834, + "acc_norm": 0.6944444444444444, + "acc_norm_stderr": 0.044531975073749834 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.6871165644171779, + "acc_stderr": 0.03642914578292406, + "acc_norm": 0.6871165644171779, + "acc_norm_stderr": 0.03642914578292406 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.3392857142857143, + "acc_stderr": 0.044939490686135404, + "acc_norm": 0.3392857142857143, + "acc_norm_stderr": 0.044939490686135404 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.7766990291262136, + "acc_stderr": 0.04123553189891431, + "acc_norm": 0.7766990291262136, + "acc_norm_stderr": 0.04123553189891431 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.811965811965812, + "acc_stderr": 0.025598193686652244, + "acc_norm": 0.811965811965812, + "acc_norm_stderr": 0.025598193686652244 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.61, + "acc_stderr": 0.04902071300001975, + "acc_norm": 0.61, + "acc_norm_stderr": 0.04902071300001975 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.7675606641123882, + "acc_stderr": 0.015104550008905706, + "acc_norm": 0.7675606641123882, + "acc_norm_stderr": 0.015104550008905706 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.630057803468208, + "acc_stderr": 0.025992472029306376, + "acc_norm": 0.630057803468208, + "acc_norm_stderr": 0.025992472029306376 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.2782122905027933, + "acc_stderr": 0.014987325439963554, + "acc_norm": 0.2782122905027933, + "acc_norm_stderr": 0.014987325439963554 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.630718954248366, + "acc_stderr": 0.02763417668960266, + "acc_norm": 0.630718954248366, + "acc_norm_stderr": 0.02763417668960266 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.662379421221865, + "acc_stderr": 0.02685882587948854, + "acc_norm": 0.662379421221865, + "acc_norm_stderr": 0.02685882587948854 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.6419753086419753, + "acc_stderr": 0.026675611926037082, + "acc_norm": 0.6419753086419753, + "acc_norm_stderr": 0.026675611926037082 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.44680851063829785, + "acc_stderr": 0.029658235097666904, + "acc_norm": 0.44680851063829785, + "acc_norm_stderr": 0.029658235097666904 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.44784876140808344, + "acc_stderr": 0.01270058240476822, + "acc_norm": 0.44784876140808344, + "acc_norm_stderr": 0.01270058240476822 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.6176470588235294, + "acc_stderr": 0.02952009569768776, + "acc_norm": 0.6176470588235294, + 
"acc_norm_stderr": 0.02952009569768776 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.6045751633986928, + "acc_stderr": 0.019780465954777515, + "acc_norm": 0.6045751633986928, + "acc_norm_stderr": 0.019780465954777515 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.6, + "acc_stderr": 0.0469237132203465, + "acc_norm": 0.6, + "acc_norm_stderr": 0.0469237132203465 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.6489795918367347, + "acc_stderr": 0.03055531675557364, + "acc_norm": 0.6489795918367347, + "acc_norm_stderr": 0.03055531675557364 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.736318407960199, + "acc_stderr": 0.03115715086935559, + "acc_norm": 0.736318407960199, + "acc_norm_stderr": 0.03115715086935559 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.87, + "acc_stderr": 0.033799766898963086, + "acc_norm": 0.87, + "acc_norm_stderr": 0.033799766898963086 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.4578313253012048, + "acc_stderr": 0.038786267710023595, + "acc_norm": 0.4578313253012048, + "acc_norm_stderr": 0.038786267710023595 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.7602339181286549, + "acc_stderr": 0.03274485211946956, + "acc_norm": 0.7602339181286549, + "acc_norm_stderr": 0.03274485211946956 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.2778457772337821, + "mc1_stderr": 0.015680929364024654, + "mc2": 0.3985894900243793, + "mc2_stderr": 0.014339806253558348 + }, + "all": { + "acc": 0.5687964083352047, + "acc_stderr": 0.03428422878368393, + "acc_norm": 0.5730564504442265, + "acc_norm_stderr": 0.034264760052846295, + "mc1": 0.2778457772337821, + "mc1_stderr": 0.015680929364024654, + "mc2": 0.3985894900243793, + "mc2_stderr": 0.014339806253558348 + } + }, + "versions": { + "harness|arc:challenge|25": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + 
"harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "all": 0 + }, + "config_tasks": { + "harness|arc:challenge": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + 
"harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task" + }, + "summary_tasks": { + "harness|arc:challenge|25": { + "hashes": { + "hash_examples": "17b0cae357c0259e", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "3722289b79076c44", + "hash_cont_tokens": "e8abf848493b50f7" + }, + "truncated": 0, + "non-truncated": 4687, + "padded": 4687, + "non-padded": 0, + "effective_few_shots": 25.0, + "num_truncated_few_shots": 0 + }, + "harness|hellaswag|10": { + "hashes": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "ececd684171f1ef2", + "hash_cont_tokens": "9fe0a5c42e1532db" + }, + "truncated": 0, + "non-truncated": 40168, + "padded": 40113, + "non-padded": 55, + "effective_few_shots": 10.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hashes": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "c54ff61ad0273dd7", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-anatomy|5": { + "hashes": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "be31a1e22aef5f90", + "hash_cont_tokens": "f11971a765cb609f" + }, + "truncated": 0, + "non-truncated": 540, + "padded": 540, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-astronomy|5": { + "hashes": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "277a7b1fad566940", + "hash_cont_tokens": "440a970fadecdc7b" + }, + "truncated": 0, + "non-truncated": 608, + "padded": 608, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-business_ethics|5": { + "hashes": { + "hash_examples": "33e51740670de686", + 
"hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "ba552605bc116de5", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hashes": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "428c7563d0b98ab9", + "hash_cont_tokens": "7ecd60c25b9bfe5b" + }, + "truncated": 0, + "non-truncated": 1060, + "padded": 1060, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_biology|5": { + "hashes": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "da036601573942e2", + "hash_cont_tokens": "875cde3af7a0ee14" + }, + "truncated": 0, + "non-truncated": 576, + "padded": 576, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_chemistry|5": { + "hashes": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "94e0196d6aded13d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_computer_science|5": { + "hashes": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "6e4d0f4a8d36690b", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_mathematics|5": { + "hashes": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "614054d17109a25d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_medicine|5": { + "hashes": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "081bb2b524defd1c", + "hash_cont_tokens": "702fb6d82ff0d6ac" + }, + "truncated": 0, + "non-truncated": 692, + "padded": 692, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_physics|5": { + "hashes": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "5421d9a1af86cbd4", + "hash_cont_tokens": "f7b8097afc16a47c" + }, + "truncated": 0, + "non-truncated": 408, + "padded": 408, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-computer_security|5": { + "hashes": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "5e6b70ecb333cf18", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hashes": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "c2ef11a87264ceed", + "hash_cont_tokens": "aa0e8bc655f2f641" + }, + "truncated": 
0, + "non-truncated": 940, + "padded": 940, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-econometrics|5": { + "hashes": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "ecaccd912a4c3978", + "hash_cont_tokens": "b1cc6e7e9fcd3827" + }, + "truncated": 0, + "non-truncated": 456, + "padded": 456, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hashes": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "1590c84291399be8", + "hash_cont_tokens": "2425a3f084a591ef" + }, + "truncated": 0, + "non-truncated": 580, + "padded": 580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hashes": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "3269597f715b0da1", + "hash_cont_tokens": "bd87bf0c060fd925" + }, + "truncated": 0, + "non-truncated": 1512, + "padded": 1512, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-formal_logic|5": { + "hashes": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "a2800d20f3ab8d7c", + "hash_cont_tokens": "eb8932890e0605db" + }, + "truncated": 0, + "non-truncated": 504, + "padded": 504, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-global_facts|5": { + "hashes": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "94ed44b3772505ad", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_biology|5": { + "hashes": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "24423acb928db768", + "hash_cont_tokens": "1ddcb86d28cde266" + }, + "truncated": 0, + "non-truncated": 1240, + "padded": 1240, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hashes": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "831ff35c474e5cef", + "hash_cont_tokens": "176c8dcff38c5f8f" + }, + "truncated": 0, + "non-truncated": 812, + "padded": 812, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hashes": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "a20a96b44dcc5b30", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hashes": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "5002f4ac8b1562ca", + "hash_cont_tokens": "674fc454bdc5ac93" + }, + "truncated": 0, + "non-truncated": 660, + "padded": 656, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + 
}, + "harness|hendrycksTest-high_school_geography|5": { + "hashes": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "7c5547c7da5bc793", + "hash_cont_tokens": "03a5012b916274ea" + }, + "truncated": 0, + "non-truncated": 792, + "padded": 792, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hashes": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "f62991cb6a496b05", + "hash_cont_tokens": "873d2aab226ba1d8" + }, + "truncated": 0, + "non-truncated": 772, + "padded": 772, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hashes": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "4cef2aff6e3d59ed", + "hash_cont_tokens": "c583432ad27fcfe0" + }, + "truncated": 0, + "non-truncated": 1560, + "padded": 1560, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hashes": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "6e2577ea4082ed2b", + "hash_cont_tokens": "d7907b61bcb8c123" + }, + "truncated": 0, + "non-truncated": 1080, + "padded": 1080, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hashes": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "c5fc9aeb1079c8e4", + "hash_cont_tokens": "f47f041de50333b9" + }, + "truncated": 0, + "non-truncated": 952, + "padded": 952, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_physics|5": { + "hashes": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "555fc385cffa84ca", + "hash_cont_tokens": "0d56317b3e5eedb5" + }, + "truncated": 0, + "non-truncated": 604, + "padded": 604, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hashes": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "febd23cbf9973b7f", + "hash_cont_tokens": "09ba1243e7390c0f" + }, + "truncated": 0, + "non-truncated": 2180, + "padded": 2180, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hashes": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "400e55b56ee6fbd7", + "hash_cont_tokens": "9cc29889c3d3f77d" + }, + "truncated": 0, + "non-truncated": 864, + "padded": 864, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hashes": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "c639cce12a46ebad", + "hash_cont_tokens": "cdd0b3dc06d933e5" + }, + "truncated": 0, + "non-truncated": 816, + "padded": 816, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hashes": { + 
"hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "b9762065cce6f3a6", + "hash_cont_tokens": "e02816433ff28daf" + }, + "truncated": 0, + "non-truncated": 948, + "padded": 948, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_aging|5": { + "hashes": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "541a75f071dcf579", + "hash_cont_tokens": "142a4a8a1138a214" + }, + "truncated": 0, + "non-truncated": 892, + "padded": 892, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_sexuality|5": { + "hashes": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "04269e5c5a257dd9", + "hash_cont_tokens": "bc54813e809b796d" + }, + "truncated": 0, + "non-truncated": 524, + "padded": 524, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-international_law|5": { + "hashes": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "d93ba9d9d38e4397", + "hash_cont_tokens": "8ea8c5ff76a15bca" + }, + "truncated": 0, + "non-truncated": 484, + "padded": 484, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-jurisprudence|5": { + "hashes": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "9eeaccd2698b4f5a", + "hash_cont_tokens": "e3a8cd951b6e3469" + }, + "truncated": 0, + "non-truncated": 432, + "padded": 432, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hashes": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "b4f08f544f2b7576", + "hash_cont_tokens": "3e9e0bdc248fd88a" + }, + "truncated": 0, + "non-truncated": 652, + "padded": 648, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-machine_learning|5": { + "hashes": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "900c2a51f1174b9f", + "hash_cont_tokens": "55b12fb138c6a064" + }, + "truncated": 0, + "non-truncated": 448, + "padded": 448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-management|5": { + "hashes": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "6b36efb4689c6eca", + "hash_cont_tokens": "a01d6d39a83c4597" + }, + "truncated": 0, + "non-truncated": 412, + "padded": 412, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-marketing|5": { + "hashes": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "2aaac78a0cfed47a", + "hash_cont_tokens": "6aeaed4d823c98aa" + }, + "truncated": 0, + "non-truncated": 936, + "padded": 936, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-medical_genetics|5": { + "hashes": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "886ca823b41c094a", + "hash_cont_tokens": "50421e30bef398f9" + }, + 
"truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-miscellaneous|5": { + "hashes": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "72fd71de7675e7d0", + "hash_cont_tokens": "9b0ab02a64603081" + }, + "truncated": 0, + "non-truncated": 3132, + "padded": 3132, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_disputes|5": { + "hashes": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "f3ca0dd8e7a1eb09", + "hash_cont_tokens": "3b8bbe9108e55ce9" + }, + "truncated": 0, + "non-truncated": 1384, + "padded": 1354, + "non-padded": 30, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hashes": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "3e793631e951f23c", + "hash_cont_tokens": "3e9bfc0362e97330" + }, + "truncated": 0, + "non-truncated": 3580, + "padded": 3580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-nutrition|5": { + "hashes": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "59753c2144ea93af", + "hash_cont_tokens": "23b2dc6ee2da4cfc" + }, + "truncated": 0, + "non-truncated": 1224, + "padded": 1224, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-philosophy|5": { + "hashes": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "bd8d3dbed15a8c34", + "hash_cont_tokens": "9f6ff69d23a48783" + }, + "truncated": 0, + "non-truncated": 1244, + "padded": 1244, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-prehistory|5": { + "hashes": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "3573cd87facbb7c5", + "hash_cont_tokens": "d6458d743d875837" + }, + "truncated": 0, + "non-truncated": 1296, + "padded": 1296, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_accounting|5": { + "hashes": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "17e721bc1a7cbb47", + "hash_cont_tokens": "922a195f53a35662" + }, + "truncated": 0, + "non-truncated": 1128, + "padded": 1128, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_law|5": { + "hashes": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "c9f7583fff66d361", + "hash_cont_tokens": "2e590029ef41fbcd" + }, + "truncated": 0, + "non-truncated": 6136, + "padded": 6136, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_medicine|5": { + "hashes": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "40a933f829116f8d", + "hash_cont_tokens": "7cfee54dbddd5a98" + }, + "truncated": 0, + "non-truncated": 1088, + "padded": 1088, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + 
"harness|hendrycksTest-professional_psychology|5": { + "hashes": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "0dfb73a8eb3f692c", + "hash_cont_tokens": "a86677b2a45c20e1" + }, + "truncated": 0, + "non-truncated": 2448, + "padded": 2448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-public_relations|5": { + "hashes": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "1710c6ba4c9f3cbd", + "hash_cont_tokens": "0d756ccaae031757" + }, + "truncated": 0, + "non-truncated": 440, + "padded": 440, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-security_studies|5": { + "hashes": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "32a03f1f22a6e103", + "hash_cont_tokens": "b2229bc2cfbf594b" + }, + "truncated": 0, + "non-truncated": 980, + "padded": 980, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-sociology|5": { + "hashes": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "828999f7624cbe7e", + "hash_cont_tokens": "c3a3bdfd177eed5b" + }, + "truncated": 0, + "non-truncated": 804, + "padded": 804, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hashes": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "42054621e718dbee", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-virology|5": { + "hashes": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "6c4f0aa4dc859c04", + "hash_cont_tokens": "af8b3658088cb37f" + }, + "truncated": 0, + "non-truncated": 664, + "padded": 664, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-world_religions|5": { + "hashes": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "6c75d44e092ff24f", + "hash_cont_tokens": "060118bef6de4e0a" + }, + "truncated": 0, + "non-truncated": 684, + "padded": 684, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|truthfulqa:mc|0": { + "hashes": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "2738d7ed7075faa7", + "hash_cont_tokens": "f5da56a132aab151" + }, + "truncated": 0, + "non-truncated": 9996, + "padded": 9996, + "non-padded": 0, + "effective_few_shots": 0.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "d84d18e9a963753d", + "hash_full_prompts": "12b540783521a8e6", + "hash_input_tokens": "5c73a7dce6ccf737", + "hash_cont_tokens": "71d56183130fecbd" + }, + "total_evaluation_time_secondes": "6350.806956291199", + "truncated": 0, + "non-truncated": 111019, + "padded": 110926, + "non-padded": 93, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/wei123602/Llama-2-13b-FINETUNE4_compare8k2/results_2023-10-25T20-52-42.973065.json 
b/eval-results/wei123602/Llama-2-13b-FINETUNE4_compare8k2/results_2023-10-25T20-52-42.973065.json new file mode 100644 index 0000000000000000000000000000000000000000..e73b7554d0cbbf6c02739aaee245bf51d96a3066 --- /dev/null +++ b/eval-results/wei123602/Llama-2-13b-FINETUNE4_compare8k2/results_2023-10-25T20-52-42.973065.json @@ -0,0 +1,107 @@ +{ + "config_general": { + "model_name": "wei123602/Llama-2-13b-FINETUNE4_compare8k2", + "model_sha": "fe1b604097aad9408ce63fa7ffc9c320cdd06e4f", + "model_size": "24.32 GB", + "model_dtype": "torch.float16", + "lighteval_sha": "0f318ecf002208468154899217b3ba7c6ae09374", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|drop|3": { + "em": 0.1950503355704698, + "em_stderr": 0.004057862846679778, + "f1": 0.24364408557046902, + "f1_stderr": 0.0040320043717616385 + }, + "harness|gsm8k|5": { + "acc": 0.11902956785443518, + "acc_stderr": 0.008919702911161618 + }, + "harness|winogrande|5": { + "acc": 0.7600631412786109, + "acc_stderr": 0.012002078629485742 + }, + "all": { + "em": 0.1950503355704698, + "em_stderr": 0.004057862846679778, + "f1": 0.24364408557046902, + "f1_stderr": 0.0040320043717616385, + "acc": 0.439546354566523, + "acc_stderr": 0.01046089077032368 + } + }, + "versions": { + "harness|drop|3": 1, + "harness|gsm8k|5": 0, + "harness|winogrande|5": 0, + "all": 0 + }, + "config_tasks": { + "harness|drop": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|drop|3": { + "hashes": { + "hash_examples": "1d27416e8324e9a3", + "hash_full_prompts": "a5513ff9a741b385", + "hash_input_tokens": "42076f0efbb50aa6", + "hash_cont_tokens": "8f57a63bf2347dfb" + }, + "truncated": 3, + "non-truncated": 9533, + "padded": 0, + "non-padded": 9536, + "effective_few_shots": 3.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "bda342e47b5099b2", + "hash_cont_tokens": "7b145a0b68f1850d" + }, + "truncated": 0, + "non-truncated": 1319, + "padded": 0, + "non-padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "c0bedf98cb040854", + "hash_cont_tokens": "f08975ad6f2d5864" + }, + "truncated": 0, + "non-truncated": 2534, + "padded": 2432, + "non-padded": 102, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "9b4d8993161e637d", + "hash_full_prompts": "08215e527b7e60a5", + "hash_input_tokens": "a12f3e3c934bd78b", + "hash_cont_tokens": "d7dac9d8239029f2" + }, + "total_evaluation_time_secondes": "12640.03461766243", + "truncated": 3, + "non-truncated": 13386, + "padded": 2432, + "non-padded": 10957, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/wei123602/llama-13b-FINETUNE3/results_2023-09-13T02-24-38.254919.json b/eval-results/wei123602/llama-13b-FINETUNE3/results_2023-09-13T02-24-38.254919.json new file mode 100644 index 0000000000000000000000000000000000000000..5cc0a73fb1b145a71c053c903f79c61e9b7d41f6 --- /dev/null +++ b/eval-results/wei123602/llama-13b-FINETUNE3/results_2023-09-13T02-24-38.254919.json @@ -0,0 +1,1367 @@ +{ + "config_general": { + "model_name": 
"wei123602/llama-13b-FINETUNE3", + "model_sha": "bacd035db122dafaf86bf52bb9ca8c613070cc58", + "model_size": "24.32 GB", + "model_dtype": "torch.float16", + "lighteval_sha": "ff467795ccc45b291b69333c263d5f16abd1fcd9", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|arc:challenge|25": { + "acc": 0.5520477815699659, + "acc_stderr": 0.014532011498211676, + "acc_norm": 0.5930034129692833, + "acc_norm_stderr": 0.014356399418009121 + }, + "harness|hellaswag|10": { + "acc": 0.6093407687711612, + "acc_stderr": 0.004869010152280755, + "acc_norm": 0.8152758414658434, + "acc_norm_stderr": 0.00387280518960755 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.31, + "acc_stderr": 0.04648231987117316, + "acc_norm": 0.31, + "acc_norm_stderr": 0.04648231987117316 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.4444444444444444, + "acc_stderr": 0.04292596718256981, + "acc_norm": 0.4444444444444444, + "acc_norm_stderr": 0.04292596718256981 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.5921052631578947, + "acc_stderr": 0.039993097127774734, + "acc_norm": 0.5921052631578947, + "acc_norm_stderr": 0.039993097127774734 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.53, + "acc_stderr": 0.05016135580465919, + "acc_norm": 0.53, + "acc_norm_stderr": 0.05016135580465919 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.5849056603773585, + "acc_stderr": 0.030325945789286112, + "acc_norm": 0.5849056603773585, + "acc_norm_stderr": 0.030325945789286112 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.6458333333333334, + "acc_stderr": 0.039994111357535424, + "acc_norm": 0.6458333333333334, + "acc_norm_stderr": 0.039994111357535424 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.49, + "acc_stderr": 0.05024183937956912, + "acc_norm": 0.49, + "acc_norm_stderr": 0.05024183937956912 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.45, + "acc_stderr": 0.05, + "acc_norm": 0.45, + "acc_norm_stderr": 0.05 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.31, + "acc_stderr": 0.04648231987117316, + "acc_norm": 0.31, + "acc_norm_stderr": 0.04648231987117316 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.5433526011560693, + "acc_stderr": 0.03798106566014498, + "acc_norm": 0.5433526011560693, + "acc_norm_stderr": 0.03798106566014498 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.39215686274509803, + "acc_stderr": 0.04858083574266344, + "acc_norm": 0.39215686274509803, + "acc_norm_stderr": 0.04858083574266344 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.69, + "acc_stderr": 0.04648231987117316, + "acc_norm": 0.69, + "acc_norm_stderr": 0.04648231987117316 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.46808510638297873, + "acc_stderr": 0.03261936918467382, + "acc_norm": 0.46808510638297873, + "acc_norm_stderr": 0.03261936918467382 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.30701754385964913, + "acc_stderr": 0.04339138322579861, + "acc_norm": 0.30701754385964913, + "acc_norm_stderr": 0.04339138322579861 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.503448275862069, + "acc_stderr": 0.04166567577101579, + "acc_norm": 0.503448275862069, + "acc_norm_stderr": 0.04166567577101579 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.3306878306878307, + "acc_stderr": 
0.024229965298425082, + "acc_norm": 0.3306878306878307, + "acc_norm_stderr": 0.024229965298425082 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.40476190476190477, + "acc_stderr": 0.04390259265377562, + "acc_norm": 0.40476190476190477, + "acc_norm_stderr": 0.04390259265377562 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.38, + "acc_stderr": 0.04878317312145633, + "acc_norm": 0.38, + "acc_norm_stderr": 0.04878317312145633 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.6548387096774193, + "acc_stderr": 0.02704574657353433, + "acc_norm": 0.6548387096774193, + "acc_norm_stderr": 0.02704574657353433 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.43349753694581283, + "acc_stderr": 0.03486731727419872, + "acc_norm": 0.43349753694581283, + "acc_norm_stderr": 0.03486731727419872 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.57, + "acc_stderr": 0.049756985195624284, + "acc_norm": 0.57, + "acc_norm_stderr": 0.049756985195624284 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.7212121212121212, + "acc_stderr": 0.035014387062967806, + "acc_norm": 0.7212121212121212, + "acc_norm_stderr": 0.035014387062967806 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.7373737373737373, + "acc_stderr": 0.03135305009533085, + "acc_norm": 0.7373737373737373, + "acc_norm_stderr": 0.03135305009533085 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.8238341968911918, + "acc_stderr": 0.02749350424454805, + "acc_norm": 0.8238341968911918, + "acc_norm_stderr": 0.02749350424454805 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.5974358974358974, + "acc_stderr": 0.024864995159767755, + "acc_norm": 0.5974358974358974, + "acc_norm_stderr": 0.024864995159767755 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.34814814814814815, + "acc_stderr": 0.029045600290616255, + "acc_norm": 0.34814814814814815, + "acc_norm_stderr": 0.029045600290616255 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.6386554621848739, + "acc_stderr": 0.03120469122515001, + "acc_norm": 0.6386554621848739, + "acc_norm_stderr": 0.03120469122515001 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.2980132450331126, + "acc_stderr": 0.037345356767871984, + "acc_norm": 0.2980132450331126, + "acc_norm_stderr": 0.037345356767871984 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.7871559633027523, + "acc_stderr": 0.017549376389313694, + "acc_norm": 0.7871559633027523, + "acc_norm_stderr": 0.017549376389313694 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.4861111111111111, + "acc_stderr": 0.03408655867977748, + "acc_norm": 0.4861111111111111, + "acc_norm_stderr": 0.03408655867977748 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.8088235294117647, + "acc_stderr": 0.027599174300640766, + "acc_norm": 0.8088235294117647, + "acc_norm_stderr": 0.027599174300640766 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.7848101265822784, + "acc_stderr": 0.026750826994676166, + "acc_norm": 0.7848101265822784, + "acc_norm_stderr": 0.026750826994676166 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.6547085201793722, + "acc_stderr": 0.03191100192835794, + "acc_norm": 0.6547085201793722, + "acc_norm_stderr": 0.03191100192835794 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.6564885496183206, + 
"acc_stderr": 0.041649760719448786, + "acc_norm": 0.6564885496183206, + "acc_norm_stderr": 0.041649760719448786 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.7355371900826446, + "acc_stderr": 0.04026187527591207, + "acc_norm": 0.7355371900826446, + "acc_norm_stderr": 0.04026187527591207 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.7407407407407407, + "acc_stderr": 0.04236511258094633, + "acc_norm": 0.7407407407407407, + "acc_norm_stderr": 0.04236511258094633 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.6625766871165644, + "acc_stderr": 0.03714908409935574, + "acc_norm": 0.6625766871165644, + "acc_norm_stderr": 0.03714908409935574 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.32142857142857145, + "acc_stderr": 0.04432804055291517, + "acc_norm": 0.32142857142857145, + "acc_norm_stderr": 0.04432804055291517 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.7184466019417476, + "acc_stderr": 0.04453254836326467, + "acc_norm": 0.7184466019417476, + "acc_norm_stderr": 0.04453254836326467 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.7863247863247863, + "acc_stderr": 0.026853450377009157, + "acc_norm": 0.7863247863247863, + "acc_norm_stderr": 0.026853450377009157 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.59, + "acc_stderr": 0.04943110704237102, + "acc_norm": 0.59, + "acc_norm_stderr": 0.04943110704237102 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.768837803320562, + "acc_stderr": 0.015075523238101083, + "acc_norm": 0.768837803320562, + "acc_norm_stderr": 0.015075523238101083 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.638728323699422, + "acc_stderr": 0.025862201852277895, + "acc_norm": 0.638728323699422, + "acc_norm_stderr": 0.025862201852277895 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.3474860335195531, + "acc_stderr": 0.01592556406020815, + "acc_norm": 0.3474860335195531, + "acc_norm_stderr": 0.01592556406020815 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.6078431372549019, + "acc_stderr": 0.027956046165424516, + "acc_norm": 0.6078431372549019, + "acc_norm_stderr": 0.027956046165424516 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.6463022508038585, + "acc_stderr": 0.027155208103200858, + "acc_norm": 0.6463022508038585, + "acc_norm_stderr": 0.027155208103200858 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.6820987654320988, + "acc_stderr": 0.02591006352824089, + "acc_norm": 0.6820987654320988, + "acc_norm_stderr": 0.02591006352824089 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.5177304964539007, + "acc_stderr": 0.02980873964223777, + "acc_norm": 0.5177304964539007, + "acc_norm_stderr": 0.02980873964223777 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.4485006518904824, + "acc_stderr": 0.0127023174905598, + "acc_norm": 0.4485006518904824, + "acc_norm_stderr": 0.0127023174905598 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.5588235294117647, + "acc_stderr": 0.030161911930767105, + "acc_norm": 0.5588235294117647, + "acc_norm_stderr": 0.030161911930767105 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.5947712418300654, + "acc_stderr": 0.019861155193829156, + "acc_norm": 0.5947712418300654, + "acc_norm_stderr": 0.019861155193829156 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.6363636363636364, + "acc_stderr": 0.04607582090719976, + "acc_norm": 0.6363636363636364, + "acc_norm_stderr": 
0.04607582090719976 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.5795918367346938, + "acc_stderr": 0.03160106993449601, + "acc_norm": 0.5795918367346938, + "acc_norm_stderr": 0.03160106993449601 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.736318407960199, + "acc_stderr": 0.031157150869355586, + "acc_norm": 0.736318407960199, + "acc_norm_stderr": 0.031157150869355586 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.81, + "acc_stderr": 0.03942772444036624, + "acc_norm": 0.81, + "acc_norm_stderr": 0.03942772444036624 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.42771084337349397, + "acc_stderr": 0.038515976837185335, + "acc_norm": 0.42771084337349397, + "acc_norm_stderr": 0.038515976837185335 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.8187134502923976, + "acc_stderr": 0.029547741687640038, + "acc_norm": 0.8187134502923976, + "acc_norm_stderr": 0.029547741687640038 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.2802937576499388, + "mc1_stderr": 0.015723139524608767, + "mc2": 0.41626548348007847, + "mc2_stderr": 0.014474677192828984 + }, + "all": { + "acc": 0.574816427427013, + "acc_stderr": 0.034285561451492357, + "acc_norm": 0.5790010156319961, + "acc_norm_stderr": 0.03426570014568091, + "mc1": 0.2802937576499388, + "mc1_stderr": 0.015723139524608767, + "mc2": 0.41626548348007847, + "mc2_stderr": 0.014474677192828984 + } + }, + "versions": { + "harness|arc:challenge|25": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + 
"harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "all": 0 + }, + "config_tasks": { + "harness|arc:challenge": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + 
"harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task" + }, + "summary_tasks": { + "harness|arc:challenge|25": { + "hashes": { + "hash_examples": "17b0cae357c0259e", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "3722289b79076c44", + "hash_cont_tokens": "e8abf848493b50f7" + }, + "truncated": 0, + "non-truncated": 4687, + "padded": 4687, + "non-padded": 0, + "effective_few_shots": 25.0, + "num_truncated_few_shots": 0 + }, + "harness|hellaswag|10": { + "hashes": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "ececd684171f1ef2", + "hash_cont_tokens": "9fe0a5c42e1532db" + }, + "truncated": 0, + "non-truncated": 40168, + "padded": 40113, + "non-padded": 55, + "effective_few_shots": 10.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hashes": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "c54ff61ad0273dd7", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-anatomy|5": { + "hashes": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "be31a1e22aef5f90", + "hash_cont_tokens": "f11971a765cb609f" + }, + "truncated": 0, + "non-truncated": 540, + "padded": 540, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-astronomy|5": { + "hashes": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "277a7b1fad566940", + "hash_cont_tokens": "440a970fadecdc7b" + }, + "truncated": 0, + "non-truncated": 608, + "padded": 608, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-business_ethics|5": { + "hashes": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "ba552605bc116de5", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hashes": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + 
"hash_input_tokens": "428c7563d0b98ab9", + "hash_cont_tokens": "7ecd60c25b9bfe5b" + }, + "truncated": 0, + "non-truncated": 1060, + "padded": 1060, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_biology|5": { + "hashes": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "da036601573942e2", + "hash_cont_tokens": "875cde3af7a0ee14" + }, + "truncated": 0, + "non-truncated": 576, + "padded": 576, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_chemistry|5": { + "hashes": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "94e0196d6aded13d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_computer_science|5": { + "hashes": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "6e4d0f4a8d36690b", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_mathematics|5": { + "hashes": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "614054d17109a25d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_medicine|5": { + "hashes": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "081bb2b524defd1c", + "hash_cont_tokens": "702fb6d82ff0d6ac" + }, + "truncated": 0, + "non-truncated": 692, + "padded": 692, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_physics|5": { + "hashes": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "5421d9a1af86cbd4", + "hash_cont_tokens": "f7b8097afc16a47c" + }, + "truncated": 0, + "non-truncated": 408, + "padded": 408, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-computer_security|5": { + "hashes": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "5e6b70ecb333cf18", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hashes": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "c2ef11a87264ceed", + "hash_cont_tokens": "aa0e8bc655f2f641" + }, + "truncated": 0, + "non-truncated": 940, + "padded": 940, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-econometrics|5": { + "hashes": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "ecaccd912a4c3978", + "hash_cont_tokens": "b1cc6e7e9fcd3827" + }, + "truncated": 0, + "non-truncated": 456, + "padded": 456, + 
"non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hashes": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "1590c84291399be8", + "hash_cont_tokens": "2425a3f084a591ef" + }, + "truncated": 0, + "non-truncated": 580, + "padded": 580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hashes": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "3269597f715b0da1", + "hash_cont_tokens": "bd87bf0c060fd925" + }, + "truncated": 0, + "non-truncated": 1512, + "padded": 1512, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-formal_logic|5": { + "hashes": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "a2800d20f3ab8d7c", + "hash_cont_tokens": "eb8932890e0605db" + }, + "truncated": 0, + "non-truncated": 504, + "padded": 504, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-global_facts|5": { + "hashes": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "94ed44b3772505ad", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_biology|5": { + "hashes": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "24423acb928db768", + "hash_cont_tokens": "1ddcb86d28cde266" + }, + "truncated": 0, + "non-truncated": 1240, + "padded": 1240, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hashes": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "831ff35c474e5cef", + "hash_cont_tokens": "176c8dcff38c5f8f" + }, + "truncated": 0, + "non-truncated": 812, + "padded": 812, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hashes": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "a20a96b44dcc5b30", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hashes": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "5002f4ac8b1562ca", + "hash_cont_tokens": "674fc454bdc5ac93" + }, + "truncated": 0, + "non-truncated": 660, + "padded": 656, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_geography|5": { + "hashes": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "7c5547c7da5bc793", + "hash_cont_tokens": "03a5012b916274ea" + }, + "truncated": 0, + "non-truncated": 792, + "padded": 792, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + 
"harness|hendrycksTest-high_school_government_and_politics|5": { + "hashes": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "f62991cb6a496b05", + "hash_cont_tokens": "873d2aab226ba1d8" + }, + "truncated": 0, + "non-truncated": 772, + "padded": 772, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hashes": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "4cef2aff6e3d59ed", + "hash_cont_tokens": "c583432ad27fcfe0" + }, + "truncated": 0, + "non-truncated": 1560, + "padded": 1560, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hashes": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "6e2577ea4082ed2b", + "hash_cont_tokens": "d7907b61bcb8c123" + }, + "truncated": 0, + "non-truncated": 1080, + "padded": 1080, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hashes": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "c5fc9aeb1079c8e4", + "hash_cont_tokens": "f47f041de50333b9" + }, + "truncated": 0, + "non-truncated": 952, + "padded": 952, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_physics|5": { + "hashes": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "555fc385cffa84ca", + "hash_cont_tokens": "0d56317b3e5eedb5" + }, + "truncated": 0, + "non-truncated": 604, + "padded": 604, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hashes": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "febd23cbf9973b7f", + "hash_cont_tokens": "09ba1243e7390c0f" + }, + "truncated": 0, + "non-truncated": 2180, + "padded": 2180, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hashes": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "400e55b56ee6fbd7", + "hash_cont_tokens": "9cc29889c3d3f77d" + }, + "truncated": 0, + "non-truncated": 864, + "padded": 864, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hashes": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "c639cce12a46ebad", + "hash_cont_tokens": "cdd0b3dc06d933e5" + }, + "truncated": 0, + "non-truncated": 816, + "padded": 816, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hashes": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "b9762065cce6f3a6", + "hash_cont_tokens": "e02816433ff28daf" + }, + "truncated": 0, + "non-truncated": 948, + "padded": 948, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_aging|5": { + "hashes": { + "hash_examples": 
"1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "541a75f071dcf579", + "hash_cont_tokens": "142a4a8a1138a214" + }, + "truncated": 0, + "non-truncated": 892, + "padded": 892, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_sexuality|5": { + "hashes": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "04269e5c5a257dd9", + "hash_cont_tokens": "bc54813e809b796d" + }, + "truncated": 0, + "non-truncated": 524, + "padded": 524, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-international_law|5": { + "hashes": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "d93ba9d9d38e4397", + "hash_cont_tokens": "8ea8c5ff76a15bca" + }, + "truncated": 0, + "non-truncated": 484, + "padded": 484, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-jurisprudence|5": { + "hashes": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "9eeaccd2698b4f5a", + "hash_cont_tokens": "e3a8cd951b6e3469" + }, + "truncated": 0, + "non-truncated": 432, + "padded": 432, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hashes": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "b4f08f544f2b7576", + "hash_cont_tokens": "3e9e0bdc248fd88a" + }, + "truncated": 0, + "non-truncated": 652, + "padded": 648, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-machine_learning|5": { + "hashes": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "900c2a51f1174b9f", + "hash_cont_tokens": "55b12fb138c6a064" + }, + "truncated": 0, + "non-truncated": 448, + "padded": 448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-management|5": { + "hashes": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "6b36efb4689c6eca", + "hash_cont_tokens": "a01d6d39a83c4597" + }, + "truncated": 0, + "non-truncated": 412, + "padded": 412, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-marketing|5": { + "hashes": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "2aaac78a0cfed47a", + "hash_cont_tokens": "6aeaed4d823c98aa" + }, + "truncated": 0, + "non-truncated": 936, + "padded": 936, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-medical_genetics|5": { + "hashes": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "886ca823b41c094a", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-miscellaneous|5": { + "hashes": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "72fd71de7675e7d0", + "hash_cont_tokens": "9b0ab02a64603081" + }, + "truncated": 0, + 
"non-truncated": 3132, + "padded": 3132, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_disputes|5": { + "hashes": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "f3ca0dd8e7a1eb09", + "hash_cont_tokens": "3b8bbe9108e55ce9" + }, + "truncated": 0, + "non-truncated": 1384, + "padded": 1354, + "non-padded": 30, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hashes": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "3e793631e951f23c", + "hash_cont_tokens": "3e9bfc0362e97330" + }, + "truncated": 0, + "non-truncated": 3580, + "padded": 3580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-nutrition|5": { + "hashes": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "59753c2144ea93af", + "hash_cont_tokens": "23b2dc6ee2da4cfc" + }, + "truncated": 0, + "non-truncated": 1224, + "padded": 1224, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-philosophy|5": { + "hashes": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "bd8d3dbed15a8c34", + "hash_cont_tokens": "9f6ff69d23a48783" + }, + "truncated": 0, + "non-truncated": 1244, + "padded": 1244, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-prehistory|5": { + "hashes": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "3573cd87facbb7c5", + "hash_cont_tokens": "d6458d743d875837" + }, + "truncated": 0, + "non-truncated": 1296, + "padded": 1296, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_accounting|5": { + "hashes": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "17e721bc1a7cbb47", + "hash_cont_tokens": "922a195f53a35662" + }, + "truncated": 0, + "non-truncated": 1128, + "padded": 1128, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_law|5": { + "hashes": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "c9f7583fff66d361", + "hash_cont_tokens": "2e590029ef41fbcd" + }, + "truncated": 0, + "non-truncated": 6136, + "padded": 6136, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_medicine|5": { + "hashes": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "40a933f829116f8d", + "hash_cont_tokens": "7cfee54dbddd5a98" + }, + "truncated": 0, + "non-truncated": 1088, + "padded": 1088, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_psychology|5": { + "hashes": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "0dfb73a8eb3f692c", + "hash_cont_tokens": "a86677b2a45c20e1" + }, + "truncated": 0, + "non-truncated": 2448, + "padded": 2448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + 
"harness|hendrycksTest-public_relations|5": { + "hashes": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "1710c6ba4c9f3cbd", + "hash_cont_tokens": "0d756ccaae031757" + }, + "truncated": 0, + "non-truncated": 440, + "padded": 440, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-security_studies|5": { + "hashes": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "32a03f1f22a6e103", + "hash_cont_tokens": "b2229bc2cfbf594b" + }, + "truncated": 0, + "non-truncated": 980, + "padded": 980, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-sociology|5": { + "hashes": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "828999f7624cbe7e", + "hash_cont_tokens": "c3a3bdfd177eed5b" + }, + "truncated": 0, + "non-truncated": 804, + "padded": 804, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hashes": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "42054621e718dbee", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-virology|5": { + "hashes": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "6c4f0aa4dc859c04", + "hash_cont_tokens": "af8b3658088cb37f" + }, + "truncated": 0, + "non-truncated": 664, + "padded": 664, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-world_religions|5": { + "hashes": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "6c75d44e092ff24f", + "hash_cont_tokens": "060118bef6de4e0a" + }, + "truncated": 0, + "non-truncated": 684, + "padded": 684, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|truthfulqa:mc|0": { + "hashes": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "2738d7ed7075faa7", + "hash_cont_tokens": "f5da56a132aab151" + }, + "truncated": 0, + "non-truncated": 9996, + "padded": 9996, + "non-padded": 0, + "effective_few_shots": 0.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "d84d18e9a963753d", + "hash_full_prompts": "12b540783521a8e6", + "hash_input_tokens": "5c73a7dce6ccf737", + "hash_cont_tokens": "71d56183130fecbd" + }, + "total_evaluation_time_secondes": "6360.256312131882", + "truncated": 0, + "non-truncated": 111019, + "padded": 110926, + "non-padded": 93, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/wei123602/llama-13b-FINETUNE3/results_2023-10-25T20-56-48.132337.json b/eval-results/wei123602/llama-13b-FINETUNE3/results_2023-10-25T20-56-48.132337.json new file mode 100644 index 0000000000000000000000000000000000000000..0a21d8b0ba1c9f792f3c918dde110d965e10c352 --- /dev/null +++ b/eval-results/wei123602/llama-13b-FINETUNE3/results_2023-10-25T20-56-48.132337.json @@ -0,0 +1,107 @@ +{ + "config_general": { + "model_name": "wei123602/llama-13b-FINETUNE3", + "model_sha": 
"bacd035db122dafaf86bf52bb9ca8c613070cc58", + "model_size": "24.32 GB", + "model_dtype": "torch.float16", + "lighteval_sha": "0f318ecf002208468154899217b3ba7c6ae09374", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|drop|3": { + "em": 0.10360738255033557, + "em_stderr": 0.003120930790921416, + "f1": 0.14798552852348912, + "f1_stderr": 0.003214007613815376 + }, + "harness|gsm8k|5": { + "acc": 0.12130401819560273, + "acc_stderr": 0.00899288849727557 + }, + "harness|winogrande|5": { + "acc": 0.7671665351223362, + "acc_stderr": 0.01187820107385654 + }, + "all": { + "em": 0.10360738255033557, + "em_stderr": 0.003120930790921416, + "f1": 0.14798552852348912, + "f1_stderr": 0.003214007613815376, + "acc": 0.4442352766589695, + "acc_stderr": 0.010435544785566055 + } + }, + "versions": { + "harness|drop|3": 1, + "harness|gsm8k|5": 0, + "harness|winogrande|5": 0, + "all": 0 + }, + "config_tasks": { + "harness|drop": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|drop|3": { + "hashes": { + "hash_examples": "1d27416e8324e9a3", + "hash_full_prompts": "a5513ff9a741b385", + "hash_input_tokens": "42076f0efbb50aa6", + "hash_cont_tokens": "7dc0b5e0f9b1dd6c" + }, + "truncated": 3, + "non-truncated": 9533, + "padded": 0, + "non-padded": 9536, + "effective_few_shots": 3.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "bda342e47b5099b2", + "hash_cont_tokens": "3a0a4eb97c62ddb8" + }, + "truncated": 0, + "non-truncated": 1319, + "padded": 0, + "non-padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "c0bedf98cb040854", + "hash_cont_tokens": "f08975ad6f2d5864" + }, + "truncated": 0, + "non-truncated": 2534, + "padded": 2432, + "non-padded": 102, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "9b4d8993161e637d", + "hash_full_prompts": "08215e527b7e60a5", + "hash_input_tokens": "a12f3e3c934bd78b", + "hash_cont_tokens": "caa0c36544a374a9" + }, + "total_evaluation_time_secondes": "12346.43645477295", + "truncated": 3, + "non-truncated": 13386, + "padded": 2432, + "non-padded": 10957, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/wei123602/llama2-13b-FINETUNE3_TEST/results_2023-09-14T13-48-26.265439.json b/eval-results/wei123602/llama2-13b-FINETUNE3_TEST/results_2023-09-14T13-48-26.265439.json new file mode 100644 index 0000000000000000000000000000000000000000..8aab3c61485557aa8cfa947d5ba64cd842f44bd9 --- /dev/null +++ b/eval-results/wei123602/llama2-13b-FINETUNE3_TEST/results_2023-09-14T13-48-26.265439.json @@ -0,0 +1,1367 @@ +{ + "config_general": { + "model_name": "wei123602/llama2-13b-FINETUNE3_TEST", + "model_sha": "22cea7bf138eb0d6c962812df2b2235290acbee2", + "model_size": "24.32 GB", + "model_dtype": "torch.float16", + "lighteval_sha": "ff467795ccc45b291b69333c263d5f16abd1fcd9", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|arc:challenge|25": { + "acc": 0.47952218430034127, + "acc_stderr": 
0.014599131353035004, + "acc_norm": 0.5366894197952219, + "acc_norm_stderr": 0.014572000527756989 + }, + "harness|hellaswag|10": { + "acc": 0.5889265086636128, + "acc_stderr": 0.004910229643262737, + "acc_norm": 0.7965544712208723, + "acc_norm_stderr": 0.004017383866405767 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.25, + "acc_stderr": 0.04351941398892446, + "acc_norm": 0.25, + "acc_norm_stderr": 0.04351941398892446 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.45925925925925926, + "acc_stderr": 0.04304979692464243, + "acc_norm": 0.45925925925925926, + "acc_norm_stderr": 0.04304979692464243 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.5526315789473685, + "acc_stderr": 0.0404633688397825, + "acc_norm": 0.5526315789473685, + "acc_norm_stderr": 0.0404633688397825 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.57, + "acc_stderr": 0.04975698519562428, + "acc_norm": 0.57, + "acc_norm_stderr": 0.04975698519562428 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.5962264150943396, + "acc_stderr": 0.030197611600197946, + "acc_norm": 0.5962264150943396, + "acc_norm_stderr": 0.030197611600197946 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.5625, + "acc_stderr": 0.04148415739394154, + "acc_norm": 0.5625, + "acc_norm_stderr": 0.04148415739394154 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.49, + "acc_stderr": 0.05024183937956912, + "acc_norm": 0.49, + "acc_norm_stderr": 0.05024183937956912 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.39, + "acc_stderr": 0.04902071300001974, + "acc_norm": 0.39, + "acc_norm_stderr": 0.04902071300001974 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.33, + "acc_stderr": 0.04725815626252605, + "acc_norm": 0.33, + "acc_norm_stderr": 0.04725815626252605 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.5433526011560693, + "acc_stderr": 0.03798106566014498, + "acc_norm": 0.5433526011560693, + "acc_norm_stderr": 0.03798106566014498 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.3627450980392157, + "acc_stderr": 0.04784060704105654, + "acc_norm": 0.3627450980392157, + "acc_norm_stderr": 0.04784060704105654 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.62, + "acc_stderr": 0.048783173121456316, + "acc_norm": 0.62, + "acc_norm_stderr": 0.048783173121456316 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.4765957446808511, + "acc_stderr": 0.03265019475033582, + "acc_norm": 0.4765957446808511, + "acc_norm_stderr": 0.03265019475033582 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.30701754385964913, + "acc_stderr": 0.043391383225798615, + "acc_norm": 0.30701754385964913, + "acc_norm_stderr": 0.043391383225798615 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.3931034482758621, + "acc_stderr": 0.040703290137070705, + "acc_norm": 0.3931034482758621, + "acc_norm_stderr": 0.040703290137070705 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.3201058201058201, + "acc_stderr": 0.024026846392873506, + "acc_norm": 0.3201058201058201, + "acc_norm_stderr": 0.024026846392873506 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.4523809523809524, + "acc_stderr": 0.044518079590553275, + "acc_norm": 0.4523809523809524, + "acc_norm_stderr": 0.044518079590553275 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.33, + "acc_stderr": 0.04725815626252606, + "acc_norm": 0.33, + "acc_norm_stderr": 
0.04725815626252606 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.6225806451612903, + "acc_stderr": 0.027575960723278246, + "acc_norm": 0.6225806451612903, + "acc_norm_stderr": 0.027575960723278246 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.46798029556650245, + "acc_stderr": 0.035107665979592154, + "acc_norm": 0.46798029556650245, + "acc_norm_stderr": 0.035107665979592154 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.57, + "acc_stderr": 0.049756985195624284, + "acc_norm": 0.57, + "acc_norm_stderr": 0.049756985195624284 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.6666666666666666, + "acc_stderr": 0.0368105086916155, + "acc_norm": 0.6666666666666666, + "acc_norm_stderr": 0.0368105086916155 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.6767676767676768, + "acc_stderr": 0.03332299921070644, + "acc_norm": 0.6767676767676768, + "acc_norm_stderr": 0.03332299921070644 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.8238341968911918, + "acc_stderr": 0.027493504244548047, + "acc_norm": 0.8238341968911918, + "acc_norm_stderr": 0.027493504244548047 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.5692307692307692, + "acc_stderr": 0.02510682066053975, + "acc_norm": 0.5692307692307692, + "acc_norm_stderr": 0.02510682066053975 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.28888888888888886, + "acc_stderr": 0.02763490726417854, + "acc_norm": 0.28888888888888886, + "acc_norm_stderr": 0.02763490726417854 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.6008403361344538, + "acc_stderr": 0.03181110032413926, + "acc_norm": 0.6008403361344538, + "acc_norm_stderr": 0.03181110032413926 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.33774834437086093, + "acc_stderr": 0.038615575462551684, + "acc_norm": 0.33774834437086093, + "acc_norm_stderr": 0.038615575462551684 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.7889908256880734, + "acc_stderr": 0.01749392240411265, + "acc_norm": 0.7889908256880734, + "acc_norm_stderr": 0.01749392240411265 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.46296296296296297, + "acc_stderr": 0.03400603625538271, + "acc_norm": 0.46296296296296297, + "acc_norm_stderr": 0.03400603625538271 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.7598039215686274, + "acc_stderr": 0.02998373305591362, + "acc_norm": 0.7598039215686274, + "acc_norm_stderr": 0.02998373305591362 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.7510548523206751, + "acc_stderr": 0.028146970599422644, + "acc_norm": 0.7510548523206751, + "acc_norm_stderr": 0.028146970599422644 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.5650224215246636, + "acc_stderr": 0.033272833702713445, + "acc_norm": 0.5650224215246636, + "acc_norm_stderr": 0.033272833702713445 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.5801526717557252, + "acc_stderr": 0.04328577215262971, + "acc_norm": 0.5801526717557252, + "acc_norm_stderr": 0.04328577215262971 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.6446280991735537, + "acc_stderr": 0.0436923632657398, + "acc_norm": 0.6446280991735537, + "acc_norm_stderr": 0.0436923632657398 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.6944444444444444, + "acc_stderr": 0.04453197507374983, + "acc_norm": 
0.6944444444444444, + "acc_norm_stderr": 0.04453197507374983 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.6257668711656442, + "acc_stderr": 0.03802068102899616, + "acc_norm": 0.6257668711656442, + "acc_norm_stderr": 0.03802068102899616 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.32142857142857145, + "acc_stderr": 0.04432804055291518, + "acc_norm": 0.32142857142857145, + "acc_norm_stderr": 0.04432804055291518 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.6893203883495146, + "acc_stderr": 0.0458212416016155, + "acc_norm": 0.6893203883495146, + "acc_norm_stderr": 0.0458212416016155 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.8247863247863247, + "acc_stderr": 0.024904439098918228, + "acc_norm": 0.8247863247863247, + "acc_norm_stderr": 0.024904439098918228 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.61, + "acc_stderr": 0.04902071300001975, + "acc_norm": 0.61, + "acc_norm_stderr": 0.04902071300001975 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.719029374201788, + "acc_stderr": 0.016073127851221225, + "acc_norm": 0.719029374201788, + "acc_norm_stderr": 0.016073127851221225 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.5722543352601156, + "acc_stderr": 0.026636539741116086, + "acc_norm": 0.5722543352601156, + "acc_norm_stderr": 0.026636539741116086 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.3128491620111732, + "acc_stderr": 0.015506892594647274, + "acc_norm": 0.3128491620111732, + "acc_norm_stderr": 0.015506892594647274 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.5816993464052288, + "acc_stderr": 0.028245134024387296, + "acc_norm": 0.5816993464052288, + "acc_norm_stderr": 0.028245134024387296 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.6270096463022508, + "acc_stderr": 0.0274666102131401, + "acc_norm": 0.6270096463022508, + "acc_norm_stderr": 0.0274666102131401 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.6111111111111112, + "acc_stderr": 0.027125115513166858, + "acc_norm": 0.6111111111111112, + "acc_norm_stderr": 0.027125115513166858 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.3723404255319149, + "acc_stderr": 0.02883892147125146, + "acc_norm": 0.3723404255319149, + "acc_norm_stderr": 0.02883892147125146 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.4511082138200782, + "acc_stderr": 0.012709037347346233, + "acc_norm": 0.4511082138200782, + "acc_norm_stderr": 0.012709037347346233 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.5845588235294118, + "acc_stderr": 0.02993534270787774, + "acc_norm": 0.5845588235294118, + "acc_norm_stderr": 0.02993534270787774 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.5522875816993464, + "acc_stderr": 0.020116925347422425, + "acc_norm": 0.5522875816993464, + "acc_norm_stderr": 0.020116925347422425 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.6363636363636364, + "acc_stderr": 0.04607582090719976, + "acc_norm": 0.6363636363636364, + "acc_norm_stderr": 0.04607582090719976 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.5428571428571428, + "acc_stderr": 0.031891418324213966, + "acc_norm": 0.5428571428571428, + "acc_norm_stderr": 0.031891418324213966 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.6716417910447762, + "acc_stderr": 0.033206858897443244, + "acc_norm": 0.6716417910447762, + "acc_norm_stderr": 0.033206858897443244 + }, + 
"harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.72, + "acc_stderr": 0.045126085985421276, + "acc_norm": 0.72, + "acc_norm_stderr": 0.045126085985421276 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.41566265060240964, + "acc_stderr": 0.038367221765980515, + "acc_norm": 0.41566265060240964, + "acc_norm_stderr": 0.038367221765980515 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.7368421052631579, + "acc_stderr": 0.03377310252209205, + "acc_norm": 0.7368421052631579, + "acc_norm_stderr": 0.03377310252209205 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.27539779681762544, + "mc1_stderr": 0.01563813566777552, + "mc2": 0.40223962650004846, + "mc2_stderr": 0.014370809574399035 + }, + "all": { + "acc": 0.5444895368748129, + "acc_stderr": 0.03478801872078259, + "acc_norm": 0.5489775910790865, + "acc_norm_stderr": 0.03477242589701758, + "mc1": 0.27539779681762544, + "mc1_stderr": 0.01563813566777552, + "mc2": 0.40223962650004846, + "mc2_stderr": 0.014370809574399035 + } + }, + "versions": { + "harness|arc:challenge|25": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + 
"harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "all": 0 + }, + "config_tasks": { + "harness|arc:challenge": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness 
task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task" + }, + "summary_tasks": { + "harness|arc:challenge|25": { + "hashes": { + "hash_examples": "17b0cae357c0259e", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "3722289b79076c44", + "hash_cont_tokens": "e8abf848493b50f7" + }, + "truncated": 0, + "non-truncated": 4687, + "padded": 4687, + "non-padded": 0, + "effective_few_shots": 25.0, + "num_truncated_few_shots": 0 + }, + "harness|hellaswag|10": { + "hashes": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "ececd684171f1ef2", + "hash_cont_tokens": "9fe0a5c42e1532db" + }, + "truncated": 0, + "non-truncated": 40168, + "padded": 40113, + "non-padded": 55, + "effective_few_shots": 10.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hashes": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "c54ff61ad0273dd7", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-anatomy|5": { + "hashes": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "be31a1e22aef5f90", + "hash_cont_tokens": "f11971a765cb609f" + }, + "truncated": 0, + "non-truncated": 540, + "padded": 540, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-astronomy|5": { + "hashes": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "277a7b1fad566940", + "hash_cont_tokens": "440a970fadecdc7b" + }, + "truncated": 0, + "non-truncated": 608, + "padded": 608, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-business_ethics|5": { + "hashes": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "ba552605bc116de5", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hashes": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "428c7563d0b98ab9", + "hash_cont_tokens": "7ecd60c25b9bfe5b" + }, + "truncated": 0, + "non-truncated": 1060, + "padded": 1060, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_biology|5": { + "hashes": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "da036601573942e2", 
+ "hash_cont_tokens": "875cde3af7a0ee14" + }, + "truncated": 0, + "non-truncated": 576, + "padded": 576, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_chemistry|5": { + "hashes": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "94e0196d6aded13d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_computer_science|5": { + "hashes": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "6e4d0f4a8d36690b", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_mathematics|5": { + "hashes": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "614054d17109a25d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_medicine|5": { + "hashes": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "081bb2b524defd1c", + "hash_cont_tokens": "702fb6d82ff0d6ac" + }, + "truncated": 0, + "non-truncated": 692, + "padded": 692, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_physics|5": { + "hashes": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "5421d9a1af86cbd4", + "hash_cont_tokens": "f7b8097afc16a47c" + }, + "truncated": 0, + "non-truncated": 408, + "padded": 408, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-computer_security|5": { + "hashes": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "5e6b70ecb333cf18", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hashes": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "c2ef11a87264ceed", + "hash_cont_tokens": "aa0e8bc655f2f641" + }, + "truncated": 0, + "non-truncated": 940, + "padded": 940, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-econometrics|5": { + "hashes": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "ecaccd912a4c3978", + "hash_cont_tokens": "b1cc6e7e9fcd3827" + }, + "truncated": 0, + "non-truncated": 456, + "padded": 456, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hashes": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "1590c84291399be8", + "hash_cont_tokens": "2425a3f084a591ef" + }, + "truncated": 0, + "non-truncated": 580, + "padded": 580, + "non-padded": 0, + 
"effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hashes": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "3269597f715b0da1", + "hash_cont_tokens": "bd87bf0c060fd925" + }, + "truncated": 0, + "non-truncated": 1512, + "padded": 1512, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-formal_logic|5": { + "hashes": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "a2800d20f3ab8d7c", + "hash_cont_tokens": "eb8932890e0605db" + }, + "truncated": 0, + "non-truncated": 504, + "padded": 504, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-global_facts|5": { + "hashes": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "94ed44b3772505ad", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_biology|5": { + "hashes": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "24423acb928db768", + "hash_cont_tokens": "1ddcb86d28cde266" + }, + "truncated": 0, + "non-truncated": 1240, + "padded": 1240, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hashes": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "831ff35c474e5cef", + "hash_cont_tokens": "176c8dcff38c5f8f" + }, + "truncated": 0, + "non-truncated": 812, + "padded": 812, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hashes": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "a20a96b44dcc5b30", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hashes": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "5002f4ac8b1562ca", + "hash_cont_tokens": "674fc454bdc5ac93" + }, + "truncated": 0, + "non-truncated": 660, + "padded": 656, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_geography|5": { + "hashes": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "7c5547c7da5bc793", + "hash_cont_tokens": "03a5012b916274ea" + }, + "truncated": 0, + "non-truncated": 792, + "padded": 792, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hashes": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "f62991cb6a496b05", + "hash_cont_tokens": "873d2aab226ba1d8" + }, + "truncated": 0, + "non-truncated": 772, + "padded": 772, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + 
"harness|hendrycksTest-high_school_macroeconomics|5": { + "hashes": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "4cef2aff6e3d59ed", + "hash_cont_tokens": "c583432ad27fcfe0" + }, + "truncated": 0, + "non-truncated": 1560, + "padded": 1560, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hashes": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "6e2577ea4082ed2b", + "hash_cont_tokens": "d7907b61bcb8c123" + }, + "truncated": 0, + "non-truncated": 1080, + "padded": 1080, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hashes": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "c5fc9aeb1079c8e4", + "hash_cont_tokens": "f47f041de50333b9" + }, + "truncated": 0, + "non-truncated": 952, + "padded": 952, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_physics|5": { + "hashes": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "555fc385cffa84ca", + "hash_cont_tokens": "0d56317b3e5eedb5" + }, + "truncated": 0, + "non-truncated": 604, + "padded": 604, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hashes": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "febd23cbf9973b7f", + "hash_cont_tokens": "09ba1243e7390c0f" + }, + "truncated": 0, + "non-truncated": 2180, + "padded": 2180, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hashes": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "400e55b56ee6fbd7", + "hash_cont_tokens": "9cc29889c3d3f77d" + }, + "truncated": 0, + "non-truncated": 864, + "padded": 864, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hashes": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "c639cce12a46ebad", + "hash_cont_tokens": "cdd0b3dc06d933e5" + }, + "truncated": 0, + "non-truncated": 816, + "padded": 816, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hashes": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "b9762065cce6f3a6", + "hash_cont_tokens": "e02816433ff28daf" + }, + "truncated": 0, + "non-truncated": 948, + "padded": 948, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_aging|5": { + "hashes": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "541a75f071dcf579", + "hash_cont_tokens": "142a4a8a1138a214" + }, + "truncated": 0, + "non-truncated": 892, + "padded": 892, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_sexuality|5": { + "hashes": { + "hash_examples": "7acb8fdad97f88a6", + 
"hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "04269e5c5a257dd9", + "hash_cont_tokens": "bc54813e809b796d" + }, + "truncated": 0, + "non-truncated": 524, + "padded": 524, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-international_law|5": { + "hashes": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "d93ba9d9d38e4397", + "hash_cont_tokens": "8ea8c5ff76a15bca" + }, + "truncated": 0, + "non-truncated": 484, + "padded": 484, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-jurisprudence|5": { + "hashes": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "9eeaccd2698b4f5a", + "hash_cont_tokens": "e3a8cd951b6e3469" + }, + "truncated": 0, + "non-truncated": 432, + "padded": 432, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hashes": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "b4f08f544f2b7576", + "hash_cont_tokens": "3e9e0bdc248fd88a" + }, + "truncated": 0, + "non-truncated": 652, + "padded": 648, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-machine_learning|5": { + "hashes": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "900c2a51f1174b9f", + "hash_cont_tokens": "55b12fb138c6a064" + }, + "truncated": 0, + "non-truncated": 448, + "padded": 448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-management|5": { + "hashes": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "6b36efb4689c6eca", + "hash_cont_tokens": "a01d6d39a83c4597" + }, + "truncated": 0, + "non-truncated": 412, + "padded": 412, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-marketing|5": { + "hashes": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "2aaac78a0cfed47a", + "hash_cont_tokens": "6aeaed4d823c98aa" + }, + "truncated": 0, + "non-truncated": 936, + "padded": 936, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-medical_genetics|5": { + "hashes": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "886ca823b41c094a", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-miscellaneous|5": { + "hashes": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "72fd71de7675e7d0", + "hash_cont_tokens": "9b0ab02a64603081" + }, + "truncated": 0, + "non-truncated": 3132, + "padded": 3132, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_disputes|5": { + "hashes": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "f3ca0dd8e7a1eb09", + "hash_cont_tokens": "3b8bbe9108e55ce9" + }, + "truncated": 0, + "non-truncated": 1384, + 
"padded": 1354, + "non-padded": 30, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hashes": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "3e793631e951f23c", + "hash_cont_tokens": "3e9bfc0362e97330" + }, + "truncated": 0, + "non-truncated": 3580, + "padded": 3580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-nutrition|5": { + "hashes": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "59753c2144ea93af", + "hash_cont_tokens": "23b2dc6ee2da4cfc" + }, + "truncated": 0, + "non-truncated": 1224, + "padded": 1224, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-philosophy|5": { + "hashes": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "bd8d3dbed15a8c34", + "hash_cont_tokens": "9f6ff69d23a48783" + }, + "truncated": 0, + "non-truncated": 1244, + "padded": 1244, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-prehistory|5": { + "hashes": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "3573cd87facbb7c5", + "hash_cont_tokens": "d6458d743d875837" + }, + "truncated": 0, + "non-truncated": 1296, + "padded": 1296, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_accounting|5": { + "hashes": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "17e721bc1a7cbb47", + "hash_cont_tokens": "922a195f53a35662" + }, + "truncated": 0, + "non-truncated": 1128, + "padded": 1128, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_law|5": { + "hashes": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "c9f7583fff66d361", + "hash_cont_tokens": "2e590029ef41fbcd" + }, + "truncated": 0, + "non-truncated": 6136, + "padded": 6136, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_medicine|5": { + "hashes": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "40a933f829116f8d", + "hash_cont_tokens": "7cfee54dbddd5a98" + }, + "truncated": 0, + "non-truncated": 1088, + "padded": 1088, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_psychology|5": { + "hashes": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "0dfb73a8eb3f692c", + "hash_cont_tokens": "a86677b2a45c20e1" + }, + "truncated": 0, + "non-truncated": 2448, + "padded": 2448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-public_relations|5": { + "hashes": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "1710c6ba4c9f3cbd", + "hash_cont_tokens": "0d756ccaae031757" + }, + "truncated": 0, + "non-truncated": 440, + "padded": 440, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + 
"harness|hendrycksTest-security_studies|5": { + "hashes": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "32a03f1f22a6e103", + "hash_cont_tokens": "b2229bc2cfbf594b" + }, + "truncated": 0, + "non-truncated": 980, + "padded": 980, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-sociology|5": { + "hashes": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "828999f7624cbe7e", + "hash_cont_tokens": "c3a3bdfd177eed5b" + }, + "truncated": 0, + "non-truncated": 804, + "padded": 804, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hashes": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "42054621e718dbee", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-virology|5": { + "hashes": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "6c4f0aa4dc859c04", + "hash_cont_tokens": "af8b3658088cb37f" + }, + "truncated": 0, + "non-truncated": 664, + "padded": 664, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-world_religions|5": { + "hashes": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "6c75d44e092ff24f", + "hash_cont_tokens": "060118bef6de4e0a" + }, + "truncated": 0, + "non-truncated": 684, + "padded": 684, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|truthfulqa:mc|0": { + "hashes": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "2738d7ed7075faa7", + "hash_cont_tokens": "f5da56a132aab151" + }, + "truncated": 0, + "non-truncated": 9996, + "padded": 9996, + "non-padded": 0, + "effective_few_shots": 0.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "d84d18e9a963753d", + "hash_full_prompts": "12b540783521a8e6", + "hash_input_tokens": "5c73a7dce6ccf737", + "hash_cont_tokens": "71d56183130fecbd" + }, + "total_evaluation_time_secondes": "6334.192758321762", + "truncated": 0, + "non-truncated": 111019, + "padded": 110926, + "non-padded": 93, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/wei123602/llama2-13b-FINETUNE3_TEST/results_2023-10-24T13-49-39.272665.json b/eval-results/wei123602/llama2-13b-FINETUNE3_TEST/results_2023-10-24T13-49-39.272665.json new file mode 100644 index 0000000000000000000000000000000000000000..6002849db7bea1f1c3b9d2056e4879745860ae63 --- /dev/null +++ b/eval-results/wei123602/llama2-13b-FINETUNE3_TEST/results_2023-10-24T13-49-39.272665.json @@ -0,0 +1,107 @@ +{ + "config_general": { + "model_name": "wei123602/llama2-13b-FINETUNE3_TEST", + "model_sha": "22cea7bf138eb0d6c962812df2b2235290acbee2", + "model_size": "24.32 GB", + "model_dtype": "torch.float16", + "lighteval_sha": "0f318ecf002208468154899217b3ba7c6ae09374", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|drop|3": { + "em": 0.21885486577181207, + "em_stderr": 
0.004234319461313102, + "f1": 0.2578481543624162, + "f1_stderr": 0.004224663408638886 + }, + "harness|gsm8k|5": { + "acc": 0.14556482183472327, + "acc_stderr": 0.009714267797726266 + }, + "harness|winogrande|5": { + "acc": 0.7592738752959748, + "acc_stderr": 0.01201555921222418 + }, + "all": { + "em": 0.21885486577181207, + "em_stderr": 0.004234319461313102, + "f1": 0.2578481543624162, + "f1_stderr": 0.004224663408638886, + "acc": 0.45241934856534904, + "acc_stderr": 0.010864913504975222 + } + }, + "versions": { + "harness|drop|3": 1, + "harness|gsm8k|5": 0, + "harness|winogrande|5": 0, + "all": 0 + }, + "config_tasks": { + "harness|drop": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|drop|3": { + "hashes": { + "hash_examples": "1d27416e8324e9a3", + "hash_full_prompts": "a5513ff9a741b385", + "hash_input_tokens": "42076f0efbb50aa6", + "hash_cont_tokens": "d2800e1afb32d858" + }, + "truncated": 3, + "non-truncated": 9533, + "padded": 0, + "non-padded": 9536, + "effective_few_shots": 3.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "bda342e47b5099b2", + "hash_cont_tokens": "79527b797ee704ef" + }, + "truncated": 0, + "non-truncated": 1319, + "padded": 0, + "non-padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "c0bedf98cb040854", + "hash_cont_tokens": "f08975ad6f2d5864" + }, + "truncated": 0, + "non-truncated": 2534, + "padded": 2432, + "non-padded": 102, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "9b4d8993161e637d", + "hash_full_prompts": "08215e527b7e60a5", + "hash_input_tokens": "a12f3e3c934bd78b", + "hash_cont_tokens": "5e31766e09f6ac3a" + }, + "total_evaluation_time_secondes": "11897.697054624557", + "truncated": 3, + "non-truncated": 13386, + "padded": 2432, + "non-padded": 10957, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/wei123602/llama2-13b-FINETUNE3_TEST2/results_2023-09-14T13-51-34.438102.json b/eval-results/wei123602/llama2-13b-FINETUNE3_TEST2/results_2023-09-14T13-51-34.438102.json new file mode 100644 index 0000000000000000000000000000000000000000..2e40289996f22a7d7b4631bba5c0b501db170c20 --- /dev/null +++ b/eval-results/wei123602/llama2-13b-FINETUNE3_TEST2/results_2023-09-14T13-51-34.438102.json @@ -0,0 +1,1367 @@ +{ + "config_general": { + "model_name": "wei123602/llama2-13b-FINETUNE3_TEST2", + "model_sha": "9e6431061bd13852a7435f5fe7a6eb0bbd148e14", + "model_size": "24.32 GB", + "model_dtype": "torch.float16", + "lighteval_sha": "ff467795ccc45b291b69333c263d5f16abd1fcd9", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|arc:challenge|25": { + "acc": 0.5136518771331058, + "acc_stderr": 0.014605943429860945, + "acc_norm": 0.5469283276450512, + "acc_norm_stderr": 0.014546892052005628 + }, + "harness|hellaswag|10": { + "acc": 0.6077474606652061, + "acc_stderr": 0.004872546302641852, + "acc_norm": 0.8147779326827326, + "acc_norm_stderr": 0.0038768367094611364 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.29, + "acc_stderr": 
0.045604802157206845, + "acc_norm": 0.29, + "acc_norm_stderr": 0.045604802157206845 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.48148148148148145, + "acc_stderr": 0.043163785995113245, + "acc_norm": 0.48148148148148145, + "acc_norm_stderr": 0.043163785995113245 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.5526315789473685, + "acc_stderr": 0.04046336883978251, + "acc_norm": 0.5526315789473685, + "acc_norm_stderr": 0.04046336883978251 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.52, + "acc_stderr": 0.050211673156867795, + "acc_norm": 0.52, + "acc_norm_stderr": 0.050211673156867795 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.5962264150943396, + "acc_stderr": 0.03019761160019795, + "acc_norm": 0.5962264150943396, + "acc_norm_stderr": 0.03019761160019795 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.6597222222222222, + "acc_stderr": 0.039621355734862175, + "acc_norm": 0.6597222222222222, + "acc_norm_stderr": 0.039621355734862175 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.46, + "acc_stderr": 0.05009082659620332, + "acc_norm": 0.46, + "acc_norm_stderr": 0.05009082659620332 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.46, + "acc_stderr": 0.05009082659620332, + "acc_norm": 0.46, + "acc_norm_stderr": 0.05009082659620332 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.28, + "acc_stderr": 0.04512608598542127, + "acc_norm": 0.28, + "acc_norm_stderr": 0.04512608598542127 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.6069364161849711, + "acc_stderr": 0.0372424959581773, + "acc_norm": 0.6069364161849711, + "acc_norm_stderr": 0.0372424959581773 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.3333333333333333, + "acc_stderr": 0.04690650298201942, + "acc_norm": 0.3333333333333333, + "acc_norm_stderr": 0.04690650298201942 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.66, + "acc_stderr": 0.04760952285695237, + "acc_norm": 0.66, + "acc_norm_stderr": 0.04760952285695237 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.48936170212765956, + "acc_stderr": 0.03267862331014063, + "acc_norm": 0.48936170212765956, + "acc_norm_stderr": 0.03267862331014063 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.3684210526315789, + "acc_stderr": 0.04537815354939392, + "acc_norm": 0.3684210526315789, + "acc_norm_stderr": 0.04537815354939392 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.4827586206896552, + "acc_stderr": 0.04164188720169377, + "acc_norm": 0.4827586206896552, + "acc_norm_stderr": 0.04164188720169377 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.3888888888888889, + "acc_stderr": 0.025107425481137285, + "acc_norm": 0.3888888888888889, + "acc_norm_stderr": 0.025107425481137285 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.4365079365079365, + "acc_stderr": 0.04435932892851466, + "acc_norm": 0.4365079365079365, + "acc_norm_stderr": 0.04435932892851466 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.41, + "acc_stderr": 0.04943110704237102, + "acc_norm": 0.41, + "acc_norm_stderr": 0.04943110704237102 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.5516129032258065, + "acc_stderr": 0.028292056830112735, + "acc_norm": 0.5516129032258065, + "acc_norm_stderr": 0.028292056830112735 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.42857142857142855, + "acc_stderr": 0.03481904844438803, + 
"acc_norm": 0.42857142857142855, + "acc_norm_stderr": 0.03481904844438803 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.54, + "acc_stderr": 0.05009082659620332, + "acc_norm": 0.54, + "acc_norm_stderr": 0.05009082659620332 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.7090909090909091, + "acc_stderr": 0.03546563019624335, + "acc_norm": 0.7090909090909091, + "acc_norm_stderr": 0.03546563019624335 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.7272727272727273, + "acc_stderr": 0.03173071239071724, + "acc_norm": 0.7272727272727273, + "acc_norm_stderr": 0.03173071239071724 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.8186528497409327, + "acc_stderr": 0.02780703236068609, + "acc_norm": 0.8186528497409327, + "acc_norm_stderr": 0.02780703236068609 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.5538461538461539, + "acc_stderr": 0.02520357177302833, + "acc_norm": 0.5538461538461539, + "acc_norm_stderr": 0.02520357177302833 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.337037037037037, + "acc_stderr": 0.028820884666253252, + "acc_norm": 0.337037037037037, + "acc_norm_stderr": 0.028820884666253252 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.6092436974789915, + "acc_stderr": 0.031693802357129965, + "acc_norm": 0.6092436974789915, + "acc_norm_stderr": 0.031693802357129965 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.36423841059602646, + "acc_stderr": 0.03929111781242742, + "acc_norm": 0.36423841059602646, + "acc_norm_stderr": 0.03929111781242742 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.8, + "acc_stderr": 0.017149858514250958, + "acc_norm": 0.8, + "acc_norm_stderr": 0.017149858514250958 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.5462962962962963, + "acc_stderr": 0.03395322726375798, + "acc_norm": 0.5462962962962963, + "acc_norm_stderr": 0.03395322726375798 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.7696078431372549, + "acc_stderr": 0.029554292605695066, + "acc_norm": 0.7696078431372549, + "acc_norm_stderr": 0.029554292605695066 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.7679324894514767, + "acc_stderr": 0.02747974455080851, + "acc_norm": 0.7679324894514767, + "acc_norm_stderr": 0.02747974455080851 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.6322869955156951, + "acc_stderr": 0.03236198350928276, + "acc_norm": 0.6322869955156951, + "acc_norm_stderr": 0.03236198350928276 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.5954198473282443, + "acc_stderr": 0.043046937953806645, + "acc_norm": 0.5954198473282443, + "acc_norm_stderr": 0.043046937953806645 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.6528925619834711, + "acc_stderr": 0.043457245702925335, + "acc_norm": 0.6528925619834711, + "acc_norm_stderr": 0.043457245702925335 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.7129629629629629, + "acc_stderr": 0.04373313040914761, + "acc_norm": 0.7129629629629629, + "acc_norm_stderr": 0.04373313040914761 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.656441717791411, + "acc_stderr": 0.037311335196738925, + "acc_norm": 0.656441717791411, + "acc_norm_stderr": 0.037311335196738925 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.39285714285714285, + "acc_stderr": 0.04635550135609976, + 
"acc_norm": 0.39285714285714285, + "acc_norm_stderr": 0.04635550135609976 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.7475728155339806, + "acc_stderr": 0.04301250399690878, + "acc_norm": 0.7475728155339806, + "acc_norm_stderr": 0.04301250399690878 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.7948717948717948, + "acc_stderr": 0.02645350805404033, + "acc_norm": 0.7948717948717948, + "acc_norm_stderr": 0.02645350805404033 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.64, + "acc_stderr": 0.04824181513244218, + "acc_norm": 0.64, + "acc_norm_stderr": 0.04824181513244218 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.7637292464878672, + "acc_stderr": 0.015190473717037495, + "acc_norm": 0.7637292464878672, + "acc_norm_stderr": 0.015190473717037495 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.5982658959537572, + "acc_stderr": 0.026394104177643634, + "acc_norm": 0.5982658959537572, + "acc_norm_stderr": 0.026394104177643634 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.39553072625698327, + "acc_stderr": 0.016353415410075775, + "acc_norm": 0.39553072625698327, + "acc_norm_stderr": 0.016353415410075775 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.5784313725490197, + "acc_stderr": 0.02827549015679146, + "acc_norm": 0.5784313725490197, + "acc_norm_stderr": 0.02827549015679146 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.6495176848874598, + "acc_stderr": 0.027098652621301757, + "acc_norm": 0.6495176848874598, + "acc_norm_stderr": 0.027098652621301757 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.6450617283950617, + "acc_stderr": 0.02662415247884585, + "acc_norm": 0.6450617283950617, + "acc_norm_stderr": 0.02662415247884585 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.4326241134751773, + "acc_stderr": 0.029555454236778855, + "acc_norm": 0.4326241134751773, + "acc_norm_stderr": 0.029555454236778855 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.43089960886571055, + "acc_stderr": 0.012647695889547231, + "acc_norm": 0.43089960886571055, + "acc_norm_stderr": 0.012647695889547231 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.5625, + "acc_stderr": 0.030134614954403924, + "acc_norm": 0.5625, + "acc_norm_stderr": 0.030134614954403924 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.5833333333333334, + "acc_stderr": 0.01994491413687358, + "acc_norm": 0.5833333333333334, + "acc_norm_stderr": 0.01994491413687358 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.6363636363636364, + "acc_stderr": 0.046075820907199756, + "acc_norm": 0.6363636363636364, + "acc_norm_stderr": 0.046075820907199756 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.5918367346938775, + "acc_stderr": 0.03146465712827424, + "acc_norm": 0.5918367346938775, + "acc_norm_stderr": 0.03146465712827424 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.7313432835820896, + "acc_stderr": 0.03134328358208954, + "acc_norm": 0.7313432835820896, + "acc_norm_stderr": 0.03134328358208954 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.74, + "acc_stderr": 0.044084400227680794, + "acc_norm": 0.74, + "acc_norm_stderr": 0.044084400227680794 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.43373493975903615, + "acc_stderr": 0.03858158940685517, + "acc_norm": 0.43373493975903615, + "acc_norm_stderr": 0.03858158940685517 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 
0.7777777777777778, + "acc_stderr": 0.03188578017686398, + "acc_norm": 0.7777777777777778, + "acc_norm_stderr": 0.03188578017686398 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.2802937576499388, + "mc1_stderr": 0.015723139524608767, + "mc2": 0.399328906923071, + "mc2_stderr": 0.01423505140557656 + }, + "all": { + "acc": 0.5677513161504792, + "acc_stderr": 0.034667527806544376, + "acc_norm": 0.5718243148374196, + "acc_norm_stderr": 0.03464965050195055, + "mc1": 0.2802937576499388, + "mc1_stderr": 0.015723139524608767, + "mc2": 0.399328906923071, + "mc2_stderr": 0.01423505140557656 + } + }, + "versions": { + "harness|arc:challenge|25": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + 
"harness|truthfulqa:mc|0": 1, + "all": 0 + }, + "config_tasks": { + "harness|arc:challenge": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + 
"harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task" + }, + "summary_tasks": { + "harness|arc:challenge|25": { + "hashes": { + "hash_examples": "17b0cae357c0259e", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "3722289b79076c44", + "hash_cont_tokens": "e8abf848493b50f7" + }, + "truncated": 0, + "non-truncated": 4687, + "padded": 4687, + "non-padded": 0, + "effective_few_shots": 25.0, + "num_truncated_few_shots": 0 + }, + "harness|hellaswag|10": { + "hashes": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "ececd684171f1ef2", + "hash_cont_tokens": "9fe0a5c42e1532db" + }, + "truncated": 0, + "non-truncated": 40168, + "padded": 40113, + "non-padded": 55, + "effective_few_shots": 10.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hashes": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "c54ff61ad0273dd7", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-anatomy|5": { + "hashes": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "be31a1e22aef5f90", + "hash_cont_tokens": "f11971a765cb609f" + }, + "truncated": 0, + "non-truncated": 540, + "padded": 540, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-astronomy|5": { + "hashes": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "277a7b1fad566940", + "hash_cont_tokens": "440a970fadecdc7b" + }, + "truncated": 0, + "non-truncated": 608, + "padded": 608, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-business_ethics|5": { + "hashes": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "ba552605bc116de5", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hashes": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "428c7563d0b98ab9", + "hash_cont_tokens": "7ecd60c25b9bfe5b" + }, + "truncated": 0, + "non-truncated": 1060, + "padded": 1060, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_biology|5": { + "hashes": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "da036601573942e2", + "hash_cont_tokens": "875cde3af7a0ee14" + }, + "truncated": 0, + "non-truncated": 576, + "padded": 576, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_chemistry|5": { + "hashes": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "94e0196d6aded13d", + "hash_cont_tokens": 
"50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_computer_science|5": { + "hashes": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "6e4d0f4a8d36690b", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_mathematics|5": { + "hashes": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "614054d17109a25d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_medicine|5": { + "hashes": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "081bb2b524defd1c", + "hash_cont_tokens": "702fb6d82ff0d6ac" + }, + "truncated": 0, + "non-truncated": 692, + "padded": 692, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_physics|5": { + "hashes": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "5421d9a1af86cbd4", + "hash_cont_tokens": "f7b8097afc16a47c" + }, + "truncated": 0, + "non-truncated": 408, + "padded": 408, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-computer_security|5": { + "hashes": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "5e6b70ecb333cf18", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hashes": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "c2ef11a87264ceed", + "hash_cont_tokens": "aa0e8bc655f2f641" + }, + "truncated": 0, + "non-truncated": 940, + "padded": 940, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-econometrics|5": { + "hashes": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "ecaccd912a4c3978", + "hash_cont_tokens": "b1cc6e7e9fcd3827" + }, + "truncated": 0, + "non-truncated": 456, + "padded": 456, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hashes": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "1590c84291399be8", + "hash_cont_tokens": "2425a3f084a591ef" + }, + "truncated": 0, + "non-truncated": 580, + "padded": 580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hashes": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "3269597f715b0da1", + "hash_cont_tokens": "bd87bf0c060fd925" + }, + "truncated": 0, + "non-truncated": 1512, + "padded": 1512, + "non-padded": 0, + "effective_few_shots": 5.0, + 
"num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-formal_logic|5": { + "hashes": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "a2800d20f3ab8d7c", + "hash_cont_tokens": "eb8932890e0605db" + }, + "truncated": 0, + "non-truncated": 504, + "padded": 504, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-global_facts|5": { + "hashes": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "94ed44b3772505ad", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_biology|5": { + "hashes": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "24423acb928db768", + "hash_cont_tokens": "1ddcb86d28cde266" + }, + "truncated": 0, + "non-truncated": 1240, + "padded": 1240, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hashes": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "831ff35c474e5cef", + "hash_cont_tokens": "176c8dcff38c5f8f" + }, + "truncated": 0, + "non-truncated": 812, + "padded": 812, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hashes": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "a20a96b44dcc5b30", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hashes": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "5002f4ac8b1562ca", + "hash_cont_tokens": "674fc454bdc5ac93" + }, + "truncated": 0, + "non-truncated": 660, + "padded": 656, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_geography|5": { + "hashes": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "7c5547c7da5bc793", + "hash_cont_tokens": "03a5012b916274ea" + }, + "truncated": 0, + "non-truncated": 792, + "padded": 792, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hashes": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "f62991cb6a496b05", + "hash_cont_tokens": "873d2aab226ba1d8" + }, + "truncated": 0, + "non-truncated": 772, + "padded": 772, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hashes": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "4cef2aff6e3d59ed", + "hash_cont_tokens": "c583432ad27fcfe0" + }, + "truncated": 0, + "non-truncated": 1560, + "padded": 1560, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + 
"hashes": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "6e2577ea4082ed2b", + "hash_cont_tokens": "d7907b61bcb8c123" + }, + "truncated": 0, + "non-truncated": 1080, + "padded": 1080, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hashes": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "c5fc9aeb1079c8e4", + "hash_cont_tokens": "f47f041de50333b9" + }, + "truncated": 0, + "non-truncated": 952, + "padded": 952, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_physics|5": { + "hashes": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "555fc385cffa84ca", + "hash_cont_tokens": "0d56317b3e5eedb5" + }, + "truncated": 0, + "non-truncated": 604, + "padded": 604, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hashes": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "febd23cbf9973b7f", + "hash_cont_tokens": "09ba1243e7390c0f" + }, + "truncated": 0, + "non-truncated": 2180, + "padded": 2180, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hashes": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "400e55b56ee6fbd7", + "hash_cont_tokens": "9cc29889c3d3f77d" + }, + "truncated": 0, + "non-truncated": 864, + "padded": 864, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hashes": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "c639cce12a46ebad", + "hash_cont_tokens": "cdd0b3dc06d933e5" + }, + "truncated": 0, + "non-truncated": 816, + "padded": 816, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hashes": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "b9762065cce6f3a6", + "hash_cont_tokens": "e02816433ff28daf" + }, + "truncated": 0, + "non-truncated": 948, + "padded": 948, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_aging|5": { + "hashes": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "541a75f071dcf579", + "hash_cont_tokens": "142a4a8a1138a214" + }, + "truncated": 0, + "non-truncated": 892, + "padded": 892, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_sexuality|5": { + "hashes": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "04269e5c5a257dd9", + "hash_cont_tokens": "bc54813e809b796d" + }, + "truncated": 0, + "non-truncated": 524, + "padded": 524, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-international_law|5": { + "hashes": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": 
"d93ba9d9d38e4397", + "hash_cont_tokens": "8ea8c5ff76a15bca" + }, + "truncated": 0, + "non-truncated": 484, + "padded": 484, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-jurisprudence|5": { + "hashes": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "9eeaccd2698b4f5a", + "hash_cont_tokens": "e3a8cd951b6e3469" + }, + "truncated": 0, + "non-truncated": 432, + "padded": 432, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hashes": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "b4f08f544f2b7576", + "hash_cont_tokens": "3e9e0bdc248fd88a" + }, + "truncated": 0, + "non-truncated": 652, + "padded": 648, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-machine_learning|5": { + "hashes": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "900c2a51f1174b9f", + "hash_cont_tokens": "55b12fb138c6a064" + }, + "truncated": 0, + "non-truncated": 448, + "padded": 448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-management|5": { + "hashes": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "6b36efb4689c6eca", + "hash_cont_tokens": "a01d6d39a83c4597" + }, + "truncated": 0, + "non-truncated": 412, + "padded": 412, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-marketing|5": { + "hashes": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "2aaac78a0cfed47a", + "hash_cont_tokens": "6aeaed4d823c98aa" + }, + "truncated": 0, + "non-truncated": 936, + "padded": 936, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-medical_genetics|5": { + "hashes": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "886ca823b41c094a", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-miscellaneous|5": { + "hashes": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "72fd71de7675e7d0", + "hash_cont_tokens": "9b0ab02a64603081" + }, + "truncated": 0, + "non-truncated": 3132, + "padded": 3132, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_disputes|5": { + "hashes": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "f3ca0dd8e7a1eb09", + "hash_cont_tokens": "3b8bbe9108e55ce9" + }, + "truncated": 0, + "non-truncated": 1384, + "padded": 1354, + "non-padded": 30, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hashes": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "3e793631e951f23c", + "hash_cont_tokens": "3e9bfc0362e97330" + }, + "truncated": 0, + "non-truncated": 3580, + "padded": 3580, + "non-padded": 0, + "effective_few_shots": 5.0, + 
"num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-nutrition|5": { + "hashes": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "59753c2144ea93af", + "hash_cont_tokens": "23b2dc6ee2da4cfc" + }, + "truncated": 0, + "non-truncated": 1224, + "padded": 1224, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-philosophy|5": { + "hashes": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "bd8d3dbed15a8c34", + "hash_cont_tokens": "9f6ff69d23a48783" + }, + "truncated": 0, + "non-truncated": 1244, + "padded": 1244, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-prehistory|5": { + "hashes": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "3573cd87facbb7c5", + "hash_cont_tokens": "d6458d743d875837" + }, + "truncated": 0, + "non-truncated": 1296, + "padded": 1296, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_accounting|5": { + "hashes": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "17e721bc1a7cbb47", + "hash_cont_tokens": "922a195f53a35662" + }, + "truncated": 0, + "non-truncated": 1128, + "padded": 1128, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_law|5": { + "hashes": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "c9f7583fff66d361", + "hash_cont_tokens": "2e590029ef41fbcd" + }, + "truncated": 0, + "non-truncated": 6136, + "padded": 6136, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_medicine|5": { + "hashes": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "40a933f829116f8d", + "hash_cont_tokens": "7cfee54dbddd5a98" + }, + "truncated": 0, + "non-truncated": 1088, + "padded": 1088, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_psychology|5": { + "hashes": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "0dfb73a8eb3f692c", + "hash_cont_tokens": "a86677b2a45c20e1" + }, + "truncated": 0, + "non-truncated": 2448, + "padded": 2448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-public_relations|5": { + "hashes": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "1710c6ba4c9f3cbd", + "hash_cont_tokens": "0d756ccaae031757" + }, + "truncated": 0, + "non-truncated": 440, + "padded": 440, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-security_studies|5": { + "hashes": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "32a03f1f22a6e103", + "hash_cont_tokens": "b2229bc2cfbf594b" + }, + "truncated": 0, + "non-truncated": 980, + "padded": 980, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-sociology|5": { + "hashes": { + "hash_examples": "e7959df87dea8672", + 
"hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "828999f7624cbe7e", + "hash_cont_tokens": "c3a3bdfd177eed5b" + }, + "truncated": 0, + "non-truncated": 804, + "padded": 804, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hashes": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "42054621e718dbee", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-virology|5": { + "hashes": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "6c4f0aa4dc859c04", + "hash_cont_tokens": "af8b3658088cb37f" + }, + "truncated": 0, + "non-truncated": 664, + "padded": 664, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-world_religions|5": { + "hashes": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "6c75d44e092ff24f", + "hash_cont_tokens": "060118bef6de4e0a" + }, + "truncated": 0, + "non-truncated": 684, + "padded": 684, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|truthfulqa:mc|0": { + "hashes": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "2738d7ed7075faa7", + "hash_cont_tokens": "f5da56a132aab151" + }, + "truncated": 0, + "non-truncated": 9996, + "padded": 9996, + "non-padded": 0, + "effective_few_shots": 0.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "d84d18e9a963753d", + "hash_full_prompts": "12b540783521a8e6", + "hash_input_tokens": "5c73a7dce6ccf737", + "hash_cont_tokens": "71d56183130fecbd" + }, + "total_evaluation_time_secondes": "6336.140574455261", + "truncated": 0, + "non-truncated": 111019, + "padded": 110926, + "non-padded": 93, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/wei123602/llama2-13b-FINETUNE3_TEST2/results_2023-10-28T06-56-58.916586.json b/eval-results/wei123602/llama2-13b-FINETUNE3_TEST2/results_2023-10-28T06-56-58.916586.json new file mode 100644 index 0000000000000000000000000000000000000000..7ec7c37030b55c441eb71c48988f28e0fd926084 --- /dev/null +++ b/eval-results/wei123602/llama2-13b-FINETUNE3_TEST2/results_2023-10-28T06-56-58.916586.json @@ -0,0 +1,107 @@ +{ + "config_general": { + "model_name": "wei123602/llama2-13b-FINETUNE3_TEST2", + "model_sha": "9e6431061bd13852a7435f5fe7a6eb0bbd148e14", + "model_size": "24.32 GB", + "model_dtype": "torch.float16", + "lighteval_sha": "0f318ecf002208468154899217b3ba7c6ae09374", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|drop|3": { + "em": 0.2633179530201342, + "em_stderr": 0.004510450588757746, + "f1": 0.3047556627516783, + "f1_stderr": 0.004459334625484884 + }, + "harness|gsm8k|5": { + "acc": 0.12585291887793784, + "acc_stderr": 0.009136212598406319 + }, + "harness|winogrande|5": { + "acc": 0.7624309392265194, + "acc_stderr": 0.01196129890580315 + }, + "all": { + "em": 0.2633179530201342, + "em_stderr": 0.004510450588757746, + "f1": 0.3047556627516783, + "f1_stderr": 0.004459334625484884, + "acc": 0.4441419290522286, + "acc_stderr": 
0.010548755752104734 + } + }, + "versions": { + "harness|drop|3": 1, + "harness|gsm8k|5": 0, + "harness|winogrande|5": 0, + "all": 0 + }, + "config_tasks": { + "harness|drop": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|drop|3": { + "hashes": { + "hash_examples": "1d27416e8324e9a3", + "hash_full_prompts": "a5513ff9a741b385", + "hash_input_tokens": "42076f0efbb50aa6", + "hash_cont_tokens": "e795ee310fe69abf" + }, + "truncated": 3, + "non-truncated": 9533, + "padded": 0, + "non-padded": 9536, + "effective_few_shots": 3.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "bda342e47b5099b2", + "hash_cont_tokens": "0c8fa43ede31a6b2" + }, + "truncated": 0, + "non-truncated": 1319, + "padded": 0, + "non-padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "c0bedf98cb040854", + "hash_cont_tokens": "f08975ad6f2d5864" + }, + "truncated": 0, + "non-truncated": 2534, + "padded": 2432, + "non-padded": 102, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "9b4d8993161e637d", + "hash_full_prompts": "08215e527b7e60a5", + "hash_input_tokens": "a12f3e3c934bd78b", + "hash_cont_tokens": "409514f44364b529" + }, + "total_evaluation_time_secondes": "11791.107451438904", + "truncated": 3, + "non-truncated": 13386, + "padded": 2432, + "non-padded": 10957, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/wei123602/llama2-13b-FINETUNE3_TEST2/results_2023-10-28T09-53-17.709619.json b/eval-results/wei123602/llama2-13b-FINETUNE3_TEST2/results_2023-10-28T09-53-17.709619.json new file mode 100644 index 0000000000000000000000000000000000000000..baba0a3a66c6725b48dea1755c3d64f3bc1d62fa --- /dev/null +++ b/eval-results/wei123602/llama2-13b-FINETUNE3_TEST2/results_2023-10-28T09-53-17.709619.json @@ -0,0 +1,107 @@ +{ + "config_general": { + "model_name": "wei123602/llama2-13b-FINETUNE3_TEST2", + "model_sha": "9e6431061bd13852a7435f5fe7a6eb0bbd148e14", + "model_size": "24.32 GB", + "model_dtype": "torch.float16", + "lighteval_sha": "0f318ecf002208468154899217b3ba7c6ae09374", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|drop|3": { + "em": 0.2633179530201342, + "em_stderr": 0.004510450588757746, + "f1": 0.3047556627516783, + "f1_stderr": 0.004459334625484884 + }, + "harness|gsm8k|5": { + "acc": 0.12585291887793784, + "acc_stderr": 0.009136212598406319 + }, + "harness|winogrande|5": { + "acc": 0.7624309392265194, + "acc_stderr": 0.01196129890580315 + }, + "all": { + "em": 0.2633179530201342, + "em_stderr": 0.004510450588757746, + "f1": 0.3047556627516783, + "f1_stderr": 0.004459334625484884, + "acc": 0.4441419290522286, + "acc_stderr": 0.010548755752104734 + } + }, + "versions": { + "harness|drop|3": 1, + "harness|gsm8k|5": 0, + "harness|winogrande|5": 0, + "all": 0 + }, + "config_tasks": { + "harness|drop": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|drop|3": { + "hashes": { + "hash_examples": "1d27416e8324e9a3", + 
"hash_full_prompts": "a5513ff9a741b385", + "hash_input_tokens": "42076f0efbb50aa6", + "hash_cont_tokens": "e795ee310fe69abf" + }, + "truncated": 3, + "non-truncated": 9533, + "padded": 0, + "non-padded": 9536, + "effective_few_shots": 3.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "bda342e47b5099b2", + "hash_cont_tokens": "0c8fa43ede31a6b2" + }, + "truncated": 0, + "non-truncated": 1319, + "padded": 0, + "non-padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "c0bedf98cb040854", + "hash_cont_tokens": "f08975ad6f2d5864" + }, + "truncated": 0, + "non-truncated": 2534, + "padded": 2432, + "non-padded": 102, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "9b4d8993161e637d", + "hash_full_prompts": "08215e527b7e60a5", + "hash_input_tokens": "a12f3e3c934bd78b", + "hash_cont_tokens": "409514f44364b529" + }, + "total_evaluation_time_secondes": "11930.68575334549", + "truncated": 3, + "non-truncated": 13386, + "padded": 2432, + "non-padded": 10957, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/wei123602/llama2-13b-fintune2-4E/results_2023-09-14T13-45-51.161008.json b/eval-results/wei123602/llama2-13b-fintune2-4E/results_2023-09-14T13-45-51.161008.json new file mode 100644 index 0000000000000000000000000000000000000000..6a03a8483dac4675d4be63e214b7915d5845caa9 --- /dev/null +++ b/eval-results/wei123602/llama2-13b-fintune2-4E/results_2023-09-14T13-45-51.161008.json @@ -0,0 +1,1367 @@ +{ + "config_general": { + "model_name": "wei123602/llama2-13b-fintune2-4E", + "model_sha": "645ede9d6ec60d8fa051bc7ad32ab5f7bfdc066d", + "model_size": "24.32 GB", + "model_dtype": "torch.float16", + "lighteval_sha": "ff467795ccc45b291b69333c263d5f16abd1fcd9", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|arc:challenge|25": { + "acc": 0.5187713310580204, + "acc_stderr": 0.014601090150633964, + "acc_norm": 0.5588737201365188, + "acc_norm_stderr": 0.014509747749064663 + }, + "harness|hellaswag|10": { + "acc": 0.6186018721370244, + "acc_stderr": 0.004847372670134645, + "acc_norm": 0.8095000995817566, + "acc_norm_stderr": 0.003918928556590478 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.31, + "acc_stderr": 0.046482319871173156, + "acc_norm": 0.31, + "acc_norm_stderr": 0.046482319871173156 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.45185185185185184, + "acc_stderr": 0.04299268905480863, + "acc_norm": 0.45185185185185184, + "acc_norm_stderr": 0.04299268905480863 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.5855263157894737, + "acc_stderr": 0.04008973785779205, + "acc_norm": 0.5855263157894737, + "acc_norm_stderr": 0.04008973785779205 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.54, + "acc_stderr": 0.05009082659620332, + "acc_norm": 0.54, + "acc_norm_stderr": 0.05009082659620332 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.5773584905660377, + "acc_stderr": 0.03040233144576954, + "acc_norm": 0.5773584905660377, + "acc_norm_stderr": 0.03040233144576954 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 
0.6111111111111112, + "acc_stderr": 0.04076663253918567, + "acc_norm": 0.6111111111111112, + "acc_norm_stderr": 0.04076663253918567 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.34, + "acc_stderr": 0.04760952285695235, + "acc_norm": 0.34, + "acc_norm_stderr": 0.04760952285695235 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.39, + "acc_stderr": 0.04902071300001974, + "acc_norm": 0.39, + "acc_norm_stderr": 0.04902071300001974 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.33, + "acc_stderr": 0.047258156262526045, + "acc_norm": 0.33, + "acc_norm_stderr": 0.047258156262526045 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.49710982658959535, + "acc_stderr": 0.038124005659748335, + "acc_norm": 0.49710982658959535, + "acc_norm_stderr": 0.038124005659748335 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.28431372549019607, + "acc_stderr": 0.04488482852329017, + "acc_norm": 0.28431372549019607, + "acc_norm_stderr": 0.04488482852329017 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.7, + "acc_stderr": 0.046056618647183814, + "acc_norm": 0.7, + "acc_norm_stderr": 0.046056618647183814 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.46808510638297873, + "acc_stderr": 0.03261936918467382, + "acc_norm": 0.46808510638297873, + "acc_norm_stderr": 0.03261936918467382 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.3333333333333333, + "acc_stderr": 0.044346007015849245, + "acc_norm": 0.3333333333333333, + "acc_norm_stderr": 0.044346007015849245 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.4827586206896552, + "acc_stderr": 0.04164188720169377, + "acc_norm": 0.4827586206896552, + "acc_norm_stderr": 0.04164188720169377 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.37566137566137564, + "acc_stderr": 0.02494236893115979, + "acc_norm": 0.37566137566137564, + "acc_norm_stderr": 0.02494236893115979 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.3888888888888889, + "acc_stderr": 0.04360314860077459, + "acc_norm": 0.3888888888888889, + "acc_norm_stderr": 0.04360314860077459 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.39, + "acc_stderr": 0.04902071300001975, + "acc_norm": 0.39, + "acc_norm_stderr": 0.04902071300001975 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.567741935483871, + "acc_stderr": 0.028181739720019416, + "acc_norm": 0.567741935483871, + "acc_norm_stderr": 0.028181739720019416 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.39408866995073893, + "acc_stderr": 0.034381579670365446, + "acc_norm": 0.39408866995073893, + "acc_norm_stderr": 0.034381579670365446 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.47, + "acc_stderr": 0.05016135580465919, + "acc_norm": 0.47, + "acc_norm_stderr": 0.05016135580465919 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.6545454545454545, + "acc_stderr": 0.037131580674819135, + "acc_norm": 0.6545454545454545, + "acc_norm_stderr": 0.037131580674819135 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.6818181818181818, + "acc_stderr": 0.03318477333845331, + "acc_norm": 0.6818181818181818, + "acc_norm_stderr": 0.03318477333845331 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.7668393782383419, + "acc_stderr": 0.03051611137147601, + "acc_norm": 0.7668393782383419, + "acc_norm_stderr": 0.03051611137147601 + }, + 
"harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.541025641025641, + "acc_stderr": 0.025265525491284295, + "acc_norm": 0.541025641025641, + "acc_norm_stderr": 0.025265525491284295 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.3333333333333333, + "acc_stderr": 0.028742040903948492, + "acc_norm": 0.3333333333333333, + "acc_norm_stderr": 0.028742040903948492 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.5252100840336135, + "acc_stderr": 0.03243718055137411, + "acc_norm": 0.5252100840336135, + "acc_norm_stderr": 0.03243718055137411 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.2847682119205298, + "acc_stderr": 0.03684881521389023, + "acc_norm": 0.2847682119205298, + "acc_norm_stderr": 0.03684881521389023 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.726605504587156, + "acc_stderr": 0.0191092998460983, + "acc_norm": 0.726605504587156, + "acc_norm_stderr": 0.0191092998460983 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.4166666666666667, + "acc_stderr": 0.03362277436608043, + "acc_norm": 0.4166666666666667, + "acc_norm_stderr": 0.03362277436608043 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.7254901960784313, + "acc_stderr": 0.03132179803083291, + "acc_norm": 0.7254901960784313, + "acc_norm_stderr": 0.03132179803083291 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.7341772151898734, + "acc_stderr": 0.02875679962965834, + "acc_norm": 0.7341772151898734, + "acc_norm_stderr": 0.02875679962965834 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.6188340807174888, + "acc_stderr": 0.03259625118416827, + "acc_norm": 0.6188340807174888, + "acc_norm_stderr": 0.03259625118416827 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.5725190839694656, + "acc_stderr": 0.04338920305792401, + "acc_norm": 0.5725190839694656, + "acc_norm_stderr": 0.04338920305792401 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.6446280991735537, + "acc_stderr": 0.04369236326573981, + "acc_norm": 0.6446280991735537, + "acc_norm_stderr": 0.04369236326573981 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.6851851851851852, + "acc_stderr": 0.04489931073591312, + "acc_norm": 0.6851851851851852, + "acc_norm_stderr": 0.04489931073591312 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.6319018404907976, + "acc_stderr": 0.03789213935838396, + "acc_norm": 0.6319018404907976, + "acc_norm_stderr": 0.03789213935838396 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.33035714285714285, + "acc_stderr": 0.04464285714285714, + "acc_norm": 0.33035714285714285, + "acc_norm_stderr": 0.04464285714285714 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.7281553398058253, + "acc_stderr": 0.044052680241409216, + "acc_norm": 0.7281553398058253, + "acc_norm_stderr": 0.044052680241409216 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.7863247863247863, + "acc_stderr": 0.02685345037700916, + "acc_norm": 0.7863247863247863, + "acc_norm_stderr": 0.02685345037700916 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.58, + "acc_stderr": 0.04960449637488583, + "acc_norm": 0.58, + "acc_norm_stderr": 0.04960449637488583 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.7139208173690932, + "acc_stderr": 0.01616087140512754, + "acc_norm": 0.7139208173690932, + "acc_norm_stderr": 0.01616087140512754 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 
0.6098265895953757, + "acc_stderr": 0.026261677607806636, + "acc_norm": 0.6098265895953757, + "acc_norm_stderr": 0.026261677607806636 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.29497206703910617, + "acc_stderr": 0.015251931579208173, + "acc_norm": 0.29497206703910617, + "acc_norm_stderr": 0.015251931579208173 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.5718954248366013, + "acc_stderr": 0.02833239748366428, + "acc_norm": 0.5718954248366013, + "acc_norm_stderr": 0.02833239748366428 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.6430868167202572, + "acc_stderr": 0.027210420375934023, + "acc_norm": 0.6430868167202572, + "acc_norm_stderr": 0.027210420375934023 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.5833333333333334, + "acc_stderr": 0.027431623722415012, + "acc_norm": 0.5833333333333334, + "acc_norm_stderr": 0.027431623722415012 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.45390070921985815, + "acc_stderr": 0.029700453247291484, + "acc_norm": 0.45390070921985815, + "acc_norm_stderr": 0.029700453247291484 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.4315514993481095, + "acc_stderr": 0.012650007999463881, + "acc_norm": 0.4315514993481095, + "acc_norm_stderr": 0.012650007999463881 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.4889705882352941, + "acc_stderr": 0.030365446477275675, + "acc_norm": 0.4889705882352941, + "acc_norm_stderr": 0.030365446477275675 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.5473856209150327, + "acc_stderr": 0.020136790918492534, + "acc_norm": 0.5473856209150327, + "acc_norm_stderr": 0.020136790918492534 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.6363636363636364, + "acc_stderr": 0.046075820907199756, + "acc_norm": 0.6363636363636364, + "acc_norm_stderr": 0.046075820907199756 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.5469387755102041, + "acc_stderr": 0.03186785930004129, + "acc_norm": 0.5469387755102041, + "acc_norm_stderr": 0.03186785930004129 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.7014925373134329, + "acc_stderr": 0.03235743789355043, + "acc_norm": 0.7014925373134329, + "acc_norm_stderr": 0.03235743789355043 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.72, + "acc_stderr": 0.04512608598542128, + "acc_norm": 0.72, + "acc_norm_stderr": 0.04512608598542128 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.4457831325301205, + "acc_stderr": 0.03869543323472101, + "acc_norm": 0.4457831325301205, + "acc_norm_stderr": 0.03869543323472101 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.7777777777777778, + "acc_stderr": 0.03188578017686398, + "acc_norm": 0.7777777777777778, + "acc_norm_stderr": 0.03188578017686398 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.29253365973072215, + "mc1_stderr": 0.015925597445286165, + "mc2": 0.4272280203904765, + "mc2_stderr": 0.015202924644061788 + }, + "all": { + "acc": 0.5383184272381838, + "acc_stderr": 0.034850754300598634, + "acc_norm": 0.5422336919250181, + "acc_norm_stderr": 0.034833469783393314, + "mc1": 0.29253365973072215, + "mc1_stderr": 0.015925597445286165, + "mc2": 0.4272280203904765, + "mc2_stderr": 0.015202924644061788 + } + }, + "versions": { + "harness|arc:challenge|25": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 
1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "all": 0 + }, + "config_tasks": { + "harness|arc:challenge": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + 
"harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task" + }, + "summary_tasks": { + "harness|arc:challenge|25": { + "hashes": { + "hash_examples": "17b0cae357c0259e", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "3722289b79076c44", + "hash_cont_tokens": "e8abf848493b50f7" + }, + "truncated": 0, + "non-truncated": 4687, + "padded": 4687, + "non-padded": 0, + "effective_few_shots": 25.0, + "num_truncated_few_shots": 0 + }, + "harness|hellaswag|10": { + "hashes": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": 
"ececd684171f1ef2", + "hash_cont_tokens": "9fe0a5c42e1532db" + }, + "truncated": 0, + "non-truncated": 40168, + "padded": 40113, + "non-padded": 55, + "effective_few_shots": 10.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hashes": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "c54ff61ad0273dd7", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-anatomy|5": { + "hashes": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "be31a1e22aef5f90", + "hash_cont_tokens": "f11971a765cb609f" + }, + "truncated": 0, + "non-truncated": 540, + "padded": 540, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-astronomy|5": { + "hashes": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "277a7b1fad566940", + "hash_cont_tokens": "440a970fadecdc7b" + }, + "truncated": 0, + "non-truncated": 608, + "padded": 608, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-business_ethics|5": { + "hashes": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "ba552605bc116de5", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hashes": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "428c7563d0b98ab9", + "hash_cont_tokens": "7ecd60c25b9bfe5b" + }, + "truncated": 0, + "non-truncated": 1060, + "padded": 1060, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_biology|5": { + "hashes": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "da036601573942e2", + "hash_cont_tokens": "875cde3af7a0ee14" + }, + "truncated": 0, + "non-truncated": 576, + "padded": 576, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_chemistry|5": { + "hashes": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "94e0196d6aded13d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_computer_science|5": { + "hashes": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "6e4d0f4a8d36690b", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_mathematics|5": { + "hashes": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "614054d17109a25d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + 
"effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_medicine|5": { + "hashes": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "081bb2b524defd1c", + "hash_cont_tokens": "702fb6d82ff0d6ac" + }, + "truncated": 0, + "non-truncated": 692, + "padded": 692, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_physics|5": { + "hashes": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "5421d9a1af86cbd4", + "hash_cont_tokens": "f7b8097afc16a47c" + }, + "truncated": 0, + "non-truncated": 408, + "padded": 408, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-computer_security|5": { + "hashes": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "5e6b70ecb333cf18", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hashes": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "c2ef11a87264ceed", + "hash_cont_tokens": "aa0e8bc655f2f641" + }, + "truncated": 0, + "non-truncated": 940, + "padded": 940, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-econometrics|5": { + "hashes": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "ecaccd912a4c3978", + "hash_cont_tokens": "b1cc6e7e9fcd3827" + }, + "truncated": 0, + "non-truncated": 456, + "padded": 456, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hashes": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "1590c84291399be8", + "hash_cont_tokens": "2425a3f084a591ef" + }, + "truncated": 0, + "non-truncated": 580, + "padded": 580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hashes": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "3269597f715b0da1", + "hash_cont_tokens": "bd87bf0c060fd925" + }, + "truncated": 0, + "non-truncated": 1512, + "padded": 1512, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-formal_logic|5": { + "hashes": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "a2800d20f3ab8d7c", + "hash_cont_tokens": "eb8932890e0605db" + }, + "truncated": 0, + "non-truncated": 504, + "padded": 504, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-global_facts|5": { + "hashes": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "94ed44b3772505ad", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_biology|5": { + "hashes": { + "hash_examples": 
"a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "24423acb928db768", + "hash_cont_tokens": "1ddcb86d28cde266" + }, + "truncated": 0, + "non-truncated": 1240, + "padded": 1240, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hashes": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "831ff35c474e5cef", + "hash_cont_tokens": "176c8dcff38c5f8f" + }, + "truncated": 0, + "non-truncated": 812, + "padded": 812, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hashes": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "a20a96b44dcc5b30", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hashes": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "5002f4ac8b1562ca", + "hash_cont_tokens": "674fc454bdc5ac93" + }, + "truncated": 0, + "non-truncated": 660, + "padded": 656, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_geography|5": { + "hashes": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "7c5547c7da5bc793", + "hash_cont_tokens": "03a5012b916274ea" + }, + "truncated": 0, + "non-truncated": 792, + "padded": 792, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hashes": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "f62991cb6a496b05", + "hash_cont_tokens": "873d2aab226ba1d8" + }, + "truncated": 0, + "non-truncated": 772, + "padded": 772, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hashes": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "4cef2aff6e3d59ed", + "hash_cont_tokens": "c583432ad27fcfe0" + }, + "truncated": 0, + "non-truncated": 1560, + "padded": 1560, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hashes": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "6e2577ea4082ed2b", + "hash_cont_tokens": "d7907b61bcb8c123" + }, + "truncated": 0, + "non-truncated": 1080, + "padded": 1080, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hashes": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "c5fc9aeb1079c8e4", + "hash_cont_tokens": "f47f041de50333b9" + }, + "truncated": 0, + "non-truncated": 952, + "padded": 952, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_physics|5": { + "hashes": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + 
"hash_input_tokens": "555fc385cffa84ca", + "hash_cont_tokens": "0d56317b3e5eedb5" + }, + "truncated": 0, + "non-truncated": 604, + "padded": 604, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hashes": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "febd23cbf9973b7f", + "hash_cont_tokens": "09ba1243e7390c0f" + }, + "truncated": 0, + "non-truncated": 2180, + "padded": 2180, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hashes": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "400e55b56ee6fbd7", + "hash_cont_tokens": "9cc29889c3d3f77d" + }, + "truncated": 0, + "non-truncated": 864, + "padded": 864, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hashes": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "c639cce12a46ebad", + "hash_cont_tokens": "cdd0b3dc06d933e5" + }, + "truncated": 0, + "non-truncated": 816, + "padded": 816, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hashes": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "b9762065cce6f3a6", + "hash_cont_tokens": "e02816433ff28daf" + }, + "truncated": 0, + "non-truncated": 948, + "padded": 948, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_aging|5": { + "hashes": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "541a75f071dcf579", + "hash_cont_tokens": "142a4a8a1138a214" + }, + "truncated": 0, + "non-truncated": 892, + "padded": 892, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_sexuality|5": { + "hashes": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "04269e5c5a257dd9", + "hash_cont_tokens": "bc54813e809b796d" + }, + "truncated": 0, + "non-truncated": 524, + "padded": 524, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-international_law|5": { + "hashes": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "d93ba9d9d38e4397", + "hash_cont_tokens": "8ea8c5ff76a15bca" + }, + "truncated": 0, + "non-truncated": 484, + "padded": 484, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-jurisprudence|5": { + "hashes": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "9eeaccd2698b4f5a", + "hash_cont_tokens": "e3a8cd951b6e3469" + }, + "truncated": 0, + "non-truncated": 432, + "padded": 432, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hashes": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "b4f08f544f2b7576", + "hash_cont_tokens": "3e9e0bdc248fd88a" + }, + "truncated": 0, + "non-truncated": 652, + 
"padded": 648, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-machine_learning|5": { + "hashes": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "900c2a51f1174b9f", + "hash_cont_tokens": "55b12fb138c6a064" + }, + "truncated": 0, + "non-truncated": 448, + "padded": 448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-management|5": { + "hashes": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "6b36efb4689c6eca", + "hash_cont_tokens": "a01d6d39a83c4597" + }, + "truncated": 0, + "non-truncated": 412, + "padded": 412, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-marketing|5": { + "hashes": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "2aaac78a0cfed47a", + "hash_cont_tokens": "6aeaed4d823c98aa" + }, + "truncated": 0, + "non-truncated": 936, + "padded": 936, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-medical_genetics|5": { + "hashes": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "886ca823b41c094a", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-miscellaneous|5": { + "hashes": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "72fd71de7675e7d0", + "hash_cont_tokens": "9b0ab02a64603081" + }, + "truncated": 0, + "non-truncated": 3132, + "padded": 3132, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_disputes|5": { + "hashes": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "f3ca0dd8e7a1eb09", + "hash_cont_tokens": "3b8bbe9108e55ce9" + }, + "truncated": 0, + "non-truncated": 1384, + "padded": 1354, + "non-padded": 30, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hashes": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "3e793631e951f23c", + "hash_cont_tokens": "3e9bfc0362e97330" + }, + "truncated": 0, + "non-truncated": 3580, + "padded": 3580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-nutrition|5": { + "hashes": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "59753c2144ea93af", + "hash_cont_tokens": "23b2dc6ee2da4cfc" + }, + "truncated": 0, + "non-truncated": 1224, + "padded": 1224, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-philosophy|5": { + "hashes": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "bd8d3dbed15a8c34", + "hash_cont_tokens": "9f6ff69d23a48783" + }, + "truncated": 0, + "non-truncated": 1244, + "padded": 1244, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-prehistory|5": { + "hashes": { + "hash_examples": 
"8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "3573cd87facbb7c5", + "hash_cont_tokens": "d6458d743d875837" + }, + "truncated": 0, + "non-truncated": 1296, + "padded": 1296, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_accounting|5": { + "hashes": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "17e721bc1a7cbb47", + "hash_cont_tokens": "922a195f53a35662" + }, + "truncated": 0, + "non-truncated": 1128, + "padded": 1128, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_law|5": { + "hashes": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "c9f7583fff66d361", + "hash_cont_tokens": "2e590029ef41fbcd" + }, + "truncated": 0, + "non-truncated": 6136, + "padded": 6136, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_medicine|5": { + "hashes": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "40a933f829116f8d", + "hash_cont_tokens": "7cfee54dbddd5a98" + }, + "truncated": 0, + "non-truncated": 1088, + "padded": 1088, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_psychology|5": { + "hashes": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "0dfb73a8eb3f692c", + "hash_cont_tokens": "a86677b2a45c20e1" + }, + "truncated": 0, + "non-truncated": 2448, + "padded": 2448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-public_relations|5": { + "hashes": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "1710c6ba4c9f3cbd", + "hash_cont_tokens": "0d756ccaae031757" + }, + "truncated": 0, + "non-truncated": 440, + "padded": 440, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-security_studies|5": { + "hashes": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "32a03f1f22a6e103", + "hash_cont_tokens": "b2229bc2cfbf594b" + }, + "truncated": 0, + "non-truncated": 980, + "padded": 980, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-sociology|5": { + "hashes": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "828999f7624cbe7e", + "hash_cont_tokens": "c3a3bdfd177eed5b" + }, + "truncated": 0, + "non-truncated": 804, + "padded": 804, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hashes": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "42054621e718dbee", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-virology|5": { + "hashes": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "6c4f0aa4dc859c04", + "hash_cont_tokens": "af8b3658088cb37f" 
+ }, + "truncated": 0, + "non-truncated": 664, + "padded": 664, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-world_religions|5": { + "hashes": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "6c75d44e092ff24f", + "hash_cont_tokens": "060118bef6de4e0a" + }, + "truncated": 0, + "non-truncated": 684, + "padded": 684, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|truthfulqa:mc|0": { + "hashes": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "2738d7ed7075faa7", + "hash_cont_tokens": "f5da56a132aab151" + }, + "truncated": 0, + "non-truncated": 9996, + "padded": 9996, + "non-padded": 0, + "effective_few_shots": 0.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "d84d18e9a963753d", + "hash_full_prompts": "12b540783521a8e6", + "hash_input_tokens": "5c73a7dce6ccf737", + "hash_cont_tokens": "71d56183130fecbd" + }, + "total_evaluation_time_secondes": "6361.08873462677", + "truncated": 0, + "non-truncated": 111019, + "padded": 110926, + "non-padded": 93, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/wei123602/llama2-13b-fintune2-4E/results_2023-10-23T08-37-29.290046.json b/eval-results/wei123602/llama2-13b-fintune2-4E/results_2023-10-23T08-37-29.290046.json new file mode 100644 index 0000000000000000000000000000000000000000..275067d148cb45ba9f9b228b9916035f1b7b4f50 --- /dev/null +++ b/eval-results/wei123602/llama2-13b-fintune2-4E/results_2023-10-23T08-37-29.290046.json @@ -0,0 +1,107 @@ +{ + "config_general": { + "model_name": "wei123602/llama2-13b-fintune2-4E", + "model_sha": "645ede9d6ec60d8fa051bc7ad32ab5f7bfdc066d", + "model_size": "24.32 GB", + "model_dtype": "torch.float16", + "lighteval_sha": "0f318ecf002208468154899217b3ba7c6ae09374", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|drop|3": { + "em": 0.33913590604026844, + "em_stderr": 0.004848223319148492, + "f1": 0.3781501677852353, + "f1_stderr": 0.004773695048987946 + }, + "harness|gsm8k|5": { + "acc": 0.10917361637604246, + "acc_stderr": 0.00859008930051116 + }, + "harness|winogrande|5": { + "acc": 0.7308602999210734, + "acc_stderr": 0.012464911951268738 + }, + "all": { + "em": 0.33913590604026844, + "em_stderr": 0.004848223319148492, + "f1": 0.3781501677852353, + "f1_stderr": 0.004773695048987946, + "acc": 0.42001695814855794, + "acc_stderr": 0.01052750062588995 + } + }, + "versions": { + "harness|drop|3": 1, + "harness|gsm8k|5": 0, + "harness|winogrande|5": 0, + "all": 0 + }, + "config_tasks": { + "harness|drop": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|drop|3": { + "hashes": { + "hash_examples": "1d27416e8324e9a3", + "hash_full_prompts": "a5513ff9a741b385", + "hash_input_tokens": "42076f0efbb50aa6", + "hash_cont_tokens": "9a30c0b2e0f3e8f8" + }, + "truncated": 3, + "non-truncated": 9533, + "padded": 0, + "non-padded": 9536, + "effective_few_shots": 3.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "bda342e47b5099b2", + "hash_cont_tokens": "699b29e9cd63730c" + }, + "truncated": 0, + 
"non-truncated": 1319, + "padded": 0, + "non-padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "c0bedf98cb040854", + "hash_cont_tokens": "f08975ad6f2d5864" + }, + "truncated": 0, + "non-truncated": 2534, + "padded": 2432, + "non-padded": 102, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "9b4d8993161e637d", + "hash_full_prompts": "08215e527b7e60a5", + "hash_input_tokens": "a12f3e3c934bd78b", + "hash_cont_tokens": "4a8aa9b4b329117b" + }, + "total_evaluation_time_secondes": "9639.199330568314", + "truncated": 3, + "non-truncated": 13386, + "padded": 2432, + "non-padded": 10957, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/wtang06/mpt-125m-c4/results_2023-10-03T23-04-52.221778.json b/eval-results/wtang06/mpt-125m-c4/results_2023-10-03T23-04-52.221778.json new file mode 100644 index 0000000000000000000000000000000000000000..ba7ab13ff38f7af9bef3d9ba5194f2079cb2d52a --- /dev/null +++ b/eval-results/wtang06/mpt-125m-c4/results_2023-10-03T23-04-52.221778.json @@ -0,0 +1,1367 @@ +{ + "config_general": { + "model_name": "wtang06/mpt-125m-c4", + "model_sha": "55f8f1874aa8bf4fc28c0abc92c7fbd1271ff7d7", + "model_size": "235.82 MB", + "model_dtype": "torch.float16", + "lighteval_sha": "0f318ecf002208468154899217b3ba7c6ae09374", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|arc:challenge|25": { + "acc": 0.22696245733788395, + "acc_stderr": 0.012240491536132861, + "acc_norm": 0.22696245733788395, + "acc_norm_stderr": 0.012240491536132861 + }, + "harness|hellaswag|10": { + "acc": 0.2504481179047998, + "acc_stderr": 0.004323856300539177, + "acc_norm": 0.2504481179047998, + "acc_norm_stderr": 0.004323856300539177 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.22, + "acc_stderr": 0.04163331998932268, + "acc_norm": 0.22, + "acc_norm_stderr": 0.04163331998932268 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.18518518518518517, + "acc_stderr": 0.03355677216313142, + "acc_norm": 0.18518518518518517, + "acc_norm_stderr": 0.03355677216313142 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.17763157894736842, + "acc_stderr": 0.031103182383123398, + "acc_norm": 0.17763157894736842, + "acc_norm_stderr": 0.031103182383123398 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.3, + "acc_stderr": 0.046056618647183814, + "acc_norm": 0.3, + "acc_norm_stderr": 0.046056618647183814 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.21509433962264152, + "acc_stderr": 0.02528839450289137, + "acc_norm": 0.21509433962264152, + "acc_norm_stderr": 0.02528839450289137 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.2569444444444444, + "acc_stderr": 0.03653946969442099, + "acc_norm": 0.2569444444444444, + "acc_norm_stderr": 0.03653946969442099 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.2, + "acc_stderr": 0.04020151261036845, + "acc_norm": 0.2, + "acc_norm_stderr": 0.04020151261036845 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.26, + "acc_stderr": 0.0440844002276808, + "acc_norm": 0.26, + "acc_norm_stderr": 0.0440844002276808 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.21, + 
"acc_stderr": 0.040936018074033256, + "acc_norm": 0.21, + "acc_norm_stderr": 0.040936018074033256 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.20809248554913296, + "acc_stderr": 0.030952890217749874, + "acc_norm": 0.20809248554913296, + "acc_norm_stderr": 0.030952890217749874 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.21568627450980393, + "acc_stderr": 0.04092563958237654, + "acc_norm": 0.21568627450980393, + "acc_norm_stderr": 0.04092563958237654 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.28, + "acc_stderr": 0.045126085985421276, + "acc_norm": 0.28, + "acc_norm_stderr": 0.045126085985421276 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.26382978723404255, + "acc_stderr": 0.028809989854102973, + "acc_norm": 0.26382978723404255, + "acc_norm_stderr": 0.028809989854102973 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.23684210526315788, + "acc_stderr": 0.039994238792813365, + "acc_norm": 0.23684210526315788, + "acc_norm_stderr": 0.039994238792813365 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.2413793103448276, + "acc_stderr": 0.03565998174135302, + "acc_norm": 0.2413793103448276, + "acc_norm_stderr": 0.03565998174135302 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.20899470899470898, + "acc_stderr": 0.02094048156533486, + "acc_norm": 0.20899470899470898, + "acc_norm_stderr": 0.02094048156533486 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.2857142857142857, + "acc_stderr": 0.04040610178208841, + "acc_norm": 0.2857142857142857, + "acc_norm_stderr": 0.04040610178208841 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.18, + "acc_stderr": 0.038612291966536934, + "acc_norm": 0.18, + "acc_norm_stderr": 0.038612291966536934 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.1774193548387097, + "acc_stderr": 0.02173254068932927, + "acc_norm": 0.1774193548387097, + "acc_norm_stderr": 0.02173254068932927 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.15270935960591134, + "acc_stderr": 0.02530890453938063, + "acc_norm": 0.15270935960591134, + "acc_norm_stderr": 0.02530890453938063 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.25, + "acc_stderr": 0.04351941398892446, + "acc_norm": 0.25, + "acc_norm_stderr": 0.04351941398892446 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.21818181818181817, + "acc_stderr": 0.03225078108306289, + "acc_norm": 0.21818181818181817, + "acc_norm_stderr": 0.03225078108306289 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.17676767676767677, + "acc_stderr": 0.027178752639044915, + "acc_norm": 0.17676767676767677, + "acc_norm_stderr": 0.027178752639044915 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.19689119170984457, + "acc_stderr": 0.028697873971860664, + "acc_norm": 0.19689119170984457, + "acc_norm_stderr": 0.028697873971860664 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.20256410256410257, + "acc_stderr": 0.020377660970371372, + "acc_norm": 0.20256410256410257, + "acc_norm_stderr": 0.020377660970371372 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.2111111111111111, + "acc_stderr": 0.024882116857655075, + "acc_norm": 0.2111111111111111, + "acc_norm_stderr": 0.024882116857655075 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.21008403361344538, + "acc_stderr": 
0.026461398717471874, + "acc_norm": 0.21008403361344538, + "acc_norm_stderr": 0.026461398717471874 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.1986754966887417, + "acc_stderr": 0.03257847384436776, + "acc_norm": 0.1986754966887417, + "acc_norm_stderr": 0.03257847384436776 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.1926605504587156, + "acc_stderr": 0.016909276884936094, + "acc_norm": 0.1926605504587156, + "acc_norm_stderr": 0.016909276884936094 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.1527777777777778, + "acc_stderr": 0.024536326026134224, + "acc_norm": 0.1527777777777778, + "acc_norm_stderr": 0.024536326026134224 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.25, + "acc_stderr": 0.03039153369274154, + "acc_norm": 0.25, + "acc_norm_stderr": 0.03039153369274154 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.270042194092827, + "acc_stderr": 0.028900721906293426, + "acc_norm": 0.270042194092827, + "acc_norm_stderr": 0.028900721906293426 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.31390134529147984, + "acc_stderr": 0.031146796482972465, + "acc_norm": 0.31390134529147984, + "acc_norm_stderr": 0.031146796482972465 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.2595419847328244, + "acc_stderr": 0.03844876139785271, + "acc_norm": 0.2595419847328244, + "acc_norm_stderr": 0.03844876139785271 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.2396694214876033, + "acc_stderr": 0.03896878985070417, + "acc_norm": 0.2396694214876033, + "acc_norm_stderr": 0.03896878985070417 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.25925925925925924, + "acc_stderr": 0.042365112580946336, + "acc_norm": 0.25925925925925924, + "acc_norm_stderr": 0.042365112580946336 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.22085889570552147, + "acc_stderr": 0.032591773927421776, + "acc_norm": 0.22085889570552147, + "acc_norm_stderr": 0.032591773927421776 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.3125, + "acc_stderr": 0.043994650575715215, + "acc_norm": 0.3125, + "acc_norm_stderr": 0.043994650575715215 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.17475728155339806, + "acc_stderr": 0.037601780060266224, + "acc_norm": 0.17475728155339806, + "acc_norm_stderr": 0.037601780060266224 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.2905982905982906, + "acc_stderr": 0.02974504857267404, + "acc_norm": 0.2905982905982906, + "acc_norm_stderr": 0.02974504857267404 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.3, + "acc_stderr": 0.046056618647183814, + "acc_norm": 0.3, + "acc_norm_stderr": 0.046056618647183814 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.23754789272030652, + "acc_stderr": 0.015218733046150193, + "acc_norm": 0.23754789272030652, + "acc_norm_stderr": 0.015218733046150193 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.24855491329479767, + "acc_stderr": 0.023267528432100174, + "acc_norm": 0.24855491329479767, + "acc_norm_stderr": 0.023267528432100174 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.23798882681564246, + "acc_stderr": 0.014242630070574915, + "acc_norm": 0.23798882681564246, + "acc_norm_stderr": 0.014242630070574915 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.22549019607843138, + "acc_stderr": 0.023929155517351284, + "acc_norm": 0.22549019607843138, + "acc_norm_stderr": 0.023929155517351284 + }, + 
"harness|hendrycksTest-philosophy|5": { + "acc": 0.1864951768488746, + "acc_stderr": 0.02212243977248077, + "acc_norm": 0.1864951768488746, + "acc_norm_stderr": 0.02212243977248077 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.21604938271604937, + "acc_stderr": 0.022899162918445806, + "acc_norm": 0.21604938271604937, + "acc_norm_stderr": 0.022899162918445806 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.23404255319148937, + "acc_stderr": 0.025257861359432417, + "acc_norm": 0.23404255319148937, + "acc_norm_stderr": 0.025257861359432417 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.2457627118644068, + "acc_stderr": 0.010996156635142692, + "acc_norm": 0.2457627118644068, + "acc_norm_stderr": 0.010996156635142692 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.18382352941176472, + "acc_stderr": 0.023529242185193106, + "acc_norm": 0.18382352941176472, + "acc_norm_stderr": 0.023529242185193106 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.25, + "acc_stderr": 0.01751781884501444, + "acc_norm": 0.25, + "acc_norm_stderr": 0.01751781884501444 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.21818181818181817, + "acc_stderr": 0.03955932861795833, + "acc_norm": 0.21818181818181817, + "acc_norm_stderr": 0.03955932861795833 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.18775510204081633, + "acc_stderr": 0.02500025603954621, + "acc_norm": 0.18775510204081633, + "acc_norm_stderr": 0.02500025603954621 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.24378109452736318, + "acc_stderr": 0.03036049015401465, + "acc_norm": 0.24378109452736318, + "acc_norm_stderr": 0.03036049015401465 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.28, + "acc_stderr": 0.04512608598542128, + "acc_norm": 0.28, + "acc_norm_stderr": 0.04512608598542128 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.28313253012048195, + "acc_stderr": 0.03507295431370518, + "acc_norm": 0.28313253012048195, + "acc_norm_stderr": 0.03507295431370518 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.3216374269005848, + "acc_stderr": 0.03582529442573122, + "acc_norm": 0.3216374269005848, + "acc_norm_stderr": 0.03582529442573122 + }, + "harness|truthfulqa:mc|0": { + "mc1": 1.0, + "mc1_stderr": 0.0, + "mc2": NaN, + "mc2_stderr": NaN + }, + "all": { + "acc": 0.2314240573187148, + "acc_stderr": 0.03071122006512167, + "acc_norm": 0.2314240573187148, + "acc_norm_stderr": 0.03071122006512167, + "mc1": 1.0, + "mc1_stderr": 0.0, + "mc2": NaN, + "mc2_stderr": NaN + } + }, + "versions": { + "harness|arc:challenge|25": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + 
"harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "all": 0 + }, + "config_tasks": { + "harness|arc:challenge": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + 
"harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task" + }, + "summary_tasks": { + "harness|arc:challenge|25": { + "hashes": { + "hash_examples": "17b0cae357c0259e", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "573b1b078b6e9deb", + "hash_cont_tokens": "d9940905d0c552c9" + }, + "truncated": 0, + "non-truncated": 4687, + "padded": 4687, + "non-padded": 0, + "effective_few_shots": 25.0, + "num_truncated_few_shots": 0 + }, + "harness|hellaswag|10": { + "hashes": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "f0fd0caf4d4c1110", + "hash_cont_tokens": "5a151675bb24bc7e" + }, + "truncated": 0, + "non-truncated": 40168, + "padded": 40123, + "non-padded": 45, + "effective_few_shots": 10.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hashes": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "f076ac6b177ca28c", + "hash_cont_tokens": "74c639e56bb475af" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-anatomy|5": { + "hashes": { + 
"hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "059827606e6b0780", + "hash_cont_tokens": "ec7e2288ab5f1ce9" + }, + "truncated": 0, + "non-truncated": 540, + "padded": 540, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-astronomy|5": { + "hashes": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "1dd0dab88aa9e4b2", + "hash_cont_tokens": "044d83cac9e59cbb" + }, + "truncated": 0, + "non-truncated": 608, + "padded": 608, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-business_ethics|5": { + "hashes": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "d51eb5246cbe2173", + "hash_cont_tokens": "74c639e56bb475af" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hashes": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "2337a7f17800c6ec", + "hash_cont_tokens": "bc82b3cc5072f164" + }, + "truncated": 0, + "non-truncated": 1060, + "padded": 1060, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_biology|5": { + "hashes": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "e394ebbb8ceace76", + "hash_cont_tokens": "3bc45e0c4b6d612d" + }, + "truncated": 0, + "non-truncated": 576, + "padded": 576, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_chemistry|5": { + "hashes": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "9221fbdf710a6f67", + "hash_cont_tokens": "74c639e56bb475af" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_computer_science|5": { + "hashes": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "ebe2748d21b2ba41", + "hash_cont_tokens": "74c639e56bb475af" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_mathematics|5": { + "hashes": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "bfecefb08ffb7faa", + "hash_cont_tokens": "74c639e56bb475af" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_medicine|5": { + "hashes": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "2ac8aec9025dc58b", + "hash_cont_tokens": "16f654508cdc19c4" + }, + "truncated": 0, + "non-truncated": 692, + "padded": 680, + "non-padded": 12, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_physics|5": { + "hashes": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "faf44c77f43368ef", + "hash_cont_tokens": 
"a3a24586c7218684" + }, + "truncated": 0, + "non-truncated": 408, + "padded": 408, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-computer_security|5": { + "hashes": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "280c7f12abde10a5", + "hash_cont_tokens": "74c639e56bb475af" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hashes": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "217a841c86d2d992", + "hash_cont_tokens": "43818b3dc0c7496f" + }, + "truncated": 0, + "non-truncated": 940, + "padded": 940, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-econometrics|5": { + "hashes": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "354267c0f98aad3b", + "hash_cont_tokens": "cff195e157be949a" + }, + "truncated": 0, + "non-truncated": 456, + "padded": 456, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hashes": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "4f5e8d051d04dde0", + "hash_cont_tokens": "7e14ccd1e2688bb8" + }, + "truncated": 0, + "non-truncated": 580, + "padded": 580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hashes": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "cd12bec1d5448dda", + "hash_cont_tokens": "62f751399492015f" + }, + "truncated": 0, + "non-truncated": 1512, + "padded": 1488, + "non-padded": 24, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-formal_logic|5": { + "hashes": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "c549e395850984fe", + "hash_cont_tokens": "961939aeb671801f" + }, + "truncated": 0, + "non-truncated": 504, + "padded": 504, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-global_facts|5": { + "hashes": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "81b06f5caa221f97", + "hash_cont_tokens": "74c639e56bb475af" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_biology|5": { + "hashes": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "ad626d781102fe51", + "hash_cont_tokens": "d7a3b149f7e83a27" + }, + "truncated": 0, + "non-truncated": 1240, + "padded": 1240, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hashes": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "2c0d3f2eacc6bbd5", + "hash_cont_tokens": "b2579ba9c4c7423e" + }, + "truncated": 0, + "non-truncated": 812, + "padded": 812, + "non-padded": 0, + "effective_few_shots": 5.0, + 
"num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hashes": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "aada51d0571db37b", + "hash_cont_tokens": "74c639e56bb475af" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hashes": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "6e47d696116edd01", + "hash_cont_tokens": "47a5e5973f50fe17" + }, + "truncated": 660, + "non-truncated": 0, + "padded": 0, + "non-padded": 660, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_geography|5": { + "hashes": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "0e8ee6c9e572e3c4", + "hash_cont_tokens": "812f79117b9593de" + }, + "truncated": 0, + "non-truncated": 792, + "padded": 792, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hashes": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "8fa2bf90de3b07e7", + "hash_cont_tokens": "5d4317e7acbf10e5" + }, + "truncated": 0, + "non-truncated": 772, + "padded": 772, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hashes": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "fabb8f176276af2f", + "hash_cont_tokens": "8d468d84a686647d" + }, + "truncated": 0, + "non-truncated": 1560, + "padded": 1560, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hashes": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "3e86d13ef021476a", + "hash_cont_tokens": "5ef6ef9328ef5238" + }, + "truncated": 0, + "non-truncated": 1080, + "padded": 1069, + "non-padded": 11, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hashes": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "a132b5e9c9531b36", + "hash_cont_tokens": "4c32e38c066727bc" + }, + "truncated": 0, + "non-truncated": 952, + "padded": 952, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_physics|5": { + "hashes": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "f8f6fe5143776cb4", + "hash_cont_tokens": "bf29d47c925caba6" + }, + "truncated": 0, + "non-truncated": 604, + "padded": 604, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hashes": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "e28121967b27a315", + "hash_cont_tokens": "45f02bc4af60f027" + }, + "truncated": 0, + "non-truncated": 2180, + "padded": 2180, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + 
"harness|hendrycksTest-high_school_statistics|5": { + "hashes": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "bdbe90efb4a1c4ce", + "hash_cont_tokens": "b15e06c7557a0ca1" + }, + "truncated": 0, + "non-truncated": 864, + "padded": 864, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hashes": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "b8f58f05dc082011", + "hash_cont_tokens": "e5ab34a54e3f5b7c" + }, + "truncated": 816, + "non-truncated": 0, + "padded": 0, + "non-padded": 816, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hashes": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "3af911bf93093a85", + "hash_cont_tokens": "3b99b36f60960908" + }, + "truncated": 0, + "non-truncated": 948, + "padded": 948, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_aging|5": { + "hashes": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "1dd2240eb90b9a70", + "hash_cont_tokens": "7982edf99219e1b0" + }, + "truncated": 0, + "non-truncated": 892, + "padded": 892, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_sexuality|5": { + "hashes": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "f3de2f8181824a79", + "hash_cont_tokens": "ed73d516c5552dd0" + }, + "truncated": 0, + "non-truncated": 524, + "padded": 524, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-international_law|5": { + "hashes": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "0c2a1dd63cc74137", + "hash_cont_tokens": "6b17b0774106ed83" + }, + "truncated": 0, + "non-truncated": 484, + "padded": 484, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-jurisprudence|5": { + "hashes": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "08e3527985f33aab", + "hash_cont_tokens": "ddf5241e450210d6" + }, + "truncated": 0, + "non-truncated": 432, + "padded": 432, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hashes": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "bf7216a648529f68", + "hash_cont_tokens": "eb791fcbee9e0682" + }, + "truncated": 0, + "non-truncated": 652, + "padded": 648, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-machine_learning|5": { + "hashes": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "28f5891c956afd65", + "hash_cont_tokens": "ed6f21d7fec8cbab" + }, + "truncated": 0, + "non-truncated": 448, + "padded": 448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-management|5": { + "hashes": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + 
"hash_input_tokens": "6de88b824d4f64c3", + "hash_cont_tokens": "27795e9c98bdeda8" + }, + "truncated": 0, + "non-truncated": 412, + "padded": 412, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-marketing|5": { + "hashes": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "5ef855d01044fd83", + "hash_cont_tokens": "874c5b0b496cbe8a" + }, + "truncated": 0, + "non-truncated": 936, + "padded": 936, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-medical_genetics|5": { + "hashes": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "1840e0b96d7e619e", + "hash_cont_tokens": "74c639e56bb475af" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-miscellaneous|5": { + "hashes": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "02483f6b53dc13ac", + "hash_cont_tokens": "313ee361fbdbab3c" + }, + "truncated": 0, + "non-truncated": 3132, + "padded": 3132, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_disputes|5": { + "hashes": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "93202e79d594dde4", + "hash_cont_tokens": "bfc9a5db80e5bba3" + }, + "truncated": 0, + "non-truncated": 1384, + "padded": 1356, + "non-padded": 28, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hashes": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "41c03f41d2ba9fe7", + "hash_cont_tokens": "b6b5d477136351d3" + }, + "truncated": 0, + "non-truncated": 3580, + "padded": 3580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-nutrition|5": { + "hashes": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "d83bcb6dd08809ac", + "hash_cont_tokens": "497c8d5896f280f6" + }, + "truncated": 0, + "non-truncated": 1224, + "padded": 1224, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-philosophy|5": { + "hashes": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "65c70474c8a5d205", + "hash_cont_tokens": "7916d26928435f1a" + }, + "truncated": 0, + "non-truncated": 1244, + "padded": 1244, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-prehistory|5": { + "hashes": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "4d4126ac9a91ac47", + "hash_cont_tokens": "88542052394953bd" + }, + "truncated": 0, + "non-truncated": 1296, + "padded": 1296, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_accounting|5": { + "hashes": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "592f80ad364d686a", + "hash_cont_tokens": "316cf4c387aa53e3" + }, + "truncated": 0, + "non-truncated": 1128, + "padded": 1128, + "non-padded": 0, + 
"effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_law|5": { + "hashes": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "7f837322b1b62ac1", + "hash_cont_tokens": "6b31cf265df9b81b" + }, + "truncated": 16, + "non-truncated": 6120, + "padded": 6120, + "non-padded": 16, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_medicine|5": { + "hashes": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "05a8ef0dd10b4bba", + "hash_cont_tokens": "ce95c9ee454fdf64" + }, + "truncated": 0, + "non-truncated": 1088, + "padded": 1088, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_psychology|5": { + "hashes": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "3c7944f0b2c49f64", + "hash_cont_tokens": "0782e6576a3a8785" + }, + "truncated": 0, + "non-truncated": 2448, + "padded": 2448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-public_relations|5": { + "hashes": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "637e934bb716d5ec", + "hash_cont_tokens": "ca79966b90cda0ea" + }, + "truncated": 0, + "non-truncated": 440, + "padded": 440, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-security_studies|5": { + "hashes": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "3bad229573ed6a9c", + "hash_cont_tokens": "5e8fd3201be1a1f4" + }, + "truncated": 0, + "non-truncated": 980, + "padded": 980, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-sociology|5": { + "hashes": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "70a479e96d02d5d8", + "hash_cont_tokens": "f49476cf49b37d7c" + }, + "truncated": 0, + "non-truncated": 804, + "padded": 804, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hashes": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "0d690fc0db462440", + "hash_cont_tokens": "74c639e56bb475af" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-virology|5": { + "hashes": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "4b0fdf8e692dd640", + "hash_cont_tokens": "0065c4bbe6134c1c" + }, + "truncated": 0, + "non-truncated": 664, + "padded": 664, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-world_religions|5": { + "hashes": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "cfd7092dc8aacd96", + "hash_cont_tokens": "a111a36329479373" + }, + "truncated": 0, + "non-truncated": 684, + "padded": 684, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|truthfulqa:mc|0": { + "hashes": { + "hash_examples": "23176c0531c7b867", + 
"hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "e820abadeb7ebfb3", + "hash_cont_tokens": "87e1c2b162b3e4c6" + }, + "truncated": 0, + "non-truncated": 9996, + "padded": 9996, + "non-padded": 0, + "effective_few_shots": 0.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "d84d18e9a963753d", + "hash_full_prompts": "12b540783521a8e6", + "hash_input_tokens": "c86f5765cd1e9dab", + "hash_cont_tokens": "70be634de3673b78" + }, + "total_evaluation_time_secondes": "1126.340002298355", + "truncated": 1492, + "non-truncated": 109527, + "padded": 109403, + "non-padded": 1616, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/wtang06/mpt-125m-c4/results_2023-10-28T21-05-22.097768.json b/eval-results/wtang06/mpt-125m-c4/results_2023-10-28T21-05-22.097768.json new file mode 100644 index 0000000000000000000000000000000000000000..45bafb5700cd83037f632dfa6e09f104ada04849 --- /dev/null +++ b/eval-results/wtang06/mpt-125m-c4/results_2023-10-28T21-05-22.097768.json @@ -0,0 +1,107 @@ +{ + "config_general": { + "model_name": "wtang06/mpt-125m-c4", + "model_sha": "f13efec5c8498cb52998eb9ed347207f077b5f9d", + "model_size": "235.82 MB", + "model_dtype": "torch.float16", + "lighteval_sha": "0f318ecf002208468154899217b3ba7c6ae09374", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|drop|3": { + "em": 0.0, + "em_stderr": 0.0, + "f1": 0.0, + "f1_stderr": 0.0 + }, + "harness|gsm8k|5": { + "acc": 0.0, + "acc_stderr": 0.0 + }, + "harness|winogrande|5": { + "acc": 0.4956590370955012, + "acc_stderr": 0.014051956064076911 + }, + "all": { + "em": 0.0, + "em_stderr": 0.0, + "f1": 0.0, + "f1_stderr": 0.0, + "acc": 0.2478295185477506, + "acc_stderr": 0.007025978032038456 + } + }, + "versions": { + "harness|drop|3": 1, + "harness|gsm8k|5": 0, + "harness|winogrande|5": 0, + "all": 0 + }, + "config_tasks": { + "harness|drop": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|drop|3": { + "hashes": { + "hash_examples": "1d27416e8324e9a3", + "hash_full_prompts": "a5513ff9a741b385", + "hash_input_tokens": "4bf3f6ba1bae765a", + "hash_cont_tokens": "6511033526e3295c" + }, + "truncated": 439, + "non-truncated": 9097, + "padded": 0, + "non-padded": 9536, + "effective_few_shots": 3.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "ef516f9ffbe76423", + "hash_cont_tokens": "305d44c2a2094356" + }, + "truncated": 0, + "non-truncated": 1319, + "padded": 0, + "non-padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "c469718508f43cab", + "hash_cont_tokens": "87eeb79172195781" + }, + "truncated": 0, + "non-truncated": 2534, + "padded": 2456, + "non-padded": 78, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "9b4d8993161e637d", + "hash_full_prompts": "08215e527b7e60a5", + "hash_input_tokens": "401c6c49053f17ab", + "hash_cont_tokens": "baedac3d168fa0e7" + }, + "total_evaluation_time_secondes": "139.81891322135925", + "truncated": 439, + "non-truncated": 12950, + "padded": 2456, + 
"non-padded": 10933, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/wtang06/mpt-125m-c4/results_2023-12-02T13-54-27.484252.json b/eval-results/wtang06/mpt-125m-c4/results_2023-12-02T13-54-27.484252.json new file mode 100644 index 0000000000000000000000000000000000000000..d7a4c8b87cafd6dc90e7486f531c120ed77d08b9 --- /dev/null +++ b/eval-results/wtang06/mpt-125m-c4/results_2023-12-02T13-54-27.484252.json @@ -0,0 +1,63 @@ +{ + "config_general": { + "lighteval_sha": "b35d4d84573be82d91c07ea46260f262f72cf69d", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "", + "start_time": 1405119.032025634, + "end_time": 1405168.573125591, + "total_evaluation_time_secondes": "49.54109995695762", + "model_name": "wtang06/mpt-125m-c4", + "model_sha": "f13efec5c8498cb52998eb9ed347207f077b5f9d", + "model_dtype": "torch.float16", + "model_size": "235.82 MB" + }, + "results": { + "harness|gsm8k|5": { + "acc": 0.0, + "acc_stderr": 0.0 + }, + "all": { + "acc": 0.0, + "acc_stderr": 0.0 + } + }, + "versions": { + "all": 0, + "harness|gsm8k|5": 0 + }, + "config_tasks": { + "harness|gsm8k": "LM Harness task" + }, + "summary_tasks": { + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "ef516f9ffbe76423", + "hash_cont_tokens": "305d44c2a2094356" + }, + "truncated": 0, + "non_truncated": 1319, + "padded": 0, + "non_padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "18b756b7813d1bdf", + "hash_full_prompts": "deb3b1dff10b95aa", + "hash_input_tokens": "8b4e0cd6017ffd2e", + "hash_cont_tokens": "5f9d76dfa4938bfa" + }, + "truncated": 0, + "non_truncated": 1319, + "padded": 0, + "non_padded": 1319, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/yec019/fbopt-350m-8bit/results_2023-11-29T20-49-40.874458.json b/eval-results/yec019/fbopt-350m-8bit/results_2023-11-29T20-49-40.874458.json new file mode 100644 index 0000000000000000000000000000000000000000..3d50f36f8c8cca549270f57ac23276d3166f1523 --- /dev/null +++ b/eval-results/yec019/fbopt-350m-8bit/results_2023-11-29T20-49-40.874458.json @@ -0,0 +1,1435 @@ +{ + "config_general": { + "lighteval_sha": "9ffc410f6c40b8cfefe7167cb47aefe69ced61e1", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "", + "start_time": 1112879.529063406, + "end_time": 1144484.956563601, + "total_evaluation_time_secondes": "31605.427500195103", + "model_name": "yec019/fbopt-350m-8bit", + "model_sha": "305f804054d75a406a85a568ea99dca17cfc998d", + "model_dtype": "8bit", + "model_size": "342.71 MB" + }, + "results": { + "harness|arc:challenge|25": { + "acc": 0.20563139931740615, + "acc_stderr": 0.011810745260742581, + "acc_norm": 0.2354948805460751, + "acc_norm_stderr": 0.012399451855004746 + }, + "harness|hellaswag|10": { + "acc": 0.32085241983668594, + "acc_stderr": 0.00465850166227761, + "acc_norm": 0.3659629555865365, + "acc_norm_stderr": 0.004807146925162053 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.22, + "acc_stderr": 0.04163331998932268, + "acc_norm": 0.22, + "acc_norm_stderr": 0.04163331998932268 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.25925925925925924, + "acc_stderr": 0.03785714465066656, + "acc_norm": 0.25925925925925924, + "acc_norm_stderr": 
0.03785714465066656 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.17763157894736842, + "acc_stderr": 0.031103182383123398, + "acc_norm": 0.17763157894736842, + "acc_norm_stderr": 0.031103182383123398 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.21, + "acc_stderr": 0.040936018074033256, + "acc_norm": 0.21, + "acc_norm_stderr": 0.040936018074033256 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.25660377358490566, + "acc_stderr": 0.026880647889051985, + "acc_norm": 0.25660377358490566, + "acc_norm_stderr": 0.026880647889051985 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.2222222222222222, + "acc_stderr": 0.03476590104304135, + "acc_norm": 0.2222222222222222, + "acc_norm_stderr": 0.03476590104304135 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.35, + "acc_stderr": 0.0479372485441102, + "acc_norm": 0.35, + "acc_norm_stderr": 0.0479372485441102 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.32, + "acc_stderr": 0.04688261722621504, + "acc_norm": 0.32, + "acc_norm_stderr": 0.04688261722621504 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.27, + "acc_stderr": 0.0446196043338474, + "acc_norm": 0.27, + "acc_norm_stderr": 0.0446196043338474 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.24855491329479767, + "acc_stderr": 0.03295304696818318, + "acc_norm": 0.24855491329479767, + "acc_norm_stderr": 0.03295304696818318 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.19607843137254902, + "acc_stderr": 0.03950581861179962, + "acc_norm": 0.19607843137254902, + "acc_norm_stderr": 0.03950581861179962 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.18, + "acc_stderr": 0.038612291966536934, + "acc_norm": 0.18, + "acc_norm_stderr": 0.038612291966536934 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.2851063829787234, + "acc_stderr": 0.029513196625539355, + "acc_norm": 0.2851063829787234, + "acc_norm_stderr": 0.029513196625539355 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.21929824561403508, + "acc_stderr": 0.03892431106518754, + "acc_norm": 0.21929824561403508, + "acc_norm_stderr": 0.03892431106518754 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.2827586206896552, + "acc_stderr": 0.03752833958003337, + "acc_norm": 0.2827586206896552, + "acc_norm_stderr": 0.03752833958003337 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.2566137566137566, + "acc_stderr": 0.022494510767503154, + "acc_norm": 0.2566137566137566, + "acc_norm_stderr": 0.022494510767503154 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.2619047619047619, + "acc_stderr": 0.03932537680392869, + "acc_norm": 0.2619047619047619, + "acc_norm_stderr": 0.03932537680392869 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.19, + "acc_stderr": 0.03942772444036624, + "acc_norm": 0.19, + "acc_norm_stderr": 0.03942772444036624 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.3032258064516129, + "acc_stderr": 0.02614868593067175, + "acc_norm": 0.3032258064516129, + "acc_norm_stderr": 0.02614868593067175 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.3054187192118227, + "acc_stderr": 0.03240661565868408, + "acc_norm": 0.3054187192118227, + "acc_norm_stderr": 0.03240661565868408 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.19, + "acc_stderr": 0.039427724440366234, + "acc_norm": 0.19, + "acc_norm_stderr": 
0.039427724440366234 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.26666666666666666, + "acc_stderr": 0.03453131801885415, + "acc_norm": 0.26666666666666666, + "acc_norm_stderr": 0.03453131801885415 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.35858585858585856, + "acc_stderr": 0.03416903640391521, + "acc_norm": 0.35858585858585856, + "acc_norm_stderr": 0.03416903640391521 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.35233160621761656, + "acc_stderr": 0.03447478286414359, + "acc_norm": 0.35233160621761656, + "acc_norm_stderr": 0.03447478286414359 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.28205128205128205, + "acc_stderr": 0.022815813098896603, + "acc_norm": 0.28205128205128205, + "acc_norm_stderr": 0.022815813098896603 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.2740740740740741, + "acc_stderr": 0.027195934804085626, + "acc_norm": 0.2740740740740741, + "acc_norm_stderr": 0.027195934804085626 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.2815126050420168, + "acc_stderr": 0.029213549414372156, + "acc_norm": 0.2815126050420168, + "acc_norm_stderr": 0.029213549414372156 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.33774834437086093, + "acc_stderr": 0.038615575462551684, + "acc_norm": 0.33774834437086093, + "acc_norm_stderr": 0.038615575462551684 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.3376146788990826, + "acc_stderr": 0.020275265986638903, + "acc_norm": 0.3376146788990826, + "acc_norm_stderr": 0.020275265986638903 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.4722222222222222, + "acc_stderr": 0.0340470532865388, + "acc_norm": 0.4722222222222222, + "acc_norm_stderr": 0.0340470532865388 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.25980392156862747, + "acc_stderr": 0.030778554678693257, + "acc_norm": 0.25980392156862747, + "acc_norm_stderr": 0.030778554678693257 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.20253164556962025, + "acc_stderr": 0.026160568246601457, + "acc_norm": 0.20253164556962025, + "acc_norm_stderr": 0.026160568246601457 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.14798206278026907, + "acc_stderr": 0.023831557157613537, + "acc_norm": 0.14798206278026907, + "acc_norm_stderr": 0.023831557157613537 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.2748091603053435, + "acc_stderr": 0.03915345408847835, + "acc_norm": 0.2748091603053435, + "acc_norm_stderr": 0.03915345408847835 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.371900826446281, + "acc_stderr": 0.044120158066245044, + "acc_norm": 0.371900826446281, + "acc_norm_stderr": 0.044120158066245044 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.2222222222222222, + "acc_stderr": 0.040191074725573483, + "acc_norm": 0.2222222222222222, + "acc_norm_stderr": 0.040191074725573483 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.26993865030674846, + "acc_stderr": 0.034878251684978906, + "acc_norm": 0.26993865030674846, + "acc_norm_stderr": 0.034878251684978906 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.17857142857142858, + "acc_stderr": 0.036352091215778065, + "acc_norm": 0.17857142857142858, + "acc_norm_stderr": 0.036352091215778065 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.20388349514563106, + "acc_stderr": 
0.039891398595317706, + "acc_norm": 0.20388349514563106, + "acc_norm_stderr": 0.039891398595317706 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.2094017094017094, + "acc_stderr": 0.026655699653922754, + "acc_norm": 0.2094017094017094, + "acc_norm_stderr": 0.026655699653922754 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.3, + "acc_stderr": 0.046056618647183814, + "acc_norm": 0.3, + "acc_norm_stderr": 0.046056618647183814 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.2120051085568327, + "acc_stderr": 0.014616099385833695, + "acc_norm": 0.2120051085568327, + "acc_norm_stderr": 0.014616099385833695 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.2658959537572254, + "acc_stderr": 0.02378620325550829, + "acc_norm": 0.2658959537572254, + "acc_norm_stderr": 0.02378620325550829 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.2547486033519553, + "acc_stderr": 0.014572650383409153, + "acc_norm": 0.2547486033519553, + "acc_norm_stderr": 0.014572650383409153 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.26143790849673204, + "acc_stderr": 0.025160998214292456, + "acc_norm": 0.26143790849673204, + "acc_norm_stderr": 0.025160998214292456 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.19614147909967847, + "acc_stderr": 0.02255244778047803, + "acc_norm": 0.19614147909967847, + "acc_norm_stderr": 0.02255244778047803 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.27469135802469136, + "acc_stderr": 0.024836057868294677, + "acc_norm": 0.27469135802469136, + "acc_norm_stderr": 0.024836057868294677 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.2375886524822695, + "acc_stderr": 0.02538951255272991, + "acc_norm": 0.2375886524822695, + "acc_norm_stderr": 0.02538951255272991 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.23533246414602346, + "acc_stderr": 0.010834432543912224, + "acc_norm": 0.23533246414602346, + "acc_norm_stderr": 0.010834432543912224 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.4485294117647059, + "acc_stderr": 0.030211479609121593, + "acc_norm": 0.4485294117647059, + "acc_norm_stderr": 0.030211479609121593 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.24673202614379086, + "acc_stderr": 0.0174408203674025, + "acc_norm": 0.24673202614379086, + "acc_norm_stderr": 0.0174408203674025 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.21818181818181817, + "acc_stderr": 0.03955932861795833, + "acc_norm": 0.21818181818181817, + "acc_norm_stderr": 0.03955932861795833 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.40408163265306124, + "acc_stderr": 0.03141470802586589, + "acc_norm": 0.40408163265306124, + "acc_norm_stderr": 0.03141470802586589 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.24875621890547264, + "acc_stderr": 0.030567675938916718, + "acc_norm": 0.24875621890547264, + "acc_norm_stderr": 0.030567675938916718 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.26, + "acc_stderr": 0.04408440022768078, + "acc_norm": 0.26, + "acc_norm_stderr": 0.04408440022768078 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.19879518072289157, + "acc_stderr": 0.031069390260789427, + "acc_norm": 0.19879518072289157, + "acc_norm_stderr": 0.031069390260789427 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.17543859649122806, + "acc_stderr": 0.029170885500727654, + "acc_norm": 0.17543859649122806, + "acc_norm_stderr": 0.029170885500727654 + }, + 
"harness|truthfulqa:mc|0": { + "mc1": 0.23133414932680538, + "mc1_stderr": 0.014761945174862677, + "mc2": 0.4096850307211532, + "mc2_stderr": 0.014687908894010157 + }, + "harness|winogrande|5": { + "acc": 0.526440410418311, + "acc_stderr": 0.014032823874407225 + }, + "harness|drop|3": { + "em": 0.0009437919463087249, + "em_stderr": 0.0003144653119413515, + "f1": 0.04309563758389266, + "f1_stderr": 0.0011819267488546012 + }, + "harness|gsm8k|5": { + "acc": 0.004548900682335102, + "acc_stderr": 0.0018535550440036204 + }, + "all": { + "acc": 0.2623665323873139, + "acc_stderr": 0.030818259007654874, + "acc_norm": 0.2635312403644749, + "acc_norm_stderr": 0.03160618258314717, + "mc1": 0.23133414932680538, + "mc1_stderr": 0.014761945174862677, + "mc2": 0.4096850307211532, + "mc2_stderr": 0.014687908894010157, + "em": 0.0009437919463087249, + "em_stderr": 0.0003144653119413515, + "f1": 0.04309563758389266, + "f1_stderr": 0.0011819267488546012 + } + }, + "versions": { + "all": 0, + "harness|arc:challenge|25": 0, + "harness|drop|3": 1, + "harness|gsm8k|5": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + 
"harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "harness|winogrande|5": 0 + }, + "config_tasks": { + "harness|arc:challenge": "LM Harness task", + "harness|drop": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + 
"harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|arc:challenge|25": { + "hashes": { + "hash_examples": "17b0cae357c0259e", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "f765c58b007beb4c", + "hash_cont_tokens": "8950fe4d13528919" + }, + "truncated": 0, + "non_truncated": 1172, + "padded": 4675, + "non_padded": 12, + "effective_few_shots": 25.0, + "num_truncated_few_shots": 0 + }, + "harness|hellaswag|10": { + "hashes": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "acdb2069c2733f09", + "hash_cont_tokens": "16c7af5dff73e378" + }, + "truncated": 0, + "non_truncated": 10042, + "padded": 40156, + "non_padded": 12, + "effective_few_shots": 10.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hashes": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "3fb48b02f069ea2d", + "hash_cont_tokens": "65115fc130126941" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-anatomy|5": { + "hashes": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "78d876934672de6d", + "hash_cont_tokens": "705516ff46ec26dc" + }, + "truncated": 0, + "non_truncated": 135, + "padded": 540, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-astronomy|5": { + "hashes": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "9f376a455c119863", + "hash_cont_tokens": "dd8f9a00fa430bfb" + }, + "truncated": 0, + "non_truncated": 152, + "padded": 608, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-business_ethics|5": { + "hashes": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "062c37d794dfdb0a", + "hash_cont_tokens": "65115fc130126941" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hashes": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "88d8d8de0caa138d", + "hash_cont_tokens": "37477257cf9eeb0a" + }, + "truncated": 0, + "non_truncated": 265, + "padded": 1052, + "non_padded": 8, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + 
"harness|hendrycksTest-college_biology|5": { + "hashes": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "f077f7247680a87a", + "hash_cont_tokens": "38aaca72155981e7" + }, + "truncated": 0, + "non_truncated": 144, + "padded": 576, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_chemistry|5": { + "hashes": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "f7cfd540a0272c6c", + "hash_cont_tokens": "65115fc130126941" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_computer_science|5": { + "hashes": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "fb537f5a4c0a5ed8", + "hash_cont_tokens": "65115fc130126941" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_mathematics|5": { + "hashes": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "d5f6ea48598c3a47", + "hash_cont_tokens": "65115fc130126941" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_medicine|5": { + "hashes": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "984fb88ec0dd241e", + "hash_cont_tokens": "40630b2e3e33ca08" + }, + "truncated": 0, + "non_truncated": 173, + "padded": 684, + "non_padded": 8, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_physics|5": { + "hashes": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "d394cbc9ab3f140d", + "hash_cont_tokens": "4085a0ba4a98cf79" + }, + "truncated": 0, + "non_truncated": 102, + "padded": 400, + "non_padded": 8, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-computer_security|5": { + "hashes": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "4c86b42f43d710e4", + "hash_cont_tokens": "65115fc130126941" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hashes": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "4907abf7c2025ca2", + "hash_cont_tokens": "f15de85dda56bf9a" + }, + "truncated": 0, + "non_truncated": 235, + "padded": 940, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-econometrics|5": { + "hashes": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "f06b3f32932935fc", + "hash_cont_tokens": "a9e8ebf615a9326d" + }, + "truncated": 0, + "non_truncated": 114, + "padded": 456, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hashes": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + 
"hash_input_tokens": "0c633ea4192f844a", + "hash_cont_tokens": "1fec337497bf988f" + }, + "truncated": 0, + "non_truncated": 145, + "padded": 580, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hashes": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "82a81cfcc43fb540", + "hash_cont_tokens": "310df35a823eebec" + }, + "truncated": 0, + "non_truncated": 378, + "padded": 1512, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-formal_logic|5": { + "hashes": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "aa6038a55f7a10e3", + "hash_cont_tokens": "e6c5937e320af62c" + }, + "truncated": 0, + "non_truncated": 126, + "padded": 504, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-global_facts|5": { + "hashes": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "6a5beb36276c2c7d", + "hash_cont_tokens": "65115fc130126941" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_biology|5": { + "hashes": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "44bce55657c43e3a", + "hash_cont_tokens": "cfc7c792ea17a3c5" + }, + "truncated": 0, + "non_truncated": 310, + "padded": 1240, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hashes": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "fe90bd557fa59569", + "hash_cont_tokens": "54e28580ffc0bfc2" + }, + "truncated": 0, + "non_truncated": 203, + "padded": 800, + "non_padded": 12, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hashes": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "2ec97654351d6215", + "hash_cont_tokens": "65115fc130126941" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hashes": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "6d8596e5edbe236d", + "hash_cont_tokens": "2553c38072fe59e9" + }, + "truncated": 660, + "non_truncated": -495, + "padded": 0, + "non_padded": 660, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_geography|5": { + "hashes": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "fb6aeaa14d070a03", + "hash_cont_tokens": "967f1a6377c5dada" + }, + "truncated": 0, + "non_truncated": 198, + "padded": 788, + "non_padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hashes": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "811bf2d32b210e18", + "hash_cont_tokens": "c07c16300f6693a8" + }, + "truncated": 
0, + "non_truncated": 193, + "padded": 772, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hashes": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "43154e4688cb695e", + "hash_cont_tokens": "3c15870aa9a751c8" + }, + "truncated": 0, + "non_truncated": 390, + "padded": 1560, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hashes": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "bdbbcf6c749ebd3d", + "hash_cont_tokens": "215647dfcd14ec88" + }, + "truncated": 0, + "non_truncated": 270, + "padded": 1080, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hashes": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "0ba97d01bcb480a1", + "hash_cont_tokens": "7bfc49a85b0e6b0f" + }, + "truncated": 0, + "non_truncated": 238, + "padded": 952, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_physics|5": { + "hashes": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "e144357f4b702aaa", + "hash_cont_tokens": "52f7347c4fac20df" + }, + "truncated": 0, + "non_truncated": 151, + "padded": 604, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hashes": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "95e03e05fa651ad2", + "hash_cont_tokens": "530724492271f230" + }, + "truncated": 0, + "non_truncated": 545, + "padded": 2180, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hashes": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "8dcbdc1816261184", + "hash_cont_tokens": "8ab24e65ab6c9dec" + }, + "truncated": 0, + "non_truncated": 216, + "padded": 864, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hashes": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "2b245a8312dd0ee8", + "hash_cont_tokens": "19500e048c94127a" + }, + "truncated": 816, + "non_truncated": -612, + "padded": 0, + "non_padded": 816, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hashes": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "fa3b5b3bf631cd40", + "hash_cont_tokens": "5c019384c24c5c87" + }, + "truncated": 0, + "non_truncated": 237, + "padded": 948, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_aging|5": { + "hashes": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "f12b3e47af8a5d76", + "hash_cont_tokens": "350bc807db8602e4" + }, + "truncated": 0, + "non_truncated": 223, + "padded": 892, + "non_padded": 0, + "effective_few_shots": 5.0, + 
"num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_sexuality|5": { + "hashes": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "2906da4cda606d18", + "hash_cont_tokens": "944bf06e08c9e841" + }, + "truncated": 0, + "non_truncated": 131, + "padded": 524, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-international_law|5": { + "hashes": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "e2e0f9fbf3f8c8e3", + "hash_cont_tokens": "c3d9bbe04a3a3ec6" + }, + "truncated": 0, + "non_truncated": 121, + "padded": 484, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-jurisprudence|5": { + "hashes": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "7ef20f312d25a2a4", + "hash_cont_tokens": "3813b356ad4675eb" + }, + "truncated": 0, + "non_truncated": 108, + "padded": 428, + "non_padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hashes": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "3bcc88abda96a802", + "hash_cont_tokens": "75dadf0da8971dfb" + }, + "truncated": 0, + "non_truncated": 163, + "padded": 644, + "non_padded": 8, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-machine_learning|5": { + "hashes": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "1845efb6dad04919", + "hash_cont_tokens": "c0a92d0861b4f319" + }, + "truncated": 0, + "non_truncated": 112, + "padded": 448, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-management|5": { + "hashes": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "d99a8dc41025e5a6", + "hash_cont_tokens": "f6301f26d3421bfe" + }, + "truncated": 0, + "non_truncated": 103, + "padded": 412, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-marketing|5": { + "hashes": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "3fc44b6df8182a1b", + "hash_cont_tokens": "4bea1308c2dedd32" + }, + "truncated": 0, + "non_truncated": 234, + "padded": 932, + "non_padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-medical_genetics|5": { + "hashes": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "b728f083c1ae3783", + "hash_cont_tokens": "65115fc130126941" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-miscellaneous|5": { + "hashes": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "8f06d05786f4c5a1", + "hash_cont_tokens": "d87f2c7e8fda82f9" + }, + "truncated": 0, + "non_truncated": 783, + "padded": 3132, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_disputes|5": { + "hashes": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + 
"hash_input_tokens": "2ea52d13651068e6", + "hash_cont_tokens": "17673707c2169f5f" + }, + "truncated": 0, + "non_truncated": 346, + "padded": 1384, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hashes": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "feb026fdc0b3006a", + "hash_cont_tokens": "b635076feea5cad5" + }, + "truncated": 0, + "non_truncated": 895, + "padded": 3580, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-nutrition|5": { + "hashes": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "704bd61839c94d71", + "hash_cont_tokens": "bcc2d8e8a9e3418a" + }, + "truncated": 0, + "non_truncated": 306, + "padded": 1224, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-philosophy|5": { + "hashes": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "784b04e2e3ecbade", + "hash_cont_tokens": "4b9e620ce1055d4a" + }, + "truncated": 0, + "non_truncated": 311, + "padded": 1240, + "non_padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-prehistory|5": { + "hashes": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "8ae177e631521d59", + "hash_cont_tokens": "3f04832c8adc4e0a" + }, + "truncated": 0, + "non_truncated": 324, + "padded": 1296, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_accounting|5": { + "hashes": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "0c4c5114953e5267", + "hash_cont_tokens": "f90b50e98b70a7a0" + }, + "truncated": 0, + "non_truncated": 282, + "padded": 1128, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_law|5": { + "hashes": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "7105767805e28747", + "hash_cont_tokens": "f0b059007537e041" + }, + "truncated": 16, + "non_truncated": 1518, + "padded": 6120, + "non_padded": 16, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_medicine|5": { + "hashes": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "f04f0a03ea895b5b", + "hash_cont_tokens": "24b5ea844b832ad0" + }, + "truncated": 0, + "non_truncated": 272, + "padded": 1088, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_psychology|5": { + "hashes": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "bc5980633504fca8", + "hash_cont_tokens": "f876ca951a9ec767" + }, + "truncated": 0, + "non_truncated": 612, + "padded": 2448, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-public_relations|5": { + "hashes": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "53d70b923acfd31e", + "hash_cont_tokens": "1bda889eaab363c0" + }, + "truncated": 0, + "non_truncated": 110, + "padded": 436, + 
"non_padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-security_studies|5": { + "hashes": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "9eadb993a592c2bf", + "hash_cont_tokens": "8ea224bd07c6eaa6" + }, + "truncated": 0, + "non_truncated": 245, + "padded": 980, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-sociology|5": { + "hashes": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "d777efd417cd8064", + "hash_cont_tokens": "cc268c81efa0dfb9" + }, + "truncated": 0, + "non_truncated": 201, + "padded": 792, + "non_padded": 12, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hashes": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "109fbcb059c3b11a", + "hash_cont_tokens": "65115fc130126941" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-virology|5": { + "hashes": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "503089983a21948c", + "hash_cont_tokens": "456a90466d8efd2a" + }, + "truncated": 0, + "non_truncated": 166, + "padded": 664, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-world_religions|5": { + "hashes": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "be5b9316afc63897", + "hash_cont_tokens": "4943d43c84251f12" + }, + "truncated": 0, + "non_truncated": 171, + "padded": 684, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|truthfulqa:mc|0": { + "hashes": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "708234c26b037de5", + "hash_cont_tokens": "76c5d333f53ff0ff" + }, + "truncated": 0, + "non_truncated": 817, + "padded": 9996, + "non_padded": 0, + "effective_few_shots": 0.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "29e45e5985656ec9", + "hash_cont_tokens": "d75b4039559457e2" + }, + "truncated": 0, + "non_truncated": 1267, + "padded": 2534, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|drop|3": { + "hashes": { + "hash_examples": "1d27416e8324e9a3", + "hash_full_prompts": "a5513ff9a741b385", + "hash_input_tokens": "e74b23fd6ab24722", + "hash_cont_tokens": "d00d1ce25b3b3a7c" + }, + "truncated": 384, + "non_truncated": 9152, + "padded": 0, + "non_padded": 9536, + "effective_few_shots": 3.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "a2243014cab6a7a0", + "hash_cont_tokens": "d9bd426b13265f26" + }, + "truncated": 0, + "non_truncated": 1319, + "padded": 0, + "non_padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "4eb459f19fc0f29d", + "hash_full_prompts": "21653ed56f202b4e", + "hash_input_tokens": "42d0b77959fa6ba7", + 
"hash_cont_tokens": "18ad9e96091d930a" + }, + "truncated": 1876, + "non_truncated": 36319, + "padded": 111961, + "non_padded": 12447, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/yec019/fbopt-350m-8bit/results_2023-12-02T13-46-11.364082.json b/eval-results/yec019/fbopt-350m-8bit/results_2023-12-02T13-46-11.364082.json new file mode 100644 index 0000000000000000000000000000000000000000..7cdab76ffe6f0b9dea1dffa98a3d609a4dc84f31 --- /dev/null +++ b/eval-results/yec019/fbopt-350m-8bit/results_2023-12-02T13-46-11.364082.json @@ -0,0 +1,63 @@ +{ + "config_general": { + "lighteval_sha": "b35d4d84573be82d91c07ea46260f262f72cf69d", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "", + "start_time": 1358313.756219337, + "end_time": 1363867.28049694, + "total_evaluation_time_secondes": "5553.524277603021", + "model_name": "yec019/fbopt-350m-8bit", + "model_sha": "305f804054d75a406a85a568ea99dca17cfc998d", + "model_dtype": "8bit", + "model_size": "342.71 MB" + }, + "results": { + "harness|gsm8k|5": { + "acc": 0.01288855193328279, + "acc_stderr": 0.0031069012664996336 + }, + "all": { + "acc": 0.01288855193328279, + "acc_stderr": 0.0031069012664996336 + } + }, + "versions": { + "all": 0, + "harness|gsm8k|5": 0 + }, + "config_tasks": { + "harness|gsm8k": "LM Harness task" + }, + "summary_tasks": { + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "a2243014cab6a7a0", + "hash_cont_tokens": "d9bd426b13265f26" + }, + "truncated": 0, + "non_truncated": 1319, + "padded": 0, + "non_padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "18b756b7813d1bdf", + "hash_full_prompts": "deb3b1dff10b95aa", + "hash_input_tokens": "2e4d9aa80abd6deb", + "hash_cont_tokens": "7f311b67333436e5" + }, + "truncated": 0, + "non_truncated": 1319, + "padded": 0, + "non_padded": 1319, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/yulan-team/YuLan-Chat-2-13b-fp16/results_2023-08-09T14-21-39.906214.json b/eval-results/yulan-team/YuLan-Chat-2-13b-fp16/results_2023-08-09T14-21-39.906214.json new file mode 100644 index 0000000000000000000000000000000000000000..d26de27c95de2bbc8d3b641ca86eb2d4ac5eb7ba --- /dev/null +++ b/eval-results/yulan-team/YuLan-Chat-2-13b-fp16/results_2023-08-09T14-21-39.906214.json @@ -0,0 +1,1365 @@ +{ + "results": { + "harness|arc:challenge|25": { + "acc": 0.5622866894197952, + "acc_stderr": 0.01449757388110828, + "acc_norm": 0.590443686006826, + "acc_norm_stderr": 0.014370358632472439 + }, + "harness|hellaswag|10": { + "acc": 0.6245767775343557, + "acc_stderr": 0.004832423630593181, + "acc_norm": 0.8066122286397132, + "acc_norm_stderr": 0.003941471781664182 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.21, + "acc_stderr": 0.04093601807403326, + "acc_norm": 0.21, + "acc_norm_stderr": 0.04093601807403326 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.5481481481481482, + "acc_stderr": 0.042992689054808644, + "acc_norm": 0.5481481481481482, + "acc_norm_stderr": 0.042992689054808644 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.5855263157894737, + "acc_stderr": 0.04008973785779206, + "acc_norm": 0.5855263157894737, + "acc_norm_stderr": 0.04008973785779206 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.52, + "acc_stderr": 
0.05021167315686779, + "acc_norm": 0.52, + "acc_norm_stderr": 0.05021167315686779 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.6226415094339622, + "acc_stderr": 0.029832808114796005, + "acc_norm": 0.6226415094339622, + "acc_norm_stderr": 0.029832808114796005 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.6041666666666666, + "acc_stderr": 0.04089465449325582, + "acc_norm": 0.6041666666666666, + "acc_norm_stderr": 0.04089465449325582 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.42, + "acc_stderr": 0.049604496374885836, + "acc_norm": 0.42, + "acc_norm_stderr": 0.049604496374885836 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.44, + "acc_stderr": 0.04988876515698589, + "acc_norm": 0.44, + "acc_norm_stderr": 0.04988876515698589 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.34, + "acc_stderr": 0.04760952285695235, + "acc_norm": 0.34, + "acc_norm_stderr": 0.04760952285695235 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.5028901734104047, + "acc_stderr": 0.038124005659748335, + "acc_norm": 0.5028901734104047, + "acc_norm_stderr": 0.038124005659748335 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.29411764705882354, + "acc_stderr": 0.04533838195929776, + "acc_norm": 0.29411764705882354, + "acc_norm_stderr": 0.04533838195929776 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.65, + "acc_stderr": 0.0479372485441102, + "acc_norm": 0.65, + "acc_norm_stderr": 0.0479372485441102 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.451063829787234, + "acc_stderr": 0.032529096196131965, + "acc_norm": 0.451063829787234, + "acc_norm_stderr": 0.032529096196131965 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.35964912280701755, + "acc_stderr": 0.045144961328736334, + "acc_norm": 0.35964912280701755, + "acc_norm_stderr": 0.045144961328736334 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.496551724137931, + "acc_stderr": 0.04166567577101579, + "acc_norm": 0.496551724137931, + "acc_norm_stderr": 0.04166567577101579 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.30952380952380953, + "acc_stderr": 0.02380952380952385, + "acc_norm": 0.30952380952380953, + "acc_norm_stderr": 0.02380952380952385 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.3492063492063492, + "acc_stderr": 0.04263906892795132, + "acc_norm": 0.3492063492063492, + "acc_norm_stderr": 0.04263906892795132 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.3, + "acc_stderr": 0.04605661864718381, + "acc_norm": 0.3, + "acc_norm_stderr": 0.04605661864718381 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.6774193548387096, + "acc_stderr": 0.026593084516572277, + "acc_norm": 0.6774193548387096, + "acc_norm_stderr": 0.026593084516572277 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.4039408866995074, + "acc_stderr": 0.0345245390382204, + "acc_norm": 0.4039408866995074, + "acc_norm_stderr": 0.0345245390382204 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.55, + "acc_stderr": 0.05, + "acc_norm": 0.55, + "acc_norm_stderr": 0.05 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.7272727272727273, + "acc_stderr": 0.03477691162163659, + "acc_norm": 0.7272727272727273, + "acc_norm_stderr": 0.03477691162163659 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.696969696969697, + "acc_stderr": 0.032742879140268674, 
+ "acc_norm": 0.696969696969697, + "acc_norm_stderr": 0.032742879140268674 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.8186528497409327, + "acc_stderr": 0.02780703236068609, + "acc_norm": 0.8186528497409327, + "acc_norm_stderr": 0.02780703236068609 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.5307692307692308, + "acc_stderr": 0.025302958890850154, + "acc_norm": 0.5307692307692308, + "acc_norm_stderr": 0.025302958890850154 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.3037037037037037, + "acc_stderr": 0.028037929969114982, + "acc_norm": 0.3037037037037037, + "acc_norm_stderr": 0.028037929969114982 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.5798319327731093, + "acc_stderr": 0.03206183783236153, + "acc_norm": 0.5798319327731093, + "acc_norm_stderr": 0.03206183783236153 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.3576158940397351, + "acc_stderr": 0.03913453431177258, + "acc_norm": 0.3576158940397351, + "acc_norm_stderr": 0.03913453431177258 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.7596330275229358, + "acc_stderr": 0.01832060732096407, + "acc_norm": 0.7596330275229358, + "acc_norm_stderr": 0.01832060732096407 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.49537037037037035, + "acc_stderr": 0.03409825519163572, + "acc_norm": 0.49537037037037035, + "acc_norm_stderr": 0.03409825519163572 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.7843137254901961, + "acc_stderr": 0.028867431449849313, + "acc_norm": 0.7843137254901961, + "acc_norm_stderr": 0.028867431449849313 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.7679324894514767, + "acc_stderr": 0.02747974455080851, + "acc_norm": 0.7679324894514767, + "acc_norm_stderr": 0.02747974455080851 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.6233183856502242, + "acc_stderr": 0.032521134899291884, + "acc_norm": 0.6233183856502242, + "acc_norm_stderr": 0.032521134899291884 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.6183206106870229, + "acc_stderr": 0.0426073515764456, + "acc_norm": 0.6183206106870229, + "acc_norm_stderr": 0.0426073515764456 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.6942148760330579, + "acc_stderr": 0.04205953933884122, + "acc_norm": 0.6942148760330579, + "acc_norm_stderr": 0.04205953933884122 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.75, + "acc_stderr": 0.04186091791394607, + "acc_norm": 0.75, + "acc_norm_stderr": 0.04186091791394607 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.7300613496932515, + "acc_stderr": 0.034878251684978906, + "acc_norm": 0.7300613496932515, + "acc_norm_stderr": 0.034878251684978906 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.42857142857142855, + "acc_stderr": 0.04697113923010212, + "acc_norm": 0.42857142857142855, + "acc_norm_stderr": 0.04697113923010212 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.7281553398058253, + "acc_stderr": 0.044052680241409216, + "acc_norm": 0.7281553398058253, + "acc_norm_stderr": 0.044052680241409216 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.8504273504273504, + "acc_stderr": 0.023365051491753715, + "acc_norm": 0.8504273504273504, + "acc_norm_stderr": 0.023365051491753715 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.59, + "acc_stderr": 0.049431107042371025, + "acc_norm": 0.59, + 
"acc_norm_stderr": 0.049431107042371025 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.7318007662835249, + "acc_stderr": 0.01584243083526943, + "acc_norm": 0.7318007662835249, + "acc_norm_stderr": 0.01584243083526943 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.653179190751445, + "acc_stderr": 0.025624723994030454, + "acc_norm": 0.653179190751445, + "acc_norm_stderr": 0.025624723994030454 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.3675977653631285, + "acc_stderr": 0.016125543823552958, + "acc_norm": 0.3675977653631285, + "acc_norm_stderr": 0.016125543823552958 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.6274509803921569, + "acc_stderr": 0.027684181883302895, + "acc_norm": 0.6274509803921569, + "acc_norm_stderr": 0.027684181883302895 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.6655948553054662, + "acc_stderr": 0.026795422327893934, + "acc_norm": 0.6655948553054662, + "acc_norm_stderr": 0.026795422327893934 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.6574074074074074, + "acc_stderr": 0.026406145973625672, + "acc_norm": 0.6574074074074074, + "acc_norm_stderr": 0.026406145973625672 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.43617021276595747, + "acc_stderr": 0.02958345203628407, + "acc_norm": 0.43617021276595747, + "acc_norm_stderr": 0.02958345203628407 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.4178617992177314, + "acc_stderr": 0.01259674410899856, + "acc_norm": 0.4178617992177314, + "acc_norm_stderr": 0.01259674410899856 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.5183823529411765, + "acc_stderr": 0.03035230339535196, + "acc_norm": 0.5183823529411765, + "acc_norm_stderr": 0.03035230339535196 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.5833333333333334, + "acc_stderr": 0.01994491413687358, + "acc_norm": 0.5833333333333334, + "acc_norm_stderr": 0.01994491413687358 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.6454545454545455, + "acc_stderr": 0.045820048415054174, + "acc_norm": 0.6454545454545455, + "acc_norm_stderr": 0.045820048415054174 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.6816326530612244, + "acc_stderr": 0.029822533793982066, + "acc_norm": 0.6816326530612244, + "acc_norm_stderr": 0.029822533793982066 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.8009950248756219, + "acc_stderr": 0.028231365092758406, + "acc_norm": 0.8009950248756219, + "acc_norm_stderr": 0.028231365092758406 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.83, + "acc_stderr": 0.03775251680686371, + "acc_norm": 0.83, + "acc_norm_stderr": 0.03775251680686371 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.45180722891566266, + "acc_stderr": 0.03874371556587953, + "acc_norm": 0.45180722891566266, + "acc_norm_stderr": 0.03874371556587953 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.7894736842105263, + "acc_stderr": 0.03126781714663179, + "acc_norm": 0.7894736842105263, + "acc_norm_stderr": 0.03126781714663179 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.35495716034271724, + "mc1_stderr": 0.0167508623813759, + "mc2": 0.5218071696031027, + "mc2_stderr": 0.01592456099814557 + }, + "all": { + "acc": 0.5680506066731081, + "acc_stderr": 0.034147859702474775, + "acc_norm": 0.5716131905323011, + "acc_norm_stderr": 0.034130602633024525, + "mc1": 0.35495716034271724, + "mc1_stderr": 0.0167508623813759, + "mc2": 0.5218071696031027, + "mc2_stderr": 
0.01592456099814557 + } + }, + "versions": { + "harness|arc:challenge|25": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "all": 0 + }, + "config_general": { + "model_name": "yulan-team/YuLan-Chat-2-13b-fp16", + "model_sha": "2d439187efd6edd91a0c0146f08dff52d92aa7bc", + "model_dtype": "torch.float16", + "lighteval_sha": "da839e70121267a9bf55a0fbea4fb2fae2948337", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null + }, + "config_tasks": { + "harness|arc:challenge": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + 
"harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + 
"harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task" + }, + "summary_tasks": { + "harness|arc:challenge|25": { + "hashes": { + "hash_examples": "17b0cae357c0259e", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "3722289b79076c44", + "hash_cont_tokens": "8210decc6ff6f7df" + }, + "truncated": 0, + "non-truncated": 4687, + "padded": 4687, + "non-padded": 0, + "effective_few_shots": 25.0, + "num_truncated_few_shots": 0 + }, + "harness|hellaswag|10": { + "hashes": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "ececd684171f1ef2", + "hash_cont_tokens": "b3b9e9017afa63af" + }, + "truncated": 0, + "non-truncated": 40168, + "padded": 40113, + "non-padded": 55, + "effective_few_shots": 10.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hashes": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "c54ff61ad0273dd7", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-anatomy|5": { + "hashes": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "be31a1e22aef5f90", + "hash_cont_tokens": "f11971a765cb609f" + }, + "truncated": 0, + "non-truncated": 540, + "padded": 540, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-astronomy|5": { + "hashes": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "277a7b1fad566940", + "hash_cont_tokens": "bf30e5d3f48250cb" + }, + "truncated": 0, + "non-truncated": 608, + "padded": 608, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-business_ethics|5": { + "hashes": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "ba552605bc116de5", + "hash_cont_tokens": "bc1dd9b2d995eb61" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hashes": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "428c7563d0b98ab9", + "hash_cont_tokens": "890a119624b3b935" + }, + "truncated": 0, + "non-truncated": 1060, + "padded": 1060, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_biology|5": { + "hashes": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "da036601573942e2", + "hash_cont_tokens": "875cde3af7a0ee14" + }, + "truncated": 0, + "non-truncated": 576, + "padded": 576, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_chemistry|5": { + "hashes": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "94e0196d6aded13d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_computer_science|5": { + "hashes": { 
+ "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "6e4d0f4a8d36690b", + "hash_cont_tokens": "ffc0fe414cdc4a83" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_mathematics|5": { + "hashes": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "614054d17109a25d", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_medicine|5": { + "hashes": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "081bb2b524defd1c", + "hash_cont_tokens": "1f88b00d41957d82" + }, + "truncated": 0, + "non-truncated": 692, + "padded": 692, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_physics|5": { + "hashes": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "5421d9a1af86cbd4", + "hash_cont_tokens": "f7b8097afc16a47c" + }, + "truncated": 0, + "non-truncated": 408, + "padded": 408, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-computer_security|5": { + "hashes": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "5e6b70ecb333cf18", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hashes": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "c2ef11a87264ceed", + "hash_cont_tokens": "aa0e8bc655f2f641" + }, + "truncated": 0, + "non-truncated": 940, + "padded": 940, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-econometrics|5": { + "hashes": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "ecaccd912a4c3978", + "hash_cont_tokens": "bfb7e3c3c88313f1" + }, + "truncated": 0, + "non-truncated": 456, + "padded": 456, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hashes": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "1590c84291399be8", + "hash_cont_tokens": "2425a3f084a591ef" + }, + "truncated": 0, + "non-truncated": 580, + "padded": 580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hashes": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "3269597f715b0da1", + "hash_cont_tokens": "f52691aef15a407b" + }, + "truncated": 0, + "non-truncated": 1512, + "padded": 1512, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-formal_logic|5": { + "hashes": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "a2800d20f3ab8d7c", + "hash_cont_tokens": 
"f515d598d9c21263" + }, + "truncated": 0, + "non-truncated": 504, + "padded": 504, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-global_facts|5": { + "hashes": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "94ed44b3772505ad", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_biology|5": { + "hashes": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "24423acb928db768", + "hash_cont_tokens": "bd85a4156a3613ee" + }, + "truncated": 0, + "non-truncated": 1240, + "padded": 1240, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hashes": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "831ff35c474e5cef", + "hash_cont_tokens": "a95c97af1c14e068" + }, + "truncated": 0, + "non-truncated": 812, + "padded": 812, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hashes": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "a20a96b44dcc5b30", + "hash_cont_tokens": "8abfedef914e33c9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hashes": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "5002f4ac8b1562ca", + "hash_cont_tokens": "674fc454bdc5ac93" + }, + "truncated": 0, + "non-truncated": 660, + "padded": 656, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_geography|5": { + "hashes": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "7c5547c7da5bc793", + "hash_cont_tokens": "03a5012b916274ea" + }, + "truncated": 0, + "non-truncated": 792, + "padded": 792, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hashes": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "f62991cb6a496b05", + "hash_cont_tokens": "a83effb8f76b7d7c" + }, + "truncated": 0, + "non-truncated": 772, + "padded": 772, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hashes": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "4cef2aff6e3d59ed", + "hash_cont_tokens": "c583432ad27fcfe0" + }, + "truncated": 0, + "non-truncated": 1560, + "padded": 1560, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hashes": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "6e2577ea4082ed2b", + "hash_cont_tokens": "24f5dc613660300b" + }, + "truncated": 0, + "non-truncated": 1080, + "padded": 1080, + 
"non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hashes": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "c5fc9aeb1079c8e4", + "hash_cont_tokens": "f47f041de50333b9" + }, + "truncated": 0, + "non-truncated": 952, + "padded": 952, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_physics|5": { + "hashes": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "555fc385cffa84ca", + "hash_cont_tokens": "ba2efcd283e938cc" + }, + "truncated": 0, + "non-truncated": 604, + "padded": 604, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hashes": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "febd23cbf9973b7f", + "hash_cont_tokens": "942069cd363844d9" + }, + "truncated": 0, + "non-truncated": 2180, + "padded": 2180, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hashes": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "400e55b56ee6fbd7", + "hash_cont_tokens": "955ed42b6f7fa019" + }, + "truncated": 0, + "non-truncated": 864, + "padded": 864, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hashes": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "c639cce12a46ebad", + "hash_cont_tokens": "cdd0b3dc06d933e5" + }, + "truncated": 0, + "non-truncated": 816, + "padded": 816, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hashes": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "b9762065cce6f3a6", + "hash_cont_tokens": "9a864184946033ac" + }, + "truncated": 0, + "non-truncated": 948, + "padded": 948, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_aging|5": { + "hashes": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "541a75f071dcf579", + "hash_cont_tokens": "142a4a8a1138a214" + }, + "truncated": 0, + "non-truncated": 892, + "padded": 892, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_sexuality|5": { + "hashes": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "04269e5c5a257dd9", + "hash_cont_tokens": "bc54813e809b796d" + }, + "truncated": 0, + "non-truncated": 524, + "padded": 524, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-international_law|5": { + "hashes": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "d93ba9d9d38e4397", + "hash_cont_tokens": "dc45b45fcda18e5d" + }, + "truncated": 0, + "non-truncated": 484, + "padded": 484, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + 
"harness|hendrycksTest-jurisprudence|5": { + "hashes": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "9eeaccd2698b4f5a", + "hash_cont_tokens": "e3a8cd951b6e3469" + }, + "truncated": 0, + "non-truncated": 432, + "padded": 432, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hashes": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "b4f08f544f2b7576", + "hash_cont_tokens": "1e80dbd30f6453d5" + }, + "truncated": 0, + "non-truncated": 652, + "padded": 648, + "non-padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-machine_learning|5": { + "hashes": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "900c2a51f1174b9f", + "hash_cont_tokens": "9b37da7777378ca9" + }, + "truncated": 0, + "non-truncated": 448, + "padded": 448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-management|5": { + "hashes": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "6b36efb4689c6eca", + "hash_cont_tokens": "a01d6d39a83c4597" + }, + "truncated": 0, + "non-truncated": 412, + "padded": 412, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-marketing|5": { + "hashes": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "2aaac78a0cfed47a", + "hash_cont_tokens": "6aeaed4d823c98aa" + }, + "truncated": 0, + "non-truncated": 936, + "padded": 936, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-medical_genetics|5": { + "hashes": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "886ca823b41c094a", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-miscellaneous|5": { + "hashes": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "72fd71de7675e7d0", + "hash_cont_tokens": "9b0ab02a64603081" + }, + "truncated": 0, + "non-truncated": 3132, + "padded": 3132, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_disputes|5": { + "hashes": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "f3ca0dd8e7a1eb09", + "hash_cont_tokens": "8badf768f7b0467a" + }, + "truncated": 0, + "non-truncated": 1384, + "padded": 1354, + "non-padded": 30, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hashes": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "3e793631e951f23c", + "hash_cont_tokens": "32ae620376b2bbba" + }, + "truncated": 0, + "non-truncated": 3580, + "padded": 3580, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-nutrition|5": { + "hashes": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": 
"59753c2144ea93af", + "hash_cont_tokens": "3071def75bacc404" + }, + "truncated": 0, + "non-truncated": 1224, + "padded": 1224, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-philosophy|5": { + "hashes": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "bd8d3dbed15a8c34", + "hash_cont_tokens": "9f6ff69d23a48783" + }, + "truncated": 0, + "non-truncated": 1244, + "padded": 1244, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-prehistory|5": { + "hashes": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "3573cd87facbb7c5", + "hash_cont_tokens": "de469d2b981e32a3" + }, + "truncated": 0, + "non-truncated": 1296, + "padded": 1296, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_accounting|5": { + "hashes": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "17e721bc1a7cbb47", + "hash_cont_tokens": "c46f74d2dfc7b13b" + }, + "truncated": 0, + "non-truncated": 1128, + "padded": 1128, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_law|5": { + "hashes": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "c9f7583fff66d361", + "hash_cont_tokens": "2e590029ef41fbcd" + }, + "truncated": 0, + "non-truncated": 6136, + "padded": 6136, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_medicine|5": { + "hashes": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "40a933f829116f8d", + "hash_cont_tokens": "fe35cfa9c6ca802e" + }, + "truncated": 0, + "non-truncated": 1088, + "padded": 1088, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_psychology|5": { + "hashes": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "0dfb73a8eb3f692c", + "hash_cont_tokens": "f020fbddf72c8652" + }, + "truncated": 0, + "non-truncated": 2448, + "padded": 2448, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-public_relations|5": { + "hashes": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "1710c6ba4c9f3cbd", + "hash_cont_tokens": "568f585a259965c1" + }, + "truncated": 0, + "non-truncated": 440, + "padded": 440, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-security_studies|5": { + "hashes": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "32a03f1f22a6e103", + "hash_cont_tokens": "cc6fd7cccd64cd5d" + }, + "truncated": 0, + "non-truncated": 980, + "padded": 980, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-sociology|5": { + "hashes": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "828999f7624cbe7e", + "hash_cont_tokens": "c3a3bdfd177eed5b" + }, + "truncated": 0, + "non-truncated": 804, + "padded": 804, + "non-padded": 0, + 
"effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hashes": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "42054621e718dbee", + "hash_cont_tokens": "2568d0e8e36fa959" + }, + "truncated": 0, + "non-truncated": 400, + "padded": 400, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-virology|5": { + "hashes": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "6c4f0aa4dc859c04", + "hash_cont_tokens": "926cf60b0891f374" + }, + "truncated": 0, + "non-truncated": 664, + "padded": 664, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-world_religions|5": { + "hashes": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "6c75d44e092ff24f", + "hash_cont_tokens": "c525a5de974c1ea3" + }, + "truncated": 0, + "non-truncated": 684, + "padded": 684, + "non-padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|truthfulqa:mc|0": { + "hashes": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "2738d7ed7075faa7", + "hash_cont_tokens": "c014154380b74b9e" + }, + "truncated": 0, + "non-truncated": 9996, + "padded": 9996, + "non-padded": 0, + "effective_few_shots": 0.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "d84d18e9a963753d", + "hash_full_prompts": "12b540783521a8e6", + "hash_input_tokens": "5c73a7dce6ccf737", + "hash_cont_tokens": "fb1646e2bdd5fc38" + }, + "total_evaluation_time_secondes": "6326.778278827667", + "truncated": 0, + "non-truncated": 111019, + "padded": 110926, + "non-padded": 93, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/yulan-team/YuLan-Chat-2-13b-fp16/results_2023-09-17T22-36-31.968246.json b/eval-results/yulan-team/YuLan-Chat-2-13b-fp16/results_2023-09-17T22-36-31.968246.json new file mode 100644 index 0000000000000000000000000000000000000000..b4cdd3f224b81e1b729955321d142462c8ce3c65 --- /dev/null +++ b/eval-results/yulan-team/YuLan-Chat-2-13b-fp16/results_2023-09-17T22-36-31.968246.json @@ -0,0 +1,107 @@ +{ + "config_general": { + "model_name": "yulan-team/YuLan-Chat-2-13b-fp16", + "model_sha": "85959c27ee413d03425ab0fd3dbce9dce7204340", + "model_size": "24.77 GB", + "model_dtype": "torch.float16", + "lighteval_sha": "0f318ecf002208468154899217b3ba7c6ae09374", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "" + }, + "results": { + "harness|drop|3": { + "em": 0.4928691275167785, + "em_stderr": 0.005119947219854752, + "f1": 0.5245050335570473, + "f1_stderr": 0.00496450323367562 + }, + "harness|gsm8k|5": { + "acc": 0.1379833206974981, + "acc_stderr": 0.00949977732774685 + }, + "harness|winogrande|5": { + "acc": 0.7963693764798737, + "acc_stderr": 0.011317798781626911 + }, + "all": { + "em": 0.4928691275167785, + "em_stderr": 0.005119947219854752, + "f1": 0.5245050335570473, + "f1_stderr": 0.00496450323367562, + "acc": 0.4671763485886859, + "acc_stderr": 0.01040878805468688 + } + }, + "versions": { + "harness|drop|3": 1, + "harness|gsm8k|5": 0, + "harness|winogrande|5": 0, + "all": 0 + }, + "config_tasks": { + "harness|drop": "LM Harness task", + "harness|gsm8k": "LM Harness 
task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|drop|3": { + "hashes": { + "hash_examples": "1d27416e8324e9a3", + "hash_full_prompts": "a5513ff9a741b385", + "hash_input_tokens": "fbe785c62d941897", + "hash_cont_tokens": "db82054d82604e41" + }, + "truncated": 0, + "non-truncated": 9536, + "padded": 0, + "non-padded": 9536, + "effective_few_shots": 3.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "bda342e47b5099b2", + "hash_cont_tokens": "96a0eb270163cb49" + }, + "truncated": 0, + "non-truncated": 1319, + "padded": 0, + "non-padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "c0bedf98cb040854", + "hash_cont_tokens": "f08975ad6f2d5864" + }, + "truncated": 0, + "non-truncated": 2534, + "padded": 2432, + "non-padded": 102, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "9b4d8993161e637d", + "hash_full_prompts": "08215e527b7e60a5", + "hash_input_tokens": "9f72a6bc6743d18f", + "hash_cont_tokens": "03157c9b9224aa91" + }, + "total_evaluation_time_secondes": "8483.371207237244", + "truncated": 0, + "non-truncated": 13389, + "padded": 2432, + "non-padded": 10957, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/zyh3826/20231206094523-pretrain-Llama-2-13b-hf-76000/results_2023-12-16T19-10-08.159006.json b/eval-results/zyh3826/20231206094523-pretrain-Llama-2-13b-hf-76000/results_2023-12-16T19-10-08.159006.json new file mode 100644 index 0000000000000000000000000000000000000000..db013ab5767596bf110444b7d8333bef47c0b4a8 --- /dev/null +++ b/eval-results/zyh3826/20231206094523-pretrain-Llama-2-13b-hf-76000/results_2023-12-16T19-10-08.159006.json @@ -0,0 +1,1409 @@ +{ + "config_general": { + "lighteval_sha": "0e4607eff593f6f842aeaa0e5fa6760f58b9d1e9", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "", + "start_time": 373521.679336078, + "end_time": 384766.149383478, + "total_evaluation_time_secondes": "11244.470047399984", + "model_name": "zyh3826/20231206094523-pretrain-Llama-2-13b-hf-76000", + "model_sha": "28b3ae089b5610053f2294d24667fe248405f031", + "model_dtype": "torch.bfloat16", + "model_size": "24.77 GB" + }, + "results": { + "harness|arc:challenge|25": { + "acc": 0.27303754266211605, + "acc_stderr": 0.01301933276263575, + "acc_norm": 0.310580204778157, + "acc_norm_stderr": 0.013522292098053055 + }, + "harness|hellaswag|10": { + "acc": 0.4026090420235013, + "acc_stderr": 0.0048942100113032235, + "acc_norm": 0.5203146783509262, + "acc_norm_stderr": 0.0049856612829985835 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.22, + "acc_stderr": 0.04163331998932269, + "acc_norm": 0.22, + "acc_norm_stderr": 0.04163331998932269 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.25925925925925924, + "acc_stderr": 0.03785714465066653, + "acc_norm": 0.25925925925925924, + "acc_norm_stderr": 0.03785714465066653 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.18421052631578946, + "acc_stderr": 0.0315469804508223, + "acc_norm": 0.18421052631578946, + "acc_norm_stderr": 0.0315469804508223 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.25, + 
"acc_stderr": 0.04351941398892446, + "acc_norm": 0.25, + "acc_norm_stderr": 0.04351941398892446 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.26037735849056604, + "acc_stderr": 0.02700876609070809, + "acc_norm": 0.26037735849056604, + "acc_norm_stderr": 0.02700876609070809 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.2222222222222222, + "acc_stderr": 0.03476590104304134, + "acc_norm": 0.2222222222222222, + "acc_norm_stderr": 0.03476590104304134 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.2, + "acc_stderr": 0.040201512610368445, + "acc_norm": 0.2, + "acc_norm_stderr": 0.040201512610368445 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.15, + "acc_stderr": 0.03588702812826372, + "acc_norm": 0.15, + "acc_norm_stderr": 0.03588702812826372 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.24, + "acc_stderr": 0.042923469599092816, + "acc_norm": 0.24, + "acc_norm_stderr": 0.042923469599092816 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.2254335260115607, + "acc_stderr": 0.03186209851641143, + "acc_norm": 0.2254335260115607, + "acc_norm_stderr": 0.03186209851641143 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.19607843137254902, + "acc_stderr": 0.03950581861179961, + "acc_norm": 0.19607843137254902, + "acc_norm_stderr": 0.03950581861179961 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.24, + "acc_stderr": 0.04292346959909282, + "acc_norm": 0.24, + "acc_norm_stderr": 0.04292346959909282 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.32340425531914896, + "acc_stderr": 0.030579442773610334, + "acc_norm": 0.32340425531914896, + "acc_norm_stderr": 0.030579442773610334 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.21929824561403508, + "acc_stderr": 0.03892431106518754, + "acc_norm": 0.21929824561403508, + "acc_norm_stderr": 0.03892431106518754 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.20689655172413793, + "acc_stderr": 0.03375672449560554, + "acc_norm": 0.20689655172413793, + "acc_norm_stderr": 0.03375672449560554 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.25132275132275134, + "acc_stderr": 0.022340482339643898, + "acc_norm": 0.25132275132275134, + "acc_norm_stderr": 0.022340482339643898 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.21428571428571427, + "acc_stderr": 0.03670066451047182, + "acc_norm": 0.21428571428571427, + "acc_norm_stderr": 0.03670066451047182 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.34, + "acc_stderr": 0.04760952285695236, + "acc_norm": 0.34, + "acc_norm_stderr": 0.04760952285695236 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.25806451612903225, + "acc_stderr": 0.024892469172462833, + "acc_norm": 0.25806451612903225, + "acc_norm_stderr": 0.024892469172462833 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.28078817733990147, + "acc_stderr": 0.0316185633535861, + "acc_norm": 0.28078817733990147, + "acc_norm_stderr": 0.0316185633535861 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.29, + "acc_stderr": 0.045604802157206845, + "acc_norm": 0.29, + "acc_norm_stderr": 0.045604802157206845 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.21818181818181817, + "acc_stderr": 0.03225078108306289, + "acc_norm": 0.21818181818181817, + "acc_norm_stderr": 0.03225078108306289 + }, + "harness|hendrycksTest-high_school_geography|5": { + 
"acc": 0.21212121212121213, + "acc_stderr": 0.029126522834586818, + "acc_norm": 0.21212121212121213, + "acc_norm_stderr": 0.029126522834586818 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.21243523316062177, + "acc_stderr": 0.029519282616817244, + "acc_norm": 0.21243523316062177, + "acc_norm_stderr": 0.029519282616817244 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.20512820512820512, + "acc_stderr": 0.02047323317355198, + "acc_norm": 0.20512820512820512, + "acc_norm_stderr": 0.02047323317355198 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.26296296296296295, + "acc_stderr": 0.026842057873833706, + "acc_norm": 0.26296296296296295, + "acc_norm_stderr": 0.026842057873833706 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.23109243697478993, + "acc_stderr": 0.027381406927868966, + "acc_norm": 0.23109243697478993, + "acc_norm_stderr": 0.027381406927868966 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.2052980132450331, + "acc_stderr": 0.03297986648473835, + "acc_norm": 0.2052980132450331, + "acc_norm_stderr": 0.03297986648473835 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.23119266055045873, + "acc_stderr": 0.01807575024163315, + "acc_norm": 0.23119266055045873, + "acc_norm_stderr": 0.01807575024163315 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.16666666666666666, + "acc_stderr": 0.025416428388767478, + "acc_norm": 0.16666666666666666, + "acc_norm_stderr": 0.025416428388767478 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.25, + "acc_stderr": 0.03039153369274154, + "acc_norm": 0.25, + "acc_norm_stderr": 0.03039153369274154 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.24472573839662448, + "acc_stderr": 0.027985699387036416, + "acc_norm": 0.24472573839662448, + "acc_norm_stderr": 0.027985699387036416 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.38565022421524664, + "acc_stderr": 0.03266842214289201, + "acc_norm": 0.38565022421524664, + "acc_norm_stderr": 0.03266842214289201 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.22137404580152673, + "acc_stderr": 0.036412970813137276, + "acc_norm": 0.22137404580152673, + "acc_norm_stderr": 0.036412970813137276 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.2396694214876033, + "acc_stderr": 0.038968789850704164, + "acc_norm": 0.2396694214876033, + "acc_norm_stderr": 0.038968789850704164 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.2777777777777778, + "acc_stderr": 0.04330043749650743, + "acc_norm": 0.2777777777777778, + "acc_norm_stderr": 0.04330043749650743 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.27607361963190186, + "acc_stderr": 0.03512385283705051, + "acc_norm": 0.27607361963190186, + "acc_norm_stderr": 0.03512385283705051 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.2857142857142857, + "acc_stderr": 0.04287858751340456, + "acc_norm": 0.2857142857142857, + "acc_norm_stderr": 0.04287858751340456 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.2524271844660194, + "acc_stderr": 0.04301250399690877, + "acc_norm": 0.2524271844660194, + "acc_norm_stderr": 0.04301250399690877 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.2606837606837607, + "acc_stderr": 0.028760348956523414, + "acc_norm": 0.2606837606837607, + "acc_norm_stderr": 0.028760348956523414 + }, + 
"harness|hendrycksTest-medical_genetics|5": { + "acc": 0.27, + "acc_stderr": 0.0446196043338474, + "acc_norm": 0.27, + "acc_norm_stderr": 0.0446196043338474 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.2771392081736909, + "acc_stderr": 0.01600563629412242, + "acc_norm": 0.2771392081736909, + "acc_norm_stderr": 0.01600563629412242 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.2514450867052023, + "acc_stderr": 0.02335736578587404, + "acc_norm": 0.2514450867052023, + "acc_norm_stderr": 0.02335736578587404 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.2424581005586592, + "acc_stderr": 0.014333522059217889, + "acc_norm": 0.2424581005586592, + "acc_norm_stderr": 0.014333522059217889 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.21895424836601307, + "acc_stderr": 0.02367908986180772, + "acc_norm": 0.21895424836601307, + "acc_norm_stderr": 0.02367908986180772 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.26688102893890675, + "acc_stderr": 0.025122637608816643, + "acc_norm": 0.26688102893890675, + "acc_norm_stderr": 0.025122637608816643 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.2777777777777778, + "acc_stderr": 0.024922001168886338, + "acc_norm": 0.2777777777777778, + "acc_norm_stderr": 0.024922001168886338 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.24822695035460993, + "acc_stderr": 0.025770015644290403, + "acc_norm": 0.24822695035460993, + "acc_norm_stderr": 0.025770015644290403 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.2457627118644068, + "acc_stderr": 0.010996156635142692, + "acc_norm": 0.2457627118644068, + "acc_norm_stderr": 0.010996156635142692 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.18382352941176472, + "acc_stderr": 0.023529242185193113, + "acc_norm": 0.18382352941176472, + "acc_norm_stderr": 0.023529242185193113 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.24836601307189543, + "acc_stderr": 0.017479487001364764, + "acc_norm": 0.24836601307189543, + "acc_norm_stderr": 0.017479487001364764 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.34545454545454546, + "acc_stderr": 0.04554619617541054, + "acc_norm": 0.34545454545454546, + "acc_norm_stderr": 0.04554619617541054 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.19591836734693877, + "acc_stderr": 0.02540930195322568, + "acc_norm": 0.19591836734693877, + "acc_norm_stderr": 0.02540930195322568 + }, + "harness|hendrycksTest-sociology|5": { + "acc": 0.23880597014925373, + "acc_stderr": 0.030147775935409224, + "acc_norm": 0.23880597014925373, + "acc_norm_stderr": 0.030147775935409224 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.21, + "acc_stderr": 0.040936018074033256, + "acc_norm": 0.21, + "acc_norm_stderr": 0.040936018074033256 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.3253012048192771, + "acc_stderr": 0.03647168523683227, + "acc_norm": 0.3253012048192771, + "acc_norm_stderr": 0.03647168523683227 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.21052631578947367, + "acc_stderr": 0.0312678171466318, + "acc_norm": 0.21052631578947367, + "acc_norm_stderr": 0.0312678171466318 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.25458996328029376, + "mc1_stderr": 0.015250117079156482, + "mc2": 0.4471244819837127, + "mc2_stderr": 0.014622242508536614 + }, + "harness|winogrande|5": { + "acc": 0.6124704025256511, + "acc_stderr": 0.01369235463601677 + }, + "harness|gsm8k|5": { + "acc": 
0.0, + "acc_stderr": 0.0 + }, + "all": { + "acc": 0.24943893194371924, + "acc_stderr": 0.030400489062706072, + "acc_norm": 0.25014496177092693, + "acc_norm_stderr": 0.031209015064341802, + "mc1": 0.25458996328029376, + "mc1_stderr": 0.015250117079156482, + "mc2": 0.4471244819837127, + "mc2_stderr": 0.014622242508536614 + } + }, + "versions": { + "all": 0, + "harness|arc:challenge|25": 0, + "harness|gsm8k|5": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + "harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "harness|winogrande|5": 0 + }, + "config_tasks": { + "harness|arc:challenge": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|hellaswag": "LM Harness task", + 
"harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + 
"harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|arc:challenge|25": { + "hashes": { + "hash_examples": "17b0cae357c0259e", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "ca48d52265c0051f", + "hash_cont_tokens": "e8abf848493b50f7" + }, + "truncated": 0, + "non_truncated": 1172, + "padded": 4687, + "non_padded": 0, + "effective_few_shots": 25.0, + "num_truncated_few_shots": 0 + }, + "harness|hellaswag|10": { + "hashes": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "4975ded0ed31f702", + "hash_cont_tokens": "9fe0a5c42e1532db" + }, + "truncated": 0, + "non_truncated": 10042, + "padded": 40019, + "non_padded": 149, + "effective_few_shots": 10.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hashes": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "8ff523ec326d5d55", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-anatomy|5": { + "hashes": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "742bd6a389a8ef40", + "hash_cont_tokens": "f11971a765cb609f" + }, + "truncated": 0, + "non_truncated": 135, + "padded": 540, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-astronomy|5": { + "hashes": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "e602902c123c2c7f", + "hash_cont_tokens": "440a970fadecdc7b" + }, + "truncated": 0, + "non_truncated": 152, + "padded": 608, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-business_ethics|5": { + "hashes": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "04d2b2c4fd859912", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "hashes": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "6080d9f3c5930be0", + "hash_cont_tokens": "7ecd60c25b9bfe5b" + }, + "truncated": 0, + "non_truncated": 265, + "padded": 1060, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_biology|5": { + "hashes": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "873319724ad65589", + "hash_cont_tokens": "875cde3af7a0ee14" + }, + "truncated": 0, + "non_truncated": 144, + "padded": 564, + "non_padded": 12, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_chemistry|5": { + "hashes": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "8366d04d12b154a7", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + 
"num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_computer_science|5": { + "hashes": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "1724a282fb269fd7", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_mathematics|5": { + "hashes": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "b7aa815781eae172", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_medicine|5": { + "hashes": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "0003d13e86bc8c1a", + "hash_cont_tokens": "702fb6d82ff0d6ac" + }, + "truncated": 0, + "non_truncated": 173, + "padded": 692, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_physics|5": { + "hashes": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "32b28762dd077c78", + "hash_cont_tokens": "f7b8097afc16a47c" + }, + "truncated": 0, + "non_truncated": 102, + "padded": 404, + "non_padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-computer_security|5": { + "hashes": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "c44c8100ac118ab8", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hashes": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "761c7ce187b3338a", + "hash_cont_tokens": "aa0e8bc655f2f641" + }, + "truncated": 0, + "non_truncated": 235, + "padded": 940, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-econometrics|5": { + "hashes": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "dae74024ebc12b2b", + "hash_cont_tokens": "b1cc6e7e9fcd3827" + }, + "truncated": 0, + "non_truncated": 114, + "padded": 456, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hashes": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "2471bd9b6de2f391", + "hash_cont_tokens": "2425a3f084a591ef" + }, + "truncated": 0, + "non_truncated": 145, + "padded": 580, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hashes": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "2da3f8d7d1515cc6", + "hash_cont_tokens": "bd87bf0c060fd925" + }, + "truncated": 0, + "non_truncated": 378, + "padded": 1512, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-formal_logic|5": { + "hashes": { + "hash_examples": "5a6525665f63ea72", + 
"hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "907de61bbe46dada", + "hash_cont_tokens": "eb8932890e0605db" + }, + "truncated": 0, + "non_truncated": 126, + "padded": 504, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-global_facts|5": { + "hashes": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "d7549fe9ac133643", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_biology|5": { + "hashes": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "b449ae8cd622fb96", + "hash_cont_tokens": "1ddcb86d28cde266" + }, + "truncated": 0, + "non_truncated": 310, + "padded": 1240, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hashes": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "a447bd1574b5e26c", + "hash_cont_tokens": "176c8dcff38c5f8f" + }, + "truncated": 0, + "non_truncated": 203, + "padded": 812, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hashes": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "56312a0c3d85ae90", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hashes": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "5002f4ac8b1562ca", + "hash_cont_tokens": "674fc454bdc5ac93" + }, + "truncated": 0, + "non_truncated": 165, + "padded": 656, + "non_padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_geography|5": { + "hashes": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "b4f9efd054b0149d", + "hash_cont_tokens": "03a5012b916274ea" + }, + "truncated": 0, + "non_truncated": 198, + "padded": 792, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hashes": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "6e010d01707b5a01", + "hash_cont_tokens": "873d2aab226ba1d8" + }, + "truncated": 0, + "non_truncated": 193, + "padded": 772, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hashes": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "8011eab91a4417a2", + "hash_cont_tokens": "c583432ad27fcfe0" + }, + "truncated": 0, + "non_truncated": 390, + "padded": 1560, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hashes": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "3a485a40c8432ece", + 
"hash_cont_tokens": "d7907b61bcb8c123" + }, + "truncated": 0, + "non_truncated": 270, + "padded": 1080, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hashes": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "a7dd9ca4bbda3752", + "hash_cont_tokens": "f47f041de50333b9" + }, + "truncated": 0, + "non_truncated": 238, + "padded": 952, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_physics|5": { + "hashes": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "d7ea631399a73865", + "hash_cont_tokens": "0d56317b3e5eedb5" + }, + "truncated": 0, + "non_truncated": 151, + "padded": 604, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hashes": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "c265a8ab28fdfd92", + "hash_cont_tokens": "09ba1243e7390c0f" + }, + "truncated": 0, + "non_truncated": 545, + "padded": 2180, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hashes": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "9763ecaef4814c21", + "hash_cont_tokens": "9cc29889c3d3f77d" + }, + "truncated": 0, + "non_truncated": 216, + "padded": 864, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hashes": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "c639cce12a46ebad", + "hash_cont_tokens": "cdd0b3dc06d933e5" + }, + "truncated": 0, + "non_truncated": 204, + "padded": 816, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hashes": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "b9762065cce6f3a6", + "hash_cont_tokens": "e02816433ff28daf" + }, + "truncated": 0, + "non_truncated": 237, + "padded": 948, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_aging|5": { + "hashes": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "0d068c05d1befefa", + "hash_cont_tokens": "142a4a8a1138a214" + }, + "truncated": 0, + "non_truncated": 223, + "padded": 892, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_sexuality|5": { + "hashes": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "e699adc64e7c4216", + "hash_cont_tokens": "bc54813e809b796d" + }, + "truncated": 0, + "non_truncated": 131, + "padded": 524, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-international_law|5": { + "hashes": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "e5482e1c23c23d35", + "hash_cont_tokens": "8ea8c5ff76a15bca" + }, + "truncated": 0, + "non_truncated": 121, + "padded": 484, + "non_padded": 0, + 
"effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-jurisprudence|5": { + "hashes": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "045fbf083ca82902", + "hash_cont_tokens": "e3a8cd951b6e3469" + }, + "truncated": 0, + "non_truncated": 108, + "padded": 432, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hashes": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "e6b5271422ecbaa8", + "hash_cont_tokens": "3e9e0bdc248fd88a" + }, + "truncated": 0, + "non_truncated": 163, + "padded": 644, + "non_padded": 8, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-machine_learning|5": { + "hashes": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "e719cb83196977d8", + "hash_cont_tokens": "55b12fb138c6a064" + }, + "truncated": 0, + "non_truncated": 112, + "padded": 448, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-management|5": { + "hashes": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "155da0e62b39e804", + "hash_cont_tokens": "a01d6d39a83c4597" + }, + "truncated": 0, + "non_truncated": 103, + "padded": 412, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-marketing|5": { + "hashes": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "0ceac4d4d139f844", + "hash_cont_tokens": "6aeaed4d823c98aa" + }, + "truncated": 0, + "non_truncated": 234, + "padded": 932, + "non_padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-medical_genetics|5": { + "hashes": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "c78cdb3bf161a170", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-miscellaneous|5": { + "hashes": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": "d108a883fc3e022f", + "hash_cont_tokens": "9b0ab02a64603081" + }, + "truncated": 0, + "non_truncated": 783, + "padded": 3132, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_disputes|5": { + "hashes": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "0e7b7df82884a2d5", + "hash_cont_tokens": "3b8bbe9108e55ce9" + }, + "truncated": 0, + "non_truncated": 346, + "padded": 1364, + "non_padded": 20, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hashes": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "7c220f5613cd8426", + "hash_cont_tokens": "3e9bfc0362e97330" + }, + "truncated": 0, + "non_truncated": 895, + "padded": 3580, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-nutrition|5": { + "hashes": { + "hash_examples": "7db1d8142ec14323", + 
"hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "33ea33a584e53dff", + "hash_cont_tokens": "23b2dc6ee2da4cfc" + }, + "truncated": 0, + "non_truncated": 306, + "padded": 1224, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-philosophy|5": { + "hashes": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "42a5f4e298135117", + "hash_cont_tokens": "9f6ff69d23a48783" + }, + "truncated": 0, + "non_truncated": 311, + "padded": 1244, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-prehistory|5": { + "hashes": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "b71af05030cd3f49", + "hash_cont_tokens": "d6458d743d875837" + }, + "truncated": 0, + "non_truncated": 324, + "padded": 1296, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_accounting|5": { + "hashes": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "e9df32a33f85290c", + "hash_cont_tokens": "922a195f53a35662" + }, + "truncated": 0, + "non_truncated": 282, + "padded": 1128, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_law|5": { + "hashes": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "c9f7583fff66d361", + "hash_cont_tokens": "2e590029ef41fbcd" + }, + "truncated": 0, + "non_truncated": 1534, + "padded": 6136, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_medicine|5": { + "hashes": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "40a933f829116f8d", + "hash_cont_tokens": "7cfee54dbddd5a98" + }, + "truncated": 0, + "non_truncated": 272, + "padded": 1088, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_psychology|5": { + "hashes": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "a1398d54792f4b6d", + "hash_cont_tokens": "a86677b2a45c20e1" + }, + "truncated": 0, + "non_truncated": 612, + "padded": 2448, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-public_relations|5": { + "hashes": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "6d10e7f09fccb09b", + "hash_cont_tokens": "0d756ccaae031757" + }, + "truncated": 0, + "non_truncated": 110, + "padded": 440, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-security_studies|5": { + "hashes": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "32a03f1f22a6e103", + "hash_cont_tokens": "b2229bc2cfbf594b" + }, + "truncated": 0, + "non_truncated": 245, + "padded": 980, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-sociology|5": { + "hashes": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "1de5c52d2b2831d7", + "hash_cont_tokens": "c3a3bdfd177eed5b" + }, + "truncated": 0, + 
"non_truncated": 201, + "padded": 800, + "non_padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hashes": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "add924961f7f4146", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-virology|5": { + "hashes": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "e0653601c466b1bc", + "hash_cont_tokens": "af8b3658088cb37f" + }, + "truncated": 0, + "non_truncated": 166, + "padded": 664, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-world_religions|5": { + "hashes": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "ac600d612445156d", + "hash_cont_tokens": "060118bef6de4e0a" + }, + "truncated": 0, + "non_truncated": 171, + "padded": 684, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|truthfulqa:mc|0": { + "hashes": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "a03ce28b7fd06aa7", + "hash_cont_tokens": "f5da56a132aab151" + }, + "truncated": 0, + "non_truncated": 817, + "padded": 9996, + "non_padded": 0, + "effective_few_shots": 0.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "72067255e368e24e", + "hash_cont_tokens": "f08975ad6f2d5864" + }, + "truncated": 0, + "non_truncated": 1267, + "padded": 2534, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "bda342e47b5099b2", + "hash_cont_tokens": "78fb2e1c88229f67" + }, + "truncated": 0, + "non_truncated": 1319, + "padded": 0, + "non_padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "3b7fa57a057f9415", + "hash_full_prompts": "63615fc50fc9417c", + "hash_input_tokens": "6b82379e3861993c", + "hash_cont_tokens": "c99cb233d03a32a8" + }, + "truncated": 0, + "non_truncated": 28659, + "padded": 113348, + "non_padded": 1524, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/eval-results/zyh3826/llama2-13b-ft-openllm-leaderboard-v1/results_2023-12-09T15-33-42.644192.json b/eval-results/zyh3826/llama2-13b-ft-openllm-leaderboard-v1/results_2023-12-09T15-33-42.644192.json new file mode 100644 index 0000000000000000000000000000000000000000..4112fc064ab09e98f29693b6f91d2a7ccfadec8d --- /dev/null +++ b/eval-results/zyh3826/llama2-13b-ft-openllm-leaderboard-v1/results_2023-12-09T15-33-42.644192.json @@ -0,0 +1,1409 @@ +{ + "config_general": { + "lighteval_sha": "0e4607eff593f6f842aeaa0e5fa6760f58b9d1e9", + "num_few_shot_default": 0, + "num_fewshot_seeds": 1, + "override_batch_size": 1, + "max_samples": null, + "job_id": "", + "start_time": 583109.099982177, + "end_time": 590884.425805228, + "total_evaluation_time_secondes": "7775.325823050924", + "model_name": "zyh3826/llama2-13b-ft-openllm-leaderboard-v1", + "model_sha": 
"70404059013c74b0641ed69d293b3d1ad708cd1e", + "model_dtype": "torch.float16", + "model_size": "24.28 GB" + }, + "results": { + "harness|arc:challenge|25": { + "acc": 0.552901023890785, + "acc_stderr": 0.014529380160526842, + "acc_norm": 0.5964163822525598, + "acc_norm_stderr": 0.01433715891426844 + }, + "harness|hellaswag|10": { + "acc": 0.6276638119896435, + "acc_stderr": 0.004824393076826628, + "acc_norm": 0.8314080860386377, + "acc_norm_stderr": 0.0037362592995204874 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "acc": 0.34, + "acc_stderr": 0.047609522856952365, + "acc_norm": 0.34, + "acc_norm_stderr": 0.047609522856952365 + }, + "harness|hendrycksTest-anatomy|5": { + "acc": 0.5259259259259259, + "acc_stderr": 0.04313531696750575, + "acc_norm": 0.5259259259259259, + "acc_norm_stderr": 0.04313531696750575 + }, + "harness|hendrycksTest-astronomy|5": { + "acc": 0.631578947368421, + "acc_stderr": 0.03925523381052932, + "acc_norm": 0.631578947368421, + "acc_norm_stderr": 0.03925523381052932 + }, + "harness|hendrycksTest-business_ethics|5": { + "acc": 0.55, + "acc_stderr": 0.04999999999999999, + "acc_norm": 0.55, + "acc_norm_stderr": 0.04999999999999999 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + "acc": 0.6377358490566037, + "acc_stderr": 0.029582245128384303, + "acc_norm": 0.6377358490566037, + "acc_norm_stderr": 0.029582245128384303 + }, + "harness|hendrycksTest-college_biology|5": { + "acc": 0.6944444444444444, + "acc_stderr": 0.03852084696008534, + "acc_norm": 0.6944444444444444, + "acc_norm_stderr": 0.03852084696008534 + }, + "harness|hendrycksTest-college_chemistry|5": { + "acc": 0.46, + "acc_stderr": 0.05009082659620332, + "acc_norm": 0.46, + "acc_norm_stderr": 0.05009082659620332 + }, + "harness|hendrycksTest-college_computer_science|5": { + "acc": 0.52, + "acc_stderr": 0.050211673156867795, + "acc_norm": 0.52, + "acc_norm_stderr": 0.050211673156867795 + }, + "harness|hendrycksTest-college_mathematics|5": { + "acc": 0.38, + "acc_stderr": 0.048783173121456316, + "acc_norm": 0.38, + "acc_norm_stderr": 0.048783173121456316 + }, + "harness|hendrycksTest-college_medicine|5": { + "acc": 0.6069364161849711, + "acc_stderr": 0.03724249595817731, + "acc_norm": 0.6069364161849711, + "acc_norm_stderr": 0.03724249595817731 + }, + "harness|hendrycksTest-college_physics|5": { + "acc": 0.3627450980392157, + "acc_stderr": 0.04784060704105653, + "acc_norm": 0.3627450980392157, + "acc_norm_stderr": 0.04784060704105653 + }, + "harness|hendrycksTest-computer_security|5": { + "acc": 0.74, + "acc_stderr": 0.04408440022768079, + "acc_norm": 0.74, + "acc_norm_stderr": 0.04408440022768079 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "acc": 0.4765957446808511, + "acc_stderr": 0.03265019475033582, + "acc_norm": 0.4765957446808511, + "acc_norm_stderr": 0.03265019475033582 + }, + "harness|hendrycksTest-econometrics|5": { + "acc": 0.3333333333333333, + "acc_stderr": 0.044346007015849245, + "acc_norm": 0.3333333333333333, + "acc_norm_stderr": 0.044346007015849245 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "acc": 0.5448275862068965, + "acc_stderr": 0.04149886942192117, + "acc_norm": 0.5448275862068965, + "acc_norm_stderr": 0.04149886942192117 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "acc": 0.3386243386243386, + "acc_stderr": 0.024373197867983056, + "acc_norm": 0.3386243386243386, + "acc_norm_stderr": 0.024373197867983056 + }, + "harness|hendrycksTest-formal_logic|5": { + "acc": 0.3968253968253968, + "acc_stderr": 0.043758884927270605, + 
"acc_norm": 0.3968253968253968, + "acc_norm_stderr": 0.043758884927270605 + }, + "harness|hendrycksTest-global_facts|5": { + "acc": 0.39, + "acc_stderr": 0.04902071300001974, + "acc_norm": 0.39, + "acc_norm_stderr": 0.04902071300001974 + }, + "harness|hendrycksTest-high_school_biology|5": { + "acc": 0.7193548387096774, + "acc_stderr": 0.02556060472102288, + "acc_norm": 0.7193548387096774, + "acc_norm_stderr": 0.02556060472102288 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "acc": 0.4975369458128079, + "acc_stderr": 0.03517945038691063, + "acc_norm": 0.4975369458128079, + "acc_norm_stderr": 0.03517945038691063 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "acc": 0.58, + "acc_stderr": 0.049604496374885836, + "acc_norm": 0.58, + "acc_norm_stderr": 0.049604496374885836 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "acc": 0.7272727272727273, + "acc_stderr": 0.03477691162163659, + "acc_norm": 0.7272727272727273, + "acc_norm_stderr": 0.03477691162163659 + }, + "harness|hendrycksTest-high_school_geography|5": { + "acc": 0.8131313131313131, + "acc_stderr": 0.027772533334218957, + "acc_norm": 0.8131313131313131, + "acc_norm_stderr": 0.027772533334218957 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "acc": 0.8497409326424871, + "acc_stderr": 0.025787723180723875, + "acc_norm": 0.8497409326424871, + "acc_norm_stderr": 0.025787723180723875 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "acc": 0.5948717948717949, + "acc_stderr": 0.024890471769938145, + "acc_norm": 0.5948717948717949, + "acc_norm_stderr": 0.024890471769938145 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "acc": 0.3333333333333333, + "acc_stderr": 0.02874204090394849, + "acc_norm": 0.3333333333333333, + "acc_norm_stderr": 0.02874204090394849 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "acc": 0.6092436974789915, + "acc_stderr": 0.03169380235712996, + "acc_norm": 0.6092436974789915, + "acc_norm_stderr": 0.03169380235712996 + }, + "harness|hendrycksTest-high_school_physics|5": { + "acc": 0.3841059602649007, + "acc_stderr": 0.03971301814719197, + "acc_norm": 0.3841059602649007, + "acc_norm_stderr": 0.03971301814719197 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "acc": 0.7944954128440367, + "acc_stderr": 0.017324352325016015, + "acc_norm": 0.7944954128440367, + "acc_norm_stderr": 0.017324352325016015 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "acc": 0.49074074074074076, + "acc_stderr": 0.034093869469927006, + "acc_norm": 0.49074074074074076, + "acc_norm_stderr": 0.034093869469927006 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "acc": 0.8382352941176471, + "acc_stderr": 0.02584501798692692, + "acc_norm": 0.8382352941176471, + "acc_norm_stderr": 0.02584501798692692 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "acc": 0.8016877637130801, + "acc_stderr": 0.02595502084162111, + "acc_norm": 0.8016877637130801, + "acc_norm_stderr": 0.02595502084162111 + }, + "harness|hendrycksTest-human_aging|5": { + "acc": 0.6591928251121076, + "acc_stderr": 0.03181149747055359, + "acc_norm": 0.6591928251121076, + "acc_norm_stderr": 0.03181149747055359 + }, + "harness|hendrycksTest-human_sexuality|5": { + "acc": 0.7251908396946565, + "acc_stderr": 0.03915345408847836, + "acc_norm": 0.7251908396946565, + "acc_norm_stderr": 0.03915345408847836 + }, + "harness|hendrycksTest-international_law|5": { + "acc": 0.7768595041322314, + "acc_stderr": 0.03800754475228732, + 
"acc_norm": 0.7768595041322314, + "acc_norm_stderr": 0.03800754475228732 + }, + "harness|hendrycksTest-jurisprudence|5": { + "acc": 0.8055555555555556, + "acc_stderr": 0.03826076324884866, + "acc_norm": 0.8055555555555556, + "acc_norm_stderr": 0.03826076324884866 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "acc": 0.7177914110429447, + "acc_stderr": 0.03536117886664742, + "acc_norm": 0.7177914110429447, + "acc_norm_stderr": 0.03536117886664742 + }, + "harness|hendrycksTest-machine_learning|5": { + "acc": 0.36607142857142855, + "acc_stderr": 0.0457237235873743, + "acc_norm": 0.36607142857142855, + "acc_norm_stderr": 0.0457237235873743 + }, + "harness|hendrycksTest-management|5": { + "acc": 0.7961165048543689, + "acc_stderr": 0.03989139859531771, + "acc_norm": 0.7961165048543689, + "acc_norm_stderr": 0.03989139859531771 + }, + "harness|hendrycksTest-marketing|5": { + "acc": 0.8717948717948718, + "acc_stderr": 0.02190190511507333, + "acc_norm": 0.8717948717948718, + "acc_norm_stderr": 0.02190190511507333 + }, + "harness|hendrycksTest-medical_genetics|5": { + "acc": 0.66, + "acc_stderr": 0.04760952285695237, + "acc_norm": 0.66, + "acc_norm_stderr": 0.04760952285695237 + }, + "harness|hendrycksTest-miscellaneous|5": { + "acc": 0.8135376756066411, + "acc_stderr": 0.013927751372001505, + "acc_norm": 0.8135376756066411, + "acc_norm_stderr": 0.013927751372001505 + }, + "harness|hendrycksTest-moral_disputes|5": { + "acc": 0.6647398843930635, + "acc_stderr": 0.025416003773165538, + "acc_norm": 0.6647398843930635, + "acc_norm_stderr": 0.025416003773165538 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "acc": 0.41675977653631285, + "acc_stderr": 0.016489134962438954, + "acc_norm": 0.41675977653631285, + "acc_norm_stderr": 0.016489134962438954 + }, + "harness|hendrycksTest-nutrition|5": { + "acc": 0.673202614379085, + "acc_stderr": 0.026857294663281413, + "acc_norm": 0.673202614379085, + "acc_norm_stderr": 0.026857294663281413 + }, + "harness|hendrycksTest-philosophy|5": { + "acc": 0.6591639871382636, + "acc_stderr": 0.026920841260776165, + "acc_norm": 0.6591639871382636, + "acc_norm_stderr": 0.026920841260776165 + }, + "harness|hendrycksTest-prehistory|5": { + "acc": 0.7345679012345679, + "acc_stderr": 0.024569223600460845, + "acc_norm": 0.7345679012345679, + "acc_norm_stderr": 0.024569223600460845 + }, + "harness|hendrycksTest-professional_accounting|5": { + "acc": 0.49645390070921985, + "acc_stderr": 0.02982674915328092, + "acc_norm": 0.49645390070921985, + "acc_norm_stderr": 0.02982674915328092 + }, + "harness|hendrycksTest-professional_law|5": { + "acc": 0.4589308996088657, + "acc_stderr": 0.012727084826799798, + "acc_norm": 0.4589308996088657, + "acc_norm_stderr": 0.012727084826799798 + }, + "harness|hendrycksTest-professional_medicine|5": { + "acc": 0.6029411764705882, + "acc_stderr": 0.029722152099280065, + "acc_norm": 0.6029411764705882, + "acc_norm_stderr": 0.029722152099280065 + }, + "harness|hendrycksTest-professional_psychology|5": { + "acc": 0.6062091503267973, + "acc_stderr": 0.019766211991073063, + "acc_norm": 0.6062091503267973, + "acc_norm_stderr": 0.019766211991073063 + }, + "harness|hendrycksTest-public_relations|5": { + "acc": 0.6454545454545455, + "acc_stderr": 0.045820048415054174, + "acc_norm": 0.6454545454545455, + "acc_norm_stderr": 0.045820048415054174 + }, + "harness|hendrycksTest-security_studies|5": { + "acc": 0.6775510204081633, + "acc_stderr": 0.02992310056368391, + "acc_norm": 0.6775510204081633, + "acc_norm_stderr": 0.02992310056368391 + }, + 
"harness|hendrycksTest-sociology|5": { + "acc": 0.8059701492537313, + "acc_stderr": 0.027962677604768907, + "acc_norm": 0.8059701492537313, + "acc_norm_stderr": 0.027962677604768907 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "acc": 0.87, + "acc_stderr": 0.03379976689896308, + "acc_norm": 0.87, + "acc_norm_stderr": 0.03379976689896308 + }, + "harness|hendrycksTest-virology|5": { + "acc": 0.4819277108433735, + "acc_stderr": 0.038899512528272166, + "acc_norm": 0.4819277108433735, + "acc_norm_stderr": 0.038899512528272166 + }, + "harness|hendrycksTest-world_religions|5": { + "acc": 0.783625730994152, + "acc_stderr": 0.031581495393387324, + "acc_norm": 0.783625730994152, + "acc_norm_stderr": 0.031581495393387324 + }, + "harness|truthfulqa:mc|0": { + "mc1": 0.28151774785801714, + "mc1_stderr": 0.01574402724825605, + "mc2": 0.40723683293857477, + "mc2_stderr": 0.01336809717170015 + }, + "harness|winogrande|5": { + "acc": 0.7734806629834254, + "acc_stderr": 0.011764149054698329 + }, + "harness|gsm8k|5": { + "acc": 0.013646702047005308, + "acc_stderr": 0.0031957470754808027 + }, + "all": { + "acc": 0.6016495918139398, + "acc_stderr": 0.03270798736533002, + "acc_norm": 0.612894192678486, + "acc_norm_stderr": 0.033541474205616734, + "mc1": 0.28151774785801714, + "mc1_stderr": 0.01574402724825605, + "mc2": 0.40723683293857477, + "mc2_stderr": 0.01336809717170015 + } + }, + "versions": { + "all": 0, + "harness|arc:challenge|25": 0, + "harness|gsm8k|5": 0, + "harness|hellaswag|10": 0, + "harness|hendrycksTest-abstract_algebra|5": 1, + "harness|hendrycksTest-anatomy|5": 1, + "harness|hendrycksTest-astronomy|5": 1, + "harness|hendrycksTest-business_ethics|5": 1, + "harness|hendrycksTest-clinical_knowledge|5": 1, + "harness|hendrycksTest-college_biology|5": 1, + "harness|hendrycksTest-college_chemistry|5": 1, + "harness|hendrycksTest-college_computer_science|5": 1, + "harness|hendrycksTest-college_mathematics|5": 1, + "harness|hendrycksTest-college_medicine|5": 1, + "harness|hendrycksTest-college_physics|5": 1, + "harness|hendrycksTest-computer_security|5": 1, + "harness|hendrycksTest-conceptual_physics|5": 1, + "harness|hendrycksTest-econometrics|5": 1, + "harness|hendrycksTest-electrical_engineering|5": 1, + "harness|hendrycksTest-elementary_mathematics|5": 1, + "harness|hendrycksTest-formal_logic|5": 1, + "harness|hendrycksTest-global_facts|5": 1, + "harness|hendrycksTest-high_school_biology|5": 1, + "harness|hendrycksTest-high_school_chemistry|5": 1, + "harness|hendrycksTest-high_school_computer_science|5": 1, + "harness|hendrycksTest-high_school_european_history|5": 1, + "harness|hendrycksTest-high_school_geography|5": 1, + "harness|hendrycksTest-high_school_government_and_politics|5": 1, + "harness|hendrycksTest-high_school_macroeconomics|5": 1, + "harness|hendrycksTest-high_school_mathematics|5": 1, + "harness|hendrycksTest-high_school_microeconomics|5": 1, + "harness|hendrycksTest-high_school_physics|5": 1, + "harness|hendrycksTest-high_school_psychology|5": 1, + "harness|hendrycksTest-high_school_statistics|5": 1, + "harness|hendrycksTest-high_school_us_history|5": 1, + "harness|hendrycksTest-high_school_world_history|5": 1, + "harness|hendrycksTest-human_aging|5": 1, + "harness|hendrycksTest-human_sexuality|5": 1, + "harness|hendrycksTest-international_law|5": 1, + "harness|hendrycksTest-jurisprudence|5": 1, + "harness|hendrycksTest-logical_fallacies|5": 1, + "harness|hendrycksTest-machine_learning|5": 1, + "harness|hendrycksTest-management|5": 1, + 
"harness|hendrycksTest-marketing|5": 1, + "harness|hendrycksTest-medical_genetics|5": 1, + "harness|hendrycksTest-miscellaneous|5": 1, + "harness|hendrycksTest-moral_disputes|5": 1, + "harness|hendrycksTest-moral_scenarios|5": 1, + "harness|hendrycksTest-nutrition|5": 1, + "harness|hendrycksTest-philosophy|5": 1, + "harness|hendrycksTest-prehistory|5": 1, + "harness|hendrycksTest-professional_accounting|5": 1, + "harness|hendrycksTest-professional_law|5": 1, + "harness|hendrycksTest-professional_medicine|5": 1, + "harness|hendrycksTest-professional_psychology|5": 1, + "harness|hendrycksTest-public_relations|5": 1, + "harness|hendrycksTest-security_studies|5": 1, + "harness|hendrycksTest-sociology|5": 1, + "harness|hendrycksTest-us_foreign_policy|5": 1, + "harness|hendrycksTest-virology|5": 1, + "harness|hendrycksTest-world_religions|5": 1, + "harness|truthfulqa:mc|0": 1, + "harness|winogrande|5": 0 + }, + "config_tasks": { + "harness|arc:challenge": "LM Harness task", + "harness|gsm8k": "LM Harness task", + "harness|hellaswag": "LM Harness task", + "harness|hendrycksTest-abstract_algebra": "LM Harness task", + "harness|hendrycksTest-anatomy": "LM Harness task", + "harness|hendrycksTest-astronomy": "LM Harness task", + "harness|hendrycksTest-business_ethics": "LM Harness task", + "harness|hendrycksTest-clinical_knowledge": "LM Harness task", + "harness|hendrycksTest-college_biology": "LM Harness task", + "harness|hendrycksTest-college_chemistry": "LM Harness task", + "harness|hendrycksTest-college_computer_science": "LM Harness task", + "harness|hendrycksTest-college_mathematics": "LM Harness task", + "harness|hendrycksTest-college_medicine": "LM Harness task", + "harness|hendrycksTest-college_physics": "LM Harness task", + "harness|hendrycksTest-computer_security": "LM Harness task", + "harness|hendrycksTest-conceptual_physics": "LM Harness task", + "harness|hendrycksTest-econometrics": "LM Harness task", + "harness|hendrycksTest-electrical_engineering": "LM Harness task", + "harness|hendrycksTest-elementary_mathematics": "LM Harness task", + "harness|hendrycksTest-formal_logic": "LM Harness task", + "harness|hendrycksTest-global_facts": "LM Harness task", + "harness|hendrycksTest-high_school_biology": "LM Harness task", + "harness|hendrycksTest-high_school_chemistry": "LM Harness task", + "harness|hendrycksTest-high_school_computer_science": "LM Harness task", + "harness|hendrycksTest-high_school_european_history": "LM Harness task", + "harness|hendrycksTest-high_school_geography": "LM Harness task", + "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task", + "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_mathematics": "LM Harness task", + "harness|hendrycksTest-high_school_microeconomics": "LM Harness task", + "harness|hendrycksTest-high_school_physics": "LM Harness task", + "harness|hendrycksTest-high_school_psychology": "LM Harness task", + "harness|hendrycksTest-high_school_statistics": "LM Harness task", + "harness|hendrycksTest-high_school_us_history": "LM Harness task", + "harness|hendrycksTest-high_school_world_history": "LM Harness task", + "harness|hendrycksTest-human_aging": "LM Harness task", + "harness|hendrycksTest-human_sexuality": "LM Harness task", + "harness|hendrycksTest-international_law": "LM Harness task", + "harness|hendrycksTest-jurisprudence": "LM Harness task", + "harness|hendrycksTest-logical_fallacies": "LM Harness task", + "harness|hendrycksTest-machine_learning": "LM Harness 
task", + "harness|hendrycksTest-management": "LM Harness task", + "harness|hendrycksTest-marketing": "LM Harness task", + "harness|hendrycksTest-medical_genetics": "LM Harness task", + "harness|hendrycksTest-miscellaneous": "LM Harness task", + "harness|hendrycksTest-moral_disputes": "LM Harness task", + "harness|hendrycksTest-moral_scenarios": "LM Harness task", + "harness|hendrycksTest-nutrition": "LM Harness task", + "harness|hendrycksTest-philosophy": "LM Harness task", + "harness|hendrycksTest-prehistory": "LM Harness task", + "harness|hendrycksTest-professional_accounting": "LM Harness task", + "harness|hendrycksTest-professional_law": "LM Harness task", + "harness|hendrycksTest-professional_medicine": "LM Harness task", + "harness|hendrycksTest-professional_psychology": "LM Harness task", + "harness|hendrycksTest-public_relations": "LM Harness task", + "harness|hendrycksTest-security_studies": "LM Harness task", + "harness|hendrycksTest-sociology": "LM Harness task", + "harness|hendrycksTest-us_foreign_policy": "LM Harness task", + "harness|hendrycksTest-virology": "LM Harness task", + "harness|hendrycksTest-world_religions": "LM Harness task", + "harness|truthfulqa:mc": "LM Harness task", + "harness|winogrande": "LM Harness task" + }, + "summary_tasks": { + "harness|arc:challenge|25": { + "hashes": { + "hash_examples": "17b0cae357c0259e", + "hash_full_prompts": "045cbb916e5145c6", + "hash_input_tokens": "c2d55d68c4441c39", + "hash_cont_tokens": "e8abf848493b50f7" + }, + "truncated": 0, + "non_truncated": 1172, + "padded": 4687, + "non_padded": 0, + "effective_few_shots": 25.0, + "num_truncated_few_shots": 0 + }, + "harness|hellaswag|10": { + "hashes": { + "hash_examples": "e1768ecb99d7ecf0", + "hash_full_prompts": "0b4c16983130f84f", + "hash_input_tokens": "38dc8458e001ab84", + "hash_cont_tokens": "9fe0a5c42e1532db" + }, + "truncated": 0, + "non_truncated": 10042, + "padded": 40019, + "non_padded": 149, + "effective_few_shots": 10.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-abstract_algebra|5": { + "hashes": { + "hash_examples": "280f9f325b40559a", + "hash_full_prompts": "2f776a367d23aea2", + "hash_input_tokens": "8ff523ec326d5d55", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-anatomy|5": { + "hashes": { + "hash_examples": "2f83a4f1cab4ba18", + "hash_full_prompts": "516f74bef25df620", + "hash_input_tokens": "742bd6a389a8ef40", + "hash_cont_tokens": "f11971a765cb609f" + }, + "truncated": 0, + "non_truncated": 135, + "padded": 540, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-astronomy|5": { + "hashes": { + "hash_examples": "7d587b908da4d762", + "hash_full_prompts": "faf4e80f65de93ca", + "hash_input_tokens": "aa9743839c83bd9f", + "hash_cont_tokens": "440a970fadecdc7b" + }, + "truncated": 0, + "non_truncated": 152, + "padded": 608, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-business_ethics|5": { + "hashes": { + "hash_examples": "33e51740670de686", + "hash_full_prompts": "db01c3ef8e1479d4", + "hash_input_tokens": "60f6ed52e2a2987a", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-clinical_knowledge|5": { + 
"hashes": { + "hash_examples": "f3366dbe7eefffa4", + "hash_full_prompts": "49654f71d94b65c3", + "hash_input_tokens": "6080d9f3c5930be0", + "hash_cont_tokens": "7ecd60c25b9bfe5b" + }, + "truncated": 0, + "non_truncated": 265, + "padded": 1060, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_biology|5": { + "hashes": { + "hash_examples": "ca2b6753a0193e7f", + "hash_full_prompts": "2b460b75f1fdfefd", + "hash_input_tokens": "873319724ad65589", + "hash_cont_tokens": "875cde3af7a0ee14" + }, + "truncated": 0, + "non_truncated": 144, + "padded": 564, + "non_padded": 12, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_chemistry|5": { + "hashes": { + "hash_examples": "22ff85f1d34f42d1", + "hash_full_prompts": "242c9be6da583e95", + "hash_input_tokens": "8366d04d12b154a7", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_computer_science|5": { + "hashes": { + "hash_examples": "30318289d717a5cf", + "hash_full_prompts": "ed2bdb4e87c4b371", + "hash_input_tokens": "1724a282fb269fd7", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_mathematics|5": { + "hashes": { + "hash_examples": "4944d1f0b6b5d911", + "hash_full_prompts": "770bc4281c973190", + "hash_input_tokens": "b7aa815781eae172", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_medicine|5": { + "hashes": { + "hash_examples": "dd69cc33381275af", + "hash_full_prompts": "ad2a53e5250ab46e", + "hash_input_tokens": "5e69bf9422c979cd", + "hash_cont_tokens": "702fb6d82ff0d6ac" + }, + "truncated": 0, + "non_truncated": 173, + "padded": 692, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-college_physics|5": { + "hashes": { + "hash_examples": "875dd26d22655b0d", + "hash_full_prompts": "833a0d7b55aed500", + "hash_input_tokens": "32b28762dd077c78", + "hash_cont_tokens": "f7b8097afc16a47c" + }, + "truncated": 0, + "non_truncated": 102, + "padded": 404, + "non_padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-computer_security|5": { + "hashes": { + "hash_examples": "006451eedc0ededb", + "hash_full_prompts": "94034c97e85d8f46", + "hash_input_tokens": "19dd0e1895125d49", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-conceptual_physics|5": { + "hashes": { + "hash_examples": "8874ece872d2ca4c", + "hash_full_prompts": "e40d15a34640d6fa", + "hash_input_tokens": "761c7ce187b3338a", + "hash_cont_tokens": "aa0e8bc655f2f641" + }, + "truncated": 0, + "non_truncated": 235, + "padded": 940, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-econometrics|5": { + "hashes": { + "hash_examples": "64d3623b0bfaa43f", + "hash_full_prompts": "612f340fae41338d", + "hash_input_tokens": "dae74024ebc12b2b", + 
"hash_cont_tokens": "b1cc6e7e9fcd3827" + }, + "truncated": 0, + "non_truncated": 114, + "padded": 456, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-electrical_engineering|5": { + "hashes": { + "hash_examples": "e98f51780c674d7e", + "hash_full_prompts": "10275b312d812ae6", + "hash_input_tokens": "5fa8050688a246ed", + "hash_cont_tokens": "2425a3f084a591ef" + }, + "truncated": 0, + "non_truncated": 145, + "padded": 580, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-elementary_mathematics|5": { + "hashes": { + "hash_examples": "fc48208a5ac1c0ce", + "hash_full_prompts": "5ec274c6c82aca23", + "hash_input_tokens": "2da3f8d7d1515cc6", + "hash_cont_tokens": "bd87bf0c060fd925" + }, + "truncated": 0, + "non_truncated": 378, + "padded": 1512, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-formal_logic|5": { + "hashes": { + "hash_examples": "5a6525665f63ea72", + "hash_full_prompts": "07b92638c4a6b500", + "hash_input_tokens": "907de61bbe46dada", + "hash_cont_tokens": "eb8932890e0605db" + }, + "truncated": 0, + "non_truncated": 126, + "padded": 504, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-global_facts|5": { + "hashes": { + "hash_examples": "371d70d743b2b89b", + "hash_full_prompts": "332fdee50a1921b4", + "hash_input_tokens": "d7549fe9ac133643", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_biology|5": { + "hashes": { + "hash_examples": "a79e1018b1674052", + "hash_full_prompts": "e624e26ede922561", + "hash_input_tokens": "b449ae8cd622fb96", + "hash_cont_tokens": "1ddcb86d28cde266" + }, + "truncated": 0, + "non_truncated": 310, + "padded": 1240, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_chemistry|5": { + "hashes": { + "hash_examples": "44bfc25c389f0e03", + "hash_full_prompts": "0e3e5f5d9246482a", + "hash_input_tokens": "a447bd1574b5e26c", + "hash_cont_tokens": "176c8dcff38c5f8f" + }, + "truncated": 0, + "non_truncated": 203, + "padded": 812, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_computer_science|5": { + "hashes": { + "hash_examples": "8b8cdb1084f24169", + "hash_full_prompts": "c00487e67c1813cc", + "hash_input_tokens": "55065fe953492209", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_european_history|5": { + "hashes": { + "hash_examples": "11cd32d0ef440171", + "hash_full_prompts": "318f4513c537c6bf", + "hash_input_tokens": "f1f73dd687da18d7", + "hash_cont_tokens": "674fc454bdc5ac93" + }, + "truncated": 660, + "non_truncated": -495, + "padded": 0, + "non_padded": 660, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_geography|5": { + "hashes": { + "hash_examples": "b60019b9e80b642f", + "hash_full_prompts": "ee5789fcc1a81b1e", + "hash_input_tokens": "b4f9efd054b0149d", + "hash_cont_tokens": "03a5012b916274ea" + }, + "truncated": 0, + "non_truncated": 198, + "padded": 792, + 
"non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_government_and_politics|5": { + "hashes": { + "hash_examples": "d221ec983d143dc3", + "hash_full_prompts": "ac42d888e1ce1155", + "hash_input_tokens": "6e010d01707b5a01", + "hash_cont_tokens": "873d2aab226ba1d8" + }, + "truncated": 0, + "non_truncated": 193, + "padded": 772, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_macroeconomics|5": { + "hashes": { + "hash_examples": "59c2915cacfd3fbb", + "hash_full_prompts": "c6bd9d25158abd0e", + "hash_input_tokens": "fc1f6e824ba386d7", + "hash_cont_tokens": "c583432ad27fcfe0" + }, + "truncated": 0, + "non_truncated": 390, + "padded": 1560, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_mathematics|5": { + "hashes": { + "hash_examples": "1f8ac897608de342", + "hash_full_prompts": "5d88f41fc2d643a8", + "hash_input_tokens": "3a485a40c8432ece", + "hash_cont_tokens": "d7907b61bcb8c123" + }, + "truncated": 0, + "non_truncated": 270, + "padded": 1080, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_microeconomics|5": { + "hashes": { + "hash_examples": "ead6a0f2f6c83370", + "hash_full_prompts": "bfc393381298609e", + "hash_input_tokens": "a7dd9ca4bbda3752", + "hash_cont_tokens": "f47f041de50333b9" + }, + "truncated": 0, + "non_truncated": 238, + "padded": 952, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_physics|5": { + "hashes": { + "hash_examples": "c3f2025990afec64", + "hash_full_prompts": "fc78b4997e436734", + "hash_input_tokens": "d7ea631399a73865", + "hash_cont_tokens": "0d56317b3e5eedb5" + }, + "truncated": 0, + "non_truncated": 151, + "padded": 604, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_psychology|5": { + "hashes": { + "hash_examples": "21f8aab618f6d636", + "hash_full_prompts": "d5c76aa40b9dbc43", + "hash_input_tokens": "d12816cf88146011", + "hash_cont_tokens": "09ba1243e7390c0f" + }, + "truncated": 0, + "non_truncated": 545, + "padded": 2180, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_statistics|5": { + "hashes": { + "hash_examples": "2386a60a11fc5de3", + "hash_full_prompts": "4c5c8be5aafac432", + "hash_input_tokens": "0903f3aba4ea094f", + "hash_cont_tokens": "9cc29889c3d3f77d" + }, + "truncated": 0, + "non_truncated": 216, + "padded": 864, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_us_history|5": { + "hashes": { + "hash_examples": "74961543be40f04f", + "hash_full_prompts": "5d5ca4840131ba21", + "hash_input_tokens": "50c9ff438c85a69e", + "hash_cont_tokens": "cdd0b3dc06d933e5" + }, + "truncated": 816, + "non_truncated": -612, + "padded": 0, + "non_padded": 816, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-high_school_world_history|5": { + "hashes": { + "hash_examples": "2ad2f6b7198b2234", + "hash_full_prompts": "11845057459afd72", + "hash_input_tokens": "054824cc474caef5", + "hash_cont_tokens": "e02816433ff28daf" + }, + "truncated": 8, + "non_truncated": 229, + "padded": 940, + "non_padded": 8, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + 
}, + "harness|hendrycksTest-human_aging|5": { + "hashes": { + "hash_examples": "1a7199dc733e779b", + "hash_full_prompts": "756b9096b8eaf892", + "hash_input_tokens": "84157fee0b6d0f3c", + "hash_cont_tokens": "142a4a8a1138a214" + }, + "truncated": 0, + "non_truncated": 223, + "padded": 892, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-human_sexuality|5": { + "hashes": { + "hash_examples": "7acb8fdad97f88a6", + "hash_full_prompts": "731a52ff15b8cfdb", + "hash_input_tokens": "ade303e1ae3c016f", + "hash_cont_tokens": "bc54813e809b796d" + }, + "truncated": 0, + "non_truncated": 131, + "padded": 524, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-international_law|5": { + "hashes": { + "hash_examples": "1300bfd0dfc59114", + "hash_full_prompts": "db2aefbff5eec996", + "hash_input_tokens": "e5482e1c23c23d35", + "hash_cont_tokens": "8ea8c5ff76a15bca" + }, + "truncated": 0, + "non_truncated": 121, + "padded": 484, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-jurisprudence|5": { + "hashes": { + "hash_examples": "083b1e4904c48dc2", + "hash_full_prompts": "0f89ee3fe03d6a21", + "hash_input_tokens": "4415eeb9bad0507b", + "hash_cont_tokens": "e3a8cd951b6e3469" + }, + "truncated": 0, + "non_truncated": 108, + "padded": 432, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-logical_fallacies|5": { + "hashes": { + "hash_examples": "709128f9926a634c", + "hash_full_prompts": "98a04b1f8f841069", + "hash_input_tokens": "e6b5271422ecbaa8", + "hash_cont_tokens": "3e9e0bdc248fd88a" + }, + "truncated": 0, + "non_truncated": 163, + "padded": 644, + "non_padded": 8, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-machine_learning|5": { + "hashes": { + "hash_examples": "88f22a636029ae47", + "hash_full_prompts": "2e1c8d4b1e0cc921", + "hash_input_tokens": "e719cb83196977d8", + "hash_cont_tokens": "55b12fb138c6a064" + }, + "truncated": 0, + "non_truncated": 112, + "padded": 448, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-management|5": { + "hashes": { + "hash_examples": "8c8a1e07a2151dca", + "hash_full_prompts": "f51611f514b265b0", + "hash_input_tokens": "155da0e62b39e804", + "hash_cont_tokens": "a01d6d39a83c4597" + }, + "truncated": 0, + "non_truncated": 103, + "padded": 412, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-marketing|5": { + "hashes": { + "hash_examples": "2668953431f91e96", + "hash_full_prompts": "77562bef997c7650", + "hash_input_tokens": "38466c242259e6d3", + "hash_cont_tokens": "6aeaed4d823c98aa" + }, + "truncated": 0, + "non_truncated": 234, + "padded": 932, + "non_padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-medical_genetics|5": { + "hashes": { + "hash_examples": "9c2dda34a2ea4fd2", + "hash_full_prompts": "202139046daa118f", + "hash_input_tokens": "0dd129e92538a7f6", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-miscellaneous|5": { + "hashes": { + "hash_examples": "41adb694024809c2", + "hash_full_prompts": "bffec9fc237bcf93", + "hash_input_tokens": 
"d108a883fc3e022f", + "hash_cont_tokens": "9b0ab02a64603081" + }, + "truncated": 0, + "non_truncated": 783, + "padded": 3132, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_disputes|5": { + "hashes": { + "hash_examples": "3171c13ba3c594c4", + "hash_full_prompts": "170831fc36f1d59e", + "hash_input_tokens": "0e7b7df82884a2d5", + "hash_cont_tokens": "3b8bbe9108e55ce9" + }, + "truncated": 0, + "non_truncated": 346, + "padded": 1364, + "non_padded": 20, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-moral_scenarios|5": { + "hashes": { + "hash_examples": "9873e077e83e0546", + "hash_full_prompts": "08f4ceba3131a068", + "hash_input_tokens": "7c220f5613cd8426", + "hash_cont_tokens": "3e9bfc0362e97330" + }, + "truncated": 0, + "non_truncated": 895, + "padded": 3580, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-nutrition|5": { + "hashes": { + "hash_examples": "7db1d8142ec14323", + "hash_full_prompts": "4c0e68e3586cb453", + "hash_input_tokens": "35de1609a9a763a9", + "hash_cont_tokens": "23b2dc6ee2da4cfc" + }, + "truncated": 0, + "non_truncated": 306, + "padded": 1224, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-philosophy|5": { + "hashes": { + "hash_examples": "9b455b7d72811cc8", + "hash_full_prompts": "e467f822d8a0d3ff", + "hash_input_tokens": "a1dcfa9c80490d06", + "hash_cont_tokens": "9f6ff69d23a48783" + }, + "truncated": 0, + "non_truncated": 311, + "padded": 1244, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-prehistory|5": { + "hashes": { + "hash_examples": "8be90d0f538f1560", + "hash_full_prompts": "152187949bcd0921", + "hash_input_tokens": "a091cf645d2415e0", + "hash_cont_tokens": "d6458d743d875837" + }, + "truncated": 0, + "non_truncated": 324, + "padded": 1296, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_accounting|5": { + "hashes": { + "hash_examples": "8d377597916cd07e", + "hash_full_prompts": "0eb7345d6144ee0d", + "hash_input_tokens": "e9df32a33f85290c", + "hash_cont_tokens": "922a195f53a35662" + }, + "truncated": 0, + "non_truncated": 282, + "padded": 1128, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_law|5": { + "hashes": { + "hash_examples": "cd9dbc52b3c932d6", + "hash_full_prompts": "36ac764272bfb182", + "hash_input_tokens": "9178e10bd0763ec4", + "hash_cont_tokens": "2e590029ef41fbcd" + }, + "truncated": 604, + "non_truncated": 930, + "padded": 5524, + "non_padded": 612, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_medicine|5": { + "hashes": { + "hash_examples": "b20e4e816c1e383e", + "hash_full_prompts": "7b8d69ea2acaf2f7", + "hash_input_tokens": "f5a22012a54f70ea", + "hash_cont_tokens": "7cfee54dbddd5a98" + }, + "truncated": 0, + "non_truncated": 272, + "padded": 1088, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-professional_psychology|5": { + "hashes": { + "hash_examples": "d45b73b22f9cc039", + "hash_full_prompts": "fe8937e9ffc99771", + "hash_input_tokens": "0f6a92c3a2062b48", + "hash_cont_tokens": "a86677b2a45c20e1" + }, + "truncated": 0, + "non_truncated": 612, + "padded": 2448, + "non_padded": 0, + 
"effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-public_relations|5": { + "hashes": { + "hash_examples": "0d25072e1761652a", + "hash_full_prompts": "f9adc39cfa9f42ba", + "hash_input_tokens": "29a08e9bfbe9b2f0", + "hash_cont_tokens": "0d756ccaae031757" + }, + "truncated": 0, + "non_truncated": 110, + "padded": 440, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-security_studies|5": { + "hashes": { + "hash_examples": "62bb8197e63d60d4", + "hash_full_prompts": "869c9c3ae196b7c3", + "hash_input_tokens": "d49711415961ced7", + "hash_cont_tokens": "b2229bc2cfbf594b" + }, + "truncated": 0, + "non_truncated": 245, + "padded": 980, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-sociology|5": { + "hashes": { + "hash_examples": "e7959df87dea8672", + "hash_full_prompts": "1a1fc00e17b3a52a", + "hash_input_tokens": "1de5c52d2b2831d7", + "hash_cont_tokens": "c3a3bdfd177eed5b" + }, + "truncated": 0, + "non_truncated": 201, + "padded": 800, + "non_padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-us_foreign_policy|5": { + "hashes": { + "hash_examples": "4a56a01ddca44dca", + "hash_full_prompts": "0c7a7081c71c07b6", + "hash_input_tokens": "add924961f7f4146", + "hash_cont_tokens": "50421e30bef398f9" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-virology|5": { + "hashes": { + "hash_examples": "451cc86a8c4f4fe9", + "hash_full_prompts": "01e95325d8b738e4", + "hash_input_tokens": "e0653601c466b1bc", + "hash_cont_tokens": "af8b3658088cb37f" + }, + "truncated": 0, + "non_truncated": 166, + "padded": 664, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|hendrycksTest-world_religions|5": { + "hashes": { + "hash_examples": "3b29cfaf1a81c379", + "hash_full_prompts": "e0d79a15083dfdff", + "hash_input_tokens": "ac600d612445156d", + "hash_cont_tokens": "060118bef6de4e0a" + }, + "truncated": 0, + "non_truncated": 171, + "padded": 684, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|truthfulqa:mc|0": { + "hashes": { + "hash_examples": "23176c0531c7b867", + "hash_full_prompts": "36a6d90e75d92d4a", + "hash_input_tokens": "a03ce28b7fd06aa7", + "hash_cont_tokens": "f5da56a132aab151" + }, + "truncated": 0, + "non_truncated": 817, + "padded": 9996, + "non_padded": 0, + "effective_few_shots": 0.0, + "num_truncated_few_shots": 0 + }, + "harness|winogrande|5": { + "hashes": { + "hash_examples": "aada0a176fd81218", + "hash_full_prompts": "c8655cbd12de8409", + "hash_input_tokens": "72067255e368e24e", + "hash_cont_tokens": "f08975ad6f2d5864" + }, + "truncated": 0, + "non_truncated": 1267, + "padded": 2534, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "harness|gsm8k|5": { + "hashes": { + "hash_examples": "4c0843a5d99bcfdc", + "hash_full_prompts": "41d55e83abc0e02d", + "hash_input_tokens": "bda342e47b5099b2", + "hash_cont_tokens": "b2f377c3e5c9323e" + }, + "truncated": 0, + "non_truncated": 1319, + "padded": 0, + "non_padded": 1319, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "3b7fa57a057f9415", + "hash_full_prompts": "63615fc50fc9417c", + "hash_input_tokens": "08c39bfaff1d11e0", + 
"hash_cont_tokens": "6f70795ea3eb6af1" + }, + "truncated": 2088, + "non_truncated": 26571, + "padded": 111256, + "non_padded": 3616, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file